/*
 * Dynamic queue limits (dql) - Definitions
 *
 * Copyright (c) 2011, Tom Herbert
 *
 * This header file contains the definitions for dynamic queue limits (dql).
 * dql would be used in conjunction with a producer/consumer type queue
 * (possibly a HW queue).  Such a queue would have these general properties:
 *
 *   1) Objects are queued up to some limit specified as number of objects.
 *   2) Periodically a completion process executes which retires consumed
 *      objects.
 *   3) Starvation occurs when limit has been reached, all queued data has
 *      actually been consumed, but completion processing has not yet run
 *      so queuing new data is blocked.
 *   4) Minimizing the amount of queued data is desirable.
 *
 * The goal of dql is to calculate the limit as the minimum number of objects
 * needed to prevent starvation.
 *
 * The primary functions of dql are:
 *    dql_queued - called when objects are enqueued to record number of objects
 *    dql_avail - returns how many objects are available to be queued based
 *      on the object limit and how many objects are already enqueued
 *    dql_completed - called at completion time to indicate how many objects
 *      were retired from the queue
 *
 * The dql implementation does not implement any locking for the dql data
 * structures, the higher layer should provide this.  dql_queued should
 * be serialized to prevent concurrent execution of the function; this
 * is also true for dql_completed.  However, dql_queued and dql_completed can
 * be executed concurrently (i.e. they can be protected by different locks).
 *
 * NOTE(review): this header contains no #include of its own, yet it uses
 * BUG_ON(), barrier(), ACCESS_ONCE(), UINT_MAX and
 * ____cacheline_aligned_in_smp.  Those must already be visible wherever the
 * header is included - confirm the include order of its users.
 */

#ifndef _LINUX_DQL_H
#define _LINUX_DQL_H

#ifdef __KERNEL__

/*
 * Per-queue dql state.
 *
 * The fields are grouped by the path that touches them.  The alignment
 * attribute on @limit separates the completion-path group from the
 * enqueue-path group, so that the two paths (which may run concurrently,
 * see the file comment) work on different groups of fields.
 */
struct dql {
	/* Fields accessed in enqueue path (dql_queued) */
	unsigned int	num_queued;		/* Total ever queued */
	unsigned int	adj_limit;		/* limit + num_completed */
	unsigned int	last_obj_cnt;		/* Count at last queuing */

	/* Fields accessed only by completion path (dql_completed) */

	unsigned int	limit ____cacheline_aligned_in_smp; /* Current limit */
	unsigned int	num_completed;		/* Total ever completed */

	unsigned int	prev_ovlimit;		/* Previous over limit */
	unsigned int	prev_num_queued;	/* Previous queue total */
	unsigned int	prev_last_obj_cnt;	/* Previous queuing cnt */

	unsigned int	lowest_slack;		/* Lowest slack found */
	unsigned long	slack_start_time;	/* Time slacks seen */

	/* Configuration */
	unsigned int	max_limit;		/* Max limit */
	unsigned int	min_limit;		/* Minimum limit */
	unsigned int	slack_hold_time;	/* Time to measure slack */
};

/*
 * Set some static maximums.
 *
 * DQL_MAX_OBJECT bounds the count accepted by a single dql_queued() call.
 * DQL_MAX_LIMIT + DQL_MAX_OBJECT equals UINT_MAX / 2; presumably this keeps
 * the difference computed by dql_avail() representable as an int - confirm
 * against the limit calculation in dql_completed().
 */
#define DQL_MAX_OBJECT (UINT_MAX / 16)
#define DQL_MAX_LIMIT ((UINT_MAX / 2) - DQL_MAX_OBJECT)

/*
 * Record number of objects queued.  Assumes that caller has already checked
 * availability in the queue with dql_avail.
 *
 * @dql:   queue state to update
 * @count: number of objects just enqueued; a value above DQL_MAX_OBJECT
 *         triggers BUG_ON()
 *
 * Stores @count in last_obj_cnt and adds it to the running num_queued total.
 */
static inline void dql_queued(struct dql *dql, unsigned int count)
{
	BUG_ON(count > DQL_MAX_OBJECT);

	dql->last_obj_cnt = count;

	/*
	 * We want to force a write first, so that the CPU does not attempt
	 * to get the cache line containing last_obj_cnt, num_queued and
	 * adj_limit in Shared state, but directly issues a Request For
	 * Ownership.  It is only a hint, so a compiler barrier() is enough.
	 */
	barrier();

	dql->num_queued += count;
}

/*
 * Returns how many objects can be queued, < 0 indicates over limit.
 *
 * @dql: queue state to inspect (not modified)
 *
 * Each field is read once through ACCESS_ONCE(); the unsigned difference
 * adj_limit - num_queued is then returned as an int, which is how a
 * num_queued that has run ahead of adj_limit shows up as a negative value.
 */
static inline int dql_avail(const struct dql *dql)
{
	return ACCESS_ONCE(dql->adj_limit) - ACCESS_ONCE(dql->num_queued);
}

/* Record number of completed objects and recalculate the limit. */
void dql_completed(struct dql *dql, unsigned int count);

/* Reset dql state */
void dql_reset(struct dql *dql);

/* Initialize dql state */
int dql_init(struct dql *dql, unsigned int hold_time);

#endif /* __KERNEL__ */

#endif /* _LINUX_DQL_H */
summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/6lowpan_i.h16
-rw-r--r--net/6lowpan/Kconfig33
-rw-r--r--net/6lowpan/core.c20
-rw-r--r--net/6lowpan/debugfs.c141
-rw-r--r--net/6lowpan/iphc.c2
-rw-r--r--net/6lowpan/ndisc.c32
-rw-r--r--net/6lowpan/nhc.c110
-rw-r--r--net/6lowpan/nhc.h38
-rw-r--r--net/6lowpan/nhc_dest.c15
-rw-r--r--net/6lowpan/nhc_fragment.c15
-rw-r--r--net/6lowpan/nhc_ghc_ext_dest.c15
-rw-r--r--net/6lowpan/nhc_ghc_ext_frag.c17
-rw-r--r--net/6lowpan/nhc_ghc_ext_hop.c15
-rw-r--r--net/6lowpan/nhc_ghc_ext_route.c15
-rw-r--r--net/6lowpan/nhc_ghc_icmpv6.c15
-rw-r--r--net/6lowpan/nhc_ghc_udp.c15
-rw-r--r--net/6lowpan/nhc_hop.c15
-rw-r--r--net/6lowpan/nhc_ipv6.c17
-rw-r--r--net/6lowpan/nhc_mobility.c15
-rw-r--r--net/6lowpan/nhc_routing.c15
-rw-r--r--net/6lowpan/nhc_udp.c20
-rw-r--r--net/802/Kconfig1
-rw-r--r--net/802/Makefile6
-rw-r--r--net/802/fc.c6
-rw-r--r--net/802/fddi.c7
-rw-r--r--net/802/garp.c28
-rw-r--r--net/802/hippi.c10
-rw-r--r--net/802/mrp.c56
-rw-r--r--net/802/p8022.c66
-rw-r--r--net/802/p8023.c64
-rw-r--r--net/802/psnap.c15
-rw-r--r--net/802/stp.c6
-rw-r--r--net/8021q/Kconfig3
-rw-r--r--net/8021q/vlan.c206
-rw-r--r--net/8021q/vlan.h41
-rw-r--r--net/8021q/vlan_core.c148
-rw-r--r--net/8021q/vlan_dev.c524
-rw-r--r--net/8021q/vlan_gvrp.c5
-rw-r--r--net/8021q/vlan_mvrp.c5
-rw-r--r--net/8021q/vlan_netlink.c59
-rw-r--r--net/8021q/vlanproc.c57
-rw-r--r--net/9p/Kconfig20
-rw-r--r--net/9p/Makefile10
-rw-r--r--net/9p/client.c1629
-rw-r--r--net/9p/error.c44
-rw-r--r--net/9p/mod.c81
-rw-r--r--net/9p/protocol.c262
-rw-r--r--net/9p/protocol.h23
-rw-r--r--net/9p/trans_common.c15
-rw-r--r--net/9p/trans_common.h12
-rw-r--r--net/9p/trans_fd.c455
-rw-r--r--net/9p/trans_rdma.c223
-rw-r--r--net/9p/trans_usbg.c969
-rw-r--r--net/9p/trans_virtio.c159
-rw-r--r--net/9p/trans_xen.c235
-rw-r--r--net/9p/util.c140
-rw-r--r--net/Kconfig268
-rw-r--r--net/Kconfig.debug41
-rw-r--r--net/Makefile35
-rw-r--r--net/appletalk/Kconfig30
-rw-r--r--net/appletalk/Makefile3
-rw-r--r--net/appletalk/aarp.c116
-rw-r--r--net/appletalk/atalk_proc.c65
-rw-r--r--net/appletalk/ddp.c302
-rw-r--r--net/appletalk/dev.c46
-rw-r--r--net/appletalk/sysctl_net_atalk.c6
-rw-r--r--net/atm/Kconfig5
-rw-r--r--net/atm/atm_sysfs.c52
-rw-r--r--net/atm/br2684.c11
-rw-r--r--net/atm/clip.c96
-rw-r--r--net/atm/common.c76
-rw-r--r--net/atm/common.h2
-rw-r--r--net/atm/ioctl.c121
-rw-r--r--net/atm/lec.c163
-rw-r--r--net/atm/lec_arpc.h2
-rw-r--r--net/atm/mpc.c8
-rw-r--r--net/atm/mpoa_caches.c6
-rw-r--r--net/atm/mpoa_proc.c20
-rw-r--r--net/atm/pppoatm.c24
-rw-r--r--net/atm/proc.c14
-rw-r--r--net/atm/pvc.c8
-rw-r--r--net/atm/raw.c14
-rw-r--r--net/atm/resources.c136
-rw-r--r--net/atm/resources.h5
-rw-r--r--net/atm/signaling.c6
-rw-r--r--net/atm/svc.c27
-rw-r--r--net/ax25/Kconfig31
-rw-r--r--net/ax25/TODO20
-rw-r--r--net/ax25/af_ax25.c242
-rw-r--r--net/ax25/ax25_addr.c5
-rw-r--r--net/ax25/ax25_dev.c90
-rw-r--r--net/ax25/ax25_ds_in.c5
-rw-r--r--net/ax25/ax25_ds_subr.c5
-rw-r--r--net/ax25/ax25_ds_timer.c9
-rw-r--r--net/ax25/ax25_iface.c11
-rw-r--r--net/ax25/ax25_in.c15
-rw-r--r--net/ax25/ax25_ip.c16
-rw-r--r--net/ax25/ax25_out.c42
-rw-r--r--net/ax25/ax25_route.c113
-rw-r--r--net/ax25/ax25_std_in.c5
-rw-r--r--net/ax25/ax25_std_subr.c5
-rw-r--r--net/ax25/ax25_std_timer.c5
-rw-r--r--net/ax25/ax25_subr.c25
-rw-r--r--net/ax25/ax25_timer.c33
-rw-r--r--net/ax25/ax25_uid.c5
-rw-r--r--net/ax25/sysctl_net_ax25.c13
-rw-r--r--net/batman-adv/Kconfig72
-rw-r--r--net/batman-adv/Makefile25
-rw-r--r--net/batman-adv/bat_algo.c57
-rw-r--r--net/batman-adv/bat_algo.h26
-rw-r--r--net/batman-adv/bat_iv_ogm.c930
-rw-r--r--net/batman-adv/bat_iv_ogm.h14
-rw-r--r--net/batman-adv/bat_v.c393
-rw-r--r--net/batman-adv/bat_v.h14
-rw-r--r--net/batman-adv/bat_v_elp.c209
-rw-r--r--net/batman-adv/bat_v_elp.h18
-rw-r--r--net/batman-adv/bat_v_ogm.c384
-rw-r--r--net/batman-adv/bat_v_ogm.h20
-rw-r--r--net/batman-adv/bitarray.c16
-rw-r--r--net/batman-adv/bitarray.h14
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c593
-rw-r--r--net/batman-adv/bridge_loop_avoidance.h42
-rw-r--r--net/batman-adv/debugfs.c474
-rw-r--r--net/batman-adv/debugfs.h80
-rw-r--r--net/batman-adv/distributed-arp-table.c658
-rw-r--r--net/batman-adv/distributed-arp-table.h49
-rw-r--r--net/batman-adv/fragmentation.c76
-rw-r--r--net/batman-adv/fragmentation.h17
-rw-r--r--net/batman-adv/gateway_client.c179
-rw-r--r--net/batman-adv/gateway_client.h39
-rw-r--r--net/batman-adv/gateway_common.c186
-rw-r--r--net/batman-adv/gateway_common.h28
-rw-r--r--net/batman-adv/hard-interface.c471
-rw-r--r--net/batman-adv/hard-interface.h60
-rw-r--r--net/batman-adv/hash.c16
-rw-r--r--net/batman-adv/hash.h25
-rw-r--r--net/batman-adv/icmp_socket.c418
-rw-r--r--net/batman-adv/icmp_socket.h51
-rw-r--r--net/batman-adv/log.c231
-rw-r--r--net/batman-adv/log.h44
-rw-r--r--net/batman-adv/main.c298
-rw-r--r--net/batman-adv/main.h65
-rw-r--r--net/batman-adv/mesh-interface.c (renamed from net/batman-adv/soft-interface.c)528
-rw-r--r--net/batman-adv/mesh-interface.h41
-rw-r--r--net/batman-adv/multicast.c1651
-rw-r--r--net/batman-adv/multicast.h76
-rw-r--r--net/batman-adv/multicast_forw.c1178
-rw-r--r--net/batman-adv/netlink.c1306
-rw-r--r--net/batman-adv/netlink.h23
-rw-r--r--net/batman-adv/network-coding.c1996
-rw-r--r--net/batman-adv/network-coding.h133
-rw-r--r--net/batman-adv/originator.c588
-rw-r--r--net/batman-adv/originator.h126
-rw-r--r--net/batman-adv/routing.c213
-rw-r--r--net/batman-adv/routing.h32
-rw-r--r--net/batman-adv/send.c460
-rw-r--r--net/batman-adv/send.h33
-rw-r--r--net/batman-adv/soft-interface.h44
-rw-r--r--net/batman-adv/sysfs.c1261
-rw-r--r--net/batman-adv/sysfs.h72
-rw-r--r--net/batman-adv/tp_meter.c103
-rw-r--r--net/batman-adv/tp_meter.h17
-rw-r--r--net/batman-adv/trace.c8
-rw-r--r--net/batman-adv/trace.h64
-rw-r--r--net/batman-adv/translation-table.c886
-rw-r--r--net/batman-adv/translation-table.h48
-rw-r--r--net/batman-adv/tvlv.c130
-rw-r--r--net/batman-adv/tvlv.h30
-rw-r--r--net/batman-adv/types.h672
-rw-r--r--net/bluetooth/6lowpan.c260
-rw-r--r--net/bluetooth/Kconfig64
-rw-r--r--net/bluetooth/Makefile9
-rw-r--r--net/bluetooth/a2mp.c1040
-rw-r--r--net/bluetooth/a2mp.h161
-rw-r--r--net/bluetooth/af_bluetooth.c257
-rw-r--r--net/bluetooth/amp.c596
-rw-r--r--net/bluetooth/amp.h68
-rw-r--r--net/bluetooth/aosp.c210
-rw-r--r--net/bluetooth/aosp.h29
-rw-r--r--net/bluetooth/bnep/Kconfig1
-rw-r--r--net/bluetooth/bnep/Makefile1
-rw-r--r--net/bluetooth/bnep/bnep.h18
-rw-r--r--net/bluetooth/bnep/core.c31
-rw-r--r--net/bluetooth/bnep/netdev.c2
-rw-r--r--net/bluetooth/bnep/sock.c32
-rw-r--r--net/bluetooth/cmtp/Kconfig5
-rw-r--r--net/bluetooth/cmtp/Makefile1
-rw-r--r--net/bluetooth/cmtp/capi.c54
-rw-r--r--net/bluetooth/cmtp/cmtp.h2
-rw-r--r--net/bluetooth/cmtp/core.c25
-rw-r--r--net/bluetooth/cmtp/sock.c21
-rw-r--r--net/bluetooth/coredump.c553
-rw-r--r--net/bluetooth/ecdh_helper.c45
-rw-r--r--net/bluetooth/ecdh_helper.h2
-rw-r--r--net/bluetooth/eir.c386
-rw-r--r--net/bluetooth/eir.h99
-rw-r--r--net/bluetooth/hci_codec.c253
-rw-r--r--net/bluetooth/hci_codec.h7
-rw-r--r--net/bluetooth/hci_conn.c2607
-rw-r--r--net/bluetooth/hci_core.c3196
-rw-r--r--net/bluetooth/hci_debugfs.c499
-rw-r--r--net/bluetooth/hci_debugfs.h5
-rw-r--r--net/bluetooth/hci_drv.c105
-rw-r--r--net/bluetooth/hci_event.c5653
-rw-r--r--net/bluetooth/hci_request.c2818
-rw-r--r--net/bluetooth/hci_request.h139
-rw-r--r--net/bluetooth/hci_sock.c528
-rw-r--r--net/bluetooth/hci_sync.c7420
-rw-r--r--net/bluetooth/hci_sysfs.c80
-rw-r--r--net/bluetooth/hidp/Kconfig4
-rw-r--r--net/bluetooth/hidp/Makefile1
-rw-r--r--net/bluetooth/hidp/core.c65
-rw-r--r--net/bluetooth/hidp/hidp.h2
-rw-r--r--net/bluetooth/hidp/sock.c94
-rw-r--r--net/bluetooth/iso.c2734
-rw-r--r--net/bluetooth/l2cap_core.c2923
-rw-r--r--net/bluetooth/l2cap_sock.c554
-rw-r--r--net/bluetooth/leds.c7
-rw-r--r--net/bluetooth/leds.h5
-rw-r--r--net/bluetooth/lib.c185
-rw-r--r--net/bluetooth/mgmt.c5660
-rw-r--r--net/bluetooth/mgmt_config.c346
-rw-r--r--net/bluetooth/mgmt_config.h17
-rw-r--r--net/bluetooth/mgmt_util.c253
-rw-r--r--net/bluetooth/mgmt_util.h37
-rw-r--r--net/bluetooth/msft.c1201
-rw-r--r--net/bluetooth/msft.h78
-rw-r--r--net/bluetooth/rfcomm/Kconfig1
-rw-r--r--net/bluetooth/rfcomm/Makefile1
-rw-r--r--net/bluetooth/rfcomm/core.c101
-rw-r--r--net/bluetooth/rfcomm/sock.c165
-rw-r--r--net/bluetooth/rfcomm/tty.c143
-rw-r--r--net/bluetooth/sco.c611
-rw-r--r--net/bluetooth/selftest.c2
-rw-r--r--net/bluetooth/smp.c751
-rw-r--r--net/bluetooth/smp.h9
-rw-r--r--net/bpf/Makefile6
-rw-r--r--net/bpf/bpf_dummy_struct_ops.c323
-rw-r--r--net/bpf/test_run.c1789
-rw-r--r--net/bpfilter/.gitignore1
-rw-r--r--net/bpfilter/Kconfig15
-rw-r--r--net/bpfilter/Makefile21
-rw-r--r--net/bpfilter/bpfilter_kern.c117
-rw-r--r--net/bpfilter/bpfilter_umh_blob.S7
-rw-r--r--net/bpfilter/main.c63
-rw-r--r--net/bpfilter/msgfmt.h17
-rw-r--r--net/bridge/Kconfig32
-rw-r--r--net/bridge/Makefile8
-rw-r--r--net/bridge/br.c285
-rw-r--r--net/bridge/br_arp_nd_proxy.c124
-rw-r--r--net/bridge/br_cfm.c867
-rw-r--r--net/bridge/br_cfm_netlink.c726
-rw-r--r--net/bridge/br_device.c252
-rw-r--r--net/bridge/br_fdb.c1108
-rw-r--r--net/bridge/br_forward.c118
-rw-r--r--net/bridge/br_if.c167
-rw-r--r--net/bridge/br_input.c317
-rw-r--r--net/bridge/br_ioctl.c164
-rw-r--r--net/bridge/br_mdb.c1793
-rw-r--r--net/bridge/br_mrp.c1260
-rw-r--r--net/bridge/br_mrp_netlink.c571
-rw-r--r--net/bridge/br_mrp_switchdev.c241
-rw-r--r--net/bridge/br_mst.c366
-rw-r--r--net/bridge/br_multicast.c4561
-rw-r--r--net/bridge/br_multicast_eht.c822
-rw-r--r--net/bridge/br_netfilter_hooks.c669
-rw-r--r--net/bridge/br_netfilter_ipv6.c113
-rw-r--r--net/bridge/br_netlink.c743
-rw-r--r--net/bridge/br_netlink_tunnel.c76
-rw-r--r--net/bridge/br_nf_core.c18
-rw-r--r--net/bridge/br_private.h1561
-rw-r--r--net/bridge/br_private_cfm.h147
-rw-r--r--net/bridge/br_private_mcast_eht.h94
-rw-r--r--net/bridge/br_private_mrp.h148
-rw-r--r--net/bridge/br_private_stp.h6
-rw-r--r--net/bridge/br_private_tunnel.h29
-rw-r--r--net/bridge/br_stp.c85
-rw-r--r--net/bridge/br_stp_bpdu.c15
-rw-r--r--net/bridge/br_stp_if.c34
-rw-r--r--net/bridge/br_stp_timer.c21
-rw-r--r--net/bridge/br_switchdev.c853
-rw-r--r--net/bridge/br_sysfs_br.c296
-rw-r--r--net/bridge/br_sysfs_if.c33
-rw-r--r--net/bridge/br_vlan.c1492
-rw-r--r--net/bridge/br_vlan_options.c740
-rw-r--r--net/bridge/br_vlan_tunnel.c88
-rw-r--r--net/bridge/netfilter/Kconfig47
-rw-r--r--net/bridge/netfilter/Makefile7
-rw-r--r--net/bridge/netfilter/ebt_802_3.c9
-rw-r--r--net/bridge/netfilter/ebt_among.c1
-rw-r--r--net/bridge/netfilter/ebt_arp.c1
-rw-r--r--net/bridge/netfilter/ebt_arpreply.c1
-rw-r--r--net/bridge/netfilter/ebt_dnat.c22
-rw-r--r--net/bridge/netfilter/ebt_ip.c1
-rw-r--r--net/bridge/netfilter/ebt_ip6.c1
-rw-r--r--net/bridge/netfilter/ebt_limit.c5
-rw-r--r--net/bridge/netfilter/ebt_log.c1
-rw-r--r--net/bridge/netfilter/ebt_mark.c5
-rw-r--r--net/bridge/netfilter/ebt_mark_m.c5
-rw-r--r--net/bridge/netfilter/ebt_nflog.c1
-rw-r--r--net/bridge/netfilter/ebt_pkttype.c1
-rw-r--r--net/bridge/netfilter/ebt_redirect.c3
-rw-r--r--net/bridge/netfilter/ebt_snat.c3
-rw-r--r--net/bridge/netfilter/ebt_stp.c2
-rw-r--r--net/bridge/netfilter/ebt_vlan.c14
-rw-r--r--net/bridge/netfilter/ebtable_broute.c88
-rw-r--r--net/bridge/netfilter/ebtable_filter.c58
-rw-r--r--net/bridge/netfilter/ebtable_nat.c58
-rw-r--r--net/bridge/netfilter/ebtables.c759
-rw-r--r--net/bridge/netfilter/nf_conntrack_bridge.c455
-rw-r--r--net/bridge/netfilter/nf_log_bridge.c82
-rw-r--r--net/bridge/netfilter/nft_meta_bridge.c253
-rw-r--r--net/bridge/netfilter/nft_reject_bridge.c266
-rw-r--r--net/caif/Kconfig19
-rw-r--r--net/caif/caif_dev.c43
-rw-r--r--net/caif/caif_socket.c50
-rw-r--r--net/caif/caif_usb.c27
-rw-r--r--net/caif/cfcnfg.c24
-rw-r--r--net/caif/cfctrl.c314
-rw-r--r--net/caif/cfdbgl.c4
-rw-r--r--net/caif/cfdgml.c5
-rw-r--r--net/caif/cffrml.c2
-rw-r--r--net/caif/cfmuxl.c2
-rw-r--r--net/caif/cfpkt_skbuff.c31
-rw-r--r--net/caif/cfrfml.c11
-rw-r--r--net/caif/cfserl.c8
-rw-r--r--net/caif/cfsrvl.c9
-rw-r--r--net/caif/cfutill.c4
-rw-r--r--net/caif/cfveil.c4
-rw-r--r--net/caif/cfvidl.c4
-rw-r--r--net/caif/chnl_net.c42
-rw-r--r--net/can/Kconfig41
-rw-r--r--net/can/Makefile5
-rw-r--r--net/can/af_can.c517
-rw-r--r--net/can/af_can.h37
-rw-r--r--net/can/bcm.c531
-rw-r--r--net/can/gw.c734
-rw-r--r--net/can/isotp.c1739
-rw-r--r--net/can/j1939/Kconfig15
-rw-r--r--net/can/j1939/Makefile10
-rw-r--r--net/can/j1939/address-claim.c270
-rw-r--r--net/can/j1939/bus.c336
-rw-r--r--net/can/j1939/j1939-priv.h345
-rw-r--r--net/can/j1939/main.c430
-rw-r--r--net/can/j1939/socket.c1393
-rw-r--r--net/can/j1939/transport.c2220
-rw-r--r--net/can/proc.c217
-rw-r--r--net/can/raw.c561
-rw-r--r--net/ceph/Kconfig9
-rw-r--r--net/ceph/Makefile7
-rw-r--r--net/ceph/auth.c431
-rw-r--r--net/ceph/auth_none.c14
-rw-r--r--net/ceph/auth_none.h1
-rw-r--r--net/ceph/auth_x.c318
-rw-r--r--net/ceph/auth_x_protocol.h5
-rw-r--r--net/ceph/buffer.c4
-rw-r--r--net/ceph/ceph_common.c654
-rw-r--r--net/ceph/ceph_fs.c104
-rw-r--r--net/ceph/ceph_hash.c22
-rw-r--r--net/ceph/ceph_strings.c28
-rw-r--r--net/ceph/cls_lock_client.c68
-rw-r--r--net/ceph/crush/crush.c3
-rw-r--r--net/ceph/crush/hash.c2
-rw-r--r--net/ceph/crush/mapper.c16
-rw-r--r--net/ceph/crypto.c33
-rw-r--r--net/ceph/crypto.h6
-rw-r--r--net/ceph/debugfs.c82
-rw-r--r--net/ceph/decode.c193
-rw-r--r--net/ceph/messenger.c2344
-rw-r--r--net/ceph/messenger_v1.c1620
-rw-r--r--net/ceph/messenger_v2.c3804
-rw-r--r--net/ceph/mon_client.c440
-rw-r--r--net/ceph/msgpool.c27
-rw-r--r--net/ceph/osd_client.c1484
-rw-r--r--net/ceph/osdmap.c720
-rw-r--r--net/ceph/pagelist.c60
-rw-r--r--net/ceph/pagevec.c85
-rw-r--r--net/ceph/snapshot.c15
-rw-r--r--net/ceph/striper.c17
-rw-r--r--net/compat.c628
-rw-r--r--net/core/Makefile26
-rw-r--r--net/core/bpf_sk_storage.c914
-rw-r--r--net/core/datagram.c571
-rw-r--r--net/core/dev.c9128
-rw-r--r--net/core/dev.h406
-rw-r--r--net/core/dev_addr_lists.c334
-rw-r--r--net/core/dev_addr_lists_test.c247
-rw-r--r--net/core/dev_api.c382
-rw-r--r--net/core/dev_ioctl.c648
-rw-r--r--net/core/devlink.c4805
-rw-r--r--net/core/devmem.c522
-rw-r--r--net/core/devmem.h246
-rw-r--r--net/core/drop_monitor.c1577
-rw-r--r--net/core/dst.c177
-rw-r--r--net/core/dst_cache.c68
-rw-r--r--net/core/failover.c10
-rw-r--r--net/core/fib_notifier.c120
-rw-r--r--net/core/fib_rules.c481
-rw-r--r--net/core/filter.c7960
-rw-r--r--net/core/flow_dissector.c953
-rw-r--r--net/core/flow_offload.c638
-rw-r--r--net/core/gen_estimator.c77
-rw-r--r--net/core/gen_stats.c233
-rw-r--r--net/core/gro.c835
-rw-r--r--net/core/gro_cells.c75
-rw-r--r--net/core/gso.c273
-rw-r--r--net/core/hotdata.c29
-rw-r--r--net/core/hwbm.c21
-rw-r--r--net/core/ieee8021q_helpers.c224
-rw-r--r--net/core/link_watch.c131
-rw-r--r--net/core/lock_debug.c122
-rw-r--r--net/core/lwt_bpf.c323
-rw-r--r--net/core/lwtunnel.c155
-rw-r--r--net/core/mp_dmabuf_devmem.h44
-rw-r--r--net/core/neighbour.c2105
-rw-r--r--net/core/net-procfs.c142
-rw-r--r--net/core/net-sysfs.c1258
-rw-r--r--net/core/net-sysfs.h4
-rw-r--r--net/core/net-traces.c20
-rw-r--r--net/core/net_namespace.c805
-rw-r--r--net/core/net_test.c387
-rw-r--r--net/core/netclassid_cgroup.c78
-rw-r--r--net/core/netdev-genl-gen.c238
-rw-r--r--net/core/netdev-genl-gen.h50
-rw-r--r--net/core/netdev-genl.c1203
-rw-r--r--net/core/netdev_queues.c27
-rw-r--r--net/core/netdev_rx_queue.c194
-rw-r--r--net/core/netevent.c8
-rw-r--r--net/core/netmem_priv.h62
-rw-r--r--net/core/netpoll.c722
-rw-r--r--net/core/netprio_cgroup.c27
-rw-r--r--net/core/of_net.c172
-rw-r--r--net/core/page_pool.c1335
-rw-r--r--net/core/page_pool_priv.h60
-rw-r--r--net/core/page_pool_user.c441
-rw-r--r--net/core/pktgen.c955
-rw-r--r--net/core/ptp_classifier.c61
-rw-r--r--net/core/request_sock.c15
-rw-r--r--net/core/rtnetlink.c4464
-rw-r--r--net/core/scm.c356
-rw-r--r--net/core/secure_seq.c70
-rw-r--r--net/core/selftests.c448
-rw-r--r--net/core/skb_fault_injection.c106
-rw-r--r--net/core/skbuff.c3830
-rw-r--r--net/core/skmsg.c1289
-rw-r--r--net/core/sock.c2813
-rw-r--r--net/core/sock_destructor.h12
-rw-r--r--net/core/sock_diag.c142
-rw-r--r--net/core/sock_map.c1959
-rw-r--r--net/core/sock_reuseport.c517
-rw-r--r--net/core/stream.c67
-rw-r--r--net/core/sysctl_net_core.c488
-rw-r--r--net/core/timestamping.c76
-rw-r--r--net/core/tso.c60
-rw-r--r--net/core/utils.c48
-rw-r--r--net/core/xdp.c885
-rw-r--r--net/dcb/Kconfig3
-rw-r--r--net/dcb/Makefile3
-rw-r--r--net/dcb/dcbevent.c13
-rw-r--r--net/dcb/dcbnl.c594
-rw-r--r--net/dccp/Kconfig45
-rw-r--r--net/dccp/Makefile30
-rw-r--r--net/dccp/ackvec.c405
-rw-r--r--net/dccp/ackvec.h138
-rw-r--r--net/dccp/ccid.c222
-rw-r--r--net/dccp/ccid.h265
-rw-r--r--net/dccp/ccids/Kconfig54
-rw-r--r--net/dccp/ccids/ccid2.c801
-rw-r--r--net/dccp/ccids/ccid2.h134
-rw-r--r--net/dccp/ccids/ccid3.c873
-rw-r--r--net/dccp/ccids/ccid3.h161
-rw-r--r--net/dccp/ccids/lib/loss_interval.c185
-rw-r--r--net/dccp/ccids/lib/loss_interval.h73
-rw-r--r--net/dccp/ccids/lib/packet_history.c447
-rw-r--r--net/dccp/ccids/lib/packet_history.h155
-rw-r--r--net/dccp/ccids/lib/tfrc.c46
-rw-r--r--net/dccp/ccids/lib/tfrc.h77
-rw-r--r--net/dccp/ccids/lib/tfrc_equation.c705
-rw-r--r--net/dccp/dccp.h501
-rw-r--r--net/dccp/diag.c88
-rw-r--r--net/dccp/feat.c1564
-rw-r--r--net/dccp/feat.h137
-rw-r--r--net/dccp/input.c742
-rw-r--r--net/dccp/ipv4.c1086
-rw-r--r--net/dccp/ipv6.c1169
-rw-r--r--net/dccp/ipv6.h34
-rw-r--r--net/dccp/minisocks.c272
-rw-r--r--net/dccp/options.c609
-rw-r--r--net/dccp/output.c704
-rw-r--r--net/dccp/proto.c1276
-rw-r--r--net/dccp/qpolicy.c137
-rw-r--r--net/dccp/sysctl.c118
-rw-r--r--net/dccp/timer.c275
-rw-r--r--net/dccp/trace.h84
-rw-r--r--net/decnet/Kconfig42
-rw-r--r--net/decnet/Makefile10
-rw-r--r--net/decnet/README8
-rw-r--r--net/decnet/TODO40
-rw-r--r--net/decnet/af_decnet.c2411
-rw-r--r--net/decnet/dn_dev.c1438
-rw-r--r--net/decnet/dn_fib.c799
-rw-r--r--net/decnet/dn_neigh.c605
-rw-r--r--net/decnet/dn_nsp_in.c914
-rw-r--r--net/decnet/dn_nsp_out.c703
-rw-r--r--net/decnet/dn_route.c1927
-rw-r--r--net/decnet/dn_rules.c258
-rw-r--r--net/decnet/dn_table.c928
-rw-r--r--net/decnet/dn_timer.c104
-rw-r--r--net/decnet/netfilter/Kconfig16
-rw-r--r--net/decnet/netfilter/Makefile5
-rw-r--r--net/decnet/netfilter/dn_rtmsg.c160
-rw-r--r--net/decnet/sysctl_net_decnet.c373
-rw-r--r--net/devlink/Makefile4
-rw-r--r--net/devlink/core.c551
-rw-r--r--net/devlink/dev.c1442
-rw-r--r--net/devlink/devl_internal.h304
-rw-r--r--net/devlink/dpipe.c915
-rw-r--r--net/devlink/health.c1350
-rw-r--r--net/devlink/linecard.c628
-rw-r--r--net/devlink/netlink.c376
-rw-r--r--net/devlink/netlink_gen.c1287
-rw-r--r--net/devlink/netlink_gen.h150
-rw-r--r--net/devlink/param.c950
-rw-r--r--net/devlink/port.c1604
-rw-r--r--net/devlink/rate.c850
-rw-r--r--net/devlink/region.c1258
-rw-r--r--net/devlink/resource.c504
-rw-r--r--net/devlink/sb.c995
-rw-r--r--net/devlink/trap.c1854
-rw-r--r--net/devres.c95
-rw-r--r--net/dns_resolver/Kconfig7
-rw-r--r--net/dns_resolver/Makefile1
-rw-r--r--net/dns_resolver/dns_key.c81
-rw-r--r--net/dns_resolver/dns_query.c26
-rw-r--r--net/dsa/Kconfig205
-rw-r--r--net/dsa/Makefile52
-rw-r--r--net/dsa/conduit.c549
-rw-r--r--net/dsa/conduit.h22
-rw-r--r--net/dsa/devlink.c402
-rw-r--r--net/dsa/devlink.h16
-rw-r--r--net/dsa/dsa.c1889
-rw-r--r--net/dsa/dsa.h40
-rw-r--r--net/dsa/dsa2.c831
-rw-r--r--net/dsa/dsa_priv.h223
-rw-r--r--net/dsa/legacy.c748
-rw-r--r--net/dsa/master.c185
-rw-r--r--net/dsa/netlink.c64
-rw-r--r--net/dsa/netlink.h8
-rw-r--r--net/dsa/port.c1901
-rw-r--r--net/dsa/port.h115
-rw-r--r--net/dsa/slave.c1569
-rw-r--r--net/dsa/stubs.c10
-rw-r--r--net/dsa/switch.c1050
-rw-r--r--net/dsa/switch.h123
-rw-r--r--net/dsa/tag.c244
-rw-r--r--net/dsa/tag.h409
-rw-r--r--net/dsa/tag_8021q.c588
-rw-r--r--net/dsa/tag_8021q.h28
-rw-r--r--net/dsa/tag_ar9331.c95
-rw-r--r--net/dsa/tag_brcm.c294
-rw-r--r--net/dsa/tag_dsa.c410
-rw-r--r--net/dsa/tag_edsa.c171
-rw-r--r--net/dsa/tag_gswip.c112
-rw-r--r--net/dsa/tag_hellcreek.c73
-rw-r--r--net/dsa/tag_ksz.c487
-rw-r--r--net/dsa/tag_lan9303.c87
-rw-r--r--net/dsa/tag_mtk.c88
-rw-r--r--net/dsa/tag_mxl-gsw1xx.c117
-rw-r--r--net/dsa/tag_none.c31
-rw-r--r--net/dsa/tag_ocelot.c186
-rw-r--r--net/dsa/tag_ocelot_8021q.c140
-rw-r--r--net/dsa/tag_qca.c141
-rw-r--r--net/dsa/tag_rtl4_a.c126
-rw-r--r--net/dsa/tag_rtl8_4.c261
-rw-r--r--net/dsa/tag_rzn1_a5psw.c115
-rw-r--r--net/dsa/tag_sja1105.c762
-rw-r--r--net/dsa/tag_trailer.c60
-rw-r--r--net/dsa/tag_vsc73xx_8021q.c68
-rw-r--r--net/dsa/tag_xrs700x.c61
-rw-r--r--net/dsa/tag_yt921x.c139
-rw-r--r--net/dsa/trace.c39
-rw-r--r--net/dsa/trace.h447
-rw-r--r--net/dsa/user.c3877
-rw-r--r--net/dsa/user.h71
-rw-r--r--net/ethernet/Makefile1
-rw-r--r--net/ethernet/eth.c301
-rw-r--r--net/ethtool/Makefile12
-rw-r--r--net/ethtool/bitset.c873
-rw-r--r--net/ethtool/bitset.h34
-rw-r--r--net/ethtool/cabletest.c453
-rw-r--r--net/ethtool/channels.c199
-rw-r--r--net/ethtool/cmis.h128
-rw-r--r--net/ethtool/cmis_cdb.c666
-rw-r--r--net/ethtool/cmis_fw_update.c485
-rw-r--r--net/ethtool/coalesce.c649
-rw-r--r--net/ethtool/common.c1169
-rw-r--r--net/ethtool/common.h92
-rw-r--r--net/ethtool/debug.c117
-rw-r--r--net/ethtool/eee.c172
-rw-r--r--net/ethtool/eeprom.c246
-rw-r--r--net/ethtool/features.c297
-rw-r--r--net/ethtool/fec.c364
-rw-r--r--net/ethtool/ioctl.c (renamed from net/core/ethtool.c)2327
-rw-r--r--net/ethtool/linkinfo.c145
-rw-r--r--net/ethtool/linkmodes.c362
-rw-r--r--net/ethtool/linkstate.c227
-rw-r--r--net/ethtool/mm.c561
-rw-r--r--net/ethtool/module.c557
-rw-r--r--net/ethtool/module_fw.h75
-rw-r--r--net/ethtool/mse.c329
-rw-r--r--net/ethtool/netlink.c1583
-rw-r--r--net/ethtool/netlink.h525
-rw-r--r--net/ethtool/pause.c219
-rw-r--r--net/ethtool/phc_vclocks.c94
-rw-r--r--net/ethtool/phy.c165
-rw-r--r--net/ethtool/plca.c271
-rw-r--r--net/ethtool/privflags.c195
-rw-r--r--net/ethtool/pse-pd.c382
-rw-r--r--net/ethtool/rings.c322
-rw-r--r--net/ethtool/rss.c1205
-rw-r--r--net/ethtool/stats.c623
-rw-r--r--net/ethtool/strset.c499
-rw-r--r--net/ethtool/ts.h20
-rw-r--r--net/ethtool/tsconfig.c455
-rw-r--r--net/ethtool/tsinfo.c564
-rw-r--r--net/ethtool/tunnels.c281
-rw-r--r--net/ethtool/wol.c158
-rw-r--r--net/handshake/.kunitconfig11
-rw-r--r--net/handshake/Makefile13
-rw-r--r--net/handshake/alert.c110
-rw-r--r--net/handshake/genl.c59
-rw-r--r--net/handshake/genl.h25
-rw-r--r--net/handshake/handshake-test.c540
-rw-r--r--net/handshake/handshake.h93
-rw-r--r--net/handshake/netlink.c289
-rw-r--r--net/handshake/request.c343
-rw-r--r--net/handshake/tlshd.c455
-rw-r--r--net/handshake/trace.c22
-rw-r--r--net/hsr/Kconfig56
-rw-r--r--net/hsr/Makefile4
-rw-r--r--net/hsr/hsr_debugfs.c123
-rw-r--r--net/hsr/hsr_device.c700
-rw-r--r--net/hsr/hsr_device.h14
-rw-r--r--net/hsr/hsr_forward.c632
-rw-r--r--net/hsr/hsr_forward.h23
-rw-r--r--net/hsr/hsr_framereg.c599
-rw-r--r--net/hsr/hsr_framereg.h77
-rw-r--r--net/hsr/hsr_main.c93
-rw-r--r--net/hsr/hsr_main.h254
-rw-r--r--net/hsr/hsr_netlink.c300
-rw-r--r--net/hsr/hsr_netlink.h10
-rw-r--r--net/hsr/hsr_slave.c163
-rw-r--r--net/hsr/hsr_slave.h13
-rw-r--r--net/hsr/prp_dup_discard_test.c212
-rw-r--r--net/ieee802154/6lowpan/Kconfig3
-rw-r--r--net/ieee802154/6lowpan/Makefile1
-rw-r--r--net/ieee802154/6lowpan/core.c16
-rw-r--r--net/ieee802154/6lowpan/reassembly.c237
-rw-r--r--net/ieee802154/6lowpan/rx.c16
-rw-r--r--net/ieee802154/6lowpan/tx.c13
-rw-r--r--net/ieee802154/Kconfig7
-rw-r--r--net/ieee802154/Makefile2
-rw-r--r--net/ieee802154/core.c55
-rw-r--r--net/ieee802154/header_ops.c70
-rw-r--r--net/ieee802154/ieee802154.h13
-rw-r--r--net/ieee802154/netlink.c18
-rw-r--r--net/ieee802154/nl-mac.c44
-rw-r--r--net/ieee802154/nl-phy.c29
-rw-r--r--net/ieee802154/nl802154.c890
-rw-r--r--net/ieee802154/nl802154.h6
-rw-r--r--net/ieee802154/nl_policy.c17
-rw-r--r--net/ieee802154/pan.c109
-rw-r--r--net/ieee802154/rdev-ops.h86
-rw-r--r--net/ieee802154/socket.c157
-rw-r--r--net/ieee802154/sysfs.c12
-rw-r--r--net/ieee802154/sysfs.h2
-rw-r--r--net/ieee802154/trace.h103
-rw-r--r--net/ife/Kconfig4
-rw-r--r--net/ife/Makefile1
-rw-r--r--net/ife/ife.c1
-rw-r--r--net/ipv4/Kconfig418
-rw-r--r--net/ipv4/Makefile19
-rw-r--r--net/ipv4/af_inet.c672
-rw-r--r--net/ipv4/ah4.c65
-rw-r--r--net/ipv4/arp.c340
-rw-r--r--net/ipv4/bpf_tcp_ca.c349
-rw-r--r--net/ipv4/bpfilter/Makefile1
-rw-r--r--net/ipv4/bpfilter/sockopt.c43
-rw-r--r--net/ipv4/cipso_ipv4.c197
-rw-r--r--net/ipv4/datagram.c51
-rw-r--r--net/ipv4/devinet.c1287
-rw-r--r--net/ipv4/esp4.c426
-rw-r--r--net/ipv4/esp4_offload.c201
-rw-r--r--net/ipv4/fib_frontend.c516
-rw-r--r--net/ipv4/fib_lookup.h18
-rw-r--r--net/ipv4/fib_notifier.c24
-rw-r--r--net/ipv4/fib_rules.c184
-rw-r--r--net/ipv4/fib_semantics.c1767
-rw-r--r--net/ipv4/fib_trie.c665
-rw-r--r--net/ipv4/fou_bpf.c117
-rw-r--r--net/ipv4/fou_core.c (renamed from net/ipv4/fou.c)358
-rw-r--r--net/ipv4/fou_nl.c49
-rw-r--r--net/ipv4/fou_nl.h26
-rw-r--r--net/ipv4/gre_demux.c59
-rw-r--r--net/ipv4/gre_offload.c50
-rw-r--r--net/ipv4/icmp.c950
-rw-r--r--net/ipv4/igmp.c508
-rw-r--r--net/ipv4/igmp_internal.h17
-rw-r--r--net/ipv4/inet_connection_sock.c1078
-rw-r--r--net/ipv4/inet_diag.c920
-rw-r--r--net/ipv4/inet_fragment.c574
-rw-r--r--net/ipv4/inet_hashtables.c1139
-rw-r--r--net/ipv4/inet_timewait_sock.c191
-rw-r--r--net/ipv4/inetpeer.c105
-rw-r--r--net/ipv4/ip_forward.c26
-rw-r--r--net/ipv4/ip_fragment.c492
-rw-r--r--net/ipv4/ip_gre.c766
-rw-r--r--net/ipv4/ip_input.c270
-rw-r--r--net/ipv4/ip_options.c134
-rw-r--r--net/ipv4/ip_output.c938
-rw-r--r--net/ipv4/ip_sockglue.c1305
-rw-r--r--net/ipv4/ip_tunnel.c525
-rw-r--r--net/ipv4/ip_tunnel_core.c879
-rw-r--r--net/ipv4/ip_vti.c283
-rw-r--r--net/ipv4/ipcomp.c28
-rw-r--r--net/ipv4/ipconfig.c161
-rw-r--r--net/ipv4/ipip.c198
-rw-r--r--net/ipv4/ipmr.c1056
-rw-r--r--net/ipv4/ipmr_base.c193
-rw-r--r--net/ipv4/metrics.c60
-rw-r--r--net/ipv4/netfilter.c45
-rw-r--r--net/ipv4/netfilter/Kconfig192
-rw-r--r--net/ipv4/netfilter/Makefile24
-rw-r--r--net/ipv4/netfilter/arp_tables.c248
-rw-r--r--net/ipv4/netfilter/arpt_mangle.c3
-rw-r--r--net/ipv4/netfilter/arptable_filter.c51
-rw-r--r--net/ipv4/netfilter/ip_tables.c280
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c884
-rw-r--r--net/ipv4/netfilter/ipt_ECN.c9
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c98
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c8
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c403
-rw-r--r--net/ipv4/netfilter/ipt_ah.c5
-rw-r--r--net/ipv4/netfilter/ipt_rpfilter.c28
-rw-r--r--net/ipv4/netfilter/iptable_filter.c58
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c60
-rw-r--r--net/ipv4/netfilter/iptable_nat.c102
-rw-r--r--net/ipv4/netfilter/iptable_raw.c54
-rw-r--r--net/ipv4/netfilter/iptable_security.c55
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c59
-rw-r--r--net/ipv4/netfilter/nf_dup_ipv4.c23
-rw-r--r--net/ipv4/netfilter/nf_flow_table_ipv4.c33
-rw-r--r--net/ipv4/netfilter/nf_log_arp.c160
-rw-r--r--net/ipv4/netfilter/nf_log_ipv4.c396
-rw-r--r--net/ipv4/netfilter/nf_nat_h323.c123
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c421
-rw-r--r--net/ipv4/netfilter/nf_nat_masquerade_ipv4.c158
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c42
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_gre.c150
-rw-r--r--net/ipv4/netfilter/nf_nat_proto_icmp.c83
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.asn18
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic_main.c22
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c242
-rw-r--r--net/ipv4/netfilter/nf_socket_ipv4.c25
-rw-r--r--net/ipv4/netfilter/nf_tproxy_ipv4.c34
-rw-r--r--net/ipv4/netfilter/nft_chain_nat_ipv4.c87
-rw-r--r--net/ipv4/netfilter/nft_chain_route_ipv4.c89
-rw-r--r--net/ipv4/netfilter/nft_dup_ipv4.c28
-rw-r--r--net/ipv4/netfilter/nft_fib_ipv4.c92
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c88
-rw-r--r--net/ipv4/netfilter/nft_redir_ipv4.c82
-rw-r--r--net/ipv4/netfilter/nft_reject_ipv4.c10
-rw-r--r--net/ipv4/netlink.c18
-rw-r--r--net/ipv4/nexthop.c4157
-rw-r--r--net/ipv4/ping.c344
-rw-r--r--net/ipv4/proc.c161
-rw-r--r--net/ipv4/protocol.c13
-rw-r--r--net/ipv4/raw.c412
-rw-r--r--net/ipv4/raw_diag.c99
-rw-r--r--net/ipv4/route.c1923
-rw-r--r--net/ipv4/syncookies.c330
-rw-r--r--net/ipv4/sysctl_net_ipv4.c1074
-rw-r--r--net/ipv4/tcp.c3405
-rw-r--r--net/ipv4/tcp_ao.c2442
-rw-r--r--net/ipv4/tcp_bbr.c370
-rw-r--r--net/ipv4/tcp_bic.c24
-rw-r--r--net/ipv4/tcp_bpf.c739
-rw-r--r--net/ipv4/tcp_cdg.c39
-rw-r--r--net/ipv4/tcp_cong.c184
-rw-r--r--net/ipv4/tcp_cubic.c168
-rw-r--r--net/ipv4/tcp_dctcp.c217
-rw-r--r--net/ipv4/tcp_dctcp.h40
-rw-r--r--net/ipv4/tcp_diag.c531
-rw-r--r--net/ipv4/tcp_fastopen.c317
-rw-r--r--net/ipv4/tcp_highspeed.c21
-rw-r--r--net/ipv4/tcp_htcp.c15
-rw-r--r--net/ipv4/tcp_hybla.c19
-rw-r--r--net/ipv4/tcp_illinois.c13
-rw-r--r--net/ipv4/tcp_input.c3196
-rw-r--r--net/ipv4/tcp_ipv4.c2430
-rw-r--r--net/ipv4/tcp_lp.c27
-rw-r--r--net/ipv4/tcp_metrics.c195
-rw-r--r--net/ipv4/tcp_minisocks.c458
-rw-r--r--net/ipv4/tcp_nv.c26
-rw-r--r--net/ipv4/tcp_offload.c347
-rw-r--r--net/ipv4/tcp_output.c2204
-rw-r--r--net/ipv4/tcp_plb.c109
-rw-r--r--net/ipv4/tcp_rate.c33
-rw-r--r--net/ipv4/tcp_recovery.c55
-rw-r--r--net/ipv4/tcp_scalable.c20
-rw-r--r--net/ipv4/tcp_sigpool.c366
-rw-r--r--net/ipv4/tcp_timer.c500
-rw-r--r--net/ipv4/tcp_ulp.c94
-rw-r--r--net/ipv4/tcp_vegas.c30
-rw-r--r--net/ipv4/tcp_veno.c66
-rw-r--r--net/ipv4/tcp_westwood.c4
-rw-r--r--net/ipv4/tcp_yeah.c70
-rw-r--r--net/ipv4/tunnel4.c66
-rw-r--r--net/ipv4/udp.c2236
-rw-r--r--net/ipv4/udp_bpf.c157
-rw-r--r--net/ipv4/udp_diag.c68
-rw-r--r--net/ipv4/udp_impl.h20
-rw-r--r--net/ipv4/udp_offload.c622
-rw-r--r--net/ipv4/udp_tunnel_core.c (renamed from net/ipv4/udp_tunnel.c)138
-rw-r--r--net/ipv4/udp_tunnel_nic.c1010
-rw-r--r--net/ipv4/udp_tunnel_stub.c7
-rw-r--r--net/ipv4/udplite.c32
-rw-r--r--net/ipv4/xfrm4_input.c131
-rw-r--r--net/ipv4/xfrm4_mode_beet.c155
-rw-r--r--net/ipv4/xfrm4_mode_transport.c114
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c152
-rw-r--r--net/ipv4/xfrm4_output.c78
-rw-r--r--net/ipv4/xfrm4_policy.c206
-rw-r--r--net/ipv4/xfrm4_protocol.c40
-rw-r--r--net/ipv4/xfrm4_state.c69
-rw-r--r--net/ipv4/xfrm4_tunnel.c22
-rw-r--r--net/ipv6/Kconfig159
-rw-r--r--net/ipv6/Makefile20
-rw-r--r--net/ipv6/addrconf.c2999
-rw-r--r--net/ipv6/addrconf_core.c102
-rw-r--r--net/ipv6/addrlabel.c186
-rw-r--r--net/ipv6/af_inet6.c416
-rw-r--r--net/ipv6/ah6.c140
-rw-r--r--net/ipv6/anycast.c300
-rw-r--r--net/ipv6/calipso.c81
-rw-r--r--net/ipv6/datagram.c119
-rw-r--r--net/ipv6/esp6.c555
-rw-r--r--net/ipv6/esp6_offload.c200
-rw-r--r--net/ipv6/exthdrs.c491
-rw-r--r--net/ipv6/exthdrs_core.c9
-rw-r--r--net/ipv6/exthdrs_offload.c17
-rw-r--r--net/ipv6/fib6_notifier.c13
-rw-r--r--net/ipv6/fib6_rules.c297
-rw-r--r--net/ipv6/fou6.c89
-rw-r--r--net/ipv6/icmp.c697
-rw-r--r--net/ipv6/ila/Makefile1
-rw-r--r--net/ipv6/ila/ila.h13
-rw-r--r--net/ipv6/ila/ila_common.c6
-rw-r--r--net/ipv6/ila/ila_lwt.c20
-rw-r--r--net/ipv6/ila/ila_main.c18
-rw-r--r--net/ipv6/ila/ila_xlat.c70
-rw-r--r--net/ipv6/inet6_connection_sock.c42
-rw-r--r--net/ipv6/inet6_hashtables.c292
-rw-r--r--net/ipv6/ioam6.c1040
-rw-r--r--net/ipv6/ioam6_iptunnel.c570
-rw-r--r--net/ipv6/ip6_checksum.c20
-rw-r--r--net/ipv6/ip6_fib.c939
-rw-r--r--net/ipv6/ip6_flowlabel.c452
-rw-r--r--net/ipv6/ip6_gre.c756
-rw-r--r--net/ipv6/ip6_icmp.c48
-rw-r--r--net/ipv6/ip6_input.c235
-rw-r--r--net/ipv6/ip6_offload.c259
-rw-r--r--net/ipv6/ip6_offload.h6
-rw-r--r--net/ipv6/ip6_output.c1096
-rw-r--r--net/ipv6/ip6_tunnel.c786
-rw-r--r--net/ipv6/ip6_udp_tunnel.c106
-rw-r--r--net/ipv6/ip6_vti.c261
-rw-r--r--net/ipv6/ip6mr.c894
-rw-r--r--net/ipv6/ipcomp6.c33
-rw-r--r--net/ipv6/ipv6_sockglue.c1238
-rw-r--r--net/ipv6/mcast.c1616
-rw-r--r--net/ipv6/mcast_snoop.c104
-rw-r--r--net/ipv6/mip6.c134
-rw-r--r--net/ipv6/ndisc.c730
-rw-r--r--net/ipv6/netfilter.c171
-rw-r--r--net/ipv6/netfilter/Kconfig152
-rw-r--r--net/ipv6/netfilter/Makefile17
-rw-r--r--net/ipv6/netfilter/ip6_tables.c278
-rw-r--r--net/ipv6/netfilter/ip6t_MASQUERADE.c77
-rw-r--r--net/ipv6/netfilter/ip6t_NPT.c44
-rw-r--r--net/ipv6/netfilter/ip6t_REJECT.c8
-rw-r--r--net/ipv6/netfilter/ip6t_SYNPROXY.c428
-rw-r--r--net/ipv6/netfilter/ip6t_ah.c10
-rw-r--r--net/ipv6/netfilter/ip6t_eui64.c5
-rw-r--r--net/ipv6/netfilter/ip6t_frag.c8
-rw-r--r--net/ipv6/netfilter/ip6t_hbh.c8
-rw-r--r--net/ipv6/netfilter/ip6t_ipv6header.c16
-rw-r--r--net/ipv6/netfilter/ip6t_mh.c6
-rw-r--r--net/ipv6/netfilter/ip6t_rpfilter.c11
-rw-r--r--net/ipv6/netfilter/ip6t_rt.c66
-rw-r--r--net/ipv6/netfilter/ip6t_srh.c12
-rw-r--r--net/ipv6/netfilter/ip6table_filter.c57
-rw-r--r--net/ipv6/netfilter/ip6table_mangle.c64
-rw-r--r--net/ipv6/netfilter/ip6table_nat.c99
-rw-r--r--net/ipv6/netfilter/ip6table_raw.c55
-rw-r--r--net/ipv6/netfilter/ip6table_security.c53
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c406
-rw-r--r--net/ipv6/netfilter/nf_defrag_ipv6_hooks.c50
-rw-r--r--net/ipv6/netfilter/nf_dup_ipv6.c22
-rw-r--r--net/ipv6/netfilter/nf_flow_table_ipv6.c34
-rw-r--r--net/ipv6/netfilter/nf_log_ipv6.c428
-rw-r--r--net/ipv6/netfilter/nf_nat_l3proto_ipv6.c444
-rw-r--r--net/ipv6/netfilter/nf_nat_masquerade_ipv6.c187
-rw-r--r--net/ipv6/netfilter/nf_nat_proto_icmpv6.c90
-rw-r--r--net/ipv6/netfilter/nf_reject_ipv6.c295
-rw-r--r--net/ipv6/netfilter/nf_socket_ipv6.c39
-rw-r--r--net/ipv6/netfilter/nf_tproxy_ipv6.c12
-rw-r--r--net/ipv6/netfilter/nft_chain_nat_ipv6.c85
-rw-r--r--net/ipv6/netfilter/nft_chain_route_ipv6.c91
-rw-r--r--net/ipv6/netfilter/nft_dup_ipv6.c28
-rw-r--r--net/ipv6/netfilter/nft_fib_ipv6.c72
-rw-r--r--net/ipv6/netfilter/nft_masq_ipv6.c89
-rw-r--r--net/ipv6/netfilter/nft_redir_ipv6.c83
-rw-r--r--net/ipv6/netfilter/nft_reject_ipv6.c10
-rw-r--r--net/ipv6/output_core.c45
-rw-r--r--net/ipv6/ping.c83
-rw-r--r--net/ipv6/proc.c109
-rw-r--r--net/ipv6/protocol.c6
-rw-r--r--net/ipv6/raw.c371
-rw-r--r--net/ipv6/reassembly.c368
-rw-r--r--net/ipv6/route.c4220
-rw-r--r--net/ipv6/rpl.c118
-rw-r--r--net/ipv6/rpl_iptunnel.c394
-rw-r--r--net/ipv6/seg6.c155
-rw-r--r--net/ipv6/seg6_hmac.c237
-rw-r--r--net/ipv6/seg6_iptunnel.c380
-rw-r--r--net/ipv6/seg6_local.c1894
-rw-r--r--net/ipv6/sit.c777
-rw-r--r--net/ipv6/syncookies.c159
-rw-r--r--net/ipv6/sysctl_net_ipv6.c153
-rw-r--r--net/ipv6/tcp_ao.c168
-rw-r--r--net/ipv6/tcp_ipv6.c1360
-rw-r--r--net/ipv6/tcpv6_offload.c170
-rw-r--r--net/ipv6/tunnel6.c155
-rw-r--r--net/ipv6/udp.c1118
-rw-r--r--net/ipv6/udp_impl.h21
-rw-r--r--net/ipv6/udp_offload.c97
-rw-r--r--net/ipv6/udplite.c35
-rw-r--r--net/ipv6/xfrm6_input.c190
-rw-r--r--net/ipv6/xfrm6_mode_beet.c131
-rw-r--r--net/ipv6/xfrm6_mode_ro.c85
-rw-r--r--net/ipv6/xfrm6_mode_transport.c121
-rw-r--r--net/ipv6/xfrm6_mode_tunnel.c151
-rw-r--r--net/ipv6/xfrm6_output.c127
-rw-r--r--net/ipv6/xfrm6_policy.c210
-rw-r--r--net/ipv6/xfrm6_protocol.c75
-rw-r--r--net/ipv6/xfrm6_state.c163
-rw-r--r--net/ipv6/xfrm6_tunnel.c47
-rw-r--r--net/iucv/Kconfig1
-rw-r--r--net/iucv/Makefile1
-rw-r--r--net/iucv/af_iucv.c719
-rw-r--r--net/iucv/iucv.c532
-rw-r--r--net/kcm/Kconfig7
-rw-r--r--net/kcm/Makefile1
-rw-r--r--net/kcm/kcmproc.c2
-rw-r--r--net/kcm/kcmsock.c553
-rw-r--r--net/key/Makefile1
-rw-r--r--net/key/af_key.c172
-rw-r--r--net/l2tp/Kconfig3
-rw-r--r--net/l2tp/Makefile2
-rw-r--r--net/l2tp/l2tp_core.c1357
-rw-r--r--net/l2tp/l2tp_core.h276
-rw-r--r--net/l2tp/l2tp_debugfs.c128
-rw-r--r--net/l2tp/l2tp_eth.c97
-rw-r--r--net/l2tp/l2tp_ip.c264
-rw-r--r--net/l2tp/l2tp_ip6.c277
-rw-r--r--net/l2tp/l2tp_netlink.c422
-rw-r--r--net/l2tp/l2tp_ppp.c475
-rw-r--r--net/l2tp/trace.h211
-rw-r--r--net/l3mdev/Kconfig3
-rw-r--r--net/l3mdev/Makefile3
-rw-r--r--net/l3mdev/l3mdev.c172
-rw-r--r--net/lapb/Kconfig5
-rw-r--r--net/lapb/Makefile1
-rw-r--r--net/lapb/lapb_iface.c166
-rw-r--r--net/lapb/lapb_in.c7
-rw-r--r--net/lapb/lapb_out.c10
-rw-r--r--net/lapb/lapb_subr.c7
-rw-r--r--net/lapb/lapb_timer.c67
-rw-r--r--net/llc/Kconfig2
-rw-r--r--net/llc/af_llc.c195
-rw-r--r--net/llc/llc_c_ac.c39
-rw-r--r--net/llc/llc_c_ev.c4
-rw-r--r--net/llc/llc_c_st.c500
-rw-r--r--net/llc/llc_conn.c175
-rw-r--r--net/llc/llc_core.c15
-rw-r--r--net/llc/llc_if.c16
-rw-r--r--net/llc/llc_input.c16
-rw-r--r--net/llc/llc_output.c13
-rw-r--r--net/llc/llc_pdu.c4
-rw-r--r--net/llc/llc_proc.c8
-rw-r--r--net/llc/llc_s_ac.c50
-rw-r--r--net/llc/llc_s_st.c26
-rw-r--r--net/llc/llc_sap.c56
-rw-r--r--net/llc/llc_station.c9
-rw-r--r--net/llc/sysctl_net_llc.c12
-rw-r--r--net/mac80211/Kconfig99
-rw-r--r--net/mac80211/Makefile21
-rw-r--r--net/mac80211/aead_api.c14
-rw-r--r--net/mac80211/aead_api.h6
-rw-r--r--net/mac80211/aes_ccm.h5
-rw-r--r--net/mac80211/aes_cmac.c71
-rw-r--r--net/mac80211/aes_cmac.h12
-rw-r--r--net/mac80211/aes_gcm.h5
-rw-r--r--net/mac80211/aes_gmac.c46
-rw-r--r--net/mac80211/aes_gmac.h6
-rw-r--r--net/mac80211/agg-rx.c205
-rw-r--r--net/mac80211/agg-tx.c262
-rw-r--r--net/mac80211/airtime.c837
-rw-r--r--net/mac80211/cfg.c3469
-rw-r--r--net/mac80211/chan.c1859
-rw-r--r--net/mac80211/debug.h48
-rw-r--r--net/mac80211/debugfs.c385
-rw-r--r--net/mac80211/debugfs_key.c63
-rw-r--r--net/mac80211/debugfs_key.h13
-rw-r--r--net/mac80211/debugfs_netdev.c521
-rw-r--r--net/mac80211/debugfs_netdev.h28
-rw-r--r--net/mac80211/debugfs_sta.c1047
-rw-r--r--net/mac80211/debugfs_sta.h12
-rw-r--r--net/mac80211/driver-ops.c365
-rw-r--r--net/mac80211/driver-ops.h782
-rw-r--r--net/mac80211/drop.h97
-rw-r--r--net/mac80211/eht.c104
-rw-r--r--net/mac80211/ethtool.c55
-rw-r--r--net/mac80211/fils_aead.c29
-rw-r--r--net/mac80211/fils_aead.h5
-rw-r--r--net/mac80211/he.c326
-rw-r--r--net/mac80211/ht.c284
-rw-r--r--net/mac80211/ibss.c451
-rw-r--r--net/mac80211/ieee80211_i.h1591
-rw-r--r--net/mac80211/iface.c2063
-rw-r--r--net/mac80211/key.c816
-rw-r--r--net/mac80211/key.h43
-rw-r--r--net/mac80211/led.c22
-rw-r--r--net/mac80211/led.h21
-rw-r--r--net/mac80211/link.c641
-rw-r--r--net/mac80211/main.c1019
-rw-r--r--net/mac80211/mesh.c634
-rw-r--r--net/mac80211/mesh.h127
-rw-r--r--net/mac80211/mesh_hwmp.c245
-rw-r--r--net/mac80211/mesh_pathtbl.c615
-rw-r--r--net/mac80211/mesh_plink.c211
-rw-r--r--net/mac80211/mesh_ps.c29
-rw-r--r--net/mac80211/mesh_sync.c35
-rw-r--r--net/mac80211/michael.c7
-rw-r--r--net/mac80211/michael.h5
-rw-r--r--net/mac80211/mlme.c9952
-rw-r--r--net/mac80211/ocb.c58
-rw-r--r--net/mac80211/offchannel.c349
-rw-r--r--net/mac80211/parse.c1142
-rw-r--r--net/mac80211/pm.c46
-rw-r--r--net/mac80211/rate.c190
-rw-r--r--net/mac80211/rate.h52
-rw-r--r--net/mac80211/rc80211_minstrel.c753
-rw-r--r--net/mac80211/rc80211_minstrel.h171
-rw-r--r--net/mac80211/rc80211_minstrel_debugfs.c232
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c1544
-rw-r--r--net/mac80211/rc80211_minstrel_ht.h161
-rw-r--r--net/mac80211/rc80211_minstrel_ht_debugfs.c151
-rw-r--r--net/mac80211/rx.c2934
-rw-r--r--net/mac80211/s1g.c222
-rw-r--r--net/mac80211/scan.c706
-rw-r--r--net/mac80211/spectmgmt.c357
-rw-r--r--net/mac80211/sta_info.c1804
-rw-r--r--net/mac80211/sta_info.h429
-rw-r--r--net/mac80211/status.c758
-rw-r--r--net/mac80211/tdls.c543
-rw-r--r--net/mac80211/tests/Makefile3
-rw-r--r--net/mac80211/tests/chan-mode.c274
-rw-r--r--net/mac80211/tests/elems.c104
-rw-r--r--net/mac80211/tests/mfp.c286
-rw-r--r--net/mac80211/tests/module.c10
-rw-r--r--net/mac80211/tests/s1g_tim.c356
-rw-r--r--net/mac80211/tests/tpe.c284
-rw-r--r--net/mac80211/tests/util.c309
-rw-r--r--net/mac80211/tests/util.h36
-rw-r--r--net/mac80211/tkip.c35
-rw-r--r--net/mac80211/tkip.h9
-rw-r--r--net/mac80211/trace.h1431
-rw-r--r--net/mac80211/trace_msg.h13
-rw-r--r--net/mac80211/tx.c3152
-rw-r--r--net/mac80211/util.c3307
-rw-r--r--net/mac80211/vht.c422
-rw-r--r--net/mac80211/wbrf.c94
-rw-r--r--net/mac80211/wep.c69
-rw-r--r--net/mac80211/wep.h12
-rw-r--r--net/mac80211/wme.c96
-rw-r--r--net/mac80211/wme.h7
-rw-r--r--net/mac80211/wpa.c509
-rw-r--r--net/mac80211/wpa.h20
-rw-r--r--net/mac802154/Kconfig3
-rw-r--r--net/mac802154/Makefile3
-rw-r--r--net/mac802154/cfg.c250
-rw-r--r--net/mac802154/driver-ops.h253
-rw-r--r--net/mac802154/ieee802154_i.h174
-rw-r--r--net/mac802154/iface.c99
-rw-r--r--net/mac802154/llsec.c71
-rw-r--r--net/mac802154/llsec.h12
-rw-r--r--net/mac802154/mac_cmd.c10
-rw-r--r--net/mac802154/main.c105
-rw-r--r--net/mac802154/mib.c10
-rw-r--r--net/mac802154/rx.c211
-rw-r--r--net/mac802154/scan.c917
-rw-r--r--net/mac802154/trace.h27
-rw-r--r--net/mac802154/tx.c159
-rw-r--r--net/mac802154/util.c101
-rw-r--r--net/mctp/Kconfig24
-rw-r--r--net/mctp/Makefile6
-rw-r--r--net/mctp/af_mctp.c911
-rw-r--r--net/mctp/device.c561
-rw-r--r--net/mctp/neigh.c353
-rw-r--r--net/mctp/route.c1790
-rw-r--r--net/mctp/test/route-test.c1598
-rw-r--r--net/mctp/test/sock-test.c396
-rw-r--r--net/mctp/test/utils.c284
-rw-r--r--net/mctp/test/utils.h74
-rw-r--r--net/mpls/Kconfig8
-rw-r--r--net/mpls/Makefile1
-rw-r--r--net/mpls/af_mpls.c807
-rw-r--r--net/mpls/internal.h53
-rw-r--r--net/mpls/mpls_gso.c15
-rw-r--r--net/mpls/mpls_iptunnel.c56
-rw-r--r--net/mptcp/Kconfig39
-rw-r--r--net/mptcp/Makefile15
-rw-r--r--net/mptcp/bpf.c36
-rw-r--r--net/mptcp/crypto.c52
-rw-r--r--net/mptcp/crypto_test.c73
-rw-r--r--net/mptcp/ctrl.c589
-rw-r--r--net/mptcp/diag.c120
-rw-r--r--net/mptcp/fastopen.c62
-rw-r--r--net/mptcp/mib.c130
-rw-r--r--net/mptcp/mib.h128
-rw-r--r--net/mptcp/mptcp_diag.c243
-rw-r--r--net/mptcp/mptcp_pm_gen.c180
-rw-r--r--net/mptcp/mptcp_pm_gen.h59
-rw-r--r--net/mptcp/options.c1719
-rw-r--r--net/mptcp/pm.c1139
-rw-r--r--net/mptcp/pm_kernel.c1624
-rw-r--r--net/mptcp/pm_netlink.c645
-rw-r--r--net/mptcp/pm_userspace.c698
-rw-r--r--net/mptcp/protocol.c4462
-rw-r--r--net/mptcp/protocol.h1366
-rw-r--r--net/mptcp/sched.c215
-rw-r--r--net/mptcp/sockopt.c1654
-rw-r--r--net/mptcp/subflow.c2219
-rw-r--r--net/mptcp/syncookies.c133
-rw-r--r--net/mptcp/token.c422
-rw-r--r--net/mptcp/token_test.c151
-rw-r--r--net/ncsi/Kconfig15
-rw-r--r--net/ncsi/Makefile1
-rw-r--r--net/ncsi/internal.h124
-rw-r--r--net/ncsi/ncsi-aen.c82
-rw-r--r--net/ncsi/ncsi-cmd.c87
-rw-r--r--net/ncsi/ncsi-manage.c973
-rw-r--r--net/ncsi/ncsi-netlink.c510
-rw-r--r--net/ncsi/ncsi-netlink.h19
-rw-r--r--net/ncsi/ncsi-pkt.h88
-rw-r--r--net/ncsi/ncsi-rsp.c317
-rw-r--r--net/netfilter/Kconfig315
-rw-r--r--net/netfilter/Makefile78
-rw-r--r--net/netfilter/core.c296
-rw-r--r--net/netfilter/ipset/Kconfig3
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_gen.h46
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ip.c56
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ipmac.c49
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c61
-rw-r--r--net/netfilter/ipset/ip_set_core.c955
-rw-r--r--net/netfilter/ipset/ip_set_getport.c31
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h863
-rw-r--r--net/netfilter/ipset/ip_set_hash_ip.c63
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmac.c50
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipmark.c41
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipport.c57
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportip.c35
-rw-r--r--net/netfilter/ipset/ip_set_hash_ipportnet.c51
-rw-r--r--net/netfilter/ipset/ip_set_hash_mac.c32
-rw-r--r--net/netfilter/ipset/ip_set_hash_net.c52
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c91
-rw-r--r--net/netfilter/ipset/ip_set_hash_netnet.c76
-rw-r--r--net/netfilter/ipset/ip_set_hash_netport.c51
-rw-r--r--net/netfilter/ipset/ip_set_hash_netportnet.c78
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c100
-rw-r--r--net/netfilter/ipset/pfxlen.c1
-rw-r--r--net/netfilter/ipvs/Kconfig107
-rw-r--r--net/netfilter/ipvs/Makefile1
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c36
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c247
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c653
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c1001
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c11
-rw-r--r--net/netfilter/ipvs/ip_vs_est.c893
-rw-r--r--net/netfilter/ipvs/ip_vs_fo.c11
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c28
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c25
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c29
-rw-r--r--net/netfilter/ipvs/ip_vs_lc.c11
-rw-r--r--net/netfilter/ipvs/ip_vs_mh.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c23
-rw-r--r--net/netfilter/ipvs/ip_vs_nq.c11
-rw-r--r--net/netfilter/ipvs/ip_vs_ovf.c13
-rw-r--r--net/netfilter/ipvs/ip_vs_pe.c7
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c5
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c14
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_ah_esp.c11
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c25
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c50
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c39
-rw-r--r--net/netfilter/ipvs/ip_vs_rr.c11
-rw-r--r--net/netfilter/ipvs/ip_vs_sched.c13
-rw-r--r--net/netfilter/ipvs/ip_vs_sed.c11
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c11
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c216
-rw-r--r--net/netfilter/ipvs/ip_vs_twos.c139
-rw-r--r--net/netfilter/ipvs/ip_vs_wlc.c11
-rw-r--r--net/netfilter/ipvs/ip_vs_wrr.c11
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c425
-rw-r--r--net/netfilter/nf_bpf_link.c332
-rw-r--r--net/netfilter/nf_conncount.c474
-rw-r--r--net/netfilter/nf_conntrack_acct.c115
-rw-r--r--net/netfilter/nf_conntrack_amanda.c27
-rw-r--r--net/netfilter/nf_conntrack_bpf.c550
-rw-r--r--net/netfilter/nf_conntrack_broadcast.c26
-rw-r--r--net/netfilter/nf_conntrack_core.c1928
-rw-r--r--net/netfilter/nf_conntrack_ecache.c469
-rw-r--r--net/netfilter/nf_conntrack_expect.c107
-rw-r--r--net/netfilter/nf_conntrack_extend.c168
-rw-r--r--net/netfilter/nf_conntrack_ftp.c48
-rw-r--r--net/netfilter/nf_conntrack_h323_asn1.c21
-rw-r--r--net/netfilter/nf_conntrack_h323_main.c313
-rw-r--r--net/netfilter/nf_conntrack_h323_types.c3
-rw-r--r--net/netfilter/nf_conntrack_helper.c260
-rw-r--r--net/netfilter/nf_conntrack_irc.c70
-rw-r--r--net/netfilter/nf_conntrack_labels.c41
-rw-r--r--net/netfilter/nf_conntrack_netbios_ns.c11
-rw-r--r--net/netfilter/nf_conntrack_netlink.c1625
-rw-r--r--net/netfilter/nf_conntrack_ovs.c185
-rw-r--r--net/netfilter/nf_conntrack_pptp.c135
-rw-r--r--net/netfilter/nf_conntrack_proto.c805
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c911
-rw-r--r--net/netfilter/nf_conntrack_proto_generic.c107
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c214
-rw-r--r--net/netfilter/nf_conntrack_proto_icmp.c273
-rw-r--r--net/netfilter/nf_conntrack_proto_icmpv6.c268
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c682
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c1080
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c343
-rw-r--r--net/netfilter/nf_conntrack_sane.c84
-rw-r--r--net/netfilter/nf_conntrack_seqadj.c28
-rw-r--r--net/netfilter/nf_conntrack_sip.c101
-rw-r--r--net/netfilter/nf_conntrack_snmp.c6
-rw-r--r--net/netfilter/nf_conntrack_standalone.c712
-rw-r--r--net/netfilter/nf_conntrack_tftp.c24
-rw-r--r--net/netfilter/nf_conntrack_timeout.c126
-rw-r--r--net/netfilter/nf_conntrack_timestamp.c95
-rw-r--r--net/netfilter/nf_dup_netdev.c67
-rw-r--r--net/netfilter/nf_flow_table_bpf.c121
-rw-r--r--net/netfilter/nf_flow_table_core.c724
-rw-r--r--net/netfilter/nf_flow_table_inet.c74
-rw-r--r--net/netfilter/nf_flow_table_ip.c902
-rw-r--r--net/netfilter/nf_flow_table_offload.c1241
-rw-r--r--net/netfilter/nf_flow_table_path.c330
-rw-r--r--net/netfilter/nf_flow_table_procfs.c80
-rw-r--r--net/netfilter/nf_flow_table_xdp.c147
-rw-r--r--net/netfilter/nf_hooks_lwtunnel.c123
-rw-r--r--net/netfilter/nf_internals.h26
-rw-r--r--net/netfilter/nf_log.c75
-rw-r--r--net/netfilter/nf_log_common.c217
-rw-r--r--net/netfilter/nf_log_netdev.c81
-rw-r--r--net/netfilter/nf_log_syslog.c1085
-rw-r--r--net/netfilter/nf_nat_amanda.c29
-rw-r--r--net/netfilter/nf_nat_bpf.c77
-rw-r--r--net/netfilter/nf_nat_core.c905
-rw-r--r--net/netfilter/nf_nat_ftp.c31
-rw-r--r--net/netfilter/nf_nat_helper.c59
-rw-r--r--net/netfilter/nf_nat_irc.c31
-rw-r--r--net/netfilter/nf_nat_masquerade.c368
-rw-r--r--net/netfilter/nf_nat_ovs.c136
-rw-r--r--net/netfilter/nf_nat_proto.c1119
-rw-r--r--net/netfilter/nf_nat_proto_common.c120
-rw-r--r--net/netfilter/nf_nat_proto_dccp.c82
-rw-r--r--net/netfilter/nf_nat_proto_sctp.c77
-rw-r--r--net/netfilter/nf_nat_proto_tcp.c85
-rw-r--r--net/netfilter/nf_nat_proto_udp.c130
-rw-r--r--net/netfilter/nf_nat_proto_unknown.c54
-rw-r--r--net/netfilter/nf_nat_redirect.c117
-rw-r--r--net/netfilter/nf_nat_sip.c75
-rw-r--r--net/netfilter/nf_nat_tftp.c16
-rw-r--r--net/netfilter/nf_queue.c259
-rw-r--r--net/netfilter/nf_sockopt.c60
-rw-r--r--net/netfilter/nf_synproxy_core.c940
-rw-r--r--net/netfilter/nf_tables_api.c9501
-rw-r--r--net/netfilter/nf_tables_core.c273
-rw-r--r--net/netfilter/nf_tables_offload.c699
-rw-r--r--net/netfilter/nf_tables_set_core.c28
-rw-r--r--net/netfilter/nf_tables_trace.c189
-rw-r--r--net/netfilter/nfnetlink.c303
-rw-r--r--net/netfilter/nfnetlink_acct.c146
-rw-r--r--net/netfilter/nfnetlink_cthelper.c139
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c406
-rw-r--r--net/netfilter/nfnetlink_hook.c486
-rw-r--r--net/netfilter/nfnetlink_log.c197
-rw-r--r--net/netfilter/nfnetlink_osf.c107
-rw-r--r--net/netfilter/nfnetlink_queue.c512
-rw-r--r--net/netfilter/nft_bitwise.c616
-rw-r--r--net/netfilter/nft_byteorder.c72
-rw-r--r--net/netfilter/nft_chain_filter.c197
-rw-r--r--net/netfilter/nft_chain_nat.c149
-rw-r--r--net/netfilter/nft_chain_route.c169
-rw-r--r--net/netfilter/nft_cmp.c292
-rw-r--r--net/netfilter/nft_compat.c442
-rw-r--r--net/netfilter/nft_connlimit.c97
-rw-r--r--net/netfilter/nft_counter.c178
-rw-r--r--net/netfilter/nft_ct.c402
-rw-r--r--net/netfilter/nft_ct_fast.c62
-rw-r--r--net/netfilter/nft_dup_netdev.c36
-rw-r--r--net/netfilter/nft_dynset.c328
-rw-r--r--net/netfilter/nft_exthdr.c522
-rw-r--r--net/netfilter/nft_fib.c92
-rw-r--r--net/netfilter/nft_fib_inet.c8
-rw-r--r--net/netfilter/nft_fib_netdev.c10
-rw-r--r--net/netfilter/nft_flow_offload.c154
-rw-r--r--net/netfilter/nft_fwd_netdev.c71
-rw-r--r--net/netfilter/nft_hash.c178
-rw-r--r--net/netfilter/nft_immediate.c246
-rw-r--r--net/netfilter/nft_inner.c431
-rw-r--r--net/netfilter/nft_last.c138
-rw-r--r--net/netfilter/nft_limit.c225
-rw-r--r--net/netfilter/nft_log.c34
-rw-r--r--net/netfilter/nft_lookup.c224
-rw-r--r--net/netfilter/nft_masq.c248
-rw-r--r--net/netfilter/nft_meta.c837
-rw-r--r--net/netfilter/nft_nat.c242
-rw-r--r--net/netfilter/nft_numgen.c185
-rw-r--r--net/netfilter/nft_objref.c158
-rw-r--r--net/netfilter/nft_osf.c115
-rw-r--r--net/netfilter/nft_payload.c792
-rw-r--r--net/netfilter/nft_queue.c52
-rw-r--r--net/netfilter/nft_quota.c115
-rw-r--r--net/netfilter/nft_range.c39
-rw-r--r--net/netfilter/nft_redir.c227
-rw-r--r--net/netfilter/nft_reject.c35
-rw-r--r--net/netfilter/nft_reject_inet.c80
-rw-r--r--net/netfilter/nft_reject_netdev.c190
-rw-r--r--net/netfilter/nft_rt.c46
-rw-r--r--net/netfilter/nft_set_bitmap.c105
-rw-r--r--net/netfilter/nft_set_hash.c544
-rw-r--r--net/netfilter/nft_set_pipapo.c2400
-rw-r--r--net/netfilter/nft_set_pipapo.h302
-rw-r--r--net/netfilter/nft_set_pipapo_avx2.c1302
-rw-r--r--net/netfilter/nft_set_pipapo_avx2.h16
-rw-r--r--net/netfilter/nft_set_rbtree.c680
-rw-r--r--net/netfilter/nft_socket.c221
-rw-r--r--net/netfilter/nft_synproxy.c397
-rw-r--r--net/netfilter/nft_tproxy.c78
-rw-r--r--net/netfilter/nft_tunnel.c313
-rw-r--r--net/netfilter/nft_xfrm.c324
-rw-r--r--net/netfilter/utils.c81
-rw-r--r--net/netfilter/x_tables.c312
-rw-r--r--net/netfilter/xt_AUDIT.c7
-rw-r--r--net/netfilter/xt_CHECKSUM.c38
-rw-r--r--net/netfilter/xt_CLASSIFY.c21
-rw-r--r--net/netfilter/xt_CONNSECMARK.c44
-rw-r--r--net/netfilter/xt_CT.c244
-rw-r--r--net/netfilter/xt_DSCP.c21
-rw-r--r--net/netfilter/xt_HL.c9
-rw-r--r--net/netfilter/xt_HMARK.c13
-rw-r--r--net/netfilter/xt_IDLETIMER.c373
-rw-r--r--net/netfilter/xt_LED.c69
-rw-r--r--net/netfilter/xt_LOG.c16
-rw-r--r--net/netfilter/xt_MASQUERADE.c128
-rw-r--r--net/netfilter/xt_NETMAP.c5
-rw-r--r--net/netfilter/xt_NFLOG.c52
-rw-r--r--net/netfilter/xt_NFQUEUE.c6
-rw-r--r--net/netfilter/xt_RATEEST.c68
-rw-r--r--net/netfilter/xt_REDIRECT.c15
-rw-r--r--net/netfilter/xt_SECMARK.c121
-rw-r--r--net/netfilter/xt_TCPMSS.c11
-rw-r--r--net/netfilter/xt_TCPOPTSTRIP.c37
-rw-r--r--net/netfilter/xt_TEE.c81
-rw-r--r--net/netfilter/xt_TPROXY.c44
-rw-r--r--net/netfilter/xt_TRACE.c38
-rw-r--r--net/netfilter/xt_addrtype.c36
-rw-r--r--net/netfilter/xt_bpf.c7
-rw-r--r--net/netfilter/xt_cgroup.c103
-rw-r--r--net/netfilter/xt_cluster.c38
-rw-r--r--net/netfilter/xt_comment.c1
-rw-r--r--net/netfilter/xt_connbytes.c4
-rw-r--r--net/netfilter/xt_connlabel.c7
-rw-r--r--net/netfilter/xt_connlimit.c76
-rw-r--r--net/netfilter/xt_connmark.c62
-rw-r--r--net/netfilter/xt_conntrack.c5
-rw-r--r--net/netfilter/xt_cpu.c6
-rw-r--r--net/netfilter/xt_dccp.c5
-rw-r--r--net/netfilter/xt_devgroup.c5
-rw-r--r--net/netfilter/xt_dscp.c5
-rw-r--r--net/netfilter/xt_ecn.c5
-rw-r--r--net/netfilter/xt_esp.c5
-rw-r--r--net/netfilter/xt_hashlimit.c119
-rw-r--r--net/netfilter/xt_helper.c5
-rw-r--r--net/netfilter/xt_hl.c5
-rw-r--r--net/netfilter/xt_ipcomp.c6
-rw-r--r--net/netfilter/xt_iprange.c9
-rw-r--r--net/netfilter/xt_ipvs.c1
-rw-r--r--net/netfilter/xt_l2tp.c5
-rw-r--r--net/netfilter/xt_length.c10
-rw-r--r--net/netfilter/xt_limit.c57
-rw-r--r--net/netfilter/xt_mac.c5
-rw-r--r--net/netfilter/xt_mark.c47
-rw-r--r--net/netfilter/xt_multiport.c5
-rw-r--r--net/netfilter/xt_nat.c10
-rw-r--r--net/netfilter/xt_nfacct.c49
-rw-r--r--net/netfilter/xt_osf.c24
-rw-r--r--net/netfilter/xt_owner.c45
-rw-r--r--net/netfilter/xt_physdev.c29
-rw-r--r--net/netfilter/xt_pkttype.c5
-rw-r--r--net/netfilter/xt_policy.c7
-rw-r--r--net/netfilter/xt_quota.c1
-rw-r--r--net/netfilter/xt_rateest.c5
-rw-r--r--net/netfilter/xt_realm.c5
-rw-r--r--net/netfilter/xt_recent.c60
-rw-r--r--net/netfilter/xt_repldata.h2
-rw-r--r--net/netfilter/xt_sctp.c4
-rw-r--r--net/netfilter/xt_set.c51
-rw-r--r--net/netfilter/xt_socket.c26
-rw-r--r--net/netfilter/xt_state.c5
-rw-r--r--net/netfilter/xt_statistic.c7
-rw-r--r--net/netfilter/xt_string.c5
-rw-r--r--net/netfilter/xt_tcpmss.c5
-rw-r--r--net/netfilter/xt_tcpudp.c111
-rw-r--r--net/netfilter/xt_time.c42
-rw-r--r--net/netfilter/xt_u32.c22
-rw-r--r--net/netlabel/Kconfig5
-rw-r--r--net/netlabel/netlabel_addrlist.c16
-rw-r--r--net/netlabel/netlabel_addrlist.h16
-rw-r--r--net/netlabel/netlabel_calipso.c88
-rw-r--r--net/netlabel/netlabel_calipso.h16
-rw-r--r--net/netlabel/netlabel_cipso_v4.c101
-rw-r--r--net/netlabel/netlabel_cipso_v4.h19
-rw-r--r--net/netlabel/netlabel_domainhash.c87
-rw-r--r--net/netlabel/netlabel_domainhash.h18
-rw-r--r--net/netlabel/netlabel_kapi.c78
-rw-r--r--net/netlabel/netlabel_mgmt.c92
-rw-r--r--net/netlabel/netlabel_mgmt.h16
-rw-r--r--net/netlabel/netlabel_unlabeled.c124
-rw-r--r--net/netlabel/netlabel_unlabeled.h16
-rw-r--r--net/netlabel/netlabel_user.c27
-rw-r--r--net/netlabel/netlabel_user.h22
-rw-r--r--net/netlink/Kconfig3
-rw-r--r--net/netlink/Makefile3
-rw-r--r--net/netlink/af_netlink.c928
-rw-r--r--net/netlink/af_netlink.h32
-rw-r--r--net/netlink/diag.c20
-rw-r--r--net/netlink/genetlink.c1335
-rw-r--r--net/netlink/genetlink.h11
-rw-r--r--net/netlink/policy.c493
-rw-r--r--net/netrom/Makefile1
-rw-r--r--net/netrom/af_netrom.c178
-rw-r--r--net/netrom/nr_dev.c15
-rw-r--r--net/netrom/nr_in.c14
-rw-r--r--net/netrom/nr_loopback.c12
-rw-r--r--net/netrom/nr_out.c7
-rw-r--r--net/netrom/nr_route.c68
-rw-r--r--net/netrom/nr_subr.c17
-rw-r--r--net/netrom/nr_timer.c57
-rw-r--r--net/netrom/sysctl_net_netrom.c11
-rw-r--r--net/nfc/Kconfig4
-rw-r--r--net/nfc/af_nfc.c19
-rw-r--r--net/nfc/core.c109
-rw-r--r--net/nfc/digital.h11
-rw-r--r--net/nfc/digital_core.c30
-rw-r--r--net/nfc/digital_dep.c22
-rw-r--r--net/nfc/digital_technology.c19
-rw-r--r--net/nfc/hci/Kconfig17
-rw-r--r--net/nfc/hci/Makefile1
-rw-r--r--net/nfc/hci/command.c32
-rw-r--r--net/nfc/hci/core.c55
-rw-r--r--net/nfc/hci/hci.h14
-rw-r--r--net/nfc/hci/hcp.c26
-rw-r--r--net/nfc/hci/llc.c46
-rw-r--r--net/nfc/hci/llc.h20
-rw-r--r--net/nfc/hci/llc_nop.c15
-rw-r--r--net/nfc/hci/llc_shdlc.c78
-rw-r--r--net/nfc/llcp.h24
-rw-r--r--net/nfc/llcp_commands.c101
-rw-r--r--net/nfc/llcp_core.c225
-rw-r--r--net/nfc/llcp_sock.c135
-rw-r--r--net/nfc/nci/Kconfig1
-rw-r--r--net/nfc/nci/core.c317
-rw-r--r--net/nfc/nci/data.c36
-rw-r--r--net/nfc/nci/hci.c104
-rw-r--r--net/nfc/nci/lib.c14
-rw-r--r--net/nfc/nci/ntf.c268
-rw-r--r--net/nfc/nci/rsp.c137
-rw-r--r--net/nfc/nci/spi.c26
-rw-r--r--net/nfc/nci/uart.c215
-rw-r--r--net/nfc/netlink.c209
-rw-r--r--net/nfc/nfc.h17
-rw-r--r--net/nfc/rawsock.c39
-rw-r--r--net/nsh/Kconfig3
-rw-r--r--net/nsh/Makefile1
-rw-r--r--net/nsh/nsh.c26
-rw-r--r--net/openvswitch/Kconfig16
-rw-r--r--net/openvswitch/Makefile3
-rw-r--r--net/openvswitch/actions.c681
-rw-r--r--net/openvswitch/conntrack.c735
-rw-r--r--net/openvswitch/conntrack.h16
-rw-r--r--net/openvswitch/datapath.c797
-rw-r--r--net/openvswitch/datapath.h137
-rw-r--r--net/openvswitch/dp_notify.c19
-rw-r--r--net/openvswitch/drop.h41
-rw-r--r--net/openvswitch/flow.c444
-rw-r--r--net/openvswitch/flow.h53
-rw-r--r--net/openvswitch/flow_netlink.c894
-rw-r--r--net/openvswitch/flow_netlink.h18
-rw-r--r--net/openvswitch/flow_table.c790
-rw-r--r--net/openvswitch/flow_table.h65
-rw-r--r--net/openvswitch/meter.c373
-rw-r--r--net/openvswitch/meter.h28
-rw-r--r--net/openvswitch/openvswitch_trace.c10
-rw-r--r--net/openvswitch/openvswitch_trace.h158
-rw-r--r--net/openvswitch/vport-geneve.c10
-rw-r--r--net/openvswitch/vport-gre.c19
-rw-r--r--net/openvswitch/vport-internal_dev.c113
-rw-r--r--net/openvswitch/vport-internal_dev.h15
-rw-r--r--net/openvswitch/vport-netdev.c47
-rw-r--r--net/openvswitch/vport-netdev.h15
-rw-r--r--net/openvswitch/vport-vxlan.c25
-rw-r--r--net/openvswitch/vport.c118
-rw-r--r--net/openvswitch/vport.h48
-rw-r--r--net/packet/Kconfig5
-rw-r--r--net/packet/Makefile1
-rw-r--r--net/packet/af_packet.c1405
-rw-r--r--net/packet/diag.c25
-rw-r--r--net/packet/internal.h68
-rw-r--r--net/phonet/Kconfig1
-rw-r--r--net/phonet/af_phonet.c32
-rw-r--r--net/phonet/datagram.c34
-rw-r--r--net/phonet/pep-gprs.c19
-rw-r--r--net/phonet/pep.c136
-rw-r--r--net/phonet/pn_dev.c103
-rw-r--r--net/phonet/pn_netlink.c177
-rw-r--r--net/phonet/socket.c84
-rw-r--r--net/phonet/sysctl.c21
-rw-r--r--net/psample/Kconfig2
-rw-r--r--net/psample/Makefile1
-rw-r--r--net/psample/psample.c272
-rw-r--r--net/psp/Kconfig15
-rw-r--r--net/psp/Makefile5
-rw-r--r--net/psp/psp-nl-gen.c139
-rw-r--r--net/psp/psp-nl-gen.h42
-rw-r--r--net/psp/psp.h54
-rw-r--r--net/psp/psp_main.c323
-rw-r--r--net/psp/psp_nl.c598
-rw-r--r--net/psp/psp_sock.c294
-rw-r--r--net/qrtr/Kconfig15
-rw-r--r--net/qrtr/Makefile6
-rw-r--r--net/qrtr/af_qrtr.c (renamed from net/qrtr/qrtr.c)567
-rw-r--r--net/qrtr/mhi.c183
-rw-r--r--net/qrtr/ns.c778
-rw-r--r--net/qrtr/qrtr.h4
-rw-r--r--net/qrtr/smd.c10
-rw-r--r--net/qrtr/tun.c29
-rw-r--r--net/rds/Kconfig20
-rw-r--r--net/rds/Makefile7
-rw-r--r--net/rds/af_rds.c190
-rw-r--r--net/rds/bind.c61
-rw-r--r--net/rds/cong.c2
-rw-r--r--net/rds/connection.c76
-rw-r--r--net/rds/ib.c83
-rw-r--r--net/rds/ib.h39
-rw-r--r--net/rds/ib_cm.c365
-rw-r--r--net/rds/ib_fmr.c258
-rw-r--r--net/rds/ib_frmr.c115
-rw-r--r--net/rds/ib_mr.h26
-rw-r--r--net/rds/ib_rdma.c178
-rw-r--r--net/rds/ib_recv.c81
-rw-r--r--net/rds/ib_ring.c2
-rw-r--r--net/rds/ib_send.c121
-rw-r--r--net/rds/ib_stats.c2
-rw-r--r--net/rds/ib_sysctl.c1
-rw-r--r--net/rds/info.c7
-rw-r--r--net/rds/message.c47
-rw-r--r--net/rds/page.c25
-rw-r--r--net/rds/rdma.c318
-rw-r--r--net/rds/rdma_transport.c47
-rw-r--r--net/rds/rdma_transport.h7
-rw-r--r--net/rds/rds.h100
-rw-r--r--net/rds/recv.c78
-rw-r--r--net/rds/send.c130
-rw-r--r--net/rds/stats.c6
-rw-r--r--net/rds/sysctl.c1
-rw-r--r--net/rds/tcp.c103
-rw-r--r--net/rds/tcp.h10
-rw-r--r--net/rds/tcp_connect.c12
-rw-r--r--net/rds/tcp_listen.c90
-rw-r--r--net/rds/tcp_recv.c8
-rw-r--r--net/rds/tcp_send.c32
-rw-r--r--net/rds/threads.c3
-rw-r--r--net/rds/transport.c26
-rw-r--r--net/rfkill/Kconfig1
-rw-r--r--net/rfkill/Makefile1
-rw-r--r--net/rfkill/core.c201
-rw-r--r--net/rfkill/input.c11
-rw-r--r--net/rfkill/rfkill-gpio.c84
-rw-r--r--net/rfkill/rfkill.h6
-rw-r--r--net/rose/Makefile1
-rw-r--r--net/rose/af_rose.c169
-rw-r--r--net/rose/rose_dev.c13
-rw-r--r--net/rose/rose_in.c23
-rw-r--r--net/rose/rose_link.c26
-rw-r--r--net/rose/rose_loopback.c52
-rw-r--r--net/rose/rose_out.c5
-rw-r--r--net/rose/rose_route.c131
-rw-r--r--net/rose/rose_subr.c26
-rw-r--r--net/rose/rose_timer.c62
-rw-r--r--net/rose/sysctl_net_rose.c6
-rw-r--r--net/rxrpc/Kconfig56
-rw-r--r--net/rxrpc/Makefile13
-rw-r--r--net/rxrpc/af_rxrpc.c382
-rw-r--r--net/rxrpc/ar-internal.h1270
-rw-r--r--net/rxrpc/call_accept.c497
-rw-r--r--net/rxrpc/call_event.c738
-rw-r--r--net/rxrpc/call_object.c703
-rw-r--r--net/rxrpc/call_state.c69
-rw-r--r--net/rxrpc/conn_client.c1229
-rw-r--r--net/rxrpc/conn_event.c625
-rw-r--r--net/rxrpc/conn_object.c388
-rw-r--r--net/rxrpc/conn_service.c41
-rw-r--r--net/rxrpc/input.c1959
-rw-r--r--net/rxrpc/input_rack.c418
-rw-r--r--net/rxrpc/insecure.c68
-rw-r--r--net/rxrpc/io_thread.c602
-rw-r--r--net/rxrpc/key.c835
-rw-r--r--net/rxrpc/local_event.c65
-rw-r--r--net/rxrpc/local_object.c398
-rw-r--r--net/rxrpc/misc.c47
-rw-r--r--net/rxrpc/net_ns.c50
-rw-r--r--net/rxrpc/oob.c379
-rw-r--r--net/rxrpc/output.c1135
-rw-r--r--net/rxrpc/peer_event.c353
-rw-r--r--net/rxrpc/peer_object.c219
-rw-r--r--net/rxrpc/proc.c502
-rw-r--r--net/rxrpc/protocol.h59
-rw-r--r--net/rxrpc/recvmsg.c617
-rw-r--r--net/rxrpc/rtt.c208
-rw-r--r--net/rxrpc/rxgk.c1373
-rw-r--r--net/rxrpc/rxgk_app.c297
-rw-r--r--net/rxrpc/rxgk_common.h149
-rw-r--r--net/rxrpc/rxgk_kdf.c288
-rw-r--r--net/rxrpc/rxkad.c1213
-rw-r--r--net/rxrpc/rxperf.c703
-rw-r--r--net/rxrpc/security.c176
-rw-r--r--net/rxrpc/sendmsg.c750
-rw-r--r--net/rxrpc/server_key.c213
-rw-r--r--net/rxrpc/skbuff.c68
-rw-r--r--net/rxrpc/sysctl.c81
-rw-r--r--net/rxrpc/txbuf.c96
-rw-r--r--net/rxrpc/utils.c29
-rw-r--r--net/sched/Kconfig452
-rw-r--r--net/sched/Makefile18
-rw-r--r--net/sched/act_api.c1680
-rw-r--r--net/sched/act_bpf.c113
-rw-r--r--net/sched/act_connmark.c196
-rw-r--r--net/sched/act_csum.c172
-rw-r--r--net/sched/act_ct.c1698
-rw-r--r--net/sched/act_ctinfo.c403
-rw-r--r--net/sched/act_gact.c153
-rw-r--r--net/sched/act_gate.c676
-rw-r--r--net/sched/act_ife.c177
-rw-r--r--net/sched/act_ipt.c452
-rw-r--r--net/sched/act_meta_mark.c7
-rw-r--r--net/sched/act_meta_skbprio.c7
-rw-r--r--net/sched/act_meta_skbtcindex.c7
-rw-r--r--net/sched/act_mirred.c506
-rw-r--r--net/sched/act_mpls.c489
-rw-r--r--net/sched/act_nat.c159
-rw-r--r--net/sched/act_pedit.c531
-rw-r--r--net/sched/act_police.c476
-rw-r--r--net/sched/act_sample.c201
-rw-r--r--net/sched/act_simple.c135
-rw-r--r--net/sched/act_skbedit.c252
-rw-r--r--net/sched/act_skbmod.c165
-rw-r--r--net/sched/act_tunnel_key.c447
-rw-r--r--net/sched/act_vlan.c239
-rw-r--r--net/sched/bpf_qdisc.c472
-rw-r--r--net/sched/cls_api.c2966
-rw-r--r--net/sched/cls_basic.c82
-rw-r--r--net/sched/cls_bpf.c206
-rw-r--r--net/sched/cls_cgroup.c40
-rw-r--r--net/sched/cls_flow.c60
-rw-r--r--net/sched/cls_flower.c2584
-rw-r--r--net/sched/cls_fw.c84
-rw-r--r--net/sched/cls_matchall.c200
-rw-r--r--net/sched/cls_route.c115
-rw-r--r--net/sched/cls_rsvp.c28
-rw-r--r--net/sched/cls_rsvp.h772
-rw-r--r--net/sched/cls_rsvp6.c28
-rw-r--r--net/sched/cls_tcindex.c673
-rw-r--r--net/sched/cls_u32.c484
-rw-r--r--net/sched/em_canid.c11
-rw-r--r--net/sched/em_cmp.c16
-rw-r--r--net/sched/em_ipset.c7
-rw-r--r--net/sched/em_ipt.c62
-rw-r--r--net/sched/em_meta.c39
-rw-r--r--net/sched/em_nbyte.c13
-rw-r--r--net/sched/em_text.c24
-rw-r--r--net/sched/em_u32.c7
-rw-r--r--net/sched/ematch.c25
-rw-r--r--net/sched/sch_api.c1062
-rw-r--r--net/sched/sch_atm.c705
-rw-r--r--net/sched/sch_blackhole.c6
-rw-r--r--net/sched/sch_cake.c781
-rw-r--r--net/sched/sch_cbq.c1807
-rw-r--r--net/sched/sch_cbs.c166
-rw-r--r--net/sched/sch_choke.c62
-rw-r--r--net/sched/sch_codel.c102
-rw-r--r--net/sched/sch_drr.c91
-rw-r--r--net/sched/sch_dsmark.c516
-rw-r--r--net/sched/sch_dualpi2.c1177
-rw-r--r--net/sched/sch_etf.c117
-rw-r--r--net/sched/sch_ets.c839
-rw-r--r--net/sched/sch_fifo.c125
-rw-r--r--net/sched/sch_fq.c883
-rw-r--r--net/sched/sch_fq_codel.c153
-rw-r--r--net/sched/sch_fq_pie.c595
-rw-r--r--net/sched/sch_frag.c160
-rw-r--r--net/sched/sch_generic.c869
-rw-r--r--net/sched/sch_gred.c424
-rw-r--r--net/sched/sch_hfsc.c159
-rw-r--r--net/sched/sch_hhf.c72
-rw-r--r--net/sched/sch_htb.c851
-rw-r--r--net/sched/sch_ingress.c108
-rw-r--r--net/sched/sch_mq.c71
-rw-r--r--net/sched/sch_mqprio.c550
-rw-r--r--net/sched/sch_mqprio_lib.c132
-rw-r--r--net/sched/sch_mqprio_lib.h20
-rw-r--r--net/sched/sch_multiq.c63
-rw-r--r--net/sched/sch_netem.c448
-rw-r--r--net/sched/sch_pie.c457
-rw-r--r--net/sched/sch_plug.c13
-rw-r--r--net/sched/sch_prio.c103
-rw-r--r--net/sched/sch_qfq.c219
-rw-r--r--net/sched/sch_red.c249
-rw-r--r--net/sched/sch_sfb.c73
-rw-r--r--net/sched/sch_sfq.c169
-rw-r--r--net/sched/sch_skbprio.c34
-rw-r--r--net/sched/sch_taprio.c2574
-rw-r--r--net/sched/sch_tbf.c133
-rw-r--r--net/sched/sch_teql.c21
-rw-r--r--net/sctp/Kconfig55
-rw-r--r--net/sctp/Makefile3
-rw-r--r--net/sctp/associola.c193
-rw-r--r--net/sctp/auth.c300
-rw-r--r--net/sctp/bind_addr.c86
-rw-r--r--net/sctp/chunk.c74
-rw-r--r--net/sctp/debug.c18
-rw-r--r--net/sctp/diag.c136
-rw-r--r--net/sctp/endpointola.c158
-rw-r--r--net/sctp/input.c455
-rw-r--r--net/sctp/inqueue.c44
-rw-r--r--net/sctp/ipv6.c328
-rw-r--r--net/sctp/objcnt.c17
-rw-r--r--net/sctp/offload.c25
-rw-r--r--net/sctp/output.c115
-rw-r--r--net/sctp/outqueue.c102
-rw-r--r--net/sctp/primitive.c19
-rw-r--r--net/sctp/proc.c61
-rw-r--r--net/sctp/protocol.c380
-rw-r--r--net/sctp/sm_make_chunk.c339
-rw-r--r--net/sctp/sm_sideeffect.c245
-rw-r--r--net/sctp/sm_statefuns.c529
-rw-r--r--net/sctp/sm_statetable.c90
-rw-r--r--net/sctp/socket.c3556
-rw-r--r--net/sctp/stream.c279
-rw-r--r--net/sctp/stream_interleave.c171
-rw-r--r--net/sctp/stream_sched.c82
-rw-r--r--net/sctp/stream_sched_fc.c225
-rw-r--r--net/sctp/stream_sched_prio.c85
-rw-r--r--net/sctp/stream_sched_rr.c24
-rw-r--r--net/sctp/sysctl.c359
-rw-r--r--net/sctp/transport.c244
-rw-r--r--net/sctp/tsnmap.c19
-rw-r--r--net/sctp/ulpevent.c98
-rw-r--r--net/sctp/ulpqueue.c111
-rw-r--r--net/shaper/Makefile8
-rw-r--r--net/shaper/shaper.c1438
-rw-r--r--net/shaper/shaper_nl_gen.c155
-rw-r--r--net/shaper/shaper_nl_gen.h45
-rw-r--r--net/smc/Kconfig17
-rw-r--r--net/smc/Makefile7
-rw-r--r--net/smc/af_smc.c2938
-rw-r--r--net/smc/smc.h192
-rw-r--r--net/smc/smc_cdc.c299
-rw-r--r--net/smc/smc_cdc.h135
-rw-r--r--net/smc/smc_clc.c1265
-rw-r--r--net/smc/smc_clc.h424
-rw-r--r--net/smc/smc_close.c164
-rw-r--r--net/smc/smc_close.h3
-rw-r--r--net/smc/smc_core.c2580
-rw-r--r--net/smc/smc_core.h459
-rw-r--r--net/smc/smc_diag.c98
-rw-r--r--net/smc/smc_hs_bpf.c140
-rw-r--r--net/smc/smc_hs_bpf.h31
-rw-r--r--net/smc/smc_ib.c640
-rw-r--r--net/smc/smc_ib.h54
-rw-r--r--net/smc/smc_inet.c163
-rw-r--r--net/smc/smc_inet.h22
-rw-r--r--net/smc/smc_ism.c495
-rw-r--r--net/smc/smc_ism.h97
-rw-r--r--net/smc/smc_llc.c2203
-rw-r--r--net/smc/smc_llc.h80
-rw-r--r--net/smc/smc_netlink.c157
-rw-r--r--net/smc/smc_netlink.h34
-rw-r--r--net/smc/smc_netns.h21
-rw-r--r--net/smc/smc_pnet.c1163
-rw-r--r--net/smc/smc_pnet.h42
-rw-r--r--net/smc/smc_rx.c192
-rw-r--r--net/smc/smc_rx.h8
-rw-r--r--net/smc/smc_stats.c419
-rw-r--r--net/smc/smc_stats.h280
-rw-r--r--net/smc/smc_sysctl.c266
-rw-r--r--net/smc/smc_sysctl.h37
-rw-r--r--net/smc/smc_tracepoint.c9
-rw-r--r--net/smc/smc_tracepoint.h125
-rw-r--r--net/smc/smc_tx.c297
-rw-r--r--net/smc/smc_tx.h1
-rw-r--r--net/smc/smc_wr.c498
-rw-r--r--net/smc/smc_wr.h47
-rw-r--r--net/socket.c2438
-rw-r--r--net/strparser/Kconfig5
-rw-r--r--net/strparser/Makefile1
-rw-r--r--net/strparser/strparser.c88
-rw-r--r--net/sunrpc/.kunitconfig29
-rw-r--r--net/sunrpc/Kconfig73
-rw-r--r--net/sunrpc/Makefile4
-rw-r--r--net/sunrpc/addr.c49
-rw-r--r--net/sunrpc/auth.c619
-rw-r--r--net/sunrpc/auth_generic.c299
-rw-r--r--net/sunrpc/auth_gss/Makefile8
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c974
-rw-r--r--net/sunrpc/auth_gss/auth_gss_internal.h47
-rw-r--r--net/sunrpc/auth_gss/gss_generic_token.c233
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c1094
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_internal.h195
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_keys.c432
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c996
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seal.c142
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seqnum.c166
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_test.c1859
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_unseal.c133
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_wrap.c436
-rw-r--r--net/sunrpc/auth_gss/gss_mech_switch.c100
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_upcall.c35
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_upcall.h16
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c50
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.h17
-rw-r--r--net/sunrpc/auth_gss/svcauth_gss.c1442
-rw-r--r--net/sunrpc/auth_gss/trace.c14
-rw-r--r--net/sunrpc/auth_null.c66
-rw-r--r--net/sunrpc/auth_tls.c175
-rw-r--r--net/sunrpc/auth_unix.c239
-rw-r--r--net/sunrpc/backchannel_rqst.c139
-rw-r--r--net/sunrpc/cache.c582
-rw-r--r--net/sunrpc/clnt.c1875
-rw-r--r--net/sunrpc/debugfs.c197
-rw-r--r--net/sunrpc/fail.h25
-rw-r--r--net/sunrpc/netns.h1
-rw-r--r--net/sunrpc/rpc_pipe.c615
-rw-r--r--net/sunrpc/rpcb_clnt.c193
-rw-r--r--net/sunrpc/sched.c692
-rw-r--r--net/sunrpc/socklib.c307
-rw-r--r--net/sunrpc/socklib.h15
-rw-r--r--net/sunrpc/stats.c58
-rw-r--r--net/sunrpc/sunrpc.h25
-rw-r--r--net/sunrpc/sunrpc_syms.c14
-rw-r--r--net/sunrpc/svc.c1309
-rw-r--r--net/sunrpc/svc_xprt.c922
-rw-r--r--net/sunrpc/svcauth.c199
-rw-r--r--net/sunrpc/svcauth_unix.c334
-rw-r--r--net/sunrpc/svcsock.c1206
-rw-r--r--net/sunrpc/sysctl.c92
-rw-r--r--net/sunrpc/sysfs.c829
-rw-r--r--net/sunrpc/sysfs.h35
-rw-r--r--net/sunrpc/timer.c1
-rw-r--r--net/sunrpc/xdr.c1479
-rw-r--r--net/sunrpc/xprt.c1580
-rw-r--r--net/sunrpc/xprtmultipath.c288
-rw-r--r--net/sunrpc/xprtrdma/Makefile5
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c229
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c348
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c914
-rw-r--r--net/sunrpc/xprtrdma/ib_client.c184
-rw-r--r--net/sunrpc/xprtrdma/module.c19
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c994
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c257
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c203
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_pcl.c306
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c876
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_rw.c1130
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c1110
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c448
-rw-r--r--net/sunrpc/xprtrdma/transport.c487
-rw-r--r--net/sunrpc/xprtrdma/verbs.c1732
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h438
-rw-r--r--net/sunrpc/xprtsock.c2590
-rw-r--r--net/switchdev/Kconfig3
-rw-r--r--net/switchdev/Makefile3
-rw-r--r--net/switchdev/switchdev.c1057
-rw-r--r--net/sysctl_net.c65
-rw-r--r--net/tipc/Kconfig24
-rw-r--r--net/tipc/Makefile11
-rw-r--r--net/tipc/addr.c15
-rw-r--r--net/tipc/addr.h48
-rw-r--r--net/tipc/bcast.c346
-rw-r--r--net/tipc/bcast.h25
-rw-r--r--net/tipc/bearer.c403
-rw-r--r--net/tipc/bearer.h51
-rw-r--r--net/tipc/core.c101
-rw-r--r--net/tipc/core.h48
-rw-r--r--net/tipc/crypto.c2484
-rw-r--r--net/tipc/crypto.h200
-rw-r--r--net/tipc/diag.c2
-rw-r--r--net/tipc/discover.c44
-rw-r--r--net/tipc/eth_media.c9
-rw-r--r--net/tipc/group.c30
-rw-r--r--net/tipc/group.h3
-rw-r--r--net/tipc/ib_media.c7
-rw-r--r--net/tipc/link.c1510
-rw-r--r--net/tipc/link.h26
-rw-r--r--net/tipc/monitor.c107
-rw-r--r--net/tipc/monitor.h1
-rw-r--r--net/tipc/msg.c450
-rw-r--r--net/tipc/msg.h307
-rw-r--r--net/tipc/name_distr.c280
-rw-r--r--net/tipc/name_distr.h10
-rw-r--r--net/tipc/name_table.c729
-rw-r--r--net/tipc/name_table.h84
-rw-r--r--net/tipc/net.c105
-rw-r--r--net/tipc/net.h3
-rw-r--r--net/tipc/netlink.c94
-rw-r--r--net/tipc/netlink.h1
-rw-r--r--net/tipc/netlink_compat.c235
-rw-r--r--net/tipc/node.c1029
-rw-r--r--net/tipc/node.h50
-rw-r--r--net/tipc/socket.c1614
-rw-r--r--net/tipc/socket.h6
-rw-r--r--net/tipc/subscr.c89
-rw-r--r--net/tipc/subscr.h42
-rw-r--r--net/tipc/sysctl.c44
-rw-r--r--net/tipc/topsrv.c126
-rw-r--r--net/tipc/trace.c206
-rw-r--r--net/tipc/trace.h434
-rw-r--r--net/tipc/udp_media.c224
-rw-r--r--net/tls/Kconfig16
-rw-r--r--net/tls/Makefile6
-rw-r--r--net/tls/tls.h382
-rw-r--r--net/tls/tls_device.c1153
-rw-r--r--net/tls/tls_device_fallback.c159
-rw-r--r--net/tls/tls_main.c1097
-rw-r--r--net/tls/tls_proc.c63
-rw-r--r--net/tls/tls_strp.c642
-rw-r--r--net/tls/tls_sw.c3115
-rw-r--r--net/tls/tls_toe.c141
-rw-r--r--net/tls/trace.c10
-rw-r--r--net/tls/trace.h202
-rw-r--r--net/unix/Kconfig18
-rw-r--r--net/unix/Makefile1
-rw-r--r--net/unix/af_unix.c3013
-rw-r--r--net/unix/af_unix.h70
-rw-r--r--net/unix/diag.c157
-rw-r--r--net/unix/garbage.c720
-rw-r--r--net/unix/sysctl_net_unix.c39
-rw-r--r--net/unix/unix_bpf.c202
-rw-r--r--net/vmw_vsock/Kconfig15
-rw-r--r--net/vmw_vsock/Makefile2
-rw-r--r--net/vmw_vsock/af_vsock.c1562
-rw-r--r--net/vmw_vsock/af_vsock_tap.c6
-rw-r--r--net/vmw_vsock/diag.c12
-rw-r--r--net/vmw_vsock/hyperv_transport.c497
-rw-r--r--net/vmw_vsock/virtio_transport.c777
-rw-r--r--net/vmw_vsock/virtio_transport_common.c1361
-rw-r--r--net/vmw_vsock/vmci_transport.c275
-rw-r--r--net/vmw_vsock/vmci_transport.h16
-rw-r--r--net/vmw_vsock/vmci_transport_notify.c20
-rw-r--r--net/vmw_vsock/vmci_transport_notify.h11
-rw-r--r--net/vmw_vsock/vmci_transport_notify_qstate.c22
-rw-r--r--net/vmw_vsock/vsock_addr.c16
-rw-r--r--net/vmw_vsock/vsock_bpf.c175
-rw-r--r--net/vmw_vsock/vsock_loopback.c179
-rw-r--r--net/wimax/Kconfig39
-rw-r--r--net/wimax/Makefile13
-rw-r--r--net/wimax/debug-levels.h43
-rw-r--r--net/wimax/debugfs.c78
-rw-r--r--net/wimax/id-table.c145
-rw-r--r--net/wimax/op-msg.c406
-rw-r--r--net/wimax/op-reset.c123
-rw-r--r--net/wimax/op-rfkill.c446
-rw-r--r--net/wimax/op-state-get.c65
-rw-r--r--net/wimax/stack.c632
-rw-r--r--net/wimax/wimax-internal.h103
-rw-r--r--net/wireless/.gitignore1
-rw-r--r--net/wireless/Kconfig66
-rw-r--r--net/wireless/Makefile20
-rw-r--r--net/wireless/ap.c48
-rw-r--r--net/wireless/certs/wens.hex87
-rw-r--r--net/wireless/chan.c1539
-rw-r--r--net/wireless/core.c990
-rw-r--r--net/wireless/core.h298
-rw-r--r--net/wireless/debugfs.c205
-rw-r--r--net/wireless/ethtool.c16
-rw-r--r--net/wireless/ibss.c166
-rw-r--r--net/wireless/lib80211.c258
-rw-r--r--net/wireless/lib80211_crypt_ccmp.c479
-rw-r--r--net/wireless/lib80211_crypt_tkip.c776
-rw-r--r--net/wireless/lib80211_crypt_wep.c297
-rw-r--r--net/wireless/mesh.c61
-rw-r--r--net/wireless/mlme.c994
-rw-r--r--net/wireless/nl80211.c10604
-rw-r--r--net/wireless/nl80211.h50
-rw-r--r--net/wireless/ocb.c54
-rw-r--r--net/wireless/pmsr.c668
-rw-r--r--net/wireless/radiotap.c16
-rw-r--r--net/wireless/rdev-ops.h526
-rw-r--r--net/wireless/reg.c1188
-rw-r--r--net/wireless/reg.h45
-rw-r--r--net/wireless/scan.c2980
-rw-r--r--net/wireless/sme.c765
-rw-r--r--net/wireless/sysfs.c28
-rw-r--r--net/wireless/tests/Makefile3
-rw-r--r--net/wireless/tests/chan.c228
-rw-r--r--net/wireless/tests/fragmentation.c177
-rw-r--r--net/wireless/tests/module.c10
-rw-r--r--net/wireless/tests/scan.c880
-rw-r--r--net/wireless/tests/util.c56
-rw-r--r--net/wireless/tests/util.h66
-rw-r--r--net/wireless/trace.h1853
-rw-r--r--net/wireless/util.c1487
-rw-r--r--net/wireless/wext-compat.c419
-rw-r--r--net/wireless/wext-compat.h15
-rw-r--r--net/wireless/wext-core.c58
-rw-r--r--net/wireless/wext-sme.c144
-rw-r--r--net/wireless/wext-spy.c232
-rw-r--r--net/x25/Kconfig9
-rw-r--r--net/x25/af_x25.c215
-rw-r--r--net/x25/sysctl_net_x25.c1
-rw-r--r--net/x25/x25_dev.c50
-rw-r--r--net/x25/x25_facilities.c23
-rw-r--r--net/x25/x25_forward.c27
-rw-r--r--net/x25/x25_in.c52
-rw-r--r--net/x25/x25_link.c70
-rw-r--r--net/x25/x25_out.c9
-rw-r--r--net/x25/x25_proc.c10
-rw-r--r--net/x25/x25_route.c34
-rw-r--r--net/x25/x25_subr.c13
-rw-r--r--net/x25/x25_timer.c15
-rw-r--r--net/xdp/Kconfig9
-rw-r--r--net/xdp/Makefile5
-rw-r--r--net/xdp/xdp_umem.c277
-rw-r--r--net/xdp/xdp_umem.h19
-rw-r--r--net/xdp/xdp_umem_props.h14
-rw-r--r--net/xdp/xsk.c1621
-rw-r--r--net/xdp/xsk.h48
-rw-r--r--net/xdp/xsk_buff_pool.c755
-rw-r--r--net/xdp/xsk_diag.c216
-rw-r--r--net/xdp/xsk_queue.c47
-rw-r--r--net/xdp/xsk_queue.h489
-rw-r--r--net/xdp/xskmap.c281
-rw-r--r--net/xfrm/Kconfig99
-rw-r--r--net/xfrm/Makefile15
-rw-r--r--net/xfrm/espintcp.c589
-rw-r--r--net/xfrm/trace_iptfs.h218
-rw-r--r--net/xfrm/xfrm_algo.c72
-rw-r--r--net/xfrm/xfrm_compat.c688
-rw-r--r--net/xfrm/xfrm_device.c379
-rw-r--r--net/xfrm/xfrm_hash.c2
-rw-r--r--net/xfrm/xfrm_hash.h12
-rw-r--r--net/xfrm/xfrm_inout.h70
-rw-r--r--net/xfrm/xfrm_input.c569
-rw-r--r--net/xfrm/xfrm_interface_bpf.c110
-rw-r--r--net/xfrm/xfrm_interface_core.c (renamed from net/xfrm/xfrm_interface.c)732
-rw-r--r--net/xfrm/xfrm_ipcomp.c458
-rw-r--r--net/xfrm/xfrm_iptfs.c2762
-rw-r--r--net/xfrm/xfrm_nat_keepalive.c302
-rw-r--r--net/xfrm/xfrm_output.c762
-rw-r--r--net/xfrm/xfrm_policy.c2207
-rw-r--r--net/xfrm/xfrm_proc.c23
-rw-r--r--net/xfrm/xfrm_replay.c217
-rw-r--r--net/xfrm/xfrm_state.c1730
-rw-r--r--net/xfrm/xfrm_state_bpf.c134
-rw-r--r--net/xfrm/xfrm_sysctl.c9
-rw-r--r--net/xfrm/xfrm_user.c1379
2101 files changed, 507931 insertions, 222324 deletions
diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h
index 53cf446ce2e3..01853cec0209 100644
--- a/net/6lowpan/6lowpan_i.h
+++ b/net/6lowpan/6lowpan_i.h
@@ -18,24 +18,16 @@ extern const struct ndisc_ops lowpan_ndisc_ops;
int addrconf_ifid_802154_6lowpan(u8 *eui, struct net_device *dev);
#ifdef CONFIG_6LOWPAN_DEBUGFS
-int lowpan_dev_debugfs_init(struct net_device *dev);
+void lowpan_dev_debugfs_init(struct net_device *dev);
void lowpan_dev_debugfs_exit(struct net_device *dev);
-int __init lowpan_debugfs_init(void);
+void __init lowpan_debugfs_init(void);
void lowpan_debugfs_exit(void);
#else
-static inline int lowpan_dev_debugfs_init(struct net_device *dev)
-{
- return 0;
-}
-
+static inline void lowpan_dev_debugfs_init(struct net_device *dev) { }
static inline void lowpan_dev_debugfs_exit(struct net_device *dev) { }
-static inline int __init lowpan_debugfs_init(void)
-{
- return 0;
-}
-
+static inline void __init lowpan_debugfs_init(void) { }
static inline void lowpan_debugfs_exit(void) { }
#endif /* CONFIG_6LOWPAN_DEBUGFS */
diff --git a/net/6lowpan/Kconfig b/net/6lowpan/Kconfig
index 9c051512d14f..d8fc459492b0 100644
--- a/net/6lowpan/Kconfig
+++ b/net/6lowpan/Kconfig
@@ -1,7 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
menuconfig 6LOWPAN
tristate "6LoWPAN Support"
depends on IPV6
- ---help---
+ help
This enables IPv6 over Low power Wireless Personal Area Network -
"6LoWPAN" which is supported by IEEE 802.15.4 or Bluetooth stacks.
@@ -9,7 +10,7 @@ config 6LOWPAN_DEBUGFS
bool "6LoWPAN debugfs support"
depends on 6LOWPAN
depends on DEBUG_FS
- ---help---
+ help
This enables 6LoWPAN debugfs support. For example to manipulate
IPHC context information at runtime.
@@ -17,7 +18,7 @@ menuconfig 6LOWPAN_NHC
tristate "Next Header and Generic Header Compression Support"
depends on 6LOWPAN
default y
- ---help---
+ help
Support for next header and generic header compression defined in
RFC6282 and RFC7400.
@@ -26,78 +27,78 @@ if 6LOWPAN_NHC
config 6LOWPAN_NHC_DEST
tristate "Destination Options Header Support"
default y
- ---help---
+ help
6LoWPAN IPv6 Destination Options Header compression according to
RFC6282.
config 6LOWPAN_NHC_FRAGMENT
tristate "Fragment Header Support"
default y
- ---help---
+ help
6LoWPAN IPv6 Fragment Header compression according to RFC6282.
config 6LOWPAN_NHC_HOP
tristate "Hop-by-Hop Options Header Support"
default y
- ---help---
+ help
6LoWPAN IPv6 Hop-by-Hop Options Header compression according to
RFC6282.
config 6LOWPAN_NHC_IPV6
tristate "IPv6 Header Support"
default y
- ---help---
+ help
6LoWPAN IPv6 Header compression according to RFC6282.
config 6LOWPAN_NHC_MOBILITY
tristate "Mobility Header Support"
default y
- ---help---
+ help
6LoWPAN IPv6 Mobility Header compression according to RFC6282.
config 6LOWPAN_NHC_ROUTING
tristate "Routing Header Support"
default y
- ---help---
+ help
6LoWPAN IPv6 Routing Header compression according to RFC6282.
config 6LOWPAN_NHC_UDP
tristate "UDP Header Support"
default y
- ---help---
+ help
6LoWPAN IPv6 UDP Header compression according to RFC6282.
config 6LOWPAN_GHC_EXT_HDR_HOP
tristate "GHC Hop-by-Hop Options Header Support"
- ---help---
+ help
6LoWPAN IPv6 Hop-by-Hop option generic header compression according
to RFC7400.
config 6LOWPAN_GHC_UDP
tristate "GHC UDP Support"
- ---help---
+ help
6LoWPAN IPv6 UDP generic header compression according to RFC7400.
config 6LOWPAN_GHC_ICMPV6
tristate "GHC ICMPv6 Support"
- ---help---
+ help
6LoWPAN IPv6 ICMPv6 generic header compression according to RFC7400.
config 6LOWPAN_GHC_EXT_HDR_DEST
tristate "GHC Destination Options Header Support"
- ---help---
+ help
6LoWPAN IPv6 destination option generic header compression according
to RFC7400.
config 6LOWPAN_GHC_EXT_HDR_FRAG
tristate "GHC Fragmentation Options Header Support"
- ---help---
+ help
6LoWPAN IPv6 fragmentation option generic header compression
according to RFC7400.
config 6LOWPAN_GHC_EXT_HDR_ROUTE
tristate "GHC Routing Options Header Support"
- ---help---
+ help
6LoWPAN IPv6 routing option generic header compression according
to RFC7400.
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
index 40d3d72beb53..850d4a185f55 100644
--- a/net/6lowpan/core.c
+++ b/net/6lowpan/core.c
@@ -1,16 +1,11 @@
-/* This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+// SPDX-License-Identifier: GPL-2.0-only
+/*
*
* Authors:
* (C) 2015 Pengutronix, Alexander Aring <aar@pengutronix.de>
*/
+#include <linux/if_arp.h>
#include <linux/module.h>
#include <net/6lowpan.h>
@@ -48,9 +43,7 @@ int lowpan_register_netdevice(struct net_device *dev,
if (ret < 0)
return ret;
- ret = lowpan_dev_debugfs_init(dev);
- if (ret < 0)
- unregister_netdevice(dev);
+ lowpan_dev_debugfs_init(dev);
return ret;
}
@@ -158,9 +151,7 @@ static int __init lowpan_module_init(void)
{
int ret;
- ret = lowpan_debugfs_init();
- if (ret < 0)
- return ret;
+ lowpan_debugfs_init();
ret = register_netdevice_notifier(&lowpan_notifier);
if (ret < 0) {
@@ -188,4 +179,5 @@ static void __exit lowpan_module_exit(void)
module_init(lowpan_module_init);
module_exit(lowpan_module_exit);
+MODULE_DESCRIPTION("IPv6 over Low-Power Wireless Personal Area Network core module");
MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c
index 24915e0bb9ea..600b9563bfc5 100644
--- a/net/6lowpan/debugfs.c
+++ b/net/6lowpan/debugfs.c
@@ -1,11 +1,5 @@
-/* This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+// SPDX-License-Identifier: GPL-2.0-only
+/*
*
* Authors:
* (C) 2015 Pengutronix, Alexander Aring <aar@pengutronix.de>
@@ -41,9 +35,9 @@ static int lowpan_ctx_flag_active_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_flag_active_fops,
- lowpan_ctx_flag_active_get,
- lowpan_ctx_flag_active_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(lowpan_ctx_flag_active_fops,
+ lowpan_ctx_flag_active_get,
+ lowpan_ctx_flag_active_set, "%llu\n");
static int lowpan_ctx_flag_c_set(void *data, u64 val)
{
@@ -66,8 +60,8 @@ static int lowpan_ctx_flag_c_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_flag_c_fops, lowpan_ctx_flag_c_get,
- lowpan_ctx_flag_c_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(lowpan_ctx_flag_c_fops, lowpan_ctx_flag_c_get,
+ lowpan_ctx_flag_c_set, "%llu\n");
static int lowpan_ctx_plen_set(void *data, u64 val)
{
@@ -97,8 +91,8 @@ static int lowpan_ctx_plen_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_plen_fops, lowpan_ctx_plen_get,
- lowpan_ctx_plen_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(lowpan_ctx_plen_fops, lowpan_ctx_plen_get,
+ lowpan_ctx_plen_set, "%llu\n");
static int lowpan_ctx_pfx_show(struct seq_file *file, void *offset)
{
@@ -169,46 +163,31 @@ static const struct file_operations lowpan_ctx_pfx_fops = {
.release = single_release,
};
-static int lowpan_dev_debugfs_ctx_init(struct net_device *dev,
- struct dentry *ctx, u8 id)
+static void lowpan_dev_debugfs_ctx_init(struct net_device *dev,
+ struct dentry *ctx, u8 id)
{
struct lowpan_dev *ldev = lowpan_dev(dev);
- struct dentry *dentry, *root;
+ struct dentry *root;
char buf[32];
- WARN_ON_ONCE(id > LOWPAN_IPHC_CTX_TABLE_SIZE);
+ if (WARN_ON_ONCE(id >= LOWPAN_IPHC_CTX_TABLE_SIZE))
+ return;
sprintf(buf, "%d", id);
root = debugfs_create_dir(buf, ctx);
- if (!root)
- return -EINVAL;
-
- dentry = debugfs_create_file("active", 0644, root,
- &ldev->ctx.table[id],
- &lowpan_ctx_flag_active_fops);
- if (!dentry)
- return -EINVAL;
- dentry = debugfs_create_file("compression", 0644, root,
- &ldev->ctx.table[id],
- &lowpan_ctx_flag_c_fops);
- if (!dentry)
- return -EINVAL;
+ debugfs_create_file("active", 0644, root, &ldev->ctx.table[id],
+ &lowpan_ctx_flag_active_fops);
- dentry = debugfs_create_file("prefix", 0644, root,
- &ldev->ctx.table[id],
- &lowpan_ctx_pfx_fops);
- if (!dentry)
- return -EINVAL;
+ debugfs_create_file("compression", 0644, root, &ldev->ctx.table[id],
+ &lowpan_ctx_flag_c_fops);
- dentry = debugfs_create_file("prefix_len", 0644, root,
- &ldev->ctx.table[id],
- &lowpan_ctx_plen_fops);
- if (!dentry)
- return -EINVAL;
+ debugfs_create_file("prefix", 0644, root, &ldev->ctx.table[id],
+ &lowpan_ctx_pfx_fops);
- return 0;
+ debugfs_create_file("prefix_len", 0644, root, &ldev->ctx.table[id],
+ &lowpan_ctx_plen_fops);
}
static int lowpan_context_show(struct seq_file *file, void *offset)
@@ -232,18 +211,7 @@ static int lowpan_context_show(struct seq_file *file, void *offset)
return 0;
}
-
-static int lowpan_context_open(struct inode *inode, struct file *file)
-{
- return single_open(file, lowpan_context_show, inode->i_private);
-}
-
-static const struct file_operations lowpan_context_fops = {
- .open = lowpan_context_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(lowpan_context);
static int lowpan_short_addr_get(void *data, u64 *val)
{
@@ -256,67 +224,42 @@ static int lowpan_short_addr_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(lowpan_short_addr_fops, lowpan_short_addr_get,
- NULL, "0x%04llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(lowpan_short_addr_fops, lowpan_short_addr_get, NULL,
+ "0x%04llx\n");
-static int lowpan_dev_debugfs_802154_init(const struct net_device *dev,
+static void lowpan_dev_debugfs_802154_init(const struct net_device *dev,
struct lowpan_dev *ldev)
{
- struct dentry *dentry, *root;
+ struct dentry *root;
if (!lowpan_is_ll(dev, LOWPAN_LLTYPE_IEEE802154))
- return 0;
+ return;
root = debugfs_create_dir("ieee802154", ldev->iface_debugfs);
- if (!root)
- return -EINVAL;
-
- dentry = debugfs_create_file("short_addr", 0444, root,
- lowpan_802154_dev(dev)->wdev->ieee802154_ptr,
- &lowpan_short_addr_fops);
- if (!dentry)
- return -EINVAL;
- return 0;
+ debugfs_create_file("short_addr", 0444, root,
+ lowpan_802154_dev(dev)->wdev->ieee802154_ptr,
+ &lowpan_short_addr_fops);
}
-int lowpan_dev_debugfs_init(struct net_device *dev)
+void lowpan_dev_debugfs_init(struct net_device *dev)
{
struct lowpan_dev *ldev = lowpan_dev(dev);
- struct dentry *contexts, *dentry;
- int ret, i;
+ struct dentry *contexts;
+ int i;
/* creating the root */
ldev->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs);
- if (!ldev->iface_debugfs)
- goto fail;
contexts = debugfs_create_dir("contexts", ldev->iface_debugfs);
- if (!contexts)
- goto remove_root;
-
- dentry = debugfs_create_file("show", 0644, contexts,
- &lowpan_dev(dev)->ctx,
- &lowpan_context_fops);
- if (!dentry)
- goto remove_root;
- for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
- ret = lowpan_dev_debugfs_ctx_init(dev, contexts, i);
- if (ret < 0)
- goto remove_root;
- }
+ debugfs_create_file("show", 0644, contexts, &lowpan_dev(dev)->ctx,
+ &lowpan_context_fops);
- ret = lowpan_dev_debugfs_802154_init(dev, ldev);
- if (ret < 0)
- goto remove_root;
-
- return 0;
+ for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
+ lowpan_dev_debugfs_ctx_init(dev, contexts, i);
-remove_root:
- lowpan_dev_debugfs_exit(dev);
-fail:
- return -EINVAL;
+ lowpan_dev_debugfs_802154_init(dev, ldev);
}
void lowpan_dev_debugfs_exit(struct net_device *dev)
@@ -324,13 +267,9 @@ void lowpan_dev_debugfs_exit(struct net_device *dev)
debugfs_remove_recursive(lowpan_dev(dev)->iface_debugfs);
}
-int __init lowpan_debugfs_init(void)
+void __init lowpan_debugfs_init(void)
{
lowpan_debugfs = debugfs_create_dir("6lowpan", NULL);
- if (!lowpan_debugfs)
- return -EINVAL;
-
- return 0;
}
void lowpan_debugfs_exit(void)
diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
index 52fad5dad9f7..e116d308a8df 100644
--- a/net/6lowpan/iphc.c
+++ b/net/6lowpan/iphc.c
@@ -848,7 +848,7 @@ static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct net_device *dev,
const struct lowpan_iphc_ctx *ctx,
const unsigned char *lladdr, bool sam)
{
- struct in6_addr tmp = {};
+ struct in6_addr tmp;
u8 dam;
switch (lowpan_dev(dev)->lltype) {
diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c
index 941df2fa4448..868d28583c0a 100644
--- a/net/6lowpan/ndisc.c
+++ b/net/6lowpan/ndisc.c
@@ -1,11 +1,5 @@
-/* This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+// SPDX-License-Identifier: GPL-2.0-only
+/*
*
* Authors:
* (C) 2016 Pengutronix, Alexander Aring <aar@pengutronix.de>
@@ -17,11 +11,6 @@
#include "6lowpan_i.h"
-static int lowpan_ndisc_is_useropt(u8 nd_opt_type)
-{
- return nd_opt_type == ND_OPT_6CO;
-}
-
#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
#define NDISC_802154_SHORT_ADDR_LENGTH 1
static int lowpan_ndisc_parse_802154_options(const struct net_device *dev,
@@ -31,9 +20,8 @@ static int lowpan_ndisc_parse_802154_options(const struct net_device *dev,
switch (nd_opt->nd_opt_len) {
case NDISC_802154_SHORT_ADDR_LENGTH:
if (ndopts->nd_802154_opt_array[nd_opt->nd_opt_type])
- ND_PRINTK(2, warn,
- "%s: duplicated short addr ND6 option found: type=%d\n",
- __func__, nd_opt->nd_opt_type);
+ net_dbg_ratelimited("%s: duplicated short addr ND6 option found: type=%d\n",
+ __func__, nd_opt->nd_opt_type);
else
ndopts->nd_802154_opt_array[nd_opt->nd_opt_type] = nd_opt;
return 1;
@@ -74,8 +62,7 @@ static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags,
lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_src_lladdr,
IEEE802154_SHORT_ADDR_LEN, 0);
if (!lladdr_short) {
- ND_PRINTK(2, warn,
- "NA: invalid short link-layer address length\n");
+ net_dbg_ratelimited("NA: invalid short link-layer address length\n");
return;
}
}
@@ -86,8 +73,7 @@ static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags,
lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_tgt_lladdr,
IEEE802154_SHORT_ADDR_LEN, 0);
if (!lladdr_short) {
- ND_PRINTK(2, warn,
- "NA: invalid short link-layer address length\n");
+ net_dbg_ratelimited("NA: invalid short link-layer address length\n");
return;
}
}
@@ -220,15 +206,13 @@ static void lowpan_ndisc_prefix_rcv_add_addr(struct net *net,
sllao, tokenized, valid_lft,
prefered_lft);
if (err)
- ND_PRINTK(2, warn,
- "RA: could not add a short address based address for prefix: %pI6c\n",
- &pinfo->prefix);
+ net_dbg_ratelimited("RA: could not add a short address based address for prefix: %pI6c\n",
+ &pinfo->prefix);
}
}
#endif
const struct ndisc_ops lowpan_ndisc_ops = {
- .is_useropt = lowpan_ndisc_is_useropt,
#if IS_ENABLED(CONFIG_IEEE802154_6LOWPAN)
.parse_options = lowpan_ndisc_parse_options,
.update = lowpan_ndisc_update,
diff --git a/net/6lowpan/nhc.c b/net/6lowpan/nhc.c
index 4fa2fdda174d..7b374595328d 100644
--- a/net/6lowpan/nhc.c
+++ b/net/6lowpan/nhc.c
@@ -1,14 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 6LoWPAN next header compression
*
- *
* Authors:
* Alexander Aring <aar@pengutronix.de>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/netdevice.h>
@@ -17,77 +12,26 @@
#include "nhc.h"
-static struct rb_root rb_root = RB_ROOT;
-static struct lowpan_nhc *lowpan_nexthdr_nhcs[NEXTHDR_MAX];
+static const struct lowpan_nhc *lowpan_nexthdr_nhcs[NEXTHDR_MAX + 1];
static DEFINE_SPINLOCK(lowpan_nhc_lock);
-static int lowpan_nhc_insert(struct lowpan_nhc *nhc)
+static const struct lowpan_nhc *lowpan_nhc_by_nhcid(struct sk_buff *skb)
{
- struct rb_node **new = &rb_root.rb_node, *parent = NULL;
-
- /* Figure out where to put new node */
- while (*new) {
- struct lowpan_nhc *this = rb_entry(*new, struct lowpan_nhc,
- node);
- int result, len_dif, len;
-
- len_dif = nhc->idlen - this->idlen;
-
- if (nhc->idlen < this->idlen)
- len = nhc->idlen;
- else
- len = this->idlen;
-
- result = memcmp(nhc->id, this->id, len);
- if (!result)
- result = len_dif;
-
- parent = *new;
- if (result < 0)
- new = &((*new)->rb_left);
- else if (result > 0)
- new = &((*new)->rb_right);
- else
- return -EEXIST;
- }
+ const struct lowpan_nhc *nhc;
+ int i;
+ u8 id;
- /* Add new node and rebalance tree. */
- rb_link_node(&nhc->node, parent, new);
- rb_insert_color(&nhc->node, &rb_root);
+ if (!pskb_may_pull(skb, 1))
+ return NULL;
- return 0;
-}
+ id = *skb->data;
-static void lowpan_nhc_remove(struct lowpan_nhc *nhc)
-{
- rb_erase(&nhc->node, &rb_root);
-}
+ for (i = 0; i < NEXTHDR_MAX + 1; i++) {
+ nhc = lowpan_nexthdr_nhcs[i];
+ if (!nhc)
+ continue;
-static struct lowpan_nhc *lowpan_nhc_by_nhcid(const struct sk_buff *skb)
-{
- struct rb_node *node = rb_root.rb_node;
- const u8 *nhcid_skb_ptr = skb->data;
-
- while (node) {
- struct lowpan_nhc *nhc = rb_entry(node, struct lowpan_nhc,
- node);
- u8 nhcid_skb_ptr_masked[LOWPAN_NHC_MAX_ID_LEN];
- int result, i;
-
- if (nhcid_skb_ptr + nhc->idlen > skb->data + skb->len)
- return NULL;
-
- /* copy and mask afterwards the nhid value from skb */
- memcpy(nhcid_skb_ptr_masked, nhcid_skb_ptr, nhc->idlen);
- for (i = 0; i < nhc->idlen; i++)
- nhcid_skb_ptr_masked[i] &= nhc->idmask[i];
-
- result = memcmp(nhcid_skb_ptr_masked, nhc->id, nhc->idlen);
- if (result < 0)
- node = node->rb_left;
- else if (result > 0)
- node = node->rb_right;
- else
+ if ((id & nhc->idmask) == nhc->id)
return nhc;
}
@@ -97,7 +41,7 @@ static struct lowpan_nhc *lowpan_nhc_by_nhcid(const struct sk_buff *skb)
int lowpan_nhc_check_compression(struct sk_buff *skb,
const struct ipv6hdr *hdr, u8 **hc_ptr)
{
- struct lowpan_nhc *nhc;
+ const struct lowpan_nhc *nhc;
int ret = 0;
spin_lock_bh(&lowpan_nhc_lock);
@@ -115,7 +59,7 @@ int lowpan_nhc_do_compression(struct sk_buff *skb, const struct ipv6hdr *hdr,
u8 **hc_ptr)
{
int ret;
- struct lowpan_nhc *nhc;
+ const struct lowpan_nhc *nhc;
spin_lock_bh(&lowpan_nhc_lock);
@@ -158,7 +102,7 @@ int lowpan_nhc_do_uncompression(struct sk_buff *skb,
const struct net_device *dev,
struct ipv6hdr *hdr)
{
- struct lowpan_nhc *nhc;
+ const struct lowpan_nhc *nhc;
int ret;
spin_lock_bh(&lowpan_nhc_lock);
@@ -194,18 +138,9 @@ int lowpan_nhc_do_uncompression(struct sk_buff *skb,
return 0;
}
-int lowpan_nhc_add(struct lowpan_nhc *nhc)
+int lowpan_nhc_add(const struct lowpan_nhc *nhc)
{
- int ret;
-
- if (!nhc->idlen || !nhc->idsetup)
- return -EINVAL;
-
- WARN_ONCE(nhc->idlen > LOWPAN_NHC_MAX_ID_LEN,
- "LOWPAN_NHC_MAX_ID_LEN should be updated to %zd.\n",
- nhc->idlen);
-
- nhc->idsetup(nhc);
+ int ret = 0;
spin_lock_bh(&lowpan_nhc_lock);
@@ -214,10 +149,6 @@ int lowpan_nhc_add(struct lowpan_nhc *nhc)
goto out;
}
- ret = lowpan_nhc_insert(nhc);
- if (ret < 0)
- goto out;
-
lowpan_nexthdr_nhcs[nhc->nexthdr] = nhc;
out:
spin_unlock_bh(&lowpan_nhc_lock);
@@ -225,11 +156,10 @@ out:
}
EXPORT_SYMBOL(lowpan_nhc_add);
-void lowpan_nhc_del(struct lowpan_nhc *nhc)
+void lowpan_nhc_del(const struct lowpan_nhc *nhc)
{
spin_lock_bh(&lowpan_nhc_lock);
- lowpan_nhc_remove(nhc);
lowpan_nexthdr_nhcs[nhc->nexthdr] = NULL;
spin_unlock_bh(&lowpan_nhc_lock);
diff --git a/net/6lowpan/nhc.h b/net/6lowpan/nhc.h
index 67951c40734b..ab7b4977c32b 100644
--- a/net/6lowpan/nhc.h
+++ b/net/6lowpan/nhc.h
@@ -16,24 +16,20 @@
* @_name: const char * of common header compression name.
* @_nexthdr: ipv6 nexthdr field for the header compression.
* @_nexthdrlen: ipv6 nexthdr len for the reserved space.
- * @_idsetup: callback to setup id and mask values.
- * @_idlen: len for the next header id and mask, should be always the same.
+ * @_id: one byte nhc id value.
+ * @_idmask: one byte nhc id mask value.
* @_uncompress: callback for uncompression call.
* @_compress: callback for compression call.
*/
#define LOWPAN_NHC(__nhc, _name, _nexthdr, \
- _hdrlen, _idsetup, _idlen, \
+ _hdrlen, _id, _idmask, \
_uncompress, _compress) \
-static u8 __nhc##_val[_idlen]; \
-static u8 __nhc##_mask[_idlen]; \
-static struct lowpan_nhc __nhc = { \
+static const struct lowpan_nhc __nhc = { \
.name = _name, \
.nexthdr = _nexthdr, \
.nexthdrlen = _hdrlen, \
- .id = __nhc##_val, \
- .idmask = __nhc##_mask, \
- .idlen = _idlen, \
- .idsetup = _idsetup, \
+ .id = _id, \
+ .idmask = _idmask, \
.uncompress = _uncompress, \
.compress = _compress, \
}
@@ -53,27 +49,21 @@ module_exit(__nhc##_exit);
/**
* struct lowpan_nhc - hold 6lowpan next hdr compression ifnformation
*
- * @node: holder for the rbtree.
* @name: name of the specific next header compression
* @nexthdr: next header value of the protocol which should be compressed.
* @nexthdrlen: ipv6 nexthdr len for the reserved space.
- * @id: array for nhc id. Note this need to be in network byteorder.
- * @mask: array for nhc id mask. Note this need to be in network byteorder.
- * @len: the length of the next header id and mask.
- * @setup: callback to setup fill the next header id value and mask.
+ * @id: one byte nhc id value.
+ * @idmask: one byte nhc id mask value.
* @compress: callback to do the header compression.
* @uncompress: callback to do the header uncompression.
*/
struct lowpan_nhc {
- struct rb_node node;
const char *name;
- const u8 nexthdr;
- const size_t nexthdrlen;
- u8 *id;
- u8 *idmask;
- const size_t idlen;
+ u8 nexthdr;
+ size_t nexthdrlen;
+ u8 id;
+ u8 idmask;
- void (*idsetup)(struct lowpan_nhc *nhc);
int (*uncompress)(struct sk_buff *skb, size_t needed);
int (*compress)(struct sk_buff *skb, u8 **hc_ptr);
};
@@ -126,14 +116,14 @@ int lowpan_nhc_do_uncompression(struct sk_buff *skb,
*
* @nhc: nhc which should be add.
*/
-int lowpan_nhc_add(struct lowpan_nhc *nhc);
+int lowpan_nhc_add(const struct lowpan_nhc *nhc);
/**
* lowpan_nhc_del - delete a next header compression from framework
*
* @nhc: nhc which should be delete.
*/
-void lowpan_nhc_del(struct lowpan_nhc *nhc);
+void lowpan_nhc_del(const struct lowpan_nhc *nhc);
/**
* lowpan_nhc_init - adding all default nhcs
diff --git a/net/6lowpan/nhc_dest.c b/net/6lowpan/nhc_dest.c
index 0b292c9646eb..0cbcc7806469 100644
--- a/net/6lowpan/nhc_dest.c
+++ b/net/6lowpan/nhc_dest.c
@@ -1,27 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 6LoWPAN IPv6 Destination Options Header compression according to
* RFC6282
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include "nhc.h"
-#define LOWPAN_NHC_DEST_IDLEN 1
#define LOWPAN_NHC_DEST_ID_0 0xe6
#define LOWPAN_NHC_DEST_MASK_0 0xfe
-static void dest_nhid_setup(struct lowpan_nhc *nhc)
-{
- nhc->id[0] = LOWPAN_NHC_DEST_ID_0;
- nhc->idmask[0] = LOWPAN_NHC_DEST_MASK_0;
-}
-
LOWPAN_NHC(nhc_dest, "RFC6282 Destination Options", NEXTHDR_DEST, 0,
- dest_nhid_setup, LOWPAN_NHC_DEST_IDLEN, NULL, NULL);
+ LOWPAN_NHC_DEST_ID_0, LOWPAN_NHC_DEST_MASK_0, NULL, NULL);
module_lowpan_nhc(nhc_dest);
MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Destination Options compression");
diff --git a/net/6lowpan/nhc_fragment.c b/net/6lowpan/nhc_fragment.c
index 473dbc58ef84..9414552df0ac 100644
--- a/net/6lowpan/nhc_fragment.c
+++ b/net/6lowpan/nhc_fragment.c
@@ -1,26 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 6LoWPAN IPv6 Fragment Header compression according to RFC6282
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include "nhc.h"
-#define LOWPAN_NHC_FRAGMENT_IDLEN 1
#define LOWPAN_NHC_FRAGMENT_ID_0 0xe4
#define LOWPAN_NHC_FRAGMENT_MASK_0 0xfe
-static void fragment_nhid_setup(struct lowpan_nhc *nhc)
-{
- nhc->id[0] = LOWPAN_NHC_FRAGMENT_ID_0;
- nhc->idmask[0] = LOWPAN_NHC_FRAGMENT_MASK_0;
-}
-
LOWPAN_NHC(nhc_fragment, "RFC6282 Fragment", NEXTHDR_FRAGMENT, 0,
- fragment_nhid_setup, LOWPAN_NHC_FRAGMENT_IDLEN, NULL, NULL);
+ LOWPAN_NHC_FRAGMENT_ID_0, LOWPAN_NHC_FRAGMENT_MASK_0, NULL, NULL);
module_lowpan_nhc(nhc_fragment);
MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Fragment compression");
diff --git a/net/6lowpan/nhc_ghc_ext_dest.c b/net/6lowpan/nhc_ghc_ext_dest.c
index 9887b3a15348..e4745ddd10a8 100644
--- a/net/6lowpan/nhc_ghc_ext_dest.c
+++ b/net/6lowpan/nhc_ghc_ext_dest.c
@@ -1,26 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 6LoWPAN Extension Header compression according to RFC7400
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include "nhc.h"
-#define LOWPAN_GHC_EXT_DEST_IDLEN 1
#define LOWPAN_GHC_EXT_DEST_ID_0 0xb6
#define LOWPAN_GHC_EXT_DEST_MASK_0 0xfe
-static void dest_ghid_setup(struct lowpan_nhc *nhc)
-{
- nhc->id[0] = LOWPAN_GHC_EXT_DEST_ID_0;
- nhc->idmask[0] = LOWPAN_GHC_EXT_DEST_MASK_0;
-}
-
LOWPAN_NHC(ghc_ext_dest, "RFC7400 Destination Extension Header", NEXTHDR_DEST,
- 0, dest_ghid_setup, LOWPAN_GHC_EXT_DEST_IDLEN, NULL, NULL);
+ 0, LOWPAN_GHC_EXT_DEST_ID_0, LOWPAN_GHC_EXT_DEST_MASK_0, NULL, NULL);
module_lowpan_nhc(ghc_ext_dest);
MODULE_DESCRIPTION("6LoWPAN generic header destination extension compression");
diff --git a/net/6lowpan/nhc_ghc_ext_frag.c b/net/6lowpan/nhc_ghc_ext_frag.c
index 1308b79e939d..220e5abfa946 100644
--- a/net/6lowpan/nhc_ghc_ext_frag.c
+++ b/net/6lowpan/nhc_ghc_ext_frag.c
@@ -1,27 +1,16 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 6LoWPAN Extension Header compression according to RFC7400
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include "nhc.h"
-#define LOWPAN_GHC_EXT_FRAG_IDLEN 1
#define LOWPAN_GHC_EXT_FRAG_ID_0 0xb4
#define LOWPAN_GHC_EXT_FRAG_MASK_0 0xfe
-static void frag_ghid_setup(struct lowpan_nhc *nhc)
-{
- nhc->id[0] = LOWPAN_GHC_EXT_FRAG_ID_0;
- nhc->idmask[0] = LOWPAN_GHC_EXT_FRAG_MASK_0;
-}
-
LOWPAN_NHC(ghc_ext_frag, "RFC7400 Fragmentation Extension Header",
- NEXTHDR_FRAGMENT, 0, frag_ghid_setup,
- LOWPAN_GHC_EXT_FRAG_IDLEN, NULL, NULL);
+ NEXTHDR_FRAGMENT, 0, LOWPAN_GHC_EXT_FRAG_ID_0,
+ LOWPAN_GHC_EXT_FRAG_MASK_0, NULL, NULL);
module_lowpan_nhc(ghc_ext_frag);
MODULE_DESCRIPTION("6LoWPAN generic header fragmentation extension compression");
diff --git a/net/6lowpan/nhc_ghc_ext_hop.c b/net/6lowpan/nhc_ghc_ext_hop.c
index baec86fd1974..9b0de4da7379 100644
--- a/net/6lowpan/nhc_ghc_ext_hop.c
+++ b/net/6lowpan/nhc_ghc_ext_hop.c
@@ -1,26 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 6LoWPAN Extension Header compression according to RFC7400
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include "nhc.h"
-#define LOWPAN_GHC_EXT_HOP_IDLEN 1
#define LOWPAN_GHC_EXT_HOP_ID_0 0xb0
#define LOWPAN_GHC_EXT_HOP_MASK_0 0xfe
-static void hop_ghid_setup(struct lowpan_nhc *nhc)
-{
- nhc->id[0] = LOWPAN_GHC_EXT_HOP_ID_0;
- nhc->idmask[0] = LOWPAN_GHC_EXT_HOP_MASK_0;
-}
-
LOWPAN_NHC(ghc_ext_hop, "RFC7400 Hop-by-Hop Extension Header", NEXTHDR_HOP, 0,
- hop_ghid_setup, LOWPAN_GHC_EXT_HOP_IDLEN, NULL, NULL);
+ LOWPAN_GHC_EXT_HOP_ID_0, LOWPAN_GHC_EXT_HOP_MASK_0, NULL, NULL);
module_lowpan_nhc(ghc_ext_hop);
MODULE_DESCRIPTION("6LoWPAN generic header hop-by-hop extension compression");
diff --git a/net/6lowpan/nhc_ghc_ext_route.c b/net/6lowpan/nhc_ghc_ext_route.c
index d7e5bd791c62..3e86faec59c9 100644
--- a/net/6lowpan/nhc_ghc_ext_route.c
+++ b/net/6lowpan/nhc_ghc_ext_route.c
@@ -1,26 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 6LoWPAN Extension Header compression according to RFC7400
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include "nhc.h"
-#define LOWPAN_GHC_EXT_ROUTE_IDLEN 1
#define LOWPAN_GHC_EXT_ROUTE_ID_0 0xb2
#define LOWPAN_GHC_EXT_ROUTE_MASK_0 0xfe
-static void route_ghid_setup(struct lowpan_nhc *nhc)
-{
- nhc->id[0] = LOWPAN_GHC_EXT_ROUTE_ID_0;
- nhc->idmask[0] = LOWPAN_GHC_EXT_ROUTE_MASK_0;
-}
-
LOWPAN_NHC(ghc_ext_route, "RFC7400 Routing Extension Header", NEXTHDR_ROUTING,
- 0, route_ghid_setup, LOWPAN_GHC_EXT_ROUTE_IDLEN, NULL, NULL);
+ 0, LOWPAN_GHC_EXT_ROUTE_ID_0, LOWPAN_GHC_EXT_ROUTE_MASK_0, NULL, NULL);
module_lowpan_nhc(ghc_ext_route);
MODULE_DESCRIPTION("6LoWPAN generic header routing extension compression");
diff --git a/net/6lowpan/nhc_ghc_icmpv6.c b/net/6lowpan/nhc_ghc_icmpv6.c
index 32e7c2c66bbc..1634f3eb0be8 100644
--- a/net/6lowpan/nhc_ghc_icmpv6.c
+++ b/net/6lowpan/nhc_ghc_icmpv6.c
@@ -1,26 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 6LoWPAN ICMPv6 compression according to RFC7400
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include "nhc.h"
-#define LOWPAN_GHC_ICMPV6_IDLEN 1
#define LOWPAN_GHC_ICMPV6_ID_0 0xdf
#define LOWPAN_GHC_ICMPV6_MASK_0 0xff
-static void icmpv6_ghid_setup(struct lowpan_nhc *nhc)
-{
- nhc->id[0] = LOWPAN_GHC_ICMPV6_ID_0;
- nhc->idmask[0] = LOWPAN_GHC_ICMPV6_MASK_0;
-}
-
LOWPAN_NHC(ghc_icmpv6, "RFC7400 ICMPv6", NEXTHDR_ICMP, 0,
- icmpv6_ghid_setup, LOWPAN_GHC_ICMPV6_IDLEN, NULL, NULL);
+ LOWPAN_GHC_ICMPV6_ID_0, LOWPAN_GHC_ICMPV6_MASK_0, NULL, NULL);
module_lowpan_nhc(ghc_icmpv6);
MODULE_DESCRIPTION("6LoWPAN generic header ICMPv6 compression");
diff --git a/net/6lowpan/nhc_ghc_udp.c b/net/6lowpan/nhc_ghc_udp.c
index 17beefa52ca8..4ac4813b77ad 100644
--- a/net/6lowpan/nhc_ghc_udp.c
+++ b/net/6lowpan/nhc_ghc_udp.c
@@ -1,26 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 6LoWPAN UDP compression according to RFC7400
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include "nhc.h"
-#define LOWPAN_GHC_UDP_IDLEN 1
#define LOWPAN_GHC_UDP_ID_0 0xd0
#define LOWPAN_GHC_UDP_MASK_0 0xf8
-static void udp_ghid_setup(struct lowpan_nhc *nhc)
-{
- nhc->id[0] = LOWPAN_GHC_UDP_ID_0;
- nhc->idmask[0] = LOWPAN_GHC_UDP_MASK_0;
-}
-
LOWPAN_NHC(ghc_udp, "RFC7400 UDP", NEXTHDR_UDP, 0,
- udp_ghid_setup, LOWPAN_GHC_UDP_IDLEN, NULL, NULL);
+ LOWPAN_GHC_UDP_ID_0, LOWPAN_GHC_UDP_MASK_0, NULL, NULL);
module_lowpan_nhc(ghc_udp);
MODULE_DESCRIPTION("6LoWPAN generic header UDP compression");
diff --git a/net/6lowpan/nhc_hop.c b/net/6lowpan/nhc_hop.c
index 1eb66be16f19..182087dfd09d 100644
--- a/net/6lowpan/nhc_hop.c
+++ b/net/6lowpan/nhc_hop.c
@@ -1,26 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 6LoWPAN IPv6 Hop-by-Hop Options Header compression according to RFC6282
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include "nhc.h"
-#define LOWPAN_NHC_HOP_IDLEN 1
#define LOWPAN_NHC_HOP_ID_0 0xe0
#define LOWPAN_NHC_HOP_MASK_0 0xfe
-static void hop_nhid_setup(struct lowpan_nhc *nhc)
-{
- nhc->id[0] = LOWPAN_NHC_HOP_ID_0;
- nhc->idmask[0] = LOWPAN_NHC_HOP_MASK_0;
-}
-
LOWPAN_NHC(nhc_hop, "RFC6282 Hop-by-Hop Options", NEXTHDR_HOP, 0,
- hop_nhid_setup, LOWPAN_NHC_HOP_IDLEN, NULL, NULL);
+ LOWPAN_NHC_HOP_ID_0, LOWPAN_NHC_HOP_MASK_0, NULL, NULL);
module_lowpan_nhc(nhc_hop);
MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Hop-by-Hop Options compression");
diff --git a/net/6lowpan/nhc_ipv6.c b/net/6lowpan/nhc_ipv6.c
index 2313d1600af3..20242360b1d4 100644
--- a/net/6lowpan/nhc_ipv6.c
+++ b/net/6lowpan/nhc_ipv6.c
@@ -1,26 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 6LoWPAN IPv6 Header compression according to RFC6282
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include "nhc.h"
-#define LOWPAN_NHC_IPV6_IDLEN 1
#define LOWPAN_NHC_IPV6_ID_0 0xee
#define LOWPAN_NHC_IPV6_MASK_0 0xfe
-static void ipv6_nhid_setup(struct lowpan_nhc *nhc)
-{
- nhc->id[0] = LOWPAN_NHC_IPV6_ID_0;
- nhc->idmask[0] = LOWPAN_NHC_IPV6_MASK_0;
-}
-
-LOWPAN_NHC(nhc_ipv6, "RFC6282 IPv6", NEXTHDR_IPV6, 0, ipv6_nhid_setup,
- LOWPAN_NHC_IPV6_IDLEN, NULL, NULL);
+LOWPAN_NHC(nhc_ipv6, "RFC6282 IPv6", NEXTHDR_IPV6, 0, LOWPAN_NHC_IPV6_ID_0,
+ LOWPAN_NHC_IPV6_MASK_0, NULL, NULL);
module_lowpan_nhc(nhc_ipv6);
MODULE_DESCRIPTION("6LoWPAN next header RFC6282 IPv6 compression");
diff --git a/net/6lowpan/nhc_mobility.c b/net/6lowpan/nhc_mobility.c
index 60d3f3886c98..1c31d872c804 100644
--- a/net/6lowpan/nhc_mobility.c
+++ b/net/6lowpan/nhc_mobility.c
@@ -1,26 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 6LoWPAN IPv6 Mobility Header compression according to RFC6282
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include "nhc.h"
-#define LOWPAN_NHC_MOBILITY_IDLEN 1
#define LOWPAN_NHC_MOBILITY_ID_0 0xe8
#define LOWPAN_NHC_MOBILITY_MASK_0 0xfe
-static void mobility_nhid_setup(struct lowpan_nhc *nhc)
-{
- nhc->id[0] = LOWPAN_NHC_MOBILITY_ID_0;
- nhc->idmask[0] = LOWPAN_NHC_MOBILITY_MASK_0;
-}
-
LOWPAN_NHC(nhc_mobility, "RFC6282 Mobility", NEXTHDR_MOBILITY, 0,
- mobility_nhid_setup, LOWPAN_NHC_MOBILITY_IDLEN, NULL, NULL);
+ LOWPAN_NHC_MOBILITY_ID_0, LOWPAN_NHC_MOBILITY_MASK_0, NULL, NULL);
module_lowpan_nhc(nhc_mobility);
MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Mobility compression");
diff --git a/net/6lowpan/nhc_routing.c b/net/6lowpan/nhc_routing.c
index c393280f11c4..dae03ebf7021 100644
--- a/net/6lowpan/nhc_routing.c
+++ b/net/6lowpan/nhc_routing.c
@@ -1,26 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 6LoWPAN IPv6 Routing Header compression according to RFC6282
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include "nhc.h"
-#define LOWPAN_NHC_ROUTING_IDLEN 1
#define LOWPAN_NHC_ROUTING_ID_0 0xe2
#define LOWPAN_NHC_ROUTING_MASK_0 0xfe
-static void routing_nhid_setup(struct lowpan_nhc *nhc)
-{
- nhc->id[0] = LOWPAN_NHC_ROUTING_ID_0;
- nhc->idmask[0] = LOWPAN_NHC_ROUTING_MASK_0;
-}
-
LOWPAN_NHC(nhc_routing, "RFC6282 Routing", NEXTHDR_ROUTING, 0,
- routing_nhid_setup, LOWPAN_NHC_ROUTING_IDLEN, NULL, NULL);
+ LOWPAN_NHC_ROUTING_ID_0, LOWPAN_NHC_ROUTING_MASK_0, NULL, NULL);
module_lowpan_nhc(nhc_routing);
MODULE_DESCRIPTION("6LoWPAN next header RFC6282 Routing compression");
diff --git a/net/6lowpan/nhc_udp.c b/net/6lowpan/nhc_udp.c
index 225d91906dfa..0a506c77283d 100644
--- a/net/6lowpan/nhc_udp.c
+++ b/net/6lowpan/nhc_udp.c
@@ -1,25 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* 6LoWPAN IPv6 UDP compression according to RFC6282
*
- *
* Authors:
* Alexander Aring <aar@pengutronix.de>
*
- * Orignal written by:
+ * Original written by:
* Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
* Jon Smirl <jonsmirl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include "nhc.h"
#define LOWPAN_NHC_UDP_MASK 0xF8
#define LOWPAN_NHC_UDP_ID 0xF0
-#define LOWPAN_NHC_UDP_IDLEN 1
#define LOWPAN_NHC_UDP_4BIT_PORT 0xF0B0
#define LOWPAN_NHC_UDP_4BIT_MASK 0xFFF0
@@ -87,7 +81,7 @@ static int udp_uncompress(struct sk_buff *skb, size_t needed)
if (fail)
return -EINVAL;
- /* UDP length needs to be infered from the lower layers
+ /* UDP length needs to be inferred from the lower layers
* here, we obtain the hint from the remaining size of the
* frame
*/
@@ -174,14 +168,8 @@ static int udp_compress(struct sk_buff *skb, u8 **hc_ptr)
return 0;
}
-static void udp_nhid_setup(struct lowpan_nhc *nhc)
-{
- nhc->id[0] = LOWPAN_NHC_UDP_ID;
- nhc->idmask[0] = LOWPAN_NHC_UDP_MASK;
-}
-
LOWPAN_NHC(nhc_udp, "RFC6282 UDP", NEXTHDR_UDP, sizeof(struct udphdr),
- udp_nhid_setup, LOWPAN_NHC_UDP_IDLEN, udp_uncompress, udp_compress);
+ LOWPAN_NHC_UDP_ID, LOWPAN_NHC_UDP_MASK, udp_uncompress, udp_compress);
module_lowpan_nhc(nhc_udp);
MODULE_DESCRIPTION("6LoWPAN next header RFC6282 UDP compression");
diff --git a/net/802/Kconfig b/net/802/Kconfig
index 80d4bf78905d..aaa83e888240 100644
--- a/net/802/Kconfig
+++ b/net/802/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config STP
tristate
select LLC
diff --git a/net/802/Makefile b/net/802/Makefile
index 19406a87bdaa..99abc29d537c 100644
--- a/net/802/Makefile
+++ b/net/802/Makefile
@@ -3,13 +3,11 @@
# Makefile for the Linux 802.x protocol layers.
#
-# Check the p8022 selections against net/core/Makefile.
-obj-$(CONFIG_LLC) += p8022.o psnap.o
+obj-$(CONFIG_LLC) += psnap.o
obj-$(CONFIG_NET_FC) += fc.o
obj-$(CONFIG_FDDI) += fddi.o
obj-$(CONFIG_HIPPI) += hippi.o
-obj-$(CONFIG_IPX) += p8022.o psnap.o p8023.o
-obj-$(CONFIG_ATALK) += p8022.o psnap.o
+obj-$(CONFIG_ATALK) += psnap.o
obj-$(CONFIG_STP) += stp.o
obj-$(CONFIG_GARP) += garp.o
obj-$(CONFIG_MRP) += mrp.o
diff --git a/net/802/fc.c b/net/802/fc.c
index 058a9f708918..afd3d288a41d 100644
--- a/net/802/fc.c
+++ b/net/802/fc.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NET3: Fibre Channel device handling subroutines
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Vineet Abraham <vma@iol.unh.edu>
* v 1.0 03/22/99
*/
diff --git a/net/802/fddi.c b/net/802/fddi.c
index 90f1416567a1..888379ae35ec 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -16,11 +17,6 @@
* Florian La Roche, <rzsfl@rz.uni-sb.de>
* Alan Cox, <gw4pts@gw4pts.ampr.org>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes
* Alan Cox : New arp/rebuild header
* Maciej W. Rozycki : IPv6 support
@@ -179,4 +175,5 @@ struct net_device *alloc_fddidev(int sizeof_priv)
}
EXPORT_SYMBOL(alloc_fddidev);
+MODULE_DESCRIPTION("Core routines for FDDI network devices");
MODULE_LICENSE("GPL");
diff --git a/net/802/garp.c b/net/802/garp.c
index 7f50d47470bd..2d1ffc4d9462 100644
--- a/net/802/garp.c
+++ b/net/802/garp.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IEEE 802.1D Generic Attribute Registration Protocol (GARP)
*
* Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/kernel.h>
#include <linux/timer.h>
@@ -19,11 +16,12 @@
#include <net/llc.h>
#include <net/llc_pdu.h>
#include <net/garp.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
static unsigned int garp_join_time __read_mostly = 200;
module_param(garp_join_time, uint, 0644);
MODULE_PARM_DESC(garp_join_time, "Join time in ms (default 200ms)");
+MODULE_DESCRIPTION("IEEE 802.1D Generic Attribute Registration Protocol (GARP)");
MODULE_LICENSE("GPL");
static const struct garp_state_trans {
@@ -206,6 +204,19 @@ static void garp_attr_destroy(struct garp_applicant *app, struct garp_attr *attr
kfree(attr);
}
+static void garp_attr_destroy_all(struct garp_applicant *app)
+{
+ struct rb_node *node, *next;
+ struct garp_attr *attr;
+
+ for (node = rb_first(&app->gid);
+ next = node ? rb_next(node) : NULL, node != NULL;
+ node = next) {
+ attr = rb_entry(node, struct garp_attr, node);
+ garp_attr_destroy(app, attr);
+ }
+}
+
static int garp_pdu_init(struct garp_applicant *app)
{
struct sk_buff *skb;
@@ -397,13 +408,13 @@ static void garp_join_timer_arm(struct garp_applicant *app)
{
unsigned long delay;
- delay = (u64)msecs_to_jiffies(garp_join_time) * prandom_u32() >> 32;
+ delay = get_random_u32_below(msecs_to_jiffies(garp_join_time));
mod_timer(&app->join_timer, jiffies + delay);
}
static void garp_join_timer(struct timer_list *t)
{
- struct garp_applicant *app = from_timer(app, t, join_timer);
+ struct garp_applicant *app = timer_container_of(app, t, join_timer);
spin_lock(&app->lock);
garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU);
@@ -608,10 +619,11 @@ void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl
/* Delete timer and generate a final TRANSMIT_PDU event to flush out
* all pending messages before the applicant is gone. */
- del_timer_sync(&app->join_timer);
+ timer_shutdown_sync(&app->join_timer);
spin_lock_bh(&app->lock);
garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU);
+ garp_attr_destroy_all(app);
garp_pdu_queue(app);
spin_unlock_bh(&app->lock);
diff --git a/net/802/hippi.c b/net/802/hippi.c
index 690308b9b94a..1997b7dd265e 100644
--- a/net/802/hippi.c
+++ b/net/802/hippi.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -13,11 +14,6 @@
* Florian La Roche, <rzsfl@rz.uni-sb.de>
* Alan Cox, <gw4pts@gw4pts.ampr.org>
* Jes Sorensen, <Jes.Sorensen@cern.ch>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
@@ -69,7 +65,7 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev,
hip->le.src_addr_type = 2; /* 12 bit SC address */
memcpy(hip->le.src_switch_addr, dev->dev_addr + 3, 3);
- memset(&hip->le.reserved, 0, 16);
+ memset_startat(&hip->le, 0, reserved);
hip->snap.dsap = HIPPI_EXTENDED_SAP;
hip->snap.ssap = HIPPI_EXTENDED_SAP;
@@ -125,7 +121,7 @@ int hippi_mac_addr(struct net_device *dev, void *p)
struct sockaddr *addr = p;
if (netif_running(dev))
return -EBUSY;
- memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+ dev_addr_set(dev, addr->sa_data);
return 0;
}
EXPORT_SYMBOL(hippi_mac_addr);
diff --git a/net/802/mrp.c b/net/802/mrp.c
index a808dd5bbb27..23a88305f900 100644
--- a/net/802/mrp.c
+++ b/net/802/mrp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IEEE 802.1Q Multiple Registration Protocol (MRP)
*
@@ -5,10 +6,6 @@
*
* Adapted from code in net/802/garp.c
* Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/kernel.h>
#include <linux/timer.h>
@@ -19,7 +16,7 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <net/mrp.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
static unsigned int mrp_join_time __read_mostly = 200;
module_param(mrp_join_time, uint, 0644);
@@ -29,6 +26,7 @@ static unsigned int mrp_periodic_time __read_mostly = 1000;
module_param(mrp_periodic_time, uint, 0644);
MODULE_PARM_DESC(mrp_periodic_time, "Periodic time in ms (default 1s)");
+MODULE_DESCRIPTION("IEEE 802.1Q Multiple Registration Protocol (MRP)");
MODULE_LICENSE("GPL");
static const u8
@@ -295,6 +293,19 @@ static void mrp_attr_destroy(struct mrp_applicant *app, struct mrp_attr *attr)
kfree(attr);
}
+static void mrp_attr_destroy_all(struct mrp_applicant *app)
+{
+ struct rb_node *node, *next;
+ struct mrp_attr *attr;
+
+ for (node = rb_first(&app->mad);
+ next = node ? rb_next(node) : NULL, node != NULL;
+ node = next) {
+ attr = rb_entry(node, struct mrp_attr, node);
+ mrp_attr_destroy(app, attr);
+ }
+}
+
static int mrp_pdu_init(struct mrp_applicant *app)
{
struct sk_buff *skb;
@@ -526,7 +537,7 @@ int mrp_request_join(const struct net_device *dev,
struct mrp_attr *attr;
if (sizeof(struct mrp_skb_cb) + len >
- FIELD_SIZEOF(struct sk_buff, cb))
+ sizeof_field(struct sk_buff, cb))
return -ENOMEM;
spin_lock_bh(&app->lock);
@@ -551,7 +562,7 @@ void mrp_request_leave(const struct net_device *dev,
struct mrp_attr *attr;
if (sizeof(struct mrp_skb_cb) + len >
- FIELD_SIZEOF(struct sk_buff, cb))
+ sizeof_field(struct sk_buff, cb))
return;
spin_lock_bh(&app->lock);
@@ -582,13 +593,13 @@ static void mrp_join_timer_arm(struct mrp_applicant *app)
{
unsigned long delay;
- delay = (u64)msecs_to_jiffies(mrp_join_time) * prandom_u32() >> 32;
+ delay = get_random_u32_below(msecs_to_jiffies(mrp_join_time));
mod_timer(&app->join_timer, jiffies + delay);
}
static void mrp_join_timer(struct timer_list *t)
{
- struct mrp_applicant *app = from_timer(app, t, join_timer);
+ struct mrp_applicant *app = timer_container_of(app, t, join_timer);
spin_lock(&app->lock);
mrp_mad_event(app, MRP_EVENT_TX);
@@ -596,7 +607,10 @@ static void mrp_join_timer(struct timer_list *t)
spin_unlock(&app->lock);
mrp_queue_xmit(app);
- mrp_join_timer_arm(app);
+ spin_lock(&app->lock);
+ if (likely(app->active))
+ mrp_join_timer_arm(app);
+ spin_unlock(&app->lock);
}
static void mrp_periodic_timer_arm(struct mrp_applicant *app)
@@ -607,14 +621,15 @@ static void mrp_periodic_timer_arm(struct mrp_applicant *app)
static void mrp_periodic_timer(struct timer_list *t)
{
- struct mrp_applicant *app = from_timer(app, t, periodic_timer);
+ struct mrp_applicant *app = timer_container_of(app, t, periodic_timer);
spin_lock(&app->lock);
- mrp_mad_event(app, MRP_EVENT_PERIODIC);
- mrp_pdu_queue(app);
+ if (likely(app->active)) {
+ mrp_mad_event(app, MRP_EVENT_PERIODIC);
+ mrp_pdu_queue(app);
+ mrp_periodic_timer_arm(app);
+ }
spin_unlock(&app->lock);
-
- mrp_periodic_timer_arm(app);
}
static int mrp_pdu_parse_end_mark(struct sk_buff *skb, int *offset)
@@ -695,7 +710,7 @@ static int mrp_pdu_parse_vecattr(struct mrp_applicant *app,
* advance to the next event in its Vector.
*/
if (sizeof(struct mrp_skb_cb) + mrp_cb(skb)->mh->attrlen >
- FIELD_SIZEOF(struct sk_buff, cb))
+ sizeof_field(struct sk_buff, cb))
return -1;
if (skb_copy_bits(skb, *offset, mrp_cb(skb)->attrvalue,
mrp_cb(skb)->mh->attrlen) < 0)
@@ -862,6 +877,7 @@ int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl)
app->dev = dev;
app->app = appl;
app->mad = RB_ROOT;
+ app->active = true;
spin_lock_init(&app->lock);
skb_queue_head_init(&app->queue);
rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app);
@@ -890,14 +906,18 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl)
RCU_INIT_POINTER(port->applicants[appl->type], NULL);
+ spin_lock_bh(&app->lock);
+ app->active = false;
+ spin_unlock_bh(&app->lock);
/* Delete timer and generate a final TX event to flush out
* all pending messages before the applicant is gone.
*/
- del_timer_sync(&app->join_timer);
- del_timer_sync(&app->periodic_timer);
+ timer_shutdown_sync(&app->join_timer);
+ timer_shutdown_sync(&app->periodic_timer);
spin_lock_bh(&app->lock);
mrp_mad_event(app, MRP_EVENT_TX);
+ mrp_attr_destroy_all(app);
mrp_pdu_queue(app);
spin_unlock_bh(&app->lock);
diff --git a/net/802/p8022.c b/net/802/p8022.c
deleted file mode 100644
index 0bda8de7df51..000000000000
--- a/net/802/p8022.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * NET3: Support for 802.2 demultiplexing off Ethernet
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * Demultiplex 802.2 encoded protocols. We match the entry by the
- * SSAP/DSAP pair and then deliver to the registered datalink that
- * matches. The control byte is ignored and handling of such items
- * is up to the routine passed the frame.
- *
- * Unlike the 802.3 datalink we have a list of 802.2 entries as
- * there are multiple protocols to demux. The list is currently
- * short (3 or 4 entries at most). The current demux assumes this.
- */
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <net/datalink.h>
-#include <linux/mm.h>
-#include <linux/in.h>
-#include <linux/init.h>
-#include <net/llc.h>
-#include <net/p8022.h>
-
-static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb,
- unsigned char *dest)
-{
- llc_build_and_send_ui_pkt(dl->sap, skb, dest, dl->sap->laddr.lsap);
- return 0;
-}
-
-struct datalink_proto *register_8022_client(unsigned char type,
- int (*func)(struct sk_buff *skb,
- struct net_device *dev,
- struct packet_type *pt,
- struct net_device *orig_dev))
-{
- struct datalink_proto *proto;
-
- proto = kmalloc(sizeof(*proto), GFP_ATOMIC);
- if (proto) {
- proto->type[0] = type;
- proto->header_length = 3;
- proto->request = p8022_request;
- proto->sap = llc_sap_open(type, func);
- if (!proto->sap) {
- kfree(proto);
- proto = NULL;
- }
- }
- return proto;
-}
-
-void unregister_8022_client(struct datalink_proto *proto)
-{
- llc_sap_put(proto->sap);
- kfree(proto);
-}
-
-EXPORT_SYMBOL(register_8022_client);
-EXPORT_SYMBOL(unregister_8022_client);
-
-MODULE_LICENSE("GPL");
diff --git a/net/802/p8023.c b/net/802/p8023.c
deleted file mode 100644
index 1256a40da43c..000000000000
--- a/net/802/p8023.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * NET3: 802.3 data link hooks used for IPX 802.3
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * 802.3 isn't really a protocol data link layer. Some old IPX stuff
- * uses it however. Note that there is only one 802.3 protocol layer
- * in the system. We don't currently support different protocols
- * running raw 802.3 on different devices. Thankfully nobody else
- * has done anything like the old IPX.
- */
-
-#include <linux/in.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-
-#include <net/datalink.h>
-#include <net/p8022.h>
-
-/*
- * Place an 802.3 header on a packet. The driver will do the mac
- * addresses, we just need to give it the buffer length.
- */
-static int p8023_request(struct datalink_proto *dl,
- struct sk_buff *skb, unsigned char *dest_node)
-{
- struct net_device *dev = skb->dev;
-
- dev_hard_header(skb, dev, ETH_P_802_3, dest_node, NULL, skb->len);
- return dev_queue_xmit(skb);
-}
-
-/*
- * Create an 802.3 client. Note there can be only one 802.3 client
- */
-struct datalink_proto *make_8023_client(void)
-{
- struct datalink_proto *proto = kmalloc(sizeof(*proto), GFP_ATOMIC);
-
- if (proto) {
- proto->header_length = 0;
- proto->request = p8023_request;
- }
- return proto;
-}
-
-/*
- * Destroy the 802.3 client.
- */
-void destroy_8023_client(struct datalink_proto *dl)
-{
- kfree(dl);
-}
-
-EXPORT_SYMBOL(destroy_8023_client);
-EXPORT_SYMBOL(make_8023_client);
-
-MODULE_LICENSE("GPL");
diff --git a/net/802/psnap.c b/net/802/psnap.c
index db6baf7cf6e9..389df460c8c4 100644
--- a/net/802/psnap.c
+++ b/net/802/psnap.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* SNAP data link layer. Derived from 802.2
*
* Alan Cox <alan@lxorguk.ukuu.org.uk>,
* from the 802.2 layer by Greg Page.
* Merged in additions from Greg Page's psnap.c.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
@@ -34,7 +30,7 @@ static struct datalink_proto *find_snap_client(const unsigned char *desc)
{
struct datalink_proto *proto = NULL, *p;
- list_for_each_entry_rcu(p, &snap_list, node) {
+ list_for_each_entry_rcu(p, &snap_list, node, lockdep_is_held(&snap_lock)) {
if (!memcmp(p->type, desc, 5)) {
proto = p;
break;
@@ -59,11 +55,11 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev,
goto drop;
rcu_read_lock();
- proto = find_snap_client(skb_transport_header(skb));
+ proto = find_snap_client(skb->data);
if (proto) {
/* Pass the frame on. */
- skb->transport_header += 5;
skb_pull_rcsum(skb, 5);
+ skb_reset_transport_header(skb);
rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev);
}
rcu_read_unlock();
@@ -83,7 +79,7 @@ drop:
* Put a SNAP header on a frame and pass to 802.2
*/
static int snap_request(struct datalink_proto *dl,
- struct sk_buff *skb, u8 *dest)
+ struct sk_buff *skb, const u8 *dest)
{
memcpy(skb_push(skb, 5), dl->type, 5);
llc_build_and_send_ui_pkt(snap_sap, skb, dest, snap_sap->laddr.lsap);
@@ -164,4 +160,5 @@ void unregister_snap_client(struct datalink_proto *proto)
kfree(proto);
}
+MODULE_DESCRIPTION("SNAP data link layer. Derived from 802.2");
MODULE_LICENSE("GPL");
diff --git a/net/802/stp.c b/net/802/stp.c
index 2c40ba0ec116..03c9f75e92c9 100644
--- a/net/802/stp.c
+++ b/net/802/stp.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* STP SAP demux
*
* Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/mutex.h>
#include <linux/skbuff.h>
@@ -101,4 +98,5 @@ void stp_proto_unregister(const struct stp_proto *proto)
}
EXPORT_SYMBOL_GPL(stp_proto_unregister);
+MODULE_DESCRIPTION("SAP demux for IEEE 802.1D Spanning Tree Protocol (STP)");
MODULE_LICENSE("GPL");
diff --git a/net/8021q/Kconfig b/net/8021q/Kconfig
index 42320180967f..8bf7a1765b78 100644
--- a/net/8021q/Kconfig
+++ b/net/8021q/Kconfig
@@ -1,10 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Configuration for 802.1Q VLAN support
#
config VLAN_8021Q
tristate "802.1Q/802.1ad VLAN Support"
- ---help---
+ help
Select this and you will be able to create 802.1Q VLAN interfaces
on your Ethernet interfaces. 802.1Q VLAN supports almost
everything a regular Ethernet interface does, including
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 5e9950453955..2b74ed56eb16 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET 802.1Q VLAN
* Ethernet-type device handling.
@@ -11,11 +12,6 @@
* Add HW acceleration hooks - David S. Miller <davem@redhat.com>;
* Correct all the locking - David S. Miller <davem@redhat.com>;
* Use hash table for VLAN groups - David S. Miller <davem@redhat.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -27,7 +23,6 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/rculist.h>
-#include <net/p8022.h>
#include <net/arp.h>
#include <linux/rtnetlink.h>
#include <linux/notifier.h>
@@ -55,26 +50,41 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg,
__be16 vlan_proto, u16 vlan_id)
{
struct net_device **array;
- unsigned int pidx, vidx;
+ unsigned int vidx;
unsigned int size;
+ int pidx;
ASSERT_RTNL();
pidx = vlan_proto_idx(vlan_proto);
+ if (pidx < 0)
+ return -EINVAL;
+
vidx = vlan_id / VLAN_GROUP_ARRAY_PART_LEN;
array = vg->vlan_devices_arrays[pidx][vidx];
if (array != NULL)
return 0;
size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN;
- array = kzalloc(size, GFP_KERNEL);
+ array = kzalloc(size, GFP_KERNEL_ACCOUNT);
if (array == NULL)
return -ENOBUFS;
+ /* paired with smp_rmb() in __vlan_group_get_device() */
+ smp_wmb();
+
vg->vlan_devices_arrays[pidx][vidx] = array;
return 0;
}
+static void vlan_stacked_transfer_operstate(const struct net_device *rootdev,
+ struct net_device *dev,
+ struct vlan_dev_priv *vlan)
+{
+ if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING))
+ netif_stacked_transfer_operstate(rootdev, dev);
+}
+
void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
{
struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
@@ -112,9 +122,6 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head)
}
vlan_vid_del(real_dev, vlan->vlan_proto, vlan_id);
-
- /* Get rid of the vlan's reference to real_dev */
- dev_put(real_dev);
}
int vlan_check_real_dev(struct net_device *real_dev,
@@ -123,7 +130,8 @@ int vlan_check_real_dev(struct net_device *real_dev,
{
const char *name = real_dev->name;
- if (real_dev->features & NETIF_F_VLAN_CHALLENGED) {
+ if (real_dev->features & NETIF_F_VLAN_CHALLENGED ||
+ real_dev->type != ARPHRD_ETHER) {
pr_info("VLANs not supported on %s\n", name);
NL_SET_ERR_MSG_MOD(extack, "VLANs not supported on device");
return -EOPNOTSUPP;
@@ -168,7 +176,6 @@ int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack)
if (err < 0)
goto out_uninit_mvrp;
- vlan->nest_level = dev_get_nest_level(real_dev) + 1;
err = register_netdevice(dev);
if (err < 0)
goto out_uninit_mvrp;
@@ -177,10 +184,7 @@ int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack)
if (err)
goto out_unregister_netdev;
- /* Account for reference in struct vlan_dev_priv */
- dev_hold(real_dev);
-
- netif_stacked_transfer_operstate(real_dev, dev);
+ vlan_stacked_transfer_operstate(real_dev, dev, vlan);
linkwatch_fire_event(dev); /* _MUST_ call rfc2863_policy() */
/* So, got the sucker initialized, now lets place
@@ -189,6 +193,8 @@ int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack)
vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev);
grp->nr_vlan_devs++;
+ netdev_update_features(dev);
+
return 0;
out_unregister_netdev:
@@ -277,8 +283,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
return 0;
out_free_newdev:
- if (new_dev->reg_state == NETREG_UNINITIALIZED)
- free_netdev(new_dev);
+ free_netdev(new_dev);
return err;
}
@@ -316,8 +321,7 @@ static void vlan_transfer_features(struct net_device *dev,
{
struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
- vlandev->gso_max_size = dev->gso_max_size;
- vlandev->gso_max_segs = dev->gso_max_segs;
+ netif_inherit_tso_max(vlandev, dev);
if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto))
vlandev->hard_header_len = dev->hard_header_len;
@@ -330,6 +334,7 @@ static void vlan_transfer_features(struct net_device *dev,
vlandev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
vlandev->priv_flags |= (vlan->real_dev->priv_flags & IFF_XMIT_DST_RELEASE);
+ vlandev->hw_enc_features = vlan_tnl_features(vlan->real_dev);
netdev_update_features(vlandev);
}
@@ -354,9 +359,39 @@ static int __vlan_device_event(struct net_device *dev, unsigned long event)
return err;
}
+static void vlan_vid0_add(struct net_device *dev)
+{
+ struct vlan_info *vlan_info;
+ int err;
+
+ if (!(dev->features & NETIF_F_HW_VLAN_CTAG_FILTER))
+ return;
+
+ pr_info("adding VLAN 0 to HW filter on device %s\n", dev->name);
+
+ err = vlan_vid_add(dev, htons(ETH_P_8021Q), 0);
+ if (err)
+ return;
+
+ vlan_info = rtnl_dereference(dev->vlan_info);
+ vlan_info->auto_vid0 = true;
+}
+
+static void vlan_vid0_del(struct net_device *dev)
+{
+ struct vlan_info *vlan_info = rtnl_dereference(dev->vlan_info);
+
+ if (!vlan_info || !vlan_info->auto_vid0)
+ return;
+
+ vlan_info->auto_vid0 = false;
+ vlan_vid_del(dev, htons(ETH_P_8021Q), 0);
+}
+
static int vlan_device_event(struct notifier_block *unused, unsigned long event,
void *ptr)
{
+ struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct vlan_group *grp;
struct vlan_info *vlan_info;
@@ -374,15 +409,10 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
return notifier_from_errno(err);
}
- if ((event == NETDEV_UP) &&
- (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER)) {
- pr_info("adding VLAN 0 to HW filter on device %s\n",
- dev->name);
- vlan_vid_add(dev, htons(ETH_P_8021Q), 0);
- }
- if (event == NETDEV_DOWN &&
- (dev->features & NETIF_F_HW_VLAN_CTAG_FILTER))
- vlan_vid_del(dev, htons(ETH_P_8021Q), 0);
+ if (event == NETDEV_UP)
+ vlan_vid0_add(dev);
+ else if (event == NETDEV_DOWN)
+ vlan_vid0_del(dev);
vlan_info = rtnl_dereference(dev->vlan_info);
if (!vlan_info)
@@ -397,7 +427,8 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
case NETDEV_CHANGE:
/* Propagate real device state to vlan devices */
vlan_group_for_each_dev(grp, i, vlandev)
- netif_stacked_transfer_operstate(dev, vlandev);
+ vlan_stacked_transfer_operstate(dev, vlandev,
+ vlan_dev_priv(vlandev));
break;
case NETDEV_CHANGEADDR:
@@ -441,10 +472,11 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
list_add(&vlandev->close_list, &close_list);
}
- dev_close_many(&close_list, false);
+ netif_close_many(&close_list, false);
list_for_each_entry_safe(vlandev, tmp, &close_list, close_list) {
- netif_stacked_transfer_operstate(dev, vlandev);
+ vlan_stacked_transfer_operstate(dev, vlandev,
+ vlan_dev_priv(vlandev));
list_del_init(&vlandev->close_list);
}
list_del(&close_list);
@@ -453,14 +485,15 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
case NETDEV_UP:
/* Put all VLANs for this dev in the up state too. */
vlan_group_for_each_dev(grp, i, vlandev) {
- flgs = dev_get_flags(vlandev);
+ flgs = netif_get_flags(vlandev);
if (flgs & IFF_UP)
continue;
vlan = vlan_dev_priv(vlandev);
if (!(vlan->flags & VLAN_FLAG_LOOSE_BINDING))
- dev_change_flags(vlandev, flgs | IFF_UP);
- netif_stacked_transfer_operstate(dev, vlandev);
+ dev_change_flags(vlandev, flgs | IFF_UP,
+ extack);
+ vlan_stacked_transfer_operstate(dev, vlandev, vlan);
}
break;
@@ -624,7 +657,8 @@ static int vlan_ioctl_handler(struct net *net, void __user *arg)
case GET_VLAN_REALDEV_NAME_CMD:
err = 0;
- vlan_dev_get_realdev_name(dev, args.u.device2);
+ vlan_dev_get_realdev_name(dev, args.u.device2,
+ sizeof(args.u.device2));
if (copy_to_user(arg, &args,
sizeof(struct vlan_ioctl_args)))
err = -EFAULT;
@@ -647,93 +681,6 @@ out:
return err;
}
-static struct sk_buff *vlan_gro_receive(struct list_head *head,
- struct sk_buff *skb)
-{
- const struct packet_offload *ptype;
- unsigned int hlen, off_vlan;
- struct sk_buff *pp = NULL;
- struct vlan_hdr *vhdr;
- struct sk_buff *p;
- __be16 type;
- int flush = 1;
-
- off_vlan = skb_gro_offset(skb);
- hlen = off_vlan + sizeof(*vhdr);
- vhdr = skb_gro_header_fast(skb, off_vlan);
- if (skb_gro_header_hard(skb, hlen)) {
- vhdr = skb_gro_header_slow(skb, hlen, off_vlan);
- if (unlikely(!vhdr))
- goto out;
- }
-
- type = vhdr->h_vlan_encapsulated_proto;
-
- rcu_read_lock();
- ptype = gro_find_receive_by_type(type);
- if (!ptype)
- goto out_unlock;
-
- flush = 0;
-
- list_for_each_entry(p, head, list) {
- struct vlan_hdr *vhdr2;
-
- if (!NAPI_GRO_CB(p)->same_flow)
- continue;
-
- vhdr2 = (struct vlan_hdr *)(p->data + off_vlan);
- if (compare_vlan_header(vhdr, vhdr2))
- NAPI_GRO_CB(p)->same_flow = 0;
- }
-
- skb_gro_pull(skb, sizeof(*vhdr));
- skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr));
- pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
-
-out_unlock:
- rcu_read_unlock();
-out:
- skb_gro_flush_final(skb, pp, flush);
-
- return pp;
-}
-
-static int vlan_gro_complete(struct sk_buff *skb, int nhoff)
-{
- struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff);
- __be16 type = vhdr->h_vlan_encapsulated_proto;
- struct packet_offload *ptype;
- int err = -ENOENT;
-
- rcu_read_lock();
- ptype = gro_find_complete_by_type(type);
- if (ptype)
- err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr));
-
- rcu_read_unlock();
- return err;
-}
-
-static struct packet_offload vlan_packet_offloads[] __read_mostly = {
- {
- .type = cpu_to_be16(ETH_P_8021Q),
- .priority = 10,
- .callbacks = {
- .gro_receive = vlan_gro_receive,
- .gro_complete = vlan_gro_complete,
- },
- },
- {
- .type = cpu_to_be16(ETH_P_8021AD),
- .priority = 10,
- .callbacks = {
- .gro_receive = vlan_gro_receive,
- .gro_complete = vlan_gro_complete,
- },
- },
-};
-
static int __net_init vlan_init_net(struct net *net)
{
struct vlan_net *vn = net_generic(net, vlan_net_id);
@@ -761,7 +708,6 @@ static struct pernet_operations vlan_net_ops = {
static int __init vlan_proto_init(void)
{
int err;
- unsigned int i;
pr_info("%s v%s\n", vlan_fullname, vlan_version);
@@ -785,9 +731,6 @@ static int __init vlan_proto_init(void)
if (err < 0)
goto err5;
- for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++)
- dev_add_offload(&vlan_packet_offloads[i]);
-
vlan_ioctl_set(vlan_ioctl_handler);
return 0;
@@ -805,13 +748,8 @@ err0:
static void __exit vlan_cleanup_module(void)
{
- unsigned int i;
-
vlan_ioctl_set(NULL);
- for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++)
- dev_remove_offload(&vlan_packet_offloads[i]);
-
vlan_netlink_fini();
unregister_netdevice_notifier(&vlan_notifier_block);
@@ -826,5 +764,7 @@ static void __exit vlan_cleanup_module(void)
module_init(vlan_proto_init);
module_exit(vlan_cleanup_module);
+MODULE_DESCRIPTION("802.1Q/802.1ad VLAN Protocol");
MODULE_LICENSE("GPL");
MODULE_VERSION(DRV_VERSION);
+MODULE_IMPORT_NS("NETDEV_INTERNAL");
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 44df1c3df02d..c7ffe591d593 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -33,10 +33,11 @@ struct vlan_info {
struct vlan_group grp;
struct list_head vid_list;
unsigned int nr_vids;
+ bool auto_vid0;
struct rcu_head rcu;
};
-static inline unsigned int vlan_proto_idx(__be16 proto)
+static inline int vlan_proto_idx(__be16 proto)
{
switch (proto) {
case htons(ETH_P_8021Q):
@@ -44,8 +45,8 @@ static inline unsigned int vlan_proto_idx(__be16 proto)
case htons(ETH_P_8021AD):
return VLAN_PROTO_8021AD;
default:
- BUG();
- return 0;
+ WARN(1, "invalid VLAN protocol: 0x%04x\n", ntohs(proto));
+ return -EINVAL;
}
}
@@ -57,6 +58,10 @@ static inline struct net_device *__vlan_group_get_device(struct vlan_group *vg,
array = vg->vlan_devices_arrays[pidx]
[vlan_id / VLAN_GROUP_ARRAY_PART_LEN];
+
+ /* paired with smp_wmb() in vlan_group_prealloc_vid() */
+ smp_rmb();
+
return array ? array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] : NULL;
}
@@ -64,17 +69,24 @@ static inline struct net_device *vlan_group_get_device(struct vlan_group *vg,
__be16 vlan_proto,
u16 vlan_id)
{
- return __vlan_group_get_device(vg, vlan_proto_idx(vlan_proto), vlan_id);
+ int pidx = vlan_proto_idx(vlan_proto);
+
+ if (pidx < 0)
+ return NULL;
+
+ return __vlan_group_get_device(vg, pidx, vlan_id);
}
static inline void vlan_group_set_device(struct vlan_group *vg,
__be16 vlan_proto, u16 vlan_id,
struct net_device *dev)
{
+ int pidx = vlan_proto_idx(vlan_proto);
struct net_device **array;
- if (!vg)
+
+ if (!vg || pidx < 0)
return;
- array = vg->vlan_devices_arrays[vlan_proto_idx(vlan_proto)]
+ array = vg->vlan_devices_arrays[pidx]
[vlan_id / VLAN_GROUP_ARRAY_PART_LEN];
array[vlan_id % VLAN_GROUP_ARRAY_PART_LEN] = dev;
}
@@ -92,6 +104,19 @@ static inline struct net_device *vlan_find_dev(struct net_device *real_dev,
return NULL;
}
+static inline netdev_features_t vlan_tnl_features(struct net_device *real_dev)
+{
+ netdev_features_t ret;
+
+ ret = real_dev->hw_enc_features &
+ (NETIF_F_CSUM_MASK | NETIF_F_GSO_SOFTWARE |
+ NETIF_F_GSO_ENCAP_ALL);
+
+ if ((ret & NETIF_F_GSO_ENCAP_ALL) && (ret & NETIF_F_CSUM_MASK))
+ return (ret & ~NETIF_F_CSUM_MASK) | NETIF_F_HW_CSUM;
+ return 0;
+}
+
#define vlan_group_for_each_dev(grp, i, dev) \
for ((i) = 0; i < VLAN_PROTO_NUM * VLAN_N_VID; i++) \
if (((dev) = __vlan_group_get_device((grp), (i) / VLAN_N_VID, \
@@ -105,8 +130,10 @@ void vlan_dev_set_ingress_priority(const struct net_device *dev,
u32 skb_prio, u16 vlan_prio);
int vlan_dev_set_egress_priority(const struct net_device *dev,
u32 skb_prio, u16 vlan_prio);
+void vlan_dev_free_egress_priority(const struct net_device *dev);
int vlan_dev_change_flags(const struct net_device *dev, u32 flag, u32 mask);
-void vlan_dev_get_realdev_name(const struct net_device *dev, char *result);
+void vlan_dev_get_realdev_name(const struct net_device *dev, char *result,
+ size_t size);
int vlan_check_real_dev(struct net_device *real_dev,
__be16 protocol, u16 vlan_id,
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 4f60e86f4b8d..9404dd551dfd 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -4,6 +4,7 @@
#include <linux/if_vlan.h>
#include <linux/netpoll.h>
#include <linux/export.h>
+#include <net/gro.h>
#include "vlan.h"
bool vlan_do_receive(struct sk_buff **skbp)
@@ -57,15 +58,15 @@ bool vlan_do_receive(struct sk_buff **skbp)
}
skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci);
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
rx_stats = this_cpu_ptr(vlan_dev_priv(vlan_dev)->vlan_pcpu_stats);
u64_stats_update_begin(&rx_stats->syncp);
- rx_stats->rx_packets++;
- rx_stats->rx_bytes += skb->len;
+ u64_stats_inc(&rx_stats->rx_packets);
+ u64_stats_add(&rx_stats->rx_bytes, skb->len);
if (skb->pkt_type == PACKET_MULTICAST)
- rx_stats->rx_multicast++;
+ u64_stats_inc(&rx_stats->rx_multicast);
u64_stats_update_end(&rx_stats->syncp);
return true;
@@ -223,6 +224,33 @@ static int vlan_kill_rx_filter_info(struct net_device *dev, __be16 proto, u16 vi
return -ENODEV;
}
+int vlan_for_each(struct net_device *dev,
+ int (*action)(struct net_device *dev, int vid, void *arg),
+ void *arg)
+{
+ struct vlan_vid_info *vid_info;
+ struct vlan_info *vlan_info;
+ struct net_device *vdev;
+ int ret;
+
+ ASSERT_RTNL();
+
+ vlan_info = rtnl_dereference(dev->vlan_info);
+ if (!vlan_info)
+ return 0;
+
+ list_for_each_entry(vid_info, &vlan_info->vid_list, list) {
+ vdev = vlan_group_get_device(&vlan_info->grp, vid_info->proto,
+ vid_info->vid);
+ ret = action(vdev, vid_info->vid, arg);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(vlan_for_each);
+
int vlan_filter_push_vids(struct vlan_info *vlan_info, __be16 proto)
{
struct net_device *real_dev = vlan_info->real_dev;
@@ -332,9 +360,8 @@ static void __vlan_vid_del(struct vlan_info *vlan_info,
int err;
err = vlan_kill_rx_filter_info(dev, proto, vid);
- if (err)
- pr_warn("failed to kill vid %04x/%d for device %s\n",
- proto, vid, dev->name);
+ if (err && dev->reg_state != NETREG_UNREGISTERING)
+ netdev_warn(dev, "failed to kill vid %04x/%d\n", proto, vid);
list_del(&vid_info->list);
kfree(vid_info);
@@ -380,6 +407,8 @@ int vlan_vids_add_by_dev(struct net_device *dev,
return 0;
list_for_each_entry(vid_info, &vlan_info->vid_list, list) {
+ if (!vlan_hw_filter_capable(by_dev, vid_info->proto))
+ continue;
err = vlan_vid_add(dev, vid_info->proto, vid_info->vid);
if (err)
goto unwind;
@@ -390,6 +419,8 @@ unwind:
list_for_each_entry_continue_reverse(vid_info,
&vlan_info->vid_list,
list) {
+ if (!vlan_hw_filter_capable(by_dev, vid_info->proto))
+ continue;
vlan_vid_del(dev, vid_info->proto, vid_info->vid);
}
@@ -409,8 +440,11 @@ void vlan_vids_del_by_dev(struct net_device *dev,
if (!vlan_info)
return;
- list_for_each_entry(vid_info, &vlan_info->vid_list, list)
+ list_for_each_entry(vid_info, &vlan_info->vid_list, list) {
+ if (!vlan_hw_filter_capable(by_dev, vid_info->proto))
+ continue;
vlan_vid_del(dev, vid_info->proto, vid_info->vid);
+ }
}
EXPORT_SYMBOL(vlan_vids_del_by_dev);
@@ -426,3 +460,101 @@ bool vlan_uses_dev(const struct net_device *dev)
return vlan_info->grp.nr_vlan_devs ? true : false;
}
EXPORT_SYMBOL(vlan_uses_dev);
+
+static struct sk_buff *vlan_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
+{
+ const struct packet_offload *ptype;
+ unsigned int hlen, off_vlan;
+ struct sk_buff *pp = NULL;
+ struct vlan_hdr *vhdr;
+ struct sk_buff *p;
+ __be16 type;
+ int flush = 1;
+
+ off_vlan = skb_gro_offset(skb);
+ hlen = off_vlan + sizeof(*vhdr);
+ vhdr = skb_gro_header(skb, hlen, off_vlan);
+ if (unlikely(!vhdr))
+ goto out;
+
+ NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = hlen;
+
+ type = vhdr->h_vlan_encapsulated_proto;
+
+ ptype = gro_find_receive_by_type(type);
+ if (!ptype)
+ goto out;
+
+ flush = 0;
+
+ list_for_each_entry(p, head, list) {
+ struct vlan_hdr *vhdr2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ vhdr2 = (struct vlan_hdr *)(p->data + off_vlan);
+ if (compare_vlan_header(vhdr, vhdr2))
+ NAPI_GRO_CB(p)->same_flow = 0;
+ }
+
+ skb_gro_pull(skb, sizeof(*vhdr));
+ skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr));
+
+ pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive,
+ ipv6_gro_receive, inet_gro_receive,
+ head, skb);
+
+out:
+ skb_gro_flush_final(skb, pp, flush);
+
+ return pp;
+}
+
+static int vlan_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff);
+ __be16 type = vhdr->h_vlan_encapsulated_proto;
+ struct packet_offload *ptype;
+ int err = -ENOENT;
+
+ ptype = gro_find_complete_by_type(type);
+ if (ptype)
+ err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
+ ipv6_gro_complete, inet_gro_complete,
+ skb, nhoff + sizeof(*vhdr));
+
+ return err;
+}
+
+static struct packet_offload vlan_packet_offloads[] __read_mostly = {
+ {
+ .type = cpu_to_be16(ETH_P_8021Q),
+ .priority = 10,
+ .callbacks = {
+ .gro_receive = vlan_gro_receive,
+ .gro_complete = vlan_gro_complete,
+ },
+ },
+ {
+ .type = cpu_to_be16(ETH_P_8021AD),
+ .priority = 10,
+ .callbacks = {
+ .gro_receive = vlan_gro_receive,
+ .gro_complete = vlan_gro_complete,
+ },
+ },
+};
+
+static int __init vlan_offload_init(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++)
+ dev_add_offload(&vlan_packet_offloads[i]);
+
+ return 0;
+}
+
+fs_initcall(vlan_offload_init);
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 546af0e73ac3..fbf296137b09 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* -*- linux-c -*-
* INET 802.1Q VLAN
* Ethernet-type device handling.
@@ -12,12 +13,6 @@
* Oct 20, 2001: Ard van Breeman:
* - Fix MC-list, finally.
* - Flush MC-list on VLAN destroy.
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -31,7 +26,8 @@
#include <linux/ethtool.h>
#include <linux/phy.h>
#include <net/arp.h>
-#include <net/switchdev.h>
+#include <net/macsec.h>
+#include <net/netdev_lock.h>
#include "vlan.h"
#include "vlanproc.h"
@@ -94,12 +90,11 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
static inline netdev_tx_t vlan_netpoll_send_skb(struct vlan_dev_priv *vlan, struct sk_buff *skb)
{
#ifdef CONFIG_NET_POLL_CONTROLLER
- if (vlan->netpoll)
- netpoll_send_skb(vlan->netpoll, skb);
+ return netpoll_send_skb(vlan->netpoll, skb);
#else
BUG();
-#endif
return NETDEV_TX_OK;
+#endif
}
static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
@@ -115,8 +110,8 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
* NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING
* OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs...
*/
- if (veth->h_vlan_proto != vlan->vlan_proto ||
- vlan->flags & VLAN_FLAG_REORDER_HDR) {
+ if (vlan->flags & VLAN_FLAG_REORDER_HDR ||
+ veth->h_vlan_proto != vlan->vlan_proto) {
u16 vlan_tci;
vlan_tci = vlan->vlan_id;
vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb->priority);
@@ -135,8 +130,8 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
stats = this_cpu_ptr(vlan->vlan_pcpu_stats);
u64_stats_update_begin(&stats->syncp);
- stats->tx_packets++;
- stats->tx_bytes += len;
+ u64_stats_inc(&stats->tx_packets);
+ u64_stats_add(&stats->tx_bytes, len);
u64_stats_update_end(&stats->syncp);
} else {
this_cpu_inc(vlan->vlan_pcpu_stats->tx_dropped);
@@ -155,7 +150,7 @@ static int vlan_dev_change_mtu(struct net_device *dev, int new_mtu)
if (max_mtu < new_mtu)
return -ERANGE;
- dev->mtu = new_mtu;
+ WRITE_ONCE(dev->mtu, new_mtu);
return 0;
}
@@ -224,7 +219,8 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask)
u32 old_flags = vlan->flags;
if (mask & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
- VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP))
+ VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP |
+ VLAN_FLAG_BRIDGE_BINDING))
return -EINVAL;
vlan->flags = (old_flags & ~mask) | (flags & mask);
@@ -245,9 +241,9 @@ int vlan_dev_change_flags(const struct net_device *dev, u32 flags, u32 mask)
return 0;
}
-void vlan_dev_get_realdev_name(const struct net_device *dev, char *result)
+void vlan_dev_get_realdev_name(const struct net_device *dev, char *result, size_t size)
{
- strncpy(result, vlan_dev_priv(dev)->real_dev->name, 23);
+ strscpy_pad(result, vlan_dev_priv(dev)->real_dev->name, size);
}
bool vlan_dev_inherit_address(struct net_device *dev,
@@ -256,7 +252,7 @@ bool vlan_dev_inherit_address(struct net_device *dev,
if (dev->addr_assign_type != NET_ADDR_STOLEN)
return false;
- ether_addr_copy(dev->dev_addr, real_dev->dev_addr);
+ eth_hw_addr_set(dev, real_dev->dev_addr);
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
return true;
}
@@ -278,17 +274,6 @@ static int vlan_dev_open(struct net_device *dev)
goto out;
}
- if (dev->flags & IFF_ALLMULTI) {
- err = dev_set_allmulti(real_dev, 1);
- if (err < 0)
- goto del_unicast;
- }
- if (dev->flags & IFF_PROMISC) {
- err = dev_set_promiscuity(real_dev, 1);
- if (err < 0)
- goto clear_allmulti;
- }
-
ether_addr_copy(vlan->real_dev_addr, real_dev->dev_addr);
if (vlan->flags & VLAN_FLAG_GVRP)
@@ -297,16 +282,11 @@ static int vlan_dev_open(struct net_device *dev)
if (vlan->flags & VLAN_FLAG_MVRP)
vlan_mvrp_request_join(dev);
- if (netif_carrier_ok(real_dev))
+ if (netif_carrier_ok(real_dev) &&
+ !(vlan->flags & VLAN_FLAG_BRIDGE_BINDING))
netif_carrier_on(dev);
return 0;
-clear_allmulti:
- if (dev->flags & IFF_ALLMULTI)
- dev_set_allmulti(real_dev, -1);
-del_unicast:
- if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr))
- dev_uc_del(real_dev, dev->dev_addr);
out:
netif_carrier_off(dev);
return err;
@@ -319,15 +299,12 @@ static int vlan_dev_stop(struct net_device *dev)
dev_mc_unsync(real_dev, dev);
dev_uc_unsync(real_dev, dev);
- if (dev->flags & IFF_ALLMULTI)
- dev_set_allmulti(real_dev, -1);
- if (dev->flags & IFF_PROMISC)
- dev_set_promiscuity(real_dev, -1);
if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr))
dev_uc_del(real_dev, dev->dev_addr);
- netif_carrier_off(dev);
+ if (!(vlan->flags & VLAN_FLAG_BRIDGE_BINDING))
+ netif_carrier_off(dev);
return 0;
}
@@ -353,28 +330,44 @@ static int vlan_dev_set_mac_address(struct net_device *dev, void *p)
dev_uc_del(real_dev, dev->dev_addr);
out:
- ether_addr_copy(dev->dev_addr, addr->sa_data);
+ eth_hw_addr_set(dev, addr->sa_data);
return 0;
}
+static int vlan_hwtstamp_get(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+
+ return generic_hwtstamp_get_lower(real_dev, cfg);
+}
+
+static int vlan_hwtstamp_set(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+
+ if (!net_eq(dev_net(dev), dev_net(real_dev)))
+ return -EOPNOTSUPP;
+
+ return generic_hwtstamp_set_lower(real_dev, cfg, extack);
+}
+
static int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
- const struct net_device_ops *ops = real_dev->netdev_ops;
struct ifreq ifrr;
int err = -EOPNOTSUPP;
- strncpy(ifrr.ifr_name, real_dev->name, IFNAMSIZ);
+ strscpy_pad(ifrr.ifr_name, real_dev->name, IFNAMSIZ);
ifrr.ifr_ifru = ifr->ifr_ifru;
switch (cmd) {
case SIOCGMIIPHY:
case SIOCGMIIREG:
case SIOCSMIIREG:
- case SIOCSHWTSTAMP:
- case SIOCGHWTSTAMP:
- if (netif_device_present(real_dev) && ops->ndo_do_ioctl)
- err = ops->ndo_do_ioctl(real_dev, &ifrr, cmd);
+ err = dev_eth_ioctl(real_dev, &ifrr, cmd);
break;
}
@@ -444,27 +437,29 @@ static int vlan_dev_fcoe_disable(struct net_device *dev)
return rc;
}
-static int vlan_dev_fcoe_get_wwn(struct net_device *dev, u64 *wwn, int type)
+static int vlan_dev_fcoe_ddp_target(struct net_device *dev, u16 xid,
+ struct scatterlist *sgl, unsigned int sgc)
{
struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
const struct net_device_ops *ops = real_dev->netdev_ops;
- int rc = -EINVAL;
+ int rc = 0;
+
+ if (ops->ndo_fcoe_ddp_target)
+ rc = ops->ndo_fcoe_ddp_target(real_dev, xid, sgl, sgc);
- if (ops->ndo_fcoe_get_wwn)
- rc = ops->ndo_fcoe_get_wwn(real_dev, wwn, type);
return rc;
}
+#endif
-static int vlan_dev_fcoe_ddp_target(struct net_device *dev, u16 xid,
- struct scatterlist *sgl, unsigned int sgc)
+#ifdef NETDEV_FCOE_WWNN
+static int vlan_dev_fcoe_get_wwn(struct net_device *dev, u64 *wwn, int type)
{
struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
const struct net_device_ops *ops = real_dev->netdev_ops;
- int rc = 0;
-
- if (ops->ndo_fcoe_ddp_target)
- rc = ops->ndo_fcoe_ddp_target(real_dev, xid, sgl, sgc);
+ int rc = -EINVAL;
+ if (ops->ndo_fcoe_get_wwn)
+ rc = ops->ndo_fcoe_get_wwn(real_dev, wwn, type);
return rc;
}
#endif
@@ -473,12 +468,10 @@ static void vlan_dev_change_rx_flags(struct net_device *dev, int change)
{
struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
- if (dev->flags & IFF_UP) {
- if (change & IFF_ALLMULTI)
- dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1);
- if (change & IFF_PROMISC)
- dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1);
- }
+ if (change & IFF_ALLMULTI)
+ dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1);
+ if (change & IFF_PROMISC)
+ dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1);
}
static void vlan_dev_set_rx_mode(struct net_device *vlan_dev)
@@ -487,39 +480,17 @@ static void vlan_dev_set_rx_mode(struct net_device *vlan_dev)
dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev);
}
-/*
- * vlan network devices have devices nesting below it, and are a special
- * "super class" of normal network devices; split their locks off into a
- * separate class since they always nest.
- */
-static struct lock_class_key vlan_netdev_xmit_lock_key;
-static struct lock_class_key vlan_netdev_addr_lock_key;
-
-static void vlan_dev_set_lockdep_one(struct net_device *dev,
- struct netdev_queue *txq,
- void *_subclass)
-{
- lockdep_set_class_and_subclass(&txq->_xmit_lock,
- &vlan_netdev_xmit_lock_key,
- *(int *)_subclass);
-}
-
-static void vlan_dev_set_lockdep_class(struct net_device *dev, int subclass)
+static __be16 vlan_parse_protocol(const struct sk_buff *skb)
{
- lockdep_set_class_and_subclass(&dev->addr_list_lock,
- &vlan_netdev_addr_lock_key,
- subclass);
- netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, &subclass);
-}
+ struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data);
-static int vlan_dev_get_lock_subclass(struct net_device *dev)
-{
- return vlan_dev_priv(dev)->nest_level;
+ return __vlan_get_protocol(skb, veth->h_vlan_proto, NULL);
}
static const struct header_ops vlan_header_ops = {
.create = vlan_dev_hard_header,
.parse = eth_header_parse,
+ .parse_protocol = vlan_parse_protocol,
};
static int vlan_passthru_hard_header(struct sk_buff *skb, struct net_device *dev,
@@ -539,9 +510,10 @@ static int vlan_passthru_hard_header(struct sk_buff *skb, struct net_device *dev
static const struct header_ops vlan_passthru_header_ops = {
.create = vlan_passthru_hard_header,
.parse = eth_header_parse,
+ .parse_protocol = vlan_parse_protocol,
};
-static struct device_type vlan_type = {
+static const struct device_type vlan_type = {
.name = "vlan",
};
@@ -549,7 +521,8 @@ static const struct net_device_ops vlan_netdev_ops;
static int vlan_dev_init(struct net_device *dev)
{
- struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct net_device *real_dev = vlan->real_dev;
netif_carrier_off(dev);
@@ -560,24 +533,35 @@ static int vlan_dev_init(struct net_device *dev)
(1<<__LINK_STATE_DORMANT))) |
(1<<__LINK_STATE_PRESENT);
+ if (vlan->flags & VLAN_FLAG_BRIDGE_BINDING)
+ dev->state |= (1 << __LINK_STATE_NOCARRIER);
+
dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE |
+ NETIF_F_GSO_ENCAP_ALL |
NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC |
- NETIF_F_ALL_FCOE;
+ NETIF_F_FCOE_CRC | NETIF_F_FSO;
+
+ if (real_dev->vlan_features & NETIF_F_HW_MACSEC)
+ dev->hw_features |= NETIF_F_HW_MACSEC;
- dev->features |= dev->hw_features | NETIF_F_LLTX;
- dev->gso_max_size = real_dev->gso_max_size;
- dev->gso_max_segs = real_dev->gso_max_segs;
+ dev->features |= dev->hw_features;
+ dev->lltx = true;
+ dev->fcoe_mtu = true;
+ netif_inherit_tso_max(dev, real_dev);
if (dev->features & NETIF_F_VLAN_FEATURES)
netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n");
- dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE;
+ dev->vlan_features = real_dev->vlan_features &
+ ~(NETIF_F_FCOE_CRC | NETIF_F_FSO);
+ dev->hw_enc_features = vlan_tnl_features(real_dev);
+ dev->mpls_features = real_dev->mpls_features;
/* ipv6 shared card related stuff */
dev->dev_id = real_dev->dev_id;
if (is_zero_ether_addr(dev->dev_addr)) {
- ether_addr_copy(dev->dev_addr, real_dev->dev_addr);
+ eth_hw_addr_set(dev, real_dev->dev_addr);
dev->addr_assign_type = NET_ADDR_STOLEN;
}
if (is_zero_ether_addr(dev->broadcast))
@@ -588,8 +572,7 @@ static int vlan_dev_init(struct net_device *dev)
#endif
dev->needed_headroom = real_dev->needed_headroom;
- if (vlan_hw_offload_capable(real_dev->features,
- vlan_dev_priv(dev)->vlan_proto)) {
+ if (vlan_hw_offload_capable(real_dev->features, vlan->vlan_proto)) {
dev->header_ops = &vlan_passthru_header_ops;
dev->hard_header_len = real_dev->hard_header_len;
} else {
@@ -601,16 +584,20 @@ static int vlan_dev_init(struct net_device *dev)
SET_NETDEV_DEVTYPE(dev, &vlan_type);
- vlan_dev_set_lockdep_class(dev, vlan_dev_get_lock_subclass(dev));
+ netdev_lockdep_set_classes(dev);
- vlan_dev_priv(dev)->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats);
- if (!vlan_dev_priv(dev)->vlan_pcpu_stats)
+ vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats);
+ if (!vlan->vlan_pcpu_stats)
return -ENOMEM;
+ /* Get vlan's reference to real_dev */
+ netdev_hold(real_dev, &vlan->dev_tracker, GFP_KERNEL);
+
return 0;
}
-static void vlan_dev_uninit(struct net_device *dev)
+/* Note: this function might be called multiple times for the same device. */
+void vlan_dev_free_egress_priority(const struct net_device *dev)
{
struct vlan_priority_tci_mapping *pm;
struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
@@ -624,6 +611,11 @@ static void vlan_dev_uninit(struct net_device *dev)
}
}
+static void vlan_dev_uninit(struct net_device *dev)
+{
+ vlan_dev_free_egress_priority(dev);
+}
+
static netdev_features_t vlan_dev_fix_features(struct net_device *dev,
netdev_features_t features)
{
@@ -642,7 +634,6 @@ static netdev_features_t vlan_dev_fix_features(struct net_device *dev,
lower_features |= NETIF_F_HW_CSUM;
features = netdev_intersect_features(features, lower_features);
features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE);
- features |= NETIF_F_LLTX;
return features;
}
@@ -658,29 +649,16 @@ static int vlan_ethtool_get_link_ksettings(struct net_device *dev,
static void vlan_ethtool_get_drvinfo(struct net_device *dev,
struct ethtool_drvinfo *info)
{
- strlcpy(info->driver, vlan_fullname, sizeof(info->driver));
- strlcpy(info->version, vlan_version, sizeof(info->version));
- strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
+ strscpy(info->driver, vlan_fullname, sizeof(info->driver));
+ strscpy(info->version, vlan_version, sizeof(info->version));
+ strscpy(info->fw_version, "N/A", sizeof(info->fw_version));
}
static int vlan_ethtool_get_ts_info(struct net_device *dev,
- struct ethtool_ts_info *info)
+ struct kernel_ethtool_ts_info *info)
{
const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
- const struct ethtool_ops *ops = vlan->real_dev->ethtool_ops;
- struct phy_device *phydev = vlan->real_dev->phydev;
-
- if (phydev && phydev->drv && phydev->drv->ts_info) {
- return phydev->drv->ts_info(phydev, info);
- } else if (ops->get_ts_info) {
- return ops->get_ts_info(vlan->real_dev, info);
- } else {
- info->so_timestamping = SOF_TIMESTAMPING_RX_SOFTWARE |
- SOF_TIMESTAMPING_SOFTWARE;
- info->phc_index = -1;
- }
-
- return 0;
+ return ethtool_get_ts_info_by_layer(vlan->real_dev, info);
}
static void vlan_dev_get_stats64(struct net_device *dev,
@@ -696,13 +674,13 @@ static void vlan_dev_get_stats64(struct net_device *dev,
p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i);
do {
- start = u64_stats_fetch_begin_irq(&p->syncp);
- rxpackets = p->rx_packets;
- rxbytes = p->rx_bytes;
- rxmulticast = p->rx_multicast;
- txpackets = p->tx_packets;
- txbytes = p->tx_bytes;
- } while (u64_stats_fetch_retry_irq(&p->syncp, start));
+ start = u64_stats_fetch_begin(&p->syncp);
+ rxpackets = u64_stats_read(&p->rx_packets);
+ rxbytes = u64_stats_read(&p->rx_bytes);
+ rxmulticast = u64_stats_read(&p->rx_multicast);
+ txpackets = u64_stats_read(&p->tx_packets);
+ txbytes = u64_stats_read(&p->tx_bytes);
+ } while (u64_stats_fetch_retry(&p->syncp, start));
stats->rx_packets += rxpackets;
stats->rx_bytes += rxbytes;
@@ -710,8 +688,8 @@ static void vlan_dev_get_stats64(struct net_device *dev,
stats->tx_packets += txpackets;
stats->tx_bytes += txbytes;
/* rx_errors & tx_dropped are u32 */
- rx_errors += p->rx_errors;
- tx_dropped += p->tx_dropped;
+ rx_errors += READ_ONCE(p->rx_errors);
+ tx_dropped += READ_ONCE(p->tx_dropped);
}
stats->rx_errors = rx_errors;
stats->tx_dropped = tx_dropped;
@@ -723,7 +701,7 @@ static void vlan_dev_poll_controller(struct net_device *dev)
return;
}
-static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo)
+static int vlan_dev_netpoll_setup(struct net_device *dev)
{
struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
struct net_device *real_dev = vlan->real_dev;
@@ -756,18 +734,272 @@ static void vlan_dev_netpoll_cleanup(struct net_device *dev)
return;
vlan->netpoll = NULL;
-
- __netpoll_free_async(netpoll);
+ __netpoll_free(netpoll);
}
#endif /* CONFIG_NET_POLL_CONTROLLER */
static int vlan_dev_get_iflink(const struct net_device *dev)
{
- struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+ const struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
+
+ return READ_ONCE(real_dev->ifindex);
+}
+
+static int vlan_dev_fill_forward_path(struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(ctx->dev);
+
+ path->type = DEV_PATH_VLAN;
+ path->encap.id = vlan->vlan_id;
+ path->encap.proto = vlan->vlan_proto;
+ path->dev = ctx->dev;
+ ctx->dev = vlan->real_dev;
+ if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan))
+ return -ENOSPC;
+
+ ctx->vlan[ctx->num_vlans].id = vlan->vlan_id;
+ ctx->vlan[ctx->num_vlans].proto = vlan->vlan_proto;
+ ctx->num_vlans++;
+
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_MACSEC)
+
+static const struct macsec_ops *vlan_get_macsec_ops(const struct macsec_context *ctx)
+{
+ return vlan_dev_priv(ctx->netdev)->real_dev->macsec_ops;
+}
+
+static int vlan_macsec_offload(int (* const func)(struct macsec_context *),
+ struct macsec_context *ctx)
+{
+ if (unlikely(!func))
+ return 0;
+
+ return (*func)(ctx);
+}
+
+static int vlan_macsec_dev_open(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_dev_open, ctx);
+}
+
+static int vlan_macsec_dev_stop(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_dev_stop, ctx);
+}
+
+static int vlan_macsec_add_secy(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_add_secy, ctx);
+}
+
+static int vlan_macsec_upd_secy(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_upd_secy, ctx);
+}
+
+static int vlan_macsec_del_secy(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_del_secy, ctx);
+}
+
+static int vlan_macsec_add_rxsc(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_add_rxsc, ctx);
+}
+
+static int vlan_macsec_upd_rxsc(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_upd_rxsc, ctx);
+}
+
+static int vlan_macsec_del_rxsc(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_del_rxsc, ctx);
+}
+
+static int vlan_macsec_add_rxsa(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_add_rxsa, ctx);
+}
+
+static int vlan_macsec_upd_rxsa(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_upd_rxsa, ctx);
+}
+
+static int vlan_macsec_del_rxsa(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_del_rxsa, ctx);
+}
+
+static int vlan_macsec_add_txsa(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_add_txsa, ctx);
+}
+
+static int vlan_macsec_upd_txsa(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_upd_txsa, ctx);
+}
+
+static int vlan_macsec_del_txsa(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_del_txsa, ctx);
+}
+
+static int vlan_macsec_get_dev_stats(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_get_dev_stats, ctx);
+}
+
+static int vlan_macsec_get_tx_sc_stats(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
- return real_dev->ifindex;
+ return vlan_macsec_offload(ops->mdo_get_tx_sc_stats, ctx);
}
+static int vlan_macsec_get_tx_sa_stats(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_get_tx_sa_stats, ctx);
+}
+
+static int vlan_macsec_get_rx_sc_stats(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_get_rx_sc_stats, ctx);
+}
+
+static int vlan_macsec_get_rx_sa_stats(struct macsec_context *ctx)
+{
+ const struct macsec_ops *ops = vlan_get_macsec_ops(ctx);
+
+ if (!ops)
+ return -EOPNOTSUPP;
+
+ return vlan_macsec_offload(ops->mdo_get_rx_sa_stats, ctx);
+}
+
+static const struct macsec_ops macsec_offload_ops = {
+ /* Device wide */
+ .mdo_dev_open = vlan_macsec_dev_open,
+ .mdo_dev_stop = vlan_macsec_dev_stop,
+ /* SecY */
+ .mdo_add_secy = vlan_macsec_add_secy,
+ .mdo_upd_secy = vlan_macsec_upd_secy,
+ .mdo_del_secy = vlan_macsec_del_secy,
+ /* Security channels */
+ .mdo_add_rxsc = vlan_macsec_add_rxsc,
+ .mdo_upd_rxsc = vlan_macsec_upd_rxsc,
+ .mdo_del_rxsc = vlan_macsec_del_rxsc,
+ /* Security associations */
+ .mdo_add_rxsa = vlan_macsec_add_rxsa,
+ .mdo_upd_rxsa = vlan_macsec_upd_rxsa,
+ .mdo_del_rxsa = vlan_macsec_del_rxsa,
+ .mdo_add_txsa = vlan_macsec_add_txsa,
+ .mdo_upd_txsa = vlan_macsec_upd_txsa,
+ .mdo_del_txsa = vlan_macsec_del_txsa,
+ /* Statistics */
+ .mdo_get_dev_stats = vlan_macsec_get_dev_stats,
+ .mdo_get_tx_sc_stats = vlan_macsec_get_tx_sc_stats,
+ .mdo_get_tx_sa_stats = vlan_macsec_get_tx_sa_stats,
+ .mdo_get_rx_sc_stats = vlan_macsec_get_rx_sc_stats,
+ .mdo_get_rx_sa_stats = vlan_macsec_get_rx_sa_stats,
+};
+
+#endif
+
static const struct ethtool_ops vlan_ethtool_ops = {
.get_link_ksettings = vlan_ethtool_get_link_ksettings,
.get_drvinfo = vlan_ethtool_get_drvinfo,
@@ -786,7 +1018,7 @@ static const struct net_device_ops vlan_netdev_ops = {
.ndo_set_mac_address = vlan_dev_set_mac_address,
.ndo_set_rx_mode = vlan_dev_set_rx_mode,
.ndo_change_rx_flags = vlan_dev_change_rx_flags,
- .ndo_do_ioctl = vlan_dev_ioctl,
+ .ndo_eth_ioctl = vlan_dev_ioctl,
.ndo_neigh_setup = vlan_dev_neigh_setup,
.ndo_get_stats64 = vlan_dev_get_stats64,
#if IS_ENABLED(CONFIG_FCOE)
@@ -794,17 +1026,21 @@ static const struct net_device_ops vlan_netdev_ops = {
.ndo_fcoe_ddp_done = vlan_dev_fcoe_ddp_done,
.ndo_fcoe_enable = vlan_dev_fcoe_enable,
.ndo_fcoe_disable = vlan_dev_fcoe_disable,
- .ndo_fcoe_get_wwn = vlan_dev_fcoe_get_wwn,
.ndo_fcoe_ddp_target = vlan_dev_fcoe_ddp_target,
#endif
+#ifdef NETDEV_FCOE_WWNN
+ .ndo_fcoe_get_wwn = vlan_dev_fcoe_get_wwn,
+#endif
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = vlan_dev_poll_controller,
.ndo_netpoll_setup = vlan_dev_netpoll_setup,
.ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup,
#endif
.ndo_fix_features = vlan_dev_fix_features,
- .ndo_get_lock_subclass = vlan_dev_get_lock_subclass,
.ndo_get_iflink = vlan_dev_get_iflink,
+ .ndo_fill_forward_path = vlan_dev_fill_forward_path,
+ .ndo_hwtstamp_get = vlan_hwtstamp_get,
+ .ndo_hwtstamp_set = vlan_hwtstamp_set,
};
static void vlan_dev_free(struct net_device *dev)
@@ -813,6 +1049,9 @@ static void vlan_dev_free(struct net_device *dev)
free_percpu(vlan->vlan_pcpu_stats);
vlan->vlan_pcpu_stats = NULL;
+
+ /* Get rid of the vlan's reference to real_dev */
+ netdev_put(vlan->real_dev, &vlan->dev_tracker);
}
void vlan_setup(struct net_device *dev)
@@ -829,6 +1068,9 @@ void vlan_setup(struct net_device *dev)
dev->priv_destructor = vlan_dev_free;
dev->ethtool_ops = &vlan_ethtool_ops;
+#if IS_ENABLED(CONFIG_MACSEC)
+ dev->macsec_ops = &macsec_offload_ops;
+#endif
dev->min_mtu = 0;
dev->max_mtu = ETH_MAX_MTU;
diff --git a/net/8021q/vlan_gvrp.c b/net/8021q/vlan_gvrp.c
index 66a80320b032..6b34b72aa466 100644
--- a/net/8021q/vlan_gvrp.c
+++ b/net/8021q/vlan_gvrp.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IEEE 802.1Q GARP VLAN Registration Protocol (GVRP)
*
* Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/types.h>
#include <linux/if_vlan.h>
diff --git a/net/8021q/vlan_mvrp.c b/net/8021q/vlan_mvrp.c
index e0fe091801b0..689eceeaa360 100644
--- a/net/8021q/vlan_mvrp.c
+++ b/net/8021q/vlan_mvrp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IEEE 802.1Q Multiple VLAN Registration Protocol (MVRP)
*
@@ -5,10 +6,6 @@
*
* Adapted from code in net/8021q/vlan_gvrp.c
* Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/types.h>
#include <linux/if_ether.h>
diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index 9b60c1e399e2..a000b1ef0520 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* VLAN netlink control interface
*
* Copyright (c) 2007 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -35,8 +32,8 @@ static inline int vlan_validate_qos_map(struct nlattr *attr)
{
if (!attr)
return 0;
- return nla_validate_nested(attr, IFLA_VLAN_QOS_MAX, vlan_map_policy,
- NULL);
+ return nla_validate_nested_deprecated(attr, IFLA_VLAN_QOS_MAX,
+ vlan_map_policy, NULL);
}
static int vlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -84,7 +81,8 @@ static int vlan_validate(struct nlattr *tb[], struct nlattr *data[],
flags = nla_data(data[IFLA_VLAN_FLAGS]);
if ((flags->flags & flags->mask) &
~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP |
- VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP)) {
+ VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP |
+ VLAN_FLAG_BRIDGE_BINDING)) {
NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN flags");
return -EINVAL;
}
@@ -110,32 +108,41 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[],
struct ifla_vlan_flags *flags;
struct ifla_vlan_qos_mapping *m;
struct nlattr *attr;
- int rem;
+ int rem, err;
if (data[IFLA_VLAN_FLAGS]) {
flags = nla_data(data[IFLA_VLAN_FLAGS]);
- vlan_dev_change_flags(dev, flags->flags, flags->mask);
+ err = vlan_dev_change_flags(dev, flags->flags, flags->mask);
+ if (err)
+ return err;
}
if (data[IFLA_VLAN_INGRESS_QOS]) {
- nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) {
+ nla_for_each_nested_type(attr, IFLA_VLAN_QOS_MAPPING,
+ data[IFLA_VLAN_INGRESS_QOS], rem) {
m = nla_data(attr);
vlan_dev_set_ingress_priority(dev, m->to, m->from);
}
}
if (data[IFLA_VLAN_EGRESS_QOS]) {
- nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) {
+ nla_for_each_nested_type(attr, IFLA_VLAN_QOS_MAPPING,
+ data[IFLA_VLAN_EGRESS_QOS], rem) {
m = nla_data(attr);
- vlan_dev_set_egress_priority(dev, m->from, m->to);
+ err = vlan_dev_set_egress_priority(dev, m->from, m->to);
+ if (err)
+ return err;
}
}
return 0;
}
-static int vlan_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
+static int vlan_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
+ struct net *link_net = rtnl_newlink_link_net(params);
struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
struct net_device *real_dev;
unsigned int max_mtu;
__be16 proto;
@@ -151,16 +158,14 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
return -EINVAL;
}
- real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
+ real_dev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK]));
if (!real_dev) {
NL_SET_ERR_MSG_MOD(extack, "link does not exist");
return -ENODEV;
}
- if (data[IFLA_VLAN_PROTOCOL])
- proto = nla_get_be16(data[IFLA_VLAN_PROTOCOL]);
- else
- proto = htons(ETH_P_8021Q);
+ proto = nla_get_be16_default(data[IFLA_VLAN_PROTOCOL],
+ htons(ETH_P_8021Q));
vlan->vlan_proto = proto;
vlan->vlan_id = nla_get_u16(data[IFLA_VLAN_ID]);
@@ -180,11 +185,17 @@ static int vlan_newlink(struct net *src_net, struct net_device *dev,
else if (dev->mtu > max_mtu)
return -EINVAL;
+ /* Note: If this initial vlan_changelink() fails, we need
+ * to call vlan_dev_free_egress_priority() to free memory.
+ */
err = vlan_changelink(dev, tb, data, extack);
- if (err < 0)
- return err;
- return register_vlan_dev(dev, extack);
+ if (!err)
+ err = register_vlan_dev(dev, extack);
+
+ if (err)
+ vlan_dev_free_egress_priority(dev);
+ return err;
}
static inline size_t vlan_qos_map_size(unsigned int n)
@@ -226,7 +237,7 @@ static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
goto nla_put_failure;
}
if (vlan->nr_ingress_mappings) {
- nest = nla_nest_start(skb, IFLA_VLAN_INGRESS_QOS);
+ nest = nla_nest_start_noflag(skb, IFLA_VLAN_INGRESS_QOS);
if (nest == NULL)
goto nla_put_failure;
@@ -244,7 +255,7 @@ static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
}
if (vlan->nr_egress_mappings) {
- nest = nla_nest_start(skb, IFLA_VLAN_EGRESS_QOS);
+ nest = nla_nest_start_noflag(skb, IFLA_VLAN_EGRESS_QOS);
if (nest == NULL)
goto nla_put_failure;
diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
index d36e8c4b7f56..fa67374bda49 100644
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/******************************************************************************
* vlanproc.c VLAN Module. /proc filesystem interface.
*
@@ -9,10 +10,6 @@
*
* Copyright: (c) 1998 Ben Greear
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
* ============================================================================
* Jan 20, 1998 Ben Greear Initial Version
*****************************************************************************/
@@ -166,48 +163,34 @@ void vlan_proc_rem_dev(struct net_device *vlandev)
* The following few functions build the content of /proc/net/vlan/config
*/
-/* start read of /proc/net/vlan/config */
-static void *vlan_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(rcu)
+static void *vlan_seq_from_index(struct seq_file *seq, loff_t *pos)
{
+ unsigned long ifindex = *pos;
struct net_device *dev;
- struct net *net = seq_file_net(seq);
- loff_t i = 1;
-
- rcu_read_lock();
- if (*pos == 0)
- return SEQ_START_TOKEN;
- for_each_netdev_rcu(net, dev) {
+ for_each_netdev_dump(seq_file_net(seq), dev, ifindex) {
if (!is_vlan_dev(dev))
continue;
-
- if (i++ == *pos)
- return dev;
+ *pos = dev->ifindex;
+ return dev;
}
+ return NULL;
+}
+
+static void *vlan_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ if (*pos == 0)
+ return SEQ_START_TOKEN;
- return NULL;
+ return vlan_seq_from_index(seq, pos);
}
static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct net_device *dev;
- struct net *net = seq_file_net(seq);
-
++*pos;
-
- dev = v;
- if (v == SEQ_START_TOKEN)
- dev = net_device_entry(&net->dev_base_head);
-
- for_each_netdev_continue_rcu(net, dev) {
- if (!is_vlan_dev(dev))
- continue;
-
- return dev;
- }
-
- return NULL;
+ return vlan_seq_from_index(seq, pos);
}
static void vlan_seq_stop(struct seq_file *seq, void *v)
@@ -255,9 +238,9 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset)
stats = dev_get_stats(vlandev, &temp);
seq_printf(seq,
- "%s VID: %d REORDER_HDR: %i dev->priv_flags: %hx\n",
+ "%s VID: %d REORDER_HDR: %i dev->priv_flags: %x\n",
vlandev->name, vlan->vlan_id,
- (int)(vlan->flags & 1), vlandev->priv_flags);
+ (int)(vlan->flags & 1), (u32)vlandev->priv_flags);
seq_printf(seq, fmt64, "total frames received", stats->rx_packets);
seq_printf(seq, fmt64, "total bytes received", stats->rx_bytes);
@@ -283,7 +266,7 @@ static int vlandev_seq_show(struct seq_file *seq, void *offset)
const struct vlan_priority_tci_mapping *mp
= vlan->egress_priority_map[i];
while (mp) {
- seq_printf(seq, "%u:%hu ",
+ seq_printf(seq, "%u:%d ",
mp->priority, ((mp->vlan_qos >> 13) & 0x7));
mp = mp->next;
}
diff --git a/net/9p/Kconfig b/net/9p/Kconfig
index e6014e0e51f7..22f8c167845d 100644
--- a/net/9p/Kconfig
+++ b/net/9p/Kconfig
@@ -1,10 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# 9P protocol configuration
#
menuconfig NET_9P
- depends on NET
tristate "Plan 9 Resource Sharing Support (9P2000)"
+ select NETFS_SUPPORT
help
If you say Y here, you will get experimental support for
Plan 9 resource sharing via the 9P2000 protocol.
@@ -15,6 +16,15 @@ menuconfig NET_9P
if NET_9P
+config NET_9P_FD
+ default NET_9P
+ imply INET
+ imply UNIX
+ tristate "9P FD Transport"
+ help
+ This builds support for transports over TCP, Unix sockets and
+ filedescriptors.
+
config NET_9P_VIRTIO
depends on VIRTIO
tristate "9P Virtio Transport"
@@ -30,6 +40,14 @@ config NET_9P_XEN
This builds support for a transport for 9pfs between
two Xen domains.
+config NET_9P_USBG
+ tristate "9P USB Gadget Transport"
+ depends on USB_GADGET
+ select CONFIGFS_FS
+ select USB_LIBCOMPOSITE
+ help
+ This builds support for a transport for 9pfs over
+ usb gadget.
config NET_9P_RDMA
depends on INET && INFINIBAND && INFINIBAND_ADDR_TRANS
diff --git a/net/9p/Makefile b/net/9p/Makefile
index c0486cfc85d9..22794a451c3f 100644
--- a/net/9p/Makefile
+++ b/net/9p/Makefile
@@ -1,18 +1,21 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_NET_9P) := 9pnet.o
+obj-$(CONFIG_NET_9P_FD) += 9pnet_fd.o
obj-$(CONFIG_NET_9P_XEN) += 9pnet_xen.o
obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o
obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o
+obj-$(CONFIG_NET_9P_USBG) += 9pnet_usbg.o
9pnet-objs := \
mod.o \
client.o \
error.o \
- util.o \
protocol.o \
- trans_fd.o \
trans_common.o \
+9pnet_fd-objs := \
+ trans_fd.o \
+
9pnet_virtio-objs := \
trans_virtio.o \
@@ -21,3 +24,6 @@ obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o
9pnet_rdma-objs := \
trans_rdma.o \
+
+9pnet_usbg-objs := \
+ trans_usbg.o \
diff --git a/net/9p/client.c b/net/9p/client.c
index deae53a7dffc..f60d1d041adb 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1,26 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * net/9p/clnt.c
- *
* 9P Client
*
* Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
* Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to:
- * Free Software Foundation
- * 51 Franklin Street, Fifth Floor
- * Boston, MA 02111-1301 USA
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -35,9 +18,10 @@
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/uio.h>
+#include <linux/netfs.h>
#include <net/9p/9p.h>
-#include <linux/parser.h>
#include <linux/seq_file.h>
+#include <linux/fs_context.h>
#include <net/9p/client.h>
#include <net/9p/transport.h>
#include "protocol.h"
@@ -45,26 +29,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/9p.h>
-/*
- * Client Option Parsing (code inspired by NFS code)
- * - a little lazy - parse all client options
- */
-
-enum {
- Opt_msize,
- Opt_trans,
- Opt_legacy,
- Opt_version,
- Opt_err,
-};
-
-static const match_table_t tokens = {
- {Opt_msize, "msize=%u"},
- {Opt_legacy, "noextend"},
- {Opt_trans, "trans=%s"},
- {Opt_version, "version=%s"},
- {Opt_err, NULL},
-};
+/* Client Option Parsing (code inspired by NFS code)
+ * - a little lazy - parse all client options
+ */
inline int p9_is_proto_dotl(struct p9_client *clnt)
{
@@ -80,7 +47,7 @@ EXPORT_SYMBOL(p9_is_proto_dotu);
int p9_show_client_options(struct seq_file *m, struct p9_client *clnt)
{
- if (clnt->msize != 8192)
+ if (clnt->msize != DEFAULT_MSIZE)
seq_printf(m, ",msize=%u", clnt->msize);
seq_printf(m, ",trans=%s", clnt->trans_mod->name);
@@ -102,273 +69,226 @@ int p9_show_client_options(struct seq_file *m, struct p9_client *clnt)
}
EXPORT_SYMBOL(p9_show_client_options);
-/*
- * Some error codes are taken directly from the server replies,
+/* Some error codes are taken directly from the server replies,
* make sure they are valid.
*/
static int safe_errno(int err)
{
- if ((err > 0) || (err < -MAX_ERRNO)) {
+ if (err > 0 || err < -MAX_ERRNO) {
p9_debug(P9_DEBUG_ERROR, "Invalid error code %d\n", err);
return -EPROTO;
}
return err;
}
-
-/* Interpret mount option for protocol version */
-static int get_protocol_version(char *s)
-{
- int version = -EINVAL;
-
- if (!strcmp(s, "9p2000")) {
- version = p9_proto_legacy;
- p9_debug(P9_DEBUG_9P, "Protocol version: Legacy\n");
- } else if (!strcmp(s, "9p2000.u")) {
- version = p9_proto_2000u;
- p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.u\n");
- } else if (!strcmp(s, "9p2000.L")) {
- version = p9_proto_2000L;
- p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.L\n");
- } else
- pr_info("Unknown protocol version %s\n", s);
-
- return version;
-}
-
-/**
- * parse_options - parse mount options into client structure
- * @opts: options string passed from mount
- * @clnt: existing v9fs client information
- *
- * Return 0 upon success, -ERRNO upon failure
- */
-
-static int parse_opts(char *opts, struct p9_client *clnt)
+static int apply_client_options(struct p9_client *clnt, struct fs_context *fc)
{
- char *options, *tmp_options;
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- char *s;
- int ret = 0;
+ struct v9fs_context *ctx = fc->fs_private;
- clnt->proto_version = p9_proto_2000L;
- clnt->msize = 8192;
+ clnt->msize = ctx->client_opts.msize;
+ clnt->trans_mod = ctx->client_opts.trans_mod;
+ ctx->client_opts.trans_mod = NULL;
+ clnt->proto_version = ctx->client_opts.proto_version;
- if (!opts)
- return 0;
+ return 0;
+}
- tmp_options = kstrdup(opts, GFP_KERNEL);
- if (!tmp_options) {
- p9_debug(P9_DEBUG_ERROR,
- "failed to allocate copy of option string\n");
- return -ENOMEM;
- }
- options = tmp_options;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token, r;
- if (!*p)
- continue;
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_msize:
- r = match_int(&args[0], &option);
- if (r < 0) {
- p9_debug(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
- ret = r;
- continue;
- }
- clnt->msize = option;
- break;
- case Opt_trans:
- s = match_strdup(&args[0]);
- if (!s) {
- ret = -ENOMEM;
- p9_debug(P9_DEBUG_ERROR,
- "problem allocating copy of trans arg\n");
- goto free_and_return;
- }
-
- v9fs_put_trans(clnt->trans_mod);
- clnt->trans_mod = v9fs_get_trans_by_name(s);
- if (clnt->trans_mod == NULL) {
- pr_info("Could not find request transport: %s\n",
- s);
- ret = -EINVAL;
- }
- kfree(s);
- break;
- case Opt_legacy:
- clnt->proto_version = p9_proto_legacy;
- break;
- case Opt_version:
- s = match_strdup(&args[0]);
- if (!s) {
- ret = -ENOMEM;
- p9_debug(P9_DEBUG_ERROR,
- "problem allocating copy of version arg\n");
- goto free_and_return;
- }
- r = get_protocol_version(s);
- if (r < 0)
- ret = r;
- else
- clnt->proto_version = r;
- kfree(s);
- break;
- default:
- continue;
+static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc,
+ int alloc_msize)
+{
+ if (likely(c->fcall_cache) && alloc_msize == c->msize) {
+ fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS);
+ fc->cache = c->fcall_cache;
+ if (!fc->sdata && c->trans_mod->supports_vmalloc) {
+ fc->sdata = kvmalloc(alloc_msize, GFP_NOFS);
+ fc->cache = NULL;
}
+ } else {
+ if (c->trans_mod->supports_vmalloc)
+ fc->sdata = kvmalloc(alloc_msize, GFP_NOFS);
+ else
+ fc->sdata = kmalloc(alloc_msize, GFP_NOFS);
+ fc->cache = NULL;
}
-
-free_and_return:
- if (ret)
- v9fs_put_trans(clnt->trans_mod);
- kfree(tmp_options);
- return ret;
+ if (!fc->sdata)
+ return -ENOMEM;
+ fc->capacity = alloc_msize;
+ fc->id = 0;
+ fc->tag = P9_NOTAG;
+ return 0;
}
-static struct p9_fcall *p9_fcall_alloc(int alloc_msize)
+void p9_fcall_fini(struct p9_fcall *fc)
{
- struct p9_fcall *fc;
- fc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, GFP_NOFS);
- if (!fc)
- return NULL;
- fc->capacity = alloc_msize;
- fc->sdata = (char *) fc + sizeof(struct p9_fcall);
- return fc;
+ /* sdata can be NULL for interrupted requests in trans_rdma,
+ * and kmem_cache_free does not do NULL-check for us
+ */
+ if (unlikely(!fc->sdata))
+ return;
+
+ if (fc->cache)
+ kmem_cache_free(fc->cache, fc->sdata);
+ else
+ kvfree(fc->sdata);
}
+EXPORT_SYMBOL(p9_fcall_fini);
+
+static struct kmem_cache *p9_req_cache;
/**
- * p9_tag_alloc - lookup/allocate a request by tag
- * @c: client session to lookup tag within
- * @tag: numeric id for transaction
- *
- * this is a simple array lookup, but will grow the
- * request_slots as necessary to accommodate transaction
- * ids which did not previously have a slot.
- *
- * this code relies on the client spinlock to manage locks, its
- * possible we should switch to something else, but I'd rather
- * stick with something low-overhead for the common case.
+ * p9_tag_alloc - Allocate a new request.
+ * @c: Client session.
+ * @type: Transaction type.
+ * @t_size: Buffer size for holding this request
+ * (automatic calculation by format template if 0).
+ * @r_size: Buffer size for holding server's reply on this request
+ * (automatic calculation by format template if 0).
+ * @fmt: Format template for assembling 9p request message
+ * (see p9pdu_vwritef).
+ * @ap: Variable arguments to be fed to passed format template
+ * (see p9pdu_vwritef).
*
+ * Context: Process context.
+ * Return: Pointer to new request.
*/
-
static struct p9_req_t *
-p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size)
+p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size,
+ const char *fmt, va_list ap)
{
- unsigned long flags;
- int row, col;
- struct p9_req_t *req;
- int alloc_msize = min(c->msize, max_size);
-
- /* This looks up the original request by tag so we know which
- * buffer to read the data into */
- tag++;
-
- if (tag >= c->max_tag) {
- spin_lock_irqsave(&c->lock, flags);
- /* check again since original check was outside of lock */
- while (tag >= c->max_tag) {
- row = (tag / P9_ROW_MAXTAG);
- c->reqs[row] = kcalloc(P9_ROW_MAXTAG,
- sizeof(struct p9_req_t), GFP_ATOMIC);
-
- if (!c->reqs[row]) {
- pr_err("Couldn't grow tag array\n");
- spin_unlock_irqrestore(&c->lock, flags);
- return ERR_PTR(-ENOMEM);
- }
- for (col = 0; col < P9_ROW_MAXTAG; col++) {
- req = &c->reqs[row][col];
- req->status = REQ_STATUS_IDLE;
- init_waitqueue_head(&req->wq);
- }
- c->max_tag += P9_ROW_MAXTAG;
- }
- spin_unlock_irqrestore(&c->lock, flags);
- }
- row = tag / P9_ROW_MAXTAG;
- col = tag % P9_ROW_MAXTAG;
+ struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS);
+ int alloc_tsize;
+ int alloc_rsize;
+ int tag;
+ va_list apc;
- req = &c->reqs[row][col];
- if (!req->tc)
- req->tc = p9_fcall_alloc(alloc_msize);
- if (!req->rc)
- req->rc = p9_fcall_alloc(alloc_msize);
- if (!req->tc || !req->rc)
- goto grow_failed;
+ va_copy(apc, ap);
+ alloc_tsize = min_t(size_t, c->msize,
+ t_size ?: p9_msg_buf_size(c, type, fmt, apc));
+ va_end(apc);
+
+ alloc_rsize = min_t(size_t, c->msize,
+ r_size ?: p9_msg_buf_size(c, type + 1, fmt, ap));
+
+ if (!req)
+ return ERR_PTR(-ENOMEM);
- p9pdu_reset(req->tc);
- p9pdu_reset(req->rc);
+ if (p9_fcall_init(c, &req->tc, alloc_tsize))
+ goto free_req;
+ if (p9_fcall_init(c, &req->rc, alloc_rsize))
+ goto free;
- req->tc->tag = tag-1;
+ p9pdu_reset(&req->tc);
+ p9pdu_reset(&req->rc);
+ req->t_err = 0;
req->status = REQ_STATUS_ALLOC;
+ /* refcount needs to be set to 0 before inserting into the idr
+ * so p9_tag_lookup does not accept a request that is not fully
+ * initialized. refcount_set to 2 below will mark request ready.
+ */
+ refcount_set(&req->refcount, 0);
+ init_waitqueue_head(&req->wq);
+ INIT_LIST_HEAD(&req->req_list);
+
+ idr_preload(GFP_NOFS);
+ spin_lock_irq(&c->lock);
+ if (type == P9_TVERSION)
+ tag = idr_alloc(&c->reqs, req, P9_NOTAG, P9_NOTAG + 1,
+ GFP_NOWAIT);
+ else
+ tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT);
+ req->tc.tag = tag;
+ spin_unlock_irq(&c->lock);
+ idr_preload_end();
+ if (tag < 0)
+ goto free;
+
+ /* Init ref to two because in the general case there is one ref
+ * that is put asynchronously by a writer thread, one ref
+ * temporarily given by p9_tag_lookup and put by p9_client_cb
+ * in the recv thread, and one ref put by p9_req_put in the
+ * main thread. The only exception is virtio that does not use
+ * p9_tag_lookup but does not have a writer thread either
+ * (the write happens synchronously in the request/zc_request
+ * callback), so p9_client_cb eats the second ref there
+ * as the pointer is duplicated directly by virtqueue_add_sgs()
+ */
+ refcount_set(&req->refcount, 2);
return req;
-grow_failed:
- pr_err("Couldn't grow tag array\n");
- kfree(req->tc);
- kfree(req->rc);
- req->tc = req->rc = NULL;
+free:
+ p9_fcall_fini(&req->tc);
+ p9_fcall_fini(&req->rc);
+free_req:
+ kmem_cache_free(p9_req_cache, req);
return ERR_PTR(-ENOMEM);
}
/**
- * p9_tag_lookup - lookup a request by tag
- * @c: client session to lookup tag within
- * @tag: numeric id for transaction
+ * p9_tag_lookup - Look up a request by tag.
+ * @c: Client session.
+ * @tag: Transaction ID.
*
+ * Context: Any context.
+ * Return: A request, or %NULL if there is no request with that tag.
*/
-
struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag)
{
- int row, col;
-
- /* This looks up the original request by tag so we know which
- * buffer to read the data into */
- tag++;
-
- if (tag >= c->max_tag)
- return NULL;
+ struct p9_req_t *req;
- row = tag / P9_ROW_MAXTAG;
- col = tag % P9_ROW_MAXTAG;
+ rcu_read_lock();
+again:
+ req = idr_find(&c->reqs, tag);
+ if (req) {
+ /* We have to be careful with the req found under rcu_read_lock
+ * Thanks to SLAB_TYPESAFE_BY_RCU we can safely try to get the
+ * ref again without corrupting other data, then check again
+ * that the tag matches once we have the ref
+ */
+ if (!p9_req_try_get(req))
+ goto again;
+ if (req->tc.tag != tag) {
+ p9_req_put(c, req);
+ goto again;
+ }
+ }
+ rcu_read_unlock();
- return &c->reqs[row][col];
+ return req;
}
EXPORT_SYMBOL(p9_tag_lookup);
/**
- * p9_tag_init - setup tags structure and contents
- * @c: v9fs client struct
- *
- * This initializes the tags structure for each client instance.
+ * p9_tag_remove - Remove a tag.
+ * @c: Client session.
+ * @r: Request of reference.
*
+ * Context: Any context.
*/
+static void p9_tag_remove(struct p9_client *c, struct p9_req_t *r)
+{
+ unsigned long flags;
+ u16 tag = r->tc.tag;
-static int p9_tag_init(struct p9_client *c)
+ p9_debug(P9_DEBUG_MUX, "freeing clnt %p req %p tag: %d\n", c, r, tag);
+ spin_lock_irqsave(&c->lock, flags);
+ idr_remove(&c->reqs, tag);
+ spin_unlock_irqrestore(&c->lock, flags);
+}
+
+int p9_req_put(struct p9_client *c, struct p9_req_t *r)
{
- int err = 0;
+ if (refcount_dec_and_test(&r->refcount)) {
+ p9_tag_remove(c, r);
- c->tagpool = p9_idpool_create();
- if (IS_ERR(c->tagpool)) {
- err = PTR_ERR(c->tagpool);
- goto error;
+ p9_fcall_fini(&r->tc);
+ p9_fcall_fini(&r->rc);
+ kmem_cache_free(p9_req_cache, r);
+ return 1;
}
- err = p9_idpool_get(c->tagpool); /* reserve tag 0 */
- if (err < 0) {
- p9_idpool_destroy(c->tagpool);
- goto error;
- }
- c->max_tag = 0;
-error:
- return err;
+ return 0;
}
+EXPORT_SYMBOL(p9_req_put);
/**
* p9_tag_cleanup - cleans up tags structure and reclaims resources
@@ -379,73 +299,39 @@ error:
*/
static void p9_tag_cleanup(struct p9_client *c)
{
- int row, col;
-
- /* check to insure all requests are idle */
- for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) {
- for (col = 0; col < P9_ROW_MAXTAG; col++) {
- if (c->reqs[row][col].status != REQ_STATUS_IDLE) {
- p9_debug(P9_DEBUG_MUX,
- "Attempting to cleanup non-free tag %d,%d\n",
- row, col);
- /* TODO: delay execution of cleanup */
- return;
- }
- }
- }
-
- if (c->tagpool) {
- p9_idpool_put(0, c->tagpool); /* free reserved tag 0 */
- p9_idpool_destroy(c->tagpool);
- }
+ struct p9_req_t *req;
+ int id;
- /* free requests associated with tags */
- for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) {
- for (col = 0; col < P9_ROW_MAXTAG; col++) {
- kfree(c->reqs[row][col].tc);
- kfree(c->reqs[row][col].rc);
- }
- kfree(c->reqs[row]);
+ rcu_read_lock();
+ idr_for_each_entry(&c->reqs, req, id) {
+ pr_info("Tag %d still in use\n", id);
+ if (p9_req_put(c, req) == 0)
+ pr_warn("Packet with tag %d has still references",
+ req->tc.tag);
}
- c->max_tag = 0;
-}
-
-/**
- * p9_free_req - free a request and clean-up as necessary
- * c: client state
- * r: request to release
- *
- */
-
-static void p9_free_req(struct p9_client *c, struct p9_req_t *r)
-{
- int tag = r->tc->tag;
- p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag);
-
- r->status = REQ_STATUS_IDLE;
- if (tag != P9_NOTAG && p9_idpool_check(tag, c->tagpool))
- p9_idpool_put(tag, c->tagpool);
+ rcu_read_unlock();
}
/**
* p9_client_cb - call back from transport to client
- * c: client state
- * req: request received
+ * @c: client state
+ * @req: request received
+ * @status: request status, one of REQ_STATUS_*
*
*/
void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status)
{
- p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc->tag);
+ p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc.tag);
- /*
- * This barrier is needed to make sure any change made to req before
+ /* This barrier is needed to make sure any change made to req before
* the status change is visible to another thread
*/
smp_wmb();
- req->status = status;
+ WRITE_ONCE(req->status, status);
wake_up(&req->wq);
- p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag);
+ p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag);
+ p9_req_put(c, req);
}
EXPORT_SYMBOL(p9_client_cb);
@@ -459,12 +345,12 @@ EXPORT_SYMBOL(p9_client_cb);
*/
int
-p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type, int16_t *tag,
- int rewind)
+p9_parse_header(struct p9_fcall *pdu, int32_t *size, int8_t *type,
+ int16_t *tag, int rewind)
{
- int8_t r_type;
- int16_t r_tag;
- int32_t r_size;
+ s8 r_type;
+ s16 r_tag;
+ s32 r_size;
int offset = pdu->offset;
int err;
@@ -512,22 +398,20 @@ EXPORT_SYMBOL(p9_parse_header);
static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
{
- int8_t type;
+ s8 type;
int err;
int ecode;
- err = p9_parse_header(req->rc, NULL, &type, NULL, 0);
- if (req->rc->size >= c->msize) {
- p9_debug(P9_DEBUG_ERROR,
- "requested packet size too big: %d\n",
- req->rc->size);
+ err = p9_parse_header(&req->rc, NULL, &type, NULL, 0);
+ if (req->rc.size > req->rc.capacity && !req->rc.zc) {
+ pr_err("requested packet size too big: %d does not fit %zu (type=%d)\n",
+ req->rc.size, req->rc.capacity, req->rc.id);
return -EIO;
}
- /*
- * dump the response from server
+ /* dump the response from server
* This should be after check errors which poplulate pdu_fcall.
*/
- trace_9p_protocol_dump(c, req->rc);
+ trace_9p_protocol_dump(c, &req->rc);
if (err) {
p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err);
return err;
@@ -536,11 +420,14 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
return 0;
if (!p9_is_proto_dotl(c)) {
- char *ename;
- err = p9pdu_readf(req->rc, c->proto_version, "s?d",
+ char *ename = NULL;
+
+ err = p9pdu_readf(&req->rc, c->proto_version, "s?d",
&ename, &ecode);
- if (err)
+ if (err) {
+ kfree(ename);
goto out_err;
+ }
if (p9_is_proto_dotu(c) && ecode < 512)
err = -ecode;
@@ -553,101 +440,19 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req)
}
kfree(ename);
} else {
- err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode);
- err = -ecode;
-
- p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode);
- }
-
- return err;
-
-out_err:
- p9_debug(P9_DEBUG_ERROR, "couldn't parse error%d\n", err);
-
- return err;
-}
-
-/**
- * p9_check_zc_errors - check 9p packet for error return and process it
- * @c: current client instance
- * @req: request to parse and check for error conditions
- * @in_hdrlen: Size of response protocol buffer.
- *
- * returns error code if one is discovered, otherwise returns 0
- *
- * this will have to be more complicated if we have multiple
- * error packet types
- */
-
-static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req,
- struct iov_iter *uidata, int in_hdrlen)
-{
- int err;
- int ecode;
- int8_t type;
- char *ename = NULL;
-
- err = p9_parse_header(req->rc, NULL, &type, NULL, 0);
- /*
- * dump the response from server
- * This should be after parse_header which poplulate pdu_fcall.
- */
- trace_9p_protocol_dump(c, req->rc);
- if (err) {
- p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err);
- return err;
- }
-
- if (type != P9_RERROR && type != P9_RLERROR)
- return 0;
-
- if (!p9_is_proto_dotl(c)) {
- /* Error is reported in string format */
- int len;
- /* 7 = header size for RERROR; */
- int inline_len = in_hdrlen - 7;
-
- len = req->rc->size - req->rc->offset;
- if (len > (P9_ZC_HDR_SZ - 7)) {
- err = -EFAULT;
- goto out_err;
- }
-
- ename = &req->rc->sdata[req->rc->offset];
- if (len > inline_len) {
- /* We have error in external buffer */
- if (!copy_from_iter_full(ename + inline_len,
- len - inline_len, uidata)) {
- err = -EFAULT;
- goto out_err;
- }
- }
- ename = NULL;
- err = p9pdu_readf(req->rc, c->proto_version, "s?d",
- &ename, &ecode);
+ err = p9pdu_readf(&req->rc, c->proto_version, "d", &ecode);
if (err)
goto out_err;
-
- if (p9_is_proto_dotu(c) && ecode < 512)
- err = -ecode;
-
- if (!err) {
- err = p9_errstr2errno(ename, strlen(ename));
-
- p9_debug(P9_DEBUG_9P, "<<< RERROR (%d) %s\n",
- -ecode, ename);
- }
- kfree(ename);
- } else {
- err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode);
err = -ecode;
p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode);
}
+
return err;
out_err:
p9_debug(P9_DEBUG_ERROR, "couldn't parse error%d\n", err);
+
return err;
}
@@ -669,10 +474,10 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...);
static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq)
{
struct p9_req_t *req;
- int16_t oldtag;
+ s16 oldtag;
int err;
- err = p9_parse_header(oldreq->tc, NULL, NULL, &oldtag, 1);
+ err = p9_parse_header(&oldreq->tc, NULL, NULL, &oldtag, 1);
if (err)
return err;
@@ -682,24 +487,25 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq)
if (IS_ERR(req))
return PTR_ERR(req);
- /*
- * if we haven't received a response for oldreq,
+ /* if we haven't received a response for oldreq,
* remove it from the list
*/
- if (oldreq->status == REQ_STATUS_SENT)
+ if (READ_ONCE(oldreq->status) == REQ_STATUS_SENT) {
if (c->trans_mod->cancelled)
c->trans_mod->cancelled(c, oldreq);
+ }
- p9_free_req(c, req);
+ p9_req_put(c, req);
return 0;
}
static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
- int8_t type, int req_size,
+ int8_t type, uint t_size, uint r_size,
const char *fmt, va_list ap)
{
- int tag, err;
+ int err;
struct p9_req_t *req;
+ va_list apc;
p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type);
@@ -708,30 +514,27 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c,
return ERR_PTR(-EIO);
/* if status is begin_disconnected we allow only clunk request */
- if ((c->status == BeginDisconnect) && (type != P9_TCLUNK))
+ if (c->status == BeginDisconnect && type != P9_TCLUNK)
return ERR_PTR(-EIO);
- tag = P9_NOTAG;
- if (type != P9_TVERSION) {
- tag = p9_idpool_get(c->tagpool);
- if (tag < 0)
- return ERR_PTR(-ENOMEM);
- }
-
- req = p9_tag_alloc(c, tag, req_size);
+ va_copy(apc, ap);
+ req = p9_tag_alloc(c, type, t_size, r_size, fmt, apc);
+ va_end(apc);
if (IS_ERR(req))
return req;
/* marshall the data */
- p9pdu_prepare(req->tc, tag, type);
- err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap);
+ p9pdu_prepare(&req->tc, req->tc.tag, type);
+ err = p9pdu_vwritef(&req->tc, c->proto_version, fmt, ap);
if (err)
goto reterr;
- p9pdu_finalize(c, req->tc);
- trace_9p_client_req(c, type, tag);
+ p9pdu_finalize(c, &req->tc);
+ trace_9p_client_req(c, type, req->tc.tag);
return req;
reterr:
- p9_free_req(c, req);
+ p9_req_put(c, req);
+ /* We have to put also the 2nd reference as it won't be used */
+ p9_req_put(c, req);
return ERR_PTR(err);
}
@@ -741,7 +544,7 @@ reterr:
* @type: type of request
* @fmt: protocol format string (see protocol.c)
*
- * Returns request structure (which client must free using p9_free_req)
+ * Returns request structure (which client must free using p9_req_put)
*/
static struct p9_req_t *
@@ -751,47 +554,62 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...)
int sigpending, err;
unsigned long flags;
struct p9_req_t *req;
+ /* Passing zero for tsize/rsize to p9_client_prepare_req() tells it to
+ * auto determine an appropriate (small) request/response size
+ * according to actual message data being sent. Currently RDMA
+ * transport is excluded from this response message size optimization,
+ * as it would not cope with it, due to its pooled response buffers
+ * (using an optimized request size for RDMA as well though).
+ */
+ const uint tsize = 0;
+ const uint rsize = c->trans_mod->pooled_rbuffers ? c->msize : 0;
va_start(ap, fmt);
- req = p9_client_prepare_req(c, type, c->msize, fmt, ap);
+ req = p9_client_prepare_req(c, type, tsize, rsize, fmt, ap);
va_end(ap);
if (IS_ERR(req))
return req;
+ req->tc.zc = false;
+ req->rc.zc = false;
+
if (signal_pending(current)) {
sigpending = 1;
clear_thread_flag(TIF_SIGPENDING);
- } else
+ } else {
sigpending = 0;
+ }
err = c->trans_mod->request(c, req);
if (err < 0) {
+ /* write won't happen */
+ p9_req_put(c, req);
if (err != -ERESTARTSYS && err != -EFAULT)
c->status = Disconnected;
goto recalc_sigpending;
}
again:
/* Wait for the response */
- err = wait_event_killable(req->wq, req->status >= REQ_STATUS_RCVD);
+ err = wait_event_killable(req->wq,
+ READ_ONCE(req->status) >= REQ_STATUS_RCVD);
- /*
- * Make sure our req is coherent with regard to updates in other
+ /* Make sure our req is coherent with regard to updates in other
* threads - echoes to wmb() in the callback
*/
smp_rmb();
- if ((err == -ERESTARTSYS) && (c->status == Connected)
- && (type == P9_TFLUSH)) {
+ if (err == -ERESTARTSYS && c->status == Connected &&
+ type == P9_TFLUSH) {
sigpending = 1;
clear_thread_flag(TIF_SIGPENDING);
goto again;
}
- if (req->status == REQ_STATUS_ERROR) {
+ if (READ_ONCE(req->status) == REQ_STATUS_ERROR) {
p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
err = req->t_err;
}
- if ((err == -ERESTARTSYS) && (c->status == Connected)) {
+ if (err == -ERESTARTSYS && c->status == Connected) {
p9_debug(P9_DEBUG_MUX, "flushing\n");
sigpending = 1;
clear_thread_flag(TIF_SIGPENDING);
@@ -800,7 +618,7 @@ again:
p9_client_flush(c, req);
/* if we received the response anyway, don't signal error */
- if (req->status == REQ_STATUS_RCVD)
+ if (READ_ONCE(req->status) == REQ_STATUS_RCVD)
err = 0;
}
recalc_sigpending:
@@ -813,11 +631,11 @@ recalc_sigpending:
goto reterr;
err = p9_check_errors(c, req);
- trace_9p_client_res(c, type, req->rc->tag, err);
+ trace_9p_client_res(c, type, req->rc.tag, err);
if (!err)
return req;
reterr:
- p9_free_req(c, req);
+ p9_req_put(c, req);
return ERR_PTR(safe_errno(err));
}
@@ -829,10 +647,10 @@ reterr:
* @uodata: source for zero copy write
* @inlen: read buffer size
* @olen: write buffer size
- * @hdrlen: reader header size, This is the size of response protocol data
+ * @in_hdrlen: reader header size, This is the size of response protocol data
* @fmt: protocol format string (see protocol.c)
*
- * Returns request structure (which client must free using p9_free_req)
+ * Returns request structure (which client must free using p9_req_put)
*/
static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
struct iov_iter *uidata,
@@ -846,20 +664,23 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
struct p9_req_t *req;
va_start(ap, fmt);
- /*
- * We allocate a inline protocol data of only 4k bytes.
+ /* We allocate an inline protocol data buffer of only 4k bytes.
* The actual content is passed in zero-copy fashion.
*/
- req = p9_client_prepare_req(c, type, P9_ZC_HDR_SZ, fmt, ap);
+ req = p9_client_prepare_req(c, type, P9_ZC_HDR_SZ, P9_ZC_HDR_SZ, fmt, ap);
va_end(ap);
if (IS_ERR(req))
return req;
+ req->tc.zc = true;
+ req->rc.zc = true;
+
if (signal_pending(current)) {
sigpending = 1;
clear_thread_flag(TIF_SIGPENDING);
- } else
+ } else {
sigpending = 0;
+ }
err = c->trans_mod->zc_request(c, req, uidata, uodata,
inlen, olen, in_hdrlen);
@@ -869,11 +690,11 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
if (err != -ERESTARTSYS)
goto recalc_sigpending;
}
- if (req->status == REQ_STATUS_ERROR) {
+ if (READ_ONCE(req->status) == REQ_STATUS_ERROR) {
p9_debug(P9_DEBUG_ERROR, "req_status error %d\n", req->t_err);
err = req->t_err;
}
- if ((err == -ERESTARTSYS) && (c->status == Connected)) {
+ if (err == -ERESTARTSYS && c->status == Connected) {
p9_debug(P9_DEBUG_MUX, "flushing\n");
sigpending = 1;
clear_thread_flag(TIF_SIGPENDING);
@@ -882,7 +703,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type,
p9_client_flush(c, req);
/* if we received the response anyway, don't signal error */
- if (req->status == REQ_STATUS_RCVD)
+ if (READ_ONCE(req->status) == REQ_STATUS_RCVD)
err = 0;
}
recalc_sigpending:
@@ -894,12 +715,12 @@ recalc_sigpending:
if (err < 0)
goto reterr;
- err = p9_check_zc_errors(c, req, uidata, in_hdrlen);
- trace_9p_client_res(c, type, req->rc->tag, err);
+ err = p9_check_errors(c, req);
+ trace_9p_client_res(c, type, req->rc.tag, err);
if (!err)
return req;
reterr:
- p9_free_req(c, req);
+ p9_req_put(c, req);
return ERR_PTR(safe_errno(err));
}
@@ -909,16 +730,14 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt)
struct p9_fid *fid;
p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt);
- fid = kmalloc(sizeof(struct p9_fid), GFP_KERNEL);
+ fid = kzalloc(sizeof(*fid), GFP_KERNEL);
if (!fid)
return NULL;
- memset(&fid->qid, 0, sizeof(struct p9_qid));
fid->mode = -1;
fid->uid = current_fsuid();
fid->clnt = clnt;
- fid->rdir = NULL;
- fid->fid = 0;
+ refcount_set(&fid->count, 1);
idr_preload(GFP_KERNEL);
spin_lock_irq(&clnt->lock);
@@ -926,9 +745,10 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt)
GFP_NOWAIT);
spin_unlock_irq(&clnt->lock);
idr_preload_end();
-
- if (!ret)
+ if (!ret) {
+ trace_9p_fid_ref(fid, P9_FID_REF_CREATE);
return fid;
+ }
kfree(fid);
return NULL;
@@ -940,6 +760,7 @@ static void p9_fid_destroy(struct p9_fid *fid)
unsigned long flags;
p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid);
+ trace_9p_fid_ref(fid, P9_FID_REF_DESTROY);
clnt = fid->clnt;
spin_lock_irqsave(&clnt->lock, flags);
idr_remove(&clnt->fids, fid->fid);
@@ -948,9 +769,24 @@ static void p9_fid_destroy(struct p9_fid *fid)
kfree(fid);
}
+/* We also need to export tracepoint symbols for tracepoint_enabled() */
+EXPORT_TRACEPOINT_SYMBOL(9p_fid_ref);
+
+void do_trace_9p_fid_get(struct p9_fid *fid)
+{
+ trace_9p_fid_ref(fid, P9_FID_REF_GET);
+}
+EXPORT_SYMBOL(do_trace_9p_fid_get);
+
+void do_trace_9p_fid_put(struct p9_fid *fid)
+{
+ trace_9p_fid_ref(fid, P9_FID_REF_PUT);
+}
+EXPORT_SYMBOL(do_trace_9p_fid_put);
+
static int p9_client_version(struct p9_client *c)
{
- int err = 0;
+ int err;
struct p9_req_t *req;
char *version = NULL;
int msize;
@@ -961,15 +797,15 @@ static int p9_client_version(struct p9_client *c)
switch (c->proto_version) {
case p9_proto_2000L:
req = p9_client_rpc(c, P9_TVERSION, "ds",
- c->msize, "9P2000.L");
+ c->msize, "9P2000.L");
break;
case p9_proto_2000u:
req = p9_client_rpc(c, P9_TVERSION, "ds",
- c->msize, "9P2000.u");
+ c->msize, "9P2000.u");
break;
case p9_proto_legacy:
req = p9_client_rpc(c, P9_TVERSION, "ds",
- c->msize, "9P2000");
+ c->msize, "9P2000");
break;
default:
return -EINVAL;
@@ -978,95 +814,129 @@ static int p9_client_version(struct p9_client *c)
if (IS_ERR(req))
return PTR_ERR(req);
- err = p9pdu_readf(req->rc, c->proto_version, "ds", &msize, &version);
+ err = p9pdu_readf(&req->rc, c->proto_version, "ds", &msize, &version);
if (err) {
p9_debug(P9_DEBUG_9P, "version error %d\n", err);
- trace_9p_protocol_dump(c, req->rc);
+ trace_9p_protocol_dump(c, &req->rc);
goto error;
}
p9_debug(P9_DEBUG_9P, "<<< RVERSION msize %d %s\n", msize, version);
- if (!strncmp(version, "9P2000.L", 8))
+ if (!strncmp(version, "9P2000.L", 8)) {
c->proto_version = p9_proto_2000L;
- else if (!strncmp(version, "9P2000.u", 8))
+ } else if (!strncmp(version, "9P2000.u", 8)) {
c->proto_version = p9_proto_2000u;
- else if (!strncmp(version, "9P2000", 6))
+ } else if (!strncmp(version, "9P2000", 6)) {
c->proto_version = p9_proto_legacy;
- else {
+ } else {
+ p9_debug(P9_DEBUG_ERROR,
+ "server returned an unknown version: %s\n", version);
err = -EREMOTEIO;
goto error;
}
+ if (msize < 4096) {
+ p9_debug(P9_DEBUG_ERROR,
+ "server returned a msize < 4096: %d\n", msize);
+ err = -EREMOTEIO;
+ goto error;
+ }
if (msize < c->msize)
c->msize = msize;
error:
kfree(version);
- p9_free_req(c, req);
+ p9_req_put(c, req);
return err;
}
-struct p9_client *p9_client_create(const char *dev_name, char *options)
+struct p9_client *p9_client_create(struct fs_context *fc)
{
int err;
+ static atomic_t seqno = ATOMIC_INIT(0);
struct p9_client *clnt;
char *client_id;
+ char *cache_name;
- err = 0;
- clnt = kmalloc(sizeof(struct p9_client), GFP_KERNEL);
+ clnt = kmalloc(sizeof(*clnt), GFP_KERNEL);
if (!clnt)
return ERR_PTR(-ENOMEM);
clnt->trans_mod = NULL;
clnt->trans = NULL;
+ clnt->fcall_cache = NULL;
client_id = utsname()->nodename;
memcpy(clnt->name, client_id, strlen(client_id) + 1);
spin_lock_init(&clnt->lock);
idr_init(&clnt->fids);
+ idr_init(&clnt->reqs);
- err = p9_tag_init(clnt);
- if (err < 0)
+ err = apply_client_options(clnt, fc);
+ if (err)
goto free_client;
- err = parse_opts(options, clnt);
- if (err < 0)
- goto destroy_tagpool;
-
if (!clnt->trans_mod)
clnt->trans_mod = v9fs_get_default_trans();
- if (clnt->trans_mod == NULL) {
+ if (!clnt->trans_mod) {
err = -EPROTONOSUPPORT;
p9_debug(P9_DEBUG_ERROR,
"No transport defined or default transport\n");
- goto destroy_tagpool;
+ goto free_client;
}
p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n",
clnt, clnt->trans_mod, clnt->msize, clnt->proto_version);
- err = clnt->trans_mod->create(clnt, dev_name, options);
+ err = clnt->trans_mod->create(clnt, fc);
if (err)
goto put_trans;
- if (clnt->msize > clnt->trans_mod->maxsize)
+ if (clnt->msize > clnt->trans_mod->maxsize) {
clnt->msize = clnt->trans_mod->maxsize;
+ pr_info("Limiting 'msize' to %d as this is the maximum "
+ "supported by transport %s\n",
+ clnt->msize, clnt->trans_mod->name
+ );
+ }
+
+ if (clnt->msize < 4096) {
+ p9_debug(P9_DEBUG_ERROR,
+ "Please specify a msize of at least 4k\n");
+ err = -EINVAL;
+ goto close_trans;
+ }
err = p9_client_version(clnt);
if (err)
goto close_trans;
+ cache_name = kasprintf(GFP_KERNEL,
+ "9p-fcall-cache-%u", atomic_inc_return(&seqno));
+ if (!cache_name) {
+ err = -ENOMEM;
+ goto close_trans;
+ }
+
+ /* P9_HDRSZ + 4 is the smallest packet header we can have that is
+ * followed by data accessed from userspace by read
+ */
+ clnt->fcall_cache =
+ kmem_cache_create_usercopy(cache_name, clnt->msize,
+ 0, 0, P9_HDRSZ + 4,
+ clnt->msize - (P9_HDRSZ + 4),
+ NULL);
+
+ kfree(cache_name);
return clnt;
close_trans:
clnt->trans_mod->close(clnt);
put_trans:
v9fs_put_trans(clnt->trans_mod);
-destroy_tagpool:
- p9_idpool_destroy(clnt->tagpool);
free_client:
kfree(clnt);
return ERR_PTR(err);
@@ -1092,6 +962,7 @@ void p9_client_destroy(struct p9_client *clnt)
p9_tag_cleanup(clnt);
+ kmem_cache_destroy(clnt->fcall_cache);
kfree(clnt);
}
EXPORT_SYMBOL(p9_client_destroy);
@@ -1111,14 +982,14 @@ void p9_client_begin_disconnect(struct p9_client *clnt)
EXPORT_SYMBOL(p9_client_begin_disconnect);
struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid,
- const char *uname, kuid_t n_uname, const char *aname)
+ const char *uname, kuid_t n_uname,
+ const char *aname)
{
- int err = 0;
+ int err;
struct p9_req_t *req;
struct p9_fid *fid;
struct p9_qid qid;
-
p9_debug(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n",
afid ? afid->fid : -1, uname, aname);
fid = p9_fid_create(clnt);
@@ -1129,25 +1000,25 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid,
fid->uid = n_uname;
req = p9_client_rpc(clnt, P9_TATTACH, "ddss?u", fid->fid,
- afid ? afid->fid : P9_NOFID, uname, aname, n_uname);
+ afid ? afid->fid : P9_NOFID, uname, aname, n_uname);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto error;
}
- err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid);
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", &qid);
if (err) {
- trace_9p_protocol_dump(clnt, req->rc);
- p9_free_req(clnt, req);
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
goto error;
}
p9_debug(P9_DEBUG_9P, "<<< RATTACH qid %x.%llx.%x\n",
- qid.type, (unsigned long long)qid.path, qid.version);
+ qid.type, qid.path, qid.version);
memmove(&fid->qid, &qid, sizeof(struct p9_qid));
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
return fid;
error:
@@ -1158,16 +1029,15 @@ error:
EXPORT_SYMBOL(p9_client_attach);
struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname,
- const unsigned char * const *wnames, int clone)
+ const unsigned char * const *wnames, int clone)
{
int err;
struct p9_client *clnt;
struct p9_fid *fid;
struct p9_qid *wqids;
struct p9_req_t *req;
- uint16_t nwqids, count;
+ u16 nwqids, count;
- err = 0;
wqids = NULL;
clnt = oldfid->clnt;
if (clone) {
@@ -1178,27 +1048,26 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname,
}
fid->uid = oldfid->uid;
- } else
+ } else {
fid = oldfid;
-
+ }
p9_debug(P9_DEBUG_9P, ">>> TWALK fids %d,%d nwname %ud wname[0] %s\n",
oldfid->fid, fid->fid, nwname, wnames ? wnames[0] : NULL);
-
req = p9_client_rpc(clnt, P9_TWALK, "ddT", oldfid->fid, fid->fid,
- nwname, wnames);
+ nwname, wnames);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto error;
}
- err = p9pdu_readf(req->rc, clnt->proto_version, "R", &nwqids, &wqids);
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "R", &nwqids, &wqids);
if (err) {
- trace_9p_protocol_dump(clnt, req->rc);
- p9_free_req(clnt, req);
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
goto clunk_fid;
}
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
p9_debug(P9_DEBUG_9P, "<<< RWALK nwqid %d:\n", nwqids);
@@ -1209,25 +1078,25 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname,
for (count = 0; count < nwqids; count++)
p9_debug(P9_DEBUG_9P, "<<< [%d] %x.%llx.%x\n",
- count, wqids[count].type,
- (unsigned long long)wqids[count].path,
- wqids[count].version);
+ count, wqids[count].type,
+ wqids[count].path,
+ wqids[count].version);
if (nwname)
memmove(&fid->qid, &wqids[nwqids - 1], sizeof(struct p9_qid));
else
- fid->qid = oldfid->qid;
+ memmove(&fid->qid, &oldfid->qid, sizeof(struct p9_qid));
kfree(wqids);
return fid;
clunk_fid:
kfree(wqids);
- p9_client_clunk(fid);
+ p9_fid_put(fid);
fid = NULL;
error:
- if (fid && (fid != oldfid))
+ if (fid && fid != oldfid)
p9_fid_destroy(fid);
return ERR_PTR(err);
@@ -1244,81 +1113,80 @@ int p9_client_open(struct p9_fid *fid, int mode)
clnt = fid->clnt;
p9_debug(P9_DEBUG_9P, ">>> %s fid %d mode %d\n",
- p9_is_proto_dotl(clnt) ? "TLOPEN" : "TOPEN", fid->fid, mode);
- err = 0;
+ p9_is_proto_dotl(clnt) ? "TLOPEN" : "TOPEN", fid->fid, mode);
if (fid->mode != -1)
return -EINVAL;
if (p9_is_proto_dotl(clnt))
- req = p9_client_rpc(clnt, P9_TLOPEN, "dd", fid->fid, mode);
+ req = p9_client_rpc(clnt, P9_TLOPEN, "dd", fid->fid, mode & P9L_MODE_MASK);
else
- req = p9_client_rpc(clnt, P9_TOPEN, "db", fid->fid, mode);
+ req = p9_client_rpc(clnt, P9_TOPEN, "db", fid->fid, mode & P9L_MODE_MASK);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto error;
}
- err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit);
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit);
if (err) {
- trace_9p_protocol_dump(clnt, req->rc);
+ trace_9p_protocol_dump(clnt, &req->rc);
goto free_and_error;
}
p9_debug(P9_DEBUG_9P, "<<< %s qid %x.%llx.%x iounit %x\n",
- p9_is_proto_dotl(clnt) ? "RLOPEN" : "ROPEN", qid.type,
- (unsigned long long)qid.path, qid.version, iounit);
+ p9_is_proto_dotl(clnt) ? "RLOPEN" : "ROPEN", qid.type,
+ qid.path, qid.version, iounit);
+ memmove(&fid->qid, &qid, sizeof(struct p9_qid));
fid->mode = mode;
fid->iounit = iounit;
free_and_error:
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
error:
return err;
}
EXPORT_SYMBOL(p9_client_open);
-int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32 mode,
- kgid_t gid, struct p9_qid *qid)
+int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags,
+ u32 mode, kgid_t gid, struct p9_qid *qid)
{
- int err = 0;
+ int err;
struct p9_client *clnt;
struct p9_req_t *req;
int iounit;
p9_debug(P9_DEBUG_9P,
- ">>> TLCREATE fid %d name %s flags %d mode %d gid %d\n",
- ofid->fid, name, flags, mode,
- from_kgid(&init_user_ns, gid));
+ ">>> TLCREATE fid %d name %s flags %d mode %d gid %d\n",
+ ofid->fid, name, flags, mode,
+ from_kgid(&init_user_ns, gid));
clnt = ofid->clnt;
if (ofid->mode != -1)
return -EINVAL;
req = p9_client_rpc(clnt, P9_TLCREATE, "dsddg", ofid->fid, name, flags,
- mode, gid);
+ mode & P9L_MODE_MASK, gid);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto error;
}
- err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", qid, &iounit);
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", qid, &iounit);
if (err) {
- trace_9p_protocol_dump(clnt, req->rc);
+ trace_9p_protocol_dump(clnt, &req->rc);
goto free_and_error;
}
p9_debug(P9_DEBUG_9P, "<<< RLCREATE qid %x.%llx.%x iounit %x\n",
- qid->type,
- (unsigned long long)qid->path,
- qid->version, iounit);
+ qid->type, qid->path, qid->version, iounit);
- ofid->mode = mode;
+ memmove(&ofid->qid, qid, sizeof(struct p9_qid));
+ ofid->mode = flags;
ofid->iounit = iounit;
free_and_error:
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
error:
return err;
}
@@ -1334,70 +1202,68 @@ int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode,
int iounit;
p9_debug(P9_DEBUG_9P, ">>> TCREATE fid %d name %s perm %d mode %d\n",
- fid->fid, name, perm, mode);
- err = 0;
+ fid->fid, name, perm, mode);
clnt = fid->clnt;
if (fid->mode != -1)
return -EINVAL;
req = p9_client_rpc(clnt, P9_TCREATE, "dsdb?s", fid->fid, name, perm,
- mode, extension);
+ mode & P9L_MODE_MASK, extension);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto error;
}
- err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit);
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit);
if (err) {
- trace_9p_protocol_dump(clnt, req->rc);
+ trace_9p_protocol_dump(clnt, &req->rc);
goto free_and_error;
}
p9_debug(P9_DEBUG_9P, "<<< RCREATE qid %x.%llx.%x iounit %x\n",
- qid.type,
- (unsigned long long)qid.path,
- qid.version, iounit);
+ qid.type, qid.path, qid.version, iounit);
+ memmove(&fid->qid, &qid, sizeof(struct p9_qid));
fid->mode = mode;
fid->iounit = iounit;
free_and_error:
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
error:
return err;
}
EXPORT_SYMBOL(p9_client_fcreate);
int p9_client_symlink(struct p9_fid *dfid, const char *name,
- const char *symtgt, kgid_t gid, struct p9_qid *qid)
+ const char *symtgt, kgid_t gid, struct p9_qid *qid)
{
- int err = 0;
+ int err;
struct p9_client *clnt;
struct p9_req_t *req;
p9_debug(P9_DEBUG_9P, ">>> TSYMLINK dfid %d name %s symtgt %s\n",
- dfid->fid, name, symtgt);
+ dfid->fid, name, symtgt);
clnt = dfid->clnt;
req = p9_client_rpc(clnt, P9_TSYMLINK, "dssg", dfid->fid, name, symtgt,
- gid);
+ gid);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto error;
}
- err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid);
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid);
if (err) {
- trace_9p_protocol_dump(clnt, req->rc);
+ trace_9p_protocol_dump(clnt, &req->rc);
goto free_and_error;
}
p9_debug(P9_DEBUG_9P, "<<< RSYMLINK qid %x.%llx.%x\n",
- qid->type, (unsigned long long)qid->path, qid->version);
+ qid->type, qid->path, qid->version);
free_and_error:
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
error:
return err;
}
@@ -1409,28 +1275,27 @@ int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, const char *newna
struct p9_req_t *req;
p9_debug(P9_DEBUG_9P, ">>> TLINK dfid %d oldfid %d newname %s\n",
- dfid->fid, oldfid->fid, newname);
+ dfid->fid, oldfid->fid, newname);
clnt = dfid->clnt;
req = p9_client_rpc(clnt, P9_TLINK, "dds", dfid->fid, oldfid->fid,
- newname);
+ newname);
if (IS_ERR(req))
return PTR_ERR(req);
p9_debug(P9_DEBUG_9P, "<<< RLINK\n");
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
return 0;
}
EXPORT_SYMBOL(p9_client_link);
int p9_client_fsync(struct p9_fid *fid, int datasync)
{
- int err;
+ int err = 0;
struct p9_client *clnt;
struct p9_req_t *req;
p9_debug(P9_DEBUG_9P, ">>> TFSYNC fid %d datasync:%d\n",
- fid->fid, datasync);
- err = 0;
+ fid->fid, datasync);
clnt = fid->clnt;
req = p9_client_rpc(clnt, P9_TFSYNC, "dd", fid->fid, datasync);
@@ -1441,7 +1306,7 @@ int p9_client_fsync(struct p9_fid *fid, int datasync)
p9_debug(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid);
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
error:
return err;
@@ -1450,22 +1315,14 @@ EXPORT_SYMBOL(p9_client_fsync);
int p9_client_clunk(struct p9_fid *fid)
{
- int err;
+ int err = 0;
struct p9_client *clnt;
struct p9_req_t *req;
int retries = 0;
- if (!fid) {
- pr_warn("%s (%d): Trying to clunk with NULL fid\n",
- __func__, task_pid_nr(current));
- dump_stack();
- return 0;
- }
-
again:
- p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d (try %d)\n", fid->fid,
- retries);
- err = 0;
+ p9_debug(P9_DEBUG_9P, ">>> TCLUNK fid %d (try %d)\n",
+ fid->fid, retries);
clnt = fid->clnt;
req = p9_client_rpc(clnt, P9_TCLUNK, "d", fid->fid);
@@ -1476,30 +1333,29 @@ again:
p9_debug(P9_DEBUG_9P, "<<< RCLUNK fid %d\n", fid->fid);
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
error:
- /*
- * Fid is not valid even after a failed clunk
+ /* Fid is not valid even after a failed clunk
* If interrupted, retry once then give up and
* leak fid until umount.
*/
if (err == -ERESTARTSYS) {
if (retries++ == 0)
goto again;
- } else
+ } else {
p9_fid_destroy(fid);
+ }
return err;
}
EXPORT_SYMBOL(p9_client_clunk);
int p9_client_remove(struct p9_fid *fid)
{
- int err;
+ int err = 0;
struct p9_client *clnt;
struct p9_req_t *req;
p9_debug(P9_DEBUG_9P, ">>> TREMOVE fid %d\n", fid->fid);
- err = 0;
clnt = fid->clnt;
req = p9_client_rpc(clnt, P9_TREMOVE, "d", fid->fid);
@@ -1510,10 +1366,10 @@ int p9_client_remove(struct p9_fid *fid)
p9_debug(P9_DEBUG_9P, "<<< RREMOVE fid %d\n", fid->fid);
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
error:
if (err == -ERESTARTSYS)
- p9_client_clunk(fid);
+ p9_fid_put(fid);
else
p9_fid_destroy(fid);
return err;
@@ -1527,7 +1383,7 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags)
struct p9_client *clnt;
p9_debug(P9_DEBUG_9P, ">>> TUNLINKAT fid %d %s %d\n",
- dfid->fid, name, flags);
+ dfid->fid, name, flags);
clnt = dfid->clnt;
req = p9_client_rpc(clnt, P9_TUNLINKAT, "dsd", dfid->fid, name, flags);
@@ -1537,7 +1393,7 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags)
}
p9_debug(P9_DEBUG_9P, "<<< RUNLINKAT fid %d %s\n", dfid->fid, name);
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
error:
return err;
}
@@ -1546,82 +1402,97 @@ EXPORT_SYMBOL(p9_client_unlinkat);
int
p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err)
{
- struct p9_client *clnt = fid->clnt;
- struct p9_req_t *req;
int total = 0;
*err = 0;
- p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %d\n",
- fid->fid, (unsigned long long) offset, (int)iov_iter_count(to));
-
while (iov_iter_count(to)) {
- int count = iov_iter_count(to);
- int rsize, non_zc = 0;
- char *dataptr;
+ int count;
- rsize = fid->iounit;
- if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
- rsize = clnt->msize - P9_IOHDRSZ;
+ count = p9_client_read_once(fid, offset, to, err);
+ if (!count || *err)
+ break;
+ offset += count;
+ total += count;
+ }
+ return total;
+}
+EXPORT_SYMBOL(p9_client_read);
- if (count < rsize)
- rsize = count;
+int
+p9_client_read_once(struct p9_fid *fid, u64 offset, struct iov_iter *to,
+ int *err)
+{
+ struct p9_client *clnt = fid->clnt;
+ struct p9_req_t *req;
+ int count = iov_iter_count(to);
+ u32 rsize, received;
+ bool non_zc = false;
+ char *dataptr;
- /* Don't bother zerocopy for small IO (< 1024) */
- if (clnt->trans_mod->zc_request && rsize > 1024) {
- /*
- * response header len is 11
- * PDU Header(7) + IO Size (4)
- */
- req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize,
- 0, 11, "dqd", fid->fid,
- offset, rsize);
- } else {
- non_zc = 1;
- req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset,
- rsize);
- }
- if (IS_ERR(req)) {
- *err = PTR_ERR(req);
- break;
- }
+ *err = 0;
+ p9_debug(P9_DEBUG_9P, ">>> TREAD fid %d offset %llu %zu\n",
+ fid->fid, offset, iov_iter_count(to));
- *err = p9pdu_readf(req->rc, clnt->proto_version,
- "D", &count, &dataptr);
- if (*err) {
- trace_9p_protocol_dump(clnt, req->rc);
- p9_free_req(clnt, req);
- break;
- }
- if (rsize < count) {
- pr_err("bogus RREAD count (%d > %d)\n", count, rsize);
- count = rsize;
- }
+ rsize = fid->iounit;
+ if (!rsize || rsize > clnt->msize - P9_IOHDRSZ)
+ rsize = clnt->msize - P9_IOHDRSZ;
- p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count);
- if (!count) {
- p9_free_req(clnt, req);
- break;
- }
+ if (count < rsize)
+ rsize = count;
- if (non_zc) {
- int n = copy_to_iter(dataptr, count, to);
- total += n;
- offset += n;
- if (n != count) {
- *err = -EFAULT;
- p9_free_req(clnt, req);
- break;
- }
- } else {
- iov_iter_advance(to, count);
- total += count;
- offset += count;
+ /* Don't bother zerocopy for small IO (< 1024) */
+ if (clnt->trans_mod->zc_request && rsize > 1024) {
+ /* response header len is 11
+ * PDU Header(7) + IO Size (4)
+ */
+ req = p9_client_zc_rpc(clnt, P9_TREAD, to, NULL, rsize,
+ 0, 11, "dqd", fid->fid,
+ offset, rsize);
+ } else {
+ non_zc = true;
+ req = p9_client_rpc(clnt, P9_TREAD, "dqd", fid->fid, offset,
+ rsize);
+ }
+ if (IS_ERR(req)) {
+ *err = PTR_ERR(req);
+ if (!non_zc)
+ iov_iter_revert(to, count - iov_iter_count(to));
+ return 0;
+ }
+
+ *err = p9pdu_readf(&req->rc, clnt->proto_version,
+ "D", &received, &dataptr);
+ if (*err) {
+ if (!non_zc)
+ iov_iter_revert(to, count - iov_iter_count(to));
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
+ return 0;
+ }
+ if (rsize < received) {
+ pr_err("bogus RREAD count (%u > %u)\n", received, rsize);
+ *err = -EIO;
+ p9_req_put(clnt, req);
+ return 0;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RREAD count %u\n", received);
+
+ if (non_zc) {
+ int n = copy_to_iter(dataptr, received, to);
+
+ if (n != received) {
+ *err = -EFAULT;
+ p9_req_put(clnt, req);
+ return n;
}
- p9_free_req(clnt, req);
+ } else {
+ iov_iter_revert(to, count - received - iov_iter_count(to));
}
- return total;
+ p9_req_put(clnt, req);
+ return received;
}
-EXPORT_SYMBOL(p9_client_read);
+EXPORT_SYMBOL(p9_client_read_once);
int
p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
@@ -1631,19 +1502,20 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
int total = 0;
*err = 0;
- p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %zd\n",
- fid->fid, (unsigned long long) offset,
- iov_iter_count(from));
-
while (iov_iter_count(from)) {
- int count = iov_iter_count(from);
- int rsize = fid->iounit;
- if (!rsize || rsize > clnt->msize-P9_IOHDRSZ)
+ size_t count = iov_iter_count(from);
+ u32 rsize = fid->iounit;
+ u32 written;
+
+ if (!rsize || rsize > clnt->msize - P9_IOHDRSZ)
rsize = clnt->msize - P9_IOHDRSZ;
if (count < rsize)
rsize = count;
+ p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu count %u (/%zu)\n",
+ fid->fid, offset, rsize, count);
+
/* Don't bother zerocopy for small IO (< 1024) */
if (clnt->trans_mod->zc_request && rsize > 1024) {
req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, from, 0,
@@ -1651,49 +1523,102 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err)
fid->fid, offset, rsize);
} else {
req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid,
- offset, rsize, from);
+ offset, rsize, from);
}
if (IS_ERR(req)) {
+ iov_iter_revert(from, count - iov_iter_count(from));
*err = PTR_ERR(req);
break;
}
- *err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count);
+ *err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &written);
if (*err) {
- trace_9p_protocol_dump(clnt, req->rc);
- p9_free_req(clnt, req);
+ iov_iter_revert(from, count - iov_iter_count(from));
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
break;
}
- if (rsize < count) {
- pr_err("bogus RWRITE count (%d > %d)\n", count, rsize);
- count = rsize;
+ if (rsize < written) {
+ pr_err("bogus RWRITE count (%u > %u)\n", written, rsize);
+ *err = -EIO;
+ iov_iter_revert(from, count - iov_iter_count(from));
+ p9_req_put(clnt, req);
+ break;
}
- p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count);
+ p9_debug(P9_DEBUG_9P, "<<< RWRITE count %u\n", written);
- p9_free_req(clnt, req);
- iov_iter_advance(from, count);
- total += count;
- offset += count;
+ p9_req_put(clnt, req);
+ iov_iter_revert(from, count - written - iov_iter_count(from));
+ total += written;
+ offset += written;
}
return total;
}
EXPORT_SYMBOL(p9_client_write);
+void
+p9_client_write_subreq(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *wreq = subreq->rreq;
+ struct p9_fid *fid = wreq->netfs_priv;
+ struct p9_client *clnt = fid->clnt;
+ struct p9_req_t *req;
+ unsigned long long start = subreq->start + subreq->transferred;
+ int written, len = subreq->len - subreq->transferred;
+ int err;
+
+ p9_debug(P9_DEBUG_9P, ">>> TWRITE fid %d offset %llu len %d\n",
+ fid->fid, start, len);
+
+ /* Don't bother zerocopy for small IO (< 1024) */
+ if (clnt->trans_mod->zc_request && len > 1024) {
+ req = p9_client_zc_rpc(clnt, P9_TWRITE, NULL, &subreq->io_iter,
+ 0, wreq->len, P9_ZC_HDR_SZ, "dqd",
+ fid->fid, start, len);
+ } else {
+ req = p9_client_rpc(clnt, P9_TWRITE, "dqV", fid->fid,
+ start, len, &subreq->io_iter);
+ }
+ if (IS_ERR(req)) {
+ netfs_write_subrequest_terminated(subreq, PTR_ERR(req));
+ return;
+ }
+
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &written);
+ if (err) {
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
+ netfs_write_subrequest_terminated(subreq, err);
+ return;
+ }
+
+ if (written > len) {
+ pr_err("bogus RWRITE count (%d > %u)\n", written, len);
+ written = -EIO;
+ }
+
+ p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", len);
+
+ p9_req_put(clnt, req);
+ netfs_write_subrequest_terminated(subreq, written);
+}
+EXPORT_SYMBOL(p9_client_write_subreq);
+
struct p9_wstat *p9_client_stat(struct p9_fid *fid)
{
int err;
struct p9_client *clnt;
- struct p9_wstat *ret = kmalloc(sizeof(struct p9_wstat), GFP_KERNEL);
+ struct p9_wstat *ret;
struct p9_req_t *req;
u16 ignored;
p9_debug(P9_DEBUG_9P, ">>> TSTAT fid %d\n", fid->fid);
+ ret = kmalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return ERR_PTR(-ENOMEM);
- err = 0;
clnt = fid->clnt;
req = p9_client_rpc(clnt, P9_TSTAT, "d", fid->fid);
@@ -1702,27 +1627,27 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid)
goto error;
}
- err = p9pdu_readf(req->rc, clnt->proto_version, "wS", &ignored, ret);
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "wS", &ignored, ret);
if (err) {
- trace_9p_protocol_dump(clnt, req->rc);
- p9_free_req(clnt, req);
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
goto error;
}
p9_debug(P9_DEBUG_9P,
- "<<< RSTAT sz=%x type=%x dev=%x qid=%x.%llx.%x\n"
- "<<< mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n"
- "<<< name=%s uid=%s gid=%s muid=%s extension=(%s)\n"
- "<<< uid=%d gid=%d n_muid=%d\n",
- ret->size, ret->type, ret->dev, ret->qid.type,
- (unsigned long long)ret->qid.path, ret->qid.version, ret->mode,
- ret->atime, ret->mtime, (unsigned long long)ret->length,
- ret->name, ret->uid, ret->gid, ret->muid, ret->extension,
- from_kuid(&init_user_ns, ret->n_uid),
- from_kgid(&init_user_ns, ret->n_gid),
- from_kuid(&init_user_ns, ret->n_muid));
-
- p9_free_req(clnt, req);
+ "<<< RSTAT sz=%x type=%x dev=%x qid=%x.%llx.%x\n"
+ "<<< mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n"
+ "<<< name=%s uid=%s gid=%s muid=%s extension=(%s)\n"
+ "<<< uid=%d gid=%d n_muid=%d\n",
+ ret->size, ret->type, ret->dev, ret->qid.type, ret->qid.path,
+ ret->qid.version, ret->mode,
+ ret->atime, ret->mtime, ret->length,
+ ret->name, ret->uid, ret->gid, ret->muid, ret->extension,
+ from_kuid(&init_user_ns, ret->n_uid),
+ from_kgid(&init_user_ns, ret->n_gid),
+ from_kuid(&init_user_ns, ret->n_muid));
+
+ p9_req_put(clnt, req);
return ret;
error:
@@ -1732,21 +1657,20 @@ error:
EXPORT_SYMBOL(p9_client_stat);
struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid,
- u64 request_mask)
+ u64 request_mask)
{
int err;
struct p9_client *clnt;
- struct p9_stat_dotl *ret = kmalloc(sizeof(struct p9_stat_dotl),
- GFP_KERNEL);
+ struct p9_stat_dotl *ret;
struct p9_req_t *req;
p9_debug(P9_DEBUG_9P, ">>> TGETATTR fid %d, request_mask %lld\n",
- fid->fid, request_mask);
+ fid->fid, request_mask);
+ ret = kmalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return ERR_PTR(-ENOMEM);
- err = 0;
clnt = fid->clnt;
req = p9_client_rpc(clnt, P9_TGETATTR, "dq", fid->fid, request_mask);
@@ -1755,35 +1679,36 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid,
goto error;
}
- err = p9pdu_readf(req->rc, clnt->proto_version, "A", ret);
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "A", ret);
if (err) {
- trace_9p_protocol_dump(clnt, req->rc);
- p9_free_req(clnt, req);
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
goto error;
}
- p9_debug(P9_DEBUG_9P,
- "<<< RGETATTR st_result_mask=%lld\n"
- "<<< qid=%x.%llx.%x\n"
- "<<< st_mode=%8.8x st_nlink=%llu\n"
- "<<< st_uid=%d st_gid=%d\n"
- "<<< st_rdev=%llx st_size=%llx st_blksize=%llu st_blocks=%llu\n"
- "<<< st_atime_sec=%lld st_atime_nsec=%lld\n"
- "<<< st_mtime_sec=%lld st_mtime_nsec=%lld\n"
- "<<< st_ctime_sec=%lld st_ctime_nsec=%lld\n"
- "<<< st_btime_sec=%lld st_btime_nsec=%lld\n"
- "<<< st_gen=%lld st_data_version=%lld\n",
- ret->st_result_mask, ret->qid.type, ret->qid.path,
- ret->qid.version, ret->st_mode, ret->st_nlink,
- from_kuid(&init_user_ns, ret->st_uid),
- from_kgid(&init_user_ns, ret->st_gid),
- ret->st_rdev, ret->st_size, ret->st_blksize,
- ret->st_blocks, ret->st_atime_sec, ret->st_atime_nsec,
- ret->st_mtime_sec, ret->st_mtime_nsec, ret->st_ctime_sec,
- ret->st_ctime_nsec, ret->st_btime_sec, ret->st_btime_nsec,
- ret->st_gen, ret->st_data_version);
-
- p9_free_req(clnt, req);
+ p9_debug(P9_DEBUG_9P, "<<< RGETATTR st_result_mask=%lld\n"
+ "<<< qid=%x.%llx.%x\n"
+ "<<< st_mode=%8.8x st_nlink=%llu\n"
+ "<<< st_uid=%d st_gid=%d\n"
+ "<<< st_rdev=%llx st_size=%llx st_blksize=%llu st_blocks=%llu\n"
+ "<<< st_atime_sec=%lld st_atime_nsec=%lld\n"
+ "<<< st_mtime_sec=%lld st_mtime_nsec=%lld\n"
+ "<<< st_ctime_sec=%lld st_ctime_nsec=%lld\n"
+ "<<< st_btime_sec=%lld st_btime_nsec=%lld\n"
+ "<<< st_gen=%lld st_data_version=%lld\n",
+ ret->st_result_mask,
+ ret->qid.type, ret->qid.path, ret->qid.version,
+ ret->st_mode, ret->st_nlink,
+ from_kuid(&init_user_ns, ret->st_uid),
+ from_kgid(&init_user_ns, ret->st_gid),
+ ret->st_rdev, ret->st_size, ret->st_blksize, ret->st_blocks,
+ ret->st_atime_sec, ret->st_atime_nsec,
+ ret->st_mtime_sec, ret->st_mtime_nsec,
+ ret->st_ctime_sec, ret->st_ctime_nsec,
+ ret->st_btime_sec, ret->st_btime_nsec,
+ ret->st_gen, ret->st_data_version);
+
+ p9_req_put(clnt, req);
return ret;
error:
@@ -1800,7 +1725,7 @@ static int p9_client_statsize(struct p9_wstat *wst, int proto_version)
/* size[2] type[2] dev[4] qid[13] */
/* mode[4] atime[4] mtime[4] length[8]*/
/* name[s] uid[s] gid[s] muid[s] */
- ret = 2+4+13+4+4+4+8+2+2+2+2;
+ ret = 2 + 4 + 13 + 4 + 4 + 4 + 8 + 2 + 2 + 2 + 2;
if (wst->name)
ret += strlen(wst->name);
@@ -1811,9 +1736,10 @@ static int p9_client_statsize(struct p9_wstat *wst, int proto_version)
if (wst->muid)
ret += strlen(wst->muid);
- if ((proto_version == p9_proto_2000u) ||
- (proto_version == p9_proto_2000L)) {
- ret += 2+4+4+4; /* extension[s] n_uid[4] n_gid[4] n_muid[4] */
+ if (proto_version == p9_proto_2000u ||
+ proto_version == p9_proto_2000L) {
+ /* extension[s] n_uid[4] n_gid[4] n_muid[4] */
+ ret += 2 + 4 + 4 + 4;
if (wst->extension)
ret += strlen(wst->extension);
}
@@ -1823,28 +1749,29 @@ static int p9_client_statsize(struct p9_wstat *wst, int proto_version)
int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst)
{
- int err;
+ int err = 0;
struct p9_req_t *req;
struct p9_client *clnt;
- err = 0;
clnt = fid->clnt;
wst->size = p9_client_statsize(wst, clnt->proto_version);
- p9_debug(P9_DEBUG_9P, ">>> TWSTAT fid %d\n", fid->fid);
+ p9_debug(P9_DEBUG_9P, ">>> TWSTAT fid %d\n",
+ fid->fid);
p9_debug(P9_DEBUG_9P,
- " sz=%x type=%x dev=%x qid=%x.%llx.%x\n"
- " mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n"
- " name=%s uid=%s gid=%s muid=%s extension=(%s)\n"
- " uid=%d gid=%d n_muid=%d\n",
- wst->size, wst->type, wst->dev, wst->qid.type,
- (unsigned long long)wst->qid.path, wst->qid.version, wst->mode,
- wst->atime, wst->mtime, (unsigned long long)wst->length,
- wst->name, wst->uid, wst->gid, wst->muid, wst->extension,
- from_kuid(&init_user_ns, wst->n_uid),
- from_kgid(&init_user_ns, wst->n_gid),
- from_kuid(&init_user_ns, wst->n_muid));
-
- req = p9_client_rpc(clnt, P9_TWSTAT, "dwS", fid->fid, wst->size+2, wst);
+ " sz=%x type=%x dev=%x qid=%x.%llx.%x\n"
+ " mode=%8.8x atime=%8.8x mtime=%8.8x length=%llx\n"
+ " name=%s uid=%s gid=%s muid=%s extension=(%s)\n"
+ " uid=%d gid=%d n_muid=%d\n",
+ wst->size, wst->type, wst->dev, wst->qid.type,
+ wst->qid.path, wst->qid.version,
+ wst->mode, wst->atime, wst->mtime, wst->length,
+ wst->name, wst->uid, wst->gid, wst->muid, wst->extension,
+ from_kuid(&init_user_ns, wst->n_uid),
+ from_kgid(&init_user_ns, wst->n_gid),
+ from_kuid(&init_user_ns, wst->n_muid));
+
+ req = p9_client_rpc(clnt, P9_TWSTAT, "dwS",
+ fid->fid, wst->size + 2, wst);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto error;
@@ -1852,7 +1779,7 @@ int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst)
p9_debug(P9_DEBUG_9P, "<<< RWSTAT fid %d\n", fid->fid);
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
error:
return err;
}
@@ -1860,22 +1787,21 @@ EXPORT_SYMBOL(p9_client_wstat);
int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr)
{
- int err;
+ int err = 0;
struct p9_req_t *req;
struct p9_client *clnt;
- err = 0;
clnt = fid->clnt;
p9_debug(P9_DEBUG_9P, ">>> TSETATTR fid %d\n", fid->fid);
- p9_debug(P9_DEBUG_9P,
- " valid=%x mode=%x uid=%d gid=%d size=%lld\n"
- " atime_sec=%lld atime_nsec=%lld\n"
- " mtime_sec=%lld mtime_nsec=%lld\n",
- p9attr->valid, p9attr->mode,
- from_kuid(&init_user_ns, p9attr->uid),
- from_kgid(&init_user_ns, p9attr->gid),
- p9attr->size, p9attr->atime_sec, p9attr->atime_nsec,
- p9attr->mtime_sec, p9attr->mtime_nsec);
+ p9_debug(P9_DEBUG_9P, " valid=%x mode=%x uid=%d gid=%d size=%lld\n",
+ p9attr->valid, p9attr->mode,
+ from_kuid(&init_user_ns, p9attr->uid),
+ from_kgid(&init_user_ns, p9attr->gid),
+ p9attr->size);
+ p9_debug(P9_DEBUG_9P, " atime_sec=%lld atime_nsec=%lld\n",
+ p9attr->atime_sec, p9attr->atime_nsec);
+ p9_debug(P9_DEBUG_9P, " mtime_sec=%lld mtime_nsec=%lld\n",
+ p9attr->mtime_sec, p9attr->mtime_nsec);
req = p9_client_rpc(clnt, P9_TSETATTR, "dI", fid->fid, p9attr);
@@ -1884,7 +1810,7 @@ int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr)
goto error;
}
p9_debug(P9_DEBUG_9P, "<<< RSETATTR fid %d\n", fid->fid);
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
error:
return err;
}
@@ -1896,7 +1822,6 @@ int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb)
struct p9_req_t *req;
struct p9_client *clnt;
- err = 0;
clnt = fid->clnt;
p9_debug(P9_DEBUG_9P, ">>> TSTATFS fid %d\n", fid->fid);
@@ -1907,23 +1832,21 @@ int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb)
goto error;
}
- err = p9pdu_readf(req->rc, clnt->proto_version, "ddqqqqqqd", &sb->type,
- &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail,
- &sb->files, &sb->ffree, &sb->fsid, &sb->namelen);
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "ddqqqqqqd", &sb->type,
+ &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail,
+ &sb->files, &sb->ffree, &sb->fsid, &sb->namelen);
if (err) {
- trace_9p_protocol_dump(clnt, req->rc);
- p9_free_req(clnt, req);
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
goto error;
}
- p9_debug(P9_DEBUG_9P, "<<< RSTATFS fid %d type 0x%lx bsize %ld "
- "blocks %llu bfree %llu bavail %llu files %llu ffree %llu "
- "fsid %llu namelen %ld\n",
- fid->fid, (long unsigned int)sb->type, (long int)sb->bsize,
- sb->blocks, sb->bfree, sb->bavail, sb->files, sb->ffree,
- sb->fsid, (long int)sb->namelen);
+ p9_debug(P9_DEBUG_9P,
+ "<<< RSTATFS fid %d type 0x%x bsize %u blocks %llu bfree %llu bavail %llu files %llu ffree %llu fsid %llu namelen %u\n",
+ fid->fid, sb->type, sb->bsize, sb->blocks, sb->bfree,
+ sb->bavail, sb->files, sb->ffree, sb->fsid, sb->namelen);
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
error:
return err;
}
@@ -1932,18 +1855,17 @@ EXPORT_SYMBOL(p9_client_statfs);
int p9_client_rename(struct p9_fid *fid,
struct p9_fid *newdirfid, const char *name)
{
- int err;
+ int err = 0;
struct p9_req_t *req;
struct p9_client *clnt;
- err = 0;
clnt = fid->clnt;
p9_debug(P9_DEBUG_9P, ">>> TRENAME fid %d newdirfid %d name %s\n",
- fid->fid, newdirfid->fid, name);
+ fid->fid, newdirfid->fid, name);
req = p9_client_rpc(clnt, P9_TRENAME, "dds", fid->fid,
- newdirfid->fid, name);
+ newdirfid->fid, name);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto error;
@@ -1951,7 +1873,7 @@ int p9_client_rename(struct p9_fid *fid,
p9_debug(P9_DEBUG_9P, "<<< RRENAME fid %d\n", fid->fid);
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
error:
return err;
}
@@ -1960,16 +1882,15 @@ EXPORT_SYMBOL(p9_client_rename);
int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name,
struct p9_fid *newdirfid, const char *new_name)
{
- int err;
+ int err = 0;
struct p9_req_t *req;
struct p9_client *clnt;
- err = 0;
clnt = olddirfid->clnt;
- p9_debug(P9_DEBUG_9P, ">>> TRENAMEAT olddirfid %d old name %s"
- " newdirfid %d new name %s\n", olddirfid->fid, old_name,
- newdirfid->fid, new_name);
+ p9_debug(P9_DEBUG_9P,
+ ">>> TRENAMEAT olddirfid %d old name %s newdirfid %d new name %s\n",
+ olddirfid->fid, old_name, newdirfid->fid, new_name);
req = p9_client_rpc(clnt, P9_TRENAMEAT, "dsds", olddirfid->fid,
old_name, newdirfid->fid, new_name);
@@ -1979,26 +1900,24 @@ int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name,
}
p9_debug(P9_DEBUG_9P, "<<< RRENAMEAT newdirfid %d new name %s\n",
- newdirfid->fid, new_name);
+ newdirfid->fid, new_name);
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
error:
return err;
}
EXPORT_SYMBOL(p9_client_renameat);
-/*
- * An xattrwalk without @attr_name gives the fid for the lisxattr namespace
+/* An xattrwalk without @attr_name gives the fid for the listxattr namespace
*/
struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid,
- const char *attr_name, u64 *attr_size)
+ const char *attr_name, u64 *attr_size)
{
int err;
struct p9_req_t *req;
struct p9_client *clnt;
struct p9_fid *attr_fid;
- err = 0;
clnt = file_fid->clnt;
attr_fid = p9_fid_create(clnt);
if (!attr_fid) {
@@ -2006,30 +1925,30 @@ struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid,
goto error;
}
p9_debug(P9_DEBUG_9P,
- ">>> TXATTRWALK file_fid %d, attr_fid %d name %s\n",
- file_fid->fid, attr_fid->fid, attr_name);
+ ">>> TXATTRWALK file_fid %d, attr_fid %d name '%s'\n",
+ file_fid->fid, attr_fid->fid, attr_name);
req = p9_client_rpc(clnt, P9_TXATTRWALK, "dds",
- file_fid->fid, attr_fid->fid, attr_name);
+ file_fid->fid, attr_fid->fid, attr_name);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto error;
}
- err = p9pdu_readf(req->rc, clnt->proto_version, "q", attr_size);
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "q", attr_size);
if (err) {
- trace_9p_protocol_dump(clnt, req->rc);
- p9_free_req(clnt, req);
+ trace_9p_protocol_dump(clnt, &req->rc);
+ p9_req_put(clnt, req);
goto clunk_fid;
}
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
p9_debug(P9_DEBUG_9P, "<<< RXATTRWALK fid %d size %llu\n",
- attr_fid->fid, *attr_size);
+ attr_fid->fid, *attr_size);
return attr_fid;
clunk_fid:
- p9_client_clunk(attr_fid);
+ p9_fid_put(attr_fid);
attr_fid = NULL;
error:
- if (attr_fid && (attr_fid != file_fid))
+ if (attr_fid && attr_fid != file_fid)
p9_fid_destroy(attr_fid);
return ERR_PTR(err);
@@ -2037,25 +1956,24 @@ error:
EXPORT_SYMBOL_GPL(p9_client_xattrwalk);
int p9_client_xattrcreate(struct p9_fid *fid, const char *name,
- u64 attr_size, int flags)
+ u64 attr_size, int flags)
{
- int err;
+ int err = 0;
struct p9_req_t *req;
struct p9_client *clnt;
p9_debug(P9_DEBUG_9P,
- ">>> TXATTRCREATE fid %d name %s size %lld flag %d\n",
- fid->fid, name, (long long)attr_size, flags);
- err = 0;
+ ">>> TXATTRCREATE fid %d name %s size %llu flag %d\n",
+ fid->fid, name, attr_size, flags);
clnt = fid->clnt;
req = p9_client_rpc(clnt, P9_TXATTRCREATE, "dsqd",
- fid->fid, name, attr_size, flags);
+ fid->fid, name, attr_size, flags);
if (IS_ERR(req)) {
err = PTR_ERR(req);
goto error;
}
p9_debug(P9_DEBUG_9P, "<<< RXATTRCREATE fid %d\n", fid->fid);
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
error:
return err;
}
@@ -2063,23 +1981,23 @@ EXPORT_SYMBOL_GPL(p9_client_xattrcreate);
int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset)
{
- int err, rsize, non_zc = 0;
+ int err, non_zc = 0;
+ u32 rsize;
struct p9_client *clnt;
struct p9_req_t *req;
char *dataptr;
struct kvec kv = {.iov_base = data, .iov_len = count};
struct iov_iter to;
- iov_iter_kvec(&to, READ | ITER_KVEC, &kv, 1, count);
+ iov_iter_kvec(&to, ITER_DEST, &kv, 1, count);
- p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %d\n",
- fid->fid, (unsigned long long) offset, count);
+ p9_debug(P9_DEBUG_9P, ">>> TREADDIR fid %d offset %llu count %u\n",
+ fid->fid, offset, count);
- err = 0;
clnt = fid->clnt;
rsize = fid->iounit;
- if (!rsize || rsize > clnt->msize-P9_READDIRHDRSZ)
+ if (!rsize || rsize > clnt->msize - P9_READDIRHDRSZ)
rsize = clnt->msize - P9_READDIRHDRSZ;
if (count < rsize)
@@ -2087,8 +2005,7 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset)
/* Don't bother zerocopy for small IO (< 1024) */
if (clnt->trans_mod->zc_request && rsize > 1024) {
- /*
- * response header len is 11
+ /* response header len is 11
* PDU Header(7) + IO Size (4)
*/
req = p9_client_zc_rpc(clnt, P9_TREADDIR, &to, NULL, rsize, 0,
@@ -2103,90 +2020,88 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset)
goto error;
}
- err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr);
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "D", &count, &dataptr);
if (err) {
- trace_9p_protocol_dump(clnt, req->rc);
+ trace_9p_protocol_dump(clnt, &req->rc);
goto free_and_error;
}
if (rsize < count) {
- pr_err("bogus RREADDIR count (%d > %d)\n", count, rsize);
- count = rsize;
+ pr_err("bogus RREADDIR count (%u > %u)\n", count, rsize);
+ err = -EIO;
+ goto free_and_error;
}
- p9_debug(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count);
+ p9_debug(P9_DEBUG_9P, "<<< RREADDIR count %u\n", count);
if (non_zc)
memmove(data, dataptr, count);
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
return count;
free_and_error:
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
error:
return err;
}
EXPORT_SYMBOL(p9_client_readdir);
int p9_client_mknod_dotl(struct p9_fid *fid, const char *name, int mode,
- dev_t rdev, kgid_t gid, struct p9_qid *qid)
+ dev_t rdev, kgid_t gid, struct p9_qid *qid)
{
int err;
struct p9_client *clnt;
struct p9_req_t *req;
- err = 0;
clnt = fid->clnt;
- p9_debug(P9_DEBUG_9P, ">>> TMKNOD fid %d name %s mode %d major %d "
- "minor %d\n", fid->fid, name, mode, MAJOR(rdev), MINOR(rdev));
+ p9_debug(P9_DEBUG_9P,
+ ">>> TMKNOD fid %d name %s mode %d major %d minor %d\n",
+ fid->fid, name, mode, MAJOR(rdev), MINOR(rdev));
req = p9_client_rpc(clnt, P9_TMKNOD, "dsdddg", fid->fid, name, mode,
- MAJOR(rdev), MINOR(rdev), gid);
+ MAJOR(rdev), MINOR(rdev), gid);
if (IS_ERR(req))
return PTR_ERR(req);
- err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid);
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid);
if (err) {
- trace_9p_protocol_dump(clnt, req->rc);
+ trace_9p_protocol_dump(clnt, &req->rc);
goto error;
}
- p9_debug(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n", qid->type,
- (unsigned long long)qid->path, qid->version);
+ p9_debug(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n",
+ qid->type, qid->path, qid->version);
error:
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
return err;
-
}
EXPORT_SYMBOL(p9_client_mknod_dotl);
int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode,
- kgid_t gid, struct p9_qid *qid)
+ kgid_t gid, struct p9_qid *qid)
{
int err;
struct p9_client *clnt;
struct p9_req_t *req;
- err = 0;
clnt = fid->clnt;
p9_debug(P9_DEBUG_9P, ">>> TMKDIR fid %d name %s mode %d gid %d\n",
fid->fid, name, mode, from_kgid(&init_user_ns, gid));
- req = p9_client_rpc(clnt, P9_TMKDIR, "dsdg", fid->fid, name, mode,
- gid);
+ req = p9_client_rpc(clnt, P9_TMKDIR, "dsdg",
+ fid->fid, name, mode, gid);
if (IS_ERR(req))
return PTR_ERR(req);
- err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid);
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid);
if (err) {
- trace_9p_protocol_dump(clnt, req->rc);
+ trace_9p_protocol_dump(clnt, &req->rc);
goto error;
}
p9_debug(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type,
- (unsigned long long)qid->path, qid->version);
+ qid->path, qid->version);
error:
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
return err;
-
}
EXPORT_SYMBOL(p9_client_mkdir_dotl);
@@ -2196,30 +2111,28 @@ int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status)
struct p9_client *clnt;
struct p9_req_t *req;
- err = 0;
clnt = fid->clnt;
- p9_debug(P9_DEBUG_9P, ">>> TLOCK fid %d type %i flags %d "
- "start %lld length %lld proc_id %d client_id %s\n",
- fid->fid, flock->type, flock->flags, flock->start,
- flock->length, flock->proc_id, flock->client_id);
+ p9_debug(P9_DEBUG_9P,
+ ">>> TLOCK fid %d type %i flags %d start %lld length %lld proc_id %d client_id %s\n",
+ fid->fid, flock->type, flock->flags, flock->start,
+ flock->length, flock->proc_id, flock->client_id);
req = p9_client_rpc(clnt, P9_TLOCK, "dbdqqds", fid->fid, flock->type,
- flock->flags, flock->start, flock->length,
- flock->proc_id, flock->client_id);
+ flock->flags, flock->start, flock->length,
+ flock->proc_id, flock->client_id);
if (IS_ERR(req))
return PTR_ERR(req);
- err = p9pdu_readf(req->rc, clnt->proto_version, "b", status);
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "b", status);
if (err) {
- trace_9p_protocol_dump(clnt, req->rc);
+ trace_9p_protocol_dump(clnt, &req->rc);
goto error;
}
p9_debug(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status);
error:
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
return err;
-
}
EXPORT_SYMBOL(p9_client_lock_dotl);
@@ -2229,30 +2142,32 @@ int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock)
struct p9_client *clnt;
struct p9_req_t *req;
- err = 0;
clnt = fid->clnt;
- p9_debug(P9_DEBUG_9P, ">>> TGETLOCK fid %d, type %i start %lld "
- "length %lld proc_id %d client_id %s\n", fid->fid, glock->type,
- glock->start, glock->length, glock->proc_id, glock->client_id);
+ p9_debug(P9_DEBUG_9P,
+ ">>> TGETLOCK fid %d, type %i start %lld length %lld proc_id %d client_id %s\n",
+ fid->fid, glock->type, glock->start, glock->length,
+ glock->proc_id, glock->client_id);
- req = p9_client_rpc(clnt, P9_TGETLOCK, "dbqqds", fid->fid, glock->type,
- glock->start, glock->length, glock->proc_id, glock->client_id);
+ req = p9_client_rpc(clnt, P9_TGETLOCK, "dbqqds", fid->fid,
+ glock->type, glock->start, glock->length,
+ glock->proc_id, glock->client_id);
if (IS_ERR(req))
return PTR_ERR(req);
- err = p9pdu_readf(req->rc, clnt->proto_version, "bqqds", &glock->type,
- &glock->start, &glock->length, &glock->proc_id,
- &glock->client_id);
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "bqqds", &glock->type,
+ &glock->start, &glock->length, &glock->proc_id,
+ &glock->client_id);
if (err) {
- trace_9p_protocol_dump(clnt, req->rc);
+ trace_9p_protocol_dump(clnt, &req->rc);
goto error;
}
- p9_debug(P9_DEBUG_9P, "<<< RGETLOCK type %i start %lld length %lld "
- "proc_id %d client_id %s\n", glock->type, glock->start,
- glock->length, glock->proc_id, glock->client_id);
+ p9_debug(P9_DEBUG_9P,
+ "<<< RGETLOCK type %i start %lld length %lld proc_id %d client_id %s\n",
+ glock->type, glock->start, glock->length,
+ glock->proc_id, glock->client_id);
error:
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
return err;
}
EXPORT_SYMBOL(p9_client_getlock_dotl);
@@ -2263,7 +2178,6 @@ int p9_client_readlink(struct p9_fid *fid, char **target)
struct p9_client *clnt;
struct p9_req_t *req;
- err = 0;
clnt = fid->clnt;
p9_debug(P9_DEBUG_9P, ">>> TREADLINK fid %d\n", fid->fid);
@@ -2271,14 +2185,25 @@ int p9_client_readlink(struct p9_fid *fid, char **target)
if (IS_ERR(req))
return PTR_ERR(req);
- err = p9pdu_readf(req->rc, clnt->proto_version, "s", target);
+ err = p9pdu_readf(&req->rc, clnt->proto_version, "s", target);
if (err) {
- trace_9p_protocol_dump(clnt, req->rc);
+ trace_9p_protocol_dump(clnt, &req->rc);
goto error;
}
p9_debug(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target);
error:
- p9_free_req(clnt, req);
+ p9_req_put(clnt, req);
return err;
}
EXPORT_SYMBOL(p9_client_readlink);
+
+int __init p9_client_init(void)
+{
+ p9_req_cache = KMEM_CACHE(p9_req_t, SLAB_TYPESAFE_BY_RCU);
+ return p9_req_cache ? 0 : -ENOMEM;
+}
+
+void __exit p9_client_exit(void)
+{
+ kmem_cache_destroy(p9_req_cache);
+}
diff --git a/net/9p/error.c b/net/9p/error.c
index 126fd0dceea2..8ba8afc91482 100644
--- a/net/9p/error.c
+++ b/net/9p/error.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/error.c
- *
* Error string handling
*
* Plan 9 uses error strings, Unix uses error numbers. These functions
@@ -9,22 +8,6 @@
*
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
* Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to:
- * Free Software Foundation
- * 51 Franklin Street, Fifth Floor
- * Boston, MA 02111-1301 USA
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -33,6 +16,7 @@
#include <linux/list.h>
#include <linux/jhash.h>
#include <linux/errno.h>
+#include <linux/hashtable.h>
#include <net/9p/9p.h>
/**
@@ -50,8 +34,8 @@ struct errormap {
struct hlist_node list;
};
-#define ERRHASHSZ 32
-static struct hlist_head hash_errmap[ERRHASHSZ];
+#define ERRHASH_BITS 5
+static DEFINE_HASHTABLE(hash_errmap, ERRHASH_BITS);
/* FixMe - reduce to a reasonable size */
static struct errormap errmap[] = {
@@ -193,18 +177,14 @@ static struct errormap errmap[] = {
int p9_error_init(void)
{
struct errormap *c;
- int bucket;
-
- /* initialize hash table */
- for (bucket = 0; bucket < ERRHASHSZ; bucket++)
- INIT_HLIST_HEAD(&hash_errmap[bucket]);
+ u32 hash;
/* load initial error map into hash table */
- for (c = errmap; c->name != NULL; c++) {
+ for (c = errmap; c->name; c++) {
c->namelen = strlen(c->name);
- bucket = jhash(c->name, c->namelen, 0) % ERRHASHSZ;
+ hash = jhash(c->name, c->namelen, 0);
INIT_HLIST_NODE(&c->list);
- hlist_add_head(&c->list, &hash_errmap[bucket]);
+ hash_add(hash_errmap, &c->list, hash);
}
return 1;
@@ -212,7 +192,7 @@ int p9_error_init(void)
EXPORT_SYMBOL(p9_error_init);
/**
- * errstr2errno - convert error string to error number
+ * p9_errstr2errno - convert error string to error number
* @errstr: error string
* @len: length of error string
*
@@ -222,12 +202,12 @@ int p9_errstr2errno(char *errstr, int len)
{
int errno;
struct errormap *c;
- int bucket;
+ u32 hash;
errno = 0;
c = NULL;
- bucket = jhash(errstr, len, 0) % ERRHASHSZ;
- hlist_for_each_entry(c, &hash_errmap[bucket], list) {
+ hash = jhash(errstr, len, 0);
+ hash_for_each_possible(hash_errmap, c, list, hash) {
if (c->namelen == len && !memcmp(c->name, errstr, len)) {
errno = c->val;
break;
diff --git a/net/9p/mod.c b/net/9p/mod.c
index 253ba824a325..85160b52da55 100644
--- a/net/9p/mod.c
+++ b/net/9p/mod.c
@@ -1,51 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * net/9p/9p.c
- *
* 9P entry point
*
* Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
* Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
* Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to:
- * Free Software Foundation
- * 51 Franklin Street, Fifth Floor
- * Boston, MA 02111-1301 USA
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
+#include <linux/kmod.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/moduleparam.h>
#include <net/9p/9p.h>
#include <linux/fs.h>
-#include <linux/parser.h>
#include <net/9p/client.h>
#include <net/9p/transport.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#ifdef CONFIG_NET_9P_DEBUG
-unsigned int p9_debug_level = 0; /* feature-rific global debug level */
+unsigned int p9_debug_level; /* feature-rific global debug level */
EXPORT_SYMBOL(p9_debug_level);
module_param_named(debug, p9_debug_level, uint, 0);
MODULE_PARM_DESC(debug, "9P debugging level");
void _p9_debug(enum p9_debug_flags level, const char *func,
- const char *fmt, ...)
+ const char *fmt, ...)
{
struct va_format vaf;
va_list args;
@@ -68,10 +51,7 @@ void _p9_debug(enum p9_debug_flags level, const char *func,
EXPORT_SYMBOL(_p9_debug);
#endif
-/*
- * Dynamic Transport Registration Routines
- *
- */
+/* Dynamic Transport Registration Routines */
static DEFINE_SPINLOCK(v9fs_trans_lock);
static LIST_HEAD(v9fs_trans_list);
@@ -102,12 +82,7 @@ void v9fs_unregister_trans(struct p9_trans_module *m)
}
EXPORT_SYMBOL(v9fs_unregister_trans);
-/**
- * v9fs_get_trans_by_name - get transport with the matching name
- * @s: string identifying transport
- *
- */
-struct p9_trans_module *v9fs_get_trans_by_name(char *s)
+static struct p9_trans_module *_p9_get_trans_by_name(const char *s)
{
struct p9_trans_module *t, *found = NULL;
@@ -121,10 +96,36 @@ struct p9_trans_module *v9fs_get_trans_by_name(char *s)
}
spin_unlock(&v9fs_trans_lock);
+
+ return found;
+}
+
+/**
+ * v9fs_get_trans_by_name - get transport with the matching name
+ * @s: string identifying transport
+ *
+ */
+struct p9_trans_module *v9fs_get_trans_by_name(const char *s)
+{
+ struct p9_trans_module *found = NULL;
+
+ found = _p9_get_trans_by_name(s);
+
+#ifdef CONFIG_MODULES
+ if (!found) {
+ request_module("9p-%s", s);
+ found = _p9_get_trans_by_name(s);
+ }
+#endif
+
return found;
}
EXPORT_SYMBOL(v9fs_get_trans_by_name);
+static const char * const v9fs_default_transports[] = {
+ "virtio", "tcp", "fd", "unix", "xen", "rdma",
+};
+
/**
* v9fs_get_default_trans - get the default transport
*
@@ -133,6 +134,7 @@ EXPORT_SYMBOL(v9fs_get_trans_by_name);
struct p9_trans_module *v9fs_get_default_trans(void)
{
struct p9_trans_module *t, *found = NULL;
+ int i;
spin_lock(&v9fs_trans_lock);
@@ -150,6 +152,10 @@ struct p9_trans_module *v9fs_get_default_trans(void)
}
spin_unlock(&v9fs_trans_lock);
+
+ for (i = 0; !found && i < ARRAY_SIZE(v9fs_default_transports); i++)
+ found = v9fs_get_trans_by_name(v9fs_default_transports[i]);
+
return found;
}
EXPORT_SYMBOL(v9fs_get_default_trans);
@@ -164,6 +170,7 @@ void v9fs_put_trans(struct p9_trans_module *m)
if (m)
module_put(m->owner);
}
+EXPORT_SYMBOL(v9fs_put_trans);
/**
* init_p9 - Initialize module
@@ -171,11 +178,16 @@ void v9fs_put_trans(struct p9_trans_module *m)
*/
static int __init init_p9(void)
{
+ int ret;
+
+ ret = p9_client_init();
+ if (ret)
+ return ret;
+
p9_error_init();
pr_info("Installing 9P2000 support\n");
- p9_trans_fd_init();
- return 0;
+ return ret;
}
/**
@@ -187,7 +199,7 @@ static void __exit exit_p9(void)
{
pr_info("Unloading 9P2000 support\n");
- p9_trans_fd_exit();
+ p9_client_exit();
}
module_init(init_p9)
@@ -197,3 +209,4 @@ MODULE_AUTHOR("Latchesar Ionkov <lucho@ionkov.net>");
MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
MODULE_AUTHOR("Ron Minnich <rminnich@lanl.gov>");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Plan 9 Resource Sharing Support (9P2000)");
diff --git a/net/9p/protocol.c b/net/9p/protocol.c
index 4a1e1dd30b52..0e6603b1ec90 100644
--- a/net/9p/protocol.c
+++ b/net/9p/protocol.c
@@ -1,28 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * net/9p/protocol.c
- *
* 9P Protocol Support Code
*
* Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
*
* Base on code from Anthony Liguori <aliguori@us.ibm.com>
* Copyright (C) 2008 by IBM, Corp.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to:
- * Free Software Foundation
- * 51 Franklin Street, Fifth Floor
- * Boston, MA 02111-1301 USA
- *
*/
#include <linux/module.h>
@@ -40,22 +23,195 @@
#include <trace/events/9p.h>
+/* len[2] text[len] */
+#define P9_STRLEN(s) \
+ (2 + min_t(size_t, s ? strlen(s) : 0, USHRT_MAX))
+
+/**
+ * p9_msg_buf_size - Returns a buffer size sufficiently large to hold the
+ * intended 9p message.
+ * @c: client
+ * @type: message type
+ * @fmt: format template for assembling request message
+ * (see p9pdu_vwritef)
+ * @ap: variable arguments to be fed to the passed format template
+ * (see p9pdu_vwritef)
+ *
+ * Note: Even for response types (P9_R*) the format template and variable
+ * arguments must always be for the originating request type (P9_T*).
+ */
+size_t p9_msg_buf_size(struct p9_client *c, enum p9_msg_t type,
+ const char *fmt, va_list ap)
+{
+ /* size[4] type[1] tag[2] */
+ const int hdr = 4 + 1 + 2;
+ /* ename[s] errno[4] */
+ const int rerror_size = hdr + P9_ERRMAX + 4;
+ /* ecode[4] */
+ const int rlerror_size = hdr + 4;
+ const int err_size =
+ c->proto_version == p9_proto_2000L ? rlerror_size : rerror_size;
+
+ static_assert(NAME_MAX <= 4*1024, "p9_msg_buf_size() currently assumes "
+ "a max. allowed directory entry name length of 4k");
+
+ switch (type) {
+
+ /* message types not used at all */
+ case P9_TERROR:
+ case P9_TLERROR:
+ case P9_TAUTH:
+ case P9_RAUTH:
+ BUG();
+
+ /* variable length & potentially large message types */
+ case P9_TATTACH:
+ BUG_ON(strcmp("ddss?u", fmt));
+ va_arg(ap, int32_t);
+ va_arg(ap, int32_t);
+ {
+ const char *uname = va_arg(ap, const char *);
+ const char *aname = va_arg(ap, const char *);
+ /* fid[4] afid[4] uname[s] aname[s] n_uname[4] */
+ return hdr + 4 + 4 + P9_STRLEN(uname) + P9_STRLEN(aname) + 4;
+ }
+ case P9_TWALK:
+ BUG_ON(strcmp("ddT", fmt));
+ va_arg(ap, int32_t);
+ va_arg(ap, int32_t);
+ {
+ uint i, nwname = va_arg(ap, int);
+ size_t wname_all;
+ const char **wnames = va_arg(ap, const char **);
+ for (i = 0, wname_all = 0; i < nwname; ++i) {
+ wname_all += P9_STRLEN(wnames[i]);
+ }
+ /* fid[4] newfid[4] nwname[2] nwname*(wname[s]) */
+ return hdr + 4 + 4 + 2 + wname_all;
+ }
+ case P9_RWALK:
+ BUG_ON(strcmp("ddT", fmt));
+ va_arg(ap, int32_t);
+ va_arg(ap, int32_t);
+ {
+ uint nwname = va_arg(ap, int);
+ /* nwqid[2] nwqid*(wqid[13]) */
+ return max_t(size_t, hdr + 2 + nwname * 13, err_size);
+ }
+ case P9_TCREATE:
+ BUG_ON(strcmp("dsdb?s", fmt));
+ va_arg(ap, int32_t);
+ {
+ const char *name = va_arg(ap, const char *);
+ if (c->proto_version == p9_proto_legacy) {
+ /* fid[4] name[s] perm[4] mode[1] */
+ return hdr + 4 + P9_STRLEN(name) + 4 + 1;
+ } else {
+ va_arg(ap, int32_t);
+ va_arg(ap, int);
+ {
+ const char *ext = va_arg(ap, const char *);
+ /* fid[4] name[s] perm[4] mode[1] extension[s] */
+ return hdr + 4 + P9_STRLEN(name) + 4 + 1 + P9_STRLEN(ext);
+ }
+ }
+ }
+ case P9_TLCREATE:
+ BUG_ON(strcmp("dsddg", fmt));
+ va_arg(ap, int32_t);
+ {
+ const char *name = va_arg(ap, const char *);
+ /* fid[4] name[s] flags[4] mode[4] gid[4] */
+ return hdr + 4 + P9_STRLEN(name) + 4 + 4 + 4;
+ }
+ case P9_RREAD:
+ case P9_RREADDIR:
+ BUG_ON(strcmp("dqd", fmt));
+ va_arg(ap, int32_t);
+ va_arg(ap, int64_t);
+ {
+ const int32_t count = va_arg(ap, int32_t);
+ /* count[4] data[count] */
+ return max_t(size_t, hdr + 4 + count, err_size);
+ }
+ case P9_TWRITE:
+ BUG_ON(strcmp("dqV", fmt));
+ va_arg(ap, int32_t);
+ va_arg(ap, int64_t);
+ {
+ const int32_t count = va_arg(ap, int32_t);
+ /* fid[4] offset[8] count[4] data[count] */
+ return hdr + 4 + 8 + 4 + count;
+ }
+ case P9_TRENAMEAT:
+ BUG_ON(strcmp("dsds", fmt));
+ va_arg(ap, int32_t);
+ {
+ const char *oldname, *newname;
+ oldname = va_arg(ap, const char *);
+ va_arg(ap, int32_t);
+ newname = va_arg(ap, const char *);
+ /* olddirfid[4] oldname[s] newdirfid[4] newname[s] */
+ return hdr + 4 + P9_STRLEN(oldname) + 4 + P9_STRLEN(newname);
+ }
+ case P9_TSYMLINK:
+ BUG_ON(strcmp("dssg", fmt));
+ va_arg(ap, int32_t);
+ {
+ const char *name = va_arg(ap, const char *);
+ const char *symtgt = va_arg(ap, const char *);
+ /* fid[4] name[s] symtgt[s] gid[4] */
+ return hdr + 4 + P9_STRLEN(name) + P9_STRLEN(symtgt) + 4;
+ }
+
+ case P9_RERROR:
+ return rerror_size;
+ case P9_RLERROR:
+ return rlerror_size;
+
+ /* small message types */
+ case P9_TWSTAT:
+ case P9_RSTAT:
+ case P9_RREADLINK:
+ case P9_TXATTRWALK:
+ case P9_TXATTRCREATE:
+ case P9_TLINK:
+ case P9_TMKDIR:
+ case P9_TMKNOD:
+ case P9_TRENAME:
+ case P9_TUNLINKAT:
+ case P9_TLOCK:
+ return 8 * 1024;
+
+ /* tiny message types */
+ default:
+ return 4 * 1024;
+
+ }
+}
+
static int
p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...);
void p9stat_free(struct p9_wstat *stbuf)
{
kfree(stbuf->name);
+ stbuf->name = NULL;
kfree(stbuf->uid);
+ stbuf->uid = NULL;
kfree(stbuf->gid);
+ stbuf->gid = NULL;
kfree(stbuf->muid);
+ stbuf->muid = NULL;
kfree(stbuf->extension);
+ stbuf->extension = NULL;
}
EXPORT_SYMBOL(p9stat_free);
size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size)
{
size_t len = min(pdu->size - pdu->offset, size);
+
memcpy(data, &pdu->sdata[pdu->offset], len);
pdu->offset += len;
return size - len;
@@ -64,6 +220,7 @@ size_t pdu_read(struct p9_fcall *pdu, void *data, size_t size)
static size_t pdu_write(struct p9_fcall *pdu, const void *data, size_t size)
{
size_t len = min(pdu->capacity - pdu->size, size);
+
memcpy(&pdu->sdata[pdu->size], data, len);
pdu->size += len;
return size - len;
@@ -73,34 +230,33 @@ static size_t
pdu_write_u(struct p9_fcall *pdu, struct iov_iter *from, size_t size)
{
size_t len = min(pdu->capacity - pdu->size, size);
- struct iov_iter i = *from;
- if (!copy_from_iter_full(&pdu->sdata[pdu->size], len, &i))
+
+ if (!copy_from_iter_full(&pdu->sdata[pdu->size], len, from))
len = 0;
pdu->size += len;
return size - len;
}
-/*
- b - int8_t
- w - int16_t
- d - int32_t
- q - int64_t
- s - string
- u - numeric uid
- g - numeric gid
- S - stat
- Q - qid
- D - data blob (int32_t size followed by void *, results are not freed)
- T - array of strings (int16_t count, followed by strings)
- R - array of qids (int16_t count, followed by qids)
- A - stat for 9p2000.L (p9_stat_dotl)
- ? - if optional = 1, continue parsing
-*/
+/* b - int8_t
+ * w - int16_t
+ * d - int32_t
+ * q - int64_t
+ * s - string
+ * u - numeric uid
+ * g - numeric gid
+ * S - stat
+ * Q - qid
+ * D - data blob (int32_t size followed by void *, results are not freed)
+ * T - array of strings (int16_t count, followed by strings)
+ * R - array of qids (int16_t count, followed by qids)
+ * A - stat for 9p2000.L (p9_stat_dotl)
+ * ? - if optional = 1, continue parsing
+ */
static int
p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
- va_list ap)
+ va_list ap)
{
const char *ptr;
int errcode = 0;
@@ -238,6 +394,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
uint16_t *nwname = va_arg(ap, uint16_t *);
char ***wnames = va_arg(ap, char ***);
+ *wnames = NULL;
+
errcode = p9pdu_readf(pdu, proto_version,
"w", nwname);
if (!errcode) {
@@ -247,6 +405,8 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
GFP_NOFS);
if (!*wnames)
errcode = -ENOMEM;
+ else
+ (*wnames)[0] = NULL;
}
if (!errcode) {
@@ -258,8 +418,10 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
proto_version,
"s",
&(*wnames)[i]);
- if (errcode)
+ if (errcode) {
+ (*wnames)[i] = NULL;
break;
+ }
}
}
@@ -267,11 +429,14 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const char *fmt,
if (*wnames) {
int i;
- for (i = 0; i < *nwname; i++)
+ for (i = 0; i < *nwname; i++) {
+ if (!(*wnames)[i])
+ break;
kfree((*wnames)[i]);
+ }
+ kfree(*wnames);
+ *wnames = NULL;
}
- kfree(*wnames);
- *wnames = NULL;
}
}
break;
@@ -566,9 +731,10 @@ int p9stat_read(struct p9_client *clnt, char *buf, int len, struct p9_wstat *st)
if (ret) {
p9_debug(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", ret);
trace_9p_protocol_dump(clnt, &fake_pdu);
+ return ret;
}
- return ret;
+ return fake_pdu.offset;
}
EXPORT_SYMBOL(p9stat_read);
@@ -617,13 +783,19 @@ int p9dirent_read(struct p9_client *clnt, char *buf, int len,
if (ret) {
p9_debug(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret);
trace_9p_protocol_dump(clnt, &fake_pdu);
- goto out;
+ return ret;
}
- strcpy(dirent->d_name, nameptr);
+ ret = strscpy(dirent->d_name, nameptr, sizeof(dirent->d_name));
+ if (ret < 0) {
+ p9_debug(P9_DEBUG_ERROR,
+ "On the wire dirent name too long: %s\n",
+ nameptr);
+ kfree(nameptr);
+ return ret;
+ }
kfree(nameptr);
-out:
return fake_pdu.offset;
}
EXPORT_SYMBOL(p9dirent_read);
diff --git a/net/9p/protocol.h b/net/9p/protocol.h
index 2cc525fa49fa..ad2283d1f96b 100644
--- a/net/9p/protocol.h
+++ b/net/9p/protocol.h
@@ -1,32 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * net/9p/protocol.h
- *
* 9P Protocol Support Code
*
* Copyright (C) 2008 by Eric Van Hensbergen <ericvh@gmail.com>
*
* Base on code from Anthony Liguori <aliguori@us.ibm.com>
* Copyright (C) 2008 by IBM, Corp.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to:
- * Free Software Foundation
- * 51 Franklin Street, Fifth Floor
- * Boston, MA 02111-1301 USA
- *
*/
+size_t p9_msg_buf_size(struct p9_client *c, enum p9_msg_t type,
+ const char *fmt, va_list ap);
int p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt,
- va_list ap);
+ va_list ap);
int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...);
int p9pdu_prepare(struct p9_fcall *pdu, int16_t tag, int8_t type);
int p9pdu_finalize(struct p9_client *clnt, struct p9_fcall *pdu);
diff --git a/net/9p/trans_common.c b/net/9p/trans_common.c
index b718db2085b2..c827f694551c 100644
--- a/net/9p/trans_common.c
+++ b/net/9p/trans_common.c
@@ -1,22 +1,17 @@
+// SPDX-License-Identifier: LGPL-2.1
/*
* Copyright IBM Corporation, 2010
* Author Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
*/
#include <linux/mm.h>
#include <linux/module.h>
+#include "trans_common.h"
/**
- * p9_release_pages - Release pages after the transaction.
+ * p9_release_pages - Release pages after the transaction.
+ * @pages: array of pages to be put
+ * @nr_pages: size of array
*/
void p9_release_pages(struct page **pages, int nr_pages)
{
diff --git a/net/9p/trans_common.h b/net/9p/trans_common.h
index c43babb3f635..32134db6abf3 100644
--- a/net/9p/trans_common.h
+++ b/net/9p/trans_common.h
@@ -1,15 +1,7 @@
+/* SPDX-License-Identifier: LGPL-2.1 */
/*
* Copyright IBM Corporation, 2010
* Author Venkateswararao Jujjuri <jvrao@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
*/
-void p9_release_pages(struct page **, int);
+void p9_release_pages(struct page **pages, int nr_pages);
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index e2ef3c782c53..0e331c1b2112 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -1,33 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/trans_fd.c
- *
* Fd transport layer. Includes deprecated socket layer.
*
* Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
* Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
* Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
* Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to:
- * Free Software Foundation
- * 51 Franklin Street, Fifth Floor
- * Boston, MA 02111-1301 USA
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/in.h>
+#include <linux/in6.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/ipv6.h>
@@ -37,9 +21,8 @@
#include <linux/un.h>
#include <linux/uaccess.h>
#include <linux/inet.h>
-#include <linux/idr.h>
#include <linux/file.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <net/9p/9p.h>
@@ -48,48 +31,12 @@
#include <linux/syscalls.h> /* killme */
-#define P9_PORT 564
-#define MAX_SOCK_BUF (64*1024)
+#define MAX_SOCK_BUF (1024*1024)
#define MAXPOLLWADDR 2
static struct p9_trans_module p9_tcp_trans;
static struct p9_trans_module p9_fd_trans;
-/**
- * struct p9_fd_opts - per-transport options
- * @rfd: file descriptor for reading (trans=fd)
- * @wfd: file descriptor for writing (trans=fd)
- * @port: port to connect to (trans=tcp)
- *
- */
-
-struct p9_fd_opts {
- int rfd;
- int wfd;
- u16 port;
- bool privport;
-};
-
-/*
- * Option Parsing (code inspired by NFS code)
- * - a little lazy - parse all fd-transport options
- */
-
-enum {
- /* Options that take integer arguments */
- Opt_port, Opt_rfdno, Opt_wfdno, Opt_err,
- /* Options that take no arguments */
- Opt_privport,
-};
-
-static const match_table_t tokens = {
- {Opt_port, "port=%u"},
- {Opt_rfdno, "rfdno=%u"},
- {Opt_wfdno, "wfdno=%u"},
- {Opt_privport, "privport"},
- {Opt_err, NULL},
-};
-
enum {
Rworksched = 1, /* read work scheduled or running */
Rpending = 2, /* can read */
@@ -108,9 +55,11 @@ struct p9_poll_wait {
* @mux_list: list link for mux to manage multiple connections (?)
* @client: reference to client instance for this connection
* @err: error state
+ * @req_lock: lock protecting req_list and requests statuses
* @req_list: accounting for requests which have been sent
* @unsent_req_list: accounting for requests that haven't been sent
- * @req: current request being processed (if any)
+ * @rreq: read request
+ * @wreq: write request
* @tmp_buf: temporary buffer to read in header
* @rc: temporary fcall for reading current frame
* @wpos: write position for current frame
@@ -129,10 +78,12 @@ struct p9_conn {
struct list_head mux_list;
struct p9_client *client;
int err;
+ spinlock_t req_lock;
struct list_head req_list;
struct list_head unsent_req_list;
- struct p9_req_t *req;
- char tmp_buf[7];
+ struct p9_req_t *rreq;
+ struct p9_req_t *wreq;
+ char tmp_buf[P9_HDRSZ];
struct p9_fcall rc;
int wpos;
int wsize;
@@ -203,22 +154,27 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err);
- spin_lock(&m->client->lock);
+ spin_lock(&m->req_lock);
- if (m->err) {
- spin_unlock(&m->client->lock);
+ if (READ_ONCE(m->err)) {
+ spin_unlock(&m->req_lock);
return;
}
- m->err = err;
+ WRITE_ONCE(m->err, err);
+ ASSERT_EXCLUSIVE_WRITER(m->err);
list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) {
list_move(&req->req_list, &cancel_list);
+ WRITE_ONCE(req->status, REQ_STATUS_ERROR);
}
list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) {
list_move(&req->req_list, &cancel_list);
+ WRITE_ONCE(req->status, REQ_STATUS_ERROR);
}
+ spin_unlock(&m->req_lock);
+
list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) {
p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req);
list_del(&req->req_list);
@@ -226,7 +182,6 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
req->t_err = err;
p9_client_cb(m->client, req, REQ_STATUS_ERROR);
}
- spin_unlock(&m->client->lock);
}
static __poll_t
@@ -291,11 +246,10 @@ static void p9_read_work(struct work_struct *work)
__poll_t n;
int err;
struct p9_conn *m;
- int status = REQ_STATUS_ERROR;
m = container_of(work, struct p9_conn, rq);
- if (m->err < 0)
+ if (READ_ONCE(m->err) < 0)
return;
p9_debug(P9_DEBUG_TRANS, "start mux %p pos %zd\n", m, m->rc.offset);
@@ -303,7 +257,7 @@ static void p9_read_work(struct work_struct *work)
if (!m->rc.sdata) {
m->rc.sdata = m->tmp_buf;
m->rc.offset = 0;
- m->rc.capacity = 7; /* start by reading header */
+ m->rc.capacity = P9_HDRSZ; /* start by reading header */
}
clear_bit(Rpending, &m->wsched);
@@ -322,11 +276,11 @@ static void p9_read_work(struct work_struct *work)
m->rc.offset += err;
/* header read in */
- if ((!m->req) && (m->rc.offset == m->rc.capacity)) {
+ if ((!m->rreq) && (m->rc.offset == m->rc.capacity)) {
p9_debug(P9_DEBUG_TRANS, "got new header\n");
/* Header size */
- m->rc.size = 7;
+ m->rc.size = P9_HDRSZ;
err = p9_parse_header(&m->rc, &m->rc.size, NULL, NULL, 0);
if (err) {
p9_debug(P9_DEBUG_ERROR,
@@ -334,35 +288,36 @@ static void p9_read_work(struct work_struct *work)
goto error;
}
- if (m->rc.size >= m->client->msize) {
- p9_debug(P9_DEBUG_ERROR,
- "requested packet size too big: %d\n",
- m->rc.size);
- err = -EIO;
- goto error;
- }
-
p9_debug(P9_DEBUG_TRANS,
"mux %p pkt: size: %d bytes tag: %d\n",
m, m->rc.size, m->rc.tag);
- m->req = p9_tag_lookup(m->client, m->rc.tag);
- if (!m->req || (m->req->status != REQ_STATUS_SENT)) {
+ m->rreq = p9_tag_lookup(m->client, m->rc.tag);
+ if (!m->rreq || (m->rreq->status != REQ_STATUS_SENT)) {
p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n",
m->rc.tag);
err = -EIO;
goto error;
}
- if (m->req->rc == NULL) {
+ if (m->rc.size > m->rreq->rc.capacity) {
+ p9_debug(P9_DEBUG_ERROR,
+ "requested packet size too big: %d for tag %d with capacity %zd\n",
+ m->rc.size, m->rc.tag, m->rreq->rc.capacity);
+ err = -EIO;
+ goto error;
+ }
+
+ if (!m->rreq->rc.sdata) {
p9_debug(P9_DEBUG_ERROR,
"No recv fcall for tag %d (req %p), disconnecting!\n",
- m->rc.tag, m->req);
- m->req = NULL;
+ m->rc.tag, m->rreq);
+ p9_req_put(m->client, m->rreq);
+ m->rreq = NULL;
err = -EIO;
goto error;
}
- m->rc.sdata = (char *)m->req->rc + sizeof(struct p9_fcall);
+ m->rc.sdata = m->rreq->rc.sdata;
memcpy(m->rc.sdata, m->tmp_buf, m->rc.capacity);
m->rc.capacity = m->rc.size;
}
@@ -370,20 +325,31 @@ static void p9_read_work(struct work_struct *work)
/* packet is read in
* not an else because some packets (like clunk) have no payload
*/
- if ((m->req) && (m->rc.offset == m->rc.capacity)) {
+ if ((m->rreq) && (m->rc.offset == m->rc.capacity)) {
p9_debug(P9_DEBUG_TRANS, "got new packet\n");
- m->req->rc->size = m->rc.offset;
- spin_lock(&m->client->lock);
- if (m->req->status != REQ_STATUS_ERROR)
- status = REQ_STATUS_RCVD;
- list_del(&m->req->req_list);
- /* update req->status while holding client->lock */
- p9_client_cb(m->client, m->req, status);
- spin_unlock(&m->client->lock);
+ m->rreq->rc.size = m->rc.offset;
+ spin_lock(&m->req_lock);
+ if (m->rreq->status == REQ_STATUS_SENT) {
+ list_del(&m->rreq->req_list);
+ p9_client_cb(m->client, m->rreq, REQ_STATUS_RCVD);
+ } else if (m->rreq->status == REQ_STATUS_FLSHD) {
+ /* Ignore replies associated with a cancelled request. */
+ p9_debug(P9_DEBUG_TRANS,
+ "Ignore replies associated with a cancelled request\n");
+ } else {
+ spin_unlock(&m->req_lock);
+ p9_debug(P9_DEBUG_ERROR,
+ "Request tag %d errored out while we were reading the reply\n",
+ m->rc.tag);
+ err = -EIO;
+ goto error;
+ }
+ spin_unlock(&m->req_lock);
m->rc.sdata = NULL;
m->rc.offset = 0;
m->rc.capacity = 0;
- m->req = NULL;
+ p9_req_put(m->client, m->rreq);
+ m->rreq = NULL;
}
end_clear:
@@ -450,29 +416,31 @@ static void p9_write_work(struct work_struct *work)
m = container_of(work, struct p9_conn, wq);
- if (m->err < 0) {
+ if (READ_ONCE(m->err) < 0) {
clear_bit(Wworksched, &m->wsched);
return;
}
if (!m->wsize) {
- spin_lock(&m->client->lock);
+ spin_lock(&m->req_lock);
if (list_empty(&m->unsent_req_list)) {
clear_bit(Wworksched, &m->wsched);
- spin_unlock(&m->client->lock);
+ spin_unlock(&m->req_lock);
return;
}
req = list_entry(m->unsent_req_list.next, struct p9_req_t,
req_list);
- req->status = REQ_STATUS_SENT;
+ WRITE_ONCE(req->status, REQ_STATUS_SENT);
p9_debug(P9_DEBUG_TRANS, "move req %p\n", req);
list_move_tail(&req->req_list, &m->req_list);
- m->wbuf = req->tc->sdata;
- m->wsize = req->tc->size;
+ m->wbuf = req->tc.sdata;
+ m->wsize = req->tc.size;
m->wpos = 0;
- spin_unlock(&m->client->lock);
+ p9_req_get(req);
+ m->wreq = req;
+ spin_unlock(&m->req_lock);
}
p9_debug(P9_DEBUG_TRANS, "mux %p pos %d size %d\n",
@@ -492,8 +460,11 @@ static void p9_write_work(struct work_struct *work)
}
m->wpos += err;
- if (m->wpos == m->wsize)
+ if (m->wpos == m->wsize) {
m->wpos = m->wsize = 0;
+ p9_req_put(m->client, m->wreq);
+ m->wreq = NULL;
+ }
end_clear:
clear_bit(Wworksched, &m->wsched);
@@ -586,6 +557,7 @@ static void p9_conn_create(struct p9_client *client)
INIT_LIST_HEAD(&m->mux_list);
m->client = client;
+ spin_lock_init(&m->req_lock);
INIT_LIST_HEAD(&m->req_list);
INIT_LIST_HEAD(&m->unsent_req_list);
INIT_WORK(&m->rq, p9_read_work);
@@ -616,7 +588,7 @@ static void p9_poll_mux(struct p9_conn *m)
__poll_t n;
int err = -ECONNRESET;
- if (m->err < 0)
+ if (READ_ONCE(m->err) < 0)
return;
n = p9_fd_poll(m->client, NULL, &err);
@@ -658,59 +630,75 @@ static void p9_poll_mux(struct p9_conn *m)
static int p9_fd_request(struct p9_client *client, struct p9_req_t *req)
{
- __poll_t n;
+ int err;
struct p9_trans_fd *ts = client->trans;
struct p9_conn *m = &ts->conn;
p9_debug(P9_DEBUG_TRANS, "mux %p task %p tcall %p id %d\n",
- m, current, req->tc, req->tc->id);
- if (m->err < 0)
- return m->err;
+ m, current, &req->tc, req->tc.id);
- spin_lock(&client->lock);
- req->status = REQ_STATUS_UNSENT;
- list_add_tail(&req->req_list, &m->unsent_req_list);
- spin_unlock(&client->lock);
+ spin_lock(&m->req_lock);
- if (test_and_clear_bit(Wpending, &m->wsched))
- n = EPOLLOUT;
- else
- n = p9_fd_poll(m->client, NULL, NULL);
+ err = READ_ONCE(m->err);
+ if (err < 0) {
+ spin_unlock(&m->req_lock);
+ return err;
+ }
- if (n & EPOLLOUT && !test_and_set_bit(Wworksched, &m->wsched))
- schedule_work(&m->wq);
+ WRITE_ONCE(req->status, REQ_STATUS_UNSENT);
+ list_add_tail(&req->req_list, &m->unsent_req_list);
+ spin_unlock(&m->req_lock);
+
+ p9_poll_mux(m);
return 0;
}
static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req)
{
+ struct p9_trans_fd *ts = client->trans;
+ struct p9_conn *m = &ts->conn;
int ret = 1;
p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req);
- spin_lock(&client->lock);
+ spin_lock(&m->req_lock);
if (req->status == REQ_STATUS_UNSENT) {
list_del(&req->req_list);
- req->status = REQ_STATUS_FLSHD;
+ WRITE_ONCE(req->status, REQ_STATUS_FLSHD);
+ p9_req_put(client, req);
ret = 0;
}
- spin_unlock(&client->lock);
+ spin_unlock(&m->req_lock);
return ret;
}
static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req)
{
+ struct p9_trans_fd *ts = client->trans;
+ struct p9_conn *m = &ts->conn;
+
p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req);
+ spin_lock(&m->req_lock);
+ /* Ignore cancelled request if status changed since the request was
+ * processed in p9_client_flush()
+ */
+ if (req->status != REQ_STATUS_SENT) {
+ spin_unlock(&m->req_lock);
+ return 0;
+ }
+
/* we haven't received a response for oldreq,
* remove it from the list.
*/
- spin_lock(&client->lock);
list_del(&req->req_list);
- spin_unlock(&client->lock);
+ WRITE_ONCE(req->status, REQ_STATUS_FLSHD);
+ spin_unlock(&m->req_lock);
+
+ p9_req_put(client, req);
return 0;
}
@@ -718,7 +706,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req)
static int p9_fd_show_options(struct seq_file *m, struct p9_client *clnt)
{
if (clnt->trans_mod == &p9_tcp_trans) {
- if (clnt->trans_opts.tcp.port != P9_PORT)
+ if (clnt->trans_opts.tcp.port != P9_FD_PORT)
seq_printf(m, ",port=%u", clnt->trans_opts.tcp.port);
} else if (clnt->trans_mod == &p9_fd_trans) {
if (clnt->trans_opts.fd.rfd != ~0)
@@ -729,73 +717,6 @@ static int p9_fd_show_options(struct seq_file *m, struct p9_client *clnt)
return 0;
}
-/**
- * parse_opts - parse mount options into p9_fd_opts structure
- * @params: options string passed from mount
- * @opts: fd transport-specific structure to parse options into
- *
- * Returns 0 upon success, -ERRNO upon failure
- */
-
-static int parse_opts(char *params, struct p9_fd_opts *opts)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- char *options, *tmp_options;
-
- opts->port = P9_PORT;
- opts->rfd = ~0;
- opts->wfd = ~0;
- opts->privport = false;
-
- if (!params)
- return 0;
-
- tmp_options = kstrdup(params, GFP_KERNEL);
- if (!tmp_options) {
- p9_debug(P9_DEBUG_ERROR,
- "failed to allocate copy of option string\n");
- return -ENOMEM;
- }
- options = tmp_options;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- int r;
- if (!*p)
- continue;
- token = match_token(p, tokens, args);
- if ((token != Opt_err) && (token != Opt_privport)) {
- r = match_int(&args[0], &option);
- if (r < 0) {
- p9_debug(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
- continue;
- }
- }
- switch (token) {
- case Opt_port:
- opts->port = option;
- break;
- case Opt_rfdno:
- opts->rfd = option;
- break;
- case Opt_wfdno:
- opts->wfd = option;
- break;
- case Opt_privport:
- opts->privport = true;
- break;
- default:
- continue;
- }
- }
-
- kfree(tmp_options);
- return 0;
-}
-
static int p9_fd_open(struct p9_client *client, int rfd, int wfd)
{
struct p9_trans_fd *ts = kzalloc(sizeof(struct p9_trans_fd),
@@ -804,20 +725,38 @@ static int p9_fd_open(struct p9_client *client, int rfd, int wfd)
return -ENOMEM;
ts->rd = fget(rfd);
+ if (!ts->rd)
+ goto out_free_ts;
+ if (!(ts->rd->f_mode & FMODE_READ))
+ goto out_put_rd;
+ /* Prevent workers from hanging on IO when fd is a pipe.
+ * It's technically possible for userspace or concurrent mounts to
+ * modify this flag concurrently, which will likely result in a
+ * broken filesystem. However, just having bad flags here should
+ * not crash the kernel or cause any other sort of bug, so mark this
+ * particular data race as intentional so that tooling (like KCSAN)
+ * can allow it and detect further problems.
+ */
+ data_race(ts->rd->f_flags |= O_NONBLOCK);
ts->wr = fget(wfd);
- if (!ts->rd || !ts->wr) {
- if (ts->rd)
- fput(ts->rd);
- if (ts->wr)
- fput(ts->wr);
- kfree(ts);
- return -EIO;
- }
+ if (!ts->wr)
+ goto out_put_rd;
+ if (!(ts->wr->f_mode & FMODE_WRITE))
+ goto out_put_wr;
+ data_race(ts->wr->f_flags |= O_NONBLOCK);
client->trans = ts;
client->status = Connected;
return 0;
+
+out_put_wr:
+ fput(ts->wr);
+out_put_rd:
+ fput(ts->rd);
+out_free_ts:
+ kfree(ts);
+ return -EIO;
}
static int p9_socket_open(struct p9_client *client, struct socket *csocket)
@@ -826,10 +765,13 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket)
struct file *file;
p = kzalloc(sizeof(struct p9_trans_fd), GFP_KERNEL);
- if (!p)
+ if (!p) {
+ sock_release(csocket);
return -ENOMEM;
+ }
csocket->sk->sk_allocation = GFP_NOIO;
+ csocket->sk->sk_use_task_frag = false;
file = sock_alloc_file(csocket, 0, NULL);
if (IS_ERR(file)) {
pr_err("%s (%d): failed to map fd\n",
@@ -850,7 +792,7 @@ static int p9_socket_open(struct p9_client *client, struct socket *csocket)
}
/**
- * p9_mux_destroy - cancels all pending requests of mux
+ * p9_conn_destroy - cancels all pending requests of mux
* @m: mux to destroy
*
*/
@@ -862,7 +804,15 @@ static void p9_conn_destroy(struct p9_conn *m)
p9_mux_poll_stop(m);
cancel_work_sync(&m->rq);
+ if (m->rreq) {
+ p9_req_put(m->client, m->rreq);
+ m->rreq = NULL;
+ }
cancel_work_sync(&m->wq);
+ if (m->wreq) {
+ p9_req_put(m->client, m->wreq);
+ m->wreq = NULL;
+ }
p9_conn_cancel(m, -ECONNRESET);
@@ -898,64 +848,56 @@ static void p9_fd_close(struct p9_client *client)
kfree(ts);
}
-/*
- * stolen from NFS - maybe should be made a generic function?
- */
-static inline int valid_ipaddr4(const char *buf)
-{
- int rc, count, in[4];
-
- rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]);
- if (rc != 4)
- return -EINVAL;
- for (count = 0; count < 4; count++) {
- if (in[count] > 255)
- return -EINVAL;
- }
- return 0;
-}
-
static int p9_bind_privport(struct socket *sock)
{
- struct sockaddr_in cl;
+ struct sockaddr_storage stor = { 0 };
int port, err = -EINVAL;
- memset(&cl, 0, sizeof(cl));
- cl.sin_family = AF_INET;
- cl.sin_addr.s_addr = INADDR_ANY;
+ stor.ss_family = sock->ops->family;
+ if (stor.ss_family == AF_INET)
+ ((struct sockaddr_in *)&stor)->sin_addr.s_addr = htonl(INADDR_ANY);
+ else
+ ((struct sockaddr_in6 *)&stor)->sin6_addr = in6addr_any;
for (port = p9_ipport_resv_max; port >= p9_ipport_resv_min; port--) {
- cl.sin_port = htons((ushort)port);
- err = kernel_bind(sock, (struct sockaddr *)&cl, sizeof(cl));
+ if (stor.ss_family == AF_INET)
+ ((struct sockaddr_in *)&stor)->sin_port = htons((ushort)port);
+ else
+ ((struct sockaddr_in6 *)&stor)->sin6_port = htons((ushort)port);
+ err = kernel_bind(sock, (struct sockaddr_unsized *)&stor, sizeof(stor));
if (err != -EADDRINUSE)
break;
}
return err;
}
-
static int
-p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args)
+p9_fd_create_tcp(struct p9_client *client, struct fs_context *fc)
{
+ const char *addr = fc->source;
+ struct v9fs_context *ctx = fc->fs_private;
int err;
+ char port_str[6];
struct socket *csocket;
- struct sockaddr_in sin_server;
+ struct sockaddr_storage stor = { 0 };
struct p9_fd_opts opts;
- err = parse_opts(args, &opts);
- if (err < 0)
- return err;
+ /* opts are already parsed in context */
+ opts = ctx->fd_opts;
- if (addr == NULL || valid_ipaddr4(addr) < 0)
+ if (!addr)
return -EINVAL;
+ sprintf(port_str, "%u", opts.port);
+ err = inet_pton_with_scope(current->nsproxy->net_ns, AF_UNSPEC, addr,
+ port_str, &stor);
+ if (err < 0)
+ return err;
+
csocket = NULL;
client->trans_opts.tcp.port = opts.port;
client->trans_opts.tcp.privport = opts.privport;
- sin_server.sin_family = AF_INET;
- sin_server.sin_addr.s_addr = in_aton(addr);
- sin_server.sin_port = htons(opts.port);
- err = __sock_create(current->nsproxy->net_ns, PF_INET,
+ err = __sock_create(current->nsproxy->net_ns, stor.ss_family,
SOCK_STREAM, IPPROTO_TCP, &csocket, 1);
if (err) {
pr_err("%s (%d): problem creating socket\n",
@@ -973,9 +915,9 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args)
}
}
- err = csocket->ops->connect(csocket,
- (struct sockaddr *)&sin_server,
- sizeof(struct sockaddr_in), 0);
+ err = READ_ONCE(csocket->ops)->connect(csocket,
+ (struct sockaddr_unsized *)&stor,
+ sizeof(stor), 0);
if (err < 0) {
pr_err("%s (%d): problem connecting socket to %s\n",
__func__, task_pid_nr(current), addr);
@@ -987,15 +929,16 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args)
}
static int
-p9_fd_create_unix(struct p9_client *client, const char *addr, char *args)
+p9_fd_create_unix(struct p9_client *client, struct fs_context *fc)
{
+ const char *addr = fc->source;
int err;
struct socket *csocket;
struct sockaddr_un sun_server;
csocket = NULL;
- if (addr == NULL)
+ if (!addr || !strlen(addr))
return -EINVAL;
if (strlen(addr) >= UNIX_PATH_MAX) {
@@ -1014,8 +957,8 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args)
return err;
}
- err = csocket->ops->connect(csocket, (struct sockaddr *)&sun_server,
- sizeof(struct sockaddr_un) - 1, 0);
+ err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr_unsized *)&sun_server,
+ sizeof(struct sockaddr_un) - 1, 0);
if (err < 0) {
pr_err("%s (%d): problem connecting socket: %s: %d\n",
__func__, task_pid_nr(current), addr, err);
@@ -1027,12 +970,12 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args)
}
static int
-p9_fd_create(struct p9_client *client, const char *addr, char *args)
+p9_fd_create(struct p9_client *client, struct fs_context *fc)
{
+ struct v9fs_context *ctx = fc->fs_private;
+ struct p9_fd_opts opts = ctx->fd_opts;
int err;
- struct p9_fd_opts opts;
- parse_opts(args, &opts);
client->trans_opts.fd.rfd = opts.rfd;
client->trans_opts.fd.wfd = opts.wfd;
@@ -1053,7 +996,9 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args)
static struct p9_trans_module p9_tcp_trans = {
.name = "tcp",
.maxsize = MAX_SOCK_BUF,
- .def = 0,
+ .pooled_rbuffers = false,
+ .def = false,
+ .supports_vmalloc = true,
.create = p9_fd_create_tcp,
.close = p9_fd_close,
.request = p9_fd_request,
@@ -1062,11 +1007,13 @@ static struct p9_trans_module p9_tcp_trans = {
.show_options = p9_fd_show_options,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_9P("tcp");
static struct p9_trans_module p9_unix_trans = {
.name = "unix",
.maxsize = MAX_SOCK_BUF,
- .def = 0,
+ .def = false,
+ .supports_vmalloc = true,
.create = p9_fd_create_unix,
.close = p9_fd_close,
.request = p9_fd_request,
@@ -1075,11 +1022,13 @@ static struct p9_trans_module p9_unix_trans = {
.show_options = p9_fd_show_options,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_9P("unix");
static struct p9_trans_module p9_fd_trans = {
.name = "fd",
.maxsize = MAX_SOCK_BUF,
- .def = 0,
+ .def = false,
+ .supports_vmalloc = true,
.create = p9_fd_create,
.close = p9_fd_close,
.request = p9_fd_request,
@@ -1088,6 +1037,7 @@ static struct p9_trans_module p9_fd_trans = {
.show_options = p9_fd_show_options,
.owner = THIS_MODULE,
};
+MODULE_ALIAS_9P("fd");
/**
* p9_poll_workfn - poll worker thread
@@ -1121,7 +1071,7 @@ static void p9_poll_workfn(struct work_struct *work)
p9_debug(P9_DEBUG_TRANS, "finish\n");
}
-int p9_trans_fd_init(void)
+static int __init p9_trans_fd_init(void)
{
v9fs_register_trans(&p9_tcp_trans);
v9fs_register_trans(&p9_unix_trans);
@@ -1130,10 +1080,17 @@ int p9_trans_fd_init(void)
return 0;
}
-void p9_trans_fd_exit(void)
+static void __exit p9_trans_fd_exit(void)
{
flush_work(&p9_poll_work);
v9fs_unregister_trans(&p9_tcp_trans);
v9fs_unregister_trans(&p9_unix_trans);
v9fs_unregister_trans(&p9_fd_trans);
}
+
+module_init(p9_trans_fd_init);
+module_exit(p9_trans_fd_exit);
+
+MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
+MODULE_DESCRIPTION("Filedescriptor Transport for 9P");
+MODULE_LICENSE("GPL");
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index b513cffeeb3c..4d406479f83b 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -1,6 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * linux/fs/9p/trans_rdma.c
- *
* RDMA transport layer based on the trans_fd.c implementation.
*
* Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com>
@@ -8,22 +7,6 @@
* Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
* Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
* Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to:
- * Free Software Foundation
- * 51 Franklin Street, Fifth Floor
- * Boston, MA 02111-1301 USA
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -38,9 +21,8 @@
#include <linux/un.h>
#include <linux/uaccess.h>
#include <linux/inet.h>
-#include <linux/idr.h>
#include <linux/file.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
#include <linux/semaphore.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
@@ -50,14 +32,10 @@
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
-#define P9_PORT 5640
-#define P9_RDMA_SQ_DEPTH 32
-#define P9_RDMA_RQ_DEPTH 32
#define P9_RDMA_SEND_SGE 4
#define P9_RDMA_RECV_SGE 4
#define P9_RDMA_IRD 0
#define P9_RDMA_ORD 0
-#define P9_RDMA_TIMEOUT 30000 /* 30 seconds */
#define P9_RDMA_MAXSIZE (1024*1024) /* 1MB */
/**
@@ -109,64 +87,30 @@ struct p9_trans_rdma {
struct completion cm_done;
};
+struct p9_rdma_req;
+
/**
- * p9_rdma_context - Keeps track of in-process WR
+ * struct p9_rdma_context - Keeps track of in-process WR
*
+ * @cqe: completion queue entry
* @busa: Bus address to unmap when the WR completes
* @req: Keeps track of requests (send)
* @rc: Keepts track of replies (receive)
*/
-struct p9_rdma_req;
struct p9_rdma_context {
struct ib_cqe cqe;
dma_addr_t busa;
union {
struct p9_req_t *req;
- struct p9_fcall *rc;
+ struct p9_fcall rc;
};
};
-/**
- * p9_rdma_opts - Collection of mount options
- * @port: port of connection
- * @sq_depth: The requested depth of the SQ. This really doesn't need
- * to be any deeper than the number of threads used in the client
- * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth
- * @timeout: Time to wait in msecs for CM events
- */
-struct p9_rdma_opts {
- short port;
- bool privport;
- int sq_depth;
- int rq_depth;
- long timeout;
-};
-
-/*
- * Option Parsing (code inspired by NFS code)
- */
-enum {
- /* Options that take integer arguments */
- Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout,
- /* Options that take no argument */
- Opt_privport,
- Opt_err,
-};
-
-static match_table_t tokens = {
- {Opt_port, "port=%u"},
- {Opt_sq_depth, "sq=%u"},
- {Opt_rq_depth, "rq=%u"},
- {Opt_timeout, "timeout=%u"},
- {Opt_privport, "privport"},
- {Opt_err, NULL},
-};
-
static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt)
{
struct p9_trans_rdma *rdma = clnt->trans;
- if (rdma->port != P9_PORT)
+ if (rdma->port != P9_RDMA_PORT)
seq_printf(m, ",port=%u", rdma->port);
if (rdma->sq_depth != P9_RDMA_SQ_DEPTH)
seq_printf(m, ",sq=%u", rdma->sq_depth);
@@ -179,77 +123,6 @@ static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt)
return 0;
}
-/**
- * parse_opts - parse mount options into rdma options structure
- * @params: options string passed from mount
- * @opts: rdma transport-specific structure to parse options into
- *
- * Returns 0 upon success, -ERRNO upon failure
- */
-static int parse_opts(char *params, struct p9_rdma_opts *opts)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- char *options, *tmp_options;
-
- opts->port = P9_PORT;
- opts->sq_depth = P9_RDMA_SQ_DEPTH;
- opts->rq_depth = P9_RDMA_RQ_DEPTH;
- opts->timeout = P9_RDMA_TIMEOUT;
- opts->privport = false;
-
- if (!params)
- return 0;
-
- tmp_options = kstrdup(params, GFP_KERNEL);
- if (!tmp_options) {
- p9_debug(P9_DEBUG_ERROR,
- "failed to allocate copy of option string\n");
- return -ENOMEM;
- }
- options = tmp_options;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- int r;
- if (!*p)
- continue;
- token = match_token(p, tokens, args);
- if ((token != Opt_err) && (token != Opt_privport)) {
- r = match_int(&args[0], &option);
- if (r < 0) {
- p9_debug(P9_DEBUG_ERROR,
- "integer field, but no integer?\n");
- continue;
- }
- }
- switch (token) {
- case Opt_port:
- opts->port = option;
- break;
- case Opt_sq_depth:
- opts->sq_depth = option;
- break;
- case Opt_rq_depth:
- opts->rq_depth = option;
- break;
- case Opt_timeout:
- opts->timeout = option;
- break;
- case Opt_privport:
- opts->privport = true;
- break;
- default:
- continue;
- }
- }
- /* RQ must be at least as large as the SQ */
- opts->rq_depth = max(opts->rq_depth, opts->sq_depth);
- kfree(tmp_options);
- return 0;
-}
-
static int
p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
@@ -274,8 +147,7 @@ p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
case RDMA_CM_EVENT_DISCONNECTED:
if (rdma)
rdma->state = P9_RDMA_CLOSED;
- if (c)
- c->status = Disconnected;
+ c->status = Disconnected;
break;
case RDMA_CM_EVENT_TIMEWAIT_EXIT:
@@ -320,8 +192,8 @@ recv_done(struct ib_cq *cq, struct ib_wc *wc)
if (wc->status != IB_WC_SUCCESS)
goto err_out;
- c->rc->size = wc->byte_len;
- err = p9_parse_header(c->rc, NULL, NULL, &tag, 1);
+ c->rc.size = wc->byte_len;
+ err = p9_parse_header(&c->rc, NULL, NULL, &tag, 1);
if (err)
goto err_out;
@@ -331,12 +203,13 @@ recv_done(struct ib_cq *cq, struct ib_wc *wc)
/* Check that we have not yet received a reply for this request.
*/
- if (unlikely(req->rc)) {
+ if (unlikely(req->rc.sdata)) {
pr_err("Duplicate reply for request %d", tag);
goto err_out;
}
- req->rc = c->rc;
+ req->rc.size = c->rc.size;
+ req->rc.sdata = c->rc.sdata;
p9_client_cb(client, req, REQ_STATUS_RCVD);
out:
@@ -361,9 +234,10 @@ send_done(struct ib_cq *cq, struct ib_wc *wc)
container_of(wc->wr_cqe, struct p9_rdma_context, cqe);
ib_dma_unmap_single(rdma->cm_id->device,
- c->busa, c->req->tc->size,
+ c->busa, c->req->tc.size,
DMA_TO_DEVICE);
up(&rdma->sq_sem);
+ p9_req_put(client, c->req);
kfree(c);
}
@@ -399,9 +273,10 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c)
struct p9_trans_rdma *rdma = client->trans;
struct ib_recv_wr wr;
struct ib_sge sge;
+ int ret;
c->busa = ib_dma_map_single(rdma->cm_id->device,
- c->rc->sdata, client->msize,
+ c->rc.sdata, client->msize,
DMA_FROM_DEVICE);
if (ib_dma_mapping_error(rdma->cm_id->device, c->busa))
goto error;
@@ -416,7 +291,12 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c)
wr.wr_cqe = &c->cqe;
wr.sg_list = &sge;
wr.num_sge = 1;
- return ib_post_recv(rdma->qp, &wr, NULL);
+
+ ret = ib_post_recv(rdma->qp, &wr, NULL);
+ if (ret)
+ ib_dma_unmap_single(rdma->cm_id->device, c->busa,
+ client->msize, DMA_FROM_DEVICE);
+ return ret;
error:
p9_debug(P9_DEBUG_ERROR, "EIO\n");
@@ -443,9 +323,9 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
**/
if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
- /* Got one ! */
- kfree(req->rc);
- req->rc = NULL;
+ /* Got one! */
+ p9_fcall_fini(&req->rc);
+ req->rc.sdata = NULL;
goto dont_need_post_recv;
} else {
/* We raced and lost. */
@@ -459,7 +339,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
err = -ENOMEM;
goto recv_error;
}
- rpl_context->rc = req->rc;
+ rpl_context->rc.sdata = req->rc.sdata;
/*
* Post a receive buffer for this request. We need to ensure
@@ -475,11 +355,11 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req)
err = post_recv(client, rpl_context);
if (err) {
- p9_debug(P9_DEBUG_FCALL, "POST RECV failed\n");
+ p9_debug(P9_DEBUG_ERROR, "POST RECV failed: %d\n", err);
goto recv_error;
}
/* remove posted receive buffer from request structure */
- req->rc = NULL;
+ req->rc.sdata = NULL;
dont_need_post_recv:
/* Post the request */
@@ -491,7 +371,7 @@ dont_need_post_recv:
c->req = req;
c->busa = ib_dma_map_single(rdma->cm_id->device,
- c->req->tc->sdata, c->req->tc->size,
+ c->req->tc.sdata, c->req->tc.size,
DMA_TO_DEVICE);
if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) {
err = -EIO;
@@ -501,7 +381,7 @@ dont_need_post_recv:
c->cqe.done = send_done;
sge.addr = c->busa;
- sge.length = c->req->tc->size;
+ sge.length = c->req->tc.size;
sge.lkey = rdma->pd->local_dma_lkey;
wr.next = NULL;
@@ -513,24 +393,27 @@ dont_need_post_recv:
if (down_interruptible(&rdma->sq_sem)) {
err = -EINTR;
- goto send_error;
+ goto dma_unmap;
}
/* Mark request as `sent' *before* we actually send it,
* because doing if after could erase the REQ_STATUS_RCVD
* status in case of a very fast reply.
*/
- req->status = REQ_STATUS_SENT;
+ WRITE_ONCE(req->status, REQ_STATUS_SENT);
err = ib_post_send(rdma->qp, &wr, NULL);
if (err)
- goto send_error;
+ goto dma_unmap;
/* Success */
return 0;
+dma_unmap:
+ ib_dma_unmap_single(rdma->cm_id->device, c->busa,
+ c->req->tc.size, DMA_TO_DEVICE);
/* Handle errors that happened during or while preparing the send: */
send_error:
- req->status = REQ_STATUS_ERROR;
+ WRITE_ONCE(req->status, REQ_STATUS_ERROR);
kfree(c);
p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err);
@@ -544,7 +427,7 @@ dont_need_post_recv:
recv_error:
kfree(rpl_context);
spin_lock_irqsave(&rdma->req_lock, flags);
- if (rdma->state < P9_RDMA_CLOSING) {
+ if (err != -EINTR && rdma->state < P9_RDMA_CLOSING) {
rdma->state = P9_RDMA_CLOSING;
spin_unlock_irqrestore(&rdma->req_lock, flags);
rdma_disconnect(rdma->cm_id);
@@ -633,14 +516,15 @@ static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma)
/**
* rdma_create_trans - Transport method for creating a transport instance
* @client: client instance
- * @addr: IP address string
- * @args: Mount options string
+ * @fc: The filesystem context
*/
static int
-rdma_create_trans(struct p9_client *client, const char *addr, char *args)
+rdma_create_trans(struct p9_client *client, struct fs_context *fc)
{
+ const char *addr = fc->source;
+ struct v9fs_context *ctx = fc->fs_private;
+ struct p9_rdma_opts opts = ctx->rdma_opts;
int err;
- struct p9_rdma_opts opts;
struct p9_trans_rdma *rdma;
struct rdma_conn_param conn_param;
struct ib_qp_init_attr qp_attr;
@@ -648,10 +532,8 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
if (addr == NULL)
return -EINVAL;
- /* Parse the transport specific mount options */
- err = parse_opts(args, &opts);
- if (err < 0)
- return err;
+ /* options are already parsed, in the fs context */
+ opts = ctx->rdma_opts;
/* Create and initialize the RDMA transport structure */
rdma = alloc_rdma(&opts);
@@ -699,9 +581,9 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
goto error;
/* Create the Completion Queue */
- rdma->cq = ib_alloc_cq(rdma->cm_id->device, client,
- opts.sq_depth + opts.rq_depth + 1,
- 0, IB_POLL_SOFTIRQ);
+ rdma->cq = ib_alloc_cq_any(rdma->cm_id->device, client,
+ opts.sq_depth + opts.rq_depth + 1,
+ IB_POLL_SOFTIRQ);
if (IS_ERR(rdma->cq))
goto error;
@@ -752,7 +634,9 @@ error:
static struct p9_trans_module p9_rdma_trans = {
.name = "rdma",
.maxsize = P9_RDMA_MAXSIZE,
- .def = 0,
+ .pooled_rbuffers = true,
+ .def = false,
+ .supports_vmalloc = false,
.owner = THIS_MODULE,
.create = rdma_create_trans,
.close = rdma_close,
@@ -778,6 +662,7 @@ static void __exit p9_trans_rdma_exit(void)
module_init(p9_trans_rdma_init);
module_exit(p9_trans_rdma_exit);
+MODULE_ALIAS_9P("rdma");
MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
MODULE_DESCRIPTION("RDMA Transport for 9P");
diff --git a/net/9p/trans_usbg.c b/net/9p/trans_usbg.c
new file mode 100644
index 000000000000..93547637deae
--- /dev/null
+++ b/net/9p/trans_usbg.c
@@ -0,0 +1,969 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * trans_usbg.c - USB peripheral usb9pfs configuration driver and transport.
+ *
+ * Copyright (C) 2024 Michael Grzeschik <m.grzeschik@pengutronix.de>
+ */
+
+/* The usb9pfs gadget function only needs two bulk endpoints. It uses the
+ * usb9pfs transport to mount a host-exported filesystem via the USB gadget.
+ */
+
+/* +--------------------------+ | +--------------------------+
+ * | 9PFS mounting client | | | 9PFS exporting server |
+ * SW | | | | |
+ * | (this:trans_usbg) | | |(e.g. diod or nfs-ganesha)|
+ * +-------------^------------+ | +-------------^------------+
+ * | | |
+ * ------------------|------------------------------------|-------------
+ * | | |
+ * +-------------v------------+ | +-------------v------------+
+ * | | | | |
+ * HW | USB Device Controller <---------> USB Host Controller |
+ * | | | | |
+ * +--------------------------+ | +--------------------------+
+ */
+
+#include <linux/cleanup.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/fs_context.h>
+#include <linux/usb/composite.h>
+#include <linux/usb/func_utils.h>
+
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+
+#define DEFAULT_BUFLEN 16384
+
+/*
+ * Per-function state of the usb9pfs gadget function. The same object is
+ * stored in p9_client::trans by p9_usbg_create() and thus doubles as the
+ * 9p transport handle.
+ */
+struct f_usb9pfs {
+ /* 9p client using this function, assigned in p9_usbg_create() */
+ struct p9_client *client;
+
+ /* 9p request lock for en/dequeue */
+ spinlock_t lock;
+
+ /* the single transmit (IN) and receive (OUT) request, see alloc_requests() */
+ struct usb_request *in_req;
+ struct usb_request *out_req;
+
+ /* bulk endpoints picked in usb9pfs_func_bind() */
+ struct usb_ep *in_ep;
+ struct usb_ep *out_ep;
+
+ /* signalled by usb9pfs_tx_complete() after a successful transmit */
+ struct completion send;
+ /* signalled by usb9pfs_rx_complete() and once by p9_usbg_create() */
+ struct completion received;
+
+ /* copied from f_usb9pfs_opts::buflen in usb9pfs_alloc() */
+ unsigned int buflen;
+
+ struct usb_function function;
+};
+
+/* Map an embedded usb_function back to its containing f_usb9pfs. */
+static inline struct f_usb9pfs *func_to_usb9pfs(struct usb_function *f)
+{
+ return container_of(f, struct f_usb9pfs, function);
+}
+
+/* configfs-visible options of one usb9pfs function instance. */
+struct f_usb9pfs_opts {
+ struct usb_function_instance func_inst;
+ /* "buflen" attribute; initialised to DEFAULT_BUFLEN in usb9pfs_alloc_instance() */
+ unsigned int buflen;
+
+ /* mount-tag bookkeeping object allocated together with the options */
+ struct f_usb9pfs_dev *dev;
+
+ /* Read/write access to configfs attributes is handled by configfs.
+ *
+ * This is to protect the data from concurrent access by read/write
+ * and create symlink/remove symlink.
+ */
+ struct mutex lock;
+ /*
+ * Incremented in usb9pfs_alloc(), decremented in usb9pfs_free_func();
+ * buflen cannot be changed while it is non-zero.
+ */
+ int refcnt;
+};
+
+/* Entry of usbg_instance_list: associates a mount tag with a function. */
+struct f_usb9pfs_dev {
+ /* set in usb9pfs_func_bind(); NULL until the function has been bound */
+ struct f_usb9pfs *usb9pfs;
+ struct f_usb9pfs_opts *opts;
+ /* NUL-terminated tag matched against the mount source in p9_usbg_create() */
+ char tag[41];
+ /* true between p9_usbg_create() and p9_usbg_close() */
+ bool inuse;
+
+ /* link in usbg_instance_list */
+ struct list_head usb9pfs_instance;
+};
+
+/*
+ * usb9pfs_lock is held around every access to usbg_instance_list in this
+ * file, and around updates of f_usb9pfs_dev::inuse and ::tag.
+ * The list head is initialised in usb9pfs_modinit().
+ */
+static DEFINE_MUTEX(usb9pfs_lock);
+static struct list_head usbg_instance_list;
+
+/*
+ * Queue the transmit buffer of a 9p request on the IN endpoint.
+ * @usb9pfs: function state owning the IN request
+ * @p9_tx_req: 9p request whose tc buffer is sent
+ * @gfp_flags: allocation flags passed on to usb_ep_queue()
+ *
+ * The 9p request is stored in the USB request's context for
+ * usb9pfs_tx_complete(); the context is cleared again if queueing fails.
+ *
+ * Returns the result of usb_ep_queue().
+ */
+static int usb9pfs_queue_tx(struct f_usb9pfs *usb9pfs, struct p9_req_t *p9_tx_req,
+			    gfp_t gfp_flags)
+{
+	struct usb_composite_dev *cdev = usb9pfs->function.config->cdev;
+	struct usb_request *req = usb9pfs->in_req;
+	int ret;
+
+	/*
+	 * Request a trailing zero-length packet only when the payload is an
+	 * exact multiple of the endpoint's packet size. Assign the flag in
+	 * both cases: the completion handler, which normally resets it, does
+	 * not run when usb_ep_queue() fails, so a stale 1 could otherwise
+	 * leak into the next transfer.
+	 */
+	req->zero = !(p9_tx_req->tc.size % usb9pfs->in_ep->maxpacket);
+
+	req->buf = p9_tx_req->tc.sdata;
+	req->length = p9_tx_req->tc.size;
+	req->context = p9_tx_req;
+
+	dev_dbg(&cdev->gadget->dev, "%s usb9pfs send --> %d/%d, zero: %d\n",
+		usb9pfs->in_ep->name, req->actual, req->length, req->zero);
+
+	ret = usb_ep_queue(usb9pfs->in_ep, req, gfp_flags);
+	if (ret)
+		req->context = NULL;
+
+	dev_dbg(&cdev->gadget->dev, "tx submit --> %d\n", ret);
+
+	return ret;
+}
+
+/*
+ * Submit @req on the OUT endpoint and log the outcome.
+ * Returns whatever usb_ep_queue() returned.
+ */
+static int usb9pfs_queue_rx(struct f_usb9pfs *usb9pfs, struct usb_request *req,
+			    gfp_t gfp_flags)
+{
+	struct usb_composite_dev *cdev = usb9pfs->function.config->cdev;
+	int err = usb_ep_queue(usb9pfs->out_ep, req, gfp_flags);
+
+	dev_dbg(&cdev->gadget->dev, "rx submit --> %d\n", err);
+
+	return err;
+}
+
+/*
+ * Queue @p9_req for transmission while holding the request lock.
+ *
+ * When queueing succeeds the request is unlinked from its req_list and an
+ * extra reference is taken on it. Returns 0 or the usb9pfs_queue_tx() error.
+ */
+static int usb9pfs_transmit(struct f_usb9pfs *usb9pfs, struct p9_req_t *p9_req)
+{
+	int err;
+
+	guard(spinlock_irqsave)(&usb9pfs->lock);
+
+	err = usb9pfs_queue_tx(usb9pfs, p9_req, GFP_ATOMIC);
+	if (!err) {
+		list_del(&p9_req->req_list);
+		p9_req_get(p9_req);
+	}
+
+	return err;
+}
+
+/*
+ * Completion handler of the IN (transmit) request.
+ *
+ * On success the 9p request found in req->context is marked
+ * REQ_STATUS_SENT, one reference on it is dropped, the context is cleared
+ * and the "send" completion is signalled.
+ *
+ * NOTE(review): when the transfer failed (req->status != 0) this returns
+ * without dropping a reference or completing "send" - confirm that the
+ * waiter in p9_usbg_request() is meant to be released some other way.
+ */
+static void usb9pfs_tx_complete(struct usb_ep *ep, struct usb_request *req)
+{
+ struct f_usb9pfs *usb9pfs = ep->driver_data;
+ struct usb_composite_dev *cdev = usb9pfs->function.config->cdev;
+ struct p9_req_t *p9_tx_req = req->context;
+ unsigned long flags;
+
+ /* reset the zero-length-packet flag set in usb9pfs_queue_tx() */
+ req->zero = 0;
+
+ if (req->status) {
+ dev_err(&cdev->gadget->dev, "%s usb9pfs complete --> %d, %d/%d\n",
+ ep->name, req->status, req->actual, req->length);
+ return;
+ }
+
+ dev_dbg(&cdev->gadget->dev, "%s usb9pfs complete --> %d, %d/%d\n",
+ ep->name, req->status, req->actual, req->length);
+
+ spin_lock_irqsave(&usb9pfs->lock, flags);
+ WRITE_ONCE(p9_tx_req->status, REQ_STATUS_SENT);
+
+ p9_req_put(usb9pfs->client, p9_tx_req);
+
+ req->context = NULL;
+
+ spin_unlock_irqrestore(&usb9pfs->lock, flags);
+
+ complete(&usb9pfs->send);
+}
+
+/*
+ * Parse the 9p header at the start of @buf and look up the request it
+ * answers.
+ * @usb9pfs: function state (provides the 9p client)
+ * @buf: received data, starting with a 9p header of P9_HDRSZ bytes
+ *
+ * Returns the request matching the header's tag, still holding the
+ * reference obtained from p9_tag_lookup() (the caller drops it with
+ * p9_req_put()). Returns NULL when the header cannot be parsed, no request
+ * with that tag exists, the request is not in REQ_STATUS_SENT, the
+ * announced size exceeds the reply buffer capacity, or the request has no
+ * reply buffer. Whenever NULL is returned after a successful lookup, the
+ * lookup reference has already been dropped here.
+ */
+static struct p9_req_t *usb9pfs_rx_header(struct f_usb9pfs *usb9pfs, void *buf)
+{
+	struct p9_req_t *p9_rx_req;
+	struct p9_fcall rc;
+	int ret;
+
+	/* start by reading header */
+	rc.sdata = buf;
+	rc.offset = 0;
+	rc.capacity = P9_HDRSZ;
+	rc.size = P9_HDRSZ;
+
+	p9_debug(P9_DEBUG_TRANS, "mux %p got %zu bytes\n", usb9pfs,
+		 rc.capacity - rc.offset);
+
+	ret = p9_parse_header(&rc, &rc.size, NULL, NULL, 0);
+	if (ret) {
+		p9_debug(P9_DEBUG_ERROR,
+			 "error parsing header: %d\n", ret);
+		return NULL;
+	}
+
+	p9_debug(P9_DEBUG_TRANS,
+		 "mux %p pkt: size: %d bytes tag: %d\n",
+		 usb9pfs, rc.size, rc.tag);
+
+	p9_rx_req = p9_tag_lookup(usb9pfs->client, rc.tag);
+	if (!p9_rx_req) {
+		p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", rc.tag);
+		return NULL;
+	}
+
+	/*
+	 * From here on a reference is held on p9_rx_req, so every failure
+	 * path must release it (this one used to leak it).
+	 */
+	if (p9_rx_req->status != REQ_STATUS_SENT) {
+		p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", rc.tag);
+		goto put_req;
+	}
+
+	if (rc.size > p9_rx_req->rc.capacity) {
+		p9_debug(P9_DEBUG_ERROR,
+			 "requested packet size too big: %d for tag %d with capacity %zd\n",
+			 rc.size, rc.tag, p9_rx_req->rc.capacity);
+		goto put_req;
+	}
+
+	if (!p9_rx_req->rc.sdata) {
+		p9_debug(P9_DEBUG_ERROR,
+			 "No recv fcall for tag %d (req %p), disconnecting!\n",
+			 rc.tag, p9_rx_req);
+		goto put_req;
+	}
+
+	return p9_rx_req;
+
+put_req:
+	p9_req_put(usb9pfs->client, p9_rx_req);
+	return NULL;
+}
+
+/*
+ * Completion handler of the OUT (receive) request.
+ *
+ * Looks up the 9p request the reply belongs to, copies the received bytes
+ * into its reply buffer, reports the result through p9_client_cb() and
+ * signals the "received" completion.
+ *
+ * NOTE(review): the two early returns (transfer error, header rejected)
+ * leave "received" uncompleted - confirm the next p9_usbg_request() is not
+ * expected to proceed in those cases.
+ */
+static void usb9pfs_rx_complete(struct usb_ep *ep, struct usb_request *req)
+{
+ struct f_usb9pfs *usb9pfs = ep->driver_data;
+ struct usb_composite_dev *cdev = usb9pfs->function.config->cdev;
+ struct p9_req_t *p9_rx_req;
+ unsigned int req_size = req->actual;
+ int status = REQ_STATUS_RCVD;
+
+ if (req->status) {
+ dev_err(&cdev->gadget->dev, "%s usb9pfs complete --> %d, %d/%d\n",
+ ep->name, req->status, req->actual, req->length);
+ return;
+ }
+
+ p9_rx_req = usb9pfs_rx_header(usb9pfs, req->buf);
+ if (!p9_rx_req)
+ return;
+
+ /* never copy more than the reply buffer holds; report an error instead */
+ if (req_size > p9_rx_req->rc.capacity) {
+ dev_err(&cdev->gadget->dev,
+ "%s received data size %u exceeds buffer capacity %zu\n",
+ ep->name, req_size, p9_rx_req->rc.capacity);
+ req_size = 0;
+ status = REQ_STATUS_ERROR;
+ }
+
+ memcpy(p9_rx_req->rc.sdata, req->buf, req_size);
+
+ p9_rx_req->rc.size = req_size;
+
+ p9_client_cb(usb9pfs->client, p9_rx_req, status);
+ /* drop the reference returned with the request by usb9pfs_rx_header() */
+ p9_req_put(usb9pfs->client, p9_rx_req);
+
+ complete(&usb9pfs->received);
+}
+
+/* Disable @ep; a failure is only logged, never propagated. */
+static void disable_ep(struct usb_composite_dev *cdev, struct usb_ep *ep)
+{
+	int err = usb_ep_disable(ep);
+
+	if (err >= 0)
+		return;
+
+	dev_info(&cdev->gadget->dev,
+		 "disable %s --> %d\n", ep->name, err);
+}
+
+/*
+ * Shut the function down: disable both endpoints and release the two USB
+ * requests allocated by alloc_requests().
+ *
+ * The endpoints are disabled first so that no request is still queued when
+ * it is freed. The OUT request was allocated with alloc_ep_req(), which
+ * also allocates its buffer, so it is released with free_ep_req(). The IN
+ * request's buffer points into a 9p request and is not owned here, hence
+ * plain usb_ep_free_request().
+ */
+static void disable_usb9pfs(struct f_usb9pfs *usb9pfs)
+{
+	struct usb_composite_dev *cdev =
+		usb9pfs->function.config->cdev;
+
+	disable_ep(cdev, usb9pfs->in_ep);
+	disable_ep(cdev, usb9pfs->out_ep);
+
+	if (usb9pfs->in_req) {
+		usb_ep_free_request(usb9pfs->in_ep, usb9pfs->in_req);
+		usb9pfs->in_req = NULL;
+	}
+
+	if (usb9pfs->out_req) {
+		free_ep_req(usb9pfs->out_ep, usb9pfs->out_req);
+		usb9pfs->out_req = NULL;
+	}
+
+	dev_dbg(&cdev->gadget->dev, "%s disabled\n",
+		usb9pfs->function.name);
+}
+
+/*
+ * Allocate the IN and OUT USB requests and install their completion
+ * handlers.
+ * @cdev: composite device (unused here, kept for the callers)
+ * @usb9pfs: function state receiving the requests
+ *
+ * Returns 0 on success or -ENOENT if either allocation fails; on failure
+ * neither in_req nor out_req is left pointing at a freed request.
+ */
+static int alloc_requests(struct usb_composite_dev *cdev,
+			  struct f_usb9pfs *usb9pfs)
+{
+	int ret;
+
+	usb9pfs->in_req = usb_ep_alloc_request(usb9pfs->in_ep, GFP_ATOMIC);
+	if (!usb9pfs->in_req) {
+		ret = -ENOENT;
+		goto fail;
+	}
+
+	usb9pfs->out_req = alloc_ep_req(usb9pfs->out_ep, usb9pfs->buflen);
+	if (!usb9pfs->out_req) {
+		ret = -ENOENT;
+		goto fail_in;
+	}
+
+	usb9pfs->in_req->complete = usb9pfs_tx_complete;
+	usb9pfs->out_req->complete = usb9pfs_rx_complete;
+
+	/*
+	 * The IN request's context carries the in-flight 9p request (see
+	 * usb9pfs_queue_tx() and usb9pfs_clear_tx()), so it must start out
+	 * NULL; any other pointer would be misread as a struct p9_req_t.
+	 */
+	usb9pfs->in_req->context = NULL;
+	usb9pfs->out_req->context = usb9pfs;
+
+	return 0;
+
+fail_in:
+	usb_ep_free_request(usb9pfs->in_ep, usb9pfs->in_req);
+	/* do not leave a dangling pointer for disable_usb9pfs() to free again */
+	usb9pfs->in_req = NULL;
+fail:
+	return ret;
+}
+
+/*
+ * Configure @ep for the current connection speed, enable it and attach
+ * @usb9pfs as its driver data. Returns 0 or the first error encountered.
+ */
+static int enable_endpoint(struct usb_composite_dev *cdev,
+			   struct f_usb9pfs *usb9pfs, struct usb_ep *ep)
+{
+	int err = config_ep_by_speed(cdev->gadget, &usb9pfs->function, ep);
+
+	if (err)
+		return err;
+
+	err = usb_ep_enable(ep);
+	if (err < 0)
+		return err;
+
+	ep->driver_data = usb9pfs;
+	return 0;
+}
+
+/*
+ * Bring the function up: enable both endpoints, allocate the USB requests
+ * and, if a 9p client is attached, mark it Connected.
+ *
+ * Endpoints enabled so far are disabled again when a later step fails.
+ * Returns 0 on success or the error of the failing step.
+ */
+static int
+enable_usb9pfs(struct usb_composite_dev *cdev, struct f_usb9pfs *usb9pfs)
+{
+	struct p9_client *clnt;
+	int err;
+
+	err = enable_endpoint(cdev, usb9pfs, usb9pfs->in_ep);
+	if (err)
+		return err;
+
+	err = enable_endpoint(cdev, usb9pfs, usb9pfs->out_ep);
+	if (err) {
+		usb_ep_disable(usb9pfs->in_ep);
+		return err;
+	}
+
+	err = alloc_requests(cdev, usb9pfs);
+	if (err) {
+		usb_ep_disable(usb9pfs->out_ep);
+		usb_ep_disable(usb9pfs->in_ep);
+		return err;
+	}
+
+	clnt = usb9pfs->client;
+	if (clnt)
+		clnt->status = Connected;
+
+	dev_dbg(&cdev->gadget->dev, "%s enabled\n", usb9pfs->function.name);
+	return 0;
+}
+
+/*
+ * Transport "create" method: attach @client to the usb9pfs instance whose
+ * tag equals the mount source.
+ * @client: client instance
+ * @fc: the filesystem context; fc->source names the instance tag
+ *
+ * The tag must match exactly, as in _usb9pfs_do_find_dev(). The instance
+ * is only marked in use once it is certain that the attach succeeds, so a
+ * failed mount never leaves it permanently busy.
+ *
+ * Returns 0 on success, -EINVAL without a source or when the instance has
+ * no bound function yet, -ENOENT if no instance carries that tag and
+ * -EBUSY if the instance is already in use.
+ */
+static int p9_usbg_create(struct p9_client *client, struct fs_context *fc)
+{
+	const char *devname = fc->source;
+	struct f_usb9pfs_dev *dev;
+	struct f_usb9pfs_dev *match = NULL;
+	struct f_usb9pfs *usb9pfs;
+
+	if (!devname)
+		return -EINVAL;
+
+	guard(mutex)(&usb9pfs_lock);
+
+	list_for_each_entry(dev, &usbg_instance_list, usb9pfs_instance) {
+		if (!strcmp(devname, dev->tag)) {
+			match = dev;
+			break;
+		}
+	}
+
+	if (!match || match->inuse) {
+		pr_err("no channels available for device %s\n", devname);
+		return match ? -EBUSY : -ENOENT;
+	}
+
+	/* set in usb9pfs_func_bind(); still NULL if the function is unbound */
+	usb9pfs = match->usb9pfs;
+	if (!usb9pfs)
+		return -EINVAL;
+
+	match->inuse = true;
+
+	client->trans = (void *)usb9pfs;
+	if (!usb9pfs->in_req)
+		client->status = Disconnected;
+	else
+		client->status = Connected;
+	usb9pfs->client = client;
+
+	client->trans_mod->maxsize = usb9pfs->buflen;
+
+	/* let the first p9_usbg_request() pass its wait on "received" */
+	complete(&usb9pfs->received);
+
+	return 0;
+}
+
+/*
+ * Fail the 9p request that is currently attached to the IN request, if
+ * any: its t_err is set to -ECONNRESET unless already set, and it is
+ * reported to the client as REQ_STATUS_ERROR.
+ *
+ * Nothing is done when the IN request does not exist, i.e. the function
+ * was never enabled or has already been torn down by disable_usb9pfs().
+ */
+static void usb9pfs_clear_tx(struct f_usb9pfs *usb9pfs)
+{
+	struct p9_req_t *req;
+
+	guard(spinlock_irqsave)(&usb9pfs->lock);
+
+	/* in_req stays NULL until alloc_requests() has run */
+	if (!usb9pfs->in_req)
+		return;
+
+	req = usb9pfs->in_req->context;
+	if (!req)
+		return;
+
+	if (!req->t_err)
+		req->t_err = -ECONNRESET;
+
+	p9_client_cb(usb9pfs->client, req, REQ_STATUS_ERROR);
+}
+
+/*
+ * Transport "close" method: mark the client Disconnected, fail a pending
+ * transmit and release the instance so it can be mounted again.
+ *
+ * NOTE(review): usb9pfs->client keeps pointing at @client after this
+ * returns - confirm nothing dereferences it once the client is gone.
+ */
+static void p9_usbg_close(struct p9_client *client)
+{
+ struct f_usb9pfs *usb9pfs;
+ struct f_usb9pfs_dev *dev;
+ struct f_usb9pfs_opts *opts;
+
+ if (!client)
+ return;
+
+ usb9pfs = client->trans;
+ if (!usb9pfs)
+ return;
+
+ client->status = Disconnected;
+
+ usb9pfs_clear_tx(usb9pfs);
+
+ /* find the list entry of this function through its instance options */
+ opts = container_of(usb9pfs->function.fi,
+ struct f_usb9pfs_opts, func_inst);
+
+ dev = opts->dev;
+
+ mutex_lock(&usb9pfs_lock);
+ dev->inuse = false;
+ mutex_unlock(&usb9pfs_lock);
+}
+
+/*
+ * Transport "request" method: send @p9_req and queue the receive buffer
+ * for its reply.
+ *
+ * Steps, each of which may fail and end the call:
+ *  1. wait (killable) for "received", i.e. the previous exchange finished
+ *  2. queue the transmit
+ *  3. wait (killable) for "send", signalled by usb9pfs_tx_complete()
+ *  4. queue the OUT request for the reply
+ *
+ * Returns -EBUSY while the client is not Connected, otherwise 0 or the
+ * error of the failing step.
+ */
+static int p9_usbg_request(struct p9_client *client, struct p9_req_t *p9_req)
+{
+ struct f_usb9pfs *usb9pfs = client->trans;
+ int ret;
+
+ if (client->status != Connected)
+ return -EBUSY;
+
+ ret = wait_for_completion_killable(&usb9pfs->received);
+ if (ret)
+ return ret;
+
+ ret = usb9pfs_transmit(usb9pfs, p9_req);
+ if (ret)
+ return ret;
+
+ ret = wait_for_completion_killable(&usb9pfs->send);
+ if (ret)
+ return ret;
+
+ return usb9pfs_queue_rx(usb9pfs, usb9pfs->out_req, GFP_ATOMIC);
+}
+
+/*
+ * Transport "cancel" method.
+ *
+ * Only a request that is still REQ_STATUS_UNSENT can be cancelled: it is
+ * unlinked, marked REQ_STATUS_FLSHD and one reference is dropped.
+ * Returns 0 if the request was cancelled, 1 otherwise.
+ */
+static int p9_usbg_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+	struct f_usb9pfs *usb9pfs = client->trans;
+
+	p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req);
+
+	guard(spinlock_irqsave)(&usb9pfs->lock);
+
+	if (req->status != REQ_STATUS_UNSENT)
+		return 1;
+
+	list_del(&req->req_list);
+	WRITE_ONCE(req->status, REQ_STATUS_FLSHD);
+	p9_req_put(client, req);
+
+	return 0;
+}
+
+/*
+ * The "usbg" 9p transport. maxsize is not set statically; p9_usbg_create()
+ * writes the instance's buflen into it at mount time.
+ */
+static struct p9_trans_module p9_usbg_trans = {
+ .name = "usbg",
+ .create = p9_usbg_create,
+ .close = p9_usbg_close,
+ .request = p9_usbg_request,
+ .cancel = p9_usbg_cancel,
+ .supports_vmalloc = false,
+ .owner = THIS_MODULE,
+};
+
+/*-------------------------------------------------------------------------*/
+
+#define USB_PROTOCOL_9PFS 0x09
+
+/*
+ * Vendor-specific interface with two endpoints. bInterfaceNumber and
+ * iInterface are filled in by usb9pfs_func_bind().
+ */
+static struct usb_interface_descriptor usb9pfs_intf = {
+ .bLength = sizeof(usb9pfs_intf),
+ .bDescriptorType = USB_DT_INTERFACE,
+
+ .bNumEndpoints = 2,
+ .bInterfaceClass = USB_CLASS_VENDOR_SPEC,
+ .bInterfaceSubClass = USB_SUBCLASS_VENDOR_SPEC,
+ .bInterfaceProtocol = USB_PROTOCOL_9PFS,
+
+ /* .iInterface = DYNAMIC */
+};
+
+/* full speed support: */
+
+/*
+ * Full-speed bulk IN endpoint. usb9pfs_func_bind() hands this descriptor to
+ * usb_ep_autoconfig() and afterwards copies its bEndpointAddress into the
+ * high- and super-speed variants.
+ */
+static struct usb_endpoint_descriptor fs_usb9pfs_source_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+
+ .bEndpointAddress = USB_DIR_IN,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+};
+
+/* Full-speed bulk OUT endpoint, handled like the IN endpoint in bind. */
+static struct usb_endpoint_descriptor fs_usb9pfs_sink_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+
+ .bEndpointAddress = USB_DIR_OUT,
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+};
+
+/*
+ * NULL-terminated full-speed descriptor list.
+ * NOTE(review): sink is listed before source here, while the high- and
+ * super-speed lists use source first - confirm the order is irrelevant.
+ */
+static struct usb_descriptor_header *fs_usb9pfs_descs[] = {
+ (struct usb_descriptor_header *)&usb9pfs_intf,
+ (struct usb_descriptor_header *)&fs_usb9pfs_sink_desc,
+ (struct usb_descriptor_header *)&fs_usb9pfs_source_desc,
+ NULL,
+};
+
+/* high speed support: */
+
+/*
+ * High-speed bulk endpoints with 512-byte packets. bEndpointAddress is
+ * copied from the full-speed descriptors in usb9pfs_func_bind().
+ */
+static struct usb_endpoint_descriptor hs_usb9pfs_source_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = cpu_to_le16(512),
+};
+
+static struct usb_endpoint_descriptor hs_usb9pfs_sink_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = cpu_to_le16(512),
+};
+
+/* NULL-terminated high-speed descriptor list */
+static struct usb_descriptor_header *hs_usb9pfs_descs[] = {
+ (struct usb_descriptor_header *)&usb9pfs_intf,
+ (struct usb_descriptor_header *)&hs_usb9pfs_source_desc,
+ (struct usb_descriptor_header *)&hs_usb9pfs_sink_desc,
+ NULL,
+};
+
+/* super speed support: */
+
+/*
+ * Super-speed bulk endpoints with 1024-byte packets, each followed by its
+ * endpoint companion descriptor. bEndpointAddress is copied from the
+ * full-speed descriptors in usb9pfs_func_bind().
+ */
+static struct usb_endpoint_descriptor ss_usb9pfs_source_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = cpu_to_le16(1024),
+};
+
+static struct usb_ss_ep_comp_descriptor ss_usb9pfs_source_comp_desc = {
+ .bLength = USB_DT_SS_EP_COMP_SIZE,
+ .bDescriptorType = USB_DT_SS_ENDPOINT_COMP,
+ .bMaxBurst = 0,
+ .bmAttributes = 0,
+ .wBytesPerInterval = 0,
+};
+
+static struct usb_endpoint_descriptor ss_usb9pfs_sink_desc = {
+ .bLength = USB_DT_ENDPOINT_SIZE,
+ .bDescriptorType = USB_DT_ENDPOINT,
+
+ .bmAttributes = USB_ENDPOINT_XFER_BULK,
+ .wMaxPacketSize = cpu_to_le16(1024),
+};
+
+static struct usb_ss_ep_comp_descriptor ss_usb9pfs_sink_comp_desc = {
+ .bLength = USB_DT_SS_EP_COMP_SIZE,
+ .bDescriptorType = USB_DT_SS_ENDPOINT_COMP,
+ .bMaxBurst = 0,
+ .bmAttributes = 0,
+ .wBytesPerInterval = 0,
+};
+
+/* NULL-terminated super-speed descriptor list (also passed as the SS+ list) */
+static struct usb_descriptor_header *ss_usb9pfs_descs[] = {
+ (struct usb_descriptor_header *)&usb9pfs_intf,
+ (struct usb_descriptor_header *)&ss_usb9pfs_source_desc,
+ (struct usb_descriptor_header *)&ss_usb9pfs_source_comp_desc,
+ (struct usb_descriptor_header *)&ss_usb9pfs_sink_desc,
+ (struct usb_descriptor_header *)&ss_usb9pfs_sink_comp_desc,
+ NULL,
+};
+
+/* function-specific strings: */
+/* String 0 is the interface name; its id is assigned in usb9pfs_func_bind(). */
+static struct usb_string strings_usb9pfs[] = {
+ [0].s = "usb9pfs input to output",
+ { } /* end of list */
+};
+
+static struct usb_gadget_strings stringtab_usb9pfs = {
+ .language = 0x0409, /* en-us */
+ .strings = strings_usb9pfs,
+};
+
+/* NULL-terminated string table list installed as function.strings */
+static struct usb_gadget_strings *usb9pfs_strings[] = {
+ &stringtab_usb9pfs,
+ NULL,
+};
+
+/*-------------------------------------------------------------------------*/
+
+/*
+ * Function "bind" callback: allocate the interface and string ids, pick
+ * the two bulk endpoints, assign the descriptors for all speeds and
+ * publish this function in its instance's f_usb9pfs_dev so that
+ * p9_usbg_create() can find it.
+ *
+ * Returns 0 on success, a negative id/descriptor error, or -ENODEV when
+ * no suitable endpoint could be autoconfigured.
+ */
+static int usb9pfs_func_bind(struct usb_configuration *c,
+ struct usb_function *f)
+{
+ struct f_usb9pfs *usb9pfs = func_to_usb9pfs(f);
+ struct f_usb9pfs_opts *opts;
+ struct usb_composite_dev *cdev = c->cdev;
+ int ret;
+ int id;
+
+ /* allocate interface ID(s) */
+ id = usb_interface_id(c, f);
+ if (id < 0)
+ return id;
+ usb9pfs_intf.bInterfaceNumber = id;
+
+ id = usb_string_id(cdev);
+ if (id < 0)
+ return id;
+ strings_usb9pfs[0].id = id;
+ usb9pfs_intf.iInterface = id;
+
+ /* allocate endpoints */
+ usb9pfs->in_ep = usb_ep_autoconfig(cdev->gadget,
+ &fs_usb9pfs_source_desc);
+ if (!usb9pfs->in_ep)
+ goto autoconf_fail;
+
+ usb9pfs->out_ep = usb_ep_autoconfig(cdev->gadget,
+ &fs_usb9pfs_sink_desc);
+ if (!usb9pfs->out_ep)
+ goto autoconf_fail;
+
+ /* support high speed hardware */
+ hs_usb9pfs_source_desc.bEndpointAddress =
+ fs_usb9pfs_source_desc.bEndpointAddress;
+ hs_usb9pfs_sink_desc.bEndpointAddress =
+ fs_usb9pfs_sink_desc.bEndpointAddress;
+
+ /* support super speed hardware */
+ ss_usb9pfs_source_desc.bEndpointAddress =
+ fs_usb9pfs_source_desc.bEndpointAddress;
+ ss_usb9pfs_sink_desc.bEndpointAddress =
+ fs_usb9pfs_sink_desc.bEndpointAddress;
+
+ /* the super-speed list is used for both the SS and SS+ slots */
+ ret = usb_assign_descriptors(f, fs_usb9pfs_descs, hs_usb9pfs_descs,
+ ss_usb9pfs_descs, ss_usb9pfs_descs);
+ if (ret)
+ return ret;
+
+ /* make this function reachable from the tag list entry */
+ opts = container_of(f->fi, struct f_usb9pfs_opts, func_inst);
+ opts->dev->usb9pfs = usb9pfs;
+
+ dev_dbg(&cdev->gadget->dev, "%s speed %s: IN/%s, OUT/%s\n",
+ (gadget_is_superspeed(c->cdev->gadget) ? "super" :
+ (gadget_is_dualspeed(c->cdev->gadget) ? "dual" : "full")),
+ f->name, usb9pfs->in_ep->name, usb9pfs->out_ep->name);
+
+ return 0;
+
+autoconf_fail:
+ ERROR(cdev, "%s: can't autoconfigure on %s\n",
+ f->name, cdev->gadget->name);
+ return -ENODEV;
+}
+
+/* Function "unbind" callback: tear down requests and endpoints. */
+static void usb9pfs_func_unbind(struct usb_configuration *c,
+ struct usb_function *f)
+{
+ struct f_usb9pfs *usb9pfs = func_to_usb9pfs(f);
+
+ disable_usb9pfs(usb9pfs);
+}
+
+/*
+ * Function "free_func" callback: drop the instance reference taken in
+ * usb9pfs_alloc(), release the descriptors and free the function state.
+ *
+ * @f is embedded in the f_usb9pfs object, so that object must be freed
+ * last: both f->fi and usb_free_all_descriptors(f) read from it.
+ */
+static void usb9pfs_free_func(struct usb_function *f)
+{
+	struct f_usb9pfs *usb9pfs = func_to_usb9pfs(f);
+	struct f_usb9pfs_opts *opts;
+
+	opts = container_of(f->fi, struct f_usb9pfs_opts, func_inst);
+
+	mutex_lock(&opts->lock);
+	opts->refcnt--;
+	mutex_unlock(&opts->lock);
+
+	usb_free_all_descriptors(f);
+
+	/* frees the memory @f lives in; @f must not be touched afterwards */
+	kfree(usb9pfs);
+}
+
+/*
+ * Function "set_alt" callback: (re)enable the function. @intf and @alt are
+ * ignored. Returns the result of enable_usb9pfs().
+ */
+static int usb9pfs_set_alt(struct usb_function *f,
+ unsigned int intf, unsigned int alt)
+{
+ struct f_usb9pfs *usb9pfs = func_to_usb9pfs(f);
+ struct usb_composite_dev *cdev = f->config->cdev;
+
+ return enable_usb9pfs(cdev, usb9pfs);
+}
+
+/*
+ * Function "disable" callback: fail the 9p request whose transmit is
+ * pending, if any. Endpoints and USB requests are left untouched here.
+ */
+static void usb9pfs_disable(struct usb_function *f)
+{
+ struct f_usb9pfs *usb9pfs = func_to_usb9pfs(f);
+
+ usb9pfs_clear_tx(usb9pfs);
+}
+
+/*
+ * Allocate and initialise the function state for instance @fi.
+ *
+ * Takes a reference on the instance options (refcnt), which blocks buflen
+ * changes until usb9pfs_free_func() drops it again.
+ *
+ * Returns the embedded usb_function or ERR_PTR(-ENOMEM).
+ */
+static struct usb_function *usb9pfs_alloc(struct usb_function_instance *fi)
+{
+ struct f_usb9pfs_opts *usb9pfs_opts;
+ struct f_usb9pfs *usb9pfs;
+
+ usb9pfs = kzalloc(sizeof(*usb9pfs), GFP_KERNEL);
+ if (!usb9pfs)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&usb9pfs->lock);
+
+ init_completion(&usb9pfs->send);
+ init_completion(&usb9pfs->received);
+
+ usb9pfs_opts = container_of(fi, struct f_usb9pfs_opts, func_inst);
+
+ mutex_lock(&usb9pfs_opts->lock);
+ usb9pfs_opts->refcnt++;
+ mutex_unlock(&usb9pfs_opts->lock);
+
+ /* read after refcnt was raised, so the store handler can no longer change it */
+ usb9pfs->buflen = usb9pfs_opts->buflen;
+
+ usb9pfs->function.name = "usb9pfs";
+ usb9pfs->function.bind = usb9pfs_func_bind;
+ usb9pfs->function.unbind = usb9pfs_func_unbind;
+ usb9pfs->function.set_alt = usb9pfs_set_alt;
+ usb9pfs->function.disable = usb9pfs_disable;
+ usb9pfs->function.strings = usb9pfs_strings;
+
+ usb9pfs->function.free_func = usb9pfs_free_func;
+
+ return &usb9pfs->function;
+}
+
+/* Map a configfs item to the f_usb9pfs_opts whose func_inst.group contains it. */
+static inline struct f_usb9pfs_opts *to_f_usb9pfs_opts(struct config_item *item)
+{
+ return container_of(to_config_group(item), struct f_usb9pfs_opts,
+ func_inst.group);
+}
+
+/* Map a function instance to the f_usb9pfs_opts embedding it. */
+static inline struct f_usb9pfs_opts *fi_to_f_usb9pfs_opts(struct usb_function_instance *fi)
+{
+ return container_of(fi, struct f_usb9pfs_opts, func_inst);
+}
+
+/* configfs release callback: drop the reference on the function instance. */
+static void usb9pfs_attr_release(struct config_item *item)
+{
+ struct f_usb9pfs_opts *usb9pfs_opts = to_f_usb9pfs_opts(item);
+
+ usb_put_function_instance(&usb9pfs_opts->func_inst);
+}
+
+/* item operations used by usb9pfs_func_type */
+static struct configfs_item_operations usb9pfs_item_ops = {
+ .release = usb9pfs_attr_release,
+};
+
+/*
+ * configfs read handler of the "buflen" attribute.
+ * Emits the current buffer length as an unsigned decimal number followed
+ * by a newline and returns the number of bytes written to @page.
+ */
+static ssize_t f_usb9pfs_opts_buflen_show(struct config_item *item, char *page)
+{
+	struct f_usb9pfs_opts *opts = to_f_usb9pfs_opts(item);
+	int ret;
+
+	mutex_lock(&opts->lock);
+	/* buflen is an unsigned int (stored from a u32): print it with %u */
+	ret = sysfs_emit(page, "%u\n", opts->buflen);
+	mutex_unlock(&opts->lock);
+
+	return ret;
+}
+
+/*
+ * configfs write handler of the "buflen" attribute.
+ *
+ * Rejects the write with -EBUSY while a function allocated from this
+ * instance exists (refcnt != 0); otherwise parses @page as a u32 and
+ * stores it. Returns @len on success or the kstrtou32() error.
+ */
+static ssize_t f_usb9pfs_opts_buflen_store(struct config_item *item,
+					   const char *page, size_t len)
+{
+	struct f_usb9pfs_opts *opts = to_f_usb9pfs_opts(item);
+	u32 value;
+	int err;
+
+	guard(mutex)(&opts->lock);
+
+	if (opts->refcnt != 0)
+		return -EBUSY;
+
+	err = kstrtou32(page, 0, &value);
+	if (err != 0)
+		return err;
+
+	opts->buflen = value;
+	return len;
+}
+
+/* defines f_usb9pfs_opts_attr_buflen from the _show/_store handlers above */
+CONFIGFS_ATTR(f_usb9pfs_opts_, buflen);
+
+/* NULL-terminated attribute list of a usb9pfs function instance */
+static struct configfs_attribute *usb9pfs_attrs[] = {
+ &f_usb9pfs_opts_attr_buflen,
+ NULL,
+};
+
+/* configfs item type used for the instance group in usb9pfs_alloc_instance() */
+static const struct config_item_type usb9pfs_func_type = {
+ .ct_item_ops = &usb9pfs_item_ops,
+ .ct_attrs = usb9pfs_attrs,
+ .ct_owner = THIS_MODULE,
+};
+
+/*
+ * Return the list entry whose tag equals @tag exactly, or NULL if @tag is
+ * NULL or no entry matches. The list is walked without taking any lock
+ * here.
+ */
+static struct f_usb9pfs_dev *_usb9pfs_do_find_dev(const char *tag)
+{
+	struct f_usb9pfs_dev *entry;
+
+	if (tag) {
+		list_for_each_entry(entry, &usbg_instance_list, usb9pfs_instance) {
+			if (!strcmp(entry->tag, tag))
+				return entry;
+		}
+	}
+
+	return NULL;
+}
+
+/*
+ * Assign @tag to @dev unless another instance already uses it.
+ *
+ * Returns 0 if the tag was stored or @dev already owns it, -EBUSY if the
+ * tag belongs to a different instance.
+ */
+static int usb9pfs_tag_instance(struct f_usb9pfs_dev *dev, const char *tag)
+{
+	struct f_usb9pfs_dev *owner;
+
+	guard(mutex)(&usb9pfs_lock);
+
+	owner = _usb9pfs_do_find_dev(tag);
+	if (owner)
+		return owner == dev ? 0 : -EBUSY;
+
+	strscpy(dev->tag, tag, ARRAY_SIZE(dev->tag));
+	return 0;
+}
+
+/*
+ * set_inst_name callback: use the instance name as its mount tag.
+ * Returns -ENAMETOOLONG if @tag does not fit into f_usb9pfs_dev::tag
+ * including the terminating NUL, otherwise the usb9pfs_tag_instance()
+ * result.
+ */
+static int usb9pfs_set_inst_tag(struct usb_function_instance *fi, const char *tag)
+{
+	struct f_usb9pfs_opts *opts = fi_to_f_usb9pfs_opts(fi);
+
+	if (strlen(tag) >= sizeof_field(struct f_usb9pfs_dev, tag))
+		return -ENAMETOOLONG;
+
+	return usb9pfs_tag_instance(opts->dev, tag);
+}
+
+/*
+ * free_func_inst callback: unlink the instance from the global list and
+ * free everything usb9pfs_alloc_instance() allocated, i.e. both the
+ * options object and its f_usb9pfs_dev.
+ */
+static void usb9pfs_free_instance(struct usb_function_instance *fi)
+{
+	struct f_usb9pfs_opts *usb9pfs_opts =
+		container_of(fi, struct f_usb9pfs_opts, func_inst);
+	struct f_usb9pfs_dev *dev = usb9pfs_opts->dev;
+
+	mutex_lock(&usb9pfs_lock);
+	list_del(&dev->usb9pfs_instance);
+	mutex_unlock(&usb9pfs_lock);
+
+	/* dev was kzalloc()ed alongside the options and has no other owner */
+	kfree(dev);
+	kfree(usb9pfs_opts);
+}
+
+/*
+ * Allocate a function instance: the options object with default buflen
+ * plus its f_usb9pfs_dev, which is appended to usbg_instance_list.
+ *
+ * Returns the embedded usb_function_instance or ERR_PTR(-ENOMEM).
+ */
+static struct usb_function_instance *usb9pfs_alloc_instance(void)
+{
+ struct f_usb9pfs_opts *usb9pfs_opts;
+ struct f_usb9pfs_dev *dev;
+
+ usb9pfs_opts = kzalloc(sizeof(*usb9pfs_opts), GFP_KERNEL);
+ if (!usb9pfs_opts)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&usb9pfs_opts->lock);
+
+ usb9pfs_opts->func_inst.set_inst_name = usb9pfs_set_inst_tag;
+ usb9pfs_opts->func_inst.free_func_inst = usb9pfs_free_instance;
+
+ usb9pfs_opts->buflen = DEFAULT_BUFLEN;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev) {
+ kfree(usb9pfs_opts);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* cross-link options and list entry */
+ usb9pfs_opts->dev = dev;
+ dev->opts = usb9pfs_opts;
+
+ config_group_init_type_name(&usb9pfs_opts->func_inst.group, "",
+ &usb9pfs_func_type);
+
+ mutex_lock(&usb9pfs_lock);
+ list_add_tail(&dev->usb9pfs_instance, &usbg_instance_list);
+ mutex_unlock(&usb9pfs_lock);
+
+ return &usb9pfs_opts->func_inst;
+}
+DECLARE_USB_FUNCTION(usb9pfs, usb9pfs_alloc_instance, usb9pfs_alloc);
+
+/*
+ * Module init: set up the instance list, register the USB function and,
+ * only if that worked, register the 9p transport.
+ * Returns the usb_function_register() result.
+ */
+static int __init usb9pfs_modinit(void)
+{
+	int err;
+
+	INIT_LIST_HEAD(&usbg_instance_list);
+
+	err = usb_function_register(&usb9pfsusb_func);
+	if (err)
+		return err;
+
+	v9fs_register_trans(&p9_usbg_trans);
+	return 0;
+}
+
+/* Module exit: unregister the USB function, then the 9p transport. */
+static void __exit usb9pfs_modexit(void)
+{
+ usb_function_unregister(&usb9pfsusb_func);
+ v9fs_unregister_trans(&p9_usbg_trans);
+}
+
+module_init(usb9pfs_modinit);
+module_exit(usb9pfs_modexit);
+
+MODULE_ALIAS_9P("usbg");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("USB gadget 9pfs transport");
+MODULE_AUTHOR("Michael Grzeschik");
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 7728b0acde09..10c2dd486438 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* The Virtio 9p transport driver
*
@@ -8,22 +9,6 @@
*
* Based on virtio console driver
* Copyright (C) 2006, 2007 Rusty Russell, IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to:
- * Free Software Foundation
- * 51 Franklin Street, Fifth Floor
- * Boston, MA 02111-1301 USA
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -37,12 +22,11 @@
#include <linux/un.h>
#include <linux/uaccess.h>
#include <linux/inet.h>
-#include <linux/idr.h>
#include <linux/file.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <net/9p/9p.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
#include <net/9p/client.h>
#include <net/9p/transport.h>
#include <linux/scatterlist.h>
@@ -65,7 +49,11 @@ static atomic_t vp_pinned = ATOMIC_INIT(0);
* @client: client instance
* @vdev: virtio dev associated with this channel
* @vq: virtio queue associated with this channel
+ * @ring_bufs_avail: flag to indicate there is some available in the ring buf
+ * @vc_wq: wait queue for waiting for thing to be added to ring buf
+ * @p9_max_pages: maximum number of pinned pages
* @sg: scatter gather list which is used to pack a request (protected?)
+ * @chan_list: linked list of channels
*
* We keep all per-channel information in a structure.
* This structure is allocated within the devices dev->mem space.
@@ -89,8 +77,8 @@ struct virtio_chan {
unsigned long p9_max_pages;
/* Scatterlist: can be too big for stack. */
struct scatterlist sg[VIRTQUEUE_NUM];
- /*
- * tag name to identify a mount null terminated
+ /**
+ * @tag: name to identify a mount null terminated
*/
char *tag;
@@ -110,7 +98,7 @@ static unsigned int rest_of_page(void *data)
* @client: client instance
*
* This reclaims a channel by freeing its resources and
- * reseting its inuse flag.
+ * resetting its inuse flag.
*
*/
@@ -155,7 +143,7 @@ static void req_done(struct virtqueue *vq)
}
if (len) {
- req->rc->size = len;
+ req->rc.size = len;
p9_client_cb(chan->client, req, REQ_STATUS_RCVD);
}
}
@@ -207,11 +195,19 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req)
return 1;
}
+/*
+ * Reply won't come, so drop req ref.
+ * Always returns 0.
+ */
+static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req)
+{
+ p9_req_put(client, req);
+ return 0;
+}
+
/**
* pack_sg_list_p - Just like pack_sg_list. Instead of taking a buffer,
* this takes a list of pages.
* @sg: scatter/gather list to pack into
* @start: which segment of the sg_list to start at
+ * @limit: maximum number of pages in sg list.
* @pdata: a list of pages to add into sg.
* @nr_pages: number of pages to pack into the scatter/gather list
* @offs: amount of data in the beginning of first page _not_ to pack
@@ -266,19 +262,19 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n");
- req->status = REQ_STATUS_SENT;
+ WRITE_ONCE(req->status, REQ_STATUS_SENT);
req_retry:
spin_lock_irqsave(&chan->lock, flags);
out_sgs = in_sgs = 0;
/* Handle out VirtIO ring buffers */
out = pack_sg_list(chan->sg, 0,
- VIRTQUEUE_NUM, req->tc->sdata, req->tc->size);
+ VIRTQUEUE_NUM, req->tc.sdata, req->tc.size);
if (out)
sgs[out_sgs++] = chan->sg;
in = pack_sg_list(chan->sg, out,
- VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity);
+ VIRTQUEUE_NUM, req->rc.sdata, req->rc.capacity);
if (in)
sgs[out_sgs + in_sgs++] = chan->sg + out;
@@ -322,7 +318,7 @@ static int p9_get_mapped_pages(struct virtio_chan *chan,
if (!iov_iter_count(data))
return 0;
- if (!(data->type & ITER_KVEC)) {
+ if (!iov_iter_is_kvec(data)) {
int n;
/*
* We allow only p9_max_pages pinned. We wait for the
@@ -334,7 +330,7 @@ static int p9_get_mapped_pages(struct virtio_chan *chan,
if (err == -ERESTARTSYS)
return err;
}
- n = iov_iter_get_pages_alloc(data, pages, count, offs);
+ n = iov_iter_get_pages_alloc2(data, pages, count, offs);
if (n < 0)
return n;
*need_drop = 1;
@@ -376,10 +372,40 @@ static int p9_get_mapped_pages(struct virtio_chan *chan,
(*pages)[index] = kmap_to_page(p);
p += PAGE_SIZE;
}
+ iov_iter_advance(data, len);
return len;
}
}
+static void handle_rerror(struct p9_req_t *req, int in_hdr_len,
+ size_t offs, struct page **pages)
+{
+ unsigned size, n;
+ void *to = req->rc.sdata + in_hdr_len;
+
+ // Fits entirely into the static data? Nothing to do.
+ if (req->rc.size < in_hdr_len || !pages)
+ return;
+
+ // Really long error message? Tough, truncate the reply. Might get
+ // rejected (we can't be arsed to adjust the size encoded in header,
+ // or string size for that matter), but it wouldn't be anything valid
+ // anyway.
+ if (unlikely(req->rc.size > P9_ZC_HDR_SZ))
+ req->rc.size = P9_ZC_HDR_SZ;
+
+ // data won't span more than two pages
+ size = req->rc.size - in_hdr_len;
+ n = PAGE_SIZE - offs;
+ if (size > n) {
+ memcpy_from_page(to, *pages++, offs, n);
+ offs = 0;
+ to += n;
+ size -= n;
+ }
+ memcpy_from_page(to, *pages, offs, size);
+}
+
/**
* p9_virtio_zc_request - issue a zero copy request
* @client: client instance issuing the request
@@ -402,8 +428,9 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
struct page **in_pages = NULL, **out_pages = NULL;
struct virtio_chan *chan = client->trans;
struct scatterlist *sgs[4];
- size_t offs;
+ size_t offs = 0;
int need_drop = 0;
+ int kicked = 0;
p9_debug(P9_DEBUG_TRANS, "virtio request\n");
@@ -411,33 +438,37 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
__le32 sz;
int n = p9_get_mapped_pages(chan, &out_pages, uodata,
outlen, &offs, &need_drop);
- if (n < 0)
- return n;
+ if (n < 0) {
+ err = n;
+ goto err_out;
+ }
out_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE);
if (n != outlen) {
__le32 v = cpu_to_le32(n);
- memcpy(&req->tc->sdata[req->tc->size - 4], &v, 4);
+ memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4);
outlen = n;
}
/* The size field of the message must include the length of the
* header and the length of the data. We didn't actually know
* the length of the data until this point so add it in now.
*/
- sz = cpu_to_le32(req->tc->size + outlen);
- memcpy(&req->tc->sdata[0], &sz, sizeof(sz));
+ sz = cpu_to_le32(req->tc.size + outlen);
+ memcpy(&req->tc.sdata[0], &sz, sizeof(sz));
} else if (uidata) {
int n = p9_get_mapped_pages(chan, &in_pages, uidata,
inlen, &offs, &need_drop);
- if (n < 0)
- return n;
+ if (n < 0) {
+ err = n;
+ goto err_out;
+ }
in_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE);
if (n != inlen) {
__le32 v = cpu_to_le32(n);
- memcpy(&req->tc->sdata[req->tc->size - 4], &v, 4);
+ memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4);
inlen = n;
}
}
- req->status = REQ_STATUS_SENT;
+ WRITE_ONCE(req->status, REQ_STATUS_SENT);
req_retry_pinned:
spin_lock_irqsave(&chan->lock, flags);
@@ -445,7 +476,7 @@ req_retry_pinned:
/* out data */
out = pack_sg_list(chan->sg, 0,
- VIRTQUEUE_NUM, req->tc->sdata, req->tc->size);
+ VIRTQUEUE_NUM, req->tc.sdata, req->tc.size);
if (out)
sgs[out_sgs++] = chan->sg;
@@ -461,17 +492,17 @@ req_retry_pinned:
* For example TREAD have 11.
* 11 is the read/write header = PDU Header(7) + IO Size (4).
* Arrange in such a way that server places header in the
- * alloced memory and payload onto the user buffer.
+ * allocated memory and payload onto the user buffer.
*/
in = pack_sg_list(chan->sg, out,
- VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len);
+ VIRTQUEUE_NUM, req->rc.sdata, in_hdr_len);
if (in)
sgs[out_sgs + in_sgs++] = chan->sg + out;
if (in_pages) {
sgs[out_sgs + in_sgs++] = chan->sg + out + in;
- in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM,
- in_pages, in_nr_pages, offs, inlen);
+ pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM,
+ in_pages, in_nr_pages, offs, inlen);
}
BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs));
@@ -498,8 +529,15 @@ req_retry_pinned:
}
virtqueue_kick(chan->vq);
spin_unlock_irqrestore(&chan->lock, flags);
+ kicked = 1;
p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n");
- err = wait_event_killable(req->wq, req->status >= REQ_STATUS_RCVD);
+ err = wait_event_killable(req->wq,
+ READ_ONCE(req->status) >= REQ_STATUS_RCVD);
+ // RERROR needs reply (== error string) in static data
+ if (READ_ONCE(req->status) == REQ_STATUS_RCVD &&
+ unlikely(req->rc.sdata[4] == P9_RERROR))
+ handle_rerror(req, in_hdr_len, offs, in_pages);
+
/*
* Non kernel buffers are pinned, unpin them
*/
@@ -518,6 +556,10 @@ err_out:
}
kvfree(in_pages);
kvfree(out_pages);
+ if (!kicked) {
+ /* reply won't come */
+ p9_req_put(client, req);
+ }
return err;
}
@@ -603,7 +645,7 @@ static int p9_virtio_probe(struct virtio_device *vdev)
chan->vc_wq = kmalloc(sizeof(wait_queue_head_t), GFP_KERNEL);
if (!chan->vc_wq) {
err = -ENOMEM;
- goto out_free_tag;
+ goto out_remove_file;
}
init_waitqueue_head(chan->vc_wq);
chan->ring_bufs_avail = 1;
@@ -621,6 +663,8 @@ static int p9_virtio_probe(struct virtio_device *vdev)
return 0;
+out_remove_file:
+ sysfs_remove_file(&vdev->dev.kobj, &dev_attr_mount_tag.attr);
out_free_tag:
kfree(tag);
out_free_vq:
@@ -635,11 +679,10 @@ fail:
/**
* p9_virtio_create - allocate a new virtio channel
* @client: client instance invoking this transport
- * @devname: string identifying the channel to connect to (unused)
- * @args: args passed from sys_mount() for per-transport options (unused)
+ * @fc: the filesystem context
*
* This sets up a transport channel for 9p communication. Right now
- * we only match the first available channel, but eventually we couldlook up
+ * we only match the first available channel, but eventually we could look up
* alternate channels by matching devname versus a virtio_config entry.
* We use a simple reference count mechanism to ensure that only a single
* mount has a channel open at a time.
@@ -647,8 +690,9 @@ fail:
*/
static int
-p9_virtio_create(struct p9_client *client, const char *devname, char *args)
+p9_virtio_create(struct p9_client *client, struct fs_context *fc)
{
+ const char *devname = fc->source;
struct virtio_chan *chan;
int ret = -ENOENT;
int found = 0;
@@ -712,7 +756,7 @@ static void p9_virtio_remove(struct virtio_device *vdev)
mutex_unlock(&virtio_9p_lock);
- vdev->config->reset(vdev);
+ virtio_reset_device(vdev);
vdev->config->del_vqs(vdev);
sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr);
@@ -737,7 +781,6 @@ static struct virtio_driver p9_virtio_drv = {
.feature_table = features,
.feature_table_size = ARRAY_SIZE(features),
.driver.name = KBUILD_MODNAME,
- .driver.owner = THIS_MODULE,
.id_table = id_table,
.probe = p9_virtio_probe,
.remove = p9_virtio_remove,
@@ -750,24 +793,33 @@ static struct p9_trans_module p9_virtio_trans = {
.request = p9_virtio_request,
.zc_request = p9_virtio_zc_request,
.cancel = p9_virtio_cancel,
+ .cancelled = p9_virtio_cancelled,
/*
* We leave one entry for input and one entry for response
- * headers. We also skip one more entry to accomodate, address
+ * headers. We also skip one more entry to accommodate addresses
* that are not at page boundary, that can result in an extra
* page in zero copy.
*/
.maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3),
- .def = 1,
+ .pooled_rbuffers = false,
+ .def = true,
+ .supports_vmalloc = false,
.owner = THIS_MODULE,
};
/* The standard init function */
static int __init p9_virtio_init(void)
{
+ int rc;
+
INIT_LIST_HEAD(&virtio_chan_list);
v9fs_register_trans(&p9_virtio_trans);
- return register_virtio_driver(&p9_virtio_drv);
+ rc = register_virtio_driver(&p9_virtio_drv);
+ if (rc)
+ v9fs_unregister_trans(&p9_virtio_trans);
+
+ return rc;
}
static void __exit p9_virtio_cleanup(void)
@@ -778,6 +830,7 @@ static void __exit p9_virtio_cleanup(void)
module_init(p9_virtio_init);
module_exit(p9_virtio_cleanup);
+MODULE_ALIAS_9P("virtio");
MODULE_DEVICE_TABLE(virtio, id_table);
MODULE_AUTHOR("Eric Van Hensbergen <ericvh@gmail.com>");
diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c
index c2d54ac76bfd..12f752a92332 100644
--- a/net/9p/trans_xen.c
+++ b/net/9p/trans_xen.c
@@ -1,33 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/fs/9p/trans_xen
*
* Xen transport layer.
*
* Copyright (C) 2017 by Stefano Stabellini <stefano@aporeto.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation; or, when distributed
- * separately from the Linux kernel or incorporated into other
- * software packages, subject to the following license:
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this source file (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use, copy, modify,
- * merge, publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
*/
#include <xen/events.h>
@@ -38,13 +15,14 @@
#include <linux/module.h>
#include <linux/spinlock.h>
+#include <linux/fs_context.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
#include <net/9p/transport.h>
#define XEN_9PFS_NUM_RINGS 2
-#define XEN_9PFS_RING_ORDER 6
-#define XEN_9PFS_RING_SIZE XEN_FLEX_RING_SIZE(XEN_9PFS_RING_ORDER)
+#define XEN_9PFS_RING_ORDER 9
+#define XEN_9PFS_RING_SIZE(ring) XEN_FLEX_RING_SIZE(ring->intf->ring_order)
struct xen_9pfs_header {
uint32_t size;
@@ -77,7 +55,6 @@ struct xen_9pfs_front_priv {
char *tag;
struct p9_client *client;
- int num_rings;
struct xen_9pfs_dataring *rings;
};
@@ -90,8 +67,9 @@ static int p9_xen_cancel(struct p9_client *client, struct p9_req_t *req)
return 1;
}
-static int p9_xen_create(struct p9_client *client, const char *addr, char *args)
+static int p9_xen_create(struct p9_client *client, struct fs_context *fc)
{
+ const char *addr = fc->source;
struct xen_9pfs_front_priv *priv;
if (addr == NULL)
@@ -132,16 +110,16 @@ static bool p9_xen_write_todo(struct xen_9pfs_dataring *ring, RING_IDX size)
prod = ring->intf->out_prod;
virt_mb();
- return XEN_9PFS_RING_SIZE -
- xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE) >= size;
+ return XEN_9PFS_RING_SIZE(ring) -
+ xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE(ring)) >= size;
}
static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req)
{
- struct xen_9pfs_front_priv *priv = NULL;
+ struct xen_9pfs_front_priv *priv;
RING_IDX cons, prod, masked_cons, masked_prod;
unsigned long flags;
- u32 size = p9_req->tc->size;
+ u32 size = p9_req->tc.size;
struct xen_9pfs_dataring *ring;
int num;
@@ -151,10 +129,10 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req)
break;
}
read_unlock(&xen_9pfs_lock);
- if (!priv || priv->client != client)
+ if (list_entry_is_head(priv, &xen_9pfs_devs, list))
return -EINVAL;
- num = p9_req->tc->tag % priv->num_rings;
+ num = p9_req->tc.tag % XEN_9PFS_NUM_RINGS;
ring = &priv->rings[num];
again:
@@ -167,24 +145,26 @@ again:
prod = ring->intf->out_prod;
virt_mb();
- if (XEN_9PFS_RING_SIZE - xen_9pfs_queued(prod, cons,
- XEN_9PFS_RING_SIZE) < size) {
+ if (XEN_9PFS_RING_SIZE(ring) -
+ xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE(ring)) < size) {
spin_unlock_irqrestore(&ring->lock, flags);
goto again;
}
- masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE);
- masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE);
+ masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE(ring));
+ masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE(ring));
- xen_9pfs_write_packet(ring->data.out, p9_req->tc->sdata, size,
- &masked_prod, masked_cons, XEN_9PFS_RING_SIZE);
+ xen_9pfs_write_packet(ring->data.out, p9_req->tc.sdata, size,
+ &masked_prod, masked_cons,
+ XEN_9PFS_RING_SIZE(ring));
- p9_req->status = REQ_STATUS_SENT;
+ WRITE_ONCE(p9_req->status, REQ_STATUS_SENT);
virt_wmb(); /* write ring before updating pointer */
prod += size;
ring->intf->out_prod = prod;
spin_unlock_irqrestore(&ring->lock, flags);
notify_remote_via_irq(ring->irq);
+ p9_req_put(client, p9_req);
return 0;
}
@@ -206,19 +186,19 @@ static void p9_xen_response(struct work_struct *work)
prod = ring->intf->in_prod;
virt_rmb();
- if (xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE) <
+ if (xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE(ring)) <
sizeof(h)) {
notify_remote_via_irq(ring->irq);
return;
}
- masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE);
- masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE);
+ masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE(ring));
+ masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE(ring));
/* First, read just the header */
xen_9pfs_read_packet(&h, ring->data.in, sizeof(h),
masked_prod, &masked_cons,
- XEN_9PFS_RING_SIZE);
+ XEN_9PFS_RING_SIZE(ring));
req = p9_tag_lookup(priv->client, h.tag);
if (!req || req->status != REQ_STATUS_SENT) {
@@ -229,15 +209,26 @@ static void p9_xen_response(struct work_struct *work)
continue;
}
- memcpy(req->rc, &h, sizeof(h));
- req->rc->offset = 0;
+ if (h.size > req->rc.capacity) {
+ dev_warn(&priv->dev->dev,
+ "requested packet size too big: %d for tag %d with capacity %zd\n",
+ h.size, h.tag, req->rc.capacity);
+ WRITE_ONCE(req->status, REQ_STATUS_ERROR);
+ goto recv_error;
+ }
+
+ req->rc.size = h.size;
+ req->rc.id = h.id;
+ req->rc.tag = h.tag;
+ req->rc.offset = 0;
- masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE);
+ masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE(ring));
/* Then, read the whole packet (including the header) */
- xen_9pfs_read_packet(req->rc->sdata, ring->data.in, h.size,
+ xen_9pfs_read_packet(req->rc.sdata, ring->data.in, h.size,
masked_prod, &masked_cons,
- XEN_9PFS_RING_SIZE);
+ XEN_9PFS_RING_SIZE(ring));
+recv_error:
virt_mb();
cons += h.size;
ring->intf->in_cons = cons;
@@ -266,8 +257,10 @@ static irqreturn_t xen_9pfs_front_event_handler(int irq, void *r)
static struct p9_trans_module p9_xen_trans = {
.name = "xen",
- .maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT),
- .def = 1,
+ .maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT - 2),
+ .pooled_rbuffers = false,
+ .def = true,
+ .supports_vmalloc = false,
.create = p9_xen_create,
.close = p9_xen_close,
.request = p9_xen_request,
@@ -288,23 +281,29 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv)
list_del(&priv->list);
write_unlock(&xen_9pfs_lock);
- for (i = 0; i < priv->num_rings; i++) {
+ for (i = 0; i < XEN_9PFS_NUM_RINGS; i++) {
+ struct xen_9pfs_dataring *ring = &priv->rings[i];
+
+ cancel_work_sync(&ring->work);
+
if (!priv->rings[i].intf)
break;
if (priv->rings[i].irq > 0)
- unbind_from_irqhandler(priv->rings[i].irq, priv->dev);
+ unbind_from_irqhandler(priv->rings[i].irq, ring);
if (priv->rings[i].data.in) {
- for (j = 0; j < (1 << XEN_9PFS_RING_ORDER); j++) {
+ for (j = 0;
+ j < (1 << priv->rings[i].intf->ring_order);
+ j++) {
grant_ref_t ref;
ref = priv->rings[i].intf->ref[j];
- gnttab_end_foreign_access(ref, 0, 0);
+ gnttab_end_foreign_access(ref, NULL);
}
- free_pages((unsigned long)priv->rings[i].data.in,
- XEN_9PFS_RING_ORDER -
- (PAGE_SHIFT - XEN_PAGE_SHIFT));
+ free_pages_exact(priv->rings[i].data.in,
+ 1UL << (priv->rings[i].intf->ring_order +
+ XEN_PAGE_SHIFT));
}
- gnttab_end_foreign_access(priv->rings[i].ref, 0, 0);
+ gnttab_end_foreign_access(priv->rings[i].ref, NULL);
free_page((unsigned long)priv->rings[i].intf);
}
kfree(priv->rings);
@@ -312,17 +311,17 @@ static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv)
kfree(priv);
}
-static int xen_9pfs_front_remove(struct xenbus_device *dev)
+static void xen_9pfs_front_remove(struct xenbus_device *dev)
{
struct xen_9pfs_front_priv *priv = dev_get_drvdata(&dev->dev);
dev_set_drvdata(&dev->dev, NULL);
xen_9pfs_front_free(priv);
- return 0;
}
static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev,
- struct xen_9pfs_dataring *ring)
+ struct xen_9pfs_dataring *ring,
+ unsigned int order)
{
int i = 0;
int ret = -ENOMEM;
@@ -340,22 +339,22 @@ static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev,
if (ret < 0)
goto out;
ring->ref = ret;
- bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
- XEN_9PFS_RING_ORDER - (PAGE_SHIFT - XEN_PAGE_SHIFT));
+ bytes = alloc_pages_exact(1UL << (order + XEN_PAGE_SHIFT),
+ GFP_KERNEL | __GFP_ZERO);
if (!bytes) {
ret = -ENOMEM;
goto out;
}
- for (; i < (1 << XEN_9PFS_RING_ORDER); i++) {
+ for (; i < (1 << order); i++) {
ret = gnttab_grant_foreign_access(
dev->otherend_id, virt_to_gfn(bytes) + i, 0);
if (ret < 0)
goto out;
ring->intf->ref[i] = ret;
}
- ring->intf->ring_order = XEN_9PFS_RING_ORDER;
+ ring->intf->ring_order = order;
ring->data.in = bytes;
- ring->data.out = bytes + XEN_9PFS_RING_SIZE;
+ ring->data.out = bytes + XEN_FLEX_RING_SIZE(order);
ret = xenbus_alloc_evtchn(dev, &ring->evtchn);
if (ret)
@@ -371,29 +370,32 @@ static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev,
out:
if (bytes) {
for (i--; i >= 0; i--)
- gnttab_end_foreign_access(ring->intf->ref[i], 0, 0);
- free_pages((unsigned long)bytes,
- XEN_9PFS_RING_ORDER -
- (PAGE_SHIFT - XEN_PAGE_SHIFT));
+ gnttab_end_foreign_access(ring->intf->ref[i], NULL);
+ free_pages_exact(bytes, 1UL << (order + XEN_PAGE_SHIFT));
}
- gnttab_end_foreign_access(ring->ref, 0, 0);
+ gnttab_end_foreign_access(ring->ref, NULL);
free_page((unsigned long)ring->intf);
return ret;
}
-static int xen_9pfs_front_probe(struct xenbus_device *dev,
- const struct xenbus_device_id *id)
+static int xen_9pfs_front_init(struct xenbus_device *dev)
{
int ret, i;
struct xenbus_transaction xbt;
- struct xen_9pfs_front_priv *priv = NULL;
- char *versions;
+ struct xen_9pfs_front_priv *priv = dev_get_drvdata(&dev->dev);
+ char *versions, *v;
unsigned int max_rings, max_ring_order, len = 0;
versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len);
- if (!len)
- return -EINVAL;
- if (strcmp(versions, "1")) {
+ if (IS_ERR(versions))
+ return PTR_ERR(versions);
+ for (v = versions; *v; v++) {
+ if (simple_strtoul(v, &v, 10) == 1) {
+ v = NULL;
+ break;
+ }
+ }
+ if (v) {
kfree(versions);
return -EINVAL;
}
@@ -403,25 +405,22 @@ static int xen_9pfs_front_probe(struct xenbus_device *dev,
return -EINVAL;
max_ring_order = xenbus_read_unsigned(dev->otherend,
"max-ring-page-order", 0);
- if (max_ring_order < XEN_9PFS_RING_ORDER)
- return -EINVAL;
-
- priv = kzalloc(sizeof(*priv), GFP_KERNEL);
- if (!priv)
- return -ENOMEM;
+ if (max_ring_order > XEN_9PFS_RING_ORDER)
+ max_ring_order = XEN_9PFS_RING_ORDER;
+ if (p9_xen_trans.maxsize > XEN_FLEX_RING_SIZE(max_ring_order))
+ p9_xen_trans.maxsize = XEN_FLEX_RING_SIZE(max_ring_order) / 2;
- priv->dev = dev;
- priv->num_rings = XEN_9PFS_NUM_RINGS;
- priv->rings = kcalloc(priv->num_rings, sizeof(*priv->rings),
+ priv->rings = kcalloc(XEN_9PFS_NUM_RINGS, sizeof(*priv->rings),
GFP_KERNEL);
if (!priv->rings) {
kfree(priv);
return -ENOMEM;
}
- for (i = 0; i < priv->num_rings; i++) {
+ for (i = 0; i < XEN_9PFS_NUM_RINGS; i++) {
priv->rings[i].priv = priv;
- ret = xen_9pfs_front_alloc_dataring(dev, &priv->rings[i]);
+ ret = xen_9pfs_front_alloc_dataring(dev, &priv->rings[i],
+ max_ring_order);
if (ret < 0)
goto error;
}
@@ -436,20 +435,21 @@ static int xen_9pfs_front_probe(struct xenbus_device *dev,
if (ret)
goto error_xenbus;
ret = xenbus_printf(xbt, dev->nodename, "num-rings", "%u",
- priv->num_rings);
+ XEN_9PFS_NUM_RINGS);
if (ret)
goto error_xenbus;
- for (i = 0; i < priv->num_rings; i++) {
+
+ for (i = 0; i < XEN_9PFS_NUM_RINGS; i++) {
char str[16];
BUILD_BUG_ON(XEN_9PFS_NUM_RINGS > 9);
- sprintf(str, "ring-ref%u", i);
+ sprintf(str, "ring-ref%d", i);
ret = xenbus_printf(xbt, dev->nodename, str, "%d",
priv->rings[i].ref);
if (ret)
goto error_xenbus;
- sprintf(str, "event-channel-%u", i);
+ sprintf(str, "event-channel-%d", i);
ret = xenbus_printf(xbt, dev->nodename, str, "%u",
priv->rings[i].evtchn);
if (ret)
@@ -468,23 +468,36 @@ static int xen_9pfs_front_probe(struct xenbus_device *dev,
goto error;
}
- write_lock(&xen_9pfs_lock);
- list_add_tail(&priv->list, &xen_9pfs_devs);
- write_unlock(&xen_9pfs_lock);
- dev_set_drvdata(&dev->dev, priv);
xenbus_switch_state(dev, XenbusStateInitialised);
-
return 0;
error_xenbus:
xenbus_transaction_end(xbt, 1);
xenbus_dev_fatal(dev, ret, "writing xenstore");
error:
- dev_set_drvdata(&dev->dev, NULL);
xen_9pfs_front_free(priv);
return ret;
}
+static int xen_9pfs_front_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ struct xen_9pfs_front_priv *priv = NULL;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ priv->dev = dev;
+ dev_set_drvdata(&dev->dev, priv);
+
+ write_lock(&xen_9pfs_lock);
+ list_add_tail(&priv->list, &xen_9pfs_devs);
+ write_unlock(&xen_9pfs_lock);
+
+ return 0;
+}
+
static int xen_9pfs_front_resume(struct xenbus_device *dev)
{
dev_warn(&dev->dev, "suspend/resume unsupported\n");
@@ -503,6 +516,10 @@ static void xen_9pfs_front_changed(struct xenbus_device *dev,
break;
case XenbusStateInitWait:
+ if (dev->state != XenbusStateInitialising)
+ break;
+
+ xen_9pfs_front_init(dev);
break;
case XenbusStateConnected:
@@ -512,7 +529,7 @@ static void xen_9pfs_front_changed(struct xenbus_device *dev,
case XenbusStateClosed:
if (dev->state == XenbusStateClosed)
break;
- /* Missed the backend's CLOSING state -- fallthrough */
+ fallthrough; /* Missed the backend's CLOSING state */
case XenbusStateClosing:
xenbus_frontend_closed(dev);
break;
@@ -527,25 +544,33 @@ static struct xenbus_driver xen_9pfs_front_driver = {
.otherend_changed = xen_9pfs_front_changed,
};
-static int p9_trans_xen_init(void)
+static int __init p9_trans_xen_init(void)
{
+ int rc;
+
if (!xen_domain())
return -ENODEV;
pr_info("Initialising Xen transport for 9pfs\n");
v9fs_register_trans(&p9_xen_trans);
- return xenbus_register_frontend(&xen_9pfs_front_driver);
+ rc = xenbus_register_frontend(&xen_9pfs_front_driver);
+ if (rc)
+ v9fs_unregister_trans(&p9_xen_trans);
+
+ return rc;
}
module_init(p9_trans_xen_init);
+MODULE_ALIAS_9P("xen");
-static void p9_trans_xen_exit(void)
+static void __exit p9_trans_xen_exit(void)
{
v9fs_unregister_trans(&p9_xen_trans);
return xenbus_unregister_driver(&xen_9pfs_front_driver);
}
module_exit(p9_trans_xen_exit);
+MODULE_ALIAS("xen:9pfs");
MODULE_AUTHOR("Stefano Stabellini <stefano@aporeto.com>");
MODULE_DESCRIPTION("Xen Transport for 9P");
MODULE_LICENSE("GPL");
diff --git a/net/9p/util.c b/net/9p/util.c
deleted file mode 100644
index 55ad98277e85..000000000000
--- a/net/9p/util.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * net/9p/util.c
- *
- * This file contains some helper functions
- *
- * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net>
- * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com>
- * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to:
- * Free Software Foundation
- * 51 Franklin Street, Fifth Floor
- * Boston, MA 02111-1301 USA
- *
- */
-
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/sched.h>
-#include <linux/parser.h>
-#include <linux/idr.h>
-#include <linux/slab.h>
-#include <net/9p/9p.h>
-
-/**
- * struct p9_idpool - per-connection accounting for tag idpool
- * @lock: protects the pool
- * @pool: idr to allocate tag id from
- *
- */
-
-struct p9_idpool {
- spinlock_t lock;
- struct idr pool;
-};
-
-/**
- * p9_idpool_create - create a new per-connection id pool
- *
- */
-
-struct p9_idpool *p9_idpool_create(void)
-{
- struct p9_idpool *p;
-
- p = kmalloc(sizeof(struct p9_idpool), GFP_KERNEL);
- if (!p)
- return ERR_PTR(-ENOMEM);
-
- spin_lock_init(&p->lock);
- idr_init(&p->pool);
-
- return p;
-}
-EXPORT_SYMBOL(p9_idpool_create);
-
-/**
- * p9_idpool_destroy - create a new per-connection id pool
- * @p: idpool to destroy
- */
-
-void p9_idpool_destroy(struct p9_idpool *p)
-{
- idr_destroy(&p->pool);
- kfree(p);
-}
-EXPORT_SYMBOL(p9_idpool_destroy);
-
-/**
- * p9_idpool_get - allocate numeric id from pool
- * @p: pool to allocate from
- *
- * Bugs: This seems to be an awful generic function, should it be in idr.c with
- * the lock included in struct idr?
- */
-
-int p9_idpool_get(struct p9_idpool *p)
-{
- int i;
- unsigned long flags;
-
- idr_preload(GFP_NOFS);
- spin_lock_irqsave(&p->lock, flags);
-
- /* no need to store exactly p, we just need something non-null */
- i = idr_alloc(&p->pool, p, 0, 0, GFP_NOWAIT);
-
- spin_unlock_irqrestore(&p->lock, flags);
- idr_preload_end();
- if (i < 0)
- return -1;
-
- p9_debug(P9_DEBUG_MUX, " id %d pool %p\n", i, p);
- return i;
-}
-EXPORT_SYMBOL(p9_idpool_get);
-
-/**
- * p9_idpool_put - release numeric id from pool
- * @id: numeric id which is being released
- * @p: pool to release id into
- *
- * Bugs: This seems to be an awful generic function, should it be in idr.c with
- * the lock included in struct idr?
- */
-
-void p9_idpool_put(int id, struct p9_idpool *p)
-{
- unsigned long flags;
-
- p9_debug(P9_DEBUG_MUX, " id %d pool %p\n", id, p);
-
- spin_lock_irqsave(&p->lock, flags);
- idr_remove(&p->pool, id);
- spin_unlock_irqrestore(&p->lock, flags);
-}
-EXPORT_SYMBOL(p9_idpool_put);
-
-/**
- * p9_idpool_check - check if the specified id is available
- * @id: id to check
- * @p: pool to check
- */
-
-int p9_idpool_check(int id, struct p9_idpool *p)
-{
- return idr_find(&p->pool, id) != NULL;
-}
-EXPORT_SYMBOL(p9_idpool_check);
diff --git a/net/Kconfig b/net/Kconfig
index 228dfa382eec..62266eaf0e95 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Network configuration
#
@@ -7,7 +8,7 @@ menuconfig NET
select NLATTR
select GENERIC_NET_UTILS
select BPF
- ---help---
+ help
Unless you really know what you are doing, you should say Y here.
The reason is that some programs need kernel networking support even
when running on a stand-alone machine that isn't connected to any
@@ -51,21 +52,68 @@ config NET_INGRESS
config NET_EGRESS
bool
+config NET_XGRESS
+ select NET_INGRESS
+ select NET_EGRESS
+ bool
+
+config NET_REDIRECT
+ bool
+
+config SKB_DECRYPTED
+ bool
+
+config SKB_EXTENSIONS
+ bool
+
+config NET_DEVMEM
+ def_bool y
+ select GENERIC_ALLOCATOR
+ depends on DMA_SHARED_BUFFER
+ depends on PAGE_POOL
+
+config NET_SHAPER
+ bool
+
+config NET_CRC32C
+ bool
+ select CRC32
+
menu "Networking options"
source "net/packet/Kconfig"
+source "net/psp/Kconfig"
source "net/unix/Kconfig"
source "net/tls/Kconfig"
source "net/xfrm/Kconfig"
source "net/iucv/Kconfig"
source "net/smc/Kconfig"
+source "drivers/dibs/Kconfig"
source "net/xdp/Kconfig"
+config NET_HANDSHAKE
+ bool
+ depends on SUNRPC || NVME_TARGET_TCP || NVME_TCP
+ default y
+
+config NET_HANDSHAKE_KUNIT_TEST
+ tristate "KUnit tests for the handshake upcall mechanism" if !KUNIT_ALL_TESTS
+ default KUNIT_ALL_TESTS
+ depends on KUNIT
+ help
+ This builds the KUnit tests for the handshake upcall mechanism.
+
+ KUnit tests run during boot and output the results to the debug
+ log in TAP format (https://testanything.org/). These tests are only
+ useful for kernel devs running the KUnit test harness and are not
+ for inclusion into a production build.
+
+ For more information on KUnit and unit tests in general, refer
+ to the KUnit documentation in Documentation/dev-tools/kunit/.
+
config INET
bool "TCP/IP networking"
- select CRYPTO
- select CRYPTO_AES
- ---help---
+ help
These are the protocols used on the Internet and on most local
Ethernets. It is highly recommended to say Y here (this will enlarge
your kernel by about 400 KB), since some programs (e.g. the X window
@@ -81,7 +129,7 @@ config INET
"Sysctl support" below, you can change various aspects of the
behavior of the TCP/IP code by writing to the (virtual) files in
/proc/sys/net/ipv4/*; the options are explained in the file
- <file:Documentation/networking/ip-sysctl.txt>.
+ <file:Documentation/networking/ip-sysctl.rst>.
Short answer: say Y.
@@ -89,6 +137,7 @@ if INET
source "net/ipv4/Kconfig"
source "net/ipv6/Kconfig"
source "net/netlabel/Kconfig"
+source "net/mptcp/Kconfig"
endif # if INET
@@ -106,15 +155,16 @@ config NETWORK_PHY_TIMESTAMPING
bool "Timestamping in PHY devices"
select NET_PTP_CLASSIFY
help
- This allows timestamping of network packets by PHYs with
- hardware timestamping capabilities. This option adds some
- overhead in the transmit and receive paths.
+ This allows timestamping of network packets by PHYs (or
+ other MII bus snooping devices) with hardware timestamping
+ capabilities. This option adds some overhead in the transmit
+ and receive paths.
If you are unsure how to answer this question, answer N.
menuconfig NETFILTER
bool "Network packet filtering framework (Netfilter)"
- ---help---
+ help
Netfilter is a framework for filtering and mangling network packets
that pass through your Linux box.
@@ -184,8 +234,8 @@ config BRIDGE_NETFILTER
depends on NETFILTER && INET
depends on NETFILTER_ADVANCED
select NETFILTER_FAMILY_BRIDGE
- default m
- ---help---
+ select SKB_EXTENSIONS
+ help
Enabling this option will let arptables resp. iptables see bridged
ARP resp. IP traffic. If you want a bridging firewall, you probably
want this option enabled.
@@ -197,14 +247,10 @@ config BRIDGE_NETFILTER
source "net/netfilter/Kconfig"
source "net/ipv4/netfilter/Kconfig"
source "net/ipv6/netfilter/Kconfig"
-source "net/decnet/netfilter/Kconfig"
source "net/bridge/netfilter/Kconfig"
-endif
+endif # if NETFILTER
-source "net/bpfilter/Kconfig"
-
-source "net/dccp/Kconfig"
source "net/sctp/Kconfig"
source "net/rds/Kconfig"
source "net/tipc/Kconfig"
@@ -214,9 +260,8 @@ source "net/802/Kconfig"
source "net/bridge/Kconfig"
source "net/dsa/Kconfig"
source "net/8021q/Kconfig"
-source "net/decnet/Kconfig"
source "net/llc/Kconfig"
-source "drivers/net/appletalk/Kconfig"
+source "net/appletalk/Kconfig"
source "net/x25/Kconfig"
source "net/lapb/Kconfig"
source "net/phonet/Kconfig"
@@ -238,30 +283,60 @@ source "net/l3mdev/Kconfig"
source "net/qrtr/Kconfig"
source "net/ncsi/Kconfig"
+config PCPU_DEV_REFCNT
+ bool "Use percpu variables to maintain network device refcount"
+ depends on SMP
+ default y
+ help
+ Network device refcounts use per-CPU variables if this option is set.
+ This can be forced to N to detect underflows (with a performance drop).
+
+config MAX_SKB_FRAGS
+ int "Maximum number of fragments per skb_shared_info"
+ range 17 45
+ default 17
+ help
+ Having more fragments per skb_shared_info can help GRO efficiency.
+ This helps BIG TCP workloads, but might expose bugs in some
+ legacy drivers.
+ This also increases memory overhead of small packets,
+ and in drivers using build_skb().
+ If unsure, say 17.
+
config RPS
- bool
+ bool "Receive packet steering"
depends on SMP && SYSFS
default y
+ help
+ Software receive side packet steering (RPS) distributes the
+ load of received packet processing across multiple CPUs.
config RFS_ACCEL
- bool
+ bool "Hardware acceleration of RFS"
depends on RPS
select CPU_RMAP
default y
+ help
+ Allow drivers for multiqueue hardware with flow filter tables to
+ accelerate RFS.
+
+config SOCK_RX_QUEUE_MAPPING
+ bool
config XPS
bool
depends on SMP
+ select SOCK_RX_QUEUE_MAPPING
default y
config HWBM
- bool
+ bool
config CGROUP_NET_PRIO
bool "Network priority cgroup"
depends on CGROUPS
select SOCK_CGROUP_DATA
- ---help---
+ help
Cgroup subsystem for use in assigning processes to network priorities on
a per-interface basis.
@@ -269,52 +344,37 @@ config CGROUP_NET_CLASSID
bool "Network classid cgroup"
depends on CGROUPS
select SOCK_CGROUP_DATA
- ---help---
+ help
Cgroup subsystem for use as general purpose socket classid marker that is
being used in cls_cgroup and for netfilter matching.
config NET_RX_BUSY_POLL
bool
- default y
+ default y if !PREEMPT_RT || (PREEMPT_RT && !NETCONSOLE)
config BQL
bool
+ prompt "Enable Byte Queue Limits"
depends on SYSFS
select DQL
default y
-config BPF_JIT
- bool "enable BPF Just In Time compiler"
- depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT
- depends on MODULES
- ---help---
- Berkeley Packet Filter filtering capabilities are normally handled
- by an interpreter. This option allows kernel to generate a native
- code when filter is loaded in memory. This should speedup
- packet sniffing (libpcap/tcpdump).
-
- Note, admin should enable this feature changing:
- /proc/sys/net/core/bpf_jit_enable
- /proc/sys/net/core/bpf_jit_harden (optional)
- /proc/sys/net/core/bpf_jit_kallsyms (optional)
-
config BPF_STREAM_PARSER
bool "enable BPF STREAM_PARSER"
+ depends on INET
depends on BPF_SYSCALL
+ depends on CGROUP_BPF
select STREAM_PARSER
- ---help---
- Enabling this allows a stream parser to be used with
- BPF_MAP_TYPE_SOCKMAP.
-
- BPF_MAP_TYPE_SOCKMAP provides a map type to use with network sockets.
- It can be used to enforce socket policy, implement socket redirects,
- etc.
+ select NET_SOCK_MSG
+ help
+ Enabling this allows a TCP stream parser to be used with
+ BPF_MAP_TYPE_SOCKMAP.
config NET_FLOW_LIMIT
- bool
+ bool "Net flow limit"
depends on RPS
default y
- ---help---
+ help
The network stack has to drop packets when a receive processing CPU's
backlog reaches netdev_max_backlog. If a few out of many active flows
generate the vast majority of load, drop their traffic earlier to
@@ -327,32 +387,32 @@ menu "Network testing"
config NET_PKTGEN
tristate "Packet Generator (USE WITH CAUTION)"
depends on INET && PROC_FS
- ---help---
+ help
This module will inject preconfigured packets, at a configurable
rate, out of a given interface. It is used for network interface
stress testing and performance analysis. If you don't understand
what was just said, you don't need it: say N.
Documentation on how to use the packet generator can be found
- at <file:Documentation/networking/pktgen.txt>.
+ at <file:Documentation/networking/pktgen.rst>.
To compile this code as a module, choose M here: the
module will be called pktgen.
config NET_DROP_MONITOR
- tristate "Network packet drop alerting service"
+ tristate "Legacy network packet drop alerting service"
depends on INET && TRACEPOINTS
- ---help---
- This feature provides an alerting service to userspace in the
- event that packets are discarded in the network stack. Alerts
- are broadcast via netlink socket to any listening user space
- process. If you don't need network drop alerts, or if you are ok
- just checking the various proc files and other utilities for
- drop statistics, say N here.
+ help
+ This feature provides an alerting service to userspace in the
+ event that packets are discarded in the network stack. Alerts
+ are broadcast via netlink socket to any listening user space
+ process. This feature is NOT related to "perf" based drop monitoring.
+ Say N here unless you need to support older userspace tools like
+ "dropwatch".
-endmenu
+endmenu # Network testing
-endmenu
+endmenu # Networking options
source "net/ax25/Kconfig"
source "net/can/Kconfig"
@@ -360,6 +420,7 @@ source "net/bluetooth/Kconfig"
source "net/rxrpc/Kconfig"
source "net/kcm/Kconfig"
source "net/strparser/Kconfig"
+source "net/mctp/Kconfig"
config FIB_RULES
bool
@@ -376,8 +437,6 @@ source "net/mac80211/Kconfig"
endif # WIRELESS
-source "net/wimax/Kconfig"
-
source "net/rfkill/Kconfig"
source "net/9p/Kconfig"
source "net/caif/Kconfig"
@@ -388,7 +447,7 @@ source "net/ife/Kconfig"
config LWTUNNEL
bool "Network light weight tunnels"
- ---help---
+ help
This feature provides an infrastructure to support light weight
tunnels like mpls. There is no netdevice associated with a light
weight tunnel endpoint. Tunnel encapsulation parameters are stored
@@ -396,9 +455,9 @@ config LWTUNNEL
config LWTUNNEL_BPF
bool "Execute BPF program as route nexthop action"
- depends on LWTUNNEL
+ depends on LWTUNNEL && INET
default y if LWTUNNEL=y
- ---help---
+ help
Allows to run BPF programs as a nexthop action following a route
lookup for incoming and outgoing packets.
@@ -413,24 +472,40 @@ config GRO_CELLS
config SOCK_VALIDATE_XMIT
bool
-config NET_DEVLINK
- tristate "Network physical/parent device Netlink interface"
- help
- Network physical/parent device Netlink interface provides
- infrastructure to support access to physical chip-wide config and
- monitoring.
-
-config MAY_USE_DEVLINK
- tristate
- default m if NET_DEVLINK=m
- default y if NET_DEVLINK=y || NET_DEVLINK=n
+config NET_IEEE8021Q_HELPERS
+ bool
+
+config NET_SELFTESTS
+ def_tristate PHYLIB
+ depends on PHYLIB && INET
+
+config NET_SOCK_MSG
+ bool
+ default n
help
- Drivers using the devlink infrastructure should have a dependency
- on MAY_USE_DEVLINK to ensure they do not cause link errors when
- devlink is a loadable module and the driver using it is built-in.
+ NET_SOCK_MSG provides a framework for plain sockets (e.g. TCP) or
+ ULPs (upper layer protocols, e.g. TLS) to process L7 application data
+ with the help of BPF programs.
+
+config NET_DEVLINK
+ bool
+ default n
config PAGE_POOL
- bool
+ bool
+
+config PAGE_POOL_STATS
+ default n
+ bool "Page pool stats"
+ depends on PAGE_POOL
+ help
+ Enable page pool statistics to track page allocation and recycling
+ in page pools. This option incurs additional CPU cost in allocation
+ and recycle paths and additional memory cost to store the statistics.
+ These statistics are only available if this option is enabled and if
+ the driver using the page pool supports exporting this data.
+
+ If unsure, say N.
config FAILOVER
tristate "Generic failover module"
@@ -445,16 +520,27 @@ config FAILOVER
migration of VMs with direct attached VFs by failing over to the
paravirtual datapath when the VF is unplugged.
-endif # if NET
-
-# Used by archs to tell that they support BPF JIT compiler plus which flavour.
-# Only one of the two can be selected for a specific arch since eBPF JIT supersedes
-# the cBPF JIT.
+config ETHTOOL_NETLINK
+ bool "Netlink interface for ethtool"
+ select DIMLIB
+ default y
+ help
+ An alternative userspace interface for ethtool based on generic
+ netlink. It provides better extensibility and some new features,
+ e.g. notification messages.
+
+config NETDEV_ADDR_LIST_TEST
+ tristate "Unit tests for device address list"
+ default KUNIT_ALL_TESTS
+ depends on KUNIT
+
+config NET_TEST
+ tristate "KUnit tests for networking" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ KUnit tests covering core networking infra, such as sk_buff.
-# Classic BPF JIT (cBPF)
-config HAVE_CBPF_JIT
- bool
+ If unsure, say N.
-# Extended BPF JIT (eBPF)
-config HAVE_EBPF_JIT
- bool
+endif # if NET
diff --git a/net/Kconfig.debug b/net/Kconfig.debug
new file mode 100644
index 000000000000..277fab8c4d77
--- /dev/null
+++ b/net/Kconfig.debug
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config NET_DEV_REFCNT_TRACKER
+ bool "Enable net device refcount tracking"
+ depends on DEBUG_KERNEL && STACKTRACE_SUPPORT && NET
+ select REF_TRACKER
+ default n
+ help
+ Enable debugging feature to track device references.
+ This adds memory and cpu costs.
+
+config NET_NS_REFCNT_TRACKER
+ bool "Enable networking namespace refcount tracking"
+ depends on DEBUG_KERNEL && STACKTRACE_SUPPORT && NET
+ select REF_TRACKER
+ default n
+ help
+ Enable debugging feature to track netns references.
+ This adds memory and cpu costs.
+
+config DEBUG_NET
+ bool "Add generic networking debug"
+ depends on DEBUG_KERNEL && NET
+ help
+ Enable extra sanity checks in networking.
+ This is mostly used by fuzzers, but is safe to select.
+
+config DEBUG_NET_SMALL_RTNL
+ bool "Add extra per-netns mutex inside RTNL"
+ depends on DEBUG_KERNEL && NET && LOCK_DEBUGGING_SUPPORT
+ select PROVE_LOCKING
+ default n
+ help
+ rtnl_lock() is being replaced with rtnl_net_lock() that
+ acquires the global RTNL and a small per-netns RTNL mutex.
+
+ During the conversion, rtnl_net_lock() just adds an extra
+ mutex in every RTNL scope and slows down the operations.
+
+ Once the conversion completes, rtnl_lock() will be removed
+ and rtnetlink will gain per-netns scalability.
diff --git a/net/Makefile b/net/Makefile
index bdaf53925acd..90e3d72bf58b 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -6,25 +6,25 @@
# Rewritten to use lists instead of if-statements.
#
-obj-$(CONFIG_NET) := socket.o core/
+obj-y := devres.o socket.o core/
-tmp-$(CONFIG_COMPAT) := compat.o
-obj-$(CONFIG_NET) += $(tmp-y)
+obj-$(CONFIG_COMPAT) += compat.o
# LLC has to be linked before the files in net/802/
obj-$(CONFIG_LLC) += llc/
-obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ bpf/
+obj-y += ethernet/ 802/ sched/ netlink/ bpf/ ethtool/
obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_INET) += ipv4/
obj-$(CONFIG_TLS) += tls/
obj-$(CONFIG_XFRM) += xfrm/
obj-$(CONFIG_UNIX) += unix/
-obj-$(CONFIG_NET) += ipv6/
-obj-$(CONFIG_BPFILTER) += bpfilter/
+obj-$(CONFIG_INET_PSP) += psp/
+obj-y += ipv6/
obj-$(CONFIG_PACKET) += packet/
obj-$(CONFIG_NET_KEY) += key/
obj-$(CONFIG_BRIDGE) += bridge/
-obj-$(CONFIG_NET_DSA) += dsa/
+obj-$(CONFIG_NET_DEVLINK) += devlink/
+obj-y += dsa/
obj-$(CONFIG_ATALK) += appletalk/
obj-$(CONFIG_X25) += x25/
obj-$(CONFIG_LAPB) += lapb/
@@ -39,12 +39,10 @@ obj-$(CONFIG_AF_KCM) += kcm/
obj-$(CONFIG_STREAM_PARSER) += strparser/
obj-$(CONFIG_ATM) += atm/
obj-$(CONFIG_L2TP) += l2tp/
-obj-$(CONFIG_DECNET) += decnet/
obj-$(CONFIG_PHONET) += phonet/
ifneq ($(CONFIG_VLAN_8021Q),)
obj-y += 8021q/
endif
-obj-$(CONFIG_IP_DCCP) += dccp/
obj-$(CONFIG_IP_SCTP) += sctp/
obj-$(CONFIG_RDS) += rds/
obj-$(CONFIG_WIRELESS) += wireless/
@@ -56,17 +54,12 @@ obj-$(CONFIG_SMC) += smc/
obj-$(CONFIG_RFKILL) += rfkill/
obj-$(CONFIG_NET_9P) += 9p/
obj-$(CONFIG_CAIF) += caif/
-ifneq ($(CONFIG_DCB),)
-obj-y += dcb/
-endif
+obj-$(CONFIG_DCB) += dcb/
obj-$(CONFIG_6LOWPAN) += 6lowpan/
obj-$(CONFIG_IEEE802154) += ieee802154/
obj-$(CONFIG_MAC802154) += mac802154/
-ifeq ($(CONFIG_NET),y)
obj-$(CONFIG_SYSCTL) += sysctl_net.o
-endif
-obj-$(CONFIG_WIMAX) += wimax/
obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/
obj-$(CONFIG_CEPH_LIB) += ceph/
obj-$(CONFIG_BATMAN_ADV) += batman-adv/
@@ -78,12 +71,12 @@ obj-$(CONFIG_VSOCKETS) += vmw_vsock/
obj-$(CONFIG_MPLS) += mpls/
obj-$(CONFIG_NET_NSH) += nsh/
obj-$(CONFIG_HSR) += hsr/
-ifneq ($(CONFIG_NET_SWITCHDEV),)
-obj-y += switchdev/
-endif
-ifneq ($(CONFIG_NET_L3_MASTER_DEV),)
-obj-y += l3mdev/
-endif
+obj-$(CONFIG_NET_SWITCHDEV) += switchdev/
+obj-$(CONFIG_NET_L3_MASTER_DEV) += l3mdev/
obj-$(CONFIG_QRTR) += qrtr/
obj-$(CONFIG_NET_NCSI) += ncsi/
obj-$(CONFIG_XDP_SOCKETS) += xdp/
+obj-$(CONFIG_MPTCP) += mptcp/
+obj-$(CONFIG_MCTP) += mctp/
+obj-$(CONFIG_NET_HANDSHAKE) += handshake/
+obj-$(CONFIG_NET_SHAPER) += shaper/
diff --git a/net/appletalk/Kconfig b/net/appletalk/Kconfig
new file mode 100644
index 000000000000..041141abf925
--- /dev/null
+++ b/net/appletalk/Kconfig
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Appletalk configuration
+#
+config ATALK
+ tristate "Appletalk protocol support"
+ select LLC
+ help
+ AppleTalk is the protocol that Apple computers can use to communicate
+ on a network. If your Linux box is connected to such a network and you
+ wish to connect to it, say Y. You will need to use the netatalk package
+ so that your Linux box can act as a print and file server for Macs as
+ well as access AppleTalk printers. Check out
+ <http://www.zettabyte.net/netatalk/> on the WWW for details.
+ EtherTalk is the name used for AppleTalk over Ethernet and the
+ cheaper and slower LocalTalk is AppleTalk over a proprietary Apple
+ network using serial links. EtherTalk and LocalTalk are fully
+ supported by Linux.
+
+ General information about how to connect Linux, Windows machines and
+ Macs is on the WWW at <http://www.eats.com/linux_mac_win.html>. The
+ NET3-4-HOWTO, available from
+ <http://www.tldp.org/docs.html#howto>, contains valuable
+ information as well.
+
+ To compile this driver as a module, choose M here: the module will be
+ called appletalk. You almost certainly want to compile it as a
+ module so you can restart your AppleTalk stack without rebooting
+ your machine. I hear that the GNU boycott of Apple is over, so
+ even politically correct people are allowed to say Y here.
diff --git a/net/appletalk/Makefile b/net/appletalk/Makefile
index 5cda56edef57..152312a15180 100644
--- a/net/appletalk/Makefile
+++ b/net/appletalk/Makefile
@@ -1,9 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the Linux AppleTalk layer.
#
obj-$(CONFIG_ATALK) += appletalk.o
-appletalk-y := aarp.o ddp.o dev.o
+appletalk-y := aarp.o ddp.o
appletalk-$(CONFIG_PROC_FS) += atalk_proc.o
appletalk-$(CONFIG_SYSCTL) += sysctl_net_atalk.o
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index 49a16cee2aae..4744e3fd4544 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* AARP: An implementation of the AppleTalk AARP protocol for
* Ethernet 'ELAP'.
@@ -13,12 +14,6 @@
* Use neighbour discovery code.
* Token Ring Support.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- *
* References:
* Inside AppleTalk (2nd Ed).
* Fixes:
@@ -26,7 +21,6 @@
* Rob Newberry - Added proxy AARP and AARP proc fs,
* moved probing from DDP module.
* Arnaldo C. Melo - don't mangle rx packets
- *
*/
#include <linux/if_arp.h>
@@ -41,6 +35,7 @@
#include <linux/seq_file.h>
#include <linux/export.h>
#include <linux/etherdevice.h>
+#include <linux/refcount.h>
int sysctl_aarp_expiry_time = AARP_EXPIRY_TIME;
int sysctl_aarp_tick_time = AARP_TICK_TIME;
@@ -50,17 +45,19 @@ int sysctl_aarp_resolve_time = AARP_RESOLVE_TIME;
/* Lists of aarp entries */
/**
* struct aarp_entry - AARP entry
- * @last_sent - Last time we xmitted the aarp request
- * @packet_queue - Queue of frames wait for resolution
- * @status - Used for proxy AARP
- * expires_at - Entry expiry time
- * target_addr - DDP Address
- * dev - Device to use
- * hwaddr - Physical i/f address of target/router
- * xmit_count - When this hits 10 we give up
- * next - Next entry in chain
+ * @refcnt: Reference count
+ * @last_sent: Last time we xmitted the aarp request
+ * @packet_queue: Queue of frames waiting for resolution
+ * @status: Used for proxy AARP
+ * @expires_at: Entry expiry time
+ * @target_addr: DDP Address
+ * @dev: Device to use
+ * @hwaddr: Physical i/f address of target/router
+ * @xmit_count: When this hits 10 we give up
+ * @next: Next entry in chain
*/
struct aarp_entry {
+ refcount_t refcnt;
/* These first two are only used for unresolved entries */
unsigned long last_sent;
struct sk_buff_head packet_queue;
@@ -85,6 +82,17 @@ static DEFINE_RWLOCK(aarp_lock);
/* Used to walk the list and purge/kick entries. */
static struct timer_list aarp_timer;
+static inline void aarp_entry_get(struct aarp_entry *a)
+{
+ refcount_inc(&a->refcnt);
+}
+
+static inline void aarp_entry_put(struct aarp_entry *a)
+{
+ if (refcount_dec_and_test(&a->refcnt))
+ kfree(a);
+}
+
/*
* Delete an aarp queue
*
@@ -93,7 +101,7 @@ static struct timer_list aarp_timer;
static void __aarp_expire(struct aarp_entry *a)
{
skb_queue_purge(&a->packet_queue);
- kfree(a);
+ aarp_entry_put(a);
}
/*
@@ -386,9 +394,11 @@ static void aarp_purge(void)
static struct aarp_entry *aarp_alloc(void)
{
struct aarp_entry *a = kmalloc(sizeof(*a), GFP_ATOMIC);
+ if (!a)
+ return NULL;
- if (a)
- skb_queue_head_init(&a->packet_queue);
+ refcount_set(&a->refcnt, 1);
+ skb_queue_head_init(&a->packet_queue);
return a;
}
@@ -438,49 +448,18 @@ static struct atalk_addr *__aarp_proxy_find(struct net_device *dev,
return a ? sa : NULL;
}
-/*
- * Probe a Phase 1 device or a device that requires its Net:Node to
- * be set via an ioctl.
- */
-static void aarp_send_probe_phase1(struct atalk_iface *iface)
-{
- struct ifreq atreq;
- struct sockaddr_at *sa = (struct sockaddr_at *)&atreq.ifr_addr;
- const struct net_device_ops *ops = iface->dev->netdev_ops;
-
- sa->sat_addr.s_node = iface->address.s_node;
- sa->sat_addr.s_net = ntohs(iface->address.s_net);
-
- /* We pass the Net:Node to the drivers/cards by a Device ioctl. */
- if (!(ops->ndo_do_ioctl(iface->dev, &atreq, SIOCSIFADDR))) {
- ops->ndo_do_ioctl(iface->dev, &atreq, SIOCGIFADDR);
- if (iface->address.s_net != htons(sa->sat_addr.s_net) ||
- iface->address.s_node != sa->sat_addr.s_node)
- iface->status |= ATIF_PROBE_FAIL;
-
- iface->address.s_net = htons(sa->sat_addr.s_net);
- iface->address.s_node = sa->sat_addr.s_node;
- }
-}
-
-
void aarp_probe_network(struct atalk_iface *atif)
{
- if (atif->dev->type == ARPHRD_LOCALTLK ||
- atif->dev->type == ARPHRD_PPP)
- aarp_send_probe_phase1(atif);
- else {
- unsigned int count;
+ unsigned int count;
- for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) {
- aarp_send_probe(atif->dev, &atif->address);
+ for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) {
+ aarp_send_probe(atif->dev, &atif->address);
- /* Defer 1/10th */
- msleep(100);
+ /* Defer 1/10th */
+ msleep(100);
- if (atif->status & ATIF_PROBE_FAIL)
- break;
- }
+ if (atif->status & ATIF_PROBE_FAIL)
+ break;
}
}
@@ -514,6 +493,7 @@ int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa)
entry->dev = atif->dev;
write_lock_bh(&aarp_lock);
+ aarp_entry_get(entry);
hash = sa->s_node % (AARP_HASH_SIZE - 1);
entry->next = proxies[hash];
@@ -539,6 +519,7 @@ int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa)
retval = 1;
}
+ aarp_entry_put(entry);
write_unlock_bh(&aarp_lock);
out:
return retval;
@@ -670,7 +651,7 @@ out_unlock:
sendit:
if (skb->sk)
- skb->priority = skb->sk->sk_priority;
+ skb->priority = READ_ONCE(skb->sk->sk_priority);
if (dev_queue_xmit(skb))
goto drop;
sent:
@@ -774,7 +755,7 @@ static int aarp_rcv(struct sk_buff *skb, struct net_device *dev,
if (a && a->status & ATIF_PROBE) {
a->status |= ATIF_PROBE_FAIL;
/*
- * we do not respond to probe or request packets for
+ * we do not respond to probe or request packets of
* this address while we are probing this address
*/
goto unlock;
@@ -879,15 +860,24 @@ static struct notifier_block aarp_notifier = {
static unsigned char aarp_snap_id[] = { 0x00, 0x00, 0x00, 0x80, 0xF3 };
-void __init aarp_proto_init(void)
+int __init aarp_proto_init(void)
{
+ int rc;
+
aarp_dl = register_snap_client(aarp_snap_id, aarp_rcv);
- if (!aarp_dl)
+ if (!aarp_dl) {
printk(KERN_CRIT "Unable to register AARP with SNAP.\n");
+ return -ENOMEM;
+ }
timer_setup(&aarp_timer, aarp_expire_timeout, 0);
aarp_timer.expires = jiffies + sysctl_aarp_expiry_time;
add_timer(&aarp_timer);
- register_netdevice_notifier(&aarp_notifier);
+ rc = register_netdevice_notifier(&aarp_notifier);
+ if (rc) {
+ timer_delete_sync(&aarp_timer);
+ unregister_snap_client(aarp_dl);
+ }
+ return rc;
}
/* Remove the AARP entries associated with a device. */
@@ -1039,7 +1029,7 @@ const struct seq_operations aarp_seq_ops = {
/* General module cleanup. Called from cleanup_module() in ddp.c. */
void aarp_cleanup_module(void)
{
- del_timer_sync(&aarp_timer);
+ timer_delete_sync(&aarp_timer);
unregister_netdevice_notifier(&aarp_notifier);
unregister_snap_client(aarp_dl);
aarp_purge();
diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c
index 8006295f8bd7..01787fb6a7bc 100644
--- a/net/appletalk/atalk_proc.c
+++ b/net/appletalk/atalk_proc.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* atalk_proc.c - proc support for Appletalk
*
* Copyright(c) Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation, version 2.
*/
#include <linux/init.h>
@@ -184,7 +181,7 @@ static int atalk_seq_socket_show(struct seq_file *seq, void *v)
sk_wmem_alloc_get(s),
sk_rmem_alloc_get(s),
s->sk_state,
- from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)));
+ from_kuid_munged(seq_user_ns(seq), sk_uid(s)));
out:
return 0;
}
@@ -210,56 +207,36 @@ static const struct seq_operations atalk_seq_socket_ops = {
.show = atalk_seq_socket_show,
};
-static struct proc_dir_entry *atalk_proc_dir;
-
int __init atalk_proc_init(void)
{
- struct proc_dir_entry *p;
- int rc = -ENOMEM;
+ if (!proc_mkdir("atalk", init_net.proc_net))
+ return -ENOMEM;
- atalk_proc_dir = proc_mkdir("atalk", init_net.proc_net);
- if (!atalk_proc_dir)
+ if (!proc_create_seq("atalk/interface", 0444, init_net.proc_net,
+ &atalk_seq_interface_ops))
goto out;
- p = proc_create_seq("interface", 0444, atalk_proc_dir,
- &atalk_seq_interface_ops);
- if (!p)
- goto out_interface;
+ if (!proc_create_seq("atalk/route", 0444, init_net.proc_net,
+ &atalk_seq_route_ops))
+ goto out;
- p = proc_create_seq("route", 0444, atalk_proc_dir,
- &atalk_seq_route_ops);
- if (!p)
- goto out_route;
+ if (!proc_create_seq("atalk/socket", 0444, init_net.proc_net,
+ &atalk_seq_socket_ops))
+ goto out;
- p = proc_create_seq("socket", 0444, atalk_proc_dir,
- &atalk_seq_socket_ops);
- if (!p)
- goto out_socket;
+ if (!proc_create_seq_private("atalk/arp", 0444, init_net.proc_net,
+ &aarp_seq_ops,
+ sizeof(struct aarp_iter_state), NULL))
+ goto out;
- p = proc_create_seq_private("arp", 0444, atalk_proc_dir, &aarp_seq_ops,
- sizeof(struct aarp_iter_state), NULL);
- if (!p)
- goto out_arp;
+ return 0;
- rc = 0;
out:
- return rc;
-out_arp:
- remove_proc_entry("socket", atalk_proc_dir);
-out_socket:
- remove_proc_entry("route", atalk_proc_dir);
-out_route:
- remove_proc_entry("interface", atalk_proc_dir);
-out_interface:
- remove_proc_entry("atalk", init_net.proc_net);
- goto out;
+ remove_proc_subtree("atalk", init_net.proc_net);
+ return -ENOMEM;
}
-void __exit atalk_proc_exit(void)
+void atalk_proc_exit(void)
{
- remove_proc_entry("interface", atalk_proc_dir);
- remove_proc_entry("route", atalk_proc_dir);
- remove_proc_entry("socket", atalk_proc_dir);
- remove_proc_entry("arp", atalk_proc_dir);
- remove_proc_entry("atalk", init_net.proc_net);
+ remove_proc_subtree("atalk", init_net.proc_net);
}
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 9b6bc5abe946..2a01fff46c9d 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* DDP: An implementation of the AppleTalk DDP protocol for
* Ethernet 'ELAP'.
@@ -43,12 +44,6 @@
* shared skb support 8)
* Arnaldo C. de Melo : Move proc stuff to atalk_proc.c,
* use seq_file
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/capability.h>
@@ -62,6 +57,7 @@
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/route.h>
+#include <net/compat.h>
#include <linux/atalk.h>
#include <linux/highmem.h>
@@ -92,6 +88,7 @@ static inline void atalk_remove_socket(struct sock *sk)
static struct sock *atalk_search_socket(struct sockaddr_at *to,
struct atalk_iface *atif)
{
+ struct sock *def_socket = NULL;
struct sock *s;
read_lock_bh(&atalk_sockets_lock);
@@ -102,8 +99,20 @@ static struct sock *atalk_search_socket(struct sockaddr_at *to,
continue;
if (to->sat_addr.s_net == ATADDR_ANYNET &&
- to->sat_addr.s_node == ATADDR_BCAST)
- goto found;
+ to->sat_addr.s_node == ATADDR_BCAST) {
+ if (atif->address.s_node == at->src_node &&
+ atif->address.s_net == at->src_net) {
+ /* This socket's address matches the address of the interface
+ * that received the packet -- use it
+ */
+ goto found;
+ }
+
+ /* Continue searching for a socket matching the interface address,
+ * but use this socket by default if no other one is found
+ */
+ def_socket = s;
+ }
if (to->sat_addr.s_net == at->src_net &&
(to->sat_addr.s_node == at->src_node ||
@@ -120,7 +129,7 @@ static struct sock *atalk_search_socket(struct sockaddr_at *to,
goto found;
}
}
- s = NULL;
+ s = def_socket;
found:
read_unlock_bh(&atalk_sockets_lock);
return s;
@@ -160,7 +169,7 @@ found:
static void atalk_destroy_timer(struct timer_list *t)
{
- struct sock *sk = from_timer(sk, t, sk_timer);
+ struct sock *sk = timer_container_of(sk, t, sk_timer);
if (sk_has_allocations(sk)) {
sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME;
@@ -567,6 +576,7 @@ static int atrtr_create(struct rtentry *r, struct net_device *devhint)
/* Fill in the routing entry */
rt->target = ta->sat_addr;
+ dev_put(rt->dev); /* Release old device */
dev_hold(devhint);
rt->dev = devhint;
rt->flags = r->rt_flags;
@@ -670,7 +680,7 @@ static int atif_ioctl(int cmd, void __user *arg)
struct rtentry rtdef;
int add_route;
- if (copy_from_user(&atreq, arg, sizeof(atreq)))
+ if (get_user_ifreq(&atreq, NULL, arg))
return -EFAULT;
dev = __dev_get_by_name(&init_net, atreq.ifr_name);
@@ -711,7 +721,7 @@ static int atif_ioctl(int cmd, void __user *arg)
/*
* Phase 1 is fine on LocalTalk but we don't do
- * EtherTalk phase 1. Anyone wanting to add it go ahead.
+ * EtherTalk phase 1. Anyone wanting to add it, go ahead.
*/
if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2)
return -EPROTONOSUPPORT;
@@ -832,7 +842,7 @@ static int atif_ioctl(int cmd, void __user *arg)
nr = (struct atalk_netrange *)&(atif->nets);
/*
* Phase 1 is fine on Localtalk but we don't do
- * Ethertalk phase 1. Anyone wanting to add it go ahead.
+ * Ethertalk phase 1. Anyone wanting to add it, go ahead.
*/
if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2)
return -EPROTONOSUPPORT;
@@ -869,7 +879,25 @@ static int atif_ioctl(int cmd, void __user *arg)
return 0;
}
- return copy_to_user(arg, &atreq, sizeof(atreq)) ? -EFAULT : 0;
+ return put_user_ifreq(&atreq, arg);
+}
+
+static int atrtr_ioctl_addrt(struct rtentry *rt)
+{
+ struct net_device *dev = NULL;
+
+ if (rt->rt_dev) {
+ char name[IFNAMSIZ];
+
+ if (copy_from_user(name, rt->rt_dev, IFNAMSIZ-1))
+ return -EFAULT;
+ name[IFNAMSIZ-1] = '\0';
+
+ dev = __dev_get_by_name(&init_net, name);
+ if (!dev)
+ return -ENODEV;
+ }
+ return atrtr_create(rt, dev);
}
/* Routing ioctl() calls */
@@ -887,19 +915,8 @@ static int atrtr_ioctl(unsigned int cmd, void __user *arg)
return atrtr_delete(&((struct sockaddr_at *)
&rt.rt_dst)->sat_addr);
- case SIOCADDRT: {
- struct net_device *dev = NULL;
- if (rt.rt_dev) {
- char name[IFNAMSIZ];
- if (copy_from_user(name, rt.rt_dev, IFNAMSIZ-1))
- return -EFAULT;
- name[IFNAMSIZ-1] = '\0';
- dev = __dev_get_by_name(&init_net, name);
- if (!dev)
- return -ENODEV;
- }
- return atrtr_create(&rt, dev);
- }
+ case SIOCADDRT:
+ return atrtr_ioctl_addrt(&rt);
}
return -EINVAL;
}
@@ -958,8 +975,8 @@ static unsigned long atalk_sum_skb(const struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
vaddr = kmap_atomic(skb_frag_page(frag));
- sum = atalk_sum_partial(vaddr + frag->page_offset +
- offset - start, copy, sum);
+ sum = atalk_sum_partial(vaddr + skb_frag_off(frag) +
+ offset - start, copy, sum);
kunmap_atomic(vaddr);
if (!(len -= copy))
@@ -1028,6 +1045,11 @@ static int atalk_create(struct net *net, struct socket *sock, int protocol,
*/
if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
goto out;
+
+ rc = -EPERM;
+ if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
+ goto out;
+
rc = -ENOMEM;
sk = sk_alloc(net, PF_APPLETALK, GFP_KERNEL, &ddp_proto, kern);
if (!sk)
@@ -1127,7 +1149,7 @@ out:
}
/* Set the address 'our end' of the connection */
-static int atalk_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+static int atalk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
{
struct sockaddr_at *addr = (struct sockaddr_at *)uaddr;
struct sock *sk = sock->sk;
@@ -1182,7 +1204,7 @@ out:
}
/* Set the address we talk to */
-static int atalk_connect(struct socket *sock, struct sockaddr *uaddr,
+static int atalk_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
@@ -1276,39 +1298,6 @@ out:
return err;
}
-#if IS_ENABLED(CONFIG_IPDDP)
-static __inline__ int is_ip_over_ddp(struct sk_buff *skb)
-{
- return skb->data[12] == 22;
-}
-
-static int handle_ip_over_ddp(struct sk_buff *skb)
-{
- struct net_device *dev = __dev_get_by_name(&init_net, "ipddp0");
- struct net_device_stats *stats;
-
- /* This needs to be able to handle ipddp"N" devices */
- if (!dev) {
- kfree_skb(skb);
- return NET_RX_DROP;
- }
-
- skb->protocol = htons(ETH_P_IP);
- skb_pull(skb, 13);
- skb->dev = dev;
- skb_reset_transport_header(skb);
-
- stats = netdev_priv(dev);
- stats->rx_packets++;
- stats->rx_bytes += skb->len + 13;
- return netif_rx(skb); /* Send the SKB up to a higher place. */
-}
-#else
-/* make it easy for gcc to optimize this test out, i.e. kill the code */
-#define is_ip_over_ddp(skb) 0
-#define handle_ip_over_ddp(skb) 0
-#endif
-
static int atalk_route_packet(struct sk_buff *skb, struct net_device *dev,
struct ddpehdr *ddp, __u16 len_hops, int origlen)
{
@@ -1399,9 +1388,10 @@ drop:
/**
* atalk_rcv - Receive a packet (in skb) from device dev
- * @skb - packet received
- * @dev - network device where the packet comes from
- * @pt - packet type
+ * @skb: packet received
+ * @dev: network device where the packet comes from
+ * @pt: packet type
+ * @orig_dev: the original receive net device
*
* Receive a packet (in skb) from device dev. This has come from the SNAP
* decoder, and on entry skb->transport_header is the DDP header, skb->len
@@ -1471,9 +1461,6 @@ static int atalk_rcv(struct sk_buff *skb, struct net_device *dev,
return atalk_route_packet(skb, dev, ddp, len_hops, origlen);
}
- /* if IP over DDP is not selected this code will be optimized out */
- if (is_ip_over_ddp(skb))
- return handle_ip_over_ddp(skb);
/*
* Which socket - atalk_search_socket() looks for a *full match*
* of the <net, node, port> tuple.
@@ -1568,8 +1555,8 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
struct sk_buff *skb;
struct net_device *dev;
struct ddpehdr *ddp;
- int size;
- struct atalk_route *rt;
+ int size, hard_header_len;
+ struct atalk_route *rt, *rt_lo = NULL;
int err;
if (flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT))
@@ -1608,7 +1595,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
}
/* Build a packet */
- SOCK_DEBUG(sk, "SK %p: Got address.\n", sk);
+ net_dbg_ratelimited("SK %p: Got address.\n", sk);
/* For headers */
size = sizeof(struct ddpehdr) + len + ddp_dl->header_length;
@@ -1629,10 +1616,25 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
dev = rt->dev;
- SOCK_DEBUG(sk, "SK %p: Size needed %d, device %s\n",
+ net_dbg_ratelimited("SK %p: Size needed %d, device %s\n",
sk, size, dev->name);
- size += dev->hard_header_len;
+ hard_header_len = dev->hard_header_len;
+ /* Leave room for loopback hardware header if necessary */
+ if (usat->sat_addr.s_node == ATADDR_BCAST &&
+ (dev->flags & IFF_LOOPBACK || !(rt->flags & RTF_GATEWAY))) {
+ struct atalk_addr at_lo;
+
+ at_lo.s_node = 0;
+ at_lo.s_net = 0;
+
+ rt_lo = atrtr_find(&at_lo);
+
+ if (rt_lo && rt_lo->dev->hard_header_len > hard_header_len)
+ hard_header_len = rt_lo->dev->hard_header_len;
+ }
+
+ size += hard_header_len;
release_sock(sk);
skb = sock_alloc_send_skb(sk, size, (flags & MSG_DONTWAIT), &err);
lock_sock(sk);
@@ -1640,10 +1642,10 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
goto out;
skb_reserve(skb, ddp_dl->header_length);
- skb_reserve(skb, dev->hard_header_len);
+ skb_reserve(skb, hard_header_len);
skb->dev = dev;
- SOCK_DEBUG(sk, "SK %p: Begin build.\n", sk);
+ net_dbg_ratelimited("SK %p: Begin build.\n", sk);
ddp = skb_put(skb, sizeof(struct ddpehdr));
ddp->deh_len_hops = htons(len + sizeof(*ddp));
@@ -1654,7 +1656,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
ddp->deh_dport = usat->sat_port;
ddp->deh_sport = at->src_port;
- SOCK_DEBUG(sk, "SK %p: Copy user data (%zd bytes).\n", sk, len);
+ net_dbg_ratelimited("SK %p: Copy user data (%zd bytes).\n", sk, len);
err = memcpy_from_msg(skb_put(skb, len), msg, len);
if (err) {
@@ -1678,7 +1680,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
if (skb2) {
loopback = 1;
- SOCK_DEBUG(sk, "SK %p: send out(copy).\n", sk);
+ net_dbg_ratelimited("SK %p: send out(copy).\n", sk);
/*
* If it fails it is queued/sent above in the aarp queue
*/
@@ -1687,27 +1689,21 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
}
if (dev->flags & IFF_LOOPBACK || loopback) {
- SOCK_DEBUG(sk, "SK %p: Loop back.\n", sk);
+ net_dbg_ratelimited("SK %p: Loop back.\n", sk);
/* loop back */
skb_orphan(skb);
if (ddp->deh_dnode == ATADDR_BCAST) {
- struct atalk_addr at_lo;
-
- at_lo.s_node = 0;
- at_lo.s_net = 0;
-
- rt = atrtr_find(&at_lo);
- if (!rt) {
+ if (!rt_lo) {
kfree_skb(skb);
err = -ENETUNREACH;
goto out;
}
- dev = rt->dev;
+ dev = rt_lo->dev;
skb->dev = dev;
}
ddp_dl->request(ddp_dl, skb, dev->dev_addr);
} else {
- SOCK_DEBUG(sk, "SK %p: send out.\n", sk);
+ net_dbg_ratelimited("SK %p: send out.\n", sk);
if (rt->flags & RTF_GATEWAY) {
gsat.sat_addr = rt->gateway;
usat = &gsat;
@@ -1718,7 +1714,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
*/
aarp_send_ddp(dev, skb, &usat->sat_addr, NULL);
}
- SOCK_DEBUG(sk, "SK %p: Done write (%zd).\n", sk, len);
+ net_dbg_ratelimited("SK %p: Done write (%zd).\n", sk, len);
out:
release_sock(sk);
@@ -1735,8 +1731,7 @@ static int atalk_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int err = 0;
struct sk_buff *skb;
- skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
- flags & MSG_DONTWAIT, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
lock_sock(sk);
if (!skb)
@@ -1794,24 +1789,17 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
break;
}
case TIOCINQ: {
- /*
- * These two are safe on a single CPU system as only
- * user tasks fiddle here
- */
- struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+ struct sk_buff *skb;
long amount = 0;
+ spin_lock_irq(&sk->sk_receive_queue.lock);
+ skb = skb_peek(&sk->sk_receive_queue);
if (skb)
amount = skb->len - sizeof(struct ddpehdr);
+ spin_unlock_irq(&sk->sk_receive_queue.lock);
rc = put_user(amount, (int __user *)argp);
break;
}
- case SIOCGSTAMP:
- rc = sock_get_timestamp(sk, argp);
- break;
- case SIOCGSTAMPNS:
- rc = sock_get_timestampns(sk, argp);
- break;
/* Routing */
case SIOCADDRT:
case SIOCDELRT:
@@ -1838,20 +1826,58 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
#ifdef CONFIG_COMPAT
+static int atalk_compat_routing_ioctl(struct sock *sk, unsigned int cmd,
+ struct compat_rtentry __user *ur)
+{
+ compat_uptr_t rtdev;
+ struct rtentry rt;
+
+ if (copy_from_user(&rt.rt_dst, &ur->rt_dst,
+ 3 * sizeof(struct sockaddr)) ||
+ get_user(rt.rt_flags, &ur->rt_flags) ||
+ get_user(rt.rt_metric, &ur->rt_metric) ||
+ get_user(rt.rt_mtu, &ur->rt_mtu) ||
+ get_user(rt.rt_window, &ur->rt_window) ||
+ get_user(rt.rt_irtt, &ur->rt_irtt) ||
+ get_user(rtdev, &ur->rt_dev))
+ return -EFAULT;
+
+ switch (cmd) {
+ case SIOCDELRT:
+ if (rt.rt_dst.sa_family != AF_APPLETALK)
+ return -EINVAL;
+ return atrtr_delete(&((struct sockaddr_at *)
+ &rt.rt_dst)->sat_addr);
+
+ case SIOCADDRT:
+ rt.rt_dev = compat_ptr(rtdev);
+ return atrtr_ioctl_addrt(&rt);
+ default:
+ return -EINVAL;
+ }
+}
static int atalk_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
+ void __user *argp = compat_ptr(arg);
+ struct sock *sk = sock->sk;
+
+ switch (cmd) {
+ case SIOCADDRT:
+ case SIOCDELRT:
+ return atalk_compat_routing_ioctl(sk, cmd, argp);
/*
* SIOCATALKDIFADDR is a SIOCPROTOPRIVATE ioctl number, so we
* cannot handle it in common code. The data we access if ifreq
* here is compatible, so we can simply call the native
* handler.
*/
- if (cmd == SIOCATALKDIFADDR)
- return atalk_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
-
- return -ENOIOCTLCMD;
+ case SIOCATALKDIFADDR:
+ return atalk_ioctl(sock, cmd, (unsigned long)argp);
+ default:
+ return -ENOIOCTLCMD;
+ }
}
-#endif
+#endif /* CONFIG_COMPAT */
static const struct net_proto_family atalk_family_ops = {
@@ -1871,17 +1897,15 @@ static const struct proto_ops atalk_dgram_ops = {
.getname = atalk_getname,
.poll = datagram_poll,
.ioctl = atalk_ioctl,
+ .gettstamp = sock_gettstamp,
#ifdef CONFIG_COMPAT
.compat_ioctl = atalk_compat_ioctl,
#endif
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
- .setsockopt = sock_no_setsockopt,
- .getsockopt = sock_no_getsockopt,
.sendmsg = atalk_sendmsg,
.recvmsg = atalk_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
};
static struct notifier_block ddp_notifier = {
@@ -1904,31 +1928,61 @@ static unsigned char ddp_snap_id[] = { 0x08, 0x00, 0x07, 0x80, 0x9B };
EXPORT_SYMBOL(atrtr_get_dev);
EXPORT_SYMBOL(atalk_find_dev_addr);
-static const char atalk_err_snap[] __initconst =
- KERN_CRIT "Unable to register DDP with SNAP.\n";
-
/* Called by proto.c on kernel start up */
static int __init atalk_init(void)
{
- int rc = proto_register(&ddp_proto, 0);
+ int rc;
- if (rc != 0)
+ rc = proto_register(&ddp_proto, 0);
+ if (rc)
goto out;
- (void)sock_register(&atalk_family_ops);
+ rc = sock_register(&atalk_family_ops);
+ if (rc)
+ goto out_proto;
+
ddp_dl = register_snap_client(ddp_snap_id, atalk_rcv);
- if (!ddp_dl)
- printk(atalk_err_snap);
+ if (!ddp_dl) {
+ pr_crit("Unable to register DDP with SNAP.\n");
+ rc = -ENOMEM;
+ goto out_sock;
+ }
dev_add_pack(&ltalk_packet_type);
dev_add_pack(&ppptalk_packet_type);
- register_netdevice_notifier(&ddp_notifier);
- aarp_proto_init();
- atalk_proc_init();
- atalk_register_sysctl();
+ rc = register_netdevice_notifier(&ddp_notifier);
+ if (rc)
+ goto out_snap;
+
+ rc = aarp_proto_init();
+ if (rc)
+ goto out_dev;
+
+ rc = atalk_proc_init();
+ if (rc)
+ goto out_aarp;
+
+ rc = atalk_register_sysctl();
+ if (rc)
+ goto out_proc;
out:
return rc;
+out_proc:
+ atalk_proc_exit();
+out_aarp:
+ aarp_cleanup_module();
+out_dev:
+ unregister_netdevice_notifier(&ddp_notifier);
+out_snap:
+ dev_remove_pack(&ppptalk_packet_type);
+ dev_remove_pack(&ltalk_packet_type);
+ unregister_snap_client(ddp_dl);
+out_sock:
+ sock_unregister(PF_APPLETALK);
+out_proto:
+ proto_unregister(&ddp_proto);
+ goto out;
}
module_init(atalk_init);
@@ -1939,7 +1993,7 @@ module_init(atalk_init);
* by the network device layer.
*
* Ergo, before the AppleTalk module can be removed, all AppleTalk
- * sockets be closed from user space.
+ * sockets should be closed from user space.
*/
static void __exit atalk_exit(void)
{
diff --git a/net/appletalk/dev.c b/net/appletalk/dev.c
deleted file mode 100644
index 284c8e585533..000000000000
--- a/net/appletalk/dev.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Moved here from drivers/net/net_init.c, which is:
- * Written 1993,1994,1995 by Donald Becker.
- */
-
-#include <linux/errno.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/if_arp.h>
-#include <linux/if_ltalk.h>
-
-static void ltalk_setup(struct net_device *dev)
-{
- /* Fill in the fields of the device structure with localtalk-generic values. */
-
- dev->type = ARPHRD_LOCALTLK;
- dev->hard_header_len = LTALK_HLEN;
- dev->mtu = LTALK_MTU;
- dev->addr_len = LTALK_ALEN;
- dev->tx_queue_len = 10;
-
- dev->broadcast[0] = 0xFF;
-
- dev->flags = IFF_BROADCAST|IFF_MULTICAST|IFF_NOARP;
-}
-
-/**
- * alloc_ltalkdev - Allocates and sets up an localtalk device
- * @sizeof_priv: Size of additional driver-private structure to be allocated
- * for this localtalk device
- *
- * Fill in the fields of the device structure with localtalk-generic
- * values. Basically does everything except registering the device.
- *
- * Constructs a new net device, complete with a private data area of
- * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for
- * this private data area.
- */
-
-struct net_device *alloc_ltalkdev(int sizeof_priv)
-{
- return alloc_netdev(sizeof_priv, "lt%d", NET_NAME_UNKNOWN,
- ltalk_setup);
-}
-EXPORT_SYMBOL(alloc_ltalkdev);
diff --git a/net/appletalk/sysctl_net_atalk.c b/net/appletalk/sysctl_net_atalk.c
index c744a853fa5f..7aebfe903242 100644
--- a/net/appletalk/sysctl_net_atalk.c
+++ b/net/appletalk/sysctl_net_atalk.c
@@ -40,14 +40,16 @@ static struct ctl_table atalk_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- { },
};
static struct ctl_table_header *atalk_table_header;
-void atalk_register_sysctl(void)
+int __init atalk_register_sysctl(void)
{
atalk_table_header = register_net_sysctl(&init_net, "net/appletalk", atalk_table);
+ if (!atalk_table_header)
+ return -ENOMEM;
+ return 0;
}
void atalk_unregister_sysctl(void)
diff --git a/net/atm/Kconfig b/net/atm/Kconfig
index 754ea103b378..77343d57ff2a 100644
--- a/net/atm/Kconfig
+++ b/net/atm/Kconfig
@@ -1,10 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Asynchronous Transfer Mode (ATM)
#
config ATM
tristate "Asynchronous Transfer Mode (ATM)"
- ---help---
+ help
ATM is a high-speed networking technology for Local Area Networks
and Wide Area Networks. It uses a fixed packet size and is
connection oriented, allowing for the negotiation of minimum
@@ -15,7 +16,7 @@ config ATM
of your ATM card below.
Note that you need a set of user-space programs to actually make use
- of ATM. See the file <file:Documentation/networking/atm.txt> for
+ of ATM. See the file <file:Documentation/networking/atm.rst> for
further details.
config ATM_CLIP
diff --git a/net/atm/atm_sysfs.c b/net/atm/atm_sysfs.c
index 39b94ca5f65d..54e7fb1a4ee5 100644
--- a/net/atm/atm_sysfs.c
+++ b/net/atm/atm_sysfs.c
@@ -11,7 +11,7 @@
#define to_atm_dev(cldev) container_of(cldev, struct atm_dev, class_dev)
-static ssize_t show_type(struct device *cdev,
+static ssize_t type_show(struct device *cdev,
struct device_attribute *attr, char *buf)
{
struct atm_dev *adev = to_atm_dev(cdev);
@@ -19,7 +19,7 @@ static ssize_t show_type(struct device *cdev,
return scnprintf(buf, PAGE_SIZE, "%s\n", adev->type);
}
-static ssize_t show_address(struct device *cdev,
+static ssize_t address_show(struct device *cdev,
struct device_attribute *attr, char *buf)
{
struct atm_dev *adev = to_atm_dev(cdev);
@@ -27,36 +27,30 @@ static ssize_t show_address(struct device *cdev,
return scnprintf(buf, PAGE_SIZE, "%pM\n", adev->esi);
}
-static ssize_t show_atmaddress(struct device *cdev,
+static ssize_t atmaddress_show(struct device *cdev,
struct device_attribute *attr, char *buf)
{
unsigned long flags;
struct atm_dev *adev = to_atm_dev(cdev);
struct atm_dev_addr *aaddr;
- int bin[] = { 1, 2, 10, 6, 1 }, *fmt = bin;
- int i, j, count = 0;
+ int count = 0;
spin_lock_irqsave(&adev->lock, flags);
list_for_each_entry(aaddr, &adev->local, entry) {
- for (i = 0, j = 0; i < ATM_ESA_LEN; ++i, ++j) {
- if (j == *fmt) {
- count += scnprintf(buf + count,
- PAGE_SIZE - count, ".");
- ++fmt;
- j = 0;
- }
- count += scnprintf(buf + count,
- PAGE_SIZE - count, "%02x",
- aaddr->addr.sas_addr.prv[i]);
- }
- count += scnprintf(buf + count, PAGE_SIZE - count, "\n");
+ count += scnprintf(buf + count, PAGE_SIZE - count,
+ "%1phN.%2phN.%10phN.%6phN.%1phN\n",
+ &aaddr->addr.sas_addr.prv[0],
+ &aaddr->addr.sas_addr.prv[1],
+ &aaddr->addr.sas_addr.prv[3],
+ &aaddr->addr.sas_addr.prv[13],
+ &aaddr->addr.sas_addr.prv[19]);
}
spin_unlock_irqrestore(&adev->lock, flags);
return count;
}
-static ssize_t show_atmindex(struct device *cdev,
+static ssize_t atmindex_show(struct device *cdev,
struct device_attribute *attr, char *buf)
{
struct atm_dev *adev = to_atm_dev(cdev);
@@ -64,7 +58,7 @@ static ssize_t show_atmindex(struct device *cdev,
return scnprintf(buf, PAGE_SIZE, "%d\n", adev->number);
}
-static ssize_t show_carrier(struct device *cdev,
+static ssize_t carrier_show(struct device *cdev,
struct device_attribute *attr, char *buf)
{
struct atm_dev *adev = to_atm_dev(cdev);
@@ -73,7 +67,7 @@ static ssize_t show_carrier(struct device *cdev,
adev->signal == ATM_PHY_SIG_LOST ? 0 : 1);
}
-static ssize_t show_link_rate(struct device *cdev,
+static ssize_t link_rate_show(struct device *cdev,
struct device_attribute *attr, char *buf)
{
struct atm_dev *adev = to_atm_dev(cdev);
@@ -96,12 +90,12 @@ static ssize_t show_link_rate(struct device *cdev,
return scnprintf(buf, PAGE_SIZE, "%d\n", link_rate);
}
-static DEVICE_ATTR(address, 0444, show_address, NULL);
-static DEVICE_ATTR(atmaddress, 0444, show_atmaddress, NULL);
-static DEVICE_ATTR(atmindex, 0444, show_atmindex, NULL);
-static DEVICE_ATTR(carrier, 0444, show_carrier, NULL);
-static DEVICE_ATTR(type, 0444, show_type, NULL);
-static DEVICE_ATTR(link_rate, 0444, show_link_rate, NULL);
+static DEVICE_ATTR_RO(address);
+static DEVICE_ATTR_RO(atmaddress);
+static DEVICE_ATTR_RO(atmindex);
+static DEVICE_ATTR_RO(carrier);
+static DEVICE_ATTR_RO(type);
+static DEVICE_ATTR_RO(link_rate);
static struct device_attribute *atm_attrs[] = {
&dev_attr_atmaddress,
@@ -114,16 +108,14 @@ static struct device_attribute *atm_attrs[] = {
};
-static int atm_uevent(struct device *cdev, struct kobj_uevent_env *env)
+static int atm_uevent(const struct device *cdev, struct kobj_uevent_env *env)
{
- struct atm_dev *adev;
+ const struct atm_dev *adev;
if (!cdev)
return -ENODEV;
adev = to_atm_dev(cdev);
- if (!adev)
- return -ENODEV;
if (add_uevent_var(env, "NAME=%s%d", adev->type, adev->number))
return -ENOMEM;
diff --git a/net/atm/br2684.c b/net/atm/br2684.c
index 10462de734ea..f666f2f98ba5 100644
--- a/net/atm/br2684.c
+++ b/net/atm/br2684.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Ethernet netdevice using ATM AAL5 as underlying carrier
* (RFC1483 obsoleted by RFC2684) for Linux
@@ -92,8 +93,8 @@ struct br2684_dev {
* This lock should be held for writing any time the list of devices or
* their attached vcc's could be altered. It should be held for reading
* any time these are being queried. Note that we sometimes need to
- * do read-locking under interrupt context, so write locking must block
- * the current CPU's interrupts
+ * do read-locking under interrupting context, so write locking must block
+ * the current CPU's interrupts.
*/
static DEFINE_RWLOCK(devs_lock);
@@ -576,10 +577,12 @@ static int br2684_regvcc(struct atm_vcc *atmvcc, void __user * arg)
pr_debug("vcc=%p, encaps=%d, brvcc=%p\n", atmvcc, be.encaps, brvcc);
if (list_empty(&brdev->brvccs) && !brdev->mac_was_set) {
unsigned char *esi = atmvcc->dev->esi;
+ const u8 one = 1;
+
if (esi[0] | esi[1] | esi[2] | esi[3] | esi[4] | esi[5])
- memcpy(net_dev->dev_addr, esi, net_dev->addr_len);
+ dev_addr_set(net_dev, esi);
else
- net_dev->dev_addr[2] = 1;
+ dev_addr_mod(net_dev, 2, &one, 1);
}
list_add(&brvcc->brvccs, &brdev->brvccs);
write_unlock_irq(&devs_lock);
diff --git a/net/atm/clip.c b/net/atm/clip.c
index d795b9c5aea4..8f152e5fa659 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* net/atm/clip.c - RFC1577 Classical IP over ATM */
/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
@@ -44,7 +45,8 @@
#include <net/atmclip.h>
static struct net_device *clip_devs;
-static struct atm_vcc *atmarpd;
+static struct atm_vcc __rcu *atmarpd;
+static DEFINE_MUTEX(atmarpd_lock);
static struct timer_list idle_timer;
static const struct neigh_ops clip_neigh_ops;
@@ -52,24 +54,35 @@ static int to_atmarpd(enum atmarp_ctrl_type type, int itf, __be32 ip)
{
struct sock *sk;
struct atmarp_ctrl *ctrl;
+ struct atm_vcc *vcc;
struct sk_buff *skb;
+ int err = 0;
pr_debug("(%d)\n", type);
- if (!atmarpd)
- return -EUNATCH;
+
+ rcu_read_lock();
+ vcc = rcu_dereference(atmarpd);
+ if (!vcc) {
+ err = -EUNATCH;
+ goto unlock;
+ }
skb = alloc_skb(sizeof(struct atmarp_ctrl), GFP_ATOMIC);
- if (!skb)
- return -ENOMEM;
+ if (!skb) {
+ err = -ENOMEM;
+ goto unlock;
+ }
ctrl = skb_put(skb, sizeof(struct atmarp_ctrl));
ctrl->type = type;
ctrl->itf_num = itf;
ctrl->ip = ip;
- atm_force_charge(atmarpd, skb->truesize);
+ atm_force_charge(vcc, skb->truesize);
- sk = sk_atm(atmarpd);
+ sk = sk_atm(vcc);
skb_queue_tail(&sk->sk_receive_queue, skb);
sk->sk_data_ready(sk);
- return 0;
+unlock:
+ rcu_read_unlock();
+ return err;
}
static void link_vcc(struct clip_vcc *clip_vcc, struct atmarp_entry *entry)
@@ -88,7 +101,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
struct clip_vcc **walk;
if (!entry) {
- pr_crit("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc);
+ pr_err("!clip_vcc->entry (clip_vcc %p)\n", clip_vcc);
return;
}
netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */
@@ -108,10 +121,10 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
error = neigh_update(entry->neigh, NULL, NUD_NONE,
NEIGH_UPDATE_F_ADMIN, 0);
if (error)
- pr_crit("neigh_update failed with %d\n", error);
+ pr_err("neigh_update failed with %d\n", error);
goto out;
}
- pr_crit("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc);
+ pr_err("ATMARP: failed (entry %p, vcc 0x%p)\n", entry, clip_vcc);
out:
netif_tx_unlock_bh(entry->neigh->dev);
}
@@ -155,10 +168,10 @@ static int neigh_check_cb(struct neighbour *n)
static void idle_timer_check(struct timer_list *unused)
{
- write_lock(&arp_tbl.lock);
+ spin_lock(&arp_tbl.lock);
__neigh_for_each_release(&arp_tbl, neigh_check_cb);
mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ);
- write_unlock(&arp_tbl.lock);
+ spin_unlock(&arp_tbl.lock);
}
static int clip_arp_rcv(struct sk_buff *skb)
@@ -192,12 +205,6 @@ static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb)
pr_debug("\n");
- if (!clip_devs) {
- atm_return(vcc, skb->truesize);
- kfree_skb(skb);
- return;
- }
-
if (!skb) {
pr_debug("removing VCC %p\n", clip_vcc);
if (clip_vcc->entry)
@@ -207,6 +214,11 @@ static void clip_push(struct atm_vcc *vcc, struct sk_buff *skb)
return;
}
atm_return(vcc, skb->truesize);
+ if (!clip_devs) {
+ kfree_skb(skb);
+ return;
+ }
+
skb->dev = clip_vcc->entry ? clip_vcc->entry->neigh->dev : clip_devs;
/* clip_vcc->entry == NULL if we don't have an IP address yet */
if (!skb->dev) {
@@ -344,9 +356,9 @@ static netdev_tx_t clip_start_xmit(struct sk_buff *skb,
dev->stats.tx_dropped++;
return NETDEV_TX_OK;
}
- rt = (struct rtable *) dst;
- if (rt->rt_gateway)
- daddr = &rt->rt_gateway;
+ rt = dst_rtable(dst);
+ if (rt->rt_gw_family == AF_INET)
+ daddr = &rt->rt_gw4;
else
daddr = &ip_hdr(skb)->daddr;
n = dst_neigh_lookup(dst, daddr);
@@ -417,6 +429,8 @@ static int clip_mkip(struct atm_vcc *vcc, int timeout)
if (!vcc->push)
return -EBADFD;
+ if (vcc->user_back)
+ return -EINVAL;
clip_vcc = kmalloc(sizeof(struct clip_vcc), GFP_KERNEL);
if (!clip_vcc)
return -ENOMEM;
@@ -462,7 +476,7 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip)
unlink_clip_vcc(clip_vcc);
return 0;
}
- rt = ip_route_output(&init_net, ip, 0, 1, 0);
+ rt = ip_route_output(&init_net, ip, 0, 0, 0, RT_SCOPE_LINK);
if (IS_ERR(rt))
return PTR_ERR(rt);
neigh = __neigh_lookup(&arp_tbl, &ip, rt->dst.dev, 1);
@@ -607,17 +621,27 @@ static void atmarpd_close(struct atm_vcc *vcc)
{
pr_debug("\n");
- rtnl_lock();
- atmarpd = NULL;
+ mutex_lock(&atmarpd_lock);
+ RCU_INIT_POINTER(atmarpd, NULL);
+ mutex_unlock(&atmarpd_lock);
+
+ synchronize_rcu();
skb_queue_purge(&sk_atm(vcc)->sk_receive_queue);
- rtnl_unlock();
pr_debug("(done)\n");
module_put(THIS_MODULE);
}
+static int atmarpd_send(struct atm_vcc *vcc, struct sk_buff *skb)
+{
+ atm_return_tx(vcc, skb);
+ dev_kfree_skb_any(skb);
+ return 0;
+}
+
static const struct atmdev_ops atmarpd_dev_ops = {
- .close = atmarpd_close
+ .close = atmarpd_close,
+ .send = atmarpd_send
};
@@ -631,15 +655,18 @@ static struct atm_dev atmarpd_dev = {
static int atm_init_atmarp(struct atm_vcc *vcc)
{
- rtnl_lock();
+ if (vcc->push == clip_push)
+ return -EINVAL;
+
+ mutex_lock(&atmarpd_lock);
if (atmarpd) {
- rtnl_unlock();
+ mutex_unlock(&atmarpd_lock);
return -EADDRINUSE;
}
mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ);
- atmarpd = vcc;
+ rcu_assign_pointer(atmarpd, vcc);
set_bit(ATM_VF_META, &vcc->flags);
set_bit(ATM_VF_READY, &vcc->flags);
/* allow replies and avoid getting closed if signaling dies */
@@ -648,13 +675,14 @@ static int atm_init_atmarp(struct atm_vcc *vcc)
vcc->push = NULL;
vcc->pop = NULL; /* crash */
vcc->push_oam = NULL; /* crash */
- rtnl_unlock();
+ mutex_unlock(&atmarpd_lock);
return 0;
}
static int clip_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
struct atm_vcc *vcc = ATM_SD(sock);
+ struct sock *sk = sock->sk;
int err = 0;
switch (cmd) {
@@ -675,14 +703,18 @@ static int clip_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
err = clip_create(arg);
break;
case ATMARPD_CTRL:
+ lock_sock(sk);
err = atm_init_atmarp(vcc);
if (!err) {
sock->state = SS_CONNECTED;
__module_get(THIS_MODULE);
}
+ release_sock(sk);
break;
case ATMARP_MKIP:
+ lock_sock(sk);
err = clip_mkip(vcc, arg);
+ release_sock(sk);
break;
case ATMARP_SETENTRY:
err = clip_setentry(vcc, (__force __be32)arg);
@@ -903,7 +935,7 @@ static void atm_clip_exit_noproc(void)
/* First, stop the idle timer, so it stops banging
* on the table.
*/
- del_timer_sync(&idle_timer);
+ timer_delete_sync(&idle_timer);
dev = clip_devs;
while (dev) {
diff --git a/net/atm/common.c b/net/atm/common.c
index 9f8cb0d2e71e..fe77f51f6ce1 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* net/atm/common.c - ATM sockets (common part for PVC and SVC) */
/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */
@@ -115,7 +116,7 @@ static void vcc_write_space(struct sock *sk)
if (skwq_has_sleeper(wq))
wake_up_interruptible(&wq->wait);
- sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
rcu_read_unlock();
@@ -156,7 +157,7 @@ int vcc_create(struct net *net, struct socket *sock, int protocol, int family, i
memset(&vcc->local, 0, sizeof(struct sockaddr_atmsvc));
memset(&vcc->remote, 0, sizeof(struct sockaddr_atmsvc));
vcc->qos.txtp.max_sdu = 1 << 16; /* for meta VCs */
- refcount_set(&sk->sk_wmem_alloc, 1);
+ refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
atomic_set(&sk->sk_rmem_alloc, 0);
vcc->push = NULL;
vcc->pop = NULL;
@@ -176,18 +177,18 @@ static void vcc_destroy_socket(struct sock *sk)
set_bit(ATM_VF_CLOSE, &vcc->flags);
clear_bit(ATM_VF_READY, &vcc->flags);
- if (vcc->dev) {
- if (vcc->dev->ops->close)
- vcc->dev->ops->close(vcc);
- if (vcc->push)
- vcc->push(vcc, NULL); /* atmarpd has no push */
- module_put(vcc->owner);
-
- while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
- atm_return(vcc, skb->truesize);
- kfree_skb(skb);
- }
+ if (vcc->dev && vcc->dev->ops->close)
+ vcc->dev->ops->close(vcc);
+ if (vcc->push)
+ vcc->push(vcc, NULL); /* atmarpd has no push */
+ module_put(vcc->owner);
+
+ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+ atm_return(vcc, skb->truesize);
+ kfree_skb(skb);
+ }
+ if (vcc->dev && vcc->dev->ops->owner) {
module_put(vcc->dev->ops->owner);
atm_dev_put(vcc->dev);
}
@@ -296,7 +297,7 @@ static int adjust_tp(struct atm_trafprm *tp, unsigned char aal)
break;
default:
pr_warn("AAL problems ... (%d)\n", aal);
- /* fall through */
+ fallthrough;
case ATM_AAL5:
max_sdu = ATM_MAX_AAL5_PDU;
}
@@ -416,7 +417,7 @@ static int __vcc_connect(struct atm_vcc *vcc, struct atm_dev *dev, short vpi,
case ATM_NO_AAL:
/* ATM_AAL5 is also used in the "0 for default" case */
vcc->qos.aal = ATM_AAL5;
- /* fall through */
+ fallthrough;
case ATM_AAL5:
error = atm_init_aal5(vcc);
vcc->stats = &dev->stats.aal5;
@@ -539,7 +540,7 @@ int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
!test_bit(ATM_VF_READY, &vcc->flags))
return 0;
- skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &error);
+ skb = skb_recv_datagram(sk, flags, &error);
if (!skb)
return error;
@@ -552,7 +553,7 @@ int vcc_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
error = skb_copy_datagram_msg(skb, 0, msg, copied);
if (error)
return error;
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
if (!(flags & MSG_PEEK)) {
pr_debug("%d -= %d\n", atomic_read(&sk->sk_rmem_alloc),
@@ -634,17 +635,27 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size)
skb->dev = NULL; /* for paths shared with net_device interfaces */
if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) {
- kfree_skb(skb);
error = -EFAULT;
- goto out;
+ goto free_skb;
}
if (eff != size)
memset(skb->data + size, 0, eff-size);
+
+ if (vcc->dev->ops->pre_send) {
+ error = vcc->dev->ops->pre_send(vcc, skb);
+ if (error)
+ goto free_skb;
+ }
+
error = vcc->dev->ops->send(vcc, skb);
error = error ? error : size;
out:
release_sock(sk);
return error;
+free_skb:
+ atm_return_tx(vcc, skb);
+ kfree_skb(skb);
+ goto out;
}
__poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
@@ -653,7 +664,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
struct atm_vcc *vcc;
__poll_t mask;
- sock_poll_wait(file, wait);
+ sock_poll_wait(file, sock, wait);
mask = 0;
vcc = ATM_SD(sock);
@@ -667,7 +678,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
mask |= EPOLLHUP;
/* readable? */
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
mask |= EPOLLIN | EPOLLRDNORM;
/* writable? */
@@ -744,7 +755,7 @@ static int check_qos(const struct atm_qos *qos)
}
int vcc_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct atm_vcc *vcc;
unsigned long value;
@@ -759,7 +770,7 @@ int vcc_setsockopt(struct socket *sock, int level, int optname,
{
struct atm_qos qos;
- if (copy_from_user(&qos, optval, sizeof(qos)))
+ if (copy_from_sockptr(&qos, optval, sizeof(qos)))
return -EFAULT;
error = check_qos(&qos);
if (error)
@@ -773,7 +784,7 @@ int vcc_setsockopt(struct socket *sock, int level, int optname,
return 0;
}
case SO_SETCLP:
- if (get_user(value, (unsigned long __user *)optval))
+ if (copy_from_sockptr(&value, optval, sizeof(value)))
return -EFAULT;
if (value)
vcc->atm_options |= ATM_ATMOPT_CLP;
@@ -781,13 +792,8 @@ int vcc_setsockopt(struct socket *sock, int level, int optname,
vcc->atm_options &= ~ATM_ATMOPT_CLP;
return 0;
default:
- if (level == SOL_SOCKET)
- return -EINVAL;
- break;
- }
- if (!vcc->dev || !vcc->dev->ops->setsockopt)
return -EINVAL;
- return vcc->dev->ops->setsockopt(vcc, level, optname, optval, optlen);
+ }
}
int vcc_getsockopt(struct socket *sock, int level, int optname,
@@ -825,13 +831,8 @@ int vcc_getsockopt(struct socket *sock, int level, int optname,
return copy_to_user(optval, &pvc, sizeof(pvc)) ? -EFAULT : 0;
}
default:
- if (level == SOL_SOCKET)
- return -EINVAL;
- break;
- }
- if (!vcc->dev || !vcc->dev->ops->getsockopt)
return -EINVAL;
- return vcc->dev->ops->getsockopt(vcc, level, optname, optval, len);
+ }
}
int register_atmdevice_notifier(struct notifier_block *nb)
@@ -880,7 +881,7 @@ out_atmproc_exit:
out_atmsvc_exit:
atmsvc_exit();
out_atmpvc_exit:
- atmsvc_exit();
+ atmpvc_exit();
out_unregister_vcc_proto:
proto_unregister(&vcc_proto);
goto out;
@@ -899,6 +900,7 @@ subsys_initcall(atm_init);
module_exit(atm_exit);
+MODULE_DESCRIPTION("Asynchronous Transfer Mode (ATM) networking core");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_ATMPVC);
MODULE_ALIAS_NETPROTO(PF_ATMSVC);
diff --git a/net/atm/common.h b/net/atm/common.h
index 5850649068bb..a1e56e8de698 100644
--- a/net/atm/common.h
+++ b/net/atm/common.h
@@ -21,7 +21,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait);
int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
int vcc_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
int vcc_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen);
+ sockptr_t optval, unsigned int optlen);
int vcc_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen);
void vcc_process_recv_queue(struct atm_vcc *vcc);
diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c
index 2ff0e5e470e3..0f7a39aeccc8 100644
--- a/net/atm/ioctl.c
+++ b/net/atm/ioctl.c
@@ -56,6 +56,8 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd,
int error;
struct list_head *pos;
void __user *argp = (void __user *)arg;
+ void __user *buf;
+ int __user *len;
vcc = ATM_SD(sock);
switch (cmd) {
@@ -66,37 +68,24 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd,
goto done;
}
error = put_user(sk->sk_sndbuf - sk_wmem_alloc_get(sk),
- (int __user *)argp) ? -EFAULT : 0;
+ (int __user *)argp);
goto done;
case SIOCINQ:
{
struct sk_buff *skb;
+ int amount;
if (sock->state != SS_CONNECTED) {
error = -EINVAL;
goto done;
}
+ spin_lock_irq(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
- error = put_user(skb ? skb->len : 0,
- (int __user *)argp) ? -EFAULT : 0;
+ amount = skb ? skb->len : 0;
+ spin_unlock_irq(&sk->sk_receive_queue.lock);
+ error = put_user(amount, (int __user *)argp);
goto done;
}
- case SIOCGSTAMP: /* borrowed from IP */
-#ifdef CONFIG_COMPAT
- if (compat)
- error = compat_sock_get_timestamp(sk, argp);
- else
-#endif
- error = sock_get_timestamp(sk, argp);
- goto done;
- case SIOCGSTAMPNS: /* borrowed from IP */
-#ifdef CONFIG_COMPAT
- if (compat)
- error = compat_sock_get_timestampns(sk, argp);
- else
-#endif
- error = sock_get_timestampns(sk, argp);
- goto done;
case ATM_SETSC:
net_warn_ratelimited("ATM_SETSC is obsolete; used by %s:%d\n",
current->comm, task_pid_nr(current));
@@ -178,7 +167,49 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd,
if (error != -ENOIOCTLCMD)
goto done;
- error = atm_dev_ioctl(cmd, argp, compat);
+ if (cmd == ATM_GETNAMES) {
+ if (IS_ENABLED(CONFIG_COMPAT) && compat) {
+#ifdef CONFIG_COMPAT
+ struct compat_atm_iobuf __user *ciobuf = argp;
+ compat_uptr_t cbuf;
+ len = &ciobuf->length;
+ if (get_user(cbuf, &ciobuf->buffer))
+ return -EFAULT;
+ buf = compat_ptr(cbuf);
+#endif
+ } else {
+ struct atm_iobuf __user *iobuf = argp;
+ len = &iobuf->length;
+ if (get_user(buf, &iobuf->buffer))
+ return -EFAULT;
+ }
+ error = atm_getnames(buf, len);
+ } else {
+ int number;
+
+ if (IS_ENABLED(CONFIG_COMPAT) && compat) {
+#ifdef CONFIG_COMPAT
+ struct compat_atmif_sioc __user *csioc = argp;
+ compat_uptr_t carg;
+
+ len = &csioc->length;
+ if (get_user(carg, &csioc->arg))
+ return -EFAULT;
+ buf = compat_ptr(carg);
+ if (get_user(number, &csioc->number))
+ return -EFAULT;
+#endif
+ } else {
+ struct atmif_sioc __user *sioc = argp;
+
+ len = &sioc->length;
+ if (get_user(buf, &sioc->arg))
+ return -EFAULT;
+ if (get_user(number, &sioc->number))
+ return -EFAULT;
+ }
+ error = atm_dev_ioctl(cmd, buf, len, number, compat);
+ }
done:
return error;
@@ -246,61 +277,25 @@ static struct {
static int do_atm_iobuf(struct socket *sock, unsigned int cmd,
unsigned long arg)
{
- struct atm_iobuf __user *iobuf;
- struct compat_atm_iobuf __user *iobuf32;
+ struct compat_atm_iobuf __user *iobuf32 = compat_ptr(arg);
u32 data;
- void __user *datap;
- int len, err;
-
- iobuf = compat_alloc_user_space(sizeof(*iobuf));
- iobuf32 = compat_ptr(arg);
- if (get_user(len, &iobuf32->length) ||
- get_user(data, &iobuf32->buffer))
+ if (get_user(data, &iobuf32->buffer))
return -EFAULT;
- datap = compat_ptr(data);
- if (put_user(len, &iobuf->length) ||
- put_user(datap, &iobuf->buffer))
- return -EFAULT;
-
- err = do_vcc_ioctl(sock, cmd, (unsigned long) iobuf, 0);
-
- if (!err) {
- if (copy_in_user(&iobuf32->length, &iobuf->length,
- sizeof(int)))
- err = -EFAULT;
- }
- return err;
+ return atm_getnames(compat_ptr(data), &iobuf32->length);
}
static int do_atmif_sioc(struct socket *sock, unsigned int cmd,
unsigned long arg)
{
- struct atmif_sioc __user *sioc;
- struct compat_atmif_sioc __user *sioc32;
+ struct compat_atmif_sioc __user *sioc32 = compat_ptr(arg);
+ int number;
u32 data;
- void __user *datap;
- int err;
-
- sioc = compat_alloc_user_space(sizeof(*sioc));
- sioc32 = compat_ptr(arg);
- if (copy_in_user(&sioc->number, &sioc32->number, 2 * sizeof(int)) ||
- get_user(data, &sioc32->arg))
- return -EFAULT;
- datap = compat_ptr(data);
- if (put_user(datap, &sioc->arg))
+ if (get_user(data, &sioc32->arg) || get_user(number, &sioc32->number))
return -EFAULT;
-
- err = do_vcc_ioctl(sock, cmd, (unsigned long) sioc, 0);
-
- if (!err) {
- if (copy_in_user(&sioc32->length, &sioc->length,
- sizeof(int)))
- err = -EFAULT;
- }
- return err;
+ return atm_dev_ioctl(cmd, compat_ptr(data), &sioc32->length, number, 0);
}
static int do_atm_ioctl(struct socket *sock, unsigned int cmd32,
diff --git a/net/atm/lec.c b/net/atm/lec.c
index d7f5cf5b7594..afb8d3eb2185 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* lec.c: Lan Emulation driver
*
@@ -123,6 +124,7 @@ static unsigned char bus_mac[ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
/* Device structures */
static struct net_device *dev_lec[MAX_LEC_ITF];
+static DEFINE_MUTEX(lec_mutex);
#if IS_ENABLED(CONFIG_BRIDGE)
static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev)
@@ -180,6 +182,7 @@ static void
lec_send(struct atm_vcc *vcc, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
+ unsigned int len = skb->len;
ATM_SKB(skb)->vcc = vcc;
atm_account_tx(vcc, skb);
@@ -190,10 +193,10 @@ lec_send(struct atm_vcc *vcc, struct sk_buff *skb)
}
dev->stats.tx_packets++;
- dev->stats.tx_bytes += skb->len;
+ dev->stats.tx_bytes += len;
}
-static void lec_tx_timeout(struct net_device *dev)
+static void lec_tx_timeout(struct net_device *dev, unsigned int txqueue)
{
pr_info("%s\n", dev->name);
netif_trans_update(dev);
@@ -339,12 +342,12 @@ static int lec_close(struct net_device *dev)
static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
{
+ static const u8 zero_addr[ETH_ALEN] = {};
unsigned long flags;
struct net_device *dev = (struct net_device *)vcc->proto_data;
struct lec_priv *priv = netdev_priv(dev);
struct atmlec_msg *mesg;
struct lec_arp_table *entry;
- int i;
char *tmp; /* FIXME */
WARN_ON(refcount_sub_and_test(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc));
@@ -354,12 +357,10 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
pr_debug("%s: msg from zeppelin:%d\n", dev->name, mesg->type);
switch (mesg->type) {
case l_set_mac_addr:
- for (i = 0; i < 6; i++)
- dev->dev_addr[i] = mesg->content.normal.mac_addr[i];
+ eth_hw_addr_set(dev, mesg->content.normal.mac_addr);
break;
case l_del_mac_addr:
- for (i = 0; i < 6; i++)
- dev->dev_addr[i] = 0;
+ eth_hw_addr_set(dev, zero_addr);
break;
case l_addr_delete:
lec_addr_delete(priv, mesg->content.normal.atm_addr,
@@ -379,7 +380,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
if (mesg->content.normal.no_source_le_narp)
break;
- /* FALL THROUGH */
+ fallthrough;
case l_arp_update:
lec_arp_update(priv, mesg->content.normal.mac_addr,
mesg->content.normal.atm_addr,
@@ -685,6 +686,7 @@ static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg)
int bytes_left;
struct atmlec_ioc ioc_data;
+ lockdep_assert_held(&lec_mutex);
/* Lecd must be up in this case */
bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmlec_ioc));
if (bytes_left != 0)
@@ -710,7 +712,11 @@ static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg)
static int lec_mcast_attach(struct atm_vcc *vcc, int arg)
{
- if (arg < 0 || arg >= MAX_LEC_ITF || !dev_lec[arg])
+ lockdep_assert_held(&lec_mutex);
+ if (arg < 0 || arg >= MAX_LEC_ITF)
+ return -EINVAL;
+ arg = array_index_nospec(arg, MAX_LEC_ITF);
+ if (!dev_lec[arg])
return -EINVAL;
vcc->proto_data = dev_lec[arg];
return lec_mcast_make(netdev_priv(dev_lec[arg]), vcc);
@@ -722,12 +728,12 @@ static int lecd_attach(struct atm_vcc *vcc, int arg)
int i;
struct lec_priv *priv;
+ lockdep_assert_held(&lec_mutex);
if (arg < 0)
- i = 0;
- else
- i = arg;
+ arg = 0;
if (arg >= MAX_LEC_ITF)
return -EINVAL;
+ i = array_index_nospec(arg, MAX_LEC_ITF);
if (!dev_lec[i]) {
int size;
@@ -740,6 +746,7 @@ static int lecd_attach(struct atm_vcc *vcc, int arg)
snprintf(dev_lec[i]->name, IFNAMSIZ, "lec%d", i);
if (register_netdev(dev_lec[i])) {
free_netdev(dev_lec[i]);
+ dev_lec[i] = NULL;
return -EINVAL;
}
@@ -796,14 +803,9 @@ static const char *lec_arp_get_status_string(unsigned char status)
static void lec_info(struct seq_file *seq, struct lec_arp_table *entry)
{
- int i;
-
- for (i = 0; i < ETH_ALEN; i++)
- seq_printf(seq, "%2.2x", entry->mac_addr[i] & 0xff);
- seq_printf(seq, " ");
- for (i = 0; i < ATM_ESA_LEN; i++)
- seq_printf(seq, "%2.2x", entry->atm_addr[i] & 0xff);
- seq_printf(seq, " %s %4.4x", lec_arp_get_status_string(entry->status),
+ seq_printf(seq, "%pM ", entry->mac_addr);
+ seq_printf(seq, "%*phN ", ATM_ESA_LEN, entry->atm_addr);
+ seq_printf(seq, "%s %4.4x", lec_arp_get_status_string(entry->status),
entry->flags & 0xffff);
if (entry->vcc)
seq_printf(seq, "%3d %3d ", entry->vcc->vpi, entry->vcc->vci);
@@ -907,7 +909,6 @@ static void *lec_itf_walk(struct lec_state *state, loff_t *l)
v = (dev && netdev_priv(dev)) ?
lec_priv_walk(state, l, netdev_priv(dev)) : NULL;
if (!v && dev) {
- dev_put(dev);
/* Partial state reset for the next time we get called */
dev = NULL;
}
@@ -931,6 +932,7 @@ static void *lec_seq_start(struct seq_file *seq, loff_t *pos)
{
struct lec_state *state = seq->private;
+ mutex_lock(&lec_mutex);
state->itf = 0;
state->dev = NULL;
state->locked = NULL;
@@ -948,17 +950,17 @@ static void lec_seq_stop(struct seq_file *seq, void *v)
if (state->dev) {
spin_unlock_irqrestore(&state->locked->lec_arp_lock,
state->flags);
- dev_put(state->dev);
+ state->dev = NULL;
}
+ mutex_unlock(&lec_mutex);
}
static void *lec_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
struct lec_state *state = seq->private;
- v = lec_get_idx(state, 1);
- *pos += !!PTR_ERR(v);
- return v;
+ ++*pos;
+ return lec_get_idx(state, 1);
}
static int lec_seq_show(struct seq_file *seq, void *v)
@@ -1007,6 +1009,7 @@ static int lane_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return -ENOIOCTLCMD;
}
+ mutex_lock(&lec_mutex);
switch (cmd) {
case ATMLEC_CTRL:
err = lecd_attach(vcc, (int)arg);
@@ -1021,6 +1024,7 @@ static int lane_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
break;
}
+ mutex_unlock(&lec_mutex);
return err;
}
@@ -1072,7 +1076,7 @@ module_exit(lane_module_cleanup);
/*
* LANE2: 3.1.3, LE_RESOLVE.request
* Non force allocates memory and fills in *tlvs, fills in *sizeoftlvs.
- * If sizeoftlvs == NULL the default TLVs associated with with this
+ * If sizeoftlvs == NULL the default TLVs associated with this
* lec will be used.
* If dst_mac == NULL, targetless LE_ARP will be sent
*/
@@ -1266,6 +1270,12 @@ static void lec_arp_clear_vccs(struct lec_arp_table *entry)
entry->vcc = NULL;
}
if (entry->recv_vcc) {
+ struct atm_vcc *vcc = entry->recv_vcc;
+ struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc);
+
+ kfree(vpriv);
+ vcc->user_back = NULL;
+
entry->recv_vcc->push = entry->old_recv_push;
vcc_release_async(entry->recv_vcc, -EPIPE);
entry->recv_vcc = NULL;
@@ -1300,7 +1310,7 @@ lec_arp_remove(struct lec_priv *priv, struct lec_arp_table *to_remove)
return -1;
hlist_del(&to_remove->next);
- del_timer(&to_remove->timer);
+ timer_delete(&to_remove->timer);
/*
* If this is the only MAC connected to this VCC,
@@ -1351,7 +1361,7 @@ static void dump_arp_table(struct lec_priv *priv)
{
struct lec_arp_table *rulla;
char buf[256];
- int i, j, offset;
+ int i, offset;
pr_info("Dump %p:\n", priv);
for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
@@ -1359,14 +1369,10 @@ static void dump_arp_table(struct lec_priv *priv)
&priv->lec_arp_tables[i], next) {
offset = 0;
offset += sprintf(buf, "%d: %p\n", i, rulla);
- offset += sprintf(buf + offset, "Mac: %pM",
+ offset += sprintf(buf + offset, "Mac: %pM ",
rulla->mac_addr);
- offset += sprintf(buf + offset, " Atm:");
- for (j = 0; j < ATM_ESA_LEN; j++) {
- offset += sprintf(buf + offset,
- "%2.2x ",
- rulla->atm_addr[j] & 0xff);
- }
+ offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN,
+ rulla->atm_addr);
offset += sprintf(buf + offset,
"Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
rulla->vcc ? rulla->vcc->vpi : 0,
@@ -1389,12 +1395,9 @@ static void dump_arp_table(struct lec_priv *priv)
pr_info("No forward\n");
hlist_for_each_entry(rulla, &priv->lec_no_forward, next) {
offset = 0;
- offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr);
- offset += sprintf(buf + offset, " Atm:");
- for (j = 0; j < ATM_ESA_LEN; j++) {
- offset += sprintf(buf + offset, "%2.2x ",
- rulla->atm_addr[j] & 0xff);
- }
+ offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr);
+ offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN,
+ rulla->atm_addr);
offset += sprintf(buf + offset,
"Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
rulla->vcc ? rulla->vcc->vpi : 0,
@@ -1414,12 +1417,9 @@ static void dump_arp_table(struct lec_priv *priv)
pr_info("Empty ones\n");
hlist_for_each_entry(rulla, &priv->lec_arp_empty_ones, next) {
offset = 0;
- offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr);
- offset += sprintf(buf + offset, " Atm:");
- for (j = 0; j < ATM_ESA_LEN; j++) {
- offset += sprintf(buf + offset, "%2.2x ",
- rulla->atm_addr[j] & 0xff);
- }
+ offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr);
+ offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN,
+ rulla->atm_addr);
offset += sprintf(buf + offset,
"Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
rulla->vcc ? rulla->vcc->vpi : 0,
@@ -1439,12 +1439,9 @@ static void dump_arp_table(struct lec_priv *priv)
pr_info("Multicast Forward VCCs\n");
hlist_for_each_entry(rulla, &priv->mcast_fwds, next) {
offset = 0;
- offset += sprintf(buf + offset, "Mac: %pM", rulla->mac_addr);
- offset += sprintf(buf + offset, " Atm:");
- for (j = 0; j < ATM_ESA_LEN; j++) {
- offset += sprintf(buf + offset, "%2.2x ",
- rulla->atm_addr[j] & 0xff);
- }
+ offset += sprintf(buf + offset, "Mac: %pM ", rulla->mac_addr);
+ offset += sprintf(buf + offset, "Atm: %*ph ", ATM_ESA_LEN,
+ rulla->atm_addr);
offset += sprintf(buf + offset,
"Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ",
rulla->vcc ? rulla->vcc->vpi : 0,
@@ -1493,7 +1490,7 @@ static void lec_arp_destroy(struct lec_priv *priv)
hlist_for_each_entry_safe(entry, next,
&priv->lec_arp_empty_ones, next) {
- del_timer_sync(&entry->timer);
+ timer_delete_sync(&entry->timer);
lec_arp_clear_vccs(entry);
hlist_del(&entry->next);
lec_arp_put(entry);
@@ -1502,7 +1499,7 @@ static void lec_arp_destroy(struct lec_priv *priv)
hlist_for_each_entry_safe(entry, next,
&priv->lec_no_forward, next) {
- del_timer_sync(&entry->timer);
+ timer_delete_sync(&entry->timer);
lec_arp_clear_vccs(entry);
hlist_del(&entry->next);
lec_arp_put(entry);
@@ -1545,10 +1542,8 @@ static struct lec_arp_table *make_entry(struct lec_priv *priv,
struct lec_arp_table *to_return;
to_return = kzalloc(sizeof(struct lec_arp_table), GFP_ATOMIC);
- if (!to_return) {
- pr_info("LEC: Arp entry kmalloc failed\n");
+ if (!to_return)
return NULL;
- }
ether_addr_copy(to_return->mac_addr, mac_addr);
INIT_HLIST_NODE(&to_return->next);
timer_setup(&to_return->timer, lec_arp_expire_arp, 0);
@@ -1564,7 +1559,7 @@ static void lec_arp_expire_arp(struct timer_list *t)
{
struct lec_arp_table *entry;
- entry = from_timer(entry, t, timer);
+ entry = timer_container_of(entry, t, timer);
pr_debug("\n");
if (entry->status == ESI_ARP_PENDING) {
@@ -1585,10 +1580,11 @@ static void lec_arp_expire_arp(struct timer_list *t)
static void lec_arp_expire_vcc(struct timer_list *t)
{
unsigned long flags;
- struct lec_arp_table *to_remove = from_timer(to_remove, t, timer);
+ struct lec_arp_table *to_remove = timer_container_of(to_remove, t,
+ timer);
struct lec_priv *priv = to_remove->priv;
- del_timer(&to_remove->timer);
+ timer_delete(&to_remove->timer);
pr_debug("%p %p: vpi:%d vci:%d\n",
to_remove, priv,
@@ -1856,16 +1852,16 @@ lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr,
&priv->lec_arp_empty_ones, next) {
if (memcmp(entry->atm_addr, atm_addr, ATM_ESA_LEN) == 0) {
hlist_del(&entry->next);
- del_timer(&entry->timer);
+ timer_delete(&entry->timer);
tmp = lec_arp_find(priv, mac_addr);
if (tmp) {
- del_timer(&tmp->timer);
+ timer_delete(&tmp->timer);
tmp->status = ESI_FORWARD_DIRECT;
memcpy(tmp->atm_addr, atm_addr, ATM_ESA_LEN);
tmp->vcc = entry->vcc;
tmp->old_push = entry->old_push;
tmp->last_used = jiffies;
- del_timer(&entry->timer);
+ timer_delete(&entry->timer);
lec_arp_put(entry);
entry = tmp;
} else {
@@ -1896,7 +1892,7 @@ lec_arp_update(struct lec_priv *priv, const unsigned char *mac_addr,
/* Temporary, changes before end of function */
}
memcpy(entry->atm_addr, atm_addr, ATM_ESA_LEN);
- del_timer(&entry->timer);
+ timer_delete(&entry->timer);
for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
hlist_for_each_entry(tmp,
&priv->lec_arp_tables[i], next) {
@@ -1959,7 +1955,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data,
entry = make_entry(priv, bus_mac);
if (entry == NULL)
goto out;
- del_timer(&entry->timer);
+ timer_delete(&entry->timer);
memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN);
entry->recv_vcc = vcc;
entry->old_recv_push = old_push;
@@ -1970,17 +1966,8 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data,
* Vcc which we don't want to make default vcc,
* attach it anyway.
*/
- pr_debug("LEC_ARP:Attaching data direct, not default: %2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n",
- ioc_data->atm_addr[0], ioc_data->atm_addr[1],
- ioc_data->atm_addr[2], ioc_data->atm_addr[3],
- ioc_data->atm_addr[4], ioc_data->atm_addr[5],
- ioc_data->atm_addr[6], ioc_data->atm_addr[7],
- ioc_data->atm_addr[8], ioc_data->atm_addr[9],
- ioc_data->atm_addr[10], ioc_data->atm_addr[11],
- ioc_data->atm_addr[12], ioc_data->atm_addr[13],
- ioc_data->atm_addr[14], ioc_data->atm_addr[15],
- ioc_data->atm_addr[16], ioc_data->atm_addr[17],
- ioc_data->atm_addr[18], ioc_data->atm_addr[19]);
+ pr_debug("LEC_ARP:Attaching data direct, not default: %*phN\n",
+ ATM_ESA_LEN, ioc_data->atm_addr);
entry = make_entry(priv, bus_mac);
if (entry == NULL)
goto out;
@@ -1996,17 +1983,8 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data,
dump_arp_table(priv);
goto out;
}
- pr_debug("LEC_ARP:Attaching data direct, default: %2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n",
- ioc_data->atm_addr[0], ioc_data->atm_addr[1],
- ioc_data->atm_addr[2], ioc_data->atm_addr[3],
- ioc_data->atm_addr[4], ioc_data->atm_addr[5],
- ioc_data->atm_addr[6], ioc_data->atm_addr[7],
- ioc_data->atm_addr[8], ioc_data->atm_addr[9],
- ioc_data->atm_addr[10], ioc_data->atm_addr[11],
- ioc_data->atm_addr[12], ioc_data->atm_addr[13],
- ioc_data->atm_addr[14], ioc_data->atm_addr[15],
- ioc_data->atm_addr[16], ioc_data->atm_addr[17],
- ioc_data->atm_addr[18], ioc_data->atm_addr[19]);
+ pr_debug("LEC_ARP:Attaching data direct, default: %*phN\n",
+ ATM_ESA_LEN, ioc_data->atm_addr);
for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) {
hlist_for_each_entry(entry,
&priv->lec_arp_tables[i], next) {
@@ -2019,7 +1997,7 @@ lec_vcc_added(struct lec_priv *priv, const struct atmlec_ioc *ioc_data,
entry->recv_vcc ? entry->recv_vcc->
vci : 0);
found_entry = 1;
- del_timer(&entry->timer);
+ timer_delete(&entry->timer);
entry->vcc = vcc;
entry->old_push = old_push;
if (entry->status == ESI_VC_PENDING) {
@@ -2203,7 +2181,7 @@ static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc)
&priv->lec_arp_empty_ones, next) {
if (entry->vcc == vcc) {
lec_arp_clear_vccs(entry);
- del_timer(&entry->timer);
+ timer_delete(&entry->timer);
hlist_del(&entry->next);
lec_arp_put(entry);
}
@@ -2213,7 +2191,7 @@ static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc)
&priv->lec_no_forward, next) {
if (entry->recv_vcc == vcc) {
lec_arp_clear_vccs(entry);
- del_timer(&entry->timer);
+ timer_delete(&entry->timer);
hlist_del(&entry->next);
lec_arp_put(entry);
}
@@ -2246,7 +2224,7 @@ lec_arp_check_empties(struct lec_priv *priv,
hlist_for_each_entry_safe(entry, next,
&priv->lec_arp_empty_ones, next) {
if (vcc == entry->vcc) {
- del_timer(&entry->timer);
+ timer_delete(&entry->timer);
ether_addr_copy(entry->mac_addr, src);
entry->status = ESI_FORWARD_DIRECT;
entry->last_used = jiffies;
@@ -2266,4 +2244,5 @@ out:
spin_unlock_irqrestore(&priv->lec_arp_lock, flags);
}
+MODULE_DESCRIPTION("ATM LAN Emulation (LANE) support");
MODULE_LICENSE("GPL");
diff --git a/net/atm/lec_arpc.h b/net/atm/lec_arpc.h
index 1205d8792d28..39115fe074c4 100644
--- a/net/atm/lec_arpc.h
+++ b/net/atm/lec_arpc.h
@@ -44,7 +44,7 @@ struct lec_arp_table {
u8 *tlvs;
u32 sizeoftlvs; /*
* LANE2: Each MAC address can have TLVs
- * associated with it. sizeoftlvs tells the
+ * associated with it. sizeoftlvs tells
* the length of the tlvs array
*/
struct sk_buff_head tx_wait; /* wait queue for outgoing packets */
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 24b53c4c39c6..f6b447bba329 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
#include <linux/kernel.h>
@@ -803,7 +804,7 @@ static int atm_mpoa_mpoad_attach(struct atm_vcc *vcc, int arg)
/* This lets us now how our LECs are doing */
err = register_netdevice_notifier(&mpoa_notifier);
if (err < 0) {
- del_timer(&mpc_timer);
+ timer_delete(&mpc_timer);
return err;
}
}
@@ -1313,6 +1314,8 @@ static void MPOA_cache_impos_rcvd(struct k_message *msg,
holding_time = msg->content.eg_info.holding_time;
dprintk("(%s) entry = %p, holding_time = %u\n",
mpc->dev->name, entry, holding_time);
+ if (entry == NULL && !holding_time)
+ return;
if (entry == NULL && holding_time) {
entry = mpc->eg_ops->add_entry(msg, mpc);
mpc->eg_ops->put(entry);
@@ -1492,7 +1495,7 @@ static void __exit atm_mpoa_cleanup(void)
mpc_proc_clean();
- del_timer_sync(&mpc_timer);
+ timer_delete_sync(&mpc_timer);
unregister_netdevice_notifier(&mpoa_notifier);
deregister_atm_ioctl(&atm_ioctl_ops);
@@ -1531,4 +1534,5 @@ static void __exit atm_mpoa_cleanup(void)
module_init(atm_mpoa_init);
module_exit(atm_mpoa_cleanup);
+MODULE_DESCRIPTION("Multi-Protocol Over ATM (MPOA) driver");
MODULE_LICENSE("GPL");
diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c
index 4bb418313720..f7a2f0e41105 100644
--- a/net/atm/mpoa_caches.c
+++ b/net/atm/mpoa_caches.c
@@ -180,8 +180,7 @@ static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc)
static void in_cache_put(in_cache_entry *entry)
{
if (refcount_dec_and_test(&entry->use)) {
- memset(entry, 0, sizeof(in_cache_entry));
- kfree(entry);
+ kfree_sensitive(entry);
}
}
@@ -416,8 +415,7 @@ static eg_cache_entry *eg_cache_get_by_src_ip(__be32 ipaddr,
static void eg_cache_put(eg_cache_entry *entry)
{
if (refcount_dec_and_test(&entry->use)) {
- memset(entry, 0, sizeof(eg_cache_entry));
- kfree(entry);
+ kfree_sensitive(entry);
}
}
diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c
index 46d6cd9a36ae..aaf64b953915 100644
--- a/net/atm/mpoa_proc.c
+++ b/net/atm/mpoa_proc.c
@@ -53,15 +53,12 @@ static ssize_t proc_mpc_write(struct file *file, const char __user *buff,
static int parse_qos(const char *buff);
-/*
- * Define allowed FILE OPERATIONS
- */
-static const struct file_operations mpc_file_operations = {
- .open = proc_mpc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = proc_mpc_write,
- .release = seq_release,
+static const struct proc_ops mpc_proc_ops = {
+ .proc_open = proc_mpc_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = proc_mpc_write,
+ .proc_release = seq_release,
};
/*
@@ -222,11 +219,12 @@ static ssize_t proc_mpc_write(struct file *file, const char __user *buff,
if (!page)
return -ENOMEM;
- for (p = page, len = 0; len < nbytes; p++, len++) {
+ for (p = page, len = 0; len < nbytes; p++) {
if (get_user(*p, buff++)) {
free_page((unsigned long)page);
return -EFAULT;
}
+ len += 1;
if (*p == '\0' || *p == '\n')
break;
}
@@ -290,7 +288,7 @@ int mpc_proc_init(void)
{
struct proc_dir_entry *p;
- p = proc_create(STAT_FILE_NAME, 0, atm_proc_root, &mpc_file_operations);
+ p = proc_create(STAT_FILE_NAME, 0, atm_proc_root, &mpc_proc_ops);
if (!p) {
pr_err("Unable to initialize /proc/atm/%s\n", STAT_FILE_NAME);
return -ENOMEM;
diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c
index d84227d75717..3e4f17d335fe 100644
--- a/net/atm/pppoatm.c
+++ b/net/atm/pppoatm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* net/atm/pppoatm.c - RFC2364 PPP over ATM/AAL5 */
/* Copyright 1999-2000 by Mitchell Blank Jr */
@@ -6,10 +7,6 @@
/* And help from Jens Axboe */
/*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*
* This driver provides the encapsulation and framing for sending
* and receiving PPP frames in ATM AAL5 PDUs.
@@ -104,9 +101,11 @@ static inline struct pppoatm_vcc *chan_to_pvcc(const struct ppp_channel *chan)
* doesn't want to be called in interrupt context, so we do it from
* a tasklet
*/
-static void pppoatm_wakeup_sender(unsigned long arg)
+static void pppoatm_wakeup_sender(struct tasklet_struct *t)
{
- ppp_output_wakeup((struct ppp_channel *) arg);
+ struct pppoatm_vcc *pvcc = from_tasklet(pvcc, t, wakeup_tasklet);
+
+ ppp_output_wakeup(&pvcc->chan);
}
static void pppoatm_release_cb(struct atm_vcc *atmvcc)
@@ -219,9 +218,7 @@ static void pppoatm_push(struct atm_vcc *atmvcc, struct sk_buff *skb)
pvcc->chan.mtu += LLC_LEN;
break;
}
- pr_debug("Couldn't autodetect yet (skb: %02X %02X %02X %02X %02X %02X)\n",
- skb->data[0], skb->data[1], skb->data[2],
- skb->data[3], skb->data[4], skb->data[5]);
+ pr_debug("Couldn't autodetect yet (skb: %6ph)\n", skb->data);
goto error;
case e_vc:
break;
@@ -394,11 +391,7 @@ static int pppoatm_assign_vcc(struct atm_vcc *atmvcc, void __user *arg)
struct atm_backend_ppp be;
struct pppoatm_vcc *pvcc;
int err;
- /*
- * Each PPPoATM instance has its own tasklet - this is just a
- * prototypical one used to initialize them
- */
- static const DECLARE_TASKLET(tasklet_proto, pppoatm_wakeup_sender, 0);
+
if (copy_from_user(&be, arg, sizeof be))
return -EFAULT;
if (be.encaps != PPPOATM_ENCAPS_AUTODETECT &&
@@ -420,8 +413,7 @@ static int pppoatm_assign_vcc(struct atm_vcc *atmvcc, void __user *arg)
pvcc->chan.ops = &pppoatm_ops;
pvcc->chan.mtu = atmvcc->qos.txtp.max_sdu - PPP_HDRLEN -
(be.encaps == e_vc ? 0 : LLC_LEN);
- pvcc->wakeup_tasklet = tasklet_proto;
- pvcc->wakeup_tasklet.data = (unsigned long) &pvcc->chan;
+ tasklet_setup(&pvcc->wakeup_tasklet, pppoatm_wakeup_sender);
err = ppp_register_channel(&pvcc->chan);
if (err != 0) {
kfree(pvcc);
diff --git a/net/atm/proc.c b/net/atm/proc.c
index 0b0495a41bbe..9bf736290e48 100644
--- a/net/atm/proc.c
+++ b/net/atm/proc.c
@@ -36,9 +36,9 @@
static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
size_t count, loff_t *pos);
-static const struct file_operations proc_atm_dev_ops = {
- .read = proc_dev_atm_read,
- .llseek = noop_llseek,
+static const struct proc_ops atm_dev_proc_ops = {
+ .proc_read = proc_dev_atm_read,
+ .proc_lseek = noop_llseek,
};
static void add_stats(struct seq_file *seq, const char *aal,
@@ -108,7 +108,7 @@ out:
static inline void *vcc_walk(struct seq_file *seq, loff_t l)
{
struct vcc_state *state = seq->private;
- int family = (uintptr_t)(PDE_DATA(file_inode(seq->file)));
+ int family = (uintptr_t)(pde_data(file_inode(seq->file)));
return __vcc_walk(&state->sk, family, &state->bucket, l) ?
state : NULL;
@@ -134,7 +134,7 @@ static void vcc_seq_stop(struct seq_file *seq, void *v)
static void *vcc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
v = vcc_walk(seq, 1);
- *pos += !!PTR_ERR(v);
+ (*pos)++;
return v;
}
@@ -324,7 +324,7 @@ static ssize_t proc_dev_atm_read(struct file *file, char __user *buf,
page = get_zeroed_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
- dev = PDE_DATA(file_inode(file));
+ dev = pde_data(file_inode(file));
if (!dev->ops->proc_read)
length = -EINVAL;
else {
@@ -359,7 +359,7 @@ int atm_proc_dev_register(struct atm_dev *dev)
goto err_out;
dev->proc_entry = proc_create_data(dev->proc_name, 0, atm_proc_root,
- &proc_atm_dev_ops, dev);
+ &atm_dev_proc_ops, dev);
if (!dev->proc_entry)
goto err_free_name;
return 0;
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 2cb10af16afc..8f5e76f5dd9e 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -24,7 +24,7 @@ static int pvc_shutdown(struct socket *sock, int how)
return 0;
}
-static int pvc_bind(struct socket *sock, struct sockaddr *sockaddr,
+static int pvc_bind(struct socket *sock, struct sockaddr_unsized *sockaddr,
int sockaddr_len)
{
struct sock *sk = sock->sk;
@@ -56,14 +56,14 @@ out:
return error;
}
-static int pvc_connect(struct socket *sock, struct sockaddr *sockaddr,
+static int pvc_connect(struct socket *sock, struct sockaddr_unsized *sockaddr,
int sockaddr_len, int flags)
{
return pvc_bind(sock, sockaddr, sockaddr_len);
}
static int pvc_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
int error;
@@ -118,6 +118,7 @@ static const struct proto_ops pvc_proto_ops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = vcc_compat_ioctl,
#endif
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = pvc_shutdown,
.setsockopt = pvc_setsockopt,
@@ -125,7 +126,6 @@ static const struct proto_ops pvc_proto_ops = {
.sendmsg = vcc_sendmsg,
.recvmsg = vcc_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
};
diff --git a/net/atm/raw.c b/net/atm/raw.c
index b3ba44aab0ee..1e6511ec842c 100644
--- a/net/atm/raw.c
+++ b/net/atm/raw.c
@@ -36,7 +36,7 @@ static void atm_pop_raw(struct atm_vcc *vcc, struct sk_buff *skb)
pr_debug("(%d) %d -= %d\n",
vcc->vci, sk_wmem_alloc_get(sk), ATM_SKB(skb)->acct_truesize);
- WARN_ON(refcount_sub_and_test(ATM_SKB(skb)->acct_truesize, &sk->sk_wmem_alloc));
+ atm_return_tx(vcc, skb);
dev_kfree_skb_any(skb);
sk->sk_write_space(sk);
}
@@ -54,6 +54,8 @@ static int atm_send_aal0(struct atm_vcc *vcc, struct sk_buff *skb)
kfree_skb(skb);
return -EADDRNOTAVAIL;
}
+ if (vcc->dev->ops->send_bh)
+ return vcc->dev->ops->send_bh(vcc, skb);
return vcc->dev->ops->send(vcc, skb);
}
@@ -71,7 +73,10 @@ int atm_init_aal34(struct atm_vcc *vcc)
vcc->push = atm_push_raw;
vcc->pop = atm_pop_raw;
vcc->push_oam = NULL;
- vcc->send = vcc->dev->ops->send;
+ if (vcc->dev->ops->send_bh)
+ vcc->send = vcc->dev->ops->send_bh;
+ else
+ vcc->send = vcc->dev->ops->send;
return 0;
}
@@ -80,7 +85,10 @@ int atm_init_aal5(struct atm_vcc *vcc)
vcc->push = atm_push_raw;
vcc->pop = atm_pop_raw;
vcc->push_oam = NULL;
- vcc->send = vcc->dev->ops->send;
+ if (vcc->dev->ops->send_bh)
+ vcc->send = vcc->dev->ops->send_bh;
+ else
+ vcc->send = vcc->dev->ops->send;
return 0;
}
EXPORT_SYMBOL(atm_init_aal5);
diff --git a/net/atm/resources.c b/net/atm/resources.c
index bada395ecdb1..7c6fdedbcf4e 100644
--- a/net/atm/resources.c
+++ b/net/atm/resources.c
@@ -52,10 +52,8 @@ static struct atm_dev *__alloc_atm_dev(const char *type)
static struct atm_dev *__atm_dev_lookup(int number)
{
struct atm_dev *dev;
- struct list_head *p;
- list_for_each(p, &atm_devs) {
- dev = list_entry(p, struct atm_dev, dev_list);
+ list_for_each_entry(dev, &atm_devs, dev_list) {
if (dev->number == number) {
atm_dev_hold(dev);
return dev;
@@ -114,7 +112,9 @@ struct atm_dev *atm_dev_register(const char *type, struct device *parent,
if (atm_proc_dev_register(dev) < 0) {
pr_err("atm_proc_dev_register failed for dev %s\n", type);
- goto out_fail;
+ mutex_unlock(&atm_dev_mutex);
+ kfree(dev);
+ return NULL;
}
if (atm_register_sysfs(dev, parent) < 0) {
@@ -130,7 +130,7 @@ out:
return dev;
out_fail:
- kfree(dev);
+ put_device(&dev->class_dev);
dev = NULL;
goto out;
}
@@ -148,11 +148,10 @@ void atm_dev_deregister(struct atm_dev *dev)
*/
mutex_lock(&atm_dev_mutex);
list_del(&dev->dev_list);
- mutex_unlock(&atm_dev_mutex);
-
atm_dev_release_vccs(dev);
atm_unregister_sysfs(dev);
atm_proc_dev_deregister(dev);
+ mutex_unlock(&atm_dev_mutex);
atm_dev_put(dev);
}
@@ -193,92 +192,47 @@ static int fetch_stats(struct atm_dev *dev, struct atm_dev_stats __user *arg,
return error ? -EFAULT : 0;
}
-int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat)
+int atm_getnames(void __user *buf, int __user *iobuf_len)
{
- void __user *buf;
- int error, len, number, size = 0;
+ int error, len, size = 0;
struct atm_dev *dev;
struct list_head *p;
int *tmp_buf, *tmp_p;
- int __user *sioc_len;
- int __user *iobuf_len;
-
-#ifndef CONFIG_COMPAT
- compat = 0; /* Just so the compiler _knows_ */
-#endif
- switch (cmd) {
- case ATM_GETNAMES:
- if (compat) {
-#ifdef CONFIG_COMPAT
- struct compat_atm_iobuf __user *ciobuf = arg;
- compat_uptr_t cbuf;
- iobuf_len = &ciobuf->length;
- if (get_user(cbuf, &ciobuf->buffer))
- return -EFAULT;
- buf = compat_ptr(cbuf);
-#endif
- } else {
- struct atm_iobuf __user *iobuf = arg;
- iobuf_len = &iobuf->length;
- if (get_user(buf, &iobuf->buffer))
- return -EFAULT;
- }
- if (get_user(len, iobuf_len))
- return -EFAULT;
- mutex_lock(&atm_dev_mutex);
- list_for_each(p, &atm_devs)
- size += sizeof(int);
- if (size > len) {
- mutex_unlock(&atm_dev_mutex);
- return -E2BIG;
- }
- tmp_buf = kmalloc(size, GFP_ATOMIC);
- if (!tmp_buf) {
- mutex_unlock(&atm_dev_mutex);
- return -ENOMEM;
- }
- tmp_p = tmp_buf;
- list_for_each(p, &atm_devs) {
- dev = list_entry(p, struct atm_dev, dev_list);
- *tmp_p++ = dev->number;
- }
+ if (get_user(len, iobuf_len))
+ return -EFAULT;
+ mutex_lock(&atm_dev_mutex);
+ list_for_each(p, &atm_devs)
+ size += sizeof(int);
+ if (size > len) {
mutex_unlock(&atm_dev_mutex);
- error = ((copy_to_user(buf, tmp_buf, size)) ||
- put_user(size, iobuf_len))
- ? -EFAULT : 0;
- kfree(tmp_buf);
- return error;
- default:
- break;
+ return -E2BIG;
}
-
- if (compat) {
-#ifdef CONFIG_COMPAT
- struct compat_atmif_sioc __user *csioc = arg;
- compat_uptr_t carg;
-
- sioc_len = &csioc->length;
- if (get_user(carg, &csioc->arg))
- return -EFAULT;
- buf = compat_ptr(carg);
-
- if (get_user(len, &csioc->length))
- return -EFAULT;
- if (get_user(number, &csioc->number))
- return -EFAULT;
-#endif
- } else {
- struct atmif_sioc __user *sioc = arg;
-
- sioc_len = &sioc->length;
- if (get_user(buf, &sioc->arg))
- return -EFAULT;
- if (get_user(len, &sioc->length))
- return -EFAULT;
- if (get_user(number, &sioc->number))
- return -EFAULT;
+ tmp_buf = kmalloc(size, GFP_ATOMIC);
+ if (!tmp_buf) {
+ mutex_unlock(&atm_dev_mutex);
+ return -ENOMEM;
}
+ tmp_p = tmp_buf;
+ list_for_each_entry(dev, &atm_devs, dev_list) {
+ *tmp_p++ = dev->number;
+ }
+ mutex_unlock(&atm_dev_mutex);
+ error = ((copy_to_user(buf, tmp_buf, size)) ||
+ put_user(size, iobuf_len))
+ ? -EFAULT : 0;
+ kfree(tmp_buf);
+ return error;
+}
+
+int atm_dev_ioctl(unsigned int cmd, void __user *buf, int __user *sioc_len,
+ int number, int compat)
+{
+ int error, len, size = 0;
+ struct atm_dev *dev;
+
+ if (get_user(len, sioc_len))
+ return -EFAULT;
dev = try_then_request_module(atm_dev_lookup(number), "atm-device-%d",
number);
@@ -310,7 +264,7 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat)
goto done;
}
}
- /* fall through */
+ fallthrough;
case ATM_SETESIF:
{
unsigned char esi[ESI_LEN];
@@ -332,7 +286,7 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat)
error = -EPERM;
goto done;
}
- /* fall through */
+ fallthrough;
case ATM_GETSTAT:
size = sizeof(struct atm_dev_stats);
error = fetch_stats(dev, buf, cmd == ATM_GETSTATZ);
@@ -405,7 +359,7 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat)
error = -EINVAL;
goto done;
}
- /* fall through */
+ fallthrough;
case ATM_SETCIRANGE:
case SONET_GETSTATZ:
case SONET_SETDIAG:
@@ -415,9 +369,9 @@ int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat)
error = -EPERM;
goto done;
}
- /* fall through */
+ fallthrough;
default:
- if (compat) {
+ if (IS_ENABLED(CONFIG_COMPAT) && compat) {
#ifdef CONFIG_COMPAT
if (!dev->ops->compat_ioctl) {
error = -EINVAL;
@@ -447,6 +401,7 @@ done:
return error;
}
+#ifdef CONFIG_PROC_FS
void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos)
{
mutex_lock(&atm_dev_mutex);
@@ -462,3 +417,4 @@ void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
return seq_list_next(v, &atm_devs, pos);
}
+#endif
diff --git a/net/atm/resources.h b/net/atm/resources.h
index 048232e4d4c6..4a0839e92ff3 100644
--- a/net/atm/resources.h
+++ b/net/atm/resources.h
@@ -14,8 +14,9 @@
extern struct list_head atm_devs;
extern struct mutex atm_dev_mutex;
-int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat);
-
+int atm_getnames(void __user *buf, int __user *iobuf_len);
+int atm_dev_ioctl(unsigned int cmd, void __user *buf, int __user *sioc_len,
+ int number, int compat);
#ifdef CONFIG_PROC_FS
diff --git a/net/atm/signaling.c b/net/atm/signaling.c
index 6c11cdf4dd4c..e70ae2c113f9 100644
--- a/net/atm/signaling.c
+++ b/net/atm/signaling.c
@@ -52,7 +52,7 @@ static void modify_qos(struct atm_vcc *vcc, struct atmsvc_msg *msg)
msg->type = as_okay;
}
/*
- * Should probably just turn around the old skb. But the, the buffer
+ * Should probably just turn around the old skb. But then, the buffer
* space accounting needs to follow the change too. Maybe later.
*/
while (!(skb = alloc_skb(sizeof(struct atmsvc_msg), GFP_KERNEL)))
@@ -109,7 +109,7 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb)
dev_kfree_skb(skb);
goto as_indicate_complete;
}
- sk->sk_ack_backlog++;
+ sk_acceptq_added(sk);
skb_queue_tail(&sk->sk_receive_queue, skb);
pr_debug("waking sk_sleep(sk) 0x%p\n", sk_sleep(sk));
sk->sk_state_change(sk);
@@ -125,7 +125,7 @@ as_indicate_complete:
break;
case as_addparty:
case as_dropparty:
- sk->sk_err_soft = -msg->reply;
+ WRITE_ONCE(sk->sk_err_soft, -msg->reply);
/* < 0 failure, otherwise ep_ref */
clear_bit(ATM_VF_WAITING, &vcc->flags);
break;
diff --git a/net/atm/svc.c b/net/atm/svc.c
index 2f91b766ac42..005964250ecd 100644
--- a/net/atm/svc.c
+++ b/net/atm/svc.c
@@ -28,6 +28,11 @@
#include "signaling.h"
#include "addr.h"
+#ifdef CONFIG_COMPAT
+/* It actually takes struct sockaddr_atmsvc, not struct atm_iobuf */
+#define COMPAT_ATM_ADDPARTY _IOW('a', ATMIOC_SPECIAL + 4, struct compat_atm_iobuf)
+#endif
+
static int svc_create(struct net *net, struct socket *sock, int protocol,
int kern);
@@ -92,7 +97,7 @@ static int svc_release(struct socket *sock)
return 0;
}
-static int svc_bind(struct socket *sock, struct sockaddr *sockaddr,
+static int svc_bind(struct socket *sock, struct sockaddr_unsized *sockaddr,
int sockaddr_len)
{
DEFINE_WAIT(wait);
@@ -148,7 +153,7 @@ out:
return error;
}
-static int svc_connect(struct socket *sock, struct sockaddr *sockaddr,
+static int svc_connect(struct socket *sock, struct sockaddr_unsized *sockaddr,
int sockaddr_len, int flags)
{
DEFINE_WAIT(wait);
@@ -319,8 +324,8 @@ out:
return error;
}
-static int svc_accept(struct socket *sock, struct socket *newsock, int flags,
- bool kern)
+static int svc_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
struct sock *sk = sock->sk;
struct sk_buff *skb;
@@ -331,7 +336,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags,
lock_sock(sk);
- error = svc_create(sock_net(sk), newsock, 0, kern);
+ error = svc_create(sock_net(sk), newsock, 0, arg->kern);
if (error)
goto out;
@@ -350,7 +355,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags,
error = -sk->sk_err;
break;
}
- if (flags & O_NONBLOCK) {
+ if (arg->flags & O_NONBLOCK) {
error = -EAGAIN;
break;
}
@@ -381,7 +386,7 @@ static int svc_accept(struct socket *sock, struct socket *newsock, int flags,
msg->pvc.sap_addr.vpi,
msg->pvc.sap_addr.vci);
dev_kfree_skb(skb);
- sk->sk_ack_backlog--;
+ sk_acceptq_removed(sk);
if (error) {
sigd_enq2(NULL, as_reject, old_vcc, NULL, NULL,
&old_vcc->qos, error);
@@ -451,7 +456,7 @@ int svc_change_qos(struct atm_vcc *vcc, struct atm_qos *qos)
}
static int svc_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct atm_vcc *vcc = ATM_SD(sock);
@@ -464,7 +469,7 @@ static int svc_setsockopt(struct socket *sock, int level, int optname,
error = -EINVAL;
goto out;
}
- if (copy_from_user(&vcc->sap, optval, optlen)) {
+ if (copy_from_sockptr(&vcc->sap, optval, optlen)) {
error = -EFAULT;
goto out;
}
@@ -475,7 +480,7 @@ static int svc_setsockopt(struct socket *sock, int level, int optname,
error = -EINVAL;
goto out;
}
- if (get_user(value, (int __user *)optval)) {
+ if (copy_from_sockptr(&value, optval, sizeof(int))) {
error = -EFAULT;
goto out;
}
@@ -641,6 +646,7 @@ static const struct proto_ops svc_proto_ops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = svc_compat_ioctl,
#endif
+ .gettstamp = sock_gettstamp,
.listen = svc_listen,
.shutdown = svc_shutdown,
.setsockopt = svc_setsockopt,
@@ -648,7 +654,6 @@ static const struct proto_ops svc_proto_ops = {
.sendmsg = vcc_sendmsg,
.recvmsg = vcc_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
};
diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig
index 705e53ef4af0..e23a3dc14b93 100644
--- a/net/ax25/Kconfig
+++ b/net/ax25/Kconfig
@@ -1,15 +1,16 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Amateur Radio protocols and AX.25 device configuration
#
menuconfig HAMRADIO
- depends on NET && !S390
+ depends on NET
bool "Amateur Radio support"
help
If you want to connect your Linux box to an amateur radio, answer Y
- here. You want to read <http://www.tapr.org/>
+ here. You want to read <https://www.tapr.org/>
and more specifically about AX.25 on Linux
- <http://www.linux-ax25.org/>.
+ <https://linux-ax25.in-berlin.de>.
Note that the answer to this question won't directly affect the
kernel: saying N will just cause the configurator to skip all
@@ -38,11 +39,11 @@ config AX25
Information about where to get supporting software for Linux amateur
radio as well as information about how to configure an AX.25 port is
contained in the AX25-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>. You might also want to
- check out the file <file:Documentation/networking/ax25.txt> in the
+ <https://www.tldp.org/docs.html#howto>. You might also want to
+ check out the file <file:Documentation/networking/ax25.rst> in the
kernel source. More information about digital amateur radio in
general is on the WWW at
- <http://www.tapr.org/>.
+ <https://www.tapr.org/>.
To compile this driver as a module, choose M here: the
module will be called ax25.
@@ -60,7 +61,7 @@ config AX25_DAMA_SLAVE
configuration. Linux cannot yet act as a DAMA server. This option
only compiles DAMA slave support into the kernel. It still needs to
be enabled at runtime. For more about DAMA see
- <http://www.linux-ax25.org>. If unsure, say Y.
+ <https://linux-ax25.in-berlin.de>. If unsure, say Y.
# placeholder until implemented
config AX25_DAMA_MASTER
@@ -86,10 +87,10 @@ config NETROM
A comprehensive listing of all the software for Linux amateur radio
users as well as information about how to configure an AX.25 port is
contained in the Linux Ham Wiki, available from
- <http://www.linux-ax25.org>. You also might want to check out the
- file <file:Documentation/networking/ax25.txt>. More information about
- digital amateur radio in general is on the WWW at
- <http://www.tapr.org/>.
+ <https://linux-ax25.in-berlin.de>. You also might want to check out
+ the file <file:Documentation/networking/ax25.rst>. More information
+ about digital amateur radio in general is on the WWW at
+ <https://www.tapr.org/>.
To compile this driver as a module, choose M here: the
module will be called netrom.
@@ -105,10 +106,10 @@ config ROSE
A comprehensive listing of all the software for Linux amateur radio
users as well as information about how to configure an AX.25 port is
contained in the Linux Ham Wiki, available from
- <http://www.linux-ax25.org>. You also might want to check out the
- file <file:Documentation/networking/ax25.txt>. More information about
- digital amateur radio in general is on the WWW at
- <http://www.tapr.org/>.
+ <https://linux-ax25.in-berlin.de>. You also might want to check out
+ the file <file:Documentation/networking/ax25.rst>. More information
+ about digital amateur radio in general is on the WWW at
+ <https://www.tapr.org/>.
To compile this driver as a module, choose M here: the
module will be called rose.
diff --git a/net/ax25/TODO b/net/ax25/TODO
deleted file mode 100644
index 69fb4e368d92..000000000000
--- a/net/ax25/TODO
+++ /dev/null
@@ -1,20 +0,0 @@
-Do the ax25_list_lock, ax25_dev_lock, linkfail_lockreally, ax25_frag_lock and
-listen_lock have to be bh-safe?
-
-Do the netrom and rose locks have to be bh-safe?
-
-A device might be deleted after lookup in the SIOCADDRT ioctl but before it's
-being used.
-
-Routes to a device being taken down might be deleted by ax25_rt_device_down
-but added by somebody else before the device has been deleted fully.
-
-The ax25_rt_find_route synopsys is pervert but I somehow had to deal with
-the race caused by the static variable in it's previous implementation.
-
-Implement proper socket locking in netrom and rose.
-
-Check socket locking when ax25_rcv is sending to raw sockets. In particular
-ax25_send_to_raw() seems fishy. Heck - ax25_rcv is fishy.
-
-Handle XID and TEST frames properly.
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index c603d33d5410..7ebbff2f0020 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
@@ -65,12 +62,12 @@ static void ax25_free_sock(struct sock *sk)
*/
static void ax25_cb_del(ax25_cb *ax25)
{
+ spin_lock_bh(&ax25_list_lock);
if (!hlist_unhashed(&ax25->ax25_node)) {
- spin_lock_bh(&ax25_list_lock);
hlist_del_init(&ax25->ax25_node);
- spin_unlock_bh(&ax25_list_lock);
ax25_cb_put(ax25);
}
+ spin_unlock_bh(&ax25_list_lock);
}
/*
@@ -80,19 +77,39 @@ static void ax25_kill_by_device(struct net_device *dev)
{
ax25_dev *ax25_dev;
ax25_cb *s;
+ struct sock *sk;
if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
return;
+ ax25_dev->device_up = false;
spin_lock_bh(&ax25_list_lock);
again:
ax25_for_each(s, &ax25_list) {
if (s->ax25_dev == ax25_dev) {
- s->ax25_dev = NULL;
+ sk = s->sk;
+ if (!sk) {
+ spin_unlock_bh(&ax25_list_lock);
+ ax25_disconnect(s, ENETUNREACH);
+ s->ax25_dev = NULL;
+ ax25_cb_del(s);
+ spin_lock_bh(&ax25_list_lock);
+ goto again;
+ }
+ sock_hold(sk);
spin_unlock_bh(&ax25_list_lock);
+ lock_sock(sk);
ax25_disconnect(s, ENETUNREACH);
+ s->ax25_dev = NULL;
+ if (sk->sk_socket) {
+ netdev_put(ax25_dev->dev,
+ &s->dev_tracker);
+ ax25_dev_put(ax25_dev);
+ }
+ ax25_cb_del(s);
+ release_sock(sk);
spin_lock_bh(&ax25_list_lock);
-
+ sock_put(sk);
/* The entry could have been deleted from the
* list meanwhile and thus the next pointer is
* no longer valid. Play it safe and restart
@@ -205,7 +222,7 @@ struct sock *ax25_get_socket(ax25_address *my_addr, ax25_address *dest_addr,
* Find an AX.25 control block given both ends. It will only pick up
* floating AX.25 control blocks or non Raw socket bound control blocks.
*/
-ax25_cb *ax25_find_cb(ax25_address *src_addr, ax25_address *dest_addr,
+ax25_cb *ax25_find_cb(const ax25_address *src_addr, ax25_address *dest_addr,
ax25_digi *digi, struct net_device *dev)
{
ax25_cb *s;
@@ -270,7 +287,7 @@ void ax25_destroy_socket(ax25_cb *);
*/
static void ax25_destroy_timer(struct timer_list *t)
{
- ax25_cb *ax25 = from_timer(ax25, t, dtimer);
+ ax25_cb *ax25 = timer_container_of(ax25, t, dtimer);
struct sock *sk;
sk=ax25->sk;
@@ -356,21 +373,25 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg)
if (copy_from_user(&ax25_ctl, arg, sizeof(ax25_ctl)))
return -EFAULT;
- if ((ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr)) == NULL)
- return -ENODEV;
-
if (ax25_ctl.digi_count > AX25_MAX_DIGIS)
return -EINVAL;
if (ax25_ctl.arg > ULONG_MAX / HZ && ax25_ctl.cmd != AX25_KILL)
return -EINVAL;
+ ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr);
+ if (!ax25_dev)
+ return -ENODEV;
+
digi.ndigi = ax25_ctl.digi_count;
for (k = 0; k < digi.ndigi; k++)
digi.calls[k] = ax25_ctl.digi_addr[k];
- if ((ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev)) == NULL)
+ ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev);
+ if (!ax25) {
+ ax25_dev_put(ax25_dev);
return -ENOTCONN;
+ }
switch (ax25_ctl.cmd) {
case AX25_KILL:
@@ -437,6 +458,7 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg)
}
out_put:
+ ax25_dev_put(ax25_dev);
ax25_cb_put(ax25);
return ret;
@@ -445,7 +467,7 @@ einval_put:
goto out_put;
}
-static void ax25_fillin_cb_from_dev(ax25_cb *ax25, ax25_dev *ax25_dev)
+static void ax25_fillin_cb_from_dev(ax25_cb *ax25, const ax25_dev *ax25_dev)
{
ax25->rtt = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]) / 2;
ax25->t1 = msecs_to_jiffies(ax25_dev->values[AX25_VALUES_T1]);
@@ -531,13 +553,13 @@ ax25_cb *ax25_create_cb(void)
*/
static int ax25_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
ax25_cb *ax25;
struct net_device *dev;
char devname[IFNAMSIZ];
- unsigned long opt;
+ unsigned int opt;
int res = 0;
if (level != SOL_AX25)
@@ -546,7 +568,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
if (optlen < sizeof(unsigned int))
return -EINVAL;
- if (get_user(opt, (unsigned int __user *)optval))
+ if (copy_from_sockptr(&opt, optval, sizeof(unsigned int)))
return -EFAULT;
lock_sock(sk);
@@ -569,7 +591,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
break;
case AX25_T1:
- if (opt < 1 || opt > ULONG_MAX / HZ) {
+ if (opt < 1 || opt > UINT_MAX / HZ) {
res = -EINVAL;
break;
}
@@ -578,7 +600,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
break;
case AX25_T2:
- if (opt < 1 || opt > ULONG_MAX / HZ) {
+ if (opt < 1 || opt > UINT_MAX / HZ) {
res = -EINVAL;
break;
}
@@ -594,7 +616,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
break;
case AX25_T3:
- if (opt < 1 || opt > ULONG_MAX / HZ) {
+ if (opt < 1 || opt > UINT_MAX / HZ) {
res = -EINVAL;
break;
}
@@ -602,7 +624,7 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
break;
case AX25_IDLE:
- if (opt > ULONG_MAX / (60 * HZ)) {
+ if (opt > UINT_MAX / (60 * HZ)) {
res = -EINVAL;
break;
}
@@ -638,10 +660,12 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
break;
case SO_BINDTODEVICE:
- if (optlen > IFNAMSIZ)
- optlen = IFNAMSIZ;
+ if (optlen > IFNAMSIZ - 1)
+ optlen = IFNAMSIZ - 1;
+
+ memset(devname, 0, sizeof(devname));
- if (copy_from_user(devname, optval, optlen)) {
+ if (copy_from_sockptr(devname, optval, optlen)) {
res = -EFAULT;
break;
}
@@ -653,15 +677,33 @@ static int ax25_setsockopt(struct socket *sock, int level, int optname,
break;
}
- dev = dev_get_by_name(&init_net, devname);
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(&init_net, devname);
if (!dev) {
+ rcu_read_unlock();
res = -ENODEV;
break;
}
+ if (ax25->ax25_dev) {
+ if (dev == ax25->ax25_dev->dev) {
+ rcu_read_unlock();
+ break;
+ }
+ netdev_put(ax25->ax25_dev->dev, &ax25->dev_tracker);
+ ax25_dev_put(ax25->ax25_dev);
+ }
+
ax25->ax25_dev = ax25_dev_ax25dev(dev);
+ if (!ax25->ax25_dev) {
+ rcu_read_unlock();
+ res = -ENODEV;
+ break;
+ }
ax25_fillin_cb(ax25, ax25->ax25_dev);
- dev_put(dev);
+ netdev_hold(dev, &ax25->dev_tracker, GFP_ATOMIC);
+ ax25_dev_hold(ax25->ax25_dev);
+ rcu_read_unlock();
break;
default:
@@ -692,7 +734,7 @@ static int ax25_getsockopt(struct socket *sock, int level, int optname,
if (maxlen < 1)
return -EFAULT;
- valptr = (void *) &val;
+ valptr = &val;
length = min_t(unsigned int, maxlen, sizeof(int));
lock_sock(sk);
@@ -747,14 +789,14 @@ static int ax25_getsockopt(struct socket *sock, int level, int optname,
ax25_dev = ax25->ax25_dev;
if (ax25_dev != NULL && ax25_dev->dev != NULL) {
- strlcpy(devname, ax25_dev->dev->name, sizeof(devname));
+ strscpy(devname, ax25_dev->dev->name, sizeof(devname));
length = strlen(devname) + 1;
} else {
*devname = '\0';
length = 1;
}
- valptr = (void *) devname;
+ valptr = devname;
break;
default:
@@ -804,7 +846,7 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol,
struct sock *sk;
ax25_cb *ax25;
- if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+ if (protocol < 0 || protocol > U8_MAX)
return -EINVAL;
if (!net_eq(net, &init_net))
@@ -844,6 +886,7 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol,
case AX25_P_ROSE:
if (ax25_protocol_is_registered(AX25_P_ROSE))
return -ESOCKTNOSUPPORT;
+ break;
#endif
default:
break;
@@ -851,6 +894,8 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol,
break;
case SOCK_RAW:
+ if (!capable(CAP_NET_RAW))
+ return -EPERM;
break;
default:
return -ESOCKTNOSUPPORT;
@@ -905,7 +950,7 @@ struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev)
sock_init_data(NULL, sk);
sk->sk_type = osk->sk_type;
- sk->sk_priority = osk->sk_priority;
+ sk->sk_priority = READ_ONCE(osk->sk_priority);
sk->sk_protocol = osk->sk_protocol;
sk->sk_rcvbuf = osk->sk_rcvbuf;
sk->sk_sndbuf = osk->sk_sndbuf;
@@ -951,21 +996,25 @@ static int ax25_release(struct socket *sock)
{
struct sock *sk = sock->sk;
ax25_cb *ax25;
+ ax25_dev *ax25_dev;
if (sk == NULL)
return 0;
sock_hold(sk);
- sock_orphan(sk);
lock_sock(sk);
+ sock_orphan(sk);
ax25 = sk_to_ax25(sk);
+ ax25_dev = ax25->ax25_dev;
if (sk->sk_type == SOCK_SEQPACKET) {
switch (ax25->state) {
case AX25_STATE_0:
- release_sock(sk);
- ax25_disconnect(ax25, 0);
- lock_sock(sk);
+ if (!sock_flag(ax25->sk, SOCK_DEAD)) {
+ release_sock(sk);
+ ax25_disconnect(ax25, 0);
+ lock_sock(sk);
+ }
ax25_destroy_socket(ax25);
break;
@@ -1020,6 +1069,17 @@ static int ax25_release(struct socket *sock)
sk->sk_state_change(sk);
ax25_destroy_socket(ax25);
}
+ if (ax25_dev) {
+ if (!ax25_dev->device_up) {
+ timer_delete_sync(&ax25->timer);
+ timer_delete_sync(&ax25->t1timer);
+ timer_delete_sync(&ax25->t2timer);
+ timer_delete_sync(&ax25->t3timer);
+ timer_delete_sync(&ax25->idletimer);
+ }
+ netdev_put(ax25_dev->dev, &ax25->dev_tracker);
+ ax25_dev_put(ax25_dev);
+ }
sock->sk = NULL;
release_sock(sk);
@@ -1034,7 +1094,7 @@ static int ax25_release(struct socket *sock)
* that we've implemented support for SO_BINDTODEVICE. It is however small
* and trivially backward compatible.
*/
-static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+static int ax25_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
{
struct sock *sk = sock->sk;
struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr;
@@ -1096,8 +1156,10 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
}
}
- if (ax25_dev != NULL)
+ if (ax25_dev) {
ax25_fillin_cb(ax25, ax25_dev);
+ netdev_hold(ax25_dev->dev, &ax25->dev_tracker, GFP_ATOMIC);
+ }
done:
ax25_cb_add(ax25);
@@ -1113,7 +1175,7 @@ out:
* FIXME: nonblock behaviour looks like it may have a bug.
*/
static int __must_check ax25_connect(struct socket *sock,
- struct sockaddr *uaddr, int addr_len, int flags)
+ struct sockaddr_unsized *uaddr, int addr_len, int flags)
{
struct sock *sk = sock->sk;
ax25_cb *ax25 = sk_to_ax25(sk), *ax25t;
@@ -1179,7 +1241,10 @@ static int __must_check ax25_connect(struct socket *sock,
if (addr_len > sizeof(struct sockaddr_ax25) &&
fsa->fsa_ax25.sax25_ndigis != 0) {
/* Valid number of digipeaters ? */
- if (fsa->fsa_ax25.sax25_ndigis < 1 || fsa->fsa_ax25.sax25_ndigis > AX25_MAX_DIGIS) {
+ if (fsa->fsa_ax25.sax25_ndigis < 1 ||
+ fsa->fsa_ax25.sax25_ndigis > AX25_MAX_DIGIS ||
+ addr_len < sizeof(struct sockaddr_ax25) +
+ sizeof(ax25_address) * fsa->fsa_ax25.sax25_ndigis) {
err = -EINVAL;
goto out_release;
}
@@ -1205,28 +1270,18 @@ static int __must_check ax25_connect(struct socket *sock,
}
}
- /*
- * Must bind first - autobinding in this may or may not work. If
- * the socket is already bound, check to see if the device has
- * been filled in, error if it hasn't.
- */
+ /* Must bind first - autobinding does not work. */
if (sock_flag(sk, SOCK_ZAPPED)) {
- /* check if we can remove this feature. It is broken. */
- printk(KERN_WARNING "ax25_connect(): %s uses autobind, please contact jreuter@yaina.de\n",
- current->comm);
- if ((err = ax25_rt_autobind(ax25, &fsa->fsa_ax25.sax25_call)) < 0) {
- kfree(digi);
- goto out_release;
- }
+ kfree(digi);
+ err = -EINVAL;
+ goto out_release;
+ }
- ax25_fillin_cb(ax25, ax25->ax25_dev);
- ax25_cb_add(ax25);
- } else {
- if (ax25->ax25_dev == NULL) {
- kfree(digi);
- err = -EHOSTUNREACH;
- goto out_release;
- }
+ /* Check to see if the device has been filled in, error if it hasn't. */
+ if (ax25->ax25_dev == NULL) {
+ kfree(digi);
+ err = -EHOSTUNREACH;
+ goto out_release;
}
if (sk->sk_type == SOCK_SEQPACKET &&
@@ -1319,13 +1374,15 @@ out_release:
return err;
}
-static int ax25_accept(struct socket *sock, struct socket *newsock, int flags,
- bool kern)
+static int ax25_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
struct sk_buff *skb;
struct sock *newsk;
+ ax25_dev *ax25_dev;
DEFINE_WAIT(wait);
struct sock *sk;
+ ax25_cb *ax25;
int err = 0;
if (sock->state != SS_UNCONNECTED)
@@ -1355,7 +1412,7 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags,
if (skb)
break;
- if (flags & O_NONBLOCK) {
+ if (arg->flags & O_NONBLOCK) {
err = -EWOULDBLOCK;
break;
}
@@ -1378,8 +1435,12 @@ static int ax25_accept(struct socket *sock, struct socket *newsock, int flags,
/* Now attach up the new socket */
kfree_skb(skb);
- sk->sk_ack_backlog--;
+ sk_acceptq_removed(sk);
newsock->state = SS_CONNECTED;
+ ax25 = sk_to_ax25(newsk);
+ ax25_dev = ax25->ax25_dev;
+ netdev_hold(ax25_dev->dev, &ax25->dev_tracker, GFP_ATOMIC);
+ ax25_dev_hold(ax25_dev);
out:
release_sock(sk);
@@ -1499,7 +1560,10 @@ static int ax25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)usax;
/* Valid number of digipeaters ? */
- if (usax->sax25_ndigis < 1 || usax->sax25_ndigis > AX25_MAX_DIGIS) {
+ if (usax->sax25_ndigis < 1 ||
+ usax->sax25_ndigis > AX25_MAX_DIGIS ||
+ addr_len < sizeof(struct sockaddr_ax25) +
+ sizeof(ax25_address) * usax->sax25_ndigis) {
err = -EINVAL;
goto out;
}
@@ -1605,9 +1669,12 @@ static int ax25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags)
{
struct sock *sk = sock->sk;
- struct sk_buff *skb;
+ struct sk_buff *skb, *last;
+ struct sk_buff_head *sk_queue;
int copied;
int err = 0;
+ int off = 0;
+ long timeo;
lock_sock(sk);
/*
@@ -1619,11 +1686,29 @@ static int ax25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
goto out;
}
- /* Now we can treat all alike */
- skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
- flags & MSG_DONTWAIT, &err);
- if (skb == NULL)
- goto out;
+ /* We need support for non-blocking reads. */
+ sk_queue = &sk->sk_receive_queue;
+ skb = __skb_try_recv_datagram(sk, sk_queue, flags, &off, &err, &last);
+ /* If no packet is available, release_sock(sk) and try again. */
+ if (!skb) {
+ if (err != -EAGAIN)
+ goto out;
+ release_sock(sk);
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ while (timeo && !__skb_wait_for_more_packets(sk, sk_queue, &err,
+ &timeo, last)) {
+ skb = __skb_try_recv_datagram(sk, sk_queue, flags, &off,
+ &err, &last);
+ if (skb)
+ break;
+
+ if (err != -EAGAIN)
+ goto done;
+ }
+ if (!skb)
+ goto done;
+ lock_sock(sk);
+ }
if (!sk_to_ax25(sk)->pidincl)
skb_pull(skb, 1); /* Remove PID */
@@ -1670,6 +1755,7 @@ static int ax25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
out:
release_sock(sk);
+done:
return err;
}
@@ -1707,14 +1793,6 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
break;
}
- case SIOCGSTAMP:
- res = sock_get_timestamp(sk, argp);
- break;
-
- case SIOCGSTAMPNS:
- res = sock_get_timestampns(sk, argp);
- break;
-
case SIOCAX25ADDUID: /* Add a uid to the uid/call map table */
case SIOCAX25DELUID: /* Delete a uid from the uid/call map table */
case SIOCAX25GETUID: {
@@ -1881,8 +1959,8 @@ static int ax25_info_show(struct seq_file *seq, void *v)
* magic dev src_addr dest_addr,digi1,digi2,.. st vs vr va t1 t1 t2 t2 t3 t3 idle idle n2 n2 rtt window paclen Snd-Q Rcv-Q inode
*/
- seq_printf(seq, "%8.8lx %s %s%s ",
- (long) ax25,
+ seq_printf(seq, "%p %s %s%s ",
+ ax25,
ax25->ax25_dev == NULL? "???" : ax25->ax25_dev->dev->name,
ax2asc(buf, &ax25->source_addr),
ax25->iamdigi? "*":"");
@@ -1943,6 +2021,7 @@ static const struct proto_ops ax25_proto_ops = {
.getname = ax25_getname,
.poll = datagram_poll,
.ioctl = ax25_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = ax25_listen,
.shutdown = ax25_shutdown,
.setsockopt = ax25_setsockopt,
@@ -1950,7 +2029,6 @@ static const struct proto_ops ax25_proto_ops = {
.sendmsg = ax25_sendmsg,
.recvmsg = ax25_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
};
/*
diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c
index a14cfa736b63..f68865a4d0ab 100644
--- a/net/ax25/ax25_addr.c
+++ b/net/ax25/ax25_addr.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
*/
diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c
index 9a3a301e1e2f..3733c0254a50 100644
--- a/net/ax25/ax25_dev.c
+++ b/net/ax25/ax25_dev.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
*/
@@ -25,11 +22,12 @@
#include <net/sock.h>
#include <linux/uaccess.h>
#include <linux/fcntl.h>
+#include <linux/list.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/init.h>
-ax25_dev *ax25_dev_list;
+static LIST_HEAD(ax25_dev_list);
DEFINE_SPINLOCK(ax25_dev_lock);
ax25_dev *ax25_addr_ax25dev(ax25_address *addr)
@@ -37,9 +35,11 @@ ax25_dev *ax25_addr_ax25dev(ax25_address *addr)
ax25_dev *ax25_dev, *res = NULL;
spin_lock_bh(&ax25_dev_lock);
- for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next)
- if (ax25cmp(addr, (ax25_address *)ax25_dev->dev->dev_addr) == 0) {
+ list_for_each_entry(ax25_dev, &ax25_dev_list, list)
+ if (ax25cmp(addr, (const ax25_address *)ax25_dev->dev->dev_addr) == 0) {
res = ax25_dev;
+ ax25_dev_hold(ax25_dev);
+ break;
}
spin_unlock_bh(&ax25_dev_lock);
@@ -54,15 +54,17 @@ void ax25_dev_device_up(struct net_device *dev)
{
ax25_dev *ax25_dev;
- if ((ax25_dev = kzalloc(sizeof(*ax25_dev), GFP_ATOMIC)) == NULL) {
+ ax25_dev = kzalloc(sizeof(*ax25_dev), GFP_KERNEL);
+ if (!ax25_dev) {
printk(KERN_ERR "AX.25: ax25_dev_device_up - out of memory\n");
return;
}
- dev->ax25_ptr = ax25_dev;
+ refcount_set(&ax25_dev->refcount, 1);
ax25_dev->dev = dev;
- dev_hold(dev);
+ netdev_hold(dev, &ax25_dev->dev_tracker, GFP_KERNEL);
ax25_dev->forward = NULL;
+ ax25_dev->device_up = true;
ax25_dev->values[AX25_VALUES_IPDEFMODE] = AX25_DEF_IPDEFMODE;
ax25_dev->values[AX25_VALUES_AXDEFMODE] = AX25_DEF_AXDEFMODE;
@@ -77,15 +79,18 @@ void ax25_dev_device_up(struct net_device *dev)
ax25_dev->values[AX25_VALUES_N2] = AX25_DEF_N2;
ax25_dev->values[AX25_VALUES_PACLEN] = AX25_DEF_PACLEN;
ax25_dev->values[AX25_VALUES_PROTOCOL] = AX25_DEF_PROTOCOL;
+
+#ifdef CONFIG_AX25_DAMA_SLAVE
ax25_dev->values[AX25_VALUES_DS_TIMEOUT]= AX25_DEF_DS_TIMEOUT;
+#endif
#if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER)
ax25_ds_setup_timer(ax25_dev);
#endif
spin_lock_bh(&ax25_dev_lock);
- ax25_dev->next = ax25_dev_list;
- ax25_dev_list = ax25_dev;
+ list_add(&ax25_dev->list, &ax25_dev_list);
+ rcu_assign_pointer(dev->ax25_ptr, ax25_dev);
spin_unlock_bh(&ax25_dev_lock);
ax25_register_dev_sysctl(ax25_dev);
@@ -103,37 +108,27 @@ void ax25_dev_device_down(struct net_device *dev)
spin_lock_bh(&ax25_dev_lock);
#ifdef CONFIG_AX25_DAMA_SLAVE
- ax25_ds_del_timer(ax25_dev);
+ timer_shutdown_sync(&ax25_dev->dama.slave_timer);
#endif
/*
* Remove any packet forwarding that points to this device.
*/
- for (s = ax25_dev_list; s != NULL; s = s->next)
+ list_for_each_entry(s, &ax25_dev_list, list)
if (s->forward == dev)
s->forward = NULL;
- if ((s = ax25_dev_list) == ax25_dev) {
- ax25_dev_list = s->next;
- spin_unlock_bh(&ax25_dev_lock);
- dev_put(dev);
- kfree(ax25_dev);
- return;
- }
-
- while (s != NULL && s->next != NULL) {
- if (s->next == ax25_dev) {
- s->next = ax25_dev->next;
- spin_unlock_bh(&ax25_dev_lock);
- dev_put(dev);
- kfree(ax25_dev);
- return;
+ list_for_each_entry(s, &ax25_dev_list, list) {
+ if (s == ax25_dev) {
+ list_del(&s->list);
+ break;
}
-
- s = s->next;
}
+
+ RCU_INIT_POINTER(dev->ax25_ptr, NULL);
spin_unlock_bh(&ax25_dev_lock);
- dev->ax25_ptr = NULL;
+ netdev_put(dev, &ax25_dev->dev_tracker);
+ ax25_dev_put(ax25_dev);
}
int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd)
@@ -145,20 +140,32 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd)
switch (cmd) {
case SIOCAX25ADDFWD:
- if ((fwd_dev = ax25_addr_ax25dev(&fwd->port_to)) == NULL)
+ fwd_dev = ax25_addr_ax25dev(&fwd->port_to);
+ if (!fwd_dev) {
+ ax25_dev_put(ax25_dev);
return -EINVAL;
- if (ax25_dev->forward != NULL)
+ }
+ if (ax25_dev->forward) {
+ ax25_dev_put(fwd_dev);
+ ax25_dev_put(ax25_dev);
return -EINVAL;
+ }
ax25_dev->forward = fwd_dev->dev;
+ ax25_dev_put(fwd_dev);
+ ax25_dev_put(ax25_dev);
break;
case SIOCAX25DELFWD:
- if (ax25_dev->forward == NULL)
+ if (!ax25_dev->forward) {
+ ax25_dev_put(ax25_dev);
return -EINVAL;
+ }
ax25_dev->forward = NULL;
+ ax25_dev_put(ax25_dev);
break;
default:
+ ax25_dev_put(ax25_dev);
return -EINVAL;
}
@@ -183,16 +190,13 @@ struct net_device *ax25_fwd_dev(struct net_device *dev)
*/
void __exit ax25_dev_free(void)
{
- ax25_dev *s, *ax25_dev;
+ ax25_dev *s, *n;
spin_lock_bh(&ax25_dev_lock);
- ax25_dev = ax25_dev_list;
- while (ax25_dev != NULL) {
- s = ax25_dev;
- dev_put(ax25_dev->dev);
- ax25_dev = ax25_dev->next;
- kfree(s);
+ list_for_each_entry_safe(s, n, &ax25_dev_list, list) {
+ netdev_put(s->dev, &s->dev_tracker);
+ list_del(&s->list);
+ ax25_dev_put(s);
}
- ax25_dev_list = NULL;
spin_unlock_bh(&ax25_dev_lock);
}
diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c
index 488fc2d7085a..c62f8fb06189 100644
--- a/net/ax25/ax25_ds_in.c
+++ b/net/ax25/ax25_ds_in.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
* Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c
index bc0329f43013..f00e27df3c76 100644
--- a/net/ax25/ax25_ds_subr.c
+++ b/net/ax25/ax25_ds_subr.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
* Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c
index e9d11313d45b..0c9e7775aa54 100644
--- a/net/ax25/ax25_ds_timer.c
+++ b/net/ax25/ax25_ds_timer.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
* Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de)
@@ -47,7 +44,7 @@ void ax25_ds_setup_timer(ax25_dev *ax25_dev)
void ax25_ds_del_timer(ax25_dev *ax25_dev)
{
if (ax25_dev)
- del_timer(&ax25_dev->dama.slave_timer);
+ timer_delete(&ax25_dev->dama.slave_timer);
}
void ax25_ds_set_timer(ax25_dev *ax25_dev)
@@ -67,7 +64,7 @@ void ax25_ds_set_timer(ax25_dev *ax25_dev)
static void ax25_ds_timeout(struct timer_list *t)
{
- ax25_dev *ax25_dev = from_timer(ax25_dev, t, dama.slave_timer);
+ ax25_dev *ax25_dev = timer_container_of(ax25_dev, t, dama.slave_timer);
ax25_cb *ax25;
if (ax25_dev == NULL || !ax25_dev->dama.slave)
diff --git a/net/ax25/ax25_iface.c b/net/ax25/ax25_iface.c
index 8c07c28569e4..979bc4b828a0 100644
--- a/net/ax25/ax25_iface.c
+++ b/net/ax25/ax25_iface.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
*/
@@ -101,7 +98,7 @@ void ax25_linkfail_release(struct ax25_linkfail *lf)
EXPORT_SYMBOL(ax25_linkfail_release);
-int ax25_listen_register(ax25_address *callsign, struct net_device *dev)
+int ax25_listen_register(const ax25_address *callsign, struct net_device *dev)
{
struct listen_struct *listen;
@@ -124,7 +121,7 @@ int ax25_listen_register(ax25_address *callsign, struct net_device *dev)
EXPORT_SYMBOL(ax25_listen_register);
-void ax25_listen_release(ax25_address *callsign, struct net_device *dev)
+void ax25_listen_release(const ax25_address *callsign, struct net_device *dev)
{
struct listen_struct *s, *listen;
@@ -174,7 +171,7 @@ int (*ax25_protocol_function(unsigned int pid))(struct sk_buff *, ax25_cb *)
return res;
}
-int ax25_listen_mine(ax25_address *callsign, struct net_device *dev)
+int ax25_listen_mine(const ax25_address *callsign, struct net_device *dev)
{
struct listen_struct *listen;
diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c
index 860752639b1a..f2d66af86359 100644
--- a/net/ax25/ax25_in.c
+++ b/net/ax25/ax25_in.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
@@ -184,7 +181,7 @@ static int ax25_process_rx_frame(ax25_cb *ax25, struct sk_buff *skb, int type, i
}
static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
- ax25_address *dev_addr, struct packet_type *ptype)
+ const ax25_address *dev_addr, struct packet_type *ptype)
{
ax25_address src, dest, *next_digi = NULL;
int type = 0, mine = 0, dama;
@@ -359,7 +356,7 @@ static int ax25_rcv(struct sk_buff *skb, struct net_device *dev,
make->sk_state = TCP_ESTABLISHED;
- sk->sk_ack_backlog++;
+ sk_acceptq_added(sk);
bh_unlock_sock(sk);
} else {
if (!mine)
@@ -436,6 +433,10 @@ free:
int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *ptype, struct net_device *orig_dev)
{
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_RX_DROP;
+
skb_orphan(skb);
if (!net_eq(dev_net(dev), &init_net)) {
@@ -450,5 +451,5 @@ int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev,
skb_pull(skb, AX25_KISS_HEADER_LEN); /* Remove the KISS byte */
- return ax25_rcv(skb, dev, (ax25_address *)dev->dev_addr, ptype);
+ return ax25_rcv(skb, dev, (const ax25_address *)dev->dev_addr, ptype);
}
diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c
index 70417e9b932d..215d4ccf12b9 100644
--- a/net/ax25/ax25_ip.c
+++ b/net/ax25/ax25_ip.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
*/
@@ -114,6 +111,7 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb)
dst = (ax25_address *)(bp + 1);
src = (ax25_address *)(bp + 8);
+ ax25_route_lock_use();
route = ax25_get_route(dst, NULL);
if (route) {
digipeat = route->digipeat;
@@ -124,6 +122,7 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb)
if (dev == NULL)
dev = skb->dev;
+ rcu_read_lock();
if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) {
kfree_skb(skb);
goto put;
@@ -195,10 +194,8 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb)
skb_pull(skb, AX25_KISS_HEADER_LEN);
if (digipeat != NULL) {
- if ((ourskb = ax25_rt_build_path(skb, src, dst, route->digipeat)) == NULL) {
- kfree_skb(skb);
+ if ((ourskb = ax25_rt_build_path(skb, src, dst, route->digipeat)) == NULL)
goto put;
- }
skb = ourskb;
}
@@ -206,9 +203,8 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb)
ax25_queue_xmit(skb, dev);
put:
- if (route)
- ax25_put_route(route);
-
+ rcu_read_unlock();
+ ax25_route_lock_unuse();
return NETDEV_TX_OK;
}
diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c
index 3e5afc8dc93e..8bca2ace98e5 100644
--- a/net/ax25/ax25_out.c
+++ b/net/ax25/ax25_out.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
@@ -32,7 +29,7 @@
static DEFINE_SPINLOCK(ax25_frag_lock);
-ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, ax25_address *src, ax25_address *dest, ax25_digi *digi, struct net_device *dev)
+ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, const ax25_address *src, ax25_address *dest, ax25_digi *digi, struct net_device *dev)
{
ax25_dev *ax25_dev;
ax25_cb *ax25;
@@ -42,10 +39,14 @@ ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, ax25_address *src, ax2
* specified.
*/
if (paclen == 0) {
- if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+ rcu_read_lock();
+ ax25_dev = ax25_dev_ax25dev(dev);
+ if (!ax25_dev) {
+ rcu_read_unlock();
return NULL;
-
+ }
paclen = ax25_dev->values[AX25_VALUES_PACLEN];
+ rcu_read_unlock();
}
/*
@@ -56,13 +57,19 @@ ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, ax25_address *src, ax2
return ax25; /* It already existed */
}
- if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL)
+ rcu_read_lock();
+ ax25_dev = ax25_dev_ax25dev(dev);
+ if (!ax25_dev) {
+ rcu_read_unlock();
return NULL;
+ }
- if ((ax25 = ax25_create_cb()) == NULL)
+ if ((ax25 = ax25_create_cb()) == NULL) {
+ rcu_read_unlock();
return NULL;
-
+ }
ax25_fillin_cb(ax25, ax25_dev);
+ rcu_read_unlock();
ax25->source_addr = *src;
ax25->dest_addr = *dest;
@@ -328,7 +335,6 @@ void ax25_kick(ax25_cb *ax25)
void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type)
{
- struct sk_buff *skbn;
unsigned char *ptr;
int headroom;
@@ -339,18 +345,12 @@ void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type)
headroom = ax25_addr_size(ax25->digipeat);
- if (skb_headroom(skb) < headroom) {
- if ((skbn = skb_realloc_headroom(skb, headroom)) == NULL) {
+ if (unlikely(skb_headroom(skb) < headroom)) {
+ skb = skb_expand_head(skb, headroom);
+ if (!skb) {
printk(KERN_CRIT "AX.25: ax25_transmit_buffer - out of memory\n");
- kfree_skb(skb);
return;
}
-
- if (skb->sk != NULL)
- skb_set_owner_w(skbn, skb->sk);
-
- consume_skb(skb);
- skb = skbn;
}
ptr = skb_push(skb, headroom);
@@ -368,7 +368,9 @@ void ax25_queue_xmit(struct sk_buff *skb, struct net_device *dev)
{
unsigned char *ptr;
+ rcu_read_lock();
skb->protocol = ax25_type_trans(skb, ax25_fwd_dev(dev));
+ rcu_read_unlock();
ptr = skb_push(skb, 1);
*ptr = 0x00; /* KISS */
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index a0eff323af12..10577434f40b 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
@@ -40,7 +37,7 @@
#include <linux/export.h>
static ax25_route *ax25_route_list;
-static DEFINE_RWLOCK(ax25_route_lock);
+DEFINE_RWLOCK(ax25_route_lock);
void ax25_rt_device_down(struct net_device *dev)
{
@@ -78,11 +75,13 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
ax25_dev *ax25_dev;
int i;
- if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL)
- return -EINVAL;
if (route->digi_count > AX25_MAX_DIGIS)
return -EINVAL;
+ ax25_dev = ax25_addr_ax25dev(&route->port_addr);
+ if (!ax25_dev)
+ return -EINVAL;
+
write_lock_bh(&ax25_route_lock);
ax25_rt = ax25_route_list;
@@ -94,6 +93,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
if (route->digi_count != 0) {
if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) {
write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return -ENOMEM;
}
ax25_rt->digipeat->lastrepeat = -1;
@@ -104,6 +104,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
}
}
write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return 0;
}
ax25_rt = ax25_rt->next;
@@ -111,10 +112,10 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
if ((ax25_rt = kmalloc(sizeof(ax25_route), GFP_ATOMIC)) == NULL) {
write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return -ENOMEM;
}
- refcount_set(&ax25_rt->refcount, 1);
ax25_rt->callsign = route->dest_addr;
ax25_rt->dev = ax25_dev->dev;
ax25_rt->digipeat = NULL;
@@ -123,6 +124,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) {
write_unlock_bh(&ax25_route_lock);
kfree(ax25_rt);
+ ax25_dev_put(ax25_dev);
return -ENOMEM;
}
ax25_rt->digipeat->lastrepeat = -1;
@@ -135,6 +137,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
ax25_rt->next = ax25_route_list;
ax25_route_list = ax25_rt;
write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return 0;
}
@@ -163,12 +166,12 @@ static int ax25_rt_del(struct ax25_routes_struct *route)
ax25cmp(&route->dest_addr, &s->callsign) == 0) {
if (ax25_route_list == s) {
ax25_route_list = s->next;
- ax25_put_route(s);
+ __ax25_put_route(s);
} else {
for (t = ax25_route_list; t != NULL; t = t->next) {
if (t->next == s) {
t->next = s->next;
- ax25_put_route(s);
+ __ax25_put_route(s);
break;
}
}
@@ -176,6 +179,7 @@ static int ax25_rt_del(struct ax25_routes_struct *route)
}
}
write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return 0;
}
@@ -218,6 +222,7 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option)
out:
write_unlock_bh(&ax25_route_lock);
+ ax25_dev_put(ax25_dev);
return err;
}
@@ -335,6 +340,7 @@ const struct seq_operations ax25_rt_seqops = {
* Find AX.25 route
*
* Only routes with a reference count of zero can be destroyed.
+ * Must be called with ax25_route_lock read locked.
*/
ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev)
{
@@ -342,7 +348,6 @@ ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev)
ax25_route *ax25_def_rt = NULL;
ax25_route *ax25_rt;
- read_lock(&ax25_route_lock);
/*
* Bind to the physical interface we heard them on, or the default
* route if none is found;
@@ -365,104 +370,24 @@ ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev)
if (ax25_spe_rt != NULL)
ax25_rt = ax25_spe_rt;
- if (ax25_rt != NULL)
- ax25_hold_route(ax25_rt);
-
- read_unlock(&ax25_route_lock);
-
return ax25_rt;
}
-/*
- * Adjust path: If you specify a default route and want to connect
- * a target on the digipeater path but w/o having a special route
- * set before, the path has to be truncated from your target on.
- */
-static inline void ax25_adjust_path(ax25_address *addr, ax25_digi *digipeat)
-{
- int k;
-
- for (k = 0; k < digipeat->ndigi; k++) {
- if (ax25cmp(addr, &digipeat->calls[k]) == 0)
- break;
- }
-
- digipeat->ndigi = k;
-}
-
-
-/*
- * Find which interface to use.
- */
-int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
-{
- ax25_uid_assoc *user;
- ax25_route *ax25_rt;
- int err = 0;
-
- if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL)
- return -EHOSTUNREACH;
-
- if ((ax25->ax25_dev = ax25_dev_ax25dev(ax25_rt->dev)) == NULL) {
- err = -EHOSTUNREACH;
- goto put;
- }
-
- user = ax25_findbyuid(current_euid());
- if (user) {
- ax25->source_addr = user->call;
- ax25_uid_put(user);
- } else {
- if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
- err = -EPERM;
- goto put;
- }
- ax25->source_addr = *(ax25_address *)ax25->ax25_dev->dev->dev_addr;
- }
-
- if (ax25_rt->digipeat != NULL) {
- ax25->digipeat = kmemdup(ax25_rt->digipeat, sizeof(ax25_digi),
- GFP_ATOMIC);
- if (ax25->digipeat == NULL) {
- err = -ENOMEM;
- goto put;
- }
- ax25_adjust_path(addr, ax25->digipeat);
- }
-
- if (ax25->sk != NULL) {
- bh_lock_sock(ax25->sk);
- sock_reset_flag(ax25->sk, SOCK_ZAPPED);
- bh_unlock_sock(ax25->sk);
- }
-
-put:
- ax25_put_route(ax25_rt);
-
- return err;
-}
struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src,
ax25_address *dest, ax25_digi *digi)
{
- struct sk_buff *skbn;
unsigned char *bp;
int len;
len = digi->ndigi * AX25_ADDR_LEN;
- if (skb_headroom(skb) < len) {
- if ((skbn = skb_realloc_headroom(skb, len)) == NULL) {
+ if (unlikely(skb_headroom(skb) < len)) {
+ skb = skb_expand_head(skb, len);
+ if (!skb) {
printk(KERN_CRIT "AX.25: ax25_dg_build_path - out of memory\n");
return NULL;
}
-
- if (skb->sk != NULL)
- skb_set_owner_w(skbn, skb->sk);
-
- consume_skb(skb);
-
- skb = skbn;
}
bp = skb_push(skb, len);
diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c
index 8632b86e843e..ba176196ae06 100644
--- a/net/ax25/ax25_std_in.c
+++ b/net/ax25/ax25_std_in.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
diff --git a/net/ax25/ax25_std_subr.c b/net/ax25/ax25_std_subr.c
index 94bd06396a43..4c36f1342558 100644
--- a/net/ax25/ax25_std_subr.c
+++ b/net/ax25/ax25_std_subr.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
*/
diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c
index 30bbc675261d..b17da41210cb 100644
--- a/net/ax25/ax25_std_timer.c
+++ b/net/ax25/ax25_std_timer.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c
index 038b109b2be7..bff4b203a893 100644
--- a/net/ax25/ax25_subr.c
+++ b/net/ax25/ax25_subr.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
@@ -264,12 +261,20 @@ void ax25_disconnect(ax25_cb *ax25, int reason)
{
ax25_clear_queues(ax25);
- if (!ax25->sk || !sock_flag(ax25->sk, SOCK_DESTROY))
- ax25_stop_heartbeat(ax25);
- ax25_stop_t1timer(ax25);
- ax25_stop_t2timer(ax25);
- ax25_stop_t3timer(ax25);
- ax25_stop_idletimer(ax25);
+ if (reason == ENETUNREACH) {
+ timer_delete_sync(&ax25->timer);
+ timer_delete_sync(&ax25->t1timer);
+ timer_delete_sync(&ax25->t2timer);
+ timer_delete_sync(&ax25->t3timer);
+ timer_delete_sync(&ax25->idletimer);
+ } else {
+ if (ax25->sk && !sock_flag(ax25->sk, SOCK_DESTROY))
+ ax25_stop_heartbeat(ax25);
+ ax25_stop_t1timer(ax25);
+ ax25_stop_t2timer(ax25);
+ ax25_stop_t3timer(ax25);
+ ax25_stop_idletimer(ax25);
+ }
ax25->state = AX25_STATE_0;
diff --git a/net/ax25/ax25_timer.c b/net/ax25/ax25_timer.c
index c47b7ee1e4da..a69bfbc8b679 100644
--- a/net/ax25/ax25_timer.c
+++ b/net/ax25/ax25_timer.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk)
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
@@ -68,7 +65,7 @@ void ax25_start_t3timer(ax25_cb *ax25)
if (ax25->t3 > 0)
mod_timer(&ax25->t3timer, jiffies + ax25->t3);
else
- del_timer(&ax25->t3timer);
+ timer_delete(&ax25->t3timer);
}
void ax25_start_idletimer(ax25_cb *ax25)
@@ -76,32 +73,32 @@ void ax25_start_idletimer(ax25_cb *ax25)
if (ax25->idle > 0)
mod_timer(&ax25->idletimer, jiffies + ax25->idle);
else
- del_timer(&ax25->idletimer);
+ timer_delete(&ax25->idletimer);
}
void ax25_stop_heartbeat(ax25_cb *ax25)
{
- del_timer(&ax25->timer);
+ timer_delete(&ax25->timer);
}
void ax25_stop_t1timer(ax25_cb *ax25)
{
- del_timer(&ax25->t1timer);
+ timer_delete(&ax25->t1timer);
}
void ax25_stop_t2timer(ax25_cb *ax25)
{
- del_timer(&ax25->t2timer);
+ timer_delete(&ax25->t2timer);
}
void ax25_stop_t3timer(ax25_cb *ax25)
{
- del_timer(&ax25->t3timer);
+ timer_delete(&ax25->t3timer);
}
void ax25_stop_idletimer(ax25_cb *ax25)
{
- del_timer(&ax25->idletimer);
+ timer_delete(&ax25->idletimer);
}
int ax25_t1timer_running(ax25_cb *ax25)
@@ -111,10 +108,12 @@ int ax25_t1timer_running(ax25_cb *ax25)
unsigned long ax25_display_timer(struct timer_list *timer)
{
+ long delta = timer->expires - jiffies;
+
if (!timer_pending(timer))
return 0;
- return timer->expires - jiffies;
+ return max(0L, delta);
}
EXPORT_SYMBOL(ax25_display_timer);
@@ -122,7 +121,7 @@ EXPORT_SYMBOL(ax25_display_timer);
static void ax25_heartbeat_expiry(struct timer_list *t)
{
int proto = AX25_PROTO_STD_SIMPLEX;
- ax25_cb *ax25 = from_timer(ax25, t, timer);
+ ax25_cb *ax25 = timer_container_of(ax25, t, timer);
if (ax25->ax25_dev)
proto = ax25->ax25_dev->values[AX25_VALUES_PROTOCOL];
@@ -146,7 +145,7 @@ static void ax25_heartbeat_expiry(struct timer_list *t)
static void ax25_t1timer_expiry(struct timer_list *t)
{
- ax25_cb *ax25 = from_timer(ax25, t, t1timer);
+ ax25_cb *ax25 = timer_container_of(ax25, t, t1timer);
switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
case AX25_PROTO_STD_SIMPLEX:
@@ -165,7 +164,7 @@ static void ax25_t1timer_expiry(struct timer_list *t)
static void ax25_t2timer_expiry(struct timer_list *t)
{
- ax25_cb *ax25 = from_timer(ax25, t, t2timer);
+ ax25_cb *ax25 = timer_container_of(ax25, t, t2timer);
switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
case AX25_PROTO_STD_SIMPLEX:
@@ -184,7 +183,7 @@ static void ax25_t2timer_expiry(struct timer_list *t)
static void ax25_t3timer_expiry(struct timer_list *t)
{
- ax25_cb *ax25 = from_timer(ax25, t, t3timer);
+ ax25_cb *ax25 = timer_container_of(ax25, t, t3timer);
switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
case AX25_PROTO_STD_SIMPLEX:
@@ -205,7 +204,7 @@ static void ax25_t3timer_expiry(struct timer_list *t)
static void ax25_idletimer_expiry(struct timer_list *t)
{
- ax25_cb *ax25 = from_timer(ax25, t, idletimer);
+ ax25_cb *ax25 = timer_container_of(ax25, t, idletimer);
switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) {
case AX25_PROTO_STD_SIMPLEX:
diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c
index 99d02e390e43..241e4680ecb1 100644
--- a/net/ax25/ax25_uid.c
+++ b/net/ax25/ax25_uid.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk)
*/
diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c
index 919a5ce47515..68753aa30334 100644
--- a/net/ax25/sysctl_net_ax25.c
+++ b/net/ax25/sysctl_net_ax25.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*
* Copyright (C) 1996 Mike Shaver (shaver@zeroknowledge.com)
*/
@@ -144,8 +141,6 @@ static const struct ctl_table ax25_param_table[] = {
.extra2 = &max_ds_timeout
},
#endif
-
- { } /* that's all, folks! */
};
int ax25_register_dev_sysctl(ax25_dev *ax25_dev)
@@ -158,11 +153,13 @@ int ax25_register_dev_sysctl(ax25_dev *ax25_dev)
if (!table)
return -ENOMEM;
+ BUILD_BUG_ON(ARRAY_SIZE(ax25_param_table) != AX25_MAX_VALUES);
for (k = 0; k < AX25_MAX_VALUES; k++)
table[k].data = &ax25_dev->values[k];
snprintf(path, sizeof(path), "net/ax25/%s", ax25_dev->dev->name);
- ax25_dev->sysheader = register_net_sysctl(&init_net, path, table);
+ ax25_dev->sysheader = register_net_sysctl_sz(&init_net, path, table,
+ ARRAY_SIZE(ax25_param_table));
if (!ax25_dev->sysheader) {
kfree(table);
return -ENOMEM;
@@ -173,7 +170,7 @@ int ax25_register_dev_sysctl(ax25_dev *ax25_dev)
void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev)
{
struct ctl_table_header *header = ax25_dev->sysheader;
- struct ctl_table *table;
+ const struct ctl_table *table;
if (header) {
ax25_dev->sysheader = NULL;
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index 361116f77cb9..58c408b7a7d9 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -1,19 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-# Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+# Copyright (C) B.A.T.M.A.N. contributors:
#
# Marek Lindner, Simon Wunderlich
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of version 2 of the GNU General Public
-# License as published by the Free Software Foundation.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, see <http://www.gnu.org/licenses/>.
#
# B.A.T.M.A.N meshing protocol
@@ -21,15 +9,13 @@
config BATMAN_ADV
tristate "B.A.T.M.A.N. Advanced Meshing Protocol"
- depends on NET
- select CRC16
- select LIBCRC32C
+ select CRC32
help
- B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
- a routing protocol for multi-hop ad-hoc mesh networks. The
- networks may be wired or wireless. See
- https://www.open-mesh.org/ for more information and user space
- tools.
+ B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
+ a routing protocol for multi-hop ad-hoc mesh networks. The
+ networks may be wired or wireless. See
+ https://www.open-mesh.org/ for more information and user space
+ tools.
config BATMAN_ADV_BATMAN_V
bool "B.A.T.M.A.N. V protocol"
@@ -48,6 +34,8 @@ config BATMAN_ADV_BATMAN_V
config BATMAN_ADV_BLA
bool "Bridge Loop Avoidance"
depends on BATMAN_ADV && INET
+ select CRC16
+ select NET_CRC32C
default y
help
This option enables BLA (Bridge Loop Avoidance), a mechanism
@@ -66,43 +54,31 @@ config BATMAN_ADV_DAT
mesh networks. If you think that your network does not need
this option you can safely remove it and save some space.
-config BATMAN_ADV_NC
- bool "Network Coding"
- depends on BATMAN_ADV
- help
- This option enables network coding, a mechanism that aims to
- increase the overall network throughput by fusing multiple
- packets in one transmission.
- Note that interfaces controlled by batman-adv must be manually
- configured to have promiscuous mode enabled in order to make
- network coding work.
- If you think that your network does not need this feature you
- can safely disable it and save some space.
-
config BATMAN_ADV_MCAST
bool "Multicast optimisation"
depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y)
+ default y
help
This option enables the multicast optimisation which aims to
reduce the air overhead while improving the reliability of
multicast messages.
-config BATMAN_ADV_DEBUGFS
- bool "batman-adv debugfs entries"
- depends on BATMAN_ADV
- depends on DEBUG_FS
- help
- Enable this to export routing related debug tables via debugfs.
- The information for each soft-interface and used hard-interface can be
- found under batman_adv/
-
- If unsure, say N.
-
config BATMAN_ADV_DEBUG
bool "B.A.T.M.A.N. debugging"
- depends on BATMAN_ADV_DEBUGFS
+ depends on BATMAN_ADV
help
This is an option for use by developers; most people should
say N here. This enables compilation of support for
- outputting debugging information to the kernel log. The
- output is controlled via the module parameter debug.
+ outputting debugging information to the tracing buffer. The output is
+ controlled via the batadv netdev specific log_level setting.
+
+config BATMAN_ADV_TRACING
+ bool "B.A.T.M.A.N. tracing support"
+ depends on BATMAN_ADV
+ depends on EVENT_TRACING
+ help
+ This is an option for use by developers; most people should
+ say N here. Select this option to gather traces like the debug
+ messages using the generic tracing infrastructure of the kernel.
+ BATMAN_ADV_DEBUG must also be selected to get trace events for
+ batadv_dbg.
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index b97ba6fb8353..d3c4d4143c14 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -1,20 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
-# Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+# Copyright (C) B.A.T.M.A.N. contributors:
#
# Marek Lindner, Simon Wunderlich
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of version 2 of the GNU General Public
-# License as published by the Free Software Foundation.
-#
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-# General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, see <http://www.gnu.org/licenses/>.
-#
obj-$(CONFIG_BATMAN_ADV) += batman-adv.o
batman-adv-y += bat_algo.o
@@ -24,24 +11,24 @@ batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_elp.o
batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_ogm.o
batman-adv-y += bitarray.o
batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o
-batman-adv-$(CONFIG_BATMAN_ADV_DEBUGFS) += debugfs.o
batman-adv-$(CONFIG_BATMAN_ADV_DAT) += distributed-arp-table.o
batman-adv-y += fragmentation.o
batman-adv-y += gateway_client.o
batman-adv-y += gateway_common.o
batman-adv-y += hard-interface.o
batman-adv-y += hash.o
-batman-adv-$(CONFIG_BATMAN_ADV_DEBUGFS) += icmp_socket.o
batman-adv-$(CONFIG_BATMAN_ADV_DEBUG) += log.o
batman-adv-y += main.o
+batman-adv-y += mesh-interface.o
batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o
+batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast_forw.o
batman-adv-y += netlink.o
-batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o
batman-adv-y += originator.o
batman-adv-y += routing.o
batman-adv-y += send.o
-batman-adv-y += soft-interface.o
-batman-adv-y += sysfs.o
+batman-adv-$(CONFIG_BATMAN_ADV_TRACING) += trace.o
batman-adv-y += tp_meter.o
batman-adv-y += translation-table.o
batman-adv-y += tvlv.o
+
+CFLAGS_trace.o := -I$(src)
diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c
index ea309ad06175..49e5861b58ec 100644
--- a/net/batman-adv/bat_algo.c
+++ b/net/batman-adv/bat_algo.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "main.h"
@@ -23,10 +11,10 @@
#include <linux/moduleparam.h>
#include <linux/netlink.h>
#include <linux/printk.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/stddef.h>
#include <linux/string.h>
+#include <linux/types.h>
#include <net/genetlink.h>
#include <net/netlink.h>
#include <uapi/linux/batman_adv.h>
@@ -46,7 +34,13 @@ void batadv_algo_init(void)
INIT_HLIST_HEAD(&batadv_algo_list);
}
-static struct batadv_algo_ops *batadv_algo_get(char *name)
+/**
+ * batadv_algo_get() - Search for algorithm with specific name
+ * @name: algorithm name to find
+ *
+ * Return: Pointer to batadv_algo_ops on success, NULL otherwise
+ */
+struct batadv_algo_ops *batadv_algo_get(const char *name)
{
struct batadv_algo_ops *bat_algo_ops = NULL, *bat_algo_ops_tmp;
@@ -97,19 +91,19 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops)
}
/**
- * batadv_algo_select() - Select algorithm of soft interface
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_algo_select() - Select algorithm of mesh interface
+ * @bat_priv: the bat priv with all the mesh interface information
* @name: name of the algorithm to select
*
- * The algorithm callbacks for the soft interface will be set when the algorithm
+ * The algorithm callbacks for the mesh interface will be set when the algorithm
* with the correct name was found. Any previous selected algorithm will not be
* deinitialized and the new selected algorithm will also not be initialized.
* It is therefore not allowed to call batadv_algo_select outside the creation
- * function of the soft interface.
+ * function of the mesh interface.
*
* Return: 0 on success or negative error number in case of failure
*/
-int batadv_algo_select(struct batadv_priv *bat_priv, char *name)
+int batadv_algo_select(struct batadv_priv *bat_priv, const char *name)
{
struct batadv_algo_ops *bat_algo_ops;
@@ -122,29 +116,6 @@ int batadv_algo_select(struct batadv_priv *bat_priv, char *name)
return 0;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-
-/**
- * batadv_algo_seq_print_text() - Print the supported algorithms in a seq file
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_algo_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct batadv_algo_ops *bat_algo_ops;
-
- seq_puts(seq, "Available routing algorithms:\n");
-
- hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) {
- seq_printf(seq, " * %s\n", bat_algo_ops->name);
- }
-
- return 0;
-}
-#endif
-
static int batadv_param_set_ra(const char *val, const struct kernel_param *kp)
{
struct batadv_algo_ops *bat_algo_ops;
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 534b790c3753..7ce9abbdb4b4 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Linus Lüssing
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_BAT_ALGO_H_
@@ -21,19 +9,15 @@
#include "main.h"
-#include <linux/types.h>
-
-struct netlink_callback;
-struct seq_file;
-struct sk_buff;
+#include <linux/netlink.h>
+#include <linux/skbuff.h>
extern char batadv_routing_algo[];
-extern struct list_head batadv_hardif_list;
void batadv_algo_init(void);
+struct batadv_algo_ops *batadv_algo_get(const char *name);
int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops);
-int batadv_algo_select(struct batadv_priv *bat_priv, char *name);
-int batadv_algo_seq_print_text(struct seq_file *seq, void *offset);
+int batadv_algo_select(struct batadv_priv *bat_priv, const char *name);
int batadv_algo_dump(struct sk_buff *msg, struct netlink_callback *cb);
#endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 73bf6a93a3cf..b75c2228e69a 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "bat_iv_ogm.h"
@@ -25,16 +13,18 @@
#include <linux/bug.h>
#include <linux/byteorder/generic.h>
#include <linux/cache.h>
+#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
#include <linux/if_ether.h>
#include <linux/init.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
+#include <linux/minmax.h>
+#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/pkt_sched.h>
@@ -42,12 +32,12 @@
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <net/genetlink.h>
@@ -62,7 +52,6 @@
#include "hash.h"
#include "log.h"
#include "netlink.h"
-#include "network-coding.h"
#include "originator.h"
#include "routing.h"
#include "send.h"
@@ -138,184 +127,20 @@ static u8 batadv_ring_buffer_avg(const u8 lq_recv[])
}
/**
- * batadv_iv_ogm_orig_free() - free the private resources allocated for this
- * orig_node
- * @orig_node: the orig_node for which the resources have to be free'd
- */
-static void batadv_iv_ogm_orig_free(struct batadv_orig_node *orig_node)
-{
- kfree(orig_node->bat_iv.bcast_own);
- kfree(orig_node->bat_iv.bcast_own_sum);
-}
-
-/**
- * batadv_iv_ogm_orig_add_if() - change the private structures of the orig_node
- * to include the new hard-interface
- * @orig_node: the orig_node that has to be changed
- * @max_if_num: the current amount of interfaces
- *
- * Return: 0 on success, a negative error code otherwise.
- */
-static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node,
- unsigned int max_if_num)
-{
- void *data_ptr;
- size_t old_size;
- int ret = -ENOMEM;
-
- spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
-
- old_size = (max_if_num - 1) * sizeof(unsigned long) * BATADV_NUM_WORDS;
- data_ptr = kmalloc_array(max_if_num,
- BATADV_NUM_WORDS * sizeof(unsigned long),
- GFP_ATOMIC);
- if (!data_ptr)
- goto unlock;
-
- memcpy(data_ptr, orig_node->bat_iv.bcast_own, old_size);
- kfree(orig_node->bat_iv.bcast_own);
- orig_node->bat_iv.bcast_own = data_ptr;
-
- data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC);
- if (!data_ptr)
- goto unlock;
-
- memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum,
- (max_if_num - 1) * sizeof(u8));
- kfree(orig_node->bat_iv.bcast_own_sum);
- orig_node->bat_iv.bcast_own_sum = data_ptr;
-
- ret = 0;
-
-unlock:
- spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
-
- return ret;
-}
-
-/**
- * batadv_iv_ogm_drop_bcast_own_entry() - drop section of bcast_own
- * @orig_node: the orig_node that has to be changed
- * @max_if_num: the current amount of interfaces
- * @del_if_num: the index of the interface being removed
- */
-static void
-batadv_iv_ogm_drop_bcast_own_entry(struct batadv_orig_node *orig_node,
- unsigned int max_if_num,
- unsigned int del_if_num)
-{
- size_t chunk_size;
- size_t if_offset;
- void *data_ptr;
-
- lockdep_assert_held(&orig_node->bat_iv.ogm_cnt_lock);
-
- chunk_size = sizeof(unsigned long) * BATADV_NUM_WORDS;
- data_ptr = kmalloc_array(max_if_num, chunk_size, GFP_ATOMIC);
- if (!data_ptr)
- /* use old buffer when new one could not be allocated */
- data_ptr = orig_node->bat_iv.bcast_own;
-
- /* copy first part */
- memmove(data_ptr, orig_node->bat_iv.bcast_own, del_if_num * chunk_size);
-
- /* copy second part */
- if_offset = (del_if_num + 1) * chunk_size;
- memmove((char *)data_ptr + del_if_num * chunk_size,
- (uint8_t *)orig_node->bat_iv.bcast_own + if_offset,
- (max_if_num - del_if_num) * chunk_size);
-
- /* bcast_own was shrunk down in new buffer; free old one */
- if (orig_node->bat_iv.bcast_own != data_ptr) {
- kfree(orig_node->bat_iv.bcast_own);
- orig_node->bat_iv.bcast_own = data_ptr;
- }
-}
-
-/**
- * batadv_iv_ogm_drop_bcast_own_sum_entry() - drop section of bcast_own_sum
- * @orig_node: the orig_node that has to be changed
- * @max_if_num: the current amount of interfaces
- * @del_if_num: the index of the interface being removed
- */
-static void
-batadv_iv_ogm_drop_bcast_own_sum_entry(struct batadv_orig_node *orig_node,
- unsigned int max_if_num,
- unsigned int del_if_num)
-{
- size_t if_offset;
- void *data_ptr;
-
- lockdep_assert_held(&orig_node->bat_iv.ogm_cnt_lock);
-
- data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC);
- if (!data_ptr)
- /* use old buffer when new one could not be allocated */
- data_ptr = orig_node->bat_iv.bcast_own_sum;
-
- memmove(data_ptr, orig_node->bat_iv.bcast_own_sum,
- del_if_num * sizeof(u8));
-
- if_offset = (del_if_num + 1) * sizeof(u8);
- memmove((char *)data_ptr + del_if_num * sizeof(u8),
- orig_node->bat_iv.bcast_own_sum + if_offset,
- (max_if_num - del_if_num) * sizeof(u8));
-
- /* bcast_own_sum was shrunk down in new buffer; free old one */
- if (orig_node->bat_iv.bcast_own_sum != data_ptr) {
- kfree(orig_node->bat_iv.bcast_own_sum);
- orig_node->bat_iv.bcast_own_sum = data_ptr;
- }
-}
-
-/**
- * batadv_iv_ogm_orig_del_if() - change the private structures of the orig_node
- * to exclude the removed interface
- * @orig_node: the orig_node that has to be changed
- * @max_if_num: the current amount of interfaces
- * @del_if_num: the index of the interface being removed
- *
- * Return: 0 on success, a negative error code otherwise.
- */
-static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node,
- unsigned int max_if_num,
- unsigned int del_if_num)
-{
- spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
-
- if (max_if_num == 0) {
- kfree(orig_node->bat_iv.bcast_own);
- kfree(orig_node->bat_iv.bcast_own_sum);
- orig_node->bat_iv.bcast_own = NULL;
- orig_node->bat_iv.bcast_own_sum = NULL;
- } else {
- batadv_iv_ogm_drop_bcast_own_entry(orig_node, max_if_num,
- del_if_num);
- batadv_iv_ogm_drop_bcast_own_sum_entry(orig_node, max_if_num,
- del_if_num);
- }
-
- spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
-
- return 0;
-}
-
-/**
* batadv_iv_ogm_orig_get() - retrieve or create (if does not exist) an
* originator
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @addr: mac address of the originator
*
* Return: the originator object corresponding to the passed mac address or NULL
* on failure.
- * If the object does not exists it is created an initialised.
+ * If the object does not exist, it is created and initialised.
*/
static struct batadv_orig_node *
batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr)
{
struct batadv_orig_node *orig_node;
int hash_added;
- size_t size;
orig_node = batadv_orig_hash_find(bat_priv, addr);
if (orig_node)
@@ -327,16 +152,6 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr)
spin_lock_init(&orig_node->bat_iv.ogm_cnt_lock);
- size = bat_priv->num_ifaces * sizeof(unsigned long) * BATADV_NUM_WORDS;
- orig_node->bat_iv.bcast_own = kzalloc(size, GFP_ATOMIC);
- if (!orig_node->bat_iv.bcast_own)
- goto free_orig_node;
-
- size = bat_priv->num_ifaces * sizeof(u8);
- orig_node->bat_iv.bcast_own_sum = kzalloc(size, GFP_ATOMIC);
- if (!orig_node->bat_iv.bcast_own_sum)
- goto free_orig_node;
-
kref_get(&orig_node->refcount);
hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig,
batadv_choose_orig, orig_node,
@@ -347,8 +162,9 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr)
return orig_node;
free_orig_node_hash:
+ /* reference for batadv_hash_add */
batadv_orig_node_put(orig_node);
-free_orig_node:
+ /* reference from batadv_orig_node_new */
batadv_orig_node_put(orig_node);
return NULL;
@@ -379,14 +195,18 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface)
unsigned char *ogm_buff;
u32 random_seqno;
+ mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex);
+
/* randomize initial seqno to avoid collision */
get_random_bytes(&random_seqno, sizeof(random_seqno));
atomic_set(&hard_iface->bat_iv.ogm_seqno, random_seqno);
hard_iface->bat_iv.ogm_buff_len = BATADV_OGM_HLEN;
ogm_buff = kmalloc(hard_iface->bat_iv.ogm_buff_len, GFP_ATOMIC);
- if (!ogm_buff)
+ if (!ogm_buff) {
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
return -ENOMEM;
+ }
hard_iface->bat_iv.ogm_buff = ogm_buff;
@@ -398,35 +218,59 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface)
batadv_ogm_packet->reserved = 0;
batadv_ogm_packet->tq = BATADV_TQ_MAX_VALUE;
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
+
return 0;
}
static void batadv_iv_ogm_iface_disable(struct batadv_hard_iface *hard_iface)
{
+ mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex);
+
kfree(hard_iface->bat_iv.ogm_buff);
hard_iface->bat_iv.ogm_buff = NULL;
+
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
}
static void batadv_iv_ogm_iface_update_mac(struct batadv_hard_iface *hard_iface)
{
struct batadv_ogm_packet *batadv_ogm_packet;
- unsigned char *ogm_buff = hard_iface->bat_iv.ogm_buff;
+ void *ogm_buff;
- batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff;
+ mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex);
+
+ ogm_buff = hard_iface->bat_iv.ogm_buff;
+ if (!ogm_buff)
+ goto unlock;
+
+ batadv_ogm_packet = ogm_buff;
ether_addr_copy(batadv_ogm_packet->orig,
hard_iface->net_dev->dev_addr);
ether_addr_copy(batadv_ogm_packet->prev_sender,
hard_iface->net_dev->dev_addr);
+
+unlock:
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
}
static void
batadv_iv_ogm_primary_iface_set(struct batadv_hard_iface *hard_iface)
{
struct batadv_ogm_packet *batadv_ogm_packet;
- unsigned char *ogm_buff = hard_iface->bat_iv.ogm_buff;
+ void *ogm_buff;
- batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff;
+ mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex);
+
+ ogm_buff = hard_iface->bat_iv.ogm_buff;
+ if (!ogm_buff)
+ goto unlock;
+
+ batadv_ogm_packet = ogm_buff;
batadv_ogm_packet->ttl = BATADV_TTL;
+
+unlock:
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
}
/* when do we schedule our own ogm to be sent */
@@ -436,7 +280,7 @@ batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv)
unsigned int msecs;
msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER;
- msecs += prandom_u32() % (2 * BATADV_JITTER);
+ msecs += get_random_u32_below(2 * BATADV_JITTER);
return jiffies + msecs_to_jiffies(msecs);
}
@@ -444,7 +288,7 @@ batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv)
/* when do we schedule a ogm packet to be sent */
static unsigned long batadv_iv_ogm_fwd_send_time(void)
{
- return jiffies + msecs_to_jiffies(prandom_u32() % (BATADV_JITTER / 2));
+ return jiffies + msecs_to_jiffies(get_random_u32_below(BATADV_JITTER / 2));
}
/* apply hop penalty for a normal link */
@@ -463,27 +307,32 @@ static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv)
* batadv_iv_ogm_aggr_packet() - checks if there is another OGM attached
* @buff_pos: current position in the skb
* @packet_len: total length of the skb
- * @tvlv_len: tvlv length of the previously considered OGM
+ * @ogm_packet: potential OGM in buffer
*
* Return: true if there is enough space for another OGM, false otherwise.
*/
-static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len,
- __be16 tvlv_len)
+static bool
+batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len,
+ const struct batadv_ogm_packet *ogm_packet)
{
int next_buff_pos = 0;
- next_buff_pos += buff_pos + BATADV_OGM_HLEN;
- next_buff_pos += ntohs(tvlv_len);
+ /* check if there is enough space for the header */
+ next_buff_pos += buff_pos + sizeof(*ogm_packet);
+ if (next_buff_pos > packet_len)
+ return false;
- return (next_buff_pos <= packet_len) &&
- (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES);
+ /* check if there is enough space for the optional TVLV */
+ next_buff_pos += ntohs(ogm_packet->tvlv_len);
+
+ return next_buff_pos <= packet_len;
}
/* send a batman ogm to a given interface */
static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet,
struct batadv_hard_iface *hard_iface)
{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
const char *fwd_str;
u8 packet_num;
s16 buff_pos;
@@ -501,11 +350,11 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet,
/* adjust all flags and log packets */
while (batadv_iv_ogm_aggr_packet(buff_pos, forw_packet->packet_len,
- batadv_ogm_packet->tvlv_len)) {
+ batadv_ogm_packet)) {
/* we might have aggregated direct link packets with an
* ordinary base packet
*/
- if (forw_packet->direct_link_flags & BIT(packet_num) &&
+ if (test_bit(packet_num, forw_packet->direct_link_flags) &&
forw_packet->if_incoming == hard_iface)
batadv_ogm_packet->flags |= BATADV_DIRECTLINK;
else
@@ -522,8 +371,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet,
batadv_ogm_packet->orig,
ntohl(batadv_ogm_packet->seqno),
batadv_ogm_packet->tq, batadv_ogm_packet->ttl,
- ((batadv_ogm_packet->flags & BATADV_DIRECTLINK) ?
- "on" : "off"),
+ str_on_off(batadv_ogm_packet->flags & BATADV_DIRECTLINK),
hard_iface->net_dev->name,
hard_iface->net_dev->dev_addr);
@@ -547,20 +395,22 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet,
/* send a batman ogm packet */
static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet)
{
- struct net_device *soft_iface;
+ struct net_device *mesh_iface;
if (!forw_packet->if_incoming) {
pr_err("Error - can't forward packet: incoming iface not specified\n");
return;
}
- soft_iface = forw_packet->if_incoming->soft_iface;
+ mesh_iface = forw_packet->if_incoming->mesh_iface;
if (WARN_ON(!forw_packet->if_outgoing))
return;
- if (WARN_ON(forw_packet->if_outgoing->soft_iface != soft_iface))
+ if (forw_packet->if_outgoing->mesh_iface != mesh_iface) {
+ pr_warn("%s: mesh interface switch for queued OGM\n", __func__);
return;
+ }
if (forw_packet->if_incoming->if_status != BATADV_IF_ACTIVE)
return;
@@ -573,7 +423,7 @@ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet)
* batadv_iv_ogm_can_aggregate() - find out if an OGM can be aggregated on an
* existing forward packet
* @new_bat_ogm_packet: OGM packet to be aggregated
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @packet_len: (total) length of the OGM
* @send_time: timestamp (jiffies) when the packet is to be sent
* @directlink: true if this is a direct link packet
@@ -593,28 +443,37 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet,
const struct batadv_forw_packet *forw_packet)
{
struct batadv_ogm_packet *batadv_ogm_packet;
- int aggregated_bytes = forw_packet->packet_len + packet_len;
+ unsigned int aggregated_bytes = forw_packet->packet_len + packet_len;
struct batadv_hard_iface *primary_if = NULL;
+ u8 packet_num = forw_packet->num_packets;
bool res = false;
unsigned long aggregation_end_time;
+ unsigned int max_bytes;
batadv_ogm_packet = (struct batadv_ogm_packet *)forw_packet->skb->data;
aggregation_end_time = send_time;
aggregation_end_time += msecs_to_jiffies(BATADV_MAX_AGGREGATION_MS);
+ max_bytes = min_t(unsigned int, if_outgoing->net_dev->mtu,
+ BATADV_MAX_AGGREGATION_BYTES);
+
/* we can aggregate the current packet to this aggregated packet
* if:
*
* - the send time is within our MAX_AGGREGATION_MS time
- * - the resulting packet wont be bigger than
- * MAX_AGGREGATION_BYTES
+ * - the resulting packet won't be bigger than
+ * MAX_AGGREGATION_BYTES and MTU of the outgoing interface
+ * - the number of packets is lower than MAX_AGGREGATION_PACKETS
* otherwise aggregation is not possible
*/
if (!time_before(send_time, forw_packet->send_time) ||
!time_after_eq(aggregation_end_time, forw_packet->send_time))
return false;
- if (aggregated_bytes > BATADV_MAX_AGGREGATION_BYTES)
+ if (aggregated_bytes > max_bytes)
+ return false;
+
+ if (packet_num >= BATADV_MAX_AGGREGATION_PACKETS)
return false;
/* packet is not leaving on the same interface. */
@@ -667,8 +526,7 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet,
}
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
return res;
}
@@ -690,16 +548,16 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
struct batadv_hard_iface *if_outgoing,
int own_packet)
{
- struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
struct batadv_forw_packet *forw_packet_aggr;
struct sk_buff *skb;
unsigned char *skb_buff;
unsigned int skb_size;
atomic_t *queue_left = own_packet ? NULL : &bat_priv->batman_queue_left;
- if (atomic_read(&bat_priv->aggregated_ogms) &&
- packet_len < BATADV_MAX_AGGREGATION_BYTES)
- skb_size = BATADV_MAX_AGGREGATION_BYTES;
+ if (atomic_read(&bat_priv->aggregated_ogms))
+ skb_size = max_t(unsigned int, BATADV_MAX_AGGREGATION_BYTES,
+ packet_len);
else
skb_size = packet_len;
@@ -724,12 +582,13 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
memcpy(skb_buff, packet_buff, packet_len);
forw_packet_aggr->own = own_packet;
- forw_packet_aggr->direct_link_flags = BATADV_NO_FLAGS;
+ bitmap_zero(forw_packet_aggr->direct_link_flags,
+ BATADV_MAX_AGGREGATION_PACKETS);
forw_packet_aggr->send_time = send_time;
/* save packet direct link flag status */
if (direct_link)
- forw_packet_aggr->direct_link_flags |= 1;
+ set_bit(0, forw_packet_aggr->direct_link_flags);
INIT_DELAYED_WORK(&forw_packet_aggr->delayed_work,
batadv_iv_send_outstanding_bat_ogm_packet);
@@ -742,22 +601,20 @@ static void batadv_iv_ogm_aggregate(struct batadv_forw_packet *forw_packet_aggr,
const unsigned char *packet_buff,
int packet_len, bool direct_link)
{
- unsigned long new_direct_link_flag;
-
skb_put_data(forw_packet_aggr->skb, packet_buff, packet_len);
forw_packet_aggr->packet_len += packet_len;
- forw_packet_aggr->num_packets++;
/* save packet direct link flag status */
- if (direct_link) {
- new_direct_link_flag = BIT(forw_packet_aggr->num_packets);
- forw_packet_aggr->direct_link_flags |= new_direct_link_flag;
- }
+ if (direct_link)
+ set_bit(forw_packet_aggr->num_packets,
+ forw_packet_aggr->direct_link_flags);
+
+ forw_packet_aggr->num_packets++;
}
/**
* batadv_iv_ogm_queue_add() - queue up an OGM for transmission
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @packet_buff: pointer to the OGM
* @packet_len: (total) length of the OGM
* @if_incoming: interface where the packet was received
@@ -836,7 +693,7 @@ static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node,
struct batadv_hard_iface *if_incoming,
struct batadv_hard_iface *if_outgoing)
{
- struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
u16 tvlv_len;
if (batadv_ogm_packet->ttl <= 1) {
@@ -889,48 +746,59 @@ static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node,
static void
batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface)
{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
struct batadv_hashtable *hash = bat_priv->orig_hash;
struct hlist_head *head;
struct batadv_orig_node *orig_node;
+ struct batadv_orig_ifinfo *orig_ifinfo;
unsigned long *word;
u32 i;
- size_t word_index;
u8 *w;
- unsigned int if_num;
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
rcu_read_lock();
hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
- spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
- word_index = hard_iface->if_num * BATADV_NUM_WORDS;
- word = &orig_node->bat_iv.bcast_own[word_index];
-
- batadv_bit_get_packet(bat_priv, word, 1, 0);
- if_num = hard_iface->if_num;
- w = &orig_node->bat_iv.bcast_own_sum[if_num];
- *w = bitmap_weight(word, BATADV_TQ_LOCAL_WINDOW_SIZE);
- spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ hlist_for_each_entry_rcu(orig_ifinfo,
+ &orig_node->ifinfo_list,
+ list) {
+ if (orig_ifinfo->if_outgoing != hard_iface)
+ continue;
+
+ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ word = orig_ifinfo->bat_iv.bcast_own;
+ batadv_bit_get_packet(bat_priv, word, 1, 0);
+ w = &orig_ifinfo->bat_iv.bcast_own_sum;
+ *w = bitmap_weight(word,
+ BATADV_TQ_LOCAL_WINDOW_SIZE);
+ spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ }
}
rcu_read_unlock();
}
}
-static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
+/**
+ * batadv_iv_ogm_schedule_buff() - schedule submission of hardif ogm buffer
+ * @hard_iface: interface whose ogm buffer should be transmitted
+ */
+static void batadv_iv_ogm_schedule_buff(struct batadv_hard_iface *hard_iface)
{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
unsigned char **ogm_buff = &hard_iface->bat_iv.ogm_buff;
struct batadv_ogm_packet *batadv_ogm_packet;
struct batadv_hard_iface *primary_if, *tmp_hard_iface;
int *ogm_buff_len = &hard_iface->bat_iv.ogm_buff_len;
+ struct list_head *iter;
u32 seqno;
u16 tvlv_len = 0;
unsigned long send_time;
- if (hard_iface->if_status == BATADV_IF_NOT_IN_USE ||
- hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)
+ lockdep_assert_held(&hard_iface->bat_iv.ogm_buff_mutex);
+
+ /* interface already disabled by batadv_iv_ogm_iface_disable */
+ if (!*ogm_buff)
return;
/* the interface gets activated here to avoid race conditions between
@@ -979,10 +847,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
* interfaces.
*/
rcu_read_lock();
- list_for_each_entry_rcu(tmp_hard_iface, &batadv_hardif_list, list) {
- if (tmp_hard_iface->soft_iface != hard_iface->soft_iface)
- continue;
-
+ netdev_for_each_lower_private_rcu(hard_iface->mesh_iface, tmp_hard_iface, iter) {
if (!kref_get_unless_zero(&tmp_hard_iface->refcount))
continue;
@@ -995,14 +860,53 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
rcu_read_unlock();
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
+}
+
+static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
+{
+ if (hard_iface->if_status == BATADV_IF_NOT_IN_USE ||
+ hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)
+ return;
+
+ mutex_lock(&hard_iface->bat_iv.ogm_buff_mutex);
+ batadv_iv_ogm_schedule_buff(hard_iface);
+ mutex_unlock(&hard_iface->bat_iv.ogm_buff_mutex);
+}
+
+/**
+ * batadv_iv_orig_ifinfo_sum() - Get bcast_own sum for originator over interface
+ * @orig_node: originator which rebroadcasted the OGMs directly
+ * @if_outgoing: interface which transmitted the original OGM and received the
+ * direct rebroadcast
+ *
+ * Return: Number of replied (rebroadcasted) OGMs which were transmitted by
+ * an originator and directly (without intermediate hop) received by a specific
+ * interface
+ */
+static u8 batadv_iv_orig_ifinfo_sum(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ u8 sum;
+
+ orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_outgoing);
+ if (!orig_ifinfo)
+ return 0;
+
+ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ sum = orig_ifinfo->bat_iv.bcast_own_sum;
+ spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+
+ batadv_orig_ifinfo_put(orig_ifinfo);
+
+ return sum;
}
/**
* batadv_iv_ogm_orig_update() - use OGM to update corresponding data in an
* originator
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: the orig node who originally emitted the ogm packet
* @orig_ifinfo: ifinfo for the outgoing interface of the orig_node
* @ethhdr: Ethernet header of the OGM
@@ -1026,8 +930,6 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
struct batadv_neigh_node *neigh_node = NULL;
struct batadv_neigh_node *tmp_neigh_node = NULL;
struct batadv_neigh_node *router = NULL;
- struct batadv_orig_node *orig_node_tmp;
- unsigned int if_num;
u8 sum_orig, sum_neigh;
u8 *neigh_addr;
u8 tq_avg;
@@ -1132,18 +1034,10 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
*/
if (router_ifinfo &&
neigh_ifinfo->bat_iv.tq_avg == router_ifinfo->bat_iv.tq_avg) {
- orig_node_tmp = router->orig_node;
- spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
- if_num = router->if_incoming->if_num;
- sum_orig = orig_node_tmp->bat_iv.bcast_own_sum[if_num];
- spin_unlock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
-
- orig_node_tmp = neigh_node->orig_node;
- spin_lock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
- if_num = neigh_node->if_incoming->if_num;
- sum_neigh = orig_node_tmp->bat_iv.bcast_own_sum[if_num];
- spin_unlock_bh(&orig_node_tmp->bat_iv.ogm_cnt_lock);
-
+ sum_orig = batadv_iv_orig_ifinfo_sum(router->orig_node,
+ router->if_incoming);
+ sum_neigh = batadv_iv_orig_ifinfo_sum(neigh_node->orig_node,
+ neigh_node->if_incoming);
if (sum_orig >= sum_neigh)
goto out;
}
@@ -1154,14 +1048,10 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
unlock:
rcu_read_unlock();
out:
- if (neigh_node)
- batadv_neigh_node_put(neigh_node);
- if (router)
- batadv_neigh_node_put(router);
- if (neigh_ifinfo)
- batadv_neigh_ifinfo_put(neigh_ifinfo);
- if (router_ifinfo)
- batadv_neigh_ifinfo_put(router_ifinfo);
+ batadv_neigh_node_put(neigh_node);
+ batadv_neigh_node_put(router);
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
+ batadv_neigh_ifinfo_put(router_ifinfo);
}
/**
@@ -1180,16 +1070,15 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
struct batadv_hard_iface *if_incoming,
struct batadv_hard_iface *if_outgoing)
{
- struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node;
struct batadv_neigh_ifinfo *neigh_ifinfo;
u8 total_count;
u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own;
+ unsigned int tq_iface_hop_penalty = BATADV_TQ_MAX_VALUE;
unsigned int neigh_rq_inv_cube, neigh_rq_max_cube;
- unsigned int if_num;
unsigned int tq_asym_penalty, inv_asym_penalty;
unsigned int combined_tq;
- unsigned int tq_iface_penalty;
bool ret = false;
/* find corresponding one hop neighbor */
@@ -1227,9 +1116,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
orig_node->last_seen = jiffies;
/* find packet count of corresponding one hop neighbor */
- spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock);
- if_num = if_incoming->if_num;
- orig_eq_count = orig_neigh_node->bat_iv.bcast_own_sum[if_num];
+ orig_eq_count = batadv_iv_orig_ifinfo_sum(orig_neigh_node, if_incoming);
neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing);
if (neigh_ifinfo) {
neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count;
@@ -1237,7 +1124,6 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
} else {
neigh_rq_count = 0;
}
- spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock);
/* pay attention to not get a value bigger than 100 % */
if (orig_eq_count > neigh_rq_count)
@@ -1271,31 +1157,32 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
inv_asym_penalty = BATADV_TQ_MAX_VALUE * neigh_rq_inv_cube;
inv_asym_penalty /= neigh_rq_max_cube;
tq_asym_penalty = BATADV_TQ_MAX_VALUE - inv_asym_penalty;
+ tq_iface_hop_penalty -= atomic_read(&if_incoming->hop_penalty);
/* penalize if the OGM is forwarded on the same interface. WiFi
* interfaces and other half duplex devices suffer from throughput
* drops as they can't send and receive at the same time.
*/
- tq_iface_penalty = BATADV_TQ_MAX_VALUE;
if (if_outgoing && if_incoming == if_outgoing &&
batadv_is_wifi_hardif(if_outgoing))
- tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE,
- bat_priv);
+ tq_iface_hop_penalty = batadv_hop_penalty(tq_iface_hop_penalty,
+ bat_priv);
combined_tq = batadv_ogm_packet->tq *
tq_own *
tq_asym_penalty *
- tq_iface_penalty;
+ tq_iface_hop_penalty;
combined_tq /= BATADV_TQ_MAX_VALUE *
BATADV_TQ_MAX_VALUE *
BATADV_TQ_MAX_VALUE;
batadv_ogm_packet->tq = combined_tq;
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
- "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n",
+ "bidirectional: orig = %pM neigh = %pM => own_bcast = %2i, real recv = %2i, local tq: %3i, asym_penalty: %3i, iface_hop_penalty: %3i, total tq: %3i, if_incoming = %s, if_outgoing = %s\n",
orig_node->orig, orig_neigh_node->orig, total_count,
- neigh_rq_count, tq_own, tq_asym_penalty, tq_iface_penalty,
- batadv_ogm_packet->tq, if_incoming->net_dev->name,
+ neigh_rq_count, tq_own, tq_asym_penalty,
+ tq_iface_hop_penalty, batadv_ogm_packet->tq,
+ if_incoming->net_dev->name,
if_outgoing ? if_outgoing->net_dev->name : "DEFAULT");
/* if link has the minimum required transmission quality
@@ -1305,8 +1192,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
ret = true;
out:
- if (neigh_node)
- batadv_neigh_node_put(neigh_node);
+ batadv_neigh_node_put(neigh_node);
return ret;
}
@@ -1326,7 +1212,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
const struct batadv_hard_iface *if_incoming,
struct batadv_hard_iface *if_outgoing)
{
- struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
struct batadv_orig_node *orig_node;
struct batadv_orig_ifinfo *orig_ifinfo = NULL;
struct batadv_neigh_node *neigh_node;
@@ -1428,7 +1314,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
struct batadv_hard_iface *if_incoming,
struct batadv_hard_iface *if_outgoing)
{
- struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
struct batadv_hardif_neigh_node *hardif_neigh = NULL;
struct batadv_neigh_node *router = NULL;
struct batadv_neigh_node *router_router = NULL;
@@ -1519,10 +1405,6 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
if (!orig_neigh_node)
goto out;
- /* Update nc_nodes of the originator */
- batadv_nc_update_nc_node(bat_priv, orig_node, orig_neigh_node,
- ogm_packet, is_single_hop_neigh);
-
orig_neigh_router = batadv_orig_router_get(orig_neigh_node,
if_outgoing);
@@ -1607,30 +1489,68 @@ out_neigh:
if (orig_neigh_node && !is_single_hop_neigh)
batadv_orig_node_put(orig_neigh_node);
out:
- if (router_ifinfo)
- batadv_neigh_ifinfo_put(router_ifinfo);
- if (router)
- batadv_neigh_node_put(router);
- if (router_router)
- batadv_neigh_node_put(router_router);
- if (orig_neigh_router)
- batadv_neigh_node_put(orig_neigh_router);
- if (hardif_neigh)
- batadv_hardif_neigh_put(hardif_neigh);
+ batadv_neigh_ifinfo_put(router_ifinfo);
+ batadv_neigh_node_put(router);
+ batadv_neigh_node_put(router_router);
+ batadv_neigh_node_put(orig_neigh_router);
+ batadv_hardif_neigh_put(hardif_neigh);
consume_skb(skb_priv);
}
/**
+ * batadv_iv_ogm_process_reply() - Check OGM for direct reply and process it
+ * @ogm_packet: rebroadcast OGM packet to process
+ * @if_incoming: the interface where this packet was received
+ * @orig_node: originator which rebroadcasted the OGMs
+ * @if_incoming_seqno: OGM sequence number when rebroadcast was received
+ */
+static void batadv_iv_ogm_process_reply(struct batadv_ogm_packet *ogm_packet,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_orig_node *orig_node,
+ u32 if_incoming_seqno)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo;
+ s32 bit_pos;
+ u8 *weight;
+
+ /* neighbor has to indicate direct link and it has to
+ * come via the corresponding interface
+ */
+ if (!(ogm_packet->flags & BATADV_DIRECTLINK))
+ return;
+
+ if (!batadv_compare_eth(if_incoming->net_dev->dev_addr,
+ ogm_packet->orig))
+ return;
+
+ orig_ifinfo = batadv_orig_ifinfo_get(orig_node, if_incoming);
+ if (!orig_ifinfo)
+ return;
+
+ /* save packet seqno for bidirectional check */
+ spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+ bit_pos = if_incoming_seqno - 2;
+ bit_pos -= ntohl(ogm_packet->seqno);
+ batadv_set_bit(orig_ifinfo->bat_iv.bcast_own, bit_pos);
+ weight = &orig_ifinfo->bat_iv.bcast_own_sum;
+ *weight = bitmap_weight(orig_ifinfo->bat_iv.bcast_own,
+ BATADV_TQ_LOCAL_WINDOW_SIZE);
+ spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
+
+ batadv_orig_ifinfo_put(orig_ifinfo);
+}
+
+/**
* batadv_iv_ogm_process() - process an incoming batman iv OGM
* @skb: the skb containing the OGM
* @ogm_offset: offset to the OGM which should be processed (for aggregates)
- * @if_incoming: the interface where this packet was receved
+ * @if_incoming: the interface where this packet was received
*/
static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
struct batadv_hard_iface *if_incoming)
{
- struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
struct batadv_orig_node *orig_neigh_node, *orig_node;
struct batadv_hard_iface *hard_iface;
struct batadv_ogm_packet *ogm_packet;
@@ -1640,6 +1560,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
bool is_my_oldorig = false;
bool is_my_addr = false;
bool is_my_orig = false;
+ struct list_head *iter;
ogm_packet = (struct batadv_ogm_packet *)(skb->data + ogm_offset);
ethhdr = eth_hdr(skb);
@@ -1676,11 +1597,9 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
ogm_packet->version, has_directlink_flag);
rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
- if (hard_iface->if_status != BATADV_IF_ACTIVE)
- continue;
- if (hard_iface->soft_iface != if_incoming->soft_iface)
+ netdev_for_each_lower_private_rcu(if_incoming->mesh_iface, hard_iface, iter) {
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
continue;
if (batadv_compare_eth(ethhdr->h_source,
@@ -1705,37 +1624,13 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
}
if (is_my_orig) {
- unsigned long *word;
- size_t offset;
- s32 bit_pos;
- unsigned int if_num;
- u8 *weight;
-
orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv,
ethhdr->h_source);
if (!orig_neigh_node)
return;
- /* neighbor has to indicate direct link and it has to
- * come via the corresponding interface
- * save packet seqno for bidirectional check
- */
- if (has_directlink_flag &&
- batadv_compare_eth(if_incoming->net_dev->dev_addr,
- ogm_packet->orig)) {
- if_num = if_incoming->if_num;
- offset = if_num * BATADV_NUM_WORDS;
-
- spin_lock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock);
- word = &orig_neigh_node->bat_iv.bcast_own[offset];
- bit_pos = if_incoming_seqno - 2;
- bit_pos -= ntohl(ogm_packet->seqno);
- batadv_set_bit(word, bit_pos);
- weight = &orig_neigh_node->bat_iv.bcast_own_sum[if_num];
- *weight = bitmap_weight(word,
- BATADV_TQ_LOCAL_WINDOW_SIZE);
- spin_unlock_bh(&orig_neigh_node->bat_iv.ogm_cnt_lock);
- }
+ batadv_iv_ogm_process_reply(ogm_packet, if_incoming,
+ orig_neigh_node, if_incoming_seqno);
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"Drop packet: originator packet from myself (via neighbor)\n");
@@ -1765,13 +1660,10 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
if_incoming, BATADV_IF_DEFAULT);
rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
if (hard_iface->if_status != BATADV_IF_ACTIVE)
continue;
- if (hard_iface->soft_iface != bat_priv->soft_iface)
- continue;
-
if (!kref_get_unless_zero(&hard_iface->refcount))
continue;
@@ -1795,7 +1687,7 @@ static void batadv_iv_send_outstanding_bat_ogm_packet(struct work_struct *work)
delayed_work = to_delayed_work(work);
forw_packet = container_of(delayed_work, struct batadv_forw_packet,
delayed_work);
- bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface);
+ bat_priv = netdev_priv(forw_packet->if_incoming->mesh_iface);
if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) {
dropped = true;
@@ -1826,7 +1718,7 @@ out:
static int batadv_iv_ogm_receive(struct sk_buff *skb,
struct batadv_hard_iface *if_incoming)
{
- struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
struct batadv_ogm_packet *ogm_packet;
u8 *packet_pos;
int ogm_offset;
@@ -1852,7 +1744,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb,
/* unpack the aggregated packets and process them one by one */
while (batadv_iv_ogm_aggr_packet(ogm_offset, skb_headlen(skb),
- ogm_packet->tvlv_len)) {
+ ogm_packet)) {
batadv_iv_ogm_process(skb, ogm_offset, if_incoming);
ogm_offset += BATADV_OGM_HLEN;
@@ -1873,106 +1765,6 @@ free_skb:
return ret;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_iv_ogm_orig_print_neigh() - print neighbors for the originator table
- * @orig_node: the orig_node for which the neighbors are printed
- * @if_outgoing: outgoing interface for these entries
- * @seq: debugfs table seq_file struct
- *
- * Must be called while holding an rcu lock.
- */
-static void
-batadv_iv_ogm_orig_print_neigh(struct batadv_orig_node *orig_node,
- struct batadv_hard_iface *if_outgoing,
- struct seq_file *seq)
-{
- struct batadv_neigh_node *neigh_node;
- struct batadv_neigh_ifinfo *n_ifinfo;
-
- hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
- n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
- if (!n_ifinfo)
- continue;
-
- seq_printf(seq, " %pM (%3i)",
- neigh_node->addr,
- n_ifinfo->bat_iv.tq_avg);
-
- batadv_neigh_ifinfo_put(n_ifinfo);
- }
-}
-
-/**
- * batadv_iv_ogm_orig_print() - print the originator table
- * @bat_priv: the bat priv with all the soft interface information
- * @seq: debugfs table seq_file struct
- * @if_outgoing: the outgoing interface for which this should be printed
- */
-static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv,
- struct seq_file *seq,
- struct batadv_hard_iface *if_outgoing)
-{
- struct batadv_neigh_node *neigh_node;
- struct batadv_hashtable *hash = bat_priv->orig_hash;
- int last_seen_msecs, last_seen_secs;
- struct batadv_orig_node *orig_node;
- struct batadv_neigh_ifinfo *n_ifinfo;
- unsigned long last_seen_jiffies;
- struct hlist_head *head;
- int batman_count = 0;
- u32 i;
-
- seq_puts(seq,
- " Originator last-seen (#/255) Nexthop [outgoingIF]: Potential nexthops ...\n");
-
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
- neigh_node = batadv_orig_router_get(orig_node,
- if_outgoing);
- if (!neigh_node)
- continue;
-
- n_ifinfo = batadv_neigh_ifinfo_get(neigh_node,
- if_outgoing);
- if (!n_ifinfo)
- goto next;
-
- if (n_ifinfo->bat_iv.tq_avg == 0)
- goto next;
-
- last_seen_jiffies = jiffies - orig_node->last_seen;
- last_seen_msecs = jiffies_to_msecs(last_seen_jiffies);
- last_seen_secs = last_seen_msecs / 1000;
- last_seen_msecs = last_seen_msecs % 1000;
-
- seq_printf(seq, "%pM %4i.%03is (%3i) %pM [%10s]:",
- orig_node->orig, last_seen_secs,
- last_seen_msecs, n_ifinfo->bat_iv.tq_avg,
- neigh_node->addr,
- neigh_node->if_incoming->net_dev->name);
-
- batadv_iv_ogm_orig_print_neigh(orig_node, if_outgoing,
- seq);
- seq_putc(seq, '\n');
- batman_count++;
-
-next:
- batadv_neigh_node_put(neigh_node);
- if (n_ifinfo)
- batadv_neigh_ifinfo_put(n_ifinfo);
- }
- rcu_read_unlock();
- }
-
- if (batman_count == 0)
- seq_puts(seq, "No batman nodes in range ...\n");
-}
-#endif
-
/**
* batadv_iv_ogm_neigh_get_tq_avg() - Get the TQ average for a neighbour on a
* given outgoing interface.
@@ -2005,7 +1797,7 @@ batadv_iv_ogm_neigh_get_tq_avg(struct batadv_neigh_node *neigh_node,
* @msg: Netlink message to dump into
* @portid: Port making netlink request
* @seq: Sequence number of netlink message
- * @bat_priv: The bat priv with all the soft interface information
+ * @bat_priv: The bat priv with all the mesh interface information
* @if_outgoing: Limit dump to entries with this outgoing interface
* @orig_node: Originator to dump
* @neigh_node: Single hops neighbour
@@ -2043,6 +1835,8 @@ batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
orig_node->orig) ||
nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
neigh_node->addr) ||
+ nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+ neigh_node->if_incoming->net_dev->name) ||
nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
neigh_node->if_incoming->net_dev->ifindex) ||
nla_put_u8(msg, BATADV_ATTR_TQ, tq_avg) ||
@@ -2066,7 +1860,7 @@ batadv_iv_ogm_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
* @msg: Netlink message to dump into
* @portid: Port making netlink request
* @seq: Sequence number of netlink message
- * @bat_priv: The bat priv with all the soft interface information
+ * @bat_priv: The bat priv with all the mesh interface information
* @if_outgoing: Limit dump to entries with this outgoing interface
* @orig_node: Originator to dump
* @sub_s: Number of sub entries to skip
@@ -2116,8 +1910,7 @@ batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
}
out:
- if (neigh_node_best)
- batadv_neigh_node_put(neigh_node_best);
+ batadv_neigh_node_put(neigh_node_best);
*sub_s = 0;
return 0;
@@ -2129,7 +1922,7 @@ batadv_iv_ogm_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
* @msg: Netlink message to dump into
* @portid: Port making netlink request
* @seq: Sequence number of netlink message
- * @bat_priv: The bat priv with all the soft interface information
+ * @bat_priv: The bat priv with all the mesh interface information
* @if_outgoing: Limit dump to entries with this outgoing interface
* @head: Bucket to be dumped
* @idx_s: Number of entries to be skipped
@@ -2170,7 +1963,7 @@ batadv_iv_ogm_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
* batadv_iv_ogm_orig_dump() - Dump the originators into a message
* @msg: Netlink message to dump into
* @cb: Control block containing additional options
- * @bat_priv: The bat priv with all the soft interface information
+ * @bat_priv: The bat priv with all the mesh interface information
* @if_outgoing: Limit dump to entries with this outgoing interface
*/
static void
@@ -2202,59 +1995,6 @@ batadv_iv_ogm_orig_dump(struct sk_buff *msg, struct netlink_callback *cb,
cb->args[2] = sub;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_iv_hardif_neigh_print() - print a single hop neighbour node
- * @seq: neighbour table seq_file struct
- * @hardif_neigh: hardif neighbour information
- */
-static void
-batadv_iv_hardif_neigh_print(struct seq_file *seq,
- struct batadv_hardif_neigh_node *hardif_neigh)
-{
- int last_secs, last_msecs;
-
- last_secs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) / 1000;
- last_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) % 1000;
-
- seq_printf(seq, " %10s %pM %4i.%03is\n",
- hardif_neigh->if_incoming->net_dev->name,
- hardif_neigh->addr, last_secs, last_msecs);
-}
-
-/**
- * batadv_iv_ogm_neigh_print() - print the single hop neighbour list
- * @bat_priv: the bat priv with all the soft interface information
- * @seq: neighbour table seq_file struct
- */
-static void batadv_iv_neigh_print(struct batadv_priv *bat_priv,
- struct seq_file *seq)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_hardif_neigh_node *hardif_neigh;
- struct batadv_hard_iface *hard_iface;
- int batman_count = 0;
-
- seq_puts(seq, " IF Neighbor last-seen\n");
-
- rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
- if (hard_iface->soft_iface != net_dev)
- continue;
-
- hlist_for_each_entry_rcu(hardif_neigh,
- &hard_iface->neigh_list, list) {
- batadv_iv_hardif_neigh_print(seq, hardif_neigh);
- batman_count++;
- }
- }
- rcu_read_unlock();
-
- if (batman_count == 0)
- seq_puts(seq, "No batman nodes in range ...\n");
-}
-#endif
-
/**
* batadv_iv_ogm_neigh_diff() - calculate tq difference of two neighbors
* @neigh1: the first neighbor object of the comparison
@@ -2292,10 +2032,8 @@ static bool batadv_iv_ogm_neigh_diff(struct batadv_neigh_node *neigh1,
*diff = (int)tq1 - (int)tq2;
out:
- if (neigh1_ifinfo)
- batadv_neigh_ifinfo_put(neigh1_ifinfo);
- if (neigh2_ifinfo)
- batadv_neigh_ifinfo_put(neigh2_ifinfo);
+ batadv_neigh_ifinfo_put(neigh1_ifinfo);
+ batadv_neigh_ifinfo_put(neigh2_ifinfo);
return ret;
}
@@ -2325,6 +2063,8 @@ batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq,
if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
hardif_neigh->addr) ||
+ nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+ hardif_neigh->if_incoming->net_dev->name) ||
nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
hardif_neigh->if_incoming->net_dev->ifindex) ||
nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
@@ -2345,7 +2085,7 @@ batadv_iv_ogm_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq,
* @msg: Netlink message to dump into
* @portid: Port making netlink request
* @seq: Sequence number of netlink message
- * @bat_priv: The bat priv with all the soft interface information
+ * @bat_priv: The bat priv with all the mesh interface information
* @hard_iface: Hard interface to dump the neighbours for
* @idx_s: Number of entries to skip
*
@@ -2382,8 +2122,8 @@ batadv_iv_ogm_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq,
* batadv_iv_ogm_neigh_dump() - Dump the neighbours into a message
* @msg: Netlink message to dump into
* @cb: Control block containing additional options
- * @bat_priv: The bat priv with all the soft interface information
- * @single_hardif: Limit dump to this hard interfaace
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @single_hardif: Limit dump to this hard interface
*/
static void
batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
@@ -2391,6 +2131,7 @@ batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
struct batadv_hard_iface *single_hardif)
{
struct batadv_hard_iface *hard_iface;
+ struct list_head *iter;
int i_hardif = 0;
int i_hardif_s = cb->args[0];
int idx = cb->args[1];
@@ -2407,11 +2148,7 @@ batadv_iv_ogm_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
i_hardif++;
}
} else {
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list,
- list) {
- if (hard_iface->soft_iface != bat_priv->soft_iface)
- continue;
-
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
if (i_hardif++ < i_hardif_s)
continue;
@@ -2485,7 +2222,7 @@ batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1,
return ret;
}
-static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface)
+static void batadv_iv_iface_enabled(struct batadv_hard_iface *hard_iface)
{
/* begin scheduling originator messages on that interface */
batadv_iv_ogm_schedule(hard_iface);
@@ -2493,7 +2230,7 @@ static void batadv_iv_iface_activate(struct batadv_hard_iface *hard_iface)
/**
* batadv_iv_init_sel_class() - initialize GW selection class
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
static void batadv_iv_init_sel_class(struct batadv_priv *bat_priv)
{
@@ -2540,8 +2277,7 @@ batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
if (tmp_gw_factor > max_gw_factor ||
(tmp_gw_factor == max_gw_factor &&
tq_avg > max_tq)) {
- if (curr_gw)
- batadv_gw_node_put(curr_gw);
+ batadv_gw_node_put(curr_gw);
curr_gw = gw_node;
kref_get(&curr_gw->refcount);
}
@@ -2555,8 +2291,7 @@ batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
* $routing_class more tq points)
*/
if (tq_avg > max_tq) {
- if (curr_gw)
- batadv_gw_node_put(curr_gw);
+ batadv_gw_node_put(curr_gw);
curr_gw = gw_node;
kref_get(&curr_gw->refcount);
}
@@ -2573,8 +2308,7 @@ batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
next:
batadv_neigh_node_put(router);
- if (router_ifinfo)
- batadv_neigh_ifinfo_put(router_ifinfo);
+ batadv_neigh_ifinfo_put(router_ifinfo);
}
rcu_read_unlock();
@@ -2638,95 +2372,26 @@ static bool batadv_iv_gw_is_eligible(struct batadv_priv *bat_priv,
ret = true;
out:
- if (router_gw_ifinfo)
- batadv_neigh_ifinfo_put(router_gw_ifinfo);
- if (router_orig_ifinfo)
- batadv_neigh_ifinfo_put(router_orig_ifinfo);
- if (router_gw)
- batadv_neigh_node_put(router_gw);
- if (router_orig)
- batadv_neigh_node_put(router_orig);
-
- return ret;
-}
-
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/* fails if orig_node has no router */
-static int batadv_iv_gw_write_buffer_text(struct batadv_priv *bat_priv,
- struct seq_file *seq,
- const struct batadv_gw_node *gw_node)
-{
- struct batadv_gw_node *curr_gw;
- struct batadv_neigh_node *router;
- struct batadv_neigh_ifinfo *router_ifinfo = NULL;
- int ret = -1;
-
- router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT);
- if (!router)
- goto out;
-
- router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
- if (!router_ifinfo)
- goto out;
-
- curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+ batadv_neigh_ifinfo_put(router_gw_ifinfo);
+ batadv_neigh_ifinfo_put(router_orig_ifinfo);
+ batadv_neigh_node_put(router_gw);
+ batadv_neigh_node_put(router_orig);
- seq_printf(seq, "%s %pM (%3i) %pM [%10s]: %u.%u/%u.%u MBit\n",
- (curr_gw == gw_node ? "=>" : " "),
- gw_node->orig_node->orig,
- router_ifinfo->bat_iv.tq_avg, router->addr,
- router->if_incoming->net_dev->name,
- gw_node->bandwidth_down / 10,
- gw_node->bandwidth_down % 10,
- gw_node->bandwidth_up / 10,
- gw_node->bandwidth_up % 10);
- ret = seq_has_overflowed(seq) ? -1 : 0;
-
- if (curr_gw)
- batadv_gw_node_put(curr_gw);
-out:
- if (router_ifinfo)
- batadv_neigh_ifinfo_put(router_ifinfo);
- if (router)
- batadv_neigh_node_put(router);
return ret;
}
-static void batadv_iv_gw_print(struct batadv_priv *bat_priv,
- struct seq_file *seq)
-{
- struct batadv_gw_node *gw_node;
- int gw_count = 0;
-
- seq_puts(seq,
- " Gateway (#/255) Nexthop [outgoingIF]: advertised uplink bandwidth\n");
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
- /* fails if orig_node has no router */
- if (batadv_iv_gw_write_buffer_text(bat_priv, seq, gw_node) < 0)
- continue;
-
- gw_count++;
- }
- rcu_read_unlock();
-
- if (gw_count == 0)
- seq_puts(seq, "No gateways in range ...\n");
-}
-#endif
-
/**
* batadv_iv_gw_dump_entry() - Dump a gateway into a message
* @msg: Netlink message to dump into
* @portid: Port making netlink request
- * @seq: Sequence number of netlink message
- * @bat_priv: The bat priv with all the soft interface information
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the mesh interface information
* @gw_node: Gateway to be dumped
*
* Return: Error code, or 0 on success
*/
-static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_priv *bat_priv,
struct batadv_gw_node *gw_node)
{
@@ -2746,13 +2411,16 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
- hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
- NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS);
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_GATEWAYS);
if (!hdr) {
ret = -ENOBUFS;
goto out;
}
+ genl_dump_check_consistent(cb, hdr);
+
ret = -EMSGSIZE;
if (curr_gw == gw_node)
@@ -2768,6 +2436,8 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
router->addr) ||
nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
router->if_incoming->net_dev->name) ||
+ nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+ router->if_incoming->net_dev->ifindex) ||
nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN,
gw_node->bandwidth_down) ||
nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_UP,
@@ -2780,12 +2450,9 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
ret = 0;
out:
- if (curr_gw)
- batadv_gw_node_put(curr_gw);
- if (router_ifinfo)
- batadv_neigh_ifinfo_put(router_ifinfo);
- if (router)
- batadv_neigh_node_put(router);
+ batadv_gw_node_put(curr_gw);
+ batadv_neigh_ifinfo_put(router_ifinfo);
+ batadv_neigh_node_put(router);
return ret;
}
@@ -2793,7 +2460,7 @@ out:
* batadv_iv_gw_dump() - Dump gateways into a message
* @msg: Netlink message to dump into
* @cb: Control block containing additional options
- * @bat_priv: The bat priv with all the soft interface information
+ * @bat_priv: The bat priv with all the mesh interface information
*/
static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
struct batadv_priv *bat_priv)
@@ -2803,13 +2470,15 @@ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
int idx_skip = cb->args[0];
int idx = 0;
- rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
+ spin_lock_bh(&bat_priv->gw.list_lock);
+ cb->seq = bat_priv->gw.generation << 1 | 1;
+
+ hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) {
if (idx++ < idx_skip)
continue;
- if (batadv_iv_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq,
- bat_priv, gw_node)) {
+ if (batadv_iv_gw_dump_entry(msg, portid, cb, bat_priv,
+ gw_node)) {
idx_skip = idx - 1;
goto unlock;
}
@@ -2817,7 +2486,7 @@ static void batadv_iv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
idx_skip = idx;
unlock:
- rcu_read_unlock();
+ spin_unlock_bh(&bat_priv->gw.list_lock);
cb->args[0] = idx_skip;
}
@@ -2825,8 +2494,8 @@ unlock:
static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
.name = "BATMAN_IV",
.iface = {
- .activate = batadv_iv_iface_activate,
.enable = batadv_iv_ogm_iface_enable,
+ .enabled = batadv_iv_iface_enabled,
.disable = batadv_iv_ogm_iface_disable,
.update_mac = batadv_iv_ogm_iface_update_mac,
.primary_set = batadv_iv_ogm_primary_iface_set,
@@ -2834,27 +2503,16 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
.neigh = {
.cmp = batadv_iv_ogm_neigh_cmp,
.is_similar_or_better = batadv_iv_ogm_neigh_is_sob,
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- .print = batadv_iv_neigh_print,
-#endif
.dump = batadv_iv_ogm_neigh_dump,
},
.orig = {
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- .print = batadv_iv_ogm_orig_print,
-#endif
.dump = batadv_iv_ogm_orig_dump,
- .free = batadv_iv_ogm_orig_free,
- .add_if = batadv_iv_ogm_orig_add_if,
- .del_if = batadv_iv_ogm_orig_del_if,
},
.gw = {
.init_sel_class = batadv_iv_init_sel_class,
+ .sel_class_max = BATADV_TQ_MAX_VALUE,
.get_best_gw_node = batadv_iv_gw_get_best_gw_node,
.is_eligible = batadv_iv_gw_is_eligible,
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- .print = batadv_iv_gw_print,
-#endif
.dump = batadv_iv_gw_dump,
},
};
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
index 3dc6a7a43eb7..04b01bd684e8 100644
--- a/net/batman-adv/bat_iv_ogm.h
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_BAT_IV_OGM_H_
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index 6baec4e68898..de9444714264 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Linus Lüssing, Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "bat_v.h"
@@ -25,13 +13,16 @@
#include <linux/if_ether.h>
#include <linux/init.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/minmax.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/workqueue.h>
@@ -44,18 +35,15 @@
#include "bat_v_elp.h"
#include "bat_v_ogm.h"
#include "gateway_client.h"
-#include "gateway_common.h"
#include "hard-interface.h"
#include "hash.h"
#include "log.h"
#include "netlink.h"
#include "originator.h"
-struct sk_buff;
-
static void batadv_v_iface_activate(struct batadv_hard_iface *hard_iface)
{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
struct batadv_hard_iface *primary_if;
primary_if = batadv_primary_if_get_selected(bat_priv);
@@ -90,6 +78,7 @@ static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface)
static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface)
{
+ batadv_v_ogm_iface_disable(hard_iface);
batadv_v_elp_iface_disable(hard_iface);
}
@@ -108,7 +97,7 @@ static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface)
*/
static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface)
{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
struct batadv_hard_iface *primary_if;
primary_if = batadv_primary_if_get_selected(bat_priv);
@@ -117,105 +106,16 @@ static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface)
batadv_v_primary_iface_set(hard_iface);
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
}
static void
batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh)
{
ewma_throughput_init(&hardif_neigh->bat_v.throughput);
- INIT_WORK(&hardif_neigh->bat_v.metric_work,
- batadv_v_elp_throughput_metric_update);
-}
-
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_v_orig_print_neigh() - print neighbors for the originator table
- * @orig_node: the orig_node for which the neighbors are printed
- * @if_outgoing: outgoing interface for these entries
- * @seq: debugfs table seq_file struct
- *
- * Must be called while holding an rcu lock.
- */
-static void
-batadv_v_orig_print_neigh(struct batadv_orig_node *orig_node,
- struct batadv_hard_iface *if_outgoing,
- struct seq_file *seq)
-{
- struct batadv_neigh_node *neigh_node;
- struct batadv_neigh_ifinfo *n_ifinfo;
-
- hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
- n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
- if (!n_ifinfo)
- continue;
-
- seq_printf(seq, " %pM (%9u.%1u)",
- neigh_node->addr,
- n_ifinfo->bat_v.throughput / 10,
- n_ifinfo->bat_v.throughput % 10);
-
- batadv_neigh_ifinfo_put(n_ifinfo);
- }
}
/**
- * batadv_v_hardif_neigh_print() - print a single ELP neighbour node
- * @seq: neighbour table seq_file struct
- * @hardif_neigh: hardif neighbour information
- */
-static void
-batadv_v_hardif_neigh_print(struct seq_file *seq,
- struct batadv_hardif_neigh_node *hardif_neigh)
-{
- int last_secs, last_msecs;
- u32 throughput;
-
- last_secs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) / 1000;
- last_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) % 1000;
- throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput);
-
- seq_printf(seq, "%pM %4i.%03is (%9u.%1u) [%10s]\n",
- hardif_neigh->addr, last_secs, last_msecs, throughput / 10,
- throughput % 10, hardif_neigh->if_incoming->net_dev->name);
-}
-
-/**
- * batadv_v_neigh_print() - print the single hop neighbour list
- * @bat_priv: the bat priv with all the soft interface information
- * @seq: neighbour table seq_file struct
- */
-static void batadv_v_neigh_print(struct batadv_priv *bat_priv,
- struct seq_file *seq)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_hardif_neigh_node *hardif_neigh;
- struct batadv_hard_iface *hard_iface;
- int batman_count = 0;
-
- seq_puts(seq,
- " Neighbor last-seen ( throughput) [ IF]\n");
-
- rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
- if (hard_iface->soft_iface != net_dev)
- continue;
-
- hlist_for_each_entry_rcu(hardif_neigh,
- &hard_iface->neigh_list, list) {
- batadv_v_hardif_neigh_print(seq, hardif_neigh);
- batman_count++;
- }
- }
- rcu_read_unlock();
-
- if (batman_count == 0)
- seq_puts(seq, "No batman nodes in range ...\n");
-}
-#endif
-
-/**
* batadv_v_neigh_dump_neigh() - Dump a neighbour into a message
* @msg: Netlink message to dump into
* @portid: Port making netlink request
@@ -243,6 +143,8 @@ batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq,
if (nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
hardif_neigh->addr) ||
+ nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+ hardif_neigh->if_incoming->net_dev->name) ||
nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
hardif_neigh->if_incoming->net_dev->ifindex) ||
nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS,
@@ -264,7 +166,7 @@ batadv_v_neigh_dump_neigh(struct sk_buff *msg, u32 portid, u32 seq,
* @msg: Netlink message to dump into
* @portid: Port making netlink request
* @seq: Sequence number of netlink message
- * @bat_priv: The bat priv with all the soft interface information
+ * @bat_priv: The bat priv with all the mesh interface information
* @hard_iface: The hard interface to be dumped
* @idx_s: Entries to be skipped
*
@@ -301,7 +203,7 @@ batadv_v_neigh_dump_hardif(struct sk_buff *msg, u32 portid, u32 seq,
* message
* @msg: Netlink message to dump into
* @cb: Control block containing additional options
- * @bat_priv: The bat priv with all the soft interface information
+ * @bat_priv: The bat priv with all the mesh interface information
* @single_hardif: Limit dumping to this hard interface
*/
static void
@@ -310,6 +212,7 @@ batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
struct batadv_hard_iface *single_hardif)
{
struct batadv_hard_iface *hard_iface;
+ struct list_head *iter;
int i_hardif = 0;
int i_hardif_s = cb->args[0];
int idx = cb->args[1];
@@ -325,10 +228,7 @@ batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
i_hardif++;
}
} else {
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
- if (hard_iface->soft_iface != bat_priv->soft_iface)
- continue;
-
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
if (i_hardif++ < i_hardif_s)
continue;
@@ -347,81 +247,12 @@ batadv_v_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb,
cb->args[1] = idx;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_v_orig_print() - print the originator table
- * @bat_priv: the bat priv with all the soft interface information
- * @seq: debugfs table seq_file struct
- * @if_outgoing: the outgoing interface for which this should be printed
- */
-static void batadv_v_orig_print(struct batadv_priv *bat_priv,
- struct seq_file *seq,
- struct batadv_hard_iface *if_outgoing)
-{
- struct batadv_neigh_node *neigh_node;
- struct batadv_hashtable *hash = bat_priv->orig_hash;
- int last_seen_msecs, last_seen_secs;
- struct batadv_orig_node *orig_node;
- struct batadv_neigh_ifinfo *n_ifinfo;
- unsigned long last_seen_jiffies;
- struct hlist_head *head;
- int batman_count = 0;
- u32 i;
-
- seq_puts(seq,
- " Originator last-seen ( throughput) Nexthop [outgoingIF]: Potential nexthops ...\n");
-
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
- neigh_node = batadv_orig_router_get(orig_node,
- if_outgoing);
- if (!neigh_node)
- continue;
-
- n_ifinfo = batadv_neigh_ifinfo_get(neigh_node,
- if_outgoing);
- if (!n_ifinfo)
- goto next;
-
- last_seen_jiffies = jiffies - orig_node->last_seen;
- last_seen_msecs = jiffies_to_msecs(last_seen_jiffies);
- last_seen_secs = last_seen_msecs / 1000;
- last_seen_msecs = last_seen_msecs % 1000;
-
- seq_printf(seq, "%pM %4i.%03is (%9u.%1u) %pM [%10s]:",
- orig_node->orig, last_seen_secs,
- last_seen_msecs,
- n_ifinfo->bat_v.throughput / 10,
- n_ifinfo->bat_v.throughput % 10,
- neigh_node->addr,
- neigh_node->if_incoming->net_dev->name);
-
- batadv_v_orig_print_neigh(orig_node, if_outgoing, seq);
- seq_putc(seq, '\n');
- batman_count++;
-
-next:
- batadv_neigh_node_put(neigh_node);
- if (n_ifinfo)
- batadv_neigh_ifinfo_put(n_ifinfo);
- }
- rcu_read_unlock();
- }
-
- if (batman_count == 0)
- seq_puts(seq, "No batman nodes in range ...\n");
-}
-#endif
-
/**
* batadv_v_orig_dump_subentry() - Dump an originator subentry into a message
* @msg: Netlink message to dump into
* @portid: Port making netlink request
* @seq: Sequence number of netlink message
- * @bat_priv: The bat priv with all the soft interface information
+ * @bat_priv: The bat priv with all the mesh interface information
* @if_outgoing: Limit dump to entries with this outgoing interface
* @orig_node: Originator to dump
* @neigh_node: Single hops neighbour
@@ -464,6 +295,8 @@ batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig) ||
nla_put(msg, BATADV_ATTR_NEIGH_ADDRESS, ETH_ALEN,
neigh_node->addr) ||
+ nla_put_string(msg, BATADV_ATTR_HARD_IFNAME,
+ neigh_node->if_incoming->net_dev->name) ||
nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
neigh_node->if_incoming->net_dev->ifindex) ||
nla_put_u32(msg, BATADV_ATTR_THROUGHPUT, throughput) ||
@@ -487,7 +320,7 @@ batadv_v_orig_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
* @msg: Netlink message to dump into
* @portid: Port making netlink request
* @seq: Sequence number of netlink message
- * @bat_priv: The bat priv with all the soft interface information
+ * @bat_priv: The bat priv with all the mesh interface information
* @if_outgoing: Limit dump to entries with this outgoing interface
* @orig_node: Originator to dump
* @sub_s: Number of sub entries to skip
@@ -528,8 +361,7 @@ batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
}
out:
- if (neigh_node_best)
- batadv_neigh_node_put(neigh_node_best);
+ batadv_neigh_node_put(neigh_node_best);
*sub_s = 0;
return 0;
@@ -540,7 +372,7 @@ batadv_v_orig_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
* @msg: Netlink message to dump into
* @portid: Port making netlink request
* @seq: Sequence number of netlink message
- * @bat_priv: The bat priv with all the soft interface information
+ * @bat_priv: The bat priv with all the mesh interface information
* @if_outgoing: Limit dump to entries with this outgoing interface
* @head: Bucket to be dumped
* @idx_s: Number of entries to be skipped
@@ -580,7 +412,7 @@ batadv_v_orig_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
* batadv_v_orig_dump() - Dump the originators into a message
* @msg: Netlink message to dump into
* @cb: Control block containing additional options
- * @bat_priv: The bat priv with all the soft interface information
+ * @bat_priv: The bat priv with all the mesh interface information
* @if_outgoing: Limit dump to entries with this outgoing interface
*/
static void
@@ -668,7 +500,7 @@ err_ifinfo1:
/**
* batadv_v_init_sel_class() - initialize GW selection class
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
static void batadv_v_init_sel_class(struct batadv_priv *bat_priv)
{
@@ -676,32 +508,6 @@ static void batadv_v_init_sel_class(struct batadv_priv *bat_priv)
atomic_set(&bat_priv->gw.sel_class, 50);
}
-static ssize_t batadv_v_store_sel_class(struct batadv_priv *bat_priv,
- char *buff, size_t count)
-{
- u32 old_class, class;
-
- if (!batadv_parse_throughput(bat_priv->soft_iface, buff,
- "B.A.T.M.A.N. V GW selection class",
- &class))
- return -EINVAL;
-
- old_class = atomic_read(&bat_priv->gw.sel_class);
- atomic_set(&bat_priv->gw.sel_class, class);
-
- if (old_class != class)
- batadv_gw_reselect(bat_priv);
-
- return count;
-}
-
-static ssize_t batadv_v_show_sel_class(struct batadv_priv *bat_priv, char *buff)
-{
- u32 class = atomic_read(&bat_priv->gw.sel_class);
-
- return sprintf(buff, "%u.%u MBit\n", class / 10, class % 10);
-}
-
/**
* batadv_v_gw_throughput_get() - retrieve the GW-bandwidth for a given GW
* @gw_node: the GW to retrieve the metric for
@@ -737,17 +543,15 @@ static int batadv_v_gw_throughput_get(struct batadv_gw_node *gw_node, u32 *bw)
ret = 0;
out:
- if (router)
- batadv_neigh_node_put(router);
- if (router_ifinfo)
- batadv_neigh_ifinfo_put(router_ifinfo);
+ batadv_neigh_node_put(router);
+ batadv_neigh_ifinfo_put(router_ifinfo);
return ret;
}
/**
* batadv_v_gw_get_best_gw_node() - retrieve the best GW node
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
* Return: the GW node having the best GW-metric, NULL if no GW is known
*/
@@ -768,8 +572,7 @@ batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv)
if (curr_gw && bw <= max_bw)
goto next;
- if (curr_gw)
- batadv_gw_node_put(curr_gw);
+ batadv_gw_node_put(curr_gw);
curr_gw = gw_node;
kref_get(&curr_gw->refcount);
@@ -785,7 +588,7 @@ next:
/**
* batadv_v_gw_is_eligible() - check if a originator would be selected as GW
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @curr_gw_orig: originator representing the currently selected GW
* @orig_node: the originator representing the new candidate
*
@@ -831,97 +634,24 @@ static bool batadv_v_gw_is_eligible(struct batadv_priv *bat_priv,
ret = true;
out:
- if (curr_gw)
- batadv_gw_node_put(curr_gw);
- if (orig_gw)
- batadv_gw_node_put(orig_gw);
-
- return ret;
-}
-
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/* fails if orig_node has no router */
-static int batadv_v_gw_write_buffer_text(struct batadv_priv *bat_priv,
- struct seq_file *seq,
- const struct batadv_gw_node *gw_node)
-{
- struct batadv_gw_node *curr_gw;
- struct batadv_neigh_node *router;
- struct batadv_neigh_ifinfo *router_ifinfo = NULL;
- int ret = -1;
-
- router = batadv_orig_router_get(gw_node->orig_node, BATADV_IF_DEFAULT);
- if (!router)
- goto out;
-
- router_ifinfo = batadv_neigh_ifinfo_get(router, BATADV_IF_DEFAULT);
- if (!router_ifinfo)
- goto out;
-
- curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
+ batadv_gw_node_put(curr_gw);
+ batadv_gw_node_put(orig_gw);
- seq_printf(seq, "%s %pM (%9u.%1u) %pM [%10s]: %u.%u/%u.%u MBit\n",
- (curr_gw == gw_node ? "=>" : " "),
- gw_node->orig_node->orig,
- router_ifinfo->bat_v.throughput / 10,
- router_ifinfo->bat_v.throughput % 10, router->addr,
- router->if_incoming->net_dev->name,
- gw_node->bandwidth_down / 10,
- gw_node->bandwidth_down % 10,
- gw_node->bandwidth_up / 10,
- gw_node->bandwidth_up % 10);
- ret = seq_has_overflowed(seq) ? -1 : 0;
-
- if (curr_gw)
- batadv_gw_node_put(curr_gw);
-out:
- if (router_ifinfo)
- batadv_neigh_ifinfo_put(router_ifinfo);
- if (router)
- batadv_neigh_node_put(router);
return ret;
}
/**
- * batadv_v_gw_print() - print the gateway list
- * @bat_priv: the bat priv with all the soft interface information
- * @seq: gateway table seq_file struct
- */
-static void batadv_v_gw_print(struct batadv_priv *bat_priv,
- struct seq_file *seq)
-{
- struct batadv_gw_node *gw_node;
- int gw_count = 0;
-
- seq_puts(seq,
- " Gateway ( throughput) Nexthop [outgoingIF]: advertised uplink bandwidth\n");
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
- /* fails if orig_node has no router */
- if (batadv_v_gw_write_buffer_text(bat_priv, seq, gw_node) < 0)
- continue;
-
- gw_count++;
- }
- rcu_read_unlock();
-
- if (gw_count == 0)
- seq_puts(seq, "No gateways in range ...\n");
-}
-#endif
-
-/**
* batadv_v_gw_dump_entry() - Dump a gateway into a message
* @msg: Netlink message to dump into
* @portid: Port making netlink request
- * @seq: Sequence number of netlink message
- * @bat_priv: The bat priv with all the soft interface information
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the mesh interface information
* @gw_node: Gateway to be dumped
*
* Return: Error code, or 0 on success
*/
-static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_priv *bat_priv,
struct batadv_gw_node *gw_node)
{
@@ -941,13 +671,16 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
- hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
- NLM_F_MULTI, BATADV_CMD_GET_GATEWAYS);
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_GATEWAYS);
if (!hdr) {
ret = -ENOBUFS;
goto out;
}
+ genl_dump_check_consistent(cb, hdr);
+
ret = -EMSGSIZE;
if (curr_gw == gw_node) {
@@ -980,6 +713,12 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
goto out;
}
+ if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
+ router->if_incoming->net_dev->ifindex)) {
+ genlmsg_cancel(msg, hdr);
+ goto out;
+ }
+
if (nla_put_u32(msg, BATADV_ATTR_BANDWIDTH_DOWN,
gw_node->bandwidth_down)) {
genlmsg_cancel(msg, hdr);
@@ -995,12 +734,9 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
ret = 0;
out:
- if (curr_gw)
- batadv_gw_node_put(curr_gw);
- if (router_ifinfo)
- batadv_neigh_ifinfo_put(router_ifinfo);
- if (router)
- batadv_neigh_node_put(router);
+ batadv_gw_node_put(curr_gw);
+ batadv_neigh_ifinfo_put(router_ifinfo);
+ batadv_neigh_node_put(router);
return ret;
}
@@ -1008,7 +744,7 @@ out:
* batadv_v_gw_dump() - Dump gateways into a message
* @msg: Netlink message to dump into
* @cb: Control block containing additional options
- * @bat_priv: The bat priv with all the soft interface information
+ * @bat_priv: The bat priv with all the mesh interface information
*/
static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
struct batadv_priv *bat_priv)
@@ -1018,13 +754,15 @@ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
int idx_skip = cb->args[0];
int idx = 0;
- rcu_read_lock();
- hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.gateway_list, list) {
+ spin_lock_bh(&bat_priv->gw.list_lock);
+ cb->seq = bat_priv->gw.generation << 1 | 1;
+
+ hlist_for_each_entry(gw_node, &bat_priv->gw.gateway_list, list) {
if (idx++ < idx_skip)
continue;
- if (batadv_v_gw_dump_entry(msg, portid, cb->nlh->nlmsg_seq,
- bat_priv, gw_node)) {
+ if (batadv_v_gw_dump_entry(msg, portid, cb, bat_priv,
+ gw_node)) {
idx_skip = idx - 1;
goto unlock;
}
@@ -1032,7 +770,7 @@ static void batadv_v_gw_dump(struct sk_buff *msg, struct netlink_callback *cb,
idx_skip = idx;
unlock:
- rcu_read_unlock();
+ spin_unlock_bh(&bat_priv->gw.list_lock);
cb->args[0] = idx_skip;
}
@@ -1050,26 +788,16 @@ static struct batadv_algo_ops batadv_batman_v __read_mostly = {
.hardif_init = batadv_v_hardif_neigh_init,
.cmp = batadv_v_neigh_cmp,
.is_similar_or_better = batadv_v_neigh_is_sob,
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- .print = batadv_v_neigh_print,
-#endif
.dump = batadv_v_neigh_dump,
},
.orig = {
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- .print = batadv_v_orig_print,
-#endif
.dump = batadv_v_orig_dump,
},
.gw = {
.init_sel_class = batadv_v_init_sel_class,
- .store_sel_class = batadv_v_store_sel_class,
- .show_sel_class = batadv_v_show_sel_class,
+ .sel_class_max = U32_MAX,
.get_best_gw_node = batadv_v_gw_get_best_gw_node,
.is_eligible = batadv_v_gw_is_eligible,
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- .print = batadv_v_gw_print,
-#endif
.dump = batadv_v_gw_dump,
},
};
@@ -1086,6 +814,11 @@ void batadv_v_hardif_init(struct batadv_hard_iface *hard_iface)
*/
atomic_set(&hard_iface->bat_v.throughput_override, 0);
atomic_set(&hard_iface->bat_v.elp_interval, 500);
+
+ hard_iface->bat_v.aggr_len = 0;
+ skb_queue_head_init(&hard_iface->bat_v.aggr_list);
+ INIT_DELAYED_WORK(&hard_iface->bat_v.aggr_wq,
+ batadv_v_ogm_aggr_work);
}
/**
diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h
index ec4a2a569750..964431f4dc8d 100644
--- a/net/batman-adv/bat_v.h
+++ b/net/batman-adv/bat_v.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Linus Lüssing
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_BAT_V_H_
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
index 9f481cfdf77d..cb16c1ed2a58 100644
--- a/net/batman-adv/bat_v_elp.c
+++ b/net/batman-adv/bat_v_elp.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Linus Lüssing, Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "bat_v_elp.h"
@@ -22,14 +10,16 @@
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/gfp.h>
#include <linux/if_ether.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/minmax.h>
#include <linux/netdevice.h>
#include <linux/nl80211.h>
#include <linux/random.h>
@@ -37,6 +27,7 @@
#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>
@@ -44,7 +35,6 @@
#include <net/cfg80211.h>
#include <uapi/linux/batadv_packet.h>
-#include "bat_algo.h"
#include "bat_v_ogm.h"
#include "hard-interface.h"
#include "log.h"
@@ -53,6 +43,18 @@
#include "send.h"
/**
+ * struct batadv_v_metric_queue_entry - list of hardif neighbors which require
+ * a metric update
+ */
+struct batadv_v_metric_queue_entry {
+ /** @hardif_neigh: hardif neighbor scheduled for metric update */
+ struct batadv_hardif_neigh_node *hardif_neigh;
+
+ /** @list: list node for metric_queue */
+ struct list_head list;
+};
+
+/**
* batadv_v_elp_start_timer() - restart timer for ELP periodic work
* @hard_iface: the interface for which the timer has to be reset
*/
@@ -61,7 +63,7 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface)
unsigned int msecs;
msecs = atomic_read(&hard_iface->bat_v.elp_interval) - BATADV_JITTER;
- msecs += prandom_u32() % (2 * BATADV_JITTER);
+ msecs += get_random_u32_below(2 * BATADV_JITTER);
queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.elp_wq,
msecs_to_jiffies(msecs));
@@ -70,25 +72,36 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface)
/**
* batadv_v_elp_get_throughput() - get the throughput towards a neighbour
* @neigh: the neighbour for which the throughput has to be obtained
+ * @pthroughput: calculated throughput towards the given neighbour in multiples
+ * of 100kbps (a value of '1' equals 0.1Mbps, '10' equals 1Mbps, etc).
*
- * Return: The throughput towards the given neighbour in multiples of 100kpbs
- * (a value of '1' equals to 0.1Mbps, '10' equals 1Mbps, etc).
+ * Return: true when value behind @pthroughput was set
*/
-static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
+static bool batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh,
+ u32 *pthroughput)
{
struct batadv_hard_iface *hard_iface = neigh->if_incoming;
+ struct net_device *mesh_iface = hard_iface->mesh_iface;
struct ethtool_link_ksettings link_settings;
struct net_device *real_netdev;
struct station_info sinfo;
u32 throughput;
int ret;
+ /* don't query throughput when no longer associated with any
+ * batman-adv interface
+ */
+ if (!mesh_iface)
+ return false;
+
/* if the user specified a customised value for this interface, then
* return it directly
*/
throughput = atomic_read(&hard_iface->bat_v.throughput_override);
- if (throughput != 0)
- return throughput;
+ if (throughput != 0) {
+ *pthroughput = throughput;
+ return true;
+ }
/* if this is a wireless device, then ask its throughput through
* cfg80211 API
@@ -104,43 +117,53 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
ret = cfg80211_get_station(real_netdev, neigh->addr, &sinfo);
+ if (!ret) {
+ /* free the TID stats immediately */
+ cfg80211_sinfo_release_content(&sinfo);
+ }
+
dev_put(real_netdev);
if (ret == -ENOENT) {
/* Node is not associated anymore! It would be
* possible to delete this neighbor. For now set
* the throughput metric to 0.
*/
- return 0;
+ *pthroughput = 0;
+ return true;
}
if (ret)
goto default_throughput;
- if (!(sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)))
- goto default_throughput;
- return sinfo.expected_throughput / 100;
+ if (sinfo.filled & BIT(NL80211_STA_INFO_EXPECTED_THROUGHPUT)) {
+ *pthroughput = sinfo.expected_throughput / 100;
+ return true;
+ }
+
+ /* try to estimate the expected throughput based on reported tx
+ * rates
+ */
+ if (sinfo.filled & BIT(NL80211_STA_INFO_TX_BITRATE)) {
+ *pthroughput = cfg80211_calculate_bitrate(&sinfo.txrate) / 3;
+ return true;
+ }
+
+ goto default_throughput;
}
+ /* only use rtnl_trylock because the elp worker will be cancelled while
+ * the rtnl_lock is held. the cancel_delayed_work_sync() would otherwise
+ * wait forever when the elp work_item was started and it is then also
+ * trying to rtnl_lock
+ */
+ if (!rtnl_trylock())
+ return false;
+
/* if not a wifi interface, check if this device provides data via
* ethtool (e.g. an Ethernet adapter)
*/
- memset(&link_settings, 0, sizeof(link_settings));
- rtnl_lock();
ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings);
rtnl_unlock();
-
- /* Virtual interface drivers such as tun / tap interfaces, VLAN, etc
- * tend to initialize the interface throughput with some value for the
- * sake of having a throughput number to export via ethtool. This
- * exported throughput leaves batman-adv to conclude the interface
- * throughput is genuine (reflecting reality), thus no measurements
- * are necessary.
- *
- * Based on the observation that those interface types also tend to set
- * the link auto-negotiation to 'off', batman-adv shall check this
- * setting to differentiate between genuine link throughput information
- * and placeholders installed by virtual interfaces.
- */
- if (ret == 0 && link_settings.base.autoneg == AUTONEG_ENABLE) {
+ if (ret == 0) {
/* link characteristics might change over time */
if (link_settings.base.duplex == DUPLEX_FULL)
hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX;
@@ -148,13 +171,15 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX;
throughput = link_settings.base.speed;
- if (throughput && throughput != SPEED_UNKNOWN)
- return throughput * 10;
+ if (throughput && throughput != SPEED_UNKNOWN) {
+ *pthroughput = throughput * 10;
+ return true;
+ }
}
default_throughput:
if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) {
- batadv_info(hard_iface->soft_iface,
+ batadv_info(mesh_iface,
"WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps. Consider overriding the throughput manually or checking your driver.\n",
hard_iface->net_dev->name,
BATADV_THROUGHPUT_DEFAULT_VALUE / 10,
@@ -163,31 +188,26 @@ default_throughput:
}
/* if none of the above cases apply, return the base_throughput */
- return BATADV_THROUGHPUT_DEFAULT_VALUE;
+ *pthroughput = BATADV_THROUGHPUT_DEFAULT_VALUE;
+ return true;
}
/**
* batadv_v_elp_throughput_metric_update() - worker updating the throughput
* metric of a single hop neighbour
- * @work: the work queue item
+ * @neigh: the neighbour to probe
*/
-void batadv_v_elp_throughput_metric_update(struct work_struct *work)
+static void
+batadv_v_elp_throughput_metric_update(struct batadv_hardif_neigh_node *neigh)
{
- struct batadv_hardif_neigh_node_bat_v *neigh_bat_v;
- struct batadv_hardif_neigh_node *neigh;
-
- neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v,
- metric_work);
- neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node,
- bat_v);
+ u32 throughput;
+ bool valid;
- ewma_throughput_add(&neigh->bat_v.throughput,
- batadv_v_elp_get_throughput(neigh));
+ valid = batadv_v_elp_get_throughput(neigh, &throughput);
+ if (!valid)
+ return;
- /* decrement refcounter to balance increment performed before scheduling
- * this task
- */
- batadv_hardif_neigh_put(neigh);
+ ewma_throughput_add(&neigh->bat_v.throughput, throughput);
}
/**
@@ -196,8 +216,8 @@ void batadv_v_elp_throughput_metric_update(struct work_struct *work)
*
* Sends a predefined number of unicast wifi packets to a given neighbour in
* order to trigger the throughput estimation on this link by the RC algorithm.
- * Packets are sent only if there there is not enough payload unicast traffic
- * towards this neighbour..
+ * Packets are sent only if there is not enough payload unicast traffic towards
+ * this neighbour.
*
* Return: True on success and false in case of error during skb preparation.
*/
@@ -205,7 +225,7 @@ static bool
batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh)
{
struct batadv_hard_iface *hard_iface = neigh->if_incoming;
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
unsigned long last_tx_diff;
struct sk_buff *skb;
int probe_len, i;
@@ -257,22 +277,24 @@ batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh)
* batadv_v_elp_periodic_work() - ELP periodic task per interface
* @work: work queue item
*
- * Emits broadcast ELP message in regular intervals.
+ * Emits broadcast ELP messages in regular intervals.
*/
static void batadv_v_elp_periodic_work(struct work_struct *work)
{
+ struct batadv_v_metric_queue_entry *metric_entry;
+ struct batadv_v_metric_queue_entry *metric_safe;
struct batadv_hardif_neigh_node *hardif_neigh;
struct batadv_hard_iface *hard_iface;
struct batadv_hard_iface_bat_v *bat_v;
struct batadv_elp_packet *elp_packet;
+ struct list_head metric_queue;
struct batadv_priv *bat_priv;
struct sk_buff *skb;
u32 elp_interval;
- bool ret;
bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work);
hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v);
- bat_priv = netdev_priv(hard_iface->soft_iface);
+ bat_priv = netdev_priv(hard_iface->mesh_iface);
if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
goto out;
@@ -304,6 +326,8 @@ static void batadv_v_elp_periodic_work(struct work_struct *work)
atomic_inc(&hard_iface->bat_v.elp_seqno);
+ INIT_LIST_HEAD(&metric_queue);
+
/* The throughput metric is updated on each sent packet. This way, if a
* node is dead and no longer sends packets, batman-adv is still able to
* react timely to its death.
@@ -328,16 +352,28 @@ static void batadv_v_elp_periodic_work(struct work_struct *work)
/* Reading the estimated throughput from cfg80211 is a task that
* may sleep and that is not allowed in an rcu protected
- * context. Therefore schedule a task for that.
+ * context. Therefore add it to metric_queue and process it
+ * outside rcu protected context.
*/
- ret = queue_work(batadv_event_workqueue,
- &hardif_neigh->bat_v.metric_work);
-
- if (!ret)
+ metric_entry = kzalloc(sizeof(*metric_entry), GFP_ATOMIC);
+ if (!metric_entry) {
batadv_hardif_neigh_put(hardif_neigh);
+ continue;
+ }
+
+ metric_entry->hardif_neigh = hardif_neigh;
+ list_add(&metric_entry->list, &metric_queue);
}
rcu_read_unlock();
+ list_for_each_entry_safe(metric_entry, metric_safe, &metric_queue, list) {
+ batadv_v_elp_throughput_metric_update(metric_entry->hardif_neigh);
+
+ batadv_hardif_neigh_put(metric_entry->hardif_neigh);
+ list_del(&metric_entry->list);
+ kfree(metric_entry);
+ }
+
restart_timer:
batadv_v_elp_start_timer(hard_iface);
out:
@@ -352,19 +388,21 @@ out:
*/
int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface)
{
+ static const size_t tvlv_padding = sizeof(__be32);
struct batadv_elp_packet *elp_packet;
unsigned char *elp_buff;
u32 random_seqno;
size_t size;
int res = -ENOMEM;
- size = ETH_HLEN + NET_IP_ALIGN + BATADV_ELP_HLEN;
+ size = ETH_HLEN + NET_IP_ALIGN + BATADV_ELP_HLEN + tvlv_padding;
hard_iface->bat_v.elp_skb = dev_alloc_skb(size);
if (!hard_iface->bat_v.elp_skb)
goto out;
skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN);
- elp_buff = skb_put_zero(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN);
+ elp_buff = skb_put_zero(hard_iface->bat_v.elp_skb,
+ BATADV_ELP_HLEN + tvlv_padding);
elp_packet = (struct batadv_elp_packet *)elp_buff;
elp_packet->packet_type = BATADV_ELP;
@@ -433,21 +471,18 @@ void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface,
void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface)
{
struct batadv_hard_iface *hard_iface;
+ struct list_head *iter;
/* update orig field of every elp iface belonging to this mesh */
rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
- if (primary_iface->soft_iface != hard_iface->soft_iface)
- continue;
-
+ netdev_for_each_lower_private_rcu(primary_iface->mesh_iface, hard_iface, iter)
batadv_v_elp_iface_activate(primary_iface, hard_iface);
- }
rcu_read_unlock();
}
/**
* batadv_v_elp_neigh_update() - update an ELP neighbour node
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @neigh_addr: the neighbour interface address
* @if_incoming: the interface the packet was received through
* @elp_packet: the received ELP packet
@@ -495,14 +530,11 @@ static void batadv_v_elp_neigh_update(struct batadv_priv *bat_priv,
hardif_neigh->bat_v.elp_interval = ntohl(elp_packet->elp_interval);
hardif_free:
- if (hardif_neigh)
- batadv_hardif_neigh_put(hardif_neigh);
+ batadv_hardif_neigh_put(hardif_neigh);
neigh_free:
- if (neigh)
- batadv_neigh_node_put(neigh);
+ batadv_neigh_node_put(neigh);
orig_free:
- if (orig_neigh)
- batadv_orig_node_put(orig_neigh);
+ batadv_orig_node_put(orig_neigh);
}
/**
@@ -510,16 +542,16 @@ orig_free:
* @skb: the received packet
* @if_incoming: the interface this packet was received through
*
- * Return: NET_RX_SUCCESS and consumes the skb if the packet was peoperly
+ * Return: NET_RX_SUCCESS and consumes the skb if the packet was properly
* processed or NET_RX_DROP in case of failure.
*/
int batadv_v_elp_packet_recv(struct sk_buff *skb,
struct batadv_hard_iface *if_incoming)
{
- struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
struct batadv_elp_packet *elp_packet;
struct batadv_hard_iface *primary_if;
- struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb);
+ struct ethhdr *ethhdr;
bool res;
int ret = NET_RX_DROP;
@@ -527,6 +559,7 @@ int batadv_v_elp_packet_recv(struct sk_buff *skb,
if (!res)
goto free_skb;
+ ethhdr = eth_hdr(skb);
if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
goto free_skb;
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
index e8c7b7fd290d..c9cb0a307100 100644
--- a/net/batman-adv/bat_v_elp.h
+++ b/net/batman-adv/bat_v_elp.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Linus Lüssing, Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_BAT_V_ELP_H_
@@ -21,8 +9,7 @@
#include "main.h"
-struct sk_buff;
-struct work_struct;
+#include <linux/skbuff.h>
int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface);
void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface);
@@ -31,6 +18,5 @@ void batadv_v_elp_iface_activate(struct batadv_hard_iface *primary_iface,
void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface);
int batadv_v_elp_packet_recv(struct sk_buff *skb,
struct batadv_hard_iface *if_incoming);
-void batadv_v_elp_throughput_metric_update(struct work_struct *work);
#endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index 2948b41b06d4..e3870492dab7 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Antonio Quartulli
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "bat_v_ogm.h"
@@ -21,27 +9,29 @@
#include <linux/atomic.h>
#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
#include <linux/if_ether.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/minmax.h>
+#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/random.h>
-#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
+#include <linux/spinlock.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <uapi/linux/batadv_packet.h>
-#include "bat_algo.h"
#include "hard-interface.h"
#include "hash.h"
#include "log.h"
@@ -53,12 +43,12 @@
/**
* batadv_v_ogm_orig_get() - retrieve and possibly create an originator node
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @addr: the address of the originator
*
- * Return: the orig_node corresponding to the specified address. If such object
- * does not exist it is allocated here. In case of allocation failure returns
- * NULL.
+ * Return: the orig_node corresponding to the specified address. If such an
+ * object does not exist, it is allocated here. In case of allocation failure
+ * returns NULL.
*/
struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv,
const u8 *addr)
@@ -89,8 +79,22 @@ struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv,
}
/**
+ * batadv_v_ogm_start_queue_timer() - restart the OGM aggregation timer
+ * @hard_iface: the interface to use to send the OGM
+ */
+static void batadv_v_ogm_start_queue_timer(struct batadv_hard_iface *hard_iface)
+{
+ unsigned int msecs = BATADV_MAX_AGGREGATION_MS * 1000;
+
+ /* msecs * [0.9, 1.1] */
+ msecs += get_random_u32_below(msecs / 5) - (msecs / 10);
+ queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.aggr_wq,
+ msecs_to_jiffies(msecs / 1000));
+}
+
+/**
* batadv_v_ogm_start_timer() - restart the OGM sending timer
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv)
{
@@ -102,7 +106,7 @@ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv)
return;
msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER;
- msecs += prandom_u32() % (2 * BATADV_JITTER);
+ msecs += get_random_u32_below(2 * BATADV_JITTER);
queue_delayed_work(batadv_event_workqueue, &bat_priv->bat_v.ogm_wq,
msecs_to_jiffies(msecs));
}
@@ -115,10 +119,12 @@ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv)
static void batadv_v_ogm_send_to_if(struct sk_buff *skb,
struct batadv_hard_iface *hard_iface)
{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
- if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ if (hard_iface->if_status != BATADV_IF_ACTIVE) {
+ kfree_skb(skb);
return;
+ }
batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX);
batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES,
@@ -128,23 +134,141 @@ static void batadv_v_ogm_send_to_if(struct sk_buff *skb,
}
/**
- * batadv_v_ogm_send() - periodic worker broadcasting the own OGM
- * @work: work queue item
+ * batadv_v_ogm_len() - OGMv2 packet length
+ * @skb: the OGM to check
+ *
+ * Return: Length of the given OGMv2 packet, including tvlv length, excluding
+ * ethernet header length.
*/
-static void batadv_v_ogm_send(struct work_struct *work)
+static unsigned int batadv_v_ogm_len(struct sk_buff *skb)
+{
+ struct batadv_ogm2_packet *ogm_packet;
+
+ ogm_packet = (struct batadv_ogm2_packet *)skb->data;
+ return BATADV_OGM2_HLEN + ntohs(ogm_packet->tvlv_len);
+}
+
+/**
+ * batadv_v_ogm_queue_left() - check if given OGM still fits aggregation queue
+ * @skb: the OGM to check
+ * @hard_iface: the interface to use to send the OGM
+ *
+ * Caller needs to hold the hard_iface->bat_v.aggr_list.lock.
+ *
+ * Return: True, if the given OGMv2 packet still fits, false otherwise.
+ */
+static bool batadv_v_ogm_queue_left(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface)
+{
+ unsigned int max = min_t(unsigned int, hard_iface->net_dev->mtu,
+ BATADV_MAX_AGGREGATION_BYTES);
+ unsigned int ogm_len = batadv_v_ogm_len(skb);
+
+ lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock);
+
+ return hard_iface->bat_v.aggr_len + ogm_len <= max;
+}
+
+/**
+ * batadv_v_ogm_aggr_list_free() - free all elements in an aggregation queue
+ * @hard_iface: the interface holding the aggregation queue
+ *
+ * Empties the OGMv2 aggregation queue and frees all the skbs it contains.
+ *
+ * Caller needs to hold the hard_iface->bat_v.aggr_list.lock.
+ */
+static void batadv_v_ogm_aggr_list_free(struct batadv_hard_iface *hard_iface)
+{
+ lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock);
+
+ __skb_queue_purge(&hard_iface->bat_v.aggr_list);
+ hard_iface->bat_v.aggr_len = 0;
+}
+
+/**
+ * batadv_v_ogm_aggr_send() - flush & send aggregation queue
+ * @hard_iface: the interface with the aggregation queue to flush
+ *
+ * Aggregates all OGMv2 packets currently in the aggregation queue into a
+ * single OGMv2 packet and transmits this aggregate.
+ *
+ * The aggregation queue is empty after this call.
+ *
+ * Caller needs to hold the hard_iface->bat_v.aggr_list.lock.
+ */
+static void batadv_v_ogm_aggr_send(struct batadv_hard_iface *hard_iface)
+{
+ unsigned int aggr_len = hard_iface->bat_v.aggr_len;
+ struct sk_buff *skb_aggr;
+ unsigned int ogm_len;
+ struct sk_buff *skb;
+
+ lockdep_assert_held(&hard_iface->bat_v.aggr_list.lock);
+
+ if (!aggr_len)
+ return;
+
+ skb_aggr = dev_alloc_skb(aggr_len + ETH_HLEN + NET_IP_ALIGN);
+ if (!skb_aggr) {
+ batadv_v_ogm_aggr_list_free(hard_iface);
+ return;
+ }
+
+ skb_reserve(skb_aggr, ETH_HLEN + NET_IP_ALIGN);
+ skb_reset_network_header(skb_aggr);
+
+ while ((skb = __skb_dequeue(&hard_iface->bat_v.aggr_list))) {
+ hard_iface->bat_v.aggr_len -= batadv_v_ogm_len(skb);
+
+ ogm_len = batadv_v_ogm_len(skb);
+ skb_put_data(skb_aggr, skb->data, ogm_len);
+
+ consume_skb(skb);
+ }
+
+ batadv_v_ogm_send_to_if(skb_aggr, hard_iface);
+}
+
+/**
+ * batadv_v_ogm_queue_on_if() - queue a batman ogm on a given interface
+ * @skb: the OGM to queue
+ * @hard_iface: the interface to queue the OGM on
+ */
+static void batadv_v_ogm_queue_on_if(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
+
+ if (!atomic_read(&bat_priv->aggregated_ogms)) {
+ batadv_v_ogm_send_to_if(skb, hard_iface);
+ return;
+ }
+
+ spin_lock_bh(&hard_iface->bat_v.aggr_list.lock);
+ if (!batadv_v_ogm_queue_left(skb, hard_iface))
+ batadv_v_ogm_aggr_send(hard_iface);
+
+ hard_iface->bat_v.aggr_len += batadv_v_ogm_len(skb);
+ __skb_queue_tail(&hard_iface->bat_v.aggr_list, skb);
+ spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock);
+}
+
+/**
+ * batadv_v_ogm_send_meshif() - periodic worker broadcasting the own OGM
+ * @bat_priv: the bat priv with all the mesh interface information
+ */
+static void batadv_v_ogm_send_meshif(struct batadv_priv *bat_priv)
{
struct batadv_hard_iface *hard_iface;
- struct batadv_priv_bat_v *bat_v;
- struct batadv_priv *bat_priv;
struct batadv_ogm2_packet *ogm_packet;
struct sk_buff *skb, *skb_tmp;
unsigned char *ogm_buff;
+ struct list_head *iter;
int ogm_buff_len;
u16 tvlv_len = 0;
int ret;
- bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work);
- bat_priv = container_of(bat_v, struct batadv_priv, bat_v);
+ lockdep_assert_held(&bat_priv->bat_v.ogm_buff_mutex);
if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
goto out;
@@ -176,10 +300,7 @@ static void batadv_v_ogm_send(struct work_struct *work)
/* broadcast on every interface */
rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
- if (hard_iface->soft_iface != bat_priv->soft_iface)
- continue;
-
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
if (!kref_get_unless_zero(&hard_iface->refcount))
continue;
@@ -222,7 +343,7 @@ static void batadv_v_ogm_send(struct work_struct *work)
break;
}
- batadv_v_ogm_send_to_if(skb_tmp, hard_iface);
+ batadv_v_ogm_queue_on_if(skb_tmp, hard_iface);
batadv_hardif_put(hard_iface);
}
rcu_read_unlock();
@@ -236,56 +357,114 @@ out:
}
/**
+ * batadv_v_ogm_send() - periodic worker broadcasting the own OGM
+ * @work: work queue item
+ */
+static void batadv_v_ogm_send(struct work_struct *work)
+{
+ struct batadv_priv_bat_v *bat_v;
+ struct batadv_priv *bat_priv;
+
+ bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work);
+ bat_priv = container_of(bat_v, struct batadv_priv, bat_v);
+
+ mutex_lock(&bat_priv->bat_v.ogm_buff_mutex);
+ batadv_v_ogm_send_meshif(bat_priv);
+ mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex);
+}
+
+/**
+ * batadv_v_ogm_aggr_work() - OGM queue periodic task per interface
+ * @work: work queue item
+ *
+ * Emits aggregated OGM messages in regular intervals.
+ */
+void batadv_v_ogm_aggr_work(struct work_struct *work)
+{
+ struct batadv_hard_iface_bat_v *batv;
+ struct batadv_hard_iface *hard_iface;
+
+ batv = container_of(work, struct batadv_hard_iface_bat_v, aggr_wq.work);
+ hard_iface = container_of(batv, struct batadv_hard_iface, bat_v);
+
+ spin_lock_bh(&hard_iface->bat_v.aggr_list.lock);
+ batadv_v_ogm_aggr_send(hard_iface);
+ spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock);
+
+ batadv_v_ogm_start_queue_timer(hard_iface);
+}
+
+/**
* batadv_v_ogm_iface_enable() - prepare an interface for B.A.T.M.A.N. V
* @hard_iface: the interface to prepare
*
- * Takes care of scheduling own OGM sending routine for this interface.
+ * Takes care of scheduling its own OGM sending routine for this interface.
*
* Return: 0 on success or a negative error code otherwise
*/
int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface)
{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
+ batadv_v_ogm_start_queue_timer(hard_iface);
batadv_v_ogm_start_timer(bat_priv);
return 0;
}
/**
+ * batadv_v_ogm_iface_disable() - release OGM interface private resources
+ * @hard_iface: interface for which the resources have to be released
+ */
+void batadv_v_ogm_iface_disable(struct batadv_hard_iface *hard_iface)
+{
+ cancel_delayed_work_sync(&hard_iface->bat_v.aggr_wq);
+
+ spin_lock_bh(&hard_iface->bat_v.aggr_list.lock);
+ batadv_v_ogm_aggr_list_free(hard_iface);
+ spin_unlock_bh(&hard_iface->bat_v.aggr_list.lock);
+}
+
+/**
* batadv_v_ogm_primary_iface_set() - set a new primary interface
* @primary_iface: the new primary interface
*/
void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface)
{
- struct batadv_priv *bat_priv = netdev_priv(primary_iface->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(primary_iface->mesh_iface);
struct batadv_ogm2_packet *ogm_packet;
+ mutex_lock(&bat_priv->bat_v.ogm_buff_mutex);
if (!bat_priv->bat_v.ogm_buff)
- return;
+ goto unlock;
ogm_packet = (struct batadv_ogm2_packet *)bat_priv->bat_v.ogm_buff;
ether_addr_copy(ogm_packet->orig, primary_iface->net_dev->dev_addr);
+
+unlock:
+ mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex);
}
/**
* batadv_v_forward_penalty() - apply a penalty to the throughput metric
* forwarded with B.A.T.M.A.N. V OGMs
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @if_incoming: the interface where the OGM has been received
* @if_outgoing: the interface where the OGM has to be forwarded to
* @throughput: the current throughput
*
* Apply a penalty on the current throughput metric value based on the
- * characteristic of the interface where the OGM has been received. The return
- * value is computed as follows:
+ * characteristic of the interface where the OGM has been received.
+ *
+ * Initially the per hardif hop penalty is applied to the throughput. After
+ * that, the return value is computed as follows:
* - throughput * 50% if the incoming and outgoing interface are the
* same WiFi interface and the throughput is above
* 1MBit/s
* - throughput if the outgoing interface is the default
* interface (i.e. this OGM is processed for the
* internal table and not forwarded)
- * - throughput * hop penalty otherwise
+ * - throughput * node hop penalty otherwise
*
* Return: the penalised throughput metric.
*/
@@ -294,9 +473,14 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv,
struct batadv_hard_iface *if_outgoing,
u32 throughput)
{
+ int if_hop_penalty = atomic_read(&if_incoming->hop_penalty);
int hop_penalty = atomic_read(&bat_priv->hop_penalty);
int hop_penalty_max = BATADV_TQ_MAX_VALUE;
+ /* Apply per hardif hop penalty */
+ throughput = throughput * (hop_penalty_max - if_hop_penalty) /
+ hop_penalty_max;
+
/* Don't apply hop penalty in default originator table. */
if (if_outgoing == BATADV_IF_DEFAULT)
return throughput;
@@ -317,7 +501,7 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv,
/**
* batadv_v_ogm_forward() - check conditions and forward an OGM to the given
* outgoing interface
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @ogm_received: previously received OGM to be forwarded
* @orig_node: the originator which has been updated
* @neigh_node: the neigh_node through with the OGM has been received
@@ -394,20 +578,17 @@ static void batadv_v_ogm_forward(struct batadv_priv *bat_priv,
if_outgoing->net_dev->name, ntohl(ogm_forward->throughput),
ogm_forward->ttl, if_incoming->net_dev->name);
- batadv_v_ogm_send_to_if(skb, if_outgoing);
+ batadv_v_ogm_queue_on_if(skb, if_outgoing);
out:
- if (orig_ifinfo)
- batadv_orig_ifinfo_put(orig_ifinfo);
- if (router)
- batadv_neigh_node_put(router);
- if (neigh_ifinfo)
- batadv_neigh_ifinfo_put(neigh_ifinfo);
+ batadv_orig_ifinfo_put(orig_ifinfo);
+ batadv_neigh_node_put(router);
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
}
/**
* batadv_v_ogm_metric_update() - update route metric based on OGM
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @ogm2: OGM2 structure
* @orig_node: Originator structure for which the OGM has been received
* @neigh_node: the neigh_node through with the OGM has been received
@@ -482,17 +663,15 @@ static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv,
else
ret = 0;
out:
- if (orig_ifinfo)
- batadv_orig_ifinfo_put(orig_ifinfo);
- if (neigh_ifinfo)
- batadv_neigh_ifinfo_put(neigh_ifinfo);
+ batadv_orig_ifinfo_put(orig_ifinfo);
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
return ret;
}
/**
* batadv_v_ogm_route_update() - update routes based on OGM
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @ethhdr: the Ethernet header of the OGM2
* @ogm2: OGM2 structure
* @orig_node: Originator structure for which the OGM has been received
@@ -576,23 +755,18 @@ static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv,
batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node);
out:
- if (router)
- batadv_neigh_node_put(router);
- if (orig_neigh_router)
- batadv_neigh_node_put(orig_neigh_router);
- if (orig_neigh_node)
- batadv_orig_node_put(orig_neigh_node);
- if (router_ifinfo)
- batadv_neigh_ifinfo_put(router_ifinfo);
- if (neigh_ifinfo)
- batadv_neigh_ifinfo_put(neigh_ifinfo);
+ batadv_neigh_node_put(router);
+ batadv_neigh_node_put(orig_neigh_router);
+ batadv_orig_node_put(orig_neigh_node);
+ batadv_neigh_ifinfo_put(router_ifinfo);
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
return forward;
}
/**
* batadv_v_ogm_process_per_outif() - process a batman v OGM for an outgoing if
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @ethhdr: the Ethernet header of the OGM2
* @ogm2: OGM2 structure
* @orig_node: Originator structure for which the OGM has been received
@@ -623,8 +797,8 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv,
/* only unknown & newer OGMs contain TVLVs we are interested in */
if (seqno_age > 0 && if_outgoing == BATADV_IF_DEFAULT)
- batadv_tvlv_containers_process(bat_priv, true, orig_node,
- NULL, NULL,
+ batadv_tvlv_containers_process(bat_priv, BATADV_OGM2, orig_node,
+ NULL,
(unsigned char *)(ogm2 + 1),
ntohs(ogm2->tvlv_len));
@@ -643,32 +817,37 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv,
* batadv_v_ogm_aggr_packet() - checks if there is another OGM aggregated
* @buff_pos: current position in the skb
* @packet_len: total length of the skb
- * @tvlv_len: tvlv length of the previously considered OGM
+ * @ogm2_packet: potential OGM2 in buffer
*
* Return: true if there is enough space for another OGM, false otherwise.
*/
-static bool batadv_v_ogm_aggr_packet(int buff_pos, int packet_len,
- __be16 tvlv_len)
+static bool
+batadv_v_ogm_aggr_packet(int buff_pos, int packet_len,
+ const struct batadv_ogm2_packet *ogm2_packet)
{
int next_buff_pos = 0;
- next_buff_pos += buff_pos + BATADV_OGM2_HLEN;
- next_buff_pos += ntohs(tvlv_len);
+ /* check if there is enough space for the header */
+ next_buff_pos += buff_pos + sizeof(*ogm2_packet);
+ if (next_buff_pos > packet_len)
+ return false;
- return (next_buff_pos <= packet_len) &&
- (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES);
+ /* check if there is enough space for the optional TVLV */
+ next_buff_pos += ntohs(ogm2_packet->tvlv_len);
+
+ return next_buff_pos <= packet_len;
}
/**
* batadv_v_ogm_process() - process an incoming batman v OGM
* @skb: the skb containing the OGM
* @ogm_offset: offset to the OGM which should be processed (for aggregates)
- * @if_incoming: the interface where this packet was receved
+ * @if_incoming: the interface where this packet was received
*/
static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
struct batadv_hard_iface *if_incoming)
{
- struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
struct ethhdr *ethhdr;
struct batadv_orig_node *orig_node = NULL;
struct batadv_hardif_neigh_node *hardif_neigh = NULL;
@@ -676,6 +855,7 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
struct batadv_hard_iface *hard_iface;
struct batadv_ogm2_packet *ogm_packet;
u32 ogm_throughput, link_throughput, path_throughput;
+ struct list_head *iter;
int ret;
ethhdr = eth_hdr(skb);
@@ -690,6 +870,12 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
ntohl(ogm_packet->seqno), ogm_throughput, ogm_packet->ttl,
ogm_packet->version, ntohs(ogm_packet->tvlv_len));
+ if (batadv_is_my_mac(bat_priv, ogm_packet->orig)) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: originator packet from ourself\n");
+ return;
+ }
+
/* If the throughput metric is 0, immediately drop the packet. No need
* to create orig_node / neigh_node for an unusable route.
*/
@@ -709,7 +895,7 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
orig_node = batadv_v_ogm_orig_get(bat_priv, ogm_packet->orig);
if (!orig_node)
- return;
+ goto out;
neigh_node = batadv_neigh_node_get_or_create(orig_node, if_incoming,
ethhdr->h_source);
@@ -732,13 +918,10 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
BATADV_IF_DEFAULT);
rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
if (hard_iface->if_status != BATADV_IF_ACTIVE)
continue;
- if (hard_iface->soft_iface != bat_priv->soft_iface)
- continue;
-
if (!kref_get_unless_zero(&hard_iface->refcount))
continue;
@@ -779,12 +962,9 @@ static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
}
rcu_read_unlock();
out:
- if (orig_node)
- batadv_orig_node_put(orig_node);
- if (neigh_node)
- batadv_neigh_node_put(neigh_node);
- if (hardif_neigh)
- batadv_hardif_neigh_put(hardif_neigh);
+ batadv_orig_node_put(orig_node);
+ batadv_neigh_node_put(neigh_node);
+ batadv_hardif_neigh_put(hardif_neigh);
}
/**
@@ -798,9 +978,9 @@ out:
int batadv_v_ogm_packet_recv(struct sk_buff *skb,
struct batadv_hard_iface *if_incoming)
{
- struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->mesh_iface);
struct batadv_ogm2_packet *ogm_packet;
- struct ethhdr *ethhdr = eth_hdr(skb);
+ struct ethhdr *ethhdr;
int ogm_offset;
u8 *packet_pos;
int ret = NET_RX_DROP;
@@ -814,14 +994,10 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN))
goto free_skb;
+ ethhdr = eth_hdr(skb);
if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
goto free_skb;
- ogm_packet = (struct batadv_ogm2_packet *)skb->data;
-
- if (batadv_is_my_mac(bat_priv, ogm_packet->orig))
- goto free_skb;
-
batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
skb->len + ETH_HLEN);
@@ -830,7 +1006,7 @@ int batadv_v_ogm_packet_recv(struct sk_buff *skb,
ogm_packet = (struct batadv_ogm2_packet *)skb->data;
while (batadv_v_ogm_aggr_packet(ogm_offset, skb_headlen(skb),
- ogm_packet->tvlv_len)) {
+ ogm_packet)) {
batadv_v_ogm_process(skb, ogm_offset, if_incoming);
ogm_offset += BATADV_OGM2_HLEN;
@@ -853,7 +1029,7 @@ free_skb:
/**
* batadv_v_ogm_init() - initialise the OGM2 engine
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
* Return: 0 on success or a negative error code in case of failure
*/
@@ -881,18 +1057,24 @@ int batadv_v_ogm_init(struct batadv_priv *bat_priv)
atomic_set(&bat_priv->bat_v.ogm_seqno, random_seqno);
INIT_DELAYED_WORK(&bat_priv->bat_v.ogm_wq, batadv_v_ogm_send);
+ mutex_init(&bat_priv->bat_v.ogm_buff_mutex);
+
return 0;
}
/**
* batadv_v_ogm_free() - free OGM private resources
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
void batadv_v_ogm_free(struct batadv_priv *bat_priv)
{
cancel_delayed_work_sync(&bat_priv->bat_v.ogm_wq);
+ mutex_lock(&bat_priv->bat_v.ogm_buff_mutex);
+
kfree(bat_priv->bat_v.ogm_buff);
bat_priv->bat_v.ogm_buff = NULL;
bat_priv->bat_v.ogm_buff_len = 0;
+
+ mutex_unlock(&bat_priv->bat_v.ogm_buff_mutex);
}
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index e5be14c908c6..edeffedecade 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Antonio Quartulli
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_BAT_V_OGM_H_
@@ -21,13 +9,15 @@
#include "main.h"
+#include <linux/skbuff.h>
#include <linux/types.h>
-
-struct sk_buff;
+#include <linux/workqueue.h>
int batadv_v_ogm_init(struct batadv_priv *bat_priv);
void batadv_v_ogm_free(struct batadv_priv *bat_priv);
+void batadv_v_ogm_aggr_work(struct work_struct *work);
int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface);
+void batadv_v_ogm_iface_disable(struct batadv_hard_iface *hard_iface);
struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv,
const u8 *addr);
void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface);
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index a296a4d851f5..2c49b2711650 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "bitarray.h"
@@ -35,7 +23,7 @@ static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n)
/**
* batadv_bit_get_packet() - receive and process one packet within the sequence
* number window
- * @priv: the bat priv with all the soft interface information
+ * @priv: the bat priv with all the mesh interface information
* @seq_bits: pointer to the sequence number receive packet
* @seq_num_diff: difference between the current/received sequence number and
* the last sequence number
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index 48f683289531..37f7ae413bc6 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_BITARRAY_H_
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 5f1aeeded0e3..3dc791c15bf7 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "bridge_loop_avoidance.h"
@@ -22,7 +10,9 @@
#include <linux/atomic.h>
#include <linux/byteorder/generic.h>
#include <linux/compiler.h>
+#include <linux/container_of.h>
#include <linux/crc16.h>
+#include <linux/err.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
@@ -31,7 +21,6 @@
#include <linux/if_vlan.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
@@ -39,17 +28,17 @@
#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/sprintf.h>
#include <linux/stddef.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/workqueue.h>
#include <net/arp.h>
#include <net/genetlink.h>
#include <net/netlink.h>
-#include <net/sock.h>
#include <uapi/linux/batadv_packet.h>
#include <uapi/linux/batman_adv.h>
@@ -58,8 +47,6 @@
#include "log.h"
#include "netlink.h"
#include "originator.h"
-#include "soft-interface.h"
-#include "sysfs.h"
#include "translation-table.h"
static const u8 batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05};
@@ -78,7 +65,7 @@ batadv_bla_send_announce(struct batadv_priv *bat_priv,
*/
static inline u32 batadv_choose_claim(const void *data, u32 size)
{
- struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data;
+ const struct batadv_bla_claim *claim = data;
u32 hash = 0;
hash = jhash(&claim->addr, sizeof(claim->addr), hash);
@@ -96,11 +83,12 @@ static inline u32 batadv_choose_claim(const void *data, u32 size)
*/
static inline u32 batadv_choose_backbone_gw(const void *data, u32 size)
{
- const struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data;
+ const struct batadv_bla_backbone_gw *gw;
u32 hash = 0;
- hash = jhash(&claim->addr, sizeof(claim->addr), hash);
- hash = jhash(&claim->vid, sizeof(claim->vid), hash);
+ gw = data;
+ hash = jhash(&gw->orig, sizeof(gw->orig), hash);
+ hash = jhash(&gw->vid, sizeof(gw->vid), hash);
return hash % size;
}
@@ -175,6 +163,9 @@ static void batadv_backbone_gw_release(struct kref *ref)
*/
static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw)
{
+ if (!backbone_gw)
+ return;
+
kref_put(&backbone_gw->refcount, batadv_backbone_gw_release);
}
@@ -210,12 +201,15 @@ static void batadv_claim_release(struct kref *ref)
*/
static void batadv_claim_put(struct batadv_bla_claim *claim)
{
+ if (!claim)
+ return;
+
kref_put(&claim->refcount, batadv_claim_release);
}
/**
* batadv_claim_hash_find() - looks for a claim in the claim hash
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @data: search data (may be local/static data)
*
* Return: claim if found or NULL otherwise.
@@ -254,14 +248,14 @@ batadv_claim_hash_find(struct batadv_priv *bat_priv,
/**
* batadv_backbone_hash_find() - looks for a backbone gateway in the hash
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @addr: the address of the originator
* @vid: the VLAN ID
*
* Return: backbone gateway if found or NULL otherwise
*/
static struct batadv_bla_backbone_gw *
-batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr,
+batadv_backbone_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
unsigned short vid)
{
struct batadv_hashtable *hash = bat_priv->bla.backbone_hash;
@@ -338,18 +332,18 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw)
/**
* batadv_bla_send_claim() - sends a claim frame according to the provided info
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @mac: the mac address to be announced within the claim
* @vid: the VLAN ID
* @claimtype: the type of the claim (CLAIM, UNCLAIM, ANNOUNCE, ...)
*/
-static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac,
+static void batadv_bla_send_claim(struct batadv_priv *bat_priv, const u8 *mac,
unsigned short vid, int claimtype)
{
struct sk_buff *skb;
struct ethhdr *ethhdr;
struct batadv_hard_iface *primary_if;
- struct net_device *soft_iface;
+ struct net_device *mesh_iface;
u8 *hw_src;
struct batadv_bla_claim_dst local_claim_dest;
__be32 zeroip = 0;
@@ -362,12 +356,12 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac,
sizeof(local_claim_dest));
local_claim_dest.type = claimtype;
- soft_iface = primary_if->soft_iface;
+ mesh_iface = primary_if->mesh_iface;
skb = arp_create(ARPOP_REPLY, ETH_P_ARP,
/* IP DST: 0.0.0.0 */
zeroip,
- primary_if->soft_iface,
+ primary_if->mesh_iface,
/* IP SRC: 0.0.0.0 */
zeroip,
/* Ethernet DST: Broadcast */
@@ -408,7 +402,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac,
break;
case BATADV_CLAIM_TYPE_ANNOUNCE:
/* announcement frame
- * set HW SRC to the special mac containg the crc
+ * set HW SRC to the special mac containing the crc
*/
ether_addr_copy(hw_src, mac);
batadv_dbg(BATADV_DBG_BLA, bat_priv,
@@ -445,15 +439,14 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac,
}
skb_reset_mac_header(skb);
- skb->protocol = eth_type_trans(skb, soft_iface);
+ skb->protocol = eth_type_trans(skb, mesh_iface);
batadv_inc_counter(bat_priv, BATADV_CNT_RX);
batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
skb->len + ETH_HLEN);
netif_rx(skb);
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
}
/**
@@ -473,7 +466,7 @@ static void batadv_bla_loopdetect_report(struct work_struct *work)
report_work);
bat_priv = backbone_gw->bat_priv;
- batadv_info(bat_priv->soft_iface,
+ batadv_info(bat_priv->mesh_iface,
"Possible loop on VLAN %d detected which can't be handled by BLA - please check your network setup!\n",
batadv_print_vid(backbone_gw->vid));
snprintf(vid_str, sizeof(vid_str), "%d",
@@ -488,7 +481,7 @@ static void batadv_bla_loopdetect_report(struct work_struct *work)
/**
* batadv_bla_get_backbone_gw() - finds or creates a backbone gateway
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig: the mac address of the originator
* @vid: the VLAN ID
* @own_backbone: set if the requested backbone is local
@@ -496,7 +489,7 @@ static void batadv_bla_loopdetect_report(struct work_struct *work)
* Return: the (possibly created) backbone gateway or NULL on error
*/
static struct batadv_bla_backbone_gw *
-batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig,
+batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, const u8 *orig,
unsigned short vid, bool own_backbone)
{
struct batadv_bla_backbone_gw *entry;
@@ -561,7 +554,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig,
/**
* batadv_bla_update_own_backbone_gw() - updates the own backbone gw for a VLAN
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @primary_if: the selected primary interface
* @vid: VLAN identifier
*
@@ -587,7 +580,7 @@ batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv,
/**
* batadv_bla_answer_request() - answer a bla request by sending own claims
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @primary_if: interface where the request came on
* @vid: the vid where the request came on
*
@@ -664,7 +657,7 @@ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw)
/**
* batadv_bla_send_announce() - Send an announcement frame
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @backbone_gw: our backbone gateway which should be announced
*/
static void batadv_bla_send_announce(struct batadv_priv *bat_priv,
@@ -685,7 +678,7 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv,
/**
* batadv_bla_add_claim() - Adds a claim in the claim hash
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @mac: the mac address of the claim
* @vid: the VLAN ID of the frame
* @backbone_gw: the backbone gateway which claims it
@@ -795,7 +788,7 @@ batadv_bla_claim_get_backbone_gw(struct batadv_bla_claim *claim)
/**
* batadv_bla_del_claim() - delete a claim from the claim hash
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @mac: mac address of the claim to be removed
* @vid: VLAN id for the claim to be removed
*/
@@ -803,6 +796,8 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv,
const u8 *mac, const unsigned short vid)
{
struct batadv_bla_claim search_claim, *claim;
+ struct batadv_bla_claim *claim_removed_entry;
+ struct hlist_node *claim_removed_node;
ether_addr_copy(search_claim.addr, mac);
search_claim.vid = vid;
@@ -813,17 +808,25 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv,
batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): %pM, vid %d\n", __func__,
mac, batadv_print_vid(vid));
- batadv_hash_remove(bat_priv->bla.claim_hash, batadv_compare_claim,
- batadv_choose_claim, claim);
- batadv_claim_put(claim); /* reference from the hash is gone */
+ claim_removed_node = batadv_hash_remove(bat_priv->bla.claim_hash,
+ batadv_compare_claim,
+ batadv_choose_claim, claim);
+ if (!claim_removed_node)
+ goto free_claim;
+
+ /* reference from the hash is gone */
+ claim_removed_entry = hlist_entry(claim_removed_node,
+ struct batadv_bla_claim, hash_entry);
+ batadv_claim_put(claim_removed_entry);
+free_claim:
/* don't need the reference from hash_find() anymore */
batadv_claim_put(claim);
}
/**
* batadv_handle_announce() - check for ANNOUNCE frame
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @an_addr: announcement mac address (ARP Sender HW address)
* @backbone_addr: originator address of the sender (Ethernet source MAC)
* @vid: the VLAN ID of the frame
@@ -847,7 +850,7 @@ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
/* handle as ANNOUNCE frame */
backbone_gw->lasttime = jiffies;
- crc = ntohs(*((__be16 *)(&an_addr[4])));
+ crc = ntohs(*((__force __be16 *)(&an_addr[4])));
batadv_dbg(BATADV_DBG_BLA, bat_priv,
"%s(): ANNOUNCE vid %d (sent by %pM)... CRC = %#.4x\n",
@@ -881,8 +884,8 @@ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
/**
* batadv_handle_request() - check for REQUEST frame
- * @bat_priv: the bat priv with all the soft interface information
- * @primary_if: the primary hard interface of this batman soft interface
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @primary_if: the primary hard interface of this batman mesh interface
* @backbone_addr: backbone address to be requested (ARP sender HW MAC)
* @ethhdr: ethernet header of a packet
* @vid: the VLAN ID of the frame
@@ -914,8 +917,8 @@ static bool batadv_handle_request(struct batadv_priv *bat_priv,
/**
* batadv_handle_unclaim() - check for UNCLAIM frame
- * @bat_priv: the bat priv with all the soft interface information
- * @primary_if: the primary hard interface of this batman soft interface
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @primary_if: the primary hard interface of this batman mesh interface
* @backbone_addr: originator address of the backbone (Ethernet source)
* @claim_addr: Client to be unclaimed (ARP sender HW MAC)
* @vid: the VLAN ID of the frame
@@ -924,7 +927,7 @@ static bool batadv_handle_request(struct batadv_priv *bat_priv,
*/
static bool batadv_handle_unclaim(struct batadv_priv *bat_priv,
struct batadv_hard_iface *primary_if,
- u8 *backbone_addr, u8 *claim_addr,
+ const u8 *backbone_addr, const u8 *claim_addr,
unsigned short vid)
{
struct batadv_bla_backbone_gw *backbone_gw;
@@ -952,8 +955,8 @@ static bool batadv_handle_unclaim(struct batadv_priv *bat_priv,
/**
* batadv_handle_claim() - check for CLAIM frame
- * @bat_priv: the bat priv with all the soft interface information
- * @primary_if: the primary hard interface of this batman soft interface
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @primary_if: the primary hard interface of this batman mesh interface
* @backbone_addr: originator address of the backbone (Ethernet Source)
* @claim_addr: client mac address to be claimed (ARP sender HW MAC)
* @vid: the VLAN ID of the frame
@@ -962,7 +965,7 @@ static bool batadv_handle_unclaim(struct batadv_priv *bat_priv,
*/
static bool batadv_handle_claim(struct batadv_priv *bat_priv,
struct batadv_hard_iface *primary_if,
- u8 *backbone_addr, u8 *claim_addr,
+ const u8 *backbone_addr, const u8 *claim_addr,
unsigned short vid)
{
struct batadv_bla_backbone_gw *backbone_gw;
@@ -989,13 +992,13 @@ static bool batadv_handle_claim(struct batadv_priv *bat_priv,
/**
* batadv_check_claim_group() - check for claim group membership
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @primary_if: the primary interface of this batman interface
* @hw_src: the Hardware source in the ARP Header
* @hw_dst: the Hardware destination in the ARP Header
* @ethhdr: pointer to the Ethernet header of the claim frame
*
- * checks if it is a claim packet and if its on the same group.
+ * checks if it is a claim packet and if it's on the same group.
* This function also applies the group ID of the sender
* if it is in the same mesh.
*
@@ -1043,7 +1046,7 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv,
/* lets see if this originator is in our mesh */
orig_node = batadv_orig_hash_find(bat_priv, backbone_addr);
- /* dont accept claims from gateways which are not in
+ /* don't accept claims from gateways which are not in
* the same mesh or group.
*/
if (!orig_node)
@@ -1064,8 +1067,8 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv,
/**
* batadv_bla_process_claim() - Check if this is a claim frame, and process it
- * @bat_priv: the bat priv with all the soft interface information
- * @primary_if: the primary hard interface of this batman soft interface
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @primary_if: the primary hard interface of this batman mesh interface
* @skb: the frame to be checked
*
* Return: true if it was a claim frame, otherwise return false to
@@ -1207,7 +1210,7 @@ static bool batadv_bla_process_claim(struct batadv_priv *bat_priv,
/**
* batadv_bla_purge_backbone_gw() - Remove backbone gateways after a timeout or
* immediately
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @now: whether the whole hash shall be wiped now
*
* Check when we last heard from other nodes, and remove them in case of
@@ -1259,7 +1262,7 @@ purge_now:
/**
* batadv_bla_purge_claims() - Remove claims after a timeout or immediately
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @primary_if: the selected primary interface, may be NULL if now is set
* @now: whether the whole hash shall be wiped now
*
@@ -1318,7 +1321,7 @@ skip:
/**
* batadv_bla_update_orig_address() - Update the backbone gateways when the own
* originator address changes
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @primary_if: the new selected primary_if
* @oldif: the old primary interface, may be NULL
*/
@@ -1373,7 +1376,7 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
/**
* batadv_bla_send_loopdetect() - send a loopdetect frame
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @backbone_gw: the backbone gateway for which a loop should be detected
*
* To detect loops that the bridge loop avoidance can't handle, send a loop
@@ -1393,7 +1396,7 @@ batadv_bla_send_loopdetect(struct batadv_priv *bat_priv,
/**
* batadv_bla_status_update() - purge bla interfaces if necessary
- * @net_dev: the soft interface net device
+ * @net_dev: the mesh interface net device
*/
void batadv_bla_status_update(struct net_device *net_dev)
{
@@ -1501,8 +1504,7 @@ static void batadv_bla_periodic_work(struct work_struct *work)
rcu_read_unlock();
}
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work,
msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH));
@@ -1518,7 +1520,7 @@ static struct lock_class_key batadv_backbone_hash_lock_class_key;
/**
* batadv_bla_init() - initialize all bla structures
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
* Return: 0 on success, < 0 on error.
*/
@@ -1559,10 +1561,14 @@ int batadv_bla_init(struct batadv_priv *bat_priv)
return 0;
bat_priv->bla.claim_hash = batadv_hash_new(128);
- bat_priv->bla.backbone_hash = batadv_hash_new(32);
+ if (!bat_priv->bla.claim_hash)
+ return -ENOMEM;
- if (!bat_priv->bla.claim_hash || !bat_priv->bla.backbone_hash)
+ bat_priv->bla.backbone_hash = batadv_hash_new(32);
+ if (!bat_priv->bla.backbone_hash) {
+ batadv_hash_destroy(bat_priv->bla.claim_hash);
return -ENOMEM;
+ }
batadv_hash_set_lock_class(bat_priv->bla.claim_hash,
&batadv_claim_hash_lock_class_key);
@@ -1579,13 +1585,15 @@ int batadv_bla_init(struct batadv_priv *bat_priv)
}
/**
- * batadv_bla_check_bcast_duplist() - Check if a frame is in the broadcast dup.
- * @bat_priv: the bat priv with all the soft interface information
- * @skb: contains the bcast_packet to be checked
+ * batadv_bla_check_duplist() - Check if a frame is in the broadcast dup.
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: contains the multicast packet to be checked
+ * @payload_offset: offset in the skb, marking the start of the data to be CRC'ed
+ * @orig: originator mac address, NULL if unknown
*
- * check if it is on our broadcast list. Another gateway might
- * have sent the same packet because it is connected to the same backbone,
- * so we have to remove this duplicate.
+ * Check if it is on our broadcast list. Another gateway might have sent the
+ * same packet because it is connected to the same backbone, so we have to
+ * remove this duplicate.
*
* This is performed by checking the CRC, which will tell us
* with a good chance that it is the same packet. If it is furthermore
@@ -1594,19 +1602,19 @@ int batadv_bla_init(struct batadv_priv *bat_priv)
*
* Return: true if a packet is in the duplicate list, false otherwise.
*/
-bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
- struct sk_buff *skb)
+static bool batadv_bla_check_duplist(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int payload_offset,
+ const u8 *orig)
{
- int i, curr;
- __be32 crc;
- struct batadv_bcast_packet *bcast_packet;
struct batadv_bcast_duplist_entry *entry;
bool ret = false;
-
- bcast_packet = (struct batadv_bcast_packet *)skb->data;
+ int payload_len;
+ int i, curr;
+ u32 crc;
/* calculate the crc ... */
- crc = batadv_skb_crc32(skb, (u8 *)(bcast_packet + 1));
+ payload_len = skb->len - payload_offset;
+ crc = skb_crc32c(skb, payload_offset, payload_len, 0);
spin_lock_bh(&bat_priv->bla.bcast_duplist_lock);
@@ -1625,8 +1633,21 @@ bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
if (entry->crc != crc)
continue;
- if (batadv_compare_eth(entry->orig, bcast_packet->orig))
- continue;
+ /* are the originators both known and not anonymous? */
+ if (orig && !is_zero_ether_addr(orig) &&
+ !is_zero_ether_addr(entry->orig)) {
+ /* If known, check if the new frame came from
+ * the same originator:
+ * We are safe to take identical frames from the
+ * same orig, if known, as multiplications in
+ * the mesh are detected via the (orig, seqno) pair.
+ * So we can be a bit more liberal here and allow
+ * identical frames from the same orig which the source
+ * host might have sent multiple times on purpose.
+ */
+ if (batadv_compare_eth(entry->orig, orig))
+ continue;
+ }
/* this entry seems to match: same crc, not too old,
* and from another gw. therefore return true to forbid it.
@@ -1642,7 +1663,14 @@ bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
entry = &bat_priv->bla.bcast_duplist[curr];
entry->crc = crc;
entry->entrytime = jiffies;
- ether_addr_copy(entry->orig, bcast_packet->orig);
+
+ /* known originator */
+ if (orig)
+ ether_addr_copy(entry->orig, orig);
+ /* anonymous originator */
+ else
+ eth_zero_addr(entry->orig);
+
bat_priv->bla.bcast_duplist_curr = curr;
out:
@@ -1652,9 +1680,49 @@ out:
}
/**
+ * batadv_bla_check_ucast_duplist() - Check if a frame is in the broadcast dup.
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: contains the multicast packet to be checked, decapsulated from a
+ * unicast_packet
+ *
+ * Check if it is on our broadcast list. Another gateway might have sent the
+ * same packet because it is connected to the same backbone, so we have to
+ * remove this duplicate.
+ *
+ * Return: true if a packet is in the duplicate list, false otherwise.
+ */
+static bool batadv_bla_check_ucast_duplist(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ return batadv_bla_check_duplist(bat_priv, skb, 0, NULL);
+}
+
+/**
+ * batadv_bla_check_bcast_duplist() - Check if a frame is in the broadcast dup.
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: contains the bcast_packet to be checked
+ *
+ * Check if it is on our broadcast list. Another gateway might have sent the
+ * same packet because it is connected to the same backbone, so we have to
+ * remove this duplicate.
+ *
+ * Return: true if a packet is in the duplicate list, false otherwise.
+ */
+bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ struct batadv_bcast_packet *bcast_packet;
+
+ bcast_packet = (struct batadv_bcast_packet *)skb->data;
+
+ return batadv_bla_check_duplist(bat_priv, skb, sizeof(*bcast_packet),
+ bcast_packet->orig);
+}
+
+/**
* batadv_bla_is_backbone_gw_orig() - Check if the originator is a gateway for
* the VLAN identified by vid.
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig: originator mac address
* @vid: VLAN identifier
*
@@ -1697,7 +1765,7 @@ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
* @orig_node: the orig_node of the frame
* @hdr_size: maximum length of the frame
*
- * Return: true if the orig_node is also a gateway on the soft interface,
+ * Return: true if the orig_node is also a gateway on the mesh interface,
* otherwise it returns false.
*/
bool batadv_bla_is_backbone_gw(struct sk_buff *skb,
@@ -1727,9 +1795,9 @@ bool batadv_bla_is_backbone_gw(struct sk_buff *skb,
/**
* batadv_bla_free() - free all bla structures
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
- * for softinterface free or module unload
+ * for mesh interface free or module unload
*/
void batadv_bla_free(struct batadv_priv *bat_priv)
{
@@ -1748,19 +1816,18 @@ void batadv_bla_free(struct batadv_priv *bat_priv)
batadv_hash_destroy(bat_priv->bla.backbone_hash);
bat_priv->bla.backbone_hash = NULL;
}
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
}
/**
* batadv_bla_loopdetect_check() - check and handle a detected loop
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: the packet to check
* @primary_if: interface where the request came on
* @vid: the VLAN ID of the frame
*
* Checks if this packet is a loop detect frame which has been sent by us,
- * throw an uevent and log the event if that is the case.
+ * throws an uevent and logs the event if that is the case.
*
* Return: true if it is a loop detect frame which is to be dropped, false
* otherwise.
@@ -1798,7 +1865,7 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb,
ret = queue_work(batadv_event_workqueue, &backbone_gw->report_work);
- /* backbone_gw is unreferenced in the report work function function
+ /* backbone_gw is unreferenced in the report work function
* if queue_work() call was successful
*/
if (!ret)
@@ -1809,22 +1876,22 @@ batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb,
/**
* batadv_bla_rx() - check packets coming from the mesh.
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: the frame to be checked
* @vid: the VLAN ID of the frame
- * @is_bcast: the packet came in a broadcast packet type.
+ * @packet_type: the batman packet type this frame came in
*
* batadv_bla_rx avoidance checks if:
* * we have to race for a claim
* * if the frame is allowed on the LAN
*
- * in these cases, the skb is further handled by this function
+ * In these cases, the skb is further handled by this function
*
* Return: true if handled, otherwise it returns false and the caller shall
* further process the skb.
*/
bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
- unsigned short vid, bool is_bcast)
+ unsigned short vid, int packet_type)
{
struct batadv_bla_backbone_gw *backbone_gw;
struct ethhdr *ethhdr;
@@ -1846,25 +1913,47 @@ bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
goto handled;
if (unlikely(atomic_read(&bat_priv->bla.num_requests)))
- /* don't allow broadcasts while requests are in flight */
- if (is_multicast_ether_addr(ethhdr->h_dest) && is_bcast)
- goto handled;
+ /* don't allow multicast packets while requests are in flight */
+ if (is_multicast_ether_addr(ethhdr->h_dest))
+ /* Both broadcast flooding and multicast-via-unicasts
+ * delivery might send to multiple backbone gateways
+ * sharing the same LAN and therefore need to coordinate
+ * which backbone gateway forwards into the LAN,
+ * by claiming the payload source address.
+ *
+ * Broadcast flooding and multicast-via-unicasts
+ * delivery use the following two batman packet types.
+ * Note: explicitly exclude BATADV_UNICAST_4ADDR,
+ * as the DHCP gateway feature will send explicitly
+ * to only one BLA gateway, so the claiming process
+ * should be avoided there.
+ */
+ if (packet_type == BATADV_BCAST ||
+ packet_type == BATADV_UNICAST)
+ goto handled;
+
+ /* potential duplicates from foreign BLA backbone gateways via
+ * multicast-in-unicast packets
+ */
+ if (is_multicast_ether_addr(ethhdr->h_dest) &&
+ packet_type == BATADV_UNICAST &&
+ batadv_bla_check_ucast_duplist(bat_priv, skb))
+ goto handled;
ether_addr_copy(search_claim.addr, ethhdr->h_source);
search_claim.vid = vid;
claim = batadv_claim_hash_find(bat_priv, &search_claim);
if (!claim) {
+ bool local = batadv_is_my_client(bat_priv, ethhdr->h_source, vid);
+
/* possible optimization: race for a claim */
/* No claim exists yet, claim it for us!
*/
batadv_dbg(BATADV_DBG_BLA, bat_priv,
"%s(): Unclaimed MAC %pM found. Claim it. Local: %s\n",
- __func__, ethhdr->h_source,
- batadv_is_my_client(bat_priv,
- ethhdr->h_source, vid) ?
- "yes" : "no");
+ __func__, ethhdr->h_source, str_yes_no(local));
batadv_handle_claim(bat_priv, primary_if,
primary_if->net_dev->dev_addr,
ethhdr->h_source, vid);
@@ -1883,13 +1972,14 @@ bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
goto allow;
}
- /* if it is a broadcast ... */
- if (is_multicast_ether_addr(ethhdr->h_dest) && is_bcast) {
+ /* if it is a multicast ... */
+ if (is_multicast_ether_addr(ethhdr->h_dest) &&
+ (packet_type == BATADV_BCAST || packet_type == BATADV_UNICAST)) {
/* ... drop it. the responsible gateway is in charge.
*
- * We need to check is_bcast because with the gateway
+ * We need to check packet type because with the gateway
* feature, broadcasts (like DHCP requests) may be sent
- * using a unicast packet type.
+ * using a unicast 4 address packet type. See comment above.
*/
goto handled;
} else {
@@ -1912,16 +2002,14 @@ handled:
ret = true;
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- if (claim)
- batadv_claim_put(claim);
+ batadv_hardif_put(primary_if);
+ batadv_claim_put(claim);
return ret;
}
/**
* batadv_bla_tx() - check packets going into the mesh
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: the frame to be checked
* @vid: the VLAN ID of the frame
*
@@ -2019,105 +2107,44 @@ allow:
handled:
ret = true;
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- if (claim)
- batadv_claim_put(claim);
+ batadv_hardif_put(primary_if);
+ batadv_claim_put(claim);
return ret;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_bla_claim_table_seq_print_text() - print the claim table in a seq file
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hashtable *hash = bat_priv->bla.claim_hash;
- struct batadv_bla_backbone_gw *backbone_gw;
- struct batadv_bla_claim *claim;
- struct batadv_hard_iface *primary_if;
- struct hlist_head *head;
- u16 backbone_crc;
- u32 i;
- bool is_own;
- u8 *primary_addr;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- goto out;
-
- primary_addr = primary_if->net_dev->dev_addr;
- seq_printf(seq,
- "Claims announced for the mesh %s (orig %pM, group id %#.4x)\n",
- net_dev->name, primary_addr,
- ntohs(bat_priv->bla.claim_dest.group));
- seq_puts(seq,
- " Client VID Originator [o] (CRC )\n");
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(claim, head, hash_entry) {
- backbone_gw = batadv_bla_claim_get_backbone_gw(claim);
-
- is_own = batadv_compare_eth(backbone_gw->orig,
- primary_addr);
-
- spin_lock_bh(&backbone_gw->crc_lock);
- backbone_crc = backbone_gw->crc;
- spin_unlock_bh(&backbone_gw->crc_lock);
- seq_printf(seq, " * %pM on %5d by %pM [%c] (%#.4x)\n",
- claim->addr, batadv_print_vid(claim->vid),
- backbone_gw->orig,
- (is_own ? 'x' : ' '),
- backbone_crc);
-
- batadv_backbone_gw_put(backbone_gw);
- }
- rcu_read_unlock();
- }
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- return 0;
-}
-#endif
-
/**
* batadv_bla_claim_dump_entry() - dump one entry of the claim table
* to a netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @primary_if: primary interface
* @claim: entry to dump
*
* Return: 0 or error code.
*/
static int
-batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_hard_iface *primary_if,
struct batadv_bla_claim *claim)
{
- u8 *primary_addr = primary_if->net_dev->dev_addr;
+ const u8 *primary_addr = primary_if->net_dev->dev_addr;
u16 backbone_crc;
bool is_own;
void *hdr;
int ret = -EINVAL;
- hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
- NLM_F_MULTI, BATADV_CMD_GET_BLA_CLAIM);
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_BLA_CLAIM);
if (!hdr) {
ret = -ENOBUFS;
goto out;
}
+ genl_dump_check_consistent(cb, hdr);
+
is_own = batadv_compare_eth(claim->backbone_gw->orig,
primary_addr);
@@ -2153,28 +2180,33 @@ out:
* to a netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @primary_if: primary interface
- * @head: bucket to dump
+ * @hash: hash to dump
+ * @bucket: bucket index to dump
* @idx_skip: How many entries to skip
*
* Return: always 0.
*/
static int
-batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_hard_iface *primary_if,
- struct hlist_head *head, int *idx_skip)
+ struct batadv_hashtable *hash, unsigned int bucket,
+ int *idx_skip)
{
struct batadv_bla_claim *claim;
int idx = 0;
int ret = 0;
- rcu_read_lock();
- hlist_for_each_entry_rcu(claim, head, hash_entry) {
+ spin_lock_bh(&hash->list_locks[bucket]);
+ cb->seq = atomic_read(&hash->generation) << 1 | 1;
+
+ hlist_for_each_entry(claim, &hash->table[bucket], hash_entry) {
if (idx++ < *idx_skip)
continue;
- ret = batadv_bla_claim_dump_entry(msg, portid, seq,
+ ret = batadv_bla_claim_dump_entry(msg, portid, cb,
primary_if, claim);
if (ret) {
*idx_skip = idx - 1;
@@ -2184,7 +2216,7 @@ batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
*idx_skip = 0;
unlock:
- rcu_read_unlock();
+ spin_unlock_bh(&hash->list_locks[bucket]);
return ret;
}
@@ -2199,28 +2231,18 @@ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
struct batadv_hard_iface *primary_if = NULL;
int portid = NETLINK_CB(cb->skb).portid;
- struct net *net = sock_net(cb->skb->sk);
- struct net_device *soft_iface;
+ struct net_device *mesh_iface;
struct batadv_hashtable *hash;
struct batadv_priv *bat_priv;
int bucket = cb->args[0];
- struct hlist_head *head;
int idx = cb->args[1];
- int ifindex;
int ret = 0;
- ifindex = batadv_netlink_get_ifindex(cb->nlh,
- BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
-
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
- bat_priv = netdev_priv(soft_iface);
+ bat_priv = netdev_priv(mesh_iface);
hash = bat_priv->bla.claim_hash;
primary_if = batadv_primary_if_get_selected(bat_priv);
@@ -2230,11 +2252,8 @@ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb)
}
while (bucket < hash->size) {
- head = &hash->table[bucket];
-
- if (batadv_bla_claim_dump_bucket(msg, portid,
- cb->nlh->nlmsg_seq,
- primary_if, head, &idx))
+ if (batadv_bla_claim_dump_bucket(msg, portid, cb, primary_if,
+ hash, bucket, &idx))
break;
bucket++;
}
@@ -2245,111 +2264,47 @@ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb)
ret = msg->len;
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
- if (soft_iface)
- dev_put(soft_iface);
+ dev_put(mesh_iface);
return ret;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_bla_backbone_table_seq_print_text() - print the backbone table in a
- * seq file
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hashtable *hash = bat_priv->bla.backbone_hash;
- struct batadv_bla_backbone_gw *backbone_gw;
- struct batadv_hard_iface *primary_if;
- struct hlist_head *head;
- int secs, msecs;
- u16 backbone_crc;
- u32 i;
- bool is_own;
- u8 *primary_addr;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- goto out;
-
- primary_addr = primary_if->net_dev->dev_addr;
- seq_printf(seq,
- "Backbones announced for the mesh %s (orig %pM, group id %#.4x)\n",
- net_dev->name, primary_addr,
- ntohs(bat_priv->bla.claim_dest.group));
- seq_puts(seq, " Originator VID last seen (CRC )\n");
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
- msecs = jiffies_to_msecs(jiffies -
- backbone_gw->lasttime);
- secs = msecs / 1000;
- msecs = msecs % 1000;
-
- is_own = batadv_compare_eth(backbone_gw->orig,
- primary_addr);
- if (is_own)
- continue;
-
- spin_lock_bh(&backbone_gw->crc_lock);
- backbone_crc = backbone_gw->crc;
- spin_unlock_bh(&backbone_gw->crc_lock);
-
- seq_printf(seq, " * %pM on %5d %4i.%03is (%#.4x)\n",
- backbone_gw->orig,
- batadv_print_vid(backbone_gw->vid), secs,
- msecs, backbone_crc);
- }
- rcu_read_unlock();
- }
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- return 0;
-}
-#endif
-
/**
* batadv_bla_backbone_dump_entry() - dump one entry of the backbone table to a
* netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @primary_if: primary interface
* @backbone_gw: entry to dump
*
* Return: 0 or error code.
*/
static int
-batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_hard_iface *primary_if,
struct batadv_bla_backbone_gw *backbone_gw)
{
- u8 *primary_addr = primary_if->net_dev->dev_addr;
+ const u8 *primary_addr = primary_if->net_dev->dev_addr;
u16 backbone_crc;
bool is_own;
int msecs;
void *hdr;
int ret = -EINVAL;
- hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
- NLM_F_MULTI, BATADV_CMD_GET_BLA_BACKBONE);
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_BLA_BACKBONE);
if (!hdr) {
ret = -ENOBUFS;
goto out;
}
+ genl_dump_check_consistent(cb, hdr);
+
is_own = batadv_compare_eth(backbone_gw->orig, primary_addr);
spin_lock_bh(&backbone_gw->crc_lock);
@@ -2386,28 +2341,33 @@ out:
* a netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @primary_if: primary interface
- * @head: bucket to dump
+ * @hash: hash to dump
+ * @bucket: bucket index to dump
* @idx_skip: How many entries to skip
*
* Return: always 0.
*/
static int
-batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_hard_iface *primary_if,
- struct hlist_head *head, int *idx_skip)
+ struct batadv_hashtable *hash,
+ unsigned int bucket, int *idx_skip)
{
struct batadv_bla_backbone_gw *backbone_gw;
int idx = 0;
int ret = 0;
- rcu_read_lock();
- hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
+ spin_lock_bh(&hash->list_locks[bucket]);
+ cb->seq = atomic_read(&hash->generation) << 1 | 1;
+
+ hlist_for_each_entry(backbone_gw, &hash->table[bucket], hash_entry) {
if (idx++ < *idx_skip)
continue;
- ret = batadv_bla_backbone_dump_entry(msg, portid, seq,
+ ret = batadv_bla_backbone_dump_entry(msg, portid, cb,
primary_if, backbone_gw);
if (ret) {
*idx_skip = idx - 1;
@@ -2417,7 +2377,7 @@ batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
*idx_skip = 0;
unlock:
- rcu_read_unlock();
+ spin_unlock_bh(&hash->list_locks[bucket]);
return ret;
}
@@ -2432,28 +2392,18 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
struct batadv_hard_iface *primary_if = NULL;
int portid = NETLINK_CB(cb->skb).portid;
- struct net *net = sock_net(cb->skb->sk);
- struct net_device *soft_iface;
+ struct net_device *mesh_iface;
struct batadv_hashtable *hash;
struct batadv_priv *bat_priv;
int bucket = cb->args[0];
- struct hlist_head *head;
int idx = cb->args[1];
- int ifindex;
int ret = 0;
- ifindex = batadv_netlink_get_ifindex(cb->nlh,
- BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
-
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
- bat_priv = netdev_priv(soft_iface);
+ bat_priv = netdev_priv(mesh_iface);
hash = bat_priv->bla.backbone_hash;
primary_if = batadv_primary_if_get_selected(bat_priv);
@@ -2463,11 +2413,8 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb)
}
while (bucket < hash->size) {
- head = &hash->table[bucket];
-
- if (batadv_bla_backbone_dump_bucket(msg, portid,
- cb->nlh->nlmsg_seq,
- primary_if, head, &idx))
+ if (batadv_bla_backbone_dump_bucket(msg, portid, cb, primary_if,
+ hash, bucket, &idx))
break;
bucket++;
}
@@ -2478,11 +2425,9 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb)
ret = msg->len;
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
- if (soft_iface)
- dev_put(soft_iface);
+ dev_put(mesh_iface);
return ret;
}
@@ -2491,7 +2436,7 @@ out:
/**
* batadv_bla_check_claim() - check if address is claimed
*
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @addr: mac address of which the claim status is checked
* @vid: the VLAN ID
*
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 71f95a3e4d3f..8673a265995f 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_BLA_H_
@@ -22,14 +10,12 @@
#include "main.h"
#include <linux/compiler.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/skbuff.h>
#include <linux/stddef.h>
#include <linux/types.h>
-struct net_device;
-struct netlink_callback;
-struct seq_file;
-struct sk_buff;
-
/**
* batadv_bla_is_loopdetect_mac() - check if the mac address is from a loop
* detect frame sent by bridge loop avoidance
@@ -48,16 +34,13 @@ static inline bool batadv_bla_is_loopdetect_mac(const uint8_t *mac)
#ifdef CONFIG_BATMAN_ADV_BLA
bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
- unsigned short vid, bool is_bcast);
+ unsigned short vid, int packet_type);
bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb,
unsigned short vid);
bool batadv_bla_is_backbone_gw(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
int hdr_size);
-int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset);
int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb);
-int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq,
- void *offset);
int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb);
bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
unsigned short vid);
@@ -69,7 +52,6 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
void batadv_bla_status_update(struct net_device *net_dev);
int batadv_bla_init(struct batadv_priv *bat_priv);
void batadv_bla_free(struct batadv_priv *bat_priv);
-int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb);
#ifdef CONFIG_BATMAN_ADV_DAT
bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr,
unsigned short vid);
@@ -79,7 +61,7 @@ bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr,
static inline bool batadv_bla_rx(struct batadv_priv *bat_priv,
struct sk_buff *skb, unsigned short vid,
- bool is_bcast)
+ int packet_type)
{
return false;
}
@@ -97,18 +79,6 @@ static inline bool batadv_bla_is_backbone_gw(struct sk_buff *skb,
return false;
}
-static inline int batadv_bla_claim_table_seq_print_text(struct seq_file *seq,
- void *offset)
-{
- return 0;
-}
-
-static inline int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq,
- void *offset)
-{
- return 0;
-}
-
static inline bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv,
u8 *orig, unsigned short vid)
{
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
deleted file mode 100644
index 3cb82378300b..000000000000
--- a/net/batman-adv/debugfs.c
+++ /dev/null
@@ -1,474 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors:
- *
- * Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "debugfs.h"
-#include "main.h"
-
-#include <linux/dcache.h>
-#include <linux/debugfs.h>
-#include <linux/err.h>
-#include <linux/errno.h>
-#include <linux/export.h>
-#include <linux/fs.h>
-#include <linux/netdevice.h>
-#include <linux/printk.h>
-#include <linux/seq_file.h>
-#include <linux/stat.h>
-#include <linux/stddef.h>
-#include <linux/stringify.h>
-#include <linux/sysfs.h>
-#include <net/net_namespace.h>
-
-#include "bat_algo.h"
-#include "bridge_loop_avoidance.h"
-#include "distributed-arp-table.h"
-#include "gateway_client.h"
-#include "icmp_socket.h"
-#include "log.h"
-#include "multicast.h"
-#include "network-coding.h"
-#include "originator.h"
-#include "translation-table.h"
-
-static struct dentry *batadv_debugfs;
-
-static int batadv_algorithms_open(struct inode *inode, struct file *file)
-{
- return single_open(file, batadv_algo_seq_print_text, NULL);
-}
-
-static int neighbors_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- return single_open(file, batadv_hardif_neigh_seq_print_text, net_dev);
-}
-
-static int batadv_originators_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- return single_open(file, batadv_orig_seq_print_text, net_dev);
-}
-
-/**
- * batadv_originators_hardif_open() - handles debugfs output for the originator
- * table of an hard interface
- * @inode: inode pointer to debugfs file
- * @file: pointer to the seq_file
- *
- * Return: 0 on success or negative error number in case of failure
- */
-static int batadv_originators_hardif_open(struct inode *inode,
- struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- return single_open(file, batadv_orig_hardif_seq_print_text, net_dev);
-}
-
-static int batadv_gateways_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- return single_open(file, batadv_gw_client_seq_print_text, net_dev);
-}
-
-static int batadv_transtable_global_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- return single_open(file, batadv_tt_global_seq_print_text, net_dev);
-}
-
-#ifdef CONFIG_BATMAN_ADV_BLA
-static int batadv_bla_claim_table_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- return single_open(file, batadv_bla_claim_table_seq_print_text,
- net_dev);
-}
-
-static int batadv_bla_backbone_table_open(struct inode *inode,
- struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- return single_open(file, batadv_bla_backbone_table_seq_print_text,
- net_dev);
-}
-
-#endif
-
-#ifdef CONFIG_BATMAN_ADV_DAT
-/**
- * batadv_dat_cache_open() - Prepare file handler for reads from dat_cache
- * @inode: inode which was opened
- * @file: file handle to be initialized
- *
- * Return: 0 on success or negative error number in case of failure
- */
-static int batadv_dat_cache_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- return single_open(file, batadv_dat_cache_seq_print_text, net_dev);
-}
-#endif
-
-static int batadv_transtable_local_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- return single_open(file, batadv_tt_local_seq_print_text, net_dev);
-}
-
-struct batadv_debuginfo {
- struct attribute attr;
- const struct file_operations fops;
-};
-
-#ifdef CONFIG_BATMAN_ADV_NC
-static int batadv_nc_nodes_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- return single_open(file, batadv_nc_nodes_seq_print_text, net_dev);
-}
-#endif
-
-#ifdef CONFIG_BATMAN_ADV_MCAST
-/**
- * batadv_mcast_flags_open() - prepare file handler for reads from mcast_flags
- * @inode: inode which was opened
- * @file: file handle to be initialized
- *
- * Return: 0 on success or negative error number in case of failure
- */
-static int batadv_mcast_flags_open(struct inode *inode, struct file *file)
-{
- struct net_device *net_dev = (struct net_device *)inode->i_private;
-
- return single_open(file, batadv_mcast_flags_seq_print_text, net_dev);
-}
-#endif
-
-#define BATADV_DEBUGINFO(_name, _mode, _open) \
-struct batadv_debuginfo batadv_debuginfo_##_name = { \
- .attr = { \
- .name = __stringify(_name), \
- .mode = _mode, \
- }, \
- .fops = { \
- .owner = THIS_MODULE, \
- .open = _open, \
- .read = seq_read, \
- .llseek = seq_lseek, \
- .release = single_release, \
- }, \
-}
-
-/* the following attributes are general and therefore they will be directly
- * placed in the BATADV_DEBUGFS_SUBDIR subdirectory of debugfs
- */
-static BATADV_DEBUGINFO(routing_algos, 0444, batadv_algorithms_open);
-
-static struct batadv_debuginfo *batadv_general_debuginfos[] = {
- &batadv_debuginfo_routing_algos,
- NULL,
-};
-
-/* The following attributes are per soft interface */
-static BATADV_DEBUGINFO(neighbors, 0444, neighbors_open);
-static BATADV_DEBUGINFO(originators, 0444, batadv_originators_open);
-static BATADV_DEBUGINFO(gateways, 0444, batadv_gateways_open);
-static BATADV_DEBUGINFO(transtable_global, 0444, batadv_transtable_global_open);
-#ifdef CONFIG_BATMAN_ADV_BLA
-static BATADV_DEBUGINFO(bla_claim_table, 0444, batadv_bla_claim_table_open);
-static BATADV_DEBUGINFO(bla_backbone_table, 0444,
- batadv_bla_backbone_table_open);
-#endif
-#ifdef CONFIG_BATMAN_ADV_DAT
-static BATADV_DEBUGINFO(dat_cache, 0444, batadv_dat_cache_open);
-#endif
-static BATADV_DEBUGINFO(transtable_local, 0444, batadv_transtable_local_open);
-#ifdef CONFIG_BATMAN_ADV_NC
-static BATADV_DEBUGINFO(nc_nodes, 0444, batadv_nc_nodes_open);
-#endif
-#ifdef CONFIG_BATMAN_ADV_MCAST
-static BATADV_DEBUGINFO(mcast_flags, 0444, batadv_mcast_flags_open);
-#endif
-
-static struct batadv_debuginfo *batadv_mesh_debuginfos[] = {
- &batadv_debuginfo_neighbors,
- &batadv_debuginfo_originators,
- &batadv_debuginfo_gateways,
- &batadv_debuginfo_transtable_global,
-#ifdef CONFIG_BATMAN_ADV_BLA
- &batadv_debuginfo_bla_claim_table,
- &batadv_debuginfo_bla_backbone_table,
-#endif
-#ifdef CONFIG_BATMAN_ADV_DAT
- &batadv_debuginfo_dat_cache,
-#endif
- &batadv_debuginfo_transtable_local,
-#ifdef CONFIG_BATMAN_ADV_NC
- &batadv_debuginfo_nc_nodes,
-#endif
-#ifdef CONFIG_BATMAN_ADV_MCAST
- &batadv_debuginfo_mcast_flags,
-#endif
- NULL,
-};
-
-#define BATADV_HARDIF_DEBUGINFO(_name, _mode, _open) \
-struct batadv_debuginfo batadv_hardif_debuginfo_##_name = { \
- .attr = { \
- .name = __stringify(_name), \
- .mode = _mode, \
- }, \
- .fops = { \
- .owner = THIS_MODULE, \
- .open = _open, \
- .read = seq_read, \
- .llseek = seq_lseek, \
- .release = single_release, \
- }, \
-}
-
-static BATADV_HARDIF_DEBUGINFO(originators, 0444,
- batadv_originators_hardif_open);
-
-static struct batadv_debuginfo *batadv_hardif_debuginfos[] = {
- &batadv_hardif_debuginfo_originators,
- NULL,
-};
-
-/**
- * batadv_debugfs_init() - Initialize soft interface independent debugfs entries
- */
-void batadv_debugfs_init(void)
-{
- struct batadv_debuginfo **bat_debug;
- struct dentry *file;
-
- batadv_debugfs = debugfs_create_dir(BATADV_DEBUGFS_SUBDIR, NULL);
- if (batadv_debugfs == ERR_PTR(-ENODEV))
- batadv_debugfs = NULL;
-
- if (!batadv_debugfs)
- goto err;
-
- for (bat_debug = batadv_general_debuginfos; *bat_debug; ++bat_debug) {
- file = debugfs_create_file(((*bat_debug)->attr).name,
- S_IFREG | ((*bat_debug)->attr).mode,
- batadv_debugfs, NULL,
- &(*bat_debug)->fops);
- if (!file) {
- pr_err("Can't add general debugfs file: %s\n",
- ((*bat_debug)->attr).name);
- goto err;
- }
- }
-
- return;
-err:
- debugfs_remove_recursive(batadv_debugfs);
- batadv_debugfs = NULL;
-}
-
-/**
- * batadv_debugfs_destroy() - Remove all debugfs entries
- */
-void batadv_debugfs_destroy(void)
-{
- debugfs_remove_recursive(batadv_debugfs);
- batadv_debugfs = NULL;
-}
-
-/**
- * batadv_debugfs_add_hardif() - creates the base directory for a hard interface
- * in debugfs.
- * @hard_iface: hard interface which should be added.
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
-{
- struct net *net = dev_net(hard_iface->net_dev);
- struct batadv_debuginfo **bat_debug;
- struct dentry *file;
-
- if (!batadv_debugfs)
- goto out;
-
- if (net != &init_net)
- return 0;
-
- hard_iface->debug_dir = debugfs_create_dir(hard_iface->net_dev->name,
- batadv_debugfs);
- if (!hard_iface->debug_dir)
- goto out;
-
- for (bat_debug = batadv_hardif_debuginfos; *bat_debug; ++bat_debug) {
- file = debugfs_create_file(((*bat_debug)->attr).name,
- S_IFREG | ((*bat_debug)->attr).mode,
- hard_iface->debug_dir,
- hard_iface->net_dev,
- &(*bat_debug)->fops);
- if (!file)
- goto rem_attr;
- }
-
- return 0;
-rem_attr:
- debugfs_remove_recursive(hard_iface->debug_dir);
- hard_iface->debug_dir = NULL;
-out:
- return -ENOMEM;
-}
-
-/**
- * batadv_debugfs_rename_hardif() - Fix debugfs path for renamed hardif
- * @hard_iface: hard interface which was renamed
- */
-void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface)
-{
- const char *name = hard_iface->net_dev->name;
- struct dentry *dir;
- struct dentry *d;
-
- dir = hard_iface->debug_dir;
- if (!dir)
- return;
-
- d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name);
- if (!d)
- pr_err("Can't rename debugfs dir to %s\n", name);
-}
-
-/**
- * batadv_debugfs_del_hardif() - delete the base directory for a hard interface
- * in debugfs.
- * @hard_iface: hard interface which is deleted.
- */
-void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface)
-{
- struct net *net = dev_net(hard_iface->net_dev);
-
- if (net != &init_net)
- return;
-
- if (batadv_debugfs) {
- debugfs_remove_recursive(hard_iface->debug_dir);
- hard_iface->debug_dir = NULL;
- }
-}
-
-/**
- * batadv_debugfs_add_meshif() - Initialize interface dependent debugfs entries
- * @dev: netdev struct of the soft interface
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_debugfs_add_meshif(struct net_device *dev)
-{
- struct batadv_priv *bat_priv = netdev_priv(dev);
- struct batadv_debuginfo **bat_debug;
- struct net *net = dev_net(dev);
- struct dentry *file;
-
- if (!batadv_debugfs)
- goto out;
-
- if (net != &init_net)
- return 0;
-
- bat_priv->debug_dir = debugfs_create_dir(dev->name, batadv_debugfs);
- if (!bat_priv->debug_dir)
- goto out;
-
- if (batadv_socket_setup(bat_priv) < 0)
- goto rem_attr;
-
- if (batadv_debug_log_setup(bat_priv) < 0)
- goto rem_attr;
-
- for (bat_debug = batadv_mesh_debuginfos; *bat_debug; ++bat_debug) {
- file = debugfs_create_file(((*bat_debug)->attr).name,
- S_IFREG | ((*bat_debug)->attr).mode,
- bat_priv->debug_dir,
- dev, &(*bat_debug)->fops);
- if (!file) {
- batadv_err(dev, "Can't add debugfs file: %s/%s\n",
- dev->name, ((*bat_debug)->attr).name);
- goto rem_attr;
- }
- }
-
- if (batadv_nc_init_debugfs(bat_priv) < 0)
- goto rem_attr;
-
- return 0;
-rem_attr:
- debugfs_remove_recursive(bat_priv->debug_dir);
- bat_priv->debug_dir = NULL;
-out:
- return -ENOMEM;
-}
-
-/**
- * batadv_debugfs_rename_meshif() - Fix debugfs path for renamed softif
- * @dev: net_device which was renamed
- */
-void batadv_debugfs_rename_meshif(struct net_device *dev)
-{
- struct batadv_priv *bat_priv = netdev_priv(dev);
- const char *name = dev->name;
- struct dentry *dir;
- struct dentry *d;
-
- dir = bat_priv->debug_dir;
- if (!dir)
- return;
-
- d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name);
- if (!d)
- pr_err("Can't rename debugfs dir to %s\n", name);
-}
-
-/**
- * batadv_debugfs_del_meshif() - Remove interface dependent debugfs entries
- * @dev: netdev struct of the soft interface
- */
-void batadv_debugfs_del_meshif(struct net_device *dev)
-{
- struct batadv_priv *bat_priv = netdev_priv(dev);
- struct net *net = dev_net(dev);
-
- if (net != &init_net)
- return;
-
- batadv_debug_log_cleanup(bat_priv);
-
- if (batadv_debugfs) {
- debugfs_remove_recursive(bat_priv->debug_dir);
- bat_priv->debug_dir = NULL;
- }
-}
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
deleted file mode 100644
index 08a592ffbee5..000000000000
--- a/net/batman-adv/debugfs.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors:
- *
- * Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _NET_BATMAN_ADV_DEBUGFS_H_
-#define _NET_BATMAN_ADV_DEBUGFS_H_
-
-#include "main.h"
-
-struct net_device;
-
-#define BATADV_DEBUGFS_SUBDIR "batman_adv"
-
-#if IS_ENABLED(CONFIG_BATMAN_ADV_DEBUGFS)
-
-void batadv_debugfs_init(void);
-void batadv_debugfs_destroy(void);
-int batadv_debugfs_add_meshif(struct net_device *dev);
-void batadv_debugfs_rename_meshif(struct net_device *dev);
-void batadv_debugfs_del_meshif(struct net_device *dev);
-int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface);
-void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface);
-void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface);
-
-#else
-
-static inline void batadv_debugfs_init(void)
-{
-}
-
-static inline void batadv_debugfs_destroy(void)
-{
-}
-
-static inline int batadv_debugfs_add_meshif(struct net_device *dev)
-{
- return 0;
-}
-
-static inline void batadv_debugfs_rename_meshif(struct net_device *dev)
-{
-}
-
-static inline void batadv_debugfs_del_meshif(struct net_device *dev)
-{
-}
-
-static inline
-int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
-{
- return 0;
-}
-
-static inline
-void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface)
-{
-}
-
-static inline
-void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface)
-{
-}
-
-#endif
-
-#endif /* _NET_BATMAN_ADV_DEBUGFS_H_ */
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index a60bacf7120b..8b8132eb0a79 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Antonio Quartulli
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "distributed-arp-table.h"
@@ -22,6 +10,8 @@
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
+#include <linux/err.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
@@ -29,24 +19,24 @@
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
+#include <linux/ip.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>
#include <linux/string.h>
+#include <linux/udp.h>
+#include <linux/unaligned.h>
#include <linux/workqueue.h>
#include <net/arp.h>
#include <net/genetlink.h>
#include <net/netlink.h>
-#include <net/sock.h>
#include <uapi/linux/batman_adv.h>
#include "bridge_loop_avoidance.h"
@@ -56,19 +46,60 @@
#include "netlink.h"
#include "originator.h"
#include "send.h"
-#include "soft-interface.h"
#include "translation-table.h"
#include "tvlv.h"
+enum batadv_bootpop {
+ BATADV_BOOTREPLY = 2,
+};
+
+enum batadv_boothtype {
+ BATADV_HTYPE_ETHERNET = 1,
+};
+
+enum batadv_dhcpoptioncode {
+ BATADV_DHCP_OPT_PAD = 0,
+ BATADV_DHCP_OPT_MSG_TYPE = 53,
+ BATADV_DHCP_OPT_END = 255,
+};
+
+enum batadv_dhcptype {
+ BATADV_DHCPACK = 5,
+};
+
+/* { 99, 130, 83, 99 } */
+#define BATADV_DHCP_MAGIC 1669485411
+
+struct batadv_dhcp_packet {
+ __u8 op;
+ __u8 htype;
+ __u8 hlen;
+ __u8 hops;
+ __be32 xid;
+ __be16 secs;
+ __be16 flags;
+ __be32 ciaddr;
+ __be32 yiaddr;
+ __be32 siaddr;
+ __be32 giaddr;
+ __u8 chaddr[16];
+ __u8 sname[64];
+ __u8 file[128];
+ __be32 magic;
+ /* __u8 options[]; */
+};
+
+#define BATADV_DHCP_YIADDR_LEN sizeof(((struct batadv_dhcp_packet *)0)->yiaddr)
+#define BATADV_DHCP_CHADDR_LEN sizeof(((struct batadv_dhcp_packet *)0)->chaddr)
+
static void batadv_dat_purge(struct work_struct *work);
/**
* batadv_dat_start_timer() - initialise the DAT periodic worker
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
static void batadv_dat_start_timer(struct batadv_priv *bat_priv)
{
- INIT_DELAYED_WORK(&bat_priv->dat.work, batadv_dat_purge);
queue_delayed_work(batadv_event_workqueue, &bat_priv->dat.work,
msecs_to_jiffies(10000));
}
@@ -94,6 +125,9 @@ static void batadv_dat_entry_release(struct kref *ref)
*/
static void batadv_dat_entry_put(struct batadv_dat_entry *dat_entry)
{
+ if (!dat_entry)
+ return;
+
kref_put(&dat_entry->refcount, batadv_dat_entry_release);
}
@@ -111,7 +145,7 @@ static bool batadv_dat_to_purge(struct batadv_dat_entry *dat_entry)
/**
* __batadv_dat_purge() - delete entries from the DAT local storage
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @to_purge: function in charge to decide whether an entry has to be purged or
* not. This function takes the dat_entry as argument and has to
* returns a boolean value: true is the entry has to be deleted,
@@ -212,7 +246,7 @@ static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size)
*/
static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size)
{
- return *(__be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN);
+ return *(__force __be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN);
}
/**
@@ -236,7 +270,9 @@ static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size)
*/
static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size)
{
- return *(__be32 *)(batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN * 2 + 4);
+ u8 *dst = batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN * 2 + 4;
+
+ return *(__force __be32 *)dst;
}
/**
@@ -251,16 +287,18 @@ static u32 batadv_hash_dat(const void *data, u32 size)
u32 hash = 0;
const struct batadv_dat_entry *dat = data;
const unsigned char *key;
+ __be16 vid;
u32 i;
- key = (const unsigned char *)&dat->ip;
+ key = (__force const unsigned char *)&dat->ip;
for (i = 0; i < sizeof(dat->ip); i++) {
hash += key[i];
hash += (hash << 10);
hash ^= (hash >> 6);
}
- key = (const unsigned char *)&dat->vid;
+ vid = htons(dat->vid);
+ key = (__force const unsigned char *)&vid;
for (i = 0; i < sizeof(dat->vid); i++) {
hash += key[i];
hash += (hash << 10);
@@ -277,7 +315,7 @@ static u32 batadv_hash_dat(const void *data, u32 size)
/**
* batadv_dat_entry_hash_find() - look for a given dat_entry in the local hash
* table
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @ip: search key
* @vid: VLAN identifier
*
@@ -319,7 +357,7 @@ batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip,
/**
* batadv_dat_entry_add() - add a new dat entry or update it if already exists
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @ip: ipv4 to add/edit
* @mac_addr: mac address to assign to the given ipv4
* @vid: VLAN identifier
@@ -368,8 +406,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip,
&dat_entry->ip, dat_entry->mac_addr, batadv_print_vid(vid));
out:
- if (dat_entry)
- batadv_dat_entry_put(dat_entry);
+ batadv_dat_entry_put(dat_entry);
}
#ifdef CONFIG_BATMAN_ADV_DEBUG
@@ -377,7 +414,7 @@ out:
/**
* batadv_dbg_arp() - print a debug message containing all the ARP packet
* details
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: ARP packet
* @hdr_size: size of the possible header before the ARP packet
* @msg: message to print together with the debugging information
@@ -512,7 +549,7 @@ out:
/**
* batadv_choose_next_candidate() - select the next DHT candidate
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @cands: candidates array
* @select: number of candidates already present in the array
* @ip_key: key to look up in the DHT
@@ -557,8 +594,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv,
continue;
max = tmp_max;
- if (max_orig_node)
- batadv_orig_node_put(max_orig_node);
+ batadv_orig_node_put(max_orig_node);
max_orig_node = orig_node;
}
rcu_read_unlock();
@@ -577,7 +613,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv,
/**
* batadv_dat_select_candidates() - select the nodes which the DHT message has
* to be sent to
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @ip_dst: ipv4 to look up in the DHT
* @vid: VLAN identifier
*
@@ -621,22 +657,22 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst,
}
/**
- * batadv_dat_send_data() - send a payload to the selected candidates
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_dat_forward_data() - copy and send payload to the selected candidates
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: payload to send
* @ip: the DHT key
* @vid: VLAN identifier
* @packet_subtype: unicast4addr packet subtype to use
*
- * This function copies the skb with pskb_copy() and is sent as unicast packet
+ * This function copies the skb with pskb_copy() and sends it as a unicast packet
* to each of the selected candidates.
*
* Return: true if the packet is sent to at least one candidate, false
* otherwise.
*/
-static bool batadv_dat_send_data(struct batadv_priv *bat_priv,
- struct sk_buff *skb, __be32 ip,
- unsigned short vid, int packet_subtype)
+static bool batadv_dat_forward_data(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, __be32 ip,
+ unsigned short vid, int packet_subtype)
{
int i;
bool ret = false;
@@ -647,7 +683,7 @@ static bool batadv_dat_send_data(struct batadv_priv *bat_priv,
cand = batadv_dat_select_candidates(bat_priv, ip, vid);
if (!cand)
- goto out;
+ return ret;
batadv_dbg(BATADV_DBG_DAT, bat_priv, "DHT_SEND for %pI4\n", &ip);
@@ -691,7 +727,6 @@ free_orig:
batadv_orig_node_put(cand[i].orig_node);
}
-out:
kfree(cand);
return ret;
}
@@ -699,7 +734,7 @@ out:
/**
* batadv_dat_tvlv_container_update() - update the dat tvlv container after dat
* setting change
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
static void batadv_dat_tvlv_container_update(struct batadv_priv *bat_priv)
{
@@ -721,7 +756,7 @@ static void batadv_dat_tvlv_container_update(struct batadv_priv *bat_priv)
/**
* batadv_dat_status_update() - update the dat tvlv container after dat
* setting change
- * @net_dev: the soft interface net device
+ * @net_dev: the mesh interface net device
*/
void batadv_dat_status_update(struct net_device *net_dev)
{
@@ -732,7 +767,7 @@ void batadv_dat_status_update(struct net_device *net_dev)
/**
* batadv_dat_tvlv_ogm_handler_v1() - process incoming dat tvlv container
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig: the orig_node of the ogm
* @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
* @tvlv_value: tvlv buffer containing the gateway data
@@ -751,7 +786,7 @@ static void batadv_dat_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
/**
* batadv_dat_hash_free() - free the local DAT hash table
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
static void batadv_dat_hash_free(struct batadv_priv *bat_priv)
{
@@ -767,7 +802,7 @@ static void batadv_dat_hash_free(struct batadv_priv *bat_priv)
/**
* batadv_dat_init() - initialise the DAT internals
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
* Return: 0 in case of success, a negative error code otherwise
*/
@@ -781,10 +816,11 @@ int batadv_dat_init(struct batadv_priv *bat_priv)
if (!bat_priv->dat.hash)
return -ENOMEM;
+ INIT_DELAYED_WORK(&bat_priv->dat.work, batadv_dat_purge);
batadv_dat_start_timer(bat_priv);
batadv_tvlv_handler_register(bat_priv, batadv_dat_tvlv_ogm_handler_v1,
- NULL, BATADV_TVLV_DAT, 1,
+ NULL, NULL, BATADV_TVLV_DAT, 1,
BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
batadv_dat_tvlv_container_update(bat_priv);
return 0;
@@ -792,7 +828,7 @@ int batadv_dat_init(struct batadv_priv *bat_priv)
/**
* batadv_dat_free() - free the DAT internals
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
void batadv_dat_free(struct batadv_priv *bat_priv)
{
@@ -804,82 +840,32 @@ void batadv_dat_free(struct batadv_priv *bat_priv)
batadv_dat_hash_free(bat_priv);
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_dat_cache_seq_print_text() - print the local DAT hash table
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hashtable *hash = bat_priv->dat.hash;
- struct batadv_dat_entry *dat_entry;
- struct batadv_hard_iface *primary_if;
- struct hlist_head *head;
- unsigned long last_seen_jiffies;
- int last_seen_msecs, last_seen_secs, last_seen_mins;
- u32 i;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- goto out;
-
- seq_printf(seq, "Distributed ARP Table (%s):\n", net_dev->name);
- seq_puts(seq,
- " IPv4 MAC VID last-seen\n");
-
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(dat_entry, head, hash_entry) {
- last_seen_jiffies = jiffies - dat_entry->last_update;
- last_seen_msecs = jiffies_to_msecs(last_seen_jiffies);
- last_seen_mins = last_seen_msecs / 60000;
- last_seen_msecs = last_seen_msecs % 60000;
- last_seen_secs = last_seen_msecs / 1000;
-
- seq_printf(seq, " * %15pI4 %pM %4i %6i:%02i\n",
- &dat_entry->ip, dat_entry->mac_addr,
- batadv_print_vid(dat_entry->vid),
- last_seen_mins, last_seen_secs);
- }
- rcu_read_unlock();
- }
-
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- return 0;
-}
-#endif
-
/**
* batadv_dat_cache_dump_entry() - dump one entry of the DAT cache table to a
* netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @dat_entry: entry to dump
*
* Return: 0 or error code.
*/
static int
-batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_dat_entry *dat_entry)
{
int msecs;
void *hdr;
- hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
- NLM_F_MULTI, BATADV_CMD_GET_DAT_CACHE);
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_DAT_CACHE);
if (!hdr)
return -ENOBUFS;
+ genl_dump_check_consistent(cb, hdr);
+
msecs = jiffies_to_msecs(jiffies - dat_entry->last_update);
if (nla_put_in_addr(msg, BATADV_ATTR_DAT_CACHE_IP4ADDRESS,
@@ -901,27 +887,31 @@ batadv_dat_cache_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
* a netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
- * @head: bucket to dump
+ * @cb: Control block containing additional options
+ * @hash: hash to dump
+ * @bucket: bucket index to dump
* @idx_skip: How many entries to skip
*
* Return: 0 or error code.
*/
static int
-batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
- struct hlist_head *head, int *idx_skip)
+batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_hashtable *hash, unsigned int bucket,
+ int *idx_skip)
{
struct batadv_dat_entry *dat_entry;
int idx = 0;
- rcu_read_lock();
- hlist_for_each_entry_rcu(dat_entry, head, hash_entry) {
+ spin_lock_bh(&hash->list_locks[bucket]);
+ cb->seq = atomic_read(&hash->generation) << 1 | 1;
+
+ hlist_for_each_entry(dat_entry, &hash->table[bucket], hash_entry) {
if (idx < *idx_skip)
goto skip;
- if (batadv_dat_cache_dump_entry(msg, portid, seq,
- dat_entry)) {
- rcu_read_unlock();
+ if (batadv_dat_cache_dump_entry(msg, portid, cb, dat_entry)) {
+ spin_unlock_bh(&hash->list_locks[bucket]);
*idx_skip = idx;
return -EMSGSIZE;
@@ -930,7 +920,7 @@ batadv_dat_cache_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
skip:
idx++;
}
- rcu_read_unlock();
+ spin_unlock_bh(&hash->list_locks[bucket]);
return 0;
}
@@ -946,28 +936,18 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
struct batadv_hard_iface *primary_if = NULL;
int portid = NETLINK_CB(cb->skb).portid;
- struct net *net = sock_net(cb->skb->sk);
- struct net_device *soft_iface;
+ struct net_device *mesh_iface;
struct batadv_hashtable *hash;
struct batadv_priv *bat_priv;
int bucket = cb->args[0];
- struct hlist_head *head;
int idx = cb->args[1];
- int ifindex;
int ret = 0;
- ifindex = batadv_netlink_get_ifindex(cb->nlh,
- BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
-
- bat_priv = netdev_priv(soft_iface);
+ bat_priv = netdev_priv(mesh_iface);
hash = bat_priv->dat.hash;
primary_if = batadv_primary_if_get_selected(bat_priv);
@@ -977,10 +957,7 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb)
}
while (bucket < hash->size) {
- head = &hash->table[bucket];
-
- if (batadv_dat_cache_dump_bucket(msg, portid,
- cb->nlh->nlmsg_seq, head,
+ if (batadv_dat_cache_dump_bucket(msg, portid, cb, hash, bucket,
&idx))
break;
@@ -994,18 +971,16 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb)
ret = msg->len;
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
- if (soft_iface)
- dev_put(soft_iface);
+ dev_put(mesh_iface);
return ret;
}
/**
* batadv_arp_get_type() - parse an ARP packet and gets the type
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: packet to analyse
* @hdr_size: size of the possible header before the ARP packet in the skb
*
@@ -1105,7 +1080,7 @@ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size)
/**
* batadv_dat_arp_create_reply() - create an ARP Reply
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @ip_src: ARP sender IP
* @ip_dst: ARP target IP
* @hw_src: Ethernet source and ARP sender MAC
@@ -1124,7 +1099,7 @@ batadv_dat_arp_create_reply(struct batadv_priv *bat_priv, __be32 ip_src,
{
struct sk_buff *skb;
- skb = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_dst, bat_priv->soft_iface,
+ skb = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_dst, bat_priv->mesh_iface,
ip_src, hw_dst, hw_src, hw_dst);
if (!skb)
return NULL;
@@ -1141,7 +1116,7 @@ batadv_dat_arp_create_reply(struct batadv_priv *bat_priv, __be32 ip_src,
/**
* batadv_dat_snoop_outgoing_arp_request() - snoop the ARP request and try to
* answer using DAT
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: packet to check
*
* Return: true if the message has been sent to the dht candidates, false
@@ -1157,7 +1132,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
bool ret = false;
struct batadv_dat_entry *dat_entry = NULL;
struct sk_buff *skb_new;
- struct net_device *soft_iface = bat_priv->soft_iface;
+ struct net_device *mesh_iface = bat_priv->mesh_iface;
int hdr_size = 0;
unsigned short vid;
@@ -1187,7 +1162,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
* client will answer itself. DAT would only generate a
* duplicate packet.
*
- * Moreover, if the soft-interface is enslaved into a bridge, an
+ * Moreover, if the mesh-interface is enslaved into a bridge, an
* additional DAT answer may trigger kernel warnings about
* a packet coming from the wrong port.
*/
@@ -1216,7 +1191,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
if (!skb_new)
goto out;
- skb_new->protocol = eth_type_trans(skb_new, soft_iface);
+ skb_new->protocol = eth_type_trans(skb_new, mesh_iface);
batadv_inc_counter(bat_priv, BATADV_CNT_RX);
batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES,
@@ -1227,19 +1202,18 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
ret = true;
} else {
/* Send the request to the DHT */
- ret = batadv_dat_send_data(bat_priv, skb, ip_dst, vid,
- BATADV_P_DAT_DHT_GET);
+ ret = batadv_dat_forward_data(bat_priv, skb, ip_dst, vid,
+ BATADV_P_DAT_DHT_GET);
}
out:
- if (dat_entry)
- batadv_dat_entry_put(dat_entry);
+ batadv_dat_entry_put(dat_entry);
return ret;
}
/**
* batadv_dat_snoop_incoming_arp_request() - snoop the ARP request and try to
* answer using the local DAT storage
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: packet to check
* @hdr_size: size of the encapsulation header
*
@@ -1299,8 +1273,7 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
ret = true;
}
out:
- if (dat_entry)
- batadv_dat_entry_put(dat_entry);
+ batadv_dat_entry_put(dat_entry);
if (ret)
kfree_skb(skb);
return ret;
@@ -1308,7 +1281,7 @@ out:
/**
* batadv_dat_snoop_outgoing_arp_reply() - snoop the ARP reply and fill the DHT
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: packet to check
*/
void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
@@ -1342,14 +1315,16 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
/* Send the ARP reply to the candidates for both the IP addresses that
* the node obtained from the ARP reply
*/
- batadv_dat_send_data(bat_priv, skb, ip_src, vid, BATADV_P_DAT_DHT_PUT);
- batadv_dat_send_data(bat_priv, skb, ip_dst, vid, BATADV_P_DAT_DHT_PUT);
+ batadv_dat_forward_data(bat_priv, skb, ip_src, vid,
+ BATADV_P_DAT_DHT_PUT);
+ batadv_dat_forward_data(bat_priv, skb, ip_dst, vid,
+ BATADV_P_DAT_DHT_PUT);
}
/**
* batadv_dat_snoop_incoming_arp_reply() - snoop the ARP reply and fill the
* local DAT storage only
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: packet to check
* @hdr_size: size of the encapsulation header
*
@@ -1394,7 +1369,6 @@ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
hw_src, &ip_src, hw_dst, &ip_dst,
dat_entry->mac_addr, &dat_entry->ip);
dropped = true;
- goto out;
}
/* Update our internal cache with both the IP addresses the node got
@@ -1403,6 +1377,9 @@ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid);
batadv_dat_entry_add(bat_priv, ip_dst, hw_dst, vid);
+ if (dropped)
+ goto out;
+
/* If BLA is enabled, only forward ARP replies if we have claimed the
* source of the ARP reply or if no one else of the same backbone has
* already claimed that client. This prevents that different gateways
@@ -1429,16 +1406,372 @@ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
out:
if (dropped)
kfree_skb(skb);
- if (dat_entry)
- batadv_dat_entry_put(dat_entry);
+ batadv_dat_entry_put(dat_entry);
/* if dropped == false -> deliver to the interface */
return dropped;
}
/**
+ * batadv_dat_check_dhcp_ipudp() - check skb for IP+UDP headers valid for DHCP
+ * @skb: the packet to check
+ * @ip_src: a buffer to store the IPv4 source address in
+ *
+ * Checks whether the given skb has an IP and UDP header valid for a DHCP
+ * message from a DHCP server. And if so, stores the IPv4 source address in
+ * the provided buffer.
+ *
+ * Return: True if valid, false otherwise.
+ */
+static bool
+batadv_dat_check_dhcp_ipudp(struct sk_buff *skb, __be32 *ip_src)
+{
+ unsigned int offset = skb_network_offset(skb);
+ struct udphdr *udphdr, _udphdr;
+ struct iphdr *iphdr, _iphdr;
+
+ iphdr = skb_header_pointer(skb, offset, sizeof(_iphdr), &_iphdr);
+ if (!iphdr || iphdr->version != 4 || iphdr->ihl * 4 < sizeof(_iphdr))
+ return false;
+
+ if (iphdr->protocol != IPPROTO_UDP)
+ return false;
+
+ offset += iphdr->ihl * 4;
+ skb_set_transport_header(skb, offset);
+
+ udphdr = skb_header_pointer(skb, offset, sizeof(_udphdr), &_udphdr);
+ if (!udphdr || udphdr->source != htons(67))
+ return false;
+
+ *ip_src = get_unaligned(&iphdr->saddr);
+
+ return true;
+}
+
+/**
+ * batadv_dat_check_dhcp() - examine packet for valid DHCP message
+ * @skb: the packet to check
+ * @proto: ethernet protocol hint (behind a potential vlan)
+ * @ip_src: a buffer to store the IPv4 source address in
+ *
+ * Checks whether the given skb is a valid DHCP packet. And if so, stores the
+ * IPv4 source address in the provided buffer.
+ *
+ * Caller needs to ensure that the skb network header is set correctly.
+ *
+ * Return: If skb is a valid DHCP packet, then returns its op code
+ * (e.g. BOOTREPLY vs. BOOTREQUEST). Otherwise returns -EINVAL.
+ */
+static int
+batadv_dat_check_dhcp(struct sk_buff *skb, __be16 proto, __be32 *ip_src)
+{
+ __be32 *magic, _magic;
+ unsigned int offset;
+ struct {
+ __u8 op;
+ __u8 htype;
+ __u8 hlen;
+ __u8 hops;
+ } *dhcp_h, _dhcp_h;
+
+ if (proto != htons(ETH_P_IP))
+ return -EINVAL;
+
+ if (!batadv_dat_check_dhcp_ipudp(skb, ip_src))
+ return -EINVAL;
+
+ offset = skb_transport_offset(skb) + sizeof(struct udphdr);
+ if (skb->len < offset + sizeof(struct batadv_dhcp_packet))
+ return -EINVAL;
+
+ dhcp_h = skb_header_pointer(skb, offset, sizeof(_dhcp_h), &_dhcp_h);
+ if (!dhcp_h || dhcp_h->htype != BATADV_HTYPE_ETHERNET ||
+ dhcp_h->hlen != ETH_ALEN)
+ return -EINVAL;
+
+ offset += offsetof(struct batadv_dhcp_packet, magic);
+
+ magic = skb_header_pointer(skb, offset, sizeof(_magic), &_magic);
+ if (!magic || get_unaligned(magic) != htonl(BATADV_DHCP_MAGIC))
+ return -EINVAL;
+
+ return dhcp_h->op;
+}
+
+/**
+ * batadv_dat_get_dhcp_message_type() - get message type of a DHCP packet
+ * @skb: the DHCP packet to parse
+ *
+ * Iterates over the DHCP options of the given DHCP packet to find a
+ * DHCP Message Type option and parse it.
+ *
+ * Caller needs to ensure that the given skb is a valid DHCP packet and
+ * that the skb transport header is set correctly.
+ *
+ * Return: The found DHCP message type value, if found. -EINVAL otherwise.
+ */
+static int batadv_dat_get_dhcp_message_type(struct sk_buff *skb)
+{
+ unsigned int offset = skb_transport_offset(skb) + sizeof(struct udphdr);
+ u8 *type, _type;
+ struct {
+ u8 type;
+ u8 len;
+ } *tl, _tl;
+
+ offset += sizeof(struct batadv_dhcp_packet);
+
+ while ((tl = skb_header_pointer(skb, offset, sizeof(_tl), &_tl))) {
+ if (tl->type == BATADV_DHCP_OPT_MSG_TYPE)
+ break;
+
+ if (tl->type == BATADV_DHCP_OPT_END)
+ break;
+
+ if (tl->type == BATADV_DHCP_OPT_PAD)
+ offset++;
+ else
+ offset += tl->len + sizeof(_tl);
+ }
+
+ /* Option Overload Code not supported */
+ if (!tl || tl->type != BATADV_DHCP_OPT_MSG_TYPE ||
+ tl->len != sizeof(_type))
+ return -EINVAL;
+
+ offset += sizeof(_tl);
+
+ type = skb_header_pointer(skb, offset, sizeof(_type), &_type);
+ if (!type)
+ return -EINVAL;
+
+ return *type;
+}
+
+/**
+ * batadv_dat_dhcp_get_yiaddr() - get yiaddr from a DHCP packet
+ * @skb: the DHCP packet to parse
+ * @buf: a buffer to store the yiaddr in
+ *
+ * Caller needs to ensure that the given skb is a valid DHCP packet and
+ * that the skb transport header is set correctly.
+ *
+ * Return: True on success, false otherwise.
+ */
+static bool batadv_dat_dhcp_get_yiaddr(struct sk_buff *skb, __be32 *buf)
+{
+ unsigned int offset = skb_transport_offset(skb) + sizeof(struct udphdr);
+ __be32 *yiaddr;
+
+ offset += offsetof(struct batadv_dhcp_packet, yiaddr);
+ yiaddr = skb_header_pointer(skb, offset, BATADV_DHCP_YIADDR_LEN, buf);
+
+ if (!yiaddr)
+ return false;
+
+ if (yiaddr != buf)
+ *buf = get_unaligned(yiaddr);
+
+ return true;
+}
+
+/**
+ * batadv_dat_get_dhcp_chaddr() - get chaddr from a DHCP packet
+ * @skb: the DHCP packet to parse
+ * @buf: a buffer to store the chaddr in
+ *
+ * Caller needs to ensure that the given skb is a valid DHCP packet and
+ * that the skb transport header is set correctly.
+ *
+ * Return: True on success, false otherwise
+ */
+static bool batadv_dat_get_dhcp_chaddr(struct sk_buff *skb, u8 *buf)
+{
+ unsigned int offset = skb_transport_offset(skb) + sizeof(struct udphdr);
+ u8 *chaddr;
+
+ offset += offsetof(struct batadv_dhcp_packet, chaddr);
+ chaddr = skb_header_pointer(skb, offset, BATADV_DHCP_CHADDR_LEN, buf);
+
+ if (!chaddr)
+ return false;
+
+ if (chaddr != buf)
+ memcpy(buf, chaddr, BATADV_DHCP_CHADDR_LEN);
+
+ return true;
+}
+
+/**
+ * batadv_dat_put_dhcp() - puts addresses from a DHCP packet into the DHT and
+ * DAT cache
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @chaddr: the DHCP client MAC address
+ * @yiaddr: the DHCP client IP address
+ * @hw_dst: the DHCP server MAC address
+ * @ip_dst: the DHCP server IP address
+ * @vid: VLAN identifier
+ *
+ * Adds given MAC/IP pairs to the local DAT cache and propagates them further
+ * into the DHT.
+ *
+ * For the DHT propagation, client MAC + IP will appear as the ARP Reply
+ * transmitter (and hw_dst/ip_dst as the target).
+ */
+static void batadv_dat_put_dhcp(struct batadv_priv *bat_priv, u8 *chaddr,
+ __be32 yiaddr, u8 *hw_dst, __be32 ip_dst,
+ unsigned short vid)
+{
+ struct sk_buff *skb;
+
+ skb = batadv_dat_arp_create_reply(bat_priv, yiaddr, ip_dst, chaddr,
+ hw_dst, vid);
+ if (!skb)
+ return;
+
+ skb_set_network_header(skb, ETH_HLEN);
+
+ batadv_dat_entry_add(bat_priv, yiaddr, chaddr, vid);
+ batadv_dat_entry_add(bat_priv, ip_dst, hw_dst, vid);
+
+ batadv_dat_forward_data(bat_priv, skb, yiaddr, vid,
+ BATADV_P_DAT_DHT_PUT);
+ batadv_dat_forward_data(bat_priv, skb, ip_dst, vid,
+ BATADV_P_DAT_DHT_PUT);
+
+ consume_skb(skb);
+
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "Snooped from outgoing DHCPACK (server address): %pI4, %pM (vid: %i)\n",
+ &ip_dst, hw_dst, batadv_print_vid(vid));
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "Snooped from outgoing DHCPACK (client address): %pI4, %pM (vid: %i)\n",
+ &yiaddr, chaddr, batadv_print_vid(vid));
+}
+
+/**
+ * batadv_dat_check_dhcp_ack() - examine packet for valid DHCPACK message
+ * @skb: the packet to check
+ * @proto: ethernet protocol hint (behind a potential vlan)
+ * @ip_src: a buffer to store the IPv4 source address in
+ * @chaddr: a buffer to store the DHCP Client Hardware Address in
+ * @yiaddr: a buffer to store the DHCP Your IP Address in
+ *
+ * Checks whether the given skb is a valid DHCPACK. And if so, stores the
+ * IPv4 server source address (ip_src), client MAC address (chaddr) and client
+ * IPv4 address (yiaddr) in the provided buffers.
+ *
+ * Caller needs to ensure that the skb network header is set correctly.
+ *
+ * Return: True if the skb is a valid DHCPACK. False otherwise.
+ */
+static bool
+batadv_dat_check_dhcp_ack(struct sk_buff *skb, __be16 proto, __be32 *ip_src,
+ u8 *chaddr, __be32 *yiaddr)
+{
+ int type;
+
+ type = batadv_dat_check_dhcp(skb, proto, ip_src);
+ if (type != BATADV_BOOTREPLY)
+ return false;
+
+ type = batadv_dat_get_dhcp_message_type(skb);
+ if (type != BATADV_DHCPACK)
+ return false;
+
+ if (!batadv_dat_dhcp_get_yiaddr(skb, yiaddr))
+ return false;
+
+ if (!batadv_dat_get_dhcp_chaddr(skb, chaddr))
+ return false;
+
+ return true;
+}
+
+/**
+ * batadv_dat_snoop_outgoing_dhcp_ack() - snoop DHCPACK and fill DAT with it
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the packet to snoop
+ * @proto: ethernet protocol hint (behind a potential vlan)
+ * @vid: VLAN identifier
+ *
+ * This function first checks whether the given skb is a valid DHCPACK. If
+ * so then its source MAC and IP as well as its DHCP Client Hardware Address
+ * field and DHCP Your IP Address field are added to the local DAT cache and
+ * propagated into the DHT.
+ *
+ * Caller needs to ensure that the skb mac and network headers are set
+ * correctly.
+ */
+void batadv_dat_snoop_outgoing_dhcp_ack(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ __be16 proto,
+ unsigned short vid)
+{
+ u8 chaddr[BATADV_DHCP_CHADDR_LEN];
+ __be32 ip_src, yiaddr;
+
+ if (!atomic_read(&bat_priv->distributed_arp_table))
+ return;
+
+ if (!batadv_dat_check_dhcp_ack(skb, proto, &ip_src, chaddr, &yiaddr))
+ return;
+
+ batadv_dat_put_dhcp(bat_priv, chaddr, yiaddr, eth_hdr(skb)->h_source,
+ ip_src, vid);
+}
+
+/**
+ * batadv_dat_snoop_incoming_dhcp_ack() - snoop DHCPACK and fill DAT cache
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the packet to snoop
+ * @hdr_size: header size, up to the tail of the batman-adv header
+ *
+ * This function first checks whether the given skb is a valid DHCPACK. If
+ * so then its source MAC and IP as well as its DHCP Client Hardware Address
+ * field and DHCP Your IP Address field are added to the local DAT cache.
+ */
+void batadv_dat_snoop_incoming_dhcp_ack(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
+{
+ u8 chaddr[BATADV_DHCP_CHADDR_LEN];
+ struct ethhdr *ethhdr;
+ __be32 ip_src, yiaddr;
+ unsigned short vid;
+ __be16 proto;
+ u8 *hw_src;
+
+ if (!atomic_read(&bat_priv->distributed_arp_table))
+ return;
+
+ if (unlikely(!pskb_may_pull(skb, hdr_size + ETH_HLEN)))
+ return;
+
+ ethhdr = (struct ethhdr *)(skb->data + hdr_size);
+ skb_set_network_header(skb, hdr_size + ETH_HLEN);
+ proto = ethhdr->h_proto;
+
+ if (!batadv_dat_check_dhcp_ack(skb, proto, &ip_src, chaddr, &yiaddr))
+ return;
+
+ hw_src = ethhdr->h_source;
+ vid = batadv_dat_get_vid(skb, &hdr_size);
+
+ batadv_dat_entry_add(bat_priv, yiaddr, chaddr, vid);
+ batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid);
+
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "Snooped from incoming DHCPACK (server address): %pI4, %pM (vid: %i)\n",
+ &ip_src, hw_src, batadv_print_vid(vid));
+ batadv_dbg(BATADV_DBG_DAT, bat_priv,
+ "Snooped from incoming DHCPACK (client address): %pI4, %pM (vid: %i)\n",
+ &yiaddr, chaddr, batadv_print_vid(vid));
+}
+
+/**
* batadv_dat_drop_broadcast_packet() - check if an ARP request has to be
* dropped (because the node has already obtained the reply via DAT) or not
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @forw_packet: the broadcast packet
*
* Return: true if the node can drop the packet, false otherwise.
@@ -1482,7 +1815,6 @@ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv,
ret = true;
out:
- if (dat_entry)
- batadv_dat_entry_put(dat_entry);
+ batadv_dat_entry_put(dat_entry);
return ret;
}
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index a04596028337..e7b75e82eb1d 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Antonio Quartulli
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_DISTRIBUTED_ARP_TABLE_H_
@@ -23,15 +11,13 @@
#include <linux/compiler.h>
#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/skbuff.h>
#include <linux/types.h>
#include <uapi/linux/batadv_packet.h>
#include "originator.h"
-struct netlink_callback;
-struct seq_file;
-struct sk_buff;
-
#ifdef CONFIG_BATMAN_ADV_DAT
/* BATADV_DAT_ADDR_MAX - maximum address value in the DHT space */
@@ -46,6 +32,12 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
struct sk_buff *skb);
bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
struct sk_buff *skb, int hdr_size);
+void batadv_dat_snoop_outgoing_dhcp_ack(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ __be16 proto,
+ unsigned short vid);
+void batadv_dat_snoop_incoming_dhcp_ack(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size);
bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv,
struct batadv_forw_packet *forw_packet);
@@ -64,7 +56,7 @@ batadv_dat_init_orig_node_addr(struct batadv_orig_node *orig_node)
/**
* batadv_dat_init_own_addr() - assign a DAT address to the node itself
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @primary_if: a pointer to the primary interface
*/
static inline void
@@ -81,12 +73,11 @@ batadv_dat_init_own_addr(struct batadv_priv *bat_priv,
int batadv_dat_init(struct batadv_priv *bat_priv);
void batadv_dat_free(struct batadv_priv *bat_priv);
-int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset);
int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb);
/**
* batadv_dat_inc_counter() - increment the correct DAT packet counter
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @subtype: the 4addr subtype of the packet to be counted
*
* Updates the ethtool statistics for the received packet if it is a DAT subtype
@@ -140,6 +131,19 @@ batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
return false;
}
+static inline void
+batadv_dat_snoop_outgoing_dhcp_ack(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, __be16 proto,
+ unsigned short vid)
+{ /* intentionally empty no-op; presumably the !CONFIG_BATMAN_ADV_DAT stub */
+}
+
+static inline void
+batadv_dat_snoop_incoming_dhcp_ack(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, int hdr_size)
+{ /* intentionally empty no-op; presumably the !CONFIG_BATMAN_ADV_DAT stub */
+}
+
static inline bool
batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv,
struct batadv_forw_packet *forw_packet)
@@ -157,11 +161,6 @@ static inline void batadv_dat_init_own_addr(struct batadv_priv *bat_priv,
{
}
-static inline void batadv_arp_change_timeout(struct net_device *soft_iface,
- const char *name)
-{
-}
-
static inline int batadv_dat_init(struct batadv_priv *bat_priv)
{
return 0;
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 0fddc17106bd..cc14bc41381e 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Martin Hundebøll <martin@hundeboll.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "fragmentation.h"
@@ -26,8 +14,8 @@
#include <linux/gfp.h>
#include <linux/if_ether.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/lockdep.h>
+#include <linux/minmax.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
@@ -37,9 +25,7 @@
#include "hard-interface.h"
#include "originator.h"
-#include "routing.h"
#include "send.h"
-#include "soft-interface.h"
/**
* batadv_frag_clear_chain() - delete entries in the fragment buffer chain
@@ -114,8 +100,8 @@ static int batadv_frag_size_limit(void)
*
* Caller must hold chain->lock.
*
- * Return: true if chain is empty and caller can just insert the new fragment
- * without searching for the right position.
+ * Return: true if chain is empty and the caller can just insert the new
+ * fragment without searching for the right position.
*/
static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain,
u16 seqno)
@@ -275,7 +261,7 @@ batadv_frag_merge_packets(struct hlist_head *chain)
kfree(entry);
packet = (struct batadv_frag_packet *)skb_out->data;
- size = ntohs(packet->total_size);
+ size = ntohs(packet->total_size) + hdr_size;
/* Make room for the rest of the fragments. */
if (pskb_expand_head(skb_out, 0, size - skb_out->len, GFP_ATOMIC) < 0) {
@@ -318,7 +304,7 @@ free:
* set *skb to merged packet; 2) Packet is buffered: Return true and set *skb
* to NULL; 3) Error: Return false and free skb.
*
- * Return: true when packet is merged or buffered, false when skb is not not
+ * Return: true when the packet is merged or buffered, false when skb is not
* used.
*/
bool batadv_frag_skb_buffer(struct sk_buff **skb,
@@ -363,19 +349,15 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb,
struct batadv_hard_iface *recv_if,
struct batadv_orig_node *orig_node_src)
{
- struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
- struct batadv_orig_node *orig_node_dst;
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
struct batadv_neigh_node *neigh_node = NULL;
struct batadv_frag_packet *packet;
u16 total_size;
bool ret = false;
packet = (struct batadv_frag_packet *)skb->data;
- orig_node_dst = batadv_orig_hash_find(bat_priv, packet->dest);
- if (!orig_node_dst)
- goto out;
- neigh_node = batadv_find_router(bat_priv, orig_node_dst, recv_if);
+ neigh_node = batadv_orig_to_router(bat_priv, packet->dest, recv_if);
if (!neigh_node)
goto out;
@@ -394,15 +376,13 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb,
}
out:
- if (orig_node_dst)
- batadv_orig_node_put(orig_node_dst);
- if (neigh_node)
- batadv_neigh_node_put(neigh_node);
+ batadv_neigh_node_put(neigh_node);
return ret;
}
/**
* batadv_frag_create() - create a fragment from skb
+ * @net_dev: outgoing device for fragment
* @skb: skb to create fragment from
* @frag_head: header to use in new fragment
* @fragment_size: size of new fragment
@@ -413,22 +393,25 @@ out:
*
* Return: the new fragment, NULL on error.
*/
-static struct sk_buff *batadv_frag_create(struct sk_buff *skb,
+static struct sk_buff *batadv_frag_create(struct net_device *net_dev,
+ struct sk_buff *skb,
struct batadv_frag_packet *frag_head,
unsigned int fragment_size)
{
+ unsigned int ll_reserved = LL_RESERVED_SPACE(net_dev);
+ unsigned int tailroom = net_dev->needed_tailroom;
struct sk_buff *skb_fragment;
unsigned int header_size = sizeof(*frag_head);
unsigned int mtu = fragment_size + header_size;
- skb_fragment = netdev_alloc_skb(NULL, mtu + ETH_HLEN);
+ skb_fragment = dev_alloc_skb(ll_reserved + mtu + tailroom);
if (!skb_fragment)
goto err;
skb_fragment->priority = skb->priority;
/* Eat the last mtu-bytes of the skb */
- skb_reserve(skb_fragment, header_size + ETH_HLEN);
+ skb_reserve(skb_fragment, ll_reserved + header_size);
skb_split(skb, skb_fragment, skb->len - fragment_size);
/* Add the header */
@@ -451,11 +434,12 @@ int batadv_frag_send_packet(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
struct batadv_neigh_node *neigh_node)
{
+ struct net_device *net_dev = neigh_node->if_incoming->net_dev;
struct batadv_priv *bat_priv;
struct batadv_hard_iface *primary_if = NULL;
struct batadv_frag_packet frag_header;
struct sk_buff *skb_fragment;
- unsigned int mtu = neigh_node->if_incoming->net_dev->mtu;
+ unsigned int mtu = net_dev->mtu;
unsigned int header_size = sizeof(frag_header);
unsigned int max_fragment_size, num_fragments;
int ret;
@@ -485,6 +469,17 @@ int batadv_frag_send_packet(struct sk_buff *skb,
goto free_skb;
}
+ /* GRO might have added fragments to the fragment list instead of
+ * frags[]. skb_split does not handle the fragment list, so the skb
+ * must be linearized first; otherwise the length information is
+ * incorrect after all batman-adv fragments were created and
+ * submitted to the hard-interface.
+ */
+ if (skb_has_frag_list(skb) && __skb_linearize(skb)) {
+ ret = -ENOMEM;
+ goto free_skb;
+ }
+
/* Create one header to be copied to all fragments */
frag_header.packet_type = BATADV_UNICAST_FRAG;
frag_header.version = BATADV_COMPAT_VERSION;
@@ -515,7 +510,7 @@ int batadv_frag_send_packet(struct sk_buff *skb,
goto put_primary_if;
}
- skb_fragment = batadv_frag_create(skb, &frag_header,
+ skb_fragment = batadv_frag_create(net_dev, skb, &frag_header,
max_fragment_size);
if (!skb_fragment) {
ret = -ENOMEM;
@@ -534,13 +529,14 @@ int batadv_frag_send_packet(struct sk_buff *skb,
frag_header.no++;
}
- /* Make room for the fragment header. */
- if (batadv_skb_head_push(skb, header_size) < 0 ||
- pskb_expand_head(skb, header_size + ETH_HLEN, 0, GFP_ATOMIC) < 0) {
- ret = -ENOMEM;
+ /* make sure that there is at least enough head for the fragmentation
+ * and ethernet headers
+ */
+ ret = skb_cow_head(skb, ETH_HLEN + header_size);
+ if (ret < 0)
goto put_primary_if;
- }
+ skb_push(skb, header_size);
memcpy(skb->data, &frag_header, header_size);
/* Send the last fragment */
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index 944512e07782..dbf0871f8703 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Martin Hundebøll <martin@hundeboll.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_FRAGMENTATION_H_
@@ -23,11 +11,10 @@
#include <linux/compiler.h>
#include <linux/list.h>
+#include <linux/skbuff.h>
#include <linux/stddef.h>
#include <linux/types.h>
-struct sk_buff;
-
void batadv_frag_purge_orig(struct batadv_orig_node *orig,
bool (*check_cb)(struct batadv_frag_table_entry *));
bool batadv_frag_skb_fwd(struct sk_buff *skb,
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 140c61a3f1ec..7a11b245e9f4 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "gateway_client.h"
@@ -21,6 +9,8 @@
#include <linux/atomic.h>
#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
+#include <linux/err.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
@@ -29,7 +19,6 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
@@ -37,24 +26,20 @@
#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/sprintf.h>
#include <linux/stddef.h>
#include <linux/udp.h>
-#include <net/sock.h>
#include <uapi/linux/batadv_packet.h>
#include <uapi/linux/batman_adv.h>
-#include "gateway_common.h"
#include "hard-interface.h"
#include "log.h"
#include "netlink.h"
#include "originator.h"
#include "routing.h"
-#include "soft-interface.h"
-#include "sysfs.h"
#include "translation-table.h"
/* These are the offsets of the "hw type" and "hw address length" in the dhcp
@@ -74,7 +59,7 @@
* after rcu grace period
* @ref: kref pointer of the gw_node
*/
-static void batadv_gw_node_release(struct kref *ref)
+void batadv_gw_node_release(struct kref *ref)
{
struct batadv_gw_node *gw_node;
@@ -85,18 +70,8 @@ static void batadv_gw_node_release(struct kref *ref)
}
/**
- * batadv_gw_node_put() - decrement the gw_node refcounter and possibly release
- * it
- * @gw_node: gateway node to free
- */
-void batadv_gw_node_put(struct batadv_gw_node *gw_node)
-{
- kref_put(&gw_node->refcount, batadv_gw_node_release);
-}
-
-/**
* batadv_gw_get_selected_gw_node() - Get currently selected gateway
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
* Return: selected gateway (with increased refcnt), NULL on errors
*/
@@ -120,7 +95,7 @@ out:
/**
* batadv_gw_get_selected_orig() - Get originator of currently selected gateway
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
* Return: orig_node of selected gateway (with increased refcnt), NULL on errors
*/
@@ -145,8 +120,7 @@ batadv_gw_get_selected_orig(struct batadv_priv *bat_priv)
unlock:
rcu_read_unlock();
out:
- if (gw_node)
- batadv_gw_node_put(gw_node);
+ batadv_gw_node_put(gw_node);
return orig_node;
}
@@ -160,18 +134,17 @@ static void batadv_gw_select(struct batadv_priv *bat_priv,
if (new_gw_node)
kref_get(&new_gw_node->refcount);
- curr_gw_node = rcu_dereference_protected(bat_priv->gw.curr_gw, 1);
- rcu_assign_pointer(bat_priv->gw.curr_gw, new_gw_node);
+ curr_gw_node = rcu_replace_pointer(bat_priv->gw.curr_gw, new_gw_node,
+ true);
- if (curr_gw_node)
- batadv_gw_node_put(curr_gw_node);
+ batadv_gw_node_put(curr_gw_node);
spin_unlock_bh(&bat_priv->gw.list_lock);
}
/**
* batadv_gw_reselect() - force a gateway reselection
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
* Set a flag to remind the GW component to perform a new gateway reselection.
* However this function does not ensure that the current gateway is going to be
@@ -187,7 +160,7 @@ void batadv_gw_reselect(struct batadv_priv *bat_priv)
/**
* batadv_gw_check_client_stop() - check if client mode has been switched off
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
* This function assumes the caller has checked that the gw state *is actually
* changing*. This function is not supposed to be called when there is no state
@@ -219,7 +192,7 @@ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv)
/**
* batadv_gw_election() - Elect the best gateway
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
void batadv_gw_election(struct batadv_priv *bat_priv)
{
@@ -299,19 +272,15 @@ void batadv_gw_election(struct batadv_priv *bat_priv)
batadv_gw_select(bat_priv, next_gw);
out:
- if (curr_gw)
- batadv_gw_node_put(curr_gw);
- if (next_gw)
- batadv_gw_node_put(next_gw);
- if (router)
- batadv_neigh_node_put(router);
- if (router_ifinfo)
- batadv_neigh_ifinfo_put(router_ifinfo);
+ batadv_gw_node_put(curr_gw);
+ batadv_gw_node_put(next_gw);
+ batadv_neigh_node_put(router);
+ batadv_neigh_ifinfo_put(router_ifinfo);
}
/**
* batadv_gw_check_election() - Elect orig node as best gateway when eligible
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: orig node which is to be checked
*/
void batadv_gw_check_election(struct batadv_priv *bat_priv,
@@ -340,13 +309,12 @@ void batadv_gw_check_election(struct batadv_priv *bat_priv,
reselect:
batadv_gw_reselect(bat_priv);
out:
- if (curr_gw_orig)
- batadv_orig_node_put(curr_gw_orig);
+ batadv_orig_node_put(curr_gw_orig);
}
/**
* batadv_gw_node_add() - add gateway node to list of available gateways
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: originator announcing gateway capabilities
* @gateway: announced bandwidth information
*
@@ -377,6 +345,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
kref_get(&gw_node->refcount);
hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.gateway_list);
+ bat_priv->gw.generation++;
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"Found new gateway %pM -> gw bandwidth: %u.%u/%u.%u MBit\n",
@@ -392,7 +361,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
/**
* batadv_gw_node_get() - retrieve gateway node from list of available gateways
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: originator announcing gateway capabilities
*
* Return: gateway node if found or NULL otherwise.
@@ -422,7 +391,7 @@ struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv,
/**
* batadv_gw_node_update() - update list of available gateways with changed
* bandwidth information
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: originator announcing gateway capabilities
* @gateway: announced bandwidth information
*/
@@ -472,6 +441,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
if (!hlist_unhashed(&gw_node->list)) {
hlist_del_init_rcu(&gw_node->list);
batadv_gw_node_put(gw_node);
+ bat_priv->gw.generation++;
}
spin_unlock_bh(&bat_priv->gw.list_lock);
@@ -479,18 +449,16 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
if (gw_node == curr_gw)
batadv_gw_reselect(bat_priv);
- if (curr_gw)
- batadv_gw_node_put(curr_gw);
+ batadv_gw_node_put(curr_gw);
}
out:
- if (gw_node)
- batadv_gw_node_put(gw_node);
+ batadv_gw_node_put(gw_node);
}
/**
* batadv_gw_node_delete() - Remove orig_node from gateway list
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: orig node which is currently in process of being removed
*/
void batadv_gw_node_delete(struct batadv_priv *bat_priv,
@@ -505,8 +473,8 @@ void batadv_gw_node_delete(struct batadv_priv *bat_priv,
}
/**
- * batadv_gw_node_free() - Free gateway information from soft interface
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_gw_node_free() - Free gateway information from mesh interface
+ * @bat_priv: the bat priv with all the mesh interface information
*/
void batadv_gw_node_free(struct batadv_priv *bat_priv)
{
@@ -518,48 +486,11 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv)
&bat_priv->gw.gateway_list, list) {
hlist_del_init_rcu(&gw_node->list);
batadv_gw_node_put(gw_node);
+ bat_priv->gw.generation++;
}
spin_unlock_bh(&bat_priv->gw.list_lock);
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-
-/**
- * batadv_gw_client_seq_print_text() - Print the gateway table in a seq file
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hard_iface *primary_if;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- return 0;
-
- seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n",
- BATADV_SOURCE_VERSION, primary_if->net_dev->name,
- primary_if->net_dev->dev_addr, net_dev->name,
- bat_priv->algo_ops->name);
-
- batadv_hardif_put(primary_if);
-
- if (!bat_priv->algo_ops->gw.print) {
- seq_puts(seq,
- "No printing function for this routing protocol\n");
- return 0;
- }
-
- bat_priv->algo_ops->gw.print(bat_priv, seq);
-
- return 0;
-}
-#endif
-
/**
* batadv_gw_dump() - Dump gateways into a message
* @msg: Netlink message to dump into
@@ -570,24 +501,15 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset)
int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
struct batadv_hard_iface *primary_if = NULL;
- struct net *net = sock_net(cb->skb->sk);
- struct net_device *soft_iface;
+ struct net_device *mesh_iface;
struct batadv_priv *bat_priv;
- int ifindex;
int ret;
- ifindex = batadv_netlink_get_ifindex(cb->nlh,
- BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
-
- bat_priv = netdev_priv(soft_iface);
+ bat_priv = netdev_priv(mesh_iface);
primary_if = batadv_primary_if_get_selected(bat_priv);
if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
@@ -605,10 +527,8 @@ int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb)
ret = msg->len;
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- if (soft_iface)
- dev_put(soft_iface);
+ batadv_hardif_put(primary_if);
+ dev_put(mesh_iface);
return ret;
}
@@ -714,8 +634,10 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
chaddr_offset = *header_len + BATADV_DHCP_CHADDR_OFFSET;
/* store the client address if the message is going to a client */
- if (ret == BATADV_DHCP_TO_CLIENT &&
- pskb_may_pull(skb, chaddr_offset + ETH_ALEN)) {
+ if (ret == BATADV_DHCP_TO_CLIENT) {
+ if (!pskb_may_pull(skb, chaddr_offset + ETH_ALEN))
+ return BATADV_DHCP_NO;
+
/* check if the DHCP packet carries an Ethernet DHCP */
p = skb->data + *header_len + BATADV_DHCP_HTYPE_OFFSET;
if (*p != BATADV_DHCP_HTYPE_ETHERNET)
@@ -735,7 +657,7 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
/**
* batadv_gw_out_of_range() - check if the dhcp request destination is the best
* gateway
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: the outgoing packet
*
* Check if the skb is a DHCP request and if it is sent to the current best GW
@@ -828,15 +750,10 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv,
batadv_neigh_ifinfo_put(old_ifinfo);
out:
- if (orig_dst_node)
- batadv_orig_node_put(orig_dst_node);
- if (curr_gw)
- batadv_gw_node_put(curr_gw);
- if (gw_node)
- batadv_gw_node_put(gw_node);
- if (neigh_old)
- batadv_neigh_node_put(neigh_old);
- if (neigh_curr)
- batadv_neigh_node_put(neigh_curr);
+ batadv_orig_node_put(orig_dst_node);
+ batadv_gw_node_put(curr_gw);
+ batadv_gw_node_put(gw_node);
+ batadv_neigh_node_put(neigh_old);
+ batadv_neigh_node_put(neigh_curr);
return out_of_range;
}
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index f0b86fcb2493..95c2ccdaa554 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_GATEWAY_CLIENT_H_
@@ -21,12 +9,11 @@
#include "main.h"
+#include <linux/kref.h>
+#include <linux/netlink.h>
+#include <linux/skbuff.h>
#include <linux/types.h>
-
-struct batadv_tvlv_gateway_data;
-struct netlink_callback;
-struct seq_file;
-struct sk_buff;
+#include <uapi/linux/batadv_packet.h>
void batadv_gw_check_client_stop(struct batadv_priv *bat_priv);
void batadv_gw_reselect(struct batadv_priv *bat_priv);
@@ -41,10 +28,9 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
void batadv_gw_node_delete(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node);
void batadv_gw_node_free(struct batadv_priv *bat_priv);
-void batadv_gw_node_put(struct batadv_gw_node *gw_node);
+void batadv_gw_node_release(struct kref *ref);
struct batadv_gw_node *
batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv);
-int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset);
int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb);
bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb);
enum batadv_dhcp_recipient
@@ -53,4 +39,17 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
struct batadv_gw_node *batadv_gw_node_get(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node);
+/**
+ * batadv_gw_node_put() - decrement the gw_node refcounter and possibly release
+ * it
+ * @gw_node: gateway node to free (NULL is accepted and ignored)
+ */
+static inline void batadv_gw_node_put(struct batadv_gw_node *gw_node)
+{
+ if (!gw_node) /* lets callers drop their own NULL checks before the put */
+ return;
+
+ kref_put(&gw_node->refcount, batadv_gw_node_release); /* release at zero */
+}
+
#endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index 936c107f3199..315fa90f0c94 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "gateway_common.h"
@@ -21,125 +9,18 @@
#include <linux/atomic.h>
#include <linux/byteorder/generic.h>
-#include <linux/errno.h>
-#include <linux/kernel.h>
-#include <linux/math64.h>
-#include <linux/netdevice.h>
#include <linux/stddef.h>
-#include <linux/string.h>
+#include <linux/types.h>
#include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
#include "gateway_client.h"
-#include "log.h"
#include "tvlv.h"
/**
- * batadv_parse_throughput() - parse supplied string buffer to extract
- * throughput information
- * @net_dev: the soft interface net device
- * @buff: string buffer to parse
- * @description: text shown when throughput string cannot be parsed
- * @throughput: pointer holding the returned throughput information
- *
- * Return: false on parse error and true otherwise.
- */
-bool batadv_parse_throughput(struct net_device *net_dev, char *buff,
- const char *description, u32 *throughput)
-{
- enum batadv_bandwidth_units bw_unit_type = BATADV_BW_UNIT_KBIT;
- u64 lthroughput;
- char *tmp_ptr;
- int ret;
-
- if (strlen(buff) > 4) {
- tmp_ptr = buff + strlen(buff) - 4;
-
- if (strncasecmp(tmp_ptr, "mbit", 4) == 0)
- bw_unit_type = BATADV_BW_UNIT_MBIT;
-
- if (strncasecmp(tmp_ptr, "kbit", 4) == 0 ||
- bw_unit_type == BATADV_BW_UNIT_MBIT)
- *tmp_ptr = '\0';
- }
-
- ret = kstrtou64(buff, 10, &lthroughput);
- if (ret) {
- batadv_err(net_dev,
- "Invalid throughput speed for %s: %s\n",
- description, buff);
- return false;
- }
-
- switch (bw_unit_type) {
- case BATADV_BW_UNIT_MBIT:
- /* prevent overflow */
- if (U64_MAX / 10 < lthroughput) {
- batadv_err(net_dev,
- "Throughput speed for %s too large: %s\n",
- description, buff);
- return false;
- }
-
- lthroughput *= 10;
- break;
- case BATADV_BW_UNIT_KBIT:
- default:
- lthroughput = div_u64(lthroughput, 100);
- break;
- }
-
- if (lthroughput > U32_MAX) {
- batadv_err(net_dev,
- "Throughput speed for %s too large: %s\n",
- description, buff);
- return false;
- }
-
- *throughput = lthroughput;
-
- return true;
-}
-
-/**
- * batadv_parse_gw_bandwidth() - parse supplied string buffer to extract
- * download and upload bandwidth information
- * @net_dev: the soft interface net device
- * @buff: string buffer to parse
- * @down: pointer holding the returned download bandwidth information
- * @up: pointer holding the returned upload bandwidth information
- *
- * Return: false on parse error and true otherwise.
- */
-static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff,
- u32 *down, u32 *up)
-{
- char *slash_ptr;
- bool ret;
-
- slash_ptr = strchr(buff, '/');
- if (slash_ptr)
- *slash_ptr = 0;
-
- ret = batadv_parse_throughput(net_dev, buff, "download gateway speed",
- down);
- if (!ret)
- return false;
-
- /* we also got some upload info */
- if (slash_ptr) {
- ret = batadv_parse_throughput(net_dev, slash_ptr + 1,
- "upload gateway speed", up);
- if (!ret)
- return false;
- }
-
- return true;
-}
-
-/**
* batadv_gw_tvlv_container_update() - update the gw tvlv container after
* gateway setting change
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv)
{
@@ -166,59 +47,8 @@ void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv)
}
/**
- * batadv_gw_bandwidth_set() - Parse and set download/upload gateway bandwidth
- * from supplied string buffer
- * @net_dev: netdev struct of the soft interface
- * @buff: the buffer containing the user data
- * @count: number of bytes in the buffer
- *
- * Return: 'count' on success or a negative error code in case of failure
- */
-ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff,
- size_t count)
-{
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- u32 down_curr;
- u32 up_curr;
- u32 down_new = 0;
- u32 up_new = 0;
- bool ret;
-
- down_curr = (unsigned int)atomic_read(&bat_priv->gw.bandwidth_down);
- up_curr = (unsigned int)atomic_read(&bat_priv->gw.bandwidth_up);
-
- ret = batadv_parse_gw_bandwidth(net_dev, buff, &down_new, &up_new);
- if (!ret)
- return -EINVAL;
-
- if (!down_new)
- down_new = 1;
-
- if (!up_new)
- up_new = down_new / 5;
-
- if (!up_new)
- up_new = 1;
-
- if (down_curr == down_new && up_curr == up_new)
- return count;
-
- batadv_gw_reselect(bat_priv);
- batadv_info(net_dev,
- "Changing gateway bandwidth from: '%u.%u/%u.%u MBit' to: '%u.%u/%u.%u MBit'\n",
- down_curr / 10, down_curr % 10, up_curr / 10, up_curr % 10,
- down_new / 10, down_new % 10, up_new / 10, up_new % 10);
-
- atomic_set(&bat_priv->gw.bandwidth_down, down_new);
- atomic_set(&bat_priv->gw.bandwidth_up, up_new);
- batadv_gw_tvlv_container_update(bat_priv);
-
- return count;
-}
-
-/**
* batadv_gw_tvlv_ogm_handler_v1() - process incoming gateway tvlv container
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig: the orig_node of the ogm
* @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
* @tvlv_value: tvlv buffer containing the gateway data
@@ -259,7 +89,7 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
/**
* batadv_gw_init() - initialise the gateway handling internals
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
void batadv_gw_init(struct batadv_priv *bat_priv)
{
@@ -269,13 +99,13 @@ void batadv_gw_init(struct batadv_priv *bat_priv)
atomic_set(&bat_priv->gw.sel_class, 1);
batadv_tvlv_handler_register(bat_priv, batadv_gw_tvlv_ogm_handler_v1,
- NULL, BATADV_TVLV_GW, 1,
+ NULL, NULL, BATADV_TVLV_GW, 1,
BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
}
/**
* batadv_gw_free() - free the gateway handling internals
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
void batadv_gw_free(struct batadv_priv *bat_priv)
{
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index 80afb2793687..5d097d6a1dd9 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_GATEWAY_COMMON_H_
@@ -21,16 +9,6 @@
#include "main.h"
-#include <linux/types.h>
-
-struct net_device;
-
-enum batadv_gw_modes {
- BATADV_GW_MODE_OFF,
- BATADV_GW_MODE_CLIENT,
- BATADV_GW_MODE_SERVER,
-};
-
/**
* enum batadv_bandwidth_units - bandwidth unit types
*/
@@ -46,12 +24,8 @@ enum batadv_bandwidth_units {
#define BATADV_GW_MODE_CLIENT_NAME "client"
#define BATADV_GW_MODE_SERVER_NAME "server"
-ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff,
- size_t count);
void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv);
void batadv_gw_init(struct batadv_priv *bat_priv);
void batadv_gw_free(struct batadv_priv *bat_priv);
-bool batadv_parse_throughput(struct net_device *net_dev, char *buff,
- const char *description, u32 *throughput);
#endif /* _NET_BATMAN_ADV_GATEWAY_COMMON_H_ */
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 2f0d42f2f913..5113f879736b 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -1,36 +1,28 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "hard-interface.h"
#include "main.h"
#include <linux/atomic.h>
-#include <linux/bug.h>
#include <linux/byteorder/generic.h>
+#include <linux/compiler.h>
+#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/if.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
+#include <linux/limits.h>
#include <linux/list.h>
+#include <linux/minmax.h>
+#include <linux/mutex.h>
#include <linux/netdevice.h>
+#include <linux/notifier.h>
#include <linux/printk.h>
#include <linux/rculist.h>
#include <linux/rtnetlink.h>
@@ -42,14 +34,12 @@
#include "bat_v.h"
#include "bridge_loop_avoidance.h"
-#include "debugfs.h"
#include "distributed-arp-table.h"
#include "gateway_client.h"
#include "log.h"
+#include "mesh-interface.h"
#include "originator.h"
#include "send.h"
-#include "soft-interface.h"
-#include "sysfs.h"
#include "translation-table.h"
/**
@@ -62,7 +52,7 @@ void batadv_hardif_release(struct kref *ref)
struct batadv_hard_iface *hard_iface;
hard_iface = container_of(ref, struct batadv_hard_iface, refcount);
- dev_put(hard_iface->net_dev);
+ netdev_put(hard_iface->net_dev, &hard_iface->dev_tracker);
kfree_rcu(hard_iface, rcu);
}
@@ -149,10 +139,10 @@ static bool batadv_mutual_parents(const struct net_device *dev1,
* @net_dev: the device to check
*
* If the user creates any virtual device on top of a batman-adv interface, it
- * is important to prevent this new interface to be used to create a new mesh
- * network (this behaviour would lead to a batman-over-batman configuration).
- * This function recursively checks all the fathers of the device passed as
- * argument looking for a batman-adv soft interface.
+ * is important to prevent this new interface from being used to create a new
+ * mesh network (this behaviour would lead to a batman-over-batman
+ * configuration). This function recursively checks all the fathers of the
+ * device passed as argument looking for a batman-adv mesh interface.
*
* Return: true if the device is descendant of a batman-adv mesh interface (or
* if it is a batman-adv interface itself), false otherwise
@@ -162,25 +152,30 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
struct net *net = dev_net(net_dev);
struct net_device *parent_dev;
struct net *parent_net;
+ int iflink;
bool ret;
/* check if this is a batman-adv mesh interface */
- if (batadv_softif_is_valid(net_dev))
+ if (batadv_meshif_is_valid(net_dev))
return true;
- /* no more parents..stop recursion */
- if (dev_get_iflink(net_dev) == 0 ||
- dev_get_iflink(net_dev) == net_dev->ifindex)
+ iflink = dev_get_iflink(net_dev);
+ if (iflink == 0)
return false;
parent_net = batadv_getlink_net(net_dev, net);
+ /* iflink to itself, most likely physical device */
+ if (net == parent_net && iflink == net_dev->ifindex)
+ return false;
+
/* recurse over the parent device */
- parent_dev = __dev_get_by_index((struct net *)parent_net,
- dev_get_iflink(net_dev));
- /* if we got a NULL parent_dev there is something broken.. */
- if (WARN(!parent_dev, "Cannot find parent device"))
+ parent_dev = __dev_get_by_index((struct net *)parent_net, iflink);
+ if (!parent_dev) {
+ pr_warn("Cannot find parent device. Skipping batadv-on-batadv check for %s\n",
+ net_dev->name);
return false;
+ }
if (batadv_mutual_parents(net_dev, net, parent_dev, parent_net))
return false;
@@ -225,30 +220,37 @@ static struct net_device *batadv_get_real_netdevice(struct net_device *netdev)
struct net_device *real_netdev = NULL;
struct net *real_net;
struct net *net;
- int ifindex;
+ int iflink;
ASSERT_RTNL();
if (!netdev)
return NULL;
- if (netdev->ifindex == dev_get_iflink(netdev)) {
+ iflink = dev_get_iflink(netdev);
+ if (iflink == 0) {
dev_hold(netdev);
return netdev;
}
hard_iface = batadv_hardif_get_by_netdev(netdev);
- if (!hard_iface || !hard_iface->soft_iface)
+ if (!hard_iface || !hard_iface->mesh_iface)
goto out;
- net = dev_net(hard_iface->soft_iface);
- ifindex = dev_get_iflink(netdev);
+ net = dev_net(hard_iface->mesh_iface);
real_net = batadv_getlink_net(netdev, net);
- real_netdev = dev_get_by_index(real_net, ifindex);
+
+ /* iflink to itself, most likely physical device */
+ if (net == real_net && netdev->ifindex == iflink) {
+ real_netdev = netdev;
+ dev_hold(real_netdev);
+ goto out;
+ }
+
+ real_netdev = dev_get_by_index(real_net, iflink);
out:
- if (hard_iface)
- batadv_hardif_put(hard_iface);
+ batadv_hardif_put(hard_iface);
return real_netdev;
}
@@ -308,9 +310,11 @@ static bool batadv_is_cfg80211_netdev(struct net_device *net_device)
if (!net_device)
return false;
+#if IS_ENABLED(CONFIG_CFG80211)
/* cfg80211 drivers have to set ieee80211_ptr */
if (net_device->ieee80211_ptr)
return true;
+#endif
return false;
}
@@ -413,7 +417,7 @@ int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing,
goto out;
}
- /* >1 neighbors -> (re)brodcast */
+ /* >1 neighbors -> (re)broadcast */
if (rcu_dereference(hlist_next_rcu(first)))
goto out;
@@ -435,15 +439,13 @@ out:
}
static struct batadv_hard_iface *
-batadv_hardif_get_active(const struct net_device *soft_iface)
+batadv_hardif_get_active(struct net_device *mesh_iface)
{
struct batadv_hard_iface *hard_iface;
+ struct list_head *iter;
rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
- if (hard_iface->soft_iface != soft_iface)
- continue;
-
+ netdev_for_each_lower_private_rcu(mesh_iface, hard_iface, iter) {
if (hard_iface->if_status == BATADV_IF_ACTIVE &&
kref_get_unless_zero(&hard_iface->refcount))
goto out;
@@ -468,8 +470,7 @@ static void batadv_primary_if_update_addr(struct batadv_priv *bat_priv,
batadv_dat_init_own_addr(bat_priv, primary_if);
batadv_bla_update_orig_address(bat_priv, primary_if, oldif);
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
}
static void batadv_primary_if_select(struct batadv_priv *bat_priv,
@@ -482,8 +483,8 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv,
if (new_hard_iface)
kref_get(&new_hard_iface->refcount);
- curr_hard_iface = rcu_dereference_protected(bat_priv->primary_if, 1);
- rcu_assign_pointer(bat_priv->primary_if, new_hard_iface);
+ curr_hard_iface = rcu_replace_pointer(bat_priv->primary_if,
+ new_hard_iface, 1);
if (!new_hard_iface)
goto out;
@@ -492,8 +493,7 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv,
batadv_primary_if_update_addr(bat_priv, curr_hard_iface);
out:
- if (curr_hard_iface)
- batadv_hardif_put(curr_hard_iface);
+ batadv_hardif_put(curr_hard_iface);
}
static bool
@@ -505,50 +505,50 @@ batadv_hardif_is_iface_up(const struct batadv_hard_iface *hard_iface)
return false;
}
-static void batadv_check_known_mac_addr(const struct net_device *net_dev)
+static void batadv_check_known_mac_addr(const struct batadv_hard_iface *hard_iface)
{
- const struct batadv_hard_iface *hard_iface;
+ struct net_device *mesh_iface = hard_iface->mesh_iface;
+ const struct batadv_hard_iface *tmp_hard_iface;
+ struct list_head *iter;
- rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
- if (hard_iface->if_status != BATADV_IF_ACTIVE &&
- hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)
+ if (!mesh_iface)
+ return;
+
+ netdev_for_each_lower_private(mesh_iface, tmp_hard_iface, iter) {
+ if (tmp_hard_iface == hard_iface)
continue;
- if (hard_iface->net_dev == net_dev)
+ if (tmp_hard_iface->if_status == BATADV_IF_NOT_IN_USE)
continue;
- if (!batadv_compare_eth(hard_iface->net_dev->dev_addr,
- net_dev->dev_addr))
+ if (!batadv_compare_eth(tmp_hard_iface->net_dev->dev_addr,
+ hard_iface->net_dev->dev_addr))
continue;
pr_warn("The newly added mac address (%pM) already exists on: %s\n",
- net_dev->dev_addr, hard_iface->net_dev->name);
+ hard_iface->net_dev->dev_addr, tmp_hard_iface->net_dev->name);
pr_warn("It is strongly recommended to keep mac addresses unique to avoid problems!\n");
}
- rcu_read_unlock();
}
/**
* batadv_hardif_recalc_extra_skbroom() - Recalculate skbuff extra head/tailroom
- * @soft_iface: netdev struct of the mesh interface
+ * @mesh_iface: netdev struct of the mesh interface
*/
-static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface)
+static void batadv_hardif_recalc_extra_skbroom(struct net_device *mesh_iface)
{
const struct batadv_hard_iface *hard_iface;
unsigned short lower_header_len = ETH_HLEN;
unsigned short lower_headroom = 0;
unsigned short lower_tailroom = 0;
unsigned short needed_headroom;
+ struct list_head *iter;
rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ netdev_for_each_lower_private_rcu(mesh_iface, hard_iface, iter) {
if (hard_iface->if_status == BATADV_IF_NOT_IN_USE)
continue;
- if (hard_iface->soft_iface != soft_iface)
- continue;
-
lower_header_len = max_t(unsigned short, lower_header_len,
hard_iface->net_dev->hard_header_len);
@@ -563,32 +563,33 @@ static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface)
needed_headroom = lower_headroom + (lower_header_len - ETH_HLEN);
needed_headroom += batadv_max_header_len();
- soft_iface->needed_headroom = needed_headroom;
- soft_iface->needed_tailroom = lower_tailroom;
+ /* fragmentation headers don't strip the unicast/... header */
+ needed_headroom += sizeof(struct batadv_frag_packet);
+
+ mesh_iface->needed_headroom = needed_headroom;
+ mesh_iface->needed_tailroom = lower_tailroom;
}
/**
- * batadv_hardif_min_mtu() - Calculate maximum MTU for soft interface
- * @soft_iface: netdev struct of the soft interface
+ * batadv_hardif_min_mtu() - Calculate maximum MTU for mesh interface
+ * @mesh_iface: netdev struct of the mesh interface
*
- * Return: MTU for the soft-interface (limited by the minimal MTU of all active
+ * Return: MTU for the mesh-interface (limited by the minimal MTU of all active
* slave interfaces)
*/
-int batadv_hardif_min_mtu(struct net_device *soft_iface)
+int batadv_hardif_min_mtu(struct net_device *mesh_iface)
{
- struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
const struct batadv_hard_iface *hard_iface;
+ struct list_head *iter;
int min_mtu = INT_MAX;
rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ netdev_for_each_lower_private_rcu(mesh_iface, hard_iface, iter) {
if (hard_iface->if_status != BATADV_IF_ACTIVE &&
hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)
continue;
- if (hard_iface->soft_iface != soft_iface)
- continue;
-
min_mtu = min_t(int, hard_iface->net_dev->mtu, min_mtu);
}
rcu_read_unlock();
@@ -608,31 +609,41 @@ out:
/* report to the other components the maximum amount of bytes that
* batman-adv can send over the wire (without considering the payload
* overhead). For example, this value is used by TT to compute the
- * maximum local table table size
+ * maximum local table size
*/
atomic_set(&bat_priv->packet_size_max, min_mtu);
- /* the real soft-interface MTU is computed by removing the payload
+ /* the real mesh-interface MTU is computed by removing the payload
* overhead from the maximum amount of bytes that was just computed.
- *
- * However batman-adv does not support MTUs bigger than ETH_DATA_LEN
*/
- return min_t(int, min_mtu - batadv_max_header_len(), ETH_DATA_LEN);
+ return min_t(int, min_mtu - batadv_max_header_len(), BATADV_MAX_MTU);
}
/**
* batadv_update_min_mtu() - Adjusts the MTU if a new interface with a smaller
* MTU appeared
- * @soft_iface: netdev struct of the soft interface
+ * @mesh_iface: netdev struct of the mesh interface
*/
-void batadv_update_min_mtu(struct net_device *soft_iface)
+void batadv_update_min_mtu(struct net_device *mesh_iface)
{
- soft_iface->mtu = batadv_hardif_min_mtu(soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
+ int limit_mtu;
+ int mtu;
+
+ mtu = batadv_hardif_min_mtu(mesh_iface);
+
+ if (bat_priv->mtu_set_by_user)
+ limit_mtu = bat_priv->mtu_set_by_user;
+ else
+ limit_mtu = ETH_DATA_LEN;
+
+ mtu = min(mtu, limit_mtu);
+ dev_set_mtu(mesh_iface, mtu);
/* Check if the local translate table should be cleaned up to match a
* new (and smaller) MTU.
*/
- batadv_tt_local_resize_to_mtu(soft_iface);
+ batadv_tt_local_resize_to_mtu(mesh_iface);
}
static void
@@ -644,7 +655,7 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface)
if (hard_iface->if_status != BATADV_IF_INACTIVE)
goto out;
- bat_priv = netdev_priv(hard_iface->soft_iface);
+ bat_priv = netdev_priv(hard_iface->mesh_iface);
bat_priv->algo_ops->iface.update_mac(hard_iface);
hard_iface->if_status = BATADV_IF_TO_BE_ACTIVATED;
@@ -656,17 +667,16 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface)
if (!primary_if)
batadv_primary_if_select(bat_priv, hard_iface);
- batadv_info(hard_iface->soft_iface, "Interface activated: %s\n",
+ batadv_info(hard_iface->mesh_iface, "Interface activated: %s\n",
hard_iface->net_dev->name);
- batadv_update_min_mtu(hard_iface->soft_iface);
+ batadv_update_min_mtu(hard_iface->mesh_iface);
if (bat_priv->algo_ops->iface.activate)
bat_priv->algo_ops->iface.activate(hard_iface);
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
}
static void
@@ -678,98 +688,46 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface)
hard_iface->if_status = BATADV_IF_INACTIVE;
- batadv_info(hard_iface->soft_iface, "Interface deactivated: %s\n",
+ batadv_info(hard_iface->mesh_iface, "Interface deactivated: %s\n",
hard_iface->net_dev->name);
- batadv_update_min_mtu(hard_iface->soft_iface);
+ batadv_update_min_mtu(hard_iface->mesh_iface);
}
/**
- * batadv_master_del_slave() - remove hard_iface from the current master iface
- * @slave: the interface enslaved in another master
- * @master: the master from which slave has to be removed
- *
- * Invoke ndo_del_slave on master passing slave as argument. In this way slave
- * is free'd and master can correctly change its internal state.
- *
- * Return: 0 on success, a negative value representing the error otherwise
- */
-static int batadv_master_del_slave(struct batadv_hard_iface *slave,
- struct net_device *master)
-{
- int ret;
-
- if (!master)
- return 0;
-
- ret = -EBUSY;
- if (master->netdev_ops->ndo_del_slave)
- ret = master->netdev_ops->ndo_del_slave(master, slave->net_dev);
-
- return ret;
-}
-
-/**
- * batadv_hardif_enable_interface() - Enslave hard interface to soft interface
- * @hard_iface: hard interface to add to soft interface
- * @net: the applicable net namespace
- * @iface_name: name of the soft interface
+ * batadv_hardif_enable_interface() - Enslave hard interface to mesh interface
+ * @hard_iface: hard interface to add to mesh interface
+ * @mesh_iface: netdev struct of the mesh interface
*
* Return: 0 on success or negative error number in case of failure
*/
int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
- struct net *net, const char *iface_name)
+ struct net_device *mesh_iface)
{
struct batadv_priv *bat_priv;
- struct net_device *soft_iface, *master;
__be16 ethertype = htons(ETH_P_BATMAN);
int max_header_len = batadv_max_header_len();
+ unsigned int required_mtu;
+ unsigned int hardif_mtu;
int ret;
+ hardif_mtu = READ_ONCE(hard_iface->net_dev->mtu);
+ required_mtu = READ_ONCE(mesh_iface->mtu) + max_header_len;
+
+ if (hardif_mtu < ETH_MIN_MTU + max_header_len)
+ return -EINVAL;
+
if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
goto out;
kref_get(&hard_iface->refcount);
- soft_iface = dev_get_by_name(net, iface_name);
-
- if (!soft_iface) {
- soft_iface = batadv_softif_create(net, iface_name);
-
- if (!soft_iface) {
- ret = -ENOMEM;
- goto err;
- }
-
- /* dev_get_by_name() increases the reference counter for us */
- dev_hold(soft_iface);
- }
-
- if (!batadv_softif_is_valid(soft_iface)) {
- pr_err("Can't create batman mesh interface %s: already exists as regular interface\n",
- soft_iface->name);
- ret = -EINVAL;
- goto err_dev;
- }
-
- /* check if the interface is enslaved in another virtual one and
- * in that case unlink it first
- */
- master = netdev_master_upper_dev_get(hard_iface->net_dev);
- ret = batadv_master_del_slave(hard_iface, master);
- if (ret)
- goto err_dev;
-
- hard_iface->soft_iface = soft_iface;
- bat_priv = netdev_priv(hard_iface->soft_iface);
-
- if (bat_priv->num_ifaces >= UINT_MAX) {
- ret = -ENOSPC;
- goto err_dev;
- }
+ netdev_hold(mesh_iface, &hard_iface->meshif_dev_tracker, GFP_ATOMIC);
+ hard_iface->mesh_iface = mesh_iface;
+ bat_priv = netdev_priv(hard_iface->mesh_iface);
ret = netdev_master_upper_dev_link(hard_iface->net_dev,
- soft_iface, NULL, NULL, NULL);
+ mesh_iface, hard_iface, NULL, NULL);
if (ret)
goto err_dev;
@@ -777,16 +735,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
if (ret < 0)
goto err_upper;
- hard_iface->if_num = bat_priv->num_ifaces;
- bat_priv->num_ifaces++;
hard_iface->if_status = BATADV_IF_INACTIVE;
- ret = batadv_orig_hash_add_if(hard_iface, bat_priv->num_ifaces);
- if (ret < 0) {
- bat_priv->algo_ops->iface.disable(hard_iface);
- bat_priv->num_ifaces--;
- hard_iface->if_status = BATADV_IF_NOT_IN_USE;
- goto err_upper;
- }
kref_get(&hard_iface->refcount);
hard_iface->batman_adv_ptype.type = ethertype;
@@ -794,55 +743,80 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
hard_iface->batman_adv_ptype.dev = hard_iface->net_dev;
dev_add_pack(&hard_iface->batman_adv_ptype);
- batadv_info(hard_iface->soft_iface, "Adding interface: %s\n",
+ batadv_info(hard_iface->mesh_iface, "Adding interface: %s\n",
hard_iface->net_dev->name);
if (atomic_read(&bat_priv->fragmentation) &&
- hard_iface->net_dev->mtu < ETH_DATA_LEN + max_header_len)
- batadv_info(hard_iface->soft_iface,
+ hardif_mtu < required_mtu)
+ batadv_info(hard_iface->mesh_iface,
"The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. Packets going over this interface will be fragmented on layer2 which could impact the performance. Setting the MTU to %i would solve the problem.\n",
- hard_iface->net_dev->name, hard_iface->net_dev->mtu,
- ETH_DATA_LEN + max_header_len);
+ hard_iface->net_dev->name, hardif_mtu,
+ required_mtu);
if (!atomic_read(&bat_priv->fragmentation) &&
- hard_iface->net_dev->mtu < ETH_DATA_LEN + max_header_len)
- batadv_info(hard_iface->soft_iface,
+ hardif_mtu < required_mtu)
+ batadv_info(hard_iface->mesh_iface,
"The MTU of interface %s is too small (%i) to handle the transport of batman-adv packets. If you experience problems getting traffic through try increasing the MTU to %i.\n",
- hard_iface->net_dev->name, hard_iface->net_dev->mtu,
- ETH_DATA_LEN + max_header_len);
+ hard_iface->net_dev->name, hardif_mtu,
+ required_mtu);
+
+ batadv_check_known_mac_addr(hard_iface);
if (batadv_hardif_is_iface_up(hard_iface))
batadv_hardif_activate_interface(hard_iface);
else
- batadv_err(hard_iface->soft_iface,
+ batadv_err(hard_iface->mesh_iface,
"Not using interface %s (retrying later): interface not active\n",
hard_iface->net_dev->name);
- batadv_hardif_recalc_extra_skbroom(soft_iface);
+ batadv_hardif_recalc_extra_skbroom(mesh_iface);
+
+ if (bat_priv->algo_ops->iface.enabled)
+ bat_priv->algo_ops->iface.enabled(hard_iface);
out:
return 0;
err_upper:
- netdev_upper_dev_unlink(hard_iface->net_dev, soft_iface);
+ netdev_upper_dev_unlink(hard_iface->net_dev, mesh_iface);
err_dev:
- hard_iface->soft_iface = NULL;
- dev_put(soft_iface);
-err:
+ hard_iface->mesh_iface = NULL;
+ netdev_put(mesh_iface, &hard_iface->meshif_dev_tracker);
batadv_hardif_put(hard_iface);
return ret;
}
/**
- * batadv_hardif_disable_interface() - Remove hard interface from soft interface
+ * batadv_hardif_cnt() - get number of interfaces enslaved to mesh interface
+ * @mesh_iface: mesh interface to check
+ *
+ * This function is only using RCU for locking - the result can therefore be
+ * off when another function is modifying the list at the same time. The
+ * caller can use the rtnl_lock to make sure that the count is accurate.
+ *
+ * Return: number of connected/enslaved hard interfaces
+ */
+static size_t batadv_hardif_cnt(struct net_device *mesh_iface)
+{
+ struct batadv_hard_iface *hard_iface;
+ struct list_head *iter;
+ size_t count = 0;
+
+ rcu_read_lock();
+ netdev_for_each_lower_private_rcu(mesh_iface, hard_iface, iter)
+ count++;
+ rcu_read_unlock();
+
+ return count;
+}
+
+/**
+ * batadv_hardif_disable_interface() - Remove hard interface from mesh interface
* @hard_iface: hard interface to be removed
- * @autodel: whether to delete soft interface when it doesn't contain any other
- * slave interfaces
*/
-void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
- enum batadv_hard_if_cleanup autodel)
+void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface)
{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
struct batadv_hard_iface *primary_if = NULL;
batadv_hardif_deactivate_interface(hard_iface);
@@ -850,23 +824,19 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
if (hard_iface->if_status != BATADV_IF_INACTIVE)
goto out;
- batadv_info(hard_iface->soft_iface, "Removing interface: %s\n",
+ batadv_info(hard_iface->mesh_iface, "Removing interface: %s\n",
hard_iface->net_dev->name);
dev_remove_pack(&hard_iface->batman_adv_ptype);
batadv_hardif_put(hard_iface);
- bat_priv->num_ifaces--;
- batadv_orig_hash_del_if(hard_iface, bat_priv->num_ifaces);
-
primary_if = batadv_primary_if_get_selected(bat_priv);
if (hard_iface == primary_if) {
struct batadv_hard_iface *new_if;
- new_if = batadv_hardif_get_active(hard_iface->soft_iface);
+ new_if = batadv_hardif_get_active(hard_iface->mesh_iface);
batadv_primary_if_select(bat_priv, new_if);
- if (new_if)
- batadv_hardif_put(new_if);
+ batadv_hardif_put(new_if);
}
bat_priv->algo_ops->iface.disable(hard_iface);
@@ -875,60 +845,46 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
/* delete all references to this hard_iface */
batadv_purge_orig_ref(bat_priv);
batadv_purge_outstanding_packets(bat_priv, hard_iface);
- dev_put(hard_iface->soft_iface);
+ netdev_put(hard_iface->mesh_iface, &hard_iface->meshif_dev_tracker);
- netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface);
- batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface);
+ netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->mesh_iface);
+ batadv_hardif_recalc_extra_skbroom(hard_iface->mesh_iface);
/* nobody uses this interface anymore */
- if (bat_priv->num_ifaces == 0) {
+ if (batadv_hardif_cnt(hard_iface->mesh_iface) <= 1)
batadv_gw_check_client_stop(bat_priv);
- if (autodel == BATADV_IF_CLEANUP_AUTO)
- batadv_softif_destroy_sysfs(hard_iface->soft_iface);
- }
-
- hard_iface->soft_iface = NULL;
+ hard_iface->mesh_iface = NULL;
batadv_hardif_put(hard_iface);
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
}
static struct batadv_hard_iface *
batadv_hardif_add_interface(struct net_device *net_dev)
{
struct batadv_hard_iface *hard_iface;
- int ret;
ASSERT_RTNL();
if (!batadv_is_valid_iface(net_dev))
- goto out;
-
- dev_hold(net_dev);
+ return NULL;
hard_iface = kzalloc(sizeof(*hard_iface), GFP_ATOMIC);
if (!hard_iface)
- goto release_dev;
-
- ret = batadv_sysfs_add_hardif(&hard_iface->hardif_obj, net_dev);
- if (ret)
- goto free_if;
+ return NULL;
- hard_iface->if_num = 0;
+ netdev_hold(net_dev, &hard_iface->dev_tracker, GFP_ATOMIC);
hard_iface->net_dev = net_dev;
- hard_iface->soft_iface = NULL;
- hard_iface->if_status = BATADV_IF_NOT_IN_USE;
- ret = batadv_debugfs_add_hardif(hard_iface);
- if (ret)
- goto free_sysfs;
+ hard_iface->mesh_iface = NULL;
+ hard_iface->if_status = BATADV_IF_NOT_IN_USE;
INIT_LIST_HEAD(&hard_iface->list);
INIT_HLIST_HEAD(&hard_iface->neigh_list);
+ mutex_init(&hard_iface->bat_iv.ogm_buff_mutex);
spin_lock_init(&hard_iface->neigh_list_lock);
kref_init(&hard_iface->refcount);
@@ -937,22 +893,15 @@ batadv_hardif_add_interface(struct net_device *net_dev)
if (batadv_is_wifi_hardif(hard_iface))
hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
+ atomic_set(&hard_iface->hop_penalty, 0);
+
batadv_v_hardif_init(hard_iface);
- batadv_check_known_mac_addr(hard_iface->net_dev);
kref_get(&hard_iface->refcount);
list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list);
+ batadv_hardif_generation++;
return hard_iface;
-
-free_sysfs:
- batadv_sysfs_del_hardif(&hard_iface->hardif_obj);
-free_if:
- kfree(hard_iface);
-release_dev:
- dev_put(net_dev);
-out:
- return NULL;
}
static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface)
@@ -961,54 +910,31 @@ static void batadv_hardif_remove_interface(struct batadv_hard_iface *hard_iface)
/* first deactivate interface */
if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
- batadv_hardif_disable_interface(hard_iface,
- BATADV_IF_CLEANUP_KEEP);
+ batadv_hardif_disable_interface(hard_iface);
if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
return;
hard_iface->if_status = BATADV_IF_TO_BE_REMOVED;
- batadv_debugfs_del_hardif(hard_iface);
- batadv_sysfs_del_hardif(&hard_iface->hardif_obj);
batadv_hardif_put(hard_iface);
}
/**
- * batadv_hardif_remove_interfaces() - Remove all hard interfaces
- */
-void batadv_hardif_remove_interfaces(void)
-{
- struct batadv_hard_iface *hard_iface, *hard_iface_tmp;
-
- rtnl_lock();
- list_for_each_entry_safe(hard_iface, hard_iface_tmp,
- &batadv_hardif_list, list) {
- list_del_rcu(&hard_iface->list);
- batadv_hardif_remove_interface(hard_iface);
- }
- rtnl_unlock();
-}
-
-/**
- * batadv_hard_if_event_softif() - Handle events for soft interfaces
+ * batadv_hard_if_event_meshif() - Handle events for mesh interfaces
* @event: NETDEV_* event to handle
* @net_dev: net_device which generated an event
*
* Return: NOTIFY_* result
*/
-static int batadv_hard_if_event_softif(unsigned long event,
+static int batadv_hard_if_event_meshif(unsigned long event,
struct net_device *net_dev)
{
struct batadv_priv *bat_priv;
switch (event) {
case NETDEV_REGISTER:
- batadv_sysfs_add_meshif(net_dev);
bat_priv = netdev_priv(net_dev);
- batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS);
- break;
- case NETDEV_CHANGENAME:
- batadv_debugfs_rename_meshif(net_dev);
+ batadv_meshif_create_vlan(bat_priv, BATADV_NO_FLAGS);
break;
}
@@ -1023,8 +949,8 @@ static int batadv_hard_if_event(struct notifier_block *this,
struct batadv_hard_iface *primary_if = NULL;
struct batadv_priv *bat_priv;
- if (batadv_softif_is_valid(net_dev))
- return batadv_hard_if_event_softif(event, net_dev);
+ if (batadv_meshif_is_valid(net_dev))
+ return batadv_hard_if_event_meshif(event, net_dev);
hard_iface = batadv_hardif_get_by_netdev(net_dev);
if (!hard_iface && (event == NETDEV_REGISTER ||
@@ -1045,20 +971,21 @@ static int batadv_hard_if_event(struct notifier_block *this,
case NETDEV_UNREGISTER:
case NETDEV_PRE_TYPE_CHANGE:
list_del_rcu(&hard_iface->list);
+ batadv_hardif_generation++;
batadv_hardif_remove_interface(hard_iface);
break;
case NETDEV_CHANGEMTU:
- if (hard_iface->soft_iface)
- batadv_update_min_mtu(hard_iface->soft_iface);
+ if (hard_iface->mesh_iface)
+ batadv_update_min_mtu(hard_iface->mesh_iface);
break;
case NETDEV_CHANGEADDR:
if (hard_iface->if_status == BATADV_IF_NOT_IN_USE)
goto hardif_put;
- batadv_check_known_mac_addr(hard_iface->net_dev);
+ batadv_check_known_mac_addr(hard_iface);
- bat_priv = netdev_priv(hard_iface->soft_iface);
+ bat_priv = netdev_priv(hard_iface->mesh_iface);
bat_priv->algo_ops->iface.update_mac(hard_iface);
primary_if = batadv_primary_if_get_selected(bat_priv);
@@ -1073,9 +1000,6 @@ static int batadv_hard_if_event(struct notifier_block *this,
if (batadv_is_wifi_hardif(hard_iface))
hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
break;
- case NETDEV_CHANGENAME:
- batadv_debugfs_rename_hardif(hard_iface);
- break;
default:
break;
}
@@ -1083,8 +1007,7 @@ static int batadv_hard_if_event(struct notifier_block *this,
hardif_put:
batadv_hardif_put(hard_iface);
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
return NOTIFY_DONE;
}
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index d1c0f6189301..9db8a310961e 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_HARD_INTERFACE_H_
@@ -23,26 +11,23 @@
#include <linux/compiler.h>
#include <linux/kref.h>
-#include <linux/notifier.h>
+#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <linux/stddef.h>
#include <linux/types.h>
-struct net_device;
-struct net;
-
/**
* enum batadv_hard_if_state - State of a hard interface
*/
enum batadv_hard_if_state {
/**
* @BATADV_IF_NOT_IN_USE: interface is not used as slave interface of a
- * batman-adv soft interface
+ * batman-adv mesh interface
*/
BATADV_IF_NOT_IN_USE,
/**
- * @BATADV_IF_TO_BE_REMOVED: interface will be removed from soft
+ * @BATADV_IF_TO_BE_REMOVED: interface will be removed from mesh
* interface
*/
BATADV_IF_TO_BE_REMOVED,
@@ -55,12 +40,6 @@ enum batadv_hard_if_state {
/** @BATADV_IF_TO_BE_ACTIVATED: interface is getting activated */
BATADV_IF_TO_BE_ACTIVATED,
-
- /**
- * @BATADV_IF_I_WANT_YOU: interface is queued up (using sysfs) for being
- * added as slave interface of a batman-adv soft interface
- */
- BATADV_IF_I_WANT_YOU,
};
/**
@@ -86,22 +65,6 @@ enum batadv_hard_if_bcast {
BATADV_HARDIF_BCAST_DUPORIG,
};
-/**
- * enum batadv_hard_if_cleanup - Cleanup modi for soft_iface after slave removal
- */
-enum batadv_hard_if_cleanup {
- /**
- * @BATADV_IF_CLEANUP_KEEP: Don't automatically delete soft-interface
- */
- BATADV_IF_CLEANUP_KEEP,
-
- /**
- * @BATADV_IF_CLEANUP_AUTO: Delete soft-interface after last slave was
- * removed
- */
- BATADV_IF_CLEANUP_AUTO,
-};
-
extern struct notifier_block batadv_hard_if_notifier;
struct net_device *batadv_get_real_netdev(struct net_device *net_device);
@@ -110,12 +73,10 @@ bool batadv_is_wifi_hardif(struct batadv_hard_iface *hard_iface);
struct batadv_hard_iface*
batadv_hardif_get_by_netdev(const struct net_device *net_dev);
int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
- struct net *net, const char *iface_name);
-void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
- enum batadv_hard_if_cleanup autodel);
-void batadv_hardif_remove_interfaces(void);
-int batadv_hardif_min_mtu(struct net_device *soft_iface);
-void batadv_update_min_mtu(struct net_device *soft_iface);
+ struct net_device *mesh_iface);
+void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface);
+int batadv_hardif_min_mtu(struct net_device *mesh_iface);
+void batadv_update_min_mtu(struct net_device *mesh_iface);
void batadv_hardif_release(struct kref *ref);
int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing,
u8 *orig_addr, u8 *orig_neigh);
@@ -127,12 +88,15 @@ int batadv_hardif_no_broadcast(struct batadv_hard_iface *if_outgoing,
*/
static inline void batadv_hardif_put(struct batadv_hard_iface *hard_iface)
{
+ if (!hard_iface)
+ return;
+
kref_put(&hard_iface->refcount, batadv_hardif_release);
}
/**
* batadv_primary_if_get_selected() - Get reference to primary interface
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
* Return: primary interface (with increased refcnt), otherwise NULL
*/
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index 7b49e4001778..8016e619787f 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "hash.h"
@@ -32,6 +20,8 @@ static void batadv_hash_init(struct batadv_hashtable *hash)
INIT_HLIST_HEAD(&hash->table[i]);
spin_lock_init(&hash->list_locks[i]);
}
+
+ atomic_set(&hash->generation, 0);
}
/**
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 9490a7ca2ba6..fb251c385a1b 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_HASH_H_
@@ -21,16 +9,16 @@
#include "main.h"
+#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/list.h>
+#include <linux/lockdep.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>
#include <linux/types.h>
-struct lock_class_key;
-
-/* callback to a compare function. should compare 2 element datas for their
+/* callback to a compare function. should compare 2 element data for their
* keys
*
* Return: true if same and false if not same
@@ -58,6 +46,9 @@ struct batadv_hashtable {
/** @size: size of hashtable */
u32 size;
+
+ /** @generation: current (generation) sequence number */
+ atomic_t generation;
};
/* allocates and clears the hash */
@@ -112,6 +103,7 @@ static inline int batadv_hash_add(struct batadv_hashtable *hash,
/* no duplicate found in list, add new element */
hlist_add_head_rcu(data_node, head);
+ atomic_inc(&hash->generation);
ret = 0;
@@ -154,6 +146,7 @@ static inline void *batadv_hash_remove(struct batadv_hashtable *hash,
data_save = node;
hlist_del_rcu(node);
+ atomic_inc(&hash->generation);
break;
}
spin_unlock_bh(&hash->list_locks[index]);
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
deleted file mode 100644
index 55c358ad3331..000000000000
--- a/net/batman-adv/icmp_socket.c
+++ /dev/null
@@ -1,418 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
- *
- * Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "icmp_socket.h"
-#include "main.h"
-
-#include <linux/atomic.h>
-#include <linux/compiler.h>
-#include <linux/debugfs.h>
-#include <linux/errno.h>
-#include <linux/etherdevice.h>
-#include <linux/eventpoll.h>
-#include <linux/export.h>
-#include <linux/fcntl.h>
-#include <linux/fs.h>
-#include <linux/gfp.h>
-#include <linux/if_ether.h>
-#include <linux/kernel.h>
-#include <linux/list.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/pkt_sched.h>
-#include <linux/poll.h>
-#include <linux/printk.h>
-#include <linux/sched.h> /* for linux/wait.h */
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/stddef.h>
-#include <linux/string.h>
-#include <linux/uaccess.h>
-#include <linux/wait.h>
-#include <uapi/linux/batadv_packet.h>
-
-#include "hard-interface.h"
-#include "log.h"
-#include "originator.h"
-#include "send.h"
-
-static struct batadv_socket_client *batadv_socket_client_hash[256];
-
-static void batadv_socket_add_packet(struct batadv_socket_client *socket_client,
- struct batadv_icmp_header *icmph,
- size_t icmp_len);
-
-/**
- * batadv_socket_init() - Initialize soft interface independent socket data
- */
-void batadv_socket_init(void)
-{
- memset(batadv_socket_client_hash, 0, sizeof(batadv_socket_client_hash));
-}
-
-static int batadv_socket_open(struct inode *inode, struct file *file)
-{
- unsigned int i;
- struct batadv_socket_client *socket_client;
-
- if (!try_module_get(THIS_MODULE))
- return -EBUSY;
-
- nonseekable_open(inode, file);
-
- socket_client = kmalloc(sizeof(*socket_client), GFP_KERNEL);
- if (!socket_client) {
- module_put(THIS_MODULE);
- return -ENOMEM;
- }
-
- for (i = 0; i < ARRAY_SIZE(batadv_socket_client_hash); i++) {
- if (!batadv_socket_client_hash[i]) {
- batadv_socket_client_hash[i] = socket_client;
- break;
- }
- }
-
- if (i == ARRAY_SIZE(batadv_socket_client_hash)) {
- pr_err("Error - can't add another packet client: maximum number of clients reached\n");
- kfree(socket_client);
- module_put(THIS_MODULE);
- return -EXFULL;
- }
-
- INIT_LIST_HEAD(&socket_client->queue_list);
- socket_client->queue_len = 0;
- socket_client->index = i;
- socket_client->bat_priv = inode->i_private;
- spin_lock_init(&socket_client->lock);
- init_waitqueue_head(&socket_client->queue_wait);
-
- file->private_data = socket_client;
-
- return 0;
-}
-
-static int batadv_socket_release(struct inode *inode, struct file *file)
-{
- struct batadv_socket_client *client = file->private_data;
- struct batadv_socket_packet *packet, *tmp;
-
- spin_lock_bh(&client->lock);
-
- /* for all packets in the queue ... */
- list_for_each_entry_safe(packet, tmp, &client->queue_list, list) {
- list_del(&packet->list);
- kfree(packet);
- }
-
- batadv_socket_client_hash[client->index] = NULL;
- spin_unlock_bh(&client->lock);
-
- kfree(client);
- module_put(THIS_MODULE);
-
- return 0;
-}
-
-static ssize_t batadv_socket_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct batadv_socket_client *socket_client = file->private_data;
- struct batadv_socket_packet *socket_packet;
- size_t packet_len;
- int error;
-
- if ((file->f_flags & O_NONBLOCK) && socket_client->queue_len == 0)
- return -EAGAIN;
-
- if (!buf || count < sizeof(struct batadv_icmp_packet))
- return -EINVAL;
-
- if (!access_ok(VERIFY_WRITE, buf, count))
- return -EFAULT;
-
- error = wait_event_interruptible(socket_client->queue_wait,
- socket_client->queue_len);
-
- if (error)
- return error;
-
- spin_lock_bh(&socket_client->lock);
-
- socket_packet = list_first_entry(&socket_client->queue_list,
- struct batadv_socket_packet, list);
- list_del(&socket_packet->list);
- socket_client->queue_len--;
-
- spin_unlock_bh(&socket_client->lock);
-
- packet_len = min(count, socket_packet->icmp_len);
- error = copy_to_user(buf, &socket_packet->icmp_packet, packet_len);
-
- kfree(socket_packet);
-
- if (error)
- return -EFAULT;
-
- return packet_len;
-}
-
-static ssize_t batadv_socket_write(struct file *file, const char __user *buff,
- size_t len, loff_t *off)
-{
- struct batadv_socket_client *socket_client = file->private_data;
- struct batadv_priv *bat_priv = socket_client->bat_priv;
- struct batadv_hard_iface *primary_if = NULL;
- struct sk_buff *skb;
- struct batadv_icmp_packet_rr *icmp_packet_rr;
- struct batadv_icmp_header *icmp_header;
- struct batadv_orig_node *orig_node = NULL;
- struct batadv_neigh_node *neigh_node = NULL;
- size_t packet_len = sizeof(struct batadv_icmp_packet);
- u8 *addr;
-
- if (len < sizeof(struct batadv_icmp_header)) {
- batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
- "Error - can't send packet from char device: invalid packet size\n");
- return -EINVAL;
- }
-
- primary_if = batadv_primary_if_get_selected(bat_priv);
-
- if (!primary_if) {
- len = -EFAULT;
- goto out;
- }
-
- if (len >= BATADV_ICMP_MAX_PACKET_SIZE)
- packet_len = BATADV_ICMP_MAX_PACKET_SIZE;
- else
- packet_len = len;
-
- skb = netdev_alloc_skb_ip_align(NULL, packet_len + ETH_HLEN);
- if (!skb) {
- len = -ENOMEM;
- goto out;
- }
-
- skb->priority = TC_PRIO_CONTROL;
- skb_reserve(skb, ETH_HLEN);
- icmp_header = skb_put(skb, packet_len);
-
- if (copy_from_user(icmp_header, buff, packet_len)) {
- len = -EFAULT;
- goto free_skb;
- }
-
- if (icmp_header->packet_type != BATADV_ICMP) {
- batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
- "Error - can't send packet from char device: got bogus packet type (expected: BAT_ICMP)\n");
- len = -EINVAL;
- goto free_skb;
- }
-
- switch (icmp_header->msg_type) {
- case BATADV_ECHO_REQUEST:
- if (len < sizeof(struct batadv_icmp_packet)) {
- batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
- "Error - can't send packet from char device: invalid packet size\n");
- len = -EINVAL;
- goto free_skb;
- }
-
- if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
- goto dst_unreach;
-
- orig_node = batadv_orig_hash_find(bat_priv, icmp_header->dst);
- if (!orig_node)
- goto dst_unreach;
-
- neigh_node = batadv_orig_router_get(orig_node,
- BATADV_IF_DEFAULT);
- if (!neigh_node)
- goto dst_unreach;
-
- if (!neigh_node->if_incoming)
- goto dst_unreach;
-
- if (neigh_node->if_incoming->if_status != BATADV_IF_ACTIVE)
- goto dst_unreach;
-
- icmp_packet_rr = (struct batadv_icmp_packet_rr *)icmp_header;
- if (packet_len == sizeof(*icmp_packet_rr)) {
- addr = neigh_node->if_incoming->net_dev->dev_addr;
- ether_addr_copy(icmp_packet_rr->rr[0], addr);
- }
-
- break;
- default:
- batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
- "Error - can't send packet from char device: got unknown message type\n");
- len = -EINVAL;
- goto free_skb;
- }
-
- icmp_header->uid = socket_client->index;
-
- if (icmp_header->version != BATADV_COMPAT_VERSION) {
- icmp_header->msg_type = BATADV_PARAMETER_PROBLEM;
- icmp_header->version = BATADV_COMPAT_VERSION;
- batadv_socket_add_packet(socket_client, icmp_header,
- packet_len);
- goto free_skb;
- }
-
- ether_addr_copy(icmp_header->orig, primary_if->net_dev->dev_addr);
-
- batadv_send_unicast_skb(skb, neigh_node);
- goto out;
-
-dst_unreach:
- icmp_header->msg_type = BATADV_DESTINATION_UNREACHABLE;
- batadv_socket_add_packet(socket_client, icmp_header, packet_len);
-free_skb:
- kfree_skb(skb);
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- if (neigh_node)
- batadv_neigh_node_put(neigh_node);
- if (orig_node)
- batadv_orig_node_put(orig_node);
- return len;
-}
-
-static __poll_t batadv_socket_poll(struct file *file, poll_table *wait)
-{
- struct batadv_socket_client *socket_client = file->private_data;
-
- poll_wait(file, &socket_client->queue_wait, wait);
-
- if (socket_client->queue_len > 0)
- return EPOLLIN | EPOLLRDNORM;
-
- return 0;
-}
-
-static const struct file_operations batadv_fops = {
- .owner = THIS_MODULE,
- .open = batadv_socket_open,
- .release = batadv_socket_release,
- .read = batadv_socket_read,
- .write = batadv_socket_write,
- .poll = batadv_socket_poll,
- .llseek = no_llseek,
-};
-
-/**
- * batadv_socket_setup() - Create debugfs "socket" file
- * @bat_priv: the bat priv with all the soft interface information
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_socket_setup(struct batadv_priv *bat_priv)
-{
- struct dentry *d;
-
- if (!bat_priv->debug_dir)
- goto err;
-
- d = debugfs_create_file(BATADV_ICMP_SOCKET, 0600, bat_priv->debug_dir,
- bat_priv, &batadv_fops);
- if (!d)
- goto err;
-
- return 0;
-
-err:
- return -ENOMEM;
-}
-
-/**
- * batadv_socket_add_packet() - schedule an icmp packet to be sent to
- * userspace on an icmp socket.
- * @socket_client: the socket this packet belongs to
- * @icmph: pointer to the header of the icmp packet
- * @icmp_len: total length of the icmp packet
- */
-static void batadv_socket_add_packet(struct batadv_socket_client *socket_client,
- struct batadv_icmp_header *icmph,
- size_t icmp_len)
-{
- struct batadv_socket_packet *socket_packet;
- size_t len;
-
- socket_packet = kmalloc(sizeof(*socket_packet), GFP_ATOMIC);
-
- if (!socket_packet)
- return;
-
- len = icmp_len;
- /* check the maximum length before filling the buffer */
- if (len > sizeof(socket_packet->icmp_packet))
- len = sizeof(socket_packet->icmp_packet);
-
- INIT_LIST_HEAD(&socket_packet->list);
- memcpy(&socket_packet->icmp_packet, icmph, len);
- socket_packet->icmp_len = len;
-
- spin_lock_bh(&socket_client->lock);
-
- /* while waiting for the lock the socket_client could have been
- * deleted
- */
- if (!batadv_socket_client_hash[icmph->uid]) {
- spin_unlock_bh(&socket_client->lock);
- kfree(socket_packet);
- return;
- }
-
- list_add_tail(&socket_packet->list, &socket_client->queue_list);
- socket_client->queue_len++;
-
- if (socket_client->queue_len > 100) {
- socket_packet = list_first_entry(&socket_client->queue_list,
- struct batadv_socket_packet,
- list);
-
- list_del(&socket_packet->list);
- kfree(socket_packet);
- socket_client->queue_len--;
- }
-
- spin_unlock_bh(&socket_client->lock);
-
- wake_up(&socket_client->queue_wait);
-}
-
-/**
- * batadv_socket_receive_packet() - schedule an icmp packet to be received
- * locally and sent to userspace.
- * @icmph: pointer to the header of the icmp packet
- * @icmp_len: total length of the icmp packet
- */
-void batadv_socket_receive_packet(struct batadv_icmp_header *icmph,
- size_t icmp_len)
-{
- struct batadv_socket_client *hash;
-
- hash = batadv_socket_client_hash[icmph->uid];
- if (hash)
- batadv_socket_add_packet(hash, icmph, icmp_len);
-}
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
deleted file mode 100644
index 958be22beda9..000000000000
--- a/net/batman-adv/icmp_socket.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
- *
- * Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _NET_BATMAN_ADV_ICMP_SOCKET_H_
-#define _NET_BATMAN_ADV_ICMP_SOCKET_H_
-
-#include "main.h"
-
-#include <linux/types.h>
-
-struct batadv_icmp_header;
-
-#define BATADV_ICMP_SOCKET "socket"
-
-int batadv_socket_setup(struct batadv_priv *bat_priv);
-
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-
-void batadv_socket_init(void);
-void batadv_socket_receive_packet(struct batadv_icmp_header *icmph,
- size_t icmp_len);
-
-#else
-
-static inline void batadv_socket_init(void)
-{
-}
-
-static inline void
-batadv_socket_receive_packet(struct batadv_icmp_header *icmph, size_t icmp_len)
-{
-}
-
-#endif
-
-#endif /* _NET_BATMAN_ADV_ICMP_SOCKET_H_ */
diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c
index 853773e45f79..c19d07eeb070 100644
--- a/net/batman-adv/log.c
+++ b/net/batman-adv/log.c
@@ -1,249 +1,36 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "log.h"
#include "main.h"
-#include <linux/compiler.h>
-#include <linux/debugfs.h>
-#include <linux/errno.h>
-#include <linux/eventpoll.h>
-#include <linux/export.h>
-#include <linux/fcntl.h>
-#include <linux/fs.h>
-#include <linux/gfp.h>
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/poll.h>
-#include <linux/sched.h> /* for linux/wait.h */
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/stddef.h>
-#include <linux/types.h>
-#include <linux/uaccess.h>
-#include <linux/wait.h>
-#include <stdarg.h>
-
-#define BATADV_LOG_BUFF_MASK (batadv_log_buff_len - 1)
-
-static const int batadv_log_buff_len = BATADV_LOG_BUF_LEN;
-
-static char *batadv_log_char_addr(struct batadv_priv_debug_log *debug_log,
- size_t idx)
-{
- return &debug_log->log_buff[idx & BATADV_LOG_BUFF_MASK];
-}
-
-static void batadv_emit_log_char(struct batadv_priv_debug_log *debug_log,
- char c)
-{
- char *char_addr;
-
- char_addr = batadv_log_char_addr(debug_log, debug_log->log_end);
- *char_addr = c;
- debug_log->log_end++;
-
- if (debug_log->log_end - debug_log->log_start > batadv_log_buff_len)
- debug_log->log_start = debug_log->log_end - batadv_log_buff_len;
-}
-
-__printf(2, 3)
-static int batadv_fdebug_log(struct batadv_priv_debug_log *debug_log,
- const char *fmt, ...)
-{
- va_list args;
- static char debug_log_buf[256];
- char *p;
-
- if (!debug_log)
- return 0;
-
- spin_lock_bh(&debug_log->lock);
- va_start(args, fmt);
- vscnprintf(debug_log_buf, sizeof(debug_log_buf), fmt, args);
- va_end(args);
-
- for (p = debug_log_buf; *p != 0; p++)
- batadv_emit_log_char(debug_log, *p);
+#include <linux/stdarg.h>
- spin_unlock_bh(&debug_log->lock);
-
- wake_up(&debug_log->queue_wait);
-
- return 0;
-}
+#include "trace.h"
/**
* batadv_debug_log() - Add debug log entry
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @fmt: format string
*
* Return: 0 on success or negative error number in case of failure
*/
int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
{
+ struct va_format vaf;
va_list args;
- char tmp_log_buf[256];
va_start(args, fmt);
- vscnprintf(tmp_log_buf, sizeof(tmp_log_buf), fmt, args);
- batadv_fdebug_log(bat_priv->debug_log, "[%10u] %s",
- jiffies_to_msecs(jiffies), tmp_log_buf);
- va_end(args);
-
- return 0;
-}
-
-static int batadv_log_open(struct inode *inode, struct file *file)
-{
- if (!try_module_get(THIS_MODULE))
- return -EBUSY;
-
- nonseekable_open(inode, file);
- file->private_data = inode->i_private;
- return 0;
-}
-
-static int batadv_log_release(struct inode *inode, struct file *file)
-{
- module_put(THIS_MODULE);
- return 0;
-}
-
-static bool batadv_log_empty(struct batadv_priv_debug_log *debug_log)
-{
- return !(debug_log->log_start - debug_log->log_end);
-}
-static ssize_t batadv_log_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct batadv_priv *bat_priv = file->private_data;
- struct batadv_priv_debug_log *debug_log = bat_priv->debug_log;
- int error, i = 0;
- char *char_addr;
- char c;
-
- if ((file->f_flags & O_NONBLOCK) && batadv_log_empty(debug_log))
- return -EAGAIN;
-
- if (!buf)
- return -EINVAL;
-
- if (count == 0)
- return 0;
-
- if (!access_ok(VERIFY_WRITE, buf, count))
- return -EFAULT;
-
- error = wait_event_interruptible(debug_log->queue_wait,
- (!batadv_log_empty(debug_log)));
-
- if (error)
- return error;
-
- spin_lock_bh(&debug_log->lock);
-
- while ((!error) && (i < count) &&
- (debug_log->log_start != debug_log->log_end)) {
- char_addr = batadv_log_char_addr(debug_log,
- debug_log->log_start);
- c = *char_addr;
-
- debug_log->log_start++;
-
- spin_unlock_bh(&debug_log->lock);
-
- error = __put_user(c, buf);
-
- spin_lock_bh(&debug_log->lock);
-
- buf++;
- i++;
- }
-
- spin_unlock_bh(&debug_log->lock);
-
- if (!error)
- return i;
-
- return error;
-}
-
-static __poll_t batadv_log_poll(struct file *file, poll_table *wait)
-{
- struct batadv_priv *bat_priv = file->private_data;
- struct batadv_priv_debug_log *debug_log = bat_priv->debug_log;
-
- poll_wait(file, &debug_log->queue_wait, wait);
+ vaf.fmt = fmt;
+ vaf.va = &args;
- if (!batadv_log_empty(debug_log))
- return EPOLLIN | EPOLLRDNORM;
+ trace_batadv_dbg(bat_priv, &vaf);
- return 0;
-}
-
-static const struct file_operations batadv_log_fops = {
- .open = batadv_log_open,
- .release = batadv_log_release,
- .read = batadv_log_read,
- .poll = batadv_log_poll,
- .llseek = no_llseek,
-};
-
-/**
- * batadv_debug_log_setup() - Initialize debug log
- * @bat_priv: the bat priv with all the soft interface information
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_debug_log_setup(struct batadv_priv *bat_priv)
-{
- struct dentry *d;
-
- if (!bat_priv->debug_dir)
- goto err;
-
- bat_priv->debug_log = kzalloc(sizeof(*bat_priv->debug_log), GFP_ATOMIC);
- if (!bat_priv->debug_log)
- goto err;
-
- spin_lock_init(&bat_priv->debug_log->lock);
- init_waitqueue_head(&bat_priv->debug_log->queue_wait);
-
- d = debugfs_create_file("log", 0400, bat_priv->debug_dir, bat_priv,
- &batadv_log_fops);
- if (!d)
- goto err;
+ va_end(args);
return 0;
-
-err:
- return -ENOMEM;
-}
-
-/**
- * batadv_debug_log_cleanup() - Destroy debug log
- * @bat_priv: the bat priv with all the soft interface information
- */
-void batadv_debug_log_cleanup(struct batadv_priv *bat_priv)
-{
- kfree(bat_priv->debug_log);
- bat_priv->debug_log = NULL;
}
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index 35f4f397ed57..225b747a2048 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_LOG_H_
@@ -21,6 +9,7 @@
#include "main.h"
+#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/compiler.h>
#include <linux/printk.h>
@@ -62,9 +51,6 @@ enum batadv_dbg_level {
/** @BATADV_DBG_DAT: ARP snooping and DAT related messages */
BATADV_DBG_DAT = BIT(4),
- /** @BATADV_DBG_NC: network coding related messages */
- BATADV_DBG_NC = BIT(5),
-
/** @BATADV_DBG_MCAST: multicast related messages */
BATADV_DBG_MCAST = BIT(6),
@@ -80,12 +66,12 @@ int batadv_debug_log(struct batadv_priv *bat_priv, const char *fmt, ...)
__printf(2, 3);
/**
- * _batadv_dbg() - Store debug output with(out) ratelimiting
+ * _batadv_dbg() - Store debug output with(out) rate limiting
* @type: type of debug message
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @ratelimited: whether output should be rate limited
* @fmt: format string
- * @arg...: variable arguments
+ * @arg: variable arguments
*/
#define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \
do { \
@@ -106,28 +92,28 @@ static inline void _batadv_dbg(int type __always_unused,
#endif
/**
- * batadv_dbg() - Store debug output without ratelimiting
+ * batadv_dbg() - Store debug output without rate limiting
* @type: type of debug message
- * @bat_priv: the bat priv with all the soft interface information
- * @arg...: format string and variable arguments
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @arg: format string and variable arguments
*/
#define batadv_dbg(type, bat_priv, arg...) \
_batadv_dbg(type, bat_priv, 0, ## arg)
/**
- * batadv_dbg_ratelimited() - Store debug output with ratelimiting
+ * batadv_dbg_ratelimited() - Store debug output with rate limiting
* @type: type of debug message
- * @bat_priv: the bat priv with all the soft interface information
- * @arg...: format string and variable arguments
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @arg: format string and variable arguments
*/
#define batadv_dbg_ratelimited(type, bat_priv, arg...) \
_batadv_dbg(type, bat_priv, 1, ## arg)
/**
* batadv_info() - Store message in debug buffer and print it to kmsg buffer
- * @net_dev: the soft interface net device
+ * @net_dev: the mesh interface net device
* @fmt: format string
- * @arg...: variable arguments
+ * @arg: variable arguments
*/
#define batadv_info(net_dev, fmt, arg...) \
do { \
@@ -139,9 +125,9 @@ static inline void _batadv_dbg(int type __always_unused,
/**
* batadv_err() - Store error in debug buffer and print it to kmsg buffer
- * @net_dev: the soft interface net device
+ * @net_dev: the mesh interface net device
* @fmt: format string
- * @arg...: variable arguments
+ * @arg: variable arguments
*/
#define batadv_err(net_dev, fmt, arg...) \
do { \
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 69c0d85bceb3..3a35aadd8b41 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1,50 +1,41 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "main.h"
+#include <linux/array_size.h>
#include <linux/atomic.h>
#include <linux/build_bug.h>
#include <linux/byteorder/generic.h>
-#include <linux/crc32c.h>
+#include <linux/container_of.h>
+#include <linux/device.h>
#include <linux/errno.h>
-#include <linux/genetlink.h>
#include <linux/gfp.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
-#include <linux/kernel.h>
+#include <linux/kobject.h>
#include <linux/kref.h>
#include <linux/list.h>
+#include <linux/minmax.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/printk.h>
-#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/sprintf.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/workqueue.h>
#include <net/dsfield.h>
+#include <net/genetlink.h>
#include <net/rtnetlink.h>
#include <uapi/linux/batadv_packet.h>
#include <uapi/linux/batman_adv.h>
@@ -53,20 +44,17 @@
#include "bat_iv_ogm.h"
#include "bat_v.h"
#include "bridge_loop_avoidance.h"
-#include "debugfs.h"
#include "distributed-arp-table.h"
#include "gateway_client.h"
#include "gateway_common.h"
#include "hard-interface.h"
-#include "icmp_socket.h"
#include "log.h"
+#include "mesh-interface.h"
#include "multicast.h"
#include "netlink.h"
-#include "network-coding.h"
#include "originator.h"
#include "routing.h"
#include "send.h"
-#include "soft-interface.h"
#include "tp_meter.h"
#include "translation-table.h"
@@ -74,15 +62,30 @@
* list traversals just rcu-locked
*/
struct list_head batadv_hardif_list;
+unsigned int batadv_hardif_generation;
static int (*batadv_rx_handler[256])(struct sk_buff *skb,
struct batadv_hard_iface *recv_if);
-unsigned char batadv_broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
-
struct workqueue_struct *batadv_event_workqueue;
static void batadv_recv_handler_init(void);
+#define BATADV_UEV_TYPE_VAR "BATTYPE="
+#define BATADV_UEV_ACTION_VAR "BATACTION="
+#define BATADV_UEV_DATA_VAR "BATDATA="
+
+static const char * const batadv_uev_action_str[] = {
+ "add",
+ "del",
+ "change",
+ "loopdetect",
+};
+
+static const char * const batadv_uev_type_str[] = {
+ "gw",
+ "bla",
+};
+
static int __init batadv_init(void)
{
int ret;
@@ -98,16 +101,12 @@ static int __init batadv_init(void)
batadv_v_init();
batadv_iv_init();
- batadv_nc_init();
batadv_tp_meter_init();
batadv_event_workqueue = create_singlethread_workqueue("bat_events");
if (!batadv_event_workqueue)
goto err_create_wq;
- batadv_socket_init();
- batadv_debugfs_init();
-
register_netdevice_notifier(&batadv_hard_if_notifier);
rtnl_link_register(&batadv_link_ops);
batadv_netlink_register();
@@ -125,13 +124,10 @@ err_create_wq:
static void __exit batadv_exit(void)
{
- batadv_debugfs_destroy();
batadv_netlink_unregister();
rtnl_link_unregister(&batadv_link_ops);
unregister_netdevice_notifier(&batadv_hard_if_notifier);
- batadv_hardif_remove_interfaces();
- flush_workqueue(batadv_event_workqueue);
destroy_workqueue(batadv_event_workqueue);
batadv_event_workqueue = NULL;
@@ -141,14 +137,14 @@ static void __exit batadv_exit(void)
}
/**
- * batadv_mesh_init() - Initialize soft interface
- * @soft_iface: netdev struct of the soft interface
+ * batadv_mesh_init() - Initialize mesh interface
+ * @mesh_iface: netdev struct of the mesh interface
*
* Return: 0 on success or negative error number in case of failure
*/
-int batadv_mesh_init(struct net_device *soft_iface)
+int batadv_mesh_init(struct net_device *mesh_iface)
{
- struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
int ret;
spin_lock_init(&bat_priv->forw_bat_list_lock);
@@ -160,11 +156,12 @@ int batadv_mesh_init(struct net_device *soft_iface)
spin_lock_init(&bat_priv->tt.commit_lock);
spin_lock_init(&bat_priv->gw.list_lock);
#ifdef CONFIG_BATMAN_ADV_MCAST
+ spin_lock_init(&bat_priv->mcast.mla_lock);
spin_lock_init(&bat_priv->mcast.want_lists_lock);
#endif
spin_lock_init(&bat_priv->tvlv.container_list_lock);
spin_lock_init(&bat_priv->tvlv.handler_list_lock);
- spin_lock_init(&bat_priv->softif_vlan_list_lock);
+ spin_lock_init(&bat_priv->meshif_vlan_list_lock);
spin_lock_init(&bat_priv->tp_list_lock);
INIT_HLIST_HEAD(&bat_priv->forw_bat_list);
@@ -183,32 +180,40 @@ int batadv_mesh_init(struct net_device *soft_iface)
#endif
INIT_HLIST_HEAD(&bat_priv->tvlv.container_list);
INIT_HLIST_HEAD(&bat_priv->tvlv.handler_list);
- INIT_HLIST_HEAD(&bat_priv->softif_vlan_list);
+ INIT_HLIST_HEAD(&bat_priv->meshif_vlan_list);
INIT_HLIST_HEAD(&bat_priv->tp_list);
- ret = batadv_v_mesh_init(bat_priv);
- if (ret < 0)
- goto err;
+ bat_priv->gw.generation = 0;
ret = batadv_originator_init(bat_priv);
- if (ret < 0)
- goto err;
+ if (ret < 0) {
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+ goto err_orig;
+ }
ret = batadv_tt_init(bat_priv);
- if (ret < 0)
- goto err;
+ if (ret < 0) {
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+ goto err_tt;
+ }
+
+ ret = batadv_v_mesh_init(bat_priv);
+ if (ret < 0) {
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+ goto err_v;
+ }
ret = batadv_bla_init(bat_priv);
- if (ret < 0)
- goto err;
+ if (ret < 0) {
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+ goto err_bla;
+ }
ret = batadv_dat_init(bat_priv);
- if (ret < 0)
- goto err;
-
- ret = batadv_nc_mesh_init(bat_priv);
- if (ret < 0)
- goto err;
+ if (ret < 0) {
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
+ goto err_dat;
+ }
batadv_gw_init(bat_priv);
batadv_mcast_init(bat_priv);
@@ -218,18 +223,28 @@ int batadv_mesh_init(struct net_device *soft_iface)
return 0;
-err:
- batadv_mesh_free(soft_iface);
+err_dat:
+ batadv_bla_free(bat_priv);
+err_bla:
+ batadv_v_mesh_free(bat_priv);
+err_v:
+ batadv_tt_free(bat_priv);
+err_tt:
+ batadv_originator_free(bat_priv);
+err_orig:
+ batadv_purge_outstanding_packets(bat_priv, NULL);
+ atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE);
+
return ret;
}
/**
- * batadv_mesh_free() - Deinitialize soft interface
- * @soft_iface: netdev struct of the soft interface
+ * batadv_mesh_free() - Deinitialize mesh interface
+ * @mesh_iface: netdev struct of the mesh interface
*/
-void batadv_mesh_free(struct net_device *soft_iface)
+void batadv_mesh_free(struct net_device *mesh_iface)
{
- struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
@@ -238,7 +253,6 @@ void batadv_mesh_free(struct net_device *soft_iface)
batadv_gw_node_free(bat_priv);
batadv_v_mesh_free(bat_priv);
- batadv_nc_mesh_free(bat_priv);
batadv_dat_free(bat_priv);
batadv_bla_free(bat_priv);
@@ -268,7 +282,7 @@ void batadv_mesh_free(struct net_device *soft_iface)
/**
* batadv_is_my_mac() - check if the given mac address belongs to any of the
* real interfaces in the current mesh
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @addr: the address to check
*
* Return: 'true' if the mac address was found, false otherwise.
@@ -276,16 +290,14 @@ void batadv_mesh_free(struct net_device *soft_iface)
bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr)
{
const struct batadv_hard_iface *hard_iface;
+ struct list_head *iter;
bool is_my_mac = false;
rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
if (hard_iface->if_status != BATADV_IF_ACTIVE)
continue;
- if (hard_iface->soft_iface != bat_priv->soft_iface)
- continue;
-
if (batadv_compare_eth(hard_iface->net_dev->dev_addr, addr)) {
is_my_mac = true;
break;
@@ -295,44 +307,6 @@ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr)
return is_my_mac;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_seq_print_text_primary_if_get() - called from debugfs table printing
- * function that requires the primary interface
- * @seq: debugfs table seq_file struct
- *
- * Return: primary interface if found or NULL otherwise.
- */
-struct batadv_hard_iface *
-batadv_seq_print_text_primary_if_get(struct seq_file *seq)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hard_iface *primary_if;
-
- primary_if = batadv_primary_if_get_selected(bat_priv);
-
- if (!primary_if) {
- seq_printf(seq,
- "BATMAN mesh %s disabled - please specify interfaces to enable it\n",
- net_dev->name);
- goto out;
- }
-
- if (primary_if->if_status == BATADV_IF_ACTIVE)
- goto out;
-
- seq_printf(seq,
- "BATMAN mesh %s disabled - primary interface not active\n",
- net_dev->name);
- batadv_hardif_put(primary_if);
- primary_if = NULL;
-
-out:
- return primary_if;
-}
-#endif
-
/**
* batadv_max_header_len() - calculate maximum encapsulation overhead for a
* payload packet
@@ -350,11 +324,6 @@ int batadv_max_header_len(void)
header_len = max_t(int, header_len,
sizeof(struct batadv_bcast_packet));
-#ifdef CONFIG_BATMAN_ADV_NC
- header_len = max_t(int, header_len,
- sizeof(struct batadv_coded_packet));
-#endif
-
return header_len + ETH_HLEN;
}
@@ -466,10 +435,10 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
if (unlikely(skb->mac_len != ETH_HLEN || !skb_mac_header(skb)))
goto err_free;
- if (!hard_iface->soft_iface)
+ if (!hard_iface->mesh_iface)
goto err_free;
- bat_priv = netdev_priv(hard_iface->soft_iface);
+ bat_priv = netdev_priv(hard_iface->mesh_iface);
if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
goto err_free;
@@ -537,11 +506,13 @@ static void batadv_recv_handler_init(void)
BUILD_BUG_ON(sizeof(struct batadv_tvlv_tt_change) != 12);
BUILD_BUG_ON(sizeof(struct batadv_tvlv_roam_adv) != 8);
- i = FIELD_SIZEOF(struct sk_buff, cb);
+ i = sizeof_field(struct sk_buff, cb);
BUILD_BUG_ON(sizeof(struct batadv_skb_cb) > i);
/* broadcast packet */
batadv_rx_handler[BATADV_BCAST] = batadv_recv_bcast_packet;
+ /* multicast packet */
+ batadv_rx_handler[BATADV_MCAST] = batadv_recv_mcast_packet;
/* unicast packets ... */
/* unicast with 4 addresses packet */
@@ -590,39 +561,6 @@ void batadv_recv_handler_unregister(u8 packet_type)
}
/**
- * batadv_skb_crc32() - calculate CRC32 of the whole packet and skip bytes in
- * the header
- * @skb: skb pointing to fragmented socket buffers
- * @payload_ptr: Pointer to position inside the head buffer of the skb
- * marking the start of the data to be CRC'ed
- *
- * payload_ptr must always point to an address in the skb head buffer and not to
- * a fragment.
- *
- * Return: big endian crc32c of the checksummed data
- */
-__be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr)
-{
- u32 crc = 0;
- unsigned int from;
- unsigned int to = skb->len;
- struct skb_seq_state st;
- const u8 *data;
- unsigned int len;
- unsigned int consumed = 0;
-
- from = (unsigned int)(payload_ptr - skb->data);
-
- skb_prepare_seq_read(skb, from, to, &st);
- while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
- crc = crc32c(crc, data, len);
- consumed += len;
- }
-
- return htonl(crc);
-}
-
-/**
* batadv_get_vid() - extract the VLAN identifier from skb if any
* @skb: the buffer containing the packet
* @header_len: length of the batman header preceding the ethernet header
@@ -644,6 +582,13 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len)
vhdr = (struct vlan_ethhdr *)(skb->data + header_len);
vid = ntohs(vhdr->h_vlan_TCI) & VLAN_VID_MASK;
+
+ /* VID 0 is only used to indicate "priority tag" frames which only
+ * contain priority information and no VID.
+ */
+ if (vid == 0)
+ return BATADV_NO_FLAGS;
+
vid |= BATADV_VLAN_HAS_TAG;
return vid;
@@ -651,30 +596,86 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len)
/**
* batadv_vlan_ap_isola_get() - return AP isolation status for the given vlan
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @vid: the VLAN identifier for which the AP isolation attributed as to be
* looked up
*
- * Return: true if AP isolation is on for the VLAN idenfied by vid, false
+ * Return: true if AP isolation is on for the VLAN identified by vid, false
* otherwise
*/
bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid)
{
bool ap_isolation_enabled = false;
- struct batadv_softif_vlan *vlan;
+ struct batadv_meshif_vlan *vlan;
/* if the AP isolation is requested on a VLAN, then check for its
* setting in the proper VLAN private data structure
*/
- vlan = batadv_softif_vlan_get(bat_priv, vid);
+ vlan = batadv_meshif_vlan_get(bat_priv, vid);
if (vlan) {
ap_isolation_enabled = atomic_read(&vlan->ap_isolation);
- batadv_softif_vlan_put(vlan);
+ batadv_meshif_vlan_put(vlan);
}
return ap_isolation_enabled;
}
+/**
+ * batadv_throw_uevent() - Send a uevent with batman-adv specific env data
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @type: subsystem type of event. Stored in uevent's BATTYPE
+ * @action: action type of event. Stored in uevent's BATACTION
+ * @data: string with additional information about the event (ignored for
+ * BATADV_UEV_DEL). Stored in uevent's BATDATA
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type,
+ enum batadv_uev_action action, const char *data)
+{
+ int ret = -ENOMEM;
+ struct kobject *bat_kobj;
+ char *uevent_env[4] = { NULL, NULL, NULL, NULL };
+
+ bat_kobj = &bat_priv->mesh_iface->dev.kobj;
+
+ uevent_env[0] = kasprintf(GFP_ATOMIC,
+ "%s%s", BATADV_UEV_TYPE_VAR,
+ batadv_uev_type_str[type]);
+ if (!uevent_env[0])
+ goto report_error;
+
+ uevent_env[1] = kasprintf(GFP_ATOMIC,
+ "%s%s", BATADV_UEV_ACTION_VAR,
+ batadv_uev_action_str[action]);
+ if (!uevent_env[1])
+ goto free_first_env;
+
+ /* If the event is DEL, ignore the data field */
+ if (action != BATADV_UEV_DEL) {
+ uevent_env[2] = kasprintf(GFP_ATOMIC,
+ "%s%s", BATADV_UEV_DATA_VAR, data);
+ if (!uevent_env[2])
+ goto free_second_env;
+ }
+
+ ret = kobject_uevent_env(bat_kobj, KOBJ_CHANGE, uevent_env);
+ kfree(uevent_env[2]);
+free_second_env:
+ kfree(uevent_env[1]);
+free_first_env:
+ kfree(uevent_env[0]);
+
+ if (ret)
+report_error:
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Impossible to send uevent for (%s,%s,%s) event (err: %d)\n",
+ batadv_uev_type_str[type],
+ batadv_uev_action_str[action],
+ (action == BATADV_UEV_DEL ? "NULL" : data), ret);
+ return ret;
+}
+
module_init(batadv_init);
module_exit(batadv_exit);
@@ -682,7 +683,6 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR(BATADV_DRIVER_AUTHOR);
MODULE_DESCRIPTION(BATADV_DRIVER_DESC);
-MODULE_SUPPORTED_DEVICE(BATADV_DRIVER_DEVICE);
MODULE_VERSION(BATADV_SOURCE_VERSION);
MODULE_ALIAS_RTNL_LINK("batadv");
MODULE_ALIAS_GENL_FAMILY(BATADV_NL_NAME);
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 3ccc75ee719c..af230b017bc1 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -1,31 +1,19 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_MAIN_H_
#define _NET_BATMAN_ADV_MAIN_H_
-#define BATADV_DRIVER_AUTHOR "Marek Lindner <mareklindner@neomailbox.ch>, " \
+#define BATADV_DRIVER_AUTHOR "Marek Lindner <marek.lindner@mailbox.org>, " \
"Simon Wunderlich <sw@simonwunderlich.de>"
#define BATADV_DRIVER_DESC "B.A.T.M.A.N. advanced"
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2018.3"
+#define BATADV_SOURCE_VERSION "2025.5"
#endif
/* B.A.T.M.A.N. parameters */
@@ -34,6 +22,8 @@
#define BATADV_THROUGHPUT_MAX_VALUE 0xFFFFFFFF
#define BATADV_JITTER 20
+#define BATADV_MAX_MTU (ETH_MAX_MTU - batadv_max_header_len())
+
/* Time To Live of broadcast messages */
#define BATADV_TTL 50
@@ -100,7 +90,6 @@
/* number of packets to send for broadcasts on different interface types */
#define BATADV_NUM_BCASTS_DEFAULT 1
#define BATADV_NUM_BCASTS_WIRELESS 3
-#define BATADV_NUM_BCASTS_MAX 3
/* length of the single packet used by the TP meter */
#define BATADV_TP_PACKET_LEN ETH_DATA_LEN
@@ -115,9 +104,7 @@
*/
#define BATADV_TQ_SIMILARITY_THRESHOLD 50
-/* should not be bigger than 512 bytes or change the size of
- * forw_packet->direct_link_flags
- */
+#define BATADV_MAX_AGGREGATION_PACKETS 32
#define BATADV_MAX_AGGREGATION_BYTES 512
#define BATADV_MAX_AGGREGATION_MS 100
@@ -134,18 +121,16 @@
#define BATADV_RESET_PROTECTION_MS 30000
#define BATADV_EXPECTED_SEQNO_RANGE 65536
-#define BATADV_NC_NODE_TIMEOUT 10000 /* Milliseconds */
-
/**
* BATADV_TP_MAX_NUM - maximum number of simultaneously active tp sessions
*/
#define BATADV_TP_MAX_NUM 5
/**
- * enum batadv_mesh_state - State of a soft interface
+ * enum batadv_mesh_state - State of a mesh interface
*/
enum batadv_mesh_state {
- /** @BATADV_MESH_INACTIVE: soft interface is not yet running */
+ /** @BATADV_MESH_INACTIVE: mesh interface is not yet running */
BATADV_MESH_INACTIVE,
/** @BATADV_MESH_ACTIVE: interface is up and running */
@@ -217,20 +202,19 @@ enum batadv_uev_type {
/* Kernel headers */
+#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/etherdevice.h>
#include <linux/if_vlan.h>
#include <linux/jiffies.h>
+#include <linux/netdevice.h>
#include <linux/percpu.h>
+#include <linux/skbuff.h>
#include <linux/types.h>
#include <uapi/linux/batadv_packet.h>
#include "types.h"
-
-struct net_device;
-struct packet_type;
-struct seq_file;
-struct sk_buff;
+#include "main.h"
/**
* batadv_print_vid() - return printable version of vid information
@@ -247,15 +231,13 @@ static inline int batadv_print_vid(unsigned short vid)
}
extern struct list_head batadv_hardif_list;
+extern unsigned int batadv_hardif_generation;
-extern unsigned char batadv_broadcast_addr[];
extern struct workqueue_struct *batadv_event_workqueue;
-int batadv_mesh_init(struct net_device *soft_iface);
-void batadv_mesh_free(struct net_device *soft_iface);
+int batadv_mesh_init(struct net_device *mesh_iface);
+void batadv_mesh_free(struct net_device *mesh_iface);
bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr);
-struct batadv_hard_iface *
-batadv_seq_print_text_primary_if_get(struct seq_file *seq);
int batadv_max_header_len(void);
void batadv_skb_set_priority(struct sk_buff *skb, int offset);
int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev,
@@ -266,7 +248,6 @@ batadv_recv_handler_register(u8 packet_type,
int (*recv_handler)(struct sk_buff *,
struct batadv_hard_iface *));
void batadv_recv_handler_unregister(u8 packet_type);
-__be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr);
/**
* batadv_compare_eth() - Compare two not u16 aligned Ethernet addresses
@@ -319,7 +300,7 @@ static inline bool batadv_has_timed_out(unsigned long timestamp,
* @y: value to compare @x against
*
* It handles overflows/underflows and can correctly check for a predecessor
- * unless the variable sequence number has grown by more then
+ * unless the variable sequence number has grown by more than
* 2**(bitwidth(x)-1)-1.
*
* This means that for a u8 with the maximum value 255, it would think:
@@ -341,11 +322,11 @@ static inline bool batadv_has_timed_out(unsigned long timestamp,
/**
* batadv_seq_after() - Checks if a sequence number x is a successor of y
- * @x: potential sucessor of @y
+ * @x: potential successor of @y
* @y: value to compare @x against
*
* It handles overflows/underflows and can correctly check for a successor
- * unless the variable sequence number has grown by more then
+ * unless the variable sequence number has grown by more than
* 2**(bitwidth(x)-1)-1.
*
* This means that for a u8 with the maximum value 255, it would think:
@@ -360,8 +341,8 @@ static inline bool batadv_has_timed_out(unsigned long timestamp,
#define batadv_seq_after(x, y) batadv_seq_before(y, x)
/**
- * batadv_add_counter() - Add to per cpu statistics counter of soft interface
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_add_counter() - Add to per cpu statistics counter of mesh interface
+ * @bat_priv: the bat priv with all the mesh interface information
* @idx: counter index which should be modified
* @count: value to increase counter by
*
@@ -374,8 +355,8 @@ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx,
}
/**
- * batadv_inc_counter() - Increase per cpu statistics counter of soft interface
- * @b: the bat priv with all the soft interface information
+ * batadv_inc_counter() - Increase per cpu statistics counter of mesh interface
+ * @b: the bat priv with all the mesh interface information
* @i: counter index which should be modified
*/
#define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1)
@@ -393,5 +374,7 @@ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx,
unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len);
bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid);
+int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type,
+ enum batadv_uev_action action, const char *data);
#endif /* _NET_BATMAN_ADV_MAIN_H_ */
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/mesh-interface.c
index 626ddca332db..df7e95811ef5 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/mesh-interface.c
@@ -1,28 +1,17 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#include "soft-interface.h"
+#include "mesh-interface.h"
#include "main.h"
#include <linux/atomic.h>
#include <linux/byteorder/generic.h>
#include <linux/cache.h>
#include <linux/compiler.h>
+#include <linux/container_of.h>
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
@@ -31,17 +20,15 @@
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
+#include <linux/netlink.h>
#include <linux/percpu.h>
-#include <linux/printk.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/socket.h>
@@ -49,20 +36,18 @@
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>
+#include <net/netlink.h>
+#include <net/rtnetlink.h>
#include <uapi/linux/batadv_packet.h>
+#include <uapi/linux/batman_adv.h>
#include "bat_algo.h"
#include "bridge_loop_avoidance.h"
-#include "debugfs.h"
#include "distributed-arp-table.h"
#include "gateway_client.h"
-#include "gateway_common.h"
#include "hard-interface.h"
#include "multicast.h"
-#include "network-coding.h"
-#include "originator.h"
#include "send.h"
-#include "sysfs.h"
#include "translation-table.h"
/**
@@ -91,21 +76,9 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len)
return 0;
}
-static int batadv_interface_open(struct net_device *dev)
-{
- netif_start_queue(dev);
- return 0;
-}
-
-static int batadv_interface_release(struct net_device *dev)
-{
- netif_stop_queue(dev);
- return 0;
-}
-
/**
* batadv_sum_counter() - Sum the cpu-local counters for index 'idx'
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @idx: index of counter to sum up
*
* Return: sum of all cpu-local counters
@@ -139,7 +112,7 @@ static struct net_device_stats *batadv_interface_stats(struct net_device *dev)
static int batadv_interface_set_mac_addr(struct net_device *dev, void *p)
{
struct batadv_priv *bat_priv = netdev_priv(dev);
- struct batadv_softif_vlan *vlan;
+ struct batadv_meshif_vlan *vlan;
struct sockaddr *addr = p;
u8 old_addr[ETH_ALEN];
@@ -147,14 +120,14 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p)
return -EADDRNOTAVAIL;
ether_addr_copy(old_addr, dev->dev_addr);
- ether_addr_copy(dev->dev_addr, addr->sa_data);
+ eth_hw_addr_set(dev, addr->sa_data);
/* only modify transtable if it has been initialized before */
if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
return 0;
rcu_read_lock();
- hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) {
+ hlist_for_each_entry_rcu(vlan, &bat_priv->meshif_vlan_list, list) {
batadv_tt_local_remove(bat_priv, old_addr, vlan->vid,
"mac address changed", false);
batadv_tt_local_add(dev, addr->sa_data, vlan->vid,
@@ -167,11 +140,14 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p)
static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu)
{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+
/* check ranges */
- if (new_mtu < 68 || new_mtu > batadv_hardif_min_mtu(dev))
+ if (new_mtu < ETH_MIN_MTU || new_mtu > batadv_hardif_min_mtu(dev))
return -EINVAL;
- dev->mtu = new_mtu;
+ WRITE_ONCE(dev->mtu, new_mtu);
+ bat_priv->mtu_set_by_user = new_mtu;
return 0;
}
@@ -181,7 +157,7 @@ static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu)
* @dev: registered network device to modify
*
* We do not actually need to set any rx filters for the virtual batman
- * soft interface. However a dummy handler enables a user to set static
+ * mesh interface. However a dummy handler enables a user to set static
* multicast listeners for instance.
*/
static void batadv_interface_set_rx_mode(struct net_device *dev)
@@ -189,10 +165,10 @@ static void batadv_interface_set_rx_mode(struct net_device *dev)
}
static netdev_tx_t batadv_interface_tx(struct sk_buff *skb,
- struct net_device *soft_iface)
+ struct net_device *mesh_iface)
{
struct ethhdr *ethhdr;
- struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
struct batadv_hard_iface *primary_if = NULL;
struct batadv_bcast_packet *bcast_packet;
static const u8 stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00,
@@ -204,14 +180,15 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb,
struct vlan_ethhdr *vhdr;
unsigned int header_len = 0;
int data_len = skb->len, ret;
- unsigned long brd_delay = 1;
+ unsigned long brd_delay = 0;
bool do_bcast = false, client_added;
unsigned short vid;
u32 seqno;
int gw_mode;
- enum batadv_forw_mode forw_mode;
- struct batadv_orig_node *mcast_single_orig = NULL;
+ enum batadv_forw_mode forw_mode = BATADV_FORW_BCAST;
+ int mcast_is_routable = 0;
int network_offset = ETH_HLEN;
+ __be16 proto;
if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
goto dropped;
@@ -219,21 +196,28 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb,
/* reset control block to avoid left overs from previous users */
memset(skb->cb, 0, sizeof(struct batadv_skb_cb));
- netif_trans_update(soft_iface);
+ netif_trans_update(mesh_iface);
vid = batadv_get_vid(skb, 0);
+
+ skb_reset_mac_header(skb);
ethhdr = eth_hdr(skb);
- switch (ntohs(ethhdr->h_proto)) {
+ proto = ethhdr->h_proto;
+
+ switch (ntohs(proto)) {
case ETH_P_8021Q:
+ if (!pskb_may_pull(skb, sizeof(*vhdr)))
+ goto dropped;
vhdr = vlan_eth_hdr(skb);
+ proto = vhdr->h_vlan_encapsulated_proto;
/* drop batman-in-batman packets to prevent loops */
- if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN)) {
+ if (proto != htons(ETH_P_BATMAN)) {
network_offset += VLAN_HLEN;
break;
}
- /* fall through */
+ fallthrough;
case ETH_P_BATMAN:
goto dropped;
}
@@ -249,13 +233,16 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb,
/* Register the client MAC in the transtable */
if (!is_multicast_ether_addr(ethhdr->h_source) &&
!batadv_bla_is_loopdetect_mac(ethhdr->h_source)) {
- client_added = batadv_tt_local_add(soft_iface, ethhdr->h_source,
+ client_added = batadv_tt_local_add(mesh_iface, ethhdr->h_source,
vid, skb->skb_iif,
skb->mark);
if (!client_added)
goto dropped;
}
+ /* Snoop address candidates from DHCPACKs for early DAT filling */
+ batadv_dat_snoop_outgoing_dhcp_ack(bat_priv, skb, proto, vid);
+
/* don't accept stp packets. STP does not help in meshes.
* better use the bridge loop avoidance ...
*
@@ -301,13 +288,20 @@ static netdev_tx_t batadv_interface_tx(struct sk_buff *skb,
send:
if (do_bcast && !is_broadcast_ether_addr(ethhdr->h_dest)) {
- forw_mode = batadv_mcast_forw_mode(bat_priv, skb,
- &mcast_single_orig);
- if (forw_mode == BATADV_FORW_NONE)
- goto dropped;
-
- if (forw_mode == BATADV_FORW_SINGLE)
+ forw_mode = batadv_mcast_forw_mode(bat_priv, skb, vid,
+ &mcast_is_routable);
+ switch (forw_mode) {
+ case BATADV_FORW_BCAST:
+ break;
+ case BATADV_FORW_UCASTS:
+ case BATADV_FORW_MCAST:
do_bcast = false;
+ break;
+ case BATADV_FORW_NONE:
+ fallthrough;
+ default:
+ goto dropped;
+ }
}
}
@@ -331,7 +325,7 @@ send:
bcast_packet = (struct batadv_bcast_packet *)skb->data;
bcast_packet->version = BATADV_COMPAT_VERSION;
- bcast_packet->ttl = BATADV_TTL;
+ bcast_packet->ttl = BATADV_TTL - 1;
/* batman packet type: broadcast */
bcast_packet->packet_type = BATADV_BCAST;
@@ -347,13 +341,7 @@ send:
seqno = atomic_inc_return(&bat_priv->bcast_seqno);
bcast_packet->seqno = htonl(seqno);
- batadv_add_bcast_packet_to_list(bat_priv, skb, brd_delay, true);
-
- /* a copy is stored in the bcast list, therefore removing
- * the original skb.
- */
- consume_skb(skb);
-
+ batadv_send_bcast_packet(bat_priv, skb, brd_delay, true);
/* unicast packet */
} else {
/* DHCP packets going to a server will use the GW feature */
@@ -362,10 +350,11 @@ send:
if (ret)
goto dropped;
ret = batadv_send_skb_via_gw(bat_priv, skb, vid);
- } else if (mcast_single_orig) {
- ret = batadv_send_skb_unicast(bat_priv, skb,
- BATADV_UNICAST, 0,
- mcast_single_orig, vid);
+ } else if (forw_mode == BATADV_FORW_UCASTS) {
+ ret = batadv_mcast_forw_send(bat_priv, skb, vid,
+ mcast_is_routable);
+ } else if (forw_mode == BATADV_FORW_MCAST) {
+ ret = batadv_mcast_forw_mcsend(bat_priv, skb);
} else {
if (batadv_dat_snoop_outgoing_arp_request(bat_priv,
skb))
@@ -389,21 +378,18 @@ dropped:
dropped_freed:
batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED);
end:
- if (mcast_single_orig)
- batadv_orig_node_put(mcast_single_orig);
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
return NETDEV_TX_OK;
}
/**
* batadv_interface_rx() - receive ethernet frame on local batman-adv interface
- * @soft_iface: local interface which will receive the ethernet frame
- * @skb: ethernet frame for @soft_iface
+ * @mesh_iface: local interface which will receive the ethernet frame
+ * @skb: ethernet frame for @mesh_iface
* @hdr_size: size of already parsed batman-adv header
* @orig_node: originator from which the batman-adv packet was sent
*
- * Sends a ethernet frame to the receive path of the local @soft_iface.
+ * Sends an ethernet frame to the receive path of the local @mesh_iface.
* skb->data has still point to the batman-adv header with the size @hdr_size.
* The caller has to have parsed this header already and made sure that at least
* @hdr_size bytes are still available for pull in @skb.
@@ -413,19 +399,19 @@ end:
* unicast packets will be dropped directly when it was sent between two
* isolated clients.
*/
-void batadv_interface_rx(struct net_device *soft_iface,
+void batadv_interface_rx(struct net_device *mesh_iface,
struct sk_buff *skb, int hdr_size,
struct batadv_orig_node *orig_node)
{
struct batadv_bcast_packet *batadv_bcast_packet;
- struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
struct vlan_ethhdr *vhdr;
struct ethhdr *ethhdr;
unsigned short vid;
- bool is_bcast;
+ int packet_type;
batadv_bcast_packet = (struct batadv_bcast_packet *)skb->data;
- is_bcast = (batadv_bcast_packet->packet_type == BATADV_BCAST);
+ packet_type = batadv_bcast_packet->packet_type;
skb_pull_rcsum(skb, hdr_size);
skb_reset_mac_header(skb);
@@ -433,7 +419,7 @@ void batadv_interface_rx(struct net_device *soft_iface,
/* clean the netfilter state now that the batman-adv header has been
* removed
*/
- nf_reset(skb);
+ nf_reset_ct(skb);
if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
goto dropped;
@@ -446,19 +432,19 @@ void batadv_interface_rx(struct net_device *soft_iface,
if (!pskb_may_pull(skb, VLAN_ETH_HLEN))
goto dropped;
- vhdr = (struct vlan_ethhdr *)skb->data;
+ vhdr = skb_vlan_eth_hdr(skb);
/* drop batman-in-batman packets to prevent loops */
if (vhdr->h_vlan_encapsulated_proto != htons(ETH_P_BATMAN))
break;
- /* fall through */
+ fallthrough;
case ETH_P_BATMAN:
goto dropped;
}
/* skb->dev & skb->pkt_type are set here */
- skb->protocol = eth_type_trans(skb, soft_iface);
+ skb->protocol = eth_type_trans(skb, mesh_iface);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
batadv_inc_counter(bat_priv, BATADV_CNT_RX);
@@ -468,7 +454,7 @@ void batadv_interface_rx(struct net_device *soft_iface,
/* Let the bridge loop avoidance check the packet. If will
* not handle it, we can safely push it up.
*/
- if (batadv_bla_rx(bat_priv, skb, vid, is_bcast))
+ if (batadv_bla_rx(bat_priv, skb, vid, packet_type))
goto out;
if (orig_node)
@@ -503,51 +489,38 @@ out:
}
/**
- * batadv_softif_vlan_release() - release vlan from lists and queue for free
+ * batadv_meshif_vlan_release() - release vlan from lists and queue for free
* after rcu grace period
* @ref: kref pointer of the vlan object
*/
-static void batadv_softif_vlan_release(struct kref *ref)
+void batadv_meshif_vlan_release(struct kref *ref)
{
- struct batadv_softif_vlan *vlan;
+ struct batadv_meshif_vlan *vlan;
- vlan = container_of(ref, struct batadv_softif_vlan, refcount);
+ vlan = container_of(ref, struct batadv_meshif_vlan, refcount);
- spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock);
+ spin_lock_bh(&vlan->bat_priv->meshif_vlan_list_lock);
hlist_del_rcu(&vlan->list);
- spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock);
+ spin_unlock_bh(&vlan->bat_priv->meshif_vlan_list_lock);
kfree_rcu(vlan, rcu);
}
/**
- * batadv_softif_vlan_put() - decrease the vlan object refcounter and
- * possibly release it
- * @vlan: the vlan object to release
- */
-void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan)
-{
- if (!vlan)
- return;
-
- kref_put(&vlan->refcount, batadv_softif_vlan_release);
-}
-
-/**
- * batadv_softif_vlan_get() - get the vlan object for a specific vid
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_meshif_vlan_get() - get the vlan object for a specific vid
+ * @bat_priv: the bat priv with all the mesh interface information
* @vid: the identifier of the vlan object to retrieve
*
* Return: the private data of the vlan matching the vid passed as argument or
* NULL otherwise. The refcounter of the returned object is incremented by 1.
*/
-struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
+struct batadv_meshif_vlan *batadv_meshif_vlan_get(struct batadv_priv *bat_priv,
unsigned short vid)
{
- struct batadv_softif_vlan *vlan_tmp, *vlan = NULL;
+ struct batadv_meshif_vlan *vlan_tmp, *vlan = NULL;
rcu_read_lock();
- hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->softif_vlan_list, list) {
+ hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->meshif_vlan_list, list) {
if (vlan_tmp->vid != vid)
continue;
@@ -563,29 +536,28 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
}
/**
- * batadv_softif_create_vlan() - allocate the needed resources for a new vlan
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_meshif_create_vlan() - allocate the needed resources for a new vlan
+ * @bat_priv: the bat priv with all the mesh interface information
* @vid: the VLAN identifier
*
* Return: 0 on success, a negative error otherwise.
*/
-int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
+int batadv_meshif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
{
- struct batadv_softif_vlan *vlan;
- int err;
+ struct batadv_meshif_vlan *vlan;
- spin_lock_bh(&bat_priv->softif_vlan_list_lock);
+ spin_lock_bh(&bat_priv->meshif_vlan_list_lock);
- vlan = batadv_softif_vlan_get(bat_priv, vid);
+ vlan = batadv_meshif_vlan_get(bat_priv, vid);
if (vlan) {
- batadv_softif_vlan_put(vlan);
- spin_unlock_bh(&bat_priv->softif_vlan_list_lock);
+ batadv_meshif_vlan_put(vlan);
+ spin_unlock_bh(&bat_priv->meshif_vlan_list_lock);
return -EEXIST;
}
vlan = kzalloc(sizeof(*vlan), GFP_ATOMIC);
if (!vlan) {
- spin_unlock_bh(&bat_priv->softif_vlan_list_lock);
+ spin_unlock_bh(&bat_priv->meshif_vlan_list_lock);
return -ENOMEM;
}
@@ -596,57 +568,43 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
atomic_set(&vlan->ap_isolation, 0);
kref_get(&vlan->refcount);
- hlist_add_head_rcu(&vlan->list, &bat_priv->softif_vlan_list);
- spin_unlock_bh(&bat_priv->softif_vlan_list_lock);
-
- /* batadv_sysfs_add_vlan cannot be in the spinlock section due to the
- * sleeping behavior of the sysfs functions and the fs_reclaim lock
- */
- err = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan);
- if (err) {
- /* ref for the function */
- batadv_softif_vlan_put(vlan);
-
- /* ref for the list */
- batadv_softif_vlan_put(vlan);
- return err;
- }
+ hlist_add_head_rcu(&vlan->list, &bat_priv->meshif_vlan_list);
+ spin_unlock_bh(&bat_priv->meshif_vlan_list_lock);
/* add a new TT local entry. This one will be marked with the NOPURGE
* flag
*/
- batadv_tt_local_add(bat_priv->soft_iface,
- bat_priv->soft_iface->dev_addr, vid,
+ batadv_tt_local_add(bat_priv->mesh_iface,
+ bat_priv->mesh_iface->dev_addr, vid,
BATADV_NULL_IFINDEX, BATADV_NO_MARK);
- /* don't return reference to new softif_vlan */
- batadv_softif_vlan_put(vlan);
+ /* don't return reference to new meshif_vlan */
+ batadv_meshif_vlan_put(vlan);
return 0;
}
/**
- * batadv_softif_destroy_vlan() - remove and destroy a softif_vlan object
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_meshif_destroy_vlan() - remove and destroy a meshif_vlan object
+ * @bat_priv: the bat priv with all the mesh interface information
* @vlan: the object to remove
*/
-static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv,
- struct batadv_softif_vlan *vlan)
+static void batadv_meshif_destroy_vlan(struct batadv_priv *bat_priv,
+ struct batadv_meshif_vlan *vlan)
{
/* explicitly remove the associated TT local entry because it is marked
* with the NOPURGE flag
*/
- batadv_tt_local_remove(bat_priv, bat_priv->soft_iface->dev_addr,
+ batadv_tt_local_remove(bat_priv, bat_priv->mesh_iface->dev_addr,
vlan->vid, "vlan interface destroyed", false);
- batadv_sysfs_del_vlan(bat_priv, vlan);
- batadv_softif_vlan_put(vlan);
+ batadv_meshif_vlan_put(vlan);
}
/**
* batadv_interface_add_vid() - ndo_add_vid API implementation
* @dev: the netdev of the mesh interface
- * @proto: protocol of the the vlan id
+ * @proto: protocol of the vlan id
* @vid: identifier of the new vlan
*
* Set up all the internal structures for handling the new vlan on top of the
@@ -658,8 +616,7 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
unsigned short vid)
{
struct batadv_priv *bat_priv = netdev_priv(dev);
- struct batadv_softif_vlan *vlan;
- int ret;
+ struct batadv_meshif_vlan *vlan;
/* only 802.1Q vlans are supported.
* batman-adv does not know how to handle other types
@@ -667,35 +624,32 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
if (proto != htons(ETH_P_8021Q))
return -EINVAL;
+ /* VID 0 is only used to indicate "priority tag" frames which only
+ * contain priority information and no VID. No management structures
+ * should be created for this VID and it should be handled like an
+ * untagged frame.
+ */
+ if (vid == 0)
+ return 0;
+
vid |= BATADV_VLAN_HAS_TAG;
/* if a new vlan is getting created and it already exists, it means that
- * it was not deleted yet. batadv_softif_vlan_get() increases the
+ * it was not deleted yet. batadv_meshif_vlan_get() increases the
* refcount in order to revive the object.
*
* if it does not exist then create it.
*/
- vlan = batadv_softif_vlan_get(bat_priv, vid);
+ vlan = batadv_meshif_vlan_get(bat_priv, vid);
if (!vlan)
- return batadv_softif_create_vlan(bat_priv, vid);
-
- /* recreate the sysfs object if it was already destroyed (and it should
- * be since we received a kill_vid() for this vlan
- */
- if (!vlan->kobj) {
- ret = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan);
- if (ret) {
- batadv_softif_vlan_put(vlan);
- return ret;
- }
- }
+ return batadv_meshif_create_vlan(bat_priv, vid);
/* add a new TT local entry. This one will be marked with the NOPURGE
* flag. This must be added again, even if the vlan object already
* exists, because the entry was deleted by kill_vid()
*/
- batadv_tt_local_add(bat_priv->soft_iface,
- bat_priv->soft_iface->dev_addr, vid,
+ batadv_tt_local_add(bat_priv->mesh_iface,
+ bat_priv->mesh_iface->dev_addr, vid,
BATADV_NULL_IFINDEX, BATADV_NO_MARK);
return 0;
@@ -704,7 +658,7 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
/**
* batadv_interface_kill_vid() - ndo_kill_vid API implementation
* @dev: the netdev of the mesh interface
- * @proto: protocol of the the vlan id
+ * @proto: protocol of the vlan id
* @vid: identifier of the deleted vlan
*
* Destroy all the internal structures used to handle the vlan identified by vid
@@ -717,7 +671,7 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto,
unsigned short vid)
{
struct batadv_priv *bat_priv = netdev_priv(dev);
- struct batadv_softif_vlan *vlan;
+ struct batadv_meshif_vlan *vlan;
/* only 802.1Q vlans are supported. batman-adv does not know how to
* handle other types
@@ -725,14 +679,20 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto,
if (proto != htons(ETH_P_8021Q))
return -EINVAL;
- vlan = batadv_softif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG);
+ /* "priority tag" frames are handled like "untagged" frames
+ * and no meshif_vlan needs to be destroyed
+ */
+ if (vid == 0)
+ return 0;
+
+ vlan = batadv_meshif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG);
if (!vlan)
return -ENOENT;
- batadv_softif_destroy_vlan(bat_priv, vlan);
+ batadv_meshif_destroy_vlan(bat_priv, vlan);
/* finally free the vlan object */
- batadv_softif_vlan_put(vlan);
+ batadv_meshif_vlan_put(vlan);
return 0;
}
@@ -768,12 +728,12 @@ static void batadv_set_lockdep_class(struct net_device *dev)
}
/**
- * batadv_softif_init_late() - late stage initialization of soft interface
+ * batadv_meshif_init_late() - late stage initialization of mesh interface
* @dev: registered network device to modify
*
* Return: error code on failures
*/
-static int batadv_softif_init_late(struct net_device *dev)
+static int batadv_meshif_init_late(struct net_device *dev)
{
struct batadv_priv *bat_priv;
u32 random_seqno;
@@ -783,7 +743,7 @@ static int batadv_softif_init_late(struct net_device *dev)
batadv_set_lockdep_class(dev);
bat_priv = netdev_priv(dev);
- bat_priv->soft_iface = dev;
+ bat_priv->mesh_iface = dev;
/* batadv_interface_stats() needs to be available as soon as
* register_netdevice() has been called
@@ -801,15 +761,12 @@ static int batadv_softif_init_late(struct net_device *dev)
atomic_set(&bat_priv->distributed_arp_table, 1);
#endif
#ifdef CONFIG_BATMAN_ADV_MCAST
- bat_priv->mcast.querier_ipv4.exists = false;
- bat_priv->mcast.querier_ipv4.shadowing = false;
- bat_priv->mcast.querier_ipv6.exists = false;
- bat_priv->mcast.querier_ipv6.shadowing = false;
- bat_priv->mcast.flags = BATADV_NO_FLAGS;
atomic_set(&bat_priv->multicast_mode, 1);
+ atomic_set(&bat_priv->multicast_fanout, 16);
atomic_set(&bat_priv->mcast.num_want_all_unsnoopables, 0);
atomic_set(&bat_priv->mcast.num_want_all_ipv4, 0);
atomic_set(&bat_priv->mcast.num_want_all_ipv6, 0);
+ atomic_set(&bat_priv->mcast.num_no_mc_ptype_capa, 0);
#endif
atomic_set(&bat_priv->gw.mode, BATADV_GW_MODE_OFF);
atomic_set(&bat_priv->gw.bandwidth_down, 100);
@@ -820,20 +777,20 @@ static int batadv_softif_init_late(struct net_device *dev)
atomic_set(&bat_priv->log_level, 0);
#endif
atomic_set(&bat_priv->fragmentation, 1);
- atomic_set(&bat_priv->packet_size_max, ETH_DATA_LEN);
+ atomic_set(&bat_priv->packet_size_max, BATADV_MAX_MTU);
atomic_set(&bat_priv->bcast_queue_left, BATADV_BCAST_QUEUE_LEN);
atomic_set(&bat_priv->batman_queue_left, BATADV_BATMAN_QUEUE_LEN);
atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE);
atomic_set(&bat_priv->bcast_seqno, 1);
atomic_set(&bat_priv->tt.vn, 0);
- atomic_set(&bat_priv->tt.local_changes, 0);
atomic_set(&bat_priv->tt.ogm_append_cnt, 0);
#ifdef CONFIG_BATMAN_ADV_BLA
atomic_set(&bat_priv->bla.num_requests, 0);
#endif
atomic_set(&bat_priv->tp_num, 0);
+ WRITE_ONCE(bat_priv->tt.local_changes, 0);
bat_priv->tt.last_changeset = NULL;
bat_priv->tt.last_changeset_len = 0;
bat_priv->isolation_mark = 0;
@@ -844,26 +801,19 @@ static int batadv_softif_init_late(struct net_device *dev)
atomic_set(&bat_priv->frag_seqno, random_seqno);
bat_priv->primary_if = NULL;
- bat_priv->num_ifaces = 0;
-
- batadv_nc_init_bat_priv(bat_priv);
- ret = batadv_algo_select(bat_priv, batadv_routing_algo);
- if (ret < 0)
- goto free_bat_counters;
-
- ret = batadv_debugfs_add_meshif(dev);
- if (ret < 0)
- goto free_bat_counters;
+ if (!bat_priv->algo_ops) {
+ ret = batadv_algo_select(bat_priv, batadv_routing_algo);
+ if (ret < 0)
+ goto free_bat_counters;
+ }
ret = batadv_mesh_init(dev);
if (ret < 0)
- goto unreg_debugfs;
+ goto free_bat_counters;
return 0;
-unreg_debugfs:
- batadv_debugfs_del_meshif(dev);
free_bat_counters:
free_percpu(bat_priv->bat_counters);
bat_priv->bat_counters = NULL;
@@ -872,41 +822,39 @@ free_bat_counters:
}
/**
- * batadv_softif_slave_add() - Add a slave interface to a batadv_soft_interface
- * @dev: batadv_soft_interface used as master interface
+ * batadv_meshif_slave_add() - Add a slave interface to a batadv_mesh_interface
+ * @dev: batadv_mesh_interface used as master interface
* @slave_dev: net_device which should become the slave interface
* @extack: extended ACK report struct
*
* Return: 0 if successful or error otherwise.
*/
-static int batadv_softif_slave_add(struct net_device *dev,
+static int batadv_meshif_slave_add(struct net_device *dev,
struct net_device *slave_dev,
struct netlink_ext_ack *extack)
{
struct batadv_hard_iface *hard_iface;
- struct net *net = dev_net(dev);
int ret = -EINVAL;
hard_iface = batadv_hardif_get_by_netdev(slave_dev);
- if (!hard_iface || hard_iface->soft_iface)
+ if (!hard_iface || hard_iface->mesh_iface)
goto out;
- ret = batadv_hardif_enable_interface(hard_iface, net, dev->name);
+ ret = batadv_hardif_enable_interface(hard_iface, dev);
out:
- if (hard_iface)
- batadv_hardif_put(hard_iface);
+ batadv_hardif_put(hard_iface);
return ret;
}
/**
- * batadv_softif_slave_del() - Delete a slave iface from a batadv_soft_interface
- * @dev: batadv_soft_interface used as master interface
+ * batadv_meshif_slave_del() - Delete a slave iface from a batadv_mesh_interface
+ * @dev: batadv_mesh_interface used as master interface
* @slave_dev: net_device which should be removed from the master interface
*
* Return: 0 if successful or error otherwise.
*/
-static int batadv_softif_slave_del(struct net_device *dev,
+static int batadv_meshif_slave_del(struct net_device *dev,
struct net_device *slave_dev)
{
struct batadv_hard_iface *hard_iface;
@@ -914,22 +862,19 @@ static int batadv_softif_slave_del(struct net_device *dev,
hard_iface = batadv_hardif_get_by_netdev(slave_dev);
- if (!hard_iface || hard_iface->soft_iface != dev)
+ if (!hard_iface || hard_iface->mesh_iface != dev)
goto out;
- batadv_hardif_disable_interface(hard_iface, BATADV_IF_CLEANUP_KEEP);
+ batadv_hardif_disable_interface(hard_iface);
ret = 0;
out:
- if (hard_iface)
- batadv_hardif_put(hard_iface);
+ batadv_hardif_put(hard_iface);
return ret;
}
static const struct net_device_ops batadv_netdev_ops = {
- .ndo_init = batadv_softif_init_late,
- .ndo_open = batadv_interface_open,
- .ndo_stop = batadv_interface_release,
+ .ndo_init = batadv_meshif_init_late,
.ndo_get_stats = batadv_interface_stats,
.ndo_vlan_rx_add_vid = batadv_interface_add_vid,
.ndo_vlan_rx_kill_vid = batadv_interface_kill_vid,
@@ -938,17 +883,17 @@ static const struct net_device_ops batadv_netdev_ops = {
.ndo_set_rx_mode = batadv_interface_set_rx_mode,
.ndo_start_xmit = batadv_interface_tx,
.ndo_validate_addr = eth_validate_addr,
- .ndo_add_slave = batadv_softif_slave_add,
- .ndo_del_slave = batadv_softif_slave_del,
+ .ndo_add_slave = batadv_meshif_slave_add,
+ .ndo_del_slave = batadv_meshif_slave_del,
};
static void batadv_get_drvinfo(struct net_device *dev,
struct ethtool_drvinfo *info)
{
- strlcpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver));
- strlcpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version));
- strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
- strlcpy(info->bus_info, "batman", sizeof(info->bus_info));
+ strscpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver));
+ strscpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version));
+ strscpy(info->fw_version, "N/A", sizeof(info->fw_version));
+ strscpy(info->bus_info, "batman", sizeof(info->bus_info));
}
/* Inspired by drivers/net/ethernet/dlink/sundance.c:1702
@@ -981,6 +926,18 @@ static const struct {
{ "tt_response_rx" },
{ "tt_roam_adv_tx" },
{ "tt_roam_adv_rx" },
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ { "mcast_tx" },
+ { "mcast_tx_bytes" },
+ { "mcast_tx_local" },
+ { "mcast_tx_local_bytes" },
+ { "mcast_rx" },
+ { "mcast_rx_bytes" },
+ { "mcast_rx_local" },
+ { "mcast_rx_local_bytes" },
+ { "mcast_fwd" },
+ { "mcast_fwd_bytes" },
+#endif
#ifdef CONFIG_BATMAN_ADV_DAT
{ "dat_get_tx" },
{ "dat_get_rx" },
@@ -988,17 +945,6 @@ static const struct {
{ "dat_put_rx" },
{ "dat_cached_reply_tx" },
#endif
-#ifdef CONFIG_BATMAN_ADV_NC
- { "nc_code" },
- { "nc_code_bytes" },
- { "nc_recode" },
- { "nc_recode_bytes" },
- { "nc_buffer" },
- { "nc_decode" },
- { "nc_decode_bytes" },
- { "nc_decode_failed" },
- { "nc_sniffed" },
-#endif
};
static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data)
@@ -1035,12 +981,11 @@ static const struct ethtool_ops batadv_ethtool_ops = {
};
/**
- * batadv_softif_free() - Deconstructor of batadv_soft_interface
+ * batadv_meshif_free() - Deconstructor of batadv_mesh_interface
* @dev: Device to cleanup and remove
*/
-static void batadv_softif_free(struct net_device *dev)
+static void batadv_meshif_free(struct net_device *dev)
{
- batadv_debugfs_del_meshif(dev);
batadv_mesh_free(dev);
/* some scheduled RCU callbacks need the bat_priv struct to accomplish
@@ -1051,23 +996,26 @@ static void batadv_softif_free(struct net_device *dev)
}
/**
- * batadv_softif_init_early() - early stage initialization of soft interface
+ * batadv_meshif_init_early() - early stage initialization of mesh interface
* @dev: registered network device to modify
*/
-static void batadv_softif_init_early(struct net_device *dev)
+static void batadv_meshif_init_early(struct net_device *dev)
{
ether_setup(dev);
dev->netdev_ops = &batadv_netdev_ops;
dev->needs_free_netdev = true;
- dev->priv_destructor = batadv_softif_free;
- dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL;
+ dev->priv_destructor = batadv_meshif_free;
+ dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
dev->priv_flags |= IFF_NO_QUEUE;
+ dev->lltx = true;
+ dev->netns_immutable = true;
/* can't call min_mtu, because the needed variables
* have not been initialized yet
*/
dev->mtu = ETH_DATA_LEN;
+ dev->max_mtu = BATADV_MAX_MTU;
/* generate random address */
eth_hw_addr_random(dev);
@@ -1076,96 +1024,92 @@ static void batadv_softif_init_early(struct net_device *dev)
}
/**
- * batadv_softif_create() - Create and register soft interface
- * @net: the applicable net namespace
- * @name: name of the new soft interface
+ * batadv_meshif_validate() - validate configuration of new batadv link
+ * @tb: IFLA_INFO_DATA netlink attributes
+ * @data: enum batadv_ifla_attrs attributes
+ * @extack: extended ACK report struct
*
- * Return: newly allocated soft_interface, NULL on errors
+ * Return: 0 if successful or error otherwise.
*/
-struct net_device *batadv_softif_create(struct net *net, const char *name)
+static int batadv_meshif_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
{
- struct net_device *soft_iface;
- int ret;
-
- soft_iface = alloc_netdev(sizeof(struct batadv_priv), name,
- NET_NAME_UNKNOWN, batadv_softif_init_early);
- if (!soft_iface)
- return NULL;
+ struct batadv_algo_ops *algo_ops;
- dev_net_set(soft_iface, net);
-
- soft_iface->rtnl_link_ops = &batadv_link_ops;
+ if (!data)
+ return 0;
- ret = register_netdevice(soft_iface);
- if (ret < 0) {
- pr_err("Unable to register the batman interface '%s': %i\n",
- name, ret);
- free_netdev(soft_iface);
- return NULL;
+ if (data[IFLA_BATADV_ALGO_NAME]) {
+ algo_ops = batadv_algo_get(nla_data(data[IFLA_BATADV_ALGO_NAME]));
+ if (!algo_ops)
+ return -EINVAL;
}
- return soft_iface;
+ return 0;
}
/**
- * batadv_softif_destroy_sysfs() - deletion of batadv_soft_interface via sysfs
- * @soft_iface: the to-be-removed batman-adv interface
+ * batadv_meshif_newlink() - pre-initialize and register new batadv link
+ * @dev: network device to register
+ * @params: rtnl newlink parameters
+ * @extack: extended ACK report struct
+ *
+ * Return: 0 if successful or error otherwise.
*/
-void batadv_softif_destroy_sysfs(struct net_device *soft_iface)
+static int batadv_meshif_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
{
- struct batadv_priv *bat_priv = netdev_priv(soft_iface);
- struct batadv_softif_vlan *vlan;
-
- ASSERT_RTNL();
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ struct nlattr **data = params->data;
+ const char *algo_name;
+ int err;
- /* destroy the "untagged" VLAN */
- vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS);
- if (vlan) {
- batadv_softif_destroy_vlan(bat_priv, vlan);
- batadv_softif_vlan_put(vlan);
+ if (data && data[IFLA_BATADV_ALGO_NAME]) {
+ algo_name = nla_data(data[IFLA_BATADV_ALGO_NAME]);
+ err = batadv_algo_select(bat_priv, algo_name);
+ if (err)
+ return -EINVAL;
}
- batadv_sysfs_del_meshif(soft_iface);
- unregister_netdevice(soft_iface);
+ return register_netdevice(dev);
}
/**
- * batadv_softif_destroy_netlink() - deletion of batadv_soft_interface via
+ * batadv_meshif_destroy_netlink() - deletion of batadv_mesh_interface via
* netlink
- * @soft_iface: the to-be-removed batman-adv interface
+ * @mesh_iface: the to-be-removed batman-adv interface
* @head: list pointer
*/
-static void batadv_softif_destroy_netlink(struct net_device *soft_iface,
+static void batadv_meshif_destroy_netlink(struct net_device *mesh_iface,
struct list_head *head)
{
- struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
struct batadv_hard_iface *hard_iface;
- struct batadv_softif_vlan *vlan;
+ struct batadv_meshif_vlan *vlan;
- list_for_each_entry(hard_iface, &batadv_hardif_list, list) {
- if (hard_iface->soft_iface == soft_iface)
- batadv_hardif_disable_interface(hard_iface,
- BATADV_IF_CLEANUP_KEEP);
+ while (!list_empty(&mesh_iface->adj_list.lower)) {
+ hard_iface = netdev_adjacent_get_private(mesh_iface->adj_list.lower.next);
+ batadv_hardif_disable_interface(hard_iface);
}
/* destroy the "untagged" VLAN */
- vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS);
+ vlan = batadv_meshif_vlan_get(bat_priv, BATADV_NO_FLAGS);
if (vlan) {
- batadv_softif_destroy_vlan(bat_priv, vlan);
- batadv_softif_vlan_put(vlan);
+ batadv_meshif_destroy_vlan(bat_priv, vlan);
+ batadv_meshif_vlan_put(vlan);
}
- batadv_sysfs_del_meshif(soft_iface);
- unregister_netdevice_queue(soft_iface, head);
+ unregister_netdevice_queue(mesh_iface, head);
}
/**
- * batadv_softif_is_valid() - Check whether device is a batadv soft interface
+ * batadv_meshif_is_valid() - Check whether device is a batadv mesh interface
* @net_dev: device which should be checked
*
* Return: true when net_dev is a batman-adv interface, false otherwise
*/
-bool batadv_softif_is_valid(const struct net_device *net_dev)
+bool batadv_meshif_is_valid(const struct net_device *net_dev)
{
if (net_dev->netdev_ops->ndo_start_xmit == batadv_interface_tx)
return true;
@@ -1173,9 +1117,17 @@ bool batadv_softif_is_valid(const struct net_device *net_dev)
return false;
}
+static const struct nla_policy batadv_ifla_policy[IFLA_BATADV_MAX + 1] = {
+ [IFLA_BATADV_ALGO_NAME] = { .type = NLA_NUL_STRING },
+};
+
struct rtnl_link_ops batadv_link_ops __read_mostly = {
.kind = "batadv",
.priv_size = sizeof(struct batadv_priv),
- .setup = batadv_softif_init_early,
- .dellink = batadv_softif_destroy_netlink,
+ .setup = batadv_meshif_init_early,
+ .maxtype = IFLA_BATADV_MAX,
+ .policy = batadv_ifla_policy,
+ .validate = batadv_meshif_validate,
+ .newlink = batadv_meshif_newlink,
+ .dellink = batadv_meshif_destroy_netlink,
};
diff --git a/net/batman-adv/mesh-interface.h b/net/batman-adv/mesh-interface.h
new file mode 100644
index 000000000000..53756c5a45e0
--- /dev/null
+++ b/net/batman-adv/mesh-interface.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Marek Lindner
+ */
+
+#ifndef _NET_BATMAN_ADV_MESH_INTERFACE_H_
+#define _NET_BATMAN_ADV_MESH_INTERFACE_H_
+
+#include "main.h"
+
+#include <linux/kref.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+int batadv_skb_head_push(struct sk_buff *skb, unsigned int len);
+void batadv_interface_rx(struct net_device *mesh_iface,
+ struct sk_buff *skb, int hdr_size,
+ struct batadv_orig_node *orig_node);
+bool batadv_meshif_is_valid(const struct net_device *net_dev);
+extern struct rtnl_link_ops batadv_link_ops;
+int batadv_meshif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid);
+void batadv_meshif_vlan_release(struct kref *ref);
+struct batadv_meshif_vlan *batadv_meshif_vlan_get(struct batadv_priv *bat_priv,
+ unsigned short vid);
+
+/**
+ * batadv_meshif_vlan_put() - decrease the vlan object refcounter and
+ * possibly release it
+ * @vlan: the vlan object to release
+ */
+static inline void batadv_meshif_vlan_put(struct batadv_meshif_vlan *vlan)
+{
+ if (!vlan)
+ return;
+
+ kref_put(&vlan->refcount, batadv_meshif_vlan_release);
+}
+
+#endif /* _NET_BATMAN_ADV_MESH_INTERFACE_H_ */
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 86725d792e15..e8c6b0bf670f 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2014-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Linus Lüssing
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "multicast.h"
@@ -23,6 +11,8 @@
#include <linux/bitops.h>
#include <linux/bug.h>
#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
+#include <linux/err.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
@@ -32,11 +22,10 @@
#include <linux/igmp.h>
#include <linux/in.h>
#include <linux/in6.h>
+#include <linux/inetdevice.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
@@ -44,10 +33,10 @@
#include <linux/printk.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/sprintf.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/types.h>
@@ -58,15 +47,15 @@
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/netlink.h>
-#include <net/sock.h>
#include <uapi/linux/batadv_packet.h>
#include <uapi/linux/batman_adv.h>
+#include "bridge_loop_avoidance.h"
#include "hard-interface.h"
#include "hash.h"
#include "log.h"
#include "netlink.h"
-#include "soft-interface.h"
+#include "send.h"
#include "translation-table.h"
#include "tvlv.h"
@@ -74,7 +63,7 @@ static void batadv_mcast_mla_update(struct work_struct *work);
/**
* batadv_mcast_start_timer() - schedule the multicast periodic worker
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
static void batadv_mcast_start_timer(struct batadv_priv *bat_priv)
{
@@ -83,95 +72,331 @@ static void batadv_mcast_start_timer(struct batadv_priv *bat_priv)
}
/**
- * batadv_mcast_get_bridge() - get the bridge on top of the softif if it exists
- * @soft_iface: netdev struct of the mesh interface
+ * batadv_mcast_get_bridge() - get the bridge on top of the meshif if it exists
+ * @mesh_iface: netdev struct of the mesh interface
*
- * If the given soft interface has a bridge on top then the refcount
+ * If the given mesh interface has a bridge on top then the refcount
* of the according net device is increased.
*
* Return: NULL if no such bridge exists. Otherwise the net device of the
* bridge.
*/
-static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface)
+static struct net_device *batadv_mcast_get_bridge(struct net_device *mesh_iface)
{
- struct net_device *upper = soft_iface;
+ struct net_device *upper = mesh_iface;
rcu_read_lock();
do {
upper = netdev_master_upper_dev_get_rcu(upper);
- } while (upper && !(upper->priv_flags & IFF_EBRIDGE));
+ } while (upper && !netif_is_bridge_master(upper));
- if (upper)
- dev_hold(upper);
+ dev_hold(upper);
rcu_read_unlock();
return upper;
}
/**
- * batadv_mcast_addr_is_ipv4() - check if multicast MAC is IPv4
- * @addr: the MAC address to check
+ * batadv_mcast_mla_rtr_flags_meshif_get_ipv4() - get mcast router flags from
+ * node for IPv4
+ * @dev: the interface to check
*
- * Return: True, if MAC address is one reserved for IPv4 multicast, false
- * otherwise.
+ * Checks the presence of an IPv4 multicast router on this node.
+ *
+ * Caller needs to hold rcu read lock.
+ *
+ * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR4 otherwise.
*/
-static bool batadv_mcast_addr_is_ipv4(const u8 *addr)
+static u8 batadv_mcast_mla_rtr_flags_meshif_get_ipv4(struct net_device *dev)
{
- static const u8 prefix[] = {0x01, 0x00, 0x5E};
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
- return memcmp(prefix, addr, sizeof(prefix)) == 0;
+ if (in_dev && IN_DEV_MFORWARD(in_dev))
+ return BATADV_NO_FLAGS;
+ else
+ return BATADV_MCAST_WANT_NO_RTR4;
}
/**
- * batadv_mcast_addr_is_ipv6() - check if multicast MAC is IPv6
- * @addr: the MAC address to check
+ * batadv_mcast_mla_rtr_flags_meshif_get_ipv6() - get mcast router flags from
+ * node for IPv6
+ * @dev: the interface to check
*
- * Return: True, if MAC address is one reserved for IPv6 multicast, false
- * otherwise.
+ * Checks the presence of an IPv6 multicast router on this node.
+ *
+ * Caller needs to hold rcu read lock.
+ *
+ * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR6 otherwise.
+ */
+#if IS_ENABLED(CONFIG_IPV6_MROUTE)
+static u8 batadv_mcast_mla_rtr_flags_meshif_get_ipv6(struct net_device *dev)
+{
+ struct inet6_dev *in6_dev = __in6_dev_get(dev);
+
+ if (in6_dev && atomic_read(&in6_dev->cnf.mc_forwarding))
+ return BATADV_NO_FLAGS;
+ else
+ return BATADV_MCAST_WANT_NO_RTR6;
+}
+#else
+static inline u8
+batadv_mcast_mla_rtr_flags_meshif_get_ipv6(struct net_device *dev)
+{
+ return BATADV_MCAST_WANT_NO_RTR6;
+}
+#endif
+
+/**
+ * batadv_mcast_mla_rtr_flags_meshif_get() - get mcast router flags from node
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @bridge: bridge interface on top of the mesh_iface if present,
+ * otherwise pass NULL
+ *
+ * Checks the presence of IPv4 and IPv6 multicast routers on this
+ * node.
+ *
+ * Return:
+ * BATADV_NO_FLAGS: Both an IPv4 and an IPv6 multicast router are present
+ * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present
+ * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present
+ * The latter two OR'd: no multicast router is present
+ */
+static u8 batadv_mcast_mla_rtr_flags_meshif_get(struct batadv_priv *bat_priv,
+ struct net_device *bridge)
+{
+ struct net_device *dev = bridge ? bridge : bat_priv->mesh_iface;
+ u8 flags = BATADV_NO_FLAGS;
+
+ rcu_read_lock();
+
+ flags |= batadv_mcast_mla_rtr_flags_meshif_get_ipv4(dev);
+ flags |= batadv_mcast_mla_rtr_flags_meshif_get_ipv6(dev);
+
+ rcu_read_unlock();
+
+ return flags;
+}
+
+/**
+ * batadv_mcast_mla_rtr_flags_bridge_get() - get mcast router flags from bridge
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @bridge: bridge interface on top of the mesh_iface if present,
+ * otherwise pass NULL
+ *
+ * Checks the presence of IPv4 and IPv6 multicast routers behind a bridge.
+ *
+ * Return:
+ * BATADV_NO_FLAGS: Both an IPv4 and an IPv6 multicast router are present
+ * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present
+ * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present
+ * The latter two OR'd: no multicast router is present
+ */
+static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv,
+ struct net_device *bridge)
+{
+ struct net_device *dev = bat_priv->mesh_iface;
+ u8 flags = BATADV_NO_FLAGS;
+
+ if (!bridge)
+ return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6;
+
+ if (!br_multicast_has_router_adjacent(dev, ETH_P_IP))
+ flags |= BATADV_MCAST_WANT_NO_RTR4;
+ if (!br_multicast_has_router_adjacent(dev, ETH_P_IPV6))
+ flags |= BATADV_MCAST_WANT_NO_RTR6;
+
+ return flags;
+}
+
+/**
+ * batadv_mcast_mla_rtr_flags_get() - get multicast router flags
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @bridge: bridge interface on top of the mesh_iface if present,
+ * otherwise pass NULL
+ *
+ * Checks the presence of IPv4 and IPv6 multicast routers on this
+ * node or behind its bridge.
+ *
+ * Return:
+ * BATADV_NO_FLAGS: Both an IPv4 and an IPv6 multicast router are present
+ * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present
+ * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present
+ * The latter two OR'd: no multicast router is present
+ */
+static u8 batadv_mcast_mla_rtr_flags_get(struct batadv_priv *bat_priv,
+ struct net_device *bridge)
+{
+ u8 flags = BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6;
+
+ flags &= batadv_mcast_mla_rtr_flags_meshif_get(bat_priv, bridge);
+ flags &= batadv_mcast_mla_rtr_flags_bridge_get(bat_priv, bridge);
+
+ return flags;
+}
+
+/**
+ * batadv_mcast_mla_forw_flags_get() - get multicast forwarding flags
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Checks if all active hard interfaces have an MTU greater than or equal to
+ * 1280 bytes (IPv6 minimum MTU).
+ *
+ * Return: BATADV_MCAST_HAVE_MC_PTYPE_CAPA if yes, BATADV_NO_FLAGS otherwise.
+ */
+static u8 batadv_mcast_mla_forw_flags_get(struct batadv_priv *bat_priv)
+{
+ const struct batadv_hard_iface *hard_iface;
+ struct list_head *iter;
+
+ rcu_read_lock();
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ continue;
+
+ if (hard_iface->net_dev->mtu < IPV6_MIN_MTU) {
+ rcu_read_unlock();
+ return BATADV_NO_FLAGS;
+ }
+ }
+ rcu_read_unlock();
+
+ return BATADV_MCAST_HAVE_MC_PTYPE_CAPA;
+}
+
+/**
+ * batadv_mcast_mla_flags_get() - get the new multicast flags
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: A set of flags for the current/next TVLV, querier and
+ * bridge state.
+ */
+static struct batadv_mcast_mla_flags
+batadv_mcast_mla_flags_get(struct batadv_priv *bat_priv)
+{
+ struct net_device *dev = bat_priv->mesh_iface;
+ struct batadv_mcast_querier_state *qr4, *qr6;
+ struct batadv_mcast_mla_flags mla_flags;
+ struct net_device *bridge;
+
+ bridge = batadv_mcast_get_bridge(dev);
+
+ memset(&mla_flags, 0, sizeof(mla_flags));
+ mla_flags.enabled = 1;
+ mla_flags.tvlv_flags |= batadv_mcast_mla_rtr_flags_get(bat_priv,
+ bridge);
+ mla_flags.tvlv_flags |= batadv_mcast_mla_forw_flags_get(bat_priv);
+
+ if (!bridge)
+ return mla_flags;
+
+ dev_put(bridge);
+
+ mla_flags.bridged = 1;
+ qr4 = &mla_flags.querier_ipv4;
+ qr6 = &mla_flags.querier_ipv6;
+
+ if (!IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING))
+ pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n");
+
+ qr4->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP);
+ qr4->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP);
+
+ qr6->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6);
+ qr6->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6);
+
+ mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES;
+
+ /* 1) If no querier exists at all, then multicast listeners on
+ * our local TT clients behind the bridge will keep silent.
+ * 2) If the selected querier is on one of our local TT clients,
+ * behind the bridge, then this querier might shadow multicast
+ * listeners on our local TT clients, behind this bridge.
+ *
+ * In both cases, we will signal to other batman nodes that
+ * we need all multicast traffic of the respective protocol.
+ */
+ if (!qr4->exists || qr4->shadowing) {
+ mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV4;
+ mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR4;
+ }
+
+ if (!qr6->exists || qr6->shadowing) {
+ mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV6;
+ mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR6;
+ }
+
+ return mla_flags;
+}
+
+/**
+ * batadv_mcast_mla_is_duplicate() - check whether an address is in a list
+ * @mcast_addr: the multicast address to check
+ * @mcast_list: the list with multicast addresses to search in
+ *
+ * Return: true if the given address is already in the given list.
+ * Otherwise returns false.
*/
-static bool batadv_mcast_addr_is_ipv6(const u8 *addr)
+static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr,
+ struct hlist_head *mcast_list)
{
- static const u8 prefix[] = {0x33, 0x33};
+ struct batadv_hw_addr *mcast_entry;
- return memcmp(prefix, addr, sizeof(prefix)) == 0;
+ hlist_for_each_entry(mcast_entry, mcast_list, list)
+ if (batadv_compare_eth(mcast_entry->addr, mcast_addr))
+ return true;
+
+ return false;
}
/**
- * batadv_mcast_mla_softif_get() - get softif multicast listeners
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_mcast_mla_meshif_get_ipv4() - get meshif IPv4 multicast listeners
* @dev: the device to collect multicast addresses from
* @mcast_list: a list to put found addresses into
+ * @flags: flags indicating the new multicast state
*
- * Collects multicast addresses of multicast listeners residing
- * on this kernel on the given soft interface, dev, in
+ * Collects multicast addresses of IPv4 multicast listeners residing
+ * on this kernel on the given mesh interface, dev, in
* the given mcast_list. In general, multicast listeners provided by
* your multicast receiving applications run directly on this node.
*
- * If there is a bridge interface on top of dev, collects from that one
- * instead. Just like with IP addresses and routes, multicast listeners
- * will(/should) register to the bridge interface instead of an
- * enslaved bat0.
- *
* Return: -ENOMEM on memory allocation error or the number of
* items added to the mcast_list otherwise.
*/
-static int batadv_mcast_mla_softif_get(struct batadv_priv *bat_priv,
- struct net_device *dev,
- struct hlist_head *mcast_list)
+static int
+batadv_mcast_mla_meshif_get_ipv4(struct net_device *dev,
+ struct hlist_head *mcast_list,
+ struct batadv_mcast_mla_flags *flags)
{
- bool all_ipv4 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV4;
- bool all_ipv6 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV6;
- struct net_device *bridge = batadv_mcast_get_bridge(dev);
- struct netdev_hw_addr *mc_list_entry;
struct batadv_hw_addr *new;
+ struct in_device *in_dev;
+ u8 mcast_addr[ETH_ALEN];
+ struct ip_mc_list *pmc;
int ret = 0;
- netif_addr_lock_bh(bridge ? bridge : dev);
- netdev_for_each_mc_addr(mc_list_entry, bridge ? bridge : dev) {
- if (all_ipv4 && batadv_mcast_addr_is_ipv4(mc_list_entry->addr))
+ if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4)
+ return 0;
+
+ rcu_read_lock();
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for (pmc = rcu_dereference(in_dev->mc_list); pmc;
+ pmc = rcu_dereference(pmc->next_rcu)) {
+ if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
+ ipv4_is_local_multicast(pmc->multiaddr))
+ continue;
+
+ if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) &&
+ !ipv4_is_local_multicast(pmc->multiaddr))
continue;
- if (all_ipv6 && batadv_mcast_addr_is_ipv6(mc_list_entry->addr))
+ ip_eth_mc_map(pmc->multiaddr, mcast_addr);
+
+ if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list))
continue;
new = kmalloc(sizeof(*new), GFP_ATOMIC);
@@ -180,36 +405,141 @@ static int batadv_mcast_mla_softif_get(struct batadv_priv *bat_priv,
break;
}
- ether_addr_copy(new->addr, mc_list_entry->addr);
+ ether_addr_copy(new->addr, mcast_addr);
hlist_add_head(&new->list, mcast_list);
ret++;
}
- netif_addr_unlock_bh(bridge ? bridge : dev);
+ rcu_read_unlock();
- if (bridge)
- dev_put(bridge);
+ return ret;
+}
+
+/**
+ * batadv_mcast_mla_meshif_get_ipv6() - get meshif IPv6 multicast listeners
+ * @dev: the device to collect multicast addresses from
+ * @mcast_list: a list to put found addresses into
+ * @flags: flags indicating the new multicast state
+ *
+ * Collects multicast addresses of IPv6 multicast listeners residing
+ * on this kernel on the given mesh interface, dev, in
+ * the given mcast_list. In general, multicast listeners provided by
+ * your multicast receiving applications run directly on this node.
+ *
+ * Return: -ENOMEM on memory allocation error or the number of
+ * items added to the mcast_list otherwise.
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+static int
+batadv_mcast_mla_meshif_get_ipv6(struct net_device *dev,
+ struct hlist_head *mcast_list,
+ struct batadv_mcast_mla_flags *flags)
+{
+ struct batadv_hw_addr *new;
+ struct inet6_dev *in6_dev;
+ u8 mcast_addr[ETH_ALEN];
+ struct ifmcaddr6 *pmc6;
+ int ret = 0;
+
+ if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6)
+ return 0;
+
+ rcu_read_lock();
+
+ in6_dev = __in6_dev_get(dev);
+ if (!in6_dev) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ for (pmc6 = rcu_dereference(in6_dev->mc_list);
+ pmc6;
+ pmc6 = rcu_dereference(pmc6->next)) {
+ if (IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) <
+ IPV6_ADDR_SCOPE_LINKLOCAL)
+ continue;
+
+ if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
+ ipv6_addr_is_ll_all_nodes(&pmc6->mca_addr))
+ continue;
+
+ if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) &&
+ IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) >
+ IPV6_ADDR_SCOPE_LINKLOCAL)
+ continue;
+
+ ipv6_eth_mc_map(&pmc6->mca_addr, mcast_addr);
+
+ if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list))
+ continue;
+
+ new = kmalloc(sizeof(*new), GFP_ATOMIC);
+ if (!new) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ ether_addr_copy(new->addr, mcast_addr);
+ hlist_add_head(&new->list, mcast_list);
+ ret++;
+ }
+ rcu_read_unlock();
return ret;
}
+#else
+static inline int
+batadv_mcast_mla_meshif_get_ipv6(struct net_device *dev,
+ struct hlist_head *mcast_list,
+ struct batadv_mcast_mla_flags *flags)
+{
+ return 0;
+}
+#endif
/**
- * batadv_mcast_mla_is_duplicate() - check whether an address is in a list
- * @mcast_addr: the multicast address to check
- * @mcast_list: the list with multicast addresses to search in
+ * batadv_mcast_mla_meshif_get() - get meshif multicast listeners
+ * @dev: the device to collect multicast addresses from
+ * @mcast_list: a list to put found addresses into
+ * @flags: flags indicating the new multicast state
*
- * Return: true if the given address is already in the given list.
- * Otherwise returns false.
+ * Collects multicast addresses of multicast listeners residing
+ * on this kernel on the given mesh interface, dev, in
+ * the given mcast_list. In general, multicast listeners provided by
+ * your multicast receiving applications run directly on this node.
+ *
+ * If there is a bridge interface on top of dev, collect from that one
+ * instead. Just like with IP addresses and routes, multicast listeners
+ * will(/should) register to the bridge interface instead of an
+ * enslaved bat0.
+ *
+ * Return: -ENOMEM on memory allocation error or the number of
+ * items added to the mcast_list otherwise.
*/
-static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr,
- struct hlist_head *mcast_list)
+static int
+batadv_mcast_mla_meshif_get(struct net_device *dev,
+ struct hlist_head *mcast_list,
+ struct batadv_mcast_mla_flags *flags)
{
- struct batadv_hw_addr *mcast_entry;
+ struct net_device *bridge = batadv_mcast_get_bridge(dev);
+ int ret4, ret6 = 0;
- hlist_for_each_entry(mcast_entry, mcast_list, list)
- if (batadv_compare_eth(mcast_entry->addr, mcast_addr))
- return true;
+ if (bridge)
+ dev = bridge;
- return false;
+ ret4 = batadv_mcast_mla_meshif_get_ipv4(dev, mcast_list, flags);
+ if (ret4 < 0)
+ goto out;
+
+ ret6 = batadv_mcast_mla_meshif_get_ipv6(dev, mcast_list, flags);
+ if (ret6 < 0) {
+ ret4 = 0;
+ goto out;
+ }
+
+out:
+ dev_put(bridge);
+
+ return ret4 + ret6;
}
/**
@@ -227,10 +557,10 @@ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr,
static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src)
{
if (src->proto == htons(ETH_P_IP))
- ip_eth_mc_map(src->u.ip4, dst);
+ ip_eth_mc_map(src->dst.ip4, dst);
#if IS_ENABLED(CONFIG_IPV6)
else if (src->proto == htons(ETH_P_IPV6))
- ipv6_eth_mc_map(&src->u.ip6, dst);
+ ipv6_eth_mc_map(&src->dst.ip6, dst);
#endif
else
eth_zero_addr(dst);
@@ -238,26 +568,25 @@ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src)
/**
* batadv_mcast_mla_bridge_get() - get bridged-in multicast listeners
- * @bat_priv: the bat priv with all the soft interface information
* @dev: a bridge slave whose bridge to collect multicast addresses from
* @mcast_list: a list to put found addresses into
+ * @flags: flags indicating the new multicast state
*
* Collects multicast addresses of multicast listeners residing
* on foreign, non-mesh devices which we gave access to our mesh via
- * a bridge on top of the given soft interface, dev, in the given
+ * a bridge on top of the given mesh interface, dev, in the given
* mcast_list.
*
* Return: -ENOMEM on memory allocation error or the number of
* items added to the mcast_list otherwise.
*/
-static int batadv_mcast_mla_bridge_get(struct batadv_priv *bat_priv,
- struct net_device *dev,
- struct hlist_head *mcast_list)
+static int batadv_mcast_mla_bridge_get(struct net_device *dev,
+ struct hlist_head *mcast_list,
+ struct batadv_mcast_mla_flags *flags)
{
struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list);
- bool all_ipv4 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV4;
- bool all_ipv6 = bat_priv->mcast.flags & BATADV_MCAST_WANT_ALL_IPV6;
struct br_ip_list *br_ip_entry, *tmp;
+ u8 tvlv_flags = flags->tvlv_flags;
struct batadv_hw_addr *new;
u8 mcast_addr[ETH_ALEN];
int ret;
@@ -270,11 +599,34 @@ static int batadv_mcast_mla_bridge_get(struct batadv_priv *bat_priv,
goto out;
list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) {
- if (all_ipv4 && br_ip_entry->addr.proto == htons(ETH_P_IP))
- continue;
+ if (br_ip_entry->addr.proto == htons(ETH_P_IP)) {
+ if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4)
+ continue;
- if (all_ipv6 && br_ip_entry->addr.proto == htons(ETH_P_IPV6))
- continue;
+ if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
+ ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4))
+ continue;
+
+ if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) &&
+ !ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4))
+ continue;
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (br_ip_entry->addr.proto == htons(ETH_P_IPV6)) {
+ if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6)
+ continue;
+
+ if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES &&
+ ipv6_addr_is_ll_all_nodes(&br_ip_entry->addr.dst.ip6))
+ continue;
+
+ if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) &&
+ IPV6_ADDR_MC_SCOPE(&br_ip_entry->addr.dst.ip6) >
+ IPV6_ADDR_SCOPE_LINKLOCAL)
+ continue;
+ }
+#endif
batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr);
if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list))
@@ -318,15 +670,13 @@ static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list)
/**
* batadv_mcast_mla_tt_retract() - clean up multicast listener announcements
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @mcast_list: a list of addresses which should _not_ be removed
*
* Retracts the announcement of any multicast listener from the
* translation table except the ones listed in the given mcast_list.
*
* If mcast_list is NULL then all are retracted.
- *
- * Do not call outside of the mcast worker! (or cancel mcast worker first)
*/
static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
struct hlist_head *mcast_list)
@@ -334,8 +684,6 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
struct batadv_hw_addr *mcast_entry;
struct hlist_node *tmp;
- WARN_ON(delayed_work_pending(&bat_priv->mcast.work));
-
hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list,
list) {
if (mcast_list &&
@@ -354,13 +702,11 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv,
/**
* batadv_mcast_mla_tt_add() - add multicast listener announcements
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @mcast_list: a list of addresses which are going to get added
*
* Adds multicast listener announcements from the given mcast_list to the
* translation table if they have not been added yet.
- *
- * Do not call outside of the mcast worker! (or cancel mcast worker first)
*/
static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
struct hlist_head *mcast_list)
@@ -368,8 +714,6 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
struct batadv_hw_addr *mcast_entry;
struct hlist_node *tmp;
- WARN_ON(delayed_work_pending(&bat_priv->mcast.work));
-
if (!mcast_list)
return;
@@ -378,7 +722,7 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
&bat_priv->mcast.mla_list))
continue;
- if (!batadv_tt_local_add(bat_priv->soft_iface,
+ if (!batadv_tt_local_add(bat_priv->mesh_iface,
mcast_entry->addr, BATADV_NO_FLAGS,
BATADV_NULL_IFINDEX, BATADV_NO_MARK))
continue;
@@ -389,30 +733,9 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
}
/**
- * batadv_mcast_has_bridge() - check whether the soft-iface is bridged
- * @bat_priv: the bat priv with all the soft interface information
- *
- * Checks whether there is a bridge on top of our soft interface.
- *
- * Return: true if there is a bridge, false otherwise.
- */
-static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv)
-{
- struct net_device *upper = bat_priv->soft_iface;
-
- rcu_read_lock();
- do {
- upper = netdev_master_upper_dev_get_rcu(upper);
- } while (upper && !(upper->priv_flags & IFF_EBRIDGE));
- rcu_read_unlock();
-
- return upper;
-}
-
-/**
* batadv_mcast_querier_log() - debug output regarding the querier status on
* link
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @str_proto: a string for the querier protocol (e.g. "IGMP" or "MLD")
* @old_state: the previous querier state on our link
* @new_state: the new querier state on our link
@@ -429,7 +752,7 @@ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv)
* potentially shadowing listeners from us then.
*
* This is only interesting for nodes with a bridge on top of their
- * soft interface.
+ * mesh interface.
*/
static void
batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto,
@@ -437,14 +760,14 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto,
struct batadv_mcast_querier_state *new_state)
{
if (!old_state->exists && new_state->exists)
- batadv_info(bat_priv->soft_iface, "%s Querier appeared\n",
+ batadv_info(bat_priv->mesh_iface, "%s Querier appeared\n",
str_proto);
else if (old_state->exists && !new_state->exists)
- batadv_info(bat_priv->soft_iface,
+ batadv_info(bat_priv->mesh_iface,
"%s Querier disappeared - multicast optimizations disabled\n",
str_proto);
- else if (!bat_priv->mcast.bridged && !new_state->exists)
- batadv_info(bat_priv->soft_iface,
+ else if (!bat_priv->mcast.mla_flags.bridged && !new_state->exists)
+ batadv_info(bat_priv->mesh_iface,
"No %s Querier present - multicast optimizations disabled\n",
str_proto);
@@ -464,10 +787,8 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto,
/**
* batadv_mcast_bridge_log() - debug output for topology changes in bridged
* setups
- * @bat_priv: the bat priv with all the soft interface information
- * @bridged: a flag about whether the soft interface is currently bridged or not
- * @querier_ipv4: (maybe) new status of a potential, selected IGMP querier
- * @querier_ipv6: (maybe) new status of a potential, selected MLD querier
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @new_flags: flags indicating the new multicast state
*
* If no bridges are ever used on this node, then this function does nothing.
*
@@ -475,136 +796,98 @@ batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto,
* which might be relevant to our multicast optimizations.
*
* More precisely, it outputs information when a bridge interface is added or
- * removed from a soft interface. And when a bridge is present, it further
+ * removed from a mesh interface. And when a bridge is present, it further
* outputs information about the querier state which is relevant for the
* multicast flags this node is going to set.
*/
static void
-batadv_mcast_bridge_log(struct batadv_priv *bat_priv, bool bridged,
- struct batadv_mcast_querier_state *querier_ipv4,
- struct batadv_mcast_querier_state *querier_ipv6)
+batadv_mcast_bridge_log(struct batadv_priv *bat_priv,
+ struct batadv_mcast_mla_flags *new_flags)
{
- if (!bat_priv->mcast.bridged && bridged)
+ struct batadv_mcast_mla_flags *old_flags = &bat_priv->mcast.mla_flags;
+
+ if (!old_flags->bridged && new_flags->bridged)
batadv_dbg(BATADV_DBG_MCAST, bat_priv,
"Bridge added: Setting Unsnoopables(U)-flag\n");
- else if (bat_priv->mcast.bridged && !bridged)
+ else if (old_flags->bridged && !new_flags->bridged)
batadv_dbg(BATADV_DBG_MCAST, bat_priv,
"Bridge removed: Unsetting Unsnoopables(U)-flag\n");
- if (bridged) {
+ if (new_flags->bridged) {
batadv_mcast_querier_log(bat_priv, "IGMP",
- &bat_priv->mcast.querier_ipv4,
- querier_ipv4);
+ &old_flags->querier_ipv4,
+ &new_flags->querier_ipv4);
batadv_mcast_querier_log(bat_priv, "MLD",
- &bat_priv->mcast.querier_ipv6,
- querier_ipv6);
+ &old_flags->querier_ipv6,
+ &new_flags->querier_ipv6);
}
}
/**
- * batadv_mcast_flags_logs() - output debug information about mcast flag changes
- * @bat_priv: the bat priv with all the soft interface information
- * @flags: flags indicating the new multicast state
+ * batadv_mcast_flags_log() - output debug information about mcast flag changes
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @flags: TVLV flags indicating the new multicast state
*
- * Whenever the multicast flags this nodes announces changes (@mcast_flags vs.
- * bat_priv->mcast.flags), this notifies userspace via the 'mcast' log level.
+ * Whenever the multicast TVLV flags this node announces change, this function
+ * should be used to notify userspace about the change.
*/
static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags)
{
- u8 old_flags = bat_priv->mcast.flags;
- char str_old_flags[] = "[...]";
+ bool old_enabled = bat_priv->mcast.mla_flags.enabled;
+ u8 old_flags = bat_priv->mcast.mla_flags.tvlv_flags;
+ char str_old_flags[] = "[.... . .]";
- sprintf(str_old_flags, "[%c%c%c]",
+ sprintf(str_old_flags, "[%c%c%c%s%s%c]",
(old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.',
(old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.',
- (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.');
+ (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.',
+ !(old_flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ",
+ !(old_flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ",
+ !(old_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) ? 'P' : '.');
batadv_dbg(BATADV_DBG_MCAST, bat_priv,
- "Changing multicast flags from '%s' to '[%c%c%c]'\n",
- bat_priv->mcast.enabled ? str_old_flags : "<undefined>",
+ "Changing multicast flags from '%s' to '[%c%c%c%s%s%c]'\n",
+ old_enabled ? str_old_flags : "<undefined>",
(flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.',
(flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.',
- (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.');
+ (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.',
+ !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ",
+ !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". ",
+ !(flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) ? 'P' : '.');
}
/**
- * batadv_mcast_mla_tvlv_update() - update multicast tvlv
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_mcast_mla_flags_update() - update multicast flags
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @flags: flags indicating the new multicast state
*
* Updates the own multicast tvlv with our current multicast related settings,
* capabilities and inabilities.
- *
- * Return: false if we want all IPv4 && IPv6 multicast traffic and true
- * otherwise.
*/
-static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv)
+static void
+batadv_mcast_mla_flags_update(struct batadv_priv *bat_priv,
+ struct batadv_mcast_mla_flags *flags)
{
struct batadv_tvlv_mcast_data mcast_data;
- struct batadv_mcast_querier_state querier4 = {false, false};
- struct batadv_mcast_querier_state querier6 = {false, false};
- struct net_device *dev = bat_priv->soft_iface;
- bool bridged;
-
- mcast_data.flags = BATADV_NO_FLAGS;
- memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved));
-
- bridged = batadv_mcast_has_bridge(bat_priv);
- if (!bridged)
- goto update;
-
- if (!IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING))
- pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n");
-
- querier4.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP);
- querier4.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP);
-
- querier6.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6);
- querier6.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6);
-
- mcast_data.flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES;
-
- /* 1) If no querier exists at all, then multicast listeners on
- * our local TT clients behind the bridge will keep silent.
- * 2) If the selected querier is on one of our local TT clients,
- * behind the bridge, then this querier might shadow multicast
- * listeners on our local TT clients, behind this bridge.
- *
- * In both cases, we will signalize other batman nodes that
- * we need all multicast traffic of the according protocol.
- */
- if (!querier4.exists || querier4.shadowing)
- mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV4;
-
- if (!querier6.exists || querier6.shadowing)
- mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV6;
-update:
- batadv_mcast_bridge_log(bat_priv, bridged, &querier4, &querier6);
+ if (!memcmp(flags, &bat_priv->mcast.mla_flags, sizeof(*flags)))
+ return;
- bat_priv->mcast.querier_ipv4.exists = querier4.exists;
- bat_priv->mcast.querier_ipv4.shadowing = querier4.shadowing;
+ batadv_mcast_bridge_log(bat_priv, flags);
+ batadv_mcast_flags_log(bat_priv, flags->tvlv_flags);
- bat_priv->mcast.querier_ipv6.exists = querier6.exists;
- bat_priv->mcast.querier_ipv6.shadowing = querier6.shadowing;
+ mcast_data.flags = flags->tvlv_flags;
+ memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved));
- bat_priv->mcast.bridged = bridged;
+ batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2,
+ &mcast_data, sizeof(mcast_data));
- if (!bat_priv->mcast.enabled ||
- mcast_data.flags != bat_priv->mcast.flags) {
- batadv_mcast_flags_log(bat_priv, mcast_data.flags);
- batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2,
- &mcast_data, sizeof(mcast_data));
- bat_priv->mcast.flags = mcast_data.flags;
- bat_priv->mcast.enabled = true;
- }
-
- return !(mcast_data.flags & BATADV_MCAST_WANT_ALL_IPV4 &&
- mcast_data.flags & BATADV_MCAST_WANT_ALL_IPV6);
+ bat_priv->mcast.mla_flags = *flags;
}
/**
* __batadv_mcast_mla_update() - update the own MLAs
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
* Updates the own multicast listener announcements in the translation
* table as well as the own, announced multicast tvlv container.
@@ -616,24 +899,26 @@ update:
*/
static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv)
{
- struct net_device *soft_iface = bat_priv->soft_iface;
+ struct net_device *mesh_iface = bat_priv->mesh_iface;
struct hlist_head mcast_list = HLIST_HEAD_INIT;
+ struct batadv_mcast_mla_flags flags;
int ret;
- if (!batadv_mcast_mla_tvlv_update(bat_priv))
- goto update;
+ flags = batadv_mcast_mla_flags_get(bat_priv);
- ret = batadv_mcast_mla_softif_get(bat_priv, soft_iface, &mcast_list);
+ ret = batadv_mcast_mla_meshif_get(mesh_iface, &mcast_list, &flags);
if (ret < 0)
goto out;
- ret = batadv_mcast_mla_bridge_get(bat_priv, soft_iface, &mcast_list);
+ ret = batadv_mcast_mla_bridge_get(mesh_iface, &mcast_list, &flags);
if (ret < 0)
goto out;
-update:
+ spin_lock(&bat_priv->mcast.mla_lock);
batadv_mcast_mla_tt_retract(bat_priv, &mcast_list);
batadv_mcast_mla_tt_add(bat_priv, &mcast_list);
+ batadv_mcast_mla_flags_update(bat_priv, &flags);
+ spin_unlock(&bat_priv->mcast.mla_lock);
out:
batadv_mcast_mla_list_free(&mcast_list);
@@ -674,7 +959,7 @@ static void batadv_mcast_mla_update(struct work_struct *work)
*/
static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb)
{
- if (ip_mc_check_igmp(skb, NULL) < 0)
+ if (ip_mc_check_igmp(skb) < 0)
return false;
switch (igmp_hdr(skb)->type) {
@@ -690,9 +975,10 @@ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb)
/**
* batadv_mcast_forw_mode_check_ipv4() - check for optimized forwarding
* potential
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: the IPv4 packet to check
* @is_unsnoopable: stores whether the destination is snoopable
+ * @is_routable: stores whether the destination is routable
*
* Checks whether the given IPv4 packet has the potential to be forwarded with a
* mode more optimal than classic flooding.
@@ -702,7 +988,8 @@ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb)
*/
static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv,
struct sk_buff *skb,
- bool *is_unsnoopable)
+ bool *is_unsnoopable,
+ int *is_routable)
{
struct iphdr *iphdr;
@@ -715,16 +1002,13 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv,
iphdr = ip_hdr(skb);
- /* TODO: Implement Multicast Router Discovery (RFC4286),
- * then allow scope > link local, too
- */
- if (!ipv4_is_local_multicast(iphdr->daddr))
- return -EINVAL;
-
/* link-local multicast listeners behind a bridge are
* not snoopable (see RFC4541, section 2.1.2.2)
*/
- *is_unsnoopable = true;
+ if (ipv4_is_local_multicast(iphdr->daddr))
+ *is_unsnoopable = true;
+ else
+ *is_routable = ETH_P_IP;
return 0;
}
@@ -741,7 +1025,7 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv,
*/
static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb)
{
- if (ipv6_mc_check_mld(skb, NULL) < 0)
+ if (ipv6_mc_check_mld(skb) < 0)
return false;
switch (icmp6_hdr(skb)->icmp6_type) {
@@ -756,9 +1040,10 @@ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb)
/**
* batadv_mcast_forw_mode_check_ipv6() - check for optimized forwarding
* potential
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: the IPv6 packet to check
* @is_unsnoopable: stores whether the destination is snoopable
+ * @is_routable: stores whether the destination is routable
*
* Checks whether the given IPv6 packet has the potential to be forwarded with a
* mode more optimal than classic flooding.
@@ -767,7 +1052,8 @@ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb)
*/
static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
struct sk_buff *skb,
- bool *is_unsnoopable)
+ bool *is_unsnoopable,
+ int *is_routable)
{
struct ipv6hdr *ip6hdr;
@@ -780,10 +1066,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
ip6hdr = ipv6_hdr(skb);
- /* TODO: Implement Multicast Router Discovery (RFC4286),
- * then allow scope > link local, too
- */
- if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) != IPV6_ADDR_SCOPE_LINKLOCAL)
+ if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) < IPV6_ADDR_SCOPE_LINKLOCAL)
return -EINVAL;
/* link-local-all-nodes multicast listeners behind a bridge are
@@ -791,15 +1074,18 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
*/
if (ipv6_addr_is_ll_all_nodes(&ip6hdr->daddr))
*is_unsnoopable = true;
+ else if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) > IPV6_ADDR_SCOPE_LINKLOCAL)
+ *is_routable = ETH_P_IPV6;
return 0;
}
/**
* batadv_mcast_forw_mode_check() - check for optimized forwarding potential
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: the multicast frame to check
* @is_unsnoopable: stores whether the destination is snoopable
+ * @is_routable: stores whether the destination is routable
*
* Checks whether the given multicast ethernet frame has the potential to be
* forwarded with a mode more optimal than classic flooding.
@@ -808,7 +1094,8 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
*/
static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
struct sk_buff *skb,
- bool *is_unsnoopable)
+ bool *is_unsnoopable,
+ int *is_routable)
{
struct ethhdr *ethhdr = eth_hdr(skb);
@@ -818,13 +1105,15 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
switch (ntohs(ethhdr->h_proto)) {
case ETH_P_IP:
return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb,
- is_unsnoopable);
+ is_unsnoopable,
+ is_routable);
case ETH_P_IPV6:
if (!IS_ENABLED(CONFIG_IPV6))
return -EINVAL;
return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb,
- is_unsnoopable);
+ is_unsnoopable,
+ is_routable);
default:
return -EINVAL;
}
@@ -833,7 +1122,7 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
/**
* batadv_mcast_forw_want_all_ip_count() - count nodes with unspecific mcast
* interest
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @ethhdr: ethernet header of a packet
*
* Return: the number of nodes which want all IPv4 multicast traffic if the
@@ -855,188 +1144,457 @@ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv,
}
/**
- * batadv_mcast_forw_tt_node_get() - get a multicast tt node
- * @bat_priv: the bat priv with all the soft interface information
- * @ethhdr: the ether header containing the multicast destination
+ * batadv_mcast_forw_rtr_count() - count nodes with a multicast router
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @protocol: the ethernet protocol type to count multicast routers for
*
- * Return: an orig_node matching the multicast address provided by ethhdr
- * via a translation table lookup. This increases the returned nodes refcount.
+ * Return: the number of nodes which want all routable IPv4 multicast traffic
+ * if the protocol is ETH_P_IP or the number of nodes which want all routable
+ * IPv6 traffic if the protocol is ETH_P_IPV6. Otherwise returns 0.
*/
-static struct batadv_orig_node *
-batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv,
- struct ethhdr *ethhdr)
+
+static int batadv_mcast_forw_rtr_count(struct batadv_priv *bat_priv,
+ int protocol)
{
- return batadv_transtable_search(bat_priv, NULL, ethhdr->h_dest,
- BATADV_NO_FLAGS);
+ switch (protocol) {
+ case ETH_P_IP:
+ return atomic_read(&bat_priv->mcast.num_want_all_rtr4);
+ case ETH_P_IPV6:
+ return atomic_read(&bat_priv->mcast.num_want_all_rtr6);
+ default:
+ return 0;
+ }
}
/**
- * batadv_mcast_forw_ipv4_node_get() - get a node with an ipv4 flag
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_mcast_forw_mode_by_count() - get forwarding mode by count
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to check
+ * @vid: the vlan identifier
+ * @is_routable: whether the destination is routable
+ * @count: the number of originators the multicast packet needs to be sent to
+ *
+ * For a multicast packet with multiple destination originators, checks which
+ * mode to use. For BATADV_FORW_MCAST it also encapsulates the packet with a
+ * complete batman-adv multicast header.
+ *
+ * Return:
+ * BATADV_FORW_MCAST: If all nodes have multicast packet routing
+ * capabilities and an MTU >= 1280 on all hard interfaces (including us)
+ * and the encapsulated multicast packet with all destination addresses
+ * would still fit into a 1280 byte batman-adv multicast packet
+ * (excluding the outer ethernet frame) and we could successfully push
+ * the full batman-adv multicast packet header.
+ * BATADV_FORW_UCASTS: If the packet cannot be sent in a batman-adv
+ * multicast packet and the number of batman-adv unicast packets needed
+ * is smaller than or equal to the configured multicast fanout.
+ * BATADV_FORW_BCAST: Otherwise.
+ */
+static enum batadv_forw_mode
+batadv_mcast_forw_mode_by_count(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid,
+ int is_routable, int count)
+{
+ unsigned int mcast_hdrlen = batadv_mcast_forw_packet_hdrlen(count);
+ u8 own_tvlv_flags = bat_priv->mcast.mla_flags.tvlv_flags;
+
+ if (!atomic_read(&bat_priv->mcast.num_no_mc_ptype_capa) &&
+ own_tvlv_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA &&
+ skb->len + mcast_hdrlen <= IPV6_MIN_MTU &&
+ batadv_mcast_forw_push(bat_priv, skb, vid, is_routable, count))
+ return BATADV_FORW_MCAST;
+
+ if (count <= atomic_read(&bat_priv->multicast_fanout))
+ return BATADV_FORW_UCASTS;
+
+ return BATADV_FORW_BCAST;
+}
+
+/**
+ * batadv_mcast_forw_mode() - check on how to forward a multicast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to check
+ * @vid: the vlan identifier
+ * @is_routable: stores whether the destination is routable
+ *
+ * Return: The forwarding mode as enum batadv_forw_mode.
+ */
+enum batadv_forw_mode
+batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int *is_routable)
+{
+ int ret, tt_count, ip_count, unsnoop_count, total_count;
+ bool is_unsnoopable = false;
+ struct ethhdr *ethhdr;
+ int rtr_count = 0;
+
+ ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable,
+ is_routable);
+ if (ret == -ENOMEM)
+ return BATADV_FORW_NONE;
+ else if (ret < 0)
+ return BATADV_FORW_BCAST;
+
+ ethhdr = eth_hdr(skb);
+
+ tt_count = batadv_tt_global_hash_count(bat_priv, ethhdr->h_dest,
+ BATADV_NO_FLAGS);
+ ip_count = batadv_mcast_forw_want_all_ip_count(bat_priv, ethhdr);
+ unsnoop_count = !is_unsnoopable ? 0 :
+ atomic_read(&bat_priv->mcast.num_want_all_unsnoopables);
+ rtr_count = batadv_mcast_forw_rtr_count(bat_priv, *is_routable);
+
+ total_count = tt_count + ip_count + unsnoop_count + rtr_count;
+
+ if (!total_count)
+ return BATADV_FORW_NONE;
+ else if (unsnoop_count)
+ return BATADV_FORW_BCAST;
+
+ return batadv_mcast_forw_mode_by_count(bat_priv, skb, vid, *is_routable,
+ total_count);
+}
+
+/**
+ * batadv_mcast_forw_send_orig() - send a multicast packet to an originator
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to send
+ * @vid: the vlan identifier
+ * @orig_node: the originator to send the packet to
*
- * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and
- * increases its refcount.
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
*/
-static struct batadv_orig_node *
-batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv)
+static int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned short vid,
+ struct batadv_orig_node *orig_node)
{
- struct batadv_orig_node *tmp_orig_node, *orig_node = NULL;
+ /* Avoid sending multicast-in-unicast packets to other BLA
+ * gateways - they already got the frame from the LAN side
+ * we share with them.
+ * TODO: Refactor to take BLA into account earlier, to avoid
+ * reducing the mcast_fanout count.
+ */
+ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid)) {
+ dev_kfree_skb(skb);
+ return NET_XMIT_SUCCESS;
+ }
+
+ return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0,
+ orig_node, vid);
+}
+
+/**
+ * batadv_mcast_forw_tt() - forwards a packet to multicast listeners
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to transmit
+ * @vid: the vlan identifier
+ *
+ * Sends copies of a frame with multicast destination to any multicast
+ * listener registered in the translation table. A transmission is performed
+ * via a batman-adv unicast packet for each such destination node.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS
+ * otherwise.
+ */
+static int
+batadv_mcast_forw_tt(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid)
+{
+ int ret = NET_XMIT_SUCCESS;
+ struct sk_buff *newskb;
+
+ struct batadv_tt_orig_list_entry *orig_entry;
+
+ struct batadv_tt_global_entry *tt_global;
+ const u8 *addr = eth_hdr(skb)->h_dest;
+
+ tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid);
+ if (!tt_global)
+ goto out;
rcu_read_lock();
- hlist_for_each_entry_rcu(tmp_orig_node,
+ hlist_for_each_entry_rcu(orig_entry, &tt_global->orig_list, list) {
+ newskb = skb_copy(skb, GFP_ATOMIC);
+ if (!newskb) {
+ ret = NET_XMIT_DROP;
+ break;
+ }
+
+ batadv_mcast_forw_send_orig(bat_priv, newskb, vid,
+ orig_entry->orig_node);
+ }
+ rcu_read_unlock();
+
+ batadv_tt_global_entry_put(tt_global);
+
+out:
+ return ret;
+}
+
+/**
+ * batadv_mcast_forw_want_all_ipv4() - forward to nodes with want-all-ipv4
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to transmit
+ * @vid: the vlan identifier
+ *
+ * Sends copies of a frame with multicast destination to any node with a
+ * BATADV_MCAST_WANT_ALL_IPV4 flag set. A transmission is performed via a
+ * batman-adv unicast packet for each such destination node.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS
+ * otherwise.
+ */
+static int
+batadv_mcast_forw_want_all_ipv4(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
+{
+ struct batadv_orig_node *orig_node;
+ int ret = NET_XMIT_SUCCESS;
+ struct sk_buff *newskb;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node,
&bat_priv->mcast.want_all_ipv4_list,
mcast_want_all_ipv4_node) {
- if (!kref_get_unless_zero(&tmp_orig_node->refcount))
- continue;
+ newskb = skb_copy(skb, GFP_ATOMIC);
+ if (!newskb) {
+ ret = NET_XMIT_DROP;
+ break;
+ }
- orig_node = tmp_orig_node;
- break;
+ batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node);
}
rcu_read_unlock();
-
- return orig_node;
+ return ret;
}
/**
- * batadv_mcast_forw_ipv6_node_get() - get a node with an ipv6 flag
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_mcast_forw_want_all_ipv6() - forward to nodes with want-all-ipv6
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: The multicast packet to transmit
+ * @vid: the vlan identifier
+ *
+ * Sends copies of a frame with multicast destination to any node with a
+ * BATADV_MCAST_WANT_ALL_IPV6 flag set. A transmission is performed via a
+ * batman-adv unicast packet for each such destination node.
*
- * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set
- * and increases its refcount.
+ * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS
+ * otherwise.
*/
-static struct batadv_orig_node *
-batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv)
+static int
+batadv_mcast_forw_want_all_ipv6(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
{
- struct batadv_orig_node *tmp_orig_node, *orig_node = NULL;
+ struct batadv_orig_node *orig_node;
+ int ret = NET_XMIT_SUCCESS;
+ struct sk_buff *newskb;
rcu_read_lock();
- hlist_for_each_entry_rcu(tmp_orig_node,
+ hlist_for_each_entry_rcu(orig_node,
&bat_priv->mcast.want_all_ipv6_list,
mcast_want_all_ipv6_node) {
- if (!kref_get_unless_zero(&tmp_orig_node->refcount))
- continue;
+ newskb = skb_copy(skb, GFP_ATOMIC);
+ if (!newskb) {
+ ret = NET_XMIT_DROP;
+ break;
+ }
- orig_node = tmp_orig_node;
- break;
+ batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node);
}
rcu_read_unlock();
-
- return orig_node;
+ return ret;
}
/**
- * batadv_mcast_forw_ip_node_get() - get a node with an ipv4/ipv6 flag
- * @bat_priv: the bat priv with all the soft interface information
- * @ethhdr: an ethernet header to determine the protocol family from
- *
- * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or
- * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, set and
- * increases its refcount.
+ * batadv_mcast_forw_want_all() - forward packet to nodes in a want-all list
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to transmit
+ * @vid: the vlan identifier
+ *
+ * Sends copies of a frame with multicast destination to any node with a
+ * BATADV_MCAST_WANT_ALL_IPV4 or BATADV_MCAST_WANT_ALL_IPV6 flag set. A
+ * transmission is performed via a batman-adv unicast packet for each such
+ * destination node.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family
+ * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise.
*/
-static struct batadv_orig_node *
-batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv,
- struct ethhdr *ethhdr)
+static int
+batadv_mcast_forw_want_all(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
{
- switch (ntohs(ethhdr->h_proto)) {
+ switch (ntohs(eth_hdr(skb)->h_proto)) {
case ETH_P_IP:
- return batadv_mcast_forw_ipv4_node_get(bat_priv);
+ return batadv_mcast_forw_want_all_ipv4(bat_priv, skb, vid);
case ETH_P_IPV6:
- return batadv_mcast_forw_ipv6_node_get(bat_priv);
+ return batadv_mcast_forw_want_all_ipv6(bat_priv, skb, vid);
default:
/* we shouldn't be here... */
- return NULL;
+ return NET_XMIT_DROP;
}
}
/**
- * batadv_mcast_forw_unsnoop_node_get() - get a node with an unsnoopable flag
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_mcast_forw_want_all_rtr4() - forward to nodes with want-all-rtr4
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to transmit
+ * @vid: the vlan identifier
*
- * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag
- * set and increases its refcount.
+ * Sends copies of a frame with multicast destination to any node with a
+ * BATADV_MCAST_WANT_NO_RTR4 flag unset. A transmission is performed via a
+ * batman-adv unicast packet for each such destination node.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS
+ * otherwise.
*/
-static struct batadv_orig_node *
-batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv)
+static int
+batadv_mcast_forw_want_all_rtr4(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
{
- struct batadv_orig_node *tmp_orig_node, *orig_node = NULL;
+ struct batadv_orig_node *orig_node;
+ int ret = NET_XMIT_SUCCESS;
+ struct sk_buff *newskb;
rcu_read_lock();
- hlist_for_each_entry_rcu(tmp_orig_node,
- &bat_priv->mcast.want_all_unsnoopables_list,
- mcast_want_all_unsnoopables_node) {
- if (!kref_get_unless_zero(&tmp_orig_node->refcount))
- continue;
+ hlist_for_each_entry_rcu(orig_node,
+ &bat_priv->mcast.want_all_rtr4_list,
+ mcast_want_all_rtr4_node) {
+ newskb = skb_copy(skb, GFP_ATOMIC);
+ if (!newskb) {
+ ret = NET_XMIT_DROP;
+ break;
+ }
- orig_node = tmp_orig_node;
- break;
+ batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node);
}
rcu_read_unlock();
-
- return orig_node;
+ return ret;
}
/**
- * batadv_mcast_forw_mode() - check on how to forward a multicast packet
- * @bat_priv: the bat priv with all the soft interface information
- * @skb: The multicast packet to check
- * @orig: an originator to be set to forward the skb to
+ * batadv_mcast_forw_want_all_rtr6() - forward to nodes with want-all-rtr6
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: The multicast packet to transmit
+ * @vid: the vlan identifier
+ *
+ * Sends copies of a frame with multicast destination to any node with a
+ * BATADV_MCAST_WANT_NO_RTR6 flag unset. A transmission is performed via a
+ * batman-adv unicast packet for each such destination node.
*
- * Return: the forwarding mode as enum batadv_forw_mode and in case of
- * BATADV_FORW_SINGLE set the orig to the single originator the skb
- * should be forwarded to.
+ * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS
+ * otherwise.
*/
-enum batadv_forw_mode
-batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
- struct batadv_orig_node **orig)
+static int
+batadv_mcast_forw_want_all_rtr6(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
{
- int ret, tt_count, ip_count, unsnoop_count, total_count;
- bool is_unsnoopable = false;
- struct ethhdr *ethhdr;
+ struct batadv_orig_node *orig_node;
+ int ret = NET_XMIT_SUCCESS;
+ struct sk_buff *newskb;
- ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable);
- if (ret == -ENOMEM)
- return BATADV_FORW_NONE;
- else if (ret < 0)
- return BATADV_FORW_ALL;
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node,
+ &bat_priv->mcast.want_all_rtr6_list,
+ mcast_want_all_rtr6_node) {
+ newskb = skb_copy(skb, GFP_ATOMIC);
+ if (!newskb) {
+ ret = NET_XMIT_DROP;
+ break;
+ }
- ethhdr = eth_hdr(skb);
+ batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node);
+ }
+ rcu_read_unlock();
+ return ret;
+}
- tt_count = batadv_tt_global_hash_count(bat_priv, ethhdr->h_dest,
- BATADV_NO_FLAGS);
- ip_count = batadv_mcast_forw_want_all_ip_count(bat_priv, ethhdr);
- unsnoop_count = !is_unsnoopable ? 0 :
- atomic_read(&bat_priv->mcast.num_want_all_unsnoopables);
+/**
+ * batadv_mcast_forw_want_rtr() - forward packet to nodes in a want-all-rtr list
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to transmit
+ * @vid: the vlan identifier
+ *
+ * Sends copies of a frame with multicast destination to any node with a
+ * BATADV_MCAST_WANT_NO_RTR4 or BATADV_MCAST_WANT_NO_RTR6 flag unset. A
+ * transmission is performed via a batman-adv unicast packet for each such
+ * destination node.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family
+ * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise.
+ */
+static int
+batadv_mcast_forw_want_rtr(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid)
+{
+ switch (ntohs(eth_hdr(skb)->h_proto)) {
+ case ETH_P_IP:
+ return batadv_mcast_forw_want_all_rtr4(bat_priv, skb, vid);
+ case ETH_P_IPV6:
+ return batadv_mcast_forw_want_all_rtr6(bat_priv, skb, vid);
+ default:
+ /* we shouldn't be here... */
+ return NET_XMIT_DROP;
+ }
+}
+
+/**
+ * batadv_mcast_forw_send() - send packet to any detected multicast recipient
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to transmit
+ * @vid: the vlan identifier
+ * @is_routable: whether the destination is routable
+ *
+ * Sends copies of a frame with multicast destination to any node that signaled
+ * interest in it, that is either via the translation table or the according
+ * want-all flags. A transmission is performed via a batman-adv unicast packet
+ * for each such destination node.
+ *
+ * The given skb is consumed/freed.
+ *
+ * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family
+ * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise.
+ */
+int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int is_routable)
+{
+ int ret;
- total_count = tt_count + ip_count + unsnoop_count;
+ ret = batadv_mcast_forw_tt(bat_priv, skb, vid);
+ if (ret != NET_XMIT_SUCCESS) {
+ kfree_skb(skb);
+ return ret;
+ }
- switch (total_count) {
- case 1:
- if (tt_count)
- *orig = batadv_mcast_forw_tt_node_get(bat_priv, ethhdr);
- else if (ip_count)
- *orig = batadv_mcast_forw_ip_node_get(bat_priv, ethhdr);
- else if (unsnoop_count)
- *orig = batadv_mcast_forw_unsnoop_node_get(bat_priv);
+ ret = batadv_mcast_forw_want_all(bat_priv, skb, vid);
+ if (ret != NET_XMIT_SUCCESS) {
+ kfree_skb(skb);
+ return ret;
+ }
- if (*orig)
- return BATADV_FORW_SINGLE;
+ if (!is_routable)
+ goto skip_mc_router;
- /* fall through */
- case 0:
- return BATADV_FORW_NONE;
- default:
- return BATADV_FORW_ALL;
+ ret = batadv_mcast_forw_want_rtr(bat_priv, skb, vid);
+ if (ret != NET_XMIT_SUCCESS) {
+ kfree_skb(skb);
+ return ret;
}
+
+skip_mc_router:
+ consume_skb(skb);
+ return ret;
}
/**
* batadv_mcast_want_unsnoop_update() - update unsnoop counter and list
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig: the orig_node which multicast state might have changed of
* @mcast_flags: flags indicating the new multicast state
*
* If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator,
- * orig, has toggled then this method updates counter and list accordingly.
+ * orig, has toggled then this method updates the counter and the list
+ * accordingly.
*
* Caller needs to hold orig->mcast_handler_lock.
*/
@@ -1076,12 +1634,12 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv,
/**
* batadv_mcast_want_ipv4_update() - update want-all-ipv4 counter and list
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig: the orig_node which multicast state might have changed of
* @mcast_flags: flags indicating the new multicast state
*
* If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has
- * toggled then this method updates counter and list accordingly.
+ * toggled then this method updates the counter and the list accordingly.
*
* Caller needs to hold orig->mcast_handler_lock.
*/
@@ -1121,12 +1679,12 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv,
/**
* batadv_mcast_want_ipv6_update() - update want-all-ipv6 counter and list
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig: the orig_node which multicast state might have changed of
* @mcast_flags: flags indicating the new multicast state
*
* If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has
- * toggled then this method updates counter and list accordingly.
+ * toggled then this method updates the counter and the list accordingly.
*
* Caller needs to hold orig->mcast_handler_lock.
*/
@@ -1165,8 +1723,154 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv,
}
/**
+ * batadv_mcast_want_rtr4_update() - update want-all-rtr4 counter and list
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig: the orig_node which multicast state might have changed of
+ * @mcast_flags: flags indicating the new multicast state
+ *
+ * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has
+ * toggled then this method updates the counter and the list accordingly.
+ *
+ * Caller needs to hold orig->mcast_handler_lock.
+ */
+static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 mcast_flags)
+{
+ struct hlist_node *node = &orig->mcast_want_all_rtr4_node;
+ struct hlist_head *head = &bat_priv->mcast.want_all_rtr4_list;
+
+ lockdep_assert_held(&orig->mcast_handler_lock);
+
+ /* switched from flag set to unset */
+ if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR4) &&
+ orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4) {
+ atomic_inc(&bat_priv->mcast.num_want_all_rtr4);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(!hlist_unhashed(node));
+
+ hlist_add_head_rcu(node, head);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ /* switched from flag unset to set */
+ } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR4 &&
+ !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4)) {
+ atomic_dec(&bat_priv->mcast.num_want_all_rtr4);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(hlist_unhashed(node));
+
+ hlist_del_init_rcu(node);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ }
+}
+
+/**
+ * batadv_mcast_want_rtr6_update() - update want-all-rtr6 counter and list
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig: the orig_node which multicast state might have changed of
+ * @mcast_flags: flags indicating the new multicast state
+ *
+ * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has
+ * toggled then this method updates the counter and the list accordingly.
+ *
+ * Caller needs to hold orig->mcast_handler_lock.
+ */
+static void batadv_mcast_want_rtr6_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 mcast_flags)
+{
+ struct hlist_node *node = &orig->mcast_want_all_rtr6_node;
+ struct hlist_head *head = &bat_priv->mcast.want_all_rtr6_list;
+
+ lockdep_assert_held(&orig->mcast_handler_lock);
+
+ /* switched from flag set to unset */
+ if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR6) &&
+ orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6) {
+ atomic_inc(&bat_priv->mcast.num_want_all_rtr6);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(!hlist_unhashed(node));
+
+ hlist_add_head_rcu(node, head);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ /* switched from flag unset to set */
+ } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR6 &&
+ !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6)) {
+ atomic_dec(&bat_priv->mcast.num_want_all_rtr6);
+
+ spin_lock_bh(&bat_priv->mcast.want_lists_lock);
+ /* flag checks above + mcast_handler_lock prevents this */
+ WARN_ON(hlist_unhashed(node));
+
+ hlist_del_init_rcu(node);
+ spin_unlock_bh(&bat_priv->mcast.want_lists_lock);
+ }
+}
+
+/**
+ * batadv_mcast_have_mc_ptype_update() - update multicast packet type counter
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig: the orig_node which multicast state might have changed of
+ * @mcast_flags: flags indicating the new multicast state
+ *
+ * If the BATADV_MCAST_HAVE_MC_PTYPE_CAPA flag of this originator, orig, has
+ * toggled then this method updates the counter accordingly.
+ */
+static void batadv_mcast_have_mc_ptype_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig,
+ u8 mcast_flags)
+{
+ lockdep_assert_held(&orig->mcast_handler_lock);
+
+ /* switched from flag set to unset */
+ if (!(mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA) &&
+ orig->mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA)
+ atomic_inc(&bat_priv->mcast.num_no_mc_ptype_capa);
+ /* switched from flag unset to set */
+ else if (mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA &&
+ !(orig->mcast_flags & BATADV_MCAST_HAVE_MC_PTYPE_CAPA))
+ atomic_dec(&bat_priv->mcast.num_no_mc_ptype_capa);
+}
+
+/**
+ * batadv_mcast_tvlv_flags_get() - get multicast flags from an OGM TVLV
+ * @enabled: whether the originator has multicast TVLV support enabled
+ * @tvlv_value: tvlv buffer containing the multicast flags
+ * @tvlv_value_len: tvlv buffer length
+ *
+ * Return: multicast flags for the given tvlv buffer
+ */
+static u8
+batadv_mcast_tvlv_flags_get(bool enabled, void *tvlv_value, u16 tvlv_value_len)
+{
+ u8 mcast_flags = BATADV_NO_FLAGS;
+
+ if (enabled && tvlv_value && tvlv_value_len >= sizeof(mcast_flags))
+ mcast_flags = *(u8 *)tvlv_value;
+
+ if (!enabled) {
+ mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4;
+ mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6;
+ }
+
+ /* remove redundant flags to avoid sending duplicate packets later */
+ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)
+ mcast_flags |= BATADV_MCAST_WANT_NO_RTR4;
+
+ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)
+ mcast_flags |= BATADV_MCAST_WANT_NO_RTR6;
+
+ return mcast_flags;
+}
+
+/**
* batadv_mcast_tvlv_ogm_handler() - process incoming multicast tvlv container
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig: the orig_node of the ogm
* @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
* @tvlv_value: tvlv buffer containing the multicast data
@@ -1179,16 +1883,10 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv,
u16 tvlv_value_len)
{
bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
- u8 mcast_flags = BATADV_NO_FLAGS;
+ u8 mcast_flags;
- if (orig_mcast_enabled && tvlv_value &&
- tvlv_value_len >= sizeof(mcast_flags))
- mcast_flags = *(u8 *)tvlv_value;
-
- if (!orig_mcast_enabled) {
- mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4;
- mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6;
- }
+ mcast_flags = batadv_mcast_tvlv_flags_get(orig_mcast_enabled,
+ tvlv_value, tvlv_value_len);
spin_lock_bh(&orig->mcast_handler_lock);
@@ -1205,6 +1903,9 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv,
batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags);
batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags);
batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags);
+ batadv_mcast_want_rtr4_update(bat_priv, orig, mcast_flags);
+ batadv_mcast_want_rtr6_update(bat_priv, orig, mcast_flags);
+ batadv_mcast_have_mc_ptype_update(bat_priv, orig, mcast_flags);
orig->mcast_flags = mcast_flags;
spin_unlock_bh(&orig->mcast_handler_lock);
@@ -1212,144 +1913,45 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv,
/**
* batadv_mcast_init() - initialize the multicast optimizations structures
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
void batadv_mcast_init(struct batadv_priv *bat_priv)
{
batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler,
- NULL, BATADV_TVLV_MCAST, 2,
+ NULL, NULL, BATADV_TVLV_MCAST, 2,
+ BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
+ batadv_tvlv_handler_register(bat_priv, NULL, NULL,
+ batadv_mcast_forw_tracker_tvlv_handler,
+ BATADV_TVLV_MCAST_TRACKER, 1,
BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
INIT_DELAYED_WORK(&bat_priv->mcast.work, batadv_mcast_mla_update);
batadv_mcast_start_timer(bat_priv);
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_mcast_flags_print_header() - print own mcast flags to debugfs table
- * @bat_priv: the bat priv with all the soft interface information
- * @seq: debugfs table seq_file struct
- *
- * Prints our own multicast flags including a more specific reason why
- * they are set, that is prints the bridge and querier state too, to
- * the debugfs table specified via @seq.
- */
-static void batadv_mcast_flags_print_header(struct batadv_priv *bat_priv,
- struct seq_file *seq)
-{
- u8 flags = bat_priv->mcast.flags;
- char querier4, querier6, shadowing4, shadowing6;
- bool bridged = bat_priv->mcast.bridged;
-
- if (bridged) {
- querier4 = bat_priv->mcast.querier_ipv4.exists ? '.' : '4';
- querier6 = bat_priv->mcast.querier_ipv6.exists ? '.' : '6';
- shadowing4 = bat_priv->mcast.querier_ipv4.shadowing ? '4' : '.';
- shadowing6 = bat_priv->mcast.querier_ipv6.shadowing ? '6' : '.';
- } else {
- querier4 = '?';
- querier6 = '?';
- shadowing4 = '?';
- shadowing6 = '?';
- }
-
- seq_printf(seq, "Multicast flags (own flags: [%c%c%c])\n",
- (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.',
- (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.',
- (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.');
- seq_printf(seq, "* Bridged [U]\t\t\t\t%c\n", bridged ? 'U' : '.');
- seq_printf(seq, "* No IGMP/MLD Querier [4/6]:\t\t%c/%c\n",
- querier4, querier6);
- seq_printf(seq, "* Shadowing IGMP/MLD Querier [4/6]:\t%c/%c\n",
- shadowing4, shadowing6);
- seq_puts(seq, "-------------------------------------------\n");
- seq_printf(seq, " %-10s %s\n", "Originator", "Flags");
-}
-
-/**
- * batadv_mcast_flags_seq_print_text() - print the mcast flags of other nodes
- * @seq: seq file to print on
- * @offset: not used
- *
- * This prints a table of (primary) originators and their according
- * multicast flags, including (in the header) our own.
- *
- * Return: always 0
- */
-int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hard_iface *primary_if;
- struct batadv_hashtable *hash = bat_priv->orig_hash;
- struct batadv_orig_node *orig_node;
- struct hlist_head *head;
- u8 flags;
- u32 i;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- return 0;
-
- batadv_mcast_flags_print_header(bat_priv, seq);
-
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
- if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
- &orig_node->capa_initialized))
- continue;
-
- if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
- &orig_node->capabilities)) {
- seq_printf(seq, "%pM -\n", orig_node->orig);
- continue;
- }
-
- flags = orig_node->mcast_flags;
-
- seq_printf(seq, "%pM [%c%c%c]\n", orig_node->orig,
- (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)
- ? 'U' : '.',
- (flags & BATADV_MCAST_WANT_ALL_IPV4)
- ? '4' : '.',
- (flags & BATADV_MCAST_WANT_ALL_IPV6)
- ? '6' : '.');
- }
- rcu_read_unlock();
- }
-
- batadv_hardif_put(primary_if);
-
- return 0;
-}
-#endif
-
/**
* batadv_mcast_mesh_info_put() - put multicast info into a netlink message
* @msg: buffer for the message
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
* Return: 0 or error code.
*/
int batadv_mcast_mesh_info_put(struct sk_buff *msg,
struct batadv_priv *bat_priv)
{
- u32 flags = bat_priv->mcast.flags;
+ u32 flags = bat_priv->mcast.mla_flags.tvlv_flags;
u32 flags_priv = BATADV_NO_FLAGS;
- if (bat_priv->mcast.bridged) {
+ if (bat_priv->mcast.mla_flags.bridged) {
flags_priv |= BATADV_MCAST_FLAGS_BRIDGED;
- if (bat_priv->mcast.querier_ipv4.exists)
+ if (bat_priv->mcast.mla_flags.querier_ipv4.exists)
flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS;
- if (bat_priv->mcast.querier_ipv6.exists)
+ if (bat_priv->mcast.mla_flags.querier_ipv6.exists)
flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS;
- if (bat_priv->mcast.querier_ipv4.shadowing)
+ if (bat_priv->mcast.mla_flags.querier_ipv4.shadowing)
flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING;
- if (bat_priv->mcast.querier_ipv6.shadowing)
+ if (bat_priv->mcast.mla_flags.querier_ipv6.shadowing)
flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING;
}
@@ -1365,22 +1967,26 @@ int batadv_mcast_mesh_info_put(struct sk_buff *msg,
* to a netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
+ * @cb: Control block containing additional options
* @orig_node: originator to dump the multicast flags of
*
* Return: 0 or error code.
*/
static int
-batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_orig_node *orig_node)
{
void *hdr;
- hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
- NLM_F_MULTI, BATADV_CMD_GET_MCAST_FLAGS);
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
+ BATADV_CMD_GET_MCAST_FLAGS);
if (!hdr)
return -ENOBUFS;
+ genl_dump_check_consistent(cb, hdr);
+
if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN,
orig_node->orig)) {
genlmsg_cancel(msg, hdr);
@@ -1405,21 +2011,26 @@ batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
* table to a netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
- * @head: bucket to dump
+ * @cb: Control block containing additional options
+ * @hash: hash to dump
+ * @bucket: bucket index to dump
* @idx_skip: How many entries to skip
*
* Return: 0 or error code.
*/
static int
-batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
- struct hlist_head *head, long *idx_skip)
+batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
+ struct batadv_hashtable *hash,
+ unsigned int bucket, long *idx_skip)
{
struct batadv_orig_node *orig_node;
long idx = 0;
- rcu_read_lock();
- hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ spin_lock_bh(&hash->list_locks[bucket]);
+ cb->seq = atomic_read(&hash->generation) << 1 | 1;
+
+ hlist_for_each_entry(orig_node, &hash->table[bucket], hash_entry) {
if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST,
&orig_node->capa_initialized))
continue;
@@ -1427,9 +2038,8 @@ batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
if (idx < *idx_skip)
goto skip;
- if (batadv_mcast_flags_dump_entry(msg, portid, seq,
- orig_node)) {
- rcu_read_unlock();
+ if (batadv_mcast_flags_dump_entry(msg, portid, cb, orig_node)) {
+ spin_unlock_bh(&hash->list_locks[bucket]);
*idx_skip = idx;
return -EMSGSIZE;
@@ -1438,7 +2048,7 @@ batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
skip:
idx++;
}
- rcu_read_unlock();
+ spin_unlock_bh(&hash->list_locks[bucket]);
return 0;
}
@@ -1447,27 +2057,25 @@ skip:
* __batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket
* @msg: buffer for the message
* @portid: netlink port
- * @seq: Sequence number of netlink message
- * @bat_priv: the bat priv with all the soft interface information
+ * @cb: Control block containing additional options
+ * @bat_priv: the bat priv with all the mesh interface information
* @bucket: current bucket to dump
* @idx: index in current bucket to the next entry to dump
*
* Return: 0 or error code.
*/
static int
-__batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, u32 seq,
+__batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_priv *bat_priv, long *bucket, long *idx)
{
struct batadv_hashtable *hash = bat_priv->orig_hash;
long bucket_tmp = *bucket;
- struct hlist_head *head;
long idx_tmp = *idx;
while (bucket_tmp < hash->size) {
- head = &hash->table[bucket_tmp];
-
- if (batadv_mcast_flags_dump_bucket(msg, portid, seq, head,
- &idx_tmp))
+ if (batadv_mcast_flags_dump_bucket(msg, portid, cb, hash,
+ bucket_tmp, &idx_tmp))
break;
bucket_tmp++;
@@ -1493,23 +2101,15 @@ batadv_mcast_netlink_get_primary(struct netlink_callback *cb,
struct batadv_hard_iface **primary_if)
{
struct batadv_hard_iface *hard_iface = NULL;
- struct net *net = sock_net(cb->skb->sk);
- struct net_device *soft_iface;
+ struct net_device *mesh_iface;
struct batadv_priv *bat_priv;
- int ifindex;
int ret = 0;
- ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
-
- bat_priv = netdev_priv(soft_iface);
+ bat_priv = netdev_priv(mesh_iface);
hard_iface = batadv_primary_if_get_selected(bat_priv);
if (!hard_iface || hard_iface->if_status != BATADV_IF_ACTIVE) {
@@ -1518,12 +2118,11 @@ batadv_mcast_netlink_get_primary(struct netlink_callback *cb,
}
out:
- if (soft_iface)
- dev_put(soft_iface);
+ dev_put(mesh_iface);
if (!ret && primary_if)
*primary_if = hard_iface;
- else if (hard_iface)
+ else
batadv_hardif_put(hard_iface);
return ret;
@@ -1549,9 +2148,8 @@ int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb)
if (ret)
return ret;
- bat_priv = netdev_priv(primary_if->soft_iface);
- ret = __batadv_mcast_flags_dump(msg, portid, cb->nlh->nlmsg_seq,
- bat_priv, bucket, idx);
+ bat_priv = netdev_priv(primary_if->mesh_iface);
+ ret = __batadv_mcast_flags_dump(msg, portid, cb, bat_priv, bucket, idx);
batadv_hardif_put(primary_if);
return ret;
@@ -1559,13 +2157,14 @@ int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb)
/**
* batadv_mcast_free() - free the multicast optimizations structures
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
void batadv_mcast_free(struct batadv_priv *bat_priv)
{
cancel_delayed_work_sync(&bat_priv->mcast.work);
batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2);
+ batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST_TRACKER, 1);
batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2);
/* safely calling outside of worker, as worker was canceled above */
@@ -1585,6 +2184,12 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig)
batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS);
batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS);
batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS);
+ batadv_mcast_want_rtr4_update(bat_priv, orig,
+ BATADV_MCAST_WANT_NO_RTR4);
+ batadv_mcast_want_rtr6_update(bat_priv, orig,
+ BATADV_MCAST_WANT_NO_RTR6);
+ batadv_mcast_have_mc_ptype_update(bat_priv, orig,
+ BATADV_MCAST_HAVE_MC_PTYPE_CAPA);
spin_unlock_bh(&orig->mcast_handler_lock);
}
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 3b04ab13f0eb..d97ee51d26f2 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2014-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Linus Lüssing
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_MULTICAST_H_
@@ -21,25 +9,31 @@
#include "main.h"
-struct netlink_callback;
-struct seq_file;
-struct sk_buff;
+#include <linux/netlink.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
/**
* enum batadv_forw_mode - the way a packet should be forwarded as
*/
enum batadv_forw_mode {
/**
- * @BATADV_FORW_ALL: forward the packet to all nodes (currently via
- * classic flooding)
+ * @BATADV_FORW_BCAST: forward the packet to all nodes via a batman-adv
+ * broadcast packet
+ */
+ BATADV_FORW_BCAST,
+
+ /**
+ * @BATADV_FORW_UCASTS: forward the packet to some nodes via one
+ * or more batman-adv unicast packets
*/
- BATADV_FORW_ALL,
+ BATADV_FORW_UCASTS,
/**
- * @BATADV_FORW_SINGLE: forward the packet to a single node (currently
- * via the BATMAN unicast routing protocol)
+ * @BATADV_FORW_MCAST: forward the packet to some nodes via a
+ * batman-adv multicast packet
*/
- BATADV_FORW_SINGLE,
+ BATADV_FORW_MCAST,
/** @BATADV_FORW_NONE: don't forward, drop it */
BATADV_FORW_NONE,
@@ -49,11 +43,12 @@ enum batadv_forw_mode {
enum batadv_forw_mode
batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
- struct batadv_orig_node **mcast_single_orig);
+ unsigned short vid, int *is_routable);
-void batadv_mcast_init(struct batadv_priv *bat_priv);
+int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int is_routable);
-int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset);
+void batadv_mcast_init(struct batadv_priv *bat_priv);
int batadv_mcast_mesh_info_put(struct sk_buff *msg,
struct batadv_priv *bat_priv);
@@ -64,13 +59,33 @@ void batadv_mcast_free(struct batadv_priv *bat_priv);
void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node);
+/* multicast_forw.c */
+
+int batadv_mcast_forw_tracker_tvlv_handler(struct batadv_priv *bat_priv,
+ struct sk_buff *skb);
+
+unsigned int batadv_mcast_forw_packet_hdrlen(unsigned int num_dests);
+
+bool batadv_mcast_forw_push(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int is_routable, int count);
+
+int batadv_mcast_forw_mcsend(struct batadv_priv *bat_priv, struct sk_buff *skb);
+
#else
static inline enum batadv_forw_mode
batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb,
- struct batadv_orig_node **mcast_single_orig)
+ unsigned short vid, int *is_routable)
+{
+ return BATADV_FORW_BCAST;
+}
+
+static inline int
+batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int is_routable)
{
- return BATADV_FORW_ALL;
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
}
static inline int batadv_mcast_init(struct batadv_priv *bat_priv)
@@ -98,6 +113,13 @@ static inline void batadv_mcast_purge_orig(struct batadv_orig_node *orig_node)
{
}
+/* stub in the #else branch of CONFIG_BATMAN_ADV_MCAST: frees the skb and
+ * reports the packet as dropped
+ */
+static inline int batadv_mcast_forw_mcsend(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+}
+
#endif /* CONFIG_BATMAN_ADV_MCAST */
#endif /* _NET_BATMAN_ADV_MULTICAST_H_ */
diff --git a/net/batman-adv/multicast_forw.c b/net/batman-adv/multicast_forw.c
new file mode 100644
index 000000000000..b8668a80b94a
--- /dev/null
+++ b/net/batman-adv/multicast_forw.c
@@ -0,0 +1,1178 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Linus Lüssing
+ */
+
+#include "multicast.h"
+#include "main.h"
+
+#include <linux/bug.h>
+#include <linux/build_bug.h>
+#include <linux/byteorder/generic.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/gfp.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/ipv6.h>
+#include <linux/limits.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <uapi/linux/batadv_packet.h>
+
+#include "bridge_loop_avoidance.h"
+#include "originator.h"
+#include "send.h"
+#include "translation-table.h"
+
+/* Step over ETH_ALEN sized destination entries. The loop has no init clause
+ * and modifies its arguments in place: num_dests is counted down to zero and
+ * dest is advanced by ETH_ALEN per visited entry. All arguments are
+ * parenthesized so that expressions such as *ptr can be passed safely.
+ */
+#define batadv_mcast_forw_tracker_for_each_dest(dest, num_dests) \
+	for (; (num_dests); (num_dests)--, (dest) += ETH_ALEN)
+
+/* Variant of the loop above which advances two destination cursors in
+ * lockstep.
+ */
+#define batadv_mcast_forw_tracker_for_each_dest2(dest1, dest2, num_dests) \
+	for (; (num_dests); \
+	     (num_dests)--, (dest1) += ETH_ALEN, (dest2) += ETH_ALEN)
+
+/**
+ * batadv_mcast_forw_skb_push() - skb_push and memorize amount of pushed bytes
+ * @skb: the skb to push onto
+ * @size: the amount of bytes to push
+ * @len: stores the total amount of bytes pushed
+ *
+ * Performs an skb_push() onto the given skb and adds the amount of pushed bytes
+ * to the given len pointer.
+ *
+ * This helper performs no headroom check itself; the callers visible in this
+ * file test skb_headroom() before calling it.
+ *
+ * Return: the return value of the skb_push() call.
+ */
+static void *batadv_mcast_forw_skb_push(struct sk_buff *skb, size_t size,
+ unsigned short *len)
+{
+ /* NOTE(review): *len is an unsigned short, so the running total is
+ * truncated modulo 65536 - presumably the available headroom keeps it
+ * far below that; confirm
+ */
+ *len += size;
+ return skb_push(skb, size);
+}
+
+/**
+ * batadv_mcast_forw_push_padding() - push 2 padding bytes to skb's front
+ * @skb: the skb to push onto
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Pushes two zeroed padding bytes to the front of the given skb and adds them
+ * to @tvlv_len.
+ *
+ * Return: On success a pointer to the first byte of the two pushed padding
+ * bytes within the skb. NULL if the skb has less than two bytes of headroom,
+ * in which case neither the skb nor @tvlv_len is changed.
+ */
+static char *
+batadv_mcast_forw_push_padding(struct sk_buff *skb, unsigned short *tvlv_len)
+{
+ const int pad_len = 2;
+ char *padding;
+
+ if (skb_headroom(skb) < pad_len)
+ return NULL;
+
+ padding = batadv_mcast_forw_skb_push(skb, pad_len, tvlv_len);
+ memset(padding, 0, pad_len);
+
+ return padding;
+}
+
+/**
+ * batadv_mcast_forw_push_est_padding() - push padding bytes if necessary
+ * @skb: the skb to potentially push the padding onto
+ * @count: the (estimated) number of originators the multicast packet needs to
+ * be sent to
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * If the number of destination entries is even then this adds two
+ * padding bytes to the end of the tracker TVLV.
+ *
+ * The TVLV is assembled back to front by pushing onto the skb's head, which
+ * is why the padding that ends up behind the destination entries is pushed
+ * before them.
+ *
+ * Return: true on success or if no padding is needed, false otherwise.
+ */
+static bool
+batadv_mcast_forw_push_est_padding(struct sk_buff *skb, int count,
+ unsigned short *tvlv_len)
+{
+ /* odd count: nothing to push; even count: fails only if the padding
+ * does not fit into the headroom
+ */
+ if (!(count % 2) && !batadv_mcast_forw_push_padding(skb, tvlv_len))
+ return false;
+
+ return true;
+}
+
+/**
+ * batadv_mcast_forw_orig_entry() - get orig_node from an hlist node
+ * @node: the hlist node to get the orig_node from
+ * @entry_offset: the offset of the hlist node within the orig_node struct
+ *
+ * Open-coded container_of() for a member which is only known at run time:
+ * @entry_offset is subtracted from the address of @node. Only the offsets of
+ * the four multicast want-all / want-rtr list nodes are accepted, any other
+ * offset triggers a WARN_ON().
+ *
+ * Return: The orig_node containing the hlist node on success, NULL on error.
+ */
+static struct batadv_orig_node *
+batadv_mcast_forw_orig_entry(struct hlist_node *node,
+ size_t entry_offset)
+{
+ /* sanity check */
+ switch (entry_offset) {
+ case offsetof(struct batadv_orig_node, mcast_want_all_ipv4_node):
+ case offsetof(struct batadv_orig_node, mcast_want_all_ipv6_node):
+ case offsetof(struct batadv_orig_node, mcast_want_all_rtr4_node):
+ case offsetof(struct batadv_orig_node, mcast_want_all_rtr6_node):
+ break;
+ default:
+ WARN_ON(1);
+ return NULL;
+ }
+
+ return (struct batadv_orig_node *)((void *)node - entry_offset);
+}
+
+/**
+ * batadv_mcast_forw_push_dest() - push an originator MAC address onto an skb
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the skb to push the destination address onto
+ * @vid: the vlan identifier
+ * @orig_node: the originator node to get the MAC address from
+ * @num_dests: a pointer to store the number of pushed addresses in
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * If the orig_node is a BLA backbone gateway, if there is not enough skb
+ * headroom available or if num_dests is already at its maximum (65535) then
+ * neither the skb nor num_dests is changed. Otherwise the originator's MAC
+ * address is pushed onto the given skb and num_dests incremented by one.
+ *
+ * Return: true if the orig_node is a backbone gateway or if an orig address
+ * was pushed successfully, false otherwise.
+ */
+static bool batadv_mcast_forw_push_dest(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, unsigned short vid,
+ struct batadv_orig_node *orig_node,
+ unsigned short *num_dests,
+ unsigned short *tvlv_len)
+{
+ /* the U16_MAX bound below is only correct as long as the num_dests
+ * field of the tracker TVLV is 16 bits wide
+ */
+ BUILD_BUG_ON(sizeof_field(struct batadv_tvlv_mcast_tracker, num_dests)
+ != sizeof(__be16));
+
+ /* Avoid sending to other BLA gateways - they already got the frame from
+ * the LAN side we share with them.
+ * TODO: Refactor to take BLA into account earlier in mode check.
+ */
+ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid))
+ return true;
+
+ if (skb_headroom(skb) < ETH_ALEN || *num_dests == U16_MAX)
+ return false;
+
+ batadv_mcast_forw_skb_push(skb, ETH_ALEN, tvlv_len);
+ /* skb->data is used as the location of the ETH_ALEN bytes just pushed */
+ ether_addr_copy(skb->data, orig_node->orig);
+ (*num_dests)++;
+
+ return true;
+}
+
+/**
+ * batadv_mcast_forw_push_dests_list() - push originators from list onto an skb
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the skb to push the destination addresses onto
+ * @vid: the vlan identifier
+ * @head: the list to gather originators from
+ * @entry_offset: offset of an hlist node in an orig_node structure
+ * @num_dests: a pointer to store the number of pushed addresses in
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Push the MAC addresses of all originators in the given list onto the given
+ * skb. The walk stops at the first originator which cannot be resolved or
+ * pushed; addresses pushed up to that point stay on the skb.
+ *
+ * Return: true on success, false otherwise.
+ */
+static bool batadv_mcast_forw_push_dests_list(struct batadv_priv *bat_priv,
+					      struct sk_buff *skb,
+					      unsigned short vid,
+					      struct hlist_head *head,
+					      size_t entry_offset,
+					      unsigned short *num_dests,
+					      unsigned short *tvlv_len)
+{
+	struct batadv_orig_node *orig_node;
+	struct hlist_node *node;
+	bool ret = true;
+
+	rcu_read_lock();
+	__hlist_for_each_rcu(node, head) {
+		/* the list links different orig_node members depending on
+		 * the list type, hence the run time offset
+		 */
+		orig_node = batadv_mcast_forw_orig_entry(node, entry_offset);
+		if (!orig_node ||
+		    !batadv_mcast_forw_push_dest(bat_priv, skb, vid, orig_node,
+						 num_dests, tvlv_len)) {
+			ret = false;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/**
+ * batadv_mcast_forw_push_tt() - push originators with interest through TT
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the skb to push the destination addresses onto
+ * @vid: the vlan identifier
+ * @num_dests: a pointer to store the number of pushed addresses in
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Push the MAC addresses of all originators which have indicated interest in
+ * this multicast packet through the translation table onto the given skb.
+ *
+ * Return: true on success, which includes the case that there is no global
+ * translation table entry for the destination address. False if an
+ * originator address could not be pushed.
+ */
+static bool
+batadv_mcast_forw_push_tt(struct batadv_priv *bat_priv, struct sk_buff *skb,
+			  unsigned short vid, unsigned short *num_dests,
+			  unsigned short *tvlv_len)
+{
+	struct batadv_tt_orig_list_entry *orig_entry;
+	struct batadv_tt_global_entry *tt_global;
+	const u8 *addr = eth_hdr(skb)->h_dest;
+	bool ret = true;
+
+	tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid);
+	if (!tt_global)
+		return true;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(orig_entry, &tt_global->orig_list, list) {
+		if (!batadv_mcast_forw_push_dest(bat_priv, skb, vid,
+						 orig_entry->orig_node,
+						 num_dests, tvlv_len)) {
+			ret = false;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	/* drop the reference obtained by the hash lookup above */
+	batadv_tt_global_entry_put(tt_global);
+
+	return ret;
+}
+
+/**
+ * batadv_mcast_forw_push_want_all() - push originators with want-all flag
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the skb to push the destination addresses onto
+ * @vid: the vlan identifier
+ * @num_dests: a pointer to store the number of pushed addresses in
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Push the MAC addresses of all originators which have indicated interest in
+ * this multicast packet through the want-all flag onto the given skb.
+ *
+ * Return: true on success. False if the ethernet protocol is neither IPv4
+ * nor IPv6 or if an originator address could not be pushed.
+ */
+static bool batadv_mcast_forw_push_want_all(struct batadv_priv *bat_priv,
+					    struct sk_buff *skb,
+					    unsigned short vid,
+					    unsigned short *num_dests,
+					    unsigned short *tvlv_len)
+{
+	struct hlist_head *head;
+	size_t offset;
+
+	/* pick the want-all list and the matching list node member */
+	switch (eth_hdr(skb)->h_proto) {
+	case htons(ETH_P_IP):
+		head = &bat_priv->mcast.want_all_ipv4_list;
+		offset = offsetof(struct batadv_orig_node,
+				  mcast_want_all_ipv4_node);
+		break;
+	case htons(ETH_P_IPV6):
+		head = &bat_priv->mcast.want_all_ipv6_list;
+		offset = offsetof(struct batadv_orig_node,
+				  mcast_want_all_ipv6_node);
+		break;
+	default:
+		return false;
+	}
+
+	return batadv_mcast_forw_push_dests_list(bat_priv, skb, vid, head,
+						 offset, num_dests, tvlv_len);
+}
+
+/**
+ * batadv_mcast_forw_push_want_rtr() - push originators with want-router flag
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the skb to push the destination addresses onto
+ * @vid: the vlan identifier
+ * @num_dests: a pointer to store the number of pushed addresses in
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Push the MAC addresses of all originators which have indicated interest in
+ * this multicast packet through the want-all-rtr flag onto the given skb.
+ *
+ * Return: true on success. False if the ethernet protocol is neither IPv4
+ * nor IPv6 or if an originator address could not be pushed.
+ */
+static bool batadv_mcast_forw_push_want_rtr(struct batadv_priv *bat_priv,
+					    struct sk_buff *skb,
+					    unsigned short vid,
+					    unsigned short *num_dests,
+					    unsigned short *tvlv_len)
+{
+	struct hlist_head *head;
+	size_t offset;
+
+	/* pick the want-rtr list and the matching list node member */
+	switch (eth_hdr(skb)->h_proto) {
+	case htons(ETH_P_IP):
+		head = &bat_priv->mcast.want_all_rtr4_list;
+		offset = offsetof(struct batadv_orig_node,
+				  mcast_want_all_rtr4_node);
+		break;
+	case htons(ETH_P_IPV6):
+		head = &bat_priv->mcast.want_all_rtr6_list;
+		offset = offsetof(struct batadv_orig_node,
+				  mcast_want_all_rtr6_node);
+		break;
+	default:
+		return false;
+	}
+
+	return batadv_mcast_forw_push_dests_list(bat_priv, skb, vid, head,
+						 offset, num_dests, tvlv_len);
+}
+
+/**
+ * batadv_mcast_forw_scrape() - remove bytes within skb data
+ * @skb: the skb to remove bytes from
+ * @offset: the offset from the skb data from which to scrape
+ * @len: the amount of bytes to scrape starting from the offset
+ *
+ * Scrapes/removes len bytes from the given skb at the given offset from the
+ * skb data.
+ *
+ * Caller needs to ensure that the region from the skb data's start up
+ * to/including the to be removed bytes are linearized.
+ */
+static void batadv_mcast_forw_scrape(struct sk_buff *skb,
+ unsigned short offset,
+ unsigned short len)
+{
+ char *to, *from;
+
+ SKB_LINEAR_ASSERT(skb);
+
+ /* "to" is the data start after pulling len bytes, "from" the data
+ * start before the pull
+ */
+ to = skb_pull(skb, len);
+ from = to - len;
+
+ /* shift the leading offset bytes forward by len so that they replace
+ * the scraped bytes; source and destination may overlap, hence
+ * memmove()
+ */
+ memmove(to, from, offset);
+}
+
+/**
+ * batadv_mcast_forw_push_scrape_padding() - remove TVLV padding
+ * @skb: the skb to potentially adjust the TVLV's padding on
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Remove two padding bytes from the end of the multicast tracker TVLV,
+ * from before the payload data. @tvlv_len is reduced by the two removed
+ * bytes.
+ *
+ * Caller needs to ensure that the TVLV bytes are linearized.
+ */
+static void batadv_mcast_forw_push_scrape_padding(struct sk_buff *skb,
+ unsigned short *tvlv_len)
+{
+ const int pad_len = 2;
+
+ /* the padding is taken to be the last pad_len of the *tvlv_len bytes
+ * at the skb data's start
+ */
+ batadv_mcast_forw_scrape(skb, *tvlv_len - pad_len, pad_len);
+ *tvlv_len -= pad_len;
+}
+
+/**
+ * batadv_mcast_forw_push_insert_padding() - insert TVLV padding
+ * @skb: the skb to potentially adjust the TVLV's padding on
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Inserts two padding bytes at the end of the multicast tracker TVLV,
+ * before the payload data in the given skb.
+ *
+ * Return: true on success, false otherwise.
+ */
+static bool batadv_mcast_forw_push_insert_padding(struct sk_buff *skb,
+ unsigned short *tvlv_len)
+{
+ /* remember the TVLV length and data start from before the push */
+ unsigned short offset = *tvlv_len;
+ char *to, *from = skb->data;
+
+ /* grows the skb at its front and adds the padding to *tvlv_len; fails
+ * without side effects if the headroom is too small
+ */
+ to = batadv_mcast_forw_push_padding(skb, tvlv_len);
+ if (!to)
+ return false;
+
+ /* move the previously pushed TVLV bytes to the new front, then zero
+ * the gap this leaves between them and the payload
+ */
+ memmove(to, from, offset);
+ memset(to + offset, 0, *tvlv_len - offset);
+ return true;
+}
+
+/**
+ * batadv_mcast_forw_push_adjust_padding() - adjust padding if necessary
+ * @skb: the skb to potentially adjust the TVLV's padding on
+ * @count: on input the estimated number of originators the multicast packet
+ *  needs to be sent to, on return set to @num_dests_pushed (also if inserting
+ *  the padding failed)
+ * @num_dests_pushed: the number of originators that were actually added to the
+ *  multicast packet's tracker TVLV
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Adjusts the padding in the multicast packet's tracker TVLV depending on the
+ * initially estimated amount of destinations versus the amount of destinations
+ * that were actually added to the tracker TVLV.
+ *
+ * If the initial estimate was correct or at least the oddness was the same then
+ * no padding adjustment is performed.
+ * If the initially estimated number was even, so padding was initially added,
+ * but it turned out to be odd then padding is removed.
+ * If the initially estimated number was odd, so no padding was initially added,
+ * but it turned out to be even then padding is added.
+ *
+ * Return: true if no padding adjustment is needed or the adjustment was
+ * successful, false otherwise.
+ */
+static bool
+batadv_mcast_forw_push_adjust_padding(struct sk_buff *skb, int *count,
+				      unsigned short num_dests_pushed,
+				      unsigned short *tvlv_len)
+{
+	bool ret = true;
+
+	if (likely((num_dests_pushed % 2) == (*count % 2)))
+		goto out;
+
+	/* estimated even number of destinations, but turned out to be odd
+	 * -> remove padding
+	 */
+	if (!(*count % 2) && (num_dests_pushed % 2))
+		batadv_mcast_forw_push_scrape_padding(skb, tvlv_len);
+	/* estimated odd number of destinations, but turned out to be even
+	 * -> add padding
+	 */
+	else if ((*count % 2) && (!(num_dests_pushed % 2)))
+		ret = batadv_mcast_forw_push_insert_padding(skb, tvlv_len);
+
+out:
+	*count = num_dests_pushed;
+	return ret;
+}
+
+/**
+ * batadv_mcast_forw_push_dests() - push originator addresses onto an skb
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the skb to push the destination addresses onto
+ * @vid: the vlan identifier
+ * @is_routable: indicates whether the destination is routable
+ * @count: on input the estimated number of originators the multicast packet
+ * needs to be sent to; once the padding adjustment step is reached it is
+ * overwritten with the number of addresses actually pushed
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Push the MAC addresses of all originators which have indicated interest in
+ * this multicast packet onto the given skb.
+ *
+ * Nothing is rolled back on failure: bytes pushed before the failing step
+ * stay on the skb and remain accounted for in @tvlv_len.
+ *
+ * Return: -ENOMEM if any push step fails, i.e. if there is not enough skb
+ * headroom available, if the maximum number of destinations is exceeded or if
+ * the packet is neither IPv4 nor IPv6. Otherwise, on success 0.
+ */
+static int
+batadv_mcast_forw_push_dests(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int is_routable, int *count,
+ unsigned short *tvlv_len)
+{
+ unsigned short num_dests = 0;
+
+ /* pushed first as it has to end up behind the destination entries */
+ if (!batadv_mcast_forw_push_est_padding(skb, *count, tvlv_len))
+ goto err;
+
+ if (!batadv_mcast_forw_push_tt(bat_priv, skb, vid, &num_dests,
+ tvlv_len))
+ goto err;
+
+ if (!batadv_mcast_forw_push_want_all(bat_priv, skb, vid, &num_dests,
+ tvlv_len))
+ goto err;
+
+ if (is_routable &&
+ !batadv_mcast_forw_push_want_rtr(bat_priv, skb, vid, &num_dests,
+ tvlv_len))
+ goto err;
+
+ /* the padding was chosen from the estimate, correct it in case the
+ * oddness of the actual number of destinations differs
+ */
+ if (!batadv_mcast_forw_push_adjust_padding(skb, count, num_dests,
+ tvlv_len))
+ goto err;
+
+ return 0;
+err:
+ return -ENOMEM;
+}
+
+/**
+ * batadv_mcast_forw_push_tracker() - push a multicast tracker TVLV header
+ * @skb: the skb to push the tracker TVLV onto
+ * @num_dests: the number of destination addresses to set in the header
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Pushes a multicast tracker TVLV header onto the given skb, including the
+ * generic TVLV header but excluding the destination MAC addresses.
+ *
+ * The provided num_dests value is taken into consideration to set the
+ * num_dests field in the tracker header and to set the appropriate TVLV length
+ * value fields.
+ *
+ * Return: -ENOMEM if there is not enough skb headroom available or if the
+ * resulting TVLV length would exceed U16_MAX. Otherwise, on success 0.
+ */
+static int batadv_mcast_forw_push_tracker(struct sk_buff *skb, int num_dests,
+ unsigned short *tvlv_len)
+{
+ struct batadv_tvlv_mcast_tracker *mcast_tracker;
+ struct batadv_tvlv_hdr *tvlv_hdr;
+ unsigned int tvlv_value_len;
+
+ if (skb_headroom(skb) < sizeof(*mcast_tracker) + sizeof(*tvlv_hdr))
+ return -ENOMEM;
+
+ tvlv_value_len = sizeof(*mcast_tracker) + *tvlv_len;
+ if (tvlv_value_len + sizeof(*tvlv_hdr) > U16_MAX)
+ return -ENOMEM;
+
+ batadv_mcast_forw_skb_push(skb, sizeof(*mcast_tracker), tvlv_len);
+ mcast_tracker = (struct batadv_tvlv_mcast_tracker *)skb->data;
+ mcast_tracker->num_dests = htons(num_dests);
+
+ skb_reset_network_header(skb);
+
+ batadv_mcast_forw_skb_push(skb, sizeof(*tvlv_hdr), tvlv_len);
+ tvlv_hdr = (struct batadv_tvlv_hdr *)skb->data;
+ tvlv_hdr->type = BATADV_TVLV_MCAST_TRACKER;
+ tvlv_hdr->version = 1;
+ tvlv_hdr->len = htons(tvlv_value_len);
+
+ return 0;
+}
+
+/**
+ * batadv_mcast_forw_push_tvlvs() - push a multicast tracker TVLV onto an skb
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the skb to push the tracker TVLV onto
+ * @vid: the vlan identifier
+ * @is_routable: indicates whether the destination is routable
+ * @count: the number of originators the multicast packet needs to be sent to
+ * @tvlv_len: stores the amount of currently pushed TVLV bytes
+ *
+ * Pushes a multicast tracker TVLV onto the given skb, including the collected
+ * destination MAC addresses and the generic TVLV header.
+ *
+ * Return: -ENOMEM if there is not enough skb headroom available. Otherwise, on
+ * success 0.
+ */
+static int
+batadv_mcast_forw_push_tvlvs(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int is_routable, int count,
+ unsigned short *tvlv_len)
+{
+ int ret;
+
+ ret = batadv_mcast_forw_push_dests(bat_priv, skb, vid, is_routable,
+ &count, tvlv_len);
+ if (ret < 0)
+ return ret;
+
+ ret = batadv_mcast_forw_push_tracker(skb, count, tvlv_len);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+/**
+ * batadv_mcast_forw_push_hdr() - push a multicast packet header onto an skb
+ * @skb: the skb to push the header onto
+ * @tvlv_len: the total TVLV length value to set in the header
+ *
+ * Pushes a batman-adv multicast packet header onto the given skb and sets
+ * the provided total TVLV length value in it.
+ *
+ * Caller needs to ensure enough skb headroom is available.
+ *
+ * Return: -ENOMEM if there is not enough skb headroom available. Otherwise, on
+ * success 0.
+ */
+static int
+batadv_mcast_forw_push_hdr(struct sk_buff *skb, unsigned short tvlv_len)
+{
+ struct batadv_mcast_packet *mcast_packet;
+
+ if (skb_headroom(skb) < sizeof(*mcast_packet))
+ return -ENOMEM;
+
+ skb_push(skb, sizeof(*mcast_packet));
+
+ mcast_packet = (struct batadv_mcast_packet *)skb->data;
+ mcast_packet->version = BATADV_COMPAT_VERSION;
+ mcast_packet->ttl = BATADV_TTL;
+ mcast_packet->packet_type = BATADV_MCAST;
+ mcast_packet->reserved = 0;
+ mcast_packet->tvlv_len = htons(tvlv_len);
+
+ return 0;
+}
+
+/**
+ * batadv_mcast_forw_scrub_dests() - scrub destinations in a tracker TVLV
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @comp_neigh: next hop neighbor to scrub+collect destinations for
+ * @dest: start MAC entry in original skb's tracker TVLV
+ * @next_dest: start MAC entry in to be sent skb's tracker TVLV
+ * @num_dests: number of remaining destination MAC entries to iterate over
+ *
+ * This sorts destination entries into either the original batman-adv
+ * multicast packet or the skb (copy) that is going to be sent to comp_neigh
+ * next.
+ *
+ * In preparation for the next, to be (unicast) transmitted batman-adv multicast
+ * packet skb to be sent to the given neighbor node, tries to collect all
+ * originator MAC addresses that have the given neighbor node as their next hop
+ * in the to be transmitted skb (copy), which next_dest points into. That is we
+ * zero all destination entries in next_dest which do not have comp_neigh as
+ * their next hop. And zero all destination entries in the original skb that
+ * would have comp_neigh as their next hop (to avoid redundant transmissions and
+ * duplicated payload later).
+ */
+static void
+batadv_mcast_forw_scrub_dests(struct batadv_priv *bat_priv,
+ struct batadv_neigh_node *comp_neigh, u8 *dest,
+ u8 *next_dest, u16 num_dests)
+{
+ struct batadv_neigh_node *next_neigh;
+
+ /* first entry is our reference: zero it in the original and skip it */
+ eth_zero_addr(dest);
+ dest += ETH_ALEN;
+ next_dest += ETH_ALEN;
+ num_dests--;
+
+ batadv_mcast_forw_tracker_for_each_dest2(dest, next_dest, num_dests) {
+ if (is_zero_ether_addr(next_dest))
+ continue;
+
+ /* sanity check, we expect unicast destinations */
+ if (is_multicast_ether_addr(next_dest)) {
+ eth_zero_addr(dest);
+ eth_zero_addr(next_dest);
+ continue;
+ }
+
+ next_neigh = batadv_orig_to_router(bat_priv, next_dest, NULL);
+ if (!next_neigh) {
+ eth_zero_addr(next_dest);
+ continue;
+ }
+
+ if (!batadv_compare_eth(next_neigh->addr, comp_neigh->addr)) {
+ eth_zero_addr(next_dest);
+ batadv_neigh_node_put(next_neigh);
+ continue;
+ }
+
+ /* found an entry for our next packet to transmit, so remove it
+ * from the original packet
+ */
+ eth_zero_addr(dest);
+ batadv_neigh_node_put(next_neigh);
+ }
+}
+
+/**
+ * batadv_mcast_forw_shrink_fill() - swap slot with next non-zero destination
+ * @slot: the to be filled zero-MAC destination entry in a tracker TVLV
+ * @num_dests_slot: remaining entries in tracker TVLV from/including slot
+ *
+ * Searches for the next non-zero-MAC destination entry in a tracker TVLV after
+ * the given slot pointer. And if found, swaps it with the zero-MAC destination
+ * entry which the slot points to.
+ *
+ * Return: true if slot was swapped/filled successfully, false otherwise.
+ */
+static bool batadv_mcast_forw_shrink_fill(u8 *slot, u16 num_dests_slot)
+{
+ u16 num_dests_filler;
+ u8 *filler;
+
+ /* sanity check, should not happen */
+ if (!num_dests_slot)
+ return false;
+
+ num_dests_filler = num_dests_slot - 1;
+ filler = slot + ETH_ALEN;
+
+ /* find a candidate to fill the empty slot */
+ batadv_mcast_forw_tracker_for_each_dest(filler, num_dests_filler) {
+ if (is_zero_ether_addr(filler))
+ continue;
+
+ ether_addr_copy(slot, filler);
+ eth_zero_addr(filler);
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * batadv_mcast_forw_shrink_pack_dests() - pack destinations of a tracker TVLV
+ * @skb: the batman-adv multicast packet to compact destinations in
+ *
+ * Compacts the originator destination MAC addresses in the multicast tracker
+ * TVLV of the given multicast packet. This is done by moving all non-zero
+ * MAC addresses in direction of the skb head and all zero MAC addresses in skb
+ * tail direction, within the multicast tracker TVLV.
+ *
+ * Return: The number of consecutive zero MAC address destinations which are
+ * now at the end of the multicast tracker TVLV.
+ */
+static int batadv_mcast_forw_shrink_pack_dests(struct sk_buff *skb)
+{
+ struct batadv_tvlv_mcast_tracker *mcast_tracker;
+ unsigned char *skb_net_hdr;
+ u16 num_dests_slot;
+ u8 *slot;
+
+ skb_net_hdr = skb_network_header(skb);
+ mcast_tracker = (struct batadv_tvlv_mcast_tracker *)skb_net_hdr;
+ num_dests_slot = ntohs(mcast_tracker->num_dests);
+
+ slot = (u8 *)mcast_tracker + sizeof(*mcast_tracker);
+
+ batadv_mcast_forw_tracker_for_each_dest(slot, num_dests_slot) {
+ /* find an empty slot */
+ if (!is_zero_ether_addr(slot))
+ continue;
+
+ if (!batadv_mcast_forw_shrink_fill(slot, num_dests_slot))
+ /* could not find a filler, so we successfully packed
+ * and can stop - and must not reduce num_dests_slot!
+ */
+ break;
+ }
+
+ /* num_dests_slot is now the amount of reduced, zeroed
+ * destinations at the end of the tracker TVLV
+ */
+ return num_dests_slot;
+}
+
+/**
+ * batadv_mcast_forw_shrink_align_offset() - get new alignment offset
+ * @num_dests_old: the old, to be updated amount of destination nodes
+ * @num_dests_reduce: the number of destinations that were removed
+ *
+ * Calculates the amount of potential extra alignment offset that is needed to
+ * adjust the TVLV padding after the change in destination nodes.
+ *
+ * Return:
+ * 0: If no change to padding is needed.
+ * 2: If padding needs to be removed.
+ * -2: If padding needs to be added.
+ */
+static short
+batadv_mcast_forw_shrink_align_offset(unsigned int num_dests_old,
+ unsigned int num_dests_reduce)
+{
+ /* even amount of removed destinations -> no alignment change */
+ if (!(num_dests_reduce % 2))
+ return 0;
+
+ /* even to odd amount of destinations -> remove padding */
+ if (!(num_dests_old % 2))
+ return 2;
+
+ /* odd to even amount of destinations -> add padding */
+ return -2;
+}
+
+/**
+ * batadv_mcast_forw_shrink_update_headers() - update shrunk mc packet headers
+ * @skb: the batman-adv multicast packet to update headers of
+ * @num_dests_reduce: the number of destinations that were removed
+ *
+ * This updates any fields of a batman-adv multicast packet that are affected
+ * by the reduced number of destinations in the multicast tracker TVLV. In
+ * particular this updates:
+ *
+ * The num_dests field of the multicast tracker TVLV.
+ * The TVLV length field of the according generic TVLV header.
+ * The batman-adv multicast packet's total TVLV length field.
+ *
+ * Return: The offset in skb's tail direction at which the new batman-adv
+ * multicast packet header needs to start.
+ */
+static unsigned int
+batadv_mcast_forw_shrink_update_headers(struct sk_buff *skb,
+ unsigned int num_dests_reduce)
+{
+ struct batadv_tvlv_mcast_tracker *mcast_tracker;
+ struct batadv_mcast_packet *mcast_packet;
+ struct batadv_tvlv_hdr *tvlv_hdr;
+ unsigned char *skb_net_hdr;
+ unsigned int offset;
+ short align_offset;
+ u16 num_dests;
+
+ skb_net_hdr = skb_network_header(skb);
+ mcast_tracker = (struct batadv_tvlv_mcast_tracker *)skb_net_hdr;
+ num_dests = ntohs(mcast_tracker->num_dests);
+
+ align_offset = batadv_mcast_forw_shrink_align_offset(num_dests,
+ num_dests_reduce);
+ offset = ETH_ALEN * num_dests_reduce + align_offset;
+ num_dests -= num_dests_reduce;
+
+ /* update tracker header */
+ mcast_tracker->num_dests = htons(num_dests);
+
+ /* update tracker's tvlv header's length field */
+ tvlv_hdr = (struct batadv_tvlv_hdr *)(skb_network_header(skb) -
+ sizeof(*tvlv_hdr));
+ tvlv_hdr->len = htons(ntohs(tvlv_hdr->len) - offset);
+
+ /* update multicast packet header's tvlv length field */
+ mcast_packet = (struct batadv_mcast_packet *)skb->data;
+ mcast_packet->tvlv_len = htons(ntohs(mcast_packet->tvlv_len) - offset);
+
+ return offset;
+}
+
+/**
+ * batadv_mcast_forw_shrink_move_headers() - move multicast headers by offset
+ * @skb: the batman-adv multicast packet to move headers for
+ * @offset: a non-negative offset to move headers by, towards the skb tail
+ *
+ * Moves the batman-adv multicast packet header, its multicast tracker TVLV and
+ * any TVLVs in between by the given offset in direction towards the tail.
+ */
+static void
+batadv_mcast_forw_shrink_move_headers(struct sk_buff *skb, unsigned int offset)
+{
+ struct batadv_tvlv_mcast_tracker *mcast_tracker;
+ unsigned char *skb_net_hdr;
+ unsigned int len;
+ u16 num_dests;
+
+ skb_net_hdr = skb_network_header(skb);
+ mcast_tracker = (struct batadv_tvlv_mcast_tracker *)skb_net_hdr;
+ num_dests = ntohs(mcast_tracker->num_dests);
+ len = skb_network_offset(skb) + sizeof(*mcast_tracker);
+ len += num_dests * ETH_ALEN;
+
+ batadv_mcast_forw_scrape(skb, len, offset);
+}
+
+/**
+ * batadv_mcast_forw_shrink_tracker() - remove zero addresses in a tracker tvlv
+ * @skb: the batman-adv multicast packet to (potentially) shrink
+ *
+ * Removes all destinations with a zero MAC address (00:00:00:00:00:00) from
+ * the given batman-adv multicast packet's tracker TVLV and updates headers
+ * accordingly to maintain a valid batman-adv multicast packet.
+ */
+static void batadv_mcast_forw_shrink_tracker(struct sk_buff *skb)
+{
+ unsigned int offset;
+ u16 dests_reduced;
+
+ dests_reduced = batadv_mcast_forw_shrink_pack_dests(skb);
+ if (!dests_reduced)
+ return;
+
+ offset = batadv_mcast_forw_shrink_update_headers(skb, dests_reduced);
+ batadv_mcast_forw_shrink_move_headers(skb, offset);
+}
+
+/**
+ * batadv_mcast_forw_packet() - forward a batman-adv multicast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the received or locally generated batman-adv multicast packet
+ * @local_xmit: indicates that the packet was locally generated and not received
+ *
+ * Parses the tracker TVLV of a batman-adv multicast packet and forwards the
+ * packet as indicated in this TVLV.
+ *
+ * Caller needs to set the skb network header to the start of the multicast
+ * tracker TVLV (excluding the generic TVLV header) and the skb transport header
+ * to the next byte after this multicast tracker TVLV.
+ *
+ * Caller needs to free the skb.
+ *
+ * Return: NET_RX_SUCCESS or NET_RX_DROP on success or a negative error
+ * code on failure. NET_RX_SUCCESS if the received packet is supposed to be
+ * decapsulated and forwarded to the own mesh interface, NET_RX_DROP otherwise.
+ */
+static int batadv_mcast_forw_packet(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, bool local_xmit)
+{
+ struct batadv_tvlv_mcast_tracker *mcast_tracker;
+ struct batadv_neigh_node *neigh_node;
+ unsigned long offset, num_dests_off;
+ struct sk_buff *nexthop_skb;
+ unsigned char *skb_net_hdr;
+ bool local_recv = false;
+ unsigned int tvlv_len;
+ bool xmitted = false;
+ u8 *dest, *next_dest;
+ u16 num_dests;
+ int ret;
+
+ /* (at least) TVLV part needs to be linearized */
+ SKB_LINEAR_ASSERT(skb);
+
+ /* check if num_dests is within skb length */
+ num_dests_off = offsetof(struct batadv_tvlv_mcast_tracker, num_dests);
+ if (num_dests_off > skb_network_header_len(skb))
+ return -EINVAL;
+
+ skb_net_hdr = skb_network_header(skb);
+ mcast_tracker = (struct batadv_tvlv_mcast_tracker *)skb_net_hdr;
+ num_dests = ntohs(mcast_tracker->num_dests);
+
+ dest = (u8 *)mcast_tracker + sizeof(*mcast_tracker);
+
+ /* check if full tracker tvlv is within skb length */
+ tvlv_len = sizeof(*mcast_tracker) + ETH_ALEN * num_dests;
+ if (tvlv_len > skb_network_header_len(skb))
+ return -EINVAL;
+
+ /* invalidate checksum: */
+ skb->ip_summed = CHECKSUM_NONE;
+
+ batadv_mcast_forw_tracker_for_each_dest(dest, num_dests) {
+ if (is_zero_ether_addr(dest))
+ continue;
+
+ /* only unicast originator addresses supported */
+ if (is_multicast_ether_addr(dest)) {
+ eth_zero_addr(dest);
+ continue;
+ }
+
+ if (batadv_is_my_mac(bat_priv, dest)) {
+ eth_zero_addr(dest);
+ local_recv = true;
+ continue;
+ }
+
+ neigh_node = batadv_orig_to_router(bat_priv, dest, NULL);
+ if (!neigh_node) {
+ eth_zero_addr(dest);
+ continue;
+ }
+
+ nexthop_skb = skb_copy(skb, GFP_ATOMIC);
+ if (!nexthop_skb) {
+ batadv_neigh_node_put(neigh_node);
+ return -ENOMEM;
+ }
+
+ offset = dest - skb->data;
+ next_dest = nexthop_skb->data + offset;
+
+ batadv_mcast_forw_scrub_dests(bat_priv, neigh_node, dest,
+ next_dest, num_dests);
+ batadv_mcast_forw_shrink_tracker(nexthop_skb);
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_MCAST_TX);
+ batadv_add_counter(bat_priv, BATADV_CNT_MCAST_TX_BYTES,
+ nexthop_skb->len + ETH_HLEN);
+ xmitted = true;
+ ret = batadv_send_unicast_skb(nexthop_skb, neigh_node);
+
+ batadv_neigh_node_put(neigh_node);
+
+ if (ret < 0)
+ return ret;
+ }
+
+ if (xmitted) {
+ if (local_xmit) {
+ batadv_inc_counter(bat_priv, BATADV_CNT_MCAST_TX_LOCAL);
+ batadv_add_counter(bat_priv,
+ BATADV_CNT_MCAST_TX_LOCAL_BYTES,
+ skb->len -
+ skb_transport_offset(skb));
+ } else {
+ batadv_inc_counter(bat_priv, BATADV_CNT_MCAST_FWD);
+ batadv_add_counter(bat_priv, BATADV_CNT_MCAST_FWD_BYTES,
+ skb->len + ETH_HLEN);
+ }
+ }
+
+ if (local_recv)
+ return NET_RX_SUCCESS;
+ else
+ return NET_RX_DROP;
+}
+
+/**
+ * batadv_mcast_forw_tracker_tvlv_handler() - handle an mcast tracker tvlv
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the received batman-adv multicast packet
+ *
+ * Parses the tracker TVLV of an incoming batman-adv multicast packet and
+ * forwards the packet as indicated in this TVLV.
+ *
+ * Caller needs to set the skb network header to the start of the multicast
+ * tracker TVLV (excluding the generic TVLV header) and the skb transport header
+ * to the next byte after this multicast tracker TVLV.
+ *
+ * Caller needs to free the skb.
+ *
+ * Return: NET_RX_SUCCESS or NET_RX_DROP on success or a negative error
+ * code on failure. NET_RX_SUCCESS if the received packet is supposed to be
+ * decapsulated and forwarded to the own mesh interface, NET_RX_DROP otherwise.
+ */
+int batadv_mcast_forw_tracker_tvlv_handler(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ return batadv_mcast_forw_packet(bat_priv, skb, false);
+}
+
+/**
+ * batadv_mcast_forw_packet_hdrlen() - multicast packet header length
+ * @num_dests: number of destination nodes
+ *
+ * Calculates the total batman-adv multicast packet header length for a given
+ * number of destination nodes (excluding the outer ethernet frame).
+ *
+ * Return: The calculated total batman-adv multicast packet header length.
+ */
+unsigned int batadv_mcast_forw_packet_hdrlen(unsigned int num_dests)
+{
+ /*
+ * If the number of destination entries is even then we need to add
+ * two byte padding to the tracker TVLV.
+ */
+ int padding = (!(num_dests % 2)) ? 2 : 0;
+
+ return padding + num_dests * ETH_ALEN +
+ sizeof(struct batadv_tvlv_mcast_tracker) +
+ sizeof(struct batadv_tvlv_hdr) +
+ sizeof(struct batadv_mcast_packet);
+}
+
+/**
+ * batadv_mcast_forw_expand_head() - expand headroom for an mcast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to send
+ *
+ * Tries to expand an skb's headroom so that its head to tail is 1298
+ * bytes (minimum IPv6 MTU + vlan ethernet header size) large.
+ *
+ * Return: -EINVAL if the given skb's length is too large or -ENOMEM on memory
+ * allocation failure. Otherwise, on success, zero is returned.
+ */
+static int batadv_mcast_forw_expand_head(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ int hdr_size = VLAN_ETH_HLEN + IPV6_MIN_MTU - skb->len;
+
+ /* TODO: Could be tightened to actual number of destination nodes?
+ * But it's tricky, number of destinations might have increased since
+ * we last checked.
+ */
+ if (hdr_size < 0) {
+ /* batadv_mcast_forw_mode_check_count() should ensure we do not
+ * end up here
+ */
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ if (skb_headroom(skb) < hdr_size &&
+ pskb_expand_head(skb, hdr_size, 0, GFP_ATOMIC) < 0)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/**
+ * batadv_mcast_forw_push() - encapsulate skb in a batman-adv multicast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to encapsulate and send
+ * @vid: the vlan identifier
+ * @is_routable: indicates whether the destination is routable
+ * @count: the number of originators the multicast packet needs to be sent to
+ *
+ * Encapsulates the given multicast packet in a batman-adv multicast packet.
+ * A multicast tracker TVLV with destination originator addresses for any node
+ * that signaled interest in it, that is either via the translation table or the
+ * according want-all flags, is attached accordingly.
+ *
+ * Return: true on success, false otherwise.
+ */
+bool batadv_mcast_forw_push(struct batadv_priv *bat_priv, struct sk_buff *skb,
+ unsigned short vid, int is_routable, int count)
+{
+ unsigned short tvlv_len = 0;
+ int ret;
+
+ if (batadv_mcast_forw_expand_head(bat_priv, skb) < 0)
+ goto err;
+
+ skb_reset_transport_header(skb);
+
+ ret = batadv_mcast_forw_push_tvlvs(bat_priv, skb, vid, is_routable,
+ count, &tvlv_len);
+ if (ret < 0)
+ goto err;
+
+ ret = batadv_mcast_forw_push_hdr(skb, tvlv_len);
+ if (ret < 0)
+ goto err;
+
+ return true;
+
+err:
+ if (tvlv_len)
+ skb_pull(skb, tvlv_len);
+
+ return false;
+}
+
+/**
+ * batadv_mcast_forw_mcsend() - send a self prepared batman-adv multicast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: the multicast packet to encapsulate and send
+ *
+ * Transmits a batman-adv multicast packet that was locally prepared and
+ * consumes/frees it.
+ *
+ * Return: NET_XMIT_DROP if batadv_mcast_forw_packet() returned an error code.
+ * NET_XMIT_SUCCESS otherwise.
+ */
+int batadv_mcast_forw_mcsend(struct batadv_priv *bat_priv,
+ struct sk_buff *skb)
+{
+ int ret = batadv_mcast_forw_packet(bat_priv, skb, true);
+
+ if (ret < 0) {
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+ }
+
+ consume_skb(skb);
+ return NET_XMIT_SUCCESS;
+}
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 0d9459b69bdb..78c651f634cd 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -1,43 +1,35 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2016-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Matthias Schiffer
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "netlink.h"
#include "main.h"
+#include <linux/array_size.h>
#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
#include <linux/byteorder/generic.h>
#include <linux/cache.h>
+#include <linux/err.h>
#include <linux/errno.h>
-#include <linux/export.h>
-#include <linux/genetlink.h>
#include <linux/gfp.h>
#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
#include <linux/init.h>
-#include <linux/kernel.h>
+#include <linux/limits.h>
+#include <linux/minmax.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/printk.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
+#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <net/genetlink.h>
+#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/sock.h>
#include <uapi/linux/batadv_packet.h>
@@ -47,10 +39,12 @@
#include "bridge_loop_avoidance.h"
#include "distributed-arp-table.h"
#include "gateway_client.h"
+#include "gateway_common.h"
#include "hard-interface.h"
+#include "log.h"
+#include "mesh-interface.h"
#include "multicast.h"
#include "originator.h"
-#include "soft-interface.h"
#include "tp_meter.h"
#include "translation-table.h"
@@ -58,10 +52,38 @@ struct genl_family batadv_netlink_family;
/* multicast groups */
enum batadv_netlink_multicast_groups {
+ BATADV_NL_MCGRP_CONFIG,
BATADV_NL_MCGRP_TPMETER,
};
+/**
+ * enum batadv_genl_ops_flags - flags for genl_ops's internal_flags
+ */
+enum batadv_genl_ops_flags {
+ /**
+ * @BATADV_FLAG_NEED_MESH: request requires valid mesh interface in
+ * attribute BATADV_ATTR_MESH_IFINDEX and expects a pointer to it to be
+ * saved in info->user_ptr[0]
+ */
+ BATADV_FLAG_NEED_MESH = BIT(0),
+
+ /**
+ * @BATADV_FLAG_NEED_HARDIF: request requires valid hard interface in
+ * attribute BATADV_ATTR_HARD_IFINDEX and expects a pointer to it to be
+ * saved in info->user_ptr[1]
+ */
+ BATADV_FLAG_NEED_HARDIF = BIT(1),
+
+ /**
+ * @BATADV_FLAG_NEED_VLAN: request requires valid vlan in
+ * attribute BATADV_ATTR_VLANID and expects a pointer to it to be
+ * saved in info->user_ptr[1]
+ */
+ BATADV_FLAG_NEED_VLAN = BIT(2),
+};
+
static const struct genl_multicast_group batadv_netlink_mcgrps[] = {
+ [BATADV_NL_MCGRP_CONFIG] = { .name = BATADV_NL_MCAST_GROUP_CONFIG },
[BATADV_NL_MCGRP_TPMETER] = { .name = BATADV_NL_MCAST_GROUP_TPMETER },
};
@@ -104,6 +126,26 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
[BATADV_ATTR_DAT_CACHE_VID] = { .type = NLA_U16 },
[BATADV_ATTR_MCAST_FLAGS] = { .type = NLA_U32 },
[BATADV_ATTR_MCAST_FLAGS_PRIV] = { .type = NLA_U32 },
+ [BATADV_ATTR_VLANID] = { .type = NLA_U16 },
+ [BATADV_ATTR_AGGREGATED_OGMS_ENABLED] = { .type = NLA_U8 },
+ [BATADV_ATTR_AP_ISOLATION_ENABLED] = { .type = NLA_U8 },
+ [BATADV_ATTR_ISOLATION_MARK] = { .type = NLA_U32 },
+ [BATADV_ATTR_ISOLATION_MASK] = { .type = NLA_U32 },
+ [BATADV_ATTR_BONDING_ENABLED] = { .type = NLA_U8 },
+ [BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED] = { .type = NLA_U8 },
+ [BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED] = { .type = NLA_U8 },
+ [BATADV_ATTR_FRAGMENTATION_ENABLED] = { .type = NLA_U8 },
+ [BATADV_ATTR_GW_BANDWIDTH_DOWN] = { .type = NLA_U32 },
+ [BATADV_ATTR_GW_BANDWIDTH_UP] = { .type = NLA_U32 },
+ [BATADV_ATTR_GW_MODE] = { .type = NLA_U8 },
+ [BATADV_ATTR_GW_SEL_CLASS] = { .type = NLA_U32 },
+ [BATADV_ATTR_HOP_PENALTY] = { .type = NLA_U8 },
+ [BATADV_ATTR_LOG_LEVEL] = { .type = NLA_U32 },
+ [BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED] = { .type = NLA_U8 },
+ [BATADV_ATTR_MULTICAST_FANOUT] = { .type = NLA_U32 },
+ [BATADV_ATTR_ORIG_INTERVAL] = { .type = NLA_U32 },
+ [BATADV_ATTR_ELP_INTERVAL] = { .type = NLA_U32 },
+ [BATADV_ATTR_THROUGHPUT_OVERRIDE] = { .type = NLA_U32 },
};
/**
@@ -113,49 +155,103 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
*
* Return: interface index, or 0.
*/
-int
-batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype)
+static int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype)
{
struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype);
- return attr ? nla_get_u32(attr) : 0;
+ return (attr && nla_len(attr) == sizeof(u32)) ? nla_get_u32(attr) : 0;
}
/**
- * batadv_netlink_mesh_info_put() - fill in generic information about mesh
- * interface
- * @msg: netlink message to be sent back
- * @soft_iface: interface for which the data should be taken
+ * batadv_netlink_mesh_fill_ap_isolation() - Add ap_isolation meshif attribute
+ * @msg: Netlink message to dump into
+ * @bat_priv: the bat priv with all the mesh interface information
*
- * Return: 0 on success, < 0 on error
+ * Return: 0 on success or negative error number in case of failure
*/
-static int
-batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface)
+static int batadv_netlink_mesh_fill_ap_isolation(struct sk_buff *msg,
+ struct batadv_priv *bat_priv)
{
- struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ struct batadv_meshif_vlan *vlan;
+ u8 ap_isolation;
+
+ vlan = batadv_meshif_vlan_get(bat_priv, BATADV_NO_FLAGS);
+ if (!vlan)
+ return 0;
+
+ ap_isolation = atomic_read(&vlan->ap_isolation);
+ batadv_meshif_vlan_put(vlan);
+
+ return nla_put_u8(msg, BATADV_ATTR_AP_ISOLATION_ENABLED,
+ !!ap_isolation);
+}
+
+/**
+ * batadv_netlink_set_mesh_ap_isolation() - Set ap_isolation from genl msg
+ * @attr: parsed BATADV_ATTR_AP_ISOLATION_ENABLED attribute
+ * @bat_priv: the bat priv with all the mesh interface information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_set_mesh_ap_isolation(struct nlattr *attr,
+ struct batadv_priv *bat_priv)
+{
+ struct batadv_meshif_vlan *vlan;
+
+ vlan = batadv_meshif_vlan_get(bat_priv, BATADV_NO_FLAGS);
+ if (!vlan)
+ return -ENOENT;
+
+ atomic_set(&vlan->ap_isolation, !!nla_get_u8(attr));
+ batadv_meshif_vlan_put(vlan);
+
+ return 0;
+}
+
+/**
+ * batadv_netlink_mesh_fill() - Fill message with mesh attributes
+ * @msg: Netlink message to dump into
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @cmd: type of message to generate
+ * @portid: Port making netlink request
+ * @seq: sequence number for message
+ * @flags: Additional flags for message
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_mesh_fill(struct sk_buff *msg,
+ struct batadv_priv *bat_priv,
+ enum batadv_nl_commands cmd,
+ u32 portid, u32 seq, int flags)
+{
+ struct net_device *mesh_iface = bat_priv->mesh_iface;
struct batadv_hard_iface *primary_if = NULL;
struct net_device *hard_iface;
- int ret = -ENOBUFS;
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd);
+ if (!hdr)
+ return -ENOBUFS;
if (nla_put_string(msg, BATADV_ATTR_VERSION, BATADV_SOURCE_VERSION) ||
nla_put_string(msg, BATADV_ATTR_ALGO_NAME,
bat_priv->algo_ops->name) ||
- nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, soft_iface->ifindex) ||
- nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, soft_iface->name) ||
+ nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, mesh_iface->ifindex) ||
+ nla_put_string(msg, BATADV_ATTR_MESH_IFNAME, mesh_iface->name) ||
nla_put(msg, BATADV_ATTR_MESH_ADDRESS, ETH_ALEN,
- soft_iface->dev_addr) ||
+ mesh_iface->dev_addr) ||
nla_put_u8(msg, BATADV_ATTR_TT_TTVN,
(u8)atomic_read(&bat_priv->tt.vn)))
- goto out;
+ goto nla_put_failure;
#ifdef CONFIG_BATMAN_ADV_BLA
if (nla_put_u16(msg, BATADV_ATTR_BLA_CRC,
ntohs(bat_priv->bla.claim_dest.group)))
- goto out;
+ goto nla_put_failure;
#endif
if (batadv_mcast_mesh_info_put(msg, bat_priv))
- goto out;
+ goto nla_put_failure;
primary_if = batadv_primary_if_get_selected(bat_priv);
if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) {
@@ -167,77 +263,338 @@ batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface)
hard_iface->name) ||
nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN,
hard_iface->dev_addr))
- goto out;
+ goto nla_put_failure;
}
- ret = 0;
+ if (nla_put_u8(msg, BATADV_ATTR_AGGREGATED_OGMS_ENABLED,
+ !!atomic_read(&bat_priv->aggregated_ogms)))
+ goto nla_put_failure;
- out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+ if (batadv_netlink_mesh_fill_ap_isolation(msg, bat_priv))
+ goto nla_put_failure;
- return ret;
+ if (nla_put_u32(msg, BATADV_ATTR_ISOLATION_MARK,
+ bat_priv->isolation_mark))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, BATADV_ATTR_ISOLATION_MASK,
+ bat_priv->isolation_mark_mask))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, BATADV_ATTR_BONDING_ENABLED,
+ !!atomic_read(&bat_priv->bonding)))
+ goto nla_put_failure;
+
+#ifdef CONFIG_BATMAN_ADV_BLA
+ if (nla_put_u8(msg, BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED,
+ !!atomic_read(&bat_priv->bridge_loop_avoidance)))
+ goto nla_put_failure;
+#endif /* CONFIG_BATMAN_ADV_BLA */
+
+#ifdef CONFIG_BATMAN_ADV_DAT
+ if (nla_put_u8(msg, BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED,
+ !!atomic_read(&bat_priv->distributed_arp_table)))
+ goto nla_put_failure;
+#endif /* CONFIG_BATMAN_ADV_DAT */
+
+ if (nla_put_u8(msg, BATADV_ATTR_FRAGMENTATION_ENABLED,
+ !!atomic_read(&bat_priv->fragmentation)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, BATADV_ATTR_GW_BANDWIDTH_DOWN,
+ atomic_read(&bat_priv->gw.bandwidth_down)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, BATADV_ATTR_GW_BANDWIDTH_UP,
+ atomic_read(&bat_priv->gw.bandwidth_up)))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, BATADV_ATTR_GW_MODE,
+ atomic_read(&bat_priv->gw.mode)))
+ goto nla_put_failure;
+
+ if (bat_priv->algo_ops->gw.get_best_gw_node &&
+ bat_priv->algo_ops->gw.is_eligible) {
+ /* GW selection class is not available if the routing algorithm
+ * in use does not implement the GW API
+ */
+ if (nla_put_u32(msg, BATADV_ATTR_GW_SEL_CLASS,
+ atomic_read(&bat_priv->gw.sel_class)))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY,
+ atomic_read(&bat_priv->hop_penalty)))
+ goto nla_put_failure;
+
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+ if (nla_put_u32(msg, BATADV_ATTR_LOG_LEVEL,
+ atomic_read(&bat_priv->log_level)))
+ goto nla_put_failure;
+#endif /* CONFIG_BATMAN_ADV_DEBUG */
+
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ if (nla_put_u8(msg, BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED,
+ !atomic_read(&bat_priv->multicast_mode)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, BATADV_ATTR_MULTICAST_FANOUT,
+ atomic_read(&bat_priv->multicast_fanout)))
+ goto nla_put_failure;
+#endif /* CONFIG_BATMAN_ADV_MCAST */
+
+ if (nla_put_u32(msg, BATADV_ATTR_ORIG_INTERVAL,
+ atomic_read(&bat_priv->orig_interval)))
+ goto nla_put_failure;
+
+ batadv_hardif_put(primary_if);
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ batadv_hardif_put(primary_if);
+
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
}
/**
- * batadv_netlink_get_mesh_info() - handle incoming BATADV_CMD_GET_MESH_INFO
- * netlink request
- * @skb: received netlink message
- * @info: receiver information
+ * batadv_netlink_notify_mesh() - send meshif attributes to listener
+ * @bat_priv: the bat priv with all the mesh interface information
*
* Return: 0 on success, < 0 on error
*/
-static int
-batadv_netlink_get_mesh_info(struct sk_buff *skb, struct genl_info *info)
+static int batadv_netlink_notify_mesh(struct batadv_priv *bat_priv)
{
- struct net *net = genl_info_net(info);
- struct net_device *soft_iface;
- struct sk_buff *msg = NULL;
- void *msg_head;
- int ifindex;
+ struct sk_buff *msg;
int ret;
- if (!info->attrs[BATADV_ATTR_MESH_IFINDEX])
- return -EINVAL;
-
- ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]);
- if (!ifindex)
- return -EINVAL;
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
+ ret = batadv_netlink_mesh_fill(msg, bat_priv, BATADV_CMD_SET_MESH,
+ 0, 0, 0);
+ if (ret < 0) {
+ nlmsg_free(msg);
+ return ret;
}
+ genlmsg_multicast_netns(&batadv_netlink_family,
+ dev_net(bat_priv->mesh_iface), msg, 0,
+ BATADV_NL_MCGRP_CONFIG, GFP_KERNEL);
+
+ return 0;
+}
+
+/**
+ * batadv_netlink_get_mesh() - Get meshif attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_get_mesh(struct sk_buff *skb, struct genl_info *info)
+{
+ struct batadv_priv *bat_priv = info->user_ptr[0];
+ struct sk_buff *msg;
+ int ret;
+
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg) {
- ret = -ENOMEM;
- goto out;
+ if (!msg)
+ return -ENOMEM;
+
+ ret = batadv_netlink_mesh_fill(msg, bat_priv, BATADV_CMD_GET_MESH,
+ info->snd_portid, info->snd_seq, 0);
+ if (ret < 0) {
+ nlmsg_free(msg);
+ return ret;
}
- msg_head = genlmsg_put(msg, info->snd_portid, info->snd_seq,
- &batadv_netlink_family, 0,
- BATADV_CMD_GET_MESH_INFO);
- if (!msg_head) {
- ret = -ENOBUFS;
- goto out;
+ ret = genlmsg_reply(msg, info);
+
+ return ret;
+}
+
+/**
+ * batadv_netlink_set_mesh() - Set meshif attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
+{
+ struct batadv_priv *bat_priv = info->user_ptr[0];
+ struct nlattr *attr;
+
+ if (info->attrs[BATADV_ATTR_AGGREGATED_OGMS_ENABLED]) {
+ attr = info->attrs[BATADV_ATTR_AGGREGATED_OGMS_ENABLED];
+
+ atomic_set(&bat_priv->aggregated_ogms, !!nla_get_u8(attr));
}
- ret = batadv_netlink_mesh_info_put(msg, soft_iface);
+ if (info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]) {
+ attr = info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED];
- out:
- if (soft_iface)
- dev_put(soft_iface);
+ batadv_netlink_set_mesh_ap_isolation(attr, bat_priv);
+ }
- if (ret) {
- if (msg)
- nlmsg_free(msg);
- return ret;
+ if (info->attrs[BATADV_ATTR_ISOLATION_MARK]) {
+ attr = info->attrs[BATADV_ATTR_ISOLATION_MARK];
+
+ bat_priv->isolation_mark = nla_get_u32(attr);
}
- genlmsg_end(msg, msg_head);
- return genlmsg_reply(msg, info);
+ if (info->attrs[BATADV_ATTR_ISOLATION_MASK]) {
+ attr = info->attrs[BATADV_ATTR_ISOLATION_MASK];
+
+ bat_priv->isolation_mark_mask = nla_get_u32(attr);
+ }
+
+ if (info->attrs[BATADV_ATTR_BONDING_ENABLED]) {
+ attr = info->attrs[BATADV_ATTR_BONDING_ENABLED];
+
+ atomic_set(&bat_priv->bonding, !!nla_get_u8(attr));
+ }
+
+#ifdef CONFIG_BATMAN_ADV_BLA
+ if (info->attrs[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED]) {
+ attr = info->attrs[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED];
+
+ atomic_set(&bat_priv->bridge_loop_avoidance,
+ !!nla_get_u8(attr));
+ batadv_bla_status_update(bat_priv->mesh_iface);
+ }
+#endif /* CONFIG_BATMAN_ADV_BLA */
+
+#ifdef CONFIG_BATMAN_ADV_DAT
+ if (info->attrs[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED]) {
+ attr = info->attrs[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED];
+
+ atomic_set(&bat_priv->distributed_arp_table,
+ !!nla_get_u8(attr));
+ batadv_dat_status_update(bat_priv->mesh_iface);
+ }
+#endif /* CONFIG_BATMAN_ADV_DAT */
+
+ if (info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED]) {
+ attr = info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED];
+
+ atomic_set(&bat_priv->fragmentation, !!nla_get_u8(attr));
+
+ rtnl_lock();
+ batadv_update_min_mtu(bat_priv->mesh_iface);
+ rtnl_unlock();
+ }
+
+ if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN]) {
+ attr = info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN];
+
+ atomic_set(&bat_priv->gw.bandwidth_down, nla_get_u32(attr));
+ batadv_gw_tvlv_container_update(bat_priv);
+ }
+
+ if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_UP]) {
+ attr = info->attrs[BATADV_ATTR_GW_BANDWIDTH_UP];
+
+ atomic_set(&bat_priv->gw.bandwidth_up, nla_get_u32(attr));
+ batadv_gw_tvlv_container_update(bat_priv);
+ }
+
+ if (info->attrs[BATADV_ATTR_GW_MODE]) {
+ u8 gw_mode;
+
+ attr = info->attrs[BATADV_ATTR_GW_MODE];
+ gw_mode = nla_get_u8(attr);
+
+ if (gw_mode <= BATADV_GW_MODE_SERVER) {
+ /* Invoking batadv_gw_reselect() is not enough to really
+ * de-select the current GW. It will only instruct the
+ * gateway client code to perform a re-election the next
+ * time that this is needed.
+ *
+ * When gw client mode is being switched off the current
+ * GW must be de-selected explicitly otherwise no GW_ADD
+ * uevent is thrown on client mode re-activation. This
+ * operation is performed in
+ * batadv_gw_check_client_stop().
+ */
+ batadv_gw_reselect(bat_priv);
+
+ /* always call batadv_gw_check_client_stop() before
+ * changing the gateway state
+ */
+ batadv_gw_check_client_stop(bat_priv);
+ atomic_set(&bat_priv->gw.mode, gw_mode);
+ batadv_gw_tvlv_container_update(bat_priv);
+ }
+ }
+
+ if (info->attrs[BATADV_ATTR_GW_SEL_CLASS] &&
+ bat_priv->algo_ops->gw.get_best_gw_node &&
+ bat_priv->algo_ops->gw.is_eligible) {
+ /* setting the GW selection class is allowed only if the routing
+ * algorithm in use implements the GW API
+ */
+
+ u32 sel_class_max = bat_priv->algo_ops->gw.sel_class_max;
+ u32 sel_class;
+
+ attr = info->attrs[BATADV_ATTR_GW_SEL_CLASS];
+ sel_class = nla_get_u32(attr);
+
+ if (sel_class >= 1 && sel_class <= sel_class_max) {
+ atomic_set(&bat_priv->gw.sel_class, sel_class);
+ batadv_gw_reselect(bat_priv);
+ }
+ }
+
+ if (info->attrs[BATADV_ATTR_HOP_PENALTY]) {
+ attr = info->attrs[BATADV_ATTR_HOP_PENALTY];
+
+ atomic_set(&bat_priv->hop_penalty, nla_get_u8(attr));
+ }
+
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+ if (info->attrs[BATADV_ATTR_LOG_LEVEL]) {
+ attr = info->attrs[BATADV_ATTR_LOG_LEVEL];
+
+ atomic_set(&bat_priv->log_level,
+ nla_get_u32(attr) & BATADV_DBG_ALL);
+ }
+#endif /* CONFIG_BATMAN_ADV_DEBUG */
+
+#ifdef CONFIG_BATMAN_ADV_MCAST
+ if (info->attrs[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED]) {
+ attr = info->attrs[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED];
+
+ atomic_set(&bat_priv->multicast_mode, !nla_get_u8(attr));
+ }
+
+ if (info->attrs[BATADV_ATTR_MULTICAST_FANOUT]) {
+ attr = info->attrs[BATADV_ATTR_MULTICAST_FANOUT];
+
+ atomic_set(&bat_priv->multicast_fanout, nla_get_u32(attr));
+ }
+#endif /* CONFIG_BATMAN_ADV_MCAST */
+
+ if (info->attrs[BATADV_ATTR_ORIG_INTERVAL]) {
+ u32 orig_interval;
+
+ attr = info->attrs[BATADV_ATTR_ORIG_INTERVAL];
+ orig_interval = nla_get_u32(attr);
+
+ orig_interval = min_t(u32, orig_interval, INT_MAX);
+ orig_interval = max_t(u32, orig_interval, 2 * BATADV_JITTER);
+
+ atomic_set(&bat_priv->orig_interval, orig_interval);
+ }
+
+ batadv_netlink_notify_mesh(bat_priv);
+
+ return 0;
}
/**
@@ -258,10 +615,10 @@ batadv_netlink_tp_meter_put(struct sk_buff *msg, u32 cookie)
/**
* batadv_netlink_tpmeter_notify() - send tp_meter result via netlink to client
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @dst: destination of tp_meter session
* @result: reason for tp meter session stop
- * @test_time: total time ot the tp_meter session
+ * @test_time: total time of the tp_meter session
* @total_bytes: bytes acked to the receiver
* @cookie: cookie of tp_meter session
*
@@ -305,7 +662,7 @@ int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst,
genlmsg_end(msg, hdr);
genlmsg_multicast_netns(&batadv_netlink_family,
- dev_net(bat_priv->soft_iface), msg, 0,
+ dev_net(bat_priv->mesh_iface), msg, 0,
BATADV_NL_MCGRP_TPMETER, GFP_KERNEL);
return 0;
@@ -329,40 +686,24 @@ err_genlmsg:
static int
batadv_netlink_tp_meter_start(struct sk_buff *skb, struct genl_info *info)
{
- struct net *net = genl_info_net(info);
- struct net_device *soft_iface;
- struct batadv_priv *bat_priv;
+ struct batadv_priv *bat_priv = info->user_ptr[0];
struct sk_buff *msg = NULL;
u32 test_length;
void *msg_head;
- int ifindex;
u32 cookie;
u8 *dst;
int ret;
- if (!info->attrs[BATADV_ATTR_MESH_IFINDEX])
- return -EINVAL;
-
if (!info->attrs[BATADV_ATTR_ORIG_ADDRESS])
return -EINVAL;
if (!info->attrs[BATADV_ATTR_TPMETER_TEST_TIME])
return -EINVAL;
- ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]);
- if (!ifindex)
- return -EINVAL;
-
dst = nla_data(info->attrs[BATADV_ATTR_ORIG_ADDRESS]);
test_length = nla_get_u32(info->attrs[BATADV_ATTR_TPMETER_TEST_TIME]);
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
-
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg) {
ret = -ENOMEM;
@@ -377,15 +718,11 @@ batadv_netlink_tp_meter_start(struct sk_buff *skb, struct genl_info *info)
goto out;
}
- bat_priv = netdev_priv(soft_iface);
batadv_tp_start(bat_priv, dst, test_length, &cookie);
ret = batadv_netlink_tp_meter_put(msg, cookie);
out:
- if (soft_iface)
- dev_put(soft_iface);
-
if (ret) {
if (msg)
nlmsg_free(msg);
@@ -397,7 +734,7 @@ batadv_netlink_tp_meter_start(struct sk_buff *skb, struct genl_info *info)
}
/**
- * batadv_netlink_tp_meter_start() - Cancel a running tp_meter session
+ * batadv_netlink_tp_meter_cancel() - Cancel a running tp_meter session
* @skb: received netlink message
* @info: receiver information
*
@@ -406,61 +743,57 @@ batadv_netlink_tp_meter_start(struct sk_buff *skb, struct genl_info *info)
static int
batadv_netlink_tp_meter_cancel(struct sk_buff *skb, struct genl_info *info)
{
- struct net *net = genl_info_net(info);
- struct net_device *soft_iface;
- struct batadv_priv *bat_priv;
- int ifindex;
+ struct batadv_priv *bat_priv = info->user_ptr[0];
u8 *dst;
int ret = 0;
- if (!info->attrs[BATADV_ATTR_MESH_IFINDEX])
- return -EINVAL;
-
if (!info->attrs[BATADV_ATTR_ORIG_ADDRESS])
return -EINVAL;
- ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]);
- if (!ifindex)
- return -EINVAL;
-
dst = nla_data(info->attrs[BATADV_ATTR_ORIG_ADDRESS]);
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
-
- bat_priv = netdev_priv(soft_iface);
batadv_tp_stop(bat_priv, dst, BATADV_TP_REASON_CANCEL);
-out:
- if (soft_iface)
- dev_put(soft_iface);
-
return ret;
}
/**
- * batadv_netlink_dump_hardif_entry() - Dump one hard interface into a message
+ * batadv_netlink_hardif_fill() - Fill message with hardif attributes
* @msg: Netlink message to dump into
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @hard_iface: hard interface which was modified
+ * @cmd: type of message to generate
* @portid: Port making netlink request
- * @seq: Sequence number of netlink message
- * @hard_iface: Hard interface to dump
+ * @seq: sequence number for message
+ * @flags: Additional flags for message
+ * @cb: Control block containing additional options
*
- * Return: error code, or 0 on success
+ * Return: 0 on success or negative error number in case of failure
*/
-static int
-batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, u32 seq,
- struct batadv_hard_iface *hard_iface)
+static int batadv_netlink_hardif_fill(struct sk_buff *msg,
+ struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *hard_iface,
+ enum batadv_nl_commands cmd,
+ u32 portid, u32 seq, int flags,
+ struct netlink_callback *cb)
{
struct net_device *net_dev = hard_iface->net_dev;
void *hdr;
- hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, NLM_F_MULTI,
- BATADV_CMD_GET_HARDIFS);
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd);
if (!hdr)
- return -EMSGSIZE;
+ return -ENOBUFS;
+
+ if (cb)
+ genl_dump_check_consistent(cb, hdr);
+
+ if (nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX,
+ bat_priv->mesh_iface->ifindex))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME,
+ bat_priv->mesh_iface->name))
+ goto nla_put_failure;
if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX,
net_dev->ifindex) ||
@@ -475,158 +808,717 @@ batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, u32 seq,
goto nla_put_failure;
}
+ if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY,
+ atomic_read(&hard_iface->hop_penalty)))
+ goto nla_put_failure;
+
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+ if (nla_put_u32(msg, BATADV_ATTR_ELP_INTERVAL,
+ atomic_read(&hard_iface->bat_v.elp_interval)))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, BATADV_ATTR_THROUGHPUT_OVERRIDE,
+ atomic_read(&hard_iface->bat_v.throughput_override)))
+ goto nla_put_failure;
+#endif /* CONFIG_BATMAN_ADV_BATMAN_V */
+
genlmsg_end(msg, hdr);
return 0;
- nla_put_failure:
+nla_put_failure:
genlmsg_cancel(msg, hdr);
return -EMSGSIZE;
}
/**
- * batadv_netlink_dump_hardifs() - Dump all hard interface into a messages
+ * batadv_netlink_notify_hardif() - send hardif attributes to listener
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @hard_iface: hard interface which was modified
+ *
+ * Return: 0 on success, < 0 on error
+ */
+static int batadv_netlink_notify_hardif(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *hard_iface)
+{
+ struct sk_buff *msg;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = batadv_netlink_hardif_fill(msg, bat_priv, hard_iface,
+ BATADV_CMD_SET_HARDIF, 0, 0, 0, NULL);
+ if (ret < 0) {
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ genlmsg_multicast_netns(&batadv_netlink_family,
+ dev_net(bat_priv->mesh_iface), msg, 0,
+ BATADV_NL_MCGRP_CONFIG, GFP_KERNEL);
+
+ return 0;
+}
+
+/**
+ * batadv_netlink_cmd_get_hardif() - Get hardif attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_cmd_get_hardif(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct batadv_hard_iface *hard_iface = info->user_ptr[1];
+ struct batadv_priv *bat_priv = info->user_ptr[0];
+ struct sk_buff *msg;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = batadv_netlink_hardif_fill(msg, bat_priv, hard_iface,
+ BATADV_CMD_GET_HARDIF,
+ info->snd_portid, info->snd_seq, 0,
+ NULL);
+ if (ret < 0) {
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ ret = genlmsg_reply(msg, info);
+
+ return ret;
+}
+
+/**
+ * batadv_netlink_set_hardif() - Set hardif attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_set_hardif(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct batadv_hard_iface *hard_iface = info->user_ptr[1];
+ struct batadv_priv *bat_priv = info->user_ptr[0];
+ struct nlattr *attr;
+
+ if (info->attrs[BATADV_ATTR_HOP_PENALTY]) {
+ attr = info->attrs[BATADV_ATTR_HOP_PENALTY];
+
+ atomic_set(&hard_iface->hop_penalty, nla_get_u8(attr));
+ }
+
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+
+ if (info->attrs[BATADV_ATTR_ELP_INTERVAL]) {
+ attr = info->attrs[BATADV_ATTR_ELP_INTERVAL];
+
+ atomic_set(&hard_iface->bat_v.elp_interval, nla_get_u32(attr));
+ }
+
+ if (info->attrs[BATADV_ATTR_THROUGHPUT_OVERRIDE]) {
+ attr = info->attrs[BATADV_ATTR_THROUGHPUT_OVERRIDE];
+
+ atomic_set(&hard_iface->bat_v.throughput_override,
+ nla_get_u32(attr));
+ }
+#endif /* CONFIG_BATMAN_ADV_BATMAN_V */
+
+ batadv_netlink_notify_hardif(bat_priv, hard_iface);
+
+ return 0;
+}
+
+/**
+ * batadv_netlink_dump_hardif() - Dump all hard interfaces into messages
* @msg: Netlink message to dump into
* @cb: Parameters from query
*
* Return: error code, or length of reply message on success
*/
static int
-batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb)
+batadv_netlink_dump_hardif(struct sk_buff *msg, struct netlink_callback *cb)
{
- struct net *net = sock_net(cb->skb->sk);
- struct net_device *soft_iface;
+ struct net_device *mesh_iface;
struct batadv_hard_iface *hard_iface;
- int ifindex;
+ struct batadv_priv *bat_priv;
int portid = NETLINK_CB(cb->skb).portid;
- int seq = cb->nlh->nlmsg_seq;
int skip = cb->args[0];
+ struct list_head *iter;
int i = 0;
- ifindex = batadv_netlink_get_ifindex(cb->nlh,
- BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
-
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface)
- return -ENODEV;
-
- if (!batadv_softif_is_valid(soft_iface)) {
- dev_put(soft_iface);
- return -ENODEV;
- }
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
- rcu_read_lock();
+ bat_priv = netdev_priv(mesh_iface);
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
- if (hard_iface->soft_iface != soft_iface)
- continue;
+ rtnl_lock();
+ cb->seq = batadv_hardif_generation << 1 | 1;
+ netdev_for_each_lower_private(mesh_iface, hard_iface, iter) {
if (i++ < skip)
continue;
- if (batadv_netlink_dump_hardif_entry(msg, portid, seq,
- hard_iface)) {
+ if (batadv_netlink_hardif_fill(msg, bat_priv, hard_iface,
+ BATADV_CMD_GET_HARDIF,
+ portid, cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, cb)) {
i--;
break;
}
}
- rcu_read_unlock();
+ rtnl_unlock();
- dev_put(soft_iface);
+ dev_put(mesh_iface);
cb->args[0] = i;
return msg->len;
}
-static const struct genl_ops batadv_netlink_ops[] = {
+/**
+ * batadv_netlink_vlan_fill() - Fill message with vlan attributes
+ * @msg: Netlink message to dump into
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @vlan: vlan which was modified
+ * @cmd: type of message to generate
+ * @portid: Port making netlink request
+ * @seq: sequence number for message
+ * @flags: Additional flags for message
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_vlan_fill(struct sk_buff *msg,
+ struct batadv_priv *bat_priv,
+ struct batadv_meshif_vlan *vlan,
+ enum batadv_nl_commands cmd,
+ u32 portid, u32 seq, int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX,
+ bat_priv->mesh_iface->ifindex))
+ goto nla_put_failure;
+
+ if (nla_put_string(msg, BATADV_ATTR_MESH_IFNAME,
+ bat_priv->mesh_iface->name))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, BATADV_ATTR_VLANID, vlan->vid & VLAN_VID_MASK))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, BATADV_ATTR_AP_ISOLATION_ENABLED,
+ !!atomic_read(&vlan->ap_isolation)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/**
+ * batadv_netlink_notify_vlan() - send vlan attributes to listener
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @vlan: vlan which was modified
+ *
+ * Return: 0 on success, < 0 on error
+ */
+static int batadv_netlink_notify_vlan(struct batadv_priv *bat_priv,
+ struct batadv_meshif_vlan *vlan)
+{
+ struct sk_buff *msg;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = batadv_netlink_vlan_fill(msg, bat_priv, vlan,
+ BATADV_CMD_SET_VLAN, 0, 0, 0);
+ if (ret < 0) {
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ genlmsg_multicast_netns(&batadv_netlink_family,
+ dev_net(bat_priv->mesh_iface), msg, 0,
+ BATADV_NL_MCGRP_CONFIG, GFP_KERNEL);
+
+ return 0;
+}
+
+/**
+ * batadv_netlink_get_vlan() - Get vlan attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_get_vlan(struct sk_buff *skb, struct genl_info *info)
+{
+ struct batadv_meshif_vlan *vlan = info->user_ptr[1];
+ struct batadv_priv *bat_priv = info->user_ptr[0];
+ struct sk_buff *msg;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = batadv_netlink_vlan_fill(msg, bat_priv, vlan, BATADV_CMD_GET_VLAN,
+ info->snd_portid, info->snd_seq, 0);
+ if (ret < 0) {
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ ret = genlmsg_reply(msg, info);
+
+ return ret;
+}
+
+/**
+ * batadv_netlink_set_vlan() - Set vlan attributes
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_netlink_set_vlan(struct sk_buff *skb, struct genl_info *info)
+{
+ struct batadv_meshif_vlan *vlan = info->user_ptr[1];
+ struct batadv_priv *bat_priv = info->user_ptr[0];
+ struct nlattr *attr;
+
+ if (info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]) {
+ attr = info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED];
+
+ atomic_set(&vlan->ap_isolation, !!nla_get_u8(attr));
+ }
+
+ batadv_netlink_notify_vlan(bat_priv, vlan);
+
+ return 0;
+}
+
+/**
+ * batadv_netlink_get_meshif_from_ifindex() - Get mesh-iface from ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of the mesh interface
+ *
+ * Return: Pointer to mesh interface (with increased refcnt) on success, error
+ * pointer on error
+ */
+static struct net_device *
+batadv_netlink_get_meshif_from_ifindex(struct net *net, int ifindex)
+{
+ struct net_device *mesh_iface;
+
+ mesh_iface = dev_get_by_index(net, ifindex);
+ if (!mesh_iface)
+ return ERR_PTR(-ENODEV);
+
+ if (!batadv_meshif_is_valid(mesh_iface))
+ goto err_put_meshif;
+
+ return mesh_iface;
+
+err_put_meshif:
+ dev_put(mesh_iface);
+
+ return ERR_PTR(-EINVAL);
+}
+
+/**
+ * batadv_netlink_get_meshif_from_info() - Get mesh-iface from genl attributes
+ * @net: the applicable net namespace
+ * @info: receiver information
+ *
+ * Return: Pointer to mesh interface (with increased refcnt) on success, error
+ * pointer on error
+ */
+static struct net_device *
+batadv_netlink_get_meshif_from_info(struct net *net, struct genl_info *info)
+{
+ int ifindex;
+
+ if (!info->attrs[BATADV_ATTR_MESH_IFINDEX])
+ return ERR_PTR(-EINVAL);
+
+ ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]);
+
+ return batadv_netlink_get_meshif_from_ifindex(net, ifindex);
+}
+
+/**
+ * batadv_netlink_get_meshif() - Retrieve mesh interface from netlink callback
+ * @cb: callback structure containing arguments
+ *
+ * Return: Pointer to mesh interface (with increased refcnt) on success, error
+ * pointer on error
+ */
+struct net_device *batadv_netlink_get_meshif(struct netlink_callback *cb)
+{
+ int ifindex = batadv_netlink_get_ifindex(cb->nlh,
+ BATADV_ATTR_MESH_IFINDEX);
+ if (!ifindex)
+ return ERR_PTR(-ENONET);
+
+ return batadv_netlink_get_meshif_from_ifindex(sock_net(cb->skb->sk),
+ ifindex);
+}
+
+/**
+ * batadv_netlink_get_hardif_from_ifindex() - Get hard-iface from ifindex
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @net: the applicable net namespace
+ * @ifindex: index of the hard interface
+ *
+ * Return: Pointer to hard interface (with increased refcnt) on success, error
+ * pointer on error
+ */
+static struct batadv_hard_iface *
+batadv_netlink_get_hardif_from_ifindex(struct batadv_priv *bat_priv,
+ struct net *net, int ifindex)
+{
+ struct batadv_hard_iface *hard_iface;
+ struct net_device *hard_dev;
+
+ hard_dev = dev_get_by_index(net, ifindex);
+ if (!hard_dev)
+ return ERR_PTR(-ENODEV);
+
+ hard_iface = batadv_hardif_get_by_netdev(hard_dev);
+ if (!hard_iface)
+ goto err_put_harddev;
+
+ if (hard_iface->mesh_iface != bat_priv->mesh_iface)
+ goto err_put_hardif;
+
+ /* hard_dev is referenced by hard_iface and not needed here */
+ dev_put(hard_dev);
+
+ return hard_iface;
+
+err_put_hardif:
+ batadv_hardif_put(hard_iface);
+err_put_harddev:
+ dev_put(hard_dev);
+
+ return ERR_PTR(-EINVAL);
+}
+
+/**
+ * batadv_netlink_get_hardif_from_info() - Get hard-iface from genl attributes
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @net: the applicable net namespace
+ * @info: receiver information
+ *
+ * Return: Pointer to hard interface (with increased refcnt) on success, error
+ * pointer on error
+ */
+static struct batadv_hard_iface *
+batadv_netlink_get_hardif_from_info(struct batadv_priv *bat_priv,
+ struct net *net, struct genl_info *info)
+{
+ int ifindex;
+
+ if (!info->attrs[BATADV_ATTR_HARD_IFINDEX])
+ return ERR_PTR(-EINVAL);
+
+ ifindex = nla_get_u32(info->attrs[BATADV_ATTR_HARD_IFINDEX]);
+
+ return batadv_netlink_get_hardif_from_ifindex(bat_priv, net, ifindex);
+}
+
+/**
+ * batadv_netlink_get_hardif() - Retrieve hard interface from netlink callback
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @cb: callback structure containing arguments
+ *
+ * Return: Pointer to hard interface (with increased refcnt) on success, error
+ * pointer on error
+ */
+struct batadv_hard_iface *
+batadv_netlink_get_hardif(struct batadv_priv *bat_priv,
+ struct netlink_callback *cb)
+{
+ int ifindex = batadv_netlink_get_ifindex(cb->nlh,
+ BATADV_ATTR_HARD_IFINDEX);
+ if (!ifindex)
+ return ERR_PTR(-ENONET);
+
+ return batadv_netlink_get_hardif_from_ifindex(bat_priv,
+ sock_net(cb->skb->sk),
+ ifindex);
+}
+
+/**
+ * batadv_get_vlan_from_info() - Retrieve vlan from genl attributes
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @net: the applicable net namespace
+ * @info: receiver information
+ *
+ * Return: Pointer to vlan on success (with increased refcnt), error pointer
+ * on error
+ */
+static struct batadv_meshif_vlan *
+batadv_get_vlan_from_info(struct batadv_priv *bat_priv, struct net *net,
+ struct genl_info *info)
+{
+ struct batadv_meshif_vlan *vlan;
+ u16 vid;
+
+ if (!info->attrs[BATADV_ATTR_VLANID])
+ return ERR_PTR(-EINVAL);
+
+ vid = nla_get_u16(info->attrs[BATADV_ATTR_VLANID]);
+
+ vlan = batadv_meshif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG);
+ if (!vlan)
+ return ERR_PTR(-ENOENT);
+
+ return vlan;
+}
+
+/**
+ * batadv_pre_doit() - Prepare batman-adv genl doit request
+ * @ops: requested netlink operation
+ * @skb: Netlink message with request data
+ * @info: receiver information
+ *
+ * Return: 0 on success or negative error number in case of failure
+ */
+static int batadv_pre_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_priv *bat_priv = NULL;
+ struct batadv_meshif_vlan *vlan;
+ struct net_device *mesh_iface;
+ u8 user_ptr1_flags;
+ u8 mesh_dep_flags;
+ int ret;
+
+ user_ptr1_flags = BATADV_FLAG_NEED_HARDIF | BATADV_FLAG_NEED_VLAN;
+ if (WARN_ON(hweight8(ops->internal_flags & user_ptr1_flags) > 1))
+ return -EINVAL;
+
+ mesh_dep_flags = BATADV_FLAG_NEED_HARDIF | BATADV_FLAG_NEED_VLAN;
+ if (WARN_ON((ops->internal_flags & mesh_dep_flags) &&
+ (~ops->internal_flags & BATADV_FLAG_NEED_MESH)))
+ return -EINVAL;
+
+ if (ops->internal_flags & BATADV_FLAG_NEED_MESH) {
+ mesh_iface = batadv_netlink_get_meshif_from_info(net, info);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
+
+ bat_priv = netdev_priv(mesh_iface);
+ info->user_ptr[0] = bat_priv;
+ }
+
+ if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF) {
+ hard_iface = batadv_netlink_get_hardif_from_info(bat_priv, net,
+ info);
+ if (IS_ERR(hard_iface)) {
+ ret = PTR_ERR(hard_iface);
+ goto err_put_meshif;
+ }
+
+ info->user_ptr[1] = hard_iface;
+ }
+
+ if (ops->internal_flags & BATADV_FLAG_NEED_VLAN) {
+ vlan = batadv_get_vlan_from_info(bat_priv, net, info);
+ if (IS_ERR(vlan)) {
+ ret = PTR_ERR(vlan);
+ goto err_put_meshif;
+ }
+
+ info->user_ptr[1] = vlan;
+ }
+
+ return 0;
+
+err_put_meshif:
+ if (bat_priv)
+ dev_put(bat_priv->mesh_iface);
+
+ return ret;
+}
+
+/**
+ * batadv_post_doit() - Drop the references held through info->user_ptr[]
+ * @ops: requested netlink operation; internal_flags select what is released
+ * @skb: Netlink message with request data (not used here)
+ * @info: receiver information; user_ptr[0]: bat_priv, user_ptr[1]: hardif/vlan
+ */
+static void batadv_post_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_meshif_vlan *vlan;
+ struct batadv_priv *bat_priv;
+
+ if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF &&
+ info->user_ptr[1]) {
+ hard_iface = info->user_ptr[1];
+
+ batadv_hardif_put(hard_iface);
+ }
+
+ if (ops->internal_flags & BATADV_FLAG_NEED_VLAN && info->user_ptr[1]) {
+ vlan = info->user_ptr[1];
+ batadv_meshif_vlan_put(vlan);
+ }
+
+ if (ops->internal_flags & BATADV_FLAG_NEED_MESH && info->user_ptr[0]) {
+ bat_priv = info->user_ptr[0];
+ dev_put(bat_priv->mesh_iface);
+ }
+}
+
+static const struct genl_small_ops batadv_netlink_ops[] = {
{
- .cmd = BATADV_CMD_GET_MESH_INFO,
- .flags = GENL_ADMIN_PERM,
- .policy = batadv_netlink_policy,
- .doit = batadv_netlink_get_mesh_info,
+ .cmd = BATADV_CMD_GET_MESH,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ /* can be retrieved by unprivileged users */
+ .doit = batadv_netlink_get_mesh,
+ .internal_flags = BATADV_FLAG_NEED_MESH,
},
{
.cmd = BATADV_CMD_TP_METER,
- .flags = GENL_ADMIN_PERM,
- .policy = batadv_netlink_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
.doit = batadv_netlink_tp_meter_start,
+ .internal_flags = BATADV_FLAG_NEED_MESH,
},
{
.cmd = BATADV_CMD_TP_METER_CANCEL,
- .flags = GENL_ADMIN_PERM,
- .policy = batadv_netlink_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
.doit = batadv_netlink_tp_meter_cancel,
+ .internal_flags = BATADV_FLAG_NEED_MESH,
},
{
.cmd = BATADV_CMD_GET_ROUTING_ALGOS,
- .flags = GENL_ADMIN_PERM,
- .policy = batadv_netlink_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
.dumpit = batadv_algo_dump,
},
{
- .cmd = BATADV_CMD_GET_HARDIFS,
- .flags = GENL_ADMIN_PERM,
- .policy = batadv_netlink_policy,
- .dumpit = batadv_netlink_dump_hardifs,
+ .cmd = BATADV_CMD_GET_HARDIF,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ /* can be retrieved by unprivileged users */
+ .dumpit = batadv_netlink_dump_hardif,
+ .doit = batadv_netlink_cmd_get_hardif,
+ .internal_flags = BATADV_FLAG_NEED_MESH |
+ BATADV_FLAG_NEED_HARDIF,
},
{
.cmd = BATADV_CMD_GET_TRANSTABLE_LOCAL,
- .flags = GENL_ADMIN_PERM,
- .policy = batadv_netlink_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
.dumpit = batadv_tt_local_dump,
},
{
.cmd = BATADV_CMD_GET_TRANSTABLE_GLOBAL,
- .flags = GENL_ADMIN_PERM,
- .policy = batadv_netlink_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
.dumpit = batadv_tt_global_dump,
},
{
.cmd = BATADV_CMD_GET_ORIGINATORS,
- .flags = GENL_ADMIN_PERM,
- .policy = batadv_netlink_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
.dumpit = batadv_orig_dump,
},
{
.cmd = BATADV_CMD_GET_NEIGHBORS,
- .flags = GENL_ADMIN_PERM,
- .policy = batadv_netlink_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
.dumpit = batadv_hardif_neigh_dump,
},
{
.cmd = BATADV_CMD_GET_GATEWAYS,
- .flags = GENL_ADMIN_PERM,
- .policy = batadv_netlink_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
.dumpit = batadv_gw_dump,
},
{
.cmd = BATADV_CMD_GET_BLA_CLAIM,
- .flags = GENL_ADMIN_PERM,
- .policy = batadv_netlink_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
.dumpit = batadv_bla_claim_dump,
},
{
.cmd = BATADV_CMD_GET_BLA_BACKBONE,
- .flags = GENL_ADMIN_PERM,
- .policy = batadv_netlink_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
.dumpit = batadv_bla_backbone_dump,
},
{
.cmd = BATADV_CMD_GET_DAT_CACHE,
- .flags = GENL_ADMIN_PERM,
- .policy = batadv_netlink_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
.dumpit = batadv_dat_cache_dump,
},
{
.cmd = BATADV_CMD_GET_MCAST_FLAGS,
- .flags = GENL_ADMIN_PERM,
- .policy = batadv_netlink_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
.dumpit = batadv_mcast_flags_dump,
},
-
+ {
+ .cmd = BATADV_CMD_SET_MESH,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = batadv_netlink_set_mesh,
+ .internal_flags = BATADV_FLAG_NEED_MESH,
+ },
+ {
+ .cmd = BATADV_CMD_SET_HARDIF,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = batadv_netlink_set_hardif,
+ .internal_flags = BATADV_FLAG_NEED_MESH |
+ BATADV_FLAG_NEED_HARDIF,
+ },
+ {
+ .cmd = BATADV_CMD_GET_VLAN,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ /* can be retrieved by unprivileged users */
+ .doit = batadv_netlink_get_vlan,
+ .internal_flags = BATADV_FLAG_NEED_MESH |
+ BATADV_FLAG_NEED_VLAN,
+ },
+ {
+ .cmd = BATADV_CMD_SET_VLAN,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = batadv_netlink_set_vlan,
+ .internal_flags = BATADV_FLAG_NEED_MESH |
+ BATADV_FLAG_NEED_VLAN,
+ },
};
struct genl_family batadv_netlink_family __ro_after_init = {
@@ -634,10 +1526,14 @@ struct genl_family batadv_netlink_family __ro_after_init = {
.name = BATADV_NL_NAME,
.version = 1,
.maxattr = BATADV_ATTR_MAX,
+ .policy = batadv_netlink_policy,
.netnsok = true,
+ .pre_doit = batadv_pre_doit,
+ .post_doit = batadv_post_doit,
.module = THIS_MODULE,
- .ops = batadv_netlink_ops,
- .n_ops = ARRAY_SIZE(batadv_netlink_ops),
+ .small_ops = batadv_netlink_ops,
+ .n_small_ops = ARRAY_SIZE(batadv_netlink_ops),
+ .resv_start_op = BATADV_CMD_SET_VLAN + 1,
.mcgrps = batadv_netlink_mcgrps,
.n_mcgrps = ARRAY_SIZE(batadv_netlink_mcgrps),
};
@@ -651,7 +1547,7 @@ void __init batadv_netlink_register(void)
ret = genl_register_family(&batadv_netlink_family);
if (ret)
- pr_warn("unable to register netlink family");
+ pr_warn("unable to register netlink family\n");
}
/**
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index 571d9a5ae7aa..4eae9e5ff135 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2016-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Matthias Schiffer
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_NETLINK_H_
@@ -21,14 +9,15 @@
#include "main.h"
+#include <linux/netlink.h>
#include <linux/types.h>
-#include <net/genetlink.h>
-
-struct nlmsghdr;
void batadv_netlink_register(void);
void batadv_netlink_unregister(void);
-int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype);
+struct net_device *batadv_netlink_get_meshif(struct netlink_callback *cb);
+struct batadv_hard_iface *
+batadv_netlink_get_hardif(struct batadv_priv *bat_priv,
+ struct netlink_callback *cb);
int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst,
u8 result, u32 test_time, u64 total_bytes,
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
deleted file mode 100644
index 34caf129a9bf..000000000000
--- a/net/batman-adv/network-coding.c
+++ /dev/null
@@ -1,1996 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors:
- *
- * Martin Hundebøll, Jeppe Ledet-Pedersen
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "network-coding.h"
-#include "main.h"
-
-#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/byteorder/generic.h>
-#include <linux/compiler.h>
-#include <linux/debugfs.h>
-#include <linux/errno.h>
-#include <linux/etherdevice.h>
-#include <linux/gfp.h>
-#include <linux/if_ether.h>
-#include <linux/if_packet.h>
-#include <linux/init.h>
-#include <linux/jhash.h>
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/kref.h>
-#include <linux/list.h>
-#include <linux/lockdep.h>
-#include <linux/net.h>
-#include <linux/netdevice.h>
-#include <linux/printk.h>
-#include <linux/random.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/stddef.h>
-#include <linux/string.h>
-#include <linux/workqueue.h>
-#include <uapi/linux/batadv_packet.h>
-
-#include "hard-interface.h"
-#include "hash.h"
-#include "log.h"
-#include "originator.h"
-#include "routing.h"
-#include "send.h"
-#include "tvlv.h"
-
-static struct lock_class_key batadv_nc_coding_hash_lock_class_key;
-static struct lock_class_key batadv_nc_decoding_hash_lock_class_key;
-
-static void batadv_nc_worker(struct work_struct *work);
-static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
- struct batadv_hard_iface *recv_if);
-
-/**
- * batadv_nc_init() - one-time initialization for network coding
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int __init batadv_nc_init(void)
-{
- int ret;
-
- /* Register our packet type */
- ret = batadv_recv_handler_register(BATADV_CODED,
- batadv_nc_recv_coded_packet);
-
- return ret;
-}
-
-/**
- * batadv_nc_start_timer() - initialise the nc periodic worker
- * @bat_priv: the bat priv with all the soft interface information
- */
-static void batadv_nc_start_timer(struct batadv_priv *bat_priv)
-{
- queue_delayed_work(batadv_event_workqueue, &bat_priv->nc.work,
- msecs_to_jiffies(10));
-}
-
-/**
- * batadv_nc_tvlv_container_update() - update the network coding tvlv container
- * after network coding setting change
- * @bat_priv: the bat priv with all the soft interface information
- */
-static void batadv_nc_tvlv_container_update(struct batadv_priv *bat_priv)
-{
- char nc_mode;
-
- nc_mode = atomic_read(&bat_priv->network_coding);
-
- switch (nc_mode) {
- case 0:
- batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_NC, 1);
- break;
- case 1:
- batadv_tvlv_container_register(bat_priv, BATADV_TVLV_NC, 1,
- NULL, 0);
- break;
- }
-}
-
-/**
- * batadv_nc_status_update() - update the network coding tvlv container after
- * network coding setting change
- * @net_dev: the soft interface net device
- */
-void batadv_nc_status_update(struct net_device *net_dev)
-{
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
-
- batadv_nc_tvlv_container_update(bat_priv);
-}
-
-/**
- * batadv_nc_tvlv_ogm_handler_v1() - process incoming nc tvlv container
- * @bat_priv: the bat priv with all the soft interface information
- * @orig: the orig_node of the ogm
- * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
- * @tvlv_value: tvlv buffer containing the gateway data
- * @tvlv_value_len: tvlv buffer length
- */
-static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig,
- u8 flags,
- void *tvlv_value, u16 tvlv_value_len)
-{
- if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND)
- clear_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities);
- else
- set_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities);
-}
-
-/**
- * batadv_nc_mesh_init() - initialise coding hash table and start house keeping
- * @bat_priv: the bat priv with all the soft interface information
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_nc_mesh_init(struct batadv_priv *bat_priv)
-{
- bat_priv->nc.timestamp_fwd_flush = jiffies;
- bat_priv->nc.timestamp_sniffed_purge = jiffies;
-
- if (bat_priv->nc.coding_hash || bat_priv->nc.decoding_hash)
- return 0;
-
- bat_priv->nc.coding_hash = batadv_hash_new(128);
- if (!bat_priv->nc.coding_hash)
- goto err;
-
- batadv_hash_set_lock_class(bat_priv->nc.coding_hash,
- &batadv_nc_coding_hash_lock_class_key);
-
- bat_priv->nc.decoding_hash = batadv_hash_new(128);
- if (!bat_priv->nc.decoding_hash)
- goto err;
-
- batadv_hash_set_lock_class(bat_priv->nc.decoding_hash,
- &batadv_nc_decoding_hash_lock_class_key);
-
- INIT_DELAYED_WORK(&bat_priv->nc.work, batadv_nc_worker);
- batadv_nc_start_timer(bat_priv);
-
- batadv_tvlv_handler_register(bat_priv, batadv_nc_tvlv_ogm_handler_v1,
- NULL, BATADV_TVLV_NC, 1,
- BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
- batadv_nc_tvlv_container_update(bat_priv);
- return 0;
-
-err:
- return -ENOMEM;
-}
-
-/**
- * batadv_nc_init_bat_priv() - initialise the nc specific bat_priv variables
- * @bat_priv: the bat priv with all the soft interface information
- */
-void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv)
-{
- atomic_set(&bat_priv->network_coding, 0);
- bat_priv->nc.min_tq = 200;
- bat_priv->nc.max_fwd_delay = 10;
- bat_priv->nc.max_buffer_time = 200;
-}
-
-/**
- * batadv_nc_init_orig() - initialise the nc fields of an orig_node
- * @orig_node: the orig_node which is going to be initialised
- */
-void batadv_nc_init_orig(struct batadv_orig_node *orig_node)
-{
- INIT_LIST_HEAD(&orig_node->in_coding_list);
- INIT_LIST_HEAD(&orig_node->out_coding_list);
- spin_lock_init(&orig_node->in_coding_list_lock);
- spin_lock_init(&orig_node->out_coding_list_lock);
-}
-
-/**
- * batadv_nc_node_release() - release nc_node from lists and queue for free
- * after rcu grace period
- * @ref: kref pointer of the nc_node
- */
-static void batadv_nc_node_release(struct kref *ref)
-{
- struct batadv_nc_node *nc_node;
-
- nc_node = container_of(ref, struct batadv_nc_node, refcount);
-
- batadv_orig_node_put(nc_node->orig_node);
- kfree_rcu(nc_node, rcu);
-}
-
-/**
- * batadv_nc_node_put() - decrement the nc_node refcounter and possibly
- * release it
- * @nc_node: nc_node to be free'd
- */
-static void batadv_nc_node_put(struct batadv_nc_node *nc_node)
-{
- kref_put(&nc_node->refcount, batadv_nc_node_release);
-}
-
-/**
- * batadv_nc_path_release() - release nc_path from lists and queue for free
- * after rcu grace period
- * @ref: kref pointer of the nc_path
- */
-static void batadv_nc_path_release(struct kref *ref)
-{
- struct batadv_nc_path *nc_path;
-
- nc_path = container_of(ref, struct batadv_nc_path, refcount);
-
- kfree_rcu(nc_path, rcu);
-}
-
-/**
- * batadv_nc_path_put() - decrement the nc_path refcounter and possibly
- * release it
- * @nc_path: nc_path to be free'd
- */
-static void batadv_nc_path_put(struct batadv_nc_path *nc_path)
-{
- kref_put(&nc_path->refcount, batadv_nc_path_release);
-}
-
-/**
- * batadv_nc_packet_free() - frees nc packet
- * @nc_packet: the nc packet to free
- * @dropped: whether the packet is freed because is is dropped
- */
-static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet,
- bool dropped)
-{
- if (dropped)
- kfree_skb(nc_packet->skb);
- else
- consume_skb(nc_packet->skb);
-
- batadv_nc_path_put(nc_packet->nc_path);
- kfree(nc_packet);
-}
-
-/**
- * batadv_nc_to_purge_nc_node() - checks whether an nc node has to be purged
- * @bat_priv: the bat priv with all the soft interface information
- * @nc_node: the nc node to check
- *
- * Return: true if the entry has to be purged now, false otherwise
- */
-static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv,
- struct batadv_nc_node *nc_node)
-{
- if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
- return true;
-
- return batadv_has_timed_out(nc_node->last_seen, BATADV_NC_NODE_TIMEOUT);
-}
-
-/**
- * batadv_nc_to_purge_nc_path_coding() - checks whether an nc path has timed out
- * @bat_priv: the bat priv with all the soft interface information
- * @nc_path: the nc path to check
- *
- * Return: true if the entry has to be purged now, false otherwise
- */
-static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv,
- struct batadv_nc_path *nc_path)
-{
- if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
- return true;
-
- /* purge the path when no packets has been added for 10 times the
- * max_fwd_delay time
- */
- return batadv_has_timed_out(nc_path->last_valid,
- bat_priv->nc.max_fwd_delay * 10);
-}
-
-/**
- * batadv_nc_to_purge_nc_path_decoding() - checks whether an nc path has timed
- * out
- * @bat_priv: the bat priv with all the soft interface information
- * @nc_path: the nc path to check
- *
- * Return: true if the entry has to be purged now, false otherwise
- */
-static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv,
- struct batadv_nc_path *nc_path)
-{
- if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
- return true;
-
- /* purge the path when no packets has been added for 10 times the
- * max_buffer time
- */
- return batadv_has_timed_out(nc_path->last_valid,
- bat_priv->nc.max_buffer_time * 10);
-}
-
-/**
- * batadv_nc_purge_orig_nc_nodes() - go through list of nc nodes and purge stale
- * entries
- * @bat_priv: the bat priv with all the soft interface information
- * @list: list of nc nodes
- * @lock: nc node list lock
- * @to_purge: function in charge to decide whether an entry has to be purged or
- * not. This function takes the nc node as argument and has to return
- * a boolean value: true if the entry has to be deleted, false
- * otherwise
- */
-static void
-batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv,
- struct list_head *list,
- spinlock_t *lock,
- bool (*to_purge)(struct batadv_priv *,
- struct batadv_nc_node *))
-{
- struct batadv_nc_node *nc_node, *nc_node_tmp;
-
- /* For each nc_node in list */
- spin_lock_bh(lock);
- list_for_each_entry_safe(nc_node, nc_node_tmp, list, list) {
- /* if an helper function has been passed as parameter,
- * ask it if the entry has to be purged or not
- */
- if (to_purge && !to_purge(bat_priv, nc_node))
- continue;
-
- batadv_dbg(BATADV_DBG_NC, bat_priv,
- "Removing nc_node %pM -> %pM\n",
- nc_node->addr, nc_node->orig_node->orig);
- list_del_rcu(&nc_node->list);
- batadv_nc_node_put(nc_node);
- }
- spin_unlock_bh(lock);
-}
-
-/**
- * batadv_nc_purge_orig() - purges all nc node data attached of the given
- * originator
- * @bat_priv: the bat priv with all the soft interface information
- * @orig_node: orig_node with the nc node entries to be purged
- * @to_purge: function in charge to decide whether an entry has to be purged or
- * not. This function takes the nc node as argument and has to return
- * a boolean value: true is the entry has to be deleted, false
- * otherwise
- */
-void batadv_nc_purge_orig(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- bool (*to_purge)(struct batadv_priv *,
- struct batadv_nc_node *))
-{
- /* Check ingoing nc_node's of this orig_node */
- batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->in_coding_list,
- &orig_node->in_coding_list_lock,
- to_purge);
-
- /* Check outgoing nc_node's of this orig_node */
- batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->out_coding_list,
- &orig_node->out_coding_list_lock,
- to_purge);
-}
-
-/**
- * batadv_nc_purge_orig_hash() - traverse entire originator hash to check if
- * they have timed out nc nodes
- * @bat_priv: the bat priv with all the soft interface information
- */
-static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv)
-{
- struct batadv_hashtable *hash = bat_priv->orig_hash;
- struct hlist_head *head;
- struct batadv_orig_node *orig_node;
- u32 i;
-
- if (!hash)
- return;
-
- /* For each orig_node */
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(orig_node, head, hash_entry)
- batadv_nc_purge_orig(bat_priv, orig_node,
- batadv_nc_to_purge_nc_node);
- rcu_read_unlock();
- }
-}
-
-/**
- * batadv_nc_purge_paths() - traverse all nc paths part of the hash and remove
- * unused ones
- * @bat_priv: the bat priv with all the soft interface information
- * @hash: hash table containing the nc paths to check
- * @to_purge: function in charge to decide whether an entry has to be purged or
- * not. This function takes the nc node as argument and has to return
- * a boolean value: true is the entry has to be deleted, false
- * otherwise
- */
-static void batadv_nc_purge_paths(struct batadv_priv *bat_priv,
- struct batadv_hashtable *hash,
- bool (*to_purge)(struct batadv_priv *,
- struct batadv_nc_path *))
-{
- struct hlist_head *head;
- struct hlist_node *node_tmp;
- struct batadv_nc_path *nc_path;
- spinlock_t *lock; /* Protects lists in hash */
- u32 i;
-
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
- lock = &hash->list_locks[i];
-
- /* For each nc_path in this bin */
- spin_lock_bh(lock);
- hlist_for_each_entry_safe(nc_path, node_tmp, head, hash_entry) {
- /* if an helper function has been passed as parameter,
- * ask it if the entry has to be purged or not
- */
- if (to_purge && !to_purge(bat_priv, nc_path))
- continue;
-
- /* purging an non-empty nc_path should never happen, but
- * is observed under high CPU load. Delay the purging
- * until next iteration to allow the packet_list to be
- * emptied first.
- */
- if (!unlikely(list_empty(&nc_path->packet_list))) {
- net_ratelimited_function(printk,
- KERN_WARNING
- "Skipping free of non-empty nc_path (%pM -> %pM)!\n",
- nc_path->prev_hop,
- nc_path->next_hop);
- continue;
- }
-
- /* nc_path is unused, so remove it */
- batadv_dbg(BATADV_DBG_NC, bat_priv,
- "Remove nc_path %pM -> %pM\n",
- nc_path->prev_hop, nc_path->next_hop);
- hlist_del_rcu(&nc_path->hash_entry);
- batadv_nc_path_put(nc_path);
- }
- spin_unlock_bh(lock);
- }
-}
-
-/**
- * batadv_nc_hash_key_gen() - computes the nc_path hash key
- * @key: buffer to hold the final hash key
- * @src: source ethernet mac address going into the hash key
- * @dst: destination ethernet mac address going into the hash key
- */
-static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src,
- const char *dst)
-{
- memcpy(key->prev_hop, src, sizeof(key->prev_hop));
- memcpy(key->next_hop, dst, sizeof(key->next_hop));
-}
-
-/**
- * batadv_nc_hash_choose() - compute the hash value for an nc path
- * @data: data to hash
- * @size: size of the hash table
- *
- * Return: the selected index in the hash table for the given data.
- */
-static u32 batadv_nc_hash_choose(const void *data, u32 size)
-{
- const struct batadv_nc_path *nc_path = data;
- u32 hash = 0;
-
- hash = jhash(&nc_path->prev_hop, sizeof(nc_path->prev_hop), hash);
- hash = jhash(&nc_path->next_hop, sizeof(nc_path->next_hop), hash);
-
- return hash % size;
-}
-
-/**
- * batadv_nc_hash_compare() - comparing function used in the network coding hash
- * tables
- * @node: node in the local table
- * @data2: second object to compare the node to
- *
- * Return: true if the two entry are the same, false otherwise
- */
-static bool batadv_nc_hash_compare(const struct hlist_node *node,
- const void *data2)
-{
- const struct batadv_nc_path *nc_path1, *nc_path2;
-
- nc_path1 = container_of(node, struct batadv_nc_path, hash_entry);
- nc_path2 = data2;
-
- /* Return 1 if the two keys are identical */
- if (!batadv_compare_eth(nc_path1->prev_hop, nc_path2->prev_hop))
- return false;
-
- if (!batadv_compare_eth(nc_path1->next_hop, nc_path2->next_hop))
- return false;
-
- return true;
-}
-
-/**
- * batadv_nc_hash_find() - search for an existing nc path and return it
- * @hash: hash table containing the nc path
- * @data: search key
- *
- * Return: the nc_path if found, NULL otherwise.
- */
-static struct batadv_nc_path *
-batadv_nc_hash_find(struct batadv_hashtable *hash,
- void *data)
-{
- struct hlist_head *head;
- struct batadv_nc_path *nc_path, *nc_path_tmp = NULL;
- int index;
-
- if (!hash)
- return NULL;
-
- index = batadv_nc_hash_choose(data, hash->size);
- head = &hash->table[index];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(nc_path, head, hash_entry) {
- if (!batadv_nc_hash_compare(&nc_path->hash_entry, data))
- continue;
-
- if (!kref_get_unless_zero(&nc_path->refcount))
- continue;
-
- nc_path_tmp = nc_path;
- break;
- }
- rcu_read_unlock();
-
- return nc_path_tmp;
-}
-
-/**
- * batadv_nc_send_packet() - send non-coded packet and free nc_packet struct
- * @nc_packet: the nc packet to send
- */
-static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet)
-{
- batadv_send_unicast_skb(nc_packet->skb, nc_packet->neigh_node);
- nc_packet->skb = NULL;
- batadv_nc_packet_free(nc_packet, false);
-}
-
-/**
- * batadv_nc_sniffed_purge() - Checks timestamp of given sniffed nc_packet.
- * @bat_priv: the bat priv with all the soft interface information
- * @nc_path: the nc path the packet belongs to
- * @nc_packet: the nc packet to be checked
- *
- * Checks whether the given sniffed (overheard) nc_packet has hit its buffering
- * timeout. If so, the packet is no longer kept and the entry deleted from the
- * queue. Has to be called with the appropriate locks.
- *
- * Return: false as soon as the entry in the fifo queue has not been timed out
- * yet and true otherwise.
- */
-static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv,
- struct batadv_nc_path *nc_path,
- struct batadv_nc_packet *nc_packet)
-{
- unsigned long timeout = bat_priv->nc.max_buffer_time;
- bool res = false;
-
- lockdep_assert_held(&nc_path->packet_list_lock);
-
- /* Packets are added to tail, so the remaining packets did not time
- * out and we can stop processing the current queue
- */
- if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE &&
- !batadv_has_timed_out(nc_packet->timestamp, timeout))
- goto out;
-
- /* purge nc packet */
- list_del(&nc_packet->list);
- batadv_nc_packet_free(nc_packet, true);
-
- res = true;
-
-out:
- return res;
-}
-
-/**
- * batadv_nc_fwd_flush() - Checks the timestamp of the given nc packet.
- * @bat_priv: the bat priv with all the soft interface information
- * @nc_path: the nc path the packet belongs to
- * @nc_packet: the nc packet to be checked
- *
- * Checks whether the given nc packet has hit its forward timeout. If so, the
- * packet is no longer delayed, immediately sent and the entry deleted from the
- * queue. Has to be called with the appropriate locks.
- *
- * Return: false as soon as the entry in the fifo queue has not been timed out
- * yet and true otherwise.
- */
-static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv,
- struct batadv_nc_path *nc_path,
- struct batadv_nc_packet *nc_packet)
-{
- unsigned long timeout = bat_priv->nc.max_fwd_delay;
-
- lockdep_assert_held(&nc_path->packet_list_lock);
-
- /* Packets are added to tail, so the remaining packets did not time
- * out and we can stop processing the current queue
- */
- if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE &&
- !batadv_has_timed_out(nc_packet->timestamp, timeout))
- return false;
-
- /* Send packet */
- batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
- batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
- nc_packet->skb->len + ETH_HLEN);
- list_del(&nc_packet->list);
- batadv_nc_send_packet(nc_packet);
-
- return true;
-}
-
-/**
- * batadv_nc_process_nc_paths() - traverse given nc packet pool and free timed
- * out nc packets
- * @bat_priv: the bat priv with all the soft interface information
- * @hash: to be processed hash table
- * @process_fn: Function called to process given nc packet. Should return true
- * to encourage this function to proceed with the next packet.
- * Otherwise the rest of the current queue is skipped.
- */
-static void
-batadv_nc_process_nc_paths(struct batadv_priv *bat_priv,
- struct batadv_hashtable *hash,
- bool (*process_fn)(struct batadv_priv *,
- struct batadv_nc_path *,
- struct batadv_nc_packet *))
-{
- struct hlist_head *head;
- struct batadv_nc_packet *nc_packet, *nc_packet_tmp;
- struct batadv_nc_path *nc_path;
- bool ret;
- int i;
-
- if (!hash)
- return;
-
- /* Loop hash table bins */
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- /* Loop coding paths */
- rcu_read_lock();
- hlist_for_each_entry_rcu(nc_path, head, hash_entry) {
- /* Loop packets */
- spin_lock_bh(&nc_path->packet_list_lock);
- list_for_each_entry_safe(nc_packet, nc_packet_tmp,
- &nc_path->packet_list, list) {
- ret = process_fn(bat_priv, nc_path, nc_packet);
- if (!ret)
- break;
- }
- spin_unlock_bh(&nc_path->packet_list_lock);
- }
- rcu_read_unlock();
- }
-}
-
-/**
- * batadv_nc_worker() - periodic task for house keeping related to network
- * coding
- * @work: kernel work struct
- */
-static void batadv_nc_worker(struct work_struct *work)
-{
- struct delayed_work *delayed_work;
- struct batadv_priv_nc *priv_nc;
- struct batadv_priv *bat_priv;
- unsigned long timeout;
-
- delayed_work = to_delayed_work(work);
- priv_nc = container_of(delayed_work, struct batadv_priv_nc, work);
- bat_priv = container_of(priv_nc, struct batadv_priv, nc);
-
- batadv_nc_purge_orig_hash(bat_priv);
- batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash,
- batadv_nc_to_purge_nc_path_coding);
- batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash,
- batadv_nc_to_purge_nc_path_decoding);
-
- timeout = bat_priv->nc.max_fwd_delay;
-
- if (batadv_has_timed_out(bat_priv->nc.timestamp_fwd_flush, timeout)) {
- batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.coding_hash,
- batadv_nc_fwd_flush);
- bat_priv->nc.timestamp_fwd_flush = jiffies;
- }
-
- if (batadv_has_timed_out(bat_priv->nc.timestamp_sniffed_purge,
- bat_priv->nc.max_buffer_time)) {
- batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.decoding_hash,
- batadv_nc_sniffed_purge);
- bat_priv->nc.timestamp_sniffed_purge = jiffies;
- }
-
- /* Schedule a new check */
- batadv_nc_start_timer(bat_priv);
-}
-
-/**
- * batadv_can_nc_with_orig() - checks whether the given orig node is suitable
- * for coding or not
- * @bat_priv: the bat priv with all the soft interface information
- * @orig_node: neighboring orig node which may be used as nc candidate
- * @ogm_packet: incoming ogm packet also used for the checks
- *
- * Return: true if:
- * 1) The OGM must have the most recent sequence number.
- * 2) The TTL must be decremented by one and only one.
- * 3) The OGM must be received from the first hop from orig_node.
- * 4) The TQ value of the OGM must be above bat_priv->nc.min_tq.
- */
-static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- struct batadv_ogm_packet *ogm_packet)
-{
- struct batadv_orig_ifinfo *orig_ifinfo;
- u32 last_real_seqno;
- u8 last_ttl;
-
- orig_ifinfo = batadv_orig_ifinfo_get(orig_node, BATADV_IF_DEFAULT);
- if (!orig_ifinfo)
- return false;
-
- last_ttl = orig_ifinfo->last_ttl;
- last_real_seqno = orig_ifinfo->last_real_seqno;
- batadv_orig_ifinfo_put(orig_ifinfo);
-
- if (last_real_seqno != ntohl(ogm_packet->seqno))
- return false;
- if (last_ttl != ogm_packet->ttl + 1)
- return false;
- if (!batadv_compare_eth(ogm_packet->orig, ogm_packet->prev_sender))
- return false;
- if (ogm_packet->tq < bat_priv->nc.min_tq)
- return false;
-
- return true;
-}
-
-/**
- * batadv_nc_find_nc_node() - search for an existing nc node and return it
- * @orig_node: orig node originating the ogm packet
- * @orig_neigh_node: neighboring orig node from which we received the ogm packet
- * (can be equal to orig_node)
- * @in_coding: traverse incoming or outgoing network coding list
- *
- * Return: the nc_node if found, NULL otherwise.
- */
-static struct batadv_nc_node *
-batadv_nc_find_nc_node(struct batadv_orig_node *orig_node,
- struct batadv_orig_node *orig_neigh_node,
- bool in_coding)
-{
- struct batadv_nc_node *nc_node, *nc_node_out = NULL;
- struct list_head *list;
-
- if (in_coding)
- list = &orig_neigh_node->in_coding_list;
- else
- list = &orig_neigh_node->out_coding_list;
-
- /* Traverse list of nc_nodes to orig_node */
- rcu_read_lock();
- list_for_each_entry_rcu(nc_node, list, list) {
- if (!batadv_compare_eth(nc_node->addr, orig_node->orig))
- continue;
-
- if (!kref_get_unless_zero(&nc_node->refcount))
- continue;
-
- /* Found a match */
- nc_node_out = nc_node;
- break;
- }
- rcu_read_unlock();
-
- return nc_node_out;
-}
-
-/**
- * batadv_nc_get_nc_node() - retrieves an nc node or creates the entry if it was
- * not found
- * @bat_priv: the bat priv with all the soft interface information
- * @orig_node: orig node originating the ogm packet
- * @orig_neigh_node: neighboring orig node from which we received the ogm packet
- * (can be equal to orig_node)
- * @in_coding: traverse incoming or outgoing network coding list
- *
- * Return: the nc_node if found or created, NULL in case of an error.
- */
-static struct batadv_nc_node *
-batadv_nc_get_nc_node(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- struct batadv_orig_node *orig_neigh_node,
- bool in_coding)
-{
- struct batadv_nc_node *nc_node;
- spinlock_t *lock; /* Used to lock list selected by "int in_coding" */
- struct list_head *list;
-
- /* Select ingoing or outgoing coding node */
- if (in_coding) {
- lock = &orig_neigh_node->in_coding_list_lock;
- list = &orig_neigh_node->in_coding_list;
- } else {
- lock = &orig_neigh_node->out_coding_list_lock;
- list = &orig_neigh_node->out_coding_list;
- }
-
- spin_lock_bh(lock);
-
- /* Check if nc_node is already added */
- nc_node = batadv_nc_find_nc_node(orig_node, orig_neigh_node, in_coding);
-
- /* Node found */
- if (nc_node)
- goto unlock;
-
- nc_node = kzalloc(sizeof(*nc_node), GFP_ATOMIC);
- if (!nc_node)
- goto unlock;
-
- /* Initialize nc_node */
- INIT_LIST_HEAD(&nc_node->list);
- kref_init(&nc_node->refcount);
- ether_addr_copy(nc_node->addr, orig_node->orig);
- kref_get(&orig_neigh_node->refcount);
- nc_node->orig_node = orig_neigh_node;
-
- batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_node %pM -> %pM\n",
- nc_node->addr, nc_node->orig_node->orig);
-
- /* Add nc_node to orig_node */
- kref_get(&nc_node->refcount);
- list_add_tail_rcu(&nc_node->list, list);
-
-unlock:
- spin_unlock_bh(lock);
-
- return nc_node;
-}
-
-/**
- * batadv_nc_update_nc_node() - updates stored incoming and outgoing nc node
- * structs (best called on incoming OGMs)
- * @bat_priv: the bat priv with all the soft interface information
- * @orig_node: orig node originating the ogm packet
- * @orig_neigh_node: neighboring orig node from which we received the ogm packet
- * (can be equal to orig_node)
- * @ogm_packet: incoming ogm packet
- * @is_single_hop_neigh: orig_node is a single hop neighbor
- */
-void batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- struct batadv_orig_node *orig_neigh_node,
- struct batadv_ogm_packet *ogm_packet,
- int is_single_hop_neigh)
-{
- struct batadv_nc_node *in_nc_node = NULL;
- struct batadv_nc_node *out_nc_node = NULL;
-
- /* Check if network coding is enabled */
- if (!atomic_read(&bat_priv->network_coding))
- goto out;
-
- /* check if orig node is network coding enabled */
- if (!test_bit(BATADV_ORIG_CAPA_HAS_NC, &orig_node->capabilities))
- goto out;
-
- /* accept ogms from 'good' neighbors and single hop neighbors */
- if (!batadv_can_nc_with_orig(bat_priv, orig_node, ogm_packet) &&
- !is_single_hop_neigh)
- goto out;
-
- /* Add orig_node as in_nc_node on hop */
- in_nc_node = batadv_nc_get_nc_node(bat_priv, orig_node,
- orig_neigh_node, true);
- if (!in_nc_node)
- goto out;
-
- in_nc_node->last_seen = jiffies;
-
- /* Add hop as out_nc_node on orig_node */
- out_nc_node = batadv_nc_get_nc_node(bat_priv, orig_neigh_node,
- orig_node, false);
- if (!out_nc_node)
- goto out;
-
- out_nc_node->last_seen = jiffies;
-
-out:
- if (in_nc_node)
- batadv_nc_node_put(in_nc_node);
- if (out_nc_node)
- batadv_nc_node_put(out_nc_node);
-}
-
-/**
- * batadv_nc_get_path() - get existing nc_path or allocate a new one
- * @bat_priv: the bat priv with all the soft interface information
- * @hash: hash table containing the nc path
- * @src: ethernet source address - first half of the nc path search key
- * @dst: ethernet destination address - second half of the nc path search key
- *
- * Return: pointer to nc_path if the path was found or created, returns NULL
- * on error.
- */
-static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
- struct batadv_hashtable *hash,
- u8 *src,
- u8 *dst)
-{
- int hash_added;
- struct batadv_nc_path *nc_path, nc_path_key;
-
- batadv_nc_hash_key_gen(&nc_path_key, src, dst);
-
- /* Search for existing nc_path */
- nc_path = batadv_nc_hash_find(hash, (void *)&nc_path_key);
-
- if (nc_path) {
- /* Set timestamp to delay removal of nc_path */
- nc_path->last_valid = jiffies;
- return nc_path;
- }
-
- /* No existing nc_path was found; create a new */
- nc_path = kzalloc(sizeof(*nc_path), GFP_ATOMIC);
-
- if (!nc_path)
- return NULL;
-
- /* Initialize nc_path */
- INIT_LIST_HEAD(&nc_path->packet_list);
- spin_lock_init(&nc_path->packet_list_lock);
- kref_init(&nc_path->refcount);
- nc_path->last_valid = jiffies;
- ether_addr_copy(nc_path->next_hop, dst);
- ether_addr_copy(nc_path->prev_hop, src);
-
- batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_path %pM -> %pM\n",
- nc_path->prev_hop,
- nc_path->next_hop);
-
- /* Add nc_path to hash table */
- kref_get(&nc_path->refcount);
- hash_added = batadv_hash_add(hash, batadv_nc_hash_compare,
- batadv_nc_hash_choose, &nc_path_key,
- &nc_path->hash_entry);
-
- if (hash_added < 0) {
- kfree(nc_path);
- return NULL;
- }
-
- return nc_path;
-}
-
-/**
- * batadv_nc_random_weight_tq() - scale the receivers TQ-value to avoid unfair
- * selection of a receiver with slightly lower TQ than the other
- * @tq: to be weighted tq value
- *
- * Return: scaled tq value
- */
-static u8 batadv_nc_random_weight_tq(u8 tq)
-{
- u8 rand_val, rand_tq;
-
- get_random_bytes(&rand_val, sizeof(rand_val));
-
- /* randomize the estimated packet loss (max TQ - estimated TQ) */
- rand_tq = rand_val * (BATADV_TQ_MAX_VALUE - tq);
-
- /* normalize the randomized packet loss */
- rand_tq /= BATADV_TQ_MAX_VALUE;
-
- /* convert to (randomized) estimated tq again */
- return BATADV_TQ_MAX_VALUE - rand_tq;
-}
-
-/**
- * batadv_nc_memxor() - XOR destination with source
- * @dst: byte array to XOR into
- * @src: byte array to XOR from
- * @len: length of destination array
- */
-static void batadv_nc_memxor(char *dst, const char *src, unsigned int len)
-{
- unsigned int i;
-
- for (i = 0; i < len; ++i)
- dst[i] ^= src[i];
-}
-
-/**
- * batadv_nc_code_packets() - code a received unicast_packet with an nc packet
- * into a coded_packet and send it
- * @bat_priv: the bat priv with all the soft interface information
- * @skb: data skb to forward
- * @ethhdr: pointer to the ethernet header inside the skb
- * @nc_packet: structure containing the packet to the skb can be coded with
- * @neigh_node: next hop to forward packet to
- *
- * Return: true if both packets are consumed, false otherwise.
- */
-static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
- struct sk_buff *skb,
- struct ethhdr *ethhdr,
- struct batadv_nc_packet *nc_packet,
- struct batadv_neigh_node *neigh_node)
-{
- u8 tq_weighted_neigh, tq_weighted_coding, tq_tmp;
- struct sk_buff *skb_dest, *skb_src;
- struct batadv_unicast_packet *packet1;
- struct batadv_unicast_packet *packet2;
- struct batadv_coded_packet *coded_packet;
- struct batadv_neigh_node *neigh_tmp, *router_neigh, *first_dest;
- struct batadv_neigh_node *router_coding = NULL, *second_dest;
- struct batadv_neigh_ifinfo *router_neigh_ifinfo = NULL;
- struct batadv_neigh_ifinfo *router_coding_ifinfo = NULL;
- u8 *first_source, *second_source;
- __be32 packet_id1, packet_id2;
- size_t count;
- bool res = false;
- int coding_len;
- int unicast_size = sizeof(*packet1);
- int coded_size = sizeof(*coded_packet);
- int header_add = coded_size - unicast_size;
-
- /* TODO: do we need to consider the outgoing interface for
- * coded packets?
- */
- router_neigh = batadv_orig_router_get(neigh_node->orig_node,
- BATADV_IF_DEFAULT);
- if (!router_neigh)
- goto out;
-
- router_neigh_ifinfo = batadv_neigh_ifinfo_get(router_neigh,
- BATADV_IF_DEFAULT);
- if (!router_neigh_ifinfo)
- goto out;
-
- neigh_tmp = nc_packet->neigh_node;
- router_coding = batadv_orig_router_get(neigh_tmp->orig_node,
- BATADV_IF_DEFAULT);
- if (!router_coding)
- goto out;
-
- router_coding_ifinfo = batadv_neigh_ifinfo_get(router_coding,
- BATADV_IF_DEFAULT);
- if (!router_coding_ifinfo)
- goto out;
-
- tq_tmp = router_neigh_ifinfo->bat_iv.tq_avg;
- tq_weighted_neigh = batadv_nc_random_weight_tq(tq_tmp);
- tq_tmp = router_coding_ifinfo->bat_iv.tq_avg;
- tq_weighted_coding = batadv_nc_random_weight_tq(tq_tmp);
-
- /* Select one destination for the MAC-header dst-field based on
- * weighted TQ-values.
- */
- if (tq_weighted_neigh >= tq_weighted_coding) {
- /* Destination from nc_packet is selected for MAC-header */
- first_dest = nc_packet->neigh_node;
- first_source = nc_packet->nc_path->prev_hop;
- second_dest = neigh_node;
- second_source = ethhdr->h_source;
- packet1 = (struct batadv_unicast_packet *)nc_packet->skb->data;
- packet2 = (struct batadv_unicast_packet *)skb->data;
- packet_id1 = nc_packet->packet_id;
- packet_id2 = batadv_skb_crc32(skb,
- skb->data + sizeof(*packet2));
- } else {
- /* Destination for skb is selected for MAC-header */
- first_dest = neigh_node;
- first_source = ethhdr->h_source;
- second_dest = nc_packet->neigh_node;
- second_source = nc_packet->nc_path->prev_hop;
- packet1 = (struct batadv_unicast_packet *)skb->data;
- packet2 = (struct batadv_unicast_packet *)nc_packet->skb->data;
- packet_id1 = batadv_skb_crc32(skb,
- skb->data + sizeof(*packet1));
- packet_id2 = nc_packet->packet_id;
- }
-
- /* Instead of zero padding the smallest data buffer, we
- * code into the largest.
- */
- if (skb->len <= nc_packet->skb->len) {
- skb_dest = nc_packet->skb;
- skb_src = skb;
- } else {
- skb_dest = skb;
- skb_src = nc_packet->skb;
- }
-
- /* coding_len is used when decoding the packet shorter packet */
- coding_len = skb_src->len - unicast_size;
-
- if (skb_linearize(skb_dest) < 0 || skb_linearize(skb_src) < 0)
- goto out;
-
- skb_push(skb_dest, header_add);
-
- coded_packet = (struct batadv_coded_packet *)skb_dest->data;
- skb_reset_mac_header(skb_dest);
-
- coded_packet->packet_type = BATADV_CODED;
- coded_packet->version = BATADV_COMPAT_VERSION;
- coded_packet->ttl = packet1->ttl;
-
- /* Info about first unicast packet */
- ether_addr_copy(coded_packet->first_source, first_source);
- ether_addr_copy(coded_packet->first_orig_dest, packet1->dest);
- coded_packet->first_crc = packet_id1;
- coded_packet->first_ttvn = packet1->ttvn;
-
- /* Info about second unicast packet */
- ether_addr_copy(coded_packet->second_dest, second_dest->addr);
- ether_addr_copy(coded_packet->second_source, second_source);
- ether_addr_copy(coded_packet->second_orig_dest, packet2->dest);
- coded_packet->second_crc = packet_id2;
- coded_packet->second_ttl = packet2->ttl;
- coded_packet->second_ttvn = packet2->ttvn;
- coded_packet->coded_len = htons(coding_len);
-
- /* This is where the magic happens: Code skb_src into skb_dest */
- batadv_nc_memxor(skb_dest->data + coded_size,
- skb_src->data + unicast_size, coding_len);
-
- /* Update counters accordingly */
- if (BATADV_SKB_CB(skb_src)->decoded &&
- BATADV_SKB_CB(skb_dest)->decoded) {
- /* Both packets are recoded */
- count = skb_src->len + ETH_HLEN;
- count += skb_dest->len + ETH_HLEN;
- batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE, 2);
- batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, count);
- } else if (!BATADV_SKB_CB(skb_src)->decoded &&
- !BATADV_SKB_CB(skb_dest)->decoded) {
- /* Both packets are newly coded */
- count = skb_src->len + ETH_HLEN;
- count += skb_dest->len + ETH_HLEN;
- batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE, 2);
- batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, count);
- } else if (BATADV_SKB_CB(skb_src)->decoded &&
- !BATADV_SKB_CB(skb_dest)->decoded) {
- /* skb_src recoded and skb_dest is newly coded */
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE);
- batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES,
- skb_src->len + ETH_HLEN);
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE);
- batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES,
- skb_dest->len + ETH_HLEN);
- } else if (!BATADV_SKB_CB(skb_src)->decoded &&
- BATADV_SKB_CB(skb_dest)->decoded) {
- /* skb_src is newly coded and skb_dest is recoded */
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE);
- batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES,
- skb_src->len + ETH_HLEN);
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE);
- batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES,
- skb_dest->len + ETH_HLEN);
- }
-
- /* skb_src is now coded into skb_dest, so free it */
- consume_skb(skb_src);
-
- /* avoid duplicate free of skb from nc_packet */
- nc_packet->skb = NULL;
- batadv_nc_packet_free(nc_packet, false);
-
- /* Send the coded packet and return true */
- batadv_send_unicast_skb(skb_dest, first_dest);
- res = true;
-out:
- if (router_neigh)
- batadv_neigh_node_put(router_neigh);
- if (router_coding)
- batadv_neigh_node_put(router_coding);
- if (router_neigh_ifinfo)
- batadv_neigh_ifinfo_put(router_neigh_ifinfo);
- if (router_coding_ifinfo)
- batadv_neigh_ifinfo_put(router_coding_ifinfo);
- return res;
-}
-
-/**
- * batadv_nc_skb_coding_possible() - true if a decoded skb is available at dst.
- * @skb: data skb to forward
- * @dst: destination mac address of the other skb to code with
- * @src: source mac address of skb
- *
- * Whenever we network code a packet we have to check whether we received it in
- * a network coded form. If so, we may not be able to use it for coding because
- * some neighbors may also have received (overheard) the packet in the network
- * coded form without being able to decode it. It is hard to know which of the
- * neighboring nodes was able to decode the packet, therefore we can only
- * re-code the packet if the source of the previous encoded packet is involved.
- * Since the source encoded the packet we can be certain it has all necessary
- * decode information.
- *
- * Return: true if coding of a decoded packet is allowed.
- */
-static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src)
-{
- if (BATADV_SKB_CB(skb)->decoded && !batadv_compare_eth(dst, src))
- return false;
- return true;
-}
-
-/**
- * batadv_nc_path_search() - Find the coding path matching in_nc_node and
- * out_nc_node to retrieve a buffered packet that can be used for coding.
- * @bat_priv: the bat priv with all the soft interface information
- * @in_nc_node: pointer to skb next hop's neighbor nc node
- * @out_nc_node: pointer to skb source's neighbor nc node
- * @skb: data skb to forward
- * @eth_dst: next hop mac address of skb
- *
- * Return: true if coding of a decoded skb is allowed.
- */
-static struct batadv_nc_packet *
-batadv_nc_path_search(struct batadv_priv *bat_priv,
- struct batadv_nc_node *in_nc_node,
- struct batadv_nc_node *out_nc_node,
- struct sk_buff *skb,
- u8 *eth_dst)
-{
- struct batadv_nc_path *nc_path, nc_path_key;
- struct batadv_nc_packet *nc_packet_out = NULL;
- struct batadv_nc_packet *nc_packet, *nc_packet_tmp;
- struct batadv_hashtable *hash = bat_priv->nc.coding_hash;
- int idx;
-
- if (!hash)
- return NULL;
-
- /* Create almost path key */
- batadv_nc_hash_key_gen(&nc_path_key, in_nc_node->addr,
- out_nc_node->addr);
- idx = batadv_nc_hash_choose(&nc_path_key, hash->size);
-
- /* Check for coding opportunities in this nc_path */
- rcu_read_lock();
- hlist_for_each_entry_rcu(nc_path, &hash->table[idx], hash_entry) {
- if (!batadv_compare_eth(nc_path->prev_hop, in_nc_node->addr))
- continue;
-
- if (!batadv_compare_eth(nc_path->next_hop, out_nc_node->addr))
- continue;
-
- spin_lock_bh(&nc_path->packet_list_lock);
- if (list_empty(&nc_path->packet_list)) {
- spin_unlock_bh(&nc_path->packet_list_lock);
- continue;
- }
-
- list_for_each_entry_safe(nc_packet, nc_packet_tmp,
- &nc_path->packet_list, list) {
- if (!batadv_nc_skb_coding_possible(nc_packet->skb,
- eth_dst,
- in_nc_node->addr))
- continue;
-
- /* Coding opportunity is found! */
- list_del(&nc_packet->list);
- nc_packet_out = nc_packet;
- break;
- }
-
- spin_unlock_bh(&nc_path->packet_list_lock);
- break;
- }
- rcu_read_unlock();
-
- return nc_packet_out;
-}
-
-/**
- * batadv_nc_skb_src_search() - Loops through the list of neighoring nodes of
- * the skb's sender (may be equal to the originator).
- * @bat_priv: the bat priv with all the soft interface information
- * @skb: data skb to forward
- * @eth_dst: next hop mac address of skb
- * @eth_src: source mac address of skb
- * @in_nc_node: pointer to skb next hop's neighbor nc node
- *
- * Return: an nc packet if a suitable coding packet was found, NULL otherwise.
- */
-static struct batadv_nc_packet *
-batadv_nc_skb_src_search(struct batadv_priv *bat_priv,
- struct sk_buff *skb,
- u8 *eth_dst,
- u8 *eth_src,
- struct batadv_nc_node *in_nc_node)
-{
- struct batadv_orig_node *orig_node;
- struct batadv_nc_node *out_nc_node;
- struct batadv_nc_packet *nc_packet = NULL;
-
- orig_node = batadv_orig_hash_find(bat_priv, eth_src);
- if (!orig_node)
- return NULL;
-
- rcu_read_lock();
- list_for_each_entry_rcu(out_nc_node,
- &orig_node->out_coding_list, list) {
- /* Check if the skb is decoded and if recoding is possible */
- if (!batadv_nc_skb_coding_possible(skb,
- out_nc_node->addr, eth_src))
- continue;
-
- /* Search for an opportunity in this nc_path */
- nc_packet = batadv_nc_path_search(bat_priv, in_nc_node,
- out_nc_node, skb, eth_dst);
- if (nc_packet)
- break;
- }
- rcu_read_unlock();
-
- batadv_orig_node_put(orig_node);
- return nc_packet;
-}
-
-/**
- * batadv_nc_skb_store_before_coding() - set the ethernet src and dst of the
- * unicast skb before it is stored for use in later decoding
- * @bat_priv: the bat priv with all the soft interface information
- * @skb: data skb to store
- * @eth_dst_new: new destination mac address of skb
- */
-static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv,
- struct sk_buff *skb,
- u8 *eth_dst_new)
-{
- struct ethhdr *ethhdr;
-
- /* Copy skb header to change the mac header */
- skb = pskb_copy_for_clone(skb, GFP_ATOMIC);
- if (!skb)
- return;
-
- /* Set the mac header as if we actually sent the packet uncoded */
- ethhdr = eth_hdr(skb);
- ether_addr_copy(ethhdr->h_source, ethhdr->h_dest);
- ether_addr_copy(ethhdr->h_dest, eth_dst_new);
-
- /* Set data pointer to MAC header to mimic packets from our tx path */
- skb_push(skb, ETH_HLEN);
-
- /* Add the packet to the decoding packet pool */
- batadv_nc_skb_store_for_decoding(bat_priv, skb);
-
- /* batadv_nc_skb_store_for_decoding() clones the skb, so we must free
- * our ref
- */
- consume_skb(skb);
-}
-
-/**
- * batadv_nc_skb_dst_search() - Loops through list of neighboring nodes to dst.
- * @skb: data skb to forward
- * @neigh_node: next hop to forward packet to
- * @ethhdr: pointer to the ethernet header inside the skb
- *
- * Loops through list of neighboring nodes the next hop has a good connection to
- * (receives OGMs with a sufficient quality). We need to find a neighbor of our
- * next hop that potentially sent a packet which our next hop also received
- * (overheard) and has stored for later decoding.
- *
- * Return: true if the skb was consumed (encoded packet sent) or false otherwise
- */
-static bool batadv_nc_skb_dst_search(struct sk_buff *skb,
- struct batadv_neigh_node *neigh_node,
- struct ethhdr *ethhdr)
-{
- struct net_device *netdev = neigh_node->if_incoming->soft_iface;
- struct batadv_priv *bat_priv = netdev_priv(netdev);
- struct batadv_orig_node *orig_node = neigh_node->orig_node;
- struct batadv_nc_node *nc_node;
- struct batadv_nc_packet *nc_packet = NULL;
-
- rcu_read_lock();
- list_for_each_entry_rcu(nc_node, &orig_node->in_coding_list, list) {
- /* Search for coding opportunity with this in_nc_node */
- nc_packet = batadv_nc_skb_src_search(bat_priv, skb,
- neigh_node->addr,
- ethhdr->h_source, nc_node);
-
- /* Opportunity was found, so stop searching */
- if (nc_packet)
- break;
- }
- rcu_read_unlock();
-
- if (!nc_packet)
- return false;
-
- /* Save packets for later decoding */
- batadv_nc_skb_store_before_coding(bat_priv, skb,
- neigh_node->addr);
- batadv_nc_skb_store_before_coding(bat_priv, nc_packet->skb,
- nc_packet->neigh_node->addr);
-
- /* Code and send packets */
- if (batadv_nc_code_packets(bat_priv, skb, ethhdr, nc_packet,
- neigh_node))
- return true;
-
- /* out of mem ? Coding failed - we have to free the buffered packet
- * to avoid memleaks. The skb passed as argument will be dealt with
- * by the calling function.
- */
- batadv_nc_send_packet(nc_packet);
- return false;
-}
-
-/**
- * batadv_nc_skb_add_to_path() - buffer skb for later encoding / decoding
- * @skb: skb to add to path
- * @nc_path: path to add skb to
- * @neigh_node: next hop to forward packet to
- * @packet_id: checksum to identify packet
- *
- * Return: true if the packet was buffered or false in case of an error.
- */
-static bool batadv_nc_skb_add_to_path(struct sk_buff *skb,
- struct batadv_nc_path *nc_path,
- struct batadv_neigh_node *neigh_node,
- __be32 packet_id)
-{
- struct batadv_nc_packet *nc_packet;
-
- nc_packet = kzalloc(sizeof(*nc_packet), GFP_ATOMIC);
- if (!nc_packet)
- return false;
-
- /* Initialize nc_packet */
- nc_packet->timestamp = jiffies;
- nc_packet->packet_id = packet_id;
- nc_packet->skb = skb;
- nc_packet->neigh_node = neigh_node;
- nc_packet->nc_path = nc_path;
-
- /* Add coding packet to list */
- spin_lock_bh(&nc_path->packet_list_lock);
- list_add_tail(&nc_packet->list, &nc_path->packet_list);
- spin_unlock_bh(&nc_path->packet_list_lock);
-
- return true;
-}
-
-/**
- * batadv_nc_skb_forward() - try to code a packet or add it to the coding packet
- * buffer
- * @skb: data skb to forward
- * @neigh_node: next hop to forward packet to
- *
- * Return: true if the skb was consumed (encoded packet sent) or false otherwise
- */
-bool batadv_nc_skb_forward(struct sk_buff *skb,
- struct batadv_neigh_node *neigh_node)
-{
- const struct net_device *netdev = neigh_node->if_incoming->soft_iface;
- struct batadv_priv *bat_priv = netdev_priv(netdev);
- struct batadv_unicast_packet *packet;
- struct batadv_nc_path *nc_path;
- struct ethhdr *ethhdr = eth_hdr(skb);
- __be32 packet_id;
- u8 *payload;
-
- /* Check if network coding is enabled */
- if (!atomic_read(&bat_priv->network_coding))
- goto out;
-
- /* We only handle unicast packets */
- payload = skb_network_header(skb);
- packet = (struct batadv_unicast_packet *)payload;
- if (packet->packet_type != BATADV_UNICAST)
- goto out;
-
- /* Try to find a coding opportunity and send the skb if one is found */
- if (batadv_nc_skb_dst_search(skb, neigh_node, ethhdr))
- return true;
-
- /* Find or create a nc_path for this src-dst pair */
- nc_path = batadv_nc_get_path(bat_priv,
- bat_priv->nc.coding_hash,
- ethhdr->h_source,
- neigh_node->addr);
-
- if (!nc_path)
- goto out;
-
- /* Add skb to nc_path */
- packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet));
- if (!batadv_nc_skb_add_to_path(skb, nc_path, neigh_node, packet_id))
- goto free_nc_path;
-
- /* Packet is consumed */
- return true;
-
-free_nc_path:
- batadv_nc_path_put(nc_path);
-out:
- /* Packet is not consumed */
- return false;
-}
-
-/**
- * batadv_nc_skb_store_for_decoding() - save a clone of the skb which can be
- * used when decoding coded packets
- * @bat_priv: the bat priv with all the soft interface information
- * @skb: data skb to store
- */
-void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv,
- struct sk_buff *skb)
-{
- struct batadv_unicast_packet *packet;
- struct batadv_nc_path *nc_path;
- struct ethhdr *ethhdr = eth_hdr(skb);
- __be32 packet_id;
- u8 *payload;
-
- /* Check if network coding is enabled */
- if (!atomic_read(&bat_priv->network_coding))
- goto out;
-
- /* Check for supported packet type */
- payload = skb_network_header(skb);
- packet = (struct batadv_unicast_packet *)payload;
- if (packet->packet_type != BATADV_UNICAST)
- goto out;
-
- /* Find existing nc_path or create a new */
- nc_path = batadv_nc_get_path(bat_priv,
- bat_priv->nc.decoding_hash,
- ethhdr->h_source,
- ethhdr->h_dest);
-
- if (!nc_path)
- goto out;
-
- /* Clone skb and adjust skb->data to point at batman header */
- skb = skb_clone(skb, GFP_ATOMIC);
- if (unlikely(!skb))
- goto free_nc_path;
-
- if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
- goto free_skb;
-
- if (unlikely(!skb_pull_rcsum(skb, ETH_HLEN)))
- goto free_skb;
-
- /* Add skb to nc_path */
- packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet));
- if (!batadv_nc_skb_add_to_path(skb, nc_path, NULL, packet_id))
- goto free_skb;
-
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_BUFFER);
- return;
-
-free_skb:
- kfree_skb(skb);
-free_nc_path:
- batadv_nc_path_put(nc_path);
-out:
- return;
-}
-
-/**
- * batadv_nc_skb_store_sniffed_unicast() - check if a received unicast packet
- * should be saved in the decoding buffer and, if so, store it there
- * @bat_priv: the bat priv with all the soft interface information
- * @skb: unicast skb to store
- */
-void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
- struct sk_buff *skb)
-{
- struct ethhdr *ethhdr = eth_hdr(skb);
-
- if (batadv_is_my_mac(bat_priv, ethhdr->h_dest))
- return;
-
- /* Set data pointer to MAC header to mimic packets from our tx path */
- skb_push(skb, ETH_HLEN);
-
- batadv_nc_skb_store_for_decoding(bat_priv, skb);
-}
-
-/**
- * batadv_nc_skb_decode_packet() - decode given skb using the decode data stored
- * in nc_packet
- * @bat_priv: the bat priv with all the soft interface information
- * @skb: unicast skb to decode
- * @nc_packet: decode data needed to decode the skb
- *
- * Return: pointer to decoded unicast packet if the packet was decoded or NULL
- * in case of an error.
- */
-static struct batadv_unicast_packet *
-batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
- struct batadv_nc_packet *nc_packet)
-{
- const int h_size = sizeof(struct batadv_unicast_packet);
- const int h_diff = sizeof(struct batadv_coded_packet) - h_size;
- struct batadv_unicast_packet *unicast_packet;
- struct batadv_coded_packet coded_packet_tmp;
- struct ethhdr *ethhdr, ethhdr_tmp;
- u8 *orig_dest, ttl, ttvn;
- unsigned int coding_len;
- int err;
-
- /* Save headers temporarily */
- memcpy(&coded_packet_tmp, skb->data, sizeof(coded_packet_tmp));
- memcpy(&ethhdr_tmp, skb_mac_header(skb), sizeof(ethhdr_tmp));
-
- if (skb_cow(skb, 0) < 0)
- return NULL;
-
- if (unlikely(!skb_pull_rcsum(skb, h_diff)))
- return NULL;
-
- /* Data points to batman header, so set mac header 14 bytes before
- * and network to data
- */
- skb_set_mac_header(skb, -ETH_HLEN);
- skb_reset_network_header(skb);
-
- /* Reconstruct original mac header */
- ethhdr = eth_hdr(skb);
- *ethhdr = ethhdr_tmp;
-
- /* Select the correct unicast header information based on the location
- * of our mac address in the coded_packet header
- */
- if (batadv_is_my_mac(bat_priv, coded_packet_tmp.second_dest)) {
- /* If we are the second destination the packet was overheard,
- * so the Ethernet address must be copied to h_dest and
- * pkt_type changed from PACKET_OTHERHOST to PACKET_HOST
- */
- ether_addr_copy(ethhdr->h_dest, coded_packet_tmp.second_dest);
- skb->pkt_type = PACKET_HOST;
-
- orig_dest = coded_packet_tmp.second_orig_dest;
- ttl = coded_packet_tmp.second_ttl;
- ttvn = coded_packet_tmp.second_ttvn;
- } else {
- orig_dest = coded_packet_tmp.first_orig_dest;
- ttl = coded_packet_tmp.ttl;
- ttvn = coded_packet_tmp.first_ttvn;
- }
-
- coding_len = ntohs(coded_packet_tmp.coded_len);
-
- if (coding_len > skb->len)
- return NULL;
-
- /* Here the magic is reversed:
- * extract the missing packet from the received coded packet
- */
- batadv_nc_memxor(skb->data + h_size,
- nc_packet->skb->data + h_size,
- coding_len);
-
- /* Resize decoded skb if decoded with larger packet */
- if (nc_packet->skb->len > coding_len + h_size) {
- err = pskb_trim_rcsum(skb, coding_len + h_size);
- if (err)
- return NULL;
- }
-
- /* Create decoded unicast packet */
- unicast_packet = (struct batadv_unicast_packet *)skb->data;
- unicast_packet->packet_type = BATADV_UNICAST;
- unicast_packet->version = BATADV_COMPAT_VERSION;
- unicast_packet->ttl = ttl;
- ether_addr_copy(unicast_packet->dest, orig_dest);
- unicast_packet->ttvn = ttvn;
-
- batadv_nc_packet_free(nc_packet, false);
- return unicast_packet;
-}
-
-/**
- * batadv_nc_find_decoding_packet() - search through buffered decoding data to
- * find the data needed to decode the coded packet
- * @bat_priv: the bat priv with all the soft interface information
- * @ethhdr: pointer to the ethernet header inside the coded packet
- * @coded: coded packet we try to find decode data for
- *
- * Return: pointer to nc packet if the needed data was found or NULL otherwise.
- */
-static struct batadv_nc_packet *
-batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv,
- struct ethhdr *ethhdr,
- struct batadv_coded_packet *coded)
-{
- struct batadv_hashtable *hash = bat_priv->nc.decoding_hash;
- struct batadv_nc_packet *tmp_nc_packet, *nc_packet = NULL;
- struct batadv_nc_path *nc_path, nc_path_key;
- u8 *dest, *source;
- __be32 packet_id;
- int index;
-
- if (!hash)
- return NULL;
-
- /* Select the correct packet id based on the location of our mac-addr */
- dest = ethhdr->h_source;
- if (!batadv_is_my_mac(bat_priv, coded->second_dest)) {
- source = coded->second_source;
- packet_id = coded->second_crc;
- } else {
- source = coded->first_source;
- packet_id = coded->first_crc;
- }
-
- batadv_nc_hash_key_gen(&nc_path_key, source, dest);
- index = batadv_nc_hash_choose(&nc_path_key, hash->size);
-
- /* Search for matching coding path */
- rcu_read_lock();
- hlist_for_each_entry_rcu(nc_path, &hash->table[index], hash_entry) {
- /* Find matching nc_packet */
- spin_lock_bh(&nc_path->packet_list_lock);
- list_for_each_entry(tmp_nc_packet,
- &nc_path->packet_list, list) {
- if (packet_id == tmp_nc_packet->packet_id) {
- list_del(&tmp_nc_packet->list);
-
- nc_packet = tmp_nc_packet;
- break;
- }
- }
- spin_unlock_bh(&nc_path->packet_list_lock);
-
- if (nc_packet)
- break;
- }
- rcu_read_unlock();
-
- if (!nc_packet)
- batadv_dbg(BATADV_DBG_NC, bat_priv,
- "No decoding packet found for %u\n", packet_id);
-
- return nc_packet;
-}
-
-/**
- * batadv_nc_recv_coded_packet() - try to decode coded packet and enqueue the
- * resulting unicast packet
- * @skb: incoming coded packet
- * @recv_if: pointer to interface this packet was received on
- *
- * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
- * otherwise.
- */
-static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
- struct batadv_hard_iface *recv_if)
-{
- struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
- struct batadv_unicast_packet *unicast_packet;
- struct batadv_coded_packet *coded_packet;
- struct batadv_nc_packet *nc_packet;
- struct ethhdr *ethhdr;
- int hdr_size = sizeof(*coded_packet);
-
- /* Check if network coding is enabled */
- if (!atomic_read(&bat_priv->network_coding))
- goto free_skb;
-
- /* Make sure we can access (and remove) header */
- if (unlikely(!pskb_may_pull(skb, hdr_size)))
- goto free_skb;
-
- coded_packet = (struct batadv_coded_packet *)skb->data;
- ethhdr = eth_hdr(skb);
-
- /* Verify frame is destined for us */
- if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest) &&
- !batadv_is_my_mac(bat_priv, coded_packet->second_dest))
- goto free_skb;
-
- /* Update stat counter */
- if (batadv_is_my_mac(bat_priv, coded_packet->second_dest))
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_SNIFFED);
-
- nc_packet = batadv_nc_find_decoding_packet(bat_priv, ethhdr,
- coded_packet);
- if (!nc_packet) {
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED);
- goto free_skb;
- }
-
- /* Make skb's linear, because decoding accesses the entire buffer */
- if (skb_linearize(skb) < 0)
- goto free_nc_packet;
-
- if (skb_linearize(nc_packet->skb) < 0)
- goto free_nc_packet;
-
- /* Decode the packet */
- unicast_packet = batadv_nc_skb_decode_packet(bat_priv, skb, nc_packet);
- if (!unicast_packet) {
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED);
- goto free_nc_packet;
- }
-
- /* Mark packet as decoded to do correct recoding when forwarding */
- BATADV_SKB_CB(skb)->decoded = true;
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE);
- batadv_add_counter(bat_priv, BATADV_CNT_NC_DECODE_BYTES,
- skb->len + ETH_HLEN);
- return batadv_recv_unicast_packet(skb, recv_if);
-
-free_nc_packet:
- batadv_nc_packet_free(nc_packet, true);
-free_skb:
- kfree_skb(skb);
-
- return NET_RX_DROP;
-}
-
-/**
- * batadv_nc_mesh_free() - clean up network coding memory
- * @bat_priv: the bat priv with all the soft interface information
- */
-void batadv_nc_mesh_free(struct batadv_priv *bat_priv)
-{
- batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_NC, 1);
- batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_NC, 1);
- cancel_delayed_work_sync(&bat_priv->nc.work);
-
- batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash, NULL);
- batadv_hash_destroy(bat_priv->nc.coding_hash);
- batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, NULL);
- batadv_hash_destroy(bat_priv->nc.decoding_hash);
-}
-
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_nc_nodes_seq_print_text() - print the nc node information
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hashtable *hash = bat_priv->orig_hash;
- struct batadv_hard_iface *primary_if;
- struct hlist_head *head;
- struct batadv_orig_node *orig_node;
- struct batadv_nc_node *nc_node;
- int i;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- goto out;
-
- /* Traverse list of originators */
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- /* For each orig_node in this bin */
- rcu_read_lock();
- hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
- /* no need to print the orig node if it does not have
- * network coding neighbors
- */
- if (list_empty(&orig_node->in_coding_list) &&
- list_empty(&orig_node->out_coding_list))
- continue;
-
- seq_printf(seq, "Node: %pM\n", orig_node->orig);
-
- seq_puts(seq, " Ingoing: ");
- /* For each in_nc_node to this orig_node */
- list_for_each_entry_rcu(nc_node,
- &orig_node->in_coding_list,
- list)
- seq_printf(seq, "%pM ",
- nc_node->addr);
- seq_puts(seq, "\n Outgoing: ");
- /* For out_nc_node to this orig_node */
- list_for_each_entry_rcu(nc_node,
- &orig_node->out_coding_list,
- list)
- seq_printf(seq, "%pM ",
- nc_node->addr);
- seq_puts(seq, "\n\n");
- }
- rcu_read_unlock();
- }
-
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- return 0;
-}
-
-/**
- * batadv_nc_init_debugfs() - create nc folder and related files in debugfs
- * @bat_priv: the bat priv with all the soft interface information
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
-{
- struct dentry *nc_dir, *file;
-
- nc_dir = debugfs_create_dir("nc", bat_priv->debug_dir);
- if (!nc_dir)
- goto out;
-
- file = debugfs_create_u8("min_tq", 0644, nc_dir, &bat_priv->nc.min_tq);
- if (!file)
- goto out;
-
- file = debugfs_create_u32("max_fwd_delay", 0644, nc_dir,
- &bat_priv->nc.max_fwd_delay);
- if (!file)
- goto out;
-
- file = debugfs_create_u32("max_buffer_time", 0644, nc_dir,
- &bat_priv->nc.max_buffer_time);
- if (!file)
- goto out;
-
- return 0;
-
-out:
- return -ENOMEM;
-}
-#endif
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
deleted file mode 100644
index 65c346812bc1..000000000000
--- a/net/batman-adv/network-coding.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors:
- *
- * Martin Hundebøll, Jeppe Ledet-Pedersen
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _NET_BATMAN_ADV_NETWORK_CODING_H_
-#define _NET_BATMAN_ADV_NETWORK_CODING_H_
-
-#include "main.h"
-
-#include <linux/types.h>
-
-struct batadv_ogm_packet;
-struct net_device;
-struct seq_file;
-struct sk_buff;
-
-#ifdef CONFIG_BATMAN_ADV_NC
-
-void batadv_nc_status_update(struct net_device *net_dev);
-int batadv_nc_init(void);
-int batadv_nc_mesh_init(struct batadv_priv *bat_priv);
-void batadv_nc_mesh_free(struct batadv_priv *bat_priv);
-void batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- struct batadv_orig_node *orig_neigh_node,
- struct batadv_ogm_packet *ogm_packet,
- int is_single_hop_neigh);
-void batadv_nc_purge_orig(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- bool (*to_purge)(struct batadv_priv *,
- struct batadv_nc_node *));
-void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv);
-void batadv_nc_init_orig(struct batadv_orig_node *orig_node);
-bool batadv_nc_skb_forward(struct sk_buff *skb,
- struct batadv_neigh_node *neigh_node);
-void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv,
- struct sk_buff *skb);
-void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
- struct sk_buff *skb);
-int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset);
-int batadv_nc_init_debugfs(struct batadv_priv *bat_priv);
-
-#else /* ifdef CONFIG_BATMAN_ADV_NC */
-
-static inline void batadv_nc_status_update(struct net_device *net_dev)
-{
-}
-
-static inline int batadv_nc_init(void)
-{
- return 0;
-}
-
-static inline int batadv_nc_mesh_init(struct batadv_priv *bat_priv)
-{
- return 0;
-}
-
-static inline void batadv_nc_mesh_free(struct batadv_priv *bat_priv)
-{
-}
-
-static inline void
-batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- struct batadv_orig_node *orig_neigh_node,
- struct batadv_ogm_packet *ogm_packet,
- int is_single_hop_neigh)
-{
-}
-
-static inline void
-batadv_nc_purge_orig(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- bool (*to_purge)(struct batadv_priv *,
- struct batadv_nc_node *))
-{
-}
-
-static inline void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv)
-{
-}
-
-static inline void batadv_nc_init_orig(struct batadv_orig_node *orig_node)
-{
-}
-
-static inline bool batadv_nc_skb_forward(struct sk_buff *skb,
- struct batadv_neigh_node *neigh_node)
-{
- return false;
-}
-
-static inline void
-batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv,
- struct sk_buff *skb)
-{
-}
-
-static inline void
-batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
- struct sk_buff *skb)
-{
-}
-
-static inline int batadv_nc_nodes_seq_print_text(struct seq_file *seq,
- void *offset)
-{
- return 0;
-}
-
-static inline int batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
-{
- return 0;
-}
-
-#endif /* ifdef CONFIG_BATMAN_ADV_NC */
-
-#endif /* _NET_BATMAN_ADV_NETWORK_CODING_H_ */
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 1d295da3e342..a662408ad867 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1,30 +1,20 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "originator.h"
#include "main.h"
#include <linux/atomic.h>
+#include <linux/container_of.h>
+#include <linux/err.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
+#include <linux/if_vlan.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
@@ -32,16 +22,13 @@
#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stddef.h>
#include <linux/workqueue.h>
-#include <net/sock.h>
-#include <uapi/linux/batman_adv.h>
+#include <uapi/linux/batadv_packet.h>
-#include "bat_algo.h"
#include "distributed-arp-table.h"
#include "fragmentation.h"
#include "gateway_client.h"
@@ -50,9 +37,7 @@
#include "log.h"
#include "multicast.h"
#include "netlink.h"
-#include "network-coding.h"
#include "routing.h"
-#include "soft-interface.h"
#include "translation-table.h"
/* hash class keys */
@@ -60,7 +45,7 @@ static struct lock_class_key batadv_orig_hash_lock_class_key;
/**
* batadv_orig_hash_find() - Find and return originator from orig_hash
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @data: mac address of the originator
*
* Return: orig_node (with increased refcnt), NULL on errors
@@ -144,6 +129,29 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node,
}
/**
+ * batadv_vlan_id_valid() - check if vlan id is in valid batman-adv encoding
+ * @vid: the VLAN identifier
+ *
+ * Return: true when either no vlan is set or if VLAN is in correct range,
+ * false otherwise
+ */
+static bool batadv_vlan_id_valid(unsigned short vid)
+{
+ unsigned short non_vlan = vid & ~(BATADV_VLAN_HAS_TAG | VLAN_VID_MASK);
+
+ if (vid == 0)
+ return true;
+
+ if (!(vid & BATADV_VLAN_HAS_TAG))
+ return false;
+
+ if (non_vlan)
+ return false;
+
+ return true;
+}
+
+/**
* batadv_orig_node_vlan_new() - search and possibly create an orig_node_vlan
* object
* @orig_node: the originator serving the VLAN
@@ -161,6 +169,9 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node,
{
struct batadv_orig_node_vlan *vlan;
+ if (!batadv_vlan_id_valid(vid))
+ return NULL;
+
spin_lock_bh(&orig_node->vlan_list_lock);
/* first look if an object for this vid already exists */
@@ -189,7 +200,7 @@ out:
* and queue for free after rcu grace period
* @ref: kref pointer of the originator-vlan object
*/
-static void batadv_orig_node_vlan_release(struct kref *ref)
+void batadv_orig_node_vlan_release(struct kref *ref)
{
struct batadv_orig_node_vlan *orig_vlan;
@@ -199,18 +210,8 @@ static void batadv_orig_node_vlan_release(struct kref *ref)
}
/**
- * batadv_orig_node_vlan_put() - decrement the refcounter and possibly release
- * the originator-vlan object
- * @orig_vlan: the originator-vlan object to release
- */
-void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan)
-{
- kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release);
-}
-
-/**
* batadv_originator_init() - Initialize all originator structures
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
* Return: 0 on success or negative error number in case of failure
*/
@@ -243,7 +244,7 @@ err:
* free after rcu grace period
* @ref: kref pointer of the neigh_ifinfo
*/
-static void batadv_neigh_ifinfo_release(struct kref *ref)
+void batadv_neigh_ifinfo_release(struct kref *ref)
{
struct batadv_neigh_ifinfo *neigh_ifinfo;
@@ -256,21 +257,11 @@ static void batadv_neigh_ifinfo_release(struct kref *ref)
}
/**
- * batadv_neigh_ifinfo_put() - decrement the refcounter and possibly release
- * the neigh_ifinfo
- * @neigh_ifinfo: the neigh_ifinfo object to release
- */
-void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo)
-{
- kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release);
-}
-
-/**
* batadv_hardif_neigh_release() - release hardif neigh node from lists and
* queue for free after rcu grace period
* @ref: kref pointer of the neigh_node
*/
-static void batadv_hardif_neigh_release(struct kref *ref)
+void batadv_hardif_neigh_release(struct kref *ref)
{
struct batadv_hardif_neigh_node *hardif_neigh;
@@ -286,21 +277,11 @@ static void batadv_hardif_neigh_release(struct kref *ref)
}
/**
- * batadv_hardif_neigh_put() - decrement the hardif neighbors refcounter
- * and possibly release it
- * @hardif_neigh: hardif neigh neighbor to free
- */
-void batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh)
-{
- kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release);
-}
-
-/**
* batadv_neigh_node_release() - release neigh_node from lists and queue for
* free after rcu grace period
* @ref: kref pointer of the neigh_node
*/
-static void batadv_neigh_node_release(struct kref *ref)
+void batadv_neigh_node_release(struct kref *ref)
{
struct hlist_node *node_tmp;
struct batadv_neigh_node *neigh_node;
@@ -321,22 +302,12 @@ static void batadv_neigh_node_release(struct kref *ref)
}
/**
- * batadv_neigh_node_put() - decrement the neighbors refcounter and possibly
- * release it
- * @neigh_node: neigh neighbor to free
- */
-void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node)
-{
- kref_put(&neigh_node->refcount, batadv_neigh_node_release);
-}
-
-/**
* batadv_orig_router_get() - router to the originator depending on iface
* @orig_node: the orig node for the router
* @if_outgoing: the interface where the payload packet has been received or
* the OGM should be sent to
*
- * Return: the neighbor which should be router for this orig_node/iface.
+ * Return: the neighbor which should be the router for this orig_node/iface.
*
* The object is returned with refcounter increased by 1.
*/
@@ -364,6 +335,33 @@ batadv_orig_router_get(struct batadv_orig_node *orig_node,
}
/**
+ * batadv_orig_to_router() - get next hop neighbor to an orig address
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @orig_addr: the originator MAC address to search the best next hop router for
+ * @if_outgoing: the interface where the payload packet has been received or
+ * the OGM should be sent to
+ *
+ * Return: A neighbor node which is the best router towards the given originator
+ * address.
+ */
+struct batadv_neigh_node *
+batadv_orig_to_router(struct batadv_priv *bat_priv, u8 *orig_addr,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_neigh_node *neigh_node;
+ struct batadv_orig_node *orig_node;
+
+ orig_node = batadv_orig_hash_find(bat_priv, orig_addr);
+ if (!orig_node)
+ return NULL;
+
+ neigh_node = batadv_find_router(bat_priv, orig_node, if_outgoing);
+ batadv_orig_node_put(orig_node);
+
+ return neigh_node;
+}
+
+/**
* batadv_orig_ifinfo_get() - find the ifinfo from an orig_node
* @orig_node: the orig node to be queried
* @if_outgoing: the interface for which the ifinfo should be acquired
@@ -526,7 +524,7 @@ out:
* Looks for and possibly returns a neighbour belonging to this originator list
* which is connected through the provided hard interface.
*
- * Return: neighbor when found. Othwerwise NULL
+ * Return: neighbor when found. Otherwise NULL
*/
static struct batadv_neigh_node *
batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
@@ -567,7 +565,7 @@ batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
const u8 *neigh_addr,
struct batadv_orig_node *orig_node)
{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->mesh_iface);
struct batadv_hardif_neigh_node *hardif_neigh;
spin_lock_bh(&hard_iface->neigh_list_lock);
@@ -631,7 +629,7 @@ batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface,
*
* Looks for and possibly returns a neighbour belonging to this hard interface.
*
- * Return: neighbor when found. Othwerwise NULL
+ * Return: neighbor when found. Otherwise NULL
*/
struct batadv_hardif_neigh_node *
batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface,
@@ -716,8 +714,7 @@ batadv_neigh_node_create(struct batadv_orig_node *orig_node,
out:
spin_unlock_bh(&orig_node->neigh_list_lock);
- if (hardif_neigh)
- batadv_hardif_neigh_put(hardif_neigh);
+ batadv_hardif_neigh_put(hardif_neigh);
return neigh_node;
}
@@ -744,42 +741,6 @@ batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node,
return batadv_neigh_node_create(orig_node, hard_iface, neigh_addr);
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_hardif_neigh_seq_print_text() - print the single hop neighbour list
- * @seq: neighbour table seq_file struct
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hard_iface *primary_if;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- return 0;
-
- seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n",
- BATADV_SOURCE_VERSION, primary_if->net_dev->name,
- primary_if->net_dev->dev_addr, net_dev->name,
- bat_priv->algo_ops->name);
-
- batadv_hardif_put(primary_if);
-
- if (!bat_priv->algo_ops->neigh.print) {
- seq_puts(seq,
- "No printing function for this routing protocol\n");
- return 0;
- }
-
- bat_priv->algo_ops->neigh.print(bat_priv, seq);
- return 0;
-}
-#endif
-
/**
* batadv_hardif_neigh_dump() - Dump to netlink the neighbor infos for a
* specific outgoing interface
@@ -790,69 +751,54 @@ int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset)
*/
int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
- struct net *net = sock_net(cb->skb->sk);
- struct net_device *soft_iface;
- struct net_device *hard_iface = NULL;
- struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT;
+ struct batadv_hard_iface *primary_if, *hard_iface;
+ struct net_device *mesh_iface;
struct batadv_priv *bat_priv;
- struct batadv_hard_iface *primary_if = NULL;
int ret;
- int ifindex, hard_ifindex;
-
- ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
- bat_priv = netdev_priv(soft_iface);
+ bat_priv = netdev_priv(mesh_iface);
primary_if = batadv_primary_if_get_selected(bat_priv);
- if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+ if (!primary_if) {
ret = -ENOENT;
- goto out;
+ goto out_put_mesh_iface;
}
- hard_ifindex = batadv_netlink_get_ifindex(cb->nlh,
- BATADV_ATTR_HARD_IFINDEX);
- if (hard_ifindex) {
- hard_iface = dev_get_by_index(net, hard_ifindex);
- if (hard_iface)
- hardif = batadv_hardif_get_by_netdev(hard_iface);
-
- if (!hardif) {
- ret = -ENODEV;
- goto out;
- }
+ if (primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out_put_primary_if;
+ }
- if (hardif->soft_iface != soft_iface) {
- ret = -ENOENT;
- goto out;
- }
+ hard_iface = batadv_netlink_get_hardif(bat_priv, cb);
+ if (IS_ERR(hard_iface) && PTR_ERR(hard_iface) != -ENONET) {
+ ret = PTR_ERR(hard_iface);
+ goto out_put_primary_if;
+ } else if (IS_ERR(hard_iface)) {
+ /* => PTR_ERR(hard_iface) == -ENONET
+ * => no hard-iface given, ok
+ */
+ hard_iface = BATADV_IF_DEFAULT;
}
if (!bat_priv->algo_ops->neigh.dump) {
ret = -EOPNOTSUPP;
- goto out;
+ goto out_put_hard_iface;
}
- bat_priv->algo_ops->neigh.dump(msg, cb, bat_priv, hardif);
+ bat_priv->algo_ops->neigh.dump(msg, cb, bat_priv, hard_iface);
ret = msg->len;
- out:
- if (hardif)
- batadv_hardif_put(hardif);
- if (hard_iface)
- dev_put(hard_iface);
- if (primary_if)
- batadv_hardif_put(primary_if);
- if (soft_iface)
- dev_put(soft_iface);
+out_put_hard_iface:
+ batadv_hardif_put(hard_iface);
+out_put_primary_if:
+ batadv_hardif_put(primary_if);
+out_put_mesh_iface:
+ dev_put(mesh_iface);
return ret;
}
@@ -862,7 +808,7 @@ int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb)
* free after rcu grace period
* @ref: kref pointer of the orig_ifinfo
*/
-static void batadv_orig_ifinfo_release(struct kref *ref)
+void batadv_orig_ifinfo_release(struct kref *ref)
{
struct batadv_orig_ifinfo *orig_ifinfo;
struct batadv_neigh_node *router;
@@ -874,23 +820,12 @@ static void batadv_orig_ifinfo_release(struct kref *ref)
/* this is the last reference to this object */
router = rcu_dereference_protected(orig_ifinfo->router, true);
- if (router)
- batadv_neigh_node_put(router);
+ batadv_neigh_node_put(router);
kfree_rcu(orig_ifinfo, rcu);
}
/**
- * batadv_orig_ifinfo_put() - decrement the refcounter and possibly release
- * the orig_ifinfo
- * @orig_ifinfo: the orig_ifinfo object to release
- */
-void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo)
-{
- kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release);
-}
-
-/**
* batadv_orig_node_free_rcu() - free the orig_node
* @rcu: rcu pointer of the orig_node
*/
@@ -904,9 +839,6 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
batadv_frag_purge_orig(orig_node, NULL);
- if (orig_node->bat_priv->algo_ops->orig.free)
- orig_node->bat_priv->algo_ops->orig.free(orig_node);
-
kfree(orig_node->tt_buff);
kfree(orig_node);
}
@@ -916,7 +848,7 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
* free after rcu grace period
* @ref: kref pointer of the orig_node
*/
-static void batadv_orig_node_release(struct kref *ref)
+void batadv_orig_node_release(struct kref *ref)
{
struct hlist_node *node_tmp;
struct batadv_neigh_node *neigh_node;
@@ -946,8 +878,7 @@ static void batadv_orig_node_release(struct kref *ref)
orig_node->last_bonding_candidate = NULL;
spin_unlock_bh(&orig_node->neigh_list_lock);
- if (last_candidate)
- batadv_orig_ifinfo_put(last_candidate);
+ batadv_orig_ifinfo_put(last_candidate);
spin_lock_bh(&orig_node->vlan_list_lock);
hlist_for_each_entry_safe(vlan, node_tmp, &orig_node->vlan_list, list) {
@@ -956,25 +887,12 @@ static void batadv_orig_node_release(struct kref *ref)
}
spin_unlock_bh(&orig_node->vlan_list_lock);
- /* Free nc_nodes */
- batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL);
-
call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu);
}
/**
- * batadv_orig_node_put() - decrement the orig node refcounter and possibly
- * release it
- * @orig_node: the orig node to free
- */
-void batadv_orig_node_put(struct batadv_orig_node *orig_node)
-{
- kref_put(&orig_node->refcount, batadv_orig_node_release);
-}
-
-/**
* batadv_originator_free() - Free all originator structures
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
void batadv_originator_free(struct batadv_priv *bat_priv)
{
@@ -1010,10 +928,10 @@ void batadv_originator_free(struct batadv_priv *bat_priv)
/**
* batadv_orig_node_new() - creates a new orig_node
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @addr: the mac address of the originator
*
- * Creates a new originator object and initialise all the generic fields.
+ * Creates a new originator object and initialises all the generic fields.
* The new object is not added to the originator list.
*
* Return: the newly created object or NULL on failure.
@@ -1042,8 +960,6 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
spin_lock_init(&orig_node->tt_lock);
spin_lock_init(&orig_node->vlan_list_lock);
- batadv_nc_init_orig(orig_node);
-
/* extra reference for return */
kref_init(&orig_node->refcount);
@@ -1058,7 +974,9 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
orig_node->bcast_seqno_reset = reset_time;
#ifdef CONFIG_BATMAN_ADV_MCAST
- orig_node->mcast_flags = BATADV_NO_FLAGS;
+ orig_node->mcast_flags = BATADV_MCAST_WANT_NO_RTR4;
+ orig_node->mcast_flags |= BATADV_MCAST_WANT_NO_RTR6;
+ orig_node->mcast_flags |= BATADV_MCAST_HAVE_MC_PTYPE_CAPA;
INIT_HLIST_NODE(&orig_node->mcast_want_all_unsnoopables_node);
INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv4_node);
INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv6_node);
@@ -1089,7 +1007,7 @@ free_orig_node:
/**
* batadv_purge_neigh_ifinfo() - purge obsolete ifinfo entries from neighbor
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @neigh: orig node which is to be checked
*/
static void
@@ -1130,7 +1048,7 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv,
/**
* batadv_purge_orig_ifinfo() - purge obsolete ifinfo entries from originator
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: orig node which is to be checked
*
* Return: true if any ifinfo entry was purged, false otherwise.
@@ -1182,7 +1100,7 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv,
/**
* batadv_purge_orig_neighbors() - purges neighbors from originator
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: orig node which is to be checked
*
* Return: true if any neighbor was purged, false otherwise
@@ -1240,7 +1158,7 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv,
/**
* batadv_find_best_neighbor() - finds the best neighbor after purging
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: orig node which is to be checked
* @if_outgoing: the interface for which the metric should be compared
*
@@ -1263,8 +1181,7 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv,
if (!kref_get_unless_zero(&neigh->refcount))
continue;
- if (best)
- batadv_neigh_node_put(best);
+ batadv_neigh_node_put(best);
best = neigh;
}
@@ -1275,7 +1192,7 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv,
/**
* batadv_purge_orig_node() - purges obsolete information from an orig_node
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: orig node which is to be checked
*
* This function checks if the orig_node or substructures of it have become
@@ -1289,6 +1206,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
struct batadv_neigh_node *best_neigh_node;
struct batadv_hard_iface *hard_iface;
bool changed_ifinfo, changed_neigh;
+ struct list_head *iter;
if (batadv_has_timed_out(orig_node->last_seen,
2 * BATADV_PURGE_TIMEOUT)) {
@@ -1309,18 +1227,14 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
BATADV_IF_DEFAULT);
batadv_update_route(bat_priv, orig_node, BATADV_IF_DEFAULT,
best_neigh_node);
- if (best_neigh_node)
- batadv_neigh_node_put(best_neigh_node);
+ batadv_neigh_node_put(best_neigh_node);
/* ... then for all other interfaces. */
rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
if (hard_iface->if_status != BATADV_IF_ACTIVE)
continue;
- if (hard_iface->soft_iface != bat_priv->soft_iface)
- continue;
-
if (!kref_get_unless_zero(&hard_iface->refcount))
continue;
@@ -1329,8 +1243,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
hard_iface);
batadv_update_route(bat_priv, orig_node, hard_iface,
best_neigh_node);
- if (best_neigh_node)
- batadv_neigh_node_put(best_neigh_node);
+ batadv_neigh_node_put(best_neigh_node);
batadv_hardif_put(hard_iface);
}
@@ -1341,7 +1254,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
/**
* batadv_purge_orig_ref() - Purge all outdated originators
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
void batadv_purge_orig_ref(struct batadv_priv *bat_priv)
{
@@ -1358,6 +1271,8 @@ void batadv_purge_orig_ref(struct batadv_priv *bat_priv)
/* for all origins... */
for (i = 0; i < hash->size; i++) {
head = &hash->table[i];
+ if (hlist_empty(head))
+ continue;
list_lock = &hash->list_locks[i];
spin_lock_bh(list_lock);
@@ -1395,90 +1310,6 @@ static void batadv_purge_orig(struct work_struct *work)
msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD));
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-
-/**
- * batadv_orig_seq_print_text() - Print the originator table in a seq file
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_orig_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hard_iface *primary_if;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- return 0;
-
- seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n",
- BATADV_SOURCE_VERSION, primary_if->net_dev->name,
- primary_if->net_dev->dev_addr, net_dev->name,
- bat_priv->algo_ops->name);
-
- batadv_hardif_put(primary_if);
-
- if (!bat_priv->algo_ops->orig.print) {
- seq_puts(seq,
- "No printing function for this routing protocol\n");
- return 0;
- }
-
- bat_priv->algo_ops->orig.print(bat_priv, seq, BATADV_IF_DEFAULT);
-
- return 0;
-}
-
-/**
- * batadv_orig_hardif_seq_print_text() - writes originator infos for a specific
- * outgoing interface
- * @seq: debugfs table seq_file struct
- * @offset: not used
- *
- * Return: 0
- */
-int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_hard_iface *hard_iface;
- struct batadv_priv *bat_priv;
-
- hard_iface = batadv_hardif_get_by_netdev(net_dev);
-
- if (!hard_iface || !hard_iface->soft_iface) {
- seq_puts(seq, "Interface not known to B.A.T.M.A.N.\n");
- goto out;
- }
-
- bat_priv = netdev_priv(hard_iface->soft_iface);
- if (!bat_priv->algo_ops->orig.print) {
- seq_puts(seq,
- "No printing function for this routing protocol\n");
- goto out;
- }
-
- if (hard_iface->if_status != BATADV_IF_ACTIVE) {
- seq_puts(seq, "Interface not active\n");
- goto out;
- }
-
- seq_printf(seq, "[B.A.T.M.A.N. adv %s, IF/MAC: %s/%pM (%s %s)]\n",
- BATADV_SOURCE_VERSION, hard_iface->net_dev->name,
- hard_iface->net_dev->dev_addr,
- hard_iface->soft_iface->name, bat_priv->algo_ops->name);
-
- bat_priv->algo_ops->orig.print(bat_priv, seq, hard_iface);
-
-out:
- if (hard_iface)
- batadv_hardif_put(hard_iface);
- return 0;
-}
-#endif
-
/**
* batadv_orig_dump() - Dump to netlink the originator infos for a specific
* outgoing interface
@@ -1489,173 +1320,54 @@ out:
*/
int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
- struct net *net = sock_net(cb->skb->sk);
- struct net_device *soft_iface;
- struct net_device *hard_iface = NULL;
- struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT;
+ struct batadv_hard_iface *primary_if, *hard_iface;
+ struct net_device *mesh_iface;
struct batadv_priv *bat_priv;
- struct batadv_hard_iface *primary_if = NULL;
int ret;
- int ifindex, hard_ifindex;
- ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
-
- bat_priv = netdev_priv(soft_iface);
+ bat_priv = netdev_priv(mesh_iface);
primary_if = batadv_primary_if_get_selected(bat_priv);
- if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
+ if (!primary_if) {
ret = -ENOENT;
- goto out;
+ goto out_put_mesh_iface;
}
- hard_ifindex = batadv_netlink_get_ifindex(cb->nlh,
- BATADV_ATTR_HARD_IFINDEX);
- if (hard_ifindex) {
- hard_iface = dev_get_by_index(net, hard_ifindex);
- if (hard_iface)
- hardif = batadv_hardif_get_by_netdev(hard_iface);
-
- if (!hardif) {
- ret = -ENODEV;
- goto out;
- }
+ if (primary_if->if_status != BATADV_IF_ACTIVE) {
+ ret = -ENOENT;
+ goto out_put_primary_if;
+ }
- if (hardif->soft_iface != soft_iface) {
- ret = -ENOENT;
- goto out;
- }
+ hard_iface = batadv_netlink_get_hardif(bat_priv, cb);
+ if (IS_ERR(hard_iface) && PTR_ERR(hard_iface) != -ENONET) {
+ ret = PTR_ERR(hard_iface);
+ goto out_put_primary_if;
+ } else if (IS_ERR(hard_iface)) {
+ /* => PTR_ERR(hard_iface) == -ENONET
+ * => no hard-iface given, ok
+ */
+ hard_iface = BATADV_IF_DEFAULT;
}
if (!bat_priv->algo_ops->orig.dump) {
ret = -EOPNOTSUPP;
- goto out;
+ goto out_put_hard_iface;
}
- bat_priv->algo_ops->orig.dump(msg, cb, bat_priv, hardif);
+ bat_priv->algo_ops->orig.dump(msg, cb, bat_priv, hard_iface);
ret = msg->len;
- out:
- if (hardif)
- batadv_hardif_put(hardif);
- if (hard_iface)
- dev_put(hard_iface);
- if (primary_if)
- batadv_hardif_put(primary_if);
- if (soft_iface)
- dev_put(soft_iface);
+out_put_hard_iface:
+ batadv_hardif_put(hard_iface);
+out_put_primary_if:
+ batadv_hardif_put(primary_if);
+out_put_mesh_iface:
+ dev_put(mesh_iface);
return ret;
}
-
-/**
- * batadv_orig_hash_add_if() - Add interface to originators in orig_hash
- * @hard_iface: hard interface to add (already slave of the soft interface)
- * @max_if_num: new number of interfaces
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
- unsigned int max_if_num)
-{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
- struct batadv_algo_ops *bao = bat_priv->algo_ops;
- struct batadv_hashtable *hash = bat_priv->orig_hash;
- struct hlist_head *head;
- struct batadv_orig_node *orig_node;
- u32 i;
- int ret;
-
- /* resize all orig nodes because orig_node->bcast_own(_sum) depend on
- * if_num
- */
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
- ret = 0;
- if (bao->orig.add_if)
- ret = bao->orig.add_if(orig_node, max_if_num);
- if (ret == -ENOMEM)
- goto err;
- }
- rcu_read_unlock();
- }
-
- return 0;
-
-err:
- rcu_read_unlock();
- return -ENOMEM;
-}
-
-/**
- * batadv_orig_hash_del_if() - Remove interface from originators in orig_hash
- * @hard_iface: hard interface to remove (still slave of the soft interface)
- * @max_if_num: new number of interfaces
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface,
- unsigned int max_if_num)
-{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
- struct batadv_hashtable *hash = bat_priv->orig_hash;
- struct hlist_head *head;
- struct batadv_hard_iface *hard_iface_tmp;
- struct batadv_orig_node *orig_node;
- struct batadv_algo_ops *bao = bat_priv->algo_ops;
- u32 i;
- int ret;
-
- /* resize all orig nodes because orig_node->bcast_own(_sum) depend on
- * if_num
- */
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
- ret = 0;
- if (bao->orig.del_if)
- ret = bao->orig.del_if(orig_node, max_if_num,
- hard_iface->if_num);
- if (ret == -ENOMEM)
- goto err;
- }
- rcu_read_unlock();
- }
-
- /* renumber remaining batman interfaces _inside_ of orig_hash_lock */
- rcu_read_lock();
- list_for_each_entry_rcu(hard_iface_tmp, &batadv_hardif_list, list) {
- if (hard_iface_tmp->if_status == BATADV_IF_NOT_IN_USE)
- continue;
-
- if (hard_iface == hard_iface_tmp)
- continue;
-
- if (hard_iface->soft_iface != hard_iface_tmp->soft_iface)
- continue;
-
- if (hard_iface_tmp->if_num > hard_iface->if_num)
- hard_iface_tmp->if_num--;
- }
- rcu_read_unlock();
-
- hard_iface->if_num = -1;
- return 0;
-
-err:
- rcu_read_unlock();
- return -ENOMEM;
-}
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 3b3f59b881e1..db0c55128170 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_ORIGINATOR_H_
@@ -24,42 +12,42 @@
#include <linux/compiler.h>
#include <linux/if_ether.h>
#include <linux/jhash.h>
+#include <linux/kref.h>
+#include <linux/netlink.h>
+#include <linux/skbuff.h>
#include <linux/types.h>
-struct netlink_callback;
-struct seq_file;
-struct sk_buff;
-
bool batadv_compare_orig(const struct hlist_node *node, const void *data2);
int batadv_originator_init(struct batadv_priv *bat_priv);
void batadv_originator_free(struct batadv_priv *bat_priv);
void batadv_purge_orig_ref(struct batadv_priv *bat_priv);
-void batadv_orig_node_put(struct batadv_orig_node *orig_node);
+void batadv_orig_node_release(struct kref *ref);
struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
const u8 *addr);
struct batadv_hardif_neigh_node *
batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface,
const u8 *neigh_addr);
-void
-batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh);
+void batadv_hardif_neigh_release(struct kref *ref);
struct batadv_neigh_node *
batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node,
struct batadv_hard_iface *hard_iface,
const u8 *neigh_addr);
-void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node);
+void batadv_neigh_node_release(struct kref *ref);
struct batadv_neigh_node *
batadv_orig_router_get(struct batadv_orig_node *orig_node,
const struct batadv_hard_iface *if_outgoing);
+struct batadv_neigh_node *
+batadv_orig_to_router(struct batadv_priv *bat_priv, u8 *orig_addr,
+ struct batadv_hard_iface *if_outgoing);
struct batadv_neigh_ifinfo *
batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh,
struct batadv_hard_iface *if_outgoing);
struct batadv_neigh_ifinfo *
batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
struct batadv_hard_iface *if_outgoing);
-void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo);
+void batadv_neigh_ifinfo_release(struct kref *ref);
int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb);
-int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset);
struct batadv_orig_ifinfo *
batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node,
@@ -67,22 +55,16 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node,
struct batadv_orig_ifinfo *
batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
struct batadv_hard_iface *if_outgoing);
-void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo);
+void batadv_orig_ifinfo_release(struct kref *ref);
-int batadv_orig_seq_print_text(struct seq_file *seq, void *offset);
int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb);
-int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset);
-int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface,
- unsigned int max_if_num);
-int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface,
- unsigned int max_if_num);
struct batadv_orig_node_vlan *
batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node,
unsigned short vid);
struct batadv_orig_node_vlan *
batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node,
unsigned short vid);
-void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan);
+void batadv_orig_node_vlan_release(struct kref *ref);
/**
* batadv_choose_orig() - Return the index of the orig entry in the hash table
@@ -103,4 +85,86 @@ static inline u32 batadv_choose_orig(const void *data, u32 size)
struct batadv_orig_node *
batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data);
+/**
+ * batadv_orig_node_vlan_put() - decrement the refcounter and possibly release
+ * the originator-vlan object
+ * @orig_vlan: the originator-vlan object to release
+ */
+static inline void
+batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan)
+{
+ if (!orig_vlan)
+ return;
+
+ kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release);
+}
+
+/**
+ * batadv_neigh_ifinfo_put() - decrement the refcounter and possibly release
+ * the neigh_ifinfo
+ * @neigh_ifinfo: the neigh_ifinfo object to release
+ */
+static inline void
+batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo)
+{
+ if (!neigh_ifinfo)
+ return;
+
+ kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release);
+}
+
+/**
+ * batadv_hardif_neigh_put() - decrement the hardif neighbors refcounter
+ * and possibly release it
+ * @hardif_neigh: hardif neighbor whose reference is dropped
+ */
+static inline void
+batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh)
+{
+ if (!hardif_neigh)
+ return;
+
+ kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release);
+}
+
+/**
+ * batadv_neigh_node_put() - decrement the neighbors refcounter and possibly
+ * release it
+ * @neigh_node: neighbor node whose reference is dropped
+ */
+static inline void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node)
+{
+ if (!neigh_node)
+ return;
+
+ kref_put(&neigh_node->refcount, batadv_neigh_node_release);
+}
+
+/**
+ * batadv_orig_ifinfo_put() - decrement the refcounter and possibly release
+ * the orig_ifinfo
+ * @orig_ifinfo: the orig_ifinfo object to release
+ */
+static inline void
+batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo)
+{
+ if (!orig_ifinfo)
+ return;
+
+ kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release);
+}
+
+/**
+ * batadv_orig_node_put() - decrement the orig node refcounter and possibly
+ * release it
+ * @orig_node: the orig node to free
+ */
+static inline void batadv_orig_node_put(struct batadv_orig_node *orig_node)
+{
+ if (!orig_node)
+ return;
+
+ kref_put(&orig_node->refcount, batadv_orig_node_release);
+}
+
#endif /* _NET_BATMAN_ADV_ORIGINATOR_H_ */
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index cc3ed93a6d51..12c16f81cc51 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "routing.h"
@@ -41,12 +29,10 @@
#include "distributed-arp-table.h"
#include "fragmentation.h"
#include "hard-interface.h"
-#include "icmp_socket.h"
#include "log.h"
-#include "network-coding.h"
+#include "mesh-interface.h"
#include "originator.h"
#include "send.h"
-#include "soft-interface.h"
#include "tp_meter.h"
#include "translation-table.h"
#include "tvlv.h"
@@ -56,7 +42,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
/**
* _batadv_update_route() - set the router for this originator
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: orig node which is to be configured
* @recv_if: the receive interface for which this route is set
* @neigh_node: neighbor which should be the next router
@@ -83,13 +69,13 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
* the code needs to ensure the curr_router variable contains a pointer
* to the replaced best neighbor.
*/
- curr_router = rcu_dereference_protected(orig_ifinfo->router, true);
/* increase refcount of new best neighbor */
if (neigh_node)
kref_get(&neigh_node->refcount);
- rcu_assign_pointer(orig_ifinfo->router, neigh_node);
+ curr_router = rcu_replace_pointer(orig_ifinfo->router, neigh_node,
+ true);
spin_unlock_bh(&orig_node->neigh_list_lock);
batadv_orig_ifinfo_put(orig_ifinfo);
@@ -114,13 +100,12 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
}
/* decrease refcount of previous best neighbor */
- if (curr_router)
- batadv_neigh_node_put(curr_router);
+ batadv_neigh_node_put(curr_router);
}
/**
* batadv_update_route() - set the router for this originator
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: orig node which is to be configured
* @recv_if: the receive interface for which this route is set
* @neigh_node: neighbor which should be the next router
@@ -141,14 +126,13 @@ void batadv_update_route(struct batadv_priv *bat_priv,
_batadv_update_route(bat_priv, orig_node, recv_if, neigh_node);
out:
- if (router)
- batadv_neigh_node_put(router);
+ batadv_neigh_node_put(router);
}
/**
* batadv_window_protected() - checks whether the host restarted and is in the
* protection time.
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @seq_num_diff: difference between the current/received sequence number and
* the last sequence number
* @seq_old_max_diff: maximum age of sequence number not considered as restart
@@ -222,7 +206,7 @@ bool batadv_check_management_packet(struct sk_buff *skb,
/**
* batadv_recv_my_icmp_packet() - receive an icmp packet locally
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: icmp packet to process
*
* Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
@@ -239,15 +223,6 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv,
icmph = (struct batadv_icmp_header *)skb->data;
switch (icmph->msg_type) {
- case BATADV_ECHO_REPLY:
- case BATADV_DESTINATION_UNREACHABLE:
- case BATADV_TTL_EXCEEDED:
- /* receive the packet */
- if (skb_linearize(skb) < 0)
- break;
-
- batadv_socket_receive_packet(icmph, skb->len);
- break;
case BATADV_ECHO_REQUEST:
/* answer echo request (ping) */
primary_if = batadv_primary_if_get_selected(bat_priv);
@@ -291,10 +266,8 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv,
goto out;
}
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- if (orig_node)
- batadv_orig_node_put(orig_node);
+ batadv_hardif_put(primary_if);
+ batadv_orig_node_put(orig_node);
kfree_skb(skb);
@@ -346,10 +319,8 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv,
skb = NULL;
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- if (orig_node)
- batadv_orig_node_put(orig_node);
+ batadv_hardif_put(primary_if);
+ batadv_orig_node_put(orig_node);
kfree_skb(skb);
@@ -366,7 +337,7 @@ out:
int batadv_recv_icmp_packet(struct sk_buff *skb,
struct batadv_hard_iface *recv_if)
{
- struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
struct batadv_icmp_header *icmph;
struct batadv_icmp_packet_rr *icmp_packet_rr;
struct ethhdr *ethhdr;
@@ -447,8 +418,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
skb = NULL;
put_orig_node:
- if (orig_node)
- batadv_orig_node_put(orig_node);
+ batadv_orig_node_put(orig_node);
free_skb:
kfree_skb(skb);
@@ -457,11 +427,11 @@ free_skb:
/**
* batadv_check_unicast_packet() - Check for malformed unicast packets
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: packet to check
* @hdr_size: size of header to pull
*
- * Check for short header and bad addresses in given packet.
+ * Checks for short header and bad addresses in the given packet.
*
* Return: negative value when check fails and 0 otherwise. The negative value
* depends on the reason: -ENODATA for bad header, -EBADR for broadcast
@@ -535,13 +505,12 @@ batadv_last_bonding_replace(struct batadv_orig_node *orig_node,
orig_node->last_bonding_candidate = new_candidate;
spin_unlock_bh(&orig_node->neigh_list_lock);
- if (old_candidate)
- batadv_orig_ifinfo_put(old_candidate);
+ batadv_orig_ifinfo_put(old_candidate);
}
/**
* batadv_find_router() - find a suitable router for this originator
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: the destination node
* @recv_if: pointer to interface this packet was received on
*
@@ -678,8 +647,7 @@ next:
batadv_orig_ifinfo_put(next_candidate);
}
- if (last_candidate)
- batadv_orig_ifinfo_put(last_candidate);
+ batadv_orig_ifinfo_put(last_candidate);
return router;
}
@@ -687,7 +655,7 @@ next:
static int batadv_route_unicast_packet(struct sk_buff *skb,
struct batadv_hard_iface *recv_if)
{
- struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
struct batadv_orig_node *orig_node = NULL;
struct batadv_unicast_packet *unicast_packet;
struct ethhdr *ethhdr = eth_hdr(skb);
@@ -758,7 +726,7 @@ free_skb:
/**
* batadv_reroute_unicast_packet() - update the unicast header for re-routing
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: unicast packet to process
* @unicast_packet: the unicast header to be updated
* @dst_addr: the payload destination
@@ -778,7 +746,8 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
struct batadv_orig_node *orig_node = NULL;
struct batadv_hard_iface *primary_if = NULL;
bool ret = false;
- u8 *orig_addr, orig_ttvn;
+ const u8 *orig_addr;
+ u8 orig_ttvn;
if (batadv_is_my_client(bat_priv, dst_addr, vid)) {
primary_if = batadv_primary_if_get_selected(bat_priv);
@@ -807,10 +776,8 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
ret = true;
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- if (orig_node)
- batadv_orig_node_put(orig_node);
+ batadv_hardif_put(primary_if);
+ batadv_orig_node_put(orig_node);
return ret;
}
@@ -838,6 +805,10 @@ static bool batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
vid = batadv_get_vid(skb, hdr_len);
ethhdr = (struct ethhdr *)(skb->data + hdr_len);
+ /* do not reroute multicast frames in a unicast header */
+ if (is_multicast_ether_addr(ethhdr->h_dest))
+ return true;
+
/* check if the destination client was served by this node and it is now
* roaming. In this case, it means that the node has got a ROAM_ADV
* message and that it knows the new destination in the mesh to re-route
@@ -907,7 +878,7 @@ static bool batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
return false;
/* update the header in order to let the packet be delivered to this
- * node's soft interface
+ * node's mesh interface
*/
primary_if = batadv_primary_if_get_selected(bat_priv);
if (!primary_if)
@@ -937,7 +908,7 @@ int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb,
struct batadv_hard_iface *recv_if)
{
struct batadv_unicast_packet *unicast_packet;
- struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
int check, hdr_size = sizeof(*unicast_packet);
check = batadv_check_unicast_packet(bat_priv, skb, hdr_size);
@@ -966,7 +937,7 @@ free_skb:
int batadv_recv_unicast_packet(struct sk_buff *skb,
struct batadv_hard_iface *recv_if)
{
- struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
struct batadv_unicast_packet *unicast_packet;
struct batadv_unicast_4addr_packet *unicast_4addr_packet;
u8 *orig_addr, *orig_addr_gw;
@@ -984,15 +955,9 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
/* function returns -EREMOTE for promiscuous packets */
check = batadv_check_unicast_packet(bat_priv, skb, hdr_size);
-
- /* Even though the packet is not for us, we might save it to use for
- * decoding a later received coded packet
- */
- if (check == -EREMOTE)
- batadv_nc_skb_store_sniffed_unicast(bat_priv, skb);
-
if (check < 0)
goto free_skb;
+
if (!batadv_check_unicast_ttvn(bat_priv, skb, hdr_size))
goto free_skb;
@@ -1043,12 +1008,13 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
hdr_size))
goto rx_success;
- batadv_interface_rx(recv_if->soft_iface, skb, hdr_size,
+ batadv_dat_snoop_incoming_dhcp_ack(bat_priv, skb, hdr_size);
+
+ batadv_interface_rx(recv_if->mesh_iface, skb, hdr_size,
orig_node);
rx_success:
- if (orig_node)
- batadv_orig_node_put(orig_node);
+ batadv_orig_node_put(orig_node);
return NET_RX_SUCCESS;
}
@@ -1074,7 +1040,7 @@ free_skb:
int batadv_recv_unicast_tvlv(struct sk_buff *skb,
struct batadv_hard_iface *recv_if)
{
- struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
struct batadv_unicast_tvlv_packet *unicast_tvlv_packet;
unsigned char *tvlv_buff;
u16 tvlv_buff_len;
@@ -1100,10 +1066,9 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb,
if (tvlv_buff_len > skb->len - hdr_size)
goto free_skb;
- ret = batadv_tvlv_containers_process(bat_priv, false, NULL,
- unicast_tvlv_packet->src,
- unicast_tvlv_packet->dst,
- tvlv_buff, tvlv_buff_len);
+ ret = batadv_tvlv_containers_process(bat_priv, BATADV_UNICAST_TVLV,
+ NULL, skb, tvlv_buff,
+ tvlv_buff_len);
if (ret != NET_RX_SUCCESS) {
ret = batadv_route_unicast_packet(skb, recv_if);
@@ -1123,7 +1088,7 @@ free_skb:
* @recv_if: interface that the skb is received on
*
* This function does one of the three following things: 1) Forward fragment, if
- * the assembled packet will exceed our MTU; 2) Buffer fragment, if we till
+ * the assembled packet will exceed our MTU; 2) Buffer fragment, if we still
* lack further fragments; 3) Merge fragments, if we have all needed parts.
*
* Return: NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise.
@@ -1131,7 +1096,7 @@ free_skb:
int batadv_recv_frag_packet(struct sk_buff *skb,
struct batadv_hard_iface *recv_if)
{
- struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
struct batadv_orig_node *orig_node_src = NULL;
struct batadv_frag_packet *frag_packet;
int ret = NET_RX_DROP;
@@ -1193,14 +1158,14 @@ free_skb:
int batadv_recv_bcast_packet(struct sk_buff *skb,
struct batadv_hard_iface *recv_if)
{
- struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
struct batadv_orig_node *orig_node = NULL;
struct batadv_bcast_packet *bcast_packet;
struct ethhdr *ethhdr;
int hdr_size = sizeof(*bcast_packet);
- int ret = NET_RX_DROP;
s32 seq_diff;
u32 seqno;
+ int ret;
/* drop packet if it has not necessary minimum size */
if (unlikely(!pskb_may_pull(skb, hdr_size)))
@@ -1226,7 +1191,7 @@ int batadv_recv_bcast_packet(struct sk_buff *skb,
if (batadv_is_my_mac(bat_priv, bcast_packet->orig))
goto free_skb;
- if (bcast_packet->ttl < 2)
+ if (bcast_packet->ttl-- < 2)
goto free_skb;
orig_node = batadv_orig_hash_find(bat_priv, bcast_packet->orig);
@@ -1265,7 +1230,9 @@ int batadv_recv_bcast_packet(struct sk_buff *skb,
batadv_skb_set_priority(skb, sizeof(struct batadv_bcast_packet));
/* rebroadcast packet */
- batadv_add_bcast_packet_to_list(bat_priv, skb, 1, false);
+ ret = batadv_forw_bcast_packet(bat_priv, skb, 0, false);
+ if (ret == NETDEV_TX_BUSY)
+ goto free_skb;
/* don't hand the broadcast up if it is from an originator
* from the same backbone.
@@ -1278,8 +1245,10 @@ int batadv_recv_bcast_packet(struct sk_buff *skb,
if (batadv_dat_snoop_incoming_arp_reply(bat_priv, skb, hdr_size))
goto rx_success;
+ batadv_dat_snoop_incoming_dhcp_ack(bat_priv, skb, hdr_size);
+
/* broadcast for me */
- batadv_interface_rx(recv_if->soft_iface, skb, hdr_size, orig_node);
+ batadv_interface_rx(recv_if->mesh_iface, skb, hdr_size, orig_node);
rx_success:
ret = NET_RX_SUCCESS;
@@ -1289,8 +1258,78 @@ spin_unlock:
spin_unlock_bh(&orig_node->bcast_seqno_lock);
free_skb:
kfree_skb(skb);
+ ret = NET_RX_DROP;
out:
- if (orig_node)
- batadv_orig_node_put(orig_node);
+ batadv_orig_node_put(orig_node);
+ return ret;
+}
+
+#ifdef CONFIG_BATMAN_ADV_MCAST
+/**
+ * batadv_recv_mcast_packet() - process received batman-adv multicast packet
+ * @skb: the received batman-adv multicast packet
+ * @recv_if: interface that the skb is received on
+ *
+ * Parses the given, received batman-adv multicast packet. Depending on the
+ * contents of its TVLV forwards it and/or decapsulates it to hand it to the
+ * mesh interface.
+ *
+ * Return: NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise.
+ */
+int batadv_recv_mcast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
+ struct batadv_mcast_packet *mcast_packet;
+ int hdr_size = sizeof(*mcast_packet);
+ unsigned char *tvlv_buff;
+ int ret = NET_RX_DROP;
+ u16 tvlv_buff_len;
+
+ if (batadv_check_unicast_packet(bat_priv, skb, hdr_size) < 0)
+ goto free_skb;
+
+ /* create a copy of the skb, if needed, to modify it. */
+ if (skb_cow(skb, ETH_HLEN) < 0)
+ goto free_skb;
+
+ /* packet needs to be linearized to access the tvlv content */
+ if (skb_linearize(skb) < 0)
+ goto free_skb;
+
+ mcast_packet = (struct batadv_mcast_packet *)skb->data;
+ if (mcast_packet->ttl-- < 2)
+ goto free_skb;
+
+ tvlv_buff = (unsigned char *)(skb->data + hdr_size);
+ tvlv_buff_len = ntohs(mcast_packet->tvlv_len);
+
+ if (tvlv_buff_len > skb->len - hdr_size)
+ goto free_skb;
+
+ ret = batadv_tvlv_containers_process(bat_priv, BATADV_MCAST, NULL, skb,
+ tvlv_buff, tvlv_buff_len);
+ if (ret >= 0) {
+ batadv_inc_counter(bat_priv, BATADV_CNT_MCAST_RX);
+ batadv_add_counter(bat_priv, BATADV_CNT_MCAST_RX_BYTES,
+ skb->len + ETH_HLEN);
+ }
+
+ hdr_size += tvlv_buff_len;
+
+ if (ret == NET_RX_SUCCESS && (skb->len - hdr_size >= ETH_HLEN)) {
+ batadv_inc_counter(bat_priv, BATADV_CNT_MCAST_RX_LOCAL);
+ batadv_add_counter(bat_priv, BATADV_CNT_MCAST_RX_LOCAL_BYTES,
+ skb->len - hdr_size);
+
+ batadv_interface_rx(bat_priv->mesh_iface, skb, hdr_size, NULL);
+ /* skb was consumed */
+ skb = NULL;
+ }
+
+free_skb:
+ kfree_skb(skb);
+
return ret;
}
+#endif /* CONFIG_BATMAN_ADV_MCAST */
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index db54c2d9b8bf..e9849f032a24 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_ROUTING_H_
@@ -21,10 +9,9 @@
#include "main.h"
+#include <linux/skbuff.h>
#include <linux/types.h>
-struct sk_buff;
-
bool batadv_check_management_packet(struct sk_buff *skb,
struct batadv_hard_iface *hard_iface,
int header_len);
@@ -40,10 +27,17 @@ int batadv_recv_frag_packet(struct sk_buff *skb,
struct batadv_hard_iface *iface);
int batadv_recv_bcast_packet(struct sk_buff *skb,
struct batadv_hard_iface *recv_if);
-int batadv_recv_tt_query(struct sk_buff *skb,
- struct batadv_hard_iface *recv_if);
-int batadv_recv_roam_adv(struct sk_buff *skb,
- struct batadv_hard_iface *recv_if);
+#ifdef CONFIG_BATMAN_ADV_MCAST
+int batadv_recv_mcast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if);
+#else
+static inline int batadv_recv_mcast_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *recv_if)
+{
+ kfree_skb(skb);
+ return NET_RX_DROP;
+}
+#endif
int batadv_recv_unicast_tvlv(struct sk_buff *skb,
struct batadv_hard_iface *recv_if);
int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb,
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 4a35f5c2f52b..20d85c681064 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "send.h"
@@ -22,18 +10,17 @@
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/printk.h>
-#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
@@ -46,10 +33,9 @@
#include "gateway_client.h"
#include "hard-interface.h"
#include "log.h"
-#include "network-coding.h"
+#include "mesh-interface.h"
#include "originator.h"
#include "routing.h"
-#include "soft-interface.h"
#include "translation-table.h"
static void batadv_send_outstanding_bcast_packet(struct work_struct *work);
@@ -76,12 +62,9 @@ int batadv_send_skb_packet(struct sk_buff *skb,
struct batadv_hard_iface *hard_iface,
const u8 *dst_addr)
{
- struct batadv_priv *bat_priv;
struct ethhdr *ethhdr;
int ret;
- bat_priv = netdev_priv(hard_iface->soft_iface);
-
if (hard_iface->if_status != BATADV_IF_ACTIVE)
goto send_skb_err;
@@ -110,9 +93,6 @@ int batadv_send_skb_packet(struct sk_buff *skb,
skb->dev = hard_iface->net_dev;
- /* Save a clone of the skb to use when decoding coded packets */
- batadv_nc_skb_store_for_decoding(bat_priv, skb);
-
/* dev_queue_xmit() returns a negative result on error. However on
* congestion and traffic shaping, it drops and returns NET_XMIT_DROP
* (which is > 0). This will not be treated as an error.
@@ -136,7 +116,9 @@ send_skb_err:
int batadv_send_broadcast_skb(struct sk_buff *skb,
struct batadv_hard_iface *hard_iface)
{
- return batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr);
+ static const u8 broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+
+ return batadv_send_skb_packet(skb, hard_iface, broadcast_addr);
}
/**
@@ -164,8 +146,7 @@ int batadv_send_unicast_skb(struct sk_buff *skb,
if (hardif_neigh && ret != NET_XMIT_DROP)
hardif_neigh->bat_v.last_unicast_tx = jiffies;
- if (hardif_neigh)
- batadv_hardif_neigh_put(hardif_neigh);
+ batadv_hardif_neigh_put(hardif_neigh);
#endif
return ret;
@@ -214,14 +195,7 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
goto put_neigh_node;
}
- /* try to network code the packet, if it is received on an interface
- * (i.e. being forwarded). If the packet originates from this node or if
- * network coding fails, then send the packet as usual.
- */
- if (recv_if && batadv_nc_skb_forward(skb, neigh_node))
- ret = -EINPROGRESS;
- else
- ret = batadv_send_unicast_skb(skb, neigh_node);
+ ret = batadv_send_unicast_skb(skb, neigh_node);
/* skb was consumed */
skb = NULL;
@@ -285,7 +259,7 @@ static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb,
/**
* batadv_send_skb_prepare_unicast_4addr() - encapsulate an skb with a
* unicast 4addr header
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: the skb containing the payload to encapsulate
* @orig: the destination node
* @packet_subtype: the unicast 4addr packet subtype to use
@@ -321,14 +295,13 @@ bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv,
ret = true;
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
return ret;
}
/**
* batadv_send_skb_unicast() - encapsulate and send an skb via unicast
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: payload to send
* @packet_type: the batman unicast packet type to use
* @packet_subtype: the unicast 4addr packet subtype (only relevant for unicast
@@ -398,7 +371,7 @@ out:
/**
* batadv_send_skb_via_tt_generic() - send an skb via TT lookup
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: payload to send
* @packet_type: the batman unicast packet type to use
* @packet_subtype: the unicast 4addr packet subtype (only relevant for unicast
@@ -437,15 +410,14 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv,
ret = batadv_send_skb_unicast(bat_priv, skb, packet_type,
packet_subtype, orig_node, vid);
- if (orig_node)
- batadv_orig_node_put(orig_node);
+ batadv_orig_node_put(orig_node);
return ret;
}
/**
* batadv_send_skb_via_gw() - send an skb via gateway lookup
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: payload to send
* @vid: the vid to be used to search the translation table
*
@@ -464,8 +436,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
ret = batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR,
BATADV_P_DATA, orig_node, vid);
- if (orig_node)
- batadv_orig_node_put(orig_node);
+ batadv_orig_node_put(orig_node);
return ret;
}
@@ -473,7 +444,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
/**
* batadv_forw_packet_free() - free a forwarding packet
* @forw_packet: The packet to free
- * @dropped: whether the packet is freed because is is dropped
+ * @dropped: whether the packet is freed because it is dropped
*
* This frees a forwarding packet and releases any resources it might
* have claimed.
@@ -486,10 +457,8 @@ void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet,
else
consume_skb(forw_packet->skb);
- if (forw_packet->if_incoming)
- batadv_hardif_put(forw_packet->if_incoming);
- if (forw_packet->if_outgoing)
- batadv_hardif_put(forw_packet->if_outgoing);
+ batadv_hardif_put(forw_packet->if_incoming);
+ batadv_hardif_put(forw_packet->if_outgoing);
if (forw_packet->queue_left)
atomic_inc(forw_packet->queue_left);
kfree(forw_packet);
@@ -550,7 +519,7 @@ batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming,
forw_packet->queue_left = queue_left;
forw_packet->if_incoming = if_incoming;
forw_packet->if_outgoing = if_outgoing;
- forw_packet->num_packets = 0;
+ forw_packet->num_packets = 1;
return forw_packet;
@@ -617,8 +586,8 @@ bool batadv_forw_packet_steal(struct batadv_forw_packet *forw_packet,
* given hard_iface. If hard_iface is NULL forwarding packets on all hard
* interfaces will be claimed.
*
- * The packets are being moved from the forw_list to the cleanup_list and
- * by that allows already running threads to notice the claiming.
+ * The packets are being moved from the forw_list to the cleanup_list. This
+ * makes it possible for already running threads to notice the claim.
*/
static void
batadv_forw_packet_list_steal(struct hlist_head *forw_list,
@@ -713,7 +682,7 @@ static void batadv_forw_packet_queue(struct batadv_forw_packet *forw_packet,
/**
* batadv_forw_packet_bcast_queue() - try to queue a broadcast packet
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @forw_packet: the forwarding packet to queue
* @send_time: timestamp (jiffies) when the packet is to be sent
*
@@ -732,7 +701,7 @@ batadv_forw_packet_bcast_queue(struct batadv_priv *bat_priv,
/**
* batadv_forw_packet_ogmv1_queue() - try to queue an OGMv1 packet
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @forw_packet: the forwarding packet to queue
* @send_time: timestamp (jiffies) when the packet is to be sent
*
@@ -749,57 +718,52 @@ void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv,
}
/**
- * batadv_add_bcast_packet_to_list() - queue broadcast packet for multiple sends
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_forw_bcast_packet_to_list() - queue broadcast packet for transmissions
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: broadcast packet to add
* @delay: number of jiffies to wait before sending
* @own_packet: true if it is a self-generated broadcast packet
+ * @if_in: the interface on which the packet was received
+ * @if_out: the outgoing interface to queue on
*
- * add a broadcast packet to the queue and setup timers. broadcast packets
+ * Adds a broadcast packet to the queue and sets up timers. Broadcast packets
* are sent multiple times to increase probability for being received.
*
- * The skb is not consumed, so the caller should make sure that the
- * skb is freed.
+ * This call clones the given skb, hence the caller needs to take into
+ * account that the data segment of the original skb might not be
+ * modifiable anymore.
*
* Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors.
*/
-int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
- const struct sk_buff *skb,
- unsigned long delay,
- bool own_packet)
+static int batadv_forw_bcast_packet_to_list(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned long delay,
+ bool own_packet,
+ struct batadv_hard_iface *if_in,
+ struct batadv_hard_iface *if_out)
{
- struct batadv_hard_iface *primary_if;
struct batadv_forw_packet *forw_packet;
- struct batadv_bcast_packet *bcast_packet;
+ unsigned long send_time = jiffies;
struct sk_buff *newskb;
- primary_if = batadv_primary_if_get_selected(bat_priv);
- if (!primary_if)
- goto err;
-
- newskb = skb_copy(skb, GFP_ATOMIC);
- if (!newskb) {
- batadv_hardif_put(primary_if);
+ newskb = skb_clone(skb, GFP_ATOMIC);
+ if (!newskb)
goto err;
- }
- forw_packet = batadv_forw_packet_alloc(primary_if, NULL,
+ forw_packet = batadv_forw_packet_alloc(if_in, if_out,
&bat_priv->bcast_queue_left,
bat_priv, newskb);
- batadv_hardif_put(primary_if);
if (!forw_packet)
goto err_packet_free;
- /* as we have a copy now, it is safe to decrease the TTL */
- bcast_packet = (struct batadv_bcast_packet *)newskb->data;
- bcast_packet->ttl--;
-
forw_packet->own = own_packet;
INIT_DELAYED_WORK(&forw_packet->delayed_work,
batadv_send_outstanding_bcast_packet);
- batadv_forw_packet_bcast_queue(bat_priv, forw_packet, jiffies + delay);
+ send_time += delay ? delay : msecs_to_jiffies(5);
+
+ batadv_forw_packet_bcast_queue(bat_priv, forw_packet, send_time);
return NETDEV_TX_OK;
err_packet_free:
@@ -809,9 +773,220 @@ err:
}
/**
+ * batadv_forw_bcast_packet_if() - forward and queue a broadcast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: broadcast packet to add
+ * @delay: number of jiffies to wait before sending
+ * @own_packet: true if it is a self-generated broadcast packet
+ * @if_in: the interface on which the packet was received
+ * @if_out: the outgoing interface to forward to
+ *
+ * Transmits a broadcast packet on the specified interface either immediately
+ * or, if a delay is given, after that delay. Furthermore, queues additional
+ * retransmissions if this interface is a wireless one.
+ *
+ * This call clones the given skb, hence the caller needs to take into
+ * account that the data segment of the original skb might not be
+ * modifiable anymore.
+ *
+ * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors.
+ */
+static int batadv_forw_bcast_packet_if(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned long delay,
+ bool own_packet,
+ struct batadv_hard_iface *if_in,
+ struct batadv_hard_iface *if_out)
+{
+ unsigned int num_bcasts = if_out->num_bcasts;
+ struct sk_buff *newskb;
+ int ret = NETDEV_TX_OK;
+
+ if (!delay) {
+ newskb = skb_clone(skb, GFP_ATOMIC);
+ if (!newskb)
+ return NETDEV_TX_BUSY;
+
+ batadv_send_broadcast_skb(newskb, if_out);
+ num_bcasts--;
+ }
+
+ /* delayed broadcast or rebroadcasts? */
+ if (num_bcasts >= 1) {
+ BATADV_SKB_CB(skb)->num_bcasts = num_bcasts;
+
+ ret = batadv_forw_bcast_packet_to_list(bat_priv, skb, delay,
+ own_packet, if_in,
+ if_out);
+ }
+
+ return ret;
+}
+
+/**
+ * batadv_send_no_broadcast() - check whether (re)broadcast is necessary
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: broadcast packet to check
+ * @own_packet: true if it is a self-generated broadcast packet
+ * @if_out: the outgoing interface checked and considered for (re)broadcast
+ *
+ * Return: False if a packet needs to be (re)broadcasted on the given interface,
+ * true otherwise.
+ */
+static bool batadv_send_no_broadcast(struct batadv_priv *bat_priv,
+ struct sk_buff *skb, bool own_packet,
+ struct batadv_hard_iface *if_out)
+{
+ struct batadv_hardif_neigh_node *neigh_node = NULL;
+ struct batadv_bcast_packet *bcast_packet;
+ u8 *orig_neigh;
+ u8 *neigh_addr;
+ char *type;
+ int ret;
+
+ if (!own_packet) {
+ neigh_addr = eth_hdr(skb)->h_source;
+ neigh_node = batadv_hardif_neigh_get(if_out,
+ neigh_addr);
+ }
+
+ bcast_packet = (struct batadv_bcast_packet *)skb->data;
+ orig_neigh = neigh_node ? neigh_node->orig : NULL;
+
+ ret = batadv_hardif_no_broadcast(if_out, bcast_packet->orig,
+ orig_neigh);
+
+ batadv_hardif_neigh_put(neigh_node);
+
+ /* ok, may broadcast */
+ if (!ret)
+ return false;
+
+ /* no broadcast */
+ switch (ret) {
+ case BATADV_HARDIF_BCAST_NORECIPIENT:
+ type = "no neighbor";
+ break;
+ case BATADV_HARDIF_BCAST_DUPFWD:
+ type = "single neighbor is source";
+ break;
+ case BATADV_HARDIF_BCAST_DUPORIG:
+ type = "single neighbor is originator";
+ break;
+ default:
+ type = "unknown";
+ }
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "BCAST packet from orig %pM on %s suppressed: %s\n",
+ bcast_packet->orig,
+ if_out->net_dev->name, type);
+
+ return true;
+}
+
+/**
+ * __batadv_forw_bcast_packet() - forward and queue a broadcast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: broadcast packet to add
+ * @delay: number of jiffies to wait before sending
+ * @own_packet: true if it is a self-generated broadcast packet
+ *
+ * Transmits a broadcast packet either immediately or, if a delay is given,
+ * after that delay. Furthermore, queues additional retransmissions on wireless
+ * interfaces.
+ *
+ * This call clones the given skb, hence the caller needs to take into
+ * account that the data segment of the given skb might not be
+ * modifiable anymore.
+ *
+ * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors.
+ */
+static int __batadv_forw_bcast_packet(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned long delay,
+ bool own_packet)
+{
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_hard_iface *primary_if;
+ struct list_head *iter;
+ int ret = NETDEV_TX_OK;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ return NETDEV_TX_BUSY;
+
+ rcu_read_lock();
+ netdev_for_each_lower_private_rcu(bat_priv->mesh_iface, hard_iface, iter) {
+ if (!kref_get_unless_zero(&hard_iface->refcount))
+ continue;
+
+ if (batadv_send_no_broadcast(bat_priv, skb, own_packet,
+ hard_iface)) {
+ batadv_hardif_put(hard_iface);
+ continue;
+ }
+
+ ret = batadv_forw_bcast_packet_if(bat_priv, skb, delay,
+ own_packet, primary_if,
+ hard_iface);
+ batadv_hardif_put(hard_iface);
+
+ if (ret == NETDEV_TX_BUSY)
+ break;
+ }
+ rcu_read_unlock();
+
+ batadv_hardif_put(primary_if);
+ return ret;
+}
+
+/**
+ * batadv_forw_bcast_packet() - forward and queue a broadcast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: broadcast packet to add
+ * @delay: number of jiffies to wait before sending
+ * @own_packet: true if it is a self-generated broadcast packet
+ *
+ * Transmits a broadcast packet either immediately or, if a delay is given,
+ * after that delay. Furthermore, queues additional retransmissions on wireless
+ * interfaces.
+ *
+ * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors.
+ */
+int batadv_forw_bcast_packet(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned long delay,
+ bool own_packet)
+{
+ return __batadv_forw_bcast_packet(bat_priv, skb, delay, own_packet);
+}
+
+/**
+ * batadv_send_bcast_packet() - send and queue a broadcast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @skb: broadcast packet to add
+ * @delay: number of jiffies to wait before sending
+ * @own_packet: true if it is a self-generated broadcast packet
+ *
+ * Transmits a broadcast packet either immediately or, if a delay is given,
+ * after that delay. Furthermore, queues additional retransmissions on wireless
+ * interfaces.
+ *
+ * Consumes the provided skb.
+ */
+void batadv_send_bcast_packet(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned long delay,
+ bool own_packet)
+{
+ __batadv_forw_bcast_packet(bat_priv, skb, delay, own_packet);
+ consume_skb(skb);
+}
+
+/**
* batadv_forw_packet_bcasts_left() - check if a retransmission is necessary
* @forw_packet: the forwarding packet to check
- * @hard_iface: the interface to check on
*
* Checks whether a given packet has any (re)transmissions left on the provided
* interface.
@@ -823,28 +998,20 @@ err:
* Return: True if (re)transmissions are left, false otherwise.
*/
static bool
-batadv_forw_packet_bcasts_left(struct batadv_forw_packet *forw_packet,
- struct batadv_hard_iface *hard_iface)
+batadv_forw_packet_bcasts_left(struct batadv_forw_packet *forw_packet)
{
- unsigned int max;
-
- if (hard_iface)
- max = hard_iface->num_bcasts;
- else
- max = BATADV_NUM_BCASTS_MAX;
-
- return BATADV_SKB_CB(forw_packet->skb)->num_bcasts < max;
+ return BATADV_SKB_CB(forw_packet->skb)->num_bcasts;
}
/**
- * batadv_forw_packet_bcasts_inc() - increment retransmission counter of a
+ * batadv_forw_packet_bcasts_dec() - decrement retransmission counter of a
* packet
- * @forw_packet: the packet to increase the counter for
+ * @forw_packet: the packet to decrease the counter for
*/
static void
-batadv_forw_packet_bcasts_inc(struct batadv_forw_packet *forw_packet)
+batadv_forw_packet_bcasts_dec(struct batadv_forw_packet *forw_packet)
{
- BATADV_SKB_CB(forw_packet->skb)->num_bcasts++;
+ BATADV_SKB_CB(forw_packet->skb)->num_bcasts--;
}
/**
@@ -855,30 +1022,30 @@ batadv_forw_packet_bcasts_inc(struct batadv_forw_packet *forw_packet)
*/
bool batadv_forw_packet_is_rebroadcast(struct batadv_forw_packet *forw_packet)
{
- return BATADV_SKB_CB(forw_packet->skb)->num_bcasts > 0;
+ unsigned char num_bcasts = BATADV_SKB_CB(forw_packet->skb)->num_bcasts;
+
+ return num_bcasts != forw_packet->if_outgoing->num_bcasts;
}
+/**
+ * batadv_send_outstanding_bcast_packet() - transmit a queued broadcast packet
+ * @work: work queue item
+ *
+ * Transmits a queued broadcast packet and if necessary reschedules it.
+ */
static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
{
- struct batadv_hard_iface *hard_iface;
- struct batadv_hardif_neigh_node *neigh_node;
- struct delayed_work *delayed_work;
+ unsigned long send_time = jiffies + msecs_to_jiffies(5);
struct batadv_forw_packet *forw_packet;
- struct batadv_bcast_packet *bcast_packet;
- struct sk_buff *skb1;
- struct net_device *soft_iface;
+ struct delayed_work *delayed_work;
struct batadv_priv *bat_priv;
- unsigned long send_time = jiffies + msecs_to_jiffies(5);
+ struct sk_buff *skb1;
bool dropped = false;
- u8 *neigh_addr;
- u8 *orig_neigh;
- int ret = 0;
delayed_work = to_delayed_work(work);
forw_packet = container_of(delayed_work, struct batadv_forw_packet,
delayed_work);
- soft_iface = forw_packet->if_incoming->soft_iface;
- bat_priv = netdev_priv(soft_iface);
+ bat_priv = netdev_priv(forw_packet->if_incoming->mesh_iface);
if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) {
dropped = true;
@@ -890,76 +1057,15 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
goto out;
}
- bcast_packet = (struct batadv_bcast_packet *)forw_packet->skb->data;
-
- /* rebroadcast packet */
- rcu_read_lock();
- list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
- if (hard_iface->soft_iface != soft_iface)
- continue;
-
- if (!batadv_forw_packet_bcasts_left(forw_packet, hard_iface))
- continue;
-
- if (forw_packet->own) {
- neigh_node = NULL;
- } else {
- neigh_addr = eth_hdr(forw_packet->skb)->h_source;
- neigh_node = batadv_hardif_neigh_get(hard_iface,
- neigh_addr);
- }
-
- orig_neigh = neigh_node ? neigh_node->orig : NULL;
-
- ret = batadv_hardif_no_broadcast(hard_iface, bcast_packet->orig,
- orig_neigh);
-
- if (ret) {
- char *type;
-
- switch (ret) {
- case BATADV_HARDIF_BCAST_NORECIPIENT:
- type = "no neighbor";
- break;
- case BATADV_HARDIF_BCAST_DUPFWD:
- type = "single neighbor is source";
- break;
- case BATADV_HARDIF_BCAST_DUPORIG:
- type = "single neighbor is originator";
- break;
- default:
- type = "unknown";
- }
-
- batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "BCAST packet from orig %pM on %s suppressed: %s\n",
- bcast_packet->orig,
- hard_iface->net_dev->name, type);
-
- if (neigh_node)
- batadv_hardif_neigh_put(neigh_node);
-
- continue;
- }
-
- if (neigh_node)
- batadv_hardif_neigh_put(neigh_node);
-
- if (!kref_get_unless_zero(&hard_iface->refcount))
- continue;
-
- /* send a copy of the saved skb */
- skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC);
- if (skb1)
- batadv_send_broadcast_skb(skb1, hard_iface);
-
- batadv_hardif_put(hard_iface);
- }
- rcu_read_unlock();
+ /* send a copy of the saved skb */
+ skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC);
+ if (!skb1)
+ goto out;
- batadv_forw_packet_bcasts_inc(forw_packet);
+ batadv_send_broadcast_skb(skb1, forw_packet->if_outgoing);
+ batadv_forw_packet_bcasts_dec(forw_packet);
- /* if we still have some more bcasts to send */
- if (batadv_forw_packet_bcasts_left(forw_packet, NULL)) {
+ if (batadv_forw_packet_bcasts_left(forw_packet)) {
batadv_forw_packet_bcast_queue(bat_priv, forw_packet,
send_time);
return;
@@ -974,7 +1080,7 @@ out:
/**
* batadv_purge_outstanding_packets() - stop/purge scheduled bcast/OGMv1 packets
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @hard_iface: the hard interface to cancel and purge bcast/ogm packets on
*
* This method cancels and purges any broadcast and OGMv1 packet on the given
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index 64cce07b8fe6..3415afec4a0c 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_SEND_H_
@@ -22,12 +10,11 @@
#include "main.h"
#include <linux/compiler.h>
+#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <uapi/linux/batadv_packet.h>
-struct sk_buff;
-
void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet,
bool dropped);
struct batadv_forw_packet *
@@ -52,10 +39,14 @@ int batadv_send_broadcast_skb(struct sk_buff *skb,
struct batadv_hard_iface *hard_iface);
int batadv_send_unicast_skb(struct sk_buff *skb,
struct batadv_neigh_node *neigh_node);
-int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
- const struct sk_buff *skb,
- unsigned long delay,
- bool own_packet);
+int batadv_forw_bcast_packet(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned long delay,
+ bool own_packet);
+void batadv_send_bcast_packet(struct batadv_priv *bat_priv,
+ struct sk_buff *skb,
+ unsigned long delay,
+ bool own_packet);
void
batadv_purge_outstanding_packets(struct batadv_priv *bat_priv,
const struct batadv_hard_iface *hard_iface);
@@ -77,7 +68,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
/**
* batadv_send_skb_via_tt() - send an skb via TT lookup
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: the payload to send
* @dst_hint: can be used to override the destination contained in the skb
* @vid: the vid to be used to search the translation table
@@ -98,7 +89,7 @@ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv,
/**
* batadv_send_skb_via_tt_4addr() - send an skb via TT lookup
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: the payload to send
* @packet_subtype: the unicast 4addr packet subtype to use
* @dst_hint: can be used to override the destination contained in the skb
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
deleted file mode 100644
index daf87f07fadd..000000000000
--- a/net/batman-adv/soft-interface.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
- *
- * Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_
-#define _NET_BATMAN_ADV_SOFT_INTERFACE_H_
-
-#include "main.h"
-
-#include <linux/types.h>
-#include <net/rtnetlink.h>
-
-struct net_device;
-struct net;
-struct sk_buff;
-
-int batadv_skb_head_push(struct sk_buff *skb, unsigned int len);
-void batadv_interface_rx(struct net_device *soft_iface,
- struct sk_buff *skb, int hdr_size,
- struct batadv_orig_node *orig_node);
-struct net_device *batadv_softif_create(struct net *net, const char *name);
-void batadv_softif_destroy_sysfs(struct net_device *soft_iface);
-bool batadv_softif_is_valid(const struct net_device *net_dev);
-extern struct rtnl_link_ops batadv_link_ops;
-int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid);
-void batadv_softif_vlan_put(struct batadv_softif_vlan *softif_vlan);
-struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
- unsigned short vid);
-
-#endif /* _NET_BATMAN_ADV_SOFT_INTERFACE_H_ */
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
deleted file mode 100644
index 09427fc6494a..000000000000
--- a/net/batman-adv/sysfs.c
+++ /dev/null
@@ -1,1261 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors:
- *
- * Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#include "sysfs.h"
-#include "main.h"
-
-#include <linux/atomic.h>
-#include <linux/compiler.h>
-#include <linux/device.h>
-#include <linux/errno.h>
-#include <linux/gfp.h>
-#include <linux/if.h>
-#include <linux/if_vlan.h>
-#include <linux/kernel.h>
-#include <linux/kobject.h>
-#include <linux/kref.h>
-#include <linux/netdevice.h>
-#include <linux/printk.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/rtnetlink.h>
-#include <linux/slab.h>
-#include <linux/stddef.h>
-#include <linux/string.h>
-#include <linux/stringify.h>
-#include <linux/workqueue.h>
-#include <uapi/linux/batadv_packet.h>
-
-#include "bridge_loop_avoidance.h"
-#include "distributed-arp-table.h"
-#include "gateway_client.h"
-#include "gateway_common.h"
-#include "hard-interface.h"
-#include "log.h"
-#include "network-coding.h"
-#include "soft-interface.h"
-
-static struct net_device *batadv_kobj_to_netdev(struct kobject *obj)
-{
- struct device *dev = container_of(obj->parent, struct device, kobj);
-
- return to_net_dev(dev);
-}
-
-static struct batadv_priv *batadv_kobj_to_batpriv(struct kobject *obj)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(obj);
-
- return netdev_priv(net_dev);
-}
-
-/**
- * batadv_vlan_kobj_to_batpriv() - convert a vlan kobj in the associated batpriv
- * @obj: kobject to covert
- *
- * Return: the associated batadv_priv struct.
- */
-static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj)
-{
- /* VLAN specific attributes are located in the root sysfs folder if they
- * refer to the untagged VLAN..
- */
- if (!strcmp(BATADV_SYSFS_IF_MESH_SUBDIR, obj->name))
- return batadv_kobj_to_batpriv(obj);
-
- /* ..while the attributes for the tagged vlans are located in
- * the in the corresponding "vlan%VID" subfolder
- */
- return batadv_kobj_to_batpriv(obj->parent);
-}
-
-/**
- * batadv_kobj_to_vlan() - convert a kobj in the associated softif_vlan struct
- * @bat_priv: the bat priv with all the soft interface information
- * @obj: kobject to covert
- *
- * Return: the associated softif_vlan struct if found, NULL otherwise.
- */
-static struct batadv_softif_vlan *
-batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj)
-{
- struct batadv_softif_vlan *vlan_tmp, *vlan = NULL;
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(vlan_tmp, &bat_priv->softif_vlan_list, list) {
- if (vlan_tmp->kobj != obj)
- continue;
-
- if (!kref_get_unless_zero(&vlan_tmp->refcount))
- continue;
-
- vlan = vlan_tmp;
- break;
- }
- rcu_read_unlock();
-
- return vlan;
-}
-
-#define BATADV_UEV_TYPE_VAR "BATTYPE="
-#define BATADV_UEV_ACTION_VAR "BATACTION="
-#define BATADV_UEV_DATA_VAR "BATDATA="
-
-static char *batadv_uev_action_str[] = {
- "add",
- "del",
- "change",
- "loopdetect",
-};
-
-static char *batadv_uev_type_str[] = {
- "gw",
- "bla",
-};
-
-/* Use this, if you have customized show and store functions for vlan attrs */
-#define BATADV_ATTR_VLAN(_name, _mode, _show, _store) \
-struct batadv_attribute batadv_attr_vlan_##_name = { \
- .attr = {.name = __stringify(_name), \
- .mode = _mode }, \
- .show = _show, \
- .store = _store, \
-}
-
-/* Use this, if you have customized show and store functions */
-#define BATADV_ATTR(_name, _mode, _show, _store) \
-struct batadv_attribute batadv_attr_##_name = { \
- .attr = {.name = __stringify(_name), \
- .mode = _mode }, \
- .show = _show, \
- .store = _store, \
-}
-
-#define BATADV_ATTR_SIF_STORE_BOOL(_name, _post_func) \
-ssize_t batadv_store_##_name(struct kobject *kobj, \
- struct attribute *attr, char *buff, \
- size_t count) \
-{ \
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \
- struct batadv_priv *bat_priv = netdev_priv(net_dev); \
- \
- return __batadv_store_bool_attr(buff, count, _post_func, attr, \
- &bat_priv->_name, net_dev); \
-}
-
-#define BATADV_ATTR_SIF_SHOW_BOOL(_name) \
-ssize_t batadv_show_##_name(struct kobject *kobj, \
- struct attribute *attr, char *buff) \
-{ \
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); \
- \
- return sprintf(buff, "%s\n", \
- atomic_read(&bat_priv->_name) == 0 ? \
- "disabled" : "enabled"); \
-} \
-
-/* Use this, if you are going to turn a [name] in the soft-interface
- * (bat_priv) on or off
- */
-#define BATADV_ATTR_SIF_BOOL(_name, _mode, _post_func) \
- static BATADV_ATTR_SIF_STORE_BOOL(_name, _post_func) \
- static BATADV_ATTR_SIF_SHOW_BOOL(_name) \
- static BATADV_ATTR(_name, _mode, batadv_show_##_name, \
- batadv_store_##_name)
-
-#define BATADV_ATTR_SIF_STORE_UINT(_name, _var, _min, _max, _post_func) \
-ssize_t batadv_store_##_name(struct kobject *kobj, \
- struct attribute *attr, char *buff, \
- size_t count) \
-{ \
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \
- struct batadv_priv *bat_priv = netdev_priv(net_dev); \
- \
- return __batadv_store_uint_attr(buff, count, _min, _max, \
- _post_func, attr, \
- &bat_priv->_var, net_dev, \
- NULL); \
-}
-
-#define BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \
-ssize_t batadv_show_##_name(struct kobject *kobj, \
- struct attribute *attr, char *buff) \
-{ \
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); \
- \
- return sprintf(buff, "%i\n", atomic_read(&bat_priv->_var)); \
-} \
-
-/* Use this, if you are going to set [name] in the soft-interface
- * (bat_priv) to an unsigned integer value
- */
-#define BATADV_ATTR_SIF_UINT(_name, _var, _mode, _min, _max, _post_func)\
- static BATADV_ATTR_SIF_STORE_UINT(_name, _var, _min, _max, _post_func)\
- static BATADV_ATTR_SIF_SHOW_UINT(_name, _var) \
- static BATADV_ATTR(_name, _mode, batadv_show_##_name, \
- batadv_store_##_name)
-
-#define BATADV_ATTR_VLAN_STORE_BOOL(_name, _post_func) \
-ssize_t batadv_store_vlan_##_name(struct kobject *kobj, \
- struct attribute *attr, char *buff, \
- size_t count) \
-{ \
- struct batadv_priv *bat_priv = batadv_vlan_kobj_to_batpriv(kobj);\
- struct batadv_softif_vlan *vlan = batadv_kobj_to_vlan(bat_priv, \
- kobj); \
- size_t res = __batadv_store_bool_attr(buff, count, _post_func, \
- attr, &vlan->_name, \
- bat_priv->soft_iface); \
- \
- batadv_softif_vlan_put(vlan); \
- return res; \
-}
-
-#define BATADV_ATTR_VLAN_SHOW_BOOL(_name) \
-ssize_t batadv_show_vlan_##_name(struct kobject *kobj, \
- struct attribute *attr, char *buff) \
-{ \
- struct batadv_priv *bat_priv = batadv_vlan_kobj_to_batpriv(kobj);\
- struct batadv_softif_vlan *vlan = batadv_kobj_to_vlan(bat_priv, \
- kobj); \
- size_t res = sprintf(buff, "%s\n", \
- atomic_read(&vlan->_name) == 0 ? \
- "disabled" : "enabled"); \
- \
- batadv_softif_vlan_put(vlan); \
- return res; \
-}
-
-/* Use this, if you are going to turn a [name] in the vlan struct on or off */
-#define BATADV_ATTR_VLAN_BOOL(_name, _mode, _post_func) \
- static BATADV_ATTR_VLAN_STORE_BOOL(_name, _post_func) \
- static BATADV_ATTR_VLAN_SHOW_BOOL(_name) \
- static BATADV_ATTR_VLAN(_name, _mode, batadv_show_vlan_##_name, \
- batadv_store_vlan_##_name)
-
-#define BATADV_ATTR_HIF_STORE_UINT(_name, _var, _min, _max, _post_func) \
-ssize_t batadv_store_##_name(struct kobject *kobj, \
- struct attribute *attr, char *buff, \
- size_t count) \
-{ \
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \
- struct batadv_hard_iface *hard_iface; \
- ssize_t length; \
- \
- hard_iface = batadv_hardif_get_by_netdev(net_dev); \
- if (!hard_iface) \
- return 0; \
- \
- length = __batadv_store_uint_attr(buff, count, _min, _max, \
- _post_func, attr, \
- &hard_iface->_var, \
- hard_iface->soft_iface, \
- net_dev); \
- \
- batadv_hardif_put(hard_iface); \
- return length; \
-}
-
-#define BATADV_ATTR_HIF_SHOW_UINT(_name, _var) \
-ssize_t batadv_show_##_name(struct kobject *kobj, \
- struct attribute *attr, char *buff) \
-{ \
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \
- struct batadv_hard_iface *hard_iface; \
- ssize_t length; \
- \
- hard_iface = batadv_hardif_get_by_netdev(net_dev); \
- if (!hard_iface) \
- return 0; \
- \
- length = sprintf(buff, "%i\n", atomic_read(&hard_iface->_var)); \
- \
- batadv_hardif_put(hard_iface); \
- return length; \
-}
-
-/* Use this, if you are going to set [name] in hard_iface to an
- * unsigned integer value
- */
-#define BATADV_ATTR_HIF_UINT(_name, _var, _mode, _min, _max, _post_func)\
- static BATADV_ATTR_HIF_STORE_UINT(_name, _var, _min, \
- _max, _post_func) \
- static BATADV_ATTR_HIF_SHOW_UINT(_name, _var) \
- static BATADV_ATTR(_name, _mode, batadv_show_##_name, \
- batadv_store_##_name)
-
-static int batadv_store_bool_attr(char *buff, size_t count,
- struct net_device *net_dev,
- const char *attr_name, atomic_t *attr,
- bool *changed)
-{
- int enabled = -1;
-
- *changed = false;
-
- if (buff[count - 1] == '\n')
- buff[count - 1] = '\0';
-
- if ((strncmp(buff, "1", 2) == 0) ||
- (strncmp(buff, "enable", 7) == 0) ||
- (strncmp(buff, "enabled", 8) == 0))
- enabled = 1;
-
- if ((strncmp(buff, "0", 2) == 0) ||
- (strncmp(buff, "disable", 8) == 0) ||
- (strncmp(buff, "disabled", 9) == 0))
- enabled = 0;
-
- if (enabled < 0) {
- batadv_info(net_dev, "%s: Invalid parameter received: %s\n",
- attr_name, buff);
- return -EINVAL;
- }
-
- if (atomic_read(attr) == enabled)
- return count;
-
- batadv_info(net_dev, "%s: Changing from: %s to: %s\n", attr_name,
- atomic_read(attr) == 1 ? "enabled" : "disabled",
- enabled == 1 ? "enabled" : "disabled");
-
- *changed = true;
-
- atomic_set(attr, (unsigned int)enabled);
- return count;
-}
-
-static inline ssize_t
-__batadv_store_bool_attr(char *buff, size_t count,
- void (*post_func)(struct net_device *),
- struct attribute *attr,
- atomic_t *attr_store, struct net_device *net_dev)
-{
- bool changed;
- int ret;
-
- ret = batadv_store_bool_attr(buff, count, net_dev, attr->name,
- attr_store, &changed);
- if (post_func && changed)
- post_func(net_dev);
-
- return ret;
-}
-
-static int batadv_store_uint_attr(const char *buff, size_t count,
- struct net_device *net_dev,
- struct net_device *slave_dev,
- const char *attr_name,
- unsigned int min, unsigned int max,
- atomic_t *attr)
-{
- char ifname[IFNAMSIZ + 3] = "";
- unsigned long uint_val;
- int ret;
-
- ret = kstrtoul(buff, 10, &uint_val);
- if (ret) {
- batadv_info(net_dev, "%s: Invalid parameter received: %s\n",
- attr_name, buff);
- return -EINVAL;
- }
-
- if (uint_val < min) {
- batadv_info(net_dev, "%s: Value is too small: %lu min: %u\n",
- attr_name, uint_val, min);
- return -EINVAL;
- }
-
- if (uint_val > max) {
- batadv_info(net_dev, "%s: Value is too big: %lu max: %u\n",
- attr_name, uint_val, max);
- return -EINVAL;
- }
-
- if (atomic_read(attr) == uint_val)
- return count;
-
- if (slave_dev)
- snprintf(ifname, sizeof(ifname), "%s: ", slave_dev->name);
-
- batadv_info(net_dev, "%s: %sChanging from: %i to: %lu\n",
- attr_name, ifname, atomic_read(attr), uint_val);
-
- atomic_set(attr, uint_val);
- return count;
-}
-
-static ssize_t __batadv_store_uint_attr(const char *buff, size_t count,
- int min, int max,
- void (*post_func)(struct net_device *),
- const struct attribute *attr,
- atomic_t *attr_store,
- struct net_device *net_dev,
- struct net_device *slave_dev)
-{
- int ret;
-
- ret = batadv_store_uint_attr(buff, count, net_dev, slave_dev,
- attr->name, min, max, attr_store);
- if (post_func && ret)
- post_func(net_dev);
-
- return ret;
-}
-
-static ssize_t batadv_show_bat_algo(struct kobject *kobj,
- struct attribute *attr, char *buff)
-{
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
-
- return sprintf(buff, "%s\n", bat_priv->algo_ops->name);
-}
-
-static void batadv_post_gw_reselect(struct net_device *net_dev)
-{
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
-
- batadv_gw_reselect(bat_priv);
-}
-
-static ssize_t batadv_show_gw_mode(struct kobject *kobj, struct attribute *attr,
- char *buff)
-{
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
- int bytes_written;
-
- /* GW mode is not available if the routing algorithm in use does not
- * implement the GW API
- */
- if (!bat_priv->algo_ops->gw.get_best_gw_node ||
- !bat_priv->algo_ops->gw.is_eligible)
- return -ENOENT;
-
- switch (atomic_read(&bat_priv->gw.mode)) {
- case BATADV_GW_MODE_CLIENT:
- bytes_written = sprintf(buff, "%s\n",
- BATADV_GW_MODE_CLIENT_NAME);
- break;
- case BATADV_GW_MODE_SERVER:
- bytes_written = sprintf(buff, "%s\n",
- BATADV_GW_MODE_SERVER_NAME);
- break;
- default:
- bytes_written = sprintf(buff, "%s\n",
- BATADV_GW_MODE_OFF_NAME);
- break;
- }
-
- return bytes_written;
-}
-
-static ssize_t batadv_store_gw_mode(struct kobject *kobj,
- struct attribute *attr, char *buff,
- size_t count)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- char *curr_gw_mode_str;
- int gw_mode_tmp = -1;
-
- /* toggling GW mode is allowed only if the routing algorithm in use
- * provides the GW API
- */
- if (!bat_priv->algo_ops->gw.get_best_gw_node ||
- !bat_priv->algo_ops->gw.is_eligible)
- return -EINVAL;
-
- if (buff[count - 1] == '\n')
- buff[count - 1] = '\0';
-
- if (strncmp(buff, BATADV_GW_MODE_OFF_NAME,
- strlen(BATADV_GW_MODE_OFF_NAME)) == 0)
- gw_mode_tmp = BATADV_GW_MODE_OFF;
-
- if (strncmp(buff, BATADV_GW_MODE_CLIENT_NAME,
- strlen(BATADV_GW_MODE_CLIENT_NAME)) == 0)
- gw_mode_tmp = BATADV_GW_MODE_CLIENT;
-
- if (strncmp(buff, BATADV_GW_MODE_SERVER_NAME,
- strlen(BATADV_GW_MODE_SERVER_NAME)) == 0)
- gw_mode_tmp = BATADV_GW_MODE_SERVER;
-
- if (gw_mode_tmp < 0) {
- batadv_info(net_dev,
- "Invalid parameter for 'gw mode' setting received: %s\n",
- buff);
- return -EINVAL;
- }
-
- if (atomic_read(&bat_priv->gw.mode) == gw_mode_tmp)
- return count;
-
- switch (atomic_read(&bat_priv->gw.mode)) {
- case BATADV_GW_MODE_CLIENT:
- curr_gw_mode_str = BATADV_GW_MODE_CLIENT_NAME;
- break;
- case BATADV_GW_MODE_SERVER:
- curr_gw_mode_str = BATADV_GW_MODE_SERVER_NAME;
- break;
- default:
- curr_gw_mode_str = BATADV_GW_MODE_OFF_NAME;
- break;
- }
-
- batadv_info(net_dev, "Changing gw mode from: %s to: %s\n",
- curr_gw_mode_str, buff);
-
- /* Invoking batadv_gw_reselect() is not enough to really de-select the
- * current GW. It will only instruct the gateway client code to perform
- * a re-election the next time that this is needed.
- *
- * When gw client mode is being switched off the current GW must be
- * de-selected explicitly otherwise no GW_ADD uevent is thrown on
- * client mode re-activation. This is operation is performed in
- * batadv_gw_check_client_stop().
- */
- batadv_gw_reselect(bat_priv);
- /* always call batadv_gw_check_client_stop() before changing the gateway
- * state
- */
- batadv_gw_check_client_stop(bat_priv);
- atomic_set(&bat_priv->gw.mode, (unsigned int)gw_mode_tmp);
- batadv_gw_tvlv_container_update(bat_priv);
- return count;
-}
-
-static ssize_t batadv_show_gw_sel_class(struct kobject *kobj,
- struct attribute *attr, char *buff)
-{
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
-
- /* GW selection class is not available if the routing algorithm in use
- * does not implement the GW API
- */
- if (!bat_priv->algo_ops->gw.get_best_gw_node ||
- !bat_priv->algo_ops->gw.is_eligible)
- return -ENOENT;
-
- if (bat_priv->algo_ops->gw.show_sel_class)
- return bat_priv->algo_ops->gw.show_sel_class(bat_priv, buff);
-
- return sprintf(buff, "%i\n", atomic_read(&bat_priv->gw.sel_class));
-}
-
-static ssize_t batadv_store_gw_sel_class(struct kobject *kobj,
- struct attribute *attr, char *buff,
- size_t count)
-{
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
-
- /* setting the GW selection class is allowed only if the routing
- * algorithm in use implements the GW API
- */
- if (!bat_priv->algo_ops->gw.get_best_gw_node ||
- !bat_priv->algo_ops->gw.is_eligible)
- return -EINVAL;
-
- if (buff[count - 1] == '\n')
- buff[count - 1] = '\0';
-
- if (bat_priv->algo_ops->gw.store_sel_class)
- return bat_priv->algo_ops->gw.store_sel_class(bat_priv, buff,
- count);
-
- return __batadv_store_uint_attr(buff, count, 1, BATADV_TQ_MAX_VALUE,
- batadv_post_gw_reselect, attr,
- &bat_priv->gw.sel_class,
- bat_priv->soft_iface, NULL);
-}
-
-static ssize_t batadv_show_gw_bwidth(struct kobject *kobj,
- struct attribute *attr, char *buff)
-{
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
- u32 down, up;
-
- down = atomic_read(&bat_priv->gw.bandwidth_down);
- up = atomic_read(&bat_priv->gw.bandwidth_up);
-
- return sprintf(buff, "%u.%u/%u.%u MBit\n", down / 10,
- down % 10, up / 10, up % 10);
-}
-
-static ssize_t batadv_store_gw_bwidth(struct kobject *kobj,
- struct attribute *attr, char *buff,
- size_t count)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
-
- if (buff[count - 1] == '\n')
- buff[count - 1] = '\0';
-
- return batadv_gw_bandwidth_set(net_dev, buff, count);
-}
-
-/**
- * batadv_show_isolation_mark() - print the current isolation mark/mask
- * @kobj: kobject representing the private mesh sysfs directory
- * @attr: the batman-adv attribute the user is interacting with
- * @buff: the buffer that will contain the data to send back to the user
- *
- * Return: the number of bytes written into 'buff' on success or a negative
- * error code in case of failure
- */
-static ssize_t batadv_show_isolation_mark(struct kobject *kobj,
- struct attribute *attr, char *buff)
-{
- struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj);
-
- return sprintf(buff, "%#.8x/%#.8x\n", bat_priv->isolation_mark,
- bat_priv->isolation_mark_mask);
-}
-
-/**
- * batadv_store_isolation_mark() - parse and store the isolation mark/mask
- * entered by the user
- * @kobj: kobject representing the private mesh sysfs directory
- * @attr: the batman-adv attribute the user is interacting with
- * @buff: the buffer containing the user data
- * @count: number of bytes in the buffer
- *
- * Return: 'count' on success or a negative error code in case of failure
- */
-static ssize_t batadv_store_isolation_mark(struct kobject *kobj,
- struct attribute *attr, char *buff,
- size_t count)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- u32 mark, mask;
- char *mask_ptr;
-
- /* parse the mask if it has been specified, otherwise assume the mask is
- * the biggest possible
- */
- mask = 0xFFFFFFFF;
- mask_ptr = strchr(buff, '/');
- if (mask_ptr) {
- *mask_ptr = '\0';
- mask_ptr++;
-
- /* the mask must be entered in hex base as it is going to be a
- * bitmask and not a prefix length
- */
- if (kstrtou32(mask_ptr, 16, &mask) < 0)
- return -EINVAL;
- }
-
- /* the mark can be entered in any base */
- if (kstrtou32(buff, 0, &mark) < 0)
- return -EINVAL;
-
- bat_priv->isolation_mark_mask = mask;
- /* erase bits not covered by the mask */
- bat_priv->isolation_mark = mark & bat_priv->isolation_mark_mask;
-
- batadv_info(net_dev,
- "New skb mark for extended isolation: %#.8x/%#.8x\n",
- bat_priv->isolation_mark, bat_priv->isolation_mark_mask);
-
- return count;
-}
-
-BATADV_ATTR_SIF_BOOL(aggregated_ogms, 0644, NULL);
-BATADV_ATTR_SIF_BOOL(bonding, 0644, NULL);
-#ifdef CONFIG_BATMAN_ADV_BLA
-BATADV_ATTR_SIF_BOOL(bridge_loop_avoidance, 0644, batadv_bla_status_update);
-#endif
-#ifdef CONFIG_BATMAN_ADV_DAT
-BATADV_ATTR_SIF_BOOL(distributed_arp_table, 0644, batadv_dat_status_update);
-#endif
-BATADV_ATTR_SIF_BOOL(fragmentation, 0644, batadv_update_min_mtu);
-static BATADV_ATTR(routing_algo, 0444, batadv_show_bat_algo, NULL);
-static BATADV_ATTR(gw_mode, 0644, batadv_show_gw_mode, batadv_store_gw_mode);
-BATADV_ATTR_SIF_UINT(orig_interval, orig_interval, 0644, 2 * BATADV_JITTER,
- INT_MAX, NULL);
-BATADV_ATTR_SIF_UINT(hop_penalty, hop_penalty, 0644, 0, BATADV_TQ_MAX_VALUE,
- NULL);
-static BATADV_ATTR(gw_sel_class, 0644, batadv_show_gw_sel_class,
- batadv_store_gw_sel_class);
-static BATADV_ATTR(gw_bandwidth, 0644, batadv_show_gw_bwidth,
- batadv_store_gw_bwidth);
-#ifdef CONFIG_BATMAN_ADV_MCAST
-BATADV_ATTR_SIF_BOOL(multicast_mode, 0644, NULL);
-#endif
-#ifdef CONFIG_BATMAN_ADV_DEBUG
-BATADV_ATTR_SIF_UINT(log_level, log_level, 0644, 0, BATADV_DBG_ALL, NULL);
-#endif
-#ifdef CONFIG_BATMAN_ADV_NC
-BATADV_ATTR_SIF_BOOL(network_coding, 0644, batadv_nc_status_update);
-#endif
-static BATADV_ATTR(isolation_mark, 0644, batadv_show_isolation_mark,
- batadv_store_isolation_mark);
-
-static struct batadv_attribute *batadv_mesh_attrs[] = {
- &batadv_attr_aggregated_ogms,
- &batadv_attr_bonding,
-#ifdef CONFIG_BATMAN_ADV_BLA
- &batadv_attr_bridge_loop_avoidance,
-#endif
-#ifdef CONFIG_BATMAN_ADV_DAT
- &batadv_attr_distributed_arp_table,
-#endif
-#ifdef CONFIG_BATMAN_ADV_MCAST
- &batadv_attr_multicast_mode,
-#endif
- &batadv_attr_fragmentation,
- &batadv_attr_routing_algo,
- &batadv_attr_gw_mode,
- &batadv_attr_orig_interval,
- &batadv_attr_hop_penalty,
- &batadv_attr_gw_sel_class,
- &batadv_attr_gw_bandwidth,
-#ifdef CONFIG_BATMAN_ADV_DEBUG
- &batadv_attr_log_level,
-#endif
-#ifdef CONFIG_BATMAN_ADV_NC
- &batadv_attr_network_coding,
-#endif
- &batadv_attr_isolation_mark,
- NULL,
-};
-
-BATADV_ATTR_VLAN_BOOL(ap_isolation, 0644, NULL);
-
-/* array of vlan specific sysfs attributes */
-static struct batadv_attribute *batadv_vlan_attrs[] = {
- &batadv_attr_vlan_ap_isolation,
- NULL,
-};
-
-/**
- * batadv_sysfs_add_meshif() - Add soft interface specific sysfs entries
- * @dev: netdev struct of the soft interface
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_sysfs_add_meshif(struct net_device *dev)
-{
- struct kobject *batif_kobject = &dev->dev.kobj;
- struct batadv_priv *bat_priv = netdev_priv(dev);
- struct batadv_attribute **bat_attr;
- int err;
-
- bat_priv->mesh_obj = kobject_create_and_add(BATADV_SYSFS_IF_MESH_SUBDIR,
- batif_kobject);
- if (!bat_priv->mesh_obj) {
- batadv_err(dev, "Can't add sysfs directory: %s/%s\n", dev->name,
- BATADV_SYSFS_IF_MESH_SUBDIR);
- goto out;
- }
-
- for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr) {
- err = sysfs_create_file(bat_priv->mesh_obj,
- &((*bat_attr)->attr));
- if (err) {
- batadv_err(dev, "Can't add sysfs file: %s/%s/%s\n",
- dev->name, BATADV_SYSFS_IF_MESH_SUBDIR,
- ((*bat_attr)->attr).name);
- goto rem_attr;
- }
- }
-
- return 0;
-
-rem_attr:
- for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr)
- sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr));
-
- kobject_uevent(bat_priv->mesh_obj, KOBJ_REMOVE);
- kobject_del(bat_priv->mesh_obj);
- kobject_put(bat_priv->mesh_obj);
- bat_priv->mesh_obj = NULL;
-out:
- return -ENOMEM;
-}
-
-/**
- * batadv_sysfs_del_meshif() - Remove soft interface specific sysfs entries
- * @dev: netdev struct of the soft interface
- */
-void batadv_sysfs_del_meshif(struct net_device *dev)
-{
- struct batadv_priv *bat_priv = netdev_priv(dev);
- struct batadv_attribute **bat_attr;
-
- for (bat_attr = batadv_mesh_attrs; *bat_attr; ++bat_attr)
- sysfs_remove_file(bat_priv->mesh_obj, &((*bat_attr)->attr));
-
- kobject_uevent(bat_priv->mesh_obj, KOBJ_REMOVE);
- kobject_del(bat_priv->mesh_obj);
- kobject_put(bat_priv->mesh_obj);
- bat_priv->mesh_obj = NULL;
-}
-
-/**
- * batadv_sysfs_add_vlan() - add all the needed sysfs objects for the new vlan
- * @dev: netdev of the mesh interface
- * @vlan: private data of the newly added VLAN interface
- *
- * Return: 0 on success and -ENOMEM if any of the structure allocations fails.
- */
-int batadv_sysfs_add_vlan(struct net_device *dev,
- struct batadv_softif_vlan *vlan)
-{
- char vlan_subdir[sizeof(BATADV_SYSFS_VLAN_SUBDIR_PREFIX) + 5];
- struct batadv_priv *bat_priv = netdev_priv(dev);
- struct batadv_attribute **bat_attr;
- int err;
-
- if (vlan->vid & BATADV_VLAN_HAS_TAG) {
- sprintf(vlan_subdir, BATADV_SYSFS_VLAN_SUBDIR_PREFIX "%hu",
- vlan->vid & VLAN_VID_MASK);
-
- vlan->kobj = kobject_create_and_add(vlan_subdir,
- bat_priv->mesh_obj);
- if (!vlan->kobj) {
- batadv_err(dev, "Can't add sysfs directory: %s/%s\n",
- dev->name, vlan_subdir);
- goto out;
- }
- } else {
- /* the untagged LAN uses the root folder to store its "VLAN
- * specific attributes"
- */
- vlan->kobj = bat_priv->mesh_obj;
- kobject_get(bat_priv->mesh_obj);
- }
-
- for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr) {
- err = sysfs_create_file(vlan->kobj,
- &((*bat_attr)->attr));
- if (err) {
- batadv_err(dev, "Can't add sysfs file: %s/%s/%s\n",
- dev->name, vlan_subdir,
- ((*bat_attr)->attr).name);
- goto rem_attr;
- }
- }
-
- return 0;
-
-rem_attr:
- for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr)
- sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr));
-
- if (vlan->kobj != bat_priv->mesh_obj) {
- kobject_uevent(vlan->kobj, KOBJ_REMOVE);
- kobject_del(vlan->kobj);
- }
- kobject_put(vlan->kobj);
- vlan->kobj = NULL;
-out:
- return -ENOMEM;
-}
-
-/**
- * batadv_sysfs_del_vlan() - remove all the sysfs objects for a given VLAN
- * @bat_priv: the bat priv with all the soft interface information
- * @vlan: the private data of the VLAN to destroy
- */
-void batadv_sysfs_del_vlan(struct batadv_priv *bat_priv,
- struct batadv_softif_vlan *vlan)
-{
- struct batadv_attribute **bat_attr;
-
- for (bat_attr = batadv_vlan_attrs; *bat_attr; ++bat_attr)
- sysfs_remove_file(vlan->kobj, &((*bat_attr)->attr));
-
- if (vlan->kobj != bat_priv->mesh_obj) {
- kobject_uevent(vlan->kobj, KOBJ_REMOVE);
- kobject_del(vlan->kobj);
- }
- kobject_put(vlan->kobj);
- vlan->kobj = NULL;
-}
-
-static ssize_t batadv_show_mesh_iface(struct kobject *kobj,
- struct attribute *attr, char *buff)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
- struct batadv_hard_iface *hard_iface;
- ssize_t length;
- const char *ifname;
-
- hard_iface = batadv_hardif_get_by_netdev(net_dev);
- if (!hard_iface)
- return 0;
-
- if (hard_iface->if_status == BATADV_IF_NOT_IN_USE)
- ifname = "none";
- else
- ifname = hard_iface->soft_iface->name;
-
- length = sprintf(buff, "%s\n", ifname);
-
- batadv_hardif_put(hard_iface);
-
- return length;
-}
-
-/**
- * batadv_store_mesh_iface_finish() - store new hardif mesh_iface state
- * @net_dev: netdevice to add/remove to/from batman-adv soft-interface
- * @ifname: name of soft-interface to modify
- *
- * Changes the parts of the hard+soft interface which can not be modified under
- * sysfs lock (to prevent deadlock situations).
- *
- * Return: 0 on success, 0 < on failure
- */
-static int batadv_store_mesh_iface_finish(struct net_device *net_dev,
- char ifname[IFNAMSIZ])
-{
- struct net *net = dev_net(net_dev);
- struct batadv_hard_iface *hard_iface;
- int status_tmp;
- int ret = 0;
-
- ASSERT_RTNL();
-
- hard_iface = batadv_hardif_get_by_netdev(net_dev);
- if (!hard_iface)
- return 0;
-
- if (strncmp(ifname, "none", 4) == 0)
- status_tmp = BATADV_IF_NOT_IN_USE;
- else
- status_tmp = BATADV_IF_I_WANT_YOU;
-
- if (hard_iface->if_status == status_tmp)
- goto out;
-
- if (hard_iface->soft_iface &&
- strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0)
- goto out;
-
- if (status_tmp == BATADV_IF_NOT_IN_USE) {
- batadv_hardif_disable_interface(hard_iface,
- BATADV_IF_CLEANUP_AUTO);
- goto out;
- }
-
- /* if the interface already is in use */
- if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
- batadv_hardif_disable_interface(hard_iface,
- BATADV_IF_CLEANUP_AUTO);
-
- ret = batadv_hardif_enable_interface(hard_iface, net, ifname);
-out:
- batadv_hardif_put(hard_iface);
- return ret;
-}
-
-/**
- * batadv_store_mesh_iface_work() - store new hardif mesh_iface state
- * @work: work queue item
- *
- * Changes the parts of the hard+soft interface which can not be modified under
- * sysfs lock (to prevent deadlock situations).
- */
-static void batadv_store_mesh_iface_work(struct work_struct *work)
-{
- struct batadv_store_mesh_work *store_work;
- int ret;
-
- store_work = container_of(work, struct batadv_store_mesh_work, work);
-
- rtnl_lock();
- ret = batadv_store_mesh_iface_finish(store_work->net_dev,
- store_work->soft_iface_name);
- rtnl_unlock();
-
- if (ret < 0)
- pr_err("Failed to store new mesh_iface state %s for %s: %d\n",
- store_work->soft_iface_name, store_work->net_dev->name,
- ret);
-
- dev_put(store_work->net_dev);
- kfree(store_work);
-}
-
-static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
- struct attribute *attr, char *buff,
- size_t count)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
- struct batadv_store_mesh_work *store_work;
-
- if (buff[count - 1] == '\n')
- buff[count - 1] = '\0';
-
- if (strlen(buff) >= IFNAMSIZ) {
- pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n",
- buff);
- return -EINVAL;
- }
-
- store_work = kmalloc(sizeof(*store_work), GFP_KERNEL);
- if (!store_work)
- return -ENOMEM;
-
- dev_hold(net_dev);
- INIT_WORK(&store_work->work, batadv_store_mesh_iface_work);
- store_work->net_dev = net_dev;
- strlcpy(store_work->soft_iface_name, buff,
- sizeof(store_work->soft_iface_name));
-
- queue_work(batadv_event_workqueue, &store_work->work);
-
- return count;
-}
-
-static ssize_t batadv_show_iface_status(struct kobject *kobj,
- struct attribute *attr, char *buff)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
- struct batadv_hard_iface *hard_iface;
- ssize_t length;
-
- hard_iface = batadv_hardif_get_by_netdev(net_dev);
- if (!hard_iface)
- return 0;
-
- switch (hard_iface->if_status) {
- case BATADV_IF_TO_BE_REMOVED:
- length = sprintf(buff, "disabling\n");
- break;
- case BATADV_IF_INACTIVE:
- length = sprintf(buff, "inactive\n");
- break;
- case BATADV_IF_ACTIVE:
- length = sprintf(buff, "active\n");
- break;
- case BATADV_IF_TO_BE_ACTIVATED:
- length = sprintf(buff, "enabling\n");
- break;
- case BATADV_IF_NOT_IN_USE:
- default:
- length = sprintf(buff, "not in use\n");
- break;
- }
-
- batadv_hardif_put(hard_iface);
-
- return length;
-}
-
-#ifdef CONFIG_BATMAN_ADV_BATMAN_V
-
-/**
- * batadv_store_throughput_override() - parse and store throughput override
- * entered by the user
- * @kobj: kobject representing the private mesh sysfs directory
- * @attr: the batman-adv attribute the user is interacting with
- * @buff: the buffer containing the user data
- * @count: number of bytes in the buffer
- *
- * Return: 'count' on success or a negative error code in case of failure
- */
-static ssize_t batadv_store_throughput_override(struct kobject *kobj,
- struct attribute *attr,
- char *buff, size_t count)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
- struct batadv_hard_iface *hard_iface;
- u32 tp_override;
- u32 old_tp_override;
- bool ret;
-
- hard_iface = batadv_hardif_get_by_netdev(net_dev);
- if (!hard_iface)
- return -EINVAL;
-
- if (buff[count - 1] == '\n')
- buff[count - 1] = '\0';
-
- ret = batadv_parse_throughput(net_dev, buff, "throughput_override",
- &tp_override);
- if (!ret)
- return count;
-
- old_tp_override = atomic_read(&hard_iface->bat_v.throughput_override);
- if (old_tp_override == tp_override)
- goto out;
-
- batadv_info(hard_iface->soft_iface,
- "%s: %s: Changing from: %u.%u MBit to: %u.%u MBit\n",
- "throughput_override", net_dev->name,
- old_tp_override / 10, old_tp_override % 10,
- tp_override / 10, tp_override % 10);
-
- atomic_set(&hard_iface->bat_v.throughput_override, tp_override);
-
-out:
- batadv_hardif_put(hard_iface);
- return count;
-}
-
-static ssize_t batadv_show_throughput_override(struct kobject *kobj,
- struct attribute *attr,
- char *buff)
-{
- struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
- struct batadv_hard_iface *hard_iface;
- u32 tp_override;
-
- hard_iface = batadv_hardif_get_by_netdev(net_dev);
- if (!hard_iface)
- return -EINVAL;
-
- tp_override = atomic_read(&hard_iface->bat_v.throughput_override);
-
- return sprintf(buff, "%u.%u MBit\n", tp_override / 10,
- tp_override % 10);
-}
-
-#endif
-
-static BATADV_ATTR(mesh_iface, 0644, batadv_show_mesh_iface,
- batadv_store_mesh_iface);
-static BATADV_ATTR(iface_status, 0444, batadv_show_iface_status, NULL);
-#ifdef CONFIG_BATMAN_ADV_BATMAN_V
-BATADV_ATTR_HIF_UINT(elp_interval, bat_v.elp_interval, 0644,
- 2 * BATADV_JITTER, INT_MAX, NULL);
-static BATADV_ATTR(throughput_override, 0644, batadv_show_throughput_override,
- batadv_store_throughput_override);
-#endif
-
-static struct batadv_attribute *batadv_batman_attrs[] = {
- &batadv_attr_mesh_iface,
- &batadv_attr_iface_status,
-#ifdef CONFIG_BATMAN_ADV_BATMAN_V
- &batadv_attr_elp_interval,
- &batadv_attr_throughput_override,
-#endif
- NULL,
-};
-
-/**
- * batadv_sysfs_add_hardif() - Add hard interface specific sysfs entries
- * @hardif_obj: address where to store the pointer to new sysfs folder
- * @dev: netdev struct of the hard interface
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_sysfs_add_hardif(struct kobject **hardif_obj, struct net_device *dev)
-{
- struct kobject *hardif_kobject = &dev->dev.kobj;
- struct batadv_attribute **bat_attr;
- int err;
-
- *hardif_obj = kobject_create_and_add(BATADV_SYSFS_IF_BAT_SUBDIR,
- hardif_kobject);
-
- if (!*hardif_obj) {
- batadv_err(dev, "Can't add sysfs directory: %s/%s\n", dev->name,
- BATADV_SYSFS_IF_BAT_SUBDIR);
- goto out;
- }
-
- for (bat_attr = batadv_batman_attrs; *bat_attr; ++bat_attr) {
- err = sysfs_create_file(*hardif_obj, &((*bat_attr)->attr));
- if (err) {
- batadv_err(dev, "Can't add sysfs file: %s/%s/%s\n",
- dev->name, BATADV_SYSFS_IF_BAT_SUBDIR,
- ((*bat_attr)->attr).name);
- goto rem_attr;
- }
- }
-
- return 0;
-
-rem_attr:
- for (bat_attr = batadv_batman_attrs; *bat_attr; ++bat_attr)
- sysfs_remove_file(*hardif_obj, &((*bat_attr)->attr));
-out:
- return -ENOMEM;
-}
-
-/**
- * batadv_sysfs_del_hardif() - Remove hard interface specific sysfs entries
- * @hardif_obj: address to the pointer to which stores batman-adv sysfs folder
- * of the hard interface
- */
-void batadv_sysfs_del_hardif(struct kobject **hardif_obj)
-{
- kobject_uevent(*hardif_obj, KOBJ_REMOVE);
- kobject_del(*hardif_obj);
- kobject_put(*hardif_obj);
- *hardif_obj = NULL;
-}
-
-/**
- * batadv_throw_uevent() - Send an uevent with batman-adv specific env data
- * @bat_priv: the bat priv with all the soft interface information
- * @type: subsystem type of event. Stored in uevent's BATTYPE
- * @action: action type of event. Stored in uevent's BATACTION
- * @data: string with additional information to the event (ignored for
- * BATADV_UEV_DEL). Stored in uevent's BATDATA
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type,
- enum batadv_uev_action action, const char *data)
-{
- int ret = -ENOMEM;
- struct kobject *bat_kobj;
- char *uevent_env[4] = { NULL, NULL, NULL, NULL };
-
- bat_kobj = &bat_priv->soft_iface->dev.kobj;
-
- uevent_env[0] = kasprintf(GFP_ATOMIC,
- "%s%s", BATADV_UEV_TYPE_VAR,
- batadv_uev_type_str[type]);
- if (!uevent_env[0])
- goto out;
-
- uevent_env[1] = kasprintf(GFP_ATOMIC,
- "%s%s", BATADV_UEV_ACTION_VAR,
- batadv_uev_action_str[action]);
- if (!uevent_env[1])
- goto out;
-
- /* If the event is DEL, ignore the data field */
- if (action != BATADV_UEV_DEL) {
- uevent_env[2] = kasprintf(GFP_ATOMIC,
- "%s%s", BATADV_UEV_DATA_VAR, data);
- if (!uevent_env[2])
- goto out;
- }
-
- ret = kobject_uevent_env(bat_kobj, KOBJ_CHANGE, uevent_env);
-out:
- kfree(uevent_env[0]);
- kfree(uevent_env[1]);
- kfree(uevent_env[2]);
-
- if (ret)
- batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
- "Impossible to send uevent for (%s,%s,%s) event (err: %d)\n",
- batadv_uev_type_str[type],
- batadv_uev_action_str[action],
- (action == BATADV_UEV_DEL ? "NULL" : data), ret);
- return ret;
-}
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
deleted file mode 100644
index c1e3fb69952d..000000000000
--- a/net/batman-adv/sysfs.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors:
- *
- * Marek Lindner
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _NET_BATMAN_ADV_SYSFS_H_
-#define _NET_BATMAN_ADV_SYSFS_H_
-
-#include "main.h"
-
-#include <linux/sysfs.h>
-#include <linux/types.h>
-
-struct kobject;
-struct net_device;
-
-#define BATADV_SYSFS_IF_MESH_SUBDIR "mesh"
-#define BATADV_SYSFS_IF_BAT_SUBDIR "batman_adv"
-/**
- * BATADV_SYSFS_VLAN_SUBDIR_PREFIX - prefix of the subfolder that will be
- * created in the sysfs hierarchy for each VLAN interface. The subfolder will
- * be named "BATADV_SYSFS_VLAN_SUBDIR_PREFIX%vid".
- */
-#define BATADV_SYSFS_VLAN_SUBDIR_PREFIX "vlan"
-
-/**
- * struct batadv_attribute - sysfs export helper for batman-adv attributes
- */
-struct batadv_attribute {
- /** @attr: sysfs attribute file */
- struct attribute attr;
-
- /**
- * @show: function to export the current attribute's content to sysfs
- */
- ssize_t (*show)(struct kobject *kobj, struct attribute *attr,
- char *buf);
-
- /**
- * @store: function to load new value from character buffer and save it
- * in batman-adv attribute
- */
- ssize_t (*store)(struct kobject *kobj, struct attribute *attr,
- char *buf, size_t count);
-};
-
-int batadv_sysfs_add_meshif(struct net_device *dev);
-void batadv_sysfs_del_meshif(struct net_device *dev);
-int batadv_sysfs_add_hardif(struct kobject **hardif_obj,
- struct net_device *dev);
-void batadv_sysfs_del_hardif(struct kobject **hardif_obj);
-int batadv_sysfs_add_vlan(struct net_device *dev,
- struct batadv_softif_vlan *vlan);
-void batadv_sysfs_del_vlan(struct batadv_priv *bat_priv,
- struct batadv_softif_vlan *vlan);
-int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type,
- enum batadv_uev_action action, const char *data);
-
-#endif /* _NET_BATMAN_ADV_SYSFS_H_ */
diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 11520de96ccb..350b149e48be 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Edo Monticelli, Antonio Quartulli
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "tp_meter.h"
@@ -24,16 +12,18 @@
#include <linux/byteorder/generic.h>
#include <linux/cache.h>
#include <linux/compiler.h>
+#include <linux/container_of.h>
#include <linux/err.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
#include <linux/if_ether.h>
#include <linux/init.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/kthread.h>
+#include <linux/limits.h>
#include <linux/list.h>
+#include <linux/minmax.h>
#include <linux/netdevice.h>
#include <linux/param.h>
#include <linux/printk.h>
@@ -77,7 +67,7 @@
/**
* BATADV_TP_MAX_RTO - Maximum sender timeout. If the sender RTO gets beyond
- * such amound of milliseconds, the receiver is considered unreachable and the
+ * such amount of milliseconds, the receiver is considered unreachable and the
* connection is killed
*/
#define BATADV_TP_MAX_RTO 30000
@@ -119,10 +109,10 @@ static u32 batadv_tp_session_cookie(const u8 session[2], u8 icmp_uid)
* batadv_tp_cwnd() - compute the new cwnd size
* @base: base cwnd size value
* @increment: the value to add to base to get the new size
- * @min: minumim cwnd value (usually MSS)
+ * @min: minimum cwnd value (usually MSS)
*
- * Return the new cwnd size and ensures it does not exceed the Advertised
- * Receiver Window size. It is wrap around safe.
+ * Return the new cwnd size and ensure it does not exceed the Advertised
+ * Receiver Window size. It is wrap-around safe.
* For details refer to Section 3.1 of RFC5681
*
* Return: new congestion window size in bytes
@@ -141,7 +131,7 @@ static u32 batadv_tp_cwnd(u32 base, u32 increment, u32 min)
}
/**
- * batadv_tp_updated_cwnd() - update the Congestion Windows
+ * batadv_tp_update_cwnd() - update the Congestion Windows
* @tp_vars: the private data of the current TP meter session
* @mss: maximum segment size of transmission
*
@@ -216,7 +206,7 @@ static void batadv_tp_update_rto(struct batadv_tp_vars *tp_vars,
* batadv_tp_batctl_notify() - send client status result to client
* @reason: reason for tp meter session stop
* @dst: destination of tp_meter session
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @start_time: start of transmission in jiffies
* @total_sent: bytes acked to the receiver
* @cookie: cookie of tp_meter session
@@ -248,7 +238,7 @@ static void batadv_tp_batctl_notify(enum batadv_tp_meter_reason reason,
* batadv_tp_batctl_error_notify() - send client error result to client
* @reason: reason for tp meter session stop
* @dst: destination of tp_meter session
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @cookie: cookie of tp_meter session
*/
static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason,
@@ -261,11 +251,11 @@ static void batadv_tp_batctl_error_notify(enum batadv_tp_meter_reason reason,
/**
* batadv_tp_list_find() - find a tp_vars object in the global list
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @dst: the other endpoint MAC address to look for
*
* Look for a tp_vars object matching dst as end_point and return it after
- * having incremented the refcounter. Return NULL is not found
+ * having incremented the refcounter. Return NULL if not found
*
* Return: matching tp_vars or NULL when no tp_vars with @dst was found
*/
@@ -297,12 +287,12 @@ static struct batadv_tp_vars *batadv_tp_list_find(struct batadv_priv *bat_priv,
/**
* batadv_tp_list_find_session() - find tp_vars session object in the global
* list
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @dst: the other endpoint MAC address to look for
* @session: session identifier
*
* Look for a tp_vars object matching dst as end_point, session as tp meter
- * session and return it after having incremented the refcounter. Return NULL
+ * session and return it after having incremented the refcounter. Return NULL
* is not found
*
* Return: matching tp_vars or NULL when no tp_vars was found
@@ -368,12 +358,15 @@ static void batadv_tp_vars_release(struct kref *ref)
*/
static void batadv_tp_vars_put(struct batadv_tp_vars *tp_vars)
{
+ if (!tp_vars)
+ return;
+
kref_put(&tp_vars->refcount, batadv_tp_vars_release);
}
/**
* batadv_tp_sender_cleanup() - cleanup sender data and drop and timer
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @tp_vars: the private data of the current TP meter session to cleanup
*/
static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv,
@@ -391,19 +384,19 @@ static void batadv_tp_sender_cleanup(struct batadv_priv *bat_priv,
atomic_dec(&tp_vars->bat_priv->tp_num);
/* kill the timer and remove its reference */
- del_timer_sync(&tp_vars->timer);
+ timer_delete_sync(&tp_vars->timer);
/* the worker might have rearmed itself therefore we kill it again. Note
* that if the worker should run again before invoking the following
- * del_timer(), it would not re-arm itself once again because the status
+ * timer_delete(), it would not re-arm itself once again because the status
* is OFF now
*/
- del_timer(&tp_vars->timer);
+ timer_delete(&tp_vars->timer);
batadv_tp_vars_put(tp_vars);
}
/**
* batadv_tp_sender_end() - print info about ended session and inform client
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @tp_vars: the private data of the current TP meter session
*/
static void batadv_tp_sender_end(struct batadv_priv *bat_priv,
@@ -492,7 +485,7 @@ static void batadv_tp_reset_sender_timer(struct batadv_tp_vars *tp_vars)
*/
static void batadv_tp_sender_timeout(struct timer_list *t)
{
- struct batadv_tp_vars *tp_vars = from_timer(tp_vars, t, timer);
+ struct batadv_tp_vars *tp_vars = timer_container_of(tp_vars, t, timer);
struct batadv_priv *bat_priv = tp_vars->bat_priv;
if (atomic_read(&tp_vars->sending) == 0)
@@ -626,7 +619,7 @@ static int batadv_tp_send_msg(struct batadv_tp_vars *tp_vars, const u8 *src,
/**
* batadv_tp_recv_ack() - ACK receiving function
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: the buffer containing the received packet
*
* Process a received TP ACK packet
@@ -638,9 +631,9 @@ static void batadv_tp_recv_ack(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node = NULL;
const struct batadv_icmp_tp_packet *icmp;
struct batadv_tp_vars *tp_vars;
+ const unsigned char *dev_addr;
size_t packet_len, mss;
u32 rtt, recv_ack, cwnd;
- unsigned char *dev_addr;
packet_len = BATADV_TP_PLEN;
mss = BATADV_TP_PLEN;
@@ -758,12 +751,9 @@ move_twnd:
wake_up(&tp_vars->more_bytes);
out:
- if (likely(primary_if))
- batadv_hardif_put(primary_if);
- if (likely(orig_node))
- batadv_orig_node_put(orig_node);
- if (likely(tp_vars))
- batadv_tp_vars_put(tp_vars);
+ batadv_hardif_put(primary_if);
+ batadv_orig_node_put(orig_node);
+ batadv_tp_vars_put(tp_vars);
}
/**
@@ -842,7 +832,7 @@ static int batadv_tp_send(void *arg)
}
/* assume that all the hard_interfaces have a correctly
- * configured MTU, so use the soft_iface MTU as MSS.
+ * configured MTU, so use the mesh_iface MTU as MSS.
* This might not be true and in that case the fragmentation
* should be used.
* Now, try to send the packet as it is
@@ -892,17 +882,15 @@ static int batadv_tp_send(void *arg)
}
out:
- if (likely(primary_if))
- batadv_hardif_put(primary_if);
- if (likely(orig_node))
- batadv_orig_node_put(orig_node);
+ batadv_hardif_put(primary_if);
+ batadv_orig_node_put(orig_node);
batadv_tp_sender_end(bat_priv, tp_vars);
batadv_tp_sender_cleanup(bat_priv, tp_vars);
batadv_tp_vars_put(tp_vars);
- do_exit(0);
+ return 0;
}
/**
@@ -939,7 +927,7 @@ static void batadv_tp_start_kthread(struct batadv_tp_vars *tp_vars)
/**
* batadv_tp_start() - start a new tp meter session
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @dst: the receiver MAC address
* @test_length: test length in milliseconds
* @cookie: session cookie
@@ -1005,7 +993,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
/* initialise the CWND to 3*MSS (Section 3.1 in RFC5681).
* For batman-adv the MSS is the size of the payload received by the
- * soft_interface, hence its MTU
+ * mesh_interface, hence its MTU
*/
tp_vars->cwnd = BATADV_TP_PLEN * 3;
/* at the beginning initialise the SS threshold to the biggest possible
@@ -1064,7 +1052,7 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
/**
* batadv_tp_stop() - stop currently running tp meter session
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @dst: the receiver MAC address
* @return_value: reason for tp meter session stop
*/
@@ -1113,7 +1101,7 @@ static void batadv_tp_reset_receiver_timer(struct batadv_tp_vars *tp_vars)
*/
static void batadv_tp_receiver_shutdown(struct timer_list *t)
{
- struct batadv_tp_vars *tp_vars = from_timer(tp_vars, t, timer);
+ struct batadv_tp_vars *tp_vars = timer_container_of(tp_vars, t, timer);
struct batadv_tp_unacked *un, *safe;
struct batadv_priv *bat_priv;
@@ -1153,7 +1141,7 @@ static void batadv_tp_receiver_shutdown(struct timer_list *t)
/**
* batadv_tp_send_ack() - send an ACK packet
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @dst: the mac address of the destination originator
* @seq: the sequence number to ACK
* @timestamp: the timestamp to echo back in the ACK
@@ -1215,10 +1203,8 @@ static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst,
ret = 0;
out:
- if (likely(orig_node))
- batadv_orig_node_put(orig_node);
- if (likely(primary_if))
- batadv_hardif_put(primary_if);
+ batadv_orig_node_put(orig_node);
+ batadv_hardif_put(primary_if);
return ret;
}
@@ -1334,7 +1320,7 @@ static void batadv_tp_ack_unordered(struct batadv_tp_vars *tp_vars)
/**
* batadv_tp_init_recv() - return matching or create new receiver tp_vars
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @icmp: received icmp tp msg
*
* Return: corresponding tp_vars or NULL on errors
@@ -1387,7 +1373,7 @@ out_unlock:
/**
* batadv_tp_recv_msg() - process a single data message
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: the buffer containing the received packet
*
* Process a received TP MSG packet
@@ -1466,13 +1452,12 @@ send_ack:
batadv_tp_send_ack(bat_priv, icmp->orig, tp_vars->last_recv,
icmp->timestamp, icmp->session, icmp->uid);
out:
- if (likely(tp_vars))
- batadv_tp_vars_put(tp_vars);
+ batadv_tp_vars_put(tp_vars);
}
/**
* batadv_tp_meter_recv() - main TP Meter receiving function
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @skb: the buffer containing the received packet
*/
void batadv_tp_meter_recv(struct batadv_priv *bat_priv, struct sk_buff *skb)
diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h
index 68e600974759..f0046d366eac 100644
--- a/net/batman-adv/tp_meter.h
+++ b/net/batman-adv/tp_meter.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Edo Monticelli, Antonio Quartulli
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_TP_METER_H_
@@ -21,10 +9,9 @@
#include "main.h"
+#include <linux/skbuff.h>
#include <linux/types.h>
-struct sk_buff;
-
void batadv_tp_meter_init(void);
void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
u32 test_length, u32 *cookie);
diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c
new file mode 100644
index 000000000000..ec8b9519076b
--- /dev/null
+++ b/net/batman-adv/trace.c
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Sven Eckelmann
+ */
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h
new file mode 100644
index 000000000000..7da692ec38e9
--- /dev/null
+++ b/net/batman-adv/trace.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) B.A.T.M.A.N. contributors:
+ *
+ * Sven Eckelmann
+ */
+
+#if !defined(_NET_BATMAN_ADV_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ)
+#define _NET_BATMAN_ADV_TRACE_H_
+
+#include "main.h"
+
+#include <linux/netdevice.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM batadv
+
+/* provide dummy function when tracing is disabled */
+#if !defined(CONFIG_BATMAN_ADV_TRACING)
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, ...) \
+ static inline void trace_ ## name(proto) {}
+
+#endif /* CONFIG_BATMAN_ADV_TRACING */
+
+TRACE_EVENT(batadv_dbg,
+
+ TP_PROTO(struct batadv_priv *bat_priv,
+ struct va_format *vaf),
+
+ TP_ARGS(bat_priv, vaf),
+
+ TP_STRUCT__entry(
+ __string(device, bat_priv->mesh_iface->name)
+ __string(driver, KBUILD_MODNAME)
+ __vstring(msg, vaf->fmt, vaf->va)
+ ),
+
+ TP_fast_assign(
+ __assign_str(device);
+ __assign_str(driver);
+ __assign_vstr(msg, vaf->fmt, vaf->va);
+ ),
+
+ TP_printk(
+ "%s %s %s",
+ __get_str(driver),
+ __get_str(device),
+ __get_str(msg)
+ )
+);
+
+#endif /* _NET_BATMAN_ADV_TRACE_H_ || TRACE_HEADER_MULTI_READ */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index d21624c44665..6e95e883c2bf 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1,19 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich, Antonio Quartulli
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "translation-table.h"
@@ -25,7 +13,9 @@
#include <linux/byteorder/generic.h>
#include <linux/cache.h>
#include <linux/compiler.h>
-#include <linux/crc32c.h>
+#include <linux/container_of.h>
+#include <linux/crc32.h>
+#include <linux/err.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
@@ -33,16 +23,15 @@
#include <linux/init.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
+#include <linux/overflow.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
-#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
@@ -51,7 +40,6 @@
#include <linux/workqueue.h>
#include <net/genetlink.h>
#include <net/netlink.h>
-#include <net/sock.h>
#include <uapi/linux/batadv_packet.h>
#include <uapi/linux/batman_adv.h>
@@ -59,9 +47,9 @@
#include "hard-interface.h"
#include "hash.h"
#include "log.h"
+#include "mesh-interface.h"
#include "netlink.h"
#include "originator.h"
-#include "soft-interface.h"
#include "tvlv.h"
static struct kmem_cache *batadv_tl_cache __read_mostly;
@@ -116,10 +104,10 @@ static bool batadv_compare_tt(const struct hlist_node *node, const void *data2)
*/
static inline u32 batadv_choose_tt(const void *data, u32 size)
{
- struct batadv_tt_common_entry *tt;
+ const struct batadv_tt_common_entry *tt;
u32 hash = 0;
- tt = (struct batadv_tt_common_entry *)data;
+ tt = data;
hash = jhash(&tt->addr, ETH_ALEN, hash);
hash = jhash(&tt->vid, sizeof(tt->vid), hash);
@@ -173,7 +161,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr,
/**
* batadv_tt_local_hash_find() - search the local table for a given client
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @addr: the mac address of the client to look for
* @vid: VLAN identifier
*
@@ -198,14 +186,14 @@ batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
/**
* batadv_tt_global_hash_find() - search the global table for a given client
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @addr: the mac address of the client to look for
* @vid: VLAN identifier
*
* Return: a pointer to the corresponding tt_global_entry struct if the client
* is found, NULL otherwise.
*/
-static struct batadv_tt_global_entry *
+struct batadv_tt_global_entry *
batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
unsigned short vid)
{
@@ -222,23 +210,9 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
}
/**
- * batadv_tt_local_entry_free_rcu() - free the tt_local_entry
- * @rcu: rcu pointer of the tt_local_entry
- */
-static void batadv_tt_local_entry_free_rcu(struct rcu_head *rcu)
-{
- struct batadv_tt_local_entry *tt_local_entry;
-
- tt_local_entry = container_of(rcu, struct batadv_tt_local_entry,
- common.rcu);
-
- kmem_cache_free(batadv_tl_cache, tt_local_entry);
-}
-
-/**
* batadv_tt_local_entry_release() - release tt_local_entry from lists and queue
* for free after rcu grace period
- * @ref: kref pointer of the nc_node
+ * @ref: kref pointer of the batadv_tt_local_entry
*/
static void batadv_tt_local_entry_release(struct kref *ref)
{
@@ -247,9 +221,9 @@ static void batadv_tt_local_entry_release(struct kref *ref)
tt_local_entry = container_of(ref, struct batadv_tt_local_entry,
common.refcount);
- batadv_softif_vlan_put(tt_local_entry->vlan);
+ batadv_meshif_vlan_put(tt_local_entry->vlan);
- call_rcu(&tt_local_entry->common.rcu, batadv_tt_local_entry_free_rcu);
+ kfree_rcu(tt_local_entry, common.rcu);
}
/**
@@ -260,30 +234,19 @@ static void batadv_tt_local_entry_release(struct kref *ref)
static void
batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry)
{
+ if (!tt_local_entry)
+ return;
+
kref_put(&tt_local_entry->common.refcount,
batadv_tt_local_entry_release);
}
/**
- * batadv_tt_global_entry_free_rcu() - free the tt_global_entry
- * @rcu: rcu pointer of the tt_global_entry
- */
-static void batadv_tt_global_entry_free_rcu(struct rcu_head *rcu)
-{
- struct batadv_tt_global_entry *tt_global_entry;
-
- tt_global_entry = container_of(rcu, struct batadv_tt_global_entry,
- common.rcu);
-
- kmem_cache_free(batadv_tg_cache, tt_global_entry);
-}
-
-/**
* batadv_tt_global_entry_release() - release tt_global_entry from lists and
* queue for free after rcu grace period
- * @ref: kref pointer of the nc_node
+ * @ref: kref pointer of the batadv_tt_global_entry
*/
-static void batadv_tt_global_entry_release(struct kref *ref)
+void batadv_tt_global_entry_release(struct kref *ref)
{
struct batadv_tt_global_entry *tt_global_entry;
@@ -292,29 +255,17 @@ static void batadv_tt_global_entry_release(struct kref *ref)
batadv_tt_global_del_orig_list(tt_global_entry);
- call_rcu(&tt_global_entry->common.rcu, batadv_tt_global_entry_free_rcu);
-}
-
-/**
- * batadv_tt_global_entry_put() - decrement the tt_global_entry refcounter and
- * possibly release it
- * @tt_global_entry: tt_global_entry to be free'd
- */
-static void
-batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry)
-{
- kref_put(&tt_global_entry->common.refcount,
- batadv_tt_global_entry_release);
+ kfree_rcu(tt_global_entry, common.rcu);
}
/**
* batadv_tt_global_hash_count() - count the number of orig entries
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @addr: the mac address of the client to count entries for
* @vid: VLAN identifier
*
* Return: the number of originators advertising the given address/data
- * (excluding ourself).
+ * (excluding ourselves).
*/
int batadv_tt_global_hash_count(struct batadv_priv *bat_priv,
const u8 *addr, unsigned short vid)
@@ -335,28 +286,28 @@ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv,
/**
* batadv_tt_local_size_mod() - change the size by v of the local table
* identified by vid
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @vid: the VLAN identifier of the sub-table to change
* @v: the amount to sum to the local table size
*/
static void batadv_tt_local_size_mod(struct batadv_priv *bat_priv,
unsigned short vid, int v)
{
- struct batadv_softif_vlan *vlan;
+ struct batadv_meshif_vlan *vlan;
- vlan = batadv_softif_vlan_get(bat_priv, vid);
+ vlan = batadv_meshif_vlan_get(bat_priv, vid);
if (!vlan)
return;
atomic_add(v, &vlan->tt.num_entries);
- batadv_softif_vlan_put(vlan);
+ batadv_meshif_vlan_put(vlan);
}
/**
* batadv_tt_local_size_inc() - increase by one the local table size for the
* given vid
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @vid: the VLAN identifier
*/
static void batadv_tt_local_size_inc(struct batadv_priv *bat_priv,
@@ -368,7 +319,7 @@ static void batadv_tt_local_size_inc(struct batadv_priv *bat_priv,
/**
* batadv_tt_local_size_dec() - decrease by one the local table size for the
* given vid
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @vid: the VLAN identifier
*/
static void batadv_tt_local_size_dec(struct batadv_priv *bat_priv,
@@ -430,19 +381,6 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node,
}
/**
- * batadv_tt_orig_list_entry_free_rcu() - free the orig_entry
- * @rcu: rcu pointer of the orig_entry
- */
-static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu)
-{
- struct batadv_tt_orig_list_entry *orig_entry;
-
- orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu);
-
- kmem_cache_free(batadv_tt_orig_cache, orig_entry);
-}
-
-/**
* batadv_tt_orig_list_entry_release() - release tt orig entry from lists and
* queue for free after rcu grace period
* @ref: kref pointer of the tt orig entry
@@ -455,7 +393,7 @@ static void batadv_tt_orig_list_entry_release(struct kref *ref)
refcount);
batadv_orig_node_put(orig_entry->orig_node);
- call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu);
+ kfree_rcu(orig_entry, rcu);
}
/**
@@ -466,12 +404,15 @@ static void batadv_tt_orig_list_entry_release(struct kref *ref)
static void
batadv_tt_orig_list_entry_put(struct batadv_tt_orig_list_entry *orig_entry)
{
+ if (!orig_entry)
+ return;
+
kref_put(&orig_entry->refcount, batadv_tt_orig_list_entry_release);
}
/**
* batadv_tt_local_event() - store a local TT event (ADD/DEL)
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @tt_local_entry: the TT entry involved in the event
* @event_flags: flags to store in the event structure
*/
@@ -482,8 +423,8 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv,
struct batadv_tt_change_node *tt_change_node, *entry, *safe;
struct batadv_tt_common_entry *common = &tt_local_entry->common;
u8 flags = common->flags | event_flags;
- bool event_removed = false;
bool del_op_requested, del_op_entry;
+ size_t changes;
tt_change_node = kmem_cache_alloc(batadv_tt_change_cache, GFP_ATOMIC);
if (!tt_change_node)
@@ -497,51 +438,45 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv,
del_op_requested = flags & BATADV_TT_CLIENT_DEL;
- /* check for ADD+DEL or DEL+ADD events */
+ /* check for ADD+DEL, DEL+ADD, ADD+ADD or DEL+DEL events */
spin_lock_bh(&bat_priv->tt.changes_list_lock);
+ changes = READ_ONCE(bat_priv->tt.local_changes);
list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list,
list) {
if (!batadv_compare_eth(entry->change.addr, common->addr))
continue;
- /* DEL+ADD in the same orig interval have no effect and can be
- * removed to avoid silly behaviour on the receiver side. The
- * other way around (ADD+DEL) can happen in case of roaming of
- * a client still in the NEW state. Roaming of NEW clients is
- * now possible due to automatically recognition of "temporary"
- * clients
- */
del_op_entry = entry->change.flags & BATADV_TT_CLIENT_DEL;
- if (!del_op_requested && del_op_entry)
- goto del;
- if (del_op_requested && !del_op_entry)
- goto del;
-
- /* this is a second add in the same originator interval. It
- * means that flags have been changed: update them!
- */
- if (!del_op_requested && !del_op_entry)
+ if (del_op_requested != del_op_entry) {
+ /* DEL+ADD in the same orig interval have no effect and
+ * can be removed to avoid silly behaviour on the
+ * receiver side. The other way around (ADD+DEL) can
+ * happen in case of roaming of a client still in the
+ * NEW state. Roaming of NEW clients is now possible due
+ * to automatic recognition of "temporary" clients
+ */
+ list_del(&entry->list);
+ kmem_cache_free(batadv_tt_change_cache, entry);
+ changes--;
+ } else {
+ /* this is a second add or del in the same originator
+ * interval. It could mean that flags have been changed
+ * (e.g. double add): update them
+ */
entry->change.flags = flags;
+ }
- continue;
-del:
- list_del(&entry->list);
- kmem_cache_free(batadv_tt_change_cache, entry);
kmem_cache_free(batadv_tt_change_cache, tt_change_node);
- event_removed = true;
- goto unlock;
+ goto update_changes;
}
/* track the change in the OGMinterval list */
list_add_tail(&tt_change_node->list, &bat_priv->tt.changes_list);
+ changes++;
-unlock:
+update_changes:
+ WRITE_ONCE(bat_priv->tt.local_changes, changes);
spin_unlock_bh(&bat_priv->tt.changes_list_lock);
-
- if (event_removed)
- atomic_dec(&bat_priv->tt.local_changes);
- else
- atomic_inc(&bat_priv->tt.local_changes);
}
/**
@@ -569,7 +504,7 @@ static u16 batadv_tt_entries(u16 tt_len)
/**
* batadv_tt_local_table_transmit_size() - calculates the local translation
* table size when transmitted over the air
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
* Return: local translation table size in bytes.
*/
@@ -577,11 +512,11 @@ static int batadv_tt_local_table_transmit_size(struct batadv_priv *bat_priv)
{
u16 num_vlan = 0;
u16 tt_local_entries = 0;
- struct batadv_softif_vlan *vlan;
+ struct batadv_meshif_vlan *vlan;
int hdr_size;
rcu_read_lock();
- hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) {
+ hlist_for_each_entry_rcu(vlan, &bat_priv->meshif_vlan_list, list) {
num_vlan++;
tt_local_entries += atomic_read(&vlan->tt.num_entries);
}
@@ -616,20 +551,32 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv,
struct batadv_tt_global_entry *tt_global,
const char *message)
{
+ struct batadv_tt_global_entry *tt_removed_entry;
+ struct hlist_node *tt_removed_node;
+
batadv_dbg(BATADV_DBG_TT, bat_priv,
"Deleting global tt entry %pM (vid: %d): %s\n",
tt_global->common.addr,
batadv_print_vid(tt_global->common.vid), message);
- batadv_hash_remove(bat_priv->tt.global_hash, batadv_compare_tt,
- batadv_choose_tt, &tt_global->common);
- batadv_tt_global_entry_put(tt_global);
+ tt_removed_node = batadv_hash_remove(bat_priv->tt.global_hash,
+ batadv_compare_tt,
+ batadv_choose_tt,
+ &tt_global->common);
+ if (!tt_removed_node)
+ return;
+
+ /* drop reference of removed hash entry */
+ tt_removed_entry = hlist_entry(tt_removed_node,
+ struct batadv_tt_global_entry,
+ common.hash_entry);
+ batadv_tt_global_entry_put(tt_removed_entry);
}
/**
* batadv_tt_local_add() - add a new client to the local table or update an
* existing client
- * @soft_iface: netdev struct of the mesh interface
+ * @mesh_iface: netdev struct of the mesh interface
* @addr: the mac address of the client to add
* @vid: VLAN identifier
* @ifindex: index of the interface where the client is connected to (useful to
@@ -639,14 +586,14 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv,
*
* Return: true if the client was successfully added, false otherwise.
*/
-bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
+bool batadv_tt_local_add(struct net_device *mesh_iface, const u8 *addr,
unsigned short vid, int ifindex, u32 mark)
{
- struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
struct batadv_tt_local_entry *tt_local;
struct batadv_tt_global_entry *tt_global = NULL;
- struct net *net = dev_net(soft_iface);
- struct batadv_softif_vlan *vlan;
+ struct net *net = dev_net(mesh_iface);
+ struct batadv_meshif_vlan *vlan;
struct net_device *in_dev = NULL;
struct batadv_hard_iface *in_hardif = NULL;
struct hlist_head *head;
@@ -703,7 +650,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
table_size += batadv_tt_len(1);
packet_size_max = atomic_read(&bat_priv->packet_size_max);
if (table_size > packet_size_max) {
- net_ratelimited_function(batadv_info, soft_iface,
+ net_ratelimited_function(batadv_info, mesh_iface,
"Local translation table size (%i) exceeds maximum packet size (%i); Ignoring new local tt entry: %pM\n",
table_size, packet_size_max, addr);
goto out;
@@ -714,9 +661,9 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
goto out;
/* increase the refcounter of the related vlan */
- vlan = batadv_softif_vlan_get(bat_priv, vid);
+ vlan = batadv_meshif_vlan_get(bat_priv, vid);
if (!vlan) {
- net_ratelimited_function(batadv_info, soft_iface,
+ net_ratelimited_function(batadv_info, mesh_iface,
"adding TT local entry %pM to non-existent VLAN %d\n",
addr, batadv_print_vid(vid));
kmem_cache_free(batadv_tl_cache, tt_local);
@@ -746,7 +693,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
/* the batman interface mac and multicast addresses should never be
* purged
*/
- if (batadv_compare_eth(addr, soft_iface->dev_addr) ||
+ if (batadv_compare_eth(addr, mesh_iface->dev_addr) ||
is_multicast_ether_addr(addr))
tt_local->common.flags |= BATADV_TT_CLIENT_NOPURGE;
@@ -781,7 +728,6 @@ check_roaming:
if (roamed_back) {
batadv_tt_global_free(bat_priv, tt_global,
"Roaming canceled");
- tt_global = NULL;
} else {
/* The global entry has to be marked as ROAMING and
* has to be kept for consistency purpose
@@ -820,14 +766,10 @@ check_roaming:
ret = true;
out:
- if (in_hardif)
- batadv_hardif_put(in_hardif);
- if (in_dev)
- dev_put(in_dev);
- if (tt_local)
- batadv_tt_local_entry_put(tt_local);
- if (tt_global)
- batadv_tt_global_entry_put(tt_global);
+ batadv_hardif_put(in_hardif);
+ dev_put(in_dev);
+ batadv_tt_local_entry_put(tt_local);
+ batadv_tt_global_entry_put(tt_global);
return ret;
}
@@ -843,7 +785,7 @@ out:
* table. In case of success the value is updated with the real amount of
* reserved bytes
* Allocate the needed amount of memory for the entire TT TVLV and write its
- * header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data
+ * header made up of one tvlv_tt_data object and a series of tvlv_tt_vlan_data
* objects, one per active VLAN served by the originator node.
*
* Return: the size of the allocated buffer or 0 in case of failure.
@@ -863,13 +805,12 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node,
u8 *tt_change_ptr;
spin_lock_bh(&orig_node->vlan_list_lock);
- hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) {
+ hlist_for_each_entry(vlan, &orig_node->vlan_list, list) {
num_vlan++;
num_entries += atomic_read(&vlan->tt.num_entries);
}
- change_offset = sizeof(**tt_data);
- change_offset += num_vlan * sizeof(*tt_vlan);
+ change_offset = struct_size(*tt_data, vlan_data, num_vlan);
/* if tt_len is negative, allocate the space needed by the full table */
if (*tt_len < 0)
@@ -888,10 +829,11 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node,
(*tt_data)->ttvn = atomic_read(&orig_node->last_ttvn);
(*tt_data)->num_vlan = htons(num_vlan);
- tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1);
- hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) {
+ tt_vlan = (*tt_data)->vlan_data;
+ hlist_for_each_entry(vlan, &orig_node->vlan_list, list) {
tt_vlan->vid = htons(vlan->vid);
tt_vlan->crc = htonl(vlan->tt.crc);
+ tt_vlan->reserved = 0;
tt_vlan++;
}
@@ -907,7 +849,7 @@ out:
/**
* batadv_tt_prepare_tvlv_local_data() - allocate and prepare the TT TVLV for
* this node
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @tt_data: uninitialised pointer to the address of the TVLV buffer
* @tt_change: uninitialised pointer to the address of the area where the TT
* changes can be stored
@@ -929,7 +871,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
s32 *tt_len)
{
struct batadv_tvlv_tt_vlan_data *tt_vlan;
- struct batadv_softif_vlan *vlan;
+ struct batadv_meshif_vlan *vlan;
u16 num_vlan = 0;
u16 vlan_entries = 0;
u16 total_entries = 0;
@@ -937,8 +879,8 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
u8 *tt_change_ptr;
int change_offset;
- spin_lock_bh(&bat_priv->softif_vlan_list_lock);
- hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) {
+ spin_lock_bh(&bat_priv->meshif_vlan_list_lock);
+ hlist_for_each_entry(vlan, &bat_priv->meshif_vlan_list, list) {
vlan_entries = atomic_read(&vlan->tt.num_entries);
if (vlan_entries < 1)
continue;
@@ -947,8 +889,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
total_entries += vlan_entries;
}
- change_offset = sizeof(**tt_data);
- change_offset += num_vlan * sizeof(*tt_vlan);
+ change_offset = struct_size(*tt_data, vlan_data, num_vlan);
/* if tt_len is negative, allocate the space needed by the full table */
if (*tt_len < 0)
@@ -967,14 +908,15 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
(*tt_data)->ttvn = atomic_read(&bat_priv->tt.vn);
(*tt_data)->num_vlan = htons(num_vlan);
- tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1);
- hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) {
+ tt_vlan = (*tt_data)->vlan_data;
+ hlist_for_each_entry(vlan, &bat_priv->meshif_vlan_list, list) {
vlan_entries = atomic_read(&vlan->tt.num_entries);
if (vlan_entries < 1)
continue;
tt_vlan->vid = htons(vlan->vid);
tt_vlan->crc = htonl(vlan->tt.crc);
+ tt_vlan->reserved = 0;
tt_vlan++;
}
@@ -983,14 +925,14 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
*tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr;
out:
- spin_unlock_bh(&bat_priv->softif_vlan_list_lock);
+ spin_unlock_bh(&bat_priv->meshif_vlan_list_lock);
return tvlv_len;
}
/**
* batadv_tt_tvlv_container_update() - update the translation table tvlv
* container after local tt changes have been committed
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv)
{
@@ -1000,16 +942,25 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv)
int tt_diff_len, tt_change_len = 0;
int tt_diff_entries_num = 0;
int tt_diff_entries_count = 0;
+ bool drop_changes = false;
+ size_t tt_extra_len = 0;
u16 tvlv_len;
- tt_diff_entries_num = atomic_read(&bat_priv->tt.local_changes);
+ tt_diff_entries_num = READ_ONCE(bat_priv->tt.local_changes);
tt_diff_len = batadv_tt_len(tt_diff_entries_num);
/* if we have too many changes for one packet don't send any
- * and wait for the tt table request which will be fragmented
+ * and wait for the tt table request so we can reply with the full
+ * (fragmented) table.
+ *
+ * The local change history should still be cleaned up so the next
+ * TT round can start again with a clean state.
*/
- if (tt_diff_len > bat_priv->soft_iface->mtu)
+ if (tt_diff_len > bat_priv->mesh_iface->mtu) {
tt_diff_len = 0;
+ tt_diff_entries_num = 0;
+ drop_changes = true;
+ }
tvlv_len = batadv_tt_prepare_tvlv_local_data(bat_priv, &tt_data,
&tt_change, &tt_diff_len);
@@ -1018,11 +969,11 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv)
tt_data->flags = BATADV_TT_OGM_DIFF;
- if (tt_diff_len == 0)
+ if (!drop_changes && tt_diff_len == 0)
goto container_register;
spin_lock_bh(&bat_priv->tt.changes_list_lock);
- atomic_set(&bat_priv->tt.local_changes, 0);
+ WRITE_ONCE(bat_priv->tt.local_changes, 0);
list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list,
list) {
@@ -1037,6 +988,9 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv)
}
spin_unlock_bh(&bat_priv->tt.changes_list_lock);
+ tt_extra_len = batadv_tt_len(tt_diff_entries_num -
+ tt_diff_entries_count);
+
/* Keep the buffer for possible tt_request */
spin_lock_bh(&bat_priv->tt.last_changeset_lock);
kfree(bat_priv->tt.last_changeset);
@@ -1045,6 +999,7 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv)
tt_change_len = batadv_tt_len(tt_diff_entries_count);
/* check whether this new OGM has no changes due to size problems */
if (tt_diff_entries_count > 0) {
+ tt_diff_len -= tt_extra_len;
/* if kmalloc() fails we will reply with the full table
* instead of providing the diff
*/
@@ -1057,107 +1012,32 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv)
}
spin_unlock_bh(&bat_priv->tt.last_changeset_lock);
+ /* Remove extra packet space for OGM */
+ tvlv_len -= tt_extra_len;
container_register:
batadv_tvlv_container_register(bat_priv, BATADV_TVLV_TT, 1, tt_data,
tvlv_len);
kfree(tt_data);
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-
-/**
- * batadv_tt_local_seq_print_text() - Print the local tt table in a seq file
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hashtable *hash = bat_priv->tt.local_hash;
- struct batadv_tt_common_entry *tt_common_entry;
- struct batadv_tt_local_entry *tt_local;
- struct batadv_hard_iface *primary_if;
- struct hlist_head *head;
- u32 i;
- int last_seen_secs;
- int last_seen_msecs;
- unsigned long last_seen_jiffies;
- bool no_purge;
- u16 np_flag = BATADV_TT_CLIENT_NOPURGE;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- goto out;
-
- seq_printf(seq,
- "Locally retrieved addresses (from %s) announced via TT (TTVN: %u):\n",
- net_dev->name, (u8)atomic_read(&bat_priv->tt.vn));
- seq_puts(seq,
- " Client VID Flags Last seen (CRC )\n");
-
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(tt_common_entry,
- head, hash_entry) {
- tt_local = container_of(tt_common_entry,
- struct batadv_tt_local_entry,
- common);
- last_seen_jiffies = jiffies - tt_local->last_seen;
- last_seen_msecs = jiffies_to_msecs(last_seen_jiffies);
- last_seen_secs = last_seen_msecs / 1000;
- last_seen_msecs = last_seen_msecs % 1000;
-
- no_purge = tt_common_entry->flags & np_flag;
- seq_printf(seq,
- " * %pM %4i [%c%c%c%c%c%c] %3u.%03u (%#.8x)\n",
- tt_common_entry->addr,
- batadv_print_vid(tt_common_entry->vid),
- ((tt_common_entry->flags &
- BATADV_TT_CLIENT_ROAM) ? 'R' : '.'),
- no_purge ? 'P' : '.',
- ((tt_common_entry->flags &
- BATADV_TT_CLIENT_NEW) ? 'N' : '.'),
- ((tt_common_entry->flags &
- BATADV_TT_CLIENT_PENDING) ? 'X' : '.'),
- ((tt_common_entry->flags &
- BATADV_TT_CLIENT_WIFI) ? 'W' : '.'),
- ((tt_common_entry->flags &
- BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'),
- no_purge ? 0 : last_seen_secs,
- no_purge ? 0 : last_seen_msecs,
- tt_local->vlan->tt.crc);
- }
- rcu_read_unlock();
- }
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- return 0;
-}
-#endif
-
/**
* batadv_tt_local_dump_entry() - Dump one TT local entry into a message
* @msg :Netlink message to dump into
* @portid: Port making netlink request
- * @seq: Sequence number of netlink message
- * @bat_priv: The bat priv with all the soft interface information
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the mesh interface information
* @common: tt local & tt global common data
*
* Return: Error code, or 0 on success
*/
static int
-batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_priv *bat_priv,
struct batadv_tt_common_entry *common)
{
void *hdr;
- struct batadv_softif_vlan *vlan;
+ struct batadv_meshif_vlan *vlan;
struct batadv_tt_local_entry *local;
unsigned int last_seen_msecs;
u32 crc;
@@ -1165,20 +1045,22 @@ batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
local = container_of(common, struct batadv_tt_local_entry, common);
last_seen_msecs = jiffies_to_msecs(jiffies - local->last_seen);
- vlan = batadv_softif_vlan_get(bat_priv, common->vid);
+ vlan = batadv_meshif_vlan_get(bat_priv, common->vid);
if (!vlan)
return 0;
crc = vlan->tt.crc;
- batadv_softif_vlan_put(vlan);
+ batadv_meshif_vlan_put(vlan);
- hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family,
- NLM_F_MULTI,
+ hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq,
+ &batadv_netlink_family, NLM_F_MULTI,
BATADV_CMD_GET_TRANSTABLE_LOCAL);
if (!hdr)
return -ENOBUFS;
+ genl_dump_check_consistent(cb, hdr);
+
if (nla_put(msg, BATADV_ATTR_TT_ADDRESS, ETH_ALEN, common->addr) ||
nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) ||
nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) ||
@@ -1201,34 +1083,39 @@ batadv_tt_local_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
* batadv_tt_local_dump_bucket() - Dump one TT local bucket into a message
* @msg: Netlink message to dump into
* @portid: Port making netlink request
- * @seq: Sequence number of netlink message
- * @bat_priv: The bat priv with all the soft interface information
- * @head: Pointer to the list containing the local tt entries
+ * @cb: Control block containing additional options
+ * @bat_priv: The bat priv with all the mesh interface information
+ * @hash: hash to dump
+ * @bucket: bucket index to dump
* @idx_s: Number of entries to skip
*
* Return: Error code, or 0 on success
*/
static int
-batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
+batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid,
+ struct netlink_callback *cb,
struct batadv_priv *bat_priv,
- struct hlist_head *head, int *idx_s)
+ struct batadv_hashtable *hash, unsigned int bucket,
+ int *idx_s)
{
struct batadv_tt_common_entry *common;
int idx = 0;
- rcu_read_lock();
- hlist_for_each_entry_rcu(common, head, hash_entry) {
+ spin_lock_bh(&hash->list_locks[bucket]);
+ cb->seq = atomic_read(&hash->generation) << 1 | 1;
+
+ hlist_for_each_entry(common, &hash->table[bucket], hash_entry) {
if (idx++ < *idx_s)
continue;
- if (batadv_tt_local_dump_entry(msg, portid, seq, bat_priv,
+ if (batadv_tt_local_dump_entry(msg, portid, cb, bat_priv,
common)) {
- rcu_read_unlock();
+ spin_unlock_bh(&hash->list_locks[bucket]);
*idx_s = idx - 1;
return -EMSGSIZE;
}
}
- rcu_read_unlock();
+ spin_unlock_bh(&hash->list_locks[bucket]);
*idx_s = 0;
return 0;
@@ -1243,29 +1130,20 @@ batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
*/
int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
- struct net *net = sock_net(cb->skb->sk);
- struct net_device *soft_iface;
+ struct net_device *mesh_iface;
struct batadv_priv *bat_priv;
struct batadv_hard_iface *primary_if = NULL;
struct batadv_hashtable *hash;
- struct hlist_head *head;
int ret;
- int ifindex;
int bucket = cb->args[0];
int idx = cb->args[1];
int portid = NETLINK_CB(cb->skb).portid;
- ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
-
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
- bat_priv = netdev_priv(soft_iface);
+ bat_priv = netdev_priv(mesh_iface);
primary_if = batadv_primary_if_get_selected(bat_priv);
if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
@@ -1276,10 +1154,8 @@ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb)
hash = bat_priv->tt.local_hash;
while (bucket < hash->size) {
- head = &hash->table[bucket];
-
- if (batadv_tt_local_dump_bucket(msg, portid, cb->nlh->nlmsg_seq,
- bat_priv, head, &idx))
+ if (batadv_tt_local_dump_bucket(msg, portid, cb, bat_priv,
+ hash, bucket, &idx))
break;
bucket++;
@@ -1288,10 +1164,8 @@ int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb)
ret = msg->len;
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- if (soft_iface)
- dev_put(soft_iface);
+ batadv_hardif_put(primary_if);
+ dev_put(mesh_iface);
cb->args[0] = bucket;
cb->args[1] = idx;
@@ -1320,7 +1194,7 @@ batadv_tt_local_set_pending(struct batadv_priv *bat_priv,
/**
* batadv_tt_local_remove() - logically remove an entry from the local table
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @addr: the MAC address of the client to remove
* @vid: VLAN identifier
* @message: message to append to the log on deletion
@@ -1332,9 +1206,10 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr,
unsigned short vid, const char *message,
bool roaming)
{
+ struct batadv_tt_local_entry *tt_removed_entry;
struct batadv_tt_local_entry *tt_local_entry;
u16 flags, curr_flags = BATADV_NO_FLAGS;
- void *tt_entry_exists;
+ struct hlist_node *tt_removed_node;
tt_local_entry = batadv_tt_local_hash_find(bat_priv, addr, vid);
if (!tt_local_entry)
@@ -1363,26 +1238,28 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr,
*/
batadv_tt_local_event(bat_priv, tt_local_entry, BATADV_TT_CLIENT_DEL);
- tt_entry_exists = batadv_hash_remove(bat_priv->tt.local_hash,
+ tt_removed_node = batadv_hash_remove(bat_priv->tt.local_hash,
batadv_compare_tt,
batadv_choose_tt,
&tt_local_entry->common);
- if (!tt_entry_exists)
+ if (!tt_removed_node)
goto out;
- /* extra call to free the local tt entry */
- batadv_tt_local_entry_put(tt_local_entry);
+ /* drop reference of removed hash entry */
+ tt_removed_entry = hlist_entry(tt_removed_node,
+ struct batadv_tt_local_entry,
+ common.hash_entry);
+ batadv_tt_local_entry_put(tt_removed_entry);
out:
- if (tt_local_entry)
- batadv_tt_local_entry_put(tt_local_entry);
+ batadv_tt_local_entry_put(tt_local_entry);
return curr_flags;
}
/**
* batadv_tt_local_purge_list() - purge inactive tt local entries
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @head: pointer to the list containing the local tt entries
* @timeout: parameter deciding whether a given tt local entry is considered
* inactive or not
@@ -1417,7 +1294,7 @@ static void batadv_tt_local_purge_list(struct batadv_priv *bat_priv,
/**
* batadv_tt_local_purge() - purge inactive tt local entries
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @timeout: parameter deciding whether a given tt local entry is considered
* inactive or not
*/
@@ -1504,7 +1381,7 @@ static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv)
kmem_cache_free(batadv_tt_change_cache, entry);
}
- atomic_set(&bat_priv->tt.local_changes, 0);
+ WRITE_ONCE(bat_priv->tt.local_changes, 0);
spin_unlock_bh(&bat_priv->tt.changes_list_lock);
}
@@ -1645,15 +1522,14 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global,
sync_flags:
batadv_tt_global_sync_flags(tt_global);
out:
- if (orig_entry)
- batadv_tt_orig_list_entry_put(orig_entry);
+ batadv_tt_orig_list_entry_put(orig_entry);
spin_unlock_bh(&tt_global->list_lock);
}
/**
* batadv_tt_global_add() - add a new TT global entry or update an existing one
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: the originator announcing the client
* @tt_addr: the mac address of the non-mesh client
* @vid: VLAN identifier
@@ -1666,7 +1542,7 @@ out:
* the function argument.
* If a TT local entry exists for this non-mesh client remove it.
*
- * The caller must hold orig_node refcount.
+ * The caller must hold the orig_node refcount.
*
* Return: true if the new entry has been added, false otherwise
*/
@@ -1819,19 +1695,17 @@ out_remove:
tt_global_entry->common.flags &= ~BATADV_TT_CLIENT_ROAM;
out:
- if (tt_global_entry)
- batadv_tt_global_entry_put(tt_global_entry);
- if (tt_local_entry)
- batadv_tt_local_entry_put(tt_local_entry);
+ batadv_tt_global_entry_put(tt_global_entry);
+ batadv_tt_local_entry_put(tt_local_entry);
return ret;
}
/**
* batadv_transtable_best_orig() - Get best originator list entry from tt entry
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @tt_global_entry: global translation table entry to be analyzed
*
- * This functon assumes the caller holds rcu_read_lock().
+ * This function assumes the caller holds rcu_read_lock().
* Return: best originator list entry or NULL on errors.
*/
static struct batadv_tt_orig_list_entry *
@@ -1858,152 +1732,17 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv,
}
/* release the refcount for the "old" best */
- if (best_router)
- batadv_neigh_node_put(best_router);
+ batadv_neigh_node_put(best_router);
best_entry = orig_entry;
best_router = router;
}
- if (best_router)
- batadv_neigh_node_put(best_router);
+ batadv_neigh_node_put(best_router);
return best_entry;
}
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
-/**
- * batadv_tt_global_print_entry() - print all orig nodes who announce the
- * address for this global entry
- * @bat_priv: the bat priv with all the soft interface information
- * @tt_global_entry: global translation table entry to be printed
- * @seq: debugfs table seq_file struct
- *
- * This functon assumes the caller holds rcu_read_lock().
- */
-static void
-batadv_tt_global_print_entry(struct batadv_priv *bat_priv,
- struct batadv_tt_global_entry *tt_global_entry,
- struct seq_file *seq)
-{
- struct batadv_tt_orig_list_entry *orig_entry, *best_entry;
- struct batadv_tt_common_entry *tt_common_entry;
- struct batadv_orig_node_vlan *vlan;
- struct hlist_head *head;
- u8 last_ttvn;
- u16 flags;
-
- tt_common_entry = &tt_global_entry->common;
- flags = tt_common_entry->flags;
-
- best_entry = batadv_transtable_best_orig(bat_priv, tt_global_entry);
- if (best_entry) {
- vlan = batadv_orig_node_vlan_get(best_entry->orig_node,
- tt_common_entry->vid);
- if (!vlan) {
- seq_printf(seq,
- " * Cannot retrieve VLAN %d for originator %pM\n",
- batadv_print_vid(tt_common_entry->vid),
- best_entry->orig_node->orig);
- goto print_list;
- }
-
- last_ttvn = atomic_read(&best_entry->orig_node->last_ttvn);
- seq_printf(seq,
- " %c %pM %4i (%3u) via %pM (%3u) (%#.8x) [%c%c%c%c]\n",
- '*', tt_global_entry->common.addr,
- batadv_print_vid(tt_global_entry->common.vid),
- best_entry->ttvn, best_entry->orig_node->orig,
- last_ttvn, vlan->tt.crc,
- ((flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'),
- ((flags & BATADV_TT_CLIENT_WIFI) ? 'W' : '.'),
- ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'),
- ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.'));
-
- batadv_orig_node_vlan_put(vlan);
- }
-
-print_list:
- head = &tt_global_entry->orig_list;
-
- hlist_for_each_entry_rcu(orig_entry, head, list) {
- if (best_entry == orig_entry)
- continue;
-
- vlan = batadv_orig_node_vlan_get(orig_entry->orig_node,
- tt_common_entry->vid);
- if (!vlan) {
- seq_printf(seq,
- " + Cannot retrieve VLAN %d for originator %pM\n",
- batadv_print_vid(tt_common_entry->vid),
- orig_entry->orig_node->orig);
- continue;
- }
-
- last_ttvn = atomic_read(&orig_entry->orig_node->last_ttvn);
- seq_printf(seq,
- " %c %pM %4d (%3u) via %pM (%3u) (%#.8x) [%c%c%c%c]\n",
- '+', tt_global_entry->common.addr,
- batadv_print_vid(tt_global_entry->common.vid),
- orig_entry->ttvn, orig_entry->orig_node->orig,
- last_ttvn, vlan->tt.crc,
- ((flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'),
- ((flags & BATADV_TT_CLIENT_WIFI) ? 'W' : '.'),
- ((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'),
- ((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.'));
-
- batadv_orig_node_vlan_put(vlan);
- }
-}
-
-/**
- * batadv_tt_global_seq_print_text() - Print the global tt table in a seq file
- * @seq: seq file to print on
- * @offset: not used
- *
- * Return: always 0
- */
-int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset)
-{
- struct net_device *net_dev = (struct net_device *)seq->private;
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
- struct batadv_hashtable *hash = bat_priv->tt.global_hash;
- struct batadv_tt_common_entry *tt_common_entry;
- struct batadv_tt_global_entry *tt_global;
- struct batadv_hard_iface *primary_if;
- struct hlist_head *head;
- u32 i;
-
- primary_if = batadv_seq_print_text_primary_if_get(seq);
- if (!primary_if)
- goto out;
-
- seq_printf(seq,
- "Globally announced TT entries received via the mesh %s\n",
- net_dev->name);
- seq_puts(seq,
- " Client VID (TTVN) Originator (Curr TTVN) (CRC ) Flags\n");
-
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(tt_common_entry,
- head, hash_entry) {
- tt_global = container_of(tt_common_entry,
- struct batadv_tt_global_entry,
- common);
- batadv_tt_global_print_entry(bat_priv, tt_global, seq);
- }
- rcu_read_unlock();
- }
-out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- return 0;
-}
-#endif
-
/**
* batadv_tt_global_dump_subentry() - Dump all TT local entries into a message
* @msg: Netlink message to dump into
@@ -2070,7 +1809,7 @@ batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq,
* @msg: Netlink message to dump into
* @portid: Port making netlink request
* @seq: Sequence number of netlink message
- * @bat_priv: The bat priv with all the soft interface information
+ * @bat_priv: The bat priv with all the mesh interface information
* @common: tt local & tt global common data
* @sub_s: Number of entries to skip
*
@@ -2115,7 +1854,7 @@ batadv_tt_global_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
* @msg: Netlink message to dump into
* @portid: Port making netlink request
* @seq: Sequence number of netlink message
- * @bat_priv: The bat priv with all the soft interface information
+ * @bat_priv: The bat priv with all the mesh interface information
* @head: Pointer to the list containing the global tt entries
* @idx_s: Number of entries to skip
* @sub: Number of entries to skip
@@ -2158,30 +1897,22 @@ batadv_tt_global_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
*/
int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
- struct net *net = sock_net(cb->skb->sk);
- struct net_device *soft_iface;
+ struct net_device *mesh_iface;
struct batadv_priv *bat_priv;
struct batadv_hard_iface *primary_if = NULL;
struct batadv_hashtable *hash;
struct hlist_head *head;
int ret;
- int ifindex;
int bucket = cb->args[0];
int idx = cb->args[1];
int sub = cb->args[2];
int portid = NETLINK_CB(cb->skb).portid;
- ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
+ mesh_iface = batadv_netlink_get_meshif(cb);
+ if (IS_ERR(mesh_iface))
+ return PTR_ERR(mesh_iface);
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
-
- bat_priv = netdev_priv(soft_iface);
+ bat_priv = netdev_priv(mesh_iface);
primary_if = batadv_primary_if_get_selected(bat_priv);
if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
@@ -2205,10 +1936,8 @@ int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb)
ret = msg->len;
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
- if (soft_iface)
- dev_put(soft_iface);
+ batadv_hardif_put(primary_if);
+ dev_put(mesh_iface);
cb->args[0] = bucket;
cb->args[1] = idx;
@@ -2261,7 +1990,7 @@ batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry)
/**
* batadv_tt_global_del_orig_node() - remove orig_node from a global tt entry
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @tt_global_entry: the global entry to remove the orig_node from
* @orig_node: the originator announcing the client
* @message: message to append to the log on deletion
@@ -2340,7 +2069,7 @@ batadv_tt_global_del_roaming(struct batadv_priv *bat_priv,
/**
* batadv_tt_global_del() - remove a client from the global table
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: an originator serving this client
* @addr: the mac address of the client
* @vid: VLAN identifier
@@ -2398,16 +2127,14 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv,
}
out:
- if (tt_global_entry)
- batadv_tt_global_entry_put(tt_global_entry);
- if (local_entry)
- batadv_tt_local_entry_put(local_entry);
+ batadv_tt_global_entry_put(tt_global_entry);
+ batadv_tt_local_entry_put(local_entry);
}
/**
* batadv_tt_global_del_orig() - remove all the TT global entries belonging to
* the given originator matching the provided vid
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: the originator owning the entries to remove
* @match_vid: the VLAN identifier to match. If negative all the entries will be
* removed
@@ -2578,7 +2305,7 @@ _batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry,
/**
* batadv_transtable_search() - get the mesh destination for a given client
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @src: mac address of the source client
* @addr: mac address of the destination client
* @vid: VLAN identifier
@@ -2628,10 +2355,8 @@ struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv,
rcu_read_unlock();
out:
- if (tt_global_entry)
- batadv_tt_global_entry_put(tt_global_entry);
- if (tt_local_entry)
- batadv_tt_local_entry_put(tt_local_entry);
+ batadv_tt_global_entry_put(tt_global_entry);
+ batadv_tt_local_entry_put(tt_local_entry);
return orig_node;
}
@@ -2639,7 +2364,7 @@ out:
/**
* batadv_tt_global_crc() - calculates the checksum of the local table belonging
* to the given orig_node
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: originator for which the CRC should be computed
* @vid: VLAN identifier for which the CRC32 has to be computed
*
@@ -2733,7 +2458,7 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv,
/**
* batadv_tt_local_crc() - calculates the checksum of the local table
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @vid: VLAN identifier for which the CRC32 has to be computed
*
* For details about the computation, please refer to the documentation for
@@ -2808,6 +2533,9 @@ static void batadv_tt_req_node_release(struct kref *ref)
*/
static void batadv_tt_req_node_put(struct batadv_tt_req_node *tt_req_node)
{
+ if (!tt_req_node)
+ return;
+
kref_put(&tt_req_node->refcount, batadv_tt_req_node_release);
}
@@ -2865,7 +2593,7 @@ static void batadv_tt_req_purge(struct batadv_priv *bat_priv)
/**
* batadv_tt_req_node_new() - search and possibly create a tt_req_node object
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: orig node this request is being issued for
*
* Return: the pointer to the new tt_req_node struct if no request
@@ -2961,7 +2689,7 @@ static bool batadv_tt_global_valid(const void *entry_ptr,
/**
* batadv_tt_tvlv_generate() - fill the tvlv buff with the tt entries from the
* specified tt hash
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @hash: hash table containing the tt entries
* @tt_len: expected tvlv tt data buffer length in number of bytes
* @tvlv_buff: pointer to the buffer to fill with the TT data
@@ -2970,14 +2698,16 @@ static bool batadv_tt_global_valid(const void *entry_ptr,
*
* Fills the tvlv buff with the tt entries from the specified hash. If valid_cb
* is not provided then this becomes a no-op.
+ *
+ * Return: Remaining unused length in tvlv_buff.
*/
-static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv,
- struct batadv_hashtable *hash,
- void *tvlv_buff, u16 tt_len,
- bool (*valid_cb)(const void *,
- const void *,
- u8 *flags),
- void *cb_data)
+static u16 batadv_tt_tvlv_generate(struct batadv_priv *bat_priv,
+ struct batadv_hashtable *hash,
+ void *tvlv_buff, u16 tt_len,
+ bool (*valid_cb)(const void *,
+ const void *,
+ u8 *flags),
+ void *cb_data)
{
struct batadv_tt_common_entry *tt_common_entry;
struct batadv_tvlv_tt_change *tt_change;
@@ -2988,10 +2718,10 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv,
u32 i;
tt_tot = batadv_tt_entries(tt_len);
- tt_change = (struct batadv_tvlv_tt_change *)tvlv_buff;
+ tt_change = tvlv_buff;
if (!valid_cb)
- return;
+ return tt_len;
rcu_read_lock();
for (i = 0; i < hash->size; i++) {
@@ -3017,6 +2747,8 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv,
}
}
rcu_read_unlock();
+
+ return batadv_tt_len(tt_tot - tt_num_entries);
}
/**
@@ -3078,15 +2810,15 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node,
/**
* batadv_tt_local_update_crc() - update all the local CRCs
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
static void batadv_tt_local_update_crc(struct batadv_priv *bat_priv)
{
- struct batadv_softif_vlan *vlan;
+ struct batadv_meshif_vlan *vlan;
/* recompute the global CRC for each VLAN */
rcu_read_lock();
- hlist_for_each_entry_rcu(vlan, &bat_priv->softif_vlan_list, list) {
+ hlist_for_each_entry_rcu(vlan, &bat_priv->meshif_vlan_list, list) {
vlan->tt.crc = batadv_tt_local_crc(bat_priv, vlan->vid);
}
rcu_read_unlock();
@@ -3094,7 +2826,7 @@ static void batadv_tt_local_update_crc(struct batadv_priv *bat_priv)
/**
* batadv_tt_global_update_crc() - update all the global CRCs for this orig_node
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: the orig_node for which the CRCs have to be updated
*/
static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv,
@@ -3121,7 +2853,7 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv,
/**
* batadv_send_tt_request() - send a TT Request message to a given node
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @dst_orig_node: the destination of the message
* @ttvn: the version number that the source of the message is looking for
* @tt_vlan: pointer to the first tvlv VLAN object to request
@@ -3139,7 +2871,6 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv,
{
struct batadv_tvlv_tt_data *tvlv_tt_data = NULL;
struct batadv_tt_req_node *tt_req_node = NULL;
- struct batadv_tvlv_tt_vlan_data *tt_vlan_req;
struct batadv_hard_iface *primary_if;
bool ret = false;
int i, size;
@@ -3155,7 +2886,7 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv,
if (!tt_req_node)
goto out;
- size = sizeof(*tvlv_tt_data) + sizeof(*tt_vlan_req) * num_vlan;
+ size = struct_size(tvlv_tt_data, vlan_data, num_vlan);
tvlv_tt_data = kzalloc(size, GFP_ATOMIC);
if (!tvlv_tt_data)
goto out;
@@ -3167,12 +2898,10 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv,
/* send all the CRCs within the request. This is needed by intermediate
* nodes to ensure they have the correct table before replying
*/
- tt_vlan_req = (struct batadv_tvlv_tt_vlan_data *)(tvlv_tt_data + 1);
for (i = 0; i < num_vlan; i++) {
- tt_vlan_req->vid = tt_vlan->vid;
- tt_vlan_req->crc = tt_vlan->crc;
+ tvlv_tt_data->vlan_data[i].vid = tt_vlan->vid;
+ tvlv_tt_data->vlan_data[i].crc = tt_vlan->crc;
- tt_vlan_req++;
tt_vlan++;
}
@@ -3189,8 +2918,7 @@ static bool batadv_send_tt_request(struct batadv_priv *bat_priv,
ret = true;
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
if (ret && tt_req_node) {
spin_lock_bh(&bat_priv->tt.req_list_lock);
@@ -3201,8 +2929,7 @@ out:
spin_unlock_bh(&bat_priv->tt.req_list_lock);
}
- if (tt_req_node)
- batadv_tt_req_node_put(tt_req_node);
+ batadv_tt_req_node_put(tt_req_node);
kfree(tvlv_tt_data);
return ret;
@@ -3211,7 +2938,7 @@ out:
/**
* batadv_send_other_tt_response() - send reply to tt request concerning another
* node's translation table
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @tt_data: tt data containing the tt request information
* @req_src: mac address of tt request sender
* @req_dst: mac address of tt request recipient
@@ -3226,7 +2953,6 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv,
struct batadv_orig_node *res_dst_orig_node = NULL;
struct batadv_tvlv_tt_change *tt_change;
struct batadv_tvlv_tt_data *tvlv_tt_data = NULL;
- struct batadv_tvlv_tt_vlan_data *tt_vlan;
bool ret = false, full_table;
u8 orig_ttvn, req_ttvn;
u16 tvlv_len;
@@ -3249,10 +2975,9 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv,
orig_ttvn = (u8)atomic_read(&req_dst_orig_node->last_ttvn);
req_ttvn = tt_data->ttvn;
- tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1);
/* this node doesn't have the requested data */
if (orig_ttvn != req_ttvn ||
- !batadv_tt_global_check_crc(req_dst_orig_node, tt_vlan,
+ !batadv_tt_global_check_crc(req_dst_orig_node, tt_data->vlan_data,
ntohs(tt_data->num_vlan)))
goto out;
@@ -3294,16 +3019,17 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv,
goto out;
/* fill the rest of the tvlv with the real TT entries */
- batadv_tt_tvlv_generate(bat_priv, bat_priv->tt.global_hash,
- tt_change, tt_len,
- batadv_tt_global_valid,
- req_dst_orig_node);
+ tvlv_len -= batadv_tt_tvlv_generate(bat_priv,
+ bat_priv->tt.global_hash,
+ tt_change, tt_len,
+ batadv_tt_global_valid,
+ req_dst_orig_node);
}
/* Don't send the response, if larger than fragmented packet. */
tt_len = sizeof(struct batadv_unicast_tvlv_packet) + tvlv_len;
if (tt_len > atomic_read(&bat_priv->packet_size_max)) {
- net_ratelimited_function(batadv_info, bat_priv->soft_iface,
+ net_ratelimited_function(batadv_info, bat_priv->mesh_iface,
"Ignoring TT_REQUEST from %pM; Response size exceeds max packet size.\n",
res_dst_orig_node->orig);
goto out;
@@ -3333,10 +3059,8 @@ unlock:
spin_unlock_bh(&req_dst_orig_node->tt_buff_lock);
out:
- if (res_dst_orig_node)
- batadv_orig_node_put(res_dst_orig_node);
- if (req_dst_orig_node)
- batadv_orig_node_put(req_dst_orig_node);
+ batadv_orig_node_put(res_dst_orig_node);
+ batadv_orig_node_put(req_dst_orig_node);
kfree(tvlv_tt_data);
return ret;
}
@@ -3344,7 +3068,7 @@ out:
/**
* batadv_send_my_tt_response() - send reply to tt request concerning this
* node's translation table
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @tt_data: tt data containing the tt request information
* @req_src: mac address of tt request sender
*
@@ -3423,9 +3147,11 @@ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv,
goto out;
/* fill the rest of the tvlv with the real TT entries */
- batadv_tt_tvlv_generate(bat_priv, bat_priv->tt.local_hash,
- tt_change, tt_len,
- batadv_tt_local_valid, NULL);
+ tvlv_len -= batadv_tt_tvlv_generate(bat_priv,
+ bat_priv->tt.local_hash,
+ tt_change, tt_len,
+ batadv_tt_local_valid,
+ NULL);
}
tvlv_tt_data->flags = BATADV_TT_RESPONSE;
@@ -3450,10 +3176,8 @@ unlock:
spin_unlock_bh(&bat_priv->tt.last_changeset_lock);
out:
spin_unlock_bh(&bat_priv->tt.commit_lock);
- if (orig_node)
- batadv_orig_node_put(orig_node);
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_orig_node_put(orig_node);
+ batadv_hardif_put(primary_if);
kfree(tvlv_tt_data);
/* The packet was for this host, so it doesn't need to be re-routed */
return true;
@@ -3461,7 +3185,7 @@ out:
/**
* batadv_send_tt_response() - send reply to tt request
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @tt_data: tt data containing the tt request information
* @req_src: mac address of tt request sender
* @req_dst: mac address of tt request recipient
@@ -3538,8 +3262,7 @@ static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv,
atomic_set(&orig_node->last_ttvn, ttvn);
out:
- if (orig_node)
- batadv_orig_node_put(orig_node);
+ batadv_orig_node_put(orig_node);
}
static void batadv_tt_update_changes(struct batadv_priv *bat_priv,
@@ -3557,7 +3280,7 @@ static void batadv_tt_update_changes(struct batadv_priv *bat_priv,
/**
* batadv_is_my_client() - check if a client is served by the local node
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @addr: the mac address of the client to check
* @vid: VLAN identifier
*
@@ -3580,14 +3303,13 @@ bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr,
goto out;
ret = true;
out:
- if (tt_local_entry)
- batadv_tt_local_entry_put(tt_local_entry);
+ batadv_tt_local_entry_put(tt_local_entry);
return ret;
}
/**
* batadv_handle_tt_response() - process incoming tt reply
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @tt_data: tt data containing the tt request information
* @resp_src: mac address of tt reply sender
* @num_entries: number of tt change entries appended to the tt data
@@ -3601,7 +3323,6 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node = NULL;
struct batadv_tvlv_tt_change *tt_change;
u8 *tvlv_ptr = (u8 *)tt_data;
- u16 change_offset;
batadv_dbg(BATADV_DBG_TT, bat_priv,
"Received TT_RESPONSE from %pM for ttvn %d t_size: %d [%c]\n",
@@ -3614,10 +3335,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv,
spin_lock_bh(&orig_node->tt_lock);
- change_offset = sizeof(struct batadv_tvlv_tt_vlan_data);
- change_offset *= ntohs(tt_data->num_vlan);
- change_offset += sizeof(*tt_data);
- tvlv_ptr += change_offset;
+ tvlv_ptr += struct_size(tt_data, vlan_data, ntohs(tt_data->num_vlan));
tt_change = (struct batadv_tvlv_tt_change *)tvlv_ptr;
if (tt_data->flags & BATADV_TT_FULL_TABLE) {
@@ -3644,8 +3362,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv,
spin_unlock_bh(&bat_priv->tt.req_list_lock);
out:
- if (orig_node)
- batadv_orig_node_put(orig_node);
+ batadv_orig_node_put(orig_node);
}
static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv)
@@ -3680,7 +3397,7 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv)
/**
* batadv_tt_check_roam_count() - check if a client has roamed too frequently
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @client: mac address of the roaming client
*
* This function checks whether the client already reached the
@@ -3735,7 +3452,7 @@ unlock:
/**
* batadv_send_roam_adv() - send a roaming advertisement message
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @client: mac address of the roaming client
* @vid: VLAN identifier
* @orig_node: message destination
@@ -3776,8 +3493,7 @@ static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client,
&tvlv_roam, sizeof(tvlv_roam));
out:
- if (primary_if)
- batadv_hardif_put(primary_if);
+ batadv_hardif_put(primary_if);
}
static void batadv_tt_purge(struct work_struct *work)
@@ -3800,11 +3516,13 @@ static void batadv_tt_purge(struct work_struct *work)
}
/**
- * batadv_tt_free() - Free translation table of soft interface
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_tt_free() - Free translation table of mesh interface
+ * @bat_priv: the bat priv with all the mesh interface information
*/
void batadv_tt_free(struct batadv_priv *bat_priv)
{
+ batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_ROAM, 1);
+
batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_TT, 1);
batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_TT, 1);
@@ -3822,7 +3540,7 @@ void batadv_tt_free(struct batadv_priv *bat_priv)
/**
* batadv_tt_local_set_flags() - set or unset the specified flags on the local
* table and possibly count them in the TT size
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @flags: the flag to switch
* @enable: whether to set or unset the flag
* @count: whether to increase the TT size by the number of changed entries
@@ -3908,7 +3626,7 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv)
/**
* batadv_tt_local_commit_changes_nolock() - commit all pending local tt changes
* which have been queued in the time since the last commit
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
* Caller must hold tt->commit_lock.
*/
@@ -3916,7 +3634,7 @@ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv)
{
lockdep_assert_held(&bat_priv->tt.commit_lock);
- if (atomic_read(&bat_priv->tt.local_changes) < 1) {
+ if (READ_ONCE(bat_priv->tt.local_changes) == 0) {
if (!batadv_atomic_dec_not_zero(&bat_priv->tt.ogm_append_cnt))
batadv_tt_tvlv_container_update(bat_priv);
return;
@@ -3941,7 +3659,7 @@ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv)
/**
* batadv_tt_local_commit_changes() - commit all pending local tt changes which
* have been queued in the time since the last commit
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*/
void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv)
{
@@ -3952,7 +3670,7 @@ void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv)
/**
* batadv_is_ap_isolated() - Check if packet from upper layer should be dropped
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @src: source mac address of packet
* @dst: destination mac address of packet
* @vid: vlan id of packet
@@ -3964,10 +3682,10 @@ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
{
struct batadv_tt_local_entry *tt_local_entry;
struct batadv_tt_global_entry *tt_global_entry;
- struct batadv_softif_vlan *vlan;
+ struct batadv_meshif_vlan *vlan;
bool ret = false;
- vlan = batadv_softif_vlan_get(bat_priv, vid);
+ vlan = batadv_meshif_vlan_get(bat_priv, vid);
if (!vlan)
return false;
@@ -3989,14 +3707,14 @@ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
local_entry_put:
batadv_tt_local_entry_put(tt_local_entry);
vlan_put:
- batadv_softif_vlan_put(vlan);
+ batadv_meshif_vlan_put(vlan);
return ret;
}
/**
* batadv_tt_update_orig() - update global translation table with new tt
* information received via ogms
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: the orig_node of the ogm
* @tt_buff: pointer to the first tvlv VLAN entry
* @tt_num_vlan: number of tvlv VLAN entries
@@ -4080,7 +3798,7 @@ request_table:
/**
* batadv_tt_global_client_is_roaming() - check if a client is marked as roaming
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @addr: the mac address of the client to check
* @vid: VLAN identifier
*
@@ -4106,7 +3824,7 @@ out:
/**
* batadv_tt_local_client_is_roaming() - tells whether the client is roaming
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @addr: the mac address of the local client to query
* @vid: VLAN identifier
*
@@ -4132,7 +3850,7 @@ out:
/**
* batadv_tt_add_temporary_global_entry() - Add temporary entry to global TT
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig_node: orig node which the temporary entry should be associated with
* @addr: mac address of the client
* @vid: VLAN id of the new temporary global translation table
@@ -4165,21 +3883,21 @@ bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv,
/**
* batadv_tt_local_resize_to_mtu() - resize the local translation table fit the
* maximum packet size that can be transported through the mesh
- * @soft_iface: netdev struct of the mesh interface
+ * @mesh_iface: netdev struct of the mesh interface
*
* Remove entries older than 'timeout' and half timeout if more entries need
* to be removed.
*/
-void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface)
+void batadv_tt_local_resize_to_mtu(struct net_device *mesh_iface)
{
- struct batadv_priv *bat_priv = netdev_priv(soft_iface);
+ struct batadv_priv *bat_priv = netdev_priv(mesh_iface);
int packet_size_max = atomic_read(&bat_priv->packet_size_max);
int table_size, timeout = BATADV_TT_LOCAL_TIMEOUT / 2;
bool reduced = false;
spin_lock_bh(&bat_priv->tt.commit_lock);
- while (true) {
+ while (timeout) {
table_size = batadv_tt_local_table_transmit_size(bat_priv);
if (packet_size_max >= table_size)
break;
@@ -4189,7 +3907,7 @@ void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface)
timeout /= 2;
reduced = true;
- net_ratelimited_function(batadv_info, soft_iface,
+ net_ratelimited_function(batadv_info, mesh_iface,
"Forced to purge local tt entries to fit new maximum fragment MTU (%i)\n",
packet_size_max);
}
@@ -4205,7 +3923,7 @@ void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface)
/**
* batadv_tt_tvlv_ogm_handler_v1() - process incoming tt tvlv container
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @orig: the orig_node of the ogm
* @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
* @tvlv_value: tvlv buffer containing the gateway data
@@ -4216,36 +3934,35 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
u8 flags, void *tvlv_value,
u16 tvlv_value_len)
{
- struct batadv_tvlv_tt_vlan_data *tt_vlan;
struct batadv_tvlv_tt_change *tt_change;
struct batadv_tvlv_tt_data *tt_data;
u16 num_entries, num_vlan;
+ size_t tt_data_sz;
if (tvlv_value_len < sizeof(*tt_data))
return;
- tt_data = (struct batadv_tvlv_tt_data *)tvlv_value;
- tvlv_value_len -= sizeof(*tt_data);
-
+ tt_data = tvlv_value;
num_vlan = ntohs(tt_data->num_vlan);
- if (tvlv_value_len < sizeof(*tt_vlan) * num_vlan)
+ tt_data_sz = struct_size(tt_data, vlan_data, num_vlan);
+ if (tvlv_value_len < tt_data_sz)
return;
- tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1);
- tt_change = (struct batadv_tvlv_tt_change *)(tt_vlan + num_vlan);
- tvlv_value_len -= sizeof(*tt_vlan) * num_vlan;
+ tt_change = (struct batadv_tvlv_tt_change *)((void *)tt_data
+ + tt_data_sz);
+ tvlv_value_len -= tt_data_sz;
num_entries = batadv_tt_entries(tvlv_value_len);
- batadv_tt_update_orig(bat_priv, orig, tt_vlan, num_vlan, tt_change,
- num_entries, tt_data->ttvn);
+ batadv_tt_update_orig(bat_priv, orig, tt_data->vlan_data, num_vlan,
+ tt_change, num_entries, tt_data->ttvn);
}
/**
* batadv_tt_tvlv_unicast_handler_v1() - process incoming (unicast) tt tvlv
* container
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @src: mac address of tt tvlv sender
* @dst: mac address of tt tvlv recipient
* @tvlv_value: tvlv buffer containing the tt data
@@ -4267,11 +3984,11 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
if (tvlv_value_len < sizeof(*tt_data))
return NET_RX_SUCCESS;
- tt_data = (struct batadv_tvlv_tt_data *)tvlv_value;
+ tt_data = tvlv_value;
tvlv_value_len -= sizeof(*tt_data);
- tt_vlan_len = sizeof(struct batadv_tvlv_tt_vlan_data);
- tt_vlan_len *= ntohs(tt_data->num_vlan);
+ tt_vlan_len = flex_array_size(tt_data, vlan_data,
+ ntohs(tt_data->num_vlan));
if (tvlv_value_len < tt_vlan_len)
return NET_RX_SUCCESS;
@@ -4327,7 +4044,7 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
/**
* batadv_roam_tvlv_unicast_handler_v1() - process incoming tt roam tvlv
* container
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @src: mac address of tt tvlv sender
* @dst: mac address of tt tvlv recipient
* @tvlv_value: tvlv buffer containing the tt data
@@ -4359,7 +4076,7 @@ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
goto out;
batadv_inc_counter(bat_priv, BATADV_CNT_TT_ROAM_ADV_RX);
- roaming_adv = (struct batadv_tvlv_roam_adv *)tvlv_value;
+ roaming_adv = tvlv_value;
batadv_dbg(BATADV_DBG_TT, bat_priv,
"Received ROAMING_ADV from %pM (client %pM)\n",
@@ -4370,14 +4087,13 @@ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
atomic_read(&orig_node->last_ttvn) + 1);
out:
- if (orig_node)
- batadv_orig_node_put(orig_node);
+ batadv_orig_node_put(orig_node);
return NET_RX_SUCCESS;
}
/**
* batadv_tt_init() - initialise the translation table internals
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
* Return: 0 on success or negative error number in case of failure.
*/
@@ -4393,15 +4109,17 @@ int batadv_tt_init(struct batadv_priv *bat_priv)
return ret;
ret = batadv_tt_global_init(bat_priv);
- if (ret < 0)
+ if (ret < 0) {
+ batadv_tt_local_table_free(bat_priv);
return ret;
+ }
batadv_tvlv_handler_register(bat_priv, batadv_tt_tvlv_ogm_handler_v1,
- batadv_tt_tvlv_unicast_handler_v1,
+ batadv_tt_tvlv_unicast_handler_v1, NULL,
BATADV_TVLV_TT, 1, BATADV_NO_FLAGS);
batadv_tvlv_handler_register(bat_priv, NULL,
- batadv_roam_tvlv_unicast_handler_v1,
+ batadv_roam_tvlv_unicast_handler_v1, NULL,
BATADV_TVLV_ROAM, 1, BATADV_NO_FLAGS);
INIT_DELAYED_WORK(&bat_priv->tt.work, batadv_tt_purge);
@@ -4413,7 +4131,7 @@ int batadv_tt_init(struct batadv_priv *bat_priv)
/**
* batadv_tt_global_is_isolated() - check if a client is marked as isolated
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @addr: the mac address of the client
* @vid: the identifier of the VLAN where this client is connected
*
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index 01b6c8eafaf9..618d9dbca5ea 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich, Antonio Quartulli
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_TRANSLATION_TABLE_H_
@@ -21,26 +9,27 @@
#include "main.h"
+#include <linux/kref.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <linux/skbuff.h>
#include <linux/types.h>
-struct netlink_callback;
-struct net_device;
-struct seq_file;
-struct sk_buff;
-
int batadv_tt_init(struct batadv_priv *bat_priv);
-bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
+bool batadv_tt_local_add(struct net_device *mesh_iface, const u8 *addr,
unsigned short vid, int ifindex, u32 mark);
u16 batadv_tt_local_remove(struct batadv_priv *bat_priv,
const u8 *addr, unsigned short vid,
const char *message, bool roaming);
-int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset);
-int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset);
int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb);
int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb);
void batadv_tt_global_del_orig(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
s32 match_vid, const char *message);
+struct batadv_tt_global_entry *
+batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
+ unsigned short vid);
+void batadv_tt_global_entry_release(struct kref *ref);
int batadv_tt_global_hash_count(struct batadv_priv *bat_priv,
const u8 *addr, unsigned short vid);
struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv,
@@ -56,7 +45,7 @@ bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv,
u8 *addr, unsigned short vid);
bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv,
u8 *addr, unsigned short vid);
-void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface);
+void batadv_tt_local_resize_to_mtu(struct net_device *mesh_iface);
bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
const unsigned char *addr,
@@ -67,4 +56,19 @@ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv,
int batadv_tt_cache_init(void);
void batadv_tt_cache_destroy(void);
+/**
+ * batadv_tt_global_entry_put() - decrement the tt_global_entry refcounter and
+ * possibly release it
+ * @tt_global_entry: tt_global_entry to be free'd
+ */
+static inline void
+batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry)
+{
+ if (!tt_global_entry)
+ return;
+
+ kref_put(&tt_global_entry->common.refcount,
+ batadv_tt_global_entry_release);
+}
+
#endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */
diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c
index 40e69c9346d2..76dff1f9c559 100644
--- a/net/batman-adv/tvlv.c
+++ b/net/batman-adv/tvlv.c
@@ -1,28 +1,16 @@
// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include "main.h"
#include <linux/byteorder/generic.h>
+#include <linux/container_of.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
#include <linux/if_ether.h>
-#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
@@ -62,13 +50,16 @@ static void batadv_tvlv_handler_release(struct kref *ref)
*/
static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler)
{
+ if (!tvlv_handler)
+ return;
+
kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release);
}
/**
* batadv_tvlv_handler_get() - retrieve tvlv handler from the tvlv handler list
* based on the provided type and version (both need to match)
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @type: tvlv handler type to look for
* @version: tvlv handler version to look for
*
@@ -118,13 +109,16 @@ static void batadv_tvlv_container_release(struct kref *ref)
*/
static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv)
{
+ if (!tvlv)
+ return;
+
kref_put(&tvlv->refcount, batadv_tvlv_container_release);
}
/**
* batadv_tvlv_container_get() - retrieve tvlv container from the tvlv container
* list based on the provided type and version (both need to match)
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @type: tvlv container type to look for
* @version: tvlv container version to look for
*
@@ -158,7 +152,7 @@ batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version)
/**
* batadv_tvlv_container_list_size() - calculate the size of the tvlv container
* list entries
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
*
* Has to be called with the appropriate locks being acquired
* (tvlv.container_list_lock).
@@ -183,7 +177,7 @@ static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv)
/**
* batadv_tvlv_container_remove() - remove tvlv container from the tvlv
* container list
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @tvlv: the to be removed tvlv container
*
* Has to be called with the appropriate locks being acquired
@@ -207,7 +201,7 @@ static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv,
/**
* batadv_tvlv_container_unregister() - unregister tvlv container based on the
* provided type and version (both need to match)
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @type: tvlv container type to unregister
* @version: tvlv container type to unregister
*/
@@ -225,7 +219,7 @@ void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv,
/**
* batadv_tvlv_container_register() - register tvlv type, version and content
* to be propagated with each (primary interface) OGM
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @type: tvlv container type
* @version: tvlv container version
* @tvlv_value: tvlv container content
@@ -303,7 +297,7 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff,
/**
* batadv_tvlv_container_ogm_append() - append tvlv container content to given
* OGM packet buffer
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @packet_buff: ogm packet buffer
* @packet_buff_len: ogm packet buffer size including ogm header and tvlv
* content
@@ -356,29 +350,33 @@ end:
/**
* batadv_tvlv_call_handler() - parse the given tvlv buffer to call the
* appropriate handlers
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @tvlv_handler: tvlv callback function handling the tvlv content
- * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet
+ * @packet_type: indicates for which packet type the TVLV handler is called
* @orig_node: orig node emitting the ogm packet
- * @src: source mac address of the unicast packet
- * @dst: destination mac address of the unicast packet
+ * @skb: the skb the TVLV handler is called for
* @tvlv_value: tvlv content
* @tvlv_value_len: tvlv content length
*
- * Return: success if handler was not found or the return value of the handler
- * callback.
+ * Return: success if the handler was not found or the return value of the
+ * handler callback.
*/
static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv,
struct batadv_tvlv_handler *tvlv_handler,
- bool ogm_source,
+ u8 packet_type,
struct batadv_orig_node *orig_node,
- u8 *src, u8 *dst,
- void *tvlv_value, u16 tvlv_value_len)
+ struct sk_buff *skb, void *tvlv_value,
+ u16 tvlv_value_len)
{
+ unsigned int tvlv_offset;
+ u8 *src, *dst;
+
if (!tvlv_handler)
return NET_RX_SUCCESS;
- if (ogm_source) {
+ switch (packet_type) {
+ case BATADV_IV_OGM:
+ case BATADV_OGM2:
if (!tvlv_handler->ogm_handler)
return NET_RX_SUCCESS;
@@ -389,19 +387,32 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv,
BATADV_NO_FLAGS,
tvlv_value, tvlv_value_len);
tvlv_handler->flags |= BATADV_TVLV_HANDLER_OGM_CALLED;
- } else {
- if (!src)
- return NET_RX_SUCCESS;
-
- if (!dst)
+ break;
+ case BATADV_UNICAST_TVLV:
+ if (!skb)
return NET_RX_SUCCESS;
if (!tvlv_handler->unicast_handler)
return NET_RX_SUCCESS;
+ src = ((struct batadv_unicast_tvlv_packet *)skb->data)->src;
+ dst = ((struct batadv_unicast_tvlv_packet *)skb->data)->dst;
+
return tvlv_handler->unicast_handler(bat_priv, src,
dst, tvlv_value,
tvlv_value_len);
+ case BATADV_MCAST:
+ if (!skb)
+ return NET_RX_SUCCESS;
+
+ if (!tvlv_handler->mcast_handler)
+ return NET_RX_SUCCESS;
+
+ tvlv_offset = (unsigned char *)tvlv_value - skb->data;
+ skb_set_network_header(skb, tvlv_offset);
+ skb_set_transport_header(skb, tvlv_offset + tvlv_value_len);
+
+ return tvlv_handler->mcast_handler(bat_priv, skb);
}
return NET_RX_SUCCESS;
@@ -410,11 +421,10 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv,
/**
* batadv_tvlv_containers_process() - parse the given tvlv buffer to call the
* appropriate handlers
- * @bat_priv: the bat priv with all the soft interface information
- * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet
+ * @bat_priv: the bat priv with all the mesh interface information
+ * @packet_type: indicates for which packet type the TVLV handler is called
* @orig_node: orig node emitting the ogm packet
- * @src: source mac address of the unicast packet
- * @dst: destination mac address of the unicast packet
+ * @skb: the skb the TVLV handler is called for
* @tvlv_value: tvlv content
* @tvlv_value_len: tvlv content length
*
@@ -422,10 +432,10 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv,
* handler callbacks.
*/
int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
- bool ogm_source,
+ u8 packet_type,
struct batadv_orig_node *orig_node,
- u8 *src, u8 *dst,
- void *tvlv_value, u16 tvlv_value_len)
+ struct sk_buff *skb, void *tvlv_value,
+ u16 tvlv_value_len)
{
struct batadv_tvlv_handler *tvlv_handler;
struct batadv_tvlv_hdr *tvlv_hdr;
@@ -447,21 +457,24 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
tvlv_hdr->version);
ret |= batadv_tvlv_call_handler(bat_priv, tvlv_handler,
- ogm_source, orig_node,
- src, dst, tvlv_value,
+ packet_type, orig_node, skb,
+ tvlv_value,
tvlv_value_cont_len);
- if (tvlv_handler)
- batadv_tvlv_handler_put(tvlv_handler);
+ batadv_tvlv_handler_put(tvlv_handler);
tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len;
tvlv_value_len -= tvlv_value_cont_len;
}
- if (!ogm_source)
+ if (packet_type != BATADV_IV_OGM &&
+ packet_type != BATADV_OGM2)
return ret;
rcu_read_lock();
hlist_for_each_entry_rcu(tvlv_handler,
&bat_priv->tvlv.handler_list, list) {
+ if (!tvlv_handler->ogm_handler)
+ continue;
+
if ((tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) &&
!(tvlv_handler->flags & BATADV_TVLV_HANDLER_OGM_CALLED))
tvlv_handler->ogm_handler(bat_priv, orig_node,
@@ -477,7 +490,7 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
/**
* batadv_tvlv_ogm_receive() - process an incoming ogm and call the appropriate
* handlers
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @batadv_ogm_packet: ogm packet containing the tvlv containers
* @orig_node: orig node emitting the ogm packet
*/
@@ -497,7 +510,7 @@ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv,
tvlv_value = batadv_ogm_packet + 1;
- batadv_tvlv_containers_process(bat_priv, true, orig_node, NULL, NULL,
+ batadv_tvlv_containers_process(bat_priv, BATADV_IV_OGM, orig_node, NULL,
tvlv_value, tvlv_value_len);
}
@@ -505,12 +518,16 @@ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv,
* batadv_tvlv_handler_register() - register tvlv handler based on the provided
* type and version (both need to match) for ogm tvlv payload and/or unicast
* payload
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @optr: ogm tvlv handler callback function. This function receives the orig
* node, flags and the tvlv content as argument to process.
* @uptr: unicast tvlv handler callback function. This function receives the
* source & destination of the unicast packet as well as the tvlv content
* to process.
+ * @mptr: multicast packet tvlv handler callback function. This function
+ * receives the full skb to process, with the skb network header pointing
+ * to the current tvlv and the skb transport header pointing to the first
+ * byte after the current tvlv.
* @type: tvlv handler type to be registered
* @version: tvlv handler version to be registered
* @flags: flags to enable or disable TVLV API behavior
@@ -525,6 +542,8 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
u8 *src, u8 *dst,
void *tvlv_value,
u16 tvlv_value_len),
+ int (*mptr)(struct batadv_priv *bat_priv,
+ struct sk_buff *skb),
u8 type, u8 version, u8 flags)
{
struct batadv_tvlv_handler *tvlv_handler;
@@ -546,6 +565,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
tvlv_handler->ogm_handler = optr;
tvlv_handler->unicast_handler = uptr;
+ tvlv_handler->mcast_handler = mptr;
tvlv_handler->type = type;
tvlv_handler->version = version;
tvlv_handler->flags = flags;
@@ -563,7 +583,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
/**
* batadv_tvlv_handler_unregister() - unregister tvlv handler based on the
* provided type and version (both need to match)
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @type: tvlv handler type to be unregistered
* @version: tvlv handler version to be unregistered
*/
@@ -586,7 +606,7 @@ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
/**
* batadv_tvlv_unicast_send() - send a unicast packet with tvlv payload to the
* specified host
- * @bat_priv: the bat priv with all the soft interface information
+ * @bat_priv: the bat priv with all the mesh interface information
* @src: source mac address of the unicast packet
* @dst: destination mac address of the unicast packet
* @type: tvlv type
@@ -594,8 +614,8 @@ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
* @tvlv_value: tvlv content
* @tvlv_value_len: tvlv content length
*/
-void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
- u8 *dst, u8 type, u8 version,
+void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, const u8 *src,
+ const u8 *dst, u8 type, u8 version,
void *tvlv_value, u16 tvlv_value_len)
{
struct batadv_unicast_tvlv_packet *unicast_tvlv_packet;
diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h
index ef5867f49824..e5697230d991 100644
--- a/net/batman-adv/tvlv.h
+++ b/net/batman-adv/tvlv.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_TVLV_H_
@@ -21,9 +9,9 @@
#include "main.h"
+#include <linux/skbuff.h>
#include <linux/types.h>
-
-struct batadv_ogm_packet;
+#include <uapi/linux/batadv_packet.h>
void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
u8 type, u8 version,
@@ -47,16 +35,18 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
u8 *src, u8 *dst,
void *tvlv_value,
u16 tvlv_value_len),
+ int (*mptr)(struct batadv_priv *bat_priv,
+ struct sk_buff *skb),
u8 type, u8 version, u8 flags);
void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
u8 type, u8 version);
int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
- bool ogm_source,
+ u8 packet_type,
struct batadv_orig_node *orig_node,
- u8 *src, u8 *dst,
- void *tvlv_buff, u16 tvlv_buff_len);
-void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
- u8 *dst, u8 type, u8 version,
+ struct sk_buff *skb, void *tvlv_buff,
+ u16 tvlv_buff_len);
+void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, const u8 *src,
+ const u8 *dst, u8 type, u8 version,
void *tvlv_value, u16 tvlv_value_len);
#endif /* _NET_BATMAN_ADV_TVLV_H_ */
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 343d304851a5..8fc5fe0e9b05 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1,19 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors:
+/* Copyright (C) B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _NET_BATMAN_ADV_TYPES_H_
@@ -26,20 +14,22 @@
#include <linux/average.h>
#include <linux/bitops.h>
#include <linux/compiler.h>
+#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/kref.h>
+#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/sched.h> /* for linux/wait.h */
+#include <linux/skbuff.h>
#include <linux/spinlock.h>
+#include <linux/timer.h>
#include <linux/types.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <uapi/linux/batadv_packet.h>
#include <uapi/linux/batman_adv.h>
-struct seq_file;
-
#ifdef CONFIG_BATMAN_ADV_DAT
/**
@@ -91,6 +81,9 @@ struct batadv_hard_iface_bat_iv {
/** @ogm_seqno: OGM sequence number - used to identify each OGM */
atomic_t ogm_seqno;
+
+ /** @ogm_buff_mutex: lock protecting ogm_buff and ogm_buff_len */
+ struct mutex ogm_buff_mutex;
};
/**
@@ -127,6 +120,15 @@ struct batadv_hard_iface_bat_v {
/** @elp_wq: workqueue used to schedule ELP transmissions */
struct delayed_work elp_wq;
+ /** @aggr_wq: workqueue used to transmit queued OGM packets */
+ struct delayed_work aggr_wq;
+
+ /** @aggr_list: queue of OGM packets waiting to be aggregated */
+ struct sk_buff_head aggr_list;
+
+ /** @aggr_len: size of the OGM aggregate (excluding ethernet header) */
+ unsigned int aggr_len;
+
/**
* @throughput_override: throughput override to disable link
* auto-detection
@@ -167,9 +169,6 @@ struct batadv_hard_iface {
/** @list: list node for batadv_hardif_list */
struct list_head list;
- /** @if_num: identificator of the interface */
- unsigned int if_num;
-
/** @if_status: status of the interface for batman-adv */
char if_status;
@@ -187,8 +186,8 @@ struct batadv_hard_iface {
/** @net_dev: pointer to the net_device */
struct net_device *net_dev;
- /** @hardif_obj: kobject of the per interface sysfs "mesh" directory */
- struct kobject *hardif_obj;
+ /** @dev_tracker: device tracker for @net_dev */
+ netdevice_tracker dev_tracker;
/** @refcount: number of contexts the object is used */
struct kref refcount;
@@ -200,14 +199,23 @@ struct batadv_hard_iface {
struct packet_type batman_adv_ptype;
/**
- * @soft_iface: the batman-adv interface which uses this network
+ * @mesh_iface: the batman-adv interface which uses this network
* interface
*/
- struct net_device *soft_iface;
+ struct net_device *mesh_iface;
+
+ /** @meshif_dev_tracker: device tracker for @mesh_iface */
+ netdevice_tracker meshif_dev_tracker;
/** @rcu: struct used for freeing in an RCU-safe manner */
struct rcu_head rcu;
+ /**
+ * @hop_penalty: penalty which will be applied to the tq-field
+ * of an OGM received via this interface
+ */
+ atomic_t hop_penalty;
+
/** @bat_iv: per hard-interface B.A.T.M.A.N. IV data */
struct batadv_hard_iface_bat_iv bat_iv;
@@ -216,13 +224,6 @@ struct batadv_hard_iface {
struct batadv_hard_iface_bat_v bat_v;
#endif
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- /**
- * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
- */
- struct dentry *debug_dir;
-#endif
-
/**
* @neigh_list: list of unique single hop neighbors via this interface
*/
@@ -233,6 +234,21 @@ struct batadv_hard_iface {
};
/**
+ * struct batadv_orig_ifinfo_bat_iv - B.A.T.M.A.N. IV private orig_ifinfo
+ * members
+ */
+struct batadv_orig_ifinfo_bat_iv {
+ /**
+ * @bcast_own: bitfield which counts the number of our OGMs this
+ * orig_node rebroadcasted "back" to us (relative to last_real_seqno)
+ */
+ DECLARE_BITMAP(bcast_own, BATADV_TQ_LOCAL_WINDOW_SIZE);
+
+ /** @bcast_own_sum: sum of bcast_own */
+ u8 bcast_own_sum;
+};
+
+/**
* struct batadv_orig_ifinfo - originator info per outgoing interface
*/
struct batadv_orig_ifinfo {
@@ -257,6 +273,9 @@ struct batadv_orig_ifinfo {
/** @batman_seqno_reset: time when the batman seqno window was reset */
unsigned long batman_seqno_reset;
+ /** @bat_iv: B.A.T.M.A.N. IV private structure */
+ struct batadv_orig_ifinfo_bat_iv bat_iv;
+
/** @refcount: number of contexts the object is used */
struct kref refcount;
@@ -274,7 +293,7 @@ struct batadv_frag_table_entry {
/** @lock: lock to protect the list of fragments */
spinlock_t lock;
- /** @timestamp: time (jiffie) of last received fragment */
+ /** @timestamp: time (jiffy) of last received fragment */
unsigned long timestamp;
/** @seqno: sequence number of the fragments in the list */
@@ -339,19 +358,10 @@ struct batadv_orig_node_vlan {
*/
struct batadv_orig_bat_iv {
/**
- * @bcast_own: set of bitfields (one per hard-interface) where each one
- * counts the number of our OGMs this orig_node rebroadcasted "back" to
- * us (relative to last_real_seqno). Every bitfield is
- * BATADV_TQ_LOCAL_WINDOW_SIZE bits long.
- */
- unsigned long *bcast_own;
-
- /** @bcast_own_sum: sum of bcast_own */
- u8 *bcast_own_sum;
-
- /**
- * @ogm_cnt_lock: lock protecting bcast_own, bcast_own_sum,
- * neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count
+ * @ogm_cnt_lock: lock protecting &batadv_orig_ifinfo_bat_iv.bcast_own,
+ * &batadv_orig_ifinfo_bat_iv.bcast_own_sum,
+ * &batadv_neigh_ifinfo_bat_iv.real_bits and
+ * &batadv_neigh_ifinfo_bat_iv.real_packet_count
*/
spinlock_t ogm_cnt_lock;
};
@@ -409,6 +419,17 @@ struct batadv_orig_node {
* list
*/
struct hlist_node mcast_want_all_ipv6_node;
+
+ /**
+ * @mcast_want_all_rtr4_node: a list node for the mcast.want_all_rtr4
+ * list
+ */
+ struct hlist_node mcast_want_all_rtr4_node;
+ /**
+ * @mcast_want_all_rtr6_node: a list node for the mcast.want_all_rtr6
+ * list
+ */
+ struct hlist_node mcast_want_all_rtr6_node;
#endif
/** @capabilities: announced capabilities of this originator */
@@ -436,9 +457,9 @@ struct batadv_orig_node {
spinlock_t tt_buff_lock;
/**
- * @tt_lock: prevents from updating the table while reading it. Table
- * update is made up by two operations (data structure update and
- * metdata -CRC/TTVN-recalculation) and they have to be executed
+ * @tt_lock: avoids concurrent read from and write to the table. Table
+ * update is made up of two operations (data structure update and
+ * metadata -CRC/TTVN-recalculation) and they have to be executed
* atomically in order to avoid another thread to read the
* table/metadata between those.
*/
@@ -472,7 +493,7 @@ struct batadv_orig_node {
/** @hash_entry: hlist node for &batadv_priv.orig_hash */
struct hlist_node hash_entry;
- /** @bat_priv: pointer to soft_iface this orig node belongs to */
+ /** @bat_priv: pointer to mesh_iface this orig node belongs to */
struct batadv_priv *bat_priv;
/** @bcast_seqno_lock: lock protecting bcast_bits & last_bcast_seqno */
@@ -484,20 +505,6 @@ struct batadv_orig_node {
/** @rcu: struct used for freeing in an RCU-safe manner */
struct rcu_head rcu;
-#ifdef CONFIG_BATMAN_ADV_NC
- /** @in_coding_list: list of nodes this orig can hear */
- struct list_head in_coding_list;
-
- /** @out_coding_list: list of nodes that can hear this orig */
- struct list_head out_coding_list;
-
- /** @in_coding_list_lock: protects in_coding_list */
- spinlock_t in_coding_list_lock;
-
- /** @out_coding_list_lock: protects out_coding_list */
- spinlock_t out_coding_list_lock;
-#endif
-
/** @fragments: array with heads for fragment chains */
struct batadv_frag_table_entry fragments[BATADV_FRAG_BUFFER_COUNT];
@@ -524,9 +531,6 @@ enum batadv_orig_capabilities {
*/
BATADV_ORIG_CAPA_HAS_DAT,
- /** @BATADV_ORIG_CAPA_HAS_NC: orig node has network coding enabled */
- BATADV_ORIG_CAPA_HAS_NC,
-
/** @BATADV_ORIG_CAPA_HAS_TT: orig node has tt capability */
BATADV_ORIG_CAPA_HAS_TT,
@@ -581,9 +585,6 @@ struct batadv_hardif_neigh_node_bat_v {
* neighbor
*/
unsigned long last_unicast_tx;
-
- /** @metric_work: work queue callback item for metric update */
- struct work_struct metric_work;
};
/**
@@ -729,11 +730,11 @@ struct batadv_neigh_ifinfo {
* struct batadv_bcast_duplist_entry - structure for LAN broadcast suppression
*/
struct batadv_bcast_duplist_entry {
- /** @orig: mac address of orig node orginating the broadcast */
+ /** @orig: mac address of orig node originating the broadcast */
u8 orig[ETH_ALEN];
/** @crc: crc32 checksum of broadcast payload */
- __be32 crc;
+ u32 crc;
/** @entrytime: time when the broadcast packet was received */
unsigned long entrytime;
@@ -847,82 +848,92 @@ enum batadv_counters {
*/
BATADV_CNT_TT_ROAM_ADV_RX,
-#ifdef CONFIG_BATMAN_ADV_DAT
+#ifdef CONFIG_BATMAN_ADV_MCAST
/**
- * @BATADV_CNT_DAT_GET_TX: transmitted dht GET traffic packet counter
+ * @BATADV_CNT_MCAST_TX: transmitted batman-adv multicast packets
+ * counter
*/
- BATADV_CNT_DAT_GET_TX,
-
- /** @BATADV_CNT_DAT_GET_RX: received dht GET traffic packet counter */
- BATADV_CNT_DAT_GET_RX,
+ BATADV_CNT_MCAST_TX,
/**
- * @BATADV_CNT_DAT_PUT_TX: transmitted dht PUT traffic packet counter
+ * @BATADV_CNT_MCAST_TX_BYTES: transmitted batman-adv multicast packets
+ * bytes counter
*/
- BATADV_CNT_DAT_PUT_TX,
+ BATADV_CNT_MCAST_TX_BYTES,
- /** @BATADV_CNT_DAT_PUT_RX: received dht PUT traffic packet counter */
- BATADV_CNT_DAT_PUT_RX,
+ /**
+ * @BATADV_CNT_MCAST_TX_LOCAL: counter for multicast packets which
+ * were locally encapsulated and transmitted as batman-adv multicast
+ * packets
+ */
+ BATADV_CNT_MCAST_TX_LOCAL,
/**
- * @BATADV_CNT_DAT_CACHED_REPLY_TX: transmitted dat cache reply traffic
- * packet counter
+ * @BATADV_CNT_MCAST_TX_LOCAL_BYTES: bytes counter for multicast packets
+ * which were locally encapsulated and transmitted as batman-adv
+ * multicast packets
*/
- BATADV_CNT_DAT_CACHED_REPLY_TX,
-#endif
+ BATADV_CNT_MCAST_TX_LOCAL_BYTES,
-#ifdef CONFIG_BATMAN_ADV_NC
/**
- * @BATADV_CNT_NC_CODE: transmitted nc-combined traffic packet counter
+ * @BATADV_CNT_MCAST_RX: received batman-adv multicast packet counter
*/
- BATADV_CNT_NC_CODE,
+ BATADV_CNT_MCAST_RX,
/**
- * @BATADV_CNT_NC_CODE_BYTES: transmitted nc-combined traffic bytes
- * counter
+ * @BATADV_CNT_MCAST_RX_BYTES: received batman-adv multicast packet
+ * bytes counter
*/
- BATADV_CNT_NC_CODE_BYTES,
+ BATADV_CNT_MCAST_RX_BYTES,
/**
- * @BATADV_CNT_NC_RECODE: transmitted nc-recombined traffic packet
- * counter
+ * @BATADV_CNT_MCAST_RX_LOCAL: counter for received batman-adv multicast
+ * packets which were forwarded to the local mesh interface
*/
- BATADV_CNT_NC_RECODE,
+ BATADV_CNT_MCAST_RX_LOCAL,
/**
- * @BATADV_CNT_NC_RECODE_BYTES: transmitted nc-recombined traffic bytes
- * counter
+ * @BATADV_CNT_MCAST_RX_LOCAL_BYTES: bytes counter for received
+ * batman-adv multicast packets which were forwarded to the local mesh
+ * interface
*/
- BATADV_CNT_NC_RECODE_BYTES,
+ BATADV_CNT_MCAST_RX_LOCAL_BYTES,
/**
- * @BATADV_CNT_NC_BUFFER: counter for packets buffered for later nc
- * decoding
+ * @BATADV_CNT_MCAST_FWD: counter for received batman-adv multicast
+ * packets which were forwarded to other, neighboring nodes
*/
- BATADV_CNT_NC_BUFFER,
+ BATADV_CNT_MCAST_FWD,
/**
- * @BATADV_CNT_NC_DECODE: received and nc-decoded traffic packet counter
+ * @BATADV_CNT_MCAST_FWD_BYTES: bytes counter for received batman-adv
+ * multicast packets which were forwarded to other, neighboring nodes
*/
- BATADV_CNT_NC_DECODE,
+ BATADV_CNT_MCAST_FWD_BYTES,
+#endif
+#ifdef CONFIG_BATMAN_ADV_DAT
/**
- * @BATADV_CNT_NC_DECODE_BYTES: received and nc-decoded traffic bytes
- * counter
+ * @BATADV_CNT_DAT_GET_TX: transmitted dht GET traffic packet counter
*/
- BATADV_CNT_NC_DECODE_BYTES,
+ BATADV_CNT_DAT_GET_TX,
+
+ /** @BATADV_CNT_DAT_GET_RX: received dht GET traffic packet counter */
+ BATADV_CNT_DAT_GET_RX,
/**
- * @BATADV_CNT_NC_DECODE_FAILED: received and decode-failed traffic
- * packet counter
+ * @BATADV_CNT_DAT_PUT_TX: transmitted dht PUT traffic packet counter
*/
- BATADV_CNT_NC_DECODE_FAILED,
+ BATADV_CNT_DAT_PUT_TX,
+
+ /** @BATADV_CNT_DAT_PUT_RX: received dht PUT traffic packet counter */
+ BATADV_CNT_DAT_PUT_RX,
/**
- * @BATADV_CNT_NC_SNIFFED: counter for nc-decoded packets received in
- * promisc mode.
+ * @BATADV_CNT_DAT_CACHED_REPLY_TX: transmitted dat cache reply traffic
+ * packet counter
*/
- BATADV_CNT_NC_SNIFFED,
+ BATADV_CNT_DAT_CACHED_REPLY_TX,
#endif
/** @BATADV_CNT_NUM: number of traffic counters */
@@ -943,7 +954,7 @@ struct batadv_priv_tt {
atomic_t ogm_append_cnt;
/** @local_changes: changes registered in an originator interval */
- atomic_t local_changes;
+ size_t local_changes;
/**
* @changes_list: tracks tt local changes within an originator interval
@@ -965,7 +976,7 @@ struct batadv_priv_tt {
*/
struct list_head roam_list;
- /** @changes_list_lock: lock protecting changes_list */
+ /** @changes_list_lock: lock protecting changes_list & local_changes */
spinlock_t changes_list_lock;
/** @req_list_lock: lock protecting req_list */
@@ -991,8 +1002,8 @@ struct batadv_priv_tt {
/**
* @commit_lock: prevents from executing a local TT commit while reading
- * the local table. The local TT commit is made up by two operations
- * (data structure update and metdata -CRC/TTVN- recalculation) and
+ * the local table. The local TT commit is made up of two operations
+ * (data structure update and metadata -CRC/TTVN- recalculation) and
* they have to be executed atomically in order to avoid another thread
* to read the table/metadata between those.
*/
@@ -1005,7 +1016,7 @@ struct batadv_priv_tt {
#ifdef CONFIG_BATMAN_ADV_BLA
/**
- * struct batadv_priv_bla - per mesh interface bridge loope avoidance data
+ * struct batadv_priv_bla - per mesh interface bridge loop avoidance data
*/
struct batadv_priv_bla {
/** @num_requests: number of bla requests in flight */
@@ -1061,29 +1072,6 @@ struct batadv_priv_bla {
};
#endif
-#ifdef CONFIG_BATMAN_ADV_DEBUG
-
-/**
- * struct batadv_priv_debug_log - debug logging data
- */
-struct batadv_priv_debug_log {
- /** @log_buff: buffer holding the logs (ring bufer) */
- char log_buff[BATADV_LOG_BUF_LEN];
-
- /** @log_start: index of next character to read */
- unsigned long log_start;
-
- /** @log_end: index of next character to write */
- unsigned long log_end;
-
- /** @lock: lock protecting log_buff, log_start & log_end */
- spinlock_t lock;
-
- /** @queue_wait: log reader's wait queue */
- wait_queue_head_t queue_wait;
-};
-#endif
-
/**
* struct batadv_priv_gw - per mesh interface gateway data
*/
@@ -1091,12 +1079,15 @@ struct batadv_priv_gw {
/** @gateway_list: list of available gateway nodes */
struct hlist_head gateway_list;
- /** @list_lock: lock protecting gateway_list & curr_gw */
+ /** @list_lock: lock protecting gateway_list, curr_gw, generation */
spinlock_t list_lock;
/** @curr_gw: pointer to currently selected gateway node */
struct batadv_gw_node __rcu *curr_gw;
+ /** @generation: current (generation) sequence number */
+ unsigned int generation;
+
/**
* @mode: gateway operation: off, client or server (see batadv_gw_modes)
*/
@@ -1173,6 +1164,26 @@ struct batadv_mcast_querier_state {
};
/**
+ * struct batadv_mcast_mla_flags - flags for the querier, bridge and tvlv state
+ */
+struct batadv_mcast_mla_flags {
+ /** @querier_ipv4: the current state of an IGMP querier in the mesh */
+ struct batadv_mcast_querier_state querier_ipv4;
+
+ /** @querier_ipv6: the current state of an MLD querier in the mesh */
+ struct batadv_mcast_querier_state querier_ipv6;
+
+ /** @enabled: whether the multicast tvlv is currently enabled */
+ unsigned char enabled:1;
+
+ /** @bridged: whether the mesh interface has a bridge on top */
+ unsigned char bridged:1;
+
+ /** @tvlv_flags: the flags we have last sent in our mcast tvlv */
+ u8 tvlv_flags;
+};
+
+/**
* struct batadv_priv_mcast - per mesh interface mcast data
*/
struct batadv_priv_mcast {
@@ -1200,20 +1211,27 @@ struct batadv_priv_mcast {
*/
struct hlist_head want_all_ipv6_list;
- /** @querier_ipv4: the current state of an IGMP querier in the mesh */
- struct batadv_mcast_querier_state querier_ipv4;
-
- /** @querier_ipv6: the current state of an MLD querier in the mesh */
- struct batadv_mcast_querier_state querier_ipv6;
+ /**
+ * @want_all_rtr4_list: a list of orig_nodes wanting all routable IPv4
+ * multicast traffic
+ */
+ struct hlist_head want_all_rtr4_list;
- /** @flags: the flags we have last sent in our mcast tvlv */
- u8 flags;
+ /**
+ * @want_all_rtr6_list: a list of orig_nodes wanting all routable IPv6
+ * multicast traffic
+ */
+ struct hlist_head want_all_rtr6_list;
- /** @enabled: whether the multicast tvlv is currently enabled */
- unsigned char enabled:1;
+ /**
+ * @mla_flags: flags for the querier, bridge and tvlv state
+ */
+ struct batadv_mcast_mla_flags mla_flags;
- /** @bridged: whether the soft interface has a bridge on top */
- unsigned char bridged:1;
+ /**
+ * @mla_lock: a lock protecting mla_list and mla_flags
+ */
+ spinlock_t mla_lock;
/**
* @num_want_all_unsnoopables: number of nodes wanting unsnoopable IP
@@ -1227,6 +1245,18 @@ struct batadv_priv_mcast {
/** @num_want_all_ipv6: counter for items in want_all_ipv6_list */
atomic_t num_want_all_ipv6;
+ /** @num_want_all_rtr4: counter for items in want_all_rtr4_list */
+ atomic_t num_want_all_rtr4;
+
+ /** @num_want_all_rtr6: counter for items in want_all_rtr6_list */
+ atomic_t num_want_all_rtr6;
+
+ /**
+ * @num_no_mc_ptype_capa: counter for number of nodes without the
+ * BATADV_MCAST_HAVE_MC_PTYPE_CAPA flag
+ */
+ atomic_t num_no_mc_ptype_capa;
+
/**
* @want_lists_lock: lock for protecting modifications to mcasts
* want_all_{unsnoopables,ipv4,ipv6}_list (traversals are rcu-locked)
@@ -1239,63 +1269,6 @@ struct batadv_priv_mcast {
#endif
/**
- * struct batadv_priv_nc - per mesh interface network coding private data
- */
-struct batadv_priv_nc {
- /** @work: work queue callback item for cleanup */
- struct delayed_work work;
-
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- /**
- * @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
- */
- struct dentry *debug_dir;
-#endif
-
- /**
- * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq
- */
- u8 min_tq;
-
- /**
- * @max_fwd_delay: maximum packet forward delay to allow coding of
- * packets
- */
- u32 max_fwd_delay;
-
- /**
- * @max_buffer_time: buffer time for sniffed packets used to decoding
- */
- u32 max_buffer_time;
-
- /**
- * @timestamp_fwd_flush: timestamp of last forward packet queue flush
- */
- unsigned long timestamp_fwd_flush;
-
- /**
- * @timestamp_sniffed_purge: timestamp of last sniffed packet queue
- * purge
- */
- unsigned long timestamp_sniffed_purge;
-
- /**
- * @coding_hash: Hash table used to buffer skbs while waiting for
- * another incoming skb to code it with. Skbs are added to the buffer
- * just before being forwarded in routing.c
- */
- struct batadv_hashtable *coding_hash;
-
- /**
- * @decoding_hash: Hash table used to buffer skbs that might be needed
- * to decode a received coded skb. The buffer is used for 1) skbs
- * arriving on the soft-interface; 2) skbs overheard on the
- * hard-interface; and 3) skbs forwarded by batman-adv.
- */
- struct batadv_hashtable *decoding_hash;
-};
-
-/**
* struct batadv_tp_unacked - unacked packet meta-information
*
* This struct is supposed to represent a buffer unacked packet. However, since
@@ -1431,7 +1404,7 @@ struct batadv_tp_vars {
/** @unacked_lock: protect unacked_list */
spinlock_t unacked_lock;
- /** @last_recv_time: time time (jiffies) a msg was received */
+ /** @last_recv_time: time (jiffies) a msg was received */
unsigned long last_recv_time;
/** @refcount: number of context where the object is used */
@@ -1442,25 +1415,22 @@ struct batadv_tp_vars {
};
/**
- * struct batadv_softif_vlan - per VLAN attributes set
+ * struct batadv_meshif_vlan - per VLAN attributes set
*/
-struct batadv_softif_vlan {
+struct batadv_meshif_vlan {
/** @bat_priv: pointer to the mesh object */
struct batadv_priv *bat_priv;
/** @vid: VLAN identifier */
unsigned short vid;
- /** @kobj: kobject for sysfs vlan subdirectory */
- struct kobject *kobj;
-
/** @ap_isolation: AP isolation state */
atomic_t ap_isolation; /* boolean */
/** @tt: TT private attributes (VLAN specific) */
struct batadv_vlan_tt tt;
- /** @list: list node for &bat_priv.softif_vlan_list */
+ /** @list: list node for &bat_priv.meshif_vlan_list */
struct hlist_node list;
/**
@@ -1473,7 +1443,7 @@ struct batadv_softif_vlan {
};
/**
- * struct batadv_priv_bat_v - B.A.T.M.A.N. V per soft-interface private data
+ * struct batadv_priv_bat_v - B.A.T.M.A.N. V per mesh-interface private data
*/
struct batadv_priv_bat_v {
/** @ogm_buff: buffer holding the OGM packet */
@@ -1485,6 +1455,9 @@ struct batadv_priv_bat_v {
/** @ogm_seqno: OGM sequence number - used to identify each OGM */
atomic_t ogm_seqno;
+ /** @ogm_buff_mutex: lock protecting ogm_buff and ogm_buff_len */
+ struct mutex ogm_buff_mutex;
+
/** @ogm_wq: workqueue used to schedule OGM transmissions */
struct delayed_work ogm_wq;
};
@@ -1499,8 +1472,14 @@ struct batadv_priv {
*/
atomic_t mesh_state;
- /** @soft_iface: net device which holds this struct as private data */
- struct net_device *soft_iface;
+ /** @mesh_iface: net device which holds this struct as private data */
+ struct net_device *mesh_iface;
+
+ /**
+ * @mtu_set_by_user: MTU was set once by user
+ * protected by rtnl_lock
+ */
+ int mtu_set_by_user;
/**
* @bat_counters: mesh internal traffic statistic counters (see
@@ -1557,6 +1536,12 @@ struct batadv_priv {
* node's sender/originating side
*/
atomic_t multicast_mode;
+
+ /**
+ * @multicast_fanout: Maximum number of packet copies to generate for a
+ * multicast-to-unicast conversion
+ */
+ atomic_t multicast_fanout;
#endif
/** @orig_interval: OGM broadcast interval in milliseconds */
@@ -1597,17 +1582,6 @@ struct batadv_priv {
/** @batman_queue_left: number of remaining OGM packet slots */
atomic_t batman_queue_left;
- /** @num_ifaces: number of interfaces assigned to this mesh interface */
- unsigned int num_ifaces;
-
- /** @mesh_obj: kobject for sysfs mesh subdirectory */
- struct kobject *mesh_obj;
-
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- /** @debug_dir: dentry for debugfs batman-adv subdirectory */
- struct dentry *debug_dir;
-#endif
-
/** @forw_bat_list: list of aggregated OGMs that will be forwarded */
struct hlist_head forw_bat_list;
@@ -1620,19 +1594,19 @@ struct batadv_priv {
/** @tp_list: list of tp sessions */
struct hlist_head tp_list;
- /** @tp_num: number of currently active tp sessions */
+ /** @orig_hash: hash table containing mesh participants (orig nodes) */
struct batadv_hashtable *orig_hash;
- /** @orig_hash: hash table containing mesh participants (orig nodes) */
+ /** @forw_bat_list_lock: lock protecting forw_bat_list */
spinlock_t forw_bat_list_lock;
- /** @forw_bat_list_lock: lock protecting forw_bat_list */
+ /** @forw_bcast_list_lock: lock protecting forw_bcast_list */
spinlock_t forw_bcast_list_lock;
- /** @forw_bcast_list_lock: lock protecting forw_bcast_list */
+ /** @tp_list_lock: spinlock protecting @tp_list */
spinlock_t tp_list_lock;
- /** @tp_list_lock: spinlock protecting @tp_list */
+ /** @tp_num: number of currently active tp sessions */
atomic_t tp_num;
/** @orig_work: work queue callback item for orig node purging */
@@ -1648,24 +1622,19 @@ struct batadv_priv {
struct batadv_algo_ops *algo_ops;
/**
- * @softif_vlan_list: a list of softif_vlan structs, one per VLAN
+ * @meshif_vlan_list: a list of meshif_vlan structs, one per VLAN
* created on top of the mesh interface represented by this object
*/
- struct hlist_head softif_vlan_list;
+ struct hlist_head meshif_vlan_list;
- /** @softif_vlan_list_lock: lock protecting softif_vlan_list */
- spinlock_t softif_vlan_list_lock;
+ /** @meshif_vlan_list_lock: lock protecting meshif_vlan_list */
+ spinlock_t meshif_vlan_list_lock;
#ifdef CONFIG_BATMAN_ADV_BLA
- /** @bla: bridge loope avoidance data */
+ /** @bla: bridge loop avoidance data */
struct batadv_priv_bla bla;
#endif
-#ifdef CONFIG_BATMAN_ADV_DEBUG
- /** @debug_log: holding debug logging relevant data */
- struct batadv_priv_debug_log *debug_log;
-#endif
-
/** @gw: gateway data */
struct batadv_priv_gw gw;
@@ -1685,61 +1654,12 @@ struct batadv_priv {
struct batadv_priv_mcast mcast;
#endif
-#ifdef CONFIG_BATMAN_ADV_NC
- /**
- * @network_coding: bool indicating whether network coding is enabled
- */
- atomic_t network_coding;
-
- /** @nc: network coding data */
- struct batadv_priv_nc nc;
-#endif /* CONFIG_BATMAN_ADV_NC */
-
#ifdef CONFIG_BATMAN_ADV_BATMAN_V
- /** @bat_v: B.A.T.M.A.N. V per soft-interface private data */
+ /** @bat_v: B.A.T.M.A.N. V per mesh-interface private data */
struct batadv_priv_bat_v bat_v;
#endif
};
-/**
- * struct batadv_socket_client - layer2 icmp socket client data
- */
-struct batadv_socket_client {
- /**
- * @queue_list: packet queue for packets destined for this socket client
- */
- struct list_head queue_list;
-
- /** @queue_len: number of packets in the packet queue (queue_list) */
- unsigned int queue_len;
-
- /** @index: socket client's index in the batadv_socket_client_hash */
- unsigned char index;
-
- /** @lock: lock protecting queue_list, queue_len & index */
- spinlock_t lock;
-
- /** @queue_wait: socket client's wait queue */
- wait_queue_head_t queue_wait;
-
- /** @bat_priv: pointer to soft_iface this client belongs to */
- struct batadv_priv *bat_priv;
-};
-
-/**
- * struct batadv_socket_packet - layer2 icmp packet for socket client
- */
-struct batadv_socket_packet {
- /** @list: list node for &batadv_socket_client.queue_list */
- struct list_head list;
-
- /** @icmp_len: size of the layer2 icmp packet */
- size_t icmp_len;
-
- /** @icmp_packet: layer2 icmp packet */
- u8 icmp_packet[BATADV_ICMP_MAX_PACKET_SIZE];
-};
-
#ifdef CONFIG_BATMAN_ADV_BLA
/**
@@ -1758,7 +1678,7 @@ struct batadv_bla_backbone_gw {
/** @hash_entry: hlist node for &batadv_priv_bla.backbone_hash */
struct hlist_node hash_entry;
- /** @bat_priv: pointer to soft_iface this backbone gateway belongs to */
+ /** @bat_priv: pointer to mesh_iface this backbone gateway belongs to */
struct batadv_priv *bat_priv;
/** @lasttime: last time we heard of this backbone gw */
@@ -1863,8 +1783,8 @@ struct batadv_tt_local_entry {
/** @last_seen: timestamp used for purging stale tt local entries */
unsigned long last_seen;
- /** @vlan: soft-interface vlan of the entry */
- struct batadv_softif_vlan *vlan;
+ /** @vlan: mesh-interface vlan of the entry */
+ struct batadv_meshif_vlan *vlan;
};
/**
@@ -1929,7 +1849,7 @@ struct batadv_tt_change_node {
*/
struct batadv_tt_req_node {
/**
- * @addr: mac address address of the originator this request was sent to
+ * @addr: mac address of the originator this request was sent to
*/
u8 addr[ETH_ALEN];
@@ -1966,95 +1886,10 @@ struct batadv_tt_roam_node {
};
/**
- * struct batadv_nc_node - network coding node
- */
-struct batadv_nc_node {
- /** @list: next and prev pointer for the list handling */
- struct list_head list;
-
- /** @addr: the node's mac address */
- u8 addr[ETH_ALEN];
-
- /** @refcount: number of contexts the object is used by */
- struct kref refcount;
-
- /** @rcu: struct used for freeing in an RCU-safe manner */
- struct rcu_head rcu;
-
- /** @orig_node: pointer to corresponding orig node struct */
- struct batadv_orig_node *orig_node;
-
- /** @last_seen: timestamp of last ogm received from this node */
- unsigned long last_seen;
-};
-
-/**
- * struct batadv_nc_path - network coding path
- */
-struct batadv_nc_path {
- /** @hash_entry: next and prev pointer for the list handling */
- struct hlist_node hash_entry;
-
- /** @rcu: struct used for freeing in an RCU-safe manner */
- struct rcu_head rcu;
-
- /** @refcount: number of contexts the object is used by */
- struct kref refcount;
-
- /** @packet_list: list of buffered packets for this path */
- struct list_head packet_list;
-
- /** @packet_list_lock: access lock for packet list */
- spinlock_t packet_list_lock;
-
- /** @next_hop: next hop (destination) of path */
- u8 next_hop[ETH_ALEN];
-
- /** @prev_hop: previous hop (source) of path */
- u8 prev_hop[ETH_ALEN];
-
- /** @last_valid: timestamp for last validation of path */
- unsigned long last_valid;
-};
-
-/**
- * struct batadv_nc_packet - network coding packet used when coding and
- * decoding packets
- */
-struct batadv_nc_packet {
- /** @list: next and prev pointer for the list handling */
- struct list_head list;
-
- /** @packet_id: crc32 checksum of skb data */
- __be32 packet_id;
-
- /**
- * @timestamp: field containing the info when the packet was added to
- * path
- */
- unsigned long timestamp;
-
- /** @neigh_node: pointer to original next hop neighbor of skb */
- struct batadv_neigh_node *neigh_node;
-
- /** @skb: skb which can be encoded or used for decoding */
- struct sk_buff *skb;
-
- /** @nc_path: pointer to path this nc packet is attached to */
- struct batadv_nc_path *nc_path;
-};
-
-/**
* struct batadv_skb_cb - control buffer structure used to store private data
* relevant to batman-adv in the skb->cb buffer in skbs.
*/
struct batadv_skb_cb {
- /**
- * @decoded: Marks a skb as decoded, which is checked when searching for
- * coding opportunities in network-coding.c
- */
- unsigned char decoded:1;
-
/** @num_bcasts: Counter for broadcast packet retransmissions */
unsigned char num_bcasts;
};
@@ -2088,7 +1923,7 @@ struct batadv_forw_packet {
u16 packet_len;
/** @direct_link_flags: direct link flags for aggregated OGM packets */
- u32 direct_link_flags;
+ DECLARE_BITMAP(direct_link_flags, BATADV_MAX_AGGREGATION_PACKETS);
/** @num_packets: counter for aggregated OGMv1 packets */
u8 num_packets;
@@ -2125,6 +1960,9 @@ struct batadv_algo_iface_ops {
/** @enable: init routing info when hard-interface is enabled */
int (*enable)(struct batadv_hard_iface *hard_iface);
+ /** @enabled: notification when hard-interface was enabled (optional) */
+ void (*enabled)(struct batadv_hard_iface *hard_iface);
+
/** @disable: de-init routing info when hard-interface is disabled */
void (*disable)(struct batadv_hard_iface *hard_iface);
@@ -2164,11 +2002,6 @@ struct batadv_algo_neigh_ops {
struct batadv_neigh_node *neigh2,
struct batadv_hard_iface *if_outgoing2);
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- /** @print: print the single hop neighbor list (optional) */
- void (*print)(struct batadv_priv *priv, struct seq_file *seq);
-#endif
-
/** @dump: dump neighbors to a netlink socket (optional) */
void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
struct batadv_priv *priv,
@@ -2179,34 +2012,6 @@ struct batadv_algo_neigh_ops {
* struct batadv_algo_orig_ops - mesh algorithm callbacks (originator specific)
*/
struct batadv_algo_orig_ops {
- /**
- * @free: free the resources allocated by the routing algorithm for an
- * orig_node object (optional)
- */
- void (*free)(struct batadv_orig_node *orig_node);
-
- /**
- * @add_if: ask the routing algorithm to apply the needed changes to the
- * orig_node due to a new hard-interface being added into the mesh
- * (optional)
- */
- int (*add_if)(struct batadv_orig_node *orig_node,
- unsigned int max_if_num);
-
- /**
- * @del_if: ask the routing algorithm to apply the needed changes to the
- * orig_node due to an hard-interface being removed from the mesh
- * (optional)
- */
- int (*del_if)(struct batadv_orig_node *orig_node,
- unsigned int max_if_num, unsigned int del_if_num);
-
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- /** @print: print the originator table (optional) */
- void (*print)(struct batadv_priv *priv, struct seq_file *seq,
- struct batadv_hard_iface *hard_iface);
-#endif
-
/** @dump: dump originators to a netlink socket (optional) */
void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
struct batadv_priv *priv,
@@ -2221,14 +2026,9 @@ struct batadv_algo_gw_ops {
void (*init_sel_class)(struct batadv_priv *bat_priv);
/**
- * @store_sel_class: parse and stores a new GW selection class
- * (optional)
+ * @sel_class_max: maximum allowed GW selection class
*/
- ssize_t (*store_sel_class)(struct batadv_priv *bat_priv, char *buff,
- size_t count);
-
- /** @show_sel_class: prints the current GW selection class (optional) */
- ssize_t (*show_sel_class)(struct batadv_priv *bat_priv, char *buff);
+ u32 sel_class_max;
/**
* @get_best_gw_node: select the best GW from the list of available
@@ -2245,11 +2045,6 @@ struct batadv_algo_gw_ops {
struct batadv_orig_node *curr_gw_orig,
struct batadv_orig_node *orig_node);
-#ifdef CONFIG_BATMAN_ADV_DEBUGFS
- /** @print: print the gateway table (optional) */
- void (*print)(struct batadv_priv *bat_priv, struct seq_file *seq);
-#endif
-
/** @dump: dump gateways to a netlink socket (optional) */
void (*dump)(struct sk_buff *msg, struct netlink_callback *cb,
struct batadv_priv *priv);
@@ -2374,6 +2169,12 @@ struct batadv_tvlv_handler {
u8 *src, u8 *dst,
void *tvlv_value, u16 tvlv_value_len);
+ /**
+ * @mcast_handler: handler callback which is given the tvlv payload to
+ * process on incoming mcast packet
+ */
+ int (*mcast_handler)(struct batadv_priv *bat_priv, struct sk_buff *skb);
+
/** @type: tvlv type this handler feels responsible for */
u8 type;
@@ -2408,21 +2209,4 @@ enum batadv_tvlv_handler_flags {
BATADV_TVLV_HANDLER_OGM_CALLED = BIT(2),
};
-/**
- * struct batadv_store_mesh_work - Work queue item to detach add/del interface
- * from sysfs locks
- */
-struct batadv_store_mesh_work {
- /**
- * @net_dev: netdevice to add/remove to/from batman-adv soft-interface
- */
- struct net_device *net_dev;
-
- /** @soft_iface_name: name of soft-interface to modify */
- char soft_iface_name[IFNAMSIZ];
-
- /** @work: work queue item */
- struct work_struct work;
-};
-
#endif /* _NET_BATMAN_ADV_TYPES_H_ */
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 4e2576fc0c59..2c21ae8abadc 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -1,14 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
Copyright (c) 2013-2014 Intel Corp.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 and
- only version 2 as published by the Free Software Foundation.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
*/
#include <linux/if_arp.h>
@@ -20,6 +13,7 @@
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
+#include <net/netdev_lock.h>
#include <net/pkt_sched.h>
#include <net/bluetooth/bluetooth.h>
@@ -57,6 +51,12 @@ static bool enable_6lowpan;
/* We are listening incoming connections via this channel
*/
static struct l2cap_chan *listen_chan;
+static DEFINE_MUTEX(set_lock);
+
+enum {
+ LOWPAN_PEER_CLOSING,
+ LOWPAN_PEER_MAXBITS
+};
struct lowpan_peer {
struct list_head list;
@@ -66,6 +66,8 @@ struct lowpan_peer {
/* peer addresses in various formats */
unsigned char lladdr[ETH_ALEN];
struct in6_addr peer_addr;
+
+ DECLARE_BITMAP(flags, LOWPAN_PEER_MAXBITS);
};
struct lowpan_btle_dev {
@@ -109,34 +111,6 @@ static inline bool peer_del(struct lowpan_btle_dev *dev,
return false;
}
-static inline struct lowpan_peer *peer_lookup_ba(struct lowpan_btle_dev *dev,
- bdaddr_t *ba, __u8 type)
-{
- struct lowpan_peer *peer;
-
- BT_DBG("peers %d addr %pMR type %d", atomic_read(&dev->peer_count),
- ba, type);
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(peer, &dev->peers, list) {
- BT_DBG("dst addr %pMR dst type %d",
- &peer->chan->dst, peer->chan->dst_type);
-
- if (bacmp(&peer->chan->dst, ba))
- continue;
-
- if (type == peer->chan->dst_type) {
- rcu_read_unlock();
- return peer;
- }
- }
-
- rcu_read_unlock();
-
- return NULL;
-}
-
static inline struct lowpan_peer *
__peer_lookup_chan(struct lowpan_btle_dev *dev, struct l2cap_chan *chan)
{
@@ -167,30 +141,25 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev,
struct in6_addr *daddr,
struct sk_buff *skb)
{
- struct lowpan_peer *peer;
- struct in6_addr *nexthop;
- struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+ struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
int count = atomic_read(&dev->peer_count);
+ const struct in6_addr *nexthop;
+ struct lowpan_peer *peer;
+ struct neighbour *neigh;
BT_DBG("peers %d addr %pI6c rt %p", count, daddr, rt);
- /* If we have multiple 6lowpan peers, then check where we should
- * send the packet. If only one peer exists, then we can send the
- * packet right away.
- */
- if (count == 1) {
- rcu_read_lock();
- peer = list_first_or_null_rcu(&dev->peers, struct lowpan_peer,
- list);
- rcu_read_unlock();
- return peer;
- }
-
if (!rt) {
- nexthop = &lowpan_cb(skb)->gw;
-
- if (ipv6_addr_any(nexthop))
- return NULL;
+ if (ipv6_addr_any(&lowpan_cb(skb)->gw)) {
+ /* There is neither route nor gateway,
+ * probably the destination is a direct peer.
+ */
+ nexthop = daddr;
+ } else {
+ /* There is a known gateway
+ */
+ nexthop = &lowpan_cb(skb)->gw;
+ }
} else {
nexthop = rt6_nexthop(rt, daddr);
@@ -206,7 +175,7 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev,
rcu_read_lock();
list_for_each_entry_rcu(peer, &dev->peers, list) {
- BT_DBG("dst addr %pMR dst type %d ip %pI6c",
+ BT_DBG("dst addr %pMR dst type %u ip %pI6c",
&peer->chan->dst, peer->chan->dst_type,
&peer->peer_addr);
@@ -216,6 +185,19 @@ static inline struct lowpan_peer *peer_lookup_dst(struct lowpan_btle_dev *dev,
}
}
+ /* use the neighbour cache for matching addresses assigned by SLAAC */
+ neigh = __ipv6_neigh_lookup(dev->netdev, nexthop);
+ if (neigh) {
+ list_for_each_entry_rcu(peer, &dev->peers, list) {
+ if (!memcmp(neigh->ha, peer->lladdr, ETH_ALEN)) {
+ neigh_release(neigh);
+ rcu_read_unlock();
+ return peer;
+ }
+ }
+ neigh_release(neigh);
+ }
+
rcu_read_unlock();
return NULL;
@@ -266,7 +248,7 @@ static int give_skb_to_upper(struct sk_buff *skb, struct net_device *dev)
if (!skb_cp)
return NET_RX_DROP;
- return netif_rx_ni(skb_cp);
+ return netif_rx(skb_cp);
}
static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev,
@@ -314,6 +296,7 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
local_skb->pkt_type = PACKET_HOST;
local_skb->dev = dev;
+ skb_reset_mac_header(local_skb);
skb_set_transport_header(local_skb, sizeof(struct ipv6hdr));
if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) {
@@ -467,9 +450,9 @@ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb,
iv.iov_len = skb->len;
memset(&msg, 0, sizeof(msg));
- iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iv, 1, skb->len);
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iv, 1, skb->len);
- err = l2cap_chan_send(chan, &msg, skb->len);
+ err = l2cap_chan_send(chan, &msg, skb->len, NULL);
if (err > 0) {
netdev->stats.tx_bytes += err;
netdev->stats.tx_packets++;
@@ -504,7 +487,7 @@ static int send_mcast_pkt(struct sk_buff *skb, struct net_device *netdev)
local_skb = skb_clone(skb, GFP_ATOMIC);
- BT_DBG("xmit %s to %pMR type %d IP %pI6c chan %p",
+ BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p",
netdev->name,
&pentry->chan->dst, pentry->chan->dst_type,
&pentry->peer_addr, pentry->chan);
@@ -547,7 +530,7 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev)
if (err) {
if (lowpan_cb(skb)->chan) {
- BT_DBG("xmit %s to %pMR type %d IP %pI6c chan %p",
+ BT_DBG("xmit %s to %pMR type %u IP %pI6c chan %p",
netdev->name, &addr, addr_type,
&lowpan_cb(skb)->addr, lowpan_cb(skb)->chan);
err = send_pkt(lowpan_cb(skb)->chan, skb, netdev);
@@ -581,7 +564,7 @@ static const struct net_device_ops netdev_ops = {
.ndo_start_xmit = bt_xmit,
};
-static struct header_ops header_ops = {
+static const struct header_ops header_ops = {
.create = header_create,
};
@@ -598,7 +581,7 @@ static void netdev_setup(struct net_device *dev)
dev->needs_free_netdev = true;
}
-static struct device_type bt_type = {
+static const struct device_type bt_type = {
.name = "bluetooth",
};
@@ -607,7 +590,7 @@ static void ifup(struct net_device *netdev)
int err;
rtnl_lock();
- err = dev_open(netdev);
+ err = dev_open(netdev, NULL);
if (err < 0)
BT_INFO("iface %s cannot be opened (%d)", netdev->name, err);
rtnl_unlock();
@@ -667,7 +650,6 @@ static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan,
return NULL;
peer->chan = chan;
- memset(&peer->peer_addr, 0, sizeof(struct in6_addr));
baswap((void *)peer->lladdr, &chan->dst);
@@ -689,7 +671,8 @@ static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan,
static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev)
{
struct net_device *netdev;
- int err = 0;
+ bdaddr_t addr;
+ int err;
netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_btle_dev)),
IFACE_NAME_TEMPLATE, NET_NAME_UNKNOWN,
@@ -698,7 +681,8 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev)
return -ENOMEM;
netdev->addr_assign_type = NET_ADDR_PERM;
- baswap((void *)netdev->dev_addr, &chan->src);
+ baswap(&addr, &chan->src);
+ __dev_addr_set(netdev, &addr, sizeof(addr));
netdev->netdev_ops = &netdev_ops;
SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev);
@@ -816,7 +800,7 @@ static void chan_close_cb(struct l2cap_chan *chan)
BT_DBG("dev %p removing %speer %p", dev,
last ? "last " : "1 ", peer);
- BT_DBG("chan %p orig refcnt %d", chan,
+ BT_DBG("chan %p orig refcnt %u", chan,
kref_read(&chan->kref));
l2cap_chan_put(chan);
@@ -838,8 +822,6 @@ static void chan_close_cb(struct l2cap_chan *chan)
} else {
spin_unlock(&devices_lock);
}
-
- return;
}
static void chan_state_change_cb(struct l2cap_chan *chan, int state, int err)
@@ -852,11 +834,16 @@ static struct sk_buff *chan_alloc_skb_cb(struct l2cap_chan *chan,
unsigned long hdr_len,
unsigned long len, int nb)
{
+ struct sk_buff *skb;
+
/* Note that we must allocate using GFP_ATOMIC here as
* this function is called originally from netdev hard xmit
* function in atomic context.
*/
- return bt_skb_alloc(hdr_len + len, GFP_ATOMIC);
+ skb = bt_skb_alloc(hdr_len + len, GFP_ATOMIC);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+ return skb;
}
static void chan_suspend_cb(struct l2cap_chan *chan)
@@ -907,14 +894,6 @@ static const struct l2cap_ops bt_6lowpan_chan_ops = {
.set_shutdown = l2cap_chan_no_set_shutdown,
};
-static inline __u8 bdaddr_type(__u8 type)
-{
- if (type == ADDR_LE_DEV_PUBLIC)
- return BDADDR_LE_PUBLIC;
- else
- return BDADDR_LE_RANDOM;
-}
-
static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type)
{
struct l2cap_chan *chan;
@@ -927,7 +906,7 @@ static int bt_6lowpan_connect(bdaddr_t *addr, u8 dst_type)
chan->ops = &bt_6lowpan_chan_ops;
err = l2cap_chan_connect(chan, cpu_to_le16(L2CAP_PSM_IPSP), 0,
- addr, dst_type);
+ addr, dst_type, L2CAP_CONN_TIMEOUT);
BT_DBG("chan %p err %d", chan, err);
if (err < 0)
@@ -940,7 +919,7 @@ static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type)
{
struct lowpan_peer *peer;
- BT_DBG("conn %p dst type %d", conn, dst_type);
+ BT_DBG("conn %p dst type %u", conn, dst_type);
peer = lookup_peer(conn);
if (!peer)
@@ -948,7 +927,9 @@ static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type)
BT_DBG("peer %p chan %p", peer, peer->chan);
+ l2cap_chan_lock(peer->chan);
l2cap_chan_close(peer->chan, ENOENT);
+ l2cap_chan_unlock(peer->chan);
return 0;
}
@@ -972,7 +953,7 @@ static struct l2cap_chan *bt_6lowpan_listen(void)
atomic_set(&chan->nesting, L2CAP_NESTING_PARENT);
- BT_DBG("chan %p src type %d", chan, chan->src_type);
+ BT_DBG("chan %p src type %u", chan, chan->src_type);
err = l2cap_add_psm(chan, addr, cpu_to_le16(L2CAP_PSM_IPSP));
if (err) {
@@ -985,10 +966,11 @@ static struct l2cap_chan *bt_6lowpan_listen(void)
}
static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
- struct l2cap_conn **conn)
+ struct l2cap_conn **conn, bool disconnect)
{
struct hci_conn *hcon;
struct hci_dev *hdev;
+ int le_addr_type;
int n;
n = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu",
@@ -999,21 +981,41 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
if (n < 7)
return -EINVAL;
+ if (disconnect) {
+ /* The "disconnect" debugfs command has used different address
+ * type constants than "connect" since 2015. Let's retain that
+ * for now even though it's obviously buggy...
+ */
+ *addr_type += 1;
+ }
+
+ switch (*addr_type) {
+ case BDADDR_LE_PUBLIC:
+ le_addr_type = ADDR_LE_DEV_PUBLIC;
+ break;
+ case BDADDR_LE_RANDOM:
+ le_addr_type = ADDR_LE_DEV_RANDOM;
+ break;
+ default:
+ return -EINVAL;
+ }
+
/* The LE_PUBLIC address type is ignored because of BDADDR_ANY */
hdev = hci_get_route(addr, BDADDR_ANY, BDADDR_LE_PUBLIC);
if (!hdev)
return -ENOENT;
hci_dev_lock(hdev);
- hcon = hci_conn_hash_lookup_le(hdev, addr, *addr_type);
+ hcon = hci_conn_hash_lookup_le(hdev, addr, le_addr_type);
hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
if (!hcon)
return -ENOENT;
*conn = (struct l2cap_conn *)hcon->l2cap_data;
- BT_DBG("conn %p dst %pMR type %d", *conn, &hcon->dst, hcon->dst_type);
+ BT_DBG("conn %p dst %pMR type %u", *conn, &hcon->dst, hcon->dst_type);
return 0;
}
@@ -1021,41 +1023,52 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type,
static void disconnect_all_peers(void)
{
struct lowpan_btle_dev *entry;
- struct lowpan_peer *peer, *tmp_peer, *new_peer;
- struct list_head peers;
-
- INIT_LIST_HEAD(&peers);
+ struct lowpan_peer *peer;
+ int nchans;
- /* We make a separate list of peers as the close_cb() will
- * modify the device peers list so it is better not to mess
- * with the same list at the same time.
+ /* l2cap_chan_close() cannot be called from RCU, and lock ordering
+ * chan->lock > devices_lock prevents taking write side lock, so copy
+ * then close.
*/
rcu_read_lock();
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list)
+ list_for_each_entry_rcu(peer, &entry->peers, list)
+ clear_bit(LOWPAN_PEER_CLOSING, peer->flags);
+ rcu_read_unlock();
- list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
- list_for_each_entry_rcu(peer, &entry->peers, list) {
- new_peer = kmalloc(sizeof(*new_peer), GFP_ATOMIC);
- if (!new_peer)
- break;
+ do {
+ struct l2cap_chan *chans[32];
+ int i;
- new_peer->chan = peer->chan;
- INIT_LIST_HEAD(&new_peer->list);
+ nchans = 0;
- list_add(&new_peer->list, &peers);
- }
- }
+ spin_lock(&devices_lock);
- rcu_read_unlock();
+ list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) {
+ list_for_each_entry_rcu(peer, &entry->peers, list) {
+ if (test_and_set_bit(LOWPAN_PEER_CLOSING,
+ peer->flags))
+ continue;
- spin_lock(&devices_lock);
- list_for_each_entry_safe(peer, tmp_peer, &peers, list) {
- l2cap_chan_close(peer->chan, ENOENT);
+ l2cap_chan_hold(peer->chan);
+ chans[nchans++] = peer->chan;
- list_del_rcu(&peer->list);
- kfree_rcu(peer, rcu);
- }
- spin_unlock(&devices_lock);
+ if (nchans >= ARRAY_SIZE(chans))
+ goto done;
+ }
+ }
+
+done:
+ spin_unlock(&devices_lock);
+
+ for (i = 0; i < nchans; ++i) {
+ l2cap_chan_lock(chans[i]);
+ l2cap_chan_close(chans[i], ENOENT);
+ l2cap_chan_unlock(chans[i]);
+ l2cap_chan_put(chans[i]);
+ }
+ } while (nchans);
}
struct set_enable {
@@ -1076,12 +1089,16 @@ static void do_enable_set(struct work_struct *work)
enable_6lowpan = set_enable->flag;
+ mutex_lock(&set_lock);
if (listen_chan) {
+ l2cap_chan_lock(listen_chan);
l2cap_chan_close(listen_chan, 0);
+ l2cap_chan_unlock(listen_chan);
l2cap_chan_put(listen_chan);
}
listen_chan = bt_6lowpan_listen();
+ mutex_unlock(&set_lock);
kfree(set_enable);
}
@@ -1108,8 +1125,8 @@ static int lowpan_enable_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(lowpan_enable_fops, lowpan_enable_get,
- lowpan_enable_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(lowpan_enable_fops, lowpan_enable_get,
+ lowpan_enable_set, "%llu\n");
static ssize_t lowpan_control_write(struct file *fp,
const char __user *user_buffer,
@@ -1129,15 +1146,19 @@ static ssize_t lowpan_control_write(struct file *fp,
buf[buf_size] = '\0';
if (memcmp(buf, "connect ", 8) == 0) {
- ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn);
+ ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn, false);
if (ret == -EINVAL)
return ret;
+ mutex_lock(&set_lock);
if (listen_chan) {
+ l2cap_chan_lock(listen_chan);
l2cap_chan_close(listen_chan, 0);
+ l2cap_chan_unlock(listen_chan);
l2cap_chan_put(listen_chan);
listen_chan = NULL;
}
+ mutex_unlock(&set_lock);
if (conn) {
struct lowpan_peer *peer;
@@ -1151,7 +1172,7 @@ static ssize_t lowpan_control_write(struct file *fp,
return -EALREADY;
}
- BT_DBG("conn %p dst %pMR type %d user %d", conn,
+ BT_DBG("conn %p dst %pMR type %d user %u", conn,
&conn->hcon->dst, conn->hcon->dst_type,
addr_type);
}
@@ -1164,7 +1185,7 @@ static ssize_t lowpan_control_write(struct file *fp,
}
if (memcmp(buf, "disconnect ", 11) == 0) {
- ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn);
+ ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn, true);
if (ret < 0)
return ret;
@@ -1278,9 +1299,10 @@ static struct notifier_block bt_6lowpan_dev_notifier = {
static int __init bt_6lowpan_init(void)
{
- lowpan_enable_debugfs = debugfs_create_file("6lowpan_enable", 0644,
- bt_debugfs, NULL,
- &lowpan_enable_fops);
+ lowpan_enable_debugfs = debugfs_create_file_unsafe("6lowpan_enable",
+ 0644, bt_debugfs,
+ NULL,
+ &lowpan_enable_fops);
lowpan_control_debugfs = debugfs_create_file("6lowpan_control", 0644,
bt_debugfs, NULL,
&lowpan_control_fops);
@@ -1294,7 +1316,9 @@ static void __exit bt_6lowpan_exit(void)
debugfs_remove(lowpan_control_debugfs);
if (listen_chan) {
+ l2cap_chan_lock(listen_chan);
l2cap_chan_close(listen_chan, 0);
+ l2cap_chan_unlock(listen_chan);
l2cap_chan_put(listen_chan);
}
diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig
index db82a40875e8..6b2b65a66700 100644
--- a/net/bluetooth/Kconfig
+++ b/net/bluetooth/Kconfig
@@ -1,15 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Bluetooth subsystem configuration
#
menuconfig BT
tristate "Bluetooth subsystem support"
- depends on NET && !S390
+ depends on !S390
depends on RFKILL || !RFKILL
select CRC16
select CRYPTO
- select CRYPTO_BLKCIPHER
- select CRYPTO_AES
+ select CRYPTO_SKCIPHER
+ select CRYPTO_LIB_AES
+ imply CRYPTO_AES
select CRYPTO_CMAC
select CRYPTO_ECB
select CRYPTO_SHA256
@@ -19,7 +21,7 @@ menuconfig BT
It was designed as a replacement for cables and other short-range
technologies like IrDA. Bluetooth operates in personal area range
that typically extends up to 10 meters. More information about
- Bluetooth can be found at <http://www.bluetooth.com/>.
+ Bluetooth can be found at <https://www.bluetooth.com/>.
Linux Bluetooth subsystem consist of several layers:
Bluetooth Core
@@ -27,6 +29,7 @@ menuconfig BT
SCO audio links
L2CAP (Logical Link Control and Adaptation Protocol)
SMP (Security Manager Protocol) on LE (Low Energy) links
+ ISO isochronous links
HCI Device drivers (Interface to the hardware)
RFCOMM Module (RFCOMM Protocol)
BNEP Module (Bluetooth Network Encapsulation Protocol)
@@ -59,15 +62,6 @@ source "net/bluetooth/cmtp/Kconfig"
source "net/bluetooth/hidp/Kconfig"
-config BT_HS
- bool "Bluetooth High Speed (HS) features"
- depends on BT_BREDR
- default y
- help
- Bluetooth High Speed includes support for off-loading
- Bluetooth connections via 802.11 (wifi) physical layer
- available with Bluetooth version 3.0 or later.
-
config BT_LE
bool "Bluetooth Low Energy (LE) features"
depends on BT
@@ -76,6 +70,17 @@ config BT_LE
Bluetooth Low Energy includes support low-energy physical
layer available with Bluetooth version 4.0 or later.
+config BT_LE_L2CAP_ECRED
+ bool "Bluetooth L2CAP Enhanced Credit Flow Control"
+ depends on BT_LE
+ default y
+ help
+ Bluetooth Low Energy L2CAP Enhanced Credit Flow Control is available with
+ Bluetooth version 5.2 or later.
+
+ This can be overridden by passing bluetooth.enable_ecred=[1|0]
+ on the kernel commandline.
+
config BT_6LOWPAN
tristate "Bluetooth 6LoWPAN support"
depends on BT_LE && 6LOWPAN
@@ -91,6 +96,28 @@ config BT_LEDS
This option selects a few LED triggers for different
Bluetooth events.
+config BT_MSFTEXT
+ bool "Enable Microsoft extensions"
+ depends on BT
+ help
+ This option enables support for the Microsoft defined HCI
+ vendor extensions.
+
+config BT_AOSPEXT
+ bool "Enable Android Open Source Project extensions"
+ depends on BT
+ help
+ This option enables support for the Android Open Source
+ Project defined HCI vendor extensions.
+
+config BT_DEBUGFS
+ bool "Export Bluetooth internals in debugfs"
+ depends on BT && DEBUG_FS
+ default y
+ help
+ Provide extensive information about internal Bluetooth states
+ in debugfs.
+
config BT_SELFTEST
bool "Bluetooth self testing support"
depends on BT && DEBUG_KERNEL
@@ -118,12 +145,11 @@ config BT_SELFTEST_SMP
Run test cases for SMP cryptographic functionality, including both
legacy SMP as well as the Secure Connections features.
-config BT_DEBUGFS
- bool "Export Bluetooth internals in debugfs"
- depends on BT && DEBUG_FS
- default y
+config BT_FEATURE_DEBUG
+ bool "Enable runtime option for debugging statements"
+ depends on BT && !DYNAMIC_DEBUG
help
- Provide extensive information about internal Bluetooth states
- in debugfs.
+ This provides an option to enable/disable debugging statements
+ at runtime via the experimental features interface.
source "drivers/bluetooth/Kconfig"
diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile
index fda41c0b4781..a7eede7616d8 100644
--- a/net/bluetooth/Makefile
+++ b/net/bluetooth/Makefile
@@ -14,10 +14,15 @@ bluetooth_6lowpan-y := 6lowpan.o
bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \
hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \
- ecdh_helper.o hci_request.o mgmt_util.o
+ ecdh_helper.o mgmt_util.o mgmt_config.o hci_codec.o eir.o hci_sync.o \
+ hci_drv.o
+
+bluetooth-$(CONFIG_DEV_COREDUMP) += coredump.o
bluetooth-$(CONFIG_BT_BREDR) += sco.o
-bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o
+bluetooth-$(CONFIG_BT_LE) += iso.o
bluetooth-$(CONFIG_BT_LEDS) += leds.o
+bluetooth-$(CONFIG_BT_MSFTEXT) += msft.o
+bluetooth-$(CONFIG_BT_AOSPEXT) += aosp.o
bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o
bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o
diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c
deleted file mode 100644
index 51c2cf2d8923..000000000000
--- a/net/bluetooth/a2mp.c
+++ /dev/null
@@ -1,1040 +0,0 @@
-/*
- Copyright (c) 2010,2011 Code Aurora Forum. All rights reserved.
- Copyright (c) 2011,2012 Intel Corp.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 and
- only version 2 as published by the Free Software Foundation.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-*/
-
-#include <net/bluetooth/bluetooth.h>
-#include <net/bluetooth/hci_core.h>
-#include <net/bluetooth/l2cap.h>
-
-#include "hci_request.h"
-#include "a2mp.h"
-#include "amp.h"
-
-#define A2MP_FEAT_EXT 0x8000
-
-/* Global AMP Manager list */
-static LIST_HEAD(amp_mgr_list);
-static DEFINE_MUTEX(amp_mgr_list_lock);
-
-/* A2MP build & send command helper functions */
-static struct a2mp_cmd *__a2mp_build(u8 code, u8 ident, u16 len, void *data)
-{
- struct a2mp_cmd *cmd;
- int plen;
-
- plen = sizeof(*cmd) + len;
- cmd = kzalloc(plen, GFP_KERNEL);
- if (!cmd)
- return NULL;
-
- cmd->code = code;
- cmd->ident = ident;
- cmd->len = cpu_to_le16(len);
-
- memcpy(cmd->data, data, len);
-
- return cmd;
-}
-
-static void a2mp_send(struct amp_mgr *mgr, u8 code, u8 ident, u16 len, void *data)
-{
- struct l2cap_chan *chan = mgr->a2mp_chan;
- struct a2mp_cmd *cmd;
- u16 total_len = len + sizeof(*cmd);
- struct kvec iv;
- struct msghdr msg;
-
- cmd = __a2mp_build(code, ident, len, data);
- if (!cmd)
- return;
-
- iv.iov_base = cmd;
- iv.iov_len = total_len;
-
- memset(&msg, 0, sizeof(msg));
-
- iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, &iv, 1, total_len);
-
- l2cap_chan_send(chan, &msg, total_len);
-
- kfree(cmd);
-}
-
-static u8 __next_ident(struct amp_mgr *mgr)
-{
- if (++mgr->ident == 0)
- mgr->ident = 1;
-
- return mgr->ident;
-}
-
-static struct amp_mgr *amp_mgr_lookup_by_state(u8 state)
-{
- struct amp_mgr *mgr;
-
- mutex_lock(&amp_mgr_list_lock);
- list_for_each_entry(mgr, &amp_mgr_list, list) {
- if (test_and_clear_bit(state, &mgr->state)) {
- amp_mgr_get(mgr);
- mutex_unlock(&amp_mgr_list_lock);
- return mgr;
- }
- }
- mutex_unlock(&amp_mgr_list_lock);
-
- return NULL;
-}
-
-/* hci_dev_list shall be locked */
-static void __a2mp_add_cl(struct amp_mgr *mgr, struct a2mp_cl *cl)
-{
- struct hci_dev *hdev;
- int i = 1;
-
- cl[0].id = AMP_ID_BREDR;
- cl[0].type = AMP_TYPE_BREDR;
- cl[0].status = AMP_STATUS_BLUETOOTH_ONLY;
-
- list_for_each_entry(hdev, &hci_dev_list, list) {
- if (hdev->dev_type == HCI_AMP) {
- cl[i].id = hdev->id;
- cl[i].type = hdev->amp_type;
- if (test_bit(HCI_UP, &hdev->flags))
- cl[i].status = hdev->amp_status;
- else
- cl[i].status = AMP_STATUS_POWERED_DOWN;
- i++;
- }
- }
-}
-
-/* Processing A2MP messages */
-static int a2mp_command_rej(struct amp_mgr *mgr, struct sk_buff *skb,
- struct a2mp_cmd *hdr)
-{
- struct a2mp_cmd_rej *rej = (void *) skb->data;
-
- if (le16_to_cpu(hdr->len) < sizeof(*rej))
- return -EINVAL;
-
- BT_DBG("ident %d reason %d", hdr->ident, le16_to_cpu(rej->reason));
-
- skb_pull(skb, sizeof(*rej));
-
- return 0;
-}
-
-static int a2mp_discover_req(struct amp_mgr *mgr, struct sk_buff *skb,
- struct a2mp_cmd *hdr)
-{
- struct a2mp_discov_req *req = (void *) skb->data;
- u16 len = le16_to_cpu(hdr->len);
- struct a2mp_discov_rsp *rsp;
- u16 ext_feat;
- u8 num_ctrl;
- struct hci_dev *hdev;
-
- if (len < sizeof(*req))
- return -EINVAL;
-
- skb_pull(skb, sizeof(*req));
-
- ext_feat = le16_to_cpu(req->ext_feat);
-
- BT_DBG("mtu %d efm 0x%4.4x", le16_to_cpu(req->mtu), ext_feat);
-
- /* check that packet is not broken for now */
- while (ext_feat & A2MP_FEAT_EXT) {
- if (len < sizeof(ext_feat))
- return -EINVAL;
-
- ext_feat = get_unaligned_le16(skb->data);
- BT_DBG("efm 0x%4.4x", ext_feat);
- len -= sizeof(ext_feat);
- skb_pull(skb, sizeof(ext_feat));
- }
-
- read_lock(&hci_dev_list_lock);
-
- /* at minimum the BR/EDR needs to be listed */
- num_ctrl = 1;
-
- list_for_each_entry(hdev, &hci_dev_list, list) {
- if (hdev->dev_type == HCI_AMP)
- num_ctrl++;
- }
-
- len = num_ctrl * sizeof(struct a2mp_cl) + sizeof(*rsp);
- rsp = kmalloc(len, GFP_ATOMIC);
- if (!rsp) {
- read_unlock(&hci_dev_list_lock);
- return -ENOMEM;
- }
-
- rsp->mtu = cpu_to_le16(L2CAP_A2MP_DEFAULT_MTU);
- rsp->ext_feat = 0;
-
- __a2mp_add_cl(mgr, rsp->cl);
-
- read_unlock(&hci_dev_list_lock);
-
- a2mp_send(mgr, A2MP_DISCOVER_RSP, hdr->ident, len, rsp);
-
- kfree(rsp);
- return 0;
-}
-
-static int a2mp_discover_rsp(struct amp_mgr *mgr, struct sk_buff *skb,
- struct a2mp_cmd *hdr)
-{
- struct a2mp_discov_rsp *rsp = (void *) skb->data;
- u16 len = le16_to_cpu(hdr->len);
- struct a2mp_cl *cl;
- u16 ext_feat;
- bool found = false;
-
- if (len < sizeof(*rsp))
- return -EINVAL;
-
- len -= sizeof(*rsp);
- skb_pull(skb, sizeof(*rsp));
-
- ext_feat = le16_to_cpu(rsp->ext_feat);
-
- BT_DBG("mtu %d efm 0x%4.4x", le16_to_cpu(rsp->mtu), ext_feat);
-
- /* check that packet is not broken for now */
- while (ext_feat & A2MP_FEAT_EXT) {
- if (len < sizeof(ext_feat))
- return -EINVAL;
-
- ext_feat = get_unaligned_le16(skb->data);
- BT_DBG("efm 0x%4.4x", ext_feat);
- len -= sizeof(ext_feat);
- skb_pull(skb, sizeof(ext_feat));
- }
-
- cl = (void *) skb->data;
- while (len >= sizeof(*cl)) {
- BT_DBG("Remote AMP id %d type %d status %d", cl->id, cl->type,
- cl->status);
-
- if (cl->id != AMP_ID_BREDR && cl->type != AMP_TYPE_BREDR) {
- struct a2mp_info_req req;
-
- found = true;
- req.id = cl->id;
- a2mp_send(mgr, A2MP_GETINFO_REQ, __next_ident(mgr),
- sizeof(req), &req);
- }
-
- len -= sizeof(*cl);
- cl = skb_pull(skb, sizeof(*cl));
- }
-
- /* Fall back to L2CAP init sequence */
- if (!found) {
- struct l2cap_conn *conn = mgr->l2cap_conn;
- struct l2cap_chan *chan;
-
- mutex_lock(&conn->chan_lock);
-
- list_for_each_entry(chan, &conn->chan_l, list) {
-
- BT_DBG("chan %p state %s", chan,
- state_to_string(chan->state));
-
- if (chan->scid == L2CAP_CID_A2MP)
- continue;
-
- l2cap_chan_lock(chan);
-
- if (chan->state == BT_CONNECT)
- l2cap_send_conn_req(chan);
-
- l2cap_chan_unlock(chan);
- }
-
- mutex_unlock(&conn->chan_lock);
- }
-
- return 0;
-}
-
-static int a2mp_change_notify(struct amp_mgr *mgr, struct sk_buff *skb,
- struct a2mp_cmd *hdr)
-{
- struct a2mp_cl *cl = (void *) skb->data;
-
- while (skb->len >= sizeof(*cl)) {
- BT_DBG("Controller id %d type %d status %d", cl->id, cl->type,
- cl->status);
- cl = skb_pull(skb, sizeof(*cl));
- }
-
- /* TODO send A2MP_CHANGE_RSP */
-
- return 0;
-}
-
-static void read_local_amp_info_complete(struct hci_dev *hdev, u8 status,
- u16 opcode)
-{
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
-
- a2mp_send_getinfo_rsp(hdev);
-}
-
-static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb,
- struct a2mp_cmd *hdr)
-{
- struct a2mp_info_req *req = (void *) skb->data;
- struct hci_dev *hdev;
- struct hci_request hreq;
- int err = 0;
-
- if (le16_to_cpu(hdr->len) < sizeof(*req))
- return -EINVAL;
-
- BT_DBG("id %d", req->id);
-
- hdev = hci_dev_get(req->id);
- if (!hdev || hdev->dev_type != HCI_AMP) {
- struct a2mp_info_rsp rsp;
-
- rsp.id = req->id;
- rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
-
- a2mp_send(mgr, A2MP_GETINFO_RSP, hdr->ident, sizeof(rsp),
- &rsp);
-
- goto done;
- }
-
- set_bit(READ_LOC_AMP_INFO, &mgr->state);
- hci_req_init(&hreq, hdev);
- hci_req_add(&hreq, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL);
- err = hci_req_run(&hreq, read_local_amp_info_complete);
- if (err < 0)
- a2mp_send_getinfo_rsp(hdev);
-
-done:
- if (hdev)
- hci_dev_put(hdev);
-
- skb_pull(skb, sizeof(*req));
- return 0;
-}
-
-static int a2mp_getinfo_rsp(struct amp_mgr *mgr, struct sk_buff *skb,
- struct a2mp_cmd *hdr)
-{
- struct a2mp_info_rsp *rsp = (struct a2mp_info_rsp *) skb->data;
- struct a2mp_amp_assoc_req req;
- struct amp_ctrl *ctrl;
-
- if (le16_to_cpu(hdr->len) < sizeof(*rsp))
- return -EINVAL;
-
- BT_DBG("id %d status 0x%2.2x", rsp->id, rsp->status);
-
- if (rsp->status)
- return -EINVAL;
-
- ctrl = amp_ctrl_add(mgr, rsp->id);
- if (!ctrl)
- return -ENOMEM;
-
- req.id = rsp->id;
- a2mp_send(mgr, A2MP_GETAMPASSOC_REQ, __next_ident(mgr), sizeof(req),
- &req);
-
- skb_pull(skb, sizeof(*rsp));
- return 0;
-}
-
-static int a2mp_getampassoc_req(struct amp_mgr *mgr, struct sk_buff *skb,
- struct a2mp_cmd *hdr)
-{
- struct a2mp_amp_assoc_req *req = (void *) skb->data;
- struct hci_dev *hdev;
- struct amp_mgr *tmp;
-
- if (le16_to_cpu(hdr->len) < sizeof(*req))
- return -EINVAL;
-
- BT_DBG("id %d", req->id);
-
- /* Make sure that other request is not processed */
- tmp = amp_mgr_lookup_by_state(READ_LOC_AMP_ASSOC);
-
- hdev = hci_dev_get(req->id);
- if (!hdev || hdev->amp_type == AMP_TYPE_BREDR || tmp) {
- struct a2mp_amp_assoc_rsp rsp;
- rsp.id = req->id;
-
- if (tmp) {
- rsp.status = A2MP_STATUS_COLLISION_OCCURED;
- amp_mgr_put(tmp);
- } else {
- rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
- }
-
- a2mp_send(mgr, A2MP_GETAMPASSOC_RSP, hdr->ident, sizeof(rsp),
- &rsp);
-
- goto done;
- }
-
- amp_read_loc_assoc(hdev, mgr);
-
-done:
- if (hdev)
- hci_dev_put(hdev);
-
- skb_pull(skb, sizeof(*req));
- return 0;
-}
-
-static int a2mp_getampassoc_rsp(struct amp_mgr *mgr, struct sk_buff *skb,
- struct a2mp_cmd *hdr)
-{
- struct a2mp_amp_assoc_rsp *rsp = (void *) skb->data;
- u16 len = le16_to_cpu(hdr->len);
- struct hci_dev *hdev;
- struct amp_ctrl *ctrl;
- struct hci_conn *hcon;
- size_t assoc_len;
-
- if (len < sizeof(*rsp))
- return -EINVAL;
-
- assoc_len = len - sizeof(*rsp);
-
- BT_DBG("id %d status 0x%2.2x assoc len %zu", rsp->id, rsp->status,
- assoc_len);
-
- if (rsp->status)
- return -EINVAL;
-
- /* Save remote ASSOC data */
- ctrl = amp_ctrl_lookup(mgr, rsp->id);
- if (ctrl) {
- u8 *assoc;
-
- assoc = kmemdup(rsp->amp_assoc, assoc_len, GFP_KERNEL);
- if (!assoc) {
- amp_ctrl_put(ctrl);
- return -ENOMEM;
- }
-
- ctrl->assoc = assoc;
- ctrl->assoc_len = assoc_len;
- ctrl->assoc_rem_len = assoc_len;
- ctrl->assoc_len_so_far = 0;
-
- amp_ctrl_put(ctrl);
- }
-
- /* Create Phys Link */
- hdev = hci_dev_get(rsp->id);
- if (!hdev)
- return -EINVAL;
-
- hcon = phylink_add(hdev, mgr, rsp->id, true);
- if (!hcon)
- goto done;
-
- BT_DBG("Created hcon %p: loc:%d -> rem:%d", hcon, hdev->id, rsp->id);
-
- mgr->bredr_chan->remote_amp_id = rsp->id;
-
- amp_create_phylink(hdev, mgr, hcon);
-
-done:
- hci_dev_put(hdev);
- skb_pull(skb, len);
- return 0;
-}
-
-static int a2mp_createphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb,
- struct a2mp_cmd *hdr)
-{
- struct a2mp_physlink_req *req = (void *) skb->data;
-
- struct a2mp_physlink_rsp rsp;
- struct hci_dev *hdev;
- struct hci_conn *hcon;
- struct amp_ctrl *ctrl;
-
- if (le16_to_cpu(hdr->len) < sizeof(*req))
- return -EINVAL;
-
- BT_DBG("local_id %d, remote_id %d", req->local_id, req->remote_id);
-
- rsp.local_id = req->remote_id;
- rsp.remote_id = req->local_id;
-
- hdev = hci_dev_get(req->remote_id);
- if (!hdev || hdev->amp_type == AMP_TYPE_BREDR) {
- rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
- goto send_rsp;
- }
-
- ctrl = amp_ctrl_lookup(mgr, rsp.remote_id);
- if (!ctrl) {
- ctrl = amp_ctrl_add(mgr, rsp.remote_id);
- if (ctrl) {
- amp_ctrl_get(ctrl);
- } else {
- rsp.status = A2MP_STATUS_UNABLE_START_LINK_CREATION;
- goto send_rsp;
- }
- }
-
- if (ctrl) {
- size_t assoc_len = le16_to_cpu(hdr->len) - sizeof(*req);
- u8 *assoc;
-
- assoc = kmemdup(req->amp_assoc, assoc_len, GFP_KERNEL);
- if (!assoc) {
- amp_ctrl_put(ctrl);
- return -ENOMEM;
- }
-
- ctrl->assoc = assoc;
- ctrl->assoc_len = assoc_len;
- ctrl->assoc_rem_len = assoc_len;
- ctrl->assoc_len_so_far = 0;
-
- amp_ctrl_put(ctrl);
- }
-
- hcon = phylink_add(hdev, mgr, req->local_id, false);
- if (hcon) {
- amp_accept_phylink(hdev, mgr, hcon);
- rsp.status = A2MP_STATUS_SUCCESS;
- } else {
- rsp.status = A2MP_STATUS_UNABLE_START_LINK_CREATION;
- }
-
-send_rsp:
- if (hdev)
- hci_dev_put(hdev);
-
- /* Reply error now and success after HCI Write Remote AMP Assoc
- command complete with success status
- */
- if (rsp.status != A2MP_STATUS_SUCCESS) {
- a2mp_send(mgr, A2MP_CREATEPHYSLINK_RSP, hdr->ident,
- sizeof(rsp), &rsp);
- } else {
- set_bit(WRITE_REMOTE_AMP_ASSOC, &mgr->state);
- mgr->ident = hdr->ident;
- }
-
- skb_pull(skb, le16_to_cpu(hdr->len));
- return 0;
-}
-
-static int a2mp_discphyslink_req(struct amp_mgr *mgr, struct sk_buff *skb,
- struct a2mp_cmd *hdr)
-{
- struct a2mp_physlink_req *req = (void *) skb->data;
- struct a2mp_physlink_rsp rsp;
- struct hci_dev *hdev;
- struct hci_conn *hcon;
-
- if (le16_to_cpu(hdr->len) < sizeof(*req))
- return -EINVAL;
-
- BT_DBG("local_id %d remote_id %d", req->local_id, req->remote_id);
-
- rsp.local_id = req->remote_id;
- rsp.remote_id = req->local_id;
- rsp.status = A2MP_STATUS_SUCCESS;
-
- hdev = hci_dev_get(req->remote_id);
- if (!hdev) {
- rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
- goto send_rsp;
- }
-
- hcon = hci_conn_hash_lookup_ba(hdev, AMP_LINK,
- &mgr->l2cap_conn->hcon->dst);
- if (!hcon) {
- bt_dev_err(hdev, "no phys link exist");
- rsp.status = A2MP_STATUS_NO_PHYSICAL_LINK_EXISTS;
- goto clean;
- }
-
- /* TODO Disconnect Phys Link here */
-
-clean:
- hci_dev_put(hdev);
-
-send_rsp:
- a2mp_send(mgr, A2MP_DISCONNPHYSLINK_RSP, hdr->ident, sizeof(rsp), &rsp);
-
- skb_pull(skb, sizeof(*req));
- return 0;
-}
-
-static inline int a2mp_cmd_rsp(struct amp_mgr *mgr, struct sk_buff *skb,
- struct a2mp_cmd *hdr)
-{
- BT_DBG("ident %d code 0x%2.2x", hdr->ident, hdr->code);
-
- skb_pull(skb, le16_to_cpu(hdr->len));
- return 0;
-}
-
-/* Handle A2MP signalling */
-static int a2mp_chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
-{
- struct a2mp_cmd *hdr;
- struct amp_mgr *mgr = chan->data;
- int err = 0;
-
- amp_mgr_get(mgr);
-
- while (skb->len >= sizeof(*hdr)) {
- u16 len;
-
- hdr = (void *) skb->data;
- len = le16_to_cpu(hdr->len);
-
- BT_DBG("code 0x%2.2x id %d len %u", hdr->code, hdr->ident, len);
-
- skb_pull(skb, sizeof(*hdr));
-
- if (len > skb->len || !hdr->ident) {
- err = -EINVAL;
- break;
- }
-
- mgr->ident = hdr->ident;
-
- switch (hdr->code) {
- case A2MP_COMMAND_REJ:
- a2mp_command_rej(mgr, skb, hdr);
- break;
-
- case A2MP_DISCOVER_REQ:
- err = a2mp_discover_req(mgr, skb, hdr);
- break;
-
- case A2MP_CHANGE_NOTIFY:
- err = a2mp_change_notify(mgr, skb, hdr);
- break;
-
- case A2MP_GETINFO_REQ:
- err = a2mp_getinfo_req(mgr, skb, hdr);
- break;
-
- case A2MP_GETAMPASSOC_REQ:
- err = a2mp_getampassoc_req(mgr, skb, hdr);
- break;
-
- case A2MP_CREATEPHYSLINK_REQ:
- err = a2mp_createphyslink_req(mgr, skb, hdr);
- break;
-
- case A2MP_DISCONNPHYSLINK_REQ:
- err = a2mp_discphyslink_req(mgr, skb, hdr);
- break;
-
- case A2MP_DISCOVER_RSP:
- err = a2mp_discover_rsp(mgr, skb, hdr);
- break;
-
- case A2MP_GETINFO_RSP:
- err = a2mp_getinfo_rsp(mgr, skb, hdr);
- break;
-
- case A2MP_GETAMPASSOC_RSP:
- err = a2mp_getampassoc_rsp(mgr, skb, hdr);
- break;
-
- case A2MP_CHANGE_RSP:
- case A2MP_CREATEPHYSLINK_RSP:
- case A2MP_DISCONNPHYSLINK_RSP:
- err = a2mp_cmd_rsp(mgr, skb, hdr);
- break;
-
- default:
- BT_ERR("Unknown A2MP sig cmd 0x%2.2x", hdr->code);
- err = -EINVAL;
- break;
- }
- }
-
- if (err) {
- struct a2mp_cmd_rej rej;
-
- rej.reason = cpu_to_le16(0);
- hdr = (void *) skb->data;
-
- BT_DBG("Send A2MP Rej: cmd 0x%2.2x err %d", hdr->code, err);
-
- a2mp_send(mgr, A2MP_COMMAND_REJ, hdr->ident, sizeof(rej),
- &rej);
- }
-
- /* Always free skb and return success error code to prevent
- from sending L2CAP Disconnect over A2MP channel */
- kfree_skb(skb);
-
- amp_mgr_put(mgr);
-
- return 0;
-}
-
-static void a2mp_chan_close_cb(struct l2cap_chan *chan)
-{
- l2cap_chan_put(chan);
-}
-
-static void a2mp_chan_state_change_cb(struct l2cap_chan *chan, int state,
- int err)
-{
- struct amp_mgr *mgr = chan->data;
-
- if (!mgr)
- return;
-
- BT_DBG("chan %p state %s", chan, state_to_string(state));
-
- chan->state = state;
-
- switch (state) {
- case BT_CLOSED:
- if (mgr)
- amp_mgr_put(mgr);
- break;
- }
-}
-
-static struct sk_buff *a2mp_chan_alloc_skb_cb(struct l2cap_chan *chan,
- unsigned long hdr_len,
- unsigned long len, int nb)
-{
- struct sk_buff *skb;
-
- skb = bt_skb_alloc(hdr_len + len, GFP_KERNEL);
- if (!skb)
- return ERR_PTR(-ENOMEM);
-
- return skb;
-}
-
-static const struct l2cap_ops a2mp_chan_ops = {
- .name = "L2CAP A2MP channel",
- .recv = a2mp_chan_recv_cb,
- .close = a2mp_chan_close_cb,
- .state_change = a2mp_chan_state_change_cb,
- .alloc_skb = a2mp_chan_alloc_skb_cb,
-
- /* Not implemented for A2MP */
- .new_connection = l2cap_chan_no_new_connection,
- .teardown = l2cap_chan_no_teardown,
- .ready = l2cap_chan_no_ready,
- .defer = l2cap_chan_no_defer,
- .resume = l2cap_chan_no_resume,
- .set_shutdown = l2cap_chan_no_set_shutdown,
- .get_sndtimeo = l2cap_chan_no_get_sndtimeo,
-};
-
-static struct l2cap_chan *a2mp_chan_open(struct l2cap_conn *conn, bool locked)
-{
- struct l2cap_chan *chan;
- int err;
-
- chan = l2cap_chan_create();
- if (!chan)
- return NULL;
-
- BT_DBG("chan %p", chan);
-
- chan->chan_type = L2CAP_CHAN_FIXED;
- chan->scid = L2CAP_CID_A2MP;
- chan->dcid = L2CAP_CID_A2MP;
- chan->omtu = L2CAP_A2MP_DEFAULT_MTU;
- chan->imtu = L2CAP_A2MP_DEFAULT_MTU;
- chan->flush_to = L2CAP_DEFAULT_FLUSH_TO;
-
- chan->ops = &a2mp_chan_ops;
-
- l2cap_chan_set_defaults(chan);
- chan->remote_max_tx = chan->max_tx;
- chan->remote_tx_win = chan->tx_win;
-
- chan->retrans_timeout = L2CAP_DEFAULT_RETRANS_TO;
- chan->monitor_timeout = L2CAP_DEFAULT_MONITOR_TO;
-
- skb_queue_head_init(&chan->tx_q);
-
- chan->mode = L2CAP_MODE_ERTM;
-
- err = l2cap_ertm_init(chan);
- if (err < 0) {
- l2cap_chan_del(chan, 0);
- return NULL;
- }
-
- chan->conf_state = 0;
-
- if (locked)
- __l2cap_chan_add(conn, chan);
- else
- l2cap_chan_add(conn, chan);
-
- chan->remote_mps = chan->omtu;
- chan->mps = chan->omtu;
-
- chan->state = BT_CONNECTED;
-
- return chan;
-}
-
-/* AMP Manager functions */
-struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr)
-{
- BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
-
- kref_get(&mgr->kref);
-
- return mgr;
-}
-
-static void amp_mgr_destroy(struct kref *kref)
-{
- struct amp_mgr *mgr = container_of(kref, struct amp_mgr, kref);
-
- BT_DBG("mgr %p", mgr);
-
- mutex_lock(&amp_mgr_list_lock);
- list_del(&mgr->list);
- mutex_unlock(&amp_mgr_list_lock);
-
- amp_ctrl_list_flush(mgr);
- kfree(mgr);
-}
-
-int amp_mgr_put(struct amp_mgr *mgr)
-{
- BT_DBG("mgr %p orig refcnt %d", mgr, kref_read(&mgr->kref));
-
- return kref_put(&mgr->kref, &amp_mgr_destroy);
-}
-
-static struct amp_mgr *amp_mgr_create(struct l2cap_conn *conn, bool locked)
-{
- struct amp_mgr *mgr;
- struct l2cap_chan *chan;
-
- mgr = kzalloc(sizeof(*mgr), GFP_KERNEL);
- if (!mgr)
- return NULL;
-
- BT_DBG("conn %p mgr %p", conn, mgr);
-
- mgr->l2cap_conn = conn;
-
- chan = a2mp_chan_open(conn, locked);
- if (!chan) {
- kfree(mgr);
- return NULL;
- }
-
- mgr->a2mp_chan = chan;
- chan->data = mgr;
-
- conn->hcon->amp_mgr = mgr;
-
- kref_init(&mgr->kref);
-
- /* Remote AMP ctrl list initialization */
- INIT_LIST_HEAD(&mgr->amp_ctrls);
- mutex_init(&mgr->amp_ctrls_lock);
-
- mutex_lock(&amp_mgr_list_lock);
- list_add(&mgr->list, &amp_mgr_list);
- mutex_unlock(&amp_mgr_list_lock);
-
- return mgr;
-}
-
-struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn,
- struct sk_buff *skb)
-{
- struct amp_mgr *mgr;
-
- if (conn->hcon->type != ACL_LINK)
- return NULL;
-
- mgr = amp_mgr_create(conn, false);
- if (!mgr) {
- BT_ERR("Could not create AMP manager");
- return NULL;
- }
-
- BT_DBG("mgr: %p chan %p", mgr, mgr->a2mp_chan);
-
- return mgr->a2mp_chan;
-}
-
-void a2mp_send_getinfo_rsp(struct hci_dev *hdev)
-{
- struct amp_mgr *mgr;
- struct a2mp_info_rsp rsp;
-
- mgr = amp_mgr_lookup_by_state(READ_LOC_AMP_INFO);
- if (!mgr)
- return;
-
- BT_DBG("%s mgr %p", hdev->name, mgr);
-
- rsp.id = hdev->id;
- rsp.status = A2MP_STATUS_INVALID_CTRL_ID;
-
- if (hdev->amp_type != AMP_TYPE_BREDR) {
- rsp.status = 0;
- rsp.total_bw = cpu_to_le32(hdev->amp_total_bw);
- rsp.max_bw = cpu_to_le32(hdev->amp_max_bw);
- rsp.min_latency = cpu_to_le32(hdev->amp_min_latency);
- rsp.pal_cap = cpu_to_le16(hdev->amp_pal_cap);
- rsp.assoc_size = cpu_to_le16(hdev->amp_assoc_size);
- }
-
- a2mp_send(mgr, A2MP_GETINFO_RSP, mgr->ident, sizeof(rsp), &rsp);
- amp_mgr_put(mgr);
-}
-
-void a2mp_send_getampassoc_rsp(struct hci_dev *hdev, u8 status)
-{
- struct amp_mgr *mgr;
- struct amp_assoc *loc_assoc = &hdev->loc_assoc;
- struct a2mp_amp_assoc_rsp *rsp;
- size_t len;
-
- mgr = amp_mgr_lookup_by_state(READ_LOC_AMP_ASSOC);
- if (!mgr)
- return;
-
- BT_DBG("%s mgr %p", hdev->name, mgr);
-
- len = sizeof(struct a2mp_amp_assoc_rsp) + loc_assoc->len;
- rsp = kzalloc(len, GFP_KERNEL);
- if (!rsp) {
- amp_mgr_put(mgr);
- return;
- }
-
- rsp->id = hdev->id;
-
- if (status) {
- rsp->status = A2MP_STATUS_INVALID_CTRL_ID;
- } else {
- rsp->status = A2MP_STATUS_SUCCESS;
- memcpy(rsp->amp_assoc, loc_assoc->data, loc_assoc->len);
- }
-
- a2mp_send(mgr, A2MP_GETAMPASSOC_RSP, mgr->ident, len, rsp);
- amp_mgr_put(mgr);
- kfree(rsp);
-}
-
-void a2mp_send_create_phy_link_req(struct hci_dev *hdev, u8 status)
-{
- struct amp_mgr *mgr;
- struct amp_assoc *loc_assoc = &hdev->loc_assoc;
- struct a2mp_physlink_req *req;
- struct l2cap_chan *bredr_chan;
- size_t len;
-
- mgr = amp_mgr_lookup_by_state(READ_LOC_AMP_ASSOC_FINAL);
- if (!mgr)
- return;
-
- len = sizeof(*req) + loc_assoc->len;
-
- BT_DBG("%s mgr %p assoc_len %zu", hdev->name, mgr, len);
-
- req = kzalloc(len, GFP_KERNEL);
- if (!req) {
- amp_mgr_put(mgr);
- return;
- }
-
- bredr_chan = mgr->bredr_chan;
- if (!bredr_chan)
- goto clean;
-
- req->local_id = hdev->id;
- req->remote_id = bredr_chan->remote_amp_id;
- memcpy(req->amp_assoc, loc_assoc->data, loc_assoc->len);
-
- a2mp_send(mgr, A2MP_CREATEPHYSLINK_REQ, __next_ident(mgr), len, req);
-
-clean:
- amp_mgr_put(mgr);
- kfree(req);
-}
-
-void a2mp_send_create_phy_link_rsp(struct hci_dev *hdev, u8 status)
-{
- struct amp_mgr *mgr;
- struct a2mp_physlink_rsp rsp;
- struct hci_conn *hs_hcon;
-
- mgr = amp_mgr_lookup_by_state(WRITE_REMOTE_AMP_ASSOC);
- if (!mgr)
- return;
-
- hs_hcon = hci_conn_hash_lookup_state(hdev, AMP_LINK, BT_CONNECT);
- if (!hs_hcon) {
- rsp.status = A2MP_STATUS_UNABLE_START_LINK_CREATION;
- } else {
- rsp.remote_id = hs_hcon->remote_id;
- rsp.status = A2MP_STATUS_SUCCESS;
- }
-
- BT_DBG("%s mgr %p hs_hcon %p status %u", hdev->name, mgr, hs_hcon,
- status);
-
- rsp.local_id = hdev->id;
- a2mp_send(mgr, A2MP_CREATEPHYSLINK_RSP, mgr->ident, sizeof(rsp), &rsp);
- amp_mgr_put(mgr);
-}
-
-void a2mp_discover_amp(struct l2cap_chan *chan)
-{
- struct l2cap_conn *conn = chan->conn;
- struct amp_mgr *mgr = conn->hcon->amp_mgr;
- struct a2mp_discov_req req;
-
- BT_DBG("chan %p conn %p mgr %p", chan, conn, mgr);
-
- if (!mgr) {
- mgr = amp_mgr_create(conn, true);
- if (!mgr)
- return;
- }
-
- mgr->bredr_chan = chan;
-
- req.mtu = cpu_to_le16(L2CAP_A2MP_DEFAULT_MTU);
- req.ext_feat = 0;
- a2mp_send(mgr, A2MP_DISCOVER_REQ, 1, sizeof(req), &req);
-}
diff --git a/net/bluetooth/a2mp.h b/net/bluetooth/a2mp.h
deleted file mode 100644
index a4ff3ea9b38a..000000000000
--- a/net/bluetooth/a2mp.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- Copyright (c) 2010,2011 Code Aurora Forum. All rights reserved.
- Copyright (c) 2011,2012 Intel Corp.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 and
- only version 2 as published by the Free Software Foundation.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-*/
-
-#ifndef __A2MP_H
-#define __A2MP_H
-
-#include <net/bluetooth/l2cap.h>
-
-enum amp_mgr_state {
- READ_LOC_AMP_INFO,
- READ_LOC_AMP_ASSOC,
- READ_LOC_AMP_ASSOC_FINAL,
- WRITE_REMOTE_AMP_ASSOC,
-};
-
-struct amp_mgr {
- struct list_head list;
- struct l2cap_conn *l2cap_conn;
- struct l2cap_chan *a2mp_chan;
- struct l2cap_chan *bredr_chan;
- struct kref kref;
- __u8 ident;
- __u8 handle;
- unsigned long state;
- unsigned long flags;
-
- struct list_head amp_ctrls;
- struct mutex amp_ctrls_lock;
-};
-
-struct a2mp_cmd {
- __u8 code;
- __u8 ident;
- __le16 len;
- __u8 data[0];
-} __packed;
-
-/* A2MP command codes */
-#define A2MP_COMMAND_REJ 0x01
-struct a2mp_cmd_rej {
- __le16 reason;
- __u8 data[0];
-} __packed;
-
-#define A2MP_DISCOVER_REQ 0x02
-struct a2mp_discov_req {
- __le16 mtu;
- __le16 ext_feat;
-} __packed;
-
-struct a2mp_cl {
- __u8 id;
- __u8 type;
- __u8 status;
-} __packed;
-
-#define A2MP_DISCOVER_RSP 0x03
-struct a2mp_discov_rsp {
- __le16 mtu;
- __le16 ext_feat;
- struct a2mp_cl cl[0];
-} __packed;
-
-#define A2MP_CHANGE_NOTIFY 0x04
-#define A2MP_CHANGE_RSP 0x05
-
-#define A2MP_GETINFO_REQ 0x06
-struct a2mp_info_req {
- __u8 id;
-} __packed;
-
-#define A2MP_GETINFO_RSP 0x07
-struct a2mp_info_rsp {
- __u8 id;
- __u8 status;
- __le32 total_bw;
- __le32 max_bw;
- __le32 min_latency;
- __le16 pal_cap;
- __le16 assoc_size;
-} __packed;
-
-#define A2MP_GETAMPASSOC_REQ 0x08
-struct a2mp_amp_assoc_req {
- __u8 id;
-} __packed;
-
-#define A2MP_GETAMPASSOC_RSP 0x09
-struct a2mp_amp_assoc_rsp {
- __u8 id;
- __u8 status;
- __u8 amp_assoc[0];
-} __packed;
-
-#define A2MP_CREATEPHYSLINK_REQ 0x0A
-#define A2MP_DISCONNPHYSLINK_REQ 0x0C
-struct a2mp_physlink_req {
- __u8 local_id;
- __u8 remote_id;
- __u8 amp_assoc[0];
-} __packed;
-
-#define A2MP_CREATEPHYSLINK_RSP 0x0B
-#define A2MP_DISCONNPHYSLINK_RSP 0x0D
-struct a2mp_physlink_rsp {
- __u8 local_id;
- __u8 remote_id;
- __u8 status;
-} __packed;
-
-/* A2MP response status */
-#define A2MP_STATUS_SUCCESS 0x00
-#define A2MP_STATUS_INVALID_CTRL_ID 0x01
-#define A2MP_STATUS_UNABLE_START_LINK_CREATION 0x02
-#define A2MP_STATUS_NO_PHYSICAL_LINK_EXISTS 0x02
-#define A2MP_STATUS_COLLISION_OCCURED 0x03
-#define A2MP_STATUS_DISCONN_REQ_RECVD 0x04
-#define A2MP_STATUS_PHYS_LINK_EXISTS 0x05
-#define A2MP_STATUS_SECURITY_VIOLATION 0x06
-
-struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr);
-
-#if IS_ENABLED(CONFIG_BT_HS)
-int amp_mgr_put(struct amp_mgr *mgr);
-struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn,
- struct sk_buff *skb);
-void a2mp_discover_amp(struct l2cap_chan *chan);
-#else
-static inline int amp_mgr_put(struct amp_mgr *mgr)
-{
- return 0;
-}
-
-static inline struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn,
- struct sk_buff *skb)
-{
- return NULL;
-}
-
-static inline void a2mp_discover_amp(struct l2cap_chan *chan)
-{
-}
-#endif
-
-void a2mp_send_getinfo_rsp(struct hci_dev *hdev);
-void a2mp_send_getampassoc_rsp(struct hci_dev *hdev, u8 status);
-void a2mp_send_create_phy_link_req(struct hci_dev *hdev, u8 status);
-void a2mp_send_create_phy_link_rsp(struct hci_dev *hdev, u8 status);
-
-#endif /* __A2MP_H */
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index deacc52d7ff1..2b94e2077203 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -34,11 +34,14 @@
#include <net/bluetooth/bluetooth.h>
#include <linux/proc_fs.h>
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+
#include "leds.h"
#include "selftest.h"
/* Bluetooth sockets */
-#define BT_MAX_PROTO 8
+#define BT_MAX_PROTO (BTPROTO_LAST + 1)
static const struct net_proto_family *bt_proto[BT_MAX_PROTO];
static DEFINE_RWLOCK(bt_proto_lock);
@@ -52,6 +55,7 @@ static const char *const bt_key_strings[BT_MAX_PROTO] = {
"sk_lock-AF_BLUETOOTH-BTPROTO_CMTP",
"sk_lock-AF_BLUETOOTH-BTPROTO_HIDP",
"sk_lock-AF_BLUETOOTH-BTPROTO_AVDTP",
+ "sk_lock-AF_BLUETOOTH-BTPROTO_ISO",
};
static struct lock_class_key bt_slock_key[BT_MAX_PROTO];
@@ -64,6 +68,7 @@ static const char *const bt_slock_key_strings[BT_MAX_PROTO] = {
"slock-AF_BLUETOOTH-BTPROTO_CMTP",
"slock-AF_BLUETOOTH-BTPROTO_HIDP",
"slock-AF_BLUETOOTH-BTPROTO_AVDTP",
+ "slock-AF_BLUETOOTH-BTPROTO_ISO",
};
void bt_sock_reclassify_lock(struct sock *sk, int proto)
@@ -72,8 +77,8 @@ void bt_sock_reclassify_lock(struct sock *sk, int proto)
BUG_ON(!sock_allow_reclassification(sk));
sock_lock_init_class_and_name(sk,
- bt_slock_key_strings[proto], &bt_slock_key[proto],
- bt_key_strings[proto], &bt_lock_key[proto]);
+ bt_slock_key_strings[proto], &bt_slock_key[proto],
+ bt_key_strings[proto], &bt_lock_key[proto]);
}
EXPORT_SYMBOL(bt_sock_reclassify_lock);
@@ -138,6 +143,35 @@ static int bt_sock_create(struct net *net, struct socket *sock, int proto,
return err;
}
+struct sock *bt_sock_alloc(struct net *net, struct socket *sock,
+ struct proto *prot, int proto, gfp_t prio, int kern)
+{
+ struct sock *sk;
+
+ sk = sk_alloc(net, PF_BLUETOOTH, prio, prot, kern);
+ if (!sk)
+ return NULL;
+
+ sock_init_data(sock, sk);
+ INIT_LIST_HEAD(&bt_sk(sk)->accept_q);
+
+ sock_reset_flag(sk, SOCK_ZAPPED);
+
+ sk->sk_protocol = proto;
+ sk->sk_state = BT_OPEN;
+
+ /* Init peer information so it can be properly monitored */
+ if (!kern) {
+ spin_lock(&sk->sk_peer_lock);
+ sk->sk_peer_pid = get_pid(task_tgid(current));
+ sk->sk_peer_cred = get_current_cred();
+ spin_unlock(&sk->sk_peer_lock);
+ }
+
+ return sk;
+}
+EXPORT_SYMBOL(bt_sock_alloc);
+
void bt_sock_link(struct bt_sock_list *l, struct sock *sk)
{
write_lock(&l->lock);
@@ -154,16 +188,64 @@ void bt_sock_unlink(struct bt_sock_list *l, struct sock *sk)
}
EXPORT_SYMBOL(bt_sock_unlink);
-void bt_accept_enqueue(struct sock *parent, struct sock *sk)
+bool bt_sock_linked(struct bt_sock_list *l, struct sock *s)
{
+ struct sock *sk;
+
+ if (!l || !s)
+ return false;
+
+ read_lock(&l->lock);
+
+ sk_for_each(sk, &l->head) {
+ if (s == sk) {
+ read_unlock(&l->lock);
+ return true;
+ }
+ }
+
+ read_unlock(&l->lock);
+
+ return false;
+}
+EXPORT_SYMBOL(bt_sock_linked);
+
+void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh)
+{
+ const struct cred *old_cred;
+ struct pid *old_pid;
+
BT_DBG("parent %p, sk %p", parent, sk);
sock_hold(sk);
- lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+
+ if (bh)
+ bh_lock_sock_nested(sk);
+ else
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+
list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q);
bt_sk(sk)->parent = parent;
- release_sock(sk);
- parent->sk_ack_backlog++;
+
+ /* Copy credentials from parent since for incoming connections the
+ * socket is allocated by the kernel.
+ */
+ spin_lock(&sk->sk_peer_lock);
+ old_pid = sk->sk_peer_pid;
+ old_cred = sk->sk_peer_cred;
+ sk->sk_peer_pid = get_pid(parent->sk_peer_pid);
+ sk->sk_peer_cred = get_cred(parent->sk_peer_cred);
+ spin_unlock(&sk->sk_peer_lock);
+
+ put_pid(old_pid);
+ put_cred(old_cred);
+
+ if (bh)
+ bh_unlock_sock(sk);
+ else
+ release_sock(sk);
+
+ sk_acceptq_added(parent);
}
EXPORT_SYMBOL(bt_accept_enqueue);
@@ -175,7 +257,7 @@ void bt_accept_unlink(struct sock *sk)
BT_DBG("sk %p state %d", sk, sk->sk_state);
list_del_init(&bt_sk(sk)->accept_q);
- bt_sk(sk)->parent->sk_ack_backlog--;
+ sk_acceptq_removed(bt_sk(sk)->parent);
bt_sk(sk)->parent = NULL;
sock_put(sk);
}
@@ -241,7 +323,6 @@ EXPORT_SYMBOL(bt_accept_dequeue);
int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
int flags)
{
- int noblock = flags & MSG_DONTWAIT;
struct sock *sk = sock->sk;
struct sk_buff *skb;
size_t copied;
@@ -253,10 +334,10 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (flags & MSG_OOB)
return -EOPNOTSUPP;
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb) {
if (sk->sk_shutdown & RCV_SHUTDOWN)
- return 0;
+ err = 0;
return err;
}
@@ -271,11 +352,25 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
skb_reset_transport_header(skb);
err = skb_copy_datagram_msg(skb, 0, msg, copied);
if (err == 0) {
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
if (msg->msg_name && bt_sk(sk)->skb_msg_name)
bt_sk(sk)->skb_msg_name(skb, msg->msg_name,
&msg->msg_namelen);
+
+ if (test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags)) {
+ u8 pkt_status = hci_skb_pkt_status(skb);
+
+ put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_STATUS,
+ sizeof(pkt_status), &pkt_status);
+ }
+
+ if (test_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags)) {
+ u16 pkt_seqnum = hci_skb_pkt_seqnum(skb);
+
+ put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_SEQNUM,
+ sizeof(pkt_seqnum), &pkt_seqnum);
+ }
}
skb_free_datagram(sk, skb);
@@ -372,7 +467,7 @@ int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg,
copied += chunk;
size -= chunk;
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
if (!(flags & MSG_PEEK)) {
int skb_len = skb_headlen(skb);
@@ -438,19 +533,17 @@ static inline __poll_t bt_accept_poll(struct sock *parent)
}
__poll_t bt_sock_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+ poll_table *wait)
{
struct sock *sk = sock->sk;
__poll_t mask = 0;
- BT_DBG("sock %p, sk %p", sock, sk);
-
poll_wait(file, sk_sleep(sk), wait);
if (sk->sk_state == BT_LISTEN)
return bt_accept_poll(sk);
- if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
mask |= EPOLLERR |
(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
@@ -460,15 +553,15 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock,
if (sk->sk_shutdown == SHUTDOWN_MASK)
mask |= EPOLLHUP;
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
mask |= EPOLLIN | EPOLLRDNORM;
if (sk->sk_state == BT_CLOSED)
mask |= EPOLLHUP;
if (sk->sk_state == BT_CONNECT ||
- sk->sk_state == BT_CONNECT2 ||
- sk->sk_state == BT_CONFIG)
+ sk->sk_state == BT_CONNECT2 ||
+ sk->sk_state == BT_CONFIG)
return mask;
if (!test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags) && sock_writeable(sk))
@@ -480,6 +573,86 @@ __poll_t bt_sock_poll(struct file *file, struct socket *sock,
}
EXPORT_SYMBOL(bt_sock_poll);
+static int bt_ethtool_get_ts_info(struct sock *sk, unsigned int index,
+ void __user *useraddr)
+{
+ struct ethtool_ts_info info;
+ struct kernel_ethtool_ts_info ts_info = {};
+ int ret;
+
+ ret = hci_ethtool_ts_info(index, sk->sk_protocol, &ts_info);
+ if (ret == -ENODEV)
+ return ret;
+ else if (ret < 0)
+ return -EIO;
+
+ memset(&info, 0, sizeof(info));
+
+ info.cmd = ETHTOOL_GET_TS_INFO;
+ info.so_timestamping = ts_info.so_timestamping;
+ info.phc_index = ts_info.phc_index;
+ info.tx_types = ts_info.tx_types;
+ info.rx_filters = ts_info.rx_filters;
+
+ if (copy_to_user(useraddr, &info, sizeof(info)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int bt_ethtool(struct sock *sk, const struct ifreq *ifr,
+ void __user *useraddr)
+{
+ unsigned int index;
+ u32 ethcmd;
+ int n;
+
+ if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
+ return -EFAULT;
+
+ if (sscanf(ifr->ifr_name, "hci%u%n", &index, &n) != 1 ||
+ n != strlen(ifr->ifr_name))
+ return -ENODEV;
+
+ switch (ethcmd) {
+ case ETHTOOL_GET_TS_INFO:
+ return bt_ethtool_get_ts_info(sk, index, useraddr);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int bt_dev_ioctl(struct socket *sock, unsigned int cmd, void __user *arg)
+{
+ struct sock *sk = sock->sk;
+ struct ifreq ifr = {};
+ void __user *data;
+ char *colon;
+ int ret = -ENOIOCTLCMD;
+
+ if (get_user_ifreq(&ifr, &data, arg))
+ return -EFAULT;
+
+ ifr.ifr_name[IFNAMSIZ - 1] = 0;
+ colon = strchr(ifr.ifr_name, ':');
+ if (colon)
+ *colon = 0;
+
+ switch (cmd) {
+ case SIOCETHTOOL:
+ ret = bt_ethtool(sk, &ifr, data);
+ break;
+ }
+
+ if (colon)
+ *colon = ':';
+
+ if (put_user_ifreq(&ifr, arg))
+ return -EFAULT;
+
+ return ret;
+}
+
int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
struct sock *sk = sock->sk;
@@ -497,26 +670,23 @@ int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
if (amount < 0)
amount = 0;
- err = put_user(amount, (int __user *) arg);
+ err = put_user(amount, (int __user *)arg);
break;
case TIOCINQ:
if (sk->sk_state == BT_LISTEN)
return -EINVAL;
- lock_sock(sk);
+ spin_lock(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
amount = skb ? skb->len : 0;
- release_sock(sk);
- err = put_user(amount, (int __user *) arg);
- break;
+ spin_unlock(&sk->sk_receive_queue.lock);
- case SIOCGSTAMP:
- err = sock_get_timestamp(sk, (struct timeval __user *) arg);
+ err = put_user(amount, (int __user *)arg);
break;
- case SIOCGSTAMPNS:
- err = sock_get_timestampns(sk, (struct timespec __user *) arg);
+ case SIOCETHTOOL:
+ err = bt_dev_ioctl(sock, cmd, (void __user *)arg);
break;
default:
@@ -565,7 +735,7 @@ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo)
EXPORT_SYMBOL(bt_sock_wait_state);
/* This function expects the sk lock to be held when called */
-int bt_sock_wait_ready(struct sock *sk, unsigned long flags)
+int bt_sock_wait_ready(struct sock *sk, unsigned int msg_flags)
{
DECLARE_WAITQUEUE(wait, current);
unsigned long timeo;
@@ -573,7 +743,7 @@ int bt_sock_wait_ready(struct sock *sk, unsigned long flags)
BT_DBG("sk %p", sk);
- timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+ timeo = sock_sndtimeo(sk, !!(msg_flags & MSG_DONTWAIT));
add_wait_queue(sk_sleep(sk), &wait);
set_current_state(TASK_INTERRUPTIBLE);
@@ -608,7 +778,7 @@ EXPORT_SYMBOL(bt_sock_wait_ready);
static void *bt_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(seq->private->l->lock)
{
- struct bt_sock_list *l = PDE_DATA(file_inode(seq->file));
+ struct bt_sock_list *l = pde_data(file_inode(seq->file));
read_lock(&l->lock);
return seq_hlist_start_head(&l->head, *pos);
@@ -616,7 +786,7 @@ static void *bt_seq_start(struct seq_file *seq, loff_t *pos)
static void *bt_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct bt_sock_list *l = PDE_DATA(file_inode(seq->file));
+ struct bt_sock_list *l = pde_data(file_inode(seq->file));
return seq_hlist_next(v, &l->head, pos);
}
@@ -624,17 +794,17 @@ static void *bt_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static void bt_seq_stop(struct seq_file *seq, void *v)
__releases(seq->private->l->lock)
{
- struct bt_sock_list *l = PDE_DATA(file_inode(seq->file));
+ struct bt_sock_list *l = pde_data(file_inode(seq->file));
read_unlock(&l->lock);
}
static int bt_seq_show(struct seq_file *seq, void *v)
{
- struct bt_sock_list *l = PDE_DATA(file_inode(seq->file));
+ struct bt_sock_list *l = pde_data(file_inode(seq->file));
if (v == SEQ_START_TOKEN) {
- seq_puts(seq ,"sk RefCnt Rmem Wmem User Inode Parent");
+ seq_puts(seq, "sk RefCnt Rmem Wmem User Inode Parent");
if (l->custom_seq_show) {
seq_putc(seq, ' ');
@@ -652,9 +822,9 @@ static int bt_seq_show(struct seq_file *seq, void *v)
refcount_read(&sk->sk_refcnt),
sk_rmem_alloc_get(sk),
sk_wmem_alloc_get(sk),
- from_kuid(seq_user_ns(seq), sock_i_uid(sk)),
+ from_kuid(seq_user_ns(seq), sk_uid(sk)),
sock_i_ino(sk),
- bt->parent? sock_i_ino(bt->parent): 0LU);
+ bt->parent ? sock_i_ino(bt->parent) : 0LU);
if (l->custom_seq_show) {
seq_putc(seq, ' ');
@@ -675,7 +845,7 @@ static const struct seq_operations bt_seq_ops = {
int bt_procfs_init(struct net *net, const char *name,
struct bt_sock_list *sk_list,
- int (* seq_show)(struct seq_file *, void *))
+ int (*seq_show)(struct seq_file *, void *))
{
sk_list->custom_seq_show = seq_show;
@@ -691,7 +861,7 @@ void bt_procfs_cleanup(struct net *net, const char *name)
#else
int bt_procfs_init(struct net *net, const char *name,
struct bt_sock_list *sk_list,
- int (* seq_show)(struct seq_file *, void *))
+ int (*seq_show)(struct seq_file *, void *))
{
return 0;
}
@@ -733,7 +903,7 @@ static int __init bt_init(void)
err = bt_sysfs_init();
if (err < 0)
- return err;
+ goto cleanup_led;
err = sock_register(&bt_sock_family_ops);
if (err)
@@ -769,11 +939,16 @@ unregister_socket:
sock_unregister(PF_BLUETOOTH);
cleanup_sysfs:
bt_sysfs_cleanup();
+cleanup_led:
+ bt_leds_cleanup();
+ debugfs_remove_recursive(bt_debugfs);
return err;
}
static void __exit bt_exit(void)
{
+ iso_exit();
+
mgmt_exit();
sco_exit();
diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c
deleted file mode 100644
index 78bec8df8525..000000000000
--- a/net/bluetooth/amp.c
+++ /dev/null
@@ -1,596 +0,0 @@
-/*
- Copyright (c) 2011,2012 Intel Corp.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 and
- only version 2 as published by the Free Software Foundation.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-*/
-
-#include <net/bluetooth/bluetooth.h>
-#include <net/bluetooth/hci.h>
-#include <net/bluetooth/hci_core.h>
-#include <crypto/hash.h>
-
-#include "hci_request.h"
-#include "a2mp.h"
-#include "amp.h"
-
-/* Remote AMP Controllers interface */
-void amp_ctrl_get(struct amp_ctrl *ctrl)
-{
- BT_DBG("ctrl %p orig refcnt %d", ctrl,
- kref_read(&ctrl->kref));
-
- kref_get(&ctrl->kref);
-}
-
-static void amp_ctrl_destroy(struct kref *kref)
-{
- struct amp_ctrl *ctrl = container_of(kref, struct amp_ctrl, kref);
-
- BT_DBG("ctrl %p", ctrl);
-
- kfree(ctrl->assoc);
- kfree(ctrl);
-}
-
-int amp_ctrl_put(struct amp_ctrl *ctrl)
-{
- BT_DBG("ctrl %p orig refcnt %d", ctrl,
- kref_read(&ctrl->kref));
-
- return kref_put(&ctrl->kref, &amp_ctrl_destroy);
-}
-
-struct amp_ctrl *amp_ctrl_add(struct amp_mgr *mgr, u8 id)
-{
- struct amp_ctrl *ctrl;
-
- ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
- if (!ctrl)
- return NULL;
-
- kref_init(&ctrl->kref);
- ctrl->id = id;
-
- mutex_lock(&mgr->amp_ctrls_lock);
- list_add(&ctrl->list, &mgr->amp_ctrls);
- mutex_unlock(&mgr->amp_ctrls_lock);
-
- BT_DBG("mgr %p ctrl %p", mgr, ctrl);
-
- return ctrl;
-}
-
-void amp_ctrl_list_flush(struct amp_mgr *mgr)
-{
- struct amp_ctrl *ctrl, *n;
-
- BT_DBG("mgr %p", mgr);
-
- mutex_lock(&mgr->amp_ctrls_lock);
- list_for_each_entry_safe(ctrl, n, &mgr->amp_ctrls, list) {
- list_del(&ctrl->list);
- amp_ctrl_put(ctrl);
- }
- mutex_unlock(&mgr->amp_ctrls_lock);
-}
-
-struct amp_ctrl *amp_ctrl_lookup(struct amp_mgr *mgr, u8 id)
-{
- struct amp_ctrl *ctrl;
-
- BT_DBG("mgr %p id %d", mgr, id);
-
- mutex_lock(&mgr->amp_ctrls_lock);
- list_for_each_entry(ctrl, &mgr->amp_ctrls, list) {
- if (ctrl->id == id) {
- amp_ctrl_get(ctrl);
- mutex_unlock(&mgr->amp_ctrls_lock);
- return ctrl;
- }
- }
- mutex_unlock(&mgr->amp_ctrls_lock);
-
- return NULL;
-}
-
-/* Physical Link interface */
-static u8 __next_handle(struct amp_mgr *mgr)
-{
- if (++mgr->handle == 0)
- mgr->handle = 1;
-
- return mgr->handle;
-}
-
-struct hci_conn *phylink_add(struct hci_dev *hdev, struct amp_mgr *mgr,
- u8 remote_id, bool out)
-{
- bdaddr_t *dst = &mgr->l2cap_conn->hcon->dst;
- struct hci_conn *hcon;
- u8 role = out ? HCI_ROLE_MASTER : HCI_ROLE_SLAVE;
-
- hcon = hci_conn_add(hdev, AMP_LINK, dst, role);
- if (!hcon)
- return NULL;
-
- BT_DBG("hcon %p dst %pMR", hcon, dst);
-
- hcon->state = BT_CONNECT;
- hcon->attempt++;
- hcon->handle = __next_handle(mgr);
- hcon->remote_id = remote_id;
- hcon->amp_mgr = amp_mgr_get(mgr);
-
- return hcon;
-}
-
-/* AMP crypto key generation interface */
-static int hmac_sha256(u8 *key, u8 ksize, char *plaintext, u8 psize, u8 *output)
-{
- struct crypto_shash *tfm;
- struct shash_desc *shash;
- int ret;
-
- if (!ksize)
- return -EINVAL;
-
- tfm = crypto_alloc_shash("hmac(sha256)", 0, 0);
- if (IS_ERR(tfm)) {
- BT_DBG("crypto_alloc_ahash failed: err %ld", PTR_ERR(tfm));
- return PTR_ERR(tfm);
- }
-
- ret = crypto_shash_setkey(tfm, key, ksize);
- if (ret) {
- BT_DBG("crypto_ahash_setkey failed: err %d", ret);
- goto failed;
- }
-
- shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(tfm),
- GFP_KERNEL);
- if (!shash) {
- ret = -ENOMEM;
- goto failed;
- }
-
- shash->tfm = tfm;
- shash->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-
- ret = crypto_shash_digest(shash, plaintext, psize, output);
-
- kfree(shash);
-
-failed:
- crypto_free_shash(tfm);
- return ret;
-}
-
-int phylink_gen_key(struct hci_conn *conn, u8 *data, u8 *len, u8 *type)
-{
- struct hci_dev *hdev = conn->hdev;
- struct link_key *key;
- u8 keybuf[HCI_AMP_LINK_KEY_SIZE];
- u8 gamp_key[HCI_AMP_LINK_KEY_SIZE];
- int err;
-
- if (!hci_conn_check_link_mode(conn))
- return -EACCES;
-
- BT_DBG("conn %p key_type %d", conn, conn->key_type);
-
- /* Legacy key */
- if (conn->key_type < 3) {
- bt_dev_err(hdev, "legacy key type %d", conn->key_type);
- return -EACCES;
- }
-
- *type = conn->key_type;
- *len = HCI_AMP_LINK_KEY_SIZE;
-
- key = hci_find_link_key(hdev, &conn->dst);
- if (!key) {
- BT_DBG("No Link key for conn %p dst %pMR", conn, &conn->dst);
- return -EACCES;
- }
-
- /* BR/EDR Link Key concatenated together with itself */
- memcpy(&keybuf[0], key->val, HCI_LINK_KEY_SIZE);
- memcpy(&keybuf[HCI_LINK_KEY_SIZE], key->val, HCI_LINK_KEY_SIZE);
-
- /* Derive Generic AMP Link Key (gamp) */
- err = hmac_sha256(keybuf, HCI_AMP_LINK_KEY_SIZE, "gamp", 4, gamp_key);
- if (err) {
- bt_dev_err(hdev, "could not derive Generic AMP Key: err %d", err);
- return err;
- }
-
- if (conn->key_type == HCI_LK_DEBUG_COMBINATION) {
- BT_DBG("Use Generic AMP Key (gamp)");
- memcpy(data, gamp_key, HCI_AMP_LINK_KEY_SIZE);
- return err;
- }
-
- /* Derive Dedicated AMP Link Key: "802b" is 802.11 PAL keyID */
- return hmac_sha256(gamp_key, HCI_AMP_LINK_KEY_SIZE, "802b", 4, data);
-}
-
-static void read_local_amp_assoc_complete(struct hci_dev *hdev, u8 status,
- u16 opcode, struct sk_buff *skb)
-{
- struct hci_rp_read_local_amp_assoc *rp = (void *)skb->data;
- struct amp_assoc *assoc = &hdev->loc_assoc;
- size_t rem_len, frag_len;
-
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
-
- if (rp->status)
- goto send_rsp;
-
- frag_len = skb->len - sizeof(*rp);
- rem_len = __le16_to_cpu(rp->rem_len);
-
- if (rem_len > frag_len) {
- BT_DBG("frag_len %zu rem_len %zu", frag_len, rem_len);
-
- memcpy(assoc->data + assoc->offset, rp->frag, frag_len);
- assoc->offset += frag_len;
-
- /* Read other fragments */
- amp_read_loc_assoc_frag(hdev, rp->phy_handle);
-
- return;
- }
-
- memcpy(assoc->data + assoc->offset, rp->frag, rem_len);
- assoc->len = assoc->offset + rem_len;
- assoc->offset = 0;
-
-send_rsp:
- /* Send A2MP Rsp when all fragments are received */
- a2mp_send_getampassoc_rsp(hdev, rp->status);
- a2mp_send_create_phy_link_req(hdev, rp->status);
-}
-
-void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle)
-{
- struct hci_cp_read_local_amp_assoc cp;
- struct amp_assoc *loc_assoc = &hdev->loc_assoc;
- struct hci_request req;
- int err;
-
- BT_DBG("%s handle %d", hdev->name, phy_handle);
-
- cp.phy_handle = phy_handle;
- cp.max_len = cpu_to_le16(hdev->amp_assoc_size);
- cp.len_so_far = cpu_to_le16(loc_assoc->offset);
-
- hci_req_init(&req, hdev);
- hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
- err = hci_req_run_skb(&req, read_local_amp_assoc_complete);
- if (err < 0)
- a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID);
-}
-
-void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr)
-{
- struct hci_cp_read_local_amp_assoc cp;
- struct hci_request req;
- int err;
-
- memset(&hdev->loc_assoc, 0, sizeof(struct amp_assoc));
- memset(&cp, 0, sizeof(cp));
-
- cp.max_len = cpu_to_le16(hdev->amp_assoc_size);
-
- set_bit(READ_LOC_AMP_ASSOC, &mgr->state);
- hci_req_init(&req, hdev);
- hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
- err = hci_req_run_skb(&req, read_local_amp_assoc_complete);
- if (err < 0)
- a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID);
-}
-
-void amp_read_loc_assoc_final_data(struct hci_dev *hdev,
- struct hci_conn *hcon)
-{
- struct hci_cp_read_local_amp_assoc cp;
- struct amp_mgr *mgr = hcon->amp_mgr;
- struct hci_request req;
- int err;
-
- cp.phy_handle = hcon->handle;
- cp.len_so_far = cpu_to_le16(0);
- cp.max_len = cpu_to_le16(hdev->amp_assoc_size);
-
- set_bit(READ_LOC_AMP_ASSOC_FINAL, &mgr->state);
-
- /* Read Local AMP Assoc final link information data */
- hci_req_init(&req, hdev);
- hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp);
- err = hci_req_run_skb(&req, read_local_amp_assoc_complete);
- if (err < 0)
- a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID);
-}
-
-static void write_remote_amp_assoc_complete(struct hci_dev *hdev, u8 status,
- u16 opcode, struct sk_buff *skb)
-{
- struct hci_rp_write_remote_amp_assoc *rp = (void *)skb->data;
-
- BT_DBG("%s status 0x%2.2x phy_handle 0x%2.2x",
- hdev->name, rp->status, rp->phy_handle);
-
- if (rp->status)
- return;
-
- amp_write_rem_assoc_continue(hdev, rp->phy_handle);
-}
-
-/* Write AMP Assoc data fragments, returns true with last fragment written*/
-static bool amp_write_rem_assoc_frag(struct hci_dev *hdev,
- struct hci_conn *hcon)
-{
- struct hci_cp_write_remote_amp_assoc *cp;
- struct amp_mgr *mgr = hcon->amp_mgr;
- struct amp_ctrl *ctrl;
- struct hci_request req;
- u16 frag_len, len;
-
- ctrl = amp_ctrl_lookup(mgr, hcon->remote_id);
- if (!ctrl)
- return false;
-
- if (!ctrl->assoc_rem_len) {
- BT_DBG("all fragments are written");
- ctrl->assoc_rem_len = ctrl->assoc_len;
- ctrl->assoc_len_so_far = 0;
-
- amp_ctrl_put(ctrl);
- return true;
- }
-
- frag_len = min_t(u16, 248, ctrl->assoc_rem_len);
- len = frag_len + sizeof(*cp);
-
- cp = kzalloc(len, GFP_KERNEL);
- if (!cp) {
- amp_ctrl_put(ctrl);
- return false;
- }
-
- BT_DBG("hcon %p ctrl %p frag_len %u assoc_len %u rem_len %u",
- hcon, ctrl, frag_len, ctrl->assoc_len, ctrl->assoc_rem_len);
-
- cp->phy_handle = hcon->handle;
- cp->len_so_far = cpu_to_le16(ctrl->assoc_len_so_far);
- cp->rem_len = cpu_to_le16(ctrl->assoc_rem_len);
- memcpy(cp->frag, ctrl->assoc, frag_len);
-
- ctrl->assoc_len_so_far += frag_len;
- ctrl->assoc_rem_len -= frag_len;
-
- amp_ctrl_put(ctrl);
-
- hci_req_init(&req, hdev);
- hci_req_add(&req, HCI_OP_WRITE_REMOTE_AMP_ASSOC, len, cp);
- hci_req_run_skb(&req, write_remote_amp_assoc_complete);
-
- kfree(cp);
-
- return false;
-}
-
-void amp_write_rem_assoc_continue(struct hci_dev *hdev, u8 handle)
-{
- struct hci_conn *hcon;
-
- BT_DBG("%s phy handle 0x%2.2x", hdev->name, handle);
-
- hcon = hci_conn_hash_lookup_handle(hdev, handle);
- if (!hcon)
- return;
-
- /* Send A2MP create phylink rsp when all fragments are written */
- if (amp_write_rem_assoc_frag(hdev, hcon))
- a2mp_send_create_phy_link_rsp(hdev, 0);
-}
-
-void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle)
-{
- struct hci_conn *hcon;
-
- BT_DBG("%s phy handle 0x%2.2x", hdev->name, handle);
-
- hcon = hci_conn_hash_lookup_handle(hdev, handle);
- if (!hcon)
- return;
-
- BT_DBG("%s phy handle 0x%2.2x hcon %p", hdev->name, handle, hcon);
-
- amp_write_rem_assoc_frag(hdev, hcon);
-}
-
-static void create_phylink_complete(struct hci_dev *hdev, u8 status,
- u16 opcode)
-{
- struct hci_cp_create_phy_link *cp;
-
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
-
- cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_PHY_LINK);
- if (!cp)
- return;
-
- hci_dev_lock(hdev);
-
- if (status) {
- struct hci_conn *hcon;
-
- hcon = hci_conn_hash_lookup_handle(hdev, cp->phy_handle);
- if (hcon)
- hci_conn_del(hcon);
- } else {
- amp_write_remote_assoc(hdev, cp->phy_handle);
- }
-
- hci_dev_unlock(hdev);
-}
-
-void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
- struct hci_conn *hcon)
-{
- struct hci_cp_create_phy_link cp;
- struct hci_request req;
-
- cp.phy_handle = hcon->handle;
-
- BT_DBG("%s hcon %p phy handle 0x%2.2x", hdev->name, hcon,
- hcon->handle);
-
- if (phylink_gen_key(mgr->l2cap_conn->hcon, cp.key, &cp.key_len,
- &cp.key_type)) {
- BT_DBG("Cannot create link key");
- return;
- }
-
- hci_req_init(&req, hdev);
- hci_req_add(&req, HCI_OP_CREATE_PHY_LINK, sizeof(cp), &cp);
- hci_req_run(&req, create_phylink_complete);
-}
-
-static void accept_phylink_complete(struct hci_dev *hdev, u8 status,
- u16 opcode)
-{
- struct hci_cp_accept_phy_link *cp;
-
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
-
- if (status)
- return;
-
- cp = hci_sent_cmd_data(hdev, HCI_OP_ACCEPT_PHY_LINK);
- if (!cp)
- return;
-
- amp_write_remote_assoc(hdev, cp->phy_handle);
-}
-
-void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
- struct hci_conn *hcon)
-{
- struct hci_cp_accept_phy_link cp;
- struct hci_request req;
-
- cp.phy_handle = hcon->handle;
-
- BT_DBG("%s hcon %p phy handle 0x%2.2x", hdev->name, hcon,
- hcon->handle);
-
- if (phylink_gen_key(mgr->l2cap_conn->hcon, cp.key, &cp.key_len,
- &cp.key_type)) {
- BT_DBG("Cannot create link key");
- return;
- }
-
- hci_req_init(&req, hdev);
- hci_req_add(&req, HCI_OP_ACCEPT_PHY_LINK, sizeof(cp), &cp);
- hci_req_run(&req, accept_phylink_complete);
-}
-
-void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon)
-{
- struct hci_dev *bredr_hdev = hci_dev_hold(bredr_hcon->hdev);
- struct amp_mgr *mgr = hs_hcon->amp_mgr;
- struct l2cap_chan *bredr_chan;
-
- BT_DBG("bredr_hcon %p hs_hcon %p mgr %p", bredr_hcon, hs_hcon, mgr);
-
- if (!bredr_hdev || !mgr || !mgr->bredr_chan)
- return;
-
- bredr_chan = mgr->bredr_chan;
-
- l2cap_chan_lock(bredr_chan);
-
- set_bit(FLAG_EFS_ENABLE, &bredr_chan->flags);
- bredr_chan->remote_amp_id = hs_hcon->remote_id;
- bredr_chan->local_amp_id = hs_hcon->hdev->id;
- bredr_chan->hs_hcon = hs_hcon;
- bredr_chan->conn->mtu = hs_hcon->hdev->block_mtu;
-
- __l2cap_physical_cfm(bredr_chan, 0);
-
- l2cap_chan_unlock(bredr_chan);
-
- hci_dev_put(bredr_hdev);
-}
-
-void amp_create_logical_link(struct l2cap_chan *chan)
-{
- struct hci_conn *hs_hcon = chan->hs_hcon;
- struct hci_cp_create_accept_logical_link cp;
- struct hci_dev *hdev;
-
- BT_DBG("chan %p hs_hcon %p dst %pMR", chan, hs_hcon,
- &chan->conn->hcon->dst);
-
- if (!hs_hcon)
- return;
-
- hdev = hci_dev_hold(chan->hs_hcon->hdev);
- if (!hdev)
- return;
-
- cp.phy_handle = hs_hcon->handle;
-
- cp.tx_flow_spec.id = chan->local_id;
- cp.tx_flow_spec.stype = chan->local_stype;
- cp.tx_flow_spec.msdu = cpu_to_le16(chan->local_msdu);
- cp.tx_flow_spec.sdu_itime = cpu_to_le32(chan->local_sdu_itime);
- cp.tx_flow_spec.acc_lat = cpu_to_le32(chan->local_acc_lat);
- cp.tx_flow_spec.flush_to = cpu_to_le32(chan->local_flush_to);
-
- cp.rx_flow_spec.id = chan->remote_id;
- cp.rx_flow_spec.stype = chan->remote_stype;
- cp.rx_flow_spec.msdu = cpu_to_le16(chan->remote_msdu);
- cp.rx_flow_spec.sdu_itime = cpu_to_le32(chan->remote_sdu_itime);
- cp.rx_flow_spec.acc_lat = cpu_to_le32(chan->remote_acc_lat);
- cp.rx_flow_spec.flush_to = cpu_to_le32(chan->remote_flush_to);
-
- if (hs_hcon->out)
- hci_send_cmd(hdev, HCI_OP_CREATE_LOGICAL_LINK, sizeof(cp),
- &cp);
- else
- hci_send_cmd(hdev, HCI_OP_ACCEPT_LOGICAL_LINK, sizeof(cp),
- &cp);
-
- hci_dev_put(hdev);
-}
-
-void amp_disconnect_logical_link(struct hci_chan *hchan)
-{
- struct hci_conn *hcon = hchan->conn;
- struct hci_cp_disconn_logical_link cp;
-
- if (hcon->state != BT_CONNECTED) {
- BT_DBG("hchan %p not connected", hchan);
- return;
- }
-
- cp.log_handle = cpu_to_le16(hchan->handle);
- hci_send_cmd(hcon->hdev, HCI_OP_DISCONN_LOGICAL_LINK, sizeof(cp), &cp);
-}
-
-void amp_destroy_logical_link(struct hci_chan *hchan, u8 reason)
-{
- BT_DBG("hchan %p", hchan);
-
- hci_chan_del(hchan);
-}
diff --git a/net/bluetooth/amp.h b/net/bluetooth/amp.h
deleted file mode 100644
index 8848f8158ae4..000000000000
--- a/net/bluetooth/amp.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- Copyright (c) 2011,2012 Intel Corp.
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 and
- only version 2 as published by the Free Software Foundation.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-*/
-
-#ifndef __AMP_H
-#define __AMP_H
-
-struct amp_ctrl {
- struct list_head list;
- struct kref kref;
- __u8 id;
- __u16 assoc_len_so_far;
- __u16 assoc_rem_len;
- __u16 assoc_len;
- __u8 *assoc;
-};
-
-int amp_ctrl_put(struct amp_ctrl *ctrl);
-void amp_ctrl_get(struct amp_ctrl *ctrl);
-struct amp_ctrl *amp_ctrl_add(struct amp_mgr *mgr, u8 id);
-struct amp_ctrl *amp_ctrl_lookup(struct amp_mgr *mgr, u8 id);
-void amp_ctrl_list_flush(struct amp_mgr *mgr);
-
-struct hci_conn *phylink_add(struct hci_dev *hdev, struct amp_mgr *mgr,
- u8 remote_id, bool out);
-
-int phylink_gen_key(struct hci_conn *hcon, u8 *data, u8 *len, u8 *type);
-
-void amp_read_loc_info(struct hci_dev *hdev, struct amp_mgr *mgr);
-void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle);
-void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr);
-void amp_read_loc_assoc_final_data(struct hci_dev *hdev,
- struct hci_conn *hcon);
-void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
- struct hci_conn *hcon);
-void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr,
- struct hci_conn *hcon);
-
-#if IS_ENABLED(CONFIG_BT_HS)
-void amp_create_logical_link(struct l2cap_chan *chan);
-void amp_disconnect_logical_link(struct hci_chan *hchan);
-#else
-static inline void amp_create_logical_link(struct l2cap_chan *chan)
-{
-}
-
-static inline void amp_disconnect_logical_link(struct hci_chan *hchan)
-{
-}
-#endif
-
-void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle);
-void amp_write_rem_assoc_continue(struct hci_dev *hdev, u8 handle);
-void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon);
-void amp_create_logical_link(struct l2cap_chan *chan);
-void amp_disconnect_logical_link(struct hci_chan *hchan);
-void amp_destroy_logical_link(struct hci_chan *hchan, u8 reason);
-
-#endif /* __AMP_H */
diff --git a/net/bluetooth/aosp.c b/net/bluetooth/aosp.c
new file mode 100644
index 000000000000..59025771af53
--- /dev/null
+++ b/net/bluetooth/aosp.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 Intel Corporation
+ */
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#include "aosp.h"
+
+/* Command complete parameters of LE_Get_Vendor_Capabilities_Command
+ * The parameters grow over time. The base version that declares the
+ * version_supported field is v0.95. Refer to
+ * https://cs.android.com/android/platform/superproject/+/master:system/
+ * bt/gd/hci/controller.cc;l=452?q=le_get_vendor_capabilities_handler
+ */
+struct aosp_rp_le_get_vendor_capa {
+	/* v0.95: 15 octets */
+	__u8	status;
+	__u8	max_advt_instances;
+	__u8	offloaded_resolution_of_private_address;
+	__le16	total_scan_results_storage;
+	__u8	max_irk_list_sz;
+	__u8	filtering_support;
+	__u8	max_filter;
+	__u8	activity_energy_info_support;
+	__le16	version_supported;
+	__le16	total_num_of_advt_tracked;
+	__u8	extended_scan_support;
+	__u8	debug_logging_supported;
+	/* v0.96: 16 octets */
+	__u8	le_address_generation_offloading_support;
+	/* v0.98: 21 octets */
+	__le32	a2dp_source_offload_capability_mask;
+	__u8	bluetooth_quality_report_support;
+	/* v1.00: 25 octets */
+	__le32	dynamic_audio_buffer_support;
+} __packed;
+
+#define VENDOR_CAPA_BASE_SIZE		15
+#define VENDOR_CAPA_0_98_SIZE		21
+
+/* Send the LE Get Vendor Capabilities command and, if the reply is long
+ * enough and reports bluetooth_quality_report_support, set
+ * hdev->aosp_quality_report. Does nothing unless hdev->aosp_capable is
+ * set. Failures are only logged; nothing is returned to the caller.
+ */
+void aosp_do_open(struct hci_dev *hdev)
+{
+	struct sk_buff *skb;
+	struct aosp_rp_le_get_vendor_capa *rp;
+	u16 version_supported;
+
+	if (!hdev->aosp_capable)
+		return;
+
+	bt_dev_dbg(hdev, "Initialize AOSP extension");
+
+	/* LE Get Vendor Capabilities Command */
+	skb = __hci_cmd_sync(hdev, hci_opcode_pack(0x3f, 0x153), 0, NULL,
+			     HCI_CMD_TIMEOUT);
+	if (IS_ERR_OR_NULL(skb)) {
+		if (!skb)
+			skb = ERR_PTR(-EIO);
+
+		bt_dev_err(hdev, "AOSP get vendor capabilities (%ld)",
+			   PTR_ERR(skb));
+		return;
+	}
+
+	/* A basic length check */
+	if (skb->len < VENDOR_CAPA_BASE_SIZE)
+		goto length_error;
+
+	rp = (struct aosp_rp_le_get_vendor_capa *)skb->data;
+
+	version_supported = le16_to_cpu(rp->version_supported);
+	/* AOSP displays the version number like v0.98, v1.00, etc. */
+	bt_dev_info(hdev, "AOSP extensions version v%u.%02u",
+		    version_supported >> 8, version_supported & 0xff);
+
+	/* Do not support very old versions. */
+	if (version_supported < 95) {
+		bt_dev_warn(hdev, "AOSP capabilities version %u too old",
+			    version_supported);
+		goto done;
+	}
+
+	if (version_supported < 98) {
+		bt_dev_warn(hdev, "AOSP quality report is not supported");
+		goto done;
+	}
+
+	if (skb->len < VENDOR_CAPA_0_98_SIZE)
+		goto length_error;
+
+	/* The bluetooth_quality_report_support is defined at version
+	 * v0.98. Refer to
+	 * https://cs.android.com/android/platform/superproject/+/
+	 * master:system/bt/gd/hci/controller.cc;l=477
+	 */
+	if (rp->bluetooth_quality_report_support) {
+		hdev->aosp_quality_report = true;
+		bt_dev_info(hdev, "AOSP quality report is supported");
+	}
+
+	goto done;
+
+length_error:
+	bt_dev_err(hdev, "AOSP capabilities length %u too short", skb->len);
+
+done:
+	kfree_skb(skb);
+}
+
+/* Counterpart of aosp_do_open(); currently it only emits a debug message. */
+void aosp_do_close(struct hci_dev *hdev)
+{
+	if (!hdev->aosp_capable)
+		return;
+
+	bt_dev_dbg(hdev, "Cleanup of AOSP extension");
+}
+
+/* BQR command */
+#define BQR_OPCODE			hci_opcode_pack(0x3f, 0x015e)
+
+/* BQR report action */
+#define REPORT_ACTION_ADD		0x00
+#define REPORT_ACTION_DELETE		0x01
+#define REPORT_ACTION_CLEAR		0x02
+
+/* BQR event masks */
+#define QUALITY_MONITORING		BIT(0)
+#define APPROACHING_LSTO		BIT(1)
+#define A2DP_AUDIO_CHOPPY		BIT(2)
+#define SCO_VOICE_CHOPPY		BIT(3)
+
+#define DEFAULT_BQR_EVENT_MASK	(QUALITY_MONITORING | APPROACHING_LSTO | \
+				 A2DP_AUDIO_CHOPPY | SCO_VOICE_CHOPPY)
+
+/* Minimum report interval in milliseconds, chosen so as not to stress the
+ * controller too much. Range: 0 ~ 65535 ms
+ */
+#define DEFAULT_REPORT_INTERVAL_MS	5000
+
+/* Parameters of the BQR command. Multi-octet fields are declared
+ * little-endian, the byte order HCI uses for command parameters.
+ */
+struct aosp_bqr_cp {
+	__u8	report_action;
+	__le32	event_mask;
+	__le16	min_report_interval;
+} __packed;
+
+/* Send the BQR command with REPORT_ACTION_ADD, the default event mask and
+ * the default minimum report interval. Returns 0 or a negative errno.
+ */
+static int enable_quality_report(struct hci_dev *hdev)
+{
+	struct sk_buff *skb;
+	struct aosp_bqr_cp cp;
+
+	cp.report_action = REPORT_ACTION_ADD;
+	/* Convert explicitly so big-endian hosts send the right bytes. */
+	cp.event_mask = cpu_to_le32(DEFAULT_BQR_EVENT_MASK);
+	cp.min_report_interval = cpu_to_le16(DEFAULT_REPORT_INTERVAL_MS);
+
+	skb = __hci_cmd_sync(hdev, BQR_OPCODE, sizeof(cp), &cp,
+			     HCI_CMD_TIMEOUT);
+	if (IS_ERR_OR_NULL(skb)) {
+		if (!skb)
+			skb = ERR_PTR(-EIO);
+
+		bt_dev_err(hdev, "Enabling Android BQR failed (%ld)",
+			   PTR_ERR(skb));
+		return PTR_ERR(skb);
+	}
+
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Send the BQR command with REPORT_ACTION_CLEAR and a zeroed event mask
+ * and interval. Returns 0 or a negative errno.
+ */
+static int disable_quality_report(struct hci_dev *hdev)
+{
+	struct sk_buff *skb;
+	struct aosp_bqr_cp cp = { 0 };
+
+	cp.report_action = REPORT_ACTION_CLEAR;
+
+	skb = __hci_cmd_sync(hdev, BQR_OPCODE, sizeof(cp), &cp,
+			     HCI_CMD_TIMEOUT);
+	if (IS_ERR_OR_NULL(skb)) {
+		if (!skb)
+			skb = ERR_PTR(-EIO);
+
+		bt_dev_err(hdev, "Disabling Android BQR failed (%ld)",
+			   PTR_ERR(skb));
+		return PTR_ERR(skb);
+	}
+
+	kfree_skb(skb);
+	return 0;
+}
+
+/* Report whether quality report support was recorded for this device. */
+bool aosp_has_quality_report(struct hci_dev *hdev)
+{
+	return hdev->aosp_quality_report;
+}
+
+/* Enable or disable quality reports. Returns -EOPNOTSUPP when the device
+ * has no quality report support, otherwise the result of the BQR command.
+ */
+int aosp_set_quality_report(struct hci_dev *hdev, bool enable)
+{
+	if (!aosp_has_quality_report(hdev))
+		return -EOPNOTSUPP;
+
+	bt_dev_dbg(hdev, "quality report enable %d", enable);
+
+	/* Enable or disable the quality report feature. */
+	if (enable)
+		return enable_quality_report(hdev);
+	else
+		return disable_quality_report(hdev);
+}
diff --git a/net/bluetooth/aosp.h b/net/bluetooth/aosp.h
new file mode 100644
index 000000000000..2fd8886d51b2
--- /dev/null
+++ b/net/bluetooth/aosp.h
@@ -0,0 +1,37 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2021 Intel Corporation
+ */
+
+#ifndef __AOSP_H
+#define __AOSP_H
+
+struct hci_dev;
+
+#if IS_ENABLED(CONFIG_BT_AOSPEXT)
+
+void aosp_do_open(struct hci_dev *hdev);
+void aosp_do_close(struct hci_dev *hdev);
+
+bool aosp_has_quality_report(struct hci_dev *hdev);
+int aosp_set_quality_report(struct hci_dev *hdev, bool enable);
+
+#else
+
+/* CONFIG_BT_AOSPEXT is disabled: provide stubs that do nothing. */
+static inline void aosp_do_open(struct hci_dev *hdev) {}
+static inline void aosp_do_close(struct hci_dev *hdev) {}
+
+static inline bool aosp_has_quality_report(struct hci_dev *hdev)
+{
+	return false;
+}
+
+static inline int aosp_set_quality_report(struct hci_dev *hdev, bool enable)
+{
+	return -EOPNOTSUPP;
+}
+
+#endif /* CONFIG_BT_AOSPEXT */
+
+#endif /* __AOSP_H */
diff --git a/net/bluetooth/bnep/Kconfig b/net/bluetooth/bnep/Kconfig
index 9b70317c49dc..aac02b5b0d17 100644
--- a/net/bluetooth/bnep/Kconfig
+++ b/net/bluetooth/bnep/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config BT_BNEP
tristate "BNEP protocol support"
depends on BT_BREDR
diff --git a/net/bluetooth/bnep/Makefile b/net/bluetooth/bnep/Makefile
index c7821e76ca56..8af9d56bb012 100644
--- a/net/bluetooth/bnep/Makefile
+++ b/net/bluetooth/bnep/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the Linux Bluetooth BNEP layer.
#
diff --git a/net/bluetooth/bnep/bnep.h b/net/bluetooth/bnep/bnep.h
index 40854c99bc1e..9680473ed7ef 100644
--- a/net/bluetooth/bnep/bnep.h
+++ b/net/bluetooth/bnep/bnep.h
@@ -1,18 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
BNEP protocol definition for Linux Bluetooth stack (BlueZ).
Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License, version 2, as
- published by the Free Software Foundation.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _BNEP_H
@@ -84,14 +74,14 @@ struct bnep_setup_conn_req {
__u8 type;
__u8 ctrl;
__u8 uuid_size;
- __u8 service[0];
+ __u8 service[];
} __packed;
struct bnep_set_filter_req {
__u8 type;
__u8 ctrl;
__be16 len;
- __u8 list[0];
+ __u8 list[];
} __packed;
struct bnep_control_rsp {
@@ -103,7 +93,7 @@ struct bnep_control_rsp {
struct bnep_ext_hdr {
__u8 type;
__u8 len;
- __u8 data[0];
+ __u8 data[];
} __packed;
/* BNEP ioctl defines */
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
index 7b3965861013..d44987d4515c 100644
--- a/net/bluetooth/bnep/core.c
+++ b/net/bluetooth/bnep/core.c
@@ -29,7 +29,7 @@
#include <linux/kthread.h>
#include <linux/file.h>
#include <linux/etherdevice.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/l2cap.h>
@@ -126,8 +126,8 @@ static int bnep_ctrl_set_netfilter(struct bnep_session *s, __be16 *data, int len
f[i].start = get_unaligned_be16(data++);
f[i].end = get_unaligned_be16(data++);
- BT_DBG("proto filter start %d end %d",
- f[i].start, f[i].end);
+ BT_DBG("proto filter start %u end %u",
+ f[i].start, f[i].end);
}
if (i < BNEP_MAX_PROTO_FILTERS)
@@ -266,7 +266,7 @@ static int bnep_rx_extension(struct bnep_session *s, struct sk_buff *skb)
break;
}
- BT_DBG("type 0x%x len %d", h->type, h->len);
+ BT_DBG("type 0x%x len %u", h->type, h->len);
switch (h->type & BNEP_TYPE_MASK) {
case BNEP_EXT_CONTROL:
@@ -385,7 +385,8 @@ static int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb)
case BNEP_COMPRESSED_DST_ONLY:
__skb_put_data(nskb, skb_mac_header(skb), ETH_ALEN);
- __skb_put_data(nskb, s->eh.h_source, ETH_ALEN + 2);
+ __skb_put_data(nskb, s->eh.h_source, ETH_ALEN);
+ put_unaligned(s->eh.h_proto, (__be16 *)__skb_put(nskb, 2));
break;
case BNEP_GENERAL:
@@ -400,7 +401,7 @@ static int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb)
dev->stats.rx_packets++;
nskb->ip_summed = CHECKSUM_NONE;
nskb->protocol = eth_type_trans(nskb, dev);
- netif_rx_ni(nskb);
+ netif_rx(nskb);
return 0;
badframe:
@@ -424,7 +425,7 @@ static int bnep_tx_frame(struct bnep_session *s, struct sk_buff *skb)
int len = 0, il = 0;
u8 type = 0;
- BT_DBG("skb %p dev %p type %d", skb, skb->dev, skb->pkt_type);
+ BT_DBG("skb %p dev %p type %u", skb, skb->dev, skb->pkt_type);
if (!skb->dev) {
/* Control frame sent by us */
@@ -489,9 +490,6 @@ static int bnep_session(void *arg)
add_wait_queue(sk_sleep(sk), &wait);
while (1) {
- /* Ensure session->terminate is updated */
- smp_mb__before_atomic();
-
if (atomic_read(&s->terminate))
break;
/* RX */
@@ -512,6 +510,10 @@ static int bnep_session(void *arg)
break;
netif_wake_queue(dev);
+ /*
+ * wait_woken() performs the necessary memory barriers
+ * for us; see the header comment for this primitive.
+ */
wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
remove_wait_queue(sk_sleep(sk), &wait);
@@ -534,7 +536,7 @@ static int bnep_session(void *arg)
up_write(&bnep_session_sem);
free_netdev(dev);
- module_put_and_exit(0);
+ module_put_and_kthread_exit(0);
return 0;
}
@@ -548,7 +550,7 @@ static struct device *bnep_get_device(struct bnep_session *session)
return &conn->hcon->dev;
}
-static struct device_type bnep_type = {
+static const struct device_type bnep_type = {
.name = "bluetooth",
};
@@ -593,7 +595,7 @@ int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock)
* ie. eh.h_dest is our local address. */
memcpy(s->eh.h_dest, &src, ETH_ALEN);
memcpy(s->eh.h_source, &dst, ETH_ALEN);
- memcpy(dev->dev_addr, s->eh.h_dest, ETH_ALEN);
+ eth_hw_addr_set(dev, s->eh.h_dest);
s->dev = dev;
s->sock = sock;
@@ -743,8 +745,7 @@ static int __init bnep_init(void)
if (flt[0])
BT_INFO("BNEP filters: %s", flt);
- bnep_sock_init();
- return 0;
+ return bnep_sock_init();
}
static void __exit bnep_exit(void)
diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c
index 1d4d7d415730..cc1cff63194f 100644
--- a/net/bluetooth/bnep/netdev.c
+++ b/net/bluetooth/bnep/netdev.c
@@ -112,7 +112,7 @@ static int bnep_net_set_mac_addr(struct net_device *dev, void *arg)
return 0;
}
-static void bnep_net_timeout(struct net_device *dev)
+static void bnep_net_timeout(struct net_device *dev, unsigned int txqueue)
{
BT_DBG("net_timeout");
netif_wake_queue(dev);
diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c
index 00deacdcb51c..00d47bcf4d7d 100644
--- a/net/bluetooth/bnep/sock.c
+++ b/net/bluetooth/bnep/sock.c
@@ -24,6 +24,7 @@
SOFTWARE IS DISCLAIMED.
*/
+#include <linux/compat.h>
#include <linux/export.h>
#include <linux/file.h>
@@ -49,18 +50,17 @@ static int bnep_sock_release(struct socket *sock)
return 0;
}
-static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+static int do_bnep_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp)
{
struct bnep_connlist_req cl;
struct bnep_connadd_req ca;
struct bnep_conndel_req cd;
struct bnep_conninfo ci;
struct socket *nsock;
- void __user *argp = (void __user *)arg;
__u32 supp_feat = BIT(BNEP_SETUP_RESPONSE);
int err;
- BT_DBG("cmd %x arg %lx", cmd, arg);
+ BT_DBG("cmd %x arg %p", cmd, argp);
switch (cmd) {
case BNEPCONNADD:
@@ -134,16 +134,22 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
return 0;
}
+static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return do_bnep_sock_ioctl(sock, cmd, (void __user *)arg);
+}
+
#ifdef CONFIG_COMPAT
static int bnep_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
+ void __user *argp = compat_ptr(arg);
if (cmd == BNEPGETCONNLIST) {
struct bnep_connlist_req cl;
+ unsigned __user *p = argp;
u32 uci;
int err;
- if (get_user(cl.cnum, (u32 __user *) arg) ||
- get_user(uci, (u32 __user *) (arg + 4)))
+ if (get_user(cl.cnum, p) || get_user(uci, p + 1))
return -EFAULT;
cl.ci = compat_ptr(uci);
@@ -153,13 +159,13 @@ static int bnep_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigne
err = bnep_get_connlist(&cl);
- if (!err && put_user(cl.cnum, (u32 __user *) arg))
+ if (!err && put_user(cl.cnum, p))
err = -EFAULT;
return err;
}
- return bnep_sock_ioctl(sock, cmd, arg);
+ return do_bnep_sock_ioctl(sock, cmd, argp);
}
#endif
@@ -177,8 +183,6 @@ static const struct proto_ops bnep_sock_ops = {
.recvmsg = sock_no_recvmsg,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
- .setsockopt = sock_no_setsockopt,
- .getsockopt = sock_no_getsockopt,
.connect = sock_no_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
@@ -201,21 +205,13 @@ static int bnep_sock_create(struct net *net, struct socket *sock, int protocol,
if (sock->type != SOCK_RAW)
return -ESOCKTNOSUPPORT;
- sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &bnep_proto, kern);
+ sk = bt_sock_alloc(net, sock, &bnep_proto, protocol, GFP_ATOMIC, kern);
if (!sk)
return -ENOMEM;
- sock_init_data(sock, sk);
-
sock->ops = &bnep_sock_ops;
-
sock->state = SS_UNCONNECTED;
- sock_reset_flag(sk, SOCK_ZAPPED);
-
- sk->sk_protocol = protocol;
- sk->sk_state = BT_OPEN;
-
bt_sock_link(&bnep_sk_list, sk);
return 0;
}
diff --git a/net/bluetooth/cmtp/Kconfig b/net/bluetooth/cmtp/Kconfig
index 939da0fbdd88..34e923466236 100644
--- a/net/bluetooth/cmtp/Kconfig
+++ b/net/bluetooth/cmtp/Kconfig
@@ -1,6 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
config BT_CMTP
- tristate "CMTP protocol support"
- depends on BT_BREDR && ISDN_CAPI
+ tristate "CMTP protocol support (DEPRECATED)"
+ depends on BT_BREDR && ISDN_CAPI && DEPRECATED
help
CMTP (CAPI Message Transport Protocol) is a transport layer
for CAPI messages. CMTP is required for the Bluetooth Common
diff --git a/net/bluetooth/cmtp/Makefile b/net/bluetooth/cmtp/Makefile
index 890a9a5a6861..b2262ca97499 100644
--- a/net/bluetooth/cmtp/Makefile
+++ b/net/bluetooth/cmtp/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the Linux Bluetooth CMTP layer
#
diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c
index eb41556002e3..884703fda979 100644
--- a/net/bluetooth/cmtp/capi.c
+++ b/net/bluetooth/cmtp/capi.c
@@ -74,7 +74,7 @@ static struct cmtp_application *cmtp_application_add(struct cmtp_session *sessio
{
struct cmtp_application *app = kzalloc(sizeof(*app), GFP_KERNEL);
- BT_DBG("session %p application %p appl %d", session, app, appl);
+ BT_DBG("session %p application %p appl %u", session, app, appl);
if (!app)
return NULL;
@@ -135,7 +135,7 @@ static void cmtp_send_capimsg(struct cmtp_session *session, struct sk_buff *skb)
{
struct cmtp_scb *scb = (void *) skb->cb;
- BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+ BT_DBG("session %p skb %p len %u", session, skb, skb->len);
scb->id = -1;
scb->data = (CAPIMSG_COMMAND(skb->data) == CAPI_DATA_B3);
@@ -152,7 +152,7 @@ static void cmtp_send_interopmsg(struct cmtp_session *session,
struct sk_buff *skb;
unsigned char *s;
- BT_DBG("session %p subcmd 0x%02x appl %d msgnum %d", session, subcmd, appl, msgnum);
+ BT_DBG("session %p subcmd 0x%02x appl %u msgnum %u", session, subcmd, appl, msgnum);
skb = alloc_skb(CAPI_MSG_BASELEN + 6 + len, GFP_ATOMIC);
if (!skb) {
@@ -188,7 +188,7 @@ static void cmtp_recv_interopmsg(struct cmtp_session *session, struct sk_buff *s
__u16 appl, msgnum, func, info;
__u32 controller;
- BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+ BT_DBG("session %p skb %p len %u", session, skb, skb->len);
switch (CAPIMSG_SUBCOMMAND(skb->data)) {
case CAPI_CONF:
@@ -248,18 +248,10 @@ static void cmtp_recv_interopmsg(struct cmtp_session *session, struct sk_buff *s
break;
case CAPI_FUNCTION_GET_MANUFACTURER:
- if (skb->len < CAPI_MSG_BASELEN + 15)
- break;
-
- if (!info && ctrl) {
- int len = min_t(uint, CAPI_MANUFACTURER_LEN,
- skb->data[CAPI_MSG_BASELEN + 14]);
-
- memset(ctrl->manu, 0, CAPI_MANUFACTURER_LEN);
- strncpy(ctrl->manu,
- skb->data + CAPI_MSG_BASELEN + 15, len);
- }
-
+ if (!info && ctrl && skb->len > CAPI_MSG_BASELEN + 14)
+ strscpy_pad(ctrl->manu,
+ skb->data + CAPI_MSG_BASELEN + 15,
+ skb->data[CAPI_MSG_BASELEN + 14]);
break;
case CAPI_FUNCTION_GET_VERSION:
@@ -276,18 +268,10 @@ static void cmtp_recv_interopmsg(struct cmtp_session *session, struct sk_buff *s
break;
case CAPI_FUNCTION_GET_SERIAL_NUMBER:
- if (skb->len < CAPI_MSG_BASELEN + 17)
- break;
-
- if (!info && ctrl) {
- int len = min_t(uint, CAPI_SERIAL_LEN,
- skb->data[CAPI_MSG_BASELEN + 16]);
-
- memset(ctrl->serial, 0, CAPI_SERIAL_LEN);
- strncpy(ctrl->serial,
- skb->data + CAPI_MSG_BASELEN + 17, len);
- }
-
+ if (!info && ctrl && skb->len > CAPI_MSG_BASELEN + 16)
+ strscpy_pad(ctrl->serial,
+ skb->data + CAPI_MSG_BASELEN + 17,
+ skb->data[CAPI_MSG_BASELEN + 16]);
break;
}
@@ -321,7 +305,7 @@ void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb)
__u16 appl;
__u32 contr;
- BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+ BT_DBG("session %p skb %p len %u", session, skb, skb->len);
if (skb->len < CAPI_MSG_BASELEN)
return;
@@ -344,7 +328,7 @@ void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb)
appl = application->appl;
CAPIMSG_SETAPPID(skb->data, appl);
} else {
- BT_ERR("Can't find application with id %d", appl);
+ BT_ERR("Can't find application with id %u", appl);
kfree_skb(skb);
return;
}
@@ -385,8 +369,8 @@ static void cmtp_register_appl(struct capi_ctr *ctrl, __u16 appl, capi_register_
unsigned char buf[8];
int err = 0, nconn, want = rp->level3cnt;
- BT_DBG("ctrl %p appl %d level3cnt %d datablkcnt %d datablklen %d",
- ctrl, appl, rp->level3cnt, rp->datablkcnt, rp->datablklen);
+ BT_DBG("ctrl %p appl %u level3cnt %u datablkcnt %u datablklen %u",
+ ctrl, appl, rp->level3cnt, rp->datablkcnt, rp->datablklen);
application = cmtp_application_add(session, appl);
if (!application) {
@@ -450,7 +434,7 @@ static void cmtp_release_appl(struct capi_ctr *ctrl, __u16 appl)
struct cmtp_session *session = ctrl->driverdata;
struct cmtp_application *application;
- BT_DBG("ctrl %p appl %d", ctrl, appl);
+ BT_DBG("ctrl %p appl %u", ctrl, appl);
application = cmtp_application_get(session, CMTP_APPLID, appl);
if (!application) {
@@ -483,7 +467,7 @@ static u16 cmtp_send_message(struct capi_ctr *ctrl, struct sk_buff *skb)
application = cmtp_application_get(session, CMTP_APPLID, appl);
if ((!application) || (application->state != BT_CONNECTED)) {
- BT_ERR("Can't find application with id %d", appl);
+ BT_ERR("Can't find application with id %u", appl);
return CAPI_ILLAPPNR;
}
@@ -515,7 +499,7 @@ static int cmtp_proc_show(struct seq_file *m, void *v)
seq_printf(m, "ctrl %d\n", session->num);
list_for_each_entry(app, &session->applications, list) {
- seq_printf(m, "appl %d -> %d\n", app->appl, app->mapping);
+ seq_printf(m, "appl %u -> %u\n", app->appl, app->mapping);
}
return 0;
diff --git a/net/bluetooth/cmtp/cmtp.h b/net/bluetooth/cmtp/cmtp.h
index c32638dddbf9..f6b9dc4e408f 100644
--- a/net/bluetooth/cmtp/cmtp.h
+++ b/net/bluetooth/cmtp/cmtp.h
@@ -26,7 +26,7 @@
#include <linux/types.h>
#include <net/bluetooth/bluetooth.h>
-#define BTNAMSIZ 18
+#define BTNAMSIZ 21
/* CMTP ioctl defines */
#define CMTPCONNADD _IOW('C', 200, int)
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 7f26a5a19ff6..90d130588a3e 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -288,9 +288,6 @@ static int cmtp_session(void *arg)
add_wait_queue(sk_sleep(sk), &wait);
while (1) {
- /* Ensure session->terminate is updated */
- smp_mb__before_atomic();
-
if (atomic_read(&session->terminate))
break;
if (sk->sk_state != BT_CONNECTED)
@@ -306,6 +303,10 @@ static int cmtp_session(void *arg)
cmtp_process_transmit(session);
+ /*
+ * wait_woken() performs the necessary memory barriers
+ * for us; see the header comment for this primitive.
+ */
wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
remove_wait_queue(sk_sleep(sk), &wait);
@@ -322,7 +323,7 @@ static int cmtp_session(void *arg)
up_write(&cmtp_session_sem);
kfree(session);
- module_put_and_exit(0);
+ module_put_and_kthread_exit(0);
return 0;
}
@@ -391,6 +392,11 @@ int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock)
if (!(session->flags & BIT(CMTP_LOOPBACK))) {
err = cmtp_attach_device(session);
if (err < 0) {
+ /* Caller will call fput in case of failure, and so
+ * will cmtp_session kthread.
+ */
+ get_file(session->sock->file);
+
atomic_inc(&session->terminate);
wake_up_interruptible(sk_sleep(session->sock->sk));
up_write(&cmtp_session_sem);
@@ -431,9 +437,10 @@ int cmtp_del_connection(struct cmtp_conndel_req *req)
/* Stop session thread */
atomic_inc(&session->terminate);
- /* Ensure session->terminate is updated */
- smp_mb__after_atomic();
-
+ /*
+ * See the comment preceding the call to wait_woken()
+ * in cmtp_session().
+ */
wake_up_interruptible(sk_sleep(session->sock->sk));
} else
err = -ENOENT;
@@ -494,9 +501,7 @@ static int __init cmtp_init(void)
{
BT_INFO("CMTP (CAPI Emulation) ver %s", VERSION);
- cmtp_init_sockets();
-
- return 0;
+ return cmtp_init_sockets();
}
static void __exit cmtp_exit(void)
diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c
index e08f28fadd65..96d49d9fae96 100644
--- a/net/bluetooth/cmtp/sock.c
+++ b/net/bluetooth/cmtp/sock.c
@@ -63,17 +63,16 @@ static int cmtp_sock_release(struct socket *sock)
return 0;
}
-static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+static int do_cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp)
{
struct cmtp_connadd_req ca;
struct cmtp_conndel_req cd;
struct cmtp_connlist_req cl;
struct cmtp_conninfo ci;
struct socket *nsock;
- void __user *argp = (void __user *)arg;
int err;
- BT_DBG("cmd %x arg %lx", cmd, arg);
+ BT_DBG("cmd %x arg %p", cmd, argp);
switch (cmd) {
case CMTPCONNADD:
@@ -137,16 +136,22 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
return -EINVAL;
}
+static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return do_cmtp_sock_ioctl(sock, cmd, (void __user *)arg);
+}
+
#ifdef CONFIG_COMPAT
static int cmtp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
+ void __user *argp = compat_ptr(arg);
if (cmd == CMTPGETCONNLIST) {
struct cmtp_connlist_req cl;
+ u32 __user *p = argp;
u32 uci;
int err;
- if (get_user(cl.cnum, (u32 __user *) arg) ||
- get_user(uci, (u32 __user *) (arg + 4)))
+ if (get_user(cl.cnum, p) || get_user(uci, p + 1))
return -EFAULT;
cl.ci = compat_ptr(uci);
@@ -156,13 +161,13 @@ static int cmtp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigne
err = cmtp_get_connlist(&cl);
- if (!err && put_user(cl.cnum, (u32 __user *) arg))
+ if (!err && put_user(cl.cnum, p))
err = -EFAULT;
return err;
}
- return cmtp_sock_ioctl(sock, cmd, arg);
+ return do_cmtp_sock_ioctl(sock, cmd, argp);
}
#endif
@@ -180,8 +185,6 @@ static const struct proto_ops cmtp_sock_ops = {
.recvmsg = sock_no_recvmsg,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
- .setsockopt = sock_no_setsockopt,
- .getsockopt = sock_no_getsockopt,
.connect = sock_no_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
diff --git a/net/bluetooth/coredump.c b/net/bluetooth/coredump.c
new file mode 100644
index 000000000000..720cb79adf96
--- /dev/null
+++ b/net/bluetooth/coredump.c
@@ -0,0 +1,553 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2023 Google Corporation
+ */
+
+#include <linux/devcoredump.h>
+
+#include <linux/unaligned.h>
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+enum hci_devcoredump_pkt_type {
+ HCI_DEVCOREDUMP_PKT_INIT,
+ HCI_DEVCOREDUMP_PKT_SKB,
+ HCI_DEVCOREDUMP_PKT_PATTERN,
+ HCI_DEVCOREDUMP_PKT_COMPLETE,
+ HCI_DEVCOREDUMP_PKT_ABORT,
+};
+
+struct hci_devcoredump_skb_cb {
+ u16 pkt_type;
+};
+
+struct hci_devcoredump_skb_pattern {
+ u8 pattern;
+ u32 len;
+} __packed;
+
+#define hci_dmp_cb(skb) ((struct hci_devcoredump_skb_cb *)((skb)->cb))
+
+#define DBG_UNEXPECTED_STATE() \
+ bt_dev_dbg(hdev, \
+ "Unexpected packet (%d) for state (%d). ", \
+ hci_dmp_cb(skb)->pkt_type, hdev->dump.state)
+
+#define MAX_DEVCOREDUMP_HDR_SIZE 512 /* bytes */
+
+static int hci_devcd_update_hdr_state(char *buf, size_t size, int state)
+{
+ int len = 0;
+
+ if (!buf)
+ return 0;
+
+ len = scnprintf(buf, size, "Bluetooth devcoredump\nState: %d\n", state);
+
+ return len + 1; /* scnprintf adds \0 at the end upon state rewrite */
+}
+
+/* Call with hci_dev_lock only. */
+static int hci_devcd_update_state(struct hci_dev *hdev, int state)
+{
+ bt_dev_dbg(hdev, "Updating devcoredump state from %d to %d.",
+ hdev->dump.state, state);
+
+ hdev->dump.state = state;
+
+ return hci_devcd_update_hdr_state(hdev->dump.head,
+ hdev->dump.alloc_size, state);
+}
+
+static int hci_devcd_mkheader(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ char dump_start[] = "--- Start dump ---\n";
+ char hdr[80];
+ int hdr_len;
+
+ hdr_len = hci_devcd_update_hdr_state(hdr, sizeof(hdr),
+ HCI_DEVCOREDUMP_IDLE);
+ skb_put_data(skb, hdr, hdr_len);
+
+ if (hdev->dump.dmp_hdr)
+ hdev->dump.dmp_hdr(hdev, skb);
+
+ skb_put_data(skb, dump_start, strlen(dump_start));
+
+ return skb->len;
+}
+
+/* Do not call with hci_dev_lock since this calls driver code. */
+static void hci_devcd_notify(struct hci_dev *hdev, int state)
+{
+ if (hdev->dump.notify_change)
+ hdev->dump.notify_change(hdev, state);
+}
+
+/* Call with hci_dev_lock only. */
+void hci_devcd_reset(struct hci_dev *hdev)
+{
+ hdev->dump.head = NULL;
+ hdev->dump.tail = NULL;
+ hdev->dump.alloc_size = 0;
+
+ hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_IDLE);
+
+ cancel_delayed_work(&hdev->dump.dump_timeout);
+ skb_queue_purge(&hdev->dump.dump_q);
+}
+
+/* Call with hci_dev_lock only. */
+static void hci_devcd_free(struct hci_dev *hdev)
+{
+ vfree(hdev->dump.head);
+
+ hci_devcd_reset(hdev);
+}
+
+/* Call with hci_dev_lock only. */
+static int hci_devcd_alloc(struct hci_dev *hdev, u32 size)
+{
+ hdev->dump.head = vmalloc(size);
+ if (!hdev->dump.head)
+ return -ENOMEM;
+
+ hdev->dump.alloc_size = size;
+ hdev->dump.tail = hdev->dump.head;
+ hdev->dump.end = hdev->dump.head + size;
+
+ hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_IDLE);
+
+ return 0;
+}
+
+/* Append @size bytes of @buf if they fit. Call with hci_dev_lock only. */
+static bool hci_devcd_copy(struct hci_dev *hdev, char *buf, u32 size)
+{
+	/* Compare with the room left so tail + size cannot wrap. */
+	if (size > (size_t)(hdev->dump.end - hdev->dump.tail))
+		return false;
+
+	memcpy(hdev->dump.tail, buf, size);
+	hdev->dump.tail += size;
+	return true;
+}
+
+/* Append @len bytes of @pattern if they fit. Call with hci_dev_lock only. */
+static bool hci_devcd_memset(struct hci_dev *hdev, u8 pattern, u32 len)
+{
+	/* Compare with the room left so tail + len cannot wrap. */
+	if (len > (size_t)(hdev->dump.end - hdev->dump.tail))
+		return false;
+
+	memset(hdev->dump.tail, pattern, len);
+	hdev->dump.tail += len;
+	return true;
+}
+
+/* Call with hci_dev_lock only. */
+static int hci_devcd_prepare(struct hci_dev *hdev, u32 dump_size)
+{
+ struct sk_buff *skb;
+ int dump_hdr_size;
+ int err = 0;
+
+ skb = alloc_skb(MAX_DEVCOREDUMP_HDR_SIZE, GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+
+ dump_hdr_size = hci_devcd_mkheader(hdev, skb);
+
+ if (hci_devcd_alloc(hdev, dump_hdr_size + dump_size)) {
+ err = -ENOMEM;
+ goto hdr_free;
+ }
+
+ /* Insert the device header */
+ if (!hci_devcd_copy(hdev, skb->data, skb->len)) {
+ bt_dev_err(hdev, "Failed to insert header");
+ hci_devcd_free(hdev);
+
+ err = -ENOMEM;
+ goto hdr_free;
+ }
+
+hdr_free:
+ kfree_skb(skb);
+
+ return err;
+}
+
+static void hci_devcd_handle_pkt_init(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ u32 dump_size;
+
+ if (hdev->dump.state != HCI_DEVCOREDUMP_IDLE) {
+ DBG_UNEXPECTED_STATE();
+ return;
+ }
+
+ if (skb->len != sizeof(dump_size)) {
+ bt_dev_dbg(hdev, "Invalid dump init pkt");
+ return;
+ }
+
+ dump_size = get_unaligned_le32(skb_pull_data(skb, 4));
+ if (!dump_size) {
+ bt_dev_err(hdev, "Zero size dump init pkt");
+ return;
+ }
+
+ if (hci_devcd_prepare(hdev, dump_size)) {
+ bt_dev_err(hdev, "Failed to prepare for dump");
+ return;
+ }
+
+ hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_ACTIVE);
+ queue_delayed_work(hdev->workqueue, &hdev->dump.dump_timeout,
+ hdev->dump.timeout);
+}
+
+static void hci_devcd_handle_pkt_skb(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ if (hdev->dump.state != HCI_DEVCOREDUMP_ACTIVE) {
+ DBG_UNEXPECTED_STATE();
+ return;
+ }
+
+ if (!hci_devcd_copy(hdev, skb->data, skb->len))
+ bt_dev_dbg(hdev, "Failed to insert skb");
+}
+
+static void hci_devcd_handle_pkt_pattern(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_devcoredump_skb_pattern *pattern;
+
+ if (hdev->dump.state != HCI_DEVCOREDUMP_ACTIVE) {
+ DBG_UNEXPECTED_STATE();
+ return;
+ }
+
+ if (skb->len != sizeof(*pattern)) {
+ bt_dev_dbg(hdev, "Invalid pattern skb");
+ return;
+ }
+
+ pattern = skb_pull_data(skb, sizeof(*pattern));
+
+ if (!hci_devcd_memset(hdev, pattern->pattern, pattern->len))
+ bt_dev_dbg(hdev, "Failed to set pattern");
+}
+
+static void hci_devcd_dump(struct hci_dev *hdev)
+{
+ struct sk_buff *skb;
+ u32 size;
+
+ bt_dev_dbg(hdev, "state %d", hdev->dump.state);
+
+ size = hdev->dump.tail - hdev->dump.head;
+
+ /* Send a copy to monitor as a diagnostic packet */
+ skb = bt_skb_alloc(size, GFP_ATOMIC);
+ if (skb) {
+ skb_put_data(skb, hdev->dump.head, size);
+ hci_recv_diag(hdev, skb);
+ }
+
+ /* Emit a devcoredump with the available data */
+ dev_coredumpv(&hdev->dev, hdev->dump.head, size, GFP_KERNEL);
+}
+
+static void hci_devcd_handle_pkt_complete(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ u32 dump_size;
+
+ if (hdev->dump.state != HCI_DEVCOREDUMP_ACTIVE) {
+ DBG_UNEXPECTED_STATE();
+ return;
+ }
+
+ hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_DONE);
+ dump_size = hdev->dump.tail - hdev->dump.head;
+
+ bt_dev_dbg(hdev, "complete with size %u (expect %zu)", dump_size,
+ hdev->dump.alloc_size);
+
+ hci_devcd_dump(hdev);
+}
+
+static void hci_devcd_handle_pkt_abort(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ u32 dump_size;
+
+ if (hdev->dump.state != HCI_DEVCOREDUMP_ACTIVE) {
+ DBG_UNEXPECTED_STATE();
+ return;
+ }
+
+ hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_ABORT);
+ dump_size = hdev->dump.tail - hdev->dump.head;
+
+ bt_dev_dbg(hdev, "aborted with size %u (expect %zu)", dump_size,
+ hdev->dump.alloc_size);
+
+ hci_devcd_dump(hdev);
+}
+
+/* Bluetooth devcoredump state machine.
+ *
+ * Devcoredump states:
+ *
+ * HCI_DEVCOREDUMP_IDLE: The default state.
+ *
+ * HCI_DEVCOREDUMP_ACTIVE: A devcoredump will be in this state once it has
+ * been initialized using hci_devcd_init(). Once active, the driver
+ * can append data using hci_devcd_append() or insert a pattern
+ * using hci_devcd_append_pattern().
+ *
+ * HCI_DEVCOREDUMP_DONE: Once the dump collection is complete, the driver
+ * can signal the completion using hci_devcd_complete(). A
+ * devcoredump is generated indicating the completion event and
+ * then the state machine is reset to the default state.
+ *
+ * HCI_DEVCOREDUMP_ABORT: The driver can cancel ongoing dump collection in
+ * case of any error using hci_devcd_abort(). A devcoredump is
+ * still generated with the available data indicating the abort
+ * event and then the state machine is reset to the default state.
+ *
+ * HCI_DEVCOREDUMP_TIMEOUT: A timeout timer for DEVCOREDUMP_TIMEOUT sec
+ * is started during devcoredump initialization. Once the timeout
+ * occurs, the driver is notified, a devcoredump is generated with
+ * the available data indicating the timeout event and then the
+ * state machine is reset to the default state.
+ *
+ * The driver must register using hci_devcd_register() before using the hci
+ * devcoredump APIs.
+ */
+void hci_devcd_rx(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev, dump.dump_rx);
+ struct sk_buff *skb;
+ int start_state;
+
+ /* Drain dump_q one packet at a time; each packet is dispatched on the
+ * pkt_type stored in its control block by the hci_devcd_*() producers.
+ */
+ while ((skb = skb_dequeue(&hdev->dump.dump_q))) {
+ /* Return if timeout occurs. The timeout handler function
+ * hci_devcd_timeout() will report the available dump data.
+ * Packets still sitting on dump_q are not touched here.
+ */
+ if (hdev->dump.state == HCI_DEVCOREDUMP_TIMEOUT) {
+ kfree_skb(skb);
+ return;
+ }
+
+ hci_dev_lock(hdev);
+ /* Remember the state so a transition can be detected below */
+ start_state = hdev->dump.state;
+
+ switch (hci_dmp_cb(skb)->pkt_type) {
+ case HCI_DEVCOREDUMP_PKT_INIT:
+ hci_devcd_handle_pkt_init(hdev, skb);
+ break;
+
+ case HCI_DEVCOREDUMP_PKT_SKB:
+ hci_devcd_handle_pkt_skb(hdev, skb);
+ break;
+
+ case HCI_DEVCOREDUMP_PKT_PATTERN:
+ hci_devcd_handle_pkt_pattern(hdev, skb);
+ break;
+
+ case HCI_DEVCOREDUMP_PKT_COMPLETE:
+ hci_devcd_handle_pkt_complete(hdev, skb);
+ break;
+
+ case HCI_DEVCOREDUMP_PKT_ABORT:
+ hci_devcd_handle_pkt_abort(hdev, skb);
+ break;
+
+ default:
+ bt_dev_dbg(hdev, "Unknown packet (%d) for state (%d). ",
+ hci_dmp_cb(skb)->pkt_type, hdev->dump.state);
+ break;
+ }
+
+ hci_dev_unlock(hdev);
+ /* The packet is consumed here whichever handler ran */
+ kfree_skb(skb);
+
+ /* Notify the driver about any state changes before resetting
+ * the state machine
+ */
+ /* The notification is issued with the hdev lock released */
+ if (start_state != hdev->dump.state)
+ hci_devcd_notify(hdev, hdev->dump.state);
+
+ /* Reset the state machine if the devcoredump is complete */
+ hci_dev_lock(hdev);
+ if (hdev->dump.state == HCI_DEVCOREDUMP_DONE ||
+ hdev->dump.state == HCI_DEVCOREDUMP_ABORT)
+ hci_devcd_reset(hdev);
+ hci_dev_unlock(hdev);
+ }
+}
+EXPORT_SYMBOL(hci_devcd_rx);
+
+/* Delayed-work handler for dump.dump_timeout: report a timeout to the
+ * driver, emit whatever data has been collected and reset the state machine.
+ */
+void hci_devcd_timeout(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ dump.dump_timeout.work);
+ u32 dump_size;
+
+ /* The driver is notified before the hdev lock is taken */
+ hci_devcd_notify(hdev, HCI_DEVCOREDUMP_TIMEOUT);
+
+ hci_dev_lock(hdev);
+
+ /* Cancel the packet-processing work item */
+ cancel_work(&hdev->dump.dump_rx);
+
+ hci_devcd_update_state(hdev, HCI_DEVCOREDUMP_TIMEOUT);
+
+ /* Bytes collected so far; alloc_size is logged as the expected total */
+ dump_size = hdev->dump.tail - hdev->dump.head;
+ bt_dev_dbg(hdev, "timeout with size %u (expect %zu)", dump_size,
+ hdev->dump.alloc_size);
+
+ hci_devcd_dump(hdev);
+
+ hci_devcd_reset(hdev);
+
+ hci_dev_unlock(hdev);
+}
+EXPORT_SYMBOL(hci_devcd_timeout);
+
+/* Register a driver's devcoredump callbacks on @hdev.
+ *
+ * @coredump and @dmp_hdr are mandatory; @notify_change is stored as given.
+ * Registration marks the device as supporting devcoredump and sets the
+ * timeout to DEVCOREDUMP_TIMEOUT.
+ *
+ * Returns 0 on success, -EINVAL if a mandatory callback is missing.
+ */
+int hci_devcd_register(struct hci_dev *hdev, coredump_t coredump,
+		       dmp_hdr_t dmp_hdr, notify_change_t notify_change)
+{
+	/* Both callbacks are required: coredump() should trigger a coredump
+	 * event on the controller when the device's coredump sysfs entry is
+	 * written to, and dmp_hdr() should create a dump header identifying
+	 * the controller/fw/driver info.
+	 */
+	if (!(coredump && dmp_hdr))
+		return -EINVAL;
+
+	hci_dev_lock(hdev);
+	hdev->dump.coredump = coredump;
+	hdev->dump.dmp_hdr = dmp_hdr;
+	hdev->dump.notify_change = notify_change;
+	hdev->dump.supported = true;
+	hdev->dump.timeout = DEVCOREDUMP_TIMEOUT;
+	hci_dev_unlock(hdev);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_register);
+
+/* Report whether devcoredump support has been registered for @hdev. */
+static inline bool hci_devcd_enabled(struct hci_dev *hdev)
+{
+	const bool registered = hdev->dump.supported;
+
+	return registered;
+}
+
+/* Queue an INIT packet carrying the expected dump size and kick the rx work.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if devcoredump is not registered for
+ * @hdev, or -ENOMEM if the packet cannot be allocated.
+ */
+int hci_devcd_init(struct hci_dev *hdev, u32 dump_size)
+{
+	struct sk_buff *init_pkt;
+
+	if (!hci_devcd_enabled(hdev))
+		return -EOPNOTSUPP;
+
+	init_pkt = alloc_skb(sizeof(dump_size), GFP_ATOMIC);
+	if (!init_pkt)
+		return -ENOMEM;
+
+	/* The payload is the dump size stored as a little-endian 32-bit value */
+	put_unaligned_le32(dump_size, skb_put(init_pkt, sizeof(dump_size)));
+	hci_dmp_cb(init_pkt)->pkt_type = HCI_DEVCOREDUMP_PKT_INIT;
+
+	skb_queue_tail(&hdev->dump.dump_q, init_pkt);
+	queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_init);
+
+/* Queue @skb as dump data and kick the rx work.
+ *
+ * A non-NULL @skb is always consumed: it is queued on success and freed when
+ * devcoredump is not registered for @hdev.
+ *
+ * Returns 0 on success, -ENOMEM if @skb is NULL, or -EOPNOTSUPP if
+ * devcoredump is not registered.
+ */
+int hci_devcd_append(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	if (!skb)
+		return -ENOMEM;
+
+	if (hci_devcd_enabled(hdev)) {
+		hci_dmp_cb(skb)->pkt_type = HCI_DEVCOREDUMP_PKT_SKB;
+
+		skb_queue_tail(&hdev->dump.dump_q, skb);
+		queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+		return 0;
+	}
+
+	kfree_skb(skb);
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(hci_devcd_append);
+
+/* Queue a PATTERN packet describing @len bytes of @pattern and kick the rx
+ * work.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if devcoredump is not registered for
+ * @hdev, or -ENOMEM if the packet cannot be allocated.
+ */
+int hci_devcd_append_pattern(struct hci_dev *hdev, u8 pattern, u32 len)
+{
+	struct hci_devcoredump_skb_pattern desc;
+	struct sk_buff *pkt;
+
+	if (!hci_devcd_enabled(hdev))
+		return -EOPNOTSUPP;
+
+	pkt = alloc_skb(sizeof(desc), GFP_ATOMIC);
+	if (!pkt)
+		return -ENOMEM;
+
+	/* The payload is the descriptor itself, copied verbatim */
+	desc.pattern = pattern;
+	desc.len = len;
+	skb_put_data(pkt, &desc, sizeof(desc));
+
+	hci_dmp_cb(pkt)->pkt_type = HCI_DEVCOREDUMP_PKT_PATTERN;
+
+	skb_queue_tail(&hdev->dump.dump_q, pkt);
+	queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_append_pattern);
+
+/* Queue an empty COMPLETE packet and kick the rx work.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if devcoredump is not registered for
+ * @hdev, or -ENOMEM if the packet cannot be allocated.
+ */
+int hci_devcd_complete(struct hci_dev *hdev)
+{
+	struct sk_buff *marker;
+
+	if (!hci_devcd_enabled(hdev))
+		return -EOPNOTSUPP;
+
+	/* No payload: only the packet type in the control block matters */
+	marker = alloc_skb(0, GFP_ATOMIC);
+	if (!marker)
+		return -ENOMEM;
+
+	hci_dmp_cb(marker)->pkt_type = HCI_DEVCOREDUMP_PKT_COMPLETE;
+
+	skb_queue_tail(&hdev->dump.dump_q, marker);
+	queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_complete);
+
+/* Queue an empty ABORT packet and kick the rx work.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if devcoredump is not registered for
+ * @hdev, or -ENOMEM if the packet cannot be allocated.
+ */
+int hci_devcd_abort(struct hci_dev *hdev)
+{
+	struct sk_buff *marker;
+
+	if (!hci_devcd_enabled(hdev))
+		return -EOPNOTSUPP;
+
+	/* No payload: only the packet type in the control block matters */
+	marker = alloc_skb(0, GFP_ATOMIC);
+	if (!marker)
+		return -ENOMEM;
+
+	hci_dmp_cb(marker)->pkt_type = HCI_DEVCOREDUMP_PKT_ABORT;
+
+	skb_queue_tail(&hdev->dump.dump_q, marker);
+	queue_work(hdev->workqueue, &hdev->dump.dump_rx);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_devcd_abort);
diff --git a/net/bluetooth/ecdh_helper.c b/net/bluetooth/ecdh_helper.c
index 2155ce802877..0efc93fdae8a 100644
--- a/net/bluetooth/ecdh_helper.c
+++ b/net/bluetooth/ecdh_helper.c
@@ -25,22 +25,6 @@
#include <linux/scatterlist.h>
#include <crypto/ecdh.h>
-struct ecdh_completion {
- struct completion completion;
- int err;
-};
-
-static void ecdh_complete(struct crypto_async_request *req, int err)
-{
- struct ecdh_completion *res = req->data;
-
- if (err == -EINPROGRESS)
- return;
-
- res->err = err;
- complete(&res->completion);
-}
-
static inline void swap_digits(u64 *in, u64 *out, unsigned int ndigits)
{
int i;
@@ -60,9 +44,9 @@ static inline void swap_digits(u64 *in, u64 *out, unsigned int ndigits)
int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64],
u8 secret[32])
{
+ DECLARE_CRYPTO_WAIT(result);
struct kpp_request *req;
u8 *tmp;
- struct ecdh_completion result;
struct scatterlist src, dst;
int err;
@@ -76,8 +60,6 @@ int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64],
goto free_tmp;
}
- init_completion(&result.completion);
-
swap_digits((u64 *)public_key, (u64 *)tmp, 4); /* x */
swap_digits((u64 *)&public_key[32], (u64 *)&tmp[32], 4); /* y */
@@ -86,12 +68,9 @@ int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64],
kpp_request_set_input(req, &src, 64);
kpp_request_set_output(req, &dst, 32);
kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
- ecdh_complete, &result);
+ crypto_req_done, &result);
err = crypto_kpp_compute_shared_secret(req);
- if (err == -EINPROGRESS) {
- wait_for_completion(&result.completion);
- err = result.err;
- }
+ err = crypto_wait_req(err, &result);
if (err < 0) {
pr_err("alg: ecdh: compute shared secret failed. err %d\n",
err);
@@ -104,7 +83,7 @@ int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 public_key[64],
free_all:
kpp_request_free(req);
free_tmp:
- kzfree(tmp);
+ kfree_sensitive(tmp);
return err;
}
@@ -126,8 +105,6 @@ int set_ecdh_privkey(struct crypto_kpp *tfm, const u8 private_key[32])
int err;
struct ecdh p = {0};
- p.curve_id = ECC_CURVE_NIST_P256;
-
if (private_key) {
tmp = kmalloc(32, GFP_KERNEL);
if (!tmp)
@@ -151,9 +128,9 @@ int set_ecdh_privkey(struct crypto_kpp *tfm, const u8 private_key[32])
err = crypto_kpp_set_secret(tfm, buf, buf_len);
/* fall through */
free_all:
- kzfree(buf);
+ kfree_sensitive(buf);
free_tmp:
- kzfree(tmp);
+ kfree_sensitive(tmp);
return err;
}
@@ -167,9 +144,9 @@ free_tmp:
*/
int generate_ecdh_public_key(struct crypto_kpp *tfm, u8 public_key[64])
{
+ DECLARE_CRYPTO_WAIT(result);
struct kpp_request *req;
u8 *tmp;
- struct ecdh_completion result;
struct scatterlist dst;
int err;
@@ -183,18 +160,14 @@ int generate_ecdh_public_key(struct crypto_kpp *tfm, u8 public_key[64])
goto free_tmp;
}
- init_completion(&result.completion);
sg_init_one(&dst, tmp, 64);
kpp_request_set_input(req, NULL, 0);
kpp_request_set_output(req, &dst, 64);
kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
- ecdh_complete, &result);
+ crypto_req_done, &result);
err = crypto_kpp_generate_public_key(req);
- if (err == -EINPROGRESS) {
- wait_for_completion(&result.completion);
- err = result.err;
- }
+ err = crypto_wait_req(err, &result);
if (err < 0)
goto free_all;
diff --git a/net/bluetooth/ecdh_helper.h b/net/bluetooth/ecdh_helper.h
index a6f8d03d4aaf..830723971cf8 100644
--- a/net/bluetooth/ecdh_helper.h
+++ b/net/bluetooth/ecdh_helper.h
@@ -25,6 +25,6 @@
int compute_ecdh_secret(struct crypto_kpp *tfm, const u8 pair_public_key[64],
u8 secret[32]);
-int set_ecdh_privkey(struct crypto_kpp *tfm, const u8 *private_key);
+int set_ecdh_privkey(struct crypto_kpp *tfm, const u8 private_key[32]);
int generate_ecdh_public_key(struct crypto_kpp *tfm, u8 public_key[64]);
int generate_ecdh_keys(struct crypto_kpp *tfm, u8 public_key[64]);
diff --git a/net/bluetooth/eir.c b/net/bluetooth/eir.c
new file mode 100644
index 000000000000..3f72111ba651
--- /dev/null
+++ b/net/bluetooth/eir.c
@@ -0,0 +1,386 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BlueZ - Bluetooth protocol stack for Linux
+ *
+ * Copyright (C) 2021 Intel Corporation
+ */
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "eir.h"
+
+#define PNP_INFO_SVCLASS_ID 0x1200
+
+/* Append a local-name field to the data at @ptr, which already holds
+ * @ad_len bytes, and return the new length.
+ *
+ * Preference order: the complete name when it fits within
+ * HCI_MAX_SHORT_NAME_LENGTH, then the configured short name, then the
+ * complete name truncated to HCI_MAX_SHORT_NAME_LENGTH. @ad_len is returned
+ * unchanged when there is no room or no name is set.
+ */
+u8 eir_append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len)
+{
+	size_t full_len;
+	size_t brief_len;
+
+	/* no space left for name (+ type + len) */
+	if ((max_adv_len(hdev) - ad_len) < HCI_MAX_SHORT_NAME_LENGTH + 2)
+		return ad_len;
+
+	full_len = strnlen(hdev->dev_name, sizeof(hdev->dev_name));
+
+	/* use complete name if present and fits */
+	if (full_len && full_len <= HCI_MAX_SHORT_NAME_LENGTH)
+		return eir_append_data(ptr, ad_len, EIR_NAME_COMPLETE,
+				       hdev->dev_name, full_len);
+
+	/* use short name if present */
+	brief_len = strnlen(hdev->short_name, sizeof(hdev->short_name));
+	if (brief_len)
+		return eir_append_data(ptr, ad_len, EIR_NAME_SHORT,
+				       hdev->short_name, brief_len);
+
+	if (!full_len)
+		return ad_len;
+
+	/* use shortened full name; at this point the name is known to be
+	 * longer than HCI_MAX_SHORT_NAME_LENGTH
+	 */
+	return eir_append_data(ptr, ad_len, EIR_NAME_SHORT, hdev->dev_name,
+			       HCI_MAX_SHORT_NAME_LENGTH);
+}
+
+/* Append hdev->appearance as an EIR_APPEARANCE field after the @ad_len
+ * bytes already at @ptr; returns the new length.
+ */
+u8 eir_append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len)
+{
+	u16 new_len;
+
+	new_len = eir_append_le16(ptr, ad_len, EIR_APPEARANCE,
+				  hdev->appearance);
+
+	return new_len;
+}
+
+/* Append an EIR_SERVICE_DATA field (length, type, little-endian @uuid,
+ * then @data_len bytes of @data) at offset @eir_len of @eir and return the
+ * offset just past it.
+ */
+u8 eir_append_service_data(u8 *eir, u16 eir_len, u16 uuid, u8 *data,
+			   u8 data_len)
+{
+	/* Length byte covers the type byte, the UUID and the payload */
+	eir[eir_len++] = sizeof(u8) + sizeof(uuid) + data_len;
+	eir[eir_len++] = EIR_SERVICE_DATA;
+
+	put_unaligned_le16(uuid, eir + eir_len);
+	eir_len += sizeof(uuid);
+
+	memcpy(eir + eir_len, data, data_len);
+
+	return eir_len + data_len;
+}
+
+/* Write a 16-bit UUID list field into @data (at most @len bytes) from the
+ * 16-bit entries of hdev->uuids, skipping values below 0x1100 and
+ * PNP_INFO_SVCLASS_ID. The field type is EIR_UUID16_ALL, or EIR_UUID16_SOME
+ * when the list had to be cut short. Returns the position after the last
+ * byte written (@data itself if nothing was written).
+ */
+static u8 *create_uuid16_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
+{
+	u8 *out = data, *hdr = NULL;
+	struct bt_uuid *entry;
+
+	if (len < 4)
+		return out;
+
+	list_for_each_entry(entry, &hdev->uuids, list) {
+		u16 val;
+
+		if (entry->size != 16)
+			continue;
+
+		val = get_unaligned_le16(&entry->uuid[12]);
+		if (val < 0x1100 || val == PNP_INFO_SVCLASS_ID)
+			continue;
+
+		/* Field header is emitted lazily, on the first match */
+		if (!hdr) {
+			hdr = out;
+			*out++ = 1;
+			*out++ = EIR_UUID16_ALL;
+		}
+
+		/* Stop if not enough space to put next UUID */
+		if ((out - data) + sizeof(u16) > len) {
+			hdr[1] = EIR_UUID16_SOME;
+			break;
+		}
+
+		/* Low byte first, then high byte */
+		put_unaligned_le16(val, out);
+		out += sizeof(val);
+		hdr[0] += sizeof(val);
+	}
+
+	return out;
+}
+
+/* Write a 32-bit UUID list field into @data (at most @len bytes) from the
+ * 32-bit entries of hdev->uuids. The field type is EIR_UUID32_ALL, or
+ * EIR_UUID32_SOME when the list had to be cut short. Returns the position
+ * after the last byte written (@data itself if nothing was written).
+ */
+static u8 *create_uuid32_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
+{
+	u8 *out = data, *hdr = NULL;
+	struct bt_uuid *entry;
+
+	if (len < 6)
+		return out;
+
+	list_for_each_entry(entry, &hdev->uuids, list) {
+		if (entry->size != 32)
+			continue;
+
+		/* Field header is emitted lazily, on the first match */
+		if (!hdr) {
+			hdr = out;
+			*out++ = 1;
+			*out++ = EIR_UUID32_ALL;
+		}
+
+		/* Stop if not enough space to put next UUID */
+		if ((out - data) + sizeof(u32) > len) {
+			hdr[1] = EIR_UUID32_SOME;
+			break;
+		}
+
+		memcpy(out, &entry->uuid[12], sizeof(u32));
+		hdr[0] += sizeof(u32);
+		out += sizeof(u32);
+	}
+
+	return out;
+}
+
+/* Write a 128-bit UUID list field into @data (at most @len bytes) from the
+ * 128-bit entries of hdev->uuids. The field type is EIR_UUID128_ALL, or
+ * EIR_UUID128_SOME when the list had to be cut short. Returns the position
+ * after the last byte written (@data itself if nothing was written).
+ */
+static u8 *create_uuid128_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
+{
+	u8 *out = data, *hdr = NULL;
+	struct bt_uuid *entry;
+
+	if (len < 18)
+		return out;
+
+	list_for_each_entry(entry, &hdev->uuids, list) {
+		if (entry->size != 128)
+			continue;
+
+		/* Field header is emitted lazily, on the first match */
+		if (!hdr) {
+			hdr = out;
+			*out++ = 1;
+			*out++ = EIR_UUID128_ALL;
+		}
+
+		/* Stop if not enough space to put next UUID */
+		if ((out - data) + 16 > len) {
+			hdr[1] = EIR_UUID128_SOME;
+			break;
+		}
+
+		memcpy(out, entry->uuid, 16);
+		hdr[0] += 16;
+		out += 16;
+	}
+
+	return out;
+}
+
+/* Build extended inquiry response data for @hdev into @data.
+ *
+ * Fields are written in this order, each only when applicable: local name
+ * (cut to 48 bytes and marked short when longer), inquiry TX power, device
+ * ID, then the 16-, 32- and 128-bit UUID lists, which are bounded by
+ * HCI_MAX_EIR_LENGTH.
+ */
+void eir_create(struct hci_dev *hdev, u8 *data)
+{
+	u8 *cur = data;
+	size_t nlen = strnlen(hdev->dev_name, sizeof(hdev->dev_name));
+
+	if (nlen > 0) {
+		u8 name_type = EIR_NAME_COMPLETE;
+
+		/* Names longer than 48 bytes are truncated and flagged */
+		if (nlen > 48) {
+			nlen = 48;
+			name_type = EIR_NAME_SHORT;
+		}
+
+		cur[0] = nlen + 1;	/* EIR Data length */
+		cur[1] = name_type;	/* EIR Data type */
+		memcpy(cur + 2, hdev->dev_name, nlen);
+
+		cur += nlen + 2;
+	}
+
+	if (hdev->inq_tx_power != HCI_TX_POWER_INVALID) {
+		*cur++ = 2;
+		*cur++ = EIR_TX_POWER;
+		*cur++ = (u8)hdev->inq_tx_power;
+	}
+
+	if (hdev->devid_source > 0) {
+		*cur++ = 9;
+		*cur++ = EIR_DEVICE_ID;
+
+		put_unaligned_le16(hdev->devid_source, cur);
+		put_unaligned_le16(hdev->devid_vendor, cur + 2);
+		put_unaligned_le16(hdev->devid_product, cur + 4);
+		put_unaligned_le16(hdev->devid_version, cur + 6);
+
+		cur += 8;
+	}
+
+	/* Each list gets whatever space is left after the previous fields */
+	cur = create_uuid16_list(hdev, cur, HCI_MAX_EIR_LENGTH - (cur - data));
+	cur = create_uuid32_list(hdev, cur, HCI_MAX_EIR_LENGTH - (cur - data));
+	cur = create_uuid128_list(hdev, cur, HCI_MAX_EIR_LENGTH - (cur - data));
+}
+
+/* Copy the periodic advertising data of @instance into @ptr and return its
+ * length. Returns 0 for instance 0 and for an instance that cannot be found.
+ */
+u8 eir_create_per_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
+{
+	struct adv_info *adv;
+
+	/* Instance 0 has no adv_info to copy from */
+	if (!instance)
+		return 0;
+
+	/* Return 0 when the current instance identifier is invalid. */
+	adv = hci_find_adv_instance(hdev, instance);
+	if (!adv)
+		return 0;
+
+	memcpy(ptr, adv->per_adv_data, adv->per_adv_data_len);
+
+	return adv->per_adv_data_len;
+}
+
+/* Build the advertising data for @instance into @ptr and return its length.
+ *
+ * The result is, in order: an optional Flags field, the instance's own
+ * adv_data (when an instance was found), and an optional TX power field.
+ * Returns 0 when a non-zero @instance cannot be found.
+ *
+ * @size bounds only the Flags and TX power fields added here.
+ * NOTE(review): the copy of adv->adv_data is not checked against @size;
+ * presumably its length is validated when the instance is created - confirm.
+ */
+u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr, u8 size)
+{
+ struct adv_info *adv = NULL;
+ u8 ad_len = 0, flags = 0;
+ u32 instance_flags;
+
+ /* Return 0 when the current instance identifier is invalid. */
+ if (instance) {
+ adv = hci_find_adv_instance(hdev, instance);
+ if (!adv)
+ return 0;
+ }
+
+ instance_flags = hci_adv_instance_flags(hdev, instance);
+
+ /* If instance already has the flags set skip adding it once
+ * again.
+ */
+ if (adv && eir_get_data(adv->adv_data, adv->adv_data_len, EIR_FLAGS,
+ NULL))
+ goto skip_flags;
+
+ /* The Add Advertising command allows userspace to set both the general
+ * and limited discoverable flags.
+ */
+ if (instance_flags & MGMT_ADV_FLAG_DISCOV)
+ flags |= LE_AD_GENERAL;
+
+ if (instance_flags & MGMT_ADV_FLAG_LIMITED_DISCOV)
+ flags |= LE_AD_LIMITED;
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ flags |= LE_AD_NO_BREDR;
+
+ if (flags || (instance_flags & MGMT_ADV_FLAG_MANAGED_FLAGS)) {
+ /* If a discovery flag wasn't provided, simply use the global
+ * settings.
+ */
+ if (!flags)
+ flags |= mgmt_get_adv_discov_flags(hdev);
+
+ /* If flags would still be empty, then there is no need to
+ * include the "Flags" AD field".
+ */
+ /* The field is also dropped when its 3 bytes do not fit in size */
+ if (flags && (ad_len + eir_precalc_len(1) <= size)) {
+ ptr[0] = 0x02;
+ ptr[1] = EIR_FLAGS;
+ ptr[2] = flags;
+
+ ad_len += 3;
+ ptr += 3;
+ }
+ }
+
+skip_flags:
+ /* Instance-provided data follows the optional Flags field verbatim */
+ if (adv) {
+ memcpy(ptr, adv->adv_data, adv->adv_data_len);
+ ad_len += adv->adv_data_len;
+ ptr += adv->adv_data_len;
+ }
+
+ if (instance_flags & MGMT_ADV_FLAG_TX_POWER) {
+ s8 adv_tx_power;
+
+ /* The per-instance value is used only with extended advertising
+ * and a found instance; otherwise the device-wide value is used.
+ */
+ if (ext_adv_capable(hdev)) {
+ if (adv)
+ adv_tx_power = adv->tx_power;
+ else
+ adv_tx_power = hdev->adv_tx_power;
+ } else {
+ adv_tx_power = hdev->adv_tx_power;
+ }
+
+ /* Provide Tx Power only if we can provide a valid value for it */
+ if (adv_tx_power != HCI_TX_POWER_INVALID &&
+ (ad_len + eir_precalc_len(1) <= size)) {
+ ptr[0] = 0x02;
+ ptr[1] = EIR_TX_POWER;
+ ptr[2] = (u8)adv_tx_power;
+
+ ad_len += 3;
+ ptr += 3;
+ }
+ }
+
+ return ad_len;
+}
+
+/* Build the scan response used when no instance is given: the appearance
+ * field (only if hdev->appearance is non-zero) followed by the local name.
+ * Returns the resulting length.
+ */
+static u8 create_default_scan_rsp(struct hci_dev *hdev, u8 *ptr)
+{
+	u8 used = hdev->appearance ? eir_append_appearance(hdev, ptr, 0) : 0;
+
+	return eir_append_local_name(hdev, ptr, used);
+}
+
+/* Build the scan response data for @instance into @ptr and return its
+ * length.
+ *
+ * Instance 0 yields the default response. Otherwise the result is the
+ * optional appearance field, the instance's scan_rsp_data, then the optional
+ * local name, as selected by the instance flags. Returns 0 when the instance
+ * cannot be found.
+ */
+u8 eir_create_scan_rsp(struct hci_dev *hdev, u8 instance, u8 *ptr)
+{
+	struct adv_info *adv;
+	u8 used = 0;
+
+	if (!instance)
+		return create_default_scan_rsp(hdev, ptr);
+
+	adv = hci_find_adv_instance(hdev, instance);
+	if (!adv)
+		return 0;
+
+	if (hdev->appearance && (adv->flags & MGMT_ADV_FLAG_APPEARANCE))
+		used = eir_append_appearance(hdev, ptr, used);
+
+	memcpy(ptr + used, adv->scan_rsp_data, adv->scan_rsp_len);
+	used += adv->scan_rsp_len;
+
+	if (!(adv->flags & MGMT_ADV_FLAG_LOCAL_NAME))
+		return used;
+
+	return eir_append_local_name(hdev, ptr, used);
+}
+
+/* Find the Service Data field whose 16-bit UUID equals @uuid.
+ *
+ * @eir/@eir_len describe the EIR/advertising data to search. On a match the
+ * return value points at the payload that follows the UUID and, when @len is
+ * non-NULL, *@len is set to the payload length. Returns NULL when no
+ * matching field exists.
+ */
+void *eir_get_service_data(u8 *eir, size_t eir_len, u16 uuid, size_t *len)
+{
+	/* Fixed end of the buffer: the remaining length is always derived
+	 * from it, so bytes skipped by eir_get_data() and the field header
+	 * are accounted for and later scans cannot run past the buffer.
+	 */
+	u8 *end = eir + eir_len;
+	size_t dlen;
+
+	while ((eir = eir_get_data(eir, end - eir, EIR_SERVICE_DATA, &dlen))) {
+		/* A field too short to hold a UUID is skipped: it cannot
+		 * match, and "dlen - 2" would underflow.
+		 */
+		if (dlen >= sizeof(uuid) && get_unaligned_le16(eir) == uuid) {
+			if (len)
+				*len = dlen - sizeof(uuid);
+			return &eir[sizeof(uuid)];
+		}
+
+		/* eir_get_data() only returns fields that fit in the length
+		 * it was given, so this stays within the buffer.
+		 */
+		eir += dlen;
+	}
+
+	return NULL;
+}
diff --git a/net/bluetooth/eir.h b/net/bluetooth/eir.h
new file mode 100644
index 000000000000..9372db83f912
--- /dev/null
+++ b/net/bluetooth/eir.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BlueZ - Bluetooth protocol stack for Linux
+ *
+ * Copyright (C) 2021 Intel Corporation
+ */
+
+#include <linux/unaligned.h>
+
+void eir_create(struct hci_dev *hdev, u8 *data);
+
+u8 eir_create_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr, u8 size);
+u8 eir_create_scan_rsp(struct hci_dev *hdev, u8 instance, u8 *ptr);
+u8 eir_create_per_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr);
+
+u8 eir_append_local_name(struct hci_dev *hdev, u8 *eir, u8 ad_len);
+u8 eir_append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len);
+u8 eir_append_service_data(u8 *eir, u16 eir_len, u16 uuid, u8 *data,
+ u8 data_len);
+
+/* Total size of an EIR field carrying @data_len payload bytes: the length
+ * byte and the type byte plus the payload.
+ */
+static inline u16 eir_precalc_len(u8 data_len)
+{
+	return data_len + 2 * sizeof(u8);
+}
+
+/* Append one field (length, @type, then @data_len bytes of @data) at offset
+ * @eir_len of @eir and return the offset just past it.
+ */
+static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type,
+				  u8 *data, u8 data_len)
+{
+	/* Length byte covers the type byte and the payload */
+	eir[eir_len++] = sizeof(type) + data_len;
+	eir[eir_len++] = type;
+
+	memcpy(eir + eir_len, data, data_len);
+
+	return eir_len + data_len;
+}
+
+/* Append one field holding the 16-bit value @data in little-endian order at
+ * offset @eir_len of @eir and return the offset just past it.
+ */
+static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data)
+{
+	/* Length byte covers the type byte and the two value bytes */
+	eir[eir_len++] = sizeof(type) + sizeof(data);
+	eir[eir_len++] = type;
+
+	put_unaligned_le16(data, eir + eir_len);
+
+	return eir_len + sizeof(data);
+}
+
+/* Append one field (length, @type, then @data_len bytes of @data) to the
+ * tail of @skb and return the number of bytes added. Warns when the length
+ * byte cannot represent the field.
+ */
+static inline u16 eir_skb_put_data(struct sk_buff *skb, u8 type, u8 *data, u8 data_len)
+{
+	u16 total = eir_precalc_len(data_len);
+	u8 *field = skb_put(skb, total);
+
+	WARN_ON(sizeof(type) + data_len > U8_MAX);
+
+	field[0] = sizeof(type) + data_len;
+	field[1] = type;
+	memcpy(field + 2, data, data_len);
+
+	return total;
+}
+
+/* Find the first field of the given @type in @eir (@eir_len bytes).
+ *
+ * Each field is parsed as a length byte, a type byte, then (length - 1)
+ * data bytes. On a match the return value points at the field's data and,
+ * when @data_len is non-NULL, *@data_len is set to the data length.
+ *
+ * Returns NULL when no such field is found, when parsing stops at a zero
+ * length byte or at a field that would run past @eir_len, or when the first
+ * matching field carries no data.
+ */
+static inline void *eir_get_data(u8 *eir, size_t eir_len, u8 type,
+ size_t *data_len)
+{
+ size_t parsed = 0;
+
+ /* A field needs at least a length byte and a type byte */
+ if (eir_len < 2)
+ return NULL;
+
+ while (parsed < eir_len - 1) {
+ u8 field_len = eir[0];
+
+ /* A zero length byte ends the search */
+ if (field_len == 0)
+ break;
+
+ /* Account for the whole field, including its length byte */
+ parsed += field_len + 1;
+
+ /* Field claims more bytes than the buffer holds */
+ if (parsed > eir_len)
+ break;
+
+ if (eir[1] != type) {
+ eir += field_len + 1;
+ continue;
+ }
+
+ /* Zero length data */
+ if (field_len == 1)
+ return NULL;
+
+ if (data_len)
+ *data_len = field_len - 1;
+
+ return &eir[2];
+ }
+
+ return NULL;
+}
+
+void *eir_get_service_data(u8 *eir, size_t eir_len, u16 uuid, size_t *len);
diff --git a/net/bluetooth/hci_codec.c b/net/bluetooth/hci_codec.c
new file mode 100644
index 000000000000..3cc135bb1d30
--- /dev/null
+++ b/net/bluetooth/hci_codec.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* Copyright (C) 2021 Intel Corporation */
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include "hci_codec.h"
+
+/* Allocate a codec_list entry for the codec described by @sent and add it
+ * to @list.
+ *
+ * When @rp is non-NULL, its num_caps and @len bytes of @caps are stored in
+ * the entry; otherwise the entry records zero capabilities. The company and
+ * vendor IDs are recorded only for codec ID 0xFF.
+ *
+ * Returns 0 on success or -ENOMEM if the entry cannot be allocated.
+ */
+static int hci_codec_list_add(struct list_head *list,
+			      struct hci_op_read_local_codec_caps *sent,
+			      struct hci_rp_read_local_codec_caps *rp,
+			      void *caps,
+			      __u32 len)
+{
+	struct codec_list *node = kzalloc(sizeof(*node) + len, GFP_KERNEL);
+
+	if (!node)
+		return -ENOMEM;
+
+	node->id = sent->id;
+	node->transport = sent->transport;
+	node->len = len;
+
+	if (sent->id == 0xFF) {
+		node->cid = __le16_to_cpu(sent->cid);
+		node->vid = __le16_to_cpu(sent->vid);
+	}
+
+	node->num_caps = rp ? rp->num_caps : 0;
+	if (rp)
+		memcpy(node->caps, caps, len);
+
+	list_add(&node->list, list);
+
+	return 0;
+}
+
+/* Unlink and free every entry on @codec_list, leaving the list empty. */
+void hci_codec_list_clear(struct list_head *codec_list)
+{
+	while (!list_empty(codec_list)) {
+		struct codec_list *head;
+
+		head = list_first_entry(codec_list, struct codec_list, list);
+		list_del(&head->list);
+		kfree(head);
+	}
+}
+
+/* Read the capabilities of the codec described by @cmd for every transport
+ * whose bit is set in @transport, and record one entry per transport in
+ * hdev->local_codecs.
+ *
+ * @cmd->transport is overwritten with each transport index in turn. When
+ * the controller does not support the Read Local Codec Capabilities command,
+ * the codec is recorded without capabilities. A failed command or a
+ * malformed response skips that transport.
+ */
+static void hci_read_codec_capabilities(struct hci_dev *hdev, __u8 transport,
+					struct hci_op_read_local_codec_caps
+					*cmd)
+{
+	__u8 i;
+
+	for (i = 0; i < TRANSPORT_TYPE_MAX; i++) {
+		if (transport & BIT(i)) {
+			struct hci_rp_read_local_codec_caps *rp;
+			struct hci_codec_caps *caps;
+			struct sk_buff *skb;
+			__u8 j;
+			__u32 len;
+
+			cmd->transport = i;
+
+			/* If Read_Codec_Capabilities command is not supported
+			 * then just add codec to the list without caps
+			 */
+			if (!(hdev->commands[45] & 0x08)) {
+				hci_dev_lock(hdev);
+				hci_codec_list_add(&hdev->local_codecs, cmd,
+						   NULL, NULL, 0);
+				hci_dev_unlock(hdev);
+				continue;
+			}
+
+			skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODEC_CAPS,
+						sizeof(*cmd), cmd, 0, HCI_CMD_TIMEOUT, NULL);
+			if (IS_ERR(skb)) {
+				bt_dev_err(hdev, "Failed to read codec capabilities (%ld)",
+					   PTR_ERR(skb));
+				continue;
+			}
+
+			if (skb->len < sizeof(*rp))
+				goto error;
+
+			rp = (void *)skb->data;
+
+			if (rp->status)
+				goto error;
+
+			if (!rp->num_caps) {
+				len = 0;
+				/* this codec doesn't have capabilities */
+				goto skip_caps_parse;
+			}
+
+			skb_pull(skb, sizeof(*rp));
+
+			/* Walk the capability entries to find their total
+			 * size; each one is a length byte plus that many
+			 * payload bytes.
+			 */
+			for (j = 0, len = 0; j < rp->num_caps; j++) {
+				caps = (void *)skb->data;
+				if (skb->len < sizeof(*caps))
+					goto error;
+				/* Require the length byte AND the payload to
+				 * be present; checking the payload alone lets
+				 * len overshoot the skb by one byte.
+				 */
+				if (skb->len < sizeof(caps->len) + caps->len)
+					goto error;
+				len += sizeof(caps->len) + caps->len;
+				skb_pull(skb, sizeof(caps->len) + caps->len);
+			}
+
+skip_caps_parse:
+			/* The entries start right after the response header */
+			hci_dev_lock(hdev);
+			hci_codec_list_add(&hdev->local_codecs, cmd, rp,
+					   (__u8 *)rp + sizeof(*rp), len);
+			hci_dev_unlock(hdev);
+error:
+			kfree_skb(skb);
+		}
+	}
+}
+
+/* Issue Read Local Supported Codecs and, for every standard and vendor
+ * codec in the response, read its capabilities for the ACL and SCO
+ * transports. A failed command or a malformed response ends processing;
+ * codecs handled before the malformed part stay recorded.
+ */
+void hci_read_supported_codecs(struct hci_dev *hdev)
+{
+	struct sk_buff *skb;
+	struct hci_rp_read_local_supported_codecs *rp;
+	struct hci_std_codecs *std_codecs;
+	struct hci_vnd_codecs *vnd_codecs;
+	struct hci_op_read_local_codec_caps caps;
+	__u8 i;
+
+	skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODECS, 0, NULL,
+				0, HCI_CMD_TIMEOUT, NULL);
+
+	if (IS_ERR(skb)) {
+		bt_dev_err(hdev, "Failed to read local supported codecs (%ld)",
+			   PTR_ERR(skb));
+		return;
+	}
+
+	if (skb->len < sizeof(*rp))
+		goto error;
+
+	rp = (void *)skb->data;
+
+	if (rp->status)
+		goto error;
+
+	skb_pull(skb, sizeof(rp->status));
+
+	std_codecs = (void *)skb->data;
+
+	/* validate codecs length before accessing; the count byte itself
+	 * must be present before it is read
+	 */
+	if (skb->len < sizeof(std_codecs->num) ||
+	    skb->len < flex_array_size(std_codecs, codec, std_codecs->num)
+	    + sizeof(std_codecs->num))
+		goto error;
+
+	/* enumerate codec capabilities of standard codecs */
+	memset(&caps, 0, sizeof(caps));
+	for (i = 0; i < std_codecs->num; i++) {
+		caps.id = std_codecs->codec[i];
+		caps.direction = 0x00;
+		hci_read_codec_capabilities(hdev,
+					    LOCAL_CODEC_ACL_MASK | LOCAL_CODEC_SCO_MASK, &caps);
+	}
+
+	skb_pull(skb, flex_array_size(std_codecs, codec, std_codecs->num)
+		 + sizeof(std_codecs->num));
+
+	vnd_codecs = (void *)skb->data;
+
+	/* validate vendor codecs length before accessing; the standard list
+	 * may have consumed the whole skb, so check that the count byte
+	 * exists before reading it
+	 */
+	if (skb->len < sizeof(vnd_codecs->num) ||
+	    skb->len <
+	    flex_array_size(vnd_codecs, codec, vnd_codecs->num)
+	    + sizeof(vnd_codecs->num))
+		goto error;
+
+	/* enumerate vendor codec capabilities */
+	for (i = 0; i < vnd_codecs->num; i++) {
+		caps.id = 0xFF;
+		caps.cid = vnd_codecs->codec[i].cid;
+		caps.vid = vnd_codecs->codec[i].vid;
+		caps.direction = 0x00;
+		hci_read_codec_capabilities(hdev,
+					    LOCAL_CODEC_ACL_MASK | LOCAL_CODEC_SCO_MASK, &caps);
+	}
+
+error:
+	kfree_skb(skb);
+}
+
+/* Issue Read Local Supported Codecs (v2) and, for every standard and vendor
+ * codec in the response, read its capabilities for the transports that the
+ * response lists for that codec. A failed command or a malformed response
+ * ends processing; codecs handled before the malformed part stay recorded.
+ */
+void hci_read_supported_codecs_v2(struct hci_dev *hdev)
+{
+	struct sk_buff *skb;
+	struct hci_rp_read_local_supported_codecs_v2 *rp;
+	struct hci_std_codecs_v2 *std_codecs;
+	struct hci_vnd_codecs_v2 *vnd_codecs;
+	struct hci_op_read_local_codec_caps caps;
+	__u8 i;
+
+	skb = __hci_cmd_sync_sk(hdev, HCI_OP_READ_LOCAL_CODECS_V2, 0, NULL,
+				0, HCI_CMD_TIMEOUT, NULL);
+
+	if (IS_ERR(skb)) {
+		bt_dev_err(hdev, "Failed to read local supported codecs (%ld)",
+			   PTR_ERR(skb));
+		return;
+	}
+
+	if (skb->len < sizeof(*rp))
+		goto error;
+
+	rp = (void *)skb->data;
+
+	if (rp->status)
+		goto error;
+
+	skb_pull(skb, sizeof(rp->status));
+
+	std_codecs = (void *)skb->data;
+
+	/* check for payload data length before accessing; the count byte
+	 * itself must be present before it is read
+	 */
+	if (skb->len < sizeof(std_codecs->num) ||
+	    skb->len < flex_array_size(std_codecs, codec, std_codecs->num)
+	    + sizeof(std_codecs->num))
+		goto error;
+
+	memset(&caps, 0, sizeof(caps));
+
+	for (i = 0; i < std_codecs->num; i++) {
+		caps.id = std_codecs->codec[i].id;
+		hci_read_codec_capabilities(hdev, std_codecs->codec[i].transport,
+					    &caps);
+	}
+
+	skb_pull(skb, flex_array_size(std_codecs, codec, std_codecs->num)
+		 + sizeof(std_codecs->num));
+
+	vnd_codecs = (void *)skb->data;
+
+	/* check for payload data length before accessing; the standard list
+	 * may have consumed the whole skb, so check that the count byte
+	 * exists before reading it
+	 */
+	if (skb->len < sizeof(vnd_codecs->num) ||
+	    skb->len <
+	    flex_array_size(vnd_codecs, codec, vnd_codecs->num)
+	    + sizeof(vnd_codecs->num))
+		goto error;
+
+	for (i = 0; i < vnd_codecs->num; i++) {
+		caps.id = 0xFF;
+		caps.cid = vnd_codecs->codec[i].cid;
+		caps.vid = vnd_codecs->codec[i].vid;
+		hci_read_codec_capabilities(hdev, vnd_codecs->codec[i].transport,
+					    &caps);
+	}
+
+error:
+	kfree_skb(skb);
+}
diff --git a/net/bluetooth/hci_codec.h b/net/bluetooth/hci_codec.h
new file mode 100644
index 000000000000..a2751930f123
--- /dev/null
+++ b/net/bluetooth/hci_codec.h
@@ -0,0 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/* Copyright (C) 2014 Intel Corporation */
+
+void hci_read_supported_codecs(struct hci_dev *hdev);
+void hci_read_supported_codecs_v2(struct hci_dev *hdev);
+void hci_codec_list_clear(struct list_head *codec_list);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index bd4978ce8c45..c3f7828bf9d5 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1,6 +1,7 @@
/*
BlueZ - Bluetooth protocol stack for Linux
Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
+ Copyright 2023-2024 NXP
Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
@@ -26,14 +27,16 @@
#include <linux/export.h>
#include <linux/debugfs.h>
+#include <linux/errqueue.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/l2cap.h>
+#include <net/bluetooth/iso.h>
+#include <net/bluetooth/mgmt.h>
-#include "hci_request.h"
#include "smp.h"
-#include "a2mp.h"
+#include "eir.h"
struct sco_param {
u16 pkt_type;
@@ -41,6 +44,11 @@ struct sco_param {
u8 retrans_effort;
};
+struct conn_handle_t {
+ struct hci_conn *conn;
+ __u16 handle;
+};
+
static const struct sco_param esco_param_cvsd[] = {
{ EDR_ESCO_MASK & ~ESCO_2EV3, 0x000a, 0x01 }, /* S3 */
{ EDR_ESCO_MASK & ~ESCO_2EV3, 0x0007, 0x01 }, /* S2 */
@@ -60,7 +68,7 @@ static const struct sco_param esco_param_msbc[] = {
};
/* This function requires the caller holds hdev->lock */
-static void hci_connect_le_scan_cleanup(struct hci_conn *conn)
+void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status)
{
struct hci_conn_params *params;
struct hci_dev *hdev = conn->hdev;
@@ -80,9 +88,27 @@ static void hci_connect_le_scan_cleanup(struct hci_conn *conn)
params = hci_pend_le_action_lookup(&hdev->pend_le_conns, bdaddr,
bdaddr_type);
- if (!params || !params->explicit_connect)
+ if (!params)
+ return;
+
+ if (params->conn) {
+ hci_conn_drop(params->conn);
+ hci_conn_put(params->conn);
+ params->conn = NULL;
+ }
+
+ if (!params->explicit_connect)
return;
+ /* If the status indicates successful cancellation of
+ * the attempt (i.e. Unknown Connection Id) there's no point of
+ * notifying failure since we'll go back to keep trying to
+ * connect. The only exception is explicit connect requests
+ * where a timeout + cancel does indicate an actual failure.
+ */
+ if (status && status != HCI_ERROR_UNKNOWN_CONN_ID)
+ mgmt_connect_failed(hdev, conn, status);
+
/* The connection attempt was doing scan for new RPA, and is
* in scan phase. If params are not associated with any other
* autoconnect action, remove them completely. If they are, just unmark
@@ -90,7 +116,7 @@ static void hci_connect_le_scan_cleanup(struct hci_conn *conn)
*/
params->explicit_connect = false;
- list_del_init(&params->action);
+ hci_pend_le_list_del_init(params);
switch (params->auto_connect) {
case HCI_AUTO_CONN_EXPLICIT:
@@ -99,16 +125,16 @@ static void hci_connect_le_scan_cleanup(struct hci_conn *conn)
return;
case HCI_AUTO_CONN_DIRECT:
case HCI_AUTO_CONN_ALWAYS:
- list_add(&params->action, &hdev->pend_le_conns);
+ hci_pend_le_list_add(params, &hdev->pend_le_conns);
break;
case HCI_AUTO_CONN_REPORT:
- list_add(&params->action, &hdev->pend_le_reports);
+ hci_pend_le_list_add(params, &hdev->pend_le_reports);
break;
default:
break;
}
- hci_update_background_scan(hdev);
+ hci_update_passive_scan(hdev);
}
static void hci_conn_cleanup(struct hci_conn *conn)
@@ -118,163 +144,273 @@ static void hci_conn_cleanup(struct hci_conn *conn)
if (test_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags))
hci_conn_params_del(conn->hdev, &conn->dst, conn->dst_type);
- hci_chan_list_flush(conn);
-
- hci_conn_hash_del(hdev, conn);
-
- if (hdev->notify)
- hdev->notify(hdev, HCI_NOTIFY_CONN_DEL);
-
- hci_conn_del_sysfs(conn);
-
- debugfs_remove_recursive(conn->debugfs);
+ if (test_and_clear_bit(HCI_CONN_FLUSH_KEY, &conn->flags))
+ hci_remove_link_key(hdev, &conn->dst);
- hci_dev_put(hdev);
-
- hci_conn_put(conn);
-}
-
-static void le_scan_cleanup(struct work_struct *work)
-{
- struct hci_conn *conn = container_of(work, struct hci_conn,
- le_scan_cleanup);
- struct hci_dev *hdev = conn->hdev;
- struct hci_conn *c = NULL;
+ hci_chan_list_flush(conn);
- BT_DBG("%s hcon %p", hdev->name, conn);
+ if (HCI_CONN_HANDLE_UNSET(conn->handle))
+ ida_free(&hdev->unset_handle_ida, conn->handle);
- hci_dev_lock(hdev);
+ if (conn->cleanup)
+ conn->cleanup(conn);
- /* Check that the hci_conn is still around */
- rcu_read_lock();
- list_for_each_entry_rcu(c, &hdev->conn_hash.list, list) {
- if (c == conn)
+ if (conn->type == SCO_LINK || conn->type == ESCO_LINK) {
+ switch (conn->setting & SCO_AIRMODE_MASK) {
+ case SCO_AIRMODE_CVSD:
+ case SCO_AIRMODE_TRANSP:
+ if (hdev->notify)
+ hdev->notify(hdev, HCI_NOTIFY_DISABLE_SCO);
break;
+ }
+ } else {
+ if (hdev->notify)
+ hdev->notify(hdev, HCI_NOTIFY_CONN_DEL);
}
- rcu_read_unlock();
- if (c == conn) {
- hci_connect_le_scan_cleanup(conn);
- hci_conn_cleanup(conn);
- }
+ debugfs_remove_recursive(conn->debugfs);
+
+ hci_conn_del_sysfs(conn);
- hci_dev_unlock(hdev);
hci_dev_put(hdev);
- hci_conn_put(conn);
}
-static void hci_connect_le_scan_remove(struct hci_conn *conn)
+int hci_disconnect(struct hci_conn *conn, __u8 reason)
{
- BT_DBG("%s hcon %p", conn->hdev->name, conn);
+ BT_DBG("hcon %p", conn);
- /* We can't call hci_conn_del/hci_conn_cleanup here since that
- * could deadlock with another hci_conn_del() call that's holding
- * hci_dev_lock and doing cancel_delayed_work_sync(&conn->disc_work).
- * Instead, grab temporary extra references to the hci_dev and
- * hci_conn and perform the necessary cleanup in a separate work
- * callback.
+ /* When we are central of an established connection and it enters
+ * the disconnect timeout, then go ahead and try to read the
+ * current clock offset. Processing of the result is done
+ * within the event handling and hci_clock_offset_evt function.
*/
+ if (conn->type == ACL_LINK && conn->role == HCI_ROLE_MASTER &&
+ (conn->state == BT_CONNECTED || conn->state == BT_CONFIG)) {
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_cp_read_clock_offset clkoff_cp;
- hci_dev_hold(conn->hdev);
- hci_conn_get(conn);
+ clkoff_cp.handle = cpu_to_le16(conn->handle);
+ hci_send_cmd(hdev, HCI_OP_READ_CLOCK_OFFSET, sizeof(clkoff_cp),
+ &clkoff_cp);
+ }
- /* Even though we hold a reference to the hdev, many other
- * things might get cleaned up meanwhile, including the hdev's
- * own workqueue, so we can't use that for scheduling.
- */
- schedule_work(&conn->le_scan_cleanup);
+ return hci_abort_conn(conn, reason);
}
-static void hci_acl_create_connection(struct hci_conn *conn)
+static void hci_add_sco(struct hci_conn *conn, __u16 handle)
{
struct hci_dev *hdev = conn->hdev;
- struct inquiry_entry *ie;
- struct hci_cp_create_conn cp;
+ struct hci_cp_add_sco cp;
BT_DBG("hcon %p", conn);
conn->state = BT_CONNECT;
conn->out = true;
- conn->role = HCI_ROLE_MASTER;
conn->attempt++;
- conn->link_policy = hdev->link_policy;
+ cp.handle = cpu_to_le16(handle);
+ cp.pkt_type = cpu_to_le16(conn->pkt_type);
- memset(&cp, 0, sizeof(cp));
- bacpy(&cp.bdaddr, &conn->dst);
- cp.pscan_rep_mode = 0x02;
-
- ie = hci_inquiry_cache_lookup(hdev, &conn->dst);
- if (ie) {
- if (inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) {
- cp.pscan_rep_mode = ie->data.pscan_rep_mode;
- cp.pscan_mode = ie->data.pscan_mode;
- cp.clock_offset = ie->data.clock_offset |
- cpu_to_le16(0x8000);
- }
+ hci_send_cmd(hdev, HCI_OP_ADD_SCO, sizeof(cp), &cp);
+}
- memcpy(conn->dev_class, ie->data.dev_class, 3);
- if (ie->data.ssp_mode > 0)
- set_bit(HCI_CONN_SSP_ENABLED, &conn->flags);
- }
+static bool find_next_esco_param(struct hci_conn *conn,
+ const struct sco_param *esco_param, int size)
+{
+ if (!conn->parent)
+ return false;
- cp.pkt_type = cpu_to_le16(conn->pkt_type);
- if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER))
- cp.role_switch = 0x01;
- else
- cp.role_switch = 0x00;
+ for (; conn->attempt <= size; conn->attempt++) {
+ if (lmp_esco_2m_capable(conn->parent) ||
+ (esco_param[conn->attempt - 1].pkt_type & ESCO_2EV3))
+ break;
+ BT_DBG("hcon %p skipped attempt %d, eSCO 2M not supported",
+ conn, conn->attempt);
+ }
- hci_send_cmd(hdev, HCI_OP_CREATE_CONN, sizeof(cp), &cp);
+ return conn->attempt <= size;
}
-int hci_disconnect(struct hci_conn *conn, __u8 reason)
+static int configure_datapath_sync(struct hci_dev *hdev, struct bt_codec *codec)
{
- BT_DBG("hcon %p", conn);
+ int err;
+ __u8 vnd_len, *vnd_data = NULL;
+ struct hci_op_configure_data_path *cmd = NULL;
- /* When we are master of an established connection and it enters
- * the disconnect timeout, then go ahead and try to read the
- * current clock offset. Processing of the result is done
- * within the event handling and hci_clock_offset_evt function.
+	/* Neither of the two checks below is an error: the first means the
+	 * user does not want to use HFP offload mode, and the second means the
+	 * vendor controller does not need the HCI command below for offload.
*/
- if (conn->type == ACL_LINK && conn->role == HCI_ROLE_MASTER &&
- (conn->state == BT_CONNECTED || conn->state == BT_CONFIG)) {
- struct hci_dev *hdev = conn->hdev;
- struct hci_cp_read_clock_offset clkoff_cp;
+ if (!codec->data_path || !hdev->get_codec_config_data)
+ return 0;
- clkoff_cp.handle = cpu_to_le16(conn->handle);
- hci_send_cmd(hdev, HCI_OP_READ_CLOCK_OFFSET, sizeof(clkoff_cp),
- &clkoff_cp);
+ err = hdev->get_codec_config_data(hdev, ESCO_LINK, codec, &vnd_len,
+ &vnd_data);
+ if (err < 0)
+ goto error;
+
+ cmd = kzalloc(sizeof(*cmd) + vnd_len, GFP_KERNEL);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto error;
}
- return hci_abort_conn(conn, reason);
+ err = hdev->get_data_path_id(hdev, &cmd->data_path_id);
+ if (err < 0)
+ goto error;
+
+ cmd->vnd_len = vnd_len;
+ memcpy(cmd->vnd_data, vnd_data, vnd_len);
+
+ cmd->direction = 0x00;
+ __hci_cmd_sync_status(hdev, HCI_CONFIGURE_DATA_PATH,
+ sizeof(*cmd) + vnd_len, cmd, HCI_CMD_TIMEOUT);
+
+ cmd->direction = 0x01;
+ err = __hci_cmd_sync_status(hdev, HCI_CONFIGURE_DATA_PATH,
+ sizeof(*cmd) + vnd_len, cmd,
+ HCI_CMD_TIMEOUT);
+error:
+
+ kfree(cmd);
+ kfree(vnd_data);
+ return err;
}
-static void hci_add_sco(struct hci_conn *conn, __u16 handle)
+static int hci_enhanced_setup_sync(struct hci_dev *hdev, void *data)
{
- struct hci_dev *hdev = conn->hdev;
- struct hci_cp_add_sco cp;
+ struct conn_handle_t *conn_handle = data;
+ struct hci_conn *conn = conn_handle->conn;
+ __u16 handle = conn_handle->handle;
+ struct hci_cp_enhanced_setup_sync_conn cp;
+ const struct sco_param *param;
- BT_DBG("hcon %p", conn);
+ kfree(conn_handle);
+
+ if (!hci_conn_valid(hdev, conn))
+ return -ECANCELED;
+
+ bt_dev_dbg(hdev, "hcon %p", conn);
+
+ configure_datapath_sync(hdev, &conn->codec);
conn->state = BT_CONNECT;
conn->out = true;
conn->attempt++;
+ memset(&cp, 0x00, sizeof(cp));
+
cp.handle = cpu_to_le16(handle);
- cp.pkt_type = cpu_to_le16(conn->pkt_type);
- hci_send_cmd(hdev, HCI_OP_ADD_SCO, sizeof(cp), &cp);
+ cp.tx_bandwidth = cpu_to_le32(0x00001f40);
+ cp.rx_bandwidth = cpu_to_le32(0x00001f40);
+
+ switch (conn->codec.id) {
+ case BT_CODEC_MSBC:
+ if (!find_next_esco_param(conn, esco_param_msbc,
+ ARRAY_SIZE(esco_param_msbc)))
+ return -EINVAL;
+
+ param = &esco_param_msbc[conn->attempt - 1];
+ cp.tx_coding_format.id = 0x05;
+ cp.rx_coding_format.id = 0x05;
+ cp.tx_codec_frame_size = __cpu_to_le16(60);
+ cp.rx_codec_frame_size = __cpu_to_le16(60);
+ cp.in_bandwidth = __cpu_to_le32(32000);
+ cp.out_bandwidth = __cpu_to_le32(32000);
+ cp.in_coding_format.id = 0x04;
+ cp.out_coding_format.id = 0x04;
+ cp.in_coded_data_size = __cpu_to_le16(16);
+ cp.out_coded_data_size = __cpu_to_le16(16);
+ cp.in_pcm_data_format = 2;
+ cp.out_pcm_data_format = 2;
+ cp.in_pcm_sample_payload_msb_pos = 0;
+ cp.out_pcm_sample_payload_msb_pos = 0;
+ cp.in_data_path = conn->codec.data_path;
+ cp.out_data_path = conn->codec.data_path;
+ cp.in_transport_unit_size = 1;
+ cp.out_transport_unit_size = 1;
+ break;
+
+ case BT_CODEC_TRANSPARENT:
+ if (!find_next_esco_param(conn, esco_param_msbc,
+ ARRAY_SIZE(esco_param_msbc)))
+ return -EINVAL;
+
+ param = &esco_param_msbc[conn->attempt - 1];
+ cp.tx_coding_format.id = 0x03;
+ cp.rx_coding_format.id = 0x03;
+ cp.tx_codec_frame_size = __cpu_to_le16(60);
+ cp.rx_codec_frame_size = __cpu_to_le16(60);
+ cp.in_bandwidth = __cpu_to_le32(0x1f40);
+ cp.out_bandwidth = __cpu_to_le32(0x1f40);
+ cp.in_coding_format.id = 0x03;
+ cp.out_coding_format.id = 0x03;
+ cp.in_coded_data_size = __cpu_to_le16(16);
+ cp.out_coded_data_size = __cpu_to_le16(16);
+ cp.in_pcm_data_format = 2;
+ cp.out_pcm_data_format = 2;
+ cp.in_pcm_sample_payload_msb_pos = 0;
+ cp.out_pcm_sample_payload_msb_pos = 0;
+ cp.in_data_path = conn->codec.data_path;
+ cp.out_data_path = conn->codec.data_path;
+ cp.in_transport_unit_size = 1;
+ cp.out_transport_unit_size = 1;
+ break;
+
+ case BT_CODEC_CVSD:
+ if (conn->parent && lmp_esco_capable(conn->parent)) {
+ if (!find_next_esco_param(conn, esco_param_cvsd,
+ ARRAY_SIZE(esco_param_cvsd)))
+ return -EINVAL;
+ param = &esco_param_cvsd[conn->attempt - 1];
+ } else {
+ if (conn->attempt > ARRAY_SIZE(sco_param_cvsd))
+ return -EINVAL;
+ param = &sco_param_cvsd[conn->attempt - 1];
+ }
+ cp.tx_coding_format.id = 2;
+ cp.rx_coding_format.id = 2;
+ cp.tx_codec_frame_size = __cpu_to_le16(60);
+ cp.rx_codec_frame_size = __cpu_to_le16(60);
+ cp.in_bandwidth = __cpu_to_le32(16000);
+ cp.out_bandwidth = __cpu_to_le32(16000);
+ cp.in_coding_format.id = 4;
+ cp.out_coding_format.id = 4;
+ cp.in_coded_data_size = __cpu_to_le16(16);
+ cp.out_coded_data_size = __cpu_to_le16(16);
+ cp.in_pcm_data_format = 2;
+ cp.out_pcm_data_format = 2;
+ cp.in_pcm_sample_payload_msb_pos = 0;
+ cp.out_pcm_sample_payload_msb_pos = 0;
+ cp.in_data_path = conn->codec.data_path;
+ cp.out_data_path = conn->codec.data_path;
+ cp.in_transport_unit_size = 16;
+ cp.out_transport_unit_size = 16;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ cp.retrans_effort = param->retrans_effort;
+ cp.pkt_type = __cpu_to_le16(param->pkt_type);
+ cp.max_latency = __cpu_to_le16(param->max_latency);
+
+ if (hci_send_cmd(hdev, HCI_OP_ENHANCED_SETUP_SYNC_CONN, sizeof(cp), &cp) < 0)
+ return -EIO;
+
+ return 0;
}
-bool hci_setup_sync(struct hci_conn *conn, __u16 handle)
+static bool hci_setup_sync_conn(struct hci_conn *conn, __u16 handle)
{
struct hci_dev *hdev = conn->hdev;
struct hci_cp_setup_sync_conn cp;
const struct sco_param *param;
- BT_DBG("hcon %p", conn);
+ bt_dev_dbg(hdev, "hcon %p", conn);
conn->state = BT_CONNECT;
conn->out = true;
@@ -289,13 +425,15 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle)
switch (conn->setting & SCO_AIRMODE_MASK) {
case SCO_AIRMODE_TRANSP:
- if (conn->attempt > ARRAY_SIZE(esco_param_msbc))
+ if (!find_next_esco_param(conn, esco_param_msbc,
+ ARRAY_SIZE(esco_param_msbc)))
return false;
param = &esco_param_msbc[conn->attempt - 1];
break;
case SCO_AIRMODE_CVSD:
- if (lmp_esco_capable(conn->link)) {
- if (conn->attempt > ARRAY_SIZE(esco_param_cvsd))
+ if (conn->parent && lmp_esco_capable(conn->parent)) {
+ if (!find_next_esco_param(conn, esco_param_cvsd,
+ ARRAY_SIZE(esco_param_cvsd)))
return false;
param = &esco_param_cvsd[conn->attempt - 1];
} else {
@@ -318,6 +456,30 @@ bool hci_setup_sync(struct hci_conn *conn, __u16 handle)
return true;
}
+bool hci_setup_sync(struct hci_conn *conn, __u16 handle)
+{
+ int result;
+ struct conn_handle_t *conn_handle;
+
+ if (enhanced_sync_conn_capable(conn->hdev)) {
+ conn_handle = kzalloc(sizeof(*conn_handle), GFP_KERNEL);
+
+ if (!conn_handle)
+ return false;
+
+ conn_handle->conn = conn;
+ conn_handle->handle = handle;
+ result = hci_cmd_sync_queue(conn->hdev, hci_enhanced_setup_sync,
+ conn_handle, NULL);
+ if (result < 0)
+ kfree(conn_handle);
+
+ return result == 0;
+ }
+
+ return hci_setup_sync_conn(conn, handle);
+}
+
u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency,
u16 to_multiplier)
{
@@ -375,21 +537,22 @@ void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand,
/* Device _must_ be locked */
void hci_sco_setup(struct hci_conn *conn, __u8 status)
{
- struct hci_conn *sco = conn->link;
+ struct hci_link *link;
- if (!sco)
+ link = list_first_entry_or_null(&conn->link_list, struct hci_link, list);
+ if (!link || !link->conn)
return;
BT_DBG("hcon %p", conn);
if (!status) {
if (lmp_esco_capable(conn->hdev))
- hci_setup_sync(sco, conn->handle);
+ hci_setup_sync(link->conn, conn->handle);
else
- hci_add_sco(sco, conn->handle);
+ hci_add_sco(link->conn, conn->handle);
} else {
- hci_connect_cfm(sco, status);
- hci_conn_del(sco);
+ hci_connect_cfm(link->conn, status);
+ hci_conn_del(link->conn);
}
}
@@ -413,13 +576,6 @@ static void hci_conn_timeout(struct work_struct *work)
if (refcnt > 0)
return;
- /* LE connections in scanning state need special handling */
- if (conn->state == BT_CONNECT && conn->type == LE_LINK &&
- test_bit(HCI_CONN_SCANNING, &conn->flags)) {
- hci_connect_le_scan_remove(conn);
- return;
- }
-
hci_abort_conn(conn, hci_proto_disconn_ind(conn));
}
@@ -467,6 +623,23 @@ static void hci_conn_auto_accept(struct work_struct *work)
&conn->dst);
}
+static void le_disable_advertising(struct hci_dev *hdev)
+{
+ if (ext_adv_capable(hdev)) {
+ struct hci_cp_le_set_ext_adv_enable cp;
+
+ cp.enable = 0x00;
+ cp.num_of_sets = 0x00;
+
+ hci_send_cmd(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(cp),
+ &cp);
+ } else {
+ u8 enable = 0x00;
+ hci_send_cmd(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable),
+ &enable);
+ }
+}
+
static void le_conn_timeout(struct work_struct *work)
{
struct hci_conn *conn = container_of(work, struct hci_conn,
@@ -481,29 +654,327 @@ static void le_conn_timeout(struct work_struct *work)
* (which doesn't have a timeout of its own).
*/
if (conn->role == HCI_ROLE_SLAVE) {
- u8 enable = 0x00;
- hci_send_cmd(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable),
- &enable);
- hci_le_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT);
+ /* Disable LE Advertising */
+ le_disable_advertising(hdev);
+ hci_dev_lock(hdev);
+ hci_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT);
+ hci_dev_unlock(hdev);
return;
}
hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);
}
-struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
- u8 role)
+struct iso_list_data {
+ union {
+ u8 cig;
+ u8 big;
+ };
+ union {
+ u8 cis;
+ u8 bis;
+ u16 sync_handle;
+ };
+ int count;
+ bool big_term;
+ bool pa_sync_term;
+ bool big_sync_term;
+};
+
+static void bis_list(struct hci_conn *conn, void *data)
+{
+ struct iso_list_data *d = data;
+
+ /* Skip if not broadcast/ANY address */
+ if (bacmp(&conn->dst, BDADDR_ANY))
+ return;
+
+ if (d->big != conn->iso_qos.bcast.big || d->bis == BT_ISO_QOS_BIS_UNSET ||
+ d->bis != conn->iso_qos.bcast.bis)
+ return;
+
+ d->count++;
+}
+
+static int terminate_big_sync(struct hci_dev *hdev, void *data)
+{
+ struct iso_list_data *d = data;
+
+ bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", d->big, d->bis);
+
+ hci_disable_per_advertising_sync(hdev, d->bis);
+ hci_remove_ext_adv_instance_sync(hdev, d->bis, NULL);
+
+ /* Only terminate BIG if it has been created */
+ if (!d->big_term)
+ return 0;
+
+ return hci_le_terminate_big_sync(hdev, d->big,
+ HCI_ERROR_LOCAL_HOST_TERM);
+}
+
+static void terminate_big_destroy(struct hci_dev *hdev, void *data, int err)
+{
+ kfree(data);
+}
+
+static int hci_le_terminate_big(struct hci_dev *hdev, struct hci_conn *conn)
+{
+ struct iso_list_data *d;
+ int ret;
+
+ bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", conn->iso_qos.bcast.big,
+ conn->iso_qos.bcast.bis);
+
+ d = kzalloc(sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ d->big = conn->iso_qos.bcast.big;
+ d->bis = conn->iso_qos.bcast.bis;
+ d->big_term = test_and_clear_bit(HCI_CONN_BIG_CREATED, &conn->flags);
+
+ ret = hci_cmd_sync_queue(hdev, terminate_big_sync, d,
+ terminate_big_destroy);
+ if (ret)
+ kfree(d);
+
+ return ret;
+}
+
+static int big_terminate_sync(struct hci_dev *hdev, void *data)
+{
+ struct iso_list_data *d = data;
+
+ bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", d->big,
+ d->sync_handle);
+
+ if (d->big_sync_term)
+ hci_le_big_terminate_sync(hdev, d->big);
+
+ if (d->pa_sync_term)
+ return hci_le_pa_terminate_sync(hdev, d->sync_handle);
+
+ return 0;
+}
+
+static void find_bis(struct hci_conn *conn, void *data)
+{
+ struct iso_list_data *d = data;
+
+ /* Ignore if BIG doesn't match */
+ if (d->big != conn->iso_qos.bcast.big)
+ return;
+
+ d->count++;
+}
+
+/* Queue PA sync / BIG sync termination when find_bis() counts no user left */
+static int hci_le_big_terminate(struct hci_dev *hdev, struct hci_conn *conn)
+{
+	struct iso_list_data *d;
+	int ret;
+
+	bt_dev_dbg(hdev, "hcon %p big 0x%2.2x sync_handle 0x%4.4x", conn,
+		   conn->iso_qos.bcast.big, conn->sync_handle);
+
+	d = kzalloc(sizeof(*d), GFP_KERNEL);
+	if (!d)
+		return -ENOMEM;
+
+	d->big = conn->iso_qos.bcast.big;
+	d->sync_handle = conn->sync_handle;
+
+	if (conn->type == PA_LINK &&
+	    test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) {
+		hci_conn_hash_list_flag(hdev, find_bis, PA_LINK,
+					HCI_CONN_PA_SYNC, d);
+		if (!d->count)
+			d->pa_sync_term = true;
+		d->count = 0;
+	}
+
+	if (test_and_clear_bit(HCI_CONN_BIG_SYNC, &conn->flags)) {
+		hci_conn_hash_list_flag(hdev, find_bis, BIS_LINK,
+					HCI_CONN_BIG_SYNC, d);
+		if (!d->count)
+			d->big_sync_term = true;
+	}
+
+	/* Nothing to terminate: d was never queued, so free it here */
+	if (!d->pa_sync_term && !d->big_sync_term) {
+		kfree(d);
+		return 0;
+	}
+
+	ret = hci_cmd_sync_queue(hdev, big_terminate_sync, d,
+				 terminate_big_destroy);
+	if (ret)
+		kfree(d);
+	return ret;
+}
+
+/* Cleanup BIS connection
+ *
+ * Detects if there are any BIS left connected in a BIG
+ * broadcaster: Remove advertising instance and terminate BIG.
+ * broadcaster receiver: Terminate BIG sync and terminate PA sync.
+ */
+static void bis_cleanup(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_conn *bis;
+
+ bt_dev_dbg(hdev, "conn %p", conn);
+
+ if (conn->role == HCI_ROLE_MASTER) {
+ if (!test_and_clear_bit(HCI_CONN_PER_ADV, &conn->flags))
+ return;
+
+ /* Check if ISO connection is a BIS and terminate advertising
+ * set and BIG if there are no other connections using it.
+ */
+ bis = hci_conn_hash_lookup_big_state(hdev,
+ conn->iso_qos.bcast.big,
+ BT_CONNECTED,
+ HCI_ROLE_MASTER);
+ if (bis)
+ return;
+
+ bis = hci_conn_hash_lookup_big_state(hdev,
+ conn->iso_qos.bcast.big,
+ BT_CONNECT,
+ HCI_ROLE_MASTER);
+ if (bis)
+ return;
+
+ bis = hci_conn_hash_lookup_big_state(hdev,
+ conn->iso_qos.bcast.big,
+ BT_OPEN,
+ HCI_ROLE_MASTER);
+ if (bis)
+ return;
+
+ hci_le_terminate_big(hdev, conn);
+ } else {
+ hci_le_big_terminate(hdev, conn);
+ }
+}
+
+static int remove_cig_sync(struct hci_dev *hdev, void *data)
+{
+ u8 handle = PTR_UINT(data);
+
+ return hci_le_remove_cig_sync(hdev, handle);
+}
+
+static int hci_le_remove_cig(struct hci_dev *hdev, u8 handle)
+{
+ bt_dev_dbg(hdev, "handle 0x%2.2x", handle);
+
+ return hci_cmd_sync_queue(hdev, remove_cig_sync, UINT_PTR(handle),
+ NULL);
+}
+
+static void find_cis(struct hci_conn *conn, void *data)
+{
+ struct iso_list_data *d = data;
+
+	/* Ignore broadcast or if CIG doesn't match */
+ if (!bacmp(&conn->dst, BDADDR_ANY) || d->cig != conn->iso_qos.ucast.cig)
+ return;
+
+ d->count++;
+}
+
+/* Cleanup CIS connection:
+ *
+ * Removes the CIG when no CIS is left connected in it.
+ */
+static void cis_cleanup(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct iso_list_data d;
+
+ if (conn->iso_qos.ucast.cig == BT_ISO_QOS_CIG_UNSET)
+ return;
+
+ memset(&d, 0, sizeof(d));
+ d.cig = conn->iso_qos.ucast.cig;
+
+ /* Check if ISO connection is a CIS and remove CIG if there are
+ * no other connections using it.
+ */
+ hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_BOUND, &d);
+ hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECT,
+ &d);
+ hci_conn_hash_list_state(hdev, find_cis, CIS_LINK, BT_CONNECTED,
+ &d);
+ if (d.count)
+ return;
+
+ hci_le_remove_cig(hdev, conn->iso_qos.ucast.cig);
+}
+
+static int hci_conn_hash_alloc_unset(struct hci_dev *hdev)
+{
+ return ida_alloc_range(&hdev->unset_handle_ida, HCI_CONN_HANDLE_MAX + 1,
+ U16_MAX, GFP_ATOMIC);
+}
+
+static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type,
+ bdaddr_t *dst, u8 dst_type,
+ u8 role, u16 handle)
{
struct hci_conn *conn;
+ struct smp_irk *irk = NULL;
+
+ switch (type) {
+ case ACL_LINK:
+ if (!hdev->acl_mtu)
+ return ERR_PTR(-ECONNREFUSED);
+ break;
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ if (!hdev->iso_mtu)
+ return ERR_PTR(-ECONNREFUSED);
+ irk = hci_get_irk(hdev, dst, dst_type);
+ break;
+ case LE_LINK:
+ if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU)
+ return ERR_PTR(-ECONNREFUSED);
+ if (!hdev->le_mtu && hdev->acl_mtu < HCI_MIN_LE_MTU)
+ return ERR_PTR(-ECONNREFUSED);
+ irk = hci_get_irk(hdev, dst, dst_type);
+ break;
+ case SCO_LINK:
+ case ESCO_LINK:
+ if (!hdev->sco_pkts)
+ /* Controller does not support SCO or eSCO over HCI */
+ return ERR_PTR(-ECONNREFUSED);
+ break;
+ default:
+ return ERR_PTR(-ECONNREFUSED);
+ }
- BT_DBG("%s dst %pMR", hdev->name, dst);
+ bt_dev_dbg(hdev, "dst %pMR handle 0x%4.4x", dst, handle);
conn = kzalloc(sizeof(*conn), GFP_KERNEL);
if (!conn)
- return NULL;
+ return ERR_PTR(-ENOMEM);
+
+	/* If an IRK exists, use its identity address */
+ if (!irk) {
+ bacpy(&conn->dst, dst);
+ conn->dst_type = dst_type;
+ } else {
+ bacpy(&conn->dst, &irk->bdaddr);
+ conn->dst_type = irk->addr_type;
+ }
- bacpy(&conn->dst, dst);
bacpy(&conn->src, &hdev->bdaddr);
+ conn->handle = handle;
conn->hdev = hdev;
conn->type = type;
conn->role = role;
@@ -516,20 +987,43 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
conn->rssi = HCI_RSSI_INVALID;
conn->tx_power = HCI_TX_POWER_INVALID;
conn->max_tx_power = HCI_TX_POWER_INVALID;
+ conn->sync_handle = HCI_SYNC_HANDLE_INVALID;
+ conn->sid = HCI_SID_INVALID;
set_bit(HCI_CONN_POWER_SAVE, &conn->flags);
conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+ /* Set Default Authenticated payload timeout to 30s */
+ conn->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT;
+
if (conn->role == HCI_ROLE_MASTER)
conn->out = true;
switch (type) {
case ACL_LINK:
conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK;
+ conn->mtu = hdev->acl_mtu;
break;
case LE_LINK:
/* conn->src should reflect the local identity address */
hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
+ conn->mtu = hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu;
+ break;
+ case CIS_LINK:
+ /* conn->src should reflect the local identity address */
+ hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
+
+ if (conn->role == HCI_ROLE_MASTER)
+ conn->cleanup = cis_cleanup;
+
+ conn->mtu = hdev->iso_mtu;
+ break;
+ case PA_LINK:
+ case BIS_LINK:
+ /* conn->src should reflect the local identity address */
+ hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
+ conn->cleanup = bis_cleanup;
+ conn->mtu = hdev->iso_mtu;
break;
case SCO_LINK:
if (lmp_esco_capable(hdev))
@@ -537,71 +1031,200 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
(hdev->esco_type & EDR_ESCO_MASK);
else
conn->pkt_type = hdev->pkt_type & SCO_PTYPE_MASK;
+
+ conn->mtu = hdev->sco_mtu;
break;
case ESCO_LINK:
conn->pkt_type = hdev->esco_type & ~EDR_ESCO_MASK;
+ conn->mtu = hdev->sco_mtu;
break;
}
skb_queue_head_init(&conn->data_q);
+ skb_queue_head_init(&conn->tx_q.queue);
INIT_LIST_HEAD(&conn->chan_list);
+ INIT_LIST_HEAD(&conn->link_list);
INIT_DELAYED_WORK(&conn->disc_work, hci_conn_timeout);
INIT_DELAYED_WORK(&conn->auto_accept_work, hci_conn_auto_accept);
INIT_DELAYED_WORK(&conn->idle_work, hci_conn_idle);
INIT_DELAYED_WORK(&conn->le_conn_timeout, le_conn_timeout);
- INIT_WORK(&conn->le_scan_cleanup, le_scan_cleanup);
atomic_set(&conn->refcnt, 0);
hci_dev_hold(hdev);
hci_conn_hash_add(hdev, conn);
- if (hdev->notify)
- hdev->notify(hdev, HCI_NOTIFY_CONN_ADD);
- hci_conn_init_sysfs(conn);
+ /* The SCO and eSCO connections will only be notified when their
+ * setup has been completed. This is different to ACL links which
+ * can be notified right away.
+ */
+ if (conn->type != SCO_LINK && conn->type != ESCO_LINK) {
+ if (hdev->notify)
+ hdev->notify(hdev, HCI_NOTIFY_CONN_ADD);
+ }
+ hci_conn_init_sysfs(conn);
return conn;
}
-int hci_conn_del(struct hci_conn *conn)
+struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type,
+				    bdaddr_t *dst, u8 dst_type, u8 role)
+{
+	int handle;
+
+	bt_dev_dbg(hdev, "dst %pMR", dst);
+	/* Reserve a placeholder handle above HCI_CONN_HANDLE_MAX */
+	handle = hci_conn_hash_alloc_unset(hdev);
+	if (unlikely(handle < 0))
+		return ERR_PTR(-ECONNREFUSED);
+	/* NOTE(review): handle is not ida_free()d if __hci_conn_add() fails */
+	return __hci_conn_add(hdev, type, dst, dst_type, role, handle);
+}
+
+struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst,
+ u8 dst_type, u8 role, u16 handle)
+{
+ if (handle > HCI_CONN_HANDLE_MAX)
+ return ERR_PTR(-EINVAL);
+
+ return __hci_conn_add(hdev, type, dst, dst_type, role, handle);
+}
+
+static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason)
+{
+ if (!reason)
+ reason = HCI_ERROR_REMOTE_USER_TERM;
+
+	/* Due to race, SCO/ISO conn might not be established yet at this point,
+ * and nothing else will clean it up. In other cases it is done via HCI
+ * events.
+ */
+ switch (conn->type) {
+ case SCO_LINK:
+ case ESCO_LINK:
+ if (HCI_CONN_HANDLE_UNSET(conn->handle))
+ hci_conn_failed(conn, reason);
+ break;
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ if ((conn->state != BT_CONNECTED &&
+ !test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) ||
+ test_bit(HCI_CONN_BIG_CREATED, &conn->flags))
+ hci_conn_failed(conn, reason);
+ break;
+ }
+}
+
+static void hci_conn_unlink(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ bt_dev_dbg(hdev, "hcon %p", conn);
+
+ if (!conn->parent) {
+ struct hci_link *link, *t;
+
+ list_for_each_entry_safe(link, t, &conn->link_list, list) {
+ struct hci_conn *child = link->conn;
+
+ hci_conn_unlink(child);
+
+ /* If hdev is down it means
+ * hci_dev_close_sync/hci_conn_hash_flush is in progress
+			 * and links need no cleanup since all connections
+			 * will be cleaned up anyway.
+ */
+ if (!test_bit(HCI_UP, &hdev->flags))
+ continue;
+
+ hci_conn_cleanup_child(child, conn->abort_reason);
+ }
+
+ return;
+ }
+
+ if (!conn->link)
+ return;
+
+ list_del_rcu(&conn->link->list);
+ synchronize_rcu();
+
+ hci_conn_drop(conn->parent);
+ hci_conn_put(conn->parent);
+ conn->parent = NULL;
+
+ kfree(conn->link);
+ conn->link = NULL;
+}
+
+void hci_conn_del(struct hci_conn *conn)
{
struct hci_dev *hdev = conn->hdev;
BT_DBG("%s hcon %p handle %d", hdev->name, conn, conn->handle);
- cancel_delayed_work_sync(&conn->disc_work);
- cancel_delayed_work_sync(&conn->auto_accept_work);
- cancel_delayed_work_sync(&conn->idle_work);
+ hci_conn_unlink(conn);
- if (conn->type == ACL_LINK) {
- struct hci_conn *sco = conn->link;
- if (sco)
- sco->link = NULL;
+ disable_delayed_work_sync(&conn->disc_work);
+ disable_delayed_work_sync(&conn->auto_accept_work);
+ disable_delayed_work_sync(&conn->idle_work);
- /* Unacked frames */
- hdev->acl_cnt += conn->sent;
- } else if (conn->type == LE_LINK) {
- cancel_delayed_work(&conn->le_conn_timeout);
+ /* Remove the connection from the list so unacked logic can detect when
+ * a certain pool is not being utilized.
+ */
+ hci_conn_hash_del(hdev, conn);
- if (hdev->le_pkts)
- hdev->le_cnt += conn->sent;
+ /* Handle unacked frames:
+ *
+	 * - In case there are no connections, or if restoring the buffers
+	 *   considered in transit would overflow, restore all buffers to the
+ * pool.
+ * - Otherwise restore just the buffers considered in transit for the
+ * hci_conn
+ */
+ switch (conn->type) {
+ case ACL_LINK:
+ if (!hci_conn_num(hdev, ACL_LINK) ||
+ hdev->acl_cnt + conn->sent > hdev->acl_pkts)
+ hdev->acl_cnt = hdev->acl_pkts;
else
hdev->acl_cnt += conn->sent;
- } else {
- struct hci_conn *acl = conn->link;
- if (acl) {
- acl->link = NULL;
- hci_conn_drop(acl);
+ break;
+ case LE_LINK:
+ cancel_delayed_work(&conn->le_conn_timeout);
+
+ if (hdev->le_pkts) {
+ if (!hci_conn_num(hdev, LE_LINK) ||
+ hdev->le_cnt + conn->sent > hdev->le_pkts)
+ hdev->le_cnt = hdev->le_pkts;
+ else
+ hdev->le_cnt += conn->sent;
+ } else {
+ if ((!hci_conn_num(hdev, LE_LINK) &&
+ !hci_conn_num(hdev, ACL_LINK)) ||
+ hdev->acl_cnt + conn->sent > hdev->acl_pkts)
+ hdev->acl_cnt = hdev->acl_pkts;
+ else
+ hdev->acl_cnt += conn->sent;
}
+ break;
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ if (!hci_iso_count(hdev) ||
+ hdev->iso_cnt + conn->sent > hdev->iso_pkts)
+ hdev->iso_cnt = hdev->iso_pkts;
+ else
+ hdev->iso_cnt += conn->sent;
+ break;
}
- if (conn->amp_mgr)
- amp_mgr_put(conn->amp_mgr);
-
skb_queue_purge(&conn->data_q);
+ skb_queue_purge(&conn->tx_q.queue);
/* Remove the connection from the list and cleanup its remaining
* state. This is a separate function since for some cases like
@@ -610,7 +1233,8 @@ int hci_conn_del(struct hci_conn *conn)
*/
hci_conn_cleanup(conn);
- return 0;
+ /* Dequeue callbacks using connection pointer as data */
+ hci_cmd_sync_dequeue(hdev, NULL, conn, NULL);
}
struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type)
@@ -624,8 +1248,7 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type)
list_for_each_entry(d, &hci_dev_list, list) {
if (!test_bit(HCI_UP, &d->flags) ||
- hci_dev_test_flag(d, HCI_USER_CHANNEL) ||
- d->dev_type != HCI_PRIMARY)
+ hci_dev_test_flag(d, HCI_USER_CHANNEL))
continue;
/* Simple routing:
@@ -675,286 +1298,81 @@ struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type)
EXPORT_SYMBOL(hci_get_route);
/* This function requires the caller holds hdev->lock */
-void hci_le_conn_failed(struct hci_conn *conn, u8 status)
+static void hci_le_conn_failed(struct hci_conn *conn, u8 status)
{
struct hci_dev *hdev = conn->hdev;
- struct hci_conn_params *params;
-
- params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst,
- conn->dst_type);
- if (params && params->conn) {
- hci_conn_drop(params->conn);
- hci_conn_put(params->conn);
- params->conn = NULL;
- }
-
- conn->state = BT_CLOSED;
-
- /* If the status indicates successful cancellation of
- * the attempt (i.e. Unkown Connection Id) there's no point of
- * notifying failure since we'll go back to keep trying to
- * connect. The only exception is explicit connect requests
- * where a timeout + cancel does indicate an actual failure.
- */
- if (status != HCI_ERROR_UNKNOWN_CONN_ID ||
- (params && params->explicit_connect))
- mgmt_connect_failed(hdev, &conn->dst, conn->type,
- conn->dst_type, status);
- hci_connect_cfm(conn, status);
+ hci_connect_le_scan_cleanup(conn, status);
- hci_conn_del(conn);
-
- /* Since we may have temporarily stopped the background scanning in
- * favor of connection establishment, we should restart it.
- */
- hci_update_background_scan(hdev);
-
- /* Re-enable advertising in case this was a failed connection
+ /* Enable advertising in case this was a failed connection
* attempt as a peripheral.
*/
- hci_req_reenable_advertising(hdev);
-}
-
-static void create_le_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode)
-{
- struct hci_conn *conn;
-
- hci_dev_lock(hdev);
-
- conn = hci_lookup_le_connect(hdev);
-
- if (!status) {
- hci_connect_le_scan_cleanup(conn);
- goto done;
- }
-
- bt_dev_err(hdev, "request failed to create LE connection: "
- "status 0x%2.2x", status);
-
- if (!conn)
- goto done;
-
- hci_le_conn_failed(conn, status);
-
-done:
- hci_dev_unlock(hdev);
+ hci_enable_advertising(hdev);
}
-static bool conn_use_rpa(struct hci_conn *conn)
+/* This function requires the caller holds hdev->lock */
+void hci_conn_failed(struct hci_conn *conn, u8 status)
{
struct hci_dev *hdev = conn->hdev;
- return hci_dev_test_flag(hdev, HCI_PRIVACY);
-}
-
-static void set_ext_conn_params(struct hci_conn *conn,
- struct hci_cp_le_ext_conn_param *p)
-{
- struct hci_dev *hdev = conn->hdev;
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
- memset(p, 0, sizeof(*p));
+ switch (conn->type) {
+ case LE_LINK:
+ hci_le_conn_failed(conn, status);
+ break;
+ case ACL_LINK:
+ mgmt_connect_failed(hdev, conn, status);
+ break;
+ }
- /* Set window to be the same value as the interval to
- * enable continuous scanning.
+ /* In case of BIG/PA sync failed, clear conn flags so that
+ * the conns will be correctly cleaned up by ISO layer
*/
- p->scan_interval = cpu_to_le16(hdev->le_scan_interval);
- p->scan_window = p->scan_interval;
- p->conn_interval_min = cpu_to_le16(conn->le_conn_min_interval);
- p->conn_interval_max = cpu_to_le16(conn->le_conn_max_interval);
- p->conn_latency = cpu_to_le16(conn->le_conn_latency);
- p->supervision_timeout = cpu_to_le16(conn->le_supv_timeout);
- p->min_ce_len = cpu_to_le16(0x0000);
- p->max_ce_len = cpu_to_le16(0x0000);
+ test_and_clear_bit(HCI_CONN_BIG_SYNC_FAILED, &conn->flags);
+ test_and_clear_bit(HCI_CONN_PA_SYNC_FAILED, &conn->flags);
+
+ conn->state = BT_CLOSED;
+ hci_connect_cfm(conn, status);
+ hci_conn_del(conn);
}
-static void hci_req_add_le_create_conn(struct hci_request *req,
- struct hci_conn *conn,
- bdaddr_t *direct_rpa)
+/* This function requires the caller holds hdev->lock */
+u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle)
{
struct hci_dev *hdev = conn->hdev;
- u8 own_addr_type;
-
- /* If direct address was provided we use it instead of current
- * address.
- */
- if (direct_rpa) {
- if (bacmp(&req->hdev->random_addr, direct_rpa))
- hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6,
- direct_rpa);
-
- /* direct address is always RPA */
- own_addr_type = ADDR_LE_DEV_RANDOM;
- } else {
- /* Update random address, but set require_privacy to false so
- * that we never connect with an non-resolvable address.
- */
- if (hci_update_random_address(req, false, conn_use_rpa(conn),
- &own_addr_type))
- return;
- }
-
- if (use_ext_conn(hdev)) {
- struct hci_cp_le_ext_create_conn *cp;
- struct hci_cp_le_ext_conn_param *p;
- u8 data[sizeof(*cp) + sizeof(*p) * 3];
- u32 plen;
- cp = (void *) data;
- p = (void *) cp->data;
+ bt_dev_dbg(hdev, "hcon %p handle 0x%4.4x", conn, handle);
- memset(cp, 0, sizeof(*cp));
-
- bacpy(&cp->peer_addr, &conn->dst);
- cp->peer_addr_type = conn->dst_type;
- cp->own_addr_type = own_addr_type;
-
- plen = sizeof(*cp);
-
- if (scan_1m(hdev)) {
- cp->phys |= LE_SCAN_PHY_1M;
- set_ext_conn_params(conn, p);
-
- p++;
- plen += sizeof(*p);
- }
-
- if (scan_2m(hdev)) {
- cp->phys |= LE_SCAN_PHY_2M;
- set_ext_conn_params(conn, p);
-
- p++;
- plen += sizeof(*p);
- }
-
- if (scan_coded(hdev)) {
- cp->phys |= LE_SCAN_PHY_CODED;
- set_ext_conn_params(conn, p);
-
- plen += sizeof(*p);
- }
-
- hci_req_add(req, HCI_OP_LE_EXT_CREATE_CONN, plen, data);
-
- } else {
- struct hci_cp_le_create_conn cp;
-
- memset(&cp, 0, sizeof(cp));
-
- /* Set window to be the same value as the interval to enable
- * continuous scanning.
- */
- cp.scan_interval = cpu_to_le16(hdev->le_scan_interval);
- cp.scan_window = cp.scan_interval;
-
- bacpy(&cp.peer_addr, &conn->dst);
- cp.peer_addr_type = conn->dst_type;
- cp.own_address_type = own_addr_type;
- cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval);
- cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval);
- cp.conn_latency = cpu_to_le16(conn->le_conn_latency);
- cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout);
- cp.min_ce_len = cpu_to_le16(0x0000);
- cp.max_ce_len = cpu_to_le16(0x0000);
+ if (conn->handle == handle)
+ return 0;
- hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp);
+ if (handle > HCI_CONN_HANDLE_MAX) {
+ bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x",
+ handle, HCI_CONN_HANDLE_MAX);
+ return HCI_ERROR_INVALID_PARAMETERS;
}
- conn->state = BT_CONNECT;
- clear_bit(HCI_CONN_SCANNING, &conn->flags);
-}
-
-static void hci_req_directed_advertising(struct hci_request *req,
- struct hci_conn *conn)
-{
- struct hci_dev *hdev = req->hdev;
- u8 own_addr_type;
- u8 enable;
-
- if (ext_adv_capable(hdev)) {
- struct hci_cp_le_set_ext_adv_params cp;
- bdaddr_t random_addr;
-
- /* Set require_privacy to false so that the remote device has a
- * chance of identifying us.
- */
- if (hci_get_random_address(hdev, false, conn_use_rpa(conn), NULL,
- &own_addr_type, &random_addr) < 0)
- return;
-
- memset(&cp, 0, sizeof(cp));
-
- cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_DIRECT_IND);
- cp.own_addr_type = own_addr_type;
- cp.channel_map = hdev->le_adv_channel_map;
- cp.tx_power = HCI_TX_POWER_INVALID;
- cp.primary_phy = HCI_ADV_PHY_1M;
- cp.secondary_phy = HCI_ADV_PHY_1M;
- cp.handle = 0; /* Use instance 0 for directed adv */
- cp.own_addr_type = own_addr_type;
- cp.peer_addr_type = conn->dst_type;
- bacpy(&cp.peer_addr, &conn->dst);
-
- hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp);
-
- if (own_addr_type == ADDR_LE_DEV_RANDOM &&
- bacmp(&random_addr, BDADDR_ANY) &&
- bacmp(&random_addr, &hdev->random_addr)) {
- struct hci_cp_le_set_adv_set_rand_addr cp;
-
- memset(&cp, 0, sizeof(cp));
-
- cp.handle = 0;
- bacpy(&cp.bdaddr, &random_addr);
-
- hci_req_add(req,
- HCI_OP_LE_SET_ADV_SET_RAND_ADDR,
- sizeof(cp), &cp);
- }
-
- __hci_req_enable_ext_advertising(req);
- } else {
- struct hci_cp_le_set_adv_param cp;
-
- /* Clear the HCI_LE_ADV bit temporarily so that the
- * hci_update_random_address knows that it's safe to go ahead
- * and write a new random address. The flag will be set back on
- * as soon as the SET_ADV_ENABLE HCI command completes.
- */
- hci_dev_clear_flag(hdev, HCI_LE_ADV);
-
- /* Set require_privacy to false so that the remote device has a
- * chance of identifying us.
- */
- if (hci_update_random_address(req, false, conn_use_rpa(conn),
- &own_addr_type) < 0)
- return;
-
- memset(&cp, 0, sizeof(cp));
- cp.type = LE_ADV_DIRECT_IND;
- cp.own_address_type = own_addr_type;
- cp.direct_addr_type = conn->dst_type;
- bacpy(&cp.direct_addr, &conn->dst);
- cp.channel_map = hdev->le_adv_channel_map;
+ /* If abort_reason has been set it means the connection is being
+ * aborted and the handle shall not be changed.
+ */
+ if (conn->abort_reason)
+ return conn->abort_reason;
- hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp);
+ if (HCI_CONN_HANDLE_UNSET(conn->handle))
+ ida_free(&hdev->unset_handle_ida, conn->handle);
- enable = 0x01;
- hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable),
- &enable);
- }
+ conn->handle = handle;
- conn->state = BT_CONNECT;
+ return 0;
}
struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
- u8 dst_type, u8 sec_level, u16 conn_timeout,
- u8 role, bdaddr_t *direct_rpa)
+ u8 dst_type, bool dst_resolved, u8 sec_level,
+ u16 conn_timeout, u8 role, u8 phy, u8 sec_phy)
{
- struct hci_conn_params *params;
struct hci_conn *conn;
struct smp_irk *irk;
- struct hci_request req;
int err;
/* Let's make sure that le is enabled.*/
@@ -981,93 +1399,42 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
return ERR_PTR(-EBUSY);
}
- /* When given an identity address with existing identity
- * resolving key, the connection needs to be established
- * to a resolvable random address.
- *
- * Storing the resolvable random address is required here
- * to handle connection failures. The address will later
- * be resolved back into the original identity address
- * from the connect request.
+ /* Check if the destination address has been resolved by the controller
+ * since if it did then the identity address shall be used.
*/
- irk = hci_find_irk_by_addr(hdev, dst, dst_type);
- if (irk && bacmp(&irk->rpa, BDADDR_ANY)) {
- dst = &irk->rpa;
- dst_type = ADDR_LE_DEV_RANDOM;
+ if (!dst_resolved) {
+ /* When given an identity address with existing identity
+ * resolving key, the connection needs to be established
+ * to a resolvable random address.
+ *
+ * Storing the resolvable random address is required here
+ * to handle connection failures. The address will later
+ * be resolved back into the original identity address
+ * from the connect request.
+ */
+ irk = hci_find_irk_by_addr(hdev, dst, dst_type);
+ if (irk && bacmp(&irk->rpa, BDADDR_ANY)) {
+ dst = &irk->rpa;
+ dst_type = ADDR_LE_DEV_RANDOM;
+ }
}
if (conn) {
bacpy(&conn->dst, dst);
} else {
- conn = hci_conn_add(hdev, LE_LINK, dst, role);
- if (!conn)
- return ERR_PTR(-ENOMEM);
+ conn = hci_conn_add_unset(hdev, LE_LINK, dst, dst_type, role);
+ if (IS_ERR(conn))
+ return conn;
hci_conn_hold(conn);
conn->pending_sec_level = sec_level;
}
- conn->dst_type = dst_type;
conn->sec_level = BT_SECURITY_LOW;
conn->conn_timeout = conn_timeout;
+ conn->le_adv_phy = phy;
+ conn->le_adv_sec_phy = sec_phy;
- hci_req_init(&req, hdev);
-
- /* Disable advertising if we're active. For master role
- * connections most controllers will refuse to connect if
- * advertising is enabled, and for slave role connections we
- * anyway have to disable it in order to start directed
- * advertising.
- */
- if (hci_dev_test_flag(hdev, HCI_LE_ADV)) {
- u8 enable = 0x00;
- hci_req_add(&req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable),
- &enable);
- }
-
- /* If requested to connect as slave use directed advertising */
- if (conn->role == HCI_ROLE_SLAVE) {
- /* If we're active scanning most controllers are unable
- * to initiate advertising. Simply reject the attempt.
- */
- if (hci_dev_test_flag(hdev, HCI_LE_SCAN) &&
- hdev->le_scan_type == LE_SCAN_ACTIVE) {
- hci_req_purge(&req);
- hci_conn_del(conn);
- return ERR_PTR(-EBUSY);
- }
-
- hci_req_directed_advertising(&req, conn);
- goto create_conn;
- }
-
- params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
- if (params) {
- conn->le_conn_min_interval = params->conn_min_interval;
- conn->le_conn_max_interval = params->conn_max_interval;
- conn->le_conn_latency = params->conn_latency;
- conn->le_supv_timeout = params->supervision_timeout;
- } else {
- conn->le_conn_min_interval = hdev->le_conn_min_interval;
- conn->le_conn_max_interval = hdev->le_conn_max_interval;
- conn->le_conn_latency = hdev->le_conn_latency;
- conn->le_supv_timeout = hdev->le_supv_timeout;
- }
-
- /* If controller is scanning, we stop it since some controllers are
- * not able to scan and connect at the same time. Also set the
- * HCI_LE_SCAN_INTERRUPTED flag so that the command complete
- * handler for scan disabling knows to set the correct discovery
- * state.
- */
- if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
- hci_req_add_le_scan_disable(&req);
- hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED);
- }
-
- hci_req_add_le_create_conn(&req, conn, direct_rpa);
-
-create_conn:
- err = hci_req_run(&req, create_le_conn_complete);
+ err = hci_connect_le_sync(hdev, conn);
if (err) {
hci_conn_del(conn);
return ERR_PTR(err);
@@ -1116,8 +1483,8 @@ static int hci_explicit_conn_params_set(struct hci_dev *hdev,
if (params->auto_connect == HCI_AUTO_CONN_DISABLED ||
params->auto_connect == HCI_AUTO_CONN_REPORT ||
params->auto_connect == HCI_AUTO_CONN_EXPLICIT) {
- list_del_init(&params->action);
- list_add(&params->action, &hdev->pend_le_conns);
+ hci_pend_le_list_del_init(params);
+ hci_pend_le_list_add(params, &hdev->pend_le_conns);
}
params->explicit_connect = true;
@@ -1128,10 +1495,126 @@ static int hci_explicit_conn_params_set(struct hci_dev *hdev,
return 0;
}
+/* Assign a BIG handle to @qos when the caller left it unset.
+ *
+ * Probes handles 0x00..0xee in order and takes the first one for which
+ * hci_conn_hash_lookup_big() finds no connection. A handle that is
+ * already set is left untouched.
+ *
+ * Returns 0 on success, -EADDRNOTAVAIL when every probed handle is in use.
+ */
+static int qos_set_big(struct hci_dev *hdev, struct bt_iso_qos *qos)
+{
+ struct hci_conn *conn;
+ u8 big;
+
+ /* Allocate a BIG if not set */
+ if (qos->bcast.big == BT_ISO_QOS_BIG_UNSET) {
+ for (big = 0x00; big < 0xef; big++) {
+
+ conn = hci_conn_hash_lookup_big(hdev, big);
+ if (!conn)
+ break;
+ }
+
+ /* Loop ran to completion: no free handle was found */
+ if (big == 0xef)
+ return -EADDRNOTAVAIL;
+
+ /* Update BIG */
+ qos->bcast.big = big;
+ }
+
+ return 0;
+}
+
+/* Assign a BIS value to @qos when the caller left it unset.
+ *
+ * If the BIG in @qos is set and a connection already uses it, that
+ * connection's BIS value is reused. Otherwise values 0x01 up to (but not
+ * including) hdev->le_num_of_adv_sets are probed and the first one without
+ * a matching connection is taken.
+ *
+ * Returns 0 on success, -EADDRNOTAVAIL when the probe loop ends at
+ * hdev->le_num_of_adv_sets.
+ */
+static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos)
+{
+ struct hci_conn *conn;
+ u8 bis;
+
+ /* Allocate BIS if not set */
+ if (qos->bcast.bis == BT_ISO_QOS_BIS_UNSET) {
+ if (qos->bcast.big != BT_ISO_QOS_BIG_UNSET) {
+ conn = hci_conn_hash_lookup_big(hdev, qos->bcast.big);
+
+ if (conn) {
+ /* If the BIG handle is already matched to an advertising
+ * handle, do not allocate a new one.
+ */
+ qos->bcast.bis = conn->iso_qos.bcast.bis;
+ return 0;
+ }
+ }
+
+ /* Find an unused adv set to advertise BIS, skip instance 0x00
+ * since it is reserved as general purpose set.
+ */
+ for (bis = 0x01; bis < hdev->le_num_of_adv_sets;
+ bis++) {
+
+ conn = hci_conn_hash_lookup_bis(hdev, BDADDR_ANY, bis);
+ if (!conn)
+ break;
+ }
+
+ /* NOTE(review): if le_num_of_adv_sets is 0 the loop never runs
+ * and bis stays 0x01, so this check does not trigger - confirm
+ * that value cannot occur here.
+ */
+ if (bis == hdev->le_num_of_adv_sets)
+ return -EADDRNOTAVAIL;
+
+ /* Update BIS */
+ qos->bcast.bis = bis;
+ }
+
+ return 0;
+}
+
+/* Create the hci_conn object for one BIS.
+ *
+ * Fills in unset BIG/BIS values in @qos, refuses the request when the
+ * lookups below find a conflicting connection, then adds a BIS_LINK
+ * connection with role HCI_ROLE_MASTER, puts it in BT_CONNECT state,
+ * records @sid and @timeout and takes a hold on it.
+ *
+ * Returns the new connection or an ERR_PTR(): -ECONNREFUSED or
+ * -EOPNOTSUPP when LE is not enabled, the error from qos_set_big() or
+ * qos_set_bis(), -EADDRINUSE on a conflict, or the error returned by
+ * hci_conn_add_unset().
+ *
+ * This function requires the caller holds hdev->lock
+ */
+static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst,
+ __u8 sid, struct bt_iso_qos *qos,
+ __u8 base_len, __u8 *base, u16 timeout)
+{
+ struct hci_conn *conn;
+ int err;
+
+ /* Let's make sure that le is enabled.*/
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+ if (lmp_le_capable(hdev))
+ return ERR_PTR(-ECONNREFUSED);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ err = qos_set_big(hdev, qos);
+ if (err)
+ return ERR_PTR(err);
+
+ err = qos_set_bis(hdev, qos);
+ if (err)
+ return ERR_PTR(err);
+
+ /* Check if the LE Create BIG command has already been sent.
+ * NOTE(review): qos->bcast.big is passed for both trailing arguments;
+ * the second one looks like it should be qos->bcast.bis - confirm
+ * against the definition of hci_conn_hash_lookup_per_adv_bis().
+ */
+ conn = hci_conn_hash_lookup_per_adv_bis(hdev, dst, qos->bcast.big,
+ qos->bcast.big);
+ if (conn)
+ return ERR_PTR(-EADDRINUSE);
+
+ /* Check BIS settings against other bound BISes, since all
+ * BISes in a BIG must have the same value for all parameters
+ */
+ conn = hci_conn_hash_lookup_big(hdev, qos->bcast.big);
+
+ /* @base is read for base_len bytes here, so it must point to at least
+ * that many valid bytes whenever base_len is non-zero.
+ */
+ if (conn && (memcmp(qos, &conn->iso_qos, sizeof(*qos)) ||
+ base_len != conn->le_per_adv_data_len ||
+ memcmp(conn->le_per_adv_data, base, base_len)))
+ return ERR_PTR(-EADDRINUSE);
+
+ conn = hci_conn_add_unset(hdev, BIS_LINK, dst, 0, HCI_ROLE_MASTER);
+ if (IS_ERR(conn))
+ return conn;
+
+ conn->state = BT_CONNECT;
+ conn->sid = sid;
+ conn->conn_timeout = timeout;
+
+ hci_conn_hold(conn);
+ return conn;
+}
+
/* This function requires the caller holds hdev->lock */
struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
u8 dst_type, u8 sec_level,
- u16 conn_timeout)
+ u16 conn_timeout,
+ enum conn_reasons conn_reason)
{
struct hci_conn *conn;
@@ -1161,21 +1644,24 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
BT_DBG("requesting refresh of dst_addr");
- conn = hci_conn_add(hdev, LE_LINK, dst, HCI_ROLE_MASTER);
- if (!conn)
- return ERR_PTR(-ENOMEM);
+ conn = hci_conn_add_unset(hdev, LE_LINK, dst, dst_type,
+ HCI_ROLE_MASTER);
+ if (IS_ERR(conn))
+ return conn;
- if (hci_explicit_conn_params_set(hdev, dst, dst_type) < 0)
+ if (hci_explicit_conn_params_set(hdev, dst, dst_type) < 0) {
+ hci_conn_del(conn);
return ERR_PTR(-EBUSY);
+ }
conn->state = BT_CONNECT;
set_bit(HCI_CONN_SCANNING, &conn->flags);
- conn->dst_type = dst_type;
conn->sec_level = BT_SECURITY_LOW;
conn->pending_sec_level = sec_level;
conn->conn_timeout = conn_timeout;
+ conn->conn_reason = conn_reason;
- hci_update_background_scan(hdev);
+ hci_update_passive_scan(hdev);
done:
hci_conn_hold(conn);
@@ -1183,7 +1669,8 @@ done:
}
struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
- u8 sec_level, u8 auth_type)
+ u8 sec_level, u8 auth_type,
+ enum conn_reasons conn_reason, u16 timeout)
{
struct hci_conn *acl;
@@ -1194,50 +1681,103 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst,
return ERR_PTR(-EOPNOTSUPP);
}
+ /* Reject outgoing connection to device with same BD ADDR against
+ * CVE-2020-26555
+ */
+ if (!bacmp(&hdev->bdaddr, dst)) {
+ bt_dev_dbg(hdev, "Reject connection with same BD_ADDR %pMR\n",
+ dst);
+ return ERR_PTR(-ECONNREFUSED);
+ }
+
acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst);
if (!acl) {
- acl = hci_conn_add(hdev, ACL_LINK, dst, HCI_ROLE_MASTER);
- if (!acl)
- return ERR_PTR(-ENOMEM);
+ acl = hci_conn_add_unset(hdev, ACL_LINK, dst, 0,
+ HCI_ROLE_MASTER);
+ if (IS_ERR(acl))
+ return acl;
}
hci_conn_hold(acl);
+ acl->conn_reason = conn_reason;
if (acl->state == BT_OPEN || acl->state == BT_CLOSED) {
+ int err;
+
acl->sec_level = BT_SECURITY_LOW;
acl->pending_sec_level = sec_level;
acl->auth_type = auth_type;
- hci_acl_create_connection(acl);
+ acl->conn_timeout = timeout;
+
+ err = hci_connect_acl_sync(hdev, acl);
+ if (err) {
+ hci_conn_del(acl);
+ return ERR_PTR(err);
+ }
}
return acl;
}
+/* Attach @conn to @parent through a newly allocated hci_link.
+ *
+ * If @conn already has a link, that link is returned unchanged. If @conn
+ * has a parent but no link, or the allocation fails, NULL is returned.
+ * On success the link holds @conn (hci_conn_hold), @conn references
+ * @parent (hci_conn_get) and the link is appended to parent->link_list.
+ */
+static struct hci_link *hci_conn_link(struct hci_conn *parent,
+ struct hci_conn *conn)
+{
+ struct hci_dev *hdev = parent->hdev;
+ struct hci_link *link;
+
+ bt_dev_dbg(hdev, "parent %p hcon %p", parent, conn);
+
+ if (conn->link)
+ return conn->link;
+
+ if (conn->parent)
+ return NULL;
+
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ if (!link)
+ return NULL;
+
+ link->conn = hci_conn_hold(conn);
+ conn->link = link;
+ conn->parent = hci_conn_get(parent);
+
+ /* Append to the parent's link_list with the RCU list primitive */
+ list_add_tail_rcu(&link->list, &parent->link_list);
+
+ return link;
+}
+
struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
- __u16 setting)
+ __u16 setting, struct bt_codec *codec,
+ u16 timeout)
{
struct hci_conn *acl;
struct hci_conn *sco;
+ struct hci_link *link;
- acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING);
+ acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING,
+ CONN_REASON_SCO_CONNECT, timeout);
if (IS_ERR(acl))
return acl;
sco = hci_conn_hash_lookup_ba(hdev, type, dst);
if (!sco) {
- sco = hci_conn_add(hdev, type, dst, HCI_ROLE_MASTER);
- if (!sco) {
+ sco = hci_conn_add_unset(hdev, type, dst, 0, HCI_ROLE_MASTER);
+ if (IS_ERR(sco)) {
hci_conn_drop(acl);
- return ERR_PTR(-ENOMEM);
+ return sco;
}
}
- acl->link = sco;
- sco->link = acl;
-
- hci_conn_hold(sco);
+ link = hci_conn_link(acl, sco);
+ if (!link) {
+ hci_conn_drop(acl);
+ hci_conn_drop(sco);
+ return ERR_PTR(-ENOLINK);
+ }
sco->setting = setting;
+ sco->codec = *codec;
if (acl->state == BT_CONNECTED &&
(sco->state == BT_OPEN || sco->state == BT_CLOSED)) {
@@ -1256,6 +1796,591 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst,
return sco;
}
+/* Build and send the HCI_OP_LE_CREATE_BIG command for the BIG/BIS in @qos.
+ *
+ * num_bis is taken from data.count after bis_list has been run over the
+ * BIS_LINK connections in BT_BOUND state; the remaining fields are copied
+ * from the bcast part of @qos. Returns the result of hci_send_cmd().
+ */
+static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_cp_le_create_big cp;
+ struct iso_list_data data;
+
+ memset(&cp, 0, sizeof(cp));
+
+ /* Only big, bis and count are initialised; NOTE(review): confirm
+ * bis_list reads no other member of iso_list_data.
+ */
+ data.big = qos->bcast.big;
+ data.bis = qos->bcast.bis;
+ data.count = 0;
+
+ /* Create a BIS for each bound connection */
+ hci_conn_hash_list_state(hdev, bis_list, BIS_LINK,
+ BT_BOUND, &data);
+
+ cp.handle = qos->bcast.big;
+ cp.adv_handle = qos->bcast.bis;
+ cp.num_bis = data.count;
+ hci_cpu_to_le24(qos->bcast.out.interval, cp.bis.sdu_interval);
+ cp.bis.sdu = cpu_to_le16(qos->bcast.out.sdu);
+ cp.bis.latency = cpu_to_le16(qos->bcast.out.latency);
+ cp.bis.rtn = qos->bcast.out.rtn;
+ cp.bis.phy = qos->bcast.out.phy;
+ cp.bis.packing = qos->bcast.packing;
+ cp.bis.framing = qos->bcast.framing;
+ cp.bis.encryption = qos->bcast.encryption;
+ memcpy(cp.bis.bcode, qos->bcast.bcode, sizeof(cp.bis.bcode));
+
+ return hci_send_cmd(hdev, HCI_OP_LE_CREATE_BIG, sizeof(cp), &cp);
+}
+
+/* hci_cmd_sync callback: send HCI_OP_LE_SET_CIG_PARAMS for one CIG.
+ *
+ * @data carries the CIG id as an integer packed into the pointer. The
+ * CIG-wide fields come from the connection found by
+ * hci_conn_hash_lookup_cig(); one per-CIS entry is then added for every
+ * cis_id in 0x00..0xef that has a connection in this CIG.
+ *
+ * Returns 0 without sending anything when no connection uses the CIG or no
+ * CIS entry was collected, otherwise the status of the command.
+ */
+static int set_cig_params_sync(struct hci_dev *hdev, void *data)
+{
+ /* presumably DEFINE_FLEX also presets pdu->num_cis to 0x1f, which the
+ * loop bound below relies on - TODO confirm
+ */
+ DEFINE_FLEX(struct hci_cp_le_set_cig_params, pdu, cis, num_cis, 0x1f);
+ u8 cig_id = PTR_UINT(data);
+ struct hci_conn *conn;
+ struct bt_iso_qos *qos;
+ u8 aux_num_cis = 0;
+ u8 cis_id;
+
+ conn = hci_conn_hash_lookup_cig(hdev, cig_id);
+ if (!conn)
+ return 0;
+
+ qos = &conn->iso_qos;
+ pdu->cig_id = cig_id;
+ hci_cpu_to_le24(qos->ucast.out.interval, pdu->c_interval);
+ hci_cpu_to_le24(qos->ucast.in.interval, pdu->p_interval);
+ pdu->sca = qos->ucast.sca;
+ pdu->packing = qos->ucast.packing;
+ pdu->framing = qos->ucast.framing;
+ pdu->c_latency = cpu_to_le16(qos->ucast.out.latency);
+ pdu->p_latency = cpu_to_le16(qos->ucast.in.latency);
+
+ /* Reprogram all CIS(s) with the same CIG, valid range are:
+ * num_cis: 0x00 to 0x1F
+ * cis_id: 0x00 to 0xEF
+ */
+ for (cis_id = 0x00; cis_id < 0xf0 &&
+ aux_num_cis < pdu->num_cis; cis_id++) {
+ struct hci_cis_params *cis;
+
+ conn = hci_conn_hash_lookup_cis(hdev, NULL, 0, cig_id, cis_id);
+ if (!conn)
+ continue;
+
+ qos = &conn->iso_qos;
+
+ cis = &pdu->cis[aux_num_cis++];
+ cis->cis_id = cis_id;
+ cis->c_sdu = cpu_to_le16(conn->iso_qos.ucast.out.sdu);
+ cis->p_sdu = cpu_to_le16(conn->iso_qos.ucast.in.sdu);
+ /* A PHY of 0 in one direction falls back to the other direction */
+ cis->c_phy = qos->ucast.out.phy ? qos->ucast.out.phy :
+ qos->ucast.in.phy;
+ cis->p_phy = qos->ucast.in.phy ? qos->ucast.in.phy :
+ qos->ucast.out.phy;
+ cis->c_rtn = qos->ucast.out.rtn;
+ cis->p_rtn = qos->ucast.in.rtn;
+ }
+ /* Shrink the count to the number of entries actually filled in */
+ pdu->num_cis = aux_num_cis;
+
+ if (!pdu->num_cis)
+ return 0;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_CIG_PARAMS,
+ struct_size(pdu, cis, pdu->num_cis),
+ pdu, HCI_CMD_TIMEOUT);
+}
+
+/* Pick CIG/CIS ids for @qos where unset and queue set_cig_params_sync().
+ *
+ * An unset CIG is replaced by the first id in 0x00..0xef for which the
+ * find_cis walks count no CIS_LINK connection in BT_CONNECT or
+ * BT_CONNECTED state. A CIS id supplied by the caller must not already
+ * have a connection in that CIG; an unset one is replaced by the first
+ * id in 0x00..0xef without a connection.
+ *
+ * @conn is only used to reach the hci_dev. Returns true when the sync
+ * work was queued, false when no id is available, the requested CIS is
+ * taken, or queuing failed.
+ */
+static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct iso_list_data data;
+
+ memset(&data, 0, sizeof(data));
+
+ /* Allocate first still reconfigurable CIG if not set */
+ if (qos->ucast.cig == BT_ISO_QOS_CIG_UNSET) {
+ for (data.cig = 0x00; data.cig < 0xf0; data.cig++) {
+ data.count = 0;
+
+ hci_conn_hash_list_state(hdev, find_cis, CIS_LINK,
+ BT_CONNECT, &data);
+ if (data.count)
+ continue;
+
+ hci_conn_hash_list_state(hdev, find_cis, CIS_LINK,
+ BT_CONNECTED, &data);
+ if (!data.count)
+ break;
+ }
+
+ if (data.cig == 0xf0)
+ return false;
+
+ /* Update CIG */
+ qos->ucast.cig = data.cig;
+ }
+
+ if (qos->ucast.cis != BT_ISO_QOS_CIS_UNSET) {
+ if (hci_conn_hash_lookup_cis(hdev, NULL, 0, qos->ucast.cig,
+ qos->ucast.cis))
+ return false;
+ goto done;
+ }
+
+ /* Allocate first available CIS if not set */
+ for (data.cig = qos->ucast.cig, data.cis = 0x00; data.cis < 0xf0;
+ data.cis++) {
+ if (!hci_conn_hash_lookup_cis(hdev, NULL, 0, data.cig,
+ data.cis)) {
+ /* Update CIS */
+ qos->ucast.cis = data.cis;
+ break;
+ }
+ }
+
+ /* Still unset means the loop above found every id in use */
+ if (qos->ucast.cis == BT_ISO_QOS_CIS_UNSET)
+ return false;
+
+done:
+ if (hci_cmd_sync_queue(hdev, set_cig_params_sync,
+ UINT_PTR(qos->ucast.cig), NULL) < 0)
+ return false;
+
+ return true;
+}
+
+/* Find or create the CIS connection for @dst and bind @qos to it.
+ *
+ * A connection that is already BT_CONNECTED, or BT_BOUND with identical
+ * QoS, is returned as is. Otherwise zero interval/latency values in @qos
+ * are filled in from the opposite direction, CIG parameters are queued
+ * through hci_le_set_cig_params(), a hold is taken and the connection
+ * moves to BT_BOUND with a copy of @qos.
+ *
+ * Returns the connection, the ERR_PTR() from hci_conn_add_unset(), or
+ * ERR_PTR(-EINVAL) when hci_le_set_cig_params() fails. @qos may be
+ * modified.
+ */
+struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
+ __u8 dst_type, struct bt_iso_qos *qos,
+ u16 timeout)
+{
+ struct hci_conn *cis;
+
+ cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type, qos->ucast.cig,
+ qos->ucast.cis);
+ if (!cis) {
+ cis = hci_conn_add_unset(hdev, CIS_LINK, dst, dst_type,
+ HCI_ROLE_MASTER);
+ if (IS_ERR(cis))
+ return cis;
+ cis->cleanup = cis_cleanup;
+ cis->dst_type = dst_type;
+ cis->iso_qos.ucast.cig = BT_ISO_QOS_CIG_UNSET;
+ cis->iso_qos.ucast.cis = BT_ISO_QOS_CIS_UNSET;
+ cis->conn_timeout = timeout;
+ }
+
+ if (cis->state == BT_CONNECTED)
+ return cis;
+
+ /* Check if CIS has been set and the settings matches */
+ if (cis->state == BT_BOUND &&
+ !memcmp(&cis->iso_qos, qos, sizeof(*qos)))
+ return cis;
+
+ /* Update LINK PHYs according to QoS preference */
+ cis->le_tx_phy = qos->ucast.out.phy;
+ cis->le_rx_phy = qos->ucast.in.phy;
+
+ /* If output interval is not set use the input interval as it cannot be
+ * 0x000000.
+ */
+ if (!qos->ucast.out.interval)
+ qos->ucast.out.interval = qos->ucast.in.interval;
+
+ /* If input interval is not set use the output interval as it cannot be
+ * 0x000000.
+ */
+ if (!qos->ucast.in.interval)
+ qos->ucast.in.interval = qos->ucast.out.interval;
+
+ /* If output latency is not set use the input latency as it cannot be
+ * 0x0000.
+ */
+ if (!qos->ucast.out.latency)
+ qos->ucast.out.latency = qos->ucast.in.latency;
+
+ /* If input latency is not set use the output latency as it cannot be
+ * 0x0000.
+ */
+ if (!qos->ucast.in.latency)
+ qos->ucast.in.latency = qos->ucast.out.latency;
+
+ /* NOTE(review): the drop below runs before this function has taken
+ * its own hold (taken just after) - confirm the reference balance.
+ */
+ if (!hci_le_set_cig_params(cis, qos)) {
+ hci_conn_drop(cis);
+ return ERR_PTR(-EINVAL);
+ }
+
+ hci_conn_hold(cis);
+
+ cis->iso_qos = *qos;
+ cis->state = BT_BOUND;
+
+ return cis;
+}
+
+/* Send HCI_OP_LE_SETUP_ISO_PATH for each direction of @conn that has a
+ * non-zero SDU size in its unicast QoS (direction 0x00 for out, 0x01 for
+ * in).
+ *
+ * Returns false as soon as hci_send_cmd() reports an error, true
+ * otherwise - including when neither direction is configured.
+ */
+bool hci_iso_setup_path(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_cp_le_setup_iso_path cmd;
+
+ memset(&cmd, 0, sizeof(cmd));
+
+ if (conn->iso_qos.ucast.out.sdu) {
+ cmd.handle = cpu_to_le16(conn->handle);
+ cmd.direction = 0x00; /* Input (Host to Controller) */
+ cmd.path = 0x00; /* HCI path if enabled */
+ cmd.codec = 0x03; /* Transparent Data */
+
+ if (hci_send_cmd(hdev, HCI_OP_LE_SETUP_ISO_PATH, sizeof(cmd),
+ &cmd) < 0)
+ return false;
+ }
+
+ if (conn->iso_qos.ucast.in.sdu) {
+ cmd.handle = cpu_to_le16(conn->handle);
+ cmd.direction = 0x01; /* Output (Controller to Host) */
+ cmd.path = 0x00; /* HCI path if enabled */
+ cmd.codec = 0x03; /* Transparent Data */
+
+ if (hci_send_cmd(hdev, HCI_OP_LE_SETUP_ISO_PATH, sizeof(cmd),
+ &cmd) < 0)
+ return false;
+ }
+
+ return true;
+}
+
+/* Tell whether @conn is a CIS that is ready for Create CIS.
+ *
+ * Returns -EINVAL when @conn is not a CIS_LINK, 1 when it is a CIS that is
+ * not ready (no parent, parent not BT_CONNECTED, state other than
+ * BT_CONNECT, or handle still unset) and 0 when it is ready.
+ */
+int hci_conn_check_create_cis(struct hci_conn *conn)
+{
+ if (conn->type != CIS_LINK)
+ return -EINVAL;
+
+ if (!conn->parent || conn->parent->state != BT_CONNECTED ||
+ conn->state != BT_CONNECT || HCI_CONN_HANDLE_UNSET(conn->handle))
+ return 1;
+
+ return 0;
+}
+
+/* hci_cmd_sync callback adapter: @data is unused, the work is done by
+ * hci_le_create_cis_sync().
+ */
+static int hci_create_cis_sync(struct hci_dev *hdev, void *data)
+{
+ return hci_le_create_cis_sync(hdev);
+}
+
+/* Queue a Create CIS request if any connection is ready for it.
+ *
+ * Walks hdev->conn_hash.list under rcu_read_lock(). Returns -EBUSY if a
+ * connection already has HCI_CONN_CREATE_CIS set, 0 if no connection
+ * passes hci_conn_check_create_cis(), otherwise the result of
+ * hci_cmd_sync_queue().
+ */
+int hci_le_create_cis_pending(struct hci_dev *hdev)
+{
+ struct hci_conn *conn;
+ bool pending = false;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
+ if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) {
+ rcu_read_unlock();
+ return -EBUSY;
+ }
+
+ /* 0 means "CIS ready"; -EINVAL and 1 both count as not ready */
+ if (!hci_conn_check_create_cis(conn))
+ pending = true;
+ }
+
+ rcu_read_unlock();
+
+ if (!pending)
+ return 0;
+
+ /* Queue Create CIS */
+ return hci_cmd_sync_queue(hdev, hci_create_cis_sync, NULL, NULL);
+}
+
+/* Fill in unset fields of one ISO direction (@qos) from @conn and @phy.
+ *
+ * Each field is only written when it is currently zero (or, for the PHY,
+ * equal to BT_ISO_PHY_ANY). @hdev is not used by this function.
+ */
+static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn,
+ struct bt_iso_io_qos *qos, __u8 phy)
+{
+ /* Only set MTU if PHY is enabled */
+ if (!qos->sdu && qos->phy)
+ qos->sdu = conn->mtu;
+
+ /* Use the same PHY as ACL if set to any */
+ if (qos->phy == BT_ISO_PHY_ANY)
+ qos->phy = phy;
+
+ /* Use LE ACL connection interval if not set */
+ if (!qos->interval)
+ /* ACL interval unit in 1.25 ms to us */
+ qos->interval = conn->le_conn_interval * 1250;
+
+ /* Use LE ACL connection latency if not set */
+ if (!qos->latency)
+ qos->latency = conn->le_conn_latency;
+}
+
+/* hci_cmd_sync callback: start periodic advertising for the connection
+ * passed in @data and then issue LE Create BIG for it.
+ *
+ * Returns the error from hci_start_per_adv_sync() if that fails, otherwise
+ * the result of hci_le_create_big().
+ */
+static int create_big_sync(struct hci_dev *hdev, void *data)
+{
+ struct hci_conn *conn = data;
+ struct bt_iso_qos *qos = &conn->iso_qos;
+ u16 interval, sync_interval = 0;
+ u32 flags = 0;
+ int err;
+
+ if (qos->bcast.out.phy == 0x02)
+ flags |= MGMT_ADV_FLAG_SEC_2M;
+
+ /* Align intervals.
+ * NOTE(review): the results are stored in u16 variables; confirm that
+ * neither this product nor interval * 4 below can exceed 0xffff.
+ */
+ interval = (qos->bcast.out.interval / 1250) * qos->bcast.sync_factor;
+
+ if (qos->bcast.bis)
+ sync_interval = interval * 4;
+
+ err = hci_start_per_adv_sync(hdev, qos->bcast.bis, conn->sid,
+ conn->le_per_adv_data_len,
+ conn->le_per_adv_data, flags, interval,
+ interval, sync_interval);
+ if (err)
+ return err;
+
+ return hci_le_create_big(conn, &conn->iso_qos);
+}
+
+/* Create a PA_LINK connection towards @dst and start PA sync for it.
+ *
+ * The new connection (role HCI_ROLE_SLAVE) stores a copy of @qos and
+ * @sid, enters BT_LISTEN, gets its timeout from qos->bcast.sync_timeout
+ * (multiplied by 10 and converted with msecs_to_jiffies) and is held
+ * before hci_connect_pa_sync() is called.
+ *
+ * Returns the connection or the ERR_PTR() from hci_conn_add_unset(). The
+ * return value of hci_connect_pa_sync() is not checked.
+ */
+struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst,
+ __u8 dst_type, __u8 sid,
+ struct bt_iso_qos *qos)
+{
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "dst %pMR type %d sid %d", dst, dst_type, sid);
+
+ conn = hci_conn_add_unset(hdev, PA_LINK, dst, dst_type, HCI_ROLE_SLAVE);
+ if (IS_ERR(conn))
+ return conn;
+
+ conn->iso_qos = *qos;
+ conn->sid = sid;
+ conn->state = BT_LISTEN;
+ conn->conn_timeout = msecs_to_jiffies(qos->bcast.sync_timeout * 10);
+
+ hci_conn_hold(conn);
+
+ hci_connect_pa_sync(hdev, conn);
+
+ return conn;
+}
+
+/* Prepare @hcon for BIG sync and hand it to hci_connect_big_sync().
+ *
+ * @num_bis must be between 0x01 and ISO_MAX_NUM_BIS, else -EINVAL is
+ * returned. An unset BIG handle in @qos is allocated through
+ * qos_set_big(). When @hcon is non-NULL it receives a copy of @qos, the
+ * first @num_bis bytes of @bis and a timeout derived from
+ * qos->bcast.timeout. @sync_handle is not used by this function.
+ *
+ * Returns the error from qos_set_big() or the result of
+ * hci_connect_big_sync(), which is called even when @hcon is NULL.
+ */
+int hci_conn_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon,
+ struct bt_iso_qos *qos, __u16 sync_handle,
+ __u8 num_bis, __u8 bis[])
+{
+ int err;
+
+ if (num_bis < 0x01 || num_bis > ISO_MAX_NUM_BIS)
+ return -EINVAL;
+
+ err = qos_set_big(hdev, qos);
+ if (err)
+ return err;
+
+ if (hcon) {
+ /* Update hcon QoS */
+ hcon->iso_qos = *qos;
+
+ hcon->num_bis = num_bis;
+ memcpy(hcon->bis, bis, num_bis);
+ hcon->conn_timeout = msecs_to_jiffies(qos->bcast.timeout * 10);
+ }
+
+ return hci_connect_big_sync(hdev, hcon);
+}
+
+/* Completion callback paired with create_big_sync(): on a non-zero @err
+ * the failure is logged, reported through hci_connect_cfm() and the
+ * connection passed in @data is deleted. Nothing is done on success.
+ */
+static void create_big_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct hci_conn *conn = data;
+
+ bt_dev_dbg(hdev, "conn %p", conn);
+
+ if (err) {
+ bt_dev_err(hdev, "Unable to create BIG: %d", err);
+ hci_connect_cfm(conn, err);
+ hci_conn_del(conn);
+ }
+}
+
+/* Bind (or rebind) a BIS connection for the BIG described by @qos.
+ *
+ * If a connection of that BIG is in BT_OPEN state with role
+ * HCI_ROLE_MASTER it is reused: @qos is overwritten with its QoS and it is
+ * moved to BT_CONNECTED. Otherwise a new connection is created through
+ * hci_add_bis(), the optional @base data is stored as periodic advertising
+ * data, unset QoS fields are filled in, the connection moves to BT_BOUND
+ * and, when another connection already represents the BIG, it is linked
+ * to that parent.
+ *
+ * Returns the connection, the ERR_PTR() from hci_add_bis(), or
+ * ERR_PTR(-ENOLINK) when linking to the parent fails. @qos may be
+ * modified.
+ */
+struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid,
+ struct bt_iso_qos *qos,
+ __u8 base_len, __u8 *base, u16 timeout)
+{
+ struct hci_conn *conn;
+ struct hci_conn *parent;
+ /* Zero-filled: hci_add_bis() compares base_len bytes of this buffer
+ * and the whole buffer is copied into the connection below, so no
+ * uninitialised stack bytes may be left in it.
+ */
+ __u8 eir[HCI_MAX_PER_AD_LENGTH] = { 0 };
+ struct hci_link *link;
+
+ /* Look for any BIS that is open for rebinding */
+ conn = hci_conn_hash_lookup_big_state(hdev, qos->bcast.big, BT_OPEN,
+ HCI_ROLE_MASTER);
+ if (conn) {
+ memcpy(qos, &conn->iso_qos, sizeof(*qos));
+ conn->state = BT_CONNECTED;
+ return conn;
+ }
+
+ /* base_len is replaced by the value eir_append_service_data() returns */
+ if (base_len && base)
+ base_len = eir_append_service_data(eir, 0, 0x1851,
+ base, base_len);
+
+ /* We need hci_conn object using the BDADDR_ANY as dst */
+ conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir, timeout);
+ if (IS_ERR(conn))
+ return conn;
+
+ /* Update LINK PHY according to QoS preference.
+ * NOTE(review): this assignment used to appear twice in a row; the
+ * second copy may have been meant for le_rx_phy - confirm.
+ */
+ conn->le_tx_phy = qos->bcast.out.phy;
+
+ /* Add Basic Announcement into Periodic Adv Data if BASE is set */
+ if (base_len && base) {
+ memcpy(conn->le_per_adv_data, eir, sizeof(eir));
+ conn->le_per_adv_data_len = base_len;
+ }
+
+ hci_iso_qos_setup(hdev, conn, &qos->bcast.out,
+ conn->le_tx_phy ? conn->le_tx_phy :
+ hdev->le_tx_def_phys);
+
+ conn->iso_qos = *qos;
+ conn->state = BT_BOUND;
+
+ /* Link BISes together */
+ parent = hci_conn_hash_lookup_big(hdev,
+ conn->iso_qos.bcast.big);
+ if (parent && parent != conn) {
+ link = hci_conn_link(parent, conn);
+ /* The link (if created) now holds conn; release our own hold */
+ hci_conn_drop(conn);
+ if (!link)
+ return ERR_PTR(-ENOLINK);
+ }
+
+ return conn;
+}
+
+/* Look up the LE connection to @dst/@dst_type on conn->hdev and pass it,
+ * together with @conn, to hci_past_sync().
+ *
+ * Returns -EINVAL when no such LE connection exists, otherwise the result
+ * of hci_past_sync().
+ */
+int hci_past_bis(struct hci_conn *conn, bdaddr_t *dst, __u8 dst_type)
+{
+ struct hci_conn *le;
+
+ /* Lookup existing LE connection to rebind to */
+ le = hci_conn_hash_lookup_le(conn->hdev, dst, dst_type);
+ if (!le)
+ return -EINVAL;
+
+ return hci_past_sync(conn, le);
+}
+
+/* hci_conn_hash_list_state() callback: set HCI_CONN_PER_ADV on @conn when
+ * its destination is BDADDR_ANY and its BIG/BIS match the iso_list_data in
+ * @data. Nothing is marked when the requested BIS is
+ * BT_ISO_QOS_BIS_UNSET.
+ */
+static void bis_mark_per_adv(struct hci_conn *conn, void *data)
+{
+ struct iso_list_data *d = data;
+
+ /* Skip if not broadcast/ANY address */
+ if (bacmp(&conn->dst, BDADDR_ANY))
+ return;
+
+ if (d->big != conn->iso_qos.bcast.big ||
+ d->bis == BT_ISO_QOS_BIS_UNSET ||
+ d->bis != conn->iso_qos.bcast.bis)
+ return;
+
+ set_bit(HCI_CONN_PER_ADV, &conn->flags);
+}
+
+/* Bind a BIS through hci_bind_bis() and queue the commands that start
+ * periodic advertising and create the BIG.
+ *
+ * A connection that hci_bind_bis() returned in BT_CONNECTED state is
+ * handed back directly. Otherwise an invalid SID is replaced by the first
+ * value in 0x00..0x0f for which hci_find_adv_sid() returns nothing, every
+ * bound BIS of the same BIG/BIS is flagged with HCI_CONN_PER_ADV and
+ * create_big_sync() is queued.
+ *
+ * Returns the connection, the ERR_PTR() from hci_bind_bis(), or the
+ * ERR_PTR() of the queuing error (after dropping the connection).
+ * @dst_type is not used by this function.
+ */
+struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
+ __u8 dst_type, __u8 sid,
+ struct bt_iso_qos *qos,
+ __u8 base_len, __u8 *base, u16 timeout)
+{
+ struct hci_conn *conn;
+ int err;
+ struct iso_list_data data;
+
+ conn = hci_bind_bis(hdev, dst, sid, qos, base_len, base, timeout);
+ if (IS_ERR(conn))
+ return conn;
+
+ if (conn->state == BT_CONNECTED)
+ return conn;
+
+ /* Check if SID needs to be allocated then search for the first
+ * available.
+ * NOTE(review): if all 16 values are taken conn->sid stays
+ * HCI_SID_INVALID and execution continues - confirm this is handled
+ * later.
+ */
+ if (conn->sid == HCI_SID_INVALID) {
+ u8 sid;
+
+ for (sid = 0; sid <= 0x0f; sid++) {
+ if (!hci_find_adv_sid(hdev, sid)) {
+ conn->sid = sid;
+ break;
+ }
+ }
+ }
+
+ /* Only big and bis are set; bis_mark_per_adv reads nothing else */
+ data.big = qos->bcast.big;
+ data.bis = qos->bcast.bis;
+
+ /* Set HCI_CONN_PER_ADV for all bound connections, to mark that
+ * the start periodic advertising and create BIG commands have
+ * been queued
+ */
+ hci_conn_hash_list_state(hdev, bis_mark_per_adv, BIS_LINK,
+ BT_BOUND, &data);
+
+ /* Queue start periodic advertising and create BIG */
+ err = hci_cmd_sync_queue(hdev, create_big_sync, conn,
+ create_big_complete);
+ if (err < 0) {
+ hci_conn_drop(conn);
+ return ERR_PTR(err);
+ }
+
+ return conn;
+}
+
+/* Set up a CIS to @dst on top of an LE connection.
+ *
+ * The LE connection is requested with hci_connect_le() in role
+ * HCI_ROLE_SLAVE when HCI_ADVERTISING is set on @hdev, otherwise with
+ * hci_connect_le_scan(). Unset fields of both QoS directions are then
+ * filled in from that connection, the CIS is bound with hci_bind_cis(),
+ * linked to the LE connection, moved to BT_CONNECT, and
+ * hci_le_create_cis_pending() is called (its return value is not
+ * checked).
+ *
+ * Returns the CIS connection, the ERR_PTR() from the LE connect or bind
+ * step, or ERR_PTR(-ENOLINK) when linking fails.
+ */
+struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
+ __u8 dst_type, struct bt_iso_qos *qos,
+ u16 timeout)
+{
+ struct hci_conn *le;
+ struct hci_conn *cis;
+ struct hci_link *link;
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ le = hci_connect_le(hdev, dst, dst_type, false,
+ BT_SECURITY_LOW,
+ HCI_LE_CONN_TIMEOUT,
+ HCI_ROLE_SLAVE, 0, 0);
+ else
+ le = hci_connect_le_scan(hdev, dst, dst_type,
+ BT_SECURITY_LOW,
+ HCI_LE_CONN_TIMEOUT,
+ CONN_REASON_ISO_CONNECT);
+ if (IS_ERR(le))
+ return le;
+
+ hci_iso_qos_setup(hdev, le, &qos->ucast.out,
+ le->le_tx_phy ? le->le_tx_phy : hdev->le_tx_def_phys);
+ hci_iso_qos_setup(hdev, le, &qos->ucast.in,
+ le->le_rx_phy ? le->le_rx_phy : hdev->le_rx_def_phys);
+
+ cis = hci_bind_cis(hdev, dst, dst_type, qos, timeout);
+ if (IS_ERR(cis)) {
+ hci_conn_drop(le);
+ return cis;
+ }
+
+ link = hci_conn_link(le, cis);
+ /* The link (if created) now holds cis; release the bind-time hold */
+ hci_conn_drop(cis);
+ if (!link) {
+ hci_conn_drop(le);
+ return ERR_PTR(-ENOLINK);
+ }
+
+ cis->state = BT_CONNECT;
+
+ hci_le_create_cis_pending(hdev);
+
+ return cis;
+}
+
/* Check link security requirement */
int hci_conn_check_link_mode(struct hci_conn *conn)
{
@@ -1272,6 +2397,23 @@ int hci_conn_check_link_mode(struct hci_conn *conn)
return 0;
}
+ /* AES encryption is required for Level 4:
+ *
+ * BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 3, Part C
+ * page 1319:
+ *
+ * 128-bit equivalent strength for link and encryption keys
+ * required using FIPS approved algorithms (E0 not allowed,
+ * SAFER+ not allowed, and P-192 not allowed; encryption key
+ * not shortened)
+ */
+ if (conn->sec_level == BT_SECURITY_FIPS &&
+ !test_bit(HCI_CONN_AES_CCM, &conn->flags)) {
+ bt_dev_err(conn->hdev,
+ "Invalid security: Missing AES-CCM usage");
+ return 0;
+ }
+
if (hci_conn_ssp_enabled(conn) &&
!test_bit(HCI_CONN_ENCRYPT, &conn->flags))
return 0;
@@ -1304,19 +2446,17 @@ static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type)
hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED,
sizeof(cp), &cp);
- /* If we're already encrypted set the REAUTH_PEND flag,
- * otherwise set the ENCRYPT_PEND.
+ /* Set the ENCRYPT_PEND to trigger encryption after
+ * authentication.
*/
- if (test_bit(HCI_CONN_ENCRYPT, &conn->flags))
- set_bit(HCI_CONN_REAUTH_PEND, &conn->flags);
- else
+ if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags))
set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
}
return 0;
}
-/* Encrypt the the link */
+/* Encrypt the link */
static void hci_conn_encrypt(struct hci_conn *conn)
{
BT_DBG("hcon %p", conn);
@@ -1352,34 +2492,41 @@ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type,
if (!test_bit(HCI_CONN_AUTH, &conn->flags))
goto auth;
- /* An authenticated FIPS approved combination key has sufficient
- * security for security level 4. */
- if (conn->key_type == HCI_LK_AUTH_COMBINATION_P256 &&
- sec_level == BT_SECURITY_FIPS)
- goto encrypt;
-
- /* An authenticated combination key has sufficient security for
- security level 3. */
- if ((conn->key_type == HCI_LK_AUTH_COMBINATION_P192 ||
- conn->key_type == HCI_LK_AUTH_COMBINATION_P256) &&
- sec_level == BT_SECURITY_HIGH)
- goto encrypt;
-
- /* An unauthenticated combination key has sufficient security for
- security level 1 and 2. */
- if ((conn->key_type == HCI_LK_UNAUTH_COMBINATION_P192 ||
- conn->key_type == HCI_LK_UNAUTH_COMBINATION_P256) &&
- (sec_level == BT_SECURITY_MEDIUM || sec_level == BT_SECURITY_LOW))
- goto encrypt;
-
- /* A combination key has always sufficient security for the security
- levels 1 or 2. High security level requires the combination key
- is generated using maximum PIN code length (16).
- For pre 2.1 units. */
- if (conn->key_type == HCI_LK_COMBINATION &&
- (sec_level == BT_SECURITY_MEDIUM || sec_level == BT_SECURITY_LOW ||
- conn->pin_length == 16))
- goto encrypt;
+ switch (conn->key_type) {
+ case HCI_LK_AUTH_COMBINATION_P256:
+ /* An authenticated FIPS approved combination key has
+ * sufficient security for security level 4 or lower.
+ */
+ if (sec_level <= BT_SECURITY_FIPS)
+ goto encrypt;
+ break;
+ case HCI_LK_AUTH_COMBINATION_P192:
+ /* An authenticated combination key has sufficient security for
+ * security level 3 or lower.
+ */
+ if (sec_level <= BT_SECURITY_HIGH)
+ goto encrypt;
+ break;
+ case HCI_LK_UNAUTH_COMBINATION_P192:
+ case HCI_LK_UNAUTH_COMBINATION_P256:
+ /* An unauthenticated combination key has sufficient security
+ * for security level 2 or lower.
+ */
+ if (sec_level <= BT_SECURITY_MEDIUM)
+ goto encrypt;
+ break;
+ case HCI_LK_COMBINATION:
+ /* A combination key has always sufficient security for the
+ * security levels 2 or lower. High security level requires the
+ * combination key is generated using maximum PIN code length
+ * (16). For pre 2.1 units.
+ */
+ if (sec_level <= BT_SECURITY_MEDIUM || conn->pin_length == 16)
+ goto encrypt;
+ break;
+ default:
+ break;
+ }
auth:
if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags))
@@ -1392,8 +2539,16 @@ auth:
return 0;
encrypt:
- if (test_bit(HCI_CONN_ENCRYPT, &conn->flags))
+ if (test_bit(HCI_CONN_ENCRYPT, &conn->flags)) {
+ /* Ensure that the encryption key size has been read,
+ * otherwise stall the upper layer responses.
+ */
+ if (!conn->enc_key_size)
+ return 0;
+
+ /* Nothing else needed, all requirements are met */
return 1;
+ }
hci_conn_encrypt(conn);
return 0;
@@ -1466,33 +2621,22 @@ timer:
/* Drop all connection on the device */
void hci_conn_hash_flush(struct hci_dev *hdev)
{
- struct hci_conn_hash *h = &hdev->conn_hash;
- struct hci_conn *c, *n;
-
- BT_DBG("hdev %s", hdev->name);
-
- list_for_each_entry_safe(c, n, &h->list, list) {
- c->state = BT_CLOSED;
-
- hci_disconn_cfm(c, HCI_ERROR_LOCAL_HOST_TERM);
- hci_conn_del(c);
- }
-}
-
-/* Check pending connect attempts */
-void hci_conn_check_pending(struct hci_dev *hdev)
-{
+ struct list_head *head = &hdev->conn_hash.list;
struct hci_conn *conn;
BT_DBG("hdev %s", hdev->name);
- hci_dev_lock(hdev);
-
- conn = hci_conn_hash_lookup_state(hdev, ACL_LINK, BT_CONNECT2);
- if (conn)
- hci_acl_create_connection(conn);
-
- hci_dev_unlock(hdev);
+ /* We should not traverse the list here, because hci_conn_del
+ * can remove extra links, which may cause the list traversal
+ * to hit items that have already been released.
+ */
+ while ((conn = list_first_entry_or_null(head,
+ struct hci_conn,
+ list)) != NULL) {
+ conn->state = BT_CLOSED;
+ hci_disconn_cfm(conn, HCI_ERROR_LOCAL_HOST_TERM);
+ hci_conn_del(conn);
+ }
}
static u32 get_link_mode(struct hci_conn *conn)
@@ -1704,3 +2848,340 @@ struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle)
return hchan;
}
+/* Return the BT_PHY_* bitmask for conn, derived from its type, pkt_type and LE PHYs. */
+u32 hci_conn_get_phy(struct hci_conn *conn)
+{
+ u32 phys = 0;
+
+ /* BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 2, Part B page 471:
+ * Table 6.2: Packets defined for synchronous, asynchronous, and
+ * CPB logical transport types.
+ */
+ switch (conn->type) {
+ case SCO_LINK:
+ /* SCO logical transport (1 Mb/s):
+ * HV1, HV2, HV3 and DV.
+ */
+ phys |= BT_PHY_BR_1M_1SLOT;
+
+ break;
+
+ case ACL_LINK:
+ /* ACL logical transport (1 Mb/s) ptt=0:
+ * DH1, DM3, DH3, DM5 and DH5.
+ */
+ phys |= BT_PHY_BR_1M_1SLOT;
+
+ if (conn->pkt_type & (HCI_DM3 | HCI_DH3))
+ phys |= BT_PHY_BR_1M_3SLOT;
+
+ if (conn->pkt_type & (HCI_DM5 | HCI_DH5))
+ phys |= BT_PHY_BR_1M_5SLOT;
+
+ /* ACL logical transport (2 Mb/s) ptt=1:
+ * 2-DH1, 2-DH3 and 2-DH5. Each is reported when its pkt_type bit is clear.
+ */
+ if (!(conn->pkt_type & HCI_2DH1))
+ phys |= BT_PHY_EDR_2M_1SLOT;
+
+ if (!(conn->pkt_type & HCI_2DH3))
+ phys |= BT_PHY_EDR_2M_3SLOT;
+
+ if (!(conn->pkt_type & HCI_2DH5))
+ phys |= BT_PHY_EDR_2M_5SLOT;
+
+ /* ACL logical transport (3 Mb/s) ptt=1:
+ * 3-DH1, 3-DH3 and 3-DH5. Each is reported when its pkt_type bit is clear.
+ */
+ if (!(conn->pkt_type & HCI_3DH1))
+ phys |= BT_PHY_EDR_3M_1SLOT;
+
+ if (!(conn->pkt_type & HCI_3DH3))
+ phys |= BT_PHY_EDR_3M_3SLOT;
+
+ if (!(conn->pkt_type & HCI_3DH5))
+ phys |= BT_PHY_EDR_3M_5SLOT;
+
+ break;
+
+ case ESCO_LINK:
+ /* eSCO logical transport (1 Mb/s): EV3, EV4 and EV5 */
+ phys |= BT_PHY_BR_1M_1SLOT;
+
+ if (!(conn->pkt_type & (ESCO_EV4 | ESCO_EV5)))
+ phys |= BT_PHY_BR_1M_3SLOT;
+
+ /* eSCO logical transport (2 Mb/s): 2-EV3, 2-EV5 */
+ if (!(conn->pkt_type & ESCO_2EV3))
+ phys |= BT_PHY_EDR_2M_1SLOT;
+
+ if (!(conn->pkt_type & ESCO_2EV5))
+ phys |= BT_PHY_EDR_2M_3SLOT;
+
+ /* eSCO logical transport (3 Mb/s): 3-EV3, 3-EV5 */
+ if (!(conn->pkt_type & ESCO_3EV3))
+ phys |= BT_PHY_EDR_3M_1SLOT;
+
+ if (!(conn->pkt_type & ESCO_3EV5))
+ phys |= BT_PHY_EDR_3M_3SLOT;
+
+ break;
+
+ case LE_LINK:
+ if (conn->le_tx_phy & HCI_LE_SET_PHY_1M)
+ phys |= BT_PHY_LE_1M_TX;
+
+ if (conn->le_rx_phy & HCI_LE_SET_PHY_1M)
+ phys |= BT_PHY_LE_1M_RX;
+
+ if (conn->le_tx_phy & HCI_LE_SET_PHY_2M)
+ phys |= BT_PHY_LE_2M_TX;
+
+ if (conn->le_rx_phy & HCI_LE_SET_PHY_2M)
+ phys |= BT_PHY_LE_2M_RX;
+
+ if (conn->le_tx_phy & HCI_LE_SET_PHY_CODED)
+ phys |= BT_PHY_LE_CODED_TX;
+
+ if (conn->le_rx_phy & HCI_LE_SET_PHY_CODED)
+ phys |= BT_PHY_LE_CODED_RX;
+
+ break;
+ }
+ /* Any other link type reports no PHYs: phys is still 0 */
+ return phys;
+}
+/* cmd_sync callback: abort the conn passed as data, or -ECANCELED if it is not valid. */
+static int abort_conn_sync(struct hci_dev *hdev, void *data)
+{
+ struct hci_conn *conn = data;
+
+ if (!hci_conn_valid(hdev, conn))
+ return -ECANCELED;
+ /* The reason was stored in conn->abort_reason by hci_abort_conn() */
+ return hci_abort_conn_sync(hdev, conn, conn->abort_reason);
+}
+/* Abort conn with the given reason; returns 0 without action if abort_reason is already set. */
+int hci_abort_conn(struct hci_conn *conn, u8 reason)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ /* If abort_reason has already been set it means the connection is
+ * already being aborted so don't attempt to overwrite it.
+ */
+ if (conn->abort_reason)
+ return 0;
+
+ bt_dev_dbg(hdev, "handle 0x%2.2x reason 0x%2.2x", conn->handle, reason);
+
+ conn->abort_reason = reason;
+
+ /* If the connection is pending check the command opcode since that
+ * might be blocking on hci_cmd_sync_work while waiting for its
+ * respective event so we need to hci_cmd_sync_cancel to cancel it.
+ *
+ * hci_connect_le serializes the connection attempts so only one
+ * connection can be in BT_CONNECT at a time.
+ */
+ if (conn->state == BT_CONNECT && hdev->req_status == HCI_REQ_PEND) {
+ switch (hci_skb_event(hdev->sent_cmd)) {
+ case HCI_EV_CONN_COMPLETE:
+ case HCI_EV_LE_CONN_COMPLETE:
+ case HCI_EV_LE_ENHANCED_CONN_COMPLETE:
+ case HCI_EVT_LE_CIS_ESTABLISHED:
+ hci_cmd_sync_cancel(hdev, ECANCELED);
+ break;
+ }
+ /* Cancel connect attempt if still queued/pending */
+ } else if (!hci_cancel_connect_sync(hdev, conn)) {
+ return 0;
+ }
+
+ /* Run immediately if on cmd_sync_work since this may be called
+ * as a result of MGMT_OP_DISCONNECT/MGMT_OP_UNPAIR which already
+ * queue their callback on cmd_sync_work.
+ */
+ return hci_cmd_sync_run_once(hdev, abort_conn_sync, conn, NULL);
+}
+/* Set up TX timestamping on skb: tx_flags via sock_tx_timestamp() and, for OPT_ID, tskey. */
+void hci_setup_tx_timestamp(struct sk_buff *skb, size_t key_offset,
+ const struct sockcm_cookie *sockc)
+{
+ struct sock *sk = skb ? skb->sk : NULL;
+ int key; /* assigned on every path that reaches the read of key below */
+
+ /* This shall be called on a single skb of those generated by user
+ * sendmsg(), and only when the sendmsg() does not return error to
+ * user. This is required for keeping the tskey that increments here in
+ * sync with possible sendmsg() counting by user.
+ *
+ * Stream sockets shall set key_offset to sendmsg() length in bytes
+ * and call with the last fragment, others to 1 and first fragment.
+ */
+
+ if (!skb || !sockc || !sk || !key_offset)
+ return;
+
+ sock_tx_timestamp(sk, sockc, &skb_shinfo(skb)->tx_flags);
+ /* Stream sockets advance sk_tskey by key_offset even when no tskey is stored below */
+ if (sk->sk_type == SOCK_STREAM)
+ key = atomic_add_return(key_offset, &sk->sk_tskey);
+
+ if (sockc->tsflags & SOF_TIMESTAMPING_OPT_ID &&
+ sockc->tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) {
+ if (sockc->tsflags & SOCKCM_FLAG_TS_OPT_ID) {
+ skb_shinfo(skb)->tskey = sockc->ts_opt_id;
+ } else {
+ if (sk->sk_type != SOCK_STREAM)
+ key = atomic_inc_return(&sk->sk_tskey);
+ skb_shinfo(skb)->tskey = key - 1;
+ }
+ }
+}
+/* Emit the SND timestamp for skb if requested and record it (clone or count) in conn->tx_q. */
+void hci_conn_tx_queue(struct hci_conn *conn, struct sk_buff *skb)
+{
+ struct tx_queue *comp = &conn->tx_q;
+ bool track = false; /* true when skb asks for a COMPLETION timestamp */
+
+ /* Emit SND now, i.e. just before sending to driver */
+ if (skb_shinfo(skb)->tx_flags & SKBTX_SW_TSTAMP)
+ __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SND);
+
+ /* COMPLETION tstamp is emitted for tracked skb later in Number of
+ * Completed Packets event. Available only for flow controlled cases.
+ *
+ * TODO: SCO support without flowctl (needs to be done in drivers)
+ */
+ switch (conn->type) {
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ case ACL_LINK:
+ case LE_LINK:
+ break;
+ case SCO_LINK:
+ case ESCO_LINK:
+ if (!hci_dev_test_flag(conn->hdev, HCI_SCO_FLOWCTL))
+ return;
+ break;
+ default:
+ return;
+ }
+
+ if (skb->sk && (skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP))
+ track = true;
+
+ /* If nothing is tracked, just count extra skbs at the queue head */
+ if (!track && !comp->tracked) {
+ comp->extra++;
+ return;
+ }
+ /* Tracked skbs are cloned with skb_clone_sk(); dequeue tells them apart via skb->sk */
+ if (track) {
+ skb = skb_clone_sk(skb);
+ if (!skb)
+ goto count_only;
+
+ comp->tracked++;
+ } else {
+ skb = skb_clone(skb, GFP_KERNEL);
+ if (!skb)
+ goto count_only;
+ }
+
+ skb_queue_tail(&comp->queue, skb);
+ return;
+
+count_only:
+ /* Stop tracking skbs, and only count. This will not emit timestamps for
+ * the packets, but if we get here something is more seriously wrong.
+ */
+ comp->tracked = 0;
+ comp->extra += skb_queue_len(&comp->queue) + 1;
+ skb_queue_purge(&comp->queue);
+}
+/* Retire one entry from conn->tx_q: an extra count, or a queued skb which is then freed. */
+void hci_conn_tx_dequeue(struct hci_conn *conn)
+{
+ struct tx_queue *comp = &conn->tx_q;
+ struct sk_buff *skb;
+
+ /* If there are tracked skbs, the counted extra go before dequeuing real
+ * skbs, to keep ordering. When nothing is tracked, the ordering doesn't
+ * matter so dequeue real skbs first to get rid of them ASAP.
+ */
+ if (comp->extra && (comp->tracked || skb_queue_empty(&comp->queue))) {
+ comp->extra--;
+ return;
+ }
+
+ skb = skb_dequeue(&comp->queue);
+ if (!skb)
+ return;
+ /* Entries with skb->sk set are treated as tracked and get a COMPLETION timestamp */
+ if (skb->sk) {
+ comp->tracked--;
+ __skb_tstamp_tx(skb, NULL, NULL, skb->sk,
+ SCM_TSTAMP_COMPLETION);
+ }
+
+ kfree_skb(skb);
+}
+/* Return &key->pin_len (ACL) or &ltk->enc_size (LE) of conn's stored key, else NULL. */
+u8 *hci_conn_key_enc_size(struct hci_conn *conn)
+{
+ if (conn->type == ACL_LINK) {
+ struct link_key *key;
+
+ key = hci_find_link_key(conn->hdev, &conn->dst);
+ if (!key)
+ return NULL;
+ /* NOTE(review): pin_len is what is returned as the size for ACL - confirm intent */
+ return &key->pin_len;
+ } else if (conn->type == LE_LINK) {
+ struct smp_ltk *ltk;
+
+ ltk = hci_find_ltk(conn->hdev, &conn->dst, conn->dst_type,
+ conn->role);
+ if (!ltk)
+ return NULL;
+
+ return &ltk->enc_size;
+ }
+
+ return NULL;
+}
+/* Fill info with the timestamping modes for sk_proto on HCI device index; -ENODEV if absent. */
+int hci_ethtool_ts_info(unsigned int index, int sk_proto,
+ struct kernel_ethtool_ts_info *info)
+{
+ struct hci_dev *hdev;
+
+ hdev = hci_dev_get(index);
+ if (!hdev)
+ return -ENODEV;
+ /* Baseline for every protocol: software RX timestamping, phc_index -1 */
+ info->so_timestamping =
+ SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+ info->phc_index = -1;
+ info->tx_types = BIT(HWTSTAMP_TX_OFF);
+ info->rx_filters = BIT(HWTSTAMP_FILTER_NONE);
+
+ switch (sk_proto) {
+ case BTPROTO_ISO:
+ case BTPROTO_L2CAP:
+ info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE;
+ info->so_timestamping |= SOF_TIMESTAMPING_TX_COMPLETION;
+ break;
+ case BTPROTO_SCO:
+ info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE;
+ if (hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL))
+ info->so_timestamping |= SOF_TIMESTAMPING_TX_COMPLETION;
+ break;
+ }
+
+ hci_dev_put(hdev);
+ return 0;
+}
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 74b29c7d841c..8ccec73dce45 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -26,21 +26,26 @@
/* Bluetooth HCI core. */
#include <linux/export.h>
-#include <linux/idr.h>
#include <linux/rfkill.h>
#include <linux/debugfs.h>
#include <linux/crypto.h>
-#include <asm/unaligned.h>
+#include <linux/kcov.h>
+#include <linux/property.h>
+#include <linux/suspend.h>
+#include <linux/wait.h>
+#include <linux/unaligned.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/l2cap.h>
#include <net/bluetooth/mgmt.h>
-#include "hci_request.h"
#include "hci_debugfs.h"
#include "smp.h"
#include "leds.h"
+#include "msft.h"
+#include "aosp.h"
+#include "hci_codec.h"
static void hci_rx_work(struct work_struct *work);
static void hci_cmd_work(struct work_struct *work);
@@ -57,950 +62,9 @@ DEFINE_MUTEX(hci_cb_list_lock);
/* HCI ID Numbering */
static DEFINE_IDA(hci_index_ida);
-/* ---- HCI debugfs entries ---- */
-
-static ssize_t dut_mode_read(struct file *file, char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct hci_dev *hdev = file->private_data;
- char buf[3];
-
- buf[0] = hci_dev_test_flag(hdev, HCI_DUT_MODE) ? 'Y' : 'N';
- buf[1] = '\n';
- buf[2] = '\0';
- return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
-}
-
-static ssize_t dut_mode_write(struct file *file, const char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct hci_dev *hdev = file->private_data;
- struct sk_buff *skb;
- bool enable;
- int err;
-
- if (!test_bit(HCI_UP, &hdev->flags))
- return -ENETDOWN;
-
- err = kstrtobool_from_user(user_buf, count, &enable);
- if (err)
- return err;
-
- if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE))
- return -EALREADY;
-
- hci_req_sync_lock(hdev);
- if (enable)
- skb = __hci_cmd_sync(hdev, HCI_OP_ENABLE_DUT_MODE, 0, NULL,
- HCI_CMD_TIMEOUT);
- else
- skb = __hci_cmd_sync(hdev, HCI_OP_RESET, 0, NULL,
- HCI_CMD_TIMEOUT);
- hci_req_sync_unlock(hdev);
-
- if (IS_ERR(skb))
- return PTR_ERR(skb);
-
- kfree_skb(skb);
-
- hci_dev_change_flag(hdev, HCI_DUT_MODE);
-
- return count;
-}
-
-static const struct file_operations dut_mode_fops = {
- .open = simple_open,
- .read = dut_mode_read,
- .write = dut_mode_write,
- .llseek = default_llseek,
-};
-
-static ssize_t vendor_diag_read(struct file *file, char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct hci_dev *hdev = file->private_data;
- char buf[3];
-
- buf[0] = hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) ? 'Y' : 'N';
- buf[1] = '\n';
- buf[2] = '\0';
- return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
-}
-
-static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct hci_dev *hdev = file->private_data;
- bool enable;
- int err;
-
- err = kstrtobool_from_user(user_buf, count, &enable);
- if (err)
- return err;
-
- /* When the diagnostic flags are not persistent and the transport
- * is not active or in user channel operation, then there is no need
- * for the vendor callback. Instead just store the desired value and
- * the setting will be programmed when the controller gets powered on.
- */
- if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) &&
- (!test_bit(HCI_RUNNING, &hdev->flags) ||
- hci_dev_test_flag(hdev, HCI_USER_CHANNEL)))
- goto done;
-
- hci_req_sync_lock(hdev);
- err = hdev->set_diag(hdev, enable);
- hci_req_sync_unlock(hdev);
-
- if (err < 0)
- return err;
-
-done:
- if (enable)
- hci_dev_set_flag(hdev, HCI_VENDOR_DIAG);
- else
- hci_dev_clear_flag(hdev, HCI_VENDOR_DIAG);
-
- return count;
-}
-
-static const struct file_operations vendor_diag_fops = {
- .open = simple_open,
- .read = vendor_diag_read,
- .write = vendor_diag_write,
- .llseek = default_llseek,
-};
-
-static void hci_debugfs_create_basic(struct hci_dev *hdev)
-{
- debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev,
- &dut_mode_fops);
-
- if (hdev->set_diag)
- debugfs_create_file("vendor_diag", 0644, hdev->debugfs, hdev,
- &vendor_diag_fops);
-}
-
-static int hci_reset_req(struct hci_request *req, unsigned long opt)
-{
- BT_DBG("%s %ld", req->hdev->name, opt);
-
- /* Reset device */
- set_bit(HCI_RESET, &req->hdev->flags);
- hci_req_add(req, HCI_OP_RESET, 0, NULL);
- return 0;
-}
-
-static void bredr_init(struct hci_request *req)
-{
- req->hdev->flow_ctl_mode = HCI_FLOW_CTL_MODE_PACKET_BASED;
-
- /* Read Local Supported Features */
- hci_req_add(req, HCI_OP_READ_LOCAL_FEATURES, 0, NULL);
-
- /* Read Local Version */
- hci_req_add(req, HCI_OP_READ_LOCAL_VERSION, 0, NULL);
-
- /* Read BD Address */
- hci_req_add(req, HCI_OP_READ_BD_ADDR, 0, NULL);
-}
-
-static void amp_init1(struct hci_request *req)
-{
- req->hdev->flow_ctl_mode = HCI_FLOW_CTL_MODE_BLOCK_BASED;
-
- /* Read Local Version */
- hci_req_add(req, HCI_OP_READ_LOCAL_VERSION, 0, NULL);
-
- /* Read Local Supported Commands */
- hci_req_add(req, HCI_OP_READ_LOCAL_COMMANDS, 0, NULL);
-
- /* Read Local AMP Info */
- hci_req_add(req, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL);
-
- /* Read Data Blk size */
- hci_req_add(req, HCI_OP_READ_DATA_BLOCK_SIZE, 0, NULL);
-
- /* Read Flow Control Mode */
- hci_req_add(req, HCI_OP_READ_FLOW_CONTROL_MODE, 0, NULL);
-
- /* Read Location Data */
- hci_req_add(req, HCI_OP_READ_LOCATION_DATA, 0, NULL);
-}
-
-static int amp_init2(struct hci_request *req)
-{
- /* Read Local Supported Features. Not all AMP controllers
- * support this so it's placed conditionally in the second
- * stage init.
- */
- if (req->hdev->commands[14] & 0x20)
- hci_req_add(req, HCI_OP_READ_LOCAL_FEATURES, 0, NULL);
-
- return 0;
-}
-
-static int hci_init1_req(struct hci_request *req, unsigned long opt)
-{
- struct hci_dev *hdev = req->hdev;
-
- BT_DBG("%s %ld", hdev->name, opt);
-
- /* Reset */
- if (!test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks))
- hci_reset_req(req, 0);
-
- switch (hdev->dev_type) {
- case HCI_PRIMARY:
- bredr_init(req);
- break;
- case HCI_AMP:
- amp_init1(req);
- break;
- default:
- bt_dev_err(hdev, "Unknown device type %d", hdev->dev_type);
- break;
- }
-
- return 0;
-}
-
-static void bredr_setup(struct hci_request *req)
-{
- __le16 param;
- __u8 flt_type;
-
- /* Read Buffer Size (ACL mtu, max pkt, etc.) */
- hci_req_add(req, HCI_OP_READ_BUFFER_SIZE, 0, NULL);
-
- /* Read Class of Device */
- hci_req_add(req, HCI_OP_READ_CLASS_OF_DEV, 0, NULL);
-
- /* Read Local Name */
- hci_req_add(req, HCI_OP_READ_LOCAL_NAME, 0, NULL);
-
- /* Read Voice Setting */
- hci_req_add(req, HCI_OP_READ_VOICE_SETTING, 0, NULL);
-
- /* Read Number of Supported IAC */
- hci_req_add(req, HCI_OP_READ_NUM_SUPPORTED_IAC, 0, NULL);
-
- /* Read Current IAC LAP */
- hci_req_add(req, HCI_OP_READ_CURRENT_IAC_LAP, 0, NULL);
-
- /* Clear Event Filters */
- flt_type = HCI_FLT_CLEAR_ALL;
- hci_req_add(req, HCI_OP_SET_EVENT_FLT, 1, &flt_type);
-
- /* Connection accept timeout ~20 secs */
- param = cpu_to_le16(0x7d00);
- hci_req_add(req, HCI_OP_WRITE_CA_TIMEOUT, 2, &param);
-}
-
-static void le_setup(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
-
- /* Read LE Buffer Size */
- hci_req_add(req, HCI_OP_LE_READ_BUFFER_SIZE, 0, NULL);
-
- /* Read LE Local Supported Features */
- hci_req_add(req, HCI_OP_LE_READ_LOCAL_FEATURES, 0, NULL);
-
- /* Read LE Supported States */
- hci_req_add(req, HCI_OP_LE_READ_SUPPORTED_STATES, 0, NULL);
-
- /* LE-only controllers have LE implicitly enabled */
- if (!lmp_bredr_capable(hdev))
- hci_dev_set_flag(hdev, HCI_LE_ENABLED);
-}
-
-static void hci_setup_event_mask(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
-
- /* The second byte is 0xff instead of 0x9f (two reserved bits
- * disabled) since a Broadcom 1.2 dongle doesn't respond to the
- * command otherwise.
- */
- u8 events[8] = { 0xff, 0xff, 0xfb, 0xff, 0x00, 0x00, 0x00, 0x00 };
-
- /* CSR 1.1 dongles does not accept any bitfield so don't try to set
- * any event mask for pre 1.2 devices.
- */
- if (hdev->hci_ver < BLUETOOTH_VER_1_2)
- return;
-
- if (lmp_bredr_capable(hdev)) {
- events[4] |= 0x01; /* Flow Specification Complete */
- } else {
- /* Use a different default for LE-only devices */
- memset(events, 0, sizeof(events));
- events[1] |= 0x20; /* Command Complete */
- events[1] |= 0x40; /* Command Status */
- events[1] |= 0x80; /* Hardware Error */
-
- /* If the controller supports the Disconnect command, enable
- * the corresponding event. In addition enable packet flow
- * control related events.
- */
- if (hdev->commands[0] & 0x20) {
- events[0] |= 0x10; /* Disconnection Complete */
- events[2] |= 0x04; /* Number of Completed Packets */
- events[3] |= 0x02; /* Data Buffer Overflow */
- }
-
- /* If the controller supports the Read Remote Version
- * Information command, enable the corresponding event.
- */
- if (hdev->commands[2] & 0x80)
- events[1] |= 0x08; /* Read Remote Version Information
- * Complete
- */
-
- if (hdev->le_features[0] & HCI_LE_ENCRYPTION) {
- events[0] |= 0x80; /* Encryption Change */
- events[5] |= 0x80; /* Encryption Key Refresh Complete */
- }
- }
-
- if (lmp_inq_rssi_capable(hdev) ||
- test_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks))
- events[4] |= 0x02; /* Inquiry Result with RSSI */
-
- if (lmp_ext_feat_capable(hdev))
- events[4] |= 0x04; /* Read Remote Extended Features Complete */
-
- if (lmp_esco_capable(hdev)) {
- events[5] |= 0x08; /* Synchronous Connection Complete */
- events[5] |= 0x10; /* Synchronous Connection Changed */
- }
-
- if (lmp_sniffsubr_capable(hdev))
- events[5] |= 0x20; /* Sniff Subrating */
-
- if (lmp_pause_enc_capable(hdev))
- events[5] |= 0x80; /* Encryption Key Refresh Complete */
-
- if (lmp_ext_inq_capable(hdev))
- events[5] |= 0x40; /* Extended Inquiry Result */
-
- if (lmp_no_flush_capable(hdev))
- events[7] |= 0x01; /* Enhanced Flush Complete */
-
- if (lmp_lsto_capable(hdev))
- events[6] |= 0x80; /* Link Supervision Timeout Changed */
-
- if (lmp_ssp_capable(hdev)) {
- events[6] |= 0x01; /* IO Capability Request */
- events[6] |= 0x02; /* IO Capability Response */
- events[6] |= 0x04; /* User Confirmation Request */
- events[6] |= 0x08; /* User Passkey Request */
- events[6] |= 0x10; /* Remote OOB Data Request */
- events[6] |= 0x20; /* Simple Pairing Complete */
- events[7] |= 0x04; /* User Passkey Notification */
- events[7] |= 0x08; /* Keypress Notification */
- events[7] |= 0x10; /* Remote Host Supported
- * Features Notification
- */
- }
-
- if (lmp_le_capable(hdev))
- events[7] |= 0x20; /* LE Meta-Event */
-
- hci_req_add(req, HCI_OP_SET_EVENT_MASK, sizeof(events), events);
-}
-
-static int hci_init2_req(struct hci_request *req, unsigned long opt)
-{
- struct hci_dev *hdev = req->hdev;
-
- if (hdev->dev_type == HCI_AMP)
- return amp_init2(req);
-
- if (lmp_bredr_capable(hdev))
- bredr_setup(req);
- else
- hci_dev_clear_flag(hdev, HCI_BREDR_ENABLED);
-
- if (lmp_le_capable(hdev))
- le_setup(req);
-
- /* All Bluetooth 1.2 and later controllers should support the
- * HCI command for reading the local supported commands.
- *
- * Unfortunately some controllers indicate Bluetooth 1.2 support,
- * but do not have support for this command. If that is the case,
- * the driver can quirk the behavior and skip reading the local
- * supported commands.
- */
- if (hdev->hci_ver > BLUETOOTH_VER_1_1 &&
- !test_bit(HCI_QUIRK_BROKEN_LOCAL_COMMANDS, &hdev->quirks))
- hci_req_add(req, HCI_OP_READ_LOCAL_COMMANDS, 0, NULL);
-
- if (lmp_ssp_capable(hdev)) {
- /* When SSP is available, then the host features page
- * should also be available as well. However some
- * controllers list the max_page as 0 as long as SSP
- * has not been enabled. To achieve proper debugging
- * output, force the minimum max_page to 1 at least.
- */
- hdev->max_page = 0x01;
-
- if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) {
- u8 mode = 0x01;
-
- hci_req_add(req, HCI_OP_WRITE_SSP_MODE,
- sizeof(mode), &mode);
- } else {
- struct hci_cp_write_eir cp;
-
- memset(hdev->eir, 0, sizeof(hdev->eir));
- memset(&cp, 0, sizeof(cp));
-
- hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp);
- }
- }
-
- if (lmp_inq_rssi_capable(hdev) ||
- test_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks)) {
- u8 mode;
-
- /* If Extended Inquiry Result events are supported, then
- * they are clearly preferred over Inquiry Result with RSSI
- * events.
- */
- mode = lmp_ext_inq_capable(hdev) ? 0x02 : 0x01;
-
- hci_req_add(req, HCI_OP_WRITE_INQUIRY_MODE, 1, &mode);
- }
-
- if (lmp_inq_tx_pwr_capable(hdev))
- hci_req_add(req, HCI_OP_READ_INQ_RSP_TX_POWER, 0, NULL);
-
- if (lmp_ext_feat_capable(hdev)) {
- struct hci_cp_read_local_ext_features cp;
-
- cp.page = 0x01;
- hci_req_add(req, HCI_OP_READ_LOCAL_EXT_FEATURES,
- sizeof(cp), &cp);
- }
-
- if (hci_dev_test_flag(hdev, HCI_LINK_SECURITY)) {
- u8 enable = 1;
- hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE, sizeof(enable),
- &enable);
- }
-
- return 0;
-}
-
-static void hci_setup_link_policy(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_write_def_link_policy cp;
- u16 link_policy = 0;
-
- if (lmp_rswitch_capable(hdev))
- link_policy |= HCI_LP_RSWITCH;
- if (lmp_hold_capable(hdev))
- link_policy |= HCI_LP_HOLD;
- if (lmp_sniff_capable(hdev))
- link_policy |= HCI_LP_SNIFF;
- if (lmp_park_capable(hdev))
- link_policy |= HCI_LP_PARK;
-
- cp.policy = cpu_to_le16(link_policy);
- hci_req_add(req, HCI_OP_WRITE_DEF_LINK_POLICY, sizeof(cp), &cp);
-}
-
-static void hci_set_le_support(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_write_le_host_supported cp;
-
- /* LE-only devices do not support explicit enablement */
- if (!lmp_bredr_capable(hdev))
- return;
-
- memset(&cp, 0, sizeof(cp));
-
- if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
- cp.le = 0x01;
- cp.simul = 0x00;
- }
-
- if (cp.le != lmp_host_le_capable(hdev))
- hci_req_add(req, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(cp),
- &cp);
-}
-
-static void hci_set_event_mask_page_2(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- u8 events[8] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
- bool changed = false;
-
- /* If Connectionless Slave Broadcast master role is supported
- * enable all necessary events for it.
- */
- if (lmp_csb_master_capable(hdev)) {
- events[1] |= 0x40; /* Triggered Clock Capture */
- events[1] |= 0x80; /* Synchronization Train Complete */
- events[2] |= 0x10; /* Slave Page Response Timeout */
- events[2] |= 0x20; /* CSB Channel Map Change */
- changed = true;
- }
-
- /* If Connectionless Slave Broadcast slave role is supported
- * enable all necessary events for it.
- */
- if (lmp_csb_slave_capable(hdev)) {
- events[2] |= 0x01; /* Synchronization Train Received */
- events[2] |= 0x02; /* CSB Receive */
- events[2] |= 0x04; /* CSB Timeout */
- events[2] |= 0x08; /* Truncated Page Complete */
- changed = true;
- }
-
- /* Enable Authenticated Payload Timeout Expired event if supported */
- if (lmp_ping_capable(hdev) || hdev->le_features[0] & HCI_LE_PING) {
- events[2] |= 0x80;
- changed = true;
- }
-
- /* Some Broadcom based controllers indicate support for Set Event
- * Mask Page 2 command, but then actually do not support it. Since
- * the default value is all bits set to zero, the command is only
- * required if the event mask has to be changed. In case no change
- * to the event mask is needed, skip this command.
- */
- if (changed)
- hci_req_add(req, HCI_OP_SET_EVENT_MASK_PAGE_2,
- sizeof(events), events);
-}
-
-static int hci_init3_req(struct hci_request *req, unsigned long opt)
-{
- struct hci_dev *hdev = req->hdev;
- u8 p;
-
- hci_setup_event_mask(req);
-
- if (hdev->commands[6] & 0x20 &&
- !test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks)) {
- struct hci_cp_read_stored_link_key cp;
-
- bacpy(&cp.bdaddr, BDADDR_ANY);
- cp.read_all = 0x01;
- hci_req_add(req, HCI_OP_READ_STORED_LINK_KEY, sizeof(cp), &cp);
- }
-
- if (hdev->commands[5] & 0x10)
- hci_setup_link_policy(req);
-
- if (hdev->commands[8] & 0x01)
- hci_req_add(req, HCI_OP_READ_PAGE_SCAN_ACTIVITY, 0, NULL);
-
- /* Some older Broadcom based Bluetooth 1.2 controllers do not
- * support the Read Page Scan Type command. Check support for
- * this command in the bit mask of supported commands.
- */
- if (hdev->commands[13] & 0x01)
- hci_req_add(req, HCI_OP_READ_PAGE_SCAN_TYPE, 0, NULL);
-
- if (lmp_le_capable(hdev)) {
- u8 events[8];
-
- memset(events, 0, sizeof(events));
-
- if (hdev->le_features[0] & HCI_LE_ENCRYPTION)
- events[0] |= 0x10; /* LE Long Term Key Request */
-
- /* If controller supports the Connection Parameters Request
- * Link Layer Procedure, enable the corresponding event.
- */
- if (hdev->le_features[0] & HCI_LE_CONN_PARAM_REQ_PROC)
- events[0] |= 0x20; /* LE Remote Connection
- * Parameter Request
- */
-
- /* If the controller supports the Data Length Extension
- * feature, enable the corresponding event.
- */
- if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT)
- events[0] |= 0x40; /* LE Data Length Change */
-
- /* If the controller supports Extended Scanner Filter
- * Policies, enable the correspondig event.
- */
- if (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY)
- events[1] |= 0x04; /* LE Direct Advertising
- * Report
- */
-
- /* If the controller supports Channel Selection Algorithm #2
- * feature, enable the corresponding event.
- */
- if (hdev->le_features[1] & HCI_LE_CHAN_SEL_ALG2)
- events[2] |= 0x08; /* LE Channel Selection
- * Algorithm
- */
-
- /* If the controller supports the LE Set Scan Enable command,
- * enable the corresponding advertising report event.
- */
- if (hdev->commands[26] & 0x08)
- events[0] |= 0x02; /* LE Advertising Report */
-
- /* If the controller supports the LE Create Connection
- * command, enable the corresponding event.
- */
- if (hdev->commands[26] & 0x10)
- events[0] |= 0x01; /* LE Connection Complete */
-
- /* If the controller supports the LE Connection Update
- * command, enable the corresponding event.
- */
- if (hdev->commands[27] & 0x04)
- events[0] |= 0x04; /* LE Connection Update
- * Complete
- */
-
- /* If the controller supports the LE Read Remote Used Features
- * command, enable the corresponding event.
- */
- if (hdev->commands[27] & 0x20)
- events[0] |= 0x08; /* LE Read Remote Used
- * Features Complete
- */
-
- /* If the controller supports the LE Read Local P-256
- * Public Key command, enable the corresponding event.
- */
- if (hdev->commands[34] & 0x02)
- events[0] |= 0x80; /* LE Read Local P-256
- * Public Key Complete
- */
-
- /* If the controller supports the LE Generate DHKey
- * command, enable the corresponding event.
- */
- if (hdev->commands[34] & 0x04)
- events[1] |= 0x01; /* LE Generate DHKey Complete */
-
- /* If the controller supports the LE Set Default PHY or
- * LE Set PHY commands, enable the corresponding event.
- */
- if (hdev->commands[35] & (0x20 | 0x40))
- events[1] |= 0x08; /* LE PHY Update Complete */
-
- /* If the controller supports LE Set Extended Scan Parameters
- * and LE Set Extended Scan Enable commands, enable the
- * corresponding event.
- */
- if (use_ext_scan(hdev))
- events[1] |= 0x10; /* LE Extended Advertising
- * Report
- */
-
- /* If the controller supports the LE Extended Create Connection
- * command, enable the corresponding event.
- */
- if (use_ext_conn(hdev))
- events[1] |= 0x02; /* LE Enhanced Connection
- * Complete
- */
-
- /* If the controller supports the LE Extended Advertising
- * command, enable the corresponding event.
- */
- if (ext_adv_capable(hdev))
- events[2] |= 0x02; /* LE Advertising Set
- * Terminated
- */
-
- hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, sizeof(events),
- events);
-
- /* Read LE Advertising Channel TX Power */
- if ((hdev->commands[25] & 0x40) && !ext_adv_capable(hdev)) {
- /* HCI TS spec forbids mixing of legacy and extended
- * advertising commands wherein READ_ADV_TX_POWER is
- * also included. So do not call it if extended adv
- * is supported otherwise controller will return
- * COMMAND_DISALLOWED for extended commands.
- */
- hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL);
- }
-
- if (hdev->commands[26] & 0x40) {
- /* Read LE White List Size */
- hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE,
- 0, NULL);
- }
-
- if (hdev->commands[26] & 0x80) {
- /* Clear LE White List */
- hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL);
- }
-
- if (hdev->commands[34] & 0x40) {
- /* Read LE Resolving List Size */
- hci_req_add(req, HCI_OP_LE_READ_RESOLV_LIST_SIZE,
- 0, NULL);
- }
-
- if (hdev->commands[34] & 0x20) {
- /* Clear LE Resolving List */
- hci_req_add(req, HCI_OP_LE_CLEAR_RESOLV_LIST, 0, NULL);
- }
-
- if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) {
- /* Read LE Maximum Data Length */
- hci_req_add(req, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL);
-
- /* Read LE Suggested Default Data Length */
- hci_req_add(req, HCI_OP_LE_READ_DEF_DATA_LEN, 0, NULL);
- }
-
- if (ext_adv_capable(hdev)) {
- /* Read LE Number of Supported Advertising Sets */
- hci_req_add(req, HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS,
- 0, NULL);
- }
-
- hci_set_le_support(req);
- }
-
- /* Read features beyond page 1 if available */
- for (p = 2; p < HCI_MAX_PAGES && p <= hdev->max_page; p++) {
- struct hci_cp_read_local_ext_features cp;
-
- cp.page = p;
- hci_req_add(req, HCI_OP_READ_LOCAL_EXT_FEATURES,
- sizeof(cp), &cp);
- }
-
- return 0;
-}
-
-static int hci_init4_req(struct hci_request *req, unsigned long opt)
-{
- struct hci_dev *hdev = req->hdev;
-
- /* Some Broadcom based Bluetooth controllers do not support the
- * Delete Stored Link Key command. They are clearly indicating its
- * absence in the bit mask of supported commands.
- *
- * Check the supported commands and only if the the command is marked
- * as supported send it. If not supported assume that the controller
- * does not have actual support for stored link keys which makes this
- * command redundant anyway.
- *
- * Some controllers indicate that they support handling deleting
- * stored link keys, but they don't. The quirk lets a driver
- * just disable this command.
- */
- if (hdev->commands[6] & 0x80 &&
- !test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks)) {
- struct hci_cp_delete_stored_link_key cp;
-
- bacpy(&cp.bdaddr, BDADDR_ANY);
- cp.delete_all = 0x01;
- hci_req_add(req, HCI_OP_DELETE_STORED_LINK_KEY,
- sizeof(cp), &cp);
- }
-
- /* Set event mask page 2 if the HCI command for it is supported */
- if (hdev->commands[22] & 0x04)
- hci_set_event_mask_page_2(req);
-
- /* Read local codec list if the HCI command is supported */
- if (hdev->commands[29] & 0x20)
- hci_req_add(req, HCI_OP_READ_LOCAL_CODECS, 0, NULL);
-
- /* Get MWS transport configuration if the HCI command is supported */
- if (hdev->commands[30] & 0x08)
- hci_req_add(req, HCI_OP_GET_MWS_TRANSPORT_CONFIG, 0, NULL);
-
- /* Check for Synchronization Train support */
- if (lmp_sync_train_capable(hdev))
- hci_req_add(req, HCI_OP_READ_SYNC_TRAIN_PARAMS, 0, NULL);
-
- /* Enable Secure Connections if supported and configured */
- if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED) &&
- bredr_sc_enabled(hdev)) {
- u8 support = 0x01;
-
- hci_req_add(req, HCI_OP_WRITE_SC_SUPPORT,
- sizeof(support), &support);
- }
-
- /* Set Suggested Default Data Length to maximum if supported */
- if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) {
- struct hci_cp_le_write_def_data_len cp;
-
- cp.tx_len = hdev->le_max_tx_len;
- cp.tx_time = hdev->le_max_tx_time;
- hci_req_add(req, HCI_OP_LE_WRITE_DEF_DATA_LEN, sizeof(cp), &cp);
- }
-
- /* Set Default PHY parameters if command is supported */
- if (hdev->commands[35] & 0x20) {
- struct hci_cp_le_set_default_phy cp;
-
- cp.all_phys = 0x00;
- cp.tx_phys = hdev->le_tx_def_phys;
- cp.rx_phys = hdev->le_rx_def_phys;
-
- hci_req_add(req, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp), &cp);
- }
-
- return 0;
-}
-
-static int __hci_init(struct hci_dev *hdev)
-{
- int err;
-
- err = __hci_req_sync(hdev, hci_init1_req, 0, HCI_INIT_TIMEOUT, NULL);
- if (err < 0)
- return err;
-
- if (hci_dev_test_flag(hdev, HCI_SETUP))
- hci_debugfs_create_basic(hdev);
-
- err = __hci_req_sync(hdev, hci_init2_req, 0, HCI_INIT_TIMEOUT, NULL);
- if (err < 0)
- return err;
-
- /* HCI_PRIMARY covers both single-mode LE, BR/EDR and dual-mode
- * BR/EDR/LE type controllers. AMP controllers only need the
- * first two stages of init.
- */
- if (hdev->dev_type != HCI_PRIMARY)
- return 0;
-
- err = __hci_req_sync(hdev, hci_init3_req, 0, HCI_INIT_TIMEOUT, NULL);
- if (err < 0)
- return err;
-
- err = __hci_req_sync(hdev, hci_init4_req, 0, HCI_INIT_TIMEOUT, NULL);
- if (err < 0)
- return err;
-
- /* This function is only called when the controller is actually in
- * configured state. When the controller is marked as unconfigured,
- * this initialization procedure is not run.
- *
- * It means that it is possible that a controller runs through its
- * setup phase and then discovers missing settings. If that is the
- * case, then this function will not be called. It then will only
- * be called during the config phase.
- *
- * So only when in setup phase or config phase, create the debugfs
- * entries and register the SMP channels.
- */
- if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
- !hci_dev_test_flag(hdev, HCI_CONFIG))
- return 0;
-
- hci_debugfs_create_common(hdev);
-
- if (lmp_bredr_capable(hdev))
- hci_debugfs_create_bredr(hdev);
-
- if (lmp_le_capable(hdev))
- hci_debugfs_create_le(hdev);
-
- return 0;
-}
-
-static int hci_init0_req(struct hci_request *req, unsigned long opt)
-{
- struct hci_dev *hdev = req->hdev;
-
- BT_DBG("%s %ld", hdev->name, opt);
-
- /* Reset */
- if (!test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks))
- hci_reset_req(req, 0);
-
- /* Read Local Version */
- hci_req_add(req, HCI_OP_READ_LOCAL_VERSION, 0, NULL);
-
- /* Read BD Address */
- if (hdev->set_bdaddr)
- hci_req_add(req, HCI_OP_READ_BD_ADDR, 0, NULL);
-
- return 0;
-}
-
-static int __hci_unconf_init(struct hci_dev *hdev)
-{
- int err;
-
- if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
- return 0;
-
- err = __hci_req_sync(hdev, hci_init0_req, 0, HCI_INIT_TIMEOUT, NULL);
- if (err < 0)
- return err;
-
- if (hci_dev_test_flag(hdev, HCI_SETUP))
- hci_debugfs_create_basic(hdev);
-
- return 0;
-}
-
-static int hci_scan_req(struct hci_request *req, unsigned long opt)
-{
- __u8 scan = opt;
-
- BT_DBG("%s %x", req->hdev->name, scan);
-
- /* Inquiry and Page scans */
- hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
- return 0;
-}
-
-static int hci_auth_req(struct hci_request *req, unsigned long opt)
-{
- __u8 auth = opt;
-
- BT_DBG("%s %x", req->hdev->name, auth);
-
- /* Authentication */
- hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE, 1, &auth);
- return 0;
-}
-
-static int hci_encrypt_req(struct hci_request *req, unsigned long opt)
-{
- __u8 encrypt = opt;
-
- BT_DBG("%s %x", req->hdev->name, encrypt);
-
- /* Encryption */
- hci_req_add(req, HCI_OP_WRITE_ENCRYPT_MODE, 1, &encrypt);
- return 0;
-}
-
-static int hci_linkpol_req(struct hci_request *req, unsigned long opt)
-{
- __le16 policy = cpu_to_le16(opt);
-
- BT_DBG("%s %x", req->hdev->name, policy);
-
- /* Default link policy */
- hci_req_add(req, HCI_OP_WRITE_DEF_LINK_POLICY, 2, &policy);
- return 0;
-}
-
/* Get HCI device by index.
* Device is held on return. */
-struct hci_dev *hci_dev_get(int index)
+static struct hci_dev *__hci_dev_get(int index, int *srcu_index)
{
struct hci_dev *hdev = NULL, *d;
@@ -1013,6 +77,8 @@ struct hci_dev *hci_dev_get(int index)
list_for_each_entry(d, &hci_dev_list, list) {
if (d->id == index) {
hdev = hci_dev_hold(d);
+ if (srcu_index)
+ *srcu_index = srcu_read_lock(&d->srcu);
break;
}
}
@@ -1020,6 +86,22 @@ struct hci_dev *hci_dev_get(int index)
return hdev;
}
+struct hci_dev *hci_dev_get(int index)
+{
+ return __hci_dev_get(index, NULL);
+}
+
+static struct hci_dev *hci_dev_get_srcu(int index, int *srcu_index)
+{
+ return __hci_dev_get(index, srcu_index);
+}
+
+static void hci_dev_put_srcu(struct hci_dev *hdev, int srcu_index)
+{
+ srcu_read_unlock(&hdev->srcu, srcu_index);
+ hci_dev_put(hdev);
+}
+
/* ---- Inquiry support ---- */
bool hci_discovery_active(struct hci_dev *hdev)
@@ -1040,8 +122,6 @@ void hci_discovery_set_state(struct hci_dev *hdev, int state)
{
int old_state = hdev->discovery.state;
- BT_DBG("%s state %u -> %u", hdev->name, hdev->discovery.state, state);
-
if (old_state == state)
return;
@@ -1049,7 +129,7 @@ void hci_discovery_set_state(struct hci_dev *hdev, int state)
switch (state) {
case DISCOVERY_STOPPED:
- hci_update_background_scan(hdev);
+ hci_update_passive_scan(hdev);
if (old_state != DISCOVERY_STARTING)
mgmt_discovering(hdev, 0);
@@ -1064,6 +144,8 @@ void hci_discovery_set_state(struct hci_dev *hdev, int state)
case DISCOVERY_STOPPING:
break;
}
+
+ bt_dev_dbg(hdev, "state %u -> %u", old_state, state);
}
void hci_inquiry_cache_flush(struct hci_dev *hdev)
@@ -1240,33 +322,12 @@ static int inquiry_cache_dump(struct hci_dev *hdev, int num, __u8 *buf)
return copied;
}
-static int hci_inq_req(struct hci_request *req, unsigned long opt)
-{
- struct hci_inquiry_req *ir = (struct hci_inquiry_req *) opt;
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_inquiry cp;
-
- BT_DBG("%s", hdev->name);
-
- if (test_bit(HCI_INQUIRY, &hdev->flags))
- return 0;
-
- /* Start Inquiry */
- memcpy(&cp.lap, &ir->lap, 3);
- cp.length = ir->length;
- cp.num_rsp = ir->num_rsp;
- hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp);
-
- return 0;
-}
-
int hci_inquiry(void __user *arg)
{
__u8 __user *ptr = arg;
struct hci_inquiry_req ir;
struct hci_dev *hdev;
int err = 0, do_inquiry = 0, max_rsp;
- long timeo;
__u8 *buf;
if (copy_from_user(&ir, ptr, sizeof(ir)))
@@ -1286,13 +347,14 @@ int hci_inquiry(void __user *arg)
goto done;
}
- if (hdev->dev_type != HCI_PRIMARY) {
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
err = -EOPNOTSUPP;
goto done;
}
- if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
- err = -EOPNOTSUPP;
+ /* Restrict maximum inquiry length to 60 seconds */
+ if (ir.length > 60) {
+ err = -EINVAL;
goto done;
}
@@ -1304,11 +366,11 @@ int hci_inquiry(void __user *arg)
}
hci_dev_unlock(hdev);
- timeo = ir.length * msecs_to_jiffies(2000);
-
if (do_inquiry) {
- err = hci_req_sync(hdev, hci_inq_req, (unsigned long) &ir,
- timeo, NULL);
+ hci_req_sync_lock(hdev);
+ err = hci_inquiry_sync(hdev, ir.length, ir.num_rsp);
+ hci_req_sync_unlock(hdev);
+
if (err < 0)
goto done;
@@ -1316,8 +378,10 @@ int hci_inquiry(void __user *arg)
* cleared). If it is interrupted by a signal, return -EINTR.
*/
if (wait_on_bit(&hdev->flags, HCI_INQUIRY,
- TASK_INTERRUPTIBLE))
- return -EINTR;
+ TASK_INTERRUPTIBLE)) {
+ err = -EINTR;
+ goto done;
+ }
}
/* for unlimited number of responses we will use buffer with
@@ -1363,161 +427,8 @@ static int hci_dev_do_open(struct hci_dev *hdev)
hci_req_sync_lock(hdev);
- if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
- ret = -ENODEV;
- goto done;
- }
-
- if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
- !hci_dev_test_flag(hdev, HCI_CONFIG)) {
- /* Check for rfkill but allow the HCI setup stage to
- * proceed (which in itself doesn't cause any RF activity).
- */
- if (hci_dev_test_flag(hdev, HCI_RFKILLED)) {
- ret = -ERFKILL;
- goto done;
- }
-
- /* Check for valid public address or a configured static
- * random adddress, but let the HCI setup proceed to
- * be able to determine if there is a public address
- * or not.
- *
- * In case of user channel usage, it is not important
- * if a public address or static random address is
- * available.
- *
- * This check is only valid for BR/EDR controllers
- * since AMP controllers do not have an address.
- */
- if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
- hdev->dev_type == HCI_PRIMARY &&
- !bacmp(&hdev->bdaddr, BDADDR_ANY) &&
- !bacmp(&hdev->static_addr, BDADDR_ANY)) {
- ret = -EADDRNOTAVAIL;
- goto done;
- }
- }
-
- if (test_bit(HCI_UP, &hdev->flags)) {
- ret = -EALREADY;
- goto done;
- }
-
- if (hdev->open(hdev)) {
- ret = -EIO;
- goto done;
- }
-
- set_bit(HCI_RUNNING, &hdev->flags);
- hci_sock_dev_event(hdev, HCI_DEV_OPEN);
-
- atomic_set(&hdev->cmd_cnt, 1);
- set_bit(HCI_INIT, &hdev->flags);
+ ret = hci_dev_open_sync(hdev);
- if (hci_dev_test_flag(hdev, HCI_SETUP) ||
- test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) {
- hci_sock_dev_event(hdev, HCI_DEV_SETUP);
-
- if (hdev->setup)
- ret = hdev->setup(hdev);
-
- /* The transport driver can set these quirks before
- * creating the HCI device or in its setup callback.
- *
- * In case any of them is set, the controller has to
- * start up as unconfigured.
- */
- if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) ||
- test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks))
- hci_dev_set_flag(hdev, HCI_UNCONFIGURED);
-
- /* For an unconfigured controller it is required to
- * read at least the version information provided by
- * the Read Local Version Information command.
- *
- * If the set_bdaddr driver callback is provided, then
- * also the original Bluetooth public device address
- * will be read using the Read BD Address command.
- */
- if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
- ret = __hci_unconf_init(hdev);
- }
-
- if (hci_dev_test_flag(hdev, HCI_CONFIG)) {
- /* If public address change is configured, ensure that
- * the address gets programmed. If the driver does not
- * support changing the public address, fail the power
- * on procedure.
- */
- if (bacmp(&hdev->public_addr, BDADDR_ANY) &&
- hdev->set_bdaddr)
- ret = hdev->set_bdaddr(hdev, &hdev->public_addr);
- else
- ret = -EADDRNOTAVAIL;
- }
-
- if (!ret) {
- if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
- !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
- ret = __hci_init(hdev);
- if (!ret && hdev->post_init)
- ret = hdev->post_init(hdev);
- }
- }
-
- /* If the HCI Reset command is clearing all diagnostic settings,
- * then they need to be reprogrammed after the init procedure
- * completed.
- */
- if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) &&
- !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
- hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) && hdev->set_diag)
- ret = hdev->set_diag(hdev, true);
-
- clear_bit(HCI_INIT, &hdev->flags);
-
- if (!ret) {
- hci_dev_hold(hdev);
- hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
- hci_adv_instances_set_rpa_expired(hdev, true);
- set_bit(HCI_UP, &hdev->flags);
- hci_sock_dev_event(hdev, HCI_DEV_UP);
- hci_leds_update_powered(hdev, true);
- if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
- !hci_dev_test_flag(hdev, HCI_CONFIG) &&
- !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
- !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
- hci_dev_test_flag(hdev, HCI_MGMT) &&
- hdev->dev_type == HCI_PRIMARY) {
- ret = __hci_req_hci_power_on(hdev);
- mgmt_power_on(hdev, ret);
- }
- } else {
- /* Init failed, cleanup */
- flush_work(&hdev->tx_work);
- flush_work(&hdev->cmd_work);
- flush_work(&hdev->rx_work);
-
- skb_queue_purge(&hdev->cmd_q);
- skb_queue_purge(&hdev->rx_q);
-
- if (hdev->flush)
- hdev->flush(hdev);
-
- if (hdev->sent_cmd) {
- kfree_skb(hdev->sent_cmd);
- hdev->sent_cmd = NULL;
- }
-
- clear_bit(HCI_RUNNING, &hdev->flags);
- hci_sock_dev_event(hdev, HCI_DEV_CLOSE);
-
- hdev->close(hdev);
- hdev->flags &= BIT(HCI_RAW);
- }
-
-done:
hci_req_sync_unlock(hdev);
return ret;
}
@@ -1579,147 +490,19 @@ done:
return err;
}
-/* This function requires the caller holds hdev->lock */
-static void hci_pend_le_actions_clear(struct hci_dev *hdev)
-{
- struct hci_conn_params *p;
-
- list_for_each_entry(p, &hdev->le_conn_params, list) {
- if (p->conn) {
- hci_conn_drop(p->conn);
- hci_conn_put(p->conn);
- p->conn = NULL;
- }
- list_del_init(&p->action);
- }
-
- BT_DBG("All LE pending actions cleared");
-}
-
int hci_dev_do_close(struct hci_dev *hdev)
{
- bool auto_off;
+ int err;
BT_DBG("%s %p", hdev->name, hdev);
- if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) &&
- !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
- test_bit(HCI_UP, &hdev->flags)) {
- /* Execute vendor specific shutdown routine */
- if (hdev->shutdown)
- hdev->shutdown(hdev);
- }
-
- cancel_delayed_work(&hdev->power_off);
-
- hci_request_cancel_all(hdev);
hci_req_sync_lock(hdev);
- if (!test_and_clear_bit(HCI_UP, &hdev->flags)) {
- cancel_delayed_work_sync(&hdev->cmd_timer);
- hci_req_sync_unlock(hdev);
- return 0;
- }
-
- hci_leds_update_powered(hdev, false);
-
- /* Flush RX and TX works */
- flush_work(&hdev->tx_work);
- flush_work(&hdev->rx_work);
-
- if (hdev->discov_timeout > 0) {
- hdev->discov_timeout = 0;
- hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
- hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
- }
-
- if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE))
- cancel_delayed_work(&hdev->service_cache);
-
- if (hci_dev_test_flag(hdev, HCI_MGMT)) {
- struct adv_info *adv_instance;
-
- cancel_delayed_work_sync(&hdev->rpa_expired);
-
- list_for_each_entry(adv_instance, &hdev->adv_instances, list)
- cancel_delayed_work_sync(&adv_instance->rpa_expired_cb);
- }
-
- /* Avoid potential lockdep warnings from the *_flush() calls by
- * ensuring the workqueue is empty up front.
- */
- drain_workqueue(hdev->workqueue);
-
- hci_dev_lock(hdev);
-
- hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
-
- auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF);
-
- if (!auto_off && hdev->dev_type == HCI_PRIMARY &&
- !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
- hci_dev_test_flag(hdev, HCI_MGMT))
- __mgmt_power_off(hdev);
-
- hci_inquiry_cache_flush(hdev);
- hci_pend_le_actions_clear(hdev);
- hci_conn_hash_flush(hdev);
- hci_dev_unlock(hdev);
-
- smp_unregister(hdev);
-
- hci_sock_dev_event(hdev, HCI_DEV_DOWN);
-
- if (hdev->flush)
- hdev->flush(hdev);
-
- /* Reset device */
- skb_queue_purge(&hdev->cmd_q);
- atomic_set(&hdev->cmd_cnt, 1);
- if (test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks) &&
- !auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
- set_bit(HCI_INIT, &hdev->flags);
- __hci_req_sync(hdev, hci_reset_req, 0, HCI_CMD_TIMEOUT, NULL);
- clear_bit(HCI_INIT, &hdev->flags);
- }
-
- /* flush cmd work */
- flush_work(&hdev->cmd_work);
-
- /* Drop queues */
- skb_queue_purge(&hdev->rx_q);
- skb_queue_purge(&hdev->cmd_q);
- skb_queue_purge(&hdev->raw_q);
-
- /* Drop last sent command */
- if (hdev->sent_cmd) {
- cancel_delayed_work_sync(&hdev->cmd_timer);
- kfree_skb(hdev->sent_cmd);
- hdev->sent_cmd = NULL;
- }
-
- clear_bit(HCI_RUNNING, &hdev->flags);
- hci_sock_dev_event(hdev, HCI_DEV_CLOSE);
-
- /* After this point our queues are empty
- * and no tasks are scheduled. */
- hdev->close(hdev);
-
- /* Clear flags */
- hdev->flags &= BIT(HCI_RAW);
- hci_dev_clear_volatile_flags(hdev);
-
- /* Controller radio is available but is currently powered down */
- hdev->amp_status = AMP_STATUS_POWERED_DOWN;
-
- memset(hdev->eir, 0, sizeof(hdev->eir));
- memset(hdev->dev_class, 0, sizeof(hdev->dev_class));
- bacpy(&hdev->random_addr, BDADDR_ANY);
+ err = hci_dev_close_sync(hdev);
hci_req_sync_unlock(hdev);
- hci_dev_put(hdev);
- return 0;
+ return err;
}
int hci_dev_close(__u16 dev)
@@ -1736,6 +519,7 @@ int hci_dev_close(__u16 dev)
goto done;
}
+ cancel_work_sync(&hdev->power_on);
if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF))
cancel_delayed_work(&hdev->power_off);
@@ -1758,6 +542,20 @@ static int hci_dev_do_reset(struct hci_dev *hdev)
skb_queue_purge(&hdev->rx_q);
skb_queue_purge(&hdev->cmd_q);
+ /* Cancel these to avoid queueing non-chained pending work */
+ hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE);
+ /* Wait for
+ *
+ * if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE))
+ * queue_delayed_work(&hdev->{cmd,ncmd}_timer)
+ *
+ * inside RCU section to see the flag or complete scheduling.
+ */
+ synchronize_rcu();
+ /* Explicitly cancel works in case scheduled after setting the flag. */
+ cancel_delayed_work(&hdev->cmd_timer);
+ cancel_delayed_work(&hdev->ncmd_timer);
+
/* Avoid potential lockdep warnings from the *_flush() calls by
* ensuring the workqueue is empty up front.
*/
@@ -1771,10 +569,15 @@ static int hci_dev_do_reset(struct hci_dev *hdev)
if (hdev->flush)
hdev->flush(hdev);
+ hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE);
+
atomic_set(&hdev->cmd_cnt, 1);
- hdev->acl_cnt = 0; hdev->sco_cnt = 0; hdev->le_cnt = 0;
+ hdev->acl_cnt = 0;
+ hdev->sco_cnt = 0;
+ hdev->le_cnt = 0;
+ hdev->iso_cnt = 0;
- ret = __hci_req_sync(hdev, hci_reset_req, 0, HCI_INIT_TIMEOUT, NULL);
+ ret = hci_reset_sync(hdev);
hci_req_sync_unlock(hdev);
return ret;
@@ -1783,9 +586,9 @@ static int hci_dev_do_reset(struct hci_dev *hdev)
int hci_dev_reset(__u16 dev)
{
struct hci_dev *hdev;
- int err;
+ int err, srcu_index;
- hdev = hci_dev_get(dev);
+ hdev = hci_dev_get_srcu(dev, &srcu_index);
if (!hdev)
return -ENODEV;
@@ -1807,7 +610,7 @@ int hci_dev_reset(__u16 dev)
err = hci_dev_do_reset(hdev);
done:
- hci_dev_put(hdev);
+ hci_dev_put_srcu(hdev, srcu_index);
return err;
}
@@ -1837,7 +640,7 @@ done:
return ret;
}
-static void hci_update_scan_state(struct hci_dev *hdev, u8 scan)
+static void hci_update_passive_scan_state(struct hci_dev *hdev, u8 scan)
{
bool conn_changed, discov_changed;
@@ -1867,7 +670,7 @@ static void hci_update_scan_state(struct hci_dev *hdev, u8 scan)
hci_dev_set_flag(hdev, HCI_BREDR_ENABLED);
if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
- hci_req_update_adv_data(hdev, hdev->cur_adv_instance);
+ hci_update_adv_data(hdev, hdev->cur_adv_instance);
mgmt_new_settings(hdev);
}
@@ -1877,6 +680,7 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg)
{
struct hci_dev *hdev;
struct hci_dev_req dr;
+ __le16 policy;
int err = 0;
if (copy_from_user(&dr, arg, sizeof(dr)))
@@ -1896,11 +700,6 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg)
goto done;
}
- if (hdev->dev_type != HCI_PRIMARY) {
- err = -EOPNOTSUPP;
- goto done;
- }
-
if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
err = -EOPNOTSUPP;
goto done;
@@ -1908,8 +707,8 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg)
switch (cmd) {
case HCISETAUTH:
- err = hci_req_sync(hdev, hci_auth_req, dr.dev_opt,
- HCI_INIT_TIMEOUT, NULL);
+ err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE,
+ 1, &dr.dev_opt, HCI_CMD_TIMEOUT);
break;
case HCISETENCRYPT:
@@ -1920,30 +719,34 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg)
if (!test_bit(HCI_AUTH, &hdev->flags)) {
/* Auth must be enabled first */
- err = hci_req_sync(hdev, hci_auth_req, dr.dev_opt,
- HCI_INIT_TIMEOUT, NULL);
+ err = hci_cmd_sync_status(hdev,
+ HCI_OP_WRITE_AUTH_ENABLE,
+ 1, &dr.dev_opt,
+ HCI_CMD_TIMEOUT);
if (err)
break;
}
- err = hci_req_sync(hdev, hci_encrypt_req, dr.dev_opt,
- HCI_INIT_TIMEOUT, NULL);
+ err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_ENCRYPT_MODE,
+ 1, &dr.dev_opt, HCI_CMD_TIMEOUT);
break;
case HCISETSCAN:
- err = hci_req_sync(hdev, hci_scan_req, dr.dev_opt,
- HCI_INIT_TIMEOUT, NULL);
+ err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_SCAN_ENABLE,
+ 1, &dr.dev_opt, HCI_CMD_TIMEOUT);
/* Ensure that the connectable and discoverable states
* get correctly modified as this was a non-mgmt change.
*/
if (!err)
- hci_update_scan_state(hdev, dr.dev_opt);
+ hci_update_passive_scan_state(hdev, dr.dev_opt);
break;
case HCISETLINKPOL:
- err = hci_req_sync(hdev, hci_linkpol_req, dr.dev_opt,
- HCI_INIT_TIMEOUT, NULL);
+ policy = cpu_to_le16(dr.dev_opt);
+
+ err = hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_LINK_POLICY,
+ 2, &policy, HCI_CMD_TIMEOUT);
break;
case HCISETLINKMODE:
@@ -1984,7 +787,7 @@ int hci_get_dev_list(void __user *arg)
struct hci_dev *hdev;
struct hci_dev_list_req *dl;
struct hci_dev_req *dr;
- int n = 0, size, err;
+ int n = 0, err;
__u16 dev_num;
if (get_user(dev_num, (__u16 __user *) arg))
@@ -1993,12 +796,11 @@ int hci_get_dev_list(void __user *arg)
if (!dev_num || dev_num > (PAGE_SIZE * 2) / sizeof(*dr))
return -EINVAL;
- size = sizeof(*dl) + dev_num * sizeof(*dr);
-
- dl = kzalloc(size, GFP_KERNEL);
+ dl = kzalloc(struct_size(dl, dev_req, dev_num), GFP_KERNEL);
if (!dl)
return -ENOMEM;
+ dl->dev_num = dev_num;
dr = dl->dev_req;
read_lock(&hci_dev_list_lock);
@@ -2012,8 +814,8 @@ int hci_get_dev_list(void __user *arg)
if (hci_dev_test_flag(hdev, HCI_AUTO_OFF))
flags &= ~BIT(HCI_UP);
- (dr + n)->dev_id = hdev->id;
- (dr + n)->dev_opt = flags;
+ dr[n].dev_id = hdev->id;
+ dr[n].dev_opt = flags;
if (++n >= dev_num)
break;
@@ -2021,9 +823,7 @@ int hci_get_dev_list(void __user *arg)
read_unlock(&hci_dev_list_lock);
dl->dev_num = n;
- size = sizeof(*dl) + n * sizeof(*dr);
-
- err = copy_to_user(arg, dl, size);
+ err = copy_to_user(arg, dl, struct_size(dl, dev_req, n));
kfree(dl);
return err ? -EFAULT : 0;
@@ -2052,9 +852,9 @@ int hci_get_dev_info(void __user *arg)
else
flags = hdev->flags;
- strcpy(di.name, hdev->name);
+ strscpy(di.name, hdev->name, sizeof(di.name));
di.bdaddr = hdev->bdaddr;
- di.type = (hdev->bus & 0x0f) | ((hdev->dev_type & 0x03) << 4);
+ di.type = (hdev->bus & 0x0f);
di.flags = flags;
di.pkt_type = hdev->pkt_type;
if (lmp_bredr_capable(hdev)) {
@@ -2084,20 +884,51 @@ int hci_get_dev_info(void __user *arg)
/* ---- Interface to HCI drivers ---- */
+static int hci_dev_do_poweroff(struct hci_dev *hdev)
+{
+ int err;
+
+ BT_DBG("%s %p", hdev->name, hdev);
+
+ hci_req_sync_lock(hdev);
+
+ err = hci_set_powered_sync(hdev, false);
+
+ hci_req_sync_unlock(hdev);
+
+ return err;
+}
+
static int hci_rfkill_set_block(void *data, bool blocked)
{
struct hci_dev *hdev = data;
+ int err;
BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked);
if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL))
return -EBUSY;
+ if (blocked == hci_dev_test_flag(hdev, HCI_RFKILLED))
+ return 0;
+
if (blocked) {
hci_dev_set_flag(hdev, HCI_RFKILLED);
+
if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
- !hci_dev_test_flag(hdev, HCI_CONFIG))
- hci_dev_do_close(hdev);
+ !hci_dev_test_flag(hdev, HCI_CONFIG)) {
+ err = hci_dev_do_poweroff(hdev);
+ if (err) {
+ bt_dev_err(hdev, "Error when powering off device on rfkill (%d)",
+ err);
+
+ /* Make sure the device is still closed even if
+ * anything during power off sequence (eg.
+ * disconnecting devices) failed.
+ */
+ hci_dev_do_close(hdev);
+ }
+ }
} else {
hci_dev_clear_flag(hdev, HCI_RFKILLED);
}
@@ -2120,9 +951,7 @@ static void hci_power_on(struct work_struct *work)
hci_dev_test_flag(hdev, HCI_MGMT) &&
hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) {
cancel_delayed_work(&hdev->power_off);
- hci_req_sync_lock(hdev);
- err = __hci_req_hci_power_on(hdev);
- hci_req_sync_unlock(hdev);
+ err = hci_powered_update_sync(hdev);
mgmt_power_on(hdev, err);
return;
}
@@ -2141,8 +970,7 @@ static void hci_power_on(struct work_struct *work)
*/
if (hci_dev_test_flag(hdev, HCI_RFKILLED) ||
hci_dev_test_flag(hdev, HCI_UNCONFIGURED) ||
- (hdev->dev_type == HCI_PRIMARY &&
- !bacmp(&hdev->bdaddr, BDADDR_ANY) &&
+ (!bacmp(&hdev->bdaddr, BDADDR_ANY) &&
!bacmp(&hdev->static_addr, BDADDR_ANY))) {
hci_dev_clear_flag(hdev, HCI_AUTO_OFF);
hci_dev_do_close(hdev);
@@ -2195,6 +1023,7 @@ static void hci_error_reset(struct work_struct *work)
{
struct hci_dev *hdev = container_of(work, struct hci_dev, error_reset);
+ hci_dev_hold(hdev);
BT_DBG("%s", hdev->name);
if (hdev->hw_error)
@@ -2202,10 +1031,10 @@ static void hci_error_reset(struct work_struct *work)
else
bt_dev_err(hdev, "hardware error 0x%2.2x", hdev->hw_error_code);
- if (hci_dev_do_close(hdev))
- return;
+ if (!hci_dev_do_close(hdev))
+ hci_dev_do_open(hdev);
- hci_dev_do_open(hdev);
+ hci_dev_put(hdev);
}
void hci_uuids_clear(struct hci_dev *hdev)
@@ -2220,9 +1049,9 @@ void hci_uuids_clear(struct hci_dev *hdev)
void hci_link_keys_clear(struct hci_dev *hdev)
{
- struct link_key *key;
+ struct link_key *key, *tmp;
- list_for_each_entry_rcu(key, &hdev->link_keys, list) {
+ list_for_each_entry_safe(key, tmp, &hdev->link_keys, list) {
list_del_rcu(&key->list);
kfree_rcu(key, rcu);
}
@@ -2230,9 +1059,9 @@ void hci_link_keys_clear(struct hci_dev *hdev)
void hci_smp_ltks_clear(struct hci_dev *hdev)
{
- struct smp_ltk *k;
+ struct smp_ltk *k, *tmp;
- list_for_each_entry_rcu(k, &hdev->long_term_keys, list) {
+ list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) {
list_del_rcu(&k->list);
kfree_rcu(k, rcu);
}
@@ -2240,14 +1069,41 @@ void hci_smp_ltks_clear(struct hci_dev *hdev)
void hci_smp_irks_clear(struct hci_dev *hdev)
{
- struct smp_irk *k;
+ struct smp_irk *k, *tmp;
- list_for_each_entry_rcu(k, &hdev->identity_resolving_keys, list) {
+ list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) {
list_del_rcu(&k->list);
kfree_rcu(k, rcu);
}
}
+void hci_blocked_keys_clear(struct hci_dev *hdev)
+{
+ struct blocked_key *b, *tmp;
+
+ list_for_each_entry_safe(b, tmp, &hdev->blocked_keys, list) {
+ list_del_rcu(&b->list);
+ kfree_rcu(b, rcu);
+ }
+}
+
+bool hci_is_blocked_key(struct hci_dev *hdev, u8 type, u8 val[16])
+{
+ bool blocked = false;
+ struct blocked_key *b;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(b, &hdev->blocked_keys, list) {
+ if (b->type == type && !memcmp(b->val, val, sizeof(b->val))) {
+ blocked = true;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+ return blocked;
+}
+
struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr)
{
struct link_key *k;
@@ -2256,6 +1112,16 @@ struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr)
list_for_each_entry_rcu(k, &hdev->link_keys, list) {
if (bacmp(bdaddr, &k->bdaddr) == 0) {
rcu_read_unlock();
+
+ if (hci_is_blocked_key(hdev,
+ HCI_BLOCKED_KEY_TYPE_LINKKEY,
+ k->val)) {
+ bt_dev_warn_ratelimited(hdev,
+ "Link key blocked for %pMR",
+ &k->bdaddr);
+ return NULL;
+ }
+
return k;
}
}
@@ -2324,6 +1190,15 @@ struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr,
if (smp_ltk_is_sc(k) || ltk_role(k->type) == role) {
rcu_read_unlock();
+
+ if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LTK,
+ k->val)) {
+ bt_dev_warn_ratelimited(hdev,
+ "LTK blocked for %pMR",
+ &k->bdaddr);
+ return NULL;
+ }
+
return k;
}
}
@@ -2334,31 +1209,42 @@ struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr,
struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa)
{
+ struct smp_irk *irk_to_return = NULL;
struct smp_irk *irk;
rcu_read_lock();
list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) {
if (!bacmp(&irk->rpa, rpa)) {
- rcu_read_unlock();
- return irk;
+ irk_to_return = irk;
+ goto done;
}
}
list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) {
if (smp_irk_matches(hdev, irk->val, rpa)) {
bacpy(&irk->rpa, rpa);
- rcu_read_unlock();
- return irk;
+ irk_to_return = irk;
+ goto done;
}
}
+
+done:
+ if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK,
+ irk_to_return->val)) {
+ bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR",
+ &irk_to_return->bdaddr);
+ irk_to_return = NULL;
+ }
+
rcu_read_unlock();
- return NULL;
+ return irk_to_return;
}
struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr,
u8 addr_type)
{
+ struct smp_irk *irk_to_return = NULL;
struct smp_irk *irk;
/* Identity Address must be public or static random */
@@ -2369,13 +1255,21 @@ struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr,
list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) {
if (addr_type == irk->addr_type &&
bacmp(bdaddr, &irk->bdaddr) == 0) {
- rcu_read_unlock();
- return irk;
+ irk_to_return = irk;
+ break;
}
}
+
+ if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK,
+ irk_to_return->val)) {
+ bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR",
+ &irk_to_return->bdaddr);
+ irk_to_return = NULL;
+ }
+
rcu_read_unlock();
- return NULL;
+ return irk_to_return;
}
struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn,
@@ -2495,10 +1389,10 @@ int hci_remove_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr)
int hci_remove_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type)
{
- struct smp_ltk *k;
+ struct smp_ltk *k, *tmp;
int removed = 0;
- list_for_each_entry_rcu(k, &hdev->long_term_keys, list) {
+ list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) {
if (bacmp(bdaddr, &k->bdaddr) || k->bdaddr_type != bdaddr_type)
continue;
@@ -2514,9 +1408,9 @@ int hci_remove_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type)
void hci_remove_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type)
{
- struct smp_irk *k;
+ struct smp_irk *k, *tmp;
- list_for_each_entry_rcu(k, &hdev->identity_resolving_keys, list) {
+ list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) {
if (bacmp(bdaddr, &k->bdaddr) || k->addr_type != addr_type)
continue;
@@ -2569,19 +1463,41 @@ static void hci_cmd_timeout(struct work_struct *work)
struct hci_dev *hdev = container_of(work, struct hci_dev,
cmd_timer.work);
- if (hdev->sent_cmd) {
- struct hci_command_hdr *sent = (void *) hdev->sent_cmd->data;
- u16 opcode = __le16_to_cpu(sent->opcode);
+ if (hdev->req_skb) {
+ u16 opcode = hci_skb_opcode(hdev->req_skb);
bt_dev_err(hdev, "command 0x%4.4x tx timeout", opcode);
+
+ hci_cmd_sync_cancel_sync(hdev, ETIMEDOUT);
} else {
bt_dev_err(hdev, "command tx timeout");
}
+ if (hdev->reset)
+ hdev->reset(hdev);
+
atomic_set(&hdev->cmd_cnt, 1);
queue_work(hdev->workqueue, &hdev->cmd_work);
}
+/* HCI ncmd timer function */
+static void hci_ncmd_timeout(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ ncmd_timer.work);
+
+ bt_dev_err(hdev, "Controller not accepting commands anymore: ncmd = 0");
+
+ /* During HCI_INIT phase no events can be injected if the ncmd timer
+ * triggers since the procedure has its own timeout handling.
+ */
+ if (test_bit(HCI_INIT, &hdev->flags))
+ return;
+
+ /* This is an irrecoverable state, inject hardware error event */
+ hci_reset_dev(hdev);
+}
+
struct oob_data *hci_find_remote_oob_data(struct hci_dev *hdev,
bdaddr_t *bdaddr, u8 bdaddr_type)
{
@@ -2685,6 +1601,19 @@ struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance)
}
/* This function requires the caller holds hdev->lock */
+struct adv_info *hci_find_adv_sid(struct hci_dev *hdev, u8 sid)
+{
+ struct adv_info *adv;
+
+ list_for_each_entry(adv, &hdev->adv_instances, list) {
+ if (adv->sid == sid)
+ return adv;
+ }
+
+ return NULL;
+}
+
+/* This function requires the caller holds hdev->lock */
struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance)
{
struct adv_info *cur_instance;
@@ -2744,12 +1673,12 @@ void hci_adv_instances_clear(struct hci_dev *hdev)
struct adv_info *adv_instance, *n;
if (hdev->adv_instance_timeout) {
- cancel_delayed_work(&hdev->adv_instance_expire);
+ disable_delayed_work(&hdev->adv_instance_expire);
hdev->adv_instance_timeout = 0;
}
list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) {
- cancel_delayed_work_sync(&adv_instance->rpa_expired_cb);
+ disable_delayed_work_sync(&adv_instance->rpa_expired_cb);
list_del(&adv_instance->list);
kfree(adv_instance);
}
@@ -2769,63 +1698,353 @@ static void adv_instance_rpa_expired(struct work_struct *work)
}
/* This function requires the caller holds hdev->lock */
-int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
- u16 adv_data_len, u8 *adv_data,
- u16 scan_rsp_len, u8 *scan_rsp_data,
- u16 timeout, u16 duration)
-{
- struct adv_info *adv_instance;
-
- adv_instance = hci_find_adv_instance(hdev, instance);
- if (adv_instance) {
- memset(adv_instance->adv_data, 0,
- sizeof(adv_instance->adv_data));
- memset(adv_instance->scan_rsp_data, 0,
- sizeof(adv_instance->scan_rsp_data));
+struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance,
+ u32 flags, u16 adv_data_len, u8 *adv_data,
+ u16 scan_rsp_len, u8 *scan_rsp_data,
+ u16 timeout, u16 duration, s8 tx_power,
+ u32 min_interval, u32 max_interval,
+ u8 mesh_handle)
+{
+ struct adv_info *adv;
+
+ adv = hci_find_adv_instance(hdev, instance);
+ if (adv) {
+ memset(adv->adv_data, 0, sizeof(adv->adv_data));
+ memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data));
+ memset(adv->per_adv_data, 0, sizeof(adv->per_adv_data));
} else {
- if (hdev->adv_instance_cnt >= HCI_MAX_ADV_INSTANCES ||
- instance < 1 || instance > HCI_MAX_ADV_INSTANCES)
- return -EOVERFLOW;
+ if (hdev->adv_instance_cnt >= hdev->le_num_of_adv_sets ||
+ instance < 1 || instance > hdev->le_num_of_adv_sets + 1)
+ return ERR_PTR(-EOVERFLOW);
- adv_instance = kzalloc(sizeof(*adv_instance), GFP_KERNEL);
- if (!adv_instance)
- return -ENOMEM;
+ adv = kzalloc(sizeof(*adv), GFP_KERNEL);
+ if (!adv)
+ return ERR_PTR(-ENOMEM);
+
+ adv->pending = true;
+ adv->instance = instance;
+
+ /* If controller support only one set and the instance is set to
+ * 1 then there is no option other than using handle 0x00.
+ */
+ if (hdev->le_num_of_adv_sets == 1 && instance == 1)
+ adv->handle = 0x00;
+ else
+ adv->handle = instance;
- adv_instance->pending = true;
- adv_instance->instance = instance;
- list_add(&adv_instance->list, &hdev->adv_instances);
+ list_add(&adv->list, &hdev->adv_instances);
hdev->adv_instance_cnt++;
}
- adv_instance->flags = flags;
- adv_instance->adv_data_len = adv_data_len;
- adv_instance->scan_rsp_len = scan_rsp_len;
-
- if (adv_data_len)
- memcpy(adv_instance->adv_data, adv_data, adv_data_len);
+ adv->flags = flags;
+ adv->min_interval = min_interval;
+ adv->max_interval = max_interval;
+ adv->tx_power = tx_power;
+ /* Defining a mesh_handle changes the timing units to ms,
+ * rather than seconds, and ties the instance to the requested
+ * mesh_tx queue.
+ */
+ adv->mesh = mesh_handle;
- if (scan_rsp_len)
- memcpy(adv_instance->scan_rsp_data,
- scan_rsp_data, scan_rsp_len);
+ hci_set_adv_instance_data(hdev, instance, adv_data_len, adv_data,
+ scan_rsp_len, scan_rsp_data);
- adv_instance->timeout = timeout;
- adv_instance->remaining_time = timeout;
+ adv->timeout = timeout;
+ adv->remaining_time = timeout;
if (duration == 0)
- adv_instance->duration = HCI_DEFAULT_ADV_DURATION;
+ adv->duration = hdev->def_multi_adv_rotation_duration;
else
- adv_instance->duration = duration;
-
- adv_instance->tx_power = HCI_TX_POWER_INVALID;
+ adv->duration = duration;
- INIT_DELAYED_WORK(&adv_instance->rpa_expired_cb,
- adv_instance_rpa_expired);
+ INIT_DELAYED_WORK(&adv->rpa_expired_cb, adv_instance_rpa_expired);
BT_DBG("%s for %dMR", hdev->name, instance);
+ return adv;
+}
+
+/* This function requires the caller holds hdev->lock */
+struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u8 sid,
+ u32 flags, u8 data_len, u8 *data,
+ u32 min_interval, u32 max_interval)
+{
+ struct adv_info *adv;
+
+ adv = hci_add_adv_instance(hdev, instance, flags, 0, NULL, 0, NULL,
+ 0, 0, HCI_ADV_TX_POWER_NO_PREFERENCE,
+ min_interval, max_interval, 0);
+ if (IS_ERR(adv))
+ return adv;
+
+ adv->sid = sid;
+ adv->periodic = true;
+ adv->per_adv_data_len = data_len;
+
+ if (data)
+ memcpy(adv->per_adv_data, data, data_len);
+
+ return adv;
+}
+
+/* This function requires the caller holds hdev->lock */
+int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance,
+ u16 adv_data_len, u8 *adv_data,
+ u16 scan_rsp_len, u8 *scan_rsp_data)
+{
+ struct adv_info *adv;
+
+ adv = hci_find_adv_instance(hdev, instance);
+
+ /* If advertisement doesn't exist, we can't modify its data */
+ if (!adv)
+ return -ENOENT;
+
+ if (adv_data_len && ADV_DATA_CMP(adv, adv_data, adv_data_len)) {
+ memset(adv->adv_data, 0, sizeof(adv->adv_data));
+ memcpy(adv->adv_data, adv_data, adv_data_len);
+ adv->adv_data_len = adv_data_len;
+ adv->adv_data_changed = true;
+ }
+
+ if (scan_rsp_len && SCAN_RSP_CMP(adv, scan_rsp_data, scan_rsp_len)) {
+ memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data));
+ memcpy(adv->scan_rsp_data, scan_rsp_data, scan_rsp_len);
+ adv->scan_rsp_len = scan_rsp_len;
+ adv->scan_rsp_changed = true;
+ }
+
+ /* Mark as changed if there are flags which would affect it */
+ if (((adv->flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) ||
+ adv->flags & MGMT_ADV_FLAG_LOCAL_NAME)
+ adv->scan_rsp_changed = true;
+
return 0;
}
+/* This function requires the caller holds hdev->lock */
+u32 hci_adv_instance_flags(struct hci_dev *hdev, u8 instance)
+{
+ u32 flags;
+ struct adv_info *adv;
+
+ if (instance == 0x00) {
+ /* Instance 0 always manages the "Tx Power" and "Flags"
+ * fields
+ */
+ flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS;
+
+ /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting
+ * corresponds to the "connectable" instance flag.
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE))
+ flags |= MGMT_ADV_FLAG_CONNECTABLE;
+
+ if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE))
+ flags |= MGMT_ADV_FLAG_LIMITED_DISCOV;
+ else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
+ flags |= MGMT_ADV_FLAG_DISCOV;
+
+ return flags;
+ }
+
+ adv = hci_find_adv_instance(hdev, instance);
+
+ /* Return 0 when we got an invalid instance identifier. */
+ if (!adv)
+ return 0;
+
+ return adv->flags;
+}
+
+bool hci_adv_instance_is_scannable(struct hci_dev *hdev, u8 instance)
+{
+ struct adv_info *adv;
+
+ /* Instance 0x00 always set local name */
+ if (instance == 0x00)
+ return true;
+
+ adv = hci_find_adv_instance(hdev, instance);
+ if (!adv)
+ return false;
+
+ if (adv->flags & MGMT_ADV_FLAG_APPEARANCE ||
+ adv->flags & MGMT_ADV_FLAG_LOCAL_NAME)
+ return true;
+
+ return adv->scan_rsp_len ? true : false;
+}
+
+/* This function requires the caller holds hdev->lock */
+void hci_adv_monitors_clear(struct hci_dev *hdev)
+{
+ struct adv_monitor *monitor;
+ int handle;
+
+ idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle)
+ hci_free_adv_monitor(hdev, monitor);
+
+ idr_destroy(&hdev->adv_monitors_idr);
+}
+
+/* Frees the monitor structure and do some bookkeepings.
+ * This function requires the caller holds hdev->lock.
+ */
+void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor)
+{
+ struct adv_pattern *pattern;
+ struct adv_pattern *tmp;
+
+ if (!monitor)
+ return;
+
+ list_for_each_entry_safe(pattern, tmp, &monitor->patterns, list) {
+ list_del(&pattern->list);
+ kfree(pattern);
+ }
+
+ if (monitor->handle)
+ idr_remove(&hdev->adv_monitors_idr, monitor->handle);
+
+ if (monitor->state != ADV_MONITOR_STATE_NOT_REGISTERED)
+ hdev->adv_monitors_cnt--;
+
+ kfree(monitor);
+}
+
+/* Assigns handle to a monitor, and if offloading is supported and power is on,
+ * also attempts to forward the request to the controller.
+ * This function requires the caller holds hci_req_sync_lock.
+ */
+int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor)
+{
+ int min, max, handle;
+ int status = 0;
+
+ if (!monitor)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+
+ min = HCI_MIN_ADV_MONITOR_HANDLE;
+ max = HCI_MIN_ADV_MONITOR_HANDLE + HCI_MAX_ADV_MONITOR_NUM_HANDLES;
+ handle = idr_alloc(&hdev->adv_monitors_idr, monitor, min, max,
+ GFP_KERNEL);
+
+ hci_dev_unlock(hdev);
+
+ if (handle < 0)
+ return handle;
+
+ monitor->handle = handle;
+
+ if (!hdev_is_powered(hdev))
+ return status;
+
+ switch (hci_get_adv_monitor_offload_ext(hdev)) {
+ case HCI_ADV_MONITOR_EXT_NONE:
+ bt_dev_dbg(hdev, "add monitor %d status %d",
+ monitor->handle, status);
+ /* Message was not forwarded to controller - not an error */
+ break;
+
+ case HCI_ADV_MONITOR_EXT_MSFT:
+ status = msft_add_monitor_pattern(hdev, monitor);
+ bt_dev_dbg(hdev, "add monitor %d msft status %d",
+ handle, status);
+ break;
+ }
+
+ return status;
+}
+
+/* Attempts to tell the controller and free the monitor. If somehow the
+ * controller doesn't have a corresponding handle, remove anyway.
+ * This function requires the caller holds hci_req_sync_lock.
+ */
+static int hci_remove_adv_monitor(struct hci_dev *hdev,
+ struct adv_monitor *monitor)
+{
+ int status = 0;
+ int handle;
+
+ switch (hci_get_adv_monitor_offload_ext(hdev)) {
+ case HCI_ADV_MONITOR_EXT_NONE: /* also goes here when powered off */
+ bt_dev_dbg(hdev, "remove monitor %d status %d",
+ monitor->handle, status);
+ goto free_monitor;
+
+ case HCI_ADV_MONITOR_EXT_MSFT:
+ handle = monitor->handle;
+ status = msft_remove_monitor(hdev, monitor);
+ bt_dev_dbg(hdev, "remove monitor %d msft status %d",
+ handle, status);
+ break;
+ }
+
+ /* In case no matching handle registered, just free the monitor */
+ if (status == -ENOENT)
+ goto free_monitor;
+
+ return status;
+
+free_monitor:
+ if (status == -ENOENT)
+ bt_dev_warn(hdev, "Removing monitor with no matching handle %d",
+ monitor->handle);
+ hci_free_adv_monitor(hdev, monitor);
+
+ return status;
+}
+
+/* This function requires the caller holds hci_req_sync_lock */
+int hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle)
+{
+ struct adv_monitor *monitor = idr_find(&hdev->adv_monitors_idr, handle);
+
+ if (!monitor)
+ return -EINVAL;
+
+ return hci_remove_adv_monitor(hdev, monitor);
+}
+
+/* This function requires the caller holds hci_req_sync_lock */
+int hci_remove_all_adv_monitor(struct hci_dev *hdev)
+{
+ struct adv_monitor *monitor;
+ int idr_next_id = 0;
+ int status = 0;
+
+ while (1) {
+ monitor = idr_get_next(&hdev->adv_monitors_idr, &idr_next_id);
+ if (!monitor)
+ break;
+
+ status = hci_remove_adv_monitor(hdev, monitor);
+ if (status)
+ return status;
+
+ idr_next_id++;
+ }
+
+ return status;
+}
+
+/* This function requires the caller holds hdev->lock */
+bool hci_is_adv_monitoring(struct hci_dev *hdev)
+{
+ return !idr_is_empty(&hdev->adv_monitors_idr);
+}
+
+int hci_get_adv_monitor_offload_ext(struct hci_dev *hdev)
+{
+ if (msft_monitor_supported(hdev))
+ return HCI_ADV_MONITOR_EXT_MSFT;
+
+ return HCI_ADV_MONITOR_EXT_NONE;
+}
+
struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list,
bdaddr_t *bdaddr, u8 type)
{
@@ -2839,6 +2058,34 @@ struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list,
return NULL;
}
+struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk(
+ struct list_head *bdaddr_list, bdaddr_t *bdaddr,
+ u8 type)
+{
+ struct bdaddr_list_with_irk *b;
+
+ list_for_each_entry(b, bdaddr_list, list) {
+ if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type)
+ return b;
+ }
+
+ return NULL;
+}
+
+struct bdaddr_list_with_flags *
+hci_bdaddr_list_lookup_with_flags(struct list_head *bdaddr_list,
+ bdaddr_t *bdaddr, u8 type)
+{
+ struct bdaddr_list_with_flags *b;
+
+ list_for_each_entry(b, bdaddr_list, list) {
+ if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type)
+ return b;
+ }
+
+ return NULL;
+}
+
void hci_bdaddr_list_clear(struct list_head *bdaddr_list)
{
struct bdaddr_list *b, *n;
@@ -2871,6 +2118,59 @@ int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type)
return 0;
}
+int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr,
+ u8 type, u8 *peer_irk, u8 *local_irk)
+{
+ struct bdaddr_list_with_irk *entry;
+
+ if (!bacmp(bdaddr, BDADDR_ANY))
+ return -EBADF;
+
+ if (hci_bdaddr_list_lookup(list, bdaddr, type))
+ return -EEXIST;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ bacpy(&entry->bdaddr, bdaddr);
+ entry->bdaddr_type = type;
+
+ if (peer_irk)
+ memcpy(entry->peer_irk, peer_irk, 16);
+
+ if (local_irk)
+ memcpy(entry->local_irk, local_irk, 16);
+
+ list_add(&entry->list, list);
+
+ return 0;
+}
+
+int hci_bdaddr_list_add_with_flags(struct list_head *list, bdaddr_t *bdaddr,
+ u8 type, u32 flags)
+{
+ struct bdaddr_list_with_flags *entry;
+
+ if (!bacmp(bdaddr, BDADDR_ANY))
+ return -EBADF;
+
+ if (hci_bdaddr_list_lookup(list, bdaddr, type))
+ return -EEXIST;
+
+ entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ bacpy(&entry->bdaddr, bdaddr);
+ entry->bdaddr_type = type;
+ entry->flags = flags;
+
+ list_add(&entry->list, list);
+
+ return 0;
+}
+
int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type)
{
struct bdaddr_list *entry;
@@ -2890,6 +2190,26 @@ int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type)
return 0;
}
+int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr,
+ u8 type)
+{
+ struct bdaddr_list_with_irk *entry;
+
+ if (!bacmp(bdaddr, BDADDR_ANY)) {
+ hci_bdaddr_list_clear(list);
+ return 0;
+ }
+
+ entry = hci_bdaddr_list_lookup_with_irk(list, bdaddr, type);
+ if (!entry)
+ return -ENOENT;
+
+ list_del(&entry->list);
+ kfree(entry);
+
+ return 0;
+}
+
/* This function requires the caller holds hdev->lock */
struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev,
bdaddr_t *addr, u8 addr_type)
@@ -2906,22 +2226,46 @@ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev,
return NULL;
}
-/* This function requires the caller holds hdev->lock */
+/* This function requires the caller holds hdev->lock or rcu_read_lock */
struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list,
bdaddr_t *addr, u8 addr_type)
{
struct hci_conn_params *param;
- list_for_each_entry(param, list, action) {
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(param, list, action) {
if (bacmp(&param->addr, addr) == 0 &&
- param->addr_type == addr_type)
+ param->addr_type == addr_type) {
+ rcu_read_unlock();
return param;
+ }
}
+ rcu_read_unlock();
+
return NULL;
}
/* This function requires the caller holds hdev->lock */
+void hci_pend_le_list_del_init(struct hci_conn_params *param)
+{
+ if (list_empty(&param->action))
+ return;
+
+ list_del_rcu(&param->action);
+ synchronize_rcu();
+ INIT_LIST_HEAD(&param->action);
+}
+
+/* This function requires the caller holds hdev->lock */
+void hci_pend_le_list_add(struct hci_conn_params *param,
+ struct list_head *list)
+{
+ list_add_rcu(&param->action, list);
+}
+
+/* This function requires the caller holds hdev->lock */
struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev,
bdaddr_t *addr, u8 addr_type)
{
@@ -2954,14 +2298,15 @@ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev,
return params;
}
-static void hci_conn_params_free(struct hci_conn_params *params)
+void hci_conn_params_free(struct hci_conn_params *params)
{
+ hci_pend_le_list_del_init(params);
+
if (params->conn) {
hci_conn_drop(params->conn);
hci_conn_put(params->conn);
}
- list_del(&params->action);
list_del(&params->list);
kfree(params);
}
@@ -2977,7 +2322,7 @@ void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type)
hci_conn_params_free(params);
- hci_update_background_scan(hdev);
+ hci_update_passive_scan(hdev);
BT_DBG("addr %pMR (type %u)", addr, addr_type);
}
@@ -2991,7 +2336,7 @@ void hci_conn_params_clear_disabled(struct hci_dev *hdev)
if (params->auto_connect != HCI_AUTO_CONN_DISABLED)
continue;
- /* If trying to estabilish one time connection to disabled
+ /* If trying to establish one time connection to disabled
* device, leave the params, but mark them as just once.
*/
if (params->explicit_connect) {
@@ -2999,8 +2344,7 @@ void hci_conn_params_clear_disabled(struct hci_dev *hdev)
continue;
}
- list_del(&params->list);
- kfree(params);
+ hci_conn_params_free(params);
}
BT_DBG("All LE disabled connection parameters were removed");
@@ -3045,15 +2389,71 @@ void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr,
}
}
+static void hci_clear_wake_reason(struct hci_dev *hdev)
+{
+ hci_dev_lock(hdev);
+
+ hdev->wake_reason = 0;
+ bacpy(&hdev->wake_addr, BDADDR_ANY);
+ hdev->wake_addr_type = 0;
+
+ hci_dev_unlock(hdev);
+}
+
+static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action,
+ void *data)
+{
+ struct hci_dev *hdev =
+ container_of(nb, struct hci_dev, suspend_notifier);
+ int ret = 0;
+
+ /* Userspace has full control of this device. Do nothing. */
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL))
+ return NOTIFY_DONE;
+
+ /* To avoid a potential race with hci_unregister_dev. */
+ hci_dev_hold(hdev);
+
+ switch (action) {
+ case PM_HIBERNATION_PREPARE:
+ case PM_SUSPEND_PREPARE:
+ ret = hci_suspend_dev(hdev);
+ break;
+ case PM_POST_HIBERNATION:
+ case PM_POST_SUSPEND:
+ ret = hci_resume_dev(hdev);
+ break;
+ }
+
+ if (ret)
+ bt_dev_err(hdev, "Suspend notifier action (%lu) failed: %d",
+ action, ret);
+
+ hci_dev_put(hdev);
+ return NOTIFY_DONE;
+}
+
/* Alloc HCI device */
-struct hci_dev *hci_alloc_dev(void)
+struct hci_dev *hci_alloc_dev_priv(int sizeof_priv)
{
struct hci_dev *hdev;
+ unsigned int alloc_size;
- hdev = kzalloc(sizeof(*hdev), GFP_KERNEL);
+ alloc_size = sizeof(*hdev);
+ if (sizeof_priv) {
+ /* Fixme: May need ALIGN-ment? */
+ alloc_size += sizeof_priv;
+ }
+
+ hdev = kzalloc(alloc_size, GFP_KERNEL);
if (!hdev)
return NULL;
+ if (init_srcu_struct(&hdev->srcu)) {
+ kfree(hdev);
+ return NULL;
+ }
+
hdev->pkt_type = (HCI_DM1 | HCI_DH1 | HCI_HV1);
hdev->esco_type = (ESCO_HV1);
hdev->link_mode = (HCI_LM_ACCEPT);
@@ -3066,14 +2466,26 @@ struct hci_dev *hci_alloc_dev(void)
hdev->cur_adv_instance = 0x00;
hdev->adv_instance_timeout = 0;
+ hdev->advmon_allowlist_duration = 300;
+ hdev->advmon_no_filter_duration = 500;
+ hdev->enable_advmon_interleave_scan = 0x00; /* Default to disable */
+
hdev->sniff_max_interval = 800;
hdev->sniff_min_interval = 80;
hdev->le_adv_channel_map = 0x07;
hdev->le_adv_min_interval = 0x0800;
hdev->le_adv_max_interval = 0x0800;
- hdev->le_scan_interval = 0x0060;
- hdev->le_scan_window = 0x0030;
+ hdev->le_scan_interval = DISCOV_LE_SCAN_INT_FAST;
+ hdev->le_scan_window = DISCOV_LE_SCAN_WIN_FAST;
+ hdev->le_scan_int_suspend = DISCOV_LE_SCAN_INT_SLOW1;
+ hdev->le_scan_window_suspend = DISCOV_LE_SCAN_WIN_SLOW1;
+ hdev->le_scan_int_discovery = DISCOV_LE_SCAN_INT;
+ hdev->le_scan_window_discovery = DISCOV_LE_SCAN_WIN;
+ hdev->le_scan_int_adv_monitor = DISCOV_LE_SCAN_INT_FAST;
+ hdev->le_scan_window_adv_monitor = DISCOV_LE_SCAN_WIN_FAST;
+ hdev->le_scan_int_connect = DISCOV_LE_SCAN_INT_CONN;
+ hdev->le_scan_window_connect = DISCOV_LE_SCAN_WIN_CONN;
hdev->le_conn_min_interval = 0x0018;
hdev->le_conn_max_interval = 0x0028;
hdev->le_conn_latency = 0x0000;
@@ -3084,39 +2496,62 @@ struct hci_dev *hci_alloc_dev(void)
hdev->le_max_tx_time = 0x0148;
hdev->le_max_rx_len = 0x001b;
hdev->le_max_rx_time = 0x0148;
+ hdev->le_max_key_size = SMP_MAX_ENC_KEY_SIZE;
+ hdev->le_min_key_size = SMP_MIN_ENC_KEY_SIZE;
hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M;
hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M;
+ hdev->le_num_of_adv_sets = HCI_MAX_ADV_INSTANCES;
+ hdev->def_multi_adv_rotation_duration = HCI_DEFAULT_ADV_DURATION;
+ hdev->def_le_autoconnect_timeout = HCI_LE_CONN_TIMEOUT;
+ hdev->min_le_tx_power = HCI_TX_POWER_INVALID;
+ hdev->max_le_tx_power = HCI_TX_POWER_INVALID;
hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT;
hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT;
hdev->conn_info_min_age = DEFAULT_CONN_INFO_MIN_AGE;
hdev->conn_info_max_age = DEFAULT_CONN_INFO_MAX_AGE;
+ hdev->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT;
+ hdev->min_enc_key_size = HCI_MIN_ENC_KEY_SIZE;
+
+ /* default 1.28 sec page scan */
+ hdev->def_page_scan_type = PAGE_SCAN_TYPE_STANDARD;
+ hdev->def_page_scan_int = 0x0800;
+ hdev->def_page_scan_window = 0x0012;
mutex_init(&hdev->lock);
mutex_init(&hdev->req_lock);
+ mutex_init(&hdev->mgmt_pending_lock);
+
+ ida_init(&hdev->unset_handle_ida);
+ INIT_LIST_HEAD(&hdev->mesh_pending);
INIT_LIST_HEAD(&hdev->mgmt_pending);
- INIT_LIST_HEAD(&hdev->blacklist);
- INIT_LIST_HEAD(&hdev->whitelist);
+ INIT_LIST_HEAD(&hdev->reject_list);
+ INIT_LIST_HEAD(&hdev->accept_list);
INIT_LIST_HEAD(&hdev->uuids);
INIT_LIST_HEAD(&hdev->link_keys);
INIT_LIST_HEAD(&hdev->long_term_keys);
INIT_LIST_HEAD(&hdev->identity_resolving_keys);
INIT_LIST_HEAD(&hdev->remote_oob_data);
- INIT_LIST_HEAD(&hdev->le_white_list);
+ INIT_LIST_HEAD(&hdev->le_accept_list);
INIT_LIST_HEAD(&hdev->le_resolv_list);
INIT_LIST_HEAD(&hdev->le_conn_params);
INIT_LIST_HEAD(&hdev->pend_le_conns);
INIT_LIST_HEAD(&hdev->pend_le_reports);
INIT_LIST_HEAD(&hdev->conn_hash.list);
INIT_LIST_HEAD(&hdev->adv_instances);
+ INIT_LIST_HEAD(&hdev->blocked_keys);
+ INIT_LIST_HEAD(&hdev->monitored_devices);
+ INIT_LIST_HEAD(&hdev->local_codecs);
INIT_WORK(&hdev->rx_work, hci_rx_work);
INIT_WORK(&hdev->cmd_work, hci_cmd_work);
INIT_WORK(&hdev->tx_work, hci_tx_work);
INIT_WORK(&hdev->power_on, hci_power_on);
INIT_WORK(&hdev->error_reset, hci_error_reset);
+ hci_cmd_sync_init(hdev);
+
INIT_DELAYED_WORK(&hdev->power_off, hci_power_off);
skb_queue_head_init(&hdev->rx_q);
@@ -3126,15 +2561,16 @@ struct hci_dev *hci_alloc_dev(void)
init_waitqueue_head(&hdev->req_wait_q);
INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout);
+ INIT_DELAYED_WORK(&hdev->ncmd_timer, hci_ncmd_timeout);
- hci_request_setup(hdev);
+ hci_devcd_setup(hdev);
hci_init_sysfs(hdev);
discovery_init(hdev);
return hdev;
}
-EXPORT_SYMBOL(hci_alloc_dev);
+EXPORT_SYMBOL(hci_alloc_dev_priv);
/* Free HCI device */
void hci_free_dev(struct hci_dev *hdev)
@@ -3152,24 +2588,15 @@ int hci_register_dev(struct hci_dev *hdev)
if (!hdev->open || !hdev->close || !hdev->send)
return -EINVAL;
- /* Do not allow HCI_AMP devices to register at index 0,
- * so the index can be used as the AMP controller ID.
- */
- switch (hdev->dev_type) {
- case HCI_PRIMARY:
- id = ida_simple_get(&hci_index_ida, 0, 0, GFP_KERNEL);
- break;
- case HCI_AMP:
- id = ida_simple_get(&hci_index_ida, 1, 0, GFP_KERNEL);
- break;
- default:
- return -EINVAL;
- }
-
+ id = ida_alloc_max(&hci_index_ida, HCI_MAX_ID - 1, GFP_KERNEL);
if (id < 0)
return id;
- sprintf(hdev->name, "hci%d", id);
+ error = dev_set_name(&hdev->dev, "hci%u", id);
+ if (error)
+ return error;
+
+ hdev->name = dev_name(&hdev->dev);
hdev->id = id;
BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus);
@@ -3191,8 +2618,6 @@ int hci_register_dev(struct hci_dev *hdev)
if (!IS_ERR_OR_NULL(bt_debugfs))
hdev->debugfs = debugfs_create_dir(hdev->name, bt_debugfs);
- dev_set_name(&hdev->dev, "%s", hdev->name);
-
error = device_add(&hdev->dev);
if (error < 0)
goto err_wqueue;
@@ -3215,12 +2640,10 @@ int hci_register_dev(struct hci_dev *hdev)
hci_dev_set_flag(hdev, HCI_SETUP);
hci_dev_set_flag(hdev, HCI_AUTO_OFF);
- if (hdev->dev_type == HCI_PRIMARY) {
- /* Assume BR/EDR support until proven otherwise (such as
- * through reading supported features during init.
- */
- hci_dev_set_flag(hdev, HCI_BREDR_ENABLED);
- }
+ /* Assume BR/EDR support until proven otherwise (such as
+ * through reading supported features during init.
+ */
+ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED);
write_lock(&hci_dev_list_lock);
list_add(&hdev->list, &hci_dev_list);
@@ -3229,21 +2652,35 @@ int hci_register_dev(struct hci_dev *hdev)
/* Devices that are marked for raw-only usage are unconfigured
* and should not be included in normal operation.
*/
- if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
+ if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE))
hci_dev_set_flag(hdev, HCI_UNCONFIGURED);
+ /* Mark Remote Wakeup connection flag as supported if driver has wakeup
+ * callback.
+ */
+ if (hdev->wakeup)
+ hdev->conn_flags |= HCI_CONN_FLAG_REMOTE_WAKEUP;
+
hci_sock_dev_event(hdev, HCI_DEV_REG);
hci_dev_hold(hdev);
+ error = hci_register_suspend_notifier(hdev);
+ if (error)
+ BT_WARN("register suspend notifier failed error:%d\n", error);
+
queue_work(hdev->req_workqueue, &hdev->power_on);
+ idr_init(&hdev->adv_monitors_idr);
+ msft_register(hdev);
+
return id;
err_wqueue:
+ debugfs_remove_recursive(hdev->debugfs);
destroy_workqueue(hdev->workqueue);
destroy_workqueue(hdev->req_workqueue);
err:
- ida_simple_remove(&hci_index_ida, hdev->id);
+ ida_free(&hci_index_ida, hdev->id);
return error;
}
@@ -3252,19 +2689,28 @@ EXPORT_SYMBOL(hci_register_dev);
/* Unregister HCI device */
void hci_unregister_dev(struct hci_dev *hdev)
{
- int id;
-
BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus);
+ mutex_lock(&hdev->unregister_lock);
hci_dev_set_flag(hdev, HCI_UNREGISTER);
-
- id = hdev->id;
+ mutex_unlock(&hdev->unregister_lock);
write_lock(&hci_dev_list_lock);
list_del(&hdev->list);
write_unlock(&hci_dev_list_lock);
- cancel_work_sync(&hdev->power_on);
+ synchronize_srcu(&hdev->srcu);
+ cleanup_srcu_struct(&hdev->srcu);
+
+ disable_work_sync(&hdev->rx_work);
+ disable_work_sync(&hdev->cmd_work);
+ disable_work_sync(&hdev->tx_work);
+ disable_work_sync(&hdev->power_on);
+ disable_work_sync(&hdev->error_reset);
+
+ hci_cmd_sync_clear(hdev);
+
+ hci_unregister_suspend_notifier(hdev);
hci_dev_do_close(hdev);
@@ -3288,7 +2734,14 @@ void hci_unregister_dev(struct hci_dev *hdev)
}
device_del(&hdev->dev);
+ /* Actual cleanup is deferred until hci_release_dev(). */
+ hci_dev_put(hdev);
+}
+EXPORT_SYMBOL(hci_unregister_dev);
+/* Release HCI device */
+void hci_release_dev(struct hci_dev *hdev)
+{
debugfs_remove_recursive(hdev->debugfs);
kfree_const(hdev->hw_info);
kfree_const(hdev->fw_info);
@@ -3297,46 +2750,145 @@ void hci_unregister_dev(struct hci_dev *hdev)
destroy_workqueue(hdev->req_workqueue);
hci_dev_lock(hdev);
- hci_bdaddr_list_clear(&hdev->blacklist);
- hci_bdaddr_list_clear(&hdev->whitelist);
+ hci_bdaddr_list_clear(&hdev->reject_list);
+ hci_bdaddr_list_clear(&hdev->accept_list);
hci_uuids_clear(hdev);
hci_link_keys_clear(hdev);
hci_smp_ltks_clear(hdev);
hci_smp_irks_clear(hdev);
hci_remote_oob_data_clear(hdev);
hci_adv_instances_clear(hdev);
- hci_bdaddr_list_clear(&hdev->le_white_list);
+ hci_adv_monitors_clear(hdev);
+ hci_bdaddr_list_clear(&hdev->le_accept_list);
hci_bdaddr_list_clear(&hdev->le_resolv_list);
hci_conn_params_clear_all(hdev);
hci_discovery_filter_clear(hdev);
+ hci_blocked_keys_clear(hdev);
+ hci_codec_list_clear(&hdev->local_codecs);
+ msft_release(hdev);
hci_dev_unlock(hdev);
- hci_dev_put(hdev);
+ ida_destroy(&hdev->unset_handle_ida);
+ ida_free(&hci_index_ida, hdev->id);
+ kfree_skb(hdev->sent_cmd);
+ kfree_skb(hdev->req_skb);
+ kfree_skb(hdev->recv_event);
+ kfree(hdev);
+}
+EXPORT_SYMBOL(hci_release_dev);
+
+int hci_register_suspend_notifier(struct hci_dev *hdev)
+{
+ int ret = 0;
+
+ if (!hdev->suspend_notifier.notifier_call &&
+ !hci_test_quirk(hdev, HCI_QUIRK_NO_SUSPEND_NOTIFIER)) {
+ hdev->suspend_notifier.notifier_call = hci_suspend_notifier;
+ ret = register_pm_notifier(&hdev->suspend_notifier);
+ }
- ida_simple_remove(&hci_index_ida, id);
+ return ret;
+}
+
+int hci_unregister_suspend_notifier(struct hci_dev *hdev)
+{
+ int ret = 0;
+
+ if (hdev->suspend_notifier.notifier_call) {
+ ret = unregister_pm_notifier(&hdev->suspend_notifier);
+ if (!ret)
+ hdev->suspend_notifier.notifier_call = NULL;
+ }
+
+ return ret;
+}
+
+/* Cancel ongoing command synchronously:
+ *
+ * - Cancel command timer
+ * - Reset command counter
+ * - Cancel command request
+ */
+static void hci_cancel_cmd_sync(struct hci_dev *hdev, int err)
+{
+ bt_dev_dbg(hdev, "err 0x%2.2x", err);
+
+ if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
+ disable_delayed_work_sync(&hdev->cmd_timer);
+ disable_delayed_work_sync(&hdev->ncmd_timer);
+ } else {
+ cancel_delayed_work_sync(&hdev->cmd_timer);
+ cancel_delayed_work_sync(&hdev->ncmd_timer);
+ }
+
+ atomic_set(&hdev->cmd_cnt, 1);
+
+ hci_cmd_sync_cancel_sync(hdev, err);
}
-EXPORT_SYMBOL(hci_unregister_dev);
/* Suspend HCI device */
int hci_suspend_dev(struct hci_dev *hdev)
{
+ int ret;
+
+ bt_dev_dbg(hdev, "");
+
+ /* Suspend should only act when the device is powered. */
+ if (!hdev_is_powered(hdev) ||
+ hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ return 0;
+
+ /* If powering down don't attempt to suspend */
+ if (mgmt_powering_down(hdev))
+ return 0;
+
+ /* Cancel potentially blocking sync operation before suspend */
+ hci_cancel_cmd_sync(hdev, EHOSTDOWN);
+
+ hci_req_sync_lock(hdev);
+ ret = hci_suspend_sync(hdev);
+ hci_req_sync_unlock(hdev);
+
+ hci_clear_wake_reason(hdev);
+ mgmt_suspending(hdev, hdev->suspend_state);
+
hci_sock_dev_event(hdev, HCI_DEV_SUSPEND);
- return 0;
+ return ret;
}
EXPORT_SYMBOL(hci_suspend_dev);
/* Resume HCI device */
int hci_resume_dev(struct hci_dev *hdev)
{
+ int ret;
+
+ bt_dev_dbg(hdev, "");
+
+ /* Resume should only act when the device is powered. */
+ if (!hdev_is_powered(hdev) ||
+ hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ return 0;
+
+ /* If powering down don't attempt to resume */
+ if (mgmt_powering_down(hdev))
+ return 0;
+
+ hci_req_sync_lock(hdev);
+ ret = hci_resume_sync(hdev);
+ hci_req_sync_unlock(hdev);
+
+ mgmt_resuming(hdev, hdev->wake_reason, &hdev->wake_addr,
+ hdev->wake_addr_type);
+
hci_sock_dev_event(hdev, HCI_DEV_RESUME);
- return 0;
+ return ret;
}
EXPORT_SYMBOL(hci_resume_dev);
/* Reset HCI device */
int hci_reset_dev(struct hci_dev *hdev)
{
- const u8 hw_err[] = { HCI_EV_HARDWARE_ERROR, 0x01, 0x00 };
+ static const u8 hw_err[] = { HCI_EV_HARDWARE_ERROR, 0x01, 0x00 };
struct sk_buff *skb;
skb = bt_skb_alloc(3, GFP_ATOMIC);
@@ -3346,23 +2898,62 @@ int hci_reset_dev(struct hci_dev *hdev)
hci_skb_pkt_type(skb) = HCI_EVENT_PKT;
skb_put_data(skb, hw_err, 3);
+ bt_dev_err(hdev, "Injecting HCI hardware error event");
+
/* Send Hardware Error to upper stack */
return hci_recv_frame(hdev, skb);
}
EXPORT_SYMBOL(hci_reset_dev);
+static u8 hci_dev_classify_pkt_type(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ if (hdev->classify_pkt_type)
+ return hdev->classify_pkt_type(hdev, skb);
+
+ return hci_skb_pkt_type(skb);
+}
+
/* Receive frame from HCI drivers */
int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb)
{
+ u8 dev_pkt_type;
+
if (!hdev || (!test_bit(HCI_UP, &hdev->flags)
&& !test_bit(HCI_INIT, &hdev->flags))) {
kfree_skb(skb);
return -ENXIO;
}
- if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT &&
- hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
- hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) {
+ /* Check if the driver agrees with the packet type classification */
+ dev_pkt_type = hci_dev_classify_pkt_type(hdev, skb);
+ if (hci_skb_pkt_type(skb) != dev_pkt_type) {
+ hci_skb_pkt_type(skb) = dev_pkt_type;
+ }
+
+ switch (hci_skb_pkt_type(skb)) {
+ case HCI_EVENT_PKT:
+ break;
+ case HCI_ACLDATA_PKT:
+ /* Detect if ISO packet has been sent as ACL */
+ if (hci_conn_num(hdev, CIS_LINK) ||
+ hci_conn_num(hdev, BIS_LINK) ||
+ hci_conn_num(hdev, PA_LINK)) {
+ __u16 handle = __le16_to_cpu(hci_acl_hdr(skb)->handle);
+ __u8 type;
+
+ type = hci_conn_lookup_type(hdev, hci_handle(handle));
+ if (type == CIS_LINK || type == BIS_LINK ||
+ type == PA_LINK)
+ hci_skb_pkt_type(skb) = HCI_ISODATA_PKT;
+ }
+ break;
+ case HCI_SCODATA_PKT:
+ break;
+ case HCI_ISODATA_PKT:
+ break;
+ case HCI_DRV_PKT:
+ break;
+ default:
kfree_skb(skb);
return -EINVAL;
}
@@ -3444,7 +3035,7 @@ int hci_unregister_cb(struct hci_cb *cb)
}
EXPORT_SYMBOL(hci_unregister_cb);
-static void hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb)
+static int hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb)
{
int err;
@@ -3467,14 +3058,33 @@ static void hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb)
if (!test_bit(HCI_RUNNING, &hdev->flags)) {
kfree_skb(skb);
- return;
+ return -EINVAL;
+ }
+
+ if (hci_skb_pkt_type(skb) == HCI_DRV_PKT) {
+ /* Intercept HCI Drv packet here and don't go with hdev->send
+ * callback.
+ */
+ err = hci_drv_process_cmd(hdev, skb);
+ kfree_skb(skb);
+ return err;
}
err = hdev->send(hdev, skb);
if (err < 0) {
bt_dev_err(hdev, "sending frame failed (%d)", err);
kfree_skb(skb);
+ return err;
}
+
+ return 0;
+}
+
+static int hci_send_conn_frame(struct hci_dev *hdev, struct hci_conn *conn,
+ struct sk_buff *skb)
+{
+ hci_conn_tx_queue(conn, skb);
+ return hci_send_frame(hdev, skb);
}
/* Send HCI command */
@@ -3485,7 +3095,7 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen,
BT_DBG("%s opcode 0x%4.4x plen %d", hdev->name, opcode, plen);
- skb = hci_prepare_cmd(hdev, opcode, plen, param);
+ skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, NULL);
if (!skb) {
bt_dev_err(hdev, "no memory for command");
return -ENOMEM;
@@ -3520,7 +3130,7 @@ int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen,
return -EINVAL;
}
- skb = hci_prepare_cmd(hdev, opcode, plen, param);
+ skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, NULL);
if (!skb) {
bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)",
opcode);
@@ -3534,41 +3144,65 @@ int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen,
EXPORT_SYMBOL(__hci_cmd_send);
/* Get data from the previously sent command */
-void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode)
+static void *hci_cmd_data(struct sk_buff *skb, __u16 opcode)
{
struct hci_command_hdr *hdr;
- if (!hdev->sent_cmd)
+ if (!skb || skb->len < HCI_COMMAND_HDR_SIZE)
return NULL;
- hdr = (void *) hdev->sent_cmd->data;
+ hdr = (void *)skb->data;
if (hdr->opcode != cpu_to_le16(opcode))
return NULL;
- BT_DBG("%s opcode 0x%4.4x", hdev->name, opcode);
+ return skb->data + HCI_COMMAND_HDR_SIZE;
+}
+
+/* Get data from the previously sent command */
+void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode)
+{
+ void *data;
- return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE;
+ /* Check if opcode matches last sent command */
+ data = hci_cmd_data(hdev->sent_cmd, opcode);
+ if (!data)
+ /* Check if opcode matches last request */
+ data = hci_cmd_data(hdev->req_skb, opcode);
+
+ return data;
}
-/* Send HCI command and wait for command commplete event */
-struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen,
- const void *param, u32 timeout)
+/* Get data from last received event */
+void *hci_recv_event_data(struct hci_dev *hdev, __u8 event)
{
- struct sk_buff *skb;
+ struct hci_event_hdr *hdr;
+ int offset;
- if (!test_bit(HCI_UP, &hdev->flags))
- return ERR_PTR(-ENETDOWN);
+ if (!hdev->recv_event)
+ return NULL;
- bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen);
+ hdr = (void *)hdev->recv_event->data;
+ offset = sizeof(*hdr);
- hci_req_sync_lock(hdev);
- skb = __hci_cmd_sync(hdev, opcode, plen, param, timeout);
- hci_req_sync_unlock(hdev);
+ if (hdr->evt != event) {
+ /* In case of LE metaevent check the subevent match */
+ if (hdr->evt == HCI_EV_LE_META) {
+ struct hci_ev_le_meta *ev;
+
+ ev = (void *)hdev->recv_event->data + offset;
+ offset += sizeof(*ev);
+ if (ev->subevent == event)
+ goto found;
+ }
+ return NULL;
+ }
+
+found:
+ bt_dev_dbg(hdev, "event 0x%2.2x", event);
- return skb;
+ return hdev->recv_event->data + offset;
}
-EXPORT_SYMBOL(hci_cmd_sync);
/* Send ACL data */
static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags)
@@ -3595,17 +3229,7 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue,
hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT;
- switch (hdev->dev_type) {
- case HCI_PRIMARY:
- hci_add_acl_hdr(skb, conn->handle, flags);
- break;
- case HCI_AMP:
- hci_add_acl_hdr(skb, chan->handle, flags);
- break;
- default:
- bt_dev_err(hdev, "unknown dev_type %d", hdev->dev_type);
- return;
- }
+ hci_add_acl_hdr(skb, conn->handle, flags);
list = skb_shinfo(skb)->frag_list;
if (!list) {
@@ -3643,6 +3267,8 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue,
spin_unlock_bh(&queue->lock);
}
+
+ bt_dev_dbg(hdev, "chan %p queued %d", chan, skb_queue_len(queue));
}
void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags)
@@ -3674,12 +3300,124 @@ void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb)
hci_skb_pkt_type(skb) = HCI_SCODATA_PKT;
skb_queue_tail(&conn->data_q, skb);
+
+ bt_dev_dbg(hdev, "hcon %p queued %d", conn,
+ skb_queue_len(&conn->data_q));
+
+ queue_work(hdev->workqueue, &hdev->tx_work);
+}
+
+/* Send ISO data */
+static void hci_add_iso_hdr(struct sk_buff *skb, __u16 handle, __u8 flags)
+{
+ struct hci_iso_hdr *hdr;
+ int len = skb->len;
+
+ skb_push(skb, HCI_ISO_HDR_SIZE);
+ skb_reset_transport_header(skb);
+ hdr = (struct hci_iso_hdr *)skb_transport_header(skb);
+ hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags));
+ hdr->dlen = cpu_to_le16(len);
+}
+
+static void hci_queue_iso(struct hci_conn *conn, struct sk_buff_head *queue,
+ struct sk_buff *skb)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct sk_buff *list;
+ __u16 flags;
+
+ skb->len = skb_headlen(skb);
+ skb->data_len = 0;
+
+ hci_skb_pkt_type(skb) = HCI_ISODATA_PKT;
+
+ list = skb_shinfo(skb)->frag_list;
+
+ flags = hci_iso_flags_pack(list ? ISO_START : ISO_SINGLE, 0x00);
+ hci_add_iso_hdr(skb, conn->handle, flags);
+
+ if (!list) {
+ /* Non fragmented */
+ BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len);
+
+ skb_queue_tail(queue, skb);
+ } else {
+ /* Fragmented */
+ BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
+
+ skb_shinfo(skb)->frag_list = NULL;
+
+ __skb_queue_tail(queue, skb);
+
+ do {
+ skb = list; list = list->next;
+
+ hci_skb_pkt_type(skb) = HCI_ISODATA_PKT;
+ flags = hci_iso_flags_pack(list ? ISO_CONT : ISO_END,
+ 0x00);
+ hci_add_iso_hdr(skb, conn->handle, flags);
+
+ BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
+
+ __skb_queue_tail(queue, skb);
+ } while (list);
+ }
+
+ bt_dev_dbg(hdev, "hcon %p queued %d", conn, skb_queue_len(queue));
+}
+
+void hci_send_iso(struct hci_conn *conn, struct sk_buff *skb)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ BT_DBG("%s len %d", hdev->name, skb->len);
+
+ hci_queue_iso(conn, &conn->data_q, skb);
+
queue_work(hdev->workqueue, &hdev->tx_work);
}
/* ---- HCI TX task (outgoing data) ---- */
/* HCI Connection scheduler */
+static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote)
+{
+ struct hci_dev *hdev;
+ int cnt, q;
+
+ if (!conn) {
+ *quote = 0;
+ return;
+ }
+
+ hdev = conn->hdev;
+
+ switch (conn->type) {
+ case ACL_LINK:
+ cnt = hdev->acl_cnt;
+ break;
+ case SCO_LINK:
+ case ESCO_LINK:
+ cnt = hdev->sco_cnt;
+ break;
+ case LE_LINK:
+ cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt;
+ break;
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ cnt = hdev->iso_cnt;
+ break;
+ default:
+ cnt = 0;
+ bt_dev_err(hdev, "unknown link type %d", conn->type);
+ }
+
+ q = cnt / num;
+ *quote = q ? q : 1;
+}
+
static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type,
int *quote)
{
@@ -3693,9 +3431,14 @@ static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type,
rcu_read_lock();
list_for_each_entry_rcu(c, &h->list, list) {
- if (c->type != type || skb_queue_empty(&c->data_q))
+ if (c->type != type ||
+ skb_queue_empty(&c->data_q))
continue;
+ bt_dev_dbg(hdev, "hcon %p state %s queued %d", c,
+ state_to_string(c->state),
+ skb_queue_len(&c->data_q));
+
if (c->state != BT_CONNECTED && c->state != BT_CONFIG)
continue;
@@ -3712,29 +3455,7 @@ static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type,
rcu_read_unlock();
- if (conn) {
- int cnt, q;
-
- switch (conn->type) {
- case ACL_LINK:
- cnt = hdev->acl_cnt;
- break;
- case SCO_LINK:
- case ESCO_LINK:
- cnt = hdev->sco_cnt;
- break;
- case LE_LINK:
- cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt;
- break;
- default:
- cnt = 0;
- bt_dev_err(hdev, "unknown link type %d", conn->type);
- }
-
- q = cnt / num;
- *quote = q ? q : 1;
- } else
- *quote = 0;
+ hci_quote_sent(conn, num, quote);
BT_DBG("conn %p quote %d", conn, *quote);
return conn;
@@ -3747,10 +3468,10 @@ static void hci_link_tx_to(struct hci_dev *hdev, __u8 type)
bt_dev_err(hdev, "link tx timeout");
- rcu_read_lock();
+ hci_dev_lock(hdev);
/* Kill stalled connections */
- list_for_each_entry_rcu(c, &h->list, list) {
+ list_for_each_entry(c, &h->list, list) {
if (c->type == type && c->sent) {
bt_dev_err(hdev, "killing stalled connection %pMR",
&c->dst);
@@ -3758,7 +3479,7 @@ static void hci_link_tx_to(struct hci_dev *hdev, __u8 type)
}
}
- rcu_read_unlock();
+ hci_dev_unlock(hdev);
}
static struct hci_chan *hci_chan_sent(struct hci_dev *hdev, __u8 type,
@@ -3768,7 +3489,7 @@ static struct hci_chan *hci_chan_sent(struct hci_dev *hdev, __u8 type,
struct hci_chan *chan = NULL;
unsigned int num = 0, min = ~0, cur_prio = 0;
struct hci_conn *conn;
- int cnt, q, conn_num = 0;
+ int conn_num = 0;
BT_DBG("%s", hdev->name);
@@ -3818,27 +3539,8 @@ static struct hci_chan *hci_chan_sent(struct hci_dev *hdev, __u8 type,
if (!chan)
return NULL;
- switch (chan->conn->type) {
- case ACL_LINK:
- cnt = hdev->acl_cnt;
- break;
- case AMP_LINK:
- cnt = hdev->block_cnt;
- break;
- case SCO_LINK:
- case ESCO_LINK:
- cnt = hdev->sco_cnt;
- break;
- case LE_LINK:
- cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt;
- break;
- default:
- cnt = 0;
- bt_dev_err(hdev, "unknown link type %d", chan->conn->type);
- }
+ hci_quote_sent(chan->conn, num, quote);
- q = cnt / num;
- *quote = q ? q : 1;
BT_DBG("chan %p quote %d", chan, *quote);
return chan;
}
@@ -3893,21 +3595,82 @@ static void hci_prio_recalculate(struct hci_dev *hdev, __u8 type)
}
-static inline int __get_blocks(struct hci_dev *hdev, struct sk_buff *skb)
+static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type)
{
- /* Calculate count of blocks used by this packet */
- return DIV_ROUND_UP(skb->len - HCI_ACL_HDR_SIZE, hdev->block_len);
+ unsigned long timeout;
+
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+ return;
+
+ switch (type) {
+ case ACL_LINK:
+ /* tx timeout must be longer than maximum link supervision
+ * timeout (40.9 seconds)
+ */
+ timeout = hdev->acl_last_tx + HCI_ACL_TX_TIMEOUT;
+ break;
+ case LE_LINK:
+ /* tx timeout must be longer than maximum link supervision
+ * timeout (40.9 seconds)
+ */
+ timeout = hdev->le_last_tx + HCI_ACL_TX_TIMEOUT;
+ break;
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ /* tx timeout must be longer than the maximum transport latency
+ * (8.388607 seconds)
+ */
+ timeout = hdev->iso_last_tx + HCI_ISO_TX_TIMEOUT;
+ break;
+ default:
+ return;
+ }
+
+ if (!cnt && time_after(jiffies, timeout))
+ hci_link_tx_to(hdev, type);
}
-static void __check_timeout(struct hci_dev *hdev, unsigned int cnt)
+/* Schedule SCO */
+static void hci_sched_sco(struct hci_dev *hdev, __u8 type)
{
- if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
- /* ACL tx timeout must be longer than maximum
- * link supervision timeout (40.9 seconds) */
- if (!cnt && time_after(jiffies, hdev->acl_last_tx +
- HCI_ACL_TX_TIMEOUT))
- hci_link_tx_to(hdev, ACL_LINK);
+ struct hci_conn *conn;
+ struct sk_buff *skb;
+ int quote, *cnt;
+ unsigned int pkts = hdev->sco_pkts;
+
+ bt_dev_dbg(hdev, "type %u", type);
+
+ if (!hci_conn_num(hdev, type) || !pkts)
+ return;
+
+ /* Use sco_pkts if flow control has not been enabled which will limit
+ * the amount of buffer sent in a row.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL))
+ cnt = &pkts;
+ else
+ cnt = &hdev->sco_cnt;
+
+ while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) {
+ while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
+ BT_DBG("skb %p len %d", skb, skb->len);
+ hci_send_conn_frame(hdev, conn, skb);
+
+ conn->sent++;
+ if (conn->sent == ~0)
+ conn->sent = 0;
+ (*cnt)--;
+ }
}
+
+ /* Reschedule if all packets were sent and flow control is not enabled
+ * as there could be more packets queued that could not be sent and
+ * since no HCI_EV_NUM_COMP_PKTS event will be generated the reschedule
+ * needs to be forced.
+ */
+ if (!pkts && !hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL))
+ queue_work(hdev->workqueue, &hdev->tx_work);
}
static void hci_sched_acl_pkt(struct hci_dev *hdev)
@@ -3917,7 +3680,7 @@ static void hci_sched_acl_pkt(struct hci_dev *hdev)
struct sk_buff *skb;
int quote;
- __check_timeout(hdev, cnt);
+ __check_timeout(hdev, cnt, ACL_LINK);
while (hdev->acl_cnt &&
(chan = hci_chan_sent(hdev, ACL_LINK, &quote))) {
@@ -3935,12 +3698,16 @@ static void hci_sched_acl_pkt(struct hci_dev *hdev)
hci_conn_enter_active_mode(chan->conn,
bt_cb(skb)->force_active);
- hci_send_frame(hdev, skb);
+ hci_send_conn_frame(hdev, chan->conn, skb);
hdev->acl_last_tx = jiffies;
hdev->acl_cnt--;
chan->sent++;
chan->conn->sent++;
+
+ /* Send pending SCO packets right away */
+ hci_sched_sco(hdev, SCO_LINK);
+ hci_sched_sco(hdev, ESCO_LINK);
}
}
@@ -3948,29 +3715,36 @@ static void hci_sched_acl_pkt(struct hci_dev *hdev)
hci_prio_recalculate(hdev, ACL_LINK);
}
-static void hci_sched_acl_blk(struct hci_dev *hdev)
+static void hci_sched_acl(struct hci_dev *hdev)
+{
+ BT_DBG("%s", hdev->name);
+
+ /* No ACL link over BR/EDR controller */
+ if (!hci_conn_num(hdev, ACL_LINK))
+ return;
+
+ hci_sched_acl_pkt(hdev);
+}
+
+static void hci_sched_le(struct hci_dev *hdev)
{
- unsigned int cnt = hdev->block_cnt;
struct hci_chan *chan;
struct sk_buff *skb;
- int quote;
- u8 type;
-
- __check_timeout(hdev, cnt);
+ int quote, *cnt, tmp;
BT_DBG("%s", hdev->name);
- if (hdev->dev_type == HCI_AMP)
- type = AMP_LINK;
- else
- type = ACL_LINK;
+ if (!hci_conn_num(hdev, LE_LINK))
+ return;
- while (hdev->block_cnt > 0 &&
- (chan = hci_chan_sent(hdev, type, &quote))) {
- u32 priority = (skb_peek(&chan->data_q))->priority;
- while (quote > 0 && (skb = skb_peek(&chan->data_q))) {
- int blocks;
+ cnt = hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt;
+
+ __check_timeout(hdev, *cnt, LE_LINK);
+ tmp = *cnt;
+ while (*cnt && (chan = hci_chan_sent(hdev, LE_LINK, &quote))) {
+ u32 priority = (skb_peek(&chan->data_q))->priority;
+ while (quote-- && (skb = skb_peek(&chan->data_q))) {
BT_DBG("chan %p skb %p len %d priority %u", chan, skb,
skb->len, skb->priority);
@@ -3980,163 +3754,70 @@ static void hci_sched_acl_blk(struct hci_dev *hdev)
skb = skb_dequeue(&chan->data_q);
- blocks = __get_blocks(hdev, skb);
- if (blocks > hdev->block_cnt)
- return;
-
- hci_conn_enter_active_mode(chan->conn,
- bt_cb(skb)->force_active);
-
- hci_send_frame(hdev, skb);
- hdev->acl_last_tx = jiffies;
+ hci_send_conn_frame(hdev, chan->conn, skb);
+ hdev->le_last_tx = jiffies;
- hdev->block_cnt -= blocks;
- quote -= blocks;
+ (*cnt)--;
+ chan->sent++;
+ chan->conn->sent++;
- chan->sent += blocks;
- chan->conn->sent += blocks;
+ /* Send pending SCO packets right away */
+ hci_sched_sco(hdev, SCO_LINK);
+ hci_sched_sco(hdev, ESCO_LINK);
}
}
- if (cnt != hdev->block_cnt)
- hci_prio_recalculate(hdev, type);
-}
-
-static void hci_sched_acl(struct hci_dev *hdev)
-{
- BT_DBG("%s", hdev->name);
-
- /* No ACL link over BR/EDR controller */
- if (!hci_conn_num(hdev, ACL_LINK) && hdev->dev_type == HCI_PRIMARY)
- return;
-
- /* No AMP link over AMP controller */
- if (!hci_conn_num(hdev, AMP_LINK) && hdev->dev_type == HCI_AMP)
- return;
-
- switch (hdev->flow_ctl_mode) {
- case HCI_FLOW_CTL_MODE_PACKET_BASED:
- hci_sched_acl_pkt(hdev);
- break;
-
- case HCI_FLOW_CTL_MODE_BLOCK_BASED:
- hci_sched_acl_blk(hdev);
- break;
- }
+ if (*cnt != tmp)
+ hci_prio_recalculate(hdev, LE_LINK);
}
-/* Schedule SCO */
-static void hci_sched_sco(struct hci_dev *hdev)
+/* Schedule iso */
+static void hci_sched_iso(struct hci_dev *hdev, __u8 type)
{
struct hci_conn *conn;
struct sk_buff *skb;
- int quote;
+ int quote, *cnt;
BT_DBG("%s", hdev->name);
- if (!hci_conn_num(hdev, SCO_LINK))
+ if (!hci_conn_num(hdev, type))
return;
- while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, &quote))) {
- while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
- BT_DBG("skb %p len %d", skb, skb->len);
- hci_send_frame(hdev, skb);
+ cnt = &hdev->iso_cnt;
- conn->sent++;
- if (conn->sent == ~0)
- conn->sent = 0;
- }
- }
-}
+ __check_timeout(hdev, *cnt, type);
-static void hci_sched_esco(struct hci_dev *hdev)
-{
- struct hci_conn *conn;
- struct sk_buff *skb;
- int quote;
-
- BT_DBG("%s", hdev->name);
-
- if (!hci_conn_num(hdev, ESCO_LINK))
- return;
-
- while (hdev->sco_cnt && (conn = hci_low_sent(hdev, ESCO_LINK,
- &quote))) {
+ while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) {
while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
BT_DBG("skb %p len %d", skb, skb->len);
- hci_send_frame(hdev, skb);
+
+ hci_send_conn_frame(hdev, conn, skb);
+ hdev->iso_last_tx = jiffies;
conn->sent++;
if (conn->sent == ~0)
conn->sent = 0;
+ (*cnt)--;
}
}
}
-static void hci_sched_le(struct hci_dev *hdev)
-{
- struct hci_chan *chan;
- struct sk_buff *skb;
- int quote, cnt, tmp;
-
- BT_DBG("%s", hdev->name);
-
- if (!hci_conn_num(hdev, LE_LINK))
- return;
-
- if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
- /* LE tx timeout must be longer than maximum
- * link supervision timeout (40.9 seconds) */
- if (!hdev->le_cnt && hdev->le_pkts &&
- time_after(jiffies, hdev->le_last_tx + HZ * 45))
- hci_link_tx_to(hdev, LE_LINK);
- }
-
- cnt = hdev->le_pkts ? hdev->le_cnt : hdev->acl_cnt;
- tmp = cnt;
- while (cnt && (chan = hci_chan_sent(hdev, LE_LINK, &quote))) {
- u32 priority = (skb_peek(&chan->data_q))->priority;
- while (quote-- && (skb = skb_peek(&chan->data_q))) {
- BT_DBG("chan %p skb %p len %d priority %u", chan, skb,
- skb->len, skb->priority);
-
- /* Stop if priority has changed */
- if (skb->priority < priority)
- break;
-
- skb = skb_dequeue(&chan->data_q);
-
- hci_send_frame(hdev, skb);
- hdev->le_last_tx = jiffies;
-
- cnt--;
- chan->sent++;
- chan->conn->sent++;
- }
- }
-
- if (hdev->le_pkts)
- hdev->le_cnt = cnt;
- else
- hdev->acl_cnt = cnt;
-
- if (cnt != tmp)
- hci_prio_recalculate(hdev, LE_LINK);
-}
-
static void hci_tx_work(struct work_struct *work)
{
struct hci_dev *hdev = container_of(work, struct hci_dev, tx_work);
struct sk_buff *skb;
- BT_DBG("%s acl %d sco %d le %d", hdev->name, hdev->acl_cnt,
- hdev->sco_cnt, hdev->le_cnt);
+ BT_DBG("%s acl %d sco %d le %d iso %d", hdev->name, hdev->acl_cnt,
+ hdev->sco_cnt, hdev->le_cnt, hdev->iso_cnt);
if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
/* Schedule queues and send stuff to HCI driver */
+ hci_sched_sco(hdev, SCO_LINK);
+ hci_sched_sco(hdev, ESCO_LINK);
+ hci_sched_iso(hdev, CIS_LINK);
+ hci_sched_iso(hdev, BIS_LINK);
+ hci_sched_iso(hdev, PA_LINK);
hci_sched_acl(hdev);
- hci_sched_sco(hdev);
- hci_sched_esco(hdev);
hci_sched_le(hdev);
}
@@ -4150,68 +3831,96 @@ static void hci_tx_work(struct work_struct *work)
/* ACL data packet */
static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb)
{
- struct hci_acl_hdr *hdr = (void *) skb->data;
- struct hci_conn *conn;
+ struct hci_acl_hdr *hdr;
__u16 handle, flags;
+ int err;
- skb_pull(skb, HCI_ACL_HDR_SIZE);
+ hdr = skb_pull_data(skb, sizeof(*hdr));
+ if (!hdr) {
+ bt_dev_err(hdev, "ACL packet too small");
+ kfree_skb(skb);
+ return;
+ }
handle = __le16_to_cpu(hdr->handle);
flags = hci_flags(handle);
handle = hci_handle(handle);
- BT_DBG("%s len %d handle 0x%4.4x flags 0x%4.4x", hdev->name, skb->len,
- handle, flags);
+ bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len,
+ handle, flags);
hdev->stat.acl_rx++;
- hci_dev_lock(hdev);
- conn = hci_conn_hash_lookup_handle(hdev, handle);
- hci_dev_unlock(hdev);
-
- if (conn) {
- hci_conn_enter_active_mode(conn, BT_POWER_FORCE_ACTIVE_OFF);
-
- /* Send to upper protocol */
- l2cap_recv_acldata(conn, skb, flags);
- return;
- } else {
+ err = l2cap_recv_acldata(hdev, handle, skb, flags);
+ if (err == -ENOENT)
bt_dev_err(hdev, "ACL packet for unknown connection handle %d",
handle);
- }
-
- kfree_skb(skb);
+ else if (err)
+ bt_dev_dbg(hdev, "ACL packet recv for handle %d failed: %d",
+ handle, err);
}
/* SCO data packet */
static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
{
- struct hci_sco_hdr *hdr = (void *) skb->data;
- struct hci_conn *conn;
- __u16 handle;
+ struct hci_sco_hdr *hdr;
+ __u16 handle, flags;
+ int err;
- skb_pull(skb, HCI_SCO_HDR_SIZE);
+ hdr = skb_pull_data(skb, sizeof(*hdr));
+ if (!hdr) {
+ bt_dev_err(hdev, "SCO packet too small");
+ kfree_skb(skb);
+ return;
+ }
handle = __le16_to_cpu(hdr->handle);
+ flags = hci_flags(handle);
+ handle = hci_handle(handle);
- BT_DBG("%s len %d handle 0x%4.4x", hdev->name, skb->len, handle);
+ bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len,
+ handle, flags);
hdev->stat.sco_rx++;
- hci_dev_lock(hdev);
- conn = hci_conn_hash_lookup_handle(hdev, handle);
- hci_dev_unlock(hdev);
+ hci_skb_pkt_status(skb) = flags & 0x03;
+
+ err = sco_recv_scodata(hdev, handle, skb);
+ if (err == -ENOENT)
+ bt_dev_err_ratelimited(hdev, "SCO packet for unknown connection handle %d",
+ handle);
+ else if (err)
+ bt_dev_dbg(hdev, "SCO packet recv for handle %d failed: %d",
+ handle, err);
+}
+
+static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_iso_hdr *hdr;
+ __u16 handle, flags;
+ int err;
- if (conn) {
- /* Send to upper protocol */
- sco_recv_scodata(conn, skb);
+ hdr = skb_pull_data(skb, sizeof(*hdr));
+ if (!hdr) {
+ bt_dev_err(hdev, "ISO packet too small");
+ kfree_skb(skb);
return;
- } else {
- bt_dev_err(hdev, "SCO packet for unknown connection handle %d",
- handle);
}
- kfree_skb(skb);
+ handle = __le16_to_cpu(hdr->handle);
+ flags = hci_flags(handle);
+ handle = hci_handle(handle);
+
+ bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len,
+ handle, flags);
+
+ err = iso_recv(hdev, handle, skb, flags);
+ if (err == -ENOENT)
+ bt_dev_err(hdev, "ISO packet for unknown connection handle %d",
+ handle);
+ else if (err)
+ bt_dev_dbg(hdev, "ISO packet recv for handle %d failed: %d",
+ handle, err);
}
static bool hci_req_is_complete(struct hci_dev *hdev)
@@ -4272,23 +3981,28 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status,
return;
}
+ /* If we reach this point this event matches the last command sent */
+ hci_dev_clear_flag(hdev, HCI_CMD_PENDING);
+
/* If the command succeeded and there's still more commands in
* this request the request is not yet complete.
*/
if (!status && !hci_req_is_complete(hdev))
return;
+ skb = hdev->req_skb;
+
/* If this was the last command in a request the complete
- * callback would be found in hdev->sent_cmd instead of the
+ * callback would be found in hdev->req_skb instead of the
* command queue (hdev->cmd_q).
*/
- if (bt_cb(hdev->sent_cmd)->hci.req_flags & HCI_REQ_SKB) {
- *req_complete_skb = bt_cb(hdev->sent_cmd)->hci.req_complete_skb;
+ if (skb && bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) {
+ *req_complete_skb = bt_cb(skb)->hci.req_complete_skb;
return;
}
- if (bt_cb(hdev->sent_cmd)->hci.req_complete) {
- *req_complete = bt_cb(hdev->sent_cmd)->hci.req_complete;
+ if (skb && bt_cb(skb)->hci.req_complete) {
+ *req_complete = bt_cb(skb)->hci.req_complete;
return;
}
@@ -4304,7 +4018,7 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status,
*req_complete_skb = bt_cb(skb)->hci.req_complete_skb;
else
*req_complete = bt_cb(skb)->hci.req_complete;
- kfree_skb(skb);
+ dev_kfree_skb_irq(skb);
}
spin_unlock_irqrestore(&hdev->cmd_q.lock, flags);
}
@@ -4316,7 +4030,14 @@ static void hci_rx_work(struct work_struct *work)
BT_DBG("%s", hdev->name);
- while ((skb = skb_dequeue(&hdev->rx_q))) {
+ /* The kcov_remote functions are used to collect packet parsing
+ * coverage information from this background thread and associate
+ * the coverage with the syscall's thread which originally injected
+ * the packet. This helps with fuzzing the kernel.
+ */
+ for (; (skb = skb_dequeue(&hdev->rx_q)); kcov_remote_stop()) {
+ kcov_remote_start_common(skb_get_kcov_handle(skb));
+
/* Send copy to monitor */
hci_send_to_monitor(hdev, skb);
@@ -4325,7 +4046,14 @@ static void hci_rx_work(struct work_struct *work)
hci_send_to_sock(hdev, skb);
}
- if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ /* If the device has been opened in HCI_USER_CHANNEL,
+ * userspace has exclusive access to the device.
+ * While the device is in HCI_INIT, we still need to process
+ * the data packets to the driver in order
+ * to complete its setup().
+ */
+ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+ !test_bit(HCI_INIT, &hdev->flags)) {
kfree_skb(skb);
continue;
}
@@ -4335,6 +4063,7 @@ static void hci_rx_work(struct work_struct *work)
switch (hci_skb_pkt_type(skb)) {
case HCI_ACLDATA_PKT:
case HCI_SCODATA_PKT:
+ case HCI_ISODATA_PKT:
kfree_skb(skb);
continue;
}
@@ -4357,6 +4086,11 @@ static void hci_rx_work(struct work_struct *work)
hci_scodata_packet(hdev, skb);
break;
+ case HCI_ISODATA_PKT:
+ BT_DBG("%s ISO data packet", hdev->name);
+ hci_isodata_packet(hdev, skb);
+ break;
+
default:
kfree_skb(skb);
break;
@@ -4364,10 +4098,47 @@ static void hci_rx_work(struct work_struct *work)
}
}
+static int hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ int err;
+
+ bt_dev_dbg(hdev, "skb %p", skb);
+
+ kfree_skb(hdev->sent_cmd);
+
+ hdev->sent_cmd = skb_clone(skb, GFP_KERNEL);
+ if (!hdev->sent_cmd) {
+ skb_queue_head(&hdev->cmd_q, skb);
+ queue_work(hdev->workqueue, &hdev->cmd_work);
+ return -EINVAL;
+ }
+
+ if (hci_skb_opcode(skb) != HCI_OP_NOP) {
+ err = hci_send_frame(hdev, skb);
+ if (err < 0) {
+ hci_cmd_sync_cancel_sync(hdev, -err);
+ return err;
+ }
+ atomic_dec(&hdev->cmd_cnt);
+ } else {
+ err = -ENODATA;
+ kfree_skb(skb);
+ }
+
+ if (hdev->req_status == HCI_REQ_PEND &&
+ !hci_dev_test_and_set_flag(hdev, HCI_CMD_PENDING)) {
+ kfree_skb(hdev->req_skb);
+ hdev->req_skb = skb_clone(hdev->sent_cmd, GFP_KERNEL);
+ }
+
+ return err;
+}
+
static void hci_cmd_work(struct work_struct *work)
{
struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_work);
struct sk_buff *skb;
+ int err;
BT_DBG("%s cmd_cnt %d cmd queued %d", hdev->name,
atomic_read(&hdev->cmd_cnt), skb_queue_len(&hdev->cmd_q));
@@ -4378,20 +4149,17 @@ static void hci_cmd_work(struct work_struct *work)
if (!skb)
return;
- kfree_skb(hdev->sent_cmd);
-
- hdev->sent_cmd = skb_clone(skb, GFP_KERNEL);
- if (hdev->sent_cmd) {
- atomic_dec(&hdev->cmd_cnt);
- hci_send_frame(hdev, skb);
- if (test_bit(HCI_RESET, &hdev->flags))
- cancel_delayed_work(&hdev->cmd_timer);
- else
- schedule_delayed_work(&hdev->cmd_timer,
- HCI_CMD_TIMEOUT);
- } else {
- skb_queue_head(&hdev->cmd_q, skb);
- queue_work(hdev->workqueue, &hdev->cmd_work);
- }
+ err = hci_send_cmd_sync(hdev, skb);
+ if (err)
+ return;
+
+ rcu_read_lock();
+ if (test_bit(HCI_RESET, &hdev->flags) ||
+ hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE))
+ cancel_delayed_work(&hdev->cmd_timer);
+ else
+ queue_delayed_work(hdev->workqueue, &hdev->cmd_timer,
+ HCI_CMD_TIMEOUT);
+ rcu_read_unlock();
}
}
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index 51f5b1efc3a5..99e2e9fc70e8 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -22,10 +22,12 @@
*/
#include <linux/debugfs.h>
+#include <linux/kstrtox.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
+#include "smp.h"
#include "hci_debugfs.h"
#define DEFINE_QUIRK_ATTRIBUTE(__name, __quirk) \
@@ -36,7 +38,7 @@ static ssize_t __name ## _read(struct file *file, \
struct hci_dev *hdev = file->private_data; \
char buf[3]; \
\
- buf[0] = test_bit(__quirk, &hdev->quirks) ? 'Y' : 'N'; \
+ buf[0] = test_bit(__quirk, hdev->quirk_flags) ? 'Y' : 'N'; \
buf[1] = '\n'; \
buf[2] = '\0'; \
return simple_read_from_buffer(user_buf, count, ppos, buf, 2); \
@@ -57,10 +59,10 @@ static ssize_t __name ## _write(struct file *file, \
if (err) \
return err; \
\
- if (enable == test_bit(__quirk, &hdev->quirks)) \
+ if (enable == test_bit(__quirk, hdev->quirk_flags)) \
return -EALREADY; \
\
- change_bit(__quirk, &hdev->quirks); \
+ change_bit(__quirk, hdev->quirk_flags); \
\
return count; \
} \
@@ -124,7 +126,7 @@ static int device_list_show(struct seq_file *f, void *ptr)
struct bdaddr_list *b;
hci_dev_lock(hdev);
- list_for_each_entry(b, &hdev->whitelist, list)
+ list_for_each_entry(b, &hdev->accept_list, list)
seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type);
list_for_each_entry(p, &hdev->le_conn_params, list) {
seq_printf(f, "%pMR (type %u) %u\n", &p->addr, p->addr_type,
@@ -143,7 +145,7 @@ static int blacklist_show(struct seq_file *f, void *p)
struct bdaddr_list *b;
hci_dev_lock(hdev);
- list_for_each_entry(b, &hdev->blacklist, list)
+ list_for_each_entry(b, &hdev->reject_list, list)
seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type);
hci_dev_unlock(hdev);
@@ -152,6 +154,21 @@ static int blacklist_show(struct seq_file *f, void *p)
DEFINE_SHOW_ATTRIBUTE(blacklist);
+/* seq_file show callback behind the "blocked_keys" debugfs entry.
+ *
+ * Emits one line per element of hdev->blocked_keys: the key type as an
+ * unsigned number followed by 16 bytes of key->val printed with %*phN.
+ * Always returns 0.
+ */
+static int blocked_keys_show(struct seq_file *f, void *p)
+{
+ struct hci_dev *hdev = f->private;
+ struct blocked_key *key;
+
+ /* The list is walked with the _rcu iterator, so the traversal is
+ * bracketed by rcu_read_lock()/rcu_read_unlock() rather than
+ * hci_dev_lock().
+ */
+ rcu_read_lock();
+ list_for_each_entry_rcu(key, &hdev->blocked_keys, list)
+ seq_printf(f, "%u %*phN\n", key->type, 16, key->val);
+ rcu_read_unlock();
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(blocked_keys);
+
static int uuids_show(struct seq_file *f, void *p)
{
struct hci_dev *hdev = f->private;
@@ -172,7 +189,7 @@ static int uuids_show(struct seq_file *f, void *p)
}
hci_dev_unlock(hdev);
- return 0;
+ return 0;
}
DEFINE_SHOW_ATTRIBUTE(uuids);
@@ -200,10 +217,12 @@ static int conn_info_min_age_set(void *data, u64 val)
{
struct hci_dev *hdev = data;
- if (val == 0 || val > hdev->conn_info_max_age)
+ hci_dev_lock(hdev);
+ if (val == 0 || val > hdev->conn_info_max_age) {
+ hci_dev_unlock(hdev);
return -EINVAL;
+ }
- hci_dev_lock(hdev);
hdev->conn_info_min_age = val;
hci_dev_unlock(hdev);
@@ -221,17 +240,19 @@ static int conn_info_min_age_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(conn_info_min_age_fops, conn_info_min_age_get,
- conn_info_min_age_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(conn_info_min_age_fops, conn_info_min_age_get,
+ conn_info_min_age_set, "%llu\n");
static int conn_info_max_age_set(void *data, u64 val)
{
struct hci_dev *hdev = data;
- if (val == 0 || val < hdev->conn_info_min_age)
+ hci_dev_lock(hdev);
+ if (val == 0 || val < hdev->conn_info_min_age) {
+ hci_dev_unlock(hdev);
return -EINVAL;
+ }
- hci_dev_lock(hdev);
hdev->conn_info_max_age = val;
hci_dev_unlock(hdev);
@@ -249,8 +270,8 @@ static int conn_info_max_age_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(conn_info_max_age_fops, conn_info_max_age_get,
- conn_info_max_age_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(conn_info_max_age_fops, conn_info_max_age_get,
+ conn_info_max_age_set, "%llu\n");
static ssize_t use_debug_keys_read(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
@@ -258,7 +279,7 @@ static ssize_t use_debug_keys_read(struct file *file, char __user *user_buf,
struct hci_dev *hdev = file->private_data;
char buf[3];
- buf[0] = hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS) ? 'Y': 'N';
+ buf[0] = hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS) ? 'Y' : 'N';
buf[1] = '\n';
buf[2] = '\0';
return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
@@ -276,7 +297,7 @@ static ssize_t sc_only_mode_read(struct file *file, char __user *user_buf,
struct hci_dev *hdev = file->private_data;
char buf[3];
- buf[0] = hci_dev_test_flag(hdev, HCI_SC_ONLY) ? 'Y': 'N';
+ buf[0] = hci_dev_test_flag(hdev, HCI_SC_ONLY) ? 'Y' : 'N';
buf[1] = '\n';
buf[2] = '\0';
return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
@@ -308,6 +329,8 @@ void hci_debugfs_create_common(struct hci_dev *hdev)
&device_list_fops);
debugfs_create_file("blacklist", 0444, hdev->debugfs, hdev,
&blacklist_fops);
+ debugfs_create_file("blocked_keys", 0444, hdev->debugfs, hdev,
+ &blocked_keys_fops);
debugfs_create_file("uuids", 0444, hdev->debugfs, hdev, &uuids_fops);
debugfs_create_file("remote_oob", 0400, hdev->debugfs, hdev,
&remote_oob_fops);
@@ -401,8 +424,8 @@ static int voice_setting_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(voice_setting_fops, voice_setting_get,
- NULL, "0x%4.4llx\n");
+DEFINE_DEBUGFS_ATTRIBUTE(voice_setting_fops, voice_setting_get,
+ NULL, "0x%4.4llx\n");
static ssize_t ssp_debug_mode_read(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
@@ -410,7 +433,7 @@ static ssize_t ssp_debug_mode_read(struct file *file, char __user *user_buf,
struct hci_dev *hdev = file->private_data;
char buf[3];
- buf[0] = hdev->ssp_debug_mode ? 'Y': 'N';
+ buf[0] = hdev->ssp_debug_mode ? 'Y' : 'N';
buf[1] = '\n';
buf[2] = '\0';
return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
@@ -433,6 +456,35 @@ static int auto_accept_delay_set(void *data, u64 val)
return 0;
}
+/* debugfs setter for "min_encrypt_key_size".
+ *
+ * Accepts values in the range 1..16 and stores them in
+ * hdev->min_enc_key_size while holding hci_dev_lock(). Any other value is
+ * rejected with -EINVAL before the lock is taken. Returns 0 on success.
+ */
+static int min_encrypt_key_size_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val < 1 || val > 16)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->min_enc_key_size = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+/* debugfs getter for "min_encrypt_key_size": copies
+ * hdev->min_enc_key_size into *val under hci_dev_lock(). Always returns 0.
+ */
+static int min_encrypt_key_size_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->min_enc_key_size;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(min_encrypt_key_size_fops,
+ min_encrypt_key_size_get,
+ min_encrypt_key_size_set, "%llu\n");
+
static int auto_accept_delay_get(void *data, u64 *val)
{
struct hci_dev *hdev = data;
@@ -444,8 +496,47 @@ static int auto_accept_delay_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(auto_accept_delay_fops, auto_accept_delay_get,
- auto_accept_delay_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(auto_accept_delay_fops, auto_accept_delay_get,
+ auto_accept_delay_set, "%llu\n");
+
+/* Read handler for the "force_bredr_smp" debugfs entry.
+ *
+ * Reports the HCI_FORCE_BREDR_SMP device flag as "Y\n" or "N\n". Returns
+ * whatever simple_read_from_buffer() returns.
+ */
+static ssize_t force_bredr_smp_read(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP) ? 'Y' : 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ /* Only the first 2 bytes are offered; the NUL terminator is not. */
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+/* Write handler for the "force_bredr_smp" debugfs entry.
+ *
+ * Parses a boolean from user space and hands it to smp_force_bredr().
+ * Returns @count on success, or the error reported by
+ * kstrtobool_from_user() or smp_force_bredr().
+ */
+static ssize_t force_bredr_smp_write(struct file *file,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ bool enable;
+ int err;
+
+ err = kstrtobool_from_user(user_buf, count, &enable);
+ if (err)
+ return err;
+
+ err = smp_force_bredr(hdev, enable);
+ if (err)
+ return err;
+
+ return count;
+}
+
+/* File operations for the "force_bredr_smp" debugfs entry. */
+static const struct file_operations force_bredr_smp_fops = {
+ .open = simple_open,
+ .read = force_bredr_smp_read,
+ .write = force_bredr_smp_write,
+ .llseek = default_llseek,
+};
static int idle_timeout_set(void *data, u64 val)
{
@@ -472,17 +563,19 @@ static int idle_timeout_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(idle_timeout_fops, idle_timeout_get,
- idle_timeout_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(idle_timeout_fops, idle_timeout_get,
+ idle_timeout_set, "%llu\n");
static int sniff_min_interval_set(void *data, u64 val)
{
struct hci_dev *hdev = data;
- if (val == 0 || val % 2 || val > hdev->sniff_max_interval)
+ hci_dev_lock(hdev);
+ if (val == 0 || val % 2 || val > hdev->sniff_max_interval) {
+ hci_dev_unlock(hdev);
return -EINVAL;
+ }
- hci_dev_lock(hdev);
hdev->sniff_min_interval = val;
hci_dev_unlock(hdev);
@@ -500,17 +593,19 @@ static int sniff_min_interval_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(sniff_min_interval_fops, sniff_min_interval_get,
- sniff_min_interval_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(sniff_min_interval_fops, sniff_min_interval_get,
+ sniff_min_interval_set, "%llu\n");
static int sniff_max_interval_set(void *data, u64 val)
{
struct hci_dev *hdev = data;
- if (val == 0 || val % 2 || val < hdev->sniff_min_interval)
+ hci_dev_lock(hdev);
+ if (val == 0 || val % 2 || val < hdev->sniff_min_interval) {
+ hci_dev_unlock(hdev);
return -EINVAL;
+ }
- hci_dev_lock(hdev);
hdev->sniff_max_interval = val;
hci_dev_unlock(hdev);
@@ -528,8 +623,8 @@ static int sniff_max_interval_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(sniff_max_interval_fops, sniff_max_interval_get,
- sniff_max_interval_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(sniff_max_interval_fops, sniff_max_interval_get,
+ sniff_max_interval_set, "%llu\n");
void hci_debugfs_create_bredr(struct hci_dev *hdev)
{
@@ -542,9 +637,22 @@ void hci_debugfs_create_bredr(struct hci_dev *hdev)
debugfs_create_file("voice_setting", 0444, hdev->debugfs, hdev,
&voice_setting_fops);
+ /* If the controller does not support BR/EDR Secure Connections
+ * feature, then the BR/EDR SMP channel shall not be present.
+ *
+ * To test this with Bluetooth 4.0 controllers, create a debugfs
+ * switch that allows forcing BR/EDR SMP support and accepting
+ * cross-transport pairing on non-AES encrypted connections.
+ */
+ if (!lmp_sc_capable(hdev))
+ debugfs_create_file("force_bredr_smp", 0644, hdev->debugfs,
+ hdev, &force_bredr_smp_fops);
+
if (lmp_ssp_capable(hdev)) {
debugfs_create_file("ssp_debug_mode", 0444, hdev->debugfs,
hdev, &ssp_debug_mode_fops);
+ debugfs_create_file("min_encrypt_key_size", 0644, hdev->debugfs,
+ hdev, &min_encrypt_key_size_fops);
debugfs_create_file("auto_accept_delay", 0644, hdev->debugfs,
hdev, &auto_accept_delay_fops);
}
@@ -607,8 +715,8 @@ static int rpa_timeout_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(rpa_timeout_fops, rpa_timeout_get,
- rpa_timeout_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(rpa_timeout_fops, rpa_timeout_get,
+ rpa_timeout_set, "%llu\n");
static int random_address_show(struct seq_file *f, void *p)
{
@@ -643,7 +751,7 @@ static ssize_t force_static_address_read(struct file *file,
struct hci_dev *hdev = file->private_data;
char buf[3];
- buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ? 'Y': 'N';
+ buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ? 'Y' : 'N';
buf[1] = '\n';
buf[2] = '\0';
return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
@@ -657,7 +765,7 @@ static ssize_t force_static_address_write(struct file *file,
bool enable;
int err;
- if (test_bit(HCI_UP, &hdev->flags))
+ if (hdev_is_powered(hdev))
return -EBUSY;
err = kstrtobool_from_user(user_buf, count, &enable);
@@ -685,7 +793,7 @@ static int white_list_show(struct seq_file *f, void *ptr)
struct bdaddr_list *b;
hci_dev_lock(hdev);
- list_for_each_entry(b, &hdev->le_white_list, list)
+ list_for_each_entry(b, &hdev->le_accept_list, list)
seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type);
hci_dev_unlock(hdev);
@@ -749,10 +857,12 @@ static int conn_min_interval_set(void *data, u64 val)
{
struct hci_dev *hdev = data;
- if (val < 0x0006 || val > 0x0c80 || val > hdev->le_conn_max_interval)
+ hci_dev_lock(hdev);
+ if (val < 0x0006 || val > 0x0c80 || val > hdev->le_conn_max_interval) {
+ hci_dev_unlock(hdev);
return -EINVAL;
+ }
- hci_dev_lock(hdev);
hdev->le_conn_min_interval = val;
hci_dev_unlock(hdev);
@@ -770,17 +880,19 @@ static int conn_min_interval_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(conn_min_interval_fops, conn_min_interval_get,
- conn_min_interval_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(conn_min_interval_fops, conn_min_interval_get,
+ conn_min_interval_set, "%llu\n");
static int conn_max_interval_set(void *data, u64 val)
{
struct hci_dev *hdev = data;
- if (val < 0x0006 || val > 0x0c80 || val < hdev->le_conn_min_interval)
+ hci_dev_lock(hdev);
+ if (val < 0x0006 || val > 0x0c80 || val < hdev->le_conn_min_interval) {
+ hci_dev_unlock(hdev);
return -EINVAL;
+ }
- hci_dev_lock(hdev);
hdev->le_conn_max_interval = val;
hci_dev_unlock(hdev);
@@ -798,8 +910,8 @@ static int conn_max_interval_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(conn_max_interval_fops, conn_max_interval_get,
- conn_max_interval_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(conn_max_interval_fops, conn_max_interval_get,
+ conn_max_interval_set, "%llu\n");
static int conn_latency_set(void *data, u64 val)
{
@@ -826,8 +938,8 @@ static int conn_latency_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(conn_latency_fops, conn_latency_get,
- conn_latency_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(conn_latency_fops, conn_latency_get,
+ conn_latency_set, "%llu\n");
static int supervision_timeout_set(void *data, u64 val)
{
@@ -854,8 +966,8 @@ static int supervision_timeout_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(supervision_timeout_fops, supervision_timeout_get,
- supervision_timeout_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(supervision_timeout_fops, supervision_timeout_get,
+ supervision_timeout_set, "%llu\n");
static int adv_channel_map_set(void *data, u64 val)
{
@@ -882,17 +994,19 @@ static int adv_channel_map_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(adv_channel_map_fops, adv_channel_map_get,
- adv_channel_map_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(adv_channel_map_fops, adv_channel_map_get,
+ adv_channel_map_set, "%llu\n");
static int adv_min_interval_set(void *data, u64 val)
{
struct hci_dev *hdev = data;
- if (val < 0x0020 || val > 0x4000 || val > hdev->le_adv_max_interval)
+ hci_dev_lock(hdev);
+ if (val < 0x0020 || val > 0x4000 || val > hdev->le_adv_max_interval) {
+ hci_dev_unlock(hdev);
return -EINVAL;
+ }
- hci_dev_lock(hdev);
hdev->le_adv_min_interval = val;
hci_dev_unlock(hdev);
@@ -910,17 +1024,19 @@ static int adv_min_interval_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(adv_min_interval_fops, adv_min_interval_get,
- adv_min_interval_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(adv_min_interval_fops, adv_min_interval_get,
+ adv_min_interval_set, "%llu\n");
static int adv_max_interval_set(void *data, u64 val)
{
struct hci_dev *hdev = data;
- if (val < 0x0020 || val > 0x4000 || val < hdev->le_adv_min_interval)
+ hci_dev_lock(hdev);
+ if (val < 0x0020 || val > 0x4000 || val < hdev->le_adv_min_interval) {
+ hci_dev_unlock(hdev);
return -EINVAL;
+ }
- hci_dev_lock(hdev);
hdev->le_adv_max_interval = val;
hci_dev_unlock(hdev);
@@ -938,8 +1054,141 @@ static int adv_max_interval_get(void *data, u64 *val)
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(adv_max_interval_fops, adv_max_interval_get,
- adv_max_interval_set, "%llu\n");
+DEFINE_DEBUGFS_ATTRIBUTE(adv_max_interval_fops, adv_max_interval_get,
+ adv_max_interval_set, "%llu\n");
+
+/* debugfs setter for "min_key_size".
+ *
+ * Stores @val in hdev->le_min_key_size. Values above the current
+ * hdev->le_max_key_size or below SMP_MIN_ENC_KEY_SIZE are rejected with
+ * -EINVAL. Returns 0 on success.
+ */
+static int min_key_size_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ /* The bound check reads le_max_key_size, so it is done inside the
+ * same hci_dev_lock() section as the store.
+ */
+ hci_dev_lock(hdev);
+ if (val > hdev->le_max_key_size || val < SMP_MIN_ENC_KEY_SIZE) {
+ hci_dev_unlock(hdev);
+ return -EINVAL;
+ }
+
+ hdev->le_min_key_size = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+/* debugfs getter for "min_key_size": copies hdev->le_min_key_size into
+ * *val under hci_dev_lock(). Always returns 0.
+ */
+static int min_key_size_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_min_key_size;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(min_key_size_fops, min_key_size_get,
+ min_key_size_set, "%llu\n");
+
+/* debugfs setter for "max_key_size".
+ *
+ * Stores @val in hdev->le_max_key_size. Values above SMP_MAX_ENC_KEY_SIZE
+ * or below the current hdev->le_min_key_size are rejected with -EINVAL.
+ * Returns 0 on success.
+ */
+static int max_key_size_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ /* The bound check reads le_min_key_size, so it is done inside the
+ * same hci_dev_lock() section as the store.
+ */
+ hci_dev_lock(hdev);
+ if (val > SMP_MAX_ENC_KEY_SIZE || val < hdev->le_min_key_size) {
+ hci_dev_unlock(hdev);
+ return -EINVAL;
+ }
+
+ hdev->le_max_key_size = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+/* debugfs getter for "max_key_size": copies hdev->le_max_key_size into
+ * *val under hci_dev_lock(). Always returns 0.
+ */
+static int max_key_size_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->le_max_key_size;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(max_key_size_fops, max_key_size_get,
+ max_key_size_set, "%llu\n");
+
+/* debugfs setter for "auth_payload_timeout".
+ *
+ * Accepts values in the range 0x0001..0xffff and stores them in
+ * hdev->auth_payload_timeout under hci_dev_lock(). Any other value is
+ * rejected with -EINVAL before the lock is taken. Returns 0 on success.
+ */
+static int auth_payload_timeout_set(void *data, u64 val)
+{
+ struct hci_dev *hdev = data;
+
+ if (val < 0x0001 || val > 0xffff)
+ return -EINVAL;
+
+ hci_dev_lock(hdev);
+ hdev->auth_payload_timeout = val;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+/* debugfs getter for "auth_payload_timeout": copies
+ * hdev->auth_payload_timeout into *val under hci_dev_lock(). Always
+ * returns 0.
+ */
+static int auth_payload_timeout_get(void *data, u64 *val)
+{
+ struct hci_dev *hdev = data;
+
+ hci_dev_lock(hdev);
+ *val = hdev->auth_payload_timeout;
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(auth_payload_timeout_fops,
+ auth_payload_timeout_get,
+ auth_payload_timeout_set, "%llu\n");
+
+/* Read handler for the "force_no_mitm" debugfs entry.
+ *
+ * Reports the HCI_FORCE_NO_MITM device flag as "Y\n" or "N\n". Returns
+ * whatever simple_read_from_buffer() returns.
+ */
+static ssize_t force_no_mitm_read(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_NO_MITM) ? 'Y' : 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ /* Only the first 2 bytes are offered; the NUL terminator is not. */
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+/* Write handler for the "force_no_mitm" debugfs entry.
+ *
+ * Parses a boolean from user space with kstrtobool_from_user(), the same
+ * helper the other boolean debugfs writers in this file use, instead of an
+ * open-coded copy_from_user() + kstrtobool() pair on a stack buffer.
+ *
+ * Returns @count on success, the error from kstrtobool_from_user() when
+ * the input cannot be copied (-EFAULT) or parsed (-EINVAL), or -EALREADY
+ * when HCI_FORCE_NO_MITM already has the requested value.
+ */
+static ssize_t force_no_mitm_write(struct file *file,
+				   const char __user *user_buf,
+				   size_t count, loff_t *ppos)
+{
+	struct hci_dev *hdev = file->private_data;
+	bool enable;
+	int err;
+
+	err = kstrtobool_from_user(user_buf, count, &enable);
+	if (err)
+		return err;
+
+	/* Writing the value the flag already holds is reported, not ignored. */
+	if (enable == hci_dev_test_flag(hdev, HCI_FORCE_NO_MITM))
+		return -EALREADY;
+
+	hci_dev_change_flag(hdev, HCI_FORCE_NO_MITM);
+
+	return count;
+}
+
+/* File operations for the "force_no_mitm" debugfs entry. */
+static const struct file_operations force_no_mitm_fops = {
+ .open = simple_open,
+ .read = force_no_mitm_read,
+ .write = force_no_mitm_write,
+ .llseek = default_llseek,
+};
DEFINE_QUIRK_ATTRIBUTE(quirk_strict_duplicate_filter,
HCI_QUIRK_STRICT_DUPLICATE_FILTER);
@@ -967,7 +1216,7 @@ void hci_debugfs_create_le(struct hci_dev *hdev)
&force_static_address_fops);
debugfs_create_u8("white_list_size", 0444, hdev->debugfs,
- &hdev->le_white_list_size);
+ &hdev->le_accept_list_size);
debugfs_create_file("white_list", 0444, hdev->debugfs, hdev,
&white_list_fops);
debugfs_create_u8("resolv_list_size", 0444, hdev->debugfs,
@@ -994,6 +1243,14 @@ void hci_debugfs_create_le(struct hci_dev *hdev)
&adv_max_interval_fops);
debugfs_create_u16("discov_interleaved_timeout", 0644, hdev->debugfs,
&hdev->discov_interleaved_timeout);
+ debugfs_create_file("min_key_size", 0644, hdev->debugfs, hdev,
+ &min_key_size_fops);
+ debugfs_create_file("max_key_size", 0644, hdev->debugfs, hdev,
+ &max_key_size_fops);
+ debugfs_create_file("auth_payload_timeout", 0644, hdev->debugfs, hdev,
+ &auth_payload_timeout_fops);
+ debugfs_create_file("force_no_mitm", 0644, hdev->debugfs, hdev,
+ &force_no_mitm_fops);
debugfs_create_file("quirk_strict_duplicate_filter", 0644,
hdev->debugfs, hdev,
@@ -1008,9 +1265,131 @@ void hci_debugfs_create_conn(struct hci_conn *conn)
struct hci_dev *hdev = conn->hdev;
char name[6];
- if (IS_ERR_OR_NULL(hdev->debugfs))
+ if (IS_ERR_OR_NULL(hdev->debugfs) || conn->debugfs)
return;
snprintf(name, sizeof(name), "%u", conn->handle);
conn->debugfs = debugfs_create_dir(name, hdev->debugfs);
}
+
+/* Read handler for the "dut_mode" debugfs entry.
+ *
+ * Reports the HCI_DUT_MODE device flag as "Y\n" or "N\n". Returns
+ * whatever simple_read_from_buffer() returns.
+ */
+static ssize_t dut_mode_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_DUT_MODE) ? 'Y' : 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ /* Only the first 2 bytes are offered; the NUL terminator is not. */
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+/* Write handler for the "dut_mode" debugfs entry.
+ *
+ * Parses a boolean from user space and, if it differs from the current
+ * HCI_DUT_MODE flag, sends a synchronous HCI command to the controller and
+ * then toggles the flag.
+ *
+ * Returns @count on success, -ENETDOWN when HCI_UP is not set, the error
+ * from kstrtobool_from_user(), -EALREADY when the flag already has the
+ * requested value, or the error encoded in the pointer returned by
+ * __hci_cmd_sync().
+ */
+static ssize_t dut_mode_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ struct sk_buff *skb;
+ bool enable;
+ int err;
+
+ if (!test_bit(HCI_UP, &hdev->flags))
+ return -ENETDOWN;
+
+ err = kstrtobool_from_user(user_buf, count, &enable);
+ if (err)
+ return err;
+
+ if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE))
+ return -EALREADY;
+
+ /* Enabling sends HCI_OP_ENABLE_DUT_MODE; disabling is done by
+ * sending HCI_OP_RESET. Both go out under hci_req_sync_lock().
+ */
+ hci_req_sync_lock(hdev);
+ if (enable)
+ skb = __hci_cmd_sync(hdev, HCI_OP_ENABLE_DUT_MODE, 0, NULL,
+ HCI_CMD_TIMEOUT);
+ else
+ skb = __hci_cmd_sync(hdev, HCI_OP_RESET, 0, NULL,
+ HCI_CMD_TIMEOUT);
+ hci_req_sync_unlock(hdev);
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ /* The reply payload is not inspected, only released. */
+ kfree_skb(skb);
+
+ /* The flag is flipped only after the command came back without an
+ * error pointer.
+ */
+ hci_dev_change_flag(hdev, HCI_DUT_MODE);
+
+ return count;
+}
+
+/* File operations for the "dut_mode" debugfs entry. */
+static const struct file_operations dut_mode_fops = {
+ .open = simple_open,
+ .read = dut_mode_read,
+ .write = dut_mode_write,
+ .llseek = default_llseek,
+};
+
+/* Read handler for the "vendor_diag" debugfs entry.
+ *
+ * Reports the HCI_VENDOR_DIAG device flag as "Y\n" or "N\n". Returns
+ * whatever simple_read_from_buffer() returns.
+ */
+static ssize_t vendor_diag_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ char buf[3];
+
+ buf[0] = hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) ? 'Y' : 'N';
+ buf[1] = '\n';
+ buf[2] = '\0';
+ /* Only the first 2 bytes are offered; the NUL terminator is not. */
+ return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
+}
+
+/* Write handler for the "vendor_diag" debugfs entry.
+ *
+ * Parses a boolean from user space, optionally forwards it to the
+ * driver's set_diag() callback, and then sets or clears HCI_VENDOR_DIAG
+ * to match.
+ *
+ * Returns @count on success, the error from kstrtobool_from_user(), or a
+ * negative value returned by hdev->set_diag().
+ */
+static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct hci_dev *hdev = file->private_data;
+ bool enable;
+ int err;
+
+ err = kstrtobool_from_user(user_buf, count, &enable);
+ if (err)
+ return err;
+
+ /* When the diagnostic flags are not persistent and the transport
+ * is not active or in user channel operation, then there is no need
+ * for the vendor callback. Instead just store the desired value and
+ * the setting will be programmed when the controller gets powered on.
+ */
+ if (hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_DIAG) &&
+ (!test_bit(HCI_RUNNING, &hdev->flags) ||
+ hci_dev_test_flag(hdev, HCI_USER_CHANNEL)))
+ goto done;
+
+ /* set_diag is called without a NULL check here; the debugfs entry
+ * is only created by hci_debugfs_create_basic() when it is set.
+ */
+ hci_req_sync_lock(hdev);
+ err = hdev->set_diag(hdev, enable);
+ hci_req_sync_unlock(hdev);
+
+ if (err < 0)
+ return err;
+
+done:
+ if (enable)
+ hci_dev_set_flag(hdev, HCI_VENDOR_DIAG);
+ else
+ hci_dev_clear_flag(hdev, HCI_VENDOR_DIAG);
+
+ return count;
+}
+
+/* File operations for the "vendor_diag" debugfs entry. */
+static const struct file_operations vendor_diag_fops = {
+ .open = simple_open,
+ .read = vendor_diag_read,
+ .write = vendor_diag_write,
+ .llseek = default_llseek,
+};
+
+/* Create the basic debugfs entries under hdev->debugfs.
+ *
+ * "dut_mode" is always created; "vendor_diag" is created only when the
+ * driver provides a set_diag callback.
+ */
+void hci_debugfs_create_basic(struct hci_dev *hdev)
+{
+ debugfs_create_file("dut_mode", 0644, hdev->debugfs, hdev,
+ &dut_mode_fops);
+
+ if (hdev->set_diag)
+ debugfs_create_file("vendor_diag", 0644, hdev->debugfs, hdev,
+ &vendor_diag_fops);
+}
diff --git a/net/bluetooth/hci_debugfs.h b/net/bluetooth/hci_debugfs.h
index 4444dc8cedc2..9a8a7c93bb12 100644
--- a/net/bluetooth/hci_debugfs.h
+++ b/net/bluetooth/hci_debugfs.h
@@ -26,6 +26,7 @@ void hci_debugfs_create_common(struct hci_dev *hdev);
void hci_debugfs_create_bredr(struct hci_dev *hdev);
void hci_debugfs_create_le(struct hci_dev *hdev);
void hci_debugfs_create_conn(struct hci_conn *conn);
+void hci_debugfs_create_basic(struct hci_dev *hdev);
#else
@@ -45,4 +46,8 @@ static inline void hci_debugfs_create_conn(struct hci_conn *conn)
{
}
+/* No-op stub of hci_debugfs_create_basic() for the #else branch of this
+ * header. NOTE(review): the controlling #if is outside this hunk -
+ * presumably the debugfs config option; confirm.
+ */
+static inline void hci_debugfs_create_basic(struct hci_dev *hdev)
+{
+}
+
#endif
diff --git a/net/bluetooth/hci_drv.c b/net/bluetooth/hci_drv.c
new file mode 100644
index 000000000000..3dd2d8a006b9
--- /dev/null
+++ b/net/bluetooth/hci_drv.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025 Google Corporation
+ */
+
+#include <linux/skbuff.h>
+#include <linux/types.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/hci_drv.h>
+
+/* Build a driver Command Status event and feed it to hci_recv_frame().
+ *
+ * @hdev:   device the event is delivered on
+ * @cmd:    opcode of the command being answered
+ * @status: status byte to report
+ *
+ * The event is a struct hci_drv_ev_hdr (opcode HCI_DRV_EV_CMD_STATUS)
+ * followed by a struct hci_drv_ev_cmd_status. The skb is allocated with
+ * GFP_KERNEL.
+ *
+ * Returns -ENOMEM if the skb cannot be allocated, otherwise the return
+ * value of hci_recv_frame().
+ */
+int hci_drv_cmd_status(struct hci_dev *hdev, u16 cmd, u8 status)
+{
+ struct hci_drv_ev_hdr *hdr;
+ struct hci_drv_ev_cmd_status *ev;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(sizeof(*hdr) + sizeof(*ev), GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ /* Header: event opcode plus the length of what follows it. */
+ hdr = skb_put(skb, sizeof(*hdr));
+ hdr->opcode = __cpu_to_le16(HCI_DRV_EV_CMD_STATUS);
+ hdr->len = __cpu_to_le16(sizeof(*ev));
+
+ ev = skb_put(skb, sizeof(*ev));
+ ev->opcode = __cpu_to_le16(cmd);
+ ev->status = status;
+
+ hci_skb_pkt_type(skb) = HCI_DRV_PKT;
+
+ return hci_recv_frame(hdev, skb);
+}
+EXPORT_SYMBOL(hci_drv_cmd_status);
+
+/* Build a driver Command Complete event and feed it to hci_recv_frame().
+ *
+ * @hdev:   device the event is delivered on
+ * @cmd:    opcode of the command being answered
+ * @status: status byte to report
+ * @rp:     return parameters appended after the event structure
+ * @rp_len: number of bytes copied from @rp
+ *
+ * The event is a struct hci_drv_ev_hdr (opcode HCI_DRV_EV_CMD_COMPLETE),
+ * a struct hci_drv_ev_cmd_complete and then @rp_len bytes of @rp. The skb
+ * is allocated with GFP_KERNEL.
+ *
+ * Returns -ENOMEM if the skb cannot be allocated, otherwise the return
+ * value of hci_recv_frame().
+ */
+int hci_drv_cmd_complete(struct hci_dev *hdev, u16 cmd, u8 status, void *rp,
+ size_t rp_len)
+{
+ struct hci_drv_ev_hdr *hdr;
+ struct hci_drv_ev_cmd_complete *ev;
+ struct sk_buff *skb;
+
+ skb = bt_skb_alloc(sizeof(*hdr) + sizeof(*ev) + rp_len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ /* NOTE(review): the length goes through a 16-bit conversion and
+ * sizeof(*ev) + rp_len is not range-checked here - confirm callers
+ * keep rp_len small enough.
+ */
+ hdr = skb_put(skb, sizeof(*hdr));
+ hdr->opcode = __cpu_to_le16(HCI_DRV_EV_CMD_COMPLETE);
+ hdr->len = __cpu_to_le16(sizeof(*ev) + rp_len);
+
+ ev = skb_put(skb, sizeof(*ev));
+ ev->opcode = __cpu_to_le16(cmd);
+ ev->status = status;
+
+ skb_put_data(skb, rp, rp_len);
+
+ hci_skb_pkt_type(skb) = HCI_DRV_PKT;
+
+ return hci_recv_frame(hdev, skb);
+}
+EXPORT_SYMBOL(hci_drv_cmd_complete);
+
+/* Validate a driver command packet and dispatch it to its handler.
+ *
+ * @hdev: device the command was sent to
+ * @skb:  packet starting with a struct hci_drv_cmd_hdr
+ *
+ * Returns -EILSEQ when the header cannot be pulled or its length field
+ * does not match the remaining skb length. When no handler applies, or
+ * the parameter length differs from the handler's data_len, the result of
+ * hci_drv_cmd_status() is returned (UNKNOWN_COMMAND or INVALID_PARAMETERS
+ * respectively). Otherwise returns whatever the handler returns.
+ */
+int hci_drv_process_cmd(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_drv_cmd_hdr *hdr;
+ const struct hci_drv_handler *handler = NULL;
+ u16 opcode, len, ogf, ocf;
+
+ hdr = skb_pull_data(skb, sizeof(*hdr));
+ if (!hdr)
+ return -EILSEQ;
+
+ /* After the pull, skb->len is what follows the header; it must
+ * equal the length the header announces.
+ */
+ opcode = __le16_to_cpu(hdr->opcode);
+ len = __le16_to_cpu(hdr->len);
+ if (len != skb->len)
+ return -EILSEQ;
+
+ ogf = hci_opcode_ogf(opcode);
+ ocf = hci_opcode_ocf(opcode);
+
+ /* No handler tables registered for this device. */
+ if (!hdev->hci_drv)
+ return hci_drv_cmd_status(hdev, opcode,
+ HCI_DRV_STATUS_UNKNOWN_COMMAND);
+
+ /* Commands outside the driver-specific OGF index common_handlers
+ * by the whole opcode; driver-specific ones index
+ * specific_handlers by OCF. Both lookups are bounds-checked.
+ */
+ if (ogf != HCI_DRV_OGF_DRIVER_SPECIFIC) {
+ if (opcode < hdev->hci_drv->common_handler_count)
+ handler = &hdev->hci_drv->common_handlers[opcode];
+ } else {
+ if (ocf < hdev->hci_drv->specific_handler_count)
+ handler = &hdev->hci_drv->specific_handlers[ocf];
+ }
+
+ /* Out-of-range index or an empty table slot. */
+ if (!handler || !handler->func)
+ return hci_drv_cmd_status(hdev, opcode,
+ HCI_DRV_STATUS_UNKNOWN_COMMAND);
+
+ /* Parameter length must match the handler's data_len exactly. */
+ if (len != handler->data_len)
+ return hci_drv_cmd_status(hdev, opcode,
+ HCI_DRV_STATUS_INVALID_PARAMETERS);
+
+ return handler->func(hdev, skb->data, len);
+}
+EXPORT_SYMBOL(hci_drv_process_cmd);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index f12555f23a49..a9868f17ef40 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1,6 +1,7 @@
/*
BlueZ - Bluetooth protocol stack for Linux
Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved.
+ Copyright 2023-2024 NXP
Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com>
@@ -24,31 +25,82 @@
/* Bluetooth HCI event handling. */
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
+#include <linux/crypto.h>
+#include <crypto/algapi.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/mgmt.h>
-#include "hci_request.h"
#include "hci_debugfs.h"
-#include "a2mp.h"
-#include "amp.h"
+#include "hci_codec.h"
#include "smp.h"
+#include "msft.h"
+#include "eir.h"
#define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \
"\x00\x00\x00\x00\x00\x00\x00\x00"
/* Handle HCI Event packets */
-static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb)
+static void *hci_ev_skb_pull(struct hci_dev *hdev, struct sk_buff *skb,
+ u8 ev, size_t len)
{
- __u8 status = *((__u8 *) skb->data);
+ void *data;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ data = skb_pull_data(skb, len);
+ if (!data)
+ bt_dev_err(hdev, "Malformed Event: 0x%2.2x", ev);
- if (status)
- return;
+ return data;
+}
+
+/* Pull @len bytes of Command Complete parameters off the front of @skb.
+ *
+ * Returns the pointer handed back by skb_pull_data(). When that pointer
+ * is NULL, a "Malformed Command Complete" error naming opcode @op is
+ * logged and NULL is returned.
+ */
+static void *hci_cc_skb_pull(struct hci_dev *hdev, struct sk_buff *skb,
+ u16 op, size_t len)
+{
+ void *data;
+
+ data = skb_pull_data(skb, len);
+ if (!data)
+ bt_dev_err(hdev, "Malformed Command Complete: 0x%4.4x", op);
+
+ return data;
+}
+
+/* Pull @len bytes of LE event data off the front of @skb.
+ *
+ * Returns the pointer handed back by skb_pull_data(). When that pointer
+ * is NULL, a "Malformed LE Event" error naming event code @ev is logged
+ * and NULL is returned.
+ */
+static void *hci_le_ev_skb_pull(struct hci_dev *hdev, struct sk_buff *skb,
+ u8 ev, size_t len)
+{
+ void *data;
+
+ data = skb_pull_data(skb, len);
+ if (!data)
+ bt_dev_err(hdev, "Malformed LE Event: 0x%2.2x", ev);
+
+ return data;
+}
+
+static u8 hci_cc_inquiry_cancel(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ /* It is possible that we receive Inquiry Complete event right
+ * before we receive Inquiry Cancel Command Complete event, in
+ * which case the latter event should have status of Command
+ * Disallowed. This should not be treated as error, since
+ * we actually achieve what Inquiry Cancel wants to achieve,
+ * which is to end the last Inquiry session.
+ */
+ if (rp->status == HCI_ERROR_COMMAND_DISALLOWED && !test_bit(HCI_INQUIRY, &hdev->flags)) {
+ bt_dev_warn(hdev, "Ignoring error of Inquiry Cancel command");
+ rp->status = 0x00;
+ }
+
+ if (rp->status)
+ return rp->status;
clear_bit(HCI_INQUIRY, &hdev->flags);
smp_mb__after_atomic(); /* wake_up_bit advises about this barrier */
@@ -63,50 +115,59 @@ static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb)
hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
hci_dev_unlock(hdev);
- hci_conn_check_pending(hdev);
+ return rp->status;
}
-static void hci_cc_periodic_inq(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_periodic_inq(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
hci_dev_set_flag(hdev, HCI_PERIODIC_INQ);
+
+ return rp->status;
}
-static void hci_cc_exit_periodic_inq(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_exit_periodic_inq(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ);
- hci_conn_check_pending(hdev);
+ return rp->status;
}
-static void hci_cc_remote_name_req_cancel(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_remote_name_req_cancel(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- BT_DBG("%s", hdev->name);
+ struct hci_rp_remote_name_req_cancel *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ return rp->status;
}
-static void hci_cc_role_discovery(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_role_discovery(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_role_discovery *rp = (void *) skb->data;
+ struct hci_rp_role_discovery *rp = data;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
hci_dev_lock(hdev);
@@ -115,17 +176,20 @@ static void hci_cc_role_discovery(struct hci_dev *hdev, struct sk_buff *skb)
conn->role = rp->role;
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_read_link_policy(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_read_link_policy(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_link_policy *rp = (void *) skb->data;
+ struct hci_rp_read_link_policy *rp = data;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
hci_dev_lock(hdev);
@@ -134,22 +198,25 @@ static void hci_cc_read_link_policy(struct hci_dev *hdev, struct sk_buff *skb)
conn->link_policy = __le16_to_cpu(rp->policy);
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_write_link_policy(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_write_link_policy(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_write_link_policy *rp = (void *) skb->data;
+ struct hci_rp_write_link_policy *rp = data;
struct hci_conn *conn;
void *sent;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_LINK_POLICY);
if (!sent)
- return;
+ return rp->status;
hci_dev_lock(hdev);
@@ -158,49 +225,55 @@ static void hci_cc_write_link_policy(struct hci_dev *hdev, struct sk_buff *skb)
conn->link_policy = get_unaligned_le16(sent + 2);
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_read_def_link_policy(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_read_def_link_policy(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_def_link_policy *rp = (void *) skb->data;
+ struct hci_rp_read_def_link_policy *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
hdev->link_policy = __le16_to_cpu(rp->policy);
+
+ return rp->status;
}
-static void hci_cc_write_def_link_policy(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_write_def_link_policy(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
void *sent;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_DEF_LINK_POLICY);
if (!sent)
- return;
+ return rp->status;
hdev->link_policy = get_unaligned_le16(sent);
+
+ return rp->status;
}
-static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_reset(struct hci_dev *hdev, void *data, struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
clear_bit(HCI_RESET, &hdev->flags);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
/* Reset all non-persistent flags */
hci_dev_clear_volatile_flags(hdev);
@@ -220,93 +293,109 @@ static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb)
hdev->ssp_debug_mode = 0;
- hci_bdaddr_list_clear(&hdev->le_white_list);
+ hci_bdaddr_list_clear(&hdev->le_accept_list);
hci_bdaddr_list_clear(&hdev->le_resolv_list);
+
+ return rp->status;
}
-static void hci_cc_read_stored_link_key(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_read_stored_link_key(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_stored_link_key *rp = (void *)skb->data;
+ struct hci_rp_read_stored_link_key *rp = data;
struct hci_cp_read_stored_link_key *sent;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
sent = hci_sent_cmd_data(hdev, HCI_OP_READ_STORED_LINK_KEY);
if (!sent)
- return;
+ return rp->status;
if (!rp->status && sent->read_all == 0x01) {
- hdev->stored_max_keys = rp->max_keys;
- hdev->stored_num_keys = rp->num_keys;
+ hdev->stored_max_keys = le16_to_cpu(rp->max_keys);
+ hdev->stored_num_keys = le16_to_cpu(rp->num_keys);
}
+
+ return rp->status;
}
-static void hci_cc_delete_stored_link_key(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_delete_stored_link_key(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_delete_stored_link_key *rp = (void *)skb->data;
+ struct hci_rp_delete_stored_link_key *rp = data;
+ u16 num_keys;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
- if (rp->num_keys <= hdev->stored_num_keys)
- hdev->stored_num_keys -= rp->num_keys;
+ num_keys = le16_to_cpu(rp->num_keys);
+
+ if (num_keys <= hdev->stored_num_keys)
+ hdev->stored_num_keys -= num_keys;
else
hdev->stored_num_keys = 0;
+
+ return rp->status;
}
-static void hci_cc_write_local_name(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_write_local_name(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
void *sent;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_LOCAL_NAME);
if (!sent)
- return;
+ return rp->status;
hci_dev_lock(hdev);
if (hci_dev_test_flag(hdev, HCI_MGMT))
- mgmt_set_local_name_complete(hdev, sent, status);
- else if (!status)
+ mgmt_set_local_name_complete(hdev, sent, rp->status);
+ else if (!rp->status)
memcpy(hdev->dev_name, sent, HCI_MAX_NAME_LENGTH);
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_read_local_name(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_read_local_name(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_local_name *rp = (void *) skb->data;
+ struct hci_rp_read_local_name *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
if (hci_dev_test_flag(hdev, HCI_SETUP) ||
hci_dev_test_flag(hdev, HCI_CONFIG))
memcpy(hdev->dev_name, rp->name, HCI_MAX_NAME_LENGTH);
+
+ return rp->status;
}
-static void hci_cc_write_auth_enable(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_write_auth_enable(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
void *sent;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_AUTH_ENABLE);
if (!sent)
- return;
+ return rp->status;
hci_dev_lock(hdev);
- if (!status) {
+ if (!rp->status) {
__u8 param = *((__u8 *) sent);
if (param == AUTH_ENABLED)
@@ -316,25 +405,28 @@ static void hci_cc_write_auth_enable(struct hci_dev *hdev, struct sk_buff *skb)
}
if (hci_dev_test_flag(hdev, HCI_MGMT))
- mgmt_auth_enable_complete(hdev, status);
+ mgmt_auth_enable_complete(hdev, rp->status);
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_write_encrypt_mode(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_write_encrypt_mode(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
__u8 param;
void *sent;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_ENCRYPT_MODE);
if (!sent)
- return;
+ return rp->status;
param = *((__u8 *) sent);
@@ -342,25 +434,28 @@ static void hci_cc_write_encrypt_mode(struct hci_dev *hdev, struct sk_buff *skb)
set_bit(HCI_ENCRYPT, &hdev->flags);
else
clear_bit(HCI_ENCRYPT, &hdev->flags);
+
+ return rp->status;
}
-static void hci_cc_write_scan_enable(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_write_scan_enable(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
__u8 param;
void *sent;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SCAN_ENABLE);
if (!sent)
- return;
+ return rp->status;
param = *((__u8 *) sent);
hci_dev_lock(hdev);
- if (status) {
+ if (rp->status) {
hdev->discov_timeout = 0;
goto done;
}
@@ -377,135 +472,178 @@ static void hci_cc_write_scan_enable(struct hci_dev *hdev, struct sk_buff *skb)
done:
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_read_class_of_dev(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_set_event_filter(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_class_of_dev *rp = (void *) skb->data;
+ struct hci_ev_status *rp = data;
+ struct hci_cp_set_event_filter *cp;
+ void *sent;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_SET_EVENT_FLT);
+ if (!sent)
+ return rp->status;
+
+ cp = (struct hci_cp_set_event_filter *)sent;
+
+ if (cp->flt_type == HCI_FLT_CLEAR_ALL)
+ hci_dev_clear_flag(hdev, HCI_EVENT_FILTER_CONFIGURED);
+ else
+ hci_dev_set_flag(hdev, HCI_EVENT_FILTER_CONFIGURED);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_class_of_dev(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_class_of_dev *rp = data;
+
+ if (WARN_ON(!hdev))
+ return HCI_ERROR_UNSPECIFIED;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
memcpy(hdev->dev_class, rp->dev_class, 3);
- BT_DBG("%s class 0x%.2x%.2x%.2x", hdev->name,
- hdev->dev_class[2], hdev->dev_class[1], hdev->dev_class[0]);
+ bt_dev_dbg(hdev, "class 0x%.2x%.2x%.2x", hdev->dev_class[2],
+ hdev->dev_class[1], hdev->dev_class[0]);
+
+ return rp->status;
}
-static void hci_cc_write_class_of_dev(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_write_class_of_dev(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
void *sent;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_CLASS_OF_DEV);
if (!sent)
- return;
+ return rp->status;
hci_dev_lock(hdev);
- if (status == 0)
+ if (!rp->status)
memcpy(hdev->dev_class, sent, 3);
if (hci_dev_test_flag(hdev, HCI_MGMT))
- mgmt_set_class_of_dev_complete(hdev, sent, status);
+ mgmt_set_class_of_dev_complete(hdev, sent, rp->status);
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_read_voice_setting(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_read_voice_setting(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_voice_setting *rp = (void *) skb->data;
+ struct hci_rp_read_voice_setting *rp = data;
__u16 setting;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
setting = __le16_to_cpu(rp->voice_setting);
if (hdev->voice_setting == setting)
- return;
+ return rp->status;
hdev->voice_setting = setting;
- BT_DBG("%s voice setting 0x%4.4x", hdev->name, setting);
+ bt_dev_dbg(hdev, "voice setting 0x%4.4x", setting);
if (hdev->notify)
hdev->notify(hdev, HCI_NOTIFY_VOICE_SETTING);
+
+ return rp->status;
}
-static void hci_cc_write_voice_setting(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_write_voice_setting(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
__u16 setting;
void *sent;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_VOICE_SETTING);
if (!sent)
- return;
+ return rp->status;
setting = get_unaligned_le16(sent);
if (hdev->voice_setting == setting)
- return;
+ return rp->status;
hdev->voice_setting = setting;
- BT_DBG("%s voice setting 0x%4.4x", hdev->name, setting);
+ bt_dev_dbg(hdev, "voice setting 0x%4.4x", setting);
if (hdev->notify)
hdev->notify(hdev, HCI_NOTIFY_VOICE_SETTING);
+
+ return rp->status;
}
-static void hci_cc_read_num_supported_iac(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_read_num_supported_iac(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_num_supported_iac *rp = (void *) skb->data;
+ struct hci_rp_read_num_supported_iac *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
hdev->num_iac = rp->num_iac;
- BT_DBG("%s num iac %d", hdev->name, hdev->num_iac);
+ bt_dev_dbg(hdev, "num iac %d", hdev->num_iac);
+
+ return rp->status;
}
-static void hci_cc_write_ssp_mode(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_write_ssp_mode(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
struct hci_cp_write_ssp_mode *sent;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SSP_MODE);
if (!sent)
- return;
+ return rp->status;
hci_dev_lock(hdev);
- if (!status) {
+ if (!rp->status) {
if (sent->mode)
hdev->features[1][0] |= LMP_HOST_SSP;
else
hdev->features[1][0] &= ~LMP_HOST_SSP;
}
- if (hci_dev_test_flag(hdev, HCI_MGMT))
- mgmt_ssp_enable_complete(hdev, sent->mode, status);
- else if (!status) {
+ if (!rp->status) {
if (sent->mode)
hci_dev_set_flag(hdev, HCI_SSP_ENABLED);
else
@@ -513,29 +651,32 @@ static void hci_cc_write_ssp_mode(struct hci_dev *hdev, struct sk_buff *skb)
}
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_write_sc_support(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_write_sc_support(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- u8 status = *((u8 *) skb->data);
+ struct hci_ev_status *rp = data;
struct hci_cp_write_sc_support *sent;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SC_SUPPORT);
if (!sent)
- return;
+ return rp->status;
hci_dev_lock(hdev);
- if (!status) {
+ if (!rp->status) {
if (sent->support)
hdev->features[1][0] |= LMP_HOST_SC;
else
hdev->features[1][0] &= ~LMP_HOST_SC;
}
- if (!hci_dev_test_flag(hdev, HCI_MGMT) && !status) {
+ if (!hci_dev_test_flag(hdev, HCI_MGMT) && !rp->status) {
if (sent->support)
hci_dev_set_flag(hdev, HCI_SC_ENABLED);
else
@@ -543,16 +684,19 @@ static void hci_cc_write_sc_support(struct hci_dev *hdev, struct sk_buff *skb)
}
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_read_local_version(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_read_local_version(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_local_version *rp = (void *) skb->data;
+ struct hci_rp_read_local_version *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
if (hci_dev_test_flag(hdev, HCI_SETUP) ||
hci_dev_test_flag(hdev, HCI_CONFIG)) {
@@ -562,32 +706,154 @@ static void hci_cc_read_local_version(struct hci_dev *hdev, struct sk_buff *skb)
hdev->manufacturer = __le16_to_cpu(rp->manufacturer);
hdev->lmp_subver = __le16_to_cpu(rp->lmp_subver);
}
+
+ return rp->status;
}
-static void hci_cc_read_local_commands(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_read_enc_key_size(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_local_commands *rp = (void *) skb->data;
+ struct hci_rp_read_enc_key_size *rp = data;
+ struct hci_conn *conn;
+ u16 handle;
+ u8 status = rp->status;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+ handle = le16_to_cpu(rp->handle);
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!conn) {
+ status = 0xFF;
+ goto done;
+ }
+
+ /* While unexpected, the read_enc_key_size command may fail. The most
+ * secure approach is to then assume the key size is 0 to force a
+ * disconnection.
+ */
+ if (status) {
+ bt_dev_err(hdev, "failed to read key size for handle %u",
+ handle);
+ conn->enc_key_size = 0;
+ } else {
+ u8 *key_enc_size = hci_conn_key_enc_size(conn);
+
+ conn->enc_key_size = rp->key_size;
+ status = 0;
+
+ /* Attempt to check if the key size is too small or if it has
+ * been downgraded from the last time it was stored as part of
+ * the link_key.
+ */
+ if (conn->enc_key_size < hdev->min_enc_key_size ||
+ (key_enc_size && conn->enc_key_size < *key_enc_size)) {
+ /* As slave role, the conn->state has been set to
+ * BT_CONNECTED and l2cap conn req might not be received
+ * yet, at this moment the l2cap layer almost does
+ * nothing with the non-zero status.
+ * So we also clear encrypt related bits, and then the
+ * handler of l2cap conn req will get the right secure
+ * state at a later time.
+ */
+ status = HCI_ERROR_AUTH_FAILURE;
+ clear_bit(HCI_CONN_ENCRYPT, &conn->flags);
+ clear_bit(HCI_CONN_AES_CCM, &conn->flags);
+ }
+
+ /* Update the key encryption size with the connection one */
+ if (key_enc_size && *key_enc_size != conn->enc_key_size)
+ *key_enc_size = conn->enc_key_size;
+ }
+
+ hci_encrypt_cfm(conn, status);
+
+done:
+ hci_dev_unlock(hdev);
+
+ return status;
+}
+
+static u8 hci_cc_read_local_commands(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_commands *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
if (hci_dev_test_flag(hdev, HCI_SETUP) ||
hci_dev_test_flag(hdev, HCI_CONFIG))
memcpy(hdev->commands, rp->commands, sizeof(hdev->commands));
+
+ return rp->status;
}
-static void hci_cc_read_local_features(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_read_auth_payload_timeout(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_local_features *rp = (void *) skb->data;
+ struct hci_rp_read_auth_payload_to *rp = data;
+ struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (conn)
+ conn->auth_payload_timeout = __le16_to_cpu(rp->timeout);
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_write_auth_payload_timeout(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_write_auth_payload_to *rp = data;
+ struct hci_conn *conn;
+ void *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rp->handle));
+ if (!conn) {
+ rp->status = 0xff;
+ goto unlock;
+ }
+
+ if (!rp->status)
+ conn->auth_payload_timeout = get_unaligned_le16(sent + 2);
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_local_features(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_local_features *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
memcpy(hdev->features, rp->features, 8);
@@ -627,178 +893,191 @@ static void hci_cc_read_local_features(struct hci_dev *hdev,
if (hdev->features[0][5] & LMP_EDR_3S_ESCO)
hdev->esco_type |= (ESCO_2EV5 | ESCO_3EV5);
+
+ return rp->status;
}
-static void hci_cc_read_local_ext_features(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_read_local_ext_features(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_local_ext_features *rp = (void *) skb->data;
+ struct hci_rp_read_local_ext_features *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
- if (hdev->max_page < rp->max_page)
- hdev->max_page = rp->max_page;
+ if (hdev->max_page < rp->max_page) {
+ if (hci_test_quirk(hdev,
+ HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2))
+ bt_dev_warn(hdev, "broken local ext features page 2");
+ else
+ hdev->max_page = rp->max_page;
+ }
if (rp->page < HCI_MAX_PAGES)
memcpy(hdev->features[rp->page], rp->features, 8);
-}
-
-static void hci_cc_read_flow_control_mode(struct hci_dev *hdev,
- struct sk_buff *skb)
-{
- struct hci_rp_read_flow_control_mode *rp = (void *) skb->data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
-
- if (rp->status)
- return;
-
- hdev->flow_ctl_mode = rp->mode;
+ return rp->status;
}
-static void hci_cc_read_buffer_size(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_read_buffer_size(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_buffer_size *rp = (void *) skb->data;
+ struct hci_rp_read_buffer_size *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
hdev->acl_mtu = __le16_to_cpu(rp->acl_mtu);
hdev->sco_mtu = rp->sco_mtu;
hdev->acl_pkts = __le16_to_cpu(rp->acl_max_pkt);
hdev->sco_pkts = __le16_to_cpu(rp->sco_max_pkt);
- if (test_bit(HCI_QUIRK_FIXUP_BUFFER_SIZE, &hdev->quirks)) {
+ if (hci_test_quirk(hdev, HCI_QUIRK_FIXUP_BUFFER_SIZE)) {
hdev->sco_mtu = 64;
hdev->sco_pkts = 8;
}
+ if (!read_voice_setting_capable(hdev))
+ hdev->sco_pkts = 0;
+
hdev->acl_cnt = hdev->acl_pkts;
hdev->sco_cnt = hdev->sco_pkts;
BT_DBG("%s acl mtu %d:%d sco mtu %d:%d", hdev->name, hdev->acl_mtu,
hdev->acl_pkts, hdev->sco_mtu, hdev->sco_pkts);
+
+ if (!hdev->acl_mtu || !hdev->acl_pkts)
+ return HCI_ERROR_INVALID_PARAMETERS;
+
+ return rp->status;
}
-static void hci_cc_read_bd_addr(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_read_bd_addr(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_bd_addr *rp = (void *) skb->data;
+ struct hci_rp_read_bd_addr *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
if (test_bit(HCI_INIT, &hdev->flags))
bacpy(&hdev->bdaddr, &rp->bdaddr);
if (hci_dev_test_flag(hdev, HCI_SETUP))
bacpy(&hdev->setup_addr, &rp->bdaddr);
+
+ return rp->status;
}
-static void hci_cc_read_page_scan_activity(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_read_local_pairing_opts(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_page_scan_activity *rp = (void *) skb->data;
+ struct hci_rp_read_local_pairing_opts *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
+
+ if (hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG)) {
+ hdev->pairing_opts = rp->pairing_opts;
+ hdev->max_enc_key_size = rp->max_key_size;
+ }
+
+ return rp->status;
+}
+
+static u8 hci_cc_read_page_scan_activity(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_read_page_scan_activity *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
if (test_bit(HCI_INIT, &hdev->flags)) {
hdev->page_scan_interval = __le16_to_cpu(rp->interval);
hdev->page_scan_window = __le16_to_cpu(rp->window);
}
+
+ return rp->status;
}
-static void hci_cc_write_page_scan_activity(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_write_page_scan_activity(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- u8 status = *((u8 *) skb->data);
+ struct hci_ev_status *rp = data;
struct hci_cp_write_page_scan_activity *sent;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY);
if (!sent)
- return;
+ return rp->status;
hdev->page_scan_interval = __le16_to_cpu(sent->interval);
hdev->page_scan_window = __le16_to_cpu(sent->window);
+
+ return rp->status;
}
-static void hci_cc_read_page_scan_type(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_read_page_scan_type(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_page_scan_type *rp = (void *) skb->data;
+ struct hci_rp_read_page_scan_type *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
if (test_bit(HCI_INIT, &hdev->flags))
hdev->page_scan_type = rp->type;
+
+ return rp->status;
}
-static void hci_cc_write_page_scan_type(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_write_page_scan_type(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- u8 status = *((u8 *) skb->data);
+ struct hci_ev_status *rp = data;
u8 *type;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
type = hci_sent_cmd_data(hdev, HCI_OP_WRITE_PAGE_SCAN_TYPE);
if (type)
hdev->page_scan_type = *type;
-}
-
-static void hci_cc_read_data_block_size(struct hci_dev *hdev,
- struct sk_buff *skb)
-{
- struct hci_rp_read_data_block_size *rp = (void *) skb->data;
-
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
- if (rp->status)
- return;
-
- hdev->block_mtu = __le16_to_cpu(rp->max_acl_len);
- hdev->block_len = __le16_to_cpu(rp->block_len);
- hdev->num_blocks = __le16_to_cpu(rp->num_blocks);
-
- hdev->block_cnt = hdev->num_blocks;
-
- BT_DBG("%s blk mtu %d cnt %d len %d", hdev->name, hdev->block_mtu,
- hdev->block_cnt, hdev->block_len);
+ return rp->status;
}
-static void hci_cc_read_clock(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_read_clock(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_clock *rp = (void *) skb->data;
+ struct hci_rp_read_clock *rp = data;
struct hci_cp_read_clock *cp;
struct hci_conn *conn;
- BT_DBG("%s", hdev->name);
-
- if (skb->len < sizeof(*rp))
- return;
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
hci_dev_lock(hdev);
@@ -819,50 +1098,67 @@ static void hci_cc_read_clock(struct hci_dev *hdev, struct sk_buff *skb)
unlock:
hci_dev_unlock(hdev);
+ return rp->status;
}
-static void hci_cc_read_local_amp_info(struct hci_dev *hdev,
+static u8 hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_rp_read_local_amp_info *rp = (void *) skb->data;
+ struct hci_rp_read_inq_rsp_tx_power *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
+
+ hdev->inq_tx_power = rp->tx_power;
- hdev->amp_status = rp->amp_status;
- hdev->amp_total_bw = __le32_to_cpu(rp->total_bw);
- hdev->amp_max_bw = __le32_to_cpu(rp->max_bw);
- hdev->amp_min_latency = __le32_to_cpu(rp->min_latency);
- hdev->amp_max_pdu = __le32_to_cpu(rp->max_pdu);
- hdev->amp_type = rp->amp_type;
- hdev->amp_pal_cap = __le16_to_cpu(rp->pal_cap);
- hdev->amp_assoc_size = __le16_to_cpu(rp->max_assoc_size);
- hdev->amp_be_flush_to = __le32_to_cpu(rp->be_flush_to);
- hdev->amp_max_flush_to = __le32_to_cpu(rp->max_flush_to);
+ return rp->status;
}
-static void hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_read_def_err_data_reporting(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_inq_rsp_tx_power *rp = (void *) skb->data;
+ struct hci_rp_read_def_err_data_reporting *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
- hdev->inq_tx_power = rp->tx_power;
+ hdev->err_data_reporting = rp->err_data_reporting;
+
+ return rp->status;
+}
+
+static u8 hci_cc_write_def_err_data_reporting(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ struct hci_cp_write_def_err_data_reporting *cp;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_WRITE_DEF_ERR_DATA_REPORTING);
+ if (!cp)
+ return rp->status;
+
+ hdev->err_data_reporting = cp->err_data_reporting;
+
+ return rp->status;
}
-static void hci_cc_pin_code_reply(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_pin_code_reply(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_pin_code_reply *rp = (void *) skb->data;
+ struct hci_rp_pin_code_reply *rp = data;
struct hci_cp_pin_code_reply *cp;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
hci_dev_lock(hdev);
@@ -882,13 +1178,15 @@ static void hci_cc_pin_code_reply(struct hci_dev *hdev, struct sk_buff *skb)
unlock:
hci_dev_unlock(hdev);
+ return rp->status;
}
-static void hci_cc_pin_code_neg_reply(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_pin_code_neg_reply(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_pin_code_neg_reply *rp = (void *) skb->data;
+ struct hci_rp_pin_code_neg_reply *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
hci_dev_lock(hdev);
@@ -897,17 +1195,19 @@ static void hci_cc_pin_code_neg_reply(struct hci_dev *hdev, struct sk_buff *skb)
rp->status);
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_le_read_buffer_size(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_read_buffer_size(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_le_read_buffer_size *rp = (void *) skb->data;
+ struct hci_rp_le_read_buffer_size *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
hdev->le_mtu = __le16_to_cpu(rp->le_mtu);
hdev->le_pkts = rp->le_max_pkt;
@@ -915,39 +1215,49 @@ static void hci_cc_le_read_buffer_size(struct hci_dev *hdev,
hdev->le_cnt = hdev->le_pkts;
BT_DBG("%s le mtu %d:%d", hdev->name, hdev->le_mtu, hdev->le_pkts);
+
+ if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU)
+ return HCI_ERROR_INVALID_PARAMETERS;
+
+ return rp->status;
}
-static void hci_cc_le_read_local_features(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_read_local_features(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_le_read_local_features *rp = (void *) skb->data;
+ struct hci_rp_le_read_local_features *rp = data;
BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
if (rp->status)
- return;
+ return rp->status;
memcpy(hdev->le_features, rp->features, 8);
+
+ return rp->status;
}
-static void hci_cc_le_read_adv_tx_power(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_read_adv_tx_power(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_le_read_adv_tx_power *rp = (void *) skb->data;
+ struct hci_rp_le_read_adv_tx_power *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
hdev->adv_tx_power = rp->tx_power;
+
+ return rp->status;
}
-static void hci_cc_user_confirm_reply(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_user_confirm_reply(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_user_confirm_reply *rp = (void *) skb->data;
+ struct hci_rp_user_confirm_reply *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
hci_dev_lock(hdev);
@@ -956,14 +1266,16 @@ static void hci_cc_user_confirm_reply(struct hci_dev *hdev, struct sk_buff *skb)
rp->status);
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_user_confirm_neg_reply(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_user_confirm_neg_reply(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_user_confirm_reply *rp = (void *) skb->data;
+ struct hci_rp_user_confirm_reply *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
hci_dev_lock(hdev);
@@ -972,13 +1284,16 @@ static void hci_cc_user_confirm_neg_reply(struct hci_dev *hdev,
ACL_LINK, 0, rp->status);
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_user_passkey_reply(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_user_passkey_reply(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_user_confirm_reply *rp = (void *) skb->data;
+ struct hci_rp_user_confirm_reply *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
hci_dev_lock(hdev);
@@ -987,14 +1302,16 @@ static void hci_cc_user_passkey_reply(struct hci_dev *hdev, struct sk_buff *skb)
0, rp->status);
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_user_passkey_neg_reply(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_user_passkey_neg_reply(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_user_confirm_reply *rp = (void *) skb->data;
+ struct hci_rp_user_confirm_reply *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
hci_dev_lock(hdev);
@@ -1003,58 +1320,74 @@ static void hci_cc_user_passkey_neg_reply(struct hci_dev *hdev,
ACL_LINK, 0, rp->status);
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_read_local_oob_data(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_read_local_oob_data(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_local_oob_data *rp = (void *) skb->data;
+ struct hci_rp_read_local_oob_data *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ return rp->status;
}
-static void hci_cc_read_local_oob_ext_data(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_read_local_oob_ext_data(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_local_oob_ext_data *rp = (void *) skb->data;
+ struct hci_rp_read_local_oob_ext_data *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ return rp->status;
}
-static void hci_cc_le_set_random_addr(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_le_set_random_addr(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
bdaddr_t *sent;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_RANDOM_ADDR);
if (!sent)
- return;
+ return rp->status;
hci_dev_lock(hdev);
bacpy(&hdev->random_addr, sent);
+ if (!bacmp(&hdev->rpa, sent)) {
+ hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED);
+ queue_delayed_work(hdev->workqueue, &hdev->rpa_expired,
+ secs_to_jiffies(hdev->rpa_timeout));
+ }
+
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_le_set_default_phy(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_le_set_default_phy(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
struct hci_cp_le_set_default_phy *cp;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_DEFAULT_PHY);
if (!cp)
- return;
+ return rp->status;
hci_dev_lock(hdev);
@@ -1062,49 +1395,164 @@ static void hci_cc_le_set_default_phy(struct hci_dev *hdev, struct sk_buff *skb)
hdev->le_rx_def_phys = cp->rx_phys;
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_le_set_adv_set_random_addr(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_set_adv_set_random_addr(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
struct hci_cp_le_set_adv_set_rand_addr *cp;
- struct adv_info *adv_instance;
+ struct adv_info *adv;
- if (status)
- return;
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_SET_RAND_ADDR);
- if (!cp)
- return;
+ /* Update only for a non-zero adv instance, since handle 0x00 shall use
+ * HCI_OP_LE_SET_RANDOM_ADDR, which allows both extended and
+ * non-extended advertising.
+ */
+ if (!cp || !cp->handle)
+ return rp->status;
hci_dev_lock(hdev);
- if (!hdev->cur_adv_instance) {
- /* Store in hdev for instance 0 (Set adv and Directed advs) */
- bacpy(&hdev->random_addr, &cp->bdaddr);
- } else {
- adv_instance = hci_find_adv_instance(hdev,
- hdev->cur_adv_instance);
- if (adv_instance)
- bacpy(&adv_instance->random_addr, &cp->bdaddr);
+ adv = hci_find_adv_instance(hdev, cp->handle);
+ if (adv) {
+ bacpy(&adv->random_addr, &cp->bdaddr);
+ if (!bacmp(&hdev->rpa, &cp->bdaddr)) {
+ adv->rpa_expired = false;
+ queue_delayed_work(hdev->workqueue,
+ &adv->rpa_expired_cb,
+ secs_to_jiffies(hdev->rpa_timeout));
+ }
}
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_le_remove_adv_set(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 *sent, status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
+ u8 *instance;
+ int err;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
+
+ instance = hci_sent_cmd_data(hdev, HCI_OP_LE_REMOVE_ADV_SET);
+ if (!instance)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ err = hci_remove_adv_instance(hdev, *instance);
+ if (!err)
+ mgmt_advertising_removed(hci_skb_sk(hdev->sent_cmd), hdev,
+ *instance);
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_clear_adv_sets(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ struct adv_info *adv, *n;
+ int err;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ if (!hci_sent_cmd_data(hdev, HCI_OP_LE_CLEAR_ADV_SETS))
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ list_for_each_entry_safe(adv, n, &hdev->adv_instances, list) {
+ u8 instance = adv->instance;
+
+ err = hci_remove_adv_instance(hdev, instance);
+ if (!err)
+ mgmt_advertising_removed(hci_skb_sk(hdev->sent_cmd),
+ hdev, instance);
+ }
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_read_transmit_power(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_transmit_power *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hdev->min_le_tx_power = rp->min_le_tx_power;
+ hdev->max_le_tx_power = rp->max_le_tx_power;
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_set_privacy_mode(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ struct hci_cp_le_set_privacy_mode *cp;
+ struct hci_conn_params *params;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_PRIVACY_MODE);
+ if (!cp)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+
+ params = hci_conn_params_lookup(hdev, &cp->bdaddr, cp->bdaddr_type);
+ if (params)
+ WRITE_ONCE(params->privacy_mode, cp->mode);
+
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_set_adv_enable(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ __u8 *sent;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_ENABLE);
if (!sent)
- return;
+ return rp->status;
hci_dev_lock(hdev);
@@ -1126,78 +1574,118 @@ static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb)
}
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
struct hci_cp_le_set_ext_adv_enable *cp;
- __u8 status = *((__u8 *) skb->data);
+ struct hci_cp_ext_adv_set *set;
+ struct adv_info *adv = NULL, *n;
+ struct hci_ev_status *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE);
if (!cp)
- return;
+ return rp->status;
+
+ set = (void *)cp->data;
hci_dev_lock(hdev);
+ if (cp->num_of_sets)
+ adv = hci_find_adv_instance(hdev, set->handle);
+
if (cp->enable) {
struct hci_conn *conn;
hci_dev_set_flag(hdev, HCI_LE_ADV);
+ if (adv)
+ adv->enabled = true;
+ else if (!set->handle)
+ hci_dev_set_flag(hdev, HCI_LE_ADV_0);
+
conn = hci_lookup_le_connect(hdev);
if (conn)
queue_delayed_work(hdev->workqueue,
&conn->le_conn_timeout,
conn->conn_timeout);
} else {
+ if (cp->num_of_sets) {
+ if (adv)
+ adv->enabled = false;
+ else if (!set->handle)
+ hci_dev_clear_flag(hdev, HCI_LE_ADV_0);
+
+ /* If just one instance was disabled, check if any other
+ * instance is still enabled before clearing HCI_LE_ADV
+ */
+ list_for_each_entry_safe(adv, n, &hdev->adv_instances,
+ list) {
+ if (adv->enabled)
+ goto unlock;
+ }
+ } else {
+ /* All instances shall be considered disabled */
+ list_for_each_entry_safe(adv, n, &hdev->adv_instances,
+ list)
+ adv->enabled = false;
+ }
+
hci_dev_clear_flag(hdev, HCI_LE_ADV);
}
+unlock:
hci_dev_unlock(hdev);
+ return rp->status;
}
-static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_le_set_scan_param(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
struct hci_cp_le_set_scan_param *cp;
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_PARAM);
if (!cp)
- return;
+ return rp->status;
hci_dev_lock(hdev);
hdev->le_scan_type = cp->type;
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_le_set_ext_scan_param(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_set_ext_scan_param(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
struct hci_cp_le_set_ext_scan_params *cp;
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
struct hci_cp_le_scan_phy_params *phy_param;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_SCAN_PARAMS);
if (!cp)
- return;
+ return rp->status;
phy_param = (void *)cp->data;
@@ -1206,6 +1694,8 @@ static void hci_cc_le_set_ext_scan_param(struct hci_dev *hdev,
hdev->le_scan_type = phy_param->type;
hci_dev_unlock(hdev);
+
+ return rp->status;
}
static bool has_pending_adv_report(struct hci_dev *hdev)
@@ -1229,6 +1719,9 @@ static void store_pending_adv_report(struct hci_dev *hdev, bdaddr_t *bdaddr,
{
struct discovery_state *d = &hdev->discovery;
+ if (len > max_adv_len(hdev))
+ return;
+
bacpy(&d->last_adv_addr, bdaddr);
d->last_adv_addr_type = bdaddr_type;
d->last_adv_rssi = rssi;
@@ -1244,8 +1737,10 @@ static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable)
switch (enable) {
case LE_SCAN_ENABLE:
hci_dev_set_flag(hdev, HCI_LE_SCAN);
- if (hdev->le_scan_type == LE_SCAN_ACTIVE)
+ if (hdev->le_scan_type == LE_SCAN_ACTIVE) {
clear_pending_adv_report(hdev);
+ hci_discovery_set_state(hdev, DISCOVERY_FINDING);
+ }
break;
case LE_SCAN_DISABLE:
@@ -1260,7 +1755,7 @@ static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable)
d->last_adv_addr_type, NULL,
d->last_adv_rssi, d->last_adv_flags,
d->last_adv_data,
- d->last_adv_data_len, NULL, 0);
+ d->last_adv_data_len, NULL, 0, 0);
}
/* Cancel this timer so that we don't try to disable scanning
@@ -1272,16 +1767,13 @@ static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable)
/* The HCI_LE_SCAN_INTERRUPTED flag indicates that we
* interrupted scanning due to a connect request. Mark
- * therefore discovery as stopped. If this was not
- * because of a connect request advertising might have
- * been disabled because of active scanning, so
- * re-enable it again if necessary.
+ * therefore discovery as stopped.
*/
if (hci_dev_test_and_clear_flag(hdev, HCI_LE_SCAN_INTERRUPTED))
hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
else if (!hci_dev_test_flag(hdev, HCI_LE_ADV) &&
hdev->discovery.state == DISCOVERY_FINDING)
- hci_req_reenable_advertising(hdev);
+ queue_work(hdev->workqueue, &hdev->reenable_adv_work);
break;
@@ -1294,205 +1786,285 @@ static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable)
hci_dev_unlock(hdev);
}
-static void hci_cc_le_set_scan_enable(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_set_scan_enable(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
struct hci_cp_le_set_scan_enable *cp;
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_ENABLE);
if (!cp)
- return;
+ return rp->status;
le_set_scan_enable_complete(hdev, cp->enable);
+
+ return rp->status;
}
-static void hci_cc_le_set_ext_scan_enable(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_set_ext_scan_enable(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
struct hci_cp_le_set_ext_scan_enable *cp;
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_SCAN_ENABLE);
if (!cp)
- return;
+ return rp->status;
le_set_scan_enable_complete(hdev, cp->enable);
+
+ return rp->status;
}
-static void hci_cc_le_read_num_adv_sets(struct hci_dev *hdev,
+static u8 hci_cc_le_read_num_adv_sets(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_rp_le_read_num_supported_adv_sets *rp = (void *) skb->data;
+ struct hci_rp_le_read_num_supported_adv_sets *rp = data;
- BT_DBG("%s status 0x%2.2x No of Adv sets %u", hdev->name, rp->status,
- rp->num_of_sets);
+ bt_dev_dbg(hdev, "status 0x%2.2x No of Adv sets %u", rp->status,
+ rp->num_of_sets);
if (rp->status)
- return;
+ return rp->status;
hdev->le_num_of_adv_sets = rp->num_of_sets;
+
+ return rp->status;
}
-static void hci_cc_le_read_white_list_size(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_read_accept_list_size(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_le_read_white_list_size *rp = (void *) skb->data;
+ struct hci_rp_le_read_accept_list_size *rp = data;
- BT_DBG("%s status 0x%2.2x size %u", hdev->name, rp->status, rp->size);
+ bt_dev_dbg(hdev, "status 0x%2.2x size %u", rp->status, rp->size);
if (rp->status)
- return;
+ return rp->status;
- hdev->le_white_list_size = rp->size;
+ hdev->le_accept_list_size = rp->size;
+
+ return rp->status;
}
-static void hci_cc_le_clear_white_list(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_clear_accept_list(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+ hci_bdaddr_list_clear(&hdev->le_accept_list);
+ hci_dev_unlock(hdev);
- hci_bdaddr_list_clear(&hdev->le_white_list);
+ return rp->status;
}
-static void hci_cc_le_add_to_white_list(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_add_to_accept_list(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_cp_le_add_to_white_list *sent;
- __u8 status = *((__u8 *) skb->data);
+ struct hci_cp_le_add_to_accept_list *sent;
+ struct hci_ev_status *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
- sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_WHITE_LIST);
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_ACCEPT_LIST);
if (!sent)
- return;
+ return rp->status;
+
+ hci_dev_lock(hdev);
+ hci_bdaddr_list_add(&hdev->le_accept_list, &sent->bdaddr,
+ sent->bdaddr_type);
+ hci_dev_unlock(hdev);
- hci_bdaddr_list_add(&hdev->le_white_list, &sent->bdaddr,
- sent->bdaddr_type);
+ return rp->status;
}
-static void hci_cc_le_del_from_white_list(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_del_from_accept_list(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_cp_le_del_from_white_list *sent;
- __u8 status = *((__u8 *) skb->data);
+ struct hci_cp_le_del_from_accept_list *sent;
+ struct hci_ev_status *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
- sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_WHITE_LIST);
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_ACCEPT_LIST);
if (!sent)
- return;
+ return rp->status;
- hci_bdaddr_list_del(&hdev->le_white_list, &sent->bdaddr,
+ hci_dev_lock(hdev);
+ hci_bdaddr_list_del(&hdev->le_accept_list, &sent->bdaddr,
sent->bdaddr_type);
+ hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_le_read_supported_states(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_read_supported_states(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_le_read_supported_states *rp = (void *) skb->data;
+ struct hci_rp_le_read_supported_states *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
memcpy(hdev->le_states, rp->le_states, 8);
+
+ return rp->status;
}
-static void hci_cc_le_read_def_data_len(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_read_def_data_len(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_le_read_def_data_len *rp = (void *) skb->data;
+ struct hci_rp_le_read_def_data_len *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
hdev->le_def_tx_len = le16_to_cpu(rp->tx_len);
hdev->le_def_tx_time = le16_to_cpu(rp->tx_time);
+
+ return rp->status;
}
-static void hci_cc_le_write_def_data_len(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_write_def_data_len(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
struct hci_cp_le_write_def_data_len *sent;
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
sent = hci_sent_cmd_data(hdev, HCI_OP_LE_WRITE_DEF_DATA_LEN);
if (!sent)
- return;
+ return rp->status;
hdev->le_def_tx_len = le16_to_cpu(sent->tx_len);
hdev->le_def_tx_time = le16_to_cpu(sent->tx_time);
+
+ return rp->status;
}
-static void hci_cc_le_clear_resolv_list(struct hci_dev *hdev,
+static u8 hci_cc_le_add_to_resolv_list(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_cp_le_add_to_resolv_list *sent;
+ struct hci_ev_status *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_RESOLV_LIST);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+ hci_bdaddr_list_add_with_irk(&hdev->le_resolv_list, &sent->bdaddr,
+ sent->bdaddr_type, sent->peer_irk,
+ sent->local_irk);
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_del_from_resolv_list(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_del_from_resolv_list *sent;
+ struct hci_ev_status *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ sent = hci_sent_cmd_data(hdev, HCI_OP_LE_DEL_FROM_RESOLV_LIST);
+ if (!sent)
+ return rp->status;
+
+ hci_dev_lock(hdev);
+ hci_bdaddr_list_del_with_irk(&hdev->le_resolv_list, &sent->bdaddr,
+ sent->bdaddr_type);
+ hci_dev_unlock(hdev);
+
+ return rp->status;
+}
+
+static u8 hci_cc_le_clear_resolv_list(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
+
+ if (rp->status)
+ return rp->status;
+
+ hci_dev_lock(hdev);
hci_bdaddr_list_clear(&hdev->le_resolv_list);
+ hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_le_read_resolv_list_size(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_read_resolv_list_size(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_le_read_resolv_list_size *rp = (void *) skb->data;
+ struct hci_rp_le_read_resolv_list_size *rp = data;
- BT_DBG("%s status 0x%2.2x size %u", hdev->name, rp->status, rp->size);
+ bt_dev_dbg(hdev, "status 0x%2.2x size %u", rp->status, rp->size);
if (rp->status)
- return;
+ return rp->status;
hdev->le_resolv_list_size = rp->size;
+
+ return rp->status;
}
-static void hci_cc_le_set_addr_resolution_enable(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_set_addr_resolution_enable(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 *sent, status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
+ __u8 *sent;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
sent = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE);
if (!sent)
- return;
+ return rp->status;
hci_dev_lock(hdev);
@@ -1502,38 +2074,42 @@ static void hci_cc_le_set_addr_resolution_enable(struct hci_dev *hdev,
hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION);
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_le_read_max_data_len(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_le_read_max_data_len(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_le_read_max_data_len *rp = (void *) skb->data;
+ struct hci_rp_le_read_max_data_len *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
hdev->le_max_tx_len = le16_to_cpu(rp->tx_len);
hdev->le_max_tx_time = le16_to_cpu(rp->tx_time);
hdev->le_max_rx_len = le16_to_cpu(rp->rx_len);
hdev->le_max_rx_time = le16_to_cpu(rp->rx_time);
+
+ return rp->status;
}
-static void hci_cc_write_le_host_supported(struct hci_dev *hdev,
- struct sk_buff *skb)
+static u8 hci_cc_write_le_host_supported(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
struct hci_cp_write_le_host_supported *sent;
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
sent = hci_sent_cmd_data(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED);
if (!sent)
- return;
+ return rp->status;
hci_dev_lock(hdev);
@@ -1552,67 +2128,42 @@ static void hci_cc_write_le_host_supported(struct hci_dev *hdev,
hdev->features[1][0] &= ~LMP_HOST_LE_BREDR;
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_set_adv_param(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_set_adv_param(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
struct hci_cp_le_set_adv_param *cp;
- u8 status = *((u8 *) skb->data);
+ struct hci_ev_status *rp = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_PARAM);
if (!cp)
- return;
+ return rp->status;
hci_dev_lock(hdev);
hdev->adv_addr_type = cp->own_address_type;
hci_dev_unlock(hdev);
-}
-
-static void hci_cc_set_ext_adv_param(struct hci_dev *hdev, struct sk_buff *skb)
-{
- struct hci_rp_le_set_ext_adv_params *rp = (void *) skb->data;
- struct hci_cp_le_set_ext_adv_params *cp;
- struct adv_info *adv_instance;
-
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
-
- if (rp->status)
- return;
- cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS);
- if (!cp)
- return;
-
- hci_dev_lock(hdev);
- hdev->adv_addr_type = cp->own_addr_type;
- if (!hdev->cur_adv_instance) {
- /* Store in hdev for instance 0 */
- hdev->adv_tx_power = rp->tx_power;
- } else {
- adv_instance = hci_find_adv_instance(hdev,
- hdev->cur_adv_instance);
- if (adv_instance)
- adv_instance->tx_power = rp->tx_power;
- }
- /* Update adv data as tx power is known now */
- hci_req_update_adv_data(hdev, hdev->cur_adv_instance);
- hci_dev_unlock(hdev);
+ return rp->status;
}
-static void hci_cc_read_rssi(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_read_rssi(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_rp_read_rssi *rp = (void *) skb->data;
+ struct hci_rp_read_rssi *rp = data;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
hci_dev_lock(hdev);
@@ -1621,22 +2172,25 @@ static void hci_cc_read_rssi(struct hci_dev *hdev, struct sk_buff *skb)
conn->rssi = rp->rssi;
hci_dev_unlock(hdev);
+
+ return rp->status;
}
-static void hci_cc_read_tx_power(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_read_tx_power(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
struct hci_cp_read_tx_power *sent;
- struct hci_rp_read_tx_power *rp = (void *) skb->data;
+ struct hci_rp_read_tx_power *rp = data;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
if (rp->status)
- return;
+ return rp->status;
sent = hci_sent_cmd_data(hdev, HCI_OP_READ_TX_POWER);
if (!sent)
- return;
+ return rp->status;
hci_dev_lock(hdev);
@@ -1655,33 +2209,36 @@ static void hci_cc_read_tx_power(struct hci_dev *hdev, struct sk_buff *skb)
unlock:
hci_dev_unlock(hdev);
+ return rp->status;
}
-static void hci_cc_write_ssp_debug_mode(struct hci_dev *hdev, struct sk_buff *skb)
+static u8 hci_cc_write_ssp_debug_mode(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- u8 status = *((u8 *) skb->data);
+ struct hci_ev_status *rp = data;
u8 *mode;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- if (status)
- return;
+ if (rp->status)
+ return rp->status;
mode = hci_sent_cmd_data(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE);
if (mode)
hdev->ssp_debug_mode = *mode;
+
+ return rp->status;
}
static void hci_cs_inquiry(struct hci_dev *hdev, __u8 status)
{
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
- if (status) {
- hci_conn_check_pending(hdev);
+ if (status)
return;
- }
- set_bit(HCI_INQUIRY, &hdev->flags);
+ if (hci_sent_cmd_data(hdev, HCI_OP_INQUIRY))
+ set_bit(HCI_INQUIRY, &hdev->flags);
}
static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
@@ -1689,7 +2246,7 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
struct hci_cp_create_conn *cp;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_CONN);
if (!cp)
@@ -1699,23 +2256,20 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr);
- BT_DBG("%s bdaddr %pMR hcon %p", hdev->name, &cp->bdaddr, conn);
+ bt_dev_dbg(hdev, "bdaddr %pMR hcon %p", &cp->bdaddr, conn);
if (status) {
if (conn && conn->state == BT_CONNECT) {
- if (status != 0x0c || conn->attempt > 2) {
- conn->state = BT_CLOSED;
- hci_connect_cfm(conn, status);
- hci_conn_del(conn);
- } else
- conn->state = BT_CONNECT2;
+ conn->state = BT_CLOSED;
+ hci_connect_cfm(conn, status);
+ hci_conn_del(conn);
}
} else {
if (!conn) {
- conn = hci_conn_add(hdev, ACL_LINK, &cp->bdaddr,
- HCI_ROLE_MASTER);
- if (!conn)
- bt_dev_err(hdev, "no memory for new connection");
+ conn = hci_conn_add_unset(hdev, ACL_LINK, &cp->bdaddr,
+ 0, HCI_ROLE_MASTER);
+ if (IS_ERR(conn))
+ bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn));
}
}
@@ -1725,10 +2279,11 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
static void hci_cs_add_sco(struct hci_dev *hdev, __u8 status)
{
struct hci_cp_add_sco *cp;
- struct hci_conn *acl, *sco;
+ struct hci_conn *acl;
+ struct hci_link *link;
__u16 handle;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
if (!status)
return;
@@ -1739,18 +2294,19 @@ static void hci_cs_add_sco(struct hci_dev *hdev, __u8 status)
handle = __le16_to_cpu(cp->handle);
- BT_DBG("%s handle 0x%4.4x", hdev->name, handle);
+ bt_dev_dbg(hdev, "handle 0x%4.4x", handle);
hci_dev_lock(hdev);
acl = hci_conn_hash_lookup_handle(hdev, handle);
if (acl) {
- sco = acl->link;
- if (sco) {
- sco->state = BT_CLOSED;
+ link = list_first_entry_or_null(&acl->link_list,
+ struct hci_link, list);
+ if (link && link->conn) {
+ link->conn->state = BT_CLOSED;
- hci_connect_cfm(sco, status);
- hci_conn_del(sco);
+ hci_connect_cfm(link->conn, status);
+ hci_conn_del(link->conn);
}
}
@@ -1762,7 +2318,7 @@ static void hci_cs_auth_requested(struct hci_dev *hdev, __u8 status)
struct hci_cp_auth_requested *cp;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
if (!status)
return;
@@ -1789,7 +2345,7 @@ static void hci_cs_set_conn_encrypt(struct hci_dev *hdev, __u8 status)
struct hci_cp_set_conn_encrypt *cp;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
if (!status)
return;
@@ -1856,6 +2412,12 @@ static bool hci_resolve_next_name(struct hci_dev *hdev)
if (list_empty(&discov->resolve))
return false;
+ /* We should stop if we already spent too much time resolving names. */
+ if (time_after(jiffies, discov->name_resolve_timeout)) {
+ bt_dev_warn_ratelimited(hdev, "Name resolve takes too long.");
+ return false;
+ }
+
e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, NAME_NEEDED);
if (!e)
return false;
@@ -1879,10 +2441,8 @@ static void hci_check_pending_name(struct hci_dev *hdev, struct hci_conn *conn,
* Only those in BT_CONFIG or BT_CONNECTED states can be
* considered connected.
*/
- if (conn &&
- (conn->state == BT_CONFIG || conn->state == BT_CONNECTED) &&
- !test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags))
- mgmt_device_connected(hdev, conn, 0, name, name_len);
+ if (conn && (conn->state == BT_CONFIG || conn->state == BT_CONNECTED))
+ mgmt_device_connected(hdev, conn, name, name_len);
if (discov->state == DISCOVERY_STOPPED)
return;
@@ -1902,13 +2462,10 @@ static void hci_check_pending_name(struct hci_dev *hdev, struct hci_conn *conn,
return;
list_del(&e->list);
- if (name) {
- e->name_state = NAME_KNOWN;
- mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00,
- e->data.rssi, name, name_len);
- } else {
- e->name_state = NAME_NOT_KNOWN;
- }
+
+ e->name_state = name ? NAME_KNOWN : NAME_NOT_KNOWN;
+ mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00, e->data.rssi,
+ name, name_len);
if (hci_resolve_next_name(hdev))
return;
@@ -1922,7 +2479,7 @@ static void hci_cs_remote_name_req(struct hci_dev *hdev, __u8 status)
struct hci_cp_remote_name_req *cp;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
/* If successful wait for the name req complete event before
* checking for the need to do authentication */
@@ -1965,7 +2522,7 @@ static void hci_cs_read_remote_features(struct hci_dev *hdev, __u8 status)
struct hci_cp_read_remote_features *cp;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
if (!status)
return;
@@ -1992,7 +2549,7 @@ static void hci_cs_read_remote_ext_features(struct hci_dev *hdev, __u8 status)
struct hci_cp_read_remote_ext_features *cp;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
if (!status)
return;
@@ -2014,13 +2571,36 @@ static void hci_cs_read_remote_ext_features(struct hci_dev *hdev, __u8 status)
hci_dev_unlock(hdev);
}
+static void hci_setup_sync_conn_status(struct hci_dev *hdev, __u16 handle,
+ __u8 status)
+{
+ struct hci_conn *acl;
+ struct hci_link *link;
+
+ bt_dev_dbg(hdev, "handle 0x%4.4x status 0x%2.2x", handle, status);
+
+ hci_dev_lock(hdev);
+
+ acl = hci_conn_hash_lookup_handle(hdev, handle);
+ if (acl) {
+ link = list_first_entry_or_null(&acl->link_list,
+ struct hci_link, list);
+ if (link && link->conn) {
+ link->conn->state = BT_CLOSED;
+
+ hci_connect_cfm(link->conn, status);
+ hci_conn_del(link->conn);
+ }
+ }
+
+ hci_dev_unlock(hdev);
+}
+
static void hci_cs_setup_sync_conn(struct hci_dev *hdev, __u8 status)
{
struct hci_cp_setup_sync_conn *cp;
- struct hci_conn *acl, *sco;
- __u16 handle;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
if (!status)
return;
@@ -2029,24 +2609,23 @@ static void hci_cs_setup_sync_conn(struct hci_dev *hdev, __u8 status)
if (!cp)
return;
- handle = __le16_to_cpu(cp->handle);
+ hci_setup_sync_conn_status(hdev, __le16_to_cpu(cp->handle), status);
+}
- BT_DBG("%s handle 0x%4.4x", hdev->name, handle);
+static void hci_cs_enhanced_setup_sync_conn(struct hci_dev *hdev, __u8 status)
+{
+ struct hci_cp_enhanced_setup_sync_conn *cp;
- hci_dev_lock(hdev);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
- acl = hci_conn_hash_lookup_handle(hdev, handle);
- if (acl) {
- sco = acl->link;
- if (sco) {
- sco->state = BT_CLOSED;
+ if (!status)
+ return;
- hci_connect_cfm(sco, status);
- hci_conn_del(sco);
- }
- }
+ cp = hci_sent_cmd_data(hdev, HCI_OP_ENHANCED_SETUP_SYNC_CONN);
+ if (!cp)
+ return;
- hci_dev_unlock(hdev);
+ hci_setup_sync_conn_status(hdev, __le16_to_cpu(cp->handle), status);
}
static void hci_cs_sniff_mode(struct hci_dev *hdev, __u8 status)
@@ -2054,7 +2633,7 @@ static void hci_cs_sniff_mode(struct hci_dev *hdev, __u8 status)
struct hci_cp_sniff_mode *cp;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
if (!status)
return;
@@ -2081,7 +2660,7 @@ static void hci_cs_exit_sniff_mode(struct hci_dev *hdev, __u8 status)
struct hci_cp_exit_sniff_mode *cp;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
if (!status)
return;
@@ -2106,9 +2685,16 @@ static void hci_cs_exit_sniff_mode(struct hci_dev *hdev, __u8 status)
static void hci_cs_disconnect(struct hci_dev *hdev, u8 status)
{
struct hci_cp_disconnect *cp;
+ struct hci_conn_params *params;
struct hci_conn *conn;
+ bool mgmt_conn;
- if (!status)
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+ /* Wait for HCI_EV_DISCONN_COMPLETE if status 0x00 and not suspended
+ * otherwise cleanup the connection immediately.
+ */
+ if (!status && !hdev->suspended)
return;
cp = hci_sent_cmd_data(hdev, HCI_OP_DISCONNECT);
@@ -2118,13 +2704,94 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status)
hci_dev_lock(hdev);
conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
- if (conn)
+ if (!conn)
+ goto unlock;
+
+ if (status && status != HCI_ERROR_UNKNOWN_CONN_ID) {
mgmt_disconnect_failed(hdev, &conn->dst, conn->type,
conn->dst_type, status);
+ if (conn->type == LE_LINK && conn->role == HCI_ROLE_SLAVE) {
+ hdev->cur_adv_instance = conn->adv_instance;
+ hci_enable_advertising(hdev);
+ }
+
+ /* Inform sockets conn is gone before we delete it */
+ hci_disconn_cfm(conn, HCI_ERROR_UNSPECIFIED);
+
+ goto done;
+ }
+
+ /* During suspend, mark connection as closed immediately
+ * since we might not receive HCI_EV_DISCONN_COMPLETE
+ */
+ if (hdev->suspended)
+ conn->state = BT_CLOSED;
+
+ mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags);
+
+ if (conn->type == ACL_LINK) {
+ if (test_and_clear_bit(HCI_CONN_FLUSH_KEY, &conn->flags))
+ hci_remove_link_key(hdev, &conn->dst);
+ }
+
+ params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
+ if (params) {
+ switch (params->auto_connect) {
+ case HCI_AUTO_CONN_LINK_LOSS:
+ if (cp->reason != HCI_ERROR_CONNECTION_TIMEOUT)
+ break;
+ fallthrough;
+
+ case HCI_AUTO_CONN_DIRECT:
+ case HCI_AUTO_CONN_ALWAYS:
+ hci_pend_le_list_del_init(params);
+ hci_pend_le_list_add(params, &hdev->pend_le_conns);
+ break;
+
+ default:
+ break;
+ }
+ }
+
+ mgmt_device_disconnected(hdev, &conn->dst, conn->type, conn->dst_type,
+ cp->reason, mgmt_conn);
+
+ hci_disconn_cfm(conn, cp->reason);
+
+done:
+ /* If the disconnection failed for any reason, the upper layer
+ * does not retry to disconnect in current implementation.
+ * Hence, we need to do some basic cleanup here and re-enable
+ * advertising if necessary.
+ */
+ hci_conn_del(conn);
+unlock:
hci_dev_unlock(hdev);
}
+static u8 ev_bdaddr_type(struct hci_dev *hdev, u8 type, bool *resolved)
+{
+ /* When using controller based address resolution, then the new
+ * address types 0x02 and 0x03 are used. These types need to be
+ * converted back into either public address or random address type
+ */
+ switch (type) {
+ case ADDR_LE_DEV_PUBLIC_RESOLVED:
+ if (resolved)
+ *resolved = true;
+ return ADDR_LE_DEV_PUBLIC;
+ case ADDR_LE_DEV_RANDOM_RESOLVED:
+ if (resolved)
+ *resolved = true;
+ return ADDR_LE_DEV_RANDOM;
+ }
+
+ if (resolved)
+ *resolved = false;
+ return type;
+}
+
static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr,
u8 peer_addr_type, u8 own_address_type,
u8 filter_policy)
@@ -2136,6 +2803,8 @@ static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr,
if (!conn)
return;
+ own_address_type = ev_bdaddr_type(hdev, own_address_type, NULL);
+
/* Store the initiator and responder address information which
* is needed for SMP. These values will not change during the
* lifetime of the connection.
@@ -2148,26 +2817,16 @@ static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr,
conn->resp_addr_type = peer_addr_type;
bacpy(&conn->resp_addr, peer_addr);
-
- /* We don't want the connection attempt to stick around
- * indefinitely since LE doesn't have a page timeout concept
- * like BR/EDR. Set a timer for any connection that doesn't use
- * the white list for connecting.
- */
- if (filter_policy == HCI_LE_USE_PEER_ADDR)
- queue_delayed_work(conn->hdev->workqueue,
- &conn->le_conn_timeout,
- conn->conn_timeout);
}
static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status)
{
struct hci_cp_le_create_conn *cp;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
/* All connection failure handling is taken care of by the
- * hci_le_conn_failed function which is triggered by the HCI
+ * hci_conn_failed function which is triggered by the HCI
* request completion callbacks used for connecting.
*/
if (status)
@@ -2189,10 +2848,10 @@ static void hci_cs_le_ext_create_conn(struct hci_dev *hdev, u8 status)
{
struct hci_cp_le_ext_create_conn *cp;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
/* All connection failure handling is taken care of by the
- * hci_le_conn_failed function which is triggered by the HCI
+ * hci_conn_failed function which is triggered by the HCI
* request completion callbacks used for connecting.
*/
if (status)
@@ -2215,7 +2874,7 @@ static void hci_cs_le_read_remote_features(struct hci_dev *hdev, u8 status)
struct hci_cp_le_read_remote_features *cp;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
if (!status)
return;
@@ -2227,12 +2886,8 @@ static void hci_cs_le_read_remote_features(struct hci_dev *hdev, u8 status)
hci_dev_lock(hdev);
conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
- if (conn) {
- if (conn->state == BT_CONFIG) {
- hci_connect_cfm(conn, status);
- hci_conn_drop(conn);
- }
- }
+ if (conn && conn->state == BT_CONFIG)
+ hci_connect_cfm(conn, status);
hci_dev_unlock(hdev);
}
@@ -2242,7 +2897,7 @@ static void hci_cs_le_start_enc(struct hci_dev *hdev, u8 status)
struct hci_cp_le_start_enc *cp;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
if (!status)
return;
@@ -2290,15 +2945,14 @@ static void hci_cs_switch_role(struct hci_dev *hdev, u8 status)
hci_dev_unlock(hdev);
}
-static void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_inquiry_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- __u8 status = *((__u8 *) skb->data);
+ struct hci_ev_status *ev = data;
struct discovery_state *discov = &hdev->discovery;
struct inquiry_entry *e;
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
-
- hci_conn_check_pending(hdev);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
if (!test_and_clear_bit(HCI_INQUIRY, &hdev->flags))
return;
@@ -2323,7 +2977,7 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
* state to indicate completion.
*/
if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) ||
- !test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks))
+ !hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY))
hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
goto unlock;
}
@@ -2332,6 +2986,7 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
if (e && hci_resolve_name(hdev, e) == 0) {
e->name_state = NAME_PENDING;
hci_discovery_set_state(hdev, DISCOVERY_RESOLVING);
+ discov->name_resolve_timeout = jiffies + NAME_RESOLVE_DURATION;
} else {
/* When BR/EDR inquiry is active and no LE scanning is in
* progress, then change discovery state to indicate completion.
@@ -2341,7 +2996,7 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
* state to indicate completion.
*/
if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) ||
- !test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks))
+ !hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY))
hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
}
@@ -2349,15 +3004,20 @@ unlock:
hci_dev_unlock(hdev);
}
-static void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_inquiry_result_evt(struct hci_dev *hdev, void *edata,
+ struct sk_buff *skb)
{
+ struct hci_ev_inquiry_result *ev = edata;
struct inquiry_data data;
- struct inquiry_info *info = (void *) (skb->data + 1);
- int num_rsp = *((__u8 *) skb->data);
+ int i;
+
+ if (!hci_ev_skb_pull(hdev, skb, HCI_EV_INQUIRY_RESULT,
+ flex_array_size(ev, info, ev->num)))
+ return;
- BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+ bt_dev_dbg(hdev, "num %d", ev->num);
- if (!num_rsp)
+ if (!ev->num)
return;
if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ))
@@ -2365,7 +3025,8 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_lock(hdev);
- for (; num_rsp; num_rsp--, info++) {
+ for (i = 0; i < ev->num; i++) {
+ struct inquiry_info *info = &ev->info[i];
u32 flags;
bacpy(&data.bdaddr, &info->bdaddr);
@@ -2381,35 +3042,117 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb)
mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00,
info->dev_class, HCI_RSSI_INVALID,
- flags, NULL, 0, NULL, 0);
+ flags, NULL, 0, NULL, 0, 0);
}
hci_dev_unlock(hdev);
}
-static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static int hci_read_enc_key_size(struct hci_dev *hdev, struct hci_conn *conn)
{
- struct hci_ev_conn_complete *ev = (void *) skb->data;
+ struct hci_cp_read_enc_key_size cp;
+ u8 *key_enc_size = hci_conn_key_enc_size(conn);
+
+ if (!read_key_size_capable(hdev)) {
+ conn->enc_key_size = HCI_LINK_KEY_SIZE;
+ return -EOPNOTSUPP;
+ }
+
+ bt_dev_dbg(hdev, "hcon %p", conn);
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = cpu_to_le16(conn->handle);
+
+ /* If the key enc_size is already known, use it as conn->enc_key_size,
+ * otherwise use hdev->min_enc_key_size so the likes of
+ * l2cap_check_enc_key_size don't fail while waiting for
+ * HCI_OP_READ_ENC_KEY_SIZE response.
+ */
+ if (key_enc_size && *key_enc_size)
+ conn->enc_key_size = *key_enc_size;
+ else
+ conn->enc_key_size = hdev->min_enc_key_size;
+
+ return hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE, sizeof(cp), &cp);
+}
+
+static void hci_conn_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_conn_complete *ev = data;
struct hci_conn *conn;
+ u8 status = ev->status;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
hci_dev_lock(hdev);
+ /* Check for existing connection:
+ *
+ * 1. If it doesn't exist then it must be receiver/slave role.
+ * 2. If it does exist confirm that it is connecting/BT_CONNECT in case
+ * of initiator/master role since there could be a collision where
+ * either side is attempting to connect, or something like fuzz
+ * testing is trying to play tricks to destroy the hcon object before
+ * it even attempts to connect (e.g. hcon->state == BT_OPEN).
+ */
conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr);
- if (!conn) {
- if (ev->link_type != SCO_LINK)
+ if (!conn ||
+ (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) {
+ /* In case of error status and there is no connection pending
+ * just unlock as there is nothing to cleanup.
+ */
+ if (ev->status)
goto unlock;
- conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK, &ev->bdaddr);
- if (!conn)
- goto unlock;
+ /* Connection may not exist if auto-connected. Check the bredr
+ * allowlist to see if this device is allowed to auto connect.
+ * If link is an ACL type, create a connection class
+ * automatically.
+ *
+ * Auto-connect will only occur if the event filter is
+ * programmed with a given address. Right now, event filter is
+ * only used during suspend.
+ */
+ if (ev->link_type == ACL_LINK &&
+ hci_bdaddr_list_lookup_with_flags(&hdev->accept_list,
+ &ev->bdaddr,
+ BDADDR_BREDR)) {
+ conn = hci_conn_add_unset(hdev, ev->link_type,
+ &ev->bdaddr, 0,
+ HCI_ROLE_SLAVE);
+ if (IS_ERR(conn)) {
+ bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn));
+ goto unlock;
+ }
+ } else {
+ if (ev->link_type != SCO_LINK)
+ goto unlock;
+
+ conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK,
+ &ev->bdaddr);
+ if (!conn)
+ goto unlock;
- conn->type = SCO_LINK;
+ conn->type = SCO_LINK;
+ }
}
- if (!ev->status) {
- conn->handle = __le16_to_cpu(ev->handle);
+ /* The HCI_Connection_Complete event is only sent once per connection.
+ * Processing it more than once per connection can corrupt kernel memory.
+ *
+ * As the connection handle is set here for the first time, it indicates
+ * whether the connection is already set up.
+ */
+ if (!HCI_CONN_HANDLE_UNSET(conn->handle)) {
+ bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for existing connection");
+ goto unlock;
+ }
+
+ if (!status) {
+ status = hci_conn_set_handle(conn, __le16_to_cpu(ev->handle));
+ if (status)
+ goto done;
if (conn->type == ACL_LINK) {
conn->state = BT_CONFIG;
@@ -2432,6 +3175,19 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
if (test_bit(HCI_ENCRYPT, &hdev->flags))
set_bit(HCI_CONN_ENCRYPT, &conn->flags);
+ /* "Link key request" completed before "connect request" completed */
+ if (ev->encr_mode == 1 && !test_bit(HCI_CONN_ENCRYPT, &conn->flags) &&
+ ev->link_type == ACL_LINK) {
+ struct link_key *key;
+
+ key = hci_find_link_key(hdev, &ev->bdaddr);
+ if (key) {
+ set_bit(HCI_CONN_ENCRYPT, &conn->flags);
+ hci_read_enc_key_size(hdev, conn);
+ hci_encrypt_cfm(conn, ev->status);
+ }
+ }
+
/* Get remote features */
if (conn->type == ACL_LINK) {
struct hci_cp_read_remote_features cp;
@@ -2439,7 +3195,7 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_send_cmd(hdev, HCI_OP_READ_REMOTE_FEATURES,
sizeof(cp), &cp);
- hci_req_update_scan(hdev);
+ hci_update_scan(hdev);
}
/* Set packet type for incoming connection */
@@ -2450,26 +3206,27 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_send_cmd(hdev, HCI_OP_CHANGE_CONN_PTYPE, sizeof(cp),
&cp);
}
- } else {
- conn->state = BT_CLOSED;
- if (conn->type == ACL_LINK)
- mgmt_connect_failed(hdev, &conn->dst, conn->type,
- conn->dst_type, ev->status);
}
if (conn->type == ACL_LINK)
hci_sco_setup(conn, ev->status);
- if (ev->status) {
- hci_connect_cfm(conn, ev->status);
- hci_conn_del(conn);
- } else if (ev->link_type != ACL_LINK)
- hci_connect_cfm(conn, ev->status);
+done:
+ if (status) {
+ hci_conn_failed(conn, status);
+ } else if (ev->link_type == SCO_LINK) {
+ switch (conn->setting & SCO_AIRMODE_MASK) {
+ case SCO_AIRMODE_CVSD:
+ if (hdev->notify)
+ hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_CVSD);
+ break;
+ }
+
+ hci_connect_cfm(conn, status);
+ }
unlock:
hci_dev_unlock(hdev);
-
- hci_conn_check_pending(hdev);
}
static void hci_reject_conn(struct hci_dev *hdev, bdaddr_t *bdaddr)
@@ -2481,16 +3238,26 @@ static void hci_reject_conn(struct hci_dev *hdev, bdaddr_t *bdaddr)
hci_send_cmd(hdev, HCI_OP_REJECT_CONN_REQ, sizeof(cp), &cp);
}
-static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_conn_request_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_conn_request *ev = (void *) skb->data;
+ struct hci_ev_conn_request *ev = data;
int mask = hdev->link_mode;
struct inquiry_entry *ie;
struct hci_conn *conn;
__u8 flags = 0;
- BT_DBG("%s bdaddr %pMR type 0x%x", hdev->name, &ev->bdaddr,
- ev->link_type);
+ bt_dev_dbg(hdev, "bdaddr %pMR type 0x%x", &ev->bdaddr, ev->link_type);
+
+ /* Reject incoming connections from a device using our own BD_ADDR to
+ * mitigate CVE-2020-26555
+ */
+ if (hdev && !bacmp(&hdev->bdaddr, &ev->bdaddr)) {
+ bt_dev_dbg(hdev, "Reject connection with same BD_ADDR %pMR\n",
+ &ev->bdaddr);
+ hci_reject_conn(hdev, &ev->bdaddr);
+ return;
+ }
mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ev->link_type,
&flags);
@@ -2500,28 +3267,28 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
return;
}
- if (hci_bdaddr_list_lookup(&hdev->blacklist, &ev->bdaddr,
+ hci_dev_lock(hdev);
+
+ if (hci_bdaddr_list_lookup(&hdev->reject_list, &ev->bdaddr,
BDADDR_BREDR)) {
hci_reject_conn(hdev, &ev->bdaddr);
- return;
+ goto unlock;
}
- /* Require HCI_CONNECTABLE or a whitelist entry to accept the
+ /* Require HCI_CONNECTABLE or an accept list entry to accept the
* connection. These features are only touched through mgmt so
* only do the checks if HCI_MGMT is set.
*/
if (hci_dev_test_flag(hdev, HCI_MGMT) &&
!hci_dev_test_flag(hdev, HCI_CONNECTABLE) &&
- !hci_bdaddr_list_lookup(&hdev->whitelist, &ev->bdaddr,
- BDADDR_BREDR)) {
- hci_reject_conn(hdev, &ev->bdaddr);
- return;
+ !hci_bdaddr_list_lookup_with_flags(&hdev->accept_list, &ev->bdaddr,
+ BDADDR_BREDR)) {
+ hci_reject_conn(hdev, &ev->bdaddr);
+ goto unlock;
}
/* Connection accepted */
- hci_dev_lock(hdev);
-
ie = hci_inquiry_cache_lookup(hdev, &ev->bdaddr);
if (ie)
memcpy(ie->data.dev_class, ev->dev_class, 3);
@@ -2529,12 +3296,11 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
conn = hci_conn_hash_lookup_ba(hdev, ev->link_type,
&ev->bdaddr);
if (!conn) {
- conn = hci_conn_add(hdev, ev->link_type, &ev->bdaddr,
- HCI_ROLE_SLAVE);
- if (!conn) {
- bt_dev_err(hdev, "no memory for new connection");
- hci_dev_unlock(hdev);
- return;
+ conn = hci_conn_add_unset(hdev, ev->link_type, &ev->bdaddr, 0,
+ HCI_ROLE_SLAVE);
+ if (IS_ERR(conn)) {
+ bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn));
+ goto unlock;
}
}
@@ -2550,9 +3316,9 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
bacpy(&cp.bdaddr, &ev->bdaddr);
if (lmp_rswitch_capable(hdev) && (mask & HCI_LM_MASTER))
- cp.role = 0x00; /* Become master */
+ cp.role = 0x00; /* Become central */
else
- cp.role = 0x01; /* Remain slave */
+ cp.role = 0x01; /* Remain peripheral */
hci_send_cmd(hdev, HCI_OP_ACCEPT_CONN_REQ, sizeof(cp), &cp);
} else if (!(flags & HCI_PROTO_DEFER)) {
@@ -2574,6 +3340,10 @@ static void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
conn->state = BT_CONNECT2;
hci_connect_cfm(conn, 0);
}
+
+ return;
+unlock:
+ hci_dev_unlock(hdev);
}
static u8 hci_to_mgmt_reason(u8 err)
@@ -2592,16 +3362,16 @@ static u8 hci_to_mgmt_reason(u8 err)
}
}
-static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_disconn_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_disconn_complete *ev = (void *) skb->data;
+ struct hci_ev_disconn_complete *ev = data;
u8 reason;
struct hci_conn_params *params;
struct hci_conn *conn;
bool mgmt_connected;
- u8 type;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
@@ -2628,36 +3398,40 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
reason, mgmt_connected);
if (conn->type == ACL_LINK) {
- if (test_bit(HCI_CONN_FLUSH_KEY, &conn->flags))
+ if (test_and_clear_bit(HCI_CONN_FLUSH_KEY, &conn->flags))
hci_remove_link_key(hdev, &conn->dst);
- hci_req_update_scan(hdev);
+ hci_update_scan(hdev);
}
- params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
- if (params) {
- switch (params->auto_connect) {
- case HCI_AUTO_CONN_LINK_LOSS:
- if (ev->reason != HCI_ERROR_CONNECTION_TIMEOUT)
+ /* Re-enable passive scanning if disconnected device is marked
+ * as auto-connectable.
+ */
+ if (conn->type == LE_LINK) {
+ params = hci_conn_params_lookup(hdev, &conn->dst,
+ conn->dst_type);
+ if (params) {
+ switch (params->auto_connect) {
+ case HCI_AUTO_CONN_LINK_LOSS:
+ if (ev->reason != HCI_ERROR_CONNECTION_TIMEOUT)
+ break;
+ fallthrough;
+
+ case HCI_AUTO_CONN_DIRECT:
+ case HCI_AUTO_CONN_ALWAYS:
+ hci_pend_le_list_del_init(params);
+ hci_pend_le_list_add(params,
+ &hdev->pend_le_conns);
+ hci_update_passive_scan(hdev);
break;
- /* Fall through */
-
- case HCI_AUTO_CONN_DIRECT:
- case HCI_AUTO_CONN_ALWAYS:
- list_del_init(&params->action);
- list_add(&params->action, &hdev->pend_le_conns);
- hci_update_background_scan(hdev);
- break;
- default:
- break;
+ default:
+ break;
+ }
}
}
- type = conn->type;
-
hci_disconn_cfm(conn, ev->reason);
- hci_conn_del(conn);
/* Re-enable advertising if necessary, since it might
* have been disabled by the connection. From the
@@ -2669,19 +3443,24 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
* or until a connection is created or until the Advertising
* is timed out due to Directed Advertising."
*/
- if (type == LE_LINK)
- hci_req_reenable_advertising(hdev);
+ if (conn->type == LE_LINK && conn->role == HCI_ROLE_SLAVE) {
+ hdev->cur_adv_instance = conn->adv_instance;
+ hci_enable_advertising(hdev);
+ }
+
+ hci_conn_del(conn);
unlock:
hci_dev_unlock(hdev);
}
-static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_auth_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_auth_complete *ev = (void *) skb->data;
+ struct hci_ev_auth_complete *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
@@ -2691,14 +3470,8 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
if (!ev->status) {
clear_bit(HCI_CONN_AUTH_FAILURE, &conn->flags);
-
- if (!hci_conn_ssp_enabled(conn) &&
- test_bit(HCI_CONN_REAUTH_PEND, &conn->flags)) {
- bt_dev_info(hdev, "re-auth of legacy device is not possible.");
- } else {
- set_bit(HCI_CONN_AUTH, &conn->flags);
- conn->sec_level = conn->pending_sec_level;
- }
+ set_bit(HCI_CONN_AUTH, &conn->flags);
+ conn->sec_level = conn->pending_sec_level;
} else {
if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING)
set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags);
@@ -2707,7 +3480,6 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
}
clear_bit(HCI_CONN_AUTH_PEND, &conn->flags);
- clear_bit(HCI_CONN_REAUTH_PEND, &conn->flags);
if (conn->state == BT_CONFIG) {
if (!ev->status && hci_conn_ssp_enabled(conn)) {
@@ -2738,7 +3510,7 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
&cp);
} else {
clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
- hci_encrypt_cfm(conn, ev->status, 0x00);
+ hci_encrypt_cfm(conn, ev->status);
}
}
@@ -2746,14 +3518,13 @@ unlock:
hci_dev_unlock(hdev);
}
-static void hci_remote_name_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_remote_name_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_remote_name *ev = (void *) skb->data;
+ struct hci_ev_remote_name *ev = data;
struct hci_conn *conn;
- BT_DBG("%s", hdev->name);
-
- hci_conn_check_pending(hdev);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
@@ -2788,68 +3559,13 @@ unlock:
hci_dev_unlock(hdev);
}
-static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status,
- u16 opcode, struct sk_buff *skb)
-{
- const struct hci_rp_read_enc_key_size *rp;
- struct hci_conn *conn;
- u16 handle;
-
- BT_DBG("%s status 0x%02x", hdev->name, status);
-
- if (!skb || skb->len < sizeof(*rp)) {
- bt_dev_err(hdev, "invalid read key size response");
- return;
- }
-
- rp = (void *)skb->data;
- handle = le16_to_cpu(rp->handle);
-
- hci_dev_lock(hdev);
-
- conn = hci_conn_hash_lookup_handle(hdev, handle);
- if (!conn)
- goto unlock;
-
- /* If we fail to read the encryption key size, assume maximum
- * (which is the same we do also when this HCI command isn't
- * supported.
- */
- if (rp->status) {
- bt_dev_err(hdev, "failed to read key size for handle %u",
- handle);
- conn->enc_key_size = HCI_LINK_KEY_SIZE;
- } else {
- conn->enc_key_size = rp->key_size;
- }
-
- if (conn->state == BT_CONFIG) {
- conn->state = BT_CONNECTED;
- hci_connect_cfm(conn, 0);
- hci_conn_drop(conn);
- } else {
- u8 encrypt;
-
- if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags))
- encrypt = 0x00;
- else if (test_bit(HCI_CONN_AES_CCM, &conn->flags))
- encrypt = 0x02;
- else
- encrypt = 0x01;
-
- hci_encrypt_cfm(conn, 0, encrypt);
- }
-
-unlock:
- hci_dev_unlock(hdev);
-}
-
-static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_encrypt_change *ev = (void *) skb->data;
+ struct hci_ev_encrypt_change *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
@@ -2887,76 +3603,71 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
+ /* Check link security requirements are met */
+ if (!hci_conn_check_link_mode(conn))
+ ev->status = HCI_ERROR_AUTH_FAILURE;
+
if (ev->status && conn->state == BT_CONNECTED) {
if (ev->status == HCI_ERROR_PIN_OR_KEY_MISSING)
set_bit(HCI_CONN_AUTH_FAILURE, &conn->flags);
+ /* Notify upper layers so they can cleanup before
+ * disconnecting.
+ */
+ hci_encrypt_cfm(conn, ev->status);
hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE);
hci_conn_drop(conn);
goto unlock;
}
- /* In Secure Connections Only mode, do not allow any connections
- * that are not encrypted with AES-CCM using a P-256 authenticated
- * combination key.
- */
- if (hci_dev_test_flag(hdev, HCI_SC_ONLY) &&
- (!test_bit(HCI_CONN_AES_CCM, &conn->flags) ||
- conn->key_type != HCI_LK_AUTH_COMBINATION_P256)) {
- hci_connect_cfm(conn, HCI_ERROR_AUTH_FAILURE);
- hci_conn_drop(conn);
- goto unlock;
- }
-
/* Try reading the encryption key size for encrypted ACL links */
if (!ev->status && ev->encrypt && conn->type == ACL_LINK) {
- struct hci_cp_read_enc_key_size cp;
- struct hci_request req;
-
- /* Only send HCI_Read_Encryption_Key_Size if the
- * controller really supports it. If it doesn't, assume
- * the default size (16).
- */
- if (!(hdev->commands[20] & 0x10)) {
- conn->enc_key_size = HCI_LINK_KEY_SIZE;
+ if (hci_read_enc_key_size(hdev, conn))
goto notify;
- }
- hci_req_init(&req, hdev);
-
- cp.handle = cpu_to_le16(conn->handle);
- hci_req_add(&req, HCI_OP_READ_ENC_KEY_SIZE, sizeof(cp), &cp);
+ goto unlock;
+ }
- if (hci_req_run_skb(&req, read_enc_key_size_complete)) {
- bt_dev_err(hdev, "sending read key size failed");
- conn->enc_key_size = HCI_LINK_KEY_SIZE;
- goto notify;
- }
+ /* We skip the WRITE_AUTH_PAYLOAD_TIMEOUT for ATS2851 based controllers
+ * to avoid unexpected SMP command errors when pairing.
+ */
+ if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT))
+ goto notify;
+
+ /* Set the default Authenticated Payload Timeout after
+ * an LE Link is established. As per Core Spec v5.0, Vol 2, Part B
+ * Section 3.3, the HCI command WRITE_AUTH_PAYLOAD_TIMEOUT should be
+ * sent when the link is active and Encryption is enabled, the conn
+ * type can be either LE or ACL and controller must support LMP Ping.
+ * Ensure for AES-CCM encryption as well.
+ */
+ if (test_bit(HCI_CONN_ENCRYPT, &conn->flags) &&
+ test_bit(HCI_CONN_AES_CCM, &conn->flags) &&
+ ((conn->type == ACL_LINK && lmp_ping_capable(hdev)) ||
+ (conn->type == LE_LINK && (hdev->le_features[0] & HCI_LE_PING)))) {
+ struct hci_cp_write_auth_payload_to cp;
- goto unlock;
+ cp.handle = cpu_to_le16(conn->handle);
+ cp.timeout = cpu_to_le16(hdev->auth_payload_timeout);
+ if (hci_send_cmd(conn->hdev, HCI_OP_WRITE_AUTH_PAYLOAD_TO,
+ sizeof(cp), &cp))
+ bt_dev_err(hdev, "write auth payload timeout failed");
}
notify:
- if (conn->state == BT_CONFIG) {
- if (!ev->status)
- conn->state = BT_CONNECTED;
-
- hci_connect_cfm(conn, ev->status);
- hci_conn_drop(conn);
- } else
- hci_encrypt_cfm(conn, ev->status, ev->encrypt);
+ hci_encrypt_cfm(conn, ev->status);
unlock:
hci_dev_unlock(hdev);
}
-static void hci_change_link_key_complete_evt(struct hci_dev *hdev,
+static void hci_change_link_key_complete_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_ev_change_link_key_complete *ev = (void *) skb->data;
+ struct hci_ev_change_link_key_complete *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
@@ -2973,13 +3684,13 @@ static void hci_change_link_key_complete_evt(struct hci_dev *hdev,
hci_dev_unlock(hdev);
}
-static void hci_remote_features_evt(struct hci_dev *hdev,
+static void hci_remote_features_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_ev_remote_features *ev = (void *) skb->data;
+ struct hci_ev_remote_features *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
@@ -3003,14 +3714,15 @@ static void hci_remote_features_evt(struct hci_dev *hdev,
goto unlock;
}
- if (!ev->status && !test_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) {
+ if (!ev->status) {
struct hci_cp_remote_name_req cp;
memset(&cp, 0, sizeof(cp));
bacpy(&cp.bdaddr, &conn->dst);
cp.pscan_rep_mode = 0x02;
hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp);
- } else if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags))
- mgmt_device_connected(hdev, conn, 0, NULL, 0);
+ } else {
+ mgmt_device_connected(hdev, conn, NULL, 0);
+ }
if (!hci_outgoing_auth_needed(hdev, conn)) {
conn->state = BT_CONNECTED;
@@ -3022,436 +3734,659 @@ unlock:
hci_dev_unlock(hdev);
}
-static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
- u16 *opcode, u8 *status,
- hci_req_complete_t *req_complete,
- hci_req_complete_skb_t *req_complete_skb)
+static inline void handle_cmd_cnt_and_timer(struct hci_dev *hdev, u8 ncmd)
{
- struct hci_ev_cmd_complete *ev = (void *) skb->data;
-
- *opcode = __le16_to_cpu(ev->opcode);
- *status = skb->data[sizeof(*ev)];
-
- skb_pull(skb, sizeof(*ev));
-
- switch (*opcode) {
- case HCI_OP_INQUIRY_CANCEL:
- hci_cc_inquiry_cancel(hdev, skb);
- break;
-
- case HCI_OP_PERIODIC_INQ:
- hci_cc_periodic_inq(hdev, skb);
- break;
-
- case HCI_OP_EXIT_PERIODIC_INQ:
- hci_cc_exit_periodic_inq(hdev, skb);
- break;
+ cancel_delayed_work(&hdev->cmd_timer);
- case HCI_OP_REMOTE_NAME_REQ_CANCEL:
- hci_cc_remote_name_req_cancel(hdev, skb);
- break;
-
- case HCI_OP_ROLE_DISCOVERY:
- hci_cc_role_discovery(hdev, skb);
- break;
+ rcu_read_lock();
+ if (!test_bit(HCI_RESET, &hdev->flags)) {
+ if (ncmd) {
+ cancel_delayed_work(&hdev->ncmd_timer);
+ atomic_set(&hdev->cmd_cnt, 1);
+ } else {
+ if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE))
+ queue_delayed_work(hdev->workqueue, &hdev->ncmd_timer,
+ HCI_NCMD_TIMEOUT);
+ }
+ }
+ rcu_read_unlock();
+}
- case HCI_OP_READ_LINK_POLICY:
- hci_cc_read_link_policy(hdev, skb);
- break;
+static u8 hci_cc_le_read_buffer_size_v2(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_buffer_size_v2 *rp = data;
- case HCI_OP_WRITE_LINK_POLICY:
- hci_cc_write_link_policy(hdev, skb);
- break;
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- case HCI_OP_READ_DEF_LINK_POLICY:
- hci_cc_read_def_link_policy(hdev, skb);
- break;
+ if (rp->status)
+ return rp->status;
- case HCI_OP_WRITE_DEF_LINK_POLICY:
- hci_cc_write_def_link_policy(hdev, skb);
- break;
+ hdev->le_mtu = __le16_to_cpu(rp->acl_mtu);
+ hdev->le_pkts = rp->acl_max_pkt;
+ hdev->iso_mtu = __le16_to_cpu(rp->iso_mtu);
+ hdev->iso_pkts = rp->iso_max_pkt;
- case HCI_OP_RESET:
- hci_cc_reset(hdev, skb);
- break;
+ hdev->le_cnt = hdev->le_pkts;
+ hdev->iso_cnt = hdev->iso_pkts;
- case HCI_OP_READ_STORED_LINK_KEY:
- hci_cc_read_stored_link_key(hdev, skb);
- break;
+ BT_DBG("%s acl mtu %d:%d iso mtu %d:%d", hdev->name, hdev->acl_mtu,
+ hdev->acl_pkts, hdev->iso_mtu, hdev->iso_pkts);
- case HCI_OP_DELETE_STORED_LINK_KEY:
- hci_cc_delete_stored_link_key(hdev, skb);
- break;
+ if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU)
+ return HCI_ERROR_INVALID_PARAMETERS;
- case HCI_OP_WRITE_LOCAL_NAME:
- hci_cc_write_local_name(hdev, skb);
- break;
+ return rp->status;
+}
- case HCI_OP_READ_LOCAL_NAME:
- hci_cc_read_local_name(hdev, skb);
- break;
+static void hci_unbound_cis_failed(struct hci_dev *hdev, u8 cig, u8 status)
+{
+ struct hci_conn *conn, *tmp;
- case HCI_OP_WRITE_AUTH_ENABLE:
- hci_cc_write_auth_enable(hdev, skb);
- break;
+ lockdep_assert_held(&hdev->lock);
- case HCI_OP_WRITE_ENCRYPT_MODE:
- hci_cc_write_encrypt_mode(hdev, skb);
- break;
+ list_for_each_entry_safe(conn, tmp, &hdev->conn_hash.list, list) {
+ if (conn->type != CIS_LINK ||
+ conn->state == BT_OPEN || conn->iso_qos.ucast.cig != cig)
+ continue;
- case HCI_OP_WRITE_SCAN_ENABLE:
- hci_cc_write_scan_enable(hdev, skb);
- break;
+ if (HCI_CONN_HANDLE_UNSET(conn->handle))
+ hci_conn_failed(conn, status);
+ }
+}
- case HCI_OP_READ_CLASS_OF_DEV:
- hci_cc_read_class_of_dev(hdev, skb);
- break;
+static u8 hci_cc_le_set_cig_params(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_set_cig_params *rp = data;
+ struct hci_cp_le_set_cig_params *cp;
+ struct hci_conn *conn;
+ u8 status = rp->status;
+ bool pending = false;
+ int i;
- case HCI_OP_WRITE_CLASS_OF_DEV:
- hci_cc_write_class_of_dev(hdev, skb);
- break;
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- case HCI_OP_READ_VOICE_SETTING:
- hci_cc_read_voice_setting(hdev, skb);
- break;
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_CIG_PARAMS);
+ if (!rp->status && (!cp || rp->num_handles != cp->num_cis ||
+ rp->cig_id != cp->cig_id)) {
+ bt_dev_err(hdev, "unexpected Set CIG Parameters response data");
+ status = HCI_ERROR_UNSPECIFIED;
+ }
- case HCI_OP_WRITE_VOICE_SETTING:
- hci_cc_write_voice_setting(hdev, skb);
- break;
+ hci_dev_lock(hdev);
- case HCI_OP_READ_NUM_SUPPORTED_IAC:
- hci_cc_read_num_supported_iac(hdev, skb);
- break;
+ /* BLUETOOTH CORE SPECIFICATION Version 5.4 | Vol 4, Part E page 2554
+ *
+ * If the Status return parameter is non-zero, then the state of the CIG
+ * and its CIS configurations shall not be changed by the command. If
+ * the CIG did not already exist, it shall not be created.
+ */
+ if (status) {
+ /* Keep current configuration, fail only the unbound CIS */
+ hci_unbound_cis_failed(hdev, rp->cig_id, status);
+ goto unlock;
+ }
- case HCI_OP_WRITE_SSP_MODE:
- hci_cc_write_ssp_mode(hdev, skb);
- break;
+ /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 2553
+ *
+ * If the Status return parameter is zero, then the Controller shall
+ * set the Connection_Handle arrayed return parameter to the connection
+ * handle(s) corresponding to the CIS configurations specified in
+ * the CIS_IDs command parameter, in the same order.
+ */
+ for (i = 0; i < rp->num_handles; ++i) {
+ conn = hci_conn_hash_lookup_cis(hdev, NULL, 0, rp->cig_id,
+ cp->cis[i].cis_id);
+ if (!conn || !bacmp(&conn->dst, BDADDR_ANY))
+ continue;
- case HCI_OP_WRITE_SC_SUPPORT:
- hci_cc_write_sc_support(hdev, skb);
- break;
+ if (conn->state != BT_BOUND && conn->state != BT_CONNECT)
+ continue;
- case HCI_OP_READ_LOCAL_VERSION:
- hci_cc_read_local_version(hdev, skb);
- break;
+ if (hci_conn_set_handle(conn, __le16_to_cpu(rp->handle[i])))
+ continue;
- case HCI_OP_READ_LOCAL_COMMANDS:
- hci_cc_read_local_commands(hdev, skb);
- break;
+ if (conn->state == BT_CONNECT)
+ pending = true;
+ }
- case HCI_OP_READ_LOCAL_FEATURES:
- hci_cc_read_local_features(hdev, skb);
- break;
+unlock:
+ if (pending)
+ hci_le_create_cis_pending(hdev);
- case HCI_OP_READ_LOCAL_EXT_FEATURES:
- hci_cc_read_local_ext_features(hdev, skb);
- break;
+ hci_dev_unlock(hdev);
- case HCI_OP_READ_BUFFER_SIZE:
- hci_cc_read_buffer_size(hdev, skb);
- break;
+ return rp->status;
+}
- case HCI_OP_READ_BD_ADDR:
- hci_cc_read_bd_addr(hdev, skb);
- break;
+static u8 hci_cc_le_setup_iso_path(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_setup_iso_path *rp = data;
+ struct hci_cp_le_setup_iso_path *cp;
+ struct hci_conn *conn;
- case HCI_OP_READ_PAGE_SCAN_ACTIVITY:
- hci_cc_read_page_scan_activity(hdev, skb);
- break;
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- case HCI_OP_WRITE_PAGE_SCAN_ACTIVITY:
- hci_cc_write_page_scan_activity(hdev, skb);
- break;
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SETUP_ISO_PATH);
+ if (!cp)
+ return rp->status;
- case HCI_OP_READ_PAGE_SCAN_TYPE:
- hci_cc_read_page_scan_type(hdev, skb);
- break;
+ hci_dev_lock(hdev);
- case HCI_OP_WRITE_PAGE_SCAN_TYPE:
- hci_cc_write_page_scan_type(hdev, skb);
- break;
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (!conn)
+ goto unlock;
- case HCI_OP_READ_DATA_BLOCK_SIZE:
- hci_cc_read_data_block_size(hdev, skb);
- break;
+ if (rp->status) {
+ hci_connect_cfm(conn, rp->status);
+ hci_conn_del(conn);
+ goto unlock;
+ }
- case HCI_OP_READ_FLOW_CONTROL_MODE:
- hci_cc_read_flow_control_mode(hdev, skb);
+ switch (cp->direction) {
+ /* Input (Host to Controller) */
+ case 0x00:
+ /* Only confirm connection if output only */
+ if (conn->iso_qos.ucast.out.sdu && !conn->iso_qos.ucast.in.sdu)
+ hci_connect_cfm(conn, rp->status);
break;
+ /* Output (Controller to Host) */
+ case 0x01:
+ /* Confirm connection since conn->iso_qos is always configured
+ * last.
+ */
+ hci_connect_cfm(conn, rp->status);
- case HCI_OP_READ_LOCAL_AMP_INFO:
- hci_cc_read_local_amp_info(hdev, skb);
- break;
+ /* Notify device connected in case it is a BIG Sync */
+ if (!rp->status && test_bit(HCI_CONN_BIG_SYNC, &conn->flags))
+ mgmt_device_connected(hdev, conn, NULL, 0);
- case HCI_OP_READ_CLOCK:
- hci_cc_read_clock(hdev, skb);
break;
+ }
- case HCI_OP_READ_INQ_RSP_TX_POWER:
- hci_cc_read_inq_rsp_tx_power(hdev, skb);
- break;
+unlock:
+ hci_dev_unlock(hdev);
+ return rp->status;
+}
- case HCI_OP_PIN_CODE_REPLY:
- hci_cc_pin_code_reply(hdev, skb);
- break;
+static u8 hci_cc_le_read_all_local_features(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_all_local_features *rp = data;
- case HCI_OP_PIN_CODE_NEG_REPLY:
- hci_cc_pin_code_neg_reply(hdev, skb);
- break;
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- case HCI_OP_READ_LOCAL_OOB_DATA:
- hci_cc_read_local_oob_data(hdev, skb);
- break;
+ if (rp->status)
+ return rp->status;
- case HCI_OP_READ_LOCAL_OOB_EXT_DATA:
- hci_cc_read_local_oob_ext_data(hdev, skb);
- break;
+ memcpy(hdev->le_features, rp->features, 248);
- case HCI_OP_LE_READ_BUFFER_SIZE:
- hci_cc_le_read_buffer_size(hdev, skb);
- break;
+ return rp->status;
+}
- case HCI_OP_LE_READ_LOCAL_FEATURES:
- hci_cc_le_read_local_features(hdev, skb);
- break;
+static void hci_cs_le_create_big(struct hci_dev *hdev, u8 status)
+{
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+}
- case HCI_OP_LE_READ_ADV_TX_POWER:
- hci_cc_le_read_adv_tx_power(hdev, skb);
- break;
+static void hci_cs_le_read_all_remote_features(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_le_read_remote_features *cp;
+ struct hci_conn *conn;
- case HCI_OP_USER_CONFIRM_REPLY:
- hci_cc_user_confirm_reply(hdev, skb);
- break;
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
- case HCI_OP_USER_CONFIRM_NEG_REPLY:
- hci_cc_user_confirm_neg_reply(hdev, skb);
- break;
+ if (!status)
+ return;
- case HCI_OP_USER_PASSKEY_REPLY:
- hci_cc_user_passkey_reply(hdev, skb);
- break;
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_READ_ALL_REMOTE_FEATURES);
+ if (!cp)
+ return;
- case HCI_OP_USER_PASSKEY_NEG_REPLY:
- hci_cc_user_passkey_neg_reply(hdev, skb);
- break;
+ hci_dev_lock(hdev);
- case HCI_OP_LE_SET_RANDOM_ADDR:
- hci_cc_le_set_random_addr(hdev, skb);
- break;
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn && conn->state == BT_CONFIG)
+ hci_connect_cfm(conn, status);
- case HCI_OP_LE_SET_ADV_ENABLE:
- hci_cc_le_set_adv_enable(hdev, skb);
- break;
+ hci_dev_unlock(hdev);
+}
- case HCI_OP_LE_SET_SCAN_PARAM:
- hci_cc_le_set_scan_param(hdev, skb);
- break;
+static u8 hci_cc_set_per_adv_param(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ struct hci_cp_le_set_per_adv_params *cp;
- case HCI_OP_LE_SET_SCAN_ENABLE:
- hci_cc_le_set_scan_enable(hdev, skb);
- break;
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- case HCI_OP_LE_READ_WHITE_LIST_SIZE:
- hci_cc_le_read_white_list_size(hdev, skb);
- break;
+ if (rp->status)
+ return rp->status;
- case HCI_OP_LE_CLEAR_WHITE_LIST:
- hci_cc_le_clear_white_list(hdev, skb);
- break;
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_PER_ADV_PARAMS);
+ if (!cp)
+ return rp->status;
- case HCI_OP_LE_ADD_TO_WHITE_LIST:
- hci_cc_le_add_to_white_list(hdev, skb);
- break;
+ /* TODO: set the conn state */
+ return rp->status;
+}
- case HCI_OP_LE_DEL_FROM_WHITE_LIST:
- hci_cc_le_del_from_white_list(hdev, skb);
- break;
+static u8 hci_cc_le_set_per_adv_enable(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_status *rp = data;
+ struct hci_cp_le_set_per_adv_enable *cp;
+ struct adv_info *adv = NULL, *n;
+ u8 per_adv_cnt = 0;
- case HCI_OP_LE_READ_SUPPORTED_STATES:
- hci_cc_le_read_supported_states(hdev, skb);
- break;
+ bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
- case HCI_OP_LE_READ_DEF_DATA_LEN:
- hci_cc_le_read_def_data_len(hdev, skb);
- break;
+ if (rp->status)
+ return rp->status;
- case HCI_OP_LE_WRITE_DEF_DATA_LEN:
- hci_cc_le_write_def_data_len(hdev, skb);
- break;
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE);
+ if (!cp)
+ return rp->status;
- case HCI_OP_LE_CLEAR_RESOLV_LIST:
- hci_cc_le_clear_resolv_list(hdev, skb);
- break;
+ hci_dev_lock(hdev);
- case HCI_OP_LE_READ_RESOLV_LIST_SIZE:
- hci_cc_le_read_resolv_list_size(hdev, skb);
- break;
+ adv = hci_find_adv_instance(hdev, cp->handle);
- case HCI_OP_LE_SET_ADDR_RESOLV_ENABLE:
- hci_cc_le_set_addr_resolution_enable(hdev, skb);
- break;
+ if (cp->enable) {
+ hci_dev_set_flag(hdev, HCI_LE_PER_ADV);
- case HCI_OP_LE_READ_MAX_DATA_LEN:
- hci_cc_le_read_max_data_len(hdev, skb);
- break;
+ if (adv)
+ adv->periodic_enabled = true;
+ } else {
+ if (adv)
+ adv->periodic_enabled = false;
- case HCI_OP_WRITE_LE_HOST_SUPPORTED:
- hci_cc_write_le_host_supported(hdev, skb);
- break;
+ /* If just one instance was disabled, check if there are
+ * any other instances enabled before clearing HCI_LE_PER_ADV.
+ * The current periodic adv instance will be marked as
+ * disabled once extended advertising is also disabled.
+ */
+ list_for_each_entry_safe(adv, n, &hdev->adv_instances,
+ list) {
+ if (adv->periodic && adv->enabled)
+ per_adv_cnt++;
+ }
- case HCI_OP_LE_SET_ADV_PARAM:
- hci_cc_set_adv_param(hdev, skb);
- break;
+ if (per_adv_cnt > 1)
+ goto unlock;
- case HCI_OP_READ_RSSI:
- hci_cc_read_rssi(hdev, skb);
- break;
+ hci_dev_clear_flag(hdev, HCI_LE_PER_ADV);
+ }
- case HCI_OP_READ_TX_POWER:
- hci_cc_read_tx_power(hdev, skb);
- break;
+unlock:
+ hci_dev_unlock(hdev);
- case HCI_OP_WRITE_SSP_DEBUG_MODE:
- hci_cc_write_ssp_debug_mode(hdev, skb);
- break;
+ return rp->status;
+}
+
+#define HCI_CC_VL(_op, _func, _min, _max) \
+{ \
+ .op = _op, \
+ .func = _func, \
+ .min_len = _min, \
+ .max_len = _max, \
+}
+
+#define HCI_CC(_op, _func, _len) \
+ HCI_CC_VL(_op, _func, _len, _len)
+
+#define HCI_CC_STATUS(_op, _func) \
+ HCI_CC(_op, _func, sizeof(struct hci_ev_status))
+
+static const struct hci_cc {
+ u16 op;
+ u8 (*func)(struct hci_dev *hdev, void *data, struct sk_buff *skb);
+ u16 min_len;
+ u16 max_len;
+} hci_cc_table[] = {
+ HCI_CC_STATUS(HCI_OP_INQUIRY_CANCEL, hci_cc_inquiry_cancel),
+ HCI_CC_STATUS(HCI_OP_PERIODIC_INQ, hci_cc_periodic_inq),
+ HCI_CC_STATUS(HCI_OP_EXIT_PERIODIC_INQ, hci_cc_exit_periodic_inq),
+ HCI_CC(HCI_OP_REMOTE_NAME_REQ_CANCEL, hci_cc_remote_name_req_cancel,
+ sizeof(struct hci_rp_remote_name_req_cancel)),
+ HCI_CC(HCI_OP_ROLE_DISCOVERY, hci_cc_role_discovery,
+ sizeof(struct hci_rp_role_discovery)),
+ HCI_CC(HCI_OP_READ_LINK_POLICY, hci_cc_read_link_policy,
+ sizeof(struct hci_rp_read_link_policy)),
+ HCI_CC(HCI_OP_WRITE_LINK_POLICY, hci_cc_write_link_policy,
+ sizeof(struct hci_rp_write_link_policy)),
+ HCI_CC(HCI_OP_READ_DEF_LINK_POLICY, hci_cc_read_def_link_policy,
+ sizeof(struct hci_rp_read_def_link_policy)),
+ HCI_CC_STATUS(HCI_OP_WRITE_DEF_LINK_POLICY,
+ hci_cc_write_def_link_policy),
+ HCI_CC_STATUS(HCI_OP_RESET, hci_cc_reset),
+ HCI_CC(HCI_OP_READ_STORED_LINK_KEY, hci_cc_read_stored_link_key,
+ sizeof(struct hci_rp_read_stored_link_key)),
+ HCI_CC(HCI_OP_DELETE_STORED_LINK_KEY, hci_cc_delete_stored_link_key,
+ sizeof(struct hci_rp_delete_stored_link_key)),
+ HCI_CC_STATUS(HCI_OP_WRITE_LOCAL_NAME, hci_cc_write_local_name),
+ HCI_CC(HCI_OP_READ_LOCAL_NAME, hci_cc_read_local_name,
+ sizeof(struct hci_rp_read_local_name)),
+ HCI_CC_STATUS(HCI_OP_WRITE_AUTH_ENABLE, hci_cc_write_auth_enable),
+ HCI_CC_STATUS(HCI_OP_WRITE_ENCRYPT_MODE, hci_cc_write_encrypt_mode),
+ HCI_CC_STATUS(HCI_OP_WRITE_SCAN_ENABLE, hci_cc_write_scan_enable),
+ HCI_CC_STATUS(HCI_OP_SET_EVENT_FLT, hci_cc_set_event_filter),
+ HCI_CC(HCI_OP_READ_CLASS_OF_DEV, hci_cc_read_class_of_dev,
+ sizeof(struct hci_rp_read_class_of_dev)),
+ HCI_CC_STATUS(HCI_OP_WRITE_CLASS_OF_DEV, hci_cc_write_class_of_dev),
+ HCI_CC(HCI_OP_READ_VOICE_SETTING, hci_cc_read_voice_setting,
+ sizeof(struct hci_rp_read_voice_setting)),
+ HCI_CC_STATUS(HCI_OP_WRITE_VOICE_SETTING, hci_cc_write_voice_setting),
+ HCI_CC(HCI_OP_READ_NUM_SUPPORTED_IAC, hci_cc_read_num_supported_iac,
+ sizeof(struct hci_rp_read_num_supported_iac)),
+ HCI_CC_STATUS(HCI_OP_WRITE_SSP_MODE, hci_cc_write_ssp_mode),
+ HCI_CC_STATUS(HCI_OP_WRITE_SC_SUPPORT, hci_cc_write_sc_support),
+ HCI_CC(HCI_OP_READ_AUTH_PAYLOAD_TO, hci_cc_read_auth_payload_timeout,
+ sizeof(struct hci_rp_read_auth_payload_to)),
+ HCI_CC(HCI_OP_WRITE_AUTH_PAYLOAD_TO, hci_cc_write_auth_payload_timeout,
+ sizeof(struct hci_rp_write_auth_payload_to)),
+ HCI_CC(HCI_OP_READ_LOCAL_VERSION, hci_cc_read_local_version,
+ sizeof(struct hci_rp_read_local_version)),
+ HCI_CC(HCI_OP_READ_LOCAL_COMMANDS, hci_cc_read_local_commands,
+ sizeof(struct hci_rp_read_local_commands)),
+ HCI_CC(HCI_OP_READ_LOCAL_FEATURES, hci_cc_read_local_features,
+ sizeof(struct hci_rp_read_local_features)),
+ HCI_CC(HCI_OP_READ_LOCAL_EXT_FEATURES, hci_cc_read_local_ext_features,
+ sizeof(struct hci_rp_read_local_ext_features)),
+ HCI_CC(HCI_OP_READ_BUFFER_SIZE, hci_cc_read_buffer_size,
+ sizeof(struct hci_rp_read_buffer_size)),
+ HCI_CC(HCI_OP_READ_BD_ADDR, hci_cc_read_bd_addr,
+ sizeof(struct hci_rp_read_bd_addr)),
+ HCI_CC(HCI_OP_READ_LOCAL_PAIRING_OPTS, hci_cc_read_local_pairing_opts,
+ sizeof(struct hci_rp_read_local_pairing_opts)),
+ HCI_CC(HCI_OP_READ_PAGE_SCAN_ACTIVITY, hci_cc_read_page_scan_activity,
+ sizeof(struct hci_rp_read_page_scan_activity)),
+ HCI_CC_STATUS(HCI_OP_WRITE_PAGE_SCAN_ACTIVITY,
+ hci_cc_write_page_scan_activity),
+ HCI_CC(HCI_OP_READ_PAGE_SCAN_TYPE, hci_cc_read_page_scan_type,
+ sizeof(struct hci_rp_read_page_scan_type)),
+ HCI_CC_STATUS(HCI_OP_WRITE_PAGE_SCAN_TYPE, hci_cc_write_page_scan_type),
+ HCI_CC(HCI_OP_READ_CLOCK, hci_cc_read_clock,
+ sizeof(struct hci_rp_read_clock)),
+ HCI_CC(HCI_OP_READ_ENC_KEY_SIZE, hci_cc_read_enc_key_size,
+ sizeof(struct hci_rp_read_enc_key_size)),
+ HCI_CC(HCI_OP_READ_INQ_RSP_TX_POWER, hci_cc_read_inq_rsp_tx_power,
+ sizeof(struct hci_rp_read_inq_rsp_tx_power)),
+ HCI_CC(HCI_OP_READ_DEF_ERR_DATA_REPORTING,
+ hci_cc_read_def_err_data_reporting,
+ sizeof(struct hci_rp_read_def_err_data_reporting)),
+ HCI_CC_STATUS(HCI_OP_WRITE_DEF_ERR_DATA_REPORTING,
+ hci_cc_write_def_err_data_reporting),
+ HCI_CC(HCI_OP_PIN_CODE_REPLY, hci_cc_pin_code_reply,
+ sizeof(struct hci_rp_pin_code_reply)),
+ HCI_CC(HCI_OP_PIN_CODE_NEG_REPLY, hci_cc_pin_code_neg_reply,
+ sizeof(struct hci_rp_pin_code_neg_reply)),
+ HCI_CC(HCI_OP_READ_LOCAL_OOB_DATA, hci_cc_read_local_oob_data,
+ sizeof(struct hci_rp_read_local_oob_data)),
+ HCI_CC(HCI_OP_READ_LOCAL_OOB_EXT_DATA, hci_cc_read_local_oob_ext_data,
+ sizeof(struct hci_rp_read_local_oob_ext_data)),
+ HCI_CC(HCI_OP_LE_READ_BUFFER_SIZE, hci_cc_le_read_buffer_size,
+ sizeof(struct hci_rp_le_read_buffer_size)),
+ HCI_CC(HCI_OP_LE_READ_LOCAL_FEATURES, hci_cc_le_read_local_features,
+ sizeof(struct hci_rp_le_read_local_features)),
+ HCI_CC(HCI_OP_LE_READ_ADV_TX_POWER, hci_cc_le_read_adv_tx_power,
+ sizeof(struct hci_rp_le_read_adv_tx_power)),
+ HCI_CC(HCI_OP_USER_CONFIRM_REPLY, hci_cc_user_confirm_reply,
+ sizeof(struct hci_rp_user_confirm_reply)),
+ HCI_CC(HCI_OP_USER_CONFIRM_NEG_REPLY, hci_cc_user_confirm_neg_reply,
+ sizeof(struct hci_rp_user_confirm_reply)),
+ HCI_CC(HCI_OP_USER_PASSKEY_REPLY, hci_cc_user_passkey_reply,
+ sizeof(struct hci_rp_user_confirm_reply)),
+ HCI_CC(HCI_OP_USER_PASSKEY_NEG_REPLY, hci_cc_user_passkey_neg_reply,
+ sizeof(struct hci_rp_user_confirm_reply)),
+ HCI_CC_STATUS(HCI_OP_LE_SET_RANDOM_ADDR, hci_cc_le_set_random_addr),
+ HCI_CC_STATUS(HCI_OP_LE_SET_ADV_ENABLE, hci_cc_le_set_adv_enable),
+ HCI_CC_STATUS(HCI_OP_LE_SET_SCAN_PARAM, hci_cc_le_set_scan_param),
+ HCI_CC_STATUS(HCI_OP_LE_SET_SCAN_ENABLE, hci_cc_le_set_scan_enable),
+ HCI_CC(HCI_OP_LE_READ_ACCEPT_LIST_SIZE,
+ hci_cc_le_read_accept_list_size,
+ sizeof(struct hci_rp_le_read_accept_list_size)),
+ HCI_CC_STATUS(HCI_OP_LE_CLEAR_ACCEPT_LIST, hci_cc_le_clear_accept_list),
+ HCI_CC_STATUS(HCI_OP_LE_ADD_TO_ACCEPT_LIST,
+ hci_cc_le_add_to_accept_list),
+ HCI_CC_STATUS(HCI_OP_LE_DEL_FROM_ACCEPT_LIST,
+ hci_cc_le_del_from_accept_list),
+ HCI_CC(HCI_OP_LE_READ_SUPPORTED_STATES, hci_cc_le_read_supported_states,
+ sizeof(struct hci_rp_le_read_supported_states)),
+ HCI_CC(HCI_OP_LE_READ_DEF_DATA_LEN, hci_cc_le_read_def_data_len,
+ sizeof(struct hci_rp_le_read_def_data_len)),
+ HCI_CC_STATUS(HCI_OP_LE_WRITE_DEF_DATA_LEN,
+ hci_cc_le_write_def_data_len),
+ HCI_CC_STATUS(HCI_OP_LE_ADD_TO_RESOLV_LIST,
+ hci_cc_le_add_to_resolv_list),
+ HCI_CC_STATUS(HCI_OP_LE_DEL_FROM_RESOLV_LIST,
+ hci_cc_le_del_from_resolv_list),
+ HCI_CC_STATUS(HCI_OP_LE_CLEAR_RESOLV_LIST,
+ hci_cc_le_clear_resolv_list),
+ HCI_CC(HCI_OP_LE_READ_RESOLV_LIST_SIZE, hci_cc_le_read_resolv_list_size,
+ sizeof(struct hci_rp_le_read_resolv_list_size)),
+ HCI_CC_STATUS(HCI_OP_LE_SET_ADDR_RESOLV_ENABLE,
+ hci_cc_le_set_addr_resolution_enable),
+ HCI_CC(HCI_OP_LE_READ_MAX_DATA_LEN, hci_cc_le_read_max_data_len,
+ sizeof(struct hci_rp_le_read_max_data_len)),
+ HCI_CC_STATUS(HCI_OP_WRITE_LE_HOST_SUPPORTED,
+ hci_cc_write_le_host_supported),
+ HCI_CC_STATUS(HCI_OP_LE_SET_ADV_PARAM, hci_cc_set_adv_param),
+ HCI_CC(HCI_OP_READ_RSSI, hci_cc_read_rssi,
+ sizeof(struct hci_rp_read_rssi)),
+ HCI_CC(HCI_OP_READ_TX_POWER, hci_cc_read_tx_power,
+ sizeof(struct hci_rp_read_tx_power)),
+ HCI_CC_STATUS(HCI_OP_WRITE_SSP_DEBUG_MODE, hci_cc_write_ssp_debug_mode),
+ HCI_CC_STATUS(HCI_OP_LE_SET_EXT_SCAN_PARAMS,
+ hci_cc_le_set_ext_scan_param),
+ HCI_CC_STATUS(HCI_OP_LE_SET_EXT_SCAN_ENABLE,
+ hci_cc_le_set_ext_scan_enable),
+ HCI_CC_STATUS(HCI_OP_LE_SET_DEFAULT_PHY, hci_cc_le_set_default_phy),
+ HCI_CC(HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS,
+ hci_cc_le_read_num_adv_sets,
+ sizeof(struct hci_rp_le_read_num_supported_adv_sets)),
+ HCI_CC_STATUS(HCI_OP_LE_SET_EXT_ADV_ENABLE,
+ hci_cc_le_set_ext_adv_enable),
+ HCI_CC_STATUS(HCI_OP_LE_SET_ADV_SET_RAND_ADDR,
+ hci_cc_le_set_adv_set_random_addr),
+ HCI_CC_STATUS(HCI_OP_LE_REMOVE_ADV_SET, hci_cc_le_remove_adv_set),
+ HCI_CC_STATUS(HCI_OP_LE_CLEAR_ADV_SETS, hci_cc_le_clear_adv_sets),
+ HCI_CC_STATUS(HCI_OP_LE_SET_PER_ADV_PARAMS, hci_cc_set_per_adv_param),
+ HCI_CC_STATUS(HCI_OP_LE_SET_PER_ADV_ENABLE,
+ hci_cc_le_set_per_adv_enable),
+ HCI_CC(HCI_OP_LE_READ_TRANSMIT_POWER, hci_cc_le_read_transmit_power,
+ sizeof(struct hci_rp_le_read_transmit_power)),
+ HCI_CC_STATUS(HCI_OP_LE_SET_PRIVACY_MODE, hci_cc_le_set_privacy_mode),
+ HCI_CC(HCI_OP_LE_READ_BUFFER_SIZE_V2, hci_cc_le_read_buffer_size_v2,
+ sizeof(struct hci_rp_le_read_buffer_size_v2)),
+ HCI_CC_VL(HCI_OP_LE_SET_CIG_PARAMS, hci_cc_le_set_cig_params,
+ sizeof(struct hci_rp_le_set_cig_params), HCI_MAX_EVENT_SIZE),
+ HCI_CC(HCI_OP_LE_SETUP_ISO_PATH, hci_cc_le_setup_iso_path,
+ sizeof(struct hci_rp_le_setup_iso_path)),
+ HCI_CC(HCI_OP_LE_READ_ALL_LOCAL_FEATURES,
+ hci_cc_le_read_all_local_features,
+ sizeof(struct hci_rp_le_read_all_local_features)),
+};
+
+static u8 hci_cc_func(struct hci_dev *hdev, const struct hci_cc *cc,
+ struct sk_buff *skb)
+{
+ void *data;
+
+ if (skb->len < cc->min_len) {
+ bt_dev_err(hdev, "unexpected cc 0x%4.4x length: %u < %u",
+ cc->op, skb->len, cc->min_len);
+ return HCI_ERROR_UNSPECIFIED;
+ }
+
+ /* Only warn if the length exceeds max_len: it may still be possible to
+ * partially parse the cc, so leave it to the callback to decide whether
+ * that is acceptable.
+ */
+ if (skb->len > cc->max_len)
+ bt_dev_warn(hdev, "unexpected cc 0x%4.4x length: %u > %u",
+ cc->op, skb->len, cc->max_len);
- case HCI_OP_LE_SET_EXT_SCAN_PARAMS:
- hci_cc_le_set_ext_scan_param(hdev, skb);
- break;
+ data = hci_cc_skb_pull(hdev, skb, cc->op, cc->min_len);
+ if (!data)
+ return HCI_ERROR_UNSPECIFIED;
- case HCI_OP_LE_SET_EXT_SCAN_ENABLE:
- hci_cc_le_set_ext_scan_enable(hdev, skb);
- break;
+ return cc->func(hdev, data, skb);
+}
- case HCI_OP_LE_SET_DEFAULT_PHY:
- hci_cc_le_set_default_phy(hdev, skb);
- break;
+static void hci_cmd_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb, u16 *opcode, u8 *status,
+ hci_req_complete_t *req_complete,
+ hci_req_complete_skb_t *req_complete_skb)
+{
+ struct hci_ev_cmd_complete *ev = data;
+ int i;
- case HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS:
- hci_cc_le_read_num_adv_sets(hdev, skb);
- break;
+ *opcode = __le16_to_cpu(ev->opcode);
- case HCI_OP_LE_SET_EXT_ADV_PARAMS:
- hci_cc_set_ext_adv_param(hdev, skb);
- break;
+ bt_dev_dbg(hdev, "opcode 0x%4.4x", *opcode);
- case HCI_OP_LE_SET_EXT_ADV_ENABLE:
- hci_cc_le_set_ext_adv_enable(hdev, skb);
- break;
+ for (i = 0; i < ARRAY_SIZE(hci_cc_table); i++) {
+ if (hci_cc_table[i].op == *opcode) {
+ *status = hci_cc_func(hdev, &hci_cc_table[i], skb);
+ break;
+ }
+ }
- case HCI_OP_LE_SET_ADV_SET_RAND_ADDR:
- hci_cc_le_set_adv_set_random_addr(hdev, skb);
- break;
+ if (i == ARRAY_SIZE(hci_cc_table)) {
+ if (!skb->len) {
+ bt_dev_err(hdev, "Unexpected cc 0x%4.4x with no status",
+ *opcode);
+ *status = HCI_ERROR_UNSPECIFIED;
+ return;
+ }
- default:
- BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode);
- break;
+ /* Unknown opcode, assume byte 0 contains the status, so
+ * that e.g. __hci_cmd_sync() properly returns errors
+ * for vendor specific commands sent by HCI drivers.
+ * If a vendor doesn't actually follow this convention we may
+ * need to introduce a vendor CC table in order to properly set
+ * the status.
+ */
+ *status = skb->data[0];
}
- if (*opcode != HCI_OP_NOP)
- cancel_delayed_work(&hdev->cmd_timer);
-
- if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags))
- atomic_set(&hdev->cmd_cnt, 1);
+ handle_cmd_cnt_and_timer(hdev, ev->ncmd);
hci_req_cmd_complete(hdev, *opcode, *status, req_complete,
req_complete_skb);
+ if (hci_dev_test_flag(hdev, HCI_CMD_PENDING)) {
+ bt_dev_err(hdev,
+ "unexpected event for opcode 0x%4.4x", *opcode);
+ return;
+ }
+
if (atomic_read(&hdev->cmd_cnt) && !skb_queue_empty(&hdev->cmd_q))
queue_work(hdev->workqueue, &hdev->cmd_work);
}
-static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb,
- u16 *opcode, u8 *status,
- hci_req_complete_t *req_complete,
- hci_req_complete_skb_t *req_complete_skb)
+static void hci_cs_le_create_cis(struct hci_dev *hdev, u8 status)
{
- struct hci_ev_cmd_status *ev = (void *) skb->data;
-
- skb_pull(skb, sizeof(*ev));
-
- *opcode = __le16_to_cpu(ev->opcode);
- *status = ev->status;
-
- switch (*opcode) {
- case HCI_OP_INQUIRY:
- hci_cs_inquiry(hdev, ev->status);
- break;
-
- case HCI_OP_CREATE_CONN:
- hci_cs_create_conn(hdev, ev->status);
- break;
-
- case HCI_OP_DISCONNECT:
- hci_cs_disconnect(hdev, ev->status);
- break;
-
- case HCI_OP_ADD_SCO:
- hci_cs_add_sco(hdev, ev->status);
- break;
-
- case HCI_OP_AUTH_REQUESTED:
- hci_cs_auth_requested(hdev, ev->status);
- break;
+ struct hci_cp_le_create_cis *cp;
+ bool pending = false;
+ int i;
- case HCI_OP_SET_CONN_ENCRYPT:
- hci_cs_set_conn_encrypt(hdev, ev->status);
- break;
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
- case HCI_OP_REMOTE_NAME_REQ:
- hci_cs_remote_name_req(hdev, ev->status);
- break;
+ if (!status)
+ return;
- case HCI_OP_READ_REMOTE_FEATURES:
- hci_cs_read_remote_features(hdev, ev->status);
- break;
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_CREATE_CIS);
+ if (!cp)
+ return;
- case HCI_OP_READ_REMOTE_EXT_FEATURES:
- hci_cs_read_remote_ext_features(hdev, ev->status);
- break;
+ hci_dev_lock(hdev);
- case HCI_OP_SETUP_SYNC_CONN:
- hci_cs_setup_sync_conn(hdev, ev->status);
- break;
+ /* Remove connection if command failed */
+ for (i = 0; i < cp->num_cis; i++) {
+ struct hci_conn *conn;
+ u16 handle;
- case HCI_OP_SNIFF_MODE:
- hci_cs_sniff_mode(hdev, ev->status);
- break;
+ handle = __le16_to_cpu(cp->cis[i].cis_handle);
- case HCI_OP_EXIT_SNIFF_MODE:
- hci_cs_exit_sniff_mode(hdev, ev->status);
- break;
+ conn = hci_conn_hash_lookup_handle(hdev, handle);
+ if (conn) {
+ if (test_and_clear_bit(HCI_CONN_CREATE_CIS,
+ &conn->flags))
+ pending = true;
+ conn->state = BT_CLOSED;
+ hci_connect_cfm(conn, status);
+ hci_conn_del(conn);
+ }
+ }
+ cp->num_cis = 0;
- case HCI_OP_SWITCH_ROLE:
- hci_cs_switch_role(hdev, ev->status);
- break;
+ if (pending)
+ hci_le_create_cis_pending(hdev);
- case HCI_OP_LE_CREATE_CONN:
- hci_cs_le_create_conn(hdev, ev->status);
- break;
+ hci_dev_unlock(hdev);
+}
- case HCI_OP_LE_READ_REMOTE_FEATURES:
- hci_cs_le_read_remote_features(hdev, ev->status);
- break;
+#define HCI_CS(_op, _func) \
+{ \
+ .op = _op, \
+ .func = _func, \
+}
+
+static const struct hci_cs {
+ u16 op;
+ void (*func)(struct hci_dev *hdev, __u8 status);
+} hci_cs_table[] = {
+ HCI_CS(HCI_OP_INQUIRY, hci_cs_inquiry),
+ HCI_CS(HCI_OP_CREATE_CONN, hci_cs_create_conn),
+ HCI_CS(HCI_OP_DISCONNECT, hci_cs_disconnect),
+ HCI_CS(HCI_OP_ADD_SCO, hci_cs_add_sco),
+ HCI_CS(HCI_OP_AUTH_REQUESTED, hci_cs_auth_requested),
+ HCI_CS(HCI_OP_SET_CONN_ENCRYPT, hci_cs_set_conn_encrypt),
+ HCI_CS(HCI_OP_REMOTE_NAME_REQ, hci_cs_remote_name_req),
+ HCI_CS(HCI_OP_READ_REMOTE_FEATURES, hci_cs_read_remote_features),
+ HCI_CS(HCI_OP_READ_REMOTE_EXT_FEATURES,
+ hci_cs_read_remote_ext_features),
+ HCI_CS(HCI_OP_SETUP_SYNC_CONN, hci_cs_setup_sync_conn),
+ HCI_CS(HCI_OP_ENHANCED_SETUP_SYNC_CONN,
+ hci_cs_enhanced_setup_sync_conn),
+ HCI_CS(HCI_OP_SNIFF_MODE, hci_cs_sniff_mode),
+ HCI_CS(HCI_OP_EXIT_SNIFF_MODE, hci_cs_exit_sniff_mode),
+ HCI_CS(HCI_OP_SWITCH_ROLE, hci_cs_switch_role),
+ HCI_CS(HCI_OP_LE_CREATE_CONN, hci_cs_le_create_conn),
+ HCI_CS(HCI_OP_LE_READ_REMOTE_FEATURES, hci_cs_le_read_remote_features),
+ HCI_CS(HCI_OP_LE_START_ENC, hci_cs_le_start_enc),
+ HCI_CS(HCI_OP_LE_EXT_CREATE_CONN, hci_cs_le_ext_create_conn),
+ HCI_CS(HCI_OP_LE_CREATE_CIS, hci_cs_le_create_cis),
+ HCI_CS(HCI_OP_LE_CREATE_BIG, hci_cs_le_create_big),
+ HCI_CS(HCI_OP_LE_READ_ALL_REMOTE_FEATURES,
+ hci_cs_le_read_all_remote_features),
+};
+
+static void hci_cmd_status_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb, u16 *opcode, u8 *status,
+ hci_req_complete_t *req_complete,
+ hci_req_complete_skb_t *req_complete_skb)
+{
+ struct hci_ev_cmd_status *ev = data;
+ int i;
- case HCI_OP_LE_START_ENC:
- hci_cs_le_start_enc(hdev, ev->status);
- break;
+ *opcode = __le16_to_cpu(ev->opcode);
+ *status = ev->status;
- case HCI_OP_LE_EXT_CREATE_CONN:
- hci_cs_le_ext_create_conn(hdev, ev->status);
- break;
+ bt_dev_dbg(hdev, "opcode 0x%4.4x", *opcode);
- default:
- BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode);
- break;
+ for (i = 0; i < ARRAY_SIZE(hci_cs_table); i++) {
+ if (hci_cs_table[i].op == *opcode) {
+ hci_cs_table[i].func(hdev, ev->status);
+ break;
+ }
}
- if (*opcode != HCI_OP_NOP)
- cancel_delayed_work(&hdev->cmd_timer);
-
- if (ev->ncmd && !test_bit(HCI_RESET, &hdev->flags))
- atomic_set(&hdev->cmd_cnt, 1);
+ handle_cmd_cnt_and_timer(hdev, ev->ncmd);
/* Indicate request completion if the command failed. Also, if
* we're not waiting for a special event and we get a success
@@ -3459,30 +4394,39 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb,
* (since for this kind of commands there will not be a command
* complete event).
*/
- if (ev->status ||
- (hdev->sent_cmd && !bt_cb(hdev->sent_cmd)->hci.req_event))
+ if (ev->status || (hdev->req_skb && !hci_skb_event(hdev->req_skb))) {
hci_req_cmd_complete(hdev, *opcode, ev->status, req_complete,
req_complete_skb);
+ if (hci_dev_test_flag(hdev, HCI_CMD_PENDING)) {
+ bt_dev_err(hdev, "unexpected event for opcode 0x%4.4x",
+ *opcode);
+ return;
+ }
+ }
if (atomic_read(&hdev->cmd_cnt) && !skb_queue_empty(&hdev->cmd_q))
queue_work(hdev->workqueue, &hdev->cmd_work);
}
-static void hci_hardware_error_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_hardware_error_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_hardware_error *ev = (void *) skb->data;
+ struct hci_ev_hardware_error *ev = data;
+
+ bt_dev_dbg(hdev, "code 0x%2.2x", ev->code);
hdev->hw_error_code = ev->code;
queue_work(hdev->req_workqueue, &hdev->error_reset);
}
-static void hci_role_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_role_change_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_role_change *ev = (void *) skb->data;
+ struct hci_ev_role_change *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
@@ -3499,28 +4443,25 @@ static void hci_role_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
-static void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_num_comp_pkts *ev = (void *) skb->data;
+ struct hci_ev_num_comp_pkts *ev = data;
int i;
- if (hdev->flow_ctl_mode != HCI_FLOW_CTL_MODE_PACKET_BASED) {
- bt_dev_err(hdev, "wrong event for mode %d", hdev->flow_ctl_mode);
+ if (!hci_ev_skb_pull(hdev, skb, HCI_EV_NUM_COMP_PKTS,
+ flex_array_size(ev, handles, ev->num)))
return;
- }
- if (skb->len < sizeof(*ev) || skb->len < sizeof(*ev) +
- ev->num_hndl * sizeof(struct hci_comp_pkts_info)) {
- BT_DBG("%s bad parameters", hdev->name);
- return;
- }
+ bt_dev_dbg(hdev, "num %d", ev->num);
- BT_DBG("%s num_hndl %d", hdev->name, ev->num_hndl);
+ hci_dev_lock(hdev);
- for (i = 0; i < ev->num_hndl; i++) {
+ for (i = 0; i < ev->num; i++) {
struct hci_comp_pkts_info *info = &ev->handles[i];
struct hci_conn *conn;
__u16 handle, count;
+ unsigned int i;
handle = __le16_to_cpu(info->handle);
count = __le16_to_cpu(info->count);
@@ -3529,7 +4470,20 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *skb)
if (!conn)
continue;
- conn->sent -= count;
+ /* Check that there are really enough packets outstanding before
+ * attempting to decrease the sent counter, otherwise it could
+ * underflow.
+ */
+ if (conn->sent >= count) {
+ conn->sent -= count;
+ } else {
+ bt_dev_warn(hdev, "hcon %p sent %u < count %u",
+ conn, conn->sent, count);
+ conn->sent = 0;
+ }
+
+ for (i = 0; i < count; ++i)
+ hci_conn_tx_dequeue(conn);
switch (conn->type) {
case ACL_LINK:
@@ -3551,81 +4505,19 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *skb)
break;
case SCO_LINK:
+ case ESCO_LINK:
hdev->sco_cnt += count;
if (hdev->sco_cnt > hdev->sco_pkts)
hdev->sco_cnt = hdev->sco_pkts;
- break;
- default:
- bt_dev_err(hdev, "unknown type %d conn %p",
- conn->type, conn);
break;
- }
- }
-
- queue_work(hdev->workqueue, &hdev->tx_work);
-}
-
-static struct hci_conn *__hci_conn_lookup_handle(struct hci_dev *hdev,
- __u16 handle)
-{
- struct hci_chan *chan;
-
- switch (hdev->dev_type) {
- case HCI_PRIMARY:
- return hci_conn_hash_lookup_handle(hdev, handle);
- case HCI_AMP:
- chan = hci_chan_lookup_handle(hdev, handle);
- if (chan)
- return chan->conn;
- break;
- default:
- bt_dev_err(hdev, "unknown dev_type %d", hdev->dev_type);
- break;
- }
-
- return NULL;
-}
-
-static void hci_num_comp_blocks_evt(struct hci_dev *hdev, struct sk_buff *skb)
-{
- struct hci_ev_num_comp_blocks *ev = (void *) skb->data;
- int i;
- if (hdev->flow_ctl_mode != HCI_FLOW_CTL_MODE_BLOCK_BASED) {
- bt_dev_err(hdev, "wrong event for mode %d", hdev->flow_ctl_mode);
- return;
- }
-
- if (skb->len < sizeof(*ev) || skb->len < sizeof(*ev) +
- ev->num_hndl * sizeof(struct hci_comp_blocks_info)) {
- BT_DBG("%s bad parameters", hdev->name);
- return;
- }
-
- BT_DBG("%s num_blocks %d num_hndl %d", hdev->name, ev->num_blocks,
- ev->num_hndl);
-
- for (i = 0; i < ev->num_hndl; i++) {
- struct hci_comp_blocks_info *info = &ev->handles[i];
- struct hci_conn *conn = NULL;
- __u16 handle, block_count;
-
- handle = __le16_to_cpu(info->handle);
- block_count = __le16_to_cpu(info->blocks);
-
- conn = __hci_conn_lookup_handle(hdev, handle);
- if (!conn)
- continue;
-
- conn->sent -= block_count;
-
- switch (conn->type) {
- case ACL_LINK:
- case AMP_LINK:
- hdev->block_cnt += block_count;
- if (hdev->block_cnt > hdev->num_blocks)
- hdev->block_cnt = hdev->num_blocks;
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ hdev->iso_cnt += count;
+ if (hdev->iso_cnt > hdev->iso_pkts)
+ hdev->iso_cnt = hdev->iso_pkts;
break;
default:
@@ -3636,14 +4528,17 @@ static void hci_num_comp_blocks_evt(struct hci_dev *hdev, struct sk_buff *skb)
}
queue_work(hdev->workqueue, &hdev->tx_work);
+
+ hci_dev_unlock(hdev);
}
-static void hci_mode_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_mode_change_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_mode_change *ev = (void *) skb->data;
+ struct hci_ev_mode_change *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
@@ -3666,12 +4561,13 @@ static void hci_mode_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
-static void hci_pin_code_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_pin_code_request_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_pin_code_req *ev = (void *) skb->data;
+ struct hci_ev_pin_code_req *ev = data;
struct hci_conn *conn;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
hci_dev_lock(hdev);
@@ -3736,14 +4632,15 @@ static void conn_set_key(struct hci_conn *conn, u8 key_type, u8 pin_len)
}
}
-static void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_link_key_request_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_link_key_req *ev = (void *) skb->data;
+ struct hci_ev_link_key_req *ev = data;
struct hci_cp_link_key_reply cp;
struct hci_conn *conn;
struct link_key *key;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
if (!hci_dev_test_flag(hdev, HCI_MGMT))
return;
@@ -3752,13 +4649,11 @@ static void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
key = hci_find_link_key(hdev, &ev->bdaddr);
if (!key) {
- BT_DBG("%s link key not found for %pMR", hdev->name,
- &ev->bdaddr);
+ bt_dev_dbg(hdev, "link key not found for %pMR", &ev->bdaddr);
goto not_found;
}
- BT_DBG("%s found key type %u for %pMR", hdev->name, key->type,
- &ev->bdaddr);
+ bt_dev_dbg(hdev, "found key type %u for %pMR", key->type, &ev->bdaddr);
conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
if (conn) {
@@ -3767,15 +4662,14 @@ static void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
if ((key->type == HCI_LK_UNAUTH_COMBINATION_P192 ||
key->type == HCI_LK_UNAUTH_COMBINATION_P256) &&
conn->auth_type != 0xff && (conn->auth_type & 0x01)) {
- BT_DBG("%s ignoring unauthenticated key", hdev->name);
+ bt_dev_dbg(hdev, "ignoring unauthenticated key");
goto not_found;
}
if (key->type == HCI_LK_COMBINATION && key->pin_len < 16 &&
(conn->pending_sec_level == BT_SECURITY_HIGH ||
conn->pending_sec_level == BT_SECURITY_FIPS)) {
- BT_DBG("%s ignoring key unauthenticated for high security",
- hdev->name);
+ bt_dev_dbg(hdev, "ignoring key unauthenticated for high security");
goto not_found;
}
@@ -3796,15 +4690,16 @@ not_found:
hci_dev_unlock(hdev);
}
-static void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_link_key_notify_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_link_key_notify *ev = (void *) skb->data;
+ struct hci_ev_link_key_notify *ev = data;
struct hci_conn *conn;
struct link_key *key;
bool persistent;
u8 pin_len = 0;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
hci_dev_lock(hdev);
@@ -3812,6 +4707,15 @@ static void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb)
if (!conn)
goto unlock;
+ /* Ignore NULL link key to protect against CVE-2020-26555 */
+ if (!crypto_memneq(ev->link_key, ZERO_KEY, HCI_LINK_KEY_SIZE)) {
+ bt_dev_dbg(hdev, "Ignore NULL link key (ZERO KEY) for %pMR",
+ &ev->bdaddr);
+ hci_disconnect(conn, HCI_ERROR_AUTH_FAILURE);
+ hci_conn_drop(conn);
+ goto unlock;
+ }
+
hci_conn_hold(conn);
conn->disc_timeout = HCI_DISCONN_TIMEOUT;
hci_conn_drop(conn);
@@ -3856,12 +4760,13 @@ unlock:
hci_dev_unlock(hdev);
}
-static void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_clock_offset_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_clock_offset *ev = (void *) skb->data;
+ struct hci_ev_clock_offset *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
@@ -3879,12 +4784,13 @@ static void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
-static void hci_pkt_type_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_pkt_type_change_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_pkt_type_change *ev = (void *) skb->data;
+ struct hci_ev_pkt_type_change *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
@@ -3895,12 +4801,13 @@ static void hci_pkt_type_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
-static void hci_pscan_rep_mode_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_pscan_rep_mode_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_pscan_rep_mode *ev = (void *) skb->data;
+ struct hci_ev_pscan_rep_mode *ev = data;
struct inquiry_entry *ie;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
hci_dev_lock(hdev);
@@ -3913,15 +4820,16 @@ static void hci_pscan_rep_mode_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
-static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev,
+static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, void *edata,
struct sk_buff *skb)
{
+ struct hci_ev_inquiry_result_rssi *ev = edata;
struct inquiry_data data;
- int num_rsp = *((__u8 *) skb->data);
+ int i;
- BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+ bt_dev_dbg(hdev, "num_rsp %d", ev->num);
- if (!num_rsp)
+ if (!ev->num)
return;
if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ))
@@ -3929,13 +4837,22 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev,
hci_dev_lock(hdev);
- if ((skb->len - 1) / num_rsp != sizeof(struct inquiry_info_with_rssi)) {
- struct inquiry_info_with_rssi_and_pscan_mode *info;
- info = (void *) (skb->data + 1);
+ if (skb->len == array_size(ev->num,
+ sizeof(struct inquiry_info_rssi_pscan))) {
+ struct inquiry_info_rssi_pscan *info;
- for (; num_rsp; num_rsp--, info++) {
+ for (i = 0; i < ev->num; i++) {
u32 flags;
+ info = hci_ev_skb_pull(hdev, skb,
+ HCI_EV_INQUIRY_RESULT_WITH_RSSI,
+ sizeof(*info));
+ if (!info) {
+ bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x",
+ HCI_EV_INQUIRY_RESULT_WITH_RSSI);
+ goto unlock;
+ }
+
bacpy(&data.bdaddr, &info->bdaddr);
data.pscan_rep_mode = info->pscan_rep_mode;
data.pscan_period_mode = info->pscan_period_mode;
@@ -3949,14 +4866,24 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev,
mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00,
info->dev_class, info->rssi,
- flags, NULL, 0, NULL, 0);
+ flags, NULL, 0, NULL, 0, 0);
}
- } else {
- struct inquiry_info_with_rssi *info = (void *) (skb->data + 1);
+ } else if (skb->len == array_size(ev->num,
+ sizeof(struct inquiry_info_rssi))) {
+ struct inquiry_info_rssi *info;
- for (; num_rsp; num_rsp--, info++) {
+ for (i = 0; i < ev->num; i++) {
u32 flags;
+ info = hci_ev_skb_pull(hdev, skb,
+ HCI_EV_INQUIRY_RESULT_WITH_RSSI,
+ sizeof(*info));
+ if (!info) {
+ bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x",
+ HCI_EV_INQUIRY_RESULT_WITH_RSSI);
+ goto unlock;
+ }
+
bacpy(&data.bdaddr, &info->bdaddr);
data.pscan_rep_mode = info->pscan_rep_mode;
data.pscan_period_mode = info->pscan_period_mode;
@@ -3970,20 +4897,23 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev,
mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00,
info->dev_class, info->rssi,
- flags, NULL, 0, NULL, 0);
+ flags, NULL, 0, NULL, 0, 0);
}
+ } else {
+ bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x",
+ HCI_EV_INQUIRY_RESULT_WITH_RSSI);
}
-
+unlock:
hci_dev_unlock(hdev);
}
-static void hci_remote_ext_features_evt(struct hci_dev *hdev,
+static void hci_remote_ext_features_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_ev_remote_ext_features *ev = (void *) skb->data;
+ struct hci_ev_remote_ext_features *ev = data;
struct hci_conn *conn;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
@@ -4028,8 +4958,9 @@ static void hci_remote_ext_features_evt(struct hci_dev *hdev,
bacpy(&cp.bdaddr, &conn->dst);
cp.pscan_rep_mode = 0x02;
hci_send_cmd(hdev, HCI_OP_REMOTE_NAME_REQ, sizeof(cp), &cp);
- } else if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags))
- mgmt_device_connected(hdev, conn, 0, NULL, 0);
+ } else {
+ mgmt_device_connected(hdev, conn, NULL, 0);
+ }
if (!hci_outgoing_auth_needed(hdev, conn)) {
conn->state = BT_CONNECTED;
@@ -4041,13 +4972,27 @@ unlock:
hci_dev_unlock(hdev);
}
-static void hci_sync_conn_complete_evt(struct hci_dev *hdev,
+static void hci_sync_conn_complete_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_ev_sync_conn_complete *ev = (void *) skb->data;
+ struct hci_ev_sync_conn_complete *ev = data;
struct hci_conn *conn;
+ u8 status = ev->status;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+ switch (ev->link_type) {
+ case SCO_LINK:
+ case ESCO_LINK:
+ break;
+ default:
+ /* As per Core 5.3 Vol 4 Part E 7.7.35 (p.2219), Link_Type
+ * for HCI_Synchronous_Connection_Complete is limited to
+ * either SCO or eSCO
+ */
+ bt_dev_err(hdev, "Ignoring connect complete event for invalid link type");
+ return;
+ }
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
hci_dev_lock(hdev);
@@ -4070,9 +5015,25 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev,
goto unlock;
}
- switch (ev->status) {
+ /* The HCI_Synchronous_Connection_Complete event is only sent once per connection.
+ * Processing it more than once per connection can corrupt kernel memory.
+ *
+ * As the connection handle is set here for the first time, it indicates
+ * whether the connection is already set up.
+ */
+ if (!HCI_CONN_HANDLE_UNSET(conn->handle)) {
+ bt_dev_err(hdev, "Ignoring HCI_Sync_Conn_Complete event for existing connection");
+ goto unlock;
+ }
+
+ switch (status) {
case 0x00:
- conn->handle = __le16_to_cpu(ev->handle);
+ status = hci_conn_set_handle(conn, __le16_to_cpu(ev->handle));
+ if (status) {
+ conn->state = BT_CLOSED;
+ break;
+ }
+
conn->state = BT_CONNECTED;
conn->type = ev->link_type;
@@ -4085,23 +5046,39 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev,
case 0x11: /* Unsupported Feature or Parameter Value */
case 0x1c: /* SCO interval rejected */
case 0x1a: /* Unsupported Remote Feature */
+ case 0x1e: /* Invalid LMP Parameters */
case 0x1f: /* Unspecified error */
case 0x20: /* Unsupported LMP Parameter value */
if (conn->out) {
conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) |
(hdev->esco_type & EDR_ESCO_MASK);
- if (hci_setup_sync(conn, conn->link->handle))
+ if (hci_setup_sync(conn, conn->parent->handle))
goto unlock;
}
- /* fall through */
+ fallthrough;
default:
conn->state = BT_CLOSED;
break;
}
- hci_connect_cfm(conn, ev->status);
- if (ev->status)
+ bt_dev_dbg(hdev, "SCO connected with air mode: %02x", ev->air_mode);
+ /* Notify only for SCO over the HCI transport data path, i.e. a data
+ * path of zero; a non-zero value means a non-HCI transport data path.
+ */
+ if (conn->codec.data_path == 0 && hdev->notify) {
+ switch (ev->air_mode) {
+ case 0x02:
+ hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_CVSD);
+ break;
+ case 0x03:
+ hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_TRANSP);
+ break;
+ }
+ }
+
+ hci_connect_cfm(conn, status);
+ if (status)
hci_conn_del(conn);
unlock:
@@ -4125,17 +5102,21 @@ static inline size_t eir_get_length(u8 *eir, size_t eir_len)
return eir_len;
}
-static void hci_extended_inquiry_result_evt(struct hci_dev *hdev,
+static void hci_extended_inquiry_result_evt(struct hci_dev *hdev, void *edata,
struct sk_buff *skb)
{
+ struct hci_ev_ext_inquiry_result *ev = edata;
struct inquiry_data data;
- struct extended_inquiry_info *info = (void *) (skb->data + 1);
- int num_rsp = *((__u8 *) skb->data);
size_t eir_len;
+ int i;
- BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+ if (!hci_ev_skb_pull(hdev, skb, HCI_EV_EXTENDED_INQUIRY_RESULT,
+ flex_array_size(ev, info, ev->num)))
+ return;
+
+ bt_dev_dbg(hdev, "num %d", ev->num);
- if (!num_rsp)
+ if (!ev->num)
return;
if (hci_dev_test_flag(hdev, HCI_PERIODIC_INQ))
@@ -4143,7 +5124,8 @@ static void hci_extended_inquiry_result_evt(struct hci_dev *hdev,
hci_dev_lock(hdev);
- for (; num_rsp; num_rsp--, info++) {
+ for (i = 0; i < ev->num; i++) {
+ struct extended_inquiry_info *info = &ev->info[i];
u32 flags;
bool name_known;
@@ -4169,20 +5151,20 @@ static void hci_extended_inquiry_result_evt(struct hci_dev *hdev,
mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00,
info->dev_class, info->rssi,
- flags, info->data, eir_len, NULL, 0);
+ flags, info->data, eir_len, NULL, 0, 0);
}
hci_dev_unlock(hdev);
}
-static void hci_key_refresh_complete_evt(struct hci_dev *hdev,
+static void hci_key_refresh_complete_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_ev_key_refresh_complete *ev = (void *) skb->data;
+ struct hci_ev_key_refresh_complete *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x handle 0x%4.4x", hdev->name, ev->status,
- __le16_to_cpu(ev->handle));
+ bt_dev_dbg(hdev, "status 0x%2.2x handle 0x%4.4x", ev->status,
+ __le16_to_cpu(ev->handle));
hci_dev_lock(hdev);
@@ -4267,8 +5249,8 @@ static u8 bredr_oob_data_present(struct hci_conn *conn)
* available, then do not declare that OOB data is
* present.
*/
- if (!memcmp(data->rand256, ZERO_KEY, 16) ||
- !memcmp(data->hash256, ZERO_KEY, 16))
+ if (!crypto_memneq(data->rand256, ZERO_KEY, 16) ||
+ !crypto_memneq(data->hash256, ZERO_KEY, 16))
return 0x00;
return 0x02;
@@ -4278,26 +5260,30 @@ static u8 bredr_oob_data_present(struct hci_conn *conn)
* not supported by the hardware, then check that if
* P-192 data values are present.
*/
- if (!memcmp(data->rand192, ZERO_KEY, 16) ||
- !memcmp(data->hash192, ZERO_KEY, 16))
+ if (!crypto_memneq(data->rand192, ZERO_KEY, 16) ||
+ !crypto_memneq(data->hash192, ZERO_KEY, 16))
return 0x00;
return 0x01;
}
-static void hci_io_capa_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_io_capa_request_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_io_capa_request *ev = (void *) skb->data;
+ struct hci_ev_io_capa_request *ev = data;
struct hci_conn *conn;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
hci_dev_lock(hdev);
conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
- if (!conn)
+ if (!conn || !hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
goto unlock;
+ /* Assume remote supports SSP since it has triggered this event */
+ set_bit(HCI_CONN_SSP_ENABLED, &conn->flags);
+
hci_conn_hold(conn);
if (!hci_dev_test_flag(hdev, HCI_MGMT))
@@ -4354,12 +5340,13 @@ unlock:
hci_dev_unlock(hdev);
}
-static void hci_io_capa_reply_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_io_capa_reply_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_io_capa_reply *ev = (void *) skb->data;
+ struct hci_ev_io_capa_reply *ev = data;
struct hci_conn *conn;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
hci_dev_lock(hdev);
@@ -4374,14 +5361,14 @@ unlock:
hci_dev_unlock(hdev);
}
-static void hci_user_confirm_request_evt(struct hci_dev *hdev,
+static void hci_user_confirm_request_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_ev_user_confirm_req *ev = (void *) skb->data;
+ struct hci_ev_user_confirm_req *ev = data;
int loc_mitm, rem_mitm, confirm_hint = 0;
struct hci_conn *conn;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
hci_dev_lock(hdev);
@@ -4402,26 +5389,33 @@ static void hci_user_confirm_request_evt(struct hci_dev *hdev,
*/
if (conn->pending_sec_level > BT_SECURITY_MEDIUM &&
conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) {
- BT_DBG("Rejecting request: remote device can't provide MITM");
+ bt_dev_dbg(hdev, "Rejecting request: remote device can't provide MITM");
hci_send_cmd(hdev, HCI_OP_USER_CONFIRM_NEG_REPLY,
sizeof(ev->bdaddr), &ev->bdaddr);
goto unlock;
}
- /* If no side requires MITM protection; auto-accept */
+ /* If no side requires MITM protection; use JUST_CFM method */
if ((!loc_mitm || conn->remote_cap == HCI_IO_NO_INPUT_OUTPUT) &&
(!rem_mitm || conn->io_capability == HCI_IO_NO_INPUT_OUTPUT)) {
- /* If we're not the initiators request authorization to
- * proceed from user space (mgmt_user_confirm with
- * confirm_hint set to 1). The exception is if neither
- * side had MITM or if the local IO capability is
- * NoInputNoOutput, in which case we do auto-accept
+ /* If we're not the initiator of request authorization and the
+ * local IO capability is not NoInputNoOutput, use JUST_WORKS
+ * method (mgmt_user_confirm with confirm_hint set to 1).
*/
if (!test_bit(HCI_CONN_AUTH_PEND, &conn->flags) &&
- conn->io_capability != HCI_IO_NO_INPUT_OUTPUT &&
- (loc_mitm || rem_mitm)) {
- BT_DBG("Confirming auto-accept as acceptor");
+ conn->io_capability != HCI_IO_NO_INPUT_OUTPUT) {
+ bt_dev_dbg(hdev, "Confirming auto-accept as acceptor");
+ confirm_hint = 1;
+ goto confirm;
+ }
+
+ /* If a link key already exists in the local host, leave the
+ * decision to user space since the remote device could be
+ * legitimate or malicious.
+ */
+ if (hci_find_link_key(hdev, &ev->bdaddr)) {
+ bt_dev_dbg(hdev, "Local host already has link key");
confirm_hint = 1;
goto confirm;
}
@@ -4449,24 +5443,24 @@ unlock:
hci_dev_unlock(hdev);
}
-static void hci_user_passkey_request_evt(struct hci_dev *hdev,
+static void hci_user_passkey_request_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_ev_user_passkey_req *ev = (void *) skb->data;
+ struct hci_ev_user_passkey_req *ev = data;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
if (hci_dev_test_flag(hdev, HCI_MGMT))
mgmt_user_passkey_request(hdev, &ev->bdaddr, ACL_LINK, 0);
}
-static void hci_user_passkey_notify_evt(struct hci_dev *hdev,
+static void hci_user_passkey_notify_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_ev_user_passkey_notify *ev = (void *) skb->data;
+ struct hci_ev_user_passkey_notify *ev = data;
struct hci_conn *conn;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
if (!conn)
@@ -4481,12 +5475,13 @@ static void hci_user_passkey_notify_evt(struct hci_dev *hdev,
conn->passkey_entered);
}
-static void hci_keypress_notify_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_keypress_notify_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_keypress_notify *ev = (void *) skb->data;
+ struct hci_ev_keypress_notify *ev = data;
struct hci_conn *conn;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
if (!conn)
@@ -4519,18 +5514,18 @@ static void hci_keypress_notify_evt(struct hci_dev *hdev, struct sk_buff *skb)
conn->passkey_entered);
}
-static void hci_simple_pair_complete_evt(struct hci_dev *hdev,
+static void hci_simple_pair_complete_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_ev_simple_pair_complete *ev = (void *) skb->data;
+ struct hci_ev_simple_pair_complete *ev = data;
struct hci_conn *conn;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
hci_dev_lock(hdev);
conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
- if (!conn)
+ if (!conn || !hci_conn_ssp_enabled(conn))
goto unlock;
/* Reset the authentication requirement to unknown */
@@ -4550,14 +5545,14 @@ unlock:
hci_dev_unlock(hdev);
}
-static void hci_remote_host_features_evt(struct hci_dev *hdev,
+static void hci_remote_host_features_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_ev_remote_host_features *ev = (void *) skb->data;
+ struct hci_ev_remote_host_features *ev = data;
struct inquiry_entry *ie;
struct hci_conn *conn;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
hci_dev_lock(hdev);
@@ -4572,13 +5567,13 @@ static void hci_remote_host_features_evt(struct hci_dev *hdev,
hci_dev_unlock(hdev);
}
-static void hci_remote_oob_data_request_evt(struct hci_dev *hdev,
+static void hci_remote_oob_data_request_evt(struct hci_dev *hdev, void *edata,
struct sk_buff *skb)
{
- struct hci_ev_remote_oob_data_request *ev = (void *) skb->data;
+ struct hci_ev_remote_oob_data_request *ev = edata;
struct oob_data *data;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
hci_dev_lock(hdev);
@@ -4626,156 +5621,70 @@ unlock:
hci_dev_unlock(hdev);
}
-#if IS_ENABLED(CONFIG_BT_HS)
-static void hci_chan_selected_evt(struct hci_dev *hdev, struct sk_buff *skb)
-{
- struct hci_ev_channel_selected *ev = (void *)skb->data;
- struct hci_conn *hcon;
-
- BT_DBG("%s handle 0x%2.2x", hdev->name, ev->phy_handle);
-
- skb_pull(skb, sizeof(*ev));
-
- hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle);
- if (!hcon)
- return;
-
- amp_read_loc_assoc_final_data(hdev, hcon);
-}
-
-static void hci_phy_link_complete_evt(struct hci_dev *hdev,
- struct sk_buff *skb)
-{
- struct hci_ev_phy_link_complete *ev = (void *) skb->data;
- struct hci_conn *hcon, *bredr_hcon;
-
- BT_DBG("%s handle 0x%2.2x status 0x%2.2x", hdev->name, ev->phy_handle,
- ev->status);
-
- hci_dev_lock(hdev);
-
- hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle);
- if (!hcon) {
- hci_dev_unlock(hdev);
- return;
- }
-
- if (ev->status) {
- hci_conn_del(hcon);
- hci_dev_unlock(hdev);
- return;
- }
-
- bredr_hcon = hcon->amp_mgr->l2cap_conn->hcon;
-
- hcon->state = BT_CONNECTED;
- bacpy(&hcon->dst, &bredr_hcon->dst);
-
- hci_conn_hold(hcon);
- hcon->disc_timeout = HCI_DISCONN_TIMEOUT;
- hci_conn_drop(hcon);
-
- hci_debugfs_create_conn(hcon);
- hci_conn_add_sysfs(hcon);
-
- amp_physical_cfm(bredr_hcon, hcon);
-
- hci_dev_unlock(hdev);
-}
-
-static void hci_loglink_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
-{
- struct hci_ev_logical_link_complete *ev = (void *) skb->data;
- struct hci_conn *hcon;
- struct hci_chan *hchan;
- struct amp_mgr *mgr;
-
- BT_DBG("%s log_handle 0x%4.4x phy_handle 0x%2.2x status 0x%2.2x",
- hdev->name, le16_to_cpu(ev->handle), ev->phy_handle,
- ev->status);
-
- hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle);
- if (!hcon)
- return;
-
- /* Create AMP hchan */
- hchan = hci_chan_create(hcon);
- if (!hchan)
- return;
-
- hchan->handle = le16_to_cpu(ev->handle);
-
- BT_DBG("hcon %p mgr %p hchan %p", hcon, hcon->amp_mgr, hchan);
-
- mgr = hcon->amp_mgr;
- if (mgr && mgr->bredr_chan) {
- struct l2cap_chan *bredr_chan = mgr->bredr_chan;
-
- l2cap_chan_lock(bredr_chan);
-
- bredr_chan->conn->mtu = hdev->block_mtu;
- l2cap_logical_cfm(bredr_chan, hchan, 0);
- hci_conn_hold(hcon);
-
- l2cap_chan_unlock(bredr_chan);
- }
-}
-
-static void hci_disconn_loglink_complete_evt(struct hci_dev *hdev,
- struct sk_buff *skb)
-{
- struct hci_ev_disconn_logical_link_complete *ev = (void *) skb->data;
- struct hci_chan *hchan;
-
- BT_DBG("%s log handle 0x%4.4x status 0x%2.2x", hdev->name,
- le16_to_cpu(ev->handle), ev->status);
-
- if (ev->status)
- return;
-
- hci_dev_lock(hdev);
-
- hchan = hci_chan_lookup_handle(hdev, le16_to_cpu(ev->handle));
- if (!hchan)
- goto unlock;
-
- amp_destroy_logical_link(hchan, ev->reason);
-
-unlock:
- hci_dev_unlock(hdev);
-}
-
-static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev,
- struct sk_buff *skb)
+static void le_conn_update_addr(struct hci_conn *conn, bdaddr_t *bdaddr,
+ u8 bdaddr_type, bdaddr_t *local_rpa)
{
- struct hci_ev_disconn_phy_link_complete *ev = (void *) skb->data;
- struct hci_conn *hcon;
-
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+ if (conn->out) {
+ conn->dst_type = bdaddr_type;
+ conn->resp_addr_type = bdaddr_type;
+ bacpy(&conn->resp_addr, bdaddr);
- if (ev->status)
- return;
+ /* If the controller has set a Local RPA then it must be
+ * used instead of hdev->rpa.
+ */
+ if (local_rpa && bacmp(local_rpa, BDADDR_ANY)) {
+ conn->init_addr_type = ADDR_LE_DEV_RANDOM;
+ bacpy(&conn->init_addr, local_rpa);
+ } else if (hci_dev_test_flag(conn->hdev, HCI_PRIVACY)) {
+ conn->init_addr_type = ADDR_LE_DEV_RANDOM;
+ bacpy(&conn->init_addr, &conn->hdev->rpa);
+ } else {
+ hci_copy_identity_address(conn->hdev, &conn->init_addr,
+ &conn->init_addr_type);
+ }
+ } else {
+ conn->resp_addr_type = conn->hdev->adv_addr_type;
+ /* If the controller has set a Local RPA then it must be
+ * used instead of hdev->rpa.
+ */
+ if (local_rpa && bacmp(local_rpa, BDADDR_ANY)) {
+ conn->resp_addr_type = ADDR_LE_DEV_RANDOM;
+ bacpy(&conn->resp_addr, local_rpa);
+ } else if (conn->hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) {
+ /* In case of ext adv, resp_addr will be updated in
+ * Adv Terminated event.
+ */
+ if (!ext_adv_capable(conn->hdev))
+ bacpy(&conn->resp_addr,
+ &conn->hdev->random_addr);
+ } else {
+ bacpy(&conn->resp_addr, &conn->hdev->bdaddr);
+ }
- hci_dev_lock(hdev);
+ conn->init_addr_type = bdaddr_type;
+ bacpy(&conn->init_addr, bdaddr);
- hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle);
- if (hcon) {
- hcon->state = BT_CLOSED;
- hci_conn_del(hcon);
+ /* For incoming connections, set the default minimum
+ * and maximum connection interval. They will be used
+ * to check if the parameters are in range and if not
+ * trigger the connection update procedure.
+ */
+ conn->le_conn_min_interval = conn->hdev->le_conn_min_interval;
+ conn->le_conn_max_interval = conn->hdev->le_conn_max_interval;
}
-
- hci_dev_unlock(hdev);
}
-#endif
static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
- bdaddr_t *bdaddr, u8 bdaddr_type, u8 role, u16 handle,
- u16 interval, u16 latency, u16 supervision_timeout)
+ bdaddr_t *bdaddr, u8 bdaddr_type,
+ bdaddr_t *local_rpa, u8 role, u16 handle,
+ u16 interval, u16 latency,
+ u16 supervision_timeout)
{
struct hci_conn_params *params;
struct hci_conn *conn;
struct smp_irk *irk;
u8 addr_type;
+ int err;
hci_dev_lock(hdev);
@@ -4784,19 +5693,34 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
*/
hci_dev_clear_flag(hdev, HCI_LE_ADV);
- conn = hci_lookup_le_connect(hdev);
- if (!conn) {
- conn = hci_conn_add(hdev, LE_LINK, bdaddr, role);
- if (!conn) {
- bt_dev_err(hdev, "no memory for new connection");
+ /* Check for existing connection:
+ *
+ * 1. If it doesn't exist then use the role to create a new object.
+ * 2. If it does exist confirm that it is connecting/BT_CONNECT in case
+ * of initiator/master role since there could be a collision where
+ * either side is attempting to connect, or something like fuzz
+ * testing is trying to play tricks to destroy the hcon object before
+ * it even attempts to connect (e.g. hcon->state == BT_OPEN).
+ */
+ conn = hci_conn_hash_lookup_role(hdev, LE_LINK, role, bdaddr);
+ if (!conn ||
+ (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) {
+ /* In case of error status and there is no connection pending
+ * just unlock as there is nothing to cleanup.
+ */
+ if (status)
goto unlock;
- }
- conn->dst_type = bdaddr_type;
+ conn = hci_conn_add_unset(hdev, LE_LINK, bdaddr, bdaddr_type,
+ role);
+ if (IS_ERR(conn)) {
+ bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn));
+ goto unlock;
+ }
/* If we didn't have a hci_conn object previously
- * but we're in master role this must be something
- * initiated using a white list. Since white list based
+ * but we're in central role this must be something
+ * initiated using an accept list. Since accept list based
* connections are not "first class citizens" we don't
* have full tracking of them. Therefore, we go ahead
* with a "best effort" approach of determining the
@@ -4818,33 +5742,19 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
cancel_delayed_work(&conn->le_conn_timeout);
}
- if (!conn->out) {
- /* Set the responder (our side) address type based on
- * the advertising address type.
- */
- conn->resp_addr_type = hdev->adv_addr_type;
- if (hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) {
- /* In case of ext adv, resp_addr will be updated in
- * Adv Terminated event.
- */
- if (!ext_adv_capable(hdev))
- bacpy(&conn->resp_addr, &hdev->random_addr);
- } else {
- bacpy(&conn->resp_addr, &hdev->bdaddr);
- }
-
- conn->init_addr_type = bdaddr_type;
- bacpy(&conn->init_addr, bdaddr);
-
- /* For incoming connections, set the default minimum
- * and maximum connection interval. They will be used
- * to check if the parameters are in range and if not
- * trigger the connection update procedure.
- */
- conn->le_conn_min_interval = hdev->le_conn_min_interval;
- conn->le_conn_max_interval = hdev->le_conn_max_interval;
+ /* The HCI_LE_Connection_Complete event is only sent once per connection.
+ * Processing it more than once per connection can corrupt kernel memory.
+ *
+ * As the connection handle is set here for the first time, it indicates
+ * whether the connection is already set up.
+ */
+ if (!HCI_CONN_HANDLE_UNSET(conn->handle)) {
+ bt_dev_err(hdev, "Ignoring HCI_Connection_Complete for existing connection");
+ goto unlock;
}
+ le_conn_update_addr(conn, bdaddr, bdaddr_type, local_rpa);
+
/* Lookup the identity address from the stored connection
* address and address type.
*
@@ -4860,8 +5770,18 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
conn->dst_type = irk->addr_type;
}
- if (status) {
- hci_le_conn_failed(conn, status);
+ conn->dst_type = ev_bdaddr_type(hdev, conn->dst_type, NULL);
+
+ /* All connection failure handling is taken care of by the
+ * hci_conn_failed function which is triggered by the HCI
+ * request completion callbacks used for connecting.
+ */
+ if (status || hci_conn_set_handle(conn, handle))
+ goto unlock;
+
+ /* Drop the connection if it has been aborted */
+ if (test_bit(HCI_CONN_CANCEL, &conn->flags)) {
+ hci_conn_drop(conn);
goto unlock;
}
@@ -4871,18 +5791,23 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
addr_type = BDADDR_LE_RANDOM;
/* Drop the connection if the device is blocked */
- if (hci_bdaddr_list_lookup(&hdev->blacklist, &conn->dst, addr_type)) {
+ if (hci_bdaddr_list_lookup(&hdev->reject_list, &conn->dst, addr_type)) {
hci_conn_drop(conn);
goto unlock;
}
- if (!test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags))
- mgmt_device_connected(hdev, conn, 0, NULL, 0);
+ mgmt_device_connected(hdev, conn, NULL, 0);
conn->sec_level = BT_SECURITY_LOW;
- conn->handle = handle;
conn->state = BT_CONFIG;
+ /* Store current advertising instance as connection advertising instance
+ * when software rotation is in use so it can be re-enabled when
+ * disconnected.
+ */
+ if (!ext_adv_capable(hdev))
+ conn->adv_instance = hdev->cur_adv_instance;
+
conn->le_conn_interval = interval;
conn->le_conn_latency = latency;
conn->le_supv_timeout = supervision_timeout;
@@ -4890,38 +5815,16 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
hci_debugfs_create_conn(conn);
hci_conn_add_sysfs(conn);
- if (!status) {
- /* The remote features procedure is defined for master
- * role only. So only in case of an initiated connection
- * request the remote features.
- *
- * If the local controller supports slave-initiated features
- * exchange, then requesting the remote features in slave
- * role is possible. Otherwise just transition into the
- * connected state without requesting the remote features.
- */
- if (conn->out ||
- (hdev->le_features[0] & HCI_LE_SLAVE_FEATURES)) {
- struct hci_cp_le_read_remote_features cp;
-
- cp.handle = __cpu_to_le16(conn->handle);
-
- hci_send_cmd(hdev, HCI_OP_LE_READ_REMOTE_FEATURES,
- sizeof(cp), &cp);
-
- hci_conn_hold(conn);
- } else {
- conn->state = BT_CONNECTED;
- hci_connect_cfm(conn, status);
- }
- } else {
+ err = hci_le_read_remote_features(conn);
+ if (err) {
+ conn->state = BT_CONNECTED;
hci_connect_cfm(conn, status);
}
params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst,
conn->dst_type);
if (params) {
- list_del_init(&params->action);
+ hci_pend_le_list_del_init(params);
if (params->conn) {
hci_conn_drop(params->conn);
hci_conn_put(params->conn);
@@ -4930,72 +5833,203 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
}
unlock:
- hci_update_background_scan(hdev);
+ hci_update_passive_scan(hdev);
hci_dev_unlock(hdev);
}
-static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_le_conn_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_le_conn_complete *ev = (void *) skb->data;
+ struct hci_ev_le_conn_complete *ev = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type,
- ev->role, le16_to_cpu(ev->handle),
+ NULL, ev->role, le16_to_cpu(ev->handle),
le16_to_cpu(ev->interval),
le16_to_cpu(ev->latency),
le16_to_cpu(ev->supervision_timeout));
}
-static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev,
+static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_ev_le_enh_conn_complete *ev = (void *) skb->data;
+ struct hci_ev_le_enh_conn_complete *ev = data;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type,
- ev->role, le16_to_cpu(ev->handle),
+ &ev->local_rpa, ev->role, le16_to_cpu(ev->handle),
le16_to_cpu(ev->interval),
le16_to_cpu(ev->latency),
le16_to_cpu(ev->supervision_timeout));
}
-static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_le_pa_sync_lost_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_evt_le_ext_adv_set_term *ev = (void *) skb->data;
+ struct hci_ev_le_pa_sync_lost *ev = data;
+ u16 handle = le16_to_cpu(ev->handle);
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "sync handle 0x%4.4x", handle);
- if (ev->status)
+ hci_dev_lock(hdev);
+
+ /* Delete the pa sync connection */
+ conn = hci_conn_hash_lookup_pa_sync_handle(hdev, handle);
+ if (conn) {
+ clear_bit(HCI_CONN_BIG_SYNC, &conn->flags);
+ clear_bit(HCI_CONN_PA_SYNC, &conn->flags);
+ hci_disconn_cfm(conn, HCI_ERROR_REMOTE_USER_TERM);
+ hci_conn_del(conn);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_evt_le_ext_adv_set_term *ev = data;
+ struct hci_conn *conn;
+ struct adv_info *adv, *n;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ /* The Bluetooth Core 5.3 specification clearly states that this event
+ * shall not be sent when the Host disables the advertising set. So in
+ * case of HCI_ERROR_CANCELLED_BY_HOST, just ignore the event.
+ *
+ * When the Host disables an advertising set, all cleanup is done via
+ * its command callback and not needed to be duplicated here.
+ */
+ if (ev->status == HCI_ERROR_CANCELLED_BY_HOST) {
+ bt_dev_warn_ratelimited(hdev, "Unexpected advertising set terminated event");
return;
+ }
+
+ hci_dev_lock(hdev);
+
+ adv = hci_find_adv_instance(hdev, ev->handle);
+
+ if (ev->status) {
+ if (!adv)
+ goto unlock;
+
+ /* Remove advertising as it has been terminated */
+ hci_remove_adv_instance(hdev, ev->handle);
+ mgmt_advertising_removed(NULL, hdev, ev->handle);
+
+ list_for_each_entry_safe(adv, n, &hdev->adv_instances, list) {
+ if (adv->enabled)
+ goto unlock;
+ }
+
+ /* We are no longer advertising, clear HCI_LE_ADV */
+ hci_dev_clear_flag(hdev, HCI_LE_ADV);
+ goto unlock;
+ }
+
+ if (adv)
+ adv->enabled = false;
conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->conn_handle));
if (conn) {
- struct adv_info *adv_instance;
+ /* Store handle in the connection so the correct advertising
+ * instance can be re-enabled when disconnected.
+ */
+ conn->adv_instance = ev->handle;
- if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM)
- return;
+ if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM ||
+ bacmp(&conn->resp_addr, BDADDR_ANY))
+ goto unlock;
- if (!hdev->cur_adv_instance) {
+ if (!ev->handle) {
bacpy(&conn->resp_addr, &hdev->random_addr);
- return;
+ goto unlock;
}
- adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance);
- if (adv_instance)
- bacpy(&conn->resp_addr, &adv_instance->random_addr);
+ if (adv)
+ bacpy(&conn->resp_addr, &adv->random_addr);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int hci_le_pa_term_sync(struct hci_dev *hdev, __le16 handle)
+{
+ struct hci_cp_le_pa_term_sync cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = handle;
+
+ return hci_send_cmd(hdev, HCI_OP_LE_PA_TERM_SYNC, sizeof(cp), &cp);
+}
+
+static void hci_le_past_received_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_past_received *ev = data;
+ int mask = hdev->link_mode;
+ __u8 flags = 0;
+ struct hci_conn *pa_sync, *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ hci_dev_lock(hdev);
+
+ hci_dev_clear_flag(hdev, HCI_PA_SYNC);
+
+ conn = hci_conn_hash_lookup_create_pa_sync(hdev);
+ if (!conn) {
+ bt_dev_err(hdev,
+ "Unable to find connection for dst %pMR sid 0x%2.2x",
+ &ev->bdaddr, ev->sid);
+ goto unlock;
+ }
+
+ conn->sync_handle = le16_to_cpu(ev->sync_handle);
+ conn->sid = HCI_SID_INVALID;
+
+ mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, PA_LINK,
+ &flags);
+ if (!(mask & HCI_LM_ACCEPT)) {
+ hci_le_pa_term_sync(hdev, ev->sync_handle);
+ goto unlock;
+ }
+
+ if (!(flags & HCI_PROTO_DEFER))
+ goto unlock;
+
+ /* Add connection to indicate PA sync event */
+ pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, 0,
+ HCI_ROLE_SLAVE);
+
+ if (IS_ERR(pa_sync))
+ goto unlock;
+
+ pa_sync->sync_handle = le16_to_cpu(ev->sync_handle);
+
+ if (ev->status) {
+ set_bit(HCI_CONN_PA_SYNC_FAILED, &pa_sync->flags);
+
+ /* Notify iso layer */
+ hci_connect_cfm(pa_sync, ev->status);
}
+
+unlock:
+ hci_dev_unlock(hdev);
}
-static void hci_le_conn_update_complete_evt(struct hci_dev *hdev,
+static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_ev_le_conn_update_complete *ev = (void *) skb->data;
+ struct hci_ev_le_conn_update_complete *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
if (ev->status)
return;
@@ -5015,8 +6049,8 @@ static void hci_le_conn_update_complete_evt(struct hci_dev *hdev,
/* This function requires the caller holds hdev->lock */
static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev,
bdaddr_t *addr,
- u8 addr_type, u8 adv_type,
- bdaddr_t *direct_rpa)
+ u8 addr_type, bool addr_resolved,
+ u8 adv_type, u8 phy, u8 sec_phy)
{
struct hci_conn *conn;
struct hci_conn_params *params;
@@ -5025,14 +6059,17 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev,
if (adv_type != LE_ADV_IND && adv_type != LE_ADV_DIRECT_IND)
return NULL;
- /* Ignore if the device is blocked */
- if (hci_bdaddr_list_lookup(&hdev->blacklist, addr, addr_type))
+ /* Ignore if the device is blocked or hdev is suspended */
+ if (hci_bdaddr_list_lookup(&hdev->reject_list, addr, addr_type) ||
+ hdev->suspended)
return NULL;
/* Most controller will fail if we try to create new connections
- * while we have an existing one in slave role.
+ * while we have an existing one in peripheral role.
*/
- if (hdev->conn_hash.le_num_slave > 0)
+ if (hdev->conn_hash.le_num_peripheral > 0 &&
+ (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_LE_STATES) ||
+ !(hdev->le_states[3] & 0x10)))
return NULL;
/* If we're not connectable only connect devices that we have in
@@ -5048,7 +6085,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev,
case HCI_AUTO_CONN_DIRECT:
/* Only devices advertising with ADV_DIRECT_IND are
* triggering a connection attempt. This is allowing
- * incoming connections from slave devices.
+ * incoming connections from peripheral devices.
*/
if (adv_type != LE_ADV_DIRECT_IND)
return NULL;
@@ -5056,8 +6093,8 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev,
case HCI_AUTO_CONN_ALWAYS:
/* Devices advertising with ADV_IND or ADV_DIRECT_IND
* are triggering a connection attempt. This means
- * that incoming connectioms from slave device are
- * accepted and also outgoing connections to slave
+ * that incoming connections from peripheral device are
+ * accepted and also outgoing connections to peripheral
* devices are established when found.
*/
break;
@@ -5066,9 +6103,9 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev,
}
}
- conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW,
- HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER,
- direct_rpa);
+ conn = hci_connect_le(hdev, addr, addr_type, addr_resolved,
+ BT_SECURITY_LOW, hdev->def_le_autoconnect_timeout,
+ HCI_ROLE_MASTER, phy, sec_phy);
if (!IS_ERR(conn)) {
/* If HCI_AUTO_CONN_EXPLICIT is set, conn is already owned
* by higher layer that tried to connect, if no then
@@ -5103,14 +6140,16 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev,
static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
u8 bdaddr_type, bdaddr_t *direct_addr,
- u8 direct_addr_type, s8 rssi, u8 *data, u8 len)
+ u8 direct_addr_type, u8 phy, u8 sec_phy, s8 rssi,
+ u8 *data, u8 len, bool ext_adv, bool ctl_time,
+ u64 instant)
{
struct discovery_state *d = &hdev->discovery;
struct smp_irk *irk;
struct hci_conn *conn;
- bool match;
+ bool match, bdaddr_resolved;
u32 flags;
- u8 *ptr, real_len;
+ u8 *ptr;
switch (type) {
case LE_ADV_IND:
@@ -5125,6 +6164,12 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
return;
}
+ if (len > max_adv_len(hdev)) {
+ bt_dev_err_ratelimited(hdev,
+ "adv larger than maximum supported");
+ return;
+ }
+
/* Find the end of the data in case the report contains padded zero
* bytes at the end causing an invalid length value.
*
@@ -5136,32 +6181,35 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
break;
}
- real_len = ptr - data;
-
- /* Adjust for actual length */
- if (len != real_len) {
- bt_dev_err_ratelimited(hdev, "advertising data len corrected");
- len = real_len;
- }
+ /* Adjust for actual length. This handles the case when remote
+ * device is advertising with incorrect data length.
+ */
+ len = ptr - data;
/* If the direct address is present, then this report is from
* a LE Direct Advertising Report event. In that case it is
* important to see if the address is matching the local
* controller address.
+ *
+ * If local privacy is not enable the controller shall not be
+ * generating such event since according to its documentation it is only
+ * valid for filter_policy 0x02 and 0x03, but the fact that it did
+ * generate LE Direct Advertising Report means it is probably broken and
+ * won't generate any other event which can potentially break
+ * auto-connect logic so in case local privacy is not enable this
+ * ignores the direct_addr so it works as a regular report.
*/
- if (direct_addr) {
+ if (!hci_dev_test_flag(hdev, HCI_MESH) && direct_addr &&
+ hci_dev_test_flag(hdev, HCI_PRIVACY)) {
+ direct_addr_type = ev_bdaddr_type(hdev, direct_addr_type,
+ &bdaddr_resolved);
+
/* Only resolvable random addresses are valid for these
* kind of reports and others can be ignored.
*/
if (!hci_bdaddr_is_rpa(direct_addr, direct_addr_type))
return;
- /* If the controller is not using resolvable random
- * addresses, then this report can be ignored.
- */
- if (!hci_dev_test_flag(hdev, HCI_PRIVACY))
- return;
-
/* If the local IRK of the controller does not match
* with the resolvable random address provided, then
* this report can be ignored.
@@ -5177,14 +6225,17 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
bdaddr_type = irk->addr_type;
}
+ bdaddr_type = ev_bdaddr_type(hdev, bdaddr_type, &bdaddr_resolved);
+
/* Check if we have been requested to connect to this device.
*
* direct_addr is set only for directed advertising reports (it is NULL
* for advertising reports) and is already verified to be RPA above.
*/
- conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, type,
- direct_addr);
- if (conn && type == LE_ADV_IND) {
+ conn = check_pending_le_conn(hdev, bdaddr, bdaddr_type, bdaddr_resolved,
+ type, phy, sec_phy);
+ if (!ext_adv && conn && type == LE_ADV_IND &&
+ len <= max_adv_len(hdev)) {
/* Store report for later inclusion by
* mgmt_device_connected
*/
@@ -5192,47 +6243,48 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
conn->le_adv_data_len = len;
}
+ if (type == LE_ADV_NONCONN_IND || type == LE_ADV_SCAN_IND)
+ flags = MGMT_DEV_FOUND_NOT_CONNECTABLE;
+ else
+ flags = 0;
+
+ /* All scan results should be sent up for Mesh systems */
+ if (hci_dev_test_flag(hdev, HCI_MESH)) {
+ mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL,
+ rssi, flags, data, len, NULL, 0, instant);
+ return;
+ }
+
/* Passive scanning shouldn't trigger any device found events,
* except for devices marked as CONN_REPORT for which we do send
- * device found events.
+ * device found events, or advertisement monitoring requested.
*/
if (hdev->le_scan_type == LE_SCAN_PASSIVE) {
if (type == LE_ADV_DIRECT_IND)
return;
if (!hci_pend_le_action_lookup(&hdev->pend_le_reports,
- bdaddr, bdaddr_type))
+ bdaddr, bdaddr_type) &&
+ idr_is_empty(&hdev->adv_monitors_idr))
return;
- if (type == LE_ADV_NONCONN_IND || type == LE_ADV_SCAN_IND)
- flags = MGMT_DEV_FOUND_NOT_CONNECTABLE;
- else
- flags = 0;
mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL,
- rssi, flags, data, len, NULL, 0);
+ rssi, flags, data, len, NULL, 0, 0);
return;
}
- /* When receiving non-connectable or scannable undirected
- * advertising reports, this means that the remote device is
- * not connectable and then clearly indicate this in the
- * device found event.
- *
- * When receiving a scan response, then there is no way to
+ /* When receiving a scan response, then there is no way to
* know if the remote device is connectable or not. However
* since scan responses are merged with a previously seen
* advertising report, the flags field from that report
* will be used.
*
- * In the really unlikely case that a controller get confused
- * and just sends a scan response event, then it is marked as
- * not connectable as well.
+ * In the unlikely case that a controller just sends a scan
+ * response event that doesn't match the pending report, then
+ * it is marked as a standalone SCAN_RSP.
*/
- if (type == LE_ADV_NONCONN_IND || type == LE_ADV_SCAN_IND ||
- type == LE_ADV_SCAN_RSP)
- flags = MGMT_DEV_FOUND_NOT_CONNECTABLE;
- else
- flags = 0;
+ if (type == LE_ADV_SCAN_RSP)
+ flags = MGMT_DEV_FOUND_SCAN_RSP;
/* If there's nothing pending either store the data from this
* event or send an immediate device found event if the data
@@ -5242,14 +6294,15 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
/* If the report will trigger a SCAN_REQ store it for
* later merging.
*/
- if (type == LE_ADV_IND || type == LE_ADV_SCAN_IND) {
+ if (!ext_adv && (type == LE_ADV_IND ||
+ type == LE_ADV_SCAN_IND)) {
store_pending_adv_report(hdev, bdaddr, bdaddr_type,
rssi, flags, data, len);
return;
}
mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL,
- rssi, flags, data, len, NULL, 0);
+ rssi, flags, data, len, NULL, 0, 0);
return;
}
@@ -5268,12 +6321,13 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
d->last_adv_addr_type, NULL,
d->last_adv_rssi, d->last_adv_flags,
d->last_adv_data,
- d->last_adv_data_len, NULL, 0);
+ d->last_adv_data_len, NULL, 0, 0);
/* If the new report will trigger a SCAN_REQ store it for
* later merging.
*/
- if (type == LE_ADV_IND || type == LE_ADV_SCAN_IND) {
+ if (!ext_adv && (type == LE_ADV_IND ||
+ type == LE_ADV_SCAN_IND)) {
store_pending_adv_report(hdev, bdaddr, bdaddr_type,
rssi, flags, data, len);
return;
@@ -5284,7 +6338,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
*/
clear_pending_adv_report(hdev);
mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL,
- rssi, flags, data, len, NULL, 0);
+ rssi, flags, data, len, NULL, 0, 0);
return;
}
@@ -5294,38 +6348,57 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
*/
mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK,
d->last_adv_addr_type, NULL, rssi, d->last_adv_flags,
- d->last_adv_data, d->last_adv_data_len, data, len);
+ d->last_adv_data, d->last_adv_data_len, data, len, 0);
clear_pending_adv_report(hdev);
}
-static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_le_adv_report_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- u8 num_reports = skb->data[0];
- void *ptr = &skb->data[1];
+ struct hci_ev_le_advertising_report *ev = data;
+ u64 instant = jiffies;
+
+ if (!ev->num)
+ return;
hci_dev_lock(hdev);
- while (num_reports--) {
- struct hci_ev_le_advertising_info *ev = ptr;
+ while (ev->num--) {
+ struct hci_ev_le_advertising_info *info;
s8 rssi;
- if (ev->length <= HCI_MAX_AD_LENGTH) {
- rssi = ev->data[ev->length];
- process_adv_report(hdev, ev->evt_type, &ev->bdaddr,
- ev->bdaddr_type, NULL, 0, rssi,
- ev->data, ev->length);
+ info = hci_le_ev_skb_pull(hdev, skb,
+ HCI_EV_LE_ADVERTISING_REPORT,
+ sizeof(*info));
+ if (!info)
+ break;
+
+ if (!hci_le_ev_skb_pull(hdev, skb, HCI_EV_LE_ADVERTISING_REPORT,
+ info->length + 1))
+ break;
+
+ if (info->length <= max_adv_len(hdev)) {
+ rssi = info->data[info->length];
+ process_adv_report(hdev, info->type, &info->bdaddr,
+ info->bdaddr_type, NULL, 0,
+ HCI_ADV_PHY_1M, 0, rssi,
+ info->data, info->length, false,
+ false, instant);
} else {
bt_dev_err(hdev, "Dropping invalid advertising data");
}
-
- ptr += sizeof(*ev) + ev->length + 1;
}
hci_dev_unlock(hdev);
}
-static u8 ext_evt_type_to_legacy(u16 evt_type)
+static u8 ext_evt_type_to_legacy(struct hci_dev *hdev, u16 evt_type)
{
+ u16 pdu_type = evt_type & ~LE_EXT_ADV_DATA_STATUS_MASK;
+
+ if (!pdu_type)
+ return LE_ADV_NONCONN_IND;
+
if (evt_type & LE_EXT_ADV_LEGACY_PDU) {
switch (evt_type) {
case LE_LEGACY_ADV_IND:
@@ -5341,10 +6414,7 @@ static u8 ext_evt_type_to_legacy(u16 evt_type)
return LE_ADV_SCAN_RSP;
}
- BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x",
- evt_type);
-
- return LE_ADV_INVALID;
+ goto invalid;
}
if (evt_type & LE_EXT_ADV_CONN_IND) {
@@ -5360,49 +6430,178 @@ static u8 ext_evt_type_to_legacy(u16 evt_type)
if (evt_type & LE_EXT_ADV_SCAN_IND)
return LE_ADV_SCAN_IND;
- if (evt_type == LE_EXT_ADV_NON_CONN_IND ||
- evt_type & LE_EXT_ADV_DIRECT_IND)
+ if (evt_type & LE_EXT_ADV_DIRECT_IND)
return LE_ADV_NONCONN_IND;
- BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x",
- evt_type);
+invalid:
+ bt_dev_err_ratelimited(hdev, "Unknown advertising packet type: 0x%02x",
+ evt_type);
return LE_ADV_INVALID;
}
-static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- u8 num_reports = skb->data[0];
- void *ptr = &skb->data[1];
+ struct hci_ev_le_ext_adv_report *ev = data;
+ u64 instant = jiffies;
+
+ if (!ev->num)
+ return;
hci_dev_lock(hdev);
- while (num_reports--) {
- struct hci_ev_le_ext_adv_report *ev = ptr;
+ while (ev->num--) {
+ struct hci_ev_le_ext_adv_info *info;
u8 legacy_evt_type;
u16 evt_type;
- evt_type = __le16_to_cpu(ev->evt_type);
- legacy_evt_type = ext_evt_type_to_legacy(evt_type);
+ info = hci_le_ev_skb_pull(hdev, skb, HCI_EV_LE_EXT_ADV_REPORT,
+ sizeof(*info));
+ if (!info)
+ break;
+
+ if (!hci_le_ev_skb_pull(hdev, skb, HCI_EV_LE_EXT_ADV_REPORT,
+ info->length))
+ break;
+
+ evt_type = __le16_to_cpu(info->type) & LE_EXT_ADV_EVT_TYPE_MASK;
+ legacy_evt_type = ext_evt_type_to_legacy(hdev, evt_type);
+
+ if (hci_test_quirk(hdev,
+ HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY)) {
+ info->primary_phy &= 0x1f;
+ info->secondary_phy &= 0x1f;
+ }
+
+ /* Check if PA Sync is pending and if the hci_conn SID has not
+ * been set update it.
+ */
+ if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) {
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_create_pa_sync(hdev);
+ if (conn && conn->sid == HCI_SID_INVALID)
+ conn->sid = info->sid;
+ }
+
if (legacy_evt_type != LE_ADV_INVALID) {
- process_adv_report(hdev, legacy_evt_type, &ev->bdaddr,
- ev->bdaddr_type, NULL, 0, ev->rssi,
- ev->data, ev->length);
+ process_adv_report(hdev, legacy_evt_type, &info->bdaddr,
+ info->bdaddr_type, NULL, 0,
+ info->primary_phy,
+ info->secondary_phy,
+ info->rssi, info->data, info->length,
+ !(evt_type & LE_EXT_ADV_LEGACY_PDU),
+ false, instant);
}
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_pa_sync_established_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_pa_sync_established *ev = data;
+ int mask = hdev->link_mode;
+ __u8 flags = 0;
+ struct hci_conn *pa_sync, *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ hci_dev_lock(hdev);
+
+ hci_dev_clear_flag(hdev, HCI_PA_SYNC);
+
+ conn = hci_conn_hash_lookup_create_pa_sync(hdev);
+ if (!conn) {
+ bt_dev_err(hdev,
+ "Unable to find connection for dst %pMR sid 0x%2.2x",
+ &ev->bdaddr, ev->sid);
+ goto unlock;
+ }
+
+ clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags);
- ptr += sizeof(*ev) + ev->length + 1;
+ conn->sync_handle = le16_to_cpu(ev->handle);
+ conn->sid = HCI_SID_INVALID;
+
+ mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, PA_LINK,
+ &flags);
+ if (!(mask & HCI_LM_ACCEPT)) {
+ hci_le_pa_term_sync(hdev, ev->handle);
+ goto unlock;
+ }
+
+ if (!(flags & HCI_PROTO_DEFER))
+ goto unlock;
+
+ /* Add connection to indicate PA sync event */
+ pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, 0,
+ HCI_ROLE_SLAVE);
+
+ if (IS_ERR(pa_sync))
+ goto unlock;
+
+ pa_sync->sync_handle = le16_to_cpu(ev->handle);
+
+ if (ev->status) {
+ set_bit(HCI_CONN_PA_SYNC_FAILED, &pa_sync->flags);
+
+ /* Notify iso layer */
+ hci_connect_cfm(pa_sync, ev->status);
}
+unlock:
hci_dev_unlock(hdev);
}
-static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev,
+static void hci_le_per_adv_report_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_per_adv_report *ev = data;
+ int mask = hdev->link_mode;
+ __u8 flags = 0;
+ struct hci_conn *pa_sync;
+
+ bt_dev_dbg(hdev, "sync_handle 0x%4.4x", le16_to_cpu(ev->sync_handle));
+
+ hci_dev_lock(hdev);
+
+ mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, PA_LINK, &flags);
+ if (!(mask & HCI_LM_ACCEPT))
+ goto unlock;
+
+ if (!(flags & HCI_PROTO_DEFER))
+ goto unlock;
+
+ pa_sync = hci_conn_hash_lookup_pa_sync_handle
+ (hdev,
+ le16_to_cpu(ev->sync_handle));
+
+ if (!pa_sync)
+ goto unlock;
+
+ if (ev->data_status == LE_PA_DATA_COMPLETE &&
+ !test_and_set_bit(HCI_CONN_PA_SYNC, &pa_sync->flags)) {
+ /* Notify iso layer */
+ hci_connect_cfm(pa_sync, 0);
+
+ /* Notify MGMT layer */
+ mgmt_device_connected(hdev, pa_sync, NULL, 0);
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_ev_le_remote_feat_complete *ev = (void *)skb->data;
+ struct hci_ev_le_remote_feat_complete *ev = data;
struct hci_conn *conn;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
hci_dev_lock(hdev);
@@ -5414,7 +6613,7 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev,
if (conn->state == BT_CONFIG) {
__u8 status;
- /* If the local controller supports slave-initiated
+ /* If the local controller supports peripheral-initiated
* features exchange, but the remote controller does
* not, then it is possible that the error code 0x1a
* for unsupported remote feature gets returned.
@@ -5423,30 +6622,30 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev,
* transition into connected state and mark it as
* successful.
*/
- if ((hdev->le_features[0] & HCI_LE_SLAVE_FEATURES) &&
- !conn->out && ev->status == 0x1a)
+ if (!conn->out && ev->status == HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE &&
+ (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES))
status = 0x00;
else
status = ev->status;
conn->state = BT_CONNECTED;
hci_connect_cfm(conn, status);
- hci_conn_drop(conn);
}
}
hci_dev_unlock(hdev);
}
-static void hci_le_ltk_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void hci_le_ltk_request_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_le_ltk_req *ev = (void *) skb->data;
+ struct hci_ev_le_ltk_req *ev = data;
struct hci_cp_le_ltk_reply cp;
struct hci_cp_le_ltk_neg_reply neg;
struct hci_conn *conn;
struct smp_ltk *ltk;
- BT_DBG("%s handle 0x%4.4x", hdev->name, __le16_to_cpu(ev->handle));
+ bt_dev_dbg(hdev, "handle 0x%4.4x", __le16_to_cpu(ev->handle));
hci_dev_lock(hdev);
@@ -5514,14 +6713,16 @@ static void send_conn_param_neg_reply(struct hci_dev *hdev, u16 handle,
&cp);
}
-static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev,
+static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_ev_le_remote_conn_param_req *ev = (void *) skb->data;
+ struct hci_ev_le_remote_conn_param_req *ev = data;
struct hci_cp_le_conn_param_req_reply cp;
struct hci_conn *hcon;
u16 handle, min, max, latency, timeout;
+ bt_dev_dbg(hdev, "handle 0x%4.4x", __le16_to_cpu(ev->handle));
+
handle = le16_to_cpu(ev->handle);
min = le16_to_cpu(ev->interval_min);
max = le16_to_cpu(ev->interval_max);
@@ -5533,6 +6734,10 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev,
return send_conn_param_neg_reply(hdev, handle,
HCI_ERROR_UNKNOWN_CONN_ID);
+ if (max > hcon->le_conn_max_interval)
+ return send_conn_param_neg_reply(hdev, handle,
+ HCI_ERROR_INVALID_LL_PARAMS);
+
if (hci_check_conn_params(min, max, latency, timeout))
return send_conn_param_neg_reply(hdev, handle,
HCI_ERROR_INVALID_LL_PARAMS);
@@ -5551,7 +6756,7 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev,
params->conn_latency = latency;
params->supervision_timeout = timeout;
store_hint = 0x01;
- } else{
+ } else {
store_hint = 0x00;
}
@@ -5572,335 +6777,1002 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev,
hci_send_cmd(hdev, HCI_OP_LE_CONN_PARAM_REQ_REPLY, sizeof(cp), &cp);
}
-static void hci_le_direct_adv_report_evt(struct hci_dev *hdev,
+/* Handle the LE Direct Advertising Report subevent: pull the ev->num report
+ * entries from the skb and hand each one to process_adv_report() while
+ * holding the hdev lock. Returns early if the pull fails or ev->num is 0.
+ */
+static void hci_le_direct_adv_report_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- u8 num_reports = skb->data[0];
- void *ptr = &skb->data[1];
+ struct hci_ev_le_direct_adv_report *ev = data;
+ u64 instant = jiffies;
+ int i;
- hci_dev_lock(hdev);
+ if (!hci_le_ev_skb_pull(hdev, skb, HCI_EV_LE_DIRECT_ADV_REPORT,
+ flex_array_size(ev, info, ev->num)))
+ return;
- while (num_reports--) {
- struct hci_ev_le_direct_adv_info *ev = ptr;
+ if (!ev->num)
+ return;
+
+ hci_dev_lock(hdev);
- process_adv_report(hdev, ev->evt_type, &ev->bdaddr,
- ev->bdaddr_type, &ev->direct_addr,
- ev->direct_addr_type, ev->rssi, NULL, 0);
+ for (i = 0; i < ev->num; i++) {
+ struct hci_ev_le_direct_adv_info *info = &ev->info[i];
- ptr += sizeof(*ev);
+ process_adv_report(hdev, info->type, &info->bdaddr,
+ info->bdaddr_type, &info->direct_addr,
+ info->direct_addr_type, HCI_ADV_PHY_1M, 0,
+ info->rssi, NULL, 0, false, false, instant);
}
hci_dev_unlock(hdev);
}
-static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb)
+/* Handle the LE PHY Update Complete subevent: on success (status 0), look up
+ * the connection by handle and record the reported TX/RX PHYs on it. A
+ * non-zero status or an unknown handle is ignored.
+ */
+static void hci_le_phy_update_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_ev_le_meta *le_ev = (void *) skb->data;
+ struct hci_ev_le_phy_update_complete *ev = data;
+ struct hci_conn *conn;
- skb_pull(skb, sizeof(*le_ev));
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
- switch (le_ev->subevent) {
- case HCI_EV_LE_CONN_COMPLETE:
- hci_le_conn_complete_evt(hdev, skb);
- break;
+ if (ev->status)
+ return;
- case HCI_EV_LE_CONN_UPDATE_COMPLETE:
- hci_le_conn_update_complete_evt(hdev, skb);
- break;
+ hci_dev_lock(hdev);
- case HCI_EV_LE_ADVERTISING_REPORT:
- hci_le_adv_report_evt(hdev, skb);
- break;
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (!conn)
+ goto unlock;
- case HCI_EV_LE_REMOTE_FEAT_COMPLETE:
- hci_le_remote_feat_complete_evt(hdev, skb);
- break;
+ conn->le_tx_phy = ev->tx_phy;
+ conn->le_rx_phy = ev->rx_phy;
- case HCI_EV_LE_LTK_REQ:
- hci_le_ltk_request_evt(hdev, skb);
- break;
+unlock:
+ hci_dev_unlock(hdev);
+}
- case HCI_EV_LE_REMOTE_CONN_PARAM_REQ:
- hci_le_remote_conn_param_req_evt(hdev, skb);
- break;
+/* Handle the LE CIS Established subevent: look up the CIS connection by
+ * handle, fill in its unicast QoS (interval, latency, SDU, PHY) from the
+ * event according to the local role, then either mark it connected and set
+ * up the ISO data path (status 0) or close and delete it (non-zero status).
+ * If HCI_CONN_CREATE_CIS was set on the connection, pending CIS creation is
+ * resumed via hci_le_create_cis_pending() before unlocking.
+ */
+static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_evt_le_cis_established *ev = data;
+ struct hci_conn *conn;
+ struct bt_iso_qos *qos;
+ bool pending = false;
+ u16 handle = __le16_to_cpu(ev->handle);
+ u32 c_sdu_interval, p_sdu_interval;
- case HCI_EV_LE_DIRECT_ADV_REPORT:
- hci_le_direct_adv_report_evt(hdev, skb);
- break;
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
- case HCI_EV_LE_EXT_ADV_REPORT:
- hci_le_ext_adv_report_evt(hdev, skb);
- break;
+ hci_dev_lock(hdev);
- case HCI_EV_LE_ENHANCED_CONN_COMPLETE:
- hci_le_enh_conn_complete_evt(hdev, skb);
- break;
+ conn = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!conn) {
+ bt_dev_err(hdev,
+ "Unable to find connection with handle 0x%4.4x",
+ handle);
+ goto unlock;
+ }
- case HCI_EV_LE_EXT_ADV_SET_TERM:
- hci_le_ext_adv_term_evt(hdev, skb);
- break;
+ if (conn->type != CIS_LINK) {
+ bt_dev_err(hdev,
+ "Invalid connection link type handle 0x%4.4x",
+ handle);
+ goto unlock;
+ }
- default:
+ qos = &conn->iso_qos;
+
+ pending = test_and_clear_bit(HCI_CONN_CREATE_CIS, &conn->flags);
+
+ /* BLUETOOTH CORE SPECIFICATION Version 5.4 | Vol 6, Part G
+ * page 3075:
+ * Transport_Latency_C_To_P = CIG_Sync_Delay + (FT_C_To_P) ×
+ * ISO_Interval + SDU_Interval_C_To_P
+ * ...
+ * SDU_Interval = (CIG_Sync_Delay + (FT) x ISO_Interval) -
+ * Transport_Latency
+ *
+ * NOTE(review): the two formulas above are not algebraically
+ * consistent (the first adds SDU_Interval, the second implies it is
+ * subtracted) - confirm the quoted formula against the specification.
+ * The code below implements the second form, with ISO_Interval
+ * scaled by 1250.
+ */
+ c_sdu_interval = (get_unaligned_le24(ev->cig_sync_delay) +
+ (ev->c_ft * le16_to_cpu(ev->interval) * 1250)) -
+ get_unaligned_le24(ev->c_latency);
+ p_sdu_interval = (get_unaligned_le24(ev->cig_sync_delay) +
+ (ev->p_ft * le16_to_cpu(ev->interval) * 1250)) -
+ get_unaligned_le24(ev->p_latency);
+
+ /* The c_* event fields map to "in" when we are the slave and to "out"
+ * when we are the master; the p_* fields map the other way around.
+ */
+ switch (conn->role) {
+ case HCI_ROLE_SLAVE:
+ qos->ucast.in.interval = c_sdu_interval;
+ qos->ucast.out.interval = p_sdu_interval;
+ /* Convert Transport Latency (us) to Latency (msec) */
+ qos->ucast.in.latency =
+ DIV_ROUND_CLOSEST(get_unaligned_le24(ev->c_latency),
+ 1000);
+ qos->ucast.out.latency =
+ DIV_ROUND_CLOSEST(get_unaligned_le24(ev->p_latency),
+ 1000);
+ qos->ucast.in.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0;
+ qos->ucast.out.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0;
+ qos->ucast.in.phy = ev->c_phy;
+ qos->ucast.out.phy = ev->p_phy;
+ break;
+ case HCI_ROLE_MASTER:
+ qos->ucast.in.interval = p_sdu_interval;
+ qos->ucast.out.interval = c_sdu_interval;
+ /* Convert Transport Latency (us) to Latency (msec) */
+ qos->ucast.out.latency =
+ DIV_ROUND_CLOSEST(get_unaligned_le24(ev->c_latency),
+ 1000);
+ qos->ucast.in.latency =
+ DIV_ROUND_CLOSEST(get_unaligned_le24(ev->p_latency),
+ 1000);
+ qos->ucast.out.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0;
+ qos->ucast.in.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0;
+ qos->ucast.out.phy = ev->c_phy;
+ qos->ucast.in.phy = ev->p_phy;
break;
}
+
+ if (!ev->status) {
+ conn->state = BT_CONNECTED;
+ hci_debugfs_create_conn(conn);
+ hci_conn_add_sysfs(conn);
+ hci_iso_setup_path(conn);
+ goto unlock;
+ }
+
+ /* Establishment failed: report the status and drop the connection */
+ conn->state = BT_CLOSED;
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_del(conn);
+
+unlock:
+ if (pending)
+ hci_le_create_cis_pending(hdev);
+
+ hci_dev_unlock(hdev);
}
-static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode,
- u8 event, struct sk_buff *skb)
+/* Send HCI_OP_LE_REJECT_CIS for @handle (already little-endian) with reason
+ * HCI_ERROR_REJ_BAD_ADDR.
+ */
+static void hci_le_reject_cis(struct hci_dev *hdev, __le16 handle)
{
- struct hci_ev_cmd_complete *ev;
- struct hci_event_hdr *hdr;
+ struct hci_cp_le_reject_cis cp;
- if (!skb)
- return false;
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = handle;
+ cp.reason = HCI_ERROR_REJ_BAD_ADDR;
+ hci_send_cmd(hdev, HCI_OP_LE_REJECT_CIS, sizeof(cp), &cp);
+}
- if (skb->len < sizeof(*hdr)) {
- bt_dev_err(hdev, "too short HCI event");
- return false;
- }
+/* Send HCI_OP_LE_ACCEPT_CIS for @handle (already little-endian). */
+static void hci_le_accept_cis(struct hci_dev *hdev, __le16 handle)
+{
+ struct hci_cp_le_accept_cis cp;
- hdr = (void *) skb->data;
- skb_pull(skb, HCI_EVENT_HDR_SIZE);
+ memset(&cp, 0, sizeof(cp));
+ cp.handle = handle;
+ hci_send_cmd(hdev, HCI_OP_LE_ACCEPT_CIS, sizeof(cp), &cp);
+}
- if (event) {
- if (hdr->evt != event)
- return false;
- return true;
+/* Handle the LE CIS Request subevent: find the ACL connection by handle, ask
+ * the protocol layer through hci_proto_connect_ind() whether the CIS may be
+ * accepted, and create the CIS hci_conn if none exists for the CIS handle.
+ * The request is rejected if it is not accepted or the connection cannot be
+ * added. Otherwise it is accepted immediately, or, when HCI_PROTO_DEFER is
+ * set, the connection is moved to BT_CONNECT2 and hci_connect_cfm() is called.
+ */
+static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_evt_le_cis_req *ev = data;
+ u16 acl_handle, cis_handle;
+ struct hci_conn *acl, *cis;
+ int mask;
+ __u8 flags = 0;
+
+ acl_handle = __le16_to_cpu(ev->acl_handle);
+ cis_handle = __le16_to_cpu(ev->cis_handle);
+
+ bt_dev_dbg(hdev, "acl 0x%4.4x handle 0x%4.4x cig 0x%2.2x cis 0x%2.2x",
+ acl_handle, cis_handle, ev->cig_id, ev->cis_id);
+
+ hci_dev_lock(hdev);
+
+ acl = hci_conn_hash_lookup_handle(hdev, acl_handle);
+ if (!acl)
+ goto unlock;
+
+ mask = hci_proto_connect_ind(hdev, &acl->dst, CIS_LINK, &flags);
+ if (!(mask & HCI_LM_ACCEPT)) {
+ hci_le_reject_cis(hdev, ev->cis_handle);
+ goto unlock;
}
- if (hdr->evt != HCI_EV_CMD_COMPLETE) {
- bt_dev_err(hdev, "last event is not cmd complete (0x%2.2x)",
- hdr->evt);
- return false;
+ cis = hci_conn_hash_lookup_handle(hdev, cis_handle);
+ if (!cis) {
+ cis = hci_conn_add(hdev, CIS_LINK, &acl->dst, acl->dst_type,
+ HCI_ROLE_SLAVE, cis_handle);
+ if (IS_ERR(cis)) {
+ hci_le_reject_cis(hdev, ev->cis_handle);
+ goto unlock;
+ }
}
- if (skb->len < sizeof(*ev)) {
- bt_dev_err(hdev, "too short cmd_complete event");
- return false;
+ cis->iso_qos.ucast.cig = ev->cig_id;
+ cis->iso_qos.ucast.cis = ev->cis_id;
+
+ if (!(flags & HCI_PROTO_DEFER)) {
+ hci_le_accept_cis(hdev, ev->cis_handle);
+ } else {
+ cis->state = BT_CONNECT2;
+ hci_connect_cfm(cis, 0);
}
- ev = (void *) skb->data;
- skb_pull(skb, sizeof(*ev));
+unlock:
+ hci_dev_unlock(hdev);
+}
- if (opcode != __le16_to_cpu(ev->opcode)) {
- BT_DBG("opcode doesn't match (0x%2.2x != 0x%2.2x)", opcode,
- __le16_to_cpu(ev->opcode));
- return false;
+/* hci_cmd_sync_queue() callback: @data carries the BIG handle encoded as a
+ * pointer; terminate that BIG with reason HCI_ERROR_LOCAL_HOST_TERM.
+ */
+static int hci_iso_term_big_sync(struct hci_dev *hdev, void *data)
+{
+ u8 handle = PTR_UINT(data);
+
+ return hci_le_terminate_big_sync(hdev, handle,
+ HCI_ERROR_LOCAL_HOST_TERM);
+}
+
+/* Handle the LE Create BIG Complete subevent: pull the ev->num_bis BIS
+ * handles from the skb, then walk every BT_BOUND master connection bound to
+ * the BIG. On failure each connection is notified and deleted; on success
+ * each one is assigned the next BIS handle, marked connected and has its ISO
+ * data path set up. If the BIG was created but no BIS got connected, BIG
+ * termination is queued.
+ */
+static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_evt_le_create_big_complete *ev = data;
+ struct hci_conn *conn;
+ __u8 i = 0;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ if (!hci_le_ev_skb_pull(hdev, skb, HCI_EVT_LE_CREATE_BIG_COMPLETE,
+ flex_array_size(ev, bis_handle, ev->num_bis)))
+ return;
+
+ hci_dev_lock(hdev);
+
+ /* Connect all BISes that are bound to the BIG */
+ while ((conn = hci_conn_hash_lookup_big_state(hdev, ev->handle,
+ BT_BOUND,
+ HCI_ROLE_MASTER))) {
+ if (ev->status) {
+ hci_connect_cfm(conn, ev->status);
+ hci_conn_del(conn);
+ continue;
+ }
+
+ /* NOTE(review): i is not checked against ev->num_bis here;
+ * presumably the number of bound connections never exceeds
+ * it - confirm.
+ */
+ if (hci_conn_set_handle(conn,
+ __le16_to_cpu(ev->bis_handle[i++])))
+ continue;
+
+ conn->state = BT_CONNECTED;
+ set_bit(HCI_CONN_BIG_CREATED, &conn->flags);
+ hci_debugfs_create_conn(conn);
+ hci_conn_add_sysfs(conn);
+ hci_iso_setup_path(conn);
}
- return true;
+ if (!ev->status && !i)
+ /* If no BISes have been connected for the BIG,
+ * terminate. This is in case all bound connections
+ * have been closed before the BIG creation
+ * has completed.
+ */
+ hci_cmd_sync_queue(hdev, hci_iso_term_big_sync,
+ UINT_PTR(ev->handle), NULL);
+
+ hci_dev_unlock(hdev);
}
-void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
+/* Handle the LE BIG Sync Established subevent: pull the ev->num_bis BIS
+ * handles from the skb, find the connection with a pending BIG sync, and for
+ * every reported BIS handle look up (or add) a BIS_LINK connection and fill
+ * in its broadcast QoS from the event. On success each BIS is marked
+ * connected and its ISO data path is set up; on failure every BIS is flagged
+ * HCI_CONN_BIG_SYNC_FAILED and reported through hci_connect_cfm().
+ */
+static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
{
- struct hci_event_hdr *hdr = (void *) skb->data;
- hci_req_complete_t req_complete = NULL;
- hci_req_complete_skb_t req_complete_skb = NULL;
- struct sk_buff *orig_skb = NULL;
- u8 status = 0, event = hdr->evt, req_evt = 0;
- u16 opcode = HCI_OP_NOP;
+ struct hci_evt_le_big_sync_established *ev = data;
+ struct hci_conn *bis, *conn;
+ int i;
- if (hdev->sent_cmd && bt_cb(hdev->sent_cmd)->hci.req_event == event) {
- struct hci_command_hdr *cmd_hdr = (void *) hdev->sent_cmd->data;
- opcode = __le16_to_cpu(cmd_hdr->opcode);
- hci_req_cmd_complete(hdev, opcode, status, &req_complete,
- &req_complete_skb);
- req_evt = event;
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
+
+ if (!hci_le_ev_skb_pull(hdev, skb, HCI_EVT_LE_BIG_SYNC_ESTABLISHED,
+ flex_array_size(ev, bis, ev->num_bis)))
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_big_sync_pend(hdev, ev->handle,
+ ev->num_bis);
+ if (!conn) {
+ bt_dev_err(hdev,
+ "Unable to find connection for big 0x%2.2x",
+ ev->handle);
+ goto unlock;
}
- /* If it looks like we might end up having to call
- * req_complete_skb, store a pristine copy of the skb since the
- * various handlers may modify the original one through
- * skb_pull() calls, etc.
+ clear_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags);
+
+ /* Reset the BIS list: clear the whole bis array, not just
+ * sizeof(num_bis) bytes of it.
+ */
+ conn->num_bis = 0;
+ memset(conn->bis, 0, sizeof(conn->bis));
+
+ for (i = 0; i < ev->num_bis; i++) {
+ u16 handle = le16_to_cpu(ev->bis[i]);
+ __le32 interval;
+
+ bis = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!bis) {
+ if (handle > HCI_CONN_HANDLE_MAX) {
+ bt_dev_dbg(hdev, "ignore too large handle %u", handle);
+ continue;
+ }
+ bis = hci_conn_add(hdev, BIS_LINK, BDADDR_ANY, 0,
+ HCI_ROLE_SLAVE, handle);
+ if (IS_ERR(bis))
+ continue;
+ }
+
+ if (ev->status != 0x42)
+ /* Mark PA sync as established */
+ set_bit(HCI_CONN_PA_SYNC, &bis->flags);
+
+ bis->sync_handle = conn->sync_handle;
+ bis->iso_qos.bcast.big = ev->handle;
+ /* ev->latency is narrower than interval: zero-fill first, then
+ * copy the bytes in before converting from little-endian.
+ */
+ memset(&interval, 0, sizeof(interval));
+ memcpy(&interval, ev->latency, sizeof(ev->latency));
+ bis->iso_qos.bcast.in.interval = le32_to_cpu(interval);
+ /* Convert ISO Interval (1.25 ms slots) to latency (ms) */
+ bis->iso_qos.bcast.in.latency = le16_to_cpu(ev->interval) * 125 / 100;
+ bis->iso_qos.bcast.in.sdu = le16_to_cpu(ev->max_pdu);
+
+ if (!ev->status) {
+ bis->state = BT_CONNECTED;
+ set_bit(HCI_CONN_BIG_SYNC, &bis->flags);
+ hci_debugfs_create_conn(bis);
+ hci_conn_add_sysfs(bis);
+ hci_iso_setup_path(bis);
+ }
+ }
+
+ /* In case BIG sync failed, notify each failed connection to
+ * the user after all hci connections have been added
*/
- if (req_complete_skb || event == HCI_EV_CMD_STATUS ||
- event == HCI_EV_CMD_COMPLETE)
- orig_skb = skb_clone(skb, GFP_KERNEL);
+ if (ev->status)
+ for (i = 0; i < ev->num_bis; i++) {
+ u16 handle = le16_to_cpu(ev->bis[i]);
- skb_pull(skb, HCI_EVENT_HDR_SIZE);
+ bis = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!bis)
+ continue;
- switch (event) {
- case HCI_EV_INQUIRY_COMPLETE:
- hci_inquiry_complete_evt(hdev, skb);
- break;
+ set_bit(HCI_CONN_BIG_SYNC_FAILED, &bis->flags);
+ hci_connect_cfm(bis, ev->status);
+ }
- case HCI_EV_INQUIRY_RESULT:
- hci_inquiry_result_evt(hdev, skb);
- break;
+unlock:
+ hci_dev_unlock(hdev);
+}
- case HCI_EV_CONN_COMPLETE:
- hci_conn_complete_evt(hdev, skb);
- break;
+/* Handle the LE BIG Sync Lost subevent: delete every BT_CONNECTED slave
+ * connection found for the BIG handle, reporting ev->reason to the upper
+ * layers. mgmt_device_disconnected() is invoked for each connection until
+ * one is found with HCI_CONN_MGMT_CONNECTED set.
+ */
+static void hci_le_big_sync_lost_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_evt_le_big_sync_lost *ev = data;
+ struct hci_conn *bis;
+ bool mgmt_conn = false;
- case HCI_EV_CONN_REQUEST:
- hci_conn_request_evt(hdev, skb);
- break;
+ bt_dev_dbg(hdev, "big handle 0x%2.2x", ev->handle);
- case HCI_EV_DISCONN_COMPLETE:
- hci_disconn_complete_evt(hdev, skb);
- break;
+ hci_dev_lock(hdev);
- case HCI_EV_AUTH_COMPLETE:
- hci_auth_complete_evt(hdev, skb);
- break;
+ /* Delete each bis connection */
+ while ((bis = hci_conn_hash_lookup_big_state(hdev, ev->handle,
+ BT_CONNECTED,
+ HCI_ROLE_SLAVE))) {
+ if (!mgmt_conn) {
+ mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED,
+ &bis->flags);
+ mgmt_device_disconnected(hdev, &bis->dst, bis->type,
+ bis->dst_type, ev->reason,
+ mgmt_conn);
+ }
- case HCI_EV_REMOTE_NAME:
- hci_remote_name_evt(hdev, skb);
- break;
+ clear_bit(HCI_CONN_BIG_SYNC, &bis->flags);
+ hci_disconn_cfm(bis, ev->reason);
+ hci_conn_del(bis);
+ }
- case HCI_EV_ENCRYPT_CHANGE:
- hci_encrypt_change_evt(hdev, skb);
- break;
+ hci_dev_unlock(hdev);
+}
- case HCI_EV_CHANGE_LINK_KEY_COMPLETE:
- hci_change_link_key_complete_evt(hdev, skb);
- break;
+/* Handle the LE BIGInfo Advertising Report subevent: if the protocol layer
+ * accepts BIS_LINK connections and asked for deferred setup, look up the PA
+ * sync connection by sync handle, record the reported encryption setting on
+ * it and notify the ISO layer through hci_connect_cfm().
+ */
+static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb)
+{
+ struct hci_evt_le_big_info_adv_report *ev = data;
+ int mask = hdev->link_mode;
+ __u8 flags = 0;
+ struct hci_conn *pa_sync;
- case HCI_EV_REMOTE_FEATURES:
- hci_remote_features_evt(hdev, skb);
- break;
+ bt_dev_dbg(hdev, "sync_handle 0x%4.4x", le16_to_cpu(ev->sync_handle));
- case HCI_EV_CMD_COMPLETE:
- hci_cmd_complete_evt(hdev, skb, &opcode, &status,
- &req_complete, &req_complete_skb);
- break;
+ hci_dev_lock(hdev);
- case HCI_EV_CMD_STATUS:
- hci_cmd_status_evt(hdev, skb, &opcode, &status, &req_complete,
- &req_complete_skb);
- break;
+ mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, BIS_LINK, &flags);
+ if (!(mask & HCI_LM_ACCEPT))
+ goto unlock;
- case HCI_EV_HARDWARE_ERROR:
- hci_hardware_error_evt(hdev, skb);
- break;
+ if (!(flags & HCI_PROTO_DEFER))
+ goto unlock;
- case HCI_EV_ROLE_CHANGE:
- hci_role_change_evt(hdev, skb);
- break;
+ pa_sync = hci_conn_hash_lookup_pa_sync_handle
+ (hdev,
+ le16_to_cpu(ev->sync_handle));
- case HCI_EV_NUM_COMP_PKTS:
- hci_num_comp_pkts_evt(hdev, skb);
- break;
+ if (!pa_sync)
+ goto unlock;
- case HCI_EV_MODE_CHANGE:
- hci_mode_change_evt(hdev, skb);
- break;
+ pa_sync->iso_qos.bcast.encryption = ev->encryption;
- case HCI_EV_PIN_CODE_REQ:
- hci_pin_code_request_evt(hdev, skb);
- break;
+ /* Notify iso layer */
+ hci_connect_cfm(pa_sync, 0);
- case HCI_EV_LINK_KEY_REQ:
- hci_link_key_request_evt(hdev, skb);
- break;
+unlock:
+ hci_dev_unlock(hdev);
+}
- case HCI_EV_LINK_KEY_NOTIFY:
- hci_link_key_notify_evt(hdev, skb);
- break;
+/* Handle the LE Read All Remote Features Complete subevent: look up the
+ * connection by handle, copy the reported feature bytes into it on success,
+ * and, if the connection is still in BT_CONFIG, move it to BT_CONNECTED and
+ * confirm it to the upper layers with the (possibly overridden) status.
+ */
+static void hci_le_read_all_remote_features_evt(struct hci_dev *hdev,
+ void *data, struct sk_buff *skb)
+{
+ struct hci_evt_le_read_all_remote_features_complete *ev = data;
+ struct hci_conn *conn;
- case HCI_EV_CLOCK_OFFSET:
- hci_clock_offset_evt(hdev, skb);
- break;
+ bt_dev_dbg(hdev, "status 0x%2.2x", ev->status);
- case HCI_EV_PKT_TYPE_CHANGE:
- hci_pkt_type_change_evt(hdev, skb);
- break;
+ hci_dev_lock(hdev);
- case HCI_EV_PSCAN_REP_MODE:
- hci_pscan_rep_mode_evt(hdev, skb);
- break;
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
+ if (!conn)
+ goto unlock;
- case HCI_EV_INQUIRY_RESULT_WITH_RSSI:
- hci_inquiry_result_with_rssi_evt(hdev, skb);
- break;
+ /* NOTE(review): 248 is presumably the size of both feature arrays -
+ * confirm against the struct definitions.
+ */
+ if (!ev->status)
+ memcpy(conn->le_features, ev->features, 248);
- case HCI_EV_REMOTE_EXT_FEATURES:
- hci_remote_ext_features_evt(hdev, skb);
- break;
+ if (conn->state == BT_CONFIG) {
+ __u8 status;
- case HCI_EV_SYNC_CONN_COMPLETE:
- hci_sync_conn_complete_evt(hdev, skb);
- break;
+ /* If the local controller supports peripheral-initiated
+ * features exchange, but the remote controller does
+ * not, then it is possible that the error code 0x1a
+ * for unsupported remote feature gets returned.
+ *
+ * In this specific case, allow the connection to
+ * transition into connected state and mark it as
+ * successful.
+ */
+ if (!conn->out &&
+ ev->status == HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE &&
+ (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES))
+ status = 0x00;
+ else
+ status = ev->status;
- case HCI_EV_EXTENDED_INQUIRY_RESULT:
- hci_extended_inquiry_result_evt(hdev, skb);
- break;
+ conn->state = BT_CONNECTED;
+ hci_connect_cfm(conn, status);
+ }
- case HCI_EV_KEY_REFRESH_COMPLETE:
- hci_key_refresh_complete_evt(hdev, skb);
- break;
+unlock:
+ hci_dev_unlock(hdev);
+}
- case HCI_EV_IO_CAPA_REQUEST:
- hci_io_capa_request_evt(hdev, skb);
- break;
+/* Designated initializer for an LE subevent table entry at index _op with a
+ * variable payload length in the range [_min_len, _max_len].
+ */
+#define HCI_LE_EV_VL(_op, _func, _min_len, _max_len) \
+[_op] = { \
+ .func = _func, \
+ .min_len = _min_len, \
+ .max_len = _max_len, \
+}
+
+/* Fixed-length variant: min_len and max_len are both _len. */
+#define HCI_LE_EV(_op, _func, _len) \
+ HCI_LE_EV_VL(_op, _func, _len, _len)
+
+/* Fixed-length variant whose payload is a struct hci_ev_status. */
+#define HCI_LE_EV_STATUS(_op, _func) \
+ HCI_LE_EV(_op, _func, sizeof(struct hci_ev_status))
+
+/* Entries in this table shall have their position according to the subevent
+ * opcode they handle, so the use of the macros above is recommended: they
+ * initialize each entry at its proper index using designated initializers, so
+ * that subevents without a callback function can be omitted.
+ */
+static const struct hci_le_ev {
+ void (*func)(struct hci_dev *hdev, void *data, struct sk_buff *skb);
+ u16 min_len;
+ u16 max_len;
+} hci_le_ev_table[U8_MAX + 1] = {
+ /* [0x01 = HCI_EV_LE_CONN_COMPLETE] */
+ HCI_LE_EV(HCI_EV_LE_CONN_COMPLETE, hci_le_conn_complete_evt,
+ sizeof(struct hci_ev_le_conn_complete)),
+ /* [0x02 = HCI_EV_LE_ADVERTISING_REPORT] */
+ HCI_LE_EV_VL(HCI_EV_LE_ADVERTISING_REPORT, hci_le_adv_report_evt,
+ sizeof(struct hci_ev_le_advertising_report),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x03 = HCI_EV_LE_CONN_UPDATE_COMPLETE] */
+ HCI_LE_EV(HCI_EV_LE_CONN_UPDATE_COMPLETE,
+ hci_le_conn_update_complete_evt,
+ sizeof(struct hci_ev_le_conn_update_complete)),
+ /* [0x04 = HCI_EV_LE_REMOTE_FEAT_COMPLETE] */
+ HCI_LE_EV(HCI_EV_LE_REMOTE_FEAT_COMPLETE,
+ hci_le_remote_feat_complete_evt,
+ sizeof(struct hci_ev_le_remote_feat_complete)),
+ /* [0x05 = HCI_EV_LE_LTK_REQ] */
+ HCI_LE_EV(HCI_EV_LE_LTK_REQ, hci_le_ltk_request_evt,
+ sizeof(struct hci_ev_le_ltk_req)),
+ /* [0x06 = HCI_EV_LE_REMOTE_CONN_PARAM_REQ] */
+ HCI_LE_EV(HCI_EV_LE_REMOTE_CONN_PARAM_REQ,
+ hci_le_remote_conn_param_req_evt,
+ sizeof(struct hci_ev_le_remote_conn_param_req)),
+ /* [0x0a = HCI_EV_LE_ENHANCED_CONN_COMPLETE] */
+ HCI_LE_EV(HCI_EV_LE_ENHANCED_CONN_COMPLETE,
+ hci_le_enh_conn_complete_evt,
+ sizeof(struct hci_ev_le_enh_conn_complete)),
+ /* [0x0b = HCI_EV_LE_DIRECT_ADV_REPORT] */
+ HCI_LE_EV_VL(HCI_EV_LE_DIRECT_ADV_REPORT, hci_le_direct_adv_report_evt,
+ sizeof(struct hci_ev_le_direct_adv_report),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x0c = HCI_EV_LE_PHY_UPDATE_COMPLETE] */
+ HCI_LE_EV(HCI_EV_LE_PHY_UPDATE_COMPLETE, hci_le_phy_update_evt,
+ sizeof(struct hci_ev_le_phy_update_complete)),
+ /* [0x0d = HCI_EV_LE_EXT_ADV_REPORT] */
+ HCI_LE_EV_VL(HCI_EV_LE_EXT_ADV_REPORT, hci_le_ext_adv_report_evt,
+ sizeof(struct hci_ev_le_ext_adv_report),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x0e = HCI_EV_LE_PA_SYNC_ESTABLISHED] */
+ HCI_LE_EV(HCI_EV_LE_PA_SYNC_ESTABLISHED,
+ hci_le_pa_sync_established_evt,
+ sizeof(struct hci_ev_le_pa_sync_established)),
+ /* [0x0f = HCI_EV_LE_PER_ADV_REPORT] */
+ HCI_LE_EV_VL(HCI_EV_LE_PER_ADV_REPORT,
+ hci_le_per_adv_report_evt,
+ sizeof(struct hci_ev_le_per_adv_report),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x10 = HCI_EV_LE_PA_SYNC_LOST] */
+ HCI_LE_EV(HCI_EV_LE_PA_SYNC_LOST, hci_le_pa_sync_lost_evt,
+ sizeof(struct hci_ev_le_pa_sync_lost)),
+ /* [0x12 = HCI_EV_LE_EXT_ADV_SET_TERM] */
+ HCI_LE_EV(HCI_EV_LE_EXT_ADV_SET_TERM, hci_le_ext_adv_term_evt,
+ sizeof(struct hci_evt_le_ext_adv_set_term)),
+ /* [0x18 = HCI_EV_LE_PAST_RECEIVED] */
+ HCI_LE_EV(HCI_EV_LE_PAST_RECEIVED,
+ hci_le_past_received_evt,
+ sizeof(struct hci_ev_le_past_received)),
+ /* [0x19 = HCI_EVT_LE_CIS_ESTABLISHED] */
+ HCI_LE_EV(HCI_EVT_LE_CIS_ESTABLISHED, hci_le_cis_established_evt,
+ sizeof(struct hci_evt_le_cis_established)),
+ /* [0x1a = HCI_EVT_LE_CIS_REQ] */
+ HCI_LE_EV(HCI_EVT_LE_CIS_REQ, hci_le_cis_req_evt,
+ sizeof(struct hci_evt_le_cis_req)),
+ /* [0x1b = HCI_EVT_LE_CREATE_BIG_COMPLETE] */
+ HCI_LE_EV_VL(HCI_EVT_LE_CREATE_BIG_COMPLETE,
+ hci_le_create_big_complete_evt,
+ sizeof(struct hci_evt_le_create_big_complete),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x1d = HCI_EVT_LE_BIG_SYNC_ESTABLISHED] */
+ HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_ESTABLISHED,
+ hci_le_big_sync_established_evt,
+ sizeof(struct hci_evt_le_big_sync_established),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x1e = HCI_EVT_LE_BIG_SYNC_LOST] */
+ HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_LOST,
+ hci_le_big_sync_lost_evt,
+ sizeof(struct hci_evt_le_big_sync_lost),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x22 = HCI_EVT_LE_BIG_INFO_ADV_REPORT] */
+ HCI_LE_EV_VL(HCI_EVT_LE_BIG_INFO_ADV_REPORT,
+ hci_le_big_info_adv_report_evt,
+ sizeof(struct hci_evt_le_big_info_adv_report),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x2b = HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE] */
+ HCI_LE_EV_VL(HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE,
+ hci_le_read_all_remote_features_evt,
+ sizeof(struct
+ hci_evt_le_read_all_remote_features_complete),
+ HCI_MAX_EVENT_SIZE),
+};
+
+/* Dispatch an LE Meta event to the handler registered for its subevent in
+ * hci_le_ev_table. If the pending request is waiting for this subevent, the
+ * request is completed first. Subevents without a handler, or shorter than
+ * the handler's min_len, are dropped.
+ */
+static void hci_le_meta_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb, u16 *opcode, u8 *status,
+ hci_req_complete_t *req_complete,
+ hci_req_complete_skb_t *req_complete_skb)
+{
+ struct hci_ev_le_meta *ev = data;
+ const struct hci_le_ev *subev;
+
+ bt_dev_dbg(hdev, "subevent 0x%2.2x", ev->subevent);
+
+ /* Only match event if command OGF is for LE */
+ if (hdev->req_skb &&
+ (hci_opcode_ogf(hci_skb_opcode(hdev->req_skb)) == 0x08 ||
+ hci_skb_opcode(hdev->req_skb) == HCI_OP_NOP) &&
+ hci_skb_event(hdev->req_skb) == ev->subevent) {
+ *opcode = hci_skb_opcode(hdev->req_skb);
+ hci_req_cmd_complete(hdev, *opcode, 0x00, req_complete,
+ req_complete_skb);
+ }
- case HCI_EV_IO_CAPA_REPLY:
- hci_io_capa_reply_evt(hdev, skb);
- break;
+ subev = &hci_le_ev_table[ev->subevent];
+ if (!subev->func)
+ return;
- case HCI_EV_USER_CONFIRM_REQUEST:
- hci_user_confirm_request_evt(hdev, skb);
- break;
+ if (skb->len < subev->min_len) {
+ bt_dev_err(hdev, "unexpected subevent 0x%2.2x length: %u < %u",
+ ev->subevent, skb->len, subev->min_len);
+ return;
+ }
- case HCI_EV_USER_PASSKEY_REQUEST:
- hci_user_passkey_request_evt(hdev, skb);
- break;
+ /* Just warn if the length is over max_len: it may still be
+ * possible to partially parse the event, so leave it to the
+ * callback to decide if that is acceptable.
+ */
+ if (skb->len > subev->max_len)
+ bt_dev_warn(hdev, "unexpected subevent 0x%2.2x length: %u > %u",
+ ev->subevent, skb->len, subev->max_len);
+ data = hci_le_ev_skb_pull(hdev, skb, ev->subevent, subev->min_len);
+ if (!data)
+ return;
- case HCI_EV_USER_PASSKEY_NOTIFY:
- hci_user_passkey_notify_evt(hdev, skb);
- break;
+ subev->func(hdev, data, skb);
+}
- case HCI_EV_KEYPRESS_NOTIFY:
- hci_keypress_notify_evt(hdev, skb);
- break;
+/* Check whether @skb holds the event that completes a request and strip its
+ * headers. With a non-zero @event only the event code is compared. Otherwise
+ * the skb must be a Command Complete event whose opcode equals @opcode.
+ * Returns true on a match, false otherwise (including a NULL or too short
+ * skb, or a Command Status event).
+ */
+static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode,
+ u8 event, struct sk_buff *skb)
+{
+ struct hci_ev_cmd_complete *ev;
+ struct hci_event_hdr *hdr;
- case HCI_EV_SIMPLE_PAIR_COMPLETE:
- hci_simple_pair_complete_evt(hdev, skb);
- break;
+ if (!skb)
+ return false;
- case HCI_EV_REMOTE_HOST_FEATURES:
- hci_remote_host_features_evt(hdev, skb);
- break;
+ hdr = hci_ev_skb_pull(hdev, skb, event, sizeof(*hdr));
+ if (!hdr)
+ return false;
- case HCI_EV_LE_META:
- hci_le_meta_evt(hdev, skb);
- break;
+ if (event) {
+ if (hdr->evt != event)
+ return false;
+ return true;
+ }
- case HCI_EV_REMOTE_OOB_DATA_REQUEST:
- hci_remote_oob_data_request_evt(hdev, skb);
- break;
+ /* Check if request ended in Command Status - no way to retrieve
+ * any extra parameters in this case.
+ */
+ if (hdr->evt == HCI_EV_CMD_STATUS)
+ return false;
-#if IS_ENABLED(CONFIG_BT_HS)
- case HCI_EV_CHANNEL_SELECTED:
- hci_chan_selected_evt(hdev, skb);
- break;
+ if (hdr->evt != HCI_EV_CMD_COMPLETE) {
+ bt_dev_err(hdev, "last event is not cmd complete (0x%2.2x)",
+ hdr->evt);
+ return false;
+ }
- case HCI_EV_PHY_LINK_COMPLETE:
- hci_phy_link_complete_evt(hdev, skb);
- break;
+ ev = hci_cc_skb_pull(hdev, skb, opcode, sizeof(*ev));
+ if (!ev)
+ return false;
- case HCI_EV_LOGICAL_LINK_COMPLETE:
- hci_loglink_complete_evt(hdev, skb);
- break;
+ if (opcode != __le16_to_cpu(ev->opcode)) {
+ BT_DBG("opcode doesn't match (0x%2.2x != 0x%2.2x)", opcode,
+ __le16_to_cpu(ev->opcode));
+ return false;
+ }
- case HCI_EV_DISCONN_LOGICAL_LINK_COMPLETE:
- hci_disconn_loglink_complete_evt(hdev, skb);
- break;
+ return true;
+}
- case HCI_EV_DISCONN_PHY_LINK_COMPLETE:
- hci_disconn_phylink_complete_evt(hdev, skb);
- break;
-#endif
+/* Record why the controller woke the host: while suspended and with no wake
+ * reason stored yet, set hdev->wake_reason and, for connection and LE
+ * advertising report events, the address and address type that triggered it.
+ * Any other event is recorded as MGMT_WAKE_REASON_UNEXPECTED.
+ *
+ * NOTE(review): skb->data is dereferenced here without a visible skb->len
+ * check; presumably the caller has validated the length - confirm.
+ */
+static void hci_store_wake_reason(struct hci_dev *hdev, u8 event,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_advertising_info *adv;
+ struct hci_ev_le_direct_adv_info *direct_adv;
+ struct hci_ev_le_ext_adv_info *ext_adv;
+ const struct hci_ev_conn_complete *conn_complete = (void *)skb->data;
+ const struct hci_ev_conn_request *conn_request = (void *)skb->data;
- case HCI_EV_NUM_COMP_BLOCKS:
- hci_num_comp_blocks_evt(hdev, skb);
- break;
+ hci_dev_lock(hdev);
- default:
- BT_DBG("%s event 0x%2.2x", hdev->name, event);
- break;
+ /* If we are currently suspended and this is the first BT event seen,
+ * save the wake reason associated with the event.
+ */
+ if (!hdev->suspended || hdev->wake_reason)
+ goto unlock;
+
+ /* Default to remote wake. Values for wake_reason are documented in the
+ * Bluez mgmt api docs.
+ */
+ hdev->wake_reason = MGMT_WAKE_REASON_REMOTE_WAKE;
+
+ /* Once configured for remote wakeup, we should only wake up for
+ * reconnections. It's useful to see which device is waking us up so
+ * keep track of the bdaddr of the connection event that woke us up.
+ */
+ if (event == HCI_EV_CONN_REQUEST) {
+ bacpy(&hdev->wake_addr, &conn_request->bdaddr);
+ hdev->wake_addr_type = BDADDR_BREDR;
+ } else if (event == HCI_EV_CONN_COMPLETE) {
+ bacpy(&hdev->wake_addr, &conn_complete->bdaddr);
+ hdev->wake_addr_type = BDADDR_BREDR;
+ } else if (event == HCI_EV_LE_META) {
+ struct hci_ev_le_meta *le_ev = (void *)skb->data;
+ u8 subevent = le_ev->subevent;
+ u8 *ptr = &skb->data[sizeof(*le_ev)];
+ u8 num_reports = *ptr;
+
+ if ((subevent == HCI_EV_LE_ADVERTISING_REPORT ||
+ subevent == HCI_EV_LE_DIRECT_ADV_REPORT ||
+ subevent == HCI_EV_LE_EXT_ADV_REPORT) &&
+ num_reports) {
+ /* The first report starts right after the count byte */
+ adv = (void *)(ptr + 1);
+ direct_adv = (void *)(ptr + 1);
+ ext_adv = (void *)(ptr + 1);
+
+ switch (subevent) {
+ case HCI_EV_LE_ADVERTISING_REPORT:
+ bacpy(&hdev->wake_addr, &adv->bdaddr);
+ hdev->wake_addr_type = adv->bdaddr_type;
+ break;
+ case HCI_EV_LE_DIRECT_ADV_REPORT:
+ bacpy(&hdev->wake_addr, &direct_adv->bdaddr);
+ hdev->wake_addr_type = direct_adv->bdaddr_type;
+ break;
+ case HCI_EV_LE_EXT_ADV_REPORT:
+ bacpy(&hdev->wake_addr, &ext_adv->bdaddr);
+ hdev->wake_addr_type = ext_adv->bdaddr_type;
+ break;
+ }
+ }
+ } else {
+ hdev->wake_reason = MGMT_WAKE_REASON_UNEXPECTED;
+ }
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+/* Designated initializer for an event table entry at index _op that uses the
+ * plain callback (.func) and accepts a payload length in the range
+ * [_min_len, _max_len].
+ */
+#define HCI_EV_VL(_op, _func, _min_len, _max_len) \
+[_op] = { \
+ .req = false, \
+ .func = _func, \
+ .min_len = _min_len, \
+ .max_len = _max_len, \
+}
+
+/* Fixed-length variant: min_len and max_len are both _len. */
+#define HCI_EV(_op, _func, _len) \
+ HCI_EV_VL(_op, _func, _len, _len)
+
+/* Fixed-length variant whose payload is a struct hci_ev_status. */
+#define HCI_EV_STATUS(_op, _func) \
+ HCI_EV(_op, _func, sizeof(struct hci_ev_status))
+
+/* Same as HCI_EV_VL, but the entry uses the request-aware callback
+ * (.func_req) and sets .req to true.
+ */
+#define HCI_EV_REQ_VL(_op, _func, _min_len, _max_len) \
+[_op] = { \
+ .req = true, \
+ .func_req = _func, \
+ .min_len = _min_len, \
+ .max_len = _max_len, \
+}
+
+/* Fixed-length variant of HCI_EV_REQ_VL. */
+#define HCI_EV_REQ(_op, _func, _len) \
+ HCI_EV_REQ_VL(_op, _func, _len, _len)
+
+/* Entries in this table shall have their position according to the event opcode
+ * they handle, so the use of the macros above is recommended: they initialize
+ * each entry at its proper index using designated initializers, so that
+ * events without a callback function don't have to be entered.
+ */
+static const struct hci_ev {
+ bool req;
+ union {
+ void (*func)(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb);
+ void (*func_req)(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb, u16 *opcode, u8 *status,
+ hci_req_complete_t *req_complete,
+ hci_req_complete_skb_t *req_complete_skb);
+ };
+ u16 min_len;
+ u16 max_len;
+} hci_ev_table[U8_MAX + 1] = {
+ /* [0x01 = HCI_EV_INQUIRY_COMPLETE] */
+ HCI_EV_STATUS(HCI_EV_INQUIRY_COMPLETE, hci_inquiry_complete_evt),
+ /* [0x02 = HCI_EV_INQUIRY_RESULT] */
+ HCI_EV_VL(HCI_EV_INQUIRY_RESULT, hci_inquiry_result_evt,
+ sizeof(struct hci_ev_inquiry_result), HCI_MAX_EVENT_SIZE),
+ /* [0x03 = HCI_EV_CONN_COMPLETE] */
+ HCI_EV(HCI_EV_CONN_COMPLETE, hci_conn_complete_evt,
+ sizeof(struct hci_ev_conn_complete)),
+ /* [0x04 = HCI_EV_CONN_REQUEST] */
+ HCI_EV(HCI_EV_CONN_REQUEST, hci_conn_request_evt,
+ sizeof(struct hci_ev_conn_request)),
+ /* [0x05 = HCI_EV_DISCONN_COMPLETE] */
+ HCI_EV(HCI_EV_DISCONN_COMPLETE, hci_disconn_complete_evt,
+ sizeof(struct hci_ev_disconn_complete)),
+ /* [0x06 = HCI_EV_AUTH_COMPLETE] */
+ HCI_EV(HCI_EV_AUTH_COMPLETE, hci_auth_complete_evt,
+ sizeof(struct hci_ev_auth_complete)),
+ /* [0x07 = HCI_EV_REMOTE_NAME] */
+ HCI_EV(HCI_EV_REMOTE_NAME, hci_remote_name_evt,
+ sizeof(struct hci_ev_remote_name)),
+ /* [0x08 = HCI_EV_ENCRYPT_CHANGE] */
+ HCI_EV(HCI_EV_ENCRYPT_CHANGE, hci_encrypt_change_evt,
+ sizeof(struct hci_ev_encrypt_change)),
+ /* [0x09 = HCI_EV_CHANGE_LINK_KEY_COMPLETE] */
+ HCI_EV(HCI_EV_CHANGE_LINK_KEY_COMPLETE,
+ hci_change_link_key_complete_evt,
+ sizeof(struct hci_ev_change_link_key_complete)),
+ /* [0x0b = HCI_EV_REMOTE_FEATURES] */
+ HCI_EV(HCI_EV_REMOTE_FEATURES, hci_remote_features_evt,
+ sizeof(struct hci_ev_remote_features)),
+ /* [0x0e = HCI_EV_CMD_COMPLETE] */
+ HCI_EV_REQ_VL(HCI_EV_CMD_COMPLETE, hci_cmd_complete_evt,
+ sizeof(struct hci_ev_cmd_complete), HCI_MAX_EVENT_SIZE),
+ /* [0x0f = HCI_EV_CMD_STATUS] */
+ HCI_EV_REQ(HCI_EV_CMD_STATUS, hci_cmd_status_evt,
+ sizeof(struct hci_ev_cmd_status)),
+ /* [0x10 = HCI_EV_HARDWARE_ERROR] */
+ HCI_EV(HCI_EV_HARDWARE_ERROR, hci_hardware_error_evt,
+ sizeof(struct hci_ev_hardware_error)),
+ /* [0x12 = HCI_EV_ROLE_CHANGE] */
+ HCI_EV(HCI_EV_ROLE_CHANGE, hci_role_change_evt,
+ sizeof(struct hci_ev_role_change)),
+ /* [0x13 = HCI_EV_NUM_COMP_PKTS] */
+ HCI_EV_VL(HCI_EV_NUM_COMP_PKTS, hci_num_comp_pkts_evt,
+ sizeof(struct hci_ev_num_comp_pkts), HCI_MAX_EVENT_SIZE),
+ /* [0x14 = HCI_EV_MODE_CHANGE] */
+ HCI_EV(HCI_EV_MODE_CHANGE, hci_mode_change_evt,
+ sizeof(struct hci_ev_mode_change)),
+ /* [0x16 = HCI_EV_PIN_CODE_REQ] */
+ HCI_EV(HCI_EV_PIN_CODE_REQ, hci_pin_code_request_evt,
+ sizeof(struct hci_ev_pin_code_req)),
+ /* [0x17 = HCI_EV_LINK_KEY_REQ] */
+ HCI_EV(HCI_EV_LINK_KEY_REQ, hci_link_key_request_evt,
+ sizeof(struct hci_ev_link_key_req)),
+ /* [0x18 = HCI_EV_LINK_KEY_NOTIFY] */
+ HCI_EV(HCI_EV_LINK_KEY_NOTIFY, hci_link_key_notify_evt,
+ sizeof(struct hci_ev_link_key_notify)),
+ /* [0x1c = HCI_EV_CLOCK_OFFSET] */
+ HCI_EV(HCI_EV_CLOCK_OFFSET, hci_clock_offset_evt,
+ sizeof(struct hci_ev_clock_offset)),
+ /* [0x1d = HCI_EV_PKT_TYPE_CHANGE] */
+ HCI_EV(HCI_EV_PKT_TYPE_CHANGE, hci_pkt_type_change_evt,
+ sizeof(struct hci_ev_pkt_type_change)),
+ /* [0x20 = HCI_EV_PSCAN_REP_MODE] */
+ HCI_EV(HCI_EV_PSCAN_REP_MODE, hci_pscan_rep_mode_evt,
+ sizeof(struct hci_ev_pscan_rep_mode)),
+ /* [0x22 = HCI_EV_INQUIRY_RESULT_WITH_RSSI] */
+ HCI_EV_VL(HCI_EV_INQUIRY_RESULT_WITH_RSSI,
+ hci_inquiry_result_with_rssi_evt,
+ sizeof(struct hci_ev_inquiry_result_rssi),
+ HCI_MAX_EVENT_SIZE),
+ /* [0x23 = HCI_EV_REMOTE_EXT_FEATURES] */
+ HCI_EV(HCI_EV_REMOTE_EXT_FEATURES, hci_remote_ext_features_evt,
+ sizeof(struct hci_ev_remote_ext_features)),
+ /* [0x2c = HCI_EV_SYNC_CONN_COMPLETE] */
+ HCI_EV(HCI_EV_SYNC_CONN_COMPLETE, hci_sync_conn_complete_evt,
+ sizeof(struct hci_ev_sync_conn_complete)),
+ /* [0x2f = HCI_EV_EXTENDED_INQUIRY_RESULT] */
+ HCI_EV_VL(HCI_EV_EXTENDED_INQUIRY_RESULT,
+ hci_extended_inquiry_result_evt,
+ sizeof(struct hci_ev_ext_inquiry_result), HCI_MAX_EVENT_SIZE),
+ /* [0x30 = HCI_EV_KEY_REFRESH_COMPLETE] */
+ HCI_EV(HCI_EV_KEY_REFRESH_COMPLETE, hci_key_refresh_complete_evt,
+ sizeof(struct hci_ev_key_refresh_complete)),
+ /* [0x31 = HCI_EV_IO_CAPA_REQUEST] */
+ HCI_EV(HCI_EV_IO_CAPA_REQUEST, hci_io_capa_request_evt,
+ sizeof(struct hci_ev_io_capa_request)),
+ /* [0x32 = HCI_EV_IO_CAPA_REPLY] */
+ HCI_EV(HCI_EV_IO_CAPA_REPLY, hci_io_capa_reply_evt,
+ sizeof(struct hci_ev_io_capa_reply)),
+ /* [0x33 = HCI_EV_USER_CONFIRM_REQUEST] */
+ HCI_EV(HCI_EV_USER_CONFIRM_REQUEST, hci_user_confirm_request_evt,
+ sizeof(struct hci_ev_user_confirm_req)),
+ /* [0x34 = HCI_EV_USER_PASSKEY_REQUEST] */
+ HCI_EV(HCI_EV_USER_PASSKEY_REQUEST, hci_user_passkey_request_evt,
+ sizeof(struct hci_ev_user_passkey_req)),
+ /* [0x35 = HCI_EV_REMOTE_OOB_DATA_REQUEST] */
+ HCI_EV(HCI_EV_REMOTE_OOB_DATA_REQUEST, hci_remote_oob_data_request_evt,
+ sizeof(struct hci_ev_remote_oob_data_request)),
+ /* [0x36 = HCI_EV_SIMPLE_PAIR_COMPLETE] */
+ HCI_EV(HCI_EV_SIMPLE_PAIR_COMPLETE, hci_simple_pair_complete_evt,
+ sizeof(struct hci_ev_simple_pair_complete)),
+ /* [0x3b = HCI_EV_USER_PASSKEY_NOTIFY] */
+ HCI_EV(HCI_EV_USER_PASSKEY_NOTIFY, hci_user_passkey_notify_evt,
+ sizeof(struct hci_ev_user_passkey_notify)),
+ /* [0x3c = HCI_EV_KEYPRESS_NOTIFY] */
+ HCI_EV(HCI_EV_KEYPRESS_NOTIFY, hci_keypress_notify_evt,
+ sizeof(struct hci_ev_keypress_notify)),
+ /* [0x3d = HCI_EV_REMOTE_HOST_FEATURES] */
+ HCI_EV(HCI_EV_REMOTE_HOST_FEATURES, hci_remote_host_features_evt,
+ sizeof(struct hci_ev_remote_host_features)),
+ /* [0x3e = HCI_EV_LE_META] */
+ HCI_EV_REQ_VL(HCI_EV_LE_META, hci_le_meta_evt,
+ sizeof(struct hci_ev_le_meta), HCI_MAX_EVENT_SIZE),
+ /* [0xff = HCI_EV_VENDOR] */
+ HCI_EV_VL(HCI_EV_VENDOR, msft_vendor_evt, 0, HCI_MAX_EVENT_SIZE),
+};
+
+static void hci_event_func(struct hci_dev *hdev, u8 event, struct sk_buff *skb,
+ u16 *opcode, u8 *status,
+ hci_req_complete_t *req_complete,
+ hci_req_complete_skb_t *req_complete_skb)
+{
+ const struct hci_ev *ev = &hci_ev_table[event];
+ void *data;
+
+ if (!ev->func)
+ return;
+
+ if (skb->len < ev->min_len) {
+ bt_dev_err(hdev, "unexpected event 0x%2.2x length: %u < %u",
+ event, skb->len, ev->min_len);
+ return;
+ }
+
+ /* Just warn if the length is over max_len: it may still be
+ * possible to partially parse the event, so leave it to the
+ * callback to decide if that is acceptable.
+ */
+ if (skb->len > ev->max_len)
+ bt_dev_warn_ratelimited(hdev,
+ "unexpected event 0x%2.2x length: %u > %u",
+ event, skb->len, ev->max_len);
+
+ data = hci_ev_skb_pull(hdev, skb, event, ev->min_len);
+ if (!data)
+ return;
+
+ if (ev->req)
+ ev->func_req(hdev, data, skb, opcode, status, req_complete,
+ req_complete_skb);
+ else
+ ev->func(hdev, data, skb);
+}
+
+void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_event_hdr *hdr = (void *) skb->data;
+ hci_req_complete_t req_complete = NULL;
+ hci_req_complete_skb_t req_complete_skb = NULL;
+ struct sk_buff *orig_skb = NULL;
+ u8 status = 0, event, req_evt = 0;
+ u16 opcode = HCI_OP_NOP;
+
+ if (skb->len < sizeof(*hdr)) {
+ bt_dev_err(hdev, "Malformed HCI Event");
+ goto done;
+ }
+
+ hci_dev_lock(hdev);
+ kfree_skb(hdev->recv_event);
+ hdev->recv_event = skb_clone(skb, GFP_KERNEL);
+ hci_dev_unlock(hdev);
+
+ event = hdr->evt;
+ if (!event) {
+ bt_dev_warn(hdev, "Received unexpected HCI Event 0x%2.2x",
+ event);
+ goto done;
+ }
+
+ /* Only match event if command OGF is not for LE */
+ if (hdev->req_skb &&
+ hci_opcode_ogf(hci_skb_opcode(hdev->req_skb)) != 0x08 &&
+ hci_skb_event(hdev->req_skb) == event) {
+ hci_req_cmd_complete(hdev, hci_skb_opcode(hdev->req_skb),
+ status, &req_complete, &req_complete_skb);
+ req_evt = event;
}
+ /* If it looks like we might end up having to call
+ * req_complete_skb, store a pristine copy of the skb since the
+ * various handlers may modify the original one through
+ * skb_pull() calls, etc.
+ */
+ if (req_complete_skb || event == HCI_EV_CMD_STATUS ||
+ event == HCI_EV_CMD_COMPLETE)
+ orig_skb = skb_clone(skb, GFP_KERNEL);
+
+ skb_pull(skb, HCI_EVENT_HDR_SIZE);
+
+ /* Store wake reason if we're suspended */
+ hci_store_wake_reason(hdev, event, skb);
+
+ bt_dev_dbg(hdev, "event 0x%2.2x", event);
+
+ hci_event_func(hdev, event, skb, &opcode, &status, &req_complete,
+ &req_complete_skb);
+
if (req_complete) {
req_complete(hdev, status, opcode);
} else if (req_complete_skb) {
@@ -5911,6 +7783,7 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb)
req_complete_skb(hdev, status, opcode, orig_skb);
}
+done:
kfree_skb(orig_skb);
kfree_skb(skb);
hdev->stat.evt_rx++;
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
deleted file mode 100644
index e8c9ef1e1922..000000000000
--- a/net/bluetooth/hci_request.c
+++ /dev/null
@@ -1,2818 +0,0 @@
-/*
- BlueZ - Bluetooth protocol stack for Linux
-
- Copyright (C) 2014 Intel Corporation
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 as
- published by the Free Software Foundation;
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
- IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
- SOFTWARE IS DISCLAIMED.
-*/
-
-#include <linux/sched/signal.h>
-
-#include <net/bluetooth/bluetooth.h>
-#include <net/bluetooth/hci_core.h>
-#include <net/bluetooth/mgmt.h>
-
-#include "smp.h"
-#include "hci_request.h"
-
-#define HCI_REQ_DONE 0
-#define HCI_REQ_PEND 1
-#define HCI_REQ_CANCELED 2
-
-void hci_req_init(struct hci_request *req, struct hci_dev *hdev)
-{
- skb_queue_head_init(&req->cmd_q);
- req->hdev = hdev;
- req->err = 0;
-}
-
-void hci_req_purge(struct hci_request *req)
-{
- skb_queue_purge(&req->cmd_q);
-}
-
-static int req_run(struct hci_request *req, hci_req_complete_t complete,
- hci_req_complete_skb_t complete_skb)
-{
- struct hci_dev *hdev = req->hdev;
- struct sk_buff *skb;
- unsigned long flags;
-
- BT_DBG("length %u", skb_queue_len(&req->cmd_q));
-
- /* If an error occurred during request building, remove all HCI
- * commands queued on the HCI request queue.
- */
- if (req->err) {
- skb_queue_purge(&req->cmd_q);
- return req->err;
- }
-
- /* Do not allow empty requests */
- if (skb_queue_empty(&req->cmd_q))
- return -ENODATA;
-
- skb = skb_peek_tail(&req->cmd_q);
- if (complete) {
- bt_cb(skb)->hci.req_complete = complete;
- } else if (complete_skb) {
- bt_cb(skb)->hci.req_complete_skb = complete_skb;
- bt_cb(skb)->hci.req_flags |= HCI_REQ_SKB;
- }
-
- spin_lock_irqsave(&hdev->cmd_q.lock, flags);
- skb_queue_splice_tail(&req->cmd_q, &hdev->cmd_q);
- spin_unlock_irqrestore(&hdev->cmd_q.lock, flags);
-
- queue_work(hdev->workqueue, &hdev->cmd_work);
-
- return 0;
-}
-
-int hci_req_run(struct hci_request *req, hci_req_complete_t complete)
-{
- return req_run(req, complete, NULL);
-}
-
-int hci_req_run_skb(struct hci_request *req, hci_req_complete_skb_t complete)
-{
- return req_run(req, NULL, complete);
-}
-
-static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode,
- struct sk_buff *skb)
-{
- BT_DBG("%s result 0x%2.2x", hdev->name, result);
-
- if (hdev->req_status == HCI_REQ_PEND) {
- hdev->req_result = result;
- hdev->req_status = HCI_REQ_DONE;
- if (skb)
- hdev->req_skb = skb_get(skb);
- wake_up_interruptible(&hdev->req_wait_q);
- }
-}
-
-void hci_req_sync_cancel(struct hci_dev *hdev, int err)
-{
- BT_DBG("%s err 0x%2.2x", hdev->name, err);
-
- if (hdev->req_status == HCI_REQ_PEND) {
- hdev->req_result = err;
- hdev->req_status = HCI_REQ_CANCELED;
- wake_up_interruptible(&hdev->req_wait_q);
- }
-}
-
-struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
- const void *param, u8 event, u32 timeout)
-{
- struct hci_request req;
- struct sk_buff *skb;
- int err = 0;
-
- BT_DBG("%s", hdev->name);
-
- hci_req_init(&req, hdev);
-
- hci_req_add_ev(&req, opcode, plen, param, event);
-
- hdev->req_status = HCI_REQ_PEND;
-
- err = hci_req_run_skb(&req, hci_req_sync_complete);
- if (err < 0)
- return ERR_PTR(err);
-
- err = wait_event_interruptible_timeout(hdev->req_wait_q,
- hdev->req_status != HCI_REQ_PEND, timeout);
-
- if (err == -ERESTARTSYS)
- return ERR_PTR(-EINTR);
-
- switch (hdev->req_status) {
- case HCI_REQ_DONE:
- err = -bt_to_errno(hdev->req_result);
- break;
-
- case HCI_REQ_CANCELED:
- err = -hdev->req_result;
- break;
-
- default:
- err = -ETIMEDOUT;
- break;
- }
-
- hdev->req_status = hdev->req_result = 0;
- skb = hdev->req_skb;
- hdev->req_skb = NULL;
-
- BT_DBG("%s end: err %d", hdev->name, err);
-
- if (err < 0) {
- kfree_skb(skb);
- return ERR_PTR(err);
- }
-
- if (!skb)
- return ERR_PTR(-ENODATA);
-
- return skb;
-}
-EXPORT_SYMBOL(__hci_cmd_sync_ev);
-
-struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen,
- const void *param, u32 timeout)
-{
- return __hci_cmd_sync_ev(hdev, opcode, plen, param, 0, timeout);
-}
-EXPORT_SYMBOL(__hci_cmd_sync);
-
-/* Execute request and wait for completion. */
-int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
- unsigned long opt),
- unsigned long opt, u32 timeout, u8 *hci_status)
-{
- struct hci_request req;
- int err = 0;
-
- BT_DBG("%s start", hdev->name);
-
- hci_req_init(&req, hdev);
-
- hdev->req_status = HCI_REQ_PEND;
-
- err = func(&req, opt);
- if (err) {
- if (hci_status)
- *hci_status = HCI_ERROR_UNSPECIFIED;
- return err;
- }
-
- err = hci_req_run_skb(&req, hci_req_sync_complete);
- if (err < 0) {
- hdev->req_status = 0;
-
- /* ENODATA means the HCI request command queue is empty.
- * This can happen when a request with conditionals doesn't
- * trigger any commands to be sent. This is normal behavior
- * and should not trigger an error return.
- */
- if (err == -ENODATA) {
- if (hci_status)
- *hci_status = 0;
- return 0;
- }
-
- if (hci_status)
- *hci_status = HCI_ERROR_UNSPECIFIED;
-
- return err;
- }
-
- err = wait_event_interruptible_timeout(hdev->req_wait_q,
- hdev->req_status != HCI_REQ_PEND, timeout);
-
- if (err == -ERESTARTSYS)
- return -EINTR;
-
- switch (hdev->req_status) {
- case HCI_REQ_DONE:
- err = -bt_to_errno(hdev->req_result);
- if (hci_status)
- *hci_status = hdev->req_result;
- break;
-
- case HCI_REQ_CANCELED:
- err = -hdev->req_result;
- if (hci_status)
- *hci_status = HCI_ERROR_UNSPECIFIED;
- break;
-
- default:
- err = -ETIMEDOUT;
- if (hci_status)
- *hci_status = HCI_ERROR_UNSPECIFIED;
- break;
- }
-
- kfree_skb(hdev->req_skb);
- hdev->req_skb = NULL;
- hdev->req_status = hdev->req_result = 0;
-
- BT_DBG("%s end: err %d", hdev->name, err);
-
- return err;
-}
-
-int hci_req_sync(struct hci_dev *hdev, int (*req)(struct hci_request *req,
- unsigned long opt),
- unsigned long opt, u32 timeout, u8 *hci_status)
-{
- int ret;
-
- if (!test_bit(HCI_UP, &hdev->flags))
- return -ENETDOWN;
-
- /* Serialize all requests */
- hci_req_sync_lock(hdev);
- ret = __hci_req_sync(hdev, req, opt, timeout, hci_status);
- hci_req_sync_unlock(hdev);
-
- return ret;
-}
-
-struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen,
- const void *param)
-{
- int len = HCI_COMMAND_HDR_SIZE + plen;
- struct hci_command_hdr *hdr;
- struct sk_buff *skb;
-
- skb = bt_skb_alloc(len, GFP_ATOMIC);
- if (!skb)
- return NULL;
-
- hdr = skb_put(skb, HCI_COMMAND_HDR_SIZE);
- hdr->opcode = cpu_to_le16(opcode);
- hdr->plen = plen;
-
- if (plen)
- skb_put_data(skb, param, plen);
-
- BT_DBG("skb len %d", skb->len);
-
- hci_skb_pkt_type(skb) = HCI_COMMAND_PKT;
- hci_skb_opcode(skb) = opcode;
-
- return skb;
-}
-
-/* Queue a command to an asynchronous HCI request */
-void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen,
- const void *param, u8 event)
-{
- struct hci_dev *hdev = req->hdev;
- struct sk_buff *skb;
-
- BT_DBG("%s opcode 0x%4.4x plen %d", hdev->name, opcode, plen);
-
- /* If an error occurred during request building, there is no point in
- * queueing the HCI command. We can simply return.
- */
- if (req->err)
- return;
-
- skb = hci_prepare_cmd(hdev, opcode, plen, param);
- if (!skb) {
- bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)",
- opcode);
- req->err = -ENOMEM;
- return;
- }
-
- if (skb_queue_empty(&req->cmd_q))
- bt_cb(skb)->hci.req_flags |= HCI_REQ_START;
-
- bt_cb(skb)->hci.req_event = event;
-
- skb_queue_tail(&req->cmd_q, skb);
-}
-
-void hci_req_add(struct hci_request *req, u16 opcode, u32 plen,
- const void *param)
-{
- hci_req_add_ev(req, opcode, plen, param, 0);
-}
-
-void __hci_req_write_fast_connectable(struct hci_request *req, bool enable)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_write_page_scan_activity acp;
- u8 type;
-
- if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
- return;
-
- if (hdev->hci_ver < BLUETOOTH_VER_1_2)
- return;
-
- if (enable) {
- type = PAGE_SCAN_TYPE_INTERLACED;
-
- /* 160 msec page scan interval */
- acp.interval = cpu_to_le16(0x0100);
- } else {
- type = PAGE_SCAN_TYPE_STANDARD; /* default */
-
- /* default 1.28 sec page scan */
- acp.interval = cpu_to_le16(0x0800);
- }
-
- acp.window = cpu_to_le16(0x0012);
-
- if (__cpu_to_le16(hdev->page_scan_interval) != acp.interval ||
- __cpu_to_le16(hdev->page_scan_window) != acp.window)
- hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY,
- sizeof(acp), &acp);
-
- if (hdev->page_scan_type != type)
- hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_TYPE, 1, &type);
-}
-
-/* This function controls the background scanning based on hdev->pend_le_conns
- * list. If there are pending LE connection we start the background scanning,
- * otherwise we stop it.
- *
- * This function requires the caller holds hdev->lock.
- */
-static void __hci_update_background_scan(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
-
- if (!test_bit(HCI_UP, &hdev->flags) ||
- test_bit(HCI_INIT, &hdev->flags) ||
- hci_dev_test_flag(hdev, HCI_SETUP) ||
- hci_dev_test_flag(hdev, HCI_CONFIG) ||
- hci_dev_test_flag(hdev, HCI_AUTO_OFF) ||
- hci_dev_test_flag(hdev, HCI_UNREGISTER))
- return;
-
- /* No point in doing scanning if LE support hasn't been enabled */
- if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
- return;
-
- /* If discovery is active don't interfere with it */
- if (hdev->discovery.state != DISCOVERY_STOPPED)
- return;
-
- /* Reset RSSI and UUID filters when starting background scanning
- * since these filters are meant for service discovery only.
- *
- * The Start Discovery and Start Service Discovery operations
- * ensure to set proper values for RSSI threshold and UUID
- * filter list. So it is safe to just reset them here.
- */
- hci_discovery_filter_clear(hdev);
-
- if (list_empty(&hdev->pend_le_conns) &&
- list_empty(&hdev->pend_le_reports)) {
- /* If there is no pending LE connections or devices
- * to be scanned for, we should stop the background
- * scanning.
- */
-
- /* If controller is not scanning we are done. */
- if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
- return;
-
- hci_req_add_le_scan_disable(req);
-
- BT_DBG("%s stopping background scanning", hdev->name);
- } else {
- /* If there is at least one pending LE connection, we should
- * keep the background scan running.
- */
-
- /* If controller is connecting, we should not start scanning
- * since some controllers are not able to scan and connect at
- * the same time.
- */
- if (hci_lookup_le_connect(hdev))
- return;
-
- /* If controller is currently scanning, we stop it to ensure we
- * don't miss any advertising (due to duplicates filter).
- */
- if (hci_dev_test_flag(hdev, HCI_LE_SCAN))
- hci_req_add_le_scan_disable(req);
-
- hci_req_add_le_passive_scan(req);
-
- BT_DBG("%s starting background scanning", hdev->name);
- }
-}
-
-void __hci_req_update_name(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_write_local_name cp;
-
- memcpy(cp.name, hdev->dev_name, sizeof(cp.name));
-
- hci_req_add(req, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp);
-}
-
-#define PNP_INFO_SVCLASS_ID 0x1200
-
-static u8 *create_uuid16_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
-{
- u8 *ptr = data, *uuids_start = NULL;
- struct bt_uuid *uuid;
-
- if (len < 4)
- return ptr;
-
- list_for_each_entry(uuid, &hdev->uuids, list) {
- u16 uuid16;
-
- if (uuid->size != 16)
- continue;
-
- uuid16 = get_unaligned_le16(&uuid->uuid[12]);
- if (uuid16 < 0x1100)
- continue;
-
- if (uuid16 == PNP_INFO_SVCLASS_ID)
- continue;
-
- if (!uuids_start) {
- uuids_start = ptr;
- uuids_start[0] = 1;
- uuids_start[1] = EIR_UUID16_ALL;
- ptr += 2;
- }
-
- /* Stop if not enough space to put next UUID */
- if ((ptr - data) + sizeof(u16) > len) {
- uuids_start[1] = EIR_UUID16_SOME;
- break;
- }
-
- *ptr++ = (uuid16 & 0x00ff);
- *ptr++ = (uuid16 & 0xff00) >> 8;
- uuids_start[0] += sizeof(uuid16);
- }
-
- return ptr;
-}
-
-static u8 *create_uuid32_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
-{
- u8 *ptr = data, *uuids_start = NULL;
- struct bt_uuid *uuid;
-
- if (len < 6)
- return ptr;
-
- list_for_each_entry(uuid, &hdev->uuids, list) {
- if (uuid->size != 32)
- continue;
-
- if (!uuids_start) {
- uuids_start = ptr;
- uuids_start[0] = 1;
- uuids_start[1] = EIR_UUID32_ALL;
- ptr += 2;
- }
-
- /* Stop if not enough space to put next UUID */
- if ((ptr - data) + sizeof(u32) > len) {
- uuids_start[1] = EIR_UUID32_SOME;
- break;
- }
-
- memcpy(ptr, &uuid->uuid[12], sizeof(u32));
- ptr += sizeof(u32);
- uuids_start[0] += sizeof(u32);
- }
-
- return ptr;
-}
-
-static u8 *create_uuid128_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
-{
- u8 *ptr = data, *uuids_start = NULL;
- struct bt_uuid *uuid;
-
- if (len < 18)
- return ptr;
-
- list_for_each_entry(uuid, &hdev->uuids, list) {
- if (uuid->size != 128)
- continue;
-
- if (!uuids_start) {
- uuids_start = ptr;
- uuids_start[0] = 1;
- uuids_start[1] = EIR_UUID128_ALL;
- ptr += 2;
- }
-
- /* Stop if not enough space to put next UUID */
- if ((ptr - data) + 16 > len) {
- uuids_start[1] = EIR_UUID128_SOME;
- break;
- }
-
- memcpy(ptr, uuid->uuid, 16);
- ptr += 16;
- uuids_start[0] += 16;
- }
-
- return ptr;
-}
-
-static void create_eir(struct hci_dev *hdev, u8 *data)
-{
- u8 *ptr = data;
- size_t name_len;
-
- name_len = strlen(hdev->dev_name);
-
- if (name_len > 0) {
- /* EIR Data type */
- if (name_len > 48) {
- name_len = 48;
- ptr[1] = EIR_NAME_SHORT;
- } else
- ptr[1] = EIR_NAME_COMPLETE;
-
- /* EIR Data length */
- ptr[0] = name_len + 1;
-
- memcpy(ptr + 2, hdev->dev_name, name_len);
-
- ptr += (name_len + 2);
- }
-
- if (hdev->inq_tx_power != HCI_TX_POWER_INVALID) {
- ptr[0] = 2;
- ptr[1] = EIR_TX_POWER;
- ptr[2] = (u8) hdev->inq_tx_power;
-
- ptr += 3;
- }
-
- if (hdev->devid_source > 0) {
- ptr[0] = 9;
- ptr[1] = EIR_DEVICE_ID;
-
- put_unaligned_le16(hdev->devid_source, ptr + 2);
- put_unaligned_le16(hdev->devid_vendor, ptr + 4);
- put_unaligned_le16(hdev->devid_product, ptr + 6);
- put_unaligned_le16(hdev->devid_version, ptr + 8);
-
- ptr += 10;
- }
-
- ptr = create_uuid16_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data));
- ptr = create_uuid32_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data));
- ptr = create_uuid128_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data));
-}
-
-void __hci_req_update_eir(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_write_eir cp;
-
- if (!hdev_is_powered(hdev))
- return;
-
- if (!lmp_ext_inq_capable(hdev))
- return;
-
- if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
- return;
-
- if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE))
- return;
-
- memset(&cp, 0, sizeof(cp));
-
- create_eir(hdev, cp.data);
-
- if (memcmp(cp.data, hdev->eir, sizeof(cp.data)) == 0)
- return;
-
- memcpy(hdev->eir, cp.data, sizeof(cp.data));
-
- hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp);
-}
-
-void hci_req_add_le_scan_disable(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
-
- if (use_ext_scan(hdev)) {
- struct hci_cp_le_set_ext_scan_enable cp;
-
- memset(&cp, 0, sizeof(cp));
- cp.enable = LE_SCAN_DISABLE;
- hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE, sizeof(cp),
- &cp);
- } else {
- struct hci_cp_le_set_scan_enable cp;
-
- memset(&cp, 0, sizeof(cp));
- cp.enable = LE_SCAN_DISABLE;
- hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
- }
-}
-
-static void add_to_white_list(struct hci_request *req,
- struct hci_conn_params *params)
-{
- struct hci_cp_le_add_to_white_list cp;
-
- cp.bdaddr_type = params->addr_type;
- bacpy(&cp.bdaddr, &params->addr);
-
- hci_req_add(req, HCI_OP_LE_ADD_TO_WHITE_LIST, sizeof(cp), &cp);
-}
-
-static u8 update_white_list(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_conn_params *params;
- struct bdaddr_list *b;
- uint8_t white_list_entries = 0;
-
- /* Go through the current white list programmed into the
- * controller one by one and check if that address is still
- * in the list of pending connections or list of devices to
- * report. If not present in either list, then queue the
- * command to remove it from the controller.
- */
- list_for_each_entry(b, &hdev->le_white_list, list) {
- /* If the device is neither in pend_le_conns nor
- * pend_le_reports then remove it from the whitelist.
- */
- if (!hci_pend_le_action_lookup(&hdev->pend_le_conns,
- &b->bdaddr, b->bdaddr_type) &&
- !hci_pend_le_action_lookup(&hdev->pend_le_reports,
- &b->bdaddr, b->bdaddr_type)) {
- struct hci_cp_le_del_from_white_list cp;
-
- cp.bdaddr_type = b->bdaddr_type;
- bacpy(&cp.bdaddr, &b->bdaddr);
-
- hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST,
- sizeof(cp), &cp);
- continue;
- }
-
- if (hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) {
- /* White list can not be used with RPAs */
- return 0x00;
- }
-
- white_list_entries++;
- }
-
- /* Since all no longer valid white list entries have been
- * removed, walk through the list of pending connections
- * and ensure that any new device gets programmed into
- * the controller.
- *
- * If the list of the devices is larger than the list of
- * available white list entries in the controller, then
- * just abort and return filer policy value to not use the
- * white list.
- */
- list_for_each_entry(params, &hdev->pend_le_conns, action) {
- if (hci_bdaddr_list_lookup(&hdev->le_white_list,
- &params->addr, params->addr_type))
- continue;
-
- if (white_list_entries >= hdev->le_white_list_size) {
- /* Select filter policy to accept all advertising */
- return 0x00;
- }
-
- if (hci_find_irk_by_addr(hdev, &params->addr,
- params->addr_type)) {
- /* White list can not be used with RPAs */
- return 0x00;
- }
-
- white_list_entries++;
- add_to_white_list(req, params);
- }
-
- /* After adding all new pending connections, walk through
- * the list of pending reports and also add these to the
- * white list if there is still space.
- */
- list_for_each_entry(params, &hdev->pend_le_reports, action) {
- if (hci_bdaddr_list_lookup(&hdev->le_white_list,
- &params->addr, params->addr_type))
- continue;
-
- if (white_list_entries >= hdev->le_white_list_size) {
- /* Select filter policy to accept all advertising */
- return 0x00;
- }
-
- if (hci_find_irk_by_addr(hdev, &params->addr,
- params->addr_type)) {
- /* White list can not be used with RPAs */
- return 0x00;
- }
-
- white_list_entries++;
- add_to_white_list(req, params);
- }
-
- /* Select filter policy to use white list */
- return 0x01;
-}
-
-static bool scan_use_rpa(struct hci_dev *hdev)
-{
- return hci_dev_test_flag(hdev, HCI_PRIVACY);
-}
-
-static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval,
- u16 window, u8 own_addr_type, u8 filter_policy)
-{
- struct hci_dev *hdev = req->hdev;
-
- /* Use ext scanning if set ext scan param and ext scan enable is
- * supported
- */
- if (use_ext_scan(hdev)) {
- struct hci_cp_le_set_ext_scan_params *ext_param_cp;
- struct hci_cp_le_set_ext_scan_enable ext_enable_cp;
- struct hci_cp_le_scan_phy_params *phy_params;
- u8 data[sizeof(*ext_param_cp) + sizeof(*phy_params) * 2];
- u32 plen;
-
- ext_param_cp = (void *)data;
- phy_params = (void *)ext_param_cp->data;
-
- memset(ext_param_cp, 0, sizeof(*ext_param_cp));
- ext_param_cp->own_addr_type = own_addr_type;
- ext_param_cp->filter_policy = filter_policy;
-
- plen = sizeof(*ext_param_cp);
-
- if (scan_1m(hdev) || scan_2m(hdev)) {
- ext_param_cp->scanning_phys |= LE_SCAN_PHY_1M;
-
- memset(phy_params, 0, sizeof(*phy_params));
- phy_params->type = type;
- phy_params->interval = cpu_to_le16(interval);
- phy_params->window = cpu_to_le16(window);
-
- plen += sizeof(*phy_params);
- phy_params++;
- }
-
- if (scan_coded(hdev)) {
- ext_param_cp->scanning_phys |= LE_SCAN_PHY_CODED;
-
- memset(phy_params, 0, sizeof(*phy_params));
- phy_params->type = type;
- phy_params->interval = cpu_to_le16(interval);
- phy_params->window = cpu_to_le16(window);
-
- plen += sizeof(*phy_params);
- phy_params++;
- }
-
- hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_PARAMS,
- plen, ext_param_cp);
-
- memset(&ext_enable_cp, 0, sizeof(ext_enable_cp));
- ext_enable_cp.enable = LE_SCAN_ENABLE;
- ext_enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
-
- hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE,
- sizeof(ext_enable_cp), &ext_enable_cp);
- } else {
- struct hci_cp_le_set_scan_param param_cp;
- struct hci_cp_le_set_scan_enable enable_cp;
-
- memset(&param_cp, 0, sizeof(param_cp));
- param_cp.type = type;
- param_cp.interval = cpu_to_le16(interval);
- param_cp.window = cpu_to_le16(window);
- param_cp.own_address_type = own_addr_type;
- param_cp.filter_policy = filter_policy;
- hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp),
- &param_cp);
-
- memset(&enable_cp, 0, sizeof(enable_cp));
- enable_cp.enable = LE_SCAN_ENABLE;
- enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
- hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp),
- &enable_cp);
- }
-}
-
-void hci_req_add_le_passive_scan(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- u8 own_addr_type;
- u8 filter_policy;
-
- /* Set require_privacy to false since no SCAN_REQ are send
- * during passive scanning. Not using an non-resolvable address
- * here is important so that peer devices using direct
- * advertising with our address will be correctly reported
- * by the controller.
- */
- if (hci_update_random_address(req, false, scan_use_rpa(hdev),
- &own_addr_type))
- return;
-
- /* Adding or removing entries from the white list must
- * happen before enabling scanning. The controller does
- * not allow white list modification while scanning.
- */
- filter_policy = update_white_list(req);
-
- /* When the controller is using random resolvable addresses and
- * with that having LE privacy enabled, then controllers with
- * Extended Scanner Filter Policies support can now enable support
- * for handling directed advertising.
- *
- * So instead of using filter polices 0x00 (no whitelist)
- * and 0x01 (whitelist enabled) use the new filter policies
- * 0x02 (no whitelist) and 0x03 (whitelist enabled).
- */
- if (hci_dev_test_flag(hdev, HCI_PRIVACY) &&
- (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY))
- filter_policy |= 0x02;
-
- hci_req_start_scan(req, LE_SCAN_PASSIVE, hdev->le_scan_interval,
- hdev->le_scan_window, own_addr_type, filter_policy);
-}
-
-static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance)
-{
- struct adv_info *adv_instance;
-
- /* Ignore instance 0 */
- if (instance == 0x00)
- return 0;
-
- adv_instance = hci_find_adv_instance(hdev, instance);
- if (!adv_instance)
- return 0;
-
- /* TODO: Take into account the "appearance" and "local-name" flags here.
- * These are currently being ignored as they are not supported.
- */
- return adv_instance->scan_rsp_len;
-}
-
-static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev)
-{
- u8 instance = hdev->cur_adv_instance;
- struct adv_info *adv_instance;
-
- /* Ignore instance 0 */
- if (instance == 0x00)
- return 0;
-
- adv_instance = hci_find_adv_instance(hdev, instance);
- if (!adv_instance)
- return 0;
-
- /* TODO: Take into account the "appearance" and "local-name" flags here.
- * These are currently being ignored as they are not supported.
- */
- return adv_instance->scan_rsp_len;
-}
-
-void __hci_req_disable_advertising(struct hci_request *req)
-{
- if (ext_adv_capable(req->hdev)) {
- struct hci_cp_le_set_ext_adv_enable cp;
-
- cp.enable = 0x00;
- /* Disable all sets since we only support one set at the moment */
- cp.num_of_sets = 0x00;
-
- hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(cp), &cp);
- } else {
- u8 enable = 0x00;
-
- hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
- }
-}
-
-static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance)
-{
- u32 flags;
- struct adv_info *adv_instance;
-
- if (instance == 0x00) {
- /* Instance 0 always manages the "Tx Power" and "Flags"
- * fields
- */
- flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS;
-
- /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting
- * corresponds to the "connectable" instance flag.
- */
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE))
- flags |= MGMT_ADV_FLAG_CONNECTABLE;
-
- if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE))
- flags |= MGMT_ADV_FLAG_LIMITED_DISCOV;
- else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
- flags |= MGMT_ADV_FLAG_DISCOV;
-
- return flags;
- }
-
- adv_instance = hci_find_adv_instance(hdev, instance);
-
- /* Return 0 when we got an invalid instance identifier. */
- if (!adv_instance)
- return 0;
-
- return adv_instance->flags;
-}
-
-static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags)
-{
- /* If privacy is not enabled don't use RPA */
- if (!hci_dev_test_flag(hdev, HCI_PRIVACY))
- return false;
-
- /* If basic privacy mode is enabled use RPA */
- if (!hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
- return true;
-
- /* If limited privacy mode is enabled don't use RPA if we're
- * both discoverable and bondable.
- */
- if ((flags & MGMT_ADV_FLAG_DISCOV) &&
- hci_dev_test_flag(hdev, HCI_BONDABLE))
- return false;
-
- /* We're neither bondable nor discoverable in the limited
- * privacy mode, therefore use RPA.
- */
- return true;
-}
-
-static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable)
-{
- /* If there is no connection we are OK to advertise. */
- if (hci_conn_num(hdev, LE_LINK) == 0)
- return true;
-
- /* Check le_states if there is any connection in slave role. */
- if (hdev->conn_hash.le_num_slave > 0) {
- /* Slave connection state and non connectable mode bit 20. */
- if (!connectable && !(hdev->le_states[2] & 0x10))
- return false;
-
- /* Slave connection state and connectable mode bit 38
- * and scannable bit 21.
- */
- if (connectable && (!(hdev->le_states[4] & 0x40) ||
- !(hdev->le_states[2] & 0x20)))
- return false;
- }
-
- /* Check le_states if there is any connection in master role. */
- if (hci_conn_num(hdev, LE_LINK) != hdev->conn_hash.le_num_slave) {
- /* Master connection state and non connectable mode bit 18. */
- if (!connectable && !(hdev->le_states[2] & 0x02))
- return false;
-
- /* Master connection state and connectable mode bit 35 and
- * scannable 19.
- */
- if (connectable && (!(hdev->le_states[4] & 0x08) ||
- !(hdev->le_states[2] & 0x08)))
- return false;
- }
-
- return true;
-}
-
-void __hci_req_enable_advertising(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_le_set_adv_param cp;
- u8 own_addr_type, enable = 0x01;
- bool connectable;
- u32 flags;
-
- flags = get_adv_instance_flags(hdev, hdev->cur_adv_instance);
-
- /* If the "connectable" instance flag was not set, then choose between
- * ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
- */
- connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
- mgmt_get_connectable(hdev);
-
- if (!is_advertising_allowed(hdev, connectable))
- return;
-
- if (hci_dev_test_flag(hdev, HCI_LE_ADV))
- __hci_req_disable_advertising(req);
-
- /* Clear the HCI_LE_ADV bit temporarily so that the
- * hci_update_random_address knows that it's safe to go ahead
- * and write a new random address. The flag will be set back on
- * as soon as the SET_ADV_ENABLE HCI command completes.
- */
- hci_dev_clear_flag(hdev, HCI_LE_ADV);
-
- /* Set require_privacy to true only when non-connectable
- * advertising is used. In that case it is fine to use a
- * non-resolvable private address.
- */
- if (hci_update_random_address(req, !connectable,
- adv_use_rpa(hdev, flags),
- &own_addr_type) < 0)
- return;
-
- memset(&cp, 0, sizeof(cp));
- cp.min_interval = cpu_to_le16(hdev->le_adv_min_interval);
- cp.max_interval = cpu_to_le16(hdev->le_adv_max_interval);
-
- if (connectable)
- cp.type = LE_ADV_IND;
- else if (get_cur_adv_instance_scan_rsp_len(hdev))
- cp.type = LE_ADV_SCAN_IND;
- else
- cp.type = LE_ADV_NONCONN_IND;
-
- cp.own_address_type = own_addr_type;
- cp.channel_map = hdev->le_adv_channel_map;
-
- hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp);
-
- hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
-}
-
-u8 append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len)
-{
- size_t short_len;
- size_t complete_len;
-
- /* no space left for name (+ NULL + type + len) */
- if ((HCI_MAX_AD_LENGTH - ad_len) < HCI_MAX_SHORT_NAME_LENGTH + 3)
- return ad_len;
-
- /* use complete name if present and fits */
- complete_len = strlen(hdev->dev_name);
- if (complete_len && complete_len <= HCI_MAX_SHORT_NAME_LENGTH)
- return eir_append_data(ptr, ad_len, EIR_NAME_COMPLETE,
- hdev->dev_name, complete_len + 1);
-
- /* use short name if present */
- short_len = strlen(hdev->short_name);
- if (short_len)
- return eir_append_data(ptr, ad_len, EIR_NAME_SHORT,
- hdev->short_name, short_len + 1);
-
- /* use shortened full name if present, we already know that name
- * is longer then HCI_MAX_SHORT_NAME_LENGTH
- */
- if (complete_len) {
- u8 name[HCI_MAX_SHORT_NAME_LENGTH + 1];
-
- memcpy(name, hdev->dev_name, HCI_MAX_SHORT_NAME_LENGTH);
- name[HCI_MAX_SHORT_NAME_LENGTH] = '\0';
-
- return eir_append_data(ptr, ad_len, EIR_NAME_SHORT, name,
- sizeof(name));
- }
-
- return ad_len;
-}
-
-static u8 append_appearance(struct hci_dev *hdev, u8 *ptr, u8 ad_len)
-{
- return eir_append_le16(ptr, ad_len, EIR_APPEARANCE, hdev->appearance);
-}
-
-static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
-{
- u8 scan_rsp_len = 0;
-
- if (hdev->appearance) {
- scan_rsp_len = append_appearance(hdev, ptr, scan_rsp_len);
- }
-
- return append_local_name(hdev, ptr, scan_rsp_len);
-}
-
-static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance,
- u8 *ptr)
-{
- struct adv_info *adv_instance;
- u32 instance_flags;
- u8 scan_rsp_len = 0;
-
- adv_instance = hci_find_adv_instance(hdev, instance);
- if (!adv_instance)
- return 0;
-
- instance_flags = adv_instance->flags;
-
- if ((instance_flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) {
- scan_rsp_len = append_appearance(hdev, ptr, scan_rsp_len);
- }
-
- memcpy(&ptr[scan_rsp_len], adv_instance->scan_rsp_data,
- adv_instance->scan_rsp_len);
-
- scan_rsp_len += adv_instance->scan_rsp_len;
-
- if (instance_flags & MGMT_ADV_FLAG_LOCAL_NAME)
- scan_rsp_len = append_local_name(hdev, ptr, scan_rsp_len);
-
- return scan_rsp_len;
-}
-
-void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance)
-{
- struct hci_dev *hdev = req->hdev;
- u8 len;
-
- if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
- return;
-
- if (ext_adv_capable(hdev)) {
- struct hci_cp_le_set_ext_scan_rsp_data cp;
-
- memset(&cp, 0, sizeof(cp));
-
- if (instance)
- len = create_instance_scan_rsp_data(hdev, instance,
- cp.data);
- else
- len = create_default_scan_rsp_data(hdev, cp.data);
-
- if (hdev->scan_rsp_data_len == len &&
- !memcmp(cp.data, hdev->scan_rsp_data, len))
- return;
-
- memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data));
- hdev->scan_rsp_data_len = len;
-
- cp.handle = 0;
- cp.length = len;
- cp.operation = LE_SET_ADV_DATA_OP_COMPLETE;
- cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG;
-
- hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, sizeof(cp),
- &cp);
- } else {
- struct hci_cp_le_set_scan_rsp_data cp;
-
- memset(&cp, 0, sizeof(cp));
-
- if (instance)
- len = create_instance_scan_rsp_data(hdev, instance,
- cp.data);
- else
- len = create_default_scan_rsp_data(hdev, cp.data);
-
- if (hdev->scan_rsp_data_len == len &&
- !memcmp(cp.data, hdev->scan_rsp_data, len))
- return;
-
- memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data));
- hdev->scan_rsp_data_len = len;
-
- cp.length = len;
-
- hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp);
- }
-}
-
-static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
-{
- struct adv_info *adv_instance = NULL;
- u8 ad_len = 0, flags = 0;
- u32 instance_flags;
-
- /* Return 0 when the current instance identifier is invalid. */
- if (instance) {
- adv_instance = hci_find_adv_instance(hdev, instance);
- if (!adv_instance)
- return 0;
- }
-
- instance_flags = get_adv_instance_flags(hdev, instance);
-
- /* The Add Advertising command allows userspace to set both the general
- * and limited discoverable flags.
- */
- if (instance_flags & MGMT_ADV_FLAG_DISCOV)
- flags |= LE_AD_GENERAL;
-
- if (instance_flags & MGMT_ADV_FLAG_LIMITED_DISCOV)
- flags |= LE_AD_LIMITED;
-
- if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
- flags |= LE_AD_NO_BREDR;
-
- if (flags || (instance_flags & MGMT_ADV_FLAG_MANAGED_FLAGS)) {
- /* If a discovery flag wasn't provided, simply use the global
- * settings.
- */
- if (!flags)
- flags |= mgmt_get_adv_discov_flags(hdev);
-
- /* If flags would still be empty, then there is no need to
- * include the "Flags" AD field".
- */
- if (flags) {
- ptr[0] = 0x02;
- ptr[1] = EIR_FLAGS;
- ptr[2] = flags;
-
- ad_len += 3;
- ptr += 3;
- }
- }
-
- if (adv_instance) {
- memcpy(ptr, adv_instance->adv_data,
- adv_instance->adv_data_len);
- ad_len += adv_instance->adv_data_len;
- ptr += adv_instance->adv_data_len;
- }
-
- if (instance_flags & MGMT_ADV_FLAG_TX_POWER) {
- s8 adv_tx_power;
-
- if (ext_adv_capable(hdev)) {
- if (adv_instance)
- adv_tx_power = adv_instance->tx_power;
- else
- adv_tx_power = hdev->adv_tx_power;
- } else {
- adv_tx_power = hdev->adv_tx_power;
- }
-
- /* Provide Tx Power only if we can provide a valid value for it */
- if (adv_tx_power != HCI_TX_POWER_INVALID) {
- ptr[0] = 0x02;
- ptr[1] = EIR_TX_POWER;
- ptr[2] = (u8)adv_tx_power;
-
- ad_len += 3;
- ptr += 3;
- }
- }
-
- return ad_len;
-}
-
-void __hci_req_update_adv_data(struct hci_request *req, u8 instance)
-{
- struct hci_dev *hdev = req->hdev;
- u8 len;
-
- if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
- return;
-
- if (ext_adv_capable(hdev)) {
- struct hci_cp_le_set_ext_adv_data cp;
-
- memset(&cp, 0, sizeof(cp));
-
- len = create_instance_adv_data(hdev, instance, cp.data);
-
- /* There's nothing to do if the data hasn't changed */
- if (hdev->adv_data_len == len &&
- memcmp(cp.data, hdev->adv_data, len) == 0)
- return;
-
- memcpy(hdev->adv_data, cp.data, sizeof(cp.data));
- hdev->adv_data_len = len;
-
- cp.length = len;
- cp.handle = 0;
- cp.operation = LE_SET_ADV_DATA_OP_COMPLETE;
- cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG;
-
- hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_DATA, sizeof(cp), &cp);
- } else {
- struct hci_cp_le_set_adv_data cp;
-
- memset(&cp, 0, sizeof(cp));
-
- len = create_instance_adv_data(hdev, instance, cp.data);
-
- /* There's nothing to do if the data hasn't changed */
- if (hdev->adv_data_len == len &&
- memcmp(cp.data, hdev->adv_data, len) == 0)
- return;
-
- memcpy(hdev->adv_data, cp.data, sizeof(cp.data));
- hdev->adv_data_len = len;
-
- cp.length = len;
-
- hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp);
- }
-}
-
-int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance)
-{
- struct hci_request req;
-
- hci_req_init(&req, hdev);
- __hci_req_update_adv_data(&req, instance);
-
- return hci_req_run(&req, NULL);
-}
-
-static void adv_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode)
-{
- BT_DBG("%s status %u", hdev->name, status);
-}
-
-void hci_req_reenable_advertising(struct hci_dev *hdev)
-{
- struct hci_request req;
-
- if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
- list_empty(&hdev->adv_instances))
- return;
-
- hci_req_init(&req, hdev);
-
- if (hdev->cur_adv_instance) {
- __hci_req_schedule_adv_instance(&req, hdev->cur_adv_instance,
- true);
- } else {
- if (ext_adv_capable(hdev)) {
- __hci_req_start_ext_adv(&req, 0x00);
- } else {
- __hci_req_update_adv_data(&req, 0x00);
- __hci_req_update_scan_rsp_data(&req, 0x00);
- __hci_req_enable_advertising(&req);
- }
- }
-
- hci_req_run(&req, adv_enable_complete);
-}
-
-static void adv_timeout_expire(struct work_struct *work)
-{
- struct hci_dev *hdev = container_of(work, struct hci_dev,
- adv_instance_expire.work);
-
- struct hci_request req;
- u8 instance;
-
- BT_DBG("%s", hdev->name);
-
- hci_dev_lock(hdev);
-
- hdev->adv_instance_timeout = 0;
-
- instance = hdev->cur_adv_instance;
- if (instance == 0x00)
- goto unlock;
-
- hci_req_init(&req, hdev);
-
- hci_req_clear_adv_instance(hdev, NULL, &req, instance, false);
-
- if (list_empty(&hdev->adv_instances))
- __hci_req_disable_advertising(&req);
-
- hci_req_run(&req, NULL);
-
-unlock:
- hci_dev_unlock(hdev);
-}
-
-int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
- bool use_rpa, struct adv_info *adv_instance,
- u8 *own_addr_type, bdaddr_t *rand_addr)
-{
- int err;
-
- bacpy(rand_addr, BDADDR_ANY);
-
- /* If privacy is enabled use a resolvable private address. If
- * current RPA has expired then generate a new one.
- */
- if (use_rpa) {
- int to;
-
- *own_addr_type = ADDR_LE_DEV_RANDOM;
-
- if (adv_instance) {
- if (!adv_instance->rpa_expired &&
- !bacmp(&adv_instance->random_addr, &hdev->rpa))
- return 0;
-
- adv_instance->rpa_expired = false;
- } else {
- if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) &&
- !bacmp(&hdev->random_addr, &hdev->rpa))
- return 0;
- }
-
- err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa);
- if (err < 0) {
- BT_ERR("%s failed to generate new RPA", hdev->name);
- return err;
- }
-
- bacpy(rand_addr, &hdev->rpa);
-
- to = msecs_to_jiffies(hdev->rpa_timeout * 1000);
- if (adv_instance)
- queue_delayed_work(hdev->workqueue,
- &adv_instance->rpa_expired_cb, to);
- else
- queue_delayed_work(hdev->workqueue,
- &hdev->rpa_expired, to);
-
- return 0;
- }
-
- /* In case of required privacy without resolvable private address,
- * use an non-resolvable private address. This is useful for
- * non-connectable advertising.
- */
- if (require_privacy) {
- bdaddr_t nrpa;
-
- while (true) {
- /* The non-resolvable private address is generated
- * from random six bytes with the two most significant
- * bits cleared.
- */
- get_random_bytes(&nrpa, 6);
- nrpa.b[5] &= 0x3f;
-
- /* The non-resolvable private address shall not be
- * equal to the public address.
- */
- if (bacmp(&hdev->bdaddr, &nrpa))
- break;
- }
-
- *own_addr_type = ADDR_LE_DEV_RANDOM;
- bacpy(rand_addr, &nrpa);
-
- return 0;
- }
-
- /* No privacy so use a public address. */
- *own_addr_type = ADDR_LE_DEV_PUBLIC;
-
- return 0;
-}
-
-void __hci_req_clear_ext_adv_sets(struct hci_request *req)
-{
- hci_req_add(req, HCI_OP_LE_CLEAR_ADV_SETS, 0, NULL);
-}
-
-int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
-{
- struct hci_cp_le_set_ext_adv_params cp;
- struct hci_dev *hdev = req->hdev;
- bool connectable;
- u32 flags;
- bdaddr_t random_addr;
- u8 own_addr_type;
- int err;
- struct adv_info *adv_instance;
- bool secondary_adv;
- /* In ext adv set param interval is 3 octets */
- const u8 adv_interval[3] = { 0x00, 0x08, 0x00 };
-
- if (instance > 0) {
- adv_instance = hci_find_adv_instance(hdev, instance);
- if (!adv_instance)
- return -EINVAL;
- } else {
- adv_instance = NULL;
- }
-
- flags = get_adv_instance_flags(hdev, instance);
-
- /* If the "connectable" instance flag was not set, then choose between
- * ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
- */
- connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
- mgmt_get_connectable(hdev);
-
- if (!is_advertising_allowed(hdev, connectable))
- return -EPERM;
-
- /* Set require_privacy to true only when non-connectable
- * advertising is used. In that case it is fine to use a
- * non-resolvable private address.
- */
- err = hci_get_random_address(hdev, !connectable,
- adv_use_rpa(hdev, flags), adv_instance,
- &own_addr_type, &random_addr);
- if (err < 0)
- return err;
-
- memset(&cp, 0, sizeof(cp));
-
- memcpy(cp.min_interval, adv_interval, sizeof(cp.min_interval));
- memcpy(cp.max_interval, adv_interval, sizeof(cp.max_interval));
-
- secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK);
-
- if (connectable) {
- if (secondary_adv)
- cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND);
- else
- cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND);
- } else if (get_adv_instance_scan_rsp_len(hdev, instance)) {
- if (secondary_adv)
- cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND);
- else
- cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND);
- } else {
- if (secondary_adv)
- cp.evt_properties = cpu_to_le16(LE_EXT_ADV_NON_CONN_IND);
- else
- cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND);
- }
-
- cp.own_addr_type = own_addr_type;
- cp.channel_map = hdev->le_adv_channel_map;
- cp.tx_power = 127;
- cp.handle = 0;
-
- if (flags & MGMT_ADV_FLAG_SEC_2M) {
- cp.primary_phy = HCI_ADV_PHY_1M;
- cp.secondary_phy = HCI_ADV_PHY_2M;
- } else if (flags & MGMT_ADV_FLAG_SEC_CODED) {
- cp.primary_phy = HCI_ADV_PHY_CODED;
- cp.secondary_phy = HCI_ADV_PHY_CODED;
- } else {
- /* In all other cases use 1M */
- cp.primary_phy = HCI_ADV_PHY_1M;
- cp.secondary_phy = HCI_ADV_PHY_1M;
- }
-
- hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp);
-
- if (own_addr_type == ADDR_LE_DEV_RANDOM &&
- bacmp(&random_addr, BDADDR_ANY)) {
- struct hci_cp_le_set_adv_set_rand_addr cp;
-
- /* Check if random address need to be updated */
- if (adv_instance) {
- if (!bacmp(&random_addr, &adv_instance->random_addr))
- return 0;
- } else {
- if (!bacmp(&random_addr, &hdev->random_addr))
- return 0;
- }
-
- memset(&cp, 0, sizeof(cp));
-
- cp.handle = 0;
- bacpy(&cp.bdaddr, &random_addr);
-
- hci_req_add(req,
- HCI_OP_LE_SET_ADV_SET_RAND_ADDR,
- sizeof(cp), &cp);
- }
-
- return 0;
-}
-
-void __hci_req_enable_ext_advertising(struct hci_request *req)
-{
- struct hci_cp_le_set_ext_adv_enable *cp;
- struct hci_cp_ext_adv_set *adv_set;
- u8 data[sizeof(*cp) + sizeof(*adv_set) * 1];
-
- cp = (void *) data;
- adv_set = (void *) cp->data;
-
- memset(cp, 0, sizeof(*cp));
-
- cp->enable = 0x01;
- cp->num_of_sets = 0x01;
-
- memset(adv_set, 0, sizeof(*adv_set));
-
- adv_set->handle = 0;
-
- hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE,
- sizeof(*cp) + sizeof(*adv_set) * cp->num_of_sets,
- data);
-}
-
-int __hci_req_start_ext_adv(struct hci_request *req, u8 instance)
-{
- struct hci_dev *hdev = req->hdev;
- int err;
-
- if (hci_dev_test_flag(hdev, HCI_LE_ADV))
- __hci_req_disable_advertising(req);
-
- err = __hci_req_setup_ext_adv_instance(req, instance);
- if (err < 0)
- return err;
-
- __hci_req_update_scan_rsp_data(req, instance);
- __hci_req_enable_ext_advertising(req);
-
- return 0;
-}
-
-int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
- bool force)
-{
- struct hci_dev *hdev = req->hdev;
- struct adv_info *adv_instance = NULL;
- u16 timeout;
-
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
- list_empty(&hdev->adv_instances))
- return -EPERM;
-
- if (hdev->adv_instance_timeout)
- return -EBUSY;
-
- adv_instance = hci_find_adv_instance(hdev, instance);
- if (!adv_instance)
- return -ENOENT;
-
- /* A zero timeout means unlimited advertising. As long as there is
- * only one instance, duration should be ignored. We still set a timeout
- * in case further instances are being added later on.
- *
- * If the remaining lifetime of the instance is more than the duration
- * then the timeout corresponds to the duration, otherwise it will be
- * reduced to the remaining instance lifetime.
- */
- if (adv_instance->timeout == 0 ||
- adv_instance->duration <= adv_instance->remaining_time)
- timeout = adv_instance->duration;
- else
- timeout = adv_instance->remaining_time;
-
- /* The remaining time is being reduced unless the instance is being
- * advertised without time limit.
- */
- if (adv_instance->timeout)
- adv_instance->remaining_time =
- adv_instance->remaining_time - timeout;
-
- hdev->adv_instance_timeout = timeout;
- queue_delayed_work(hdev->req_workqueue,
- &hdev->adv_instance_expire,
- msecs_to_jiffies(timeout * 1000));
-
- /* If we're just re-scheduling the same instance again then do not
- * execute any HCI commands. This happens when a single instance is
- * being advertised.
- */
- if (!force && hdev->cur_adv_instance == instance &&
- hci_dev_test_flag(hdev, HCI_LE_ADV))
- return 0;
-
- hdev->cur_adv_instance = instance;
- if (ext_adv_capable(hdev)) {
- __hci_req_start_ext_adv(req, instance);
- } else {
- __hci_req_update_adv_data(req, instance);
- __hci_req_update_scan_rsp_data(req, instance);
- __hci_req_enable_advertising(req);
- }
-
- return 0;
-}
-
-static void cancel_adv_timeout(struct hci_dev *hdev)
-{
- if (hdev->adv_instance_timeout) {
- hdev->adv_instance_timeout = 0;
- cancel_delayed_work(&hdev->adv_instance_expire);
- }
-}
-
-/* For a single instance:
- * - force == true: The instance will be removed even when its remaining
- * lifetime is not zero.
- * - force == false: the instance will be deactivated but kept stored unless
- * the remaining lifetime is zero.
- *
- * For instance == 0x00:
- * - force == true: All instances will be removed regardless of their timeout
- * setting.
- * - force == false: Only instances that have a timeout will be removed.
- */
-void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
- struct hci_request *req, u8 instance,
- bool force)
-{
- struct adv_info *adv_instance, *n, *next_instance = NULL;
- int err;
- u8 rem_inst;
-
- /* Cancel any timeout concerning the removed instance(s). */
- if (!instance || hdev->cur_adv_instance == instance)
- cancel_adv_timeout(hdev);
-
- /* Get the next instance to advertise BEFORE we remove
- * the current one. This can be the same instance again
- * if there is only one instance.
- */
- if (instance && hdev->cur_adv_instance == instance)
- next_instance = hci_get_next_instance(hdev, instance);
-
- if (instance == 0x00) {
- list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances,
- list) {
- if (!(force || adv_instance->timeout))
- continue;
-
- rem_inst = adv_instance->instance;
- err = hci_remove_adv_instance(hdev, rem_inst);
- if (!err)
- mgmt_advertising_removed(sk, hdev, rem_inst);
- }
- } else {
- adv_instance = hci_find_adv_instance(hdev, instance);
-
- if (force || (adv_instance && adv_instance->timeout &&
- !adv_instance->remaining_time)) {
- /* Don't advertise a removed instance. */
- if (next_instance &&
- next_instance->instance == instance)
- next_instance = NULL;
-
- err = hci_remove_adv_instance(hdev, instance);
- if (!err)
- mgmt_advertising_removed(sk, hdev, instance);
- }
- }
-
- if (!req || !hdev_is_powered(hdev) ||
- hci_dev_test_flag(hdev, HCI_ADVERTISING))
- return;
-
- if (next_instance)
- __hci_req_schedule_adv_instance(req, next_instance->instance,
- false);
-}
-
-static void set_random_addr(struct hci_request *req, bdaddr_t *rpa)
-{
- struct hci_dev *hdev = req->hdev;
-
- /* If we're advertising or initiating an LE connection we can't
- * go ahead and change the random address at this time. This is
- * because the eventual initiator address used for the
- * subsequently created connection will be undefined (some
- * controllers use the new address and others the one we had
- * when the operation started).
- *
- * In this kind of scenario skip the update and let the random
- * address be updated at the next cycle.
- */
- if (hci_dev_test_flag(hdev, HCI_LE_ADV) ||
- hci_lookup_le_connect(hdev)) {
- BT_DBG("Deferring random address update");
- hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
- return;
- }
-
- hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, rpa);
-}
-
-int hci_update_random_address(struct hci_request *req, bool require_privacy,
- bool use_rpa, u8 *own_addr_type)
-{
- struct hci_dev *hdev = req->hdev;
- int err;
-
- /* If privacy is enabled use a resolvable private address. If
- * current RPA has expired or there is something else than
- * the current RPA in use, then generate a new one.
- */
- if (use_rpa) {
- int to;
-
- *own_addr_type = ADDR_LE_DEV_RANDOM;
-
- if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) &&
- !bacmp(&hdev->random_addr, &hdev->rpa))
- return 0;
-
- err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa);
- if (err < 0) {
- bt_dev_err(hdev, "failed to generate new RPA");
- return err;
- }
-
- set_random_addr(req, &hdev->rpa);
-
- to = msecs_to_jiffies(hdev->rpa_timeout * 1000);
- queue_delayed_work(hdev->workqueue, &hdev->rpa_expired, to);
-
- return 0;
- }
-
- /* In case of required privacy without resolvable private address,
- * use an non-resolvable private address. This is useful for active
- * scanning and non-connectable advertising.
- */
- if (require_privacy) {
- bdaddr_t nrpa;
-
- while (true) {
- /* The non-resolvable private address is generated
- * from random six bytes with the two most significant
- * bits cleared.
- */
- get_random_bytes(&nrpa, 6);
- nrpa.b[5] &= 0x3f;
-
- /* The non-resolvable private address shall not be
- * equal to the public address.
- */
- if (bacmp(&hdev->bdaddr, &nrpa))
- break;
- }
-
- *own_addr_type = ADDR_LE_DEV_RANDOM;
- set_random_addr(req, &nrpa);
- return 0;
- }
-
- /* If forcing static address is in use or there is no public
- * address use the static address as random address (but skip
- * the HCI command if the current random address is already the
- * static one.
- *
- * In case BR/EDR has been disabled on a dual-mode controller
- * and a static address has been configured, then use that
- * address instead of the public BR/EDR address.
- */
- if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
- !bacmp(&hdev->bdaddr, BDADDR_ANY) ||
- (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) &&
- bacmp(&hdev->static_addr, BDADDR_ANY))) {
- *own_addr_type = ADDR_LE_DEV_RANDOM;
- if (bacmp(&hdev->static_addr, &hdev->random_addr))
- hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6,
- &hdev->static_addr);
- return 0;
- }
-
- /* Neither privacy nor static address is being used so use a
- * public address.
- */
- *own_addr_type = ADDR_LE_DEV_PUBLIC;
-
- return 0;
-}
-
-static bool disconnected_whitelist_entries(struct hci_dev *hdev)
-{
- struct bdaddr_list *b;
-
- list_for_each_entry(b, &hdev->whitelist, list) {
- struct hci_conn *conn;
-
- conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &b->bdaddr);
- if (!conn)
- return true;
-
- if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG)
- return true;
- }
-
- return false;
-}
-
-void __hci_req_update_scan(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- u8 scan;
-
- if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
- return;
-
- if (!hdev_is_powered(hdev))
- return;
-
- if (mgmt_powering_down(hdev))
- return;
-
- if (hci_dev_test_flag(hdev, HCI_CONNECTABLE) ||
- disconnected_whitelist_entries(hdev))
- scan = SCAN_PAGE;
- else
- scan = SCAN_DISABLED;
-
- if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
- scan |= SCAN_INQUIRY;
-
- if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE) &&
- test_bit(HCI_ISCAN, &hdev->flags) == !!(scan & SCAN_INQUIRY))
- return;
-
- hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
-}
-
-static int update_scan(struct hci_request *req, unsigned long opt)
-{
- hci_dev_lock(req->hdev);
- __hci_req_update_scan(req);
- hci_dev_unlock(req->hdev);
- return 0;
-}
-
-static void scan_update_work(struct work_struct *work)
-{
- struct hci_dev *hdev = container_of(work, struct hci_dev, scan_update);
-
- hci_req_sync(hdev, update_scan, 0, HCI_CMD_TIMEOUT, NULL);
-}
-
-static int connectable_update(struct hci_request *req, unsigned long opt)
-{
- struct hci_dev *hdev = req->hdev;
-
- hci_dev_lock(hdev);
-
- __hci_req_update_scan(req);
-
- /* If BR/EDR is not enabled and we disable advertising as a
- * by-product of disabling connectable, we need to update the
- * advertising flags.
- */
- if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
- __hci_req_update_adv_data(req, hdev->cur_adv_instance);
-
- /* Update the advertising parameters if necessary */
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
- !list_empty(&hdev->adv_instances)) {
- if (ext_adv_capable(hdev))
- __hci_req_start_ext_adv(req, hdev->cur_adv_instance);
- else
- __hci_req_enable_advertising(req);
- }
-
- __hci_update_background_scan(req);
-
- hci_dev_unlock(hdev);
-
- return 0;
-}
-
-static void connectable_update_work(struct work_struct *work)
-{
- struct hci_dev *hdev = container_of(work, struct hci_dev,
- connectable_update);
- u8 status;
-
- hci_req_sync(hdev, connectable_update, 0, HCI_CMD_TIMEOUT, &status);
- mgmt_set_connectable_complete(hdev, status);
-}
-
-static u8 get_service_classes(struct hci_dev *hdev)
-{
- struct bt_uuid *uuid;
- u8 val = 0;
-
- list_for_each_entry(uuid, &hdev->uuids, list)
- val |= uuid->svc_hint;
-
- return val;
-}
-
-void __hci_req_update_class(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- u8 cod[3];
-
- BT_DBG("%s", hdev->name);
-
- if (!hdev_is_powered(hdev))
- return;
-
- if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
- return;
-
- if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE))
- return;
-
- cod[0] = hdev->minor_class;
- cod[1] = hdev->major_class;
- cod[2] = get_service_classes(hdev);
-
- if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE))
- cod[1] |= 0x20;
-
- if (memcmp(cod, hdev->dev_class, 3) == 0)
- return;
-
- hci_req_add(req, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod);
-}
-
-static void write_iac(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_write_current_iac_lap cp;
-
- if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
- return;
-
- if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) {
- /* Limited discoverable mode */
- cp.num_iac = min_t(u8, hdev->num_iac, 2);
- cp.iac_lap[0] = 0x00; /* LIAC */
- cp.iac_lap[1] = 0x8b;
- cp.iac_lap[2] = 0x9e;
- cp.iac_lap[3] = 0x33; /* GIAC */
- cp.iac_lap[4] = 0x8b;
- cp.iac_lap[5] = 0x9e;
- } else {
- /* General discoverable mode */
- cp.num_iac = 1;
- cp.iac_lap[0] = 0x33; /* GIAC */
- cp.iac_lap[1] = 0x8b;
- cp.iac_lap[2] = 0x9e;
- }
-
- hci_req_add(req, HCI_OP_WRITE_CURRENT_IAC_LAP,
- (cp.num_iac * 3) + 1, &cp);
-}
-
-static int discoverable_update(struct hci_request *req, unsigned long opt)
-{
- struct hci_dev *hdev = req->hdev;
-
- hci_dev_lock(hdev);
-
- if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
- write_iac(req);
- __hci_req_update_scan(req);
- __hci_req_update_class(req);
- }
-
- /* Advertising instances don't use the global discoverable setting, so
- * only update AD if advertising was enabled using Set Advertising.
- */
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
- __hci_req_update_adv_data(req, 0x00);
-
- /* Discoverable mode affects the local advertising
- * address in limited privacy mode.
- */
- if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) {
- if (ext_adv_capable(hdev))
- __hci_req_start_ext_adv(req, 0x00);
- else
- __hci_req_enable_advertising(req);
- }
- }
-
- hci_dev_unlock(hdev);
-
- return 0;
-}
-
-static void discoverable_update_work(struct work_struct *work)
-{
- struct hci_dev *hdev = container_of(work, struct hci_dev,
- discoverable_update);
- u8 status;
-
- hci_req_sync(hdev, discoverable_update, 0, HCI_CMD_TIMEOUT, &status);
- mgmt_set_discoverable_complete(hdev, status);
-}
-
-void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
- u8 reason)
-{
- switch (conn->state) {
- case BT_CONNECTED:
- case BT_CONFIG:
- if (conn->type == AMP_LINK) {
- struct hci_cp_disconn_phy_link cp;
-
- cp.phy_handle = HCI_PHY_HANDLE(conn->handle);
- cp.reason = reason;
- hci_req_add(req, HCI_OP_DISCONN_PHY_LINK, sizeof(cp),
- &cp);
- } else {
- struct hci_cp_disconnect dc;
-
- dc.handle = cpu_to_le16(conn->handle);
- dc.reason = reason;
- hci_req_add(req, HCI_OP_DISCONNECT, sizeof(dc), &dc);
- }
-
- conn->state = BT_DISCONN;
-
- break;
- case BT_CONNECT:
- if (conn->type == LE_LINK) {
- if (test_bit(HCI_CONN_SCANNING, &conn->flags))
- break;
- hci_req_add(req, HCI_OP_LE_CREATE_CONN_CANCEL,
- 0, NULL);
- } else if (conn->type == ACL_LINK) {
- if (req->hdev->hci_ver < BLUETOOTH_VER_1_2)
- break;
- hci_req_add(req, HCI_OP_CREATE_CONN_CANCEL,
- 6, &conn->dst);
- }
- break;
- case BT_CONNECT2:
- if (conn->type == ACL_LINK) {
- struct hci_cp_reject_conn_req rej;
-
- bacpy(&rej.bdaddr, &conn->dst);
- rej.reason = reason;
-
- hci_req_add(req, HCI_OP_REJECT_CONN_REQ,
- sizeof(rej), &rej);
- } else if (conn->type == SCO_LINK || conn->type == ESCO_LINK) {
- struct hci_cp_reject_sync_conn_req rej;
-
- bacpy(&rej.bdaddr, &conn->dst);
-
- /* SCO rejection has its own limited set of
- * allowed error values (0x0D-0x0F) which isn't
- * compatible with most values passed to this
- * function. To be safe hard-code one of the
- * values that's suitable for SCO.
- */
- rej.reason = HCI_ERROR_REJ_LIMITED_RESOURCES;
-
- hci_req_add(req, HCI_OP_REJECT_SYNC_CONN_REQ,
- sizeof(rej), &rej);
- }
- break;
- default:
- conn->state = BT_CLOSED;
- break;
- }
-}
-
-static void abort_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode)
-{
- if (status)
- BT_DBG("Failed to abort connection: status 0x%2.2x", status);
-}
-
-int hci_abort_conn(struct hci_conn *conn, u8 reason)
-{
- struct hci_request req;
- int err;
-
- hci_req_init(&req, conn->hdev);
-
- __hci_abort_conn(&req, conn, reason);
-
- err = hci_req_run(&req, abort_conn_complete);
- if (err && err != -ENODATA) {
- bt_dev_err(conn->hdev, "failed to run HCI request: err %d", err);
- return err;
- }
-
- return 0;
-}
-
-static int update_bg_scan(struct hci_request *req, unsigned long opt)
-{
- hci_dev_lock(req->hdev);
- __hci_update_background_scan(req);
- hci_dev_unlock(req->hdev);
- return 0;
-}
-
-static void bg_scan_update(struct work_struct *work)
-{
- struct hci_dev *hdev = container_of(work, struct hci_dev,
- bg_scan_update);
- struct hci_conn *conn;
- u8 status;
- int err;
-
- err = hci_req_sync(hdev, update_bg_scan, 0, HCI_CMD_TIMEOUT, &status);
- if (!err)
- return;
-
- hci_dev_lock(hdev);
-
- conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
- if (conn)
- hci_le_conn_failed(conn, status);
-
- hci_dev_unlock(hdev);
-}
-
-static int le_scan_disable(struct hci_request *req, unsigned long opt)
-{
- hci_req_add_le_scan_disable(req);
- return 0;
-}
-
-static int bredr_inquiry(struct hci_request *req, unsigned long opt)
-{
- u8 length = opt;
- const u8 giac[3] = { 0x33, 0x8b, 0x9e };
- const u8 liac[3] = { 0x00, 0x8b, 0x9e };
- struct hci_cp_inquiry cp;
-
- BT_DBG("%s", req->hdev->name);
-
- hci_dev_lock(req->hdev);
- hci_inquiry_cache_flush(req->hdev);
- hci_dev_unlock(req->hdev);
-
- memset(&cp, 0, sizeof(cp));
-
- if (req->hdev->discovery.limited)
- memcpy(&cp.lap, liac, sizeof(cp.lap));
- else
- memcpy(&cp.lap, giac, sizeof(cp.lap));
-
- cp.length = length;
-
- hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp);
-
- return 0;
-}
-
-static void le_scan_disable_work(struct work_struct *work)
-{
- struct hci_dev *hdev = container_of(work, struct hci_dev,
- le_scan_disable.work);
- u8 status;
-
- BT_DBG("%s", hdev->name);
-
- if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
- return;
-
- cancel_delayed_work(&hdev->le_scan_restart);
-
- hci_req_sync(hdev, le_scan_disable, 0, HCI_CMD_TIMEOUT, &status);
- if (status) {
- bt_dev_err(hdev, "failed to disable LE scan: status 0x%02x",
- status);
- return;
- }
-
- hdev->discovery.scan_start = 0;
-
- /* If we were running LE only scan, change discovery state. If
- * we were running both LE and BR/EDR inquiry simultaneously,
- * and BR/EDR inquiry is already finished, stop discovery,
- * otherwise BR/EDR inquiry will stop discovery when finished.
- * If we will resolve remote device name, do not change
- * discovery state.
- */
-
- if (hdev->discovery.type == DISCOV_TYPE_LE)
- goto discov_stopped;
-
- if (hdev->discovery.type != DISCOV_TYPE_INTERLEAVED)
- return;
-
- if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) {
- if (!test_bit(HCI_INQUIRY, &hdev->flags) &&
- hdev->discovery.state != DISCOVERY_RESOLVING)
- goto discov_stopped;
-
- return;
- }
-
- hci_req_sync(hdev, bredr_inquiry, DISCOV_INTERLEAVED_INQUIRY_LEN,
- HCI_CMD_TIMEOUT, &status);
- if (status) {
- bt_dev_err(hdev, "inquiry failed: status 0x%02x", status);
- goto discov_stopped;
- }
-
- return;
-
-discov_stopped:
- hci_dev_lock(hdev);
- hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
- hci_dev_unlock(hdev);
-}
-
-static int le_scan_restart(struct hci_request *req, unsigned long opt)
-{
- struct hci_dev *hdev = req->hdev;
-
- /* If controller is not scanning we are done. */
- if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
- return 0;
-
- hci_req_add_le_scan_disable(req);
-
- if (use_ext_scan(hdev)) {
- struct hci_cp_le_set_ext_scan_enable ext_enable_cp;
-
- memset(&ext_enable_cp, 0, sizeof(ext_enable_cp));
- ext_enable_cp.enable = LE_SCAN_ENABLE;
- ext_enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
-
- hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE,
- sizeof(ext_enable_cp), &ext_enable_cp);
- } else {
- struct hci_cp_le_set_scan_enable cp;
-
- memset(&cp, 0, sizeof(cp));
- cp.enable = LE_SCAN_ENABLE;
- cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
- hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
- }
-
- return 0;
-}
-
-static void le_scan_restart_work(struct work_struct *work)
-{
- struct hci_dev *hdev = container_of(work, struct hci_dev,
- le_scan_restart.work);
- unsigned long timeout, duration, scan_start, now;
- u8 status;
-
- BT_DBG("%s", hdev->name);
-
- hci_req_sync(hdev, le_scan_restart, 0, HCI_CMD_TIMEOUT, &status);
- if (status) {
- bt_dev_err(hdev, "failed to restart LE scan: status %d",
- status);
- return;
- }
-
- hci_dev_lock(hdev);
-
- if (!test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) ||
- !hdev->discovery.scan_start)
- goto unlock;
-
- /* When the scan was started, hdev->le_scan_disable has been queued
- * after duration from scan_start. During scan restart this job
- * has been canceled, and we need to queue it again after proper
- * timeout, to make sure that scan does not run indefinitely.
- */
- duration = hdev->discovery.scan_duration;
- scan_start = hdev->discovery.scan_start;
- now = jiffies;
- if (now - scan_start <= duration) {
- int elapsed;
-
- if (now >= scan_start)
- elapsed = now - scan_start;
- else
- elapsed = ULONG_MAX - scan_start + now;
-
- timeout = duration - elapsed;
- } else {
- timeout = 0;
- }
-
- queue_delayed_work(hdev->req_workqueue,
- &hdev->le_scan_disable, timeout);
-
-unlock:
- hci_dev_unlock(hdev);
-}
-
-static int active_scan(struct hci_request *req, unsigned long opt)
-{
- uint16_t interval = opt;
- struct hci_dev *hdev = req->hdev;
- u8 own_addr_type;
- int err;
-
- BT_DBG("%s", hdev->name);
-
- if (hci_dev_test_flag(hdev, HCI_LE_ADV)) {
- hci_dev_lock(hdev);
-
- /* Don't let discovery abort an outgoing connection attempt
- * that's using directed advertising.
- */
- if (hci_lookup_le_connect(hdev)) {
- hci_dev_unlock(hdev);
- return -EBUSY;
- }
-
- cancel_adv_timeout(hdev);
- hci_dev_unlock(hdev);
-
- __hci_req_disable_advertising(req);
- }
-
- /* If controller is scanning, it means the background scanning is
- * running. Thus, we should temporarily stop it in order to set the
- * discovery scanning parameters.
- */
- if (hci_dev_test_flag(hdev, HCI_LE_SCAN))
- hci_req_add_le_scan_disable(req);
-
- /* All active scans will be done with either a resolvable private
- * address (when privacy feature has been enabled) or non-resolvable
- * private address.
- */
- err = hci_update_random_address(req, true, scan_use_rpa(hdev),
- &own_addr_type);
- if (err < 0)
- own_addr_type = ADDR_LE_DEV_PUBLIC;
-
- hci_req_start_scan(req, LE_SCAN_ACTIVE, interval, DISCOV_LE_SCAN_WIN,
- own_addr_type, 0);
- return 0;
-}
-
-static int interleaved_discov(struct hci_request *req, unsigned long opt)
-{
- int err;
-
- BT_DBG("%s", req->hdev->name);
-
- err = active_scan(req, opt);
- if (err)
- return err;
-
- return bredr_inquiry(req, DISCOV_BREDR_INQUIRY_LEN);
-}
-
-static void start_discovery(struct hci_dev *hdev, u8 *status)
-{
- unsigned long timeout;
-
- BT_DBG("%s type %u", hdev->name, hdev->discovery.type);
-
- switch (hdev->discovery.type) {
- case DISCOV_TYPE_BREDR:
- if (!hci_dev_test_flag(hdev, HCI_INQUIRY))
- hci_req_sync(hdev, bredr_inquiry,
- DISCOV_BREDR_INQUIRY_LEN, HCI_CMD_TIMEOUT,
- status);
- return;
- case DISCOV_TYPE_INTERLEAVED:
- /* When running simultaneous discovery, the LE scanning time
- * should occupy the whole discovery time sine BR/EDR inquiry
- * and LE scanning are scheduled by the controller.
- *
- * For interleaving discovery in comparison, BR/EDR inquiry
- * and LE scanning are done sequentially with separate
- * timeouts.
- */
- if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY,
- &hdev->quirks)) {
- timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT);
- /* During simultaneous discovery, we double LE scan
- * interval. We must leave some time for the controller
- * to do BR/EDR inquiry.
- */
- hci_req_sync(hdev, interleaved_discov,
- DISCOV_LE_SCAN_INT * 2, HCI_CMD_TIMEOUT,
- status);
- break;
- }
-
- timeout = msecs_to_jiffies(hdev->discov_interleaved_timeout);
- hci_req_sync(hdev, active_scan, DISCOV_LE_SCAN_INT,
- HCI_CMD_TIMEOUT, status);
- break;
- case DISCOV_TYPE_LE:
- timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT);
- hci_req_sync(hdev, active_scan, DISCOV_LE_SCAN_INT,
- HCI_CMD_TIMEOUT, status);
- break;
- default:
- *status = HCI_ERROR_UNSPECIFIED;
- return;
- }
-
- if (*status)
- return;
-
- BT_DBG("%s timeout %u ms", hdev->name, jiffies_to_msecs(timeout));
-
- /* When service discovery is used and the controller has a
- * strict duplicate filter, it is important to remember the
- * start and duration of the scan. This is required for
- * restarting scanning during the discovery phase.
- */
- if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) &&
- hdev->discovery.result_filtering) {
- hdev->discovery.scan_start = jiffies;
- hdev->discovery.scan_duration = timeout;
- }
-
- queue_delayed_work(hdev->req_workqueue, &hdev->le_scan_disable,
- timeout);
-}
-
-bool hci_req_stop_discovery(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- struct discovery_state *d = &hdev->discovery;
- struct hci_cp_remote_name_req_cancel cp;
- struct inquiry_entry *e;
- bool ret = false;
-
- BT_DBG("%s state %u", hdev->name, hdev->discovery.state);
-
- if (d->state == DISCOVERY_FINDING || d->state == DISCOVERY_STOPPING) {
- if (test_bit(HCI_INQUIRY, &hdev->flags))
- hci_req_add(req, HCI_OP_INQUIRY_CANCEL, 0, NULL);
-
- if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
- cancel_delayed_work(&hdev->le_scan_disable);
- hci_req_add_le_scan_disable(req);
- }
-
- ret = true;
- } else {
- /* Passive scanning */
- if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
- hci_req_add_le_scan_disable(req);
- ret = true;
- }
- }
-
- /* No further actions needed for LE-only discovery */
- if (d->type == DISCOV_TYPE_LE)
- return ret;
-
- if (d->state == DISCOVERY_RESOLVING || d->state == DISCOVERY_STOPPING) {
- e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY,
- NAME_PENDING);
- if (!e)
- return ret;
-
- bacpy(&cp.bdaddr, &e->data.bdaddr);
- hci_req_add(req, HCI_OP_REMOTE_NAME_REQ_CANCEL, sizeof(cp),
- &cp);
- ret = true;
- }
-
- return ret;
-}
-
-static int stop_discovery(struct hci_request *req, unsigned long opt)
-{
- hci_dev_lock(req->hdev);
- hci_req_stop_discovery(req);
- hci_dev_unlock(req->hdev);
-
- return 0;
-}
-
-static void discov_update(struct work_struct *work)
-{
- struct hci_dev *hdev = container_of(work, struct hci_dev,
- discov_update);
- u8 status = 0;
-
- switch (hdev->discovery.state) {
- case DISCOVERY_STARTING:
- start_discovery(hdev, &status);
- mgmt_start_discovery_complete(hdev, status);
- if (status)
- hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
- else
- hci_discovery_set_state(hdev, DISCOVERY_FINDING);
- break;
- case DISCOVERY_STOPPING:
- hci_req_sync(hdev, stop_discovery, 0, HCI_CMD_TIMEOUT, &status);
- mgmt_stop_discovery_complete(hdev, status);
- if (!status)
- hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
- break;
- case DISCOVERY_STOPPED:
- default:
- return;
- }
-}
-
-static void discov_off(struct work_struct *work)
-{
- struct hci_dev *hdev = container_of(work, struct hci_dev,
- discov_off.work);
-
- BT_DBG("%s", hdev->name);
-
- hci_dev_lock(hdev);
-
- /* When discoverable timeout triggers, then just make sure
- * the limited discoverable flag is cleared. Even in the case
- * of a timeout triggered from general discoverable, it is
- * safe to unconditionally clear the flag.
- */
- hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
- hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
- hdev->discov_timeout = 0;
-
- hci_dev_unlock(hdev);
-
- hci_req_sync(hdev, discoverable_update, 0, HCI_CMD_TIMEOUT, NULL);
- mgmt_new_settings(hdev);
-}
-
-static int powered_update_hci(struct hci_request *req, unsigned long opt)
-{
- struct hci_dev *hdev = req->hdev;
- u8 link_sec;
-
- hci_dev_lock(hdev);
-
- if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED) &&
- !lmp_host_ssp_capable(hdev)) {
- u8 mode = 0x01;
-
- hci_req_add(req, HCI_OP_WRITE_SSP_MODE, sizeof(mode), &mode);
-
- if (bredr_sc_enabled(hdev) && !lmp_host_sc_capable(hdev)) {
- u8 support = 0x01;
-
- hci_req_add(req, HCI_OP_WRITE_SC_SUPPORT,
- sizeof(support), &support);
- }
- }
-
- if (hci_dev_test_flag(hdev, HCI_LE_ENABLED) &&
- lmp_bredr_capable(hdev)) {
- struct hci_cp_write_le_host_supported cp;
-
- cp.le = 0x01;
- cp.simul = 0x00;
-
- /* Check first if we already have the right
- * host state (host features set)
- */
- if (cp.le != lmp_host_le_capable(hdev) ||
- cp.simul != lmp_host_le_br_capable(hdev))
- hci_req_add(req, HCI_OP_WRITE_LE_HOST_SUPPORTED,
- sizeof(cp), &cp);
- }
-
- if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
- /* Make sure the controller has a good default for
- * advertising data. This also applies to the case
- * where BR/EDR was toggled during the AUTO_OFF phase.
- */
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
- list_empty(&hdev->adv_instances)) {
- int err;
-
- if (ext_adv_capable(hdev)) {
- err = __hci_req_setup_ext_adv_instance(req,
- 0x00);
- if (!err)
- __hci_req_update_scan_rsp_data(req,
- 0x00);
- } else {
- err = 0;
- __hci_req_update_adv_data(req, 0x00);
- __hci_req_update_scan_rsp_data(req, 0x00);
- }
-
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
- if (!ext_adv_capable(hdev))
- __hci_req_enable_advertising(req);
- else if (!err)
- __hci_req_enable_ext_advertising(req);
- }
- } else if (!list_empty(&hdev->adv_instances)) {
- struct adv_info *adv_instance;
-
- adv_instance = list_first_entry(&hdev->adv_instances,
- struct adv_info, list);
- __hci_req_schedule_adv_instance(req,
- adv_instance->instance,
- true);
- }
- }
-
- link_sec = hci_dev_test_flag(hdev, HCI_LINK_SECURITY);
- if (link_sec != test_bit(HCI_AUTH, &hdev->flags))
- hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE,
- sizeof(link_sec), &link_sec);
-
- if (lmp_bredr_capable(hdev)) {
- if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE))
- __hci_req_write_fast_connectable(req, true);
- else
- __hci_req_write_fast_connectable(req, false);
- __hci_req_update_scan(req);
- __hci_req_update_class(req);
- __hci_req_update_name(req);
- __hci_req_update_eir(req);
- }
-
- hci_dev_unlock(hdev);
- return 0;
-}
-
-int __hci_req_hci_power_on(struct hci_dev *hdev)
-{
- /* Register the available SMP channels (BR/EDR and LE) only when
- * successfully powering on the controller. This late
- * registration is required so that LE SMP can clearly decide if
- * the public address or static address is used.
- */
- smp_register(hdev);
-
- return __hci_req_sync(hdev, powered_update_hci, 0, HCI_CMD_TIMEOUT,
- NULL);
-}
-
-void hci_request_setup(struct hci_dev *hdev)
-{
- INIT_WORK(&hdev->discov_update, discov_update);
- INIT_WORK(&hdev->bg_scan_update, bg_scan_update);
- INIT_WORK(&hdev->scan_update, scan_update_work);
- INIT_WORK(&hdev->connectable_update, connectable_update_work);
- INIT_WORK(&hdev->discoverable_update, discoverable_update_work);
- INIT_DELAYED_WORK(&hdev->discov_off, discov_off);
- INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work);
- INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart_work);
- INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire);
-}
-
-void hci_request_cancel_all(struct hci_dev *hdev)
-{
- hci_req_sync_cancel(hdev, ENODEV);
-
- cancel_work_sync(&hdev->discov_update);
- cancel_work_sync(&hdev->bg_scan_update);
- cancel_work_sync(&hdev->scan_update);
- cancel_work_sync(&hdev->connectable_update);
- cancel_work_sync(&hdev->discoverable_update);
- cancel_delayed_work_sync(&hdev->discov_off);
- cancel_delayed_work_sync(&hdev->le_scan_disable);
- cancel_delayed_work_sync(&hdev->le_scan_restart);
-
- if (hdev->adv_instance_timeout) {
- cancel_delayed_work_sync(&hdev->adv_instance_expire);
- hdev->adv_instance_timeout = 0;
- }
-}
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
deleted file mode 100644
index 692cc8b13368..000000000000
--- a/net/bluetooth/hci_request.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- BlueZ - Bluetooth protocol stack for Linux
- Copyright (C) 2014 Intel Corporation
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License version 2 as
- published by the Free Software Foundation;
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
- IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
- CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
- WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-
- ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
- COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
- SOFTWARE IS DISCLAIMED.
-*/
-
-#include <asm/unaligned.h>
-
-#define hci_req_sync_lock(hdev) mutex_lock(&hdev->req_lock)
-#define hci_req_sync_unlock(hdev) mutex_unlock(&hdev->req_lock)
-
-struct hci_request {
- struct hci_dev *hdev;
- struct sk_buff_head cmd_q;
-
- /* If something goes wrong when building the HCI request, the error
- * value is stored in this field.
- */
- int err;
-};
-
-void hci_req_init(struct hci_request *req, struct hci_dev *hdev);
-void hci_req_purge(struct hci_request *req);
-int hci_req_run(struct hci_request *req, hci_req_complete_t complete);
-int hci_req_run_skb(struct hci_request *req, hci_req_complete_skb_t complete);
-void hci_req_add(struct hci_request *req, u16 opcode, u32 plen,
- const void *param);
-void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen,
- const void *param, u8 event);
-void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status,
- hci_req_complete_t *req_complete,
- hci_req_complete_skb_t *req_complete_skb);
-
-int hci_req_sync(struct hci_dev *hdev, int (*req)(struct hci_request *req,
- unsigned long opt),
- unsigned long opt, u32 timeout, u8 *hci_status);
-int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
- unsigned long opt),
- unsigned long opt, u32 timeout, u8 *hci_status);
-void hci_req_sync_cancel(struct hci_dev *hdev, int err);
-
-struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen,
- const void *param);
-
-int __hci_req_hci_power_on(struct hci_dev *hdev);
-
-void __hci_req_write_fast_connectable(struct hci_request *req, bool enable);
-void __hci_req_update_name(struct hci_request *req);
-void __hci_req_update_eir(struct hci_request *req);
-
-void hci_req_add_le_scan_disable(struct hci_request *req);
-void hci_req_add_le_passive_scan(struct hci_request *req);
-
-void hci_req_reenable_advertising(struct hci_dev *hdev);
-void __hci_req_enable_advertising(struct hci_request *req);
-void __hci_req_disable_advertising(struct hci_request *req);
-void __hci_req_update_adv_data(struct hci_request *req, u8 instance);
-int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance);
-void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance);
-
-int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
- bool force);
-void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
- struct hci_request *req, u8 instance,
- bool force);
-
-int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance);
-int __hci_req_start_ext_adv(struct hci_request *req, u8 instance);
-void __hci_req_enable_ext_advertising(struct hci_request *req);
-void __hci_req_clear_ext_adv_sets(struct hci_request *req);
-int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
- bool use_rpa, struct adv_info *adv_instance,
- u8 *own_addr_type, bdaddr_t *rand_addr);
-
-void __hci_req_update_class(struct hci_request *req);
-
-/* Returns true if HCI commands were queued */
-bool hci_req_stop_discovery(struct hci_request *req);
-
-static inline void hci_req_update_scan(struct hci_dev *hdev)
-{
- queue_work(hdev->req_workqueue, &hdev->scan_update);
-}
-
-void __hci_req_update_scan(struct hci_request *req);
-
-int hci_update_random_address(struct hci_request *req, bool require_privacy,
- bool use_rpa, u8 *own_addr_type);
-
-int hci_abort_conn(struct hci_conn *conn, u8 reason);
-void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
- u8 reason);
-
-static inline void hci_update_background_scan(struct hci_dev *hdev)
-{
- queue_work(hdev->req_workqueue, &hdev->bg_scan_update);
-}
-
-void hci_request_setup(struct hci_dev *hdev);
-void hci_request_cancel_all(struct hci_dev *hdev);
-
-u8 append_local_name(struct hci_dev *hdev, u8 *ptr, u8 ad_len);
-
-static inline u16 eir_append_data(u8 *eir, u16 eir_len, u8 type,
- u8 *data, u8 data_len)
-{
- eir[eir_len++] = sizeof(type) + data_len;
- eir[eir_len++] = type;
- memcpy(&eir[eir_len], data, data_len);
- eir_len += data_len;
-
- return eir_len;
-}
-
-static inline u16 eir_append_le16(u8 *eir, u16 eir_len, u8 type, u16 data)
-{
- eir[eir_len++] = sizeof(type) + sizeof(data);
- eir[eir_len++] = type;
- put_unaligned_le16(data, &eir[eir_len]);
- eir_len += sizeof(data);
-
- return eir_len;
-}
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 1506e1632394..4e7bf63af9c5 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -23,11 +23,11 @@
*/
/* Bluetooth HCI sockets. */
-
+#include <linux/compat.h>
#include <linux/export.h>
#include <linux/utsname.h>
#include <linux/sched.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
@@ -52,13 +52,25 @@ struct hci_pinfo {
struct bt_sock bt;
struct hci_dev *hdev;
struct hci_filter filter;
- __u32 cmsg_mask;
+ __u8 cmsg_mask;
unsigned short channel;
unsigned long flags;
__u32 cookie;
char comm[TASK_COMM_LEN];
+ __u16 mtu;
};
+static struct hci_dev *hci_hdev_from_sock(struct sock *sk)
+{
+ struct hci_dev *hdev = hci_pi(sk)->hdev;
+
+ if (!hdev)
+ return ERR_PTR(-EBADFD);
+ if (hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ return ERR_PTR(-EPIPE);
+ return hdev;
+}
+
void hci_sock_set_flag(struct sock *sk, int nr)
{
set_bit(nr, &hci_pi(sk)->flags);
@@ -89,7 +101,7 @@ static bool hci_sock_gen_cookie(struct sock *sk)
int id = hci_pi(sk)->cookie;
if (!id) {
- id = ida_simple_get(&sock_cookie_ida, 1, 0, GFP_KERNEL);
+ id = ida_alloc_min(&sock_cookie_ida, 1, GFP_KERNEL);
if (id < 0)
id = 0xffffffff;
@@ -106,8 +118,8 @@ static void hci_sock_free_cookie(struct sock *sk)
int id = hci_pi(sk)->cookie;
if (id) {
- hci_pi(sk)->cookie = 0xffffffff;
- ida_simple_remove(&sock_cookie_ida, id);
+ hci_pi(sk)->cookie = 0;
+ ida_free(&sock_cookie_ida, id);
}
}
@@ -211,7 +223,8 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT &&
hci_skb_pkt_type(skb) != HCI_EVENT_PKT &&
hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
- hci_skb_pkt_type(skb) != HCI_SCODATA_PKT)
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ISODATA_PKT)
continue;
if (is_filtered_packet(sk, skb))
continue;
@@ -220,7 +233,9 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
continue;
if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT &&
hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
- hci_skb_pkt_type(skb) != HCI_SCODATA_PKT)
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ISODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_DRV_PKT)
continue;
} else {
/* Don't send frame to other channel types */
@@ -250,6 +265,53 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
kfree_skb(skb_copy);
}
+static void hci_sock_copy_creds(struct sock *sk, struct sk_buff *skb)
+{
+ struct scm_creds *creds;
+
+ if (!sk || WARN_ON(!skb))
+ return;
+
+ creds = &bt_cb(skb)->creds;
+
+ /* Check if peer credentials is set */
+ if (!sk->sk_peer_pid) {
+ /* Check if parent peer credentials is set */
+ if (bt_sk(sk)->parent && bt_sk(sk)->parent->sk_peer_pid)
+ sk = bt_sk(sk)->parent;
+ else
+ return;
+ }
+
+ /* Check if scm_creds already set */
+ if (creds->pid == pid_vnr(sk->sk_peer_pid))
+ return;
+
+ memset(creds, 0, sizeof(*creds));
+
+ creds->pid = pid_vnr(sk->sk_peer_pid);
+ if (sk->sk_peer_cred) {
+ creds->uid = sk->sk_peer_cred->uid;
+ creds->gid = sk->sk_peer_cred->gid;
+ }
+}
+
+static struct sk_buff *hci_skb_clone(struct sk_buff *skb)
+{
+ struct sk_buff *nskb;
+
+ if (!skb)
+ return NULL;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+
+ hci_sock_copy_creds(skb->sk, nskb);
+
+ return nskb;
+}
+
/* Send frame to sockets with specific channel */
static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
int flag, struct sock *skip_sk)
@@ -275,7 +337,7 @@ static void __hci_send_to_channel(unsigned short channel, struct sk_buff *skb,
if (hci_pi(sk)->channel != channel)
continue;
- nskb = skb_clone(skb, GFP_ATOMIC);
+ nskb = hci_skb_clone(skb);
if (!nskb)
continue;
@@ -324,6 +386,18 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb)
else
opcode = cpu_to_le16(HCI_MON_SCO_TX_PKT);
break;
+ case HCI_ISODATA_PKT:
+ if (bt_cb(skb)->incoming)
+ opcode = cpu_to_le16(HCI_MON_ISO_RX_PKT);
+ else
+ opcode = cpu_to_le16(HCI_MON_ISO_TX_PKT);
+ break;
+ case HCI_DRV_PKT:
+ if (bt_cb(skb)->incoming)
+ opcode = cpu_to_le16(HCI_MON_DRV_RX_PKT);
+ else
+ opcode = cpu_to_le16(HCI_MON_DRV_TX_PKT);
+ break;
case HCI_DIAG_PKT:
opcode = cpu_to_le16(HCI_MON_VENDOR_DIAG);
break;
@@ -336,6 +410,8 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb)
if (!skb_copy)
return;
+ hci_sock_copy_creds(skb->sk, skb_copy);
+
/* Put header before the data */
hdr = skb_push(skb_copy, HCI_MON_HDR_SIZE);
hdr->opcode = opcode;
@@ -416,10 +492,11 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
return NULL;
ni = skb_put(skb, HCI_MON_NEW_INDEX_SIZE);
- ni->type = hdev->dev_type;
+ ni->type = 0x00; /* Old hdev->dev_type */
ni->bus = hdev->bus;
bacpy(&ni->bdaddr, &hdev->bdaddr);
- memcpy(ni->name, hdev->name, 8);
+ memcpy_and_pad(ni->name, sizeof(ni->name), hdev->name,
+ strnlen(hdev->name, sizeof(ni->name)), '\0');
opcode = cpu_to_le16(HCI_MON_NEW_INDEX);
break;
@@ -435,8 +512,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
case HCI_DEV_SETUP:
if (hdev->manufacturer == 0xffff)
return NULL;
-
- /* fall through */
+ fallthrough;
case HCI_DEV_UP:
skb = bt_skb_alloc(HCI_MON_INDEX_INFO_SIZE, GFP_ATOMIC);
@@ -512,10 +588,12 @@ static struct sk_buff *create_monitor_ctrl_open(struct sock *sk)
return NULL;
}
- skb = bt_skb_alloc(14 + TASK_COMM_LEN , GFP_ATOMIC);
+ skb = bt_skb_alloc(14 + TASK_COMM_LEN, GFP_ATOMIC);
if (!skb)
return NULL;
+ hci_sock_copy_creds(sk, skb);
+
flags = hci_sock_test_flag(sk, HCI_SOCK_TRUSTED) ? 0x1 : 0x0;
put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
@@ -561,6 +639,8 @@ static struct sk_buff *create_monitor_ctrl_close(struct sock *sk)
if (!skb)
return NULL;
+ hci_sock_copy_creds(sk, skb);
+
put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
__net_timestamp(skb);
@@ -587,6 +667,8 @@ static struct sk_buff *create_monitor_ctrl_command(struct sock *sk, u16 index,
if (!skb)
return NULL;
+ hci_sock_copy_creds(sk, skb);
+
put_unaligned_le32(hci_pi(sk)->cookie, skb_put(skb, 4));
put_unaligned_le16(opcode, skb_put(skb, 2));
@@ -619,6 +701,8 @@ send_monitor_note(struct sock *sk, const char *fmt, ...)
if (!skb)
return;
+ hci_sock_copy_creds(sk, skb);
+
va_start(args, fmt);
vsprintf(skb_put(skb, len), fmt, args);
*(u8 *)skb_put(skb, 1) = 0;
@@ -752,19 +836,13 @@ void hci_sock_dev_event(struct hci_dev *hdev, int event)
if (event == HCI_DEV_UNREG) {
struct sock *sk;
- /* Detach sockets from device */
+ /* Wake up sockets using this dead device */
read_lock(&hci_sk_list.lock);
sk_for_each(sk, &hci_sk_list.head) {
- bh_lock_sock_nested(sk);
if (hci_pi(sk)->hdev == hdev) {
- hci_pi(sk)->hdev = NULL;
sk->sk_err = EPIPE;
- sk->sk_state = BT_OPEN;
sk->sk_state_change(sk);
-
- hci_dev_put(hdev);
}
- bh_unlock_sock(sk);
}
read_unlock(&hci_sk_list.lock);
}
@@ -831,7 +909,7 @@ static int hci_sock_release(struct socket *sock)
if (!sk)
return 0;
- hdev = hci_pi(sk)->hdev;
+ lock_sock(sk);
switch (hci_pi(sk)->channel) {
case HCI_CHANNEL_MONITOR:
@@ -854,8 +932,10 @@ static int hci_sock_release(struct socket *sock)
bt_sock_unlink(&hci_sk_list, sk);
+ hdev = hci_pi(sk)->hdev;
if (hdev) {
- if (hci_pi(sk)->channel == HCI_CHANNEL_USER) {
+ if (hci_pi(sk)->channel == HCI_CHANNEL_USER &&
+ !hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
/* When releasing a user channel exclusive access,
* call hci_dev_do_close directly instead of calling
* hci_dev_close to ensure the exclusive access will
@@ -864,6 +944,11 @@ static int hci_sock_release(struct socket *sock)
* The checking of HCI_AUTO_OFF is not needed in this
* case since it will have been cleared already when
* opening the user channel.
+ *
+ * Make sure to also check that we haven't already
+ * unregistered since all the cleanup will have already
+ * been complete and hdev will get released when we put
+ * below.
*/
hci_dev_do_close(hdev);
hci_dev_clear_flag(hdev, HCI_USER_CHANNEL);
@@ -875,15 +960,12 @@ static int hci_sock_release(struct socket *sock)
}
sock_orphan(sk);
-
- skb_queue_purge(&sk->sk_receive_queue);
- skb_queue_purge(&sk->sk_write_queue);
-
+ release_sock(sk);
sock_put(sk);
return 0;
}
-static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg)
+static int hci_sock_reject_list_add(struct hci_dev *hdev, void __user *arg)
{
bdaddr_t bdaddr;
int err;
@@ -893,14 +975,14 @@ static int hci_sock_blacklist_add(struct hci_dev *hdev, void __user *arg)
hci_dev_lock(hdev);
- err = hci_bdaddr_list_add(&hdev->blacklist, &bdaddr, BDADDR_BREDR);
+ err = hci_bdaddr_list_add(&hdev->reject_list, &bdaddr, BDADDR_BREDR);
hci_dev_unlock(hdev);
return err;
}
-static int hci_sock_blacklist_del(struct hci_dev *hdev, void __user *arg)
+static int hci_sock_reject_list_del(struct hci_dev *hdev, void __user *arg)
{
bdaddr_t bdaddr;
int err;
@@ -910,7 +992,7 @@ static int hci_sock_blacklist_del(struct hci_dev *hdev, void __user *arg)
hci_dev_lock(hdev);
- err = hci_bdaddr_list_del(&hdev->blacklist, &bdaddr, BDADDR_BREDR);
+ err = hci_bdaddr_list_del(&hdev->reject_list, &bdaddr, BDADDR_BREDR);
hci_dev_unlock(hdev);
@@ -921,10 +1003,10 @@ static int hci_sock_blacklist_del(struct hci_dev *hdev, void __user *arg)
static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd,
unsigned long arg)
{
- struct hci_dev *hdev = hci_pi(sk)->hdev;
+ struct hci_dev *hdev = hci_hdev_from_sock(sk);
- if (!hdev)
- return -EBADFD;
+ if (IS_ERR(hdev))
+ return PTR_ERR(hdev);
if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL))
return -EBUSY;
@@ -932,9 +1014,6 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd,
if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
return -EOPNOTSUPP;
- if (hdev->dev_type != HCI_PRIMARY)
- return -EOPNOTSUPP;
-
switch (cmd) {
case HCISETRAW:
if (!capable(CAP_NET_ADMIN))
@@ -950,12 +1029,12 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd,
case HCIBLOCKADDR:
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- return hci_sock_blacklist_add(hdev, (void __user *)arg);
+ return hci_sock_reject_list_add(hdev, (void __user *)arg);
case HCIUNBLOCKADDR:
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- return hci_sock_blacklist_del(hdev, (void __user *)arg);
+ return hci_sock_reject_list_del(hdev, (void __user *)arg);
}
return -ENOIOCTLCMD;
@@ -970,6 +1049,34 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd,
BT_DBG("cmd %x arg %lx", cmd, arg);
+ /* Make sure the cmd is valid before doing anything */
+ switch (cmd) {
+ case HCIGETDEVLIST:
+ case HCIGETDEVINFO:
+ case HCIGETCONNLIST:
+ case HCIDEVUP:
+ case HCIDEVDOWN:
+ case HCIDEVRESET:
+ case HCIDEVRESTAT:
+ case HCISETSCAN:
+ case HCISETAUTH:
+ case HCISETENCRYPT:
+ case HCISETPTYPE:
+ case HCISETLINKPOL:
+ case HCISETLINKMODE:
+ case HCISETACLMTU:
+ case HCISETSCOMTU:
+ case HCIINQUIRY:
+ case HCISETRAW:
+ case HCIGETCONNINFO:
+ case HCIGETAUTHINFO:
+ case HCIBLOCKADDR:
+ case HCIUNBLOCKADDR:
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
lock_sock(sk);
if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) {
@@ -986,7 +1093,14 @@ static int hci_sock_ioctl(struct socket *sock, unsigned int cmd,
if (hci_sock_gen_cookie(sk)) {
struct sk_buff *skb;
- if (capable(CAP_NET_ADMIN))
+ /* Perform careful checks before setting the HCI_SOCK_TRUSTED
+ * flag. Make sure that not only the current task but also
+ * the socket opener has the required capability, since
+ * privileged programs can be tricked into making ioctl calls
+ * on HCI sockets, and the socket should not be marked as
+ * trusted simply because the ioctl caller is privileged.
+ */
+ if (sk_capable(sk, CAP_NET_ADMIN))
hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
/* Send event to monitor */
@@ -1055,7 +1169,23 @@ done:
return err;
}
-static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
+#ifdef CONFIG_COMPAT
+static int hci_sock_compat_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case HCIDEVUP:
+ case HCIDEVDOWN:
+ case HCIDEVRESET:
+ case HCIDEVRESTAT:
+ return hci_sock_ioctl(sock, cmd, arg);
+ }
+
+ return hci_sock_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+static int hci_sock_bind(struct socket *sock, struct sockaddr_unsized *addr,
int addr_len)
{
struct sockaddr_hci haddr;
@@ -1078,6 +1208,18 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
lock_sock(sk);
+ /* Allow detaching from dead device and attaching to alive device, if
+ * the caller wants to re-bind (instead of close) this socket in
+ * response to hci_sock_dev_event(HCI_DEV_UNREG) notification.
+ */
+ hdev = hci_pi(sk)->hdev;
+ if (hdev && hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
+ hci_pi(sk)->hdev = NULL;
+ sk->sk_state = BT_OPEN;
+ hci_dev_put(hdev);
+ }
+ hdev = NULL;
+
if (sk->sk_state == BT_BOUND) {
err = -EALREADY;
goto done;
@@ -1105,7 +1247,7 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
if (!hci_sock_gen_cookie(sk)) {
/* In the case when a cookie has already been assigned,
* then there has been already an ioctl issued against
- * an unbound socket and with that triggerd an open
+ * an unbound socket and with that triggered an open
* notification. Send a close notification first to
* allow the state transition to bounded.
*/
@@ -1169,7 +1311,9 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
goto done;
}
+ hci_dev_lock(hdev);
mgmt_index_removed(hdev);
+ hci_dev_unlock(hdev);
err = hci_dev_open(hdev->id);
if (err) {
@@ -1301,9 +1445,9 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
if (hci_pi(sk)->channel == HCI_CHANNEL_CONTROL) {
if (!hci_sock_gen_cookie(sk)) {
/* In the case when a cookie has already been
- * assigned, this socket will transtion from
+ * assigned, this socket will transition from
* a raw socket into a control socket. To
- * allow for a clean transtion, send the
+ * allow for a clean transition, send the
* close notification first.
*/
skb = create_monitor_ctrl_close(sk);
@@ -1332,6 +1476,10 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
break;
}
+ /* Default MTU to HCI_MAX_FRAME_SIZE if not set */
+ if (!hci_pi(sk)->mtu)
+ hci_pi(sk)->mtu = HCI_MAX_FRAME_SIZE;
+
sk->sk_state = BT_BOUND;
done:
@@ -1354,9 +1502,9 @@ static int hci_sock_getname(struct socket *sock, struct sockaddr *addr,
lock_sock(sk);
- hdev = hci_pi(sk)->hdev;
- if (!hdev) {
- err = -EBADFD;
+ hdev = hci_hdev_from_sock(sk);
+ if (IS_ERR(hdev)) {
+ err = PTR_ERR(hdev);
goto done;
}
@@ -1373,7 +1521,7 @@ done:
static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg,
struct sk_buff *skb)
{
- __u32 mask = hci_pi(sk)->cmsg_mask;
+ __u8 mask = hci_pi(sk)->cmsg_mask;
if (mask & HCI_CMSG_DIR) {
int incoming = bt_cb(skb)->incoming;
@@ -1383,9 +1531,9 @@ static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg,
if (mask & HCI_CMSG_TSTAMP) {
#ifdef CONFIG_COMPAT
- struct compat_timeval ctv;
+ struct old_timeval32 ctv;
#endif
- struct timeval tv;
+ struct __kernel_old_timeval tv;
void *data;
int len;
@@ -1410,7 +1558,7 @@ static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg,
static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg,
size_t len, int flags)
{
- int noblock = flags & MSG_DONTWAIT;
+ struct scm_cookie scm;
struct sock *sk = sock->sk;
struct sk_buff *skb;
int copied, err;
@@ -1427,7 +1575,7 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg,
if (sk->sk_state == BT_CLOSED)
return 0;
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
return err;
@@ -1455,18 +1603,22 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg,
break;
}
+ memset(&scm, 0, sizeof(scm));
+ scm.creds = bt_cb(skb)->creds;
+
skb_free_datagram(sk, skb);
if (flags & MSG_TRUNC)
copied = skblen;
+ scm_recv(sock, msg, &scm, flags);
+
return err ? : copied;
}
static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk,
- struct msghdr *msg, size_t msglen)
+ struct sk_buff *skb)
{
- void *buf;
u8 *cp;
struct mgmt_hdr *hdr;
u16 opcode, index, len;
@@ -1475,40 +1627,31 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk,
bool var_len, no_hdev;
int err;
- BT_DBG("got %zu bytes", msglen);
+ BT_DBG("got %d bytes", skb->len);
- if (msglen < sizeof(*hdr))
+ if (skb->len < sizeof(*hdr))
return -EINVAL;
- buf = kmalloc(msglen, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- if (memcpy_from_msg(buf, msg, msglen)) {
- err = -EFAULT;
- goto done;
- }
-
- hdr = buf;
+ hdr = (void *)skb->data;
opcode = __le16_to_cpu(hdr->opcode);
index = __le16_to_cpu(hdr->index);
len = __le16_to_cpu(hdr->len);
- if (len != msglen - sizeof(*hdr)) {
+ if (len != skb->len - sizeof(*hdr)) {
err = -EINVAL;
goto done;
}
if (chan->channel == HCI_CHANNEL_CONTROL) {
- struct sk_buff *skb;
+ struct sk_buff *cmd;
/* Send event to monitor */
- skb = create_monitor_ctrl_command(sk, index, opcode, len,
- buf + sizeof(*hdr));
- if (skb) {
- hci_send_to_channel(HCI_CHANNEL_MONITOR, skb,
+ cmd = create_monitor_ctrl_command(sk, index, opcode, len,
+ skb->data + sizeof(*hdr));
+ if (cmd) {
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, cmd,
HCI_SOCK_TRUSTED, NULL);
- kfree_skb(skb);
+ kfree_skb(cmd);
}
}
@@ -1553,11 +1696,13 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk,
}
}
- no_hdev = (handler->flags & HCI_MGMT_NO_HDEV);
- if (no_hdev != !hdev) {
- err = mgmt_cmd_status(sk, index, opcode,
- MGMT_STATUS_INVALID_INDEX);
- goto done;
+ if (!(handler->flags & HCI_MGMT_HDEV_OPTIONAL)) {
+ no_hdev = (handler->flags & HCI_MGMT_NO_HDEV);
+ if (no_hdev != !hdev) {
+ err = mgmt_cmd_status(sk, index, opcode,
+ MGMT_STATUS_INVALID_INDEX);
+ goto done;
+ }
}
var_len = (handler->flags & HCI_MGMT_VAR_LEN);
@@ -1571,26 +1716,25 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk,
if (hdev && chan->hdev_init)
chan->hdev_init(sk, hdev);
- cp = buf + sizeof(*hdr);
+ cp = skb->data + sizeof(*hdr);
err = handler->func(sk, hdev, cp, len);
if (err < 0)
goto done;
- err = msglen;
+ err = skb->len;
done:
if (hdev)
hci_dev_put(hdev);
- kfree(buf);
return err;
}
-static int hci_logging_frame(struct sock *sk, struct msghdr *msg, int len)
+static int hci_logging_frame(struct sock *sk, struct sk_buff *skb,
+ unsigned int flags)
{
struct hci_mon_hdr *hdr;
- struct sk_buff *skb;
struct hci_dev *hdev;
u16 index;
int err;
@@ -1599,24 +1743,13 @@ static int hci_logging_frame(struct sock *sk, struct msghdr *msg, int len)
* the priority byte, the ident length byte and at least one string
* terminator NUL byte. Anything shorter are invalid packets.
*/
- if (len < sizeof(*hdr) + 3)
+ if (skb->len < sizeof(*hdr) + 3)
return -EINVAL;
- skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err);
- if (!skb)
- return err;
-
- if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
- err = -EFAULT;
- goto drop;
- }
-
hdr = (void *)skb->data;
- if (__le16_to_cpu(hdr->len) != len - sizeof(*hdr)) {
- err = -EINVAL;
- goto drop;
- }
+ if (__le16_to_cpu(hdr->len) != skb->len - sizeof(*hdr))
+ return -EINVAL;
if (__le16_to_cpu(hdr->opcode) == 0x0000) {
__u8 priority = skb->data[sizeof(*hdr)];
@@ -1635,25 +1768,20 @@ static int hci_logging_frame(struct sock *sk, struct msghdr *msg, int len)
* The message follows the ident string (if present) and
* must be NUL terminated. Otherwise it is not a valid packet.
*/
- if (priority > 7 || skb->data[len - 1] != 0x00 ||
- ident_len > len - sizeof(*hdr) - 3 ||
- skb->data[sizeof(*hdr) + ident_len + 1] != 0x00) {
- err = -EINVAL;
- goto drop;
- }
+ if (priority > 7 || skb->data[skb->len - 1] != 0x00 ||
+ ident_len > skb->len - sizeof(*hdr) - 3 ||
+ skb->data[sizeof(*hdr) + ident_len + 1] != 0x00)
+ return -EINVAL;
} else {
- err = -EINVAL;
- goto drop;
+ return -EINVAL;
}
index = __le16_to_cpu(hdr->index);
if (index != MGMT_INDEX_NONE) {
hdev = hci_dev_get(index);
- if (!hdev) {
- err = -ENODEV;
- goto drop;
- }
+ if (!hdev)
+ return -ENODEV;
} else {
hdev = NULL;
}
@@ -1661,13 +1789,11 @@ static int hci_logging_frame(struct sock *sk, struct msghdr *msg, int len)
hdr->opcode = cpu_to_le16(HCI_MON_USER_LOGGING);
hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL);
- err = len;
+ err = skb->len;
if (hdev)
hci_dev_put(hdev);
-drop:
- kfree_skb(skb);
return err;
}
@@ -1679,19 +1805,23 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
struct hci_dev *hdev;
struct sk_buff *skb;
int err;
+ const unsigned int flags = msg->msg_flags;
BT_DBG("sock %p sk %p", sock, sk);
- if (msg->msg_flags & MSG_OOB)
+ if (flags & MSG_OOB)
return -EOPNOTSUPP;
- if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_NOSIGNAL|MSG_ERRQUEUE|
- MSG_CMSG_COMPAT))
+ if (flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL | MSG_ERRQUEUE | MSG_CMSG_COMPAT))
return -EINVAL;
- if (len < 4 || len > HCI_MAX_FRAME_SIZE)
+ if (len < 4 || len > hci_pi(sk)->mtu)
return -EINVAL;
+ skb = bt_skb_sendmsg(sk, msg, len, len, 0, 0);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
lock_sock(sk);
switch (hci_pi(sk)->channel) {
@@ -1700,39 +1830,30 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
break;
case HCI_CHANNEL_MONITOR:
err = -EOPNOTSUPP;
- goto done;
+ goto drop;
case HCI_CHANNEL_LOGGING:
- err = hci_logging_frame(sk, msg, len);
- goto done;
+ err = hci_logging_frame(sk, skb, flags);
+ goto drop;
default:
mutex_lock(&mgmt_chan_list_lock);
chan = __hci_mgmt_chan_find(hci_pi(sk)->channel);
if (chan)
- err = hci_mgmt_cmd(chan, sk, msg, len);
+ err = hci_mgmt_cmd(chan, sk, skb);
else
err = -EINVAL;
mutex_unlock(&mgmt_chan_list_lock);
- goto done;
+ goto drop;
}
- hdev = hci_pi(sk)->hdev;
- if (!hdev) {
- err = -EBADFD;
- goto done;
+ hdev = hci_hdev_from_sock(sk);
+ if (IS_ERR(hdev)) {
+ err = PTR_ERR(hdev);
+ goto drop;
}
if (!test_bit(HCI_UP, &hdev->flags)) {
err = -ENETDOWN;
- goto done;
- }
-
- skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err);
- if (!skb)
- goto done;
-
- if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
- err = -EFAULT;
goto drop;
}
@@ -1747,7 +1868,9 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
*/
if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT &&
hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
- hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) {
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ISODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_DRV_PKT) {
err = -EINVAL;
goto drop;
}
@@ -1791,7 +1914,8 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
}
if (hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
- hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) {
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ISODATA_PKT) {
err = -EINVAL;
goto drop;
}
@@ -1811,8 +1935,8 @@ drop:
goto done;
}
-static int hci_sock_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int len)
+static int hci_sock_setsockopt_old(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct hci_ufilter uf = { .opcode = 0 };
struct sock *sk = sock->sk;
@@ -1820,9 +1944,6 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname,
BT_DBG("sk %p, opt %d", sk, optname);
- if (level != SOL_HCI)
- return -ENOPROTOOPT;
-
lock_sock(sk);
if (hci_pi(sk)->channel != HCI_CHANNEL_RAW) {
@@ -1832,10 +1953,9 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname,
switch (optname) {
case HCI_DATA_DIR:
- if (get_user(opt, (int __user *)optval)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
break;
- }
if (opt)
hci_pi(sk)->cmsg_mask |= HCI_CMSG_DIR;
@@ -1844,10 +1964,9 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname,
break;
case HCI_TIME_STAMP:
- if (get_user(opt, (int __user *)optval)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
break;
- }
if (opt)
hci_pi(sk)->cmsg_mask |= HCI_CMSG_TSTAMP;
@@ -1865,11 +1984,9 @@ static int hci_sock_setsockopt(struct socket *sock, int level, int optname,
uf.event_mask[1] = *((u32 *) f->event_mask + 1);
}
- len = min_t(unsigned int, len, sizeof(uf));
- if (copy_from_user(&uf, optval, len)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&uf, sizeof(uf), optval, optlen);
+ if (err)
break;
- }
if (!capable(CAP_NET_RAW)) {
uf.type_mask &= hci_sec_filter.type_mask;
@@ -1897,18 +2014,63 @@ done:
return err;
}
-static int hci_sock_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
+static int hci_sock_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
- struct hci_ufilter uf;
struct sock *sk = sock->sk;
- int len, opt, err = 0;
+ int err = 0;
+ u16 opt;
BT_DBG("sk %p, opt %d", sk, optname);
- if (level != SOL_HCI)
+ if (level == SOL_HCI)
+ return hci_sock_setsockopt_old(sock, level, optname, optval,
+ optlen);
+
+ if (level != SOL_BLUETOOTH)
return -ENOPROTOOPT;
+ lock_sock(sk);
+
+ switch (optname) {
+ case BT_SNDMTU:
+ case BT_RCVMTU:
+ switch (hci_pi(sk)->channel) {
+ /* Don't allow changing MTU for channels that are meant for HCI
+ * traffic only.
+ */
+ case HCI_CHANNEL_RAW:
+ case HCI_CHANNEL_USER:
+ err = -ENOPROTOOPT;
+ goto done;
+ }
+
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
+ break;
+
+ hci_pi(sk)->mtu = opt;
+ break;
+
+ default:
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+done:
+ release_sock(sk);
+ return err;
+}
+
+static int hci_sock_getsockopt_old(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct hci_ufilter uf;
+ struct sock *sk = sock->sk;
+ int len, opt, err = 0;
+
+ BT_DBG("sk %p, opt %d", sk, optname);
+
if (get_user(len, optlen))
return -EFAULT;
@@ -1966,6 +2128,46 @@ done:
return err;
}
+/* getsockopt() handler for HCI sockets.
+ *
+ * SOL_HCI requests are delegated to hci_sock_getsockopt_old(). For
+ * SOL_BLUETOOTH only BT_SNDMTU and BT_RCVMTU are handled, and both report
+ * hci_pi(sk)->mtu. Any other level or option yields -ENOPROTOOPT; a failed
+ * copy to user space yields -EFAULT.
+ */
+static int hci_sock_getsockopt(struct socket *sock, int level, int optname,
+			       char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	BT_DBG("sk %p, opt %d", sk, optname);
+
+	if (level == SOL_HCI)
+		return hci_sock_getsockopt_old(sock, level, optname, optval,
+					       optlen);
+
+	if (level != SOL_BLUETOOTH)
+		return -ENOPROTOOPT;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case BT_SNDMTU:
+	case BT_RCVMTU:
+		/* NOTE(review): optlen is not consulted on this path, so a
+		 * u16 is written whatever length the caller passed in -
+		 * confirm this is intended.
+		 */
+		if (put_user(hci_pi(sk)->mtu, (u16 __user *)optval))
+			err = -EFAULT;
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+/* Socket destructor, installed as sk->sk_destruct in hci_sock_create().
+ * Runs mgmt_cleanup() for the socket and then drops every skb still sitting
+ * on its receive and write queues.
+ */
+static void hci_sock_destruct(struct sock *sk)
+{
+	mgmt_cleanup(sk);
+	skb_queue_purge(&sk->sk_receive_queue);
+	skb_queue_purge(&sk->sk_write_queue);
+}
+
static const struct proto_ops hci_sock_ops = {
.family = PF_BLUETOOTH,
.owner = THIS_MODULE,
@@ -1975,6 +2177,9 @@ static const struct proto_ops hci_sock_ops = {
.sendmsg = hci_sock_sendmsg,
.recvmsg = hci_sock_recvmsg,
.ioctl = hci_sock_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = hci_sock_compat_ioctl,
+#endif
.poll = datagram_poll,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
@@ -2004,18 +2209,13 @@ static int hci_sock_create(struct net *net, struct socket *sock, int protocol,
sock->ops = &hci_sock_ops;
- sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hci_sk_proto, kern);
+ sk = bt_sock_alloc(net, sock, &hci_sk_proto, protocol, GFP_ATOMIC,
+ kern);
if (!sk)
return -ENOMEM;
- sock_init_data(sock, sk);
-
- sock_reset_flag(sk, SOCK_ZAPPED);
-
- sk->sk_protocol = protocol;
-
sock->state = SS_UNCONNECTED;
- sk->sk_state = BT_OPEN;
+ sk->sk_destruct = hci_sock_destruct;
bt_sock_link(&hci_sk_list, sk);
return 0;
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
new file mode 100644
index 000000000000..a9f5b1a68356
--- /dev/null
+++ b/net/bluetooth/hci_sync.c
@@ -0,0 +1,7420 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BlueZ - Bluetooth protocol stack for Linux
+ *
+ * Copyright (C) 2021 Intel Corporation
+ * Copyright 2023 NXP
+ */
+
+#include <linux/property.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "hci_codec.h"
+#include "hci_debugfs.h"
+#include "smp.h"
+#include "eir.h"
+#include "msft.h"
+#include "aosp.h"
+#include "leds.h"
+
+/* Completion callback for a synchronous request; hci_req_sync_run() installs
+ * it as req_complete_skb on the last command of the request.
+ *
+ * Does nothing unless a request is pending. Otherwise it records @result,
+ * marks the request HCI_REQ_DONE, keeps a reference to the response @skb (if
+ * any) in hdev->req_rsp and wakes the waiter on hdev->req_wait_q. @opcode is
+ * not used here.
+ */
+static void hci_cmd_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode,
+				  struct sk_buff *skb)
+{
+	bt_dev_dbg(hdev, "result 0x%2.2x", result);
+
+	/* Nobody is waiting (e.g. already canceled or timed out) */
+	if (hdev->req_status != HCI_REQ_PEND)
+		return;
+
+	hdev->req_result = result;
+	hdev->req_status = HCI_REQ_DONE;
+
+	/* Free the request command so it is not used as response */
+	kfree_skb(hdev->req_skb);
+	hdev->req_skb = NULL;
+
+	if (skb) {
+		struct sock *sk = hci_skb_sk(skb);
+
+		/* Drop sk reference if set */
+		if (sk)
+			sock_put(sk);
+
+		/* skb_get() takes the reference that the waiter later owns */
+		hdev->req_rsp = skb_get(skb);
+	}
+
+	wake_up_interruptible(&hdev->req_wait_q);
+}
+
+/* Build an HCI command skb: a command header carrying @opcode and @plen,
+ * followed by @plen bytes copied from @param. The skb is tagged as
+ * HCI_COMMAND_PKT with @opcode. When @sk is non-NULL it is stored in the skb
+ * and a reference is taken on it.
+ *
+ * Returns the new skb, or NULL if it cannot be allocated.
+ */
+struct sk_buff *hci_cmd_sync_alloc(struct hci_dev *hdev, u16 opcode, u32 plen,
+				   const void *param, struct sock *sk)
+{
+	int len = HCI_COMMAND_HDR_SIZE + plen;
+	struct hci_command_hdr *hdr;
+	struct sk_buff *skb;
+
+	skb = bt_skb_alloc(len, GFP_ATOMIC);
+	if (!skb)
+		return NULL;
+
+	hdr = skb_put(skb, HCI_COMMAND_HDR_SIZE);
+	hdr->opcode = cpu_to_le16(opcode);
+	/* NOTE(review): plen is a u32 here - confirm callers never pass more
+	 * than the header's plen field can hold.
+	 */
+	hdr->plen = plen;
+
+	if (plen)
+		skb_put_data(skb, param, plen);
+
+	bt_dev_dbg(hdev, "skb len %d", skb->len);
+
+	hci_skb_pkt_type(skb) = HCI_COMMAND_PKT;
+	hci_skb_opcode(skb) = opcode;
+
+	/* Grab a reference if command needs to be associated with a sock (e.g.
+	 * likely mgmt socket that initiated the command).
+	 */
+	if (sk) {
+		hci_skb_sk(skb) = sk;
+		sock_hold(sk);
+	}
+
+	return skb;
+}
+
+/* Allocate a command skb for @opcode/@param and append it to @req's command
+ * queue, recording @event on the skb.
+ *
+ * Nothing is queued when @req already carries an error. If the skb cannot be
+ * allocated, req->err is set to -ENOMEM instead.
+ */
+static void hci_cmd_sync_add(struct hci_request *req, u16 opcode, u32 plen,
+			     const void *param, u8 event, struct sock *sk)
+{
+	struct hci_dev *hdev = req->hdev;
+	struct sk_buff *skb;
+
+	bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen);
+
+	/* If an error occurred during request building, there is no point in
+	 * queueing the HCI command. We can simply return.
+	 */
+	if (req->err)
+		return;
+
+	skb = hci_cmd_sync_alloc(hdev, opcode, plen, param, sk);
+	if (!skb) {
+		bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)",
+			   opcode);
+		req->err = -ENOMEM;
+		return;
+	}
+
+	/* The first command of a request is flagged with HCI_REQ_START */
+	if (skb_queue_empty(&req->cmd_q))
+		bt_cb(skb)->hci.req_flags |= HCI_REQ_START;
+
+	hci_skb_event(skb) = event;
+
+	skb_queue_tail(&req->cmd_q, skb);
+}
+
+/* Hand the commands collected on @req over to hdev->cmd_q and schedule
+ * hdev->cmd_work on hdev->workqueue.
+ *
+ * Returns req->err (after purging the request queue) if building the request
+ * failed, -ENODATA if the request is empty, and 0 once the commands have been
+ * spliced onto the device queue.
+ */
+static int hci_req_sync_run(struct hci_request *req)
+{
+	struct hci_dev *hdev = req->hdev;
+	struct sk_buff *skb;
+	unsigned long flags;
+
+	bt_dev_dbg(hdev, "length %u", skb_queue_len(&req->cmd_q));
+
+	/* If an error occurred during request building, remove all HCI
+	 * commands queued on the HCI request queue.
+	 */
+	if (req->err) {
+		skb_queue_purge(&req->cmd_q);
+		return req->err;
+	}
+
+	/* Do not allow empty requests */
+	if (skb_queue_empty(&req->cmd_q))
+		return -ENODATA;
+
+	/* Only the last command of the request carries the completion
+	 * callback.
+	 */
+	skb = skb_peek_tail(&req->cmd_q);
+	bt_cb(skb)->hci.req_complete_skb = hci_cmd_sync_complete;
+	bt_cb(skb)->hci.req_flags |= HCI_REQ_SKB;
+
+	spin_lock_irqsave(&hdev->cmd_q.lock, flags);
+	skb_queue_splice_tail(&req->cmd_q, &hdev->cmd_q);
+	spin_unlock_irqrestore(&hdev->cmd_q.lock, flags);
+
+	queue_work(hdev->workqueue, &hdev->cmd_work);
+
+	return 0;
+}
+
+/* Put @req into its initial state: no error, bound to @hdev, empty command
+ * queue.
+ */
+static void hci_request_init(struct hci_request *req, struct hci_dev *hdev)
+{
+	req->err = 0;
+	req->hdev = hdev;
+	skb_queue_head_init(&req->cmd_q);
+}
+
+/* Send a single HCI command and sleep until it completes, is canceled, or
+ * @timeout expires.
+ *
+ * This function requires the caller holds hdev->req_lock.
+ *
+ * Returns the response skb on success. Otherwise returns an ERR_PTR():
+ *   - the error from hci_req_sync_run() if the command could not be queued,
+ *   - -EINTR if the wait was interrupted,
+ *   - the translated HCI result or the cancel reason, when negative,
+ *   - -ETIMEDOUT if the request is still neither done nor canceled after the
+ *     wait,
+ *   - -ENODATA if the request finished without a response skb.
+ */
+struct sk_buff *__hci_cmd_sync_sk(struct hci_dev *hdev, u16 opcode, u32 plen,
+				  const void *param, u8 event, u32 timeout,
+				  struct sock *sk)
+{
+	struct hci_request req;
+	struct sk_buff *skb;
+	int err = 0;
+
+	bt_dev_dbg(hdev, "Opcode 0x%4.4x", opcode);
+
+	hci_request_init(&req, hdev);
+
+	hci_cmd_sync_add(&req, opcode, plen, param, event, sk);
+
+	/* Mark the request pending before it is handed to the cmd queue */
+	hdev->req_status = HCI_REQ_PEND;
+
+	err = hci_req_sync_run(&req);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	err = wait_event_interruptible_timeout(hdev->req_wait_q,
+					       hdev->req_status != HCI_REQ_PEND,
+					       timeout);
+
+	if (err == -ERESTARTSYS)
+		return ERR_PTR(-EINTR);
+
+	switch (hdev->req_status) {
+	case HCI_REQ_DONE:
+		err = -bt_to_errno(hdev->req_result);
+		break;
+
+	case HCI_REQ_CANCELED:
+		/* req_result holds the cancel reason; it is negated here */
+		err = -hdev->req_result;
+		break;
+
+	default:
+		err = -ETIMEDOUT;
+		break;
+	}
+
+	/* Reset the request state and take over the response skb, if any */
+	hdev->req_status = 0;
+	hdev->req_result = 0;
+	skb = hdev->req_rsp;
+	hdev->req_rsp = NULL;
+
+	bt_dev_dbg(hdev, "end: err %d", err);
+
+	if (err < 0) {
+		kfree_skb(skb);
+		return ERR_PTR(err);
+	}
+
+	/* If the command returns a status event, skb will be NULL as there
+	 * are no parameters.
+	 */
+	if (!skb)
+		return ERR_PTR(-ENODATA);
+
+	return skb;
+}
+EXPORT_SYMBOL(__hci_cmd_sync_sk);
+
+/* Convenience wrapper around __hci_cmd_sync_sk() with no specific event
+ * (event 0) and no associated socket.
+ *
+ * This function requires the caller holds hdev->req_lock.
+ */
+struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen,
+			       const void *param, u32 timeout)
+{
+	return __hci_cmd_sync_sk(hdev, opcode, plen, param, 0, timeout, NULL);
+}
+EXPORT_SYMBOL(__hci_cmd_sync);
+
+/* Send HCI command and wait for command complete event.
+ *
+ * Takes the request sync lock around __hci_cmd_sync(). Returns
+ * ERR_PTR(-ENETDOWN) without sending anything when HCI_UP is not set in
+ * hdev->flags; otherwise returns whatever __hci_cmd_sync() returns.
+ */
+struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen,
+			     const void *param, u32 timeout)
+{
+	struct sk_buff *skb;
+
+	if (!test_bit(HCI_UP, &hdev->flags))
+		return ERR_PTR(-ENETDOWN);
+
+	bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen);
+
+	hci_req_sync_lock(hdev);
+	skb = __hci_cmd_sync(hdev, opcode, plen, param, timeout);
+	hci_req_sync_unlock(hdev);
+
+	return skb;
+}
+EXPORT_SYMBOL(hci_cmd_sync);
+
+/* Convenience wrapper around __hci_cmd_sync_sk() that passes @event through
+ * and uses no associated socket.
+ *
+ * This function requires the caller holds hdev->req_lock.
+ */
+struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
+				  const void *param, u8 event, u32 timeout)
+{
+	return __hci_cmd_sync_sk(hdev, opcode, plen, param, event, timeout,
+				 NULL);
+}
+EXPORT_SYMBOL(__hci_cmd_sync_ev);
+
+/* Run a command through __hci_cmd_sync_sk() and reduce the outcome to a
+ * single integer.
+ *
+ * This function requires the caller holds hdev->req_lock.
+ *
+ * Returns the first byte of the response when one was received, 0 when the
+ * command finished without a response (-ENODATA), and the negative error
+ * otherwise. Failures are only logged when no specific @event was requested.
+ */
+int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen,
+			     const void *param, u8 event, u32 timeout,
+			     struct sock *sk)
+{
+	struct sk_buff *rsp;
+	long rc;
+
+	rsp = __hci_cmd_sync_sk(hdev, opcode, plen, param, event, timeout, sk);
+	if (!IS_ERR(rsp)) {
+		u8 status = rsp->data[0];
+
+		kfree_skb(rsp);
+		return status;
+	}
+
+	rc = PTR_ERR(rsp);
+
+	/* A command answered by a status event only shows up as -ENODATA */
+	if (rc == -ENODATA)
+		return 0;
+
+	if (!event)
+		bt_dev_err(hdev, "Opcode 0x%4.4x failed: %ld", opcode, rc);
+
+	return rc;
+}
+EXPORT_SYMBOL(__hci_cmd_sync_status_sk);
+
+/* Convenience wrapper around __hci_cmd_sync_status_sk() with no specific
+ * event (event 0) and no associated socket. It takes no lock itself.
+ */
+int __hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen,
+			  const void *param, u32 timeout)
+{
+	return __hci_cmd_sync_status_sk(hdev, opcode, plen, param, 0, timeout,
+					NULL);
+}
+EXPORT_SYMBOL(__hci_cmd_sync_status);
+
+/* Locked variant of __hci_cmd_sync_status(): takes the request sync lock,
+ * runs the command and returns its result. Unlike hci_cmd_sync(), HCI_UP is
+ * not checked here.
+ */
+int hci_cmd_sync_status(struct hci_dev *hdev, u16 opcode, u32 plen,
+			const void *param, u32 timeout)
+{
+	int err;
+
+	hci_req_sync_lock(hdev);
+	err = __hci_cmd_sync_status(hdev, opcode, plen, param, timeout);
+	hci_req_sync_unlock(hdev);
+
+	return err;
+}
+EXPORT_SYMBOL(hci_cmd_sync_status);
+
+/* Work handler for hdev->cmd_sync_work: drains hdev->cmd_sync_work_list.
+ *
+ * Each entry is unlinked under cmd_sync_work_lock and then, with that lock
+ * released, its func is run under the request sync lock. The entry's destroy
+ * callback (if any) receives func's return value. The entry itself is freed
+ * here.
+ */
+static void hci_cmd_sync_work(struct work_struct *work)
+{
+	struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_sync_work);
+
+	bt_dev_dbg(hdev, "");
+
+	/* Dequeue all entries and run them */
+	while (1) {
+		struct hci_cmd_sync_work_entry *entry;
+
+		mutex_lock(&hdev->cmd_sync_work_lock);
+		entry = list_first_entry_or_null(&hdev->cmd_sync_work_list,
+						 struct hci_cmd_sync_work_entry,
+						 list);
+		if (entry)
+			list_del(&entry->list);
+		mutex_unlock(&hdev->cmd_sync_work_lock);
+
+		/* List is empty: nothing left to run */
+		if (!entry)
+			break;
+
+		bt_dev_dbg(hdev, "entry %p", entry);
+
+		if (entry->func) {
+			int err;
+
+			hci_req_sync_lock(hdev);
+			err = entry->func(hdev, entry->data);
+			if (entry->destroy)
+				entry->destroy(hdev, entry->data, err);
+			hci_req_sync_unlock(hdev);
+		}
+
+		kfree(entry);
+	}
+}
+
+/* Work handler for hdev->cmd_sync_cancel_work (queued by
+ * hci_cmd_sync_cancel()): stops the command timers, resets cmd_cnt to 1 and
+ * wakes whoever sleeps on hdev->req_wait_q.
+ */
+static void hci_cmd_sync_cancel_work(struct work_struct *work)
+{
+	struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_sync_cancel_work);
+
+	cancel_delayed_work_sync(&hdev->cmd_timer);
+	cancel_delayed_work_sync(&hdev->ncmd_timer);
+	atomic_set(&hdev->cmd_cnt, 1);
+
+	wake_up_interruptible(&hdev->req_wait_q);
+}
+
+static int hci_scan_disable_sync(struct hci_dev *hdev);
+/* cmd_sync callback adapter for hci_scan_disable_sync(); @data is unused. */
+static int scan_disable_sync(struct hci_dev *hdev, void *data)
+{
+	return hci_scan_disable_sync(hdev);
+}
+
+/* cmd_sync callback that starts an inquiry of
+ * DISCOV_INTERLEAVED_INQUIRY_LEN via hci_inquiry_sync(); @data is unused.
+ */
+static int interleaved_inquiry_sync(struct hci_dev *hdev, void *data)
+{
+	return hci_inquiry_sync(hdev, DISCOV_INTERLEAVED_INQUIRY_LEN, 0);
+}
+
+/* Delayed-work handler for hdev->le_scan_disable.
+ *
+ * With the device lock held it queues scan_disable_sync() if HCI_LE_SCAN is
+ * set, then decides, based on the discovery type, whether discovery is over
+ * (state set to DISCOVERY_STOPPED), whether an interleaved inquiry has to be
+ * queued, or whether nothing more is to be done.
+ */
+static void le_scan_disable(struct work_struct *work)
+{
+	struct hci_dev *hdev = container_of(work, struct hci_dev,
+					    le_scan_disable.work);
+	int status;
+
+	bt_dev_dbg(hdev, "");
+	hci_dev_lock(hdev);
+
+	if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
+		goto _return;
+
+	status = hci_cmd_sync_queue(hdev, scan_disable_sync, NULL, NULL);
+	if (status) {
+		bt_dev_err(hdev, "failed to disable LE scan: %d", status);
+		goto _return;
+	}
+
+	/* If we were running LE only scan, change discovery state. If
+	 * we were running both LE and BR/EDR inquiry simultaneously,
+	 * and BR/EDR inquiry is already finished, stop discovery,
+	 * otherwise BR/EDR inquiry will stop discovery when finished.
+	 * If we will resolve remote device name, do not change
+	 * discovery state.
+	 */
+
+	if (hdev->discovery.type == DISCOV_TYPE_LE)
+		goto discov_stopped;
+
+	if (hdev->discovery.type != DISCOV_TYPE_INTERLEAVED)
+		goto _return;
+
+	if (hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY)) {
+		if (!test_bit(HCI_INQUIRY, &hdev->flags) &&
+		    hdev->discovery.state != DISCOVERY_RESOLVING)
+			goto discov_stopped;
+
+		goto _return;
+	}
+
+	/* Interleaved discovery without the simultaneous quirk: queue the
+	 * inquiry now that the LE scan is being disabled.
+	 */
+	status = hci_cmd_sync_queue(hdev, interleaved_inquiry_sync, NULL, NULL);
+	if (status) {
+		bt_dev_err(hdev, "inquiry failed: status %d", status);
+		goto discov_stopped;
+	}
+
+	goto _return;
+
+discov_stopped:
+	hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+
+_return:
+	hci_dev_unlock(hdev);
+}
+
+static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val,
+				       u8 filter_dup);
+
+/* cmd_sync callback that brings advertising back up; @data is unused.
+ *
+ * Nothing is done when HCI_ADVERTISING is clear and no advertising instance
+ * exists. If there is a current instance it is rescheduled and that result is
+ * returned. Otherwise instance 0x00 is restarted (extended advertising when
+ * the controller supports it, legacy otherwise); the results of those calls
+ * are not checked and 0 is returned.
+ */
+static int reenable_adv_sync(struct hci_dev *hdev, void *data)
+{
+	bool adv_enabled;
+
+	bt_dev_dbg(hdev, "");
+
+	adv_enabled = hci_dev_test_flag(hdev, HCI_ADVERTISING);
+	if (!adv_enabled && list_empty(&hdev->adv_instances))
+		return 0;
+
+	if (hdev->cur_adv_instance)
+		return hci_schedule_adv_instance_sync(hdev,
+						      hdev->cur_adv_instance,
+						      true);
+
+	if (ext_adv_capable(hdev)) {
+		hci_start_ext_adv_sync(hdev, 0x00);
+		return 0;
+	}
+
+	hci_update_adv_data_sync(hdev, 0x00);
+	hci_update_scan_rsp_data_sync(hdev, 0x00);
+	hci_enable_advertising_sync(hdev);
+
+	return 0;
+}
+
+/* Work handler for hdev->reenable_adv_work: queues reenable_adv_sync() with
+ * the device lock held and logs an error if queueing fails.
+ */
+static void reenable_adv(struct work_struct *work)
+{
+	struct hci_dev *hdev = container_of(work, struct hci_dev,
+					    reenable_adv_work);
+	int status;
+
+	bt_dev_dbg(hdev, "");
+
+	hci_dev_lock(hdev);
+
+	status = hci_cmd_sync_queue(hdev, reenable_adv_sync, NULL, NULL);
+	if (status)
+		bt_dev_err(hdev, "failed to reenable ADV: %d", status);
+
+	hci_dev_unlock(hdev);
+}
+
+/* Clear a pending advertising-instance timeout, if one is set, and cancel
+ * the adv_instance_expire delayed work that goes with it.
+ */
+static void cancel_adv_timeout(struct hci_dev *hdev)
+{
+	if (!hdev->adv_instance_timeout)
+		return;
+
+	hdev->adv_instance_timeout = 0;
+	cancel_delayed_work(&hdev->adv_instance_expire);
+}
+
+/* Remove or deactivate advertising instance(s) and, where applicable,
+ * schedule the next one.
+ *
+ * For a single instance:
+ * - force == true: The instance will be removed even when its remaining
+ *   lifetime is not zero.
+ * - force == false: the instance will be deactivated but kept stored unless
+ *   the remaining lifetime is zero.
+ *
+ * For instance == 0x00:
+ * - force == true: All instances will be removed regardless of their timeout
+ *   setting.
+ * - force == false: Only instances that have a timeout will be removed.
+ *
+ * mgmt_advertising_removed() is emitted (with @sk) for every instance that
+ * was actually removed. Returns 0, or the result of
+ * hci_schedule_adv_instance_sync() when a follow-up instance is scheduled.
+ */
+int hci_clear_adv_instance_sync(struct hci_dev *hdev, struct sock *sk,
+				u8 instance, bool force)
+{
+	struct adv_info *adv_instance, *n, *next_instance = NULL;
+	int err;
+	u8 rem_inst;
+
+	/* Cancel any timeout concerning the removed instance(s). */
+	if (!instance || hdev->cur_adv_instance == instance)
+		cancel_adv_timeout(hdev);
+
+	/* Get the next instance to advertise BEFORE we remove
+	 * the current one. This can be the same instance again
+	 * if there is only one instance.
+	 */
+	if (instance && hdev->cur_adv_instance == instance)
+		next_instance = hci_get_next_instance(hdev, instance);
+
+	if (instance == 0x00) {
+		list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances,
+					 list) {
+			/* Without force, instances with no timeout are kept */
+			if (!(force || adv_instance->timeout))
+				continue;
+
+			/* Copy the id first: the entry is gone after removal */
+			rem_inst = adv_instance->instance;
+			err = hci_remove_adv_instance(hdev, rem_inst);
+			if (!err)
+				mgmt_advertising_removed(sk, hdev, rem_inst);
+		}
+	} else {
+		adv_instance = hci_find_adv_instance(hdev, instance);
+
+		if (force || (adv_instance && adv_instance->timeout &&
+			      !adv_instance->remaining_time)) {
+			/* Don't advertise a removed instance. */
+			if (next_instance &&
+			    next_instance->instance == instance)
+				next_instance = NULL;
+
+			err = hci_remove_adv_instance(hdev, instance);
+			if (!err)
+				mgmt_advertising_removed(sk, hdev, instance);
+		}
+	}
+
+	/* No rescheduling while powered off or while HCI_ADVERTISING is set */
+	if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING))
+		return 0;
+
+	if (next_instance && !ext_adv_capable(hdev))
+		return hci_schedule_adv_instance_sync(hdev,
+						      next_instance->instance,
+						      false);
+
+	return 0;
+}
+
+/* cmd_sync callback queued by adv_timeout_expire().
+ *
+ * @data points to a heap-allocated u8 holding the expired instance; it is
+ * consumed (freed) here. The instance is cleared without force, and
+ * advertising is disabled if no instances remain afterwards.
+ */
+static int adv_timeout_expire_sync(struct hci_dev *hdev, void *data)
+{
+	u8 instance = *(u8 *)data;
+
+	kfree(data);
+
+	hci_clear_adv_instance_sync(hdev, NULL, instance, false);
+
+	if (list_empty(&hdev->adv_instances))
+		return hci_disable_advertising_sync(hdev);
+
+	return 0;
+}
+
+/* Delayed-work handler for hdev->adv_instance_expire.
+ *
+ * With the device lock held, clears the recorded instance timeout and, if an
+ * instance is currently being advertised, queues adv_timeout_expire_sync()
+ * for it. The instance number is passed in a small heap buffer that the
+ * callback frees once it runs.
+ */
+static void adv_timeout_expire(struct work_struct *work)
+{
+	u8 *inst_ptr;
+	struct hci_dev *hdev = container_of(work, struct hci_dev,
+					    adv_instance_expire.work);
+
+	bt_dev_dbg(hdev, "");
+
+	hci_dev_lock(hdev);
+
+	hdev->adv_instance_timeout = 0;
+
+	if (hdev->cur_adv_instance == 0x00)
+		goto unlock;
+
+	inst_ptr = kmalloc(1, GFP_KERNEL);
+	if (!inst_ptr)
+		goto unlock;
+
+	*inst_ptr = hdev->cur_adv_instance;
+
+	/* If queueing fails the callback never runs, so nobody else would
+	 * free the buffer: release it here instead of leaking it.
+	 */
+	if (hci_cmd_sync_queue(hdev, adv_timeout_expire_sync, inst_ptr, NULL))
+		kfree(inst_ptr);
+
+unlock:
+	hci_dev_unlock(hdev);
+}
+
+/* True while hdev->interleave_scan_state is anything other than
+ * INTERLEAVE_SCAN_NONE.
+ */
+static bool is_interleave_scanning(struct hci_dev *hdev)
+{
+	if (hdev->interleave_scan_state == INTERLEAVE_SCAN_NONE)
+		return false;
+
+	return true;
+}
+
+static int hci_passive_scan_sync(struct hci_dev *hdev);
+
+/* Delayed-work handler for hdev->interleave_scan.
+ *
+ * Picks the duration that belongs to the current interleave state, runs
+ * hci_passive_scan_sync(), flips the state between ALLOWLIST and NO_FILTER
+ * under the device lock, and re-arms itself with that duration unless
+ * interleaving has been switched off in the meantime.
+ */
+static void interleave_scan_work(struct work_struct *work)
+{
+	struct hci_dev *hdev = container_of(work, struct hci_dev,
+					    interleave_scan.work);
+	unsigned long timeout;
+
+	/* The timeout is taken from the state as it is on entry, i.e. before
+	 * the state is flipped further down.
+	 */
+	if (hdev->interleave_scan_state == INTERLEAVE_SCAN_ALLOWLIST) {
+		timeout = msecs_to_jiffies(hdev->advmon_allowlist_duration);
+	} else if (hdev->interleave_scan_state == INTERLEAVE_SCAN_NO_FILTER) {
+		timeout = msecs_to_jiffies(hdev->advmon_no_filter_duration);
+	} else {
+		bt_dev_err(hdev, "unexpected error");
+		return;
+	}
+
+	hci_passive_scan_sync(hdev);
+
+	hci_dev_lock(hdev);
+
+	/* NOTE(review): each debug message names the state being left rather
+	 * than the one assigned on the following line - confirm the wording
+	 * is intentional.
+	 */
+	switch (hdev->interleave_scan_state) {
+	case INTERLEAVE_SCAN_ALLOWLIST:
+		bt_dev_dbg(hdev, "next state: allowlist");
+		hdev->interleave_scan_state = INTERLEAVE_SCAN_NO_FILTER;
+		break;
+	case INTERLEAVE_SCAN_NO_FILTER:
+		bt_dev_dbg(hdev, "next state: no filter");
+		hdev->interleave_scan_state = INTERLEAVE_SCAN_ALLOWLIST;
+		break;
+	case INTERLEAVE_SCAN_NONE:
+		bt_dev_err(hdev, "unexpected error");
+	}
+
+	hci_dev_unlock(hdev);
+
+	/* Don't continue interleaving if it was canceled */
+	/* NOTE(review): the state is re-read here after the device lock has
+	 * been dropped - confirm that is acceptable.
+	 */
+	if (is_interleave_scanning(hdev))
+		queue_delayed_work(hdev->req_workqueue,
+				   &hdev->interleave_scan, timeout);
+}
+
+/* Initialise the cmd_sync machinery of @hdev: the work list with its lock,
+ * the unregister lock, and the work items defined in this file together with
+ * their handlers.
+ */
+void hci_cmd_sync_init(struct hci_dev *hdev)
+{
+	INIT_WORK(&hdev->cmd_sync_work, hci_cmd_sync_work);
+	INIT_LIST_HEAD(&hdev->cmd_sync_work_list);
+	mutex_init(&hdev->cmd_sync_work_lock);
+	mutex_init(&hdev->unregister_lock);
+
+	INIT_WORK(&hdev->cmd_sync_cancel_work, hci_cmd_sync_cancel_work);
+	INIT_WORK(&hdev->reenable_adv_work, reenable_adv);
+	INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable);
+	INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire);
+	INIT_DELAYED_WORK(&hdev->interleave_scan, interleave_scan_work);
+}
+
+/* Drop a queued entry without running it: its destroy callback (if any) is
+ * called with @err, then the entry is unlinked from its list and freed.
+ * The only caller in view, hci_cmd_sync_clear(), holds cmd_sync_work_lock.
+ */
+static void _hci_cmd_sync_cancel_entry(struct hci_dev *hdev,
+				       struct hci_cmd_sync_work_entry *entry,
+				       int err)
+{
+	if (entry->destroy)
+		entry->destroy(hdev, entry->data, err);
+
+	list_del(&entry->list);
+	kfree(entry);
+}
+
+/* Stop cmd_sync processing for @hdev: cancels cmd_sync_work and
+ * reenable_adv_work, then cancels every entry still on the work list with
+ * -ECANCELED.
+ */
+void hci_cmd_sync_clear(struct hci_dev *hdev)
+{
+	struct hci_cmd_sync_work_entry *entry, *tmp;
+
+	cancel_work_sync(&hdev->cmd_sync_work);
+	cancel_work_sync(&hdev->reenable_adv_work);
+
+	mutex_lock(&hdev->cmd_sync_work_lock);
+	/* _safe variant: entries are unlinked and freed while iterating */
+	list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list)
+		_hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED);
+	mutex_unlock(&hdev->cmd_sync_work_lock);
+}
+
+/* Cancel ongoing command request asynchronously:
+ *
+ * - Set result and mark status to HCI_REQ_CANCELED
+ * - Queue cmd_sync_cancel_work on hdev->workqueue to finish the job
+ *
+ * @err may be a positive or a negative errno; it is stored as a positive
+ * value, the same way hci_cmd_sync_cancel_sync() does. Nothing happens
+ * unless a request is pending (HCI_REQ_PEND).
+ */
+void hci_cmd_sync_cancel(struct hci_dev *hdev, int err)
+{
+	bt_dev_dbg(hdev, "err 0x%2.2x", err);
+
+	if (hdev->req_status != HCI_REQ_PEND)
+		return;
+
+	/* req_result is __u32 so error must be positive to be properly
+	 * propagated.
+	 */
+	hdev->req_result = err < 0 ? -err : err;
+	hdev->req_status = HCI_REQ_CANCELED;
+
+	queue_work(hdev->workqueue, &hdev->cmd_sync_cancel_work);
+}
+EXPORT_SYMBOL(hci_cmd_sync_cancel);
+
+/* Cancel ongoing command request synchronously:
+ *
+ * - Store the result and switch the status to HCI_REQ_CANCELED
+ * - Wake up whoever sleeps on hdev->req_wait_q
+ *
+ * Does nothing unless a request is currently pending.
+ */
+void hci_cmd_sync_cancel_sync(struct hci_dev *hdev, int err)
+{
+	bt_dev_dbg(hdev, "err 0x%2.2x", err);
+
+	if (hdev->req_status != HCI_REQ_PEND)
+		return;
+
+	/* req_result is __u32, so only a positive error value is
+	 * propagated properly.
+	 */
+	if (err < 0)
+		hdev->req_result = -err;
+	else
+		hdev->req_result = err;
+
+	hdev->req_status = HCI_REQ_CANCELED;
+
+	wake_up_interruptible(&hdev->req_wait_q);
+}
+EXPORT_SYMBOL(hci_cmd_sync_cancel_sync);
+
+/* Submit HCI command to be run as part of cmd_sync_work:
+ *
+ * - hdev must _not_ be unregistered
+ *
+ * Returns 0 once the entry is queued, -ENODEV if HCI_UNREGISTER is set and
+ * -ENOMEM if the entry cannot be allocated.
+ */
+int hci_cmd_sync_submit(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+	struct hci_cmd_sync_work_entry *work;
+	int ret;
+
+	/* Held across the flag test and the queueing */
+	mutex_lock(&hdev->unregister_lock);
+
+	if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	work = kmalloc(sizeof(*work), GFP_KERNEL);
+	if (!work) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	work->func = func;
+	work->data = data;
+	work->destroy = destroy;
+
+	mutex_lock(&hdev->cmd_sync_work_lock);
+	list_add_tail(&work->list, &hdev->cmd_sync_work_list);
+	mutex_unlock(&hdev->cmd_sync_work_lock);
+
+	queue_work(hdev->req_workqueue, &hdev->cmd_sync_work);
+	ret = 0;
+
+out:
+	mutex_unlock(&hdev->unregister_lock);
+	return ret;
+}
+EXPORT_SYMBOL(hci_cmd_sync_submit);
+
+/* Queue HCI command:
+ *
+ * - hdev must be running, otherwise -ENETDOWN is returned
+ */
+int hci_cmd_sync_queue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+		       void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+	/* HCI_RUNNING means hdev had been opened and is either on init
+	 * phase or is already up; only then is the command submitted.
+	 */
+	if (test_bit(HCI_RUNNING, &hdev->flags))
+		return hci_cmd_sync_submit(hdev, func, data, destroy);
+
+	return -ENETDOWN;
+}
+EXPORT_SYMBOL(hci_cmd_sync_queue);
+
+/* Walk the queued entries and return the first one matching every criterion
+ * that was supplied; a NULL @func, @data or @destroy acts as a wildcard.
+ * Returns NULL when nothing matches. Callers in this file hold
+ * hdev->cmd_sync_work_lock.
+ */
+static struct hci_cmd_sync_work_entry *
+_hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			   void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+	struct hci_cmd_sync_work_entry *cur, *next;
+
+	list_for_each_entry_safe(cur, next, &hdev->cmd_sync_work_list, list) {
+		bool func_ok = !func || cur->func == func;
+		bool data_ok = !data || cur->data == data;
+		bool destroy_ok = !destroy || cur->destroy == destroy;
+
+		if (func_ok && data_ok && destroy_ok)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/* Queue HCI command entry once:
+ *
+ * - Look up whether a matching entry already exists; only if none does,
+ *   create a new entry and queue it.
+ * - Returns 0 when a matching entry is already queued.
+ */
+int hci_cmd_sync_queue_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			    void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+	struct hci_cmd_sync_work_entry *existing;
+
+	existing = hci_cmd_sync_lookup_entry(hdev, func, data, destroy);
+	if (!existing)
+		return hci_cmd_sync_queue(hdev, func, data, destroy);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_cmd_sync_queue_once);
+
+/* Run HCI command:
+ *
+ * - hdev must be running, otherwise -ENETDOWN is returned
+ * - when called from cmd_sync_work itself, @func is invoked right away and
+ *   its result returned; otherwise the command is submitted to the queue
+ */
+int hci_cmd_sync_run(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+		     void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+	/* HCI_RUNNING means hdev had been opened and is either on init
+	 * phase or is already up.
+	 */
+	if (!test_bit(HCI_RUNNING, &hdev->flags))
+		return -ENETDOWN;
+
+	/* Not on cmd_sync_work: hand the command over to the queue */
+	if (current_work() != &hdev->cmd_sync_work)
+		return hci_cmd_sync_submit(hdev, func, data, destroy);
+
+	/* Already on cmd_sync_work: run immediately */
+	return func(hdev, data);
+}
+EXPORT_SYMBOL(hci_cmd_sync_run);
+
+/* Run HCI command entry once:
+ *
+ * - Look up whether a matching entry already exists; only if none does,
+ *   create a new entry and run it.
+ * - If on cmd_sync_work then run immediately, otherwise queue.
+ * - Returns 0 when a matching entry is already queued.
+ */
+int hci_cmd_sync_run_once(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			  void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+	struct hci_cmd_sync_work_entry *existing;
+
+	existing = hci_cmd_sync_lookup_entry(hdev, func, data, destroy);
+	if (!existing)
+		return hci_cmd_sync_run(hdev, func, data, destroy);
+
+	return 0;
+}
+EXPORT_SYMBOL(hci_cmd_sync_run_once);
+
+/* Lookup HCI command entry:
+ *
+ * - Return the first queued entry matching the given function callback,
+ *   data and destroy callback (NULL arguments are not compared), or NULL.
+ * - The list is searched under cmd_sync_work_lock; the lock is released
+ *   again before the entry is returned.
+ */
+struct hci_cmd_sync_work_entry *
+hci_cmd_sync_lookup_entry(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			  void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+	struct hci_cmd_sync_work_entry *found;
+
+	mutex_lock(&hdev->cmd_sync_work_lock);
+	found = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy);
+	mutex_unlock(&hdev->cmd_sync_work_lock);
+
+	return found;
+}
+EXPORT_SYMBOL(hci_cmd_sync_lookup_entry);
+
+/* Cancel HCI command entry:
+ *
+ * - Takes cmd_sync_work_lock, calls the entry's destroy callback (if any)
+ *   with -ECANCELED, then unlinks and frees @entry.
+ */
+void hci_cmd_sync_cancel_entry(struct hci_dev *hdev,
+			       struct hci_cmd_sync_work_entry *entry)
+{
+	mutex_lock(&hdev->cmd_sync_work_lock);
+
+	_hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED);
+
+	mutex_unlock(&hdev->cmd_sync_work_lock);
+}
+EXPORT_SYMBOL(hci_cmd_sync_cancel_entry);
+
+/* Dequeue one HCI command entry:
+ *
+ * - Look up and cancel the first entry that matches.
+ * - Returns true if an entry was cancelled, false if none matched.
+ */
+bool hci_cmd_sync_dequeue_once(struct hci_dev *hdev,
+			       hci_cmd_sync_work_func_t func,
+			       void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+	struct hci_cmd_sync_work_entry *match;
+	bool found;
+
+	mutex_lock(&hdev->cmd_sync_work_lock);
+
+	match = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy);
+	found = match != NULL;
+	if (found)
+		_hci_cmd_sync_cancel_entry(hdev, match, -ECANCELED);
+
+	mutex_unlock(&hdev->cmd_sync_work_lock);
+
+	return found;
+}
+EXPORT_SYMBOL(hci_cmd_sync_dequeue_once);
+
+/* Dequeue HCI command entry:
+ *
+ * - Look up and cancel every entry that matches the given function
+ *   callback, data and destroy callback.
+ * - Returns true if at least one entry was cancelled.
+ */
+bool hci_cmd_sync_dequeue(struct hci_dev *hdev, hci_cmd_sync_work_func_t func,
+			  void *data, hci_cmd_sync_work_destroy_t destroy)
+{
+	struct hci_cmd_sync_work_entry *match;
+	bool removed = false;
+
+	mutex_lock(&hdev->cmd_sync_work_lock);
+
+	/* Restart the lookup after each removal until nothing matches */
+	for (;;) {
+		match = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy);
+		if (!match)
+			break;
+
+		_hci_cmd_sync_cancel_entry(hdev, match, -ECANCELED);
+		removed = true;
+	}
+
+	mutex_unlock(&hdev->cmd_sync_work_lock);
+
+	return removed;
+}
+EXPORT_SYMBOL(hci_cmd_sync_dequeue);
+
+/* Regenerate the EIR data and send HCI_OP_WRITE_EIR when it differs from the
+ * copy cached in hdev->eir. Returns 0 when there is nothing to write,
+ * otherwise the result of the command.
+ */
+int hci_update_eir_sync(struct hci_dev *hdev)
+{
+	struct hci_cp_write_eir cp;
+
+	bt_dev_dbg(hdev, "");
+
+	/* Preconditions: powered, extended inquiry capable, SSP enabled */
+	if (!hdev_is_powered(hdev) || !lmp_ext_inq_capable(hdev) ||
+	    !hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
+		return 0;
+
+	/* Skip the update while HCI_SERVICE_CACHE is set */
+	if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE))
+		return 0;
+
+	memset(&cp, 0, sizeof(cp));
+	eir_create(hdev, cp.data);
+
+	/* Unchanged data does not need to be written again */
+	if (!memcmp(cp.data, hdev->eir, sizeof(cp.data)))
+		return 0;
+
+	memcpy(hdev->eir, cp.data, sizeof(cp.data));
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_EIR, sizeof(cp), &cp,
+				     HCI_CMD_TIMEOUT);
+}
+
+/* OR together the svc_hint of every UUID on hdev->uuids. */
+static u8 get_service_classes(struct hci_dev *hdev)
+{
+	struct bt_uuid *cur;
+	u8 hints = 0;
+
+	list_for_each_entry(cur, &hdev->uuids, list) {
+		hints |= cur->svc_hint;
+	}
+
+	return hints;
+}
+
+/* Build the 3-byte class of device (minor, major, service classes) and send
+ * HCI_OP_WRITE_CLASS_OF_DEV if it differs from hdev->dev_class. Returns 0
+ * when no write is needed, otherwise the result of the command.
+ */
+int hci_update_class_sync(struct hci_dev *hdev)
+{
+	u8 cod[3];
+
+	bt_dev_dbg(hdev, "");
+
+	/* Preconditions: powered and BR/EDR enabled */
+	if (!hdev_is_powered(hdev) ||
+	    !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+		return 0;
+
+	/* Skip the update while HCI_SERVICE_CACHE is set */
+	if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE))
+		return 0;
+
+	cod[0] = hdev->minor_class;
+	cod[1] = hdev->major_class;
+	cod[2] = get_service_classes(hdev);
+
+	/* Extra major-class bit while limited discoverable */
+	if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE))
+		cod[1] |= 0x20;
+
+	if (!memcmp(cod, hdev->dev_class, 3))
+		return 0;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_CLASS_OF_DEV,
+				     sizeof(cod), cod, HCI_CMD_TIMEOUT);
+}
+
+/* Decide whether advertising may be started given the current LE links.
+ *
+ * With no LE connection advertising is always allowed. Otherwise the
+ * controller's LE supported states (hdev->le_states, one bit per state
+ * combination, bit N living in byte N / 8) are consulted for every role in
+ * which a connection currently exists.
+ *
+ * @connectable: true for connectable advertising, false for
+ *               non-connectable advertising.
+ *
+ * Returns true if advertising is allowed, false otherwise.
+ */
+static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable)
+{
+	/* If there is no connection we are OK to advertise. */
+	if (hci_conn_num(hdev, LE_LINK) == 0)
+		return true;
+
+	/* Check le_states if there is any connection in peripheral role. */
+	if (hdev->conn_hash.le_num_peripheral > 0) {
+		/* Peripheral connection state and non connectable mode
+		 * bit 20.
+		 */
+		if (!connectable && !(hdev->le_states[2] & 0x10))
+			return false;
+
+		/* Peripheral connection state and connectable mode bit 38
+		 * and scannable bit 21.
+		 */
+		if (connectable && (!(hdev->le_states[4] & 0x40) ||
+				    !(hdev->le_states[2] & 0x20)))
+			return false;
+	}
+
+	/* Check le_states if there is any connection in central role. */
+	if (hci_conn_num(hdev, LE_LINK) != hdev->conn_hash.le_num_peripheral) {
+		/* Central connection state and non connectable mode bit 18,
+		 * i.e. mask 0x04 of byte 2 (0x02 would be bit 17).
+		 */
+		if (!connectable && !(hdev->le_states[2] & 0x04))
+			return false;
+
+		/* Central connection state and connectable mode bit 35 and
+		 * scannable 19.
+		 */
+		if (connectable && (!(hdev->le_states[4] & 0x08) ||
+				    !(hdev->le_states[2] & 0x08)))
+			return false;
+	}
+
+	return true;
+}
+
+/* Tell whether advertising with the given instance @flags should use a
+ * resolvable private address (RPA).
+ */
+static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags)
+{
+	/* Without privacy there is no RPA */
+	if (!hci_dev_test_flag(hdev, HCI_PRIVACY))
+		return false;
+
+	/* Basic privacy mode (not limited): always use RPA */
+	if (!hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
+		return true;
+
+	/* Limited privacy mode: use RPA unless we're both discoverable
+	 * and bondable.
+	 */
+	return !((flags & MGMT_ADV_FLAG_DISCOV) &&
+		 hci_dev_test_flag(hdev, HCI_BONDABLE));
+}
+
+/* Write @rpa as the controller's random address with
+ * HCI_OP_LE_SET_RANDOM_ADDR, unless the change has to be deferred (in which
+ * case HCI_RPA_EXPIRED is set and 0 is returned).
+ */
+static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa)
+{
+	/* Once a random address has been set, it must not be changed while
+	 * we're advertising or initiating an LE connection: the eventual
+	 * initiator address used for the subsequently created connection
+	 * would be undefined (some controllers use the new address and
+	 * others the one we had when the operation started).
+	 *
+	 * In that scenario skip the update and let the random address be
+	 * updated at the next cycle.
+	 */
+	if (bacmp(&hdev->random_addr, BDADDR_ANY)) {
+		if (hci_dev_test_flag(hdev, HCI_LE_ADV) ||
+		    hci_lookup_le_connect(hdev)) {
+			bt_dev_dbg(hdev, "Deferring random address update");
+			hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+			return 0;
+		}
+	}
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_RANDOM_ADDR,
+				     6, rpa, HCI_CMD_TIMEOUT);
+}
+
+/* Choose the own address type to advertise/scan with and, where a random
+ * address is required, program it through hci_set_random_addr_sync().
+ *
+ * @require_privacy: when true and @rpa is false, a freshly generated
+ *                   non-resolvable private address is used.
+ * @rpa:             when true, a resolvable private address is used.
+ * @own_addr_type:   output, set to the selected ADDR_LE_DEV_* type.
+ *
+ * Returns 0 on success or the error from RPA generation / the HCI command.
+ */
+int hci_update_random_address_sync(struct hci_dev *hdev, bool require_privacy,
+ bool rpa, u8 *own_addr_type)
+{
+ int err;
+
+ /* If privacy is enabled use a resolvable private address. If
+ * current RPA has expired or there is something else than
+ * the current RPA in use, then generate a new one.
+ */
+ if (rpa) {
+ /* If Controller supports LL Privacy use own address type is
+ * 0x03
+ */
+ if (ll_privacy_capable(hdev))
+ *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED;
+ else
+ *own_addr_type = ADDR_LE_DEV_RANDOM;
+
+ /* Check if RPA is valid */
+ if (rpa_valid(hdev))
+ return 0;
+
+ /* Generate a new RPA from the local IRK into hdev->rpa */
+ err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa);
+ if (err < 0) {
+ bt_dev_err(hdev, "failed to generate new RPA");
+ return err;
+ }
+
+ err = hci_set_random_addr_sync(hdev, &hdev->rpa);
+ if (err)
+ return err;
+
+ return 0;
+ }
+
+ /* In case of required privacy without resolvable private address,
+ * use a non-resolvable private address. This is useful for active
+ * scanning and non-connectable advertising.
+ */
+ if (require_privacy) {
+ bdaddr_t nrpa;
+
+ while (true) {
+ /* The non-resolvable private address is generated
+ * from random six bytes with the two most significant
+ * bits cleared.
+ */
+ get_random_bytes(&nrpa, 6);
+ nrpa.b[5] &= 0x3f;
+
+ /* The non-resolvable private address shall not be
+ * equal to the public address.
+ */
+ if (bacmp(&hdev->bdaddr, &nrpa))
+ break;
+ }
+
+ *own_addr_type = ADDR_LE_DEV_RANDOM;
+
+ return hci_set_random_addr_sync(hdev, &nrpa);
+ }
+
+ /* If forcing static address is in use or there is no public
+ * address use the static address as random address (but skip
+ * the HCI command if the current random address is already the
+ * static one).
+ *
+ * In case BR/EDR has been disabled on a dual-mode controller
+ * and a static address has been configured, then use that
+ * address instead of the public BR/EDR address.
+ */
+ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
+ !bacmp(&hdev->bdaddr, BDADDR_ANY) ||
+ (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) &&
+ bacmp(&hdev->static_addr, BDADDR_ANY))) {
+ *own_addr_type = ADDR_LE_DEV_RANDOM;
+ if (bacmp(&hdev->static_addr, &hdev->random_addr))
+ return hci_set_random_addr_sync(hdev,
+ &hdev->static_addr);
+ return 0;
+ }
+
+ /* Neither privacy nor static address is being used so use a
+ * public address.
+ */
+ *own_addr_type = ADDR_LE_DEV_PUBLIC;
+
+ return 0;
+}
+
+/* Send HCI_OP_LE_SET_EXT_ADV_ENABLE with enable = 0x00 for @instance.
+ *
+ * Instance 0x00 sends the command with zero sets, which disables all
+ * advertising instances. Returns -EINVAL if a non-zero @instance does not
+ * exist and 0 without sending anything if it is not enabled.
+ */
+static int hci_disable_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance)
+{
+	struct hci_cp_le_set_ext_adv_enable *cp;
+	struct hci_cp_ext_adv_set *set;
+	u8 data[sizeof(*cp) + sizeof(*set) * 1];
+	struct adv_info *adv = NULL;
+	u8 size;
+
+	if (instance > 0) {
+		adv = hci_find_adv_instance(hdev, instance);
+
+		/* If request specifies an instance that doesn't exist, fail */
+		if (!adv)
+			return -EINVAL;
+
+		/* If not enabled there is nothing to do */
+		if (!adv->enabled)
+			return 0;
+	}
+
+	memset(data, 0, sizeof(data));
+	cp = (void *)data;
+	set = (void *)cp->data;
+
+	cp->enable = 0x00;
+
+	/* Instance 0x00 indicates all advertising instances will be disabled */
+	cp->num_of_sets = instance ? 0x01 : 0x00;
+
+	set->handle = adv ? adv->handle : instance;
+
+	/* Only send as many set entries as num_of_sets announces */
+	size = sizeof(*cp) + sizeof(*set) * cp->num_of_sets;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE,
+				     size, data, HCI_CMD_TIMEOUT);
+}
+
+/* Set @random_addr as the random address of advertising set @instance via
+ * HCI_OP_LE_SET_ADV_SET_RAND_ADDR. For instance 0x00 the controller-wide
+ * random address is written first.
+ */
+static int hci_set_adv_set_random_addr_sync(struct hci_dev *hdev, u8 instance,
+					    bdaddr_t *random_addr)
+{
+	struct hci_cp_le_set_adv_set_rand_addr cmd;
+
+	if (!instance) {
+		/* Instance 0x00 doesn't have an adv_info, instead it uses
+		 * hdev->random_addr to track its address so whenever it needs
+		 * to be updated this also sets the random address since
+		 * hdev->random_addr is shared with the scan state machine.
+		 */
+		int err = hci_set_random_addr_sync(hdev, random_addr);
+
+		if (err)
+			return err;
+	}
+
+	memset(&cmd, 0, sizeof(cmd));
+	bacpy(&cmd.bdaddr, random_addr);
+	cmd.handle = instance;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_SET_RAND_ADDR,
+				     sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+}
+
+/* Send HCI_OP_LE_SET_EXT_ADV_PARAMS with @cp and copy the reply into @rp.
+ *
+ * On a successful reply the own address type is recorded in hdev and the
+ * reported tx power is stored in hdev (handle 0) or in @adv (if non-NULL).
+ *
+ * Returns 0 on success or when only a status event came back (@rp is left
+ * untouched then), a negative errno on transport/length errors, or the
+ * status carried in the reply.
+ */
+static int
+hci_set_ext_adv_params_sync(struct hci_dev *hdev, struct adv_info *adv,
+			    const struct hci_cp_le_set_ext_adv_params *cp,
+			    struct hci_rp_le_set_ext_adv_params *rp)
+{
+	struct sk_buff *reply;
+
+	reply = __hci_cmd_sync(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(*cp),
+			       cp, HCI_CMD_TIMEOUT);
+
+	/* If command return a status event, skb will be set to -ENODATA */
+	if (reply == ERR_PTR(-ENODATA))
+		return 0;
+
+	if (IS_ERR(reply)) {
+		bt_dev_err(hdev, "Opcode 0x%4.4x failed: %ld",
+			   HCI_OP_LE_SET_EXT_ADV_PARAMS, PTR_ERR(reply));
+		return PTR_ERR(reply);
+	}
+
+	if (reply->len != sizeof(*rp)) {
+		bt_dev_err(hdev, "Invalid response length for 0x%4.4x: %u",
+			   HCI_OP_LE_SET_EXT_ADV_PARAMS, reply->len);
+		kfree_skb(reply);
+		return -EIO;
+	}
+
+	memcpy(rp, reply->data, sizeof(*rp));
+	kfree_skb(reply);
+
+	/* A non-zero status leaves the cached state alone */
+	if (rp->status)
+		return rp->status;
+
+	hdev->adv_addr_type = cp->own_addr_type;
+
+	if (!cp->handle)
+		hdev->adv_tx_power = rp->tx_power; /* instance 0 lives in hdev */
+	else if (adv)
+		adv->tx_power = rp->tx_power;
+
+	return rp->status;
+}
+
+/* Build the advertising data of @instance and send it with
+ * HCI_OP_LE_SET_EXT_ADV_DATA.
+ *
+ * For a non-zero instance nothing is sent when the instance does not exist
+ * or its adv_data_changed flag is clear. After a successful command the
+ * flag is cleared, or for instance 0 the data is cached in hdev.
+ */
+static int hci_set_ext_adv_data_sync(struct hci_dev *hdev, u8 instance)
+{
+	DEFINE_FLEX(struct hci_cp_le_set_ext_adv_data, pdu, data, length,
+		    HCI_MAX_EXT_AD_LENGTH);
+	struct adv_info *adv = NULL;
+	u8 len;
+	int err;
+
+	if (instance) {
+		adv = hci_find_adv_instance(hdev, instance);
+		if (!adv)
+			return 0;
+		if (!adv->adv_data_changed)
+			return 0;
+	}
+
+	len = eir_create_adv_data(hdev, instance, pdu->data,
+				  HCI_MAX_EXT_AD_LENGTH);
+
+	pdu->handle = adv ? adv->handle : instance;
+	pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE;
+	pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG;
+	pdu->length = len;
+
+	err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_DATA,
+				    struct_size(pdu, data, len), pdu,
+				    HCI_CMD_TIMEOUT);
+	if (err)
+		return err;
+
+	/* Update data if the command succeeded */
+	if (!adv) {
+		memcpy(hdev->adv_data, pdu->data, len);
+		hdev->adv_data_len = len;
+		return 0;
+	}
+
+	adv->adv_data_changed = false;
+
+	return 0;
+}
+
+/* Build the legacy advertising data of @instance and send it with
+ * HCI_OP_LE_SET_ADV_DATA, unless it equals what is cached in hdev.
+ */
+static int hci_set_adv_data_sync(struct hci_dev *hdev, u8 instance)
+{
+	struct hci_cp_le_set_adv_data cp;
+	bool unchanged;
+	u8 len;
+
+	memset(&cp, 0, sizeof(cp));
+
+	len = eir_create_adv_data(hdev, instance, cp.data, sizeof(cp.data));
+	cp.length = len;
+
+	/* There's nothing to do if the data hasn't changed */
+	unchanged = hdev->adv_data_len == len &&
+		    !memcmp(cp.data, hdev->adv_data, len);
+	if (unchanged)
+		return 0;
+
+	/* Cache the new data before sending it */
+	memcpy(hdev->adv_data, cp.data, sizeof(cp.data));
+	hdev->adv_data_len = len;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_DATA,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+/* Update the advertising data of @instance, using the extended command when
+ * the controller is ext-adv capable and the legacy one otherwise. Returns 0
+ * without doing anything when LE is not enabled.
+ */
+int hci_update_adv_data_sync(struct hci_dev *hdev, u8 instance)
+{
+	if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+		return 0;
+
+	return ext_adv_capable(hdev) ?
+		hci_set_ext_adv_data_sync(hdev, instance) :
+		hci_set_adv_data_sync(hdev, instance);
+}
+
+/* Configure extended advertising set @instance (0 = the default set).
+ *
+ * In order: disables the instance if it exists, checks that advertising is
+ * allowed, picks the own address, sends the extended advertising
+ * parameters, refreshes the advertising data and finally programs the
+ * set's random address when a new one is needed.
+ *
+ * Returns 0 on success, -EINVAL if a non-zero @instance does not exist,
+ * -EPERM if advertising is currently not allowed, or the error of the
+ * failing step.
+ */
+int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance)
+{
+ struct hci_cp_le_set_ext_adv_params cp;
+ struct hci_rp_le_set_ext_adv_params rp;
+ bool connectable, require_privacy;
+ u32 flags;
+ bdaddr_t random_addr;
+ u8 own_addr_type;
+ int err;
+ struct adv_info *adv;
+ bool secondary_adv;
+
+ if (instance > 0) {
+ adv = hci_find_adv_instance(hdev, instance);
+ if (!adv)
+ return -EINVAL;
+ } else {
+ adv = NULL;
+ }
+
+ /* Updating parameters of an active instance will return a
+ * Command Disallowed error, so we must first disable the
+ * instance if it is active.
+ */
+ if (adv) {
+ err = hci_disable_ext_adv_instance_sync(hdev, instance);
+ if (err)
+ return err;
+ }
+
+ flags = hci_adv_instance_flags(hdev, instance);
+
+ /* If the "connectable" instance flag was not set, then choose between
+ * ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
+ */
+ connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
+ mgmt_get_connectable(hdev);
+
+ if (!is_advertising_allowed(hdev, connectable))
+ return -EPERM;
+
+ /* Set require_privacy to true only when non-connectable
+ * advertising is used and it is not periodic.
+ * In that case it is fine to use a non-resolvable private address.
+ */
+ require_privacy = !connectable && !(adv && adv->periodic);
+
+ err = hci_get_random_address(hdev, require_privacy,
+ adv_use_rpa(hdev, flags), adv,
+ &own_addr_type, &random_addr);
+ if (err < 0)
+ return err;
+
+ memset(&cp, 0, sizeof(cp));
+
+ /* Per-instance settings when available, hdev defaults otherwise */
+ if (adv) {
+ hci_cpu_to_le24(adv->min_interval, cp.min_interval);
+ hci_cpu_to_le24(adv->max_interval, cp.max_interval);
+ cp.tx_power = adv->tx_power;
+ cp.sid = adv->sid;
+ } else {
+ hci_cpu_to_le24(hdev->le_adv_min_interval, cp.min_interval);
+ hci_cpu_to_le24(hdev->le_adv_max_interval, cp.max_interval);
+ cp.tx_power = HCI_ADV_TX_POWER_NO_PREFERENCE;
+ cp.sid = 0x00;
+ }
+
+ secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK);
+
+ /* Event properties: extended PDUs when a secondary PHY flag is set,
+ * legacy PDUs otherwise.
+ */
+ if (connectable) {
+ if (secondary_adv)
+ cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND);
+ else
+ cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND);
+ } else if (hci_adv_instance_is_scannable(hdev, instance) ||
+ (flags & MGMT_ADV_PARAM_SCAN_RSP)) {
+ if (secondary_adv)
+ cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND);
+ else
+ cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND);
+ } else {
+ if (secondary_adv)
+ cp.evt_properties = cpu_to_le16(LE_EXT_ADV_NON_CONN_IND);
+ else
+ cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND);
+ }
+
+ /* If Own_Address_Type equals 0x02 or 0x03, the Peer_Address parameter
+ * contains the peer’s Identity Address and the Peer_Address_Type
+ * parameter contains the peer’s Identity Type (i.e., 0x00 or 0x01).
+ * These parameters are used to locate the corresponding local IRK in
+ * the resolving list; this IRK is used to generate their own address
+ * used in the advertisement.
+ */
+ if (own_addr_type == ADDR_LE_DEV_RANDOM_RESOLVED)
+ hci_copy_identity_address(hdev, &cp.peer_addr,
+ &cp.peer_addr_type);
+
+ cp.own_addr_type = own_addr_type;
+ cp.channel_map = hdev->le_adv_channel_map;
+ cp.handle = adv ? adv->handle : instance;
+
+ if (flags & MGMT_ADV_FLAG_SEC_2M) {
+ cp.primary_phy = HCI_ADV_PHY_1M;
+ cp.secondary_phy = HCI_ADV_PHY_2M;
+ } else if (flags & MGMT_ADV_FLAG_SEC_CODED) {
+ cp.primary_phy = HCI_ADV_PHY_CODED;
+ cp.secondary_phy = HCI_ADV_PHY_CODED;
+ } else {
+ /* In all other cases use 1M */
+ cp.primary_phy = HCI_ADV_PHY_1M;
+ cp.secondary_phy = HCI_ADV_PHY_1M;
+ }
+
+ err = hci_set_ext_adv_params_sync(hdev, adv, &cp, &rp);
+ if (err)
+ return err;
+
+ /* Update adv data as tx power is known now */
+ err = hci_set_ext_adv_data_sync(hdev, cp.handle);
+ if (err)
+ return err;
+
+ if ((own_addr_type == ADDR_LE_DEV_RANDOM ||
+ own_addr_type == ADDR_LE_DEV_RANDOM_RESOLVED) &&
+ bacmp(&random_addr, BDADDR_ANY)) {
+ /* Check if random address need to be updated */
+ if (adv) {
+ if (!bacmp(&random_addr, &adv->random_addr))
+ return 0;
+ } else {
+ if (!bacmp(&random_addr, &hdev->random_addr))
+ return 0;
+ }
+
+ return hci_set_adv_set_random_addr_sync(hdev, instance,
+ &random_addr);
+ }
+
+ return 0;
+}
+
+/* Build the scan response data of @instance and send it with
+ * HCI_OP_LE_SET_EXT_SCAN_RSP_DATA.
+ *
+ * For a non-zero instance nothing is sent when the instance does not exist
+ * or its scan_rsp_changed flag is clear. After a successful command the
+ * flag is cleared, or for instance 0 the data is cached in hdev.
+ */
+static int hci_set_ext_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance)
+{
+	DEFINE_FLEX(struct hci_cp_le_set_ext_scan_rsp_data, pdu, data, length,
+		    HCI_MAX_EXT_AD_LENGTH);
+	struct adv_info *adv = NULL;
+	u8 len;
+	int err;
+
+	if (instance) {
+		adv = hci_find_adv_instance(hdev, instance);
+		if (!adv)
+			return 0;
+		if (!adv->scan_rsp_changed)
+			return 0;
+	}
+
+	len = eir_create_scan_rsp(hdev, instance, pdu->data);
+
+	pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE;
+	pdu->frag_pref = LE_SET_ADV_DATA_NO_FRAG;
+	pdu->handle = adv ? adv->handle : instance;
+	pdu->length = len;
+
+	err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA,
+				    struct_size(pdu, data, len), pdu,
+				    HCI_CMD_TIMEOUT);
+	if (err)
+		return err;
+
+	/* Command succeeded: record the new state */
+	if (!adv) {
+		memcpy(hdev->scan_rsp_data, pdu->data, len);
+		hdev->scan_rsp_data_len = len;
+		return 0;
+	}
+
+	adv->scan_rsp_changed = false;
+
+	return 0;
+}
+
+/* Build the legacy scan response data of @instance and send it with
+ * HCI_OP_LE_SET_SCAN_RSP_DATA, unless it equals what is cached in hdev.
+ */
+static int __hci_set_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance)
+{
+	struct hci_cp_le_set_scan_rsp_data cp;
+	bool unchanged;
+	u8 len;
+
+	memset(&cp, 0, sizeof(cp));
+
+	len = eir_create_scan_rsp(hdev, instance, cp.data);
+	cp.length = len;
+
+	/* Identical data does not need to be sent again */
+	unchanged = hdev->scan_rsp_data_len == len &&
+		    memcmp(cp.data, hdev->scan_rsp_data, len) == 0;
+	if (unchanged)
+		return 0;
+
+	/* Cache the new data before sending it */
+	memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data));
+	hdev->scan_rsp_data_len = len;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_RSP_DATA,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+/* Update the scan response data of @instance, using the extended command
+ * when the controller is ext-adv capable and the legacy one otherwise.
+ * Returns 0 without doing anything when LE is not enabled.
+ */
+int hci_update_scan_rsp_data_sync(struct hci_dev *hdev, u8 instance)
+{
+	if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+		return 0;
+
+	return ext_adv_capable(hdev) ?
+		hci_set_ext_scan_rsp_data_sync(hdev, instance) :
+		__hci_set_scan_rsp_data_sync(hdev, instance);
+}
+
+/* Enable extended advertising for @instance (0 = the default set) with
+ * HCI_OP_LE_SET_EXT_ADV_ENABLE.
+ *
+ * Returns -EINVAL if a non-zero @instance does not exist, 0 if it is
+ * already enabled, otherwise the result of the command.
+ */
+int hci_enable_ext_advertising_sync(struct hci_dev *hdev, u8 instance)
+{
+	struct hci_cp_le_set_ext_adv_enable *cp;
+	struct hci_cp_ext_adv_set *set;
+	u8 data[sizeof(*cp) + sizeof(*set) * 1];
+	struct adv_info *adv;
+
+	if (instance > 0) {
+		adv = hci_find_adv_instance(hdev, instance);
+		if (!adv)
+			return -EINVAL;
+		/* If already enabled there is nothing to do */
+		if (adv->enabled)
+			return 0;
+	} else {
+		adv = NULL;
+	}
+
+	cp = (void *)data;
+	set = (void *)cp->data;
+
+	memset(cp, 0, sizeof(*cp));
+
+	cp->enable = 0x01;
+	cp->num_of_sets = 0x01;
+
+	memset(set, 0, sizeof(*set));
+
+	set->handle = adv ? adv->handle : instance;
+
+	/* Set duration per instance since controller is responsible for
+	 * scheduling it.
+	 */
+	if (adv && adv->timeout) {
+		/* Time = N * 10 ms.
+		 *
+		 * adv->timeout is multiplied by MSEC_PER_SEC, so the
+		 * product no longer fits in 16 bits once the timeout
+		 * exceeds 65; do the arithmetic in 32 bits and clamp to
+		 * the largest value the 16-bit duration field can carry
+		 * instead of letting it wrap to a much shorter duration.
+		 */
+		u32 duration = (u32)adv->timeout * MSEC_PER_SEC / 10;
+
+		if (duration > 0xffff)
+			duration = 0xffff;
+
+		set->duration = cpu_to_le16(duration);
+	}
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE,
+				     sizeof(*cp) +
+				     sizeof(*set) * cp->num_of_sets,
+				     data, HCI_CMD_TIMEOUT);
+}
+
+/* Start extended advertising for @instance: set up the instance, update its
+ * scan response data, then enable it. Stops at the first step returning
+ * non-zero and returns that value.
+ */
+int hci_start_ext_adv_sync(struct hci_dev *hdev, u8 instance)
+{
+	int err;
+
+	err = hci_setup_ext_adv_instance_sync(hdev, instance);
+	if (!err)
+		err = hci_set_ext_scan_rsp_data_sync(hdev, instance);
+	if (!err)
+		err = hci_enable_ext_advertising_sync(hdev, instance);
+
+	return err;
+}
+
+/* Disable periodic advertising on @instance with
+ * HCI_OP_LE_SET_PER_ADV_ENABLE. Returns 0 without sending anything when the
+ * instance does not exist or periodic advertising is not enabled on it.
+ */
+int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance)
+{
+	struct adv_info *adv = hci_find_adv_instance(hdev, instance);
+	struct hci_cp_le_set_per_adv_enable cmd;
+
+	/* If periodic advertising already disabled there is nothing to do. */
+	if (!adv)
+		return 0;
+	if (!adv->periodic_enabled)
+		return 0;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.handle = instance;
+	cmd.enable = 0x00;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE,
+				     sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+}
+
+/* Send HCI_OP_LE_SET_PER_ADV_PARAMS for @instance. A zero @min_interval or
+ * @max_interval is replaced by DISCOV_LE_PER_ADV_INT_MIN / _MAX.
+ */
+static int hci_set_per_adv_params_sync(struct hci_dev *hdev, u8 instance,
+				       u16 min_interval, u16 max_interval)
+{
+	struct hci_cp_le_set_per_adv_params cmd;
+	u16 lo = min_interval ? min_interval : DISCOV_LE_PER_ADV_INT_MIN;
+	u16 hi = max_interval ? max_interval : DISCOV_LE_PER_ADV_INT_MAX;
+
+	memset(&cmd, 0, sizeof(cmd));
+
+	cmd.handle = instance;
+	cmd.min_interval = cpu_to_le16(lo);
+	cmd.max_interval = cpu_to_le16(hi);
+	cmd.periodic_properties = 0x0000;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_PARAMS,
+				     sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+}
+
+/* Build the periodic advertising data of @instance and send it with
+ * HCI_OP_LE_SET_PER_ADV_DATA. For a non-zero instance nothing is sent when
+ * the instance does not exist or is not marked periodic.
+ */
+static int hci_set_per_adv_data_sync(struct hci_dev *hdev, u8 instance)
+{
+	DEFINE_FLEX(struct hci_cp_le_set_per_adv_data, pdu, data, length,
+		    HCI_MAX_PER_AD_LENGTH);
+	struct adv_info *adv = NULL;
+	u8 len;
+
+	if (instance) {
+		adv = hci_find_adv_instance(hdev, instance);
+		if (!adv)
+			return 0;
+		if (!adv->periodic)
+			return 0;
+	}
+
+	len = eir_create_per_adv_data(hdev, instance, pdu->data);
+
+	pdu->handle = adv ? adv->handle : instance;
+	pdu->operation = LE_SET_ADV_DATA_OP_COMPLETE;
+	pdu->length = len;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_DATA,
+				     struct_size(pdu, data, len), pdu,
+				     HCI_CMD_TIMEOUT);
+}
+
+/* Enable periodic advertising on @instance with
+ * HCI_OP_LE_SET_PER_ADV_ENABLE. Returns 0 without sending anything when the
+ * instance exists and already has periodic advertising enabled.
+ */
+static int hci_enable_per_advertising_sync(struct hci_dev *hdev, u8 instance)
+{
+	struct adv_info *adv = hci_find_adv_instance(hdev, instance);
+	struct hci_cp_le_set_per_adv_enable cmd;
+
+	/* If periodic advertising already enabled there is nothing to do. */
+	if (adv && adv->periodic_enabled)
+		return 0;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.handle = instance;
+	cmd.enable = 0x01;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PER_ADV_ENABLE,
+				     sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+}
+
+/* Checks if the periodic advertising data contains a Basic Announcement
+ * and, if it does, generates a Broadcast ID and adds a Broadcast
+ * Announcement in front of the instance's advertising data.
+ *
+ * Returns 0 when nothing has to be done, -EINVAL when the announcement plus
+ * the existing advertising data would not fit into HCI_MAX_EXT_AD_LENGTH
+ * bytes, otherwise the result of hci_update_adv_data_sync().
+ */
+static int hci_adv_bcast_annoucement(struct hci_dev *hdev, struct adv_info *adv)
+{
+	u8 bid[3];
+	u8 ad[HCI_MAX_EXT_AD_LENGTH];
+	u8 len;
+
+	/* Skip if NULL adv as instance 0x00 is used for general purpose
+	 * advertising so it cannot be used for the likes of Broadcast
+	 * Announcement as it can be overwritten at any point.
+	 */
+	if (!adv)
+		return 0;
+
+	/* If the PA data doesn't contain a Basic Audio Announcement then
+	 * there is nothing to do.
+	 */
+	if (!eir_get_service_data(adv->per_adv_data, adv->per_adv_data_len,
+				  0x1851, NULL))
+		return 0;
+
+	/* Check if advertising data already has a Broadcast Announcement since
+	 * the process may want to control the Broadcast ID directly and in that
+	 * case the kernel shall not interfere.
+	 */
+	if (eir_get_service_data(adv->adv_data, adv->adv_data_len, 0x1852,
+				 NULL))
+		return 0;
+
+	/* Generate Broadcast ID */
+	get_random_bytes(bid, sizeof(bid));
+	len = eir_append_service_data(ad, 0, 0x1852, bid, sizeof(bid));
+
+	/* The existing data is appended behind the announcement; refuse to
+	 * write past the end of the on-stack buffer.
+	 */
+	if (len + adv->adv_data_len > sizeof(ad))
+		return -EINVAL;
+
+	memcpy(ad + len, adv->adv_data, adv->adv_data_len);
+	hci_set_adv_instance_data(hdev, adv->instance, len + adv->adv_data_len,
+				  ad, 0, NULL);
+
+	return hci_update_adv_data_sync(hdev, adv->instance);
+}
+
+/* Start periodic advertising on @instance.
+ *
+ * Periodic advertising is first disabled on the instance (result ignored).
+ * For a non-zero @instance the adv_info is looked up (falling back to a
+ * lookup by @sid when the SIDs differ) and turned periodic, or created if
+ * missing. Extended advertising is then started, a Broadcast Announcement
+ * is added when applicable, and the periodic parameters, data and enable
+ * commands are sent.
+ *
+ * Returns 0 on success or the first error encountered; an instance created
+ * by this call is removed again on failure.
+ */
+int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 sid,
+ u8 data_len, u8 *data, u32 flags, u16 min_interval,
+ u16 max_interval, u16 sync_interval)
+{
+ struct adv_info *adv = NULL;
+ int err;
+ bool added = false;
+
+ hci_disable_per_advertising_sync(hdev, instance);
+
+ if (instance) {
+ adv = hci_find_adv_instance(hdev, instance);
+ if (adv) {
+ if (sid != HCI_SID_INVALID && adv->sid != sid) {
+ /* If the SID doesn't match attempt to find by
+ * SID.
+ */
+ adv = hci_find_adv_sid(hdev, sid);
+ if (!adv) {
+ bt_dev_err(hdev,
+ "Unable to find adv_info");
+ return -EINVAL;
+ }
+ }
+
+ /* Turn it into periodic advertising */
+ adv->periodic = true;
+ adv->per_adv_data_len = data_len;
+ if (data)
+ memcpy(adv->per_adv_data, data, data_len);
+ adv->flags = flags;
+ } else if (!adv) {
+ /* Create an instance if that could not be found */
+ adv = hci_add_per_instance(hdev, instance, sid, flags,
+ data_len, data,
+ sync_interval,
+ sync_interval);
+ if (IS_ERR(adv))
+ return PTR_ERR(adv);
+ adv->pending = false;
+ /* Remember to undo the creation on failure */
+ added = true;
+ }
+ }
+
+ /* Start advertising */
+ err = hci_start_ext_adv_sync(hdev, instance);
+ if (err < 0)
+ goto fail;
+
+ err = hci_adv_bcast_annoucement(hdev, adv);
+ if (err < 0)
+ goto fail;
+
+ err = hci_set_per_adv_params_sync(hdev, instance, min_interval,
+ max_interval);
+ if (err < 0)
+ goto fail;
+
+ err = hci_set_per_adv_data_sync(hdev, instance);
+ if (err < 0)
+ goto fail;
+
+ err = hci_enable_per_advertising_sync(hdev, instance);
+ if (err < 0)
+ goto fail;
+
+ return 0;
+
+fail:
+ /* Only remove the instance if this call created it */
+ if (added)
+ hci_remove_adv_instance(hdev, instance);
+
+ return err;
+}
+
+/* Start advertising for @instance. Ext-adv capable controllers take the
+ * extended path; otherwise the advertising data and the scan response data
+ * are updated before legacy advertising is enabled.
+ */
+static int hci_start_adv_sync(struct hci_dev *hdev, u8 instance)
+{
+	int err;
+
+	if (ext_adv_capable(hdev))
+		return hci_start_ext_adv_sync(hdev, instance);
+
+	err = hci_update_adv_data_sync(hdev, instance);
+	if (!err)
+		err = hci_update_scan_rsp_data_sync(hdev, instance);
+	if (!err)
+		err = hci_enable_advertising_sync(hdev);
+
+	return err;
+}
+
+/* Enable advertising for hdev->cur_adv_instance.
+ *
+ * Ext-adv capable controllers are handed over to
+ * hci_enable_ext_advertising_sync(). Otherwise advertising is disabled,
+ * the own address is updated, and the legacy advertising parameters and
+ * enable commands are sent.
+ *
+ * Returns 0 on success, -EINVAL if advertising is currently not allowed, or
+ * the unmodified error of the failing step.
+ */
+int hci_enable_advertising_sync(struct hci_dev *hdev)
+{
+	struct adv_info *adv_instance;
+	struct hci_cp_le_set_adv_param cp;
+	u8 own_addr_type, enable = 0x01;
+	bool connectable;
+	u16 adv_min_interval, adv_max_interval;
+	u32 flags;
+	/* Must be an int: the helpers below return int and may report a
+	 * negative errno, which a u8 would turn into a positive value.
+	 */
+	int err;
+
+	if (ext_adv_capable(hdev))
+		return hci_enable_ext_advertising_sync(hdev,
+						       hdev->cur_adv_instance);
+
+	flags = hci_adv_instance_flags(hdev, hdev->cur_adv_instance);
+	adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance);
+
+	/* If the "connectable" instance flag was not set, then choose between
+	 * ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
+	 */
+	connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
+		      mgmt_get_connectable(hdev);
+
+	if (!is_advertising_allowed(hdev, connectable))
+		return -EINVAL;
+
+	err = hci_disable_advertising_sync(hdev);
+	if (err)
+		return err;
+
+	/* Clear the HCI_LE_ADV bit temporarily so that the
+	 * hci_update_random_address knows that it's safe to go ahead
+	 * and write a new random address. The flag will be set back on
+	 * as soon as the SET_ADV_ENABLE HCI command completes.
+	 */
+	hci_dev_clear_flag(hdev, HCI_LE_ADV);
+
+	/* Set require_privacy to true only when non-connectable
+	 * advertising is used. In that case it is fine to use a
+	 * non-resolvable private address.
+	 */
+	err = hci_update_random_address_sync(hdev, !connectable,
+					     adv_use_rpa(hdev, flags),
+					     &own_addr_type);
+	if (err)
+		return err;
+
+	memset(&cp, 0, sizeof(cp));
+
+	/* Per-instance intervals when available, hdev defaults otherwise */
+	if (adv_instance) {
+		adv_min_interval = adv_instance->min_interval;
+		adv_max_interval = adv_instance->max_interval;
+	} else {
+		adv_min_interval = hdev->le_adv_min_interval;
+		adv_max_interval = hdev->le_adv_max_interval;
+	}
+
+	if (connectable) {
+		cp.type = LE_ADV_IND;
+	} else {
+		if (hci_adv_instance_is_scannable(hdev, hdev->cur_adv_instance))
+			cp.type = LE_ADV_SCAN_IND;
+		else
+			cp.type = LE_ADV_NONCONN_IND;
+
+		if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE) ||
+		    hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) {
+			adv_min_interval = DISCOV_LE_FAST_ADV_INT_MIN;
+			adv_max_interval = DISCOV_LE_FAST_ADV_INT_MAX;
+		}
+	}
+
+	cp.min_interval = cpu_to_le16(adv_min_interval);
+	cp.max_interval = cpu_to_le16(adv_max_interval);
+	cp.own_address_type = own_addr_type;
+	cp.channel_map = hdev->le_adv_channel_map;
+
+	err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_PARAM,
+				    sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+	if (err)
+		return err;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE,
+				     sizeof(enable), &enable, HCI_CMD_TIMEOUT);
+}
+
+/* Callback handed to hci_cmd_sync_queue(); @data is not used. */
+static int enable_advertising_sync(struct hci_dev *hdev, void *data)
+{
+	int err = hci_enable_advertising_sync(hdev);
+
+	return err;
+}
+
+/* Queue enable_advertising_sync() if there is anything to advertise.
+ *
+ * Returns 0 without queuing when HCI_ADVERTISING is clear and no advertising
+ * instance is registered, otherwise the result of hci_cmd_sync_queue().
+ */
+int hci_enable_advertising(struct hci_dev *hdev)
+{
+	bool wanted = hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+		      !list_empty(&hdev->adv_instances);
+
+	if (!wanted)
+		return 0;
+
+	return hci_cmd_sync_queue(hdev, enable_advertising_sync, NULL, NULL);
+}
+
+/* Disable an extended advertising set and remove it from the controller.
+ *
+ * Does nothing (returns 0) when the controller is not ext_adv_capable().
+ * Returns -EINVAL when a non-zero @instance is not known to the host, or the
+ * error of the disable/remove command.
+ */
+int hci_remove_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance,
+				     struct sock *sk)
+{
+	int ret;
+
+	if (!ext_adv_capable(hdev))
+		return 0;
+
+	/* The set is disabled first, before the instance is looked up. */
+	ret = hci_disable_ext_adv_instance_sync(hdev, instance);
+	if (ret)
+		return ret;
+
+	/* Instance 0x00 is never looked up; any other one must exist. */
+	if (instance != 0 && !hci_find_adv_instance(hdev, instance))
+		return -EINVAL;
+
+	ret = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_REMOVE_ADV_SET,
+				       sizeof(instance), &instance, 0,
+				       HCI_CMD_TIMEOUT, sk);
+	return ret;
+}
+
+/* Send HCI_OP_LE_TERM_BIG for @handle with the given @reason and return the
+ * command status.
+ */
+int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason)
+{
+	struct hci_cp_le_term_big cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.reason = reason;
+	cmd.handle = handle;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_TERM_BIG,
+				     sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+}
+
+/* Make @instance the current advertising instance and start advertising it.
+ *
+ * Returns -EPERM when HCI_ADVERTISING is set on a controller without extended
+ * advertising, -EBUSY when an instance timeout is already pending, -ENOENT
+ * when @instance does not exist, otherwise 0 or the result of
+ * hci_start_adv_sync(). With @force false no HCI command is issued if
+ * @instance is already the one being advertised.
+ */
+int hci_schedule_adv_instance_sync(struct hci_dev *hdev, u8 instance,
+				   bool force)
+{
+	struct adv_info *inst;
+	u16 secs;
+
+	if (hci_dev_test_flag(hdev, HCI_ADVERTISING) && !ext_adv_capable(hdev))
+		return -EPERM;
+
+	if (hdev->adv_instance_timeout != 0)
+		return -EBUSY;
+
+	inst = hci_find_adv_instance(hdev, instance);
+	if (!inst)
+		return -ENOENT;
+
+	/* A zero timeout means unlimited advertising, so the slot always lasts
+	 * for the full duration. A limited instance gets the duration as well
+	 * unless less lifetime than that remains, in which case the slot is
+	 * shortened to the remaining lifetime.
+	 */
+	secs = (inst->timeout != 0 && inst->duration > inst->remaining_time) ?
+		inst->remaining_time : inst->duration;
+
+	/* Only instances with a time limit consume their remaining lifetime */
+	if (inst->timeout != 0)
+		inst->remaining_time -= secs;
+
+	/* Legacy advertising rotates instances with a delayed work item */
+	if (!ext_adv_capable(hdev)) {
+		hdev->adv_instance_timeout = secs;
+		queue_delayed_work(hdev->req_workqueue,
+				   &hdev->adv_instance_expire,
+				   secs_to_jiffies(secs));
+	}
+
+	/* Re-scheduling the instance that is already on air (the single
+	 * instance case) needs no HCI traffic unless the caller forces it.
+	 */
+	if (!force && hdev->cur_adv_instance == instance &&
+	    hci_dev_test_flag(hdev, HCI_LE_ADV))
+		return 0;
+
+	hdev->cur_adv_instance = instance;
+
+	return hci_start_adv_sync(hdev, instance);
+}
+
+/* Disable all extended advertising sets and clear them from the controller.
+ * Returns 0 right away when the controller is not ext_adv_capable().
+ */
+static int hci_clear_adv_sets_sync(struct hci_dev *hdev, struct sock *sk)
+{
+	int ret;
+
+	if (!ext_adv_capable(hdev))
+		return 0;
+
+	/* Instance 0x00 stands for "all instances" when disabling */
+	ret = hci_disable_ext_adv_instance_sync(hdev, 0x00);
+	if (ret)
+		return ret;
+
+	ret = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CLEAR_ADV_SETS,
+				       0, NULL, 0, HCI_CMD_TIMEOUT, sk);
+	return ret;
+}
+
+/* Remove advertising instances.
+ *
+ * With extended advertising all sets are cleared on the controller. Otherwise
+ * the host-side instances are removed under hdev->lock: all of them when
+ * @force is set, else only those that have a timeout. Each successful removal
+ * is reported through mgmt_advertising_removed().
+ */
+static int hci_clear_adv_sync(struct hci_dev *hdev, struct sock *sk, bool force)
+{
+	struct adv_info *cur, *tmp;
+
+	/* Remove all existing sets */
+	if (ext_adv_capable(hdev))
+		return hci_clear_adv_sets_sync(hdev, sk);
+
+	/* Holding the lock is fine here because no command is sent while it
+	 * is held.
+	 */
+	hci_dev_lock(hdev);
+
+	/* Cleanup non-ext instances */
+	list_for_each_entry_safe(cur, tmp, &hdev->adv_instances, list) {
+		/* Copy the id first; it is still used after the removal */
+		u8 id = cur->instance;
+
+		if (!force && !cur->timeout)
+			continue;
+
+		if (hci_remove_adv_instance(hdev, id) == 0)
+			mgmt_advertising_removed(sk, hdev, id);
+	}
+
+	hci_dev_unlock(hdev);
+
+	return 0;
+}
+
+/* Remove a single advertising instance.
+ *
+ * With extended advertising the set is removed on the controller. Otherwise
+ * the host-side instance is removed under hdev->lock and, on success,
+ * reported through mgmt_advertising_removed().
+ */
+static int hci_remove_adv_sync(struct hci_dev *hdev, u8 instance,
+			       struct sock *sk)
+{
+	int ret;
+
+	/* If we use extended advertising, instance has to be removed first. */
+	if (ext_adv_capable(hdev))
+		return hci_remove_ext_adv_instance_sync(hdev, instance, sk);
+
+	/* Holding the lock is fine here because no command is sent while it
+	 * is held.
+	 */
+	hci_dev_lock(hdev);
+
+	ret = hci_remove_adv_instance(hdev, instance);
+	if (ret == 0)
+		mgmt_advertising_removed(sk, hdev, instance);
+
+	hci_dev_unlock(hdev);
+
+	return ret;
+}
+
+/* Remove advertising instance(s) and, with legacy advertising, schedule the
+ * next instance afterwards.
+ *
+ * For a single instance (instance != 0x00):
+ * - force == true: The instance will be removed even when its remaining
+ *   lifetime is not zero.
+ * - force == false: the instance is only removed when it has a timeout and
+ *   its remaining lifetime is zero; otherwise it is kept stored.
+ *
+ * For instance == 0x00:
+ * - force == true: All instances will be removed regardless of their timeout
+ *   setting.
+ * - force == false: Only instances that have a timeout will be removed.
+ */
+int hci_remove_advertising_sync(struct hci_dev *hdev, struct sock *sk,
+				u8 instance, bool force)
+{
+	struct adv_info *successor = NULL;
+	int err;
+
+	/* Cancel any timeout concerning the removed instance(s). */
+	if (instance == 0x00 || hdev->cur_adv_instance == instance)
+		cancel_adv_timeout(hdev);
+
+	/* The successor has to be looked up BEFORE the current instance is
+	 * removed. With a single instance this yields the same one again.
+	 */
+	if (hdev->cur_adv_instance == instance)
+		successor = hci_get_next_instance(hdev, instance);
+
+	if (instance == 0x00) {
+		err = hci_clear_adv_sync(hdev, sk, force);
+		if (err)
+			return err;
+	} else {
+		struct adv_info *adv = hci_find_adv_instance(hdev, instance);
+		bool expired = adv && adv->timeout && !adv->remaining_time;
+
+		if (force || expired) {
+			/* Don't advertise a removed instance. */
+			if (successor && successor->instance == instance)
+				successor = NULL;
+
+			err = hci_remove_adv_sync(hdev, instance, sk);
+			if (err)
+				return err;
+		}
+	}
+
+	if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING))
+		return 0;
+
+	/* The scheduling result is deliberately not propagated */
+	if (successor && !ext_adv_capable(hdev))
+		hci_schedule_adv_instance_sync(hdev, successor->instance,
+					       false);
+
+	return 0;
+}
+
+/* Send HCI_OP_READ_RSSI for connection @handle and return the command
+ * status. @handle is passed through unchanged (already little endian).
+ */
+int hci_read_rssi_sync(struct hci_dev *hdev, __le16 handle)
+{
+	struct hci_cp_read_rssi cmd;
+
+	cmd.handle = handle;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_READ_RSSI,
+				     sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+}
+
+/* Send HCI_OP_READ_CLOCK with the caller-prepared parameters in @cp and
+ * return the command status.
+ */
+int hci_read_clock_sync(struct hci_dev *hdev, struct hci_cp_read_clock *cp)
+{
+	int err;
+
+	err = __hci_cmd_sync_status(hdev, HCI_OP_READ_CLOCK,
+				    sizeof(*cp), cp, HCI_CMD_TIMEOUT);
+	return err;
+}
+
+/* Send HCI_OP_READ_TX_POWER for connection @handle with the requested @type
+ * and return the command status.
+ */
+int hci_read_tx_power_sync(struct hci_dev *hdev, __le16 handle, u8 type)
+{
+	struct hci_cp_read_tx_power cmd;
+
+	cmd.type = type;
+	cmd.handle = handle;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_READ_TX_POWER,
+				     sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+}
+
+/* Stop advertising on the controller.
+ *
+ * Returns 0 when HCI_LE_ADV is not set. With extended advertising all sets
+ * are disabled (instance 0x00), otherwise legacy advertising is switched off.
+ */
+int hci_disable_advertising_sync(struct hci_dev *hdev)
+{
+	u8 off = 0x00;
+
+	/* Nothing to do unless the controller is advertising */
+	if (!hci_dev_test_flag(hdev, HCI_LE_ADV))
+		return 0;
+
+	if (!ext_adv_capable(hdev))
+		return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE,
+					     sizeof(off), &off,
+					     HCI_CMD_TIMEOUT);
+
+	return hci_disable_ext_adv_instance_sync(hdev, 0x00);
+}
+
+/* Send HCI_OP_LE_SET_EXT_SCAN_ENABLE with enable = @val.
+ *
+ * Duplicate filtering follows @filter_dup, except that it is always disabled
+ * while HCI_MESH is set.
+ */
+static int hci_le_set_ext_scan_enable_sync(struct hci_dev *hdev, u8 val,
+					   u8 filter_dup)
+{
+	struct hci_cp_le_set_ext_scan_enable cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.enable = val;
+	cmd.filter_dup = hci_dev_test_flag(hdev, HCI_MESH) ?
+			 LE_SCAN_FILTER_DUP_DISABLE : filter_dup;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_ENABLE,
+				     sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+}
+
+/* Enable or disable LE scanning (@val), using the extended command when
+ * use_ext_scan() says so.
+ *
+ * For the legacy command duplicate filtering follows @filter_dup, except that
+ * it is disabled when scanning is being enabled while HCI_MESH is set.
+ */
+static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val,
+				       u8 filter_dup)
+{
+	struct hci_cp_le_set_scan_enable cmd;
+
+	if (use_ext_scan(hdev))
+		return hci_le_set_ext_scan_enable_sync(hdev, val, filter_dup);
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.enable = val;
+	cmd.filter_dup = (val && hci_dev_test_flag(hdev, HCI_MESH)) ?
+			 LE_SCAN_FILTER_DUP_DISABLE : filter_dup;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_ENABLE,
+				     sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+}
+
+/* Switch controller address resolution on or off (@val).
+ *
+ * Returns 0 without sending anything when the controller is not
+ * ll_privacy_capable() or HCI_LL_RPA_RESOLUTION already matches @val.
+ */
+static int hci_le_set_addr_resolution_enable_sync(struct hci_dev *hdev, u8 val)
+{
+	/* Unsupported, or already in the requested state: we are done. */
+	if (!ll_privacy_capable(hdev) ||
+	    val == hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION))
+		return 0;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE,
+				     sizeof(val), &val, HCI_CMD_TIMEOUT);
+}
+
+/* Disable LE scanning if it is running.
+ *
+ * Returns 0 when HCI_LE_SCAN is not set or scanning is paused, otherwise the
+ * status of the scan disable command (logged on failure).
+ */
+static int hci_scan_disable_sync(struct hci_dev *hdev)
+{
+	int ret;
+
+	/* Nothing to do unless the controller is scanning */
+	if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
+		return 0;
+
+	if (hdev->scanning_paused) {
+		bt_dev_dbg(hdev, "Scanning is paused for suspend");
+		return 0;
+	}
+
+	ret = hci_le_set_scan_enable_sync(hdev, LE_SCAN_DISABLE, 0x00);
+	if (ret)
+		bt_dev_err(hdev, "Unable to disable scanning: %d", ret);
+
+	return ret;
+}
+
+/* Scanning uses an RPA exactly when HCI_PRIVACY is set. */
+static bool scan_use_rpa(struct hci_dev *hdev)
+{
+	bool privacy = hci_dev_test_flag(hdev, HCI_PRIVACY);
+
+	return privacy;
+}
+
+/* Set the interleave state to INTERLEAVE_SCAN_NO_FILTER and queue the
+ * interleave scan work without delay.
+ */
+static void hci_start_interleave_scan(struct hci_dev *hdev)
+{
+	const unsigned long no_delay = 0;
+
+	hdev->interleave_scan_state = INTERLEAVE_SCAN_NO_FILTER;
+	queue_delayed_work(hdev->req_workqueue, &hdev->interleave_scan,
+			   no_delay);
+}
+
+/* Cancel the interleave scan work, waiting for it to finish, and only then
+ * reset the state to INTERLEAVE_SCAN_NONE.
+ */
+static void cancel_interleave_scan(struct hci_dev *hdev)
+{
+	bt_dev_dbg(hdev, "cancelling interleave scan");
+	cancel_delayed_work_sync(&hdev->interleave_scan);
+	hdev->interleave_scan_state = INTERLEAVE_SCAN_NONE;
+}
+
+/* Start or cancel interleave scanning as needed.
+ *
+ * Return true if interleave scanning was not running and has just been
+ * started by this call, otherwise return false.
+ */
+static bool hci_update_interleaved_scan_sync(struct hci_dev *hdev)
+{
+	/* Interleaving is wanted only if all of the following are true:
+	 * - There is at least one ADV monitor
+	 * - At least one pending LE connection or one device to be scanned for
+	 * - Monitor offloading is not supported
+	 * In that case scanning alternates between the allowlist and a scan
+	 * without any filters to save power.
+	 */
+	bool wanted = hci_is_adv_monitoring(hdev) &&
+		      (!list_empty(&hdev->pend_le_conns) ||
+		       !list_empty(&hdev->pend_le_reports)) &&
+		      hci_get_adv_monitor_offload_ext(hdev) ==
+		      HCI_ADV_MONITOR_EXT_NONE;
+	bool running = is_interleave_scanning(hdev);
+
+	/* Already in the desired state */
+	if (wanted == running)
+		return false;
+
+	if (wanted) {
+		hci_start_interleave_scan(hdev);
+		bt_dev_dbg(hdev, "starting interleave scan");
+		return true;
+	}
+
+	cancel_interleave_scan(hdev);
+
+	return false;
+}
+
+/* Removes a device from the controller resolving list if needed.
+ *
+ * Returns 0 without sending anything when the controller is not
+ * ll_privacy_capable() or the address is not in hdev->le_resolv_list.
+ */
+static int hci_le_del_resolve_list_sync(struct hci_dev *hdev,
+					bdaddr_t *bdaddr, u8 bdaddr_type)
+{
+	struct hci_cp_le_del_from_resolv_list cmd;
+
+	if (!ll_privacy_capable(hdev))
+		return 0;
+
+	/* Only entries whose IRK has been programmed need to be removed */
+	if (!hci_bdaddr_list_lookup_with_irk(&hdev->le_resolv_list, bdaddr,
+					     bdaddr_type))
+		return 0;
+
+	bacpy(&cmd.bdaddr, bdaddr);
+	cmd.bdaddr_type = bdaddr_type;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_DEL_FROM_RESOLV_LIST,
+				     sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+}
+
+/* Remove a device from the controller accept list, and from the resolving
+ * list as well.
+ *
+ * Returns 0 when the device is not on hdev->le_accept_list or was removed,
+ * otherwise the error of the accept list removal.
+ */
+static int hci_le_del_accept_list_sync(struct hci_dev *hdev,
+				       bdaddr_t *bdaddr, u8 bdaddr_type)
+{
+	struct hci_cp_le_del_from_accept_list cmd;
+	int ret;
+
+	/* Nothing to remove unless the device is on the accept list */
+	if (!hci_bdaddr_list_lookup(&hdev->le_accept_list, bdaddr, bdaddr_type))
+		return 0;
+
+	bacpy(&cmd.bdaddr, bdaddr);
+	cmd.bdaddr_type = bdaddr_type;
+
+	/* Errors from the resolving list removal are ignored since the
+	 * device was most likely never added to it.
+	 */
+	hci_le_del_resolve_list_sync(hdev, &cmd.bdaddr, cmd.bdaddr_type);
+
+	ret = __hci_cmd_sync_status(hdev, HCI_OP_LE_DEL_FROM_ACCEPT_LIST,
+				    sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+	if (ret) {
+		bt_dev_err(hdev, "Unable to remove from allow list: %d", ret);
+		return ret;
+	}
+
+	bt_dev_dbg(hdev, "Remove %pMR (0x%x) from allow list", &cmd.bdaddr,
+		   cmd.bdaddr_type);
+
+	return 0;
+}
+
+struct conn_params { /* by-value snapshot of hci_conn_params fields, filled by conn_params_copy() */
+ bdaddr_t addr; /* address copied from hci_conn_params */
+ u8 addr_type; /* address type belonging to addr */
+ hci_conn_flags_t flags; /* HCI_CONN_FLAG_* bits at copy time */
+ u8 privacy_mode; /* privacy mode at copy time */
+};
+
+/* Adds a device to the controller resolving list if needed.
+ *
+ * With @params == NULL the local identity address and hdev->irk are
+ * programmed instead (only while HCI_PRIVACY is set).
+ *
+ * Returns 0 when nothing has to be programmed: no LL privacy support, address
+ * resolution not requested in @params->flags, no IRK known for the device, or
+ * the device already in hdev->le_resolv_list. Otherwise returns the status of
+ * HCI_OP_LE_ADD_TO_RESOLV_LIST.
+ */
+static int hci_le_add_resolve_list_sync(struct hci_dev *hdev,
+					struct conn_params *params)
+{
+	struct hci_cp_le_add_to_resolv_list cp;
+	struct hci_conn_params *pend;
+	struct smp_irk *irk;
+
+	if (!ll_privacy_capable(hdev))
+		return 0;
+
+	/* No params: program the local identity address, type and IRK */
+	if (!params) {
+		if (!hci_dev_test_flag(hdev, HCI_PRIVACY))
+			return 0;
+
+		hci_copy_identity_address(hdev, &cp.bdaddr, &cp.bdaddr_type);
+		memcpy(cp.peer_irk, hdev->irk, 16);
+		goto done;
+	}
+
+	if (!(params->flags & HCI_CONN_FLAG_ADDRESS_RESOLUTION))
+		return 0;
+
+	irk = hci_find_irk_by_addr(hdev, &params->addr, params->addr_type);
+	if (!irk)
+		return 0;
+
+	/* Skip devices whose IRK has already been programmed */
+	if (hci_bdaddr_list_lookup_with_irk(&hdev->le_resolv_list,
+					    &params->addr, params->addr_type))
+		return 0;
+
+	bacpy(&cp.bdaddr, &params->addr);
+	cp.bdaddr_type = params->addr_type;
+	memcpy(cp.peer_irk, irk->val, 16);
+
+	/* Default privacy mode is always Network: record it in the copy and
+	 * in the matching pending connection/report entry, if there is one.
+	 */
+	params->privacy_mode = HCI_NETWORK_PRIVACY;
+
+	rcu_read_lock();
+	pend = hci_pend_le_action_lookup(&hdev->pend_le_conns,
+					 &params->addr, params->addr_type);
+	if (!pend)
+		pend = hci_pend_le_action_lookup(&hdev->pend_le_reports,
+						 &params->addr,
+						 params->addr_type);
+	if (pend)
+		WRITE_ONCE(pend->privacy_mode, HCI_NETWORK_PRIVACY);
+	rcu_read_unlock();
+
+done:
+	/* The local IRK is only handed out while privacy is enabled */
+	if (hci_dev_test_flag(hdev, HCI_PRIVACY))
+		memcpy(cp.local_irk, hdev->irk, 16);
+	else
+		memset(cp.local_irk, 0, 16);
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_ADD_TO_RESOLV_LIST,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+/* Set Device Privacy Mode for the device described by @params.
+ *
+ * Returns 0 without sending anything unless LL privacy is supported, both
+ * HCI_CONN_FLAG_ADDRESS_RESOLUTION and HCI_CONN_FLAG_DEVICE_PRIVACY are set,
+ * the mode is not already HCI_DEVICE_PRIVACY and an IRK is known for the
+ * device. Otherwise returns the status of HCI_OP_LE_SET_PRIVACY_MODE.
+ */
+static int hci_le_set_privacy_mode_sync(struct hci_dev *hdev,
+					struct conn_params *params)
+{
+	struct hci_cp_le_set_privacy_mode cmd;
+	struct smp_irk *irk;
+
+	if (!ll_privacy_capable(hdev))
+		return 0;
+
+	if (!(params->flags & HCI_CONN_FLAG_ADDRESS_RESOLUTION))
+		return 0;
+
+	/* Nothing to do if device privacy mode has already been set */
+	if (params->privacy_mode == HCI_DEVICE_PRIVACY)
+		return 0;
+
+	/* HCI_CONN_FLAG_DEVICE_PRIVACY also indicates that LL Privacy has
+	 * been enabled and HCI_OP_LE_SET_PRIVACY_MODE is supported.
+	 */
+	if (!(params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY))
+		return 0;
+
+	irk = hci_find_irk_by_addr(hdev, &params->addr, params->addr_type);
+	if (!irk)
+		return 0;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.mode = HCI_DEVICE_PRIVACY;
+	bacpy(&cmd.bdaddr, &irk->bdaddr);
+	cmd.bdaddr_type = irk->addr_type;
+
+	/* Note: params->privacy_mode is not updated since it is a copy */
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PRIVACY_MODE,
+				     sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+}
+
+/* Adds a device to the controller accept list if needed. Before that the
+ * device is programmed into the resolving list and its privacy mode is set;
+ * both helpers do nothing when that is not applicable.
+ *
+ * @num_entries counts the accept list entries in use and is incremented when
+ * a new entry is about to be added.
+ *
+ * Returns 0 on success or when nothing had to be added, -ENOSPC when the
+ * accept list is full, or the error of a failing command.
+ */
+static int hci_le_add_accept_list_sync(struct hci_dev *hdev,
+				       struct conn_params *params,
+				       u8 *num_entries)
+{
+	struct hci_cp_le_add_to_accept_list cmd;
+	int ret;
+
+	/* During suspend, only wakeable devices can be in acceptlist, so
+	 * anything else is taken off the list instead of being added.
+	 */
+	if (hdev->suspended &&
+	    !(params->flags & HCI_CONN_FLAG_REMOTE_WAKEUP)) {
+		hci_le_del_accept_list_sync(hdev, &params->addr,
+					    params->addr_type);
+		return 0;
+	}
+
+	/* The controller accept list has no room left */
+	if (*num_entries >= hdev->le_accept_list_size)
+		return -ENOSPC;
+
+	/* Attempt to program the device in the resolving list first to avoid
+	 * having to rollback in case it fails since the resolving list is
+	 * dynamic it can probably be smaller than the accept list.
+	 */
+	ret = hci_le_add_resolve_list_sync(hdev, params);
+	if (ret) {
+		bt_dev_err(hdev, "Unable to add to resolve list: %d", ret);
+		return ret;
+	}
+
+	/* Set Privacy Mode */
+	ret = hci_le_set_privacy_mode_sync(hdev, params);
+	if (ret) {
+		bt_dev_err(hdev, "Unable to set privacy mode: %d", ret);
+		return ret;
+	}
+
+	/* Devices already in the accept list need no further command */
+	if (hci_bdaddr_list_lookup(&hdev->le_accept_list, &params->addr,
+				   params->addr_type))
+		return 0;
+
+	(*num_entries)++;
+	bacpy(&cmd.bdaddr, &params->addr);
+	cmd.bdaddr_type = params->addr_type;
+
+	ret = __hci_cmd_sync_status(hdev, HCI_OP_LE_ADD_TO_ACCEPT_LIST,
+				    sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+	if (ret) {
+		bt_dev_err(hdev, "Unable to add to allow list: %d", ret);
+		/* Rollback the device from the resolving list */
+		hci_le_del_resolve_list_sync(hdev, &cmd.bdaddr,
+					     cmd.bdaddr_type);
+		return ret;
+	}
+
+	bt_dev_dbg(hdev, "Add %pMR (0x%x) to allow list", &cmd.bdaddr,
+		   cmd.bdaddr_type);
+
+	return 0;
+}
+
+/* This function disables/pauses all advertising instances.
+ *
+ * Returns 0 when the controller is not advertising, is already paused, or
+ * advertising was paused successfully; otherwise the error of
+ * hci_disable_advertising_sync(). Whether HCI_ADVERTISING was set is saved
+ * in hdev->advertising_old_state.
+ */
+static int hci_pause_advertising_sync(struct hci_dev *hdev)
+{
+	int was_advertising;
+	int ret;
+
+	/* If controller is not advertising we are done. */
+	if (!hci_dev_test_flag(hdev, HCI_LE_ADV))
+		return 0;
+
+	/* If already been paused there is nothing to do. */
+	if (hdev->advertising_paused)
+		return 0;
+
+	bt_dev_dbg(hdev, "Pausing directed advertising");
+
+	/* Stop directed advertising */
+	was_advertising = hci_dev_test_flag(hdev, HCI_ADVERTISING);
+	if (was_advertising) {
+		/* When discoverable timeout triggers, then just make sure
+		 * the limited discoverable flag is cleared. Even in the case
+		 * of a timeout triggered from general discoverable, it is
+		 * safe to unconditionally clear the flag.
+		 */
+		hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+		hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+		hdev->discov_timeout = 0;
+	}
+
+	bt_dev_dbg(hdev, "Pausing advertising instances");
+
+	/* Call to disable any advertisements active on the controller.
+	 * This will succeed even if no advertisements are configured.
+	 */
+	ret = hci_disable_advertising_sync(hdev);
+	if (ret)
+		return ret;
+
+	/* If we are using software rotation, pause the loop */
+	if (!ext_adv_capable(hdev))
+		cancel_adv_timeout(hdev);
+
+	hdev->advertising_old_state = was_advertising;
+	hdev->advertising_paused = true;
+
+	return 0;
+}
+
+/* This function re-enables all user advertising instances after
+ * hci_pause_advertising_sync() paused them.
+ *
+ * Returns 0 when advertising was not paused or there was nothing left to
+ * re-enable. Otherwise returns the result of the last enable attempt: with
+ * extended advertising that is the last instance tried, or instance 0x00 if
+ * HCI_LE_ADV_0 was set; with legacy advertising it is the result of
+ * rescheduling the current instance.
+ */
+static int hci_resume_advertising_sync(struct hci_dev *hdev)
+{
+	struct adv_info *adv, *tmp;
+	/* Start from success: with extended advertising neither the loop nor
+	 * the instance 0x00 branch may run, and err must not be returned
+	 * uninitialized in that case.
+	 */
+	int err = 0;
+
+	/* If advertising has not been paused there is nothing to do. */
+	if (!hdev->advertising_paused)
+		return 0;
+
+	/* Resume directed advertising */
+	hdev->advertising_paused = false;
+	if (hdev->advertising_old_state) {
+		hci_dev_set_flag(hdev, HCI_ADVERTISING);
+		hdev->advertising_old_state = 0;
+	}
+
+	bt_dev_dbg(hdev, "Resuming advertising instances");
+
+	if (ext_adv_capable(hdev)) {
+		/* Call for each tracked instance to be re-enabled */
+		list_for_each_entry_safe(adv, tmp, &hdev->adv_instances, list) {
+			err = hci_enable_ext_advertising_sync(hdev,
+							      adv->instance);
+			if (!err)
+				continue;
+
+			/* If the instance cannot be resumed remove it */
+			hci_remove_ext_adv_instance_sync(hdev, adv->instance,
+							 NULL);
+		}
+
+		/* If current advertising instance is set to instance 0x00
+		 * then we need to re-enable it.
+		 */
+		if (hci_dev_test_and_clear_flag(hdev, HCI_LE_ADV_0))
+			err = hci_enable_ext_advertising_sync(hdev, 0x00);
+	} else {
+		/* Schedule for most recent instance to be restarted and begin
+		 * the software rotation loop
+		 */
+		err = hci_schedule_adv_instance_sync(hdev,
+						     hdev->cur_adv_instance,
+						     true);
+	}
+
+	hdev->advertising_paused = false;
+
+	return err;
+}
+
+/* Disable controller address resolution, pausing advertising first.
+ *
+ * Returns 0 when LL privacy is unsupported or resolution is already off,
+ * -EPERM while scanning or an LE connection attempt is in progress, or the
+ * error of the failing step. Advertising stays paused only when resolution
+ * was disabled successfully and scan_use_rpa() is true; in every other case
+ * it is resumed before returning.
+ */
+static int hci_pause_addr_resolution(struct hci_dev *hdev)
+{
+	int ret;
+
+	if (!ll_privacy_capable(hdev))
+		return 0;
+
+	if (!hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION))
+		return 0;
+
+	/* Cannot disable addr resolution if scanning is enabled or
+	 * when initiating an LE connection.
+	 */
+	if (hci_dev_test_flag(hdev, HCI_LE_SCAN) ||
+	    hci_lookup_le_connect(hdev)) {
+		bt_dev_err(hdev, "Command not allowed when scan/LE connect");
+		return -EPERM;
+	}
+
+	/* Cannot disable addr resolution if advertising is enabled. */
+	ret = hci_pause_advertising_sync(hdev);
+	if (ret) {
+		bt_dev_err(hdev, "Pause advertising failed: %d", ret);
+		return ret;
+	}
+
+	ret = hci_le_set_addr_resolution_enable_sync(hdev, 0x00);
+	if (ret)
+		bt_dev_err(hdev, "Unable to disable Address Resolution: %d",
+			   ret);
+	else if (scan_use_rpa(hdev))
+		/* Resolution is off and an RPA is in use: stay paused */
+		return 0;
+
+	hci_resume_advertising_sync(hdev);
+
+	return ret;
+}
+
+/* Read the local OOB data, using the extended variant of the command when
+ * @extended is set. Returns whatever __hci_cmd_sync_sk() returns.
+ */
+struct sk_buff *hci_read_local_oob_data_sync(struct hci_dev *hdev,
+					     bool extended, struct sock *sk)
+{
+	u16 opcode;
+
+	if (extended)
+		opcode = HCI_OP_READ_LOCAL_OOB_EXT_DATA;
+	else
+		opcode = HCI_OP_READ_LOCAL_OOB_DATA;
+
+	return __hci_cmd_sync_sk(hdev, opcode, 0, NULL, 0, HCI_CMD_TIMEOUT, sk);
+}
+
+/* Snapshot the hci_conn_params linked on @list (via their action member)
+ * into a newly allocated array.
+ *
+ * The list is walked twice under the RCU read lock: once to count the
+ * entries, once to copy them. *@n is set to the count before allocating and
+ * to the number of entries actually copied before returning.
+ *
+ * Returns the array (to be released with kvfree()) or NULL if the allocation
+ * failed.
+ */
+static struct conn_params *conn_params_copy(struct list_head *list, size_t *n)
+{
+	struct hci_conn_params *src;
+	struct conn_params *snap;
+	size_t count = 0, filled = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(src, list, action)
+		count++;
+	rcu_read_unlock();
+
+	*n = count;
+
+	snap = kvcalloc(count, sizeof(*snap), GFP_KERNEL);
+	if (!snap)
+		return NULL;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(src, list, action) {
+		struct conn_params *dst;
+
+		/* Racing adds are handled in next scan update */
+		if (filled >= count)
+			break;
+
+		dst = &snap[filled++];
+
+		/* No hdev->lock, but: addr, addr_type are immutable.
+		 * privacy_mode is only written by us or in
+		 * hci_cc_le_set_privacy_mode that we wait for.
+		 * We should be idempotent so MGMT updating flags
+		 * while we are processing is OK.
+		 */
+		bacpy(&dst->addr, &src->addr);
+		dst->addr_type = src->addr_type;
+		dst->flags = READ_ONCE(src->flags);
+		dst->privacy_mode = READ_ONCE(src->privacy_mode);
+	}
+
+	rcu_read_unlock();
+
+	*n = filled;
+
+	return snap;
+}
+
+/* Clear LE Accept List.
+ *
+ * Returns 0 without sending anything when bit 0x80 of hdev->commands[26] is
+ * not set (presumably the "command supported" bit - confirm against the
+ * supported commands table).
+ */
+static int hci_le_clear_accept_list_sync(struct hci_dev *hdev)
+{
+	bool supported = hdev->commands[26] & 0x80;
+
+	if (!supported)
+		return 0;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_CLEAR_ACCEPT_LIST, 0, NULL,
+				     HCI_CMD_TIMEOUT);
+}
+
+/* Device must not be scanning when updating the accept list.
+ *
+ * Update is done using the following sequence:
+ *
+ * ll_privacy_capable((Disable Advertising) -> Disable Resolving List) ->
+ * Remove Devices From Accept List ->
+ * (has IRK && ll_privacy_capable(Remove Devices From Resolving List))->
+ * Add Devices to Accept List ->
+ * (has IRK && ll_privacy_capable(Add Devices To Resolving List)) ->
+ * ll_privacy_capable(Enable Resolving List -> (Enable Advertising)) ->
+ * Enable Scanning
+ *
+ * In case of failure advertising shall be restored to its original state and
+ * return would disable accept list since either accept or resolving list could
+ * not be programmed.
+ *
+ * Returns the scan filter policy: 0x01 to use the accept list, 0x00 when a
+ * step failed or the accept list must not be used.
+ */
+static u8 hci_update_accept_list_sync(struct hci_dev *hdev)
+{
+ struct conn_params *params; /* snapshot array from conn_params_copy() */
+ struct bdaddr_list *b, *t;
+ u8 num_entries = 0; /* accept list entries kept or added so far */
+ bool pend_conn, pend_report;
+ u8 filter_policy;
+ size_t i, n;
+ int err;
+
+ /* Pause advertising if resolving list can be used as controllers
+ * cannot accept resolving list modifications while advertising.
+ */
+ if (ll_privacy_capable(hdev)) {
+ err = hci_pause_advertising_sync(hdev);
+ if (err) {
+ bt_dev_err(hdev, "pause advertising failed: %d", err);
+ return 0x00;
+ }
+ }
+
+ /* Disable address resolution while reprogramming accept list since
+ * devices that do have an IRK will be programmed in the resolving list
+ * when LL Privacy is enabled.
+ */
+ err = hci_le_set_addr_resolution_enable_sync(hdev, 0x00);
+ if (err) {
+ bt_dev_err(hdev, "Unable to disable LL privacy: %d", err);
+ goto done;
+ }
+
+ /* Force address filtering if PA Sync is in progress */
+ if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) {
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_create_pa_sync(hdev);
+ if (conn) {
+ struct conn_params pa;
+
+ memset(&pa, 0, sizeof(pa));
+
+ bacpy(&pa.addr, &conn->dst);
+ pa.addr_type = conn->dst_type;
+
+ /* Clear first since there could be addresses left
+ * behind.
+ */
+ hci_le_clear_accept_list_sync(hdev);
+
+ num_entries = 1;
+ err = hci_le_add_accept_list_sync(hdev, &pa,
+ &num_entries);
+ goto done; /* err from the add decides the filter policy */
+ }
+ }
+
+ /* Go through the current accept list programmed into the
+ * controller one by one and check if that address is connected or is
+ * still in the list of pending connections or list of devices to
+ * report. If not present in either list, then remove it from
+ * the controller.
+ */
+ list_for_each_entry_safe(b, t, &hdev->le_accept_list, list) {
+ if (hci_conn_hash_lookup_le(hdev, &b->bdaddr, b->bdaddr_type))
+ continue;
+
+ /* Pointers not dereferenced, no locks needed */
+ pend_conn = hci_pend_le_action_lookup(&hdev->pend_le_conns,
+ &b->bdaddr,
+ b->bdaddr_type);
+ pend_report = hci_pend_le_action_lookup(&hdev->pend_le_reports,
+ &b->bdaddr,
+ b->bdaddr_type);
+
+ /* If the device is not likely to connect or report,
+ * remove it from the acceptlist.
+ */
+ if (!pend_conn && !pend_report) {
+ hci_le_del_accept_list_sync(hdev, &b->bdaddr,
+ b->bdaddr_type);
+ continue;
+ }
+
+ num_entries++;
+ }
+
+ /* Since all no longer valid accept list entries have been
+ * removed, walk through the list of pending connections
+ * and ensure that any new device gets programmed into
+ * the controller.
+ *
+ * If the list of the devices is larger than the list of
+ * available accept list entries in the controller, then
+ * just abort and return filter policy value to not use the
+ * accept list.
+ *
+ * The list and params may be mutated while we wait for events,
+ * so make a copy and iterate it.
+ */
+
+ params = conn_params_copy(&hdev->pend_le_conns, &n);
+ if (!params) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ for (i = 0; i < n; ++i) {
+ err = hci_le_add_accept_list_sync(hdev, &params[i],
+ &num_entries);
+ if (err) {
+ kvfree(params);
+ goto done;
+ }
+ }
+
+ kvfree(params);
+
+ /* After adding all new pending connections, walk through
+ * the list of pending reports and also add these to the
+ * accept list if there is still space. Abort if space runs out.
+ */
+
+ params = conn_params_copy(&hdev->pend_le_reports, &n);
+ if (!params) {
+ err = -ENOMEM;
+ goto done;
+ }
+
+ for (i = 0; i < n; ++i) {
+ err = hci_le_add_accept_list_sync(hdev, &params[i],
+ &num_entries);
+ if (err) {
+ kvfree(params);
+ goto done;
+ }
+ }
+
+ kvfree(params);
+
+ /* Use the allowlist unless the following conditions are all true:
+ * - We are not currently suspending
+ * - There are 1 or more ADV monitors registered and it's not offloaded
+ * - Interleaved scanning is not currently using the allowlist
+ */
+ if (!idr_is_empty(&hdev->adv_monitors_idr) && !hdev->suspended &&
+ hci_get_adv_monitor_offload_ext(hdev) == HCI_ADV_MONITOR_EXT_NONE &&
+ hdev->interleave_scan_state != INTERLEAVE_SCAN_ALLOWLIST)
+ err = -EINVAL;
+
+done:
+ filter_policy = err ? 0x00 : 0x01; /* latched before err is reused below */
+
+ /* Enable address resolution when LL Privacy is enabled. */
+ err = hci_le_set_addr_resolution_enable_sync(hdev, 0x01);
+ if (err)
+ bt_dev_err(hdev, "Unable to enable LL privacy: %d", err);
+
+ /* Resume advertising if it was paused */
+ if (ll_privacy_capable(hdev))
+ hci_resume_advertising_sync(hdev);
+
+ /* Select filter policy to use accept list */
+ return filter_policy;
+}
+
+/* Fill one per-PHY scan parameter entry; @interval and @window are stored
+ * little endian.
+ */
+static void hci_le_scan_phy_params(struct hci_cp_le_scan_phy_params *cp,
+				   u8 type, u16 interval, u16 window)
+{
+	cp->window = cpu_to_le16(window);
+	cp->interval = cpu_to_le16(interval);
+	cp->type = type;
+}
+
+/* Send HCI_OP_LE_SET_EXT_SCAN_PARAMS with one parameter entry per PHY.
+ *
+ * While HCI_PA_SYNC is set, the PHYs are taken from the iso_qos of the PA
+ * link whose address was last sent with HCI_OP_LE_ADD_TO_ACCEPT_LIST, if
+ * such a connection exists and selects at least one PHY. Otherwise the PHYs
+ * follow scan_1m()/scan_2m()/scan_coded(). The Coded PHY entry uses three
+ * times @interval and @window.
+ *
+ * Returns -EINVAL when no PHY was selected, otherwise the command status.
+ */
+static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type,
+					  u16 interval, u16 window,
+					  u8 own_addr_type, u8 filter_policy)
+{
+	struct hci_cp_le_set_ext_scan_params *cp;
+	struct hci_cp_le_scan_phy_params *phy;
+	/* Command header followed by room for two PHY entries */
+	u8 data[sizeof(*cp) + sizeof(*phy) * 2];
+	struct hci_conn *pa_conn = NULL;
+	u8 num_phy = 0x00;
+
+	memset(data, 0, sizeof(data));
+
+	cp = (void *)data;
+	phy = (void *)cp->data;
+
+	cp->filter_policy = filter_policy;
+	cp->own_addr_type = own_addr_type;
+
+	/* Check if PA Sync is in progress then select the PHY based on the
+	 * hci_conn.iso_qos.
+	 */
+	if (hci_dev_test_flag(hdev, HCI_PA_SYNC)) {
+		struct hci_cp_le_add_to_accept_list *sent;
+
+		sent = hci_sent_cmd_data(hdev, HCI_OP_LE_ADD_TO_ACCEPT_LIST);
+		if (sent)
+			pa_conn = hci_conn_hash_lookup_ba(hdev, PA_LINK,
+							  &sent->bdaddr);
+	}
+
+	if (pa_conn) {
+		struct bt_iso_qos *qos = &pa_conn->iso_qos;
+
+		if (qos->bcast.in.phy & (BT_ISO_PHY_1M | BT_ISO_PHY_2M)) {
+			cp->scanning_phys |= LE_SCAN_PHY_1M;
+			hci_le_scan_phy_params(&phy[num_phy++], type,
+					       interval, window);
+		}
+
+		if (qos->bcast.in.phy & BT_ISO_PHY_CODED) {
+			cp->scanning_phys |= LE_SCAN_PHY_CODED;
+			hci_le_scan_phy_params(&phy[num_phy++], type,
+					       interval * 3, window * 3);
+		}
+
+		/* Only fall back to the defaults if nothing was selected */
+		if (num_phy)
+			goto done;
+	}
+
+	if (scan_1m(hdev) || scan_2m(hdev)) {
+		cp->scanning_phys |= LE_SCAN_PHY_1M;
+		hci_le_scan_phy_params(&phy[num_phy++], type, interval,
+				       window);
+	}
+
+	if (scan_coded(hdev)) {
+		cp->scanning_phys |= LE_SCAN_PHY_CODED;
+		hci_le_scan_phy_params(&phy[num_phy++], type, interval * 3,
+				       window * 3);
+	}
+
+done:
+	if (!num_phy)
+		return -EINVAL;
+
+	/* Only the entries that were filled in are sent */
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_PARAMS,
+				     sizeof(*cp) + sizeof(*phy) * num_phy,
+				     data, HCI_CMD_TIMEOUT);
+}
+
+/* Program the LE scan parameters, using the extended command when
+ * use_ext_scan() says so. Returns the command status.
+ */
+static int hci_le_set_scan_param_sync(struct hci_dev *hdev, u8 type,
+				      u16 interval, u16 window,
+				      u8 own_addr_type, u8 filter_policy)
+{
+	struct hci_cp_le_set_scan_param cmd;
+
+	if (use_ext_scan(hdev))
+		return hci_le_set_ext_scan_param_sync(hdev, type, interval,
+						      window, own_addr_type,
+						      filter_policy);
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.filter_policy = filter_policy;
+	cmd.own_address_type = own_addr_type;
+	cmd.window = cpu_to_le16(window);
+	cmd.interval = cpu_to_le16(interval);
+	cmd.type = type;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_PARAM,
+				     sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+}
+
+/* Program the scan parameters and then enable scanning.
+ *
+ * Returns 0 without doing anything while scanning is paused, the error of
+ * the parameter command if it fails, otherwise the status of the enable
+ * command.
+ */
+static int hci_start_scan_sync(struct hci_dev *hdev, u8 type, u16 interval,
+			       u16 window, u8 own_addr_type, u8 filter_policy,
+			       u8 filter_dup)
+{
+	int ret;
+
+	if (hdev->scanning_paused) {
+		bt_dev_dbg(hdev, "Scanning is paused for suspend");
+		return 0;
+	}
+
+	ret = hci_le_set_scan_param_sync(hdev, type, interval, window,
+					 own_addr_type, filter_policy);
+	if (!ret)
+		ret = hci_le_set_scan_enable_sync(hdev, LE_SCAN_ENABLE,
+						  filter_dup);
+
+	return ret;
+}
+
+static int hci_passive_scan_sync(struct hci_dev *hdev)
+{
+ u8 own_addr_type;
+ u8 filter_policy;
+ u16 window, interval;
+ u8 filter_dups = LE_SCAN_FILTER_DUP_ENABLE; /* overridden for adv monitor/Mesh */
+ int err;
+
+ if (hdev->scanning_paused) {
+ bt_dev_dbg(hdev, "Scanning is paused for suspend");
+ return 0;
+ }
+
+ err = hci_scan_disable_sync(hdev);
+ if (err) {
+ bt_dev_err(hdev, "disable scanning failed: %d", err);
+ return err;
+ }
+
+ /* Set require_privacy to false since no SCAN_REQ are sent
+ * during passive scanning. Not using a non-resolvable address
+ * here is important so that peer devices using direct
+ * advertising with our address will be correctly reported
+ * by the controller.
+ */
+ if (hci_update_random_address_sync(hdev, false, scan_use_rpa(hdev),
+ &own_addr_type))
+ return 0; /* failure is not propagated to the caller */
+
+ if (hdev->enable_advmon_interleave_scan &&
+ hci_update_interleaved_scan_sync(hdev))
+ return 0; /* non-zero result: scan is not started here */
+
+ bt_dev_dbg(hdev, "interleave state %d", hdev->interleave_scan_state);
+
+ /* Adding or removing entries from the accept list must
+ * happen before enabling scanning. The controller does
+ * not allow accept list modification while scanning.
+ */
+ filter_policy = hci_update_accept_list_sync(hdev);
+
+ /* If suspended and filter_policy set to 0x00 (no acceptlist) then
+ * passive scanning cannot be started since that would require the host
+ * to be woken up to process the reports.
+ */
+ if (hdev->suspended && !filter_policy) {
+ /* Check if accept list is empty then there is no need to scan
+ * while suspended.
+ */
+ if (list_empty(&hdev->le_accept_list))
+ return 0;
+
+ /* If there are devices in the accept_list that means some
+ * devices could not be programmed which in non-suspended case
+ * means filter_policy needs to be set to 0x00 so the host needs
+ * to filter, but since this is treating suspended case we
+ * can ignore device needing host to filter to allow devices in
+ * the acceptlist to be able to wakeup the system.
+ */
+ filter_policy = 0x01;
+ }
+
+ /* When the controller is using random resolvable addresses and
+ * with that having LE privacy enabled, then controllers with
+ * Extended Scanner Filter Policies support can now enable support
+ * for handling directed advertising.
+ *
+ * So instead of using filter policies 0x00 (no acceptlist)
+ * and 0x01 (acceptlist enabled) use the new filter policies
+ * 0x02 (no acceptlist) and 0x03 (acceptlist enabled).
+ */
+ if (hci_dev_test_flag(hdev, HCI_PRIVACY) &&
+ (hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY))
+ filter_policy |= 0x02; /* 0x00 -> 0x02, 0x01 -> 0x03 */
+
+ if (hdev->suspended) { /* first matching case picks window/interval */
+ window = hdev->le_scan_window_suspend;
+ interval = hdev->le_scan_int_suspend;
+ } else if (hci_is_le_conn_scanning(hdev)) {
+ window = hdev->le_scan_window_connect;
+ interval = hdev->le_scan_int_connect;
+ } else if (hci_is_adv_monitoring(hdev)) {
+ window = hdev->le_scan_window_adv_monitor;
+ interval = hdev->le_scan_int_adv_monitor;
+
+ /* Disable duplicates filter when scanning for advertisement
+ * monitor for the following reasons.
+ *
+ * For HW pattern filtering (ex. MSFT), Realtek and Qualcomm
+ * controllers ignore RSSI_Sampling_Period when the duplicates
+ * filter is enabled.
+ *
+ * For SW pattern filtering, when we're not doing interleaved
+ * scanning, it is necessary to disable duplicates filter,
+ * otherwise hosts can only receive one advertisement and it's
+ * impossible to know if a peer is still in range.
+ */
+ filter_dups = LE_SCAN_FILTER_DUP_DISABLE;
+ } else {
+ window = hdev->le_scan_window;
+ interval = hdev->le_scan_interval;
+ }
+
+ /* Disable all filtering for Mesh */
+ if (hci_dev_test_flag(hdev, HCI_MESH)) {
+ filter_policy = 0; /* also drops the 0x02 bit set above */
+ filter_dups = LE_SCAN_FILTER_DUP_DISABLE;
+ }
+
+ bt_dev_dbg(hdev, "LE passive scan with acceptlist = %d", filter_policy);
+
+ return hci_start_scan_sync(hdev, LE_SCAN_PASSIVE, interval, window,
+ own_addr_type, filter_policy, filter_dups);
+}
+
+/* This function controls the passive scanning based on hdev->pend_le_conns
+ * list. If there are pending LE connections we start the background scanning,
+ * otherwise we stop it in the following sequence:
+ *
+ * If there are devices to scan:
+ *
+ * Disable Scanning -> Update Accept List ->
+ * ll_privacy_capable((Disable Advertising) -> Disable Resolving List ->
+ * Update Resolving List -> Enable Resolving List -> (Enable Advertising)) ->
+ * Enable Scanning
+ *
+ * Otherwise:
+ *
+ * Disable Scanning
+ */
+int hci_update_passive_scan_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ if (!test_bit(HCI_UP, &hdev->flags) ||
+ test_bit(HCI_INIT, &hdev->flags) ||
+ hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG) ||
+ hci_dev_test_flag(hdev, HCI_AUTO_OFF) ||
+ hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ return 0; /* device state excludes scan updates */
+
+ /* No point in doing scanning if LE support hasn't been enabled */
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return 0;
+
+ /* If discovery is active don't interfere with it */
+ if (hdev->discovery.state != DISCOVERY_STOPPED)
+ return 0;
+
+ /* Reset RSSI and UUID filters when starting background scanning
+ * since these filters are meant for service discovery only.
+ *
+ * The Start Discovery and Start Service Discovery operations
+ * ensure to set proper values for RSSI threshold and UUID
+ * filter list. So it is safe to just reset them here.
+ */
+ hci_discovery_filter_clear(hdev);
+
+ bt_dev_dbg(hdev, "ADV monitoring is %s",
+ hci_is_adv_monitoring(hdev) ? "on" : "off");
+
+ if (!hci_dev_test_flag(hdev, HCI_MESH) &&
+ list_empty(&hdev->pend_le_conns) &&
+ list_empty(&hdev->pend_le_reports) &&
+ !hci_is_adv_monitoring(hdev) &&
+ !hci_dev_test_flag(hdev, HCI_PA_SYNC)) {
+ /* If there are no pending LE connections or devices
+ * to be scanned for or no ADV monitors, we should stop the
+ * background scanning.
+ */
+
+ bt_dev_dbg(hdev, "stopping background scanning");
+
+ err = hci_scan_disable_sync(hdev);
+ if (err)
+ bt_dev_err(hdev, "stop background scanning failed: %d",
+ err);
+ } else {
+ /* If there is at least one pending LE connection, we should
+ * keep the background scan running.
+ */
+
+ /* If controller is connecting, we should not start scanning
+ * since some controllers are not able to scan and connect at
+ * the same time.
+ */
+ if (hci_lookup_le_connect(hdev))
+ return 0; /* returns before err is assigned */
+
+ bt_dev_dbg(hdev, "start background scanning");
+
+ err = hci_passive_scan_sync(hdev);
+ if (err)
+ bt_dev_err(hdev, "start background scanning failed: %d",
+ err);
+ }
+
+ return err; /* result of the stop or start attempt above */
+}
+
+static int update_scan_sync(struct hci_dev *hdev, void *data)
+{
+ return hci_update_scan_sync(hdev); /* @data is not used */
+}
+
+int hci_update_scan(struct hci_dev *hdev)
+{
+ return hci_cmd_sync_queue(hdev, update_scan_sync, NULL, NULL); /* no data passed */
+}
+
+static int update_passive_scan_sync(struct hci_dev *hdev, void *data)
+{
+ return hci_update_passive_scan_sync(hdev); /* @data is not used */
+}
+
+int hci_update_passive_scan(struct hci_dev *hdev)
+{
+ /* Only queue if it would have any effect */
+ if (!test_bit(HCI_UP, &hdev->flags) ||
+ test_bit(HCI_INIT, &hdev->flags) ||
+ hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG) ||
+ hci_dev_test_flag(hdev, HCI_AUTO_OFF) ||
+ hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ return 0; /* same checks as hci_update_passive_scan_sync() */
+
+ return hci_cmd_sync_queue_once(hdev, update_passive_scan_sync, NULL,
+ NULL);
+}
+
+int hci_write_sc_support_sync(struct hci_dev *hdev, u8 val)
+{
+ int err;
+
+ if (!bredr_sc_enabled(hdev) || lmp_host_sc_capable(hdev))
+ return 0; /* no command is sent in this case */
+
+ err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SC_SUPPORT,
+ sizeof(val), &val, HCI_CMD_TIMEOUT);
+
+ if (!err) { /* on success, mirror @val in the host-side state */
+ if (val) {
+ hdev->features[1][0] |= LMP_HOST_SC;
+ hci_dev_set_flag(hdev, HCI_SC_ENABLED);
+ } else {
+ hdev->features[1][0] &= ~LMP_HOST_SC;
+ hci_dev_clear_flag(hdev, HCI_SC_ENABLED);
+ }
+ }
+
+ return err; /* status of HCI_OP_WRITE_SC_SUPPORT */
+}
+
+int hci_write_ssp_mode_sync(struct hci_dev *hdev, u8 mode)
+{
+ int err;
+
+ if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED) ||
+ lmp_host_ssp_capable(hdev))
+ return 0; /* no command is sent in this case */
+
+ if (!mode && hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) {
+ __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE,
+ sizeof(mode), &mode, HCI_CMD_TIMEOUT); /* status ignored */
+ }
+
+ err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SSP_MODE,
+ sizeof(mode), &mode, HCI_CMD_TIMEOUT);
+ if (err)
+ return err;
+
+ return hci_write_sc_support_sync(hdev, 0x01); /* follow up with SC support */
+}
+
+int hci_write_le_host_supported_sync(struct hci_dev *hdev, u8 le, u8 simul)
+{
+ struct hci_cp_write_le_host_supported cp;
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) ||
+ !lmp_bredr_capable(hdev))
+ return 0; /* LE disabled or not BR/EDR capable: nothing sent */
+
+ /* Check first if we already have the right host state
+ * (host features set)
+ */
+ if (le == lmp_host_le_capable(hdev) &&
+ simul == lmp_host_le_br_capable(hdev))
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+
+ cp.le = le;
+ cp.simul = simul;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LE_HOST_SUPPORTED,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_powered_update_adv_sync(struct hci_dev *hdev)
+{
+ struct adv_info *adv, *tmp;
+ int err; /* only gates the scan response update; not returned */
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return 0;
+
+ /* If RPA Resolution has not been enabled yet it means the
+ * resolving list is empty and we should attempt to program the
+ * local IRK in order to support using own_addr_type
+ * ADDR_LE_DEV_RANDOM_RESOLVED (0x03).
+ */
+ if (!hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) {
+ hci_le_add_resolve_list_sync(hdev, NULL);
+ hci_le_set_addr_resolution_enable_sync(hdev, 0x01);
+ }
+
+ /* Make sure the controller has a good default for
+ * advertising data. This also applies to the case
+ * where BR/EDR was toggled during the AUTO_OFF phase.
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+ list_empty(&hdev->adv_instances)) {
+ if (ext_adv_capable(hdev)) {
+ err = hci_setup_ext_adv_instance_sync(hdev, 0x00);
+ if (!err)
+ hci_update_scan_rsp_data_sync(hdev, 0x00);
+ } else {
+ err = hci_update_adv_data_sync(hdev, 0x00);
+ if (!err)
+ hci_update_scan_rsp_data_sync(hdev, 0x00);
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) /* enable only if requested */
+ hci_enable_advertising_sync(hdev);
+ }
+
+ /* Call for each tracked instance to be scheduled */
+ list_for_each_entry_safe(adv, tmp, &hdev->adv_instances, list)
+ hci_schedule_adv_instance_sync(hdev, adv->instance, true);
+
+ return 0; /* failures above are deliberately not propagated */
+}
+
+static int hci_write_auth_enable_sync(struct hci_dev *hdev)
+{
+ u8 link_sec;
+
+ link_sec = hci_dev_test_flag(hdev, HCI_LINK_SECURITY);
+ if (link_sec == test_bit(HCI_AUTH, &hdev->flags))
+ return 0; /* HCI_AUTH already matches HCI_LINK_SECURITY */
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_AUTH_ENABLE,
+ sizeof(link_sec), &link_sec,
+ HCI_CMD_TIMEOUT);
+}
+
+int hci_write_fast_connectable_sync(struct hci_dev *hdev, bool enable)
+{
+ struct hci_cp_write_page_scan_activity cp;
+ u8 type;
+ int err = 0; /* stays 0 when no command needs to be sent */
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return 0;
+
+ if (hdev->hci_ver < BLUETOOTH_VER_1_2)
+ return 0; /* skipped for controllers older than 1.2 */
+
+ memset(&cp, 0, sizeof(cp));
+
+ if (enable) {
+ type = PAGE_SCAN_TYPE_INTERLACED;
+
+ /* 160 msec page scan interval */
+ cp.interval = cpu_to_le16(0x0100);
+ } else {
+ type = hdev->def_page_scan_type; /* back to the stored defaults */
+ cp.interval = cpu_to_le16(hdev->def_page_scan_int);
+ }
+
+ cp.window = cpu_to_le16(hdev->def_page_scan_window);
+
+ if (__cpu_to_le16(hdev->page_scan_interval) != cp.interval ||
+ __cpu_to_le16(hdev->page_scan_window) != cp.window) { /* write only on change */
+ err = __hci_cmd_sync_status(hdev,
+ HCI_OP_WRITE_PAGE_SCAN_ACTIVITY,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+ if (err)
+ return err;
+ }
+
+ if (hdev->page_scan_type != type) /* write only on change */
+ err = __hci_cmd_sync_status(hdev,
+ HCI_OP_WRITE_PAGE_SCAN_TYPE,
+ sizeof(type), &type,
+ HCI_CMD_TIMEOUT);
+
+ return err;
+}
+
+static bool disconnected_accept_list_entries(struct hci_dev *hdev)
+{
+ struct bdaddr_list *b;
+
+ list_for_each_entry(b, &hdev->accept_list, list) { /* walk hdev->accept_list */
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &b->bdaddr);
+ if (!conn)
+ return true; /* entry has no ACL connection at all */
+
+ if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG)
+ return true; /* connection is in neither of these states */
+ }
+
+ return false; /* every entry is BT_CONNECTED or BT_CONFIG (or list empty) */
+}
+
+static int hci_write_scan_enable_sync(struct hci_dev *hdev, u8 val)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SCAN_ENABLE,
+ sizeof(val), &val, /* @val is the whole parameter */
+ HCI_CMD_TIMEOUT);
+}
+
+int hci_update_scan_sync(struct hci_dev *hdev)
+{
+ u8 scan; /* SCAN_* bits to be written */
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return 0;
+
+ if (!hdev_is_powered(hdev))
+ return 0;
+
+ if (mgmt_powering_down(hdev))
+ return 0;
+
+ if (hdev->scanning_paused)
+ return 0;
+
+ if (hci_dev_test_flag(hdev, HCI_CONNECTABLE) ||
+ disconnected_accept_list_entries(hdev))
+ scan = SCAN_PAGE;
+ else
+ scan = SCAN_DISABLED;
+
+ if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
+ scan |= SCAN_INQUIRY;
+
+ if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE) &&
+ test_bit(HCI_ISCAN, &hdev->flags) == !!(scan & SCAN_INQUIRY))
+ return 0; /* HCI_PSCAN/HCI_ISCAN already match: skip the write */
+
+ return hci_write_scan_enable_sync(hdev, scan);
+}
+
+int hci_update_name_sync(struct hci_dev *hdev, const u8 *name)
+{
+ struct hci_cp_write_local_name cp;
+
+ memset(&cp, 0, sizeof(cp));
+
+ memcpy(cp.name, name, sizeof(cp.name)); /* reads sizeof(cp.name) bytes of @name */
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LOCAL_NAME,
+ sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+}
+
+/* This function performs the powered update HCI command sequence after the HCI
+ * init sequence, which ends up resetting all states. The sequence is as follows:
+ *
+ * HCI_SSP_ENABLED(Enable SSP)
+ * HCI_LE_ENABLED(Enable LE)
+ * HCI_LE_ENABLED(ll_privacy_capable(Add local IRK to Resolving List) ->
+ * Update adv data)
+ * Enable Authentication
+ * lmp_bredr_capable(Set Fast Connectable -> Set Scan Type -> Set Class ->
+ * Set Name -> Set EIR)
+ * HCI_FORCE_STATIC_ADDR | BDADDR_ANY && !HCI_BREDR_ENABLED (Set Static Address)
+ */
+int hci_powered_update_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ /* Register the available SMP channels (BR/EDR and LE) only when
+ * successfully powering on the controller. This late
+ * registration is required so that LE SMP can clearly decide if
+ * the public address or static address is used.
+ */
+ smp_register(hdev);
+
+ err = hci_write_ssp_mode_sync(hdev, 0x01);
+ if (err)
+ return err;
+
+ err = hci_write_le_host_supported_sync(hdev, 0x01, 0x00);
+ if (err)
+ return err;
+
+ err = hci_powered_update_adv_sync(hdev);
+ if (err)
+ return err;
+
+ err = hci_write_auth_enable_sync(hdev);
+ if (err)
+ return err;
+
+ if (lmp_bredr_capable(hdev)) { /* return values in this block are not checked */
+ if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE))
+ hci_write_fast_connectable_sync(hdev, true);
+ else
+ hci_write_fast_connectable_sync(hdev, false);
+ hci_update_scan_sync(hdev);
+ hci_update_class_sync(hdev);
+ hci_update_name_sync(hdev, hdev->dev_name);
+ hci_update_eir_sync(hdev);
+ }
+
+ /* If forcing static address is in use or there is no public
+ * address use the static address as random address (but skip
+ * the HCI command if the current random address is already the
+ * static one).
+ *
+ * In case BR/EDR has been disabled on a dual-mode controller
+ * and a static address has been configured, then use that
+ * address instead of the public BR/EDR address.
+ */
+ if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) ||
+ (!bacmp(&hdev->bdaddr, BDADDR_ANY) &&
+ !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))) {
+ if (bacmp(&hdev->static_addr, BDADDR_ANY)) /* static address is set */
+ return hci_set_random_addr_sync(hdev,
+ &hdev->static_addr);
+ }
+
+ return 0;
+}
+
+/**
+ * hci_dev_get_bd_addr_from_property - Get the Bluetooth Device Address
+ * (BD_ADDR) for a HCI device from
+ * a firmware node property.
+ * @hdev: The HCI device
+ *
+ * Search the firmware node for 'local-bd-address'.
+ *
+ * All-zero BD addresses are rejected, because those could be properties
+ * that exist in the firmware tables, but were not updated by the firmware. For
+ * example, the DTS could define 'local-bd-address', with zero BD addresses.
+ */
+static void hci_dev_get_bd_addr_from_property(struct hci_dev *hdev)
+{
+ struct fwnode_handle *fwnode = dev_fwnode(hdev->dev.parent);
+ bdaddr_t ba;
+ int ret;
+
+ ret = fwnode_property_read_u8_array(fwnode, "local-bd-address",
+ (u8 *)&ba, sizeof(ba));
+ if (ret < 0 || !bacmp(&ba, BDADDR_ANY))
+ return; /* read failed or all-zero: public_addr left untouched */
+
+ if (hci_test_quirk(hdev, HCI_QUIRK_BDADDR_PROPERTY_BROKEN))
+ baswap(&hdev->public_addr, &ba); /* quirk: baswap() instead of bacpy() */
+ else
+ bacpy(&hdev->public_addr, &ba);
+}
+
+struct hci_init_stage {
+ int (*func)(struct hci_dev *hdev); /* stage step; NULL ends a table */
+};
+
+/* Run a NULL-terminated init stage function table, stopping at the first error */
+static int hci_init_stage_sync(struct hci_dev *hdev,
+ const struct hci_init_stage *stage)
+{
+ size_t i;
+
+ for (i = 0; stage[i].func; i++) { /* ends at the entry whose func is NULL */
+ int err;
+
+ err = stage[i].func(hdev);
+ if (err)
+ return err; /* remaining entries are not run */
+ }
+
+ return 0;
+}
+
+/* Read Local Version: sends HCI_OP_READ_LOCAL_VERSION without parameters */
+static int hci_read_local_version_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_VERSION,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read BD Address: sends HCI_OP_READ_BD_ADDR without parameters */
+static int hci_read_bd_addr_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_BD_ADDR,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+#define HCI_INIT(_func) \
+{ \
+ .func = _func, /* the only member of struct hci_init_stage */ \
+}
+
+static const struct hci_init_stage hci_init0[] = {
+ /* HCI_OP_READ_LOCAL_VERSION */
+ HCI_INIT(hci_read_local_version_sync),
+ /* HCI_OP_READ_BD_ADDR */
+ HCI_INIT(hci_read_bd_addr_sync),
+ {} /* terminator: func is NULL */
+};
+
+int hci_reset_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ set_bit(HCI_RESET, &hdev->flags); /* set before the command is sent */
+
+ err = __hci_cmd_sync_status(hdev, HCI_OP_RESET, 0, NULL,
+ HCI_CMD_TIMEOUT);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static int hci_init0_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ bt_dev_dbg(hdev, "");
+
+ /* Reset, unless the HCI_QUIRK_RESET_ON_CLOSE quirk is set */
+ if (!hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE)) {
+ err = hci_reset_sync(hdev);
+ if (err)
+ return err;
+ }
+
+ return hci_init_stage_sync(hdev, hci_init0); /* run the hci_init0 table */
+}
+
+static int hci_unconf_init_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE))
+ return 0; /* raw devices skip this init entirely */
+
+ err = hci_init0_sync(hdev);
+ if (err < 0)
+ return err; /* only negative results are treated as failure */
+
+ if (hci_dev_test_flag(hdev, HCI_SETUP))
+ hci_debugfs_create_basic(hdev);
+
+ return 0;
+}
+
+/* Read Local Supported Features: HCI_OP_READ_LOCAL_FEATURES, no parameters */
+static int hci_read_local_features_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_FEATURES,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* BR Controller init stage 1 command sequence (run by hci_init1_sync()) */
+static const struct hci_init_stage br_init1[] = {
+ /* HCI_OP_READ_LOCAL_FEATURES */
+ HCI_INIT(hci_read_local_features_sync),
+ /* HCI_OP_READ_LOCAL_VERSION */
+ HCI_INIT(hci_read_local_version_sync),
+ /* HCI_OP_READ_BD_ADDR */
+ HCI_INIT(hci_read_bd_addr_sync),
+ {} /* terminator: func is NULL */
+};
+
+/* Read Local Commands (skipped for pre-1.2 or quirked controllers) */
+static int hci_read_local_cmds_sync(struct hci_dev *hdev)
+{
+ /* All Bluetooth 1.2 and later controllers should support the
+ * HCI command for reading the local supported commands.
+ *
+ * Unfortunately some controllers indicate Bluetooth 1.2 support,
+ * but do not have support for this command. If that is the case,
+ * the driver can quirk the behavior and skip reading the local
+ * supported commands.
+ */
+ if (hdev->hci_ver > BLUETOOTH_VER_1_1 &&
+ !hci_test_quirk(hdev, HCI_QUIRK_BROKEN_LOCAL_COMMANDS))
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_COMMANDS,
+ 0, NULL, HCI_CMD_TIMEOUT);
+
+ return 0; /* command skipped, reported as success */
+}
+
+static int hci_init1_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ bt_dev_dbg(hdev, "");
+
+ /* Reset, unless the HCI_QUIRK_RESET_ON_CLOSE quirk is set */
+ if (!hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE)) {
+ err = hci_reset_sync(hdev);
+ if (err)
+ return err;
+ }
+
+ return hci_init_stage_sync(hdev, br_init1); /* run the br_init1 table */
+}
+
+/* Read Buffer Size (ACL mtu, max pkt, etc.): HCI_OP_READ_BUFFER_SIZE */
+static int hci_read_buffer_size_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_BUFFER_SIZE,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read Class of Device: sends HCI_OP_READ_CLASS_OF_DEV without parameters */
+static int hci_read_dev_class_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_CLASS_OF_DEV,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read Local Name: sends HCI_OP_READ_LOCAL_NAME without parameters */
+static int hci_read_local_name_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_NAME,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read Voice Setting, only if read_voice_setting_capable() says so */
+static int hci_read_voice_setting_sync(struct hci_dev *hdev)
+{
+ if (!read_voice_setting_capable(hdev))
+ return 0; /* command skipped, reported as success */
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_VOICE_SETTING,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read Number of Supported IAC: HCI_OP_READ_NUM_SUPPORTED_IAC, no parameters */
+static int hci_read_num_supported_iac_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_NUM_SUPPORTED_IAC,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read Current IAC LAP: HCI_OP_READ_CURRENT_IAC_LAP, no parameters */
+static int hci_read_current_iac_lap_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_CURRENT_IAC_LAP,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+static int hci_set_event_filter_sync(struct hci_dev *hdev, u8 flt_type,
+ u8 cond_type, bdaddr_t *bdaddr,
+ u8 auto_accept)
+{
+ struct hci_cp_set_event_filter cp;
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return 0;
+
+ if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL))
+ return 0; /* quirk blocks every filter type, not only CLEAR_ALL */
+
+ memset(&cp, 0, sizeof(cp));
+ cp.flt_type = flt_type;
+
+ if (flt_type != HCI_FLT_CLEAR_ALL) { /* other fields unused for CLEAR_ALL */
+ cp.cond_type = cond_type;
+ bacpy(&cp.addr_conn_flt.bdaddr, bdaddr);
+ cp.addr_conn_flt.auto_accept = auto_accept;
+ }
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_SET_EVENT_FLT,
+ flt_type == HCI_FLT_CLEAR_ALL ?
+ sizeof(cp.flt_type) : sizeof(cp), &cp, /* CLEAR_ALL sends flt_type only */
+ HCI_CMD_TIMEOUT);
+}
+
+static int hci_clear_event_filter_sync(struct hci_dev *hdev)
+{
+ if (!hci_dev_test_flag(hdev, HCI_EVENT_FILTER_CONFIGURED))
+ return 0; /* flag not set: no clear command is sent */
+
+ /* In theory the state machine should not reach here unless
+ * a hci_set_event_filter_sync() call succeeds, but we do
+ * the check both for parity and as a future reminder.
+ */
+ if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL))
+ return 0;
+
+ return hci_set_event_filter_sync(hdev, HCI_FLT_CLEAR_ALL, 0x00,
+ BDADDR_ANY, 0x00);
+}
+
+/* Connection accept timeout ~20 secs (parameter value 0x7d00) */
+static int hci_write_ca_timeout_sync(struct hci_dev *hdev)
+{
+ __le16 param = cpu_to_le16(0x7d00); /* converted to little-endian */
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_CA_TIMEOUT,
+ sizeof(param), &param, HCI_CMD_TIMEOUT);
+}
+
+/* Enable SCO flow control if supported; sets HCI_SCO_FLOWCTL on success */
+static int hci_write_sync_flowctl_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_write_sync_flowctl cp;
+ int err;
+
+ /* Check if the controller supports SCO and HCI_OP_WRITE_SYNC_FLOWCTL */
+ if (!lmp_sco_capable(hdev) || !(hdev->commands[10] & BIT(4)) ||
+ !hci_test_quirk(hdev, HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED))
+ return 0; /* all three conditions are required */
+
+ memset(&cp, 0, sizeof(cp));
+ cp.enable = 0x01;
+
+ err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SYNC_FLOWCTL,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+ if (!err)
+ hci_dev_set_flag(hdev, HCI_SCO_FLOWCTL);
+
+ return err;
+}
+
+/* BR Controller init stage 2 command sequence (run by hci_init2_sync()) */
+static const struct hci_init_stage br_init2[] = {
+ /* HCI_OP_READ_BUFFER_SIZE */
+ HCI_INIT(hci_read_buffer_size_sync),
+ /* HCI_OP_READ_CLASS_OF_DEV */
+ HCI_INIT(hci_read_dev_class_sync),
+ /* HCI_OP_READ_LOCAL_NAME */
+ HCI_INIT(hci_read_local_name_sync),
+ /* HCI_OP_READ_VOICE_SETTING */
+ HCI_INIT(hci_read_voice_setting_sync),
+ /* HCI_OP_READ_NUM_SUPPORTED_IAC */
+ HCI_INIT(hci_read_num_supported_iac_sync),
+ /* HCI_OP_READ_CURRENT_IAC_LAP */
+ HCI_INIT(hci_read_current_iac_lap_sync),
+ /* HCI_OP_SET_EVENT_FLT */
+ HCI_INIT(hci_clear_event_filter_sync),
+ /* HCI_OP_WRITE_CA_TIMEOUT */
+ HCI_INIT(hci_write_ca_timeout_sync),
+ /* HCI_OP_WRITE_SYNC_FLOWCTL */
+ HCI_INIT(hci_write_sync_flowctl_sync),
+ {} /* terminator: func is NULL */
+};
+
+static int hci_write_ssp_mode_1_sync(struct hci_dev *hdev)
+{
+ u8 mode = 0x01; /* value written with HCI_OP_WRITE_SSP_MODE */
+
+ if (!lmp_ssp_capable(hdev) || !hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
+ return 0;
+
+ /* When SSP is available, then the host features page
+ * should also be available as well. However some
+ * controllers list the max_page as 0 as long as SSP
+ * has not been enabled. To achieve proper debugging
+ * output, force the minimum max_page to 1 at least.
+ */
+ hdev->max_page = 0x01;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SSP_MODE,
+ sizeof(mode), &mode, HCI_CMD_TIMEOUT);
+}
+
+static int hci_write_eir_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_write_eir cp;
+
+ if (!lmp_ssp_capable(hdev) || hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
+ return 0; /* runs only when SSP-capable but SSP not enabled */
+
+ memset(hdev->eir, 0, sizeof(hdev->eir)); /* clear the copy kept in hdev */
+ memset(&cp, 0, sizeof(cp)); /* an all-zero command is sent */
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_EIR, sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+}
+
+static int hci_write_inquiry_mode_sync(struct hci_dev *hdev)
+{
+ u8 mode;
+
+ if (!lmp_inq_rssi_capable(hdev) &&
+ !hci_test_quirk(hdev, HCI_QUIRK_FIXUP_INQUIRY_MODE))
+ return 0; /* neither the capability nor the quirk: nothing sent */
+
+ /* If Extended Inquiry Result events are supported, then
+ * they are clearly preferred over Inquiry Result with RSSI
+ * events.
+ */
+ mode = lmp_ext_inq_capable(hdev) ? 0x02 : 0x01;
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_INQUIRY_MODE,
+ sizeof(mode), &mode, HCI_CMD_TIMEOUT);
+}
+
+static int hci_read_inq_rsp_tx_power_sync(struct hci_dev *hdev)
+{
+ if (!lmp_inq_tx_pwr_capable(hdev))
+ return 0; /* command skipped, reported as success */
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_INQ_RSP_TX_POWER,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+static int hci_read_local_ext_features_sync(struct hci_dev *hdev, u8 page)
+{
+ struct hci_cp_read_local_ext_features cp;
+
+ if (!lmp_ext_feat_capable(hdev))
+ return 0; /* command skipped, reported as success */
+
+ memset(&cp, 0, sizeof(cp));
+ cp.page = page; /* features page to request */
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_EXT_FEATURES,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_read_local_ext_features_1_sync(struct hci_dev *hdev)
+{
+ return hci_read_local_ext_features_sync(hdev, 0x01); /* page 1 only */
+}
+
+/* HCI Controller init stage 2 command sequence (run first by hci_init2_sync()) */
+static const struct hci_init_stage hci_init2[] = {
+ /* HCI_OP_READ_LOCAL_COMMANDS */
+ HCI_INIT(hci_read_local_cmds_sync),
+ /* HCI_OP_WRITE_SSP_MODE */
+ HCI_INIT(hci_write_ssp_mode_1_sync),
+ /* HCI_OP_WRITE_EIR */
+ HCI_INIT(hci_write_eir_sync),
+ /* HCI_OP_WRITE_INQUIRY_MODE */
+ HCI_INIT(hci_write_inquiry_mode_sync),
+ /* HCI_OP_READ_INQ_RSP_TX_POWER */
+ HCI_INIT(hci_read_inq_rsp_tx_power_sync),
+ /* HCI_OP_READ_LOCAL_EXT_FEATURES */
+ HCI_INIT(hci_read_local_ext_features_1_sync),
+ /* HCI_OP_WRITE_AUTH_ENABLE */
+ HCI_INIT(hci_write_auth_enable_sync),
+ {} /* terminator: func is NULL */
+};
+
+/* Read LE Buffer Size (V2 command when available, otherwise the original) */
+static int hci_le_read_buffer_size_sync(struct hci_dev *hdev)
+{
+ /* Use Read LE Buffer Size V2 if supported */
+ if (iso_capable(hdev) && hdev->commands[41] & 0x20)
+ return __hci_cmd_sync_status(hdev,
+ HCI_OP_LE_READ_BUFFER_SIZE_V2,
+ 0, NULL, HCI_CMD_TIMEOUT);
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_BUFFER_SIZE,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read LE Local Supported Features, then all local features if supported */
+static int hci_le_read_local_features_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ err = __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_LOCAL_FEATURES,
+ 0, NULL, HCI_CMD_TIMEOUT);
+ if (err)
+ return err;
+
+ if (ll_ext_feature_capable(hdev) && hdev->commands[47] & BIT(2))
+ return __hci_cmd_sync_status(hdev,
+ HCI_OP_LE_READ_ALL_LOCAL_FEATURES,
+ 0, NULL, HCI_CMD_TIMEOUT);
+
+ return err; /* 0 here: the first read succeeded */
+}
+
+/* Read LE Supported States: HCI_OP_LE_READ_SUPPORTED_STATES, no parameters */
+static int hci_le_read_supported_states_sync(struct hci_dev *hdev)
+{
+ return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_SUPPORTED_STATES,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* LE Controller init stage 2 command sequence (run by hci_init2_sync()) */
+static const struct hci_init_stage le_init2[] = {
+ /* HCI_OP_LE_READ_LOCAL_FEATURES */
+ HCI_INIT(hci_le_read_local_features_sync),
+ /* HCI_OP_LE_READ_BUFFER_SIZE */
+ HCI_INIT(hci_le_read_buffer_size_sync),
+ /* HCI_OP_LE_READ_SUPPORTED_STATES */
+ HCI_INIT(hci_le_read_supported_states_sync),
+ {} /* terminator: func is NULL */
+};
+
+static int hci_init2_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ bt_dev_dbg(hdev, "");
+
+ err = hci_init_stage_sync(hdev, hci_init2); /* common stage runs first */
+ if (err)
+ return err;
+
+ if (lmp_bredr_capable(hdev)) {
+ err = hci_init_stage_sync(hdev, br_init2);
+ if (err)
+ return err;
+ } else {
+ hci_dev_clear_flag(hdev, HCI_BREDR_ENABLED); /* not BR/EDR capable */
+ }
+
+ if (lmp_le_capable(hdev)) {
+ err = hci_init_stage_sync(hdev, le_init2);
+ if (err)
+ return err;
+ /* LE-only controllers have LE implicitly enabled */
+ if (!lmp_bredr_capable(hdev))
+ hci_dev_set_flag(hdev, HCI_LE_ENABLED);
+ }
+
+ return 0;
+}
+
+static int hci_set_event_mask_sync(struct hci_dev *hdev)
+{
+ /* The second byte is 0xff instead of 0x9f (two reserved bits
+ * disabled) since a Broadcom 1.2 dongle doesn't respond to the
+ * command otherwise.
+ */
+ u8 events[8] = { 0xff, 0xff, 0xfb, 0xff, 0x00, 0x00, 0x00, 0x00 };
+
+ /* CSR 1.1 dongles do not accept any bitfield so don't try to set
+ * any event mask for pre 1.2 devices.
+ */
+ if (hdev->hci_ver < BLUETOOTH_VER_1_2)
+ return 0;
+
+ if (lmp_bredr_capable(hdev)) {
+ events[4] |= 0x01; /* Flow Specification Complete */
+
+ /* Don't set Disconnect Complete and mode change when
+ * suspended as that would wakeup the host when disconnecting
+ * due to suspend.
+ */
+ if (hdev->suspended) {
+ events[0] &= 0xef; /* clears 0x10: Disconnection Complete */
+ events[2] &= 0xf7; /* clears 0x08: mode change, per the note above */
+ }
+ } else {
+ /* Use a different default for LE-only devices */
+ memset(events, 0, sizeof(events));
+ events[1] |= 0x20; /* Command Complete */
+ events[1] |= 0x40; /* Command Status */
+ events[1] |= 0x80; /* Hardware Error */
+
+ /* If the controller supports the Disconnect command, enable
+ * the corresponding event. In addition enable packet flow
+ * control related events.
+ */
+ if (hdev->commands[0] & 0x20) {
+ /* Don't set Disconnect Complete when suspended as that
+ * would wakeup the host when disconnecting due to
+ * suspend.
+ */
+ if (!hdev->suspended)
+ events[0] |= 0x10; /* Disconnection Complete */
+ events[2] |= 0x04; /* Number of Completed Packets */
+ events[3] |= 0x02; /* Data Buffer Overflow */
+ }
+
+ /* If the controller supports the Read Remote Version
+ * Information command, enable the corresponding event.
+ */
+ if (hdev->commands[2] & 0x80)
+ events[1] |= 0x08; /* Read Remote Version Information
+ * Complete
+ */
+
+ if (hdev->le_features[0] & HCI_LE_ENCRYPTION) {
+ events[0] |= 0x80; /* Encryption Change */
+ events[5] |= 0x80; /* Encryption Key Refresh Complete */
+ }
+ }
+
+ if (lmp_inq_rssi_capable(hdev) ||
+ hci_test_quirk(hdev, HCI_QUIRK_FIXUP_INQUIRY_MODE))
+ events[4] |= 0x02; /* Inquiry Result with RSSI */
+
+ if (lmp_ext_feat_capable(hdev))
+ events[4] |= 0x04; /* Read Remote Extended Features Complete */
+
+ if (lmp_esco_capable(hdev)) {
+ events[5] |= 0x08; /* Synchronous Connection Complete */
+ events[5] |= 0x10; /* Synchronous Connection Changed */
+ }
+
+ if (lmp_sniffsubr_capable(hdev))
+ events[5] |= 0x20; /* Sniff Subrating */
+
+ if (lmp_pause_enc_capable(hdev))
+ events[5] |= 0x80; /* Encryption Key Refresh Complete */
+
+ if (lmp_ext_inq_capable(hdev))
+ events[5] |= 0x40; /* Extended Inquiry Result */
+
+ if (lmp_no_flush_capable(hdev))
+ events[7] |= 0x01; /* Enhanced Flush Complete */
+
+ if (lmp_lsto_capable(hdev))
+ events[6] |= 0x80; /* Link Supervision Timeout Changed */
+
+ if (lmp_ssp_capable(hdev)) {
+ events[6] |= 0x01; /* IO Capability Request */
+ events[6] |= 0x02; /* IO Capability Response */
+ events[6] |= 0x04; /* User Confirmation Request */
+ events[6] |= 0x08; /* User Passkey Request */
+ events[6] |= 0x10; /* Remote OOB Data Request */
+ events[6] |= 0x20; /* Simple Pairing Complete */
+ events[7] |= 0x04; /* User Passkey Notification */
+ events[7] |= 0x08; /* Keypress Notification */
+ events[7] |= 0x10; /* Remote Host Supported
+ * Features Notification
+ */
+ }
+
+ if (lmp_le_capable(hdev))
+ events[7] |= 0x20; /* LE Meta-Event */
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_SET_EVENT_MASK,
+ sizeof(events), events, HCI_CMD_TIMEOUT);
+}
+
+static int hci_read_stored_link_key_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_read_stored_link_key cp;
+
+ if (!(hdev->commands[6] & 0x20) ||
+ hci_test_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY))
+ return 0; /* command bit clear or quirk set: nothing sent */
+
+ memset(&cp, 0, sizeof(cp));
+ bacpy(&cp.bdaddr, BDADDR_ANY);
+ cp.read_all = 0x01; /* used together with BDADDR_ANY above */
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_STORED_LINK_KEY,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_setup_link_policy_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_write_def_link_policy cp;
+ u16 link_policy = 0; /* HCI_LP_* bits collected below */
+
+ if (!(hdev->commands[5] & 0x10))
+ return 0; /* command bit clear: nothing sent */
+
+ memset(&cp, 0, sizeof(cp));
+
+ if (lmp_rswitch_capable(hdev))
+ link_policy |= HCI_LP_RSWITCH;
+ if (lmp_hold_capable(hdev))
+ link_policy |= HCI_LP_HOLD;
+ if (lmp_sniff_capable(hdev))
+ link_policy |= HCI_LP_SNIFF;
+ if (lmp_park_capable(hdev))
+ link_policy |= HCI_LP_PARK;
+
+ cp.policy = cpu_to_le16(link_policy); /* converted to little-endian */
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_LINK_POLICY,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+static int hci_read_page_scan_activity_sync(struct hci_dev *hdev)
+{
+ if (!(hdev->commands[8] & 0x01))
+ return 0; /* command bit clear: nothing sent */
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_PAGE_SCAN_ACTIVITY,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+static int hci_read_def_err_data_reporting_sync(struct hci_dev *hdev)
+{
+ if (!(hdev->commands[18] & 0x04) ||
+ !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) ||
+ hci_test_quirk(hdev, HCI_QUIRK_BROKEN_ERR_DATA_REPORTING))
+ return 0; /* needs command bit and feature bit, and no quirk */
+
+ return __hci_cmd_sync_status(hdev, HCI_OP_READ_DEF_ERR_DATA_REPORTING,
+ 0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Send HCI_OP_READ_PAGE_SCAN_TYPE when the controller supports it. */
+static int hci_read_page_scan_type_sync(struct hci_dev *hdev)
+{
+	/* Some older Broadcom based Bluetooth 1.2 controllers lack the
+	 * Read Page Scan Type command, so consult the supported commands
+	 * bit mask before sending it.
+	 */
+	if (!(hdev->commands[13] & 0x01))
+		return 0;
+
+	/* The driver may also flag the command as broken */
+	if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE))
+		return 0;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_READ_PAGE_SCAN_TYPE,
+				     0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read the extended feature pages from page 2 up to hdev->max_page, never
+ * going past HCI_MAX_PAGES. Stops at, and returns, the first error.
+ */
+static int hci_read_local_ext_features_all_sync(struct hci_dev *hdev)
+{
+	u8 page = 2;
+	int err = 0;
+
+	if (!lmp_ext_feat_capable(hdev))
+		return 0;
+
+	while (page < HCI_MAX_PAGES && page <= hdev->max_page) {
+		err = hci_read_local_ext_features_sync(hdev, page);
+		if (err)
+			break;
+
+		page++;
+	}
+
+	return err;
+}
+
+/* HCI Controller init stage 3 command sequence.
+ *
+ * hci_init3_sync() hands this table to hci_init_stage_sync(). Each entry
+ * wraps one helper function; the comment above an entry names the HCI
+ * opcode that helper issues.
+ */
+static const struct hci_init_stage hci_init3[] = {
+ /* HCI_OP_SET_EVENT_MASK */
+ HCI_INIT(hci_set_event_mask_sync),
+ /* HCI_OP_READ_STORED_LINK_KEY */
+ HCI_INIT(hci_read_stored_link_key_sync),
+ /* HCI_OP_WRITE_DEF_LINK_POLICY */
+ HCI_INIT(hci_setup_link_policy_sync),
+ /* HCI_OP_READ_PAGE_SCAN_ACTIVITY */
+ HCI_INIT(hci_read_page_scan_activity_sync),
+ /* HCI_OP_READ_DEF_ERR_DATA_REPORTING */
+ HCI_INIT(hci_read_def_err_data_reporting_sync),
+ /* HCI_OP_READ_PAGE_SCAN_TYPE */
+ HCI_INIT(hci_read_page_scan_type_sync),
+ /* HCI_OP_READ_LOCAL_EXT_FEATURES */
+ HCI_INIT(hci_read_local_ext_features_all_sync),
+ /* Empty entry closes the table; presumably the end marker that
+ * hci_init_stage_sync() looks for - confirm against its definition.
+ */
+ {}
+};
+
+/* Build the LE event mask from the controller's LE feature bits and
+ * supported commands, then send it with HCI_OP_LE_SET_EVENT_MASK. Along the
+ * way the device privacy, address resolution and PAST capabilities are
+ * recorded in hdev->conn_flags. Returns 0 without doing anything when the
+ * controller is not LE capable.
+ */
+static int hci_le_set_event_mask_sync(struct hci_dev *hdev)
+{
+	const u8 *feat = hdev->le_features;
+	const u8 *cmds = hdev->commands;
+	u8 events[8] = { 0 };
+
+	if (!lmp_le_capable(hdev))
+		return 0;
+
+	/* LE Long Term Key Request */
+	if (feat[0] & HCI_LE_ENCRYPTION)
+		events[0] |= 0x10;
+
+	/* LE Remote Connection Parameter Request: only when the Connection
+	 * Parameters Request Link Layer Procedure is supported.
+	 */
+	if (feat[0] & HCI_LE_CONN_PARAM_REQ_PROC)
+		events[0] |= 0x20;
+
+	/* LE Data Length Change: only with the Data Length Extension
+	 * feature.
+	 */
+	if (feat[0] & HCI_LE_DATA_LEN_EXT)
+		events[0] |= 0x40;
+
+	/* LE Enhanced Connection Complete: for LL Privacy or LE Extended
+	 * Adv capable controllers.
+	 */
+	if (use_enhanced_conn_complete(hdev))
+		events[1] |= 0x02;
+
+	/* Remember Device Privacy when Privacy Mode is supported */
+	if (privacy_mode_capable(hdev))
+		hdev->conn_flags |= HCI_CONN_FLAG_DEVICE_PRIVACY;
+
+	/* Remember Address Resolution when LL Privacy is supported */
+	if (ll_privacy_capable(hdev))
+		hdev->conn_flags |= HCI_CONN_FLAG_ADDRESS_RESOLUTION;
+
+	/* Remember PAST when supported */
+	if (past_capable(hdev))
+		hdev->conn_flags |= HCI_CONN_FLAG_PAST;
+
+	/* LE Direct Advertising Report: only with Extended Scanner Filter
+	 * Policies.
+	 */
+	if (feat[0] & HCI_LE_EXT_SCAN_POLICY)
+		events[1] |= 0x04;
+
+	/* LE Channel Selection Algorithm: only with Channel Selection
+	 * Algorithm #2.
+	 */
+	if (feat[1] & HCI_LE_CHAN_SEL_ALG2)
+		events[2] |= 0x08;
+
+	/* LE Advertising Report: only with the LE Set Scan Enable command */
+	if (cmds[26] & 0x08)
+		events[0] |= 0x02;
+
+	/* LE Connection Complete: only with the LE Create Connection
+	 * command.
+	 */
+	if (cmds[26] & 0x10)
+		events[0] |= 0x01;
+
+	/* LE Connection Update Complete: only with the LE Connection Update
+	 * command.
+	 */
+	if (cmds[27] & 0x04)
+		events[0] |= 0x04;
+
+	/* LE Read Remote Used Features Complete: only with the LE Read
+	 * Remote Used Features command.
+	 */
+	if (cmds[27] & 0x20)
+		events[0] |= 0x08;
+
+	/* LE Read Local P-256 Public Key Complete: only with the LE Read
+	 * Local P-256 Public Key command.
+	 */
+	if (cmds[34] & 0x02)
+		events[0] |= 0x80;
+
+	/* LE Generate DHKey Complete: only with the LE Generate DHKey
+	 * command.
+	 */
+	if (cmds[34] & 0x04)
+		events[1] |= 0x01;
+
+	/* LE PHY Update Complete: with either the LE Set Default PHY or the
+	 * LE Set PHY command.
+	 */
+	if (cmds[35] & (0x20 | 0x40))
+		events[1] |= 0x08;
+
+	/* LE Extended Advertising Report: only with the LE Set Extended
+	 * Scan Parameters and LE Set Extended Scan Enable commands.
+	 */
+	if (use_ext_scan(hdev))
+		events[1] |= 0x10;
+
+	/* LE Advertising Set Terminated: only with LE Extended Advertising */
+	if (ext_adv_capable(hdev))
+		events[2] |= 0x02;
+
+	/* LE PAST Received */
+	if (past_receiver_capable(hdev))
+		events[2] |= 0x80;
+
+	if (cis_capable(hdev)) {
+		/* LE CIS Established */
+		events[3] |= 0x01;
+
+		/* LE CIS Request */
+		if (cis_peripheral_capable(hdev))
+			events[3] |= 0x02;
+	}
+
+	if (bis_capable(hdev)) {
+		/* LE PA Report and LE PA Sync Established */
+		events[1] |= 0x20;
+		events[1] |= 0x40;
+
+		/* LE Create BIG Complete, LE Terminate BIG Complete,
+		 * LE BIG Sync Established and LE BIG Sync Loss
+		 */
+		events[3] |= 0x04;
+		events[3] |= 0x08;
+		events[3] |= 0x10;
+		events[3] |= 0x20;
+
+		/* LE BIG Info Advertising Report */
+		events[4] |= 0x02;
+	}
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EVENT_MASK,
+				     sizeof(events), events, HCI_CMD_TIMEOUT);
+}
+
+/* Read LE Advertising Channel TX Power, but only on controllers that have
+ * the command (bit 0x40 of supported commands octet 25) and are not
+ * extended advertising capable.
+ */
+static int hci_le_read_adv_tx_power_sync(struct hci_dev *hdev)
+{
+	if (!(hdev->commands[25] & 0x40))
+		return 0;
+
+	/* HCI TS spec forbids mixing of legacy and extended advertising
+	 * commands, and READ_ADV_TX_POWER counts as a legacy one. Sending
+	 * it on an extended advertising capable controller would make the
+	 * controller answer COMMAND_DISALLOWED to extended commands.
+	 */
+	if (ext_adv_capable(hdev))
+		return 0;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_ADV_TX_POWER,
+				     0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read LE Min/Max Tx Power. Returns 0 without sending anything when bit
+ * 0x80 of supported commands octet 38 is clear or
+ * HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER is set.
+ */
+static int hci_le_read_tx_power_sync(struct hci_dev *hdev)
+{
+	if (!(hdev->commands[38] & 0x80))
+		return 0;
+
+	if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER))
+		return 0;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_TRANSMIT_POWER,
+				     0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Read LE Accept List Size when bit 0x40 of supported commands octet 26 is
+ * set; otherwise return 0 without sending anything.
+ */
+static int hci_le_read_accept_list_size_sync(struct hci_dev *hdev)
+{
+	if (hdev->commands[26] & 0x40)
+		return __hci_cmd_sync_status(hdev,
+					     HCI_OP_LE_READ_ACCEPT_LIST_SIZE,
+					     0, NULL, HCI_CMD_TIMEOUT);
+
+	return 0;
+}
+
+/* Read LE Resolving List Size when bit 0x40 of supported commands octet 34
+ * is set; otherwise return 0 without sending anything.
+ */
+static int hci_le_read_resolv_list_size_sync(struct hci_dev *hdev)
+{
+	if (hdev->commands[34] & 0x40)
+		return __hci_cmd_sync_status(hdev,
+					     HCI_OP_LE_READ_RESOLV_LIST_SIZE,
+					     0, NULL, HCI_CMD_TIMEOUT);
+
+	return 0;
+}
+
+/* Clear LE Resolving List when bit 0x20 of supported commands octet 34 is
+ * set; otherwise return 0 without sending anything.
+ */
+static int hci_le_clear_resolv_list_sync(struct hci_dev *hdev)
+{
+	if (hdev->commands[34] & 0x20)
+		return __hci_cmd_sync_status(hdev,
+					     HCI_OP_LE_CLEAR_RESOLV_LIST,
+					     0, NULL, HCI_CMD_TIMEOUT);
+
+	return 0;
+}
+
+/* Set RPA timeout to hdev->rpa_timeout. Returns 0 without sending anything
+ * when bit 0x04 of supported commands octet 35 is clear or
+ * HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT is set.
+ */
+static int hci_le_set_rpa_timeout_sync(struct hci_dev *hdev)
+{
+	__le16 timeout;
+
+	if (!(hdev->commands[35] & 0x04))
+		return 0;
+
+	if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT))
+		return 0;
+
+	timeout = cpu_to_le16(hdev->rpa_timeout);
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_RPA_TIMEOUT,
+				     sizeof(timeout), &timeout,
+				     HCI_CMD_TIMEOUT);
+}
+
+/* Read LE Maximum Data Length when the HCI_LE_DATA_LEN_EXT feature bit is
+ * set; otherwise return 0 without sending anything.
+ */
+static int hci_le_read_max_data_len_sync(struct hci_dev *hdev)
+{
+	if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT)
+		return __hci_cmd_sync_status(hdev,
+					     HCI_OP_LE_READ_MAX_DATA_LEN,
+					     0, NULL, HCI_CMD_TIMEOUT);
+
+	return 0;
+}
+
+/* Read LE Suggested Default Data Length when the HCI_LE_DATA_LEN_EXT
+ * feature bit is set; otherwise return 0 without sending anything.
+ */
+static int hci_le_read_def_data_len_sync(struct hci_dev *hdev)
+{
+	if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT)
+		return __hci_cmd_sync_status(hdev,
+					     HCI_OP_LE_READ_DEF_DATA_LEN,
+					     0, NULL, HCI_CMD_TIMEOUT);
+
+	return 0;
+}
+
+/* Read LE Number of Supported Advertising Sets on extended advertising
+ * capable controllers; otherwise return 0 without sending anything.
+ */
+static int hci_le_read_num_support_adv_sets_sync(struct hci_dev *hdev)
+{
+	if (ext_adv_capable(hdev))
+		return __hci_cmd_sync_status(hdev,
+					HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS,
+					0, NULL, HCI_CMD_TIMEOUT);
+
+	return 0;
+}
+
+/* Write LE Host Supported so that the controller's host LE bit follows the
+ * HCI_LE_ENABLED flag. Nothing is sent when the controller is not BR/EDR
+ * capable or when the bit already has the wanted value.
+ */
+static int hci_set_le_support_sync(struct hci_dev *hdev)
+{
+	struct hci_cp_write_le_host_supported cp;
+
+	/* LE-only devices do not support explicit enablement */
+	if (!lmp_bredr_capable(hdev))
+		return 0;
+
+	/* Zeroing the parameters leaves cp.simul at 0x00 in either case */
+	memset(&cp, 0, sizeof(cp));
+	cp.le = hci_dev_test_flag(hdev, HCI_LE_ENABLED) ? 0x01 : 0x00;
+
+	if (cp.le != lmp_host_le_capable(hdev))
+		return __hci_cmd_sync_status(hdev,
+					     HCI_OP_WRITE_LE_HOST_SUPPORTED,
+					     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+
+	return 0;
+}
+
+/* LE Set Host Feature: set host feature bit 32 to 0x01 when ISO is enabled
+ * and to 0x00 otherwise. Returns 0 without sending anything when the
+ * controller is not ISO capable.
+ */
+static int hci_le_set_host_feature_sync(struct hci_dev *hdev)
+{
+	struct hci_cp_le_set_host_feature cp;
+
+	if (!iso_capable(hdev))
+		return 0;
+
+	memset(&cp, 0, sizeof(cp));
+
+	/* Connected Isochronous Channels (Host Support) */
+	cp.bit_number = 32;
+	if (iso_enabled(hdev))
+		cp.bit_value = 0x01;
+	else
+		cp.bit_value = 0x00;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_HOST_FEATURE,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+/* LE Controller init stage 3 command sequence.
+ *
+ * hci_init3_sync() hands this table to hci_init_stage_sync() after the
+ * hci_init3 table, and only when the controller is LE capable. Each entry
+ * wraps one helper function; the comment above an entry names the HCI
+ * opcode that helper issues.
+ */
+static const struct hci_init_stage le_init3[] = {
+ /* HCI_OP_LE_SET_EVENT_MASK */
+ HCI_INIT(hci_le_set_event_mask_sync),
+ /* HCI_OP_LE_READ_ADV_TX_POWER */
+ HCI_INIT(hci_le_read_adv_tx_power_sync),
+ /* HCI_OP_LE_READ_TRANSMIT_POWER */
+ HCI_INIT(hci_le_read_tx_power_sync),
+ /* HCI_OP_LE_READ_ACCEPT_LIST_SIZE */
+ HCI_INIT(hci_le_read_accept_list_size_sync),
+ /* HCI_OP_LE_CLEAR_ACCEPT_LIST */
+ HCI_INIT(hci_le_clear_accept_list_sync),
+ /* HCI_OP_LE_READ_RESOLV_LIST_SIZE */
+ HCI_INIT(hci_le_read_resolv_list_size_sync),
+ /* HCI_OP_LE_CLEAR_RESOLV_LIST */
+ HCI_INIT(hci_le_clear_resolv_list_sync),
+ /* HCI_OP_LE_SET_RPA_TIMEOUT */
+ HCI_INIT(hci_le_set_rpa_timeout_sync),
+ /* HCI_OP_LE_READ_MAX_DATA_LEN */
+ HCI_INIT(hci_le_read_max_data_len_sync),
+ /* HCI_OP_LE_READ_DEF_DATA_LEN */
+ HCI_INIT(hci_le_read_def_data_len_sync),
+ /* HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS */
+ HCI_INIT(hci_le_read_num_support_adv_sets_sync),
+ /* HCI_OP_WRITE_LE_HOST_SUPPORTED */
+ HCI_INIT(hci_set_le_support_sync),
+ /* HCI_OP_LE_SET_HOST_FEATURE */
+ HCI_INIT(hci_le_set_host_feature_sync),
+ /* Empty entry closes the table; presumably the end marker that
+ * hci_init_stage_sync() looks for - confirm against its definition.
+ */
+ {}
+};
+
+/* Run init stage 3: the hci_init3 table first, then, if that succeeded and
+ * the controller is LE capable, the le_init3 table. Returns the first
+ * non-zero stage result, or 0.
+ */
+static int hci_init3_sync(struct hci_dev *hdev)
+{
+	int err;
+
+	bt_dev_dbg(hdev, "");
+
+	err = hci_init_stage_sync(hdev, hci_init3);
+	if (!err && lmp_le_capable(hdev))
+		err = hci_init_stage_sync(hdev, le_init3);
+
+	return err;
+}
+
+/* Send HCI_OP_DELETE_STORED_LINK_KEY with delete_all set and BDADDR_ANY as
+ * the address, provided the controller really supports the command.
+ */
+static int hci_delete_stored_link_key_sync(struct hci_dev *hdev)
+{
+	struct hci_cp_delete_stored_link_key cp;
+
+	/* Some Broadcom based Bluetooth controllers lack the Delete Stored
+	 * Link Key command and say so in the bit mask of supported
+	 * commands. Only send the command when it is marked as supported;
+	 * otherwise assume the controller has no stored link key support
+	 * at all, which makes the command redundant anyway.
+	 */
+	if (!(hdev->commands[6] & 0x80))
+		return 0;
+
+	/* Other controllers claim to handle deleting stored link keys but
+	 * do not. The quirk lets a driver disable the command.
+	 */
+	if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY))
+		return 0;
+
+	memset(&cp, 0, sizeof(cp));
+	cp.delete_all = 0x01;
+	bacpy(&cp.bdaddr, BDADDR_ANY);
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_DELETE_STORED_LINK_KEY,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+/* Build event mask page 2 from the controller capabilities and send it
+ * with HCI_OP_SET_EVENT_MASK_PAGE_2. Nothing is sent when the command is
+ * not supported or when no event bit would be set.
+ */
+static int hci_set_event_mask_page_2_sync(struct hci_dev *hdev)
+{
+	u8 events[8];
+	bool changed = false;
+
+	/* Set event mask page 2 if the HCI command for it is supported */
+	if (!(hdev->commands[22] & 0x04))
+		return 0;
+
+	memset(events, 0, sizeof(events));
+
+	/* Events needed for the Connectionless Peripheral Broadcast central
+	 * role.
+	 */
+	if (lmp_cpb_central_capable(hdev)) {
+		/* Triggered Clock Capture, Synchronization Train Complete */
+		events[1] |= 0x40 | 0x80;
+		/* Truncated Page Complete, CPB Channel Map Change */
+		events[2] |= 0x08 | 0x20;
+		changed = true;
+	}
+
+	/* Events needed for the Connectionless Peripheral Broadcast
+	 * peripheral role.
+	 */
+	if (lmp_cpb_peripheral_capable(hdev)) {
+		/* Synchronization Train Received, CPB Receive, CPB Timeout */
+		events[2] |= 0x01 | 0x02 | 0x04;
+		/* Peripheral Page Response Timeout */
+		events[2] |= 0x10;
+		changed = true;
+	}
+
+	/* Authenticated Payload Timeout Expired event, if supported */
+	if (lmp_ping_capable(hdev) || hdev->le_features[0] & HCI_LE_PING) {
+		events[2] |= 0x80;
+		changed = true;
+	}
+
+	/* Some Broadcom based controllers advertise the Set Event Mask
+	 * Page 2 command without actually supporting it. The default mask
+	 * is all zeroes, so the command is only needed when a bit has to be
+	 * set; send it in that case alone.
+	 */
+	if (changed)
+		return __hci_cmd_sync_status(hdev,
+					     HCI_OP_SET_EVENT_MASK_PAGE_2,
+					     sizeof(events), events,
+					     HCI_CMD_TIMEOUT);
+
+	return 0;
+}
+
+/* Read the local codec list if an HCI command for it is supported. The v2
+ * reader is used when bit 0x04 of supported commands octet 45 is set, the
+ * plain reader when bit 0x20 of octet 29 is set. Always returns 0.
+ */
+static int hci_read_local_codecs_sync(struct hci_dev *hdev)
+{
+	if (hdev->commands[45] & 0x04) {
+		hci_read_supported_codecs_v2(hdev);
+		return 0;
+	}
+
+	if (hdev->commands[29] & 0x20)
+		hci_read_supported_codecs(hdev);
+
+	return 0;
+}
+
+/* Read local pairing options when bit 0x08 of supported commands octet 41
+ * is set; otherwise return 0 without sending anything.
+ */
+static int hci_read_local_pairing_opts_sync(struct hci_dev *hdev)
+{
+	if (hdev->commands[41] & 0x08)
+		return __hci_cmd_sync_status(hdev,
+					     HCI_OP_READ_LOCAL_PAIRING_OPTS,
+					     0, NULL, HCI_CMD_TIMEOUT);
+
+	return 0;
+}
+
+/* Get the MWS transport configuration on controllers that report the
+ * capability; otherwise return 0 without sending anything.
+ */
+static int hci_get_mws_transport_config_sync(struct hci_dev *hdev)
+{
+	if (mws_transport_config_capable(hdev))
+		return __hci_cmd_sync_status(hdev,
+					     HCI_OP_GET_MWS_TRANSPORT_CONFIG,
+					     0, NULL, HCI_CMD_TIMEOUT);
+
+	return 0;
+}
+
+/* Read the Synchronization Train parameters on controllers that are
+ * Synchronization Train capable; otherwise return 0 without sending
+ * anything.
+ */
+static int hci_read_sync_train_params_sync(struct hci_dev *hdev)
+{
+	if (lmp_sync_train_capable(hdev))
+		return __hci_cmd_sync_status(hdev,
+					     HCI_OP_READ_SYNC_TRAIN_PARAMS,
+					     0, NULL, HCI_CMD_TIMEOUT);
+
+	return 0;
+}
+
+/* Enable Secure Connections (write support value 0x01) when SSP is enabled
+ * and BR/EDR Secure Connections is enabled; otherwise return 0 without
+ * sending anything.
+ */
+static int hci_write_sc_support_1_sync(struct hci_dev *hdev)
+{
+	u8 support = 0x01;
+
+	if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED) &&
+	    bredr_sc_enabled(hdev))
+		return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SC_SUPPORT,
+					     sizeof(support), &support,
+					     HCI_CMD_TIMEOUT);
+
+	return 0;
+}
+
+/* Make erroneous data reporting follow the wideband speech setting
+ * (HCI_WIDEBAND_SPEECH_ENABLED), if the controller supports writing it.
+ * Nothing is sent when hdev->err_data_reporting already has the wanted
+ * value.
+ */
+static int hci_set_err_data_report_sync(struct hci_dev *hdev)
+{
+	struct hci_cp_write_def_err_data_reporting cp;
+	bool enabled = hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED);
+
+	if (!(hdev->commands[18] & 0x08))
+		return 0;
+
+	if (!(hdev->features[0][6] & LMP_ERR_DATA_REPORTING))
+		return 0;
+
+	if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_ERR_DATA_REPORTING))
+		return 0;
+
+	/* Stored value already equals the wanted one */
+	if (enabled == hdev->err_data_reporting)
+		return 0;
+
+	memset(&cp, 0, sizeof(cp));
+	if (enabled)
+		cp.err_data_reporting = ERR_DATA_REPORTING_ENABLED;
+	else
+		cp.err_data_reporting = ERR_DATA_REPORTING_DISABLED;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_DEF_ERR_DATA_REPORTING,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+/* HCI Controller init stage 4 command sequence.
+ *
+ * hci_init4_sync() hands this table to hci_init_stage_sync(). Each entry
+ * wraps one helper function; the comment above an entry names the HCI
+ * opcode that helper issues.
+ */
+static const struct hci_init_stage hci_init4[] = {
+ /* HCI_OP_DELETE_STORED_LINK_KEY */
+ HCI_INIT(hci_delete_stored_link_key_sync),
+ /* HCI_OP_SET_EVENT_MASK_PAGE_2 */
+ HCI_INIT(hci_set_event_mask_page_2_sync),
+ /* HCI_OP_READ_LOCAL_CODECS */
+ HCI_INIT(hci_read_local_codecs_sync),
+ /* HCI_OP_READ_LOCAL_PAIRING_OPTS */
+ HCI_INIT(hci_read_local_pairing_opts_sync),
+ /* HCI_OP_GET_MWS_TRANSPORT_CONFIG */
+ HCI_INIT(hci_get_mws_transport_config_sync),
+ /* HCI_OP_READ_SYNC_TRAIN_PARAMS */
+ HCI_INIT(hci_read_sync_train_params_sync),
+ /* HCI_OP_WRITE_SC_SUPPORT */
+ HCI_INIT(hci_write_sc_support_1_sync),
+ /* HCI_OP_WRITE_DEF_ERR_DATA_REPORTING */
+ HCI_INIT(hci_set_err_data_report_sync),
+ /* Empty entry closes the table; presumably the end marker that
+ * hci_init_stage_sync() looks for - confirm against its definition.
+ */
+ {}
+};
+
+/* Set the Suggested Default Data Length to hdev->le_max_tx_len and
+ * hdev->le_max_tx_time. Returns 0 without sending anything when the
+ * HCI_LE_DATA_LEN_EXT feature bit is clear.
+ */
+static int hci_le_set_write_def_data_len_sync(struct hci_dev *hdev)
+{
+	struct hci_cp_le_write_def_data_len cp;
+
+	if (!(hdev->le_features[0] & HCI_LE_DATA_LEN_EXT))
+		return 0;
+
+	memset(&cp, 0, sizeof(cp));
+	cp.tx_time = cpu_to_le16(hdev->le_max_tx_time);
+	cp.tx_len = cpu_to_le16(hdev->le_max_tx_len);
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_WRITE_DEF_DATA_LEN,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+/* Set the default PHYs. When the command is supported, 1M is always
+ * requested and 2M and Coded are added if the controller is capable of
+ * them; TX and RX get the same set. When the command is not supported the
+ * defaults recorded in hdev are set to 1M and nothing is sent.
+ */
+static int hci_le_set_default_phy_sync(struct hci_dev *hdev)
+{
+	struct hci_cp_le_set_default_phy cp;
+
+	if (!(hdev->commands[35] & 0x20)) {
+		/* No LE Set Default PHY command means only 1M PHY is
+		 * supported.
+		 */
+		hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M;
+		hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M;
+		return 0;
+	}
+
+	memset(&cp, 0, sizeof(cp));
+	cp.all_phys = 0x00;
+
+	cp.tx_phys = HCI_LE_SET_PHY_1M;
+
+	/* Add 2M PHY if supported */
+	if (le_2m_capable(hdev))
+		cp.tx_phys |= HCI_LE_SET_PHY_2M;
+
+	/* Add Coded PHY if supported */
+	if (le_coded_capable(hdev))
+		cp.tx_phys |= HCI_LE_SET_PHY_CODED;
+
+	/* RX uses exactly the PHYs selected for TX */
+	cp.rx_phys = cp.tx_phys;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_DEFAULT_PHY,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+/* LE Controller init stage 4 command sequence.
+ *
+ * hci_init4_sync() hands this table to hci_init_stage_sync() after the
+ * hci_init4 table, and only when the controller is LE capable.
+ */
+static const struct hci_init_stage le_init4[] = {
+ /* HCI_OP_LE_WRITE_DEF_DATA_LEN */
+ HCI_INIT(hci_le_set_write_def_data_len_sync),
+ /* HCI_OP_LE_SET_DEFAULT_PHY */
+ HCI_INIT(hci_le_set_default_phy_sync),
+ /* Empty entry closes the table; presumably the end marker that
+ * hci_init_stage_sync() looks for - confirm against its definition.
+ */
+ {}
+};
+
+/* Run init stage 4: the hci_init4 table first, then, if that succeeded and
+ * the controller is LE capable, the le_init4 table. Returns the first
+ * non-zero stage result, or 0.
+ */
+static int hci_init4_sync(struct hci_dev *hdev)
+{
+	int err;
+
+	bt_dev_dbg(hdev, "");
+
+	err = hci_init_stage_sync(hdev, hci_init4);
+	if (!err && lmp_le_capable(hdev))
+		err = hci_init_stage_sync(hdev, le_init4);
+
+	return err;
+}
+
+/* Run the four init stages in order, stopping at the first one that
+ * returns a negative value, then create the debugfs entries if this is the
+ * setup or config phase and they have not been created before.
+ */
+static int hci_init_sync(struct hci_dev *hdev)
+{
+	int err;
+
+	err = hci_init1_sync(hdev);
+	if (err < 0)
+		return err;
+
+	/* The basic debugfs entries are created between stage 1 and 2 */
+	if (hci_dev_test_flag(hdev, HCI_SETUP))
+		hci_debugfs_create_basic(hdev);
+
+	err = hci_init2_sync(hdev);
+	if (err >= 0)
+		err = hci_init3_sync(hdev);
+	if (err >= 0)
+		err = hci_init4_sync(hdev);
+	if (err < 0)
+		return err;
+
+	/* This function only runs for a controller that is in configured
+	 * state; an unconfigured controller skips this initialization
+	 * procedure.
+	 *
+	 * A controller may therefore go through its setup phase, discover
+	 * missing settings, and get here for the first time during the
+	 * config phase instead.
+	 *
+	 * So create the debugfs entries and register the SMP channels only
+	 * when in setup phase or config phase.
+	 */
+	if (!(hci_dev_test_flag(hdev, HCI_SETUP) ||
+	      hci_dev_test_flag(hdev, HCI_CONFIG)))
+		return 0;
+
+	/* Nothing left to do if the flag was already set */
+	if (hci_dev_test_and_set_flag(hdev, HCI_DEBUGFS_CREATED))
+		return 0;
+
+	hci_debugfs_create_common(hdev);
+
+	if (lmp_bredr_capable(hdev))
+		hci_debugfs_create_bredr(hdev);
+
+	if (lmp_le_capable(hdev))
+		hci_debugfs_create_le(hdev);
+
+	return 0;
+}
+
+/* Build one hci_broken_table entry: the quirk HCI_QUIRK_BROKEN_<_quirk>
+ * paired with its human readable description.
+ */
+#define HCI_QUIRK_BROKEN(_quirk, _desc) { HCI_QUIRK_BROKEN_##_quirk, _desc }
+
+/* Maps each HCI_QUIRK_BROKEN_* quirk to the warning that
+ * hci_dev_setup_sync() prints with bt_dev_warn() when the quirk is set on
+ * the device.
+ */
+static const struct {
+ unsigned long quirk;
+ const char *desc;
+} hci_broken_table[] = {
+ HCI_QUIRK_BROKEN(LOCAL_COMMANDS,
+ "HCI Read Local Supported Commands not supported"),
+ HCI_QUIRK_BROKEN(STORED_LINK_KEY,
+ "HCI Delete Stored Link Key command is advertised, "
+ "but not supported."),
+ HCI_QUIRK_BROKEN(ERR_DATA_REPORTING,
+ "HCI Read Default Erroneous Data Reporting command is "
+ "advertised, but not supported."),
+ HCI_QUIRK_BROKEN(READ_TRANSMIT_POWER,
+ "HCI Read Transmit Power Level command is advertised, "
+ "but not supported."),
+ HCI_QUIRK_BROKEN(FILTER_CLEAR_ALL,
+ "HCI Set Event Filter command not supported."),
+ HCI_QUIRK_BROKEN(ENHANCED_SETUP_SYNC_CONN,
+ "HCI Enhanced Setup Synchronous Connection command is "
+ "advertised, but not supported."),
+ HCI_QUIRK_BROKEN(SET_RPA_TIMEOUT,
+ "HCI LE Set Random Private Address Timeout command is "
+ "advertised, but not supported."),
+ HCI_QUIRK_BROKEN(EXT_CREATE_CONN,
+ "HCI LE Extended Create Connection command is "
+ "advertised, but not supported."),
+ HCI_QUIRK_BROKEN(WRITE_AUTH_PAYLOAD_TIMEOUT,
+ "HCI WRITE AUTH PAYLOAD TIMEOUT command leads "
+ "to unexpected SMP errors when pairing "
+ "and will not be used."),
+ HCI_QUIRK_BROKEN(LE_CODED,
+ "HCI LE Coded PHY feature bit is set, "
+ "but its usage is not supported.")
+};
+
+/* This function handles the hdev setup stage:
+ *
+ * Calls hdev->setup
+ * Warns about every HCI_QUIRK_BROKEN_* quirk that is set
+ * Sets up the address if HCI_QUIRK_USE_BDADDR_PROPERTY is set
+ * Marks the controller unconfigured, and runs the unconfigured init, when
+ * external configuration is required or the BD_ADDR is still invalid
+ *
+ * Does nothing and returns 0 unless HCI_SETUP or
+ * HCI_QUIRK_NON_PERSISTENT_SETUP is set.
+ */
+static int hci_dev_setup_sync(struct hci_dev *hdev)
+{
+	int ret = 0;
+	bool invalid_bdaddr;
+	size_t i;
+
+	if (!(hci_dev_test_flag(hdev, HCI_SETUP) ||
+	      hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP)))
+		return 0;
+
+	bt_dev_dbg(hdev, "");
+
+	hci_sock_dev_event(hdev, HCI_DEV_SETUP);
+
+	if (hdev->setup)
+		ret = hdev->setup(hdev);
+
+	/* Print the description of each broken quirk the driver has set */
+	i = 0;
+	while (i < ARRAY_SIZE(hci_broken_table)) {
+		if (hci_test_quirk(hdev, hci_broken_table[i].quirk))
+			bt_dev_warn(hdev, "%s", hci_broken_table[i].desc);
+		i++;
+	}
+
+	/* The transport driver can set the quirk to mark the BD_ADDR
+	 * invalid before creating the HCI device or in its setup callback.
+	 */
+	invalid_bdaddr = hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) ||
+			 hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY);
+
+	/* With a successful setup and no public address yet, try to get
+	 * one from the property.
+	 */
+	if (!ret && hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY) &&
+	    !bacmp(&hdev->public_addr, BDADDR_ANY))
+		hci_dev_get_bd_addr_from_property(hdev);
+
+	/* Program the public address; success makes the BD_ADDR valid */
+	if (!ret && invalid_bdaddr &&
+	    bacmp(&hdev->public_addr, BDADDR_ANY) && hdev->set_bdaddr) {
+		ret = hdev->set_bdaddr(hdev, &hdev->public_addr);
+		if (ret == 0)
+			invalid_bdaddr = false;
+	}
+
+	/* The transport driver can set these quirks before creating the
+	 * HCI device or in its setup callback.
+	 *
+	 * The invalid BD_ADDR quirk may have been resolved above when the
+	 * bootloader provided an address.
+	 *
+	 * If either condition still holds, the controller has to start up
+	 * as unconfigured.
+	 */
+	if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) ||
+	    invalid_bdaddr)
+		hci_dev_set_flag(hdev, HCI_UNCONFIGURED);
+
+	if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+		return ret;
+
+	/* For an unconfigured controller it is required to read at least
+	 * the version information provided by the Read Local Version
+	 * Information command.
+	 *
+	 * If the set_bdaddr driver callback is provided, then also the
+	 * original Bluetooth public device address will be read using the
+	 * Read BD Address command.
+	 */
+	return hci_unconf_init_sync(hdev);
+}
+
+/* This function handles the hdev init stage:
+ *
+ * Calls hci_dev_setup_sync to perform the setup stage
+ * Programs the public address when in the config phase
+ * Calls hci_init_sync to perform the HCI command init sequence, followed
+ * by hdev->post_init if provided
+ *
+ * HCI_INIT is set in hdev->flags for the duration of the call.
+ */
+static int hci_dev_init_sync(struct hci_dev *hdev)
+{
+	int ret;
+
+	bt_dev_dbg(hdev, "");
+
+	atomic_set(&hdev->cmd_cnt, 1);
+	set_bit(HCI_INIT, &hdev->flags);
+
+	ret = hci_dev_setup_sync(hdev);
+
+	if (hci_dev_test_flag(hdev, HCI_CONFIG)) {
+		/* If public address change is configured, ensure that the
+		 * address gets programmed. If the driver does not support
+		 * changing the public address, fail the power on procedure.
+		 */
+		if (!bacmp(&hdev->public_addr, BDADDR_ANY) ||
+		    !hdev->set_bdaddr)
+			ret = -EADDRNOTAVAIL;
+		else
+			ret = hdev->set_bdaddr(hdev, &hdev->public_addr);
+	}
+
+	/* The HCI init sequence is not run for unconfigured controllers or
+	 * user channel usage.
+	 */
+	if (!ret && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
+	    !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+		ret = hci_init_sync(hdev);
+		if (!ret && hdev->post_init)
+			ret = hdev->post_init(hdev);
+	}
+
+	/* If the HCI Reset command is clearing all diagnostic settings,
+	 * then they need to be reprogrammed after the init procedure
+	 * completed.
+	 */
+	if (hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_DIAG) &&
+	    !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+	    hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) && hdev->set_diag)
+		ret = hdev->set_diag(hdev, true);
+
+	if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+		msft_do_open(hdev);
+		aosp_do_open(hdev);
+	}
+
+	clear_bit(HCI_INIT, &hdev->flags);
+
+	return ret;
+}
+
+/* Open the device and bring it up.
+ *
+ * Returns 0 on success or a negative error:
+ * -ENODEV if HCI_UNREGISTER is set
+ * -ERFKILL if HCI_RFKILLED is set outside the setup and config phases
+ * -EADDRNOTAVAIL if, outside the setup and config phases and without
+ * user channel usage, neither bdaddr nor static_addr is set
+ * -EALREADY if HCI_UP is already set
+ * -EIO if hdev->open() fails
+ * otherwise whatever hci_dev_init_sync() or hci_powered_update_sync()
+ * returned.
+ *
+ * On success a reference on hdev is taken with hci_dev_hold(). If init
+ * fails, the pending work and queues are cleaned up and hdev->close() is
+ * called before returning.
+ */
+int hci_dev_open_sync(struct hci_dev *hdev)
+{
+ int ret;
+
+ bt_dev_dbg(hdev, "");
+
+ if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
+ ret = -ENODEV;
+ goto done;
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
+ !hci_dev_test_flag(hdev, HCI_CONFIG)) {
+ /* Check for rfkill but allow the HCI setup stage to
+ * proceed (which in itself doesn't cause any RF activity).
+ */
+ if (hci_dev_test_flag(hdev, HCI_RFKILLED)) {
+ ret = -ERFKILL;
+ goto done;
+ }
+
+ /* Check for valid public address or a configured static
+ * random address, but let the HCI setup proceed to
+ * be able to determine if there is a public address
+ * or not.
+ *
+ * In case of user channel usage, it is not important
+ * if a public address or static random address is
+ * available.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+ !bacmp(&hdev->bdaddr, BDADDR_ANY) &&
+ !bacmp(&hdev->static_addr, BDADDR_ANY)) {
+ ret = -EADDRNOTAVAIL;
+ goto done;
+ }
+ }
+
+ if (test_bit(HCI_UP, &hdev->flags)) {
+ ret = -EALREADY;
+ goto done;
+ }
+
+ /* Any failure of the driver open callback is reported as -EIO */
+ if (hdev->open(hdev)) {
+ ret = -EIO;
+ goto done;
+ }
+
+ hci_devcd_reset(hdev);
+
+ set_bit(HCI_RUNNING, &hdev->flags);
+ hci_sock_dev_event(hdev, HCI_DEV_OPEN);
+
+ ret = hci_dev_init_sync(hdev);
+ if (!ret) {
+ hci_dev_hold(hdev);
+ hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+ hci_adv_instances_set_rpa_expired(hdev, true);
+ set_bit(HCI_UP, &hdev->flags);
+ hci_sock_dev_event(hdev, HCI_DEV_UP);
+ hci_leds_update_powered(hdev, true);
+ /* The powered update runs only for a fully configured,
+ * mgmt controlled device that is not in user channel use.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
+ !hci_dev_test_flag(hdev, HCI_CONFIG) &&
+ !hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
+ !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+ hci_dev_test_flag(hdev, HCI_MGMT)) {
+ ret = hci_powered_update_sync(hdev);
+ mgmt_power_on(hdev, ret);
+ }
+ } else {
+ /* Init failed, cleanup */
+ flush_work(&hdev->tx_work);
+
+ /* Since hci_rx_work() is possible to awake new cmd_work
+ * it should be flushed first to avoid unexpected call of
+ * hci_cmd_work()
+ */
+ flush_work(&hdev->rx_work);
+ flush_work(&hdev->cmd_work);
+
+ skb_queue_purge(&hdev->cmd_q);
+ skb_queue_purge(&hdev->rx_q);
+
+ if (hdev->flush)
+ hdev->flush(hdev);
+
+ /* Drop the last sent command, stopping its timer first */
+ if (hdev->sent_cmd) {
+ cancel_delayed_work_sync(&hdev->cmd_timer);
+ kfree_skb(hdev->sent_cmd);
+ hdev->sent_cmd = NULL;
+ }
+
+ /* Drop the last request */
+ if (hdev->req_skb) {
+ kfree_skb(hdev->req_skb);
+ hdev->req_skb = NULL;
+ }
+
+ clear_bit(HCI_RUNNING, &hdev->flags);
+ hci_sock_dev_event(hdev, HCI_DEV_CLOSE);
+
+ hdev->close(hdev);
+ /* Clear every flag except HCI_RAW */
+ hdev->flags &= BIT(HCI_RAW);
+ }
+
+done:
+ return ret;
+}
+
+/* Remove every entry of hdev->le_conn_params from its pending LE list and
+ * release the connection attached to it, if any.
+ *
+ * This function requires the caller holds hdev->lock.
+ */
+static void hci_pend_le_actions_clear(struct hci_dev *hdev)
+{
+	struct hci_conn_params *params;
+
+	list_for_each_entry(params, &hdev->le_conn_params, list) {
+		hci_pend_le_list_del_init(params);
+
+		if (!params->conn)
+			continue;
+
+		hci_conn_drop(params->conn);
+		hci_conn_put(params->conn);
+		params->conn = NULL;
+	}
+
+	BT_DBG("All LE pending actions cleared");
+}
+
+/* Run the vendor specific shutdown routine, if there is one, on a device
+ * that is up and not being unregistered. Returns the routine's result, or
+ * 0 when it was not run.
+ */
+static int hci_dev_shutdown(struct hci_dev *hdev)
+{
+	int err = 0;
+	/* Similar to how we first do setup and then set the exclusive access
+	 * bit for userspace, we must first unset userchannel and then clean
+	 * up. Otherwise, the kernel can't properly use the hci channel to
+	 * clean up the controller (some shutdown routines require sending
+	 * additional commands to the controller for example).
+	 */
+	bool was_userchannel =
+		hci_dev_test_and_clear_flag(hdev, HCI_USER_CHANNEL);
+
+	/* Execute vendor specific shutdown routine */
+	if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) &&
+	    test_bit(HCI_UP, &hdev->flags) && hdev->shutdown)
+		err = hdev->shutdown(hdev);
+
+	/* Put the user channel flag back if it was set on entry */
+	if (was_userchannel)
+		hci_dev_set_flag(hdev, HCI_USER_CHANNEL);
+
+	return err;
+}
+
+/* Shut the device down and close it.
+ *
+ * Stops pending delayed work, cancels outstanding sync commands, runs the
+ * vendor shutdown routine and, if the device was up, flushes work and
+ * queues, clears discovery, connection and SMP state under hdev->lock,
+ * optionally resets the controller, calls hdev->close() and clears the
+ * volatile state kept in hdev.
+ *
+ * Returns the result of hci_dev_shutdown(). If HCI_UP was not set, the
+ * function returns right after the shutdown step. On the full close path
+ * the hdev reference is dropped with hci_dev_put(); presumably this
+ * balances the hci_dev_hold() taken in hci_dev_open_sync().
+ */
+int hci_dev_close_sync(struct hci_dev *hdev)
+{
+ bool auto_off;
+ int err = 0;
+
+ bt_dev_dbg(hdev, "");
+
+ /* When unregistering, the delayed works are disabled rather than
+ * just cancelled.
+ */
+ if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
+ disable_delayed_work(&hdev->power_off);
+ disable_delayed_work(&hdev->ncmd_timer);
+ disable_delayed_work(&hdev->le_scan_disable);
+ } else {
+ cancel_delayed_work(&hdev->power_off);
+ cancel_delayed_work(&hdev->ncmd_timer);
+ cancel_delayed_work(&hdev->le_scan_disable);
+ }
+
+ hci_cmd_sync_cancel_sync(hdev, ENODEV);
+
+ cancel_interleave_scan(hdev);
+
+ if (hdev->adv_instance_timeout) {
+ cancel_delayed_work_sync(&hdev->adv_instance_expire);
+ hdev->adv_instance_timeout = 0;
+ }
+
+ err = hci_dev_shutdown(hdev);
+
+ /* Device was not up: only the command timer is left to stop */
+ if (!test_and_clear_bit(HCI_UP, &hdev->flags)) {
+ cancel_delayed_work_sync(&hdev->cmd_timer);
+ return err;
+ }
+
+ hci_leds_update_powered(hdev, false);
+
+ /* Flush RX and TX works */
+ flush_work(&hdev->tx_work);
+ flush_work(&hdev->rx_work);
+
+ if (hdev->discov_timeout > 0) {
+ hdev->discov_timeout = 0;
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ }
+
+ if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE))
+ cancel_delayed_work(&hdev->service_cache);
+
+ if (hci_dev_test_flag(hdev, HCI_MGMT)) {
+ struct adv_info *adv_instance;
+
+ cancel_delayed_work_sync(&hdev->rpa_expired);
+
+ list_for_each_entry(adv_instance, &hdev->adv_instances, list)
+ cancel_delayed_work_sync(&adv_instance->rpa_expired_cb);
+ }
+
+ /* Avoid potential lockdep warnings from the *_flush() calls by
+ * ensuring the workqueue is empty up front.
+ */
+ drain_workqueue(hdev->workqueue);
+
+ hci_dev_lock(hdev);
+
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+
+ /* Remember whether this close follows an auto power off; the value
+ * also gates the controller reset further down.
+ */
+ auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF);
+
+ if (!auto_off && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+ hci_dev_test_flag(hdev, HCI_MGMT))
+ __mgmt_power_off(hdev);
+
+ hci_inquiry_cache_flush(hdev);
+ hci_pend_le_actions_clear(hdev);
+ hci_conn_hash_flush(hdev);
+ /* Prevent data races on hdev->smp_data or hdev->smp_bredr_data */
+ smp_unregister(hdev);
+ hci_dev_unlock(hdev);
+
+ hci_sock_dev_event(hdev, HCI_DEV_DOWN);
+
+ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
+ aosp_do_close(hdev);
+ msft_do_close(hdev);
+ }
+
+ if (hdev->flush)
+ hdev->flush(hdev);
+
+ /* Reset device */
+ skb_queue_purge(&hdev->cmd_q);
+ atomic_set(&hdev->cmd_cnt, 1);
+ if (hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE) &&
+ !auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ set_bit(HCI_INIT, &hdev->flags);
+ hci_reset_sync(hdev);
+ clear_bit(HCI_INIT, &hdev->flags);
+ }
+
+ /* flush cmd work */
+ flush_work(&hdev->cmd_work);
+
+ /* Drop queues */
+ skb_queue_purge(&hdev->rx_q);
+ skb_queue_purge(&hdev->cmd_q);
+ skb_queue_purge(&hdev->raw_q);
+
+ /* Drop last sent command */
+ if (hdev->sent_cmd) {
+ cancel_delayed_work_sync(&hdev->cmd_timer);
+ kfree_skb(hdev->sent_cmd);
+ hdev->sent_cmd = NULL;
+ }
+
+ /* Drop last request */
+ if (hdev->req_skb) {
+ kfree_skb(hdev->req_skb);
+ hdev->req_skb = NULL;
+ }
+
+ clear_bit(HCI_RUNNING, &hdev->flags);
+ hci_sock_dev_event(hdev, HCI_DEV_CLOSE);
+
+ /* After this point our queues are empty and no tasks are scheduled. */
+ hdev->close(hdev);
+
+ /* Clear flags; only HCI_RAW survives in hdev->flags */
+ hdev->flags &= BIT(HCI_RAW);
+ hci_dev_clear_volatile_flags(hdev);
+
+ memset(hdev->eir, 0, sizeof(hdev->eir));
+ memset(hdev->dev_class, 0, sizeof(hdev->dev_class));
+ bacpy(&hdev->random_addr, BDADDR_ANY);
+ hci_codec_list_clear(&hdev->local_codecs);
+
+ hci_dev_put(hdev);
+ return err;
+}
+
+/* Power on the controller.
+ *
+ * If the controller is already up (HCI_UP) under mgmt control and
+ * HCI_AUTO_OFF was set, the pending power_off work is cancelled and
+ * hci_powered_update_sync is run. Otherwise hci_dev_open_sync is run, which
+ * will follow with hci_powered_update_sync after the init sequence is
+ * completed.
+ *
+ * Returns the result of hci_powered_update_sync on the already-up path, a
+ * negative error from hci_dev_open_sync, or 0 otherwise.
+ */
+static int hci_power_on_sync(struct hci_dev *hdev)
+{
+	int ret;
+
+	if (test_bit(HCI_UP, &hdev->flags) &&
+	    hci_dev_test_flag(hdev, HCI_MGMT)) {
+		/* HCI_AUTO_OFF is only tested-and-cleared once both checks
+		 * above have passed, so it is left untouched otherwise.
+		 */
+		if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) {
+			cancel_delayed_work(&hdev->power_off);
+			return hci_powered_update_sync(hdev);
+		}
+	}
+
+	ret = hci_dev_open_sync(hdev);
+	if (ret < 0)
+		return ret;
+
+	/* A few error conditions are ignored during the HCI setup phase
+	 * and have to be re-checked now. If any of them still holds, the
+	 * device must be turned back off.
+	 */
+	if (hci_dev_test_flag(hdev, HCI_RFKILLED) ||
+	    hci_dev_test_flag(hdev, HCI_UNCONFIGURED) ||
+	    (!bacmp(&hdev->bdaddr, BDADDR_ANY) &&
+	     !bacmp(&hdev->static_addr, BDADDR_ANY))) {
+		hci_dev_clear_flag(hdev, HCI_AUTO_OFF);
+		hci_dev_close_sync(hdev);
+	} else if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) {
+		queue_delayed_work(hdev->req_workqueue, &hdev->power_off,
+				   HCI_AUTO_OFF_TIMEOUT);
+	}
+
+	if (hci_dev_test_and_clear_flag(hdev, HCI_SETUP)) {
+		/* Unconfigured devices get HCI_RAW set so that userspace
+		 * can easily identify them.
+		 */
+		if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+			set_bit(HCI_RAW, &hdev->flags);
+	} else if (hci_dev_test_and_clear_flag(hdev, HCI_CONFIG)) {
+		/* Powering on with HCI_CONFIG set only happens on the
+		 * transition from unconfigured to configured, so HCI_RAW
+		 * has to be cleared once the controller is configured.
+		 */
+		if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
+			clear_bit(HCI_RAW, &hdev->flags);
+	} else {
+		return 0;
+	}
+
+	/* For fully configured devices, this will send the Index Added
+	 * event. For unconfigured devices, it will send the Unconfigured
+	 * Index Added event.
+	 *
+	 * Devices with HCI_QUIRK_RAW_DEVICE are ignored and no event will
+	 * be sent.
+	 */
+	mgmt_index_added(hdev);
+
+	return 0;
+}
+
+/* Send HCI_OP_REMOTE_NAME_REQ_CANCEL for @addr and return the result of
+ * __hci_cmd_sync_status.
+ */
+static int hci_remote_name_cancel_sync(struct hci_dev *hdev, bdaddr_t *addr)
+{
+	struct hci_cp_remote_name_req_cancel req;
+
+	memset(&req, 0, sizeof(req));
+	bacpy(&req.bdaddr, addr);
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_REMOTE_NAME_REQ_CANCEL,
+				     sizeof(req), &req, HCI_CMD_TIMEOUT);
+}
+
+/* Stop an ongoing discovery.
+ *
+ * Cancels inquiry and/or LE scanning depending on the discovery state,
+ * resumes advertising when the controller is LL privacy capable and, for
+ * non LE-only discovery, cancels a pending remote name request.
+ *
+ * Returns 0 on success or the error of the failing cancel/disable command.
+ * A failure to cancel the remote name request is not reported.
+ */
+int hci_stop_discovery_sync(struct hci_dev *hdev)
+{
+	struct discovery_state *d = &hdev->discovery;
+	struct inquiry_entry *entry;
+	int err;
+
+	bt_dev_dbg(hdev, "state %u", hdev->discovery.state);
+
+	if (d->state != DISCOVERY_FINDING && d->state != DISCOVERY_STOPPING) {
+		err = hci_scan_disable_sync(hdev);
+		if (err)
+			return err;
+	} else {
+		if (test_bit(HCI_INQUIRY, &hdev->flags)) {
+			err = __hci_cmd_sync_status(hdev,
+						    HCI_OP_INQUIRY_CANCEL,
+						    0, NULL, HCI_CMD_TIMEOUT);
+			if (err)
+				return err;
+		}
+
+		if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
+			cancel_delayed_work(&hdev->le_scan_disable);
+
+			err = hci_scan_disable_sync(hdev);
+			if (err)
+				return err;
+		}
+	}
+
+	/* Advertising may have been paused for the scan: resume it */
+	if (ll_privacy_capable(hdev))
+		hci_resume_advertising_sync(hdev);
+
+	/* LE-only discovery has nothing else to tear down */
+	if (d->type == DISCOV_TYPE_LE)
+		return 0;
+
+	/* Only the resolving/stopping states can have a name request
+	 * pending.
+	 */
+	if (d->state != DISCOVERY_RESOLVING && d->state != DISCOVERY_STOPPING)
+		return 0;
+
+	entry = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY,
+						 NAME_PENDING);
+	if (!entry)
+		return 0;
+
+	/* Ignore cancel errors since they should not interfere with
+	 * stopping of the discovery.
+	 */
+	hci_remote_name_cancel_sync(hdev, &entry->data.bdaddr);
+
+	return 0;
+}
+
+/* Disconnect @conn with @reason.
+ *
+ * BIS and PA links are not disconnected over HCI here: they are failed
+ * locally via hci_conn_failed under the hdev lock and 0 is returned. For any
+ * other link HCI_OP_DISCONNECT is sent and its status returned.
+ */
+static int hci_disconnect_sync(struct hci_dev *hdev, struct hci_conn *conn,
+			       u8 reason)
+{
+	struct hci_cp_disconnect req;
+
+	if (conn->type == BIS_LINK || conn->type == PA_LINK) {
+		/* This is a BIS connection, hci_conn_del will
+		 * do the necessary cleanup.
+		 */
+		hci_dev_lock(hdev);
+		hci_conn_failed(conn, reason);
+		hci_dev_unlock(hdev);
+
+		return 0;
+	}
+
+	memset(&req, 0, sizeof(req));
+	req.handle = cpu_to_le16(conn->handle);
+	req.reason = reason;
+
+	/* HCI_ERROR_REMOTE_POWER_OFF is used when suspending or powering
+	 * off, where we don't want to wait for the peer's response, so the
+	 * command is sent without waiting for a specific event.
+	 */
+	if (reason == HCI_ERROR_REMOTE_POWER_OFF)
+		return __hci_cmd_sync_status(hdev, HCI_OP_DISCONNECT,
+					     sizeof(req), &req,
+					     HCI_CMD_TIMEOUT);
+
+	/* For every other reason wait for HCI_EV_DISCONN_COMPLETE rather
+	 * than HCI_EV_CMD_STATUS.
+	 */
+	return __hci_cmd_sync_status_sk(hdev, HCI_OP_DISCONNECT,
+					sizeof(req), &req,
+					HCI_EV_DISCONN_COMPLETE,
+					HCI_CMD_TIMEOUT, NULL);
+}
+
+/* Cancel a pending LE connection attempt.
+ *
+ * Returns @reason when @conn is still flagged HCI_CONN_SCANNING, 0 when
+ * there is nothing to send (peripheral role, or HCI_CONN_CANCEL was already
+ * set), otherwise the status of HCI_OP_LE_CREATE_CONN_CANCEL.
+ */
+static int hci_le_connect_cancel_sync(struct hci_dev *hdev,
+				      struct hci_conn *conn, u8 reason)
+{
+	/* Return reason if scanning since the connection shall probably be
+	 * cleaned up directly.
+	 */
+	if (test_bit(HCI_CONN_SCANNING, &conn->flags))
+		return reason;
+
+	if (conn->role == HCI_ROLE_SLAVE)
+		return 0;
+
+	/* HCI_CONN_CANCEL is only set for non-peripheral connections; if it
+	 * was already set there is nothing more to send.
+	 */
+	if (test_and_set_bit(HCI_CONN_CANCEL, &conn->flags))
+		return 0;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_CREATE_CONN_CANCEL,
+				     0, NULL, HCI_CMD_TIMEOUT);
+}
+
+/* Cancel an outgoing connection attempt on @conn.
+ *
+ * Dispatches on the link type: LE links go through
+ * hci_le_connect_cancel_sync, CIS links are disconnected if Create CIS was
+ * already sent, BIS/PA links have nothing to cancel, and any other link gets
+ * HCI_OP_CREATE_CONN_CANCEL (skipped on controllers older than 1.2).
+ */
+static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn,
+				   u8 reason)
+{
+	switch (conn->type) {
+	case LE_LINK:
+		return hci_le_connect_cancel_sync(hdev, conn, reason);
+	case CIS_LINK:
+		/* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E
+		 * page 1857:
+		 *
+		 * If this command is issued for a CIS on the Central and the
+		 * CIS is successfully terminated before being established,
+		 * then an HCI_LE_CIS_Established event shall also be sent for
+		 * this CIS with the Status Operation Cancelled by Host (0x44).
+		 */
+		if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags))
+			return hci_disconnect_sync(hdev, conn, reason);
+
+		/* CIS with no Create CIS sent have nothing to cancel */
+		return HCI_ERROR_LOCAL_HOST_TERM;
+	case BIS_LINK:
+	case PA_LINK:
+		/* There is no way to cancel a BIS without terminating the BIG
+		 * which is done later on connection cleanup.
+		 */
+		return 0;
+	default:
+		break;
+	}
+
+	if (hdev->hci_ver < BLUETOOTH_VER_1_2)
+		return 0;
+
+	/* HCI_ERROR_REMOTE_POWER_OFF is used when suspending or powering
+	 * off, where we don't want to wait for the peer's response.
+	 */
+	if (reason == HCI_ERROR_REMOTE_POWER_OFF)
+		return __hci_cmd_sync_status(hdev, HCI_OP_CREATE_CONN_CANCEL,
+					     6, &conn->dst, HCI_CMD_TIMEOUT);
+
+	/* For every other reason wait for HCI_EV_CONN_COMPLETE rather than
+	 * HCI_EV_CMD_STATUS.
+	 */
+	return __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN_CANCEL,
+					6, &conn->dst,
+					HCI_EV_CONN_COMPLETE,
+					HCI_CMD_TIMEOUT, NULL);
+}
+
+/* Reject an incoming synchronous connection request from conn->dst.
+ *
+ * SCO rejection has its own limited set of allowed error values
+ * (0x0D-0x0F); any other @reason is replaced by
+ * HCI_ERROR_REJ_LIMITED_RESOURCES.
+ */
+static int hci_reject_sco_sync(struct hci_dev *hdev, struct hci_conn *conn,
+			       u8 reason)
+{
+	struct hci_cp_reject_sync_conn_req req;
+
+	memset(&req, 0, sizeof(req));
+	bacpy(&req.bdaddr, &conn->dst);
+
+	if (reason >= 0x0d && reason <= 0x0f)
+		req.reason = reason;
+	else
+		req.reason = HCI_ERROR_REJ_LIMITED_RESOURCES;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_REJECT_SYNC_CONN_REQ,
+				     sizeof(req), &req, HCI_CMD_TIMEOUT);
+}
+
+/* Send HCI_OP_LE_REJECT_CIS for conn->handle with @reason and return the
+ * command status.
+ */
+static int hci_le_reject_cis_sync(struct hci_dev *hdev, struct hci_conn *conn,
+				  u8 reason)
+{
+	struct hci_cp_le_reject_cis req;
+
+	memset(&req, 0, sizeof(req));
+	req.reason = reason;
+	req.handle = cpu_to_le16(conn->handle);
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_REJECT_CIS,
+				     sizeof(req), &req, HCI_CMD_TIMEOUT);
+}
+
+/* Reject an incoming connection request on @conn.
+ *
+ * CIS and SCO/eSCO links use their dedicated reject commands, BIS/PA links
+ * cannot be rejected (-EINVAL), and every other link type is rejected with
+ * HCI_OP_REJECT_CONN_REQ.
+ */
+static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn,
+				u8 reason)
+{
+	struct hci_cp_reject_conn_req req;
+
+	switch (conn->type) {
+	case CIS_LINK:
+		return hci_le_reject_cis_sync(hdev, conn, reason);
+	case BIS_LINK:
+	case PA_LINK:
+		return -EINVAL;
+	case SCO_LINK:
+	case ESCO_LINK:
+		return hci_reject_sco_sync(hdev, conn, reason);
+	default:
+		break;
+	}
+
+	memset(&req, 0, sizeof(req));
+	bacpy(&req.bdaddr, &conn->dst);
+	req.reason = reason;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_REJECT_CONN_REQ,
+				     sizeof(req), &req, HCI_CMD_TIMEOUT);
+}
+
+/* Abort @conn according to its current state.
+ *
+ * Connected/configuring links are disconnected, outgoing attempts are
+ * cancelled and incoming requests are rejected. Afterwards, under the hdev
+ * lock, the hci_conn object is cleaned up unless it has already gone away.
+ *
+ * Returns the result of the state specific command, or 0 when the
+ * connection was cleaned up concurrently.
+ */
+int hci_abort_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, u8 reason)
+{
+	/* Snapshot the handle before any command is issued */
+	u16 handle = conn->handle;
+	bool force_cleanup = false;
+	int err = 0;
+
+	switch (conn->state) {
+	case BT_CONNECTED:
+	case BT_CONFIG:
+		err = hci_disconnect_sync(hdev, conn, reason);
+		break;
+	case BT_CONNECT:
+		err = hci_connect_cancel_sync(hdev, conn, reason);
+		break;
+	case BT_CONNECT2:
+		err = hci_reject_conn_sync(hdev, conn, reason);
+		break;
+	case BT_OPEN:
+	case BT_BOUND:
+		break;
+	default:
+		force_cleanup = true;
+		break;
+	}
+
+	hci_dev_lock(hdev);
+
+	if (hci_conn_hash_lookup_handle(hdev, handle) != conn) {
+		/* The connection has been cleaned up concurrently */
+		err = 0;
+	} else if (force_cleanup) {
+		/* Cleanup hci_conn object if it cannot be cancelled as it
+		 * likely means the controller and host stack are out of sync
+		 * or in case of LE it was still scanning so it can be cleanup
+		 * safely.
+		 */
+		conn->state = BT_CLOSED;
+		hci_disconn_cfm(conn, reason);
+		hci_conn_del(conn);
+	} else {
+		hci_conn_failed(conn, reason);
+	}
+
+	hci_dev_unlock(hdev);
+
+	return err;
+}
+
+/* Abort every connection on hdev->conn_hash.list.
+ *
+ * Repeatedly takes the first entry of the list and passes it to
+ * hci_abort_conn_sync with @reason until list_first_or_null_rcu returns
+ * NULL. Errors from hci_abort_conn_sync are ignored; the function always
+ * returns 0.
+ *
+ * NOTE(review): the loop only terminates if hci_abort_conn_sync ends up
+ * removing the connection from the list, as the comment in the loop body
+ * asserts - this cannot be confirmed from this function alone.
+ */
+static int hci_disconnect_all_sync(struct hci_dev *hdev, u8 reason)
+{
+ struct list_head *head = &hdev->conn_hash.list;
+ struct hci_conn *conn;
+
+ rcu_read_lock();
+ while ((conn = list_first_or_null_rcu(head, struct hci_conn, list))) {
+ /* Make sure the connection is not freed while unlocking */
+ conn = hci_conn_get(conn);
+ /* The RCU read section is left before calling into the abort path */
+ rcu_read_unlock();
+ /* Disregard possible errors since hci_conn_del shall have been
+ * called even in case of errors had occurred since it would
+ * then cause hci_conn_failed to be called which calls
+ * hci_conn_del internally.
+ */
+ hci_abort_conn_sync(hdev, conn, reason);
+ hci_conn_put(conn);
+ /* Re-enter the read section before looking at the list again */
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+/* Power off the controller. The HCI command sequence is:
+ *
+ * Disable page/inquiry scan (if enabled)
+ * Clear Advertising
+ * Stop Discovery
+ * Disconnect all connections
+ * hci_dev_close_sync
+ *
+ * The sequence stops at the first failing step and that error is returned.
+ * HCI_POWERING_DOWN is set for the duration of the sequence.
+ */
+static int hci_power_off_sync(struct hci_dev *hdev)
+{
+	int err = 0;
+
+	/* If controller is already down there is nothing to do */
+	if (!test_bit(HCI_UP, &hdev->flags))
+		return 0;
+
+	hci_dev_set_flag(hdev, HCI_POWERING_DOWN);
+
+	if (test_bit(HCI_ISCAN, &hdev->flags) ||
+	    test_bit(HCI_PSCAN, &hdev->flags))
+		err = hci_write_scan_enable_sync(hdev, 0x00);
+
+	if (!err)
+		err = hci_clear_adv_sync(hdev, NULL, false);
+
+	if (!err)
+		err = hci_stop_discovery_sync(hdev);
+
+	/* Terminated due to Power Off */
+	if (!err)
+		err = hci_disconnect_all_sync(hdev,
+					      HCI_ERROR_REMOTE_POWER_OFF);
+
+	if (!err)
+		err = hci_dev_close_sync(hdev);
+
+	hci_dev_clear_flag(hdev, HCI_POWERING_DOWN);
+
+	return err;
+}
+
+/* Power the controller on when @val is non-zero, off otherwise. */
+int hci_set_powered_sync(struct hci_dev *hdev, u8 val)
+{
+	return val ? hci_power_on_sync(hdev) : hci_power_off_sync(hdev);
+}
+
+/* Write the current IAC LAP(s) matching the discoverable mode.
+ *
+ * Does nothing (returns 0) unless HCI_DISCOVERABLE is set. In limited
+ * discoverable mode LIAC followed by GIAC are written, capped at
+ * min(hdev->num_iac, 2) entries; in general discoverable mode only GIAC is
+ * written. The command length covers num_iac plus three bytes per LAP.
+ */
+static int hci_write_iac_sync(struct hci_dev *hdev)
+{
+	static const u8 limited_laps[] = {
+		0x00, 0x8b, 0x9e,	/* LIAC */
+		0x33, 0x8b, 0x9e,	/* GIAC */
+	};
+	static const u8 general_lap[] = {
+		0x33, 0x8b, 0x9e,	/* GIAC */
+	};
+	struct hci_cp_write_current_iac_lap cp;
+
+	if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
+		return 0;
+
+	memset(&cp, 0, sizeof(cp));
+
+	if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) {
+		/* Limited discoverable mode */
+		cp.num_iac = min_t(u8, hdev->num_iac, 2);
+		memcpy(cp.iac_lap, limited_laps, sizeof(limited_laps));
+	} else {
+		/* General discoverable mode */
+		cp.num_iac = 1;
+		memcpy(cp.iac_lap, general_lap, sizeof(general_lap));
+	}
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_CURRENT_IAC_LAP,
+				     (cp.num_iac * 3) + 1, &cp,
+				     HCI_CMD_TIMEOUT);
+}
+
+/* Push the discoverable setting to the controller.
+ *
+ * With BR/EDR enabled the IAC, scan mode and class of device are updated.
+ * When advertising was enabled using Set Advertising, the advertising data
+ * for instance 0 is refreshed, and in limited privacy mode advertising is
+ * restarted.
+ *
+ * Returns 0 on success or the error of the first failing step.
+ */
+int hci_update_discoverable_sync(struct hci_dev *hdev)
+{
+	int err;
+
+	if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+		err = hci_write_iac_sync(hdev);
+		if (err)
+			return err;
+
+		err = hci_update_scan_sync(hdev);
+		if (err)
+			return err;
+
+		err = hci_update_class_sync(hdev);
+		if (err)
+			return err;
+	}
+
+	/* Advertising instances don't use the global discoverable setting, so
+	 * only update AD if advertising was enabled using Set Advertising.
+	 */
+	if (!hci_dev_test_flag(hdev, HCI_ADVERTISING))
+		return 0;
+
+	err = hci_update_adv_data_sync(hdev, 0x00);
+	if (err)
+		return err;
+
+	/* Discoverable mode affects the local advertising
+	 * address in limited privacy mode.
+	 */
+	if (!hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
+		return 0;
+
+	if (ext_adv_capable(hdev))
+		return hci_start_ext_adv_sync(hdev, 0x00);
+
+	return hci_enable_advertising_sync(hdev);
+}
+
+/* hci_cmd_sync_queue callback wrapping hci_update_discoverable_sync;
+ * @data is unused.
+ */
+static int update_discoverable_sync(struct hci_dev *hdev, void *data)
+{
+	int err = hci_update_discoverable_sync(hdev);
+
+	return err;
+}
+
+/* Queue a discoverable update, but only if it would have any effect: the
+ * device must be powered with HCI_ADVERTISING, HCI_DISCOVERABLE and
+ * HCI_LIMITED_PRIVACY all set. Returns 0 when nothing was queued, otherwise
+ * the result of hci_cmd_sync_queue.
+ */
+int hci_update_discoverable(struct hci_dev *hdev)
+{
+	if (!hdev_is_powered(hdev))
+		return 0;
+
+	if (!hci_dev_test_flag(hdev, HCI_ADVERTISING))
+		return 0;
+
+	if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
+		return 0;
+
+	if (!hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
+		return 0;
+
+	return hci_cmd_sync_queue(hdev, update_discoverable_sync, NULL, NULL);
+}
+
+/* Push the connectable setting to the controller.
+ *
+ * Updates the scan mode, refreshes advertising data when BR/EDR is
+ * disabled, restarts advertising when it is in use, and finally updates
+ * passive scanning.
+ *
+ * Returns the error of a failing scan or advertising step, otherwise the
+ * result of hci_update_passive_scan_sync.
+ */
+int hci_update_connectable_sync(struct hci_dev *hdev)
+{
+	bool adv_in_use;
+	int err;
+
+	err = hci_update_scan_sync(hdev);
+	if (err)
+		return err;
+
+	/* If BR/EDR is not enabled and we disable advertising as a
+	 * by-product of disabling connectable, we need to update the
+	 * advertising flags.
+	 *
+	 * NOTE(review): this result is never examined - it is either
+	 * overwritten below or dropped before the final return. Confirm
+	 * whether that best-effort behaviour is intended.
+	 */
+	if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+		err = hci_update_adv_data_sync(hdev, hdev->cur_adv_instance);
+
+	/* Update the advertising parameters if necessary */
+	adv_in_use = hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+		     !list_empty(&hdev->adv_instances);
+	if (adv_in_use) {
+		err = ext_adv_capable(hdev) ?
+		      hci_start_ext_adv_sync(hdev, hdev->cur_adv_instance) :
+		      hci_enable_advertising_sync(hdev);
+		if (err)
+			return err;
+	}
+
+	return hci_update_passive_scan_sync(hdev);
+}
+
+/* Start a BR/EDR inquiry of @length with at most @num_rsp responses.
+ *
+ * Returns 0 without doing anything when HCI_INQUIRY is already set.
+ * Otherwise the inquiry cache is flushed under the hdev lock and
+ * HCI_OP_INQUIRY is sent using LIAC for limited discovery and GIAC
+ * otherwise.
+ */
+int hci_inquiry_sync(struct hci_dev *hdev, u8 length, u8 num_rsp)
+{
+	const u8 giac[3] = { 0x33, 0x8b, 0x9e };
+	const u8 liac[3] = { 0x00, 0x8b, 0x9e };
+	struct hci_cp_inquiry cp;
+	const u8 *lap;
+
+	bt_dev_dbg(hdev, "");
+
+	if (test_bit(HCI_INQUIRY, &hdev->flags))
+		return 0;
+
+	hci_dev_lock(hdev);
+	hci_inquiry_cache_flush(hdev);
+	hci_dev_unlock(hdev);
+
+	lap = hdev->discovery.limited ? liac : giac;
+
+	memset(&cp, 0, sizeof(cp));
+	memcpy(&cp.lap, lap, sizeof(cp.lap));
+	cp.length = length;
+	cp.num_rsp = num_rsp;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY,
+				     sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+}
+
+/* Start an LE active scan for discovery using scan interval @interval and
+ * hdev->le_scan_window_discovery as the window.
+ *
+ * Any running scan is disabled first and interleave scanning is cancelled.
+ * The accept list is not used and the duplicate filter is enabled unless
+ * advertisement monitoring or strict controller filtering requires it to
+ * be disabled.
+ *
+ * Returns 0 when the scan was started. If disabling the current scan fails
+ * that error is returned directly; on any later failure advertising and
+ * passive scanning are resumed before the error is returned.
+ */
+static int hci_active_scan_sync(struct hci_dev *hdev, uint16_t interval)
+{
+ u8 own_addr_type;
+ /* Accept list is not used for discovery */
+ u8 filter_policy = 0x00;
+ /* Default is to enable duplicates filter */
+ u8 filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
+ int err;
+
+ bt_dev_dbg(hdev, "");
+
+ /* If controller is scanning, it means the passive scanning is
+ * running. Thus, we should temporarily stop it in order to set the
+ * discovery scanning parameters.
+ */
+ err = hci_scan_disable_sync(hdev);
+ if (err) {
+ bt_dev_err(hdev, "Unable to disable scanning: %d", err);
+ return err;
+ }
+
+ cancel_interleave_scan(hdev);
+
+ /* Pause address resolution for active scan and stop advertising if
+ * privacy is enabled.
+ */
+ err = hci_pause_addr_resolution(hdev);
+ if (err)
+ goto failed;
+
+ /* All active scans will be done with either a resolvable private
+ * address (when privacy feature has been enabled) or non-resolvable
+ * private address.
+ */
+ err = hci_update_random_address_sync(hdev, true, scan_use_rpa(hdev),
+ &own_addr_type);
+ /* A failure here is not fatal: fall back to the public address */
+ if (err < 0)
+ own_addr_type = ADDR_LE_DEV_PUBLIC;
+
+ if (hci_is_adv_monitoring(hdev) ||
+ (hci_test_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER) &&
+ hdev->discovery.result_filtering)) {
+ /* Duplicate filter should be disabled when some advertisement
+ * monitor is activated, otherwise AdvMon can only receive one
+ * advertisement for one peer(*) during active scanning, and
+ * might report loss to these peers.
+ *
+ * If controller does strict duplicate filtering and the
+ * discovery requires result filtering disables controller based
+ * filtering since that can cause reports that would match the
+ * host filter to not be reported.
+ */
+ filter_dup = LE_SCAN_FILTER_DUP_DISABLE;
+ }
+
+ err = hci_start_scan_sync(hdev, LE_SCAN_ACTIVE, interval,
+ hdev->le_scan_window_discovery,
+ own_addr_type, filter_policy, filter_dup);
+ /* Success: the scan is running, skip the unwind below */
+ if (!err)
+ return err;
+
+failed:
+ /* Resume advertising if it was paused */
+ if (ll_privacy_capable(hdev))
+ hci_resume_advertising_sync(hdev);
+
+ /* Resume passive scanning */
+ hci_update_passive_scan_sync(hdev);
+ return err;
+}
+
+/* Start an LE active scan with twice the discovery scan interval and, if
+ * that succeeded, a BR/EDR inquiry. Returns the first error, or the result
+ * of hci_inquiry_sync.
+ */
+static int hci_start_interleaved_discovery_sync(struct hci_dev *hdev)
+{
+	int err;
+
+	bt_dev_dbg(hdev, "");
+
+	err = hci_active_scan_sync(hdev, hdev->le_scan_int_discovery * 2);
+	if (!err)
+		err = hci_inquiry_sync(hdev, DISCOV_BREDR_INQUIRY_LEN, 0);
+
+	return err;
+}
+
+/* Start discovery according to hdev->discovery.type.
+ *
+ * BR/EDR discovery only issues an inquiry and returns its result. The LE
+ * and interleaved types start an active scan and, on success, queue the
+ * le_scan_disable work after the type specific timeout.
+ *
+ * Returns 0 on success, -EINVAL for an unknown discovery type, or the error
+ * of the failing scan/inquiry step.
+ */
+int hci_start_discovery_sync(struct hci_dev *hdev)
+{
+	unsigned long timeout;
+	int err;
+
+	bt_dev_dbg(hdev, "type %u", hdev->discovery.type);
+
+	switch (hdev->discovery.type) {
+	case DISCOV_TYPE_BREDR:
+		return hci_inquiry_sync(hdev, DISCOV_BREDR_INQUIRY_LEN, 0);
+	case DISCOV_TYPE_INTERLEAVED:
+		/* When running simultaneous discovery, the LE scanning time
+		 * should occupy the whole discovery time since BR/EDR inquiry
+		 * and LE scanning are scheduled by the controller.
+		 *
+		 * For interleaving discovery in comparison, BR/EDR inquiry
+		 * and LE scanning are done sequentially with separate
+		 * timeouts.
+		 */
+		if (hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY)) {
+			timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT);
+			/* During simultaneous discovery, we double LE scan
+			 * interval. We must leave some time for the controller
+			 * to do BR/EDR inquiry.
+			 */
+			err = hci_start_interleaved_discovery_sync(hdev);
+		} else {
+			timeout = msecs_to_jiffies(
+					hdev->discov_interleaved_timeout);
+			err = hci_active_scan_sync(hdev,
+						   hdev->le_scan_int_discovery);
+		}
+		break;
+	case DISCOV_TYPE_LE:
+		timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT);
+		err = hci_active_scan_sync(hdev, hdev->le_scan_int_discovery);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (err)
+		return err;
+
+	bt_dev_dbg(hdev, "timeout %u ms", jiffies_to_msecs(timeout));
+
+	queue_delayed_work(hdev->req_workqueue, &hdev->le_scan_disable,
+			   timeout);
+
+	return 0;
+}
+
+/* Suspend advertisement monitor offloading; only the MSFT extension needs
+ * any action.
+ */
+static void hci_suspend_monitor_sync(struct hci_dev *hdev)
+{
+	if (hci_get_adv_monitor_offload_ext(hdev) == HCI_ADV_MONITOR_EXT_MSFT)
+		msft_suspend_sync(hdev);
+}
+
+/* Stop an active discovery and mark it as paused.
+ *
+ * Returns 0 when there was nothing to pause or the discovery was stopped,
+ * otherwise the error from hci_stop_discovery_sync (in which case the
+ * discovery is not marked as paused).
+ */
+static int hci_pause_discovery_sync(struct hci_dev *hdev)
+{
+	int state = hdev->discovery.state;
+	int ret;
+
+	/* Nothing to do if discovery is already stopped or stopping ... */
+	if (state == DISCOVERY_STOPPED || state == DISCOVERY_STOPPING)
+		return 0;
+
+	/* ... or already paused */
+	if (hdev->discovery_paused)
+		return 0;
+
+	hci_discovery_set_state(hdev, DISCOVERY_STOPPING);
+
+	ret = hci_stop_discovery_sync(hdev);
+	if (ret)
+		return ret;
+
+	hdev->discovery_paused = true;
+	hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+
+	return 0;
+}
+
+/* Rebuild the BR/EDR event filter from the accept list.
+ *
+ * The filter is cleared and then one connection setup filter is added for
+ * every accept list entry flagged HCI_CONN_FLAG_REMOTE_WAKEUP. Page scan is
+ * written only when the wanted state differs from the current HCI_PSCAN
+ * state. Individual command failures are logged or ignored; the function
+ * always returns 0.
+ */
+static int hci_update_event_filter_sync(struct hci_dev *hdev)
+{
+	bool page_scan_on = test_bit(HCI_PSCAN, &hdev->flags);
+	struct bdaddr_list_with_flags *entry;
+	u8 scan = SCAN_DISABLED;
+	int err;
+
+	if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+		return 0;
+
+	/* Some fake CSR controllers lock up after setting this type of
+	 * filter, so avoid sending the request altogether.
+	 */
+	if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL))
+		return 0;
+
+	/* Always clear event filter when starting */
+	hci_clear_event_filter_sync(hdev);
+
+	list_for_each_entry(entry, &hdev->accept_list, list) {
+		if (!(entry->flags & HCI_CONN_FLAG_REMOTE_WAKEUP))
+			continue;
+
+		bt_dev_dbg(hdev, "Adding event filters for %pMR",
+			   &entry->bdaddr);
+
+		err = hci_set_event_filter_sync(hdev, HCI_FLT_CONN_SETUP,
+						HCI_CONN_SETUP_ALLOW_BDADDR,
+						&entry->bdaddr,
+						HCI_CONN_SETUP_AUTO_ON);
+		if (err) {
+			bt_dev_err(hdev, "Failed to set event filter for %pMR",
+				   &entry->bdaddr);
+			continue;
+		}
+
+		/* At least one filter is in place: page scan is needed */
+		scan = SCAN_PAGE;
+	}
+
+	/* Only write scan enable when the wanted and current state differ */
+	if (!!scan != page_scan_on)
+		hci_write_scan_enable_sync(hdev, scan);
+
+	return 0;
+}
+
+/* Disable scanning (BR and LE) and mark it as paused. Command failures are
+ * ignored; always returns 0.
+ */
+static int hci_pause_scan_sync(struct hci_dev *hdev)
+{
+	if (!hdev->scanning_paused) {
+		/* Disable page scan if enabled */
+		if (test_bit(HCI_PSCAN, &hdev->flags))
+			hci_write_scan_enable_sync(hdev, SCAN_DISABLED);
+
+		hci_scan_disable_sync(hdev);
+
+		hdev->scanning_paused = true;
+	}
+
+	return 0;
+}
+
+/* This function performs the HCI suspend procedures in the follow order:
+ *
+ * Pause discovery (active scanning/inquiry)
+ * Pause Directed Advertising/Advertising
+ * Suspend advertisement monitor filters
+ * Pause Scanning (passive scanning in case discovery was not active)
+ * Disconnect all connections
+ * Set suspend_state to BT_SUSPEND_DISCONNECT if hdev cannot wakeup
+ * otherwise:
+ * Update event mask (only set events that are allowed to wake up the host)
+ * Update event filter (with devices marked with HCI_CONN_FLAG_REMOTE_WAKEUP)
+ * Update passive scanning (lower duty cycle)
+ * Set suspend_state to BT_SUSPEND_CONFIGURE_WAKE
+ *
+ * Returns 0 on success or when already suspended. If disconnecting fails,
+ * the device is resumed again and that error is returned. The results of
+ * the other steps are not checked.
+ */
+int hci_suspend_sync(struct hci_dev *hdev)
+{
+ int err;
+
+ /* If marked as suspended there nothing to do */
+ if (hdev->suspended)
+ return 0;
+
+ /* Mark device as suspended */
+ hdev->suspended = true;
+
+ /* Pause discovery if not already stopped */
+ hci_pause_discovery_sync(hdev);
+
+ /* Pause other advertisements */
+ hci_pause_advertising_sync(hdev);
+
+ /* Suspend monitor filters */
+ hci_suspend_monitor_sync(hdev);
+
+ /* Prevent disconnects from causing scanning to be re-enabled */
+ hci_pause_scan_sync(hdev);
+
+ if (hci_conn_count(hdev)) {
+ /* Soft disconnect everything (power off) */
+ err = hci_disconnect_all_sync(hdev, HCI_ERROR_REMOTE_POWER_OFF);
+ if (err) {
+ /* Set state to BT_RUNNING so resume doesn't notify */
+ hdev->suspend_state = BT_RUNNING;
+ hci_resume_sync(hdev);
+ return err;
+ }
+
+ /* Update event mask so only the allowed event can wakeup the
+ * host.
+ *
+ * NOTE(review): this is only reached when there were
+ * connections to disconnect - confirm that is intended.
+ */
+ hci_set_event_mask_sync(hdev);
+ }
+
+ /* Only configure accept list if disconnect succeeded and wake
+ * isn't being prevented.
+ */
+ if (!hdev->wakeup || !hdev->wakeup(hdev)) {
+ hdev->suspend_state = BT_SUSPEND_DISCONNECT;
+ return 0;
+ }
+
+ /* Unpause to take care of updating scanning params */
+ hdev->scanning_paused = false;
+
+ /* Enable event filter for paired devices */
+ hci_update_event_filter_sync(hdev);
+
+ /* Update LE passive scan if enabled */
+ hci_update_passive_scan_sync(hdev);
+
+ /* Pause scan changes again. */
+ hdev->scanning_paused = true;
+
+ hdev->suspend_state = BT_SUSPEND_CONFIGURE_WAKE;
+
+ return 0;
+}
+
+/* Restart a discovery that was paused.
+ *
+ * Clears discovery_paused, restarts discovery and moves the discovery state
+ * to DISCOVERY_FINDING on success or DISCOVERY_STOPPED on failure. Returns
+ * 0 if discovery was not paused, otherwise the result of
+ * hci_start_discovery_sync.
+ */
+static int hci_resume_discovery_sync(struct hci_dev *hdev)
+{
+	int ret;
+
+	/* If discovery not paused there nothing to do */
+	if (!hdev->discovery_paused)
+		return 0;
+
+	hdev->discovery_paused = false;
+
+	hci_discovery_set_state(hdev, DISCOVERY_STARTING);
+
+	ret = hci_start_discovery_sync(hdev);
+	if (ret)
+		hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+	else
+		hci_discovery_set_state(hdev, DISCOVERY_FINDING);
+
+	return ret;
+}
+
+/* Resume advertisement monitor offloading; only the MSFT extension needs
+ * any action.
+ */
+static void hci_resume_monitor_sync(struct hci_dev *hdev)
+{
+	if (hci_get_adv_monitor_offload_ext(hdev) == HCI_ADV_MONITOR_EXT_MSFT)
+		msft_resume_sync(hdev);
+}
+
+/* Clear the scanning paused flag and bring scanning back. Command failures
+ * are ignored; always returns 0.
+ */
+static int hci_resume_scan_sync(struct hci_dev *hdev)
+{
+	if (hdev->scanning_paused) {
+		hdev->scanning_paused = false;
+
+		hci_update_scan_sync(hdev);
+
+		/* Reset passive scanning to normal */
+		hci_update_passive_scan_sync(hdev);
+	}
+
+	return 0;
+}
+
+/* This function performs the HCI resume procedures in the follow order:
+ *
+ * Restore event mask
+ * Clear event filter
+ * Resume scanning (passive scanning back to normal duty cycle)
+ * Resume advertisement monitor filters
+ * Resume Directed Advertising/Advertising
+ * Resume discovery (active scanning/inquiry)
+ *
+ * The results of the individual steps are not checked; always returns 0.
+ */
+int hci_resume_sync(struct hci_dev *hdev)
+{
+ /* If not marked as suspended there nothing to do */
+ if (!hdev->suspended)
+ return 0;
+
+ hdev->suspended = false;
+
+ /* Restore event mask */
+ hci_set_event_mask_sync(hdev);
+
+ /* Clear any event filters and restore scan state */
+ hci_clear_event_filter_sync(hdev);
+
+ /* Resume scanning */
+ hci_resume_scan_sync(hdev);
+
+ /* Resume monitor filters */
+ hci_resume_monitor_sync(hdev);
+
+ /* Resume other advertisements */
+ hci_resume_advertising_sync(hdev);
+
+ /* Resume discovery */
+ hci_resume_discovery_sync(hdev);
+
+ return 0;
+}
+
+/* Report whether HCI_PRIVACY is set on the device owning @conn. */
+static bool conn_use_rpa(struct hci_conn *conn)
+{
+	return hci_dev_test_flag(conn->hdev, HCI_PRIVACY);
+}
+
+/* Start directed advertising towards conn->dst using the extended
+ * advertising commands on advertising instance 0.
+ *
+ * The existing advertising set for handle 0 is removed, the parameters and
+ * advertising data are programmed, the set's random address is updated when
+ * it changed, and advertising is enabled.
+ *
+ * Returns 0 on success or the error of the first failing step.
+ */
+static int hci_le_ext_directed_advertising_sync(struct hci_dev *hdev,
+ struct hci_conn *conn)
+{
+ struct hci_cp_le_set_ext_adv_params cp;
+ struct hci_rp_le_set_ext_adv_params rp;
+ int err;
+ bdaddr_t random_addr;
+ u8 own_addr_type;
+
+ err = hci_update_random_address_sync(hdev, false, conn_use_rpa(conn),
+ &own_addr_type);
+ if (err)
+ return err;
+
+ /* Set require_privacy to false so that the remote device has a
+ * chance of identifying us.
+ */
+ err = hci_get_random_address(hdev, false, conn_use_rpa(conn), NULL,
+ &own_addr_type, &random_addr);
+ if (err)
+ return err;
+
+ memset(&cp, 0, sizeof(cp));
+
+ cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_DIRECT_IND);
+ cp.channel_map = hdev->le_adv_channel_map;
+ cp.tx_power = HCI_TX_POWER_INVALID;
+ cp.primary_phy = HCI_ADV_PHY_1M;
+ cp.secondary_phy = HCI_ADV_PHY_1M;
+ cp.handle = 0x00; /* Use instance 0 for directed adv */
+ cp.own_addr_type = own_addr_type;
+ cp.peer_addr_type = conn->dst_type;
+ bacpy(&cp.peer_addr, &conn->dst);
+
+ /* As per Core Spec 5.2 Vol 2, PART E, Sec 7.8.53, the
+ * advertising_event_property LE_LEGACY_ADV_DIRECT_IND does not
+ * support advertising data: when the advertising set already
+ * contains some, the controller shall return error code 'Invalid
+ * HCI Command Parameters (0x12)'.
+ * So it is required to remove the adv set for handle 0x00, since we
+ * use instance 0 for directed adv.
+ */
+ err = hci_remove_ext_adv_instance_sync(hdev, cp.handle, NULL);
+ if (err)
+ return err;
+
+ err = hci_set_ext_adv_params_sync(hdev, NULL, &cp, &rp);
+ if (err)
+ return err;
+
+ /* Update adv data as tx power is known now */
+ err = hci_set_ext_adv_data_sync(hdev, cp.handle);
+ if (err)
+ return err;
+
+ /* Check if random address need to be updated */
+ if (own_addr_type == ADDR_LE_DEV_RANDOM &&
+ bacmp(&random_addr, BDADDR_ANY) &&
+ bacmp(&random_addr, &hdev->random_addr)) {
+ err = hci_set_adv_set_random_addr_sync(hdev, 0x00,
+ &random_addr);
+ if (err)
+ return err;
+ }
+
+ return hci_enable_ext_advertising_sync(hdev, 0x00);
+}
+
+/* Start directed advertising towards conn->dst.
+ *
+ * Controllers capable of extended advertising are handled by
+ * hci_le_ext_directed_advertising_sync. Otherwise the legacy commands are
+ * used: the random address is updated, the advertising parameters are set
+ * to LE_ADV_DIRECT_IND towards the peer and advertising is enabled.
+ *
+ * Returns 0 on success or the error of the first failing step. The error
+ * is kept in an int so that negative error codes from the helpers reach
+ * the caller unmodified instead of being truncated to a byte.
+ */
+static int hci_le_directed_advertising_sync(struct hci_dev *hdev,
+					    struct hci_conn *conn)
+{
+	struct hci_cp_le_set_adv_param cp;
+	u8 own_addr_type;
+	u8 enable;
+	int err;
+
+	if (ext_adv_capable(hdev))
+		return hci_le_ext_directed_advertising_sync(hdev, conn);
+
+	/* Clear the HCI_LE_ADV bit temporarily so that the
+	 * hci_update_random_address knows that it's safe to go ahead
+	 * and write a new random address. The flag will be set back on
+	 * as soon as the SET_ADV_ENABLE HCI command completes.
+	 */
+	hci_dev_clear_flag(hdev, HCI_LE_ADV);
+
+	/* Set require_privacy to false so that the remote device has a
+	 * chance of identifying us.
+	 */
+	err = hci_update_random_address_sync(hdev, false, conn_use_rpa(conn),
+					     &own_addr_type);
+	if (err)
+		return err;
+
+	memset(&cp, 0, sizeof(cp));
+
+	/* Some controllers might reject command if intervals are not
+	 * within range for undirected advertising.
+	 * BCM20702A0 is known to be affected by this.
+	 */
+	cp.min_interval = cpu_to_le16(0x0020);
+	cp.max_interval = cpu_to_le16(0x0020);
+
+	cp.type = LE_ADV_DIRECT_IND;
+	cp.own_address_type = own_addr_type;
+	cp.direct_addr_type = conn->dst_type;
+	bacpy(&cp.direct_addr, &conn->dst);
+	cp.channel_map = hdev->le_adv_channel_map;
+
+	err = __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_PARAM,
+				    sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+	if (err)
+		return err;
+
+	enable = 0x01;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE,
+				     sizeof(enable), &enable, HCI_CMD_TIMEOUT);
+}
+
+/* Fill one per-PHY parameter block of LE Extended Create Connection.
+ *
+ * The block is zeroed first; scan timing comes from the device's connect
+ * scan settings and the connection parameters from @conn. All 16-bit
+ * fields are stored little endian.
+ */
+static void set_ext_conn_params(struct hci_conn *conn,
+				struct hci_cp_le_ext_conn_param *p)
+{
+	struct hci_dev *dev = conn->hdev;
+
+	memset(p, 0, sizeof(*p));
+
+	/* Scan timing used while connecting */
+	p->scan_interval = cpu_to_le16(dev->le_scan_int_connect);
+	p->scan_window = cpu_to_le16(dev->le_scan_window_connect);
+
+	/* Requested connection parameters */
+	p->conn_interval_min = cpu_to_le16(conn->le_conn_min_interval);
+	p->conn_interval_max = cpu_to_le16(conn->le_conn_max_interval);
+	p->conn_latency = cpu_to_le16(conn->le_conn_latency);
+	p->supervision_timeout = cpu_to_le16(conn->le_supv_timeout);
+
+	/* Connection event length is left at zero */
+	p->min_ce_len = cpu_to_le16(0x0000);
+	p->max_ce_len = cpu_to_le16(0x0000);
+}
+
+/* Send LE Extended Create Connection for @conn using @own_addr_type.
+ *
+ * The command is built in an on-stack buffer sized for the fixed header
+ * plus up to three per-PHY parameter blocks. For each of the 1M, 2M and
+ * Coded PHYs that the controller can scan on and that matches the
+ * connection's advertising PHY (primary or secondary), the PHY bit is set
+ * in cp->phys and one parameter block is appended.
+ *
+ * Waits for HCI_EV_LE_ENHANCED_CONN_COMPLETE for up to conn->conn_timeout
+ * and returns the command status.
+ */
+static int hci_le_ext_create_conn_sync(struct hci_dev *hdev,
+ struct hci_conn *conn, u8 own_addr_type)
+{
+ struct hci_cp_le_ext_create_conn *cp;
+ struct hci_cp_le_ext_conn_param *p;
+ u8 data[sizeof(*cp) + sizeof(*p) * 3];
+ u32 plen;
+
+ /* cp overlays the start of the buffer, p walks the trailing blocks */
+ cp = (void *)data;
+ p = (void *)cp->data;
+
+ /* Only the header is zeroed here; each block used is zeroed by
+ * set_ext_conn_params.
+ */
+ memset(cp, 0, sizeof(*cp));
+
+ bacpy(&cp->peer_addr, &conn->dst);
+ cp->peer_addr_type = conn->dst_type;
+ cp->own_addr_type = own_addr_type;
+
+ /* plen grows by one block for every PHY that gets selected */
+ plen = sizeof(*cp);
+
+ if (scan_1m(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_1M ||
+ conn->le_adv_sec_phy == HCI_ADV_PHY_1M)) {
+ cp->phys |= LE_SCAN_PHY_1M;
+ set_ext_conn_params(conn, p);
+
+ p++;
+ plen += sizeof(*p);
+ }
+
+ if (scan_2m(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_2M ||
+ conn->le_adv_sec_phy == HCI_ADV_PHY_2M)) {
+ cp->phys |= LE_SCAN_PHY_2M;
+ set_ext_conn_params(conn, p);
+
+ p++;
+ plen += sizeof(*p);
+ }
+
+ if (scan_coded(hdev) && (conn->le_adv_phy == HCI_ADV_PHY_CODED ||
+ conn->le_adv_sec_phy == HCI_ADV_PHY_CODED)) {
+ cp->phys |= LE_SCAN_PHY_CODED;
+ set_ext_conn_params(conn, p);
+
+ /* Last possible block: no need to advance p any further */
+ plen += sizeof(*p);
+ }
+
+ return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_EXT_CREATE_CONN,
+ plen, data,
+ HCI_EV_LE_ENHANCED_CONN_COMPLETE,
+ conn->conn_timeout, NULL);
+}
+
+/* Queued work callback that starts an outgoing LE connection for the
+ * hci_conn passed in @data, either through directed advertising (peripheral
+ * role) or through one of the LE Create Connection commands.
+ *
+ * Returns -ECANCELED when the connection is no longer valid, -EBUSY when a
+ * peripheral-role attempt conflicts with active scanning, otherwise the
+ * status of the step that was executed last.
+ */
+static int hci_le_create_conn_sync(struct hci_dev *hdev, void *data)
+{
+	struct hci_conn *conn = data;
+	struct hci_conn_params *conn_params;
+	struct hci_cp_le_create_conn cmd;
+	u8 addr_type;
+	u8 event;
+	int err;
+
+	if (!hci_conn_valid(hdev, conn))
+		return -ECANCELED;
+
+	bt_dev_dbg(hdev, "conn %p", conn);
+
+	clear_bit(HCI_CONN_SCANNING, &conn->flags);
+	conn->state = BT_CONNECT;
+
+	/* Peripheral role: the connection is made via directed advertising */
+	if (conn->role == HCI_ROLE_SLAVE) {
+		bool scan_conflict;
+
+		/* Active scanning without simultaneous roles support rules
+		 * out directed advertising, so reject the attempt.
+		 */
+		scan_conflict = hci_dev_test_flag(hdev, HCI_LE_SCAN) &&
+				hdev->le_scan_type == LE_SCAN_ACTIVE &&
+				!hci_dev_test_flag(hdev,
+						   HCI_LE_SIMULTANEOUS_ROLES);
+		if (scan_conflict) {
+			hci_conn_del(conn);
+			return -EBUSY;
+		}
+
+		/* Regular advertising is paused for the directed one */
+		hci_pause_advertising_sync(hdev);
+
+		err = hci_le_directed_advertising_sync(hdev, conn);
+		goto done;
+	}
+
+	/* Without simultaneous roles advertising has to be paused first */
+	if (!hci_dev_test_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES))
+		hci_pause_advertising_sync(hdev);
+
+	/* Per-device connection parameters win over the adapter defaults */
+	conn_params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
+	if (!conn_params) {
+		conn->le_conn_min_interval = hdev->le_conn_min_interval;
+		conn->le_conn_max_interval = hdev->le_conn_max_interval;
+		conn->le_conn_latency = hdev->le_conn_latency;
+		conn->le_supv_timeout = hdev->le_supv_timeout;
+	} else {
+		conn->le_conn_min_interval = conn_params->conn_min_interval;
+		conn->le_conn_max_interval = conn_params->conn_max_interval;
+		conn->le_conn_latency = conn_params->conn_latency;
+		conn->le_supv_timeout = conn_params->supervision_timeout;
+	}
+
+	/* Some controllers cannot scan and connect at the same time, so an
+	 * ongoing scan is stopped. HCI_LE_SCAN_INTERRUPTED tells the command
+	 * complete handler for scan disabling which discovery state to set.
+	 */
+	if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
+		hci_scan_disable_sync(hdev);
+		hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED);
+	}
+
+	/* Refresh the random address; require_privacy is false so that a
+	 * non-resolvable address is never used for connecting.
+	 */
+	err = hci_update_random_address_sync(hdev, false, conn_use_rpa(conn),
+					     &addr_type);
+	if (err)
+		goto done;
+
+	/* Prefer LE Extended Create Connection when it is available */
+	if (use_ext_conn(hdev)) {
+		err = hci_le_ext_create_conn_sync(hdev, conn, addr_type);
+		goto done;
+	}
+
+	memset(&cmd, 0, sizeof(cmd));
+
+	bacpy(&cmd.peer_addr, &conn->dst);
+	cmd.peer_addr_type = conn->dst_type;
+	cmd.own_address_type = addr_type;
+
+	cmd.scan_interval = cpu_to_le16(hdev->le_scan_int_connect);
+	cmd.scan_window = cpu_to_le16(hdev->le_scan_window_connect);
+
+	cmd.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval);
+	cmd.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval);
+	cmd.conn_latency = cpu_to_le16(conn->le_conn_latency);
+	cmd.supervision_timeout = cpu_to_le16(conn->le_supv_timeout);
+	cmd.min_ce_len = cpu_to_le16(0x0000);
+	cmd.max_ce_len = cpu_to_le16(0x0000);
+
+	/* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 2261:
+	 *
+	 * If this event is unmasked and the HCI_LE_Connection_Complete event
+	 * is unmasked, only the HCI_LE_Enhanced_Connection_Complete event is
+	 * sent when a new connection has been created.
+	 */
+	if (use_enhanced_conn_complete(hdev))
+		event = HCI_EV_LE_ENHANCED_CONN_COMPLETE;
+	else
+		event = HCI_EV_LE_CONN_COMPLETE;
+
+	err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CONN,
+				       sizeof(cmd), &cmd, event,
+				       conn->conn_timeout, NULL);
+
+done:
+	/* A timed out attempt still has to be cancelled explicitly */
+	if (err == -ETIMEDOUT)
+		hci_le_connect_cancel_sync(hdev, conn, 0x00);
+
+	/* The attempt is over, so advertising may be resumed */
+	hci_resume_advertising_sync(hdev);
+	return err;
+}
+
+/* Issue HCI LE Create CIS for the first CIG whose CIS connections are all
+ * ready, and wait for the HCI_LE_CIS_Established event.
+ *
+ * Returns 0 when there is nothing to send (a previous Create CIS is still
+ * pending or no CIG is ready), otherwise the status of the command.
+ */
+int hci_le_create_cis_sync(struct hci_dev *hdev)
+{
+	DEFINE_FLEX(struct hci_cp_le_create_cis, cmd, cis, num_cis, 0x1f);
+	size_t aux_num_cis = 0;
+	struct hci_conn *conn;
+	u8 cig = BT_ISO_QOS_CIG_UNSET;
+	u32 timeout = 0;
+
+	/* The spec allows only one pending LE Create CIS command at a time. If
+	 * the command is pending now, don't do anything. We check for pending
+	 * connections after each CIS Established event.
+	 *
+	 * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E
+	 * page 2566:
+	 *
+	 * If the Host issues this command before all the
+	 * HCI_LE_CIS_Established events from the previous use of the
+	 * command have been generated, the Controller shall return the
+	 * error code Command Disallowed (0x0C).
+	 *
+	 * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E
+	 * page 2567:
+	 *
+	 * When the Controller receives the HCI_LE_Create_CIS command, the
+	 * Controller sends the HCI_Command_Status event to the Host. An
+	 * HCI_LE_CIS_Established event will be generated for each CIS when it
+	 * is established or if it is disconnected or considered lost before
+	 * being established; until all the events are generated, the command
+	 * remains pending.
+	 */
+
+	hci_dev_lock(hdev);
+
+	rcu_read_lock();
+
+	/* Wait until previous Create CIS has completed */
+	list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
+		if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags))
+			goto done;
+	}
+
+	/* Find CIG with all CIS ready */
+	list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
+		struct hci_conn *link;
+
+		if (hci_conn_check_create_cis(conn))
+			continue;
+
+		cig = conn->iso_qos.ucast.cig;
+
+		list_for_each_entry_rcu(link, &hdev->conn_hash.list, list) {
+			if (hci_conn_check_create_cis(link) > 0 &&
+			    link->iso_qos.ucast.cig == cig &&
+			    link->state != BT_CONNECTED) {
+				cig = BT_ISO_QOS_CIG_UNSET;
+				break;
+			}
+		}
+
+		if (cig != BT_ISO_QOS_CIG_UNSET)
+			break;
+	}
+
+	if (cig == BT_ISO_QOS_CIG_UNSET)
+		goto done;
+
+	list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) {
+		struct hci_cis *cis = &cmd->cis[aux_num_cis];
+
+		if (hci_conn_check_create_cis(conn) ||
+		    conn->iso_qos.ucast.cig != cig)
+			continue;
+
+		set_bit(HCI_CONN_CREATE_CIS, &conn->flags);
+		cis->acl_handle = cpu_to_le16(conn->parent->handle);
+		cis->cis_handle = cpu_to_le16(conn->handle);
+
+		/* Remember the timeout while the connection is known to be a
+		 * real list entry and the locks are held. Once the loop runs
+		 * to completion the cursor no longer refers to an hci_conn,
+		 * so it must not be dereferenced afterwards.
+		 */
+		timeout = conn->conn_timeout;
+		aux_num_cis++;
+
+		if (aux_num_cis >= cmd->num_cis)
+			break;
+	}
+	cmd->num_cis = aux_num_cis;
+
+done:
+	rcu_read_unlock();
+
+	hci_dev_unlock(hdev);
+
+	if (!aux_num_cis)
+		return 0;
+
+	/* Wait for HCI_LE_CIS_Established */
+	return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_CREATE_CIS,
+					struct_size(cmd, cis, cmd->num_cis),
+					cmd, HCI_EVT_LE_CIS_ESTABLISHED,
+					timeout, NULL);
+}
+
+/* Send HCI LE Remove CIG for the CIG identified by @handle and return the
+ * command status.
+ */
+int hci_le_remove_cig_sync(struct hci_dev *hdev, u8 handle)
+{
+	struct hci_cp_le_remove_cig cmd;
+	int status;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.cig_id = handle;
+
+	status = __hci_cmd_sync_status(hdev, HCI_OP_LE_REMOVE_CIG,
+				       sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+	return status;
+}
+
+/* Send HCI LE BIG Terminate Sync for the BIG identified by @handle and
+ * return the command status.
+ */
+int hci_le_big_terminate_sync(struct hci_dev *hdev, u8 handle)
+{
+	struct hci_cp_le_big_term_sync cmd;
+	int status;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.handle = handle;
+
+	status = __hci_cmd_sync_status(hdev, HCI_OP_LE_BIG_TERM_SYNC,
+				       sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+	return status;
+}
+
+/* Send HCI LE Periodic Advertising Terminate Sync for the sync @handle
+ * (converted to little endian) and return the command status.
+ */
+int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle)
+{
+	struct hci_cp_le_pa_term_sync cmd;
+	int status;
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.handle = cpu_to_le16(handle);
+
+	status = __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_TERM_SYNC,
+				       sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+	return status;
+}
+
+/* Select the own address type and, where one is needed, the random address
+ * to program.
+ *
+ * @require_privacy: fall back to a non-resolvable private address when no
+ *                   RPA is used
+ * @use_rpa:         use a resolvable private address
+ * @adv_instance:    when set, its RPA validity is checked instead of hdev's
+ * @own_addr_type:   output, address type to use
+ * @rand_addr:       output, BDADDR_ANY when no new address was produced
+ *
+ * Returns 0 on success or the negative error from smp_generate_rpa().
+ */
+int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
+			   bool use_rpa, struct adv_info *adv_instance,
+			   u8 *own_addr_type, bdaddr_t *rand_addr)
+{
+	bdaddr_t nrpa;
+	bool still_valid;
+	int err;
+
+	bacpy(rand_addr, BDADDR_ANY);
+
+	/* No privacy at all: hand out the current identity address */
+	if (!use_rpa && !require_privacy) {
+		hci_copy_identity_address(hdev, rand_addr, own_addr_type);
+		return 0;
+	}
+
+	/* Privacy is required but no RPA is used: generate a non-resolvable
+	 * private address, which is useful for non-connectable advertising.
+	 */
+	if (!use_rpa) {
+		/* Six random bytes with the two most significant bits
+		 * cleared; retry for as long as the result matches the
+		 * public address.
+		 */
+		do {
+			get_random_bytes(&nrpa, 6);
+			nrpa.b[5] &= 0x3f;
+		} while (!bacmp(&hdev->bdaddr, &nrpa));
+
+		*own_addr_type = ADDR_LE_DEV_RANDOM;
+		bacpy(rand_addr, &nrpa);
+
+		return 0;
+	}
+
+	/* RPA in use: own address type is 0x03 when the controller supports
+	 * LL Privacy, plain random otherwise.
+	 */
+	if (ll_privacy_capable(hdev))
+		*own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED;
+	else
+		*own_addr_type = ADDR_LE_DEV_RANDOM;
+
+	/* Nothing more to do while the current RPA has not expired */
+	if (adv_instance)
+		still_valid = adv_rpa_valid(adv_instance);
+	else
+		still_valid = rpa_valid(hdev);
+
+	if (still_valid)
+		return 0;
+
+	err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa);
+	if (err < 0) {
+		bt_dev_err(hdev, "failed to generate new RPA");
+		return err;
+	}
+
+	bacpy(rand_addr, &hdev->rpa);
+
+	return 0;
+}
+
+/* Queued work callback: @data carries the advertising instance number
+ * encoded as a pointer.
+ */
+static int _update_adv_data_sync(struct hci_dev *hdev, void *data)
+{
+	return hci_update_adv_data_sync(hdev, (u8)PTR_UINT(data));
+}
+
+/* Queue an advertising data update for @instance; returns the result of
+ * queueing the work.
+ */
+int hci_update_adv_data(struct hci_dev *hdev, u8 instance)
+{
+	void *arg = UINT_PTR(instance);
+
+	return hci_cmd_sync_queue(hdev, _update_adv_data_sync, arg, NULL);
+}
+
+/* Queued work callback that issues HCI Create Connection for the hci_conn
+ * in @data and waits for the Connection Complete event.
+ *
+ * Returns -ECANCELED when the connection is no longer valid, otherwise the
+ * status of the Create Connection command.
+ */
+static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data)
+{
+	struct hci_cp_create_conn cmd;
+	struct inquiry_entry *entry;
+	struct hci_conn *conn = data;
+	int err;
+
+	if (!hci_conn_valid(hdev, conn))
+		return -ECANCELED;
+
+	/* Many controllers disallow HCI Create Connection while an HCI
+	 * Inquiry is running, so the Inquiry is cancelled first. The MGMT
+	 * discovering state may turn false without user space asking for it,
+	 * which is acceptable: the MGMT Discovery APIs do not promise that
+	 * discovery lasts forever, and user space may start it again once it
+	 * sees the state change.
+	 */
+	if (test_bit(HCI_INQUIRY, &hdev->flags)) {
+		err = __hci_cmd_sync_status(hdev, HCI_OP_INQUIRY_CANCEL, 0,
+					    NULL, HCI_CMD_TIMEOUT);
+		if (err)
+			bt_dev_warn(hdev, "Failed to cancel inquiry %d", err);
+	}
+
+	conn->out = true;
+	conn->role = HCI_ROLE_MASTER;
+	conn->state = BT_CONNECT;
+	conn->link_policy = hdev->link_policy;
+	conn->attempt++;
+
+	memset(&cmd, 0, sizeof(cmd));
+	bacpy(&cmd.bdaddr, &conn->dst);
+	cmd.pscan_rep_mode = 0x02;
+	cmd.pkt_type = cpu_to_le16(conn->pkt_type);
+
+	/* Page scan data from a recent enough inquiry cache entry replaces
+	 * the defaults; the device class is copied regardless of the age.
+	 */
+	entry = hci_inquiry_cache_lookup(hdev, &conn->dst);
+	if (entry) {
+		if (inquiry_entry_age(entry) <= INQUIRY_ENTRY_AGE_MAX) {
+			cmd.pscan_rep_mode = entry->data.pscan_rep_mode;
+			cmd.pscan_mode = entry->data.pscan_mode;
+			cmd.clock_offset = entry->data.clock_offset |
+					   cpu_to_le16(0x8000);
+		}
+
+		memcpy(conn->dev_class, entry->data.dev_class, 3);
+	}
+
+	/* Role switch is allowed only when the controller is capable of it
+	 * and HCI_LM_MASTER is not set in the link mode.
+	 */
+	cmd.role_switch = (lmp_rswitch_capable(hdev) &&
+			   !(hdev->link_mode & HCI_LM_MASTER)) ? 0x01 : 0x00;
+
+	return __hci_cmd_sync_status_sk(hdev, HCI_OP_CREATE_CONN,
+					sizeof(cmd), &cmd,
+					HCI_EV_CONN_COMPLETE,
+					conn->conn_timeout, NULL);
+}
+
+/* Queue (at most once) the ACL connection attempt for @conn. */
+int hci_connect_acl_sync(struct hci_dev *hdev, struct hci_conn *conn)
+{
+	int err;
+
+	err = hci_cmd_sync_queue_once(hdev, hci_acl_create_conn_sync, conn,
+				      NULL);
+	return err;
+}
+
+/* Completion callback of hci_le_create_conn_sync(): cleans up after a
+ * successful attempt or fails the connection if it is still the pending one.
+ * A cancelled request (-ECANCELED) is ignored entirely.
+ */
+static void create_le_conn_complete(struct hci_dev *hdev, void *data, int err)
+{
+	struct hci_conn *conn = data;
+
+	bt_dev_dbg(hdev, "err %d", err);
+
+	if (err == -ECANCELED)
+		return;
+
+	hci_dev_lock(hdev);
+
+	if (hci_conn_valid(hdev, conn)) {
+		if (!err) {
+			hci_connect_le_scan_cleanup(conn, 0x00);
+		} else if (conn == hci_lookup_le_connect(hdev)) {
+			/* Still pending: flush so that the create conn cancel
+			 * command gets sent if needed, then fail it.
+			 */
+			flush_delayed_work(&conn->le_conn_timeout);
+			hci_conn_failed(conn, bt_status(err));
+		}
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+/* Queue (at most once) the LE connection attempt for @conn. */
+int hci_connect_le_sync(struct hci_dev *hdev, struct hci_conn *conn)
+{
+	int err;
+
+	err = hci_cmd_sync_queue_once(hdev, hci_le_create_conn_sync, conn,
+				      create_le_conn_complete);
+	return err;
+}
+
+/* Remove a still queued connection attempt for @conn.
+ *
+ * Returns -EINVAL unless the connection is in BT_OPEN, -ENOENT for link
+ * types other than ACL_LINK and LE_LINK, otherwise the logical negation of
+ * hci_cmd_sync_dequeue_once()'s result.
+ */
+int hci_cancel_connect_sync(struct hci_dev *hdev, struct hci_conn *conn)
+{
+	if (conn->state != BT_OPEN)
+		return -EINVAL;
+
+	if (conn->type == ACL_LINK)
+		return !hci_cmd_sync_dequeue_once(hdev,
+						  hci_acl_create_conn_sync,
+						  conn, NULL);
+
+	if (conn->type == LE_LINK)
+		return !hci_cmd_sync_dequeue_once(hdev,
+						  hci_le_create_conn_sync,
+						  conn,
+						  create_le_conn_complete);
+
+	return -ENOENT;
+}
+
+/* Send HCI LE Connection Update for @conn using the intervals, latency and
+ * supervision timeout found in @params; returns the command status.
+ */
+int hci_le_conn_update_sync(struct hci_dev *hdev, struct hci_conn *conn,
+			    struct hci_conn_params *params)
+{
+	struct hci_cp_le_conn_update cmd;
+
+	memset(&cmd, 0, sizeof(cmd));
+
+	cmd.handle = cpu_to_le16(conn->handle);
+
+	/* Connection event length hints are left at zero */
+	cmd.min_ce_len = cpu_to_le16(0x0000);
+	cmd.max_ce_len = cpu_to_le16(0x0000);
+
+	cmd.conn_interval_min = cpu_to_le16(params->conn_min_interval);
+	cmd.conn_interval_max = cpu_to_le16(params->conn_max_interval);
+	cmd.conn_latency = cpu_to_le16(params->conn_latency);
+	cmd.supervision_timeout = cpu_to_le16(params->supervision_timeout);
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_CONN_UPDATE,
+				     sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+}
+
+/* Completion callback of hci_le_pa_create_sync(): clears the pending flag
+ * on the connection and, on failure, reports the error through a temporary
+ * PA_LINK connection. A cancelled request (-ECANCELED) is ignored.
+ */
+static void create_pa_complete(struct hci_dev *hdev, void *data, int err)
+{
+	struct hci_conn *conn = data;
+
+	bt_dev_dbg(hdev, "err %d", err);
+
+	if (err == -ECANCELED)
+		return;
+
+	hci_dev_lock(hdev);
+
+	if (hci_conn_valid(hdev, conn))
+		clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags);
+
+	if (err) {
+		struct hci_conn *failed;
+
+		/* Add connection to indicate PA sync error */
+		failed = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, 0,
+					    HCI_ROLE_SLAVE);
+		if (!IS_ERR(failed)) {
+			set_bit(HCI_CONN_PA_SYNC_FAILED, &failed->flags);
+
+			/* Notify iso layer */
+			hci_connect_cfm(failed, bt_status(err));
+		}
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+/* Program the PAST parameters for the ACL connection @acl from @qos and
+ * then wait, bounded by @conn's connection timeout, for the
+ * HCI_EV_LE_PAST_RECEIVED event.
+ *
+ * Returns the error of the parameters command if it fails, otherwise the
+ * result of waiting for the event.
+ */
+static int hci_le_past_params_sync(struct hci_dev *hdev, struct hci_conn *conn,
+				   struct hci_conn *acl, struct bt_iso_qos *qos)
+{
+	struct hci_cp_le_past_params cmd;
+	int status;
+
+	memset(&cmd, 0, sizeof(cmd));
+
+	/* Mode 0x03: an HCI_LE_Periodic_Advertising_Sync_Transfer_Received
+	 * event is sent to the Host and HCI_LE_Periodic_Advertising_Report
+	 * events are enabled with duplicate filtering enabled.
+	 */
+	cmd.mode = 0x03;
+	cmd.handle = cpu_to_le16(acl->handle);
+	cmd.cte_type = qos->bcast.sync_cte_type;
+	cmd.skip = cpu_to_le16(qos->bcast.skip);
+	cmd.sync_timeout = cpu_to_le16(qos->bcast.sync_timeout);
+
+	/* The command itself finishes with a command complete event, so it
+	 * cannot be the one waiting for HCI_EV_LE_PAST_RECEIVED.
+	 */
+	status = __hci_cmd_sync_status(hdev, HCI_OP_LE_PAST_PARAMS,
+				       sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+	if (status)
+		return status;
+
+	/* Wait for HCI_EV_LE_PAST_RECEIVED event */
+	return __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL,
+					HCI_EV_LE_PAST_RECEIVED,
+					conn->conn_timeout, NULL);
+}
+
+/* Queued work callback that synchronizes to a periodic advertising train
+ * for the hci_conn in @data, through PAST when that is possible and through
+ * LE Periodic Advertising Create Sync otherwise.
+ *
+ * Returns -ECANCELED when the connection is no longer valid, -EINVAL when it
+ * already has a sync handle, -EBUSY when another PA sync is in progress,
+ * otherwise the status of the last step executed.
+ */
+static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data)
+{
+	struct hci_conn *conn = data;
+	struct bt_iso_qos *qos = &conn->iso_qos;
+	struct hci_cp_le_pa_create_sync cmd;
+	struct hci_conn *acl;
+	int err;
+
+	if (!hci_conn_valid(hdev, conn))
+		return -ECANCELED;
+
+	if (conn->sync_handle != HCI_SYNC_HANDLE_INVALID)
+		return -EINVAL;
+
+	if (hci_dev_test_and_set_flag(hdev, HCI_PA_SYNC))
+		return -EBUSY;
+
+	/* With the SID still unknown and scanning enabled, stop the scan so
+	 * that passive scanning is used, with the allow list programmed to
+	 * contain only the connection address.
+	 */
+	if (conn->sid == HCI_SID_INVALID &&
+	    hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
+		hci_scan_disable_sync(hdev);
+		hci_dev_set_flag(hdev, HCI_LE_SCAN_INTERRUPTED);
+		hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+	}
+
+	/* HCI_CONN_CREATE_PA_SYNC lets hci_update_passive_scan_sync program
+	 * the address in the allow list so PA advertisements can be
+	 * received.
+	 */
+	set_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags);
+
+	hci_update_passive_scan_sync(hdev);
+
+	/* PAST is attempted when:
+	 *
+	 * 1. an LE connection with the destination address exists, and
+	 * 2. HCI_CONN_FLAG_PAST is set in its parameters, which indicates
+	 *    that the user really intended to use PAST.
+	 */
+	acl = hci_conn_hash_lookup_le(hdev, &conn->dst, conn->dst_type);
+	if (acl) {
+		struct hci_conn_params *acl_params;
+
+		acl_params = hci_conn_params_lookup(hdev, &acl->dst,
+						    acl->dst_type);
+		if (acl_params && (acl_params->flags & HCI_CONN_FLAG_PAST)) {
+			err = hci_le_past_params_sync(hdev, conn, acl, qos);
+			if (!err)
+				goto done;
+		}
+	}
+
+	/* SID not set yet: listen for HCI_EV_LE_EXT_ADV_REPORT, which is
+	 * expected to update it.
+	 */
+	if (conn->sid == HCI_SID_INVALID) {
+		err = __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL,
+					       HCI_EV_LE_EXT_ADV_REPORT,
+					       conn->conn_timeout, NULL);
+		if (err == -ETIMEDOUT)
+			goto done;
+	}
+
+	memset(&cmd, 0, sizeof(cmd));
+	bacpy(&cmd.addr, &conn->dst);
+	cmd.addr_type = conn->dst_type;
+	cmd.sid = conn->sid;
+	cmd.options = qos->bcast.options;
+	cmd.sync_cte_type = qos->bcast.sync_cte_type;
+	cmd.skip = cpu_to_le16(qos->bcast.skip);
+	cmd.sync_timeout = cpu_to_le16(qos->bcast.sync_timeout);
+
+	/* The spec allows only one pending LE Periodic Advertising Create
+	 * Sync command at a time so we forcefully wait for PA Sync Established
+	 * event since cmd_work can only schedule one command at a time.
+	 *
+	 * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E
+	 * page 2493:
+	 *
+	 * If the Host issues this command when another HCI_LE_Periodic_
+	 * Advertising_Create_Sync command is pending, the Controller shall
+	 * return the error code Command Disallowed (0x0C).
+	 */
+	err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_PA_CREATE_SYNC,
+				       sizeof(cmd), &cmd,
+				       HCI_EV_LE_PA_SYNC_ESTABLISHED,
+				       conn->conn_timeout, NULL);
+
+	/* A timed out request stays pending until it is cancelled */
+	if (err == -ETIMEDOUT)
+		__hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC_CANCEL,
+				      0, NULL, HCI_CMD_TIMEOUT);
+
+done:
+	hci_dev_clear_flag(hdev, HCI_PA_SYNC);
+
+	/* Update passive scan since HCI_PA_SYNC flag has been cleared */
+	hci_update_passive_scan_sync(hdev);
+
+	return err;
+}
+
+/* Queue (at most once) the periodic advertising sync attempt for @conn. */
+int hci_connect_pa_sync(struct hci_dev *hdev, struct hci_conn *conn)
+{
+	int err;
+
+	err = hci_cmd_sync_queue_once(hdev, hci_le_pa_create_sync, conn,
+				      create_pa_complete);
+	return err;
+}
+
+/* Completion callback of hci_le_big_create_sync(): clears the pending flag
+ * on the connection if it is still valid. A cancelled request (-ECANCELED)
+ * is ignored.
+ */
+static void create_big_complete(struct hci_dev *hdev, void *data, int err)
+{
+	struct hci_conn *conn = data;
+
+	bt_dev_dbg(hdev, "err %d", err);
+
+	if (err == -ECANCELED || !hci_conn_valid(hdev, conn))
+		return;
+
+	clear_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags);
+}
+
+/* Queued work callback that issues HCI LE BIG Create Sync for the hci_conn
+ * in @data and waits for the BIG Sync Established event.
+ *
+ * Returns -ECANCELED when the connection is no longer valid, otherwise the
+ * status of the command. On timeout the BIG sync is terminated again.
+ */
+static int hci_le_big_create_sync(struct hci_dev *hdev, void *data)
+{
+	/* Reserve room for 0x1f BIS indexes, the largest Num_BIS value the
+	 * command accepts. With the former capacity of 0x11 the memcpy()
+	 * below could write past the on-stack buffer whenever conn->num_bis
+	 * was larger than that.
+	 * NOTE(review): assumes conn->num_bis is capped at 0x1f by whoever
+	 * fills it in - confirm against the caller.
+	 */
+	DEFINE_FLEX(struct hci_cp_le_big_create_sync, cp, bis, num_bis, 0x1f);
+	struct hci_conn *conn = data;
+	struct bt_iso_qos *qos = &conn->iso_qos;
+	int err;
+
+	if (!hci_conn_valid(hdev, conn))
+		return -ECANCELED;
+
+	set_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags);
+
+	memset(cp, 0, sizeof(*cp));
+	cp->handle = qos->bcast.big;
+	cp->sync_handle = cpu_to_le16(conn->sync_handle);
+	cp->encryption = qos->bcast.encryption;
+	memcpy(cp->bcode, qos->bcast.bcode, sizeof(cp->bcode));
+	cp->mse = qos->bcast.mse;
+	cp->timeout = cpu_to_le16(qos->bcast.timeout);
+	/* num_bis is set before the copy so the count always matches it */
+	cp->num_bis = conn->num_bis;
+	memcpy(cp->bis, conn->bis, conn->num_bis);
+
+	/* The spec allows only one pending LE BIG Create Sync command at
+	 * a time, so we forcefully wait for BIG Sync Established event since
+	 * cmd_work can only schedule one command at a time.
+	 *
+	 * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E
+	 * page 2586:
+	 *
+	 * If the Host sends this command when the Controller is in the
+	 * process of synchronizing to any BIG, i.e. the HCI_LE_BIG_Sync_
+	 * Established event has not been generated, the Controller shall
+	 * return the error code Command Disallowed (0x0C).
+	 */
+	err = __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_BIG_CREATE_SYNC,
+				       struct_size(cp, bis, cp->num_bis), cp,
+				       HCI_EVT_LE_BIG_SYNC_ESTABLISHED,
+				       conn->conn_timeout, NULL);
+	if (err == -ETIMEDOUT)
+		hci_le_big_terminate_sync(hdev, cp->handle);
+
+	return err;
+}
+
+/* Queue (at most once) the BIG sync attempt for @conn. */
+int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn)
+{
+	int err;
+
+	err = hci_cmd_sync_queue_once(hdev, hci_le_big_create_sync, conn,
+				      create_big_complete);
+	return err;
+}
+
+struct past_data { /* argument bundle handed to the queued PAST work callbacks */
+ struct hci_conn *conn; /* supplies iso_qos.bcast.bis or sync_handle for the command */
+ struct hci_conn *le; /* supplies the connection handle placed in the command */
+};
+
+/* Completion callback of the PAST work items: logs the result and releases
+ * the past_data allocated by hci_past_sync().
+ */
+static void past_complete(struct hci_dev *hdev, void *data, int err)
+{
+	bt_dev_dbg(hdev, "err %d", err);
+
+	kfree(data);
+}
+
+/* Queued work callback that sends HCI LE PAST Set Info using the handle of
+ * past->le and the BIS of past->conn.
+ *
+ * Returns -ECANCELED when either connection is no longer valid, otherwise
+ * the command status.
+ */
+static int hci_le_past_set_info_sync(struct hci_dev *hdev, void *data)
+{
+	struct hci_cp_le_past_set_info cmd;
+	struct past_data *past = data;
+	bool usable;
+
+	/* Both connections are read only while the device lock is held */
+	hci_dev_lock(hdev);
+
+	usable = hci_conn_valid(hdev, past->conn) &&
+		 hci_conn_valid(hdev, past->le);
+	if (usable) {
+		memset(&cmd, 0, sizeof(cmd));
+		cmd.handle = cpu_to_le16(past->le->handle);
+		cmd.adv_handle = past->conn->iso_qos.bcast.bis;
+	}
+
+	hci_dev_unlock(hdev);
+
+	if (!usable)
+		return -ECANCELED;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_PAST_SET_INFO,
+				     sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+}
+
+/* Queued work callback that sends HCI LE PAST using the handle of past->le
+ * and the sync handle of past->conn.
+ *
+ * Returns -ECANCELED when either connection is no longer valid, otherwise
+ * the command status.
+ */
+static int hci_le_past_sync(struct hci_dev *hdev, void *data)
+{
+	struct hci_cp_le_past cmd;
+	struct past_data *past = data;
+	bool usable;
+
+	/* Both connections are read only while the device lock is held */
+	hci_dev_lock(hdev);
+
+	usable = hci_conn_valid(hdev, past->conn) &&
+		 hci_conn_valid(hdev, past->le);
+	if (usable) {
+		memset(&cmd, 0, sizeof(cmd));
+		cmd.handle = cpu_to_le16(past->le->handle);
+		cmd.sync_handle = cpu_to_le16(past->conn->sync_handle);
+	}
+
+	hci_dev_unlock(hdev);
+
+	if (!usable)
+		return -ECANCELED;
+
+	return __hci_cmd_sync_status(hdev, HCI_OP_LE_PAST,
+				     sizeof(cmd), &cmd, HCI_CMD_TIMEOUT);
+}
+
+/* Queue a periodic advertising sync transfer of @conn over @le.
+ *
+ * Returns -EINVAL unless @conn is a BIS_LINK or PA_LINK, -EOPNOTSUPP when
+ * the controller cannot act as PAST sender, -ENOMEM on allocation failure,
+ * otherwise the result of queueing the work.
+ */
+int hci_past_sync(struct hci_conn *conn, struct hci_conn *le)
+{
+	int (*work)(struct hci_dev *hdev, void *data);
+	struct past_data *past;
+	int err;
+
+	if (conn->type != BIS_LINK && conn->type != PA_LINK)
+		return -EINVAL;
+
+	if (!past_sender_capable(conn->hdev))
+		return -EOPNOTSUPP;
+
+	past = kmalloc(sizeof(*past), GFP_KERNEL);
+	if (!past)
+		return -ENOMEM;
+
+	past->conn = conn;
+	past->le = le;
+
+	/* The central role uses the Set Info variant, any other role the
+	 * plain PAST command.
+	 */
+	if (conn->role == HCI_ROLE_MASTER)
+		work = hci_le_past_set_info_sync;
+	else
+		work = hci_le_past_sync;
+
+	err = hci_cmd_sync_queue_once(conn->hdev, work, past, past_complete);
+
+	/* past_complete() is expected to free the data only once queued */
+	if (err)
+		kfree(past);
+
+	return err;
+}
+
+/* Completion callback of the remote features work: drops the connection
+ * reference unless the request was cancelled (-ECANCELED).
+ */
+static void le_read_features_complete(struct hci_dev *hdev, void *data, int err)
+{
+	struct hci_conn *conn = data;
+
+	bt_dev_dbg(hdev, "err %d", err);
+
+	if (err != -ECANCELED)
+		hci_conn_drop(conn);
+}
+
+/* Send HCI LE Read All Remote Features for the hci_conn in @data and wait
+ * for the matching completion event; returns the command status.
+ *
+ * The connection is not validated here; the caller visible in this file,
+ * hci_le_read_remote_features_sync(), does so before calling.
+ */
+static int hci_le_read_all_remote_features_sync(struct hci_dev *hdev,
+						void *data)
+{
+	struct hci_conn *conn = data;
+	struct hci_cp_le_read_all_remote_features cp;
+
+	memset(&cp, 0, sizeof(cp));
+	cp.handle = cpu_to_le16(conn->handle);
+	cp.pages = 10; /* Attempt to read all pages */
+
+	/* Wait for HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE event otherwise
+	 * hci_conn_drop may run prematurely causing a disconnection.
+	 */
+	return __hci_cmd_sync_status_sk(hdev,
+					HCI_OP_LE_READ_ALL_REMOTE_FEATURES,
+					sizeof(cp), &cp,
+					HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE,
+					HCI_CMD_TIMEOUT, NULL);
+}
+
+/* Queued work callback that reads the remote LE features of the hci_conn in
+ * @data, using the "all features" command when the controller supports it.
+ *
+ * Returns -ECANCELED when the connection is no longer valid, otherwise the
+ * status of the command that was sent.
+ */
+static int hci_le_read_remote_features_sync(struct hci_dev *hdev, void *data)
+{
+	struct hci_cp_le_read_remote_features cmd;
+	struct hci_conn *conn = data;
+	bool read_all;
+
+	if (!hci_conn_valid(hdev, conn))
+		return -ECANCELED;
+
+	/* Read all features when both the LL Extended Feature Set and
+	 * HCI_OP_LE_READ_ALL_REMOTE_FEATURES are supported.
+	 */
+	read_all = ll_ext_feature_capable(hdev) &&
+		   (hdev->commands[47] & BIT(3));
+	if (read_all)
+		return hci_le_read_all_remote_features_sync(hdev, data);
+
+	memset(&cmd, 0, sizeof(cmd));
+	cmd.handle = cpu_to_le16(conn->handle);
+
+	/* Wait for HCI_EV_LE_REMOTE_FEAT_COMPLETE event otherwise
+	 * hci_conn_drop may run prematurely causing a disconnection.
+	 */
+	return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_READ_REMOTE_FEATURES,
+					sizeof(cmd), &cmd,
+					HCI_EV_LE_REMOTE_FEAT_COMPLETE,
+					HCI_CMD_TIMEOUT, NULL);
+}
+
+/* Queue a read of the remote LE features for @conn.
+ *
+ * Returns -EOPNOTSUPP when the request is not possible in the current role,
+ * otherwise the result of queueing the work. A connection reference is held
+ * for the queued work and released by le_read_features_complete().
+ */
+int hci_le_read_remote_features(struct hci_conn *conn)
+{
+	struct hci_dev *hdev = conn->hdev;
+	int err;
+
+	/* The remote features procedure is defined for central
+	 * role only. So only in case of an initiated connection
+	 * request the remote features.
+	 *
+	 * If the local controller supports peripheral-initiated features
+	 * exchange, then requesting the remote features in peripheral
+	 * role is possible. Otherwise just transition into the
+	 * connected state without requesting the remote features.
+	 */
+	if (!conn->out && !(hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES))
+		return -EOPNOTSUPP;
+
+	err = hci_cmd_sync_queue_once(hdev, hci_le_read_remote_features_sync,
+				      hci_conn_hold(conn),
+				      le_read_features_complete);
+
+	/* The completion callback, which drops the reference taken above,
+	 * presumably only runs for work that was actually queued, so release
+	 * the reference here when queueing failed.
+	 */
+	if (err)
+		hci_conn_drop(conn);
+
+	return err;
+}
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 9874844a95a9..041ce9adc378 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -6,7 +6,9 @@
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
-static struct class *bt_class;
+static const struct class bt_class = {
+ .name = "bluetooth",
+};
static void bt_link_release(struct device *dev)
{
@@ -19,24 +21,14 @@ static const struct device_type bt_link = {
.release = bt_link_release,
};
-/*
- * The rfcomm tty device will possibly retain even when conn
- * is down, and sysfs doesn't support move zombie device,
- * so we should move the device before conn device is destroyed.
- */
-static int __match_tty(struct device *dev, void *data)
-{
- return !strncmp(dev_name(dev), "rfcomm", 6);
-}
-
void hci_conn_init_sysfs(struct hci_conn *conn)
{
struct hci_dev *hdev = conn->hdev;
- BT_DBG("conn %p", conn);
+ bt_dev_dbg(hdev, "conn %p", conn);
conn->dev.type = &bt_link;
- conn->dev.class = bt_class;
+ conn->dev.class = &bt_class;
conn->dev.parent = &hdev->dev;
device_initialize(&conn->dev);
@@ -46,50 +38,80 @@ void hci_conn_add_sysfs(struct hci_conn *conn)
{
struct hci_dev *hdev = conn->hdev;
- BT_DBG("conn %p", conn);
+ bt_dev_dbg(hdev, "conn %p", conn);
+
+ if (device_is_registered(&conn->dev))
+ return;
dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle);
- if (device_add(&conn->dev) < 0) {
+ if (device_add(&conn->dev) < 0)
bt_dev_err(hdev, "failed to register connection device");
- return;
- }
-
- hci_dev_hold(hdev);
}
void hci_conn_del_sysfs(struct hci_conn *conn)
{
struct hci_dev *hdev = conn->hdev;
- if (!device_is_registered(&conn->dev))
+ bt_dev_dbg(hdev, "conn %p", conn);
+
+ if (!device_is_registered(&conn->dev)) {
+ /* If device_add() has *not* succeeded, use *only* put_device()
+ * to drop the reference count.
+ */
+ put_device(&conn->dev);
return;
+ }
+ /* If there are devices using the connection as parent reset it to NULL
+ * before unregistering the device.
+ */
while (1) {
struct device *dev;
- dev = device_find_child(&conn->dev, NULL, __match_tty);
+ dev = device_find_any_child(&conn->dev);
if (!dev)
break;
device_move(dev, NULL, DPM_ORDER_DEV_LAST);
put_device(dev);
}
- device_del(&conn->dev);
-
- hci_dev_put(hdev);
+ device_unregister(&conn->dev);
}
static void bt_host_release(struct device *dev)
{
struct hci_dev *hdev = to_hci_dev(dev);
- kfree(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ hci_release_dev(hdev);
+ else
+ kfree(hdev);
module_put(THIS_MODULE);
}
+static ssize_t reset_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct hci_dev *hdev = to_hci_dev(dev);
+
+ if (hdev->reset)
+ hdev->reset(hdev);
+
+ return count;
+}
+static DEVICE_ATTR_WO(reset);
+
+static struct attribute *bt_host_attrs[] = {
+ &dev_attr_reset.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(bt_host);
+
static const struct device_type bt_host = {
.name = "host",
.release = bt_host_release,
+ .groups = bt_host_groups,
};
void hci_init_sysfs(struct hci_dev *hdev)
@@ -97,7 +119,7 @@ void hci_init_sysfs(struct hci_dev *hdev)
struct device *dev = &hdev->dev;
dev->type = &bt_host;
- dev->class = bt_class;
+ dev->class = &bt_class;
__module_get(THIS_MODULE);
device_initialize(dev);
@@ -105,12 +127,10 @@ void hci_init_sysfs(struct hci_dev *hdev)
int __init bt_sysfs_init(void)
{
- bt_class = class_create(THIS_MODULE, "bluetooth");
-
- return PTR_ERR_OR_ZERO(bt_class);
+ return class_register(&bt_class);
}
void bt_sysfs_cleanup(void)
{
- class_destroy(bt_class);
+ class_unregister(&bt_class);
}
diff --git a/net/bluetooth/hidp/Kconfig b/net/bluetooth/hidp/Kconfig
index bc8610b24077..e08aae35351a 100644
--- a/net/bluetooth/hidp/Kconfig
+++ b/net/bluetooth/hidp/Kconfig
@@ -1,7 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
config BT_HIDP
tristate "HIDP protocol support"
- depends on BT_BREDR && INPUT
- select HID
+ depends on BT_BREDR && HID
help
HIDP (Human Interface Device Protocol) is a transport layer
for HID reports. HIDP is required for the Bluetooth Human
diff --git a/net/bluetooth/hidp/Makefile b/net/bluetooth/hidp/Makefile
index a9ee115696ae..f41b0aa02b23 100644
--- a/net/bluetooth/hidp/Makefile
+++ b/net/bluetooth/hidp/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the Linux Bluetooth HIDP layer
#
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 253975cce943..6724adce615b 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -83,14 +83,14 @@ static void hidp_copy_session(struct hidp_session *session, struct hidp_conninfo
ci->product = session->input->id.product;
ci->version = session->input->id.version;
if (session->input->name)
- strlcpy(ci->name, session->input->name, 128);
+ strscpy(ci->name, session->input->name, 128);
else
- strlcpy(ci->name, "HID Boot Device", 128);
+ strscpy(ci->name, "HID Boot Device", 128);
} else if (session->hid) {
ci->vendor = session->hid->vendor;
ci->product = session->hid->product;
ci->version = session->hid->version;
- strlcpy(ci->name, session->hid->name, 128);
+ strscpy(ci->name, session->hid->name, 128);
}
}
@@ -101,6 +101,7 @@ static int hidp_send_message(struct hidp_session *session, struct socket *sock,
{
struct sk_buff *skb;
struct sock *sk = sock->sk;
+ int ret;
BT_DBG("session %p data %p size %d", session, data, size);
@@ -114,13 +115,17 @@ static int hidp_send_message(struct hidp_session *session, struct socket *sock,
}
skb_put_u8(skb, hdr);
- if (data && size > 0)
+ if (data && size > 0) {
skb_put_data(skb, data, size);
+ ret = size;
+ } else {
+ ret = 0;
+ }
skb_queue_tail(transmit, skb);
wake_up_interruptible(sk_sleep(sk));
- return 0;
+ return ret;
}
static int hidp_send_ctrl_message(struct hidp_session *session,
@@ -262,7 +267,7 @@ static int hidp_get_raw_report(struct hid_device *hid,
set_bit(HIDP_WAITING_FOR_RETURN, &session->flags);
data[0] = report_number;
ret = hidp_send_ctrl_message(session, report_type, data, 1);
- if (ret)
+ if (ret < 0)
goto err;
/* Wait for the return of the report. The returned report
@@ -338,7 +343,7 @@ static int hidp_set_raw_report(struct hid_device *hid, unsigned char reportnum,
data[0] = reportnum;
set_bit(HIDP_WAITING_FOR_SEND_ACK, &session->flags);
ret = hidp_send_ctrl_message(session, report_type, data, count);
- if (ret)
+ if (ret < 0)
goto err;
/* Wait for the ACK from the device. */
@@ -400,7 +405,7 @@ static int hidp_raw_request(struct hid_device *hid, unsigned char reportnum,
static void hidp_idle_timeout(struct timer_list *t)
{
- struct hidp_session *session = from_timer(session, t, timer);
+ struct hidp_session *session = timer_container_of(session, t, timer);
/* The HIDP user-space API only contains calls to add and remove
* devices. There is no way to forward events of any kind. Therefore,
@@ -428,7 +433,7 @@ static void hidp_set_timer(struct hidp_session *session)
static void hidp_del_timer(struct hidp_session *session)
{
if (session->idle_to > 0)
- del_timer(&session->timer);
+ timer_delete_sync(&session->timer);
}
static void hidp_process_report(struct hidp_session *session, int type,
@@ -503,7 +508,7 @@ static int hidp_process_data(struct hidp_session *session, struct sk_buff *skb,
unsigned char param)
{
int done_with_skb = 1;
- BT_DBG("session %p skb %p len %d param 0x%02x", session, skb, skb->len, param);
+ BT_DBG("session %p skb %p len %u param 0x%02x", session, skb, skb->len, param);
switch (param) {
case HIDP_DATA_RTYPE_INPUT:
@@ -548,7 +553,7 @@ static void hidp_recv_ctrl_frame(struct hidp_session *session,
unsigned char hdr, type, param;
int free_skb = 1;
- BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+ BT_DBG("session %p skb %p len %u", session, skb, skb->len);
hdr = skb->data[0];
skb_pull(skb, 1);
@@ -584,7 +589,7 @@ static void hidp_recv_intr_frame(struct hidp_session *session,
{
unsigned char hdr;
- BT_DBG("session %p skb %p len %d", session, skb, skb->len);
+ BT_DBG("session %p skb %p len %u", session, skb, skb->len);
hdr = skb->data[0];
skb_pull(skb, 1);
@@ -649,7 +654,7 @@ static void hidp_process_transmit(struct hidp_session *session,
}
static int hidp_setup_input(struct hidp_session *session,
- struct hidp_connadd_req *req)
+ const struct hidp_connadd_req *req)
{
struct input_dev *input;
int i;
@@ -734,7 +739,7 @@ static void hidp_stop(struct hid_device *hid)
hid->claimed = 0;
}
-struct hid_ll_driver hidp_hid_driver = {
+static const struct hid_ll_driver hidp_hid_driver = {
.parse = hidp_parse,
.start = hidp_start,
.stop = hidp_stop,
@@ -743,12 +748,11 @@ struct hid_ll_driver hidp_hid_driver = {
.raw_request = hidp_raw_request,
.output_report = hidp_output_report,
};
-EXPORT_SYMBOL_GPL(hidp_hid_driver);
/* This function sets up the hid device. It does not add it
to the HID system. That is done in hidp_add_connection(). */
static int hidp_setup_hid(struct hidp_session *session,
- struct hidp_connadd_req *req)
+ const struct hidp_connadd_req *req)
{
struct hid_device *hid;
int err;
@@ -775,7 +779,7 @@ static int hidp_setup_hid(struct hidp_session *session,
hid->version = req->version;
hid->country = req->country;
- strncpy(hid->name, req->name, sizeof(hid->name));
+ strscpy(hid->name, req->name, sizeof(hid->name));
snprintf(hid->phys, sizeof(hid->phys), "%pMR",
&l2cap_pi(session->ctrl_sock->sk)->chan->src);
@@ -789,7 +793,7 @@ static int hidp_setup_hid(struct hidp_session *session,
hid->dev.parent = &session->conn->hcon->dev;
hid->ll_driver = &hidp_hid_driver;
- /* True if device is blacklisted in drivers/hid/hid-quirks.c */
+ /* True if device is blocked in drivers/hid/hid-quirks.c */
if (hid_ignore(hid)) {
hid_destroy_device(session->hid);
session->hid = NULL;
@@ -807,7 +811,7 @@ fault:
/* initialize session devices */
static int hidp_session_dev_init(struct hidp_session *session,
- struct hidp_connadd_req *req)
+ const struct hidp_connadd_req *req)
{
int ret;
@@ -906,7 +910,7 @@ static void hidp_session_dev_work(struct work_struct *work)
static int hidp_session_new(struct hidp_session **out, const bdaddr_t *bdaddr,
struct socket *ctrl_sock,
struct socket *intr_sock,
- struct hidp_connadd_req *req,
+ const struct hidp_connadd_req *req,
struct l2cap_conn *conn)
{
struct hidp_session *session;
@@ -1074,6 +1078,10 @@ static int hidp_session_start_sync(struct hidp_session *session)
static void hidp_session_terminate(struct hidp_session *session)
{
atomic_inc(&session->terminate);
+ /*
+ * See the comment preceding the call to wait_woken()
+ * in hidp_session_run().
+ */
wake_up_interruptible(&hidp_session_wq);
}
@@ -1193,8 +1201,6 @@ static void hidp_session_run(struct hidp_session *session)
* thread is woken up by ->sk_state_changed().
*/
- /* Ensure session->terminate is updated */
- smp_mb__before_atomic();
if (atomic_read(&session->terminate))
break;
@@ -1228,14 +1234,15 @@ static void hidp_session_run(struct hidp_session *session)
hidp_process_transmit(session, &session->ctrl_transmit,
session->ctrl_sock);
+ /*
+ * wait_woken() performs the necessary memory barriers
+ * for us; see the header comment for this primitive.
+ */
wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
}
remove_wait_queue(&hidp_session_wq, &wait);
atomic_inc(&session->terminate);
-
- /* Ensure session->terminate is updated */
- smp_mb__after_atomic();
}
static int hidp_session_wake_function(wait_queue_entry_t *wait,
@@ -1271,7 +1278,7 @@ static int hidp_session_thread(void *arg)
add_wait_queue(sk_sleep(session->intr_sock->sk), &intr_wait);
/* This memory barrier is paired with wq_has_sleeper(). See
* sock_poll_wait() for more information why this is needed. */
- smp_mb();
+ smp_mb__before_atomic();
/* notify synchronous startup that we're ready */
atomic_inc(&session->state);
@@ -1282,7 +1289,7 @@ static int hidp_session_thread(void *arg)
/* cleanup runtime environment */
remove_wait_queue(sk_sleep(session->intr_sock->sk), &intr_wait);
- remove_wait_queue(sk_sleep(session->intr_sock->sk), &ctrl_wait);
+ remove_wait_queue(sk_sleep(session->ctrl_sock->sk), &ctrl_wait);
wake_up_interruptible(&session->report_queue);
hidp_del_timer(session);
@@ -1297,7 +1304,7 @@ static int hidp_session_thread(void *arg)
l2cap_unregister_user(session->conn, &session->user);
hidp_session_put(session);
- module_put_and_exit(0);
+ module_put_and_kthread_exit(0);
return 0;
}
@@ -1335,7 +1342,7 @@ static int hidp_verify_sockets(struct socket *ctrl_sock,
return 0;
}
-int hidp_connection_add(struct hidp_connadd_req *req,
+int hidp_connection_add(const struct hidp_connadd_req *req,
struct socket *ctrl_sock,
struct socket *intr_sock)
{
diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h
index 8798492a6e99..6ef88d0a1919 100644
--- a/net/bluetooth/hidp/hidp.h
+++ b/net/bluetooth/hidp/hidp.h
@@ -122,7 +122,7 @@ struct hidp_connlist_req {
struct hidp_conninfo __user *ci;
};
-int hidp_connection_add(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock);
+int hidp_connection_add(const struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock);
int hidp_connection_del(struct hidp_conndel_req *req);
int hidp_get_connlist(struct hidp_connlist_req *req);
int hidp_get_conninfo(struct hidp_conninfo *ci);
diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c
index 1eaac01f85de..c93aaeb3a3fa 100644
--- a/net/bluetooth/hidp/sock.c
+++ b/net/bluetooth/hidp/sock.c
@@ -20,6 +20,7 @@
SOFTWARE IS DISCLAIMED.
*/
+#include <linux/compat.h>
#include <linux/export.h>
#include <linux/file.h>
@@ -46,9 +47,8 @@ static int hidp_sock_release(struct socket *sock)
return 0;
}
-static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+static int do_hidp_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp)
{
- void __user *argp = (void __user *) arg;
struct hidp_connadd_req ca;
struct hidp_conndel_req cd;
struct hidp_connlist_req cl;
@@ -57,7 +57,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
struct socket *isock;
int err;
- BT_DBG("cmd %x arg %lx", cmd, arg);
+ BT_DBG("cmd %x arg %p", cmd, argp);
switch (cmd) {
case HIDPCONNADD:
@@ -76,6 +76,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
sockfd_put(csock);
return err;
}
+ ca.name[sizeof(ca.name)-1] = 0;
err = hidp_connection_add(&ca, csock, isock);
if (!err && copy_to_user(argp, &ca, sizeof(ca)))
@@ -122,6 +123,11 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long
return -EINVAL;
}
+static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return do_hidp_sock_ioctl(sock, cmd, (void __user *)arg);
+}
+
#ifdef CONFIG_COMPAT
struct compat_hidp_connadd_req {
int ctrl_sock; /* Connected control socket */
@@ -141,13 +147,15 @@ struct compat_hidp_connadd_req {
static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
+ void __user *argp = compat_ptr(arg);
+ int err;
+
if (cmd == HIDPGETCONNLIST) {
struct hidp_connlist_req cl;
+ u32 __user *p = argp;
u32 uci;
- int err;
- if (get_user(cl.cnum, (u32 __user *) arg) ||
- get_user(uci, (u32 __user *) (arg + 4)))
+ if (get_user(cl.cnum, p) || get_user(uci, p + 1))
return -EFAULT;
cl.ci = compat_ptr(uci);
@@ -157,39 +165,55 @@ static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigne
err = hidp_get_connlist(&cl);
- if (!err && put_user(cl.cnum, (u32 __user *) arg))
+ if (!err && put_user(cl.cnum, p))
err = -EFAULT;
return err;
} else if (cmd == HIDPCONNADD) {
- struct compat_hidp_connadd_req ca;
- struct hidp_connadd_req __user *uca;
+ struct compat_hidp_connadd_req ca32;
+ struct hidp_connadd_req ca;
+ struct socket *csock;
+ struct socket *isock;
- uca = compat_alloc_user_space(sizeof(*uca));
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
- if (copy_from_user(&ca, (void __user *) arg, sizeof(ca)))
+ if (copy_from_user(&ca32, (void __user *) arg, sizeof(ca32)))
return -EFAULT;
- if (put_user(ca.ctrl_sock, &uca->ctrl_sock) ||
- put_user(ca.intr_sock, &uca->intr_sock) ||
- put_user(ca.parser, &uca->parser) ||
- put_user(ca.rd_size, &uca->rd_size) ||
- put_user(compat_ptr(ca.rd_data), &uca->rd_data) ||
- put_user(ca.country, &uca->country) ||
- put_user(ca.subclass, &uca->subclass) ||
- put_user(ca.vendor, &uca->vendor) ||
- put_user(ca.product, &uca->product) ||
- put_user(ca.version, &uca->version) ||
- put_user(ca.flags, &uca->flags) ||
- put_user(ca.idle_to, &uca->idle_to) ||
- copy_to_user(&uca->name[0], &ca.name[0], 128))
- return -EFAULT;
+ ca.ctrl_sock = ca32.ctrl_sock;
+ ca.intr_sock = ca32.intr_sock;
+ ca.parser = ca32.parser;
+ ca.rd_size = ca32.rd_size;
+ ca.rd_data = compat_ptr(ca32.rd_data);
+ ca.country = ca32.country;
+ ca.subclass = ca32.subclass;
+ ca.vendor = ca32.vendor;
+ ca.product = ca32.product;
+ ca.version = ca32.version;
+ ca.flags = ca32.flags;
+ ca.idle_to = ca32.idle_to;
+ ca32.name[sizeof(ca32.name) - 1] = '\0';
+ memcpy(ca.name, ca32.name, 128);
+
+ csock = sockfd_lookup(ca.ctrl_sock, &err);
+ if (!csock)
+ return err;
+
+ isock = sockfd_lookup(ca.intr_sock, &err);
+ if (!isock) {
+ sockfd_put(csock);
+ return err;
+ }
- arg = (unsigned long) uca;
+ err = hidp_connection_add(&ca, csock, isock);
+ if (!err && copy_to_user(argp, &ca32, sizeof(ca32)))
+ err = -EFAULT;
- /* Fall through. We don't actually write back any _changes_
- to the structure anyway, so there's no need to copy back
- into the original compat version */
+ sockfd_put(csock);
+ sockfd_put(isock);
+
+ return err;
}
return hidp_sock_ioctl(sock, cmd, arg);
@@ -210,8 +234,6 @@ static const struct proto_ops hidp_sock_ops = {
.recvmsg = sock_no_recvmsg,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
- .setsockopt = sock_no_setsockopt,
- .getsockopt = sock_no_getsockopt,
.connect = sock_no_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
@@ -234,21 +256,13 @@ static int hidp_sock_create(struct net *net, struct socket *sock, int protocol,
if (sock->type != SOCK_RAW)
return -ESOCKTNOSUPPORT;
- sk = sk_alloc(net, PF_BLUETOOTH, GFP_ATOMIC, &hidp_proto, kern);
+ sk = bt_sock_alloc(net, sock, &hidp_proto, protocol, GFP_ATOMIC, kern);
if (!sk)
return -ENOMEM;
- sock_init_data(sock, sk);
-
sock->ops = &hidp_sock_ops;
-
sock->state = SS_UNCONNECTED;
- sock_reset_flag(sk, SOCK_ZAPPED);
-
- sk->sk_protocol = protocol;
- sk->sk_state = BT_OPEN;
-
bt_sock_link(&hidp_sk_list, sk);
return 0;
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
new file mode 100644
index 000000000000..e36d24a9098b
--- /dev/null
+++ b/net/bluetooth/iso.c
@@ -0,0 +1,2734 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BlueZ - Bluetooth protocol stack for Linux
+ *
+ * Copyright (C) 2022 Intel Corporation
+ * Copyright 2023-2024 NXP
+ */
+
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/sched/signal.h>
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/iso.h>
+#include "eir.h"
+
+static const struct proto_ops iso_sock_ops;
+/* List of ISO sockets: iso_sock_alloc() links into it, lookups walk it under .lock. */
+static struct bt_sock_list iso_sk_list = {
+ .lock = __RW_LOCK_UNLOCKED(iso_sk_list.lock)
+};
+
+/* ---- ISO connections ---- */
+struct iso_conn {
+ struct hci_conn *hcon; /* backing HCI connection; set to NULL by iso_sock_disconn() */
+
+ /* @lock: spinlock protecting changes to iso_conn fields */
+ spinlock_t lock;
+ struct sock *sk; /* socket attached by __iso_chan_add(), or NULL */
+
+ struct delayed_work timeout_work; /* handler is iso_sock_timeout() */
+
+ struct sk_buff *rx_skb; /* released with kfree_skb() in iso_conn_free() */
+ __u32 rx_len;
+ __u16 tx_sn; /* sequence number written into outgoing ISO data headers */
+ struct kref ref; /* dropped via iso_conn_put(); release is iso_conn_free() */
+};
+/* Take / release the spinlock embedded in a struct iso_conn. */
+#define iso_conn_lock(c) spin_lock(&(c)->lock)
+#define iso_conn_unlock(c) spin_unlock(&(c)->lock)
+
+static void iso_sock_close(struct sock *sk);
+static void iso_sock_kill(struct sock *sk);
+
+/* ----- ISO socket info ----- */
+#define iso_pi(sk) ((struct iso_pinfo *)(sk)) /* view a struct sock * as its iso_pinfo; arg parenthesised */
+
+#define EIR_SERVICE_DATA_LENGTH 4
+#define BASE_MAX_LENGTH (HCI_MAX_PER_AD_LENGTH - EIR_SERVICE_DATA_LENGTH) /* sizes iso_pinfo.base[] */
+#define EIR_BAA_SERVICE_UUID 0x1851
+
+/* iso_pinfo flags values: bit numbers for iso_pinfo.flags (tested with test_bit()) */
+enum {
+ BT_SK_BIG_SYNC,
+ BT_SK_PA_SYNC,
+};
+
+struct iso_pinfo {
+ struct bt_sock bt; /* NOTE(review): presumably must stay first since iso_pi() casts struct sock * - confirm */
+ bdaddr_t src;
+ __u8 src_type; /* iso_sock_alloc() defaults this to BDADDR_LE_PUBLIC */
+ bdaddr_t dst;
+ __u8 dst_type;
+ __u8 bc_sid; /* iso_sock_bind_bc() accepts 0x00-0x0f or HCI_SID_INVALID */
+ __u8 bc_num_bis; /* number of entries copied into bc_bis[] */
+ __u8 bc_bis[ISO_MAX_NUM_BIS]; /* entries are validated to 0x01-0x1f before being stored */
+ __u16 sync_handle; /* iso_sock_alloc() initialises this to -1 */
+ unsigned long flags; /* BT_SK_BIG_SYNC / BT_SK_PA_SYNC bits */
+ struct bt_iso_qos qos;
+ bool qos_user_set; /* when set, the connect paths validate qos before use */
+ __u8 base_len;
+ __u8 base[BASE_MAX_LENGTH];
+ struct iso_conn *conn; /* attached connection, or NULL */
+};
+
+static struct bt_iso_qos default_qos;
+
+static bool check_ucast_qos(struct bt_iso_qos *qos);
+static bool check_bcast_qos(struct bt_iso_qos *qos);
+static bool iso_match_sid(struct sock *sk, void *data);
+static bool iso_match_sid_past(struct sock *sk, void *data);
+static bool iso_match_sync_handle(struct sock *sk, void *data);
+static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data);
+static void iso_sock_disconn(struct sock *sk);
+
+typedef bool (*iso_sock_match_t)(struct sock *sk, void *data);
+
+static struct sock *iso_get_sock(struct hci_dev *hdev, bdaddr_t *src,
+ bdaddr_t *dst, enum bt_sock_state state,
+ iso_sock_match_t match, void *data);
+
+/* ---- ISO timers ---- */
+#define ISO_CONN_TIMEOUT secs_to_jiffies(20) /* default sk_sndtimeo, see iso_sock_alloc() */
+#define ISO_DISCONN_TIMEOUT secs_to_jiffies(2)
+/* kref release callback for struct iso_conn: unhook it from sk and hcon, then free it. */
+static void iso_conn_free(struct kref *ref)
+{
+ struct iso_conn *conn = container_of(ref, struct iso_conn, ref);
+
+ BT_DBG("conn %p", conn);
+
+ if (conn->sk)
+ iso_pi(conn->sk)->conn = NULL; /* the socket must not keep a pointer to freed memory */
+
+ if (conn->hcon) {
+ conn->hcon->iso_data = NULL;
+ hci_conn_drop(conn->hcon);
+ }
+
+ /* Ensure no more work items will run since hci_conn has been dropped */
+ disable_delayed_work_sync(&conn->timeout_work);
+
+ kfree_skb(conn->rx_skb);
+
+ kfree(conn);
+}
+/* Drop one reference on @conn (NULL is a no-op); the final put runs iso_conn_free(). */
+static void iso_conn_put(struct iso_conn *conn)
+{
+	if (!conn)
+		return;
+	/* kref_read() returns unsigned int: print with %u like iso_conn_hold_unless_zero() */
+	BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref));
+
+	kref_put(&conn->ref, iso_conn_free);
+}
+/* Take a reference on @conn. Returns @conn on success, or NULL when @conn is
+ * NULL or its refcount has already dropped to zero.
+ */
+static struct iso_conn *iso_conn_hold_unless_zero(struct iso_conn *conn)
+{
+	if (!conn)
+		return NULL;
+
+	BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref));
+
+	/* A failed get means the release path already owns this conn. */
+	return kref_get_unless_zero(&conn->ref) ? conn : NULL;
+}
+/* Return conn->sk with a reference held; NULL if @conn is NULL or bt_sock_linked() rejects it. */
+static struct sock *iso_sock_hold(struct iso_conn *conn)
+{
+ if (!conn || !bt_sock_linked(&iso_sk_list, conn->sk))
+ return NULL;
+
+ sock_hold(conn->sk); /* callers in this file release it with sock_put() */
+
+ return conn->sk;
+}
+/* Work handler for conn->timeout_work: report ETIMEDOUT on the socket attached to the conn. */
+static void iso_sock_timeout(struct work_struct *work)
+{
+ struct iso_conn *conn = container_of(work, struct iso_conn,
+ timeout_work.work);
+ struct sock *sk;
+
+ conn = iso_conn_hold_unless_zero(conn);
+ if (!conn)
+ return;
+
+ iso_conn_lock(conn);
+ sk = iso_sock_hold(conn); /* pin the socket before the conn lock is released */
+ iso_conn_unlock(conn);
+ iso_conn_put(conn); /* pairs with iso_conn_hold_unless_zero() above */
+
+ if (!sk)
+ return;
+
+ BT_DBG("sock %p state %d", sk, sk->sk_state);
+
+ lock_sock(sk);
+ sk->sk_err = ETIMEDOUT;
+ sk->sk_state_change(sk);
+ release_sock(sk);
+ sock_put(sk); /* pairs with iso_sock_hold() above */
+}
+/* Re-arm conn->timeout_work to run after @timeout; no-op when @sk has no conn attached. */
+static void iso_sock_set_timer(struct sock *sk, long timeout)
+{
+ if (!iso_pi(sk)->conn)
+ return;
+
+ BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout);
+ cancel_delayed_work(&iso_pi(sk)->conn->timeout_work); /* cancel any pending run before rescheduling */
+ schedule_delayed_work(&iso_pi(sk)->conn->timeout_work, timeout);
+}
+/* Cancel a pending conn->timeout_work; no-op when @sk has no conn attached. */
+static void iso_sock_clear_timer(struct sock *sk)
+{
+ if (!iso_pi(sk)->conn)
+ return;
+
+ BT_DBG("sock %p state %d", sk, sk->sk_state);
+ cancel_delayed_work(&iso_pi(sk)->conn->timeout_work);
+}
+/* ---- ISO connections ---- */
+/* Return the iso_conn for @hcon: reuse hcon->iso_data or allocate one (NULL if kzalloc fails). */
+static struct iso_conn *iso_conn_add(struct hci_conn *hcon)
+{
+ struct iso_conn *conn = hcon->iso_data;
+
+ conn = iso_conn_hold_unless_zero(conn);
+ if (conn) {
+ if (!conn->hcon) {
+ iso_conn_lock(conn);
+ conn->hcon = hcon;
+ iso_conn_unlock(conn);
+ }
+ iso_conn_put(conn); /* temporary reference only: the reuse path hands out no extra ref */
+ return conn;
+ }
+
+ conn = kzalloc(sizeof(*conn), GFP_KERNEL);
+ if (!conn)
+ return NULL;
+
+ kref_init(&conn->ref);
+ spin_lock_init(&conn->lock);
+ INIT_DELAYED_WORK(&conn->timeout_work, iso_sock_timeout);
+
+ hcon->iso_data = conn;
+ conn->hcon = hcon;
+ conn->tx_sn = 0;
+
+ BT_DBG("hcon %p conn %p", hcon, conn);
+
+ return conn;
+}
+
+/* Delete channel (detach sk from its conn, close with err). Must be called on the locked socket. */
+static void iso_chan_del(struct sock *sk, int err)
+{
+ struct iso_conn *conn;
+ struct sock *parent;
+
+ conn = iso_pi(sk)->conn;
+ iso_pi(sk)->conn = NULL;
+
+ BT_DBG("sk %p, conn %p, err %d", sk, conn, err);
+
+ if (conn) {
+ iso_conn_lock(conn);
+ conn->sk = NULL;
+ iso_conn_unlock(conn);
+ iso_conn_put(conn); /* NOTE(review): presumably the socket's own reference on conn - confirm */
+ }
+
+ sk->sk_state = BT_CLOSED;
+ sk->sk_err = err;
+
+ parent = bt_sk(sk)->parent;
+ if (parent) {
+ bt_accept_unlink(sk);
+ parent->sk_data_ready(parent); /* still on a parent's accept queue: notify the parent */
+ } else {
+ sk->sk_state_change(sk);
+ }
+
+ sock_set_flag(sk, SOCK_ZAPPED); /* iso_sock_kill() only acts on zapped sockets */
+}
+/* Tear down the iso_conn of @hcon: close the attached socket with @err, if there is one. */
+static void iso_conn_del(struct hci_conn *hcon, int err)
+{
+ struct iso_conn *conn = hcon->iso_data;
+ struct sock *sk;
+
+ conn = iso_conn_hold_unless_zero(conn);
+ if (!conn)
+ return;
+
+ BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
+
+ /* Kill socket */
+ iso_conn_lock(conn);
+ sk = iso_sock_hold(conn);
+ iso_conn_unlock(conn);
+ iso_conn_put(conn); /* pairs with iso_conn_hold_unless_zero() above */
+
+ if (!sk) {
+ iso_conn_put(conn); /* NOTE(review): second put, presumably the ref held via hcon->iso_data - confirm */
+ return;
+ }
+
+ lock_sock(sk);
+ iso_sock_clear_timer(sk);
+ iso_chan_del(sk, err);
+ release_sock(sk);
+ sock_put(sk); /* pairs with iso_sock_hold() above */
+}
+/* Attach @sk to @conn; returns 0, or -EBUSY if another socket is attached. iso_chan_add() holds the conn lock. */
+static int __iso_chan_add(struct iso_conn *conn, struct sock *sk,
+ struct sock *parent)
+{
+ BT_DBG("conn %p", conn);
+
+ if (iso_pi(sk)->conn == conn && conn->sk == sk)
+ return 0; /* already attached to each other: nothing to do */
+
+ if (conn->sk) {
+ BT_ERR("conn->sk already set");
+ return -EBUSY;
+ }
+
+ iso_pi(sk)->conn = conn;
+ conn->sk = sk;
+
+ if (parent)
+ bt_accept_enqueue(parent, sk, true);
+
+ return 0;
+}
+/* Locked wrapper: run __iso_chan_add() with the conn spinlock held and return its result. */
+static int iso_chan_add(struct iso_conn *conn, struct sock *sk,
+ struct sock *parent)
+{
+ int err;
+
+ iso_conn_lock(conn);
+ err = __iso_chan_add(conn, sk, parent);
+ iso_conn_unlock(conn);
+
+ return err;
+}
+/* Map a socket BDADDR_LE_* address type to the HCI ADDR_LE_DEV_* constant;
+ * any value other than BDADDR_LE_PUBLIC is treated as random.
+ */
+static inline u8 le_addr_type(u8 bdaddr_type)
+{
+	return bdaddr_type == BDADDR_LE_PUBLIC ? ADDR_LE_DEV_PUBLIC :
+						 ADDR_LE_DEV_RANDOM;
+}
+/* Set up a BIS hcon for @sk (bind only if BT_SK_DEFER_SETUP) and attach @sk to its iso_conn; 0 or -errno. */
+static int iso_connect_bis(struct sock *sk)
+{
+ struct iso_conn *conn;
+ struct hci_conn *hcon;
+ struct hci_dev *hdev;
+ int err;
+
+ BT_DBG("%pMR (SID 0x%2.2x)", &iso_pi(sk)->src, iso_pi(sk)->bc_sid);
+
+ hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src,
+ iso_pi(sk)->src_type);
+ if (!hdev)
+ return -EHOSTUNREACH;
+
+ hci_dev_lock(hdev);
+
+ if (!bis_capable(hdev)) {
+ err = -EOPNOTSUPP;
+ goto unlock;
+ }
+
+ /* Fail if user set invalid QoS */
+ if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) {
+ iso_pi(sk)->qos = default_qos; /* the rejected QoS is replaced by the defaults */
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ /* Fail if out PHYs are marked as disabled */
+ if (!iso_pi(sk)->qos.bcast.out.phy) {
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ /* Just bind if DEFER_SETUP has been set */
+ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+ hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid,
+ &iso_pi(sk)->qos, iso_pi(sk)->base_len,
+ iso_pi(sk)->base,
+ READ_ONCE(sk->sk_sndtimeo));
+ if (IS_ERR(hcon)) {
+ err = PTR_ERR(hcon);
+ goto unlock;
+ }
+ } else {
+ hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst,
+ le_addr_type(iso_pi(sk)->dst_type),
+ iso_pi(sk)->bc_sid, &iso_pi(sk)->qos,
+ iso_pi(sk)->base_len, iso_pi(sk)->base,
+ READ_ONCE(sk->sk_sndtimeo));
+ if (IS_ERR(hcon)) {
+ err = PTR_ERR(hcon);
+ goto unlock;
+ }
+
+ /* Update SID if it was not set */
+ if (iso_pi(sk)->bc_sid == HCI_SID_INVALID)
+ iso_pi(sk)->bc_sid = hcon->sid;
+ }
+
+ conn = iso_conn_add(hcon);
+ if (!conn) {
+ hci_conn_drop(hcon);
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ lock_sock(sk); /* taken while hci_dev_lock is already held */
+
+ err = iso_chan_add(conn, sk, NULL);
+ if (err) {
+ release_sock(sk);
+ goto unlock;
+ }
+
+ /* Update source addr of the socket */
+ bacpy(&iso_pi(sk)->src, &hcon->src);
+
+ if (hcon->state == BT_CONNECTED) {
+ iso_sock_clear_timer(sk);
+ sk->sk_state = BT_CONNECTED;
+ } else if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+ iso_sock_clear_timer(sk);
+ sk->sk_state = BT_CONNECT;
+ } else {
+ sk->sk_state = BT_CONNECT;
+ iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo)); /* only this path arms the timeout */
+ }
+
+ release_sock(sk);
+
+unlock:
+ hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
+ return err;
+}
+/* Set up a CIS hcon for @sk (bind only if BT_SK_DEFER_SETUP) and attach @sk to its iso_conn; 0 or -errno. */
+static int iso_connect_cis(struct sock *sk)
+{
+ struct iso_conn *conn;
+ struct hci_conn *hcon;
+ struct hci_dev *hdev;
+ int err;
+
+ BT_DBG("%pMR -> %pMR", &iso_pi(sk)->src, &iso_pi(sk)->dst);
+
+ hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src,
+ iso_pi(sk)->src_type);
+ if (!hdev)
+ return -EHOSTUNREACH;
+
+ hci_dev_lock(hdev);
+
+ if (!cis_central_capable(hdev)) {
+ err = -EOPNOTSUPP;
+ goto unlock;
+ }
+
+ /* Fail if user set invalid QoS */
+ if (iso_pi(sk)->qos_user_set && !check_ucast_qos(&iso_pi(sk)->qos)) {
+ iso_pi(sk)->qos = default_qos; /* the rejected QoS is replaced by the defaults */
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ /* Fail only if both the in and out PHYs are marked as disabled */
+ if (!iso_pi(sk)->qos.ucast.in.phy && !iso_pi(sk)->qos.ucast.out.phy) {
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ /* Check if there are available buffers for output/TX. */
+ if (iso_pi(sk)->qos.ucast.out.sdu && !hci_iso_count(hdev) &&
+ (hdev->iso_pkts && !hdev->iso_cnt)) {
+ err = -ENOBUFS;
+ goto unlock;
+ }
+
+ /* Just bind if DEFER_SETUP has been set */
+ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+ hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst,
+ le_addr_type(iso_pi(sk)->dst_type),
+ &iso_pi(sk)->qos,
+ READ_ONCE(sk->sk_sndtimeo));
+ if (IS_ERR(hcon)) {
+ err = PTR_ERR(hcon);
+ goto unlock;
+ }
+ } else {
+ hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst,
+ le_addr_type(iso_pi(sk)->dst_type),
+ &iso_pi(sk)->qos,
+ READ_ONCE(sk->sk_sndtimeo));
+ if (IS_ERR(hcon)) {
+ err = PTR_ERR(hcon);
+ goto unlock;
+ }
+ }
+
+ conn = iso_conn_add(hcon);
+ if (!conn) {
+ hci_conn_drop(hcon);
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ lock_sock(sk); /* taken while hci_dev_lock is already held */
+
+ err = iso_chan_add(conn, sk, NULL);
+ if (err) {
+ release_sock(sk);
+ goto unlock;
+ }
+
+ /* Update source addr of the socket */
+ bacpy(&iso_pi(sk)->src, &hcon->src);
+
+ if (hcon->state == BT_CONNECTED) {
+ iso_sock_clear_timer(sk);
+ sk->sk_state = BT_CONNECTED;
+ } else if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+ iso_sock_clear_timer(sk);
+ sk->sk_state = BT_CONNECT;
+ } else {
+ sk->sk_state = BT_CONNECT;
+ iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo)); /* only this path arms the timeout */
+ }
+
+ release_sock(sk);
+
+unlock:
+ hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
+ return err;
+}
+/* QoS in effect for @sk: the hcon's copy in BT_CONNECTED/BT_CONNECT2, else the socket's own. */
+static struct bt_iso_qos *iso_sock_get_qos(struct sock *sk)
+{
+	bool live = sk->sk_state == BT_CONNECTED || sk->sk_state == BT_CONNECT2;
+
+	/* conn->hcon is only dereferenced in the two states checked above */
+	return live ? &iso_pi(sk)->conn->hcon->iso_qos : &iso_pi(sk)->qos;
+}
+/* Prepend the ISO data header and send @skb; returns payload length, -EMSGSIZE or -ENOTCONN. */
+static int iso_send_frame(struct sock *sk, struct sk_buff *skb,
+			  const struct sockcm_cookie *sockc)
+{
+	struct iso_conn *conn = iso_pi(sk)->conn;
+	struct bt_iso_qos *qos = iso_sock_get_qos(sk);
+	struct hci_iso_data_hdr *hdr;
+	int len = 0;
+
+	/* skb->len is unsigned int, so it is printed with %u */
+	BT_DBG("sk %p len %u", sk, skb->len);
+
+	if (skb->len > qos->ucast.out.sdu)
+		return -EMSGSIZE;
+
+	len = skb->len;
+
+	/* Push ISO data header */
+	hdr = skb_push(skb, HCI_ISO_DATA_HDR_SIZE);
+	hdr->sn = cpu_to_le16(conn->tx_sn++);
+	hdr->slen = cpu_to_le16(hci_iso_data_len_pack(len, HCI_ISO_STATUS_VALID));
+
+	if (sk->sk_state == BT_CONNECTED) {
+		hci_setup_tx_timestamp(skb, 1, sockc);
+		hci_send_iso(conn->hcon, skb);
+	} else {
+		len = -ENOTCONN;
+	}
+
+	return len;
+}
+/* Queue @skb on the socket attached to @conn; the skb is freed if it cannot be delivered. */
+static void iso_recv_frame(struct iso_conn *conn, struct sk_buff *skb)
+{
+	struct sock *sk;
+
+	iso_conn_lock(conn);
+	sk = conn->sk;
+	iso_conn_unlock(conn);
+
+	if (!sk)
+		goto drop;
+	/* skb->len is unsigned int, so it is printed with %u */
+	BT_DBG("sk %p len %u", sk, skb->len);
+
+	if (sk->sk_state != BT_CONNECTED)
+		goto drop;
+
+	if (!sock_queue_rcv_skb(sk, skb))
+		return;
+
+drop:
+	kfree_skb(skb);
+}
+
+/* -------- Socket interface ---------- */
+/* Find a BT_LISTEN socket whose dst and src both equal the given addresses.
+ * Walks iso_sk_list without taking its lock or a socket reference itself.
+ */
+static struct sock *__iso_get_sock_listen_by_addr(bdaddr_t *src, bdaddr_t *dst)
+{
+	struct sock *sk;
+
+	sk_for_each(sk, &iso_sk_list.head) {
+		struct iso_pinfo *pi = iso_pi(sk);
+
+		if (sk->sk_state == BT_LISTEN && !bacmp(&pi->dst, dst) &&
+		    !bacmp(&pi->src, src))
+			return sk;
+	}
+
+	return NULL;
+}
+/* Find a BT_LISTEN socket with src @ba, dst @bc and SID @sid; takes no lock or reference itself. */
+static struct sock *__iso_get_sock_listen_by_sid(bdaddr_t *ba, bdaddr_t *bc,
+						 __u8 sid)
+{
+	struct sock *sk;
+
+	sk_for_each(sk, &iso_sk_list.head) {
+		struct iso_pinfo *pi = iso_pi(sk);
+
+		if (sk->sk_state != BT_LISTEN)
+			continue;
+
+		/* Both addresses must match before the SID is compared. */
+		if (bacmp(&pi->src, ba) || bacmp(&pi->dst, bc))
+			continue;
+
+		if (pi->bc_sid == sid)
+			return sk;
+	}
+
+	return NULL;
+}
+
+/* Find socket in given state:
+ * source bdaddr (Unicast)
+ * destination bdaddr (Broadcast only)
+ * match func - pass NULL to ignore
+ * match func data - pass -1 to ignore
+ * Returns closest match with a reference held (sock_hold), or NULL.
+ */
+static struct sock *iso_get_sock(struct hci_dev *hdev, bdaddr_t *src,
+ bdaddr_t *dst, enum bt_sock_state state,
+ iso_sock_match_t match, void *data)
+{
+ struct sock *sk = NULL, *sk1 = NULL;
+
+ read_lock(&iso_sk_list.lock);
+
+ sk_for_each(sk, &iso_sk_list.head) {
+ if (sk->sk_state != state)
+ continue;
+
+ /* Match Broadcast destination */
+ if (bacmp(dst, BDADDR_ANY) && bacmp(&iso_pi(sk)->dst, dst)) {
+ struct smp_irk *irk1, *irk2;
+
+ /* Check if destination is an RPA that we can resolve */
+ irk1 = hci_find_irk_by_rpa(hdev, dst);
+ if (!irk1)
+ continue;
+
+ /* Match with identity address */
+ if (bacmp(&iso_pi(sk)->dst, &irk1->bdaddr)) {
+ /* Check if socket destination address is also
+ * an RPA and if the IRK matches.
+ */
+ irk2 = hci_find_irk_by_rpa(hdev,
+ &iso_pi(sk)->dst);
+ if (!irk2 || irk1 != irk2)
+ continue;
+ }
+ }
+
+ /* Use Match function if provided */
+ if (match && !match(sk, data))
+ continue;
+
+ /* Exact match. */
+ if (!bacmp(&iso_pi(sk)->src, src)) {
+ sock_hold(sk);
+ break;
+ }
+
+ /* Closest match */
+ if (!bacmp(&iso_pi(sk)->src, BDADDR_ANY)) {
+ if (sk1)
+ sock_put(sk1); /* a later wildcard match replaces the earlier one */
+
+ sk1 = sk;
+ sock_hold(sk1);
+ }
+ }
+
+ if (sk && sk1) /* relies on sk being NULL here unless the loop broke on an exact match */
+ sock_put(sk1);
+
+ read_unlock(&iso_sk_list.lock);
+
+ return sk ? sk : sk1;
+}
+/* Find another socket (not @match_sk) in BT_CONNECTED/BT_CONNECT with the same src, dst and BIG. */
+static struct sock *iso_get_sock_big(struct sock *match_sk, bdaddr_t *src,
+ bdaddr_t *dst, uint8_t big)
+{
+ struct sock *sk = NULL;
+
+ read_lock(&iso_sk_list.lock);
+
+ sk_for_each(sk, &iso_sk_list.head) {
+ if (match_sk == sk)
+ continue;
+
+ /* Look for sockets that have already been
+ * connected to the BIG
+ */
+ if (sk->sk_state != BT_CONNECTED &&
+ sk->sk_state != BT_CONNECT)
+ continue;
+
+ /* Match Broadcast destination */
+ if (bacmp(&iso_pi(sk)->dst, dst))
+ continue;
+
+ /* Match BIG handle */
+ if (iso_pi(sk)->qos.bcast.big != big)
+ continue;
+
+ /* Match source address */
+ if (bacmp(&iso_pi(sk)->src, src))
+ continue;
+
+ sock_hold(sk); /* the match is returned with a reference held */
+ break;
+ }
+
+ read_unlock(&iso_sk_list.lock);
+
+ return sk; /* NOTE(review): relies on sk_for_each() leaving sk NULL when nothing matched - confirm */
+}
+/* sk_destruct hook installed by iso_sock_alloc(): drop the conn reference and purge both queues. */
+static void iso_sock_destruct(struct sock *sk)
+{
+ BT_DBG("sk %p", sk);
+
+ iso_conn_put(iso_pi(sk)->conn); /* iso_conn_put() tolerates a NULL conn */
+
+ skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge(&sk->sk_write_queue);
+}
+/* Tear down a listening socket: close queued children, then disconnect or mark it closed. */
+static void iso_sock_cleanup_listen(struct sock *parent)
+{
+ struct sock *sk;
+
+ BT_DBG("parent %p", parent);
+
+ /* Close not yet accepted channels */
+ while ((sk = bt_accept_dequeue(parent, NULL))) {
+ iso_sock_close(sk);
+ iso_sock_kill(sk);
+ }
+
+ /* If listening socket has a hcon, properly disconnect it */
+ if (iso_pi(parent)->conn && iso_pi(parent)->conn->hcon) {
+ iso_sock_disconn(parent);
+ return; /* state and SOCK_ZAPPED are left to the disconnect path */
+ }
+
+ parent->sk_state = BT_CLOSED;
+ sock_set_flag(parent, SOCK_ZAPPED);
+}
+
+/* Kill socket (only if zapped and orphan)
+ * Must be called on unlocked socket.
+ */
+static void iso_sock_kill(struct sock *sk)
+{
+ if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket ||
+ sock_flag(sk, SOCK_DEAD)) /* not zapped, still owned by a socket, or already dead */
+ return;
+
+ BT_DBG("sk %p state %d", sk, sk->sk_state);
+
+ /* Sock is dead, so set conn->sk to NULL to avoid possible UAF */
+ if (iso_pi(sk)->conn) {
+ iso_conn_lock(iso_pi(sk)->conn);
+ iso_pi(sk)->conn->sk = NULL;
+ iso_conn_unlock(iso_pi(sk)->conn);
+ }
+
+ /* Kill poor orphan */
+ bt_sock_unlink(&iso_sk_list, sk);
+ sock_set_flag(sk, SOCK_DEAD);
+ sock_put(sk);
+}
+/* Detach @sk from its hcon; dereferences conn->hcon unconditionally, so callers check it first. */
+static void iso_sock_disconn(struct sock *sk)
+{
+ struct sock *bis_sk;
+ struct hci_conn *hcon = iso_pi(sk)->conn->hcon;
+
+ if (test_bit(HCI_CONN_BIG_CREATED, &hcon->flags)) {
+ bis_sk = iso_get_sock_big(sk, &iso_pi(sk)->src,
+ &iso_pi(sk)->dst,
+ iso_pi(sk)->qos.bcast.big);
+
+ /* If there are any other connected sockets for the
+ * same BIG, just delete the sk and leave the bis
+ * hcon active, in case later rebinding is needed.
+ */
+ if (bis_sk) {
+ hcon->state = BT_OPEN;
+ hcon->iso_data = NULL;
+ iso_pi(sk)->conn->hcon = NULL;
+ iso_sock_clear_timer(sk);
+ iso_chan_del(sk, bt_to_errno(hcon->abort_reason));
+ sock_put(bis_sk); /* pairs with the sock_hold() in iso_get_sock_big() */
+ return;
+ }
+ }
+
+ sk->sk_state = BT_DISCONN;
+ iso_conn_lock(iso_pi(sk)->conn);
+ hci_conn_drop(iso_pi(sk)->conn->hcon);
+ iso_pi(sk)->conn->hcon = NULL;
+ iso_conn_unlock(iso_pi(sk)->conn);
+}
+/* State-dependent close of @sk; iso_sock_close() calls this with the socket locked. */
+static void __iso_sock_close(struct sock *sk)
+{
+ BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket);
+
+ switch (sk->sk_state) {
+ case BT_LISTEN:
+ iso_sock_cleanup_listen(sk);
+ break;
+
+ case BT_CONNECT:
+ case BT_CONNECTED:
+ case BT_CONFIG:
+ if (iso_pi(sk)->conn->hcon)
+ iso_sock_disconn(sk);
+ else
+ iso_chan_del(sk, ECONNRESET);
+ break;
+
+ case BT_CONNECT2:
+ if (iso_pi(sk)->conn->hcon &&
+ (test_bit(HCI_CONN_PA_SYNC, &iso_pi(sk)->conn->hcon->flags) ||
+ test_bit(HCI_CONN_PA_SYNC_FAILED, &iso_pi(sk)->conn->hcon->flags)))
+ iso_sock_disconn(sk);
+ else
+ iso_chan_del(sk, ECONNRESET);
+ break;
+ case BT_DISCONN:
+ iso_chan_del(sk, ECONNRESET);
+ break;
+
+ default:
+ sock_set_flag(sk, SOCK_ZAPPED); /* any other state: just mark it so iso_sock_kill() can act */
+ break;
+ }
+}
+
+/* Close @sk, then kill it if it is zapped and orphaned. Must be called on unlocked socket. */
+static void iso_sock_close(struct sock *sk)
+{
+ iso_sock_clear_timer(sk); /* done before the socket lock is taken */
+ lock_sock(sk);
+ __iso_sock_close(sk);
+ release_sock(sk);
+ iso_sock_kill(sk);
+}
+/* Inherit socket type, bt flags and security state from @parent when one is given. */
+static void iso_sock_init(struct sock *sk, struct sock *parent)
+{
+	BT_DBG("sk %p", sk);
+
+	if (!parent)
+		return;
+	sk->sk_type = parent->sk_type;
+	bt_sk(sk)->flags = bt_sk(parent)->flags;
+	security_sk_clone(parent, sk);
+}
+/* Protocol descriptor handed to bt_sock_alloc(); obj_size is the size of struct iso_pinfo. */
+static struct proto iso_proto = {
+ .name = "ISO",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct iso_pinfo)
+};
+/* Per-direction I/O QoS initialiser, used for both .in and .out of default_qos. */
+#define DEFAULT_IO_QOS \
+{ \
+ .interval = 10000u, \
+ .latency = 10u, \
+ .sdu = 40u, \
+ .phy = BT_ISO_PHY_2M, \
+ .rtn = 2u, \
+}
+/* QoS copied into each new socket by iso_sock_alloc() and restored when user QoS is rejected. */
+static struct bt_iso_qos default_qos = {
+ .bcast = {
+ .big = BT_ISO_QOS_BIG_UNSET,
+ .bis = BT_ISO_QOS_BIS_UNSET,
+ .sync_factor = 0x01,
+ .packing = 0x00,
+ .framing = 0x00,
+ .in = DEFAULT_IO_QOS,
+ .out = DEFAULT_IO_QOS,
+ .encryption = 0x00,
+ .bcode = {0x00},
+ .options = 0x00,
+ .skip = 0x0000,
+ .sync_timeout = BT_ISO_SYNC_TIMEOUT,
+ .sync_cte_type = 0x00,
+ .mse = 0x00,
+ .timeout = BT_ISO_SYNC_TIMEOUT,
+ },
+};
+/* Allocate an ISO sock via bt_sock_alloc(), set its defaults and link it into iso_sk_list. */
+static struct sock *iso_sock_alloc(struct net *net, struct socket *sock,
+ int proto, gfp_t prio, int kern)
+{
+ struct sock *sk;
+
+ sk = bt_sock_alloc(net, sock, &iso_proto, proto, prio, kern);
+ if (!sk)
+ return NULL;
+
+ sk->sk_destruct = iso_sock_destruct;
+ sk->sk_sndtimeo = ISO_CONN_TIMEOUT;
+
+ /* Set address type as public as default src address is BDADDR_ANY */
+ iso_pi(sk)->src_type = BDADDR_LE_PUBLIC;
+
+ iso_pi(sk)->qos = default_qos;
+ iso_pi(sk)->sync_handle = -1; /* the field is __u16, so this stores 0xffff */
+
+ bt_sock_link(&iso_sk_list, sk);
+ return sk;
+}
+/* Create an ISO sock for @sock; only SOCK_SEQPACKET is accepted. Returns 0 or -errno. */
+static int iso_sock_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+
+ BT_DBG("sock %p", sock);
+
+ sock->state = SS_UNCONNECTED; /* set even when the type check below fails */
+
+ if (sock->type != SOCK_SEQPACKET)
+ return -ESOCKTNOSUPPORT;
+
+ sock->ops = &iso_sock_ops;
+
+ sk = iso_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern);
+ if (!sk)
+ return -ENOMEM;
+
+ iso_sock_init(sk, NULL); /* no parent: nothing is inherited */
+ return 0;
+}
+/* Store broadcast dst, SID and BIS list; 0 or -EINVAL (fields set before a failed check stay set). */
+static int iso_sock_bind_bc(struct socket *sock, struct sockaddr_unsized *addr,
+			    int addr_len)
+{
+	struct sockaddr_iso *sa = (struct sockaddr_iso *)addr;
+	struct sock *sk = sock->sk;
+	int i;
+
+	/* Validate the length before touching sa->iso_bc, debug output included */
+	if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc))
+		return -EINVAL;
+
+	BT_DBG("sk %p bc_sid %u bc_num_bis %u", sk, sa->iso_bc->bc_sid,
+	       sa->iso_bc->bc_num_bis);
+
+	bacpy(&iso_pi(sk)->dst, &sa->iso_bc->bc_bdaddr);
+
+	/* Check if the address type is of LE type */
+	if (!bdaddr_type_is_le(sa->iso_bc->bc_bdaddr_type))
+		return -EINVAL;
+
+	iso_pi(sk)->dst_type = sa->iso_bc->bc_bdaddr_type;
+
+	if (sa->iso_bc->bc_sid > 0x0f && sa->iso_bc->bc_sid != HCI_SID_INVALID)
+		return -EINVAL;
+
+	iso_pi(sk)->bc_sid = sa->iso_bc->bc_sid;
+
+	if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS)
+		return -EINVAL;
+
+	iso_pi(sk)->bc_num_bis = sa->iso_bc->bc_num_bis;
+
+	for (i = 0; i < iso_pi(sk)->bc_num_bis; i++)
+		if (sa->iso_bc->bc_bis[i] < 0x01 ||
+		    sa->iso_bc->bc_bis[i] > 0x1f)
+			return -EINVAL;
+
+	memcpy(iso_pi(sk)->bc_bis, sa->iso_bc->bc_bis, iso_pi(sk)->bc_num_bis);
+
+	return 0;
+}
+
+/* Replace the BIS list of a BT_SK_PA_SYNC socket (-EBADFD otherwise). Must be called on the locked socket. */
+static int iso_sock_rebind_bis(struct sock *sk, struct sockaddr_iso *sa,
+ int addr_len)
+{
+ int err = 0;
+
+ if (!test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags))
+ return -EBADFD;
+
+ if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ iso_pi(sk)->bc_num_bis = sa->iso_bc->bc_num_bis; /* stored before the entries below are validated */
+
+ for (int i = 0; i < iso_pi(sk)->bc_num_bis; i++)
+ if (sa->iso_bc->bc_bis[i] < 0x01 ||
+ sa->iso_bc->bc_bis[i] > 0x1f) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ memcpy(iso_pi(sk)->bc_bis, sa->iso_bc->bc_bis,
+ iso_pi(sk)->bc_num_bis);
+
+done:
+ return err; /* addr_len is not consulted in this function */
+}
+
+/* Return a held reference to the hci_dev behind @conn's hcon, or NULL if
+ * the connection currently has no hcon. The lookup is done under the
+ * iso_conn lock; the reference comes from hci_dev_hold().
+ */
+static struct hci_dev *iso_conn_get_hdev(struct iso_conn *conn)
+{
+	struct hci_dev *held = NULL;
+	struct hci_conn *hcon;
+
+	iso_conn_lock(conn);
+
+	hcon = conn->hcon;
+	if (hcon)
+		held = hci_dev_hold(hcon->hdev);
+
+	iso_conn_unlock(conn);
+
+	return held;
+}
+
+/* Rebind an already connected broadcast socket.
+ *
+ * Must be called on the locked socket; the socket is locked again on return,
+ * although the lock is dropped and retaken internally.
+ *
+ * If the broadcast address is unchanged (or BDADDR_ANY) only the BIS list is
+ * updated via iso_sock_rebind_bis(). Otherwise hci_past_bis() is invoked for
+ * the new address on the socket's current hcon.
+ *
+ * Returns 0 or the result of the delegated call on success paths, -EINVAL for
+ * a bad socket/address, -ENOTCONN if the connection went away while the
+ * socket lock was dropped.
+ */
+static int iso_sock_rebind_bc(struct sock *sk, struct sockaddr_iso *sa,
+			      int addr_len)
+{
+	struct hci_dev *hdev;
+	struct hci_conn *bis;
+	int err;
+
+	if (sk->sk_type != SOCK_SEQPACKET || !iso_pi(sk)->conn)
+		return -EINVAL;
+
+	/* Check if it is really a Broadcast address being requested */
+	if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc))
+		return -EINVAL;
+
+	/* Check if the address hasn't changed then perhaps only the number of
+	 * bis has changed.
+	 */
+	if (!bacmp(&iso_pi(sk)->dst, &sa->iso_bc->bc_bdaddr) ||
+	    !bacmp(&sa->iso_bc->bc_bdaddr, BDADDR_ANY))
+		return iso_sock_rebind_bis(sk, sa, addr_len);
+
+	/* Check if the address type is of LE type */
+	if (!bdaddr_type_is_le(sa->iso_bc->bc_bdaddr_type))
+		return -EINVAL;
+
+	/* Takes a reference on the hci_dev; dropped at "unlock" below */
+	hdev = iso_conn_get_hdev(iso_pi(sk)->conn);
+	if (!hdev)
+		return -EINVAL;
+
+	/* Only used as an identity token until revalidated below */
+	bis = iso_pi(sk)->conn->hcon;
+
+	/* Release the socket before lookups since that requires hci_dev_lock
+	 * which shall not be acquired while holding sock_lock for proper
+	 * ordering.
+	 *
+	 * Lock through the held hdev rather than bis->hdev: once the socket
+	 * lock is dropped the hcon may be torn down, so bis must not be
+	 * dereferenced until it has been rechecked under both locks.
+	 */
+	release_sock(sk);
+	hci_dev_lock(hdev);
+	lock_sock(sk);
+
+	if (!iso_pi(sk)->conn || iso_pi(sk)->conn->hcon != bis) {
+		/* raced with iso_conn_del() or iso_disconn_sock() */
+		err = -ENOTCONN;
+		goto unlock;
+	}
+
+	BT_DBG("sk %p %pMR type %u", sk, &sa->iso_bc->bc_bdaddr,
+	       sa->iso_bc->bc_bdaddr_type);
+
+	err = hci_past_bis(bis, &sa->iso_bc->bc_bdaddr,
+			   le_addr_type(sa->iso_bc->bc_bdaddr_type));
+
+unlock:
+	hci_dev_unlock(hdev);
+	hci_dev_put(hdev);
+
+	return err;
+}
+
+/* bind() handler for ISO sockets.
+ *
+ * For a socket in BT_CONNECT2/BT_CONNECTED given an address longer than
+ * struct sockaddr_iso, the call is treated as a broadcast rebind and handed
+ * to iso_sock_rebind_bc(). Otherwise the socket must be a BT_OPEN
+ * SOCK_SEQPACKET socket; the LE source address is stored, an optional
+ * broadcast extension is processed by iso_sock_bind_bc(), and the state moves
+ * to BT_BOUND.
+ *
+ * Returns 0 on success, -EINVAL for a malformed address or wrong socket
+ * type, -EBADFD for a wrong state, or the error from the delegated helper.
+ */
+static int iso_sock_bind(struct socket *sock, struct sockaddr_unsized *addr,
+			 int addr_len)
+{
+	struct sockaddr_iso *sa = (struct sockaddr_iso *)addr;
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	if (!addr || addr_len < sizeof(struct sockaddr_iso) ||
+	    addr->sa_family != AF_BLUETOOTH)
+		return -EINVAL;
+
+	/* Log only after validation: sa aliases addr, so printing its fields
+	 * earlier would dereference a NULL or too-short address.
+	 */
+	BT_DBG("sk %p %pMR type %u", sk, &sa->iso_bdaddr, sa->iso_bdaddr_type);
+
+	lock_sock(sk);
+
+	if ((sk->sk_state == BT_CONNECT2 || sk->sk_state == BT_CONNECTED) &&
+	    addr_len > sizeof(*sa)) {
+		/* Allow the user to rebind to a different address using
+		 * PAST procedures.
+		 */
+		err = iso_sock_rebind_bc(sk, sa, addr_len);
+		goto done;
+	}
+
+	if (sk->sk_state != BT_OPEN) {
+		err = -EBADFD;
+		goto done;
+	}
+
+	if (sk->sk_type != SOCK_SEQPACKET) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	/* Check if the address type is of LE type */
+	if (!bdaddr_type_is_le(sa->iso_bdaddr_type)) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	bacpy(&iso_pi(sk)->src, &sa->iso_bdaddr);
+	iso_pi(sk)->src_type = sa->iso_bdaddr_type;
+
+	/* Check for Broadcast address */
+	if (addr_len > sizeof(*sa)) {
+		err = iso_sock_bind_bc(sock, addr, addr_len);
+		if (err)
+			goto done;
+	}
+
+	sk->sk_state = BT_BOUND;
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+/* connect() handler for ISO sockets.
+ *
+ * Stores the LE destination address and starts either a unicast (CIS)
+ * connection when the destination is a real address, or a broadcast (BIS)
+ * one when it is BDADDR_ANY. Unless BT_SK_DEFER_SETUP is set, waits for
+ * BT_CONNECTED using the send timeout (non-blocking if O_NONBLOCK).
+ *
+ * Returns 0 on success, -EINVAL for a bad address or socket type, -EBADFD
+ * if the socket is neither BT_OPEN nor BT_BOUND, or the error from the
+ * connect/wait helpers.
+ */
+static int iso_sock_connect(struct socket *sock, struct sockaddr_unsized *addr,
+			    int alen, int flags)
+{
+	struct sockaddr_iso *sa = (struct sockaddr_iso *)addr;
+	struct sock *sk = sock->sk;
+	int err;
+
+	BT_DBG("sk %p", sk);
+
+	if (alen < sizeof(struct sockaddr_iso))
+		return -EINVAL;
+
+	if (addr->sa_family != AF_BLUETOOTH)
+		return -EINVAL;
+
+	if (!(sk->sk_state == BT_OPEN || sk->sk_state == BT_BOUND))
+		return -EBADFD;
+
+	if (sk->sk_type != SOCK_SEQPACKET)
+		return -EINVAL;
+
+	/* Check if the address type is of LE type */
+	if (!bdaddr_type_is_le(sa->iso_bdaddr_type))
+		return -EINVAL;
+
+	/* Record the peer under the socket lock */
+	lock_sock(sk);
+	bacpy(&iso_pi(sk)->dst, &sa->iso_bdaddr);
+	iso_pi(sk)->dst_type = sa->iso_bdaddr_type;
+	release_sock(sk);
+
+	/* A concrete destination means unicast, BDADDR_ANY means broadcast */
+	err = bacmp(&iso_pi(sk)->dst, BDADDR_ANY) ? iso_connect_cis(sk) :
+						    iso_connect_bis(sk);
+	if (err)
+		return err;
+
+	lock_sock(sk);
+
+	if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))
+		err = bt_sock_wait_state(sk, BT_CONNECTED,
+					 sock_sndtimeo(sk, flags & O_NONBLOCK));
+
+	release_sock(sk);
+	return err;
+}
+
+/* Start listening for a broadcast source on @sk.
+ *
+ * Fails with -EADDRINUSE if another listener already covers the same
+ * src/dst/SID, -EHOSTUNREACH if no route (hci_dev) is found, and -EINVAL if
+ * the user supplied a QoS that check_bcast_qos() rejects (the socket QoS is
+ * then reset to default_qos). Otherwise creates a PA sync hcon and attaches
+ * the socket to it. Called without the socket lock; takes hdev lock, then
+ * the socket lock.
+ */
+static int iso_listen_bis(struct sock *sk)
+{
+	struct iso_pinfo *pi = iso_pi(sk);
+	struct hci_conn *hcon;
+	struct iso_conn *conn;
+	struct hci_dev *hdev;
+	int err = 0;
+
+	BT_DBG("%pMR -> %pMR (SID 0x%2.2x)", &pi->src, &pi->dst, pi->bc_sid);
+
+	write_lock(&iso_sk_list.lock);
+	if (__iso_get_sock_listen_by_sid(&pi->src, &pi->dst, pi->bc_sid))
+		err = -EADDRINUSE;
+	write_unlock(&iso_sk_list.lock);
+
+	if (err)
+		return err;
+
+	hdev = hci_get_route(&pi->dst, &pi->src, pi->src_type);
+	if (!hdev)
+		return -EHOSTUNREACH;
+
+	hci_dev_lock(hdev);
+	lock_sock(sk);
+
+	/* Fail if user set invalid QoS */
+	if (pi->qos_user_set && !check_bcast_qos(&pi->qos)) {
+		pi->qos = default_qos;
+		err = -EINVAL;
+		goto unlock;
+	}
+
+	hcon = hci_pa_create_sync(hdev, &pi->dst, le_addr_type(pi->dst_type),
+				  pi->bc_sid, &pi->qos);
+	if (IS_ERR(hcon)) {
+		err = PTR_ERR(hcon);
+		goto unlock;
+	}
+
+	conn = iso_conn_add(hcon);
+	if (conn)
+		err = iso_chan_add(conn, sk, NULL);
+	else
+		err = -ENOMEM;
+
+	/* Either failure above means the hcon is not going to be used */
+	if (err)
+		hci_conn_drop(hcon);
+
+unlock:
+	release_sock(sk);
+	hci_dev_unlock(hdev);
+	hci_dev_put(hdev);
+	return err;
+}
+
+/* Check that no other socket is already listening on the same src/dst pair.
+ * Returns 0 if the address is free, -EADDRINUSE otherwise.
+ */
+static int iso_listen_cis(struct sock *sk)
+{
+	struct iso_pinfo *pi = iso_pi(sk);
+	int ret;
+
+	BT_DBG("%pMR", &pi->src);
+
+	write_lock(&iso_sk_list.lock);
+	ret = __iso_get_sock_listen_by_addr(&pi->src, &pi->dst) ?
+	      -EADDRINUSE : 0;
+	write_unlock(&iso_sk_list.lock);
+
+	return ret;
+}
+
+/* listen() handler for ISO sockets.
+ *
+ * Requires a BT_BOUND SOCK_SEQPACKET socket. A socket with no destination
+ * (BDADDR_ANY) listens for unicast connections; one with a destination
+ * listens for that broadcast source. On success the backlog is recorded and
+ * the state becomes BT_LISTEN.
+ *
+ * Returns 0, -EBADFD (wrong state), -EINVAL (wrong type) or the error from
+ * iso_listen_cis()/iso_listen_bis().
+ */
+static int iso_sock_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	BT_DBG("sk %p backlog %d", sk, backlog);
+
+	sock_hold(sk);
+	lock_sock(sk);
+
+	if (sk->sk_state != BT_BOUND) {
+		err = -EBADFD;
+		goto done;
+	}
+
+	if (sk->sk_type != SOCK_SEQPACKET) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) {
+		/* Drop sock lock to avoid potential
+		 * deadlock with the hdev lock.
+		 */
+		release_sock(sk);
+		err = iso_listen_bis(sk);
+		lock_sock(sk);
+	} else {
+		err = iso_listen_cis(sk);
+	}
+
+	if (!err) {
+		sk->sk_max_ack_backlog = backlog;
+		sk->sk_ack_backlog = 0;
+
+		sk->sk_state = BT_LISTEN;
+	}
+
+done:
+	release_sock(sk);
+	sock_put(sk);
+	return err;
+}
+
+/* accept() handler for ISO sockets.
+ *
+ * Waits, subject to the receive timeout derived from @arg->flags, until
+ * bt_accept_dequeue() hands back a child for @newsock.
+ *
+ * Returns 0 on success, -EBADFD if the socket is not in BT_LISTEN, -EAGAIN
+ * once the timeout is zero, or the value of sock_intr_errno() when a signal
+ * is pending.
+ */
+static int iso_sock_accept(struct socket *sock, struct socket *newsock,
+			   struct proto_accept_arg *arg)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	struct sock *sk = sock->sk, *ch;
+	long timeo;
+	int err = 0;
+
+	/* Use explicit nested locking to avoid lockdep warnings generated
+	 * because the parent socket and the child socket are locked on the
+	 * same thread.
+	 */
+	lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+
+	timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
+
+	BT_DBG("sk %p timeo %ld", sk, timeo);
+
+	/* Wait for an incoming connection. (wake-one). */
+	add_wait_queue_exclusive(sk_sleep(sk), &wait);
+	while (1) {
+		if (sk->sk_state != BT_LISTEN) {
+			err = -EBADFD;
+			break;
+		}
+
+		ch = bt_accept_dequeue(sk, newsock);
+		if (ch)
+			break;
+
+		if (!timeo) {
+			err = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			err = sock_intr_errno(timeo);
+			break;
+		}
+
+		/* Sleep without the socket lock, retake it before rechecking */
+		release_sock(sk);
+
+		timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
+	}
+	remove_wait_queue(sk_sleep(sk), &wait);
+
+	if (err)
+		goto done;
+
+	newsock->state = SS_CONNECTED;
+
+	BT_DBG("new socket %p", ch);
+
+	/* A Broadcast Sink might require BIG sync to be terminated
+	 * and re-established multiple times, while keeping the same
+	 * PA sync handle active. To allow this, once all BIS
+	 * connections have been accepted on a PA sync parent socket,
+	 * "reset" socket state, to allow future BIG re-sync procedures.
+	 */
+	if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) {
+		/* Iterate through the list of bound BIS indices
+		 * and clear each BIS as they are accepted by the
+		 * user space, one by one.
+		 */
+		for (int i = 0; i < iso_pi(sk)->bc_num_bis; i++) {
+			if (iso_pi(sk)->bc_bis[i] > 0) {
+				iso_pi(sk)->bc_bis[i] = 0;
+				iso_pi(sk)->bc_num_bis--;
+				break;
+			}
+		}
+
+		if (iso_pi(sk)->bc_num_bis == 0) {
+			/* Once the last BIS was accepted, reset parent
+			 * socket parameters to mark that the listening
+			 * process for BIS connections has been completed:
+			 *
+			 * 1. Re-arm the DEFER setup flag on the parent sk
+			 *    (the flag is set here, not cleared).
+			 * 2. Clear the flag marking that the BIG create
+			 *    sync command is pending.
+			 * 3. Transition socket state from BT_LISTEN to
+			 *    BT_CONNECTED.
+			 */
+			set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+			clear_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags);
+			sk->sk_state = BT_CONNECTED;
+		}
+	}
+
+done:
+	release_sock(sk);
+	return err;
+}
+
+/* getsockname()/getpeername() handler.
+ *
+ * Fills @addr with the socket's source address, or with its destination when
+ * @peer is set. For a peer on a BIS or PA link the broadcast extension (SID,
+ * BIS count and indices) is appended as well.
+ *
+ * Returns the number of address bytes written.
+ */
+static int iso_sock_getname(struct socket *sock, struct sockaddr *addr,
+			    int peer)
+{
+	struct sockaddr_iso *sa = (struct sockaddr_iso *)addr;
+	struct sock *sk = sock->sk;
+	struct iso_pinfo *pi = iso_pi(sk);
+	struct hci_conn *hcon;
+	int len = sizeof(struct sockaddr_iso);
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	addr->sa_family = AF_BLUETOOTH;
+
+	if (!peer) {
+		/* Local name: just the source address */
+		bacpy(&sa->iso_bdaddr, &pi->src);
+		sa->iso_bdaddr_type = pi->src_type;
+		return len;
+	}
+
+	hcon = pi->conn ? pi->conn->hcon : NULL;
+
+	bacpy(&sa->iso_bdaddr, &pi->dst);
+	sa->iso_bdaddr_type = pi->dst_type;
+
+	/* Only broadcast links carry the extra broadcast fields */
+	if (!hcon || (hcon->type != BIS_LINK && hcon->type != PA_LINK))
+		return len;
+
+	sa->iso_bc->bc_sid = pi->bc_sid;
+	sa->iso_bc->bc_num_bis = pi->bc_num_bis;
+	memcpy(sa->iso_bc->bc_bis, pi->bc_bis, ISO_MAX_NUM_BIS);
+	len += sizeof(struct sockaddr_iso_bc);
+
+	return len;
+}
+
+/* sendmsg() handler for ISO sockets.
+ *
+ * Builds one skb holding the first MTU-sized piece of @msg (with headroom for
+ * the ISO data header) and chains any remainder as continuation fragments on
+ * its frag_list, then passes the chain to iso_send_frame().
+ *
+ * Returns the result of iso_send_frame() on success. Errors: a pending
+ * socket error, -EOPNOTSUPP for MSG_OOB, the error from sock_cmsg_send(),
+ * -ENOTCONN if the socket is not BT_CONNECTED (checked both before building
+ * and before sending), or an allocation/copy error from bt_skb_sendmsg().
+ */
+static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg,
+			    size_t len)
+{
+	struct sock *sk = sock->sk;
+	struct sk_buff *skb, **frag;
+	struct sockcm_cookie sockc;
+	size_t mtu;
+	int err;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	err = sock_error(sk);
+	if (err)
+		return err;
+
+	if (msg->msg_flags & MSG_OOB)
+		return -EOPNOTSUPP;
+
+	hci_sockcm_init(&sockc, sk);
+
+	if (msg->msg_controllen) {
+		err = sock_cmsg_send(sk, msg, &sockc);
+		if (err)
+			return err;
+	}
+
+	lock_sock(sk);
+
+	if (sk->sk_state != BT_CONNECTED) {
+		release_sock(sk);
+		return -ENOTCONN;
+	}
+
+	/* Snapshot the MTU under the lock; the skbs are built unlocked */
+	mtu = iso_pi(sk)->conn->hcon->mtu;
+
+	release_sock(sk);
+
+	skb = bt_skb_sendmsg(sk, msg, len, mtu, HCI_ISO_DATA_HDR_SIZE, 0);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	len -= skb->len;
+
+	/* Log the skb itself (the format says "skb"), not the socket */
+	BT_DBG("skb %p len %d", skb, skb->len);
+
+	/* Continuation fragments */
+	frag = &skb_shinfo(skb)->frag_list;
+	while (len) {
+		struct sk_buff *tmp;
+
+		tmp = bt_skb_sendmsg(sk, msg, len, mtu, 0, 0);
+		if (IS_ERR(tmp)) {
+			kfree_skb(skb);
+			return PTR_ERR(tmp);
+		}
+
+		*frag = tmp;
+
+		len -= tmp->len;
+
+		/* Head skb accounts for the bytes held in its fragments */
+		skb->len += tmp->len;
+		skb->data_len += tmp->len;
+
+		BT_DBG("frag %p len %d", *frag, tmp->len);
+
+		frag = &(*frag)->next;
+	}
+
+	lock_sock(sk);
+
+	/* State may have changed while the lock was dropped */
+	if (sk->sk_state == BT_CONNECTED)
+		err = iso_send_frame(sk, skb, &sockc);
+	else
+		err = -ENOTCONN;
+
+	release_sock(sk);
+
+	if (err < 0)
+		kfree_skb(skb);
+	return err;
+}
+
+/* Accept a CIS request that was held back by deferred setup: move the hcon
+ * to BT_CONFIG and send HCI_OP_LE_ACCEPT_CIS for its handle.
+ */
+static void iso_conn_defer_accept(struct hci_conn *conn)
+{
+	struct hci_dev *hdev = conn->hdev;
+	struct hci_cp_le_accept_cis cp;
+
+	BT_DBG("conn %p", conn);
+
+	cp.handle = cpu_to_le16(conn->handle);
+	conn->state = BT_CONFIG;
+
+	hci_send_cmd(hdev, HCI_OP_LE_ACCEPT_CIS, sizeof(cp), &cp);
+}
+
+/* Issue BIG Create Sync for @sk unless one is already pending.
+ *
+ * Must be called without the socket lock held: the function takes the hdev
+ * lock first and then the socket lock. BT_SK_BIG_SYNC is set atomically so
+ * the command is queued at most once; a failure is only logged.
+ */
+static void iso_conn_big_sync(struct sock *sk)
+{
+	int err;
+	struct hci_dev *hdev;
+
+	hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src,
+			     iso_pi(sk)->src_type);
+
+	if (!hdev)
+		return;
+
+	/* hci_le_big_create_sync requires hdev lock to be held, since
+	 * it enqueues the HCI LE BIG Create Sync command via
+	 * hci_cmd_sync_queue_once, which checks hdev flags that might
+	 * change.
+	 */
+	hci_dev_lock(hdev);
+	lock_sock(sk);
+
+	if (!test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) {
+		err = hci_conn_big_create_sync(hdev, iso_pi(sk)->conn->hcon,
+					       &iso_pi(sk)->qos,
+					       iso_pi(sk)->sync_handle,
+					       iso_pi(sk)->bc_num_bis,
+					       iso_pi(sk)->bc_bis);
+		if (err)
+			bt_dev_err(hdev, "hci_big_create_sync: %d", err);
+	}
+
+	release_sock(sk);
+	hci_dev_unlock(hdev);
+
+	/* Drop the reference taken by hci_get_route(), as iso_listen_bis()
+	 * does; without this every call pins the device a little more.
+	 */
+	hci_dev_put(hdev);
+}
+
+/* recvmsg() handler for ISO sockets.
+ *
+ * MSG_ERRQUEUE requests are served from the socket error queue.
+ *
+ * If BT_SK_DEFER_SETUP was set, it is cleared here and this call acts as
+ * the trigger that completes the deferred setup, depending on the state:
+ *  - BT_CONNECT2: PA-sync sockets start BIG sync and go to BT_LISTEN;
+ *    others accept the pending CIS and go to BT_CONFIG.
+ *  - BT_CONNECTED: PA-sync sockets start BIG sync and go to BT_LISTEN.
+ *  - BT_CONNECT: the CIS connection is started via iso_connect_cis().
+ * In those cases the function returns without reading data (0, or the error
+ * from iso_connect_cis()). Otherwise it falls through to bt_sock_recvmsg().
+ */
+static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg,
+			    size_t len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct iso_pinfo *pi = iso_pi(sk);
+	bool early_ret = false;
+	int err = 0;
+
+	BT_DBG("sk %p", sk);
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return sock_recv_errqueue(sk, msg, len, SOL_BLUETOOTH,
+					  BT_SCM_ERROR);
+
+	if (test_and_clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
+		/* Hold the socket across the lock drop/retake sequences below */
+		sock_hold(sk);
+		lock_sock(sk);
+
+		switch (sk->sk_state) {
+		case BT_CONNECT2:
+			if (test_bit(BT_SK_PA_SYNC, &pi->flags)) {
+				/* iso_conn_big_sync() takes the socket lock
+				 * itself, so it must be called unlocked.
+				 */
+				release_sock(sk);
+				iso_conn_big_sync(sk);
+				lock_sock(sk);
+
+				sk->sk_state = BT_LISTEN;
+			} else {
+				iso_conn_defer_accept(pi->conn->hcon);
+				sk->sk_state = BT_CONFIG;
+			}
+
+			early_ret = true;
+			break;
+		case BT_CONNECTED:
+			if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) {
+				release_sock(sk);
+				iso_conn_big_sync(sk);
+				lock_sock(sk);
+
+				sk->sk_state = BT_LISTEN;
+				early_ret = true;
+			}
+
+			break;
+		case BT_CONNECT:
+			release_sock(sk);
+			err = iso_connect_cis(sk);
+			lock_sock(sk);
+
+			early_ret = true;
+			break;
+		default:
+			break;
+		}
+
+		release_sock(sk);
+		sock_put(sk);
+
+		if (early_ret)
+			return err;
+	}
+
+	return bt_sock_recvmsg(sock, msg, len, flags);
+}
+
+/* Validate one direction of an ISO QoS setting.
+ *
+ * A zero interval or latency is accepted as-is; non-zero values must fall
+ * within 0xff..0xfffff and 0x05..0xfa0 respectively. Returns true if valid.
+ */
+static bool check_io_qos(struct bt_iso_io_qos *qos)
+{
+	/* If no PHY is enabled, SDU must be 0 */
+	if (!qos->phy && qos->sdu)
+		return false;
+
+	if (qos->interval) {
+		if (qos->interval < 0xff || qos->interval > 0xfffff)
+			return false;
+	}
+
+	if (qos->latency) {
+		if (qos->latency < 0x05 || qos->latency > 0xfa0)
+			return false;
+	}
+
+	return !(qos->phy > BT_ISO_PHY_ANY);
+}
+
+/* Validate the unicast part of an ISO QoS setting.
+ *
+ * CIG/CIS ids must be <= 0xef or the respective UNSET marker, SCA <= 0x07,
+ * packing and framing <= 0x01, and both directions must pass check_io_qos().
+ */
+static bool check_ucast_qos(struct bt_iso_qos *qos)
+{
+	if (qos->ucast.cig != BT_ISO_QOS_CIG_UNSET && qos->ucast.cig > 0xef)
+		return false;
+
+	if (qos->ucast.cis != BT_ISO_QOS_CIS_UNSET && qos->ucast.cis > 0xef)
+		return false;
+
+	if (qos->ucast.sca > 0x07 ||
+	    qos->ucast.packing > 0x01 ||
+	    qos->ucast.framing > 0x01)
+		return false;
+
+	return check_io_qos(&qos->ucast.in) && check_io_qos(&qos->ucast.out);
+}
+
+/* Validate the broadcast part of an ISO QoS setting, filling in defaults.
+ *
+ * Note that this modifies @qos: a zero sync_factor becomes 0x01 and a zero
+ * sync_timeout or timeout becomes BT_ISO_SYNC_TIMEOUT before the range
+ * checks run. Returns true if every field is within its accepted range.
+ */
+static bool check_bcast_qos(struct bt_iso_qos *qos)
+{
+	if (!qos->bcast.sync_factor)
+		qos->bcast.sync_factor = 0x01;
+
+	if (qos->bcast.packing > 0x01)
+		return false;
+
+	if (qos->bcast.framing > 0x01)
+		return false;
+
+	if (!check_io_qos(&qos->bcast.in))
+		return false;
+
+	if (!check_io_qos(&qos->bcast.out))
+		return false;
+
+	if (qos->bcast.encryption > 0x01)
+		return false;
+
+	if (qos->bcast.options > 0x07)
+		return false;
+
+	if (qos->bcast.skip > 0x01f3)
+		return false;
+
+	if (!qos->bcast.sync_timeout)
+		qos->bcast.sync_timeout = BT_ISO_SYNC_TIMEOUT;
+
+	if (qos->bcast.sync_timeout < 0x000a || qos->bcast.sync_timeout > 0x4000)
+		return false;
+
+	if (qos->bcast.sync_cte_type > 0x1f)
+		return false;
+
+	if (qos->bcast.mse > 0x1f)
+		return false;
+
+	/* Default the field that is about to be range-checked. Assigning
+	 * sync_timeout here instead would leave timeout at 0 and make the
+	 * check below reject every caller that left timeout unset.
+	 */
+	if (!qos->bcast.timeout)
+		qos->bcast.timeout = BT_ISO_SYNC_TIMEOUT;
+
+	if (qos->bcast.timeout < 0x000a || qos->bcast.timeout > 0x4000)
+		return false;
+
+	return true;
+}
+
+/* setsockopt() handler for ISO sockets.
+ *
+ * Supported options:
+ *  - BT_DEFER_SETUP: boolean, only in BT_BOUND or BT_LISTEN.
+ *  - BT_PKT_STATUS, BT_PKT_SEQNUM: boolean flags.
+ *  - BT_ISO_QOS: struct bt_iso_qos; allowed in BT_OPEN, BT_BOUND,
+ *    BT_CONNECT2, or BT_CONNECTED when the socket is a PA-sync socket.
+ *  - BT_ISO_BASE: raw BASE data up to sizeof(iso_pi(sk)->base) bytes; allowed
+ *    in BT_OPEN, BT_BOUND or BT_CONNECT2.
+ *
+ * Returns 0 on success, -EINVAL for a disallowed state or oversized BASE,
+ * -ENOPROTOOPT for an unknown option, or the copy_safe_from_sockptr() error.
+ * @level is not examined here.
+ */
+static int iso_sock_setsockopt(struct socket *sock, int level, int optname,
+			       sockptr_t optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+	struct bt_iso_qos qos = default_qos;
+	u32 opt;
+
+	BT_DBG("sk %p", sk);
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case BT_DEFER_SETUP:
+		if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
+			err = -EINVAL;
+			break;
+		}
+
+		err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+		if (err)
+			break;
+
+		if (opt)
+			set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+		else
+			clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+		break;
+
+	case BT_PKT_STATUS:
+		err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+		if (err)
+			break;
+
+		if (opt)
+			set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags);
+		else
+			clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags);
+		break;
+
+	case BT_PKT_SEQNUM:
+		err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+		if (err)
+			break;
+
+		if (opt)
+			set_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags);
+		else
+			clear_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags);
+		break;
+
+	case BT_ISO_QOS:
+		if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND &&
+		    sk->sk_state != BT_CONNECT2 &&
+		    (!test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags) ||
+		    sk->sk_state != BT_CONNECTED)) {
+			err = -EINVAL;
+			break;
+		}
+
+		err = copy_safe_from_sockptr(&qos, sizeof(qos), optval, optlen);
+		if (err)
+			break;
+
+		/* The QoS is stored as given; it is not range-checked here */
+		iso_pi(sk)->qos = qos;
+		iso_pi(sk)->qos_user_set = true;
+
+		break;
+
+	case BT_ISO_BASE:
+		if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND &&
+		    sk->sk_state != BT_CONNECT2) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (optlen > sizeof(iso_pi(sk)->base)) {
+			err = -EINVAL;
+			break;
+		}
+
+		/* Copies straight into the socket's base buffer */
+		err = copy_safe_from_sockptr(iso_pi(sk)->base, optlen, optval,
+					     optlen);
+		if (err)
+			break;
+
+		iso_pi(sk)->base_len = optlen;
+
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+/* getsockopt() handler for ISO sockets.
+ *
+ * Supported options:
+ *  - BT_DEFER_SETUP: flag value; rejected with -EINVAL in BT_CONNECTED.
+ *  - BT_PKT_STATUS: flag value.
+ *  - BT_ISO_QOS: the QoS returned by iso_sock_get_qos(), truncated to the
+ *    caller's buffer length.
+ *  - BT_ISO_BASE: BASE data; taken from the hcon's periodic advertising data
+ *    when connected with a BDADDR_ANY destination, else from the socket.
+ *
+ * Returns 0 on success, -EFAULT on a user copy failure, -EINVAL as noted
+ * above, or -ENOPROTOOPT for an unknown option. @level is not examined here.
+ */
+static int iso_sock_getsockopt(struct socket *sock, int level, int optname,
+			       char __user *optval, int __user *optlen)
+{
+	struct sock *sk = sock->sk;
+	int len, err = 0;
+	struct bt_iso_qos *qos;
+	u8 base_len;
+	u8 *base;
+
+	BT_DBG("sk %p", sk);
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case BT_DEFER_SETUP:
+		if (sk->sk_state == BT_CONNECTED) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags),
+			     (u32 __user *)optval))
+			err = -EFAULT;
+
+		break;
+
+	case BT_PKT_STATUS:
+		if (put_user(test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags),
+			     (int __user *)optval))
+			err = -EFAULT;
+		break;
+
+	case BT_ISO_QOS:
+		qos = iso_sock_get_qos(sk);
+
+		/* NOTE(review): unlike BT_ISO_BASE below, the copied length
+		 * is not written back through optlen - confirm intended.
+		 */
+		len = min_t(unsigned int, len, sizeof(*qos));
+		if (copy_to_user(optval, qos, len))
+			err = -EFAULT;
+
+		break;
+
+	case BT_ISO_BASE:
+		if (sk->sk_state == BT_CONNECTED &&
+		    !bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) {
+			base_len = iso_pi(sk)->conn->hcon->le_per_adv_data_len;
+			base = iso_pi(sk)->conn->hcon->le_per_adv_data;
+		} else {
+			base_len = iso_pi(sk)->base_len;
+			base = iso_pi(sk)->base;
+		}
+
+		/* Report how many bytes were actually returned */
+		len = min_t(unsigned int, len, base_len);
+		if (copy_to_user(optval, base, len))
+			err = -EFAULT;
+		if (put_user(len, optlen))
+			err = -EFAULT;
+
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	}
+
+	release_sock(sk);
+	return err;
+}
+
+/* shutdown() handler for ISO sockets.
+ *
+ * Records the requested direction(s) in sk_shutdown; if any of the requested
+ * bits was already set the call is a no-op. Otherwise the socket timer is
+ * cleared and the socket closed, optionally lingering until BT_CLOSED.
+ *
+ * Returns 0, or the result of bt_sock_wait_state() when lingering.
+ */
+static int iso_sock_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	int mask = 0;
+	int err = 0;
+
+	BT_DBG("sock %p, sk %p, how %d", sock, sk, how);
+
+	if (!sk)
+		return 0;
+
+	sock_hold(sk);
+	lock_sock(sk);
+
+	/* Translate the request into the sk_shutdown bits it affects */
+	switch (how) {
+	case SHUT_RD:
+		mask = RCV_SHUTDOWN;
+		break;
+	case SHUT_WR:
+		mask = SEND_SHUTDOWN;
+		break;
+	case SHUT_RDWR:
+		mask = SHUTDOWN_MASK;
+		break;
+	}
+
+	if (mask) {
+		if (sk->sk_shutdown & mask)
+			goto unlock;
+		sk->sk_shutdown |= mask;
+	}
+
+	iso_sock_clear_timer(sk);
+	__iso_sock_close(sk);
+
+	if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+	    !(current->flags & PF_EXITING))
+		err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
+
+unlock:
+	release_sock(sk);
+	sock_put(sk);
+
+	return err;
+}
+
+/* release() handler: close the socket, optionally linger until BT_CLOSED,
+ * then orphan and kill it. Returns 0, or the result of the linger wait.
+ */
+static int iso_sock_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	BT_DBG("sock %p, sk %p", sock, sk);
+
+	if (!sk)
+		return 0;
+
+	iso_sock_close(sk);
+
+	/* Linger only when asked to, and never for an exiting task */
+	if (sock_flag(sk, SOCK_LINGER)) {
+		if (READ_ONCE(sk->sk_lingertime) &&
+		    !(current->flags & PF_EXITING)) {
+			lock_sock(sk);
+			err = bt_sock_wait_state(sk, BT_CLOSED,
+						 sk->sk_lingertime);
+			release_sock(sk);
+		}
+	}
+
+	sock_orphan(sk);
+	iso_sock_kill(sk);
+	return err;
+}
+
+/* Mark @sk connected: stop its timer, set BT_CONNECTED and notify waiters
+ * through sk_state_change(). A NULL @sk is ignored.
+ */
+static void iso_sock_ready(struct sock *sk)
+{
+	BT_DBG("sk %p", sk);
+
+	if (sk) {
+		lock_sock(sk);
+
+		iso_sock_clear_timer(sk);
+		sk->sk_state = BT_CONNECTED;
+		sk->sk_state_change(sk);
+
+		release_sock(sk);
+	}
+}
+
+/* Match callback: true if the socket's BIG handle equals the handle carried
+ * by the BIG Sync Established event in @data.
+ */
+static bool iso_match_big(struct sock *sk, void *data)
+{
+	struct hci_evt_le_big_sync_established *evt = data;
+
+	if (evt->handle != iso_pi(sk)->qos.bcast.big)
+		return false;
+
+	return true;
+}
+
+/* Match callback: true if the socket's BIG handle equals the BIG handle in
+ * the QoS of the hci_conn passed as @data.
+ */
+static bool iso_match_big_hcon(struct sock *sk, void *data)
+{
+	struct hci_conn *conn = data;
+
+	if (conn->iso_qos.bcast.big != iso_pi(sk)->qos.bcast.big)
+		return false;
+
+	return true;
+}
+
+/* Match callback: true for sockets flagged BT_SK_PA_SYNC. @data is unused. */
+static bool iso_match_pa_sync_flag(struct sock *sk, void *data)
+{
+	struct iso_pinfo *pi = iso_pi(sk);
+
+	return test_bit(BT_SK_PA_SYNC, &pi->flags);
+}
+
+/* Match callback: true if the socket's destination equals the bdaddr_t
+ * pointed to by @data.
+ */
+static bool iso_match_dst(struct sock *sk, void *data)
+{
+	bdaddr_t *wanted = (bdaddr_t *)data;
+
+	return bacmp(&iso_pi(sk)->dst, wanted) == 0;
+}
+
+/* Called when an ISO connection becomes usable (or, for the broadcast sync
+ * failure cases, needs reporting).
+ *
+ * If the connection already has a socket, that socket is marked connected;
+ * for a BIS sender advertising with a random address the socket's source
+ * address is updated first.
+ *
+ * Otherwise a listening parent socket is looked up (by BIG handle, PA sync
+ * flag, SID or sync handle depending on the hcon flags, falling back to a
+ * BDADDR_ANY listener), a child socket is allocated, populated from the hcon
+ * and the parent, attached to the connection and queued on the parent, which
+ * is then woken. iso_get_sock() returns the parent with a reference held;
+ * that reference is dropped on every exit path once a parent was found.
+ */
+static void iso_conn_ready(struct iso_conn *conn)
+{
+	struct sock *parent = NULL;
+	struct sock *sk = conn->sk;
+	struct hci_ev_le_big_sync_established *ev = NULL;
+	struct hci_ev_le_pa_sync_established *ev2 = NULL;
+	struct hci_ev_le_per_adv_report *ev3 = NULL;
+	struct hci_conn *hcon;
+	struct hci_dev *hdev;
+
+	BT_DBG("conn %p", conn);
+
+	if (sk) {
+		/* Attempt to update source address in case of BIS Sender if
+		 * the advertisement is using a random address.
+		 */
+		if (conn->hcon->type == BIS_LINK &&
+		    conn->hcon->role == HCI_ROLE_MASTER &&
+		    !bacmp(&conn->hcon->dst, BDADDR_ANY)) {
+			struct hci_conn *bis = conn->hcon;
+			struct adv_info *adv;
+
+			adv = hci_find_adv_instance(bis->hdev,
+						    bis->iso_qos.bcast.bis);
+			if (adv && bacmp(&adv->random_addr, BDADDR_ANY)) {
+				lock_sock(sk);
+				iso_pi(sk)->src_type = BDADDR_LE_RANDOM;
+				bacpy(&iso_pi(sk)->src, &adv->random_addr);
+				release_sock(sk);
+			}
+		}
+
+		iso_sock_ready(conn->sk);
+	} else {
+		hcon = conn->hcon;
+		if (!hcon)
+			return;
+
+		hdev = hcon->hdev;
+
+		if (test_bit(HCI_CONN_BIG_SYNC, &hcon->flags)) {
+			/* A BIS slave hcon is notified to the ISO layer
+			 * after the Command Complete for the LE Setup
+			 * ISO Data Path command is received. Get the
+			 * parent socket that matches the hcon BIG handle.
+			 */
+			parent = iso_get_sock(hdev, &hcon->src, &hcon->dst,
+					      BT_LISTEN, iso_match_big_hcon,
+					      hcon);
+		} else if (test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags)) {
+			ev = hci_recv_event_data(hcon->hdev,
+						 HCI_EVT_LE_BIG_SYNC_ESTABLISHED);
+
+			/* Get reference to PA sync parent socket, if it exists */
+			parent = iso_get_sock(hdev, &hcon->src, &hcon->dst,
+					      BT_LISTEN,
+					      iso_match_pa_sync_flag,
+					      NULL);
+			if (!parent && ev)
+				parent = iso_get_sock(hdev, &hcon->src,
+						      &hcon->dst,
+						      BT_LISTEN,
+						      iso_match_big, ev);
+		} else if (test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) {
+			ev2 = hci_recv_event_data(hcon->hdev,
+						  HCI_EV_LE_PA_SYNC_ESTABLISHED);
+			if (ev2)
+				parent = iso_get_sock(hdev, &hcon->src,
+						      &hcon->dst,
+						      BT_LISTEN,
+						      iso_match_sid, ev2);
+		} else if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) {
+			ev3 = hci_recv_event_data(hcon->hdev,
+						  HCI_EV_LE_PER_ADV_REPORT);
+			if (ev3)
+				parent = iso_get_sock(hdev, &hcon->src,
+						      &hcon->dst,
+						      BT_LISTEN,
+						      iso_match_sync_handle_pa_report,
+						      ev3);
+		}
+
+		/* Fall back to a listener bound to BDADDR_ANY */
+		if (!parent)
+			parent = iso_get_sock(hdev, &hcon->src, BDADDR_ANY,
+					      BT_LISTEN, iso_match_dst, BDADDR_ANY);
+
+		if (!parent)
+			return;
+
+		lock_sock(parent);
+
+		sk = iso_sock_alloc(sock_net(parent), NULL,
+				    BTPROTO_ISO, GFP_ATOMIC, 0);
+		if (!sk) {
+			release_sock(parent);
+			/* Drop the lookup reference here too, as the success
+			 * path does, otherwise the parent is leaked.
+			 */
+			sock_put(parent);
+			return;
+		}
+
+		iso_sock_init(sk, parent);
+
+		bacpy(&iso_pi(sk)->src, &hcon->src);
+
+		/* Convert from HCI to three-value type */
+		if (hcon->src_type == ADDR_LE_DEV_PUBLIC)
+			iso_pi(sk)->src_type = BDADDR_LE_PUBLIC;
+		else
+			iso_pi(sk)->src_type = BDADDR_LE_RANDOM;
+
+		/* If hcon has no destination address (BDADDR_ANY) it means it
+		 * was created by HCI_EV_LE_BIG_SYNC_ESTABLISHED or
+		 * HCI_EV_LE_PA_SYNC_ESTABLISHED so we need to initialize using
+		 * the parent socket destination address.
+		 */
+		if (!bacmp(&hcon->dst, BDADDR_ANY)) {
+			bacpy(&hcon->dst, &iso_pi(parent)->dst);
+			hcon->dst_type = le_addr_type(iso_pi(parent)->dst_type);
+		}
+
+		if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) {
+			/* PA sync child inherits the parent's broadcast setup */
+			iso_pi(sk)->qos = iso_pi(parent)->qos;
+			hcon->iso_qos = iso_pi(sk)->qos;
+			iso_pi(sk)->bc_sid = iso_pi(parent)->bc_sid;
+			iso_pi(sk)->bc_num_bis = iso_pi(parent)->bc_num_bis;
+			memcpy(iso_pi(sk)->bc_bis, iso_pi(parent)->bc_bis,
+			       ISO_MAX_NUM_BIS);
+			set_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags);
+		}
+
+		bacpy(&iso_pi(sk)->dst, &hcon->dst);
+
+		/* Convert from HCI to three-value type */
+		if (hcon->dst_type == ADDR_LE_DEV_PUBLIC)
+			iso_pi(sk)->dst_type = BDADDR_LE_PUBLIC;
+		else
+			iso_pi(sk)->dst_type = BDADDR_LE_RANDOM;
+
+		iso_pi(sk)->sync_handle = iso_pi(parent)->sync_handle;
+		memcpy(iso_pi(sk)->base, iso_pi(parent)->base, iso_pi(parent)->base_len);
+		iso_pi(sk)->base_len = iso_pi(parent)->base_len;
+
+		hci_conn_hold(hcon);
+		iso_chan_add(conn, sk, parent);
+
+		if ((ev && ((struct hci_evt_le_big_sync_established *)ev)->status) ||
+		    (ev2 && ev2->status)) {
+			/* Trigger error signal on child socket */
+			sk->sk_err = ECONNREFUSED;
+			sk->sk_error_report(sk);
+		}
+
+		if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags))
+			sk->sk_state = BT_CONNECT2;
+		else
+			sk->sk_state = BT_CONNECTED;
+
+		/* Wake up parent */
+		parent->sk_data_ready(parent);
+
+		release_sock(parent);
+		sock_put(parent);
+	}
+}
+
+/* Match callback for PA Sync Established events: a socket bound with
+ * HCI_SID_INVALID accepts any SID, otherwise the SIDs must be equal.
+ */
+static bool iso_match_sid(struct sock *sk, void *data)
+{
+	struct hci_ev_le_pa_sync_established *evt = data;
+
+	return iso_pi(sk)->bc_sid == HCI_SID_INVALID ||
+	       evt->sid == iso_pi(sk)->bc_sid;
+}
+
+/* Match callback for PAST Received events: a socket bound with
+ * HCI_SID_INVALID accepts any SID, otherwise the SIDs must be equal.
+ */
+static bool iso_match_sid_past(struct sock *sk, void *data)
+{
+	struct hci_ev_le_past_received *evt = data;
+
+	return iso_pi(sk)->bc_sid == HCI_SID_INVALID ||
+	       evt->sid == iso_pi(sk)->bc_sid;
+}
+
+/* Match callback: true if the socket's sync handle equals the one in the
+ * BIG Info Advertising Report passed as @data.
+ */
+static bool iso_match_sync_handle(struct sock *sk, void *data)
+{
+	struct hci_evt_le_big_info_adv_report *report = data;
+
+	if (le16_to_cpu(report->sync_handle) != iso_pi(sk)->sync_handle)
+		return false;
+
+	return true;
+}
+
+/* Match callback: true if the socket's sync handle equals the one in the
+ * Periodic Advertising Report passed as @data.
+ */
+static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data)
+{
+	struct hci_ev_le_per_adv_report *report = data;
+
+	if (le16_to_cpu(report->sync_handle) != iso_pi(sk)->sync_handle)
+		return false;
+
+	return true;
+}
+
+/* ----- ISO interface with lower layer (HCI) ----- */
+
+/* Connection indication from HCI: decide whether an ISO listener wants the
+ * incoming event.
+ *
+ * The event currently being processed is identified by probing
+ * hci_recv_event_data() for each event type of interest, in order. A matching
+ * listening socket is looked up and, depending on the event, updated (sync
+ * handle, SID, encryption, BIS count, BASE data) or used to start BIG sync.
+ *
+ * Returns 0 if no socket is interested, otherwise HCI_LM_ACCEPT, with
+ * HCI_PROTO_DEFER or'ed into *@flags when the socket has BT_SK_DEFER_SETUP.
+ */
+int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
+{
+	struct hci_ev_le_pa_sync_established *ev1;
+	struct hci_ev_le_past_received *ev1a;
+	struct hci_evt_le_big_info_adv_report *ev2;
+	struct hci_ev_le_per_adv_report *ev3;
+	struct sock *sk;
+
+	bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr);
+
+	/* Broadcast receiver requires handling of some events before it can
+	 * proceed to establishing a BIG sync:
+	 *
+	 * 1. HCI_EV_LE_PA_SYNC_ESTABLISHED: The socket may specify a specific
+	 * SID to listen to and once sync is established its handle needs to
+	 * be stored in iso_pi(sk)->sync_handle so it can be matched once
+	 * receiving the BIG Info.
+	 * 1a. HCI_EV_LE_PAST_RECEIVED: alternative to 1.
+	 * 2. HCI_EVT_LE_BIG_INFO_ADV_REPORT: When connect_ind is triggered by
+	 * a BIG Info it attempts to check if there is any listening socket
+	 * with the same sync_handle and if there is then attempts to create a
+	 * sync.
+	 * 3. HCI_EV_LE_PER_ADV_REPORT: When a PA report is received, it is stored
+	 * in iso_pi(sk)->base so it can be passed up to user, in the case of a
+	 * broadcast sink.
+	 */
+	ev1 = hci_recv_event_data(hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED);
+	if (ev1) {
+		sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN,
+				  iso_match_sid, ev1);
+		if (sk && !ev1->status) {
+			iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle);
+			iso_pi(sk)->bc_sid = ev1->sid;
+		}
+
+		goto done;
+	}
+
+	ev1a = hci_recv_event_data(hdev, HCI_EV_LE_PAST_RECEIVED);
+	if (ev1a) {
+		sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN,
+				  iso_match_sid_past, ev1a);
+		if (sk && !ev1a->status) {
+			iso_pi(sk)->sync_handle = le16_to_cpu(ev1a->sync_handle);
+			iso_pi(sk)->bc_sid = ev1a->sid;
+		}
+
+		goto done;
+	}
+
+	ev2 = hci_recv_event_data(hdev, HCI_EVT_LE_BIG_INFO_ADV_REPORT);
+	if (ev2) {
+		/* Check if BIGInfo report has already been handled */
+		sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_CONNECTED,
+				  iso_match_sync_handle, ev2);
+		if (sk) {
+			/* Already handled: drop the ref and report no match */
+			sock_put(sk);
+			sk = NULL;
+			goto done;
+		}
+
+		/* Try to get PA sync socket, if it exists */
+		sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_CONNECT2,
+				  iso_match_sync_handle, ev2);
+		if (!sk)
+			sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr,
+					  BT_LISTEN,
+					  iso_match_sync_handle,
+					  ev2);
+
+		if (sk) {
+			int err;
+			struct hci_conn *hcon = iso_pi(sk)->conn->hcon;
+
+			iso_pi(sk)->qos.bcast.encryption = ev2->encryption;
+
+			/* Never request more BISes than the BIG advertises */
+			if (ev2->num_bis < iso_pi(sk)->bc_num_bis)
+				iso_pi(sk)->bc_num_bis = ev2->num_bis;
+
+			/* With deferred setup the sync is started later, from
+			 * recvmsg; otherwise start it now, at most once.
+			 */
+			if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) &&
+			    !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) {
+				err = hci_conn_big_create_sync(hdev, hcon,
+							       &iso_pi(sk)->qos,
+							       iso_pi(sk)->sync_handle,
+							       iso_pi(sk)->bc_num_bis,
+							       iso_pi(sk)->bc_bis);
+				if (err) {
+					bt_dev_err(hdev, "hci_le_big_create_sync: %d",
+						   err);
+					sock_put(sk);
+					sk = NULL;
+				}
+			}
+		}
+
+		goto done;
+	}
+
+	ev3 = hci_recv_event_data(hdev, HCI_EV_LE_PER_ADV_REPORT);
+	if (ev3) {
+		size_t base_len = 0;
+		u8 *base;
+		struct hci_conn *hcon;
+
+		sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN,
+				  iso_match_sync_handle_pa_report, ev3);
+		if (!sk)
+			goto done;
+
+		hcon = iso_pi(sk)->conn->hcon;
+		if (!hcon)
+			goto done;
+
+		if (ev3->data_status == LE_PA_DATA_TRUNCATED) {
+			/* The controller was unable to retrieve PA data. */
+			memset(hcon->le_per_adv_data, 0,
+			       HCI_MAX_PER_AD_TOT_LEN);
+			hcon->le_per_adv_data_len = 0;
+			hcon->le_per_adv_data_offset = 0;
+			goto done;
+		}
+
+		/* Ignore a fragment that would overflow the reassembly buffer */
+		if (hcon->le_per_adv_data_offset + ev3->length >
+		    HCI_MAX_PER_AD_TOT_LEN)
+			goto done;
+
+		memcpy(hcon->le_per_adv_data + hcon->le_per_adv_data_offset,
+		       ev3->data, ev3->length);
+		hcon->le_per_adv_data_offset += ev3->length;
+
+		if (ev3->data_status == LE_PA_DATA_COMPLETE) {
+			/* All PA data has been received. */
+			hcon->le_per_adv_data_len =
+				hcon->le_per_adv_data_offset;
+			hcon->le_per_adv_data_offset = 0;
+
+			/* Extract BASE */
+			base = eir_get_service_data(hcon->le_per_adv_data,
+						    hcon->le_per_adv_data_len,
+						    EIR_BAA_SERVICE_UUID,
+						    &base_len);
+
+			if (!base || base_len > BASE_MAX_LENGTH)
+				goto done;
+
+			memcpy(iso_pi(sk)->base, base, base_len);
+			iso_pi(sk)->base_len = base_len;
+		} else {
+			/* This is a PA data fragment. Keep pa_data_len set to 0
+			 * until all data has been reassembled.
+			 */
+			hcon->le_per_adv_data_len = 0;
+		}
+	} else {
+		/* None of the broadcast events: look for a unicast listener */
+		sk = iso_get_sock(hdev, &hdev->bdaddr, BDADDR_ANY,
+				  BT_LISTEN, iso_match_dst, BDADDR_ANY);
+	}
+
+done:
+	if (!sk)
+		return 0;
+
+	if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags))
+		*flags |= HCI_PROTO_DEFER;
+
+	/* Drop the reference obtained from iso_get_sock() */
+	sock_put(sk);
+
+	return HCI_LM_ACCEPT;
+}
+
+/* Connect confirmation from HCI.
+ *
+ * For an LE link: on failure, tear down every ISO connection linked to it;
+ * on success, kick any pending CIS creation. Other non-ISO link types are
+ * ignored.
+ *
+ * For CIS/BIS/PA links: on success, or when a BIG/PA sync failure must be
+ * reported to the listener, make sure an iso_conn exists and run
+ * iso_conn_ready(); otherwise delete the connection with the mapped errno.
+ */
+static void iso_connect_cfm(struct hci_conn *hcon, __u8 status)
+{
+	struct iso_conn *conn;
+
+	switch (hcon->type) {
+	case CIS_LINK:
+	case BIS_LINK:
+	case PA_LINK:
+		break;
+	case LE_LINK:
+		if (status) {
+			struct hci_link *link, *t;
+
+			/* Check if LE link has failed */
+			list_for_each_entry_safe(link, t, &hcon->link_list,
+						 list)
+				iso_conn_del(link->conn, bt_to_errno(status));
+		} else {
+			/* Create CIS if pending */
+			hci_le_create_cis_pending(hcon->hdev);
+		}
+		return;
+	default:
+		return;
+	}
+
+	BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status);
+
+	/* A plain failure (no sync-failed flag to report) just tears down */
+	if (status && !test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags) &&
+	    !test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) {
+		iso_conn_del(hcon, bt_to_errno(status));
+		return;
+	}
+
+	/* Similar to the success case, if HCI_CONN_BIG_SYNC_FAILED or
+	 * HCI_CONN_PA_SYNC_FAILED is set, queue the failed connection
+	 * into the accept queue of the listening socket and wake up
+	 * userspace, to inform the user about the event.
+	 */
+	conn = iso_conn_add(hcon);
+	if (conn)
+		iso_conn_ready(conn);
+}
+
+/* HCI disconnect-complete callback of the ISO layer: for CIS, BIS and PA
+ * links delete the ISO connection with the errno matching @reason; every
+ * other link type is ignored.
+ */
+static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason)
+{
+	bool iso_link = hcon->type == CIS_LINK || hcon->type == BIS_LINK ||
+			hcon->type == PA_LINK;
+
+	if (!iso_link)
+		return;
+
+	BT_DBG("hcon %p reason %d", hcon, reason);
+
+	iso_conn_del(hcon, bt_to_errno(reason));
+}
+
+/* Handle one received HCI ISO data packet and reassemble it into an SDU.
+ *
+ * @hdev:   controller the packet arrived on
+ * @handle: connection handle taken from the packet
+ * @skb:    packet payload; always consumed (either passed on to
+ *          iso_recv_frame() or freed)
+ * @flags:  HCI ISO packet flags carrying the PB and TS fields
+ *
+ * Start/single packets carry an ISO data header with the sequence number
+ * and the total SDU length. A packet that already holds the whole SDU is
+ * delivered directly; otherwise a reassembly buffer of the full SDU size
+ * is allocated in conn->rx_skb and conn->rx_len tracks how many bytes are
+ * still missing. Continuation and end fragments are appended to that
+ * buffer, and the SDU is delivered by the end fragment that brings
+ * rx_len down to zero.
+ *
+ * Returns -ENOENT if no hci_conn matches @handle, -EINVAL if no reference
+ * to the ISO connection could be taken, and 0 otherwise (including when
+ * a malformed fragment is dropped).
+ */
+int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, u16 flags)
+{
+	struct hci_conn *hcon;
+	struct iso_conn *conn;
+	struct skb_shared_hwtstamps *hwts;
+	__u16 pb, ts, len, sn;
+
+	hci_dev_lock(hdev);
+
+	hcon = hci_conn_hash_lookup_handle(hdev, handle);
+	if (!hcon) {
+		hci_dev_unlock(hdev);
+		kfree_skb(skb);
+		return -ENOENT;
+	}
+
+	/* Take the connection reference while hdev is still locked; hcon is
+	 * not used again after the lock is released.
+	 */
+	conn = iso_conn_hold_unless_zero(hcon->iso_data);
+	hcon = NULL;
+
+	hci_dev_unlock(hdev);
+
+	if (!conn) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	pb = hci_iso_flags_pb(flags);
+	ts = hci_iso_flags_ts(flags);
+
+	BT_DBG("conn %p len %d pb 0x%x ts 0x%x", conn, skb->len, pb, ts);
+
+	switch (pb) {
+	case ISO_START:
+	case ISO_SINGLE:
+		/* A new SDU starts: discard any reassembly still in flight.
+		 * rx_skb is checked as well as rx_len because continuation
+		 * fragments can bring rx_len down to zero while rx_skb is
+		 * still allocated; overwriting it below would leak it.
+		 */
+		if (conn->rx_skb || conn->rx_len) {
+			BT_ERR("Unexpected start frame (len %d)", skb->len);
+			kfree_skb(conn->rx_skb);
+			conn->rx_skb = NULL;
+			conn->rx_len = 0;
+		}
+
+		if (ts) {
+			struct hci_iso_ts_data_hdr *hdr;
+
+			hdr = skb_pull_data(skb, HCI_ISO_TS_DATA_HDR_SIZE);
+			if (!hdr) {
+				BT_ERR("Frame is too short (len %d)", skb->len);
+				goto drop;
+			}
+
+			/* Record the timestamp to skb */
+			hwts = skb_hwtstamps(skb);
+			hwts->hwtstamp = us_to_ktime(le32_to_cpu(hdr->ts));
+
+			sn = __le16_to_cpu(hdr->sn);
+			len = __le16_to_cpu(hdr->slen);
+		} else {
+			struct hci_iso_data_hdr *hdr;
+
+			hdr = skb_pull_data(skb, HCI_ISO_DATA_HDR_SIZE);
+			if (!hdr) {
+				BT_ERR("Frame is too short (len %d)", skb->len);
+				goto drop;
+			}
+
+			sn = __le16_to_cpu(hdr->sn);
+			len = __le16_to_cpu(hdr->slen);
+		}
+
+		/* The slen field packs both the status flags and the length */
+		flags = hci_iso_data_flags(len);
+		len = hci_iso_data_len(len);
+
+		BT_DBG("Start: total len %d, frag len %d flags 0x%4.4x sn %d",
+		       len, skb->len, flags, sn);
+
+		if (len == skb->len) {
+			/* Complete frame received */
+			hci_skb_pkt_status(skb) = flags & 0x03;
+			hci_skb_pkt_seqnum(skb) = sn;
+			iso_recv_frame(conn, skb);
+			goto done;
+		}
+
+		if (pb == ISO_SINGLE) {
+			BT_ERR("Frame malformed (len %d, expected len %d)",
+			       skb->len, len);
+			goto drop;
+		}
+
+		if (skb->len > len) {
+			BT_ERR("Frame is too long (len %d, expected len %d)",
+			       skb->len, len);
+			goto drop;
+		}
+
+		/* Allocate skb for the complete frame (with header) */
+		conn->rx_skb = bt_skb_alloc(len, GFP_KERNEL);
+		if (!conn->rx_skb)
+			goto drop;
+
+		hci_skb_pkt_status(conn->rx_skb) = flags & 0x03;
+		hci_skb_pkt_seqnum(conn->rx_skb) = sn;
+		skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
+					  skb->len);
+		conn->rx_len = len - skb->len;
+
+		/* Copy hw timestamp from skb to rx_skb if present */
+		if (ts) {
+			hwts = skb_hwtstamps(conn->rx_skb);
+			hwts->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+		}
+
+		break;
+
+	case ISO_CONT:
+		BT_DBG("Cont: frag len %d (expecting %d)", skb->len,
+		       conn->rx_len);
+
+		if (!conn->rx_len) {
+			BT_ERR("Unexpected continuation frame (len %d)",
+			       skb->len);
+			goto drop;
+		}
+
+		if (skb->len > conn->rx_len) {
+			BT_ERR("Fragment is too long (len %d, expected %d)",
+			       skb->len, conn->rx_len);
+			kfree_skb(conn->rx_skb);
+			conn->rx_skb = NULL;
+			conn->rx_len = 0;
+			goto drop;
+		}
+
+		skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
+					  skb->len);
+		conn->rx_len -= skb->len;
+		break;
+
+	case ISO_END:
+		/* An end fragment without a preceding start fragment has no
+		 * reassembly buffer to append to; skb_put() on a NULL skb
+		 * would dereference it.
+		 */
+		if (!conn->rx_skb) {
+			BT_ERR("Unexpected end frame (len %d)", skb->len);
+			goto drop;
+		}
+
+		/* rx_skb was sized for exactly the announced SDU length, so
+		 * a fragment longer than what is still missing would overrun
+		 * the buffer. Abort the reassembly as ISO_CONT does.
+		 */
+		if (skb->len > conn->rx_len) {
+			BT_ERR("Fragment is too long (len %d, expected %d)",
+			       skb->len, conn->rx_len);
+			kfree_skb(conn->rx_skb);
+			conn->rx_skb = NULL;
+			conn->rx_len = 0;
+			goto drop;
+		}
+
+		skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
+					  skb->len);
+		conn->rx_len -= skb->len;
+
+		if (!conn->rx_len) {
+			struct sk_buff *rx_skb = conn->rx_skb;
+
+			/* Complete frame received. iso_recv_frame
+			 * takes ownership of the skb so set the global
+			 * rx_skb pointer to NULL first.
+			 */
+			conn->rx_skb = NULL;
+			iso_recv_frame(conn, rx_skb);
+		}
+		break;
+	}
+
+drop:
+	/* Fragments were copied into rx_skb, so the input skb is always
+	 * released here; only a directly delivered frame skips this.
+	 */
+	kfree_skb(skb);
+done:
+	iso_conn_put(conn);
+	return 0;
+}
+
+/* HCI callback set of the ISO layer; registered with hci_register_cb() in
+ * iso_init() and removed with hci_unregister_cb() in iso_exit().
+ */
+static struct hci_cb iso_cb = {
+ .name = "ISO",
+ .connect_cfm = iso_connect_cfm,
+ .disconn_cfm = iso_disconn_cfm,
+};
+
+/* seq_file show handler behind the "iso" debugfs file: prints one line per
+ * socket on iso_sk_list with its source address, destination address and
+ * sk_state. The list is walked under its read lock. Always returns 0.
+ */
+static int iso_debugfs_show(struct seq_file *f, void *p)
+{
+	struct sock *sk;
+
+	read_lock(&iso_sk_list.lock);
+	sk_for_each(sk, &iso_sk_list.head)
+		seq_printf(f, "%pMR %pMR %d\n", &iso_pi(sk)->src,
+			   &iso_pi(sk)->dst, sk->sk_state);
+	read_unlock(&iso_sk_list.lock);
+
+	return 0;
+}
+
+/* Provides iso_debugfs_fops, which iso_init() hands to debugfs_create_file() */
+DEFINE_SHOW_ATTRIBUTE(iso_debugfs);
+
+/* The "iso" debugfs entry; created in iso_init(), removed in iso_exit() */
+static struct dentry *iso_debugfs;
+
+/* Socket operations of ISO sockets (PF_BLUETOOTH). poll and ioctl are served
+ * by the shared bt_sock_* helpers, mmap and socketpair by the sock_no_* stubs;
+ * everything else is implemented by the iso_sock_* functions of this file.
+ */
+static const struct proto_ops iso_sock_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .release = iso_sock_release,
+ .bind = iso_sock_bind,
+ .connect = iso_sock_connect,
+ .listen = iso_sock_listen,
+ .accept = iso_sock_accept,
+ .getname = iso_sock_getname,
+ .sendmsg = iso_sock_sendmsg,
+ .recvmsg = iso_sock_recvmsg,
+ .poll = bt_sock_poll,
+ .ioctl = bt_sock_ioctl,
+ .mmap = sock_no_mmap,
+ .socketpair = sock_no_socketpair,
+ .shutdown = iso_sock_shutdown,
+ .setsockopt = iso_sock_setsockopt,
+ .getsockopt = iso_sock_getsockopt
+};
+
+/* Family operations registered for BTPROTO_ISO via bt_sock_register() in
+ * iso_init(); new sockets are created through iso_sock_create().
+ */
+static const struct net_proto_family iso_sock_family_ops = {
+ .family = PF_BLUETOOTH,
+ .owner = THIS_MODULE,
+ .create = iso_sock_create,
+};
+
+/* True between a successful iso_init() and the matching iso_exit() */
+static bool inited;
+
+/* Report whether the ISO socket layer is currently initialized */
+bool iso_inited(void)
+{
+ return inited;
+}
+
+/* Bring up the ISO socket layer.
+ *
+ * Registers the protocol, the BTPROTO_ISO socket family and the "iso"
+ * procfs entry, then hooks the HCI callbacks and, when the bluetooth
+ * debugfs directory exists, creates the "iso" debugfs file.
+ *
+ * Returns 0 on success, -EALREADY if the layer is already initialized, or
+ * the error of the registration step that failed; steps that had already
+ * succeeded are undone before returning.
+ */
+int iso_init(void)
+{
+	int err;
+
+	BUILD_BUG_ON(sizeof(struct sockaddr_iso) > sizeof(struct sockaddr));
+
+	if (inited)
+		return -EALREADY;
+
+	err = proto_register(&iso_proto, 0);
+	if (err < 0)
+		return err;
+
+	err = bt_sock_register(BTPROTO_ISO, &iso_sock_family_ops);
+	if (err < 0) {
+		BT_ERR("ISO socket registration failed");
+		goto unregister_proto;
+	}
+
+	err = bt_procfs_init(&init_net, "iso", &iso_sk_list, NULL);
+	if (err < 0) {
+		BT_ERR("Failed to create ISO proc file");
+		goto unregister_sock;
+	}
+
+	BT_INFO("ISO socket layer initialized");
+
+	hci_register_cb(&iso_cb);
+
+	if (!IS_ERR_OR_NULL(bt_debugfs))
+		iso_debugfs = debugfs_create_file("iso", 0444, bt_debugfs,
+						  NULL, &iso_debugfs_fops);
+
+	inited = true;
+
+	return 0;
+
+unregister_sock:
+	bt_sock_unregister(BTPROTO_ISO);
+unregister_proto:
+	proto_unregister(&iso_proto);
+	return err;
+}
+
+/* Tear down the ISO socket layer set up by iso_init().
+ *
+ * Removes the procfs and debugfs entries, unhooks the HCI callbacks and
+ * unregisters the socket family and the protocol, in that order.
+ *
+ * Returns 0 on success or -EALREADY if the layer is not initialized.
+ */
+int iso_exit(void)
+{
+ if (!inited)
+ return -EALREADY;
+
+ bt_procfs_cleanup(&init_net, "iso");
+
+ debugfs_remove(iso_debugfs);
+ /* Do not keep a stale dentry around for a later iso_init() */
+ iso_debugfs = NULL;
+
+ hci_unregister_cb(&iso_cb);
+
+ bt_sock_unregister(BTPROTO_ISO);
+
+ proto_unregister(&iso_proto);
+
+ inited = false;
+
+ return 0;
+}
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index d17a4736e47c..07b493331fd7 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -39,21 +39,17 @@
#include <net/bluetooth/l2cap.h>
#include "smp.h"
-#include "a2mp.h"
-#include "amp.h"
#define LE_FLOWCTL_MAX_CREDITS 65535
bool disable_ertm;
+bool enable_ecred = IS_ENABLED(CONFIG_BT_LE_L2CAP_ECRED);
static u32 l2cap_feat_mask = L2CAP_FEAT_FIXED_CHAN | L2CAP_FEAT_UCD;
static LIST_HEAD(chan_list);
static DEFINE_RWLOCK(chan_list_lock);
-static u16 le_max_credits = L2CAP_LE_MAX_CREDITS;
-static u16 le_default_mps = L2CAP_LE_DEFAULT_MPS;
-
static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn,
u8 code, u8 ident, u16 dlen, void *data);
static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len,
@@ -63,6 +59,9 @@ static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err);
static void l2cap_tx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
struct sk_buff_head *skbs, u8 event);
+static void l2cap_retrans_timeout(struct work_struct *work);
+static void l2cap_monitor_timeout(struct work_struct *work);
+static void l2cap_ack_timeout(struct work_struct *work);
static inline u8 bdaddr_type(u8 link_type, u8 bdaddr_type)
{
@@ -113,34 +112,39 @@ static struct l2cap_chan *__l2cap_get_chan_by_scid(struct l2cap_conn *conn,
}
/* Find channel with given SCID.
- * Returns locked channel. */
+ * Returns the channel locked and with a reference held.
+ */
static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn,
u16 cid)
{
struct l2cap_chan *c;
- mutex_lock(&conn->chan_lock);
c = __l2cap_get_chan_by_scid(conn, cid);
- if (c)
- l2cap_chan_lock(c);
- mutex_unlock(&conn->chan_lock);
+ if (c) {
+ /* Only lock if chan reference is not 0 */
+ c = l2cap_chan_hold_unless_zero(c);
+ if (c)
+ l2cap_chan_lock(c);
+ }
return c;
}
/* Find channel with given DCID.
- * Returns locked channel.
+ * Returns the channel locked and with a reference held.
*/
static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn,
u16 cid)
{
struct l2cap_chan *c;
- mutex_lock(&conn->chan_lock);
c = __l2cap_get_chan_by_dcid(conn, cid);
- if (c)
- l2cap_chan_lock(c);
- mutex_unlock(&conn->chan_lock);
+ if (c) {
+ /* Only lock if chan reference is not 0 */
+ c = l2cap_chan_hold_unless_zero(c);
+ if (c)
+ l2cap_chan_lock(c);
+ }
return c;
}
@@ -157,25 +161,18 @@ static struct l2cap_chan *__l2cap_get_chan_by_ident(struct l2cap_conn *conn,
return NULL;
}
-static struct l2cap_chan *l2cap_get_chan_by_ident(struct l2cap_conn *conn,
- u8 ident)
+static struct l2cap_chan *__l2cap_global_chan_by_addr(__le16 psm, bdaddr_t *src,
+ u8 src_type)
{
struct l2cap_chan *c;
- mutex_lock(&conn->chan_lock);
- c = __l2cap_get_chan_by_ident(conn, ident);
- if (c)
- l2cap_chan_lock(c);
- mutex_unlock(&conn->chan_lock);
-
- return c;
-}
+ list_for_each_entry(c, &chan_list, global_l) {
+ if (src_type == BDADDR_BREDR && c->src_type != BDADDR_BREDR)
+ continue;
-static struct l2cap_chan *__l2cap_global_chan_by_addr(__le16 psm, bdaddr_t *src)
-{
- struct l2cap_chan *c;
+ if (src_type != BDADDR_BREDR && c->src_type == BDADDR_BREDR)
+ continue;
- list_for_each_entry(c, &chan_list, global_l) {
if (c->sport == psm && !bacmp(&c->src, src))
return c;
}
@@ -188,7 +185,7 @@ int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm)
write_lock(&chan_list_lock);
- if (psm && __l2cap_global_chan_by_addr(psm, src)) {
+ if (psm && __l2cap_global_chan_by_addr(psm, src, chan->src_type)) {
err = -EADDRINUSE;
goto done;
}
@@ -212,7 +209,8 @@ int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm)
err = -EINVAL;
for (p = start; p <= end; p += incr)
- if (!__l2cap_global_chan_by_addr(cpu_to_le16(p), src)) {
+ if (!__l2cap_global_chan_by_addr(cpu_to_le16(p), src,
+ chan->src_type)) {
chan->psm = cpu_to_le16(p);
chan->sport = cpu_to_le16(p);
err = 0;
@@ -413,7 +411,13 @@ static void l2cap_chan_timeout(struct work_struct *work)
BT_DBG("chan %p state %s", chan, state_to_string(chan->state));
- mutex_lock(&conn->chan_lock);
+ if (!conn)
+ return;
+
+ mutex_lock(&conn->lock);
+ /* __set_chan_timer() calls l2cap_chan_hold(chan) while scheduling
+ * this work. No need to call l2cap_chan_hold(chan) here again.
+ */
l2cap_chan_lock(chan);
if (chan->state == BT_CONNECTED || chan->state == BT_CONFIG)
@@ -426,12 +430,12 @@ static void l2cap_chan_timeout(struct work_struct *work)
l2cap_chan_close(chan, reason);
- l2cap_chan_unlock(chan);
-
chan->ops->close(chan);
- mutex_unlock(&conn->chan_lock);
+ l2cap_chan_unlock(chan);
l2cap_chan_put(chan);
+
+ mutex_unlock(&conn->lock);
}
struct l2cap_chan *l2cap_chan_create(void)
@@ -442,16 +446,24 @@ struct l2cap_chan *l2cap_chan_create(void)
if (!chan)
return NULL;
+ skb_queue_head_init(&chan->tx_q);
+ skb_queue_head_init(&chan->srej_q);
mutex_init(&chan->lock);
/* Set default lock nesting level */
atomic_set(&chan->nesting, L2CAP_NESTING_NORMAL);
+ /* Available receive buffer space is initially unknown */
+ chan->rx_avail = -1;
+
write_lock(&chan_list_lock);
list_add(&chan->global_l, &chan_list);
write_unlock(&chan_list_lock);
INIT_DELAYED_WORK(&chan->chan_timer, l2cap_chan_timeout);
+ INIT_DELAYED_WORK(&chan->retrans_timer, l2cap_retrans_timeout);
+ INIT_DELAYED_WORK(&chan->monitor_timer, l2cap_monitor_timeout);
+ INIT_DELAYED_WORK(&chan->ack_timer, l2cap_ack_timeout);
chan->state = BT_OPEN;
@@ -481,14 +493,25 @@ static void l2cap_chan_destroy(struct kref *kref)
void l2cap_chan_hold(struct l2cap_chan *c)
{
- BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));
+ BT_DBG("chan %p orig refcnt %u", c, kref_read(&c->kref));
kref_get(&c->kref);
}
+EXPORT_SYMBOL_GPL(l2cap_chan_hold);
+
+/* Take a reference on @c unless its refcount has already dropped to zero.
+ * Returns @c when the reference was taken and NULL otherwise.
+ */
+struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c)
+{
+	BT_DBG("chan %p orig refcnt %u", c, kref_read(&c->kref));
+
+	return kref_get_unless_zero(&c->kref) ? c : NULL;
+}
void l2cap_chan_put(struct l2cap_chan *c)
{
- BT_DBG("chan %p orig refcnt %d", c, kref_read(&c->kref));
+ BT_DBG("chan %p orig refcnt %u", c, kref_read(&c->kref));
kref_put(&c->kref, l2cap_chan_destroy);
}
@@ -507,24 +530,60 @@ void l2cap_chan_set_defaults(struct l2cap_chan *chan)
chan->flush_to = L2CAP_DEFAULT_FLUSH_TO;
chan->retrans_timeout = L2CAP_DEFAULT_RETRANS_TO;
chan->monitor_timeout = L2CAP_DEFAULT_MONITOR_TO;
+
chan->conf_state = 0;
+ set_bit(CONF_NOT_COMPLETE, &chan->conf_state);
set_bit(FLAG_FORCE_ACTIVE, &chan->flags);
}
EXPORT_SYMBOL_GPL(l2cap_chan_set_defaults);
-static void l2cap_le_flowctl_init(struct l2cap_chan *chan)
+/* Compute how many receive credits @chan should offer to the peer.
+ *
+ * Returns 0 when the MPS is 0 or when the known receive space is already
+ * used up by the SDU under reassembly.
+ */
+static __u16 l2cap_le_rx_credits(struct l2cap_chan *chan)
+{
+	size_t sdu_len = 0;
+
+	/* Bytes of the partially reassembled SDU, if there is one */
+	if (chan->sdu)
+		sdu_len = chan->sdu->len;
+
+	if (chan->mps == 0)
+		return 0;
+
+	/* If we don't know the available space in the receiver buffer, give
+	 * enough credits for a full packet.
+	 */
+	if (chan->rx_avail == -1)
+		return (chan->imtu / chan->mps) + 1;
+
+	/* If we know how much space is available in the receive buffer, give
+	 * out as many credits as would fill the buffer.
+	 */
+	if (chan->rx_avail > sdu_len)
+		return DIV_ROUND_UP(chan->rx_avail - sdu_len, chan->mps);
+
+	return 0;
+}
+
+static void l2cap_le_flowctl_init(struct l2cap_chan *chan, u16 tx_credits)
{
chan->sdu = NULL;
chan->sdu_last_frag = NULL;
chan->sdu_len = 0;
- chan->tx_credits = 0;
- chan->rx_credits = le_max_credits;
- chan->mps = min_t(u16, chan->imtu, le_default_mps);
+ chan->tx_credits = tx_credits;
+ /* Derive MPS from connection MTU to stop HCI fragmentation */
+ chan->mps = min_t(u16, chan->imtu, chan->conn->mtu - L2CAP_HDR_SIZE);
+ chan->rx_credits = l2cap_le_rx_credits(chan);
skb_queue_head_init(&chan->tx_q);
}
+/* Initialize @chan for enhanced credit based flow control: run the LE flow
+ * control setup with @tx_credits, then raise the MPS to the ECRED minimum
+ * if the derived value is below it and recompute the receive credits.
+ */
+static void l2cap_ecred_init(struct l2cap_chan *chan, u16 tx_credits)
+{
+	l2cap_le_flowctl_init(chan, tx_credits);
+
+	if (chan->mps >= L2CAP_ECRED_MIN_MPS)
+		return;
+
+	/* L2CAP implementations shall support a minimum MPS of 64 octets */
+	chan->mps = L2CAP_ECRED_MIN_MPS;
+	chan->rx_credits = l2cap_le_rx_credits(chan);
+}
+
void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan)
{
BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn,
@@ -574,14 +633,15 @@ void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan)
test_bit(FLAG_HOLD_HCI_CONN, &chan->flags))
hci_conn_hold(conn->hcon);
- list_add(&chan->list, &conn->chan_l);
+ /* Append to the list since the order matters for ECRED */
+ list_add_tail(&chan->list, &conn->chan_l);
}
void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan)
{
- mutex_lock(&conn->chan_lock);
+ mutex_lock(&conn->lock);
__l2cap_chan_add(conn, chan);
- mutex_unlock(&conn->chan_lock);
+ mutex_unlock(&conn->lock);
}
void l2cap_chan_del(struct l2cap_chan *chan, int err)
@@ -596,7 +656,6 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err)
chan->ops->teardown(chan, err);
if (conn) {
- struct amp_mgr *mgr = conn->hcon->amp_mgr;
/* Delete from channel list */
list_del(&chan->list);
@@ -611,26 +670,17 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err)
if (chan->chan_type != L2CAP_CHAN_FIXED ||
test_bit(FLAG_HOLD_HCI_CONN, &chan->flags))
hci_conn_drop(conn->hcon);
-
- if (mgr && mgr->bredr_chan == chan)
- mgr->bredr_chan = NULL;
- }
-
- if (chan->hs_hchan) {
- struct hci_chan *hs_hchan = chan->hs_hchan;
-
- BT_DBG("chan %p disconnect hs_hchan %p", chan, hs_hchan);
- amp_disconnect_logical_link(hs_hchan);
}
if (test_bit(CONF_NOT_COMPLETE, &chan->conf_state))
return;
- switch(chan->mode) {
+ switch (chan->mode) {
case L2CAP_MODE_BASIC:
break;
case L2CAP_MODE_LE_FLOWCTL:
+ case L2CAP_MODE_EXT_FLOWCTL:
skb_queue_purge(&chan->tx_q);
break;
@@ -643,26 +693,57 @@ void l2cap_chan_del(struct l2cap_chan *chan, int err)
l2cap_seq_list_free(&chan->srej_list);
l2cap_seq_list_free(&chan->retrans_list);
-
- /* fall through */
+ fallthrough;
case L2CAP_MODE_STREAMING:
skb_queue_purge(&chan->tx_q);
break;
}
-
- return;
}
EXPORT_SYMBOL_GPL(l2cap_chan_del);
+/* Call @func(chan, @data) for every channel of @conn whose ident equals @id.
+ * The walk uses the _safe iterator, so @func may unlink the channel it is
+ * handed. This helper takes no lock itself.
+ */
+static void __l2cap_chan_list_id(struct l2cap_conn *conn, u16 id,
+				 l2cap_chan_func_t func, void *data)
+{
+	struct l2cap_chan *chan, *next;
+
+	list_for_each_entry_safe(chan, next, &conn->chan_l, list) {
+		if (chan->ident != id)
+			continue;
+
+		func(chan, data);
+	}
+}
+
+/* Call @func(chan, @data) for every channel on @conn's channel list, in list
+ * order. This helper takes no lock itself.
+ */
+static void __l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func,
+			      void *data)
+{
+	struct l2cap_chan *c;
+
+	list_for_each_entry(c, &conn->chan_l, list)
+		func(c, data);
+}
+
+/* Locked variant of __l2cap_chan_list(): run @func on every channel of @conn
+ * while holding conn->lock. A NULL @conn is accepted and ignored.
+ */
+void l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func,
+		     void *data)
+{
+	if (conn) {
+		mutex_lock(&conn->lock);
+		__l2cap_chan_list(conn, func, data);
+		mutex_unlock(&conn->lock);
+	}
+}
+
+EXPORT_SYMBOL_GPL(l2cap_chan_list);
+
static void l2cap_conn_update_id_addr(struct work_struct *work)
{
struct l2cap_conn *conn = container_of(work, struct l2cap_conn,
- id_addr_update_work);
+ id_addr_timer.work);
struct hci_conn *hcon = conn->hcon;
struct l2cap_chan *chan;
- mutex_lock(&conn->chan_lock);
+ mutex_lock(&conn->lock);
list_for_each_entry(chan, &conn->chan_l, list) {
l2cap_chan_lock(chan);
@@ -671,7 +752,7 @@ static void l2cap_conn_update_id_addr(struct work_struct *work)
l2cap_chan_unlock(chan);
}
- mutex_unlock(&conn->chan_lock);
+ mutex_unlock(&conn->lock);
}
static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan)
@@ -681,9 +762,9 @@ static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan)
u16 result;
if (test_bit(FLAG_DEFER_SETUP, &chan->flags))
- result = L2CAP_CR_AUTHORIZATION;
+ result = L2CAP_CR_LE_AUTHORIZATION;
else
- result = L2CAP_CR_BAD_PSM;
+ result = L2CAP_CR_LE_BAD_PSM;
l2cap_state_change(chan, BT_DISCONN);
@@ -697,6 +778,13 @@ static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan)
&rsp);
}
+/* Reject path for a channel in L2CAP_MODE_EXT_FLOWCTL: move @chan to
+ * BT_DISCONN and hand it to __l2cap_ecred_conn_rsp_defer().
+ * NOTE(review): presumably that helper sends the deferred ECRED connection
+ * response covering this channel - confirm against its definition.
+ */
+static void l2cap_chan_ecred_connect_reject(struct l2cap_chan *chan)
+{
+ l2cap_state_change(chan, BT_DISCONN);
+
+ __l2cap_ecred_conn_rsp_defer(chan);
+}
+
static void l2cap_chan_connect_reject(struct l2cap_chan *chan)
{
struct l2cap_conn *conn = chan->conn;
@@ -742,8 +830,16 @@ void l2cap_chan_close(struct l2cap_chan *chan, int reason)
if (chan->chan_type == L2CAP_CHAN_CONN_ORIENTED) {
if (conn->hcon->type == ACL_LINK)
l2cap_chan_connect_reject(chan);
- else if (conn->hcon->type == LE_LINK)
- l2cap_chan_le_connect_reject(chan);
+ else if (conn->hcon->type == LE_LINK) {
+ switch (chan->mode) {
+ case L2CAP_MODE_LE_FLOWCTL:
+ l2cap_chan_le_connect_reject(chan);
+ break;
+ case L2CAP_MODE_EXT_FLOWCTL:
+ l2cap_chan_ecred_connect_reject(chan);
+ return;
+ }
+ }
}
l2cap_chan_del(chan, reason);
@@ -797,7 +893,8 @@ static inline u8 l2cap_get_auth_type(struct l2cap_chan *chan)
else
return HCI_AT_NO_BONDING;
}
- /* fall through */
+ fallthrough;
+
default:
switch (chan->sec_level) {
case BT_SECURITY_HIGH:
@@ -849,6 +946,16 @@ static u8 l2cap_get_ident(struct l2cap_conn *conn)
return id;
}
+/* Send @skb over @conn's HCI channel with the given ACL @flags. The skb is
+ * consumed either way: it is freed instead of sent when the underlying hcon
+ * is no longer valid.
+ */
+static void l2cap_send_acl(struct l2cap_conn *conn, struct sk_buff *skb,
+			   u8 flags)
+{
+	/* Check that the hcon is still valid before attempting to send */
+	if (!hci_conn_valid(conn->hcon->hdev, conn->hcon)) {
+		kfree_skb(skb);
+		return;
+	}
+
+	hci_send_acl(conn->hchan, skb, flags);
+}
+
static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len,
void *data)
{
@@ -871,13 +978,7 @@ static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len,
bt_cb(skb)->force_active = BT_POWER_FORCE_ACTIVE_ON;
skb->priority = HCI_PRIO_MAX;
- hci_send_acl(conn->hchan, skb, flags);
-}
-
-static bool __chan_is_moving(struct l2cap_chan *chan)
-{
- return chan->move_state != L2CAP_MOVE_STABLE &&
- chan->move_state != L2CAP_MOVE_WAIT_PREPARE;
+ l2cap_send_acl(conn, skb, flags);
}
static void l2cap_do_send(struct l2cap_chan *chan, struct sk_buff *skb)
@@ -888,15 +989,6 @@ static void l2cap_do_send(struct l2cap_chan *chan, struct sk_buff *skb)
BT_DBG("chan %p, skb %p len %d priority %u", chan, skb, skb->len,
skb->priority);
- if (chan->hs_hcon && !__chan_is_moving(chan)) {
- if (chan->hs_hchan)
- hci_send_acl(chan->hs_hchan, skb, ACL_COMPLETE);
- else
- kfree_skb(skb);
-
- return;
- }
-
/* Use NO_FLUSH for LE links (where this is the only option) or
* if the BR/EDR link supports it and flushing has not been
* explicitly requested (through FLAG_FLUSHABLE).
@@ -1077,9 +1169,6 @@ static void l2cap_send_sframe(struct l2cap_chan *chan,
if (!control->sframe)
return;
- if (__chan_is_moving(chan))
- return;
-
if (test_and_clear_bit(CONN_SEND_FBIT, &chan->conn_state) &&
!control->poll)
control->final = 1;
@@ -1134,40 +1223,6 @@ static inline int __l2cap_no_conn_pending(struct l2cap_chan *chan)
return !test_bit(CONF_CONNECT_PEND, &chan->conf_state);
}
-static bool __amp_capable(struct l2cap_chan *chan)
-{
- struct l2cap_conn *conn = chan->conn;
- struct hci_dev *hdev;
- bool amp_available = false;
-
- if (!(conn->local_fixed_chan & L2CAP_FC_A2MP))
- return false;
-
- if (!(conn->remote_fixed_chan & L2CAP_FC_A2MP))
- return false;
-
- read_lock(&hci_dev_list_lock);
- list_for_each_entry(hdev, &hci_dev_list, list) {
- if (hdev->amp_type != AMP_TYPE_BREDR &&
- test_bit(HCI_UP, &hdev->flags)) {
- amp_available = true;
- break;
- }
- }
- read_unlock(&hci_dev_list_lock);
-
- if (chan->chan_policy == BT_CHANNEL_POLICY_AMP_PREFERRED)
- return amp_available;
-
- return false;
-}
-
-static bool l2cap_check_efs(struct l2cap_chan *chan)
-{
- /* Check EFS parameters */
- return true;
-}
-
void l2cap_send_conn_req(struct l2cap_chan *chan)
{
struct l2cap_conn *conn = chan->conn;
@@ -1183,76 +1238,6 @@ void l2cap_send_conn_req(struct l2cap_chan *chan)
l2cap_send_cmd(conn, chan->ident, L2CAP_CONN_REQ, sizeof(req), &req);
}
-static void l2cap_send_create_chan_req(struct l2cap_chan *chan, u8 amp_id)
-{
- struct l2cap_create_chan_req req;
- req.scid = cpu_to_le16(chan->scid);
- req.psm = chan->psm;
- req.amp_id = amp_id;
-
- chan->ident = l2cap_get_ident(chan->conn);
-
- l2cap_send_cmd(chan->conn, chan->ident, L2CAP_CREATE_CHAN_REQ,
- sizeof(req), &req);
-}
-
-static void l2cap_move_setup(struct l2cap_chan *chan)
-{
- struct sk_buff *skb;
-
- BT_DBG("chan %p", chan);
-
- if (chan->mode != L2CAP_MODE_ERTM)
- return;
-
- __clear_retrans_timer(chan);
- __clear_monitor_timer(chan);
- __clear_ack_timer(chan);
-
- chan->retry_count = 0;
- skb_queue_walk(&chan->tx_q, skb) {
- if (bt_cb(skb)->l2cap.retries)
- bt_cb(skb)->l2cap.retries = 1;
- else
- break;
- }
-
- chan->expected_tx_seq = chan->buffer_seq;
-
- clear_bit(CONN_REJ_ACT, &chan->conn_state);
- clear_bit(CONN_SREJ_ACT, &chan->conn_state);
- l2cap_seq_list_clear(&chan->retrans_list);
- l2cap_seq_list_clear(&chan->srej_list);
- skb_queue_purge(&chan->srej_q);
-
- chan->tx_state = L2CAP_TX_STATE_XMIT;
- chan->rx_state = L2CAP_RX_STATE_MOVE;
-
- set_bit(CONN_REMOTE_BUSY, &chan->conn_state);
-}
-
-static void l2cap_move_done(struct l2cap_chan *chan)
-{
- u8 move_role = chan->move_role;
- BT_DBG("chan %p", chan);
-
- chan->move_state = L2CAP_MOVE_STABLE;
- chan->move_role = L2CAP_MOVE_ROLE_NONE;
-
- if (chan->mode != L2CAP_MODE_ERTM)
- return;
-
- switch (move_role) {
- case L2CAP_MOVE_ROLE_INITIATOR:
- l2cap_tx(chan, NULL, NULL, L2CAP_EV_EXPLICIT_POLL);
- chan->rx_state = L2CAP_RX_STATE_WAIT_F;
- break;
- case L2CAP_MOVE_ROLE_RESPONDER:
- chan->rx_state = L2CAP_RX_STATE_WAIT_P;
- break;
- }
-}
-
static void l2cap_chan_ready(struct l2cap_chan *chan)
{
/* The channel may have already been flagged as connected in
@@ -1266,8 +1251,13 @@ static void l2cap_chan_ready(struct l2cap_chan *chan)
chan->conf_state = 0;
__clear_chan_timer(chan);
- if (chan->mode == L2CAP_MODE_LE_FLOWCTL && !chan->tx_credits)
- chan->ops->suspend(chan);
+ switch (chan->mode) {
+ case L2CAP_MODE_LE_FLOWCTL:
+ case L2CAP_MODE_EXT_FLOWCTL:
+ if (!chan->tx_credits)
+ chan->ops->suspend(chan);
+ break;
+ }
chan->state = BT_CONNECTED;
@@ -1282,6 +1272,12 @@ static void l2cap_le_connect(struct l2cap_chan *chan)
if (test_and_set_bit(FLAG_LE_CONN_REQ_SENT, &chan->flags))
return;
+ if (!chan->imtu)
+ chan->imtu = chan->conn->mtu;
+
+ l2cap_le_flowctl_init(chan, 0);
+
+ memset(&req, 0, sizeof(req));
req.psm = chan->psm;
req.scid = cpu_to_le16(chan->scid);
req.mtu = cpu_to_le16(chan->imtu);
@@ -1294,6 +1290,81 @@ static void l2cap_le_connect(struct l2cap_chan *chan)
sizeof(req), &req);
}
+/* Scratch state used while building one L2CAP_ECRED_CONN_REQ; it is shared
+ * between l2cap_ecred_connect() and its per-channel callback
+ * l2cap_ecred_defer_connect().
+ */
+struct l2cap_ecred_conn_data {
+ /* Request as sent on the wire: header followed by up to 5 source CIDs */
+ struct {
+ struct l2cap_ecred_conn_req_hdr req;
+ __le16 scid[5];
+ } __packed pdu;
+ struct l2cap_chan *chan; /* channel that initiated the request */
+ struct pid *pid; /* peer pid of the initiating channel */
+ int count; /* number of scid[] entries filled in so far */
+};
+
+/* l2cap_chan_func_t callback run by l2cap_ecred_connect() on every channel
+ * of the connection: add @chan to the request under construction if it is
+ * a deferred ECRED channel belonging to the same peer pid and PSM as the
+ * initiating channel.
+ *
+ * @chan: candidate channel
+ * @data: the struct l2cap_ecred_conn_data being filled in
+ *
+ * A channel that is accepted gets the initiator's ident, so the response
+ * can be matched to it, and its SCID is appended to pdu.scid[].
+ */
+static void l2cap_ecred_defer_connect(struct l2cap_chan *chan, void *data)
+{
+	struct l2cap_ecred_conn_data *conn = data;
+	struct pid *pid;
+
+	if (chan == conn->chan)
+		return;
+
+	/* The request only has room for a fixed number of SCIDs. Once it is
+	 * full, leave any further channel untouched (still deferred) rather
+	 * than writing past the end of pdu.scid[]. This is checked before
+	 * FLAG_DEFER_SETUP is cleared so a skipped channel keeps its state.
+	 */
+	if (conn->count >= (int)ARRAY_SIZE(conn->pdu.scid))
+		return;
+
+	if (!test_and_clear_bit(FLAG_DEFER_SETUP, &chan->flags))
+		return;
+
+	pid = chan->ops->get_peer_pid(chan);
+
+	/* Only add deferred channels with the same PID/PSM */
+	if (conn->pid != pid || chan->psm != conn->chan->psm || chan->ident ||
+	    chan->mode != L2CAP_MODE_EXT_FLOWCTL || chan->state != BT_CONNECT)
+		return;
+
+	if (test_and_set_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags))
+		return;
+
+	l2cap_ecred_init(chan, 0);
+
+	/* Set the same ident so we can match on the rsp */
+	chan->ident = conn->chan->ident;
+
+	/* Include all channels deferred */
+	conn->pdu.scid[conn->count] = cpu_to_le16(chan->scid);
+
+	conn->count++;
+}
+
+/* Send an L2CAP_ECRED_CONN_REQ for @chan.
+ *
+ * Nothing is sent while @chan still has FLAG_DEFER_SETUP set, or if a
+ * request was already sent for it. Otherwise the request is built from
+ * @chan's PSM, MTU, MPS and receive credits, and l2cap_ecred_defer_connect()
+ * is run over every channel of the connection so that it can append
+ * further source CIDs to the same request.
+ */
+static void l2cap_ecred_connect(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_ecred_conn_data data;
+
+ if (test_bit(FLAG_DEFER_SETUP, &chan->flags))
+ return;
+
+ if (test_and_set_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags))
+ return;
+
+ l2cap_ecred_init(chan, 0);
+
+ /* Zero the whole structure, including the scid[] slots left unused */
+ memset(&data, 0, sizeof(data));
+ data.pdu.req.psm = chan->psm;
+ data.pdu.req.mtu = cpu_to_le16(chan->imtu);
+ data.pdu.req.mps = cpu_to_le16(chan->mps);
+ data.pdu.req.credits = cpu_to_le16(chan->rx_credits);
+ data.pdu.scid[0] = cpu_to_le16(chan->scid);
+
+ /* Must be assigned before the list walk: the callback copies it */
+ chan->ident = l2cap_get_ident(conn);
+
+ data.count = 1;
+ data.chan = chan;
+ data.pid = chan->ops->get_peer_pid(chan);
+
+ __l2cap_chan_list(conn, l2cap_ecred_defer_connect, &data);
+
+ /* Only the header and the scid[] entries actually filled in are sent */
+ l2cap_send_cmd(conn, chan->ident, L2CAP_ECRED_CONN_REQ,
+ sizeof(data.pdu.req) + data.count * sizeof(__le16),
+ &data.pdu);
+}
+
static void l2cap_le_start(struct l2cap_chan *chan)
{
struct l2cap_conn *conn = chan->conn;
@@ -1306,16 +1377,17 @@ static void l2cap_le_start(struct l2cap_chan *chan)
return;
}
- if (chan->state == BT_CONNECT)
- l2cap_le_connect(chan);
+ if (chan->state == BT_CONNECT) {
+ if (chan->mode == L2CAP_MODE_EXT_FLOWCTL)
+ l2cap_ecred_connect(chan);
+ else
+ l2cap_le_connect(chan);
+ }
}
static void l2cap_start_connection(struct l2cap_chan *chan)
{
- if (__amp_capable(chan)) {
- BT_DBG("chan %p AMP capable: discover AMPs", chan);
- a2mp_discover_amp(chan);
- } else if (chan->conn->hcon->type == LE_LINK) {
+ if (chan->conn->hcon->type == LE_LINK) {
l2cap_le_start(chan);
} else {
l2cap_send_conn_req(chan);
@@ -1340,6 +1412,28 @@ static void l2cap_request_info(struct l2cap_conn *conn)
sizeof(req), &req);
}
+/* Check whether @hcon's encryption key is long enough for @chan.
+ *
+ * Returns true when the link is not encrypted at all or when its key size
+ * reaches the required minimum, false otherwise.
+ */
+static bool l2cap_check_enc_key_size(struct hci_conn *hcon,
+				     struct l2cap_chan *chan)
+{
+	/* The minimum encryption key size needs to be enforced by the
+	 * host stack before establishing any L2CAP connections. The
+	 * specification in theory allows a minimum of 1, but to align
+	 * BR/EDR and LE transports, a minimum of 7 is chosen.
+	 */
+	int min_key_size = hcon->hdev->min_enc_key_size;
+
+	/* On FIPS security level, key size must be 16 bytes */
+	if (chan->sec_level == BT_SECURITY_FIPS)
+		min_key_size = 16;
+
+	/* This check might also be called for unencrypted connections
+	 * that have no key size requirements. Ensure that the link is
+	 * actually encrypted before enforcing a key size.
+	 */
+	if (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags))
+		return true;
+
+	return hcon->enc_key_size >= min_key_size;
+}
+
static void l2cap_do_start(struct l2cap_chan *chan)
{
struct l2cap_conn *conn = chan->conn;
@@ -1357,9 +1451,14 @@ static void l2cap_do_start(struct l2cap_chan *chan)
if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE))
return;
- if (l2cap_chan_check_security(chan, true) &&
- __l2cap_no_conn_pending(chan))
+ if (!l2cap_chan_check_security(chan, true) ||
+ !__l2cap_no_conn_pending(chan))
+ return;
+
+ if (l2cap_check_enc_key_size(conn->hcon, chan))
l2cap_start_connection(chan);
+ else
+ __set_chan_timer(chan, L2CAP_DISC_TIMEOUT);
}
static inline int l2cap_mode_supported(__u8 mode, __u32 feat_mask)
@@ -1392,11 +1491,6 @@ static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err)
__clear_ack_timer(chan);
}
- if (chan->scid == L2CAP_CID_A2MP) {
- l2cap_state_change(chan, BT_DISCONN);
- return;
- }
-
req.dcid = cpu_to_le16(chan->dcid);
req.scid = cpu_to_le16(chan->scid);
l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_DISCONN_REQ,
@@ -1412,8 +1506,6 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
BT_DBG("conn %p", conn);
- mutex_lock(&conn->chan_lock);
-
list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) {
l2cap_chan_lock(chan);
@@ -1438,7 +1530,10 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
continue;
}
- l2cap_start_connection(chan);
+ if (l2cap_check_enc_key_size(conn->hcon, chan))
+ l2cap_start_connection(chan);
+ else
+ l2cap_chan_close(chan, ECONNREFUSED);
} else if (chan->state == BT_CONNECT2) {
struct l2cap_conn_rsp rsp;
@@ -1479,8 +1574,6 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
l2cap_chan_unlock(chan);
}
-
- mutex_unlock(&conn->chan_lock);
}
static void l2cap_le_conn_ready(struct l2cap_conn *conn)
@@ -1496,8 +1589,8 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn)
if (hcon->out)
smp_conn_security(hcon, hcon->pending_sec_level);
- /* For LE slave connections, make sure the connection interval
- * is in the range of the minium and maximum interval that has
+ /* For LE peripheral connections, make sure the connection interval
+ * is in the range of the minimum and maximum interval that has
* been configured for this connection. If not, then trigger
* the connection update procedure.
*/
@@ -1526,17 +1619,12 @@ static void l2cap_conn_ready(struct l2cap_conn *conn)
if (hcon->type == ACL_LINK)
l2cap_request_info(conn);
- mutex_lock(&conn->chan_lock);
+ mutex_lock(&conn->lock);
list_for_each_entry(chan, &conn->chan_l, list) {
l2cap_chan_lock(chan);
- if (chan->scid == L2CAP_CID_A2MP) {
- l2cap_chan_unlock(chan);
- continue;
- }
-
if (hcon->type == LE_LINK) {
l2cap_le_start(chan);
} else if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) {
@@ -1549,7 +1637,7 @@ static void l2cap_conn_ready(struct l2cap_conn *conn)
l2cap_chan_unlock(chan);
}
- mutex_unlock(&conn->chan_lock);
+ mutex_unlock(&conn->lock);
if (hcon->type == LE_LINK)
l2cap_le_conn_ready(conn);
@@ -1564,14 +1652,10 @@ static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err)
BT_DBG("conn %p", conn);
- mutex_lock(&conn->chan_lock);
-
list_for_each_entry(chan, &conn->chan_l, list) {
if (test_bit(FLAG_FORCE_RELIABLE, &chan->flags))
l2cap_chan_set_err(chan, err);
}
-
- mutex_unlock(&conn->chan_lock);
}
static void l2cap_info_timeout(struct work_struct *work)
@@ -1582,7 +1666,9 @@ static void l2cap_info_timeout(struct work_struct *work)
conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
conn->info_ident = 0;
+ mutex_lock(&conn->lock);
l2cap_conn_start(conn);
+ mutex_unlock(&conn->lock);
}
/*
@@ -1674,6 +1760,8 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err)
BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
+ mutex_lock(&conn->lock);
+
kfree_skb(conn->rx_skb);
skb_queue_purge(&conn->pending_rx);
@@ -1685,16 +1773,13 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err)
if (work_pending(&conn->pending_rx_work))
cancel_work_sync(&conn->pending_rx_work);
- if (work_pending(&conn->id_addr_update_work))
- cancel_work_sync(&conn->id_addr_update_work);
+ cancel_delayed_work_sync(&conn->id_addr_timer);
l2cap_unregister_all_users(conn);
/* Force the connection to be immediately dropped */
hcon->disc_timeout = 0;
- mutex_lock(&conn->chan_lock);
-
/* Kill channels */
list_for_each_entry_safe(chan, l, &conn->chan_l, list) {
l2cap_chan_hold(chan);
@@ -1702,21 +1787,20 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err)
l2cap_chan_del(chan, err);
- l2cap_chan_unlock(chan);
-
chan->ops->close(chan);
+
+ l2cap_chan_unlock(chan);
l2cap_chan_put(chan);
}
- mutex_unlock(&conn->chan_lock);
-
- hci_chan_del(conn->hchan);
-
if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT)
cancel_delayed_work_sync(&conn->info_timer);
- hcon->l2cap_data = NULL;
+ hci_chan_del(conn->hchan);
conn->hchan = NULL;
+
+ hcon->l2cap_data = NULL;
+ mutex_unlock(&conn->lock);
l2cap_conn_put(conn);
}
@@ -1751,11 +1835,11 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm,
bdaddr_t *dst,
u8 link_type)
{
- struct l2cap_chan *c, *c1 = NULL;
+ struct l2cap_chan *c, *tmp, *c1 = NULL;
read_lock(&chan_list_lock);
- list_for_each_entry(c, &chan_list, global_l) {
+ list_for_each_entry_safe(c, tmp, &chan_list, global_l) {
if (state && c->state != state)
continue;
@@ -1765,7 +1849,7 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm,
if (link_type == LE_LINK && c->src_type == BDADDR_BREDR)
continue;
- if (c->psm == psm) {
+ if (c->chan_type != L2CAP_CHAN_FIXED && c->psm == psm) {
int src_match, dst_match;
int src_any, dst_any;
@@ -1773,7 +1857,9 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm,
src_match = !bacmp(&c->src, src);
dst_match = !bacmp(&c->dst, dst);
if (src_match && dst_match) {
- l2cap_chan_hold(c);
+ if (!l2cap_chan_hold_unless_zero(c))
+ continue;
+
read_unlock(&chan_list_lock);
return c;
}
@@ -1788,7 +1874,7 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm,
}
if (c1)
- l2cap_chan_hold(c1);
+ c1 = l2cap_chan_hold_unless_zero(c1);
read_unlock(&chan_list_lock);
@@ -1844,9 +1930,6 @@ static void l2cap_streaming_send(struct l2cap_chan *chan,
BT_DBG("chan %p, skbs %p", chan, skbs);
- if (__chan_is_moving(chan))
- return;
-
skb_queue_splice_tail_init(skbs, &chan->tx_q);
while (!skb_queue_empty(&chan->tx_q)) {
@@ -1889,9 +1972,6 @@ static int l2cap_ertm_send(struct l2cap_chan *chan)
if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state))
return 0;
- if (__chan_is_moving(chan))
- return 0;
-
while (chan->tx_send_head &&
chan->unacked_frames < chan->remote_tx_win &&
chan->tx_state == L2CAP_TX_STATE_XMIT) {
@@ -1957,9 +2037,6 @@ static void l2cap_ertm_resend(struct l2cap_chan *chan)
if (test_bit(CONN_REMOTE_BUSY, &chan->conn_state))
return;
- if (__chan_is_moving(chan))
- return;
-
while (chan->retrans_list.head != L2CAP_SEQ_LIST_CLEAR) {
seq = l2cap_seq_list_pop(&chan->retrans_list);
@@ -2299,8 +2376,7 @@ static int l2cap_segment_sdu(struct l2cap_chan *chan,
pdu_len = chan->conn->mtu;
/* Constrain PDU size for BR/EDR connections */
- if (!chan->hs_hcon)
- pdu_len = min_t(size_t, pdu_len, L2CAP_BREDR_MAX_PAYLOAD);
+ pdu_len = min_t(size_t, pdu_len, L2CAP_BREDR_MAX_PAYLOAD);
/* Adjust for largest possible L2CAP overhead. */
if (chan->fcs)
@@ -2441,7 +2517,33 @@ static void l2cap_le_flowctl_send(struct l2cap_chan *chan)
skb_queue_len(&chan->tx_q));
}
-int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len)
+static void l2cap_tx_timestamp(struct sk_buff *skb,
+ const struct sockcm_cookie *sockc,
+ size_t len)
+{
+ struct sock *sk = skb ? skb->sk : NULL;
+
+ if (sk && sk->sk_type == SOCK_STREAM)
+ hci_setup_tx_timestamp(skb, len, sockc);
+ else
+ hci_setup_tx_timestamp(skb, 1, sockc);
+}
+
+static void l2cap_tx_timestamp_seg(struct sk_buff_head *queue,
+ const struct sockcm_cookie *sockc,
+ size_t len)
+{
+ struct sk_buff *skb = skb_peek(queue);
+ struct sock *sk = skb ? skb->sk : NULL;
+
+ if (sk && sk->sk_type == SOCK_STREAM)
+ l2cap_tx_timestamp(skb_peek_tail(queue), sockc, len);
+ else
+ l2cap_tx_timestamp(skb, sockc, len);
+}
+
+int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len,
+ const struct sockcm_cookie *sockc)
{
struct sk_buff *skb;
int err;
@@ -2456,13 +2558,7 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len)
if (IS_ERR(skb))
return PTR_ERR(skb);
- /* Channel lock is released before requesting new skb and then
- * reacquired thus we need to recheck channel state.
- */
- if (chan->state != BT_CONNECTED) {
- kfree_skb(skb);
- return -ENOTCONN;
- }
+ l2cap_tx_timestamp(skb, sockc, len);
l2cap_do_send(chan, skb);
return len;
@@ -2470,6 +2566,7 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len)
switch (chan->mode) {
case L2CAP_MODE_LE_FLOWCTL:
+ case L2CAP_MODE_EXT_FLOWCTL:
/* Check outgoing MTU */
if (len > chan->omtu)
return -EMSGSIZE;
@@ -2486,6 +2583,8 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len)
if (err)
return err;
+ l2cap_tx_timestamp_seg(&seg_queue, sockc, len);
+
skb_queue_splice_tail_init(&seg_queue, &chan->tx_q);
l2cap_le_flowctl_send(chan);
@@ -2507,13 +2606,7 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len)
if (IS_ERR(skb))
return PTR_ERR(skb);
- /* Channel lock is released before requesting new skb and then
- * reacquired thus we need to recheck channel state.
- */
- if (chan->state != BT_CONNECTED) {
- kfree_skb(skb);
- return -ENOTCONN;
- }
+ l2cap_tx_timestamp(skb, sockc, len);
l2cap_do_send(chan, skb);
err = len;
@@ -2535,21 +2628,16 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len)
*/
err = l2cap_segment_sdu(chan, &seg_queue, msg, len);
- /* The channel could have been closed while segmenting,
- * check that it is still connected.
- */
- if (chan->state != BT_CONNECTED) {
- __skb_queue_purge(&seg_queue);
- err = -ENOTCONN;
- }
-
if (err)
break;
- if (chan->mode == L2CAP_MODE_ERTM)
+ if (chan->mode == L2CAP_MODE_ERTM) {
+ /* TODO: ERTM mode timestamping */
l2cap_tx(chan, NULL, &seg_queue, L2CAP_EV_DATA_REQUEST);
- else
+ } else {
+ l2cap_tx_timestamp_seg(&seg_queue, sockc, len);
l2cap_streaming_send(chan, &seg_queue);
+ }
err = len;
@@ -2795,8 +2883,7 @@ static void l2cap_tx_state_wait_f(struct l2cap_chan *chan,
break;
case L2CAP_EV_RECV_REQSEQ_AND_FBIT:
l2cap_process_reqseq(chan, control->reqseq);
-
- /* Fall through */
+ fallthrough;
case L2CAP_EV_RECV_FBIT:
if (control && control->final) {
@@ -2866,8 +2953,6 @@ static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb)
BT_DBG("conn %p", conn);
- mutex_lock(&conn->chan_lock);
-
list_for_each_entry(chan, &conn->chan_l, list) {
if (chan->chan_type != L2CAP_CHAN_RAW)
continue;
@@ -2882,8 +2967,6 @@ static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb)
if (chan->ops->recv(chan, nskb))
kfree_skb(nskb);
}
-
- mutex_unlock(&conn->chan_lock);
}
/* ---- L2CAP signalling commands ---- */
@@ -3088,21 +3171,12 @@ int l2cap_ertm_init(struct l2cap_chan *chan)
skb_queue_head_init(&chan->tx_q);
- chan->local_amp_id = AMP_ID_BREDR;
- chan->move_id = AMP_ID_BREDR;
- chan->move_state = L2CAP_MOVE_STABLE;
- chan->move_role = L2CAP_MOVE_ROLE_NONE;
-
if (chan->mode != L2CAP_MODE_ERTM)
return 0;
chan->rx_state = L2CAP_RX_STATE_RECV;
chan->tx_state = L2CAP_TX_STATE_XMIT;
- INIT_DELAYED_WORK(&chan->retrans_timer, l2cap_retrans_timeout);
- INIT_DELAYED_WORK(&chan->monitor_timer, l2cap_monitor_timeout);
- INIT_DELAYED_WORK(&chan->ack_timer, l2cap_ack_timeout);
-
skb_queue_head_init(&chan->srej_q);
err = l2cap_seq_list_init(&chan->srej_list, chan->tx_win);
@@ -3123,7 +3197,7 @@ static inline __u8 l2cap_select_mode(__u8 mode, __u16 remote_feat_mask)
case L2CAP_MODE_ERTM:
if (l2cap_mode_supported(mode, remote_feat_mask))
return mode;
- /* fall through */
+ fallthrough;
default:
return L2CAP_MODE_BASIC;
}
@@ -3131,52 +3205,19 @@ static inline __u8 l2cap_select_mode(__u8 mode, __u16 remote_feat_mask)
static inline bool __l2cap_ews_supported(struct l2cap_conn *conn)
{
- return ((conn->local_fixed_chan & L2CAP_FC_A2MP) &&
- (conn->feat_mask & L2CAP_FEAT_EXT_WINDOW));
+ return (conn->feat_mask & L2CAP_FEAT_EXT_WINDOW);
}
static inline bool __l2cap_efs_supported(struct l2cap_conn *conn)
{
- return ((conn->local_fixed_chan & L2CAP_FC_A2MP) &&
- (conn->feat_mask & L2CAP_FEAT_EXT_FLOW));
+ return (conn->feat_mask & L2CAP_FEAT_EXT_FLOW);
}
static void __l2cap_set_ertm_timeouts(struct l2cap_chan *chan,
struct l2cap_conf_rfc *rfc)
{
- if (chan->local_amp_id != AMP_ID_BREDR && chan->hs_hcon) {
- u64 ertm_to = chan->hs_hcon->hdev->amp_be_flush_to;
-
- /* Class 1 devices have must have ERTM timeouts
- * exceeding the Link Supervision Timeout. The
- * default Link Supervision Timeout for AMP
- * controllers is 10 seconds.
- *
- * Class 1 devices use 0xffffffff for their
- * best-effort flush timeout, so the clamping logic
- * will result in a timeout that meets the above
- * requirement. ERTM timeouts are 16-bit values, so
- * the maximum timeout is 65.535 seconds.
- */
-
- /* Convert timeout to milliseconds and round */
- ertm_to = DIV_ROUND_UP_ULL(ertm_to, 1000);
-
- /* This is the recommended formula for class 2 devices
- * that start ERTM timers when packets are sent to the
- * controller.
- */
- ertm_to = 3 * ertm_to + 500;
-
- if (ertm_to > 0xffff)
- ertm_to = 0xffff;
-
- rfc->retrans_timeout = cpu_to_le16((u16) ertm_to);
- rfc->monitor_timeout = rfc->retrans_timeout;
- } else {
- rfc->retrans_timeout = cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO);
- rfc->monitor_timeout = cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO);
- }
+ rfc->retrans_timeout = cpu_to_le16(L2CAP_DEFAULT_RETRANS_TO);
+ rfc->monitor_timeout = cpu_to_le16(L2CAP_DEFAULT_MONITOR_TO);
}
static inline void l2cap_txwin_setup(struct l2cap_chan *chan)
@@ -3194,6 +3235,49 @@ static inline void l2cap_txwin_setup(struct l2cap_chan *chan)
chan->ack_win = chan->tx_win;
}
+static void l2cap_mtu_auto(struct l2cap_chan *chan)
+{
+ struct hci_conn *conn = chan->conn->hcon;
+
+ chan->imtu = L2CAP_DEFAULT_MIN_MTU;
+
+ /* The 2-DH1 packet has between 2 and 56 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_2DH1))
+ chan->imtu = 54;
+
+ /* The 3-DH1 packet has between 2 and 85 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_3DH1))
+ chan->imtu = 83;
+
+ /* The 2-DH3 packet has between 2 and 369 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_2DH3))
+ chan->imtu = 367;
+
+ /* The 3-DH3 packet has between 2 and 554 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_3DH3))
+ chan->imtu = 552;
+
+ /* The 2-DH5 packet has between 2 and 681 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_2DH5))
+ chan->imtu = 679;
+
+ /* The 3-DH5 packet has between 2 and 1023 information bytes
+ * (including the 2-byte payload header)
+ */
+ if (!(conn->pkt_type & HCI_3DH5))
+ chan->imtu = 1021;
+}
+
static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size)
{
struct l2cap_conf_req *req = data;
@@ -3216,15 +3300,19 @@ static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data
if (__l2cap_efs_supported(chan->conn))
set_bit(FLAG_EFS_ENABLE, &chan->flags);
- /* fall through */
+ fallthrough;
default:
chan->mode = l2cap_select_mode(rfc.mode, chan->conn->feat_mask);
break;
}
done:
- if (chan->imtu != L2CAP_DEFAULT_MTU)
- l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr);
+ if (chan->imtu != L2CAP_DEFAULT_MTU) {
+ if (!chan->imtu)
+ l2cap_mtu_auto(chan);
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu,
+ endptr - ptr);
+ }
switch (chan->mode) {
case L2CAP_MODE_BASIC:
@@ -3328,7 +3416,7 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data
struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC };
struct l2cap_conf_efs efs;
u8 remote_efs = 0;
- u16 mtu = L2CAP_DEFAULT_MTU;
+ u16 mtu = 0;
u16 result = L2CAP_CONF_SUCCESS;
u16 size;
@@ -3336,16 +3424,22 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data
while (len >= L2CAP_CONF_OPT_SIZE) {
len -= l2cap_get_conf_opt(&req, &type, &olen, &val);
+ if (len < 0)
+ break;
hint = type & L2CAP_CONF_HINT;
type &= L2CAP_CONF_MASK;
switch (type) {
case L2CAP_CONF_MTU:
+ if (olen != 2)
+ break;
mtu = val;
break;
case L2CAP_CONF_FLUSH_TO:
+ if (olen != 2)
+ break;
chan->flush_to = val;
break;
@@ -3353,38 +3447,35 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data
break;
case L2CAP_CONF_RFC:
- if (olen == sizeof(rfc))
- memcpy(&rfc, (void *) val, olen);
+ if (olen != sizeof(rfc))
+ break;
+ memcpy(&rfc, (void *) val, olen);
break;
case L2CAP_CONF_FCS:
+ if (olen != 1)
+ break;
if (val == L2CAP_FCS_NONE)
set_bit(CONF_RECV_NO_FCS, &chan->conf_state);
break;
case L2CAP_CONF_EFS:
- if (olen == sizeof(efs)) {
- remote_efs = 1;
- memcpy(&efs, (void *) val, olen);
- }
+ if (olen != sizeof(efs))
+ break;
+ remote_efs = 1;
+ memcpy(&efs, (void *) val, olen);
break;
case L2CAP_CONF_EWS:
- if (!(chan->conn->local_fixed_chan & L2CAP_FC_A2MP))
- return -ECONNREFUSED;
-
- set_bit(FLAG_EXT_CTRL, &chan->flags);
- set_bit(CONF_EWS_RECV, &chan->conf_state);
- chan->tx_win_max = L2CAP_DEFAULT_EXT_WINDOW;
- chan->remote_tx_win = val;
- break;
+ if (olen != 2)
+ break;
+ return -ECONNREFUSED;
default:
if (hint)
break;
-
result = L2CAP_CONF_UNKNOWN;
- *((u8 *) ptr++) = type;
+ l2cap_add_conf_opt(&ptr, (u8)type, sizeof(u8), type, endptr - ptr);
break;
}
}
@@ -3430,6 +3521,29 @@ done:
/* Configure output options and let the other side know
* which ones we don't like. */
+ /* If MTU is not provided in configure request, try adjusting it
+ * to the current output MTU if it has been set
+ *
+ * Bluetooth Core 6.1, Vol 3, Part A, Section 4.5
+ *
+ * Each configuration parameter value (if any is present) in an
+ * L2CAP_CONFIGURATION_RSP packet reflects an ‘adjustment’ to a
+ * configuration parameter value that has been sent (or, in case
+ * of default values, implied) in the corresponding
+ * L2CAP_CONFIGURATION_REQ packet.
+ */
+ if (!mtu) {
+ /* Only adjust for ERTM channels as for older modes the
+ * remote stack may not be able to detect the
+ * adjustment, causing it to silently drop packets.
+ */
+ if (chan->mode == L2CAP_MODE_ERTM &&
+ chan->omtu && chan->omtu != L2CAP_DEFAULT_MTU)
+ mtu = chan->omtu;
+ else
+ mtu = L2CAP_DEFAULT_MTU;
+ }
+
if (mtu < L2CAP_DEFAULT_MIN_MTU)
result = L2CAP_CONF_UNACCEPT;
else {
@@ -3485,7 +3599,8 @@ done:
l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
sizeof(rfc), (unsigned long) &rfc, endptr - ptr);
- if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) {
+ if (remote_efs &&
+ test_bit(FLAG_EFS_ENABLE, &chan->flags)) {
chan->remote_id = efs.id;
chan->remote_stype = efs.stype;
chan->remote_msdu = le16_to_cpu(efs.msdu);
@@ -3547,58 +3662,65 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len,
while (len >= L2CAP_CONF_OPT_SIZE) {
len -= l2cap_get_conf_opt(&rsp, &type, &olen, &val);
+ if (len < 0)
+ break;
switch (type) {
case L2CAP_CONF_MTU:
+ if (olen != 2)
+ break;
if (val < L2CAP_DEFAULT_MIN_MTU) {
*result = L2CAP_CONF_UNACCEPT;
chan->imtu = L2CAP_DEFAULT_MIN_MTU;
} else
chan->imtu = val;
- l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr);
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu,
+ endptr - ptr);
break;
case L2CAP_CONF_FLUSH_TO:
+ if (olen != 2)
+ break;
chan->flush_to = val;
- l2cap_add_conf_opt(&ptr, L2CAP_CONF_FLUSH_TO,
- 2, chan->flush_to, endptr - ptr);
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_FLUSH_TO, 2,
+ chan->flush_to, endptr - ptr);
break;
case L2CAP_CONF_RFC:
- if (olen == sizeof(rfc))
- memcpy(&rfc, (void *)val, olen);
-
+ if (olen != sizeof(rfc))
+ break;
+ memcpy(&rfc, (void *)val, olen);
if (test_bit(CONF_STATE2_DEVICE, &chan->conf_state) &&
rfc.mode != chan->mode)
return -ECONNREFUSED;
-
chan->fcs = 0;
-
- l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC,
- sizeof(rfc), (unsigned long) &rfc, endptr - ptr);
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc),
+ (unsigned long) &rfc, endptr - ptr);
break;
case L2CAP_CONF_EWS:
+ if (olen != 2)
+ break;
chan->ack_win = min_t(u16, val, chan->ack_win);
l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2,
chan->tx_win, endptr - ptr);
break;
case L2CAP_CONF_EFS:
- if (olen == sizeof(efs)) {
- memcpy(&efs, (void *)val, olen);
-
- if (chan->local_stype != L2CAP_SERV_NOTRAFIC &&
- efs.stype != L2CAP_SERV_NOTRAFIC &&
- efs.stype != chan->local_stype)
- return -ECONNREFUSED;
-
- l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs),
- (unsigned long) &efs, endptr - ptr);
- }
+ if (olen != sizeof(efs))
+ break;
+ memcpy(&efs, (void *)val, olen);
+ if (chan->local_stype != L2CAP_SERV_NOTRAFIC &&
+ efs.stype != L2CAP_SERV_NOTRAFIC &&
+ efs.stype != chan->local_stype)
+ return -ECONNREFUSED;
+ l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs),
+ (unsigned long) &efs, endptr - ptr);
break;
case L2CAP_CONF_FCS:
+ if (olen != 1)
+ break;
if (*result == L2CAP_CONF_PENDING)
if (val == L2CAP_FCS_NONE)
set_bit(CONF_RECV_NO_FCS,
@@ -3669,12 +3791,100 @@ void __l2cap_le_connect_rsp_defer(struct l2cap_chan *chan)
rsp.mtu = cpu_to_le16(chan->imtu);
rsp.mps = cpu_to_le16(chan->mps);
rsp.credits = cpu_to_le16(chan->rx_credits);
- rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
+ rsp.result = cpu_to_le16(L2CAP_CR_LE_SUCCESS);
l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CONN_RSP, sizeof(rsp),
&rsp);
}
+static void l2cap_ecred_list_defer(struct l2cap_chan *chan, void *data)
+{
+ int *result = data;
+
+ if (*result || test_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags))
+ return;
+
+ switch (chan->state) {
+ case BT_CONNECT2:
+ /* If channel still pending accept add to result */
+ (*result)++;
+ return;
+ case BT_CONNECTED:
+ return;
+ default:
+ /* If not connected or pending accept it has been refused */
+ *result = -ECONNREFUSED;
+ return;
+ }
+}
+
+struct l2cap_ecred_rsp_data {
+ struct {
+ struct l2cap_ecred_conn_rsp_hdr rsp;
+ __le16 scid[L2CAP_ECRED_MAX_CID];
+ } __packed pdu;
+ int count;
+};
+
+static void l2cap_ecred_rsp_defer(struct l2cap_chan *chan, void *data)
+{
+ struct l2cap_ecred_rsp_data *rsp = data;
+ struct l2cap_ecred_conn_rsp *rsp_flex =
+ container_of(&rsp->pdu.rsp, struct l2cap_ecred_conn_rsp, hdr);
+
+ /* Check if the channel is for an outgoing connection or if it wasn't
+ * deferred, since in those cases it must be skipped.
+ */
+ if (test_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags) ||
+ !test_and_clear_bit(FLAG_DEFER_SETUP, &chan->flags))
+ return;
+
+ /* Reset ident so only one response is sent */
+ chan->ident = 0;
+
+ /* Include all channels pending with the same ident */
+ if (!rsp->pdu.rsp.result)
+ rsp_flex->dcid[rsp->count++] = cpu_to_le16(chan->scid);
+ else
+ l2cap_chan_del(chan, ECONNRESET);
+}
+
+void __l2cap_ecred_conn_rsp_defer(struct l2cap_chan *chan)
+{
+ struct l2cap_conn *conn = chan->conn;
+ struct l2cap_ecred_rsp_data data;
+ u16 id = chan->ident;
+ int result = 0;
+
+ if (!id)
+ return;
+
+ BT_DBG("chan %p id %d", chan, id);
+
+ memset(&data, 0, sizeof(data));
+
+ data.pdu.rsp.mtu = cpu_to_le16(chan->imtu);
+ data.pdu.rsp.mps = cpu_to_le16(chan->mps);
+ data.pdu.rsp.credits = cpu_to_le16(chan->rx_credits);
+ data.pdu.rsp.result = cpu_to_le16(L2CAP_CR_LE_SUCCESS);
+
+ /* Verify that all channels are ready */
+ __l2cap_chan_list_id(conn, id, l2cap_ecred_list_defer, &result);
+
+ if (result > 0)
+ return;
+
+ if (result < 0)
+ data.pdu.rsp.result = cpu_to_le16(L2CAP_CR_LE_AUTHORIZATION);
+
+ /* Build response */
+ __l2cap_chan_list_id(conn, id, l2cap_ecred_rsp_defer, &data);
+
+ l2cap_send_cmd(conn, id, L2CAP_ECRED_CONN_RSP,
+ sizeof(data.pdu.rsp) + (data.count * sizeof(__le16)),
+ &data.pdu);
+}
+
void __l2cap_connect_rsp_defer(struct l2cap_chan *chan)
{
struct l2cap_conn_rsp rsp;
@@ -3686,11 +3896,7 @@ void __l2cap_connect_rsp_defer(struct l2cap_chan *chan)
rsp.dcid = cpu_to_le16(chan->scid);
rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
-
- if (chan->hs_hcon)
- rsp_code = L2CAP_CREATE_CHAN_RSP;
- else
- rsp_code = L2CAP_CONN_RSP;
+ rsp_code = L2CAP_CONN_RSP;
BT_DBG("chan %p rsp_code %u", chan, rsp_code);
@@ -3727,13 +3933,18 @@ static void l2cap_conf_rfc_get(struct l2cap_chan *chan, void *rsp, int len)
while (len >= L2CAP_CONF_OPT_SIZE) {
len -= l2cap_get_conf_opt(&rsp, &type, &olen, &val);
+ if (len < 0)
+ break;
switch (type) {
case L2CAP_CONF_RFC:
- if (olen == sizeof(rfc))
- memcpy(&rfc, (void *)val, olen);
+ if (olen != sizeof(rfc))
+ break;
+ memcpy(&rfc, (void *)val, olen);
break;
case L2CAP_CONF_EWS:
+ if (olen != 2)
+ break;
txwin_ext = val;
break;
}
@@ -3780,13 +3991,12 @@ static inline int l2cap_command_rej(struct l2cap_conn *conn,
return 0;
}
-static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn,
- struct l2cap_cmd_hdr *cmd,
- u8 *data, u8 rsp_code, u8 amp_id)
+static void l2cap_connect(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd,
+ u8 *data, u8 rsp_code)
{
struct l2cap_conn_req *req = (struct l2cap_conn_req *) data;
struct l2cap_conn_rsp rsp;
- struct l2cap_chan *chan = NULL, *pchan;
+ struct l2cap_chan *chan = NULL, *pchan = NULL;
int result, status = L2CAP_CS_NO_INFO;
u16 dcid = 0, scid = __le16_to_cpu(req->scid);
@@ -3799,15 +4009,15 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn,
&conn->hcon->dst, ACL_LINK);
if (!pchan) {
result = L2CAP_CR_BAD_PSM;
- goto sendresp;
+ goto response;
}
- mutex_lock(&conn->chan_lock);
l2cap_chan_lock(pchan);
/* Check if the ACL is secure enough (if not SDP) */
if (psm != cpu_to_le16(L2CAP_PSM_SDP) &&
- !hci_conn_check_link_mode(conn->hcon)) {
+ (!hci_conn_check_link_mode(conn->hcon) ||
+ !l2cap_check_enc_key_size(conn->hcon, pchan))) {
conn->disc_reason = HCI_ERROR_AUTH_FAILURE;
result = L2CAP_CR_SEC_BLOCK;
goto response;
@@ -3815,9 +4025,17 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn,
result = L2CAP_CR_NO_MEM;
+ /* Check for valid dynamic CID range (as per Erratum 3253) */
+ if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_DYN_END) {
+ result = L2CAP_CR_INVALID_SCID;
+ goto response;
+ }
+
/* Check if we already have channel with that dcid */
- if (__l2cap_get_chan_by_dcid(conn, scid))
+ if (__l2cap_get_chan_by_dcid(conn, scid)) {
+ result = L2CAP_CR_SCID_IN_USE;
goto response;
+ }
chan = pchan->ops->new_connection(pchan);
if (!chan)
@@ -3836,7 +4054,6 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn,
chan->dst_type = bdaddr_dst_type(conn->hcon);
chan->psm = psm;
chan->dcid = scid;
- chan->local_amp_id = amp_id;
__l2cap_chan_add(conn, chan);
@@ -3854,17 +4071,8 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn,
status = L2CAP_CS_AUTHOR_PEND;
chan->ops->defer(chan);
} else {
- /* Force pending result for AMP controllers.
- * The connection will succeed after the
- * physical link is up.
- */
- if (amp_id == AMP_ID_BREDR) {
- l2cap_state_change(chan, BT_CONFIG);
- result = L2CAP_CR_SUCCESS;
- } else {
- l2cap_state_change(chan, BT_CONNECT2);
- result = L2CAP_CR_PEND;
- }
+ l2cap_state_change(chan, BT_CONFIG);
+ result = L2CAP_CR_SUCCESS;
status = L2CAP_CS_NO_INFO;
}
} else {
@@ -3879,17 +4087,15 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn,
}
response:
- l2cap_chan_unlock(pchan);
- mutex_unlock(&conn->chan_lock);
- l2cap_chan_put(pchan);
-
-sendresp:
rsp.scid = cpu_to_le16(scid);
rsp.dcid = cpu_to_le16(dcid);
rsp.result = cpu_to_le16(result);
rsp.status = cpu_to_le16(status);
l2cap_send_cmd(conn, cmd->ident, rsp_code, sizeof(rsp), &rsp);
+ if (!pchan)
+ return;
+
if (result == L2CAP_CR_PEND && status == L2CAP_CS_NO_INFO) {
struct l2cap_info_req info;
info.type = cpu_to_le16(L2CAP_IT_FEAT_MASK);
@@ -3912,25 +4118,17 @@ sendresp:
chan->num_conf_req++;
}
- return chan;
+ l2cap_chan_unlock(pchan);
+ l2cap_chan_put(pchan);
}
static int l2cap_connect_req(struct l2cap_conn *conn,
struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data)
{
- struct hci_dev *hdev = conn->hcon->hdev;
- struct hci_conn *hcon = conn->hcon;
-
if (cmd_len < sizeof(struct l2cap_conn_req))
return -EPROTO;
- hci_dev_lock(hdev);
- if (hci_dev_test_flag(hdev, HCI_MGMT) &&
- !test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &hcon->flags))
- mgmt_device_connected(hdev, hcon, 0, NULL, 0);
- hci_dev_unlock(hdev);
-
- l2cap_connect(conn, cmd, data, L2CAP_CONN_RSP, 0);
+ l2cap_connect(conn, cmd, data, L2CAP_CONN_RSP);
return 0;
}
@@ -3952,31 +4150,38 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn,
result = __le16_to_cpu(rsp->result);
status = __le16_to_cpu(rsp->status);
+ if (result == L2CAP_CR_SUCCESS && (dcid < L2CAP_CID_DYN_START ||
+ dcid > L2CAP_CID_DYN_END))
+ return -EPROTO;
+
BT_DBG("dcid 0x%4.4x scid 0x%4.4x result 0x%2.2x status 0x%2.2x",
dcid, scid, result, status);
- mutex_lock(&conn->chan_lock);
-
if (scid) {
chan = __l2cap_get_chan_by_scid(conn, scid);
- if (!chan) {
- err = -EBADSLT;
- goto unlock;
- }
+ if (!chan)
+ return -EBADSLT;
} else {
chan = __l2cap_get_chan_by_ident(conn, cmd->ident);
- if (!chan) {
- err = -EBADSLT;
- goto unlock;
- }
+ if (!chan)
+ return -EBADSLT;
}
+ chan = l2cap_chan_hold_unless_zero(chan);
+ if (!chan)
+ return -EBADSLT;
+
err = 0;
l2cap_chan_lock(chan);
switch (result) {
case L2CAP_CR_SUCCESS:
+ if (__l2cap_get_chan_by_dcid(conn, dcid)) {
+ err = -EBADSLT;
+ break;
+ }
+
l2cap_state_change(chan, BT_CONFIG);
chan->ident = 0;
chan->dcid = dcid;
@@ -4000,9 +4205,7 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn,
}
l2cap_chan_unlock(chan);
-
-unlock:
- mutex_unlock(&conn->chan_lock);
+ l2cap_chan_put(chan);
return err;
}
@@ -4070,7 +4273,8 @@ static inline int l2cap_config_req(struct l2cap_conn *conn,
return 0;
}
- if (chan->state != BT_CONFIG && chan->state != BT_CONNECT2) {
+ if (chan->state != BT_CONFIG && chan->state != BT_CONNECT2 &&
+ chan->state != BT_CONNECTED) {
cmd_reject_invalid_cid(conn, cmd->ident, chan->scid,
chan->dcid);
goto unlock;
@@ -4106,7 +4310,8 @@ static inline int l2cap_config_req(struct l2cap_conn *conn,
chan->ident = cmd->ident;
l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP, len, rsp);
- chan->num_conf_rsp++;
+ if (chan->num_conf_rsp < L2CAP_CONF_MAX_CONF_RSP)
+ chan->num_conf_rsp++;
/* Reset config buffer. */
chan->conf_len = 0;
@@ -4144,14 +4349,12 @@ static inline int l2cap_config_req(struct l2cap_conn *conn,
/* check compatibility */
/* Send rsp for BR/EDR channel */
- if (!chan->hs_hcon)
- l2cap_send_efs_conf_rsp(chan, rsp, cmd->ident, flags);
- else
- chan->ident = cmd->ident;
+ l2cap_send_efs_conf_rsp(chan, rsp, cmd->ident, flags);
}
unlock:
l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
return err;
}
@@ -4198,18 +4401,11 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn,
goto done;
}
- if (!chan->hs_hcon) {
- l2cap_send_efs_conf_rsp(chan, buf, cmd->ident,
- 0);
- } else {
- if (l2cap_check_efs(chan)) {
- amp_create_logical_link(chan);
- chan->ident = cmd->ident;
- }
- }
+ l2cap_send_efs_conf_rsp(chan, buf, cmd->ident, 0);
}
goto done;
+ case L2CAP_CONF_UNKNOWN:
case L2CAP_CONF_UNACCEPT:
if (chan->num_conf_rsp <= L2CAP_CONF_MAX_CONF_RSP) {
char req[64];
@@ -4235,6 +4431,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn,
goto done;
break;
}
+ fallthrough;
default:
l2cap_chan_set_err(chan, ECONNRESET);
@@ -4264,6 +4461,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn,
done:
l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
return err;
}
@@ -4284,32 +4482,24 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn,
BT_DBG("scid 0x%4.4x dcid 0x%4.4x", scid, dcid);
- mutex_lock(&conn->chan_lock);
-
- chan = __l2cap_get_chan_by_scid(conn, dcid);
+ chan = l2cap_get_chan_by_scid(conn, dcid);
if (!chan) {
- mutex_unlock(&conn->chan_lock);
cmd_reject_invalid_cid(conn, cmd->ident, dcid, scid);
return 0;
}
- l2cap_chan_lock(chan);
-
rsp.dcid = cpu_to_le16(chan->scid);
rsp.scid = cpu_to_le16(chan->dcid);
l2cap_send_cmd(conn, cmd->ident, L2CAP_DISCONN_RSP, sizeof(rsp), &rsp);
chan->ops->set_shutdown(chan);
- l2cap_chan_hold(chan);
l2cap_chan_del(chan, ECONNRESET);
- l2cap_chan_unlock(chan);
-
chan->ops->close(chan);
- l2cap_chan_put(chan);
- mutex_unlock(&conn->chan_lock);
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
return 0;
}
@@ -4330,25 +4520,23 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn,
BT_DBG("dcid 0x%4.4x scid 0x%4.4x", dcid, scid);
- mutex_lock(&conn->chan_lock);
-
- chan = __l2cap_get_chan_by_scid(conn, scid);
+ chan = l2cap_get_chan_by_scid(conn, scid);
if (!chan) {
- mutex_unlock(&conn->chan_lock);
return 0;
}
- l2cap_chan_lock(chan);
+ if (chan->state != BT_DISCONN) {
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+ return 0;
+ }
- l2cap_chan_hold(chan);
l2cap_chan_del(chan, 0);
- l2cap_chan_unlock(chan);
-
chan->ops->close(chan);
- l2cap_chan_put(chan);
- mutex_unlock(&conn->chan_lock);
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
return 0;
}
@@ -4376,9 +4564,6 @@ static inline int l2cap_information_req(struct l2cap_conn *conn,
if (!disable_ertm)
feat_mask |= L2CAP_FEAT_ERTM | L2CAP_FEAT_STREAMING
| L2CAP_FEAT_FCS;
- if (conn->local_fixed_chan & L2CAP_FC_A2MP)
- feat_mask |= L2CAP_FEAT_EXT_FLOW
- | L2CAP_FEAT_EXT_WINDOW;
put_unaligned_le32(feat_mask, rsp->data);
l2cap_send_cmd(conn, cmd->ident, L2CAP_INFO_RSP, sizeof(buf),
@@ -4467,749 +4652,6 @@ static inline int l2cap_information_rsp(struct l2cap_conn *conn,
return 0;
}
-static int l2cap_create_channel_req(struct l2cap_conn *conn,
- struct l2cap_cmd_hdr *cmd,
- u16 cmd_len, void *data)
-{
- struct l2cap_create_chan_req *req = data;
- struct l2cap_create_chan_rsp rsp;
- struct l2cap_chan *chan;
- struct hci_dev *hdev;
- u16 psm, scid;
-
- if (cmd_len != sizeof(*req))
- return -EPROTO;
-
- if (!(conn->local_fixed_chan & L2CAP_FC_A2MP))
- return -EINVAL;
-
- psm = le16_to_cpu(req->psm);
- scid = le16_to_cpu(req->scid);
-
- BT_DBG("psm 0x%2.2x, scid 0x%4.4x, amp_id %d", psm, scid, req->amp_id);
-
- /* For controller id 0 make BR/EDR connection */
- if (req->amp_id == AMP_ID_BREDR) {
- l2cap_connect(conn, cmd, data, L2CAP_CREATE_CHAN_RSP,
- req->amp_id);
- return 0;
- }
-
- /* Validate AMP controller id */
- hdev = hci_dev_get(req->amp_id);
- if (!hdev)
- goto error;
-
- if (hdev->dev_type != HCI_AMP || !test_bit(HCI_UP, &hdev->flags)) {
- hci_dev_put(hdev);
- goto error;
- }
-
- chan = l2cap_connect(conn, cmd, data, L2CAP_CREATE_CHAN_RSP,
- req->amp_id);
- if (chan) {
- struct amp_mgr *mgr = conn->hcon->amp_mgr;
- struct hci_conn *hs_hcon;
-
- hs_hcon = hci_conn_hash_lookup_ba(hdev, AMP_LINK,
- &conn->hcon->dst);
- if (!hs_hcon) {
- hci_dev_put(hdev);
- cmd_reject_invalid_cid(conn, cmd->ident, chan->scid,
- chan->dcid);
- return 0;
- }
-
- BT_DBG("mgr %p bredr_chan %p hs_hcon %p", mgr, chan, hs_hcon);
-
- mgr->bredr_chan = chan;
- chan->hs_hcon = hs_hcon;
- chan->fcs = L2CAP_FCS_NONE;
- conn->mtu = hdev->block_mtu;
- }
-
- hci_dev_put(hdev);
-
- return 0;
-
-error:
- rsp.dcid = 0;
- rsp.scid = cpu_to_le16(scid);
- rsp.result = cpu_to_le16(L2CAP_CR_BAD_AMP);
- rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
-
- l2cap_send_cmd(conn, cmd->ident, L2CAP_CREATE_CHAN_RSP,
- sizeof(rsp), &rsp);
-
- return 0;
-}
-
-static void l2cap_send_move_chan_req(struct l2cap_chan *chan, u8 dest_amp_id)
-{
- struct l2cap_move_chan_req req;
- u8 ident;
-
- BT_DBG("chan %p, dest_amp_id %d", chan, dest_amp_id);
-
- ident = l2cap_get_ident(chan->conn);
- chan->ident = ident;
-
- req.icid = cpu_to_le16(chan->scid);
- req.dest_amp_id = dest_amp_id;
-
- l2cap_send_cmd(chan->conn, ident, L2CAP_MOVE_CHAN_REQ, sizeof(req),
- &req);
-
- __set_chan_timer(chan, L2CAP_MOVE_TIMEOUT);
-}
-
-static void l2cap_send_move_chan_rsp(struct l2cap_chan *chan, u16 result)
-{
- struct l2cap_move_chan_rsp rsp;
-
- BT_DBG("chan %p, result 0x%4.4x", chan, result);
-
- rsp.icid = cpu_to_le16(chan->dcid);
- rsp.result = cpu_to_le16(result);
-
- l2cap_send_cmd(chan->conn, chan->ident, L2CAP_MOVE_CHAN_RSP,
- sizeof(rsp), &rsp);
-}
-
-static void l2cap_send_move_chan_cfm(struct l2cap_chan *chan, u16 result)
-{
- struct l2cap_move_chan_cfm cfm;
-
- BT_DBG("chan %p, result 0x%4.4x", chan, result);
-
- chan->ident = l2cap_get_ident(chan->conn);
-
- cfm.icid = cpu_to_le16(chan->scid);
- cfm.result = cpu_to_le16(result);
-
- l2cap_send_cmd(chan->conn, chan->ident, L2CAP_MOVE_CHAN_CFM,
- sizeof(cfm), &cfm);
-
- __set_chan_timer(chan, L2CAP_MOVE_TIMEOUT);
-}
-
-static void l2cap_send_move_chan_cfm_icid(struct l2cap_conn *conn, u16 icid)
-{
- struct l2cap_move_chan_cfm cfm;
-
- BT_DBG("conn %p, icid 0x%4.4x", conn, icid);
-
- cfm.icid = cpu_to_le16(icid);
- cfm.result = cpu_to_le16(L2CAP_MC_UNCONFIRMED);
-
- l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_MOVE_CHAN_CFM,
- sizeof(cfm), &cfm);
-}
-
-static void l2cap_send_move_chan_cfm_rsp(struct l2cap_conn *conn, u8 ident,
- u16 icid)
-{
- struct l2cap_move_chan_cfm_rsp rsp;
-
- BT_DBG("icid 0x%4.4x", icid);
-
- rsp.icid = cpu_to_le16(icid);
- l2cap_send_cmd(conn, ident, L2CAP_MOVE_CHAN_CFM_RSP, sizeof(rsp), &rsp);
-}
-
-static void __release_logical_link(struct l2cap_chan *chan)
-{
- chan->hs_hchan = NULL;
- chan->hs_hcon = NULL;
-
- /* Placeholder - release the logical link */
-}
-
-static void l2cap_logical_fail(struct l2cap_chan *chan)
-{
- /* Logical link setup failed */
- if (chan->state != BT_CONNECTED) {
- /* Create channel failure, disconnect */
- l2cap_send_disconn_req(chan, ECONNRESET);
- return;
- }
-
- switch (chan->move_role) {
- case L2CAP_MOVE_ROLE_RESPONDER:
- l2cap_move_done(chan);
- l2cap_send_move_chan_rsp(chan, L2CAP_MR_NOT_SUPP);
- break;
- case L2CAP_MOVE_ROLE_INITIATOR:
- if (chan->move_state == L2CAP_MOVE_WAIT_LOGICAL_COMP ||
- chan->move_state == L2CAP_MOVE_WAIT_LOGICAL_CFM) {
- /* Remote has only sent pending or
- * success responses, clean up
- */
- l2cap_move_done(chan);
- }
-
- /* Other amp move states imply that the move
- * has already aborted
- */
- l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED);
- break;
- }
-}
-
-static void l2cap_logical_finish_create(struct l2cap_chan *chan,
- struct hci_chan *hchan)
-{
- struct l2cap_conf_rsp rsp;
-
- chan->hs_hchan = hchan;
- chan->hs_hcon->l2cap_data = chan->conn;
-
- l2cap_send_efs_conf_rsp(chan, &rsp, chan->ident, 0);
-
- if (test_bit(CONF_INPUT_DONE, &chan->conf_state)) {
- int err;
-
- set_default_fcs(chan);
-
- err = l2cap_ertm_init(chan);
- if (err < 0)
- l2cap_send_disconn_req(chan, -err);
- else
- l2cap_chan_ready(chan);
- }
-}
-
-static void l2cap_logical_finish_move(struct l2cap_chan *chan,
- struct hci_chan *hchan)
-{
- chan->hs_hcon = hchan->conn;
- chan->hs_hcon->l2cap_data = chan->conn;
-
- BT_DBG("move_state %d", chan->move_state);
-
- switch (chan->move_state) {
- case L2CAP_MOVE_WAIT_LOGICAL_COMP:
- /* Move confirm will be sent after a success
- * response is received
- */
- chan->move_state = L2CAP_MOVE_WAIT_RSP_SUCCESS;
- break;
- case L2CAP_MOVE_WAIT_LOGICAL_CFM:
- if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
- chan->move_state = L2CAP_MOVE_WAIT_LOCAL_BUSY;
- } else if (chan->move_role == L2CAP_MOVE_ROLE_INITIATOR) {
- chan->move_state = L2CAP_MOVE_WAIT_CONFIRM_RSP;
- l2cap_send_move_chan_cfm(chan, L2CAP_MC_CONFIRMED);
- } else if (chan->move_role == L2CAP_MOVE_ROLE_RESPONDER) {
- chan->move_state = L2CAP_MOVE_WAIT_CONFIRM;
- l2cap_send_move_chan_rsp(chan, L2CAP_MR_SUCCESS);
- }
- break;
- default:
- /* Move was not in expected state, free the channel */
- __release_logical_link(chan);
-
- chan->move_state = L2CAP_MOVE_STABLE;
- }
-}
-
-/* Call with chan locked */
-void l2cap_logical_cfm(struct l2cap_chan *chan, struct hci_chan *hchan,
- u8 status)
-{
- BT_DBG("chan %p, hchan %p, status %d", chan, hchan, status);
-
- if (status) {
- l2cap_logical_fail(chan);
- __release_logical_link(chan);
- return;
- }
-
- if (chan->state != BT_CONNECTED) {
- /* Ignore logical link if channel is on BR/EDR */
- if (chan->local_amp_id != AMP_ID_BREDR)
- l2cap_logical_finish_create(chan, hchan);
- } else {
- l2cap_logical_finish_move(chan, hchan);
- }
-}
-
-void l2cap_move_start(struct l2cap_chan *chan)
-{
- BT_DBG("chan %p", chan);
-
- if (chan->local_amp_id == AMP_ID_BREDR) {
- if (chan->chan_policy != BT_CHANNEL_POLICY_AMP_PREFERRED)
- return;
- chan->move_role = L2CAP_MOVE_ROLE_INITIATOR;
- chan->move_state = L2CAP_MOVE_WAIT_PREPARE;
- /* Placeholder - start physical link setup */
- } else {
- chan->move_role = L2CAP_MOVE_ROLE_INITIATOR;
- chan->move_state = L2CAP_MOVE_WAIT_RSP_SUCCESS;
- chan->move_id = 0;
- l2cap_move_setup(chan);
- l2cap_send_move_chan_req(chan, 0);
- }
-}
-
-static void l2cap_do_create(struct l2cap_chan *chan, int result,
- u8 local_amp_id, u8 remote_amp_id)
-{
- BT_DBG("chan %p state %s %u -> %u", chan, state_to_string(chan->state),
- local_amp_id, remote_amp_id);
-
- chan->fcs = L2CAP_FCS_NONE;
-
- /* Outgoing channel on AMP */
- if (chan->state == BT_CONNECT) {
- if (result == L2CAP_CR_SUCCESS) {
- chan->local_amp_id = local_amp_id;
- l2cap_send_create_chan_req(chan, remote_amp_id);
- } else {
- /* Revert to BR/EDR connect */
- l2cap_send_conn_req(chan);
- }
-
- return;
- }
-
- /* Incoming channel on AMP */
- if (__l2cap_no_conn_pending(chan)) {
- struct l2cap_conn_rsp rsp;
- char buf[128];
- rsp.scid = cpu_to_le16(chan->dcid);
- rsp.dcid = cpu_to_le16(chan->scid);
-
- if (result == L2CAP_CR_SUCCESS) {
- /* Send successful response */
- rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
- rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
- } else {
- /* Send negative response */
- rsp.result = cpu_to_le16(L2CAP_CR_NO_MEM);
- rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
- }
-
- l2cap_send_cmd(chan->conn, chan->ident, L2CAP_CREATE_CHAN_RSP,
- sizeof(rsp), &rsp);
-
- if (result == L2CAP_CR_SUCCESS) {
- l2cap_state_change(chan, BT_CONFIG);
- set_bit(CONF_REQ_SENT, &chan->conf_state);
- l2cap_send_cmd(chan->conn, l2cap_get_ident(chan->conn),
- L2CAP_CONF_REQ,
- l2cap_build_conf_req(chan, buf, sizeof(buf)), buf);
- chan->num_conf_req++;
- }
- }
-}
-
-static void l2cap_do_move_initiate(struct l2cap_chan *chan, u8 local_amp_id,
- u8 remote_amp_id)
-{
- l2cap_move_setup(chan);
- chan->move_id = local_amp_id;
- chan->move_state = L2CAP_MOVE_WAIT_RSP;
-
- l2cap_send_move_chan_req(chan, remote_amp_id);
-}
-
-static void l2cap_do_move_respond(struct l2cap_chan *chan, int result)
-{
- struct hci_chan *hchan = NULL;
-
- /* Placeholder - get hci_chan for logical link */
-
- if (hchan) {
- if (hchan->state == BT_CONNECTED) {
- /* Logical link is ready to go */
- chan->hs_hcon = hchan->conn;
- chan->hs_hcon->l2cap_data = chan->conn;
- chan->move_state = L2CAP_MOVE_WAIT_CONFIRM;
- l2cap_send_move_chan_rsp(chan, L2CAP_MR_SUCCESS);
-
- l2cap_logical_cfm(chan, hchan, L2CAP_MR_SUCCESS);
- } else {
- /* Wait for logical link to be ready */
- chan->move_state = L2CAP_MOVE_WAIT_LOGICAL_CFM;
- }
- } else {
- /* Logical link not available */
- l2cap_send_move_chan_rsp(chan, L2CAP_MR_NOT_ALLOWED);
- }
-}
-
-static void l2cap_do_move_cancel(struct l2cap_chan *chan, int result)
-{
- if (chan->move_role == L2CAP_MOVE_ROLE_RESPONDER) {
- u8 rsp_result;
- if (result == -EINVAL)
- rsp_result = L2CAP_MR_BAD_ID;
- else
- rsp_result = L2CAP_MR_NOT_ALLOWED;
-
- l2cap_send_move_chan_rsp(chan, rsp_result);
- }
-
- chan->move_role = L2CAP_MOVE_ROLE_NONE;
- chan->move_state = L2CAP_MOVE_STABLE;
-
- /* Restart data transmission */
- l2cap_ertm_send(chan);
-}
-
-/* Invoke with locked chan */
-void __l2cap_physical_cfm(struct l2cap_chan *chan, int result)
-{
- u8 local_amp_id = chan->local_amp_id;
- u8 remote_amp_id = chan->remote_amp_id;
-
- BT_DBG("chan %p, result %d, local_amp_id %d, remote_amp_id %d",
- chan, result, local_amp_id, remote_amp_id);
-
- if (chan->state == BT_DISCONN || chan->state == BT_CLOSED) {
- l2cap_chan_unlock(chan);
- return;
- }
-
- if (chan->state != BT_CONNECTED) {
- l2cap_do_create(chan, result, local_amp_id, remote_amp_id);
- } else if (result != L2CAP_MR_SUCCESS) {
- l2cap_do_move_cancel(chan, result);
- } else {
- switch (chan->move_role) {
- case L2CAP_MOVE_ROLE_INITIATOR:
- l2cap_do_move_initiate(chan, local_amp_id,
- remote_amp_id);
- break;
- case L2CAP_MOVE_ROLE_RESPONDER:
- l2cap_do_move_respond(chan, result);
- break;
- default:
- l2cap_do_move_cancel(chan, result);
- break;
- }
- }
-}
-
-static inline int l2cap_move_channel_req(struct l2cap_conn *conn,
- struct l2cap_cmd_hdr *cmd,
- u16 cmd_len, void *data)
-{
- struct l2cap_move_chan_req *req = data;
- struct l2cap_move_chan_rsp rsp;
- struct l2cap_chan *chan;
- u16 icid = 0;
- u16 result = L2CAP_MR_NOT_ALLOWED;
-
- if (cmd_len != sizeof(*req))
- return -EPROTO;
-
- icid = le16_to_cpu(req->icid);
-
- BT_DBG("icid 0x%4.4x, dest_amp_id %d", icid, req->dest_amp_id);
-
- if (!(conn->local_fixed_chan & L2CAP_FC_A2MP))
- return -EINVAL;
-
- chan = l2cap_get_chan_by_dcid(conn, icid);
- if (!chan) {
- rsp.icid = cpu_to_le16(icid);
- rsp.result = cpu_to_le16(L2CAP_MR_NOT_ALLOWED);
- l2cap_send_cmd(conn, cmd->ident, L2CAP_MOVE_CHAN_RSP,
- sizeof(rsp), &rsp);
- return 0;
- }
-
- chan->ident = cmd->ident;
-
- if (chan->scid < L2CAP_CID_DYN_START ||
- chan->chan_policy == BT_CHANNEL_POLICY_BREDR_ONLY ||
- (chan->mode != L2CAP_MODE_ERTM &&
- chan->mode != L2CAP_MODE_STREAMING)) {
- result = L2CAP_MR_NOT_ALLOWED;
- goto send_move_response;
- }
-
- if (chan->local_amp_id == req->dest_amp_id) {
- result = L2CAP_MR_SAME_ID;
- goto send_move_response;
- }
-
- if (req->dest_amp_id != AMP_ID_BREDR) {
- struct hci_dev *hdev;
- hdev = hci_dev_get(req->dest_amp_id);
- if (!hdev || hdev->dev_type != HCI_AMP ||
- !test_bit(HCI_UP, &hdev->flags)) {
- if (hdev)
- hci_dev_put(hdev);
-
- result = L2CAP_MR_BAD_ID;
- goto send_move_response;
- }
- hci_dev_put(hdev);
- }
-
- /* Detect a move collision. Only send a collision response
- * if this side has "lost", otherwise proceed with the move.
- * The winner has the larger bd_addr.
- */
- if ((__chan_is_moving(chan) ||
- chan->move_role != L2CAP_MOVE_ROLE_NONE) &&
- bacmp(&conn->hcon->src, &conn->hcon->dst) > 0) {
- result = L2CAP_MR_COLLISION;
- goto send_move_response;
- }
-
- chan->move_role = L2CAP_MOVE_ROLE_RESPONDER;
- l2cap_move_setup(chan);
- chan->move_id = req->dest_amp_id;
- icid = chan->dcid;
-
- if (req->dest_amp_id == AMP_ID_BREDR) {
- /* Moving to BR/EDR */
- if (test_bit(CONN_LOCAL_BUSY, &chan->conn_state)) {
- chan->move_state = L2CAP_MOVE_WAIT_LOCAL_BUSY;
- result = L2CAP_MR_PEND;
- } else {
- chan->move_state = L2CAP_MOVE_WAIT_CONFIRM;
- result = L2CAP_MR_SUCCESS;
- }
- } else {
- chan->move_state = L2CAP_MOVE_WAIT_PREPARE;
- /* Placeholder - uncomment when amp functions are available */
- /*amp_accept_physical(chan, req->dest_amp_id);*/
- result = L2CAP_MR_PEND;
- }
-
-send_move_response:
- l2cap_send_move_chan_rsp(chan, result);
-
- l2cap_chan_unlock(chan);
-
- return 0;
-}
-
-static void l2cap_move_continue(struct l2cap_conn *conn, u16 icid, u16 result)
-{
- struct l2cap_chan *chan;
- struct hci_chan *hchan = NULL;
-
- chan = l2cap_get_chan_by_scid(conn, icid);
- if (!chan) {
- l2cap_send_move_chan_cfm_icid(conn, icid);
- return;
- }
-
- __clear_chan_timer(chan);
- if (result == L2CAP_MR_PEND)
- __set_chan_timer(chan, L2CAP_MOVE_ERTX_TIMEOUT);
-
- switch (chan->move_state) {
- case L2CAP_MOVE_WAIT_LOGICAL_COMP:
- /* Move confirm will be sent when logical link
- * is complete.
- */
- chan->move_state = L2CAP_MOVE_WAIT_LOGICAL_CFM;
- break;
- case L2CAP_MOVE_WAIT_RSP_SUCCESS:
- if (result == L2CAP_MR_PEND) {
- break;
- } else if (test_bit(CONN_LOCAL_BUSY,
- &chan->conn_state)) {
- chan->move_state = L2CAP_MOVE_WAIT_LOCAL_BUSY;
- } else {
- /* Logical link is up or moving to BR/EDR,
- * proceed with move
- */
- chan->move_state = L2CAP_MOVE_WAIT_CONFIRM_RSP;
- l2cap_send_move_chan_cfm(chan, L2CAP_MC_CONFIRMED);
- }
- break;
- case L2CAP_MOVE_WAIT_RSP:
- /* Moving to AMP */
- if (result == L2CAP_MR_SUCCESS) {
- /* Remote is ready, send confirm immediately
- * after logical link is ready
- */
- chan->move_state = L2CAP_MOVE_WAIT_LOGICAL_CFM;
- } else {
- /* Both logical link and move success
- * are required to confirm
- */
- chan->move_state = L2CAP_MOVE_WAIT_LOGICAL_COMP;
- }
-
- /* Placeholder - get hci_chan for logical link */
- if (!hchan) {
- /* Logical link not available */
- l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED);
- break;
- }
-
- /* If the logical link is not yet connected, do not
- * send confirmation.
- */
- if (hchan->state != BT_CONNECTED)
- break;
-
- /* Logical link is already ready to go */
-
- chan->hs_hcon = hchan->conn;
- chan->hs_hcon->l2cap_data = chan->conn;
-
- if (result == L2CAP_MR_SUCCESS) {
- /* Can confirm now */
- l2cap_send_move_chan_cfm(chan, L2CAP_MC_CONFIRMED);
- } else {
- /* Now only need move success
- * to confirm
- */
- chan->move_state = L2CAP_MOVE_WAIT_RSP_SUCCESS;
- }
-
- l2cap_logical_cfm(chan, hchan, L2CAP_MR_SUCCESS);
- break;
- default:
- /* Any other amp move state means the move failed. */
- chan->move_id = chan->local_amp_id;
- l2cap_move_done(chan);
- l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED);
- }
-
- l2cap_chan_unlock(chan);
-}
-
-static void l2cap_move_fail(struct l2cap_conn *conn, u8 ident, u16 icid,
- u16 result)
-{
- struct l2cap_chan *chan;
-
- chan = l2cap_get_chan_by_ident(conn, ident);
- if (!chan) {
- /* Could not locate channel, icid is best guess */
- l2cap_send_move_chan_cfm_icid(conn, icid);
- return;
- }
-
- __clear_chan_timer(chan);
-
- if (chan->move_role == L2CAP_MOVE_ROLE_INITIATOR) {
- if (result == L2CAP_MR_COLLISION) {
- chan->move_role = L2CAP_MOVE_ROLE_RESPONDER;
- } else {
- /* Cleanup - cancel move */
- chan->move_id = chan->local_amp_id;
- l2cap_move_done(chan);
- }
- }
-
- l2cap_send_move_chan_cfm(chan, L2CAP_MC_UNCONFIRMED);
-
- l2cap_chan_unlock(chan);
-}
-
-static int l2cap_move_channel_rsp(struct l2cap_conn *conn,
- struct l2cap_cmd_hdr *cmd,
- u16 cmd_len, void *data)
-{
- struct l2cap_move_chan_rsp *rsp = data;
- u16 icid, result;
-
- if (cmd_len != sizeof(*rsp))
- return -EPROTO;
-
- icid = le16_to_cpu(rsp->icid);
- result = le16_to_cpu(rsp->result);
-
- BT_DBG("icid 0x%4.4x, result 0x%4.4x", icid, result);
-
- if (result == L2CAP_MR_SUCCESS || result == L2CAP_MR_PEND)
- l2cap_move_continue(conn, icid, result);
- else
- l2cap_move_fail(conn, cmd->ident, icid, result);
-
- return 0;
-}
-
-static int l2cap_move_channel_confirm(struct l2cap_conn *conn,
- struct l2cap_cmd_hdr *cmd,
- u16 cmd_len, void *data)
-{
- struct l2cap_move_chan_cfm *cfm = data;
- struct l2cap_chan *chan;
- u16 icid, result;
-
- if (cmd_len != sizeof(*cfm))
- return -EPROTO;
-
- icid = le16_to_cpu(cfm->icid);
- result = le16_to_cpu(cfm->result);
-
- BT_DBG("icid 0x%4.4x, result 0x%4.4x", icid, result);
-
- chan = l2cap_get_chan_by_dcid(conn, icid);
- if (!chan) {
- /* Spec requires a response even if the icid was not found */
- l2cap_send_move_chan_cfm_rsp(conn, cmd->ident, icid);
- return 0;
- }
-
- if (chan->move_state == L2CAP_MOVE_WAIT_CONFIRM) {
- if (result == L2CAP_MC_CONFIRMED) {
- chan->local_amp_id = chan->move_id;
- if (chan->local_amp_id == AMP_ID_BREDR)
- __release_logical_link(chan);
- } else {
- chan->move_id = chan->local_amp_id;
- }
-
- l2cap_move_done(chan);
- }
-
- l2cap_send_move_chan_cfm_rsp(conn, cmd->ident, icid);
-
- l2cap_chan_unlock(chan);
-
- return 0;
-}
-
-static inline int l2cap_move_channel_confirm_rsp(struct l2cap_conn *conn,
- struct l2cap_cmd_hdr *cmd,
- u16 cmd_len, void *data)
-{
- struct l2cap_move_chan_cfm_rsp *rsp = data;
- struct l2cap_chan *chan;
- u16 icid;
-
- if (cmd_len != sizeof(*rsp))
- return -EPROTO;
-
- icid = le16_to_cpu(rsp->icid);
-
- BT_DBG("icid 0x%4.4x", icid);
-
- chan = l2cap_get_chan_by_scid(conn, icid);
- if (!chan)
- return 0;
-
- __clear_chan_timer(chan);
-
- if (chan->move_state == L2CAP_MOVE_WAIT_CONFIRM_RSP) {
- chan->local_amp_id = chan->move_id;
-
- if (chan->local_amp_id == AMP_ID_BREDR && chan->hs_hchan)
- __release_logical_link(chan);
-
- l2cap_move_done(chan);
- }
-
- l2cap_chan_unlock(chan);
-
- return 0;
-}
-
static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn,
struct l2cap_cmd_hdr *cmd,
u16 cmd_len, u8 *data)
@@ -5279,7 +4721,7 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
credits = __le16_to_cpu(rsp->credits);
result = __le16_to_cpu(rsp->result);
- if (result == L2CAP_CR_SUCCESS && (mtu < 23 || mps < 23 ||
+ if (result == L2CAP_CR_LE_SUCCESS && (mtu < 23 || mps < 23 ||
dcid < L2CAP_CID_DYN_START ||
dcid > L2CAP_CID_LE_DYN_END))
return -EPROTO;
@@ -5287,20 +4729,16 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
BT_DBG("dcid 0x%4.4x mtu %u mps %u credits %u result 0x%2.2x",
dcid, mtu, mps, credits, result);
- mutex_lock(&conn->chan_lock);
-
chan = __l2cap_get_chan_by_ident(conn, cmd->ident);
- if (!chan) {
- err = -EBADSLT;
- goto unlock;
- }
+ if (!chan)
+ return -EBADSLT;
err = 0;
l2cap_chan_lock(chan);
switch (result) {
- case L2CAP_CR_SUCCESS:
+ case L2CAP_CR_LE_SUCCESS:
if (__l2cap_get_chan_by_dcid(conn, dcid)) {
err = -EBADSLT;
break;
@@ -5314,8 +4752,8 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
l2cap_chan_ready(chan);
break;
- case L2CAP_CR_AUTHENTICATION:
- case L2CAP_CR_ENCRYPTION:
+ case L2CAP_CR_LE_AUTHENTICATION:
+ case L2CAP_CR_LE_ENCRYPTION:
/* If we already have MITM protection we can't do
* anything.
*/
@@ -5341,9 +4779,6 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
l2cap_chan_unlock(chan);
-unlock:
- mutex_unlock(&conn->chan_lock);
-
return err;
}
@@ -5363,7 +4798,6 @@ static inline int l2cap_bredr_sig_cmd(struct l2cap_conn *conn,
break;
case L2CAP_CONN_RSP:
- case L2CAP_CREATE_CHAN_RSP:
l2cap_connect_create_rsp(conn, cmd, cmd_len, data);
break;
@@ -5398,26 +4832,6 @@ static inline int l2cap_bredr_sig_cmd(struct l2cap_conn *conn,
l2cap_information_rsp(conn, cmd, cmd_len, data);
break;
- case L2CAP_CREATE_CHAN_REQ:
- err = l2cap_create_channel_req(conn, cmd, cmd_len, data);
- break;
-
- case L2CAP_MOVE_CHAN_REQ:
- err = l2cap_move_channel_req(conn, cmd, cmd_len, data);
- break;
-
- case L2CAP_MOVE_CHAN_RSP:
- l2cap_move_channel_rsp(conn, cmd, cmd_len, data);
- break;
-
- case L2CAP_MOVE_CHAN_CFM:
- err = l2cap_move_channel_confirm(conn, cmd, cmd_len, data);
- break;
-
- case L2CAP_MOVE_CHAN_CFM_RSP:
- l2cap_move_channel_confirm_rsp(conn, cmd, cmd_len, data);
- break;
-
default:
BT_ERR("Unknown BR/EDR signaling command 0x%2.2x", cmd->code);
err = -EINVAL;
@@ -5454,47 +4868,58 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
BT_DBG("psm 0x%2.2x scid 0x%4.4x mtu %u mps %u", __le16_to_cpu(psm),
scid, mtu, mps);
+ /* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 3, Part A
+ * page 1059:
+ *
+ * Valid range: 0x0001-0x00ff
+ *
+ * Table 4.15: L2CAP_LE_CREDIT_BASED_CONNECTION_REQ SPSM ranges
+ */
+ if (!psm || __le16_to_cpu(psm) > L2CAP_PSM_LE_DYN_END) {
+ result = L2CAP_CR_LE_BAD_PSM;
+ chan = NULL;
+ goto response;
+ }
+
/* Check if we have socket listening on psm */
pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src,
&conn->hcon->dst, LE_LINK);
if (!pchan) {
- result = L2CAP_CR_BAD_PSM;
+ result = L2CAP_CR_LE_BAD_PSM;
chan = NULL;
goto response;
}
- mutex_lock(&conn->chan_lock);
l2cap_chan_lock(pchan);
if (!smp_sufficient_security(conn->hcon, pchan->sec_level,
SMP_ALLOW_STK)) {
- result = L2CAP_CR_AUTHENTICATION;
+ result = pchan->sec_level == BT_SECURITY_MEDIUM ?
+ L2CAP_CR_LE_ENCRYPTION : L2CAP_CR_LE_AUTHENTICATION;
chan = NULL;
goto response_unlock;
}
/* Check for valid dynamic CID range */
if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) {
- result = L2CAP_CR_INVALID_SCID;
+ result = L2CAP_CR_LE_INVALID_SCID;
chan = NULL;
goto response_unlock;
}
/* Check if we already have channel with that dcid */
if (__l2cap_get_chan_by_dcid(conn, scid)) {
- result = L2CAP_CR_SCID_IN_USE;
+ result = L2CAP_CR_LE_SCID_IN_USE;
chan = NULL;
goto response_unlock;
}
chan = pchan->ops->new_connection(pchan);
if (!chan) {
- result = L2CAP_CR_NO_MEM;
+ result = L2CAP_CR_LE_NO_MEM;
goto response_unlock;
}
- l2cap_le_flowctl_init(chan);
-
bacpy(&chan->src, &conn->hcon->src);
bacpy(&chan->dst, &conn->hcon->dst);
chan->src_type = bdaddr_src_type(conn->hcon);
@@ -5503,9 +4928,11 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
chan->dcid = scid;
chan->omtu = mtu;
chan->remote_mps = mps;
- chan->tx_credits = __le16_to_cpu(req->credits);
__l2cap_chan_add(conn, chan);
+
+ l2cap_le_flowctl_init(chan, __le16_to_cpu(req->credits));
+
dcid = chan->scid;
credits = chan->rx_credits;
@@ -5524,12 +4951,11 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
chan->ops->defer(chan);
} else {
l2cap_chan_ready(chan);
- result = L2CAP_CR_SUCCESS;
+ result = L2CAP_CR_LE_SUCCESS;
}
response_unlock:
l2cap_chan_unlock(pchan);
- mutex_unlock(&conn->chan_lock);
l2cap_chan_put(pchan);
if (result == L2CAP_CR_PEND)
@@ -5578,12 +5004,11 @@ static inline int l2cap_le_credits(struct l2cap_conn *conn,
if (credits > max_credits) {
BT_ERR("LE credits overflow");
l2cap_send_disconn_req(chan, ECONNRESET);
- l2cap_chan_unlock(chan);
/* Return 0 so that we don't trigger an unnecessary
* command reject packet.
*/
- return 0;
+ goto unlock;
}
chan->tx_credits += credits;
@@ -5594,7 +5019,369 @@ static inline int l2cap_le_credits(struct l2cap_conn *conn,
if (chan->tx_credits)
chan->ops->resume(chan);
+unlock:
l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+
+ return 0;
+}
+
+static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn,
+				       struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+				       u8 *data)
+{
+	struct l2cap_ecred_conn_req *req = (void *) data;
+	DEFINE_RAW_FLEX(struct l2cap_ecred_conn_rsp, pdu, dcid, L2CAP_ECRED_MAX_CID);
+	struct l2cap_chan *chan, *pchan;
+	u16 mtu, mps;
+	__le16 psm;
+	u8 result, len = 0; /* len: bytes of dcid[] appended to the response */
+	int i, num_scid;
+	bool defer = false;
+
+	if (!enable_ecred)
+		return -EINVAL;
+
+	if (cmd_len < sizeof(*req) || (cmd_len - sizeof(*req)) % sizeof(u16)) {
+		result = L2CAP_CR_LE_INVALID_PARAMS;
+		goto response;
+	}
+
+	cmd_len -= sizeof(*req);
+	num_scid = cmd_len / sizeof(u16);
+
+	if (num_scid > L2CAP_ECRED_MAX_CID) {
+		result = L2CAP_CR_LE_INVALID_PARAMS;
+		goto response;
+	}
+
+	mtu  = __le16_to_cpu(req->mtu);
+	mps  = __le16_to_cpu(req->mps);
+
+	if (mtu < L2CAP_ECRED_MIN_MTU || mps < L2CAP_ECRED_MIN_MPS) {
+		result = L2CAP_CR_LE_UNACCEPT_PARAMS;
+		goto response;
+	}
+
+	psm  = req->psm;
+
+	/* BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 3, Part A
+	 * page 1059:
+	 *
+	 * Valid range: 0x0001-0x00ff
+	 *
+	 * Table 4.15: L2CAP_LE_CREDIT_BASED_CONNECTION_REQ SPSM ranges
+	 */
+	if (!psm || __le16_to_cpu(psm) > L2CAP_PSM_LE_DYN_END) {
+		result = L2CAP_CR_LE_BAD_PSM;
+		goto response;
+	}
+
+	BT_DBG("psm 0x%2.2x mtu %u mps %u", __le16_to_cpu(psm), mtu, mps);
+
+	memset(pdu, 0, sizeof(*pdu));
+
+	/* Look up a BT_LISTEN channel for this PSM on the LE link */
+	pchan = l2cap_global_chan_by_psm(BT_LISTEN, psm, &conn->hcon->src,
+					 &conn->hcon->dst, LE_LINK);
+	if (!pchan) {
+		result = L2CAP_CR_LE_BAD_PSM;
+		goto response;
+	}
+
+	l2cap_chan_lock(pchan);
+
+	if (!smp_sufficient_security(conn->hcon, pchan->sec_level,
+				     SMP_ALLOW_STK)) {
+		result = L2CAP_CR_LE_AUTHENTICATION;
+		goto unlock;
+	}
+
+	result = L2CAP_CR_LE_SUCCESS;
+
+	for (i = 0; i < num_scid; i++) {
+		u16 scid = __le16_to_cpu(req->scid[i]);
+
+		BT_DBG("scid[%d] 0x%4.4x", i, scid);
+
+		pdu->dcid[i] = 0x0000; /* stays 0 if this SCID is refused below */
+		len += sizeof(*pdu->dcid);
+
+		/* Check for valid dynamic CID range */
+		if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) {
+			result = L2CAP_CR_LE_INVALID_SCID;
+			continue;
+		}
+
+		/* Refuse an SCID already used as a dcid on this connection */
+		if (__l2cap_get_chan_by_dcid(conn, scid)) {
+			result = L2CAP_CR_LE_SCID_IN_USE;
+			continue;
+		}
+
+		chan = pchan->ops->new_connection(pchan);
+		if (!chan) {
+			result = L2CAP_CR_LE_NO_MEM;
+			continue;
+		}
+
+		bacpy(&chan->src, &conn->hcon->src);
+		bacpy(&chan->dst, &conn->hcon->dst);
+		chan->src_type = bdaddr_src_type(conn->hcon);
+		chan->dst_type = bdaddr_dst_type(conn->hcon);
+		chan->psm  = psm;
+		chan->dcid = scid;
+		chan->omtu = mtu;
+		chan->remote_mps = mps;
+
+		__l2cap_chan_add(conn, chan);
+
+		l2cap_ecred_init(chan, __le16_to_cpu(req->credits));
+
+		/* Fill the shared mtu/mps/credits fields while credits is 0 */
+		if (!pdu->credits) {
+			pdu->mtu = cpu_to_le16(chan->imtu);
+			pdu->mps = cpu_to_le16(chan->mps);
+			pdu->credits = cpu_to_le16(chan->rx_credits);
+		}
+
+		pdu->dcid[i] = cpu_to_le16(chan->scid);
+
+		__set_chan_timer(chan, chan->ops->get_sndtimeo(chan));
+
+		chan->ident = cmd->ident;
+		chan->mode = L2CAP_MODE_EXT_FLOWCTL;
+
+		if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) {
+			l2cap_state_change(chan, BT_CONNECT2);
+			defer = true;
+			chan->ops->defer(chan);
+		} else {
+			l2cap_chan_ready(chan);
+		}
+	}
+
+unlock:
+	l2cap_chan_unlock(pchan);
+	l2cap_chan_put(pchan);
+
+response:
+	pdu->result = cpu_to_le16(result);
+
+	if (defer)
+		return 0; /* a channel deferred setup: no response sent here */
+
+	l2cap_send_cmd(conn, cmd->ident, L2CAP_ECRED_CONN_RSP,
+		       sizeof(*pdu) + len, pdu);
+
+	return 0;
+}
+
+static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn,
+				       struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+				       u8 *data)
+{
+	struct l2cap_ecred_conn_rsp *rsp = (void *) data;
+	struct hci_conn *hcon = conn->hcon;
+	u16 mtu, mps, credits, result;
+	struct l2cap_chan *chan, *tmp;
+	int err = 0, sec_level;
+	int i = 0; /* index of the next unread entry in rsp->dcid[] */
+
+	if (cmd_len < sizeof(*rsp))
+		return -EPROTO;
+
+	mtu     = __le16_to_cpu(rsp->mtu);
+	mps     = __le16_to_cpu(rsp->mps);
+	credits = __le16_to_cpu(rsp->credits);
+	result  = __le16_to_cpu(rsp->result);
+
+	BT_DBG("mtu %u mps %u credits %u result 0x%4.4x", mtu, mps, credits,
+	       result);
+
+	cmd_len -= sizeof(*rsp); /* from here on: bytes of dcid[] left */
+
+	list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) {
+		u16 dcid;
+
+		if (chan->ident != cmd->ident ||
+		    chan->mode != L2CAP_MODE_EXT_FLOWCTL ||
+		    chan->state == BT_CONNECTED)
+			continue;
+
+		l2cap_chan_lock(chan);
+
+		/* Check that there is a dcid for each pending channel */
+		if (cmd_len < sizeof(dcid)) {
+			l2cap_chan_del(chan, ECONNREFUSED);
+			l2cap_chan_unlock(chan);
+			continue;
+		}
+
+		dcid = __le16_to_cpu(rsp->dcid[i++]);
+		cmd_len -= sizeof(u16);
+
+		BT_DBG("dcid[%d] 0x%4.4x", i - 1, dcid); /* i already advanced */
+
+		/* Check if dcid is already in use */
+		if (dcid && __l2cap_get_chan_by_dcid(conn, dcid)) {
+			/* If a device receives a
+			 * L2CAP_CREDIT_BASED_CONNECTION_RSP packet with an
+			 * already-assigned Destination CID, then both the
+			 * original channel and the new channel shall be
+			 * immediately discarded and not used.
+			 */
+			l2cap_chan_del(chan, ECONNREFUSED);
+			l2cap_chan_unlock(chan);
+			chan = __l2cap_get_chan_by_dcid(conn, dcid);
+			l2cap_chan_lock(chan);
+			l2cap_chan_del(chan, ECONNRESET);
+			l2cap_chan_unlock(chan);
+			continue;
+		}
+
+		switch (result) {
+		case L2CAP_CR_LE_AUTHENTICATION:
+		case L2CAP_CR_LE_ENCRYPTION:
+			/* If we already have MITM protection we can't do
+			 * anything.
+			 */
+			if (hcon->sec_level > BT_SECURITY_MEDIUM) {
+				l2cap_chan_del(chan, ECONNREFUSED);
+				break;
+			}
+
+			sec_level = hcon->sec_level + 1;
+			if (chan->sec_level < sec_level)
+				chan->sec_level = sec_level;
+
+			/* We'll need to send a new Connect Request */
+			clear_bit(FLAG_ECRED_CONN_REQ_SENT, &chan->flags);
+
+			smp_conn_security(hcon, chan->sec_level);
+			break;
+
+		case L2CAP_CR_LE_BAD_PSM:
+			l2cap_chan_del(chan, ECONNREFUSED);
+			break;
+
+		default:
+			/* A zero dcid means this channel was refused */
+			if (!dcid) {
+				l2cap_chan_del(chan, ECONNREFUSED);
+				break;
+			}
+
+			chan->ident = 0;
+			chan->dcid = dcid;
+			chan->omtu = mtu;
+			chan->remote_mps = mps;
+			chan->tx_credits = credits;
+			l2cap_chan_ready(chan);
+			break;
+		}
+
+		l2cap_chan_unlock(chan);
+	}
+
+	return err;
+}
+
+static inline int l2cap_ecred_reconf_req(struct l2cap_conn *conn,
+					 struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+					 u8 *data)
+{
+	struct l2cap_ecred_reconf_req *req = (void *) data;
+	struct l2cap_ecred_reconf_rsp rsp;
+	u16 mtu, mps, result;
+	struct l2cap_chan *chan;
+	int i, num_scid;
+
+	if (!enable_ecred)
+		return -EINVAL;
+	/* Bytes after the fixed header must be a whole number of u16 SCIDs */
+	if (cmd_len < sizeof(*req) || (cmd_len - sizeof(*req)) % sizeof(u16)) {
+		result = L2CAP_CR_LE_INVALID_PARAMS;
+		goto respond;
+	}
+
+	mtu = __le16_to_cpu(req->mtu);
+	mps = __le16_to_cpu(req->mps);
+
+	BT_DBG("mtu %u mps %u", mtu, mps);
+
+	if (mtu < L2CAP_ECRED_MIN_MTU) {
+		result = L2CAP_RECONF_INVALID_MTU;
+		goto respond;
+	}
+
+	if (mps < L2CAP_ECRED_MIN_MPS) {
+		result = L2CAP_RECONF_INVALID_MPS;
+		goto respond;
+	}
+
+	cmd_len -= sizeof(*req);
+	num_scid = cmd_len / sizeof(u16);
+	result = L2CAP_RECONF_SUCCESS;
+
+	for (i = 0; i < num_scid; i++) {
+		u16 scid;
+
+		scid = __le16_to_cpu(req->scid[i]);
+		if (!scid)
+			return -EPROTO;
+
+		chan = __l2cap_get_chan_by_dcid(conn, scid);
+		if (!chan)
+			continue;
+
+		/* If the MTU value is decreased for any of the included
+		 * channels, then the receiver shall disconnect all
+		 * included channels.
+		 */
+		if (chan->omtu > mtu) {
+			BT_ERR("chan %p decreased MTU %u -> %u", chan,
+			       chan->omtu, mtu);
+			result = L2CAP_RECONF_INVALID_MTU;
+		}
+
+		chan->omtu = mtu;
+		chan->remote_mps = mps;
+	}
+
+respond:
+	rsp.result = cpu_to_le16(result);
+
+	l2cap_send_cmd(conn, cmd->ident, L2CAP_ECRED_RECONF_RSP, sizeof(rsp),
+		       &rsp);
+
+	return 0;
+}
+
+static inline int l2cap_ecred_reconf_rsp(struct l2cap_conn *conn,
+					 struct l2cap_cmd_hdr *cmd, u16 cmd_len,
+					 u8 *data)
+{
+	struct l2cap_chan *chan, *tmp;
+	struct l2cap_ecred_reconf_rsp *rsp = (void *) data;
+	u16 result;
+
+	if (cmd_len < sizeof(*rsp))
+		return -EPROTO;
+
+	result = __le16_to_cpu(rsp->result);
+
+	BT_DBG("result 0x%4.4x", result);
+
+	if (!result)
+		return 0;
+	/* Non-zero result: delete every channel that carries this ident */
+	list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) {
+		if (chan->ident != cmd->ident)
+			continue;
+
+		l2cap_chan_del(chan, ECONNRESET);
+	}
return 0;
}
@@ -5609,18 +5396,20 @@ static inline int l2cap_le_command_rej(struct l2cap_conn *conn,
if (cmd_len < sizeof(*rej))
return -EPROTO;
- mutex_lock(&conn->chan_lock);
-
chan = __l2cap_get_chan_by_ident(conn, cmd->ident);
if (!chan)
goto done;
+ chan = l2cap_chan_hold_unless_zero(chan);
+ if (!chan)
+ goto done;
+
l2cap_chan_lock(chan);
l2cap_chan_del(chan, ECONNREFUSED);
l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
done:
- mutex_unlock(&conn->chan_lock);
return 0;
}
@@ -5654,6 +5443,22 @@ static inline int l2cap_le_sig_cmd(struct l2cap_conn *conn,
err = l2cap_le_credits(conn, cmd, cmd_len, data);
break;
+ case L2CAP_ECRED_CONN_REQ:
+ err = l2cap_ecred_conn_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_ECRED_CONN_RSP:
+ err = l2cap_ecred_conn_rsp(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_ECRED_RECONF_REQ:
+ err = l2cap_ecred_reconf_req(conn, cmd, cmd_len, data);
+ break;
+
+ case L2CAP_ECRED_RECONF_RSP:
+ err = l2cap_ecred_reconf_rsp(conn, cmd, cmd_len, data);
+ break;
+
case L2CAP_DISCONN_REQ:
err = l2cap_disconnect_req(conn, cmd, cmd_len, data);
break;
@@ -5712,13 +5517,19 @@ drop:
kfree_skb(skb);
}
+/* Reply to the signalling command identified by @ident with an
+ * L2CAP_COMMAND_REJ whose reason is L2CAP_REJ_NOT_UNDERSTOOD.
+ */
+static inline void l2cap_sig_send_rej(struct l2cap_conn *conn, u16 ident)
+{
+	struct l2cap_cmd_rej_unk reject;
+
+	reject.reason = cpu_to_le16(L2CAP_REJ_NOT_UNDERSTOOD);
+
+	l2cap_send_cmd(conn, ident, L2CAP_COMMAND_REJ, sizeof(reject),
+		       &reject);
+}
+
static inline void l2cap_sig_channel(struct l2cap_conn *conn,
struct sk_buff *skb)
{
struct hci_conn *hcon = conn->hcon;
- u8 *data = skb->data;
- int len = skb->len;
- struct l2cap_cmd_hdr cmd;
+ struct l2cap_cmd_hdr *cmd;
int err;
l2cap_raw_recv(conn, skb);
@@ -5726,35 +5537,36 @@ static inline void l2cap_sig_channel(struct l2cap_conn *conn,
if (hcon->type != ACL_LINK)
goto drop;
- while (len >= L2CAP_CMD_HDR_SIZE) {
- u16 cmd_len;
- memcpy(&cmd, data, L2CAP_CMD_HDR_SIZE);
- data += L2CAP_CMD_HDR_SIZE;
- len -= L2CAP_CMD_HDR_SIZE;
+ while (skb->len >= L2CAP_CMD_HDR_SIZE) {
+ u16 len;
+
+ cmd = (void *) skb->data;
+ skb_pull(skb, L2CAP_CMD_HDR_SIZE);
- cmd_len = le16_to_cpu(cmd.len);
+ len = le16_to_cpu(cmd->len);
- BT_DBG("code 0x%2.2x len %d id 0x%2.2x", cmd.code, cmd_len,
- cmd.ident);
+ BT_DBG("code 0x%2.2x len %d id 0x%2.2x", cmd->code, len,
+ cmd->ident);
- if (cmd_len > len || !cmd.ident) {
+ if (len > skb->len || !cmd->ident) {
BT_DBG("corrupted command");
- break;
+ l2cap_sig_send_rej(conn, cmd->ident);
+ skb_pull(skb, len > skb->len ? skb->len : len);
+ continue;
}
- err = l2cap_bredr_sig_cmd(conn, &cmd, cmd_len, data);
+ err = l2cap_bredr_sig_cmd(conn, cmd, len, skb->data);
if (err) {
- struct l2cap_cmd_rej_unk rej;
-
BT_ERR("Wrong link type (%d)", err);
-
- rej.reason = cpu_to_le16(L2CAP_REJ_NOT_UNDERSTOOD);
- l2cap_send_cmd(conn, cmd.ident, L2CAP_COMMAND_REJ,
- sizeof(rej), &rej);
+ l2cap_sig_send_rej(conn, cmd->ident);
}
- data += cmd_len;
- len -= cmd_len;
+ skb_pull(skb, len);
+ }
+
+ if (skb->len > 0) {
+ BT_DBG("corrupted command");
+ l2cap_sig_send_rej(conn, 0);
}
drop:
@@ -6156,6 +5968,7 @@ static int l2cap_rx_state_recv(struct l2cap_chan *chan,
struct l2cap_ctrl *control,
struct sk_buff *skb, u8 event)
{
+ struct l2cap_ctrl local_control;
int err = 0;
bool skb_in_use = false;
@@ -6180,15 +5993,32 @@ static int l2cap_rx_state_recv(struct l2cap_chan *chan,
chan->buffer_seq = chan->expected_tx_seq;
skb_in_use = true;
+ /* l2cap_reassemble_sdu may free skb, hence invalidate
+ * control, so make a copy in advance to use it after
+ * l2cap_reassemble_sdu returns and to avoid the race
+ * condition, for example:
+ *
+ * The current thread calls:
+ * l2cap_reassemble_sdu
+ * chan->ops->recv == l2cap_sock_recv_cb
+ * __sock_queue_rcv_skb
+ * Another thread calls:
+ * bt_sock_recvmsg
+ * skb_recv_datagram
+ * skb_free_datagram
+ * Then the current thread tries to access control, but
+ * it was freed by skb_free_datagram.
+ */
+ local_control = *control;
err = l2cap_reassemble_sdu(chan, skb, control);
if (err)
break;
- if (control->final) {
+ if (local_control.final) {
if (!test_and_clear_bit(CONN_REJ_ACT,
&chan->conn_state)) {
- control->final = 0;
- l2cap_retransmit_all(chan, control);
+ local_control.final = 0;
+ l2cap_retransmit_all(chan, &local_control);
l2cap_ertm_send(chan);
}
}
@@ -6240,8 +6070,8 @@ static int l2cap_rx_state_recv(struct l2cap_chan *chan,
if (control->final) {
clear_bit(CONN_REMOTE_BUSY, &chan->conn_state);
- if (!test_and_clear_bit(CONN_REJ_ACT, &chan->conn_state) &&
- !__chan_is_moving(chan)) {
+ if (!test_and_clear_bit(CONN_REJ_ACT,
+ &chan->conn_state)) {
control->final = 0;
l2cap_retransmit_all(chan, control);
}
@@ -6434,11 +6264,7 @@ static int l2cap_finish_move(struct l2cap_chan *chan)
BT_DBG("chan %p", chan);
chan->rx_state = L2CAP_RX_STATE_RECV;
-
- if (chan->hs_hcon)
- chan->conn->mtu = chan->hs_hcon->hdev->block_mtu;
- else
- chan->conn->mtu = chan->conn->hcon->hdev->acl_mtu;
+ chan->conn->mtu = chan->conn->hcon->mtu;
return l2cap_resegment(chan);
}
@@ -6505,11 +6331,7 @@ static int l2cap_rx_state_wait_f(struct l2cap_chan *chan,
*/
chan->next_tx_seq = control->reqseq;
chan->unacked_frames = 0;
-
- if (chan->hs_hcon)
- chan->conn->mtu = chan->hs_hcon->hdev->block_mtu;
- else
- chan->conn->mtu = chan->conn->hcon->hdev->acl_mtu;
+ chan->conn->mtu = chan->conn->hcon->mtu;
err = l2cap_resegment(chan);
@@ -6568,14 +6390,30 @@ static int l2cap_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
struct sk_buff *skb)
{
+ /* l2cap_reassemble_sdu may free skb, hence invalidate control, so store
+ * the txseq field in advance to use it after l2cap_reassemble_sdu
+ * returns and to avoid the race condition, for example:
+ *
+ * The current thread calls:
+ * l2cap_reassemble_sdu
+ * chan->ops->recv == l2cap_sock_recv_cb
+ * __sock_queue_rcv_skb
+ * Another thread calls:
+ * bt_sock_recvmsg
+ * skb_recv_datagram
+ * skb_free_datagram
+ * Then the current thread tries to access control, but it was freed by
+ * skb_free_datagram.
+ */
+ u16 txseq = control->txseq;
+
BT_DBG("chan %p, control %p, skb %p, state %d", chan, control, skb,
chan->rx_state);
- if (l2cap_classify_txseq(chan, control->txseq) ==
- L2CAP_TXSEQ_EXPECTED) {
+ if (l2cap_classify_txseq(chan, txseq) == L2CAP_TXSEQ_EXPECTED) {
l2cap_pass_to_tx(chan, control);
- BT_DBG("buffer_seq %d->%d", chan->buffer_seq,
+ BT_DBG("buffer_seq %u->%u", chan->buffer_seq,
__next_seq(chan, chan->buffer_seq));
chan->buffer_seq = __next_seq(chan, chan->buffer_seq);
@@ -6595,8 +6433,8 @@ static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
}
}
- chan->last_acked_seq = control->txseq;
- chan->expected_tx_seq = __next_seq(chan, control->txseq);
+ chan->last_acked_seq = txseq;
+ chan->expected_tx_seq = __next_seq(chan, txseq);
return 0;
}
@@ -6630,9 +6468,10 @@ static int l2cap_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
goto drop;
}
- if ((chan->mode == L2CAP_MODE_ERTM ||
- chan->mode == L2CAP_MODE_STREAMING) && sk_filter(chan->data, skb))
- goto drop;
+ if (chan->ops->filter) {
+ if (chan->ops->filter(chan, skb))
+ goto drop;
+ }
if (!control->sframe) {
int err;
@@ -6697,15 +6536,12 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan)
{
struct l2cap_conn *conn = chan->conn;
struct l2cap_le_credits pkt;
- u16 return_credits;
+ u16 return_credits = l2cap_le_rx_credits(chan);
- /* We return more credits to the sender only after the amount of
- * credits falls below half of the initial amount.
- */
- if (chan->rx_credits >= (le_max_credits + 1) / 2)
+ if (chan->rx_credits >= return_credits)
return;
- return_credits = le_max_credits - chan->rx_credits;
+ return_credits -= chan->rx_credits;
BT_DBG("chan %p returning %u credits to sender", chan, return_credits);
@@ -6719,7 +6555,41 @@ static void l2cap_chan_le_send_credits(struct l2cap_chan *chan)
l2cap_send_cmd(conn, chan->ident, L2CAP_LE_CREDITS, sizeof(pkt), &pkt);
}
-static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
+/* Record the number of bytes available for rx on @chan.
+ *
+ * Nothing happens when the value is unchanged. Otherwise the new value is
+ * stored and, if the channel is in BT_CONNECTED state, the credit update
+ * routine is run.
+ */
+void l2cap_chan_rx_avail(struct l2cap_chan *chan, ssize_t rx_avail)
+{
+	if (rx_avail == chan->rx_avail)
+		return;
+
+	BT_DBG("chan %p has %zd bytes avail for rx", chan, rx_avail);
+
+	chan->rx_avail = rx_avail;
+
+	/* Credits are only sent on an established channel */
+	if (chan->state != BT_CONNECTED)
+		return;
+
+	l2cap_chan_le_send_credits(chan);
+}
+
+/* Hand a fully reassembled SDU to the channel's recv callback.
+ *
+ * If recv fails while chan->rx_avail is not -1, a disconnect request with
+ * ECONNRESET is sent and no credits are returned. In every other case the
+ * credit update runs after recv has been called.
+ *
+ * Returns the value returned by chan->ops->recv().
+ */
+static int l2cap_ecred_recv(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+	int ret;
+
+	BT_DBG("SDU reassemble complete: chan %p skb->len %u", chan, skb->len);
+
+	/* Let recv see the SDU before any credits are handed back */
+	ret = chan->ops->recv(chan, skb);
+
+	if (ret >= 0 || chan->rx_avail == -1) {
+		/* Refresh credits for every SDU that gets this far */
+		l2cap_chan_le_send_credits(chan);
+		return ret;
+	}
+
+	BT_ERR("Queueing received LE L2CAP data failed");
+	l2cap_send_disconn_req(chan, ECONNRESET);
+
+	return ret;
+}
+
+static int l2cap_ecred_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
{
int err;
@@ -6735,9 +6605,14 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
}
chan->rx_credits--;
- BT_DBG("rx_credits %u -> %u", chan->rx_credits + 1, chan->rx_credits);
+ BT_DBG("chan %p: rx_credits %u -> %u",
+ chan, chan->rx_credits + 1, chan->rx_credits);
- l2cap_chan_le_send_credits(chan);
+ /* Update if remote had run out of credits, this should only happen
+ * if the remote is not using the entire MPS.
+ */
+ if (!chan->rx_credits)
+ l2cap_chan_le_send_credits(chan);
err = 0;
@@ -6763,12 +6638,22 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
}
if (skb->len == sdu_len)
- return chan->ops->recv(chan, skb);
+ return l2cap_ecred_recv(chan, skb);
chan->sdu = skb;
chan->sdu_len = sdu_len;
chan->sdu_last_frag = skb;
+ /* Detect if remote is not able to use the selected MPS */
+ if (skb->len + L2CAP_SDULEN_SIZE < chan->mps) {
+ u16 mps_len = skb->len + L2CAP_SDULEN_SIZE;
+
+ /* Adjust the number of credits */
+ BT_DBG("chan->mps %u -> %u", chan->mps, mps_len);
+ chan->mps = mps_len;
+ l2cap_chan_le_send_credits(chan);
+ }
+
return 0;
}
@@ -6785,7 +6670,7 @@ static int l2cap_le_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
skb = NULL;
if (chan->sdu->len == chan->sdu_len) {
- err = chan->ops->recv(chan, chan->sdu);
+ err = l2cap_ecred_recv(chan, chan->sdu);
if (!err) {
chan->sdu = NULL;
chan->sdu_last_frag = NULL;
@@ -6816,26 +6701,16 @@ static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid,
chan = l2cap_get_chan_by_scid(conn, cid);
if (!chan) {
- if (cid == L2CAP_CID_A2MP) {
- chan = a2mp_channel_create(conn, skb);
- if (!chan) {
- kfree_skb(skb);
- return;
- }
-
- l2cap_chan_lock(chan);
- } else {
- BT_DBG("unknown cid 0x%4.4x", cid);
- /* Drop packet and return */
- kfree_skb(skb);
- return;
- }
+ BT_DBG("unknown cid 0x%4.4x", cid);
+ /* Drop packet and return */
+ kfree_skb(skb);
+ return;
}
BT_DBG("chan %p, len %d", chan, skb->len);
/* If we receive data on a fixed channel before the info req/rsp
- * procdure is done simply assume that the channel is supported
+ * procedure is done simply assume that the channel is supported
* and mark it as ready.
*/
if (chan->chan_type == L2CAP_CHAN_FIXED)
@@ -6846,7 +6721,8 @@ static void l2cap_data_channel(struct l2cap_conn *conn, u16 cid,
switch (chan->mode) {
case L2CAP_MODE_LE_FLOWCTL:
- if (l2cap_le_data_rcv(chan, skb) < 0)
+ case L2CAP_MODE_EXT_FLOWCTL:
+ if (l2cap_ecred_data_rcv(chan, skb) < 0)
goto drop;
goto done;
@@ -6881,6 +6757,7 @@ drop:
done:
l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
}
static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm,
@@ -6899,6 +6776,8 @@ static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm,
BT_DBG("chan %p, len %d", chan, skb->len);
+ l2cap_chan_lock(chan);
+
if (chan->state != BT_BOUND && chan->state != BT_CONNECTED)
goto drop;
@@ -6910,11 +6789,13 @@ static void l2cap_conless_channel(struct l2cap_conn *conn, __le16 psm,
bt_cb(skb)->l2cap.psm = psm;
if (!chan->ops->recv(chan, skb)) {
+ l2cap_chan_unlock(chan);
l2cap_chan_put(chan);
return;
}
drop:
+ l2cap_chan_unlock(chan);
l2cap_chan_put(chan);
free_skb:
kfree_skb(skb);
@@ -6946,7 +6827,7 @@ static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb)
* at least ensure that we ignore incoming data from them.
*/
if (hcon->type == LE_LINK &&
- hci_bdaddr_list_lookup(&hcon->hdev->blacklist, &hcon->dst,
+ hci_bdaddr_list_lookup(&hcon->hdev->reject_list, &hcon->dst,
bdaddr_dst_type(hcon))) {
kfree_skb(skb);
return;
@@ -6983,8 +6864,12 @@ static void process_pending_rx(struct work_struct *work)
BT_DBG("");
+ mutex_lock(&conn->lock);
+
while ((skb = skb_dequeue(&conn->pending_rx)))
l2cap_recv_frame(conn, skb);
+
+ mutex_unlock(&conn->lock);
}
static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon)
@@ -7012,33 +6897,18 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon)
BT_DBG("hcon %p conn %p hchan %p", hcon, conn, hchan);
- switch (hcon->type) {
- case LE_LINK:
- if (hcon->hdev->le_mtu) {
- conn->mtu = hcon->hdev->le_mtu;
- break;
- }
- /* fall through */
- default:
- conn->mtu = hcon->hdev->acl_mtu;
- break;
- }
-
+ conn->mtu = hcon->mtu;
conn->feat_mask = 0;
conn->local_fixed_chan = L2CAP_FC_SIG_BREDR | L2CAP_FC_CONNLESS;
- if (hcon->type == ACL_LINK &&
- hci_dev_test_flag(hcon->hdev, HCI_HS_ENABLED))
- conn->local_fixed_chan |= L2CAP_FC_A2MP;
-
if (hci_dev_test_flag(hcon->hdev, HCI_LE_ENABLED) &&
(bredr_sc_enabled(hcon->hdev) ||
hci_dev_test_flag(hcon->hdev, HCI_FORCE_BREDR_SMP)))
conn->local_fixed_chan |= L2CAP_FC_SMP_BREDR;
mutex_init(&conn->ident_lock);
- mutex_init(&conn->chan_lock);
+ mutex_init(&conn->lock);
INIT_LIST_HEAD(&conn->chan_l);
INIT_LIST_HEAD(&conn->users);
@@ -7047,14 +6917,15 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon)
skb_queue_head_init(&conn->pending_rx);
INIT_WORK(&conn->pending_rx_work, process_pending_rx);
- INIT_WORK(&conn->id_addr_update_work, l2cap_conn_update_id_addr);
+ INIT_DELAYED_WORK(&conn->id_addr_timer, l2cap_conn_update_id_addr);
conn->disc_reason = HCI_ERROR_REMOTE_USER_TERM;
return conn;
}
-static bool is_valid_psm(u16 psm, u8 dst_type) {
+static bool is_valid_psm(u16 psm, u8 dst_type)
+{
if (!psm)
return false;
@@ -7065,16 +6936,43 @@ static bool is_valid_psm(u16 psm, u8 dst_type) {
return ((psm & 0x0101) == 0x0001);
}
+/* Accumulator handed as the opaque data argument to l2cap_chan_by_pid()
+ * when it is run over a connection's channels through l2cap_chan_list().
+ */
+struct l2cap_chan_data {
+ /* Reference channel; it is skipped by the callback and supplies the PSM */
+ struct l2cap_chan *chan;
+ /* Peer PID that a channel must have in order to be counted */
+ struct pid *pid;
+ /* Running total; the callback increments it once per matching channel */
+ int count;
+};
+
+/* Channel-list callback: bump the counter in @data (a struct
+ * l2cap_chan_data) for each channel other than the reference one that has
+ * FLAG_DEFER_SETUP set, the same peer PID and PSM, no ident assigned, mode
+ * L2CAP_MODE_EXT_FLOWCTL and state BT_CONNECT.
+ */
+static void l2cap_chan_by_pid(struct l2cap_chan *chan, void *data)
+{
+	struct l2cap_chan_data *match = data;
+	struct pid *peer;
+
+	/* Skip the reference channel and anything not using deferred setup */
+	if (chan == match->chan || !test_bit(FLAG_DEFER_SETUP, &chan->flags))
+		return;
+
+	peer = chan->ops->get_peer_pid(chan);
+
+	/* Only count deferred channels with the same PID/PSM */
+	if (match->pid == peer && chan->psm == match->chan->psm &&
+	    !chan->ident && chan->mode == L2CAP_MODE_EXT_FLOWCTL &&
+	    chan->state == BT_CONNECT)
+		match->count++;
+}
+
int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
- bdaddr_t *dst, u8 dst_type)
+ bdaddr_t *dst, u8 dst_type, u16 timeout)
{
struct l2cap_conn *conn;
struct hci_conn *hcon;
struct hci_dev *hdev;
int err;
- BT_DBG("%pMR -> %pMR (type %u) psm 0x%2.2x", &chan->src, dst,
- dst_type, __le16_to_cpu(psm));
+ BT_DBG("%pMR -> %pMR (type %u) psm 0x%4.4x mode 0x%2.2x", &chan->src,
+ dst, dst_type, __le16_to_cpu(psm), chan->mode);
hdev = hci_get_route(dst, &chan->src, chan->src_type);
if (!hdev)
@@ -7102,13 +7000,18 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
case L2CAP_MODE_BASIC:
break;
case L2CAP_MODE_LE_FLOWCTL:
- l2cap_le_flowctl_init(chan);
+ break;
+ case L2CAP_MODE_EXT_FLOWCTL:
+ if (!enable_ecred) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
break;
case L2CAP_MODE_ERTM:
case L2CAP_MODE_STREAMING:
if (!disable_ertm)
break;
- /* fall through */
+ fallthrough;
default:
err = -EOPNOTSUPP;
goto done;
@@ -7153,18 +7056,18 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
dst_type = ADDR_LE_DEV_RANDOM;
if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
- hcon = hci_connect_le(hdev, dst, dst_type,
- chan->sec_level,
- HCI_LE_CONN_TIMEOUT,
- HCI_ROLE_SLAVE, NULL);
+ hcon = hci_connect_le(hdev, dst, dst_type, false,
+ chan->sec_level, timeout,
+ HCI_ROLE_SLAVE, 0, 0);
else
hcon = hci_connect_le_scan(hdev, dst, dst_type,
- chan->sec_level,
- HCI_LE_CONN_TIMEOUT);
+ chan->sec_level, timeout,
+ CONN_REASON_L2CAP_CHAN);
} else {
u8 auth_type = l2cap_get_auth_type(chan);
- hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type);
+ hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type,
+ CONN_REASON_L2CAP_CHAN, timeout);
}
if (IS_ERR(hcon)) {
@@ -7179,7 +7082,24 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
goto done;
}
- mutex_lock(&conn->chan_lock);
+ if (chan->mode == L2CAP_MODE_EXT_FLOWCTL) {
+ struct l2cap_chan_data data;
+
+ data.chan = chan;
+ data.pid = chan->ops->get_peer_pid(chan);
+ data.count = 1;
+
+ l2cap_chan_list(conn, l2cap_chan_by_pid, &data);
+
+ /* Check that there aren't too many channels being connected */
+ if (data.count > L2CAP_ECRED_CONN_SCID_MAX) {
+ hci_conn_drop(hcon);
+ err = -EPROTO;
+ goto done;
+ }
+ }
+
+ mutex_lock(&conn->lock);
l2cap_chan_lock(chan);
if (cid && __l2cap_get_chan_by_dcid(conn, cid)) {
@@ -7220,7 +7140,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
chan_unlock:
l2cap_chan_unlock(chan);
- mutex_unlock(&conn->chan_lock);
+ mutex_unlock(&conn->lock);
done:
hci_dev_unlock(hdev);
hci_dev_put(hdev);
@@ -7228,6 +7148,35 @@ done:
}
EXPORT_SYMBOL_GPL(l2cap_chan_connect);
+/* Send an L2CAP_ECRED_RECONF_REQ carrying the channel's imtu, mps and scid.
+ *
+ * DEFINE_RAW_FLEX() makes @pdu a pointer to on-stack storage sized for one
+ * scid entry (hence the pdu-> accesses below). The command payload must
+ * therefore be the pointed-to PDU with its struct_size(); sizeof(pdu) and
+ * &pdu would instead transmit the bytes of the pointer variable itself.
+ */
+static void l2cap_ecred_reconfigure(struct l2cap_chan *chan)
+{
+	struct l2cap_conn *conn = chan->conn;
+	DEFINE_RAW_FLEX(struct l2cap_ecred_reconf_req, pdu, scid, 1);
+
+	pdu->mtu = cpu_to_le16(chan->imtu);
+	pdu->mps = cpu_to_le16(chan->mps);
+	pdu->scid[0] = cpu_to_le16(chan->scid);
+
+	/* Remember the ident; the response handler matches on chan->ident */
+	chan->ident = l2cap_get_ident(conn);
+
+	l2cap_send_cmd(conn, chan->ident, L2CAP_ECRED_RECONF_REQ,
+		       struct_size(pdu, scid, 1), pdu);
+}
+
+/* Set a new incoming MTU on @chan and send a reconfigure request for it.
+ *
+ * Returns -EINVAL if @mtu is smaller than the current chan->imtu,
+ * otherwise stores @mtu, triggers the reconfigure request and returns 0.
+ */
+int l2cap_chan_reconfigure(struct l2cap_chan *chan, __u16 mtu)
+{
+	/* The incoming MTU may stay the same or grow, never shrink */
+	if (mtu < chan->imtu)
+		return -EINVAL;
+
+	BT_DBG("chan %p mtu 0x%4.4x", chan, mtu);
+
+	chan->imtu = mtu;
+	l2cap_ecred_reconfigure(chan);
+
+	return 0;
+}
+
/* ---- L2CAP interface with lower layer (HCI) ---- */
int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr)
@@ -7285,7 +7234,7 @@ static struct l2cap_chan *l2cap_global_fixed_chan(struct l2cap_chan *c,
if (src_type != c->src_type)
continue;
- l2cap_chan_hold(c);
+ c = l2cap_chan_hold_unless_zero(c);
read_unlock(&chan_list_lock);
return c;
}
@@ -7319,7 +7268,7 @@ static void l2cap_connect_cfm(struct hci_conn *hcon, u8 status)
dst_type = bdaddr_dst_type(hcon);
/* If device is blocked, do not create channels for it */
- if (hci_bdaddr_list_lookup(&hdev->blacklist, &hcon->dst, dst_type))
+ if (hci_bdaddr_list_lookup(&hdev->reject_list, &hcon->dst, dst_type))
return;
/* Find fixed channels and notify them of the new connection. We
@@ -7404,7 +7353,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
BT_DBG("conn %p status 0x%2.2x encrypt %u", conn, status, encrypt);
- mutex_lock(&conn->chan_lock);
+ mutex_lock(&conn->lock);
list_for_each_entry(chan, &conn->chan_l, list) {
l2cap_chan_lock(chan);
@@ -7412,11 +7361,6 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
BT_DBG("chan %p scid 0x%4.4x state %s", chan, chan->scid,
state_to_string(chan->state));
- if (chan->scid == L2CAP_CID_A2MP) {
- l2cap_chan_unlock(chan);
- continue;
- }
-
if (!status && encrypt)
chan->sec_level = hcon->sec_level;
@@ -7434,16 +7378,17 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
}
if (chan->state == BT_CONNECT) {
- if (!status)
+ if (!status && l2cap_check_enc_key_size(hcon, chan))
l2cap_start_connection(chan);
else
__set_chan_timer(chan, L2CAP_DISC_TIMEOUT);
} else if (chan->state == BT_CONNECT2 &&
- chan->mode != L2CAP_MODE_LE_FLOWCTL) {
+ !(chan->mode == L2CAP_MODE_EXT_FLOWCTL ||
+ chan->mode == L2CAP_MODE_LE_FLOWCTL)) {
struct l2cap_conn_rsp rsp;
__u16 res, stat;
- if (!status) {
+ if (!status && l2cap_check_enc_key_size(hcon, chan)) {
if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) {
res = L2CAP_CR_PEND;
stat = L2CAP_CS_AUTHOR_PEND;
@@ -7482,96 +7427,216 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
l2cap_chan_unlock(chan);
}
- mutex_unlock(&conn->chan_lock);
+ mutex_unlock(&conn->lock);
}
-void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
+/* Append fragment into frame respecting the maximum len of rx_skb
+ *
+ * If no frame is being assembled (conn->rx_skb is NULL), a new rx_skb of
+ * @len bytes is allocated first and conn->rx_len starts at @len. At most
+ * @len bytes, and never more than @skb holds, are then copied from @skb
+ * into conn->rx_skb, pulled off @skb and subtracted from conn->rx_len.
+ *
+ * Returns the number of bytes copied, or -ENOMEM if the allocation failed.
+ */
+static int l2cap_recv_frag(struct l2cap_conn *conn, struct sk_buff *skb,
+ u16 len)
{
- struct l2cap_conn *conn = hcon->l2cap_data;
- struct l2cap_hdr *hdr;
+ if (!conn->rx_skb) {
+ /* Allocate skb for the complete frame (with header) */
+ conn->rx_skb = bt_skb_alloc(len, GFP_KERNEL);
+ if (!conn->rx_skb)
+ return -ENOMEM;
+ /* Init rx_len: it is decremented below as bytes are copied in */
+ conn->rx_len = len;
+
+ /* Pass the fragment's tstamp and tstamp_type on to the new rx_skb */
+ skb_set_delivery_time(conn->rx_skb, skb->tstamp,
+ skb->tstamp_type);
+ }
+
+ /* Copy at most len bytes, bounded by what the fragment contains */
+ len = min_t(u16, len, skb->len);
+ skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, len), len);
+ skb_pull(skb, len);
+ conn->rx_len -= len;
+
+ return len;
+}
+
+/* Finish reading the L2CAP length field when the data gathered so far in
+ * conn->rx_skb holds fewer than L2CAP_LEN_SIZE bytes.
+ *
+ * The missing bytes are appended from @skb. If the length is still
+ * incomplete afterwards, the result of l2cap_recv_frag() is returned as is.
+ * Otherwise the length is decoded and either conn->rx_len is updated in
+ * place (returning L2CAP_LEN_SIZE) or, when rx_skb lacks the tailroom,
+ * rx_skb is re-created through l2cap_recv_frag() and the old buffer freed.
+ *
+ * A negative return value comes from l2cap_recv_frag().
+ */
+static int l2cap_recv_len(struct l2cap_conn *conn, struct sk_buff *skb)
+{
+ struct sk_buff *rx_skb;
int len;
- /* For AMP controller do not create l2cap conn */
- if (!conn && hcon->hdev->dev_type != HCI_PRIMARY)
- goto drop;
+ /* Append just enough to complete the header */
+ len = l2cap_recv_frag(conn, skb, L2CAP_LEN_SIZE - conn->rx_skb->len);
+
+ /* If header could not be read just continue */
+ if (len < 0 || conn->rx_skb->len < L2CAP_LEN_SIZE)
+ return len;
+
+ rx_skb = conn->rx_skb;
+ len = get_unaligned_le16(rx_skb->data);
+
+ /* Check if rx_skb has enough space to receive all fragments */
+ if (len + (L2CAP_HDR_SIZE - L2CAP_LEN_SIZE) <= skb_tailroom(rx_skb)) {
+ /* Update expected len */
+ conn->rx_len = len + (L2CAP_HDR_SIZE - L2CAP_LEN_SIZE);
+ return L2CAP_LEN_SIZE;
+ }
+
+ /* Reset conn->rx_skb since it will need to be reallocated in order to
+ * fit all fragments.
+ */
+ conn->rx_skb = NULL;
+
+ /* Reallocates rx_skb using the exact expected length
+ *
+ * NOTE(review): the in-place branch above leaves rx_len at
+ * len + (L2CAP_HDR_SIZE - L2CAP_LEN_SIZE), whereas l2cap_recv_frag()
+ * starts rx_len at that same value and then subtracts the bytes it
+ * copies over from the old rx_skb, so the two branches appear to end
+ * with different rx_len values - confirm the intended length here.
+ */
+ len = l2cap_recv_frag(conn, rx_skb,
+ len + (L2CAP_HDR_SIZE - L2CAP_LEN_SIZE));
+ kfree_skb(rx_skb);
+
+ return len;
+}
+
+/* Drop any partially reassembled frame: free conn->rx_skb and clear both
+ * rx_skb and rx_len.
+ */
+static void l2cap_recv_reset(struct l2cap_conn *conn)
+{
+	struct sk_buff *partial = conn->rx_skb;
+
+	kfree_skb(partial);
+
+	conn->rx_skb = NULL;
+	conn->rx_len = 0;
+}
+
+/* Try to take a reference on @c via kref_get_unless_zero().
+ *
+ * Returns @c when the reference was obtained, or NULL when @c is NULL or
+ * kref_get_unless_zero() refused.
+ */
+struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c)
+{
+	if (!c)
+		return NULL;
+
+	BT_DBG("conn %p orig refcnt %u", c, kref_read(&c->ref));
+
+	return kref_get_unless_zero(&c->ref) ? c : NULL;
+}
+
+int l2cap_recv_acldata(struct hci_dev *hdev, u16 handle,
+ struct sk_buff *skb, u16 flags)
+{
+ struct hci_conn *hcon;
+ struct l2cap_conn *conn;
+ int len;
+
+ /* Lock hdev for hci_conn, and race on l2cap_data vs. l2cap_conn_del */
+ hci_dev_lock(hdev);
+
+ hcon = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!hcon) {
+ hci_dev_unlock(hdev);
+ kfree_skb(skb);
+ return -ENOENT;
+ }
+
+ hci_conn_enter_active_mode(hcon, BT_POWER_FORCE_ACTIVE_OFF);
+
+ conn = hcon->l2cap_data;
if (!conn)
conn = l2cap_conn_add(hcon);
- if (!conn)
- goto drop;
+ conn = l2cap_conn_hold_unless_zero(conn);
+ hcon = NULL;
+
+ hci_dev_unlock(hdev);
+
+ if (!conn) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
- BT_DBG("conn %p len %d flags 0x%x", conn, skb->len, flags);
+ BT_DBG("conn %p len %u flags 0x%x", conn, skb->len, flags);
+
+ mutex_lock(&conn->lock);
switch (flags) {
case ACL_START:
case ACL_START_NO_FLUSH:
case ACL_COMPLETE:
- if (conn->rx_len) {
+ if (conn->rx_skb) {
BT_ERR("Unexpected start frame (len %d)", skb->len);
- kfree_skb(conn->rx_skb);
- conn->rx_skb = NULL;
- conn->rx_len = 0;
+ l2cap_recv_reset(conn);
l2cap_conn_unreliable(conn, ECOMM);
}
- /* Start fragment always begin with Basic L2CAP header */
- if (skb->len < L2CAP_HDR_SIZE) {
- BT_ERR("Frame is too short (len %d)", skb->len);
- l2cap_conn_unreliable(conn, ECOMM);
- goto drop;
+ /* Start fragment may not contain the L2CAP length so just
+ * copy the initial byte when that happens and use conn->mtu as
+ * expected length.
+ */
+ if (skb->len < L2CAP_LEN_SIZE) {
+ l2cap_recv_frag(conn, skb, conn->mtu);
+ break;
}
- hdr = (struct l2cap_hdr *) skb->data;
- len = __le16_to_cpu(hdr->len) + L2CAP_HDR_SIZE;
+ len = get_unaligned_le16(skb->data) + L2CAP_HDR_SIZE;
if (len == skb->len) {
/* Complete frame received */
l2cap_recv_frame(conn, skb);
- return;
+ goto unlock;
}
- BT_DBG("Start: total len %d, frag len %d", len, skb->len);
+ BT_DBG("Start: total len %d, frag len %u", len, skb->len);
if (skb->len > len) {
- BT_ERR("Frame is too long (len %d, expected len %d)",
+ BT_ERR("Frame is too long (len %u, expected len %d)",
skb->len, len);
+ /* PTS test cases L2CAP/COS/CED/BI-14-C and BI-15-C
+ * (Multiple Signaling Command in one PDU, Data
+ * Truncated, BR/EDR) send a C-frame to the IUT with
+ * PDU Length set to 8 and Channel ID set to the
+ * correct signaling channel for the logical link.
+ * The Information payload contains one L2CAP_ECHO_REQ
+ * packet with Data Length set to 0 with 0 octets of
+ * echo data and one invalid command packet due to
+ * data truncated in PDU but present in HCI packet.
+ *
+ * Shorten the socket buffer to the PDU length to
+ * allow to process valid commands from the PDU before
+ * setting the socket unreliable.
+ */
+ skb->len = len;
+ l2cap_recv_frame(conn, skb);
l2cap_conn_unreliable(conn, ECOMM);
- goto drop;
+ goto unlock;
}
- /* Allocate skb for the complete frame (with header) */
- conn->rx_skb = bt_skb_alloc(len, GFP_KERNEL);
- if (!conn->rx_skb)
+ /* Append fragment into frame (with header) */
+ if (l2cap_recv_frag(conn, skb, len) < 0)
goto drop;
- skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
- skb->len);
- conn->rx_len = len - skb->len;
break;
case ACL_CONT:
- BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len);
+ BT_DBG("Cont: frag len %u (expecting %u)", skb->len, conn->rx_len);
- if (!conn->rx_len) {
+ if (!conn->rx_skb) {
BT_ERR("Unexpected continuation frame (len %d)", skb->len);
l2cap_conn_unreliable(conn, ECOMM);
goto drop;
}
+ /* Complete the L2CAP length if it has not been read */
+ if (conn->rx_skb->len < L2CAP_LEN_SIZE) {
+ if (l2cap_recv_len(conn, skb) < 0) {
+ l2cap_conn_unreliable(conn, ECOMM);
+ goto drop;
+ }
+
+ /* Header still could not be read just continue */
+ if (conn->rx_skb->len < L2CAP_LEN_SIZE)
+ break;
+ }
+
if (skb->len > conn->rx_len) {
- BT_ERR("Fragment is too long (len %d, expected %d)",
+ BT_ERR("Fragment is too long (len %u, expected %u)",
skb->len, conn->rx_len);
- kfree_skb(conn->rx_skb);
- conn->rx_skb = NULL;
- conn->rx_len = 0;
+ l2cap_recv_reset(conn);
l2cap_conn_unreliable(conn, ECOMM);
goto drop;
}
- skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
- skb->len);
- conn->rx_len -= skb->len;
+ /* Append fragment into frame (with header) */
+ l2cap_recv_frag(conn, skb, skb->len);
if (!conn->rx_len) {
/* Complete frame received. l2cap_recv_frame
@@ -7587,6 +7652,10 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
drop:
kfree_skb(skb);
+unlock:
+ mutex_unlock(&conn->lock);
+ l2cap_conn_put(conn);
+ return 0;
}
static struct hci_cb l2cap_cb = {
@@ -7615,17 +7684,7 @@ static int l2cap_debugfs_show(struct seq_file *f, void *p)
return 0;
}
-static int l2cap_debugfs_open(struct inode *inode, struct file *file)
-{
- return single_open(file, l2cap_debugfs_show, inode->i_private);
-}
-
-static const struct file_operations l2cap_debugfs_fops = {
- .open = l2cap_debugfs_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(l2cap_debugfs);
static struct dentry *l2cap_debugfs;
@@ -7645,11 +7704,6 @@ int __init l2cap_init(void)
l2cap_debugfs = debugfs_create_file("l2cap", 0444, bt_debugfs,
NULL, &l2cap_debugfs_fops);
- debugfs_create_u16("l2cap_le_max_credits", 0644, bt_debugfs,
- &le_max_credits);
- debugfs_create_u16("l2cap_le_default_mps", 0644, bt_debugfs,
- &le_default_mps);
-
return 0;
}
@@ -7662,3 +7716,6 @@ void l2cap_exit(void)
module_param(disable_ertm, bool, 0644);
MODULE_PARM_DESC(disable_ertm, "Disable enhanced retransmission mode");
+
+module_param(enable_ecred, bool, 0644);
+MODULE_PARM_DESC(enable_ecred, "Enable enhanced credit flow control mode");
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 686bdc6b35b0..9ee189c815d4 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -29,6 +29,7 @@
#include <linux/module.h>
#include <linux/export.h>
+#include <linux/filter.h>
#include <linux/sched/signal.h>
#include <net/bluetooth/bluetooth.h>
@@ -45,6 +46,7 @@ static const struct proto_ops l2cap_sock_ops;
static void l2cap_sock_init(struct sock *sk, struct sock *parent);
static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock,
int proto, gfp_t prio, int kern);
+static void l2cap_sock_cleanup_listen(struct sock *parent);
bool l2cap_is_socket(struct socket *sock)
{
@@ -78,7 +80,7 @@ static int l2cap_validate_le_psm(u16 psm)
return 0;
}
-static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
+static int l2cap_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int alen)
{
struct sock *sk = sock->sk;
struct l2cap_chan *chan = l2cap_pi(sk)->chan;
@@ -161,7 +163,11 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
break;
}
- if (chan->psm && bdaddr_type_is_le(chan->src_type))
+ /* Use L2CAP_MODE_LE_FLOWCTL (CoC) in case of LE address and
+ * L2CAP_MODE_EXT_FLOWCTL (ECRED) has not been set.
+ */
+ if (chan->psm && bdaddr_type_is_le(chan->src_type) &&
+ chan->mode != L2CAP_MODE_EXT_FLOWCTL)
chan->mode = L2CAP_MODE_LE_FLOWCTL;
chan->state = BT_BOUND;
@@ -172,16 +178,24 @@ done:
return err;
}
-static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr,
+static int l2cap_sock_connect(struct socket *sock, struct sockaddr_unsized *addr,
int alen, int flags)
{
struct sock *sk = sock->sk;
struct l2cap_chan *chan = l2cap_pi(sk)->chan;
struct sockaddr_l2 la;
int len, err = 0;
+ bool zapped;
BT_DBG("sk %p", sk);
+ lock_sock(sk);
+ zapped = sock_flag(sk, SOCK_ZAPPED);
+ release_sock(sk);
+
+ if (zapped)
+ return -EINVAL;
+
if (!addr || alen < offsetofend(struct sockaddr, sa_family) ||
addr->sa_family != AF_BLUETOOTH)
return -EINVAL;
@@ -232,11 +246,16 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr,
return -EINVAL;
}
- if (chan->psm && bdaddr_type_is_le(chan->src_type))
+ /* Use L2CAP_MODE_LE_FLOWCTL (CoC) in case of LE address and
+ * L2CAP_MODE_EXT_FLOWCTL (ECRED) has not been set.
+ */
+ if (chan->psm && bdaddr_type_is_le(chan->src_type) &&
+ chan->mode != L2CAP_MODE_EXT_FLOWCTL)
chan->mode = L2CAP_MODE_LE_FLOWCTL;
err = l2cap_chan_connect(chan, la.l2_psm, __le16_to_cpu(la.l2_cid),
- &la.l2_bdaddr, la.l2_bdaddr_type);
+ &la.l2_bdaddr, la.l2_bdaddr_type,
+ READ_ONCE(sk->sk_sndtimeo));
if (err)
return err;
@@ -274,11 +293,17 @@ static int l2cap_sock_listen(struct socket *sock, int backlog)
case L2CAP_MODE_BASIC:
case L2CAP_MODE_LE_FLOWCTL:
break;
+ case L2CAP_MODE_EXT_FLOWCTL:
+ if (!enable_ecred) {
+ err = -EOPNOTSUPP;
+ goto done;
+ }
+ break;
case L2CAP_MODE_ERTM:
case L2CAP_MODE_STREAMING:
if (!disable_ertm)
break;
- /* fall through */
+ fallthrough;
default:
err = -EOPNOTSUPP;
goto done;
@@ -302,7 +327,7 @@ done:
}
static int l2cap_sock_accept(struct socket *sock, struct socket *newsock,
- int flags, bool kern)
+ struct proto_accept_arg *arg)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct sock *sk = sock->sk, *nsk;
@@ -311,7 +336,7 @@ static int l2cap_sock_accept(struct socket *sock, struct socket *newsock,
lock_sock_nested(sk, L2CAP_NESTING_PARENT);
- timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
BT_DBG("sk %p timeo %ld", sk, timeo);
@@ -389,6 +414,24 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr,
return sizeof(struct sockaddr_l2);
}
+/* Translate the channel's internal L2CAP_MODE_* value into the BT_MODE_*
+ * constant handed to user space by the BT_MODE getsockopt path.
+ *
+ * Returns the BT_MODE_* value, or -EINVAL when chan->mode has no
+ * BT_MODE_* counterpart.
+ */
+static int l2cap_get_mode(struct l2cap_chan *chan)
+{
+	int bt_mode = -EINVAL;
+
+	switch (chan->mode) {
+	case L2CAP_MODE_BASIC:
+		bt_mode = BT_MODE_BASIC;
+		break;
+	case L2CAP_MODE_ERTM:
+		bt_mode = BT_MODE_ERTM;
+		break;
+	case L2CAP_MODE_STREAMING:
+		bt_mode = BT_MODE_STREAMING;
+		break;
+	case L2CAP_MODE_LE_FLOWCTL:
+		bt_mode = BT_MODE_LE_FLOWCTL;
+		break;
+	case L2CAP_MODE_EXT_FLOWCTL:
+		bt_mode = BT_MODE_EXT_FLOWCTL;
+		break;
+	}
+
+	return bt_mode;
+}
+
static int l2cap_sock_getsockopt_old(struct socket *sock, int optname,
char __user *optval, int __user *optlen)
{
@@ -396,7 +439,8 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname,
struct l2cap_chan *chan = l2cap_pi(sk)->chan;
struct l2cap_options opts;
struct l2cap_conninfo cinfo;
- int len, err = 0;
+ int err = 0;
+ size_t len;
u32 opt;
BT_DBG("sk %p", sk);
@@ -418,6 +462,20 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname,
break;
}
+ /* Only BR/EDR modes are supported here */
+ switch (chan->mode) {
+ case L2CAP_MODE_BASIC:
+ case L2CAP_MODE_ERTM:
+ case L2CAP_MODE_STREAMING:
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ if (err < 0)
+ break;
+
memset(&opts, 0, sizeof(opts));
opts.imtu = chan->imtu;
opts.omtu = chan->omtu;
@@ -427,7 +485,9 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname,
opts.max_tx = chan->max_tx;
opts.txwin_size = chan->tx_win;
- len = min_t(unsigned int, len, sizeof(opts));
+ BT_DBG("mode 0x%2.2x", chan->mode);
+
+ len = min(len, sizeof(opts));
if (copy_to_user(optval, (char *) &opts, len))
err = -EFAULT;
@@ -477,7 +537,7 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname,
cinfo.hci_handle = chan->conn->hcon->handle;
memcpy(cinfo.dev_class, chan->conn->hcon->dev_class, 3);
- len = min_t(unsigned int, len, sizeof(cinfo));
+ len = min(len, sizeof(cinfo));
if (copy_to_user(optval, (char *) &cinfo, len))
err = -EFAULT;
@@ -499,7 +559,8 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname,
struct l2cap_chan *chan = l2cap_pi(sk)->chan;
struct bt_security sec;
struct bt_power pwr;
- int len, err = 0;
+ u32 phys;
+ int len, mode, err = 0;
BT_DBG("sk %p", sk);
@@ -603,6 +664,39 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname,
err = -EFAULT;
break;
+ case BT_PHY:
+ if (sk->sk_state != BT_CONNECTED) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ phys = hci_conn_get_phy(chan->conn->hcon);
+
+ if (put_user(phys, (u32 __user *) optval))
+ err = -EFAULT;
+ break;
+
+ case BT_MODE:
+ if (!enable_ecred) {
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) {
+ err = -EINVAL;
+ break;
+ }
+
+ mode = l2cap_get_mode(chan);
+ if (mode < 0) {
+ err = mode;
+ break;
+ }
+
+ if (put_user(mode, (u8 __user *) optval))
+ err = -EFAULT;
+ break;
+
default:
err = -ENOPROTOOPT;
break;
@@ -616,12 +710,12 @@ static bool l2cap_valid_mtu(struct l2cap_chan *chan, u16 mtu)
{
switch (chan->scid) {
case L2CAP_CID_ATT:
- if (mtu < L2CAP_LE_MIN_MTU)
+ if (mtu && mtu < L2CAP_LE_MIN_MTU)
return false;
break;
default:
- if (mtu < L2CAP_DEFAULT_MIN_MTU)
+ if (mtu && mtu < L2CAP_DEFAULT_MIN_MTU)
return false;
}
@@ -629,12 +723,12 @@ static bool l2cap_valid_mtu(struct l2cap_chan *chan, u16 mtu)
}
static int l2cap_sock_setsockopt_old(struct socket *sock, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct l2cap_chan *chan = l2cap_pi(sk)->chan;
struct l2cap_options opts;
- int len, err = 0;
+ int err = 0;
u32 opt;
BT_DBG("sk %p", sk);
@@ -661,11 +755,10 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname,
opts.max_tx = chan->max_tx;
opts.txwin_size = chan->tx_win;
- len = min_t(unsigned int, sizeof(opts), optlen);
- if (copy_from_user((char *) &opts, optval, len)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&opts, sizeof(opts), optval,
+ optlen);
+ if (err)
break;
- }
if (opts.txwin_size > L2CAP_DEFAULT_EXT_WINDOW) {
err = -EINVAL;
@@ -677,10 +770,8 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname,
break;
}
- chan->mode = opts.mode;
- switch (chan->mode) {
- case L2CAP_MODE_LE_FLOWCTL:
- break;
+ /* Only BR/EDR modes are supported here */
+ switch (opts.mode) {
case L2CAP_MODE_BASIC:
clear_bit(CONF_STATE2_DEVICE, &chan->conf_state);
break;
@@ -688,12 +779,19 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname,
case L2CAP_MODE_STREAMING:
if (!disable_ertm)
break;
- /* fall through */
+ fallthrough;
default:
err = -EINVAL;
break;
}
+ if (err < 0)
+ break;
+
+ chan->mode = opts.mode;
+
+ BT_DBG("mode 0x%2.2x", chan->mode);
+
chan->imtu = opts.imtu;
chan->omtu = opts.omtu;
chan->fcs = opts.fcs;
@@ -703,10 +801,9 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname,
break;
case L2CAP_LM:
- if (get_user(opt, (u32 __user *) optval)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
break;
- }
if (opt & L2CAP_LM_FIPS) {
err = -EINVAL;
@@ -740,16 +837,57 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname,
return err;
}
+/* Apply a BT_MODE_* value requested through the BT_MODE socket option.
+ *
+ * The BR/EDR modes (basic, ERTM, streaming) are refused when the channel's
+ * source address type is LE, and the LE flow-control modes are refused when
+ * it is not. ERTM and streaming are additionally refused while the
+ * disable_ertm switch is set, which is the same rule l2cap_sock_listen()
+ * and l2cap_sock_setsockopt_old() apply to these two modes.
+ *
+ * Returns 0 after storing the translated L2CAP_MODE_* value in chan->mode,
+ * or -EINVAL when the mode is unknown or not permitted. chan->mode is left
+ * untouched on failure.
+ */
+static int l2cap_set_mode(struct l2cap_chan *chan, u8 mode)
+{
+	switch (mode) {
+	case BT_MODE_BASIC:
+		if (bdaddr_type_is_le(chan->src_type))
+			return -EINVAL;
+		mode = L2CAP_MODE_BASIC;
+		clear_bit(CONF_STATE2_DEVICE, &chan->conf_state);
+		break;
+	case BT_MODE_ERTM:
+		/* Reject only when ERTM has been disabled, not when it is on */
+		if (disable_ertm || bdaddr_type_is_le(chan->src_type))
+			return -EINVAL;
+		mode = L2CAP_MODE_ERTM;
+		break;
+	case BT_MODE_STREAMING:
+		/* Streaming mode is governed by the same disable_ertm switch */
+		if (disable_ertm || bdaddr_type_is_le(chan->src_type))
+			return -EINVAL;
+		mode = L2CAP_MODE_STREAMING;
+		break;
+	case BT_MODE_LE_FLOWCTL:
+		if (!bdaddr_type_is_le(chan->src_type))
+			return -EINVAL;
+		mode = L2CAP_MODE_LE_FLOWCTL;
+		break;
+	case BT_MODE_EXT_FLOWCTL:
+		/* TODO: Add support for ECRED PDUs to BR/EDR */
+		if (!bdaddr_type_is_le(chan->src_type))
+			return -EINVAL;
+		mode = L2CAP_MODE_EXT_FLOWCTL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	chan->mode = mode;
+
+	return 0;
+}
+
static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct l2cap_chan *chan = l2cap_pi(sk)->chan;
struct bt_security sec;
struct bt_power pwr;
struct l2cap_conn *conn;
- int len, err = 0;
+ int err = 0;
u32 opt;
+ u16 mtu;
+ u8 mode;
BT_DBG("sk %p", sk);
@@ -772,11 +910,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
sec.level = BT_SECURITY_LOW;
- len = min_t(unsigned int, sizeof(sec), optlen);
- if (copy_from_user((char *) &sec, optval, len)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&sec, sizeof(sec), optval, optlen);
+ if (err)
break;
- }
if (sec.level < BT_SECURITY_LOW ||
sec.level > BT_SECURITY_FIPS) {
@@ -791,10 +927,13 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
conn = chan->conn;
- /*change security for LE channels */
+ /* change security for LE channels */
if (chan->scid == L2CAP_CID_ATT) {
- if (smp_conn_security(conn->hcon, sec.level))
+ if (smp_conn_security(conn->hcon, sec.level)) {
+ err = -EINVAL;
break;
+ }
+
set_bit(FLAG_PENDING_SECURITY, &chan->flags);
sk->sk_state = BT_CONFIG;
chan->state = BT_CONFIG;
@@ -818,10 +957,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
break;
}
- if (get_user(opt, (u32 __user *) optval)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
break;
- }
if (opt) {
set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
@@ -833,10 +971,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
break;
case BT_FLUSHABLE:
- if (get_user(opt, (u32 __user *) optval)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
break;
- }
if (opt > BT_FLUSHABLE_ON) {
err = -EINVAL;
@@ -868,11 +1005,9 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
pwr.force_active = BT_POWER_FORCE_ACTIVE_ON;
- len = min_t(unsigned int, sizeof(pwr), optlen);
- if (copy_from_user((char *) &pwr, optval, len)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&pwr, sizeof(pwr), optval, optlen);
+ if (err)
break;
- }
if (pwr.force_active)
set_bit(FLAG_FORCE_ACTIVE, &chan->flags);
@@ -881,28 +1016,11 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
break;
case BT_CHANNEL_POLICY:
- if (get_user(opt, (u32 __user *) optval)) {
- err = -EFAULT;
- break;
- }
-
- if (opt > BT_CHANNEL_POLICY_AMP_PREFERRED) {
- err = -EINVAL;
- break;
- }
-
- if (chan->mode != L2CAP_MODE_ERTM &&
- chan->mode != L2CAP_MODE_STREAMING) {
- err = -EOPNOTSUPP;
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
break;
- }
-
- chan->chan_policy = (u8) opt;
-
- if (sk->sk_state == BT_CONNECTED &&
- chan->move_role == L2CAP_MOVE_ROLE_NONE)
- l2cap_move_start(chan);
+ err = -EOPNOTSUPP;
break;
case BT_SNDMTU:
@@ -923,17 +1041,55 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
break;
}
- if (sk->sk_state == BT_CONNECTED) {
+ if (chan->mode == L2CAP_MODE_LE_FLOWCTL &&
+ sk->sk_state == BT_CONNECTED) {
err = -EISCONN;
break;
}
- if (get_user(opt, (u16 __user *) optval)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&mtu, sizeof(mtu), optval, optlen);
+ if (err)
+ break;
+
+ if (chan->mode == L2CAP_MODE_EXT_FLOWCTL &&
+ sk->sk_state == BT_CONNECTED)
+ err = l2cap_chan_reconfigure(chan, mtu);
+ else
+ chan->imtu = mtu;
+
+ break;
+
+ case BT_MODE:
+ if (!enable_ecred) {
+ err = -ENOPROTOOPT;
+ break;
+ }
+
+ BT_DBG("sk->sk_state %u", sk->sk_state);
+
+ if (sk->sk_state != BT_BOUND) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) {
+ err = -EINVAL;
break;
}
- chan->imtu = opt;
+ err = copy_safe_from_sockptr(&mode, sizeof(mode), optval,
+ optlen);
+ if (err)
+ break;
+
+ BT_DBG("mode %u", mode);
+
+ err = l2cap_set_mode(chan, mode);
+ if (err)
+ break;
+
+ BT_DBG("mode 0x%2.2x", chan->mode);
+
break;
default:
@@ -950,6 +1106,7 @@ static int l2cap_sock_sendmsg(struct socket *sock, struct msghdr *msg,
{
struct sock *sk = sock->sk;
struct l2cap_chan *chan = l2cap_pi(sk)->chan;
+ struct sockcm_cookie sockc;
int err;
BT_DBG("sock %p, sk %p", sock, sk);
@@ -964,6 +1121,14 @@ static int l2cap_sock_sendmsg(struct socket *sock, struct msghdr *msg,
if (sk->sk_state != BT_CONNECTED)
return -ENOTCONN;
+ hci_sockcm_init(&sockc, sk);
+
+ if (msg->msg_controllen) {
+ err = sock_cmsg_send(sk, msg, &sockc);
+ if (err)
+ return err;
+ }
+
lock_sock(sk);
err = bt_sock_wait_ready(sk, msg->msg_flags);
release_sock(sk);
@@ -971,12 +1136,40 @@ static int l2cap_sock_sendmsg(struct socket *sock, struct msghdr *msg,
return err;
l2cap_chan_lock(chan);
- err = l2cap_chan_send(chan, msg, len);
+ err = l2cap_chan_send(chan, msg, len, &sockc);
l2cap_chan_unlock(chan);
return err;
}
+/* Tell the channel how much receive space the backing socket has left.
+ *
+ * Reports 0 when the receive buffer has no room, -1 when the channel has
+ * no MPS set or the estimated sk_buff overhead would use up all of the
+ * free space, and otherwise the free space minus that overhead.
+ */
+static void l2cap_publish_rx_avail(struct l2cap_chan *chan)
+{
+	struct sock *sk = chan->data;
+	ssize_t space = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc);
+	int nr_skbs, overhead;
+
+	if (space <= 0) {
+		l2cap_chan_rx_avail(chan, 0);
+	} else if (!chan->mps) {
+		l2cap_chan_rx_avail(chan, -1);
+	} else {
+		/* Shrink the free space by the sk_buff overhead we expect to
+		 * pay, which matters because transfers are small. Still accept
+		 * at least one full packet while any receive space remains.
+		 */
+		nr_skbs = DIV_ROUND_UP(space, chan->mps);
+		overhead = nr_skbs * sizeof(struct sk_buff);
+		if (overhead < space)
+			l2cap_chan_rx_avail(chan, space - overhead);
+		else
+			l2cap_chan_rx_avail(chan, -1);
+	}
+}
+
static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg,
size_t len, int flags)
{
@@ -984,11 +1177,19 @@ static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg,
struct l2cap_pinfo *pi = l2cap_pi(sk);
int err;
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return sock_recv_errqueue(sk, msg, len, SOL_BLUETOOTH,
+ BT_SCM_ERROR);
+
lock_sock(sk);
if (sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_DEFER_SETUP,
&bt_sk(sk)->flags)) {
- if (bdaddr_type_is_le(pi->chan->src_type)) {
+ if (pi->chan->mode == L2CAP_MODE_EXT_FLOWCTL) {
+ sk->sk_state = BT_CONNECTED;
+ pi->chan->state = BT_CONNECTED;
+ __l2cap_ecred_conn_rsp_defer(pi->chan);
+ } else if (bdaddr_type_is_le(pi->chan->src_type)) {
sk->sk_state = BT_CONNECTED;
pi->chan->state = BT_CONNECTED;
__l2cap_le_connect_rsp_defer(pi->chan);
@@ -1009,28 +1210,33 @@ static int l2cap_sock_recvmsg(struct socket *sock, struct msghdr *msg,
else
err = bt_sock_recvmsg(sock, msg, len, flags);
- if (pi->chan->mode != L2CAP_MODE_ERTM)
+ if (pi->chan->mode != L2CAP_MODE_ERTM &&
+ pi->chan->mode != L2CAP_MODE_LE_FLOWCTL &&
+ pi->chan->mode != L2CAP_MODE_EXT_FLOWCTL)
return err;
- /* Attempt to put pending rx data in the socket buffer */
-
lock_sock(sk);
- if (!test_bit(CONN_LOCAL_BUSY, &pi->chan->conn_state))
- goto done;
+ l2cap_publish_rx_avail(pi->chan);
- if (pi->rx_busy_skb) {
- if (!__sock_queue_rcv_skb(sk, pi->rx_busy_skb))
- pi->rx_busy_skb = NULL;
- else
+ /* Attempt to put pending rx data in the socket buffer */
+ while (!list_empty(&pi->rx_busy)) {
+ struct l2cap_rx_busy *rx_busy =
+ list_first_entry(&pi->rx_busy,
+ struct l2cap_rx_busy,
+ list);
+ if (__sock_queue_rcv_skb(sk, rx_busy->skb) < 0)
goto done;
+ list_del(&rx_busy->list);
+ kfree(rx_busy);
}
/* Restore data flow when half of the receive buffer is
* available. This avoids resending large numbers of
* frames.
*/
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf >> 1)
+ if (test_bit(CONN_LOCAL_BUSY, &pi->chan->conn_state) &&
+ atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf >> 1)
l2cap_chan_busy(pi->chan, 0);
done:
@@ -1039,7 +1245,7 @@ done:
}
/* Kill socket (only if zapped and orphan)
- * Must be called on unlocked socket.
+ * Must be called on unlocked socket, with l2cap channel lock.
*/
static void l2cap_sock_kill(struct sock *sk)
{
@@ -1048,6 +1254,10 @@ static void l2cap_sock_kill(struct sock *sk)
BT_DBG("sk %p state %s", sk, state_to_string(sk->sk_state));
+ /* Sock is dead, so set chan data to NULL to keep other tasks from using
+ * an invalid sock pointer.
+ */
+ l2cap_pi(sk)->chan->data = NULL;
/* Kill poor orphan */
l2cap_chan_put(l2cap_pi(sk)->chan);
@@ -1107,14 +1317,21 @@ static int l2cap_sock_shutdown(struct socket *sock, int how)
struct l2cap_conn *conn;
int err = 0;
- BT_DBG("sock %p, sk %p", sock, sk);
+ BT_DBG("sock %p, sk %p, how %d", sock, sk, how);
+
+ /* 'how' parameter is mapped to sk_shutdown as follows:
+ * SHUT_RD (0) --> RCV_SHUTDOWN (1)
+ * SHUT_WR (1) --> SEND_SHUTDOWN (2)
+ * SHUT_RDWR (2) --> SHUTDOWN_MASK (3)
+ */
+ how++;
if (!sk)
return 0;
lock_sock(sk);
- if (sk->sk_shutdown)
+ if ((sk->sk_shutdown & how) == how)
goto shutdown_already;
BT_DBG("Handling sock shutdown");
@@ -1122,9 +1339,10 @@ static int l2cap_sock_shutdown(struct socket *sock, int how)
/* prevent sk structure from being freed whilst unlocked */
sock_hold(sk);
- chan = l2cap_pi(sk)->chan;
/* prevent chan structure from being freed whilst unlocked */
- l2cap_chan_hold(chan);
+ chan = l2cap_chan_hold_unless_zero(l2cap_pi(sk)->chan);
+ if (!chan)
+ goto shutdown_already;
BT_DBG("chan %p state %s", chan, state_to_string(chan->state));
@@ -1137,30 +1355,37 @@ static int l2cap_sock_shutdown(struct socket *sock, int how)
* has already been actioned to close the L2CAP
* link such as by l2cap_disconnection_req().
*/
- if (sk->sk_shutdown)
- goto has_shutdown;
+ if ((sk->sk_shutdown & how) == how)
+ goto shutdown_matched;
}
- sk->sk_shutdown = SHUTDOWN_MASK;
+ /* Try setting the RCV_SHUTDOWN bit, return early if SEND_SHUTDOWN
+ * is already set
+ */
+ if ((how & RCV_SHUTDOWN) && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+ if ((sk->sk_shutdown & how) == how)
+ goto shutdown_matched;
+ }
+
+ sk->sk_shutdown |= SEND_SHUTDOWN;
release_sock(sk);
l2cap_chan_lock(chan);
- conn = chan->conn;
- if (conn)
- /* prevent conn structure from being freed */
- l2cap_conn_get(conn);
+ /* prevent conn structure from being freed */
+ conn = l2cap_conn_hold_unless_zero(chan->conn);
l2cap_chan_unlock(chan);
if (conn)
/* mutex lock must be taken before l2cap_chan_lock() */
- mutex_lock(&conn->chan_lock);
+ mutex_lock(&conn->lock);
l2cap_chan_lock(chan);
l2cap_chan_close(chan, 0);
l2cap_chan_unlock(chan);
if (conn) {
- mutex_unlock(&conn->chan_lock);
+ mutex_unlock(&conn->lock);
l2cap_conn_put(conn);
}
@@ -1171,7 +1396,7 @@ static int l2cap_sock_shutdown(struct socket *sock, int how)
err = bt_sock_wait_state(sk, BT_CLOSED,
sk->sk_lingertime);
-has_shutdown:
+shutdown_matched:
l2cap_chan_put(chan);
sock_put(sk);
@@ -1190,18 +1415,31 @@ static int l2cap_sock_release(struct socket *sock)
{
struct sock *sk = sock->sk;
int err;
+ struct l2cap_chan *chan;
BT_DBG("sock %p, sk %p", sock, sk);
if (!sk)
return 0;
+ lock_sock_nested(sk, L2CAP_NESTING_PARENT);
+ l2cap_sock_cleanup_listen(sk);
+ release_sock(sk);
+
bt_sock_unlink(&l2cap_sk_list, sk);
- err = l2cap_sock_shutdown(sock, 2);
+ err = l2cap_sock_shutdown(sock, SHUT_RDWR);
+ chan = l2cap_pi(sk)->chan;
+
+ l2cap_chan_hold(chan);
+ l2cap_chan_lock(chan);
sock_orphan(sk);
l2cap_sock_kill(sk);
+
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
+
return err;
}
@@ -1219,12 +1457,15 @@ static void l2cap_sock_cleanup_listen(struct sock *parent)
BT_DBG("child chan %p state %s", chan,
state_to_string(chan->state));
+ l2cap_chan_hold(chan);
l2cap_chan_lock(chan);
+
__clear_chan_timer(chan);
l2cap_chan_close(chan, ECONNRESET);
- l2cap_chan_unlock(chan);
-
l2cap_sock_kill(sk);
+
+ l2cap_chan_unlock(chan);
+ l2cap_chan_put(chan);
}
}
@@ -1252,7 +1493,7 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan)
l2cap_sock_init(sk, parent);
- bt_accept_enqueue(parent, sk);
+ bt_accept_enqueue(parent, sk, false);
release_sock(parent);
@@ -1261,18 +1502,25 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(struct l2cap_chan *chan)
static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
{
- struct sock *sk = chan->data;
+ struct sock *sk;
+ struct l2cap_pinfo *pi;
int err;
- lock_sock(sk);
+ sk = chan->data;
+ if (!sk)
+ return -ENXIO;
- if (l2cap_pi(sk)->rx_busy_skb) {
+ pi = l2cap_pi(sk);
+ lock_sock(sk);
+ if (chan->mode == L2CAP_MODE_ERTM && !list_empty(&pi->rx_busy)) {
err = -ENOMEM;
goto done;
}
if (chan->mode != L2CAP_MODE_ERTM &&
- chan->mode != L2CAP_MODE_STREAMING) {
+ chan->mode != L2CAP_MODE_STREAMING &&
+ chan->mode != L2CAP_MODE_LE_FLOWCTL &&
+ chan->mode != L2CAP_MODE_EXT_FLOWCTL) {
/* Even if no filter is attached, we could potentially
* get errors from security modules, etc.
*/
@@ -1283,7 +1531,9 @@ static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
err = __sock_queue_rcv_skb(sk, skb);
- /* For ERTM, handle one skb that doesn't fit into the recv
+ l2cap_publish_rx_avail(chan);
+
+ /* For ERTM and LE, handle a skb that doesn't fit into the recv
* buffer. This is important to do because the data frames
* have already been acked, so the skb cannot be discarded.
*
@@ -1292,8 +1542,18 @@ static int l2cap_sock_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
* acked and reassembled until there is buffer space
* available.
*/
- if (err < 0 && chan->mode == L2CAP_MODE_ERTM) {
- l2cap_pi(sk)->rx_busy_skb = skb;
+ if (err < 0 &&
+ (chan->mode == L2CAP_MODE_ERTM ||
+ chan->mode == L2CAP_MODE_LE_FLOWCTL ||
+ chan->mode == L2CAP_MODE_EXT_FLOWCTL)) {
+ struct l2cap_rx_busy *rx_busy =
+ kmalloc(sizeof(*rx_busy), GFP_KERNEL);
+ if (!rx_busy) {
+ err = -ENOMEM;
+ goto done;
+ }
+ rx_busy->skb = skb;
+ list_add_tail(&rx_busy->list, &pi->rx_busy);
l2cap_chan_busy(chan, 1);
err = 0;
}
@@ -1308,6 +1568,9 @@ static void l2cap_sock_close_cb(struct l2cap_chan *chan)
{
struct sock *sk = chan->data;
+ if (!sk)
+ return;
+
l2cap_sock_kill(sk);
}
@@ -1316,6 +1579,9 @@ static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err)
struct sock *sk = chan->data;
struct sock *parent;
+ if (!sk)
+ return;
+
BT_DBG("chan %p state %s", chan, state_to_string(chan->state));
/* This callback can be called both for server (BT_LISTEN)
@@ -1329,8 +1595,6 @@ static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err)
parent = bt_sk(sk)->parent;
- sock_set_flag(sk, SOCK_ZAPPED);
-
switch (chan->state) {
case BT_OPEN:
case BT_BOUND:
@@ -1357,8 +1621,11 @@ static void l2cap_sock_teardown_cb(struct l2cap_chan *chan, int err)
break;
}
-
release_sock(sk);
+
+ /* Only zap after cleanup to avoid use after free race */
+ sock_set_flag(sk, SOCK_ZAPPED);
+
}
static void l2cap_sock_state_change_cb(struct l2cap_chan *chan, int state,
@@ -1387,7 +1654,15 @@ static struct sk_buff *l2cap_sock_alloc_skb_cb(struct l2cap_chan *chan,
if (!skb)
return ERR_PTR(err);
- skb->priority = sk->sk_priority;
+ /* Channel lock is released before requesting new skb and then
+ * reacquired thus we need to recheck channel state.
+ */
+ if (chan->state != BT_CONNECTED) {
+ kfree_skb(skb);
+ return ERR_PTR(-ENOTCONN);
+ }
+
+ skb->priority = READ_ONCE(sk->sk_priority);
bt_cb(skb)->l2cap.chan = chan;
@@ -1431,6 +1706,9 @@ static void l2cap_sock_resume_cb(struct l2cap_chan *chan)
{
struct sock *sk = chan->data;
+ if (!sk)
+ return;
+
if (test_and_clear_bit(FLAG_PENDING_SECURITY, &chan->flags)) {
sk->sk_state = BT_CONNECTED;
chan->state = BT_CONNECTED;
@@ -1453,7 +1731,14 @@ static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan)
{
struct sock *sk = chan->data;
- return sk->sk_sndtimeo;
+ return READ_ONCE(sk->sk_sndtimeo);
+}
+
+/* get_peer_pid channel op: return the sk_peer_pid stored on the socket
+ * that chan->data points to.
+ */
+static struct pid *l2cap_sock_get_peer_pid_cb(struct l2cap_chan *chan)
+{
+ struct sock *sk = chan->data;
+
+ return sk->sk_peer_pid;
}
static void l2cap_sock_suspend_cb(struct l2cap_chan *chan)
@@ -1464,6 +1749,19 @@ static void l2cap_sock_suspend_cb(struct l2cap_chan *chan)
sk->sk_state_change(sk);
}
+/* filter channel op: run sk_filter() on @skb for ERTM and streaming mode
+ * channels and return its result; every other mode returns 0 without
+ * filtering.
+ */
+static int l2cap_sock_filter(struct l2cap_chan *chan, struct sk_buff *skb)
+{
+	struct sock *sk = chan->data;
+	bool needs_filter = chan->mode == L2CAP_MODE_ERTM ||
+			    chan->mode == L2CAP_MODE_STREAMING;
+
+	return needs_filter ? sk_filter(sk, skb) : 0;
+}
+
static const struct l2cap_ops l2cap_chan_ops = {
.name = "L2CAP Socket Interface",
.new_connection = l2cap_sock_new_connection_cb,
@@ -1477,19 +1775,26 @@ static const struct l2cap_ops l2cap_chan_ops = {
.suspend = l2cap_sock_suspend_cb,
.set_shutdown = l2cap_sock_set_shutdown_cb,
.get_sndtimeo = l2cap_sock_get_sndtimeo_cb,
+ .get_peer_pid = l2cap_sock_get_peer_pid_cb,
.alloc_skb = l2cap_sock_alloc_skb_cb,
+ .filter = l2cap_sock_filter,
};
static void l2cap_sock_destruct(struct sock *sk)
{
+ struct l2cap_rx_busy *rx_busy, *next;
+
BT_DBG("sk %p", sk);
- if (l2cap_pi(sk)->chan)
+ if (l2cap_pi(sk)->chan) {
+ l2cap_pi(sk)->chan->data = NULL;
l2cap_chan_put(l2cap_pi(sk)->chan);
+ }
- if (l2cap_pi(sk)->rx_busy_skb) {
- kfree_skb(l2cap_pi(sk)->rx_busy_skb);
- l2cap_pi(sk)->rx_busy_skb = NULL;
+ list_for_each_entry_safe(rx_busy, next, &l2cap_pi(sk)->rx_busy, list) {
+ kfree_skb(rx_busy->skb);
+ list_del(&rx_busy->list);
+ kfree(rx_busy);
}
skb_queue_purge(&sk->sk_receive_queue);
@@ -1573,6 +1878,8 @@ static void l2cap_sock_init(struct sock *sk, struct sock *parent)
chan->data = sk;
chan->ops = &l2cap_chan_ops;
+
+ l2cap_publish_rx_avail(chan);
}
static struct proto l2cap_proto = {
@@ -1587,24 +1894,20 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock,
struct sock *sk;
struct l2cap_chan *chan;
- sk = sk_alloc(net, PF_BLUETOOTH, prio, &l2cap_proto, kern);
+ sk = bt_sock_alloc(net, sock, &l2cap_proto, proto, prio, kern);
if (!sk)
return NULL;
- sock_init_data(sock, sk);
- INIT_LIST_HEAD(&bt_sk(sk)->accept_q);
-
sk->sk_destruct = l2cap_sock_destruct;
sk->sk_sndtimeo = L2CAP_CONN_TIMEOUT;
- sock_reset_flag(sk, SOCK_ZAPPED);
-
- sk->sk_protocol = proto;
- sk->sk_state = BT_OPEN;
+ INIT_LIST_HEAD(&l2cap_pi(sk)->rx_busy);
chan = l2cap_chan_create();
if (!chan) {
sk_free(sk);
+ if (sock)
+ sock->sk = NULL;
return NULL;
}
@@ -1655,6 +1958,7 @@ static const struct proto_ops l2cap_sock_ops = {
.recvmsg = l2cap_sock_recvmsg,
.poll = bt_sock_poll,
.ioctl = bt_sock_ioctl,
+ .gettstamp = sock_gettstamp,
.mmap = sock_no_mmap,
.socketpair = sock_no_socketpair,
.shutdown = l2cap_sock_shutdown,
diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c
index 6d59a5023231..6e349704efe4 100644
--- a/net/bluetooth/leds.c
+++ b/net/bluetooth/leds.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <net/bluetooth/bluetooth.h>
@@ -51,7 +48,7 @@ static int power_activate(struct led_classdev *led_cdev)
htrig = to_hci_basic_led_trigger(led_cdev->trigger);
powered = test_bit(HCI_UP, &htrig->hdev->flags);
- led_trigger_event(led_cdev->trigger, powered ? LED_FULL : LED_OFF);
+ led_set_brightness(led_cdev, powered ? LED_FULL : LED_OFF);
return 0;
}
diff --git a/net/bluetooth/leds.h b/net/bluetooth/leds.h
index 08725a2fbd9b..bb5e09204436 100644
--- a/net/bluetooth/leds.h
+++ b/net/bluetooth/leds.h
@@ -1,9 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#if IS_ENABLED(CONFIG_BT_LEDS)
diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c
index 63e65d9b4b24..305044a84478 100644
--- a/net/bluetooth/lib.c
+++ b/net/bluetooth/lib.c
@@ -30,6 +30,15 @@
#include <net/bluetooth/bluetooth.h>
+/**
+ * baswap() - Swaps the order of a bd address
+ * @dst: Pointer to a bdaddr_t struct that will store the swapped
+ * bd address.
+ * @src: Pointer to the bdaddr_t struct to be swapped.
+ *
+ * This function reverses the byte order of a Bluetooth device
+ * address.
+ */
void baswap(bdaddr_t *dst, const bdaddr_t *src)
{
const unsigned char *s = (const unsigned char *)src;
@@ -41,7 +50,19 @@ void baswap(bdaddr_t *dst, const bdaddr_t *src)
}
EXPORT_SYMBOL(baswap);
-/* Bluetooth error codes to Unix errno mapping */
+/**
+ * bt_to_errno() - Bluetooth error codes to standard errno
+ * @code: Bluetooth error code to be converted
+ *
+ * This function takes a Bluetooth error code as input and converts
+ * it to an equivalent Unix/standard errno value.
+ *
+ * Return:
+ *
+ * If the bt error code is known, an equivalent Unix errno value
+ * is returned.
+ * If the given bt error code is not known, ENOSYS is returned.
+ */
int bt_to_errno(__u16 code)
{
switch (code) {
@@ -135,6 +156,93 @@ int bt_to_errno(__u16 code)
}
EXPORT_SYMBOL(bt_to_errno);
+/**
+ * bt_status() - Map a negative errno onto a Bluetooth status code
+ * @err: errno-style value; only negative values are translated
+ *
+ * Return: Bluetooth status code matching @err.
+ *
+ * A zero or positive @err is handed back as-is (narrowed to __u8) without
+ * any lookup. A negative @err that has no dedicated mapping yields 0x1f,
+ * the unspecified error code.
+ */
+__u8 bt_status(int err)
+{
+	__u8 status;
+
+	if (err >= 0)
+		return err;
+
+	switch (err) {
+	case -EBADRQC:
+		status = 0x01;
+		break;
+	case -ENOTCONN:
+		status = 0x02;
+		break;
+	case -EIO:
+		status = 0x03;
+		break;
+	case -EHOSTDOWN:
+		status = 0x04;
+		break;
+	case -EACCES:
+		status = 0x05;
+		break;
+	case -EBADE:
+		status = 0x06;
+		break;
+	case -ENOMEM:
+		status = 0x07;
+		break;
+	case -ETIMEDOUT:
+		status = 0x08;
+		break;
+	case -EMLINK:
+		status = 0x09;
+		break;
+	case -EALREADY:
+		status = 0x0b;
+		break;
+	case -EBUSY:
+		status = 0x0c;
+		break;
+	case -ECONNREFUSED:
+		status = 0x0d;
+		break;
+	case -EOPNOTSUPP:
+		status = 0x11;
+		break;
+	case -EINVAL:
+		status = 0x12;
+		break;
+	case -ECONNRESET:
+		status = 0x13;
+		break;
+	case -ECONNABORTED:
+		status = 0x16;
+		break;
+	case -ELOOP:
+		status = 0x17;
+		break;
+	case -EPROTO:
+		status = 0x19;
+		break;
+	case -EPROTONOSUPPORT:
+		status = 0x1a;
+		break;
+	default:
+		/* No dedicated mapping: report an unspecified error */
+		status = 0x1f;
+		break;
+	}
+
+	return status;
+}
+EXPORT_SYMBOL(bt_status);
+
+/**
+ * bt_info() - Log Bluetooth information message
+ * @format: Message's format string
+ */
void bt_info(const char *format, ...)
{
struct va_format vaf;
@@ -151,6 +259,10 @@ void bt_info(const char *format, ...)
}
EXPORT_SYMBOL(bt_info);
+/**
+ * bt_warn() - Log Bluetooth warning message
+ * @format: Message's format string
+ */
void bt_warn(const char *format, ...)
{
struct va_format vaf;
@@ -167,6 +279,10 @@ void bt_warn(const char *format, ...)
}
EXPORT_SYMBOL(bt_warn);
+/**
+ * bt_err() - Log Bluetooth error message
+ * @format: Message's format string
+ */
void bt_err(const char *format, ...)
{
struct va_format vaf;
@@ -183,6 +299,73 @@ void bt_err(const char *format, ...)
}
EXPORT_SYMBOL(bt_err);
+#ifdef CONFIG_BT_FEATURE_DEBUG
+static bool debug_enable;
+
+/* Store @enable in the file-local debug_enable flag that bt_dbg() checks
+ * before logging.
+ */
+void bt_dbg_set(bool enable)
+{
+ debug_enable = enable;
+}
+
+/* Return the current value of the file-local debug_enable flag. */
+bool bt_dbg_get(void)
+{
+ return debug_enable;
+}
+
+/**
+ * bt_dbg() - Log a Bluetooth debugging message
+ * @format: printf-style format string, followed by its arguments
+ *
+ * The message is printed at KERN_DEBUG level. Nothing is logged while the
+ * debug_enable flag is clear (see bt_dbg_set()).
+ */
+void bt_dbg(const char *format, ...)
+{
+	va_list ap;
+	struct va_format msg = {
+		.fmt = format,
+		.va = &ap,
+	};
+
+	if (likely(!debug_enable))
+		return;
+
+	va_start(ap, format);
+	printk(KERN_DEBUG pr_fmt("%pV"), &msg);
+	va_end(ap);
+}
+EXPORT_SYMBOL(bt_dbg);
+#endif
+
+/**
+ * bt_warn_ratelimited() - Log a rate-limited Bluetooth warning message
+ * @format: printf-style format string, followed by its arguments
+ *
+ * This function works like bt_warn(), except that the message goes through
+ * pr_warn_ratelimited() so it cannot be logged too often.
+ */
+void bt_warn_ratelimited(const char *format, ...)
+{
+	va_list ap;
+	struct va_format msg = {
+		.fmt = format,
+		.va = &ap,
+	};
+
+	va_start(ap, format);
+	pr_warn_ratelimited("%pV", &msg);
+	va_end(ap);
+}
+EXPORT_SYMBOL(bt_warn_ratelimited);
+
+/**
+ * bt_err_ratelimited() - Log rate-limited Bluetooth error message
+ * @format: Message's format string
+ *
+ * This function works like bt_err, but it uses rate limiting
+ * to prevent the message from being logged too often.
+ */
void bt_err_ratelimited(const char *format, ...)
{
struct va_format vaf;
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index ccce954f8146..c11cdef42b6f 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -25,7 +25,7 @@
/* Bluetooth HCI Management interface */
#include <linux/module.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
@@ -33,12 +33,15 @@
#include <net/bluetooth/l2cap.h>
#include <net/bluetooth/mgmt.h>
-#include "hci_request.h"
#include "smp.h"
#include "mgmt_util.h"
+#include "mgmt_config.h"
+#include "msft.h"
+#include "eir.h"
+#include "aosp.h"
#define MGMT_VERSION 1
-#define MGMT_REVISION 14
+#define MGMT_REVISION 23
static const u16 mgmt_commands[] = {
MGMT_OP_READ_INDEX_LIST,
@@ -106,6 +109,30 @@ static const u16 mgmt_commands[] = {
MGMT_OP_START_LIMITED_DISCOVERY,
MGMT_OP_READ_EXT_INFO,
MGMT_OP_SET_APPEARANCE,
+ MGMT_OP_GET_PHY_CONFIGURATION,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ MGMT_OP_SET_BLOCKED_KEYS,
+ MGMT_OP_SET_WIDEBAND_SPEECH,
+ MGMT_OP_READ_CONTROLLER_CAP,
+ MGMT_OP_READ_EXP_FEATURES_INFO,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_OP_READ_DEF_SYSTEM_CONFIG,
+ MGMT_OP_SET_DEF_SYSTEM_CONFIG,
+ MGMT_OP_READ_DEF_RUNTIME_CONFIG,
+ MGMT_OP_SET_DEF_RUNTIME_CONFIG,
+ MGMT_OP_GET_DEVICE_FLAGS,
+ MGMT_OP_SET_DEVICE_FLAGS,
+ MGMT_OP_READ_ADV_MONITOR_FEATURES,
+ MGMT_OP_ADD_ADV_PATTERNS_MONITOR,
+ MGMT_OP_REMOVE_ADV_MONITOR,
+ MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI,
+ MGMT_OP_SET_MESH_RECEIVER,
+ MGMT_OP_MESH_READ_FEATURES,
+ MGMT_OP_MESH_SEND,
+ MGMT_OP_MESH_SEND_CANCEL,
+ MGMT_OP_HCI_CMD_SYNC,
};
static const u16 mgmt_events[] = {
@@ -144,6 +171,15 @@ static const u16 mgmt_events[] = {
MGMT_EV_ADVERTISING_ADDED,
MGMT_EV_ADVERTISING_REMOVED,
MGMT_EV_EXT_INFO_CHANGED,
+ MGMT_EV_PHY_CONFIGURATION_CHANGED,
+ MGMT_EV_EXP_FEATURE_CHANGED,
+ MGMT_EV_DEVICE_FLAGS_CHANGED,
+ MGMT_EV_ADV_MONITOR_ADDED,
+ MGMT_EV_ADV_MONITOR_REMOVED,
+ MGMT_EV_CONTROLLER_SUSPEND,
+ MGMT_EV_CONTROLLER_RESUME,
+ MGMT_EV_ADV_MONITOR_DEVICE_FOUND,
+ MGMT_EV_ADV_MONITOR_DEVICE_LOST,
};
static const u16 mgmt_untrusted_commands[] = {
@@ -153,6 +189,10 @@ static const u16 mgmt_untrusted_commands[] = {
MGMT_OP_READ_CONFIG_INFO,
MGMT_OP_READ_EXT_INDEX_LIST,
MGMT_OP_READ_EXT_INFO,
+ MGMT_OP_READ_CONTROLLER_CAP,
+ MGMT_OP_READ_EXP_FEATURES_INFO,
+ MGMT_OP_READ_DEF_SYSTEM_CONFIG,
+ MGMT_OP_READ_DEF_RUNTIME_CONFIG,
};
static const u16 mgmt_untrusted_events[] = {
@@ -167,15 +207,16 @@ static const u16 mgmt_untrusted_events[] = {
MGMT_EV_EXT_INDEX_ADDED,
MGMT_EV_EXT_INDEX_REMOVED,
MGMT_EV_EXT_INFO_CHANGED,
+ MGMT_EV_EXP_FEATURE_CHANGED,
};
-#define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000)
+#define CACHE_TIMEOUT secs_to_jiffies(2)
#define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \
"\x00\x00\x00\x00\x00\x00\x00\x00"
/* HCI to MGMT error code conversion table */
-static u8 mgmt_status_table[] = {
+static const u8 mgmt_status_table[] = {
MGMT_STATUS_SUCCESS,
MGMT_STATUS_UNKNOWN_COMMAND, /* Unknown Command */
MGMT_STATUS_NOT_CONNECTED, /* No Connection */
@@ -219,12 +260,15 @@ static u8 mgmt_status_table[] = {
MGMT_STATUS_TIMEOUT, /* Instant Passed */
MGMT_STATUS_NOT_SUPPORTED, /* Pairing Not Supported */
MGMT_STATUS_FAILED, /* Transaction Collision */
+ MGMT_STATUS_FAILED, /* Reserved for future use */
MGMT_STATUS_INVALID_PARAMS, /* Unacceptable Parameter */
MGMT_STATUS_REJECTED, /* QoS Rejected */
MGMT_STATUS_NOT_SUPPORTED, /* Classification Not Supported */
MGMT_STATUS_REJECTED, /* Insufficient Security */
MGMT_STATUS_INVALID_PARAMS, /* Parameter Out Of Range */
+ MGMT_STATUS_FAILED, /* Reserved for future use */
MGMT_STATUS_BUSY, /* Role Switch Pending */
+ MGMT_STATUS_FAILED, /* Reserved for future use */
MGMT_STATUS_FAILED, /* Slot Violation */
MGMT_STATUS_FAILED, /* Role Switch Failed */
MGMT_STATUS_INVALID_PARAMS, /* EIR Too Large */
@@ -239,10 +283,39 @@ static u8 mgmt_status_table[] = {
MGMT_STATUS_CONNECT_FAILED, /* MAC Connection Failed */
};
-static u8 mgmt_status(u8 hci_status)
+static u8 mgmt_errno_status(int err)
{
- if (hci_status < ARRAY_SIZE(mgmt_status_table))
- return mgmt_status_table[hci_status];
+ switch (err) {
+ case 0:
+ return MGMT_STATUS_SUCCESS;
+ case -EPERM:
+ return MGMT_STATUS_REJECTED;
+ case -EINVAL:
+ return MGMT_STATUS_INVALID_PARAMS;
+ case -EOPNOTSUPP:
+ return MGMT_STATUS_NOT_SUPPORTED;
+ case -EBUSY:
+ return MGMT_STATUS_BUSY;
+ case -ETIMEDOUT:
+ return MGMT_STATUS_AUTH_FAILED;
+ case -ENOMEM:
+ return MGMT_STATUS_NO_RESOURCES;
+ case -EISCONN:
+ return MGMT_STATUS_ALREADY_CONNECTED;
+ case -ENOTCONN:
+ return MGMT_STATUS_DISCONNECTED;
+ }
+
+ return MGMT_STATUS_FAILED;
+}
+
+static u8 mgmt_status(int err)
+{
+ if (err < 0)
+ return mgmt_errno_status(err);
+
+ if (err < ARRAY_SIZE(mgmt_status_table))
+ return mgmt_status_table[err];
return MGMT_STATUS_FAILED;
}
@@ -268,6 +341,12 @@ static int mgmt_event(u16 event, struct hci_dev *hdev, void *data, u16 len,
HCI_SOCK_TRUSTED, skip_sk);
}
+/* Hand a pre-built event skb to mgmt_send_event_skb() on
+ * HCI_CHANNEL_CONTROL with HCI_SOCK_TRUSTED as the socket flag.
+ * @skip_sk is forwarded unchanged and the callee's return value is
+ * passed straight back to the caller.
+ */
+static int mgmt_event_skb(struct sk_buff *skb, struct sock *skip_sk)
+{
+	return mgmt_send_event_skb(HCI_CHANNEL_CONTROL, skb,
+				   HCI_SOCK_TRUSTED, skip_sk);
+}
+
static u8 le_addr_type(u8 mgmt_addr_type)
{
if (mgmt_addr_type == BDADDR_LE_PUBLIC)
@@ -289,7 +368,7 @@ static int read_version(struct sock *sk, struct hci_dev *hdev, void *data,
{
struct mgmt_rp_read_version rp;
- BT_DBG("sock %p", sk);
+ bt_dev_dbg(hdev, "sock %p", sk);
mgmt_fill_version_info(&rp);
@@ -305,7 +384,7 @@ static int read_commands(struct sock *sk, struct hci_dev *hdev, void *data,
size_t rp_size;
int i, err;
- BT_DBG("sock %p", sk);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (hci_sock_test_flag(sk, HCI_SOCK_TRUSTED)) {
num_commands = ARRAY_SIZE(mgmt_commands);
@@ -358,14 +437,13 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data,
u16 count;
int err;
- BT_DBG("sock %p", sk);
+ bt_dev_dbg(hdev, "sock %p", sk);
read_lock(&hci_dev_list_lock);
count = 0;
list_for_each_entry(d, &hci_dev_list, list) {
- if (d->dev_type == HCI_PRIMARY &&
- !hci_dev_test_flag(d, HCI_UNCONFIGURED))
+ if (!hci_dev_test_flag(d, HCI_UNCONFIGURED))
count++;
}
@@ -386,13 +464,12 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data,
/* Devices marked as raw-only are neither configured
* nor unconfigured controllers.
*/
- if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
+ if (hci_test_quirk(d, HCI_QUIRK_RAW_DEVICE))
continue;
- if (d->dev_type == HCI_PRIMARY &&
- !hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
+ if (!hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
rp->index[count++] = cpu_to_le16(d->id);
- BT_DBG("Added hci%u", d->id);
+ bt_dev_dbg(hdev, "Added hci%u", d->id);
}
}
@@ -418,14 +495,13 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev,
u16 count;
int err;
- BT_DBG("sock %p", sk);
+ bt_dev_dbg(hdev, "sock %p", sk);
read_lock(&hci_dev_list_lock);
count = 0;
list_for_each_entry(d, &hci_dev_list, list) {
- if (d->dev_type == HCI_PRIMARY &&
- hci_dev_test_flag(d, HCI_UNCONFIGURED))
+ if (hci_dev_test_flag(d, HCI_UNCONFIGURED))
count++;
}
@@ -446,13 +522,12 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev,
/* Devices marked as raw-only are neither configured
* nor unconfigured controllers.
*/
- if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
+ if (hci_test_quirk(d, HCI_QUIRK_RAW_DEVICE))
continue;
- if (d->dev_type == HCI_PRIMARY &&
- hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
+ if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) {
rp->index[count++] = cpu_to_le16(d->id);
- BT_DBG("Added hci%u", d->id);
+ bt_dev_dbg(hdev, "Added hci%u", d->id);
}
}
@@ -474,22 +549,18 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev,
{
struct mgmt_rp_read_ext_index_list *rp;
struct hci_dev *d;
- size_t rp_len;
u16 count;
int err;
- BT_DBG("sock %p", sk);
+ bt_dev_dbg(hdev, "sock %p", sk);
read_lock(&hci_dev_list_lock);
count = 0;
- list_for_each_entry(d, &hci_dev_list, list) {
- if (d->dev_type == HCI_PRIMARY || d->dev_type == HCI_AMP)
- count++;
- }
+ list_for_each_entry(d, &hci_dev_list, list)
+ count++;
- rp_len = sizeof(*rp) + (sizeof(rp->entry[0]) * count);
- rp = kmalloc(rp_len, GFP_ATOMIC);
+ rp = kmalloc(struct_size(rp, entry, count), GFP_ATOMIC);
if (!rp) {
read_unlock(&hci_dev_list_lock);
return -ENOMEM;
@@ -505,27 +576,20 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev,
/* Devices marked as raw-only are neither configured
* nor unconfigured controllers.
*/
- if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks))
+ if (hci_test_quirk(d, HCI_QUIRK_RAW_DEVICE))
continue;
- if (d->dev_type == HCI_PRIMARY) {
- if (hci_dev_test_flag(d, HCI_UNCONFIGURED))
- rp->entry[count].type = 0x01;
- else
- rp->entry[count].type = 0x00;
- } else if (d->dev_type == HCI_AMP) {
- rp->entry[count].type = 0x02;
- } else {
- continue;
- }
+ if (hci_dev_test_flag(d, HCI_UNCONFIGURED))
+ rp->entry[count].type = 0x01;
+ else
+ rp->entry[count].type = 0x00;
rp->entry[count].bus = d->bus;
rp->entry[count++].index = cpu_to_le16(d->id);
- BT_DBG("Added hci%u", d->id);
+ bt_dev_dbg(hdev, "Added hci%u", d->id);
}
rp->num_controllers = cpu_to_le16(count);
- rp_len = sizeof(*rp) + (sizeof(rp->entry[0]) * count);
read_unlock(&hci_dev_list_lock);
@@ -538,7 +602,8 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev,
hci_sock_clear_flag(sk, HCI_MGMT_UNCONF_INDEX_EVENTS);
err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE,
- MGMT_OP_READ_EXT_INDEX_LIST, 0, rp, rp_len);
+ MGMT_OP_READ_EXT_INDEX_LIST, 0, rp,
+ struct_size(rp, entry, count));
kfree(rp);
@@ -547,11 +612,12 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev,
static bool is_configured(struct hci_dev *hdev)
{
- if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) &&
+ if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) &&
!hci_dev_test_flag(hdev, HCI_EXT_CONFIGURED))
return false;
- if (test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) &&
+ if ((hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) ||
+ hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY)) &&
!bacmp(&hdev->public_addr, BDADDR_ANY))
return false;
@@ -562,11 +628,12 @@ static __le32 get_missing_options(struct hci_dev *hdev)
{
u32 options = 0;
- if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) &&
+ if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) &&
!hci_dev_test_flag(hdev, HCI_EXT_CONFIGURED))
options |= MGMT_OPTION_EXTERNAL_CONFIG;
- if (test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) &&
+ if ((hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) ||
+ hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY)) &&
!bacmp(&hdev->public_addr, BDADDR_ANY))
options |= MGMT_OPTION_PUBLIC_ADDRESS;
@@ -595,14 +662,14 @@ static int read_config_info(struct sock *sk, struct hci_dev *hdev,
struct mgmt_rp_read_config_info rp;
u32 options = 0;
- BT_DBG("sock %p %s", sk, hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
hci_dev_lock(hdev);
memset(&rp, 0, sizeof(rp));
rp.manufacturer = cpu_to_le16(hdev->manufacturer);
- if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks))
+ if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG))
options |= MGMT_OPTION_EXTERNAL_CONFIG;
if (hdev->set_bdaddr)
@@ -756,25 +823,41 @@ static u32 get_supported_settings(struct hci_dev *hdev)
if (lmp_ssp_capable(hdev)) {
settings |= MGMT_SETTING_SSP;
- settings |= MGMT_SETTING_HS;
}
if (lmp_sc_capable(hdev))
settings |= MGMT_SETTING_SECURE_CONN;
+
+ if (hci_test_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED))
+ settings |= MGMT_SETTING_WIDEBAND_SPEECH;
}
if (lmp_le_capable(hdev)) {
settings |= MGMT_SETTING_LE;
- settings |= MGMT_SETTING_ADVERTISING;
settings |= MGMT_SETTING_SECURE_CONN;
settings |= MGMT_SETTING_PRIVACY;
settings |= MGMT_SETTING_STATIC_ADDRESS;
+ settings |= MGMT_SETTING_ADVERTISING;
}
- if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) ||
- hdev->set_bdaddr)
+ if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) || hdev->set_bdaddr)
settings |= MGMT_SETTING_CONFIGURATION;
+ if (cis_central_capable(hdev))
+ settings |= MGMT_SETTING_CIS_CENTRAL;
+
+ if (cis_peripheral_capable(hdev))
+ settings |= MGMT_SETTING_CIS_PERIPHERAL;
+
+ if (ll_privacy_capable(hdev))
+ settings |= MGMT_SETTING_LL_PRIVACY;
+
+ if (past_sender_capable(hdev))
+ settings |= MGMT_SETTING_PAST_SENDER;
+
+ if (past_receiver_capable(hdev))
+ settings |= MGMT_SETTING_PAST_RECEIVER;
+
settings |= MGMT_SETTING_PHY_CONFIGURATION;
return settings;
@@ -811,9 +894,6 @@ static u32 get_current_settings(struct hci_dev *hdev)
if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
settings |= MGMT_SETTING_SSP;
- if (hci_dev_test_flag(hdev, HCI_HS_ENABLED))
- settings |= MGMT_SETTING_HS;
-
if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
settings |= MGMT_SETTING_ADVERTISING;
@@ -845,6 +925,30 @@ static u32 get_current_settings(struct hci_dev *hdev)
settings |= MGMT_SETTING_STATIC_ADDRESS;
}
+ if (hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED))
+ settings |= MGMT_SETTING_WIDEBAND_SPEECH;
+
+ if (cis_central_enabled(hdev))
+ settings |= MGMT_SETTING_CIS_CENTRAL;
+
+ if (cis_peripheral_enabled(hdev))
+ settings |= MGMT_SETTING_CIS_PERIPHERAL;
+
+ if (bis_enabled(hdev))
+ settings |= MGMT_SETTING_ISO_BROADCASTER;
+
+ if (sync_recv_enabled(hdev))
+ settings |= MGMT_SETTING_ISO_SYNC_RECEIVER;
+
+ if (ll_privacy_enabled(hdev))
+ settings |= MGMT_SETTING_LL_PRIVACY;
+
+ if (past_sender_enabled(hdev))
+ settings |= MGMT_SETTING_PAST_SENDER;
+
+ if (past_receiver_enabled(hdev))
+ settings |= MGMT_SETTING_PAST_RECEIVER;
+
return settings;
}
@@ -853,13 +957,6 @@ static struct mgmt_pending_cmd *pending_find(u16 opcode, struct hci_dev *hdev)
return mgmt_pending_find(HCI_CHANNEL_CONTROL, opcode, hdev);
}
-static struct mgmt_pending_cmd *pending_find_data(u16 opcode,
- struct hci_dev *hdev,
- const void *data)
-{
- return mgmt_pending_find_data(HCI_CHANNEL_CONTROL, opcode, hdev, data);
-}
-
u8 mgmt_get_adv_discov_flags(struct hci_dev *hdev)
{
struct mgmt_pending_cmd *cmd;
@@ -901,59 +998,148 @@ bool mgmt_get_connectable(struct hci_dev *hdev)
return hci_dev_test_flag(hdev, HCI_CONNECTABLE);
}
+/* hci_cmd_sync callback queued by service_cache_off().
+ *
+ * Runs hci_update_eir_sync() followed by hci_update_class_sync(). The
+ * results of both calls are ignored and 0 is always returned; @data is
+ * unused (the caller queues this with NULL).
+ */
+static int service_cache_sync(struct hci_dev *hdev, void *data)
+{
+	hci_update_eir_sync(hdev);
+	hci_update_class_sync(hdev);
+
+	return 0;
+}
+
static void service_cache_off(struct work_struct *work)
{
struct hci_dev *hdev = container_of(work, struct hci_dev,
service_cache.work);
- struct hci_request req;
if (!hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE))
return;
- hci_req_init(&req, hdev);
-
- hci_dev_lock(hdev);
-
- __hci_req_update_eir(&req);
- __hci_req_update_class(&req);
-
- hci_dev_unlock(hdev);
+ hci_cmd_sync_queue(hdev, service_cache_sync, NULL, NULL);
+}
- hci_req_run(&req, NULL);
+/* hci_cmd_sync callback queued by rpa_expired(); @data is unused.
+ *
+ * Restarts advertising through the extended-advertising path when the
+ * controller is ext_adv_capable(), otherwise through the legacy path,
+ * and returns the result of whichever helper was called.
+ *
+ * NOTE(review): this code previously carried a comment saying the new
+ * RPA is generated and programmed by hci_req_enable_advertising().
+ * That function is not called here any more; presumably the *_sync
+ * helpers below now take care of it - confirm.
+ */
+static int rpa_expired_sync(struct hci_dev *hdev, void *data)
+{
+	if (!ext_adv_capable(hdev))
+		return hci_enable_advertising_sync(hdev);
+
+	return hci_start_ext_adv_sync(hdev, hdev->cur_adv_instance);
}
static void rpa_expired(struct work_struct *work)
{
struct hci_dev *hdev = container_of(work, struct hci_dev,
rpa_expired.work);
- struct hci_request req;
- BT_DBG("");
+ bt_dev_dbg(hdev, "");
hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
if (!hci_dev_test_flag(hdev, HCI_ADVERTISING))
return;
- /* The generation of a new RPA and programming it into the
- * controller happens in the hci_req_enable_advertising()
- * function.
+ hci_cmd_sync_queue(hdev, rpa_expired_sync, NULL, NULL);
+}
+
+static int set_discoverable_sync(struct hci_dev *hdev, void *data);
+
+static void discov_off(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ discov_off.work);
+
+ bt_dev_dbg(hdev, "");
+
+ hci_dev_lock(hdev);
+
+ /* When discoverable timeout triggers, then just make sure
+ * the limited discoverable flag is cleared. Even in the case
+ * of a timeout triggered from general discoverable, it is
+ * safe to unconditionally clear the flag.
*/
- hci_req_init(&req, hdev);
- if (ext_adv_capable(hdev))
- __hci_req_start_ext_adv(&req, hdev->cur_adv_instance);
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+ hdev->discov_timeout = 0;
+
+ hci_cmd_sync_queue(hdev, set_discoverable_sync, NULL, NULL);
+
+ mgmt_new_settings(hdev);
+
+ hci_dev_unlock(hdev);
+}
+
+static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev);
+
+/* Finish a mesh transmission.
+ *
+ * Unless @silent is set, emits MGMT_EV_MESH_PACKET_CMPLT carrying the
+ * one-byte handle of @mesh_tx. In either case @mesh_tx is then handed
+ * to mgmt_mesh_remove(), so the caller must not touch it afterwards.
+ */
+static void mesh_send_complete(struct hci_dev *hdev,
+			       struct mgmt_mesh_tx *mesh_tx, bool silent)
+{
+	/* Local copy: the event payload is the handle byte itself. */
+	u8 tx_handle = mesh_tx->handle;
+
+	if (!silent) {
+		mgmt_event(MGMT_EV_MESH_PACKET_CMPLT, hdev, &tx_handle,
+			   sizeof(tx_handle), NULL);
+	}
+
+	mgmt_mesh_remove(mesh_tx);
+}
+
+/* hci_cmd_sync callback queued by mesh_send_done(); @data is unused.
+ *
+ * Clears HCI_MESH_SENDING, disables advertising when no advertising
+ * instances are registered, and then completes (non-silently) the
+ * entry returned by mgmt_mesh_next(), if there is one. Always returns
+ * 0; the result of hci_disable_advertising_sync() is not checked.
+ */
+static int mesh_send_done_sync(struct hci_dev *hdev, void *data)
+{
+	struct mgmt_mesh_tx *done_tx;
+
+	hci_dev_clear_flag(hdev, HCI_MESH_SENDING);
+
+	if (list_empty(&hdev->adv_instances))
+		hci_disable_advertising_sync(hdev);
+
+	/* Looked up only after advertising has been dealt with. */
+	done_tx = mgmt_mesh_next(hdev, NULL);
+	if (done_tx)
+		mesh_send_complete(hdev, done_tx, false);
+
+	return 0;
+}
+
+static int mesh_send_sync(struct hci_dev *hdev, void *data);
+static void mesh_send_start_complete(struct hci_dev *hdev, void *data, int err);
+static void mesh_next(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_mesh_tx *mesh_tx = mgmt_mesh_next(hdev, NULL);
+
+ if (!mesh_tx)
+ return;
+
+ err = hci_cmd_sync_queue(hdev, mesh_send_sync, mesh_tx,
+ mesh_send_start_complete);
+
+ if (err < 0)
+ mesh_send_complete(hdev, mesh_tx, false);
else
- __hci_req_enable_advertising(&req);
- hci_req_run(&req, NULL);
+ hci_dev_set_flag(hdev, HCI_MESH_SENDING);
+}
+
+/* Delayed-work handler for hdev->mesh_send_done.
+ *
+ * Does nothing unless HCI_MESH_SENDING is set; otherwise queues
+ * mesh_send_done_sync() with mesh_next() as its completion callback.
+ * The return value of hci_cmd_sync_queue() is not checked.
+ */
+static void mesh_send_done(struct work_struct *work)
+{
+	struct hci_dev *hdev;
+
+	hdev = container_of(work, struct hci_dev, mesh_send_done.work);
+
+	if (hci_dev_test_flag(hdev, HCI_MESH_SENDING))
+		hci_cmd_sync_queue(hdev, mesh_send_done_sync, NULL,
+				   mesh_next);
}
static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev)
{
- if (hci_dev_test_and_set_flag(hdev, HCI_MGMT))
+ if (hci_dev_test_flag(hdev, HCI_MGMT))
return;
+ BT_INFO("MGMT ver %d.%d", MGMT_VERSION, MGMT_REVISION);
+
+ INIT_DELAYED_WORK(&hdev->discov_off, discov_off);
INIT_DELAYED_WORK(&hdev->service_cache, service_cache_off);
INIT_DELAYED_WORK(&hdev->rpa_expired, rpa_expired);
+ INIT_DELAYED_WORK(&hdev->mesh_send_done, mesh_send_done);
/* Non-mgmt controlled devices get this bit set
* implicitly so that pairing works for them, however
@@ -961,6 +1147,8 @@ static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev)
* it
*/
hci_dev_clear_flag(hdev, HCI_BONDABLE);
+
+ hci_dev_set_flag(hdev, HCI_MGMT);
}
static int read_controller_info(struct sock *sk, struct hci_dev *hdev,
@@ -968,7 +1156,7 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev,
{
struct mgmt_rp_read_info rp;
- BT_DBG("sock %p %s", sk, hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
hci_dev_lock(hdev);
@@ -1006,11 +1194,11 @@ static u16 append_eir_data_to_buf(struct hci_dev *hdev, u8 *eir)
eir_len = eir_append_le16(eir, eir_len, EIR_APPEARANCE,
hdev->appearance);
- name_len = strlen(hdev->dev_name);
+ name_len = strnlen(hdev->dev_name, sizeof(hdev->dev_name));
eir_len = eir_append_data(eir, eir_len, EIR_NAME_COMPLETE,
hdev->dev_name, name_len);
- name_len = strlen(hdev->short_name);
+ name_len = strnlen(hdev->short_name, sizeof(hdev->short_name));
eir_len = eir_append_data(eir, eir_len, EIR_NAME_SHORT,
hdev->short_name, name_len);
@@ -1024,7 +1212,7 @@ static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev,
struct mgmt_rp_read_ext_info *rp = (void *)buf;
u16 eir_len;
- BT_DBG("sock %p %s", sk, hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
memset(&buf, 0, sizeof(buf));
@@ -1081,16 +1269,6 @@ static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev)
sizeof(settings));
}
-static void clean_up_hci_complete(struct hci_dev *hdev, u8 status, u16 opcode)
-{
- BT_DBG("%s status 0x%02x", hdev->name, status);
-
- if (hci_conn_count(hdev) == 0) {
- cancel_delayed_work(&hdev->power_off);
- queue_work(hdev->req_workqueue, &hdev->power_off.work);
- }
-}
-
void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance)
{
struct mgmt_ev_advertising_added ev;
@@ -1118,38 +1296,95 @@ static void cancel_adv_timeout(struct hci_dev *hdev)
}
}
-static int clean_up_hci_state(struct hci_dev *hdev)
+/* This function requires the caller holds hdev->lock */
+static void restart_le_actions(struct hci_dev *hdev)
{
- struct hci_request req;
- struct hci_conn *conn;
- bool discov_stopped;
- int err;
+ struct hci_conn_params *p;
- hci_req_init(&req, hdev);
+ list_for_each_entry(p, &hdev->le_conn_params, list) {
+ /* Needed for AUTO_OFF case where might not "really"
+ * have been powered off.
+ */
+ hci_pend_le_list_del_init(p);
- if (test_bit(HCI_ISCAN, &hdev->flags) ||
- test_bit(HCI_PSCAN, &hdev->flags)) {
- u8 scan = 0x00;
- hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
+ switch (p->auto_connect) {
+ case HCI_AUTO_CONN_DIRECT:
+ case HCI_AUTO_CONN_ALWAYS:
+ hci_pend_le_list_add(p, &hdev->pend_le_conns);
+ break;
+ case HCI_AUTO_CONN_REPORT:
+ hci_pend_le_list_add(p, &hdev->pend_le_reports);
+ break;
+ default:
+ break;
+ }
}
+}
- hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, false);
+static int new_settings(struct hci_dev *hdev, struct sock *skip)
+{
+ __le32 ev = cpu_to_le32(get_current_settings(hdev));
- if (hci_dev_test_flag(hdev, HCI_LE_ADV))
- __hci_req_disable_advertising(&req);
+ return mgmt_limited_event(MGMT_EV_NEW_SETTINGS, hdev, &ev,
+ sizeof(ev), HCI_MGMT_SETTING_EVENTS, skip);
+}
- discov_stopped = hci_req_stop_discovery(&req);
+static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_mode *cp;
+
+ /* Make sure cmd still outstanding. */
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
+ return;
- list_for_each_entry(conn, &hdev->conn_hash.list, list) {
- /* 0x15 == Terminated due to Power Off */
- __hci_abort_conn(&req, conn, 0x15);
+ cp = cmd->param;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ if (!err) {
+ if (cp->val) {
+ hci_dev_lock(hdev);
+ restart_le_actions(hdev);
+ hci_update_passive_scan(hdev);
+ hci_dev_unlock(hdev);
+ }
+
+ send_settings_rsp(cmd->sk, cmd->opcode, hdev);
+
+ /* Only call new_settings() for power on as power off is deferred
+ * to hdev->power_off work which does call hci_dev_do_close.
+ */
+ if (cp->val)
+ new_settings(hdev, cmd->sk);
+ } else {
+ mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_POWERED,
+ mgmt_status(err));
}
- err = hci_req_run(&req, clean_up_hci_complete);
- if (!err && discov_stopped)
- hci_discovery_set_state(hdev, DISCOVERY_STOPPING);
+ mgmt_pending_free(cmd);
+}
- return err;
+/* hci_cmd_sync callback for MGMT_OP_SET_POWERED.
+ *
+ * @data is the pending mgmt command. Under hdev->mgmt_pending_lock the
+ * command is checked with __mgmt_pending_listed() and, if still
+ * listed, its mgmt_mode parameter is copied to the stack so it can be
+ * used after the lock is dropped.
+ *
+ * Returns -ECANCELED when the command is no longer listed, otherwise
+ * the result of hci_set_powered_sync() for the requested value.
+ */
+static int set_powered_sync(struct hci_dev *hdev, void *data)
+{
+	struct mgmt_pending_cmd *cmd = data;
+	struct mgmt_mode mode;
+	bool listed;
+
+	mutex_lock(&hdev->mgmt_pending_lock);
+
+	/* Make sure cmd still outstanding. */
+	listed = __mgmt_pending_listed(hdev, cmd);
+	if (listed)
+		memcpy(&mode, cmd->param, sizeof(mode));
+
+	mutex_unlock(&hdev->mgmt_pending_lock);
+
+	if (!listed)
+		return -ECANCELED;
+
+	BT_DBG("%s", hdev->name);
+
+	return hci_set_powered_sync(hdev, mode.val);
}
static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data,
@@ -1159,7 +1394,7 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data,
struct mgmt_pending_cmd *cmd;
int err;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (cp->val != 0x00 && cp->val != 0x01)
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED,
@@ -1167,6 +1402,14 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data,
hci_dev_lock(hdev);
+ if (!cp->val) {
+ if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+ }
+
if (pending_find(MGMT_OP_SET_POWERED, hdev)) {
err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED,
MGMT_STATUS_BUSY);
@@ -1184,37 +1427,25 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data,
goto failed;
}
- if (cp->val) {
- queue_work(hdev->req_workqueue, &hdev->power_on);
- err = 0;
+ /* Cancel potentially blocking sync operation before power off */
+ if (cp->val == 0x00) {
+ hci_cmd_sync_cancel_sync(hdev, -EHOSTDOWN);
+ err = hci_cmd_sync_queue(hdev, set_powered_sync, cmd,
+ mgmt_set_powered_complete);
} else {
- /* Disconnect connections, stop scans, etc */
- err = clean_up_hci_state(hdev);
- if (!err)
- queue_delayed_work(hdev->req_workqueue, &hdev->power_off,
- HCI_POWER_OFF_TIMEOUT);
-
- /* ENODATA means there were no HCI commands queued */
- if (err == -ENODATA) {
- cancel_delayed_work(&hdev->power_off);
- queue_work(hdev->req_workqueue, &hdev->power_off.work);
- err = 0;
- }
+ /* Use hci_cmd_sync_submit since hdev might not be running */
+ err = hci_cmd_sync_submit(hdev, set_powered_sync, cmd,
+ mgmt_set_powered_complete);
}
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
failed:
hci_dev_unlock(hdev);
return err;
}
-static int new_settings(struct hci_dev *hdev, struct sock *skip)
-{
- __le32 ev = cpu_to_le32(get_current_settings(hdev));
-
- return mgmt_limited_event(MGMT_EV_NEW_SETTINGS, hdev, &ev,
- sizeof(ev), HCI_MGMT_SETTING_EVENTS, skip);
-}
-
int mgmt_new_settings(struct hci_dev *hdev)
{
return new_settings(hdev, NULL);
@@ -1232,32 +1463,30 @@ static void settings_rsp(struct mgmt_pending_cmd *cmd, void *data)
send_settings_rsp(cmd->sk, cmd->opcode, match->hdev);
- list_del(&cmd->list);
-
if (match->sk == NULL) {
match->sk = cmd->sk;
sock_hold(match->sk);
}
-
- mgmt_pending_free(cmd);
}
static void cmd_status_rsp(struct mgmt_pending_cmd *cmd, void *data)
{
u8 *status = data;
- mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, *status);
- mgmt_pending_remove(cmd);
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, *status);
}
static void cmd_complete_rsp(struct mgmt_pending_cmd *cmd, void *data)
{
- if (cmd->cmd_complete) {
- u8 *status = data;
+ struct cmd_lookup *match = data;
- cmd->cmd_complete(cmd, *status);
- mgmt_pending_remove(cmd);
+ /* dequeue cmd_sync entries using cmd as data as that is about to be
+ * removed/freed.
+ */
+ hci_cmd_sync_dequeue(match->hdev, NULL, cmd, NULL);
+ if (cmd->cmd_complete) {
+ cmd->cmd_complete(cmd, match->mgmt_status);
return;
}
@@ -1266,13 +1495,13 @@ static void cmd_complete_rsp(struct mgmt_pending_cmd *cmd, void *data)
static int generic_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
{
- return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status,
+ return mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, status,
cmd->param, cmd->param_len);
}
static int addr_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
{
- return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status,
+ return mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, status,
cmd->param, sizeof(struct mgmt_addr_info));
}
@@ -1296,41 +1525,50 @@ static u8 mgmt_le_support(struct hci_dev *hdev)
return MGMT_STATUS_SUCCESS;
}
-void mgmt_set_discoverable_complete(struct hci_dev *hdev, u8 status)
+static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data,
+ int err)
{
- struct mgmt_pending_cmd *cmd;
+ struct mgmt_pending_cmd *cmd = data;
- BT_DBG("status 0x%02x", status);
+ bt_dev_dbg(hdev, "err %d", err);
- hci_dev_lock(hdev);
+ /* Make sure cmd still outstanding. */
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
+ return;
- cmd = pending_find(MGMT_OP_SET_DISCOVERABLE, hdev);
- if (!cmd)
- goto unlock;
+ hci_dev_lock(hdev);
- if (status) {
- u8 mgmt_err = mgmt_status(status);
- mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err);
+ if (err) {
+ u8 mgmt_err = mgmt_status(err);
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err);
hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
- goto remove_cmd;
+ goto done;
}
if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE) &&
hdev->discov_timeout > 0) {
- int to = msecs_to_jiffies(hdev->discov_timeout * 1000);
+ int to = secs_to_jiffies(hdev->discov_timeout);
queue_delayed_work(hdev->req_workqueue, &hdev->discov_off, to);
}
send_settings_rsp(cmd->sk, MGMT_OP_SET_DISCOVERABLE, hdev);
new_settings(hdev, cmd->sk);
-remove_cmd:
- mgmt_pending_remove(cmd);
-
-unlock:
+done:
+ mgmt_pending_free(cmd);
hci_dev_unlock(hdev);
}
+/* hci_cmd_sync callback that applies the discoverable setting.
+ *
+ * @data is handed to mgmt_pending_listed(); when that reports the
+ * command as not listed the work is dropped with -ECANCELED, otherwise
+ * the result of hci_update_discoverable_sync() is returned.
+ *
+ * NOTE(review): discov_off() queues this callback with data == NULL.
+ * How mgmt_pending_listed() treats a NULL command is not visible from
+ * here - confirm that the discoverable-timeout path is not always
+ * cancelled by the check below.
+ */
+static int set_discoverable_sync(struct hci_dev *hdev, void *data)
+{
+	bool listed = mgmt_pending_listed(hdev, data);
+
+	if (!listed)
+		return -ECANCELED;
+
+	BT_DBG("%s", hdev->name);
+
+	return hci_update_discoverable_sync(hdev);
+}
+
static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data,
u16 len)
{
@@ -1339,7 +1577,7 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data,
u16 timeout;
int err;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) &&
!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
@@ -1381,6 +1619,12 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data,
goto failed;
}
+ if (hdev->advertising_paused) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DISCOVERABLE,
+ MGMT_STATUS_BUSY);
+ goto failed;
+ }
+
if (!hdev_is_powered(hdev)) {
bool changed = false;
@@ -1414,7 +1658,7 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data,
hdev->discov_timeout = timeout;
if (cp->val && hdev->discov_timeout > 0) {
- int to = msecs_to_jiffies(hdev->discov_timeout * 1000);
+ int to = secs_to_jiffies(hdev->discov_timeout);
queue_delayed_work(hdev->req_workqueue,
&hdev->discov_off, to);
}
@@ -1447,39 +1691,42 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data,
else
hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
- queue_work(hdev->req_workqueue, &hdev->discoverable_update);
- err = 0;
+ err = hci_cmd_sync_queue(hdev, set_discoverable_sync, cmd,
+ mgmt_set_discoverable_complete);
+
+ if (err < 0)
+ mgmt_pending_remove(cmd);
failed:
hci_dev_unlock(hdev);
return err;
}
-void mgmt_set_connectable_complete(struct hci_dev *hdev, u8 status)
+static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data,
+ int err)
{
- struct mgmt_pending_cmd *cmd;
+ struct mgmt_pending_cmd *cmd = data;
- BT_DBG("status 0x%02x", status);
+ bt_dev_dbg(hdev, "err %d", err);
- hci_dev_lock(hdev);
+ /* Make sure cmd still outstanding. */
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
+ return;
- cmd = pending_find(MGMT_OP_SET_CONNECTABLE, hdev);
- if (!cmd)
- goto unlock;
+ hci_dev_lock(hdev);
- if (status) {
- u8 mgmt_err = mgmt_status(status);
- mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err);
- goto remove_cmd;
+ if (err) {
+ u8 mgmt_err = mgmt_status(err);
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err);
+ goto done;
}
send_settings_rsp(cmd->sk, MGMT_OP_SET_CONNECTABLE, hdev);
new_settings(hdev, cmd->sk);
-remove_cmd:
- mgmt_pending_remove(cmd);
+done:
+ mgmt_pending_free(cmd);
-unlock:
hci_dev_unlock(hdev);
}
@@ -1504,14 +1751,24 @@ static int set_connectable_update_settings(struct hci_dev *hdev,
return err;
if (changed) {
- hci_req_update_scan(hdev);
- hci_update_background_scan(hdev);
+ hci_update_scan(hdev);
+ hci_update_passive_scan(hdev);
return new_settings(hdev, sk);
}
return 0;
}
+/* hci_cmd_sync callback that applies the connectable setting.
+ *
+ * @data is handed to mgmt_pending_listed(); when that reports the
+ * command as not listed the work is dropped with -ECANCELED, otherwise
+ * the result of hci_update_connectable_sync() is returned.
+ */
+static int set_connectable_sync(struct hci_dev *hdev, void *data)
+{
+	bool listed = mgmt_pending_listed(hdev, data);
+
+	if (!listed)
+		return -ECANCELED;
+
+	BT_DBG("%s", hdev->name);
+
+	return hci_update_connectable_sync(hdev);
+}
+
static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data,
u16 len)
{
@@ -1519,7 +1776,7 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data,
struct mgmt_pending_cmd *cmd;
int err;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) &&
!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
@@ -1561,8 +1818,11 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data,
hci_dev_clear_flag(hdev, HCI_CONNECTABLE);
}
- queue_work(hdev->req_workqueue, &hdev->connectable_update);
- err = 0;
+ err = hci_cmd_sync_queue(hdev, set_connectable_sync, cmd,
+ mgmt_set_connectable_complete);
+
+ if (err < 0)
+ mgmt_pending_remove(cmd);
failed:
hci_dev_unlock(hdev);
@@ -1576,7 +1836,7 @@ static int set_bondable(struct sock *sk, struct hci_dev *hdev, void *data,
bool changed;
int err;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (cp->val != 0x00 && cp->val != 0x01)
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BONDABLE,
@@ -1597,12 +1857,7 @@ static int set_bondable(struct sock *sk, struct hci_dev *hdev, void *data,
/* In limited privacy mode the change of bondable mode
* may affect the local advertising address.
*/
- if (hdev_is_powered(hdev) &&
- hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
- hci_dev_test_flag(hdev, HCI_DISCOVERABLE) &&
- hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
- queue_work(hdev->req_workqueue,
- &hdev->discoverable_update);
+ hci_update_discoverable(hdev);
err = new_settings(hdev, sk);
}
@@ -1620,7 +1875,7 @@ static int set_link_security(struct sock *sk, struct hci_dev *hdev, void *data,
u8 val, status;
int err;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
status = mgmt_bredr_support(hdev);
if (status)
@@ -1681,6 +1936,79 @@ failed:
return err;
}
+static void set_ssp_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct cmd_lookup match = { NULL, hdev };
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_mode *cp;
+ u8 enable;
+ bool changed;
+
+ /* Make sure cmd is still outstanding. */
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
+ return;
+
+ cp = cmd->param;
+ enable = cp->val;
+
+ if (err) {
+ u8 mgmt_err = mgmt_status(err);
+
+ if (enable && hci_dev_test_and_clear_flag(hdev,
+ HCI_SSP_ENABLED)) {
+ new_settings(hdev, NULL);
+ }
+
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err);
+ return;
+ }
+
+ if (enable) {
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED);
+ } else {
+ changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED);
+ }
+
+ settings_rsp(cmd, &match);
+
+ if (changed)
+ new_settings(hdev, match.sk);
+
+ if (match.sk)
+ sock_put(match.sk);
+
+ hci_update_eir_sync(hdev);
+}
+
+static int set_ssp_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_mode cp;
+ bool changed = false;
+ int err;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ memcpy(&cp, cmd->param, sizeof(cp));
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ if (cp.val)
+ changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED);
+
+ err = hci_write_ssp_mode_sync(hdev, cp.val);
+
+ if (!err && changed)
+ hci_dev_clear_flag(hdev, HCI_SSP_ENABLED);
+
+ return err;
+}
+
static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
{
struct mgmt_mode *cp = data;
@@ -1688,7 +2016,7 @@ static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
u8 status;
int err;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
status = mgmt_bredr_support(hdev);
if (status)
@@ -1713,11 +2041,6 @@ static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
} else {
changed = hci_dev_test_and_clear_flag(hdev,
HCI_SSP_ENABLED);
- if (!changed)
- changed = hci_dev_test_and_clear_flag(hdev,
- HCI_HS_ENABLED);
- else
- hci_dev_clear_flag(hdev, HCI_HS_ENABLED);
}
err = send_settings_rsp(sk, MGMT_OP_SET_SSP, hdev);
@@ -1742,19 +2065,18 @@ static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
}
cmd = mgmt_pending_add(sk, MGMT_OP_SET_SSP, hdev, data, len);
- if (!cmd) {
+ if (!cmd)
err = -ENOMEM;
- goto failed;
- }
-
- if (!cp->val && hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS))
- hci_send_cmd(hdev, HCI_OP_WRITE_SSP_DEBUG_MODE,
- sizeof(cp->val), &cp->val);
+ else
+ err = hci_cmd_sync_queue(hdev, set_ssp_sync, cmd,
+ set_ssp_complete);
- err = hci_send_cmd(hdev, HCI_OP_WRITE_SSP_MODE, 1, &cp->val);
if (err < 0) {
- mgmt_pending_remove(cmd);
- goto failed;
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SSP,
+ MGMT_STATUS_FAILED);
+
+ if (cmd)
+ mgmt_pending_remove(cmd);
}
failed:
@@ -1764,118 +2086,457 @@ failed:
static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
{
- struct mgmt_mode *cp = data;
- bool changed;
- u8 status;
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
+ MGMT_STATUS_NOT_SUPPORTED);
+}
+
+static void set_le_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct cmd_lookup match = { NULL, hdev };
+ u8 status = mgmt_status(err);
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, data))
+ return;
+
+ if (status) {
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, status);
+ goto done;
+ }
+
+ settings_rsp(cmd, &match);
+
+ new_settings(hdev, match.sk);
+
+ if (match.sk)
+ sock_put(match.sk);
+
+done:
+ mgmt_pending_free(cmd);
+}
+
+static int set_le_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_mode cp;
+ u8 val;
int err;
- BT_DBG("request for %s", hdev->name);
+ mutex_lock(&hdev->mgmt_pending_lock);
- status = mgmt_bredr_support(hdev);
- if (status)
- return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS, status);
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
- if (!lmp_ssp_capable(hdev))
- return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
+ memcpy(&cp, cmd->param, sizeof(cp));
+ val = !!cp.val;
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ if (!val) {
+ hci_clear_adv_instance_sync(hdev, NULL, 0x00, true);
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ hci_disable_advertising_sync(hdev);
+
+ if (ext_adv_capable(hdev))
+ hci_remove_ext_adv_instance_sync(hdev, 0, cmd->sk);
+ } else {
+ hci_dev_set_flag(hdev, HCI_LE_ENABLED);
+ }
+
+ err = hci_write_le_host_supported_sync(hdev, val, 0);
+
+ /* Make sure the controller has a good default for
+ * advertising data. Restrict the update to when LE
+ * has actually been enabled. During power on, the
+ * update in powered_update_hci will take care of it.
+ */
+ if (!err && hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+ if (ext_adv_capable(hdev)) {
+ int status;
+
+ status = hci_setup_ext_adv_instance_sync(hdev, 0x00);
+ if (!status)
+ hci_update_scan_rsp_data_sync(hdev, 0x00);
+ } else {
+ hci_update_adv_data_sync(hdev, 0x00);
+ hci_update_scan_rsp_data_sync(hdev, 0x00);
+ }
+
+ hci_update_passive_scan(hdev);
+ }
+
+ return err;
+}
+
+static void set_mesh_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ u8 status = mgmt_status(err);
+ struct sock *sk;
+
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
+ return;
+
+ sk = cmd->sk;
+
+ if (status) {
+ mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
+ status);
+ mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev, true,
+ cmd_status_rsp, &status);
+ goto done;
+ }
+
+ mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, 0, NULL, 0);
+
+done:
+ mgmt_pending_free(cmd);
+}
+
+static int set_mesh_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ DEFINE_FLEX(struct mgmt_cp_set_mesh, cp, ad_types, num_ad_types,
+ sizeof(hdev->mesh_ad_types));
+ size_t len;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ len = cmd->param_len;
+ memcpy(cp, cmd->param, min(__struct_size(cp), len));
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ memset(hdev->mesh_ad_types, 0, sizeof(hdev->mesh_ad_types));
+
+ if (cp->enable)
+ hci_dev_set_flag(hdev, HCI_MESH);
+ else
+ hci_dev_clear_flag(hdev, HCI_MESH);
+
+ hdev->le_scan_interval = __le16_to_cpu(cp->period);
+ hdev->le_scan_window = __le16_to_cpu(cp->window);
+
+ len -= sizeof(struct mgmt_cp_set_mesh);
+
+ /* If filters don't fit, forward all adv pkts */
+ if (len <= sizeof(hdev->mesh_ad_types))
+ memcpy(hdev->mesh_ad_types, cp->ad_types, len);
+
+ hci_update_passive_scan_sync(hdev);
+ return 0;
+}
+
+static int set_mesh(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
+{
+ struct mgmt_cp_set_mesh *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ __u16 period, window;
+ int err = 0;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ if (!lmp_le_capable(hdev) ||
+ !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
MGMT_STATUS_NOT_SUPPORTED);
- if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
- return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
- MGMT_STATUS_REJECTED);
+ if (cp->enable != 0x00 && cp->enable != 0x01)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
+ MGMT_STATUS_INVALID_PARAMS);
- if (cp->val != 0x00 && cp->val != 0x01)
- return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
+ /* Keep allowed ranges in sync with set_scan_params() */
+ period = __le16_to_cpu(cp->period);
+
+ if (period < 0x0004 || period > 0x4000)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ window = __le16_to_cpu(cp->window);
+
+ if (window < 0x0004 || window > 0x4000)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ if (window > period)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
MGMT_STATUS_INVALID_PARAMS);
hci_dev_lock(hdev);
- if (pending_find(MGMT_OP_SET_SSP, hdev)) {
- err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
- MGMT_STATUS_BUSY);
- goto unlock;
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_MESH_RECEIVER, hdev, data, len);
+ if (!cmd)
+ err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, set_mesh_sync, cmd,
+ set_mesh_complete);
+
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER,
+ MGMT_STATUS_FAILED);
+
+ if (cmd)
+ mgmt_pending_remove(cmd);
}
- if (cp->val) {
- changed = !hci_dev_test_and_set_flag(hdev, HCI_HS_ENABLED);
+ hci_dev_unlock(hdev);
+ return err;
+}
+
+static void mesh_send_start_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_mesh_tx *mesh_tx = data;
+ struct mgmt_cp_mesh_send *send = (void *)mesh_tx->param;
+ unsigned long mesh_send_interval;
+ u8 mgmt_err = mgmt_status(err);
+
+ /* Report any errors here, but don't report completion */
+
+ if (mgmt_err) {
+ hci_dev_clear_flag(hdev, HCI_MESH_SENDING);
+ /* Send Complete Error Code for handle */
+ mesh_send_complete(hdev, mesh_tx, false);
+ return;
+ }
+
+ mesh_send_interval = msecs_to_jiffies((send->cnt) * 25);
+ queue_delayed_work(hdev->req_workqueue, &hdev->mesh_send_done,
+ mesh_send_interval);
+}
+
+static int mesh_send_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_mesh_tx *mesh_tx = data;
+ struct mgmt_cp_mesh_send *send = (void *)mesh_tx->param;
+ struct adv_info *adv, *next_instance;
+ u8 instance = hdev->le_num_of_adv_sets + 1;
+ u16 timeout, duration;
+ int err = 0;
+
+ if (hdev->le_num_of_adv_sets <= hdev->adv_instance_cnt)
+ return MGMT_STATUS_BUSY;
+
+ timeout = 1000;
+ duration = send->cnt * INTERVAL_TO_MS(hdev->le_adv_max_interval);
+ adv = hci_add_adv_instance(hdev, instance, 0,
+ send->adv_data_len, send->adv_data,
+ 0, NULL,
+ timeout, duration,
+ HCI_ADV_TX_POWER_NO_PREFERENCE,
+ hdev->le_adv_min_interval,
+ hdev->le_adv_max_interval,
+ mesh_tx->handle);
+
+ if (!IS_ERR(adv))
+ mesh_tx->instance = instance;
+ else
+ err = PTR_ERR(adv);
+
+ if (hdev->cur_adv_instance == instance) {
+ /* If the currently advertised instance is being changed then
+ * cancel the current advertising and schedule the next
+ * instance. If there is only one instance then the overridden
+ * advertising data will be visible right away.
+ */
+ cancel_adv_timeout(hdev);
+
+ next_instance = hci_get_next_instance(hdev, instance);
+ if (next_instance)
+ instance = next_instance->instance;
+ else
+ instance = 0;
+ } else if (hdev->adv_instance_timeout) {
+ /* Immediately advertise the new instance if no other, or
+ * let it go naturally from queue if ADV is already happening
+ */
+ instance = 0;
+ }
+
+ if (instance)
+ return hci_schedule_adv_instance_sync(hdev, instance, true);
+
+ return err;
+}
+
+static void send_count(struct mgmt_mesh_tx *mesh_tx, void *data)
+{
+ struct mgmt_rp_mesh_read_features *rp = data;
+
+ if (rp->used_handles >= rp->max_handles)
+ return;
+
+ rp->handles[rp->used_handles++] = mesh_tx->handle;
+}
+
+static int mesh_features(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_rp_mesh_read_features rp;
+
+ if (!lmp_le_capable(hdev) ||
+ !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_READ_FEATURES,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ memset(&rp, 0, sizeof(rp));
+ rp.index = cpu_to_le16(hdev->id);
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ rp.max_handles = MESH_HANDLES_MAX;
+
+ hci_dev_lock(hdev);
+
+ if (rp.max_handles)
+ mgmt_mesh_foreach(hdev, send_count, &rp, sk);
+
+ mgmt_cmd_complete(sk, hdev->id, MGMT_OP_MESH_READ_FEATURES, 0, &rp,
+ rp.used_handles + sizeof(rp) - MESH_HANDLES_MAX);
+
+ hci_dev_unlock(hdev);
+ return 0;
+}
+
+static int send_cancel(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_mesh_send_cancel *cancel = (void *)cmd->param;
+ struct mgmt_mesh_tx *mesh_tx;
+
+ if (!cancel->handle) {
+ do {
+ mesh_tx = mgmt_mesh_next(hdev, cmd->sk);
+
+ if (mesh_tx)
+ mesh_send_complete(hdev, mesh_tx, false);
+ } while (mesh_tx);
} else {
- if (hdev_is_powered(hdev)) {
- err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_HS,
- MGMT_STATUS_REJECTED);
- goto unlock;
- }
+ mesh_tx = mgmt_mesh_find(hdev, cancel->handle);
- changed = hci_dev_test_and_clear_flag(hdev, HCI_HS_ENABLED);
+ if (mesh_tx && mesh_tx->sk == cmd->sk)
+ mesh_send_complete(hdev, mesh_tx, false);
}
- err = send_settings_rsp(sk, MGMT_OP_SET_HS, hdev);
- if (err < 0)
- goto unlock;
+ mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL,
+ 0, NULL, 0);
+ mgmt_pending_free(cmd);
- if (changed)
- err = new_settings(hdev, sk);
+ return 0;
+}
+
+static int mesh_send_cancel(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ if (!lmp_le_capable(hdev) ||
+ !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL,
+ MGMT_STATUS_NOT_SUPPORTED);
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL,
+ MGMT_STATUS_REJECTED);
+
+ hci_dev_lock(hdev);
+ cmd = mgmt_pending_new(sk, MGMT_OP_MESH_SEND_CANCEL, hdev, data, len);
+ if (!cmd)
+ err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, send_cancel, cmd, NULL);
+
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL,
+ MGMT_STATUS_FAILED);
+
+ if (cmd)
+ mgmt_pending_free(cmd);
+ }
-unlock:
hci_dev_unlock(hdev);
return err;
}
-static void le_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+static int mesh_send(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
{
- struct cmd_lookup match = { NULL, hdev };
+ struct mgmt_mesh_tx *mesh_tx;
+ struct mgmt_cp_mesh_send *send = data;
+ struct mgmt_rp_mesh_read_features rp;
+ bool sending;
+ int err = 0;
+
+ if (!lmp_le_capable(hdev) ||
+ !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND,
+ MGMT_STATUS_NOT_SUPPORTED);
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) ||
+ len <= MGMT_MESH_SEND_SIZE ||
+ len > (MGMT_MESH_SEND_SIZE + 31))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND,
+ MGMT_STATUS_REJECTED);
hci_dev_lock(hdev);
- if (status) {
- u8 mgmt_err = mgmt_status(status);
+ memset(&rp, 0, sizeof(rp));
+ rp.max_handles = MESH_HANDLES_MAX;
- mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, cmd_status_rsp,
- &mgmt_err);
- goto unlock;
- }
+ mgmt_mesh_foreach(hdev, send_count, &rp, sk);
- mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, settings_rsp, &match);
+ if (rp.max_handles <= rp.used_handles) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND,
+ MGMT_STATUS_BUSY);
+ goto done;
+ }
- new_settings(hdev, match.sk);
+ sending = hci_dev_test_flag(hdev, HCI_MESH_SENDING);
+ mesh_tx = mgmt_mesh_add(sk, hdev, send, len);
- if (match.sk)
- sock_put(match.sk);
+ if (!mesh_tx)
+ err = -ENOMEM;
+ else if (!sending)
+ err = hci_cmd_sync_queue(hdev, mesh_send_sync, mesh_tx,
+ mesh_send_start_complete);
- /* Make sure the controller has a good default for
- * advertising data. Restrict the update to when LE
- * has actually been enabled. During power on, the
- * update in powered_update_hci will take care of it.
- */
- if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
- struct hci_request req;
- hci_req_init(&req, hdev);
- if (ext_adv_capable(hdev)) {
- int err;
+ if (err < 0) {
+ bt_dev_err(hdev, "Send Mesh Failed %d", err);
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND,
+ MGMT_STATUS_FAILED);
- err = __hci_req_setup_ext_adv_instance(&req, 0x00);
- if (!err)
- __hci_req_update_scan_rsp_data(&req, 0x00);
- } else {
- __hci_req_update_adv_data(&req, 0x00);
- __hci_req_update_scan_rsp_data(&req, 0x00);
+ if (mesh_tx) {
+ if (sending)
+ mgmt_mesh_remove(mesh_tx);
}
- hci_req_run(&req, NULL);
- hci_update_background_scan(hdev);
+ } else {
+ hci_dev_set_flag(hdev, HCI_MESH_SENDING);
+
+ mgmt_cmd_complete(sk, hdev->id, MGMT_OP_MESH_SEND, 0,
+ &mesh_tx->handle, 1);
}
-unlock:
+done:
hci_dev_unlock(hdev);
+ return err;
}
static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
{
struct mgmt_mode *cp = data;
- struct hci_cp_write_le_host_supported hci_cp;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
int err;
u8 val, enabled;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!lmp_le_capable(hdev))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE,
@@ -1907,9 +2568,6 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
val = !!cp->val;
enabled = lmp_host_le_capable(hdev);
- if (!val)
- hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, true);
-
if (!hdev_is_powered(hdev) || val == enabled) {
bool changed = false;
@@ -1941,34 +2599,80 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
}
cmd = mgmt_pending_add(sk, MGMT_OP_SET_LE, hdev, data, len);
- if (!cmd) {
+ if (!cmd)
err = -ENOMEM;
- goto unlock;
- }
+ else
+ err = hci_cmd_sync_queue(hdev, set_le_sync, cmd,
+ set_le_complete);
- hci_req_init(&req, hdev);
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE,
+ MGMT_STATUS_FAILED);
- memset(&hci_cp, 0, sizeof(hci_cp));
+ if (cmd)
+ mgmt_pending_remove(cmd);
+ }
- if (val) {
- hci_cp.le = val;
- hci_cp.simul = 0x00;
- } else {
- if (hci_dev_test_flag(hdev, HCI_LE_ADV))
- __hci_req_disable_advertising(&req);
+unlock:
+ hci_dev_unlock(hdev);
+ return err;
+}
- if (ext_adv_capable(hdev))
- __hci_req_clear_ext_adv_sets(&req);
+static int send_hci_cmd_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_hci_cmd_sync *cp = cmd->param;
+ struct sk_buff *skb;
+
+ skb = __hci_cmd_sync_ev(hdev, le16_to_cpu(cp->opcode),
+ le16_to_cpu(cp->params_len), cp->params,
+ cp->event, cp->timeout ?
+ secs_to_jiffies(cp->timeout) :
+ HCI_CMD_TIMEOUT);
+ if (IS_ERR(skb)) {
+ mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_HCI_CMD_SYNC,
+ mgmt_status(PTR_ERR(skb)));
+ goto done;
}
- hci_req_add(&req, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(hci_cp),
- &hci_cp);
+ mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_HCI_CMD_SYNC, 0,
+ skb->data, skb->len);
- err = hci_req_run(&req, le_enable_complete);
- if (err < 0)
- mgmt_pending_remove(cmd);
+ kfree_skb(skb);
+
+done:
+ mgmt_pending_free(cmd);
+
+ return 0;
+}
+
+static int mgmt_hci_cmd_sync(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_hci_cmd_sync *cp = data;
+ struct mgmt_pending_cmd *cmd;
+ int err;
+
+ if (len != (offsetof(struct mgmt_cp_hci_cmd_sync, params) +
+ le16_to_cpu(cp->params_len)))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_HCI_CMD_SYNC,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+ cmd = mgmt_pending_new(sk, MGMT_OP_HCI_CMD_SYNC, hdev, data, len);
+ if (!cmd)
+ err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, send_hci_cmd_sync, cmd, NULL);
+
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_HCI_CMD_SYNC,
+ MGMT_STATUS_FAILED);
+
+ if (cmd)
+ mgmt_pending_free(cmd);
+ }
-unlock:
hci_dev_unlock(hdev);
return err;
}
@@ -2015,41 +2719,37 @@ static u8 get_uuid_size(const u8 *uuid)
return 16;
}
-static void mgmt_class_complete(struct hci_dev *hdev, u16 mgmt_op, u8 status)
+static void mgmt_class_complete(struct hci_dev *hdev, void *data, int err)
{
- struct mgmt_pending_cmd *cmd;
-
- hci_dev_lock(hdev);
-
- cmd = pending_find(mgmt_op, hdev);
- if (!cmd)
- goto unlock;
+ struct mgmt_pending_cmd *cmd = data;
- mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
- mgmt_status(status), hdev->dev_class, 3);
+ bt_dev_dbg(hdev, "err %d", err);
- mgmt_pending_remove(cmd);
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(err), hdev->dev_class, 3);
-unlock:
- hci_dev_unlock(hdev);
+ mgmt_pending_free(cmd);
}
-static void add_uuid_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+static int add_uuid_sync(struct hci_dev *hdev, void *data)
{
- BT_DBG("status 0x%02x", status);
+ int err;
+
+ err = hci_update_class_sync(hdev);
+ if (err)
+ return err;
- mgmt_class_complete(hdev, MGMT_OP_ADD_UUID, status);
+ return hci_update_eir_sync(hdev);
}
static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
{
struct mgmt_cp_add_uuid *cp = data;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
struct bt_uuid *uuid;
int err;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
hci_dev_lock(hdev);
@@ -2071,28 +2771,21 @@ static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
list_add_tail(&uuid->list, &hdev->uuids);
- hci_req_init(&req, hdev);
-
- __hci_req_update_class(&req);
- __hci_req_update_eir(&req);
-
- err = hci_req_run(&req, add_uuid_complete);
- if (err < 0) {
- if (err != -ENODATA)
- goto failed;
-
- err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_UUID, 0,
- hdev->dev_class, 3);
- goto failed;
- }
-
- cmd = mgmt_pending_add(sk, MGMT_OP_ADD_UUID, hdev, data, len);
+ cmd = mgmt_pending_new(sk, MGMT_OP_ADD_UUID, hdev, data, len);
if (!cmd) {
err = -ENOMEM;
goto failed;
}
- err = 0;
+ /* MGMT_OP_ADD_UUID doesn't require the adapter to be UP/Running, so
+ * use hci_cmd_sync_submit instead of hci_cmd_sync_queue.
+ */
+ err = hci_cmd_sync_submit(hdev, add_uuid_sync, cmd,
+ mgmt_class_complete);
+ if (err < 0) {
+ mgmt_pending_free(cmd);
+ goto failed;
+ }
failed:
hci_dev_unlock(hdev);
@@ -2113,11 +2806,15 @@ static bool enable_service_cache(struct hci_dev *hdev)
return false;
}
-static void remove_uuid_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+static int remove_uuid_sync(struct hci_dev *hdev, void *data)
{
- BT_DBG("status 0x%02x", status);
+ int err;
- mgmt_class_complete(hdev, MGMT_OP_REMOVE_UUID, status);
+ err = hci_update_class_sync(hdev);
+ if (err)
+ return err;
+
+ return hci_update_eir_sync(hdev);
}
static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data,
@@ -2126,11 +2823,12 @@ static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data,
struct mgmt_cp_remove_uuid *cp = data;
struct mgmt_pending_cmd *cmd;
struct bt_uuid *match, *tmp;
- u8 bt_uuid_any[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
- struct hci_request req;
+ static const u8 bt_uuid_any[] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+ };
int err, found;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
hci_dev_lock(hdev);
@@ -2171,39 +2869,38 @@ static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data,
}
update_class:
- hci_req_init(&req, hdev);
-
- __hci_req_update_class(&req);
- __hci_req_update_eir(&req);
-
- err = hci_req_run(&req, remove_uuid_complete);
- if (err < 0) {
- if (err != -ENODATA)
- goto unlock;
-
- err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_UUID, 0,
- hdev->dev_class, 3);
- goto unlock;
- }
-
- cmd = mgmt_pending_add(sk, MGMT_OP_REMOVE_UUID, hdev, data, len);
+ cmd = mgmt_pending_new(sk, MGMT_OP_REMOVE_UUID, hdev, data, len);
if (!cmd) {
err = -ENOMEM;
goto unlock;
}
- err = 0;
+ /* MGMT_OP_REMOVE_UUID doesn't require the adapter to be UP/Running,
+ * so use hci_cmd_sync_submit instead of hci_cmd_sync_queue.
+ */
+ err = hci_cmd_sync_submit(hdev, remove_uuid_sync, cmd,
+ mgmt_class_complete);
+ if (err < 0)
+ mgmt_pending_free(cmd);
unlock:
hci_dev_unlock(hdev);
return err;
}
-static void set_class_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+static int set_class_sync(struct hci_dev *hdev, void *data)
{
- BT_DBG("status 0x%02x", status);
+ int err = 0;
+
+ if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE)) {
+ cancel_delayed_work_sync(&hdev->service_cache);
+ err = hci_update_eir_sync(hdev);
+ }
+
+ if (err)
+ return err;
- mgmt_class_complete(hdev, MGMT_OP_SET_DEV_CLASS, status);
+ return hci_update_class_sync(hdev);
}
static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data,
@@ -2211,10 +2908,9 @@ static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data,
{
struct mgmt_cp_set_dev_class *cp = data;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
int err;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!lmp_bredr_capable(hdev))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS,
@@ -2243,34 +2939,19 @@ static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data,
goto unlock;
}
- hci_req_init(&req, hdev);
-
- if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE)) {
- hci_dev_unlock(hdev);
- cancel_delayed_work_sync(&hdev->service_cache);
- hci_dev_lock(hdev);
- __hci_req_update_eir(&req);
- }
-
- __hci_req_update_class(&req);
-
- err = hci_req_run(&req, set_class_complete);
- if (err < 0) {
- if (err != -ENODATA)
- goto unlock;
-
- err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, 0,
- hdev->dev_class, 3);
- goto unlock;
- }
-
- cmd = mgmt_pending_add(sk, MGMT_OP_SET_DEV_CLASS, hdev, data, len);
+ cmd = mgmt_pending_new(sk, MGMT_OP_SET_DEV_CLASS, hdev, data, len);
if (!cmd) {
err = -ENOMEM;
goto unlock;
}
- err = 0;
+ /* MGMT_OP_SET_DEV_CLASS doesn't require the adapter to be UP/Running,
+ * so use hci_cmd_sync_submit instead of hci_cmd_sync_queue.
+ */
+ err = hci_cmd_sync_submit(hdev, set_class_sync, cmd,
+ mgmt_class_complete);
+ if (err < 0)
+ mgmt_pending_free(cmd);
unlock:
hci_dev_unlock(hdev);
@@ -2287,7 +2968,7 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data,
bool changed;
int i;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!lmp_bredr_capable(hdev))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS,
@@ -2301,8 +2982,7 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data,
MGMT_STATUS_INVALID_PARAMS);
}
- expected_len = sizeof(*cp) + key_count *
- sizeof(struct mgmt_link_key_info);
+ expected_len = struct_size(cp, keys, key_count);
if (expected_len != len) {
bt_dev_err(hdev, "load_link_keys: expected %u bytes, got %u bytes",
expected_len, len);
@@ -2314,17 +2994,8 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data,
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS,
MGMT_STATUS_INVALID_PARAMS);
- BT_DBG("%s debug_keys %u key_count %u", hdev->name, cp->debug_keys,
- key_count);
-
- for (i = 0; i < key_count; i++) {
- struct mgmt_link_key_info *key = &cp->keys[i];
-
- if (key->addr.type != BDADDR_BREDR || key->type > 0x08)
- return mgmt_cmd_status(sk, hdev->id,
- MGMT_OP_LOAD_LINK_KEYS,
- MGMT_STATUS_INVALID_PARAMS);
- }
+ bt_dev_dbg(hdev, "debug_keys %u key_count %u", cp->debug_keys,
+ key_count);
hci_dev_lock(hdev);
@@ -2342,6 +3013,27 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data,
for (i = 0; i < key_count; i++) {
struct mgmt_link_key_info *key = &cp->keys[i];
+ if (hci_is_blocked_key(hdev,
+ HCI_BLOCKED_KEY_TYPE_LINKKEY,
+ key->val)) {
+ bt_dev_warn(hdev, "Skipping blocked link key for %pMR",
+ &key->addr.bdaddr);
+ continue;
+ }
+
+ if (key->addr.type != BDADDR_BREDR) {
+ bt_dev_warn(hdev,
+ "Invalid link address type %u for %pMR",
+ key->addr.type, &key->addr.bdaddr);
+ continue;
+ }
+
+ if (key->type > 0x08) {
+ bt_dev_warn(hdev, "Invalid link key type %u for %pMR",
+ key->type, &key->addr.bdaddr);
+ continue;
+ }
+
/* Always ignore debug keys and require a new pairing if
* the user wants to use them.
*/
@@ -2371,6 +3063,42 @@ static int device_unpaired(struct hci_dev *hdev, bdaddr_t *bdaddr,
skip_sk);
}
+static void unpair_device_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_unpair_device *cp = cmd->param;
+
+ if (!err)
+ device_unpaired(hdev, &cp->addr.bdaddr, cp->addr.type, cmd->sk);
+
+ cmd->cmd_complete(cmd, err);
+ mgmt_pending_free(cmd);
+}
+
+static int unpair_device_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_unpair_device *cp = cmd->param;
+ struct hci_conn *conn;
+
+ if (cp->addr.type == BDADDR_BREDR)
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
+ &cp->addr.bdaddr);
+ else
+ conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr,
+ le_addr_type(cp->addr.type));
+
+ if (!conn)
+ return 0;
+
+ /* Disregard any possible error since the likes of hci_abort_conn_sync
+ * will clean up the connection no matter the error.
+ */
+ hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);
+
+ return 0;
+}
+
static int unpair_device(struct sock *sk, struct hci_dev *hdev, void *data,
u16 len)
{
@@ -2481,7 +3209,7 @@ done:
goto unlock;
}
- cmd = mgmt_pending_add(sk, MGMT_OP_UNPAIR_DEVICE, hdev, cp,
+ cmd = mgmt_pending_new(sk, MGMT_OP_UNPAIR_DEVICE, hdev, cp,
sizeof(*cp));
if (!cmd) {
err = -ENOMEM;
@@ -2490,25 +3218,57 @@ done:
cmd->cmd_complete = addr_cmd_complete;
- err = hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);
+ err = hci_cmd_sync_queue(hdev, unpair_device_sync, cmd,
+ unpair_device_complete);
if (err < 0)
- mgmt_pending_remove(cmd);
+ mgmt_pending_free(cmd);
unlock:
hci_dev_unlock(hdev);
return err;
}
+static void disconnect_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+
+ cmd->cmd_complete(cmd, mgmt_status(err));
+ mgmt_pending_free(cmd);
+}
+
+static int disconnect_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_disconnect *cp = cmd->param;
+ struct hci_conn *conn;
+
+ if (cp->addr.type == BDADDR_BREDR)
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
+ &cp->addr.bdaddr);
+ else
+ conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr,
+ le_addr_type(cp->addr.type));
+
+ if (!conn)
+ return -ENOTCONN;
+
+ /* Disregard any possible error since the likes of hci_abort_conn_sync
+ * will clean up the connection no matter the error.
+ */
+ hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);
+
+ return 0;
+}
+
static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data,
u16 len)
{
struct mgmt_cp_disconnect *cp = data;
struct mgmt_rp_disconnect rp;
struct mgmt_pending_cmd *cmd;
- struct hci_conn *conn;
int err;
- BT_DBG("");
+ bt_dev_dbg(hdev, "sock %p", sk);
memset(&rp, 0, sizeof(rp));
bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
@@ -2528,27 +3288,7 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data,
goto failed;
}
- if (pending_find(MGMT_OP_DISCONNECT, hdev)) {
- err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT,
- MGMT_STATUS_BUSY, &rp, sizeof(rp));
- goto failed;
- }
-
- if (cp->addr.type == BDADDR_BREDR)
- conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
- &cp->addr.bdaddr);
- else
- conn = hci_conn_hash_lookup_le(hdev, &cp->addr.bdaddr,
- le_addr_type(cp->addr.type));
-
- if (!conn || conn->state == BT_OPEN || conn->state == BT_CLOSED) {
- err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_DISCONNECT,
- MGMT_STATUS_NOT_CONNECTED, &rp,
- sizeof(rp));
- goto failed;
- }
-
- cmd = mgmt_pending_add(sk, MGMT_OP_DISCONNECT, hdev, data, len);
+ cmd = mgmt_pending_new(sk, MGMT_OP_DISCONNECT, hdev, data, len);
if (!cmd) {
err = -ENOMEM;
goto failed;
@@ -2556,9 +3296,10 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data,
cmd->cmd_complete = generic_cmd_complete;
- err = hci_disconnect(conn, HCI_ERROR_REMOTE_USER_TERM);
+ err = hci_cmd_sync_queue(hdev, disconnect_sync, cmd,
+ disconnect_complete);
if (err < 0)
- mgmt_pending_remove(cmd);
+ mgmt_pending_free(cmd);
failed:
hci_dev_unlock(hdev);
@@ -2568,6 +3309,9 @@ failed:
static u8 link_to_bdaddr(u8 link_type, u8 addr_type)
{
switch (link_type) {
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
case LE_LINK:
switch (addr_type) {
case ADDR_LE_DEV_PUBLIC:
@@ -2589,11 +3333,10 @@ static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data,
{
struct mgmt_rp_get_connections *rp;
struct hci_conn *c;
- size_t rp_len;
int err;
u16 i;
- BT_DBG("");
+ bt_dev_dbg(hdev, "sock %p", sk);
hci_dev_lock(hdev);
@@ -2609,8 +3352,7 @@ static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data,
i++;
}
- rp_len = sizeof(*rp) + (i * sizeof(struct mgmt_addr_info));
- rp = kmalloc(rp_len, GFP_KERNEL);
+ rp = kmalloc(struct_size(rp, addr, i), GFP_KERNEL);
if (!rp) {
err = -ENOMEM;
goto unlock;
@@ -2630,10 +3372,8 @@ static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data,
rp->conn_count = cpu_to_le16(i);
/* Recalculate length in case of filtered SCO connections, etc */
- rp_len = sizeof(*rp) + (i * sizeof(struct mgmt_addr_info));
-
err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONNECTIONS, 0, rp,
- rp_len);
+ struct_size(rp, addr, i));
kfree(rp);
@@ -2672,7 +3412,7 @@ static int pin_code_reply(struct sock *sk, struct hci_dev *hdev, void *data,
struct mgmt_pending_cmd *cmd;
int err;
- BT_DBG("");
+ bt_dev_dbg(hdev, "sock %p", sk);
hci_dev_lock(hdev);
@@ -2730,7 +3470,7 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data,
{
struct mgmt_cp_set_io_capability *cp = data;
- BT_DBG("");
+ bt_dev_dbg(hdev, "sock %p", sk);
if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY)
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY,
@@ -2740,8 +3480,7 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data,
hdev->io_capability = cp->io_capability;
- BT_DBG("%s IO capability set to 0x%02x", hdev->name,
- hdev->io_capability);
+ bt_dev_dbg(hdev, "IO capability set to 0x%02x", hdev->io_capability);
hci_dev_unlock(hdev);
@@ -2776,7 +3515,7 @@ static int pairing_complete(struct mgmt_pending_cmd *cmd, u8 status)
bacpy(&rp.addr.bdaddr, &conn->dst);
rp.addr.type = link_to_bdaddr(conn->type, conn->dst_type);
- err = mgmt_cmd_complete(cmd->sk, cmd->index, MGMT_OP_PAIR_DEVICE,
+ err = mgmt_cmd_complete(cmd->sk, cmd->hdev->id, MGMT_OP_PAIR_DEVICE,
status, &rp, sizeof(rp));
/* So we don't get further callbacks for this connection */
@@ -2853,7 +3592,7 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
struct hci_conn *conn;
int err;
- BT_DBG("");
+ bt_dev_dbg(hdev, "sock %p", sk);
memset(&rp, 0, sizeof(rp));
bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
@@ -2890,7 +3629,8 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
if (cp->addr.type == BDADDR_BREDR) {
conn = hci_connect_acl(hdev, &cp->addr.bdaddr, sec_level,
- auth_type);
+ auth_type, CONN_REASON_PAIR_DEVICE,
+ HCI_ACL_CONN_TIMEOUT);
} else {
u8 addr_type = le_addr_type(cp->addr.type);
struct hci_conn_params *p;
@@ -2898,20 +3638,24 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
/* When pairing a new device, it is expected to remember
* this device for future connections. Adding the connection
* parameter information ahead of time allows tracking
- * of the slave preferred values and will speed up any
+ * of the peripheral preferred values and will speed up any
* further connection establishment.
*
* If connection parameters already exist, then they
* will be kept and this function does nothing.
*/
p = hci_conn_params_add(hdev, &cp->addr.bdaddr, addr_type);
+ if (!p) {
+ err = -EIO;
+ goto unlock;
+ }
if (p->auto_connect == HCI_AUTO_CONN_EXPLICIT)
p->auto_connect = HCI_AUTO_CONN_DISABLED;
- conn = hci_connect_le_scan(hdev, &cp->addr.bdaddr,
- addr_type, sec_level,
- HCI_LE_CONN_TIMEOUT);
+ conn = hci_connect_le_scan(hdev, &cp->addr.bdaddr, addr_type,
+ sec_level, HCI_LE_CONN_TIMEOUT,
+ CONN_REASON_PAIR_DEVICE);
}
if (IS_ERR(conn)) {
@@ -2982,7 +3726,7 @@ static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
struct hci_conn *conn;
int err;
- BT_DBG("");
+ bt_dev_dbg(hdev, "sock %p", sk);
hci_dev_lock(hdev);
@@ -3012,6 +3756,20 @@ static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_CANCEL_PAIR_DEVICE, 0,
addr, sizeof(*addr));
+
+ /* Since user doesn't want to proceed with the connection, abort any
+ * ongoing pairing and then terminate the link if it was created
+ * because of the pair device action.
+ */
+ if (addr->type == BDADDR_BREDR)
+ hci_remove_link_key(hdev, &addr->bdaddr);
+ else
+ smp_cancel_and_remove_pairing(hdev, &addr->bdaddr,
+ le_addr_type(addr->type));
+
+ if (conn->conn_reason == CONN_REASON_PAIR_DEVICE)
+ hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM);
+
unlock:
hci_dev_unlock(hdev);
return err;
@@ -3093,7 +3851,7 @@ static int pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev,
{
struct mgmt_cp_pin_code_neg_reply *cp = data;
- BT_DBG("");
+ bt_dev_dbg(hdev, "sock %p", sk);
return user_pairing_resp(sk, hdev, &cp->addr,
MGMT_OP_PIN_CODE_NEG_REPLY,
@@ -3105,7 +3863,7 @@ static int user_confirm_reply(struct sock *sk, struct hci_dev *hdev, void *data,
{
struct mgmt_cp_user_confirm_reply *cp = data;
- BT_DBG("");
+ bt_dev_dbg(hdev, "sock %p", sk);
if (len != sizeof(*cp))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_USER_CONFIRM_REPLY,
@@ -3121,7 +3879,7 @@ static int user_confirm_neg_reply(struct sock *sk, struct hci_dev *hdev,
{
struct mgmt_cp_user_confirm_neg_reply *cp = data;
- BT_DBG("");
+ bt_dev_dbg(hdev, "sock %p", sk);
return user_pairing_resp(sk, hdev, &cp->addr,
MGMT_OP_USER_CONFIRM_NEG_REPLY,
@@ -3133,7 +3891,7 @@ static int user_passkey_reply(struct sock *sk, struct hci_dev *hdev, void *data,
{
struct mgmt_cp_user_passkey_reply *cp = data;
- BT_DBG("");
+ bt_dev_dbg(hdev, "sock %p", sk);
return user_pairing_resp(sk, hdev, &cp->addr,
MGMT_OP_USER_PASSKEY_REPLY,
@@ -3145,72 +3903,96 @@ static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev,
{
struct mgmt_cp_user_passkey_neg_reply *cp = data;
- BT_DBG("");
+ bt_dev_dbg(hdev, "sock %p", sk);
return user_pairing_resp(sk, hdev, &cp->addr,
MGMT_OP_USER_PASSKEY_NEG_REPLY,
HCI_OP_USER_PASSKEY_NEG_REPLY, 0);
}
-static void adv_expire(struct hci_dev *hdev, u32 flags)
+/* Schedule the advertising instance that follows the current one when the
+ * current instance's flags intersect @flags.
+ *
+ * Nothing is done when there is no current instance, when the current
+ * instance carries none of @flags, or when no next instance exists.
+ * Always returns 0; the result of scheduling the next instance is not
+ * propagated to the caller.
+ */
+static int adv_expire_sync(struct hci_dev *hdev, u32 flags)
 {
 	struct adv_info *adv_instance;
-	struct hci_request req;
-	int err;
 	adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance);
 	if (!adv_instance)
-		return;
+		return 0;
 	/* stop if current instance doesn't need to be changed */
 	if (!(adv_instance->flags & flags))
-		return;
+		return 0;
 	cancel_adv_timeout(hdev);
 	adv_instance = hci_get_next_instance(hdev, adv_instance->instance);
 	if (!adv_instance)
-		return;
+		return 0;
-	hci_req_init(&req, hdev);
-	err = __hci_req_schedule_adv_instance(&req, adv_instance->instance,
-					      true);
-	if (err)
-		return;
+	hci_schedule_adv_instance_sync(hdev, adv_instance->instance, true);
-	hci_req_run(&req, NULL);
+	return 0;
 }
-static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+/* hci_cmd_sync callback run after the local name changed: expire the
+ * current advertising instance if it uses MGMT_ADV_FLAG_LOCAL_NAME.
+ * @data is unused.
+ */
+static int name_changed_sync(struct hci_dev *hdev, void *data)
 {
+	return adv_expire_sync(hdev, MGMT_ADV_FLAG_LOCAL_NAME);
+}
+
+/* Completion callback for the Set Local Name work queued with
+ * hci_cmd_sync_queue().
+ *
+ * @data is the mgmt_pending_cmd of the request and @err the result of
+ * set_name_sync(). Replies to the requesting socket with a status on
+ * failure, or with the accepted name on success, then frees the command.
+ */
+static void set_name_complete(struct hci_dev *hdev, void *data, int err)
+{
+	struct mgmt_pending_cmd *cmd = data;
 	struct mgmt_cp_set_local_name *cp;
-	struct mgmt_pending_cmd *cmd;
+	u8 status = mgmt_status(err);
-	BT_DBG("status 0x%02x", status);
+	bt_dev_dbg(hdev, "err %d", err);
-	hci_dev_lock(hdev);
-
-	cmd = pending_find(MGMT_OP_SET_LOCAL_NAME, hdev);
-	if (!cmd)
-		goto unlock;
+	/* Do not touch cmd when the work was cancelled or the command is no
+	 * longer valid.
+	 * NOTE(review): presumably cmd has been released elsewhere in that
+	 * case - confirm against mgmt_pending_valid().
+	 */
+	if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
+		return;
 	cp = cmd->param;
 	if (status) {
 		mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME,
-				mgmt_status(status));
+				status);
 	} else {
 		mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, 0,
 				  cp, sizeof(*cp));
 		if (hci_dev_test_flag(hdev, HCI_LE_ADV))
-			adv_expire(hdev, MGMT_ADV_FLAG_LOCAL_NAME);
+			hci_cmd_sync_queue(hdev, name_changed_sync, NULL, NULL);
 	}
-	mgmt_pending_remove(cmd);
+	mgmt_pending_free(cmd);
+}
-unlock:
- hci_dev_unlock(hdev);
+/* hci_cmd_sync work for Set Local Name.
+ *
+ * @data is the mgmt_pending_cmd of the request. Returns -ECANCELED when
+ * the command is no longer on the pending list, 0 otherwise; the results
+ * of the individual update calls are not checked.
+ */
+static int set_name_sync(struct hci_dev *hdev, void *data)
+{
+	struct mgmt_pending_cmd *cmd = data;
+	struct mgmt_cp_set_local_name cp;
+
+	mutex_lock(&hdev->mgmt_pending_lock);
+
+	if (!__mgmt_pending_listed(hdev, cmd)) {
+		mutex_unlock(&hdev->mgmt_pending_lock);
+		return -ECANCELED;
+	}
+
+	/* Work on a private copy of the parameters so cmd->param is only
+	 * read while mgmt_pending_lock is held.
+	 */
+	memcpy(&cp, cmd->param, sizeof(cp));
+
+	mutex_unlock(&hdev->mgmt_pending_lock);
+
+	if (lmp_bredr_capable(hdev)) {
+		hci_update_name_sync(hdev, cp.name);
+		hci_update_eir_sync(hdev);
+	}
+
+	/* The name is stored in the scan response data and so
+	 * no need to update the advertising data here.
+	 */
+	if (lmp_le_capable(hdev) && hci_dev_test_flag(hdev, HCI_ADVERTISING))
+		hci_update_scan_rsp_data_sync(hdev, hdev->cur_adv_instance);
+
+	return 0;
 }
static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data,
@@ -3218,10 +4000,9 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data,
{
struct mgmt_cp_set_local_name *cp = data;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
int err;
- BT_DBG("");
+ bt_dev_dbg(hdev, "sock %p", sk);
hci_dev_lock(hdev);
@@ -3254,57 +4035,57 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data,
}
cmd = mgmt_pending_add(sk, MGMT_OP_SET_LOCAL_NAME, hdev, data, len);
- if (!cmd) {
+ if (!cmd)
err = -ENOMEM;
- goto failed;
- }
+ else
+ err = hci_cmd_sync_queue(hdev, set_name_sync, cmd,
+ set_name_complete);
- memcpy(hdev->dev_name, cp->name, sizeof(hdev->dev_name));
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LOCAL_NAME,
+ MGMT_STATUS_FAILED);
- hci_req_init(&req, hdev);
+ if (cmd)
+ mgmt_pending_remove(cmd);
- if (lmp_bredr_capable(hdev)) {
- __hci_req_update_name(&req);
- __hci_req_update_eir(&req);
+ goto failed;
}
- /* The name is stored in the scan response data and so
- * no need to udpate the advertising data here.
- */
- if (lmp_le_capable(hdev) && hci_dev_test_flag(hdev, HCI_ADVERTISING))
- __hci_req_update_scan_rsp_data(&req, hdev->cur_adv_instance);
-
- err = hci_req_run(&req, set_name_complete);
- if (err < 0)
- mgmt_pending_remove(cmd);
+ memcpy(hdev->dev_name, cp->name, sizeof(hdev->dev_name));
failed:
hci_dev_unlock(hdev);
return err;
}
+/* hci_cmd_sync callback run after the appearance changed: expire the
+ * current advertising instance if it uses MGMT_ADV_FLAG_APPEARANCE.
+ * @data is unused.
+ */
+static int appearance_changed_sync(struct hci_dev *hdev, void *data)
+{
+	return adv_expire_sync(hdev, MGMT_ADV_FLAG_APPEARANCE);
+}
+
static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data,
u16 len)
{
struct mgmt_cp_set_appearance *cp = data;
- u16 apperance;
+ u16 appearance;
int err;
- BT_DBG("");
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!lmp_le_capable(hdev))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_APPEARANCE,
MGMT_STATUS_NOT_SUPPORTED);
- apperance = le16_to_cpu(cp->appearance);
+ appearance = le16_to_cpu(cp->appearance);
hci_dev_lock(hdev);
- if (hdev->appearance != apperance) {
- hdev->appearance = apperance;
+ if (hdev->appearance != appearance) {
+ hdev->appearance = appearance;
if (hci_dev_test_flag(hdev, HCI_LE_ADV))
- adv_expire(hdev, MGMT_ADV_FLAG_APPEARANCE);
+ hci_cmd_sync_queue(hdev, appearance_changed_sync, NULL,
+ NULL);
ext_info_changed(hdev, sk);
}
@@ -3320,9 +4101,9 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data,
static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev,
void *data, u16 len)
{
- struct mgmt_rp_get_phy_confguration rp;
+ struct mgmt_rp_get_phy_configuration rp;
- BT_DBG("sock %p %s", sk, hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
hci_dev_lock(hdev);
@@ -3350,23 +4131,28 @@ int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip)
sizeof(ev), skip);
}
-static void set_default_phy_complete(struct hci_dev *hdev, u8 status,
- u16 opcode, struct sk_buff *skb)
+static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err)
{
- struct mgmt_pending_cmd *cmd;
+ struct mgmt_pending_cmd *cmd = data;
+ struct sk_buff *skb;
+ u8 status = mgmt_status(err);
- BT_DBG("status 0x%02x", status);
+ skb = cmd->skb;
- hci_dev_lock(hdev);
+ if (!status) {
+ if (!skb)
+ status = MGMT_STATUS_FAILED;
+ else if (IS_ERR(skb))
+ status = mgmt_status(PTR_ERR(skb));
+ else
+ status = mgmt_status(skb->data[0]);
+ }
- cmd = pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev);
- if (!cmd)
- goto unlock;
+ bt_dev_dbg(hdev, "status %d", status);
if (status) {
mgmt_cmd_status(cmd->sk, hdev->id,
- MGMT_OP_SET_PHY_CONFIGURATION,
- mgmt_status(status));
+ MGMT_OP_SET_PHY_CONFIGURATION, status);
} else {
mgmt_cmd_complete(cmd->sk, hdev->id,
MGMT_OP_SET_PHY_CONFIGURATION, 0,
@@ -3375,25 +4161,64 @@ static void set_default_phy_complete(struct hci_dev *hdev, u8 status,
mgmt_phy_configuration_changed(hdev, cmd->sk);
}
- mgmt_pending_remove(cmd);
+ if (skb && !IS_ERR(skb))
+ kfree_skb(skb);
-unlock:
- hci_dev_unlock(hdev);
+ mgmt_pending_free(cmd);
+}
+
+/* hci_cmd_sync work for Set PHY Configuration.
+ *
+ * Translates the selected_phys bitmask of the mgmt request carried by
+ * @data (a mgmt_pending_cmd) into an HCI_OP_LE_SET_DEFAULT_PHY command
+ * and sends it synchronously. The value returned by __hci_cmd_sync() is
+ * stored in cmd->skb for the completion callback to inspect; this
+ * function itself always returns 0.
+ */
+static int set_default_phy_sync(struct hci_dev *hdev, void *data)
+{
+	struct mgmt_pending_cmd *cmd = data;
+	struct mgmt_cp_set_phy_configuration *cp = cmd->param;
+	struct hci_cp_le_set_default_phy cp_phy;
+	u32 selected_phys;
+
+	selected_phys = __le32_to_cpu(cp->selected_phys);
+
+	memset(&cp_phy, 0, sizeof(cp_phy));
+
+	/* all_phys bit 0 / bit 1 are set when no LE TX / RX PHY was
+	 * selected at all.
+	 * NOTE(review): presumably "no preference" in the HCI command
+	 * definition - confirm against the Core specification.
+	 */
+	if (!(selected_phys & MGMT_PHY_LE_TX_MASK))
+		cp_phy.all_phys |= 0x01;
+
+	if (!(selected_phys & MGMT_PHY_LE_RX_MASK))
+		cp_phy.all_phys |= 0x02;
+
+	if (selected_phys & MGMT_PHY_LE_1M_TX)
+		cp_phy.tx_phys |= HCI_LE_SET_PHY_1M;
+
+	if (selected_phys & MGMT_PHY_LE_2M_TX)
+		cp_phy.tx_phys |= HCI_LE_SET_PHY_2M;
+
+	if (selected_phys & MGMT_PHY_LE_CODED_TX)
+		cp_phy.tx_phys |= HCI_LE_SET_PHY_CODED;
+
+	if (selected_phys & MGMT_PHY_LE_1M_RX)
+		cp_phy.rx_phys |= HCI_LE_SET_PHY_1M;
+
+	if (selected_phys & MGMT_PHY_LE_2M_RX)
+		cp_phy.rx_phys |= HCI_LE_SET_PHY_2M;
+
+	if (selected_phys & MGMT_PHY_LE_CODED_RX)
+		cp_phy.rx_phys |= HCI_LE_SET_PHY_CODED;
+
+	cmd->skb =  __hci_cmd_sync(hdev, HCI_OP_LE_SET_DEFAULT_PHY,
+				   sizeof(cp_phy), &cp_phy, HCI_CMD_TIMEOUT);
+
+	return 0;
 }
static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev,
void *data, u16 len)
{
- struct mgmt_cp_set_phy_confguration *cp = data;
- struct hci_cp_le_set_default_phy cp_phy;
+ struct mgmt_cp_set_phy_configuration *cp = data;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
u32 selected_phys, configurable_phys, supported_phys, unconfigure_phys;
u16 pkt_type = (HCI_DH1 | HCI_DM1);
bool changed = false;
int err;
- BT_DBG("sock %p %s", sk, hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
configurable_phys = get_configurable_phys(hdev);
supported_phys = get_supported_phys(hdev);
@@ -3489,75 +4314,1380 @@ static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- cmd = mgmt_pending_add(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data,
+ cmd = mgmt_pending_new(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data,
len);
- if (!cmd) {
+ if (!cmd)
err = -ENOMEM;
- goto unlock;
+ else
+ err = hci_cmd_sync_queue(hdev, set_default_phy_sync, cmd,
+ set_default_phy_complete);
+
+	if (err < 0) {
+		err = mgmt_cmd_status(sk, hdev->id,
+				      MGMT_OP_SET_PHY_CONFIGURATION,
+				      MGMT_STATUS_FAILED);
+
+		/* cmd was allocated with mgmt_pending_new() and was never
+		 * put on the pending list, so it must be released with
+		 * mgmt_pending_free(); mgmt_pending_remove() would try to
+		 * unlink an entry that is not listed.
+		 */
+		if (cmd)
+			mgmt_pending_free(cmd);
 	}
- hci_req_init(&req, hdev);
+unlock:
+ hci_dev_unlock(hdev);
- memset(&cp_phy, 0, sizeof(cp_phy));
+ return err;
+}
- if (!(selected_phys & MGMT_PHY_LE_TX_MASK))
- cp_phy.all_phys |= 0x01;
+/* Handler for MGMT_OP_SET_BLOCKED_KEYS.
+ *
+ * Validates that @len matches the key_count announced in @data, clears
+ * the existing blocked key list of @hdev and adds one entry per supplied
+ * key. If an allocation fails the loop stops early: the keys added so
+ * far stay on the list and MGMT_STATUS_NO_RESOURCES is reported.
+ *
+ * Returns the result of sending the mgmt reply to @sk.
+ */
+static int set_blocked_keys(struct sock *sk, struct hci_dev *hdev, void *data,
+			    u16 len)
+{
+	int err = MGMT_STATUS_SUCCESS;
+	struct mgmt_cp_set_blocked_keys *keys = data;
+	/* Largest key_count whose total command size still fits in a u16 */
+	const u16 max_key_count = ((U16_MAX - sizeof(*keys)) /
+				   sizeof(struct mgmt_blocked_key_info));
+	u16 key_count, expected_len;
+	int i;
-	if (!(selected_phys & MGMT_PHY_LE_RX_MASK))
-		cp_phy.all_phys |= 0x02;
+	bt_dev_dbg(hdev, "sock %p", sk);
-	if (selected_phys & MGMT_PHY_LE_1M_TX)
-		cp_phy.tx_phys |= HCI_LE_SET_PHY_1M;
+	key_count = __le16_to_cpu(keys->key_count);
+	if (key_count > max_key_count) {
+		bt_dev_err(hdev, "too big key_count value %u", key_count);
+		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS,
+				       MGMT_STATUS_INVALID_PARAMS);
+	}
-	if (selected_phys & MGMT_PHY_LE_2M_TX)
-		cp_phy.tx_phys |= HCI_LE_SET_PHY_2M;
+	expected_len = struct_size(keys, keys, key_count);
+	if (expected_len != len) {
+		bt_dev_err(hdev, "expected %u bytes, got %u bytes",
+			   expected_len, len);
+		return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS,
+				       MGMT_STATUS_INVALID_PARAMS);
+	}
-	if (selected_phys & MGMT_PHY_LE_CODED_TX)
-		cp_phy.tx_phys |= HCI_LE_SET_PHY_CODED;
+	hci_dev_lock(hdev);
-	if (selected_phys & MGMT_PHY_LE_1M_RX)
-		cp_phy.rx_phys |= HCI_LE_SET_PHY_1M;
+	hci_blocked_keys_clear(hdev);
-	if (selected_phys & MGMT_PHY_LE_2M_RX)
-		cp_phy.rx_phys |= HCI_LE_SET_PHY_2M;
+	for (i = 0; i < key_count; ++i) {
+		struct blocked_key *b = kzalloc(sizeof(*b), GFP_KERNEL);
-	if (selected_phys & MGMT_PHY_LE_CODED_RX)
-		cp_phy.rx_phys |= HCI_LE_SET_PHY_CODED;
+		if (!b) {
+			err = MGMT_STATUS_NO_RESOURCES;
+			break;
+		}
+
+		b->type = keys->keys[i].type;
+		memcpy(b->val, keys->keys[i].val, sizeof(b->val));
+		list_add_rcu(&b->list, &hdev->blocked_keys);
+	}
+	hci_dev_unlock(hdev);
+
+	return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_BLOCKED_KEYS,
+				err, NULL, 0);
+}
+
+/* Handler for MGMT_OP_SET_WIDEBAND_SPEECH.
+ *
+ * Rejects the request when the controller lacks the
+ * HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED quirk, when the value is not 0x00
+ * or 0x01, or when the controller is powered and the request would
+ * change the current setting. Otherwise updates
+ * HCI_WIDEBAND_SPEECH_ENABLED, replies with the current settings and,
+ * if the flag actually changed, emits a new-settings notification.
+ */
+static int set_wideband_speech(struct sock *sk, struct hci_dev *hdev,
+			       void *data, u16 len)
+{
+	struct mgmt_mode *cp = data;
+	int err;
+	bool changed = false;
+
+	bt_dev_dbg(hdev, "sock %p", sk);
+
+	if (!hci_test_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED))
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_WIDEBAND_SPEECH,
+				       MGMT_STATUS_NOT_SUPPORTED);
+
+	if (cp->val != 0x00 && cp->val != 0x01)
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_WIDEBAND_SPEECH,
+				       MGMT_STATUS_INVALID_PARAMS);
-	hci_req_add(&req, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp_phy), &cp_phy);
+	hci_dev_lock(hdev);
+
+	/* The setting may only be toggled while the controller is off */
+	if (hdev_is_powered(hdev) &&
+	    !!cp->val != hci_dev_test_flag(hdev,
+					   HCI_WIDEBAND_SPEECH_ENABLED)) {
+		err = mgmt_cmd_status(sk, hdev->id,
+				      MGMT_OP_SET_WIDEBAND_SPEECH,
+				      MGMT_STATUS_REJECTED);
+		goto unlock;
+	}
-	err = hci_req_run_skb(&req, set_default_phy_complete);
+	if (cp->val)
+		changed = !hci_dev_test_and_set_flag(hdev,
+						   HCI_WIDEBAND_SPEECH_ENABLED);
+	else
+		changed = hci_dev_test_and_clear_flag(hdev,
+						   HCI_WIDEBAND_SPEECH_ENABLED);
+
+	err = send_settings_rsp(sk, MGMT_OP_SET_WIDEBAND_SPEECH, hdev);
 	if (err < 0)
-		mgmt_pending_remove(cmd);
+		goto unlock;
+
+	if (changed)
+		err = new_settings(hdev, sk);
 unlock:
 	hci_dev_unlock(hdev);
+	return err;
+}
+
+/* Handler for MGMT_OP_READ_CONTROLLER_CAP.
+ *
+ * Builds a list of EIR-style capability entries (security flags, maximum
+ * encryption key sizes and, when available, the LE TX power range) in a
+ * fixed on-stack buffer and returns it to @sk. @data and @data_len are
+ * not used.
+ */
+static int read_controller_cap(struct sock *sk, struct hci_dev *hdev,
+			       void *data, u16 data_len)
+{
+	/* Reply buffer: must hold the reply header plus every capability
+	 * entry that can be appended below.
+	 */
+	char buf[20];
+	struct mgmt_rp_read_controller_cap *rp = (void *)buf;
+	u16 cap_len = 0;
+	u8 flags = 0;
+	u8 tx_power_range[2];
+
+	bt_dev_dbg(hdev, "sock %p", sk);
+
+	memset(&buf, 0, sizeof(buf));
+
+	hci_dev_lock(hdev);
+
+	/* When the Read Simple Pairing Options command is supported, then
+	 * the remote public key validation is supported.
+	 *
+	 * Alternatively, when Microsoft extensions are available, they can
+	 * indicate support for public key validation as well.
+	 */
+	if ((hdev->commands[41] & 0x08) || msft_curve_validity(hdev))
+		flags |= 0x01;	/* Remote public key validation (BR/EDR) */
+
+	flags |= 0x02;		/* Remote public key validation (LE) */
+
+	/* When the Read Encryption Key Size command is supported, then the
+	 * encryption key size is enforced.
+	 */
+	if (hdev->commands[20] & 0x10)
+		flags |= 0x04;	/* Encryption key size enforcement (BR/EDR) */
+
+	flags |= 0x08;		/* Encryption key size enforcement (LE) */
+
+	cap_len = eir_append_data(rp->cap, cap_len, MGMT_CAP_SEC_FLAGS,
+				  &flags, 1);
+
+	/* When the Read Simple Pairing Options command is supported, then
+	 * also max encryption key size information is provided.
+	 */
+	if (hdev->commands[41] & 0x08)
+		cap_len = eir_append_le16(rp->cap, cap_len,
+					  MGMT_CAP_MAX_ENC_KEY_SIZE,
+					  hdev->max_enc_key_size);
+
+	cap_len = eir_append_le16(rp->cap, cap_len,
+				  MGMT_CAP_SMP_MAX_ENC_KEY_SIZE,
+				  SMP_MAX_ENC_KEY_SIZE);
+
+	/* Append the min/max LE tx power parameters if we were able to fetch
+	 * it from the controller
+	 */
+	if (hdev->commands[38] & 0x80) {
+		memcpy(&tx_power_range[0], &hdev->min_le_tx_power, 1);
+		memcpy(&tx_power_range[1], &hdev->max_le_tx_power, 1);
+		cap_len = eir_append_data(rp->cap, cap_len, MGMT_CAP_LE_TX_PWR,
+					  tx_power_range, 2);
+	}
+
+	rp->cap_len = cpu_to_le16(cap_len);
+
+	hci_dev_unlock(hdev);
+
+	return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_CONTROLLER_CAP, 0,
+				 rp, sizeof(*rp) + cap_len);
+}
+
+/* UUIDs identifying the experimental features. Each table stores the 16
+ * bytes in the reverse order of the textual UUID given in the comment
+ * above it.
+ */
+#ifdef CONFIG_BT_FEATURE_DEBUG
+/* d4992530-b9ec-469f-ab01-6c481c47da1c */
+static const u8 debug_uuid[16] = {
+	0x1c, 0xda, 0x47, 0x1c, 0x48, 0x6c, 0x01, 0xab,
+	0x9f, 0x46, 0xec, 0xb9, 0x30, 0x25, 0x99, 0xd4,
+};
+#endif
+
+/* 330859bc-7506-492d-9370-9a6f0614037f */
+static const u8 quality_report_uuid[16] = {
+	0x7f, 0x03, 0x14, 0x06, 0x6f, 0x9a, 0x70, 0x93,
+	0x2d, 0x49, 0x06, 0x75, 0xbc, 0x59, 0x08, 0x33,
+};
+
+/* a6695ace-ee7f-4fb9-881a-5fac66c629af */
+static const u8 offload_codecs_uuid[16] = {
+	0xaf, 0x29, 0xc6, 0x66, 0xac, 0x5f, 0x1a, 0x88,
+	0xb9, 0x4f, 0x7f, 0xee, 0xce, 0x5a, 0x69, 0xa6,
+};
+
+/* 671b10b5-42c0-4696-9227-eb28d1b049d6 */
+static const u8 le_simultaneous_roles_uuid[16] = {
+	0xd6, 0x49, 0xb0, 0xd1, 0x28, 0xeb, 0x27, 0x92,
+	0x96, 0x46, 0xc0, 0x42, 0xb5, 0x10, 0x1b, 0x67,
+};
+
+/* 6fbaf188-05e0-496a-9885-d6ddfdb4e03e */
+static const u8 iso_socket_uuid[16] = {
+	0x3e, 0xe0, 0xb4, 0xfd, 0xdd, 0xd6, 0x85, 0x98,
+	0x6a, 0x49, 0xe0, 0x05, 0x88, 0xf1, 0xba, 0x6f,
+};
+
+/* 2ce463d7-7a03-4d8d-bf05-5f24e8f36e76 */
+static const u8 mgmt_mesh_uuid[16] = {
+	0x76, 0x6e, 0xf3, 0xe8, 0x24, 0x5f, 0x05, 0xbf,
+	0x8d, 0x4d, 0x03, 0x7a, 0xd7, 0x63, 0xe4, 0x2c,
+};
+
+/* Handler for MGMT_OP_READ_EXP_FEATURES_INFO.
+ *
+ * Reports every experimental feature applicable to the addressed index
+ * together with its current state (bit 0 of flags set when enabled).
+ * @hdev may be NULL when the command is sent on the non-controller
+ * index; controller specific features are then left out. @data and
+ * @data_len are not used.
+ *
+ * As a side effect the requesting socket is subscribed to experimental
+ * feature change events.
+ *
+ * Returns -ENOMEM if the reply cannot be allocated, otherwise the result
+ * of sending the reply.
+ */
+static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev,
+				  void *data, u16 data_len)
+{
+	struct mgmt_rp_read_exp_features_info *rp;
+	size_t len;
+	u16 idx = 0;
+	u32 flags;
+	int status;
+
+	bt_dev_dbg(hdev, "sock %p", sk);
+
+	/* Enough space for 7 features */
+	len = sizeof(*rp) + (sizeof(rp->features[0]) * 7);
+	rp = kzalloc(len, GFP_KERNEL);
+	if (!rp)
+		return -ENOMEM;
+
+#ifdef CONFIG_BT_FEATURE_DEBUG
+	flags = bt_dbg_get() ? BIT(0) : 0;
+
+	memcpy(rp->features[idx].uuid, debug_uuid, 16);
+	rp->features[idx].flags = cpu_to_le32(flags);
+	idx++;
+#endif
+
+	if (hdev && hci_dev_le_state_simultaneous(hdev)) {
+		if (hci_dev_test_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES))
+			flags = BIT(0);
+		else
+			flags = 0;
+
+		memcpy(rp->features[idx].uuid, le_simultaneous_roles_uuid, 16);
+		rp->features[idx].flags = cpu_to_le32(flags);
+		idx++;
+	}
+
+	if (hdev && (aosp_has_quality_report(hdev) ||
+		     hdev->set_quality_report)) {
+		if (hci_dev_test_flag(hdev, HCI_QUALITY_REPORT))
+			flags = BIT(0);
+		else
+			flags = 0;
+
+		memcpy(rp->features[idx].uuid, quality_report_uuid, 16);
+		rp->features[idx].flags = cpu_to_le32(flags);
+		idx++;
+	}
+
+	if (hdev && hdev->get_data_path_id) {
+		if (hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED))
+			flags = BIT(0);
+		else
+			flags = 0;
+
+		memcpy(rp->features[idx].uuid, offload_codecs_uuid, 16);
+		rp->features[idx].flags = cpu_to_le32(flags);
+		idx++;
+	}
+
+	if (IS_ENABLED(CONFIG_BT_LE)) {
+		flags = iso_inited() ? BIT(0) : 0;
+		memcpy(rp->features[idx].uuid, iso_socket_uuid, 16);
+		rp->features[idx].flags = cpu_to_le32(flags);
+		idx++;
+	}
+
+	if (hdev && lmp_le_capable(hdev)) {
+		if (hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL))
+			flags = BIT(0);
+		else
+			flags = 0;
+
+		memcpy(rp->features[idx].uuid, mgmt_mesh_uuid, 16);
+		rp->features[idx].flags = cpu_to_le32(flags);
+		idx++;
+	}
+
+	rp->feature_count = cpu_to_le16(idx);
+
+	/* After reading the experimental features information, enable
+	 * the events to update client on any future change.
+	 */
+	hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+
+	/* Only the entries actually filled in are sent; size them with the
+	 * same element size expression that was used for the allocation.
+	 */
+	status = mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE,
+				   MGMT_OP_READ_EXP_FEATURES_INFO,
+				   0, rp,
+				   sizeof(*rp) + (sizeof(rp->features[0]) * idx));
+
+	kfree(rp);
+	return status;
+}
+
+/* Send MGMT_EV_EXP_FEATURE_CHANGED for the feature identified by @uuid.
+ *
+ * Bit 0 of the event flags reflects @enabled. The event is passed to
+ * mgmt_limited_event() with HCI_MGMT_EXP_FEATURE_EVENTS as the socket
+ * flag and @skip as the socket argument (presumably the socket to leave
+ * out - confirm against mgmt_limited_event()). @hdev may be NULL for
+ * features that are not tied to a controller.
+ */
+static int exp_feature_changed(struct hci_dev *hdev, const u8 *uuid,
+			       bool enabled, struct sock *skip)
+{
+	struct mgmt_ev_exp_feature_changed ev;
+
+	memset(&ev, 0, sizeof(ev));
+	memcpy(ev.uuid, uuid, 16);
+	ev.flags = cpu_to_le32(enabled ? BIT(0) : 0);
+
+	return mgmt_limited_event(MGMT_EV_EXP_FEATURE_CHANGED, hdev,
+				  &ev, sizeof(ev),
+				  HCI_MGMT_EXP_FEATURE_EVENTS, skip);
+}
+
+/* Initializer for one table entry pairing a feature UUID with the
+ * function that handles Set Experimental Feature for it.
+ */
+#define EXP_FEAT(_uuid, _set_func)	\
+{					\
+	.uuid = _uuid,			\
+	.set_func = _set_func,		\
+}
+
+/* The zero key uuid is special. Multiple exp features are set through it.
+ *
+ * With CONFIG_BT_FEATURE_DEBUG and no controller index, debug output is
+ * switched off and a change event is sent if it was on before. The reply
+ * always carries an all-zero UUID and zero flags; @cp and @data_len are
+ * not inspected.
+ */
+static int set_zero_key_func(struct sock *sk, struct hci_dev *hdev,
+			     struct mgmt_cp_set_exp_feature *cp, u16 data_len)
+{
+	struct mgmt_rp_set_exp_feature rp;
+
+	memset(rp.uuid, 0, 16);
+	rp.flags = cpu_to_le32(0);
+
+#ifdef CONFIG_BT_FEATURE_DEBUG
+	if (!hdev) {
+		bool changed = bt_dbg_get();
+
+		bt_dbg_set(false);
+
+		/* NOTE(review): the event is sent with ZERO_KEY although
+		 * the feature being disabled is the debug one (debug_uuid)
+		 * - confirm this is intended.
+		 */
+		if (changed)
+			exp_feature_changed(NULL, ZERO_KEY, false, sk);
+	}
+#endif
+
+	hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+
+	return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE,
+				 MGMT_OP_SET_EXP_FEATURE, 0,
+				 &rp, sizeof(rp));
+}
+
+#ifdef CONFIG_BT_FEATURE_DEBUG
+/* Set Experimental Feature handler for the debug feature (debug_uuid).
+ *
+ * Only valid on the non-controller index and with exactly one parameter
+ * octet of value 0x00 or 0x01. Switches debug output accordingly,
+ * replies to @sk and, when the state changed, sends a change event.
+ */
+static int set_debug_func(struct sock *sk, struct hci_dev *hdev,
+			  struct mgmt_cp_set_exp_feature *cp, u16 data_len)
+{
+	struct mgmt_rp_set_exp_feature rp;
+
+	bool val, changed;
+	int err;
+
+	/* Command requires to use the non-controller index */
+	if (hdev)
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_INDEX);
+
+	/* Parameters are limited to a single octet */
+	if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1)
+		return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	/* Only boolean on/off is supported */
+	if (cp->param[0] != 0x00 && cp->param[0] != 0x01)
+		return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	val = !!cp->param[0];
+	/* changed is true when the requested state differs from the
+	 * current one.
+	 */
+	changed = val ? !bt_dbg_get() : bt_dbg_get();
+	bt_dbg_set(val);
+
+	memcpy(rp.uuid, debug_uuid, 16);
+	rp.flags = cpu_to_le32(val ? BIT(0) : 0);
+
+	hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+
+	err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE,
+				MGMT_OP_SET_EXP_FEATURE, 0,
+				&rp, sizeof(rp));
+
+	if (changed)
+		exp_feature_changed(hdev, debug_uuid, val, sk);
 	return err;
 }
+#endif
-static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status,
- u16 opcode, struct sk_buff *skb)
+/* Set Experimental Feature handler for the mesh feature (mgmt_mesh_uuid).
+ *
+ * Requires a controller index and exactly one parameter octet of value
+ * 0x00 or 0x01. Sets or clears HCI_MESH_EXPERIMENTAL; disabling also
+ * clears HCI_MESH. Replies to @sk and, when the flag changed, sends a
+ * change event.
+ */
+static int set_mgmt_mesh_func(struct sock *sk, struct hci_dev *hdev,
+			      struct mgmt_cp_set_exp_feature *cp, u16 data_len)
+{
+	struct mgmt_rp_set_exp_feature rp;
+	bool val, changed;
+	int err;
+
+	/* Command requires to use the controller index */
+	if (!hdev)
+		return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_INDEX);
+
+	/* Parameters are limited to a single octet */
+	if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1)
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	/* Only boolean on/off is supported */
+	if (cp->param[0] != 0x00 && cp->param[0] != 0x01)
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	val = !!cp->param[0];
+
+	if (val) {
+		changed = !hci_dev_test_and_set_flag(hdev,
+						     HCI_MESH_EXPERIMENTAL);
+	} else {
+		hci_dev_clear_flag(hdev, HCI_MESH);
+		changed = hci_dev_test_and_clear_flag(hdev,
+						      HCI_MESH_EXPERIMENTAL);
+	}
+
+	memcpy(rp.uuid, mgmt_mesh_uuid, 16);
+	rp.flags = cpu_to_le32(val ? BIT(0) : 0);
+
+	hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+
+	err = mgmt_cmd_complete(sk, hdev->id,
+				MGMT_OP_SET_EXP_FEATURE, 0,
+				&rp, sizeof(rp));
+
+	if (changed)
+		exp_feature_changed(hdev, mgmt_mesh_uuid, val, sk);
+
+	return err;
+}
+
+/* Set Experimental Feature handler for quality reports
+ * (quality_report_uuid).
+ *
+ * Requires a controller index and exactly one parameter octet of value
+ * 0x00 or 0x01. Fails with NOT_SUPPORTED when neither the AOSP extension
+ * nor a driver set_quality_report callback is available. When the
+ * requested state differs from HCI_QUALITY_REPORT, the driver callback
+ * is preferred over the AOSP helper; the flag is only updated if that
+ * call succeeds. The whole update runs under hci_req_sync_lock().
+ */
+static int set_quality_report_func(struct sock *sk, struct hci_dev *hdev,
+				   struct mgmt_cp_set_exp_feature *cp,
+				   u16 data_len)
+{
+	struct mgmt_rp_set_exp_feature rp;
+	bool val, changed;
+	int err;
+
+	/* Command requires to use a valid controller index */
+	if (!hdev)
+		return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_INDEX);
+
+	/* Parameters are limited to a single octet */
+	if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1)
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	/* Only boolean on/off is supported */
+	if (cp->param[0] != 0x00 && cp->param[0] != 0x01)
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	hci_req_sync_lock(hdev);
+
+	val = !!cp->param[0];
+	changed = (val != hci_dev_test_flag(hdev, HCI_QUALITY_REPORT));
+
+	if (!aosp_has_quality_report(hdev) && !hdev->set_quality_report) {
+		err = mgmt_cmd_status(sk, hdev->id,
+				      MGMT_OP_SET_EXP_FEATURE,
+				      MGMT_STATUS_NOT_SUPPORTED);
+		goto unlock_quality_report;
+	}
+
+	if (changed) {
+		if (hdev->set_quality_report)
+			err = hdev->set_quality_report(hdev, val);
+		else
+			err = aosp_set_quality_report(hdev, val);
+
+		if (err) {
+			err = mgmt_cmd_status(sk, hdev->id,
+					      MGMT_OP_SET_EXP_FEATURE,
+					      MGMT_STATUS_FAILED);
+			goto unlock_quality_report;
+		}
+
+		if (val)
+			hci_dev_set_flag(hdev, HCI_QUALITY_REPORT);
+		else
+			hci_dev_clear_flag(hdev, HCI_QUALITY_REPORT);
+	}
+
+	bt_dev_dbg(hdev, "quality report enable %d changed %d", val, changed);
+
+	memcpy(rp.uuid, quality_report_uuid, 16);
+	rp.flags = cpu_to_le32(val ? BIT(0) : 0);
+	hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+
+	err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_EXP_FEATURE, 0,
+				&rp, sizeof(rp));
+
+	if (changed)
+		exp_feature_changed(hdev, quality_report_uuid, val, sk);
+
+unlock_quality_report:
+	hci_req_sync_unlock(hdev);
+	return err;
+}
+
+/* Set Experimental Feature handler for offloaded codecs
+ * (offload_codecs_uuid).
+ *
+ * Requires a controller index and exactly one parameter octet of value
+ * 0x00 or 0x01. Fails with NOT_SUPPORTED when the driver provides no
+ * get_data_path_id callback. Otherwise updates
+ * HCI_OFFLOAD_CODECS_ENABLED, replies to @sk and, when the flag changed,
+ * sends a change event.
+ */
+static int set_offload_codec_func(struct sock *sk, struct hci_dev *hdev,
+				  struct mgmt_cp_set_exp_feature *cp,
+				  u16 data_len)
+{
+	bool val, changed;
+	int err;
+	struct mgmt_rp_set_exp_feature rp;
+
+	/* Command requires to use a valid controller index */
+	if (!hdev)
+		return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_INDEX);
+
+	/* Parameters are limited to a single octet */
+	if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1)
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	/* Only boolean on/off is supported */
+	if (cp->param[0] != 0x00 && cp->param[0] != 0x01)
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	val = !!cp->param[0];
+	changed = (val != hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED));
+
+	if (!hdev->get_data_path_id) {
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_NOT_SUPPORTED);
+	}
+
+	if (changed) {
+		if (val)
+			hci_dev_set_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED);
+		else
+			hci_dev_clear_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED);
+	}
+
+	bt_dev_info(hdev, "offload codecs enable %d changed %d",
+		    val, changed);
+
+	memcpy(rp.uuid, offload_codecs_uuid, 16);
+	rp.flags = cpu_to_le32(val ? BIT(0) : 0);
+	hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+	err = mgmt_cmd_complete(sk, hdev->id,
+				MGMT_OP_SET_EXP_FEATURE, 0,
+				&rp, sizeof(rp));
+
+	if (changed)
+		exp_feature_changed(hdev, offload_codecs_uuid, val, sk);
+
+	return err;
+}
+
+/* Set Experimental Feature handler for LE simultaneous roles
+ * (le_simultaneous_roles_uuid).
+ *
+ * Requires a controller index and exactly one parameter octet of value
+ * 0x00 or 0x01. Fails with NOT_SUPPORTED when
+ * hci_dev_le_state_simultaneous() reports the controller cannot do it.
+ * Otherwise updates HCI_LE_SIMULTANEOUS_ROLES, replies to @sk and, when
+ * the flag changed, sends a change event.
+ */
+static int set_le_simultaneous_roles_func(struct sock *sk, struct hci_dev *hdev,
+					  struct mgmt_cp_set_exp_feature *cp,
+					  u16 data_len)
+{
+	bool val, changed;
+	int err;
+	struct mgmt_rp_set_exp_feature rp;
+
+	/* Command requires to use a valid controller index */
+	if (!hdev)
+		return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_INDEX);
+
+	/* Parameters are limited to a single octet */
+	if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1)
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	/* Only boolean on/off is supported */
+	if (cp->param[0] != 0x00 && cp->param[0] != 0x01)
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	val = !!cp->param[0];
+	changed = (val != hci_dev_test_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES));
+
+	if (!hci_dev_le_state_simultaneous(hdev)) {
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_EXP_FEATURE,
+				       MGMT_STATUS_NOT_SUPPORTED);
+	}
+
+	if (changed) {
+		if (val)
+			hci_dev_set_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES);
+		else
+			hci_dev_clear_flag(hdev, HCI_LE_SIMULTANEOUS_ROLES);
+	}
+
+	bt_dev_info(hdev, "LE simultaneous roles enable %d changed %d",
+		    val, changed);
+
+	memcpy(rp.uuid, le_simultaneous_roles_uuid, 16);
+	rp.flags = cpu_to_le32(val ? BIT(0) : 0);
+	hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+	err = mgmt_cmd_complete(sk, hdev->id,
+				MGMT_OP_SET_EXP_FEATURE, 0,
+				&rp, sizeof(rp));
+
+	if (changed)
+		exp_feature_changed(hdev, le_simultaneous_roles_uuid, val, sk);
+
+	return err;
+}
+/* Set Experimental Feature handler calling iso_init()/iso_exit(); built only with CONFIG_BT_LE. */
+#ifdef CONFIG_BT_LE
+static int set_iso_socket_func(struct sock *sk, struct hci_dev *hdev,
+ struct mgmt_cp_set_exp_feature *cp, u16 data_len)
+{
+ struct mgmt_rp_set_exp_feature rp;
+ bool val, changed = false;
+ int err;
+
+ /* Command requires the non-controller index, i.e. hdev must be NULL */
+ if (hdev)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_INDEX);
+
+ /* Parameters are limited to a single octet */
+ if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1)
+ return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ /* Only boolean on/off is supported */
+ if (cp->param[0] != 0x00 && cp->param[0] != 0x01)
+ return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ val = cp->param[0] ? true : false;
+ if (val)
+ err = iso_init();
+ else
+ err = iso_exit();
+
+ if (!err) /* only a successful iso_init()/iso_exit() counts as a change */
+ changed = true;
+
+ memcpy(rp.uuid, iso_socket_uuid, 16);
+ rp.flags = cpu_to_le32(val ? BIT(0) : 0); /* NOTE(review): echoes val even if init/exit failed */
+
+ hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
+
+ err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, /* overwrites the init/exit result in err */
+ MGMT_OP_SET_EXP_FEATURE, 0,
+ &rp, sizeof(rp));
+
+ if (changed)
+ exp_feature_changed(hdev, iso_socket_uuid, val, sk); /* hdev is NULL on this path */
+
+ return err;
+}
+#endif
+/* Table of experimental feature UUIDs and their set handlers, ended by a NULL entry. */
+static const struct mgmt_exp_feature {
+ const u8 *uuid; /* compared against the 16-byte UUID in the request */
+ int (*set_func)(struct sock *sk, struct hci_dev *hdev,
+ struct mgmt_cp_set_exp_feature *cp, u16 data_len);
+} exp_features[] = {
+ EXP_FEAT(ZERO_KEY, set_zero_key_func),
+#ifdef CONFIG_BT_FEATURE_DEBUG
+ EXP_FEAT(debug_uuid, set_debug_func),
+#endif
+ EXP_FEAT(mgmt_mesh_uuid, set_mgmt_mesh_func),
+ EXP_FEAT(quality_report_uuid, set_quality_report_func),
+ EXP_FEAT(offload_codecs_uuid, set_offload_codec_func),
+ EXP_FEAT(le_simultaneous_roles_uuid, set_le_simultaneous_roles_func),
+#ifdef CONFIG_BT_LE
+ EXP_FEAT(iso_socket_uuid, set_iso_socket_func),
+#endif
+
+ /* Terminating entry: set_exp_feature() stops at the first NULL uuid */
+ EXP_FEAT(NULL, NULL)
+};
+/* Dispatch MGMT_OP_SET_EXP_FEATURE to the exp_features[] handler matching cp->uuid. */
+static int set_exp_feature(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_cp_set_exp_feature *cp = data;
+ size_t i = 0;
+
+ bt_dev_dbg(hdev, "sock %p", sk);
+
+ for (i = 0; exp_features[i].uuid; i++) { /* table ends at a NULL uuid */
+ if (!memcmp(cp->uuid, exp_features[i].uuid, 16))
+ return exp_features[i].set_func(sk, hdev, cp, data_len);
+ }
+
+ return mgmt_cmd_status(sk, hdev ? hdev->id : MGMT_INDEX_NONE, /* no UUID matched */
+ MGMT_OP_SET_EXP_FEATURE,
+ MGMT_STATUS_NOT_SUPPORTED);
+}
+/* Reply with hdev->conn_flags and the current flags stored for the addressed device. */
+static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ struct mgmt_cp_get_device_flags *cp = data;
+ struct mgmt_rp_get_device_flags rp;
+ struct bdaddr_list_with_flags *br_params;
+ struct hci_conn_params *params;
+ u32 supported_flags;
+ u32 current_flags = 0;
+ u8 status = MGMT_STATUS_INVALID_PARAMS; /* kept when the device lookup fails */
+
+ bt_dev_dbg(hdev, "Get device flags %pMR (type 0x%x)\n",
+ &cp->addr.bdaddr, cp->addr.type);
+
+ hci_dev_lock(hdev);
+
+ supported_flags = hdev->conn_flags;
+
+ memset(&rp, 0, sizeof(rp)); /* the failure path replies with this zeroed struct */
+
+ if (cp->addr.type == BDADDR_BREDR) { /* BR/EDR: flags live on the accept list entry */
+ br_params = hci_bdaddr_list_lookup_with_flags(&hdev->accept_list,
+ &cp->addr.bdaddr,
+ cp->addr.type);
+ if (!br_params)
+ goto done;
+
+ current_flags = br_params->flags;
+ } else { /* any other type: flags live in the LE connection parameters */
+ params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
+ le_addr_type(cp->addr.type));
+ if (!params)
+ goto done;
+
+ current_flags = params->flags;
+ }
+
+ bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
+ rp.addr.type = cp->addr.type;
+ rp.supported_flags = cpu_to_le32(supported_flags);
+ rp.current_flags = cpu_to_le32(current_flags);
+
+ status = MGMT_STATUS_SUCCESS;
+
+done:
+ hci_dev_unlock(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_DEVICE_FLAGS, status,
+ &rp, sizeof(rp));
+}
+/* Send MGMT_EV_DEVICE_FLAGS_CHANGED carrying the address and both flag masks. */
+static void device_flags_changed(struct sock *sk, struct hci_dev *hdev,
+ bdaddr_t *bdaddr, u8 bdaddr_type,
+ u32 supported_flags, u32 current_flags)
+{
+ struct mgmt_ev_device_flags_changed ev;
+
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = bdaddr_type;
+ ev.supported_flags = cpu_to_le32(supported_flags); /* event fields are little-endian */
+ ev.current_flags = cpu_to_le32(current_flags);
+
+ mgmt_event(MGMT_EV_DEVICE_FLAGS_CHANGED, hdev, &ev, sizeof(ev), sk); /* sk: presumably the socket to skip - confirm */
+}
+/* True only if an LE_LINK conn to addr exists, has dst_type == type and is BT_CONNECTED. */
+static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type)
+{
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, addr); /* lookup is by address only */
+ if (!conn)
+ return false;
+
+ if (conn->dst_type != type) /* so the address type is checked separately */
+ return false;
+
+ if (conn->state != BT_CONNECTED)
+ return false;
+
+ return true;
+}
+
+/* Set auto_connect for addr and refile its params on the pend_le lists; caller holds hdev->lock */
+static struct hci_conn_params *hci_conn_params_set(struct hci_dev *hdev,
+ bdaddr_t *addr, u8 addr_type,
+ u8 auto_connect)
+{
+ struct hci_conn_params *params;
+
+ params = hci_conn_params_add(hdev, addr, addr_type);
+ if (!params)
+ return NULL;
+
+ if (params->auto_connect == auto_connect) /* nothing to change */
+ return params;
+
+ hci_pend_le_list_del_init(params); /* presumably unlinks params from its current pending list */
+
+ switch (auto_connect) {
+ case HCI_AUTO_CONN_DISABLED:
+ case HCI_AUTO_CONN_LINK_LOSS:
+ /* If auto connect is being disabled while we are trying to
+ * connect to the device, keep connecting.
+ */
+ if (params->explicit_connect)
+ hci_pend_le_list_add(params, &hdev->pend_le_conns);
+ break;
+ case HCI_AUTO_CONN_REPORT: /* report list, unless an explicit connect is in progress */
+ if (params->explicit_connect)
+ hci_pend_le_list_add(params, &hdev->pend_le_conns);
+ else
+ hci_pend_le_list_add(params, &hdev->pend_le_reports);
+ break;
+ case HCI_AUTO_CONN_DIRECT:
+ case HCI_AUTO_CONN_ALWAYS:
+ if (!is_connected(hdev, addr, addr_type)) /* already-connected devices are not queued */
+ hci_pend_le_list_add(params, &hdev->pend_le_conns);
+ break;
+ }
+
+ params->auto_connect = auto_connect;
+
+ bt_dev_dbg(hdev, "addr %pMR (type %u) auto_connect %u",
+ addr, addr_type, auto_connect);
+
+ return params;
+}
+/* Store new connection flags on a BR/EDR accept-list entry or on LE connection params. */
+static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 len)
+{
+ struct mgmt_cp_set_device_flags *cp = data;
+ struct bdaddr_list_with_flags *br_params;
+ struct hci_conn_params *params;
+ u8 status = MGMT_STATUS_INVALID_PARAMS; /* replaced only once the flags are stored */
+ u32 supported_flags;
+ u32 current_flags = __le32_to_cpu(cp->current_flags);
+
+ bt_dev_dbg(hdev, "Set device flags %pMR (type 0x%x) = 0x%x",
+ &cp->addr.bdaddr, cp->addr.type, current_flags);
+
+ /* NOTE(review): unlocked pre-check; conn_flags is re-read under the lock only on the LE path */
+ supported_flags = hdev->conn_flags;
+
+ if ((supported_flags | current_flags) != supported_flags) { /* a bit outside the supported mask */
+ bt_dev_warn(hdev, "Bad flag given (0x%x) vs supported (0x%0x)",
+ current_flags, supported_flags);
+ goto done;
+ }
+
+ hci_dev_lock(hdev);
+
+ if (cp->addr.type == BDADDR_BREDR) {
+ br_params = hci_bdaddr_list_lookup_with_flags(&hdev->accept_list,
+ &cp->addr.bdaddr,
+ cp->addr.type);
+
+ if (br_params) {
+ br_params->flags = current_flags;
+ status = MGMT_STATUS_SUCCESS;
+ } else {
+ bt_dev_warn(hdev, "No such BR/EDR device %pMR (0x%x)",
+ &cp->addr.bdaddr, cp->addr.type);
+ }
+
+ goto unlock; /* BR/EDR is fully handled here */
+ }
+
+ params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
+ le_addr_type(cp->addr.type));
+ if (!params) {
+ /* Create a new hci_conn_params if it doesn't exist */
+ params = hci_conn_params_set(hdev, &cp->addr.bdaddr,
+ le_addr_type(cp->addr.type),
+ HCI_AUTO_CONN_DISABLED);
+ if (!params) {
+ bt_dev_warn(hdev, "No such LE device %pMR (0x%x)",
+ &cp->addr.bdaddr,
+ le_addr_type(cp->addr.type));
+ goto unlock;
+ }
+ }
+
+ supported_flags = hdev->conn_flags; /* re-read now that hdev is locked */
+
+ if ((supported_flags | current_flags) != supported_flags) {
+ bt_dev_warn(hdev, "Bad flag given (0x%x) vs supported (0x%0x)",
+ current_flags, supported_flags);
+ goto unlock;
+ }
+
+ WRITE_ONCE(params->flags, current_flags);
+ status = MGMT_STATUS_SUCCESS;
+
+ /* Update passive scan if HCI_CONN_FLAG_DEVICE_PRIVACY
+ * has been set.
+ */
+ if (params->flags & HCI_CONN_FLAG_DEVICE_PRIVACY)
+ hci_update_passive_scan(hdev);
+
+unlock:
+ hci_dev_unlock(hdev);
+
+done:
+ if (status == MGMT_STATUS_SUCCESS) /* event goes out only on success, after unlocking */
+ device_flags_changed(sk, hdev, &cp->addr.bdaddr, cp->addr.type,
+ supported_flags, current_flags);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEVICE_FLAGS, status,
+ &cp->addr, sizeof(cp->addr));
+}
+/* Send MGMT_EV_ADV_MONITOR_ADDED for a host-order monitor handle. */
+static void mgmt_adv_monitor_added(struct sock *sk, struct hci_dev *hdev,
+ u16 handle)
+{
+ struct mgmt_ev_adv_monitor_added ev;
+
+ ev.monitor_handle = cpu_to_le16(handle); /* converted to little-endian for the event */
+
+ mgmt_event(MGMT_EV_ADV_MONITOR_ADDED, hdev, &ev, sizeof(ev), sk);
+}
+/* Send MGMT_EV_ADV_MONITOR_REMOVED; the handle argument is already __le16. */
+static void mgmt_adv_monitor_removed(struct sock *sk, struct hci_dev *hdev,
+ __le16 handle)
+{
+ struct mgmt_ev_adv_monitor_removed ev;
+
+ ev.monitor_handle = handle; /* copied as-is, no byte-order conversion */
+
+ mgmt_event(MGMT_EV_ADV_MONITOR_REMOVED, hdev, &ev, sizeof(ev), sk);
+}
+/* Reply with advertisement monitor feature masks, limits and the handles currently in the idr. */
+static int read_adv_mon_features(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct adv_monitor *monitor = NULL;
+ struct mgmt_rp_read_adv_monitor_features *rp = NULL;
+ int handle, err;
+ size_t rp_size = 0;
+ __u32 supported = 0;
+ __u32 enabled = 0;
+ __u16 num_handles = 0;
+ __u16 handles[HCI_MAX_ADV_MONITOR_NUM_HANDLES]; /* stack snapshot filled under the lock */
+
+ BT_DBG("request for %s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ if (msft_monitor_supported(hdev))
+ supported |= MGMT_ADV_MONITOR_FEATURE_MASK_OR_PATTERNS;
+
+ idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle)
+ handles[num_handles++] = monitor->handle; /* NOTE(review): no bound check - assumes idr fits handles[] */
+
+ hci_dev_unlock(hdev); /* the reply buffer is allocated after the lock is dropped */
+
+ rp_size = sizeof(*rp) + (num_handles * sizeof(u16));
+ rp = kmalloc(rp_size, GFP_KERNEL);
+ if (!rp)
+ return -ENOMEM;
+
+ /* All supported features are currently enabled */
+ enabled = supported;
+
+ rp->supported_features = cpu_to_le32(supported);
+ rp->enabled_features = cpu_to_le32(enabled);
+ rp->max_num_handles = cpu_to_le16(HCI_MAX_ADV_MONITOR_NUM_HANDLES);
+ rp->max_num_patterns = HCI_MAX_ADV_MONITOR_NUM_PATTERNS;
+ rp->num_handles = cpu_to_le16(num_handles);
+ if (num_handles)
+ memcpy(&rp->handles, &handles, (num_handles * sizeof(u16))); /* NOTE(review): host-order copy - confirm wire endianness */
+
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_READ_ADV_MONITOR_FEATURES,
+ MGMT_STATUS_SUCCESS, rp, rp_size);
+
+ kfree(rp);
+
+ return err;
+}
+/* Completion for a queued add-monitor command: emit the event, reply and drop the pending cmd. */
+static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev,
+ void *data, int status)
+{
+ struct mgmt_rp_add_adv_patterns_monitor rp;
+ struct mgmt_pending_cmd *cmd = data;
+ struct adv_monitor *monitor;
+
+ /* This is likely the result of hdev being closed and mgmt_index_removed
+ * is attempting to clean up any pending command so
+ * hci_adv_monitors_clear is about to be called which will take care of
+ * freeing the adv_monitor instances.
+ */
+ if (status == -ECANCELED && !mgmt_pending_valid(hdev, cmd))
+ return;
+
+ monitor = cmd->user_data; /* stored by __add_adv_patterns_monitor() */
+
+ hci_dev_lock(hdev);
+
+ rp.monitor_handle = cpu_to_le16(monitor->handle);
+
+ if (!status) { /* success: announce the monitor and account for it */
+ mgmt_adv_monitor_added(cmd->sk, hdev, monitor->handle);
+ hdev->adv_monitors_cnt++;
+ if (monitor->state == ADV_MONITOR_STATE_NOT_REGISTERED)
+ monitor->state = ADV_MONITOR_STATE_REGISTERED;
+ hci_update_passive_scan(hdev);
+ }
+
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(status), &rp, sizeof(rp));
+ mgmt_pending_remove(cmd);
+
+ hci_dev_unlock(hdev);
+ bt_dev_dbg(hdev, "add monitor %d complete, status %d",
+ __le16_to_cpu(rp.monitor_handle), status); /* rp holds the LE wire value; log host order */
+}
+/* Queued work: add the monitor in cmd->user_data, or -ECANCELED if cmd is no longer listed. */
+static int mgmt_add_adv_patterns_monitor_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct adv_monitor *mon;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ mon = cmd->user_data; /* read while mgmt_pending_lock is still held */
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ return hci_add_adv_monitor(hdev, mon); /* called after the mutex is released */
+}
+
+static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev,
+ struct adv_monitor *m, u8 status,
+ void *data, u16 len, u16 op)
{
- struct mgmt_rp_read_local_oob_data mgmt_rp;
- size_t rp_size = sizeof(mgmt_rp);
struct mgmt_pending_cmd *cmd;
+ int err;
- BT_DBG("%s status %u", hdev->name, status);
+ hci_dev_lock(hdev);
- cmd = pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, hdev);
- if (!cmd)
+ if (status)
+ goto unlock;
+
+ if (pending_find(MGMT_OP_SET_LE, hdev) ||
+ pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev) ||
+ pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev)) {
+ status = MGMT_STATUS_BUSY;
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, op, hdev, data, len);
+ if (!cmd) {
+ status = MGMT_STATUS_NO_RESOURCES;
+ goto unlock;
+ }
+
+ cmd->user_data = m;
+ err = hci_cmd_sync_queue(hdev, mgmt_add_adv_patterns_monitor_sync, cmd,
+ mgmt_add_adv_patterns_monitor_complete);
+ if (err) {
+ if (err == -ENOMEM)
+ status = MGMT_STATUS_NO_RESOURCES;
+ else
+ status = MGMT_STATUS_FAILED;
+
+ goto unlock;
+ }
+
+ hci_dev_unlock(hdev);
+
+ return 0;
+
+unlock:
+ hci_free_adv_monitor(hdev, m);
+ hci_dev_unlock(hdev);
+ return mgmt_cmd_status(sk, hdev->id, op, status);
+}
+/* Fill m->rssi from the request thresholds, or with defaults when rssi is NULL. */
+static void parse_adv_monitor_rssi(struct adv_monitor *m,
+ struct mgmt_adv_rssi_thresholds *rssi)
+{
+ if (rssi) {
+ m->rssi.low_threshold = rssi->low_threshold;
+ m->rssi.low_threshold_timeout =
+ __le16_to_cpu(rssi->low_threshold_timeout); /* timeouts arrive little-endian */
+ m->rssi.high_threshold = rssi->high_threshold;
+ m->rssi.high_threshold_timeout =
+ __le16_to_cpu(rssi->high_threshold_timeout);
+ m->rssi.sampling_period = rssi->sampling_period;
+ } else {
+ /* Default values. These numbers are the least constraining
+ * parameters for the MSFT API to work, so it behaves as if there
+ * are no RSSI parameters to consider. May need to be changed
+ * if other APIs are to be supported.
+ */
+ m->rssi.low_threshold = -127;
+ m->rssi.low_threshold_timeout = 60;
+ m->rssi.high_threshold = -127;
+ m->rssi.high_threshold_timeout = 0;
+ m->rssi.sampling_period = 0;
+ }
+}
+/* Validate each requested pattern and link a copy onto m->patterns; returns a MGMT status. */
+static u8 parse_adv_monitor_pattern(struct adv_monitor *m, u8 pattern_count,
+ struct mgmt_adv_pattern *patterns)
+{
+ u8 offset = 0, length = 0;
+ struct adv_pattern *p = NULL;
+ int i;
+
+ for (i = 0; i < pattern_count; i++) {
+ offset = patterns[i].offset;
+ length = patterns[i].length;
+ if (offset >= HCI_MAX_AD_LENGTH || /* pattern must lie within HCI_MAX_AD_LENGTH bytes */
+ length > HCI_MAX_AD_LENGTH ||
+ (offset + length) > HCI_MAX_AD_LENGTH)
+ return MGMT_STATUS_INVALID_PARAMS;
+
+ p = kmalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return MGMT_STATUS_NO_RESOURCES; /* earlier patterns remain linked on m->patterns */
+
+ p->ad_type = patterns[i].ad_type;
+ p->offset = patterns[i].offset;
+ p->length = patterns[i].length;
+ memcpy(p->value, patterns[i].value, p->length); /* length was bounded above */
+
+ INIT_LIST_HEAD(&p->list);
+ list_add(&p->list, &m->patterns);
+ }
+
+ return MGMT_STATUS_SUCCESS;
+}
+/* MGMT_OP_ADD_ADV_PATTERNS_MONITOR handler: build a monitor with default RSSI and queue it. */
+static int add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_add_adv_patterns_monitor *cp = data;
+ struct adv_monitor *m = NULL;
+ u8 status = MGMT_STATUS_SUCCESS;
+ size_t expected_size = sizeof(*cp);
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (len <= sizeof(*cp)) { /* must carry data beyond the fixed header */
+ status = MGMT_STATUS_INVALID_PARAMS;
+ goto done;
+ }
+
+ expected_size += cp->pattern_count * sizeof(struct mgmt_adv_pattern);
+ if (len != expected_size) { /* length must match pattern_count exactly */
+ status = MGMT_STATUS_INVALID_PARAMS;
+ goto done;
+ }
+
+ m = kzalloc(sizeof(*m), GFP_KERNEL);
+ if (!m) {
+ status = MGMT_STATUS_NO_RESOURCES;
+ goto done;
+ }
+
+ INIT_LIST_HEAD(&m->patterns);
+
+ parse_adv_monitor_rssi(m, NULL); /* NULL selects the default thresholds */
+ status = parse_adv_monitor_pattern(m, cp->pattern_count, cp->patterns);
+
+done: /* m may be NULL here; status carries any error into the common path */
+ return __add_adv_patterns_monitor(sk, hdev, m, status, data, len,
+ MGMT_OP_ADD_ADV_PATTERNS_MONITOR);
+}
+/* MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI handler: as above, with RSSI thresholds from cp->rssi. */
+static int add_adv_patterns_monitor_rssi(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_add_adv_patterns_monitor_rssi *cp = data;
+ struct adv_monitor *m = NULL;
+ u8 status = MGMT_STATUS_SUCCESS;
+ size_t expected_size = sizeof(*cp);
+
+ BT_DBG("request for %s", hdev->name);
+
+ if (len <= sizeof(*cp)) { /* must carry data beyond the fixed header */
+ status = MGMT_STATUS_INVALID_PARAMS;
+ goto done;
+ }
+
+ expected_size += cp->pattern_count * sizeof(struct mgmt_adv_pattern);
+ if (len != expected_size) { /* length must match pattern_count exactly */
+ status = MGMT_STATUS_INVALID_PARAMS;
+ goto done;
+ }
+
+ m = kzalloc(sizeof(*m), GFP_KERNEL);
+ if (!m) {
+ status = MGMT_STATUS_NO_RESOURCES;
+ goto done;
+ }
+
+ INIT_LIST_HEAD(&m->patterns);
+
+ parse_adv_monitor_rssi(m, &cp->rssi); /* thresholds come from the request */
+ status = parse_adv_monitor_pattern(m, cp->pattern_count, cp->patterns);
+
+done: /* m may be NULL here; status carries any error into the common path */
+ return __add_adv_patterns_monitor(sk, hdev, m, status, data, len,
+ MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI);
+}
+
+static void mgmt_remove_adv_monitor_complete(struct hci_dev *hdev,
+ void *data, int status)
+{
+ struct mgmt_rp_remove_adv_monitor rp;
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_remove_adv_monitor *cp;
+
+ if (status == -ECANCELED)
return;
- if (status || !skb) {
+ hci_dev_lock(hdev);
+
+ cp = cmd->param;
+
+ rp.monitor_handle = cp->monitor_handle;
+
+ if (!status) {
+ mgmt_adv_monitor_removed(cmd->sk, hdev, cp->monitor_handle);
+ hci_update_passive_scan(hdev);
+ }
+
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(status), &rp, sizeof(rp));
+ mgmt_pending_free(cmd);
+
+ hci_dev_unlock(hdev);
+ bt_dev_dbg(hdev, "remove monitor %d complete, status %d",
+ rp.monitor_handle, status);
+}
+/* Queued work: remove the monitor named in cmd->param, or all monitors when the handle is 0. */
+static int mgmt_remove_adv_monitor_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_remove_adv_monitor *cp = cmd->param;
+ u16 handle = __le16_to_cpu(cp->monitor_handle); /* request field is little-endian */
+
+ if (!handle) /* handle 0 selects every monitor */
+ return hci_remove_all_adv_monitor(hdev);
+
+ return hci_remove_single_adv_monitor(hdev, handle);
+}
+/* MGMT_OP_REMOVE_ADV_MONITOR handler: submit the removal unless a conflicting op is pending. */
+static int remove_adv_monitor(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_pending_cmd *cmd;
+ int err, status;
+
+ hci_dev_lock(hdev);
+
+ if (pending_find(MGMT_OP_SET_LE, hdev) || /* busy while any of these is pending */
+ pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR, hdev) ||
+ pending_find(MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, hdev)) {
+ status = MGMT_STATUS_BUSY;
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_new(sk, MGMT_OP_REMOVE_ADV_MONITOR, hdev, data, len);
+ if (!cmd) {
+ status = MGMT_STATUS_NO_RESOURCES;
+ goto unlock;
+ }
+
+ err = hci_cmd_sync_submit(hdev, mgmt_remove_adv_monitor_sync, cmd,
+ mgmt_remove_adv_monitor_complete);
+
+ if (err) {
+ mgmt_pending_free(cmd); /* submit failed, so the completion will not free it */
+
+ if (err == -ENOMEM)
+ status = MGMT_STATUS_NO_RESOURCES;
+ else
+ status = MGMT_STATUS_FAILED;
+
+ goto unlock;
+ }
+
+ hci_dev_unlock(hdev);
+
+ return 0; /* the reply is sent from mgmt_remove_adv_monitor_complete() */
+
+unlock: /* error exit: report status to the caller right away */
+ hci_dev_unlock(hdev);
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADV_MONITOR,
+ status);
+}
+
+static void read_local_oob_data_complete(struct hci_dev *hdev, void *data,
+ int err)
+{
+ struct mgmt_rp_read_local_oob_data mgmt_rp;
+ size_t rp_size = sizeof(mgmt_rp);
+ struct mgmt_pending_cmd *cmd = data;
+ struct sk_buff *skb = cmd->skb;
+ u8 status = mgmt_status(err);
+
+ if (!status) {
+ if (!skb)
+ status = MGMT_STATUS_FAILED;
+ else if (IS_ERR(skb))
+ status = mgmt_status(PTR_ERR(skb));
+ else
+ status = mgmt_status(skb->data[0]);
+ }
+
+ bt_dev_dbg(hdev, "status %d", status);
+
+ if (status) {
mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
- status ? mgmt_status(status) : MGMT_STATUS_FAILED);
+ status);
goto remove;
}
memset(&mgmt_rp, 0, sizeof(mgmt_rp));
- if (opcode == HCI_OP_READ_LOCAL_OOB_DATA) {
+ if (!bredr_sc_enabled(hdev)) {
struct hci_rp_read_local_oob_data *rp = (void *) skb->data;
if (skb->len < sizeof(*rp)) {
@@ -3592,17 +5722,34 @@ static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status,
MGMT_STATUS_SUCCESS, &mgmt_rp, rp_size);
remove:
- mgmt_pending_remove(cmd);
+ if (skb && !IS_ERR(skb))
+ kfree_skb(skb);
+
+ mgmt_pending_free(cmd);
+}
+
+static int read_local_oob_data_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+
+ if (bredr_sc_enabled(hdev))
+ cmd->skb = hci_read_local_oob_data_sync(hdev, true, cmd->sk);
+ else
+ cmd->skb = hci_read_local_oob_data_sync(hdev, false, cmd->sk);
+
+ if (IS_ERR(cmd->skb))
+ return PTR_ERR(cmd->skb);
+ else
+ return 0;
}
static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev,
void *data, u16 data_len)
{
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
hci_dev_lock(hdev);
@@ -3618,28 +5765,20 @@ static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- if (pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, hdev)) {
- err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
- MGMT_STATUS_BUSY);
- goto unlock;
- }
-
- cmd = mgmt_pending_add(sk, MGMT_OP_READ_LOCAL_OOB_DATA, hdev, NULL, 0);
- if (!cmd) {
+ cmd = mgmt_pending_new(sk, MGMT_OP_READ_LOCAL_OOB_DATA, hdev, NULL, 0);
+ if (!cmd)
err = -ENOMEM;
- goto unlock;
- }
-
- hci_req_init(&req, hdev);
-
- if (bredr_sc_enabled(hdev))
- hci_req_add(&req, HCI_OP_READ_LOCAL_OOB_EXT_DATA, 0, NULL);
else
- hci_req_add(&req, HCI_OP_READ_LOCAL_OOB_DATA, 0, NULL);
+ err = hci_cmd_sync_queue(hdev, read_local_oob_data_sync, cmd,
+ read_local_oob_data_complete);
- err = hci_req_run_skb(&req, read_local_oob_data_complete);
- if (err < 0)
- mgmt_pending_remove(cmd);
+ if (err < 0) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA,
+ MGMT_STATUS_FAILED);
+
+ if (cmd)
+ mgmt_pending_free(cmd);
+ }
unlock:
hci_dev_unlock(hdev);
@@ -3652,7 +5791,7 @@ static int add_remote_oob_data(struct sock *sk, struct hci_dev *hdev,
struct mgmt_addr_info *addr = data;
int err;
- BT_DBG("%s ", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!bdaddr_type_is_valid(addr->type))
return mgmt_cmd_complete(sk, hdev->id,
@@ -3761,7 +5900,7 @@ static int remove_remote_oob_data(struct sock *sk, struct hci_dev *hdev,
u8 status;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (cp->addr.type != BDADDR_BREDR)
return mgmt_cmd_complete(sk, hdev->id,
@@ -3791,29 +5930,6 @@ done:
return err;
}
-void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status)
-{
- struct mgmt_pending_cmd *cmd;
-
- BT_DBG("status %d", status);
-
- hci_dev_lock(hdev);
-
- cmd = pending_find(MGMT_OP_START_DISCOVERY, hdev);
- if (!cmd)
- cmd = pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev);
-
- if (!cmd)
- cmd = pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev);
-
- if (cmd) {
- cmd->cmd_complete(cmd, mgmt_status(status));
- mgmt_pending_remove(cmd);
- }
-
- hci_dev_unlock(hdev);
-}
-
static bool discovery_type_is_valid(struct hci_dev *hdev, uint8_t type,
uint8_t *mgmt_status)
{
@@ -3827,7 +5943,7 @@ static bool discovery_type_is_valid(struct hci_dev *hdev, uint8_t type,
*mgmt_status = mgmt_le_support(hdev);
if (*mgmt_status)
return false;
- /* Intentional fall-through */
+ fallthrough;
case DISCOV_TYPE_BREDR:
*mgmt_status = mgmt_bredr_support(hdev);
if (*mgmt_status)
@@ -3841,6 +5957,31 @@ static bool discovery_type_is_valid(struct hci_dev *hdev, uint8_t type,
return true;
}
+static void start_discovery_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ /* Completion for queued start-discovery work: reply, free cmd, then set discovery state */
+ bt_dev_dbg(hdev, "err %d", err);
+
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) /* cancelled or stale cmd: do nothing */
+ return;
+
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err),
+ cmd->param, 1); /* reply echoes the first parameter byte */
+ mgmt_pending_free(cmd);
+
+ hci_discovery_set_state(hdev, err ? DISCOVERY_STOPPED:
+ DISCOVERY_FINDING);
+}
+/* Queued work: run hci_start_discovery_sync() unless the pending cmd is no longer listed. */
+static int start_discovery_sync(struct hci_dev *hdev, void *data)
+{
+ if (!mgmt_pending_listed(hdev, data)) /* data is the mgmt_pending_cmd passed when queueing */
+ return -ECANCELED;
+
+ return hci_start_discovery_sync(hdev);
+}
+
static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev,
u16 op, void *data, u16 len)
{
@@ -3849,7 +5990,7 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev,
u8 status;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
hci_dev_lock(hdev);
@@ -3873,6 +6014,13 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev,
goto failed;
}
+ /* Can't start discovery when it is paused */
+ if (hdev->discovery_paused) {
+ err = mgmt_cmd_complete(sk, hdev->id, op, MGMT_STATUS_BUSY,
+ &cp->type, sizeof(cp->type));
+ goto failed;
+ }
+
/* Clear the discovery filter first to free any previously
* allocated memory for the UUID list.
*/
@@ -3891,11 +6039,14 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev,
goto failed;
}
- cmd->cmd_complete = generic_cmd_complete;
+ err = hci_cmd_sync_queue(hdev, start_discovery_sync, cmd,
+ start_discovery_complete);
+ if (err < 0) {
+ mgmt_pending_remove(cmd);
+ goto failed;
+ }
hci_discovery_set_state(hdev, DISCOVERY_STARTING);
- queue_work(hdev->req_workqueue, &hdev->discov_update);
- err = 0;
failed:
hci_dev_unlock(hdev);
@@ -3917,13 +6068,6 @@ static int start_limited_discovery(struct sock *sk, struct hci_dev *hdev,
data, len);
}
-static int service_discovery_cmd_complete(struct mgmt_pending_cmd *cmd,
- u8 status)
-{
- return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status,
- cmd->param, 1);
-}
-
static int start_service_discovery(struct sock *sk, struct hci_dev *hdev,
void *data, u16 len)
{
@@ -3934,7 +6078,7 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev,
u8 status;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
hci_dev_lock(hdev);
@@ -3955,6 +6099,14 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev,
goto failed;
}
+ if (hdev->discovery_paused) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ MGMT_STATUS_BUSY, &cp->type,
+ sizeof(cp->type));
+ goto failed;
+ }
+
uuid_count = __le16_to_cpu(cp->uuid_count);
if (uuid_count > max_uuid_count) {
bt_dev_err(hdev, "service_discovery: too big uuid_count value %u",
@@ -3991,8 +6143,6 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev,
goto failed;
}
- cmd->cmd_complete = service_discovery_cmd_complete;
-
/* Clear the discovery filter first to free any previously
* allocated memory for the UUID list.
*/
@@ -4016,30 +6166,43 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev,
}
}
+ err = hci_cmd_sync_queue(hdev, start_discovery_sync, cmd,
+ start_discovery_complete);
+ if (err < 0) {
+ mgmt_pending_remove(cmd);
+ goto failed;
+ }
+
hci_discovery_set_state(hdev, DISCOVERY_STARTING);
- queue_work(hdev->req_workqueue, &hdev->discov_update);
- err = 0;
failed:
hci_dev_unlock(hdev);
return err;
}
-void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status)
+static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err)
{
- struct mgmt_pending_cmd *cmd;
+ struct mgmt_pending_cmd *cmd = data;
- BT_DBG("status %d", status);
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd))
+ return;
- hci_dev_lock(hdev);
+ bt_dev_dbg(hdev, "err %d", err);
- cmd = pending_find(MGMT_OP_STOP_DISCOVERY, hdev);
- if (cmd) {
- cmd->cmd_complete(cmd, mgmt_status(status));
- mgmt_pending_remove(cmd);
- }
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err),
+ cmd->param, 1);
+ mgmt_pending_free(cmd);
- hci_dev_unlock(hdev);
+ if (!err)
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+}
+
+static int stop_discovery_sync(struct hci_dev *hdev, void *data)
+{
+ if (!mgmt_pending_listed(hdev, data))
+ return -ECANCELED;
+
+ return hci_stop_discovery_sync(hdev);
}
static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data,
@@ -4049,7 +6212,7 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data,
struct mgmt_pending_cmd *cmd;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
hci_dev_lock(hdev);
@@ -4073,11 +6236,14 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data,
goto unlock;
}
- cmd->cmd_complete = generic_cmd_complete;
+ err = hci_cmd_sync_queue(hdev, stop_discovery_sync, cmd,
+ stop_discovery_complete);
+ if (err < 0) {
+ mgmt_pending_remove(cmd);
+ goto unlock;
+ }
hci_discovery_set_state(hdev, DISCOVERY_STOPPING);
- queue_work(hdev->req_workqueue, &hdev->discov_update);
- err = 0;
unlock:
hci_dev_unlock(hdev);
@@ -4091,7 +6257,7 @@ static int confirm_name(struct sock *sk, struct hci_dev *hdev, void *data,
struct inquiry_entry *e;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
hci_dev_lock(hdev);
@@ -4133,7 +6299,7 @@ static int block_device(struct sock *sk, struct hci_dev *hdev, void *data,
u8 status;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!bdaddr_type_is_valid(cp->addr.type))
return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE,
@@ -4142,7 +6308,7 @@ static int block_device(struct sock *sk, struct hci_dev *hdev, void *data,
hci_dev_lock(hdev);
- err = hci_bdaddr_list_add(&hdev->blacklist, &cp->addr.bdaddr,
+ err = hci_bdaddr_list_add(&hdev->reject_list, &cp->addr.bdaddr,
cp->addr.type);
if (err < 0) {
status = MGMT_STATUS_FAILED;
@@ -4169,7 +6335,7 @@ static int unblock_device(struct sock *sk, struct hci_dev *hdev, void *data,
u8 status;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!bdaddr_type_is_valid(cp->addr.type))
return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE,
@@ -4178,7 +6344,7 @@ static int unblock_device(struct sock *sk, struct hci_dev *hdev, void *data,
hci_dev_lock(hdev);
- err = hci_bdaddr_list_del(&hdev->blacklist, &cp->addr.bdaddr,
+ err = hci_bdaddr_list_del(&hdev->reject_list, &cp->addr.bdaddr,
cp->addr.type);
if (err < 0) {
status = MGMT_STATUS_INVALID_PARAMS;
@@ -4198,15 +6364,19 @@ done:
return err;
}
+static int set_device_id_sync(struct hci_dev *hdev, void *data)
+{
+ return hci_update_eir_sync(hdev); /* data is unused; set_device_id() queues this with NULL */
+}
+
static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data,
u16 len)
{
struct mgmt_cp_set_device_id *cp = data;
- struct hci_request req;
int err;
__u16 source;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
source = __le16_to_cpu(cp->source);
@@ -4224,38 +6394,36 @@ static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data,
err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_DEVICE_ID, 0,
NULL, 0);
- hci_req_init(&req, hdev);
- __hci_req_update_eir(&req);
- hci_req_run(&req, NULL);
+ hci_cmd_sync_queue(hdev, set_device_id_sync, NULL, NULL);
hci_dev_unlock(hdev);
return err;
}
-static void enable_advertising_instance(struct hci_dev *hdev, u8 status,
- u16 opcode)
+static void enable_advertising_instance(struct hci_dev *hdev, int err)
{
- BT_DBG("status %d", status);
+ if (err)
+ bt_dev_err(hdev, "failed to re-configure advertising %d", err);
+ else
+ bt_dev_dbg(hdev, "status %d", err);
}
-static void set_advertising_complete(struct hci_dev *hdev, u8 status,
- u16 opcode)
+static void set_advertising_complete(struct hci_dev *hdev, void *data, int err)
{
+ struct mgmt_pending_cmd *cmd = data;
struct cmd_lookup match = { NULL, hdev };
- struct hci_request req;
u8 instance;
struct adv_info *adv_instance;
- int err;
+ u8 status = mgmt_status(err);
- hci_dev_lock(hdev);
+ if (err == -ECANCELED || !mgmt_pending_valid(hdev, data))
+ return;
if (status) {
- u8 mgmt_err = mgmt_status(status);
-
- mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev,
- cmd_status_rsp, &mgmt_err);
- goto unlock;
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, status);
+ mgmt_pending_free(cmd);
+ return;
}
if (hci_dev_test_flag(hdev, HCI_LE_ADV))
@@ -4263,8 +6431,7 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status,
else
hci_dev_clear_flag(hdev, HCI_ADVERTISING);
- mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, settings_rsp,
- &match);
+ settings_rsp(cmd, &match);
new_settings(hdev, match.sk);
@@ -4276,30 +6443,68 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status,
*/
if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
list_empty(&hdev->adv_instances))
- goto unlock;
+ return;
instance = hdev->cur_adv_instance;
if (!instance) {
adv_instance = list_first_entry_or_null(&hdev->adv_instances,
struct adv_info, list);
if (!adv_instance)
- goto unlock;
+ return;
instance = adv_instance->instance;
}
- hci_req_init(&req, hdev);
+ err = hci_schedule_adv_instance_sync(hdev, instance, true);
- err = __hci_req_schedule_adv_instance(&req, instance, true);
+ enable_advertising_instance(hdev, err);
+}
- if (!err)
- err = hci_req_run(&req, enable_advertising_instance);
+static int set_adv_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_mode cp;
+ u8 val;
- if (err)
- bt_dev_err(hdev, "failed to re-configure advertising");
+ mutex_lock(&hdev->mgmt_pending_lock);
-unlock:
- hci_dev_unlock(hdev);
+ if (!__mgmt_pending_listed(hdev, cmd)) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
+ return -ECANCELED;
+ }
+
+ memcpy(&cp, cmd->param, sizeof(cp));
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ val = !!cp.val;
+
+ if (cp.val == 0x02)
+ hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
+ else
+ hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
+
+ cancel_adv_timeout(hdev);
+
+ if (val) {
+ /* Switch to instance "0" for the Set Advertising setting.
+ * We cannot use update_[adv|scan_rsp]_data() here as the
+ * HCI_ADVERTISING flag is not yet set.
+ */
+ hdev->cur_adv_instance = 0x00;
+
+ if (ext_adv_capable(hdev)) {
+ hci_start_ext_adv_sync(hdev, 0x00);
+ } else {
+ hci_update_adv_data_sync(hdev, 0x00);
+ hci_update_scan_rsp_data_sync(hdev, 0x00);
+ hci_enable_advertising_sync(hdev);
+ }
+ } else {
+ hci_disable_advertising_sync(hdev);
+ }
+
+ return 0;
}
static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data,
@@ -4307,11 +6512,10 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data,
{
struct mgmt_mode *cp = data;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
u8 val, status;
int err;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
status = mgmt_le_support(hdev);
if (status)
@@ -4322,6 +6526,10 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data,
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING,
MGMT_STATUS_INVALID_PARAMS);
+ if (hdev->advertising_paused)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_ADVERTISING,
+ MGMT_STATUS_BUSY);
+
hci_dev_lock(hdev);
val = !!cp->val;
@@ -4334,6 +6542,7 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data,
if (!hdev_is_powered(hdev) ||
(val == hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
(cp->val == 0x02) == hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) ||
+ hci_dev_test_flag(hdev, HCI_MESH) ||
hci_conn_num(hdev, LE_LINK) > 0 ||
(hci_dev_test_flag(hdev, HCI_LE_SCAN) &&
hdev->le_scan_type == LE_SCAN_ACTIVE)) {
@@ -4369,40 +6578,13 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data,
}
cmd = mgmt_pending_add(sk, MGMT_OP_SET_ADVERTISING, hdev, data, len);
- if (!cmd) {
+ if (!cmd)
err = -ENOMEM;
- goto unlock;
- }
-
- hci_req_init(&req, hdev);
-
- if (cp->val == 0x02)
- hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
else
- hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
+ err = hci_cmd_sync_queue(hdev, set_adv_sync, cmd,
+ set_advertising_complete);
- cancel_adv_timeout(hdev);
-
- if (val) {
- /* Switch to instance "0" for the Set Advertising setting.
- * We cannot use update_[adv|scan_rsp]_data() here as the
- * HCI_ADVERTISING flag is not yet set.
- */
- hdev->cur_adv_instance = 0x00;
-
- if (ext_adv_capable(hdev)) {
- __hci_req_start_ext_adv(&req, 0x00);
- } else {
- __hci_req_update_adv_data(&req, 0x00);
- __hci_req_update_scan_rsp_data(&req, 0x00);
- __hci_req_enable_advertising(&req);
- }
- } else {
- __hci_req_disable_advertising(&req);
- }
-
- err = hci_req_run(&req, set_advertising_complete);
- if (err < 0)
+ if (err < 0 && cmd)
mgmt_pending_remove(cmd);
unlock:
@@ -4416,7 +6598,7 @@ static int set_static_address(struct sock *sk, struct hci_dev *hdev,
struct mgmt_cp_set_static_address *cp = data;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!lmp_le_capable(hdev))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_STATIC_ADDRESS,
@@ -4461,12 +6643,13 @@ static int set_scan_params(struct sock *sk, struct hci_dev *hdev,
__u16 interval, window;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!lmp_le_capable(hdev))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS,
MGMT_STATUS_NOT_SUPPORTED);
+ /* Keep allowed ranges in sync with set_mesh() */
interval = __le16_to_cpu(cp->interval);
if (interval < 0x0004 || interval > 0x4000)
@@ -4495,38 +6678,23 @@ static int set_scan_params(struct sock *sk, struct hci_dev *hdev,
* loaded.
*/
if (hci_dev_test_flag(hdev, HCI_LE_SCAN) &&
- hdev->discovery.state == DISCOVERY_STOPPED) {
- struct hci_request req;
-
- hci_req_init(&req, hdev);
-
- hci_req_add_le_scan_disable(&req);
- hci_req_add_le_passive_scan(&req);
-
- hci_req_run(&req, NULL);
- }
+ hdev->discovery.state == DISCOVERY_STOPPED)
+ hci_update_passive_scan(hdev);
hci_dev_unlock(hdev);
return err;
}
-static void fast_connectable_complete(struct hci_dev *hdev, u8 status,
- u16 opcode)
+static void fast_connectable_complete(struct hci_dev *hdev, void *data, int err)
{
- struct mgmt_pending_cmd *cmd;
-
- BT_DBG("status 0x%02x", status);
-
- hci_dev_lock(hdev);
+ struct mgmt_pending_cmd *cmd = data;
- cmd = pending_find(MGMT_OP_SET_FAST_CONNECTABLE, hdev);
- if (!cmd)
- goto unlock;
+ bt_dev_dbg(hdev, "err %d", err);
- if (status) {
+ if (err) {
mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE,
- mgmt_status(status));
+ mgmt_status(err));
} else {
struct mgmt_mode *cp = cmd->param;
@@ -4539,10 +6707,15 @@ static void fast_connectable_complete(struct hci_dev *hdev, u8 status,
new_settings(hdev, cmd->sk);
}
- mgmt_pending_remove(cmd);
+ mgmt_pending_free(cmd);
+}
-unlock:
- hci_dev_unlock(hdev);
+static int write_fast_connectable_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_mode *cp = cmd->param;
+
+ return hci_write_fast_connectable_sync(hdev, cp->val);
}
static int set_fast_connectable(struct sock *sk, struct hci_dev *hdev,
@@ -4550,58 +6723,49 @@ static int set_fast_connectable(struct sock *sk, struct hci_dev *hdev,
{
struct mgmt_mode *cp = data;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) ||
hdev->hci_ver < BLUETOOTH_VER_1_2)
- return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE,
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_FAST_CONNECTABLE,
MGMT_STATUS_NOT_SUPPORTED);
if (cp->val != 0x00 && cp->val != 0x01)
- return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE,
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_FAST_CONNECTABLE,
MGMT_STATUS_INVALID_PARAMS);
hci_dev_lock(hdev);
- if (pending_find(MGMT_OP_SET_FAST_CONNECTABLE, hdev)) {
- err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE,
- MGMT_STATUS_BUSY);
- goto unlock;
- }
-
if (!!cp->val == hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE)) {
- err = send_settings_rsp(sk, MGMT_OP_SET_FAST_CONNECTABLE,
- hdev);
+ err = send_settings_rsp(sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev);
goto unlock;
}
if (!hdev_is_powered(hdev)) {
hci_dev_change_flag(hdev, HCI_FAST_CONNECTABLE);
- err = send_settings_rsp(sk, MGMT_OP_SET_FAST_CONNECTABLE,
- hdev);
+ err = send_settings_rsp(sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev);
new_settings(hdev, sk);
goto unlock;
}
- cmd = mgmt_pending_add(sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev,
- data, len);
- if (!cmd) {
+ cmd = mgmt_pending_new(sk, MGMT_OP_SET_FAST_CONNECTABLE, hdev, data,
+ len);
+ if (!cmd)
err = -ENOMEM;
- goto unlock;
- }
-
- hci_req_init(&req, hdev);
-
- __hci_req_write_fast_connectable(&req, cp->val);
+ else
+ err = hci_cmd_sync_queue(hdev, write_fast_connectable_sync, cmd,
+ fast_connectable_complete);
- err = hci_req_run(&req, fast_connectable_complete);
if (err < 0) {
- err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE,
- MGMT_STATUS_FAILED);
- mgmt_pending_remove(cmd);
+ mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_FAST_CONNECTABLE,
+ MGMT_STATUS_FAILED);
+
+ if (cmd)
+ mgmt_pending_free(cmd);
}
unlock:
@@ -4610,46 +6774,54 @@ unlock:
return err;
}
-static void set_bredr_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+static void set_bredr_complete(struct hci_dev *hdev, void *data, int err)
{
- struct mgmt_pending_cmd *cmd;
-
- BT_DBG("status 0x%02x", status);
-
- hci_dev_lock(hdev);
+ struct mgmt_pending_cmd *cmd = data;
- cmd = pending_find(MGMT_OP_SET_BREDR, hdev);
- if (!cmd)
- goto unlock;
+ bt_dev_dbg(hdev, "err %d", err);
- if (status) {
- u8 mgmt_err = mgmt_status(status);
+ if (err) {
+ u8 mgmt_err = mgmt_status(err);
/* We need to restore the flag if related HCI commands
* failed.
*/
hci_dev_clear_flag(hdev, HCI_BREDR_ENABLED);
- mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err);
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err);
} else {
send_settings_rsp(cmd->sk, MGMT_OP_SET_BREDR, hdev);
new_settings(hdev, cmd->sk);
}
- mgmt_pending_remove(cmd);
+ mgmt_pending_free(cmd);
+}
-unlock:
- hci_dev_unlock(hdev);
+static int set_bredr_sync(struct hci_dev *hdev, void *data)
+{
+ int status;
+
+ status = hci_write_fast_connectable_sync(hdev, false);
+
+ if (!status)
+ status = hci_update_scan_sync(hdev);
+
+ /* Since only the advertising data flags will change, there
+ * is no need to update the scan response data.
+ */
+ if (!status)
+ status = hci_update_adv_data_sync(hdev, hdev->cur_adv_instance);
+
+ return status;
}
static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
{
struct mgmt_mode *cp = data;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
int err;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!lmp_bredr_capable(hdev) || !lmp_le_capable(hdev))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
@@ -4676,7 +6848,6 @@ static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
hci_dev_clear_flag(hdev, HCI_SSP_ENABLED);
hci_dev_clear_flag(hdev, HCI_LINK_SECURITY);
hci_dev_clear_flag(hdev, HCI_FAST_CONNECTABLE);
- hci_dev_clear_flag(hdev, HCI_HS_ENABLED);
}
hci_dev_change_flag(hdev, HCI_BREDR_ENABLED);
@@ -4718,15 +6889,19 @@ static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
}
}
- if (pending_find(MGMT_OP_SET_BREDR, hdev)) {
- err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
- MGMT_STATUS_BUSY);
- goto unlock;
- }
-
- cmd = mgmt_pending_add(sk, MGMT_OP_SET_BREDR, hdev, data, len);
- if (!cmd) {
+ cmd = mgmt_pending_new(sk, MGMT_OP_SET_BREDR, hdev, data, len);
+ if (!cmd)
err = -ENOMEM;
+ else
+ err = hci_cmd_sync_queue(hdev, set_bredr_sync, cmd,
+ set_bredr_complete);
+
+ if (err < 0) {
+ mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR,
+ MGMT_STATUS_FAILED);
+ if (cmd)
+ mgmt_pending_free(cmd);
+
goto unlock;
}
@@ -4735,42 +6910,23 @@ static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
*/
hci_dev_set_flag(hdev, HCI_BREDR_ENABLED);
- hci_req_init(&req, hdev);
-
- __hci_req_write_fast_connectable(&req, false);
- __hci_req_update_scan(&req);
-
- /* Since only the advertising data flags will change, there
- * is no need to update the scan response data.
- */
- __hci_req_update_adv_data(&req, hdev->cur_adv_instance);
-
- err = hci_req_run(&req, set_bredr_complete);
- if (err < 0)
- mgmt_pending_remove(cmd);
-
unlock:
hci_dev_unlock(hdev);
return err;
}
-static void sc_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+static void set_secure_conn_complete(struct hci_dev *hdev, void *data, int err)
{
- struct mgmt_pending_cmd *cmd;
+ struct mgmt_pending_cmd *cmd = data;
struct mgmt_mode *cp;
- BT_DBG("%s status %u", hdev->name, status);
-
- hci_dev_lock(hdev);
+ bt_dev_dbg(hdev, "err %d", err);
- cmd = pending_find(MGMT_OP_SET_SECURE_CONN, hdev);
- if (!cmd)
- goto unlock;
+ if (err) {
+ u8 mgmt_err = mgmt_status(err);
- if (status) {
- mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode,
- mgmt_status(status));
- goto remove;
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err);
+ goto done;
}
cp = cmd->param;
@@ -4790,13 +6946,23 @@ static void sc_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode)
break;
}
- send_settings_rsp(cmd->sk, MGMT_OP_SET_SECURE_CONN, hdev);
+ send_settings_rsp(cmd->sk, cmd->opcode, hdev);
new_settings(hdev, cmd->sk);
-remove:
- mgmt_pending_remove(cmd);
-unlock:
- hci_dev_unlock(hdev);
+done:
+ mgmt_pending_free(cmd);
+}
+
+static int set_secure_conn_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_mode *cp = cmd->param;
+ u8 val = !!cp->val;
+
+ /* Force write of val */
+ hci_dev_set_flag(hdev, HCI_SC_ENABLED);
+
+ return hci_write_sc_support_sync(hdev, val);
}
static int set_secure_conn(struct sock *sk, struct hci_dev *hdev,
@@ -4804,11 +6970,10 @@ static int set_secure_conn(struct sock *sk, struct hci_dev *hdev,
{
struct mgmt_mode *cp = data;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
u8 val;
int err;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!lmp_sc_capable(hdev) &&
!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
@@ -4823,7 +6988,7 @@ static int set_secure_conn(struct sock *sk, struct hci_dev *hdev,
if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02)
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN,
- MGMT_STATUS_INVALID_PARAMS);
+ MGMT_STATUS_INVALID_PARAMS);
hci_dev_lock(hdev);
@@ -4854,12 +7019,6 @@ static int set_secure_conn(struct sock *sk, struct hci_dev *hdev,
goto failed;
}
- if (pending_find(MGMT_OP_SET_SECURE_CONN, hdev)) {
- err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN,
- MGMT_STATUS_BUSY);
- goto failed;
- }
-
val = !!cp->val;
if (val == hci_dev_test_flag(hdev, HCI_SC_ENABLED) &&
@@ -4868,18 +7027,18 @@ static int set_secure_conn(struct sock *sk, struct hci_dev *hdev,
goto failed;
}
- cmd = mgmt_pending_add(sk, MGMT_OP_SET_SECURE_CONN, hdev, data, len);
- if (!cmd) {
+ cmd = mgmt_pending_new(sk, MGMT_OP_SET_SECURE_CONN, hdev, data, len);
+ if (!cmd)
err = -ENOMEM;
- goto failed;
- }
+ else
+ err = hci_cmd_sync_queue(hdev, set_secure_conn_sync, cmd,
+ set_secure_conn_complete);
- hci_req_init(&req, hdev);
- hci_req_add(&req, HCI_OP_WRITE_SC_SUPPORT, 1, &val);
- err = hci_req_run(&req, sc_enable_complete);
if (err < 0) {
- mgmt_pending_remove(cmd);
- goto failed;
+ mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SECURE_CONN,
+ MGMT_STATUS_FAILED);
+ if (cmd)
+ mgmt_pending_free(cmd);
}
failed:
@@ -4894,7 +7053,7 @@ static int set_debug_keys(struct sock *sk, struct hci_dev *hdev,
bool changed, use_changed;
int err;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02)
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEBUG_KEYS,
@@ -4941,7 +7100,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data,
bool changed;
int err;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!lmp_le_capable(hdev))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY,
@@ -5016,7 +7175,7 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data,
u16 irk_count, expected_len;
int i, err;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!lmp_le_capable(hdev))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS,
@@ -5030,7 +7189,7 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data,
MGMT_STATUS_INVALID_PARAMS);
}
- expected_len = sizeof(*cp) + irk_count * sizeof(struct mgmt_irk_info);
+ expected_len = struct_size(cp, irks, irk_count);
if (expected_len != len) {
bt_dev_err(hdev, "load_irks: expected %u bytes, got %u bytes",
expected_len, len);
@@ -5038,7 +7197,7 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data,
MGMT_STATUS_INVALID_PARAMS);
}
- BT_DBG("%s irk_count %u", hdev->name, irk_count);
+ bt_dev_dbg(hdev, "irk_count %u", irk_count);
for (i = 0; i < irk_count; i++) {
struct mgmt_irk_info *key = &cp->irks[i];
@@ -5056,6 +7215,14 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data,
for (i = 0; i < irk_count; i++) {
struct mgmt_irk_info *irk = &cp->irks[i];
+ if (hci_is_blocked_key(hdev,
+ HCI_BLOCKED_KEY_TYPE_IRK,
+ irk->val)) {
+ bt_dev_warn(hdev, "Skipping blocked IRK for %pMR",
+ &irk->addr.bdaddr);
+ continue;
+ }
+
hci_add_irk(hdev, &irk->addr.bdaddr,
le_addr_type(irk->addr.type), irk->val,
BDADDR_ANY);
@@ -5072,7 +7239,7 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data,
static bool ltk_is_valid(struct mgmt_ltk_info *key)
{
- if (key->master != 0x00 && key->master != 0x01)
+ if (key->initiator != 0x00 && key->initiator != 0x01)
return false;
switch (key->addr.type) {
@@ -5098,7 +7265,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev,
u16 key_count, expected_len;
int i, err;
- BT_DBG("request for %s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!lmp_le_capable(hdev))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS,
@@ -5112,8 +7279,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev,
MGMT_STATUS_INVALID_PARAMS);
}
- expected_len = sizeof(*cp) + key_count *
- sizeof(struct mgmt_ltk_info);
+ expected_len = struct_size(cp, keys, key_count);
if (expected_len != len) {
bt_dev_err(hdev, "load_keys: expected %u bytes, got %u bytes",
expected_len, len);
@@ -5121,16 +7287,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev,
MGMT_STATUS_INVALID_PARAMS);
}
- BT_DBG("%s key_count %u", hdev->name, key_count);
-
- for (i = 0; i < key_count; i++) {
- struct mgmt_ltk_info *key = &cp->keys[i];
-
- if (!ltk_is_valid(key))
- return mgmt_cmd_status(sk, hdev->id,
- MGMT_OP_LOAD_LONG_TERM_KEYS,
- MGMT_STATUS_INVALID_PARAMS);
- }
+ bt_dev_dbg(hdev, "key_count %u", key_count);
hci_dev_lock(hdev);
@@ -5140,14 +7297,28 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev,
struct mgmt_ltk_info *key = &cp->keys[i];
u8 type, authenticated;
+ if (hci_is_blocked_key(hdev,
+ HCI_BLOCKED_KEY_TYPE_LTK,
+ key->val)) {
+ bt_dev_warn(hdev, "Skipping blocked LTK for %pMR",
+ &key->addr.bdaddr);
+ continue;
+ }
+
+ if (!ltk_is_valid(key)) {
+ bt_dev_warn(hdev, "Invalid LTK for %pMR",
+ &key->addr.bdaddr);
+ continue;
+ }
+
switch (key->type) {
case MGMT_LTK_UNAUTHENTICATED:
authenticated = 0x00;
- type = key->master ? SMP_LTK : SMP_LTK_SLAVE;
+ type = key->initiator ? SMP_LTK : SMP_LTK_RESPONDER;
break;
case MGMT_LTK_AUTHENTICATED:
authenticated = 0x01;
- type = key->master ? SMP_LTK : SMP_LTK_SLAVE;
+ type = key->initiator ? SMP_LTK : SMP_LTK_RESPONDER;
break;
case MGMT_LTK_P256_UNAUTH:
authenticated = 0x00;
@@ -5160,7 +7331,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev,
case MGMT_LTK_P256_DEBUG:
authenticated = 0x00;
type = SMP_LTK_P256_DEBUG;
- /* fall through */
+ fallthrough;
default:
continue;
}
@@ -5178,14 +7349,19 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev,
return err;
}
-static int conn_info_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
+static void get_conn_info_complete(struct hci_dev *hdev, void *data, int err)
{
+ struct mgmt_pending_cmd *cmd = data;
struct hci_conn *conn = cmd->user_data;
+ struct mgmt_cp_get_conn_info *cp = cmd->param;
struct mgmt_rp_get_conn_info rp;
- int err;
+ u8 status;
- memcpy(&rp.addr, cmd->param, sizeof(rp.addr));
+ bt_dev_dbg(hdev, "err %d", err);
+ memcpy(&rp.addr, &cp->addr, sizeof(rp.addr));
+
+ status = mgmt_status(err);
if (status == MGMT_STATUS_SUCCESS) {
rp.rssi = conn->rssi;
rp.tx_power = conn->tx_power;
@@ -5196,67 +7372,48 @@ static int conn_info_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
rp.max_tx_power = HCI_TX_POWER_INVALID;
}
- err = mgmt_cmd_complete(cmd->sk, cmd->index, MGMT_OP_GET_CONN_INFO,
- status, &rp, sizeof(rp));
-
- hci_conn_drop(conn);
- hci_conn_put(conn);
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, MGMT_OP_GET_CONN_INFO, status,
+ &rp, sizeof(rp));
- return err;
+ mgmt_pending_free(cmd);
}
-static void conn_info_refresh_complete(struct hci_dev *hdev, u8 hci_status,
- u16 opcode)
+static int get_conn_info_sync(struct hci_dev *hdev, void *data)
{
- struct hci_cp_read_rssi *cp;
- struct mgmt_pending_cmd *cmd;
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_get_conn_info *cp = cmd->param;
struct hci_conn *conn;
- u16 handle;
- u8 status;
-
- BT_DBG("status 0x%02x", hci_status);
+ int err;
+ __le16 handle;
- hci_dev_lock(hdev);
+ /* Make sure we are still connected */
+ if (cp->addr.type == BDADDR_BREDR)
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK,
+ &cp->addr.bdaddr);
+ else
+ conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, &cp->addr.bdaddr);
- /* Commands sent in request are either Read RSSI or Read Transmit Power
- * Level so we check which one was last sent to retrieve connection
- * handle. Both commands have handle as first parameter so it's safe to
- * cast data on the same command struct.
- *
- * First command sent is always Read RSSI and we fail only if it fails.
- * In other case we simply override error to indicate success as we
- * already remembered if TX power value is actually valid.
- */
- cp = hci_sent_cmd_data(hdev, HCI_OP_READ_RSSI);
- if (!cp) {
- cp = hci_sent_cmd_data(hdev, HCI_OP_READ_TX_POWER);
- status = MGMT_STATUS_SUCCESS;
- } else {
- status = mgmt_status(hci_status);
- }
+ if (!conn || conn->state != BT_CONNECTED)
+ return MGMT_STATUS_NOT_CONNECTED;
- if (!cp) {
- bt_dev_err(hdev, "invalid sent_cmd in conn_info response");
- goto unlock;
- }
+ cmd->user_data = conn;
+ handle = cpu_to_le16(conn->handle);
- handle = __le16_to_cpu(cp->handle);
- conn = hci_conn_hash_lookup_handle(hdev, handle);
- if (!conn) {
- bt_dev_err(hdev, "unknown handle (%d) in conn_info response",
- handle);
- goto unlock;
- }
+ /* Refresh RSSI each time */
+ err = hci_read_rssi_sync(hdev, handle);
- cmd = pending_find_data(MGMT_OP_GET_CONN_INFO, hdev, conn);
- if (!cmd)
- goto unlock;
+ /* For LE links TX power does not change thus we don't need to
+ * query for it once value is known.
+ */
+ if (!err && (!bdaddr_type_is_le(cp->addr.type) ||
+ conn->tx_power == HCI_TX_POWER_INVALID))
+ err = hci_read_tx_power_sync(hdev, handle, 0x00);
- cmd->cmd_complete(cmd, status);
- mgmt_pending_remove(cmd);
+ /* Max TX power needs to be read only once per connection */
+ if (!err && conn->max_tx_power == HCI_TX_POWER_INVALID)
+ err = hci_read_tx_power_sync(hdev, handle, 0x01);
-unlock:
- hci_dev_unlock(hdev);
+ return err;
}
static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data,
@@ -5268,7 +7425,7 @@ static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data,
unsigned long conn_info_age;
int err = 0;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
memset(&rp, 0, sizeof(rp));
bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
@@ -5301,18 +7458,11 @@ static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data,
goto unlock;
}
- if (pending_find_data(MGMT_OP_GET_CONN_INFO, hdev, conn)) {
- err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO,
- MGMT_STATUS_BUSY, &rp, sizeof(rp));
- goto unlock;
- }
-
/* To avoid client trying to guess when to poll again for information we
* calculate conn info age as random value between min/max set in hdev.
*/
- conn_info_age = hdev->conn_info_min_age +
- prandom_u32_max(hdev->conn_info_max_age -
- hdev->conn_info_min_age);
+ conn_info_age = get_random_u32_inclusive(hdev->conn_info_min_age,
+ hdev->conn_info_max_age - 1);
/* Query controller to refresh cached values if they are too old or were
* never read.
@@ -5320,50 +7470,27 @@ static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data,
if (time_after(jiffies, conn->conn_info_timestamp +
msecs_to_jiffies(conn_info_age)) ||
!conn->conn_info_timestamp) {
- struct hci_request req;
- struct hci_cp_read_tx_power req_txp_cp;
- struct hci_cp_read_rssi req_rssi_cp;
struct mgmt_pending_cmd *cmd;
- hci_req_init(&req, hdev);
- req_rssi_cp.handle = cpu_to_le16(conn->handle);
- hci_req_add(&req, HCI_OP_READ_RSSI, sizeof(req_rssi_cp),
- &req_rssi_cp);
-
- /* For LE links TX power does not change thus we don't need to
- * query for it once value is known.
- */
- if (!bdaddr_type_is_le(cp->addr.type) ||
- conn->tx_power == HCI_TX_POWER_INVALID) {
- req_txp_cp.handle = cpu_to_le16(conn->handle);
- req_txp_cp.type = 0x00;
- hci_req_add(&req, HCI_OP_READ_TX_POWER,
- sizeof(req_txp_cp), &req_txp_cp);
+ cmd = mgmt_pending_new(sk, MGMT_OP_GET_CONN_INFO, hdev, data,
+ len);
+ if (!cmd) {
+ err = -ENOMEM;
+ } else {
+ err = hci_cmd_sync_queue(hdev, get_conn_info_sync,
+ cmd, get_conn_info_complete);
}
- /* Max TX power needs to be read only once per connection */
- if (conn->max_tx_power == HCI_TX_POWER_INVALID) {
- req_txp_cp.handle = cpu_to_le16(conn->handle);
- req_txp_cp.type = 0x01;
- hci_req_add(&req, HCI_OP_READ_TX_POWER,
- sizeof(req_txp_cp), &req_txp_cp);
- }
+ if (err < 0) {
+ mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CONN_INFO,
+ MGMT_STATUS_FAILED, &rp, sizeof(rp));
- err = hci_req_run(&req, conn_info_refresh_complete);
- if (err < 0)
- goto unlock;
+ if (cmd)
+ mgmt_pending_free(cmd);
- cmd = mgmt_pending_add(sk, MGMT_OP_GET_CONN_INFO, hdev,
- data, len);
- if (!cmd) {
- err = -ENOMEM;
goto unlock;
}
- hci_conn_hold(conn);
- cmd->user_data = hci_conn_get(conn);
- cmd->cmd_complete = conn_info_cmd_complete;
-
conn->conn_info_timestamp = jiffies;
} else {
/* Cache is valid, just reply with values cached in hci_conn */
@@ -5380,24 +7507,24 @@ unlock:
return err;
}
-static int clock_info_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
+static void get_clock_info_complete(struct hci_dev *hdev, void *data, int err)
{
- struct hci_conn *conn = cmd->user_data;
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_get_clock_info *cp = cmd->param;
struct mgmt_rp_get_clock_info rp;
- struct hci_dev *hdev;
- int err;
+ struct hci_conn *conn = cmd->user_data;
+ u8 status = mgmt_status(err);
+
+ bt_dev_dbg(hdev, "err %d", err);
memset(&rp, 0, sizeof(rp));
- memcpy(&rp.addr, cmd->param, sizeof(rp.addr));
+ bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
+ rp.addr.type = cp->addr.type;
- if (status)
+ if (err)
goto complete;
- hdev = hci_dev_get(cmd->index);
- if (hdev) {
- rp.local_clock = cpu_to_le32(hdev->clock);
- hci_dev_put(hdev);
- }
+ rp.local_clock = cpu_to_le32(hdev->clock);
if (conn) {
rp.piconet_clock = cpu_to_le32(conn->clock);
@@ -5405,61 +7532,44 @@ static int clock_info_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status)
}
complete:
- err = mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, &rp,
- sizeof(rp));
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, status, &rp,
+ sizeof(rp));
- if (conn) {
- hci_conn_drop(conn);
- hci_conn_put(conn);
- }
-
- return err;
+ mgmt_pending_free(cmd);
}
-static void get_clock_info_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+static int get_clock_info_sync(struct hci_dev *hdev, void *data)
{
- struct hci_cp_read_clock *hci_cp;
- struct mgmt_pending_cmd *cmd;
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_get_clock_info *cp = cmd->param;
+ struct hci_cp_read_clock hci_cp;
struct hci_conn *conn;
- BT_DBG("%s status %u", hdev->name, status);
-
- hci_dev_lock(hdev);
-
- hci_cp = hci_sent_cmd_data(hdev, HCI_OP_READ_CLOCK);
- if (!hci_cp)
- goto unlock;
-
- if (hci_cp->which) {
- u16 handle = __le16_to_cpu(hci_cp->handle);
- conn = hci_conn_hash_lookup_handle(hdev, handle);
- } else {
- conn = NULL;
- }
+ memset(&hci_cp, 0, sizeof(hci_cp));
+ hci_read_clock_sync(hdev, &hci_cp);
- cmd = pending_find_data(MGMT_OP_GET_CLOCK_INFO, hdev, conn);
- if (!cmd)
- goto unlock;
+ /* Make sure connection still exists */
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->addr.bdaddr);
+ if (!conn || conn->state != BT_CONNECTED)
+ return MGMT_STATUS_NOT_CONNECTED;
- cmd->cmd_complete(cmd, mgmt_status(status));
- mgmt_pending_remove(cmd);
+ cmd->user_data = conn;
+ hci_cp.handle = cpu_to_le16(conn->handle);
+ hci_cp.which = 0x01; /* Piconet clock */
-unlock:
- hci_dev_unlock(hdev);
+ return hci_read_clock_sync(hdev, &hci_cp);
}
static int get_clock_info(struct sock *sk, struct hci_dev *hdev, void *data,
- u16 len)
+ u16 len)
{
struct mgmt_cp_get_clock_info *cp = data;
struct mgmt_rp_get_clock_info rp;
- struct hci_cp_read_clock hci_cp;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
struct hci_conn *conn;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
memset(&rp, 0, sizeof(rp));
bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr);
@@ -5493,119 +7603,79 @@ static int get_clock_info(struct sock *sk, struct hci_dev *hdev, void *data,
conn = NULL;
}
- cmd = mgmt_pending_add(sk, MGMT_OP_GET_CLOCK_INFO, hdev, data, len);
- if (!cmd) {
+ cmd = mgmt_pending_new(sk, MGMT_OP_GET_CLOCK_INFO, hdev, data, len);
+ if (!cmd)
err = -ENOMEM;
- goto unlock;
- }
-
- cmd->cmd_complete = clock_info_cmd_complete;
-
- hci_req_init(&req, hdev);
-
- memset(&hci_cp, 0, sizeof(hci_cp));
- hci_req_add(&req, HCI_OP_READ_CLOCK, sizeof(hci_cp), &hci_cp);
+ else
+ err = hci_cmd_sync_queue(hdev, get_clock_info_sync, cmd,
+ get_clock_info_complete);
- if (conn) {
- hci_conn_hold(conn);
- cmd->user_data = hci_conn_get(conn);
+ if (err < 0) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_CLOCK_INFO,
+ MGMT_STATUS_FAILED, &rp, sizeof(rp));
- hci_cp.handle = cpu_to_le16(conn->handle);
- hci_cp.which = 0x01; /* Piconet clock */
- hci_req_add(&req, HCI_OP_READ_CLOCK, sizeof(hci_cp), &hci_cp);
+ if (cmd)
+ mgmt_pending_free(cmd);
}
- err = hci_req_run(&req, get_clock_info_complete);
- if (err < 0)
- mgmt_pending_remove(cmd);
unlock:
hci_dev_unlock(hdev);
return err;
}
-static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type)
+static void device_added(struct sock *sk, struct hci_dev *hdev,
+ bdaddr_t *bdaddr, u8 type, u8 action)
{
- struct hci_conn *conn;
-
- conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, addr);
- if (!conn)
- return false;
-
- if (conn->dst_type != type)
- return false;
+ struct mgmt_ev_device_added ev;
- if (conn->state != BT_CONNECTED)
- return false;
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = type;
+ ev.action = action;
- return true;
+ mgmt_event(MGMT_EV_DEVICE_ADDED, hdev, &ev, sizeof(ev), sk);
}
-/* This function requires the caller holds hdev->lock */
-static int hci_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr,
- u8 addr_type, u8 auto_connect)
+static void add_device_complete(struct hci_dev *hdev, void *data, int err)
{
- struct hci_conn_params *params;
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_add_device *cp = cmd->param;
- params = hci_conn_params_add(hdev, addr, addr_type);
- if (!params)
- return -EIO;
-
- if (params->auto_connect == auto_connect)
- return 0;
+ if (!err) {
+ struct hci_conn_params *params;
- list_del_init(&params->action);
+ params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
+ le_addr_type(cp->addr.type));
- switch (auto_connect) {
- case HCI_AUTO_CONN_DISABLED:
- case HCI_AUTO_CONN_LINK_LOSS:
- /* If auto connect is being disabled when we're trying to
- * connect to device, keep connecting.
- */
- if (params->explicit_connect)
- list_add(&params->action, &hdev->pend_le_conns);
- break;
- case HCI_AUTO_CONN_REPORT:
- if (params->explicit_connect)
- list_add(&params->action, &hdev->pend_le_conns);
- else
- list_add(&params->action, &hdev->pend_le_reports);
- break;
- case HCI_AUTO_CONN_DIRECT:
- case HCI_AUTO_CONN_ALWAYS:
- if (!is_connected(hdev, addr, addr_type))
- list_add(&params->action, &hdev->pend_le_conns);
- break;
+ device_added(cmd->sk, hdev, &cp->addr.bdaddr, cp->addr.type,
+ cp->action);
+ device_flags_changed(NULL, hdev, &cp->addr.bdaddr,
+ cp->addr.type, hdev->conn_flags,
+ params ? params->flags : 0);
}
- params->auto_connect = auto_connect;
-
- BT_DBG("addr %pMR (type %u) auto_connect %u", addr, addr_type,
- auto_connect);
-
- return 0;
+ mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_ADD_DEVICE,
+ mgmt_status(err), &cp->addr, sizeof(cp->addr));
+ mgmt_pending_free(cmd);
}
-static void device_added(struct sock *sk, struct hci_dev *hdev,
- bdaddr_t *bdaddr, u8 type, u8 action)
+static int add_device_sync(struct hci_dev *hdev, void *data)
{
- struct mgmt_ev_device_added ev;
-
- bacpy(&ev.addr.bdaddr, bdaddr);
- ev.addr.type = type;
- ev.action = action;
-
- mgmt_event(MGMT_EV_DEVICE_ADDED, hdev, &ev, sizeof(ev), sk);
+ return hci_update_passive_scan_sync(hdev);
}
static int add_device(struct sock *sk, struct hci_dev *hdev,
void *data, u16 len)
{
+ struct mgmt_pending_cmd *cmd;
struct mgmt_cp_add_device *cp = data;
u8 auto_conn, addr_type;
+ struct hci_conn_params *params;
int err;
+ u32 current_flags = 0;
+ u32 supported_flags;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!bdaddr_type_is_valid(cp->addr.type) ||
!bacmp(&cp->addr.bdaddr, BDADDR_ANY))
@@ -5630,12 +7700,13 @@ static int add_device(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- err = hci_bdaddr_list_add(&hdev->whitelist, &cp->addr.bdaddr,
- cp->addr.type);
+ err = hci_bdaddr_list_add_with_flags(&hdev->accept_list,
+ &cp->addr.bdaddr,
+ cp->addr.type, 0);
if (err)
goto unlock;
- hci_req_update_scan(hdev);
+ hci_update_scan(hdev);
goto added;
}
@@ -5664,18 +7735,37 @@ static int add_device(struct sock *sk, struct hci_dev *hdev,
/* If the connection parameters don't exist for this device,
* they will be created and configured with defaults.
*/
- if (hci_conn_params_set(hdev, &cp->addr.bdaddr, addr_type,
- auto_conn) < 0) {
+ params = hci_conn_params_set(hdev, &cp->addr.bdaddr, addr_type,
+ auto_conn);
+ if (!params) {
err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE,
MGMT_STATUS_FAILED, &cp->addr,
sizeof(cp->addr));
goto unlock;
}
- hci_update_background_scan(hdev);
+ cmd = mgmt_pending_new(sk, MGMT_OP_ADD_DEVICE, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ err = hci_cmd_sync_queue(hdev, add_device_sync, cmd,
+ add_device_complete);
+ if (err < 0) {
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE,
+ MGMT_STATUS_FAILED, &cp->addr,
+ sizeof(cp->addr));
+ mgmt_pending_free(cmd);
+ }
+
+ goto unlock;
added:
device_added(sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action);
+ supported_flags = hdev->conn_flags;
+ device_flags_changed(NULL, hdev, &cp->addr.bdaddr, cp->addr.type,
+ supported_flags, current_flags);
err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE,
MGMT_STATUS_SUCCESS, &cp->addr,
@@ -5697,13 +7787,18 @@ static void device_removed(struct sock *sk, struct hci_dev *hdev,
mgmt_event(MGMT_EV_DEVICE_REMOVED, hdev, &ev, sizeof(ev), sk);
}
+static int remove_device_sync(struct hci_dev *hdev, void *data)
+{
+ return hci_update_passive_scan_sync(hdev);
+}
+
static int remove_device(struct sock *sk, struct hci_dev *hdev,
void *data, u16 len)
{
struct mgmt_cp_remove_device *cp = data;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
hci_dev_lock(hdev);
@@ -5720,7 +7815,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,
}
if (cp->addr.type == BDADDR_BREDR) {
- err = hci_bdaddr_list_del(&hdev->whitelist,
+ err = hci_bdaddr_list_del(&hdev->accept_list,
&cp->addr.bdaddr,
cp->addr.type);
if (err) {
@@ -5732,7 +7827,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- hci_req_update_scan(hdev);
+ hci_update_scan(hdev);
device_removed(sk, hdev, &cp->addr.bdaddr,
cp->addr.type);
@@ -5773,10 +7868,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- list_del(&params->action);
- list_del(&params->list);
- kfree(params);
- hci_update_background_scan(hdev);
+ hci_conn_params_free(params);
device_removed(sk, hdev, &cp->addr.bdaddr, cp->addr.type);
} else {
@@ -5791,13 +7883,13 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- list_for_each_entry_safe(b, btmp, &hdev->whitelist, list) {
+ list_for_each_entry_safe(b, btmp, &hdev->accept_list, list) {
device_removed(sk, hdev, &b->bdaddr, b->bdaddr_type);
list_del(&b->list);
kfree(b);
}
- hci_req_update_scan(hdev);
+ hci_update_scan(hdev);
list_for_each_entry_safe(p, tmp, &hdev->le_conn_params, list) {
if (p->auto_connect == HCI_AUTO_CONN_DISABLED)
@@ -5807,16 +7899,14 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,
p->auto_connect = HCI_AUTO_CONN_EXPLICIT;
continue;
}
- list_del(&p->action);
- list_del(&p->list);
- kfree(p);
+ hci_conn_params_free(p);
}
- BT_DBG("All LE connection parameters were removed");
-
- hci_update_background_scan(hdev);
+ bt_dev_dbg(hdev, "All LE connection parameters were removed");
}
+ hci_cmd_sync_queue(hdev, remove_device_sync, NULL, NULL);
+
complete:
err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_DEVICE,
MGMT_STATUS_SUCCESS, &cp->addr,
@@ -5826,6 +7916,18 @@ unlock:
return err;
}
+static int conn_update_sync(struct hci_dev *hdev, void *data)
+{
+ struct hci_conn_params *params = data;
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_le(hdev, &params->addr, params->addr_type);
+ if (!conn)
+ return -ECANCELED;
+
+ return hci_le_conn_update_sync(hdev, conn, params);
+}
+
static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data,
u16 len)
{
@@ -5847,8 +7949,7 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data,
MGMT_STATUS_INVALID_PARAMS);
}
- expected_len = sizeof(*cp) + param_count *
- sizeof(struct mgmt_conn_param);
+ expected_len = struct_size(cp, params, param_count);
if (expected_len != len) {
bt_dev_err(hdev, "load_conn_param: expected %u bytes, got %u bytes",
expected_len, len);
@@ -5856,20 +7957,22 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data,
MGMT_STATUS_INVALID_PARAMS);
}
- BT_DBG("%s param_count %u", hdev->name, param_count);
+ bt_dev_dbg(hdev, "param_count %u", param_count);
hci_dev_lock(hdev);
- hci_conn_params_clear_disabled(hdev);
+ if (param_count > 1)
+ hci_conn_params_clear_disabled(hdev);
for (i = 0; i < param_count; i++) {
struct mgmt_conn_param *param = &cp->params[i];
struct hci_conn_params *hci_param;
u16 min, max, latency, timeout;
+ bool update = false;
u8 addr_type;
- BT_DBG("Adding %pMR (type %u)", &param->addr.bdaddr,
- param->addr.type);
+ bt_dev_dbg(hdev, "Adding %pMR (type %u)", &param->addr.bdaddr,
+ param->addr.type);
if (param->addr.type == BDADDR_LE_PUBLIC) {
addr_type = ADDR_LE_DEV_PUBLIC;
@@ -5885,14 +7988,27 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data,
latency = le16_to_cpu(param->latency);
timeout = le16_to_cpu(param->timeout);
- BT_DBG("min 0x%04x max 0x%04x latency 0x%04x timeout 0x%04x",
- min, max, latency, timeout);
+ bt_dev_dbg(hdev, "min 0x%04x max 0x%04x latency 0x%04x timeout 0x%04x",
+ min, max, latency, timeout);
if (hci_check_conn_params(min, max, latency, timeout) < 0) {
bt_dev_err(hdev, "ignoring invalid connection parameters");
continue;
}
+ /* Detect when the loading is for an existing parameter then
+ * attempt to trigger the connection update procedure.
+ */
+ if (!i && param_count == 1) {
+ hci_param = hci_conn_params_lookup(hdev,
+ &param->addr.bdaddr,
+ addr_type);
+ if (hci_param)
+ update = true;
+ else
+ hci_conn_params_clear_disabled(hdev);
+ }
+
hci_param = hci_conn_params_add(hdev, &param->addr.bdaddr,
addr_type);
if (!hci_param) {
@@ -5904,6 +8020,25 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data,
hci_param->conn_max_interval = max;
hci_param->conn_latency = latency;
hci_param->supervision_timeout = timeout;
+
+ /* Check if we need to trigger a connection update */
+ if (update) {
+ struct hci_conn *conn;
+
+ /* Lookup for existing connection as central and check
+ * if parameters match and if they don't then trigger
+ * a connection update.
+ */
+ conn = hci_conn_hash_lookup_le(hdev, &hci_param->addr,
+ addr_type);
+ if (conn && conn->role == HCI_ROLE_MASTER &&
+ (conn->le_conn_min_interval != min ||
+ conn->le_conn_max_interval != max ||
+ conn->le_conn_latency != latency ||
+ conn->le_supv_timeout != timeout))
+ hci_cmd_sync_queue(hdev, conn_update_sync,
+ hci_param, NULL);
+ }
}
hci_dev_unlock(hdev);
@@ -5919,7 +8054,7 @@ static int set_external_config(struct sock *sk, struct hci_dev *hdev,
bool changed;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (hdev_is_powered(hdev))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG,
@@ -5929,7 +8064,7 @@ static int set_external_config(struct sock *sk, struct hci_dev *hdev,
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG,
MGMT_STATUS_INVALID_PARAMS);
- if (!test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks))
+ if (!hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG,
MGMT_STATUS_NOT_SUPPORTED);
@@ -5975,7 +8110,7 @@ static int set_public_address(struct sock *sk, struct hci_dev *hdev,
bool changed;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (hdev_is_powered(hdev))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS,
@@ -6020,21 +8155,27 @@ unlock:
return err;
}
-static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status,
- u16 opcode, struct sk_buff *skb)
+static void read_local_oob_ext_data_complete(struct hci_dev *hdev, void *data,
+ int err)
{
const struct mgmt_cp_read_local_oob_ext_data *mgmt_cp;
struct mgmt_rp_read_local_oob_ext_data *mgmt_rp;
u8 *h192, *r192, *h256, *r256;
- struct mgmt_pending_cmd *cmd;
+ struct mgmt_pending_cmd *cmd = data;
+ struct sk_buff *skb = cmd->skb;
+ u8 status = mgmt_status(err);
u16 eir_len;
- int err;
- BT_DBG("%s status %u", hdev->name, status);
+ if (!status) {
+ if (!skb)
+ status = MGMT_STATUS_FAILED;
+ else if (IS_ERR(skb))
+ status = mgmt_status(PTR_ERR(skb));
+ else
+ status = mgmt_status(skb->data[0]);
+ }
- cmd = pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev);
- if (!cmd)
- return;
+ bt_dev_dbg(hdev, "status %u", status);
mgmt_cp = cmd->param;
@@ -6046,7 +8187,7 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status,
r192 = NULL;
h256 = NULL;
r256 = NULL;
- } else if (opcode == HCI_OP_READ_LOCAL_OOB_DATA) {
+ } else if (!bredr_sc_enabled(hdev)) {
struct hci_rp_read_local_oob_data *rp;
if (skb->len != sizeof(*rp)) {
@@ -6091,7 +8232,7 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status,
if (!mgmt_rp)
goto done;
- if (status)
+ if (eir_len == 0)
goto send_rsp;
eir_len = eir_append_data(mgmt_rp->eir, 0, EIR_CLASS_OF_DEV,
@@ -6127,30 +8268,27 @@ send_rsp:
mgmt_rp, sizeof(*mgmt_rp) + eir_len,
HCI_MGMT_OOB_DATA_EVENTS, cmd->sk);
done:
+ if (skb && !IS_ERR(skb))
+ kfree_skb(skb);
+
kfree(mgmt_rp);
- mgmt_pending_remove(cmd);
+ mgmt_pending_free(cmd);
}
static int read_local_ssp_oob_req(struct hci_dev *hdev, struct sock *sk,
struct mgmt_cp_read_local_oob_ext_data *cp)
{
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
int err;
- cmd = mgmt_pending_add(sk, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev,
+ cmd = mgmt_pending_new(sk, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev,
cp, sizeof(*cp));
if (!cmd)
return -ENOMEM;
- hci_req_init(&req, hdev);
+ err = hci_cmd_sync_queue(hdev, read_local_oob_data_sync, cmd,
+ read_local_oob_ext_data_complete);
- if (bredr_sc_enabled(hdev))
- hci_req_add(&req, HCI_OP_READ_LOCAL_OOB_EXT_DATA, 0, NULL);
- else
- hci_req_add(&req, HCI_OP_READ_LOCAL_OOB_DATA, 0, NULL);
-
- err = hci_req_run_skb(&req, read_local_oob_ext_data_complete);
if (err < 0) {
mgmt_pending_remove(cmd);
return err;
@@ -6169,7 +8307,7 @@ static int read_local_oob_ext_data(struct sock *sk, struct hci_dev *hdev,
u8 status, flags, role, addr[7], hash[16], rand[16];
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (hdev_is_powered(hdev)) {
switch (cp->type) {
@@ -6202,6 +8340,11 @@ static int read_local_oob_ext_data(struct sock *sk, struct hci_dev *hdev,
if (!rp)
return -ENOMEM;
+ if (!status && !lmp_ssp_capable(hdev)) {
+ status = MGMT_STATUS_NOT_SUPPORTED;
+ eir_len = 0;
+ }
+
if (status)
goto complete;
@@ -6325,21 +8468,27 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev)
flags |= MGMT_ADV_FLAG_MANAGED_FLAGS;
flags |= MGMT_ADV_FLAG_APPEARANCE;
flags |= MGMT_ADV_FLAG_LOCAL_NAME;
+ flags |= MGMT_ADV_PARAM_DURATION;
+ flags |= MGMT_ADV_PARAM_TIMEOUT;
+ flags |= MGMT_ADV_PARAM_INTERVALS;
+ flags |= MGMT_ADV_PARAM_TX_POWER;
+ flags |= MGMT_ADV_PARAM_SCAN_RSP;
/* In extended adv TX_POWER returned from Set Adv Param
* will be always valid.
*/
- if ((hdev->adv_tx_power != HCI_TX_POWER_INVALID) ||
- ext_adv_capable(hdev))
+ if (hdev->adv_tx_power != HCI_TX_POWER_INVALID || ext_adv_capable(hdev))
flags |= MGMT_ADV_FLAG_TX_POWER;
if (ext_adv_capable(hdev)) {
flags |= MGMT_ADV_FLAG_SEC_1M;
+ flags |= MGMT_ADV_FLAG_HW_OFFLOAD;
+ flags |= MGMT_ADV_FLAG_CAN_SET_TX_POWER;
- if (hdev->le_features[1] & HCI_LE_PHY_2M)
+ if (le_2m_capable(hdev))
flags |= MGMT_ADV_FLAG_SEC_2M;
- if (hdev->le_features[1] & HCI_LE_PHY_CODED)
+ if (le_coded_capable(hdev))
flags |= MGMT_ADV_FLAG_SEC_CODED;
}
@@ -6356,7 +8505,7 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
u32 supported_flags;
u8 *instance;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!lmp_le_capable(hdev))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_ADV_FEATURES,
@@ -6374,15 +8523,21 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
supported_flags = get_supported_adv_flags(hdev);
rp->supported_flags = cpu_to_le32(supported_flags);
- rp->max_adv_data_len = HCI_MAX_AD_LENGTH;
- rp->max_scan_rsp_len = HCI_MAX_AD_LENGTH;
- rp->max_instances = HCI_MAX_ADV_INSTANCES;
+ rp->max_adv_data_len = max_adv_len(hdev);
+ rp->max_scan_rsp_len = max_adv_len(hdev);
+ rp->max_instances = hdev->le_num_of_adv_sets;
rp->num_instances = hdev->adv_instance_cnt;
instance = rp->instance;
list_for_each_entry(adv_instance, &hdev->adv_instances, list) {
- *instance = adv_instance->instance;
- instance++;
+ /* Only instances 1-le_num_of_adv_sets are externally visible */
+ if (adv_instance->instance <= hdev->adv_instance_cnt) {
+ *instance = adv_instance->instance;
+ instance++;
+ } else {
+ rp->num_instances--;
+ rp_len--;
+ }
}
hci_dev_unlock(hdev);
@@ -6397,15 +8552,15 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
static u8 calculate_name_len(struct hci_dev *hdev)
{
- u8 buf[HCI_MAX_SHORT_NAME_LENGTH + 3];
+ u8 buf[HCI_MAX_SHORT_NAME_LENGTH + 2]; /* len + type + name */
- return append_local_name(hdev, buf, 0);
+ return eir_append_local_name(hdev, buf, 0);
}
static u8 tlv_data_max_len(struct hci_dev *hdev, u32 adv_flags,
bool is_adv_data)
{
- u8 max_len = HCI_MAX_AD_LENGTH;
+ u8 max_len = max_adv_len(hdev);
if (is_adv_data) {
if (adv_flags & (MGMT_ADV_FLAG_DISCOV |
@@ -6460,9 +8615,12 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data,
return false;
/* Make sure that the data is correctly formatted. */
- for (i = 0, cur_len = 0; i < len; i += (cur_len + 1)) {
+ for (i = 0; i < len; i += (cur_len + 1)) {
cur_len = data[i];
+ if (!cur_len)
+ continue;
+
if (data[i + 1] == EIR_FLAGS &&
(!is_adv_data || flags_managed(adv_flags)))
return false;
@@ -6490,56 +8648,87 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data,
return true;
}
-static void add_advertising_complete(struct hci_dev *hdev, u8 status,
- u16 opcode)
+static bool requested_adv_flags_are_valid(struct hci_dev *hdev, u32 adv_flags)
{
- struct mgmt_pending_cmd *cmd;
- struct mgmt_cp_add_advertising *cp;
- struct mgmt_rp_add_advertising rp;
- struct adv_info *adv_instance, *n;
- u8 instance;
+ u32 supported_flags, phy_flags;
- BT_DBG("status %d", status);
+ /* The current implementation only supports a subset of the specified
+ * flags. Also need to check mutual exclusiveness of sec flags.
+ */
+ supported_flags = get_supported_adv_flags(hdev);
+ phy_flags = adv_flags & MGMT_ADV_FLAG_SEC_MASK;
+ if (adv_flags & ~supported_flags ||
+ ((phy_flags && (phy_flags ^ (phy_flags & -phy_flags)))))
+ return false;
+
+ return true;
+}
+
+static bool adv_busy(struct hci_dev *hdev)
+{
+ return pending_find(MGMT_OP_SET_LE, hdev);
+}
+
+static void add_adv_complete(struct hci_dev *hdev, struct sock *sk, u8 instance,
+ int err)
+{
+ struct adv_info *adv, *n;
+
+ bt_dev_dbg(hdev, "err %d", err);
hci_dev_lock(hdev);
- cmd = pending_find(MGMT_OP_ADD_ADVERTISING, hdev);
+ list_for_each_entry_safe(adv, n, &hdev->adv_instances, list) {
+ u8 instance;
- list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) {
- if (!adv_instance->pending)
+ if (!adv->pending)
continue;
- if (!status) {
- adv_instance->pending = false;
+ if (!err) {
+ adv->pending = false;
continue;
}
- instance = adv_instance->instance;
+ instance = adv->instance;
if (hdev->cur_adv_instance == instance)
cancel_adv_timeout(hdev);
hci_remove_adv_instance(hdev, instance);
- mgmt_advertising_removed(cmd ? cmd->sk : NULL, hdev, instance);
+ mgmt_advertising_removed(sk, hdev, instance);
}
- if (!cmd)
- goto unlock;
+ hci_dev_unlock(hdev);
+}
+
+static void add_advertising_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_add_advertising *cp = cmd->param;
+ struct mgmt_rp_add_advertising rp;
+
+ memset(&rp, 0, sizeof(rp));
- cp = cmd->param;
rp.instance = cp->instance;
- if (status)
- mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode,
- mgmt_status(status));
+ if (err)
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(err));
else
- mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode,
- mgmt_status(status), &rp, sizeof(rp));
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(err), &rp, sizeof(rp));
- mgmt_pending_remove(cmd);
+ add_adv_complete(hdev, cmd->sk, cp->instance, err);
-unlock:
- hci_dev_unlock(hdev);
+ mgmt_pending_free(cmd);
+}
+
+static int add_advertising_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_add_advertising *cp = cmd->param;
+
+ return hci_schedule_adv_instance_sync(hdev, cp->instance, true);
}
static int add_advertising(struct sock *sk, struct hci_dev *hdev,
@@ -6548,24 +8737,22 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
struct mgmt_cp_add_advertising *cp = data;
struct mgmt_rp_add_advertising rp;
u32 flags;
- u32 supported_flags, phy_flags;
u8 status;
u16 timeout, duration;
- unsigned int prev_instance_cnt = hdev->adv_instance_cnt;
+ unsigned int prev_instance_cnt;
u8 schedule_instance = 0;
- struct adv_info *next_instance;
+ struct adv_info *adv, *next_instance;
int err;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
status = mgmt_le_support(hdev);
if (status)
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
status);
- if (cp->instance < 1 || cp->instance > HCI_MAX_ADV_INSTANCES)
+ if (cp->instance < 1 || cp->instance > hdev->le_num_of_adv_sets)
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
MGMT_STATUS_INVALID_PARAMS);
@@ -6577,13 +8764,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
timeout = __le16_to_cpu(cp->timeout);
duration = __le16_to_cpu(cp->duration);
- /* The current implementation only supports a subset of the specified
- * flags. Also need to check mutual exclusiveness of sec flags.
- */
- supported_flags = get_supported_adv_flags(hdev);
- phy_flags = flags & MGMT_ADV_FLAG_SEC_MASK;
- if (flags & ~supported_flags ||
- ((phy_flags && (phy_flags ^ (phy_flags & -phy_flags)))))
+ if (!requested_adv_flags_are_valid(hdev, flags))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
MGMT_STATUS_INVALID_PARAMS);
@@ -6595,9 +8776,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- if (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) ||
- pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) ||
- pending_find(MGMT_OP_SET_LE, hdev)) {
+ if (adv_busy(hdev)) {
err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
MGMT_STATUS_BUSY);
goto unlock;
@@ -6611,12 +8790,17 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- err = hci_add_adv_instance(hdev, cp->instance, flags,
+ prev_instance_cnt = hdev->adv_instance_cnt;
+
+ adv = hci_add_adv_instance(hdev, cp->instance, flags,
cp->adv_data_len, cp->data,
cp->scan_rsp_len,
cp->data + cp->adv_data_len,
- timeout, duration);
- if (err < 0) {
+ timeout, duration,
+ HCI_ADV_TX_POWER_NO_PREFERENCE,
+ hdev->le_adv_min_interval,
+ hdev->le_adv_max_interval, 0);
+ if (IS_ERR(adv)) {
err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
MGMT_STATUS_FAILED);
goto unlock;
@@ -6662,22 +8846,193 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
/* We're good to go, update advertising data, parameters, and start
* advertising.
*/
- cmd = mgmt_pending_add(sk, MGMT_OP_ADD_ADVERTISING, hdev, data,
+ cmd = mgmt_pending_new(sk, MGMT_OP_ADD_ADVERTISING, hdev, data,
data_len);
if (!cmd) {
err = -ENOMEM;
goto unlock;
}
- hci_req_init(&req, hdev);
+ cp->instance = schedule_instance;
- err = __hci_req_schedule_adv_instance(&req, schedule_instance, true);
+ err = hci_cmd_sync_queue(hdev, add_advertising_sync, cmd,
+ add_advertising_complete);
+ if (err < 0)
+ mgmt_pending_free(cmd);
- if (!err)
- err = hci_req_run(&req, add_advertising_complete);
+unlock:
+ hci_dev_unlock(hdev);
- if (err < 0)
- mgmt_pending_remove(cmd);
+ return err;
+}
+
+static void add_ext_adv_params_complete(struct hci_dev *hdev, void *data,
+ int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_add_ext_adv_params *cp = cmd->param;
+ struct mgmt_rp_add_ext_adv_params rp;
+ struct adv_info *adv;
+ u32 flags;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ adv = hci_find_adv_instance(hdev, cp->instance);
+ if (!adv)
+ goto unlock;
+
+ rp.instance = cp->instance;
+ rp.tx_power = adv->tx_power;
+
+ /* While we're at it, inform userspace of the available space for this
+ * advertisement, given the flags that will be used.
+ */
+ flags = __le32_to_cpu(cp->flags);
+ rp.max_adv_data_len = tlv_data_max_len(hdev, flags, true);
+ rp.max_scan_rsp_len = tlv_data_max_len(hdev, flags, false);
+
+ if (err) {
+ /* If this advertisement was previously advertising and we
+ * failed to update it, we signal that it has been removed and
+ * delete its structure
+ */
+ if (!adv->pending)
+ mgmt_advertising_removed(cmd->sk, hdev, cp->instance);
+
+ hci_remove_adv_instance(hdev, cp->instance);
+
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(err));
+ } else {
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(err), &rp, sizeof(rp));
+ }
+
+unlock:
+ mgmt_pending_free(cmd);
+
+ hci_dev_unlock(hdev);
+}
+
+static int add_ext_adv_params_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_add_ext_adv_params *cp = cmd->param;
+
+ return hci_setup_ext_adv_instance_sync(hdev, cp->instance);
+}
+
+static int add_ext_adv_params(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_cp_add_ext_adv_params *cp = data;
+ struct mgmt_rp_add_ext_adv_params rp;
+ struct mgmt_pending_cmd *cmd = NULL;
+ struct adv_info *adv;
+ u32 flags, min_interval, max_interval;
+ u16 timeout, duration;
+ u8 status;
+ s8 tx_power;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ status = mgmt_le_support(hdev);
+ if (status)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ status);
+
+ if (cp->instance < 1 || cp->instance > hdev->le_num_of_adv_sets)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ /* The purpose of breaking add_advertising into two separate MGMT calls
+ * for params and data is to allow more parameters to be added to this
+ * structure in the future. For this reason, we verify that we have the
+ * bare minimum structure we know of when the interface was defined. Any
+ * extra parameters we don't know about will be ignored in this request.
+ */
+ if (data_len < MGMT_ADD_EXT_ADV_PARAMS_MIN_SIZE)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ flags = __le32_to_cpu(cp->flags);
+
+ if (!requested_adv_flags_are_valid(hdev, flags))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ hci_dev_lock(hdev);
+
+ /* In new interface, we require that we are powered to register */
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_REJECTED);
+ goto unlock;
+ }
+
+ if (adv_busy(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ /* Parse defined parameters from request, use defaults otherwise */
+ timeout = (flags & MGMT_ADV_PARAM_TIMEOUT) ?
+ __le16_to_cpu(cp->timeout) : 0;
+
+ duration = (flags & MGMT_ADV_PARAM_DURATION) ?
+ __le16_to_cpu(cp->duration) :
+ hdev->def_multi_adv_rotation_duration;
+
+ min_interval = (flags & MGMT_ADV_PARAM_INTERVALS) ?
+ __le32_to_cpu(cp->min_interval) :
+ hdev->le_adv_min_interval;
+
+ max_interval = (flags & MGMT_ADV_PARAM_INTERVALS) ?
+ __le32_to_cpu(cp->max_interval) :
+ hdev->le_adv_max_interval;
+
+ tx_power = (flags & MGMT_ADV_PARAM_TX_POWER) ?
+ cp->tx_power :
+ HCI_ADV_TX_POWER_NO_PREFERENCE;
+
+ /* Create advertising instance with no advertising or response data */
+ adv = hci_add_adv_instance(hdev, cp->instance, flags, 0, NULL, 0, NULL,
+ timeout, duration, tx_power, min_interval,
+ max_interval, 0);
+
+ if (IS_ERR(adv)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_FAILED);
+ goto unlock;
+ }
+
+ /* Submit request for advertising params if ext adv available */
+ if (ext_adv_capable(hdev)) {
+ cmd = mgmt_pending_new(sk, MGMT_OP_ADD_EXT_ADV_PARAMS, hdev,
+ data, data_len);
+ if (!cmd) {
+ err = -ENOMEM;
+ hci_remove_adv_instance(hdev, cp->instance);
+ goto unlock;
+ }
+
+ err = hci_cmd_sync_queue(hdev, add_ext_adv_params_sync, cmd,
+ add_ext_adv_params_complete);
+ if (err < 0)
+ mgmt_pending_free(cmd);
+ } else {
+ rp.instance = cp->instance;
+ rp.tx_power = HCI_ADV_TX_POWER_NO_PREFERENCE;
+ rp.max_adv_data_len = tlv_data_max_len(hdev, flags, true);
+ rp.max_scan_rsp_len = tlv_data_max_len(hdev, flags, false);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_ADD_EXT_ADV_PARAMS,
+ MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
+ }
unlock:
hci_dev_unlock(hdev);
@@ -6685,46 +9040,214 @@ unlock:
return err;
}
-static void remove_advertising_complete(struct hci_dev *hdev, u8 status,
- u16 opcode)
+static void add_ext_adv_data_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_add_ext_adv_data *cp = cmd->param;
+ struct mgmt_rp_add_advertising rp;
+
+ add_adv_complete(hdev, cmd->sk, cp->instance, err);
+
+ memset(&rp, 0, sizeof(rp));
+
+ rp.instance = cp->instance;
+
+ if (err)
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(err));
+ else
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(err), &rp, sizeof(rp));
+
+ mgmt_pending_free(cmd);
+}
+
+static int add_ext_adv_data_sync(struct hci_dev *hdev, void *data)
{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_add_ext_adv_data *cp = cmd->param;
+ int err;
+
+ if (ext_adv_capable(hdev)) {
+ err = hci_update_adv_data_sync(hdev, cp->instance);
+ if (err)
+ return err;
+
+ err = hci_update_scan_rsp_data_sync(hdev, cp->instance);
+ if (err)
+ return err;
+
+ return hci_enable_ext_advertising_sync(hdev, cp->instance);
+ }
+
+ return hci_schedule_adv_instance_sync(hdev, cp->instance, true);
+}
+
+static int add_ext_adv_data(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ struct mgmt_cp_add_ext_adv_data *cp = data;
+ struct mgmt_rp_add_ext_adv_data rp;
+ u8 schedule_instance = 0;
+ struct adv_info *next_instance;
+ struct adv_info *adv_instance;
+ int err = 0;
struct mgmt_pending_cmd *cmd;
- struct mgmt_cp_remove_advertising *cp;
- struct mgmt_rp_remove_advertising rp;
- BT_DBG("status %d", status);
+ BT_DBG("%s", hdev->name);
hci_dev_lock(hdev);
- /* A failure status here only means that we failed to disable
- * advertising. Otherwise, the advertising instance has been removed,
- * so report success.
+ adv_instance = hci_find_adv_instance(hdev, cp->instance);
+
+ if (!adv_instance) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto unlock;
+ }
+
+ /* In new interface, we require that we are powered to register */
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_REJECTED);
+ goto clear_new_instance;
+ }
+
+ if (adv_busy(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_BUSY);
+ goto clear_new_instance;
+ }
+
+ /* Validate new data */
+ if (!tlv_data_is_valid(hdev, adv_instance->flags, cp->data,
+ cp->adv_data_len, true) ||
+ !tlv_data_is_valid(hdev, adv_instance->flags, cp->data +
+ cp->adv_data_len, cp->scan_rsp_len, false)) {
+ err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_INVALID_PARAMS);
+ goto clear_new_instance;
+ }
+
+ /* Set the data in the advertising instance */
+ hci_set_adv_instance_data(hdev, cp->instance, cp->adv_data_len,
+ cp->data, cp->scan_rsp_len,
+ cp->data + cp->adv_data_len);
+
+ /* If using software rotation, determine next instance to use */
+ if (hdev->cur_adv_instance == cp->instance) {
+ /* If the currently advertised instance is being changed
+ * then cancel the current advertising and schedule the
+ * next instance. If there is only one instance then the
+ * overridden advertising data will be visible right
+ * away
+ */
+ cancel_adv_timeout(hdev);
+
+ next_instance = hci_get_next_instance(hdev, cp->instance);
+ if (next_instance)
+ schedule_instance = next_instance->instance;
+ } else if (!hdev->adv_instance_timeout) {
+ /* Immediately advertise the new instance if no other
+ * instance is currently being advertised.
+ */
+ schedule_instance = cp->instance;
+ }
+
+ /* If the HCI_ADVERTISING flag is set or there is no instance to
+ * be advertised then we have no HCI communication to make.
+ * Simply return.
*/
- cmd = pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev);
- if (!cmd)
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || !schedule_instance) {
+ if (adv_instance->pending) {
+ mgmt_advertising_added(sk, hdev, cp->instance);
+ adv_instance->pending = false;
+ }
+ rp.instance = cp->instance;
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_DATA,
+ MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
goto unlock;
+ }
- cp = cmd->param;
- rp.instance = cp->instance;
+ cmd = mgmt_pending_new(sk, MGMT_OP_ADD_EXT_ADV_DATA, hdev, data,
+ data_len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto clear_new_instance;
+ }
- mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, MGMT_STATUS_SUCCESS,
- &rp, sizeof(rp));
- mgmt_pending_remove(cmd);
+ err = hci_cmd_sync_queue(hdev, add_ext_adv_data_sync, cmd,
+ add_ext_adv_data_complete);
+ if (err < 0) {
+ mgmt_pending_free(cmd);
+ goto clear_new_instance;
+ }
+
+ /* We were successful in updating data, so trigger advertising_added
+ * event if this is an instance that wasn't previously advertising. If
+ * a failure occurs in the requests we initiated, we will remove the
+ * instance again in add_advertising_complete
+ */
+ if (adv_instance->pending)
+ mgmt_advertising_added(sk, hdev, cp->instance);
+
+ goto unlock;
+
+clear_new_instance:
+ hci_remove_adv_instance(hdev, cp->instance);
unlock:
hci_dev_unlock(hdev);
+
+ return err;
+}
+
+static void remove_advertising_complete(struct hci_dev *hdev, void *data,
+ int err)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_remove_advertising *cp = cmd->param;
+ struct mgmt_rp_remove_advertising rp;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ memset(&rp, 0, sizeof(rp));
+ rp.instance = cp->instance;
+
+ if (err)
+ mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode,
+ mgmt_status(err));
+ else
+ mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode,
+ MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
+
+ mgmt_pending_free(cmd);
+}
+
+static int remove_advertising_sync(struct hci_dev *hdev, void *data)
+{
+ struct mgmt_pending_cmd *cmd = data;
+ struct mgmt_cp_remove_advertising *cp = cmd->param;
+ int err;
+
+ err = hci_remove_advertising_sync(hdev, cmd->sk, cp->instance, true);
+ if (err)
+ return err;
+
+ if (list_empty(&hdev->adv_instances))
+ err = hci_disable_advertising_sync(hdev);
+
+ return err;
}
static int remove_advertising(struct sock *sk, struct hci_dev *hdev,
void *data, u16 data_len)
{
struct mgmt_cp_remove_advertising *cp = data;
- struct mgmt_rp_remove_advertising rp;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
hci_dev_lock(hdev);
@@ -6735,9 +9258,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- if (pending_find(MGMT_OP_ADD_ADVERTISING, hdev) ||
- pending_find(MGMT_OP_REMOVE_ADVERTISING, hdev) ||
- pending_find(MGMT_OP_SET_LE, hdev)) {
+ if (pending_find(MGMT_OP_SET_LE, hdev)) {
err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING,
MGMT_STATUS_BUSY);
goto unlock;
@@ -6749,38 +9270,17 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- hci_req_init(&req, hdev);
-
- hci_req_clear_adv_instance(hdev, sk, &req, cp->instance, true);
-
- if (list_empty(&hdev->adv_instances))
- __hci_req_disable_advertising(&req);
-
- /* If no HCI commands have been collected so far or the HCI_ADVERTISING
- * flag is set or the device isn't powered then we have no HCI
- * communication to make. Simply return.
- */
- if (skb_queue_empty(&req.cmd_q) ||
- !hdev_is_powered(hdev) ||
- hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
- hci_req_purge(&req);
- rp.instance = cp->instance;
- err = mgmt_cmd_complete(sk, hdev->id,
- MGMT_OP_REMOVE_ADVERTISING,
- MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
- goto unlock;
- }
-
- cmd = mgmt_pending_add(sk, MGMT_OP_REMOVE_ADVERTISING, hdev, data,
+ cmd = mgmt_pending_new(sk, MGMT_OP_REMOVE_ADVERTISING, hdev, data,
data_len);
if (!cmd) {
err = -ENOMEM;
goto unlock;
}
- err = hci_req_run(&req, remove_advertising_complete);
+ err = hci_cmd_sync_queue(hdev, remove_advertising_sync, cmd,
+ remove_advertising_complete);
if (err < 0)
- mgmt_pending_remove(cmd);
+ mgmt_pending_free(cmd);
unlock:
hci_dev_unlock(hdev);
@@ -6794,15 +9294,14 @@ static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev,
struct mgmt_cp_get_adv_size_info *cp = data;
struct mgmt_rp_get_adv_size_info rp;
u32 flags, supported_flags;
- int err;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "sock %p", sk);
if (!lmp_le_capable(hdev))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
MGMT_STATUS_REJECTED);
- if (cp->instance < 1 || cp->instance > HCI_MAX_ADV_INSTANCES)
+ if (cp->instance < 1 || cp->instance > hdev->le_num_of_adv_sets)
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
MGMT_STATUS_INVALID_PARAMS);
@@ -6821,10 +9320,8 @@ static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev,
rp.max_adv_data_len = tlv_data_max_len(hdev, flags, true);
rp.max_scan_rsp_len = tlv_data_max_len(hdev, flags, false);
- err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
- MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
-
- return err;
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
+ MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
}
static const struct hci_mgmt_handler mgmt_handlers[] = {
@@ -6921,32 +9418,62 @@ static const struct hci_mgmt_handler mgmt_handlers[] = {
{ set_appearance, MGMT_SET_APPEARANCE_SIZE },
{ get_phy_configuration, MGMT_GET_PHY_CONFIGURATION_SIZE },
{ set_phy_configuration, MGMT_SET_PHY_CONFIGURATION_SIZE },
+ { set_blocked_keys, MGMT_OP_SET_BLOCKED_KEYS_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { set_wideband_speech, MGMT_SETTING_SIZE },
+ { read_controller_cap, MGMT_READ_CONTROLLER_CAP_SIZE,
+ HCI_MGMT_UNTRUSTED },
+ { read_exp_features_info, MGMT_READ_EXP_FEATURES_INFO_SIZE,
+ HCI_MGMT_UNTRUSTED |
+ HCI_MGMT_HDEV_OPTIONAL },
+ { set_exp_feature, MGMT_SET_EXP_FEATURE_SIZE,
+ HCI_MGMT_VAR_LEN |
+ HCI_MGMT_HDEV_OPTIONAL },
+ { read_def_system_config, MGMT_READ_DEF_SYSTEM_CONFIG_SIZE,
+ HCI_MGMT_UNTRUSTED },
+ { set_def_system_config, MGMT_SET_DEF_SYSTEM_CONFIG_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { read_def_runtime_config, MGMT_READ_DEF_RUNTIME_CONFIG_SIZE,
+ HCI_MGMT_UNTRUSTED },
+ { set_def_runtime_config, MGMT_SET_DEF_RUNTIME_CONFIG_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { get_device_flags, MGMT_GET_DEVICE_FLAGS_SIZE },
+ { set_device_flags, MGMT_SET_DEVICE_FLAGS_SIZE },
+ { read_adv_mon_features, MGMT_READ_ADV_MONITOR_FEATURES_SIZE },
+ { add_adv_patterns_monitor,MGMT_ADD_ADV_PATTERNS_MONITOR_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { remove_adv_monitor, MGMT_REMOVE_ADV_MONITOR_SIZE },
+ { add_ext_adv_params, MGMT_ADD_EXT_ADV_PARAMS_MIN_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { add_ext_adv_data, MGMT_ADD_EXT_ADV_DATA_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { add_adv_patterns_monitor_rssi,
+ MGMT_ADD_ADV_PATTERNS_MONITOR_RSSI_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { set_mesh, MGMT_SET_MESH_RECEIVER_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { mesh_features, MGMT_MESH_READ_FEATURES_SIZE },
+ { mesh_send, MGMT_MESH_SEND_SIZE,
+ HCI_MGMT_VAR_LEN },
+ { mesh_send_cancel, MGMT_MESH_SEND_CANCEL_SIZE },
+ { mgmt_hci_cmd_sync, MGMT_HCI_CMD_SYNC_SIZE, HCI_MGMT_VAR_LEN },
};
void mgmt_index_added(struct hci_dev *hdev)
{
struct mgmt_ev_ext_index ev;
- if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
+ if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE))
return;
- switch (hdev->dev_type) {
- case HCI_PRIMARY:
- if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
- mgmt_index_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev,
- NULL, 0, HCI_MGMT_UNCONF_INDEX_EVENTS);
- ev.type = 0x01;
- } else {
- mgmt_index_event(MGMT_EV_INDEX_ADDED, hdev, NULL, 0,
- HCI_MGMT_INDEX_EVENTS);
- ev.type = 0x00;
- }
- break;
- case HCI_AMP:
- ev.type = 0x02;
- break;
- default:
- return;
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ mgmt_index_event(MGMT_EV_UNCONF_INDEX_ADDED, hdev, NULL, 0,
+ HCI_MGMT_UNCONF_INDEX_EVENTS);
+ ev.type = 0x01;
+ } else {
+ mgmt_index_event(MGMT_EV_INDEX_ADDED, hdev, NULL, 0,
+ HCI_MGMT_INDEX_EVENTS);
+ ev.type = 0x00;
}
ev.bus = hdev->bus;
@@ -6958,77 +9485,52 @@ void mgmt_index_added(struct hci_dev *hdev)
void mgmt_index_removed(struct hci_dev *hdev)
{
struct mgmt_ev_ext_index ev;
- u8 status = MGMT_STATUS_INVALID_INDEX;
+ struct cmd_lookup match = { NULL, hdev, MGMT_STATUS_INVALID_INDEX };
- if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
+ if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE))
return;
- switch (hdev->dev_type) {
- case HCI_PRIMARY:
- mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status);
+ mgmt_pending_foreach(0, hdev, true, cmd_complete_rsp, &match);
- if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
- mgmt_index_event(MGMT_EV_UNCONF_INDEX_REMOVED, hdev,
- NULL, 0, HCI_MGMT_UNCONF_INDEX_EVENTS);
- ev.type = 0x01;
- } else {
- mgmt_index_event(MGMT_EV_INDEX_REMOVED, hdev, NULL, 0,
- HCI_MGMT_INDEX_EVENTS);
- ev.type = 0x00;
- }
- break;
- case HCI_AMP:
- ev.type = 0x02;
- break;
- default:
- return;
+ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
+ mgmt_index_event(MGMT_EV_UNCONF_INDEX_REMOVED, hdev, NULL, 0,
+ HCI_MGMT_UNCONF_INDEX_EVENTS);
+ ev.type = 0x01;
+ } else {
+ mgmt_index_event(MGMT_EV_INDEX_REMOVED, hdev, NULL, 0,
+ HCI_MGMT_INDEX_EVENTS);
+ ev.type = 0x00;
}
ev.bus = hdev->bus;
mgmt_index_event(MGMT_EV_EXT_INDEX_REMOVED, hdev, &ev, sizeof(ev),
HCI_MGMT_EXT_INDEX_EVENTS);
-}
-
-/* This function requires the caller holds hdev->lock */
-static void restart_le_actions(struct hci_dev *hdev)
-{
- struct hci_conn_params *p;
-
- list_for_each_entry(p, &hdev->le_conn_params, list) {
- /* Needed for AUTO_OFF case where might not "really"
- * have been powered off.
- */
- list_del_init(&p->action);
- switch (p->auto_connect) {
- case HCI_AUTO_CONN_DIRECT:
- case HCI_AUTO_CONN_ALWAYS:
- list_add(&p->action, &hdev->pend_le_conns);
- break;
- case HCI_AUTO_CONN_REPORT:
- list_add(&p->action, &hdev->pend_le_reports);
- break;
- default:
- break;
- }
- }
+ /* Cancel any remaining timed work */
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+ return;
+ cancel_delayed_work_sync(&hdev->discov_off);
+ cancel_delayed_work_sync(&hdev->service_cache);
+ cancel_delayed_work_sync(&hdev->rpa_expired);
+ cancel_delayed_work_sync(&hdev->mesh_send_done);
}
void mgmt_power_on(struct hci_dev *hdev, int err)
{
struct cmd_lookup match = { NULL, hdev };
- BT_DBG("err %d", err);
+ bt_dev_dbg(hdev, "err %d", err);
hci_dev_lock(hdev);
if (!err) {
restart_le_actions(hdev);
- hci_update_background_scan(hdev);
+ hci_update_passive_scan(hdev);
}
- mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match);
+ mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, true, settings_rsp,
+ &match);
new_settings(hdev, match.sk);
@@ -7041,9 +9543,10 @@ void mgmt_power_on(struct hci_dev *hdev, int err)
void __mgmt_power_off(struct hci_dev *hdev)
{
struct cmd_lookup match = { NULL, hdev };
- u8 status, zero_cod[] = { 0, 0, 0 };
+ u8 zero_cod[] = { 0, 0, 0 };
- mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match);
+ mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, true, settings_rsp,
+ &match);
/* If the power off is because of hdev unregistration let
* use the appropriate INVALID_INDEX status. Otherwise use
@@ -7053,11 +9556,11 @@ void __mgmt_power_off(struct hci_dev *hdev)
* status responses.
*/
if (hci_dev_test_flag(hdev, HCI_UNREGISTER))
- status = MGMT_STATUS_INVALID_INDEX;
+ match.mgmt_status = MGMT_STATUS_INVALID_INDEX;
else
- status = MGMT_STATUS_NOT_POWERED;
+ match.mgmt_status = MGMT_STATUS_NOT_POWERED;
- mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &status);
+ mgmt_pending_foreach(0, hdev, true, cmd_complete_rsp, &match);
if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) {
mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
@@ -7112,7 +9615,7 @@ static u8 mgmt_ltk_type(struct smp_ltk *ltk)
{
switch (ltk->type) {
case SMP_LTK:
- case SMP_LTK_SLAVE:
+ case SMP_LTK_RESPONDER:
if (ltk->authenticated)
return MGMT_LTK_AUTHENTICATED;
return MGMT_LTK_UNAUTHENTICATED;
@@ -7158,7 +9661,7 @@ void mgmt_new_ltk(struct hci_dev *hdev, struct smp_ltk *key, bool persistent)
ev.key.rand = key->rand;
if (key->type == SMP_LTK)
- ev.key.master = 1;
+ ev.key.initiator = 1;
/* Make sure we copy only the significant bytes based on the
* encryption key size, and set the rest of the value to zeroes.
@@ -7238,15 +9741,35 @@ void mgmt_new_conn_param(struct hci_dev *hdev, bdaddr_t *bdaddr,
}
void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn,
- u32 flags, u8 *name, u8 name_len)
+ u8 *name, u8 name_len)
{
- char buf[512];
- struct mgmt_ev_device_connected *ev = (void *) buf;
+ struct sk_buff *skb;
+ struct mgmt_ev_device_connected *ev;
u16 eir_len = 0;
+ u32 flags = 0;
+
+ if (test_and_set_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags))
+ return;
+
+ /* Allocate an skb sized for either the LE adv data or the BR/EDR EIR fields */
+ if (conn->le_adv_data_len > 0)
+ skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_CONNECTED,
+ sizeof(*ev) + conn->le_adv_data_len);
+ else
+ skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_CONNECTED,
+ sizeof(*ev) + (name ? eir_precalc_len(name_len) : 0) +
+ eir_precalc_len(sizeof(conn->dev_class)));
+ if (!skb)
+ return;
+
+ ev = skb_put(skb, sizeof(*ev));
bacpy(&ev->addr.bdaddr, &conn->dst);
ev->addr.type = link_to_bdaddr(conn->type, conn->dst_type);
+ if (conn->out)
+ flags |= MGMT_DEV_FOUND_INITIATED_CONN;
+
ev->flags = __cpu_to_le32(flags);
/* We must ensure that the EIR Data fields are ordered and
@@ -7254,36 +9777,20 @@ void mgmt_device_connected(struct hci_dev *hdev, struct hci_conn *conn,
* adding any BR/EDR data to the LE adv.
*/
if (conn->le_adv_data_len > 0) {
- memcpy(&ev->eir[eir_len],
- conn->le_adv_data, conn->le_adv_data_len);
+ skb_put_data(skb, conn->le_adv_data, conn->le_adv_data_len);
eir_len = conn->le_adv_data_len;
} else {
- if (name_len > 0)
- eir_len = eir_append_data(ev->eir, 0, EIR_NAME_COMPLETE,
- name, name_len);
+ if (name)
+ eir_len += eir_skb_put_data(skb, EIR_NAME_COMPLETE, name, name_len);
- if (memcmp(conn->dev_class, "\0\0\0", 3) != 0)
- eir_len = eir_append_data(ev->eir, eir_len,
- EIR_CLASS_OF_DEV,
- conn->dev_class, 3);
+ if (memcmp(conn->dev_class, "\0\0\0", sizeof(conn->dev_class)))
+ eir_len += eir_skb_put_data(skb, EIR_CLASS_OF_DEV,
+ conn->dev_class, sizeof(conn->dev_class));
}
ev->eir_len = cpu_to_le16(eir_len);
- mgmt_event(MGMT_EV_DEVICE_CONNECTED, hdev, buf,
- sizeof(*ev) + eir_len, NULL);
-}
-
-static void disconnect_rsp(struct mgmt_pending_cmd *cmd, void *data)
-{
- struct sock **sk = data;
-
- cmd->cmd_complete(cmd, 0);
-
- *sk = cmd->sk;
- sock_hold(*sk);
-
- mgmt_pending_remove(cmd);
+ mgmt_event_skb(skb, NULL);
}
static void unpair_device_rsp(struct mgmt_pending_cmd *cmd, void *data)
@@ -7294,7 +9801,6 @@ static void unpair_device_rsp(struct mgmt_pending_cmd *cmd, void *data)
device_unpaired(hdev, &cp->addr.bdaddr, cp->addr.type, cmd->sk);
cmd->cmd_complete(cmd, 0);
- mgmt_pending_remove(cmd);
}
bool mgmt_powering_down(struct hci_dev *hdev)
@@ -7302,6 +9808,9 @@ bool mgmt_powering_down(struct hci_dev *hdev)
struct mgmt_pending_cmd *cmd;
struct mgmt_mode *cp;
+ if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN))
+ return true;
+
cmd = pending_find(MGMT_OP_SET_POWERED, hdev);
if (!cmd)
return false;
@@ -7320,33 +9829,26 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr,
struct mgmt_ev_device_disconnected ev;
struct sock *sk = NULL;
- /* The connection is still in hci_conn_hash so test for 1
- * instead of 0 to know if this is the last one.
- */
- if (mgmt_powering_down(hdev) && hci_conn_count(hdev) == 1) {
- cancel_delayed_work(&hdev->power_off);
- queue_work(hdev->req_workqueue, &hdev->power_off.work);
- }
-
if (!mgmt_connected)
return;
- if (link_type != ACL_LINK && link_type != LE_LINK)
+ if (link_type != ACL_LINK &&
+ link_type != LE_LINK &&
+ link_type != BIS_LINK)
return;
- mgmt_pending_foreach(MGMT_OP_DISCONNECT, hdev, disconnect_rsp, &sk);
-
bacpy(&ev.addr.bdaddr, bdaddr);
ev.addr.type = link_to_bdaddr(link_type, addr_type);
ev.reason = reason;
+ /* Report disconnects due to suspend */
+ if (hdev->suspended)
+ ev.reason = MGMT_DEV_DISCONN_LOCAL_HOST_SUSPEND;
+
mgmt_event(MGMT_EV_DEVICE_DISCONNECTED, hdev, &ev, sizeof(ev), sk);
if (sk)
sock_put(sk);
-
- mgmt_pending_foreach(MGMT_OP_UNPAIR_DEVICE, hdev, unpair_device_rsp,
- hdev);
}
void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr,
@@ -7356,8 +9858,8 @@ void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr,
struct mgmt_cp_disconnect *cp;
struct mgmt_pending_cmd *cmd;
- mgmt_pending_foreach(MGMT_OP_UNPAIR_DEVICE, hdev, unpair_device_rsp,
- hdev);
+ mgmt_pending_foreach(MGMT_OP_UNPAIR_DEVICE, hdev, true,
+ unpair_device_rsp, hdev);
cmd = pending_find(MGMT_OP_DISCONNECT, hdev);
if (!cmd)
@@ -7375,21 +9877,18 @@ void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr,
mgmt_pending_remove(cmd);
}
-void mgmt_connect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
- u8 addr_type, u8 status)
+void mgmt_connect_failed(struct hci_dev *hdev, struct hci_conn *conn, u8 status)
{
struct mgmt_ev_connect_failed ev;
- /* The connection is still in hci_conn_hash so test for 1
- * instead of 0 to know if this is the last one.
- */
- if (mgmt_powering_down(hdev) && hci_conn_count(hdev) == 1) {
- cancel_delayed_work(&hdev->power_off);
- queue_work(hdev->req_workqueue, &hdev->power_off.work);
+ if (test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags)) {
+ mgmt_device_disconnected(hdev, &conn->dst, conn->type,
+ conn->dst_type, status, true);
+ return;
}
- bacpy(&ev.addr.bdaddr, bdaddr);
- ev.addr.type = link_to_bdaddr(link_type, addr_type);
+ bacpy(&ev.addr.bdaddr, &conn->dst);
+ ev.addr.type = link_to_bdaddr(conn->type, conn->dst_type);
ev.status = mgmt_status(status);
mgmt_event(MGMT_EV_CONNECT_FAILED, hdev, &ev, sizeof(ev), NULL);
@@ -7438,7 +9937,7 @@ int mgmt_user_confirm_request(struct hci_dev *hdev, bdaddr_t *bdaddr,
{
struct mgmt_ev_user_confirm_request ev;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr);
bacpy(&ev.addr.bdaddr, bdaddr);
ev.addr.type = link_to_bdaddr(link_type, addr_type);
@@ -7454,7 +9953,7 @@ int mgmt_user_passkey_request(struct hci_dev *hdev, bdaddr_t *bdaddr,
{
struct mgmt_ev_user_passkey_request ev;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr);
bacpy(&ev.addr.bdaddr, bdaddr);
ev.addr.type = link_to_bdaddr(link_type, addr_type);
@@ -7515,7 +10014,7 @@ int mgmt_user_passkey_notify(struct hci_dev *hdev, bdaddr_t *bdaddr,
{
struct mgmt_ev_passkey_notify ev;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr);
bacpy(&ev.addr.bdaddr, bdaddr);
ev.addr.type = link_to_bdaddr(link_type, addr_type);
@@ -7553,7 +10052,7 @@ void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status)
if (status) {
u8 mgmt_err = mgmt_status(status);
- mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev,
+ mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, true,
cmd_status_rsp, &mgmt_err);
return;
}
@@ -7563,8 +10062,8 @@ void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status)
else
changed = hci_dev_test_and_clear_flag(hdev, HCI_LINK_SECURITY);
- mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, settings_rsp,
- &match);
+ mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, true,
+ settings_rsp, &match);
if (changed)
new_settings(hdev, match.sk);
@@ -7573,74 +10072,6 @@ void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status)
sock_put(match.sk);
}
-static void clear_eir(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_write_eir cp;
-
- if (!lmp_ext_inq_capable(hdev))
- return;
-
- memset(hdev->eir, 0, sizeof(hdev->eir));
-
- memset(&cp, 0, sizeof(cp));
-
- hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp);
-}
-
-void mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status)
-{
- struct cmd_lookup match = { NULL, hdev };
- struct hci_request req;
- bool changed = false;
-
- if (status) {
- u8 mgmt_err = mgmt_status(status);
-
- if (enable && hci_dev_test_and_clear_flag(hdev,
- HCI_SSP_ENABLED)) {
- hci_dev_clear_flag(hdev, HCI_HS_ENABLED);
- new_settings(hdev, NULL);
- }
-
- mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, cmd_status_rsp,
- &mgmt_err);
- return;
- }
-
- if (enable) {
- changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED);
- } else {
- changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED);
- if (!changed)
- changed = hci_dev_test_and_clear_flag(hdev,
- HCI_HS_ENABLED);
- else
- hci_dev_clear_flag(hdev, HCI_HS_ENABLED);
- }
-
- mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, settings_rsp, &match);
-
- if (changed)
- new_settings(hdev, match.sk);
-
- if (match.sk)
- sock_put(match.sk);
-
- hci_req_init(&req, hdev);
-
- if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) {
- if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS))
- hci_req_add(&req, HCI_OP_WRITE_SSP_DEBUG_MODE,
- sizeof(enable), &enable);
- __hci_req_update_eir(&req);
- } else {
- clear_eir(&req);
- }
-
- hci_req_run(&req, NULL);
-}
-
static void sk_lookup(struct mgmt_pending_cmd *cmd, void *data)
{
struct cmd_lookup *match = data;
@@ -7656,9 +10087,12 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class,
{
struct cmd_lookup match = { NULL, hdev, mgmt_status(status) };
- mgmt_pending_foreach(MGMT_OP_SET_DEV_CLASS, hdev, sk_lookup, &match);
- mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match);
- mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match);
+ mgmt_pending_foreach(MGMT_OP_SET_DEV_CLASS, hdev, false, sk_lookup,
+ &match);
+ mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, false, sk_lookup,
+ &match);
+ mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, false, sk_lookup,
+ &match);
if (!status) {
mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class,
@@ -7689,6 +10123,9 @@ void mgmt_set_local_name_complete(struct hci_dev *hdev, u8 *name, u8 status)
/* If this is a HCI command related to powering on the
* HCI dev don't send any mgmt signals.
*/
+ if (hci_dev_test_flag(hdev, HCI_POWERING_DOWN))
+ return;
+
if (pending_find(MGMT_OP_SET_POWERED, hdev))
return;
}
@@ -7765,21 +10202,6 @@ static bool eir_has_uuids(u8 *eir, u16 eir_len, u16 uuid_count, u8 (*uuids)[16])
return false;
}
-static void restart_le_scan(struct hci_dev *hdev)
-{
- /* If controller is not scanning we are done. */
- if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
- return;
-
- if (time_after(jiffies + DISCOV_LE_RESTART_DELAY,
- hdev->discovery.scan_start +
- hdev->discovery.scan_duration))
- return;
-
- queue_delayed_work(hdev->req_workqueue, &hdev->le_scan_restart,
- DISCOV_LE_RESTART_DELAY);
-}
-
static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir,
u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len)
{
@@ -7795,7 +10217,7 @@ static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir,
if (hdev->discovery.rssi != HCI_RSSI_INVALID &&
(rssi == HCI_RSSI_INVALID ||
(rssi < hdev->discovery.rssi &&
- !test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks))))
+ !hci_test_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER))))
return false;
if (hdev->discovery.uuid_count != 0) {
@@ -7813,9 +10235,7 @@ static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir,
/* If duplicate filtering does not report RSSI changes, then restart
* scanning to ensure updated result with updated RSSI values.
*/
- if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks)) {
- restart_le_scan(hdev);
-
+ if (hci_test_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER)) {
/* Validate RSSI value against the RSSI threshold once more. */
if (hdev->discovery.rssi != HCI_RSSI_INVALID &&
rssi < hdev->discovery.rssi)
@@ -7825,13 +10245,192 @@ static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir,
return true;
}
+void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle,
+ bdaddr_t *bdaddr, u8 addr_type)
+{
+ struct mgmt_ev_adv_monitor_device_lost ev;
+
+ ev.monitor_handle = cpu_to_le16(handle);
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = addr_type;
+
+ mgmt_event(MGMT_EV_ADV_MONITOR_DEVICE_LOST, hdev, &ev, sizeof(ev),
+ NULL);
+}
+
+static void mgmt_send_adv_monitor_device_found(struct hci_dev *hdev,
+ struct sk_buff *skb,
+ struct sock *skip_sk,
+ u16 handle)
+{
+ struct sk_buff *advmon_skb;
+ size_t advmon_skb_len;
+ __le16 *monitor_handle;
+
+ if (!skb)
+ return;
+
+ advmon_skb_len = (sizeof(struct mgmt_ev_adv_monitor_device_found) -
+ sizeof(struct mgmt_ev_device_found)) + skb->len;
+ advmon_skb = mgmt_alloc_skb(hdev, MGMT_EV_ADV_MONITOR_DEVICE_FOUND,
+ advmon_skb_len);
+ if (!advmon_skb)
+ return;
+
+ /* ADV_MONITOR_DEVICE_FOUND is similar to DEVICE_FOUND event except
+ * that it also has 'monitor_handle'. Make a copy of DEVICE_FOUND and
+ * store monitor_handle of the matched monitor.
+ */
+ monitor_handle = skb_put(advmon_skb, sizeof(*monitor_handle));
+ *monitor_handle = cpu_to_le16(handle);
+ skb_put_data(advmon_skb, skb->data, skb->len);
+
+ mgmt_event_skb(advmon_skb, skip_sk);
+}
+
+static void mgmt_adv_monitor_device_found(struct hci_dev *hdev,
+ bdaddr_t *bdaddr, bool report_device,
+ struct sk_buff *skb,
+ struct sock *skip_sk)
+{
+ struct monitored_device *dev, *tmp;
+ bool matched = false;
+ bool notified = false;
+
+ /* We have received the Advertisement Report because:
+ * 1. the kernel has initiated active discovery
+ * 2. if not, we have pend_le_reports > 0 in which case we are doing
+ * passive scanning
+ * 3. if none of the above is true, we have one or more active
+ * Advertisement Monitors
+ *
+ * For case 1 and 2, report all advertisements via MGMT_EV_DEVICE_FOUND
+ * and report ONLY one advertisement per device for the matched Monitor
+ * via MGMT_EV_ADV_MONITOR_DEVICE_FOUND event.
+ *
+ * For case 3, since we are not active scanning and all advertisements
+ * received are due to a matched Advertisement Monitor, report all
+ * advertisements ONLY via MGMT_EV_ADV_MONITOR_DEVICE_FOUND event.
+ */
+ if (report_device && !hdev->advmon_pend_notify) {
+ mgmt_event_skb(skb, skip_sk);
+ return;
+ }
+
+ hdev->advmon_pend_notify = false;
+
+ list_for_each_entry_safe(dev, tmp, &hdev->monitored_devices, list) {
+ if (!bacmp(&dev->bdaddr, bdaddr)) {
+ matched = true;
+
+ if (!dev->notified) {
+ mgmt_send_adv_monitor_device_found(hdev, skb,
+ skip_sk,
+ dev->handle);
+ notified = true;
+ dev->notified = true;
+ }
+ }
+
+ if (!dev->notified)
+ hdev->advmon_pend_notify = true;
+ }
+
+ if (!report_device &&
+ ((matched && !notified) || !msft_monitor_supported(hdev))) {
+ /* Handle 0 indicates that we are not active scanning and this
+ * is a subsequent advertisement report for an already matched
+ * Advertisement Monitor or the controller offloading support
+ * is not available.
+ */
+ mgmt_send_adv_monitor_device_found(hdev, skb, skip_sk, 0);
+ }
+
+ if (report_device)
+ mgmt_event_skb(skb, skip_sk);
+ else
+ kfree_skb(skb);
+}
+
+static void mesh_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ u8 addr_type, s8 rssi, u32 flags, u8 *eir,
+ u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len,
+ u64 instant)
+{
+ struct sk_buff *skb;
+ struct mgmt_ev_mesh_device_found *ev;
+ int i, j;
+
+ if (!hdev->mesh_ad_types[0])
+ goto accepted;
+
+ /* Scan for requested AD types */
+ if (eir_len > 0) {
+ for (i = 0; i + 1 < eir_len; i += eir[i] + 1) {
+ for (j = 0; j < sizeof(hdev->mesh_ad_types); j++) {
+ if (!hdev->mesh_ad_types[j])
+ break;
+
+ if (hdev->mesh_ad_types[j] == eir[i + 1])
+ goto accepted;
+ }
+ }
+ }
+
+ if (scan_rsp_len > 0) {
+ for (i = 0; i + 1 < scan_rsp_len; i += scan_rsp[i] + 1) {
+ for (j = 0; j < sizeof(hdev->mesh_ad_types); j++) {
+ if (!hdev->mesh_ad_types[j])
+ break;
+
+ if (hdev->mesh_ad_types[j] == scan_rsp[i + 1])
+ goto accepted;
+ }
+ }
+ }
+
+ return;
+
+accepted:
+ skb = mgmt_alloc_skb(hdev, MGMT_EV_MESH_DEVICE_FOUND,
+ sizeof(*ev) + eir_len + scan_rsp_len);
+ if (!skb)
+ return;
+
+ ev = skb_put(skb, sizeof(*ev));
+
+ bacpy(&ev->addr.bdaddr, bdaddr);
+ ev->addr.type = link_to_bdaddr(LE_LINK, addr_type);
+ ev->rssi = rssi;
+ ev->flags = cpu_to_le32(flags);
+ ev->instant = cpu_to_le64(instant);
+
+ if (eir_len > 0)
+ /* Copy EIR or advertising data into event */
+ skb_put_data(skb, eir, eir_len);
+
+ if (scan_rsp_len > 0)
+ /* Append scan response data to event */
+ skb_put_data(skb, scan_rsp, scan_rsp_len);
+
+ ev->eir_len = cpu_to_le16(eir_len + scan_rsp_len);
+
+ mgmt_event_skb(skb, NULL);
+}
+
void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
u8 addr_type, u8 *dev_class, s8 rssi, u32 flags,
- u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len)
+ u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len,
+ u64 instant)
{
- char buf[512];
- struct mgmt_ev_device_found *ev = (void *)buf;
- size_t ev_size;
+ struct sk_buff *skb;
+ struct mgmt_ev_device_found *ev;
+ bool report_device = hci_discovery_active(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_MESH) && link_type == LE_LINK)
+ mesh_device_found(hdev, bdaddr, addr_type, rssi, flags,
+ eir, eir_len, scan_rsp, scan_rsp_len,
+ instant);
/* Don't send events for a non-kernel initiated discovery. With
* LE one exception is if we have pend_le_reports > 0 in which
@@ -7840,7 +10439,9 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
if (!hci_discovery_active(hdev)) {
if (link_type == ACL_LINK)
return;
- if (link_type == LE_LINK && list_empty(&hdev->pend_le_reports))
+ if (link_type == LE_LINK && !list_empty(&hdev->pend_le_reports))
+ report_device = true;
+ else if (!hci_is_adv_monitoring(hdev))
return;
}
@@ -7863,13 +10464,13 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
}
}
- /* Make sure that the buffer is big enough. The 5 extra bytes
- * are for the potential CoD field.
- */
- if (sizeof(*ev) + eir_len + scan_rsp_len + 5 > sizeof(buf))
+ /* Allocate skb. The 5 extra bytes are for the potential CoD field */
+ skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_FOUND,
+ sizeof(*ev) + eir_len + scan_rsp_len + 5);
+ if (!skb)
return;
- memset(buf, 0, sizeof(buf));
+ ev = skb_put(skb, sizeof(*ev));
/* In case of device discovery with BR/EDR devices (pre 1.2), the
* RSSI value was reported as 0 when not available. This behavior
@@ -7890,51 +10491,59 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
if (eir_len > 0)
/* Copy EIR or advertising data into event */
- memcpy(ev->eir, eir, eir_len);
+ skb_put_data(skb, eir, eir_len);
- if (dev_class && !eir_get_data(ev->eir, eir_len, EIR_CLASS_OF_DEV,
- NULL))
- eir_len = eir_append_data(ev->eir, eir_len, EIR_CLASS_OF_DEV,
- dev_class, 3);
+ if (dev_class && !eir_get_data(eir, eir_len, EIR_CLASS_OF_DEV, NULL)) {
+ u8 eir_cod[5];
+
+ eir_len += eir_append_data(eir_cod, 0, EIR_CLASS_OF_DEV,
+ dev_class, 3);
+ skb_put_data(skb, eir_cod, sizeof(eir_cod));
+ }
if (scan_rsp_len > 0)
/* Append scan response data to event */
- memcpy(ev->eir + eir_len, scan_rsp, scan_rsp_len);
+ skb_put_data(skb, scan_rsp, scan_rsp_len);
ev->eir_len = cpu_to_le16(eir_len + scan_rsp_len);
- ev_size = sizeof(*ev) + eir_len + scan_rsp_len;
- mgmt_event(MGMT_EV_DEVICE_FOUND, hdev, ev, ev_size, NULL);
+ mgmt_adv_monitor_device_found(hdev, bdaddr, report_device, skb, NULL);
}
void mgmt_remote_name(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
u8 addr_type, s8 rssi, u8 *name, u8 name_len)
{
+ struct sk_buff *skb;
struct mgmt_ev_device_found *ev;
- char buf[sizeof(*ev) + HCI_MAX_NAME_LENGTH + 2];
- u16 eir_len;
-
- ev = (struct mgmt_ev_device_found *) buf;
+ u16 eir_len = 0;
+ u32 flags = 0;
- memset(buf, 0, sizeof(buf));
+ skb = mgmt_alloc_skb(hdev, MGMT_EV_DEVICE_FOUND,
+ sizeof(*ev) + (name ? eir_precalc_len(name_len) : 0));
+ if (!skb)
+ return;
+ ev = skb_put(skb, sizeof(*ev));
bacpy(&ev->addr.bdaddr, bdaddr);
ev->addr.type = link_to_bdaddr(link_type, addr_type);
ev->rssi = rssi;
- eir_len = eir_append_data(ev->eir, 0, EIR_NAME_COMPLETE, name,
- name_len);
+ if (name)
+ eir_len += eir_skb_put_data(skb, EIR_NAME_COMPLETE, name, name_len);
+ else
+ flags = MGMT_DEV_FOUND_NAME_REQUEST_FAILED;
ev->eir_len = cpu_to_le16(eir_len);
+ ev->flags = cpu_to_le32(flags);
- mgmt_event(MGMT_EV_DEVICE_FOUND, hdev, ev, sizeof(*ev) + eir_len, NULL);
+ mgmt_event_skb(skb, NULL);
}
void mgmt_discovering(struct hci_dev *hdev, u8 discovering)
{
struct mgmt_ev_discovering ev;
- BT_DBG("%s discovering %u", hdev->name, discovering);
+ bt_dev_dbg(hdev, "discovering %u", discovering);
memset(&ev, 0, sizeof(ev));
ev.type = hdev->discovery.type;
@@ -7943,6 +10552,30 @@ void mgmt_discovering(struct hci_dev *hdev, u8 discovering)
mgmt_event(MGMT_EV_DISCOVERING, hdev, &ev, sizeof(ev), NULL);
}
+void mgmt_suspending(struct hci_dev *hdev, u8 state)
+{
+ struct mgmt_ev_controller_suspend ev;
+
+ ev.suspend_state = state;
+ mgmt_event(MGMT_EV_CONTROLLER_SUSPEND, hdev, &ev, sizeof(ev), NULL);
+}
+
+void mgmt_resuming(struct hci_dev *hdev, u8 reason, bdaddr_t *bdaddr,
+ u8 addr_type)
+{
+ struct mgmt_ev_controller_resume ev;
+
+ ev.wake_reason = reason;
+ if (bdaddr) {
+ bacpy(&ev.addr.bdaddr, bdaddr);
+ ev.addr.type = addr_type;
+ } else {
+ memset(&ev.addr, 0, sizeof(ev.addr));
+ }
+
+ mgmt_event(MGMT_EV_CONTROLLER_RESUME, hdev, &ev, sizeof(ev), NULL);
+}
+
static struct hci_mgmt_chan chan = {
.channel = HCI_CHANNEL_CONTROL,
.handler_count = ARRAY_SIZE(mgmt_handlers),
@@ -7959,3 +10592,22 @@ void mgmt_exit(void)
{
hci_mgmt_chan_unregister(&chan);
}
+
+void mgmt_cleanup(struct sock *sk)
+{
+ struct mgmt_mesh_tx *mesh_tx;
+ struct hci_dev *hdev;
+
+ read_lock(&hci_dev_list_lock);
+
+ list_for_each_entry(hdev, &hci_dev_list, list) {
+ do {
+ mesh_tx = mgmt_mesh_next(hdev, sk);
+
+ if (mesh_tx)
+ mesh_send_complete(hdev, mesh_tx, true);
+ } while (mesh_tx);
+ }
+
+ read_unlock(&hci_dev_list_lock);
+}
diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c
new file mode 100644
index 000000000000..c4063d200c0a
--- /dev/null
+++ b/net/bluetooth/mgmt_config.c
@@ -0,0 +1,346 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright (C) 2020 Google Corporation
+ */
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "mgmt_util.h"
+#include "mgmt_config.h"
+
+#define HDEV_PARAM_U16(_param_name_) \
+ struct {\
+ struct mgmt_tlv_hdr entry; \
+ __le16 value; \
+ } __packed _param_name_
+
+#define HDEV_PARAM_U8(_param_name_) \
+ struct {\
+ struct mgmt_tlv_hdr entry; \
+ __u8 value; \
+ } __packed _param_name_
+
+#define TLV_SET_U16(_param_code_, _param_name_) \
+ { \
+ { cpu_to_le16(_param_code_), sizeof(__u16) }, \
+ cpu_to_le16(hdev->_param_name_) \
+ }
+
+#define TLV_SET_U8(_param_code_, _param_name_) \
+ { \
+ { cpu_to_le16(_param_code_), sizeof(__u8) }, \
+ hdev->_param_name_ \
+ }
+
+#define TLV_SET_U16_JIFFIES_TO_MSECS(_param_code_, _param_name_) \
+ { \
+ { cpu_to_le16(_param_code_), sizeof(__u16) }, \
+ cpu_to_le16(jiffies_to_msecs(hdev->_param_name_)) \
+ }
+
+int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
+			   u16 data_len)
+{
+	/* Reply layout: one TLV per default system parameter read from hdev */
+	struct mgmt_rp_read_def_system_config {
+		/* Please see mgmt-api.txt for documentation of these values */
+		HDEV_PARAM_U16(def_page_scan_type);
+		HDEV_PARAM_U16(def_page_scan_int);
+		HDEV_PARAM_U16(def_page_scan_window);
+		HDEV_PARAM_U16(def_inq_scan_type);
+		HDEV_PARAM_U16(def_inq_scan_int);
+		HDEV_PARAM_U16(def_inq_scan_window);
+		HDEV_PARAM_U16(def_br_lsto);
+		HDEV_PARAM_U16(def_page_timeout);
+		HDEV_PARAM_U16(sniff_min_interval);
+		HDEV_PARAM_U16(sniff_max_interval);
+		HDEV_PARAM_U16(le_adv_min_interval);
+		HDEV_PARAM_U16(le_adv_max_interval);
+		HDEV_PARAM_U16(def_multi_adv_rotation_duration);
+		HDEV_PARAM_U16(le_scan_interval);
+		HDEV_PARAM_U16(le_scan_window);
+		HDEV_PARAM_U16(le_scan_int_suspend);
+		HDEV_PARAM_U16(le_scan_window_suspend);
+		HDEV_PARAM_U16(le_scan_int_discovery);
+		HDEV_PARAM_U16(le_scan_window_discovery);
+		HDEV_PARAM_U16(le_scan_int_adv_monitor);
+		HDEV_PARAM_U16(le_scan_window_adv_monitor);
+		HDEV_PARAM_U16(le_scan_int_connect);
+		HDEV_PARAM_U16(le_scan_window_connect);
+		HDEV_PARAM_U16(le_conn_min_interval);
+		HDEV_PARAM_U16(le_conn_max_interval);
+		HDEV_PARAM_U16(le_conn_latency);
+		HDEV_PARAM_U16(le_supv_timeout);
+		HDEV_PARAM_U16(def_le_autoconnect_timeout);
+		HDEV_PARAM_U16(advmon_allowlist_duration);
+		HDEV_PARAM_U16(advmon_no_filter_duration);
+		HDEV_PARAM_U8(enable_advmon_interleave_scan);
+	} __packed rp = {
+		TLV_SET_U16(0x0000, def_page_scan_type),
+		TLV_SET_U16(0x0001, def_page_scan_int),
+		TLV_SET_U16(0x0002, def_page_scan_window),
+		TLV_SET_U16(0x0003, def_inq_scan_type),
+		TLV_SET_U16(0x0004, def_inq_scan_int),
+		TLV_SET_U16(0x0005, def_inq_scan_window),
+		TLV_SET_U16(0x0006, def_br_lsto),
+		TLV_SET_U16(0x0007, def_page_timeout),
+		TLV_SET_U16(0x0008, sniff_min_interval),
+		TLV_SET_U16(0x0009, sniff_max_interval),
+		TLV_SET_U16(0x000a, le_adv_min_interval),
+		TLV_SET_U16(0x000b, le_adv_max_interval),
+		TLV_SET_U16(0x000c, def_multi_adv_rotation_duration),
+		TLV_SET_U16(0x000d, le_scan_interval),
+		TLV_SET_U16(0x000e, le_scan_window),
+		TLV_SET_U16(0x000f, le_scan_int_suspend),
+		TLV_SET_U16(0x0010, le_scan_window_suspend),
+		TLV_SET_U16(0x0011, le_scan_int_discovery),
+		TLV_SET_U16(0x0012, le_scan_window_discovery),
+		TLV_SET_U16(0x0013, le_scan_int_adv_monitor),
+		TLV_SET_U16(0x0014, le_scan_window_adv_monitor),
+		TLV_SET_U16(0x0015, le_scan_int_connect),
+		TLV_SET_U16(0x0016, le_scan_window_connect),
+		TLV_SET_U16(0x0017, le_conn_min_interval),
+		TLV_SET_U16(0x0018, le_conn_max_interval),
+		TLV_SET_U16(0x0019, le_conn_latency),
+		TLV_SET_U16(0x001a, le_supv_timeout),
+		TLV_SET_U16_JIFFIES_TO_MSECS(0x001b,
+					     def_le_autoconnect_timeout),
+		/* this table has no entry for type 0x001c */
+		TLV_SET_U16(0x001d, advmon_allowlist_duration),
+		TLV_SET_U16(0x001e, advmon_no_filter_duration),
+		TLV_SET_U8(0x001f, enable_advmon_interleave_scan),
+	};
+
+	bt_dev_dbg(hdev, "sock %p", sk);
+
+	return mgmt_cmd_complete(sk, hdev->id,
+				 MGMT_OP_READ_DEF_SYSTEM_CONFIG,
+				 0, &rp, sizeof(rp));
+}
+
+#define TO_TLV(x) ((struct mgmt_tlv *)(x))
+#define TLV_GET_LE16(tlv) le16_to_cpu(*((__le16 *)(TO_TLV(tlv)->value)))
+#define TLV_GET_U8(tlv) (*((__u8 *)(TO_TLV(tlv)->value)))
+
+int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
+			  u16 data_len)
+{
+	u16 buffer_left = data_len;
+	u8 *buffer = data;
+
+	/* A request must carry at least one complete TLV header */
+	if (buffer_left < sizeof(struct mgmt_tlv))
+		return mgmt_cmd_status(sk, hdev->id,
+				       MGMT_OP_SET_DEF_SYSTEM_CONFIG,
+				       MGMT_STATUS_INVALID_PARAMS);
+
+	/* Pass 1: validate every TLV before anything is written to hdev */
+	while (buffer_left >= sizeof(struct mgmt_tlv)) {
+		const u8 len = TO_TLV(buffer)->length;
+		size_t exp_type_len;
+		const u16 exp_len = sizeof(struct mgmt_tlv) +
+				    len;
+		const u16 type = le16_to_cpu(TO_TLV(buffer)->type);
+
+		if (buffer_left < exp_len) {
+			bt_dev_warn(hdev, "invalid len left %u, exp >= %u",
+				    buffer_left, exp_len);
+
+			return mgmt_cmd_status(sk, hdev->id,
+					MGMT_OP_SET_DEF_SYSTEM_CONFIG,
+					MGMT_STATUS_INVALID_PARAMS);
+		}
+
+		/* Please see mgmt-api.txt for documentation of these values */
+		switch (type) {
+		case 0x0000:
+		case 0x0001:
+		case 0x0002:
+		case 0x0003:
+		case 0x0004:
+		case 0x0005:
+		case 0x0006:
+		case 0x0007:
+		case 0x0008:
+		case 0x0009:
+		case 0x000a:
+		case 0x000b:
+		case 0x000c:
+		case 0x000d:
+		case 0x000e:
+		case 0x000f:
+		case 0x0010:
+		case 0x0011:
+		case 0x0012:
+		case 0x0013:
+		case 0x0014:
+		case 0x0015:
+		case 0x0016:
+		case 0x0017:
+		case 0x0018:
+		case 0x0019:
+		case 0x001a:
+		case 0x001b:
+		case 0x001d:
+		case 0x001e:
+			exp_type_len = sizeof(u16);
+			break;
+		case 0x001f:
+			exp_type_len = sizeof(u8);
+			break;
+		default:
+			exp_type_len = 0;
+			bt_dev_warn(hdev, "unsupported parameter %u", type);
+			break;
+		}
+
+		if (exp_type_len && len != exp_type_len) {
+			bt_dev_warn(hdev, "invalid length %d, exp %zu for type %u",
+				    len, exp_type_len, type);
+
+			return mgmt_cmd_status(sk, hdev->id,
+				MGMT_OP_SET_DEF_SYSTEM_CONFIG,
+				MGMT_STATUS_INVALID_PARAMS);
+		}
+
+		buffer_left -= exp_len;
+		buffer += exp_len;
+	}
+	/* Pass 2: every TLV was accepted above, so apply the values */
+	buffer_left = data_len;
+	buffer = data;
+	while (buffer_left >= sizeof(struct mgmt_tlv)) {
+		const u8 len = TO_TLV(buffer)->length;
+		const u16 exp_len = sizeof(struct mgmt_tlv) +
+				    len;
+		const u16 type = le16_to_cpu(TO_TLV(buffer)->type);
+
+		switch (type) {
+		case 0x0000:
+			hdev->def_page_scan_type = TLV_GET_LE16(buffer);
+			break;
+		case 0x0001:
+			hdev->def_page_scan_int = TLV_GET_LE16(buffer);
+			break;
+		case 0x0002:
+			hdev->def_page_scan_window = TLV_GET_LE16(buffer);
+			break;
+		case 0x0003:
+			hdev->def_inq_scan_type = TLV_GET_LE16(buffer);
+			break;
+		case 0x0004:
+			hdev->def_inq_scan_int = TLV_GET_LE16(buffer);
+			break;
+		case 0x0005:
+			hdev->def_inq_scan_window = TLV_GET_LE16(buffer);
+			break;
+		case 0x0006:
+			hdev->def_br_lsto = TLV_GET_LE16(buffer);
+			break;
+		case 0x0007:
+			hdev->def_page_timeout = TLV_GET_LE16(buffer);
+			break;
+		case 0x0008:
+			hdev->sniff_min_interval = TLV_GET_LE16(buffer);
+			break;
+		case 0x0009:
+			hdev->sniff_max_interval = TLV_GET_LE16(buffer);
+			break;
+		case 0x000a:
+			hdev->le_adv_min_interval = TLV_GET_LE16(buffer);
+			break;
+		case 0x000b:
+			hdev->le_adv_max_interval = TLV_GET_LE16(buffer);
+			break;
+		case 0x000c:
+			hdev->def_multi_adv_rotation_duration =
+							   TLV_GET_LE16(buffer);
+			break;
+		case 0x000d:
+			hdev->le_scan_interval = TLV_GET_LE16(buffer);
+			break;
+		case 0x000e:
+			hdev->le_scan_window = TLV_GET_LE16(buffer);
+			break;
+		case 0x000f:
+			hdev->le_scan_int_suspend = TLV_GET_LE16(buffer);
+			break;
+		case 0x0010:
+			hdev->le_scan_window_suspend = TLV_GET_LE16(buffer);
+			break;
+		case 0x0011:
+			hdev->le_scan_int_discovery = TLV_GET_LE16(buffer);
+			break;
+		case 0x0012:
+			hdev->le_scan_window_discovery = TLV_GET_LE16(buffer);
+			break;
+		case 0x0013:
+			hdev->le_scan_int_adv_monitor = TLV_GET_LE16(buffer);
+			break;
+		case 0x0014:
+			hdev->le_scan_window_adv_monitor = TLV_GET_LE16(buffer);
+			break;
+		case 0x0015:
+			hdev->le_scan_int_connect = TLV_GET_LE16(buffer);
+			break;
+		case 0x0016:
+			hdev->le_scan_window_connect = TLV_GET_LE16(buffer);
+			break;
+		case 0x0017:
+			hdev->le_conn_min_interval = TLV_GET_LE16(buffer);
+			break;
+		case 0x0018:
+			hdev->le_conn_max_interval = TLV_GET_LE16(buffer);
+			break;
+		case 0x0019:
+			hdev->le_conn_latency = TLV_GET_LE16(buffer);
+			break;
+		case 0x001a:
+			hdev->le_supv_timeout = TLV_GET_LE16(buffer);
+			break;
+		case 0x001b:
+			hdev->def_le_autoconnect_timeout =
+					msecs_to_jiffies(TLV_GET_LE16(buffer));
+			break;
+		case 0x001d:
+			hdev->advmon_allowlist_duration = TLV_GET_LE16(buffer);
+			break;
+		case 0x001e:
+			hdev->advmon_no_filter_duration = TLV_GET_LE16(buffer);
+			break;
+		case 0x001f:
+			hdev->enable_advmon_interleave_scan = TLV_GET_U8(buffer);
+			break;
+		default:
+			bt_dev_warn(hdev, "unsupported parameter %u", type);
+			break;
+		}
+
+		buffer_left -= exp_len;
+		buffer += exp_len;
+	}
+
+	return mgmt_cmd_complete(sk, hdev->id,
+				 MGMT_OP_SET_DEF_SYSTEM_CONFIG, 0, NULL, 0);
+}
+
+int read_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len)
+{
+ bt_dev_dbg(hdev, "sock %p", sk);
+ /* Completes the command with status 0 and an empty reply payload */
+ return mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_READ_DEF_RUNTIME_CONFIG, 0, NULL, 0);
+}
+
+int set_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data,
+			   u16 data_len)
+{
+	bt_dev_dbg(hdev, "sock %p", sk);
+	/* Always rejected; report the failure against this command's opcode */
+	return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEF_RUNTIME_CONFIG,
+			       MGMT_STATUS_INVALID_PARAMS);
+}
diff --git a/net/bluetooth/mgmt_config.h b/net/bluetooth/mgmt_config.h
new file mode 100644
index 000000000000..a4965f107891
--- /dev/null
+++ b/net/bluetooth/mgmt_config.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+/*
+ * Copyright (C) 2020 Google Corporation
+ */
+
+int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len);
+
+int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len);
+
+int read_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len);
+
+int set_def_runtime_config(struct sock *sk, struct hci_dev *hdev, void *data,
+ u16 data_len);
diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c
index 0d0a6d77b9e8..aa7b5585cb26 100644
--- a/net/bluetooth/mgmt_util.c
+++ b/net/bluetooth/mgmt_util.c
@@ -21,7 +21,7 @@
SOFTWARE IS DISCLAIMED.
*/
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
@@ -56,40 +56,73 @@ static struct sk_buff *create_monitor_ctrl_event(__le16 index, u32 cookie,
return skb;
}
-int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel,
- void *data, u16 data_len, int flag, struct sock *skip_sk)
+struct sk_buff *mgmt_alloc_skb(struct hci_dev *hdev, u16 opcode,
+ unsigned int size)
{
struct sk_buff *skb;
- struct mgmt_hdr *hdr;
- skb = alloc_skb(sizeof(*hdr) + data_len, GFP_KERNEL);
+ skb = alloc_skb(sizeof(struct mgmt_hdr) + size, GFP_KERNEL);
if (!skb)
- return -ENOMEM;
+ return skb;
- hdr = skb_put(skb, sizeof(*hdr));
- hdr->opcode = cpu_to_le16(event);
- if (hdev)
- hdr->index = cpu_to_le16(hdev->id);
- else
- hdr->index = cpu_to_le16(MGMT_INDEX_NONE);
- hdr->len = cpu_to_le16(data_len);
+ skb_reserve(skb, sizeof(struct mgmt_hdr));
+ bt_cb(skb)->mgmt.hdev = hdev;
+ bt_cb(skb)->mgmt.opcode = opcode;
- if (data)
- skb_put_data(skb, data, data_len);
+ return skb;
+}
+
+int mgmt_send_event_skb(unsigned short channel, struct sk_buff *skb, int flag,
+ struct sock *skip_sk)
+{
+ struct hci_dev *hdev;
+ struct mgmt_hdr *hdr;
+ int len;
+
+ if (!skb)
+ return -EINVAL;
+
+ len = skb->len;
+ hdev = bt_cb(skb)->mgmt.hdev;
/* Time stamp */
__net_timestamp(skb);
- hci_send_to_channel(channel, skb, flag, skip_sk);
-
+ /* Send just the data, without headers, to the monitor */
if (channel == HCI_CHANNEL_CONTROL)
- hci_send_monitor_ctrl_event(hdev, event, data, data_len,
+ hci_send_monitor_ctrl_event(hdev, bt_cb(skb)->mgmt.opcode,
+ skb->data, skb->len,
skb_get_ktime(skb), flag, skip_sk);
+ hdr = skb_push(skb, sizeof(*hdr));
+ hdr->opcode = cpu_to_le16(bt_cb(skb)->mgmt.opcode);
+ if (hdev)
+ hdr->index = cpu_to_le16(hdev->id);
+ else
+ hdr->index = cpu_to_le16(MGMT_INDEX_NONE);
+ hdr->len = cpu_to_le16(len);
+
+ hci_send_to_channel(channel, skb, flag, skip_sk);
+
kfree_skb(skb);
return 0;
}
+int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel,
+		    void *data, u16 data_len, int flag, struct sock *skip_sk)
+{
+	/* Copy @data (if any) into a fresh mgmt skb and send that skb */
+	struct sk_buff *skb = mgmt_alloc_skb(hdev, event, data_len);
+
+	if (!skb)
+		return -ENOMEM;
+
+	if (data)
+		skb_put_data(skb, data, data_len);
+
+	return mgmt_send_event_skb(channel, skb, flag, skip_sk);
+}
+
int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status)
{
struct sk_buff *skb, *mskb;
@@ -184,50 +217,50 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status,
struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode,
struct hci_dev *hdev)
{
- struct mgmt_pending_cmd *cmd;
+ struct mgmt_pending_cmd *cmd, *tmp;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
- list_for_each_entry(cmd, &hdev->mgmt_pending, list) {
+ list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) {
if (hci_sock_get_channel(cmd->sk) != channel)
continue;
- if (cmd->opcode == opcode)
- return cmd;
- }
-
- return NULL;
-}
-struct mgmt_pending_cmd *mgmt_pending_find_data(unsigned short channel,
- u16 opcode,
- struct hci_dev *hdev,
- const void *data)
-{
- struct mgmt_pending_cmd *cmd;
-
- list_for_each_entry(cmd, &hdev->mgmt_pending, list) {
- if (cmd->user_data != data)
- continue;
- if (cmd->opcode == opcode)
+ if (cmd->opcode == opcode) {
+ mutex_unlock(&hdev->mgmt_pending_lock);
return cmd;
+ }
}
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
return NULL;
}
-void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev,
+void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, bool remove,
void (*cb)(struct mgmt_pending_cmd *cmd, void *data),
void *data)
{
struct mgmt_pending_cmd *cmd, *tmp;
+ mutex_lock(&hdev->mgmt_pending_lock);
+
list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) {
if (opcode > 0 && cmd->opcode != opcode)
continue;
+ if (remove)
+ list_del(&cmd->list);
+
cb(cmd, data);
+
+ if (remove)
+ mgmt_pending_free(cmd);
}
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
}
-struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode,
+struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode,
struct hci_dev *hdev,
void *data, u16 len)
{
@@ -238,7 +271,7 @@ struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode,
return NULL;
cmd->opcode = opcode;
- cmd->index = hdev->id;
+ cmd->hdev = hdev;
cmd->param = kmemdup(data, len, GFP_KERNEL);
if (!cmd->param) {
@@ -251,7 +284,22 @@ struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode,
cmd->sk = sk;
sock_hold(sk);
- list_add(&cmd->list, &hdev->mgmt_pending);
+ return cmd;
+}
+
+struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode,
+ struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_pending_cmd *cmd;
+
+ cmd = mgmt_pending_new(sk, opcode, hdev, data, len);
+ if (!cmd)
+ return NULL;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+ list_add_tail(&cmd->list, &hdev->mgmt_pending);
+ mutex_unlock(&hdev->mgmt_pending_lock);
return cmd;
}
@@ -265,6 +313,129 @@ void mgmt_pending_free(struct mgmt_pending_cmd *cmd)
void mgmt_pending_remove(struct mgmt_pending_cmd *cmd)
{
+ mutex_lock(&cmd->hdev->mgmt_pending_lock);
list_del(&cmd->list);
+ mutex_unlock(&cmd->hdev->mgmt_pending_lock);
+
mgmt_pending_free(cmd);
}
+
+bool __mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd)
+{
+ struct mgmt_pending_cmd *tmp;
+
+ lockdep_assert_held(&hdev->mgmt_pending_lock);
+
+ if (!cmd)
+ return false;
+
+ list_for_each_entry(tmp, &hdev->mgmt_pending, list) {
+ if (cmd == tmp)
+ return true;
+ }
+
+ return false;
+}
+
+bool mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd)
+{
+ bool listed;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+ listed = __mgmt_pending_listed(hdev, cmd);
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ return listed;
+}
+
+bool mgmt_pending_valid(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd)
+{
+ bool listed;
+
+ if (!cmd)
+ return false;
+
+ mutex_lock(&hdev->mgmt_pending_lock);
+
+ listed = __mgmt_pending_listed(hdev, cmd);
+ if (listed)
+ list_del(&cmd->list);
+
+ mutex_unlock(&hdev->mgmt_pending_lock);
+
+ return listed;
+}
+
+void mgmt_mesh_foreach(struct hci_dev *hdev,
+ void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data),
+ void *data, struct sock *sk)
+{
+ struct mgmt_mesh_tx *mesh_tx, *tmp;
+
+ list_for_each_entry_safe(mesh_tx, tmp, &hdev->mesh_pending, list) {
+ if (!sk || mesh_tx->sk == sk)
+ cb(mesh_tx, data);
+ }
+}
+
+struct mgmt_mesh_tx *mgmt_mesh_next(struct hci_dev *hdev, struct sock *sk)
+{
+	struct mgmt_mesh_tx *mesh_tx;
+
+	/* First pending mesh TX owned by @sk (any owner when @sk is NULL);
+	 * an empty list simply falls through the loop.
+	 */
+	list_for_each_entry(mesh_tx, &hdev->mesh_pending, list) {
+		if (!sk || mesh_tx->sk == sk)
+			return mesh_tx;
+	}
+
+	return NULL;
+}
+
+struct mgmt_mesh_tx *mgmt_mesh_find(struct hci_dev *hdev, u8 handle)
+{
+	struct mgmt_mesh_tx *mesh_tx;
+
+	/* Look up the pending mesh TX whose handle equals @handle; an
+	 * empty list simply falls through the loop.
+	 */
+	list_for_each_entry(mesh_tx, &hdev->mesh_pending, list) {
+		if (mesh_tx->handle == handle)
+			return mesh_tx;
+	}
+
+	return NULL;
+}
+
+struct mgmt_mesh_tx *mgmt_mesh_add(struct sock *sk, struct hci_dev *hdev,
+				   void *data, u16 len)
+{
+	struct mgmt_mesh_tx *mesh_tx;
+
+	/* param[] has a fixed size; refuse anything that would overflow it */
+	if (len > sizeof(mesh_tx->param))
+		return NULL;
+	mesh_tx = kzalloc(sizeof(*mesh_tx), GFP_KERNEL);
+	if (!mesh_tx)
+		return NULL;
+
+	hdev->mesh_send_ref++;
+	if (!hdev->mesh_send_ref)
+		hdev->mesh_send_ref++;
+	mesh_tx->handle = hdev->mesh_send_ref;
+	mesh_tx->index = hdev->id;
+	memcpy(mesh_tx->param, data, len);
+	mesh_tx->param_len = len;
+	mesh_tx->sk = sk;
+	sock_hold(sk);
+	list_add_tail(&mesh_tx->list, &hdev->mesh_pending);
+	return mesh_tx;
+}
+
+void mgmt_mesh_remove(struct mgmt_mesh_tx *mesh_tx)
+{
+ list_del(&mesh_tx->list);
+ sock_put(mesh_tx->sk);
+ kfree(mesh_tx);
+}
diff --git a/net/bluetooth/mgmt_util.h b/net/bluetooth/mgmt_util.h
index 6559f189213c..bcba8c9d8952 100644
--- a/net/bluetooth/mgmt_util.h
+++ b/net/bluetooth/mgmt_util.h
@@ -20,17 +20,32 @@
SOFTWARE IS DISCLAIMED.
*/
+struct mgmt_mesh_tx {
+ struct list_head list;
+ int index;
+ size_t param_len;
+ struct sock *sk;
+ u8 handle;
+ u8 instance;
+ u8 param[sizeof(struct mgmt_cp_mesh_send) + 31];
+};
+
struct mgmt_pending_cmd {
struct list_head list;
u16 opcode;
- int index;
+ struct hci_dev *hdev;
void *param;
size_t param_len;
struct sock *sk;
+ struct sk_buff *skb;
void *user_data;
int (*cmd_complete)(struct mgmt_pending_cmd *cmd, u8 status);
};
+struct sk_buff *mgmt_alloc_skb(struct hci_dev *hdev, u16 opcode,
+ unsigned int size);
+int mgmt_send_event_skb(unsigned short channel, struct sk_buff *skb, int flag,
+ struct sock *skip_sk);
int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel,
void *data, u16 data_len, int flag, struct sock *skip_sk);
int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status);
@@ -39,15 +54,25 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status,
struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode,
struct hci_dev *hdev);
-struct mgmt_pending_cmd *mgmt_pending_find_data(unsigned short channel,
- u16 opcode,
- struct hci_dev *hdev,
- const void *data);
-void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev,
+void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, bool remove,
void (*cb)(struct mgmt_pending_cmd *cmd, void *data),
void *data);
struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode,
struct hci_dev *hdev,
void *data, u16 len);
+struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode,
+ struct hci_dev *hdev,
+ void *data, u16 len);
void mgmt_pending_free(struct mgmt_pending_cmd *cmd);
void mgmt_pending_remove(struct mgmt_pending_cmd *cmd);
+bool __mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd);
+bool mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd);
+bool mgmt_pending_valid(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd);
+void mgmt_mesh_foreach(struct hci_dev *hdev,
+ void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data),
+ void *data, struct sock *sk);
+struct mgmt_mesh_tx *mgmt_mesh_find(struct hci_dev *hdev, u8 handle);
+struct mgmt_mesh_tx *mgmt_mesh_next(struct hci_dev *hdev, struct sock *sk);
+struct mgmt_mesh_tx *mgmt_mesh_add(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len);
+void mgmt_mesh_remove(struct mgmt_mesh_tx *mesh_tx);
diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c
new file mode 100644
index 000000000000..c560d8467669
--- /dev/null
+++ b/net/bluetooth/msft.c
@@ -0,0 +1,1201 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Google Corporation
+ */
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/mgmt.h>
+
+#include "mgmt_util.h"
+#include "msft.h"
+
+#define MSFT_RSSI_THRESHOLD_VALUE_MIN -127
+#define MSFT_RSSI_THRESHOLD_VALUE_MAX 20
+#define MSFT_RSSI_LOW_TIMEOUT_MAX 0x3C
+
+#define MSFT_OP_READ_SUPPORTED_FEATURES 0x00
+struct msft_cp_read_supported_features {
+ __u8 sub_opcode;
+} __packed;
+
+struct msft_rp_read_supported_features {
+ __u8 status;
+ __u8 sub_opcode;
+ __le64 features;
+ __u8 evt_prefix_len;
+ __u8 evt_prefix[];
+} __packed;
+
+#define MSFT_OP_LE_MONITOR_ADVERTISEMENT 0x03
+#define MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN 0x01
+struct msft_le_monitor_advertisement_pattern {
+ __u8 length;
+ __u8 data_type;
+ __u8 start_byte;
+ __u8 pattern[];
+};
+
+struct msft_le_monitor_advertisement_pattern_data {
+ __u8 count;
+ __u8 data[];
+};
+
+struct msft_cp_le_monitor_advertisement {
+ __u8 sub_opcode;
+ __s8 rssi_high;
+ __s8 rssi_low;
+ __u8 rssi_low_interval;
+ __u8 rssi_sampling_period;
+ __u8 cond_type;
+ __u8 data[];
+} __packed;
+
+struct msft_rp_le_monitor_advertisement {
+ __u8 status;
+ __u8 sub_opcode;
+ __u8 handle;
+} __packed;
+
+#define MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT 0x04
+struct msft_cp_le_cancel_monitor_advertisement {
+ __u8 sub_opcode;
+ __u8 handle;
+} __packed;
+
+struct msft_rp_le_cancel_monitor_advertisement {
+ __u8 status;
+ __u8 sub_opcode;
+} __packed;
+
+#define MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE 0x05
+struct msft_cp_le_set_advertisement_filter_enable {
+ __u8 sub_opcode;
+ __u8 enable;
+} __packed;
+
+struct msft_rp_le_set_advertisement_filter_enable {
+ __u8 status;
+ __u8 sub_opcode;
+} __packed;
+
+#define MSFT_EV_LE_MONITOR_DEVICE 0x02
+struct msft_ev_le_monitor_device {
+ __u8 addr_type;
+ bdaddr_t bdaddr;
+ __u8 monitor_handle;
+ __u8 monitor_state;
+} __packed;
+
+struct msft_monitor_advertisement_handle_data {
+ __u8 msft_handle;
+ __u16 mgmt_handle;
+ __s8 rssi_high;
+ __s8 rssi_low;
+ __u8 rssi_low_interval;
+ __u8 rssi_sampling_period;
+ __u8 cond_type;
+ struct list_head list;
+};
+
+enum monitor_addr_filter_state {
+ AF_STATE_IDLE,
+ AF_STATE_ADDING,
+ AF_STATE_ADDED,
+ AF_STATE_REMOVING,
+};
+
+#define MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR 0x04
+struct msft_monitor_addr_filter_data {
+ __u8 msft_handle;
+ __u8 pattern_handle; /* address filters pertain to */
+ __u16 mgmt_handle;
+ int state;
+ __s8 rssi_high;
+ __s8 rssi_low;
+ __u8 rssi_low_interval;
+ __u8 rssi_sampling_period;
+ __u8 addr_type;
+ bdaddr_t bdaddr;
+ struct list_head list;
+};
+
+struct msft_data {
+ __u64 features;
+ __u8 evt_prefix_len;
+ __u8 *evt_prefix;
+ struct list_head handle_map;
+ struct list_head address_filters;
+ __u8 resuming;
+ __u8 suspending;
+ __u8 filter_enabled;
+ /* To synchronize add/remove address filter and monitor device event.*/
+ struct mutex filter_lock;
+};
+
+bool msft_monitor_supported(struct hci_dev *hdev)
+{
+	return (msft_get_features(hdev) & MSFT_FEATURE_MASK_LE_ADV_MONITOR) != 0;
+}
+
+static bool read_supported_features(struct hci_dev *hdev,
+				    struct msft_data *msft)
+{
+	struct msft_cp_read_supported_features cp;
+	struct msft_rp_read_supported_features *rp;
+	struct sk_buff *skb;
+
+	cp.sub_opcode = MSFT_OP_READ_SUPPORTED_FEATURES;
+	skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp,
+			     HCI_CMD_TIMEOUT);
+	if (IS_ERR(skb)) {
+		bt_dev_err(hdev, "Failed to read MSFT supported features (%ld)",
+			   PTR_ERR(skb));
+		return false;
+	}
+
+	if (skb->len < sizeof(*rp)) {
+		bt_dev_err(hdev, "MSFT supported features length mismatch");
+		goto failed;
+	}
+
+	rp = (struct msft_rp_read_supported_features *)skb->data;
+	/* evt_prefix[] trails the fixed header and must fit in the reply */
+	if (rp->sub_opcode != MSFT_OP_READ_SUPPORTED_FEATURES ||
+	    skb->len < sizeof(*rp) + rp->evt_prefix_len)
+		goto failed;
+
+	if (rp->evt_prefix_len > 0) {
+		msft->evt_prefix = kmemdup(rp->evt_prefix, rp->evt_prefix_len,
+					   GFP_KERNEL);
+		if (!msft->evt_prefix)
+			goto failed;
+	}
+
+	msft->evt_prefix_len = rp->evt_prefix_len;
+	msft->features = __le64_to_cpu(rp->features);
+
+	if (msft->features & MSFT_FEATURE_MASK_CURVE_VALIDITY)
+		hdev->msft_curve_validity = true;
+
+	kfree_skb(skb);
+	return true;
+
+failed:
+	kfree_skb(skb);
+	return false;
+}
+
+/* is_mgmt = true matches the handle exposed to userspace via mgmt.
+ * is_mgmt = false matches the handle used by the msft controller.
+ * This function requires the caller holds hdev->lock
+ */
+static struct msft_monitor_advertisement_handle_data *msft_find_handle_data
+ (struct hci_dev *hdev, u16 handle, bool is_mgmt)
+{
+ struct msft_monitor_advertisement_handle_data *entry;
+ struct msft_data *msft = hdev->msft_data;
+
+ list_for_each_entry(entry, &msft->handle_map, list) {
+ if (is_mgmt && entry->mgmt_handle == handle)
+ return entry;
+ if (!is_mgmt && entry->msft_handle == handle)
+ return entry;
+ }
+
+ return NULL;
+}
+
+/* This function requires the caller holds msft->filter_lock */
+static struct msft_monitor_addr_filter_data *msft_find_address_data
+ (struct hci_dev *hdev, u8 addr_type, bdaddr_t *addr,
+ u8 pattern_handle)
+{
+ struct msft_monitor_addr_filter_data *entry;
+ struct msft_data *msft = hdev->msft_data;
+
+ list_for_each_entry(entry, &msft->address_filters, list) {
+ if (entry->pattern_handle == pattern_handle &&
+ addr_type == entry->addr_type &&
+ !bacmp(addr, &entry->bdaddr))
+ return entry;
+ }
+
+ return NULL;
+}
+
+/* This function requires the caller holds hdev->lock */
+static int msft_monitor_device_del(struct hci_dev *hdev, __u16 mgmt_handle,
+ bdaddr_t *bdaddr, __u8 addr_type,
+ bool notify)
+{
+ struct monitored_device *dev, *tmp;
+ int count = 0;
+
+ list_for_each_entry_safe(dev, tmp, &hdev->monitored_devices, list) {
+ /* mgmt_handle == 0 indicates remove all devices, whereas,
+ * bdaddr == NULL indicates remove all devices matching the
+ * mgmt_handle.
+ */
+ if ((!mgmt_handle || dev->handle == mgmt_handle) &&
+ (!bdaddr || (!bacmp(bdaddr, &dev->bdaddr) &&
+ addr_type == dev->addr_type))) {
+ if (notify && dev->notified) {
+ mgmt_adv_monitor_device_lost(hdev, dev->handle,
+ &dev->bdaddr,
+ dev->addr_type);
+ }
+
+ list_del(&dev->list);
+ kfree(dev);
+ count++;
+ }
+ }
+
+ return count;
+}
+
+static int msft_le_monitor_advertisement_cb(struct hci_dev *hdev, u16 opcode,
+ struct adv_monitor *monitor,
+ struct sk_buff *skb)
+{
+ struct msft_rp_le_monitor_advertisement *rp;
+ struct msft_monitor_advertisement_handle_data *handle_data;
+ struct msft_data *msft = hdev->msft_data;
+ int status = 0;
+
+ hci_dev_lock(hdev);
+
+ rp = (struct msft_rp_le_monitor_advertisement *)skb->data;
+ if (skb->len < sizeof(*rp)) {
+ status = HCI_ERROR_UNSPECIFIED;
+ goto unlock;
+ }
+
+ status = rp->status;
+ if (status)
+ goto unlock;
+
+ handle_data = kmalloc(sizeof(*handle_data), GFP_KERNEL);
+ if (!handle_data) {
+ status = HCI_ERROR_UNSPECIFIED;
+ goto unlock;
+ }
+
+ handle_data->mgmt_handle = monitor->handle;
+ handle_data->msft_handle = rp->handle;
+ handle_data->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN;
+ INIT_LIST_HEAD(&handle_data->list);
+ list_add(&handle_data->list, &msft->handle_map);
+
+ monitor->state = ADV_MONITOR_STATE_OFFLOADED;
+
+unlock:
+ if (status)
+ hci_free_adv_monitor(hdev, monitor);
+
+ hci_dev_unlock(hdev);
+
+ return status;
+}
+
+/* This function requires the caller holds hci_req_sync_lock */
+static void msft_remove_addr_filters_sync(struct hci_dev *hdev, u8 handle)
+{
+ struct msft_monitor_addr_filter_data *address_filter, *n;
+ struct msft_cp_le_cancel_monitor_advertisement cp;
+ struct msft_data *msft = hdev->msft_data;
+ struct list_head head;
+ struct sk_buff *skb;
+
+ INIT_LIST_HEAD(&head);
+
+ /* Cancel all corresponding address monitors */
+ mutex_lock(&msft->filter_lock);
+
+ list_for_each_entry_safe(address_filter, n, &msft->address_filters,
+ list) {
+ if (address_filter->pattern_handle != handle)
+ continue;
+
+ list_del(&address_filter->list);
+
+ /* Keep the address filter and let
+ * msft_add_address_filter_sync() remove and free the address
+ * filter.
+ */
+ if (address_filter->state == AF_STATE_ADDING) {
+ address_filter->state = AF_STATE_REMOVING;
+ continue;
+ }
+
+ /* Keep the address filter and let
+ * msft_cancel_address_filter_sync() remove and free the address
+ * filter
+ */
+ if (address_filter->state == AF_STATE_REMOVING)
+ continue;
+
+ list_add_tail(&address_filter->list, &head);
+ }
+
+ mutex_unlock(&msft->filter_lock);
+
+ list_for_each_entry_safe(address_filter, n, &head, list) {
+ list_del(&address_filter->list);
+
+ cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT;
+ cp.handle = address_filter->msft_handle;
+
+ skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+ if (IS_ERR(skb)) {
+ kfree(address_filter);
+ continue;
+ }
+
+ kfree_skb(skb);
+
+ bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter",
+ &address_filter->bdaddr);
+
+ kfree(address_filter);
+ }
+}
+
+static int msft_le_cancel_monitor_advertisement_cb(struct hci_dev *hdev,
+ u16 opcode,
+ struct adv_monitor *monitor,
+ struct sk_buff *skb)
+{
+ struct msft_rp_le_cancel_monitor_advertisement *rp;
+ struct msft_monitor_advertisement_handle_data *handle_data;
+ struct msft_data *msft = hdev->msft_data;
+ int status = 0;
+ u8 msft_handle;
+
+ rp = (struct msft_rp_le_cancel_monitor_advertisement *)skb->data;
+ if (skb->len < sizeof(*rp)) {
+ status = HCI_ERROR_UNSPECIFIED;
+ goto done;
+ }
+
+ status = rp->status;
+ if (status)
+ goto done;
+
+ hci_dev_lock(hdev);
+
+ handle_data = msft_find_handle_data(hdev, monitor->handle, true);
+
+ if (handle_data) {
+ if (monitor->state == ADV_MONITOR_STATE_OFFLOADED)
+ monitor->state = ADV_MONITOR_STATE_REGISTERED;
+
+ /* Do not free the monitor if it is being removed due to
+ * suspend. It will be re-monitored on resume.
+ */
+ if (!msft->suspending) {
+ hci_free_adv_monitor(hdev, monitor);
+
+ /* Clear any monitored devices by this Adv Monitor */
+ msft_monitor_device_del(hdev, handle_data->mgmt_handle,
+ NULL, 0, false);
+ }
+
+ msft_handle = handle_data->msft_handle;
+
+ list_del(&handle_data->list);
+ kfree(handle_data);
+
+ hci_dev_unlock(hdev);
+
+ msft_remove_addr_filters_sync(hdev, msft_handle);
+ } else {
+ hci_dev_unlock(hdev);
+ }
+
+done:
+ return status;
+}
+
+/* This function requires the caller holds hci_req_sync_lock */
+static int msft_remove_monitor_sync(struct hci_dev *hdev,
+				    struct adv_monitor *monitor)
+{
+	struct msft_cp_le_cancel_monitor_advertisement cp;
+	struct msft_monitor_advertisement_handle_data *handle_data;
+	struct sk_buff *skb;
+	int err;
+
+	handle_data = msft_find_handle_data(hdev, monitor->handle, true);
+	/* If no matched handle, just remove without telling controller */
+	if (!handle_data)
+		return -ENOENT;
+	cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT;
+	cp.handle = handle_data->msft_handle;
+	skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp,
+			     HCI_CMD_TIMEOUT);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+	err = msft_le_cancel_monitor_advertisement_cb(hdev, hdev->msft_opcode,
+						      monitor, skb);
+	/* The callback only parses the reply, so it is released here */
+	kfree_skb(skb);
+	return err;
+}
+
+/* This function requires the caller holds hci_req_sync_lock */
+int msft_suspend_sync(struct hci_dev *hdev)
+{
+	struct msft_data *msft = hdev->msft_data;
+	struct adv_monitor *monitor;
+	int handle = 0;
+
+	if (!msft || !msft_monitor_supported(hdev))
+		return 0;
+
+	/* While set, the cancel callback does not free the monitors */
+	msft->suspending = true;
+
+	/* @handle is the lookup cursor given to idr_get_next(); it is bumped
+	 * after every monitor, presumably so the next lookup moves past it.
+	 */
+	while ((monitor = idr_get_next(&hdev->adv_monitors_idr, &handle))) {
+		/* The result is ignored: suspend carries on regardless */
+		msft_remove_monitor_sync(hdev, monitor);
+		handle++;
+	}
+
+	/* All monitors have been removed */
+	msft->suspending = false;
+
+	return 0;
+}
+
+/* Check the RSSI parameters of @monitor against the limits used for MSFT
+ * offloading. Returns true only when every field is acceptable.
+ */
+static bool msft_monitor_rssi_valid(struct adv_monitor *monitor)
+{
+	struct adv_rssi_thresholds *rssi = &monitor->rssi;
+
+	/* Both thresholds have to sit inside the allowed value range */
+	if (rssi->high_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN)
+		return false;
+	if (rssi->high_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX)
+		return false;
+	if (rssi->low_threshold < MSFT_RSSI_THRESHOLD_VALUE_MIN)
+		return false;
+	if (rssi->low_threshold > MSFT_RSSI_THRESHOLD_VALUE_MAX)
+		return false;
+
+	/* High_threshold_timeout is not supported,
+	 * once high_threshold is reached, events are immediately reported.
+	 */
+	if (rssi->high_threshold_timeout)
+		return false;
+
+	/* Sampling period from 0x00 to 0xFF are all allowed, so the low
+	 * threshold timeout is the last thing left to check.
+	 */
+	return rssi->low_threshold_timeout <= MSFT_RSSI_LOW_TIMEOUT_MAX;
+}
+
+/* Validate a pattern-based monitor before it is offloaded. */
+static bool msft_monitor_pattern_valid(struct adv_monitor *monitor)
+{
+	/* No additional check needed for pattern-based monitor beyond the
+	 * RSSI parameters.
+	 */
+	bool rssi_ok = msft_monitor_rssi_valid(monitor);
+
+	return rssi_ok;
+}
+
+/* Build an MSFT LE monitor-advertisement command from the patterns of
+ * @monitor, send it synchronously and, on success, cache the RSSI parameters
+ * that were sent in the monitor's handle data.
+ *
+ * Returns 0 on success, -EINVAL when the monitor parameters are rejected,
+ * -ENOMEM when the command buffer cannot be allocated, -ENODATA when no
+ * handle data is found after the reply was processed, or the error coming
+ * from the HCI command / msft_le_monitor_advertisement_cb().
+ */
+static int msft_add_monitor_sync(struct hci_dev *hdev,
+ struct adv_monitor *monitor)
+{
+ struct msft_cp_le_monitor_advertisement *cp;
+ struct msft_le_monitor_advertisement_pattern_data *pattern_data;
+ struct msft_monitor_advertisement_handle_data *handle_data;
+ struct msft_le_monitor_advertisement_pattern *pattern;
+ struct adv_pattern *entry;
+ size_t total_size = sizeof(*cp) + sizeof(*pattern_data);
+ ptrdiff_t offset = 0;
+ u8 pattern_count = 0;
+ struct sk_buff *skb;
+ int err;
+
+ if (!msft_monitor_pattern_valid(monitor))
+ return -EINVAL;
+
+ /* First pass: count the patterns and size the command buffer.
+ * NOTE(review): pattern_count is a u8; confirm callers bound the number
+ * of patterns so it cannot wrap.
+ */
+ list_for_each_entry(entry, &monitor->patterns, list) {
+ pattern_count++;
+ total_size += sizeof(*pattern) + entry->length;
+ }
+
+ cp = kmalloc(total_size, GFP_KERNEL);
+ if (!cp)
+ return -ENOMEM;
+
+ cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT;
+ cp->rssi_high = monitor->rssi.high_threshold;
+ cp->rssi_low = monitor->rssi.low_threshold;
+ cp->rssi_low_interval = (u8)monitor->rssi.low_threshold_timeout;
+ cp->rssi_sampling_period = monitor->rssi.sampling_period;
+
+ cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN;
+
+ pattern_data = (void *)cp->data;
+ pattern_data->count = pattern_count;
+
+ /* Second pass: serialise the patterns back to back after the count */
+ list_for_each_entry(entry, &monitor->patterns, list) {
+ pattern = (void *)(pattern_data->data + offset);
+ /* the length also includes data_type and offset */
+ pattern->length = entry->length + 2;
+ pattern->data_type = entry->ad_type;
+ pattern->start_byte = entry->offset;
+ memcpy(pattern->pattern, entry->value, entry->length);
+ offset += sizeof(*pattern) + entry->length;
+ }
+
+ skb = __hci_cmd_sync(hdev, hdev->msft_opcode, total_size, cp,
+ HCI_CMD_TIMEOUT);
+
+ if (IS_ERR(skb)) {
+ err = PTR_ERR(skb);
+ goto out_free;
+ }
+
+ /* NOTE(review): skb is not freed anywhere in this function; confirm
+ * that msft_le_monitor_advertisement_cb() releases it, otherwise the
+ * reply is leaked.
+ */
+ err = msft_le_monitor_advertisement_cb(hdev, hdev->msft_opcode,
+ monitor, skb);
+ if (err)
+ goto out_free;
+
+ handle_data = msft_find_handle_data(hdev, monitor->handle, true);
+ if (!handle_data) {
+ err = -ENODATA;
+ goto out_free;
+ }
+
+ /* Remember the RSSI settings that were sent for this monitor;
+ * msft_add_address_filter() copies them into new address filters.
+ */
+ handle_data->rssi_high = cp->rssi_high;
+ handle_data->rssi_low = cp->rssi_low;
+ handle_data->rssi_low_interval = cp->rssi_low_interval;
+ handle_data->rssi_sampling_period = cp->rssi_sampling_period;
+
+out_free:
+ kfree(cp);
+ return err;
+}
+
+/* Offload every monitor found in hdev->adv_monitors_idr to the controller
+ * again. msft->resuming is held true while the monitors are being added; the
+ * result of each individual add is ignored.
+ *
+ * This function requires the caller holds hci_req_sync_lock
+ */
+static void reregister_monitor(struct hci_dev *hdev)
+{
+	struct msft_data *msft = hdev->msft_data;
+	struct adv_monitor *monitor;
+	int handle;
+
+	if (!msft)
+		return;
+
+	msft->resuming = true;
+
+	/* idr_get_next() leaves the id it found in handle, so step past it
+	 * to reach the following entry.
+	 */
+	for (handle = 0;
+	     (monitor = idr_get_next(&hdev->adv_monitors_idr, &handle));
+	     handle++)
+		msft_add_monitor_sync(hdev, monitor);
+
+	/* All monitors have been reregistered */
+	msft->resuming = false;
+}
+
+/* Forget the devices tracked before suspend and offload the monitors again.
+ * Always returns 0.
+ *
+ * This function requires the caller holds hci_req_sync_lock
+ */
+int msft_resume_sync(struct hci_dev *hdev)
+{
+	struct msft_data *msft = hdev->msft_data;
+
+	if (!msft)
+		return 0;
+
+	if (!msft_monitor_supported(hdev))
+		return 0;
+
+	/* Clear already tracked devices on resume. Once the monitors are
+	 * reregistered, devices in range will be found again after resume.
+	 */
+	hci_dev_lock(hdev);
+	hdev->advmon_pend_notify = false;
+	msft_monitor_device_del(hdev, 0, NULL, 0, true);
+	hci_dev_unlock(hdev);
+
+	reregister_monitor(hdev);
+
+	return 0;
+}
+
+/* (Re)initialise the MSFT extension state when the controller is opened:
+ * drop the cached event prefix and features, read them again, and when
+ * monitoring is supported enable the filter and re-offload all monitors.
+ *
+ * This function requires the caller holds hci_req_sync_lock
+ */
+void msft_do_open(struct hci_dev *hdev)
+{
+ struct msft_data *msft = hdev->msft_data;
+
+ /* Nothing to do when no MSFT opcode is set for this controller */
+ if (hdev->msft_opcode == HCI_OP_NOP)
+ return;
+
+ if (!msft) {
+ bt_dev_err(hdev, "MSFT extension not registered");
+ return;
+ }
+
+ bt_dev_dbg(hdev, "Initialize MSFT extension");
+
+ /* Reset existing MSFT data before re-reading */
+ kfree(msft->evt_prefix);
+ msft->evt_prefix = NULL;
+ msft->evt_prefix_len = 0;
+ msft->features = 0;
+
+ /* On failure the extension is detached from hdev and freed.
+ * NOTE(review): unlike msft_release(), this path does not call
+ * mutex_destroy() on filter_lock before freeing - confirm intended.
+ */
+ if (!read_supported_features(hdev, msft)) {
+ hdev->msft_data = NULL;
+ kfree(msft);
+ return;
+ }
+
+ if (msft_monitor_supported(hdev)) {
+ /* resuming is set here and again inside reregister_monitor(),
+ * which clears it once all monitors were re-added.
+ */
+ msft->resuming = true;
+ msft_set_filter_enable(hdev, true);
+ /* Monitors get removed on power off, so we need to explicitly
+ * tell the controller to re-monitor.
+ */
+ reregister_monitor(hdev);
+ }
+}
+
+/* Tear down the host-side MSFT bookkeeping when the controller is closed:
+ * free every handle mapping and address filter, move offloaded monitors back
+ * to the registered state and drop all monitored devices. No command is sent
+ * to the controller from here.
+ */
+void msft_do_close(struct hci_dev *hdev)
+{
+ struct msft_data *msft = hdev->msft_data;
+ struct msft_monitor_advertisement_handle_data *handle_data, *tmp;
+ struct msft_monitor_addr_filter_data *address_filter, *n;
+ struct adv_monitor *monitor;
+
+ if (!msft)
+ return;
+
+ bt_dev_dbg(hdev, "Cleanup of MSFT extension");
+
+ /* The controller will silently remove all monitors on power off.
+ * Therefore, remove handle_data mapping and reset monitor state.
+ */
+ list_for_each_entry_safe(handle_data, tmp, &msft->handle_map, list) {
+ monitor = idr_find(&hdev->adv_monitors_idr,
+ handle_data->mgmt_handle);
+
+ if (monitor && monitor->state == ADV_MONITOR_STATE_OFFLOADED)
+ monitor->state = ADV_MONITOR_STATE_REGISTERED;
+
+ list_del(&handle_data->list);
+ kfree(handle_data);
+ }
+
+ /* Free every address filter; the list is protected by filter_lock */
+ mutex_lock(&msft->filter_lock);
+ list_for_each_entry_safe(address_filter, n, &msft->address_filters,
+ list) {
+ list_del(&address_filter->list);
+ kfree(address_filter);
+ }
+ mutex_unlock(&msft->filter_lock);
+
+ hci_dev_lock(hdev);
+
+ /* Clear any devices that are being monitored and notify device lost */
+ hdev->advmon_pend_notify = false;
+ msft_monitor_device_del(hdev, 0, NULL, 0, true);
+
+ hci_dev_unlock(hdev);
+}
+
+/* Queued work (see the hci_cmd_sync_queue() call in
+ * msft_monitor_device_evt()): unlink the address filter passed in @data and
+ * cancel it on the controller. Once the filter has been unlinked it is freed
+ * whether or not the cancel command succeeds.
+ *
+ * Returns 0 on success or when the device is no longer up, -EINVAL when the
+ * MSFT data is gone, or the error from the HCI command.
+ */
+static int msft_cancel_address_filter_sync(struct hci_dev *hdev, void *data)
+{
+ struct msft_monitor_addr_filter_data *address_filter = data;
+ struct msft_cp_le_cancel_monitor_advertisement cp;
+ struct msft_data *msft = hdev->msft_data;
+ struct sk_buff *skb;
+ int err = 0;
+
+ /* NOTE(review): address_filter is not freed on this path; confirm it
+ * cannot still be allocated once msft_data has been released.
+ */
+ if (!msft) {
+ bt_dev_err(hdev, "MSFT: msft data is freed");
+ return -EINVAL;
+ }
+
+ /* The address filter has been removed by hci dev close */
+ if (!test_bit(HCI_UP, &hdev->flags))
+ return 0;
+
+ /* Unlink first so the filter can no longer be found on the list */
+ mutex_lock(&msft->filter_lock);
+ list_del(&address_filter->list);
+ mutex_unlock(&msft->filter_lock);
+
+ cp.sub_opcode = MSFT_OP_LE_CANCEL_MONITOR_ADVERTISEMENT;
+ cp.handle = address_filter->msft_handle;
+
+ skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+ if (IS_ERR(skb)) {
+ bt_dev_err(hdev, "MSFT: Failed to cancel address (%pMR) filter",
+ &address_filter->bdaddr);
+ err = PTR_ERR(skb);
+ goto done;
+ }
+ /* The reply content is not inspected, only released */
+ kfree_skb(skb);
+
+ bt_dev_dbg(hdev, "MSFT: Canceled device %pMR address filter",
+ &address_filter->bdaddr);
+
+done:
+ kfree(address_filter);
+
+ return err;
+}
+
+/* Allocate the per-controller MSFT extension state and attach it to @hdev.
+ * On allocation failure an error is logged and hdev->msft_data is left
+ * untouched.
+ */
+void msft_register(struct hci_dev *hdev)
+{
+	struct msft_data *msft;
+
+	bt_dev_dbg(hdev, "Register MSFT extension");
+
+	msft = kzalloc(sizeof(*msft), GFP_KERNEL);
+	if (!msft) {
+		bt_dev_err(hdev, "Failed to register MSFT extension");
+		return;
+	}
+
+	INIT_LIST_HEAD(&msft->handle_map);
+	INIT_LIST_HEAD(&msft->address_filters);
+	mutex_init(&msft->filter_lock);
+
+	/* Publish the structure only after its lists and lock are fully
+	 * initialised, so nothing reaching it through hdev->msft_data can see
+	 * an uninitialised filter_lock.
+	 */
+	hdev->msft_data = msft;
+}
+
+/* Detach the MSFT extension state from @hdev and free it, including the
+ * cached event prefix. Does nothing when no state is attached.
+ */
+void msft_release(struct hci_dev *hdev)
+{
+	struct msft_data *msft = hdev->msft_data;
+
+	if (msft == NULL)
+		return;
+
+	bt_dev_dbg(hdev, "Unregister MSFT extension");
+
+	/* Detach from hdev before tearing the structure down */
+	hdev->msft_data = NULL;
+
+	kfree(msft->evt_prefix);
+	mutex_destroy(&msft->filter_lock);
+	kfree(msft);
+}
+
+/* Record a newly reported device on hdev->monitored_devices and flag that a
+ * notification is pending. On allocation failure an error is logged and the
+ * device is not recorded.
+ *
+ * This function requires the caller holds hdev->lock
+ */
+static void msft_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr,
+			      __u8 addr_type, __u16 mgmt_handle)
+{
+	struct monitored_device *entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+
+	if (!entry) {
+		bt_dev_err(hdev, "MSFT vendor event %u: no memory",
+			   MSFT_EV_LE_MONITOR_DEVICE);
+		return;
+	}
+
+	/* kmalloc() does not zero the memory, so each field used is set */
+	entry->handle = mgmt_handle;
+	entry->addr_type = addr_type;
+	entry->notified = false;
+	bacpy(&entry->bdaddr, bdaddr);
+
+	INIT_LIST_HEAD(&entry->list);
+	list_add(&entry->list, &hdev->monitored_devices);
+	hdev->advmon_pend_notify = true;
+}
+
+/* This function requires the caller holds hdev->lock */
+static void msft_device_lost(struct hci_dev *hdev, bdaddr_t *bdaddr,
+ __u8 addr_type, __u16 mgmt_handle)
+{
+ if (!msft_monitor_device_del(hdev, mgmt_handle, bdaddr, addr_type,
+ true)) {
+ bt_dev_err(hdev, "MSFT vendor event %u: dev %pMR not in list",
+ MSFT_EV_LE_MONITOR_DEVICE, bdaddr);
+ }
+}
+
+/* Pull @len bytes from the front of @skb and return a pointer to them.
+ * Returns NULL and logs an error naming event code @ev when skb_pull_data()
+ * fails.
+ */
+static void *msft_skb_pull(struct hci_dev *hdev, struct sk_buff *skb,
+			   u8 ev, size_t len)
+{
+	void *chunk = skb_pull_data(skb, len);
+
+	if (chunk)
+		return chunk;
+
+	bt_dev_err(hdev, "Malformed MSFT vendor event: 0x%02x", ev);
+
+	return NULL;
+}
+
+/* Queued work (see msft_add_address_filter()): offload the address filter
+ * passed in @data to the controller. On success the filter moves to
+ * AF_STATE_ADDED and stores the handle from the reply; on any failure after
+ * the early checks it is unlinked and freed.
+ *
+ * Returns -EINVAL when the MSFT data is gone and -ENODEV when the device is
+ * not up; every other path returns 0, including the failure paths that drop
+ * the filter.
+ */
+static int msft_add_address_filter_sync(struct hci_dev *hdev, void *data)
+{
+ struct msft_monitor_addr_filter_data *address_filter = data;
+ struct msft_rp_le_monitor_advertisement *rp;
+ struct msft_cp_le_monitor_advertisement *cp;
+ struct msft_data *msft = hdev->msft_data;
+ struct sk_buff *skb = NULL;
+ bool remove = false;
+ size_t size;
+
+ if (!msft) {
+ bt_dev_err(hdev, "MSFT: msft data is freed");
+ return -EINVAL;
+ }
+
+ /* The address filter has been removed by hci dev close */
+ if (!test_bit(HCI_UP, &hdev->flags))
+ return -ENODEV;
+
+ /* We are safe to use the address filter from now on.
+ * msft_monitor_device_evt() wouldn't delete this filter because it's
+ * not been added by now.
+ * And all other functions that requiring hci_req_sync_lock wouldn't
+ * touch this filter before this func completes because it's protected
+ * by hci_req_sync_lock.
+ */
+
+ /* A filter already marked for removal is dropped without sending
+ * anything to the controller.
+ */
+ if (address_filter->state == AF_STATE_REMOVING) {
+ mutex_lock(&msft->filter_lock);
+ list_del(&address_filter->list);
+ mutex_unlock(&msft->filter_lock);
+ kfree(address_filter);
+ return 0;
+ }
+
+ /* Command = fixed header + address type + address */
+ size = sizeof(*cp) +
+ sizeof(address_filter->addr_type) +
+ sizeof(address_filter->bdaddr);
+ cp = kzalloc(size, GFP_KERNEL);
+ if (!cp) {
+ bt_dev_err(hdev, "MSFT: Alloc cmd param err");
+ remove = true;
+ goto done;
+ }
+
+ cp->sub_opcode = MSFT_OP_LE_MONITOR_ADVERTISEMENT;
+ cp->rssi_high = address_filter->rssi_high;
+ cp->rssi_low = address_filter->rssi_low;
+ cp->rssi_low_interval = address_filter->rssi_low_interval;
+ cp->rssi_sampling_period = address_filter->rssi_sampling_period;
+ cp->cond_type = MSFT_MONITOR_ADVERTISEMENT_TYPE_ADDR;
+ cp->data[0] = address_filter->addr_type;
+ memcpy(&cp->data[1], &address_filter->bdaddr,
+ sizeof(address_filter->bdaddr));
+
+ skb = __hci_cmd_sync(hdev, hdev->msft_opcode, size, cp,
+ HCI_CMD_TIMEOUT);
+ kfree(cp);
+
+ if (IS_ERR(skb)) {
+ bt_dev_err(hdev, "Failed to enable address %pMR filter",
+ &address_filter->bdaddr);
+ /* Reset to NULL so the kfree_skb() at the end is a no-op */
+ skb = NULL;
+ remove = true;
+ goto done;
+ }
+
+ rp = skb_pull_data(skb, sizeof(*rp));
+ if (!rp || rp->sub_opcode != MSFT_OP_LE_MONITOR_ADVERTISEMENT ||
+ rp->status)
+ remove = true;
+
+done:
+ mutex_lock(&msft->filter_lock);
+
+ if (remove) {
+ bt_dev_warn(hdev, "MSFT: Remove address (%pMR) filter",
+ &address_filter->bdaddr);
+ list_del(&address_filter->list);
+ kfree(address_filter);
+ } else {
+ /* rp is only read here; every path that reaches done without a
+ * valid rp has set remove.
+ */
+ address_filter->state = AF_STATE_ADDED;
+ address_filter->msft_handle = rp->handle;
+ bt_dev_dbg(hdev, "MSFT: Address %pMR filter enabled",
+ &address_filter->bdaddr);
+ }
+ mutex_unlock(&msft->filter_lock);
+
+ kfree_skb(skb);
+
+ return 0;
+}
+
+/* Create an address filter for @bdaddr derived from the pattern monitor
+ * described by @handle_data, link it on msft->address_filters in
+ * AF_STATE_ADDING and queue msft_add_address_filter_sync() to offload it.
+ *
+ * Returns the new filter, or NULL when the allocation or the queueing fails
+ * (in which case nothing stays on the list).
+ *
+ * This function requires the caller holds msft->filter_lock
+ */
+static struct msft_monitor_addr_filter_data *msft_add_address_filter
+ (struct hci_dev *hdev, u8 addr_type, bdaddr_t *bdaddr,
+ struct msft_monitor_advertisement_handle_data *handle_data)
+{
+ struct msft_monitor_addr_filter_data *address_filter = NULL;
+ struct msft_data *msft = hdev->msft_data;
+ int err;
+
+ address_filter = kzalloc(sizeof(*address_filter), GFP_KERNEL);
+ if (!address_filter)
+ return NULL;
+
+ address_filter->state = AF_STATE_ADDING;
+ /* Placeholder; msft_add_address_filter_sync() stores the handle from
+ * the controller reply once the filter has been added.
+ */
+ address_filter->msft_handle = 0xff;
+ address_filter->pattern_handle = handle_data->msft_handle;
+ address_filter->mgmt_handle = handle_data->mgmt_handle;
+ address_filter->rssi_high = handle_data->rssi_high;
+ address_filter->rssi_low = handle_data->rssi_low;
+ address_filter->rssi_low_interval = handle_data->rssi_low_interval;
+ address_filter->rssi_sampling_period = handle_data->rssi_sampling_period;
+ address_filter->addr_type = addr_type;
+ bacpy(&address_filter->bdaddr, bdaddr);
+
+ /* With the above AF_STATE_ADDING, duplicated address filter can be
+ * avoided when receiving monitor device event (found/lost) frequently
+ * for the same device.
+ */
+ list_add_tail(&address_filter->list, &msft->address_filters);
+
+ err = hci_cmd_sync_queue(hdev, msft_add_address_filter_sync,
+ address_filter, NULL);
+ if (err < 0) {
+ /* Queueing failed: undo the list insertion and free the filter */
+ bt_dev_err(hdev, "MSFT: Add address %pMR filter err", bdaddr);
+ list_del(&address_filter->list);
+ kfree(address_filter);
+ return NULL;
+ }
+
+ bt_dev_dbg(hdev, "MSFT: Add device %pMR address filter",
+ &address_filter->bdaddr);
+
+ return address_filter;
+}
+
+/* Handle an MSFT LE monitor device event carried in @skb.
+ *
+ * Without HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER the found/lost state is
+ * reported directly for the monitor that owns the event handle. With the
+ * quirk, events from pattern monitors are not reported; they only trigger
+ * creation of a per-device address filter, and it is the events from those
+ * address filters that are reported.
+ *
+ * This function requires the caller holds hdev->lock
+ * (msft_vendor_evt() additionally holds msft->filter_lock around the call)
+ */
+static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct msft_monitor_addr_filter_data *n, *address_filter = NULL;
+ struct msft_ev_le_monitor_device *ev;
+ struct msft_monitor_advertisement_handle_data *handle_data;
+ struct msft_data *msft = hdev->msft_data;
+ u16 mgmt_handle = 0xffff;
+ u8 addr_type;
+
+ ev = msft_skb_pull(hdev, skb, MSFT_EV_LE_MONITOR_DEVICE, sizeof(*ev));
+ if (!ev)
+ return;
+
+ bt_dev_dbg(hdev,
+ "MSFT vendor event 0x%02x: handle 0x%04x state %d addr %pMR",
+ MSFT_EV_LE_MONITOR_DEVICE, ev->monitor_handle,
+ ev->monitor_state, &ev->bdaddr);
+
+ handle_data = msft_find_handle_data(hdev, ev->monitor_handle, false);
+
+ /* No address-filter quirk: report straight from the monitor handle */
+ if (!hci_test_quirk(hdev, HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER)) {
+ if (!handle_data)
+ return;
+ mgmt_handle = handle_data->mgmt_handle;
+ goto report_state;
+ }
+
+ if (handle_data) {
+ /* Don't report any device found/lost event from pattern
+ * monitors. Pattern monitor always has its address filters for
+ * tracking devices.
+ */
+
+ address_filter = msft_find_address_data(hdev, ev->addr_type,
+ &ev->bdaddr,
+ handle_data->msft_handle);
+ if (address_filter)
+ return;
+
+ /* Only a "found" event from a pattern monitor creates a filter */
+ if (ev->monitor_state && handle_data->cond_type ==
+ MSFT_MONITOR_ADVERTISEMENT_TYPE_PATTERN)
+ msft_add_address_filter(hdev, ev->addr_type,
+ &ev->bdaddr, handle_data);
+
+ return;
+ }
+
+ /* This device event is not from pattern monitor.
+ * Report it if there is a corresponding address_filter for it.
+ */
+ list_for_each_entry(n, &msft->address_filters, list) {
+ if (n->state == AF_STATE_ADDED &&
+ n->msft_handle == ev->monitor_handle) {
+ mgmt_handle = n->mgmt_handle;
+ address_filter = n;
+ break;
+ }
+ }
+
+ if (!address_filter) {
+ bt_dev_warn(hdev, "MSFT: Unexpected device event %pMR, %u, %u",
+ &ev->bdaddr, ev->monitor_handle, ev->monitor_state);
+ return;
+ }
+
+report_state:
+ /* Map the address type from the event to the BDADDR_LE_* value that
+ * is recorded for the device.
+ */
+ switch (ev->addr_type) {
+ case ADDR_LE_DEV_PUBLIC:
+ addr_type = BDADDR_LE_PUBLIC;
+ break;
+
+ case ADDR_LE_DEV_RANDOM:
+ addr_type = BDADDR_LE_RANDOM;
+ break;
+
+ default:
+ bt_dev_err(hdev,
+ "MSFT vendor event 0x%02x: unknown addr type 0x%02x",
+ MSFT_EV_LE_MONITOR_DEVICE, ev->addr_type);
+ return;
+ }
+
+ if (ev->monitor_state) {
+ msft_device_found(hdev, &ev->bdaddr, addr_type, mgmt_handle);
+ } else {
+ /* Device lost: schedule cancellation of its address filter.
+ * NOTE(review): the hci_cmd_sync_queue() result is ignored, so
+ * on failure the filter would stay in AF_STATE_REMOVING -
+ * confirm this is acceptable.
+ */
+ if (address_filter && address_filter->state == AF_STATE_ADDED) {
+ address_filter->state = AF_STATE_REMOVING;
+ hci_cmd_sync_queue(hdev,
+ msft_cancel_address_filter_sync,
+ address_filter,
+ NULL);
+ }
+ msft_device_lost(hdev, &ev->bdaddr, addr_type, mgmt_handle);
+ }
+}
+
+/* Entry point for vendor events that may belong to the MSFT extension.
+ * Strips and checks the configured event prefix (if any), reads the one-byte
+ * event code and dispatches it with hdev->lock held. Events that do not
+ * match the prefix or are too short are silently ignored. @data is not used.
+ */
+void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb)
+{
+ struct msft_data *msft = hdev->msft_data;
+ u8 *evt_prefix;
+ u8 *evt;
+
+ if (!msft)
+ return;
+
+ /* When the extension has defined an event prefix, check that it
+ * matches, and otherwise just return.
+ */
+ if (msft->evt_prefix_len > 0) {
+ evt_prefix = msft_skb_pull(hdev, skb, 0, msft->evt_prefix_len);
+ if (!evt_prefix)
+ return;
+
+ if (memcmp(evt_prefix, msft->evt_prefix, msft->evt_prefix_len))
+ return;
+ }
+
+ /* Every event starts at least with an event code and the rest of
+ * the data is variable and depends on the event code.
+ */
+ if (skb->len < 1)
+ return;
+
+ /* The event code is not known yet, so 0 is passed as the code that
+ * msft_skb_pull() would print on failure.
+ */
+ evt = msft_skb_pull(hdev, skb, 0, sizeof(*evt));
+ if (!evt)
+ return;
+
+ hci_dev_lock(hdev);
+
+ switch (*evt) {
+ case MSFT_EV_LE_MONITOR_DEVICE:
+ /* filter_lock is taken inside hdev->lock for the handler */
+ mutex_lock(&msft->filter_lock);
+ msft_monitor_device_evt(hdev, skb);
+ mutex_unlock(&msft->filter_lock);
+ break;
+
+ default:
+ bt_dev_dbg(hdev, "MSFT vendor event 0x%02x", *evt);
+ break;
+ }
+
+ hci_dev_unlock(hdev);
+}
+
+/* Return the cached MSFT feature bits, or 0 when no MSFT state is attached
+ * to @hdev.
+ */
+__u64 msft_get_features(struct hci_dev *hdev)
+{
+	struct msft_data *msft = hdev->msft_data;
+
+	if (!msft)
+		return 0;
+
+	return msft->features;
+}
+
+/* Record the outcome of a filter-enable command. @user_data points to the
+ * command parameters that were sent and @status is the command status. The
+ * cached msft->filter_enabled is updated only for status 0x00 or 0x0C.
+ * The caller must ensure hdev->msft_data is non-NULL; it is dereferenced
+ * without a check.
+ */
+static void msft_le_set_advertisement_filter_enable_cb(struct hci_dev *hdev,
+ void *user_data,
+ u8 status)
+{
+ struct msft_cp_le_set_advertisement_filter_enable *cp = user_data;
+ struct msft_data *msft = hdev->msft_data;
+
+ /* Error 0x0C would be returned if the filter enabled status is
+ * already set to whatever we were trying to set.
+ * Although the default state should be disabled, some controller set
+ * the initial value to enabled. Because there is no way to know the
+ * actual initial value before sending this command, here we also treat
+ * error 0x0C as success.
+ */
+ if (status != 0x00 && status != 0x0C)
+ return;
+
+ hci_dev_lock(hdev);
+
+ msft->filter_enabled = cp->enable;
+
+ if (status == 0x0C)
+ bt_dev_warn(hdev, "MSFT filter_enable is already %s",
+ cp->enable ? "on" : "off");
+
+ hci_dev_unlock(hdev);
+}
+
+/* Offload a pattern monitor to the controller.
+ *
+ * Returns -EOPNOTSUPP when no MSFT state is attached, -EBUSY while a suspend
+ * or resume pass is in progress, otherwise the result of
+ * msft_add_monitor_sync().
+ *
+ * This function requires the caller holds hci_req_sync_lock
+ */
+int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor)
+{
+	struct msft_data *msft = hdev->msft_data;
+	bool busy;
+
+	if (!msft)
+		return -EOPNOTSUPP;
+
+	busy = msft->resuming || msft->suspending;
+	if (busy)
+		return -EBUSY;
+
+	return msft_add_monitor_sync(hdev, monitor);
+}
+
+/* Remove an offloaded monitor from the controller.
+ *
+ * Returns -EOPNOTSUPP when no MSFT state is attached, -EBUSY while a suspend
+ * or resume pass is in progress, otherwise the result of
+ * msft_remove_monitor_sync().
+ *
+ * This function requires the caller holds hci_req_sync_lock
+ */
+int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor)
+{
+	struct msft_data *msft = hdev->msft_data;
+	bool busy;
+
+	if (!msft)
+		return -EOPNOTSUPP;
+
+	busy = msft->resuming || msft->suspending;
+	if (busy)
+		return -EBUSY;
+
+	return msft_remove_monitor_sync(hdev, monitor);
+}
+
+/* Send the MSFT set-advertisement-filter-enable command with @enable and let
+ * msft_le_set_advertisement_filter_enable_cb() record the outcome.
+ *
+ * Returns -EOPNOTSUPP when no MSFT state is attached, otherwise 0 - the
+ * command result is not propagated to the caller.
+ */
+int msft_set_filter_enable(struct hci_dev *hdev, bool enable)
+{
+ struct msft_cp_le_set_advertisement_filter_enable cp;
+ struct msft_data *msft = hdev->msft_data;
+ int err;
+
+ if (!msft)
+ return -EOPNOTSUPP;
+
+ cp.sub_opcode = MSFT_OP_LE_SET_ADVERTISEMENT_FILTER_ENABLE;
+ cp.enable = enable;
+ err = __hci_cmd_sync_status(hdev, hdev->msft_opcode, sizeof(cp), &cp,
+ HCI_CMD_TIMEOUT);
+
+ /* NOTE(review): err is an int but the callback takes a u8 status, so
+ * a negative errno would be truncated - confirm no such value can
+ * alias 0x00 or 0x0C.
+ */
+ msft_le_set_advertisement_filter_enable_cb(hdev, &cp, err);
+
+ return 0;
+}
+
+/* Return the msft_curve_validity flag stored in @hdev. */
+bool msft_curve_validity(struct hci_dev *hdev)
+{
+ return hdev->msft_curve_validity;
+}
diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h
new file mode 100644
index 000000000000..fe538e9c91c0
--- /dev/null
+++ b/net/bluetooth/msft.h
@@ -0,0 +1,78 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2020 Google Corporation
+ */
+
+#define MSFT_FEATURE_MASK_BREDR_RSSI_MONITOR BIT(0)
+#define MSFT_FEATURE_MASK_LE_CONN_RSSI_MONITOR BIT(1)
+#define MSFT_FEATURE_MASK_LE_ADV_RSSI_MONITOR BIT(2)
+#define MSFT_FEATURE_MASK_LE_ADV_MONITOR BIT(3)
+#define MSFT_FEATURE_MASK_CURVE_VALIDITY BIT(4)
+#define MSFT_FEATURE_MASK_CONCURRENT_ADV_MONITOR BIT(5)
+
+/* With CONFIG_BT_MSFTEXT enabled the functions below are declared as
+ * external symbols; otherwise the inline stubs in the #else branch are used.
+ */
+#if IS_ENABLED(CONFIG_BT_MSFTEXT)
+
+bool msft_monitor_supported(struct hci_dev *hdev);
+void msft_register(struct hci_dev *hdev);
+void msft_release(struct hci_dev *hdev);
+void msft_do_open(struct hci_dev *hdev);
+void msft_do_close(struct hci_dev *hdev);
+void msft_vendor_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb);
+__u64 msft_get_features(struct hci_dev *hdev);
+int msft_add_monitor_pattern(struct hci_dev *hdev, struct adv_monitor *monitor);
+int msft_remove_monitor(struct hci_dev *hdev, struct adv_monitor *monitor);
+/* NOTE(review): confirm a definition of msft_req_add_set_filter_enable()
+ * still exists in msft.c; otherwise this declaration is stale.
+ */
+void msft_req_add_set_filter_enable(struct hci_request *req, bool enable);
+int msft_set_filter_enable(struct hci_dev *hdev, bool enable);
+int msft_suspend_sync(struct hci_dev *hdev);
+int msft_resume_sync(struct hci_dev *hdev);
+bool msft_curve_validity(struct hci_dev *hdev);
+
+#else
+
+/* Stubs used when the MSFT extension is compiled out: the void functions do
+ * nothing, the queries report "unsupported" (false / 0) and the operations
+ * return -EOPNOTSUPP.
+ */
+static inline bool msft_monitor_supported(struct hci_dev *hdev)
+{
+ return false;
+}
+
+static inline void msft_register(struct hci_dev *hdev) {}
+static inline void msft_release(struct hci_dev *hdev) {}
+static inline void msft_do_open(struct hci_dev *hdev) {}
+static inline void msft_do_close(struct hci_dev *hdev) {}
+static inline void msft_vendor_evt(struct hci_dev *hdev, void *data,
+ struct sk_buff *skb) {}
+static inline __u64 msft_get_features(struct hci_dev *hdev) { return 0; }
+static inline int msft_add_monitor_pattern(struct hci_dev *hdev,
+ struct adv_monitor *monitor)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int msft_remove_monitor(struct hci_dev *hdev,
+ struct adv_monitor *monitor)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void msft_req_add_set_filter_enable(struct hci_request *req,
+ bool enable) {}
+static inline int msft_set_filter_enable(struct hci_dev *hdev, bool enable)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int msft_suspend_sync(struct hci_dev *hdev)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int msft_resume_sync(struct hci_dev *hdev)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool msft_curve_validity(struct hci_dev *hdev)
+{
+ return false;
+}
+
+#endif
diff --git a/net/bluetooth/rfcomm/Kconfig b/net/bluetooth/rfcomm/Kconfig
index 335df7515220..9b9953ebf4c0 100644
--- a/net/bluetooth/rfcomm/Kconfig
+++ b/net/bluetooth/rfcomm/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config BT_RFCOMM
tristate "RFCOMM protocol support"
depends on BT_BREDR
diff --git a/net/bluetooth/rfcomm/Makefile b/net/bluetooth/rfcomm/Makefile
index fe07988a3705..593e5c48c131 100644
--- a/net/bluetooth/rfcomm/Makefile
+++ b/net/bluetooth/rfcomm/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the Linux Bluetooth RFCOMM layer.
#
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index b98225d65e87..57b1dca8141f 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -28,19 +28,20 @@
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/kthread.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/l2cap.h>
#include <net/bluetooth/rfcomm.h>
+#include <trace/events/sock.h>
+
#define VERSION "1.11"
static bool disable_cfc;
static bool l2cap_ertm;
static int channel_mtu = -1;
-static unsigned int l2cap_mtu = RFCOMM_MAX_L2CAP_MTU;
static struct task_struct *rfcomm_thread;
@@ -73,8 +74,6 @@ static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s);
/* ---- RFCOMM frame parsing macros ---- */
#define __get_dlci(b) ((b & 0xfc) >> 2)
-#define __get_channel(b) ((b & 0xf8) >> 3)
-#define __get_dir(b) ((b & 0x04) >> 2)
#define __get_type(b) ((b & 0xef))
#define __test_ea(b) ((b & 0x01))
@@ -87,7 +86,6 @@ static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s);
#define __ctrl(type, pf) (((type & 0xef) | (pf << 4)))
#define __dlci(dir, chn) (((chn & 0x1f) << 1) | dir)
#define __srv_channel(dlci) (dlci >> 1)
-#define __dir(dlci) (dlci & 0x01)
#define __len8(len) (((len) << 1) | 1)
#define __len16(len) ((len) << 1)
@@ -190,6 +188,8 @@ static void rfcomm_l2state_change(struct sock *sk)
static void rfcomm_l2data_ready(struct sock *sk)
{
+ trace_sk_data_ready(sk);
+
BT_DBG("%p", sk);
rfcomm_schedule();
}
@@ -235,7 +235,7 @@ static int rfcomm_check_security(struct rfcomm_dlc *d)
static void rfcomm_session_timeout(struct timer_list *t)
{
- struct rfcomm_session *s = from_timer(s, t, timer);
+ struct rfcomm_session *s = timer_container_of(s, t, timer);
BT_DBG("session %p state %ld", s, s->state);
@@ -254,13 +254,13 @@ static void rfcomm_session_clear_timer(struct rfcomm_session *s)
{
BT_DBG("session %p state %ld", s, s->state);
- del_timer_sync(&s->timer);
+ timer_delete_sync(&s->timer);
}
/* ---- RFCOMM DLCs ---- */
static void rfcomm_dlc_timeout(struct timer_list *t)
{
- struct rfcomm_dlc *d = from_timer(d, t, timer);
+ struct rfcomm_dlc *d = timer_container_of(d, t, timer);
BT_DBG("dlc %p state %ld", d, d->state);
@@ -281,7 +281,7 @@ static void rfcomm_dlc_clear_timer(struct rfcomm_dlc *d)
{
BT_DBG("dlc %p state %ld", d, d->state);
- if (del_timer(&d->timer))
+ if (timer_delete(&d->timer))
rfcomm_dlc_put(d);
}
@@ -483,6 +483,7 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
/* if closing a dlc in a session that hasn't been started,
* just close and unlink the dlc
*/
+ fallthrough;
default:
rfcomm_dlc_clear_timer(d);
@@ -552,22 +553,58 @@ struct rfcomm_dlc *rfcomm_dlc_exists(bdaddr_t *src, bdaddr_t *dst, u8 channel)
return dlc;
}
+/* Turn one fragment into a UIH frame and append it to the DLC's tx queue.
+ *
+ * Returns the fragment's payload length as sampled before the frame is
+ * built, or -EINVAL when it exceeds the DLC MTU (the fragment is then
+ * neither modified nor queued, and is not freed here).
+ *
+ * Uses __skb_queue_tail(), which does no locking of its own; the caller
+ * rfcomm_dlc_send() holds d->tx_queue.lock around the call.
+ */
+static int rfcomm_dlc_send_frag(struct rfcomm_dlc *d, struct sk_buff *frag)
+{
+ int len = frag->len;
+
+ BT_DBG("dlc %p mtu %d len %d", d, d->mtu, len);
+
+ if (len > d->mtu)
+ return -EINVAL;
+
+ rfcomm_make_uih(frag, d->addr);
+ __skb_queue_tail(&d->tx_queue, frag);
+
+ return len;
+}
+
int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb)
{
- int len = skb->len;
+ unsigned long flags;
+ struct sk_buff *frag, *next;
+ int len;
if (d->state != BT_CONNECTED)
return -ENOTCONN;
- BT_DBG("dlc %p mtu %d len %d", d, d->mtu, len);
+ frag = skb_shinfo(skb)->frag_list;
+ skb_shinfo(skb)->frag_list = NULL;
- if (len > d->mtu)
- return -EINVAL;
+ /* Queue all fragments atomically. */
+ spin_lock_irqsave(&d->tx_queue.lock, flags);
- rfcomm_make_uih(skb, d->addr);
- skb_queue_tail(&d->tx_queue, skb);
+ len = rfcomm_dlc_send_frag(d, skb);
+ if (len < 0 || !frag)
+ goto unlock;
+
+ for (; frag; frag = next) {
+ int ret;
+
+ next = frag->next;
+
+ ret = rfcomm_dlc_send_frag(d, frag);
+ if (ret < 0) {
+ dev_kfree_skb_irq(frag);
+ goto unlock;
+ }
+
+ len += ret;
+ }
+
+unlock:
+ spin_unlock_irqrestore(&d->tx_queue.lock, flags);
- if (!test_bit(RFCOMM_TX_THROTTLED, &d->flags))
+ if (len > 0 && !test_bit(RFCOMM_TX_THROTTLED, &d->flags))
rfcomm_schedule();
return len;
}
@@ -744,14 +781,15 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src,
addr.l2_psm = 0;
addr.l2_cid = 0;
addr.l2_bdaddr_type = BDADDR_BREDR;
- *err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr));
+ *err = kernel_bind(sock, (struct sockaddr_unsized *)&addr, sizeof(addr));
if (*err < 0)
goto failed;
/* Set L2CAP options */
sk = sock->sk;
lock_sock(sk);
- l2cap_pi(sk)->chan->imtu = l2cap_mtu;
+ /* Set MTU to 0 so L2CAP can auto select the MTU */
+ l2cap_pi(sk)->chan->imtu = 0;
l2cap_pi(sk)->chan->sec_level = sec_level;
if (l2cap_ertm)
l2cap_pi(sk)->chan->mode = L2CAP_MODE_ERTM;
@@ -770,7 +808,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src,
addr.l2_psm = cpu_to_le16(L2CAP_PSM_RFCOMM);
addr.l2_cid = 0;
addr.l2_bdaddr_type = BDADDR_BREDR;
- *err = kernel_connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK);
+ *err = kernel_connect(sock, (struct sockaddr_unsized *)&addr, sizeof(addr), O_NONBLOCK);
if (*err == 0 || *err == -EINPROGRESS)
return s;
@@ -1903,7 +1941,7 @@ static struct rfcomm_session *rfcomm_process_rx(struct rfcomm_session *s)
/* Get data directly from socket receive queue without copying it. */
while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
skb_orphan(skb);
- if (!skb_linearize(skb)) {
+ if (!skb_linearize(skb) && sk->sk_state != BT_CLOSED) {
s = rfcomm_recv_frame(s, skb);
if (!s)
break;
@@ -1924,7 +1962,8 @@ static void rfcomm_accept_connection(struct rfcomm_session *s)
int err;
/* Fast check for a new connection.
- * Avoids unnesesary socket allocations. */
+ * Avoids unnecessary socket allocations.
+ */
if (list_empty(&bt_sk(sock->sk)->accept_q))
return;
@@ -2029,7 +2068,7 @@ static int rfcomm_add_listener(bdaddr_t *ba)
addr.l2_psm = cpu_to_le16(L2CAP_PSM_RFCOMM);
addr.l2_cid = 0;
addr.l2_bdaddr_type = BDADDR_BREDR;
- err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr));
+ err = kernel_bind(sock, (struct sockaddr_unsized *)&addr, sizeof(addr));
if (err < 0) {
BT_ERR("Bind failed %d", err);
goto failed;
@@ -2038,7 +2077,8 @@ static int rfcomm_add_listener(bdaddr_t *ba)
/* Set L2CAP options */
sk = sock->sk;
lock_sock(sk);
- l2cap_pi(sk)->chan->imtu = l2cap_mtu;
+ /* Set MTU to 0 so L2CAP can auto select the MTU */
+ l2cap_pi(sk)->chan->imtu = 0;
release_sock(sk);
/* Start listening on the socket */
@@ -2166,17 +2206,7 @@ static int rfcomm_dlc_debugfs_show(struct seq_file *f, void *x)
return 0;
}
-static int rfcomm_dlc_debugfs_open(struct inode *inode, struct file *file)
-{
- return single_open(file, rfcomm_dlc_debugfs_show, inode->i_private);
-}
-
-static const struct file_operations rfcomm_dlc_debugfs_fops = {
- .open = rfcomm_dlc_debugfs_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(rfcomm_dlc_debugfs);
static struct dentry *rfcomm_dlc_debugfs;
@@ -2246,9 +2276,6 @@ MODULE_PARM_DESC(disable_cfc, "Disable credit based flow control");
module_param(channel_mtu, int, 0644);
MODULE_PARM_DESC(channel_mtu, "Default MTU for the RFCOMM channel");
-module_param(l2cap_mtu, uint, 0644);
-MODULE_PARM_DESC(l2cap_mtu, "Default MTU for the L2CAP connection");
-
module_param(l2cap_ertm, bool, 0644);
MODULE_PARM_DESC(l2cap_ertm, "Use L2CAP ERTM mode for connection");
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index d606e9212291..be6639cd6f59 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -24,7 +24,7 @@
/*
* RFCOMM sockets.
*/
-
+#include <linux/compat.h>
#include <linux/export.h>
#include <linux/debugfs.h>
#include <linux/sched/signal.h>
@@ -64,15 +64,13 @@ static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb)
static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err)
{
struct sock *sk = d->owner, *parent;
- unsigned long flags;
if (!sk)
return;
BT_DBG("dlc %p state %ld err %d", d, d->state, err);
- local_irq_save(flags);
- bh_lock_sock(sk);
+ lock_sock(sk);
if (err)
sk->sk_err = err;
@@ -93,8 +91,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err)
sk->sk_state_change(sk);
}
- bh_unlock_sock(sk);
- local_irq_restore(flags);
+ release_sock(sk);
if (parent && sock_flag(sk, SOCK_ZAPPED)) {
/* We have to drop DLC lock here, otherwise
@@ -221,7 +218,7 @@ static void __rfcomm_sock_close(struct sock *sk)
case BT_CONFIG:
case BT_CONNECTED:
rfcomm_dlc_close(d, 0);
- /* fall through */
+ fallthrough;
default:
sock_set_flag(sk, SOCK_ZAPPED);
@@ -271,21 +268,19 @@ static struct proto rfcomm_proto = {
.obj_size = sizeof(struct rfcomm_pinfo)
};
-static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern)
+static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock,
+ int proto, gfp_t prio, int kern)
{
struct rfcomm_dlc *d;
struct sock *sk;
- sk = sk_alloc(net, PF_BLUETOOTH, prio, &rfcomm_proto, kern);
- if (!sk)
+ d = rfcomm_dlc_alloc(prio);
+ if (!d)
return NULL;
- sock_init_data(sock, sk);
- INIT_LIST_HEAD(&bt_sk(sk)->accept_q);
-
- d = rfcomm_dlc_alloc(prio);
- if (!d) {
- sk_free(sk);
+ sk = bt_sock_alloc(net, sock, &rfcomm_proto, proto, prio, kern);
+ if (!sk) {
+ rfcomm_dlc_free(d);
return NULL;
}
@@ -301,11 +296,6 @@ static struct sock *rfcomm_sock_alloc(struct net *net, struct socket *sock, int
sk->sk_sndbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10;
sk->sk_rcvbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10;
- sock_reset_flag(sk, SOCK_ZAPPED);
-
- sk->sk_protocol = proto;
- sk->sk_state = BT_OPEN;
-
bt_sock_link(&rfcomm_sk_list, sk);
BT_DBG("sk %p", sk);
@@ -334,7 +324,7 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock,
return 0;
}
-static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len)
+static int rfcomm_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len)
{
struct sockaddr_rc sa;
struct sock *sk = sock->sk;
@@ -381,7 +371,8 @@ done:
return err;
}
-static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags)
+static int rfcomm_sock_connect(struct socket *sock, struct sockaddr_unsized *addr,
+ int alen, int flags)
{
struct sockaddr_rc *sa = (struct sockaddr_rc *) addr;
struct sock *sk = sock->sk;
@@ -394,6 +385,7 @@ static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int a
addr->sa_family != AF_BLUETOOTH)
return -EINVAL;
+ sock_hold(sk);
lock_sock(sk);
if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) {
@@ -413,14 +405,18 @@ static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int a
d->sec_level = rfcomm_pi(sk)->sec_level;
d->role_switch = rfcomm_pi(sk)->role_switch;
+ /* Drop sock lock to avoid potential deadlock with the RFCOMM lock */
+ release_sock(sk);
err = rfcomm_dlc_open(d, &rfcomm_pi(sk)->src, &sa->rc_bdaddr,
sa->rc_channel);
- if (!err)
+ lock_sock(sk);
+ if (!err && !sock_flag(sk, SOCK_ZAPPED))
err = bt_sock_wait_state(sk, BT_CONNECTED,
sock_sndtimeo(sk, flags & O_NONBLOCK));
done:
release_sock(sk);
+ sock_put(sk);
return err;
}
@@ -473,8 +469,8 @@ done:
return err;
}
-static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags,
- bool kern)
+static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct sock *sk = sock->sk, *nsk;
@@ -488,7 +484,7 @@ static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int f
goto done;
}
- timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
BT_DBG("sk %p timeo %ld", sk, timeo);
@@ -578,46 +574,20 @@ static int rfcomm_sock_sendmsg(struct socket *sock, struct msghdr *msg,
lock_sock(sk);
sent = bt_sock_wait_ready(sk, msg->msg_flags);
- if (sent)
- goto done;
- while (len) {
- size_t size = min_t(size_t, len, d->mtu);
- int err;
-
- skb = sock_alloc_send_skb(sk, size + RFCOMM_SKB_RESERVE,
- msg->msg_flags & MSG_DONTWAIT, &err);
- if (!skb) {
- if (sent == 0)
- sent = err;
- break;
- }
- skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE);
-
- err = memcpy_from_msg(skb_put(skb, size), msg, size);
- if (err) {
- kfree_skb(skb);
- if (sent == 0)
- sent = err;
- break;
- }
-
- skb->priority = sk->sk_priority;
+ release_sock(sk);
- err = rfcomm_dlc_send(d, skb);
- if (err < 0) {
- kfree_skb(skb);
- if (sent == 0)
- sent = err;
- break;
- }
+ if (sent)
+ return sent;
- sent += size;
- len -= size;
- }
+ skb = bt_skb_sendmmsg(sk, msg, len, d->mtu, RFCOMM_SKB_HEAD_RESERVE,
+ RFCOMM_SKB_TAIL_RESERVE);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
-done:
- release_sock(sk);
+ sent = rfcomm_dlc_send(d, skb);
+ if (sent < 0)
+ kfree_skb(skb);
return sent;
}
@@ -647,7 +617,8 @@ static int rfcomm_sock_recvmsg(struct socket *sock, struct msghdr *msg,
return len;
}
-static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, unsigned int optlen)
+static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
int err = 0;
@@ -659,10 +630,9 @@ static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __u
switch (optname) {
case RFCOMM_LM:
- if (get_user(opt, (u32 __user *) optval)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
break;
- }
if (opt & RFCOMM_LM_FIPS) {
err = -EINVAL;
@@ -688,12 +658,12 @@ static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __u
return err;
}
-static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
+static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct bt_security sec;
int err = 0;
- size_t len;
u32 opt;
BT_DBG("sk %p", sk);
@@ -715,11 +685,9 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c
sec.level = BT_SECURITY_LOW;
- len = min_t(unsigned int, sizeof(sec), optlen);
- if (copy_from_user((char *) &sec, optval, len)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&sec, sizeof(sec), optval, optlen);
+ if (err)
break;
- }
if (sec.level > BT_SECURITY_HIGH) {
err = -EINVAL;
@@ -735,10 +703,9 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c
break;
}
- if (get_user(opt, (u32 __user *) optval)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
break;
- }
if (opt)
set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
@@ -762,7 +729,8 @@ static int rfcomm_sock_getsockopt_old(struct socket *sock, int optname, char __u
struct sock *l2cap_sk;
struct l2cap_conn *conn;
struct rfcomm_conninfo cinfo;
- int len, err = 0;
+ int err = 0;
+ size_t len;
u32 opt;
BT_DBG("sk %p", sk);
@@ -816,7 +784,7 @@ static int rfcomm_sock_getsockopt_old(struct socket *sock, int optname, char __u
cinfo.hci_handle = conn->hcon->handle;
memcpy(cinfo.dev_class, conn->hcon->dev_class, 3);
- len = min_t(unsigned int, len, sizeof(cinfo));
+ len = min(len, sizeof(cinfo));
if (copy_to_user(optval, (char *) &cinfo, len))
err = -EFAULT;
@@ -835,7 +803,8 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c
{
struct sock *sk = sock->sk;
struct bt_security sec;
- int len, err = 0;
+ int err = 0;
+ size_t len;
BT_DBG("sk %p", sk);
@@ -860,7 +829,7 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c
sec.level = rfcomm_pi(sk)->sec_level;
sec.key_size = 0;
- len = min_t(unsigned int, len, sizeof(sec));
+ len = min(len, sizeof(sec));
if (copy_to_user(optval, (char *) &sec, len))
err = -EFAULT;
@@ -898,9 +867,7 @@ static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned lon
if (err == -ENOIOCTLCMD) {
#ifdef CONFIG_BT_RFCOMM_TTY
- lock_sock(sk);
err = rfcomm_dev_ioctl(sk, cmd, (void __user *) arg);
- release_sock(sk);
#else
err = -EOPNOTSUPP;
#endif
@@ -909,6 +876,13 @@ static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned lon
return err;
}
+#ifdef CONFIG_COMPAT
+static int rfcomm_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ return rfcomm_sock_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
static int rfcomm_sock_shutdown(struct socket *sock, int how)
{
struct sock *sk = sock->sk;
@@ -922,7 +896,10 @@ static int rfcomm_sock_shutdown(struct socket *sock, int how)
lock_sock(sk);
if (!sk->sk_shutdown) {
sk->sk_shutdown = SHUTDOWN_MASK;
+
+ release_sock(sk);
__rfcomm_sock_close(sk);
+ lock_sock(sk);
if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
!(current->flags & PF_EXITING))
@@ -968,7 +945,7 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc *
if (!parent)
return 0;
- bh_lock_sock(parent);
+ lock_sock(parent);
/* Check for backlog size */
if (sk_acceptq_is_full(parent)) {
@@ -988,14 +965,14 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc *
rfcomm_pi(sk)->channel = channel;
sk->sk_state = BT_CONFIG;
- bt_accept_enqueue(parent, sk);
+ bt_accept_enqueue(parent, sk, true);
/* Accept connection and return socket DLC */
*d = rfcomm_pi(sk)->dlc;
result = 1;
done:
- bh_unlock_sock(parent);
+ release_sock(parent);
if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags))
parent->sk_state_change(parent);
@@ -1020,17 +997,7 @@ static int rfcomm_sock_debugfs_show(struct seq_file *f, void *p)
return 0;
}
-static int rfcomm_sock_debugfs_open(struct inode *inode, struct file *file)
-{
- return single_open(file, rfcomm_sock_debugfs_show, inode->i_private);
-}
-
-static const struct file_operations rfcomm_sock_debugfs_fops = {
- .open = rfcomm_sock_debugfs_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(rfcomm_sock_debugfs);
static struct dentry *rfcomm_sock_debugfs;
@@ -1049,9 +1016,13 @@ static const struct proto_ops rfcomm_sock_ops = {
.setsockopt = rfcomm_sock_setsockopt,
.getsockopt = rfcomm_sock_getsockopt,
.ioctl = rfcomm_sock_ioctl,
+ .gettstamp = sock_gettstamp,
.poll = bt_sock_poll,
.socketpair = sock_no_socketpair,
- .mmap = sock_no_mmap
+ .mmap = sock_no_mmap,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = rfcomm_sock_compat_ioctl,
+#endif
};
static const struct net_proto_family rfcomm_sock_family_ops = {
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 5e44d842cc5d..b783526ab588 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -35,7 +35,6 @@
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/rfcomm.h>
-#define RFCOMM_TTY_MAGIC 0x6d02 /* magic number for rfcomm struct */
#define RFCOMM_TTY_PORTS RFCOMM_MAX_DEV /* whole lotta rfcomm devices */
#define RFCOMM_TTY_MAJOR 216 /* device node major id of the usb/bluetooth.c driver */
#define RFCOMM_TTY_MINOR 0
@@ -120,7 +119,7 @@ static int rfcomm_dev_activate(struct tty_port *port, struct tty_struct *tty)
}
/* we block the open until the dlc->state becomes BT_CONNECTED */
-static int rfcomm_dev_carrier_raised(struct tty_port *port)
+static bool rfcomm_dev_carrier_raised(struct tty_port *port)
{
struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port);
@@ -198,20 +197,22 @@ static void rfcomm_reparent_device(struct rfcomm_dev *dev)
hci_dev_put(hdev);
}
-static ssize_t show_address(struct device *tty_dev, struct device_attribute *attr, char *buf)
+static ssize_t address_show(struct device *tty_dev,
+ struct device_attribute *attr, char *buf)
{
struct rfcomm_dev *dev = dev_get_drvdata(tty_dev);
- return sprintf(buf, "%pMR\n", &dev->dst);
+ return sysfs_emit(buf, "%pMR\n", &dev->dst);
}
-static ssize_t show_channel(struct device *tty_dev, struct device_attribute *attr, char *buf)
+static ssize_t channel_show(struct device *tty_dev,
+ struct device_attribute *attr, char *buf)
{
struct rfcomm_dev *dev = dev_get_drvdata(tty_dev);
- return sprintf(buf, "%d\n", dev->channel);
+ return sysfs_emit(buf, "%d\n", dev->channel);
}
-static DEVICE_ATTR(address, 0444, show_address, NULL);
-static DEVICE_ATTR(channel, 0444, show_channel, NULL);
+static DEVICE_ATTR_RO(address);
+static DEVICE_ATTR_RO(channel);
static struct rfcomm_dev *__rfcomm_dev_add(struct rfcomm_dev_req *req,
struct rfcomm_dlc *dlc)
@@ -413,10 +414,8 @@ static int __rfcomm_create_dev(struct sock *sk, void __user *arg)
dlc = rfcomm_dlc_exists(&req.src, &req.dst, req.channel);
if (IS_ERR(dlc))
return PTR_ERR(dlc);
- else if (dlc) {
- rfcomm_dlc_put(dlc);
+ if (dlc)
return -EBUSY;
- }
dlc = rfcomm_dlc_alloc(GFP_KERNEL);
if (!dlc)
return -ENOMEM;
@@ -439,7 +438,6 @@ static int __rfcomm_release_dev(void __user *arg)
{
struct rfcomm_dev_req req;
struct rfcomm_dev *dev;
- struct tty_struct *tty;
if (copy_from_user(&req, arg, sizeof(req)))
return -EFAULT;
@@ -465,11 +463,7 @@ static int __rfcomm_release_dev(void __user *arg)
rfcomm_dlc_close(dev->dlc, 0);
/* Shut down TTY synchronously before freeing rfcomm_dev */
- tty = tty_port_tty_get(&dev->port);
- if (tty) {
- tty_vhangup(tty);
- tty_kref_put(tty);
- }
+ tty_port_tty_vhangup(&dev->port);
if (!test_bit(RFCOMM_TTY_OWNED, &dev->status))
tty_port_put(&dev->port);
@@ -505,7 +499,7 @@ static int rfcomm_get_dev_list(void __user *arg)
struct rfcomm_dev *dev;
struct rfcomm_dev_list_req *dl;
struct rfcomm_dev_info *di;
- int n = 0, size, err;
+ int n = 0, err;
u16 dev_num;
BT_DBG("");
@@ -516,12 +510,11 @@ static int rfcomm_get_dev_list(void __user *arg)
if (!dev_num || dev_num > (PAGE_SIZE * 4) / sizeof(*di))
return -EINVAL;
- size = sizeof(*dl) + dev_num * sizeof(*di);
-
- dl = kzalloc(size, GFP_KERNEL);
+ dl = kzalloc(struct_size(dl, dev_info, dev_num), GFP_KERNEL);
if (!dl)
return -ENOMEM;
+ dl->dev_num = dev_num;
di = dl->dev_info;
mutex_lock(&rfcomm_dev_lock);
@@ -529,12 +522,12 @@ static int rfcomm_get_dev_list(void __user *arg)
list_for_each_entry(dev, &rfcomm_dev_list, list) {
if (!tty_port_get(&dev->port))
continue;
- (di + n)->id = dev->id;
- (di + n)->flags = dev->flags;
- (di + n)->state = dev->dlc->state;
- (di + n)->channel = dev->channel;
- bacpy(&(di + n)->src, &dev->src);
- bacpy(&(di + n)->dst, &dev->dst);
+ di[n].id = dev->id;
+ di[n].flags = dev->flags;
+ di[n].state = dev->dlc->state;
+ di[n].channel = dev->channel;
+ bacpy(&di[n].src, &dev->src);
+ bacpy(&di[n].dst, &dev->dst);
tty_port_put(&dev->port);
if (++n >= dev_num)
break;
@@ -543,9 +536,7 @@ static int rfcomm_get_dev_list(void __user *arg)
mutex_unlock(&rfcomm_dev_lock);
dl->dev_num = n;
- size = sizeof(*dl) + n * sizeof(*di);
-
- err = copy_to_user(arg, dl, size);
+ err = copy_to_user(arg, dl, struct_size(dl, dev_info, n));
kfree(dl);
return err ? -EFAULT : 0;
@@ -652,8 +643,8 @@ static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig)
tty_port_tty_hangup(&dev->port, true);
dev->modem_status =
- ((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) |
- ((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) |
+ ((v24_sig & RFCOMM_V24_RTC) ? TIOCM_DSR : 0) |
+ ((v24_sig & RFCOMM_V24_RTR) ? TIOCM_CTS : 0) |
((v24_sig & RFCOMM_V24_IC) ? TIOCM_RI : 0) |
((v24_sig & RFCOMM_V24_DV) ? TIOCM_CD : 0);
}
@@ -772,7 +763,7 @@ static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp)
static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
BT_DBG("tty %p dev %p dlc %p opened %d", tty, dev, dev->dlc,
dev->port.count);
@@ -780,17 +771,18 @@ static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp)
tty_port_close(&dev->port, tty, filp);
}
-static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, int count)
+static ssize_t rfcomm_tty_write(struct tty_struct *tty, const u8 *buf,
+ size_t count)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
struct rfcomm_dlc *dlc = dev->dlc;
struct sk_buff *skb;
- int sent = 0, size;
+ size_t sent = 0, size;
- BT_DBG("tty %p count %d", tty, count);
+ BT_DBG("tty %p count %zu", tty, count);
while (count) {
- size = min_t(uint, count, dlc->mtu);
+ size = min_t(size_t, count, dlc->mtu);
skb = rfcomm_wmalloc(dev, size + RFCOMM_SKB_RESERVE, GFP_ATOMIC);
if (!skb)
@@ -809,9 +801,9 @@ static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, in
return sent;
}
-static int rfcomm_tty_write_room(struct tty_struct *tty)
+static unsigned int rfcomm_tty_write_room(struct tty_struct *tty)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
int room = 0;
if (dev && dev->dlc)
@@ -839,18 +831,6 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned l
BT_DBG("TIOCMIWAIT");
break;
- case TIOCGSERIAL:
- BT_ERR("TIOCGSERIAL is not supported");
- return -ENOIOCTLCMD;
-
- case TIOCSSERIAL:
- BT_ERR("TIOCSSERIAL is not supported");
- return -ENOIOCTLCMD;
-
- case TIOCSERGSTRUCT:
- BT_ERR("TIOCSERGSTRUCT is not supported");
- return -ENOIOCTLCMD;
-
case TIOCSERGETLSR:
BT_ERR("TIOCSERGETLSR is not supported");
return -ENOIOCTLCMD;
@@ -867,7 +847,8 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned l
return -ENOIOCTLCMD;
}
-static void rfcomm_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
+static void rfcomm_tty_set_termios(struct tty_struct *tty,
+ const struct ktermios *old)
{
struct ktermios *new = &tty->termios;
int old_baud_rate = tty_termios_baud_rate(old);
@@ -876,7 +857,7 @@ static void rfcomm_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
u8 baud, data_bits, stop_bits, parity, x_on, x_off;
u16 changes = 0;
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
BT_DBG("tty %p termios %p", tty, old);
@@ -994,7 +975,7 @@ static void rfcomm_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
baud = RFCOMM_RPN_BR_230400;
break;
default:
- /* 9600 is standard accordinag to the RFCOMM specification */
+ /* 9600 is standard according to the RFCOMM specification */
baud = RFCOMM_RPN_BR_9600;
break;
@@ -1008,7 +989,7 @@ static void rfcomm_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
static void rfcomm_tty_throttle(struct tty_struct *tty)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
BT_DBG("tty %p dev %p", tty, dev);
@@ -1017,16 +998,16 @@ static void rfcomm_tty_throttle(struct tty_struct *tty)
static void rfcomm_tty_unthrottle(struct tty_struct *tty)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
BT_DBG("tty %p dev %p", tty, dev);
rfcomm_dlc_unthrottle(dev->dlc);
}
-static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty)
+static unsigned int rfcomm_tty_chars_in_buffer(struct tty_struct *tty)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
BT_DBG("tty %p dev %p", tty, dev);
@@ -1041,7 +1022,7 @@ static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty)
static void rfcomm_tty_flush_buffer(struct tty_struct *tty)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
BT_DBG("tty %p dev %p", tty, dev);
@@ -1052,7 +1033,7 @@ static void rfcomm_tty_flush_buffer(struct tty_struct *tty)
tty_wakeup(tty);
}
-static void rfcomm_tty_send_xchar(struct tty_struct *tty, char ch)
+static void rfcomm_tty_send_xchar(struct tty_struct *tty, u8 ch)
{
BT_DBG("tty %p ch %c", tty, ch);
}
@@ -1064,7 +1045,7 @@ static void rfcomm_tty_wait_until_sent(struct tty_struct *tty, int timeout)
static void rfcomm_tty_hangup(struct tty_struct *tty)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
BT_DBG("tty %p dev %p", tty, dev);
@@ -1073,16 +1054,20 @@ static void rfcomm_tty_hangup(struct tty_struct *tty)
static int rfcomm_tty_tiocmget(struct tty_struct *tty)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
+ struct rfcomm_dlc *dlc = dev->dlc;
+ u8 v24_sig;
BT_DBG("tty %p dev %p", tty, dev);
- return dev->modem_status;
+ rfcomm_dlc_get_modem_status(dlc, &v24_sig);
+
+ return (v24_sig & (TIOCM_DTR | TIOCM_RTS)) | dev->modem_status;
}
static int rfcomm_tty_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear)
{
- struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data;
+ struct rfcomm_dev *dev = tty->driver_data;
struct rfcomm_dlc *dlc = dev->dlc;
u8 v24_sig;
@@ -1090,23 +1075,15 @@ static int rfcomm_tty_tiocmset(struct tty_struct *tty, unsigned int set, unsigne
rfcomm_dlc_get_modem_status(dlc, &v24_sig);
- if (set & TIOCM_DSR || set & TIOCM_DTR)
+ if (set & TIOCM_DTR)
v24_sig |= RFCOMM_V24_RTC;
- if (set & TIOCM_RTS || set & TIOCM_CTS)
+ if (set & TIOCM_RTS)
v24_sig |= RFCOMM_V24_RTR;
- if (set & TIOCM_RI)
- v24_sig |= RFCOMM_V24_IC;
- if (set & TIOCM_CD)
- v24_sig |= RFCOMM_V24_DV;
- if (clear & TIOCM_DSR || clear & TIOCM_DTR)
+ if (clear & TIOCM_DTR)
v24_sig &= ~RFCOMM_V24_RTC;
- if (clear & TIOCM_RTS || clear & TIOCM_CTS)
+ if (clear & TIOCM_RTS)
v24_sig &= ~RFCOMM_V24_RTR;
- if (clear & TIOCM_RI)
- v24_sig &= ~RFCOMM_V24_IC;
- if (clear & TIOCM_CD)
- v24_sig &= ~RFCOMM_V24_DV;
rfcomm_dlc_set_modem_status(dlc, v24_sig);
@@ -1139,9 +1116,10 @@ int __init rfcomm_init_ttys(void)
{
int error;
- rfcomm_tty_driver = alloc_tty_driver(RFCOMM_TTY_PORTS);
- if (!rfcomm_tty_driver)
- return -ENOMEM;
+ rfcomm_tty_driver = tty_alloc_driver(RFCOMM_TTY_PORTS,
+ TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV);
+ if (IS_ERR(rfcomm_tty_driver))
+ return PTR_ERR(rfcomm_tty_driver);
rfcomm_tty_driver->driver_name = "rfcomm";
rfcomm_tty_driver->name = "rfcomm";
@@ -1149,7 +1127,6 @@ int __init rfcomm_init_ttys(void)
rfcomm_tty_driver->minor_start = RFCOMM_TTY_MINOR;
rfcomm_tty_driver->type = TTY_DRIVER_TYPE_SERIAL;
rfcomm_tty_driver->subtype = SERIAL_TYPE_NORMAL;
- rfcomm_tty_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV;
rfcomm_tty_driver->init_termios = tty_std_termios;
rfcomm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL;
rfcomm_tty_driver->init_termios.c_lflag &= ~ICANON;
@@ -1158,7 +1135,7 @@ int __init rfcomm_init_ttys(void)
error = tty_register_driver(rfcomm_tty_driver);
if (error) {
BT_ERR("Can't register RFCOMM TTY driver");
- put_tty_driver(rfcomm_tty_driver);
+ tty_driver_kref_put(rfcomm_tty_driver);
return error;
}
@@ -1170,5 +1147,5 @@ int __init rfcomm_init_ttys(void)
void rfcomm_cleanup_ttys(void)
{
tty_unregister_driver(rfcomm_tty_driver);
- put_tty_driver(rfcomm_tty_driver);
+ tty_driver_kref_put(rfcomm_tty_driver);
}
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 8f0f9279eac9..87ba90336e80 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -48,11 +48,14 @@ struct sco_conn {
spinlock_t lock;
struct sock *sk;
+ struct delayed_work timeout_work;
+
unsigned int mtu;
+ struct kref ref;
};
-#define sco_conn_lock(c) spin_lock(&c->lock);
-#define sco_conn_unlock(c) spin_unlock(&c->lock);
+#define sco_conn_lock(c) spin_lock(&c->lock)
+#define sco_conn_unlock(c) spin_unlock(&c->lock)
static void sco_sock_close(struct sock *sk);
static void sco_sock_kill(struct sock *sk);
@@ -66,6 +69,7 @@ struct sco_pinfo {
bdaddr_t dst;
__u32 flags;
__u16 setting;
+ struct bt_codec codec;
struct sco_conn *conn;
};
@@ -73,53 +77,147 @@ struct sco_pinfo {
#define SCO_CONN_TIMEOUT (HZ * 40)
#define SCO_DISCONN_TIMEOUT (HZ * 2)
-static void sco_sock_timeout(struct timer_list *t)
+static void sco_conn_free(struct kref *ref)
+{
+ struct sco_conn *conn = container_of(ref, struct sco_conn, ref);
+
+ BT_DBG("conn %p", conn);
+
+ if (conn->sk)
+ sco_pi(conn->sk)->conn = NULL;
+
+ if (conn->hcon) {
+ conn->hcon->sco_data = NULL;
+ hci_conn_drop(conn->hcon);
+ }
+
+ /* Ensure no more work items will run since hci_conn has been dropped */
+ disable_delayed_work_sync(&conn->timeout_work);
+
+ kfree(conn);
+}
+
+static void sco_conn_put(struct sco_conn *conn)
+{
+ if (!conn)
+ return;
+
+ BT_DBG("conn %p refcnt %d", conn, kref_read(&conn->ref));
+
+ kref_put(&conn->ref, sco_conn_free);
+}
+
+static struct sco_conn *sco_conn_hold(struct sco_conn *conn)
+{
+ BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref));
+
+ kref_get(&conn->ref);
+ return conn;
+}
+
+static struct sco_conn *sco_conn_hold_unless_zero(struct sco_conn *conn)
+{
+ if (!conn)
+ return NULL;
+
+ BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref));
+
+ if (!kref_get_unless_zero(&conn->ref))
+ return NULL;
+
+ return conn;
+}
+
+static struct sock *sco_sock_hold(struct sco_conn *conn)
{
- struct sock *sk = from_timer(sk, t, sk_timer);
+ if (!conn || !bt_sock_linked(&sco_sk_list, conn->sk))
+ return NULL;
+
+ sock_hold(conn->sk);
+
+ return conn->sk;
+}
+
+static void sco_sock_timeout(struct work_struct *work)
+{
+ struct sco_conn *conn = container_of(work, struct sco_conn,
+ timeout_work.work);
+ struct sock *sk;
+
+ conn = sco_conn_hold_unless_zero(conn);
+ if (!conn)
+ return;
+
+ sco_conn_lock(conn);
+ if (!conn->hcon) {
+ sco_conn_unlock(conn);
+ sco_conn_put(conn);
+ return;
+ }
+ sk = sco_sock_hold(conn);
+ sco_conn_unlock(conn);
+ sco_conn_put(conn);
+
+ if (!sk)
+ return;
BT_DBG("sock %p state %d", sk, sk->sk_state);
- bh_lock_sock(sk);
+ lock_sock(sk);
sk->sk_err = ETIMEDOUT;
sk->sk_state_change(sk);
- bh_unlock_sock(sk);
-
- sco_sock_kill(sk);
+ release_sock(sk);
sock_put(sk);
}
static void sco_sock_set_timer(struct sock *sk, long timeout)
{
+ if (!sco_pi(sk)->conn)
+ return;
+
BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout);
- sk_reset_timer(sk, &sk->sk_timer, jiffies + timeout);
+ cancel_delayed_work(&sco_pi(sk)->conn->timeout_work);
+ schedule_delayed_work(&sco_pi(sk)->conn->timeout_work, timeout);
}
static void sco_sock_clear_timer(struct sock *sk)
{
+ if (!sco_pi(sk)->conn)
+ return;
+
BT_DBG("sock %p state %d", sk, sk->sk_state);
- sk_stop_timer(sk, &sk->sk_timer);
+ cancel_delayed_work(&sco_pi(sk)->conn->timeout_work);
}
/* ---- SCO connections ---- */
static struct sco_conn *sco_conn_add(struct hci_conn *hcon)
{
- struct hci_dev *hdev = hcon->hdev;
struct sco_conn *conn = hcon->sco_data;
- if (conn)
+ conn = sco_conn_hold_unless_zero(conn);
+ if (conn) {
+ if (!conn->hcon) {
+ sco_conn_lock(conn);
+ conn->hcon = hcon;
+ sco_conn_unlock(conn);
+ }
return conn;
+ }
conn = kzalloc(sizeof(struct sco_conn), GFP_KERNEL);
if (!conn)
return NULL;
+ kref_init(&conn->ref);
spin_lock_init(&conn->lock);
+ INIT_DELAYED_WORK(&conn->timeout_work, sco_sock_timeout);
hcon->sco_data = conn;
conn->hcon = hcon;
+ conn->mtu = hcon->mtu;
- if (hdev->sco_mtu > 0)
- conn->mtu = hdev->sco_mtu;
+ if (hcon->mtu > 0)
+ conn->mtu = hcon->mtu;
else
conn->mtu = 60;
@@ -135,17 +233,15 @@ static void sco_chan_del(struct sock *sk, int err)
struct sco_conn *conn;
conn = sco_pi(sk)->conn;
+ sco_pi(sk)->conn = NULL;
BT_DBG("sk %p, conn %p, err %d", sk, conn, err);
if (conn) {
sco_conn_lock(conn);
conn->sk = NULL;
- sco_pi(sk)->conn = NULL;
sco_conn_unlock(conn);
-
- if (conn->hcon)
- hci_conn_drop(conn->hcon);
+ sco_conn_put(conn);
}
sk->sk_state = BT_CLOSED;
@@ -160,28 +256,28 @@ static void sco_conn_del(struct hci_conn *hcon, int err)
struct sco_conn *conn = hcon->sco_data;
struct sock *sk;
+ conn = sco_conn_hold_unless_zero(conn);
if (!conn)
return;
BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
- /* Kill socket */
sco_conn_lock(conn);
- sk = conn->sk;
+ sk = sco_sock_hold(conn);
sco_conn_unlock(conn);
+ sco_conn_put(conn);
- if (sk) {
- sock_hold(sk);
- bh_lock_sock(sk);
- sco_sock_clear_timer(sk);
- sco_chan_del(sk, err);
- bh_unlock_sock(sk);
- sco_sock_kill(sk);
- sock_put(sk);
+ if (!sk) {
+ sco_conn_put(conn);
+ return;
}
- hcon->sco_data = NULL;
- kfree(conn);
+ /* Kill socket */
+ lock_sock(sk);
+ sco_sock_clear_timer(sk);
+ sco_chan_del(sk, err);
+ release_sock(sk);
+ sock_put(sk);
}
static void __sco_chan_add(struct sco_conn *conn, struct sock *sk,
@@ -193,7 +289,7 @@ static void __sco_chan_add(struct sco_conn *conn, struct sock *sk,
conn->sk = sk;
if (parent)
- bt_accept_enqueue(parent, sk);
+ bt_accept_enqueue(parent, sk, true);
}
static int sco_chan_add(struct sco_conn *conn, struct sock *sk,
@@ -231,52 +327,62 @@ static int sco_connect(struct sock *sk)
else
type = SCO_LINK;
- if (sco_pi(sk)->setting == BT_VOICE_TRANSPARENT &&
- (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev))) {
- err = -EOPNOTSUPP;
- goto done;
+ switch (sco_pi(sk)->setting & SCO_AIRMODE_MASK) {
+ case SCO_AIRMODE_TRANSP:
+ if (!lmp_transp_capable(hdev) || !lmp_esco_capable(hdev)) {
+ err = -EOPNOTSUPP;
+ goto unlock;
+ }
+ break;
}
hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst,
- sco_pi(sk)->setting);
+ sco_pi(sk)->setting, &sco_pi(sk)->codec,
+ READ_ONCE(sk->sk_sndtimeo));
if (IS_ERR(hcon)) {
err = PTR_ERR(hcon);
- goto done;
+ goto unlock;
}
conn = sco_conn_add(hcon);
if (!conn) {
hci_conn_drop(hcon);
err = -ENOMEM;
- goto done;
+ goto unlock;
}
- /* Update source addr of the socket */
- bacpy(&sco_pi(sk)->src, &hcon->src);
+ lock_sock(sk);
err = sco_chan_add(conn, sk, NULL);
- if (err)
- goto done;
+ if (err) {
+ release_sock(sk);
+ goto unlock;
+ }
+
+ /* Update source addr of the socket */
+ bacpy(&sco_pi(sk)->src, &hcon->src);
if (hcon->state == BT_CONNECTED) {
sco_sock_clear_timer(sk);
sk->sk_state = BT_CONNECTED;
} else {
sk->sk_state = BT_CONNECT;
- sco_sock_set_timer(sk, sk->sk_sndtimeo);
+ sco_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo));
}
-done:
+ release_sock(sk);
+
+unlock:
hci_dev_unlock(hdev);
hci_dev_put(hdev);
return err;
}
-static int sco_send_frame(struct sock *sk, struct msghdr *msg, int len)
+static int sco_send_frame(struct sock *sk, struct sk_buff *skb,
+ const struct sockcm_cookie *sockc)
{
struct sco_conn *conn = sco_pi(sk)->conn;
- struct sk_buff *skb;
- int err;
+ int len = skb->len;
/* Check outgoing MTU */
if (len > conn->mtu)
@@ -284,15 +390,7 @@ static int sco_send_frame(struct sock *sk, struct msghdr *msg, int len)
BT_DBG("sk %p len %d", sk, len);
- skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err);
- if (!skb)
- return err;
-
- if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
- kfree_skb(skb);
- return -EFAULT;
- }
-
+ hci_setup_tx_timestamp(skb, 1, sockc);
hci_send_sco(conn->hcon, skb);
return len;
@@ -309,7 +407,7 @@ static void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb)
if (!sk)
goto drop;
- BT_DBG("sk %p len %d", sk, skb->len);
+ BT_DBG("sk %p len %u", sk, skb->len);
if (sk->sk_state != BT_CONNECTED)
goto drop;
@@ -368,6 +466,8 @@ static void sco_sock_destruct(struct sock *sk)
{
BT_DBG("sk %p", sk);
+ sco_conn_put(sco_pi(sk)->conn);
+
skb_queue_purge(&sk->sk_receive_queue);
skb_queue_purge(&sk->sk_write_queue);
}
@@ -393,12 +493,18 @@ static void sco_sock_cleanup_listen(struct sock *parent)
*/
static void sco_sock_kill(struct sock *sk)
{
- if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket ||
- sock_flag(sk, SOCK_DEAD))
+ if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket)
return;
BT_DBG("sk %p state %d", sk, sk->sk_state);
+ /* Sock is dead, so set conn->sk to NULL to avoid possible UAF */
+ if (sco_pi(sk)->conn) {
+ sco_conn_lock(sco_pi(sk)->conn);
+ sco_pi(sk)->conn->sk = NULL;
+ sco_conn_unlock(sco_pi(sk)->conn);
+ }
+
/* Kill poor orphan */
bt_sock_unlink(&sco_sk_list, sk);
sock_set_flag(sk, SOCK_DEAD);
@@ -416,17 +522,6 @@ static void __sco_sock_close(struct sock *sk)
case BT_CONNECTED:
case BT_CONFIG:
- if (sco_pi(sk)->conn->hcon) {
- sk->sk_state = BT_DISCONN;
- sco_sock_set_timer(sk, SCO_DISCONN_TIMEOUT);
- sco_conn_lock(sco_pi(sk)->conn);
- hci_conn_drop(sco_pi(sk)->conn->hcon);
- sco_pi(sk)->conn->hcon = NULL;
- sco_conn_unlock(sco_pi(sk)->conn);
- } else
- sco_chan_del(sk, ECONNRESET);
- break;
-
case BT_CONNECT2:
case BT_CONNECT:
case BT_DISCONN:
@@ -437,16 +532,16 @@ static void __sco_sock_close(struct sock *sk)
sock_set_flag(sk, SOCK_ZAPPED);
break;
}
+
}
/* Must be called on unlocked socket. */
static void sco_sock_close(struct sock *sk)
{
- sco_sock_clear_timer(sk);
lock_sock(sk);
+ sco_sock_clear_timer(sk);
__sco_sock_close(sk);
release_sock(sk);
- sco_sock_kill(sk);
}
static void sco_sock_init(struct sock *sk, struct sock *parent)
@@ -471,24 +566,18 @@ static struct sock *sco_sock_alloc(struct net *net, struct socket *sock,
{
struct sock *sk;
- sk = sk_alloc(net, PF_BLUETOOTH, prio, &sco_proto, kern);
+ sk = bt_sock_alloc(net, sock, &sco_proto, proto, prio, kern);
if (!sk)
return NULL;
- sock_init_data(sock, sk);
- INIT_LIST_HEAD(&bt_sk(sk)->accept_q);
-
sk->sk_destruct = sco_sock_destruct;
sk->sk_sndtimeo = SCO_CONN_TIMEOUT;
- sock_reset_flag(sk, SOCK_ZAPPED);
-
- sk->sk_protocol = proto;
- sk->sk_state = BT_OPEN;
-
sco_pi(sk)->setting = BT_VOICE_CVSD_16BIT;
-
- timer_setup(&sk->sk_timer, sco_sock_timeout, 0);
+ sco_pi(sk)->codec.id = BT_CODEC_CVSD;
+ sco_pi(sk)->codec.cid = 0xffff;
+ sco_pi(sk)->codec.vid = 0xffff;
+ sco_pi(sk)->codec.data_path = 0x00;
bt_sock_link(&sco_sk_list, sk);
return sk;
@@ -516,19 +605,19 @@ static int sco_sock_create(struct net *net, struct socket *sock, int protocol,
return 0;
}
-static int sco_sock_bind(struct socket *sock, struct sockaddr *addr,
+static int sco_sock_bind(struct socket *sock, struct sockaddr_unsized *addr,
int addr_len)
{
struct sockaddr_sco *sa = (struct sockaddr_sco *) addr;
struct sock *sk = sock->sk;
int err = 0;
- BT_DBG("sk %p %pMR", sk, &sa->sco_bdaddr);
-
if (!addr || addr_len < sizeof(struct sockaddr_sco) ||
addr->sa_family != AF_BLUETOOTH)
return -EINVAL;
+ BT_DBG("sk %p %pMR", sk, &sa->sco_bdaddr);
+
lock_sock(sk);
if (sk->sk_state != BT_OPEN) {
@@ -550,7 +639,7 @@ done:
return err;
}
-static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags)
+static int sco_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags)
{
struct sockaddr_sco *sa = (struct sockaddr_sco *) addr;
struct sock *sk = sock->sk;
@@ -566,21 +655,22 @@ static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen
return -EBADFD;
if (sk->sk_type != SOCK_SEQPACKET)
return -EINVAL;
lock_sock(sk);
-
/* Set destination address and psm */
bacpy(&sco_pi(sk)->dst, &sa->sco_bdaddr);
+ release_sock(sk);
err = sco_connect(sk);
if (err)
- goto done;
+ return err;
+
+ lock_sock(sk);
err = bt_sock_wait_state(sk, BT_CONNECTED,
sock_sndtimeo(sk, flags & O_NONBLOCK));
-done:
release_sock(sk);
return err;
}
@@ -626,7 +716,7 @@ done:
}
static int sco_sock_accept(struct socket *sock, struct socket *newsock,
- int flags, bool kern)
+ struct proto_accept_arg *arg)
{
DEFINE_WAIT_FUNC(wait, woken_wake_function);
struct sock *sk = sock->sk, *ch;
@@ -635,7 +725,7 @@ static int sco_sock_accept(struct socket *sock, struct socket *newsock,
lock_sock(sk);
- timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
BT_DBG("sk %p timeo %ld", sk, timeo);
@@ -702,6 +792,8 @@ static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg,
size_t len)
{
struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ struct sockcm_cookie sockc;
int err;
BT_DBG("sock %p, sk %p", sock, sk);
@@ -713,14 +805,29 @@ static int sco_sock_sendmsg(struct socket *sock, struct msghdr *msg,
if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
+ hci_sockcm_init(&sockc, sk);
+
+ if (msg->msg_controllen) {
+ err = sock_cmsg_send(sk, msg, &sockc);
+ if (err)
+ return err;
+ }
+
+ skb = bt_skb_sendmsg(sk, msg, len, len, 0, 0);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
lock_sock(sk);
if (sk->sk_state == BT_CONNECTED)
- err = sco_send_frame(sk, msg, len);
+ err = sco_send_frame(sk, skb, &sockc);
else
err = -ENOTCONN;
release_sock(sk);
+
+ if (err < 0)
+ kfree_skb(skb);
return err;
}
@@ -761,6 +868,11 @@ static void sco_conn_defer_accept(struct hci_conn *conn, u16 setting)
cp.max_latency = cpu_to_le16(0xffff);
cp.retrans_effort = 0xff;
break;
+ default:
+ /* use CVSD settings as fallback */
+ cp.max_latency = cpu_to_le16(0xffff);
+ cp.retrans_effort = 0xff;
+ break;
}
hci_send_cmd(hdev, HCI_OP_ACCEPT_SYNC_CONN_REQ,
@@ -774,6 +886,10 @@ static int sco_sock_recvmsg(struct socket *sock, struct msghdr *msg,
struct sock *sk = sock->sk;
struct sco_pinfo *pi = sco_pi(sk);
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return sock_recv_errqueue(sk, msg, len, SOL_BLUETOOTH,
+ BT_SCM_ERROR);
+
lock_sock(sk);
if (sk->sk_state == BT_CONNECT2 &&
@@ -791,12 +907,15 @@ static int sco_sock_recvmsg(struct socket *sock, struct msghdr *msg,
}
static int sco_sock_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
- int len, err = 0;
+ int err = 0;
struct bt_voice voice;
u32 opt;
+ struct bt_codecs *codecs;
+ struct hci_dev *hdev;
+ __u8 buffer[255];
BT_DBG("sk %p", sk);
@@ -810,10 +929,9 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname,
break;
}
- if (get_user(opt, (u32 __user *) optval)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
break;
- }
if (opt)
set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
@@ -830,20 +948,90 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname,
voice.setting = sco_pi(sk)->setting;
- len = min_t(unsigned int, sizeof(voice), optlen);
- if (copy_from_user((char *)&voice, optval, len)) {
- err = -EFAULT;
+ err = copy_safe_from_sockptr(&voice, sizeof(voice), optval,
+ optlen);
+ if (err)
+ break;
+
+ sco_pi(sk)->setting = voice.setting;
+ hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src,
+ BDADDR_BREDR);
+ if (!hdev) {
+ err = -EBADFD;
+ break;
+ }
+
+ switch (sco_pi(sk)->setting & SCO_AIRMODE_MASK) {
+ case SCO_AIRMODE_TRANSP:
+ if (enhanced_sync_conn_capable(hdev))
+ sco_pi(sk)->codec.id = BT_CODEC_TRANSPARENT;
+ break;
+ }
+
+ hci_dev_put(hdev);
+ break;
+
+ case BT_PKT_STATUS:
+ err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
+ if (err)
+ break;
+
+ if (opt)
+ set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags);
+ else
+ clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags);
+ break;
+
+ case BT_CODEC:
+ if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND &&
+ sk->sk_state != BT_CONNECT2) {
+ err = -EINVAL;
+ break;
+ }
+
+ hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src,
+ BDADDR_BREDR);
+ if (!hdev) {
+ err = -EBADFD;
break;
}
- /* Explicitly check for these values */
- if (voice.setting != BT_VOICE_TRANSPARENT &&
- voice.setting != BT_VOICE_CVSD_16BIT) {
+ if (!hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)) {
+ hci_dev_put(hdev);
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ if (!hdev->get_data_path_id) {
+ hci_dev_put(hdev);
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ if (optlen < sizeof(struct bt_codecs) ||
+ optlen > sizeof(buffer)) {
+ hci_dev_put(hdev);
err = -EINVAL;
break;
}
- sco_pi(sk)->setting = voice.setting;
+ err = copy_struct_from_sockptr(buffer, sizeof(buffer), optval,
+ optlen);
+ if (err) {
+ hci_dev_put(hdev);
+ break;
+ }
+
+ codecs = (void *)buffer;
+
+ if (codecs->num_codecs > 1) {
+ hci_dev_put(hdev);
+ err = -EINVAL;
+ break;
+ }
+
+ sco_pi(sk)->codec = codecs->codecs[0];
+ hci_dev_put(hdev);
break;
default:
@@ -861,7 +1049,8 @@ static int sco_sock_getsockopt_old(struct socket *sock, int optname,
struct sock *sk = sock->sk;
struct sco_options opts;
struct sco_conninfo cinfo;
- int len, err = 0;
+ int err = 0;
+ size_t len;
BT_DBG("sk %p", sk);
@@ -881,9 +1070,9 @@ static int sco_sock_getsockopt_old(struct socket *sock, int optname,
opts.mtu = sco_pi(sk)->conn->mtu;
- BT_DBG("mtu %d", opts.mtu);
+ BT_DBG("mtu %u", opts.mtu);
- len = min_t(unsigned int, len, sizeof(opts));
+ len = min(len, sizeof(opts));
if (copy_to_user(optval, (char *)&opts, len))
err = -EFAULT;
@@ -901,7 +1090,7 @@ static int sco_sock_getsockopt_old(struct socket *sock, int optname,
cinfo.hci_handle = sco_pi(sk)->conn->hcon->handle;
memcpy(cinfo.dev_class, sco_pi(sk)->conn->hcon->dev_class, 3);
- len = min_t(unsigned int, len, sizeof(cinfo));
+ len = min(len, sizeof(cinfo));
if (copy_to_user(optval, (char *)&cinfo, len))
err = -EFAULT;
@@ -922,6 +1111,13 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname,
struct sock *sk = sock->sk;
int len, err = 0;
struct bt_voice voice;
+ u32 phys;
+ int buf_len;
+ struct codec_list *c;
+ u8 num_codecs, i, __user *ptr;
+ struct hci_dev *hdev;
+ struct hci_codec_caps *caps;
+ struct bt_codec codec;
BT_DBG("sk %p", sk);
@@ -956,6 +1152,132 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname,
break;
+ case BT_PHY:
+ if (sk->sk_state != BT_CONNECTED) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ phys = hci_conn_get_phy(sco_pi(sk)->conn->hcon);
+
+ if (put_user(phys, (u32 __user *) optval))
+ err = -EFAULT;
+ break;
+
+ case BT_PKT_STATUS:
+ if (put_user(test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags),
+ (int __user *)optval))
+ err = -EFAULT;
+ break;
+
+ case BT_SNDMTU:
+ case BT_RCVMTU:
+ if (sk->sk_state != BT_CONNECTED) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ if (put_user(sco_pi(sk)->conn->mtu, (u32 __user *)optval))
+ err = -EFAULT;
+ break;
+
+ case BT_CODEC:
+ num_codecs = 0;
+ buf_len = 0;
+
+ hdev = hci_get_route(&sco_pi(sk)->dst, &sco_pi(sk)->src, BDADDR_BREDR);
+ if (!hdev) {
+ err = -EBADFD;
+ break;
+ }
+
+ if (!hci_dev_test_flag(hdev, HCI_OFFLOAD_CODECS_ENABLED)) {
+ hci_dev_put(hdev);
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ if (!hdev->get_data_path_id) {
+ hci_dev_put(hdev);
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ release_sock(sk);
+
+ /* find total buffer size required to copy codec + caps */
+ hci_dev_lock(hdev);
+ list_for_each_entry(c, &hdev->local_codecs, list) {
+ if (c->transport != HCI_TRANSPORT_SCO_ESCO)
+ continue;
+ num_codecs++;
+ for (i = 0, caps = c->caps; i < c->num_caps; i++) {
+ buf_len += 1 + caps->len;
+ caps = (void *)&caps->data[caps->len];
+ }
+ buf_len += sizeof(struct bt_codec);
+ }
+ hci_dev_unlock(hdev);
+
+ buf_len += sizeof(struct bt_codecs);
+ if (buf_len > len) {
+ hci_dev_put(hdev);
+ return -ENOBUFS;
+ }
+ ptr = optval;
+
+ if (put_user(num_codecs, ptr)) {
+ hci_dev_put(hdev);
+ return -EFAULT;
+ }
+ ptr += sizeof(num_codecs);
+
+ /* Iterate all the codecs supported over SCO and populate
+ * codec data
+ */
+ hci_dev_lock(hdev);
+ list_for_each_entry(c, &hdev->local_codecs, list) {
+ if (c->transport != HCI_TRANSPORT_SCO_ESCO)
+ continue;
+
+ codec.id = c->id;
+ codec.cid = c->cid;
+ codec.vid = c->vid;
+ err = hdev->get_data_path_id(hdev, &codec.data_path);
+ if (err < 0)
+ break;
+ codec.num_caps = c->num_caps;
+ if (copy_to_user(ptr, &codec, sizeof(codec))) {
+ err = -EFAULT;
+ break;
+ }
+ ptr += sizeof(codec);
+
+ /* find codec capabilities data length */
+ len = 0;
+ for (i = 0, caps = c->caps; i < c->num_caps; i++) {
+ len += 1 + caps->len;
+ caps = (void *)&caps->data[caps->len];
+ }
+
+ /* copy codec capabilities data */
+ if (len && copy_to_user(ptr, c->caps, len)) {
+ err = -EFAULT;
+ break;
+ }
+ ptr += len;
+ }
+
+ hci_dev_unlock(hdev);
+ hci_dev_put(hdev);
+
+ lock_sock(sk);
+
+ if (!err && put_user(buf_len, optlen))
+ err = -EFAULT;
+
+ break;
+
default:
err = -ENOPROTOOPT;
break;
@@ -1007,7 +1329,7 @@ static int sco_sock_release(struct socket *sock)
sco_sock_close(sk);
- if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime &&
+ if (sock_flag(sk, SOCK_LINGER) && READ_ONCE(sk->sk_lingertime) &&
!(current->flags & PF_EXITING)) {
lock_sock(sk);
err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime);
@@ -1027,11 +1349,11 @@ static void sco_conn_ready(struct sco_conn *conn)
BT_DBG("conn %p", conn);
if (sk) {
+ lock_sock(sk);
sco_sock_clear_timer(sk);
- bh_lock_sock(sk);
sk->sk_state = BT_CONNECTED;
sk->sk_state_change(sk);
- bh_unlock_sock(sk);
+ release_sock(sk);
} else {
sco_conn_lock(conn);
@@ -1046,12 +1368,12 @@ static void sco_conn_ready(struct sco_conn *conn)
return;
}
- bh_lock_sock(parent);
+ lock_sock(parent);
sk = sco_sock_alloc(sock_net(parent), NULL,
BTPROTO_SCO, GFP_ATOMIC, 0);
if (!sk) {
- bh_unlock_sock(parent);
+ release_sock(parent);
sco_conn_unlock(conn);
return;
}
@@ -1061,6 +1383,7 @@ static void sco_conn_ready(struct sco_conn *conn)
bacpy(&sco_pi(sk)->src, &conn->hcon->src);
bacpy(&sco_pi(sk)->dst, &conn->hcon->dst);
+ sco_conn_hold(conn);
hci_conn_hold(conn->hcon);
__sco_chan_add(conn, sk, parent);
@@ -1072,7 +1395,7 @@ static void sco_conn_ready(struct sco_conn *conn)
/* Wake up parent */
parent->sk_data_ready(parent);
- bh_unlock_sock(parent);
+ release_sock(parent);
sco_conn_unlock(conn);
}
@@ -1111,14 +1434,16 @@ static void sco_connect_cfm(struct hci_conn *hcon, __u8 status)
if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK)
return;
- BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status);
+ BT_DBG("hcon %p bdaddr %pMR status %u", hcon, &hcon->dst, status);
if (!status) {
struct sco_conn *conn;
conn = sco_conn_add(hcon);
- if (conn)
+ if (conn) {
sco_conn_ready(conn);
+ sco_conn_put(conn);
+ }
} else
sco_conn_del(hcon, bt_to_errno(status));
}
@@ -1133,22 +1458,39 @@ static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason)
sco_conn_del(hcon, bt_to_errno(reason));
}
-void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb)
+int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb)
{
- struct sco_conn *conn = hcon->sco_data;
+ struct hci_conn *hcon;
+ struct sco_conn *conn;
- if (!conn)
- goto drop;
+ hci_dev_lock(hdev);
+
+ hcon = hci_conn_hash_lookup_handle(hdev, handle);
+ if (!hcon) {
+ hci_dev_unlock(hdev);
+ kfree_skb(skb);
+ return -ENOENT;
+ }
- BT_DBG("conn %p len %d", conn, skb->len);
+ conn = sco_conn_hold_unless_zero(hcon->sco_data);
+ hcon = NULL;
- if (skb->len) {
- sco_recv_frame(conn, skb);
- return;
+ hci_dev_unlock(hdev);
+
+ if (!conn) {
+ kfree_skb(skb);
+ return -EINVAL;
}
-drop:
- kfree_skb(skb);
+ BT_DBG("conn %p len %u", conn, skb->len);
+
+ if (skb->len)
+ sco_recv_frame(conn, skb);
+ else
+ kfree_skb(skb);
+
+ sco_conn_put(conn);
+ return 0;
}
static struct hci_cb sco_cb = {
@@ -1173,17 +1515,7 @@ static int sco_debugfs_show(struct seq_file *f, void *p)
return 0;
}
-static int sco_debugfs_open(struct inode *inode, struct file *file)
-{
- return single_open(file, sco_debugfs_show, inode->i_private);
-}
-
-static const struct file_operations sco_debugfs_fops = {
- .open = sco_debugfs_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(sco_debugfs);
static struct dentry *sco_debugfs;
@@ -1200,6 +1532,7 @@ static const struct proto_ops sco_sock_ops = {
.recvmsg = sco_sock_recvmsg,
.poll = bt_sock_poll,
.ioctl = bt_sock_ioctl,
+ .gettstamp = sock_gettstamp,
.mmap = sock_no_mmap,
.socketpair = sock_no_socketpair,
.shutdown = sco_sock_shutdown,
diff --git a/net/bluetooth/selftest.c b/net/bluetooth/selftest.c
index 03e3c89c3046..f49604d44b87 100644
--- a/net/bluetooth/selftest.c
+++ b/net/bluetooth/selftest.c
@@ -205,7 +205,7 @@ static int __init test_ecdh(void)
calltime = ktime_get();
- tfm = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0);
+ tfm = crypto_alloc_kpp("ecdh-nist-p256", 0, 0);
if (IS_ERR(tfm)) {
BT_ERR("Unable to create ECDH crypto context");
err = PTR_ERR(tfm);
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 73f7211d0431..3a1ce04a7a53 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -22,11 +22,10 @@
#include <linux/debugfs.h>
#include <linux/scatterlist.h>
-#include <linux/crypto.h>
-#include <crypto/algapi.h>
-#include <crypto/b128ops.h>
+#include <crypto/aes.h>
#include <crypto/hash.h>
#include <crypto/kpp.h>
+#include <crypto/utils.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
@@ -40,7 +39,7 @@
((struct smp_dev *)((struct l2cap_chan *)((hdev)->smp_data))->data)
/* Low-level debug macros to be used for stuff that we don't want
- * accidentially in dmesg, i.e. the values of the various crypto keys
+ * accidentally in dmesg, i.e. the values of the various crypto keys
* and the inputs & outputs of crypto functions.
*/
#ifdef DEBUG
@@ -54,9 +53,11 @@
#define SMP_ALLOW_CMD(smp, code) set_bit(code, &smp->allow_cmd)
/* Keys which are not distributed with Secure Connections */
-#define SMP_SC_NO_DIST (SMP_DIST_ENC_KEY | SMP_DIST_LINK_KEY);
+#define SMP_SC_NO_DIST (SMP_DIST_ENC_KEY | SMP_DIST_LINK_KEY)
-#define SMP_TIMEOUT msecs_to_jiffies(30000)
+#define SMP_TIMEOUT secs_to_jiffies(30)
+
+#define ID_ADDR_TIMEOUT msecs_to_jiffies(200)
#define AUTH_REQ_MASK(dev) (hci_dev_test_flag(dev, HCI_SC_ENABLED) ? \
0x3f : 0x07)
@@ -88,10 +89,6 @@ struct smp_dev {
u8 local_rand[16];
bool debug_key;
- u8 min_key_size;
- u8 max_key_size;
-
- struct crypto_cipher *tfm_aes;
struct crypto_shash *tfm_cmac;
struct crypto_kpp *tfm_ecdh;
};
@@ -115,9 +112,9 @@ struct smp_chan {
u8 id_addr_type;
u8 irk[16];
struct smp_csrk *csrk;
- struct smp_csrk *slave_csrk;
+ struct smp_csrk *responder_csrk;
struct smp_ltk *ltk;
- struct smp_ltk *slave_ltk;
+ struct smp_ltk *responder_ltk;
struct smp_irk *remote_irk;
u8 *link_key;
unsigned long flags;
@@ -130,7 +127,6 @@ struct smp_chan {
u8 dhkey[32];
u8 mackey[16];
- struct crypto_cipher *tfm_aes;
struct crypto_shash *tfm_cmac;
struct crypto_kpp *tfm_ecdh;
};
@@ -174,7 +170,6 @@ static int aes_cmac(struct crypto_shash *tfm, const u8 k[16], const u8 *m,
size_t len, u8 mac[16])
{
uint8_t tmp[16], mac_msb[16], msg_msb[CMAC_MSG_MAX];
- SHASH_DESC_ON_STACK(desc, tfm);
int err;
if (len > CMAC_MSG_MAX)
@@ -185,9 +180,6 @@ static int aes_cmac(struct crypto_shash *tfm, const u8 k[16], const u8 *m,
return -EINVAL;
}
- desc->tfm = tfm;
- desc->flags = 0;
-
/* Swap key and message from LSB to MSB */
swap_buf(k, tmp, 16);
swap_buf(m, msg_msb, len);
@@ -201,8 +193,7 @@ static int aes_cmac(struct crypto_shash *tfm, const u8 k[16], const u8 *m,
return err;
}
- err = crypto_shash_digest(desc, msg_msb, len, mac_msb);
- shash_desc_zero(desc);
+ err = crypto_shash_tfm_digest(tfm, msg_msb, len, mac_msb);
if (err) {
BT_ERR("Hash computation error %d", err);
return err;
@@ -381,22 +372,18 @@ static int smp_h7(struct crypto_shash *tfm_cmac, const u8 w[16],
* s1 and ah.
*/
-static int smp_e(struct crypto_cipher *tfm, const u8 *k, u8 *r)
+static int smp_e(const u8 *k, u8 *r)
{
+ struct crypto_aes_ctx ctx;
uint8_t tmp[16], data[16];
int err;
SMP_DBG("k %16phN r %16phN", k, r);
- if (!tfm) {
- BT_ERR("tfm %p", tfm);
- return -EINVAL;
- }
-
/* The most significant octet of key corresponds to k[0] */
swap_buf(k, tmp, 16);
- err = crypto_cipher_setkey(tfm, tmp, 16);
+ err = aes_expandkey(&ctx, tmp, 16);
if (err) {
BT_ERR("cipher setkey failed: %d", err);
return err;
@@ -405,17 +392,18 @@ static int smp_e(struct crypto_cipher *tfm, const u8 *k, u8 *r)
/* Most significant octet of plaintextData corresponds to data[0] */
swap_buf(r, data, 16);
- crypto_cipher_encrypt_one(tfm, data, data);
+ aes_encrypt(&ctx, data, data);
/* Most significant octet of encryptedData corresponds to data[0] */
swap_buf(data, r, 16);
SMP_DBG("r %16phN", r);
+ memzero_explicit(&ctx, sizeof(ctx));
return err;
}
-static int smp_c1(struct crypto_cipher *tfm_aes, const u8 k[16],
+static int smp_c1(const u8 k[16],
const u8 r[16], const u8 preq[7], const u8 pres[7], u8 _iat,
const bdaddr_t *ia, u8 _rat, const bdaddr_t *ra, u8 res[16])
{
@@ -437,10 +425,10 @@ static int smp_c1(struct crypto_cipher *tfm_aes, const u8 k[16],
SMP_DBG("p1 %16phN", p1);
/* res = r XOR p1 */
- u128_xor((u128 *) res, (u128 *) r, (u128 *) p1);
+ crypto_xor_cpy(res, r, p1, sizeof(p1));
/* res = e(k, res) */
- err = smp_e(tfm_aes, k, res);
+ err = smp_e(k, res);
if (err) {
BT_ERR("Encrypt data error");
return err;
@@ -454,17 +442,17 @@ static int smp_c1(struct crypto_cipher *tfm_aes, const u8 k[16],
SMP_DBG("p2 %16phN", p2);
/* res = res XOR p2 */
- u128_xor((u128 *) res, (u128 *) res, (u128 *) p2);
+ crypto_xor(res, p2, sizeof(p2));
/* res = e(k, res) */
- err = smp_e(tfm_aes, k, res);
+ err = smp_e(k, res);
if (err)
BT_ERR("Encrypt data error");
return err;
}
-static int smp_s1(struct crypto_cipher *tfm_aes, const u8 k[16],
+static int smp_s1(const u8 k[16],
const u8 r1[16], const u8 r2[16], u8 _r[16])
{
int err;
@@ -473,15 +461,14 @@ static int smp_s1(struct crypto_cipher *tfm_aes, const u8 k[16],
memcpy(_r, r2, 8);
memcpy(_r + 8, r1, 8);
- err = smp_e(tfm_aes, k, _r);
+ err = smp_e(k, _r);
if (err)
BT_ERR("Encrypt data error");
return err;
}
-static int smp_ah(struct crypto_cipher *tfm, const u8 irk[16],
- const u8 r[3], u8 res[3])
+static int smp_ah(const u8 irk[16], const u8 r[3], u8 res[3])
{
u8 _res[16];
int err;
@@ -490,7 +477,7 @@ static int smp_ah(struct crypto_cipher *tfm, const u8 irk[16],
memcpy(_res, r, 3);
memset(_res + 3, 0, 13);
- err = smp_e(tfm, irk, _res);
+ err = smp_e(irk, _res);
if (err) {
BT_ERR("Encrypt error");
return err;
@@ -511,18 +498,15 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16],
const bdaddr_t *bdaddr)
{
struct l2cap_chan *chan = hdev->smp_data;
- struct smp_dev *smp;
u8 hash[3];
int err;
if (!chan || !chan->data)
return false;
- smp = chan->data;
-
- BT_DBG("RPA %pMR IRK %*phN", bdaddr, 16, irk);
+ bt_dev_dbg(hdev, "RPA %pMR IRK %*phN", bdaddr, 16, irk);
- err = smp_ah(smp->tfm_aes, irk, &bdaddr->b[3], hash);
+ err = smp_ah(irk, &bdaddr->b[3], hash);
if (err)
return false;
@@ -532,24 +516,21 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16],
int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa)
{
struct l2cap_chan *chan = hdev->smp_data;
- struct smp_dev *smp;
int err;
if (!chan || !chan->data)
return -EOPNOTSUPP;
- smp = chan->data;
-
get_random_bytes(&rpa->b[3], 3);
rpa->b[5] &= 0x3f; /* Clear two most significant bits */
rpa->b[5] |= 0x40; /* Set second most significant bit */
- err = smp_ah(smp->tfm_aes, irk, &rpa->b[3], rpa->b);
+ err = smp_ah(irk, &rpa->b[3], rpa->b);
if (err < 0)
return err;
- BT_DBG("RPA %pMR", rpa);
+ bt_dev_dbg(hdev, "RPA %pMR", rpa);
return 0;
}
@@ -566,7 +547,7 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16])
smp = chan->data;
if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) {
- BT_DBG("Using debug keys");
+ bt_dev_dbg(hdev, "Using debug keys");
err = set_ecdh_privkey(smp->tfm_ecdh, debug_sk);
if (err)
return err;
@@ -580,7 +561,7 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16])
return err;
/* This is unlikely, but we need to check that
- * we didn't accidentially generate a debug key.
+ * we didn't accidentally generate a debug key.
*/
if (crypto_memneq(smp->local_pk, debug_pk, 64))
break;
@@ -615,7 +596,7 @@ static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data)
if (!chan)
return;
- BT_DBG("code 0x%2.2x", code);
+ bt_dev_dbg(conn->hcon->hdev, "code 0x%2.2x", code);
iv[0].iov_base = &code;
iv[0].iov_len = 1;
@@ -625,9 +606,9 @@ static void smp_send_cmd(struct l2cap_conn *conn, u8 code, u16 len, void *data)
memset(&msg, 0, sizeof(msg));
- iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iv, 2, 1 + len);
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, iv, 2, 1 + len);
- l2cap_chan_send(chan, &msg, 1 + len);
+ l2cap_chan_send(chan, &msg, 1 + len, NULL);
if (!chan->data)
return;
@@ -720,7 +701,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn,
if (rsp == NULL) {
req->io_capability = conn->hcon->io_capability;
req->oob_flag = oob_flag;
- req->max_key_size = SMP_DEV(hdev)->max_key_size;
+ req->max_key_size = hdev->le_max_key_size;
req->init_key_dist = local_dist;
req->resp_key_dist = remote_dist;
req->auth_req = (authreq & AUTH_REQ_MASK(hdev));
@@ -731,7 +712,7 @@ static void build_pairing_cmd(struct l2cap_conn *conn,
rsp->io_capability = conn->hcon->io_capability;
rsp->oob_flag = oob_flag;
- rsp->max_key_size = SMP_DEV(hdev)->max_key_size;
+ rsp->max_key_size = hdev->le_max_key_size;
rsp->init_key_dist = req->init_key_dist & remote_dist;
rsp->resp_key_dist = req->resp_key_dist & local_dist;
rsp->auth_req = (authreq & AUTH_REQ_MASK(hdev));
@@ -745,7 +726,11 @@ static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size)
struct hci_dev *hdev = conn->hcon->hdev;
struct smp_chan *smp = chan->data;
- if (max_key_size > SMP_DEV(hdev)->max_key_size ||
+ if (conn->hcon->pending_sec_level == BT_SECURITY_FIPS &&
+ max_key_size != SMP_MAX_ENC_KEY_SIZE)
+ return SMP_ENC_KEY_SIZE;
+
+ if (max_key_size > hdev->le_max_key_size ||
max_key_size < SMP_MIN_ENC_KEY_SIZE)
return SMP_ENC_KEY_SIZE;
@@ -768,11 +753,10 @@ static void smp_chan_destroy(struct l2cap_conn *conn)
complete = test_bit(SMP_FLAG_COMPLETE, &smp->flags);
mgmt_smp_complete(hcon, complete);
- kzfree(smp->csrk);
- kzfree(smp->slave_csrk);
- kzfree(smp->link_key);
+ kfree_sensitive(smp->csrk);
+ kfree_sensitive(smp->responder_csrk);
+ kfree_sensitive(smp->link_key);
- crypto_free_cipher(smp->tfm_aes);
crypto_free_shash(smp->tfm_cmac);
crypto_free_kpp(smp->tfm_ecdh);
@@ -793,9 +777,9 @@ static void smp_chan_destroy(struct l2cap_conn *conn)
kfree_rcu(smp->ltk, rcu);
}
- if (smp->slave_ltk) {
- list_del_rcu(&smp->slave_ltk->list);
- kfree_rcu(smp->slave_ltk, rcu);
+ if (smp->responder_ltk) {
+ list_del_rcu(&smp->responder_ltk->list);
+ kfree_rcu(smp->responder_ltk, rcu);
}
if (smp->remote_irk) {
@@ -805,7 +789,7 @@ static void smp_chan_destroy(struct l2cap_conn *conn)
}
chan->data = NULL;
- kzfree(smp);
+ kfree_sensitive(smp);
hci_conn_drop(hcon);
}
@@ -870,13 +854,14 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth,
struct l2cap_chan *chan = conn->smp;
struct smp_chan *smp = chan->data;
u32 passkey = 0;
- int ret = 0;
+ int ret;
/* Initialize key for JUST WORKS */
memset(smp->tk, 0, sizeof(smp->tk));
clear_bit(SMP_FLAG_TK_VALID, &smp->flags);
- BT_DBG("tk_request: auth:%d lcl:%d rem:%d", auth, local_io, remote_io);
+ bt_dev_dbg(hcon->hdev, "auth:%u lcl:%u rem:%u", auth, local_io,
+ remote_io);
/* If neither side wants MITM, either "just" confirm an incoming
* request or use just-works for outgoing ones. The JUST_CFM
@@ -899,9 +884,16 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth,
hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT)
smp->method = JUST_WORKS;
- /* If Just Works, Continue with Zero TK */
+ /* If Just Works, Continue with Zero TK and ask user-space for
+ * confirmation */
if (smp->method == JUST_WORKS) {
- set_bit(SMP_FLAG_TK_VALID, &smp->flags);
+ ret = mgmt_user_confirm_request(hcon->hdev, &hcon->dst,
+ hcon->type,
+ hcon->dst_type,
+ passkey, 1);
+ if (ret)
+ return ret;
+ set_bit(SMP_FLAG_WAIT_USER, &smp->flags);
return 0;
}
@@ -918,11 +910,11 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth,
hcon->pending_sec_level = BT_SECURITY_HIGH;
}
- /* If both devices have Keyoard-Display I/O, the master
- * Confirms and the slave Enters the passkey.
+ /* If both devices have Keyboard-Display I/O, the initiator
+ * Confirms and the responder Enters the passkey.
*/
if (smp->method == OVERLAP) {
- if (hcon->role == HCI_ROLE_MASTER)
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
smp->method = CFM_PASSKEY;
else
smp->method = REQ_PASSKEY;
@@ -934,7 +926,7 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth,
get_random_bytes(&passkey, sizeof(passkey));
passkey %= 1000000;
put_unaligned_le32(passkey, smp->tk);
- BT_DBG("PassKey: %d", passkey);
+ bt_dev_dbg(hcon->hdev, "PassKey: %u", passkey);
set_bit(SMP_FLAG_TK_VALID, &smp->flags);
}
@@ -959,9 +951,9 @@ static u8 smp_confirm(struct smp_chan *smp)
struct smp_cmd_pairing_confirm cp;
int ret;
- BT_DBG("conn %p", conn);
+ bt_dev_dbg(conn->hcon->hdev, "conn %p", conn);
- ret = smp_c1(smp->tfm_aes, smp->tk, smp->prnd, smp->preq, smp->prsp,
+ ret = smp_c1(smp->tk, smp->prnd, smp->preq, smp->prsp,
conn->hcon->init_addr_type, &conn->hcon->init_addr,
conn->hcon->resp_addr_type, &conn->hcon->resp_addr,
cp.confirm_val);
@@ -972,7 +964,7 @@ static u8 smp_confirm(struct smp_chan *smp)
smp_send_cmd(smp->conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp);
- if (conn->hcon->out)
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
else
SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
@@ -987,12 +979,11 @@ static u8 smp_random(struct smp_chan *smp)
u8 confirm[16];
int ret;
- if (IS_ERR_OR_NULL(smp->tfm_aes))
- return SMP_UNSPECIFIED;
-
- BT_DBG("conn %p %s", conn, conn->hcon->out ? "master" : "slave");
+ bt_dev_dbg(conn->hcon->hdev, "conn %p %s", conn,
+ test_bit(SMP_FLAG_INITIATOR, &smp->flags) ? "initiator" :
+ "responder");
- ret = smp_c1(smp->tfm_aes, smp->tk, smp->rrnd, smp->preq, smp->prsp,
+ ret = smp_c1(smp->tk, smp->rrnd, smp->preq, smp->prsp,
hcon->init_addr_type, &hcon->init_addr,
hcon->resp_addr_type, &hcon->resp_addr, confirm);
if (ret)
@@ -1004,12 +995,12 @@ static u8 smp_random(struct smp_chan *smp)
return SMP_CONFIRM_FAILED;
}
- if (hcon->out) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
u8 stk[16];
__le64 rand = 0;
__le16 ediv = 0;
- smp_s1(smp->tfm_aes, smp->tk, smp->rrnd, smp->prnd, stk);
+ smp_s1(smp->tk, smp->rrnd, smp->prnd, stk);
if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &hcon->flags))
return SMP_UNSPECIFIED;
@@ -1025,15 +1016,15 @@ static u8 smp_random(struct smp_chan *smp)
smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd),
smp->prnd);
- smp_s1(smp->tfm_aes, smp->tk, smp->prnd, smp->rrnd, stk);
+ smp_s1(smp->tk, smp->prnd, smp->rrnd, stk);
if (hcon->pending_sec_level == BT_SECURITY_HIGH)
auth = 1;
else
auth = 0;
- /* Even though there's no _SLAVE suffix this is the
- * slave STK we're adding for later lookup (the master
+ /* Even though there's no _RESPONDER suffix this is the
+ * responder STK we're adding for later lookup (the initiator
* STK never needs to be stored).
*/
hci_add_ltk(hcon->hdev, &hcon->dst, hcon->dst_type,
@@ -1078,7 +1069,12 @@ static void smp_notify_keys(struct l2cap_conn *conn)
if (hcon->type == LE_LINK) {
bacpy(&hcon->dst, &smp->remote_irk->bdaddr);
hcon->dst_type = smp->remote_irk->addr_type;
- queue_work(hdev->workqueue, &conn->id_addr_update_work);
+ /* Use a short delay to make sure the new address is
+ * propagated _before_ the channels.
+ */
+ queue_delayed_work(hdev->workqueue,
+ &conn->id_addr_timer,
+ ID_ADDR_TIMEOUT);
}
}
@@ -1088,10 +1084,10 @@ static void smp_notify_keys(struct l2cap_conn *conn)
mgmt_new_csrk(hdev, smp->csrk, persistent);
}
- if (smp->slave_csrk) {
- smp->slave_csrk->bdaddr_type = hcon->dst_type;
- bacpy(&smp->slave_csrk->bdaddr, &hcon->dst);
- mgmt_new_csrk(hdev, smp->slave_csrk, persistent);
+ if (smp->responder_csrk) {
+ smp->responder_csrk->bdaddr_type = hcon->dst_type;
+ bacpy(&smp->responder_csrk->bdaddr, &hcon->dst);
+ mgmt_new_csrk(hdev, smp->responder_csrk, persistent);
}
if (smp->ltk) {
@@ -1100,10 +1096,10 @@ static void smp_notify_keys(struct l2cap_conn *conn)
mgmt_new_ltk(hdev, smp->ltk, persistent);
}
- if (smp->slave_ltk) {
- smp->slave_ltk->bdaddr_type = hcon->dst_type;
- bacpy(&smp->slave_ltk->bdaddr, &hcon->dst);
- mgmt_new_ltk(hdev, smp->slave_ltk, persistent);
+ if (smp->responder_ltk) {
+ smp->responder_ltk->bdaddr_type = hcon->dst_type;
+ bacpy(&smp->responder_ltk->bdaddr, &hcon->dst);
+ mgmt_new_ltk(hdev, smp->responder_ltk, persistent);
}
if (smp->link_key) {
@@ -1164,11 +1160,11 @@ static void sc_generate_link_key(struct smp_chan *smp)
return;
if (test_bit(SMP_FLAG_CT2, &smp->flags)) {
- /* SALT = 0x00000000000000000000000000000000746D7031 */
+ /* SALT = 0x000000000000000000000000746D7031 */
const u8 salt[16] = { 0x31, 0x70, 0x6d, 0x74 };
if (smp_h7(smp->tfm_cmac, smp->tk, salt, smp->link_key)) {
- kzfree(smp->link_key);
+ kfree_sensitive(smp->link_key);
smp->link_key = NULL;
return;
}
@@ -1177,14 +1173,14 @@ static void sc_generate_link_key(struct smp_chan *smp)
const u8 tmp1[4] = { 0x31, 0x70, 0x6d, 0x74 };
if (smp_h6(smp->tfm_cmac, smp->tk, tmp1, smp->link_key)) {
- kzfree(smp->link_key);
+ kfree_sensitive(smp->link_key);
smp->link_key = NULL;
return;
}
}
if (smp_h6(smp->tfm_cmac, smp->link_key, lebr, smp->link_key)) {
- kzfree(smp->link_key);
+ kfree_sensitive(smp->link_key);
smp->link_key = NULL;
return;
}
@@ -1222,7 +1218,7 @@ static void sc_generate_ltk(struct smp_chan *smp)
set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags);
if (test_bit(SMP_FLAG_CT2, &smp->flags)) {
- /* SALT = 0x00000000000000000000000000000000746D7032 */
+ /* SALT = 0x000000000000000000000000746D7032 */
const u8 salt[16] = { 0x32, 0x70, 0x6d, 0x74 };
if (smp_h7(smp->tfm_cmac, key->val, salt, smp->tk))
@@ -1249,19 +1245,20 @@ static void smp_distribute_keys(struct smp_chan *smp)
struct hci_dev *hdev = hcon->hdev;
__u8 *keydist;
- BT_DBG("conn %p", conn);
+ bt_dev_dbg(hdev, "conn %p", conn);
rsp = (void *) &smp->prsp[1];
/* The responder sends its keys first */
- if (hcon->out && (smp->remote_key_dist & KEY_DIST_MASK)) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags) &&
+ (smp->remote_key_dist & KEY_DIST_MASK)) {
smp_allow_key_dist(smp);
return;
}
req = (void *) &smp->preq[1];
- if (hcon->out) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
keydist = &rsp->init_key_dist;
*keydist &= req->init_key_dist;
} else {
@@ -1279,11 +1276,11 @@ static void smp_distribute_keys(struct smp_chan *smp)
*keydist &= ~SMP_SC_NO_DIST;
}
- BT_DBG("keydist 0x%x", *keydist);
+ bt_dev_dbg(hdev, "keydist 0x%x", *keydist);
if (*keydist & SMP_DIST_ENC_KEY) {
struct smp_cmd_encrypt_info enc;
- struct smp_cmd_master_ident ident;
+ struct smp_cmd_initiator_ident ident;
struct smp_ltk *ltk;
u8 authenticated;
__le16 ediv;
@@ -1304,14 +1301,15 @@ static void smp_distribute_keys(struct smp_chan *smp)
authenticated = hcon->sec_level == BT_SECURITY_HIGH;
ltk = hci_add_ltk(hdev, &hcon->dst, hcon->dst_type,
- SMP_LTK_SLAVE, authenticated, enc.ltk,
+ SMP_LTK_RESPONDER, authenticated, enc.ltk,
smp->enc_key_size, ediv, rand);
- smp->slave_ltk = ltk;
+ smp->responder_ltk = ltk;
ident.ediv = ediv;
ident.rand = rand;
- smp_send_cmd(conn, SMP_CMD_MASTER_IDENT, sizeof(ident), &ident);
+ smp_send_cmd(conn, SMP_CMD_INITIATOR_IDENT, sizeof(ident),
+ &ident);
*keydist &= ~SMP_DIST_ENC_KEY;
}
@@ -1354,7 +1352,7 @@ static void smp_distribute_keys(struct smp_chan *smp)
csrk->type = MGMT_CSRK_LOCAL_UNAUTHENTICATED;
memcpy(csrk->val, sign.csrk, sizeof(csrk->val));
}
- smp->slave_csrk = csrk;
+ smp->responder_csrk = csrk;
smp_send_cmd(conn, SMP_CMD_SIGN_INFO, sizeof(sign), &sign);
@@ -1379,13 +1377,14 @@ static void smp_timeout(struct work_struct *work)
security_timer.work);
struct l2cap_conn *conn = smp->conn;
- BT_DBG("conn %p", conn);
+ bt_dev_dbg(conn->hcon->hdev, "conn %p", conn);
- hci_disconnect(conn->hcon, HCI_ERROR_REMOTE_USER_TERM);
+ hci_disconnect(conn->hcon, HCI_ERROR_AUTH_FAILURE);
}
static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
{
+ struct hci_conn *hcon = conn->hcon;
struct l2cap_chan *chan = conn->smp;
struct smp_chan *smp;
@@ -1393,21 +1392,15 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
if (!smp)
return NULL;
- smp->tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(smp->tfm_aes)) {
- BT_ERR("Unable to create AES crypto context");
- goto zfree_smp;
- }
-
smp->tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
if (IS_ERR(smp->tfm_cmac)) {
- BT_ERR("Unable to create CMAC crypto context");
- goto free_cipher;
+ bt_dev_err(hcon->hdev, "Unable to create CMAC crypto context");
+ goto zfree_smp;
}
- smp->tfm_ecdh = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0);
+ smp->tfm_ecdh = crypto_alloc_kpp("ecdh-nist-p256", 0, 0);
if (IS_ERR(smp->tfm_ecdh)) {
- BT_ERR("Unable to create ECDH crypto context");
+ bt_dev_err(hcon->hdev, "Unable to create ECDH crypto context");
goto free_shash;
}
@@ -1418,16 +1411,14 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
INIT_DELAYED_WORK(&smp->security_timer, smp_timeout);
- hci_conn_hold(conn->hcon);
+ hci_conn_hold(hcon);
return smp;
free_shash:
crypto_free_shash(smp->tfm_cmac);
-free_cipher:
- crypto_free_cipher(smp->tfm_aes);
zfree_smp:
- kzfree(smp);
+ kfree_sensitive(smp);
return NULL;
}
@@ -1436,7 +1427,7 @@ static int sc_mackey_and_ltk(struct smp_chan *smp, u8 mackey[16], u8 ltk[16])
struct hci_conn *hcon = smp->conn->hcon;
u8 *na, *nb, a[7], b[7];
- if (hcon->out) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
na = smp->prnd;
nb = smp->rrnd;
} else {
@@ -1464,7 +1455,7 @@ static void sc_dhkey_check(struct smp_chan *smp)
a[6] = hcon->init_addr_type;
b[6] = hcon->resp_addr_type;
- if (hcon->out) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
local_addr = a;
remote_addr = b;
memcpy(io_cap, &smp->preq[1], 3);
@@ -1543,7 +1534,7 @@ static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op)
/* The round is only complete when the initiator
* receives pairing random.
*/
- if (!hcon->out) {
+ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
sizeof(smp->prnd), smp->prnd);
if (smp->passkey_round == 20)
@@ -1571,7 +1562,7 @@ static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op)
SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
- if (hcon->out) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
sizeof(smp->prnd), smp->prnd);
return 0;
@@ -1582,11 +1573,11 @@ static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op)
case SMP_CMD_PUBLIC_KEY:
default:
/* Initiating device starts the round */
- if (!hcon->out)
+ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags))
return 0;
- BT_DBG("%s Starting passkey round %u", hdev->name,
- smp->passkey_round + 1);
+ bt_dev_dbg(hdev, "Starting passkey round %u",
+ smp->passkey_round + 1);
SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
@@ -1627,7 +1618,7 @@ static int sc_user_reply(struct smp_chan *smp, u16 mgmt_op, __le32 passkey)
}
/* Initiator sends DHKey check first */
- if (hcon->out) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
sc_dhkey_check(smp);
SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
} else if (test_and_clear_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags)) {
@@ -1646,11 +1637,11 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey)
u32 value;
int err;
- BT_DBG("");
-
if (!conn)
return -ENOTCONN;
+ bt_dev_dbg(conn->hcon->hdev, "");
+
chan = conn->smp;
if (!chan)
return -ENOTCONN;
@@ -1672,9 +1663,9 @@ int smp_user_confirm_reply(struct hci_conn *hcon, u16 mgmt_op, __le32 passkey)
case MGMT_OP_USER_PASSKEY_REPLY:
value = le32_to_cpu(passkey);
memset(smp->tk, 0, sizeof(smp->tk));
- BT_DBG("PassKey: %d", value);
+ bt_dev_dbg(conn->hcon->hdev, "PassKey: %u", value);
put_unaligned_le32(value, smp->tk);
- /* Fall Through */
+ fallthrough;
case MGMT_OP_USER_CONFIRM_REPLY:
set_bit(SMP_FLAG_TK_VALID, &smp->flags);
break;
@@ -1750,25 +1741,23 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
struct smp_cmd_pairing rsp, *req = (void *) skb->data;
struct l2cap_chan *chan = conn->smp;
struct hci_dev *hdev = conn->hcon->hdev;
- struct smp_chan *smp;
+ struct smp_chan *smp = chan->data;
u8 key_size, auth, sec_level;
int ret;
- BT_DBG("conn %p", conn);
+ bt_dev_dbg(hdev, "conn %p", conn);
if (skb->len < sizeof(*req))
return SMP_INVALID_PARAMS;
- if (conn->hcon->role != HCI_ROLE_SLAVE)
+ if (smp && test_bit(SMP_FLAG_INITIATOR, &smp->flags))
return SMP_CMD_NOTSUPP;
- if (!chan->data)
+ if (!smp) {
smp = smp_chan_create(conn);
- else
- smp = chan->data;
-
- if (!smp)
- return SMP_UNSPECIFIED;
+ if (!smp)
+ return SMP_UNSPECIFIED;
+ }
/* We didn't start the pairing, so match remote */
auth = req->auth_req & AUTH_REQ_MASK(hdev);
@@ -1887,7 +1876,7 @@ static u8 sc_send_public_key(struct smp_chan *smp)
{
struct hci_dev *hdev = smp->conn->hcon->hdev;
- BT_DBG("");
+ bt_dev_dbg(hdev, "");
if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) {
struct l2cap_chan *chan = hdev->smp_data;
@@ -1908,7 +1897,7 @@ static u8 sc_send_public_key(struct smp_chan *smp)
}
if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) {
- BT_DBG("Using debug keys");
+ bt_dev_dbg(hdev, "Using debug keys");
if (set_ecdh_privkey(smp->tfm_ecdh, debug_sk))
return SMP_UNSPECIFIED;
memcpy(smp->local_pk, debug_pk, 64);
@@ -1920,7 +1909,7 @@ static u8 sc_send_public_key(struct smp_chan *smp)
return SMP_UNSPECIFIED;
/* This is unlikely, but we need to check that
- * we didn't accidentially generate a debug key.
+ * we didn't accidentally generate a debug key.
*/
if (crypto_memneq(smp->local_pk, debug_pk, 64))
break;
@@ -1945,12 +1934,12 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb)
u8 key_size, auth;
int ret;
- BT_DBG("conn %p", conn);
+ bt_dev_dbg(hdev, "conn %p", conn);
if (skb->len < sizeof(*rsp))
return SMP_INVALID_PARAMS;
- if (conn->hcon->role != HCI_ROLE_MASTER)
+ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags))
return SMP_CMD_NOTSUPP;
skb_pull(skb, sizeof(*rsp));
@@ -2040,12 +2029,12 @@ static u8 sc_check_confirm(struct smp_chan *smp)
{
struct l2cap_conn *conn = smp->conn;
- BT_DBG("");
+ bt_dev_dbg(conn->hcon->hdev, "");
if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY)
return sc_passkey_round(smp, SMP_CMD_PAIRING_CONFIRM);
- if (conn->hcon->out) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd),
smp->prnd);
SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
@@ -2066,8 +2055,8 @@ static int fixup_sc_false_positive(struct smp_chan *smp)
struct smp_cmd_pairing *req, *rsp;
u8 auth;
- /* The issue is only observed when we're in slave role */
- if (hcon->out)
+ /* The issue is only observed when we're in responder role */
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
return SMP_UNSPECIFIED;
if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) {
@@ -2099,8 +2088,12 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb)
{
struct l2cap_chan *chan = conn->smp;
struct smp_chan *smp = chan->data;
+ struct hci_conn *hcon = conn->hcon;
+ struct hci_dev *hdev = hcon->hdev;
- BT_DBG("conn %p %s", conn, conn->hcon->out ? "master" : "slave");
+ bt_dev_dbg(hdev, "conn %p %s", conn,
+ test_bit(SMP_FLAG_INITIATOR, &smp->flags) ? "initiator" :
+ "responder");
if (skb->len < sizeof(smp->pcnf))
return SMP_INVALID_PARAMS;
@@ -2115,14 +2108,14 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb)
if (test_bit(SMP_FLAG_REMOTE_PK, &smp->flags))
return sc_check_confirm(smp);
- BT_ERR("Unexpected SMP Pairing Confirm");
+ bt_dev_err(hdev, "Unexpected SMP Pairing Confirm");
ret = fixup_sc_false_positive(smp);
if (ret)
return ret;
}
- if (conn->hcon->out) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd),
smp->prnd);
SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
@@ -2142,11 +2135,11 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb)
struct l2cap_chan *chan = conn->smp;
struct smp_chan *smp = chan->data;
struct hci_conn *hcon = conn->hcon;
- u8 *pkax, *pkbx, *na, *nb;
- u32 passkey;
+ u8 *pkax, *pkbx, *na, *nb, confirm_hint;
+ u32 passkey = 0;
int err;
- BT_DBG("conn %p", conn);
+ bt_dev_dbg(hcon->hdev, "conn %p", conn);
if (skb->len < sizeof(smp->rrnd))
return SMP_INVALID_PARAMS;
@@ -2157,7 +2150,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb)
if (!test_bit(SMP_FLAG_SC, &smp->flags))
return smp_random(smp);
- if (hcon->out) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
pkax = smp->local_pk;
pkbx = smp->remote_pk;
na = smp->prnd;
@@ -2170,7 +2163,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb)
}
if (smp->method == REQ_OOB) {
- if (!hcon->out)
+ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags))
smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
sizeof(smp->prnd), smp->prnd);
SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
@@ -2181,7 +2174,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb)
if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY)
return sc_passkey_round(smp, SMP_CMD_PAIRING_RANDOM);
- if (hcon->out) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
u8 cfm[16];
err = smp_f4(smp->tfm_cmac, smp->remote_pk, smp->local_pk,
@@ -2203,8 +2196,8 @@ mackey_and_ltk:
if (err)
return SMP_UNSPECIFIED;
- if (smp->method == JUST_WORKS || smp->method == REQ_OOB) {
- if (hcon->out) {
+ if (smp->method == REQ_OOB) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
sc_dhkey_check(smp);
SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
}
@@ -2215,8 +2208,15 @@ mackey_and_ltk:
if (err)
return SMP_UNSPECIFIED;
+ /* Always require user confirmation for Just-Works pairing to prevent
+ * impersonation attacks, or in case of a legitimate device that is
+ * repairing use the confirmation as acknowledgment to proceed with the
+ * creation of new keys.
+ */
+ confirm_hint = smp->method == JUST_WORKS ? 1 : 0;
+
err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type,
- hcon->dst_type, passkey, 0);
+ hcon->dst_type, passkey, confirm_hint);
if (err)
return SMP_UNSPECIFIED;
@@ -2243,7 +2243,7 @@ static bool smp_ltk_encrypt(struct l2cap_conn *conn, u8 sec_level)
hci_le_start_enc(hcon, key->ediv, key->rand, key->val, key->enc_size);
hcon->enc_key_size = key->enc_size;
- /* We never store STKs for master role, so clear this flag */
+ /* We never store STKs for initiator role, so clear this flag */
clear_bit(HCI_CONN_STK_ENCRYPT, &hcon->flags);
return true;
@@ -2272,16 +2272,33 @@ bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level,
return false;
}
+static void smp_send_pairing_req(struct smp_chan *smp, __u8 auth)
+{
+ struct smp_cmd_pairing cp;
+
+ if (smp->conn->hcon->type == ACL_LINK)
+ build_bredr_pairing_cmd(smp, &cp, NULL);
+ else
+ build_pairing_cmd(smp->conn, &cp, NULL, auth);
+
+ smp->preq[0] = SMP_CMD_PAIRING_REQ;
+ memcpy(&smp->preq[1], &cp, sizeof(cp));
+
+ smp_send_cmd(smp->conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp);
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP);
+
+ set_bit(SMP_FLAG_INITIATOR, &smp->flags);
+}
+
static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb)
{
struct smp_cmd_security_req *rp = (void *) skb->data;
- struct smp_cmd_pairing cp;
struct hci_conn *hcon = conn->hcon;
struct hci_dev *hdev = hcon->hdev;
struct smp_chan *smp;
u8 sec_level, auth;
- BT_DBG("conn %p", conn);
+ bt_dev_dbg(hdev, "conn %p", conn);
if (skb->len < sizeof(*rp))
return SMP_INVALID_PARAMS;
@@ -2324,16 +2341,20 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb)
skb_pull(skb, sizeof(*rp));
- memset(&cp, 0, sizeof(cp));
- build_pairing_cmd(conn, &cp, NULL, auth);
+ smp_send_pairing_req(smp, auth);
- smp->preq[0] = SMP_CMD_PAIRING_REQ;
- memcpy(&smp->preq[1], &cp, sizeof(cp));
+ return 0;
+}
- smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp);
- SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP);
+static void smp_send_security_req(struct smp_chan *smp, __u8 auth)
+{
+ struct smp_cmd_security_req cp;
- return 0;
+ cp.auth_req = auth;
+ smp_send_cmd(smp->conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp);
+ SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_REQ);
+
+ clear_bit(SMP_FLAG_INITIATOR, &smp->flags);
}
int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
@@ -2344,7 +2365,8 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
__u8 authreq;
int ret;
- BT_DBG("conn %p hcon %p level 0x%2.2x", conn, hcon, sec_level);
+ bt_dev_dbg(hcon->hdev, "conn %p hcon %p level 0x%2.2x", conn, hcon,
+ sec_level);
/* This may be NULL if there's an unexpected disconnection */
if (!conn)
@@ -2391,30 +2413,23 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
authreq |= SMP_AUTH_CT2;
}
- /* Require MITM if IO Capability allows or the security level
- * requires it.
+ /* Don't attempt to set MITM if setting is overridden by debugfs
+ * Needed to pass certification test SM/MAS/PKE/BV-01-C
*/
- if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT ||
- hcon->pending_sec_level > BT_SECURITY_MEDIUM)
- authreq |= SMP_AUTH_MITM;
-
- if (hcon->role == HCI_ROLE_MASTER) {
- struct smp_cmd_pairing cp;
-
- build_pairing_cmd(conn, &cp, NULL, authreq);
- smp->preq[0] = SMP_CMD_PAIRING_REQ;
- memcpy(&smp->preq[1], &cp, sizeof(cp));
-
- smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp);
- SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP);
- } else {
- struct smp_cmd_security_req cp;
- cp.auth_req = authreq;
- smp_send_cmd(conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp);
- SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_REQ);
+ if (!hci_dev_test_flag(hcon->hdev, HCI_FORCE_NO_MITM)) {
+ /* Require MITM if IO Capability allows or the security level
+ * requires it.
+ */
+ if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT ||
+ hcon->pending_sec_level > BT_SECURITY_MEDIUM)
+ authreq |= SMP_AUTH_MITM;
}
- set_bit(SMP_FLAG_INITIATOR, &smp->flags);
+ if (hcon->role == HCI_ROLE_MASTER)
+ smp_send_pairing_req(smp, authreq);
+ else
+ smp_send_security_req(smp, authreq);
+
ret = 0;
unlock:
@@ -2453,7 +2468,7 @@ int smp_cancel_and_remove_pairing(struct hci_dev *hdev, bdaddr_t *bdaddr,
/* Set keys to NULL to make sure smp_failure() does not try to
* remove and free already invalidated rcu list entries. */
smp->ltk = NULL;
- smp->slave_ltk = NULL;
+ smp->responder_ltk = NULL;
smp->remote_irk = NULL;
if (test_bit(SMP_FLAG_COMPLETE, &smp->flags))
@@ -2475,12 +2490,21 @@ static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb)
struct l2cap_chan *chan = conn->smp;
struct smp_chan *smp = chan->data;
- BT_DBG("conn %p", conn);
+ bt_dev_dbg(conn->hcon->hdev, "conn %p", conn);
if (skb->len < sizeof(*rp))
return SMP_INVALID_PARAMS;
- SMP_ALLOW_CMD(smp, SMP_CMD_MASTER_IDENT);
+ /* Pairing is aborted if any blocked keys are distributed */
+ if (hci_is_blocked_key(conn->hcon->hdev, HCI_BLOCKED_KEY_TYPE_LTK,
+ rp->ltk)) {
+ bt_dev_warn_ratelimited(conn->hcon->hdev,
+ "LTK blocked for %pMR",
+ &conn->hcon->dst);
+ return SMP_INVALID_PARAMS;
+ }
+
+ SMP_ALLOW_CMD(smp, SMP_CMD_INITIATOR_IDENT);
skb_pull(skb, sizeof(*rp));
@@ -2489,9 +2513,9 @@ static int smp_cmd_encrypt_info(struct l2cap_conn *conn, struct sk_buff *skb)
return 0;
}
-static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb)
+static int smp_cmd_initiator_ident(struct l2cap_conn *conn, struct sk_buff *skb)
{
- struct smp_cmd_master_ident *rp = (void *) skb->data;
+ struct smp_cmd_initiator_ident *rp = (void *)skb->data;
struct l2cap_chan *chan = conn->smp;
struct smp_chan *smp = chan->data;
struct hci_dev *hdev = conn->hcon->hdev;
@@ -2499,7 +2523,7 @@ static int smp_cmd_master_ident(struct l2cap_conn *conn, struct sk_buff *skb)
struct smp_ltk *ltk;
u8 authenticated;
- BT_DBG("conn %p", conn);
+ bt_dev_dbg(hdev, "conn %p", conn);
if (skb->len < sizeof(*rp))
return SMP_INVALID_PARAMS;
@@ -2531,11 +2555,20 @@ static int smp_cmd_ident_info(struct l2cap_conn *conn, struct sk_buff *skb)
struct l2cap_chan *chan = conn->smp;
struct smp_chan *smp = chan->data;
- BT_DBG("");
+ bt_dev_dbg(conn->hcon->hdev, "");
if (skb->len < sizeof(*info))
return SMP_INVALID_PARAMS;
+ /* Pairing is aborted if any blocked keys are distributed */
+ if (hci_is_blocked_key(conn->hcon->hdev, HCI_BLOCKED_KEY_TYPE_IRK,
+ info->irk)) {
+ bt_dev_warn_ratelimited(conn->hcon->hdev,
+ "Identity key blocked for %pMR",
+ &conn->hcon->dst);
+ return SMP_INVALID_PARAMS;
+ }
+
SMP_ALLOW_CMD(smp, SMP_CMD_IDENT_ADDR_INFO);
skb_pull(skb, sizeof(*info));
@@ -2554,7 +2587,7 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn,
struct hci_conn *hcon = conn->hcon;
bdaddr_t rpa;
- BT_DBG("");
+ bt_dev_dbg(hcon->hdev, "");
if (skb->len < sizeof(*info))
return SMP_INVALID_PARAMS;
@@ -2583,6 +2616,19 @@ static int smp_cmd_ident_addr_info(struct l2cap_conn *conn,
goto distribute;
}
+ /* Drop IRK if peer is using identity address during pairing but is
+ * providing different address as identity information.
+ *
+ * Microsoft Surface Precision Mouse is known to have this bug.
+ */
+ if (hci_is_identity_address(&hcon->dst, hcon->dst_type) &&
+ (bacmp(&info->bdaddr, &hcon->dst) ||
+ info->addr_type != hcon->dst_type)) {
+ bt_dev_err(hcon->hdev,
+ "ignoring IRK with invalid identity address");
+ goto distribute;
+ }
+
bacpy(&smp->id_addr, &info->bdaddr);
smp->id_addr_type = info->addr_type;
@@ -2608,7 +2654,7 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb)
struct smp_chan *smp = chan->data;
struct smp_csrk *csrk;
- BT_DBG("conn %p", conn);
+ bt_dev_dbg(conn->hcon->hdev, "conn %p", conn);
if (skb->len < sizeof(*rp))
return SMP_INVALID_PARAMS;
@@ -2634,8 +2680,6 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb)
static u8 sc_select_method(struct smp_chan *smp)
{
- struct l2cap_conn *conn = smp->conn;
- struct hci_conn *hcon = conn->hcon;
struct smp_cmd_pairing *local, *remote;
u8 local_mitm, remote_mitm, local_io, remote_io, method;
@@ -2648,7 +2692,7 @@ static u8 sc_select_method(struct smp_chan *smp)
* the "struct smp_cmd_pairing" from them we need to skip the
* first byte which contains the opcode.
*/
- if (hcon->out) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
local = (void *) &smp->preq[1];
remote = (void *) &smp->prsp[1];
} else {
@@ -2688,11 +2732,20 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb)
struct smp_cmd_pairing_confirm cfm;
int err;
- BT_DBG("conn %p", conn);
+ bt_dev_dbg(hdev, "conn %p", conn);
if (skb->len < sizeof(*key))
return SMP_INVALID_PARAMS;
+ /* Check if remote and local public keys are the same and debug key is
+ * not in use.
+ */
+ if (!test_bit(SMP_FLAG_DEBUG_KEY, &smp->flags) &&
+ !crypto_memneq(key, smp->local_pk, 64)) {
+ bt_dev_err(hdev, "Remote and local public keys are identical");
+ return SMP_UNSPECIFIED;
+ }
+
memcpy(smp->remote_pk, key, 64);
if (test_bit(SMP_FLAG_REMOTE_OOB, &smp->flags)) {
@@ -2708,7 +2761,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb)
/* Non-initiating device sends its public key after receiving
* the key from the initiating device.
*/
- if (!hcon->out) {
+ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
err = sc_send_public_key(smp);
if (err)
return err;
@@ -2743,7 +2796,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb)
smp->method = sc_select_method(smp);
- BT_DBG("%s selected method 0x%02x", hdev->name, smp->method);
+ bt_dev_dbg(hdev, "selected method 0x%02x", smp->method);
/* JUST_WORKS and JUST_CFM result in an unauthenticated key */
if (smp->method == JUST_WORKS || smp->method == JUST_CFM)
@@ -2770,7 +2823,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb)
}
if (smp->method == REQ_OOB) {
- if (hcon->out)
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
sizeof(smp->prnd), smp->prnd);
@@ -2779,7 +2832,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb)
return 0;
}
- if (hcon->out)
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
if (smp->method == REQ_PASSKEY) {
@@ -2794,7 +2847,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb)
/* The Initiating device waits for the non-initiating device to
* send the confirm value.
*/
- if (conn->hcon->out)
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
return 0;
err = smp_f4(smp->tfm_cmac, smp->local_pk, smp->remote_pk, smp->prnd,
@@ -2818,7 +2871,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb)
u8 io_cap[3], r[16], e[16];
int err;
- BT_DBG("conn %p", conn);
+ bt_dev_dbg(hcon->hdev, "conn %p", conn);
if (skb->len < sizeof(*check))
return SMP_INVALID_PARAMS;
@@ -2828,7 +2881,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb)
a[6] = hcon->init_addr_type;
b[6] = hcon->resp_addr_type;
- if (hcon->out) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
local_addr = a;
remote_addr = b;
memcpy(io_cap, &smp->prsp[1], 3);
@@ -2853,19 +2906,19 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb)
if (crypto_memneq(check->e, e, 16))
return SMP_DHKEY_CHECK_FAILED;
- if (!hcon->out) {
+ if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
if (test_bit(SMP_FLAG_WAIT_USER, &smp->flags)) {
set_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags);
return 0;
}
- /* Slave sends DHKey check as response to master */
+ /* Responder sends DHKey check as response to initiator */
sc_dhkey_check(smp);
}
sc_add_ltk(smp);
- if (hcon->out) {
+ if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
hci_le_start_enc(hcon, 0, 0, smp->tk, smp->enc_key_size);
hcon->enc_key_size = smp->enc_key_size;
}
@@ -2878,7 +2931,7 @@ static int smp_cmd_keypress_notify(struct l2cap_conn *conn,
{
struct smp_cmd_keypress_notify *kp = (void *) skb->data;
- BT_DBG("value 0x%02x", kp->value);
+ bt_dev_dbg(conn->hcon->hdev, "value 0x%02x", kp->value);
return 0;
}
@@ -2907,8 +2960,25 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb)
if (code > SMP_CMD_MAX)
goto drop;
- if (smp && !test_and_clear_bit(code, &smp->allow_cmd))
+ if (smp && !test_and_clear_bit(code, &smp->allow_cmd)) {
+ /* If there is a context and the command is not allowed consider
+ * it a failure so the session is cleaned up properly.
+ */
+ switch (code) {
+ case SMP_CMD_IDENT_INFO:
+ case SMP_CMD_IDENT_ADDR_INFO:
+ case SMP_CMD_SIGN_INFO:
+ /* 3.6.1. Key distribution and generation
+ *
+ * A device may reject a distributed key by sending the
+ * Pairing Failed command with the reason set to
+ * "Key Rejected".
+ */
+ smp_failure(conn, SMP_KEY_REJECTED);
+ break;
+ }
goto drop;
+ }
/* If we don't have a context the only allowed commands are
* pairing request and security request.
@@ -2946,8 +3016,8 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb)
reason = smp_cmd_encrypt_info(conn, skb);
break;
- case SMP_CMD_MASTER_IDENT:
- reason = smp_cmd_master_ident(conn, skb);
+ case SMP_CMD_INITIATOR_IDENT:
+ reason = smp_cmd_initiator_ident(conn, skb);
break;
case SMP_CMD_IDENT_INFO:
@@ -2975,7 +3045,7 @@ static int smp_sig_channel(struct l2cap_chan *chan, struct sk_buff *skb)
break;
default:
- BT_DBG("Unknown command code 0x%2.2x", code);
+ bt_dev_dbg(hcon->hdev, "Unknown command code 0x%2.2x", code);
reason = SMP_CMD_NOTSUPP;
goto done;
}
@@ -3000,7 +3070,7 @@ static void smp_teardown_cb(struct l2cap_chan *chan, int err)
{
struct l2cap_conn *conn = chan->conn;
- BT_DBG("chan %p", chan);
+ bt_dev_dbg(conn->hcon->hdev, "chan %p", chan);
if (chan->data)
smp_chan_destroy(conn);
@@ -3014,10 +3084,9 @@ static void bredr_pairing(struct l2cap_chan *chan)
struct l2cap_conn *conn = chan->conn;
struct hci_conn *hcon = conn->hcon;
struct hci_dev *hdev = hcon->hdev;
- struct smp_cmd_pairing req;
struct smp_chan *smp;
- BT_DBG("chan %p", chan);
+ bt_dev_dbg(hdev, "chan %p", chan);
/* Only new pairings are interesting */
if (!test_bit(HCI_CONN_NEW_LINK_KEY, &hcon->flags))
@@ -3027,7 +3096,7 @@ static void bredr_pairing(struct l2cap_chan *chan)
if (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags))
return;
- /* Only master may initiate SMP over BR/EDR */
+ /* Only initiator may initiate SMP over BR/EDR */
if (hcon->role != HCI_ROLE_MASTER)
return;
@@ -3064,16 +3133,9 @@ static void bredr_pairing(struct l2cap_chan *chan)
set_bit(SMP_FLAG_SC, &smp->flags);
- BT_DBG("%s starting SMP over BR/EDR", hdev->name);
-
- /* Prepare and send the BR/EDR SMP Pairing Request */
- build_bredr_pairing_cmd(smp, &req, NULL);
+ bt_dev_dbg(hdev, "starting SMP over BR/EDR");
- smp->preq[0] = SMP_CMD_PAIRING_REQ;
- memcpy(&smp->preq[1], &req, sizeof(req));
-
- smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(req), &req);
- SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP);
+ smp_send_pairing_req(smp, 0x00);
}
static void smp_resume_cb(struct l2cap_chan *chan)
@@ -3082,7 +3144,7 @@ static void smp_resume_cb(struct l2cap_chan *chan)
struct l2cap_conn *conn = chan->conn;
struct hci_conn *hcon = conn->hcon;
- BT_DBG("chan %p", chan);
+ bt_dev_dbg(hcon->hdev, "chan %p", chan);
if (hcon->type == ACL_LINK) {
bredr_pairing(chan);
@@ -3105,12 +3167,12 @@ static void smp_ready_cb(struct l2cap_chan *chan)
struct l2cap_conn *conn = chan->conn;
struct hci_conn *hcon = conn->hcon;
- BT_DBG("chan %p", chan);
+ bt_dev_dbg(hcon->hdev, "chan %p", chan);
/* No need to call l2cap_chan_hold() here since we already own
* the reference taken in smp_new_conn_cb(). This is just the
* first time that we tie it to a specific pointer. The code in
- * l2cap_core.c ensures that there's no risk this function wont
+ * l2cap_core.c ensures that there's no risk this function won't
* get called if smp_new_conn_cb was previously called.
*/
conn->smp = chan;
@@ -3123,7 +3185,7 @@ static int smp_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb)
{
int err;
- BT_DBG("chan %p", chan);
+ bt_dev_dbg(chan->conn->hcon->hdev, "chan %p", chan);
err = smp_sig_channel(chan, skb);
if (err) {
@@ -3223,7 +3285,6 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
{
struct l2cap_chan *chan;
struct smp_dev *smp;
- struct crypto_cipher *tfm_aes;
struct crypto_shash *tfm_cmac;
struct crypto_kpp *tfm_ecdh;
@@ -3236,45 +3297,32 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
if (!smp)
return ERR_PTR(-ENOMEM);
- tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm_aes)) {
- BT_ERR("Unable to create AES crypto context");
- kzfree(smp);
- return ERR_CAST(tfm_aes);
- }
-
tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
if (IS_ERR(tfm_cmac)) {
- BT_ERR("Unable to create CMAC crypto context");
- crypto_free_cipher(tfm_aes);
- kzfree(smp);
+ bt_dev_err(hdev, "Unable to create CMAC crypto context");
+ kfree_sensitive(smp);
return ERR_CAST(tfm_cmac);
}
- tfm_ecdh = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0);
+ tfm_ecdh = crypto_alloc_kpp("ecdh-nist-p256", 0, 0);
if (IS_ERR(tfm_ecdh)) {
- BT_ERR("Unable to create ECDH crypto context");
+ bt_dev_err(hdev, "Unable to create ECDH crypto context");
crypto_free_shash(tfm_cmac);
- crypto_free_cipher(tfm_aes);
- kzfree(smp);
+ kfree_sensitive(smp);
return ERR_CAST(tfm_ecdh);
}
smp->local_oob = false;
- smp->tfm_aes = tfm_aes;
smp->tfm_cmac = tfm_cmac;
smp->tfm_ecdh = tfm_ecdh;
- smp->min_key_size = SMP_MIN_ENC_KEY_SIZE;
- smp->max_key_size = SMP_MAX_ENC_KEY_SIZE;
create_chan:
chan = l2cap_chan_create();
if (!chan) {
if (smp) {
- crypto_free_cipher(smp->tfm_aes);
crypto_free_shash(smp->tfm_cmac);
crypto_free_kpp(smp->tfm_ecdh);
- kzfree(smp);
+ kfree_sensitive(smp);
}
return ERR_PTR(-ENOMEM);
}
@@ -3319,40 +3367,16 @@ static void smp_del_chan(struct l2cap_chan *chan)
smp = chan->data;
if (smp) {
chan->data = NULL;
- crypto_free_cipher(smp->tfm_aes);
crypto_free_shash(smp->tfm_cmac);
crypto_free_kpp(smp->tfm_ecdh);
- kzfree(smp);
+ kfree_sensitive(smp);
}
l2cap_chan_put(chan);
}
-static ssize_t force_bredr_smp_read(struct file *file,
- char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct hci_dev *hdev = file->private_data;
- char buf[3];
-
- buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP) ? 'Y': 'N';
- buf[1] = '\n';
- buf[2] = '\0';
- return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
-}
-
-static ssize_t force_bredr_smp_write(struct file *file,
- const char __user *user_buf,
- size_t count, loff_t *ppos)
+int smp_force_bredr(struct hci_dev *hdev, bool enable)
{
- struct hci_dev *hdev = file->private_data;
- bool enable;
- int err;
-
- err = kstrtobool_from_user(user_buf, count, &enable);
- if (err)
- return err;
-
if (enable == hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
return -EALREADY;
@@ -3374,109 +3398,14 @@ static ssize_t force_bredr_smp_write(struct file *file,
hci_dev_change_flag(hdev, HCI_FORCE_BREDR_SMP);
- return count;
-}
-
-static const struct file_operations force_bredr_smp_fops = {
- .open = simple_open,
- .read = force_bredr_smp_read,
- .write = force_bredr_smp_write,
- .llseek = default_llseek,
-};
-
-static ssize_t le_min_key_size_read(struct file *file,
- char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct hci_dev *hdev = file->private_data;
- char buf[4];
-
- snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->min_key_size);
-
- return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
-}
-
-static ssize_t le_min_key_size_write(struct file *file,
- const char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct hci_dev *hdev = file->private_data;
- char buf[32];
- size_t buf_size = min(count, (sizeof(buf) - 1));
- u8 key_size;
-
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
-
- sscanf(buf, "%hhu", &key_size);
-
- if (key_size > SMP_DEV(hdev)->max_key_size ||
- key_size < SMP_MIN_ENC_KEY_SIZE)
- return -EINVAL;
-
- SMP_DEV(hdev)->min_key_size = key_size;
-
- return count;
-}
-
-static const struct file_operations le_min_key_size_fops = {
- .open = simple_open,
- .read = le_min_key_size_read,
- .write = le_min_key_size_write,
- .llseek = default_llseek,
-};
-
-static ssize_t le_max_key_size_read(struct file *file,
- char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct hci_dev *hdev = file->private_data;
- char buf[4];
-
- snprintf(buf, sizeof(buf), "%2u\n", SMP_DEV(hdev)->max_key_size);
-
- return simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf));
-}
-
-static ssize_t le_max_key_size_write(struct file *file,
- const char __user *user_buf,
- size_t count, loff_t *ppos)
-{
- struct hci_dev *hdev = file->private_data;
- char buf[32];
- size_t buf_size = min(count, (sizeof(buf) - 1));
- u8 key_size;
-
- if (copy_from_user(buf, user_buf, buf_size))
- return -EFAULT;
-
- buf[buf_size] = '\0';
-
- sscanf(buf, "%hhu", &key_size);
-
- if (key_size > SMP_MAX_ENC_KEY_SIZE ||
- key_size < SMP_DEV(hdev)->min_key_size)
- return -EINVAL;
-
- SMP_DEV(hdev)->max_key_size = key_size;
-
- return count;
+ return 0;
}
-static const struct file_operations le_max_key_size_fops = {
- .open = simple_open,
- .read = le_max_key_size_read,
- .write = le_max_key_size_write,
- .llseek = default_llseek,
-};
-
int smp_register(struct hci_dev *hdev)
{
struct l2cap_chan *chan;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "");
/* If the controller does not support Low Energy operation, then
* there is also no need to register any SMP channel.
@@ -3496,22 +3425,7 @@ int smp_register(struct hci_dev *hdev)
hdev->smp_data = chan;
- debugfs_create_file("le_min_key_size", 0644, hdev->debugfs, hdev,
- &le_min_key_size_fops);
- debugfs_create_file("le_max_key_size", 0644, hdev->debugfs, hdev,
- &le_max_key_size_fops);
-
- /* If the controller does not support BR/EDR Secure Connections
- * feature, then the BR/EDR SMP channel shall not be present.
- *
- * To test this with Bluetooth 4.0 controllers, create a debugfs
- * switch that allows forcing BR/EDR SMP support and accepting
- * cross-transport pairing on non-AES encrypted connections.
- */
if (!lmp_sc_capable(hdev)) {
- debugfs_create_file("force_bredr_smp", 0644, hdev->debugfs,
- hdev, &force_bredr_smp_fops);
-
/* Flag can be already set here (due to power toggle) */
if (!hci_dev_test_flag(hdev, HCI_FORCE_BREDR_SMP))
return 0;
@@ -3575,7 +3489,7 @@ static int __init test_debug_key(struct crypto_kpp *tfm_ecdh)
return 0;
}
-static int __init test_ah(struct crypto_cipher *tfm_aes)
+static int __init test_ah(void)
{
const u8 irk[16] = {
0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34,
@@ -3585,7 +3499,7 @@ static int __init test_ah(struct crypto_cipher *tfm_aes)
u8 res[3];
int err;
- err = smp_ah(tfm_aes, irk, r, res);
+ err = smp_ah(irk, r, res);
if (err)
return err;
@@ -3595,7 +3509,7 @@ static int __init test_ah(struct crypto_cipher *tfm_aes)
return 0;
}
-static int __init test_c1(struct crypto_cipher *tfm_aes)
+static int __init test_c1(void)
{
const u8 k[16] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -3615,7 +3529,7 @@ static int __init test_c1(struct crypto_cipher *tfm_aes)
u8 res[16];
int err;
- err = smp_c1(tfm_aes, k, r, preq, pres, _iat, &ia, _rat, &ra, res);
+ err = smp_c1(k, r, preq, pres, _iat, &ia, _rat, &ra, res);
if (err)
return err;
@@ -3625,7 +3539,7 @@ static int __init test_c1(struct crypto_cipher *tfm_aes)
return 0;
}
-static int __init test_s1(struct crypto_cipher *tfm_aes)
+static int __init test_s1(void)
{
const u8 k[16] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -3640,7 +3554,7 @@ static int __init test_s1(struct crypto_cipher *tfm_aes)
u8 res[16];
int err;
- err = smp_s1(tfm_aes, k, r1, r2, res);
+ err = smp_s1(k, r1, r2, res);
if (err)
return err;
@@ -3821,8 +3735,7 @@ static const struct file_operations test_smp_fops = {
.llseek = default_llseek,
};
-static int __init run_selftests(struct crypto_cipher *tfm_aes,
- struct crypto_shash *tfm_cmac,
+static int __init run_selftests(struct crypto_shash *tfm_cmac,
struct crypto_kpp *tfm_ecdh)
{
ktime_t calltime, delta, rettime;
@@ -3837,19 +3750,19 @@ static int __init run_selftests(struct crypto_cipher *tfm_aes,
goto done;
}
- err = test_ah(tfm_aes);
+ err = test_ah();
if (err) {
BT_ERR("smp_ah test failed");
goto done;
}
- err = test_c1(tfm_aes);
+ err = test_c1();
if (err) {
BT_ERR("smp_c1 test failed");
goto done;
}
- err = test_s1(tfm_aes);
+ err = test_s1();
if (err) {
BT_ERR("smp_s1 test failed");
goto done;
@@ -3906,36 +3819,26 @@ done:
int __init bt_selftest_smp(void)
{
- struct crypto_cipher *tfm_aes;
struct crypto_shash *tfm_cmac;
struct crypto_kpp *tfm_ecdh;
int err;
- tfm_aes = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm_aes)) {
- BT_ERR("Unable to create AES crypto context");
- return PTR_ERR(tfm_aes);
- }
-
- tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, CRYPTO_ALG_ASYNC);
+ tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
if (IS_ERR(tfm_cmac)) {
BT_ERR("Unable to create CMAC crypto context");
- crypto_free_cipher(tfm_aes);
return PTR_ERR(tfm_cmac);
}
- tfm_ecdh = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0);
+ tfm_ecdh = crypto_alloc_kpp("ecdh-nist-p256", 0, 0);
if (IS_ERR(tfm_ecdh)) {
BT_ERR("Unable to create ECDH crypto context");
crypto_free_shash(tfm_cmac);
- crypto_free_cipher(tfm_aes);
return PTR_ERR(tfm_ecdh);
}
- err = run_selftests(tfm_aes, tfm_cmac, tfm_ecdh);
+ err = run_selftests(tfm_cmac, tfm_ecdh);
crypto_free_shash(tfm_cmac);
- crypto_free_cipher(tfm_aes);
crypto_free_kpp(tfm_ecdh);
return err;
diff --git a/net/bluetooth/smp.h b/net/bluetooth/smp.h
index 121edadd5f8d..c5da53dfab04 100644
--- a/net/bluetooth/smp.h
+++ b/net/bluetooth/smp.h
@@ -79,8 +79,8 @@ struct smp_cmd_encrypt_info {
__u8 ltk[16];
} __packed;
-#define SMP_CMD_MASTER_IDENT 0x07
-struct smp_cmd_master_ident {
+#define SMP_CMD_INITIATOR_IDENT 0x07
+struct smp_cmd_initiator_ident {
__le16 ediv;
__le64 rand;
} __packed;
@@ -138,6 +138,7 @@ struct smp_cmd_keypress_notify {
#define SMP_NUMERIC_COMP_FAILED 0x0c
#define SMP_BREDR_PAIRING_IN_PROGRESS 0x0d
#define SMP_CROSS_TRANSP_NOT_ALLOWED 0x0e
+#define SMP_KEY_REJECTED 0x0f
#define SMP_MIN_ENC_KEY_SIZE 7
#define SMP_MAX_ENC_KEY_SIZE 16
@@ -146,7 +147,7 @@ struct smp_cmd_keypress_notify {
enum {
SMP_STK,
SMP_LTK,
- SMP_LTK_SLAVE,
+ SMP_LTK_RESPONDER,
SMP_LTK_P256,
SMP_LTK_P256_DEBUG,
};
@@ -193,6 +194,8 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16],
int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa);
int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]);
+int smp_force_bredr(struct hci_dev *hdev, bool enable);
+
int smp_register(struct hci_dev *hdev);
void smp_unregister(struct hci_dev *hdev);
diff --git a/net/bpf/Makefile b/net/bpf/Makefile
index 27b2992a0692..1ebe270bde23 100644
--- a/net/bpf/Makefile
+++ b/net/bpf/Makefile
@@ -1 +1,5 @@
-obj-y := test_run.o
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_BPF_SYSCALL) := test_run.o
+ifeq ($(CONFIG_BPF_JIT),y)
+obj-$(CONFIG_BPF_SYSCALL) += bpf_dummy_struct_ops.o
+endif
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
new file mode 100644
index 000000000000..812457819b5a
--- /dev/null
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2021. Huawei Technologies Co., Ltd
+ */
+#include <linux/kernel.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+
+static struct bpf_struct_ops bpf_bpf_dummy_ops;
+
+/* A common type for test_N with return value in bpf_dummy_ops */
+typedef int (*dummy_ops_test_ret_fn)(struct bpf_dummy_ops_state *state, ...);
+
+static int dummy_ops_test_ret_function(struct bpf_dummy_ops_state *state, ...)
+{
+ return 0;
+}
+
+struct bpf_dummy_ops_test_args {
+ u64 args[MAX_BPF_FUNC_ARGS];
+ struct bpf_dummy_ops_state state;
+};
+
+static struct btf *bpf_dummy_ops_btf;
+
+static struct bpf_dummy_ops_test_args *
+dummy_ops_init_args(const union bpf_attr *kattr, unsigned int nr)
+{
+ __u32 size_in;
+ struct bpf_dummy_ops_test_args *args;
+ void __user *ctx_in;
+ void __user *u_state;
+
+ size_in = kattr->test.ctx_size_in;
+ if (size_in != sizeof(u64) * nr)
+ return ERR_PTR(-EINVAL);
+
+ args = kzalloc(sizeof(*args), GFP_KERNEL);
+ if (!args)
+ return ERR_PTR(-ENOMEM);
+
+ ctx_in = u64_to_user_ptr(kattr->test.ctx_in);
+ if (copy_from_user(args->args, ctx_in, size_in))
+ goto out;
+
+ /* args[0] being 0 means the state argument of test_N will be NULL */
+ u_state = u64_to_user_ptr(args->args[0]);
+ if (u_state && copy_from_user(&args->state, u_state,
+ sizeof(args->state)))
+ goto out;
+
+ return args;
+out:
+ kfree(args);
+ return ERR_PTR(-EFAULT);
+}
+
+static int dummy_ops_copy_args(struct bpf_dummy_ops_test_args *args)
+{
+ void __user *u_state;
+
+ u_state = u64_to_user_ptr(args->args[0]);
+ if (u_state && copy_to_user(u_state, &args->state, sizeof(args->state)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int dummy_ops_call_op(void *image, struct bpf_dummy_ops_test_args *args)
+{
+ dummy_ops_test_ret_fn test = (void *)image + cfi_get_offset();
+ struct bpf_dummy_ops_state *state = NULL;
+
+ /* state needs to be NULL if args[0] is 0 */
+ if (args->args[0])
+ state = &args->state;
+ return test(state, args->args[1], args->args[2],
+ args->args[3], args->args[4]);
+}
+
+static const struct bpf_ctx_arg_aux *find_ctx_arg_info(struct bpf_prog_aux *aux, int offset)
+{
+ int i;
+
+ for (i = 0; i < aux->ctx_arg_info_size; i++)
+ if (aux->ctx_arg_info[i].offset == offset)
+ return &aux->ctx_arg_info[i];
+
+ return NULL;
+}
+
+/* There is only one check at the moment:
+ * - zero should not be passed for pointer parameters not marked as nullable.
+ */
+static int check_test_run_args(struct bpf_prog *prog, struct bpf_dummy_ops_test_args *args)
+{
+ const struct btf_type *func_proto = prog->aux->attach_func_proto;
+
+ for (u32 arg_no = 0; arg_no < btf_type_vlen(func_proto) ; ++arg_no) {
+ const struct btf_param *param = &btf_params(func_proto)[arg_no];
+ const struct bpf_ctx_arg_aux *info;
+ const struct btf_type *t;
+ int offset;
+
+ if (args->args[arg_no] != 0)
+ continue;
+
+ /* Program is validated already, so there is no need
+ * to check if t is NULL.
+ */
+ t = btf_type_skip_modifiers(bpf_dummy_ops_btf, param->type, NULL);
+ if (!btf_type_is_ptr(t))
+ continue;
+
+ offset = btf_ctx_arg_offset(bpf_dummy_ops_btf, func_proto, arg_no);
+ info = find_ctx_arg_info(prog->aux, offset);
+ if (info && type_may_be_null(info->reg_type))
+ continue;
+
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+extern const struct bpf_link_ops bpf_struct_ops_link_lops;
+
+int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ const struct bpf_struct_ops *st_ops = &bpf_bpf_dummy_ops;
+ const struct btf_type *func_proto;
+ struct bpf_dummy_ops_test_args *args;
+ struct bpf_tramp_links *tlinks = NULL;
+ struct bpf_tramp_link *link = NULL;
+ void *image = NULL;
+ unsigned int op_idx;
+ u32 image_off = 0;
+ int prog_ret;
+ s32 type_id;
+ int err;
+
+ type_id = btf_find_by_name_kind(bpf_dummy_ops_btf,
+ bpf_bpf_dummy_ops.name,
+ BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return -EINVAL;
+ if (prog->aux->attach_btf_id != type_id)
+ return -EOPNOTSUPP;
+
+ func_proto = prog->aux->attach_func_proto;
+ args = dummy_ops_init_args(kattr, btf_type_vlen(func_proto));
+ if (IS_ERR(args))
+ return PTR_ERR(args);
+
+ err = check_test_run_args(prog, args);
+ if (err)
+ goto out;
+
+ tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
+ if (!tlinks) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto out;
+ }
+ /* prog doesn't take the ownership of the reference from caller */
+ bpf_prog_inc(prog);
+ bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog,
+ prog->expected_attach_type);
+
+ op_idx = prog->expected_attach_type;
+ err = bpf_struct_ops_prepare_trampoline(tlinks, link,
+ &st_ops->func_models[op_idx],
+ &dummy_ops_test_ret_function,
+ &image, &image_off,
+ true);
+ if (err < 0)
+ goto out;
+
+ err = arch_protect_bpf_trampoline(image, PAGE_SIZE);
+ if (err)
+ goto out;
+ prog_ret = dummy_ops_call_op(image, args);
+
+ err = dummy_ops_copy_args(args);
+ if (err)
+ goto out;
+ if (put_user(prog_ret, &uattr->test.retval))
+ err = -EFAULT;
+out:
+ kfree(args);
+ bpf_struct_ops_image_free(image);
+ if (link)
+ bpf_link_put(&link->link);
+ kfree(tlinks);
+ return err;
+}
+
+static int bpf_dummy_init(struct btf *btf)
+{
+ bpf_dummy_ops_btf = btf;
+ return 0;
+}
+
+static bool bpf_dummy_ops_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+}
+
+static int bpf_dummy_ops_check_member(const struct btf_type *t,
+ const struct btf_member *member,
+ const struct bpf_prog *prog)
+{
+ u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+ switch (moff) {
+ case offsetof(struct bpf_dummy_ops, test_sleepable):
+ break;
+ default:
+ if (prog->sleepable)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int bpf_dummy_ops_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ const struct btf_type *state;
+ const struct btf_type *t;
+ s32 type_id;
+
+ type_id = btf_find_by_name_kind(reg->btf, "bpf_dummy_ops_state",
+ BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return -EINVAL;
+
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ state = btf_type_by_id(reg->btf, type_id);
+ if (t != state) {
+ bpf_log(log, "only access to bpf_dummy_ops_state is supported\n");
+ return -EACCES;
+ }
+
+ if (off + size > sizeof(struct bpf_dummy_ops_state)) {
+ bpf_log(log, "write access at off %d with size %d\n", off, size);
+ return -EACCES;
+ }
+
+ return NOT_INIT;
+}
+
+static const struct bpf_verifier_ops bpf_dummy_verifier_ops = {
+ .is_valid_access = bpf_dummy_ops_is_valid_access,
+ .btf_struct_access = bpf_dummy_ops_btf_struct_access,
+};
+
+static int bpf_dummy_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ return -EOPNOTSUPP;
+}
+
+static int bpf_dummy_reg(void *kdata, struct bpf_link *link)
+{
+ return -EOPNOTSUPP;
+}
+
+static void bpf_dummy_unreg(void *kdata, struct bpf_link *link)
+{
+}
+
+static int bpf_dummy_ops__test_1(struct bpf_dummy_ops_state *cb__nullable)
+{
+ return 0;
+}
+
+static int bpf_dummy_test_2(struct bpf_dummy_ops_state *cb, int a1, unsigned short a2,
+ char a3, unsigned long a4)
+{
+ return 0;
+}
+
+static int bpf_dummy_test_sleepable(struct bpf_dummy_ops_state *cb)
+{
+ return 0;
+}
+
+static struct bpf_dummy_ops __bpf_bpf_dummy_ops = {
+ .test_1 = bpf_dummy_ops__test_1,
+ .test_2 = bpf_dummy_test_2,
+ .test_sleepable = bpf_dummy_test_sleepable,
+};
+
+static struct bpf_struct_ops bpf_bpf_dummy_ops = {
+ .verifier_ops = &bpf_dummy_verifier_ops,
+ .init = bpf_dummy_init,
+ .check_member = bpf_dummy_ops_check_member,
+ .init_member = bpf_dummy_init_member,
+ .reg = bpf_dummy_reg,
+ .unreg = bpf_dummy_unreg,
+ .name = "bpf_dummy_ops",
+ .cfi_stubs = &__bpf_bpf_dummy_ops,
+ .owner = THIS_MODULE,
+};
+
+static int __init bpf_dummy_struct_ops_init(void)
+{
+ return register_bpf_struct_ops(&bpf_bpf_dummy_ops, bpf_dummy_ops);
+}
+late_initcall(bpf_dummy_struct_ops_init);
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index f4078830ea50..655efac6f133 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -1,199 +1,1834 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2017 Facebook
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
*/
#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
#include <linux/slab.h>
+#include <linux/init.h>
#include <linux/vmalloc.h>
#include <linux/etherdevice.h>
#include <linux/filter.h>
+#include <linux/rcupdate_trace.h>
#include <linux/sched/signal.h>
+#include <net/bpf_sk_storage.h>
+#include <net/hotdata.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/net_namespace.h>
+#include <net/page_pool/helpers.h>
+#include <linux/error-injection.h>
+#include <linux/smp.h>
+#include <linux/sock_diag.h>
+#include <linux/netfilter.h>
+#include <net/netdev_rx_queue.h>
+#include <net/xdp.h>
+#include <net/netfilter/nf_bpf_link.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/bpf_test_run.h>
+
+struct bpf_test_timer {
+ u32 i;
+ u64 time_start, time_spent;
+};
-static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx,
- struct bpf_cgroup_storage *storage)
+static void bpf_test_timer_enter(struct bpf_test_timer *t)
+ __acquires(rcu)
{
- u32 ret;
+ rcu_read_lock_dont_migrate();
+ t->time_start = ktime_get_ns();
+}
- preempt_disable();
- rcu_read_lock();
- bpf_cgroup_storage_set(storage);
- ret = BPF_PROG_RUN(prog, ctx);
- rcu_read_unlock();
- preempt_enable();
+static void bpf_test_timer_leave(struct bpf_test_timer *t)
+ __releases(rcu)
+{
+ t->time_start = 0;
+ rcu_read_unlock_migrate();
+}
- return ret;
+static bool bpf_test_timer_continue(struct bpf_test_timer *t, int iterations,
+ u32 repeat, int *err, u32 *duration)
+ __must_hold(rcu)
+{
+ t->i += iterations;
+ if (t->i >= repeat) {
+ /* We're done. */
+ t->time_spent += ktime_get_ns() - t->time_start;
+ do_div(t->time_spent, t->i);
+ *duration = t->time_spent > U32_MAX ? U32_MAX : (u32)t->time_spent;
+ *err = 0;
+ goto reset;
+ }
+
+ if (signal_pending(current)) {
+ /* During iteration: we've been cancelled, abort. */
+ *err = -EINTR;
+ goto reset;
+ }
+
+ if (need_resched()) {
+ /* During iteration: we need to reschedule between runs. */
+ t->time_spent += ktime_get_ns() - t->time_start;
+ bpf_test_timer_leave(t);
+ cond_resched();
+ bpf_test_timer_enter(t);
+ }
+
+ /* Do another round. */
+ return true;
+
+reset:
+ t->i = 0;
+ return false;
+}
+
+/* We put this struct at the head of each page with a context and frame
+ * initialised when the page is allocated, so we don't have to do this on each
+ * repetition of the test run.
+ */
+struct xdp_page_head {
+ struct xdp_buff orig_ctx;
+ struct xdp_buff ctx;
+ union {
+ /* ::data_hard_start starts here */
+ DECLARE_FLEX_ARRAY(struct xdp_frame, frame);
+ DECLARE_FLEX_ARRAY(u8, data);
+ };
+};
+
+struct xdp_test_data {
+ struct xdp_buff *orig_ctx;
+ struct xdp_rxq_info rxq;
+ struct net_device *dev;
+ struct page_pool *pp;
+ struct xdp_frame **frames;
+ struct sk_buff **skbs;
+ struct xdp_mem_info mem;
+ u32 batch_size;
+ u32 frame_cnt;
+};
+
+/* tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c:%MAX_PKT_SIZE
+ * must be updated accordingly when this gets changed, otherwise BPF selftests
+ * will fail.
+ */
+#define TEST_XDP_FRAME_SIZE (PAGE_SIZE - sizeof(struct xdp_page_head))
+#define TEST_XDP_MAX_BATCH 256
+
+static void xdp_test_run_init_page(netmem_ref netmem, void *arg)
+{
+ struct xdp_page_head *head =
+ phys_to_virt(page_to_phys(netmem_to_page(netmem)));
+ struct xdp_buff *new_ctx, *orig_ctx;
+ u32 headroom = XDP_PACKET_HEADROOM;
+ struct xdp_test_data *xdp = arg;
+ size_t frm_len, meta_len;
+ struct xdp_frame *frm;
+ void *data;
+
+ orig_ctx = xdp->orig_ctx;
+ frm_len = orig_ctx->data_end - orig_ctx->data_meta;
+ meta_len = orig_ctx->data - orig_ctx->data_meta;
+ headroom -= meta_len;
+
+ new_ctx = &head->ctx;
+ frm = head->frame;
+ data = head->data;
+ memcpy(data + headroom, orig_ctx->data_meta, frm_len);
+
+ xdp_init_buff(new_ctx, TEST_XDP_FRAME_SIZE, &xdp->rxq);
+ xdp_prepare_buff(new_ctx, data, headroom, frm_len, true);
+ new_ctx->data = new_ctx->data_meta + meta_len;
+
+ xdp_update_frame_from_buff(new_ctx, frm);
+ frm->mem_type = new_ctx->rxq->mem.type;
+
+ memcpy(&head->orig_ctx, new_ctx, sizeof(head->orig_ctx));
+}
+
+static int xdp_test_run_setup(struct xdp_test_data *xdp, struct xdp_buff *orig_ctx)
+{
+ struct page_pool *pp;
+ int err = -ENOMEM;
+ struct page_pool_params pp_params = {
+ .order = 0,
+ .flags = 0,
+ .pool_size = xdp->batch_size,
+ .nid = NUMA_NO_NODE,
+ .init_callback = xdp_test_run_init_page,
+ .init_arg = xdp,
+ };
+
+ xdp->frames = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL);
+ if (!xdp->frames)
+ return -ENOMEM;
+
+ xdp->skbs = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL);
+ if (!xdp->skbs)
+ goto err_skbs;
+
+ pp = page_pool_create(&pp_params);
+ if (IS_ERR(pp)) {
+ err = PTR_ERR(pp);
+ goto err_pp;
+ }
+
+ /* will copy 'mem.id' into pp->xdp_mem_id */
+ err = xdp_reg_mem_model(&xdp->mem, MEM_TYPE_PAGE_POOL, pp);
+ if (err)
+ goto err_mmodel;
+
+ xdp->pp = pp;
+
+ /* We create a 'fake' RXQ referencing the original dev, but with an
+ * xdp_mem_info pointing to our page_pool
+ */
+ xdp_rxq_info_reg(&xdp->rxq, orig_ctx->rxq->dev, 0, 0);
+ xdp->rxq.mem.type = MEM_TYPE_PAGE_POOL;
+ xdp->rxq.mem.id = pp->xdp_mem_id;
+ xdp->dev = orig_ctx->rxq->dev;
+ xdp->orig_ctx = orig_ctx;
+
+ return 0;
+
+err_mmodel:
+ page_pool_destroy(pp);
+err_pp:
+ kvfree(xdp->skbs);
+err_skbs:
+ kvfree(xdp->frames);
+ return err;
+}
+
+static void xdp_test_run_teardown(struct xdp_test_data *xdp)
+{
+ xdp_unreg_mem_model(&xdp->mem);
+ page_pool_destroy(xdp->pp);
+ kvfree(xdp->frames); /* from kvmalloc_array() in xdp_test_run_setup() */
+ kvfree(xdp->skbs); /* likewise kvmalloc_array()'d, so pair with kvfree() */
+}
+
+static bool frame_was_changed(const struct xdp_page_head *head)
+{
+ /* xdp_scrub_frame() zeroes the data pointer, flags is the last field,
+ * i.e. has the highest chances to be overwritten. If those two are
+ * untouched, it's most likely safe to skip the context reset.
+ */
+ return head->frame->data != head->orig_ctx.data ||
+ head->frame->flags != head->orig_ctx.flags;
+}
+
+static bool ctx_was_changed(struct xdp_page_head *head)
+{
+ return head->orig_ctx.data != head->ctx.data ||
+ head->orig_ctx.data_meta != head->ctx.data_meta ||
+ head->orig_ctx.data_end != head->ctx.data_end;
}
-static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time)
+static void reset_ctx(struct xdp_page_head *head)
{
- struct bpf_cgroup_storage *storage = NULL;
- u64 time_start, time_spent = 0;
- u32 ret = 0, i;
+ if (likely(!frame_was_changed(head) && !ctx_was_changed(head)))
+ return;
+
+ head->ctx.data = head->orig_ctx.data;
+ head->ctx.data_meta = head->orig_ctx.data_meta;
+ head->ctx.data_end = head->orig_ctx.data_end;
+ xdp_update_frame_from_buff(&head->ctx, head->frame);
+ head->frame->mem_type = head->orig_ctx.rxq->mem.type;
+}
+
+static int xdp_recv_frames(struct xdp_frame **frames, int nframes,
+ struct sk_buff **skbs,
+ struct net_device *dev)
+{
+ gfp_t gfp = __GFP_ZERO | GFP_ATOMIC;
+ int i, n;
+ LIST_HEAD(list);
+
+ n = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, nframes,
+ (void **)skbs);
+ if (unlikely(n == 0)) {
+ for (i = 0; i < nframes; i++)
+ xdp_return_frame(frames[i]);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < nframes; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ struct sk_buff *skb = skbs[i];
+
+ skb = __xdp_build_skb_from_frame(xdpf, skb, dev);
+ if (!skb) {
+ xdp_return_frame(xdpf);
+ continue;
+ }
+
+ list_add_tail(&skb->list, &list);
+ }
+ netif_receive_skb_list(&list);
+
+ return 0;
+}
+
+static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
+ u32 repeat)
+{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ int err = 0, act, ret, i, nframes = 0, batch_sz;
+ struct xdp_frame **frames = xdp->frames;
+ struct bpf_redirect_info *ri;
+ struct xdp_page_head *head;
+ struct xdp_frame *frm;
+ bool redirect = false;
+ struct xdp_buff *ctx;
+ struct page *page;
+
+ batch_sz = min_t(u32, repeat, xdp->batch_size);
+
+ local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+ ri = bpf_net_ctx_get_ri();
+ xdp_set_return_frame_no_direct();
- storage = bpf_cgroup_storage_alloc(prog);
- if (IS_ERR(storage))
- return PTR_ERR(storage);
+ for (i = 0; i < batch_sz; i++) {
+ page = page_pool_dev_alloc_pages(xdp->pp);
+ if (!page) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ head = phys_to_virt(page_to_phys(page));
+ reset_ctx(head);
+ ctx = &head->ctx;
+ frm = head->frame;
+ xdp->frame_cnt++;
+
+ act = bpf_prog_run_xdp(prog, ctx);
+
+ /* if program changed pkt bounds we need to update the xdp_frame */
+ if (unlikely(ctx_was_changed(head))) {
+ ret = xdp_update_frame_from_buff(ctx, frm);
+ if (ret) {
+ xdp_return_buff(ctx);
+ continue;
+ }
+ }
+
+ switch (act) {
+ case XDP_TX:
+ /* we can't do a real XDP_TX since we're not in the
+ * driver, so turn it into a REDIRECT back to the same
+ * index
+ */
+ ri->tgt_index = xdp->dev->ifindex;
+ ri->map_id = INT_MAX;
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
+ fallthrough;
+ case XDP_REDIRECT:
+ redirect = true;
+ ret = xdp_do_redirect_frame(xdp->dev, ctx, frm, prog);
+ if (ret)
+ xdp_return_buff(ctx);
+ break;
+ case XDP_PASS:
+ frames[nframes++] = frm;
+ break;
+ default:
+ bpf_warn_invalid_xdp_action(NULL, prog, act);
+ fallthrough;
+ case XDP_DROP:
+ xdp_return_buff(ctx);
+ break;
+ }
+ }
+
+out:
+ if (redirect)
+ xdp_do_flush();
+ if (nframes) {
+ ret = xdp_recv_frames(frames, nframes, xdp->skbs, xdp->dev);
+ if (ret)
+ err = ret;
+ }
+
+ xdp_clear_return_frame_no_direct();
+ bpf_net_ctx_clear(bpf_net_ctx);
+ local_bh_enable();
+ return err;
+}
+
+static int bpf_test_run_xdp_live(struct bpf_prog *prog, struct xdp_buff *ctx,
+ u32 repeat, u32 batch_size, u32 *time)
+
+{
+ struct xdp_test_data xdp = { .batch_size = batch_size };
+ struct bpf_test_timer t = {};
+ int ret;
if (!repeat)
repeat = 1;
- time_start = ktime_get_ns();
- for (i = 0; i < repeat; i++) {
- ret = bpf_test_run_one(prog, ctx, storage);
- if (need_resched()) {
- if (signal_pending(current))
- break;
- time_spent += ktime_get_ns() - time_start;
- cond_resched();
- time_start = ktime_get_ns();
+
+ ret = xdp_test_run_setup(&xdp, ctx);
+ if (ret)
+ return ret;
+
+ bpf_test_timer_enter(&t);
+ do {
+ xdp.frame_cnt = 0;
+ ret = xdp_test_run_batch(&xdp, prog, repeat - t.i);
+ if (unlikely(ret < 0))
+ break;
+ } while (bpf_test_timer_continue(&t, xdp.frame_cnt, repeat, &ret, time));
+ bpf_test_timer_leave(&t);
+
+ xdp_test_run_teardown(&xdp);
+ return ret;
+}
+
+static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
+ u32 *retval, u32 *time, bool xdp)
+{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ struct bpf_prog_array_item item = {.prog = prog};
+ struct bpf_run_ctx *old_ctx;
+ struct bpf_cg_run_ctx run_ctx;
+ struct bpf_test_timer t = {};
+ enum bpf_cgroup_storage_type stype;
+ int ret;
+
+ for_each_cgroup_storage_type(stype) {
+ item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
+ if (IS_ERR(item.cgroup_storage[stype])) {
+ item.cgroup_storage[stype] = NULL;
+ for_each_cgroup_storage_type(stype)
+ bpf_cgroup_storage_free(item.cgroup_storage[stype]);
+ return -ENOMEM;
}
}
- time_spent += ktime_get_ns() - time_start;
- do_div(time_spent, repeat);
- *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent;
- bpf_cgroup_storage_free(storage);
+ if (!repeat)
+ repeat = 1;
+
+ bpf_test_timer_enter(&t);
+ old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ do {
+ run_ctx.prog_item = &item;
+ local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
+ if (xdp)
+ *retval = bpf_prog_run_xdp(prog, ctx);
+ else
+ *retval = bpf_prog_run(prog, ctx);
+
+ bpf_net_ctx_clear(bpf_net_ctx);
+ local_bh_enable();
+ } while (bpf_test_timer_continue(&t, 1, repeat, &ret, time));
+ bpf_reset_run_ctx(old_ctx);
+ bpf_test_timer_leave(&t);
+
+ for_each_cgroup_storage_type(stype)
+ bpf_cgroup_storage_free(item.cgroup_storage[stype]);
return ret;
}
static int bpf_test_finish(const union bpf_attr *kattr,
union bpf_attr __user *uattr, const void *data,
- u32 size, u32 retval, u32 duration)
+ struct skb_shared_info *sinfo, u32 size, u32 frag_size,
+ u32 retval, u32 duration)
{
void __user *data_out = u64_to_user_ptr(kattr->test.data_out);
int err = -EFAULT;
+ u32 copy_size = size;
+
+ /* Clamp copy if the user has provided a size hint, but copy the full
+ * buffer if not to retain old behaviour.
+ */
+ if (kattr->test.data_size_out &&
+ copy_size > kattr->test.data_size_out) {
+ copy_size = kattr->test.data_size_out;
+ err = -ENOSPC;
+ }
+
+ if (data_out) {
+ int len = sinfo ? copy_size - frag_size : copy_size;
+
+ if (len < 0) {
+ err = -ENOSPC;
+ goto out;
+ }
+
+ if (copy_to_user(data_out, data, len))
+ goto out;
+
+ if (sinfo) {
+ int i, offset = len;
+ u32 data_len;
+
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ skb_frag_t *frag = &sinfo->frags[i];
+
+ if (offset >= copy_size) {
+ err = -ENOSPC;
+ break;
+ }
+
+ data_len = min_t(u32, copy_size - offset,
+ skb_frag_size(frag));
+
+ if (copy_to_user(data_out + offset,
+ skb_frag_address(frag),
+ data_len))
+ goto out;
+
+ offset += data_len;
+ }
+ }
+ }
- if (data_out && copy_to_user(data_out, data, size))
- goto out;
if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size)))
goto out;
if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval)))
goto out;
if (copy_to_user(&uattr->test.duration, &duration, sizeof(duration)))
goto out;
- err = 0;
+ if (err != -ENOSPC)
+ err = 0;
out:
+ trace_bpf_test_finish(&err);
return err;
}
-static void *bpf_test_init(const union bpf_attr *kattr, u32 size,
- u32 headroom, u32 tailroom)
+/* Integer types of various sizes and pointer combinations cover a variety of
+ * architecture dependent calling conventions. 7+ can be supported in the
+ * future.
+ */
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_fentry_test1(int a)
+{
+ return a + 1;
+}
+EXPORT_SYMBOL_GPL(bpf_fentry_test1);
+
+noinline int bpf_fentry_test2(int a, u64 b)
+{
+ return a + b;
+}
+
+noinline int bpf_fentry_test3(char a, int b, u64 c)
+{
+ return a + b + c;
+}
+
+noinline int bpf_fentry_test4(void *a, char b, int c, u64 d)
+{
+ return (long)a + b + c + d;
+}
+
+noinline int bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e)
+{
+ return a + (long)b + c + d + e;
+}
+
+noinline int bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f)
+{
+ return a + (long)b + c + d + (long)e + f;
+}
+
+struct bpf_fentry_test_t {
+ struct bpf_fentry_test_t *a;
+};
+
+noinline int bpf_fentry_test7(struct bpf_fentry_test_t *arg)
+{
+ asm volatile ("" : "+r"(arg));
+ return (long)arg;
+}
+
+noinline int bpf_fentry_test8(struct bpf_fentry_test_t *arg)
+{
+ return (long)arg->a;
+}
+
+__bpf_kfunc u32 bpf_fentry_test9(u32 *a)
+{
+ return *a;
+}
+
+noinline int bpf_fentry_test10(const void *a)
+{
+ return (long)a;
+}
+
+noinline void bpf_fentry_test_sinfo(struct skb_shared_info *sinfo)
+{
+}
+
+__bpf_kfunc int bpf_modify_return_test(int a, int *b)
+{
+ *b += 1;
+ return a + *b;
+}
+
+__bpf_kfunc int bpf_modify_return_test2(int a, int *b, short c, int d,
+ void *e, char f, int g)
+{
+ *b += 1;
+ return a + *b + c + d + (long)e + f + g;
+}
+
+__bpf_kfunc int bpf_modify_return_test_tp(int nonce)
+{
+ trace_bpf_trigger_tp(nonce);
+
+ return nonce;
+}
+
+noinline int bpf_fentry_shadow_test(int a)
+{
+ return a + 1;
+}
+
+struct prog_test_member1 {
+ int a;
+};
+
+struct prog_test_member {
+ struct prog_test_member1 m;
+ int c;
+};
+
+struct prog_test_ref_kfunc {
+ int a;
+ int b;
+ struct prog_test_member memb;
+ struct prog_test_ref_kfunc *next;
+ refcount_t cnt;
+};
+
+__bpf_kfunc void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p)
+{
+ refcount_dec(&p->cnt);
+}
+
+__bpf_kfunc void bpf_kfunc_call_test_release_dtor(void *p)
+{
+ bpf_kfunc_call_test_release(p);
+}
+CFI_NOSEAL(bpf_kfunc_call_test_release_dtor);
+
+__bpf_kfunc void bpf_kfunc_call_memb_release(struct prog_test_member *p)
+{
+}
+
+__bpf_kfunc void bpf_kfunc_call_memb_release_dtor(void *p)
+{
+}
+CFI_NOSEAL(bpf_kfunc_call_memb_release_dtor);
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_test_modify_return_ids)
+BTF_ID_FLAGS(func, bpf_modify_return_test)
+BTF_ID_FLAGS(func, bpf_modify_return_test2)
+BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
+BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE)
+BTF_KFUNCS_END(bpf_test_modify_return_ids)
+
+static const struct btf_kfunc_id_set bpf_test_modify_return_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_test_modify_return_ids,
+};
+
+BTF_KFUNCS_START(test_sk_check_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_kfunc_call_memb_release, KF_RELEASE)
+BTF_KFUNCS_END(test_sk_check_kfunc_ids)
+
+static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size,
+ u32 size, u32 headroom, u32 tailroom)
{
void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
void *data;
- if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom)
+ if (user_size > PAGE_SIZE - headroom - tailroom)
return ERR_PTR(-EINVAL);
+ size = SKB_DATA_ALIGN(size);
data = kzalloc(size + headroom + tailroom, GFP_USER);
if (!data)
return ERR_PTR(-ENOMEM);
- if (copy_from_user(data + headroom, data_in, size)) {
+ if (copy_from_user(data + headroom, data_in, user_size)) {
kfree(data);
return ERR_PTR(-EFAULT);
}
+
return data;
}
+int bpf_prog_test_run_tracing(struct bpf_prog *prog,
+ const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_fentry_test_t arg = {};
+ u16 side_effect = 0, ret = 0;
+ int b = 2, err = -EFAULT;
+ u32 retval = 0;
+
+ if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size)
+ return -EINVAL;
+
+ switch (prog->expected_attach_type) {
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ if (bpf_fentry_test1(1) != 2 ||
+ bpf_fentry_test2(2, 3) != 5 ||
+ bpf_fentry_test3(4, 5, 6) != 15 ||
+ bpf_fentry_test4((void *)7, 8, 9, 10) != 34 ||
+ bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 ||
+ bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111 ||
+ bpf_fentry_test7((struct bpf_fentry_test_t *)0) != 0 ||
+ bpf_fentry_test8(&arg) != 0 ||
+ bpf_fentry_test9(&retval) != 0 ||
+ bpf_fentry_test10((void *)0) != 0)
+ goto out;
+ break;
+ case BPF_MODIFY_RETURN:
+ ret = bpf_modify_return_test(1, &b);
+ if (b != 2)
+ side_effect++;
+ b = 2;
+ ret += bpf_modify_return_test2(1, &b, 3, 4, (void *)5, 6, 7);
+ if (b != 2)
+ side_effect++;
+ break;
+ default:
+ goto out;
+ }
+
+ retval = ((u32)side_effect << 16) | ret;
+ if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval)))
+ goto out;
+
+ err = 0;
+out:
+ trace_bpf_test_finish(&err);
+ return err;
+}
+
+struct bpf_raw_tp_test_run_info {
+ struct bpf_prog *prog;
+ void *ctx;
+ u32 retval;
+};
+
+static void
+__bpf_prog_test_run_raw_tp(void *data)
+{
+ struct bpf_raw_tp_test_run_info *info = data;
+ struct bpf_trace_run_ctx run_ctx = {};
+ struct bpf_run_ctx *old_run_ctx;
+
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+
+ rcu_read_lock();
+ info->retval = bpf_prog_run(info->prog, info->ctx);
+ rcu_read_unlock();
+
+ bpf_reset_run_ctx(old_run_ctx);
+}
+
+int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
+ const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in);
+ __u32 ctx_size_in = kattr->test.ctx_size_in;
+ struct bpf_raw_tp_test_run_info info;
+ int cpu = kattr->test.cpu, err = 0;
+ int current_cpu;
+
+ /* doesn't support data_in/out, ctx_out, duration, repeat, or batch_size */
+ if (kattr->test.data_in || kattr->test.data_out ||
+ kattr->test.ctx_out || kattr->test.duration ||
+ kattr->test.repeat || kattr->test.batch_size)
+ return -EINVAL;
+
+ if (ctx_size_in < prog->aux->max_ctx_offset ||
+ ctx_size_in > MAX_BPF_FUNC_ARGS * sizeof(u64))
+ return -EINVAL;
+
+ if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0)
+ return -EINVAL;
+
+ if (ctx_size_in) {
+ info.ctx = memdup_user(ctx_in, ctx_size_in);
+ if (IS_ERR(info.ctx))
+ return PTR_ERR(info.ctx);
+ } else {
+ info.ctx = NULL;
+ }
+
+ info.prog = prog;
+
+ current_cpu = get_cpu();
+ if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 ||
+ cpu == current_cpu) {
+ __bpf_prog_test_run_raw_tp(&info);
+ } else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
+ /* smp_call_function_single() also checks cpu_online()
+ * after csd_lock(). However, since cpu is from user
+ * space, let's do an extra quick check to filter out
+ * invalid value before smp_call_function_single().
+ */
+ err = -ENXIO;
+ } else {
+ err = smp_call_function_single(cpu, __bpf_prog_test_run_raw_tp,
+ &info, 1);
+ }
+ put_cpu();
+
+ if (!err &&
+ copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32)))
+ err = -EFAULT;
+
+ kfree(info.ctx);
+ return err;
+}
+
+static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size)
+{
+ void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in);
+ void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out);
+ u32 size = kattr->test.ctx_size_in;
+ void *data;
+ int err;
+
+ if (!data_in && !data_out)
+ return NULL;
+
+ data = kzalloc(max_size, GFP_USER);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ if (data_in) {
+ err = bpf_check_uarg_tail_zero(USER_BPFPTR(data_in), max_size, size);
+ if (err) {
+ kfree(data);
+ return ERR_PTR(err);
+ }
+
+ size = min_t(u32, max_size, size);
+ if (copy_from_user(data, data_in, size)) {
+ kfree(data);
+ return ERR_PTR(-EFAULT);
+ }
+ }
+ return data;
+}
+
+static int bpf_ctx_finish(const union bpf_attr *kattr,
+ union bpf_attr __user *uattr, const void *data,
+ u32 size)
+{
+ void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out);
+ int err = -EFAULT;
+ u32 copy_size = size;
+
+ if (!data || !data_out)
+ return 0;
+
+ if (copy_size > kattr->test.ctx_size_out) {
+ copy_size = kattr->test.ctx_size_out;
+ err = -ENOSPC;
+ }
+
+ if (copy_to_user(data_out, data, copy_size))
+ goto out;
+ if (copy_to_user(&uattr->test.ctx_size_out, &size, sizeof(size)))
+ goto out;
+ if (err != -ENOSPC)
+ err = 0;
+out:
+ return err;
+}
+
+/**
+ * range_is_zero - test whether a range of a buffer is all zeroes
+ * @buf: buffer to check
+ * @from: check from this position
+ * @to: check up until (excluding) this position
+ *
+ * This function returns true if there is no non-zero byte
+ * in the buf in the range [from,to).
+ */
+static inline bool range_is_zero(void *buf, size_t from, size_t to)
+{
+ return !memchr_inv((u8 *)buf + from, 0, to - from);
+}
+
+static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb)
+{
+ struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb;
+
+ if (!__skb)
+ return 0;
+
+ /* make sure the fields we don't use are zeroed */
+ if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, mark)))
+ return -EINVAL;
+
+ /* mark is allowed */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, mark),
+ offsetof(struct __sk_buff, priority)))
+ return -EINVAL;
+
+ /* priority is allowed */
+ /* ingress_ifindex is allowed */
+ /* ifindex is allowed */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, ifindex),
+ offsetof(struct __sk_buff, cb)))
+ return -EINVAL;
+
+ /* cb is allowed */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb),
+ offsetof(struct __sk_buff, data_end)))
+ return -EINVAL;
+
+ /* data_end is allowed, but not copied to skb */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, data_end),
+ offsetof(struct __sk_buff, tstamp)))
+ return -EINVAL;
+
+ /* tstamp is allowed */
+ /* wire_len is allowed */
+ /* gso_segs is allowed */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs),
+ offsetof(struct __sk_buff, gso_size)))
+ return -EINVAL;
+
+ /* gso_size is allowed */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_size),
+ offsetof(struct __sk_buff, hwtstamp)))
+ return -EINVAL;
+
+ /* hwtstamp is allowed */
+
+ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, hwtstamp),
+ sizeof(struct __sk_buff)))
+ return -EINVAL;
+
+ skb->mark = __skb->mark;
+ skb->priority = __skb->priority;
+ skb->skb_iif = __skb->ingress_ifindex;
+ skb->tstamp = __skb->tstamp;
+ memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN);
+
+ if (__skb->wire_len == 0) {
+ cb->pkt_len = skb->len;
+ } else {
+ if (__skb->wire_len < skb->len ||
+ __skb->wire_len > GSO_LEGACY_MAX_SIZE)
+ return -EINVAL;
+ cb->pkt_len = __skb->wire_len;
+ }
+
+ if (__skb->gso_segs > GSO_MAX_SEGS)
+ return -EINVAL;
+
+ /* Currently GSO type is zero/unset. If this gets extended with
+ * a small list of accepted GSO types in future, the filter for
+ * an unset GSO type in bpf_clone_redirect() can be lifted.
+ */
+ skb_shinfo(skb)->gso_segs = __skb->gso_segs;
+ skb_shinfo(skb)->gso_size = __skb->gso_size;
+ skb_shinfo(skb)->hwtstamps.hwtstamp = __skb->hwtstamp;
+
+ return 0;
+}
+
+static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb)
+{
+ struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb;
+
+ if (!__skb)
+ return;
+
+ __skb->mark = skb->mark;
+ __skb->priority = skb->priority;
+ __skb->ingress_ifindex = skb->skb_iif;
+ __skb->ifindex = skb->dev->ifindex;
+ __skb->tstamp = skb->tstamp;
+ memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN);
+ __skb->wire_len = cb->pkt_len;
+ __skb->gso_segs = skb_shinfo(skb)->gso_segs;
+ __skb->hwtstamp = skb_shinfo(skb)->hwtstamps.hwtstamp;
+}
+
+static struct proto bpf_dummy_proto = {
+ .name = "bpf_dummy",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct sock),
+};
+
int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
union bpf_attr __user *uattr)
{
- bool is_l2 = false, is_direct_pkt_access = false;
- u32 size = kattr->test.data_size_in;
+ bool is_l2 = false, is_direct_pkt_access = false, is_lwt = false;
+ u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ struct net *net = current->nsproxy->net_ns;
+ struct net_device *dev = net->loopback_dev;
+ u32 headroom = NET_SKB_PAD + NET_IP_ALIGN;
+ u32 linear_sz = kattr->test.data_size_in;
u32 repeat = kattr->test.repeat;
+ struct __sk_buff *ctx = NULL;
+ struct sk_buff *skb = NULL;
+ struct sock *sk = NULL;
u32 retval, duration;
int hh_len = ETH_HLEN;
- struct sk_buff *skb;
- void *data;
+ void *data = NULL;
int ret;
- data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN,
- SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
- if (IS_ERR(data))
- return PTR_ERR(data);
+ if ((kattr->test.flags & ~BPF_F_TEST_SKB_CHECKSUM_COMPLETE) ||
+ kattr->test.cpu || kattr->test.batch_size)
+ return -EINVAL;
+
+ if (kattr->test.data_size_in < ETH_HLEN)
+ return -EINVAL;
switch (prog->type) {
case BPF_PROG_TYPE_SCHED_CLS:
case BPF_PROG_TYPE_SCHED_ACT:
+ is_direct_pkt_access = true;
is_l2 = true;
- /* fall through */
+ break;
case BPF_PROG_TYPE_LWT_IN:
case BPF_PROG_TYPE_LWT_OUT:
case BPF_PROG_TYPE_LWT_XMIT:
+ is_lwt = true;
+ fallthrough;
+ case BPF_PROG_TYPE_CGROUP_SKB:
is_direct_pkt_access = true;
break;
default:
break;
}
- skb = build_skb(data, 0);
+ ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff));
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ if (ctx) {
+ if (ctx->data_end > kattr->test.data_size_in || ctx->data || ctx->data_meta) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (ctx->data_end) {
+ /* Non-linear LWT test_run is unsupported for now. */
+ if (is_lwt) {
+ ret = -EINVAL;
+ goto out;
+ }
+ linear_sz = max(ETH_HLEN, ctx->data_end);
+ }
+ }
+
+ linear_sz = min_t(u32, linear_sz, PAGE_SIZE - headroom - tailroom);
+
+ data = bpf_test_init(kattr, linear_sz, linear_sz, headroom, tailroom);
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
+ data = NULL;
+ goto out;
+ }
+
+ sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1);
+ if (!sk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ sock_init_data(NULL, sk);
+
+ skb = slab_build_skb(data);
if (!skb) {
- kfree(data);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
+ skb->sk = sk;
+
+ data = NULL; /* data released via kfree_skb */
skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
- __skb_put(skb, size);
- skb->protocol = eth_type_trans(skb, current->nsproxy->net_ns->loopback_dev);
+ __skb_put(skb, linear_sz);
+
+ if (unlikely(kattr->test.data_size_in > linear_sz)) {
+ void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+ u32 copied = linear_sz;
+
+ while (copied < kattr->test.data_size_in) {
+ struct page *page;
+ u32 data_len;
+
+ if (sinfo->nr_frags == MAX_SKB_FRAGS) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ data_len = min_t(u32, kattr->test.data_size_in - copied,
+ PAGE_SIZE);
+ skb_fill_page_desc(skb, sinfo->nr_frags, page, 0, data_len);
+
+ if (copy_from_user(page_address(page), data_in + copied,
+ data_len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ skb->data_len += data_len;
+ skb->truesize += PAGE_SIZE;
+ skb->len += data_len;
+ copied += data_len;
+ }
+ }
+
+ if (ctx && ctx->ifindex > 1) {
+ dev = dev_get_by_index(net, ctx->ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ }
+ skb->protocol = eth_type_trans(skb, dev);
skb_reset_network_header(skb);
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ sk->sk_family = AF_INET;
+ if (sizeof(struct iphdr) <= skb_headlen(skb)) {
+ sk->sk_rcv_saddr = ip_hdr(skb)->saddr;
+ sk->sk_daddr = ip_hdr(skb)->daddr;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ sk->sk_family = AF_INET6;
+ if (sizeof(struct ipv6hdr) <= skb_headlen(skb)) {
+ sk->sk_v6_rcv_saddr = ipv6_hdr(skb)->saddr;
+ sk->sk_v6_daddr = ipv6_hdr(skb)->daddr;
+ }
+ break;
+#endif
+ default:
+ break;
+ }
+
if (is_l2)
__skb_push(skb, hh_len);
if (is_direct_pkt_access)
bpf_compute_data_pointers(skb);
- retval = bpf_test_run(prog, skb, repeat, &duration);
+
+ ret = convert___skb_to_skb(skb, ctx);
+ if (ret)
+ goto out;
+
+ if (kattr->test.flags & BPF_F_TEST_SKB_CHECKSUM_COMPLETE) {
+ const int off = skb_network_offset(skb);
+ int len = skb->len - off;
+
+ skb->csum = skb_checksum(skb, off, len, 0);
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ }
+
+ ret = bpf_test_run(prog, skb, repeat, &retval, &duration, false);
+ if (ret)
+ goto out;
if (!is_l2) {
if (skb_headroom(skb) < hh_len) {
int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
if (pskb_expand_head(skb, nhead, 0, GFP_USER)) {
- kfree_skb(skb);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
}
memset(__skb_push(skb, hh_len), 0, hh_len);
}
- size = skb->len;
- /* bpf program can never convert linear skb to non-linear */
- if (WARN_ON_ONCE(skb_is_nonlinear(skb)))
- size = skb_headlen(skb);
- ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration);
+ if (kattr->test.flags & BPF_F_TEST_SKB_CHECKSUM_COMPLETE) {
+ const int off = skb_network_offset(skb);
+ int len = skb->len - off;
+ __wsum csum;
+
+ csum = skb_checksum(skb, off, len, 0);
+
+ if (csum_fold(skb->csum) != csum_fold(csum)) {
+ ret = -EBADMSG;
+ goto out;
+ }
+ }
+
+ convert_skb_to___skb(skb, ctx);
+
+ if (skb_is_nonlinear(skb))
+ /* bpf program can never convert linear skb to non-linear */
+ WARN_ON_ONCE(linear_sz == kattr->test.data_size_in);
+ ret = bpf_test_finish(kattr, uattr, skb->data, skb_shinfo(skb), skb->len,
+ skb->data_len, retval, duration);
+ if (!ret)
+ ret = bpf_ctx_finish(kattr, uattr, ctx,
+ sizeof(struct __sk_buff));
+out:
+ if (dev && dev != net->loopback_dev)
+ dev_put(dev);
kfree_skb(skb);
+ kfree(data);
+ if (sk)
+ sk_free(sk);
+ kfree(ctx);
return ret;
}
+static int xdp_convert_md_to_buff(struct xdp_md *xdp_md, struct xdp_buff *xdp)
+{
+ unsigned int ingress_ifindex, rx_queue_index;
+ struct netdev_rx_queue *rxqueue;
+ struct net_device *device;
+
+ if (!xdp_md)
+ return 0;
+
+ if (xdp_md->egress_ifindex != 0)
+ return -EINVAL;
+
+ ingress_ifindex = xdp_md->ingress_ifindex;
+ rx_queue_index = xdp_md->rx_queue_index;
+
+ if (!ingress_ifindex && rx_queue_index)
+ return -EINVAL;
+
+ if (ingress_ifindex) {
+ device = dev_get_by_index(current->nsproxy->net_ns,
+ ingress_ifindex);
+ if (!device)
+ return -ENODEV;
+
+ if (rx_queue_index >= device->real_num_rx_queues)
+ goto free_dev;
+
+ rxqueue = __netif_get_rx_queue(device, rx_queue_index);
+
+ if (!xdp_rxq_info_is_reg(&rxqueue->xdp_rxq))
+ goto free_dev;
+
+ xdp->rxq = &rxqueue->xdp_rxq;
+ /* The device is now tracked in the xdp->rxq for later
+ * dev_put()
+ */
+ }
+
+ xdp->data = xdp->data_meta + xdp_md->data;
+ return 0;
+
+free_dev:
+ dev_put(device);
+ return -EINVAL;
+}
+
+static void xdp_convert_buff_to_md(struct xdp_buff *xdp, struct xdp_md *xdp_md)
+{
+ if (!xdp_md)
+ return;
+
+ xdp_md->data = xdp->data - xdp->data_meta;
+ xdp_md->data_end = xdp->data_end - xdp->data_meta;
+
+ if (xdp_md->ingress_ifindex)
+ dev_put(xdp->rxq->dev);
+}
+
int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr,
union bpf_attr __user *uattr)
{
- u32 size = kattr->test.data_size_in;
+ bool do_live = (kattr->test.flags & BPF_F_TEST_XDP_LIVE_FRAMES);
+ u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ u32 retval = 0, meta_sz = 0, duration, max_linear_sz, size;
+ u32 linear_sz = kattr->test.data_size_in;
+ u32 batch_size = kattr->test.batch_size;
+ u32 headroom = XDP_PACKET_HEADROOM;
u32 repeat = kattr->test.repeat;
struct netdev_rx_queue *rxqueue;
+ struct skb_shared_info *sinfo;
struct xdp_buff xdp = {};
+ int i, ret = -EINVAL;
+ struct xdp_md *ctx;
+ void *data;
+
+ if (prog->expected_attach_type == BPF_XDP_DEVMAP ||
+ prog->expected_attach_type == BPF_XDP_CPUMAP)
+ return -EINVAL;
+
+ if (kattr->test.flags & ~BPF_F_TEST_XDP_LIVE_FRAMES)
+ return -EINVAL;
+
+ if (bpf_prog_is_dev_bound(prog->aux))
+ return -EINVAL;
+
+ if (do_live) {
+ if (!batch_size)
+ batch_size = NAPI_POLL_WEIGHT;
+ else if (batch_size > TEST_XDP_MAX_BATCH)
+ return -E2BIG;
+
+ headroom += sizeof(struct xdp_page_head);
+ } else if (batch_size) {
+ return -EINVAL;
+ }
+
+ ctx = bpf_ctx_init(kattr, sizeof(struct xdp_md));
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ if (ctx) {
+ /* There can't be user provided data before the meta data */
+ if (ctx->data_meta || ctx->data_end > kattr->test.data_size_in ||
+ ctx->data > ctx->data_end ||
+ unlikely(xdp_metalen_invalid(ctx->data)) ||
+ (do_live && (kattr->test.data_out || kattr->test.ctx_out)))
+ goto free_ctx;
+ /* Meta data is allocated from the headroom */
+ headroom -= ctx->data;
+
+ meta_sz = ctx->data;
+ linear_sz = ctx->data_end;
+ }
+
+ max_linear_sz = PAGE_SIZE - headroom - tailroom;
+ linear_sz = min_t(u32, linear_sz, max_linear_sz);
+
+ /* disallow live data mode for jumbo frames */
+ if (do_live && kattr->test.data_size_in > linear_sz)
+ goto free_ctx;
+
+ if (kattr->test.data_size_in - meta_sz < ETH_HLEN)
+ goto free_ctx;
+
+ data = bpf_test_init(kattr, linear_sz, max_linear_sz, headroom, tailroom);
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
+ goto free_ctx;
+ }
+
+ rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0);
+ rxqueue->xdp_rxq.frag_size = PAGE_SIZE;
+ xdp_init_buff(&xdp, rxqueue->xdp_rxq.frag_size, &rxqueue->xdp_rxq);
+ xdp_prepare_buff(&xdp, data, headroom, linear_sz, true);
+ sinfo = xdp_get_shared_info_from_buff(&xdp);
+
+ ret = xdp_convert_md_to_buff(ctx, &xdp);
+ if (ret)
+ goto free_data;
+
+ size = linear_sz;
+ if (unlikely(kattr->test.data_size_in > size)) {
+ void __user *data_in = u64_to_user_ptr(kattr->test.data_in);
+
+ while (size < kattr->test.data_size_in) {
+ struct page *page;
+ skb_frag_t *frag;
+ u32 data_len;
+
+ if (sinfo->nr_frags == MAX_SKB_FRAGS) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ frag = &sinfo->frags[sinfo->nr_frags++];
+
+ data_len = min_t(u32, kattr->test.data_size_in - size,
+ PAGE_SIZE);
+ skb_frag_fill_page_desc(frag, page, 0, data_len);
+
+ if (copy_from_user(page_address(page), data_in + size,
+ data_len)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ sinfo->xdp_frags_size += data_len;
+ size += data_len;
+ }
+ xdp_buff_set_frags_flag(&xdp);
+ }
+
+ if (repeat > 1)
+ bpf_prog_change_xdp(NULL, prog);
+
+ if (do_live)
+ ret = bpf_test_run_xdp_live(prog, &xdp, repeat, batch_size, &duration);
+ else
+ ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true);
+ /* We convert the xdp_buff back to an xdp_md before checking the return
+ * code so the reference count of any held netdevice will be decremented
+ * even if the test run failed.
+ */
+ xdp_convert_buff_to_md(&xdp, ctx);
+ if (ret)
+ goto out;
+
+ size = xdp.data_end - xdp.data_meta + sinfo->xdp_frags_size;
+ ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size, sinfo->xdp_frags_size,
+ retval, duration);
+ if (!ret)
+ ret = bpf_ctx_finish(kattr, uattr, ctx,
+ sizeof(struct xdp_md));
+
+out:
+ if (repeat > 1)
+ bpf_prog_change_xdp(prog, NULL);
+free_data:
+ for (i = 0; i < sinfo->nr_frags; i++)
+ __free_page(skb_frag_page(&sinfo->frags[i]));
+ kfree(data);
+free_ctx:
+ kfree(ctx);
+ return ret;
+}
+
+static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx)
+{
+ /* make sure the fields we don't use are zeroed */
+ if (!range_is_zero(ctx, 0, offsetof(struct bpf_flow_keys, flags)))
+ return -EINVAL;
+
+ /* flags is allowed */
+
+ if (!range_is_zero(ctx, offsetofend(struct bpf_flow_keys, flags),
+ sizeof(struct bpf_flow_keys)))
+ return -EINVAL;
+
+ return 0;
+}
+
+int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog,
+ const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_test_timer t = {};
+ u32 size = kattr->test.data_size_in;
+ struct bpf_flow_dissector ctx = {};
+ u32 repeat = kattr->test.repeat;
+ struct bpf_flow_keys *user_ctx;
+ struct bpf_flow_keys flow_keys;
+ const struct ethhdr *eth;
+ unsigned int flags = 0;
u32 retval, duration;
void *data;
int ret;
- data = bpf_test_init(kattr, size, XDP_PACKET_HEADROOM + NET_IP_ALIGN, 0);
+ if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size)
+ return -EINVAL;
+
+ if (size < ETH_HLEN)
+ return -EINVAL;
+
+ data = bpf_test_init(kattr, kattr->test.data_size_in, size, 0, 0);
if (IS_ERR(data))
return PTR_ERR(data);
- xdp.data_hard_start = data;
- xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN;
- xdp.data_meta = xdp.data;
- xdp.data_end = xdp.data + size;
+ eth = (struct ethhdr *)data;
- rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0);
- xdp.rxq = &rxqueue->xdp_rxq;
+ if (!repeat)
+ repeat = 1;
+
+ user_ctx = bpf_ctx_init(kattr, sizeof(struct bpf_flow_keys));
+ if (IS_ERR(user_ctx)) {
+ kfree(data);
+ return PTR_ERR(user_ctx);
+ }
+ if (user_ctx) {
+ ret = verify_user_bpf_flow_keys(user_ctx);
+ if (ret)
+ goto out;
+ flags = user_ctx->flags;
+ }
- retval = bpf_test_run(prog, &xdp, repeat, &duration);
- if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN ||
- xdp.data_end != xdp.data + size)
- size = xdp.data_end - xdp.data;
- ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration);
+ ctx.flow_keys = &flow_keys;
+ ctx.data = data;
+ ctx.data_end = (__u8 *)data + size;
+
+ bpf_test_timer_enter(&t);
+ do {
+ retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN,
+ size, flags);
+ } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration));
+ bpf_test_timer_leave(&t);
+
+ if (ret < 0)
+ goto out;
+
+ ret = bpf_test_finish(kattr, uattr, &flow_keys, NULL,
+ sizeof(flow_keys), 0, retval, duration);
+ if (!ret)
+ ret = bpf_ctx_finish(kattr, uattr, user_ctx,
+ sizeof(struct bpf_flow_keys));
+
+out:
+ kfree(user_ctx);
kfree(data);
return ret;
}
+
+int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_test_timer t = {};
+ struct bpf_prog_array *progs = NULL;
+ struct bpf_sk_lookup_kern ctx = {};
+ u32 repeat = kattr->test.repeat;
+ struct bpf_sk_lookup *user_ctx;
+ u32 retval, duration;
+ int ret = -EINVAL;
+
+ if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size)
+ return -EINVAL;
+
+ if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out ||
+ kattr->test.data_size_out)
+ return -EINVAL;
+
+ if (!repeat)
+ repeat = 1;
+
+ user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx));
+ if (IS_ERR(user_ctx))
+ return PTR_ERR(user_ctx);
+
+ if (!user_ctx)
+ return -EINVAL;
+
+ if (user_ctx->sk)
+ goto out;
+
+ if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx)))
+ goto out;
+
+ if (user_ctx->local_port > U16_MAX) {
+ ret = -ERANGE;
+ goto out;
+ }
+
+ ctx.family = (u16)user_ctx->family;
+ ctx.protocol = (u16)user_ctx->protocol;
+ ctx.dport = (u16)user_ctx->local_port;
+ ctx.sport = user_ctx->remote_port;
+
+ switch (ctx.family) {
+ case AF_INET:
+ ctx.v4.daddr = (__force __be32)user_ctx->local_ip4;
+ ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4;
+ break;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6;
+ ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6;
+ break;
+#endif
+
+ default:
+ ret = -EAFNOSUPPORT;
+ goto out;
+ }
+
+ progs = bpf_prog_array_alloc(1, GFP_KERNEL);
+ if (!progs) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ progs->items[0].prog = prog;
+
+ bpf_test_timer_enter(&t);
+ do {
+ ctx.selected_sk = NULL;
+ retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run);
+ } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration));
+ bpf_test_timer_leave(&t);
+
+ if (ret < 0)
+ goto out;
+
+ user_ctx->cookie = 0;
+ if (ctx.selected_sk) {
+ if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ user_ctx->cookie = sock_gen_cookie(ctx.selected_sk);
+ }
+
+ ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, 0, retval, duration);
+ if (!ret)
+ ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx));
+
+out:
+ bpf_prog_array_free(progs);
+ kfree(user_ctx);
+ return ret;
+}
+
+int bpf_prog_test_run_syscall(struct bpf_prog *prog,
+ const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in);
+ __u32 ctx_size_in = kattr->test.ctx_size_in;
+ void *ctx = NULL;
+ u32 retval;
+ int err = 0;
+
+ /* doesn't support data_in/out, ctx_out, duration, repeat, flags, or batch_size */
+ if (kattr->test.data_in || kattr->test.data_out ||
+ kattr->test.ctx_out || kattr->test.duration ||
+ kattr->test.repeat || kattr->test.flags ||
+ kattr->test.batch_size)
+ return -EINVAL;
+
+ if (ctx_size_in < prog->aux->max_ctx_offset ||
+ ctx_size_in > U16_MAX)
+ return -EINVAL;
+
+ if (ctx_size_in) {
+ ctx = memdup_user(ctx_in, ctx_size_in);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+ }
+
+ rcu_read_lock_trace();
+ retval = bpf_prog_run_pin_on_cpu(prog, ctx);
+ rcu_read_unlock_trace();
+
+ if (copy_to_user(&uattr->test.retval, &retval, sizeof(u32))) {
+ err = -EFAULT;
+ goto out;
+ }
+ if (ctx_size_in)
+ if (copy_to_user(ctx_in, ctx, ctx_size_in))
+ err = -EFAULT;
+out:
+ kfree(ctx);
+ return err;
+}
+
+static int verify_and_copy_hook_state(struct nf_hook_state *state,
+ const struct nf_hook_state *user,
+ struct net_device *dev)
+{
+ if (user->in || user->out)
+ return -EINVAL;
+
+ if (user->net || user->sk || user->okfn)
+ return -EINVAL;
+
+ switch (user->pf) {
+ case NFPROTO_IPV4:
+ case NFPROTO_IPV6:
+ switch (state->hook) {
+ case NF_INET_PRE_ROUTING:
+ state->in = dev;
+ break;
+ case NF_INET_LOCAL_IN:
+ state->in = dev;
+ break;
+ case NF_INET_FORWARD:
+ state->in = dev;
+ state->out = dev;
+ break;
+ case NF_INET_LOCAL_OUT:
+ state->out = dev;
+ break;
+ case NF_INET_POST_ROUTING:
+ state->out = dev;
+ break;
+ }
+
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ state->pf = user->pf;
+ state->hook = user->hook;
+
+ return 0;
+}
+
+static __be16 nfproto_eth(int nfproto)
+{
+ switch (nfproto) {
+ case NFPROTO_IPV4:
+ return htons(ETH_P_IP);
+ case NFPROTO_IPV6:
+ break;
+ }
+
+ return htons(ETH_P_IPV6);
+}
+
+int bpf_prog_test_run_nf(struct bpf_prog *prog,
+ const union bpf_attr *kattr,
+ union bpf_attr __user *uattr)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct net_device *dev = net->loopback_dev;
+ struct nf_hook_state *user_ctx, hook_state = {
+ .pf = NFPROTO_IPV4,
+ .hook = NF_INET_LOCAL_OUT,
+ };
+ u32 size = kattr->test.data_size_in;
+ u32 repeat = kattr->test.repeat;
+ struct bpf_nf_ctx ctx = {
+ .state = &hook_state,
+ };
+ struct sk_buff *skb = NULL;
+ u32 retval, duration;
+ void *data;
+ int ret;
+
+ if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size)
+ return -EINVAL;
+
+ if (size < sizeof(struct iphdr))
+ return -EINVAL;
+
+ data = bpf_test_init(kattr, kattr->test.data_size_in, size,
+ NET_SKB_PAD + NET_IP_ALIGN,
+ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ if (!repeat)
+ repeat = 1;
+
+ user_ctx = bpf_ctx_init(kattr, sizeof(struct nf_hook_state));
+ if (IS_ERR(user_ctx)) {
+ kfree(data);
+ return PTR_ERR(user_ctx);
+ }
+
+ if (user_ctx) {
+ ret = verify_and_copy_hook_state(&hook_state, user_ctx, dev);
+ if (ret)
+ goto out;
+ }
+
+ skb = slab_build_skb(data);
+ if (!skb) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ data = NULL; /* data released via kfree_skb */
+
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+ __skb_put(skb, size);
+
+ ret = -EINVAL;
+
+ if (hook_state.hook != NF_INET_LOCAL_OUT) {
+ if (size < ETH_HLEN + sizeof(struct iphdr))
+ goto out;
+
+ skb->protocol = eth_type_trans(skb, dev);
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ if (hook_state.pf == NFPROTO_IPV4)
+ break;
+ goto out;
+ case htons(ETH_P_IPV6):
+ if (size < ETH_HLEN + sizeof(struct ipv6hdr))
+ goto out;
+ if (hook_state.pf == NFPROTO_IPV6)
+ break;
+ goto out;
+ default:
+ ret = -EPROTO;
+ goto out;
+ }
+
+ skb_reset_network_header(skb);
+ } else {
+ skb->protocol = nfproto_eth(hook_state.pf);
+ }
+
+ ctx.skb = skb;
+
+ ret = bpf_test_run(prog, &ctx, repeat, &retval, &duration, false);
+ if (ret)
+ goto out;
+
+ ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, 0, retval, duration);
+
+out:
+ kfree(user_ctx);
+ kfree_skb(skb);
+ kfree(data);
+ return ret;
+}
+
+static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &test_sk_check_kfunc_ids,
+};
+
+BTF_ID_LIST(bpf_prog_test_dtor_kfunc_ids)
+BTF_ID(struct, prog_test_ref_kfunc)
+BTF_ID(func, bpf_kfunc_call_test_release_dtor)
+BTF_ID(struct, prog_test_member)
+BTF_ID(func, bpf_kfunc_call_memb_release_dtor)
+
+static int __init bpf_prog_test_run_init(void)
+{
+ const struct btf_id_dtor_kfunc bpf_prog_test_dtor_kfunc[] = {
+ {
+ .btf_id = bpf_prog_test_dtor_kfunc_ids[0],
+ .kfunc_btf_id = bpf_prog_test_dtor_kfunc_ids[1]
+ },
+ {
+ .btf_id = bpf_prog_test_dtor_kfunc_ids[2],
+ .kfunc_btf_id = bpf_prog_test_dtor_kfunc_ids[3],
+ },
+ };
+ int ret;
+
+ ret = register_btf_fmodret_id_set(&bpf_test_modify_return_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_prog_test_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_prog_test_kfunc_set);
+ return ret ?: register_btf_id_dtor_kfuncs(bpf_prog_test_dtor_kfunc,
+ ARRAY_SIZE(bpf_prog_test_dtor_kfunc),
+ THIS_MODULE);
+}
+late_initcall(bpf_prog_test_run_init);
diff --git a/net/bpfilter/.gitignore b/net/bpfilter/.gitignore
deleted file mode 100644
index e97084e3eea2..000000000000
--- a/net/bpfilter/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-bpfilter_umh
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
deleted file mode 100644
index e558b46596c4..000000000000
--- a/net/bpfilter/Kconfig
+++ /dev/null
@@ -1,15 +0,0 @@
-menuconfig BPFILTER
- bool "BPF based packet filtering framework (BPFILTER)"
- depends on NET && BPF && INET
- help
- This builds experimental bpfilter framework that is aiming to
- provide netfilter compatible functionality via BPF
-
-if BPFILTER
-config BPFILTER_UMH
- tristate "bpfilter kernel module with user mode helper"
- depends on $(success,$(srctree)/scripts/cc-can-link.sh $(CC))
- default m
- help
- This builds bpfilter kernel module with embedded user mode helper
-endif
diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile
deleted file mode 100644
index 0947ee7f70d5..000000000000
--- a/net/bpfilter/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for the Linux BPFILTER layer.
-#
-
-hostprogs-y := bpfilter_umh
-bpfilter_umh-objs := main.o
-KBUILD_HOSTCFLAGS += -I. -Itools/include/ -Itools/include/uapi
-HOSTCC := $(CC)
-
-ifeq ($(CONFIG_BPFILTER_UMH), y)
-# builtin bpfilter_umh should be compiled with -static
-# since rootfs isn't mounted at the time of __init
-# function is called and do_execv won't find elf interpreter
-KBUILD_HOSTLDFLAGS += -static
-endif
-
-$(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh
-
-obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
-bpfilter-objs += bpfilter_kern.o bpfilter_umh_blob.o
diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c
deleted file mode 100644
index b64e1649993b..000000000000
--- a/net/bpfilter/bpfilter_kern.c
+++ /dev/null
@@ -1,117 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/umh.h>
-#include <linux/bpfilter.h>
-#include <linux/sched.h>
-#include <linux/sched/signal.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include "msgfmt.h"
-
-extern char bpfilter_umh_start;
-extern char bpfilter_umh_end;
-
-static struct umh_info info;
-/* since ip_getsockopt() can run in parallel, serialize access to umh */
-static DEFINE_MUTEX(bpfilter_lock);
-
-static void shutdown_umh(struct umh_info *info)
-{
- struct task_struct *tsk;
-
- if (!info->pid)
- return;
- tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID);
- if (tsk)
- force_sig(SIGKILL, tsk);
- fput(info->pipe_to_umh);
- fput(info->pipe_from_umh);
- info->pid = 0;
-}
-
-static void __stop_umh(void)
-{
- if (IS_ENABLED(CONFIG_INET)) {
- bpfilter_process_sockopt = NULL;
- shutdown_umh(&info);
- }
-}
-
-static void stop_umh(void)
-{
- mutex_lock(&bpfilter_lock);
- __stop_umh();
- mutex_unlock(&bpfilter_lock);
-}
-
-static int __bpfilter_process_sockopt(struct sock *sk, int optname,
- char __user *optval,
- unsigned int optlen, bool is_set)
-{
- struct mbox_request req;
- struct mbox_reply reply;
- loff_t pos;
- ssize_t n;
- int ret = -EFAULT;
-
- req.is_set = is_set;
- req.pid = current->pid;
- req.cmd = optname;
- req.addr = (long __force __user)optval;
- req.len = optlen;
- mutex_lock(&bpfilter_lock);
- if (!info.pid)
- goto out;
- n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos);
- if (n != sizeof(req)) {
- pr_err("write fail %zd\n", n);
- __stop_umh();
- ret = -EFAULT;
- goto out;
- }
- pos = 0;
- n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos);
- if (n != sizeof(reply)) {
- pr_err("read fail %zd\n", n);
- __stop_umh();
- ret = -EFAULT;
- goto out;
- }
- ret = reply.status;
-out:
- mutex_unlock(&bpfilter_lock);
- return ret;
-}
-
-static int __init load_umh(void)
-{
- int err;
-
- /* fork usermode process */
- err = fork_usermode_blob(&bpfilter_umh_start,
- &bpfilter_umh_end - &bpfilter_umh_start,
- &info);
- if (err)
- return err;
- pr_info("Loaded bpfilter_umh pid %d\n", info.pid);
-
- /* health check that usermode process started correctly */
- if (__bpfilter_process_sockopt(NULL, 0, NULL, 0, 0) != 0) {
- stop_umh();
- return -EFAULT;
- }
- if (IS_ENABLED(CONFIG_INET))
- bpfilter_process_sockopt = &__bpfilter_process_sockopt;
-
- return 0;
-}
-
-static void __exit fini_umh(void)
-{
- stop_umh();
-}
-module_init(load_umh);
-module_exit(fini_umh);
-MODULE_LICENSE("GPL");
diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S
deleted file mode 100644
index 40311d10d2f2..000000000000
--- a/net/bpfilter/bpfilter_umh_blob.S
+++ /dev/null
@@ -1,7 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
- .section .init.rodata, "a"
- .global bpfilter_umh_start
-bpfilter_umh_start:
- .incbin "net/bpfilter/bpfilter_umh"
- .global bpfilter_umh_end
-bpfilter_umh_end:
diff --git a/net/bpfilter/main.c b/net/bpfilter/main.c
deleted file mode 100644
index 1317f108df8a..000000000000
--- a/net/bpfilter/main.c
+++ /dev/null
@@ -1,63 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#define _GNU_SOURCE
-#include <sys/uio.h>
-#include <errno.h>
-#include <stdio.h>
-#include <sys/socket.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include "include/uapi/linux/bpf.h"
-#include <asm/unistd.h>
-#include "msgfmt.h"
-
-int debug_fd;
-
-static int handle_get_cmd(struct mbox_request *cmd)
-{
- switch (cmd->cmd) {
- case 0:
- return 0;
- default:
- break;
- }
- return -ENOPROTOOPT;
-}
-
-static int handle_set_cmd(struct mbox_request *cmd)
-{
- return -ENOPROTOOPT;
-}
-
-static void loop(void)
-{
- while (1) {
- struct mbox_request req;
- struct mbox_reply reply;
- int n;
-
- n = read(0, &req, sizeof(req));
- if (n != sizeof(req)) {
- dprintf(debug_fd, "invalid request %d\n", n);
- return;
- }
-
- reply.status = req.is_set ?
- handle_set_cmd(&req) :
- handle_get_cmd(&req);
-
- n = write(1, &reply, sizeof(reply));
- if (n != sizeof(reply)) {
- dprintf(debug_fd, "reply failed %d\n", n);
- return;
- }
- }
-}
-
-int main(void)
-{
- debug_fd = open("/dev/console", 00000002);
- dprintf(debug_fd, "Started bpfilter\n");
- loop();
- close(debug_fd);
- return 0;
-}
diff --git a/net/bpfilter/msgfmt.h b/net/bpfilter/msgfmt.h
deleted file mode 100644
index 98d121c62945..000000000000
--- a/net/bpfilter/msgfmt.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _NET_BPFILTER_MSGFMT_H
-#define _NET_BPFILTER_MSGFMT_H
-
-struct mbox_request {
- __u64 addr;
- __u32 len;
- __u32 is_set;
- __u32 cmd;
- __u32 pid;
-};
-
-struct mbox_reply {
- __u32 status;
-};
-
-#endif
diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig
index aa0d3b2f1bb7..3c8ded7d3e84 100644
--- a/net/bridge/Kconfig
+++ b/net/bridge/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# 802.1d Ethernet Bridging
#
@@ -7,7 +8,7 @@ config BRIDGE
select LLC
select STP
depends on IPV6 || IPV6=n
- ---help---
+ help
If you say Y here, then your Linux box will be able to act as an
Ethernet bridge, which means that the different Ethernet segments it
is connected to will appear as one Ethernet to the participants.
@@ -17,7 +18,7 @@ config BRIDGE
other third party bridge products.
In order to use the Ethernet bridge, you'll need the bridge
- configuration tools; see <file:Documentation/networking/bridge.txt>
+ configuration tools; see <file:Documentation/networking/bridge.rst>
for location. Please read the Bridge mini-HOWTO for more
information.
@@ -38,7 +39,7 @@ config BRIDGE_IGMP_SNOOPING
depends on BRIDGE
depends on INET
default y
- ---help---
+ help
If you say Y here, then the Ethernet bridge will be able selectively
forward multicast traffic based on IGMP/MLD traffic received from
each port.
@@ -52,7 +53,7 @@ config BRIDGE_VLAN_FILTERING
depends on BRIDGE
depends on VLAN_8021Q
default n
- ---help---
+ help
If you say Y here, then the Ethernet bridge will be able selectively
receive and forward traffic based on VLAN information in the packet
any VLAN information configured on the bridge port or bridge device.
@@ -60,3 +61,26 @@ config BRIDGE_VLAN_FILTERING
Say N to exclude this support and reduce the binary size.
If unsure, say Y.
+
+config BRIDGE_MRP
+ bool "MRP protocol"
+ depends on BRIDGE
+ default n
+ help
+ If you say Y here, then the Ethernet bridge will be able to run the
+ MRP protocol to detect loops.
+
+ Say N to exclude this support and reduce the binary size.
+
+ If unsure, say N.
+
+config BRIDGE_CFM
+ bool "CFM protocol"
+ depends on BRIDGE
+ help
+ If you say Y here, then the Ethernet bridge will be able to run the
+ CFM protocol according to 802.1Q section 12.14.
+
+ Say N to exclude this support and reduce the binary size.
+
+ If unsure, say N.
diff --git a/net/bridge/Makefile b/net/bridge/Makefile
index ac9ef337f0fa..24bd1c0a9a5a 100644
--- a/net/bridge/Makefile
+++ b/net/bridge/Makefile
@@ -18,10 +18,14 @@ br_netfilter-y := br_netfilter_hooks.o
br_netfilter-$(subst m,y,$(CONFIG_IPV6)) += br_netfilter_ipv6.o
obj-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o
-bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o
+bridge-$(CONFIG_BRIDGE_IGMP_SNOOPING) += br_multicast.o br_mdb.o br_multicast_eht.o
-bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o
+bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o br_vlan_options.o br_mst.o
bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o
obj-$(CONFIG_NETFILTER) += netfilter/
+
+bridge-$(CONFIG_BRIDGE_MRP) += br_mrp_switchdev.o br_mrp.o br_mrp_netlink.o
+
+bridge-$(CONFIG_BRIDGE_CFM) += br_cfm.o br_cfm_netlink.o
diff --git a/net/bridge/br.c b/net/bridge/br.c
index b0a0b82e2d91..c37e52e2f29a 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Generic parts
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
@@ -31,6 +27,8 @@
*/
static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr)
{
+ struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
+ struct netdev_notifier_pre_changeaddr_info *prechaddr_info;
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net_bridge_port *p;
struct net_bridge *br;
@@ -38,10 +36,31 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
bool changed_addr;
int err;
- /* register of bridge completed, add sysfs entries */
- if ((dev->priv_flags & IFF_EBRIDGE) && event == NETDEV_REGISTER) {
- br_sysfs_addbr(dev);
- return NOTIFY_DONE;
+ if (netif_is_bridge_master(dev)) {
+ struct net_bridge *br = netdev_priv(dev);
+
+ if (event == NETDEV_REGISTER)
+ br_fdb_change_mac_address(br, dev->dev_addr);
+
+ err = br_vlan_bridge_event(dev, event, ptr);
+ if (err)
+ return notifier_from_errno(err);
+
+ if (event == NETDEV_REGISTER) {
+ /* register of bridge completed, add sysfs entries */
+ err = br_sysfs_addbr(dev);
+ if (err)
+ return notifier_from_errno(err);
+
+ return NOTIFY_DONE;
+ }
+ }
+
+ if (is_vlan_dev(dev)) {
+ struct net_device *real_dev = vlan_dev_real_dev(dev);
+
+ if (netif_is_bridge_master(real_dev))
+ br_vlan_vlan_upper_event(real_dev, dev, event);
}
/* not a port of a bridge */
@@ -56,6 +75,17 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
br_mtu_auto_adjust(br);
break;
+ case NETDEV_PRE_CHANGEADDR:
+ if (br->dev->addr_assign_type == NET_ADDR_SET)
+ break;
+ prechaddr_info = ptr;
+ err = netif_pre_changeaddr_notify(br->dev,
+ prechaddr_info->dev_addr,
+ extack);
+ if (err)
+ return notifier_from_errno(err);
+ break;
+
case NETDEV_CHANGEADDR:
spin_lock_bh(&br->lock);
br_fdb_changeaddr(p, dev->dev_addr);
@@ -104,7 +134,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
break;
case NETDEV_PRE_TYPE_CHANGE:
- /* Forbid underlaying device to change its type. */
+ /* Forbid underlying device to change its type. */
return NOTIFY_BAD;
case NETDEV_RESEND_IGMP:
@@ -113,6 +143,9 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
break;
}
+ if (event != NETDEV_UNREGISTER)
+ br_vlan_port_event(p, event);
+
/* Events that may cause spanning tree to refresh */
if (!notified && (event == NETDEV_CHANGEADDR || event == NETDEV_UP ||
event == NETDEV_CHANGE || event == NETDEV_DOWN))
@@ -145,13 +178,14 @@ static int br_switchdev_event(struct notifier_block *unused,
case SWITCHDEV_FDB_ADD_TO_BRIDGE:
fdb_info = ptr;
err = br_fdb_external_learn_add(br, p, fdb_info->addr,
- fdb_info->vid, false);
+ fdb_info->vid,
+ fdb_info->locked, false);
if (err) {
err = notifier_from_errno(err);
break;
}
br_fdb_offloaded_set(br, p, fdb_info->addr,
- fdb_info->vid);
+ fdb_info->vid, fdb_info->offloaded);
break;
case SWITCHDEV_FDB_DEL_TO_BRIDGE:
fdb_info = ptr;
@@ -163,7 +197,12 @@ static int br_switchdev_event(struct notifier_block *unused,
case SWITCHDEV_FDB_OFFLOADED:
fdb_info = ptr;
br_fdb_offloaded_set(br, p, fdb_info->addr,
- fdb_info->vid);
+ fdb_info->vid, fdb_info->offloaded);
+ break;
+ case SWITCHDEV_FDB_FLUSH_TO_BRIDGE:
+ fdb_info = ptr;
+ /* Don't delete static entries */
+ br_fdb_delete_by_port(br, p, fdb_info->vid, 0);
break;
}
@@ -175,23 +214,208 @@ static struct notifier_block br_switchdev_notifier = {
.notifier_call = br_switchdev_event,
};
-static void __net_exit br_net_exit(struct net *net)
+/* called under rtnl_mutex */
+static int br_switchdev_blocking_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
{
- struct net_device *dev;
- LIST_HEAD(list);
+ struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr);
+ struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+ struct switchdev_notifier_brport_info *brport_info;
+ const struct switchdev_brport *b;
+ struct net_bridge_port *p;
+ int err = NOTIFY_DONE;
- rtnl_lock();
- for_each_netdev(net, dev)
- if (dev->priv_flags & IFF_EBRIDGE)
- br_dev_delete(dev, &list);
+ p = br_port_get_rtnl(dev);
+ if (!p)
+ goto out;
+
+ switch (event) {
+ case SWITCHDEV_BRPORT_OFFLOADED:
+ brport_info = ptr;
+ b = &brport_info->brport;
+
+ err = br_switchdev_port_offload(p, b->dev, b->ctx,
+ b->atomic_nb, b->blocking_nb,
+ b->tx_fwd_offload, extack);
+ err = notifier_from_errno(err);
+ break;
+ case SWITCHDEV_BRPORT_UNOFFLOADED:
+ brport_info = ptr;
+ b = &brport_info->brport;
+
+ br_switchdev_port_unoffload(p, b->ctx, b->atomic_nb,
+ b->blocking_nb);
+ break;
+ case SWITCHDEV_BRPORT_REPLAY:
+ brport_info = ptr;
+ b = &brport_info->brport;
- unregister_netdevice_many(&list);
- rtnl_unlock();
+ err = br_switchdev_port_replay(p, b->dev, b->ctx, b->atomic_nb,
+ b->blocking_nb, extack);
+ err = notifier_from_errno(err);
+ break;
+ }
+out:
+ return err;
+}
+
+static struct notifier_block br_switchdev_blocking_notifier = {
+ .notifier_call = br_switchdev_blocking_event,
+};
+
+/* Bring BROPT_FDB_LOCAL_VLAN_0 in line with @on. Nothing is done when the
+ * option already has that value; the option bit is only flipped once
+ * br_fdb_toggle_local_vlan_0() has returned 0, otherwise its error code is
+ * passed back to the caller.
+ */
+static int
+br_toggle_fdb_local_vlan_0(struct net_bridge *br, bool on,
+			   struct netlink_ext_ack *extack)
+{
+	int ret = 0;
+
+	if (br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0) != on) {
+		ret = br_fdb_toggle_local_vlan_0(br, on, extack);
+		if (!ret)
+			br_opt_toggle(br, BROPT_FDB_LOCAL_VLAN_0, on);
+	}
+
+	return ret;
+}
+
+/* br_boolopt_toggle - change user-controlled boolean option
+ *
+ * @br: bridge device
+ * @opt: id of the option to change
+ * @on: new option value
+ * @extack: extack for error messages
+ *
+ * Changes the value of the respective boolean option to @on taking care of
+ * any internal option value mapping and configuration.
+ *
+ * Return: 0 for options that are applied directly through br_opt_toggle(),
+ * otherwise whatever the option-specific helper returned. An unsupported
+ * @opt triggers WARN_ON() and also yields 0.
+ */
+int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on,
+ struct netlink_ext_ack *extack)
+{
+ int err = 0;
+
+ switch (opt) {
+ case BR_BOOLOPT_NO_LL_LEARN:
+ br_opt_toggle(br, BROPT_NO_LL_LEARN, on);
+ break;
+ case BR_BOOLOPT_MCAST_VLAN_SNOOPING:
+ err = br_multicast_toggle_vlan_snooping(br, on, extack);
+ break;
+ case BR_BOOLOPT_MST_ENABLE:
+ err = br_mst_set_enabled(br, on, extack);
+ break;
+ case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION:
+ br_opt_toggle(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, on);
+ break;
+ case BR_BOOLOPT_FDB_LOCAL_VLAN_0:
+ err = br_toggle_fdb_local_vlan_0(br, on, extack);
+ break;
+ default:
+ /* shouldn't be called with unsupported options */
+ WARN_ON(1);
+ break;
+ }
+
+ return err;
+}
+
+/* Read the current value of a user-controlled boolean option. The option id
+ * is first translated to the bridge-internal option it is backed by, then
+ * that option is queried with br_opt_get(). An unsupported @opt triggers
+ * WARN_ON() and reads as 0.
+ */
+int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt)
+{
+	enum net_bridge_opts backing;
+
+	switch (opt) {
+	case BR_BOOLOPT_NO_LL_LEARN:
+		backing = BROPT_NO_LL_LEARN;
+		break;
+	case BR_BOOLOPT_MCAST_VLAN_SNOOPING:
+		backing = BROPT_MCAST_VLAN_SNOOPING_ENABLED;
+		break;
+	case BR_BOOLOPT_MST_ENABLE:
+		backing = BROPT_MST_ENABLED;
+		break;
+	case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION:
+		backing = BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION;
+		break;
+	case BR_BOOLOPT_FDB_LOCAL_VLAN_0:
+		backing = BROPT_FDB_LOCAL_VLAN_0;
+		break;
+	default:
+		/* shouldn't be called with unsupported options */
+		WARN_ON(1);
+		return 0;
+	}
+
+	return br_opt_get(br, backing);
+}
+
+/* Apply several boolean options in one go. Each bit set in @bm->optmask
+ * selects an option id and the matching bit of @bm->optval supplies its new
+ * value. A mask bit at or above BR_BOOLOPT_MAX is rejected with -EINVAL
+ * before anything is changed; otherwise options are toggled in ascending id
+ * order and the first failure stops the walk and is returned.
+ */
+int br_boolopt_multi_toggle(struct net_bridge *br,
+			    struct br_boolopt_multi *bm,
+			    struct netlink_ext_ack *extack)
+{
+	unsigned long optmask = bm->optmask;
+	int bit;
+	int ret;
+
+	/* look for any requested bit beyond the known option ids */
+	bit = find_next_bit(&optmask, BITS_PER_LONG, BR_BOOLOPT_MAX);
+	if (bit != BITS_PER_LONG) {
+		NL_SET_ERR_MSG_FMT_MOD(extack, "Unknown boolean option %d",
+				       bit);
+		return -EINVAL;
+	}
+
+	for_each_set_bit(bit, &optmask, BR_BOOLOPT_MAX) {
+		bool on = !!(bm->optval & BIT(bit));
+
+		ret = br_boolopt_toggle(br, bit, on, extack);
+		if (ret) {
+			br_debug(br, "boolopt multi-toggle error: option: %d current: %d new: %d error: %d\n",
+				 bit, br_boolopt_get(br, bit), on, ret);
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+/* Fill @bm with the state of all user-controlled boolean options: the
+ * br_boolopt_get() result for option N is shifted into bit N of
+ * @bm->optval, and @bm->optmask is set to bits 0..BR_BOOLOPT_MAX-1.
+ */
+void br_boolopt_multi_get(const struct net_bridge *br,
+			  struct br_boolopt_multi *bm)
+{
+	u32 vals = 0;
+	int id = 0;
+
+	while (id < BR_BOOLOPT_MAX) {
+		vals |= (br_boolopt_get(br, id) << id);
+		id++;
+	}
+
+	bm->optmask = GENMASK((BR_BOOLOPT_MAX - 1), 0);
+	bm->optval = vals;
+}
+
+/* private bridge options, controlled by the kernel
+ *
+ * Logs the requested transition and then sets or clears bit @opt in
+ * br->options, touching the bitmap only when the state actually changes.
+ */
+void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on)
+{
+	bool was_on = !!br_opt_get(br, opt);
+
+	br_debug(br, "toggle option: %d state: %d -> %d\n",
+		 opt, was_on, on);
+
+	if (was_on != on) {
+		if (on)
+			set_bit(opt, &br->options);
+		else
+			clear_bit(opt, &br->options);
+	}
+}
+
+static void __net_exit br_net_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
+{
+ struct net_device *dev;
+
+ ASSERT_RTNL_NET(net);
+
+ for_each_netdev(net, dev)
+ if (netif_is_bridge_master(dev))
+ br_dev_delete(dev, dev_to_kill);
}
static struct pernet_operations br_net_ops = {
- .exit = br_net_exit,
+ .exit_rtnl = br_net_exit_rtnl,
};
static const struct stp_proto br_stp_proto = {
@@ -202,7 +426,7 @@ static int __init br_init(void)
{
int err;
- BUILD_BUG_ON(sizeof(struct br_input_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));
+ BUILD_BUG_ON(sizeof(struct br_input_skb_cb) > sizeof_field(struct sk_buff, cb));
err = stp_proto_register(&br_stp_proto);
if (err < 0) {
@@ -230,11 +454,15 @@ static int __init br_init(void)
if (err)
goto err_out4;
- err = br_netlink_init();
+ err = register_switchdev_blocking_notifier(&br_switchdev_blocking_notifier);
if (err)
goto err_out5;
- brioctl_set(br_ioctl_deviceless_stub);
+ err = br_netlink_init();
+ if (err)
+ goto err_out6;
+
+ brioctl_set(br_ioctl_stub);
#if IS_ENABLED(CONFIG_ATM_LANE)
br_fdb_test_addr_hook = br_fdb_test_addr;
@@ -248,6 +476,8 @@ static int __init br_init(void)
return 0;
+err_out6:
+ unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier);
err_out5:
unregister_switchdev_notifier(&br_switchdev_notifier);
err_out4:
@@ -267,6 +497,7 @@ static void __exit br_deinit(void)
{
stp_proto_unregister(&br_stp_proto);
br_netlink_fini();
+ unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier);
unregister_switchdev_notifier(&br_switchdev_notifier);
unregister_netdevice_notifier(&br_device_notifier);
brioctl_set(NULL);
@@ -286,3 +517,5 @@ module_exit(br_deinit)
MODULE_LICENSE("GPL");
MODULE_VERSION(BR_VERSION);
MODULE_ALIAS_RTNL_LINK("bridge");
+MODULE_DESCRIPTION("Ethernet bridge driver");
+MODULE_IMPORT_NS("NETDEV_INTERNAL");
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
index 2cf7716254be..1e2b51769eec 100644
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Handle bridge arp/nd proxy/suppress
*
@@ -6,11 +7,6 @@
*
* Authors:
* Roopa Prabhu <roopa@cumulusnetworks.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -21,6 +17,7 @@
#include <linux/if_vlan.h>
#include <linux/inetdevice.h>
#include <net/addrconf.h>
+#include <net/ipv6_stubs.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_checksum.h>
#endif
@@ -33,13 +30,13 @@ void br_recalculate_neigh_suppress_enabled(struct net_bridge *br)
bool neigh_suppress = false;
list_for_each_entry(p, &br->port_list, list) {
- if (p->flags & BR_NEIGH_SUPPRESS) {
+ if (p->flags & (BR_NEIGH_SUPPRESS | BR_NEIGH_VLAN_SUPPRESS)) {
neigh_suppress = true;
break;
}
}
- br->neigh_suppress_enabled = neigh_suppress;
+ br_opt_toggle(br, BROPT_NEIGH_SUPPRESS_ENABLED, neigh_suppress);
}
#if IS_ENABLED(CONFIG_INET)
@@ -87,13 +84,14 @@ static void br_arp_send(struct net_bridge *br, struct net_bridge_port *p,
skb->ip_summed = CHECKSUM_UNNECESSARY;
skb->pkt_type = PACKET_HOST;
- netif_rx_ni(skb);
+ netif_rx(skb);
}
}
-static int br_chk_addr_ip(struct net_device *dev, void *data)
+static int br_chk_addr_ip(struct net_device *dev,
+ struct netdev_nested_priv *priv)
{
- __be32 ip = *(__be32 *)data;
+ __be32 ip = *(__be32 *)priv->data;
struct in_device *in_dev;
__be32 addr = 0;
@@ -110,11 +108,15 @@ static int br_chk_addr_ip(struct net_device *dev, void *data)
static bool br_is_local_ip(struct net_device *dev, __be32 ip)
{
- if (br_chk_addr_ip(dev, &ip))
+ struct netdev_nested_priv priv = {
+ .data = (void *)&ip,
+ };
+
+ if (br_chk_addr_ip(dev, &priv))
return true;
/* check if ip is configured on upper dev */
- if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip, &ip))
+ if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip, &priv))
return true;
return false;
@@ -130,7 +132,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
u8 *arpptr, *sha;
__be32 sip, tip;
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 0;
if ((dev->flags & IFF_NOARP) ||
!pskb_may_pull(skb, arp_hdr_len(dev)))
@@ -155,12 +157,17 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
ipv4_is_multicast(tip))
return;
- if (br->neigh_suppress_enabled) {
- if (p && (p->flags & BR_NEIGH_SUPPRESS))
+ if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) {
+ if (br_is_neigh_suppress_enabled(p, vid))
return;
- if (ipv4_is_zeronet(sip) || sip == tip) {
+ if (is_unicast_ether_addr(eth_hdr(skb)->h_dest) &&
+ parp->ar_op == htons(ARPOP_REQUEST))
+ return;
+ if (parp->ar_op != htons(ARPOP_RREQUEST) &&
+ parp->ar_op != htons(ARPOP_RREPLY) &&
+ (ipv4_is_zeronet(sip) || sip == tip)) {
/* prevent flooding to neigh suppress ports */
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
return;
}
}
@@ -175,11 +182,12 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
return;
}
- if (br->neigh_suppress_enabled && br_is_local_ip(vlandev, tip)) {
+ if (br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
+ br_is_local_ip(vlandev, tip)) {
/* its our local ip, so don't proxy reply
* and don't forward to neigh suppress ports
*/
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
return;
}
@@ -187,7 +195,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
if (n) {
struct net_bridge_fdb_entry *f;
- if (!(n->nud_state & NUD_VALID)) {
+ if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
neigh_release(n);
return;
}
@@ -197,8 +205,8 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
bool replied = false;
if ((p && (p->flags & BR_PROXYARP)) ||
- (f->dst && (f->dst->flags & (BR_PROXYARP_WIFI |
- BR_NEIGH_SUPPRESS)))) {
+ (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)) ||
+ br_is_neigh_suppress_enabled(f->dst, vid)) {
if (!vid)
br_arp_send(br, p, skb->dev, sip, tip,
sha, n->ha, sha, 0, 0);
@@ -213,8 +221,9 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
/* If we have replied or as long as we know the
* mac, indicate to arp replied
*/
- if (replied || br->neigh_suppress_enabled)
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ if (replied ||
+ br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED))
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
}
neigh_release(n);
@@ -223,7 +232,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
#endif
#if IS_ENABLED(CONFIG_IPV6)
-struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *msg)
+struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct nd_msg *msg)
{
struct nd_msg *m;
@@ -277,6 +286,10 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p,
ns_olen = request->len - (skb_network_offset(request) +
sizeof(struct ipv6hdr)) - sizeof(*ns);
for (i = 0; i < ns_olen - 1; i += (ns->opt[i + 1] << 3)) {
+ if (!ns->opt[i + 1]) {
+ kfree_skb(reply);
+ return;
+ }
if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
break;
@@ -311,7 +324,7 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p,
/* Neighbor Advertisement */
memset(na, 0, sizeof(*na) + na_olen);
na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
- na->icmph.icmp6_router = 0; /* XXX: should be 1 ? */
+ na->icmph.icmp6_router = (n->flags & NTF_ROUTER) ? 1 : 0;
na->icmph.icmp6_override = 1;
na->icmph.icmp6_solicited = 1;
na->target = ns->target;
@@ -354,13 +367,14 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p,
reply->ip_summed = CHECKSUM_UNNECESSARY;
reply->pkt_type = PACKET_HOST;
- netif_rx_ni(reply);
+ netif_rx(reply);
}
}
-static int br_chk_addr_ip6(struct net_device *dev, void *data)
+static int br_chk_addr_ip6(struct net_device *dev,
+ struct netdev_nested_priv *priv)
{
- struct in6_addr *addr = (struct in6_addr *)data;
+ struct in6_addr *addr = (struct in6_addr *)priv->data;
if (ipv6_chk_addr(dev_net(dev), addr, dev, 0))
return 1;
@@ -371,11 +385,15 @@ static int br_chk_addr_ip6(struct net_device *dev, void *data)
static bool br_is_local_ip6(struct net_device *dev, struct in6_addr *addr)
{
- if (br_chk_addr_ip6(dev, addr))
+ struct netdev_nested_priv priv = {
+ .data = (void *)addr,
+ };
+
+ if (br_chk_addr_ip6(dev, &priv))
return true;
/* check if ip is configured on upper dev */
- if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip6, addr))
+ if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip6, &priv))
return true;
return false;
@@ -390,15 +408,19 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
struct ipv6hdr *iphdr;
struct neighbour *n;
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = false;
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 0;
+
+ if (br_is_neigh_suppress_enabled(p, vid))
+ return;
- if (p && (p->flags & BR_NEIGH_SUPPRESS))
+ if (is_unicast_ether_addr(eth_hdr(skb)->h_dest) &&
+ msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
return;
if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT &&
!msg->icmph.icmp6_solicited) {
/* prevent flooding to neigh suppress ports */
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
return;
}
@@ -411,7 +433,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
if (ipv6_addr_any(saddr) || !ipv6_addr_cmp(saddr, daddr)) {
/* prevent flooding to neigh suppress ports */
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
return;
}
@@ -429,7 +451,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
/* its our own ip, so don't proxy reply
* and don't forward to arp suppress ports
*/
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
return;
}
@@ -437,7 +459,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
if (n) {
struct net_bridge_fdb_entry *f;
- if (!(n->nud_state & NUD_VALID)) {
+ if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
neigh_release(n);
return;
}
@@ -446,7 +468,7 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
if (f) {
bool replied = false;
- if (f->dst && (f->dst->flags & BR_NEIGH_SUPPRESS)) {
+ if (br_is_neigh_suppress_enabled(f->dst, vid)) {
if (vid != 0)
br_nd_send(br, p, skb, n,
skb->vlan_proto,
@@ -460,10 +482,32 @@ void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
* mac, indicate to NEIGH_SUPPRESS ports that we
* have replied
*/
- if (replied || br->neigh_suppress_enabled)
- BR_INPUT_SKB_CB(skb)->proxyarp_replied = true;
+ if (replied ||
+ br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED))
+ BR_INPUT_SKB_CB(skb)->proxyarp_replied = 1;
}
neigh_release(n);
}
}
#endif
+
+/* Tell whether neighbour suppression applies to port @p for VLAN @vid.
+ *
+ * A NULL port never suppresses. For vid 0, or when the port does not have
+ * BR_NEIGH_VLAN_SUPPRESS set, the port-wide BR_NEIGH_SUPPRESS flag decides.
+ * Otherwise the VLAN entry is looked up in the port's VLAN group and its
+ * BR_VLFLAG_NEIGH_SUPPRESS_ENABLED private flag decides; a missing VLAN
+ * entry means no suppression.
+ */
+bool br_is_neigh_suppress_enabled(const struct net_bridge_port *p, u16 vid)
+{
+	struct net_bridge_vlan *v;
+
+	if (!p)
+		return false;
+
+	if (!vid || !(p->flags & BR_NEIGH_VLAN_SUPPRESS))
+		return !!(p->flags & BR_NEIGH_SUPPRESS);
+
+	v = br_vlan_find(nbp_vlan_group_rcu(p), vid);
+
+	return v && (v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED);
+}
diff --git a/net/bridge/br_cfm.c b/net/bridge/br_cfm.c
new file mode 100644
index 000000000000..c2c1c7d44c61
--- /dev/null
+++ b/net/bridge/br_cfm.c
@@ -0,0 +1,867 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/cfm_bridge.h>
+#include <uapi/linux/cfm_bridge.h>
+#include "br_private_cfm.h"
+
+/* Return the MEP whose instance number is @instance, or NULL if none. */
+static struct br_cfm_mep *br_mep_find(struct net_bridge *br, u32 instance)
+{
+	struct br_cfm_mep *cur;
+
+	hlist_for_each_entry(cur, &br->mep_list, head)
+		if (cur->instance == instance)
+			return cur;
+	return NULL;
+}
+
+/* Return the MEP whose create.ifindex equals @ifindex, or NULL if none. */
+static struct br_cfm_mep *br_mep_find_ifindex(struct net_bridge *br,
+					      u32 ifindex)
+{
+	struct br_cfm_mep *cur;
+
+	hlist_for_each_entry_rcu(cur, &br->mep_list, head,
+				 lockdep_rtnl_is_held())
+		if (cur->create.ifindex == ifindex)
+			return cur;
+	return NULL;
+}
+
+/* Return the peer MEP of @mep whose MEP-ID is @mepid, or NULL if none. */
+static struct br_cfm_peer_mep *br_peer_mep_find(struct br_cfm_mep *mep,
+						u32 mepid)
+{
+	struct br_cfm_peer_mep *peer;
+
+	hlist_for_each_entry_rcu(peer, &mep->peer_mep_list, head,
+				 lockdep_rtnl_is_held())
+		if (peer->mepid == mepid)
+			return peer;
+	return NULL;
+}
+
+/* Return the port of @br whose device has @ifindex, or NULL if none. */
+static struct net_bridge_port *br_mep_get_port(struct net_bridge *br,
+					       u32 ifindex)
+{
+	struct net_bridge_port *p;
+
+	list_for_each_entry(p, &br->port_list, list)
+		if (p->dev->ifindex == ifindex)
+			return p;
+	return NULL;
+}
+
+/* Calculate the CCM interval in us. */
+static u32 interval_to_us(enum br_cfm_ccm_interval interval)
+{
+ switch (interval) {
+ case BR_CFM_CCM_INTERVAL_NONE:
+ return 0;
+ case BR_CFM_CCM_INTERVAL_3_3_MS:
+ return 3300;
+ case BR_CFM_CCM_INTERVAL_10_MS:
+ return 10 * 1000;
+ case BR_CFM_CCM_INTERVAL_100_MS:
+ return 100 * 1000;
+ case BR_CFM_CCM_INTERVAL_1_SEC:
+ return 1000 * 1000;
+ case BR_CFM_CCM_INTERVAL_10_SEC:
+ return 10 * 1000 * 1000;
+ case BR_CFM_CCM_INTERVAL_1_MIN:
+ return 60 * 1000 * 1000;
+ case BR_CFM_CCM_INTERVAL_10_MIN:
+ return 10 * 60 * 1000 * 1000;
+ }
+ return 0;
+}
+
+/* Convert the interface interval to CCM PDU value. */
+static u32 interval_to_pdu(enum br_cfm_ccm_interval interval)
+{
+ switch (interval) {
+ case BR_CFM_CCM_INTERVAL_NONE:
+ return 0;
+ case BR_CFM_CCM_INTERVAL_3_3_MS:
+ return 1;
+ case BR_CFM_CCM_INTERVAL_10_MS:
+ return 2;
+ case BR_CFM_CCM_INTERVAL_100_MS:
+ return 3;
+ case BR_CFM_CCM_INTERVAL_1_SEC:
+ return 4;
+ case BR_CFM_CCM_INTERVAL_10_SEC:
+ return 5;
+ case BR_CFM_CCM_INTERVAL_1_MIN:
+ return 6;
+ case BR_CFM_CCM_INTERVAL_10_MIN:
+ return 7;
+ }
+ return 0;
+}
+
+/* Convert the CCM PDU value to interval on interface. */
+static u32 pdu_to_interval(u32 value)
+{
+ switch (value) {
+ case 0:
+ return BR_CFM_CCM_INTERVAL_NONE;
+ case 1:
+ return BR_CFM_CCM_INTERVAL_3_3_MS;
+ case 2:
+ return BR_CFM_CCM_INTERVAL_10_MS;
+ case 3:
+ return BR_CFM_CCM_INTERVAL_100_MS;
+ case 4:
+ return BR_CFM_CCM_INTERVAL_1_SEC;
+ case 5:
+ return BR_CFM_CCM_INTERVAL_10_SEC;
+ case 6:
+ return BR_CFM_CCM_INTERVAL_1_MIN;
+ case 7:
+ return BR_CFM_CCM_INTERVAL_10_MIN;
+ }
+ return BR_CFM_CCM_INTERVAL_NONE;
+}
+
+/* (Re)queue the CCM RX delayed work of @peer_mep. */
+static void ccm_rx_timer_start(struct br_cfm_peer_mep *peer_mep)
+{
+	u32 period_us = interval_to_us(peer_mep->mep->cc_config.exp_interval);
+
+	/* ccm_rx_work_expired() is scheduled at 1/4 of the configured CC
+	 * 'expected_interval', so that a CCM defect can be detected
+	 * after 3.25 intervals.
+	 */
+	queue_delayed_work(system_percpu_wq, &peer_mep->ccm_rx_dwork,
+			   usecs_to_jiffies(period_us / 4));
+}
+
+static void br_cfm_notify(int event, const struct net_bridge_port *port)
+{
+ u32 filter = RTEXT_FILTER_CFM_STATUS;
+
+ br_info_notify(event, port->br, NULL, filter);
+}
+
+static void cc_peer_enable(struct br_cfm_peer_mep *peer_mep)
+{
+ memset(&peer_mep->cc_status, 0, sizeof(peer_mep->cc_status));
+ peer_mep->ccm_rx_count_miss = 0;
+
+ ccm_rx_timer_start(peer_mep);
+}
+
+static void cc_peer_disable(struct br_cfm_peer_mep *peer_mep)
+{
+ cancel_delayed_work_sync(&peer_mep->ccm_rx_dwork);
+}
+
+static struct sk_buff *ccm_frame_build(struct br_cfm_mep *mep,
+ const struct br_cfm_cc_ccm_tx_info *const tx_info)
+
+{
+ struct br_cfm_common_hdr *common_hdr;
+ struct net_bridge_port *b_port;
+ struct br_cfm_maid *maid;
+ u8 *itu_reserved, *e_tlv;
+ struct ethhdr *eth_hdr;
+ struct sk_buff *skb;
+ __be32 *status_tlv;
+ __be32 *snumber;
+ __be16 *mepid;
+
+ skb = dev_alloc_skb(CFM_CCM_MAX_FRAME_LENGTH);
+ if (!skb)
+ return NULL;
+
+ rcu_read_lock();
+ b_port = rcu_dereference(mep->b_port);
+ if (!b_port) {
+ kfree_skb(skb);
+ rcu_read_unlock();
+ return NULL;
+ }
+ skb->dev = b_port->dev;
+ rcu_read_unlock();
+ /* The device cannot be deleted until the work queue function has
+ * completed. This function is called from ccm_tx_work_expired(),
+ * which is a work queue function.
+ */
+
+ skb->protocol = htons(ETH_P_CFM);
+ skb->priority = CFM_FRAME_PRIO;
+
+ /* Ethernet header */
+ eth_hdr = skb_put(skb, sizeof(*eth_hdr));
+ ether_addr_copy(eth_hdr->h_dest, tx_info->dmac.addr);
+ ether_addr_copy(eth_hdr->h_source, mep->config.unicast_mac.addr);
+ eth_hdr->h_proto = htons(ETH_P_CFM);
+
+ /* Common CFM Header */
+ common_hdr = skb_put(skb, sizeof(*common_hdr));
+ common_hdr->mdlevel_version = mep->config.mdlevel << 5;
+ common_hdr->opcode = BR_CFM_OPCODE_CCM;
+ common_hdr->flags = (mep->rdi << 7) |
+ interval_to_pdu(mep->cc_config.exp_interval);
+ common_hdr->tlv_offset = CFM_CCM_TLV_OFFSET;
+
+ /* Sequence number */
+ snumber = skb_put(skb, sizeof(*snumber));
+ if (tx_info->seq_no_update) {
+ *snumber = cpu_to_be32(mep->ccm_tx_snumber);
+ mep->ccm_tx_snumber += 1;
+ } else {
+ *snumber = 0;
+ }
+
+ mepid = skb_put(skb, sizeof(*mepid));
+ *mepid = cpu_to_be16((u16)mep->config.mepid);
+
+ maid = skb_put(skb, sizeof(*maid));
+ memcpy(maid->data, mep->cc_config.exp_maid.data, sizeof(maid->data));
+
+ /* ITU reserved (CFM_CCM_ITU_RESERVED_SIZE octets) */
+ itu_reserved = skb_put(skb, CFM_CCM_ITU_RESERVED_SIZE);
+ memset(itu_reserved, 0, CFM_CCM_ITU_RESERVED_SIZE);
+
+ /* General CFM TLV format:
+ * TLV type: one byte
+ * TLV value length: two bytes
+ * TLV value: 'TLV value length' bytes
+ */
+
+ /* Port status TLV. The value length is 1. Total of 4 bytes. */
+ if (tx_info->port_tlv) {
+ status_tlv = skb_put(skb, sizeof(*status_tlv));
+ *status_tlv = cpu_to_be32((CFM_PORT_STATUS_TLV_TYPE << 24) |
+ (1 << 8) | /* Value length */
+ (tx_info->port_tlv_value & 0xFF));
+ }
+
+ /* Interface status TLV. The value length is 1. Total of 4 bytes. */
+ if (tx_info->if_tlv) {
+ status_tlv = skb_put(skb, sizeof(*status_tlv));
+ *status_tlv = cpu_to_be32((CFM_IF_STATUS_TLV_TYPE << 24) |
+ (1 << 8) | /* Value length */
+ (tx_info->if_tlv_value & 0xFF));
+ }
+
+ /* End TLV */
+ e_tlv = skb_put(skb, sizeof(*e_tlv));
+ *e_tlv = CFM_ENDE_TLV_TYPE;
+
+ return skb;
+}
+
+static void ccm_frame_tx(struct sk_buff *skb)
+{
+ skb_reset_network_header(skb);
+ dev_queue_xmit(skb);
+}
+
+/* Delayed-work handler driving CCM transmission: it re-queues itself every
+ * configured CC 'expected_interval' until the transmission end time.
+ */
+static void ccm_tx_work_expired(struct work_struct *work)
+{
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct br_cfm_mep *mep;
+	struct sk_buff *frame;
+	u32 period_us;
+
+	mep = container_of(dwork, struct br_cfm_mep, ccm_tx_dwork);
+
+	/* Past the end time: mark transmission stopped and do not re-queue */
+	if (time_before_eq(mep->ccm_tx_end, jiffies)) {
+		mep->cc_ccm_tx_info.period = 0;
+		return;
+	}
+
+	/* A failed frame build only skips this one transmission */
+	frame = ccm_frame_build(mep, &mep->cc_ccm_tx_info);
+	if (frame)
+		ccm_frame_tx(frame);
+
+	period_us = interval_to_us(mep->cc_config.exp_interval);
+	queue_delayed_work(system_percpu_wq, &mep->ccm_tx_dwork,
+			   usecs_to_jiffies(period_us));
+}
+
+/* Runs every 1/4 of the configured CC 'expected_interval' so that a CCM
+ * defect is detected once 3.25 intervals pass without CCM reception.
+ */
+static void ccm_rx_work_expired(struct work_struct *work)
+{
+	struct br_cfm_peer_mep *peer;
+	struct net_bridge_port *port;
+	struct delayed_work *dwork;
+
+	dwork = to_delayed_work(work);
+	peer = container_of(dwork, struct br_cfm_peer_mep, ccm_rx_dwork);
+
+	/* 13 quarter-interval runs (4 * 3.25) make up 3.25 intervals */
+	if (peer->ccm_rx_count_miss < 13) {
+		/* 3.25 intervals are NOT expired without CCM reception */
+		peer->ccm_rx_count_miss++;
+		/* Start timer again */
+		ccm_rx_timer_start(peer);
+		return;
+	}
+
+	/* 3.25 intervals are expired without CCM reception.
+	 * CCM defect detected
+	 */
+	peer->cc_status.ccm_defect = true;
+
+	/* Change in CCM defect status - notify while b_port is still set */
+	rcu_read_lock();
+	port = rcu_dereference(peer->mep->b_port);
+	if (port)
+		br_cfm_notify(RTM_NEWLINK, port);
+	rcu_read_unlock();
+}
+
+/* Parse one TLV at offset @index of a CCM PDU and record an interface or
+ * port status value in @peer_mep. Returns the TLV's total length, or 0 at
+ * the End TLV or when the bytes cannot be read from @skb (ends the walk).
+ */
+static u32 ccm_tlv_extract(struct sk_buff *skb, u32 index,
+			   struct br_cfm_peer_mep *peer_mep)
+{
+	__be32 *s_tlv;
+	__be32 _s_tlv;
+	u32 h_s_tlv;
+	u8 *e_tlv;
+	u8 _e_tlv;
+
+	/* The End TLV is a lone type byte: nothing after it is a TLV */
+	e_tlv = skb_header_pointer(skb, index, sizeof(_e_tlv), &_e_tlv);
+	if (!e_tlv || *e_tlv == CFM_ENDE_TLV_TYPE)
+		return 0;
+
+	s_tlv = skb_header_pointer(skb, index, sizeof(_s_tlv), &_s_tlv);
+	if (!s_tlv)
+		return 0;
+
+	h_s_tlv = ntohl(*s_tlv);
+	if ((h_s_tlv >> 24) == CFM_IF_STATUS_TLV_TYPE) {
+		/* Interface status TLV */
+		peer_mep->cc_status.tlv_seen = true;
+		peer_mep->cc_status.if_tlv_value = (h_s_tlv & 0xFF);
+	}
+
+	if ((h_s_tlv >> 24) == CFM_PORT_STATUS_TLV_TYPE) {
+		/* Port status TLV */
+		peer_mep->cc_status.tlv_seen = true;
+		peer_mep->cc_status.port_tlv_value = (h_s_tlv & 0xFF);
+	}
+
+	/* The Sender ID and Organization-Specific TLVs are not handled */
+
+	/* Total TLV length: the value length plus the 3 bytes taken by the
+	 * type and length fields
+	 */
+	return ((h_s_tlv >> 8) & 0xFFFF) + 3;
+}
+
+/* note: already called with rcu_read_lock */
+static int br_cfm_frame_rx(struct net_bridge_port *port, struct sk_buff *skb)
+{
+ u32 mdlevel, interval, size, index, max;
+ const struct br_cfm_common_hdr *hdr;
+ struct br_cfm_peer_mep *peer_mep;
+ const struct br_cfm_maid *maid;
+ struct br_cfm_common_hdr _hdr;
+ struct br_cfm_maid _maid;
+ struct br_cfm_mep *mep;
+ struct net_bridge *br;
+ __be32 *snumber;
+ __be32 _snumber;
+ __be16 *mepid;
+ __be16 _mepid;
+
+ if (port->state == BR_STATE_DISABLED)
+ return 0;
+
+ hdr = skb_header_pointer(skb, 0, sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return 1;
+
+ br = port->br;
+ mep = br_mep_find_ifindex(br, port->dev->ifindex);
+ if (unlikely(!mep))
+ /* No MEP on this port - must be forwarded */
+ return 0;
+
+ mdlevel = hdr->mdlevel_version >> 5;
+ if (mdlevel > mep->config.mdlevel)
+ /* The level is above this MEP level - must be forwarded */
+ return 0;
+
+ if ((hdr->mdlevel_version & 0x1F) != 0) {
+ /* Invalid version */
+ mep->status.version_unexp_seen = true;
+ return 1;
+ }
+
+ if (mdlevel < mep->config.mdlevel) {
+ /* The level is below this MEP level */
+ mep->status.rx_level_low_seen = true;
+ return 1;
+ }
+
+ if (hdr->opcode == BR_CFM_OPCODE_CCM) {
+ /* CCM PDU received. */
+ /* MA ID is after common header + sequence number + MEP ID */
+ maid = skb_header_pointer(skb,
+ CFM_CCM_PDU_MAID_OFFSET,
+ sizeof(_maid), &_maid);
+ if (!maid)
+ return 1;
+ if (memcmp(maid->data, mep->cc_config.exp_maid.data,
+ sizeof(maid->data)))
+ /* MA ID not as expected */
+ return 1;
+
+ /* MEP ID is after common header + sequence number */
+ mepid = skb_header_pointer(skb,
+ CFM_CCM_PDU_MEPID_OFFSET,
+ sizeof(_mepid), &_mepid);
+ if (!mepid)
+ return 1;
+ peer_mep = br_peer_mep_find(mep, (u32)ntohs(*mepid));
+ if (!peer_mep)
+ return 1;
+
+ /* Interval is in common header flags */
+ interval = hdr->flags & 0x07;
+ if (mep->cc_config.exp_interval != pdu_to_interval(interval))
+ /* Interval not as expected */
+ return 1;
+
+ /* A valid CCM frame is received */
+ if (peer_mep->cc_status.ccm_defect) {
+ peer_mep->cc_status.ccm_defect = false;
+
+ /* Change in CCM defect status - notify */
+ br_cfm_notify(RTM_NEWLINK, port);
+
+ /* Start CCM RX timer */
+ ccm_rx_timer_start(peer_mep);
+ }
+
+ peer_mep->cc_status.seen = true;
+ peer_mep->ccm_rx_count_miss = 0;
+
+ /* RDI is in common header flags */
+ peer_mep->cc_status.rdi = (hdr->flags & 0x80) ? true : false;
+
+ /* Sequence number is after common header */
+ snumber = skb_header_pointer(skb,
+ CFM_CCM_PDU_SEQNR_OFFSET,
+ sizeof(_snumber), &_snumber);
+ if (!snumber)
+ return 1;
+ if (ntohl(*snumber) != (mep->ccm_rx_snumber + 1))
+ /* Unexpected sequence number */
+ peer_mep->cc_status.seq_unexp_seen = true;
+
+ mep->ccm_rx_snumber = ntohl(*snumber);
+
+ /* The first TLV is after common header + sequence number +
+ * MEP ID + MA ID + ITU reserved
+ */
+ index = CFM_CCM_PDU_TLV_OFFSET;
+ max = 0;
+ do { /* Handle all TLVs */
+ size = ccm_tlv_extract(skb, index, peer_mep);
+ index += size;
+ max += 1;
+ } while (size != 0 && max < 4); /* Max four TLVs possible */
+
+ return 1;
+ }
+
+ mep->status.opcode_unexp_seen = true;
+
+ return 1;
+}
+
+static struct br_frame_type cfm_frame_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_CFM),
+ .frame_handler = br_cfm_frame_rx,
+};
+
+int br_cfm_mep_create(struct net_bridge *br,
+ const u32 instance,
+ struct br_cfm_mep_create *const create,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_port *p;
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ if (create->domain == BR_CFM_VLAN) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "VLAN domain not supported");
+ return -EINVAL;
+ }
+ if (create->domain != BR_CFM_PORT) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Invalid domain value");
+ return -EINVAL;
+ }
+ if (create->direction == BR_CFM_MEP_DIRECTION_UP) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Up-MEP not supported");
+ return -EINVAL;
+ }
+ if (create->direction != BR_CFM_MEP_DIRECTION_DOWN) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Invalid direction value");
+ return -EINVAL;
+ }
+ p = br_mep_get_port(br, create->ifindex);
+ if (!p) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Port is not related to bridge");
+ return -EINVAL;
+ }
+ mep = br_mep_find(br, instance);
+ if (mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance already exists");
+ return -EEXIST;
+ }
+
+ /* In PORT domain only one instance can be created per port */
+ if (create->domain == BR_CFM_PORT) {
+ mep = br_mep_find_ifindex(br, create->ifindex);
+ if (mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Only one Port MEP on a port allowed");
+ return -EINVAL;
+ }
+ }
+
+ mep = kzalloc(sizeof(*mep), GFP_KERNEL);
+ if (!mep)
+ return -ENOMEM;
+
+ mep->create = *create;
+ mep->instance = instance;
+ rcu_assign_pointer(mep->b_port, p);
+
+ INIT_HLIST_HEAD(&mep->peer_mep_list);
+ INIT_DELAYED_WORK(&mep->ccm_tx_dwork, ccm_tx_work_expired);
+
+ if (hlist_empty(&br->mep_list))
+ br_add_frame(br, &cfm_frame_type);
+
+ hlist_add_tail_rcu(&mep->head, &br->mep_list);
+
+ return 0;
+}
+
+/* Stop all delayed work of @mep and its peers, then unlink and free them. */
+static void mep_delete_implementation(struct net_bridge *br,
+				      struct br_cfm_mep *mep)
+{
+	struct br_cfm_peer_mep *peer;
+	struct hlist_node *next;
+
+	ASSERT_RTNL();
+
+	/* Empty and free peer MEP list */
+	hlist_for_each_entry_safe(peer, next, &mep->peer_mep_list, head) {
+		cancel_delayed_work_sync(&peer->ccm_rx_dwork);
+		hlist_del_rcu(&peer->head);
+		kfree_rcu(peer, rcu);
+	}
+
+	cancel_delayed_work_sync(&mep->ccm_tx_dwork);
+	RCU_INIT_POINTER(mep->b_port, NULL);
+	hlist_del_rcu(&mep->head);
+	kfree_rcu(mep, rcu);
+
+	if (hlist_empty(&br->mep_list))
+		br_del_frame(br, &cfm_frame_type);
+}
+
+int br_cfm_mep_delete(struct net_bridge *br,
+ const u32 instance,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ mep_delete_implementation(br, mep);
+
+ return 0;
+}
+
+int br_cfm_mep_config_set(struct net_bridge *br,
+ const u32 instance,
+ const struct br_cfm_mep_config *const config,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ mep->config = *config;
+
+ return 0;
+}
+
+int br_cfm_cc_config_set(struct net_bridge *br,
+ const u32 instance,
+ const struct br_cfm_cc_config *const config,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ /* Check for no change in configuration */
+ if (memcmp(config, &mep->cc_config, sizeof(*config)) == 0)
+ return 0;
+
+ if (config->enable && !mep->cc_config.enable)
+ /* CC is enabled */
+ hlist_for_each_entry(peer_mep, &mep->peer_mep_list, head)
+ cc_peer_enable(peer_mep);
+
+ if (!config->enable && mep->cc_config.enable)
+ /* CC is disabled */
+ hlist_for_each_entry(peer_mep, &mep->peer_mep_list, head)
+ cc_peer_disable(peer_mep);
+
+ mep->cc_config = *config;
+ mep->ccm_rx_snumber = 0;
+ mep->ccm_tx_snumber = 1;
+
+ return 0;
+}
+
+int br_cfm_cc_peer_mep_add(struct net_bridge *br, const u32 instance,
+ u32 mepid,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ peer_mep = br_peer_mep_find(mep, mepid);
+ if (peer_mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Peer MEP-ID already exists");
+ return -EEXIST;
+ }
+
+ peer_mep = kzalloc(sizeof(*peer_mep), GFP_KERNEL);
+ if (!peer_mep)
+ return -ENOMEM;
+
+ peer_mep->mepid = mepid;
+ peer_mep->mep = mep;
+ INIT_DELAYED_WORK(&peer_mep->ccm_rx_dwork, ccm_rx_work_expired);
+
+ if (mep->cc_config.enable)
+ cc_peer_enable(peer_mep);
+
+ hlist_add_tail_rcu(&peer_mep->head, &mep->peer_mep_list);
+
+ return 0;
+}
+
+int br_cfm_cc_peer_mep_remove(struct net_bridge *br, const u32 instance,
+ u32 mepid,
+ struct netlink_ext_ack *extack)
+{
+ struct br_cfm_peer_mep *peer_mep;
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ peer_mep = br_peer_mep_find(mep, mepid);
+ if (!peer_mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Peer MEP-ID does not exists");
+ return -ENOENT;
+ }
+
+ cc_peer_disable(peer_mep);
+
+ hlist_del_rcu(&peer_mep->head);
+ kfree_rcu(peer_mep, rcu);
+
+ return 0;
+}
+
+int br_cfm_cc_rdi_set(struct net_bridge *br, const u32 instance,
+ const bool rdi, struct netlink_ext_ack *extack)
+{
+ struct br_cfm_mep *mep;
+
+ ASSERT_RTNL();
+
+ mep = br_mep_find(br, instance);
+ if (!mep) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MEP instance does not exists");
+ return -ENOENT;
+ }
+
+ mep->rdi = rdi;
+
+ return 0;
+}
+
+/* Start, retime or stop CCM transmission of MEP @instance as described by
+ * @tx_info, whose period is in seconds (0 stops transmission). End times use
+ * secs_to_jiffies() since period * 1000000 wraps in 32 bits above 4294 s.
+ * Returns 0, or -ENOENT when no MEP has that instance number.
+ */
+int br_cfm_cc_ccm_tx(struct net_bridge *br, const u32 instance,
+		     const struct br_cfm_cc_ccm_tx_info *const tx_info,
+		     struct netlink_ext_ack *extack)
+{
+	struct br_cfm_mep *mep;
+
+	ASSERT_RTNL();
+
+	mep = br_mep_find(br, instance);
+	if (!mep) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "MEP instance does not exists");
+		return -ENOENT;
+	}
+
+	if (memcmp(tx_info, &mep->cc_ccm_tx_info, sizeof(*tx_info)) == 0) {
+		/* No change in tx_info. */
+		if (mep->cc_ccm_tx_info.period == 0)
+			/* Transmission is not enabled - just return */
+			return 0;
+
+		/* Transmission is ongoing, the end time is recalculated */
+		mep->ccm_tx_end = jiffies + secs_to_jiffies(tx_info->period);
+		return 0;
+	}
+
+	if (tx_info->period == 0 && mep->cc_ccm_tx_info.period == 0)
+		/* Some change in info and transmission is not ongoing */
+		goto save;
+
+	if (tx_info->period != 0 && mep->cc_ccm_tx_info.period != 0) {
+		/* Info changed while transmitting: recalculate the end time */
+		mep->ccm_tx_end = jiffies + secs_to_jiffies(tx_info->period);
+		goto save;
+	}
+
+	if (tx_info->period == 0 && mep->cc_ccm_tx_info.period != 0) {
+		cancel_delayed_work_sync(&mep->ccm_tx_dwork);
+		goto save;
+	}
+
+	/* Start delayed work to transmit CCM frames. It is done with zero delay
+	 * to send first frame immediately
+	 */
+	mep->ccm_tx_end = jiffies + secs_to_jiffies(tx_info->period);
+	queue_delayed_work(system_percpu_wq, &mep->ccm_tx_dwork, 0);
+
+save:
+	mep->cc_ccm_tx_info = *tx_info;
+
+	return 0;
+}
+
+/* Store the number of MEP instances on @br in @count; always returns 0. */
+int br_cfm_mep_count(struct net_bridge *br, u32 *count)
+{
+	struct br_cfm_mep *mep;
+	u32 total = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mep, &br->mep_list, head)
+		total++;
+	rcu_read_unlock();
+	*count = total;
+	return 0;
+}
+
+/* Store the number of peer MEPs over all MEPs of @br in @count; returns 0. */
+int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count)
+{
+	struct br_cfm_peer_mep *peer;
+	struct br_cfm_mep *mep;
+	u32 total = 0;
+
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(mep, &br->mep_list, head)
+		hlist_for_each_entry_rcu(peer, &mep->peer_mep_list, head)
+			total++;
+	rcu_read_unlock();
+	*count = total;
+	return 0;
+}
+
+bool br_cfm_created(struct net_bridge *br)
+{
+ return !hlist_empty(&br->mep_list);
+}
+
+/* Delete every MEP whose create.ifindex matches the device of @port. */
+void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *port)
+{
+	struct hlist_node *next;
+	struct br_cfm_mep *cur;
+
+	ASSERT_RTNL();
+
+	hlist_for_each_entry_safe(cur, next, &br->mep_list, head) {
+		if (cur->create.ifindex == port->dev->ifindex)
+			mep_delete_implementation(br, cur);
+	}
+}
diff --git a/net/bridge/br_cfm_netlink.c b/net/bridge/br_cfm_netlink.c
new file mode 100644
index 000000000000..2faab44652e7
--- /dev/null
+++ b/net/bridge/br_cfm_netlink.c
@@ -0,0 +1,726 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <net/genetlink.h>
+
+#include "br_private.h"
+#include "br_private_cfm.h"
+
+static const struct nla_policy
+br_cfm_mep_create_policy[IFLA_BRIDGE_CFM_MEP_CREATE_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_MEP_CREATE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+br_cfm_mep_delete_policy[IFLA_BRIDGE_CFM_MEP_DELETE_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_MEP_DELETE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+br_cfm_mep_config_policy[IFLA_BRIDGE_CFM_MEP_CONFIG_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_MEP_CONFIG_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC] = NLA_POLICY_ETH_ADDR,
+ [IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL] = NLA_POLICY_MAX(NLA_U32, 7),
+ [IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID] = NLA_POLICY_MAX(NLA_U32, 0x1FFF),
+};
+
+static const struct nla_policy
+br_cfm_cc_config_policy[IFLA_BRIDGE_CFM_CC_CONFIG_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_CC_CONFIG_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID] = {
+ .type = NLA_BINARY, .len = CFM_MAID_LENGTH },
+};
+
+static const struct nla_policy
+br_cfm_cc_peer_mep_policy[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_CC_PEER_MEP_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_PEER_MEPID] = NLA_POLICY_MAX(NLA_U32, 0x1FFF),
+};
+
+static const struct nla_policy
+br_cfm_cc_rdi_policy[IFLA_BRIDGE_CFM_CC_RDI_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_CC_RDI_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_CC_RDI_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_RDI_RDI] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy
+br_cfm_cc_ccm_tx_policy[IFLA_BRIDGE_CFM_CC_CCM_TX_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC] = NLA_POLICY_ETH_ADDR,
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE] = { .type = NLA_U8 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE] = { .type = NLA_U8 },
+};
+
+static const struct nla_policy
+br_cfm_policy[IFLA_BRIDGE_CFM_MAX + 1] = {
+ [IFLA_BRIDGE_CFM_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_CFM_MEP_CREATE] =
+ NLA_POLICY_NESTED(br_cfm_mep_create_policy),
+ [IFLA_BRIDGE_CFM_MEP_DELETE] =
+ NLA_POLICY_NESTED(br_cfm_mep_delete_policy),
+ [IFLA_BRIDGE_CFM_MEP_CONFIG] =
+ NLA_POLICY_NESTED(br_cfm_mep_config_policy),
+ [IFLA_BRIDGE_CFM_CC_CONFIG] =
+ NLA_POLICY_NESTED(br_cfm_cc_config_policy),
+ [IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD] =
+ NLA_POLICY_NESTED(br_cfm_cc_peer_mep_policy),
+ [IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE] =
+ NLA_POLICY_NESTED(br_cfm_cc_peer_mep_policy),
+ [IFLA_BRIDGE_CFM_CC_RDI] =
+ NLA_POLICY_NESTED(br_cfm_cc_rdi_policy),
+ [IFLA_BRIDGE_CFM_CC_CCM_TX] =
+ NLA_POLICY_NESTED(br_cfm_cc_ccm_tx_policy),
+};
+
+static int br_mep_create_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_CREATE_MAX + 1];
+ struct br_cfm_mep_create create;
+ u32 instance;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_CREATE_MAX, attr,
+ br_cfm_mep_create_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing DOMAIN attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing DIRECTION attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing IFINDEX attribute");
+ return -EINVAL;
+ }
+
+ memset(&create, 0, sizeof(create));
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE]);
+ create.domain = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN]);
+ create.direction = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION]);
+ create.ifindex = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX]);
+
+ return br_cfm_mep_create(br, instance, &create, extack);
+}
+
+/* Parse a nested MEP_DELETE attribute and delete the MEP it names. */
+static int br_mep_delete_parse(struct net_bridge *br, struct nlattr *attr,
+			       struct netlink_ext_ack *extack)
+{
+	struct nlattr *attrs[IFLA_BRIDGE_CFM_MEP_DELETE_MAX + 1];
+	u32 instance;
+	int ret;
+
+	ret = nla_parse_nested(attrs, IFLA_BRIDGE_CFM_MEP_DELETE_MAX, attr,
+			       br_cfm_mep_delete_policy, extack);
+	if (ret)
+		return ret;
+
+	if (!attrs[IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+		return -EINVAL;
+	}
+
+	instance = nla_get_u32(attrs[IFLA_BRIDGE_CFM_MEP_DELETE_INSTANCE]);
+
+	return br_cfm_mep_delete(br, instance, extack);
+}
+
+static int br_mep_config_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MAX + 1];
+ struct br_cfm_mep_config config;
+ u32 instance;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MEP_CONFIG_MAX, attr,
+ br_cfm_mep_config_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing UNICAST_MAC attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing MDLEVEL attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing MEPID attribute");
+ return -EINVAL;
+ }
+
+ memset(&config, 0, sizeof(config));
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE]);
+ nla_memcpy(&config.unicast_mac.addr,
+ tb[IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC],
+ sizeof(config.unicast_mac.addr));
+ config.mdlevel = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL]);
+ config.mepid = nla_get_u32(tb[IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID]);
+
+ return br_cfm_mep_config_set(br, instance, &config, extack);
+}
+
+static int br_cc_config_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_CC_CONFIG_MAX + 1];
+ struct br_cfm_cc_config config;
+ u32 instance;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_CONFIG_MAX, attr,
+ br_cfm_cc_config_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing ENABLE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INTERVAL attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing MAID attribute");
+ return -EINVAL;
+ }
+
+ memset(&config, 0, sizeof(config));
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE]);
+ config.enable = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE]);
+ config.exp_interval = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL]);
+ nla_memcpy(&config.exp_maid.data, tb[IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID],
+ sizeof(config.exp_maid.data));
+
+ return br_cfm_cc_config_set(br, instance, &config, extack);
+}
+
+static int br_cc_peer_mep_add_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1];
+ u32 instance, peer_mep_id;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, attr,
+ br_cfm_cc_peer_mep_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing PEER_MEP_ID attribute");
+ return -EINVAL;
+ }
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]);
+ peer_mep_id = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]);
+
+ return br_cfm_cc_peer_mep_add(br, instance, peer_mep_id, extack);
+}
+
+static int br_cc_peer_mep_remove_parse(struct net_bridge *br, struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX + 1];
+ u32 instance, peer_mep_id;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_CFM_CC_PEER_MEP_MAX, attr,
+ br_cfm_cc_peer_mep_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+ return -EINVAL;
+ }
+ if (!tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing PEER_MEP_ID attribute");
+ return -EINVAL;
+ }
+
+ instance = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE]);
+ peer_mep_id = nla_get_u32(tb[IFLA_BRIDGE_CFM_CC_PEER_MEPID]);
+
+ return br_cfm_cc_peer_mep_remove(br, instance, peer_mep_id, extack);
+}
+
+/* Parse a nested CC_RDI attribute and apply the RDI setting to the MEP. */
+static int br_cc_rdi_parse(struct net_bridge *br, struct nlattr *attr,
+			   struct netlink_ext_ack *extack)
+{
+	struct nlattr *attrs[IFLA_BRIDGE_CFM_CC_RDI_MAX + 1];
+	u32 instance, rdi;
+	int ret;
+
+	ret = nla_parse_nested(attrs, IFLA_BRIDGE_CFM_CC_RDI_MAX, attr,
+			       br_cfm_cc_rdi_policy, extack);
+	if (ret)
+		return ret;
+
+	if (!attrs[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+		return -EINVAL;
+	}
+	if (!attrs[IFLA_BRIDGE_CFM_CC_RDI_RDI]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing RDI attribute");
+		return -EINVAL;
+	}
+
+	instance = nla_get_u32(attrs[IFLA_BRIDGE_CFM_CC_RDI_INSTANCE]);
+	rdi = nla_get_u32(attrs[IFLA_BRIDGE_CFM_CC_RDI_RDI]);
+	return br_cfm_cc_rdi_set(br, instance, rdi, extack);
+}
+
+/* Handle IFLA_BRIDGE_CFM_CC_CCM_TX: parse the nested attributes in @attr
+ * into a struct br_cfm_cc_ccm_tx_info and hand it to br_cfm_cc_ccm_tx().
+ *
+ * All eight attributes (INSTANCE, DMAC, SEQ_NO_UPDATE, PERIOD, IF_TLV,
+ * IF_TLV_VALUE, PORT_TLV, PORT_TLV_VALUE) are mandatory; the first one found
+ * missing is reported through extack and -EINVAL is returned. A failure of
+ * nla_parse_nested() is returned unchanged.
+ */
+static int br_cc_ccm_tx_parse(struct net_bridge *br, struct nlattr *attr,
+			      struct netlink_ext_ack *extack)
+{
+	struct nlattr *attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_MAX + 1];
+	struct br_cfm_cc_ccm_tx_info info;
+	int ret;
+
+	ret = nla_parse_nested(attrs, IFLA_BRIDGE_CFM_CC_CCM_TX_MAX, attr,
+			       br_cfm_cc_ccm_tx_policy, extack);
+	if (ret)
+		return ret;
+
+	/* Presence checks, in the order the errors are reported. */
+	if (!attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing INSTANCE attribute");
+		return -EINVAL;
+	}
+	if (!attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing DMAC attribute");
+		return -EINVAL;
+	}
+	if (!attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing SEQ_NO_UPDATE attribute");
+		return -EINVAL;
+	}
+	if (!attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing PERIOD attribute");
+		return -EINVAL;
+	}
+	if (!attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing IF_TLV attribute");
+		return -EINVAL;
+	}
+	if (!attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing IF_TLV_VALUE attribute");
+		return -EINVAL;
+	}
+	if (!attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing PORT_TLV attribute");
+		return -EINVAL;
+	}
+	if (!attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE]) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing PORT_TLV_VALUE attribute");
+		return -EINVAL;
+	}
+
+	/* Start from an all-zero structure so every byte of it is defined
+	 * before it is passed on by pointer.
+	 */
+	memset(&info, 0, sizeof(info));
+
+	nla_memcpy(&info.dmac.addr, attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC],
+		   sizeof(info.dmac.addr));
+	info.seq_no_update =
+		nla_get_u32(attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE]);
+	info.period = nla_get_u32(attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD]);
+	info.if_tlv = nla_get_u32(attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV]);
+	info.if_tlv_value =
+		nla_get_u8(attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE]);
+	info.port_tlv = nla_get_u32(attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV]);
+	info.port_tlv_value =
+		nla_get_u8(attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE]);
+
+	return br_cfm_cc_ccm_tx(br,
+				nla_get_u32(attrs[IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE]),
+				&info, extack);
+}
+
+/* Parse a nested IFLA_BRIDGE_CFM attribute and dispatch each sub-command
+ * that is present to its dedicated parser.
+ *
+ * The sub-commands are processed in a fixed order: MEP_CREATE, MEP_DELETE,
+ * MEP_CONFIG, CC_CONFIG, CC_PEER_MEP_ADD, CC_PEER_MEP_REMOVE, CC_RDI,
+ * CC_CCM_TX. Processing stops at the first one that fails and its error is
+ * returned; sub-commands handled before it are not undone here. @cmd is not
+ * used by this function.
+ *
+ * Returns 0 when every present sub-command succeeded.
+ */
+int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p,
+		 struct nlattr *attr, int cmd, struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[IFLA_BRIDGE_CFM_MAX + 1];
+	int ret;
+
+	/* When called for a port the br argument is not valid, so take the
+	 * bridge from the port instead.
+	 */
+	if (p)
+		br = p->br;
+
+	ret = nla_parse_nested(tb, IFLA_BRIDGE_CFM_MAX, attr,
+			       br_cfm_policy, extack);
+	if (ret)
+		return ret;
+
+	ret = tb[IFLA_BRIDGE_CFM_MEP_CREATE] ?
+	      br_mep_create_parse(br, tb[IFLA_BRIDGE_CFM_MEP_CREATE],
+				  extack) : 0;
+	if (ret)
+		return ret;
+
+	ret = tb[IFLA_BRIDGE_CFM_MEP_DELETE] ?
+	      br_mep_delete_parse(br, tb[IFLA_BRIDGE_CFM_MEP_DELETE],
+				  extack) : 0;
+	if (ret)
+		return ret;
+
+	ret = tb[IFLA_BRIDGE_CFM_MEP_CONFIG] ?
+	      br_mep_config_parse(br, tb[IFLA_BRIDGE_CFM_MEP_CONFIG],
+				  extack) : 0;
+	if (ret)
+		return ret;
+
+	ret = tb[IFLA_BRIDGE_CFM_CC_CONFIG] ?
+	      br_cc_config_parse(br, tb[IFLA_BRIDGE_CFM_CC_CONFIG],
+				 extack) : 0;
+	if (ret)
+		return ret;
+
+	ret = tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD] ?
+	      br_cc_peer_mep_add_parse(br, tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_ADD],
+				       extack) : 0;
+	if (ret)
+		return ret;
+
+	ret = tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE] ?
+	      br_cc_peer_mep_remove_parse(br, tb[IFLA_BRIDGE_CFM_CC_PEER_MEP_REMOVE],
+					  extack) : 0;
+	if (ret)
+		return ret;
+
+	ret = tb[IFLA_BRIDGE_CFM_CC_RDI] ?
+	      br_cc_rdi_parse(br, tb[IFLA_BRIDGE_CFM_CC_RDI], extack) : 0;
+	if (ret)
+		return ret;
+
+	/* Last sub-command: its result is the overall result. */
+	return tb[IFLA_BRIDGE_CFM_CC_CCM_TX] ?
+	       br_cc_ccm_tx_parse(br, tb[IFLA_BRIDGE_CFM_CC_CCM_TX],
+				  extack) : 0;
+}
+
+/* Dump the CFM configuration of every MEP on @br into @skb.
+ *
+ * For each MEP on br->mep_list this emits, in order, the nested attributes
+ * MEP_CREATE_INFO, MEP_CONFIG_INFO, CC_CONFIG_INFO, CC_RDI_INFO and
+ * CC_CCM_TX_INFO, followed by one CC_PEER_MEP_INFO nest per entry on the
+ * MEP's peer_mep_list.
+ *
+ * Both lists are walked with hlist_for_each_entry_rcu(); the locking that
+ * makes this safe is presumably provided by the caller (not visible in this
+ * function - confirm against the callers).
+ *
+ * Returns 0 on success, or -EMSGSIZE when a nest cannot be opened or an
+ * attribute cannot be added. In the latter case only the nest that was
+ * being filled is cancelled; nests completed earlier stay in @skb.
+ */
+int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br)
+{
+	struct br_cfm_peer_mep *peer_mep;
+	struct br_cfm_mep *mep;
+	struct nlattr *tb;
+
+	hlist_for_each_entry_rcu(mep, &br->mep_list, head) {
+		/* Parameters the MEP was created with. */
+		tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_CREATE_INFO);
+		if (!tb)
+			goto nla_info_failure;
+
+		if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_INSTANCE,
+				mep->instance) ||
+		    nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_DOMAIN,
+				mep->create.domain) ||
+		    nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_DIRECTION,
+				mep->create.direction) ||
+		    nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CREATE_IFINDEX,
+				mep->create.ifindex))
+			goto nla_put_failure;
+
+		nla_nest_end(skb, tb);
+
+		/* MEP configuration. */
+		tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_INFO);
+		if (!tb)
+			goto nla_info_failure;
+
+		if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_INSTANCE,
+				mep->instance) ||
+		    nla_put(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_UNICAST_MAC,
+			    sizeof(mep->config.unicast_mac.addr),
+			    mep->config.unicast_mac.addr) ||
+		    nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_MDLEVEL,
+				mep->config.mdlevel) ||
+		    nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_CONFIG_MEPID,
+				mep->config.mepid))
+			goto nla_put_failure;
+
+		nla_nest_end(skb, tb);
+
+		/* Continuity-check configuration. */
+		tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_CONFIG_INFO);
+		if (!tb)
+			goto nla_info_failure;
+
+		if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_INSTANCE,
+				mep->instance) ||
+		    nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_ENABLE,
+				mep->cc_config.enable) ||
+		    nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CONFIG_EXP_INTERVAL,
+				mep->cc_config.exp_interval) ||
+		    nla_put(skb, IFLA_BRIDGE_CFM_CC_CONFIG_EXP_MAID,
+			    sizeof(mep->cc_config.exp_maid.data),
+			    mep->cc_config.exp_maid.data))
+			goto nla_put_failure;
+
+		nla_nest_end(skb, tb);
+
+		/* RDI setting. */
+		tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_RDI_INFO);
+		if (!tb)
+			goto nla_info_failure;
+
+		if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_RDI_INSTANCE,
+				mep->instance) ||
+		    nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_RDI_RDI,
+				mep->rdi))
+			goto nla_put_failure;
+
+		nla_nest_end(skb, tb);
+
+		/* CCM transmission parameters. */
+		tb = nla_nest_start(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_INFO);
+		if (!tb)
+			goto nla_info_failure;
+
+		/* The DMAC length is taken from the addr array itself, the
+		 * buffer actually copied, matching the UNICAST_MAC attribute
+		 * above and the size used when the attribute is parsed.
+		 */
+		if (nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_INSTANCE,
+				mep->instance) ||
+		    nla_put(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_DMAC,
+			    sizeof(mep->cc_ccm_tx_info.dmac.addr),
+			    mep->cc_ccm_tx_info.dmac.addr) ||
+		    nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_SEQ_NO_UPDATE,
+				mep->cc_ccm_tx_info.seq_no_update) ||
+		    nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PERIOD,
+				mep->cc_ccm_tx_info.period) ||
+		    nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV,
+				mep->cc_ccm_tx_info.if_tlv) ||
+		    nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_IF_TLV_VALUE,
+			       mep->cc_ccm_tx_info.if_tlv_value) ||
+		    nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV,
+				mep->cc_ccm_tx_info.port_tlv) ||
+		    nla_put_u8(skb, IFLA_BRIDGE_CFM_CC_CCM_TX_PORT_TLV_VALUE,
+			       mep->cc_ccm_tx_info.port_tlv_value))
+			goto nla_put_failure;
+
+		nla_nest_end(skb, tb);
+
+		/* One nest per peer MEP attached to this MEP. */
+		hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) {
+			tb = nla_nest_start(skb,
+					    IFLA_BRIDGE_CFM_CC_PEER_MEP_INFO);
+			if (!tb)
+				goto nla_info_failure;
+
+			if (nla_put_u32(skb,
+					IFLA_BRIDGE_CFM_CC_PEER_MEP_INSTANCE,
+					mep->instance) ||
+			    nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_MEPID,
+					peer_mep->mepid))
+				goto nla_put_failure;
+
+			nla_nest_end(skb, tb);
+		}
+	}
+
+	return 0;
+
+nla_put_failure:
+	/* Drop the partially filled nest that tb currently points at. */
+	nla_nest_cancel(skb, tb);
+
+nla_info_failure:
+	return -EMSGSIZE;
+}
+
+/* Dump the CFM status of every MEP on @br, and of each of its peer MEPs,
+ * into @skb.
+ *
+ * Each MEP yields one MEP_STATUS_INFO nest followed by one
+ * CC_PEER_STATUS_INFO nest per peer MEP. When @getlink is true the "seen"
+ * indications are reset to false once they have been written to the
+ * message, before the enclosing nest is closed.
+ *
+ * Returns 0 on success, or -EMSGSIZE when a nest cannot be opened or an
+ * attribute cannot be added; in the latter case the nest being filled is
+ * cancelled.
+ */
+int br_cfm_status_fill_info(struct sk_buff *skb,
+			    struct net_bridge *br,
+			    bool getlink)
+{
+	struct br_cfm_peer_mep *peer;
+	struct br_cfm_mep *mep;
+	struct nlattr *nest;
+
+	hlist_for_each_entry_rcu(mep, &br->mep_list, head) {
+		nest = nla_nest_start(skb, IFLA_BRIDGE_CFM_MEP_STATUS_INFO);
+		if (!nest)
+			goto err_out;
+
+		if (nla_put_u32(skb, IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE,
+				mep->instance) ||
+		    nla_put_u32(skb,
+				IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN,
+				mep->status.opcode_unexp_seen) ||
+		    nla_put_u32(skb,
+				IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN,
+				mep->status.version_unexp_seen) ||
+		    nla_put_u32(skb,
+				IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN,
+				mep->status.rx_level_low_seen))
+			goto err_cancel;
+
+		/* The 'seen' indications are only reset for a GETLINK. */
+		if (getlink) {
+			mep->status.opcode_unexp_seen = false;
+			mep->status.version_unexp_seen = false;
+			mep->status.rx_level_low_seen = false;
+		}
+
+		nla_nest_end(skb, nest);
+
+		hlist_for_each_entry_rcu(peer, &mep->peer_mep_list, head) {
+			nest = nla_nest_start(skb,
+					      IFLA_BRIDGE_CFM_CC_PEER_STATUS_INFO);
+			if (!nest)
+				goto err_out;
+
+			if (nla_put_u32(skb,
+					IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE,
+					mep->instance) ||
+			    nla_put_u32(skb,
+					IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID,
+					peer->mepid) ||
+			    nla_put_u32(skb,
+					IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT,
+					peer->cc_status.ccm_defect) ||
+			    nla_put_u32(skb, IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI,
+					peer->cc_status.rdi) ||
+			    nla_put_u8(skb,
+				       IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE,
+				       peer->cc_status.port_tlv_value) ||
+			    nla_put_u8(skb,
+				       IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE,
+				       peer->cc_status.if_tlv_value) ||
+			    nla_put_u32(skb,
+					IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN,
+					peer->cc_status.seen) ||
+			    nla_put_u32(skb,
+					IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN,
+					peer->cc_status.tlv_seen) ||
+			    nla_put_u32(skb,
+					IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN,
+					peer->cc_status.seq_unexp_seen))
+				goto err_cancel;
+
+			/* Same rule for the per-peer 'seen' indications. */
+			if (getlink) {
+				peer->cc_status.seen = false;
+				peer->cc_status.tlv_seen = false;
+				peer->cc_status.seq_unexp_seen = false;
+			}
+
+			nla_nest_end(skb, nest);
+		}
+	}
+
+	return 0;
+
+err_cancel:
+	/* Discard the nest that could not be completed. */
+	nla_nest_cancel(skb, nest);
+
+err_out:
+	return -EMSGSIZE;
+}
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index e682a668ce57..a818fdc22da9 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Device handling code
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -20,6 +16,8 @@
#include <linux/netfilter_bridge.h>
#include <linux/uaccess.h>
+#include <net/netdev_lock.h>
+
#include "br_private.h"
#define COMMON_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | \
@@ -28,20 +26,29 @@
const struct nf_br_ops __rcu *nf_br_ops __read_mostly;
EXPORT_SYMBOL_GPL(nf_br_ops);
-static struct lock_class_key bridge_netdev_addr_lock_key;
-
/* net device transmit always called with BH disabled */
netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
+ enum skb_drop_reason reason = pskb_may_pull_reason(skb, ETH_HLEN);
+ struct net_bridge_mcast_port *pmctx_null = NULL;
struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_mcast *brmctx = &br->multicast_ctx;
struct net_bridge_fdb_entry *dst;
struct net_bridge_mdb_entry *mdst;
- struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
const struct nf_br_ops *nf_ops;
+ u8 state = BR_STATE_FORWARDING;
+ struct net_bridge_vlan *vlan;
const unsigned char *dest;
- struct ethhdr *eth;
u16 vid = 0;
+ if (unlikely(reason != SKB_NOT_DROPPED_YET)) {
+ kfree_skb_reason(skb, reason);
+ return NETDEV_TX_OK;
+ }
+
+ memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
+ br_tc_skb_miss_set(skb, false);
+
rcu_read_lock();
nf_ops = rcu_dereference(nf_br_ops);
if (nf_ops && nf_ops->br_dev_xmit_hook(skb)) {
@@ -49,29 +56,27 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
- u64_stats_update_begin(&brstats->syncp);
- brstats->tx_packets++;
- brstats->tx_bytes += skb->len;
- u64_stats_update_end(&brstats->syncp);
+ dev_sw_netstats_tx_add(dev, 1, skb->len);
br_switchdev_frame_unmark(skb);
BR_INPUT_SKB_CB(skb)->brdev = dev;
+ BR_INPUT_SKB_CB(skb)->frag_max_size = 0;
skb_reset_mac_header(skb);
- eth = eth_hdr(skb);
skb_pull(skb, ETH_HLEN);
- if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid))
+ if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid,
+ &state, &vlan))
goto out;
if (IS_ENABLED(CONFIG_INET) &&
- (eth->h_proto == htons(ETH_P_ARP) ||
- eth->h_proto == htons(ETH_P_RARP)) &&
- br->neigh_suppress_enabled) {
+ (eth_hdr(skb)->h_proto == htons(ETH_P_ARP) ||
+ eth_hdr(skb)->h_proto == htons(ETH_P_RARP)) &&
+ br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED)) {
br_do_proxy_suppress_arp(skb, br, vid, NULL);
} else if (IS_ENABLED(CONFIG_IPV6) &&
skb->protocol == htons(ETH_P_IPV6) &&
- br->neigh_suppress_enabled &&
+ br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
pskb_may_pull(skb, sizeof(struct ipv6hdr) +
sizeof(struct nd_msg)) &&
ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
@@ -84,69 +89,65 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
dest = eth_hdr(skb)->h_dest;
if (is_broadcast_ether_addr(dest)) {
- br_flood(br, skb, BR_PKT_BROADCAST, false, true);
+ br_flood(br, skb, BR_PKT_BROADCAST, false, true, vid);
} else if (is_multicast_ether_addr(dest)) {
if (unlikely(netpoll_tx_running(dev))) {
- br_flood(br, skb, BR_PKT_MULTICAST, false, true);
+ br_flood(br, skb, BR_PKT_MULTICAST, false, true, vid);
goto out;
}
- if (br_multicast_rcv(br, NULL, skb, vid)) {
+ if (br_multicast_rcv(&brmctx, &pmctx_null, vlan, skb, vid)) {
kfree_skb(skb);
goto out;
}
- mdst = br_mdb_get(br, skb, vid);
+ mdst = br_mdb_entry_skb_get(brmctx, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
- br_multicast_querier_exists(br, eth_hdr(skb)))
- br_multicast_flood(mdst, skb, false, true);
+ br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst))
+ br_multicast_flood(mdst, skb, brmctx, false, true);
else
- br_flood(br, skb, BR_PKT_MULTICAST, false, true);
+ br_flood(br, skb, BR_PKT_MULTICAST, false, true, vid);
} else if ((dst = br_fdb_find_rcu(br, dest, vid)) != NULL) {
br_forward(dst->dst, skb, false, true);
} else {
- br_flood(br, skb, BR_PKT_UNICAST, false, true);
+ br_flood(br, skb, BR_PKT_UNICAST, false, true, vid);
}
out:
rcu_read_unlock();
return NETDEV_TX_OK;
}
-static void br_set_lockdep_class(struct net_device *dev)
-{
- lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key);
-}
-
static int br_dev_init(struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
int err;
- br->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
- if (!br->stats)
- return -ENOMEM;
-
err = br_fdb_hash_init(br);
+ if (err)
+ return err;
+
+ err = br_mdb_hash_init(br);
if (err) {
- free_percpu(br->stats);
+ br_fdb_hash_fini(br);
return err;
}
err = br_vlan_init(br);
if (err) {
- free_percpu(br->stats);
+ br_mdb_hash_fini(br);
br_fdb_hash_fini(br);
return err;
}
err = br_multicast_init_stats(br);
if (err) {
- free_percpu(br->stats);
br_vlan_flush(br);
+ br_mdb_hash_fini(br);
br_fdb_hash_fini(br);
+ return err;
}
- br_set_lockdep_class(dev);
- return err;
+ netdev_lockdep_set_classes(dev);
+ return 0;
}
static void br_dev_uninit(struct net_device *dev)
@@ -156,8 +157,8 @@ static void br_dev_uninit(struct net_device *dev)
br_multicast_dev_del(br);
br_multicast_uninit_stats(br);
br_vlan_flush(br);
+ br_mdb_hash_fini(br);
br_fdb_hash_fini(br);
- free_percpu(br->stats);
}
static int br_dev_open(struct net_device *dev)
@@ -169,6 +170,9 @@ static int br_dev_open(struct net_device *dev)
br_stp_enable_bridge(br);
br_multicast_open(br);
+ if (br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ br_multicast_join_snoopers(br);
+
return 0;
}
@@ -189,46 +193,22 @@ static int br_dev_stop(struct net_device *dev)
br_stp_disable_bridge(br);
br_multicast_stop(br);
+ if (br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ br_multicast_leave_snoopers(br);
+
netif_stop_queue(dev);
return 0;
}
-static void br_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *stats)
-{
- struct net_bridge *br = netdev_priv(dev);
- struct pcpu_sw_netstats tmp, sum = { 0 };
- unsigned int cpu;
-
- for_each_possible_cpu(cpu) {
- unsigned int start;
- const struct pcpu_sw_netstats *bstats
- = per_cpu_ptr(br->stats, cpu);
- do {
- start = u64_stats_fetch_begin_irq(&bstats->syncp);
- memcpy(&tmp, bstats, sizeof(tmp));
- } while (u64_stats_fetch_retry_irq(&bstats->syncp, start));
- sum.tx_bytes += tmp.tx_bytes;
- sum.tx_packets += tmp.tx_packets;
- sum.rx_bytes += tmp.rx_bytes;
- sum.rx_packets += tmp.rx_packets;
- }
-
- stats->tx_bytes = sum.tx_bytes;
- stats->tx_packets = sum.tx_packets;
- stats->rx_bytes = sum.rx_bytes;
- stats->rx_packets = sum.rx_packets;
-}
-
static int br_change_mtu(struct net_device *dev, int new_mtu)
{
struct net_bridge *br = netdev_priv(dev);
- dev->mtu = new_mtu;
+ WRITE_ONCE(dev->mtu, new_mtu);
/* this flag will be cleared if the MTU was automatically adjusted */
- br->mtu_set_by_user = true;
+ br_opt_toggle(br, BROPT_MTU_SET_BY_USER, true);
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/* remember the MTU in the rtable for PMTU */
dst_metric_set(&br->fake_rtable.dst, RTAX_MTU, new_mtu);
@@ -246,6 +226,12 @@ static int br_set_mac_address(struct net_device *dev, void *p)
if (!is_valid_ether_addr(addr->sa_data))
return -EADDRNOTAVAIL;
+ /* dev_set_mac_addr() can be called by a master device on bridge's
+ * NETDEV_UNREGISTER, but since it's being destroyed do nothing
+ */
+ if (dev->reg_state != NETREG_REGISTERED)
+ return -EBUSY;
+
spin_lock_bh(&br->lock);
if (!ether_addr_equal(dev->dev_addr, addr->sa_data)) {
/* Mac address will be changed in br_stp_change_bridge_id(). */
@@ -258,10 +244,41 @@ static int br_set_mac_address(struct net_device *dev, void *p)
static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info)
{
- strlcpy(info->driver, "bridge", sizeof(info->driver));
- strlcpy(info->version, BR_VERSION, sizeof(info->version));
- strlcpy(info->fw_version, "N/A", sizeof(info->fw_version));
- strlcpy(info->bus_info, "N/A", sizeof(info->bus_info));
+ strscpy(info->driver, "bridge", sizeof(info->driver));
+ strscpy(info->version, BR_VERSION, sizeof(info->version));
+ strscpy(info->fw_version, "N/A", sizeof(info->fw_version));
+ strscpy(info->bus_info, "N/A", sizeof(info->bus_info));
+}
+
+/* ethtool get_link_ksettings handler for the bridge device.
+ *
+ * Duplex is reported as unknown and the port type as PORT_OTHER. The speed
+ * reported is the highest known speed among the bridge ports that are
+ * running and operationally up; it stays SPEED_UNKNOWN when no such port
+ * reports a speed. Always returns 0.
+ */
+static int br_get_link_ksettings(struct net_device *dev,
+				 struct ethtool_link_ksettings *cmd)
+{
+	struct net_bridge *br = netdev_priv(dev);
+	struct net_bridge_port *port;
+
+	cmd->base.speed = SPEED_UNKNOWN;
+	cmd->base.duplex = DUPLEX_UNKNOWN;
+	cmd->base.port = PORT_OTHER;
+
+	list_for_each_entry(port, &br->port_list, list) {
+		struct ethtool_link_ksettings port_cmd;
+		struct net_device *port_dev = port->dev;
+
+		/* Skip ports that are down, cannot be queried, or do not
+		 * know their speed.
+		 */
+		if (!netif_running(port_dev) || !netif_oper_up(port_dev) ||
+		    __ethtool_get_link_ksettings(port_dev, &port_cmd) ||
+		    port_cmd.base.speed == (__u32)SPEED_UNKNOWN)
+			continue;
+
+		/* Keep the largest speed seen so far. */
+		if (cmd->base.speed == (__u32)SPEED_UNKNOWN ||
+		    port_cmd.base.speed > cmd->base.speed)
+			cmd->base.speed = port_cmd.base.speed;
+	}
+
+	return 0;
}
static netdev_features_t br_fix_features(struct net_device *dev,
@@ -313,7 +330,7 @@ int br_netpoll_enable(struct net_bridge_port *p)
return __br_netpoll_enable(p);
}
-static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni)
+static int br_netpoll_setup(struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
struct net_bridge_port *p;
@@ -344,7 +361,7 @@ void br_netpoll_disable(struct net_bridge_port *p)
p->np = NULL;
- __netpoll_free_async(np);
+ __netpoll_free(np);
}
#endif
@@ -365,9 +382,58 @@ static int br_del_slave(struct net_device *dev, struct net_device *slave_dev)
return br_del_if(br, slave_dev);
}
+/* ndo_fill_forward_path handler: resolve the bridge hop of a forwarding
+ * path for the destination address in @ctx.
+ *
+ * Looks up ctx->daddr in the FDB (using the VLAN id filled in by
+ * br_vlan_fill_forward_path_pvid()), records the bridge device in @path,
+ * advances ctx->dev to the egress port's device and updates the VLAN stack
+ * kept in @ctx according to path->bridge.vlan_mode.
+ *
+ * Returns 0 on success, -1 when ctx->dev is itself a bridge port, when no
+ * FDB entry or destination port is found, or when
+ * br_vlan_fill_forward_path_mode() fails, and -ENOSPC when a VLAN tag would
+ * have to be pushed but ctx->vlan is already full.
+ */
+static int br_fill_forward_path(struct net_device_path_ctx *ctx,
+				struct net_device_path *path)
+{
+	struct net_bridge_fdb_entry *fdb;
+	struct net_bridge_port *port;
+	struct net_bridge *br;
+
+	if (netif_is_bridge_port(ctx->dev))
+		return -1;
+
+	br = netdev_priv(ctx->dev);
+	br_vlan_fill_forward_path_pvid(br, ctx, path);
+
+	fdb = br_fdb_find_rcu(br, ctx->daddr, path->bridge.vlan_id);
+	port = fdb ? READ_ONCE(fdb->dst) : NULL;
+	if (!port)
+		return -1;
+
+	if (br_vlan_fill_forward_path_mode(br, port, path))
+		return -1;
+
+	path->type = DEV_PATH_BRIDGE;
+	path->dev = port->br->dev;
+	ctx->dev = port->dev;
+
+	switch (path->bridge.vlan_mode) {
+	case DEV_PATH_BR_VLAN_TAG:
+		/* Push one more tag onto the context's VLAN stack. */
+		if (ctx->num_vlans >= ARRAY_SIZE(ctx->vlan))
+			return -ENOSPC;
+		ctx->vlan[ctx->num_vlans].id = path->bridge.vlan_id;
+		ctx->vlan[ctx->num_vlans].proto = path->bridge.vlan_proto;
+		ctx->num_vlans++;
+		break;
+	case DEV_PATH_BR_VLAN_UNTAG_HW:
+	case DEV_PATH_BR_VLAN_UNTAG:
+		/* Pop the topmost tag. */
+		ctx->num_vlans--;
+		break;
+	case DEV_PATH_BR_VLAN_KEEP:
+		break;
+	}
+
+	return 0;
+}
+
static const struct ethtool_ops br_ethtool_ops = {
- .get_drvinfo = br_getinfo,
- .get_link = ethtool_op_get_link,
+ .get_drvinfo = br_getinfo,
+ .get_link = ethtool_op_get_link,
+ .get_link_ksettings = br_get_link_ksettings,
};
static const struct net_device_ops br_netdev_ops = {
@@ -376,12 +442,12 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_init = br_dev_init,
.ndo_uninit = br_dev_uninit,
.ndo_start_xmit = br_dev_xmit,
- .ndo_get_stats64 = br_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_set_mac_address = br_set_mac_address,
.ndo_set_rx_mode = br_dev_set_multicast_list,
.ndo_change_rx_flags = br_dev_change_rx_flags,
.ndo_change_mtu = br_change_mtu,
- .ndo_do_ioctl = br_dev_ioctl,
+ .ndo_siocdevprivate = br_dev_siocdevprivate,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_netpoll_setup = br_netpoll_setup,
.ndo_netpoll_cleanup = br_netpoll_cleanup,
@@ -392,14 +458,22 @@ static const struct net_device_ops br_netdev_ops = {
.ndo_fix_features = br_fix_features,
.ndo_fdb_add = br_fdb_add,
.ndo_fdb_del = br_fdb_delete,
+ .ndo_fdb_del_bulk = br_fdb_delete_bulk,
.ndo_fdb_dump = br_fdb_dump,
+ .ndo_fdb_get = br_fdb_get,
+ .ndo_mdb_add = br_mdb_add,
+ .ndo_mdb_del = br_mdb_del,
+ .ndo_mdb_del_bulk = br_mdb_del_bulk,
+ .ndo_mdb_dump = br_mdb_dump,
+ .ndo_mdb_get = br_mdb_get,
.ndo_bridge_getlink = br_getlink,
.ndo_bridge_setlink = br_setlink,
.ndo_bridge_dellink = br_dellink,
.ndo_features_check = passthru_features_check,
+ .ndo_fill_forward_path = br_fill_forward_path,
};
-static struct device_type br_type = {
+static const struct device_type br_type = {
.name = "bridge",
};
@@ -415,17 +489,27 @@ void br_dev_setup(struct net_device *dev)
dev->ethtool_ops = &br_ethtool_ops;
SET_NETDEV_DEVTYPE(dev, &br_type);
dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE;
+ dev->lltx = true;
+ dev->netns_immutable = true;
- dev->features = COMMON_FEATURES | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL |
- NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
+ dev->features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX;
dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX;
dev->vlan_features = COMMON_FEATURES;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
br->dev = dev;
spin_lock_init(&br->lock);
INIT_LIST_HEAD(&br->port_list);
INIT_HLIST_HEAD(&br->fdb_list);
+ INIT_HLIST_HEAD(&br->frame_type_list);
+#if IS_ENABLED(CONFIG_BRIDGE_MRP)
+ INIT_HLIST_HEAD(&br->mrp_list);
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_CFM)
+ INIT_HLIST_HEAD(&br->mep_list);
+#endif
spin_lock_init(&br->hash_lock);
br->bridge_id.prio[0] = 0x80;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 502f66349530..58d22e2b85fc 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Forwarding database
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -22,7 +18,7 @@
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/atomic.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/if_vlan.h>
#include <net/switchdev.h>
#include <trace/events/bridge.h>
@@ -33,21 +29,13 @@ static const struct rhashtable_params br_fdb_rht_params = {
.key_offset = offsetof(struct net_bridge_fdb_entry, key),
.key_len = sizeof(struct net_bridge_fdb_key),
.automatic_shrinking = true,
- .locks_mul = 1,
};
static struct kmem_cache *br_fdb_cache __read_mostly;
-static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
- const unsigned char *addr, u16 vid);
-static void fdb_notify(struct net_bridge *br,
- const struct net_bridge_fdb_entry *, int, bool);
int __init br_fdb_init(void)
{
- br_fdb_cache = kmem_cache_create("bridge_fdb_cache",
- sizeof(struct net_bridge_fdb_entry),
- 0,
- SLAB_HWCACHE_ALIGN, NULL);
+ br_fdb_cache = KMEM_CACHE(net_bridge_fdb_entry, SLAB_HWCACHE_ALIGN);
if (!br_fdb_cache)
return -ENOMEM;
@@ -80,15 +68,138 @@ static inline unsigned long hold_time(const struct net_bridge *br)
static inline int has_expired(const struct net_bridge *br,
const struct net_bridge_fdb_entry *fdb)
{
- return !fdb->is_static && !fdb->added_by_external_learn &&
- time_before_eq(fdb->updated + hold_time(br), jiffies);
+ return !test_bit(BR_FDB_STATIC, &fdb->flags) &&
+ !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags) &&
+ time_before_eq(fdb->updated + hold_time(br), jiffies);
}
-static void fdb_rcu_free(struct rcu_head *head)
+static int fdb_to_nud(const struct net_bridge *br,
+ const struct net_bridge_fdb_entry *fdb)
{
- struct net_bridge_fdb_entry *ent
- = container_of(head, struct net_bridge_fdb_entry, rcu);
- kmem_cache_free(br_fdb_cache, ent);
+ if (test_bit(BR_FDB_LOCAL, &fdb->flags))
+ return NUD_PERMANENT;
+ else if (test_bit(BR_FDB_STATIC, &fdb->flags))
+ return NUD_NOARP;
+ else if (has_expired(br, fdb))
+ return NUD_STALE;
+ else
+ return NUD_REACHABLE;
+}
+
+/* Build one neighbour netlink message describing @fdb in @skb.
+ *
+ * The message carries an ndmsg header followed by NDA_LLADDR, NDA_MASTER,
+ * NDA_FLAGS_EXT and NDA_CACHEINFO, plus NDA_VLAN when the entry has a
+ * non-zero VLAN id and a nested NDA_FDB_EXT_ATTRS when the entry has
+ * BR_FDB_NOTIFY set.
+ *
+ * Returns 0 on success or -EMSGSIZE if the message does not fit, in which
+ * case the partially built message is cancelled.
+ */
+static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
+			 const struct net_bridge_fdb_entry *fdb,
+			 u32 portid, u32 seq, int type, unsigned int flags)
+{
+	const struct net_bridge_port *dst = READ_ONCE(fdb->dst);
+	unsigned long now = jiffies;
+	struct nda_cacheinfo ci;
+	struct nlmsghdr *nlh;
+	struct ndmsg *ndm;
+	u32 ext_flags = 0;
+
+	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
+	if (!nlh)
+		return -EMSGSIZE;
+
+	ndm = nlmsg_data(nlh);
+	ndm->ndm_family = AF_BRIDGE;
+	ndm->ndm_pad1 = 0;
+	ndm->ndm_pad2 = 0;
+	ndm->ndm_flags = 0;
+	ndm->ndm_type = 0;
+	/* Entries without a destination port are reported on the bridge. */
+	ndm->ndm_ifindex = dst ? dst->dev->ifindex : br->dev->ifindex;
+	ndm->ndm_state = fdb_to_nud(br, fdb);
+
+	/* Translate the entry's flag bits into their netlink counterparts. */
+	if (test_bit(BR_FDB_OFFLOADED, &fdb->flags))
+		ndm->ndm_flags |= NTF_OFFLOADED;
+	if (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags))
+		ndm->ndm_flags |= NTF_EXT_LEARNED;
+	if (test_bit(BR_FDB_STICKY, &fdb->flags))
+		ndm->ndm_flags |= NTF_STICKY;
+	if (test_bit(BR_FDB_LOCKED, &fdb->flags))
+		ext_flags |= NTF_EXT_LOCKED;
+
+	if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr) ||
+	    nla_put_u32(skb, NDA_MASTER, br->dev->ifindex) ||
+	    nla_put_u32(skb, NDA_FLAGS_EXT, ext_flags))
+		goto cancel_msg;
+
+	/* Ages are reported relative to the jiffies snapshot taken above. */
+	ci.ndm_used = jiffies_to_clock_t(now - fdb->used);
+	ci.ndm_confirmed = 0;
+	ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated);
+	ci.ndm_refcnt = 0;
+	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
+		goto cancel_msg;
+
+	if (fdb->key.vlan_id &&
+	    nla_put(skb, NDA_VLAN, sizeof(u16), &fdb->key.vlan_id))
+		goto cancel_msg;
+
+	if (test_bit(BR_FDB_NOTIFY, &fdb->flags)) {
+		struct nlattr *ext = nla_nest_start(skb, NDA_FDB_EXT_ATTRS);
+		u8 bits = FDB_NOTIFY_BIT;
+
+		if (!ext)
+			goto cancel_msg;
+
+		if (test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags))
+			bits |= FDB_NOTIFY_INACTIVE_BIT;
+
+		if (nla_put_u8(skb, NFEA_ACTIVITY_NOTIFY, bits)) {
+			nla_nest_cancel(skb, ext);
+			goto cancel_msg;
+		}
+
+		nla_nest_end(skb, ext);
+	}
+
+	nlmsg_end(skb, nlh);
+	return 0;
+
+cancel_msg:
+	nlmsg_cancel(skb, nlh);
+	return -EMSGSIZE;
+}
+
+/* Upper bound on the size of a message built by fdb_fill_info(): the
+ * aligned ndmsg header plus every attribute that function may emit.
+ */
+static inline size_t fdb_nlmsg_size(void)
+{
+	size_t size = NLMSG_ALIGN(sizeof(struct ndmsg));
+
+	size += nla_total_size(ETH_ALEN);		/* NDA_LLADDR */
+	size += nla_total_size(sizeof(u32));		/* NDA_MASTER */
+	size += nla_total_size(sizeof(u32));		/* NDA_FLAGS_EXT */
+	size += nla_total_size(sizeof(u16));		/* NDA_VLAN */
+	size += nla_total_size(sizeof(struct nda_cacheinfo));
+	size += nla_total_size(0);			/* NDA_FDB_EXT_ATTRS */
+	size += nla_total_size(sizeof(u8));		/* NFEA_ACTIVITY_NOTIFY */
+
+	return size;
+}
+
+/* Announce a change of @fdb (message type @type) to RTNLGRP_NEIGH
+ * listeners, and first to switchdev when @swdev_notify is set.
+ *
+ * If the message cannot be allocated or filled, the error is recorded on
+ * the group with rtnl_set_sk_err() instead of a notification being sent.
+ */
+static void fdb_notify(struct net_bridge *br,
+		       const struct net_bridge_fdb_entry *fdb, int type,
+		       bool swdev_notify)
+{
+	struct net *net = dev_net(br->dev);
+	struct sk_buff *skb;
+	int err;
+
+	if (swdev_notify)
+		br_switchdev_fdb_notify(br, fdb, type);
+
+	skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC);
+	if (!skb) {
+		rtnl_set_sk_err(net, RTNLGRP_NEIGH, -ENOBUFS);
+		return;
+	}
+
+	err = fdb_fill_info(skb, br, fdb, 0, 0, type, 0);
+	if (err < 0) {
+		/* -EMSGSIZE implies BUG in fdb_nlmsg_size() */
+		WARN_ON(err == -EMSGSIZE);
+		kfree_skb(skb);
+		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+		return;
+	}
+
+	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
}
static struct net_bridge_fdb_entry *fdb_find_rcu(struct rhashtable *tbl,
@@ -202,17 +313,24 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f,
{
trace_fdb_delete(br, f);
- if (f->is_static)
+ if (test_bit(BR_FDB_STATIC, &f->flags))
fdb_del_hw_addr(br, f->key.addr.addr);
hlist_del_init_rcu(&f->fdb_node);
rhashtable_remove_fast(&br->fdb_hash_tbl, &f->rhnode,
br_fdb_rht_params);
+ if (test_and_clear_bit(BR_FDB_DYNAMIC_LEARNED, &f->flags))
+ atomic_dec(&br->fdb_n_learned);
fdb_notify(br, f, RTM_DELNEIGH, swdev_notify);
- call_rcu(&f->rcu, fdb_rcu_free);
+ kfree_rcu(f, rcu);
}
-/* Delete a local entry if no other port had the same address. */
+/* Delete a local entry if no other port had the same address.
+ *
+ * This function should only be called on entries with BR_FDB_LOCAL set,
+ * so even with BR_FDB_ADDED_BY_USER cleared we never need to increase
+ * the accounting for dynamically learned entries again.
+ */
static void fdb_delete_local(struct net_bridge *br,
const struct net_bridge_port *p,
struct net_bridge_fdb_entry *f)
@@ -229,7 +347,7 @@ static void fdb_delete_local(struct net_bridge *br,
if (op != p && ether_addr_equal(op->dev->dev_addr, addr) &&
(!vid || br_vlan_find(vg, vid))) {
f->dst = op;
- f->added_by_user = 0;
+ clear_bit(BR_FDB_ADDED_BY_USER, &f->flags);
return;
}
}
@@ -240,7 +358,7 @@ static void fdb_delete_local(struct net_bridge *br,
if (p && ether_addr_equal(br->dev->dev_addr, addr) &&
(!vid || (v && br_vlan_should_use(v)))) {
f->dst = NULL;
- f->added_by_user = 0;
+ clear_bit(BR_FDB_ADDED_BY_USER, &f->flags);
return;
}
@@ -255,39 +373,118 @@ void br_fdb_find_delete_local(struct net_bridge *br,
spin_lock_bh(&br->hash_lock);
f = br_fdb_find(br, addr, vid);
- if (f && f->is_local && !f->added_by_user && f->dst == p)
+ if (f && test_bit(BR_FDB_LOCAL, &f->flags) &&
+ !test_bit(BR_FDB_ADDED_BY_USER, &f->flags) && f->dst == p)
fdb_delete_local(br, p, f);
spin_unlock_bh(&br->hash_lock);
}
+static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br,
+ struct net_bridge_port *source,
+ const unsigned char *addr,
+ __u16 vid,
+ unsigned long flags)
+{
+ bool learned = !test_bit(BR_FDB_ADDED_BY_USER, &flags) &&
+ !test_bit(BR_FDB_LOCAL, &flags);
+ u32 max_learned = READ_ONCE(br->fdb_max_learned);
+ struct net_bridge_fdb_entry *fdb;
+ int err;
+
+ if (likely(learned)) {
+ int n_learned = atomic_read(&br->fdb_n_learned);
+
+ if (unlikely(max_learned && n_learned >= max_learned))
+ return NULL;
+ __set_bit(BR_FDB_DYNAMIC_LEARNED, &flags);
+ }
+
+ fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
+ if (!fdb)
+ return NULL;
+
+ memcpy(fdb->key.addr.addr, addr, ETH_ALEN);
+ WRITE_ONCE(fdb->dst, source);
+ fdb->key.vlan_id = vid;
+ fdb->flags = flags;
+ fdb->updated = fdb->used = jiffies;
+ err = rhashtable_lookup_insert_fast(&br->fdb_hash_tbl, &fdb->rhnode,
+ br_fdb_rht_params);
+ if (err) {
+ kmem_cache_free(br_fdb_cache, fdb);
+ return NULL;
+ }
+
+ if (likely(learned))
+ atomic_inc(&br->fdb_n_learned);
+
+ hlist_add_head_rcu(&fdb->fdb_node, &br->fdb_list);
+
+ return fdb;
+}
+
+static int fdb_add_local(struct net_bridge *br, struct net_bridge_port *source,
+ const unsigned char *addr, u16 vid)
+{
+ struct net_bridge_fdb_entry *fdb;
+
+ if (!is_valid_ether_addr(addr))
+ return -EINVAL;
+
+ fdb = br_fdb_find(br, addr, vid);
+ if (fdb) {
+ /* it is okay to have multiple ports with same
+ * address, just use the first one.
+ */
+ if (test_bit(BR_FDB_LOCAL, &fdb->flags))
+ return 0;
+ br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n",
+ source ? source->dev->name : br->dev->name, addr, vid);
+ fdb_delete(br, fdb, true);
+ }
+
+ fdb = fdb_create(br, source, addr, vid,
+ BIT(BR_FDB_LOCAL) | BIT(BR_FDB_STATIC));
+ if (!fdb)
+ return -ENOMEM;
+
+ fdb_add_hw_addr(br, addr);
+ fdb_notify(br, fdb, RTM_NEWNEIGH, true);
+ return 0;
+}
+
void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
{
struct net_bridge_vlan_group *vg;
struct net_bridge_fdb_entry *f;
struct net_bridge *br = p->br;
struct net_bridge_vlan *v;
+ bool local_vlan_0;
+
+ local_vlan_0 = br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0);
spin_lock_bh(&br->hash_lock);
vg = nbp_vlan_group(p);
hlist_for_each_entry(f, &br->fdb_list, fdb_node) {
- if (f->dst == p && f->is_local && !f->added_by_user) {
+ if (f->dst == p && test_bit(BR_FDB_LOCAL, &f->flags) &&
+ !test_bit(BR_FDB_ADDED_BY_USER, &f->flags)) {
/* delete old one */
fdb_delete_local(br, p, f);
- /* if this port has no vlan information
- * configured, we can safely be done at
- * this point.
+ /* if this port has no vlan information configured, or
+ * local entries are only kept on VLAN 0, we can safely
+ * be done at this point.
*/
- if (!vg || !vg->num_vlans)
+ if (!vg || !vg->num_vlans || local_vlan_0)
goto insert;
}
}
insert:
/* insert new address, may fail if invalid address or dup. */
- fdb_insert(br, p, newaddr, 0);
+ fdb_add_local(br, p, newaddr, 0);
- if (!vg || !vg->num_vlans)
+ if (!vg || !vg->num_vlans || local_vlan_0)
goto done;
/* Now add entries for every VLAN configured on the port.
@@ -295,7 +492,7 @@ insert:
* from under us.
*/
list_for_each_entry(v, &vg->vlan_list, vlist)
- fdb_insert(br, p, newaddr, v->vid);
+ fdb_add_local(br, p, newaddr, v->vid);
done:
spin_unlock_bh(&br->hash_lock);
@@ -306,17 +503,21 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
struct net_bridge_vlan_group *vg;
struct net_bridge_fdb_entry *f;
struct net_bridge_vlan *v;
+ bool local_vlan_0;
+
+ local_vlan_0 = br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0);
spin_lock_bh(&br->hash_lock);
/* If old entry was unassociated with any port, then delete it. */
f = br_fdb_find(br, br->dev->dev_addr, 0);
- if (f && f->is_local && !f->dst && !f->added_by_user)
+ if (f && test_bit(BR_FDB_LOCAL, &f->flags) &&
+ !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags))
fdb_delete_local(br, NULL, f);
- fdb_insert(br, NULL, newaddr, 0);
+ fdb_add_local(br, NULL, newaddr, 0);
vg = br_vlan_group(br);
- if (!vg || !vg->num_vlans)
+ if (!vg || !vg->num_vlans || local_vlan_0)
goto out;
/* Now remove and add entries for every VLAN configured on the
* bridge. This function runs under RTNL so the bitmap will not
@@ -326,9 +527,10 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
if (!br_vlan_should_use(v))
continue;
f = br_fdb_find(br, br->dev->dev_addr, v->vid);
- if (f && f->is_local && !f->dst && !f->added_by_user)
+ if (f && test_bit(BR_FDB_LOCAL, &f->flags) &&
+ !f->dst && !test_bit(BR_FDB_ADDED_BY_USER, &f->flags))
fdb_delete_local(br, NULL, f);
- fdb_insert(br, NULL, newaddr, v->vid);
+ fdb_add_local(br, NULL, newaddr, v->vid);
}
out:
spin_unlock_bh(&br->hash_lock);
@@ -349,11 +551,21 @@ void br_fdb_cleanup(struct work_struct *work)
*/
rcu_read_lock();
hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
- unsigned long this_timer;
-
- if (f->is_static || f->added_by_external_learn)
+ unsigned long this_timer = f->updated + delay;
+
+ if (test_bit(BR_FDB_STATIC, &f->flags) ||
+ test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags)) {
+ if (test_bit(BR_FDB_NOTIFY, &f->flags)) {
+ if (time_after(this_timer, now))
+ work_delay = min(work_delay,
+ this_timer - now);
+ else if (!test_and_set_bit(BR_FDB_NOTIFY_INACTIVE,
+ &f->flags))
+ fdb_notify(br, f, RTM_NEWNEIGH, false);
+ }
continue;
- this_timer = f->updated + delay;
+ }
+
if (time_after(this_timer, now)) {
work_delay = min(work_delay, this_timer - now);
} else {
@@ -370,18 +582,276 @@ void br_fdb_cleanup(struct work_struct *work)
mod_delayed_work(system_long_wq, &br->gc_work, work_delay);
}
-/* Completely flush all dynamic entries in forwarding database.*/
-void br_fdb_flush(struct net_bridge *br)
+static void br_fdb_delete_locals_per_vlan_port(struct net_bridge *br,
+ struct net_bridge_port *p)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct net_device *dev;
+
+ if (p) {
+ vg = nbp_vlan_group(p);
+ dev = p->dev;
+ } else {
+ vg = br_vlan_group(br);
+ dev = br->dev;
+ }
+
+ list_for_each_entry(v, &vg->vlan_list, vlist)
+ br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid);
+}
+
+static void br_fdb_delete_locals_per_vlan(struct net_bridge *br)
+{
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(p, &br->port_list, list)
+ br_fdb_delete_locals_per_vlan_port(br, p);
+
+ br_fdb_delete_locals_per_vlan_port(br, NULL);
+}
+
+static int br_fdb_insert_locals_per_vlan_port(struct net_bridge *br,
+ struct net_bridge_port *p,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct net_device *dev;
+ int err;
+
+ if (p) {
+ vg = nbp_vlan_group(p);
+ dev = p->dev;
+ } else {
+ vg = br_vlan_group(br);
+ dev = br->dev;
+ }
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (!br_vlan_should_use(v))
+ continue;
+
+ err = br_fdb_add_local(br, p, dev->dev_addr, v->vid);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int br_fdb_insert_locals_per_vlan(struct net_bridge *br,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_port *p;
+ int err;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(p, &br->port_list, list) {
+ err = br_fdb_insert_locals_per_vlan_port(br, p, extack);
+ if (err)
+ goto rollback;
+ }
+
+ err = br_fdb_insert_locals_per_vlan_port(br, NULL, extack);
+ if (err)
+ goto rollback;
+
+ return 0;
+
+rollback:
+ NL_SET_ERR_MSG_MOD(extack, "fdb_local_vlan_0 toggle: FDB entry insertion failed");
+ br_fdb_delete_locals_per_vlan(br);
+ return err;
+}
+
+int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack)
+{
+ if (!on)
+ return br_fdb_insert_locals_per_vlan(br, extack);
+
+ br_fdb_delete_locals_per_vlan(br);
+ return 0;
+}
+
+static bool __fdb_flush_matches(const struct net_bridge *br,
+ const struct net_bridge_fdb_entry *f,
+ const struct net_bridge_fdb_flush_desc *desc)
+{
+ const struct net_bridge_port *dst = READ_ONCE(f->dst);
+ int port_ifidx = dst ? dst->dev->ifindex : br->dev->ifindex;
+
+ if (desc->vlan_id && desc->vlan_id != f->key.vlan_id)
+ return false;
+ if (desc->port_ifindex && desc->port_ifindex != port_ifidx)
+ return false;
+ if (desc->flags_mask && (f->flags & desc->flags_mask) != desc->flags)
+ return false;
+
+ return true;
+}
+
+/* Flush forwarding database entries matching the description */
+void br_fdb_flush(struct net_bridge *br,
+ const struct net_bridge_fdb_flush_desc *desc)
{
struct net_bridge_fdb_entry *f;
- struct hlist_node *tmp;
- spin_lock_bh(&br->hash_lock);
- hlist_for_each_entry_safe(f, tmp, &br->fdb_list, fdb_node) {
- if (!f->is_static)
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
+ if (!__fdb_flush_matches(br, f, desc))
+ continue;
+
+ spin_lock_bh(&br->hash_lock);
+ if (!hlist_unhashed(&f->fdb_node))
fdb_delete(br, f, true);
+ spin_unlock_bh(&br->hash_lock);
}
- spin_unlock_bh(&br->hash_lock);
+ rcu_read_unlock();
+}
+
+static unsigned long __ndm_state_to_fdb_flags(u16 ndm_state)
+{
+ unsigned long flags = 0;
+
+ if (ndm_state & NUD_PERMANENT)
+ __set_bit(BR_FDB_LOCAL, &flags);
+ if (ndm_state & NUD_NOARP)
+ __set_bit(BR_FDB_STATIC, &flags);
+
+ return flags;
+}
+
+static unsigned long __ndm_flags_to_fdb_flags(u8 ndm_flags)
+{
+ unsigned long flags = 0;
+
+ if (ndm_flags & NTF_USE)
+ __set_bit(BR_FDB_ADDED_BY_USER, &flags);
+ if (ndm_flags & NTF_EXT_LEARNED)
+ __set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &flags);
+ if (ndm_flags & NTF_OFFLOADED)
+ __set_bit(BR_FDB_OFFLOADED, &flags);
+ if (ndm_flags & NTF_STICKY)
+ __set_bit(BR_FDB_STICKY, &flags);
+
+ return flags;
+}
+
+static int __fdb_flush_validate_ifindex(const struct net_bridge *br,
+ int ifindex,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_device *dev;
+
+ dev = __dev_get_by_index(dev_net(br->dev), ifindex);
+ if (!dev) {
+ NL_SET_ERR_MSG_MOD(extack, "Unknown flush device ifindex");
+ return -ENODEV;
+ }
+ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) {
+ NL_SET_ERR_MSG_MOD(extack, "Flush device is not a bridge or bridge port");
+ return -EINVAL;
+ }
+ if (netif_is_bridge_master(dev) && dev != br->dev) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Flush bridge device does not match target bridge device");
+ return -EINVAL;
+ }
+ if (netif_is_bridge_port(dev)) {
+ struct net_bridge_port *p = br_port_get_rtnl(dev);
+
+ if (p->br != br) {
+ NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static const struct nla_policy br_fdb_del_bulk_policy[NDA_MAX + 1] = {
+ [NDA_VLAN] = NLA_POLICY_RANGE(NLA_U16, 1, VLAN_N_VID - 2),
+ [NDA_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1),
+ [NDA_NDM_STATE_MASK] = { .type = NLA_U16 },
+ [NDA_NDM_FLAGS_MASK] = { .type = NLA_U8 },
+};
+
+int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_fdb_flush_desc desc = {};
+ struct ndmsg *ndm = nlmsg_data(nlh);
+ struct net_bridge_port *p = NULL;
+ struct nlattr *tb[NDA_MAX + 1];
+ struct net_bridge *br;
+ u8 ndm_flags;
+ int err;
+
+ ndm_flags = ndm->ndm_flags & ~FDB_FLUSH_IGNORED_NDM_FLAGS;
+
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX,
+ br_fdb_del_bulk_policy, extack);
+ if (err)
+ return err;
+
+ if (netif_is_bridge_master(dev)) {
+ br = netdev_priv(dev);
+ } else {
+ p = br_port_get_rtnl(dev);
+ if (!p) {
+ NL_SET_ERR_MSG_MOD(extack, "Device is not a bridge port");
+ return -EINVAL;
+ }
+ br = p->br;
+ }
+
+ if (tb[NDA_VLAN])
+ desc.vlan_id = nla_get_u16(tb[NDA_VLAN]);
+
+ if (ndm_flags & ~FDB_FLUSH_ALLOWED_NDM_FLAGS) {
+ NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm flag bits set");
+ return -EINVAL;
+ }
+ if (ndm->ndm_state & ~FDB_FLUSH_ALLOWED_NDM_STATES) {
+ NL_SET_ERR_MSG(extack, "Unsupported fdb flush ndm state bits set");
+ return -EINVAL;
+ }
+
+ desc.flags |= __ndm_state_to_fdb_flags(ndm->ndm_state);
+ desc.flags |= __ndm_flags_to_fdb_flags(ndm_flags);
+ if (tb[NDA_NDM_STATE_MASK]) {
+ u16 ndm_state_mask = nla_get_u16(tb[NDA_NDM_STATE_MASK]);
+
+ desc.flags_mask |= __ndm_state_to_fdb_flags(ndm_state_mask);
+ }
+ if (tb[NDA_NDM_FLAGS_MASK]) {
+ u8 ndm_flags_mask = nla_get_u8(tb[NDA_NDM_FLAGS_MASK]);
+
+ desc.flags_mask |= __ndm_flags_to_fdb_flags(ndm_flags_mask);
+ }
+ if (tb[NDA_IFINDEX]) {
+ int ifidx = nla_get_s32(tb[NDA_IFINDEX]);
+
+ err = __fdb_flush_validate_ifindex(br, ifidx, extack);
+ if (err)
+ return err;
+ desc.port_ifindex = ifidx;
+ } else if (p) {
+ /* flush was invoked with port device and NTF_MASTER */
+ desc.port_ifindex = p->dev->ifindex;
+ }
+
+ br_debug(br, "flushing port ifindex: %d vlan id: %u flags: 0x%lx flags mask: 0x%lx\n",
+ desc.port_ifindex, desc.vlan_id, desc.flags, desc.flags_mask);
+
+ br_fdb_flush(br, &desc);
+
+ return 0;
}
/* Flush all entries referring to a specific port.
@@ -402,10 +872,13 @@ void br_fdb_delete_by_port(struct net_bridge *br,
continue;
if (!do_all)
- if (f->is_static || (vid && f->key.vlan_id != vid))
+ if (test_bit(BR_FDB_STATIC, &f->flags) ||
+ (test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &f->flags) &&
+ !test_bit(BR_FDB_OFFLOADED, &f->flags)) ||
+ (vid && f->key.vlan_id != vid))
continue;
- if (f->is_local)
+ if (test_bit(BR_FDB_LOCAL, &f->flags))
fdb_delete_local(br, p, f);
else
fdb_delete(br, f, true);
@@ -427,9 +900,14 @@ int br_fdb_test_addr(struct net_device *dev, unsigned char *addr)
if (!port)
ret = 0;
else {
+ const struct net_bridge_port *dst = NULL;
+
fdb = br_fdb_find_rcu(port->br, addr, 0);
- ret = fdb && fdb->dst && fdb->dst->dev != dev &&
- fdb->dst->state == BR_STATE_FORWARDING;
+ if (fdb)
+ dst = READ_ONCE(fdb->dst);
+
+ ret = dst && dst->dev != dev &&
+ dst->state == BR_STATE_FORWARDING;
}
rcu_read_unlock();
@@ -474,8 +952,8 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf,
fe->port_no = f->dst->port_no;
fe->port_hi = f->dst->port_no >> 8;
- fe->is_local = f->is_local;
- if (!f->is_static)
+ fe->is_local = test_bit(BR_FDB_LOCAL, &f->flags);
+ if (!test_bit(BR_FDB_STATIC, &f->flags))
fe->ageing_timer_value = jiffies_delta_to_clock_t(jiffies - f->updated);
++fe;
++num;
@@ -485,129 +963,84 @@ int br_fdb_fillbuf(struct net_bridge *br, void *buf,
return num;
}
-static struct net_bridge_fdb_entry *fdb_create(struct net_bridge *br,
- struct net_bridge_port *source,
- const unsigned char *addr,
- __u16 vid,
- unsigned char is_local,
- unsigned char is_static)
-{
- struct net_bridge_fdb_entry *fdb;
-
- fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC);
- if (fdb) {
- memcpy(fdb->key.addr.addr, addr, ETH_ALEN);
- fdb->dst = source;
- fdb->key.vlan_id = vid;
- fdb->is_local = is_local;
- fdb->is_static = is_static;
- fdb->added_by_user = 0;
- fdb->added_by_external_learn = 0;
- fdb->offloaded = 0;
- fdb->updated = fdb->used = jiffies;
- if (rhashtable_lookup_insert_fast(&br->fdb_hash_tbl,
- &fdb->rhnode,
- br_fdb_rht_params)) {
- kmem_cache_free(br_fdb_cache, fdb);
- fdb = NULL;
- } else {
- hlist_add_head_rcu(&fdb->fdb_node, &br->fdb_list);
- }
- }
- return fdb;
-}
-
-static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
- const unsigned char *addr, u16 vid)
-{
- struct net_bridge_fdb_entry *fdb;
-
- if (!is_valid_ether_addr(addr))
- return -EINVAL;
-
- fdb = br_fdb_find(br, addr, vid);
- if (fdb) {
- /* it is okay to have multiple ports with same
- * address, just use the first one.
- */
- if (fdb->is_local)
- return 0;
- br_warn(br, "adding interface %s with same address as a received packet (addr:%pM, vlan:%u)\n",
- source ? source->dev->name : br->dev->name, addr, vid);
- fdb_delete(br, fdb, true);
- }
-
- fdb = fdb_create(br, source, addr, vid, 1, 1);
- if (!fdb)
- return -ENOMEM;
-
- fdb_add_hw_addr(br, addr);
- fdb_notify(br, fdb, RTM_NEWNEIGH, true);
- return 0;
-}
-
/* Add entry for local address of interface */
-int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
- const unsigned char *addr, u16 vid)
+int br_fdb_add_local(struct net_bridge *br, struct net_bridge_port *source,
+ const unsigned char *addr, u16 vid)
{
int ret;
spin_lock_bh(&br->hash_lock);
- ret = fdb_insert(br, source, addr, vid);
+ ret = fdb_add_local(br, source, addr, vid);
spin_unlock_bh(&br->hash_lock);
return ret;
}
+/* returns true if the fdb was modified */
+static bool __fdb_mark_active(struct net_bridge_fdb_entry *fdb)
+{
+ return !!(test_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags) &&
+ test_and_clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags));
+}
+
void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
- const unsigned char *addr, u16 vid, bool added_by_user)
+ const unsigned char *addr, u16 vid, unsigned long flags)
{
struct net_bridge_fdb_entry *fdb;
- bool fdb_modified = false;
/* some users want to always flood. */
if (hold_time(br) == 0)
return;
- /* ignore packets unless we are using this port */
- if (!(source->state == BR_STATE_LEARNING ||
- source->state == BR_STATE_FORWARDING))
- return;
-
fdb = fdb_find_rcu(&br->fdb_hash_tbl, addr, vid);
if (likely(fdb)) {
/* attempt to update an entry for a local interface */
- if (unlikely(fdb->is_local)) {
+ if (unlikely(test_bit(BR_FDB_LOCAL, &fdb->flags))) {
if (net_ratelimit())
br_warn(br, "received packet on %s with own address as source address (addr:%pM, vlan:%u)\n",
source->dev->name, addr, vid);
} else {
unsigned long now = jiffies;
+ bool fdb_modified = false;
+
+ if (now != fdb->updated) {
+ fdb->updated = now;
+ fdb_modified = __fdb_mark_active(fdb);
+ }
/* fastpath: update of existing entry */
- if (unlikely(source != fdb->dst)) {
- fdb->dst = source;
+ if (unlikely(source != READ_ONCE(fdb->dst) &&
+ !test_bit(BR_FDB_STICKY, &fdb->flags))) {
+ br_switchdev_fdb_notify(br, fdb, RTM_DELNEIGH);
+ WRITE_ONCE(fdb->dst, source);
fdb_modified = true;
/* Take over HW learned entry */
- if (unlikely(fdb->added_by_external_learn))
- fdb->added_by_external_learn = 0;
+ if (unlikely(test_bit(BR_FDB_ADDED_BY_EXT_LEARN,
+ &fdb->flags)))
+ clear_bit(BR_FDB_ADDED_BY_EXT_LEARN,
+ &fdb->flags);
+ /* Clear locked flag when roaming to an
+ * unlocked port.
+ */
+ if (unlikely(test_bit(BR_FDB_LOCKED, &fdb->flags)))
+ clear_bit(BR_FDB_LOCKED, &fdb->flags);
+ }
+
+ if (unlikely(test_bit(BR_FDB_ADDED_BY_USER, &flags))) {
+ set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
+ if (test_and_clear_bit(BR_FDB_DYNAMIC_LEARNED,
+ &fdb->flags))
+ atomic_dec(&br->fdb_n_learned);
}
- if (now != fdb->updated)
- fdb->updated = now;
- if (unlikely(added_by_user))
- fdb->added_by_user = 1;
if (unlikely(fdb_modified)) {
- trace_br_fdb_update(br, source, addr, vid, added_by_user);
+ trace_br_fdb_update(br, source, addr, vid, flags);
fdb_notify(br, fdb, RTM_NEWNEIGH, true);
}
}
} else {
spin_lock(&br->hash_lock);
- fdb = fdb_create(br, source, addr, vid, 0, 0);
+ fdb = fdb_create(br, source, addr, vid, flags);
if (fdb) {
- if (unlikely(added_by_user))
- fdb->added_by_user = 1;
- trace_br_fdb_update(br, source, addr, vid,
- added_by_user);
+ trace_br_fdb_update(br, source, addr, vid, flags);
fdb_notify(br, fdb, RTM_NEWNEIGH, true);
}
/* else we lose race and someone else inserts
@@ -617,106 +1050,6 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
}
}
-static int fdb_to_nud(const struct net_bridge *br,
- const struct net_bridge_fdb_entry *fdb)
-{
- if (fdb->is_local)
- return NUD_PERMANENT;
- else if (fdb->is_static)
- return NUD_NOARP;
- else if (has_expired(br, fdb))
- return NUD_STALE;
- else
- return NUD_REACHABLE;
-}
-
-static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br,
- const struct net_bridge_fdb_entry *fdb,
- u32 portid, u32 seq, int type, unsigned int flags)
-{
- unsigned long now = jiffies;
- struct nda_cacheinfo ci;
- struct nlmsghdr *nlh;
- struct ndmsg *ndm;
-
- nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
- if (nlh == NULL)
- return -EMSGSIZE;
-
- ndm = nlmsg_data(nlh);
- ndm->ndm_family = AF_BRIDGE;
- ndm->ndm_pad1 = 0;
- ndm->ndm_pad2 = 0;
- ndm->ndm_flags = 0;
- ndm->ndm_type = 0;
- ndm->ndm_ifindex = fdb->dst ? fdb->dst->dev->ifindex : br->dev->ifindex;
- ndm->ndm_state = fdb_to_nud(br, fdb);
-
- if (fdb->offloaded)
- ndm->ndm_flags |= NTF_OFFLOADED;
- if (fdb->added_by_external_learn)
- ndm->ndm_flags |= NTF_EXT_LEARNED;
-
- if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr))
- goto nla_put_failure;
- if (nla_put_u32(skb, NDA_MASTER, br->dev->ifindex))
- goto nla_put_failure;
- ci.ndm_used = jiffies_to_clock_t(now - fdb->used);
- ci.ndm_confirmed = 0;
- ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated);
- ci.ndm_refcnt = 0;
- if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
- goto nla_put_failure;
-
- if (fdb->key.vlan_id && nla_put(skb, NDA_VLAN, sizeof(u16),
- &fdb->key.vlan_id))
- goto nla_put_failure;
-
- nlmsg_end(skb, nlh);
- return 0;
-
-nla_put_failure:
- nlmsg_cancel(skb, nlh);
- return -EMSGSIZE;
-}
-
-static inline size_t fdb_nlmsg_size(void)
-{
- return NLMSG_ALIGN(sizeof(struct ndmsg))
- + nla_total_size(ETH_ALEN) /* NDA_LLADDR */
- + nla_total_size(sizeof(u32)) /* NDA_MASTER */
- + nla_total_size(sizeof(u16)) /* NDA_VLAN */
- + nla_total_size(sizeof(struct nda_cacheinfo));
-}
-
-static void fdb_notify(struct net_bridge *br,
- const struct net_bridge_fdb_entry *fdb, int type,
- bool swdev_notify)
-{
- struct net *net = dev_net(br->dev);
- struct sk_buff *skb;
- int err = -ENOBUFS;
-
- if (swdev_notify)
- br_switchdev_fdb_notify(fdb, type);
-
- skb = nlmsg_new(fdb_nlmsg_size(), GFP_ATOMIC);
- if (skb == NULL)
- goto errout;
-
- err = fdb_fill_info(skb, br, fdb, 0, 0, type, 0);
- if (err < 0) {
- /* -EMSGSIZE implies BUG in fdb_nlmsg_size() */
- WARN_ON(err == -EMSGSIZE);
- kfree_skb(skb);
- goto errout;
- }
- rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
- return;
-errout:
- rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
-}
-
/* Dump information about entries, in response to GETNEIGH */
int br_fdb_dump(struct sk_buff *skb,
struct netlink_callback *cb,
@@ -724,11 +1057,12 @@ int br_fdb_dump(struct sk_buff *skb,
struct net_device *filter_dev,
int *idx)
{
+ struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
struct net_bridge *br = netdev_priv(dev);
struct net_bridge_fdb_entry *f;
int err = 0;
- if (!(dev->priv_flags & IFF_EBRIDGE))
+ if (!netif_is_bridge_master(dev))
return err;
if (!filter_dev) {
@@ -739,7 +1073,7 @@ int br_fdb_dump(struct sk_buff *skb,
rcu_read_lock();
hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
- if (*idx < cb->args[2])
+ if (*idx < ctx->fdb_idx)
goto skip;
if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) {
if (filter_dev != dev)
@@ -770,12 +1104,67 @@ skip:
return err;
}
+int br_fdb_get(struct sk_buff *skb,
+ struct nlattr *tb[],
+ struct net_device *dev,
+ const unsigned char *addr,
+ u16 vid, u32 portid, u32 seq,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_fdb_entry *f;
+ int err = 0;
+
+ rcu_read_lock();
+ f = br_fdb_find_rcu(br, addr, vid);
+ if (!f) {
+ NL_SET_ERR_MSG(extack, "Fdb entry not found");
+ err = -ENOENT;
+ goto errout;
+ }
+
+ err = fdb_fill_info(skb, br, f, portid, seq,
+ RTM_NEWNEIGH, 0);
+errout:
+ rcu_read_unlock();
+ return err;
+}
+
+/* returns true if the fdb is modified */
+static bool fdb_handle_notify(struct net_bridge_fdb_entry *fdb, u8 notify)
+{
+ bool modified = false;
+
+ /* allow to mark an entry as inactive, usually done on creation */
+ if ((notify & FDB_NOTIFY_INACTIVE_BIT) &&
+ !test_and_set_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags))
+ modified = true;
+
+ if ((notify & FDB_NOTIFY_BIT) &&
+ !test_and_set_bit(BR_FDB_NOTIFY, &fdb->flags)) {
+ /* enabled activity tracking */
+ modified = true;
+ } else if (!(notify & FDB_NOTIFY_BIT) &&
+ test_and_clear_bit(BR_FDB_NOTIFY, &fdb->flags)) {
+ /* disabled activity tracking, clear notify state */
+ clear_bit(BR_FDB_NOTIFY_INACTIVE, &fdb->flags);
+ modified = true;
+ }
+
+ return modified;
+}
+
/* Update (create or replace) forwarding database entry */
static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
- const __u8 *addr, __u16 state, __u16 flags, __u16 vid)
+ const u8 *addr, struct ndmsg *ndm, u16 flags, u16 vid,
+ struct nlattr *nfea_tb[])
{
+ bool is_sticky = !!(ndm->ndm_flags & NTF_STICKY);
+ bool refresh = !nfea_tb[NFEA_DONT_REFRESH];
struct net_bridge_fdb_entry *fdb;
+ u16 state = ndm->ndm_state;
bool modified = false;
+ u8 notify = 0;
/* If the port cannot learn allow only local and static entries */
if (source && !(state & NUD_PERMANENT) && !(state & NUD_NOARP) &&
@@ -789,12 +1178,23 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
return -EINVAL;
}
+ if (is_sticky && (state & NUD_PERMANENT))
+ return -EINVAL;
+
+ if (nfea_tb[NFEA_ACTIVITY_NOTIFY]) {
+ notify = nla_get_u8(nfea_tb[NFEA_ACTIVITY_NOTIFY]);
+ if ((notify & ~BR_FDB_NOTIFY_SETTABLE_BITS) ||
+ (notify & BR_FDB_NOTIFY_SETTABLE_BITS) == FDB_NOTIFY_INACTIVE_BIT)
+ return -EINVAL;
+ }
+
fdb = br_fdb_find(br, addr, vid);
if (fdb == NULL) {
if (!(flags & NLM_F_CREATE))
return -ENOENT;
- fdb = fdb_create(br, source, addr, vid, 0, 0);
+ fdb = fdb_create(br, source, addr, vid,
+ BIT(BR_FDB_ADDED_BY_USER));
if (!fdb)
return -ENOMEM;
@@ -803,40 +1203,49 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
if (flags & NLM_F_EXCL)
return -EEXIST;
- if (fdb->dst != source) {
- fdb->dst = source;
+ if (READ_ONCE(fdb->dst) != source) {
+ WRITE_ONCE(fdb->dst, source);
modified = true;
}
+
+ set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
+ if (test_and_clear_bit(BR_FDB_DYNAMIC_LEARNED, &fdb->flags))
+ atomic_dec(&br->fdb_n_learned);
}
if (fdb_to_nud(br, fdb) != state) {
if (state & NUD_PERMANENT) {
- fdb->is_local = 1;
- if (!fdb->is_static) {
- fdb->is_static = 1;
+ set_bit(BR_FDB_LOCAL, &fdb->flags);
+ if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags))
fdb_add_hw_addr(br, addr);
- }
} else if (state & NUD_NOARP) {
- fdb->is_local = 0;
- if (!fdb->is_static) {
- fdb->is_static = 1;
+ clear_bit(BR_FDB_LOCAL, &fdb->flags);
+ if (!test_and_set_bit(BR_FDB_STATIC, &fdb->flags))
fdb_add_hw_addr(br, addr);
- }
} else {
- fdb->is_local = 0;
- if (fdb->is_static) {
- fdb->is_static = 0;
+ clear_bit(BR_FDB_LOCAL, &fdb->flags);
+ if (test_and_clear_bit(BR_FDB_STATIC, &fdb->flags))
fdb_del_hw_addr(br, addr);
- }
}
modified = true;
}
- fdb->added_by_user = 1;
+
+ if (is_sticky != test_bit(BR_FDB_STICKY, &fdb->flags)) {
+ change_bit(BR_FDB_STICKY, &fdb->flags);
+ modified = true;
+ }
+
+ if (test_and_clear_bit(BR_FDB_LOCKED, &fdb->flags))
+ modified = true;
+
+ if (fdb_handle_notify(fdb, notify))
+ modified = true;
fdb->used = jiffies;
if (modified) {
- fdb->updated = jiffies;
+ if (refresh)
+ fdb->updated = jiffies;
fdb_notify(br, fdb, RTM_NEWNEIGH, true);
}
@@ -845,7 +1254,8 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source,
static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
struct net_bridge_port *p, const unsigned char *addr,
- u16 nlh_flags, u16 vid)
+ u16 nlh_flags, u16 vid, struct nlattr *nfea_tb[],
+ bool *notified, struct netlink_ext_ack *extack)
{
int err = 0;
@@ -855,32 +1265,49 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
br->dev->name);
return -EINVAL;
}
+ if (!nbp_state_should_learn(p))
+ return 0;
+
local_bh_disable();
rcu_read_lock();
- br_fdb_update(br, p, addr, vid, true);
+ br_fdb_update(br, p, addr, vid, BIT(BR_FDB_ADDED_BY_USER));
rcu_read_unlock();
local_bh_enable();
} else if (ndm->ndm_flags & NTF_EXT_LEARNED) {
- err = br_fdb_external_learn_add(br, p, addr, vid, true);
+ if (!p && !(ndm->ndm_state & NUD_PERMANENT)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "FDB entry towards bridge must be permanent");
+ return -EINVAL;
+ }
+ err = br_fdb_external_learn_add(br, p, addr, vid, false, true);
} else {
spin_lock_bh(&br->hash_lock);
- err = fdb_add_entry(br, p, addr, ndm->ndm_state,
- nlh_flags, vid);
+ err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid, nfea_tb);
spin_unlock_bh(&br->hash_lock);
}
+ if (!err)
+ *notified = true;
return err;
}
+static const struct nla_policy br_nda_fdb_pol[NFEA_MAX + 1] = {
+ [NFEA_ACTIVITY_NOTIFY] = { .type = NLA_U8 },
+ [NFEA_DONT_REFRESH] = { .type = NLA_FLAG },
+};
+
/* Add new permanent fdb entry with RTM_NEWNEIGH */
int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
struct net_device *dev,
- const unsigned char *addr, u16 vid, u16 nlh_flags)
+ const unsigned char *addr, u16 vid, u16 nlh_flags,
+ bool *notified, struct netlink_ext_ack *extack)
{
+ struct nlattr *nfea_tb[NFEA_MAX + 1], *attr;
struct net_bridge_vlan_group *vg;
struct net_bridge_port *p = NULL;
struct net_bridge_vlan *v;
struct net_bridge *br = NULL;
+ u32 ext_flags = 0;
int err = 0;
trace_br_fdb_add(ndm, dev, addr, vid, nlh_flags);
@@ -895,7 +1322,7 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
return -EINVAL;
}
- if (dev->priv_flags & IFF_EBRIDGE) {
+ if (netif_is_bridge_master(dev)) {
br = netdev_priv(dev);
vg = br_vlan_group(br);
} else {
@@ -909,6 +1336,24 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
vg = nbp_vlan_group(p);
}
+ if (tb[NDA_FLAGS_EXT])
+ ext_flags = nla_get_u32(tb[NDA_FLAGS_EXT]);
+
+ if (ext_flags & NTF_EXT_LOCKED) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot add FDB entry with \"locked\" flag set");
+ return -EINVAL;
+ }
+
+ if (tb[NDA_FDB_EXT_ATTRS]) {
+ attr = tb[NDA_FDB_EXT_ATTRS];
+ err = nla_parse_nested(nfea_tb, NFEA_MAX, attr,
+ br_nda_fdb_pol, extack);
+ if (err)
+ return err;
+ } else {
+ memset(nfea_tb, 0, sizeof(struct nlattr *) * (NFEA_MAX + 1));
+ }
+
if (vid) {
v = br_vlan_find(vg, vid);
if (!v || !br_vlan_should_use(v)) {
@@ -917,9 +1362,11 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
}
/* VID was specified, so use it. */
- err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid);
+ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, vid, nfea_tb,
+ notified, extack);
} else {
- err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0);
+ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, 0, nfea_tb,
+ notified, extack);
if (err || !vg || !vg->num_vlans)
goto out;
@@ -930,7 +1377,8 @@ int br_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
list_for_each_entry(v, &vg->vlan_list, vlist) {
if (!br_vlan_should_use(v))
continue;
- err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid);
+ err = __br_fdb_add(ndm, br, p, addr, nlh_flags, v->vid,
+ nfea_tb, notified, extack);
if (err)
goto out;
}
@@ -942,27 +1390,28 @@ out:
static int fdb_delete_by_addr_and_port(struct net_bridge *br,
const struct net_bridge_port *p,
- const u8 *addr, u16 vlan)
+ const u8 *addr, u16 vlan, bool *notified)
{
struct net_bridge_fdb_entry *fdb;
fdb = br_fdb_find(br, addr, vlan);
- if (!fdb || fdb->dst != p)
+ if (!fdb || READ_ONCE(fdb->dst) != p)
return -ENOENT;
fdb_delete(br, fdb, true);
+ *notified = true;
return 0;
}
static int __br_fdb_delete(struct net_bridge *br,
const struct net_bridge_port *p,
- const unsigned char *addr, u16 vid)
+ const unsigned char *addr, u16 vid, bool *notified)
{
int err;
spin_lock_bh(&br->hash_lock);
- err = fdb_delete_by_addr_and_port(br, p, addr, vid);
+ err = fdb_delete_by_addr_and_port(br, p, addr, vid, notified);
spin_unlock_bh(&br->hash_lock);
return err;
@@ -971,15 +1420,15 @@ static int __br_fdb_delete(struct net_bridge *br,
/* Remove neighbor entry with RTM_DELNEIGH */
int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
struct net_device *dev,
- const unsigned char *addr, u16 vid)
+ const unsigned char *addr, u16 vid, bool *notified,
+ struct netlink_ext_ack *extack)
{
struct net_bridge_vlan_group *vg;
struct net_bridge_port *p = NULL;
- struct net_bridge_vlan *v;
struct net_bridge *br;
int err;
- if (dev->priv_flags & IFF_EBRIDGE) {
+ if (netif_is_bridge_master(dev)) {
br = netdev_priv(dev);
vg = br_vlan_group(br);
} else {
@@ -994,23 +1443,19 @@ int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
}
if (vid) {
- v = br_vlan_find(vg, vid);
- if (!v) {
- pr_info("bridge: RTM_DELNEIGH with unconfigured vlan %d on %s\n", vid, dev->name);
- return -EINVAL;
- }
-
- err = __br_fdb_delete(br, p, addr, vid);
+ err = __br_fdb_delete(br, p, addr, vid, notified);
} else {
+ struct net_bridge_vlan *v;
+
err = -ENOENT;
- err &= __br_fdb_delete(br, p, addr, 0);
+ err &= __br_fdb_delete(br, p, addr, 0, notified);
if (!vg || !vg->num_vlans)
return err;
list_for_each_entry(v, &vg->vlan_list, vlist) {
if (!br_vlan_should_use(v))
continue;
- err &= __br_fdb_delete(br, p, addr, v->vid);
+ err &= __br_fdb_delete(br, p, addr, v->vid, notified);
}
}
@@ -1028,7 +1473,7 @@ int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p)
rcu_read_lock();
hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
/* We only care for static entries */
- if (!f->is_static)
+ if (!test_bit(BR_FDB_STATIC, &f->flags))
continue;
err = dev_uc_add(p->dev, f->key.addr.addr);
if (err)
@@ -1042,7 +1487,7 @@ done:
rollback:
hlist_for_each_entry_rcu(tmp, &br->fdb_list, fdb_node) {
/* We only care for static entries */
- if (!tmp->is_static)
+ if (!test_bit(BR_FDB_STATIC, &tmp->flags))
continue;
if (tmp == f)
break;
@@ -1061,7 +1506,7 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p)
rcu_read_lock();
hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
/* We only care for static entries */
- if (!f->is_static)
+ if (!test_bit(BR_FDB_STATIC, &f->flags))
continue;
dev_uc_del(p->dev, f->key.addr.addr);
@@ -1070,7 +1515,7 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p)
}
int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid,
+ const unsigned char *addr, u16 vid, bool locked,
bool swdev_notify)
{
struct net_bridge_fdb_entry *fdb;
@@ -1079,34 +1524,67 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
trace_br_fdb_external_learn_add(br, p, addr, vid);
+ if (locked && (!p || !(p->flags & BR_PORT_MAB)))
+ return -EINVAL;
+
spin_lock_bh(&br->hash_lock);
fdb = br_fdb_find(br, addr, vid);
if (!fdb) {
- fdb = fdb_create(br, p, addr, vid, 0, 0);
+ unsigned long flags = BIT(BR_FDB_ADDED_BY_EXT_LEARN);
+
+ if (swdev_notify)
+ flags |= BIT(BR_FDB_ADDED_BY_USER);
+
+ if (!p)
+ flags |= BIT(BR_FDB_LOCAL);
+
+ if (locked)
+ flags |= BIT(BR_FDB_LOCKED);
+
+ fdb = fdb_create(br, p, addr, vid, flags);
if (!fdb) {
err = -ENOMEM;
goto err_unlock;
}
- fdb->added_by_external_learn = 1;
fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
} else {
+ if (locked &&
+ (!test_bit(BR_FDB_LOCKED, &fdb->flags) ||
+ READ_ONCE(fdb->dst) != p)) {
+ err = -EINVAL;
+ goto err_unlock;
+ }
+
fdb->updated = jiffies;
- if (fdb->dst != p) {
- fdb->dst = p;
+ if (READ_ONCE(fdb->dst) != p) {
+ WRITE_ONCE(fdb->dst, p);
modified = true;
}
- if (fdb->added_by_external_learn) {
+ if (test_and_set_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags)) {
/* Refresh entry */
fdb->used = jiffies;
- } else if (!fdb->added_by_user) {
- /* Take over SW learned entry */
- fdb->added_by_external_learn = 1;
+ } else {
modified = true;
}
+ if (locked != test_bit(BR_FDB_LOCKED, &fdb->flags)) {
+ change_bit(BR_FDB_LOCKED, &fdb->flags);
+ modified = true;
+ }
+
+ if (swdev_notify)
+ set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
+
+ if (!p)
+ set_bit(BR_FDB_LOCAL, &fdb->flags);
+
+ if ((swdev_notify || !p) &&
+ test_and_clear_bit(BR_FDB_DYNAMIC_LEARNED, &fdb->flags))
+ atomic_dec(&br->fdb_n_learned);
+
if (modified)
fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify);
}
@@ -1127,7 +1605,7 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
spin_lock_bh(&br->hash_lock);
fdb = br_fdb_find(br, addr, vid);
- if (fdb && fdb->added_by_external_learn)
+ if (fdb && test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags))
fdb_delete(br, fdb, swdev_notify);
else
err = -ENOENT;
@@ -1138,15 +1616,35 @@ int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
}
void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid)
+ const unsigned char *addr, u16 vid, bool offloaded)
{
struct net_bridge_fdb_entry *fdb;
spin_lock_bh(&br->hash_lock);
fdb = br_fdb_find(br, addr, vid);
- if (fdb)
- fdb->offloaded = 1;
+ if (fdb && offloaded != test_bit(BR_FDB_OFFLOADED, &fdb->flags))
+ change_bit(BR_FDB_OFFLOADED, &fdb->flags);
spin_unlock_bh(&br->hash_lock);
}
+
+void br_fdb_clear_offload(const struct net_device *dev, u16 vid)
+{
+ struct net_bridge_fdb_entry *f;
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+
+ p = br_port_get_rtnl(dev);
+ if (!p)
+ return;
+
+ spin_lock_bh(&p->br->hash_lock);
+ hlist_for_each_entry(f, &p->br->fdb_list, fdb_node) {
+ if (f->dst == p && f->key.vlan_id == vid)
+ clear_bit(BR_FDB_OFFLOADED, &f->flags);
+ }
+ spin_unlock_bh(&p->br->hash_lock);
+}
+EXPORT_SYMBOL_GPL(br_fdb_clear_offload);
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 5372e2042adf..dea09096ad0f 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Forwarding decision
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/err.h>
@@ -29,30 +25,31 @@ static inline int should_deliver(const struct net_bridge_port *p,
vg = nbp_vlan_group_rcu(p);
return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) &&
- br_allowed_egress(vg, skb) && p->state == BR_STATE_FORWARDING &&
- nbp_switchdev_allowed_egress(p, skb) &&
+ (br_mst_is_enabled(p) || p->state == BR_STATE_FORWARDING) &&
+ br_allowed_egress(vg, skb) && nbp_switchdev_allowed_egress(p, skb) &&
!br_skb_isolated(p, skb);
}
int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ skb_push(skb, ETH_HLEN);
if (!is_skb_forwardable(skb->dev, skb))
goto drop;
- skb_push(skb, ETH_HLEN);
br_drop_fake_rtable(skb);
if (skb->ip_summed == CHECKSUM_PARTIAL &&
- (skb->protocol == htons(ETH_P_8021Q) ||
- skb->protocol == htons(ETH_P_8021AD))) {
+ eth_type_vlan(skb->protocol)) {
int depth;
- if (!__vlan_get_protocol(skb, skb->protocol, &depth))
+ if (!vlan_get_protocol_and_depth(skb, skb->protocol, &depth))
goto drop;
skb_set_network_header(skb, depth);
}
+ br_switchdev_frame_set_offload_fwd_mark(skb);
+
dev_queue_xmit(skb);
return 0;
@@ -65,6 +62,7 @@ EXPORT_SYMBOL_GPL(br_dev_queue_push_xmit);
int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ skb_clear_tstamp(skb);
return NF_HOOK(NFPROTO_BRIDGE, NF_BR_POST_ROUTING,
net, sk, skb, NULL, skb->dev,
br_dev_queue_push_xmit);
@@ -80,6 +78,11 @@ static void __br_forward(const struct net_bridge_port *to,
struct net *net;
int br_hook;
+ /* Mark the skb for forwarding offload early so that br_handle_vlan()
+ * can know whether to pop the VLAN header on egress or keep it.
+ */
+ nbp_switchdev_frame_mark_tx_fwd_offload(to, skb);
+
vg = nbp_vlan_group_rcu(to);
skb = br_handle_vlan(to->br, to, vg, skb);
if (!skb)
@@ -97,12 +100,11 @@ static void __br_forward(const struct net_bridge_port *to,
net = dev_net(indev);
} else {
if (unlikely(netpoll_tx_running(to->br->dev))) {
- if (!is_skb_forwardable(skb->dev, skb)) {
+ skb_push(skb, ETH_HLEN);
+ if (!is_skb_forwardable(skb->dev, skb))
kfree_skb(skb);
- } else {
- skb_push(skb, ETH_HLEN);
+ else
br_netpoll_send_skb(to, skb);
- }
return;
}
br_hook = NF_BR_LOCAL_OUT;
@@ -122,7 +124,7 @@ static int deliver_clone(const struct net_bridge_port *prev,
skb = skb_clone(skb, GFP_ATOMIC);
if (!skb) {
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
return -ENOMEM;
}
@@ -146,12 +148,14 @@ void br_forward(const struct net_bridge_port *to,
goto out;
/* redirect to backup link if the destination port is down */
- if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) {
+ if (rcu_access_pointer(to->backup_port) &&
+ (!netif_carrier_ok(to->dev) || !netif_running(to->dev))) {
struct net_bridge_port *backup_port;
backup_port = rcu_dereference(to->backup_port);
if (unlikely(!backup_port))
goto out;
+ BR_INPUT_SKB_CB(skb)->backup_nhid = READ_ONCE(to->backup_nhid);
to = backup_port;
}
@@ -173,30 +177,37 @@ static struct net_bridge_port *maybe_deliver(
struct net_bridge_port *prev, struct net_bridge_port *p,
struct sk_buff *skb, bool local_orig)
{
+ u8 igmp_type = br_multicast_igmp_type(skb);
int err;
if (!should_deliver(p, skb))
return prev;
+ nbp_switchdev_frame_mark_tx_fwd_to_hwdom(p, skb);
+
if (!prev)
goto out;
err = deliver_clone(prev, skb, local_orig);
if (err)
return ERR_PTR(err);
-
out:
+ br_multicast_count(p->br, p, skb, igmp_type, BR_MCAST_DIR_TX);
+
return p;
}
/* called under rcu_read_lock */
void br_flood(struct net_bridge *br, struct sk_buff *skb,
- enum br_pkt_type pkt_type, bool local_rcv, bool local_orig)
+ enum br_pkt_type pkt_type, bool local_rcv, bool local_orig,
+ u16 vid)
{
- u8 igmp_type = br_multicast_igmp_type(skb);
+ enum skb_drop_reason reason = SKB_DROP_REASON_NO_TX_TARGET;
struct net_bridge_port *prev = NULL;
struct net_bridge_port *p;
+ br_tc_skb_miss_set(skb, pkt_type != BR_PKT_BROADCAST);
+
list_for_each_entry_rcu(p, &br->port_list, list) {
/* Do not flood unicast traffic to ports that turn it off, nor
* other traffic if flood off, except for traffic we originate
@@ -219,16 +230,17 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
/* Do not flood to ports that enable proxy ARP */
if (p->flags & BR_PROXYARP)
continue;
- if ((p->flags & (BR_PROXYARP_WIFI | BR_NEIGH_SUPPRESS)) &&
- BR_INPUT_SKB_CB(skb)->proxyarp_replied)
+ if (BR_INPUT_SKB_CB(skb)->proxyarp_replied &&
+ ((p->flags & BR_PROXYARP_WIFI) ||
+ br_is_neigh_suppress_enabled(p, vid)))
continue;
prev = maybe_deliver(prev, p, skb, local_orig);
- if (IS_ERR(prev))
+ if (IS_ERR(prev)) {
+ reason = PTR_ERR(prev) == -ENOMEM ? SKB_DROP_REASON_NOMEM :
+ SKB_DROP_REASON_NOT_SPECIFIED;
goto out;
- if (prev == p)
- br_multicast_count(p->br, p, skb, igmp_type,
- BR_MCAST_DIR_TX);
+ }
}
if (!prev)
@@ -242,7 +254,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
out:
if (!local_rcv)
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
}
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
@@ -251,6 +263,7 @@ static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb,
{
struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
const unsigned char *src = eth_hdr(skb)->h_source;
+ struct sk_buff *nskb;
if (!should_deliver(p, skb))
return;
@@ -259,12 +272,16 @@ static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb,
if (skb->dev == p->dev && ether_addr_equal(src, addr))
return;
- skb = skb_copy(skb, GFP_ATOMIC);
- if (!skb) {
- dev->stats.tx_dropped++;
+ __skb_push(skb, ETH_HLEN);
+ nskb = pskb_copy(skb, GFP_ATOMIC);
+ __skb_pull(skb, ETH_HLEN);
+ if (!nskb) {
+ DEV_STATS_INC(dev, tx_dropped);
return;
}
+ skb = nskb;
+ __skb_pull(skb, ETH_HLEN);
if (!is_broadcast_ether_addr(addr))
memcpy(eth_hdr(skb)->h_dest, addr, ETH_ALEN);
@@ -274,22 +291,32 @@ static void maybe_deliver_addr(struct net_bridge_port *p, struct sk_buff *skb,
/* called with rcu_read_lock */
void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
struct sk_buff *skb,
+ struct net_bridge_mcast *brmctx,
bool local_rcv, bool local_orig)
{
- struct net_device *dev = BR_INPUT_SKB_CB(skb)->brdev;
- u8 igmp_type = br_multicast_igmp_type(skb);
- struct net_bridge *br = netdev_priv(dev);
+ enum skb_drop_reason reason = SKB_DROP_REASON_NO_TX_TARGET;
struct net_bridge_port *prev = NULL;
struct net_bridge_port_group *p;
+ bool allow_mode_include = true;
struct hlist_node *rp;
- rp = rcu_dereference(hlist_first_rcu(&br->router_list));
- p = mdst ? rcu_dereference(mdst->ports) : NULL;
+ rp = br_multicast_get_first_rport_node(brmctx, skb);
+
+ if (mdst) {
+ p = rcu_dereference(mdst->ports);
+ if (br_multicast_should_handle_mode(brmctx, mdst->addr.proto) &&
+ br_multicast_is_star_g(&mdst->addr))
+ allow_mode_include = false;
+ } else {
+ p = NULL;
+ br_tc_skb_miss_set(skb, true);
+ }
+
while (p || rp) {
struct net_bridge_port *port, *lport, *rport;
- lport = p ? p->port : NULL;
- rport = hlist_entry_safe(rp, struct net_bridge_port, rlist);
+ lport = p ? p->key.port : NULL;
+ rport = br_multicast_rport_from_node_skb(rp, skb);
if ((unsigned long)lport > (unsigned long)rport) {
port = lport;
@@ -299,18 +326,21 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
local_orig);
goto delivered;
}
+ if ((!allow_mode_include &&
+ p->filter_mode == MCAST_INCLUDE) ||
+ (p->flags & MDB_PG_FLAGS_BLOCKED))
+ goto delivered;
} else {
port = rport;
}
prev = maybe_deliver(prev, port, skb, local_orig);
-delivered:
- if (IS_ERR(prev))
+ if (IS_ERR(prev)) {
+ reason = PTR_ERR(prev) == -ENOMEM ? SKB_DROP_REASON_NOMEM :
+ SKB_DROP_REASON_NOT_SPECIFIED;
goto out;
- if (prev == port)
- br_multicast_count(port->br, port, skb, igmp_type,
- BR_MCAST_DIR_TX);
-
+ }
+delivered:
if ((unsigned long)lport >= (unsigned long)port)
p = rcu_dereference(p->next);
if ((unsigned long)rport >= (unsigned long)port)
@@ -328,6 +358,6 @@ delivered:
out:
if (!local_rcv)
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
}
#endif
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 0363f1bdc401..4c67a32745f6 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Userspace interface
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -44,12 +40,21 @@ static int port_cost(struct net_device *dev)
switch (ecmd.base.speed) {
case SPEED_10000:
return 2;
- case SPEED_1000:
+ case SPEED_5000:
+ return 3;
+ case SPEED_2500:
return 4;
+ case SPEED_1000:
+ return 5;
case SPEED_100:
return 19;
case SPEED_10:
return 100;
+ case SPEED_UNKNOWN:
+ return 100;
+ default:
+ if (ecmd.base.speed > SPEED_10000)
+ return 1;
}
}
@@ -161,8 +166,9 @@ void br_manage_promisc(struct net_bridge *br)
* This lets us disable promiscuous mode and write
* this config to hw.
*/
- if (br->auto_cnt == 0 ||
- (br->auto_cnt == 1 && br_auto_port(p)))
+ if ((p->dev->priv_flags & IFF_UNICAST_FLT) &&
+ (br->auto_cnt == 0 ||
+ (br->auto_cnt == 1 && br_auto_port(p))))
br_port_clear_promisc(p);
else
br_port_set_promisc(p);
@@ -179,7 +185,7 @@ int nbp_backup_change(struct net_bridge_port *p,
ASSERT_RTNL();
if (backup_dev) {
- if (!br_port_exists(backup_dev))
+ if (!netif_is_bridge_port(backup_dev))
return -ENOENT;
backup_p = br_port_get_rtnl(backup_dev);
@@ -257,14 +263,14 @@ static void release_nbp(struct kobject *kobj)
kfree(p);
}
-static void brport_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid)
+static void brport_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid)
{
struct net_bridge_port *p = kobj_to_brport(kobj);
net_ns_get_ownership(dev_net(p->dev), uid, gid);
}
-static struct kobj_type brport_ktype = {
+static const struct kobj_type brport_ktype = {
#ifdef CONFIG_SYSFS
.sysfs_ops = &brport_sysfs_ops,
#endif
@@ -278,7 +284,7 @@ static void destroy_nbp(struct net_bridge_port *p)
p->br = NULL;
p->dev = NULL;
- dev_put(dev);
+ netdev_put(dev, &p->dev_tracker);
kobject_put(&p->kobj);
}
@@ -337,6 +343,9 @@ static void del_nbp(struct net_bridge_port *p)
br_stp_disable_port(p);
spin_unlock_bh(&br->lock);
+ br_mrp_port_del(br, p);
+ br_cfm_port_del(br, p);
+
br_ifinfo_notify(RTM_DELLINK, NULL, p);
list_del_rcu(&p->list);
@@ -377,6 +386,7 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
del_nbp(p);
}
+ br_mst_uninit(br);
br_recalculate_neigh_suppress_enabled(br);
br_fdb_delete_by_port(br, NULL, 0, 1);
@@ -394,17 +404,16 @@ static int find_portno(struct net_bridge *br)
struct net_bridge_port *p;
unsigned long *inuse;
- inuse = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long),
- GFP_KERNEL);
+ inuse = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL);
if (!inuse)
return -ENOMEM;
- set_bit(0, inuse); /* zero is reserved */
- list_for_each_entry(p, &br->port_list, list) {
- set_bit(p->port_no, inuse);
- }
+ __set_bit(0, inuse); /* zero is reserved */
+ list_for_each_entry(p, &br->port_list, list)
+ __set_bit(p->port_no, inuse);
+
index = find_first_zero_bit(inuse, BR_MAX_PORTS);
- kfree(inuse);
+ bitmap_free(inuse);
return (index >= BR_MAX_PORTS) ? -EXFULL : index;
}
@@ -425,7 +434,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
return ERR_PTR(-ENOMEM);
p->br = br;
- dev_hold(dev);
+ netdev_hold(dev, &p->dev_tracker, GFP_KERNEL);
p->dev = dev;
p->path_cost = port_cost(dev);
p->priority = 0x8000 >> BR_PORT_BITS;
@@ -436,7 +445,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
br_stp_port_timer_init(p);
err = br_multicast_add_port(p);
if (err) {
- dev_put(dev);
+ netdev_put(dev, &p->dev_tracker);
kfree(p);
p = ERR_PTR(err);
}
@@ -458,7 +467,7 @@ int br_add_bridge(struct net *net, const char *name)
dev_net_set(dev, net);
dev->rtnl_link_ops = &br_link_ops;
- res = register_netdev(dev);
+ res = register_netdevice(dev);
if (res)
free_netdev(dev);
return res;
@@ -469,12 +478,11 @@ int br_del_bridge(struct net *net, const char *name)
struct net_device *dev;
int ret = 0;
- rtnl_lock();
dev = __dev_get_by_name(net, name);
if (dev == NULL)
ret = -ENXIO; /* Could not find device */
- else if (!(dev->priv_flags & IFF_EBRIDGE)) {
+ else if (!netif_is_bridge_master(dev)) {
/* Attempt to delete non bridge device! */
ret = -EPERM;
}
@@ -487,7 +495,6 @@ int br_del_bridge(struct net *net, const char *name)
else
br_dev_delete(dev, NULL);
- rtnl_unlock();
return ret;
}
@@ -509,28 +516,14 @@ void br_mtu_auto_adjust(struct net_bridge *br)
ASSERT_RTNL();
/* if the bridge MTU was manually configured don't mess with it */
- if (br->mtu_set_by_user)
+ if (br_opt_get(br, BROPT_MTU_SET_BY_USER))
return;
/* change to the minimum MTU and clear the flag which was set by
* the bridge ndo_change_mtu callback
*/
dev_set_mtu(br->dev, br_mtu_min(br));
- br->mtu_set_by_user = false;
-}
-
-static void br_set_gso_limits(struct net_bridge *br)
-{
- unsigned int gso_max_size = GSO_MAX_SIZE;
- u16 gso_max_segs = GSO_MAX_SEGS;
- const struct net_bridge_port *p;
-
- list_for_each_entry(p, &br->port_list, list) {
- gso_max_size = min(gso_max_size, p->dev->gso_max_size);
- gso_max_segs = min(gso_max_segs, p->dev->gso_max_segs);
- }
- br->dev->gso_max_size = gso_max_size;
- br->dev->gso_max_segs = gso_max_segs;
+ br_opt_toggle(br, BROPT_MTU_SET_BY_USER, false);
}
/*
@@ -564,18 +557,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
struct net_bridge_port *p;
int err = 0;
unsigned br_hr, dev_hr;
- bool changed_addr;
+ bool changed_addr, fdb_synced = false;
- /* Don't allow bridging non-ethernet like devices, or DSA-enabled
- * master network devices since the bridge layer rx_handler prevents
- * the DSA fake ethertype handler to be invoked, so we do not strip off
- * the DSA switch tag protocol header and the bridge layer just return
- * RX_HANDLER_CONSUMED, stopping RX processing for these frames.
- */
+ /* Don't allow bridging non-ethernet like devices. */
if ((dev->flags & IFF_LOOPBACK) ||
dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN ||
- !is_valid_ether_addr(dev->dev_addr) ||
- netdev_uses_dsa(dev))
+ !is_valid_ether_addr(dev->dev_addr))
return -EINVAL;
/* No bridging of bridges */
@@ -603,13 +590,17 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
call_netdevice_notifiers(NETDEV_JOIN, dev);
err = dev_set_allmulti(dev, 1);
- if (err)
- goto put_back;
+ if (err) {
+ br_multicast_del_port(p);
+ netdev_put(dev, &p->dev_tracker);
+ kfree(p); /* kobject not yet init'd, manually free */
+ goto err1;
+ }
err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj),
SYSFS_BRIDGE_PORT_ATTR);
if (err)
- goto err1;
+ goto err2;
err = br_sysfs_addif(p);
if (err)
@@ -619,7 +610,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
if (err)
goto err3;
- err = netdev_rx_handler_register(dev, br_handle_frame, p);
+ err = netdev_rx_handler_register(dev, br_get_rx_handler(dev), p);
if (err)
goto err4;
@@ -629,17 +620,24 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
if (err)
goto err5;
- err = nbp_switchdev_mark_set(p);
- if (err)
- goto err6;
-
dev_disable_lro(dev);
list_add_rcu(&p->list, &br->port_list);
nbp_update_port_count(br);
-
- netdev_update_features(br->dev);
+ if (!br_promisc_port(p) && (p->dev->priv_flags & IFF_UNICAST_FLT)) {
+ /* When updating the port count we also update all ports'
+ * promiscuous mode.
+ * A port leaving promiscuous mode normally gets the bridge's
+ * fdb synced to the unicast filter (if supported), however,
+ * `br_port_clear_promisc` does not distinguish between
+ * non-promiscuous ports and *new* ports, so we need to
+ * sync explicitly here.
+ */
+ fdb_synced = br_fdb_sync_static(br, p) == 0;
+ if (!fdb_synced)
+ netdev_err(dev, "failed to sync bridge static fdb addresses to this port\n");
+ }
br_hr = br->dev->needed_headroom;
dev_hr = netdev_get_fwd_headroom(dev);
@@ -648,13 +646,23 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
else
netdev_set_rx_headroom(dev, br_hr);
- if (br_fdb_insert(br, p, dev->dev_addr, 0))
+ if (br_fdb_add_local(br, p, dev->dev_addr, 0))
netdev_err(dev, "failed insert local address bridge forwarding table\n");
- err = nbp_vlan_init(p);
+ if (br->dev->addr_assign_type != NET_ADDR_SET) {
+ /* Ask for permission to use this MAC address now, even if we
+ * don't end up choosing it below.
+ */
+ err = netif_pre_changeaddr_notify(br->dev, dev->dev_addr,
+ extack);
+ if (err)
+ goto err6;
+ }
+
+ err = nbp_vlan_init(p, extack);
if (err) {
netdev_err(dev, "failed to initialize vlan filtering on this port\n");
- goto err7;
+ goto err6;
}
spin_lock_bh(&br->lock);
@@ -671,17 +679,19 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
br_mtu_auto_adjust(br);
- br_set_gso_limits(br);
+
+ netdev_compute_master_upper_features(br->dev, false);
kobject_uevent(&p->kobj, KOBJ_ADD);
return 0;
-err7:
+err6:
+ if (fdb_synced)
+ br_fdb_unsync_static(br, p);
list_del_rcu(&p->list);
br_fdb_delete_by_port(br, p, 0, 1);
nbp_update_port_count(br);
-err6:
netdev_upper_dev_unlink(dev, br->dev);
err5:
dev->priv_flags &= ~IFF_BRIDGE_PORT;
@@ -691,13 +701,11 @@ err4:
err3:
sysfs_remove_link(br->ifobj, p->dev->name);
err2:
+ br_multicast_del_port(p);
+ netdev_put(dev, &p->dev_tracker);
kobject_put(&p->kobj);
- p = NULL; /* kobject_put frees */
-err1:
dev_set_allmulti(dev, -1);
-put_back:
- dev_put(dev);
- kfree(p);
+err1:
return err;
}
@@ -718,7 +726,6 @@ int br_del_if(struct net_bridge *br, struct net_device *dev)
del_nbp(p);
br_mtu_auto_adjust(br);
- br_set_gso_limits(br);
spin_lock_bh(&br->lock);
changed_addr = br_stp_recalculate_bridge_id(br);
@@ -727,7 +734,7 @@ int br_del_if(struct net_bridge *br, struct net_device *dev)
if (changed_addr)
call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
- netdev_update_features(br->dev);
+ netdev_compute_master_upper_features(br->dev, false);
return 0;
}
@@ -739,6 +746,18 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask)
if (mask & BR_AUTO_MASK)
nbp_update_port_count(br);
- if (mask & BR_NEIGH_SUPPRESS)
+ if (mask & (BR_NEIGH_SUPPRESS | BR_NEIGH_VLAN_SUPPRESS))
br_recalculate_neigh_suppress_enabled(br);
}
+
+bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag)
+{
+ struct net_bridge_port *p;
+
+ p = br_port_get_rtnl_rcu(dev);
+ if (!p)
+ return false;
+
+ return p->flags & flag;
+}
+EXPORT_SYMBOL_GPL(br_port_flag_is_set);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 72074276c088..777fa869c1a1 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Handle incoming frames
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/slab.h>
@@ -16,17 +12,17 @@
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/netfilter_bridge.h>
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+#include <net/netfilter/nf_queue.h>
+#endif
#include <linux/neighbour.h>
#include <net/arp.h>
+#include <net/dsa.h>
#include <linux/export.h>
#include <linux/rculist.h>
#include "br_private.h"
#include "br_private_tunnel.h"
-/* Hook for brouter */
-br_should_route_hook_t __rcu *br_should_route_hook __read_mostly;
-EXPORT_SYMBOL(br_should_route_hook);
-
static int
br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb)
{
@@ -34,21 +30,24 @@ br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb)
return netif_receive_skb(skb);
}
-static int br_pass_frame_up(struct sk_buff *skb)
+static int br_pass_frame_up(struct sk_buff *skb, bool promisc)
{
struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev;
struct net_bridge *br = netdev_priv(brdev);
struct net_bridge_vlan_group *vg;
- struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats);
- u64_stats_update_begin(&brstats->syncp);
- brstats->rx_packets++;
- brstats->rx_bytes += skb->len;
- u64_stats_update_end(&brstats->syncp);
+ dev_sw_netstats_rx_add(brdev, skb->len);
vg = br_vlan_group_rcu(br);
+
+ /* Reset the offload_fwd_mark because there could be a stacked
+ * bridge above, and it should not think this bridge is doing
+ * that bridge's work forwarding out its ports.
+ */
+ br_switchdev_frame_unmark(skb);
+
/* Bridge is just like any other port. Make sure the
- * packet is allowed except in promisc modue when someone
+ * packet is allowed except in promisc mode when someone
* may be running packet capture.
*/
if (!(brdev->flags & IFF_PROMISC) &&
@@ -66,6 +65,8 @@ static int br_pass_frame_up(struct sk_buff *skb)
br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb),
BR_MCAST_DIR_TX);
+ BR_INPUT_SKB_CB(skb)->promisc = promisc;
+
return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
dev_net(indev), NULL, skb, indev, NULL,
br_netif_receive_skb);
@@ -74,44 +75,93 @@ static int br_pass_frame_up(struct sk_buff *skb)
/* note: already called with rcu_read_lock */
int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct net_bridge_port *p = br_port_get_rcu(skb->dev);
enum br_pkt_type pkt_type = BR_PKT_UNICAST;
struct net_bridge_fdb_entry *dst = NULL;
+ struct net_bridge_mcast_port *pmctx;
struct net_bridge_mdb_entry *mdst;
bool local_rcv, mcast_hit = false;
- const unsigned char *dest;
+ struct net_bridge_mcast *brmctx;
+ struct net_bridge_vlan *vlan;
struct net_bridge *br;
+ bool promisc;
u16 vid = 0;
+ u8 state;
- if (!p || p->state == BR_STATE_DISABLED)
+ if (!p)
goto drop;
- if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid))
+ br = p->br;
+
+ if (br_mst_is_enabled(p)) {
+ state = BR_STATE_FORWARDING;
+ } else {
+ if (p->state == BR_STATE_DISABLED) {
+ reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE;
+ goto drop;
+ }
+
+ state = p->state;
+ }
+
+ brmctx = &p->br->multicast_ctx;
+ pmctx = &p->multicast_ctx;
+ if (!br_allowed_ingress(p->br, nbp_vlan_group_rcu(p), skb, &vid,
+ &state, &vlan))
goto out;
+ if (p->flags & BR_PORT_LOCKED) {
+ struct net_bridge_fdb_entry *fdb_src =
+ br_fdb_find_rcu(br, eth_hdr(skb)->h_source, vid);
+
+ if (!fdb_src) {
+ /* FDB miss. Create locked FDB entry if MAB is enabled
+ * and drop the packet.
+ */
+ if (p->flags & BR_PORT_MAB)
+ br_fdb_update(br, p, eth_hdr(skb)->h_source,
+ vid, BIT(BR_FDB_LOCKED));
+ goto drop;
+ } else if (READ_ONCE(fdb_src->dst) != p ||
+ test_bit(BR_FDB_LOCAL, &fdb_src->flags)) {
+ /* FDB mismatch. Drop the packet without roaming. */
+ goto drop;
+ } else if (test_bit(BR_FDB_LOCKED, &fdb_src->flags)) {
+ /* FDB match, but entry is locked. Refresh it and drop
+ * the packet.
+ */
+ br_fdb_update(br, p, eth_hdr(skb)->h_source, vid,
+ BIT(BR_FDB_LOCKED));
+ goto drop;
+ }
+ }
+
nbp_switchdev_frame_mark(p, skb);
/* insert into forwarding database after filtering to avoid spoofing */
- br = p->br;
if (p->flags & BR_LEARNING)
- br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false);
+ br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0);
+
+ promisc = !!(br->dev->flags & IFF_PROMISC);
+ local_rcv = promisc;
- local_rcv = !!(br->dev->flags & IFF_PROMISC);
- dest = eth_hdr(skb)->h_dest;
- if (is_multicast_ether_addr(dest)) {
+ if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) {
/* by definition the broadcast is also a multicast address */
- if (is_broadcast_ether_addr(dest)) {
+ if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) {
pkt_type = BR_PKT_BROADCAST;
local_rcv = true;
} else {
pkt_type = BR_PKT_MULTICAST;
- if (br_multicast_rcv(br, p, skb, vid))
+ if (br_multicast_rcv(&brmctx, &pmctx, vlan, skb, vid))
goto drop;
}
}
- if (p->state == BR_STATE_LEARNING)
+ if (state == BR_STATE_LEARNING) {
+ reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE;
goto drop;
+ }
BR_INPUT_SKB_CB(skb)->brdev = br->dev;
BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED);
@@ -122,7 +172,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
br_do_proxy_suppress_arp(skb, br, vid, p);
} else if (IS_ENABLED(CONFIG_IPV6) &&
skb->protocol == htons(ETH_P_IPV6) &&
- br->neigh_suppress_enabled &&
+ br_opt_get(br, BROPT_NEIGH_SUPPRESS_ENABLED) &&
pskb_may_pull(skb, sizeof(struct ipv6hdr) +
sizeof(struct nd_msg)) &&
ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
@@ -135,22 +185,32 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
switch (pkt_type) {
case BR_PKT_MULTICAST:
- mdst = br_mdb_get(br, skb, vid);
+ mdst = br_mdb_entry_skb_get(brmctx, skb, vid);
if ((mdst || BR_INPUT_SKB_CB_MROUTERS_ONLY(skb)) &&
- br_multicast_querier_exists(br, eth_hdr(skb))) {
+ br_multicast_querier_exists(brmctx, eth_hdr(skb), mdst)) {
if ((mdst && mdst->host_joined) ||
- br_multicast_is_router(br)) {
+ br_multicast_is_router(brmctx, skb) ||
+ br->dev->flags & IFF_ALLMULTI) {
local_rcv = true;
- br->dev->stats.multicast++;
+ DEV_STATS_INC(br->dev, multicast);
}
mcast_hit = true;
} else {
local_rcv = true;
- br->dev->stats.multicast++;
+ DEV_STATS_INC(br->dev, multicast);
}
break;
case BR_PKT_UNICAST:
- dst = br_fdb_find_rcu(br, dest, vid);
+ dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid);
+ if (unlikely(!dst && vid &&
+ br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0))) {
+ dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, 0);
+ if (dst &&
+ (!test_bit(BR_FDB_LOCAL, &dst->flags) ||
+ test_bit(BR_FDB_ADDED_BY_USER, &dst->flags)))
+ dst = NULL;
+ }
+ break;
default:
break;
}
@@ -158,26 +218,26 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
if (dst) {
unsigned long now = jiffies;
- if (dst->is_local)
- return br_pass_frame_up(skb);
+ if (test_bit(BR_FDB_LOCAL, &dst->flags))
+ return br_pass_frame_up(skb, false);
if (now != dst->used)
dst->used = now;
br_forward(dst->dst, skb, local_rcv, false);
} else {
if (!mcast_hit)
- br_flood(br, skb, pkt_type, local_rcv, false);
+ br_flood(br, skb, pkt_type, local_rcv, false, vid);
else
- br_multicast_flood(mdst, skb, local_rcv, false);
+ br_multicast_flood(mdst, skb, brmctx, local_rcv, false);
}
if (local_rcv)
- return br_pass_frame_up(skb);
+ return br_pass_frame_up(skb, promisc);
out:
return 0;
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
goto out;
}
EXPORT_SYMBOL_GPL(br_handle_frame_finish);
@@ -188,19 +248,87 @@ static void __br_handle_local_finish(struct sk_buff *skb)
u16 vid = 0;
/* check if vlan is allowed, to avoid spoofing */
- if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid))
- br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false);
+ if ((p->flags & BR_LEARNING) &&
+ nbp_state_should_learn(p) &&
+ !br_opt_get(p->br, BROPT_NO_LL_LEARN) &&
+ br_should_learn(p, skb, &vid))
+ br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, 0);
}
/* note: already called with rcu_read_lock */
static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net_bridge_port *p = br_port_get_rcu(skb->dev);
-
__br_handle_local_finish(skb);
- BR_INPUT_SKB_CB(skb)->brdev = p->br->dev;
- br_pass_frame_up(skb);
+ /* Return 1 to signal that the okfn() was called, so it's ok to use the skb */
+ return 1;
+}
+
+static int nf_hook_bridge_pre(struct sk_buff *skb, struct sk_buff **pskb)
+{
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+ struct nf_hook_entries *e = NULL;
+ struct nf_hook_state state;
+ unsigned int verdict, i;
+ struct net *net;
+ int ret;
+
+ net = dev_net(skb->dev);
+#ifdef HAVE_JUMP_LABEL
+ if (!static_key_false(&nf_hooks_needed[NFPROTO_BRIDGE][NF_BR_PRE_ROUTING]))
+ goto frame_finish;
+#endif
+
+ e = rcu_dereference(net->nf.hooks_bridge[NF_BR_PRE_ROUTING]);
+ if (!e)
+ goto frame_finish;
+
+ nf_hook_state_init(&state, NF_BR_PRE_ROUTING,
+ NFPROTO_BRIDGE, skb->dev, NULL, NULL,
+ net, br_handle_frame_finish);
+
+ for (i = 0; i < e->num_hook_entries; i++) {
+ verdict = nf_hook_entry_hookfn(&e->hooks[i], skb, &state);
+ switch (verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ if (BR_INPUT_SKB_CB(skb)->br_netfilter_broute) {
+ *pskb = skb;
+ return RX_HANDLER_PASS;
+ }
+ break;
+ case NF_DROP:
+ kfree_skb(skb);
+ return RX_HANDLER_CONSUMED;
+ case NF_QUEUE:
+ ret = nf_queue(skb, &state, i, verdict);
+ if (ret == 1)
+ continue;
+ return RX_HANDLER_CONSUMED;
+ default: /* STOLEN */
+ return RX_HANDLER_CONSUMED;
+ }
+ }
+frame_finish:
+ net = dev_net(skb->dev);
+ br_handle_frame_finish(net, NULL, skb);
+#else
+ br_handle_frame_finish(dev_net(skb->dev), NULL, skb);
+#endif
+ return RX_HANDLER_CONSUMED;
+}
+
+/* Return 0 if the frame was not processed, otherwise 1
+ * note: already called with rcu_read_lock
+ */
+static int br_process_frame_type(struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+ struct br_frame_type *tmp;
+
+ hlist_for_each_entry_rcu(tmp, &p->br->frame_type_list, list)
+ if (unlikely(tmp->type == skb->protocol))
+ return tmp->frame_handler(p, skb);
+
return 0;
}
@@ -208,29 +336,31 @@ static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_bu
* Return NULL if skb is handled
* note: already called with rcu_read_lock
*/
-rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
+static rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct net_bridge_port *p;
struct sk_buff *skb = *pskb;
const unsigned char *dest = eth_hdr(skb)->h_dest;
- br_should_route_hook_t *rhook;
if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
return RX_HANDLER_PASS;
- if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
+ if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) {
+ reason = SKB_DROP_REASON_MAC_INVALID_SOURCE;
goto drop;
+ }
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb)
return RX_HANDLER_CONSUMED;
+ memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
+ br_tc_skb_miss_set(skb, false);
+
p = br_port_get_rcu(skb->dev);
- if (p->flags & BR_VLAN_TUNNEL) {
- if (br_handle_ingress_vlan_tunnel(skb, p,
- nbp_vlan_group_rcu(p)))
- goto drop;
- }
+ if (p->flags & BR_VLAN_TUNNEL)
+ br_handle_ingress_vlan_tunnel(skb, p, nbp_vlan_group_rcu(p));
if (unlikely(is_link_local_ether_addr(dest))) {
u16 fwd_mask = p->br->group_fwd_mask_required;
@@ -261,6 +391,7 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
return RX_HANDLER_PASS;
case 0x01: /* IEEE MAC (Pause) */
+ reason = SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL;
goto drop;
case 0x0E: /* 802.1AB LLDP */
@@ -278,35 +409,77 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
goto forward;
}
- /* Deliver packet to local host only */
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev),
- NULL, skb, skb->dev, NULL, br_handle_local_finish);
- return RX_HANDLER_CONSUMED;
+ BR_INPUT_SKB_CB(skb)->promisc = false;
+
+ /* The else clause should be hit when nf_hook():
+ * - returns < 0 (drop/error)
+ * - returns = 0 (stolen/nf_queue)
+ * Thus return 1 from the okfn() to signal the skb is ok to pass
+ */
+ if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
+ dev_net(skb->dev), NULL, skb, skb->dev, NULL,
+ br_handle_local_finish) == 1) {
+ return RX_HANDLER_PASS;
+ } else {
+ return RX_HANDLER_CONSUMED;
+ }
}
+ if (unlikely(br_process_frame_type(p, skb)))
+ return RX_HANDLER_PASS;
+
forward:
+ if (br_mst_is_enabled(p))
+ goto defer_stp_filtering;
+
switch (p->state) {
case BR_STATE_FORWARDING:
- rhook = rcu_dereference(br_should_route_hook);
- if (rhook) {
- if ((*rhook)(skb)) {
- *pskb = skb;
- return RX_HANDLER_PASS;
- }
- dest = eth_hdr(skb)->h_dest;
- }
- /* fall through */
case BR_STATE_LEARNING:
+defer_stp_filtering:
if (ether_addr_equal(p->br->dev->dev_addr, dest))
skb->pkt_type = PACKET_HOST;
- NF_HOOK(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING,
- dev_net(skb->dev), NULL, skb, skb->dev, NULL,
- br_handle_frame_finish);
- break;
+ return nf_hook_bridge_pre(skb, pskb);
default:
+ reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE;
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
}
return RX_HANDLER_CONSUMED;
}
+
+/* This function has no purpose other than to appease the br_port_get_rcu/rtnl
+ * helpers which identify bridged ports according to the rx_handler installed
+ * on them (so there _needs_ to be a bridge rx_handler even if we don't need it
+ * to do anything useful). This bridge won't support traffic to/from the stack,
+ * but only hardware bridging. So return RX_HANDLER_PASS so we don't steal
+ * frames from the ETH_P_XDSA packet_type handler.
+ */
+static rx_handler_result_t br_handle_frame_dummy(struct sk_buff **pskb)
+{
+ return RX_HANDLER_PASS;
+}
+
+rx_handler_func_t *br_get_rx_handler(const struct net_device *dev)
+{
+ if (netdev_uses_dsa(dev))
+ return br_handle_frame_dummy;
+
+ return br_handle_frame;
+}
+
+void br_add_frame(struct net_bridge *br, struct br_frame_type *ft)
+{
+ hlist_add_head_rcu(&ft->list, &br->frame_type_list);
+}
+
+void br_del_frame(struct net_bridge *br, struct br_frame_type *ft)
+{
+ struct br_frame_type *tmp;
+
+ hlist_for_each_entry(tmp, &br->frame_type_list, list)
+ if (ft == tmp) {
+ hlist_del_rcu(&ft->list);
+ return;
+ }
+}
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index 73b957fd639d..6bc0a11f2ed3 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -1,17 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Ioctl handler
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/capability.h>
+#include <linux/compat.h>
#include <linux/kernel.h>
#include <linux/if_bridge.h>
#include <linux/netdevice.h>
@@ -30,7 +27,7 @@ static int get_bridge_ifindices(struct net *net, int *indices, int num)
for_each_netdev_rcu(net, dev) {
if (i >= num)
break;
- if (dev->priv_flags & IFF_EBRIDGE)
+ if (netif_is_bridge_master(dev))
indices[i++] = dev->ifindex;
}
rcu_read_unlock();
@@ -75,7 +72,8 @@ static int get_fdb_entries(struct net_bridge *br, void __user *userbuf,
num = br_fdb_fillbuf(br, buf, maxnum, offset);
if (num > 0) {
- if (copy_to_user(userbuf, buf, num*sizeof(struct __fdb_entry)))
+ if (copy_to_user(userbuf, buf,
+ array_size(num, sizeof(struct __fdb_entry))))
num = -EFAULT;
}
kfree(buf);
@@ -105,20 +103,56 @@ static int add_del_if(struct net_bridge *br, int ifindex, int isadd)
return ret;
}
+#define BR_UARGS_MAX 4
+static int br_dev_read_uargs(unsigned long *args, size_t nr_args,
+ void __user **argp, void __user *data)
+{
+ int ret;
+
+ if (nr_args < 2 || nr_args > BR_UARGS_MAX)
+ return -EINVAL;
+
+ if (in_compat_syscall()) {
+ unsigned int cargs[BR_UARGS_MAX];
+ int i;
+
+ ret = copy_from_user(cargs, data, nr_args * sizeof(*cargs));
+ if (ret)
+ goto fault;
+
+ for (i = 0; i < nr_args; ++i)
+ args[i] = cargs[i];
+
+ *argp = compat_ptr(args[1]);
+ } else {
+ ret = copy_from_user(args, data, nr_args * sizeof(*args));
+ if (ret)
+ goto fault;
+ *argp = (void __user *)args[1];
+ }
+
+ return 0;
+fault:
+ return -EFAULT;
+}
+
/*
* Legacy ioctl's through SIOCDEVPRIVATE
- * This interface is deprecated because it was too difficult to
+ * This interface is deprecated because it was too difficult
* to do the translation for 32/64bit ioctl compatibility.
*/
-static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq,
+ void __user *data, int cmd)
{
struct net_bridge *br = netdev_priv(dev);
struct net_bridge_port *p = NULL;
unsigned long args[4];
- int ret = -EOPNOTSUPP;
+ void __user *argp;
+ int ret;
- if (copy_from_user(args, rq->ifr_data, sizeof(args)))
- return -EFAULT;
+ ret = br_dev_read_uargs(args, ARRAY_SIZE(args), &argp, data);
+ if (ret)
+ return ret;
switch (args[0]) {
case BRCTL_ADD_IF:
@@ -175,7 +209,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
return -ENOMEM;
get_port_ifindices(br, indices, num);
- if (copy_to_user((void __user *)args[1], indices, num*sizeof(int)))
+ if (copy_to_user(argp, indices, array_size(num, sizeof(int))))
num = -EFAULT;
kfree(indices);
return num;
@@ -236,7 +270,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
rcu_read_unlock();
- if (copy_to_user((void __user *)args[1], &p, sizeof(p)))
+ if (copy_to_user(argp, &p, sizeof(p)))
return -EFAULT;
return 0;
@@ -246,8 +280,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- br_stp_set_enabled(br, args[1]);
- ret = 0;
+ ret = br_stp_set_enabled(br, args[1], NULL);
break;
case BRCTL_SET_BRIDGE_PRIORITY:
@@ -287,8 +320,10 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
}
case BRCTL_GET_FDB_ENTRIES:
- return get_fdb_entries(br, (void __user *)args[1],
- args[2], args[3]);
+ return get_fdb_entries(br, argp, args[2], args[3]);
+
+ default:
+ ret = -EOPNOTSUPP;
}
if (!ret) {
@@ -301,12 +336,15 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
return ret;
}
-static int old_deviceless(struct net *net, void __user *uarg)
+static int old_deviceless(struct net *net, void __user *data)
{
unsigned long args[3];
+ void __user *argp;
+ int ret;
- if (copy_from_user(args, uarg, sizeof(args)))
- return -EFAULT;
+ ret = br_dev_read_uargs(args, ARRAY_SIZE(args), &argp, data);
+ if (ret)
+ return ret;
switch (args[0]) {
case BRCTL_GET_VERSION:
@@ -325,7 +363,8 @@ static int old_deviceless(struct net *net, void __user *uarg)
args[2] = get_bridge_ifindices(net, indices, args[2]);
- ret = copy_to_user((void __user *)args[1], indices, args[2]*sizeof(int))
+ ret = copy_to_user(argp, indices,
+ array_size(args[2], sizeof(int)))
? -EFAULT : args[2];
kfree(indices);
@@ -340,7 +379,7 @@ static int old_deviceless(struct net *net, void __user *uarg)
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- if (copy_from_user(buf, (void __user *)args[1], IFNAMSIZ))
+ if (copy_from_user(buf, argp, IFNAMSIZ))
return -EFAULT;
buf[IFNAMSIZ-1] = 0;
@@ -355,48 +394,77 @@ static int old_deviceless(struct net *net, void __user *uarg)
return -EOPNOTSUPP;
}
-int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
+int br_ioctl_stub(struct net *net, unsigned int cmd, void __user *uarg)
{
+ int ret = -EOPNOTSUPP;
+ struct ifreq ifr;
+
+ if (cmd == SIOCBRADDIF || cmd == SIOCBRDELIF) {
+ void __user *data;
+ char *colon;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (get_user_ifreq(&ifr, &data, uarg))
+ return -EFAULT;
+
+ ifr.ifr_name[IFNAMSIZ - 1] = 0;
+ colon = strchr(ifr.ifr_name, ':');
+ if (colon)
+ *colon = 0;
+ }
+
+ rtnl_lock();
+
switch (cmd) {
case SIOCGIFBR:
case SIOCSIFBR:
- return old_deviceless(net, uarg);
-
+ ret = old_deviceless(net, uarg);
+ break;
case SIOCBRADDBR:
case SIOCBRDELBR:
{
char buf[IFNAMSIZ];
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
- if (copy_from_user(buf, uarg, IFNAMSIZ))
- return -EFAULT;
+ if (copy_from_user(buf, uarg, IFNAMSIZ)) {
+ ret = -EFAULT;
+ break;
+ }
buf[IFNAMSIZ-1] = 0;
if (cmd == SIOCBRADDBR)
- return br_add_bridge(net, buf);
-
- return br_del_bridge(net, buf);
- }
+ ret = br_add_bridge(net, buf);
+ else
+ ret = br_del_bridge(net, buf);
}
- return -EOPNOTSUPP;
-}
-
-int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
-{
- struct net_bridge *br = netdev_priv(dev);
-
- switch (cmd) {
- case SIOCDEVPRIVATE:
- return old_dev_ioctl(dev, rq, cmd);
-
+ break;
case SIOCBRADDIF:
case SIOCBRDELIF:
- return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);
+ {
+ struct net_device *dev;
+
+ dev = __dev_get_by_name(net, ifr.ifr_name);
+ if (!dev || !netif_device_present(dev)) {
+ ret = -ENODEV;
+ break;
+ }
+ if (!netif_is_bridge_master(dev)) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+ ret = add_del_if(netdev_priv(dev), ifr.ifr_ifindex, cmd == SIOCBRADDIF);
+ }
+ break;
}
- br_debug(br, "Bridge does not support ioctl 0x%x\n", cmd);
- return -EOPNOTSUPP;
+ rtnl_unlock();
+
+ return ret;
}
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 6d9f48bd374a..400eb872b403 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -16,31 +16,109 @@
#include "br_private.h"
-static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
- struct net_device *dev)
+static bool
+br_ip4_rports_get_timer(struct net_bridge_mcast_port *pmctx,
+ unsigned long *timer)
{
- struct net_bridge *br = netdev_priv(dev);
- struct net_bridge_port *p;
+ *timer = br_timer_value(&pmctx->ip4_mc_router_timer);
+ return !hlist_unhashed(&pmctx->ip4_rlist);
+}
+
+static bool
+br_ip6_rports_get_timer(struct net_bridge_mcast_port *pmctx,
+ unsigned long *timer)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ *timer = br_timer_value(&pmctx->ip6_mc_router_timer);
+ return !hlist_unhashed(&pmctx->ip6_rlist);
+#else
+ *timer = 0;
+ return false;
+#endif
+}
+
+static size_t __br_rports_one_size(void)
+{
+ return nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PORT */
+ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_TIMER */
+ nla_total_size(sizeof(u8)) + /* MDBA_ROUTER_PATTR_TYPE */
+ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET_TIMER */
+ nla_total_size(sizeof(u32)) + /* MDBA_ROUTER_PATTR_INET6_TIMER */
+ nla_total_size(sizeof(u32)); /* MDBA_ROUTER_PATTR_VID */
+}
+
+size_t br_rports_size(const struct net_bridge_mcast *brmctx)
+{
+ struct net_bridge_mcast_port *pmctx;
+ size_t size = nla_total_size(0); /* MDBA_ROUTER */
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(pmctx, &brmctx->ip4_mc_router_list,
+ ip4_rlist)
+ size += __br_rports_one_size();
+
+#if IS_ENABLED(CONFIG_IPV6)
+ hlist_for_each_entry_rcu(pmctx, &brmctx->ip6_mc_router_list,
+ ip6_rlist)
+ size += __br_rports_one_size();
+#endif
+ rcu_read_unlock();
+
+ return size;
+}
+
+int br_rports_fill_info(struct sk_buff *skb,
+ const struct net_bridge_mcast *brmctx)
+{
+ u16 vid = brmctx->vlan ? brmctx->vlan->vid : 0;
+ bool have_ip4_mc_rtr, have_ip6_mc_rtr;
+ unsigned long ip4_timer, ip6_timer;
struct nlattr *nest, *port_nest;
+ struct net_bridge_port *p;
- if (!br->multicast_router || hlist_empty(&br->router_list))
+ if (!brmctx->multicast_router || !br_rports_have_mc_router(brmctx))
return 0;
- nest = nla_nest_start(skb, MDBA_ROUTER);
+ nest = nla_nest_start_noflag(skb, MDBA_ROUTER);
if (nest == NULL)
return -EMSGSIZE;
- hlist_for_each_entry_rcu(p, &br->router_list, rlist) {
- if (!p)
+ list_for_each_entry_rcu(p, &brmctx->br->port_list, list) {
+ struct net_bridge_mcast_port *pmctx;
+
+ if (vid) {
+ struct net_bridge_vlan *v;
+
+ v = br_vlan_find(nbp_vlan_group(p), vid);
+ if (!v)
+ continue;
+ pmctx = &v->port_mcast_ctx;
+ } else {
+ pmctx = &p->multicast_ctx;
+ }
+
+ have_ip4_mc_rtr = br_ip4_rports_get_timer(pmctx, &ip4_timer);
+ have_ip6_mc_rtr = br_ip6_rports_get_timer(pmctx, &ip6_timer);
+
+ if (!have_ip4_mc_rtr && !have_ip6_mc_rtr)
continue;
- port_nest = nla_nest_start(skb, MDBA_ROUTER_PORT);
+
+ port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT);
if (!port_nest)
goto fail;
+
if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) ||
nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER,
- br_timer_value(&p->multicast_router_timer)) ||
+ max(ip4_timer, ip6_timer)) ||
nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE,
- p->multicast_router)) {
+ p->multicast_ctx.multicast_router) ||
+ (have_ip4_mc_rtr &&
+ nla_put_u32(skb, MDBA_ROUTER_PATTR_INET_TIMER,
+ ip4_timer)) ||
+ (have_ip6_mc_rtr &&
+ nla_put_u32(skb, MDBA_ROUTER_PATTR_INET6_TIMER,
+ ip6_timer)) ||
+ (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid))) {
nla_nest_cancel(skb, port_nest);
goto fail;
}
@@ -60,168 +138,291 @@ static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags)
e->flags = 0;
if (flags & MDB_PG_FLAGS_OFFLOAD)
e->flags |= MDB_FLAGS_OFFLOAD;
+ if (flags & MDB_PG_FLAGS_FAST_LEAVE)
+ e->flags |= MDB_FLAGS_FAST_LEAVE;
+ if (flags & MDB_PG_FLAGS_STAR_EXCL)
+ e->flags |= MDB_FLAGS_STAR_EXCL;
+ if (flags & MDB_PG_FLAGS_BLOCKED)
+ e->flags |= MDB_FLAGS_BLOCKED;
+ if (flags & MDB_PG_FLAGS_OFFLOAD_FAILED)
+ e->flags |= MDB_FLAGS_OFFLOAD_FAILED;
}
-static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip)
+static void __mdb_entry_to_br_ip(struct br_mdb_entry *entry, struct br_ip *ip,
+ struct nlattr **mdb_attrs)
{
memset(ip, 0, sizeof(struct br_ip));
ip->vid = entry->vid;
ip->proto = entry->addr.proto;
- if (ip->proto == htons(ETH_P_IP))
- ip->u.ip4 = entry->addr.u.ip4;
+ switch (ip->proto) {
+ case htons(ETH_P_IP):
+ ip->dst.ip4 = entry->addr.u.ip4;
+ if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE])
+ ip->src.ip4 = nla_get_in_addr(mdb_attrs[MDBE_ATTR_SOURCE]);
+ break;
#if IS_ENABLED(CONFIG_IPV6)
- else
- ip->u.ip6 = entry->addr.u.ip6;
+ case htons(ETH_P_IPV6):
+ ip->dst.ip6 = entry->addr.u.ip6;
+ if (mdb_attrs && mdb_attrs[MDBE_ATTR_SOURCE])
+ ip->src.ip6 = nla_get_in6_addr(mdb_attrs[MDBE_ATTR_SOURCE]);
+ break;
+#endif
+ default:
+ ether_addr_copy(ip->dst.mac_addr, entry->addr.u.mac_addr);
+ }
+
+}
+
+static int __mdb_fill_srcs(struct sk_buff *skb,
+ struct net_bridge_port_group *p)
+{
+ struct net_bridge_group_src *ent;
+ struct nlattr *nest, *nest_ent;
+
+ if (hlist_empty(&p->src_list))
+ return 0;
+
+ nest = nla_nest_start(skb, MDBA_MDB_EATTR_SRC_LIST);
+ if (!nest)
+ return -EMSGSIZE;
+
+ hlist_for_each_entry_rcu(ent, &p->src_list, node,
+ lockdep_is_held(&p->key.port->br->multicast_lock)) {
+ nest_ent = nla_nest_start(skb, MDBA_MDB_SRCLIST_ENTRY);
+ if (!nest_ent)
+ goto out_cancel_err;
+ switch (ent->addr.proto) {
+ case htons(ETH_P_IP):
+ if (nla_put_in_addr(skb, MDBA_MDB_SRCATTR_ADDRESS,
+ ent->addr.src.ip4)) {
+ nla_nest_cancel(skb, nest_ent);
+ goto out_cancel_err;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ if (nla_put_in6_addr(skb, MDBA_MDB_SRCATTR_ADDRESS,
+ &ent->addr.src.ip6)) {
+ nla_nest_cancel(skb, nest_ent);
+ goto out_cancel_err;
+ }
+ break;
+#endif
+ default:
+ nla_nest_cancel(skb, nest_ent);
+ continue;
+ }
+ if (nla_put_u32(skb, MDBA_MDB_SRCATTR_TIMER,
+ br_timer_value(&ent->timer))) {
+ nla_nest_cancel(skb, nest_ent);
+ goto out_cancel_err;
+ }
+ nla_nest_end(skb, nest_ent);
+ }
+
+ nla_nest_end(skb, nest);
+
+ return 0;
+
+out_cancel_err:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int __mdb_fill_info(struct sk_buff *skb,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *p)
+{
+ bool dump_srcs_mode = false;
+ struct timer_list *mtimer;
+ struct nlattr *nest_ent;
+ struct br_mdb_entry e;
+ u8 flags = 0;
+ int ifindex;
+
+ memset(&e, 0, sizeof(e));
+ if (p) {
+ ifindex = p->key.port->dev->ifindex;
+ mtimer = &p->timer;
+ flags = p->flags;
+ } else {
+ ifindex = mp->br->dev->ifindex;
+ mtimer = &mp->timer;
+ }
+
+ __mdb_entry_fill_flags(&e, flags);
+ e.ifindex = ifindex;
+ e.vid = mp->addr.vid;
+ if (mp->addr.proto == htons(ETH_P_IP)) {
+ e.addr.u.ip4 = mp->addr.dst.ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (mp->addr.proto == htons(ETH_P_IPV6)) {
+ e.addr.u.ip6 = mp->addr.dst.ip6;
+#endif
+ } else {
+ ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr);
+ e.state = MDB_PERMANENT;
+ }
+ e.addr.proto = mp->addr.proto;
+ nest_ent = nla_nest_start_noflag(skb,
+ MDBA_MDB_ENTRY_INFO);
+ if (!nest_ent)
+ return -EMSGSIZE;
+
+ if (nla_put_nohdr(skb, sizeof(e), &e) ||
+ nla_put_u32(skb,
+ MDBA_MDB_EATTR_TIMER,
+ br_timer_value(mtimer)))
+ goto nest_err;
+
+ switch (mp->addr.proto) {
+ case htons(ETH_P_IP):
+ dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_igmp_version == 3);
+ if (mp->addr.src.ip4) {
+ if (nla_put_in_addr(skb, MDBA_MDB_EATTR_SOURCE,
+ mp->addr.src.ip4))
+ goto nest_err;
+ break;
+ }
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ dump_srcs_mode = !!(mp->br->multicast_ctx.multicast_mld_version == 2);
+ if (!ipv6_addr_any(&mp->addr.src.ip6)) {
+ if (nla_put_in6_addr(skb, MDBA_MDB_EATTR_SOURCE,
+ &mp->addr.src.ip6))
+ goto nest_err;
+ break;
+ }
+ break;
#endif
+ default:
+ ether_addr_copy(e.addr.u.mac_addr, mp->addr.dst.mac_addr);
+ }
+ if (p) {
+ if (nla_put_u8(skb, MDBA_MDB_EATTR_RTPROT, p->rt_protocol))
+ goto nest_err;
+ if (dump_srcs_mode &&
+ (__mdb_fill_srcs(skb, p) ||
+ nla_put_u8(skb, MDBA_MDB_EATTR_GROUP_MODE,
+ p->filter_mode)))
+ goto nest_err;
+ }
+ nla_nest_end(skb, nest_ent);
+
+ return 0;
+
+nest_err:
+ nla_nest_cancel(skb, nest_ent);
+ return -EMSGSIZE;
}
static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
struct net_device *dev)
{
+ int idx = 0, s_idx = cb->args[1], err = 0, pidx = 0, s_pidx = cb->args[2];
struct net_bridge *br = netdev_priv(dev);
- struct net_bridge_mdb_htable *mdb;
+ struct net_bridge_mdb_entry *mp;
struct nlattr *nest, *nest2;
- int i, err = 0;
- int idx = 0, s_idx = cb->args[1];
-
- if (br->multicast_disabled)
- return 0;
-
- mdb = rcu_dereference(br->mdb);
- if (!mdb)
- return 0;
- nest = nla_nest_start(skb, MDBA_MDB);
+ nest = nla_nest_start_noflag(skb, MDBA_MDB);
if (nest == NULL)
return -EMSGSIZE;
- for (i = 0; i < mdb->max; i++) {
- struct net_bridge_mdb_entry *mp;
+ hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) {
struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
- struct net_bridge_port *port;
- hlist_for_each_entry_rcu(mp, &mdb->mhash[i], hlist[mdb->ver]) {
- if (idx < s_idx)
- goto skip;
+ if (idx < s_idx)
+ goto skip;
- nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY);
- if (nest2 == NULL) {
- err = -EMSGSIZE;
- goto out;
+ nest2 = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY);
+ if (!nest2) {
+ err = -EMSGSIZE;
+ break;
+ }
+
+ if (!s_pidx && mp->host_joined) {
+ err = __mdb_fill_info(skb, mp, NULL);
+ if (err) {
+ nla_nest_cancel(skb, nest2);
+ break;
}
+ }
- for (pp = &mp->ports;
- (p = rcu_dereference(*pp)) != NULL;
- pp = &p->next) {
- struct nlattr *nest_ent;
- struct br_mdb_entry e;
-
- port = p->port;
- if (!port)
- continue;
-
- memset(&e, 0, sizeof(e));
- e.ifindex = port->dev->ifindex;
- e.vid = p->addr.vid;
- __mdb_entry_fill_flags(&e, p->flags);
- if (p->addr.proto == htons(ETH_P_IP))
- e.addr.u.ip4 = p->addr.u.ip4;
-#if IS_ENABLED(CONFIG_IPV6)
- if (p->addr.proto == htons(ETH_P_IPV6))
- e.addr.u.ip6 = p->addr.u.ip6;
-#endif
- e.addr.proto = p->addr.proto;
- nest_ent = nla_nest_start(skb,
- MDBA_MDB_ENTRY_INFO);
- if (!nest_ent) {
- nla_nest_cancel(skb, nest2);
- err = -EMSGSIZE;
- goto out;
- }
- if (nla_put_nohdr(skb, sizeof(e), &e) ||
- nla_put_u32(skb,
- MDBA_MDB_EATTR_TIMER,
- br_timer_value(&p->timer))) {
- nla_nest_cancel(skb, nest_ent);
- nla_nest_cancel(skb, nest2);
- err = -EMSGSIZE;
- goto out;
- }
- nla_nest_end(skb, nest_ent);
+ for (pp = &mp->ports; (p = rcu_dereference(*pp)) != NULL;
+ pp = &p->next) {
+ if (!p->key.port)
+ continue;
+ if (pidx < s_pidx)
+ goto skip_pg;
+
+ err = __mdb_fill_info(skb, mp, p);
+ if (err) {
+ nla_nest_end(skb, nest2);
+ goto out;
}
- nla_nest_end(skb, nest2);
- skip:
- idx++;
+skip_pg:
+ pidx++;
}
+ pidx = 0;
+ s_pidx = 0;
+ nla_nest_end(skb, nest2);
+skip:
+ idx++;
}
out:
cb->args[1] = idx;
+ cb->args[2] = pidx;
nla_nest_end(skb, nest);
return err;
}
-static int br_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+int br_mdb_dump(struct net_device *dev, struct sk_buff *skb,
+ struct netlink_callback *cb)
{
- struct net_device *dev;
- struct net *net = sock_net(skb->sk);
- struct nlmsghdr *nlh = NULL;
- int idx = 0, s_idx;
-
- s_idx = cb->args[0];
-
- rcu_read_lock();
-
- /* In theory this could be wrapped to 0... */
- cb->seq = net->dev_base_seq + br_mdb_rehash_seq;
-
- for_each_netdev_rcu(net, dev) {
- if (dev->priv_flags & IFF_EBRIDGE) {
- struct br_port_msg *bpm;
+ struct net_bridge *br = netdev_priv(dev);
+ struct br_port_msg *bpm;
+ struct nlmsghdr *nlh;
+ int err;
- if (idx < s_idx)
- goto skip;
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, RTM_GETMDB, sizeof(*bpm),
+ NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
- nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, RTM_GETMDB,
- sizeof(*bpm), NLM_F_MULTI);
- if (nlh == NULL)
- break;
+ bpm = nlmsg_data(nlh);
+ memset(bpm, 0, sizeof(*bpm));
+ bpm->ifindex = dev->ifindex;
- bpm = nlmsg_data(nlh);
- memset(bpm, 0, sizeof(*bpm));
- bpm->ifindex = dev->ifindex;
- if (br_mdb_fill_info(skb, cb, dev) < 0)
- goto out;
- if (br_rports_fill_info(skb, cb, dev) < 0)
- goto out;
+ rcu_read_lock();
- cb->args[1] = 0;
- nlmsg_end(skb, nlh);
- skip:
- idx++;
- }
- }
+ err = br_mdb_fill_info(skb, cb, dev);
+ if (err)
+ goto out;
+ err = br_rports_fill_info(skb, &br->multicast_ctx);
+ if (err)
+ goto out;
out:
- if (nlh)
- nlmsg_end(skb, nlh);
rcu_read_unlock();
- cb->args[0] = idx;
- return skb->len;
+ nlmsg_end(skb, nlh);
+ return err;
}
static int nlmsg_populate_mdb_fill(struct sk_buff *skb,
struct net_device *dev,
- struct br_mdb_entry *entry, u32 pid,
- u32 seq, int type, unsigned int flags)
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type)
{
struct nlmsghdr *nlh;
struct br_port_msg *bpm;
struct nlattr *nest, *nest2;
- nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0);
+ nlh = nlmsg_put(skb, 0, 0, type, sizeof(*bpm), 0);
if (!nlh)
return -EMSGSIZE;
@@ -229,14 +430,14 @@ static int nlmsg_populate_mdb_fill(struct sk_buff *skb,
memset(bpm, 0, sizeof(*bpm));
bpm->family = AF_BRIDGE;
bpm->ifindex = dev->ifindex;
- nest = nla_nest_start(skb, MDBA_MDB);
+ nest = nla_nest_start_noflag(skb, MDBA_MDB);
if (nest == NULL)
goto cancel;
- nest2 = nla_nest_start(skb, MDBA_MDB_ENTRY);
+ nest2 = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY);
if (nest2 == NULL)
goto end;
- if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(*entry), entry))
+ if (__mdb_fill_info(skb, mp, pg))
goto end;
nla_nest_end(skb, nest2);
@@ -251,134 +452,90 @@ cancel:
return -EMSGSIZE;
}
-static inline size_t rtnl_mdb_nlmsg_size(void)
+static size_t rtnl_mdb_nlmsg_pg_size(const struct net_bridge_port_group *pg)
{
- return NLMSG_ALIGN(sizeof(struct br_port_msg))
- + nla_total_size(sizeof(struct br_mdb_entry));
-}
-
-struct br_mdb_complete_info {
- struct net_bridge_port *port;
- struct br_ip ip;
-};
-
-static void br_mdb_complete(struct net_device *dev, int err, void *priv)
-{
- struct br_mdb_complete_info *data = priv;
- struct net_bridge_port_group __rcu **pp;
- struct net_bridge_port_group *p;
- struct net_bridge_mdb_htable *mdb;
- struct net_bridge_mdb_entry *mp;
- struct net_bridge_port *port = data->port;
- struct net_bridge *br = port->br;
+ struct net_bridge_group_src *ent;
+ size_t nlmsg_size, addr_size = 0;
- if (err)
- goto err;
+ /* MDBA_MDB_ENTRY_INFO */
+ nlmsg_size = nla_total_size(sizeof(struct br_mdb_entry)) +
+ /* MDBA_MDB_EATTR_TIMER */
+ nla_total_size(sizeof(u32));
- spin_lock_bh(&br->multicast_lock);
- mdb = mlock_dereference(br->mdb, br);
- mp = br_mdb_ip_get(mdb, &data->ip);
- if (!mp)
+ if (!pg)
goto out;
- for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;
- pp = &p->next) {
- if (p->port != port)
- continue;
- p->flags |= MDB_PG_FLAGS_OFFLOAD;
- }
-out:
- spin_unlock_bh(&br->multicast_lock);
-err:
- kfree(priv);
-}
-static void br_mdb_switchdev_host_port(struct net_device *dev,
- struct net_device *lower_dev,
- struct br_mdb_entry *entry, int type)
-{
- struct switchdev_obj_port_mdb mdb = {
- .obj = {
- .id = SWITCHDEV_OBJ_ID_HOST_MDB,
- .flags = SWITCHDEV_F_DEFER,
- },
- .vid = entry->vid,
- };
-
- if (entry->addr.proto == htons(ETH_P_IP))
- ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+ /* MDBA_MDB_EATTR_RTPROT */
+ nlmsg_size += nla_total_size(sizeof(u8));
+
+ switch (pg->key.addr.proto) {
+ case htons(ETH_P_IP):
+ /* MDBA_MDB_EATTR_SOURCE */
+ if (pg->key.addr.src.ip4)
+ nlmsg_size += nla_total_size(sizeof(__be32));
+ if (pg->key.port->br->multicast_ctx.multicast_igmp_version == 2)
+ goto out;
+ addr_size = sizeof(__be32);
+ break;
#if IS_ENABLED(CONFIG_IPV6)
- else
- ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+ case htons(ETH_P_IPV6):
+ /* MDBA_MDB_EATTR_SOURCE */
+ if (!ipv6_addr_any(&pg->key.addr.src.ip6))
+ nlmsg_size += nla_total_size(sizeof(struct in6_addr));
+ if (pg->key.port->br->multicast_ctx.multicast_mld_version == 1)
+ goto out;
+ addr_size = sizeof(struct in6_addr);
+ break;
#endif
+ }
- mdb.obj.orig_dev = dev;
- switch (type) {
- case RTM_NEWMDB:
- switchdev_port_obj_add(lower_dev, &mdb.obj);
- break;
- case RTM_DELMDB:
- switchdev_port_obj_del(lower_dev, &mdb.obj);
- break;
+ /* MDBA_MDB_EATTR_GROUP_MODE */
+ nlmsg_size += nla_total_size(sizeof(u8));
+
+ /* MDBA_MDB_EATTR_SRC_LIST nested attr */
+ if (!hlist_empty(&pg->src_list))
+ nlmsg_size += nla_total_size(0);
+
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ /* MDBA_MDB_SRCLIST_ENTRY nested attr +
+ * MDBA_MDB_SRCATTR_ADDRESS + MDBA_MDB_SRCATTR_TIMER
+ */
+ nlmsg_size += nla_total_size(0) +
+ nla_total_size(addr_size) +
+ nla_total_size(sizeof(u32));
}
+out:
+ return nlmsg_size;
}
-static void br_mdb_switchdev_host(struct net_device *dev,
- struct br_mdb_entry *entry, int type)
+static size_t rtnl_mdb_nlmsg_size(const struct net_bridge_port_group *pg)
{
- struct net_device *lower_dev;
- struct list_head *iter;
-
- netdev_for_each_lower_dev(dev, lower_dev, iter)
- br_mdb_switchdev_host_port(dev, lower_dev, entry, type);
+ return NLMSG_ALIGN(sizeof(struct br_port_msg)) +
+ /* MDBA_MDB */
+ nla_total_size(0) +
+ /* MDBA_MDB_ENTRY */
+ nla_total_size(0) +
+ /* Port group entry */
+ rtnl_mdb_nlmsg_pg_size(pg);
}
-static void __br_mdb_notify(struct net_device *dev, struct net_bridge_port *p,
- struct br_mdb_entry *entry, int type)
+static void __br_mdb_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type, bool notify_switchdev)
{
- struct br_mdb_complete_info *complete_info;
- struct switchdev_obj_port_mdb mdb = {
- .obj = {
- .id = SWITCHDEV_OBJ_ID_PORT_MDB,
- .flags = SWITCHDEV_F_DEFER,
- },
- .vid = entry->vid,
- };
- struct net_device *port_dev;
struct net *net = dev_net(dev);
struct sk_buff *skb;
int err = -ENOBUFS;
- port_dev = __dev_get_by_index(net, entry->ifindex);
- if (entry->addr.proto == htons(ETH_P_IP))
- ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
-#if IS_ENABLED(CONFIG_IPV6)
- else
- ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
-#endif
+ if (notify_switchdev)
+ br_switchdev_mdb_notify(dev, mp, pg, type);
- mdb.obj.orig_dev = port_dev;
- if (p && port_dev && type == RTM_NEWMDB) {
- complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC);
- if (complete_info) {
- complete_info->port = p;
- __mdb_entry_to_br_ip(entry, &complete_info->ip);
- mdb.obj.complete_priv = complete_info;
- mdb.obj.complete = br_mdb_complete;
- if (switchdev_port_obj_add(port_dev, &mdb.obj))
- kfree(complete_info);
- }
- } else if (p && port_dev && type == RTM_DELMDB) {
- switchdev_port_obj_del(port_dev, &mdb.obj);
- }
-
- if (!p)
- br_mdb_switchdev_host(dev, entry, type);
-
- skb = nlmsg_new(rtnl_mdb_nlmsg_size(), GFP_ATOMIC);
+ skb = nlmsg_new(rtnl_mdb_nlmsg_size(pg), GFP_ATOMIC);
if (!skb)
goto errout;
- err = nlmsg_populate_mdb_fill(skb, dev, entry, 0, 0, type, NTF_SELF);
+ err = nlmsg_populate_mdb_fill(skb, dev, mp, pg, type);
if (err < 0) {
kfree_skb(skb);
goto errout;
@@ -390,36 +547,31 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_MDB, err);
}
-void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
- struct br_ip *group, int type, u8 flags)
+void br_mdb_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type)
{
- struct br_mdb_entry entry;
+ __br_mdb_notify(dev, mp, pg, type, true);
+}
- memset(&entry, 0, sizeof(entry));
- if (port)
- entry.ifindex = port->dev->ifindex;
- else
- entry.ifindex = dev->ifindex;
- entry.addr.proto = group->proto;
- entry.addr.u.ip4 = group->u.ip4;
-#if IS_ENABLED(CONFIG_IPV6)
- entry.addr.u.ip6 = group->u.ip6;
-#endif
- entry.vid = group->vid;
- __mdb_entry_fill_flags(&entry, flags);
- __br_mdb_notify(dev, port, &entry, type);
+void br_mdb_flag_change_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg)
+{
+ __br_mdb_notify(dev, mp, pg, RTM_NEWMDB, false);
}
static int nlmsg_populate_rtr_fill(struct sk_buff *skb,
struct net_device *dev,
- int ifindex, u32 pid,
+ int ifindex, u16 vid, u32 pid,
u32 seq, int type, unsigned int flags)
{
+ struct nlattr *nest, *port_nest;
struct br_port_msg *bpm;
struct nlmsghdr *nlh;
- struct nlattr *nest;
- nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), NLM_F_MULTI);
+ nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), 0);
if (!nlh)
return -EMSGSIZE;
@@ -427,12 +579,22 @@ static int nlmsg_populate_rtr_fill(struct sk_buff *skb,
memset(bpm, 0, sizeof(*bpm));
bpm->family = AF_BRIDGE;
bpm->ifindex = dev->ifindex;
- nest = nla_nest_start(skb, MDBA_ROUTER);
+ nest = nla_nest_start_noflag(skb, MDBA_ROUTER);
if (!nest)
goto cancel;
- if (nla_put_u32(skb, MDBA_ROUTER_PORT, ifindex))
+ port_nest = nla_nest_start_noflag(skb, MDBA_ROUTER_PORT);
+ if (!port_nest)
+ goto end;
+ if (nla_put_nohdr(skb, sizeof(u32), &ifindex)) {
+ nla_nest_cancel(skb, port_nest);
+ goto end;
+ }
+ if (vid && nla_put_u16(skb, MDBA_ROUTER_PATTR_VID, vid)) {
+ nla_nest_cancel(skb, port_nest);
goto end;
+ }
+ nla_nest_end(skb, port_nest);
nla_nest_end(skb, nest);
nlmsg_end(skb, nlh);
@@ -448,23 +610,28 @@ cancel:
static inline size_t rtnl_rtr_nlmsg_size(void)
{
return NLMSG_ALIGN(sizeof(struct br_port_msg))
- + nla_total_size(sizeof(__u32));
+ + nla_total_size(sizeof(__u32))
+ + nla_total_size(sizeof(u16));
}
-void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
+void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx,
int type)
{
struct net *net = dev_net(dev);
struct sk_buff *skb;
int err = -ENOBUFS;
int ifindex;
+ u16 vid;
- ifindex = port ? port->dev->ifindex : 0;
+ ifindex = pmctx ? pmctx->port->dev->ifindex : 0;
+ vid = pmctx && br_multicast_port_ctx_is_vlan(pmctx) ? pmctx->vlan->vid :
+ 0;
skb = nlmsg_new(rtnl_rtr_nlmsg_size(), GFP_ATOMIC);
if (!skb)
goto errout;
- err = nlmsg_populate_rtr_fill(skb, dev, ifindex, 0, 0, type, NTF_SELF);
+ err = nlmsg_populate_rtr_fill(skb, dev, ifindex, vid, 0, 0, type,
+ NTF_SELF);
if (err < 0) {
kfree_skb(skb);
goto errout;
@@ -477,233 +644,750 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_MDB, err);
}
-static bool is_valid_mdb_entry(struct br_mdb_entry *entry)
-{
- if (entry->ifindex == 0)
- return false;
+static const struct nla_policy
+br_mdbe_src_list_entry_pol[MDBE_SRCATTR_MAX + 1] = {
+ [MDBE_SRCATTR_ADDRESS] = NLA_POLICY_RANGE(NLA_BINARY,
+ sizeof(struct in_addr),
+ sizeof(struct in6_addr)),
+};
- if (entry->addr.proto == htons(ETH_P_IP)) {
- if (!ipv4_is_multicast(entry->addr.u.ip4))
+static const struct nla_policy
+br_mdbe_src_list_pol[MDBE_SRC_LIST_MAX + 1] = {
+ [MDBE_SRC_LIST_ENTRY] = NLA_POLICY_NESTED(br_mdbe_src_list_entry_pol),
+};
+
+static const struct nla_policy br_mdbe_attrs_pol[MDBE_ATTR_MAX + 1] = {
+ [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY,
+ sizeof(struct in_addr),
+ sizeof(struct in6_addr)),
+ [MDBE_ATTR_GROUP_MODE] = NLA_POLICY_RANGE(NLA_U8, MCAST_EXCLUDE,
+ MCAST_INCLUDE),
+ [MDBE_ATTR_SRC_LIST] = NLA_POLICY_NESTED(br_mdbe_src_list_pol),
+ [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC),
+};
+
+static bool is_valid_mdb_source(struct nlattr *attr, __be16 proto,
+ struct netlink_ext_ack *extack)
+{
+ switch (proto) {
+ case htons(ETH_P_IP):
+ if (nla_len(attr) != sizeof(struct in_addr)) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv4 invalid source address length");
return false;
- if (ipv4_is_local_multicast(entry->addr.u.ip4))
+ }
+ if (ipv4_is_multicast(nla_get_in_addr(attr))) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv4 multicast source address is not allowed");
return false;
+ }
+ break;
#if IS_ENABLED(CONFIG_IPV6)
- } else if (entry->addr.proto == htons(ETH_P_IPV6)) {
- if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6))
+ case htons(ETH_P_IPV6): {
+ struct in6_addr src;
+
+ if (nla_len(attr) != sizeof(struct in6_addr)) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv6 invalid source address length");
+ return false;
+ }
+ src = nla_get_in6_addr(attr);
+ if (ipv6_addr_is_multicast(&src)) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv6 multicast source address is not allowed");
return false;
+ }
+ break;
+ }
#endif
- } else
- return false;
- if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY)
- return false;
- if (entry->vid >= VLAN_VID_MASK)
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Invalid protocol used with source address");
return false;
+ }
return true;
}
-static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct net_device **pdev, struct br_mdb_entry **pentry)
+static struct net_bridge_mcast *
+__br_mdb_choose_context(struct net_bridge *br,
+ const struct br_mdb_entry *entry,
+ struct netlink_ext_ack *extack)
{
- struct net *net = sock_net(skb->sk);
- struct br_mdb_entry *entry;
- struct br_port_msg *bpm;
- struct nlattr *tb[MDBA_SET_ENTRY_MAX+1];
- struct net_device *dev;
- int err;
-
- err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, NULL,
- NULL);
- if (err < 0)
- return err;
+ struct net_bridge_mcast *brmctx = NULL;
+ struct net_bridge_vlan *v;
- bpm = nlmsg_data(nlh);
- if (bpm->ifindex == 0) {
- pr_info("PF_BRIDGE: br_mdb_parse() with invalid ifindex\n");
- return -EINVAL;
+ if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) {
+ brmctx = &br->multicast_ctx;
+ goto out;
}
- dev = __dev_get_by_index(net, bpm->ifindex);
- if (dev == NULL) {
- pr_info("PF_BRIDGE: br_mdb_parse() with unknown ifindex\n");
- return -ENODEV;
+ if (!entry->vid) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot add an entry without a vlan when vlan snooping is enabled");
+ goto out;
}
- if (!(dev->priv_flags & IFF_EBRIDGE)) {
- pr_info("PF_BRIDGE: br_mdb_parse() with non-bridge\n");
- return -EOPNOTSUPP;
+ v = br_vlan_find(br_vlan_group(br), entry->vid);
+ if (!v) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan is not configured");
+ goto out;
+ }
+ if (br_multicast_ctx_vlan_global_disabled(&v->br_mcast_ctx)) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan's multicast processing is disabled");
+ goto out;
}
+ brmctx = &v->br_mcast_ctx;
+out:
+ return brmctx;
+}
- *pdev = dev;
+static int br_mdb_replace_group_sg(const struct br_mdb_config *cfg,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_mcast *brmctx,
+ unsigned char flags)
+{
+ unsigned long now = jiffies;
- if (!tb[MDBA_SET_ENTRY] ||
- nla_len(tb[MDBA_SET_ENTRY]) != sizeof(struct br_mdb_entry)) {
- pr_info("PF_BRIDGE: br_mdb_parse() with invalid attr\n");
- return -EINVAL;
+ pg->flags = flags;
+ pg->rt_protocol = cfg->rt_protocol;
+ if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry)
+ mod_timer(&pg->timer,
+ now + brmctx->multicast_membership_interval);
+ else
+ timer_delete(&pg->timer);
+
+ br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB);
+
+ return 0;
+}
+
+static int br_mdb_add_group_sg(const struct br_mdb_config *cfg,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_mcast *brmctx,
+ unsigned char flags,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port_group *p;
+ unsigned long now = jiffies;
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, cfg->br)) != NULL;
+ pp = &p->next) {
+ if (p->key.port == cfg->p) {
+ if (!(cfg->nlflags & NLM_F_REPLACE)) {
+ NL_SET_ERR_MSG_MOD(extack, "(S, G) group is already joined by port");
+ return -EEXIST;
+ }
+ return br_mdb_replace_group_sg(cfg, mp, p, brmctx,
+ flags);
+ }
+ if ((unsigned long)p->key.port < (unsigned long)cfg->p)
+ break;
}
- entry = nla_data(tb[MDBA_SET_ENTRY]);
- if (!is_valid_mdb_entry(entry)) {
- pr_info("PF_BRIDGE: br_mdb_parse() with invalid entry\n");
- return -EINVAL;
+ p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL,
+ MCAST_INCLUDE, cfg->rt_protocol, extack);
+ if (unlikely(!p))
+ return -ENOMEM;
+
+ rcu_assign_pointer(*pp, p);
+ if (!(flags & MDB_PG_FLAGS_PERMANENT) && !cfg->src_entry)
+ mod_timer(&p->timer,
+ now + brmctx->multicast_membership_interval);
+ br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB);
+
+ /* All of (*, G) EXCLUDE ports need to be added to the new (S, G) for
+ * proper replication.
+ */
+ if (br_multicast_should_handle_mode(brmctx, cfg->group.proto)) {
+ struct net_bridge_mdb_entry *star_mp;
+ struct br_ip star_group;
+
+ star_group = p->key.addr;
+ memset(&star_group.src, 0, sizeof(star_group.src));
+ star_mp = br_mdb_ip_get(cfg->br, &star_group);
+ if (star_mp)
+ br_multicast_sg_add_exclude_ports(star_mp, p);
}
- *pentry = entry;
return 0;
}
-static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
- struct br_ip *group, unsigned char state)
+static int br_mdb_add_group_src_fwd(const struct br_mdb_config *cfg,
+ struct br_ip *src_ip,
+ struct net_bridge_mcast *brmctx,
+ struct netlink_ext_ack *extack)
{
- struct net_bridge_mdb_entry *mp;
- struct net_bridge_port_group *p;
- struct net_bridge_port_group __rcu **pp;
- struct net_bridge_mdb_htable *mdb;
+ struct net_bridge_mdb_entry *sgmp;
+ struct br_mdb_config sg_cfg;
+ struct br_ip sg_ip;
+ u8 flags = 0;
+
+ sg_ip = cfg->group;
+ sg_ip.src = src_ip->src;
+ sgmp = br_multicast_new_group(cfg->br, &sg_ip);
+ if (IS_ERR(sgmp)) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to add (S, G) MDB entry");
+ return PTR_ERR(sgmp);
+ }
+
+ if (cfg->entry->state == MDB_PERMANENT)
+ flags |= MDB_PG_FLAGS_PERMANENT;
+ if (cfg->filter_mode == MCAST_EXCLUDE)
+ flags |= MDB_PG_FLAGS_BLOCKED;
+
+ memset(&sg_cfg, 0, sizeof(sg_cfg));
+ sg_cfg.br = cfg->br;
+ sg_cfg.p = cfg->p;
+ sg_cfg.entry = cfg->entry;
+ sg_cfg.group = sg_ip;
+ sg_cfg.src_entry = true;
+ sg_cfg.filter_mode = MCAST_INCLUDE;
+ sg_cfg.rt_protocol = cfg->rt_protocol;
+ sg_cfg.nlflags = cfg->nlflags;
+ return br_mdb_add_group_sg(&sg_cfg, sgmp, brmctx, flags, extack);
+}
+
+static int br_mdb_add_group_src(const struct br_mdb_config *cfg,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_mcast *brmctx,
+ struct br_mdb_src_entry *src,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_group_src *ent;
unsigned long now = jiffies;
int err;
- mdb = mlock_dereference(br->mdb, br);
- mp = br_mdb_ip_get(mdb, group);
- if (!mp) {
- mp = br_multicast_new_group(br, port, group);
- err = PTR_ERR_OR_ZERO(mp);
+ ent = br_multicast_find_group_src(pg, &src->addr);
+ if (!ent) {
+ ent = br_multicast_new_group_src(pg, &src->addr);
+ if (!ent) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to add new source entry");
+ return -ENOSPC;
+ }
+ } else if (!(cfg->nlflags & NLM_F_REPLACE)) {
+ NL_SET_ERR_MSG_MOD(extack, "Source entry already exists");
+ return -EEXIST;
+ }
+
+ if (cfg->filter_mode == MCAST_INCLUDE &&
+ cfg->entry->state == MDB_TEMPORARY)
+ mod_timer(&ent->timer, now + br_multicast_gmi(brmctx));
+ else
+ timer_delete(&ent->timer);
+
+ /* Install a (S, G) forwarding entry for the source. */
+ err = br_mdb_add_group_src_fwd(cfg, &src->addr, brmctx, extack);
+ if (err)
+ goto err_del_sg;
+
+ ent->flags = BR_SGRP_F_INSTALLED | BR_SGRP_F_USER_ADDED;
+
+ return 0;
+
+err_del_sg:
+ __br_multicast_del_group_src(ent);
+ return err;
+}
+
+static void br_mdb_del_group_src(struct net_bridge_port_group *pg,
+ struct br_mdb_src_entry *src)
+{
+ struct net_bridge_group_src *ent;
+
+ ent = br_multicast_find_group_src(pg, &src->addr);
+ if (WARN_ON_ONCE(!ent))
+ return;
+ br_multicast_del_group_src(ent, false);
+}
+
+static int br_mdb_add_group_srcs(const struct br_mdb_config *cfg,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_mcast *brmctx,
+ struct netlink_ext_ack *extack)
+{
+ int i, err;
+
+ for (i = 0; i < cfg->num_src_entries; i++) {
+ err = br_mdb_add_group_src(cfg, pg, brmctx,
+ &cfg->src_entries[i], extack);
if (err)
- return err;
+ goto err_del_group_srcs;
}
+ return 0;
+
+err_del_group_srcs:
+ for (i--; i >= 0; i--)
+ br_mdb_del_group_src(pg, &cfg->src_entries[i]);
+ return err;
+}
+
+static int br_mdb_replace_group_srcs(const struct br_mdb_config *cfg,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_mcast *brmctx,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_group_src *ent;
+ struct hlist_node *tmp;
+ int err;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags |= BR_SGRP_F_DELETE;
+
+ err = br_mdb_add_group_srcs(cfg, pg, brmctx, extack);
+ if (err)
+ goto err_clear_delete;
+
+ hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node) {
+ if (ent->flags & BR_SGRP_F_DELETE)
+ br_multicast_del_group_src(ent, false);
+ }
+
+ return 0;
+
+err_clear_delete:
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags &= ~BR_SGRP_F_DELETE;
+ return err;
+}
+
+static int br_mdb_replace_group_star_g(const struct br_mdb_config *cfg,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_mcast *brmctx,
+ unsigned char flags,
+ struct netlink_ext_ack *extack)
+{
+ unsigned long now = jiffies;
+ int err;
+
+ err = br_mdb_replace_group_srcs(cfg, pg, brmctx, extack);
+ if (err)
+ return err;
+
+ pg->flags = flags;
+ pg->filter_mode = cfg->filter_mode;
+ pg->rt_protocol = cfg->rt_protocol;
+ if (!(flags & MDB_PG_FLAGS_PERMANENT) &&
+ cfg->filter_mode == MCAST_EXCLUDE)
+ mod_timer(&pg->timer,
+ now + brmctx->multicast_membership_interval);
+ else
+ timer_delete(&pg->timer);
+
+ br_mdb_notify(cfg->br->dev, mp, pg, RTM_NEWMDB);
+
+ if (br_multicast_should_handle_mode(brmctx, cfg->group.proto))
+ br_multicast_star_g_handle_mode(pg, cfg->filter_mode);
+
+ return 0;
+}
+
+static int br_mdb_add_group_star_g(const struct br_mdb_config *cfg,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_mcast *brmctx,
+ unsigned char flags,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port_group *p;
+ unsigned long now = jiffies;
+ int err;
+
for (pp = &mp->ports;
- (p = mlock_dereference(*pp, br)) != NULL;
+ (p = mlock_dereference(*pp, cfg->br)) != NULL;
pp = &p->next) {
- if (p->port == port)
- return -EEXIST;
- if ((unsigned long)p->port < (unsigned long)port)
+ if (p->key.port == cfg->p) {
+ if (!(cfg->nlflags & NLM_F_REPLACE)) {
+ NL_SET_ERR_MSG_MOD(extack, "(*, G) group is already joined by port");
+ return -EEXIST;
+ }
+ return br_mdb_replace_group_star_g(cfg, mp, p, brmctx,
+ flags, extack);
+ }
+ if ((unsigned long)p->key.port < (unsigned long)cfg->p)
break;
}
- p = br_multicast_new_port_group(port, group, *pp, state, NULL);
+ p = br_multicast_new_port_group(cfg->p, &cfg->group, *pp, flags, NULL,
+ cfg->filter_mode, cfg->rt_protocol,
+ extack);
if (unlikely(!p))
return -ENOMEM;
+
+ err = br_mdb_add_group_srcs(cfg, p, brmctx, extack);
+ if (err)
+ goto err_del_port_group;
+
rcu_assign_pointer(*pp, p);
- if (state == MDB_TEMPORARY)
- mod_timer(&p->timer, now + br->multicast_membership_interval);
+ if (!(flags & MDB_PG_FLAGS_PERMANENT) &&
+ cfg->filter_mode == MCAST_EXCLUDE)
+ mod_timer(&p->timer,
+ now + brmctx->multicast_membership_interval);
+ br_mdb_notify(cfg->br->dev, mp, p, RTM_NEWMDB);
+ /* If we are adding a new EXCLUDE port group (*, G), it needs to be
+ * also added to all (S, G) entries for proper replication.
+ */
+ if (br_multicast_should_handle_mode(brmctx, cfg->group.proto) &&
+ cfg->filter_mode == MCAST_EXCLUDE)
+ br_multicast_star_g_handle_mode(p, MCAST_EXCLUDE);
return 0;
+
+err_del_port_group:
+ br_multicast_del_port_group(p);
+ return err;
}
-static int __br_mdb_add(struct net *net, struct net_bridge *br,
- struct br_mdb_entry *entry)
+static int br_mdb_add_group(const struct br_mdb_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct br_mdb_entry *entry = cfg->entry;
+ struct net_bridge_port *port = cfg->p;
+ struct net_bridge_mdb_entry *mp;
+ struct net_bridge *br = cfg->br;
+ struct net_bridge_mcast *brmctx;
+ struct br_ip group = cfg->group;
+ unsigned char flags = 0;
+
+ brmctx = __br_mdb_choose_context(br, entry, extack);
+ if (!brmctx)
+ return -EINVAL;
+
+ mp = br_multicast_new_group(br, &group);
+ if (IS_ERR(mp))
+ return PTR_ERR(mp);
+
+ /* host join */
+ if (!port) {
+ if (mp->host_joined && !(cfg->nlflags & NLM_F_REPLACE)) {
+ NL_SET_ERR_MSG_MOD(extack, "Group is already joined by host");
+ return -EEXIST;
+ }
+
+ br_multicast_host_join(brmctx, mp, false);
+ br_mdb_notify(br->dev, mp, NULL, RTM_NEWMDB);
+
+ return 0;
+ }
+
+ if (entry->state == MDB_PERMANENT)
+ flags |= MDB_PG_FLAGS_PERMANENT;
+
+ if (br_multicast_is_star_g(&group))
+ return br_mdb_add_group_star_g(cfg, mp, brmctx, flags, extack);
+ else
+ return br_mdb_add_group_sg(cfg, mp, brmctx, flags, extack);
+}
+
+static int __br_mdb_add(const struct br_mdb_config *cfg,
+ struct netlink_ext_ack *extack)
{
- struct br_ip ip;
- struct net_device *dev;
- struct net_bridge_port *p;
int ret;
- if (!netif_running(br->dev) || br->multicast_disabled)
+ spin_lock_bh(&cfg->br->multicast_lock);
+ ret = br_mdb_add_group(cfg, extack);
+ spin_unlock_bh(&cfg->br->multicast_lock);
+
+ return ret;
+}
+
+static int br_mdb_config_src_entry_init(struct nlattr *src_entry,
+ struct br_mdb_src_entry *src,
+ __be16 proto,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[MDBE_SRCATTR_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, MDBE_SRCATTR_MAX, src_entry,
+ br_mdbe_src_list_entry_pol, extack);
+ if (err)
+ return err;
+
+ if (NL_REQ_ATTR_CHECK(extack, src_entry, tb, MDBE_SRCATTR_ADDRESS))
return -EINVAL;
- dev = __dev_get_by_index(net, entry->ifindex);
- if (!dev)
- return -ENODEV;
+ if (!is_valid_mdb_source(tb[MDBE_SRCATTR_ADDRESS], proto, extack))
+ return -EINVAL;
+
+ src->addr.proto = proto;
+ nla_memcpy(&src->addr.src, tb[MDBE_SRCATTR_ADDRESS],
+ nla_len(tb[MDBE_SRCATTR_ADDRESS]));
+
+ return 0;
+}
- p = br_port_get_rtnl(dev);
- if (!p || p->br != br || p->state == BR_STATE_DISABLED)
+static int br_mdb_config_src_list_init(struct nlattr *src_list,
+ struct br_mdb_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *src_entry;
+ int rem, err;
+ int i = 0;
+
+ nla_for_each_nested(src_entry, src_list, rem)
+ cfg->num_src_entries++;
+
+ if (cfg->num_src_entries >= PG_SRC_ENT_LIMIT) {
+ NL_SET_ERR_MSG_FMT_MOD(extack, "Exceeded maximum number of source entries (%u)",
+ PG_SRC_ENT_LIMIT - 1);
return -EINVAL;
+ }
- __mdb_entry_to_br_ip(entry, &ip);
+ cfg->src_entries = kcalloc(cfg->num_src_entries,
+ sizeof(struct br_mdb_src_entry), GFP_KERNEL);
+ if (!cfg->src_entries)
+ return -ENOMEM;
- spin_lock_bh(&br->multicast_lock);
- ret = br_mdb_add_group(br, p, &ip, entry->state);
- spin_unlock_bh(&br->multicast_lock);
- return ret;
+ nla_for_each_nested(src_entry, src_list, rem) {
+ err = br_mdb_config_src_entry_init(src_entry,
+ &cfg->src_entries[i],
+ cfg->entry->addr.proto,
+ extack);
+ if (err)
+ goto err_src_entry_init;
+ i++;
+ }
+
+ return 0;
+
+err_src_entry_init:
+ kfree(cfg->src_entries);
+ return err;
+}
+
+static void br_mdb_config_src_list_fini(struct br_mdb_config *cfg)
+{
+ kfree(cfg->src_entries);
+}
+
+static int br_mdb_config_attrs_init(struct nlattr *set_attrs,
+ struct br_mdb_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *mdb_attrs[MDBE_ATTR_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(mdb_attrs, MDBE_ATTR_MAX, set_attrs,
+ br_mdbe_attrs_pol, extack);
+ if (err)
+ return err;
+
+ if (mdb_attrs[MDBE_ATTR_SOURCE] &&
+ !is_valid_mdb_source(mdb_attrs[MDBE_ATTR_SOURCE],
+ cfg->entry->addr.proto, extack))
+ return -EINVAL;
+
+ __mdb_entry_to_br_ip(cfg->entry, &cfg->group, mdb_attrs);
+
+ if (mdb_attrs[MDBE_ATTR_GROUP_MODE]) {
+ if (!cfg->p) {
+ NL_SET_ERR_MSG_MOD(extack, "Filter mode cannot be set for host groups");
+ return -EINVAL;
+ }
+ if (!br_multicast_is_star_g(&cfg->group)) {
+ NL_SET_ERR_MSG_MOD(extack, "Filter mode can only be set for (*, G) entries");
+ return -EINVAL;
+ }
+ cfg->filter_mode = nla_get_u8(mdb_attrs[MDBE_ATTR_GROUP_MODE]);
+ } else {
+ cfg->filter_mode = MCAST_EXCLUDE;
+ }
+
+ if (mdb_attrs[MDBE_ATTR_SRC_LIST]) {
+ if (!cfg->p) {
+ NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set for host groups");
+ return -EINVAL;
+ }
+ if (!br_multicast_is_star_g(&cfg->group)) {
+ NL_SET_ERR_MSG_MOD(extack, "Source list can only be set for (*, G) entries");
+ return -EINVAL;
+ }
+ if (!mdb_attrs[MDBE_ATTR_GROUP_MODE]) {
+ NL_SET_ERR_MSG_MOD(extack, "Source list cannot be set without filter mode");
+ return -EINVAL;
+ }
+ err = br_mdb_config_src_list_init(mdb_attrs[MDBE_ATTR_SRC_LIST],
+ cfg, extack);
+ if (err)
+ return err;
+ }
+
+ if (!cfg->num_src_entries && cfg->filter_mode == MCAST_INCLUDE) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot add (*, G) INCLUDE with an empty source list");
+ return -EINVAL;
+ }
+
+ if (mdb_attrs[MDBE_ATTR_RTPROT]) {
+ if (!cfg->p) {
+ NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be set for host groups");
+ return -EINVAL;
+ }
+ cfg->rt_protocol = nla_get_u8(mdb_attrs[MDBE_ATTR_RTPROT]);
+ }
+
+ return 0;
+}
+
+static int br_mdb_config_init(struct br_mdb_config *cfg, struct net_device *dev,
+ struct nlattr *tb[], u16 nlmsg_flags,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = dev_net(dev);
+
+ memset(cfg, 0, sizeof(*cfg));
+ cfg->filter_mode = MCAST_EXCLUDE;
+ cfg->rt_protocol = RTPROT_STATIC;
+ cfg->nlflags = nlmsg_flags;
+
+ cfg->br = netdev_priv(dev);
+
+ if (!netif_running(cfg->br->dev)) {
+ NL_SET_ERR_MSG_MOD(extack, "Bridge device is not running");
+ return -EINVAL;
+ }
+
+ if (!br_opt_get(cfg->br, BROPT_MULTICAST_ENABLED)) {
+ NL_SET_ERR_MSG_MOD(extack, "Bridge's multicast processing is disabled");
+ return -EINVAL;
+ }
+
+ cfg->entry = nla_data(tb[MDBA_SET_ENTRY]);
+
+ if (cfg->entry->ifindex != cfg->br->dev->ifindex) {
+ struct net_device *pdev;
+
+ pdev = __dev_get_by_index(net, cfg->entry->ifindex);
+ if (!pdev) {
+ NL_SET_ERR_MSG_MOD(extack, "Port net device doesn't exist");
+ return -ENODEV;
+ }
+
+ cfg->p = br_port_get_rtnl(pdev);
+ if (!cfg->p) {
+ NL_SET_ERR_MSG_MOD(extack, "Net device is not a bridge port");
+ return -EINVAL;
+ }
+
+ if (cfg->p->br != cfg->br) {
+ NL_SET_ERR_MSG_MOD(extack, "Port belongs to a different bridge device");
+ return -EINVAL;
+ }
+ }
+
+ if (cfg->entry->addr.proto == htons(ETH_P_IP) &&
+ ipv4_is_zeronet(cfg->entry->addr.u.ip4)) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv4 entry group address 0.0.0.0 is not allowed");
+ return -EINVAL;
+ }
+
+ if (tb[MDBA_SET_ENTRY_ATTRS])
+ return br_mdb_config_attrs_init(tb[MDBA_SET_ENTRY_ATTRS], cfg,
+ extack);
+ else
+ __mdb_entry_to_br_ip(cfg->entry, &cfg->group, NULL);
+
+ return 0;
}
-static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
+static void br_mdb_config_fini(struct br_mdb_config *cfg)
+{
+ br_mdb_config_src_list_fini(cfg);
+}
+
+int br_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags,
+ struct netlink_ext_ack *extack)
{
- struct net *net = sock_net(skb->sk);
struct net_bridge_vlan_group *vg;
- struct net_device *dev, *pdev;
- struct br_mdb_entry *entry;
- struct net_bridge_port *p;
struct net_bridge_vlan *v;
- struct net_bridge *br;
+ struct br_mdb_config cfg;
int err;
- err = br_mdb_parse(skb, nlh, &dev, &entry);
- if (err < 0)
+ err = br_mdb_config_init(&cfg, dev, tb, nlmsg_flags, extack);
+ if (err)
return err;
- br = netdev_priv(dev);
+ err = -EINVAL;
+ /* host join errors which can happen before creating the group */
+ if (!cfg.p && !br_group_is_l2(&cfg.group)) {
+ /* don't allow any flags for host-joined IP groups */
+ if (cfg.entry->state) {
+ NL_SET_ERR_MSG_MOD(extack, "Flags are not allowed for host groups");
+ goto out;
+ }
+ if (!br_multicast_is_star_g(&cfg.group)) {
+ NL_SET_ERR_MSG_MOD(extack, "Groups with sources cannot be manually host joined");
+ goto out;
+ }
+ }
+
+ if (br_group_is_l2(&cfg.group) && cfg.entry->state != MDB_PERMANENT) {
+ NL_SET_ERR_MSG_MOD(extack, "Only permanent L2 entries allowed");
+ goto out;
+ }
+
+ if (cfg.p) {
+ if (cfg.p->state == BR_STATE_DISABLED && cfg.entry->state != MDB_PERMANENT) {
+ NL_SET_ERR_MSG_MOD(extack, "Port is in disabled state and entry is not permanent");
+ goto out;
+ }
+ vg = nbp_vlan_group(cfg.p);
+ } else {
+ vg = br_vlan_group(cfg.br);
+ }
/* If vlan filtering is enabled and VLAN is not specified
* install mdb entry on all vlans configured on the port.
*/
- pdev = __dev_get_by_index(net, entry->ifindex);
- if (!pdev)
- return -ENODEV;
-
- p = br_port_get_rtnl(pdev);
- if (!p || p->br != br || p->state == BR_STATE_DISABLED)
- return -EINVAL;
-
- vg = nbp_vlan_group(p);
- if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) {
+ if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) {
list_for_each_entry(v, &vg->vlan_list, vlist) {
- entry->vid = v->vid;
- err = __br_mdb_add(net, br, entry);
+ cfg.entry->vid = v->vid;
+ cfg.group.vid = v->vid;
+ err = __br_mdb_add(&cfg, extack);
if (err)
break;
- __br_mdb_notify(dev, p, entry, RTM_NEWMDB);
}
} else {
- err = __br_mdb_add(net, br, entry);
- if (!err)
- __br_mdb_notify(dev, p, entry, RTM_NEWMDB);
+ err = __br_mdb_add(&cfg, extack);
}
+out:
+ br_mdb_config_fini(&cfg);
return err;
}
-static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
+static int __br_mdb_del(const struct br_mdb_config *cfg)
{
- struct net_bridge_mdb_htable *mdb;
+ struct br_mdb_entry *entry = cfg->entry;
+ struct net_bridge *br = cfg->br;
struct net_bridge_mdb_entry *mp;
struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
- struct br_ip ip;
+ struct br_ip ip = cfg->group;
int err = -EINVAL;
- if (!netif_running(br->dev) || br->multicast_disabled)
- return -EINVAL;
-
- __mdb_entry_to_br_ip(entry, &ip);
-
spin_lock_bh(&br->multicast_lock);
- mdb = mlock_dereference(br->mdb, br);
-
- mp = br_mdb_ip_get(mdb, &ip);
+ mp = br_mdb_ip_get(br, &ip);
if (!mp)
goto unlock;
+ /* host leave */
+ if (entry->ifindex == mp->br->dev->ifindex && mp->host_joined) {
+ br_multicast_host_leave(mp, false);
+ err = 0;
+ br_mdb_notify(br->dev, mp, NULL, RTM_DELMDB);
+ if (!mp->ports && netif_running(br->dev))
+ mod_timer(&mp->timer, jiffies);
+ goto unlock;
+ }
+
for (pp = &mp->ports;
(p = mlock_dereference(*pp, br)) != NULL;
pp = &p->next) {
- if (!p->port || p->port->dev->ifindex != entry->ifindex)
+ if (!p->key.port || p->key.port->dev->ifindex != entry->ifindex)
continue;
- if (p->port->state == BR_STATE_DISABLED)
- goto unlock;
-
- __mdb_entry_fill_flags(entry, p->flags);
- rcu_assign_pointer(*pp, p->next);
- hlist_del_init(&p->mglist);
- del_timer(&p->timer);
- call_rcu_bh(&p->rcu, br_multicast_free_pg);
+ br_multicast_del_pg(mp, p, pp);
err = 0;
-
- if (!mp->ports && !mp->host_joined &&
- netif_running(br->dev))
- mod_timer(&mp->timer, jiffies);
break;
}
@@ -712,62 +1396,327 @@ unlock:
return err;
}
-static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
+int br_mdb_del(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
{
- struct net *net = sock_net(skb->sk);
struct net_bridge_vlan_group *vg;
- struct net_device *dev, *pdev;
- struct br_mdb_entry *entry;
- struct net_bridge_port *p;
struct net_bridge_vlan *v;
- struct net_bridge *br;
+ struct br_mdb_config cfg;
int err;
- err = br_mdb_parse(skb, nlh, &dev, &entry);
- if (err < 0)
+ err = br_mdb_config_init(&cfg, dev, tb, 0, extack);
+ if (err)
return err;
- br = netdev_priv(dev);
+ if (cfg.p)
+ vg = nbp_vlan_group(cfg.p);
+ else
+ vg = br_vlan_group(cfg.br);
/* If vlan filtering is enabled and VLAN is not specified
* delete mdb entry on all vlans configured on the port.
*/
- pdev = __dev_get_by_index(net, entry->ifindex);
- if (!pdev)
- return -ENODEV;
-
- p = br_port_get_rtnl(pdev);
- if (!p || p->br != br || p->state == BR_STATE_DISABLED)
- return -EINVAL;
-
- vg = nbp_vlan_group(p);
- if (br_vlan_enabled(br->dev) && vg && entry->vid == 0) {
+ if (br_vlan_enabled(cfg.br->dev) && vg && cfg.entry->vid == 0) {
list_for_each_entry(v, &vg->vlan_list, vlist) {
- entry->vid = v->vid;
- err = __br_mdb_del(br, entry);
- if (!err)
- __br_mdb_notify(dev, p, entry, RTM_DELMDB);
+ cfg.entry->vid = v->vid;
+ cfg.group.vid = v->vid;
+ err = __br_mdb_del(&cfg);
}
} else {
- err = __br_mdb_del(br, entry);
- if (!err)
- __br_mdb_notify(dev, p, entry, RTM_DELMDB);
+ err = __br_mdb_del(&cfg);
}
+ br_mdb_config_fini(&cfg);
return err;
}
-void br_mdb_init(void)
+struct br_mdb_flush_desc {
+ u32 port_ifindex;
+ u16 vid;
+ u8 rt_protocol;
+ u8 state;
+ u8 state_mask;
+};
+
+static const struct nla_policy br_mdbe_attrs_del_bulk_pol[MDBE_ATTR_MAX + 1] = {
+ [MDBE_ATTR_RTPROT] = NLA_POLICY_MIN(NLA_U8, RTPROT_STATIC),
+ [MDBE_ATTR_STATE_MASK] = NLA_POLICY_MASK(NLA_U8, MDB_PERMANENT),
+};
+
+static int br_mdb_flush_desc_init(struct br_mdb_flush_desc *desc,
+ struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ struct br_mdb_entry *entry = nla_data(tb[MDBA_SET_ENTRY]);
+ struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1];
+ int err;
+
+ desc->port_ifindex = entry->ifindex;
+ desc->vid = entry->vid;
+ desc->state = entry->state;
+
+ if (!tb[MDBA_SET_ENTRY_ATTRS])
+ return 0;
+
+ err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX,
+ tb[MDBA_SET_ENTRY_ATTRS],
+ br_mdbe_attrs_del_bulk_pol, extack);
+ if (err)
+ return err;
+
+ if (mdbe_attrs[MDBE_ATTR_STATE_MASK])
+ desc->state_mask = nla_get_u8(mdbe_attrs[MDBE_ATTR_STATE_MASK]);
+
+ if (mdbe_attrs[MDBE_ATTR_RTPROT])
+ desc->rt_protocol = nla_get_u8(mdbe_attrs[MDBE_ATTR_RTPROT]);
+
+ return 0;
+}
+
+static void br_mdb_flush_host(struct net_bridge *br,
+ struct net_bridge_mdb_entry *mp,
+ const struct br_mdb_flush_desc *desc)
+{
+ u8 state;
+
+ if (desc->port_ifindex && desc->port_ifindex != br->dev->ifindex)
+ return;
+
+ if (desc->rt_protocol)
+ return;
+
+ state = br_group_is_l2(&mp->addr) ? MDB_PERMANENT : 0;
+ if (desc->state_mask && (state & desc->state_mask) != desc->state)
+ return;
+
+ br_multicast_host_leave(mp, true);
+ if (!mp->ports && netif_running(br->dev))
+ mod_timer(&mp->timer, jiffies);
+}
+
+static void br_mdb_flush_pgs(struct net_bridge *br,
+ struct net_bridge_mdb_entry *mp,
+ const struct br_mdb_flush_desc *desc)
+{
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port_group *p;
+
+ for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;) {
+ u8 state;
+
+ if (desc->port_ifindex &&
+ desc->port_ifindex != p->key.port->dev->ifindex) {
+ pp = &p->next;
+ continue;
+ }
+
+ if (desc->rt_protocol && desc->rt_protocol != p->rt_protocol) {
+ pp = &p->next;
+ continue;
+ }
+
+ state = p->flags & MDB_PG_FLAGS_PERMANENT ? MDB_PERMANENT : 0;
+ if (desc->state_mask &&
+ (state & desc->state_mask) != desc->state) {
+ pp = &p->next;
+ continue;
+ }
+
+ br_multicast_del_pg(mp, p, pp);
+ }
+}
+
+static void br_mdb_flush(struct net_bridge *br,
+ const struct br_mdb_flush_desc *desc)
+{
+ struct net_bridge_mdb_entry *mp;
+
+ spin_lock_bh(&br->multicast_lock);
+
+ /* Safe variant is not needed because entries are removed from the list
+ * upon group timer expiration or bridge deletion.
+ */
+ hlist_for_each_entry(mp, &br->mdb_list, mdb_node) {
+ if (desc->vid && desc->vid != mp->addr.vid)
+ continue;
+
+ br_mdb_flush_host(br, mp, desc);
+ br_mdb_flush_pgs(br, mp, desc);
+ }
+
+ spin_unlock_bh(&br->multicast_lock);
+}
+
+int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
{
- rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETMDB, NULL, br_mdb_dump, 0);
- rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWMDB, br_mdb_add, NULL, 0);
- rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELMDB, br_mdb_del, NULL, 0);
+ struct net_bridge *br = netdev_priv(dev);
+ struct br_mdb_flush_desc desc = {};
+ int err;
+
+ err = br_mdb_flush_desc_init(&desc, tb, extack);
+ if (err)
+ return err;
+
+ br_mdb_flush(br, &desc);
+
+ return 0;
}
-void br_mdb_uninit(void)
+static const struct nla_policy br_mdbe_attrs_get_pol[MDBE_ATTR_MAX + 1] = {
+ [MDBE_ATTR_SOURCE] = NLA_POLICY_RANGE(NLA_BINARY,
+ sizeof(struct in_addr),
+ sizeof(struct in6_addr)),
+};
+
+static int br_mdb_get_parse(struct net_device *dev, struct nlattr *tb[],
+ struct br_ip *group, struct netlink_ext_ack *extack)
{
- rtnl_unregister(PF_BRIDGE, RTM_GETMDB);
- rtnl_unregister(PF_BRIDGE, RTM_NEWMDB);
- rtnl_unregister(PF_BRIDGE, RTM_DELMDB);
+ struct br_mdb_entry *entry = nla_data(tb[MDBA_GET_ENTRY]);
+ struct nlattr *mdbe_attrs[MDBE_ATTR_MAX + 1];
+ int err;
+
+ if (!tb[MDBA_GET_ENTRY_ATTRS]) {
+ __mdb_entry_to_br_ip(entry, group, NULL);
+ return 0;
+ }
+
+ err = nla_parse_nested(mdbe_attrs, MDBE_ATTR_MAX,
+ tb[MDBA_GET_ENTRY_ATTRS], br_mdbe_attrs_get_pol,
+ extack);
+ if (err)
+ return err;
+
+ if (mdbe_attrs[MDBE_ATTR_SOURCE] &&
+ !is_valid_mdb_source(mdbe_attrs[MDBE_ATTR_SOURCE],
+ entry->addr.proto, extack))
+ return -EINVAL;
+
+ __mdb_entry_to_br_ip(entry, group, mdbe_attrs);
+
+ return 0;
+}
+
+static struct sk_buff *
+br_mdb_get_reply_alloc(const struct net_bridge_mdb_entry *mp)
+{
+ struct net_bridge_port_group *pg;
+ size_t nlmsg_size;
+
+ nlmsg_size = NLMSG_ALIGN(sizeof(struct br_port_msg)) +
+ /* MDBA_MDB */
+ nla_total_size(0) +
+ /* MDBA_MDB_ENTRY */
+ nla_total_size(0);
+
+ if (mp->host_joined)
+ nlmsg_size += rtnl_mdb_nlmsg_pg_size(NULL);
+
+ for (pg = mlock_dereference(mp->ports, mp->br); pg;
+ pg = mlock_dereference(pg->next, mp->br))
+ nlmsg_size += rtnl_mdb_nlmsg_pg_size(pg);
+
+ return nlmsg_new(nlmsg_size, GFP_ATOMIC);
+}
+
+static int br_mdb_get_reply_fill(struct sk_buff *skb,
+ struct net_bridge_mdb_entry *mp, u32 portid,
+ u32 seq)
+{
+ struct nlattr *mdb_nest, *mdb_entry_nest;
+ struct net_bridge_port_group *pg;
+ struct br_port_msg *bpm;
+ struct nlmsghdr *nlh;
+ int err;
+
+ nlh = nlmsg_put(skb, portid, seq, RTM_NEWMDB, sizeof(*bpm), 0);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ bpm = nlmsg_data(nlh);
+ memset(bpm, 0, sizeof(*bpm));
+ bpm->family = AF_BRIDGE;
+ bpm->ifindex = mp->br->dev->ifindex;
+ mdb_nest = nla_nest_start_noflag(skb, MDBA_MDB);
+ if (!mdb_nest) {
+ err = -EMSGSIZE;
+ goto cancel;
+ }
+ mdb_entry_nest = nla_nest_start_noflag(skb, MDBA_MDB_ENTRY);
+ if (!mdb_entry_nest) {
+ err = -EMSGSIZE;
+ goto cancel;
+ }
+
+ if (mp->host_joined) {
+ err = __mdb_fill_info(skb, mp, NULL);
+ if (err)
+ goto cancel;
+ }
+
+ for (pg = mlock_dereference(mp->ports, mp->br); pg;
+ pg = mlock_dereference(pg->next, mp->br)) {
+ err = __mdb_fill_info(skb, mp, pg);
+ if (err)
+ goto cancel;
+ }
+
+ nla_nest_end(skb, mdb_entry_nest);
+ nla_nest_end(skb, mdb_nest);
+ nlmsg_end(skb, nlh);
+
+ return 0;
+
+cancel:
+ nlmsg_cancel(skb, nlh);
+ return err;
+}
+
+int br_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge *br = netdev_priv(dev);
+ struct net_bridge_mdb_entry *mp;
+ struct sk_buff *skb;
+ struct br_ip group;
+ int err;
+
+ err = br_mdb_get_parse(dev, tb, &group, extack);
+ if (err)
+ return err;
+
+ /* Hold the multicast lock to ensure that the MDB entry does not change
+ * between the time the reply size is determined and when the reply is
+ * filled in.
+ */
+ spin_lock_bh(&br->multicast_lock);
+
+ mp = br_mdb_ip_get(br, &group);
+ if (!mp || (!mp->ports && !mp->host_joined)) {
+ NL_SET_ERR_MSG_MOD(extack, "MDB entry not found");
+ err = -ENOENT;
+ goto unlock;
+ }
+
+ skb = br_mdb_get_reply_alloc(mp);
+ if (!skb) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ err = br_mdb_get_reply_fill(skb, mp, portid, seq);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to fill MDB get reply");
+ goto free;
+ }
+
+ spin_unlock_bh(&br->multicast_lock);
+
+ return rtnl_unicast(skb, dev_net(dev), portid);
+
+free:
+ kfree_skb(skb);
+unlock:
+ spin_unlock_bh(&br->multicast_lock);
+ return err;
}
diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c
new file mode 100644
index 000000000000..3c36fa24bc05
--- /dev/null
+++ b/net/bridge/br_mrp.c
@@ -0,0 +1,1260 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/mrp_bridge.h>
+#include "br_private_mrp.h"
+
+static const u8 mrp_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x1 };
+static const u8 mrp_in_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x3 };
+
+static int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb);
+
+static struct br_frame_type mrp_frame_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_MRP),
+ .frame_handler = br_mrp_process,
+};
+
+static bool br_mrp_is_ring_port(struct net_bridge_port *p_port,
+ struct net_bridge_port *s_port,
+ struct net_bridge_port *port)
+{
+ if (port == p_port ||
+ port == s_port)
+ return true;
+
+ return false;
+}
+
+static bool br_mrp_is_in_port(struct net_bridge_port *i_port,
+ struct net_bridge_port *port)
+{
+ if (port == i_port)
+ return true;
+
+ return false;
+}
+
+static struct net_bridge_port *br_mrp_get_port(struct net_bridge *br,
+ u32 ifindex)
+{
+ struct net_bridge_port *res = NULL;
+ struct net_bridge_port *port;
+
+ list_for_each_entry(port, &br->port_list, list) {
+ if (port->dev->ifindex == ifindex) {
+ res = port;
+ break;
+ }
+ }
+
+ return res;
+}
+
+static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id)
+{
+ struct br_mrp *res = NULL;
+ struct br_mrp *mrp;
+
+ hlist_for_each_entry_rcu(mrp, &br->mrp_list, list,
+ lockdep_rtnl_is_held()) {
+ if (mrp->ring_id == ring_id) {
+ res = mrp;
+ break;
+ }
+ }
+
+ return res;
+}
+
+static struct br_mrp *br_mrp_find_in_id(struct net_bridge *br, u32 in_id)
+{
+ struct br_mrp *res = NULL;
+ struct br_mrp *mrp;
+
+ hlist_for_each_entry_rcu(mrp, &br->mrp_list, list,
+ lockdep_rtnl_is_held()) {
+ if (mrp->in_id == in_id) {
+ res = mrp;
+ break;
+ }
+ }
+
+ return res;
+}
+
+static bool br_mrp_unique_ifindex(struct net_bridge *br, u32 ifindex)
+{
+ struct br_mrp *mrp;
+
+ hlist_for_each_entry_rcu(mrp, &br->mrp_list, list,
+ lockdep_rtnl_is_held()) {
+ struct net_bridge_port *p;
+
+ p = rtnl_dereference(mrp->p_port);
+ if (p && p->dev->ifindex == ifindex)
+ return false;
+
+ p = rtnl_dereference(mrp->s_port);
+ if (p && p->dev->ifindex == ifindex)
+ return false;
+
+ p = rtnl_dereference(mrp->i_port);
+ if (p && p->dev->ifindex == ifindex)
+ return false;
+ }
+
+ return true;
+}
+
+static struct br_mrp *br_mrp_find_port(struct net_bridge *br,
+ struct net_bridge_port *p)
+{
+ struct br_mrp *res = NULL;
+ struct br_mrp *mrp;
+
+ hlist_for_each_entry_rcu(mrp, &br->mrp_list, list,
+ lockdep_rtnl_is_held()) {
+ if (rcu_access_pointer(mrp->p_port) == p ||
+ rcu_access_pointer(mrp->s_port) == p ||
+ rcu_access_pointer(mrp->i_port) == p) {
+ res = mrp;
+ break;
+ }
+ }
+
+ return res;
+}
+
+static int br_mrp_next_seq(struct br_mrp *mrp)
+{
+ mrp->seq_id++;
+ return mrp->seq_id;
+}
+
+static struct sk_buff *br_mrp_skb_alloc(struct net_bridge_port *p,
+ const u8 *src, const u8 *dst)
+{
+ struct ethhdr *eth_hdr;
+ struct sk_buff *skb;
+ __be16 *version;
+
+ skb = dev_alloc_skb(MRP_MAX_FRAME_LENGTH);
+ if (!skb)
+ return NULL;
+
+ skb->dev = p->dev;
+ skb->protocol = htons(ETH_P_MRP);
+ skb->priority = MRP_FRAME_PRIO;
+ skb_reserve(skb, sizeof(*eth_hdr));
+
+ eth_hdr = skb_push(skb, sizeof(*eth_hdr));
+ ether_addr_copy(eth_hdr->h_dest, dst);
+ ether_addr_copy(eth_hdr->h_source, src);
+ eth_hdr->h_proto = htons(ETH_P_MRP);
+
+ version = skb_put(skb, sizeof(*version));
+ *version = cpu_to_be16(MRP_VERSION);
+
+ return skb;
+}
+
+static void br_mrp_skb_tlv(struct sk_buff *skb,
+ enum br_mrp_tlv_header_type type,
+ u8 length)
+{
+ struct br_mrp_tlv_hdr *hdr;
+
+ hdr = skb_put(skb, sizeof(*hdr));
+ hdr->type = type;
+ hdr->length = length;
+}
+
+static void br_mrp_skb_common(struct sk_buff *skb, struct br_mrp *mrp)
+{
+ struct br_mrp_common_hdr *hdr;
+
+ br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_COMMON, sizeof(*hdr));
+
+ hdr = skb_put(skb, sizeof(*hdr));
+ hdr->seq_id = cpu_to_be16(br_mrp_next_seq(mrp));
+ memset(hdr->domain, 0xff, MRP_DOMAIN_UUID_LENGTH);
+}
+
+/* Build an MRP_Test frame for ring port @p that advertises @port_role.
+ * Returns the new skb, or NULL when @p is NULL or the allocation fails.
+ */
+static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp,
+					     struct net_bridge_port *p,
+					     enum br_mrp_port_role_type port_role)
+{
+	struct br_mrp_ring_test_hdr *hdr = NULL;
+	struct sk_buff *skb = NULL;
+
+	if (!p)
+		return NULL;
+
+	skb = br_mrp_skb_alloc(p, p->dev->dev_addr, mrp_test_dmac);
+	if (!skb)
+		return NULL;
+
+	br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_RING_TEST, sizeof(*hdr));
+	hdr = skb_put(skb, sizeof(*hdr));
+	hdr->prio = cpu_to_be16(mrp->prio);
+	ether_addr_copy(hdr->sa, p->br->dev->dev_addr);
+	hdr->port_role = cpu_to_be16(port_role);
+	hdr->state = cpu_to_be16(mrp->ring_state);
+	hdr->transitions = cpu_to_be16(mrp->ring_transitions);
+	hdr->timestamp = cpu_to_be32(jiffies_to_msecs(jiffies));
+
+	br_mrp_skb_common(skb, mrp);
+
+	/* A node acting as MRA appends an Option TLV carrying the AUTO_MGR
+	 * sub-option TLV; its length must equal the bytes put below.
+	 */
+	if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) {
+		struct br_mrp_sub_option1_hdr *sub_opt = NULL;
+		struct br_mrp_tlv_hdr *sub_tlv = NULL;
+		struct br_mrp_oui_hdr *oui = NULL;
+		u8 length;
+
+		length = sizeof(*sub_opt) + sizeof(*sub_tlv) + sizeof(*oui) +
+			MRP_OPT_PADDING;
+		br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_OPTION, length);
+
+		oui = skb_put(skb, sizeof(*oui));
+		memset(oui, 0x0, sizeof(*oui));
+		sub_opt = skb_put(skb, sizeof(*sub_opt));
+		memset(sub_opt, 0x0, sizeof(*sub_opt));
+		sub_tlv = skb_put(skb, sizeof(*sub_tlv));
+		memset(sub_tlv, 0x0, sizeof(*sub_tlv));
+		sub_tlv->type = BR_MRP_SUB_TLV_HEADER_TEST_AUTO_MGR;
+		/* 32 bit alignment shall be ensured, so add zeroed padding */
+		memset(skb_put(skb, MRP_OPT_PADDING), 0x0, MRP_OPT_PADDING);
+	}
+
+	br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_END, 0x0);
+
+	return skb;
+}
+
+static struct sk_buff *br_mrp_alloc_in_test_skb(struct br_mrp *mrp,
+ struct net_bridge_port *p,
+ enum br_mrp_port_role_type port_role)
+{
+ struct br_mrp_in_test_hdr *hdr = NULL;
+ struct sk_buff *skb = NULL;
+
+ if (!p)
+ return NULL;
+
+ skb = br_mrp_skb_alloc(p, p->dev->dev_addr, mrp_in_test_dmac);
+ if (!skb)
+ return NULL;
+
+ br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_IN_TEST, sizeof(*hdr));
+ hdr = skb_put(skb, sizeof(*hdr));
+
+ hdr->id = cpu_to_be16(mrp->in_id);
+ ether_addr_copy(hdr->sa, p->br->dev->dev_addr);
+ hdr->port_role = cpu_to_be16(port_role);
+ hdr->state = cpu_to_be16(mrp->in_state);
+ hdr->transitions = cpu_to_be16(mrp->in_transitions);
+ hdr->timestamp = cpu_to_be32(jiffies_to_msecs(jiffies));
+
+ br_mrp_skb_common(skb, mrp);
+ br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_END, 0x0);
+
+ return skb;
+}
+
+/* This function is continuously called in the following cases:
+ * - when node role is MRM, in this case test_monitor is always set to false
+ * because it needs to notify the userspace that the ring is open and needs to
+ * send MRP_Test frames
+ * - when node role is MRA, there are 2 subcases:
+ * - when MRA behaves as MRM, this case is similar to the MRM role
+ * - when MRA behaves as MRC, in this case test_monitor is set to true,
+ * because it needs to detect when it stops seeing MRP_Test frames
+ * from MRM node but it doesn't need to send MRP_Test frames.
+ */
+static void br_mrp_test_work_expired(struct work_struct *work)
+{
+ struct delayed_work *del_work = to_delayed_work(work);
+ struct br_mrp *mrp = container_of(del_work, struct br_mrp, test_work);
+ struct net_bridge_port *p;
+ bool notify_open = false;
+ struct sk_buff *skb;
+
+ if (time_before_eq(mrp->test_end, jiffies))
+ return;
+
+ if (mrp->test_count_miss < mrp->test_max_miss) {
+ mrp->test_count_miss++;
+ } else {
+ /* Notify that the ring is open only if the ring state is
+ * closed, otherwise it would continue to notify at every
+ * interval.
+ * Also notify that the ring is open when the node has the
+ * role MRA and behaves as MRC. The reason is that the
+ * userspace needs to know when the MRM stopped sending
+ * MRP_Test frames so that the current node can try to take
+ * the role of an MRM.
+ */
+ if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED ||
+ mrp->test_monitor)
+ notify_open = true;
+ }
+
+ rcu_read_lock();
+
+ p = rcu_dereference(mrp->p_port);
+ if (p) {
+ if (!mrp->test_monitor) {
+ skb = br_mrp_alloc_test_skb(mrp, p,
+ BR_MRP_PORT_ROLE_PRIMARY);
+ if (!skb)
+ goto out;
+
+ skb_reset_network_header(skb);
+ dev_queue_xmit(skb);
+ }
+
+ if (notify_open && !mrp->ring_role_offloaded)
+ br_mrp_ring_port_open(p->dev, true);
+ }
+
+ p = rcu_dereference(mrp->s_port);
+ if (p) {
+ if (!mrp->test_monitor) {
+ skb = br_mrp_alloc_test_skb(mrp, p,
+ BR_MRP_PORT_ROLE_SECONDARY);
+ if (!skb)
+ goto out;
+
+ skb_reset_network_header(skb);
+ dev_queue_xmit(skb);
+ }
+
+ if (notify_open && !mrp->ring_role_offloaded)
+ br_mrp_ring_port_open(p->dev, true);
+ }
+
+out:
+ rcu_read_unlock();
+
+ queue_delayed_work(system_percpu_wq, &mrp->test_work,
+ usecs_to_jiffies(mrp->test_interval));
+}
+
+/* This function is continuously called when the node has the interconnect role
+ * MIM. It generates interconnect test frames and sends them on all 3 ports.
+ * It also checks whether it has stopped receiving interconnect test frames.
+ */
+static void br_mrp_in_test_work_expired(struct work_struct *work)
+{
+ struct delayed_work *del_work = to_delayed_work(work);
+ struct br_mrp *mrp = container_of(del_work, struct br_mrp, in_test_work);
+ struct net_bridge_port *p;
+ bool notify_open = false;
+ struct sk_buff *skb;
+
+ if (time_before_eq(mrp->in_test_end, jiffies))
+ return;
+
+ if (mrp->in_test_count_miss < mrp->in_test_max_miss) {
+ mrp->in_test_count_miss++;
+ } else {
+ /* Notify that the interconnect ring is open only if the
+ * interconnect ring state is closed, otherwise it would
+ * continue to notify at every interval.
+ */
+ if (mrp->in_state == BR_MRP_IN_STATE_CLOSED)
+ notify_open = true;
+ }
+
+ rcu_read_lock();
+
+ p = rcu_dereference(mrp->p_port);
+ if (p) {
+ skb = br_mrp_alloc_in_test_skb(mrp, p,
+ BR_MRP_PORT_ROLE_PRIMARY);
+ if (!skb)
+ goto out;
+
+ skb_reset_network_header(skb);
+ dev_queue_xmit(skb);
+
+ if (notify_open && !mrp->in_role_offloaded)
+ br_mrp_in_port_open(p->dev, true);
+ }
+
+ p = rcu_dereference(mrp->s_port);
+ if (p) {
+ skb = br_mrp_alloc_in_test_skb(mrp, p,
+ BR_MRP_PORT_ROLE_SECONDARY);
+ if (!skb)
+ goto out;
+
+ skb_reset_network_header(skb);
+ dev_queue_xmit(skb);
+
+ if (notify_open && !mrp->in_role_offloaded)
+ br_mrp_in_port_open(p->dev, true);
+ }
+
+ p = rcu_dereference(mrp->i_port);
+ if (p) {
+ skb = br_mrp_alloc_in_test_skb(mrp, p,
+ BR_MRP_PORT_ROLE_INTER);
+ if (!skb)
+ goto out;
+
+ skb_reset_network_header(skb);
+ dev_queue_xmit(skb);
+
+ if (notify_open && !mrp->in_role_offloaded)
+ br_mrp_in_port_open(p->dev, true);
+ }
+
+out:
+ rcu_read_unlock();
+
+ queue_delayed_work(system_percpu_wq, &mrp->in_test_work,
+ usecs_to_jiffies(mrp->in_test_interval));
+}
+
+/* Deletes the MRP instance.
+ * note: called under rtnl_lock
+ */
+static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp)
+{
+ struct net_bridge_port *p;
+ u8 state;
+
+ /* Stop sending MRP_Test frames */
+ cancel_delayed_work_sync(&mrp->test_work);
+ br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0, 0);
+
+ /* Stop sending MRP_InTest frames if has an interconnect role */
+ cancel_delayed_work_sync(&mrp->in_test_work);
+ br_mrp_switchdev_send_in_test(br, mrp, 0, 0, 0);
+
+ /* Disable the roles */
+ br_mrp_switchdev_set_ring_role(br, mrp, BR_MRP_RING_ROLE_DISABLED);
+ p = rtnl_dereference(mrp->i_port);
+ if (p)
+ br_mrp_switchdev_set_in_role(br, mrp, mrp->in_id, mrp->ring_id,
+ BR_MRP_IN_ROLE_DISABLED);
+
+ br_mrp_switchdev_del(br, mrp);
+
+ /* Reset the ports */
+ p = rtnl_dereference(mrp->p_port);
+ if (p) {
+ spin_lock_bh(&br->lock);
+ state = netif_running(br->dev) ?
+ BR_STATE_FORWARDING : BR_STATE_DISABLED;
+ p->state = state;
+ p->flags &= ~BR_MRP_AWARE;
+ spin_unlock_bh(&br->lock);
+ br_mrp_port_switchdev_set_state(p, state);
+ rcu_assign_pointer(mrp->p_port, NULL);
+ }
+
+ p = rtnl_dereference(mrp->s_port);
+ if (p) {
+ spin_lock_bh(&br->lock);
+ state = netif_running(br->dev) ?
+ BR_STATE_FORWARDING : BR_STATE_DISABLED;
+ p->state = state;
+ p->flags &= ~BR_MRP_AWARE;
+ spin_unlock_bh(&br->lock);
+ br_mrp_port_switchdev_set_state(p, state);
+ rcu_assign_pointer(mrp->s_port, NULL);
+ }
+
+ p = rtnl_dereference(mrp->i_port);
+ if (p) {
+ spin_lock_bh(&br->lock);
+ state = netif_running(br->dev) ?
+ BR_STATE_FORWARDING : BR_STATE_DISABLED;
+ p->state = state;
+ p->flags &= ~BR_MRP_AWARE;
+ spin_unlock_bh(&br->lock);
+ br_mrp_port_switchdev_set_state(p, state);
+ rcu_assign_pointer(mrp->i_port, NULL);
+ }
+
+ hlist_del_rcu(&mrp->list);
+ kfree_rcu(mrp, rcu);
+
+ if (hlist_empty(&br->mrp_list))
+ br_del_frame(br, &mrp_frame_type);
+}
+
+/* Adds a new MRP instance.
+ * note: called under rtnl_lock
+ */
+int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance)
+{
+ struct net_bridge_port *p;
+ struct br_mrp *mrp;
+ int err;
+
+ /* If the ring exists, it is not possible to create another one with the
+ * same ring_id
+ */
+ mrp = br_mrp_find_id(br, instance->ring_id);
+ if (mrp)
+ return -EINVAL;
+
+ if (!br_mrp_get_port(br, instance->p_ifindex) ||
+ !br_mrp_get_port(br, instance->s_ifindex))
+ return -EINVAL;
+
+ /* It is not possible to have the same port part of multiple rings */
+ if (!br_mrp_unique_ifindex(br, instance->p_ifindex) ||
+ !br_mrp_unique_ifindex(br, instance->s_ifindex))
+ return -EINVAL;
+
+ mrp = kzalloc(sizeof(*mrp), GFP_KERNEL);
+ if (!mrp)
+ return -ENOMEM;
+
+ mrp->ring_id = instance->ring_id;
+ mrp->prio = instance->prio;
+
+ p = br_mrp_get_port(br, instance->p_ifindex);
+ spin_lock_bh(&br->lock);
+ p->state = BR_STATE_FORWARDING;
+ p->flags |= BR_MRP_AWARE;
+ spin_unlock_bh(&br->lock);
+ rcu_assign_pointer(mrp->p_port, p);
+
+ p = br_mrp_get_port(br, instance->s_ifindex);
+ spin_lock_bh(&br->lock);
+ p->state = BR_STATE_FORWARDING;
+ p->flags |= BR_MRP_AWARE;
+ spin_unlock_bh(&br->lock);
+ rcu_assign_pointer(mrp->s_port, p);
+
+ if (hlist_empty(&br->mrp_list))
+ br_add_frame(br, &mrp_frame_type);
+
+ INIT_DELAYED_WORK(&mrp->test_work, br_mrp_test_work_expired);
+ INIT_DELAYED_WORK(&mrp->in_test_work, br_mrp_in_test_work_expired);
+ hlist_add_tail_rcu(&mrp->list, &br->mrp_list);
+
+ err = br_mrp_switchdev_add(br, mrp);
+ if (err)
+ goto delete_mrp;
+
+ return 0;
+
+delete_mrp:
+ br_mrp_del_impl(br, mrp);
+
+ return err;
+}
+
+/* Deletes the MRP instance that the port is part of
+ * note: called under rtnl_lock
+ */
+void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p)
+{
+ struct br_mrp *mrp = br_mrp_find_port(br, p);
+
+ /* If the port is not part of a MRP instance just bail out */
+ if (!mrp)
+ return;
+
+ br_mrp_del_impl(br, mrp);
+}
+
+/* Deletes existing MRP instance based on ring_id
+ * note: called under rtnl_lock
+ */
+int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance)
+{
+ struct br_mrp *mrp = br_mrp_find_id(br, instance->ring_id);
+
+ if (!mrp)
+ return -EINVAL;
+
+ br_mrp_del_impl(br, mrp);
+
+ return 0;
+}
+
+/* Set port state, port state can be forwarding, blocked or disabled
+ * note: already called with rtnl_lock
+ */
+int br_mrp_set_port_state(struct net_bridge_port *p,
+ enum br_mrp_port_state_type state)
+{
+ u32 port_state;
+
+ if (!p || !(p->flags & BR_MRP_AWARE))
+ return -EINVAL;
+
+ spin_lock_bh(&p->br->lock);
+
+ if (state == BR_MRP_PORT_STATE_FORWARDING)
+ port_state = BR_STATE_FORWARDING;
+ else
+ port_state = BR_STATE_BLOCKING;
+
+ p->state = port_state;
+ spin_unlock_bh(&p->br->lock);
+
+ br_mrp_port_switchdev_set_state(p, port_state);
+
+ return 0;
+}
+
+/* Set port role, port role can be primary or secondary
+ * note: already called with rtnl_lock
+ */
+int br_mrp_set_port_role(struct net_bridge_port *p,
+ enum br_mrp_port_role_type role)
+{
+ struct br_mrp *mrp;
+
+ if (!p || !(p->flags & BR_MRP_AWARE))
+ return -EINVAL;
+
+ mrp = br_mrp_find_port(p->br, p);
+
+ if (!mrp)
+ return -EINVAL;
+
+ switch (role) {
+ case BR_MRP_PORT_ROLE_PRIMARY:
+ rcu_assign_pointer(mrp->p_port, p);
+ break;
+ case BR_MRP_PORT_ROLE_SECONDARY:
+ rcu_assign_pointer(mrp->s_port, p);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ br_mrp_port_switchdev_set_role(p, role);
+
+ return 0;
+}
+
+/* Set ring state, ring state can be only Open or Closed
+ * note: already called with rtnl_lock
+ */
+int br_mrp_set_ring_state(struct net_bridge *br,
+ struct br_mrp_ring_state *state)
+{
+ struct br_mrp *mrp = br_mrp_find_id(br, state->ring_id);
+
+ if (!mrp)
+ return -EINVAL;
+
+ if (mrp->ring_state != state->ring_state)
+ mrp->ring_transitions++;
+
+ mrp->ring_state = state->ring_state;
+
+ br_mrp_switchdev_set_ring_state(br, mrp, state->ring_state);
+
+ return 0;
+}
+
+/* Set ring role, ring role can be only MRM(Media Redundancy Manager) or
+ * MRC(Media Redundancy Client).
+ * note: already called with rtnl_lock
+ */
+int br_mrp_set_ring_role(struct net_bridge *br,
+ struct br_mrp_ring_role *role)
+{
+ struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id);
+ enum br_mrp_hw_support support;
+
+ if (!mrp)
+ return -EINVAL;
+
+ mrp->ring_role = role->ring_role;
+
+ /* If there is an error just bail out */
+ support = br_mrp_switchdev_set_ring_role(br, mrp, role->ring_role);
+ if (support == BR_MRP_NONE)
+ return -EOPNOTSUPP;
+
+ /* Now detect if the HW actually applied the role or not. If the HW
+ * applied the role, the SW will not do those operations anymore. For
+ * example, if the role is MRM then the HW will notify the SW when the
+ * ring is open, but if the role is not pushed to the HW the SW will
+ * need to detect when the ring is open.
+ */
+ mrp->ring_role_offloaded = support == BR_MRP_SW ? 0 : 1;
+
+ return 0;
+}
+
+/* Start to generate or monitor MRP test frames, the frames are generated by
+ * HW and if it fails, they are generated by the SW.
+ * note: already called with rtnl_lock
+ */
+int br_mrp_start_test(struct net_bridge *br,
+ struct br_mrp_start_test *test)
+{
+ struct br_mrp *mrp = br_mrp_find_id(br, test->ring_id);
+ enum br_mrp_hw_support support;
+
+ if (!mrp)
+ return -EINVAL;
+
+ /* Try to push it to the HW and if it fails then continue with SW
+ * implementation and if that also fails then return error.
+ */
+ support = br_mrp_switchdev_send_ring_test(br, mrp, test->interval,
+ test->max_miss, test->period,
+ test->monitor);
+ if (support == BR_MRP_NONE)
+ return -EOPNOTSUPP;
+
+ if (support == BR_MRP_HW)
+ return 0;
+
+ mrp->test_interval = test->interval;
+ mrp->test_end = jiffies + usecs_to_jiffies(test->period);
+ mrp->test_max_miss = test->max_miss;
+ mrp->test_monitor = test->monitor;
+ mrp->test_count_miss = 0;
+ queue_delayed_work(system_percpu_wq, &mrp->test_work,
+ usecs_to_jiffies(test->interval));
+
+ return 0;
+}
+
+/* Set in state, in state can be only Open or Closed
+ * note: already called with rtnl_lock
+ */
+int br_mrp_set_in_state(struct net_bridge *br, struct br_mrp_in_state *state)
+{
+ struct br_mrp *mrp = br_mrp_find_in_id(br, state->in_id);
+
+ if (!mrp)
+ return -EINVAL;
+
+ if (mrp->in_state != state->in_state)
+ mrp->in_transitions++;
+
+ mrp->in_state = state->in_state;
+
+ br_mrp_switchdev_set_in_state(br, mrp, state->in_state);
+
+ return 0;
+}
+
+/* Set in role, in role can be only MIM(Media Interconnection Manager) or
+ * MIC(Media Interconnection Client).
+ * note: already called with rtnl_lock
+ */
+int br_mrp_set_in_role(struct net_bridge *br, struct br_mrp_in_role *role)
+{
+ struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id);
+ enum br_mrp_hw_support support;
+ struct net_bridge_port *p;
+
+ if (!mrp)
+ return -EINVAL;
+
+ if (!br_mrp_get_port(br, role->i_ifindex))
+ return -EINVAL;
+
+ if (role->in_role == BR_MRP_IN_ROLE_DISABLED) {
+ u8 state;
+
+ /* It is not allowed to disable a port that doesn't exist */
+ p = rtnl_dereference(mrp->i_port);
+ if (!p)
+ return -EINVAL;
+
+ /* Stop generating MRP_InTest frames */
+ cancel_delayed_work_sync(&mrp->in_test_work);
+ br_mrp_switchdev_send_in_test(br, mrp, 0, 0, 0);
+
+ /* Remove the port */
+ spin_lock_bh(&br->lock);
+ state = netif_running(br->dev) ?
+ BR_STATE_FORWARDING : BR_STATE_DISABLED;
+ p->state = state;
+ p->flags &= ~BR_MRP_AWARE;
+ spin_unlock_bh(&br->lock);
+ br_mrp_port_switchdev_set_state(p, state);
+ rcu_assign_pointer(mrp->i_port, NULL);
+
+ mrp->in_role = role->in_role;
+ mrp->in_id = 0;
+
+ return 0;
+ }
+
+ /* It is not possible to have the same port part of multiple rings */
+ if (!br_mrp_unique_ifindex(br, role->i_ifindex))
+ return -EINVAL;
+
+ /* It is not allowed to set a different interconnect port if the mrp
+ * instance has already one. First it needs to be disabled and after
+ * that set the new port
+ */
+ if (rcu_access_pointer(mrp->i_port))
+ return -EINVAL;
+
+ p = br_mrp_get_port(br, role->i_ifindex);
+ spin_lock_bh(&br->lock);
+ p->state = BR_STATE_FORWARDING;
+ p->flags |= BR_MRP_AWARE;
+ spin_unlock_bh(&br->lock);
+ rcu_assign_pointer(mrp->i_port, p);
+
+ mrp->in_role = role->in_role;
+ mrp->in_id = role->in_id;
+
+ /* If there is an error just bail out */
+ support = br_mrp_switchdev_set_in_role(br, mrp, role->in_id,
+ role->ring_id, role->in_role);
+ if (support == BR_MRP_NONE)
+ return -EOPNOTSUPP;
+
+ /* Now detect if the HW actually applied the role or not. If the HW
+ * applied the role, the SW will not do those operations anymore. For
+ * example, if the role is MIM then the HW will notify the SW when the
+ * interconnect ring is open, but if the role is not pushed to the HW
+ * the SW will need to detect when the interconnect ring is open.
+ */
+ mrp->in_role_offloaded = support == BR_MRP_SW ? 0 : 1;
+
+ return 0;
+}
+
+/* Start to generate MRP_InTest frames, the frames are generated by
+ * HW and if it fails, they are generated by the SW.
+ * note: already called with rtnl_lock
+ */
+int br_mrp_start_in_test(struct net_bridge *br,
+ struct br_mrp_start_in_test *in_test)
+{
+ struct br_mrp *mrp = br_mrp_find_in_id(br, in_test->in_id);
+ enum br_mrp_hw_support support;
+
+ if (!mrp)
+ return -EINVAL;
+
+ if (mrp->in_role != BR_MRP_IN_ROLE_MIM)
+ return -EINVAL;
+
+ /* Try to push it to the HW and if it fails then continue with SW
+ * implementation and if that also fails then return error.
+ */
+ support = br_mrp_switchdev_send_in_test(br, mrp, in_test->interval,
+ in_test->max_miss,
+ in_test->period);
+ if (support == BR_MRP_NONE)
+ return -EOPNOTSUPP;
+
+ if (support == BR_MRP_HW)
+ return 0;
+
+ mrp->in_test_interval = in_test->interval;
+ mrp->in_test_end = jiffies + usecs_to_jiffies(in_test->period);
+ mrp->in_test_max_miss = in_test->max_miss;
+ mrp->in_test_count_miss = 0;
+ queue_delayed_work(system_percpu_wq, &mrp->in_test_work,
+ usecs_to_jiffies(in_test->interval));
+
+ return 0;
+}
+
+/* Determine if the frame type is a ring frame */
+static bool br_mrp_ring_frame(struct sk_buff *skb)
+{
+ const struct br_mrp_tlv_hdr *hdr;
+ struct br_mrp_tlv_hdr _hdr;
+
+ hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return false;
+
+ if (hdr->type == BR_MRP_TLV_HEADER_RING_TEST ||
+ hdr->type == BR_MRP_TLV_HEADER_RING_TOPO ||
+ hdr->type == BR_MRP_TLV_HEADER_RING_LINK_DOWN ||
+ hdr->type == BR_MRP_TLV_HEADER_RING_LINK_UP ||
+ hdr->type == BR_MRP_TLV_HEADER_OPTION)
+ return true;
+
+ return false;
+}
+
+/* Tell whether the TLV that follows the 16-bit version field carries one of
+ * the interconnect frame types. Returns false when the TLV header can't be
+ * read.
+ */
+static bool br_mrp_in_frame(struct sk_buff *skb)
+{
+	struct br_mrp_tlv_hdr buf;
+	const struct br_mrp_tlv_hdr *tlv;
+
+	tlv = skb_header_pointer(skb, sizeof(uint16_t), sizeof(buf), &buf);
+	if (!tlv)
+		return false;
+
+	return tlv->type == BR_MRP_TLV_HEADER_IN_TEST ||
+	       tlv->type == BR_MRP_TLV_HEADER_IN_TOPO ||
+	       tlv->type == BR_MRP_TLV_HEADER_IN_LINK_DOWN ||
+	       tlv->type == BR_MRP_TLV_HEADER_IN_LINK_UP ||
+	       tlv->type == BR_MRP_TLV_HEADER_IN_LINK_STATUS;
+}
+
+/* MRM handling of a received frame: only MRP Test frames are looked at, every
+ * other MRP frame is left to the userspace application.
+ * note: already called with rcu_read_lock
+ */
+static void br_mrp_mrm_process(struct br_mrp *mrp, struct net_bridge_port *port,
+			       struct sk_buff *skb)
+{
+	struct br_mrp_tlv_hdr buf;
+	const struct br_mrp_tlv_hdr *tlv;
+
+	/* The MRP header begins with a 16-bit version field; step over it to
+	 * reach the TLV header.
+	 */
+	tlv = skb_header_pointer(skb, sizeof(uint16_t), sizeof(buf), &buf);
+	if (!tlv || tlv->type != BR_MRP_TLV_HEADER_RING_TEST)
+		return;
+
+	/* A test frame arrived, so restart the miss accounting */
+	mrp->test_count_miss = 0;
+
+	/* Userspace is told that the ring is closed only if it is not already
+	 * recorded as closed.
+	 */
+	if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED)
+		return;
+
+	br_mrp_ring_port_open(port->dev, false);
+}
+
+/* Compare the sender of a ring test frame against this node. The sender is
+ * better when its priority value is numerically lower or, with equal
+ * priorities, when its source address is numerically lower than the bridge
+ * device address.
+ */
+static bool br_mrp_test_better_than_own(struct br_mrp *mrp,
+					struct net_bridge *br,
+					const struct br_mrp_ring_test_hdr *hdr)
+{
+	u16 prio = be16_to_cpu(hdr->prio);
+
+	/* Different priorities decide on their own */
+	if (prio != mrp->prio)
+		return prio < mrp->prio;
+
+	/* Tie on priority: fall back to the MAC addresses */
+	return ether_addr_to_u64(hdr->sa) <
+	       ether_addr_to_u64(br->dev->dev_addr);
+}
+
+/* MRA handling of a received frame: only MRP Test frames are looked at, every
+ * other MRP frame is left to the userspace application.
+ * note: already called with rcu_read_lock
+ */
+static void br_mrp_mra_process(struct br_mrp *mrp, struct net_bridge *br,
+			       struct net_bridge_port *port,
+			       struct sk_buff *skb)
+{
+	struct br_mrp_ring_test_hdr test_buf;
+	struct br_mrp_tlv_hdr tlv_buf;
+	const struct br_mrp_ring_test_hdr *test;
+	const struct br_mrp_tlv_hdr *tlv;
+
+	/* The MRP header begins with a 16-bit version field; step over it to
+	 * reach the TLV header.
+	 */
+	tlv = skb_header_pointer(skb, sizeof(uint16_t), sizeof(tlv_buf),
+				 &tlv_buf);
+	if (!tlv || tlv->type != BR_MRP_TLV_HEADER_RING_TEST)
+		return;
+
+	/* The ring test header sits right behind the TLV header */
+	test = skb_header_pointer(skb, sizeof(uint16_t) + sizeof(tlv_buf),
+				  sizeof(test_buf), &test_buf);
+	if (!test)
+		return;
+
+	/* The miss counter is cleared only by frames with a better priority
+	 * than the node, because otherwise the node will need to behave as
+	 * MRM.
+	 */
+	if (br_mrp_test_better_than_own(mrp, br, test))
+		mrp->test_count_miss = 0;
+}
+
+/* MIM handling of a received InTest frame; all other MRP frames are left to
+ * the userspace application. Returns true when the frame is one of this
+ * node's own InTest frames, false otherwise (including when the headers can't
+ * be read).
+ * note: already called with rcu_read_lock
+ */
+static bool br_mrp_mim_process(struct br_mrp *mrp, struct net_bridge_port *port,
+			       struct sk_buff *skb)
+{
+	struct br_mrp_in_test_hdr in_buf;
+	struct br_mrp_tlv_hdr tlv_buf;
+	const struct br_mrp_in_test_hdr *in_test;
+
+	/* The MRP header begins with a 16-bit version field; the TLV header
+	 * behind it must at least be readable.
+	 */
+	if (!skb_header_pointer(skb, sizeof(uint16_t), sizeof(tlv_buf),
+				&tlv_buf))
+		return false;
+
+	/* The check for InTest frame type was already done */
+	in_test = skb_header_pointer(skb, sizeof(uint16_t) + sizeof(tlv_buf),
+				     sizeof(in_buf), &in_buf);
+	if (!in_test)
+		return false;
+
+	/* Only the node's own InTest frames are processed */
+	if (mrp->in_id != ntohs(in_test->id))
+		return false;
+
+	mrp->in_test_count_miss = 0;
+
+	/* Userspace is told that the interconnect ring is closed only if it
+	 * is not already recorded as closed.
+	 */
+	if (mrp->in_state != BR_MRP_IN_STATE_CLOSED)
+		br_mrp_in_port_open(port->dev, false);
+
+	return true;
+}
+
+/* Read the type field of the TLV header of an MRP frame; 0xff is returned
+ * when the TLV header can't be read.
+ * note: already called with rcu_read_lock
+ */
+static u8 br_mrp_get_frame_type(struct sk_buff *skb)
+{
+	struct br_mrp_tlv_hdr buf;
+	const struct br_mrp_tlv_hdr *tlv;
+
+	/* The MRP header begins with a 16-bit version field; step over it to
+	 * reach the TLV header.
+	 */
+	tlv = skb_header_pointer(skb, sizeof(uint16_t), sizeof(buf), &buf);
+
+	return tlv ? tlv->type : 0xff;
+}
+
+/* A node behaves as MRM when its ring role is MRM, or when it is MRA and
+ * test_monitor is not set.
+ */
+static bool br_mrp_mrm_behaviour(struct br_mrp *mrp)
+{
+	if (mrp->ring_role == BR_MRP_RING_ROLE_MRM)
+		return true;
+
+	return mrp->ring_role == BR_MRP_RING_ROLE_MRA && !mrp->test_monitor;
+}
+
+/* A node behaves as MRC when its ring role is MRC, or when it is MRA and
+ * test_monitor is set.
+ */
+static bool br_mrp_mrc_behaviour(struct br_mrp *mrp)
+{
+	if (mrp->ring_role == BR_MRP_RING_ROLE_MRC)
+		return true;
+
+	return mrp->ring_role == BR_MRP_RING_ROLE_MRA && mrp->test_monitor;
+}
+
+/* This will just forward the frame to the other mrp ring ports, depending on
+ * the frame type, ring role and interconnect role.
+ *
+ * p is the port the frame was received on. The dev argument is not used in
+ * this function.
+ *
+ * Returns 0 when the frame is not handled here (receiving port disabled, no
+ * MRP instance found for the port, or the primary/secondary ring port is not
+ * set) and 1 once the frame went through the MRP forwarding decision, whether
+ * or not it was actually forwarded.
+ * note: already called with rcu_read_lock
+ */
+static int br_mrp_rcv(struct net_bridge_port *p,
+ struct sk_buff *skb, struct net_device *dev)
+{
+ /* The *_dst pointers are the ports the frame will be forwarded to; they
+ * start as the instance's ports and are set to NULL below to suppress
+ * forwarding on a given port.
+ */
+ struct net_bridge_port *p_port, *s_port, *i_port = NULL;
+ struct net_bridge_port *p_dst, *s_dst, *i_dst = NULL;
+ struct net_bridge *br;
+ struct br_mrp *mrp;
+
+ /* If port is disabled don't accept any frames */
+ if (p->state == BR_STATE_DISABLED)
+ return 0;
+
+ br = p->br;
+ mrp = br_mrp_find_port(br, p);
+ if (unlikely(!mrp))
+ return 0;
+
+ p_port = rcu_dereference(mrp->p_port);
+ if (!p_port)
+ return 0;
+ p_dst = p_port;
+
+ s_port = rcu_dereference(mrp->s_port);
+ if (!s_port)
+ return 0;
+ s_dst = s_port;
+
+ /* If the frame is a ring frame then it is not required to check the
+ * interconnect role and ports to process or forward the frame
+ */
+ if (br_mrp_ring_frame(skb)) {
+ /* If the role is MRM then don't forward the frames */
+ if (mrp->ring_role == BR_MRP_RING_ROLE_MRM) {
+ br_mrp_mrm_process(mrp, p, skb);
+ goto no_forward;
+ }
+
+ /* If the role is MRA then don't forward the frames if it
+ * behaves as MRM node
+ */
+ if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) {
+ if (!mrp->test_monitor) {
+ br_mrp_mrm_process(mrp, p, skb);
+ goto no_forward;
+ }
+
+ br_mrp_mra_process(mrp, br, p, skb);
+ }
+
+ goto forward;
+ }
+
+ if (br_mrp_in_frame(skb)) {
+ u8 in_type = br_mrp_get_frame_type(skb);
+
+ /* i_port may be NULL here; i_dst is checked before forwarding */
+ i_port = rcu_dereference(mrp->i_port);
+ i_dst = i_port;
+
+ /* If the ring port is in block state it should not forward
+ * In_Test frames
+ */
+ if (br_mrp_is_ring_port(p_port, s_port, p) &&
+ p->state == BR_STATE_BLOCKING &&
+ in_type == BR_MRP_TLV_HEADER_IN_TEST)
+ goto no_forward;
+
+ /* Nodes that behave as MRM need to stop forwarding the
+ * frames in case the ring is closed, otherwise there will be a
+ * loop. In this case the frame is not forwarded between the ring
+ * ports.
+ */
+ if (br_mrp_mrm_behaviour(mrp) &&
+ br_mrp_is_ring_port(p_port, s_port, p) &&
+ (s_port->state != BR_STATE_FORWARDING ||
+ p_port->state != BR_STATE_FORWARDING)) {
+ p_dst = NULL;
+ s_dst = NULL;
+ }
+
+ /* A node that behaves as MRC and doesn't have an interconnect
+ * role should forward all frames between the ring ports
+ * because it doesn't have an interconnect port
+ */
+ if (br_mrp_mrc_behaviour(mrp) &&
+ mrp->in_role == BR_MRP_IN_ROLE_DISABLED)
+ goto forward;
+
+ if (mrp->in_role == BR_MRP_IN_ROLE_MIM) {
+ if (in_type == BR_MRP_TLV_HEADER_IN_TEST) {
+ /* MIM should not forward its own InTest
+ * frames
+ */
+ if (br_mrp_mim_process(mrp, p, skb)) {
+ goto no_forward;
+ } else {
+ if (br_mrp_is_ring_port(p_port, s_port,
+ p))
+ i_dst = NULL;
+
+ if (br_mrp_is_in_port(i_port, p))
+ goto no_forward;
+ }
+ } else {
+ /* MIM should forward IntLinkChange/Status and
+ * IntTopoChange between ring ports but MIM
+ * should not forward IntLinkChange/Status and
+ * IntTopoChange if the frame was received at
+ * the interconnect port
+ */
+ if (br_mrp_is_ring_port(p_port, s_port, p))
+ i_dst = NULL;
+
+ if (br_mrp_is_in_port(i_port, p))
+ goto no_forward;
+ }
+ }
+
+ if (mrp->in_role == BR_MRP_IN_ROLE_MIC) {
+ /* MIC should forward InTest frames on all ports
+ * regardless of the received port
+ */
+ if (in_type == BR_MRP_TLV_HEADER_IN_TEST)
+ goto forward;
+
+ /* MIC should forward IntLinkChange frames only if they
+ * are received on ring ports to all the ports
+ */
+ if (br_mrp_is_ring_port(p_port, s_port, p) &&
+ (in_type == BR_MRP_TLV_HEADER_IN_LINK_UP ||
+ in_type == BR_MRP_TLV_HEADER_IN_LINK_DOWN))
+ goto forward;
+
+ /* MIC should forward IntLinkStatus frames only to
+ * interconnect port if it was received on a ring port.
+ * If it is received on interconnect port then, it
+ * should be forwarded on both ring ports
+ */
+ if (br_mrp_is_ring_port(p_port, s_port, p) &&
+ in_type == BR_MRP_TLV_HEADER_IN_LINK_STATUS) {
+ p_dst = NULL;
+ s_dst = NULL;
+ }
+
+ /* Should forward the InTopo frames only between the
+ * ring ports
+ */
+ if (in_type == BR_MRP_TLV_HEADER_IN_TOPO) {
+ i_dst = NULL;
+ goto forward;
+ }
+
+ /* In all the other cases don't forward the frames */
+ goto no_forward;
+ }
+ }
+
+ /* Frames that are neither ring nor interconnect frames also end up here
+ * and are forwarded on the ring ports (i_dst is still NULL for them).
+ */
+forward:
+ if (p_dst)
+ br_forward(p_dst, skb, true, false);
+ if (s_dst)
+ br_forward(s_dst, skb, true, false);
+ if (i_dst)
+ br_forward(i_dst, skb, true, false);
+
+no_forward:
+ return 1;
+}
+
+/* Entry point for a frame received on a bridge port. Ports that are not
+ * flagged BR_MRP_AWARE return 0 so the frame gets normal forwarding; for
+ * MRP aware ports the frame is handed to br_mrp_rcv() and its result is
+ * returned.
+ * note: already called with rcu_read_lock
+ */
+static int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb)
+{
+	/* If there is no MRP instance do normal forwarding */
+	if (likely(!(p->flags & BR_MRP_AWARE)))
+		return 0;
+
+	return br_mrp_rcv(p, skb, p->dev);
+}
+
+/* MRP counts as enabled on a bridge as soon as its mrp_list is not empty */
+bool br_mrp_enabled(struct net_bridge *br)
+{
+	if (hlist_empty(&br->mrp_list))
+		return false;
+
+	return true;
+}
diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c
new file mode 100644
index 000000000000..ce6f63c77cc0
--- /dev/null
+++ b/net/bridge/br_mrp_netlink.c
@@ -0,0 +1,571 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <net/genetlink.h>
+
+#include <uapi/linux/mrp_bridge.h>
+#include "br_private.h"
+#include "br_private_mrp.h"
+
+/* Policy used by br_mrp_parse() for the top-level MRP attributes: the UNSPEC
+ * attribute is rejected and every other listed attribute must be a nest.
+ */
+static const struct nla_policy br_mrp_policy[IFLA_BRIDGE_MRP_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_INSTANCE] = { .type = NLA_NESTED },
+ [IFLA_BRIDGE_MRP_PORT_STATE] = { .type = NLA_NESTED },
+ [IFLA_BRIDGE_MRP_PORT_ROLE] = { .type = NLA_NESTED },
+ [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_NESTED },
+ [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_NESTED },
+ [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_NESTED },
+ [IFLA_BRIDGE_MRP_IN_ROLE] = { .type = NLA_NESTED },
+ [IFLA_BRIDGE_MRP_IN_STATE] = { .type = NLA_NESTED },
+ [IFLA_BRIDGE_MRP_START_IN_TEST] = { .type = NLA_NESTED },
+};
+
+/* Policy used by br_mrp_instance_parse(): ring id and the two port ifindexes
+ * are u32, the priority is u16.
+ */
+static const struct nla_policy
+br_mrp_instance_policy[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_INSTANCE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_INSTANCE_RING_ID] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_INSTANCE_PRIO] = { .type = NLA_U16 },
+};
+
+/* Parse the nested instance attributes and create or delete an MRP instance.
+ * RING_ID, P_IFINDEX and S_IFINDEX are mandatory; PRIO is optional and
+ * defaults to MRP_DEFAULT_PRIO. The instance is added when cmd is
+ * RTM_SETLINK and deleted for any other cmd.
+ *
+ * Returns the nla_parse_nested() error, -EINVAL when a mandatory attribute is
+ * missing, otherwise the result of br_mrp_add()/br_mrp_del().
+ */
+static int br_mrp_instance_parse(struct net_bridge *br, struct nlattr *attr,
+				 int cmd, struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1];
+	struct br_mrp_instance inst;
+	int err;
+
+	err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_INSTANCE_MAX, attr,
+			       br_mrp_instance_policy, extack);
+	if (err)
+		return err;
+
+	if (!tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID] ||
+	    !tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] ||
+	    !tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Missing attribute: RING_ID or P_IFINDEX or S_IFINDEX");
+		return -EINVAL;
+	}
+
+	memset(&inst, 0, sizeof(inst));
+
+	inst.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID]);
+	inst.p_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX]);
+	inst.s_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]);
+	inst.prio = MRP_DEFAULT_PRIO;
+
+	if (tb[IFLA_BRIDGE_MRP_INSTANCE_PRIO])
+		inst.prio = nla_get_u16(tb[IFLA_BRIDGE_MRP_INSTANCE_PRIO]);
+
+	/* Both outcomes return here; there is no fall-through path */
+	if (cmd == RTM_SETLINK)
+		return br_mrp_add(br, &inst);
+
+	return br_mrp_del(br, &inst);
+}
+
+/* Policy used by br_mrp_port_state_parse(): the port state is a u32 */
+static const struct nla_policy
+br_mrp_port_state_policy[IFLA_BRIDGE_MRP_PORT_STATE_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_PORT_STATE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_PORT_STATE_STATE] = { .type = NLA_U32 },
+};
+
+/* Parse the nested port state attributes and apply the state to port p.
+ * The STATE attribute is mandatory.
+ *
+ * Returns the nla_parse_nested() error, -EINVAL when STATE is missing,
+ * otherwise the result of br_mrp_set_port_state().
+ */
+static int br_mrp_port_state_parse(struct net_bridge_port *p,
+				   struct nlattr *attr,
+				   struct netlink_ext_ack *extack)
+{
+	struct nlattr *attrs[IFLA_BRIDGE_MRP_PORT_STATE_MAX + 1];
+	enum br_mrp_port_state_type new_state;
+	struct nlattr *state_attr;
+	int ret;
+
+	ret = nla_parse_nested(attrs, IFLA_BRIDGE_MRP_PORT_STATE_MAX, attr,
+			       br_mrp_port_state_policy, extack);
+	if (ret)
+		return ret;
+
+	state_attr = attrs[IFLA_BRIDGE_MRP_PORT_STATE_STATE];
+	if (!state_attr) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing attribute: STATE");
+		return -EINVAL;
+	}
+
+	new_state = nla_get_u32(state_attr);
+
+	return br_mrp_set_port_state(p, new_state);
+}
+
+/* Policy used by br_mrp_port_role_parse(): the port role is a u32 */
+static const struct nla_policy
+br_mrp_port_role_policy[IFLA_BRIDGE_MRP_PORT_ROLE_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_PORT_ROLE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_PORT_ROLE_ROLE] = { .type = NLA_U32 },
+};
+
+/* Parse the nested port role attributes and apply the role to port p.
+ * The ROLE attribute is mandatory.
+ *
+ * Returns the nla_parse_nested() error, -EINVAL when ROLE is missing,
+ * otherwise the result of br_mrp_set_port_role().
+ */
+static int br_mrp_port_role_parse(struct net_bridge_port *p,
+				  struct nlattr *attr,
+				  struct netlink_ext_ack *extack)
+{
+	struct nlattr *attrs[IFLA_BRIDGE_MRP_PORT_ROLE_MAX + 1];
+	enum br_mrp_port_role_type new_role;
+	struct nlattr *role_attr;
+	int ret;
+
+	ret = nla_parse_nested(attrs, IFLA_BRIDGE_MRP_PORT_ROLE_MAX, attr,
+			       br_mrp_port_role_policy, extack);
+	if (ret)
+		return ret;
+
+	role_attr = attrs[IFLA_BRIDGE_MRP_PORT_ROLE_ROLE];
+	if (!role_attr) {
+		NL_SET_ERR_MSG_MOD(extack, "Missing attribute: ROLE");
+		return -EINVAL;
+	}
+
+	new_role = nla_get_u32(role_attr);
+
+	return br_mrp_set_port_role(p, new_role);
+}
+
+/* Policy used by br_mrp_ring_state_parse(): ring id and ring state are u32 */
+static const struct nla_policy
+br_mrp_ring_state_policy[IFLA_BRIDGE_MRP_RING_STATE_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_RING_STATE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_RING_STATE_RING_ID] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_RING_STATE_STATE] = { .type = NLA_U32 },
+};
+
+/* Parse the nested ring state attributes and apply the ring state.
+ * Both RING_ID and STATE are mandatory.
+ *
+ * Returns the nla_parse_nested() error, -EINVAL when an attribute is missing,
+ * otherwise the result of br_mrp_set_ring_state().
+ */
+static int br_mrp_ring_state_parse(struct net_bridge *br, struct nlattr *attr,
+				   struct netlink_ext_ack *extack)
+{
+	struct nlattr *attrs[IFLA_BRIDGE_MRP_RING_STATE_MAX + 1];
+	struct nlattr *id_attr, *state_attr;
+	struct br_mrp_ring_state req;
+	int ret;
+
+	ret = nla_parse_nested(attrs, IFLA_BRIDGE_MRP_RING_STATE_MAX, attr,
+			       br_mrp_ring_state_policy, extack);
+	if (ret)
+		return ret;
+
+	id_attr = attrs[IFLA_BRIDGE_MRP_RING_STATE_RING_ID];
+	state_attr = attrs[IFLA_BRIDGE_MRP_RING_STATE_STATE];
+	if (!id_attr || !state_attr) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Missing attribute: RING_ID or STATE");
+		return -EINVAL;
+	}
+
+	memset(&req, 0x0, sizeof(req));
+	req.ring_id = nla_get_u32(id_attr);
+	req.ring_state = nla_get_u32(state_attr);
+
+	return br_mrp_set_ring_state(br, &req);
+}
+
+/* Policy used by br_mrp_ring_role_parse(): ring id and ring role are u32 */
+static const struct nla_policy
+br_mrp_ring_role_policy[IFLA_BRIDGE_MRP_RING_ROLE_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_RING_ROLE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_RING_ROLE_RING_ID] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_RING_ROLE_ROLE] = { .type = NLA_U32 },
+};
+
+/* Parse the nested ring role attributes and apply the ring role.
+ * Both RING_ID and ROLE are mandatory.
+ *
+ * Returns the nla_parse_nested() error, -EINVAL when an attribute is missing,
+ * otherwise the result of br_mrp_set_ring_role().
+ */
+static int br_mrp_ring_role_parse(struct net_bridge *br, struct nlattr *attr,
+				  struct netlink_ext_ack *extack)
+{
+	struct nlattr *attrs[IFLA_BRIDGE_MRP_RING_ROLE_MAX + 1];
+	struct nlattr *id_attr, *role_attr;
+	struct br_mrp_ring_role req;
+	int ret;
+
+	ret = nla_parse_nested(attrs, IFLA_BRIDGE_MRP_RING_ROLE_MAX, attr,
+			       br_mrp_ring_role_policy, extack);
+	if (ret)
+		return ret;
+
+	id_attr = attrs[IFLA_BRIDGE_MRP_RING_ROLE_RING_ID];
+	role_attr = attrs[IFLA_BRIDGE_MRP_RING_ROLE_ROLE];
+	if (!id_attr || !role_attr) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Missing attribute: RING_ID or ROLE");
+		return -EINVAL;
+	}
+
+	memset(&req, 0x0, sizeof(req));
+	req.ring_id = nla_get_u32(id_attr);
+	req.ring_role = nla_get_u32(role_attr);
+
+	return br_mrp_set_ring_role(br, &req);
+}
+
+/* Policy used by br_mrp_start_test_parse(): all ring test attributes are
+ * u32.
+ */
+static const struct nla_policy
+br_mrp_start_test_policy[IFLA_BRIDGE_MRP_START_TEST_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_START_TEST_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_START_TEST_RING_ID] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_START_TEST_INTERVAL] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_START_TEST_PERIOD] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_START_TEST_MONITOR] = { .type = NLA_U32 },
+};
+
+/* Parse the nested ring test attributes and start the ring test.
+ * RING_ID, INTERVAL, MAX_MISS and PERIOD are mandatory; MONITOR is optional
+ * and defaults to false.
+ *
+ * Returns the nla_parse_nested() error, -EINVAL when a mandatory attribute is
+ * missing, otherwise the result of br_mrp_start_test().
+ */
+static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr,
+				   struct netlink_ext_ack *extack)
+{
+	struct nlattr *attrs[IFLA_BRIDGE_MRP_START_TEST_MAX + 1];
+	struct nlattr *monitor_attr;
+	struct br_mrp_start_test req;
+	int ret;
+
+	ret = nla_parse_nested(attrs, IFLA_BRIDGE_MRP_START_TEST_MAX, attr,
+			       br_mrp_start_test_policy, extack);
+	if (ret)
+		return ret;
+
+	if (!attrs[IFLA_BRIDGE_MRP_START_TEST_RING_ID] ||
+	    !attrs[IFLA_BRIDGE_MRP_START_TEST_INTERVAL] ||
+	    !attrs[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] ||
+	    !attrs[IFLA_BRIDGE_MRP_START_TEST_PERIOD]) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Missing attribute: RING_ID or INTERVAL or MAX_MISS or PERIOD");
+		return -EINVAL;
+	}
+
+	memset(&req, 0x0, sizeof(req));
+
+	req.ring_id = nla_get_u32(attrs[IFLA_BRIDGE_MRP_START_TEST_RING_ID]);
+	req.interval = nla_get_u32(attrs[IFLA_BRIDGE_MRP_START_TEST_INTERVAL]);
+	req.max_miss = nla_get_u32(attrs[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS]);
+	req.period = nla_get_u32(attrs[IFLA_BRIDGE_MRP_START_TEST_PERIOD]);
+
+	/* MONITOR is the only optional attribute */
+	monitor_attr = attrs[IFLA_BRIDGE_MRP_START_TEST_MONITOR];
+	if (monitor_attr)
+		req.monitor = nla_get_u32(monitor_attr);
+	else
+		req.monitor = false;
+
+	return br_mrp_start_test(br, &req);
+}
+
+/* Policy used by br_mrp_in_state_parse(): interconnect id and interconnect
+ * state are u32.
+ */
+static const struct nla_policy
+br_mrp_in_state_policy[IFLA_BRIDGE_MRP_IN_STATE_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_IN_STATE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_IN_STATE_IN_ID] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_IN_STATE_STATE] = { .type = NLA_U32 },
+};
+
+/* Parse the nested interconnect state attributes and apply the state.
+ * Both IN_ID and STATE are mandatory.
+ *
+ * Returns the nla_parse_nested() error, -EINVAL when an attribute is missing,
+ * otherwise the result of br_mrp_set_in_state().
+ */
+static int br_mrp_in_state_parse(struct net_bridge *br, struct nlattr *attr,
+				 struct netlink_ext_ack *extack)
+{
+	struct nlattr *attrs[IFLA_BRIDGE_MRP_IN_STATE_MAX + 1];
+	struct nlattr *id_attr, *state_attr;
+	struct br_mrp_in_state req;
+	int ret;
+
+	ret = nla_parse_nested(attrs, IFLA_BRIDGE_MRP_IN_STATE_MAX, attr,
+			       br_mrp_in_state_policy, extack);
+	if (ret)
+		return ret;
+
+	id_attr = attrs[IFLA_BRIDGE_MRP_IN_STATE_IN_ID];
+	state_attr = attrs[IFLA_BRIDGE_MRP_IN_STATE_STATE];
+	if (!id_attr || !state_attr) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Missing attribute: IN_ID or STATE");
+		return -EINVAL;
+	}
+
+	memset(&req, 0x0, sizeof(req));
+	req.in_id = nla_get_u32(id_attr);
+	req.in_state = nla_get_u32(state_attr);
+
+	return br_mrp_set_in_state(br, &req);
+}
+
+/* Policy used by br_mrp_in_role_parse(): the interconnect id is u16 here,
+ * the remaining attributes are u32.
+ */
+static const struct nla_policy
+br_mrp_in_role_policy[IFLA_BRIDGE_MRP_IN_ROLE_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_IN_ROLE_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_IN_ROLE_RING_ID] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_IN_ROLE_IN_ID] = { .type = NLA_U16 },
+ [IFLA_BRIDGE_MRP_IN_ROLE_ROLE] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX] = { .type = NLA_U32 },
+};
+
+/* Parse the nested interconnect role attributes and apply the role.
+ * RING_ID, IN_ID, I_IFINDEX and ROLE are all mandatory.
+ *
+ * Returns the nla_parse_nested() error, -EINVAL when an attribute is missing,
+ * otherwise the result of br_mrp_set_in_role().
+ */
+static int br_mrp_in_role_parse(struct net_bridge *br, struct nlattr *attr,
+				struct netlink_ext_ack *extack)
+{
+	struct nlattr *attrs[IFLA_BRIDGE_MRP_IN_ROLE_MAX + 1];
+	struct br_mrp_in_role req;
+	int ret;
+
+	ret = nla_parse_nested(attrs, IFLA_BRIDGE_MRP_IN_ROLE_MAX, attr,
+			       br_mrp_in_role_policy, extack);
+	if (ret)
+		return ret;
+
+	if (!attrs[IFLA_BRIDGE_MRP_IN_ROLE_RING_ID] ||
+	    !attrs[IFLA_BRIDGE_MRP_IN_ROLE_IN_ID] ||
+	    !attrs[IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX] ||
+	    !attrs[IFLA_BRIDGE_MRP_IN_ROLE_ROLE]) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Missing attribute: RING_ID or ROLE or IN_ID or I_IFINDEX");
+		return -EINVAL;
+	}
+
+	memset(&req, 0x0, sizeof(req));
+
+	req.ring_id = nla_get_u32(attrs[IFLA_BRIDGE_MRP_IN_ROLE_RING_ID]);
+	req.in_id = nla_get_u16(attrs[IFLA_BRIDGE_MRP_IN_ROLE_IN_ID]);
+	req.i_ifindex = nla_get_u32(attrs[IFLA_BRIDGE_MRP_IN_ROLE_I_IFINDEX]);
+	req.in_role = nla_get_u32(attrs[IFLA_BRIDGE_MRP_IN_ROLE_ROLE]);
+
+	return br_mrp_set_in_role(br, &req);
+}
+
+/* Policy used by br_mrp_start_in_test_parse(): all interconnect test
+ * attributes are u32.
+ */
+static const struct nla_policy
+br_mrp_start_in_test_policy[IFLA_BRIDGE_MRP_START_IN_TEST_MAX + 1] = {
+ [IFLA_BRIDGE_MRP_START_IN_TEST_UNSPEC] = { .type = NLA_REJECT },
+ [IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS] = { .type = NLA_U32 },
+ [IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD] = { .type = NLA_U32 },
+};
+
+/* Parse the nested interconnect test attributes and start the interconnect
+ * test. IN_ID, INTERVAL, MAX_MISS and PERIOD are all mandatory.
+ *
+ * Returns the nla_parse_nested() error, -EINVAL when an attribute is missing,
+ * otherwise the result of br_mrp_start_in_test().
+ */
+static int br_mrp_start_in_test_parse(struct net_bridge *br,
+				      struct nlattr *attr,
+				      struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[IFLA_BRIDGE_MRP_START_IN_TEST_MAX + 1];
+	struct br_mrp_start_in_test test;
+	int err;
+
+	err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_START_IN_TEST_MAX, attr,
+			       br_mrp_start_in_test_policy, extack);
+	if (err)
+		return err;
+
+	/* The identifier checked here is IN_ID (not RING_ID), so the message
+	 * reported to userspace names IN_ID.
+	 */
+	if (!tb[IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID] ||
+	    !tb[IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL] ||
+	    !tb[IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS] ||
+	    !tb[IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD]) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Missing attribute: IN_ID or INTERVAL or MAX_MISS or PERIOD");
+		return -EINVAL;
+	}
+
+	memset(&test, 0x0, sizeof(test));
+
+	test.in_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_IN_ID]);
+	test.interval = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_INTERVAL]);
+	test.max_miss = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_MAX_MISS]);
+	test.period = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_IN_TEST_PERIOD]);
+
+	return br_mrp_start_in_test(br, &test);
+}
+
+/* Parse the top-level MRP netlink attributes and dispatch each one that is
+ * present to its dedicated parser, in the fixed order below.
+ *
+ * br: bridge the request is for; replaced by p->br when p is given
+ * p: port the request is for, or NULL; passed as-is to the port state and
+ * port role parsers
+ * attr: nest holding the IFLA_BRIDGE_MRP_* attributes
+ * cmd: only forwarded to the instance parser
+ * extack: extended ack used for error reporting
+ *
+ * Returns 0 on success, -EINVAL when STP is enabled on the bridge, or the
+ * first error returned by nla_parse_nested() or by one of the parsers.
+ * Processing stops at the first error; attributes handled before it are not
+ * rolled back here.
+ */
+int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p,
+ struct nlattr *attr, int cmd, struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_BRIDGE_MRP_MAX + 1];
+ int err;
+
+ /* When this function is called for a port then the br pointer is
+ * invalid, therefore set the br to point correctly
+ */
+ if (p)
+ br = p->br;
+
+ if (br->stp_enabled != BR_NO_STP) {
+ NL_SET_ERR_MSG_MOD(extack, "MRP can't be enabled if STP is already enabled");
+ return -EINVAL;
+ }
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_MAX, attr,
+ br_mrp_policy, extack);
+ if (err)
+ return err;
+
+ if (tb[IFLA_BRIDGE_MRP_INSTANCE]) {
+ err = br_mrp_instance_parse(br, tb[IFLA_BRIDGE_MRP_INSTANCE],
+ cmd, extack);
+ if (err)
+ return err;
+ }
+
+ /* NOTE(review): p can be NULL here when the request targets the bridge;
+ * presumably br_mrp_set_port_state()/br_mrp_set_port_role() cope with
+ * that - confirm against their implementation.
+ */
+ if (tb[IFLA_BRIDGE_MRP_PORT_STATE]) {
+ err = br_mrp_port_state_parse(p, tb[IFLA_BRIDGE_MRP_PORT_STATE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_MRP_PORT_ROLE]) {
+ err = br_mrp_port_role_parse(p, tb[IFLA_BRIDGE_MRP_PORT_ROLE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_MRP_RING_STATE]) {
+ err = br_mrp_ring_state_parse(br,
+ tb[IFLA_BRIDGE_MRP_RING_STATE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_MRP_RING_ROLE]) {
+ err = br_mrp_ring_role_parse(br, tb[IFLA_BRIDGE_MRP_RING_ROLE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_MRP_START_TEST]) {
+ err = br_mrp_start_test_parse(br,
+ tb[IFLA_BRIDGE_MRP_START_TEST],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_MRP_IN_STATE]) {
+ err = br_mrp_in_state_parse(br, tb[IFLA_BRIDGE_MRP_IN_STATE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_MRP_IN_ROLE]) {
+ err = br_mrp_in_role_parse(br, tb[IFLA_BRIDGE_MRP_IN_ROLE],
+ extack);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRIDGE_MRP_START_IN_TEST]) {
+ err = br_mrp_start_in_test_parse(br,
+ tb[IFLA_BRIDGE_MRP_START_IN_TEST],
+ extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/* Dump every MRP instance of the bridge into skb: one IFLA_BRIDGE_MRP nest
+ * containing one IFLA_BRIDGE_MRP_INFO nest per instance found on
+ * br->mrp_list.
+ *
+ * Returns 0 on success. Returns -EMSGSIZE when a nest can't be started or an
+ * attribute can't be added; in that case the nests opened so far are
+ * cancelled.
+ * NOTE(review): the list walk and rcu_dereference() calls suggest the caller
+ * holds rcu_read_lock - confirm against the callers.
+ */
+int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br)
+{
+ struct nlattr *tb, *mrp_tb;
+ struct br_mrp *mrp;
+
+ mrp_tb = nla_nest_start_noflag(skb, IFLA_BRIDGE_MRP);
+ if (!mrp_tb)
+ return -EMSGSIZE;
+
+ hlist_for_each_entry_rcu(mrp, &br->mrp_list, list) {
+ struct net_bridge_port *p;
+
+ tb = nla_nest_start_noflag(skb, IFLA_BRIDGE_MRP_INFO);
+ if (!tb)
+ goto nla_info_failure;
+
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_RING_ID,
+ mrp->ring_id))
+ goto nla_put_failure;
+
+ /* The port ifindex attributes are emitted only for ports that
+ * are currently set on the instance.
+ */
+ p = rcu_dereference(mrp->p_port);
+ if (p && nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_P_IFINDEX,
+ p->dev->ifindex))
+ goto nla_put_failure;
+
+ p = rcu_dereference(mrp->s_port);
+ if (p && nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_S_IFINDEX,
+ p->dev->ifindex))
+ goto nla_put_failure;
+
+ p = rcu_dereference(mrp->i_port);
+ if (p && nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_I_IFINDEX,
+ p->dev->ifindex))
+ goto nla_put_failure;
+
+ /* Ring related settings */
+ if (nla_put_u16(skb, IFLA_BRIDGE_MRP_INFO_PRIO,
+ mrp->prio))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_RING_STATE,
+ mrp->ring_state))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_RING_ROLE,
+ mrp->ring_role))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_TEST_INTERVAL,
+ mrp->test_interval))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_TEST_MAX_MISS,
+ mrp->test_max_miss))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_TEST_MONITOR,
+ mrp->test_monitor))
+ goto nla_put_failure;
+
+ /* Interconnect related settings */
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_STATE,
+ mrp->in_state))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_ROLE,
+ mrp->in_role))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_TEST_INTERVAL,
+ mrp->in_test_interval))
+ goto nla_put_failure;
+ if (nla_put_u32(skb, IFLA_BRIDGE_MRP_INFO_IN_TEST_MAX_MISS,
+ mrp->in_test_max_miss))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, tb);
+ }
+ nla_nest_end(skb, mrp_tb);
+
+ return 0;
+
+ /* Cancel the per-instance nest first, then fall through to cancel the
+ * outer nest as well.
+ */
+nla_put_failure:
+ nla_nest_cancel(skb, tb);
+
+nla_info_failure:
+ nla_nest_cancel(skb, mrp_tb);
+
+ return -EMSGSIZE;
+}
+
+/* Record on the bridge port behind dev whether ring continuity was lost and
+ * send an RTM_NEWLINK notification for the port. A non-zero loc sets
+ * BR_MRP_LOST_CONT, zero clears it.
+ *
+ * Returns 0, or -EINVAL when dev has no bridge port.
+ */
+int br_mrp_ring_port_open(struct net_device *dev, u8 loc)
+{
+	struct net_bridge_port *port = br_port_get_rcu(dev);
+
+	if (!port)
+		return -EINVAL;
+
+	if (loc)
+		port->flags |= BR_MRP_LOST_CONT;
+	else
+		port->flags &= ~BR_MRP_LOST_CONT;
+
+	br_ifinfo_notify(RTM_NEWLINK, NULL, port);
+
+	return 0;
+}
+
+/* Record on the bridge port behind dev whether interconnect continuity was
+ * lost and send an RTM_NEWLINK notification for the port. A non-zero loc sets
+ * BR_MRP_LOST_IN_CONT, zero clears it.
+ *
+ * Returns 0, or -EINVAL when dev has no bridge port.
+ */
+int br_mrp_in_port_open(struct net_device *dev, u8 loc)
+{
+	struct net_bridge_port *port = br_port_get_rcu(dev);
+
+	if (!port)
+		return -EINVAL;
+
+	if (loc)
+		port->flags |= BR_MRP_LOST_IN_CONT;
+	else
+		port->flags &= ~BR_MRP_LOST_IN_CONT;
+
+	br_ifinfo_notify(RTM_NEWLINK, NULL, port);
+
+	return 0;
+}
diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c
new file mode 100644
index 000000000000..cb54b324fa8c
--- /dev/null
+++ b/net/bridge/br_mrp_switchdev.c
@@ -0,0 +1,241 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <net/switchdev.h>
+
+#include "br_private_mrp.h"
+
+/* Add or delete a switchdev object on the bridge device and translate the
+ * result into the kind of support available:
+ *   success          -> BR_MRP_HW (the SW doesn't need to do anything)
+ *   -EOPNOTSUPP      -> BR_MRP_SW (continue with SW backup)
+ *   any other error  -> BR_MRP_NONE
+ */
+static enum br_mrp_hw_support
+br_mrp_switchdev_port_obj(struct net_bridge *br,
+			  const struct switchdev_obj *obj, bool add)
+{
+	int ret = add ? switchdev_port_obj_add(br->dev, obj, NULL) :
+			switchdev_port_obj_del(br->dev, obj);
+
+	if (!ret)
+		return BR_MRP_HW;
+
+	return ret == -EOPNOTSUPP ? BR_MRP_SW : BR_MRP_NONE;
+}
+
+/* Hand the MRP instance (ring id, priority and both ring port devices) to
+ * switchdev as a SWITCHDEV_OBJ_ID_MRP object. Returns 0 without calling
+ * switchdev when CONFIG_NET_SWITCHDEV is not enabled, otherwise the result of
+ * switchdev_port_obj_add().
+ */
+int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp)
+{
+	struct switchdev_obj_mrp instance = {
+		.obj.orig_dev = br->dev,
+		.obj.id = SWITCHDEV_OBJ_ID_MRP,
+		.p_port = rtnl_dereference(mrp->p_port)->dev,
+		.s_port = rtnl_dereference(mrp->s_port)->dev,
+		.ring_id = mrp->ring_id,
+		.prio = mrp->prio,
+	};
+
+	if (IS_ENABLED(CONFIG_NET_SWITCHDEV))
+		return switchdev_port_obj_add(br->dev, &instance.obj, NULL);
+
+	return 0;
+}
+
+/* Ask switchdev to delete the SWITCHDEV_OBJ_ID_MRP object of this ring; only
+ * the ring id is filled in, the port pointers are NULL. Returns 0 without
+ * calling switchdev when CONFIG_NET_SWITCHDEV is not enabled, otherwise the
+ * result of switchdev_port_obj_del().
+ */
+int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp)
+{
+	struct switchdev_obj_mrp instance = {
+		.obj.orig_dev = br->dev,
+		.obj.id = SWITCHDEV_OBJ_ID_MRP,
+		.p_port = NULL,
+		.s_port = NULL,
+		.ring_id = mrp->ring_id,
+	};
+
+	if (IS_ENABLED(CONFIG_NET_SWITCHDEV))
+		return switchdev_port_obj_del(br->dev, &instance.obj);
+
+	return 0;
+}
+
+/* Push the ring role to the HW. The object is added for every role except
+ * BR_MRP_RING_ROLE_DISABLED, for which it is deleted.
+ *
+ * A first attempt is made with sw_backup cleared. Only if that attempt
+ * reports BR_MRP_SW is a second attempt made with sw_backup set.
+ *
+ * Returns BR_MRP_SW when CONFIG_NET_SWITCHDEV is not enabled or when the
+ * second attempt succeeds, the result of the first attempt when it is
+ * BR_MRP_HW or BR_MRP_NONE, and BR_MRP_NONE when the second attempt fails.
+ */
+enum br_mrp_hw_support
+br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp,
+			       enum br_mrp_ring_role_type role)
+{
+	struct switchdev_obj_ring_role_mrp ring_role = {
+		.obj.orig_dev = br->dev,
+		.obj.id = SWITCHDEV_OBJ_ID_RING_ROLE_MRP,
+		.ring_role = role,
+		.ring_id = mrp->ring_id,
+		.sw_backup = false,
+	};
+	bool add = role != BR_MRP_RING_ROLE_DISABLED;
+	enum br_mrp_hw_support support;
+	int ret;
+
+	if (!IS_ENABLED(CONFIG_NET_SWITCHDEV))
+		return BR_MRP_SW;
+
+	support = br_mrp_switchdev_port_obj(br, &ring_role.obj, add);
+	if (support != BR_MRP_SW)
+		return support;
+
+	/* If the driver can't configure to run completely the protocol in HW,
+	 * then try again to configure the HW so the SW can run the protocol.
+	 */
+	ring_role.sw_backup = true;
+	if (add)
+		ret = switchdev_port_obj_add(br->dev, &ring_role.obj, NULL);
+	else
+		ret = switchdev_port_obj_del(br->dev, &ring_role.obj);
+
+	return ret ? BR_MRP_NONE : BR_MRP_SW;
+}
+
+/* Ask the HW to generate ring test frames with the given parameters. The
+ * object is added when interval is non-zero and deleted when it is zero.
+ *
+ * Returns BR_MRP_SW when CONFIG_NET_SWITCHDEV is not enabled, otherwise the
+ * result of br_mrp_switchdev_port_obj().
+ */
+enum br_mrp_hw_support
+br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp,
+				u32 interval, u8 max_miss, u32 period,
+				bool monitor)
+{
+	struct switchdev_obj_ring_test_mrp ring_test = {
+		.obj.orig_dev = br->dev,
+		.obj.id = SWITCHDEV_OBJ_ID_RING_TEST_MRP,
+		.interval = interval,
+		.max_miss = max_miss,
+		.ring_id = mrp->ring_id,
+		.period = period,
+		.monitor = monitor,
+	};
+	bool add = interval != 0;
+
+	if (IS_ENABLED(CONFIG_NET_SWITCHDEV))
+		return br_mrp_switchdev_port_obj(br, &ring_test.obj, add);
+
+	return BR_MRP_SW;
+}
+
+/* Hand the ring state of the ring identified by mrp->ring_id to switchdev.
+ * Returns 0 without calling switchdev when CONFIG_NET_SWITCHDEV is not
+ * enabled, otherwise the result of switchdev_port_obj_add().
+ */
+int br_mrp_switchdev_set_ring_state(struct net_bridge *br,
+				    struct br_mrp *mrp,
+				    enum br_mrp_ring_state_type state)
+{
+	struct switchdev_obj_ring_state_mrp ring_state = {
+		.obj.orig_dev = br->dev,
+		.obj.id = SWITCHDEV_OBJ_ID_RING_STATE_MRP,
+		.ring_state = state,
+		.ring_id = mrp->ring_id,
+	};
+
+	if (IS_ENABLED(CONFIG_NET_SWITCHDEV))
+		return switchdev_port_obj_add(br->dev, &ring_state.obj, NULL);
+
+	return 0;
+}
+
+/* Push the interconnect role to the HW. The object is added for every role
+ * except BR_MRP_IN_ROLE_DISABLED, for which it is deleted.
+ *
+ * The in_id and ring_id parameters are not read in this function; the values
+ * put in the object come from mrp->in_id and mrp->ring_id.
+ *
+ * Returns BR_MRP_SW when CONFIG_NET_SWITCHDEV is not enabled or when the
+ * retry with sw_backup set succeeds, the result of the first attempt when it
+ * is BR_MRP_HW or BR_MRP_SW, and BR_MRP_NONE when the retry fails.
+ */
+enum br_mrp_hw_support
+br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp,
+ u16 in_id, u32 ring_id,
+ enum br_mrp_in_role_type role)
+{
+ struct switchdev_obj_in_role_mrp mrp_role = {
+ .obj.orig_dev = br->dev,
+ .obj.id = SWITCHDEV_OBJ_ID_IN_ROLE_MRP,
+ .in_role = role,
+ .in_id = mrp->in_id,
+ .ring_id = mrp->ring_id,
+ .i_port = rtnl_dereference(mrp->i_port)->dev,
+ .sw_backup = false,
+ };
+ enum br_mrp_hw_support support;
+ int err;
+
+ if (!IS_ENABLED(CONFIG_NET_SWITCHDEV))
+ return BR_MRP_SW;
+
+ support = br_mrp_switchdev_port_obj(br, &mrp_role.obj,
+ role != BR_MRP_IN_ROLE_DISABLED);
+ /* NOTE(review): br_mrp_switchdev_set_ring_role() retries with sw_backup
+ * when the first attempt yields BR_MRP_SW (-EOPNOTSUPP), whereas here
+ * the retry happens only on BR_MRP_NONE (any other error) and BR_MRP_SW
+ * is returned without a sw_backup attempt. Confirm whether this
+ * difference is intended.
+ */
+ if (support != BR_MRP_NONE)
+ return support;
+
+ /* If the driver can't configure to run completely the protocol in HW,
+ * then try again to configure the HW so the SW can run the protocol.
+ */
+ mrp_role.sw_backup = true;
+ if (role != BR_MRP_IN_ROLE_DISABLED)
+ err = switchdev_port_obj_add(br->dev, &mrp_role.obj, NULL);
+ else
+ err = switchdev_port_obj_del(br->dev, &mrp_role.obj);
+
+ if (!err)
+ return BR_MRP_SW;
+
+ return BR_MRP_NONE;
+}
+
+/* Hand the interconnect state of the interconnect ring identified by
+ * mrp->in_id to switchdev. Returns 0 without calling switchdev when
+ * CONFIG_NET_SWITCHDEV is not enabled, otherwise the result of
+ * switchdev_port_obj_add().
+ */
+int br_mrp_switchdev_set_in_state(struct net_bridge *br, struct br_mrp *mrp,
+				  enum br_mrp_in_state_type state)
+{
+	struct switchdev_obj_in_state_mrp in_state = {
+		.obj.orig_dev = br->dev,
+		.obj.id = SWITCHDEV_OBJ_ID_IN_STATE_MRP,
+		.in_state = state,
+		.in_id = mrp->in_id,
+	};
+
+	if (IS_ENABLED(CONFIG_NET_SWITCHDEV))
+		return switchdev_port_obj_add(br->dev, &in_state.obj, NULL);
+
+	return 0;
+}
+
+/* Ask the HW to generate interconnect test frames with the given parameters.
+ * The object is added when interval is non-zero and deleted when it is zero.
+ *
+ * Returns BR_MRP_SW when CONFIG_NET_SWITCHDEV is not enabled, otherwise the
+ * result of br_mrp_switchdev_port_obj().
+ */
+enum br_mrp_hw_support
+br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp,
+			      u32 interval, u8 max_miss, u32 period)
+{
+	struct switchdev_obj_in_test_mrp in_test = {
+		.obj.orig_dev = br->dev,
+		.obj.id = SWITCHDEV_OBJ_ID_IN_TEST_MRP,
+		.interval = interval,
+		.max_miss = max_miss,
+		.in_id = mrp->in_id,
+		.period = period,
+	};
+	bool add = interval != 0;
+
+	if (IS_ENABLED(CONFIG_NET_SWITCHDEV))
+		return br_mrp_switchdev_port_obj(br, &in_test.obj, add);
+
+	return BR_MRP_SW;
+}
+
+/* Hand the port state to switchdev using the
+ * SWITCHDEV_ATTR_ID_PORT_STP_STATE attribute. Returns 0 without calling
+ * switchdev when CONFIG_NET_SWITCHDEV is not enabled, otherwise the result
+ * of switchdev_port_attr_set().
+ */
+int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state)
+{
+	struct switchdev_attr port_attr = {
+		.orig_dev = p->dev,
+		.id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
+		.u.stp_state = state,
+	};
+
+	if (IS_ENABLED(CONFIG_NET_SWITCHDEV))
+		return switchdev_port_attr_set(p->dev, &port_attr, NULL);
+
+	return 0;
+}
+
+/* Hand the MRP port role to switchdev using the
+ * SWITCHDEV_ATTR_ID_MRP_PORT_ROLE attribute. Returns 0 without calling
+ * switchdev when CONFIG_NET_SWITCHDEV is not enabled, otherwise the result
+ * of switchdev_port_attr_set().
+ */
+int br_mrp_port_switchdev_set_role(struct net_bridge_port *p,
+				   enum br_mrp_port_role_type role)
+{
+	struct switchdev_attr port_attr = {
+		.orig_dev = p->dev,
+		.id = SWITCHDEV_ATTR_ID_MRP_PORT_ROLE,
+		.u.mrp_port_role = role,
+	};
+
+	if (IS_ENABLED(CONFIG_NET_SWITCHDEV))
+		return switchdev_port_attr_set(p->dev, &port_attr, NULL);
+
+	return 0;
+}
diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c
new file mode 100644
index 000000000000..43a300ae6bfa
--- /dev/null
+++ b/net/bridge/br_mst.c
@@ -0,0 +1,366 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Bridge Multiple Spanning Tree Support
+ *
+ * Authors:
+ * Tobias Waldekranz <tobias@waldekranz.com>
+ */
+
+#include <linux/kernel.h>
+#include <net/switchdev.h>
+
+#include "br_private.h"
+
+DEFINE_STATIC_KEY_FALSE(br_mst_used); /* raised/lowered by br_mst_set_enabled() and lowered by br_mst_uninit() in this file */
+
+bool br_mst_enabled(const struct net_device *dev)
+{ /* True only if @dev is a bridge master whose BROPT_MST_ENABLED option is set. */
+ if (!netif_is_bridge_master(dev))
+ return false; /* netdev_priv() below is only read as bridge data for bridge masters */
+
+ return br_opt_get(netdev_priv(dev), BROPT_MST_ENABLED);
+}
+EXPORT_SYMBOL_GPL(br_mst_enabled);
+
+void br_mst_uninit(struct net_bridge *br)
+{ /* Drop this bridge's reference on the br_mst_used static key if it still has MST enabled. */
+ if (br_opt_get(br, BROPT_MST_ENABLED))
+ static_branch_dec(&br_mst_used); /* pairs with the static_branch_inc() in br_mst_set_enabled() */
+}
+
+int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids)
+{ /* Set in @vids the bit of every bridge VLAN mapped to @msti; -EINVAL unless @dev is an MST-enabled bridge master, else 0. */
+ const struct net_bridge_vlan_group *vg;
+ const struct net_bridge_vlan *v;
+ const struct net_bridge *br;
+
+ ASSERT_RTNL(); /* callers are expected to hold RTNL */
+
+ if (!netif_is_bridge_master(dev))
+ return -EINVAL;
+
+ br = netdev_priv(dev);
+ if (!br_opt_get(br, BROPT_MST_ENABLED))
+ return -EINVAL;
+
+ vg = br_vlan_group(br);
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (v->msti == msti)
+ __set_bit(v->vid, vids); /* bits are only set, never cleared: @vids is not zeroed here */
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_mst_get_info);
+
+int br_mst_get_state(const struct net_device *dev, u16 msti, u8 *state)
+{ /* Report in *@state the state of the first VLAN on port @dev that belongs to @msti. */
+ const struct net_bridge_port *p = NULL;
+ const struct net_bridge_vlan_group *vg;
+ const struct net_bridge_vlan *v;
+
+ ASSERT_RTNL(); /* callers are expected to hold RTNL */
+
+ p = br_port_get_check_rtnl(dev);
+ if (!p || !br_opt_get(p->br, BROPT_MST_ENABLED))
+ return -EINVAL; /* @dev did not resolve to a port, or its bridge has MST disabled */
+
+ vg = nbp_vlan_group(p);
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (v->brvlan->msti == msti) { /* the MSTI is read from the bridge-level VLAN behind the port VLAN */
+ *state = v->state;
+ return 0;
+ }
+ }
+
+ return -ENOENT; /* no VLAN on this port maps to @msti; *@state is left untouched */
+}
+EXPORT_SYMBOL_GPL(br_mst_get_state);
+
+static void br_mst_vlan_set_state(struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *v,
+ u8 state)
+{ /* Move @v to @state; when @v is the PVID of @vg the group's PVID state is updated as well. */
+ if (br_vlan_get_state(v) == state)
+ return; /* already in the requested state */
+
+ if (v->vid == vg->pvid)
+ br_vlan_set_pvid_state(vg, state);
+
+ br_vlan_set_state(v, state);
+}
+
+int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state,
+ struct netlink_ext_ack *extack)
+{ /* Apply @state to every VLAN of port @p mapped to @msti, offering the change to switchdev first for non-zero MSTIs. */
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_ID_PORT_MST_STATE,
+ .orig_dev = p->dev,
+ .u.mst_state = {
+ .msti = msti,
+ .state = state,
+ },
+ };
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ int err = 0; /* stays 0 when the port has no VLAN group */
+
+ rcu_read_lock(); /* the VLAN group and list are accessed through the _rcu helpers below */
+ vg = nbp_vlan_group_rcu(p);
+ if (!vg)
+ goto out;
+
+ /* MSTI 0 (CST) state changes are notified via the regular
+ * SWITCHDEV_ATTR_ID_PORT_STP_STATE.
+ */
+ if (msti) {
+ err = switchdev_port_attr_set(p->dev, &attr, extack);
+ if (err && err != -EOPNOTSUPP)
+ goto out; /* -EOPNOTSUPP is tolerated; any other error aborts before software state changes */
+ }
+
+ err = 0; /* discard a tolerated -EOPNOTSUPP */
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+ if (v->brvlan->msti != msti)
+ continue;
+
+ br_mst_vlan_set_state(vg, v, state);
+ }
+
+out:
+ rcu_read_unlock();
+ return err;
+}
+
+static void br_mst_vlan_sync_state(struct net_bridge_vlan *pv, u16 msti)
+{ /* Give port VLAN @pv the state that matches its new MSTI @msti. */
+ struct net_bridge_vlan_group *vg = nbp_vlan_group(pv->port);
+ struct net_bridge_vlan *v;
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ /* If this port already has a defined state in this
+ * MSTI (through some other VLAN membership), inherit
+ * it.
+ */
+ if (v != pv && v->brvlan->msti == msti) {
+ br_mst_vlan_set_state(vg, pv, v->state); /* the first matching sibling VLAN decides */
+ return;
+ }
+ }
+
+ /* Otherwise, start out in a new MSTI with all ports disabled. */
+ return br_mst_vlan_set_state(vg, pv, BR_STATE_DISABLED); /* returns a void expression: the callee is void as well */
+}
+
+int br_mst_vlan_set_msti(struct net_bridge_vlan *mv, u16 msti)
+{ /* Remap bridge VLAN @mv to @msti and resync the same VID on every port; returns 0 or a switchdev error. */
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_ID_VLAN_MSTI,
+ .orig_dev = mv->br->dev,
+ .u.vlan_msti = {
+ .vid = mv->vid,
+ .msti = msti,
+ },
+ };
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *pv;
+ struct net_bridge_port *p;
+ int err;
+
+ if (mv->msti == msti)
+ return 0; /* no change requested */
+
+ err = switchdev_port_attr_set(mv->br->dev, &attr, NULL);
+ if (err && err != -EOPNOTSUPP)
+ return err; /* -EOPNOTSUPP is tolerated; other errors leave mv->msti unchanged */
+
+ mv->msti = msti;
+
+ list_for_each_entry(p, &mv->br->port_list, list) {
+ vg = nbp_vlan_group(p);
+
+ pv = br_vlan_find(vg, mv->vid);
+ if (pv)
+ br_mst_vlan_sync_state(pv, msti); /* only ports that carry this VID are touched */
+ }
+
+ return 0;
+}
+
+void br_mst_vlan_init_state(struct net_bridge_vlan *v)
+{ /* Set the initial MSTI and state of a VLAN entry @v. */
+ /* VLANs always start out in MSTI 0 (CST) */
+ v->msti = 0;
+
+ if (br_vlan_is_master(v))
+ v->state = BR_STATE_FORWARDING; /* master VLAN entries start forwarding */
+ else
+ v->state = v->port->state; /* other entries copy their port's current state */
+}
+
+int br_mst_set_enabled(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack)
+{ /* Switch MST mode of @br to @on; -EBUSY if any port has VLANs, otherwise 0 or a switchdev error. */
+ struct switchdev_attr attr = {
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_MST,
+ .orig_dev = br->dev,
+ .u.mst = on,
+ };
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p;
+ int err;
+
+ list_for_each_entry(p, &br->port_list, list) { /* runs before the no-change check below, so it applies to no-op requests too */
+ vg = nbp_vlan_group(p);
+
+ if (!vg->num_vlans)
+ continue;
+
+ NL_SET_ERR_MSG(extack,
+ "MST mode can't be changed while VLANs exist");
+ return -EBUSY; /* the first port with at least one VLAN blocks the change */
+ }
+
+ if (br_opt_get(br, BROPT_MST_ENABLED) == on)
+ return 0; /* already in the requested mode */
+
+ err = switchdev_port_attr_set(br->dev, &attr, extack);
+ if (err && err != -EOPNOTSUPP)
+ return err; /* -EOPNOTSUPP is tolerated */
+
+ if (on)
+ static_branch_inc(&br_mst_used);
+ else
+ static_branch_dec(&br_mst_used);
+
+ br_opt_toggle(br, BROPT_MST_ENABLED, on);
+ return 0;
+}
+
+size_t br_mst_info_size(const struct net_bridge_vlan_group *vg)
+{ /* Netlink space for the IFLA_BRIDGE_MST attribute of @vg: the outer nest plus one entry per distinct MSTI. */
+ DECLARE_BITMAP(seen, VLAN_N_VID) = { 0 }; /* MSTIs that were already counted */
+ const struct net_bridge_vlan *v;
+ size_t sz;
+
+ /* IFLA_BRIDGE_MST */
+ sz = nla_total_size(0);
+
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+ if (test_bit(v->brvlan->msti, seen))
+ continue; /* several VLANs may share one MSTI; count it once */
+
+ /* IFLA_BRIDGE_MST_ENTRY */
+ sz += nla_total_size(0) +
+ /* IFLA_BRIDGE_MST_ENTRY_MSTI */
+ nla_total_size(sizeof(u16)) +
+ /* IFLA_BRIDGE_MST_ENTRY_STATE */
+ nla_total_size(sizeof(u8));
+
+ __set_bit(v->brvlan->msti, seen);
+ }
+
+ return sz;
+}
+
+int br_mst_fill_info(struct sk_buff *skb,
+ const struct net_bridge_vlan_group *vg)
+{ /* Emit one IFLA_BRIDGE_MST_ENTRY nest (MSTI + state) into @skb per distinct MSTI in @vg; returns 0 or -EMSGSIZE. */
+ DECLARE_BITMAP(seen, VLAN_N_VID) = { 0 }; /* MSTIs that were already emitted */
+ const struct net_bridge_vlan *v;
+ struct nlattr *nest;
+ int err = 0;
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (test_bit(v->brvlan->msti, seen))
+ continue;
+
+ nest = nla_nest_start_noflag(skb, IFLA_BRIDGE_MST_ENTRY);
+ if (!nest ||
+ nla_put_u16(skb, IFLA_BRIDGE_MST_ENTRY_MSTI, v->brvlan->msti) ||
+ nla_put_u8(skb, IFLA_BRIDGE_MST_ENTRY_STATE, v->state)) { /* state comes from the first VLAN seen for this MSTI */
+ err = -EMSGSIZE;
+ break; /* NOTE(review): a nest opened above is not cancelled here - presumably the caller discards the message; confirm */
+ }
+ nla_nest_end(skb, nest);
+
+ __set_bit(v->brvlan->msti, seen);
+ }
+
+ return err;
+}
+
+static const struct nla_policy br_mst_nl_policy[IFLA_BRIDGE_MST_ENTRY_MAX + 1] = { /* used by br_mst_process_one() to validate one nested entry */
+ [IFLA_BRIDGE_MST_ENTRY_MSTI] = NLA_POLICY_RANGE(NLA_U16,
+ 1, /* 0 reserved for CST */
+ VLAN_N_VID - 1),
+ [IFLA_BRIDGE_MST_ENTRY_STATE] = NLA_POLICY_RANGE(NLA_U8,
+ BR_STATE_DISABLED, /* lowest accepted state value */
+ BR_STATE_BLOCKING), /* highest accepted state value */
+};
+
+static int br_mst_process_one(struct net_bridge_port *p,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{ /* Parse one nested MST entry @attr and apply its (MSTI, state) pair to port @p. */
+ struct nlattr *tb[IFLA_BRIDGE_MST_ENTRY_MAX + 1];
+ u16 msti;
+ u8 state;
+ int err;
+
+ err = nla_parse_nested(tb, IFLA_BRIDGE_MST_ENTRY_MAX, attr,
+ br_mst_nl_policy, extack); /* value ranges are enforced by the policy table */
+ if (err)
+ return err;
+
+ if (!tb[IFLA_BRIDGE_MST_ENTRY_MSTI]) { /* both attributes are mandatory */
+ NL_SET_ERR_MSG_MOD(extack, "MSTI not specified");
+ return -EINVAL;
+ }
+
+ if (!tb[IFLA_BRIDGE_MST_ENTRY_STATE]) {
+ NL_SET_ERR_MSG_MOD(extack, "State not specified");
+ return -EINVAL;
+ }
+
+ msti = nla_get_u16(tb[IFLA_BRIDGE_MST_ENTRY_MSTI]);
+ state = nla_get_u8(tb[IFLA_BRIDGE_MST_ENTRY_STATE]);
+
+ return br_mst_set_state(p, msti, state, extack);
+}
+
+int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr,
+ struct netlink_ext_ack *extack)
+{ /* Apply every IFLA_BRIDGE_MST_ENTRY nested in @mst_attr to port @p, stopping at the first failing entry. */
+ struct nlattr *attr;
+ int err, msts = 0; /* err is always assigned before the final return: by an entry or by the !msts branch */
+ int rem;
+
+ if (!br_opt_get(p->br, BROPT_MST_ENABLED)) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't modify MST state when MST is disabled");
+ return -EBUSY;
+ }
+
+ nla_for_each_nested(attr, mst_attr, rem) {
+ switch (nla_type(attr)) {
+ case IFLA_BRIDGE_MST_ENTRY:
+ err = br_mst_process_one(p, attr, extack);
+ break;
+ default:
+ continue; /* attributes of any other type are skipped and not counted */
+ }
+
+ msts++;
+ if (err)
+ break; /* entries applied before the failing one are not rolled back here */
+ }
+
+ if (!msts) {
+ NL_SET_ERR_MSG_MOD(extack, "Found no MST entries to process");
+ err = -EINVAL;
+ }
+
+ return err;
+}
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 20ed7adcf1cc..d55a4ab87837 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1,19 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Bridge multicast support.
*
* Copyright (c) 2010 Herbert Xu <herbert@gondor.apana.org.au>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
*/
#include <linux/err.h>
#include <linux/export.h>
#include <linux/if_ether.h>
#include <linux/igmp.h>
+#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/kernel.h>
#include <linux/log2.h>
@@ -29,254 +25,819 @@
#include <net/ip.h>
#include <net/switchdev.h>
#if IS_ENABLED(CONFIG_IPV6)
+#include <linux/icmpv6.h>
#include <net/ipv6.h>
#include <net/mld.h>
#include <net/ip6_checksum.h>
#include <net/addrconf.h>
#endif
+#include <trace/events/bridge.h>
#include "br_private.h"
-
-static void br_multicast_start_querier(struct net_bridge *br,
+#include "br_private_mcast_eht.h"
+
+static const struct rhashtable_params br_mdb_rht_params = { /* parameters used with br->mdb_hash_tbl in this file */
+ .head_offset = offsetof(struct net_bridge_mdb_entry, rhnode),
+ .key_offset = offsetof(struct net_bridge_mdb_entry, addr),
+ .key_len = sizeof(struct br_ip), /* the whole struct br_ip is the key */
+ .automatic_shrinking = true,
+};
+
+static const struct rhashtable_params br_sg_port_rht_params = { /* parameters used with br->sg_port_tbl in this file */
+ .head_offset = offsetof(struct net_bridge_port_group, rhnode),
+ .key_offset = offsetof(struct net_bridge_port_group, key),
+ .key_len = sizeof(struct net_bridge_port_group_sg_key), /* the whole key struct is hashed */
+ .automatic_shrinking = true,
+};
+
+static void br_multicast_start_querier(struct net_bridge_mcast *brmctx,
struct bridge_mcast_own_query *query);
-static void br_multicast_add_router(struct net_bridge *br,
- struct net_bridge_port *port);
-static void br_ip4_multicast_leave_group(struct net_bridge *br,
- struct net_bridge_port *port,
+static void br_ip4_multicast_add_router(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx);
+static void br_ip4_multicast_leave_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
__be32 group,
__u16 vid,
const unsigned char *src);
+static void br_multicast_port_group_rexmit(struct timer_list *t);
-static void __del_port_router(struct net_bridge_port *p);
+static void
+br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted);
+static void br_ip6_multicast_add_router(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx);
#if IS_ENABLED(CONFIG_IPV6)
-static void br_ip6_multicast_leave_group(struct net_bridge *br,
- struct net_bridge_port *port,
+static void br_ip6_multicast_leave_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
const struct in6_addr *group,
__u16 vid, const unsigned char *src);
#endif
-unsigned int br_mdb_rehash_seq;
-
-static inline int br_ip_equal(const struct br_ip *a, const struct br_ip *b)
+static struct net_bridge_port_group *
+__br_multicast_add_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct br_ip *group,
+ const unsigned char *src,
+ u8 filter_mode,
+ bool igmpv2_mldv1,
+ bool blocked);
+static void br_multicast_find_del_pg(struct net_bridge *br,
+ struct net_bridge_port_group *pg);
+static void __br_multicast_stop(struct net_bridge_mcast *brmctx);
+
+static int br_mc_disabled_update(struct net_device *dev, bool value,
+ struct netlink_ext_ack *extack);
+
+static struct net_bridge_port_group *
+br_sg_port_find(struct net_bridge *br,
+ struct net_bridge_port_group_sg_key *sg_p) /* look up the port group stored under key @sg_p in br->sg_port_tbl */
{
- if (a->proto != b->proto)
- return 0;
- if (a->vid != b->vid)
- return 0;
- switch (a->proto) {
- case htons(ETH_P_IP):
- return a->u.ip4 == b->u.ip4;
-#if IS_ENABLED(CONFIG_IPV6)
- case htons(ETH_P_IPV6):
- return ipv6_addr_equal(&a->u.ip6, &b->u.ip6);
-#endif
- }
- return 0;
-}
+ lockdep_assert_held_once(&br->multicast_lock); /* callers must hold the bridge multicast lock */
-static inline int __br_ip4_hash(struct net_bridge_mdb_htable *mdb, __be32 ip,
- __u16 vid)
-{
- return jhash_2words((__force u32)ip, vid, mdb->secret) & (mdb->max - 1);
+ return rhashtable_lookup_fast(&br->sg_port_tbl, sg_p,
+ br_sg_port_rht_params); /* result of the lookup is returned as is */
}
-#if IS_ENABLED(CONFIG_IPV6)
-static inline int __br_ip6_hash(struct net_bridge_mdb_htable *mdb,
- const struct in6_addr *ip,
- __u16 vid)
+static struct net_bridge_mdb_entry *br_mdb_ip_get_rcu(struct net_bridge *br,
+ struct br_ip *dst)
{
- return jhash_2words(ipv6_addr_hash(ip), vid,
- mdb->secret) & (mdb->max - 1);
+ return rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params); /* no locking is taken here; NOTE(review): presumably callers are in an RCU read-side section - confirm */
}
-#endif
-static inline int br_ip_hash(struct net_bridge_mdb_htable *mdb,
- struct br_ip *ip)
-{
- switch (ip->proto) {
- case htons(ETH_P_IP):
- return __br_ip4_hash(mdb, ip->u.ip4, ip->vid);
-#if IS_ENABLED(CONFIG_IPV6)
- case htons(ETH_P_IPV6):
- return __br_ip6_hash(mdb, &ip->u.ip6, ip->vid);
-#endif
- }
- return 0;
-}
-
-static struct net_bridge_mdb_entry *__br_mdb_ip_get(
- struct net_bridge_mdb_htable *mdb, struct br_ip *dst, int hash)
+struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge *br,
+ struct br_ip *dst)
{
- struct net_bridge_mdb_entry *mp;
-
- hlist_for_each_entry_rcu(mp, &mdb->mhash[hash], hlist[mdb->ver]) {
- if (br_ip_equal(&mp->addr, dst))
- return mp;
- }
+ struct net_bridge_mdb_entry *ent;
- return NULL;
-}
+ lockdep_assert_held_once(&br->multicast_lock); /* callers must hold the bridge multicast lock */
-struct net_bridge_mdb_entry *br_mdb_ip_get(struct net_bridge_mdb_htable *mdb,
- struct br_ip *dst)
-{
- if (!mdb)
- return NULL;
+ rcu_read_lock();
+ ent = rhashtable_lookup(&br->mdb_hash_tbl, dst, br_mdb_rht_params);
+ rcu_read_unlock();
- return __br_mdb_ip_get(mdb, dst, br_ip_hash(mdb, dst));
+ return ent; /* used after rcu_read_unlock(); NOTE(review): presumably multicast_lock keeps the entry valid - confirm */
}
-static struct net_bridge_mdb_entry *br_mdb_ip4_get(
- struct net_bridge_mdb_htable *mdb, __be32 dst, __u16 vid)
+static struct net_bridge_mdb_entry *br_mdb_ip4_get(struct net_bridge *br,
+ __be32 dst, __u16 vid)
{
struct br_ip br_dst;
- br_dst.u.ip4 = dst;
+ memset(&br_dst, 0, sizeof(br_dst)); /* the whole struct is the hash key (key_len = sizeof(struct br_ip)), so unused bytes must be zero */
+ br_dst.dst.ip4 = dst;
br_dst.proto = htons(ETH_P_IP);
br_dst.vid = vid;
- return br_mdb_ip_get(mdb, &br_dst);
+ return br_mdb_ip_get(br, &br_dst); /* source stays zero, so this is the lookup key without a source address */
}
#if IS_ENABLED(CONFIG_IPV6)
-static struct net_bridge_mdb_entry *br_mdb_ip6_get(
- struct net_bridge_mdb_htable *mdb, const struct in6_addr *dst,
- __u16 vid)
+static struct net_bridge_mdb_entry *br_mdb_ip6_get(struct net_bridge *br,
+ const struct in6_addr *dst,
+ __u16 vid)
{
struct br_ip br_dst;
- br_dst.u.ip6 = *dst;
+ memset(&br_dst, 0, sizeof(br_dst)); /* the whole struct is the hash key (key_len = sizeof(struct br_ip)), so unused bytes must be zero */
+ br_dst.dst.ip6 = *dst;
br_dst.proto = htons(ETH_P_IPV6);
br_dst.vid = vid;
- return br_mdb_ip_get(mdb, &br_dst);
+ return br_mdb_ip_get(br, &br_dst); /* source stays zero, so this is the lookup key without a source address */
}
#endif
-struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
- struct sk_buff *skb, u16 vid)
+struct net_bridge_mdb_entry *
+br_mdb_entry_skb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb,
+ u16 vid) /* find the MDB entry that matches the destination (and, if enabled, source) of @skb in VLAN @vid */
{
- struct net_bridge_mdb_htable *mdb = rcu_dereference(br->mdb);
+ struct net_bridge *br = brmctx->br;
struct br_ip ip;
- if (br->multicast_disabled)
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) ||
+ br_multicast_ctx_vlan_global_disabled(brmctx))
return NULL;
if (BR_INPUT_SKB_CB(skb)->igmp)
return NULL;
+ memset(&ip, 0, sizeof(ip)); /* the whole struct is the hash key, so unused bytes must be zero */
ip.proto = skb->protocol;
ip.vid = vid;
switch (skb->protocol) {
case htons(ETH_P_IP):
- ip.u.ip4 = ip_hdr(skb)->daddr;
+ ip.dst.ip4 = ip_hdr(skb)->daddr;
+ if (brmctx->multicast_igmp_version == 3) { /* with IGMPv3 try a source-specific entry first */
+ struct net_bridge_mdb_entry *mdb;
+
+ ip.src.ip4 = ip_hdr(skb)->saddr;
+ mdb = br_mdb_ip_get_rcu(br, &ip);
+ if (mdb)
+ return mdb;
+ ip.src.ip4 = 0; /* no match with the source: fall back to the key without it */
+ }
break;
#if IS_ENABLED(CONFIG_IPV6)
case htons(ETH_P_IPV6):
- ip.u.ip6 = ipv6_hdr(skb)->daddr;
+ ip.dst.ip6 = ipv6_hdr(skb)->daddr;
+ if (brmctx->multicast_mld_version == 2) { /* with MLDv2 try a source-specific entry first */
+ struct net_bridge_mdb_entry *mdb;
+
+ ip.src.ip6 = ipv6_hdr(skb)->saddr;
+ mdb = br_mdb_ip_get_rcu(br, &ip);
+ if (mdb)
+ return mdb;
+ memset(&ip.src.ip6, 0, sizeof(ip.src.ip6)); /* no match with the source: fall back to the key without it */
+ }
break;
#endif
default:
+ ip.proto = 0; /* any other protocol is keyed by destination MAC with proto 0 */
+ ether_addr_copy(ip.dst.mac_addr, eth_hdr(skb)->h_dest);
+ }
+
+ return br_mdb_ip_get_rcu(br, &ip);
+}
+
+/* IMPORTANT: this function must be used only when the contexts cannot be
+ * passed down (e.g. timer) and must be used for read-only purposes because
+ * the vlan snooping option can change, so it can return any context
+ * (non-vlan or vlan). Its initial intended purpose is to read timer values
+ * from the *current* context based on the option. At worst that could lead
+ * to inconsistent timers when the contexts are changed, i.e. src timer
+ * which needs to re-arm with a specific delay taken from the old context
+ */
+static struct net_bridge_mcast_port *
+br_multicast_pg_to_port_ctx(const struct net_bridge_port_group *pg)
+{
+ struct net_bridge_mcast_port *pmctx = &pg->key.port->multicast_ctx; /* default: the port's own context */
+ struct net_bridge_vlan *vlan;
+
+ lockdep_assert_held_once(&pg->key.port->br->multicast_lock);
+
+ /* if vlan snooping is disabled use the port's multicast context */
+ if (!pg->key.addr.vid ||
+ !br_opt_get(pg->key.port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED))
+ goto out;
+
+ /* locking is tricky here, due to different rules for multicast and
+ * vlans we need to take rcu to find the vlan and make sure it has
+ * the BR_VLFLAG_MCAST_ENABLED flag set, it can only change under
+ * multicast_lock which must be already held here, so the vlan's pmctx
+ * can safely be used on return
+ */
+ rcu_read_lock();
+ vlan = br_vlan_find(nbp_vlan_group_rcu(pg->key.port), pg->key.addr.vid);
+ if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx))
+ pmctx = &vlan->port_mcast_ctx;
+ else
+ pmctx = NULL; /* vlan missing or its context disabled: callers must handle a NULL return */
+ rcu_read_unlock();
+out:
+ return pmctx;
+}
+
+static struct net_bridge_mcast_port *
+br_multicast_port_vid_to_port_ctx(struct net_bridge_port *port, u16 vid)
+{ /* Return the per-vlan multicast context of @port for @vid, or NULL if vlan snooping is off or the vlan is missing/disabled. */
+ struct net_bridge_mcast_port *pmctx = NULL;
+ struct net_bridge_vlan *vlan;
+
+ lockdep_assert_held_once(&port->br->multicast_lock);
+
+ if (!br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED))
return NULL;
+
+ /* Take RCU to access the vlan. */
+ rcu_read_lock();
+
+ vlan = br_vlan_find(nbp_vlan_group_rcu(port), vid);
+ if (vlan && !br_multicast_port_ctx_vlan_disabled(&vlan->port_mcast_ctx))
+ pmctx = &vlan->port_mcast_ctx;
+
+ rcu_read_unlock();
+
+ return pmctx; /* unlike br_multicast_pg_to_port_ctx() this never falls back to the port's own context */
+}
+
+/* when snooping we need to check if the contexts should be used
+ * in the following order:
+ * - if pmctx is non-NULL (port), check if it should be used
+ * - if pmctx is NULL (bridge), check if brmctx should be used
+ */
+static bool
+br_multicast_ctx_should_use(const struct net_bridge_mcast *brmctx,
+ const struct net_bridge_mcast_port *pmctx)
+{
+ if (!netif_running(brmctx->br->dev))
+ return false; /* nothing is used while the bridge device is not running */
+
+ if (pmctx)
+ return !br_multicast_port_ctx_state_disabled(pmctx);
+ else
+ return !br_multicast_ctx_vlan_disabled(brmctx);
+}
+
+static bool br_port_group_equal(struct net_bridge_port_group *p,
+ struct net_bridge_port *port,
+ const unsigned char *src)
+{ /* Does port group @p belong to @port (and, for multicast-to-unicast ports, to source MAC @src)? */
+ if (p->key.port != port)
+ return false;
+
+ if (!(port->flags & BR_MULTICAST_TO_UNICAST))
+ return true; /* the MAC only matters when BR_MULTICAST_TO_UNICAST is set on the port */
+
+ return ether_addr_equal(src, p->eth_addr);
+}
+
+static void __fwd_add_star_excl(struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg,
+ struct br_ip *sg_ip)
+{ /* Add the port of @pg to the S,G entry @sg_ip as an automatically installed STAR_EXCL member, unless it is already there. */
+ struct net_bridge_port_group_sg_key sg_key;
+ struct net_bridge_port_group *src_pg;
+ struct net_bridge_mcast *brmctx;
+
+ memset(&sg_key, 0, sizeof(sg_key)); /* the key is hashed as a whole, so it is zeroed first */
+ brmctx = br_multicast_port_ctx_get_global(pmctx);
+ sg_key.port = pg->key.port;
+ sg_key.addr = *sg_ip;
+ if (br_sg_port_find(brmctx->br, &sg_key))
+ return; /* this port already has an entry for the S,G */
+
+ src_pg = __br_multicast_add_group(brmctx, pmctx,
+ sg_ip, pg->eth_addr,
+ MCAST_INCLUDE, false, false);
+ if (IS_ERR_OR_NULL(src_pg) ||
+ src_pg->rt_protocol != RTPROT_KERNEL)
+ return; /* failures are ignored; only RTPROT_KERNEL entries get the flag */
+
+ src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL;
+}
+
+static void __fwd_del_star_excl(struct net_bridge_port_group *pg,
+ struct br_ip *sg_ip)
+{ /* Remove the port of @pg from the S,G entry @sg_ip, but only if it is a kernel-owned STAR_EXCL member. */
+ struct net_bridge_port_group_sg_key sg_key;
+ struct net_bridge *br = pg->key.port->br;
+ struct net_bridge_port_group *src_pg;
+
+ memset(&sg_key, 0, sizeof(sg_key)); /* the key is hashed as a whole, so it is zeroed first */
+ sg_key.port = pg->key.port;
+ sg_key.addr = *sg_ip;
+ src_pg = br_sg_port_find(br, &sg_key);
+ if (!src_pg || !(src_pg->flags & MDB_PG_FLAGS_STAR_EXCL) ||
+ src_pg->rt_protocol != RTPROT_KERNEL)
+ return; /* entries without the flag or with another rt_protocol are left alone */
+
+ br_multicast_find_del_pg(br, src_pg);
+}
+
+/* When a port group transitions to (or is added as) EXCLUDE we need to add it
+ * to all other ports' S,G entries which are not blocked by the current group
+ * for proper replication, the assumption is that any S,G blocked entries
+ * are already added so the S,G,port lookup should skip them.
+ * When a port group transitions from EXCLUDE -> INCLUDE mode or is being
+ * deleted we need to remove it from all ports' S,G entries where it was
+ * automatically installed before (i.e. where it's MDB_PG_FLAGS_STAR_EXCL).
+ */
+void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg,
+ u8 filter_mode)
+{
+ struct net_bridge *br = pg->key.port->br;
+ struct net_bridge_port_group *pg_lst;
+ struct net_bridge_mcast_port *pmctx;
+ struct net_bridge_mdb_entry *mp;
+ struct br_ip sg_ip;
+
+ if (WARN_ON(!br_multicast_is_star_g(&pg->key.addr)))
+ return; /* only meaningful for *,G port groups */
+
+ mp = br_mdb_ip_get(br, &pg->key.addr);
+ if (!mp)
+ return;
+ pmctx = br_multicast_pg_to_port_ctx(pg);
+ if (!pmctx)
+ return;
+
+ memset(&sg_ip, 0, sizeof(sg_ip));
+ sg_ip = pg->key.addr; /* group part of the key; the source is filled in per entry below */
+
+ for (pg_lst = mlock_dereference(mp->ports, br);
+ pg_lst;
+ pg_lst = mlock_dereference(pg_lst->next, br)) {
+ struct net_bridge_group_src *src_ent;
+
+ if (pg_lst == pg)
+ continue; /* only the other ports' sources are considered */
+ hlist_for_each_entry(src_ent, &pg_lst->src_list, node) {
+ if (!(src_ent->flags & BR_SGRP_F_INSTALLED))
+ continue;
+ sg_ip.src = src_ent->addr.src;
+ switch (filter_mode) { /* any other filter_mode value does nothing */
+ case MCAST_INCLUDE:
+ __fwd_del_star_excl(pg, &sg_ip);
+ break;
+ case MCAST_EXCLUDE:
+ __fwd_add_star_excl(pmctx, pg, &sg_ip);
+ break;
+ }
+ }
+ }
+}
+
+/* called when adding a new S,G with host_joined == false by default */
+static void br_multicast_sg_host_state(struct net_bridge_mdb_entry *star_mp,
+ struct net_bridge_port_group *sg)
+{
+ struct net_bridge_mdb_entry *sg_mp;
+
+ if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr)))
+ return;
+ if (!star_mp->host_joined)
+ return; /* nothing to propagate: the S,G keeps its current host_joined value */
+
+ sg_mp = br_mdb_ip_get(star_mp->br, &sg->key.addr);
+ if (!sg_mp)
+ return;
+ sg_mp->host_joined = true; /* the S,G entry takes over the host join of its *,G */
+}
+
+/* set the host_joined state of all of *,G's S,G entries */
+static void br_multicast_star_g_host_state(struct net_bridge_mdb_entry *star_mp)
+{
+ struct net_bridge *br = star_mp->br;
+ struct net_bridge_mdb_entry *sg_mp;
+ struct net_bridge_port_group *pg;
+ struct br_ip sg_ip;
+
+ if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr)))
+ return;
+
+ memset(&sg_ip, 0, sizeof(sg_ip));
+ sg_ip = star_mp->addr; /* group part of the key; the source is filled in per entry below */
+ for (pg = mlock_dereference(star_mp->ports, br);
+ pg;
+ pg = mlock_dereference(pg->next, br)) {
+ struct net_bridge_group_src *src_ent;
+
+ hlist_for_each_entry(src_ent, &pg->src_list, node) {
+ if (!(src_ent->flags & BR_SGRP_F_INSTALLED))
+ continue; /* only sources flagged as installed are looked up */
+ sg_ip.src = src_ent->addr.src;
+ sg_mp = br_mdb_ip_get(br, &sg_ip);
+ if (!sg_mp)
+ continue;
+ sg_mp->host_joined = star_mp->host_joined; /* copies the value in either direction, set or cleared */
+ }
}
+}
+
+static void br_multicast_sg_del_exclude_ports(struct net_bridge_mdb_entry *sgmp)
+{
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port_group *p;
+
+ /* *,G exclude ports are only added to S,G entries */
+ if (WARN_ON(br_multicast_is_star_g(&sgmp->addr)))
+ return;
- return br_mdb_ip_get(mdb, &ip);
+ /* we need the STAR_EXCLUDE ports if there are non-STAR_EXCLUDE ports;
+ * we should ignore perm entries since they're managed by user-space
+ */
+ for (pp = &sgmp->ports;
+ (p = mlock_dereference(*pp, sgmp->br)) != NULL;
+ pp = &p->next)
+ if (!(p->flags & (MDB_PG_FLAGS_STAR_EXCL |
+ MDB_PG_FLAGS_PERMANENT)))
+ return; /* a port with neither flag exists: keep everything */
+
+ /* currently the host can only have joined the *,G which means
+ * we treat it as EXCLUDE {}, so for an S,G it's considered a
+ * STAR_EXCLUDE entry and we can safely leave it
+ */
+ sgmp->host_joined = false;
+
+ for (pp = &sgmp->ports;
+ (p = mlock_dereference(*pp, sgmp->br)) != NULL;) {
+ if (!(p->flags & MDB_PG_FLAGS_PERMANENT))
+ br_multicast_del_pg(sgmp, p, pp); /* pp is not advanced; NOTE(review): presumably the delete unlinks p so *pp already names the next entry - confirm */
+ else
+ pp = &p->next;
+ }
}
-static void br_mdb_free(struct rcu_head *head)
+void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp,
+ struct net_bridge_port_group *sg) /* add every EXCLUDE-mode port of *,G entry @star_mp to the S,G of @sg */
{
- struct net_bridge_mdb_htable *mdb =
- container_of(head, struct net_bridge_mdb_htable, rcu);
- struct net_bridge_mdb_htable *old = mdb->old;
+ struct net_bridge_port_group_sg_key sg_key;
+ struct net_bridge *br = star_mp->br;
+ struct net_bridge_mcast_port *pmctx;
+ struct net_bridge_port_group *pg;
+ struct net_bridge_mcast *brmctx;
+
+ if (WARN_ON(br_multicast_is_star_g(&sg->key.addr)))
+ return;
+ if (WARN_ON(!br_multicast_is_star_g(&star_mp->addr)))
+ return;
+
+ br_multicast_sg_host_state(star_mp, sg);
+ memset(&sg_key, 0, sizeof(sg_key)); /* the key is hashed as a whole, so it is zeroed first */
+ sg_key.addr = sg->key.addr;
+ /* we need to add all exclude ports to the S,G */
+ for (pg = mlock_dereference(star_mp->ports, br);
+ pg;
+ pg = mlock_dereference(pg->next, br)) {
+ struct net_bridge_port_group *src_pg;
+
+ if (pg == sg || pg->filter_mode == MCAST_INCLUDE)
+ continue;
+
+ sg_key.port = pg->key.port;
+ if (br_sg_port_find(br, &sg_key))
+ continue; /* this port already has an entry for the S,G */
- mdb->old = NULL;
- kfree(old->mhash);
- kfree(old);
+ pmctx = br_multicast_pg_to_port_ctx(pg);
+ if (!pmctx)
+ continue;
+ brmctx = br_multicast_port_ctx_get_global(pmctx);
+
+ src_pg = __br_multicast_add_group(brmctx, pmctx,
+ &sg->key.addr,
+ sg->eth_addr,
+ MCAST_INCLUDE, false, false);
+ if (IS_ERR_OR_NULL(src_pg) ||
+ src_pg->rt_protocol != RTPROT_KERNEL)
+ continue; /* failures are ignored; only RTPROT_KERNEL entries get the flag */
+ src_pg->flags |= MDB_PG_FLAGS_STAR_EXCL;
+ }
}
-static int br_mdb_copy(struct net_bridge_mdb_htable *new,
- struct net_bridge_mdb_htable *old,
- int elasticity)
+static void br_multicast_fwd_src_add(struct net_bridge_group_src *src) /* install the S,G port group for @src if it is not installed yet */
{
+ struct net_bridge_mdb_entry *star_mp;
+ struct net_bridge_mcast_port *pmctx;
+ struct net_bridge_port_group *sg;
+ struct net_bridge_mcast *brmctx;
+ struct br_ip sg_ip;
+
+ if (src->flags & BR_SGRP_F_INSTALLED)
+ return;
+
+ memset(&sg_ip, 0, sizeof(sg_ip));
+ pmctx = br_multicast_pg_to_port_ctx(src->pg);
+ if (!pmctx)
+ return;
+ brmctx = br_multicast_port_ctx_get_global(pmctx);
+ sg_ip = src->pg->key.addr;
+ sg_ip.src = src->addr.src; /* group from the owning port group, source from @src */
+
+ sg = __br_multicast_add_group(brmctx, pmctx, &sg_ip,
+ src->pg->eth_addr, MCAST_INCLUDE, false,
+ !timer_pending(&src->timer)); /* last argument (blocked) is true when the source timer is not pending */
+ if (IS_ERR_OR_NULL(sg))
+ return;
+ src->flags |= BR_SGRP_F_INSTALLED;
+ sg->flags &= ~MDB_PG_FLAGS_STAR_EXCL;
+
+ /* if it was added by user-space as perm we can skip next steps */
+ if (sg->rt_protocol != RTPROT_KERNEL &&
+ (sg->flags & MDB_PG_FLAGS_PERMANENT))
+ return;
+
+ /* the kernel is now responsible for removing this S,G */
+ timer_delete(&sg->timer);
+ star_mp = br_mdb_ip_get(src->br, &src->pg->key.addr);
+ if (!star_mp)
+ return;
+
+ br_multicast_sg_add_exclude_ports(star_mp, sg);
+}
+
+static void br_multicast_fwd_src_remove(struct net_bridge_group_src *src,
+ bool fastleave)
+{ /* Delete the S,G port group that was installed for @src, optionally marking it as a fast leave. */
+ struct net_bridge_port_group *p, *pg = src->pg;
+ struct net_bridge_port_group __rcu **pp;
struct net_bridge_mdb_entry *mp;
- int maxlen;
- int len;
- int i;
+ struct br_ip sg_ip;
- for (i = 0; i < old->max; i++)
- hlist_for_each_entry(mp, &old->mhash[i], hlist[old->ver])
- hlist_add_head(&mp->hlist[new->ver],
- &new->mhash[br_ip_hash(new, &mp->addr)]);
+ memset(&sg_ip, 0, sizeof(sg_ip));
+ sg_ip = pg->key.addr;
+ sg_ip.src = src->addr.src;
- if (!elasticity)
- return 0;
+ mp = br_mdb_ip_get(src->br, &sg_ip);
+ if (!mp)
+ return; /* note: BR_SGRP_F_INSTALLED is not cleared on this early return */
+
+ for (pp = &mp->ports;
+ (p = mlock_dereference(*pp, src->br)) != NULL;
+ pp = &p->next) {
+ if (!br_port_group_equal(p, pg->key.port, pg->eth_addr))
+ continue;
- maxlen = 0;
- for (i = 0; i < new->max; i++) {
- len = 0;
- hlist_for_each_entry(mp, &new->mhash[i], hlist[new->ver])
- len++;
- if (len > maxlen)
- maxlen = len;
+ if (p->rt_protocol != RTPROT_KERNEL &&
+ (p->flags & MDB_PG_FLAGS_PERMANENT) &&
+ !(src->flags & BR_SGRP_F_USER_ADDED))
+ break; /* permanent non-kernel entry and a source not flagged USER_ADDED: leave it in place */
+
+ if (fastleave)
+ p->flags |= MDB_PG_FLAGS_FAST_LEAVE;
+ br_multicast_del_pg(mp, p, pp);
+ break; /* at most one matching port group is handled */
}
+ src->flags &= ~BR_SGRP_F_INSTALLED;
+}
- return maxlen > elasticity ? -EINVAL : 0;
+/* install S,G and based on src's timer enable or disable forwarding */
+static void br_multicast_fwd_src_handle(struct net_bridge_group_src *src)
+{
+ struct net_bridge_port_group_sg_key sg_key;
+ struct net_bridge_port_group *sg;
+ u8 old_flags;
+
+ br_multicast_fwd_src_add(src);
+
+ memset(&sg_key, 0, sizeof(sg_key)); /* the key is hashed as a whole, so it is zeroed first */
+ sg_key.addr = src->pg->key.addr;
+ sg_key.addr.src = src->addr.src;
+ sg_key.port = src->pg->key.port;
+
+ sg = br_sg_port_find(src->br, &sg_key);
+ if (!sg || (sg->flags & MDB_PG_FLAGS_PERMANENT))
+ return; /* permanent entries are never blocked or unblocked here */
+
+ old_flags = sg->flags;
+ if (timer_pending(&src->timer))
+ sg->flags &= ~MDB_PG_FLAGS_BLOCKED;
+ else
+ sg->flags |= MDB_PG_FLAGS_BLOCKED;
+
+ if (old_flags != sg->flags) { /* notify only when the blocked bit actually changed */
+ struct net_bridge_mdb_entry *sg_mp;
+
+ sg_mp = br_mdb_ip_get(src->br, &sg_key.addr);
+ if (!sg_mp)
+ return;
+ br_mdb_notify(src->br->dev, sg_mp, sg, RTM_NEWMDB);
+ }
}
-void br_multicast_free_pg(struct rcu_head *head)
+static void br_multicast_destroy_mdb_entry(struct net_bridge_mcast_gc *gc) /* final teardown of the MDB entry that embeds @gc */
{
- struct net_bridge_port_group *p =
- container_of(head, struct net_bridge_port_group, rcu);
+ struct net_bridge_mdb_entry *mp;
- kfree(p);
+ mp = container_of(gc, struct net_bridge_mdb_entry, mcast_gc);
+ WARN_ON(!hlist_unhashed(&mp->mdb_node)); /* the entry is expected to be unlinked already */
+ WARN_ON(mp->ports); /* and to have no port groups left */
+
+ timer_shutdown_sync(&mp->timer);
+ kfree_rcu(mp, rcu);
}
-static void br_multicast_free_group(struct rcu_head *head)
+static void br_multicast_del_mdb_entry(struct net_bridge_mdb_entry *mp) /* unlink @mp and queue it on the bridge's gc list */
{
- struct net_bridge_mdb_entry *mp =
- container_of(head, struct net_bridge_mdb_entry, rcu);
+ struct net_bridge *br = mp->br;
- kfree(mp);
+ rhashtable_remove_fast(&br->mdb_hash_tbl, &mp->rhnode,
+ br_mdb_rht_params);
+ hlist_del_init_rcu(&mp->mdb_node); /* leaves the node unhashed, which the expiry timer and the destroy helper check */
+ hlist_add_head(&mp->mcast_gc.gc_node, &br->mcast_gc_list);
+ queue_work(system_long_wq, &br->mcast_gc_work); /* the entry is not freed here; mcast_gc_work is scheduled instead */
}
static void br_multicast_group_expired(struct timer_list *t)
{
- struct net_bridge_mdb_entry *mp = from_timer(mp, t, timer);
+ struct net_bridge_mdb_entry *mp = timer_container_of(mp, t, timer);
struct net_bridge *br = mp->br;
- struct net_bridge_mdb_htable *mdb;
spin_lock(&br->multicast_lock);
- if (!netif_running(br->dev) || timer_pending(&mp->timer))
+ if (hlist_unhashed(&mp->mdb_node) || !netif_running(br->dev) ||
+ timer_pending(&mp->timer)) /* skip if already unlinked, bridge down, or timer re-armed */
goto out;
- mp->host_joined = false;
- br_mdb_notify(br->dev, NULL, &mp->addr, RTM_DELMDB, 0);
+ br_multicast_host_leave(mp, true);
if (mp->ports)
goto out;
+ br_multicast_del_mdb_entry(mp); /* no port groups left: unlink the entry and queue it for gc */
+out:
+ spin_unlock(&br->multicast_lock);
+}
- mdb = mlock_dereference(br->mdb, br);
+static void br_multicast_destroy_group_src(struct net_bridge_mcast_gc *gc)
+{
+ struct net_bridge_group_src *src;
- hlist_del_rcu(&mp->hlist[mdb->ver]);
- mdb->size--;
+ src = container_of(gc, struct net_bridge_group_src, mcast_gc);
+ WARN_ON(!hlist_unhashed(&src->node));
- call_rcu_bh(&mp->rcu, br_multicast_free_group);
+ timer_shutdown_sync(&src->timer);
+ kfree_rcu(src, rcu);
+}
-out:
- spin_unlock(&br->multicast_lock);
+void __br_multicast_del_group_src(struct net_bridge_group_src *src)
+{
+ struct net_bridge *br = src->pg->key.port->br;
+
+ hlist_del_init_rcu(&src->node);
+ src->pg->src_ents--;
+ hlist_add_head(&src->mcast_gc.gc_node, &br->mcast_gc_list);
+ queue_work(system_long_wq, &br->mcast_gc_work);
+}
+
+void br_multicast_del_group_src(struct net_bridge_group_src *src,
+ bool fastleave)
+{
+ br_multicast_fwd_src_remove(src, fastleave);
+ __br_multicast_del_group_src(src);
}
-static void br_multicast_del_pg(struct net_bridge *br,
- struct net_bridge_port_group *pg)
+static int
+br_multicast_port_ngroups_inc_one(struct net_bridge_mcast_port *pmctx,
+ struct netlink_ext_ack *extack,
+ const char *what)
{
- struct net_bridge_mdb_htable *mdb;
+ u32 max = READ_ONCE(pmctx->mdb_max_entries);
+ u32 n = READ_ONCE(pmctx->mdb_n_entries);
+
+ if (max && n >= max) {
+ NL_SET_ERR_MSG_FMT_MOD(extack, "%s is already in %u groups, and mcast_max_groups=%u",
+ what, n, max);
+ return -E2BIG;
+ }
+
+ WRITE_ONCE(pmctx->mdb_n_entries, n + 1);
+ return 0;
+}
+
+static void br_multicast_port_ngroups_dec_one(struct net_bridge_mcast_port *pmctx)
+{
+ u32 n = READ_ONCE(pmctx->mdb_n_entries);
+
+ WARN_ON_ONCE(n == 0);
+ WRITE_ONCE(pmctx->mdb_n_entries, n - 1);
+}
+
+static int br_multicast_port_ngroups_inc(struct net_bridge_port *port,
+ const struct br_ip *group,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_mcast_port *pmctx;
+ int err;
+
+ lockdep_assert_held_once(&port->br->multicast_lock);
+
+ /* Always count on the port context. */
+ err = br_multicast_port_ngroups_inc_one(&port->multicast_ctx, extack,
+ "Port");
+ if (err) {
+ trace_br_mdb_full(port->dev, group);
+ return err;
+ }
+
+ /* Only count on the VLAN context if VID is given, and if snooping on
+ * that VLAN is enabled.
+ */
+ if (!group->vid)
+ return 0;
+
+ pmctx = br_multicast_port_vid_to_port_ctx(port, group->vid);
+ if (!pmctx)
+ return 0;
+
+ err = br_multicast_port_ngroups_inc_one(pmctx, extack, "Port-VLAN");
+ if (err) {
+ trace_br_mdb_full(port->dev, group);
+ goto dec_one_out;
+ }
+
+ return 0;
+
+dec_one_out:
+ br_multicast_port_ngroups_dec_one(&port->multicast_ctx);
+ return err;
+}
+
+static void br_multicast_port_ngroups_dec(struct net_bridge_port *port, u16 vid)
+{
+ struct net_bridge_mcast_port *pmctx;
+
+ lockdep_assert_held_once(&port->br->multicast_lock);
+
+ if (vid) {
+ pmctx = br_multicast_port_vid_to_port_ctx(port, vid);
+ if (pmctx)
+ br_multicast_port_ngroups_dec_one(pmctx);
+ }
+ br_multicast_port_ngroups_dec_one(&port->multicast_ctx);
+}
+
+u32 br_multicast_ngroups_get(const struct net_bridge_mcast_port *pmctx)
+{
+ return READ_ONCE(pmctx->mdb_n_entries);
+}
+
+void br_multicast_ngroups_set_max(struct net_bridge_mcast_port *pmctx, u32 max)
+{
+ WRITE_ONCE(pmctx->mdb_max_entries, max);
+}
+
+u32 br_multicast_ngroups_get_max(const struct net_bridge_mcast_port *pmctx)
+{
+ return READ_ONCE(pmctx->mdb_max_entries);
+}
+
+static void br_multicast_destroy_port_group(struct net_bridge_mcast_gc *gc)
+{
+ struct net_bridge_port_group *pg;
+
+ pg = container_of(gc, struct net_bridge_port_group, mcast_gc);
+ WARN_ON(!hlist_unhashed(&pg->mglist));
+ WARN_ON(!hlist_empty(&pg->src_list));
+
+ timer_shutdown_sync(&pg->rexmit_timer);
+ timer_shutdown_sync(&pg->timer);
+ kfree_rcu(pg, rcu);
+}
+
+void br_multicast_del_pg(struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_port_group __rcu **pp)
+{
+ struct net_bridge *br = pg->key.port->br;
+ struct net_bridge_group_src *ent;
+ struct hlist_node *tmp;
+
+ rcu_assign_pointer(*pp, pg->next);
+ hlist_del_init(&pg->mglist);
+ br_multicast_eht_clean_sets(pg);
+ hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node)
+ br_multicast_del_group_src(ent, false);
+ br_mdb_notify(br->dev, mp, pg, RTM_DELMDB);
+ if (!br_multicast_is_star_g(&mp->addr)) {
+ rhashtable_remove_fast(&br->sg_port_tbl, &pg->rhnode,
+ br_sg_port_rht_params);
+ br_multicast_sg_del_exclude_ports(mp);
+ } else {
+ br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE);
+ }
+ br_multicast_port_ngroups_dec(pg->key.port, pg->key.addr.vid);
+ hlist_add_head(&pg->mcast_gc.gc_node, &br->mcast_gc_list);
+ queue_work(system_long_wq, &br->mcast_gc_work);
+
+ if (!mp->ports && !mp->host_joined && netif_running(br->dev))
+ mod_timer(&mp->timer, jiffies);
+}
+
+static void br_multicast_find_del_pg(struct net_bridge *br,
+ struct net_bridge_port_group *pg)
+{
+ struct net_bridge_port_group __rcu **pp;
struct net_bridge_mdb_entry *mp;
struct net_bridge_port_group *p;
- struct net_bridge_port_group __rcu **pp;
-
- mdb = mlock_dereference(br->mdb, br);
- mp = br_mdb_ip_get(mdb, &pg->addr);
+ mp = br_mdb_ip_get(br, &pg->key.addr);
if (WARN_ON(!mp))
return;
@@ -286,17 +847,7 @@ static void br_multicast_del_pg(struct net_bridge *br,
if (p != pg)
continue;
- rcu_assign_pointer(*pp, p->next);
- hlist_del_init(&p->mglist);
- del_timer(&p->timer);
- br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB,
- p->flags);
- call_rcu_bh(&p->rcu, br_multicast_free_pg);
-
- if (!mp->ports && !mp->host_joined &&
- netif_running(br->dev))
- mod_timer(&mp->timer, jiffies);
-
+ br_multicast_del_pg(mp, pg, pp);
return;
}
@@ -305,115 +856,148 @@ static void br_multicast_del_pg(struct net_bridge *br,
static void br_multicast_port_group_expired(struct timer_list *t)
{
- struct net_bridge_port_group *pg = from_timer(pg, t, timer);
- struct net_bridge *br = pg->port->br;
+ struct net_bridge_port_group *pg = timer_container_of(pg, t, timer);
+ struct net_bridge_group_src *src_ent;
+ struct net_bridge *br = pg->key.port->br;
+ struct hlist_node *tmp;
+ bool changed;
spin_lock(&br->multicast_lock);
if (!netif_running(br->dev) || timer_pending(&pg->timer) ||
hlist_unhashed(&pg->mglist) || pg->flags & MDB_PG_FLAGS_PERMANENT)
goto out;
- br_multicast_del_pg(br, pg);
+ changed = !!(pg->filter_mode == MCAST_EXCLUDE);
+ pg->filter_mode = MCAST_INCLUDE;
+ hlist_for_each_entry_safe(src_ent, tmp, &pg->src_list, node) {
+ if (!timer_pending(&src_ent->timer)) {
+ br_multicast_del_group_src(src_ent, false);
+ changed = true;
+ }
+ }
+
+ if (hlist_empty(&pg->src_list)) {
+ br_multicast_find_del_pg(br, pg);
+ } else if (changed) {
+ struct net_bridge_mdb_entry *mp = br_mdb_ip_get(br, &pg->key.addr);
+ if (changed && br_multicast_is_star_g(&pg->key.addr))
+ br_multicast_star_g_handle_mode(pg, MCAST_INCLUDE);
+
+ if (WARN_ON(!mp))
+ goto out;
+ br_mdb_notify(br->dev, mp, pg, RTM_NEWMDB);
+ }
out:
spin_unlock(&br->multicast_lock);
}
-static int br_mdb_rehash(struct net_bridge_mdb_htable __rcu **mdbp, int max,
- int elasticity)
+static void br_multicast_gc(struct hlist_head *head)
{
- struct net_bridge_mdb_htable *old = rcu_dereference_protected(*mdbp, 1);
- struct net_bridge_mdb_htable *mdb;
- int err;
-
- mdb = kmalloc(sizeof(*mdb), GFP_ATOMIC);
- if (!mdb)
- return -ENOMEM;
-
- mdb->max = max;
- mdb->old = old;
+ struct net_bridge_mcast_gc *gcent;
+ struct hlist_node *tmp;
- mdb->mhash = kcalloc(max, sizeof(*mdb->mhash), GFP_ATOMIC);
- if (!mdb->mhash) {
- kfree(mdb);
- return -ENOMEM;
+ hlist_for_each_entry_safe(gcent, tmp, head, gc_node) {
+ hlist_del_init(&gcent->gc_node);
+ gcent->destroy(gcent);
}
+}
- mdb->size = old ? old->size : 0;
- mdb->ver = old ? old->ver ^ 1 : 0;
+static void __br_multicast_query_handle_vlan(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct sk_buff *skb)
+{
+ struct net_bridge_vlan *vlan = NULL;
- if (!old || elasticity)
- get_random_bytes(&mdb->secret, sizeof(mdb->secret));
- else
- mdb->secret = old->secret;
+ if (pmctx && br_multicast_port_ctx_is_vlan(pmctx))
+ vlan = pmctx->vlan;
+ else if (br_multicast_ctx_is_vlan(brmctx))
+ vlan = brmctx->vlan;
- if (!old)
- goto out;
+ if (vlan && !(vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED)) {
+ u16 vlan_proto;
- err = br_mdb_copy(mdb, old, elasticity);
- if (err) {
- kfree(mdb->mhash);
- kfree(mdb);
- return err;
+ if (br_vlan_get_proto(brmctx->br->dev, &vlan_proto) != 0)
+ return;
+ __vlan_hwaccel_put_tag(skb, htons(vlan_proto), vlan->vid);
}
-
- br_mdb_rehash_seq++;
- call_rcu_bh(&mdb->rcu, br_mdb_free);
-
-out:
- rcu_assign_pointer(*mdbp, mdb);
-
- return 0;
}
-static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
- __be32 group,
- u8 *igmp_type)
+static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg,
+ __be32 ip_dst, __be32 group,
+ bool with_srcs, bool over_lmqt,
+ u8 sflag, u8 *igmp_type,
+ bool *need_rexmit)
{
+ struct net_bridge_port *p = pg ? pg->key.port : NULL;
+ struct net_bridge_group_src *ent;
+ size_t pkt_size, igmp_hdr_size;
+ unsigned long now = jiffies;
struct igmpv3_query *ihv3;
- size_t igmp_hdr_size;
+ void *csum_start = NULL;
+ __sum16 *csum = NULL;
struct sk_buff *skb;
struct igmphdr *ih;
struct ethhdr *eth;
+ unsigned long lmqt;
struct iphdr *iph;
+ u16 lmqt_srcs = 0;
igmp_hdr_size = sizeof(*ih);
- if (br->multicast_igmp_version == 3)
+ if (brmctx->multicast_igmp_version == 3) {
igmp_hdr_size = sizeof(*ihv3);
- skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*iph) +
- igmp_hdr_size + 4);
+ if (pg && with_srcs) {
+ lmqt = now + (brmctx->multicast_last_member_interval *
+ brmctx->multicast_last_member_count);
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (over_lmqt == time_after(ent->timer.expires,
+ lmqt) &&
+ ent->src_query_rexmit_cnt > 0)
+ lmqt_srcs++;
+ }
+
+ if (!lmqt_srcs)
+ return NULL;
+ igmp_hdr_size += lmqt_srcs * sizeof(__be32);
+ }
+ }
+
+ pkt_size = sizeof(*eth) + sizeof(*iph) + 4 + igmp_hdr_size;
+ if ((p && pkt_size > p->dev->mtu) ||
+ pkt_size > brmctx->br->dev->mtu)
+ return NULL;
+
+ skb = netdev_alloc_skb_ip_align(brmctx->br->dev, pkt_size);
if (!skb)
goto out;
+ __br_multicast_query_handle_vlan(brmctx, pmctx, skb);
skb->protocol = htons(ETH_P_IP);
skb_reset_mac_header(skb);
eth = eth_hdr(skb);
- ether_addr_copy(eth->h_source, br->dev->dev_addr);
- eth->h_dest[0] = 1;
- eth->h_dest[1] = 0;
- eth->h_dest[2] = 0x5e;
- eth->h_dest[3] = 0;
- eth->h_dest[4] = 0;
- eth->h_dest[5] = 1;
+ ether_addr_copy(eth->h_source, brmctx->br->dev->dev_addr);
+ ip_eth_mc_map(ip_dst, eth->h_dest);
eth->h_proto = htons(ETH_P_IP);
skb_put(skb, sizeof(*eth));
skb_set_network_header(skb, skb->len);
iph = ip_hdr(skb);
+ iph->tot_len = htons(pkt_size - sizeof(*eth));
iph->version = 4;
iph->ihl = 6;
iph->tos = 0xc0;
- iph->tot_len = htons(sizeof(*iph) + igmp_hdr_size + 4);
iph->id = 0;
iph->frag_off = htons(IP_DF);
iph->ttl = 1;
iph->protocol = IPPROTO_IGMP;
- iph->saddr = br->multicast_query_use_ifaddr ?
- inet_select_addr(br->dev, 0, RT_SCOPE_LINK) : 0;
- iph->daddr = htonl(INADDR_ALLHOSTS_GROUP);
+ iph->saddr = br_opt_get(brmctx->br, BROPT_MULTICAST_QUERY_USE_IFADDR) ?
+ inet_select_addr(brmctx->br->dev, 0, RT_SCOPE_LINK) : 0;
+ iph->daddr = ip_dst;
((u8 *)&iph[1])[0] = IPOPT_RA;
((u8 *)&iph[1])[1] = 4;
((u8 *)&iph[1])[2] = 0;
@@ -424,34 +1008,60 @@ static struct sk_buff *br_ip4_multicast_alloc_query(struct net_bridge *br,
skb_set_transport_header(skb, skb->len);
*igmp_type = IGMP_HOST_MEMBERSHIP_QUERY;
- switch (br->multicast_igmp_version) {
+ switch (brmctx->multicast_igmp_version) {
case 2:
ih = igmp_hdr(skb);
ih->type = IGMP_HOST_MEMBERSHIP_QUERY;
- ih->code = (group ? br->multicast_last_member_interval :
- br->multicast_query_response_interval) /
+ ih->code = (group ? brmctx->multicast_last_member_interval :
+ brmctx->multicast_query_response_interval) /
(HZ / IGMP_TIMER_SCALE);
ih->group = group;
ih->csum = 0;
- ih->csum = ip_compute_csum((void *)ih, sizeof(*ih));
+ csum = &ih->csum;
+ csum_start = (void *)ih;
break;
case 3:
ihv3 = igmpv3_query_hdr(skb);
ihv3->type = IGMP_HOST_MEMBERSHIP_QUERY;
- ihv3->code = (group ? br->multicast_last_member_interval :
- br->multicast_query_response_interval) /
+ ihv3->code = (group ? brmctx->multicast_last_member_interval :
+ brmctx->multicast_query_response_interval) /
(HZ / IGMP_TIMER_SCALE);
ihv3->group = group;
- ihv3->qqic = br->multicast_query_interval / HZ;
- ihv3->nsrcs = 0;
+ ihv3->qqic = brmctx->multicast_query_interval / HZ;
+ ihv3->nsrcs = htons(lmqt_srcs);
ihv3->resv = 0;
- ihv3->suppress = 0;
+ ihv3->suppress = sflag;
ihv3->qrv = 2;
ihv3->csum = 0;
- ihv3->csum = ip_compute_csum((void *)ihv3, sizeof(*ihv3));
+ csum = &ihv3->csum;
+ csum_start = (void *)ihv3;
+ if (!pg || !with_srcs)
+ break;
+
+ lmqt_srcs = 0;
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (over_lmqt == time_after(ent->timer.expires,
+ lmqt) &&
+ ent->src_query_rexmit_cnt > 0) {
+ ihv3->srcs[lmqt_srcs++] = ent->addr.src.ip4;
+ ent->src_query_rexmit_cnt--;
+ if (need_rexmit && ent->src_query_rexmit_cnt)
+ *need_rexmit = true;
+ }
+ }
+ if (WARN_ON(lmqt_srcs != ntohs(ihv3->nsrcs))) {
+ kfree_skb(skb);
+ return NULL;
+ }
break;
}
+ if (WARN_ON(!csum || !csum_start)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ *csum = ip_compute_csum(csum_start, igmp_hdr_size);
skb_put(skb, igmp_hdr_size);
__skb_pull(skb, sizeof(*eth));
@@ -460,34 +1070,67 @@ out:
}
#if IS_ENABLED(CONFIG_IPV6)
-static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
- const struct in6_addr *grp,
- u8 *igmp_type)
+static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg,
+ const struct in6_addr *ip6_dst,
+ const struct in6_addr *group,
+ bool with_srcs, bool over_llqt,
+ u8 sflag, u8 *igmp_type,
+ bool *need_rexmit)
{
+ struct net_bridge_port *p = pg ? pg->key.port : NULL;
+ struct net_bridge_group_src *ent;
+ size_t pkt_size, mld_hdr_size;
+ unsigned long now = jiffies;
struct mld2_query *mld2q;
+ void *csum_start = NULL;
unsigned long interval;
+ __sum16 *csum = NULL;
struct ipv6hdr *ip6h;
struct mld_msg *mldq;
- size_t mld_hdr_size;
struct sk_buff *skb;
+ unsigned long llqt;
struct ethhdr *eth;
+ u16 llqt_srcs = 0;
u8 *hopopt;
mld_hdr_size = sizeof(*mldq);
- if (br->multicast_mld_version == 2)
+ if (brmctx->multicast_mld_version == 2) {
mld_hdr_size = sizeof(*mld2q);
- skb = netdev_alloc_skb_ip_align(br->dev, sizeof(*eth) + sizeof(*ip6h) +
- 8 + mld_hdr_size);
+ if (pg && with_srcs) {
+ llqt = now + (brmctx->multicast_last_member_interval *
+ brmctx->multicast_last_member_count);
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (over_llqt == time_after(ent->timer.expires,
+ llqt) &&
+ ent->src_query_rexmit_cnt > 0)
+ llqt_srcs++;
+ }
+
+ if (!llqt_srcs)
+ return NULL;
+ mld_hdr_size += llqt_srcs * sizeof(struct in6_addr);
+ }
+ }
+
+ pkt_size = sizeof(*eth) + sizeof(*ip6h) + 8 + mld_hdr_size;
+ if ((p && pkt_size > p->dev->mtu) ||
+ pkt_size > brmctx->br->dev->mtu)
+ return NULL;
+
+ skb = netdev_alloc_skb_ip_align(brmctx->br->dev, pkt_size);
if (!skb)
goto out;
+ __br_multicast_query_handle_vlan(brmctx, pmctx, skb);
skb->protocol = htons(ETH_P_IPV6);
/* Ethernet header */
skb_reset_mac_header(skb);
eth = eth_hdr(skb);
- ether_addr_copy(eth->h_source, br->dev->dev_addr);
+ ether_addr_copy(eth->h_source, brmctx->br->dev->dev_addr);
eth->h_proto = htons(ETH_P_IPV6);
skb_put(skb, sizeof(*eth));
@@ -499,15 +1142,15 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
ip6h->payload_len = htons(8 + mld_hdr_size);
ip6h->nexthdr = IPPROTO_HOPOPTS;
ip6h->hop_limit = 1;
- ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1));
- if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0,
- &ip6h->saddr)) {
+ ip6h->daddr = *ip6_dst;
+ if (ipv6_dev_get_saddr(dev_net(brmctx->br->dev), brmctx->br->dev,
+ &ip6h->daddr, 0, &ip6h->saddr)) {
kfree_skb(skb);
- br->has_ipv6_addr = 0;
+ br_opt_toggle(brmctx->br, BROPT_HAS_IPV6_ADDR, false);
return NULL;
}
- br->has_ipv6_addr = 1;
+ br_opt_toggle(brmctx->br, BROPT_HAS_IPV6_ADDR, true);
ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest);
hopopt = (u8 *)(ip6h + 1);
@@ -524,11 +1167,11 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
/* ICMPv6 */
skb_set_transport_header(skb, skb->len);
- interval = ipv6_addr_any(grp) ?
- br->multicast_query_response_interval :
- br->multicast_last_member_interval;
+ interval = ipv6_addr_any(group) ?
+ brmctx->multicast_query_response_interval :
+ brmctx->multicast_last_member_interval;
*igmp_type = ICMPV6_MGM_QUERY;
- switch (br->multicast_mld_version) {
+ switch (brmctx->multicast_mld_version) {
case 1:
mldq = (struct mld_msg *)icmp6_hdr(skb);
mldq->mld_type = ICMPV6_MGM_QUERY;
@@ -536,12 +1179,9 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
mldq->mld_cksum = 0;
mldq->mld_maxdelay = htons((u16)jiffies_to_msecs(interval));
mldq->mld_reserved = 0;
- mldq->mld_mca = *grp;
- mldq->mld_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
- sizeof(*mldq), IPPROTO_ICMPV6,
- csum_partial(mldq,
- sizeof(*mldq),
- 0));
+ mldq->mld_mca = *group;
+ csum = &mldq->mld_cksum;
+ csum_start = (void *)mldq;
break;
case 2:
mld2q = (struct mld2_query *)icmp6_hdr(skb);
@@ -551,21 +1191,43 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br,
mld2q->mld2q_cksum = 0;
mld2q->mld2q_resv1 = 0;
mld2q->mld2q_resv2 = 0;
- mld2q->mld2q_suppress = 0;
+ mld2q->mld2q_suppress = sflag;
mld2q->mld2q_qrv = 2;
- mld2q->mld2q_nsrcs = 0;
- mld2q->mld2q_qqic = br->multicast_query_interval / HZ;
- mld2q->mld2q_mca = *grp;
- mld2q->mld2q_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
- sizeof(*mld2q),
- IPPROTO_ICMPV6,
- csum_partial(mld2q,
- sizeof(*mld2q),
- 0));
+ mld2q->mld2q_nsrcs = htons(llqt_srcs);
+ mld2q->mld2q_qqic = brmctx->multicast_query_interval / HZ;
+ mld2q->mld2q_mca = *group;
+ csum = &mld2q->mld2q_cksum;
+ csum_start = (void *)mld2q;
+ if (!pg || !with_srcs)
+ break;
+
+ llqt_srcs = 0;
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (over_llqt == time_after(ent->timer.expires,
+ llqt) &&
+ ent->src_query_rexmit_cnt > 0) {
+ mld2q->mld2q_srcs[llqt_srcs++] = ent->addr.src.ip6;
+ ent->src_query_rexmit_cnt--;
+ if (need_rexmit && ent->src_query_rexmit_cnt)
+ *need_rexmit = true;
+ }
+ }
+ if (WARN_ON(llqt_srcs != ntohs(mld2q->mld2q_nsrcs))) {
+ kfree_skb(skb);
+ return NULL;
+ }
break;
}
- skb_put(skb, mld_hdr_size);
+ if (WARN_ON(!csum || !csum_start)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ *csum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, mld_hdr_size,
+ IPPROTO_ICMPV6,
+ csum_partial(csum_start, mld_hdr_size, 0));
+ skb_put(skb, mld_hdr_size);
__skb_pull(skb, sizeof(*eth));
out:
@@ -573,295 +1235,454 @@ out:
}
#endif
-static struct sk_buff *br_multicast_alloc_query(struct net_bridge *br,
- struct br_ip *addr,
- u8 *igmp_type)
+static struct sk_buff *br_multicast_alloc_query(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg,
+ struct br_ip *ip_dst,
+ struct br_ip *group,
+ bool with_srcs, bool over_lmqt,
+ u8 sflag, u8 *igmp_type,
+ bool *need_rexmit)
{
- switch (addr->proto) {
+ __be32 ip4_dst;
+
+ switch (group->proto) {
case htons(ETH_P_IP):
- return br_ip4_multicast_alloc_query(br, addr->u.ip4, igmp_type);
+ ip4_dst = ip_dst ? ip_dst->dst.ip4 : htonl(INADDR_ALLHOSTS_GROUP);
+ return br_ip4_multicast_alloc_query(brmctx, pmctx, pg,
+ ip4_dst, group->dst.ip4,
+ with_srcs, over_lmqt,
+ sflag, igmp_type,
+ need_rexmit);
#if IS_ENABLED(CONFIG_IPV6)
- case htons(ETH_P_IPV6):
- return br_ip6_multicast_alloc_query(br, &addr->u.ip6,
- igmp_type);
+ case htons(ETH_P_IPV6): {
+ struct in6_addr ip6_dst;
+
+ if (ip_dst)
+ ip6_dst = ip_dst->dst.ip6;
+ else
+ ipv6_addr_set(&ip6_dst, htonl(0xff020000), 0, 0,
+ htonl(1));
+
+ return br_ip6_multicast_alloc_query(brmctx, pmctx, pg,
+ &ip6_dst, &group->dst.ip6,
+ with_srcs, over_lmqt,
+ sflag, igmp_type,
+ need_rexmit);
+ }
#endif
}
return NULL;
}
-static struct net_bridge_mdb_entry *br_multicast_get_group(
- struct net_bridge *br, struct net_bridge_port *port,
- struct br_ip *group, int hash)
+struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
+ struct br_ip *group)
{
- struct net_bridge_mdb_htable *mdb;
struct net_bridge_mdb_entry *mp;
- unsigned int count = 0;
- unsigned int max;
- int elasticity;
int err;
- mdb = rcu_dereference_protected(br->mdb, 1);
- hlist_for_each_entry(mp, &mdb->mhash[hash], hlist[mdb->ver]) {
- count++;
- if (unlikely(br_ip_equal(group, &mp->addr)))
- return mp;
- }
+ mp = br_mdb_ip_get(br, group);
+ if (mp)
+ return mp;
- elasticity = 0;
- max = mdb->max;
+ if (atomic_read(&br->mdb_hash_tbl.nelems) >= br->hash_max) {
+ trace_br_mdb_full(br->dev, group);
+ br_mc_disabled_update(br->dev, false, NULL);
+ br_opt_toggle(br, BROPT_MULTICAST_ENABLED, false);
+ return ERR_PTR(-E2BIG);
+ }
- if (unlikely(count > br->hash_elasticity && count)) {
- if (net_ratelimit())
- br_info(br, "Multicast hash table "
- "chain limit reached: %s\n",
- port ? port->dev->name : br->dev->name);
+ mp = kzalloc(sizeof(*mp), GFP_ATOMIC);
+ if (unlikely(!mp))
+ return ERR_PTR(-ENOMEM);
- elasticity = br->hash_elasticity;
+ mp->br = br;
+ mp->addr = *group;
+ mp->mcast_gc.destroy = br_multicast_destroy_mdb_entry;
+ timer_setup(&mp->timer, br_multicast_group_expired, 0);
+ err = rhashtable_lookup_insert_fast(&br->mdb_hash_tbl, &mp->rhnode,
+ br_mdb_rht_params);
+ if (err) {
+ kfree(mp);
+ mp = ERR_PTR(err);
+ } else {
+ hlist_add_head_rcu(&mp->mdb_node, &br->mdb_list);
}
- if (mdb->size >= max) {
- max *= 2;
- if (unlikely(max > br->hash_max)) {
- br_warn(br, "Multicast hash table maximum of %d "
- "reached, disabling snooping: %s\n",
- br->hash_max,
- port ? port->dev->name : br->dev->name);
- err = -E2BIG;
-disable:
- br->multicast_disabled = 1;
- goto err;
- }
+ return mp;
+}
+
+static void br_multicast_group_src_expired(struct timer_list *t)
+{
+ struct net_bridge_group_src *src = timer_container_of(src, t, timer);
+ struct net_bridge_port_group *pg;
+ struct net_bridge *br = src->br;
+
+ spin_lock(&br->multicast_lock);
+ if (hlist_unhashed(&src->node) || !netif_running(br->dev) ||
+ timer_pending(&src->timer))
+ goto out;
+
+ pg = src->pg;
+ if (pg->filter_mode == MCAST_INCLUDE) {
+ br_multicast_del_group_src(src, false);
+ if (!hlist_empty(&pg->src_list))
+ goto out;
+ br_multicast_find_del_pg(br, pg);
+ } else {
+ br_multicast_fwd_src_handle(src);
}
- if (max > mdb->max || elasticity) {
- if (mdb->old) {
- if (net_ratelimit())
- br_info(br, "Multicast hash table "
- "on fire: %s\n",
- port ? port->dev->name : br->dev->name);
- err = -EEXIST;
- goto err;
- }
+out:
+ spin_unlock(&br->multicast_lock);
+}
- err = br_mdb_rehash(&br->mdb, max, elasticity);
- if (err) {
- br_warn(br, "Cannot rehash multicast "
- "hash table, disabling snooping: %s, %d, %d\n",
- port ? port->dev->name : br->dev->name,
- mdb->size, err);
- goto disable;
- }
+struct net_bridge_group_src *
+br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip)
+{
+ struct net_bridge_group_src *ent;
- err = -EAGAIN;
- goto err;
+ switch (ip->proto) {
+ case htons(ETH_P_IP):
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ if (ip->src.ip4 == ent->addr.src.ip4)
+ return ent;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ if (!ipv6_addr_cmp(&ent->addr.src.ip6, &ip->src.ip6))
+ return ent;
+ break;
+#endif
}
return NULL;
-
-err:
- mp = ERR_PTR(err);
- return mp;
}
-struct net_bridge_mdb_entry *br_multicast_new_group(struct net_bridge *br,
- struct net_bridge_port *p,
- struct br_ip *group)
+struct net_bridge_group_src *
+br_multicast_new_group_src(struct net_bridge_port_group *pg, struct br_ip *src_ip)
{
- struct net_bridge_mdb_htable *mdb;
- struct net_bridge_mdb_entry *mp;
- int hash;
- int err;
+ struct net_bridge_group_src *grp_src;
- mdb = rcu_dereference_protected(br->mdb, 1);
- if (!mdb) {
- err = br_mdb_rehash(&br->mdb, BR_HASH_SIZE, 0);
- if (err)
- return ERR_PTR(err);
- goto rehash;
- }
+ if (unlikely(pg->src_ents >= PG_SRC_ENT_LIMIT))
+ return NULL;
- hash = br_ip_hash(mdb, group);
- mp = br_multicast_get_group(br, p, group, hash);
- switch (PTR_ERR(mp)) {
- case 0:
+ switch (src_ip->proto) {
+ case htons(ETH_P_IP):
+ if (ipv4_is_zeronet(src_ip->src.ip4) ||
+ ipv4_is_multicast(src_ip->src.ip4))
+ return NULL;
break;
-
- case -EAGAIN:
-rehash:
- mdb = rcu_dereference_protected(br->mdb, 1);
- hash = br_ip_hash(mdb, group);
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ if (ipv6_addr_any(&src_ip->src.ip6) ||
+ ipv6_addr_is_multicast(&src_ip->src.ip6))
+ return NULL;
break;
-
- default:
- goto out;
+#endif
}
- mp = kzalloc(sizeof(*mp), GFP_ATOMIC);
- if (unlikely(!mp))
- return ERR_PTR(-ENOMEM);
+ grp_src = kzalloc(sizeof(*grp_src), GFP_ATOMIC);
+ if (unlikely(!grp_src))
+ return NULL;
- mp->br = br;
- mp->addr = *group;
- timer_setup(&mp->timer, br_multicast_group_expired, 0);
+ grp_src->pg = pg;
+ grp_src->br = pg->key.port->br;
+ grp_src->addr = *src_ip;
+ grp_src->mcast_gc.destroy = br_multicast_destroy_group_src;
+ timer_setup(&grp_src->timer, br_multicast_group_src_expired, 0);
- hlist_add_head_rcu(&mp->hlist[mdb->ver], &mdb->mhash[hash]);
- mdb->size++;
+ hlist_add_head_rcu(&grp_src->node, &pg->src_list);
+ pg->src_ents++;
-out:
- return mp;
+ return grp_src;
}
struct net_bridge_port_group *br_multicast_new_port_group(
struct net_bridge_port *port,
- struct br_ip *group,
+ const struct br_ip *group,
struct net_bridge_port_group __rcu *next,
unsigned char flags,
- const unsigned char *src)
+ const unsigned char *src,
+ u8 filter_mode,
+ u8 rt_protocol,
+ struct netlink_ext_ack *extack)
{
struct net_bridge_port_group *p;
+ int err;
- p = kzalloc(sizeof(*p), GFP_ATOMIC);
- if (unlikely(!p))
+ err = br_multicast_port_ngroups_inc(port, group, extack);
+ if (err)
return NULL;
- p->addr = *group;
- p->port = port;
+ p = kzalloc(sizeof(*p), GFP_ATOMIC);
+ if (unlikely(!p)) {
+ NL_SET_ERR_MSG_MOD(extack, "Couldn't allocate new port group");
+ goto dec_out;
+ }
+
+ p->key.addr = *group;
+ p->key.port = port;
p->flags = flags;
+ p->filter_mode = filter_mode;
+ p->rt_protocol = rt_protocol;
+ p->eht_host_tree = RB_ROOT;
+ p->eht_set_tree = RB_ROOT;
+ p->mcast_gc.destroy = br_multicast_destroy_port_group;
+ INIT_HLIST_HEAD(&p->src_list);
+
+ if (!br_multicast_is_star_g(group) &&
+ rhashtable_lookup_insert_fast(&port->br->sg_port_tbl, &p->rhnode,
+ br_sg_port_rht_params)) {
+ NL_SET_ERR_MSG_MOD(extack, "Couldn't insert new port group");
+ goto free_out;
+ }
+
rcu_assign_pointer(p->next, next);
- hlist_add_head(&p->mglist, &port->mglist);
timer_setup(&p->timer, br_multicast_port_group_expired, 0);
+ timer_setup(&p->rexmit_timer, br_multicast_port_group_rexmit, 0);
+ hlist_add_head(&p->mglist, &port->mglist);
if (src)
memcpy(p->eth_addr, src, ETH_ALEN);
else
- memset(p->eth_addr, 0xff, ETH_ALEN);
+ eth_broadcast_addr(p->eth_addr);
return p;
+
+free_out:
+ kfree(p);
+dec_out:
+ br_multicast_port_ngroups_dec(port, group->vid);
+ return NULL;
}
-static bool br_port_group_equal(struct net_bridge_port_group *p,
- struct net_bridge_port *port,
- const unsigned char *src)
+void br_multicast_del_port_group(struct net_bridge_port_group *p)
{
- if (p->port != port)
- return false;
+ struct net_bridge_port *port = p->key.port;
+ __u16 vid = p->key.addr.vid;
- if (!(port->flags & BR_MULTICAST_TO_UNICAST))
- return true;
+ hlist_del_init(&p->mglist);
+ if (!br_multicast_is_star_g(&p->key.addr))
+ rhashtable_remove_fast(&port->br->sg_port_tbl, &p->rhnode,
+ br_sg_port_rht_params);
+ kfree(p);
+ br_multicast_port_ngroups_dec(port, vid);
+}
- return ether_addr_equal(src, p->eth_addr);
+void br_multicast_host_join(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_mdb_entry *mp, bool notify)
+{
+ if (!mp->host_joined) {
+ mp->host_joined = true;
+ if (br_multicast_is_star_g(&mp->addr))
+ br_multicast_star_g_host_state(mp);
+ if (notify)
+ br_mdb_notify(mp->br->dev, mp, NULL, RTM_NEWMDB);
+ }
+
+ if (br_group_is_l2(&mp->addr))
+ return;
+
+ mod_timer(&mp->timer, jiffies + brmctx->multicast_membership_interval);
}
-static int br_multicast_add_group(struct net_bridge *br,
- struct net_bridge_port *port,
- struct br_ip *group,
- const unsigned char *src)
+void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify)
+{
+ if (!mp->host_joined)
+ return;
+
+ mp->host_joined = false;
+ if (br_multicast_is_star_g(&mp->addr))
+ br_multicast_star_g_host_state(mp);
+ if (notify)
+ br_mdb_notify(mp->br->dev, mp, NULL, RTM_DELMDB);
+}
+
+static struct net_bridge_port_group *
+__br_multicast_add_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct br_ip *group,
+ const unsigned char *src,
+ u8 filter_mode,
+ bool igmpv2_mldv1,
+ bool blocked)
{
struct net_bridge_port_group __rcu **pp;
- struct net_bridge_port_group *p;
+ struct net_bridge_port_group *p = NULL;
struct net_bridge_mdb_entry *mp;
unsigned long now = jiffies;
- int err;
- spin_lock(&br->multicast_lock);
- if (!netif_running(br->dev) ||
- (port && port->state == BR_STATE_DISABLED))
+ if (!br_multicast_ctx_should_use(brmctx, pmctx))
goto out;
- mp = br_multicast_new_group(br, port, group);
- err = PTR_ERR(mp);
+ mp = br_multicast_new_group(brmctx->br, group);
if (IS_ERR(mp))
- goto err;
+ return ERR_CAST(mp);
- if (!port) {
- if (!mp->host_joined) {
- mp->host_joined = true;
- br_mdb_notify(br->dev, NULL, &mp->addr, RTM_NEWMDB, 0);
- }
- mod_timer(&mp->timer, now + br->multicast_membership_interval);
+ if (!pmctx) {
+ br_multicast_host_join(brmctx, mp, true);
goto out;
}
for (pp = &mp->ports;
- (p = mlock_dereference(*pp, br)) != NULL;
+ (p = mlock_dereference(*pp, brmctx->br)) != NULL;
pp = &p->next) {
- if (br_port_group_equal(p, port, src))
+ if (br_port_group_equal(p, pmctx->port, src))
goto found;
- if ((unsigned long)p->port < (unsigned long)port)
+ if ((unsigned long)p->key.port < (unsigned long)pmctx->port)
break;
}
- p = br_multicast_new_port_group(port, group, *pp, 0, src);
- if (unlikely(!p))
- goto err;
+ p = br_multicast_new_port_group(pmctx->port, group, *pp, 0, src,
+ filter_mode, RTPROT_KERNEL, NULL);
+ if (unlikely(!p)) {
+ p = ERR_PTR(-ENOMEM);
+ goto out;
+ }
rcu_assign_pointer(*pp, p);
- br_mdb_notify(br->dev, port, group, RTM_NEWMDB, 0);
+ if (blocked)
+ p->flags |= MDB_PG_FLAGS_BLOCKED;
+ br_mdb_notify(brmctx->br->dev, mp, p, RTM_NEWMDB);
found:
- mod_timer(&p->timer, now + br->multicast_membership_interval);
+ if (igmpv2_mldv1)
+ mod_timer(&p->timer,
+ now + brmctx->multicast_membership_interval);
+
out:
- err = 0;
+ return p;
+}
+
+static int br_multicast_add_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct br_ip *group,
+ const unsigned char *src,
+ u8 filter_mode,
+ bool igmpv2_mldv1)
+{
+ struct net_bridge_port_group *pg;
+ int err;
+
+ spin_lock(&brmctx->br->multicast_lock);
+ pg = __br_multicast_add_group(brmctx, pmctx, group, src, filter_mode,
+ igmpv2_mldv1, false);
+ /* NULL is considered valid for host joined groups */
+ err = PTR_ERR_OR_ZERO(pg);
+ spin_unlock(&brmctx->br->multicast_lock);
-err:
- spin_unlock(&br->multicast_lock);
return err;
}
-static int br_ip4_multicast_add_group(struct net_bridge *br,
- struct net_bridge_port *port,
+static int br_ip4_multicast_add_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
__be32 group,
__u16 vid,
- const unsigned char *src)
+ const unsigned char *src,
+ bool igmpv2)
{
struct br_ip br_group;
+ u8 filter_mode;
if (ipv4_is_local_multicast(group))
return 0;
- br_group.u.ip4 = group;
+ memset(&br_group, 0, sizeof(br_group));
+ br_group.dst.ip4 = group;
br_group.proto = htons(ETH_P_IP);
br_group.vid = vid;
+ filter_mode = igmpv2 ? MCAST_EXCLUDE : MCAST_INCLUDE;
- return br_multicast_add_group(br, port, &br_group, src);
+ return br_multicast_add_group(brmctx, pmctx, &br_group, src,
+ filter_mode, igmpv2);
}
#if IS_ENABLED(CONFIG_IPV6)
-static int br_ip6_multicast_add_group(struct net_bridge *br,
- struct net_bridge_port *port,
+static int br_ip6_multicast_add_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
const struct in6_addr *group,
__u16 vid,
- const unsigned char *src)
+ const unsigned char *src,
+ bool mldv1)
{
struct br_ip br_group;
+ u8 filter_mode;
if (ipv6_addr_is_ll_all_nodes(group))
return 0;
- br_group.u.ip6 = *group;
+ memset(&br_group, 0, sizeof(br_group));
+ br_group.dst.ip6 = *group;
br_group.proto = htons(ETH_P_IPV6);
br_group.vid = vid;
+ filter_mode = mldv1 ? MCAST_EXCLUDE : MCAST_INCLUDE;
- return br_multicast_add_group(br, port, &br_group, src);
+ return br_multicast_add_group(brmctx, pmctx, &br_group, src,
+ filter_mode, mldv1);
}
#endif
-static void br_multicast_router_expired(struct timer_list *t)
+static bool br_multicast_rport_del(struct hlist_node *rlist)
{
- struct net_bridge_port *port =
- from_timer(port, t, multicast_router_timer);
- struct net_bridge *br = port->br;
+ if (hlist_unhashed(rlist))
+ return false;
+
+ hlist_del_init_rcu(rlist);
+ return true;
+}
+
+static bool br_ip4_multicast_rport_del(struct net_bridge_mcast_port *pmctx)
+{
+ return br_multicast_rport_del(&pmctx->ip4_rlist);
+}
+
+static bool br_ip6_multicast_rport_del(struct net_bridge_mcast_port *pmctx)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ return br_multicast_rport_del(&pmctx->ip6_rlist);
+#else
+ return false;
+#endif
+}
+
+static void br_multicast_router_expired(struct net_bridge_mcast_port *pmctx,
+ struct timer_list *t,
+ struct hlist_node *rlist)
+{
+ struct net_bridge *br = pmctx->port->br;
+ bool del;
spin_lock(&br->multicast_lock);
- if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
- port->multicast_router == MDB_RTR_TYPE_PERM ||
- timer_pending(&port->multicast_router_timer))
+ if (pmctx->multicast_router == MDB_RTR_TYPE_DISABLED ||
+ pmctx->multicast_router == MDB_RTR_TYPE_PERM ||
+ timer_pending(t))
goto out;
- __del_port_router(port);
+ del = br_multicast_rport_del(rlist);
+ br_multicast_rport_del_notify(pmctx, del);
out:
spin_unlock(&br->multicast_lock);
}
+static void br_ip4_multicast_router_expired(struct timer_list *t)
+{
+ struct net_bridge_mcast_port *pmctx = timer_container_of(pmctx, t,
+ ip4_mc_router_timer);
+
+ br_multicast_router_expired(pmctx, t, &pmctx->ip4_rlist);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_router_expired(struct timer_list *t)
+{
+ struct net_bridge_mcast_port *pmctx = timer_container_of(pmctx, t,
+ ip6_mc_router_timer);
+
+ br_multicast_router_expired(pmctx, t, &pmctx->ip6_rlist);
+}
+#endif
+
static void br_mc_router_state_change(struct net_bridge *p,
bool is_mc_router)
{
@@ -872,112 +1693,184 @@ static void br_mc_router_state_change(struct net_bridge *p,
.u.mrouter = is_mc_router,
};
- switchdev_port_attr_set(p->dev, &attr);
+ switchdev_port_attr_set(p->dev, &attr, NULL);
}
-static void br_multicast_local_router_expired(struct timer_list *t)
+static void br_multicast_local_router_expired(struct net_bridge_mcast *brmctx,
+ struct timer_list *timer)
{
- struct net_bridge *br = from_timer(br, t, multicast_router_timer);
-
- spin_lock(&br->multicast_lock);
- if (br->multicast_router == MDB_RTR_TYPE_DISABLED ||
- br->multicast_router == MDB_RTR_TYPE_PERM ||
- timer_pending(&br->multicast_router_timer))
+ spin_lock(&brmctx->br->multicast_lock);
+ if (brmctx->multicast_router == MDB_RTR_TYPE_DISABLED ||
+ brmctx->multicast_router == MDB_RTR_TYPE_PERM ||
+ br_ip4_multicast_is_router(brmctx) ||
+ br_ip6_multicast_is_router(brmctx))
goto out;
- br_mc_router_state_change(br, false);
+ br_mc_router_state_change(brmctx->br, false);
out:
- spin_unlock(&br->multicast_lock);
+ spin_unlock(&brmctx->br->multicast_lock);
}
-static void br_multicast_querier_expired(struct net_bridge *br,
+static void br_ip4_multicast_local_router_expired(struct timer_list *t)
+{
+ struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t,
+ ip4_mc_router_timer);
+
+ br_multicast_local_router_expired(brmctx, t);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_local_router_expired(struct timer_list *t)
+{
+ struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t,
+ ip6_mc_router_timer);
+
+ br_multicast_local_router_expired(brmctx, t);
+}
+#endif
+
+static void br_multicast_querier_expired(struct net_bridge_mcast *brmctx,
struct bridge_mcast_own_query *query)
{
- spin_lock(&br->multicast_lock);
- if (!netif_running(br->dev) || br->multicast_disabled)
+ spin_lock(&brmctx->br->multicast_lock);
+ if (!netif_running(brmctx->br->dev) ||
+ br_multicast_ctx_vlan_global_disabled(brmctx) ||
+ !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED))
goto out;
- br_multicast_start_querier(br, query);
+ br_multicast_start_querier(brmctx, query);
out:
- spin_unlock(&br->multicast_lock);
+ spin_unlock(&brmctx->br->multicast_lock);
}
static void br_ip4_multicast_querier_expired(struct timer_list *t)
{
- struct net_bridge *br = from_timer(br, t, ip4_other_query.timer);
+ struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t,
+ ip4_other_query.timer);
- br_multicast_querier_expired(br, &br->ip4_own_query);
+ br_multicast_querier_expired(brmctx, &brmctx->ip4_own_query);
}
#if IS_ENABLED(CONFIG_IPV6)
static void br_ip6_multicast_querier_expired(struct timer_list *t)
{
- struct net_bridge *br = from_timer(br, t, ip6_other_query.timer);
+ struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t,
+ ip6_other_query.timer);
- br_multicast_querier_expired(br, &br->ip6_own_query);
+ br_multicast_querier_expired(brmctx, &brmctx->ip6_own_query);
}
#endif
-static void br_multicast_select_own_querier(struct net_bridge *br,
+static void br_multicast_query_delay_expired(struct timer_list *t)
+{
+}
+
+static void br_multicast_select_own_querier(struct net_bridge_mcast *brmctx,
struct br_ip *ip,
struct sk_buff *skb)
{
if (ip->proto == htons(ETH_P_IP))
- br->ip4_querier.addr.u.ip4 = ip_hdr(skb)->saddr;
+ brmctx->ip4_querier.addr.src.ip4 = ip_hdr(skb)->saddr;
#if IS_ENABLED(CONFIG_IPV6)
else
- br->ip6_querier.addr.u.ip6 = ipv6_hdr(skb)->saddr;
+ brmctx->ip6_querier.addr.src.ip6 = ipv6_hdr(skb)->saddr;
#endif
}
-static void __br_multicast_send_query(struct net_bridge *br,
- struct net_bridge_port *port,
- struct br_ip *ip)
+static void __br_multicast_send_query(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg,
+ struct br_ip *ip_dst,
+ struct br_ip *group,
+ bool with_srcs,
+ u8 sflag,
+ bool *need_rexmit)
{
+ bool over_lmqt = !!sflag;
struct sk_buff *skb;
u8 igmp_type;
- skb = br_multicast_alloc_query(br, ip, &igmp_type);
+ if (!br_multicast_ctx_should_use(brmctx, pmctx) ||
+ !br_multicast_ctx_matches_vlan_snooping(brmctx))
+ return;
+
+again_under_lmqt:
+ skb = br_multicast_alloc_query(brmctx, pmctx, pg, ip_dst, group,
+ with_srcs, over_lmqt, sflag, &igmp_type,
+ need_rexmit);
if (!skb)
return;
- if (port) {
- skb->dev = port->dev;
- br_multicast_count(br, port, skb, igmp_type,
+ if (pmctx) {
+ skb->dev = pmctx->port->dev;
+ br_multicast_count(brmctx->br, pmctx->port, skb, igmp_type,
BR_MCAST_DIR_TX);
NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_OUT,
- dev_net(port->dev), NULL, skb, NULL, skb->dev,
+ dev_net(pmctx->port->dev), NULL, skb, NULL, skb->dev,
br_dev_queue_push_xmit);
+
+ if (over_lmqt && with_srcs && sflag) {
+ over_lmqt = false;
+ goto again_under_lmqt;
+ }
} else {
- br_multicast_select_own_querier(br, ip, skb);
- br_multicast_count(br, port, skb, igmp_type,
+ br_multicast_select_own_querier(brmctx, group, skb);
+ br_multicast_count(brmctx->br, NULL, skb, igmp_type,
BR_MCAST_DIR_RX);
netif_rx(skb);
}
}
-static void br_multicast_send_query(struct net_bridge *br,
- struct net_bridge_port *port,
+static void br_multicast_read_querier(const struct bridge_mcast_querier *querier,
+ struct bridge_mcast_querier *dest)
+{
+ unsigned int seq;
+
+ memset(dest, 0, sizeof(*dest));
+ do {
+ seq = read_seqcount_begin(&querier->seq);
+ dest->port_ifidx = querier->port_ifidx;
+ memcpy(&dest->addr, &querier->addr, sizeof(struct br_ip));
+ } while (read_seqcount_retry(&querier->seq, seq));
+}
+
+static void br_multicast_update_querier(struct net_bridge_mcast *brmctx,
+ struct bridge_mcast_querier *querier,
+ int ifindex,
+ struct br_ip *saddr)
+{
+ write_seqcount_begin(&querier->seq);
+ querier->port_ifidx = ifindex;
+ memcpy(&querier->addr, saddr, sizeof(*saddr));
+ write_seqcount_end(&querier->seq);
+}
+
+static void br_multicast_send_query(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
struct bridge_mcast_own_query *own_query)
{
struct bridge_mcast_other_query *other_query = NULL;
+ struct bridge_mcast_querier *querier;
struct br_ip br_group;
unsigned long time;
- if (!netif_running(br->dev) || br->multicast_disabled ||
- !br->multicast_querier)
+ if (!br_multicast_ctx_should_use(brmctx, pmctx) ||
+ !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED) ||
+ !brmctx->multicast_querier)
return;
- memset(&br_group.u, 0, sizeof(br_group.u));
+ memset(&br_group.dst, 0, sizeof(br_group.dst));
- if (port ? (own_query == &port->ip4_own_query) :
- (own_query == &br->ip4_own_query)) {
- other_query = &br->ip4_other_query;
+ if (pmctx ? (own_query == &pmctx->ip4_own_query) :
+ (own_query == &brmctx->ip4_own_query)) {
+ querier = &brmctx->ip4_querier;
+ other_query = &brmctx->ip4_other_query;
br_group.proto = htons(ETH_P_IP);
#if IS_ENABLED(CONFIG_IPV6)
} else {
- other_query = &br->ip6_other_query;
+ querier = &brmctx->ip6_querier;
+ other_query = &brmctx->ip6_other_query;
br_group.proto = htons(ETH_P_IPV6);
#endif
}
@@ -985,30 +1878,39 @@ static void br_multicast_send_query(struct net_bridge *br,
if (!other_query || timer_pending(&other_query->timer))
return;
- __br_multicast_send_query(br, port, &br_group);
+ /* we're about to select ourselves as querier */
+ if (!pmctx && querier->port_ifidx) {
+ struct br_ip zeroip = {};
+
+ br_multicast_update_querier(brmctx, querier, 0, &zeroip);
+ }
+
+ __br_multicast_send_query(brmctx, pmctx, NULL, NULL, &br_group, false,
+ 0, NULL);
time = jiffies;
- time += own_query->startup_sent < br->multicast_startup_query_count ?
- br->multicast_startup_query_interval :
- br->multicast_query_interval;
+ time += own_query->startup_sent < brmctx->multicast_startup_query_count ?
+ brmctx->multicast_startup_query_interval :
+ brmctx->multicast_query_interval;
mod_timer(&own_query->timer, time);
}
static void
-br_multicast_port_query_expired(struct net_bridge_port *port,
+br_multicast_port_query_expired(struct net_bridge_mcast_port *pmctx,
struct bridge_mcast_own_query *query)
{
- struct net_bridge *br = port->br;
+ struct net_bridge *br = pmctx->port->br;
+ struct net_bridge_mcast *brmctx;
spin_lock(&br->multicast_lock);
- if (port->state == BR_STATE_DISABLED ||
- port->state == BR_STATE_BLOCKING)
+ if (br_multicast_port_ctx_state_stopped(pmctx))
goto out;
- if (query->startup_sent < br->multicast_startup_query_count)
+ brmctx = br_multicast_port_ctx_get_global(pmctx);
+ if (query->startup_sent < brmctx->multicast_startup_query_count)
query->startup_sent++;
- br_multicast_send_query(port->br, port, query);
+ br_multicast_send_query(brmctx, pmctx, query);
out:
spin_unlock(&br->multicast_lock);
@@ -1016,45 +1918,131 @@ out:
static void br_ip4_multicast_port_query_expired(struct timer_list *t)
{
- struct net_bridge_port *port = from_timer(port, t, ip4_own_query.timer);
+ struct net_bridge_mcast_port *pmctx = timer_container_of(pmctx, t,
+ ip4_own_query.timer);
- br_multicast_port_query_expired(port, &port->ip4_own_query);
+ br_multicast_port_query_expired(pmctx, &pmctx->ip4_own_query);
}
#if IS_ENABLED(CONFIG_IPV6)
static void br_ip6_multicast_port_query_expired(struct timer_list *t)
{
- struct net_bridge_port *port = from_timer(port, t, ip6_own_query.timer);
+ struct net_bridge_mcast_port *pmctx = timer_container_of(pmctx, t,
+ ip6_own_query.timer);
- br_multicast_port_query_expired(port, &port->ip6_own_query);
+ br_multicast_port_query_expired(pmctx, &pmctx->ip6_own_query);
}
#endif
-static void br_mc_disabled_update(struct net_device *dev, bool value)
+static void br_multicast_port_group_rexmit(struct timer_list *t)
+{
+ struct net_bridge_port_group *pg = timer_container_of(pg, t,
+ rexmit_timer);
+ struct bridge_mcast_other_query *other_query = NULL;
+ struct net_bridge *br = pg->key.port->br;
+ struct net_bridge_mcast_port *pmctx;
+ struct net_bridge_mcast *brmctx;
+ bool need_rexmit = false;
+
+ spin_lock(&br->multicast_lock);
+ if (!netif_running(br->dev) || hlist_unhashed(&pg->mglist) ||
+ !br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ goto out;
+
+ pmctx = br_multicast_pg_to_port_ctx(pg);
+ if (!pmctx)
+ goto out;
+ brmctx = br_multicast_port_ctx_get_global(pmctx);
+ if (!brmctx->multicast_querier)
+ goto out;
+
+ if (pg->key.addr.proto == htons(ETH_P_IP))
+ other_query = &brmctx->ip4_other_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ other_query = &brmctx->ip6_other_query;
+#endif
+
+ if (!other_query || timer_pending(&other_query->timer))
+ goto out;
+
+ if (pg->grp_query_rexmit_cnt) {
+ pg->grp_query_rexmit_cnt--;
+ __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr,
+ &pg->key.addr, false, 1, NULL);
+ }
+ __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr,
+ &pg->key.addr, true, 0, &need_rexmit);
+
+ if (pg->grp_query_rexmit_cnt || need_rexmit)
+ mod_timer(&pg->rexmit_timer, jiffies +
+ brmctx->multicast_last_member_interval);
+out:
+ spin_unlock(&br->multicast_lock);
+}
+
+static int br_mc_disabled_update(struct net_device *dev, bool value,
+ struct netlink_ext_ack *extack)
{
struct switchdev_attr attr = {
.orig_dev = dev,
.id = SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED,
.flags = SWITCHDEV_F_DEFER,
- .u.mc_disabled = value,
+ .u.mc_disabled = !value,
};
- switchdev_port_attr_set(dev, &attr);
+ return switchdev_port_attr_set(dev, &attr, extack);
}
-int br_multicast_add_port(struct net_bridge_port *port)
+void br_multicast_port_ctx_init(struct net_bridge_port *port,
+ struct net_bridge_vlan *vlan,
+ struct net_bridge_mcast_port *pmctx)
{
- port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
-
- timer_setup(&port->multicast_router_timer,
- br_multicast_router_expired, 0);
- timer_setup(&port->ip4_own_query.timer,
+ pmctx->port = port;
+ pmctx->vlan = vlan;
+ pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
+ timer_setup(&pmctx->ip4_mc_router_timer,
+ br_ip4_multicast_router_expired, 0);
+ timer_setup(&pmctx->ip4_own_query.timer,
br_ip4_multicast_port_query_expired, 0);
#if IS_ENABLED(CONFIG_IPV6)
- timer_setup(&port->ip6_own_query.timer,
+ timer_setup(&pmctx->ip6_mc_router_timer,
+ br_ip6_multicast_router_expired, 0);
+ timer_setup(&pmctx->ip6_own_query.timer,
br_ip6_multicast_port_query_expired, 0);
#endif
- br_mc_disabled_update(port->dev, port->br->multicast_disabled);
+}
+
+void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx)
+{
+ struct net_bridge *br = pmctx->port->br;
+ bool del = false;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ timer_delete_sync(&pmctx->ip6_mc_router_timer);
+#endif
+ timer_delete_sync(&pmctx->ip4_mc_router_timer);
+
+ spin_lock_bh(&br->multicast_lock);
+ del |= br_ip6_multicast_rport_del(pmctx);
+ del |= br_ip4_multicast_rport_del(pmctx);
+ br_multicast_rport_del_notify(pmctx, del);
+ spin_unlock_bh(&br->multicast_lock);
+}
+
+int br_multicast_add_port(struct net_bridge_port *port)
+{
+ int err;
+
+ port->multicast_eht_hosts_limit = BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT;
+ br_multicast_port_ctx_init(port, NULL, &port->multicast_ctx);
+
+ err = br_mc_disabled_update(port->dev,
+ br_opt_get(port->br,
+ BROPT_MULTICAST_ENABLED),
+ NULL);
+ if (err && err != -EOPNOTSUPP)
+ return err;
port->mcast_stats = netdev_alloc_pcpu_stats(struct bridge_mcast_stats);
if (!port->mcast_stats)
@@ -1072,9 +2060,10 @@ void br_multicast_del_port(struct net_bridge_port *port)
/* Take care of the remaining groups, only perm ones should be left */
spin_lock_bh(&br->multicast_lock);
hlist_for_each_entry_safe(pg, n, &port->mglist, mglist)
- br_multicast_del_pg(br, pg);
+ br_multicast_find_del_pg(br, pg);
spin_unlock_bh(&br->multicast_lock);
- del_timer_sync(&port->multicast_router_timer);
+ flush_work(&br->mcast_gc_work);
+ br_multicast_port_ctx_deinit(&port->multicast_ctx);
free_percpu(port->mcast_stats);
}
@@ -1082,71 +2071,800 @@ static void br_multicast_enable(struct bridge_mcast_own_query *query)
{
query->startup_sent = 0;
- if (try_to_del_timer_sync(&query->timer) >= 0 ||
- del_timer(&query->timer))
+ if (timer_delete_sync_try(&query->timer) >= 0 ||
+ timer_delete(&query->timer))
mod_timer(&query->timer, jiffies);
}
-static void __br_multicast_enable_port(struct net_bridge_port *port)
+static void __br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx)
{
- struct net_bridge *br = port->br;
+ struct net_bridge *br = pmctx->port->br;
+ struct net_bridge_mcast *brmctx;
- if (br->multicast_disabled || !netif_running(br->dev))
+ brmctx = br_multicast_port_ctx_get_global(pmctx);
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED) ||
+ !netif_running(br->dev))
return;
- br_multicast_enable(&port->ip4_own_query);
+ br_multicast_enable(&pmctx->ip4_own_query);
#if IS_ENABLED(CONFIG_IPV6)
- br_multicast_enable(&port->ip6_own_query);
+ br_multicast_enable(&pmctx->ip6_own_query);
#endif
- if (port->multicast_router == MDB_RTR_TYPE_PERM &&
- hlist_unhashed(&port->rlist))
- br_multicast_add_router(br, port);
+ if (pmctx->multicast_router == MDB_RTR_TYPE_PERM) {
+ br_ip4_multicast_add_router(brmctx, pmctx);
+ br_ip6_multicast_add_router(brmctx, pmctx);
+ }
+
+ if (br_multicast_port_ctx_is_vlan(pmctx)) {
+ struct net_bridge_port_group *pg;
+ u32 n = 0;
+
+ /* The mcast_n_groups counter (mdb_n_entries) might be wrong. First,
+ * BR_VLFLAG_MCAST_ENABLED is toggled before temporary entries
+ * are flushed, so the counter right after the toggle does not
+ * reflect the true value. Second, permanent entries added
+ * while BR_VLFLAG_MCAST_ENABLED was cleared are not reflected
+ * either. Thus we recount this vlan's entries on the port.
+ */
+
+ hlist_for_each_entry(pg, &pmctx->port->mglist, mglist) {
+ if (pg->key.addr.vid == pmctx->vlan->vid)
+ n++;
+ }
+ WRITE_ONCE(pmctx->mdb_n_entries, n);
+ }
}
-void br_multicast_enable_port(struct net_bridge_port *port)
+static void br_multicast_enable_port_ctx(struct net_bridge_mcast_port *pmctx)
{
- struct net_bridge *br = port->br;
+ struct net_bridge *br = pmctx->port->br;
- spin_lock(&br->multicast_lock);
- __br_multicast_enable_port(port);
- spin_unlock(&br->multicast_lock);
+ spin_lock_bh(&br->multicast_lock);
+ if (br_multicast_port_ctx_is_vlan(pmctx) &&
+ !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) {
+ spin_unlock_bh(&br->multicast_lock);
+ return;
+ }
+ __br_multicast_enable_port_ctx(pmctx);
+ spin_unlock_bh(&br->multicast_lock);
}
-void br_multicast_disable_port(struct net_bridge_port *port)
+static void __br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx)
{
- struct net_bridge *br = port->br;
struct net_bridge_port_group *pg;
struct hlist_node *n;
+ bool del = false;
+
+ hlist_for_each_entry_safe(pg, n, &pmctx->port->mglist, mglist)
+ if (!(pg->flags & MDB_PG_FLAGS_PERMANENT) &&
+ (!br_multicast_port_ctx_is_vlan(pmctx) ||
+ pg->key.addr.vid == pmctx->vlan->vid))
+ br_multicast_find_del_pg(pmctx->port->br, pg);
+
+ del |= br_ip4_multicast_rport_del(pmctx);
+ timer_delete(&pmctx->ip4_mc_router_timer);
+ timer_delete(&pmctx->ip4_own_query.timer);
+ del |= br_ip6_multicast_rport_del(pmctx);
+#if IS_ENABLED(CONFIG_IPV6)
+ timer_delete(&pmctx->ip6_mc_router_timer);
+ timer_delete(&pmctx->ip6_own_query.timer);
+#endif
+ br_multicast_rport_del_notify(pmctx, del);
+}
- spin_lock(&br->multicast_lock);
- hlist_for_each_entry_safe(pg, n, &port->mglist, mglist)
- if (!(pg->flags & MDB_PG_FLAGS_PERMANENT))
- br_multicast_del_pg(br, pg);
+static void br_multicast_disable_port_ctx(struct net_bridge_mcast_port *pmctx)
+{
+ struct net_bridge *br = pmctx->port->br;
+
+ spin_lock_bh(&br->multicast_lock);
+ if (br_multicast_port_ctx_is_vlan(pmctx) &&
+ !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED)) {
+ spin_unlock_bh(&br->multicast_lock);
+ return;
+ }
+
+ __br_multicast_disable_port_ctx(pmctx);
+ spin_unlock_bh(&br->multicast_lock);
+}
- __del_port_router(port);
+static void br_multicast_toggle_port(struct net_bridge_port *port, bool on)
+{
+#if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING)
+ if (br_opt_get(port->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) {
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *vlan;
+
+ rcu_read_lock();
+ vg = nbp_vlan_group_rcu(port);
+ if (!vg) {
+ rcu_read_unlock();
+ return;
+ }
+
+ /* iterate each vlan, toggle vlan multicast context */
+ list_for_each_entry_rcu(vlan, &vg->vlan_list, vlist) {
+ struct net_bridge_mcast_port *pmctx =
+ &vlan->port_mcast_ctx;
+ u8 state = br_vlan_get_state(vlan);
+ /* enable vlan multicast context only when toggling on and the
+ * vlan state is LEARNING or FORWARDING, otherwise disable it
+ */
+ if (on && br_vlan_state_allowed(state, true))
+ br_multicast_enable_port_ctx(pmctx);
+ else
+ br_multicast_disable_port_ctx(pmctx);
+ }
+ rcu_read_unlock();
+ return;
+ }
+#endif
+ /* toggle port multicast context when vlan snooping is disabled */
+ if (on)
+ br_multicast_enable_port_ctx(&port->multicast_ctx);
+ else
+ br_multicast_disable_port_ctx(&port->multicast_ctx);
+}
- del_timer(&port->multicast_router_timer);
- del_timer(&port->ip4_own_query.timer);
+void br_multicast_enable_port(struct net_bridge_port *port)
+{
+ br_multicast_toggle_port(port, true);
+}
+
+void br_multicast_disable_port(struct net_bridge_port *port)
+{
+ br_multicast_toggle_port(port, false);
+}
+
+static int __grp_src_delete_marked(struct net_bridge_port_group *pg)
+{
+ struct net_bridge_group_src *ent;
+ struct hlist_node *tmp;
+ int deleted = 0;
+
+ hlist_for_each_entry_safe(ent, tmp, &pg->src_list, node)
+ if (ent->flags & BR_SGRP_F_DELETE) {
+ br_multicast_del_group_src(ent, false);
+ deleted++;
+ }
+
+ return deleted;
+}
+
+static void __grp_src_mod_timer(struct net_bridge_group_src *src,
+ unsigned long expires)
+{
+ mod_timer(&src->timer, expires);
+ br_multicast_fwd_src_handle(src);
+}
+
+static void __grp_src_query_marked_and_rexmit(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg)
+{
+ struct bridge_mcast_other_query *other_query = NULL;
+ u32 lmqc = brmctx->multicast_last_member_count;
+ unsigned long lmqt, lmi, now = jiffies;
+ struct net_bridge_group_src *ent;
+
+ if (!netif_running(brmctx->br->dev) ||
+ !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED))
+ return;
+
+ if (pg->key.addr.proto == htons(ETH_P_IP))
+ other_query = &brmctx->ip4_other_query;
#if IS_ENABLED(CONFIG_IPV6)
- del_timer(&port->ip6_own_query.timer);
+ else
+ other_query = &brmctx->ip6_other_query;
#endif
- spin_unlock(&br->multicast_lock);
+
+ lmqt = now + br_multicast_lmqt(brmctx);
+ hlist_for_each_entry(ent, &pg->src_list, node) {
+ if (ent->flags & BR_SGRP_F_SEND) {
+ ent->flags &= ~BR_SGRP_F_SEND;
+ if (ent->timer.expires > lmqt) {
+ if (brmctx->multicast_querier &&
+ other_query &&
+ !timer_pending(&other_query->timer))
+ ent->src_query_rexmit_cnt = lmqc;
+ __grp_src_mod_timer(ent, lmqt);
+ }
+ }
+ }
+
+ if (!brmctx->multicast_querier ||
+ !other_query || timer_pending(&other_query->timer))
+ return;
+
+ __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr,
+ &pg->key.addr, true, 1, NULL);
+
+ lmi = now + brmctx->multicast_last_member_interval;
+ if (!timer_pending(&pg->rexmit_timer) ||
+ time_after(pg->rexmit_timer.expires, lmi))
+ mod_timer(&pg->rexmit_timer, lmi);
+}
+
+static void __grp_send_query_and_rexmit(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg)
+{
+ struct bridge_mcast_other_query *other_query = NULL;
+ unsigned long now = jiffies, lmi;
+
+ if (!netif_running(brmctx->br->dev) ||
+ !br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED))
+ return;
+
+ if (pg->key.addr.proto == htons(ETH_P_IP))
+ other_query = &brmctx->ip4_other_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ other_query = &brmctx->ip6_other_query;
+#endif
+
+ if (brmctx->multicast_querier &&
+ other_query && !timer_pending(&other_query->timer)) {
+ lmi = now + brmctx->multicast_last_member_interval;
+ pg->grp_query_rexmit_cnt = brmctx->multicast_last_member_count - 1;
+ __br_multicast_send_query(brmctx, pmctx, pg, &pg->key.addr,
+ &pg->key.addr, false, 0, NULL);
+ if (!timer_pending(&pg->rexmit_timer) ||
+ time_after(pg->rexmit_timer.expires, lmi))
+ mod_timer(&pg->rexmit_timer, lmi);
+ }
+
+ if (pg->filter_mode == MCAST_EXCLUDE &&
+ (!timer_pending(&pg->timer) ||
+ time_after(pg->timer.expires, now + br_multicast_lmqt(brmctx))))
+ mod_timer(&pg->timer, now + br_multicast_lmqt(brmctx));
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) IS_IN (B) INCLUDE (A+B) (B)=GMI
+ * INCLUDE (A) ALLOW (B) INCLUDE (A+B) (B)=GMI
+ * EXCLUDE (X,Y) ALLOW (A) EXCLUDE (X+A,Y-A) (A)=GMI
+ */
+static bool br_multicast_isinc_allow(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ bool changed = false;
+ struct br_ip src_ip;
+ u32 src_idx;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (!ent) {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent)
+ changed = true;
+ }
+
+ if (ent)
+ __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx));
+ }
+
+ if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type))
+ changed = true;
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) IS_EX (B) EXCLUDE (A*B,B-A) (B-A)=0
+ * Delete (A-B)
+ * Group Timer=GMI
+ */
+static void __grp_src_isexc_incl(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ struct net_bridge_group_src *ent;
+ struct br_ip src_ip;
+ u32 src_idx;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags |= BR_SGRP_F_DELETE;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent)
+ ent->flags &= ~BR_SGRP_F_DELETE;
+ else
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent)
+ br_multicast_fwd_src_handle(ent);
+ }
+
+ br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type);
+
+ __grp_src_delete_marked(pg);
+}
+
+/* State Msg type New state Actions
+ * EXCLUDE (X,Y) IS_EX (A) EXCLUDE (A-Y,Y*A) (A-X-Y)=GMI
+ * Delete (X-A)
+ * Delete (Y-A)
+ * Group Timer=GMI
+ */
+static bool __grp_src_isexc_excl(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ bool changed = false;
+ struct br_ip src_ip;
+ u32 src_idx;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags |= BR_SGRP_F_DELETE;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags &= ~BR_SGRP_F_DELETE;
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent) {
+ __grp_src_mod_timer(ent,
+ now + br_multicast_gmi(brmctx));
+ changed = true;
+ }
+ }
+ }
+
+ if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type))
+ changed = true;
+
+ if (__grp_src_delete_marked(pg))
+ changed = true;
+
+ return changed;
+}
+
+static bool br_multicast_isexc(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ bool changed = false;
+
+ switch (pg->filter_mode) {
+ case MCAST_INCLUDE:
+ __grp_src_isexc_incl(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type);
+ br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE);
+ changed = true;
+ break;
+ case MCAST_EXCLUDE:
+ changed = __grp_src_isexc_excl(brmctx, pg, h_addr, srcs, nsrcs,
+ addr_size, grec_type);
+ break;
+ }
+
+ pg->filter_mode = MCAST_EXCLUDE;
+ mod_timer(&pg->timer, jiffies + br_multicast_gmi(brmctx));
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) TO_IN (B) INCLUDE (A+B) (B)=GMI
+ * Send Q(G,A-B)
+ */
+static bool __grp_src_toin_incl(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ u32 src_idx, to_send = pg->src_ents;
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags |= BR_SGRP_F_SEND;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags &= ~BR_SGRP_F_SEND;
+ to_send--;
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent)
+ changed = true;
+ }
+ if (ent)
+ __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx));
+ }
+
+ if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type))
+ changed = true;
+
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg);
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * EXCLUDE (X,Y) TO_IN (A) EXCLUDE (X+A,Y-A) (A)=GMI
+ * Send Q(G,X-A)
+ * Send Q(G)
+ */
+static bool __grp_src_toin_excl(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ u32 src_idx, to_send = pg->src_ents;
+ struct net_bridge_group_src *ent;
+ unsigned long now = jiffies;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ if (timer_pending(&ent->timer))
+ ent->flags |= BR_SGRP_F_SEND;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ if (timer_pending(&ent->timer)) {
+ ent->flags &= ~BR_SGRP_F_SEND;
+ to_send--;
+ }
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent)
+ changed = true;
+ }
+ if (ent)
+ __grp_src_mod_timer(ent, now + br_multicast_gmi(brmctx));
+ }
+
+ if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type))
+ changed = true;
+
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg);
+
+ __grp_send_query_and_rexmit(brmctx, pmctx, pg);
+
+ return changed;
+}
+
+static bool br_multicast_toin(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{ /* Process a TO_IN source record for @pg by calling the handler that
+ * matches pg->filter_mode (INCLUDE or EXCLUDE). Afterwards, if
+ * br_multicast_eht_should_del_pg() is true, the port group is flagged
+ * MDB_PG_FLAGS_FAST_LEAVE and deleted. Returns the handler's "changed"
+ * result, or false when the port group was deleted here.
+ */
+ bool changed = false;
+
+ switch (pg->filter_mode) {
+ case MCAST_INCLUDE:
+ changed = __grp_src_toin_incl(brmctx, pmctx, pg, h_addr, srcs,
+ nsrcs, addr_size, grec_type);
+ break;
+ case MCAST_EXCLUDE:
+ changed = __grp_src_toin_excl(brmctx, pmctx, pg, h_addr, srcs,
+ nsrcs, addr_size, grec_type);
+ break;
+ }
+
+ if (br_multicast_eht_should_del_pg(pg)) {
+ pg->flags |= MDB_PG_FLAGS_FAST_LEAVE;
+ br_multicast_find_del_pg(pg->key.port->br, pg);
+ /* a notification has already been sent and we shouldn't
+ * access pg after the delete so we have to return false
+ */
+ changed = false;
+ }
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * INCLUDE (A) TO_EX (B) EXCLUDE (A*B,B-A) (B-A)=0
+ * Delete (A-B)
+ * Send Q(G,A*B)
+ * Group Timer=GMI
+ *
+ * Handles a TO_EX record for a port group that is in INCLUDE mode.
+ * @srcs is read as @nsrcs consecutive addresses of @addr_size bytes each.
+ * All existing sources are marked BR_SGRP_F_DELETE and lose
+ * BR_SGRP_F_SEND. A reported source that already exists swaps DELETE for
+ * SEND and is counted in to_send; an unknown one is created.
+ * br_multicast_fwd_src_handle() is called on every reported source that
+ * was found or created. The return value of br_multicast_eht_handle() is
+ * ignored here. Sources still marked DELETE are then removed through
+ * __grp_src_delete_marked(), and if any source was marked SEND the marked
+ * sources are queried. Neither pg->filter_mode nor pg->timer is modified
+ * in this function.
+ */
+static void __grp_src_toex_incl(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ struct net_bridge_group_src *ent;
+ u32 src_idx, to_send = 0;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags = (ent->flags & ~BR_SGRP_F_DELETE) |
+ BR_SGRP_F_SEND;
+ to_send++;
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ }
+ if (ent)
+ br_multicast_fwd_src_handle(ent);
+ }
+
+ br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type);
+
+ __grp_src_delete_marked(pg);
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg);
+}
+
+/* State Msg type New state Actions
+ * EXCLUDE (X,Y) TO_EX (A) EXCLUDE (A-Y,Y*A) (A-X-Y)=Group Timer
+ * Delete (X-A)
+ * Delete (Y-A)
+ * Send Q(G,A-Y)
+ * Group Timer=GMI
+ *
+ * Handles a TO_EX record for a port group that is already in EXCLUDE
+ * mode. @srcs is read as @nsrcs consecutive addresses of @addr_size bytes
+ * each. All existing sources are marked BR_SGRP_F_DELETE and lose
+ * BR_SGRP_F_SEND. A reported source that exists loses the DELETE mark; an
+ * unknown one is created with its timer set to pg->timer.expires. Every
+ * reported source whose timer is pending is marked SEND and counted.
+ * Sources still marked DELETE are removed, then the SEND-marked sources
+ * are queried if there are any.
+ *
+ * Returns true when a source was created, br_multicast_eht_handle()
+ * returned non-zero, or __grp_src_delete_marked() returned non-zero.
+ */
+static bool __grp_src_toex_excl(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{
+ struct net_bridge_group_src *ent;
+ u32 src_idx, to_send = 0;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags = (ent->flags & ~BR_SGRP_F_SEND) | BR_SGRP_F_DELETE;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags &= ~BR_SGRP_F_DELETE;
+ } else {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent) {
+ __grp_src_mod_timer(ent, pg->timer.expires);
+ changed = true;
+ }
+ }
+ if (ent && timer_pending(&ent->timer)) {
+ ent->flags |= BR_SGRP_F_SEND;
+ to_send++;
+ }
+ }
+
+ if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type))
+ changed = true;
+
+ if (__grp_src_delete_marked(pg))
+ changed = true;
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg);
+
+ return changed;
+}
+
+static bool br_multicast_toex(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size,
+ int grec_type)
+{ /* Process a TO_EX source record for @pg. In INCLUDE mode the INCLUDE
+ * handler runs, br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE) is
+ * called and the result is always "changed"; in EXCLUDE mode the
+ * EXCLUDE handler's result is used. In both cases the port group ends
+ * up in MCAST_EXCLUDE mode with pg->timer re-armed to
+ * jiffies + br_multicast_gmi(brmctx). Returns the "changed" flag.
+ */
+ bool changed = false;
+
+ switch (pg->filter_mode) {
+ case MCAST_INCLUDE:
+ __grp_src_toex_incl(brmctx, pmctx, pg, h_addr, srcs, nsrcs,
+ addr_size, grec_type);
+ br_multicast_star_g_handle_mode(pg, MCAST_EXCLUDE);
+ changed = true;
+ break;
+ case MCAST_EXCLUDE:
+ changed = __grp_src_toex_excl(brmctx, pmctx, pg, h_addr, srcs,
+ nsrcs, addr_size, grec_type);
+ break;
+ }
+
+ pg->filter_mode = MCAST_EXCLUDE;
+ mod_timer(&pg->timer, jiffies + br_multicast_gmi(brmctx));
+
+ return changed;
}
-static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
- struct net_bridge_port *port,
+/* State Msg type New state Actions
+ * INCLUDE (A) BLOCK (B) INCLUDE (A) Send Q(G,A*B)
+ *
+ * Handles a BLOCK record for a port group that is in INCLUDE mode.
+ * @srcs is read as @nsrcs consecutive addresses of @addr_size bytes each.
+ * BR_SGRP_F_SEND is cleared on all existing sources, then set on every
+ * reported source that already exists; unknown sources are not created.
+ * If at least one source was marked, the marked sources are queried.
+ *
+ * Returns true only when br_multicast_eht_handle() returned non-zero.
+ */
+static bool __grp_src_block_incl(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size, int grec_type)
+{
+ struct net_bridge_group_src *ent;
+ u32 src_idx, to_send = 0;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags &= ~BR_SGRP_F_SEND;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (ent) {
+ ent->flags |= BR_SGRP_F_SEND;
+ to_send++;
+ }
+ }
+
+ if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type))
+ changed = true;
+
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg);
+
+ return changed;
+}
+
+/* State Msg type New state Actions
+ * EXCLUDE (X,Y) BLOCK (A) EXCLUDE (X+(A-Y),Y) (A-X-Y)=Group Timer
+ * Send Q(G,A-Y)
+ *
+ * Handles a BLOCK record for a port group that is in EXCLUDE mode.
+ * @srcs is read as @nsrcs consecutive addresses of @addr_size bytes each.
+ * BR_SGRP_F_SEND is cleared on all existing sources. A reported source
+ * that is unknown is created with its timer set to pg->timer.expires.
+ * Every reported source whose timer is pending is marked SEND and
+ * counted; if any were marked, the marked sources are queried.
+ *
+ * Returns true when a source was created or br_multicast_eht_handle()
+ * returned non-zero.
+ */
+static bool __grp_src_block_excl(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size, int grec_type)
+{
+ struct net_bridge_group_src *ent;
+ u32 src_idx, to_send = 0;
+ bool changed = false;
+ struct br_ip src_ip;
+
+ hlist_for_each_entry(ent, &pg->src_list, node)
+ ent->flags &= ~BR_SGRP_F_SEND;
+
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto;
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&src_ip.src, srcs + (src_idx * addr_size), addr_size);
+ ent = br_multicast_find_group_src(pg, &src_ip);
+ if (!ent) {
+ ent = br_multicast_new_group_src(pg, &src_ip);
+ if (ent) {
+ __grp_src_mod_timer(ent, pg->timer.expires);
+ changed = true;
+ }
+ }
+ if (ent && timer_pending(&ent->timer)) {
+ ent->flags |= BR_SGRP_F_SEND;
+ to_send++;
+ }
+ }
+
+ if (br_multicast_eht_handle(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ grec_type))
+ changed = true;
+
+ if (to_send)
+ __grp_src_query_marked_and_rexmit(brmctx, pmctx, pg);
+
+ return changed;
+}
+
+static bool br_multicast_block(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct net_bridge_port_group *pg, void *h_addr,
+ void *srcs, u32 nsrcs, size_t addr_size, int grec_type)
+{ /* Process a BLOCK source record for @pg by calling the handler that
+ * matches pg->filter_mode. Afterwards the port group is deleted when
+ * it is in INCLUDE mode with an empty source list, or when
+ * br_multicast_eht_should_del_pg() is true; only in the latter case
+ * is MDB_PG_FLAGS_FAST_LEAVE set before the delete. Returns the
+ * handler's "changed" result, or false when the group was deleted.
+ */
+ bool changed = false;
+
+ switch (pg->filter_mode) {
+ case MCAST_INCLUDE:
+ changed = __grp_src_block_incl(brmctx, pmctx, pg, h_addr, srcs,
+ nsrcs, addr_size, grec_type);
+ break;
+ case MCAST_EXCLUDE:
+ changed = __grp_src_block_excl(brmctx, pmctx, pg, h_addr, srcs,
+ nsrcs, addr_size, grec_type);
+ break;
+ }
+
+ if ((pg->filter_mode == MCAST_INCLUDE && hlist_empty(&pg->src_list)) ||
+ br_multicast_eht_should_del_pg(pg)) {
+ if (br_multicast_eht_should_del_pg(pg))
+ pg->flags |= MDB_PG_FLAGS_FAST_LEAVE;
+ br_multicast_find_del_pg(pg->key.port->br, pg);
+ /* a notification has already been sent and we shouldn't
+ * access pg after the delete so we have to return false
+ */
+ changed = false;
+ }
+
+ return changed;
+}
+
+static struct net_bridge_port_group *
+br_multicast_find_port(struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port *p,
+ const unsigned char *src)
+{ /* Walk the port-group list of @mp (mp->ports, then pg->next, each read
+ * through mlock_dereference()) and return the first entry for which
+ * br_port_group_equal(pg, p, src) is true, or NULL if none matches.
+ * br is only used by mlock_dereference(), hence __maybe_unused.
+ */
+ struct net_bridge *br __maybe_unused = mp->br;
+ struct net_bridge_port_group *pg;
+
+ for (pg = mlock_dereference(mp->ports, br);
+ pg;
+ pg = mlock_dereference(pg->next, br))
+ if (br_port_group_equal(pg, p, src))
+ return pg;
+
+ return NULL;
+}
+
+static int br_ip4_multicast_igmp3_report(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
struct sk_buff *skb,
u16 vid)
{
+ bool igmpv2 = brmctx->multicast_igmp_version == 2;
+ struct net_bridge_mdb_entry *mdst;
+ struct net_bridge_port_group *pg;
const unsigned char *src;
struct igmpv3_report *ih;
struct igmpv3_grec *grec;
- int i;
- int len;
- int num;
- int type;
+ int i, len, num, type;
+ __be32 group, *h_addr;
+ bool changed = false;
int err = 0;
- __be32 group;
+ u16 nsrcs;
ih = igmpv3_report_hdr(skb);
num = ntohs(ih->ngrec);
@@ -1154,18 +2872,18 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
for (i = 0; i < num; i++) {
len += sizeof(*grec);
- if (!pskb_may_pull(skb, len))
+ if (!ip_mc_may_pull(skb, len))
return -EINVAL;
grec = (void *)(skb->data + len - sizeof(*grec));
group = grec->grec_mca;
type = grec->grec_type;
+ nsrcs = ntohs(grec->grec_nsrcs);
- len += ntohs(grec->grec_nsrcs) * 4;
- if (!pskb_may_pull(skb, len))
+ len += nsrcs * 4;
+ if (!ip_mc_may_pull(skb, len))
return -EINVAL;
- /* We treat this as an IGMPv2 report for now. */
switch (type) {
case IGMPV3_MODE_IS_INCLUDE:
case IGMPV3_MODE_IS_EXCLUDE:
@@ -1180,62 +2898,128 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge *br,
}
src = eth_hdr(skb)->h_source;
- if ((type == IGMPV3_CHANGE_TO_INCLUDE ||
- type == IGMPV3_MODE_IS_INCLUDE) &&
- ntohs(grec->grec_nsrcs) == 0) {
- br_ip4_multicast_leave_group(br, port, group, vid, src);
+ if (nsrcs == 0 &&
+ (type == IGMPV3_CHANGE_TO_INCLUDE ||
+ type == IGMPV3_MODE_IS_INCLUDE)) {
+ if (!pmctx || igmpv2) {
+ br_ip4_multicast_leave_group(brmctx, pmctx,
+ group, vid, src);
+ continue;
+ }
} else {
- err = br_ip4_multicast_add_group(br, port, group, vid,
- src);
+ err = br_ip4_multicast_add_group(brmctx, pmctx, group,
+ vid, src, igmpv2);
if (err)
break;
}
+
+ if (!pmctx || igmpv2)
+ continue;
+
+ spin_lock(&brmctx->br->multicast_lock);
+ if (!br_multicast_ctx_should_use(brmctx, pmctx))
+ goto unlock_continue;
+
+ mdst = br_mdb_ip4_get(brmctx->br, group, vid);
+ if (!mdst)
+ goto unlock_continue;
+ pg = br_multicast_find_port(mdst, pmctx->port, src);
+ if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT))
+ goto unlock_continue;
+ /* reload grec and host addr */
+ grec = (void *)(skb->data + len - sizeof(*grec) - (nsrcs * 4));
+ h_addr = &ip_hdr(skb)->saddr;
+ switch (type) {
+ case IGMPV3_ALLOW_NEW_SOURCES:
+ changed = br_multicast_isinc_allow(brmctx, pg, h_addr,
+ grec->grec_src,
+ nsrcs, sizeof(__be32), type);
+ break;
+ case IGMPV3_MODE_IS_INCLUDE:
+ changed = br_multicast_isinc_allow(brmctx, pg, h_addr,
+ grec->grec_src,
+ nsrcs, sizeof(__be32), type);
+ break;
+ case IGMPV3_MODE_IS_EXCLUDE:
+ changed = br_multicast_isexc(brmctx, pg, h_addr,
+ grec->grec_src,
+ nsrcs, sizeof(__be32), type);
+ break;
+ case IGMPV3_CHANGE_TO_INCLUDE:
+ changed = br_multicast_toin(brmctx, pmctx, pg, h_addr,
+ grec->grec_src,
+ nsrcs, sizeof(__be32), type);
+ break;
+ case IGMPV3_CHANGE_TO_EXCLUDE:
+ changed = br_multicast_toex(brmctx, pmctx, pg, h_addr,
+ grec->grec_src,
+ nsrcs, sizeof(__be32), type);
+ break;
+ case IGMPV3_BLOCK_OLD_SOURCES:
+ changed = br_multicast_block(brmctx, pmctx, pg, h_addr,
+ grec->grec_src,
+ nsrcs, sizeof(__be32), type);
+ break;
+ }
+ if (changed)
+ br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB);
+unlock_continue:
+ spin_unlock(&brmctx->br->multicast_lock);
}
return err;
}
#if IS_ENABLED(CONFIG_IPV6)
-static int br_ip6_multicast_mld2_report(struct net_bridge *br,
- struct net_bridge_port *port,
+static int br_ip6_multicast_mld2_report(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
struct sk_buff *skb,
u16 vid)
{
+ bool mldv1 = brmctx->multicast_mld_version == 1;
+ struct net_bridge_mdb_entry *mdst;
+ struct net_bridge_port_group *pg;
+ unsigned int nsrcs_offset;
+ struct mld2_report *mld2r;
const unsigned char *src;
- struct icmp6hdr *icmp6h;
+ struct in6_addr *h_addr;
struct mld2_grec *grec;
- int i;
- int len;
- int num;
+ unsigned int grec_len;
+ bool changed = false;
+ int i, len, num;
int err = 0;
- if (!pskb_may_pull(skb, sizeof(*icmp6h)))
+ if (!ipv6_mc_may_pull(skb, sizeof(*mld2r)))
return -EINVAL;
- icmp6h = icmp6_hdr(skb);
- num = ntohs(icmp6h->icmp6_dataun.un_data16[1]);
- len = skb_transport_offset(skb) + sizeof(*icmp6h);
+ mld2r = (struct mld2_report *)icmp6_hdr(skb);
+ num = ntohs(mld2r->mld2r_ngrec);
+ len = skb_transport_offset(skb) + sizeof(*mld2r);
for (i = 0; i < num; i++) {
- __be16 *nsrcs, _nsrcs;
+ __be16 *_nsrcs, __nsrcs;
+ u16 nsrcs;
+
+ nsrcs_offset = len + offsetof(struct mld2_grec, grec_nsrcs);
+
+ if (skb_transport_offset(skb) + ipv6_transport_len(skb) <
+ nsrcs_offset + sizeof(__nsrcs))
+ return -EINVAL;
- nsrcs = skb_header_pointer(skb,
- len + offsetof(struct mld2_grec,
- grec_nsrcs),
- sizeof(_nsrcs), &_nsrcs);
- if (!nsrcs)
+ _nsrcs = skb_header_pointer(skb, nsrcs_offset,
+ sizeof(__nsrcs), &__nsrcs);
+ if (!_nsrcs)
return -EINVAL;
- if (!pskb_may_pull(skb,
- len + sizeof(*grec) +
- sizeof(struct in6_addr) * ntohs(*nsrcs)))
+ nsrcs = ntohs(*_nsrcs);
+ grec_len = struct_size(grec, grec_src, nsrcs);
+
+ if (!ipv6_mc_may_pull(skb, len + grec_len))
return -EINVAL;
grec = (struct mld2_grec *)(skb->data + len);
- len += sizeof(*grec) +
- sizeof(struct in6_addr) * ntohs(*nsrcs);
+ len += grec_len;
- /* We treat these as MLDv1 reports for now. */
switch (grec->grec_type) {
case MLD2_MODE_IS_INCLUDE:
case MLD2_MODE_IS_EXCLUDE:
@@ -1252,96 +3036,244 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
src = eth_hdr(skb)->h_source;
if ((grec->grec_type == MLD2_CHANGE_TO_INCLUDE ||
grec->grec_type == MLD2_MODE_IS_INCLUDE) &&
- ntohs(*nsrcs) == 0) {
- br_ip6_multicast_leave_group(br, port, &grec->grec_mca,
- vid, src);
+ nsrcs == 0) {
+ if (!pmctx || mldv1) {
+ br_ip6_multicast_leave_group(brmctx, pmctx,
+ &grec->grec_mca,
+ vid, src);
+ continue;
+ }
} else {
- err = br_ip6_multicast_add_group(br, port,
+ err = br_ip6_multicast_add_group(brmctx, pmctx,
&grec->grec_mca, vid,
- src);
+ src, mldv1);
if (err)
break;
}
+
+ if (!pmctx || mldv1)
+ continue;
+
+ spin_lock(&brmctx->br->multicast_lock);
+ if (!br_multicast_ctx_should_use(brmctx, pmctx))
+ goto unlock_continue;
+
+ mdst = br_mdb_ip6_get(brmctx->br, &grec->grec_mca, vid);
+ if (!mdst)
+ goto unlock_continue;
+ pg = br_multicast_find_port(mdst, pmctx->port, src);
+ if (!pg || (pg->flags & MDB_PG_FLAGS_PERMANENT))
+ goto unlock_continue;
+ h_addr = &ipv6_hdr(skb)->saddr;
+ switch (grec->grec_type) {
+ case MLD2_ALLOW_NEW_SOURCES:
+ changed = br_multicast_isinc_allow(brmctx, pg, h_addr,
+ grec->grec_src, nsrcs,
+ sizeof(struct in6_addr),
+ grec->grec_type);
+ break;
+ case MLD2_MODE_IS_INCLUDE:
+ changed = br_multicast_isinc_allow(brmctx, pg, h_addr,
+ grec->grec_src, nsrcs,
+ sizeof(struct in6_addr),
+ grec->grec_type);
+ break;
+ case MLD2_MODE_IS_EXCLUDE:
+ changed = br_multicast_isexc(brmctx, pg, h_addr,
+ grec->grec_src, nsrcs,
+ sizeof(struct in6_addr),
+ grec->grec_type);
+ break;
+ case MLD2_CHANGE_TO_INCLUDE:
+ changed = br_multicast_toin(brmctx, pmctx, pg, h_addr,
+ grec->grec_src, nsrcs,
+ sizeof(struct in6_addr),
+ grec->grec_type);
+ break;
+ case MLD2_CHANGE_TO_EXCLUDE:
+ changed = br_multicast_toex(brmctx, pmctx, pg, h_addr,
+ grec->grec_src, nsrcs,
+ sizeof(struct in6_addr),
+ grec->grec_type);
+ break;
+ case MLD2_BLOCK_OLD_SOURCES:
+ changed = br_multicast_block(brmctx, pmctx, pg, h_addr,
+ grec->grec_src, nsrcs,
+ sizeof(struct in6_addr),
+ grec->grec_type);
+ break;
+ }
+ if (changed)
+ br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB);
+unlock_continue:
+ spin_unlock(&brmctx->br->multicast_lock);
}
return err;
}
#endif
-static bool br_ip4_multicast_select_querier(struct net_bridge *br,
- struct net_bridge_port *port,
- __be32 saddr)
+static bool br_multicast_select_querier(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct br_ip *saddr)
{ /* Decide whether @saddr should be recorded as the querier for its
+ * protocol (saddr->proto). IPv4: update when no querier address is
+ * recorded or when saddr, compared in host byte order, is <= the
+ * recorded address. IPv6: update when ipv6_addr_cmp(saddr, recorded)
+ * is <= 0. If neither test hits, the update still happens when
+ * neither the own-query nor the other-query timer of that protocol
+ * is pending. Any other protocol returns false without an update.
+ * On update, br_multicast_update_querier() is called with the ifindex
+ * of the port device (0 when @pmctx is NULL) and true is returned.
+ */
- if (!timer_pending(&br->ip4_own_query.timer) &&
- !timer_pending(&br->ip4_other_query.timer))
- goto update;
+ int port_ifidx = pmctx ? pmctx->port->dev->ifindex : 0;
+ struct timer_list *own_timer, *other_timer;
+ struct bridge_mcast_querier *querier;
- if (!br->ip4_querier.addr.u.ip4)
- goto update;
+ switch (saddr->proto) {
+ case htons(ETH_P_IP):
+ querier = &brmctx->ip4_querier;
+ own_timer = &brmctx->ip4_own_query.timer;
+ other_timer = &brmctx->ip4_other_query.timer;
+ if (!querier->addr.src.ip4 ||
+ ntohl(saddr->src.ip4) <= ntohl(querier->addr.src.ip4))
+ goto update;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ querier = &brmctx->ip6_querier;
+ own_timer = &brmctx->ip6_own_query.timer;
+ other_timer = &brmctx->ip6_other_query.timer;
+ if (ipv6_addr_cmp(&saddr->src.ip6, &querier->addr.src.ip6) <= 0)
+ goto update;
+ break;
+#endif
+ default:
+ return false;
+ }
- if (ntohl(saddr) <= ntohl(br->ip4_querier.addr.u.ip4))
+ if (!timer_pending(own_timer) && !timer_pending(other_timer))
goto update;
return false;
update:
- br->ip4_querier.addr.u.ip4 = saddr;
-
- /* update protected by general multicast_lock by caller */
- rcu_assign_pointer(br->ip4_querier.port, port);
+ br_multicast_update_querier(brmctx, querier, port_ifidx, saddr);
return true;
}
-#if IS_ENABLED(CONFIG_IPV6)
-static bool br_ip6_multicast_select_querier(struct net_bridge *br,
- struct net_bridge_port *port,
- struct in6_addr *saddr)
+static struct net_bridge_port *
+__br_multicast_get_querier_port(struct net_bridge *br,
+ const struct bridge_mcast_querier *querier)
{ /* Resolve querier->port_ifidx (read once with READ_ONCE()) back to a
+ * bridge port. Returns NULL when the index is 0, when no device with
+ * that index exists in the network namespace of br->dev, or when the
+ * device is not a port of @br. The lookups used are the _rcu
+ * variants (dev_get_by_index_rcu(), br_port_get_rtnl_rcu()).
+ */
- if (!timer_pending(&br->ip6_own_query.timer) &&
- !timer_pending(&br->ip6_other_query.timer))
- goto update;
-
- if (ipv6_addr_cmp(saddr, &br->ip6_querier.addr.u.ip6) <= 0)
- goto update;
-
- return false;
+ int port_ifidx = READ_ONCE(querier->port_ifidx);
+ struct net_bridge_port *p;
+ struct net_device *dev;
-update:
- br->ip6_querier.addr.u.ip6 = *saddr;
+ if (port_ifidx == 0)
+ return NULL;
- /* update protected by general multicast_lock by caller */
- rcu_assign_pointer(br->ip6_querier.port, port);
+ dev = dev_get_by_index_rcu(dev_net(br->dev), port_ifidx);
+ if (!dev)
+ return NULL;
+ p = br_port_get_rtnl_rcu(dev);
+ if (!p || p->br != br)
+ return NULL;
- return true;
+ return p;
}
-#endif
-static bool br_multicast_select_querier(struct net_bridge *br,
- struct net_bridge_port *port,
- struct br_ip *saddr)
+size_t br_multicast_querier_state_size(void)
{ /* Return the summed netlink attribute sizes for one nested querier
+ * state: the nest header plus, per address family, an address, an
+ * int-sized port attribute and a 64-bit timer attribute. The IPv6
+ * attributes are counted only when CONFIG_IPV6 is enabled.
+ */
- switch (saddr->proto) {
- case htons(ETH_P_IP):
- return br_ip4_multicast_select_querier(br, port, saddr->u.ip4);
+ return nla_total_size(0) + /* nest attribute */
+ nla_total_size(sizeof(__be32)) + /* BRIDGE_QUERIER_IP_ADDRESS */
+ nla_total_size(sizeof(int)) + /* BRIDGE_QUERIER_IP_PORT */
+ nla_total_size_64bit(sizeof(u64)) + /* BRIDGE_QUERIER_IP_OTHER_TIMER */
#if IS_ENABLED(CONFIG_IPV6)
- case htons(ETH_P_IPV6):
- return br_ip6_multicast_select_querier(br, port, &saddr->u.ip6);
+ nla_total_size(sizeof(struct in6_addr)) + /* BRIDGE_QUERIER_IPV6_ADDRESS */
+ nla_total_size(sizeof(int)) + /* BRIDGE_QUERIER_IPV6_PORT */
+ nla_total_size_64bit(sizeof(u64)) + /* BRIDGE_QUERIER_IPV6_OTHER_TIMER */
#endif
+ 0;
+}
+
+/* protected by rtnl or rcu
+ *
+ * Emit the querier state of @brmctx into @skb as a nest of type
+ * @nest_attr. Nothing is emitted (return 0) when BROPT_MULTICAST_ENABLED
+ * is off for the bridge or br_multicast_ctx_vlan_global_disabled() is
+ * true. For each address family the section is skipped unless
+ * brmctx->multicast_querier is set or that family's other-query timer is
+ * pending. Otherwise the querier address is added, and only while the
+ * other-query timer is pending also the timer value and, if the querier
+ * port can still be resolved, its ifindex. The IPv6 section exists only
+ * with CONFIG_IPV6. A nest that ends up empty is cancelled.
+ *
+ * Returns 0 on success, or -EMSGSIZE when the nest cannot be started or
+ * an attribute does not fit (the nest is cancelled in the latter case).
+ */
+int br_multicast_dump_querier_state(struct sk_buff *skb,
+ const struct net_bridge_mcast *brmctx,
+ int nest_attr)
+{
+ struct bridge_mcast_querier querier = {};
+ struct net_bridge_port *p;
+ struct nlattr *nest;
+
+ if (!br_opt_get(brmctx->br, BROPT_MULTICAST_ENABLED) ||
+ br_multicast_ctx_vlan_global_disabled(brmctx))
+ return 0;
+
+ nest = nla_nest_start(skb, nest_attr);
+ if (!nest)
+ return -EMSGSIZE;
+
+ rcu_read_lock();
+ if (!brmctx->multicast_querier &&
+ !timer_pending(&brmctx->ip4_other_query.timer))
+ goto out_v6;
+
+ br_multicast_read_querier(&brmctx->ip4_querier, &querier);
+ if (nla_put_in_addr(skb, BRIDGE_QUERIER_IP_ADDRESS,
+ querier.addr.src.ip4)) {
+ rcu_read_unlock();
+ goto out_err;
}
- return false;
+ p = __br_multicast_get_querier_port(brmctx->br, &querier);
+ if (timer_pending(&brmctx->ip4_other_query.timer) &&
+ (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IP_OTHER_TIMER,
+ br_timer_value(&brmctx->ip4_other_query.timer),
+ BRIDGE_QUERIER_PAD) ||
+ (p && nla_put_u32(skb, BRIDGE_QUERIER_IP_PORT, p->dev->ifindex)))) {
+ rcu_read_unlock();
+ goto out_err;
+ }
+
+out_v6:
+#if IS_ENABLED(CONFIG_IPV6)
+ if (!brmctx->multicast_querier &&
+ !timer_pending(&brmctx->ip6_other_query.timer))
+ goto out;
+
+ br_multicast_read_querier(&brmctx->ip6_querier, &querier);
+ if (nla_put_in6_addr(skb, BRIDGE_QUERIER_IPV6_ADDRESS,
+ &querier.addr.src.ip6)) {
+ rcu_read_unlock();
+ goto out_err;
+ }
+
+ p = __br_multicast_get_querier_port(brmctx->br, &querier);
+ if (timer_pending(&brmctx->ip6_other_query.timer) &&
+ (nla_put_u64_64bit(skb, BRIDGE_QUERIER_IPV6_OTHER_TIMER,
+ br_timer_value(&brmctx->ip6_other_query.timer),
+ BRIDGE_QUERIER_PAD) ||
+ (p && nla_put_u32(skb, BRIDGE_QUERIER_IPV6_PORT,
+ p->dev->ifindex)))) {
+ rcu_read_unlock();
+ goto out_err;
+ }
+out:
+#endif
+ rcu_read_unlock();
+ nla_nest_end(skb, nest);
+ if (!nla_len(nest))
+ nla_nest_cancel(skb, nest);
+
+ return 0;
+
+out_err:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
}
static void
-br_multicast_update_query_timer(struct net_bridge *br,
+br_multicast_update_query_timer(struct net_bridge_mcast *brmctx,
struct bridge_mcast_other_query *query,
unsigned long max_delay)
{ /* If query->timer is not already pending, arm query->delay_timer for
+ * @max_delay jiffies from now. query->timer itself is then always
+ * (re)armed for brmctx->multicast_querier_interval from now.
+ */
if (!timer_pending(&query->timer))
- query->delay_time = jiffies + max_delay;
+ mod_timer(&query->delay_timer, jiffies + max_delay);
- mod_timer(&query->timer, jiffies + br->multicast_querier_interval);
+ mod_timer(&query->timer, jiffies + brmctx->multicast_querier_interval);
}
static void br_port_mc_router_state_change(struct net_bridge_port *p,
@@ -1354,109 +3286,244 @@ static void br_port_mc_router_state_change(struct net_bridge_port *p,
.u.mrouter = is_mc_router,
};
- switchdev_port_attr_set(p->dev, &attr);
+ switchdev_port_attr_set(p->dev, &attr, NULL);
}
-/*
- * Add port to router_list
+static struct net_bridge_port *
+br_multicast_rport_from_node(struct net_bridge_mcast *brmctx,
+ struct hlist_head *mc_router_list,
+ struct hlist_node *rlist)
+{ /* Map a router-list node back to its bridge port. @rlist is taken to
+ * be the ip6_rlist member of a net_bridge_mcast_port when
+ * @mc_router_list is brmctx->ip6_mc_router_list (CONFIG_IPV6 only),
+ * and the ip4_rlist member otherwise. Returns pmctx->port.
+ */
+ struct net_bridge_mcast_port *pmctx;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (mc_router_list == &brmctx->ip6_mc_router_list)
+ pmctx = hlist_entry(rlist, struct net_bridge_mcast_port,
+ ip6_rlist);
+ else
+#endif
+ pmctx = hlist_entry(rlist, struct net_bridge_mcast_port,
+ ip4_rlist);
+
+ return pmctx->port;
+}
+
+static struct hlist_node *
+br_multicast_get_rport_slot(struct net_bridge_mcast *brmctx,
+ struct net_bridge_port *port,
+ struct hlist_head *mc_router_list)
+
+{ /* Find the node in @mc_router_list behind which @port should be
+ * inserted. The list is walked while the listed port's pointer value
+ * is greater than @port; the walk stops at the first entry whose
+ * pointer value is <= @port. Returns the last node passed, or NULL
+ * when @port belongs at the head (including an empty list).
+ */
+ struct hlist_node *slot = NULL;
+ struct net_bridge_port *p;
+ struct hlist_node *rlist;
+
+ hlist_for_each(rlist, mc_router_list) {
+ p = br_multicast_rport_from_node(brmctx, mc_router_list, rlist);
+
+ if ((unsigned long)port >= (unsigned long)p)
+ break;
+
+ slot = rlist;
+ }
+
+ return slot;
+}
+
+static bool br_multicast_no_router_otherpf(struct net_bridge_mcast_port *pmctx,
+ struct hlist_node *rnode)
+{ /* With CONFIG_IPV6: when @rnode is not pmctx->ip6_rlist, return
+ * whether ip6_rlist is unhashed; when it is ip6_rlist, return
+ * whether ip4_rlist is unhashed. Callers presumably pass one of the
+ * two rlist members, so this reports that the port is not on the
+ * other family's router list. Without CONFIG_IPV6: always true.
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+ if (rnode != &pmctx->ip6_rlist)
+ return hlist_unhashed(&pmctx->ip6_rlist);
+ else
+ return hlist_unhashed(&pmctx->ip4_rlist);
+#else
+ return true;
+#endif
+}
+
+/* Add port to router_list
* list is maintained ordered by pointer value
* and locked by br->multicast_lock and RCU
+ *
+ * Does nothing when @rlist is already hashed. Otherwise @rlist is inserted
+ * into @mc_router_list behind the slot returned by
+ * br_multicast_get_rport_slot(), or at the head when there is no slot.
+ * br_rtr_notify(RTM_NEWMDB) and br_port_mc_router_state_change(true) are
+ * issued only when br_multicast_no_router_otherpf() is true.
*/
-static void br_multicast_add_router(struct net_bridge *br,
- struct net_bridge_port *port)
+static void br_multicast_add_router(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct hlist_node *rlist,
+ struct hlist_head *mc_router_list)
{
- struct net_bridge_port *p;
- struct hlist_node *slot = NULL;
+ struct hlist_node *slot;
- if (!hlist_unhashed(&port->rlist))
+ if (!hlist_unhashed(rlist))
return;
- hlist_for_each_entry(p, &br->router_list, rlist) {
- if ((unsigned long) port >= (unsigned long) p)
- break;
- slot = &p->rlist;
- }
+ slot = br_multicast_get_rport_slot(brmctx, pmctx->port, mc_router_list);
if (slot)
- hlist_add_behind_rcu(&port->rlist, slot);
+ hlist_add_behind_rcu(rlist, slot);
else
- hlist_add_head_rcu(&port->rlist, &br->router_list);
- br_rtr_notify(br->dev, port, RTM_NEWMDB);
- br_port_mc_router_state_change(port, true);
+ hlist_add_head_rcu(rlist, mc_router_list);
+
+ /* For backwards compatibility for now, only notify if we
+ * switched from no IPv4/IPv6 multicast router to a new
+ * IPv4 or IPv6 multicast router.
+ */
+ if (br_multicast_no_router_otherpf(pmctx, rlist)) {
+ br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_NEWMDB);
+ br_port_mc_router_state_change(pmctx->port, true);
+ }
+}
+
+/* Add port to router_list
+ * list is maintained ordered by pointer value
+ * and locked by br->multicast_lock and RCU
+ *
+ * IPv4 variant: passes pmctx->ip4_rlist and brmctx->ip4_mc_router_list
+ * to br_multicast_add_router().
+ */
+static void br_ip4_multicast_add_router(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx)
+{
+ br_multicast_add_router(brmctx, pmctx, &pmctx->ip4_rlist,
+ &brmctx->ip4_mc_router_list);
}
-static void br_multicast_mark_router(struct net_bridge *br,
- struct net_bridge_port *port)
+/* Add port to router_list
+ * list is maintained ordered by pointer value
+ * and locked by br->multicast_lock and RCU
+ *
+ * IPv6 variant: passes pmctx->ip6_rlist and brmctx->ip6_mc_router_list
+ * to br_multicast_add_router(). Compiles to an empty function when
+ * CONFIG_IPV6 is not enabled.
+ */
+static void br_ip6_multicast_add_router(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ br_multicast_add_router(brmctx, pmctx, &pmctx->ip6_rlist,
+ &brmctx->ip6_mc_router_list);
+#endif
+}
+
+static void br_multicast_mark_router(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct timer_list *timer,
+ struct hlist_node *rlist,
+ struct hlist_head *mc_router_list)
{ /* Mark the bridge itself (@pmctx == NULL) or a port as having a
+ * multicast router and (re)arm @timer for
+ * brmctx->multicast_querier_interval. Returns early when
+ * br_multicast_ctx_should_use() is false. Bridge case: acts only when
+ * brmctx->multicast_router is MDB_RTR_TYPE_TEMP_QUERY, and calls
+ * br_mc_router_state_change(true) only if neither the IPv4 nor the
+ * IPv6 is_router check is currently true. Port case: skipped for
+ * MDB_RTR_TYPE_DISABLED and MDB_RTR_TYPE_PERM; otherwise the port is
+ * added to @mc_router_list through @rlist before the timer is armed.
+ */
unsigned long now = jiffies;
- if (!port) {
- if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) {
- if (!timer_pending(&br->multicast_router_timer))
- br_mc_router_state_change(br, true);
- mod_timer(&br->multicast_router_timer,
- now + br->multicast_querier_interval);
+ if (!br_multicast_ctx_should_use(brmctx, pmctx))
+ return;
+
+ if (!pmctx) {
+ if (brmctx->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) {
+ if (!br_ip4_multicast_is_router(brmctx) &&
+ !br_ip6_multicast_is_router(brmctx))
+ br_mc_router_state_change(brmctx->br, true);
+ mod_timer(timer, now + brmctx->multicast_querier_interval);
}
return;
}
- if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
- port->multicast_router == MDB_RTR_TYPE_PERM)
+ if (pmctx->multicast_router == MDB_RTR_TYPE_DISABLED ||
+ pmctx->multicast_router == MDB_RTR_TYPE_PERM)
return;
- br_multicast_add_router(br, port);
+ br_multicast_add_router(brmctx, pmctx, rlist, mc_router_list);
+ mod_timer(timer, now + brmctx->multicast_querier_interval);
+}
+
+static void br_ip4_multicast_mark_router(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx)
+{ /* IPv4 front end for br_multicast_mark_router(): uses the bridge's
+ * ip4_mc_router_timer and a NULL rlist when @pmctx is NULL, else the
+ * port's ip4_mc_router_timer and ip4_rlist.
+ */
+ struct timer_list *timer = &brmctx->ip4_mc_router_timer;
+ struct hlist_node *rlist = NULL;
+
+ if (pmctx) {
+ timer = &pmctx->ip4_mc_router_timer;
+ rlist = &pmctx->ip4_rlist;
+ }
- mod_timer(&port->multicast_router_timer,
- now + br->multicast_querier_interval);
+ br_multicast_mark_router(brmctx, pmctx, timer, rlist,
+ &brmctx->ip4_mc_router_list);
}
-static void br_multicast_query_received(struct net_bridge *br,
- struct net_bridge_port *port,
- struct bridge_mcast_other_query *query,
- struct br_ip *saddr,
- unsigned long max_delay)
+static void br_ip6_multicast_mark_router(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx)
{ /* IPv6 front end for br_multicast_mark_router(): uses the bridge's
+ * ip6_mc_router_timer and a NULL rlist when @pmctx is NULL, else the
+ * port's ip6_mc_router_timer and ip6_rlist. Does nothing when
+ * CONFIG_IPV6 is not enabled.
+ */
- if (!br_multicast_select_querier(br, port, saddr))
+#if IS_ENABLED(CONFIG_IPV6)
+ struct timer_list *timer = &brmctx->ip6_mc_router_timer;
+ struct hlist_node *rlist = NULL;
+
+ if (pmctx) {
+ timer = &pmctx->ip6_mc_router_timer;
+ rlist = &pmctx->ip6_rlist;
+ }
+
+ br_multicast_mark_router(brmctx, pmctx, timer, rlist,
+ &brmctx->ip6_mc_router_list);
+#endif
+}
+
+static void
+br_ip4_multicast_query_received(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct bridge_mcast_other_query *query,
+ struct br_ip *saddr,
+ unsigned long max_delay)
+{ /* React to a received IPv4 query from @saddr: if
+ * br_multicast_select_querier() accepts the sender, update the
+ * other-querier timers of @query and mark the ingress context as an
+ * IPv4 multicast router. Otherwise do nothing.
+ */
+ if (!br_multicast_select_querier(brmctx, pmctx, saddr))
+ return;
+
+ br_multicast_update_query_timer(brmctx, query, max_delay);
+ br_ip4_multicast_mark_router(brmctx, pmctx);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void
+br_ip6_multicast_query_received(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct bridge_mcast_other_query *query,
+ struct br_ip *saddr,
+ unsigned long max_delay)
+{ /* React to a received IPv6 query from @saddr: if
+ * br_multicast_select_querier() accepts the sender, update the
+ * other-querier timers of @query and mark the ingress context as an
+ * IPv6 multicast router. Otherwise do nothing.
+ */
+ if (!br_multicast_select_querier(brmctx, pmctx, saddr))
return;
- br_multicast_update_query_timer(br, query, max_delay);
- br_multicast_mark_router(br, port);
+ br_multicast_update_query_timer(brmctx, query, max_delay);
+ br_ip6_multicast_mark_router(brmctx, pmctx);
}
+#endif
-static void br_ip4_multicast_query(struct net_bridge *br,
- struct net_bridge_port *port,
+static void br_ip4_multicast_query(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
struct sk_buff *skb,
u16 vid)
{
+ unsigned int transport_len = ip_transport_len(skb);
const struct iphdr *iph = ip_hdr(skb);
struct igmphdr *ih = igmp_hdr(skb);
struct net_bridge_mdb_entry *mp;
struct igmpv3_query *ih3;
struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
- struct br_ip saddr;
+ struct br_ip saddr = {};
unsigned long max_delay;
unsigned long now = jiffies;
- unsigned int offset = skb_transport_offset(skb);
__be32 group;
- spin_lock(&br->multicast_lock);
- if (!netif_running(br->dev) ||
- (port && port->state == BR_STATE_DISABLED))
+ spin_lock(&brmctx->br->multicast_lock);
+ if (!br_multicast_ctx_should_use(brmctx, pmctx))
goto out;
group = ih->group;
- if (skb->len == offset + sizeof(*ih)) {
+ if (transport_len == sizeof(*ih)) {
max_delay = ih->code * (HZ / IGMP_TIMER_SCALE);
if (!max_delay) {
max_delay = 10 * HZ;
group = 0;
}
- } else if (skb->len >= offset + sizeof(*ih3)) {
+ } else if (transport_len >= sizeof(*ih3)) {
ih3 = igmpv3_query_hdr(skb);
- if (ih3->nsrcs)
+ if (ih3->nsrcs ||
+ (brmctx->multicast_igmp_version == 3 && group &&
+ ih3->suppress))
goto out;
max_delay = ih3->code ?
@@ -1467,51 +3534,54 @@ static void br_ip4_multicast_query(struct net_bridge *br,
if (!group) {
saddr.proto = htons(ETH_P_IP);
- saddr.u.ip4 = iph->saddr;
+ saddr.src.ip4 = iph->saddr;
- br_multicast_query_received(br, port, &br->ip4_other_query,
- &saddr, max_delay);
+ br_ip4_multicast_query_received(brmctx, pmctx,
+ &brmctx->ip4_other_query,
+ &saddr, max_delay);
goto out;
}
- mp = br_mdb_ip4_get(mlock_dereference(br->mdb, br), group, vid);
+ mp = br_mdb_ip4_get(brmctx->br, group, vid);
if (!mp)
goto out;
- max_delay *= br->multicast_last_member_count;
+ max_delay *= brmctx->multicast_last_member_count;
if (mp->host_joined &&
(timer_pending(&mp->timer) ?
time_after(mp->timer.expires, now + max_delay) :
- try_to_del_timer_sync(&mp->timer) >= 0))
+ timer_delete_sync_try(&mp->timer) >= 0))
mod_timer(&mp->timer, now + max_delay);
for (pp = &mp->ports;
- (p = mlock_dereference(*pp, br)) != NULL;
+ (p = mlock_dereference(*pp, brmctx->br)) != NULL;
pp = &p->next) {
if (timer_pending(&p->timer) ?
time_after(p->timer.expires, now + max_delay) :
- try_to_del_timer_sync(&p->timer) >= 0)
+ timer_delete_sync_try(&p->timer) >= 0 &&
+ (brmctx->multicast_igmp_version == 2 ||
+ p->filter_mode == MCAST_EXCLUDE))
mod_timer(&p->timer, now + max_delay);
}
out:
- spin_unlock(&br->multicast_lock);
+ spin_unlock(&brmctx->br->multicast_lock);
}
#if IS_ENABLED(CONFIG_IPV6)
-static int br_ip6_multicast_query(struct net_bridge *br,
- struct net_bridge_port *port,
+static int br_ip6_multicast_query(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
struct sk_buff *skb,
u16 vid)
{
- const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ unsigned int transport_len = ipv6_transport_len(skb);
struct mld_msg *mld;
struct net_bridge_mdb_entry *mp;
struct mld2_query *mld2q;
struct net_bridge_port_group *p;
struct net_bridge_port_group __rcu **pp;
- struct br_ip saddr;
+ struct br_ip saddr = {};
unsigned long max_delay;
unsigned long now = jiffies;
unsigned int offset = skb_transport_offset(skb);
@@ -1519,12 +3589,11 @@ static int br_ip6_multicast_query(struct net_bridge *br,
bool is_general_query;
int err = 0;
- spin_lock(&br->multicast_lock);
- if (!netif_running(br->dev) ||
- (port && port->state == BR_STATE_DISABLED))
+ spin_lock(&brmctx->br->multicast_lock);
+ if (!br_multicast_ctx_should_use(brmctx, pmctx))
goto out;
- if (skb->len == offset + sizeof(*mld)) {
+ if (transport_len == sizeof(*mld)) {
if (!pskb_may_pull(skb, offset + sizeof(*mld))) {
err = -EINVAL;
goto out;
@@ -1541,6 +3610,10 @@ static int br_ip6_multicast_query(struct net_bridge *br,
mld2q = (struct mld2_query *)icmp6_hdr(skb);
if (!mld2q->mld2q_nsrcs)
group = &mld2q->mld2q_mca;
+ if (brmctx->multicast_mld_version == 2 &&
+ !ipv6_addr_any(&mld2q->mld2q_mca) &&
+ mld2q->mld2q_suppress)
+ goto out;
max_delay = max(msecs_to_jiffies(mldv2_mrc(mld2q)), 1UL);
}
@@ -1549,84 +3622,79 @@ static int br_ip6_multicast_query(struct net_bridge *br,
if (is_general_query) {
saddr.proto = htons(ETH_P_IPV6);
- saddr.u.ip6 = ip6h->saddr;
+ saddr.src.ip6 = ipv6_hdr(skb)->saddr;
- br_multicast_query_received(br, port, &br->ip6_other_query,
- &saddr, max_delay);
+ br_ip6_multicast_query_received(brmctx, pmctx,
+ &brmctx->ip6_other_query,
+ &saddr, max_delay);
goto out;
} else if (!group) {
goto out;
}
- mp = br_mdb_ip6_get(mlock_dereference(br->mdb, br), group, vid);
+ mp = br_mdb_ip6_get(brmctx->br, group, vid);
if (!mp)
goto out;
- max_delay *= br->multicast_last_member_count;
+ max_delay *= brmctx->multicast_last_member_count;
if (mp->host_joined &&
(timer_pending(&mp->timer) ?
time_after(mp->timer.expires, now + max_delay) :
- try_to_del_timer_sync(&mp->timer) >= 0))
+ timer_delete_sync_try(&mp->timer) >= 0))
mod_timer(&mp->timer, now + max_delay);
for (pp = &mp->ports;
- (p = mlock_dereference(*pp, br)) != NULL;
+ (p = mlock_dereference(*pp, brmctx->br)) != NULL;
pp = &p->next) {
if (timer_pending(&p->timer) ?
time_after(p->timer.expires, now + max_delay) :
- try_to_del_timer_sync(&p->timer) >= 0)
+ timer_delete_sync_try(&p->timer) >= 0 &&
+ (brmctx->multicast_mld_version == 1 ||
+ p->filter_mode == MCAST_EXCLUDE))
mod_timer(&p->timer, now + max_delay);
}
out:
- spin_unlock(&br->multicast_lock);
+ spin_unlock(&brmctx->br->multicast_lock);
return err;
}
#endif
static void
-br_multicast_leave_group(struct net_bridge *br,
- struct net_bridge_port *port,
+br_multicast_leave_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
struct br_ip *group,
struct bridge_mcast_other_query *other_query,
struct bridge_mcast_own_query *own_query,
const unsigned char *src)
{
- struct net_bridge_mdb_htable *mdb;
struct net_bridge_mdb_entry *mp;
struct net_bridge_port_group *p;
unsigned long now;
unsigned long time;
- spin_lock(&br->multicast_lock);
- if (!netif_running(br->dev) ||
- (port && port->state == BR_STATE_DISABLED))
+ spin_lock(&brmctx->br->multicast_lock);
+ if (!br_multicast_ctx_should_use(brmctx, pmctx))
goto out;
- mdb = mlock_dereference(br->mdb, br);
- mp = br_mdb_ip_get(mdb, group);
+ mp = br_mdb_ip_get(brmctx->br, group);
if (!mp)
goto out;
- if (port && (port->flags & BR_MULTICAST_FAST_LEAVE)) {
+ if (pmctx && (pmctx->port->flags & BR_MULTICAST_FAST_LEAVE)) {
struct net_bridge_port_group __rcu **pp;
for (pp = &mp->ports;
- (p = mlock_dereference(*pp, br)) != NULL;
+ (p = mlock_dereference(*pp, brmctx->br)) != NULL;
pp = &p->next) {
- if (!br_port_group_equal(p, port, src))
+ if (!br_port_group_equal(p, pmctx->port, src))
continue;
- rcu_assign_pointer(*pp, p->next);
- hlist_del_init(&p->mglist);
- del_timer(&p->timer);
- call_rcu_bh(&p->rcu, br_multicast_free_pg);
- br_mdb_notify(br->dev, port, group, RTM_DELMDB,
- p->flags);
+ if (p->flags & MDB_PG_FLAGS_PERMANENT)
+ break;
- if (!mp->ports && !mp->host_joined &&
- netif_running(br->dev))
- mod_timer(&mp->timer, jiffies);
+ p->flags |= MDB_PG_FLAGS_FAST_LEAVE;
+ br_multicast_del_pg(mp, p, pp);
}
goto out;
}
@@ -1634,24 +3702,25 @@ br_multicast_leave_group(struct net_bridge *br,
if (timer_pending(&other_query->timer))
goto out;
- if (br->multicast_querier) {
- __br_multicast_send_query(br, port, &mp->addr);
+ if (brmctx->multicast_querier) {
+ __br_multicast_send_query(brmctx, pmctx, NULL, NULL, &mp->addr,
+ false, 0, NULL);
- time = jiffies + br->multicast_last_member_count *
- br->multicast_last_member_interval;
+ time = jiffies + brmctx->multicast_last_member_count *
+ brmctx->multicast_last_member_interval;
mod_timer(&own_query->timer, time);
- for (p = mlock_dereference(mp->ports, br);
- p != NULL;
- p = mlock_dereference(p->next, br)) {
- if (!br_port_group_equal(p, port, src))
+ for (p = mlock_dereference(mp->ports, brmctx->br);
+ p != NULL && pmctx != NULL;
+ p = mlock_dereference(p->next, brmctx->br)) {
+ if (!br_port_group_equal(p, pmctx->port, src))
continue;
if (!hlist_unhashed(&p->mglist) &&
(timer_pending(&p->timer) ?
time_after(p->timer.expires, time) :
- try_to_del_timer_sync(&p->timer) >= 0)) {
+ timer_delete_sync_try(&p->timer) >= 0)) {
mod_timer(&p->timer, time);
}
@@ -1660,41 +3729,41 @@ br_multicast_leave_group(struct net_bridge *br,
}
now = jiffies;
- time = now + br->multicast_last_member_count *
- br->multicast_last_member_interval;
+ time = now + brmctx->multicast_last_member_count *
+ brmctx->multicast_last_member_interval;
- if (!port) {
+ if (!pmctx) {
if (mp->host_joined &&
(timer_pending(&mp->timer) ?
time_after(mp->timer.expires, time) :
- try_to_del_timer_sync(&mp->timer) >= 0)) {
+ timer_delete_sync_try(&mp->timer) >= 0)) {
mod_timer(&mp->timer, time);
}
goto out;
}
- for (p = mlock_dereference(mp->ports, br);
+ for (p = mlock_dereference(mp->ports, brmctx->br);
p != NULL;
- p = mlock_dereference(p->next, br)) {
- if (p->port != port)
+ p = mlock_dereference(p->next, brmctx->br)) {
+ if (p->key.port != pmctx->port)
continue;
if (!hlist_unhashed(&p->mglist) &&
(timer_pending(&p->timer) ?
time_after(p->timer.expires, time) :
- try_to_del_timer_sync(&p->timer) >= 0)) {
+ timer_delete_sync_try(&p->timer) >= 0)) {
mod_timer(&p->timer, time);
}
break;
}
out:
- spin_unlock(&br->multicast_lock);
+ spin_unlock(&brmctx->br->multicast_lock);
}
-static void br_ip4_multicast_leave_group(struct net_bridge *br,
- struct net_bridge_port *port,
+static void br_ip4_multicast_leave_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
__be32 group,
__u16 vid,
const unsigned char *src)
@@ -1705,19 +3774,21 @@ static void br_ip4_multicast_leave_group(struct net_bridge *br,
if (ipv4_is_local_multicast(group))
return;
- own_query = port ? &port->ip4_own_query : &br->ip4_own_query;
+ own_query = pmctx ? &pmctx->ip4_own_query : &brmctx->ip4_own_query;
- br_group.u.ip4 = group;
+ memset(&br_group, 0, sizeof(br_group));
+ br_group.dst.ip4 = group;
br_group.proto = htons(ETH_P_IP);
br_group.vid = vid;
- br_multicast_leave_group(br, port, &br_group, &br->ip4_other_query,
+ br_multicast_leave_group(brmctx, pmctx, &br_group,
+ &brmctx->ip4_other_query,
own_query, src);
}
#if IS_ENABLED(CONFIG_IPV6)
-static void br_ip6_multicast_leave_group(struct net_bridge *br,
- struct net_bridge_port *port,
+static void br_ip6_multicast_leave_group(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
const struct in6_addr *group,
__u16 vid,
const unsigned char *src)
@@ -1728,13 +3799,15 @@ static void br_ip6_multicast_leave_group(struct net_bridge *br,
if (ipv6_addr_is_ll_all_nodes(group))
return;
- own_query = port ? &port->ip6_own_query : &br->ip6_own_query;
+ own_query = pmctx ? &pmctx->ip6_own_query : &brmctx->ip6_own_query;
- br_group.u.ip6 = *group;
+ memset(&br_group, 0, sizeof(br_group));
+ br_group.dst.ip6 = *group;
br_group.proto = htons(ETH_P_IPV6);
br_group.vid = vid;
- br_multicast_leave_group(br, port, &br_group, &br->ip6_other_query,
+ br_multicast_leave_group(brmctx, pmctx, &br_group,
+ &brmctx->ip6_other_query,
own_query, src);
}
#endif
@@ -1746,7 +3819,7 @@ static void br_multicast_err_count(const struct net_bridge *br,
struct bridge_mcast_stats __percpu *stats;
struct bridge_mcast_stats *pstats;
- if (!br->multicast_stats_enabled)
+ if (!br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED))
return;
if (p)
@@ -1772,8 +3845,8 @@ static void br_multicast_err_count(const struct net_bridge *br,
u64_stats_update_end(&pstats->syncp);
}
-static void br_multicast_pim(struct net_bridge *br,
- struct net_bridge_port *port,
+static void br_multicast_pim(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
const struct sk_buff *skb)
{
unsigned int offset = skb_transport_offset(skb);
@@ -1784,31 +3857,51 @@ static void br_multicast_pim(struct net_bridge *br,
pim_hdr_type(pimhdr) != PIM_TYPE_HELLO)
return;
- br_multicast_mark_router(br, port);
+ spin_lock(&brmctx->br->multicast_lock);
+ br_ip4_multicast_mark_router(brmctx, pmctx);
+ spin_unlock(&brmctx->br->multicast_lock);
+}
+
+static int br_ip4_multicast_mrd_rcv(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct sk_buff *skb)
+{
+ if (ip_hdr(skb)->protocol != IPPROTO_IGMP ||
+ igmp_hdr(skb)->type != IGMP_MRDISC_ADV)
+ return -ENOMSG;
+
+ spin_lock(&brmctx->br->multicast_lock);
+ br_ip4_multicast_mark_router(brmctx, pmctx);
+ spin_unlock(&brmctx->br->multicast_lock);
+
+ return 0;
}
-static int br_multicast_ipv4_rcv(struct net_bridge *br,
- struct net_bridge_port *port,
+static int br_multicast_ipv4_rcv(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
struct sk_buff *skb,
u16 vid)
{
- struct sk_buff *skb_trimmed = NULL;
+ struct net_bridge_port *p = pmctx ? pmctx->port : NULL;
const unsigned char *src;
struct igmphdr *ih;
int err;
- err = ip_mc_check_igmp(skb, &skb_trimmed);
+ err = ip_mc_check_igmp(skb);
if (err == -ENOMSG) {
if (!ipv4_is_local_multicast(ip_hdr(skb)->daddr)) {
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
} else if (pim_ipv4_all_pim_routers(ip_hdr(skb)->daddr)) {
if (ip_hdr(skb)->protocol == IPPROTO_PIM)
- br_multicast_pim(br, port, skb);
+ br_multicast_pim(brmctx, pmctx, skb);
+ } else if (ipv4_is_all_snoopers(ip_hdr(skb)->daddr)) {
+ br_ip4_multicast_mrd_rcv(brmctx, pmctx, skb);
}
+
return 0;
} else if (err < 0) {
- br_multicast_err_count(br, port, skb->protocol);
+ br_multicast_err_count(brmctx->br, p, skb->protocol);
return err;
}
@@ -1820,47 +3913,61 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br,
case IGMP_HOST_MEMBERSHIP_REPORT:
case IGMPV2_HOST_MEMBERSHIP_REPORT:
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
- err = br_ip4_multicast_add_group(br, port, ih->group, vid, src);
+ err = br_ip4_multicast_add_group(brmctx, pmctx, ih->group, vid,
+ src, true);
break;
case IGMPV3_HOST_MEMBERSHIP_REPORT:
- err = br_ip4_multicast_igmp3_report(br, port, skb_trimmed, vid);
+ err = br_ip4_multicast_igmp3_report(brmctx, pmctx, skb, vid);
break;
case IGMP_HOST_MEMBERSHIP_QUERY:
- br_ip4_multicast_query(br, port, skb_trimmed, vid);
+ br_ip4_multicast_query(brmctx, pmctx, skb, vid);
break;
case IGMP_HOST_LEAVE_MESSAGE:
- br_ip4_multicast_leave_group(br, port, ih->group, vid, src);
+ br_ip4_multicast_leave_group(brmctx, pmctx, ih->group, vid, src);
break;
}
- if (skb_trimmed && skb_trimmed != skb)
- kfree_skb(skb_trimmed);
-
- br_multicast_count(br, port, skb, BR_INPUT_SKB_CB(skb)->igmp,
+ br_multicast_count(brmctx->br, p, skb, BR_INPUT_SKB_CB(skb)->igmp,
BR_MCAST_DIR_RX);
return err;
}
#if IS_ENABLED(CONFIG_IPV6)
-static int br_multicast_ipv6_rcv(struct net_bridge *br,
- struct net_bridge_port *port,
+static void br_ip6_multicast_mrd_rcv(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
+ struct sk_buff *skb)
+{
+ if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV)
+ return;
+
+ spin_lock(&brmctx->br->multicast_lock);
+ br_ip6_multicast_mark_router(brmctx, pmctx);
+ spin_unlock(&brmctx->br->multicast_lock);
+}
+
+static int br_multicast_ipv6_rcv(struct net_bridge_mcast *brmctx,
+ struct net_bridge_mcast_port *pmctx,
struct sk_buff *skb,
u16 vid)
{
- struct sk_buff *skb_trimmed = NULL;
+ struct net_bridge_port *p = pmctx ? pmctx->port : NULL;
const unsigned char *src;
struct mld_msg *mld;
int err;
- err = ipv6_mc_check_mld(skb, &skb_trimmed);
+ err = ipv6_mc_check_mld(skb);
- if (err == -ENOMSG) {
+ if (err == -ENOMSG || err == -ENODATA) {
if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr))
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
+ if (err == -ENODATA &&
+ ipv6_addr_is_all_snoopers(&ipv6_hdr(skb)->daddr))
+ br_ip6_multicast_mrd_rcv(brmctx, pmctx, skb);
+
return 0;
} else if (err < 0) {
- br_multicast_err_count(br, port, skb->protocol);
+ br_multicast_err_count(brmctx->br, p, skb->protocol);
return err;
}
@@ -1871,32 +3978,32 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br,
case ICMPV6_MGM_REPORT:
src = eth_hdr(skb)->h_source;
BR_INPUT_SKB_CB(skb)->mrouters_only = 1;
- err = br_ip6_multicast_add_group(br, port, &mld->mld_mca, vid,
- src);
+ err = br_ip6_multicast_add_group(brmctx, pmctx, &mld->mld_mca,
+ vid, src, true);
break;
case ICMPV6_MLD2_REPORT:
- err = br_ip6_multicast_mld2_report(br, port, skb_trimmed, vid);
+ err = br_ip6_multicast_mld2_report(brmctx, pmctx, skb, vid);
break;
case ICMPV6_MGM_QUERY:
- err = br_ip6_multicast_query(br, port, skb_trimmed, vid);
+ err = br_ip6_multicast_query(brmctx, pmctx, skb, vid);
break;
case ICMPV6_MGM_REDUCTION:
src = eth_hdr(skb)->h_source;
- br_ip6_multicast_leave_group(br, port, &mld->mld_mca, vid, src);
+ br_ip6_multicast_leave_group(brmctx, pmctx, &mld->mld_mca, vid,
+ src);
break;
}
- if (skb_trimmed && skb_trimmed != skb)
- kfree_skb(skb_trimmed);
-
- br_multicast_count(br, port, skb, BR_INPUT_SKB_CB(skb)->igmp,
+ br_multicast_count(brmctx->br, p, skb, BR_INPUT_SKB_CB(skb)->igmp,
BR_MCAST_DIR_RX);
return err;
}
#endif
-int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
+int br_multicast_rcv(struct net_bridge_mcast **brmctx,
+ struct net_bridge_mcast_port **pmctx,
+ struct net_bridge_vlan *vlan,
struct sk_buff *skb, u16 vid)
{
int ret = 0;
@@ -1904,16 +4011,36 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
BR_INPUT_SKB_CB(skb)->igmp = 0;
BR_INPUT_SKB_CB(skb)->mrouters_only = 0;
- if (br->multicast_disabled)
+ if (!br_opt_get((*brmctx)->br, BROPT_MULTICAST_ENABLED))
return 0;
+ if (br_opt_get((*brmctx)->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) && vlan) {
+ const struct net_bridge_vlan *masterv;
+
+ /* the vlan has the master flag set only when transmitting
+ * through the bridge device
+ */
+ if (br_vlan_is_master(vlan)) {
+ masterv = vlan;
+ *brmctx = &vlan->br_mcast_ctx;
+ *pmctx = NULL;
+ } else {
+ masterv = vlan->brvlan;
+ *brmctx = &vlan->brvlan->br_mcast_ctx;
+ *pmctx = &vlan->port_mcast_ctx;
+ }
+
+ if (!(masterv->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED))
+ return 0;
+ }
+
switch (skb->protocol) {
case htons(ETH_P_IP):
- ret = br_multicast_ipv4_rcv(br, port, skb, vid);
+ ret = br_multicast_ipv4_rcv(*brmctx, *pmctx, skb, vid);
break;
#if IS_ENABLED(CONFIG_IPV6)
case htons(ETH_P_IPV6):
- ret = br_multicast_ipv6_rcv(br, port, skb, vid);
+ ret = br_multicast_ipv6_rcv(*brmctx, *pmctx, skb, vid);
break;
#endif
}
@@ -1921,291 +4048,670 @@ int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
return ret;
}
-static void br_multicast_query_expired(struct net_bridge *br,
- struct bridge_mcast_own_query *query,
- struct bridge_mcast_querier *querier)
+static void br_multicast_query_expired(struct net_bridge_mcast *brmctx,
+ struct bridge_mcast_own_query *query)
{
- spin_lock(&br->multicast_lock);
- if (query->startup_sent < br->multicast_startup_query_count)
+ spin_lock(&brmctx->br->multicast_lock);
+ if (br_multicast_ctx_vlan_disabled(brmctx))
+ goto out;
+
+ if (query->startup_sent < brmctx->multicast_startup_query_count)
query->startup_sent++;
- RCU_INIT_POINTER(querier->port, NULL);
- br_multicast_send_query(br, NULL, query);
- spin_unlock(&br->multicast_lock);
+ br_multicast_send_query(brmctx, NULL, query);
+out:
+ spin_unlock(&brmctx->br->multicast_lock);
}
static void br_ip4_multicast_query_expired(struct timer_list *t)
{
- struct net_bridge *br = from_timer(br, t, ip4_own_query.timer);
+ struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t,
+ ip4_own_query.timer);
- br_multicast_query_expired(br, &br->ip4_own_query, &br->ip4_querier);
+ br_multicast_query_expired(brmctx, &brmctx->ip4_own_query);
}
#if IS_ENABLED(CONFIG_IPV6)
static void br_ip6_multicast_query_expired(struct timer_list *t)
{
- struct net_bridge *br = from_timer(br, t, ip6_own_query.timer);
+ struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t,
+ ip6_own_query.timer);
- br_multicast_query_expired(br, &br->ip6_own_query, &br->ip6_querier);
+ br_multicast_query_expired(brmctx, &brmctx->ip6_own_query);
}
#endif
-void br_multicast_init(struct net_bridge *br)
+static void br_multicast_gc_work(struct work_struct *work)
+{
+ struct net_bridge *br = container_of(work, struct net_bridge,
+ mcast_gc_work);
+ HLIST_HEAD(deleted_head);
+
+ spin_lock_bh(&br->multicast_lock);
+ hlist_move_list(&br->mcast_gc_list, &deleted_head);
+ spin_unlock_bh(&br->multicast_lock);
+
+ br_multicast_gc(&deleted_head);
+}
+
+void br_multicast_ctx_init(struct net_bridge *br,
+ struct net_bridge_vlan *vlan,
+ struct net_bridge_mcast *brmctx)
{
- br->hash_elasticity = 4;
- br->hash_max = 512;
-
- br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
- br->multicast_querier = 0;
- br->multicast_query_use_ifaddr = 0;
- br->multicast_last_member_count = 2;
- br->multicast_startup_query_count = 2;
-
- br->multicast_last_member_interval = HZ;
- br->multicast_query_response_interval = 10 * HZ;
- br->multicast_startup_query_interval = 125 * HZ / 4;
- br->multicast_query_interval = 125 * HZ;
- br->multicast_querier_interval = 255 * HZ;
- br->multicast_membership_interval = 260 * HZ;
-
- br->ip4_other_query.delay_time = 0;
- br->ip4_querier.port = NULL;
- br->multicast_igmp_version = 2;
+ brmctx->br = br;
+ brmctx->vlan = vlan;
+ brmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
+ brmctx->multicast_last_member_count = 2;
+ brmctx->multicast_startup_query_count = 2;
+
+ brmctx->multicast_last_member_interval = HZ;
+ brmctx->multicast_query_response_interval = 10 * HZ;
+ brmctx->multicast_startup_query_interval = 125 * HZ / 4;
+ brmctx->multicast_query_interval = 125 * HZ;
+ brmctx->multicast_querier_interval = 255 * HZ;
+ brmctx->multicast_membership_interval = 260 * HZ;
+
+ brmctx->ip4_querier.port_ifidx = 0;
+ seqcount_spinlock_init(&brmctx->ip4_querier.seq, &br->multicast_lock);
+ brmctx->multicast_igmp_version = 2;
#if IS_ENABLED(CONFIG_IPV6)
- br->multicast_mld_version = 1;
- br->ip6_other_query.delay_time = 0;
- br->ip6_querier.port = NULL;
+ brmctx->multicast_mld_version = 1;
+ brmctx->ip6_querier.port_ifidx = 0;
+ seqcount_spinlock_init(&brmctx->ip6_querier.seq, &br->multicast_lock);
#endif
- br->has_ipv6_addr = 1;
- spin_lock_init(&br->multicast_lock);
- timer_setup(&br->multicast_router_timer,
- br_multicast_local_router_expired, 0);
- timer_setup(&br->ip4_other_query.timer,
+ timer_setup(&brmctx->ip4_mc_router_timer,
+ br_ip4_multicast_local_router_expired, 0);
+ timer_setup(&brmctx->ip4_other_query.timer,
br_ip4_multicast_querier_expired, 0);
- timer_setup(&br->ip4_own_query.timer,
+ timer_setup(&brmctx->ip4_other_query.delay_timer,
+ br_multicast_query_delay_expired, 0);
+ timer_setup(&brmctx->ip4_own_query.timer,
br_ip4_multicast_query_expired, 0);
#if IS_ENABLED(CONFIG_IPV6)
- timer_setup(&br->ip6_other_query.timer,
+ timer_setup(&brmctx->ip6_mc_router_timer,
+ br_ip6_multicast_local_router_expired, 0);
+ timer_setup(&brmctx->ip6_other_query.timer,
br_ip6_multicast_querier_expired, 0);
- timer_setup(&br->ip6_own_query.timer,
+ timer_setup(&brmctx->ip6_other_query.delay_timer,
+ br_multicast_query_delay_expired, 0);
+ timer_setup(&brmctx->ip6_own_query.timer,
br_ip6_multicast_query_expired, 0);
#endif
}
-static void __br_multicast_open(struct net_bridge *br,
- struct bridge_mcast_own_query *query)
+void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx)
{
- query->startup_sent = 0;
+ __br_multicast_stop(brmctx);
+}
+
+void br_multicast_init(struct net_bridge *br)
+{
+ br->hash_max = BR_MULTICAST_DEFAULT_HASH_MAX;
+
+ br_multicast_ctx_init(br, NULL, &br->multicast_ctx);
+
+ br_opt_toggle(br, BROPT_MULTICAST_ENABLED, true);
+ br_opt_toggle(br, BROPT_HAS_IPV6_ADDR, true);
+
+ spin_lock_init(&br->multicast_lock);
+ INIT_HLIST_HEAD(&br->mdb_list);
+ INIT_HLIST_HEAD(&br->mcast_gc_list);
+ INIT_WORK(&br->mcast_gc_work, br_multicast_gc_work);
+}
+
+static void br_ip4_multicast_join_snoopers(struct net_bridge *br)
+{
+ struct in_device *in_dev = in_dev_get(br->dev);
- if (br->multicast_disabled)
+ if (!in_dev)
return;
- mod_timer(&query->timer, jiffies);
+ __ip_mc_inc_group(in_dev, htonl(INADDR_ALLSNOOPERS_GROUP), GFP_ATOMIC);
+ in_dev_put(in_dev);
}
-void br_multicast_open(struct net_bridge *br)
+#if IS_ENABLED(CONFIG_IPV6)
+static void br_ip6_multicast_join_snoopers(struct net_bridge *br)
+{
+ struct in6_addr addr;
+
+ ipv6_addr_set(&addr, htonl(0xff020000), 0, 0, htonl(0x6a));
+ ipv6_dev_mc_inc(br->dev, &addr);
+}
+#else
+static inline void br_ip6_multicast_join_snoopers(struct net_bridge *br)
+{
+}
+#endif
+
+void br_multicast_join_snoopers(struct net_bridge *br)
+{
+ br_ip4_multicast_join_snoopers(br);
+ br_ip6_multicast_join_snoopers(br);
+}
+
+static void br_ip4_multicast_leave_snoopers(struct net_bridge *br)
{
- __br_multicast_open(br, &br->ip4_own_query);
+ struct in_device *in_dev = in_dev_get(br->dev);
+
+ if (WARN_ON(!in_dev))
+ return;
+
+ __ip_mc_dec_group(in_dev, htonl(INADDR_ALLSNOOPERS_GROUP), GFP_ATOMIC);
+ in_dev_put(in_dev);
+}
+
#if IS_ENABLED(CONFIG_IPV6)
- __br_multicast_open(br, &br->ip6_own_query);
+static void br_ip6_multicast_leave_snoopers(struct net_bridge *br)
+{
+ struct in6_addr addr;
+
+ ipv6_addr_set(&addr, htonl(0xff020000), 0, 0, htonl(0x6a));
+ ipv6_dev_mc_dec(br->dev, &addr);
+}
+#else
+static inline void br_ip6_multicast_leave_snoopers(struct net_bridge *br)
+{
+}
#endif
+
+void br_multicast_leave_snoopers(struct net_bridge *br)
+{
+ br_ip4_multicast_leave_snoopers(br);
+ br_ip6_multicast_leave_snoopers(br);
}
-void br_multicast_stop(struct net_bridge *br)
+static void __br_multicast_open_query(struct net_bridge *br,
+ struct bridge_mcast_own_query *query)
{
- del_timer_sync(&br->multicast_router_timer);
- del_timer_sync(&br->ip4_other_query.timer);
- del_timer_sync(&br->ip4_own_query.timer);
+ query->startup_sent = 0;
+
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ return;
+
+ mod_timer(&query->timer, jiffies);
+}
+
+static void __br_multicast_open(struct net_bridge_mcast *brmctx)
+{
+ __br_multicast_open_query(brmctx->br, &brmctx->ip4_own_query);
#if IS_ENABLED(CONFIG_IPV6)
- del_timer_sync(&br->ip6_other_query.timer);
- del_timer_sync(&br->ip6_own_query.timer);
+ __br_multicast_open_query(brmctx->br, &brmctx->ip6_own_query);
#endif
}
-void br_multicast_dev_del(struct net_bridge *br)
+void br_multicast_open(struct net_bridge *br)
{
- struct net_bridge_mdb_htable *mdb;
- struct net_bridge_mdb_entry *mp;
- struct hlist_node *n;
- u32 ver;
- int i;
+ ASSERT_RTNL();
- spin_lock_bh(&br->multicast_lock);
- mdb = mlock_dereference(br->mdb, br);
- if (!mdb)
- goto out;
+ if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) {
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *vlan;
- br->mdb = NULL;
+ vg = br_vlan_group(br);
+ if (vg) {
+ list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+ struct net_bridge_mcast *brmctx;
- ver = mdb->ver;
- for (i = 0; i < mdb->max; i++) {
- hlist_for_each_entry_safe(mp, n, &mdb->mhash[i],
- hlist[ver]) {
- del_timer(&mp->timer);
- call_rcu_bh(&mp->rcu, br_multicast_free_group);
+ brmctx = &vlan->br_mcast_ctx;
+ if (br_vlan_is_brentry(vlan) &&
+ !br_multicast_ctx_vlan_disabled(brmctx))
+ __br_multicast_open(&vlan->br_mcast_ctx);
+ }
}
+ } else {
+ __br_multicast_open(&br->multicast_ctx);
}
+}
- if (mdb->old) {
+static void __br_multicast_stop(struct net_bridge_mcast *brmctx)
+{
+ timer_delete_sync(&brmctx->ip4_mc_router_timer);
+ timer_delete_sync(&brmctx->ip4_other_query.timer);
+ timer_delete_sync(&brmctx->ip4_other_query.delay_timer);
+ timer_delete_sync(&brmctx->ip4_own_query.timer);
+#if IS_ENABLED(CONFIG_IPV6)
+ timer_delete_sync(&brmctx->ip6_mc_router_timer);
+ timer_delete_sync(&brmctx->ip6_other_query.timer);
+ timer_delete_sync(&brmctx->ip6_other_query.delay_timer);
+ timer_delete_sync(&brmctx->ip6_own_query.timer);
+#endif
+}
+
+void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state)
+{
+#if IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING)
+ struct net_bridge *br;
+
+ if (!br_vlan_should_use(v))
+ return;
+
+ if (br_vlan_is_master(v))
+ return;
+
+ br = v->port->br;
+
+ if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED))
+ return;
+
+ if (br_vlan_state_allowed(state, true))
+ br_multicast_enable_port_ctx(&v->port_mcast_ctx);
+
+ /* Multicast is not disabled for the vlan when it goes in
+ * blocking state because the timers will expire and stop by
+ * themselves without sending more queries.
+ */
+#endif
+}
+
+void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on)
+{
+ struct net_bridge *br;
+
+ /* it's okay to check for the flag without the multicast lock because it
+ * can only change under RTNL -> multicast_lock, we need the latter to
+ * sync with timers and packets
+ */
+ if (on == !!(vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED))
+ return;
+
+ if (br_vlan_is_master(vlan)) {
+ br = vlan->br;
+
+ if (!br_vlan_is_brentry(vlan) ||
+ (on &&
+ br_multicast_ctx_vlan_global_disabled(&vlan->br_mcast_ctx)))
+ return;
+
+ spin_lock_bh(&br->multicast_lock);
+ vlan->priv_flags ^= BR_VLFLAG_MCAST_ENABLED;
spin_unlock_bh(&br->multicast_lock);
- rcu_barrier_bh();
+
+ if (on)
+ __br_multicast_open(&vlan->br_mcast_ctx);
+ else
+ __br_multicast_stop(&vlan->br_mcast_ctx);
+ } else {
+ struct net_bridge_mcast *brmctx;
+
+ brmctx = br_multicast_port_ctx_get_global(&vlan->port_mcast_ctx);
+ if (on && br_multicast_ctx_vlan_global_disabled(brmctx))
+ return;
+
+ br = vlan->port->br;
spin_lock_bh(&br->multicast_lock);
- WARN_ON(mdb->old);
+ vlan->priv_flags ^= BR_VLFLAG_MCAST_ENABLED;
+ if (on)
+ __br_multicast_enable_port_ctx(&vlan->port_mcast_ctx);
+ else
+ __br_multicast_disable_port_ctx(&vlan->port_mcast_ctx);
+ spin_unlock_bh(&br->multicast_lock);
}
+}
- mdb->old = mdb;
- call_rcu_bh(&mdb->rcu, br_mdb_free);
+static void br_multicast_toggle_vlan(struct net_bridge_vlan *vlan, bool on)
+{
+ struct net_bridge_port *p;
-out:
+ if (WARN_ON_ONCE(!br_vlan_is_master(vlan)))
+ return;
+
+ list_for_each_entry(p, &vlan->br->port_list, list) {
+ struct net_bridge_vlan *vport;
+
+ vport = br_vlan_find(nbp_vlan_group(p), vlan->vid);
+ if (!vport)
+ continue;
+ br_multicast_toggle_one_vlan(vport, on);
+ }
+
+ if (br_vlan_is_brentry(vlan))
+ br_multicast_toggle_one_vlan(vlan, on);
+}
+
+int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *vlan;
+ struct net_bridge_port *p;
+
+ if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) == on)
+ return 0;
+
+ if (on && !br_opt_get(br, BROPT_VLAN_ENABLED)) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot enable multicast vlan snooping with vlan filtering disabled");
+ return -EINVAL;
+ }
+
+ vg = br_vlan_group(br);
+ if (!vg)
+ return 0;
+
+ br_opt_toggle(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED, on);
+
+ /* disable/enable non-vlan mcast contexts based on vlan snooping */
+ if (on)
+ __br_multicast_stop(&br->multicast_ctx);
+ else
+ __br_multicast_open(&br->multicast_ctx);
+ list_for_each_entry(p, &br->port_list, list) {
+ if (on)
+ br_multicast_disable_port_ctx(&p->multicast_ctx);
+ else
+ br_multicast_enable_port_ctx(&p->multicast_ctx);
+ }
+
+ list_for_each_entry(vlan, &vg->vlan_list, vlist)
+ br_multicast_toggle_vlan(vlan, on);
+
+ return 0;
+}
+
+bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on)
+{
+ ASSERT_RTNL();
+
+ /* BR_VLFLAG_GLOBAL_MCAST_ENABLED relies on eventual consistency and
+ * requires only RTNL to change
+ */
+ if (on == !!(vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED))
+ return false;
+
+ vlan->priv_flags ^= BR_VLFLAG_GLOBAL_MCAST_ENABLED;
+ br_multicast_toggle_vlan(vlan, on);
+
+ return true;
+}
+
+void br_multicast_stop(struct net_bridge *br)
+{
+ ASSERT_RTNL();
+
+ if (br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) {
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *vlan;
+
+ vg = br_vlan_group(br);
+ if (vg) {
+ list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+ struct net_bridge_mcast *brmctx;
+
+ brmctx = &vlan->br_mcast_ctx;
+ if (br_vlan_is_brentry(vlan) &&
+ !br_multicast_ctx_vlan_disabled(brmctx))
+ __br_multicast_stop(&vlan->br_mcast_ctx);
+ }
+ }
+ } else {
+ __br_multicast_stop(&br->multicast_ctx);
+ }
+}
+
+void br_multicast_dev_del(struct net_bridge *br)
+{
+ struct net_bridge_mdb_entry *mp;
+ HLIST_HEAD(deleted_head);
+ struct hlist_node *tmp;
+
+ spin_lock_bh(&br->multicast_lock);
+ hlist_for_each_entry_safe(mp, tmp, &br->mdb_list, mdb_node)
+ br_multicast_del_mdb_entry(mp);
+ hlist_move_list(&br->mcast_gc_list, &deleted_head);
spin_unlock_bh(&br->multicast_lock);
+
+ br_multicast_ctx_deinit(&br->multicast_ctx);
+ br_multicast_gc(&deleted_head);
+ cancel_work_sync(&br->mcast_gc_work);
+
+ rcu_barrier();
}
-int br_multicast_set_router(struct net_bridge *br, unsigned long val)
+int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val)
{
int err = -EINVAL;
- spin_lock_bh(&br->multicast_lock);
+ spin_lock_bh(&brmctx->br->multicast_lock);
switch (val) {
case MDB_RTR_TYPE_DISABLED:
case MDB_RTR_TYPE_PERM:
- br_mc_router_state_change(br, val == MDB_RTR_TYPE_PERM);
- del_timer(&br->multicast_router_timer);
- br->multicast_router = val;
+ br_mc_router_state_change(brmctx->br, val == MDB_RTR_TYPE_PERM);
+ timer_delete(&brmctx->ip4_mc_router_timer);
+#if IS_ENABLED(CONFIG_IPV6)
+ timer_delete(&brmctx->ip6_mc_router_timer);
+#endif
+ brmctx->multicast_router = val;
err = 0;
break;
case MDB_RTR_TYPE_TEMP_QUERY:
- if (br->multicast_router != MDB_RTR_TYPE_TEMP_QUERY)
- br_mc_router_state_change(br, false);
- br->multicast_router = val;
+ if (brmctx->multicast_router != MDB_RTR_TYPE_TEMP_QUERY)
+ br_mc_router_state_change(brmctx->br, false);
+ brmctx->multicast_router = val;
err = 0;
break;
}
- spin_unlock_bh(&br->multicast_lock);
+ spin_unlock_bh(&brmctx->br->multicast_lock);
return err;
}
-static void __del_port_router(struct net_bridge_port *p)
+static void
+br_multicast_rport_del_notify(struct net_bridge_mcast_port *pmctx, bool deleted)
{
- if (hlist_unhashed(&p->rlist))
+ if (!deleted)
return;
- hlist_del_init_rcu(&p->rlist);
- br_rtr_notify(p->br->dev, p, RTM_DELMDB);
- br_port_mc_router_state_change(p, false);
+
+ /* For backwards compatibility for now, only notify if there is
+ * no multicast router anymore for both IPv4 and IPv6.
+ */
+ if (!hlist_unhashed(&pmctx->ip4_rlist))
+ return;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (!hlist_unhashed(&pmctx->ip6_rlist))
+ return;
+#endif
+
+ br_rtr_notify(pmctx->port->br->dev, pmctx, RTM_DELMDB);
+ br_port_mc_router_state_change(pmctx->port, false);
/* don't allow timer refresh */
- if (p->multicast_router == MDB_RTR_TYPE_TEMP)
- p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
+ if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP)
+ pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
}
-int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
+int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx,
+ unsigned long val)
{
- struct net_bridge *br = p->br;
+ struct net_bridge_mcast *brmctx;
unsigned long now = jiffies;
int err = -EINVAL;
+ bool del = false;
- spin_lock(&br->multicast_lock);
- if (p->multicast_router == val) {
+ brmctx = br_multicast_port_ctx_get_global(pmctx);
+ spin_lock_bh(&brmctx->br->multicast_lock);
+ if (pmctx->multicast_router == val) {
/* Refresh the temp router port timer */
- if (p->multicast_router == MDB_RTR_TYPE_TEMP)
- mod_timer(&p->multicast_router_timer,
- now + br->multicast_querier_interval);
+ if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP) {
+ mod_timer(&pmctx->ip4_mc_router_timer,
+ now + brmctx->multicast_querier_interval);
+#if IS_ENABLED(CONFIG_IPV6)
+ mod_timer(&pmctx->ip6_mc_router_timer,
+ now + brmctx->multicast_querier_interval);
+#endif
+ }
err = 0;
goto unlock;
}
switch (val) {
case MDB_RTR_TYPE_DISABLED:
- p->multicast_router = MDB_RTR_TYPE_DISABLED;
- __del_port_router(p);
- del_timer(&p->multicast_router_timer);
+ pmctx->multicast_router = MDB_RTR_TYPE_DISABLED;
+ del |= br_ip4_multicast_rport_del(pmctx);
+ timer_delete(&pmctx->ip4_mc_router_timer);
+ del |= br_ip6_multicast_rport_del(pmctx);
+#if IS_ENABLED(CONFIG_IPV6)
+ timer_delete(&pmctx->ip6_mc_router_timer);
+#endif
+ br_multicast_rport_del_notify(pmctx, del);
break;
case MDB_RTR_TYPE_TEMP_QUERY:
- p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
- __del_port_router(p);
+ pmctx->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
+ del |= br_ip4_multicast_rport_del(pmctx);
+ del |= br_ip6_multicast_rport_del(pmctx);
+ br_multicast_rport_del_notify(pmctx, del);
break;
case MDB_RTR_TYPE_PERM:
- p->multicast_router = MDB_RTR_TYPE_PERM;
- del_timer(&p->multicast_router_timer);
- br_multicast_add_router(br, p);
+ pmctx->multicast_router = MDB_RTR_TYPE_PERM;
+ timer_delete(&pmctx->ip4_mc_router_timer);
+ br_ip4_multicast_add_router(brmctx, pmctx);
+#if IS_ENABLED(CONFIG_IPV6)
+ timer_delete(&pmctx->ip6_mc_router_timer);
+#endif
+ br_ip6_multicast_add_router(brmctx, pmctx);
break;
case MDB_RTR_TYPE_TEMP:
- p->multicast_router = MDB_RTR_TYPE_TEMP;
- br_multicast_mark_router(br, p);
+ pmctx->multicast_router = MDB_RTR_TYPE_TEMP;
+ br_ip4_multicast_mark_router(brmctx, pmctx);
+ br_ip6_multicast_mark_router(brmctx, pmctx);
break;
default:
goto unlock;
}
err = 0;
unlock:
- spin_unlock(&br->multicast_lock);
+ spin_unlock_bh(&brmctx->br->multicast_lock);
+
+ return err;
+}
+
+int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router)
+{
+ int err;
+
+ if (br_vlan_is_master(v))
+ err = br_multicast_set_router(&v->br_mcast_ctx, mcast_router);
+ else
+ err = br_multicast_set_port_router(&v->port_mcast_ctx,
+ mcast_router);
return err;
}
-static void br_multicast_start_querier(struct net_bridge *br,
+static void br_multicast_start_querier(struct net_bridge_mcast *brmctx,
struct bridge_mcast_own_query *query)
{
struct net_bridge_port *port;
- __br_multicast_open(br, query);
+ if (!br_multicast_ctx_matches_vlan_snooping(brmctx))
+ return;
+
+ __br_multicast_open_query(brmctx->br, query);
- list_for_each_entry(port, &br->port_list, list) {
- if (port->state == BR_STATE_DISABLED ||
- port->state == BR_STATE_BLOCKING)
+ rcu_read_lock();
+ list_for_each_entry_rcu(port, &brmctx->br->port_list, list) {
+ struct bridge_mcast_own_query *ip4_own_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct bridge_mcast_own_query *ip6_own_query;
+#endif
+
+ if (br_multicast_port_ctx_state_stopped(&port->multicast_ctx))
continue;
- if (query == &br->ip4_own_query)
- br_multicast_enable(&port->ip4_own_query);
+ if (br_multicast_ctx_is_vlan(brmctx)) {
+ struct net_bridge_vlan *vlan;
+
+ vlan = br_vlan_find(nbp_vlan_group_rcu(port),
+ brmctx->vlan->vid);
+ if (!vlan ||
+ br_multicast_port_ctx_state_stopped(&vlan->port_mcast_ctx))
+ continue;
+
+ ip4_own_query = &vlan->port_mcast_ctx.ip4_own_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ ip6_own_query = &vlan->port_mcast_ctx.ip6_own_query;
+#endif
+ } else {
+ ip4_own_query = &port->multicast_ctx.ip4_own_query;
+#if IS_ENABLED(CONFIG_IPV6)
+ ip6_own_query = &port->multicast_ctx.ip6_own_query;
+#endif
+ }
+
+ if (query == &brmctx->ip4_own_query)
+ br_multicast_enable(ip4_own_query);
#if IS_ENABLED(CONFIG_IPV6)
else
- br_multicast_enable(&port->ip6_own_query);
+ br_multicast_enable(ip6_own_query);
#endif
}
+ rcu_read_unlock();
}
-int br_multicast_toggle(struct net_bridge *br, unsigned long val)
+static void br_multicast_del_grps(struct net_bridge *br)
{
- struct net_bridge_mdb_htable *mdb;
struct net_bridge_port *port;
+
+ list_for_each_entry(port, &br->port_list, list)
+ __br_multicast_disable_port_ctx(&port->multicast_ctx);
+}
+
+int br_multicast_toggle(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_port *port;
+ bool change_snoopers = false;
int err = 0;
spin_lock_bh(&br->multicast_lock);
- if (br->multicast_disabled == !val)
+ if (!!br_opt_get(br, BROPT_MULTICAST_ENABLED) == !!val)
goto unlock;
- br_mc_disabled_update(br->dev, !val);
- br->multicast_disabled = !val;
- if (br->multicast_disabled)
+ err = br_mc_disabled_update(br->dev, val, extack);
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ if (err)
goto unlock;
- if (!netif_running(br->dev))
+ br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val);
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) {
+ change_snoopers = true;
+ br_multicast_del_grps(br);
goto unlock;
-
- mdb = mlock_dereference(br->mdb, br);
- if (mdb) {
- if (mdb->old) {
- err = -EEXIST;
-rollback:
- br->multicast_disabled = !!val;
- goto unlock;
- }
-
- err = br_mdb_rehash(&br->mdb, mdb->max,
- br->hash_elasticity);
- if (err)
- goto rollback;
}
+ if (!netif_running(br->dev))
+ goto unlock;
+
br_multicast_open(br);
list_for_each_entry(port, &br->port_list, list)
- __br_multicast_enable_port(port);
+ __br_multicast_enable_port_ctx(&port->multicast_ctx);
+
+ change_snoopers = true;
unlock:
spin_unlock_bh(&br->multicast_lock);
+ /* br_multicast_join_snoopers has the potential to cause
+ * an MLD Report/Leave to be delivered to br_multicast_rcv,
+ * which would in turn call br_multicast_add_group, which would
+ * attempt to acquire multicast_lock. This function should be
+ * called after the lock has been released to avoid deadlocks on
+ * multicast_lock.
+ *
+ * br_multicast_leave_snoopers does not have the problem since
+ * br_multicast_rcv first checks BROPT_MULTICAST_ENABLED, and
+ * returns without calling br_multicast_ipv4/6_rcv if it's not
+ * enabled. Moved both functions out just for symmetry.
+ */
+ if (change_snoopers) {
+ if (br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ br_multicast_join_snoopers(br);
+ else
+ br_multicast_leave_snoopers(br);
+ }
+
return err;
}
@@ -2213,7 +4719,7 @@ bool br_multicast_enabled(const struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
- return !br->multicast_disabled;
+ return !!br_opt_get(br, BROPT_MULTICAST_ENABLED);
}
EXPORT_SYMBOL_GPL(br_multicast_enabled);
@@ -2223,86 +4729,50 @@ bool br_multicast_router(const struct net_device *dev)
bool is_router;
spin_lock_bh(&br->multicast_lock);
- is_router = br_multicast_is_router(br);
+ is_router = br_multicast_is_router(&br->multicast_ctx, NULL);
spin_unlock_bh(&br->multicast_lock);
return is_router;
}
EXPORT_SYMBOL_GPL(br_multicast_router);
-int br_multicast_set_querier(struct net_bridge *br, unsigned long val)
+int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val)
{
unsigned long max_delay;
val = !!val;
- spin_lock_bh(&br->multicast_lock);
- if (br->multicast_querier == val)
+ spin_lock_bh(&brmctx->br->multicast_lock);
+ if (brmctx->multicast_querier == val)
goto unlock;
- br->multicast_querier = val;
+ WRITE_ONCE(brmctx->multicast_querier, val);
if (!val)
goto unlock;
- max_delay = br->multicast_query_response_interval;
+ max_delay = brmctx->multicast_query_response_interval;
- if (!timer_pending(&br->ip4_other_query.timer))
- br->ip4_other_query.delay_time = jiffies + max_delay;
+ if (!timer_pending(&brmctx->ip4_other_query.timer))
+ mod_timer(&brmctx->ip4_other_query.delay_timer,
+ jiffies + max_delay);
- br_multicast_start_querier(br, &br->ip4_own_query);
+ br_multicast_start_querier(brmctx, &brmctx->ip4_own_query);
#if IS_ENABLED(CONFIG_IPV6)
- if (!timer_pending(&br->ip6_other_query.timer))
- br->ip6_other_query.delay_time = jiffies + max_delay;
+ if (!timer_pending(&brmctx->ip6_other_query.timer))
+ mod_timer(&brmctx->ip6_other_query.delay_timer,
+ jiffies + max_delay);
- br_multicast_start_querier(br, &br->ip6_own_query);
+ br_multicast_start_querier(brmctx, &brmctx->ip6_own_query);
#endif
unlock:
- spin_unlock_bh(&br->multicast_lock);
+ spin_unlock_bh(&brmctx->br->multicast_lock);
return 0;
}
-int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val)
-{
- int err = -EINVAL;
- u32 old;
- struct net_bridge_mdb_htable *mdb;
-
- spin_lock_bh(&br->multicast_lock);
- if (!is_power_of_2(val))
- goto unlock;
-
- mdb = mlock_dereference(br->mdb, br);
- if (mdb && val < mdb->size)
- goto unlock;
-
- err = 0;
-
- old = br->hash_max;
- br->hash_max = val;
-
- if (mdb) {
- if (mdb->old) {
- err = -EEXIST;
-rollback:
- br->hash_max = old;
- goto unlock;
- }
-
- err = br_mdb_rehash(&br->mdb, br->hash_max,
- br->hash_elasticity);
- if (err)
- goto rollback;
- }
-
-unlock:
- spin_unlock_bh(&br->multicast_lock);
-
- return err;
-}
-
-int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val)
+int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx,
+ unsigned long val)
{
/* Currently we support only version 2 and 3 */
switch (val) {
@@ -2313,15 +4783,16 @@ int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val)
return -EINVAL;
}
- spin_lock_bh(&br->multicast_lock);
- br->multicast_igmp_version = val;
- spin_unlock_bh(&br->multicast_lock);
+ spin_lock_bh(&brmctx->br->multicast_lock);
+ brmctx->multicast_igmp_version = val;
+ spin_unlock_bh(&brmctx->br->multicast_lock);
return 0;
}
#if IS_ENABLED(CONFIG_IPV6)
-int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val)
+int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx,
+ unsigned long val)
{
/* Currently we support version 1 and 2 */
switch (val) {
@@ -2332,14 +4803,62 @@ int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val)
return -EINVAL;
}
- spin_lock_bh(&br->multicast_lock);
- br->multicast_mld_version = val;
- spin_unlock_bh(&br->multicast_lock);
+ spin_lock_bh(&brmctx->br->multicast_lock);
+ brmctx->multicast_mld_version = val;
+ spin_unlock_bh(&brmctx->br->multicast_lock);
return 0;
}
#endif
+/* Apply a user-supplied query interval (clock_t units) to @brmctx. The value
+ * is converted to jiffies, raised to BR_MULTICAST_QUERY_INTVL_MIN when it is
+ * below it and lowered to BR_MULTICAST_QUERY_INTVL_MAX when it is above it;
+ * each adjustment is reported through br_info().
+ */
+void br_multicast_set_query_intvl(struct net_bridge_mcast *brmctx,
+				  unsigned long val)
+{
+	unsigned long intvl = clock_t_to_jiffies(val);
+
+	if (intvl < BR_MULTICAST_QUERY_INTVL_MIN) {
+		intvl = BR_MULTICAST_QUERY_INTVL_MIN;
+		br_info(brmctx->br,
+			"trying to set multicast query interval below minimum, setting to %lu (%ums)\n",
+			jiffies_to_clock_t(BR_MULTICAST_QUERY_INTVL_MIN),
+			jiffies_to_msecs(BR_MULTICAST_QUERY_INTVL_MIN));
+	}
+
+	if (intvl > BR_MULTICAST_QUERY_INTVL_MAX) {
+		intvl = BR_MULTICAST_QUERY_INTVL_MAX;
+		br_info(brmctx->br,
+			"trying to set multicast query interval above maximum, setting to %lu (%ums)\n",
+			jiffies_to_clock_t(BR_MULTICAST_QUERY_INTVL_MAX),
+			jiffies_to_msecs(BR_MULTICAST_QUERY_INTVL_MAX));
+	}
+
+	brmctx->multicast_query_interval = intvl;
+}
+
+/* Apply a user-supplied startup query interval (clock_t units) to @brmctx.
+ * The value is converted to jiffies, raised to
+ * BR_MULTICAST_STARTUP_QUERY_INTVL_MIN when below it and lowered to
+ * BR_MULTICAST_STARTUP_QUERY_INTVL_MAX when above it; each adjustment is
+ * reported through br_info().
+ */
+void br_multicast_set_startup_query_intvl(struct net_bridge_mcast *brmctx,
+					  unsigned long val)
+{
+	unsigned long intvl = clock_t_to_jiffies(val);
+
+	if (intvl < BR_MULTICAST_STARTUP_QUERY_INTVL_MIN) {
+		intvl = BR_MULTICAST_STARTUP_QUERY_INTVL_MIN;
+		br_info(brmctx->br,
+			"trying to set multicast startup query interval below minimum, setting to %lu (%ums)\n",
+			jiffies_to_clock_t(BR_MULTICAST_STARTUP_QUERY_INTVL_MIN),
+			jiffies_to_msecs(BR_MULTICAST_STARTUP_QUERY_INTVL_MIN));
+	}
+
+	if (intvl > BR_MULTICAST_STARTUP_QUERY_INTVL_MAX) {
+		intvl = BR_MULTICAST_STARTUP_QUERY_INTVL_MAX;
+		br_info(brmctx->br,
+			"trying to set multicast startup query interval above maximum, setting to %lu (%ums)\n",
+			jiffies_to_clock_t(BR_MULTICAST_STARTUP_QUERY_INTVL_MAX),
+			jiffies_to_msecs(BR_MULTICAST_STARTUP_QUERY_INTVL_MAX));
+	}
+
+	brmctx->multicast_startup_query_interval = intvl;
+}
+
/**
* br_multicast_list_adjacent - Returns snooped multicast addresses
* @dev: The bridge port adjacent to which to retrieve addresses
@@ -2367,7 +4886,7 @@ int br_multicast_list_adjacent(struct net_device *dev,
int count = 0;
rcu_read_lock();
- if (!br_ip_list || !br_port_exists(dev))
+ if (!br_ip_list || !netif_is_bridge_port(dev))
goto unlock;
port = br_port_get_rcu(dev);
@@ -2385,7 +4904,7 @@ int br_multicast_list_adjacent(struct net_device *dev,
if (!entry)
goto unlock;
- entry->addr = group->addr;
+ entry->addr = group->key.addr;
list_add(&entry->list, br_ip_list);
count++;
}
@@ -2414,7 +4933,7 @@ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto)
bool ret = false;
rcu_read_lock();
- if (!br_port_exists(dev))
+ if (!netif_is_bridge_port(dev))
goto unlock;
port = br_port_get_rcu(dev);
@@ -2426,7 +4945,7 @@ bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto)
memset(&eth, 0, sizeof(eth));
eth.h_proto = htons(proto);
- ret = br_multicast_querier_exists(br, &eth);
+ ret = br_multicast_querier_exists(&br->multicast_ctx, &eth, NULL);
unlock:
rcu_read_unlock();
@@ -2445,12 +4964,14 @@ EXPORT_SYMBOL_GPL(br_multicast_has_querier_anywhere);
*/
bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto)
{
+ struct net_bridge_mcast *brmctx;
struct net_bridge *br;
struct net_bridge_port *port;
bool ret = false;
+ int port_ifidx;
rcu_read_lock();
- if (!br_port_exists(dev))
+ if (!netif_is_bridge_port(dev))
goto unlock;
port = br_port_get_rcu(dev);
@@ -2458,17 +4979,20 @@ bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto)
goto unlock;
br = port->br;
+ brmctx = &br->multicast_ctx;
switch (proto) {
case ETH_P_IP:
- if (!timer_pending(&br->ip4_other_query.timer) ||
- rcu_dereference(br->ip4_querier.port) == port)
+ port_ifidx = brmctx->ip4_querier.port_ifidx;
+ if (!timer_pending(&brmctx->ip4_other_query.timer) ||
+ port_ifidx == port->dev->ifindex)
goto unlock;
break;
#if IS_ENABLED(CONFIG_IPV6)
case ETH_P_IPV6:
- if (!timer_pending(&br->ip6_other_query.timer) ||
- rcu_dereference(br->ip6_querier.port) == port)
+ port_ifidx = brmctx->ip6_querier.port_ifidx;
+ if (!timer_pending(&brmctx->ip6_other_query.timer) ||
+ port_ifidx == port->dev->ifindex)
goto unlock;
break;
#endif
@@ -2483,6 +5007,64 @@ unlock:
}
EXPORT_SYMBOL_GPL(br_multicast_has_querier_adjacent);
+/**
+ * br_multicast_has_router_adjacent - Checks for a router behind a bridge port
+ * @dev: The bridge port adjacent to which to check for a multicast router
+ * @proto: The protocol family to check for: IGMP -> ETH_P_IP, MLD -> ETH_P_IPV6
+ *
+ * Resolves the bridge port behind @dev and walks the bridge's multicast
+ * router list for @proto. Returns true as soon as the list holds an entry
+ * that belongs to a port other than the one behind @dev. Returns false when
+ * no bridge port is found for @dev or when no other port is on the list.
+ * A @proto without a dedicated case yields true.
+ */
+bool br_multicast_has_router_adjacent(struct net_device *dev, int proto)
+{
+	struct net_bridge_mcast_port *rport;
+	struct net_bridge_mcast *brmctx;
+	struct net_bridge_port *port;
+	bool found = false;
+
+	rcu_read_lock();
+	port = br_port_get_check_rcu(dev);
+	if (!port)
+		goto out;
+
+	brmctx = &port->br->multicast_ctx;
+	switch (proto) {
+	case ETH_P_IP:
+		hlist_for_each_entry_rcu(rport, &brmctx->ip4_mc_router_list,
+					 ip4_rlist) {
+			/* a router port other than our own is what we want */
+			if (rport->port != port) {
+				found = true;
+				break;
+			}
+		}
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case ETH_P_IPV6:
+		hlist_for_each_entry_rcu(rport, &brmctx->ip6_mc_router_list,
+					 ip6_rlist) {
+			if (rport->port != port) {
+				found = true;
+				break;
+			}
+		}
+		break;
+#endif
+	default:
+		/* when compiled without IPv6 support, be conservative and
+		 * always assume presence of an IPv6 multicast router
+		 */
+		found = true;
+	}
+
+out:
+	rcu_read_unlock();
+	return found;
+}
+EXPORT_SYMBOL_GPL(br_multicast_has_router_adjacent);
+
static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats,
const struct sk_buff *skb, u8 type, u8 dir)
{
@@ -2554,13 +5136,14 @@ static void br_mcast_stats_add(struct bridge_mcast_stats __percpu *stats,
u64_stats_update_end(&pstats->syncp);
}
-void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p,
+void br_multicast_count(struct net_bridge *br,
+ const struct net_bridge_port *p,
const struct sk_buff *skb, u8 type, u8 dir)
{
struct bridge_mcast_stats __percpu *stats;
/* if multicast_disabled is true then igmp type can't be set */
- if (!type || !br->multicast_stats_enabled)
+ if (!type || !br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED))
return;
if (p)
@@ -2587,7 +5170,8 @@ void br_multicast_uninit_stats(struct net_bridge *br)
free_percpu(br->mcast_stats);
}
-static void mcast_stats_add_dir(u64 *dst, u64 *src)
+/* noinline for https://llvm.org/pr45802#c9 */
+static noinline_for_stack void mcast_stats_add_dir(u64 *dst, u64 *src)
{
dst[BR_MCAST_DIR_RX] += src[BR_MCAST_DIR_RX];
dst[BR_MCAST_DIR_TX] += src[BR_MCAST_DIR_TX];
@@ -2616,9 +5200,9 @@ void br_multicast_get_stats(const struct net_bridge *br,
unsigned int start;
do {
- start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
memcpy(&temp, &cpu_stats->mstats, sizeof(temp));
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries);
mcast_stats_add_dir(tdst.igmp_v2queries, temp.igmp_v2queries);
@@ -2638,3 +5222,26 @@ void br_multicast_get_stats(const struct net_bridge *br,
}
memcpy(dest, &tdst, sizeof(*dest));
}
+
+/* Initialise the bridge's two multicast rhashtables, sg_port_tbl first and
+ * mdb_hash_tbl second. If the second init fails, the first table is destroyed
+ * again. Returns 0 on success or the rhashtable_init() error code.
+ */
+int br_mdb_hash_init(struct net_bridge *br)
+{
+	int ret = rhashtable_init(&br->sg_port_tbl, &br_sg_port_rht_params);
+
+	if (ret)
+		return ret;
+
+	ret = rhashtable_init(&br->mdb_hash_tbl, &br_mdb_rht_params);
+	if (ret)
+		rhashtable_destroy(&br->sg_port_tbl);
+
+	return ret;
+}
+
+/* Destroy the two rhashtables that br_mdb_hash_init() sets up. */
+void br_mdb_hash_fini(struct net_bridge *br)
+{
+ rhashtable_destroy(&br->sg_port_tbl);
+ rhashtable_destroy(&br->mdb_hash_tbl);
+}
diff --git a/net/bridge/br_multicast_eht.c b/net/bridge/br_multicast_eht.c
new file mode 100644
index 000000000000..adfd74102019
--- /dev/null
+++ b/net/bridge/br_multicast_eht.c
@@ -0,0 +1,822 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+// Copyright (c) 2020, Nikolay Aleksandrov <nikolay@nvidia.com>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/if_ether.h>
+#include <linux/igmp.h>
+#include <linux/in.h>
+#include <linux/jhash.h>
+#include <linux/kernel.h>
+#include <linux/log2.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/timer.h>
+#include <linux/inetdevice.h>
+#include <linux/mroute.h>
+#include <net/ip.h>
+#include <net/switchdev.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <linux/icmpv6.h>
+#include <net/ipv6.h>
+#include <net/mld.h>
+#include <net/ip6_checksum.h>
+#include <net/addrconf.h>
+#endif
+
+#include "br_private.h"
+#include "br_private_mcast_eht.h"
+
+static bool br_multicast_del_eht_set_entry(struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *src_addr,
+ union net_bridge_eht_addr *h_addr);
+static void br_multicast_create_eht_set_entry(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *src_addr,
+ union net_bridge_eht_addr *h_addr,
+ int filter_mode,
+ bool allow_zero_src);
+
+/* Search @pg's host rb-tree for the host whose address equals @h_addr. Keys
+ * are compared with memcmp() over the whole address union. Returns the host,
+ * or NULL when there is no match.
+ */
+static struct net_bridge_group_eht_host *
+br_multicast_eht_host_lookup(struct net_bridge_port_group *pg,
+			     union net_bridge_eht_addr *h_addr)
+{
+	struct rb_node *cur = pg->eht_host_tree.rb_node;
+	struct net_bridge_group_eht_host *host;
+	int cmp;
+
+	while (cur) {
+		host = rb_entry(cur, struct net_bridge_group_eht_host,
+				rb_node);
+		cmp = memcmp(h_addr, &host->h_addr, sizeof(*h_addr));
+		if (!cmp)
+			return host;
+
+		cur = cmp < 0 ? cur->rb_left : cur->rb_right;
+	}
+
+	return NULL;
+}
+
+/* Report the filter mode stored for host @h_addr in @pg. A host that is not
+ * present in the host tree is reported as MCAST_INCLUDE.
+ */
+static int br_multicast_eht_host_filter_mode(struct net_bridge_port_group *pg,
+					     union net_bridge_eht_addr *h_addr)
+{
+	struct net_bridge_group_eht_host *host;
+
+	host = br_multicast_eht_host_lookup(pg, h_addr);
+
+	return host ? host->filter_mode : MCAST_INCLUDE;
+}
+
+/* Search @eht_set's entry rb-tree for the entry keyed by host address
+ * @h_addr (memcmp() ordering over the whole address union). Returns the
+ * entry, or NULL when the host has no entry in this set.
+ */
+static struct net_bridge_group_eht_set_entry *
+br_multicast_eht_set_entry_lookup(struct net_bridge_group_eht_set *eht_set,
+				  union net_bridge_eht_addr *h_addr)
+{
+	struct rb_node *cur = eht_set->entry_tree.rb_node;
+	struct net_bridge_group_eht_set_entry *entry;
+	int cmp;
+
+	while (cur) {
+		entry = rb_entry(cur, struct net_bridge_group_eht_set_entry,
+				 rb_node);
+		cmp = memcmp(h_addr, &entry->h_addr, sizeof(*h_addr));
+		if (!cmp)
+			return entry;
+
+		cur = cmp < 0 ? cur->rb_left : cur->rb_right;
+	}
+
+	return NULL;
+}
+
+/* Search @pg's set rb-tree for the set keyed by source address @src_addr
+ * (memcmp() ordering over the whole address union). Returns the set, or NULL
+ * when no set exists for that source.
+ */
+static struct net_bridge_group_eht_set *
+br_multicast_eht_set_lookup(struct net_bridge_port_group *pg,
+			    union net_bridge_eht_addr *src_addr)
+{
+	struct rb_node *cur = pg->eht_set_tree.rb_node;
+	struct net_bridge_group_eht_set *set;
+	int cmp;
+
+	while (cur) {
+		set = rb_entry(cur, struct net_bridge_group_eht_set,
+			       rb_node);
+		cmp = memcmp(src_addr, &set->src_addr, sizeof(*src_addr));
+		if (!cmp)
+			return set;
+
+		cur = cmp < 0 ? cur->rb_left : cur->rb_right;
+	}
+
+	return NULL;
+}
+
+/* Release a host: warn if it still owns set entries, call
+ * br_multicast_eht_hosts_dec() for its port group, unlink it from the port
+ * group's host tree and free it.
+ */
+static void __eht_destroy_host(struct net_bridge_group_eht_host *eht_host)
+{
+	struct net_bridge_port_group *pg = eht_host->pg;
+
+	WARN_ON(!hlist_empty(&eht_host->set_entries));
+
+	br_multicast_eht_hosts_dec(pg);
+	rb_erase(&eht_host->rb_node, &pg->eht_host_tree);
+	RB_CLEAR_NODE(&eht_host->rb_node);
+
+	kfree(eht_host);
+}
+
+/* mcast_gc destroy callback of a set entry. The entry is expected to be
+ * unlinked from its set's tree already (a warning fires otherwise). Its timer
+ * is shut down synchronously before the memory is released.
+ */
+static void br_multicast_destroy_eht_set_entry(struct net_bridge_mcast_gc *gc)
+{
+	struct net_bridge_group_eht_set_entry *entry =
+		container_of(gc, struct net_bridge_group_eht_set_entry,
+			     mcast_gc);
+
+	WARN_ON(!RB_EMPTY_NODE(&entry->rb_node));
+	timer_shutdown_sync(&entry->timer);
+	kfree(entry);
+}
+
+/* mcast_gc destroy callback of a source set. The set is expected to be
+ * unlinked from the port group's tree and to hold no entries (warnings fire
+ * otherwise). Its timer is shut down synchronously before the memory is
+ * released.
+ */
+static void br_multicast_destroy_eht_set(struct net_bridge_mcast_gc *gc)
+{
+	struct net_bridge_group_eht_set *set =
+		container_of(gc, struct net_bridge_group_eht_set, mcast_gc);
+
+	WARN_ON(!RB_EMPTY_NODE(&set->rb_node));
+	WARN_ON(!RB_EMPTY_ROOT(&set->entry_tree));
+	timer_shutdown_sync(&set->timer);
+	kfree(set);
+}
+
+/* Unlink @set_h from its set's entry tree and from its host's entry list,
+ * then queue it on the bridge's multicast gc list so that the gc work frees
+ * it. The parent host is destroyed once it owns no more entries.
+ */
+static void __eht_del_set_entry(struct net_bridge_group_eht_set_entry *set_h)
+{
+	struct net_bridge_group_eht_host *eht_host = set_h->h_parent;
+	union net_bridge_eht_addr zero_addr;
+
+	rb_erase(&set_h->rb_node, &set_h->eht_set->entry_tree);
+	RB_CLEAR_NODE(&set_h->rb_node);
+	hlist_del_init(&set_h->host_list);
+	memset(&zero_addr, 0, sizeof(zero_addr));
+	/* Mirror the accounting done at creation time: num_entries is only
+	 * incremented for entries created with allow_zero_src == false, and
+	 * those always belong to a set with a non-zero source address. The
+	 * auto-created zero-source entry was never counted, so it must not be
+	 * uncounted here. The check therefore has to look at the set's source
+	 * address, not at the host address.
+	 */
+	if (memcmp(&set_h->eht_set->src_addr, &zero_addr, sizeof(zero_addr)))
+		eht_host->num_entries--;
+	hlist_add_head(&set_h->mcast_gc.gc_node, &set_h->br->mcast_gc_list);
+	queue_work(system_long_wq, &set_h->br->mcast_gc_work);
+
+	if (hlist_empty(&eht_host->set_entries))
+		__eht_destroy_host(eht_host);
+}
+
+/* Tear down a whole source set: delete every entry it still holds, unlink the
+ * set from its port group's set tree and queue it on the bridge's multicast
+ * gc list so that the gc work frees it.
+ */
+static void br_multicast_del_eht_set(struct net_bridge_group_eht_set *eht_set)
+{
+	struct rb_node *first;
+
+	/* always restart from the leftmost node, deletion reshapes the tree */
+	for (first = rb_first(&eht_set->entry_tree); first;
+	     first = rb_first(&eht_set->entry_tree))
+		__eht_del_set_entry(rb_entry(first,
+					     struct net_bridge_group_eht_set_entry,
+					     rb_node));
+
+	rb_erase(&eht_set->rb_node, &eht_set->pg->eht_set_tree);
+	RB_CLEAR_NODE(&eht_set->rb_node);
+	hlist_add_head(&eht_set->mcast_gc.gc_node, &eht_set->br->mcast_gc_list);
+	queue_work(system_long_wq, &eht_set->br->mcast_gc_work);
+}
+
+/* Delete every source set that is still linked in @pg's set tree. */
+void br_multicast_eht_clean_sets(struct net_bridge_port_group *pg)
+{
+	struct rb_node *first;
+
+	/* always restart from the leftmost node, deletion reshapes the tree */
+	for (first = rb_first(&pg->eht_set_tree); first;
+	     first = rb_first(&pg->eht_set_tree))
+		br_multicast_del_eht_set(rb_entry(first,
+						  struct net_bridge_group_eht_set,
+						  rb_node));
+}
+
+/* Timer callback of a single set entry. With the bridge multicast lock held,
+ * the entry is deleted unless it has already been unlinked from its set's
+ * tree or its timer is pending again.
+ */
+static void br_multicast_eht_set_entry_expired(struct timer_list *t)
+{
+	struct net_bridge_group_eht_set_entry *set_h;
+	struct net_bridge *br;
+
+	set_h = timer_container_of(set_h, t, timer);
+	br = set_h->br;
+
+	spin_lock(&br->multicast_lock);
+	if (!RB_EMPTY_NODE(&set_h->rb_node) && !timer_pending(&set_h->timer))
+		br_multicast_del_eht_set_entry(set_h->eht_set->pg,
+					       &set_h->eht_set->src_addr,
+					       &set_h->h_addr);
+	spin_unlock(&br->multicast_lock);
+}
+
+/* Timer callback of a source set. With the bridge multicast lock held, the
+ * set is deleted unless it has already been unlinked from the port group's
+ * tree or its timer is pending again.
+ */
+static void br_multicast_eht_set_expired(struct timer_list *t)
+{
+	struct net_bridge_group_eht_set *eht_set;
+	struct net_bridge *br;
+
+	eht_set = timer_container_of(eht_set, t, timer);
+	br = eht_set->br;
+
+	spin_lock(&br->multicast_lock);
+	if (!RB_EMPTY_NODE(&eht_set->rb_node) && !timer_pending(&eht_set->timer))
+		br_multicast_del_eht_set(eht_set);
+	spin_unlock(&br->multicast_lock);
+}
+
+/* Return the host keyed by @h_addr in @pg's host tree, creating it when it
+ * does not exist yet. Creation is refused when
+ * br_multicast_eht_hosts_over_limit() reports the port group as over its
+ * limit, or when the atomic allocation fails; NULL is returned in both cases.
+ * A new host starts with @filter_mode and an empty entry list.
+ */
+static struct net_bridge_group_eht_host *
+__eht_lookup_create_host(struct net_bridge_port_group *pg,
+			 union net_bridge_eht_addr *h_addr,
+			 unsigned char filter_mode)
+{
+	struct rb_node **slot = &pg->eht_host_tree.rb_node;
+	struct net_bridge_group_eht_host *host;
+	struct rb_node *parent = NULL;
+	int cmp;
+
+	/* descend to either the matching host or the insertion slot */
+	while (*slot) {
+		host = rb_entry(*slot, struct net_bridge_group_eht_host,
+				rb_node);
+		cmp = memcmp(h_addr, &host->h_addr, sizeof(*h_addr));
+		if (!cmp)
+			return host;
+
+		parent = *slot;
+		slot = cmp < 0 ? &parent->rb_left : &parent->rb_right;
+	}
+
+	if (br_multicast_eht_hosts_over_limit(pg))
+		return NULL;
+
+	host = kzalloc(sizeof(*host), GFP_ATOMIC);
+	if (!host)
+		return NULL;
+
+	host->pg = pg;
+	host->filter_mode = filter_mode;
+	INIT_HLIST_HEAD(&host->set_entries);
+	memcpy(&host->h_addr, h_addr, sizeof(*h_addr));
+
+	rb_link_node(&host->rb_node, parent, slot);
+	rb_insert_color(&host->rb_node, &pg->eht_host_tree);
+
+	br_multicast_eht_hosts_inc(pg);
+
+	return host;
+}
+
+/* Return @eht_host's entry in @eht_set, creating it when it does not exist
+ * yet. Unless @allow_zero_src is set, creation is refused once the host has
+ * PG_SRC_ENT_LIMIT counted entries. A new entry is linked into the set's
+ * tree and the host's entry list and gets its expiry timer set up (not
+ * armed). Returns NULL on refusal or allocation failure.
+ */
+static struct net_bridge_group_eht_set_entry *
+__eht_lookup_create_set_entry(struct net_bridge *br,
+			      struct net_bridge_group_eht_set *eht_set,
+			      struct net_bridge_group_eht_host *eht_host,
+			      bool allow_zero_src)
+{
+	struct rb_node **slot = &eht_set->entry_tree.rb_node;
+	struct net_bridge_group_eht_set_entry *entry;
+	struct rb_node *parent = NULL;
+	int cmp;
+
+	/* descend to either the matching entry or the insertion slot */
+	while (*slot) {
+		entry = rb_entry(*slot, struct net_bridge_group_eht_set_entry,
+				 rb_node);
+		cmp = memcmp(&eht_host->h_addr, &entry->h_addr,
+			     sizeof(union net_bridge_eht_addr));
+		if (!cmp)
+			return entry;
+
+		parent = *slot;
+		slot = cmp < 0 ? &parent->rb_left : &parent->rb_right;
+	}
+
+	/* always allow auto-created zero entry */
+	if (!allow_zero_src && eht_host->num_entries >= PG_SRC_ENT_LIMIT)
+		return NULL;
+
+	entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
+	if (!entry)
+		return NULL;
+
+	entry->br = br;
+	entry->eht_set = eht_set;
+	entry->h_parent = eht_host;
+	entry->mcast_gc.destroy = br_multicast_destroy_eht_set_entry;
+	memcpy(&entry->h_addr, &eht_host->h_addr,
+	       sizeof(union net_bridge_eht_addr));
+	timer_setup(&entry->timer, br_multicast_eht_set_entry_expired, 0);
+
+	hlist_add_head(&entry->host_list, &eht_host->set_entries);
+	rb_link_node(&entry->rb_node, parent, slot);
+	rb_insert_color(&entry->rb_node, &eht_set->entry_tree);
+	/* we must not count the auto-created zero entry otherwise we won't be
+	 * able to track the full list of PG_SRC_ENT_LIMIT entries
+	 */
+	if (!allow_zero_src)
+		eht_host->num_entries++;
+
+	return entry;
+}
+
+/* Return the set keyed by @src_addr in @pg's set tree, creating it when it
+ * does not exist yet. A new set starts with an empty entry tree, records the
+ * port group and its bridge, and gets its expiry timer set up (not armed).
+ * Returns NULL when the atomic allocation fails.
+ */
+static struct net_bridge_group_eht_set *
+__eht_lookup_create_set(struct net_bridge_port_group *pg,
+			union net_bridge_eht_addr *src_addr)
+{
+	struct rb_node **slot = &pg->eht_set_tree.rb_node;
+	struct net_bridge_group_eht_set *set;
+	struct rb_node *parent = NULL;
+	int cmp;
+
+	/* descend to either the matching set or the insertion slot */
+	while (*slot) {
+		set = rb_entry(*slot, struct net_bridge_group_eht_set,
+			       rb_node);
+		cmp = memcmp(src_addr, &set->src_addr, sizeof(*src_addr));
+		if (!cmp)
+			return set;
+
+		parent = *slot;
+		slot = cmp < 0 ? &parent->rb_left : &parent->rb_right;
+	}
+
+	set = kzalloc(sizeof(*set), GFP_ATOMIC);
+	if (!set)
+		return NULL;
+
+	set->pg = pg;
+	set->br = pg->key.port->br;
+	set->entry_tree = RB_ROOT;
+	set->mcast_gc.destroy = br_multicast_destroy_eht_set;
+	memcpy(&set->src_addr, src_addr, sizeof(*src_addr));
+	timer_setup(&set->timer, br_multicast_eht_set_expired, 0);
+
+	rb_link_node(&set->rb_node, parent, slot);
+	rb_insert_color(&set->rb_node, &pg->eht_set_tree);
+
+	return set;
+}
+
+/* Copy the source address of @src into @dest according to @src->proto.
+ * @dest is left untouched for any protocol other than IPv4 and, when IPv6 is
+ * enabled, IPv6.
+ */
+static void br_multicast_ip_src_to_eht_addr(const struct br_ip *src,
+					    union net_bridge_eht_addr *dest)
+{
+	if (src->proto == htons(ETH_P_IP)) {
+		dest->ip4 = src->src.ip4;
+		return;
+	}
+#if IS_ENABLED(CONFIG_IPV6)
+	if (src->proto == htons(ETH_P_IPV6))
+		memcpy(&dest->ip6, &src->src.ip6, sizeof(struct in6_addr));
+#endif
+}
+
+/* Switch host @h_addr to @filter_mode. If the host is already in @pg's host
+ * tree, its stored mode is updated. Then the host's zero-source set entry is
+ * adjusted: deleted for MCAST_INCLUDE, created (or refreshed) for
+ * MCAST_EXCLUDE. Any other mode leaves the zero-source entry alone.
+ */
+static void br_eht_convert_host_filter_mode(const struct net_bridge_mcast *brmctx,
+					    struct net_bridge_port_group *pg,
+					    union net_bridge_eht_addr *h_addr,
+					    int filter_mode)
+{
+	struct net_bridge_group_eht_host *host;
+	union net_bridge_eht_addr zero_addr;
+
+	host = br_multicast_eht_host_lookup(pg, h_addr);
+	if (host)
+		host->filter_mode = filter_mode;
+
+	memset(&zero_addr, 0, sizeof(zero_addr));
+	if (filter_mode == MCAST_INCLUDE)
+		br_multicast_del_eht_set_entry(pg, &zero_addr, h_addr);
+	else if (filter_mode == MCAST_EXCLUDE)
+		br_multicast_create_eht_set_entry(brmctx, pg, &zero_addr,
+						  h_addr, MCAST_EXCLUDE,
+						  true);
+}
+
+/* Make sure host @h_addr has an entry in the set for source @src_addr,
+ * creating the set, the host and the entry as needed, and (re)arm both the
+ * entry timer and the set timer to expire br_multicast_gmi() from now.
+ * A zero @src_addr is only accepted when @allow_zero_src is set. On failure,
+ * a host or set that ends up without entries is destroyed again.
+ */
+static void br_multicast_create_eht_set_entry(const struct net_bridge_mcast *brmctx,
+					      struct net_bridge_port_group *pg,
+					      union net_bridge_eht_addr *src_addr,
+					      union net_bridge_eht_addr *h_addr,
+					      int filter_mode,
+					      bool allow_zero_src)
+{
+	struct net_bridge *br = pg->key.port->br;
+	struct net_bridge_group_eht_set_entry *entry;
+	struct net_bridge_group_eht_host *host;
+	struct net_bridge_group_eht_set *set;
+	union net_bridge_eht_addr zero_addr;
+
+	memset(&zero_addr, 0, sizeof(zero_addr));
+	if (!allow_zero_src && !memcmp(src_addr, &zero_addr, sizeof(zero_addr)))
+		return;
+
+	set = __eht_lookup_create_set(pg, src_addr);
+	if (!set)
+		return;
+
+	host = __eht_lookup_create_host(pg, h_addr, filter_mode);
+	if (!host)
+		goto drop_set;
+
+	entry = __eht_lookup_create_set_entry(br, set, host, allow_zero_src);
+	if (!entry)
+		goto drop_host;
+
+	mod_timer(&entry->timer, jiffies + br_multicast_gmi(brmctx));
+	mod_timer(&set->timer, jiffies + br_multicast_gmi(brmctx));
+
+	return;
+
+drop_host:
+	/* only undo the host if it does not own any entries */
+	if (hlist_empty(&host->set_entries))
+		__eht_destroy_host(host);
+drop_set:
+	/* only undo the set if it does not hold any entries */
+	if (RB_EMPTY_ROOT(&set->entry_tree))
+		br_multicast_del_eht_set(set);
+}
+
+/* Delete host @h_addr's entry from the set for source @src_addr, and delete
+ * the set itself when that was its last entry. Returns true only when the
+ * set was deleted, false otherwise (including when the set or the entry does
+ * not exist).
+ */
+static bool br_multicast_del_eht_set_entry(struct net_bridge_port_group *pg,
+					   union net_bridge_eht_addr *src_addr,
+					   union net_bridge_eht_addr *h_addr)
+{
+	struct net_bridge_group_eht_set_entry *entry;
+	struct net_bridge_group_eht_set *set;
+
+	set = br_multicast_eht_set_lookup(pg, src_addr);
+	if (!set)
+		return false;
+
+	entry = br_multicast_eht_set_entry_lookup(set, h_addr);
+	if (!entry)
+		return false;
+
+	__eht_del_set_entry(entry);
+	if (!RB_EMPTY_ROOT(&set->entry_tree))
+		return false;
+
+	br_multicast_del_eht_set(set);
+
+	return true;
+}
+
+/* Delete every set entry owned by host @h_addr in @pg. Does nothing when the
+ * host is not in the host tree.
+ */
+static void br_multicast_del_eht_host(struct net_bridge_port_group *pg,
+				      union net_bridge_eht_addr *h_addr)
+{
+	struct net_bridge_group_eht_set_entry *entry;
+	struct net_bridge_group_eht_host *host;
+	struct hlist_node *next;
+
+	host = br_multicast_eht_host_lookup(pg, h_addr);
+	if (!host)
+		return;
+
+	/* safe iteration: each call unlinks the current entry */
+	hlist_for_each_entry_safe(entry, next, &host->set_entries, host_list)
+		br_multicast_del_eht_set_entry(entry->eht_set->pg,
+					       &entry->eht_set->src_addr,
+					       &entry->h_addr);
+}
+
+/* create new set entries from reports
+ *
+ * srcs is read as an array of nsrcs source addresses, addr_size bytes each.
+ * Each address is copied into a union net_bridge_eht_addr that was zeroed
+ * once before the loop, so bytes beyond addr_size stay zero, and is then
+ * passed with h_addr and filter_mode to br_multicast_create_eht_set_entry().
+ * NOTE(review): the trailing false is presumably allow_zero_src, i.e. an
+ * all-zero source address is skipped - confirm against the callee signature.
+ */
+static void __eht_create_set_entries(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size,
+ int filter_mode)
+{
+ union net_bridge_eht_addr eht_src_addr;
+ u32 src_idx;
+
+ memset(&eht_src_addr, 0, sizeof(eht_src_addr));
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size);
+ br_multicast_create_eht_set_entry(brmctx, pg, &eht_src_addr,
+ h_addr, filter_mode,
+ false);
+ }
+}
+
+/* delete existing set entries and their (S,G) entries if they were the last
+ *
+ * srcs is read as nsrcs addresses of addr_size bytes each. For every address
+ * the entry of host h_addr in the matching set is removed. Only if that
+ * removal deleted the whole set is the group source with the same address
+ * looked up in pg and, when found, deleted as well.
+ * Returns true if at least one group source was deleted.
+ *
+ * NOTE(review): the address is copied to the start of struct br_ip, which
+ * assumes the source address is the first member of that struct - confirm
+ * against its definition.
+ */
+static bool __eht_del_set_entries(struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size)
+{
+ union net_bridge_eht_addr eht_src_addr;
+ struct net_bridge_group_src *src_ent;
+ bool changed = false;
+ struct br_ip src_ip;
+ u32 src_idx;
+
+ memset(&eht_src_addr, 0, sizeof(eht_src_addr));
+ memset(&src_ip, 0, sizeof(src_ip));
+ src_ip.proto = pg->key.addr.proto; /* same protocol as the port group key */
+ for (src_idx = 0; src_idx < nsrcs; src_idx++) {
+ memcpy(&eht_src_addr, srcs + (src_idx * addr_size), addr_size);
+ if (!br_multicast_del_eht_set_entry(pg, &eht_src_addr, h_addr))
+ continue; /* set still has entries, or nothing was found */
+ memcpy(&src_ip, srcs + (src_idx * addr_size), addr_size);
+ src_ent = br_multicast_find_group_src(pg, &src_ip);
+ if (!src_ent)
+ continue;
+ br_multicast_del_group_src(src_ent, true);
+ changed = true;
+ }
+
+ return changed;
+}
+/* Apply an ALLOW_NEW_SOURCES style update from host h_addr, depending on the
+ * filter mode currently reported for that host:
+ * - MCAST_INCLUDE: create set entries for the listed sources
+ * - MCAST_EXCLUDE: delete the set entries of the host for the listed
+ *   sources, plus group sources whose set became empty
+ * Any other mode value is ignored (the switch has no default).
+ * Returns true if a group source was deleted (only possible for EXCLUDE).
+ */
+static bool br_multicast_eht_allow(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size)
+{
+ bool changed = false;
+
+ switch (br_multicast_eht_host_filter_mode(pg, h_addr)) {
+ case MCAST_INCLUDE:
+ __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs,
+ addr_size, MCAST_INCLUDE);
+ break;
+ case MCAST_EXCLUDE:
+ changed = __eht_del_set_entries(pg, h_addr, srcs, nsrcs,
+ addr_size);
+ break;
+ }
+
+ return changed;
+}
+/* Apply a BLOCK_OLD_SOURCES style update from host h_addr, depending on the
+ * filter mode currently reported for that host:
+ * - MCAST_INCLUDE: delete the set entries of the host for the listed
+ *   sources, plus group sources whose set became empty
+ * - MCAST_EXCLUDE: create set entries for the listed sources
+ * Any other mode value is ignored (the switch has no default).
+ * Returns true if a group source was deleted (only possible for INCLUDE).
+ */
+static bool br_multicast_eht_block(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size)
+{
+ bool changed = false;
+
+ switch (br_multicast_eht_host_filter_mode(pg, h_addr)) {
+ case MCAST_INCLUDE:
+ changed = __eht_del_set_entries(pg, h_addr, srcs, nsrcs,
+ addr_size);
+ break;
+ case MCAST_EXCLUDE:
+ __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ MCAST_EXCLUDE);
+ break;
+ }
+
+ return changed;
+}
+
+/* flush_entries is true when changing mode
+ *
+ * Common handler for INCLUDE and EXCLUDE reports from host h_addr.
+ * flush_entries starts out as to_report and is also set when the filter mode
+ * currently reported for the host differs from filter_mode. When it is set,
+ * the set entries of the host are deleted before entries for srcs are
+ * created, and afterwards every source in pg->src_list that has no EHT set
+ * any more is deleted.
+ * Returns true if at least one group source was deleted.
+ *
+ * NOTE(review): for sources that keep their set, the set is looked up a
+ * second time before its timer is re-armed; the first result is not kept.
+ */
+static bool __eht_inc_exc(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size,
+ unsigned char filter_mode,
+ bool to_report)
+{
+ bool changed = false, flush_entries = to_report;
+ union net_bridge_eht_addr eht_src_addr;
+
+ if (br_multicast_eht_host_filter_mode(pg, h_addr) != filter_mode)
+ flush_entries = true;
+
+ memset(&eht_src_addr, 0, sizeof(eht_src_addr));
+ /* if we're changing mode del host and its entries */
+ if (flush_entries)
+ br_multicast_del_eht_host(pg, h_addr);
+ __eht_create_set_entries(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ filter_mode);
+ /* we can be missing sets only if we've deleted some entries */
+ if (flush_entries) {
+ struct net_bridge_group_eht_set *eht_set;
+ struct net_bridge_group_src *src_ent;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(src_ent, tmp, &pg->src_list, node) {
+ br_multicast_ip_src_to_eht_addr(&src_ent->addr,
+ &eht_src_addr);
+ if (!br_multicast_eht_set_lookup(pg, &eht_src_addr)) {
+ br_multicast_del_group_src(src_ent, true);
+ changed = true;
+ continue;
+ }
+ /* this is an optimization for TO_INCLUDE where we lower
+ * the set's timeout to LMQT to catch timeout hosts:
+ * - host A (timing out): set entries X, Y
+ * - host B: set entry Z (new from current TO_INCLUDE)
+ * sends BLOCK Z after LMQT but host A's EHT
+ * entries still exist (unless lowered to LMQT
+ * so they can timeout with the S,Gs)
+ * => we wait another LMQT, when we can just delete the
+ * group immediately
+ */
+ if (!(src_ent->flags & BR_SGRP_F_SEND) ||
+ filter_mode != MCAST_INCLUDE ||
+ !to_report)
+ continue; /* timer is lowered only for TO_INCLUDE with BR_SGRP_F_SEND set */
+ eht_set = br_multicast_eht_set_lookup(pg,
+ &eht_src_addr);
+ if (!eht_set)
+ continue;
+ mod_timer(&eht_set->timer, jiffies + br_multicast_lmqt(brmctx));
+ }
+ }
+
+ return changed;
+}
+/* Handle an INCLUDE report from host h_addr: run __eht_inc_exc() with
+ * MCAST_INCLUDE, then call br_eht_convert_host_filter_mode() for the host
+ * with MCAST_INCLUDE. to_report is forwarded unchanged.
+ * Returns the result of __eht_inc_exc().
+ */
+static bool br_multicast_eht_inc(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size,
+ bool to_report)
+{
+ bool changed;
+
+ changed = __eht_inc_exc(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ MCAST_INCLUDE, to_report);
+ br_eht_convert_host_filter_mode(brmctx, pg, h_addr, MCAST_INCLUDE);
+
+ return changed;
+}
+/* Handle an EXCLUDE report from host h_addr: run __eht_inc_exc() with
+ * MCAST_EXCLUDE, then call br_eht_convert_host_filter_mode() for the host
+ * with MCAST_EXCLUDE. to_report is forwarded unchanged.
+ * Returns the result of __eht_inc_exc().
+ */
+static bool br_multicast_eht_exc(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size,
+ bool to_report)
+{
+ bool changed;
+
+ changed = __eht_inc_exc(brmctx, pg, h_addr, srcs, nsrcs, addr_size,
+ MCAST_EXCLUDE, to_report);
+ br_eht_convert_host_filter_mode(brmctx, pg, h_addr, MCAST_EXCLUDE);
+
+ return changed;
+}
+/* Dispatch one IGMPv3 group record to the matching EHT handler, using
+ * sizeof(__be32) as the source address size. to_report is set only for the
+ * CHANGE_TO_* record types, which then fall through to the MODE_IS_*
+ * handling. Record types not listed are ignored.
+ * Returns the result of the handler, except for ALLOW_NEW_SOURCES whose
+ * result is not propagated (false is returned).
+ */
+static bool __eht_ip4_handle(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ int grec_type)
+{
+ bool changed = false, to_report = false;
+
+ switch (grec_type) {
+ case IGMPV3_ALLOW_NEW_SOURCES:
+ br_multicast_eht_allow(brmctx, pg, h_addr, srcs, nsrcs,
+ sizeof(__be32));
+ break;
+ case IGMPV3_BLOCK_OLD_SOURCES:
+ changed = br_multicast_eht_block(brmctx, pg, h_addr, srcs, nsrcs,
+ sizeof(__be32));
+ break;
+ case IGMPV3_CHANGE_TO_INCLUDE:
+ to_report = true;
+ fallthrough;
+ case IGMPV3_MODE_IS_INCLUDE:
+ changed = br_multicast_eht_inc(brmctx, pg, h_addr, srcs, nsrcs,
+ sizeof(__be32), to_report);
+ break;
+ case IGMPV3_CHANGE_TO_EXCLUDE:
+ to_report = true;
+ fallthrough;
+ case IGMPV3_MODE_IS_EXCLUDE:
+ changed = br_multicast_eht_exc(brmctx, pg, h_addr, srcs, nsrcs,
+ sizeof(__be32), to_report);
+ break;
+ }
+
+ return changed;
+}
+/* Dispatch one MLDv2 group record to the matching EHT handler, using
+ * sizeof(struct in6_addr) as the source address size; built only when
+ * CONFIG_IPV6 is enabled. to_report is set only for the CHANGE_TO_* record
+ * types, which then fall through to the MODE_IS_* handling. Record types not
+ * listed are ignored.
+ * Returns the result of the handler, except for ALLOW_NEW_SOURCES whose
+ * result is not propagated (false is returned).
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+static bool __eht_ip6_handle(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ union net_bridge_eht_addr *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ int grec_type)
+{
+ bool changed = false, to_report = false;
+
+ switch (grec_type) {
+ case MLD2_ALLOW_NEW_SOURCES:
+ br_multicast_eht_allow(brmctx, pg, h_addr, srcs, nsrcs,
+ sizeof(struct in6_addr));
+ break;
+ case MLD2_BLOCK_OLD_SOURCES:
+ changed = br_multicast_eht_block(brmctx, pg, h_addr, srcs, nsrcs,
+ sizeof(struct in6_addr));
+ break;
+ case MLD2_CHANGE_TO_INCLUDE:
+ to_report = true;
+ fallthrough;
+ case MLD2_MODE_IS_INCLUDE:
+ changed = br_multicast_eht_inc(brmctx, pg, h_addr, srcs, nsrcs,
+ sizeof(struct in6_addr),
+ to_report);
+ break;
+ case MLD2_CHANGE_TO_EXCLUDE:
+ to_report = true;
+ fallthrough;
+ case MLD2_MODE_IS_EXCLUDE:
+ changed = br_multicast_eht_exc(brmctx, pg, h_addr, srcs, nsrcs,
+ sizeof(struct in6_addr),
+ to_report);
+ break;
+ }
+
+ return changed;
+}
+#endif
+
+/* true means an entry was deleted
+ *
+ * Entry point for EHT processing of one group record. Does nothing and
+ * returns false unless BR_MULTICAST_FAST_LEAVE is set in the flags of the
+ * port that pg belongs to. The host address (addr_size bytes at h_addr) is
+ * copied into a zeroed union net_bridge_eht_addr. addr_size equal to
+ * sizeof(__be32) selects the IPv4 handler; any other size selects the IPv6
+ * handler when CONFIG_IPV6 is enabled and is ignored otherwise.
+ */
+bool br_multicast_eht_handle(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ void *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size,
+ int grec_type)
+{
+ bool eht_enabled = !!(pg->key.port->flags & BR_MULTICAST_FAST_LEAVE);
+ union net_bridge_eht_addr eht_host_addr;
+ bool changed = false;
+
+ if (!eht_enabled)
+ goto out;
+
+ memset(&eht_host_addr, 0, sizeof(eht_host_addr));
+ memcpy(&eht_host_addr, h_addr, addr_size);
+ if (addr_size == sizeof(__be32))
+ changed = __eht_ip4_handle(brmctx, pg, &eht_host_addr, srcs,
+ nsrcs, grec_type);
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ changed = __eht_ip6_handle(brmctx, pg, &eht_host_addr, srcs,
+ nsrcs, grec_type);
+#endif
+
+out:
+ return changed;
+}
+/* Store eht_hosts_limit in p->multicast_eht_hosts_limit while holding the
+ * multicast_lock of the bridge that p belongs to.
+ * Returns -EINVAL for a limit of 0 (nothing is changed), 0 on success.
+ */
+int br_multicast_eht_set_hosts_limit(struct net_bridge_port *p,
+ u32 eht_hosts_limit)
+{
+ struct net_bridge *br = p->br;
+
+ if (!eht_hosts_limit)
+ return -EINVAL;
+
+ spin_lock_bh(&br->multicast_lock);
+ p->multicast_eht_hosts_limit = eht_hosts_limit;
+ spin_unlock_bh(&br->multicast_lock);
+
+ return 0;
+}
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 37278dc280eb..083e2fe96441 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Handle firewalling
* Linux ethernet bridge
@@ -6,11 +7,6 @@
* Lennert Buytenhek <buytenh@gnu.org>
* Bart De Schuymer <bdschuym@pandora.be>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Lennert dedicates this file to Kerstin Wurdinger.
*/
@@ -37,9 +33,11 @@
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
+#include <net/dst_metadata.h>
#include <net/route.h>
#include <net/netfilter/br_netfilter.h>
#include <net/netns/generic.h>
+#include <net/inet_dscp.h>
#include <linux/uaccess.h>
#include "br_private.h"
@@ -47,29 +45,30 @@
#include <linux/sysctl.h>
#endif
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_core.h>
+#endif
+
static unsigned int brnf_net_id __read_mostly;
struct brnf_net {
bool enabled;
-};
#ifdef CONFIG_SYSCTL
-static struct ctl_table_header *brnf_sysctl_header;
-static int brnf_call_iptables __read_mostly = 1;
-static int brnf_call_ip6tables __read_mostly = 1;
-static int brnf_call_arptables __read_mostly = 1;
-static int brnf_filter_vlan_tagged __read_mostly;
-static int brnf_filter_pppoe_tagged __read_mostly;
-static int brnf_pass_vlan_indev __read_mostly;
-#else
-#define brnf_call_iptables 1
-#define brnf_call_ip6tables 1
-#define brnf_call_arptables 1
-#define brnf_filter_vlan_tagged 0
-#define brnf_filter_pppoe_tagged 0
-#define brnf_pass_vlan_indev 0
+ struct ctl_table_header *ctl_hdr;
#endif
+ /* default value is 1 */
+ int call_iptables;
+ int call_ip6tables;
+ int call_arptables;
+
+ /* default value is 0 */
+ int filter_vlan_tagged;
+ int filter_pppoe_tagged;
+ int pass_vlan_indev;
+};
+
#define IS_IP(skb) \
(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))
@@ -89,17 +88,28 @@ static inline __be16 vlan_proto(const struct sk_buff *skb)
return 0;
}
-#define IS_VLAN_IP(skb) \
- (vlan_proto(skb) == htons(ETH_P_IP) && \
- brnf_filter_vlan_tagged)
+static inline bool is_vlan_ip(const struct sk_buff *skb, const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id); /* br_netfilter settings looked up for net */
+
+ return vlan_proto(skb) == htons(ETH_P_IP) && brnet->filter_vlan_tagged; /* ETH_P_IP reported by vlan_proto(), and only while filter_vlan_tagged is non-zero */
+}
+
+static inline bool is_vlan_ipv6(const struct sk_buff *skb,
+ const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id); /* br_netfilter settings looked up for net */
+
+ return vlan_proto(skb) == htons(ETH_P_IPV6) &&
+ brnet->filter_vlan_tagged; /* ETH_P_IPV6 reported by vlan_proto(), and only while filter_vlan_tagged is non-zero */
+}
-#define IS_VLAN_IPV6(skb) \
- (vlan_proto(skb) == htons(ETH_P_IPV6) && \
- brnf_filter_vlan_tagged)
+static inline bool is_vlan_arp(const struct sk_buff *skb, const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id); /* br_netfilter settings looked up for net */
-#define IS_VLAN_ARP(skb) \
- (vlan_proto(skb) == htons(ETH_P_ARP) && \
- brnf_filter_vlan_tagged)
+ return vlan_proto(skb) == htons(ETH_P_ARP) && brnet->filter_vlan_tagged; /* ETH_P_ARP reported by vlan_proto(), and only while filter_vlan_tagged is non-zero */
+}
static inline __be16 pppoe_proto(const struct sk_buff *skb)
{
@@ -107,20 +117,29 @@ static inline __be16 pppoe_proto(const struct sk_buff *skb)
sizeof(struct pppoe_hdr)));
}
-#define IS_PPPOE_IP(skb) \
- (skb->protocol == htons(ETH_P_PPP_SES) && \
- pppoe_proto(skb) == htons(PPP_IP) && \
- brnf_filter_pppoe_tagged)
+static inline bool is_pppoe_ip(const struct sk_buff *skb, const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id); /* br_netfilter settings looked up for net */
+
+ return skb->protocol == htons(ETH_P_PPP_SES) &&
+ pppoe_proto(skb) == htons(PPP_IP) && brnet->filter_pppoe_tagged; /* PPPoE session frame carrying PPP_IP, and only while filter_pppoe_tagged is non-zero */
+}
+
+static inline bool is_pppoe_ipv6(const struct sk_buff *skb,
+ const struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id); /* br_netfilter settings looked up for net */
-#define IS_PPPOE_IPV6(skb) \
- (skb->protocol == htons(ETH_P_PPP_SES) && \
- pppoe_proto(skb) == htons(PPP_IPV6) && \
- brnf_filter_pppoe_tagged)
+ return skb->protocol == htons(ETH_P_PPP_SES) &&
+ pppoe_proto(skb) == htons(PPP_IPV6) &&
+ brnet->filter_pppoe_tagged; /* PPPoE session frame carrying PPP_IPV6, and only while filter_pppoe_tagged is non-zero */
+}
/* largest possible L2 header, see br_nf_dev_queue_xmit() */
#define NF_BRIDGE_MAX_MAC_HEADER_LENGTH (PPPOE_SES_HLEN + ETH_HLEN)
struct brnf_frag_data {
+ local_lock_t bh_lock;
char mac[NF_BRIDGE_MAX_MAC_HEADER_LENGTH];
u8 encap_size;
u8 size;
@@ -128,14 +147,13 @@ struct brnf_frag_data {
__be16 vlan_proto;
};
-static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage);
+static DEFINE_PER_CPU(struct brnf_frag_data, brnf_frag_data_storage) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
static void nf_bridge_info_free(struct sk_buff *skb)
{
- if (skb->nf_bridge) {
- nf_bridge_put(skb->nf_bridge);
- skb->nf_bridge = NULL;
- }
+ skb_ext_del(skb, SKB_EXT_BRIDGE_NF);
}
static inline struct net_device *bridge_parent(const struct net_device *dev)
@@ -148,19 +166,7 @@ static inline struct net_device *bridge_parent(const struct net_device *dev)
static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
{
- struct nf_bridge_info *nf_bridge = skb->nf_bridge;
-
- if (refcount_read(&nf_bridge->use) > 1) {
- struct nf_bridge_info *tmp = nf_bridge_alloc(skb);
-
- if (tmp) {
- memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info));
- refcount_set(&tmp->use, 1);
- }
- nf_bridge_put(nf_bridge);
- nf_bridge = tmp;
- }
- return nf_bridge;
+ return skb_ext_add(skb, SKB_EXT_BRIDGE_NF);
}
unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
@@ -217,7 +223,7 @@ static int br_validate_ipv4(struct net *net, struct sk_buff *skb)
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto csum_error;
- len = ntohs(iph->tot_len);
+ len = skb_ip_totlen(skb);
if (skb->len < len) {
__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
@@ -247,7 +253,9 @@ drop:
void nf_bridge_update_protocol(struct sk_buff *skb)
{
- switch (skb->nf_bridge->orig_proto) {
+ const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ switch (nf_bridge->orig_proto) {
case BRNF_PROTO_8021Q:
skb->protocol = htons(ETH_P_8021Q);
break;
@@ -278,9 +286,19 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
int ret;
- if (neigh->hh.hh_len) {
+ if ((READ_ONCE(neigh->nud_state) & NUD_CONNECTED) &&
+ READ_ONCE(neigh->hh.hh_len)) {
+ struct net_device *br_indev;
+
+ br_indev = nf_bridge_get_physindev(skb, net);
+ if (!br_indev) {
+ neigh_release(neigh);
+ goto free_skb;
+ }
+
neigh_hh_bridge(&neigh->hh, skb);
- skb->dev = nf_bridge->physindev;
+ skb->dev = br_indev;
+
ret = br_handle_frame_finish(net, sk, skb);
} else {
/* the neighbour function below overwrites the complete
@@ -294,7 +312,7 @@ int br_nf_pre_routing_finish_bridge(struct net *net, struct sock *sk, struct sk_
/* tell br_dev_xmit to continue with forwarding */
nf_bridge->bridged_dnat = 1;
/* FIXME Need to refragment */
- ret = neigh->output(neigh, skb);
+ ret = READ_ONCE(neigh->output)(neigh, skb);
}
neigh_release(neigh);
return ret;
@@ -352,11 +370,17 @@ br_nf_ipv4_daddr_was_changed(const struct sk_buff *skb,
*/
static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net_device *dev = skb->dev;
- struct iphdr *iph = ip_hdr(skb);
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ struct net_device *dev = skb->dev, *br_indev;
+ const struct iphdr *iph = ip_hdr(skb);
+ enum skb_drop_reason reason;
struct rtable *rt;
- int err;
+
+ br_indev = nf_bridge_get_physindev(skb, net);
+ if (!br_indev) {
+ kfree_skb(skb);
+ return 0;
+ }
nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
@@ -366,37 +390,14 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_
}
nf_bridge->in_prerouting = 0;
if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) {
- if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
- struct in_device *in_dev = __in_dev_get_rcu(dev);
-
- /* If err equals -EHOSTUNREACH the error is due to a
- * martian destination or due to the fact that
- * forwarding is disabled. For most martian packets,
- * ip_route_output_key() will fail. It won't fail for 2 types of
- * martian destinations: loopback destinations and destination
- * 0.0.0.0. In both cases the packet will be dropped because the
- * destination is the loopback device and not the bridge. */
- if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev))
- goto free_skb;
-
- rt = ip_route_output(net, iph->daddr, 0,
- RT_TOS(iph->tos), 0);
- if (!IS_ERR(rt)) {
- /* - Bridged-and-DNAT'ed traffic doesn't
- * require ip_forwarding. */
- if (rt->dst.dev == dev) {
- skb_dst_set(skb, &rt->dst);
- goto bridged_dnat;
- }
- ip_rt_put(rt);
- }
-free_skb:
- kfree_skb(skb);
+ reason = ip_route_input(skb, iph->daddr, iph->saddr,
+ ip4h_dscp(iph), dev);
+ if (reason) {
+ kfree_skb_reason(skb, reason);
return 0;
} else {
if (skb_dst(skb)->dev == dev) {
-bridged_dnat:
- skb->dev = nf_bridge->physindev;
+ skb->dev = br_indev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
br_nf_hook_thresh(NF_BR_PRE_ROUTING,
@@ -409,15 +410,16 @@ bridged_dnat:
skb->pkt_type = PACKET_HOST;
}
} else {
- rt = bridge_parent_rtable(nf_bridge->physindev);
+ rt = bridge_parent_rtable(br_indev);
if (!rt) {
kfree_skb(skb);
return 0;
}
+ skb_dst_drop(skb);
skb_dst_set_noref(skb, &rt->dst);
}
- skb->dev = nf_bridge->physindev;
+ skb->dev = br_indev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb, skb->dev, NULL,
@@ -425,12 +427,16 @@ bridged_dnat:
return 0;
}
-static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev)
+static struct net_device *brnf_get_logical_dev(struct sk_buff *skb,
+ const struct net_device *dev,
+ const struct net *net)
{
struct net_device *vlan, *br;
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
br = bridge_parent(dev);
- if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
+
+ if (brnet->pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
return br;
vlan = __vlan_find_dev_deep_rcu(br, skb->vlan_proto,
@@ -440,7 +446,7 @@ static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct
}
/* Some common code for IPv4/IPv6 */
-struct net_device *setup_pre_routing(struct sk_buff *skb)
+struct net_device *setup_pre_routing(struct sk_buff *skb, const struct net *net)
{
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
@@ -450,8 +456,8 @@ struct net_device *setup_pre_routing(struct sk_buff *skb)
}
nf_bridge->in_prerouting = 1;
- nf_bridge->physindev = skb->dev;
- skb->dev = brnf_get_logical_dev(skb, skb->dev);
+ nf_bridge->physinif = skb->dev->ifindex;
+ skb->dev = brnf_get_logical_dev(skb, skb->dev, net);
if (skb->protocol == htons(ETH_P_8021Q))
nf_bridge->orig_proto = BRNF_PROTO_8021Q;
@@ -477,44 +483,53 @@ static unsigned int br_nf_pre_routing(void *priv,
struct net_bridge_port *p;
struct net_bridge *br;
__u32 len = nf_bridge_encap_header_len(skb);
+ struct brnf_net *brnet;
if (unlikely(!pskb_may_pull(skb, len)))
- return NF_DROP;
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_PKT_TOO_SMALL, 0);
p = br_port_get_rcu(state->in);
if (p == NULL)
- return NF_DROP;
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);
br = p->br;
- if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) {
- if (!brnf_call_ip6tables && !br->nf_call_ip6tables)
+ brnet = net_generic(state->net, brnf_net_id);
+ if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+ is_pppoe_ipv6(skb, state->net)) {
+ if (!brnet->call_ip6tables &&
+ !br_opt_get(br, BROPT_NF_CALL_IP6TABLES))
return NF_ACCEPT;
+ if (!ipv6_mod_enabled()) {
+ pr_warn_once("Module ipv6 is disabled, so call_ip6tables is not supported.");
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_IPV6DISABLED, 0);
+ }
nf_bridge_pull_encap_header_rcsum(skb);
return br_nf_pre_routing_ipv6(priv, skb, state);
}
- if (!brnf_call_iptables && !br->nf_call_iptables)
+ if (!brnet->call_iptables && !br_opt_get(br, BROPT_NF_CALL_IPTABLES))
return NF_ACCEPT;
- if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb))
+ if (!IS_IP(skb) && !is_vlan_ip(skb, state->net) &&
+ !is_pppoe_ip(skb, state->net))
return NF_ACCEPT;
nf_bridge_pull_encap_header_rcsum(skb);
if (br_validate_ipv4(state->net, skb))
- return NF_DROP;
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);
- nf_bridge_put(skb->nf_bridge);
if (!nf_bridge_alloc(skb))
- return NF_DROP;
- if (!setup_pre_routing(skb))
- return NF_DROP;
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);
+ if (!setup_pre_routing(skb, state->net))
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);
nf_bridge = nf_bridge_info_get(skb);
nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr;
skb->protocol = htons(ETH_P_IP);
+ skb->transport_header = skb->network_header + ip_hdr(skb)->ihl * 4;
NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->net, state->sk, skb,
skb->dev, NULL,
@@ -523,6 +538,97 @@ static unsigned int br_nf_pre_routing(void *priv,
return NF_STOLEN;
}
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+/* conntrack's nf_confirm logic cannot handle cloned skbs referencing
+ * the same nf_conn entry, which will happen for multicast (broadcast)
+ * frames on bridges.
+ *
+ * Example:
+ *    macvlan0
+ *    br0
+ *  ethX  ethY
+ *
+ * ethX (or Y) receives multicast or broadcast packet containing
+ * an IP packet, not yet in conntrack table.
+ *
+ * 1. skb passes through bridge and fake-ip (br_netfilter) prerouting.
+ * -> skb->_nfct now references an unconfirmed entry
+ * 2. skb is broad/mcast packet. bridge now passes clones out on each bridge
+ * interface.
+ * 3. skb gets passed up the stack.
+ * 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb
+ * and schedules a work queue to send them out on the lower devices.
+ *
+ * The clone skb->_nfct is not a copy, it is the same entry as the
+ * original skb. The macvlan rx handler then returns RX_HANDLER_PASS.
+ * 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb.
+ *
+ * The macvlan broadcast worker and normal confirm path will race.
+ *
+ * This race will not happen if step 2 already confirmed a clone. In that
+ * case later steps perform skb_clone() with skb->_nfct already confirmed (in
+ * hash table). This works fine.
+ *
+ * But such confirmation won't happen when eb/ip/nftables rules dropped the
+ * packets before they reached the nf_confirm step in postrouting.
+ *
+ * Work around this problem by explicit confirmation of the entry at
+ * LOCAL_IN time, before upper layer has a chance to clone the unconfirmed
+ * entry.
+ *
+ * Verdicts: skbs flagged promisc in BR_INPUT_SKB_CB get nf_reset_ct() and
+ * are accepted. skbs without conntrack state, PACKET_HOST skbs and skbs whose
+ * entry is already confirmed are accepted unchanged. Otherwise the verdict
+ * of the confirm hook is returned, or NF_ACCEPT when no hook is registered.
+ */
+static unsigned int br_nf_local_in(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ bool promisc = BR_INPUT_SKB_CB(skb)->promisc;
+ struct nf_conntrack *nfct = skb_nfct(skb);
+ const struct nf_ct_hook *ct_hook;
+ struct nf_conn *ct;
+ int ret;
+
+ if (promisc) {
+ nf_reset_ct(skb);
+ return NF_ACCEPT;
+ }
+
+ if (!nfct || skb->pkt_type == PACKET_HOST)
+ return NF_ACCEPT;
+
+ ct = container_of(nfct, struct nf_conn, ct_general);
+ if (likely(nf_ct_is_confirmed(ct)))
+ return NF_ACCEPT;
+
+ if (WARN_ON_ONCE(refcount_read(&nfct->use) != 1)) { /* unexpected extra reference: reset instead of confirming */
+ nf_reset_ct(skb);
+ return NF_ACCEPT;
+ }
+
+ WARN_ON_ONCE(skb_shared(skb));
+
+ /* We can't call nf_confirm here, it would create a dependency
+ * on nf_conntrack module. If no hook is registered, the entry is
+ * detached from the skb and released with nf_conntrack_put().
+ */
+ ct_hook = rcu_dereference(nf_ct_hook);
+ if (!ct_hook) {
+ skb->_nfct = 0ul;
+ nf_conntrack_put(nfct);
+ return NF_ACCEPT;
+ }
+
+ nf_bridge_pull_encap_header(skb); /* pushed back below unless the skb was stolen */
+ ret = ct_hook->confirm(skb);
+ switch (ret & NF_VERDICT_MASK) {
+ case NF_STOLEN: /* return without re-pushing the encap header */
+ return NF_STOLEN;
+ default:
+ nf_bridge_push_encap_header(skb);
+ break;
+ }
+
+ return ret;
+}
+#endif
/* PF_BRIDGE/FORWARD *************************************************/
static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
@@ -530,7 +636,7 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
struct net_device *in;
- if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) {
+ if (!IS_ARP(skb) && !is_vlan_arp(skb, net)) {
if (skb->protocol == htons(ETH_P_IP))
nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
@@ -538,7 +644,11 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff
if (skb->protocol == htons(ETH_P_IPV6))
nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;
- in = nf_bridge->physindev;
+ in = nf_bridge_get_physindev(skb, net);
+ if (!in) {
+ kfree_skb(skb);
+ return 0;
+ }
if (nf_bridge->pkt_otherhost) {
skb->pkt_type = PACKET_OTHERHOST;
nf_bridge->pkt_otherhost = false;
@@ -555,41 +665,29 @@ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff
}
-/* This is the 'purely bridged' case. For IP, we pass the packet to
- * netfilter with indev and outdev set to the bridge device,
- * but we are still able to filter on the 'real' indev/outdev
- * because of the physdev module. For ARP, indev and outdev are the
- * bridge ports. */
-static unsigned int br_nf_forward_ip(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
+static unsigned int br_nf_forward_ip(struct sk_buff *skb,
+ const struct nf_hook_state *state,
+ u8 pf)
{
struct nf_bridge_info *nf_bridge;
struct net_device *parent;
- u_int8_t pf;
- if (!skb->nf_bridge)
+ nf_bridge = nf_bridge_info_get(skb);
+ if (!nf_bridge)
return NF_ACCEPT;
/* Need exclusive nf_bridge_info since we might have multiple
* different physoutdevs. */
if (!nf_bridge_unshare(skb))
- return NF_DROP;
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);
nf_bridge = nf_bridge_info_get(skb);
if (!nf_bridge)
- return NF_DROP;
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);
parent = bridge_parent(state->out);
if (!parent)
- return NF_DROP;
-
- if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
- pf = NFPROTO_IPV4;
- else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
- pf = NFPROTO_IPV6;
- else
- return NF_ACCEPT;
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);
nf_bridge_pull_encap_header(skb);
@@ -600,53 +698,53 @@ static unsigned int br_nf_forward_ip(void *priv,
if (pf == NFPROTO_IPV4) {
if (br_validate_ipv4(state->net, skb))
- return NF_DROP;
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);
IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
- }
-
- if (pf == NFPROTO_IPV6) {
+ skb->protocol = htons(ETH_P_IP);
+ } else if (pf == NFPROTO_IPV6) {
if (br_validate_ipv6(state->net, skb))
- return NF_DROP;
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);
IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;
+ skb->protocol = htons(ETH_P_IPV6);
+ } else {
+ WARN_ON_ONCE(1);
+ return NF_DROP;
}
nf_bridge->physoutdev = skb->dev;
- if (pf == NFPROTO_IPV4)
- skb->protocol = htons(ETH_P_IP);
- else
- skb->protocol = htons(ETH_P_IPV6);
NF_HOOK(pf, NF_INET_FORWARD, state->net, NULL, skb,
- brnf_get_logical_dev(skb, state->in),
+ brnf_get_logical_dev(skb, state->in, state->net),
parent, br_nf_forward_finish);
return NF_STOLEN;
}
-static unsigned int br_nf_forward_arp(void *priv,
- struct sk_buff *skb,
+static unsigned int br_nf_forward_arp(struct sk_buff *skb,
const struct nf_hook_state *state)
{
struct net_bridge_port *p;
struct net_bridge *br;
struct net_device **d = (struct net_device **)(skb->cb);
+ struct brnf_net *brnet;
p = br_port_get_rcu(state->out);
if (p == NULL)
return NF_ACCEPT;
br = p->br;
- if (!brnf_call_arptables && !br->nf_call_arptables)
+ brnet = net_generic(state->net, brnf_net_id);
+ if (!brnet->call_arptables && !br_opt_get(br, BROPT_NF_CALL_ARPTABLES))
return NF_ACCEPT;
- if (!IS_ARP(skb)) {
- if (!IS_VLAN_ARP(skb))
- return NF_ACCEPT;
+ if (is_vlan_arp(skb, state->net))
nf_bridge_pull_encap_header(skb);
- }
+
+ if (unlikely(!pskb_may_pull(skb, sizeof(struct arphdr))))
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_PKT_TOO_SMALL, 0);
if (arp_hdr(skb)->ar_pln != 4) {
- if (IS_VLAN_ARP(skb))
+ if (is_vlan_arp(skb, state->net))
nf_bridge_push_encap_header(skb);
return NF_ACCEPT;
}
@@ -657,6 +755,28 @@ static unsigned int br_nf_forward_arp(void *priv,
return NF_STOLEN;
}
+/* This is the 'purely bridged' case. For IP, we pass the packet to
+ * netfilter with indev and outdev set to the bridge device,
+ * but we are still able to filter on the 'real' indev/outdev
+ * because of the physdev module. For ARP, indev and outdev are the
+ * bridge ports.
+ *
+ * Dispatches on the frame type: IPv4 and IPv6 (plain, VLAN- or
+ * PPPoE-encapsulated) go to br_nf_forward_ip() with the matching NFPROTO_*
+ * value, ARP (plain or VLAN-encapsulated) goes to br_nf_forward_arp().
+ * Anything else is accepted untouched. priv is not used.
+ */
+static unsigned int br_nf_forward(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
+ is_pppoe_ip(skb, state->net))
+ return br_nf_forward_ip(skb, state, NFPROTO_IPV4);
+ if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+ is_pppoe_ipv6(skb, state->net))
+ return br_nf_forward_ip(skb, state, NFPROTO_IPV6);
+ if (IS_ARP(skb) || is_vlan_arp(skb, state->net))
+ return br_nf_forward_arp(skb, state);
+
+ return NF_ACCEPT;
+}
+
static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct brnf_frag_data *data;
@@ -670,10 +790,8 @@ static int br_nf_push_frag_xmit(struct net *net, struct sock *sk, struct sk_buff
return 0;
}
- if (data->vlan_tci) {
- skb->vlan_tci = data->vlan_tci;
- skb->vlan_proto = data->vlan_proto;
- }
+ if (data->vlan_proto)
+ __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci);
skb_copy_to_linear_data_offset(skb, -data->size, data->mac, data->size);
__skb_push(skb, data->encap_size);
@@ -702,7 +820,9 @@ br_nf_ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
static unsigned int nf_bridge_mtu_reduction(const struct sk_buff *skb)
{
- if (skb->nf_bridge->orig_proto == BRNF_PROTO_PPPOE)
+ const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ if (nf_bridge->orig_proto == BRNF_PROTO_PPPOE)
return PPPOE_SES_HLEN;
return 0;
}
@@ -711,18 +831,31 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
{
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
unsigned int mtu, mtu_reserved;
+ int ret;
mtu_reserved = nf_bridge_mtu_reduction(skb);
mtu = skb->dev->mtu;
+ if (nf_bridge->pkt_otherhost) {
+ skb->pkt_type = PACKET_OTHERHOST;
+ nf_bridge->pkt_otherhost = false;
+ }
+
if (nf_bridge->frag_max_size && nf_bridge->frag_max_size < mtu)
mtu = nf_bridge->frag_max_size;
+ nf_bridge_update_protocol(skb);
+ nf_bridge_push_encap_header(skb);
+
if (skb_is_gso(skb) || skb->len + mtu_reserved <= mtu) {
nf_bridge_info_free(skb);
return br_dev_queue_push_xmit(net, sk, skb);
}
+ /* Fragmentation on metadata/template dst is not supported */
+ if (unlikely(!skb_valid_dst(skb)))
+ goto drop;
+
/* This is wrong! We should preserve the original fragment
* boundaries by preserving frag_list rather than refragmenting.
*/
@@ -735,19 +868,25 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
- nf_bridge_update_protocol(skb);
-
+ local_lock_nested_bh(&brnf_frag_data_storage.bh_lock);
data = this_cpu_ptr(&brnf_frag_data_storage);
- data->vlan_tci = skb->vlan_tci;
- data->vlan_proto = skb->vlan_proto;
+ if (skb_vlan_tag_present(skb)) {
+ data->vlan_tci = skb->vlan_tci;
+ data->vlan_proto = skb->vlan_proto;
+ } else {
+ data->vlan_proto = 0;
+ }
+
data->encap_size = nf_bridge_encap_header_len(skb);
data->size = ETH_HLEN + data->encap_size;
skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
data->size);
- return br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit);
+ ret = br_nf_ip_fragment(net, sk, skb, br_nf_push_frag_xmit);
+ local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
+ return ret;
}
if (IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) &&
skb->protocol == htons(ETH_P_IPV6)) {
@@ -759,8 +898,7 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;
- nf_bridge_update_protocol(skb);
-
+ local_lock_nested_bh(&brnf_frag_data_storage.bh_lock);
data = this_cpu_ptr(&brnf_frag_data_storage);
data->encap_size = nf_bridge_encap_header_len(skb);
data->size = ETH_HLEN + data->encap_size;
@@ -768,8 +906,12 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff
skb_copy_from_linear_data_offset(skb, -data->size, data->mac,
data->size);
- if (v6ops)
- return v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit);
+ if (v6ops) {
+ ret = v6ops->fragment(net, sk, skb, br_nf_push_frag_xmit);
+ local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
+ return ret;
+ }
+ local_unlock_nested_bh(&brnf_frag_data_storage.bh_lock);
kfree_skb(skb);
return -EMSGSIZE;
@@ -799,17 +941,17 @@ static unsigned int br_nf_post_routing(void *priv,
return NF_ACCEPT;
if (!realoutdev)
- return NF_DROP;
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);
- if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+ if (IS_IP(skb) || is_vlan_ip(skb, state->net) ||
+ is_pppoe_ip(skb, state->net))
pf = NFPROTO_IPV4;
- else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+ else if (IS_IPV6(skb) || is_vlan_ipv6(skb, state->net) ||
+ is_pppoe_ipv6(skb, state->net))
pf = NFPROTO_IPV6;
else
return NF_ACCEPT;
- /* We assume any code from br_dev_queue_push_xmit onwards doesn't care
- * about the value of skb->pkt_type. */
if (skb->pkt_type == PACKET_OTHERHOST) {
skb->pkt_type = PACKET_HOST;
nf_bridge->pkt_otherhost = true;
@@ -835,10 +977,19 @@ static unsigned int ip_sabotage_in(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- if (skb->nf_bridge && !skb->nf_bridge->in_prerouting &&
- !netif_is_l3_master(skb->dev)) {
- state->okfn(state->net, state->sk, skb);
- return NF_STOLEN;
+ struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ if (nf_bridge) {
+ if (nf_bridge->sabotage_in_done)
+ return NF_ACCEPT;
+
+ if (!nf_bridge->in_prerouting &&
+ !netif_is_l3_master(skb->dev) &&
+ !netif_is_l3_slave(skb->dev)) {
+ nf_bridge->sabotage_in_done = 1;
+ state->okfn(state->net, state->sk, skb);
+ return NF_STOLEN;
+ }
}
return NF_ACCEPT;
@@ -856,6 +1007,13 @@ static unsigned int ip_sabotage_in(void *priv,
static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
{
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+ struct net_device *br_indev;
+
+ br_indev = nf_bridge_get_physindev(skb, dev_net(skb->dev));
+ if (!br_indev) {
+ kfree_skb(skb);
+ return;
+ }
skb_pull(skb, ETH_HLEN);
nf_bridge->bridged_dnat = 0;
@@ -865,7 +1023,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
skb_copy_to_linear_data_offset(skb, -(ETH_HLEN - ETH_ALEN),
nf_bridge->neigh_header,
ETH_HLEN - ETH_ALEN);
- skb->dev = nf_bridge->physindev;
+ skb->dev = br_indev;
nf_bridge->physoutdev = NULL;
br_handle_frame_finish(dev_net(skb->dev), NULL, skb);
@@ -873,7 +1031,9 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb)
static int br_nf_dev_xmit(struct sk_buff *skb)
{
- if (skb->nf_bridge && skb->nf_bridge->bridged_dnat) {
+ const struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
+
+ if (nf_bridge && nf_bridge->bridged_dnat) {
br_nf_pre_routing_finish_bridge_slow(skb);
return 1;
}
@@ -884,11 +1044,6 @@ static const struct nf_br_ops br_ops = {
.br_dev_xmit_hook = br_nf_dev_xmit,
};
-void br_netfilter_enable(void)
-{
-}
-EXPORT_SYMBOL_GPL(br_netfilter_enable);
-
/* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because
* br_dev_queue_push_xmit is called afterwards */
static const struct nf_hook_ops br_nf_ops[] = {
@@ -898,14 +1053,16 @@ static const struct nf_hook_ops br_nf_ops[] = {
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_BR_PRI_BRNF,
},
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
{
- .hook = br_nf_forward_ip,
+ .hook = br_nf_local_in,
.pf = NFPROTO_BRIDGE,
- .hooknum = NF_BR_FORWARD,
- .priority = NF_BR_PRI_BRNF - 1,
+ .hooknum = NF_BR_LOCAL_IN,
+ .priority = NF_BR_PRI_LAST,
},
+#endif
{
- .hook = br_nf_forward_arp,
+ .hook = br_nf_forward,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_FORWARD,
.priority = NF_BR_PRI_BRNF,
@@ -938,7 +1095,7 @@ static int brnf_device_event(struct notifier_block *unused, unsigned long event,
struct net *net;
int ret;
- if (event != NETDEV_REGISTER || !(dev->priv_flags & IFF_EBRIDGE))
+ if (event != NETDEV_REGISTER || !netif_is_bridge_master(dev))
return NOTIFY_DONE;
ASSERT_RTNL();
@@ -956,23 +1113,6 @@ static int brnf_device_event(struct notifier_block *unused, unsigned long event,
return NOTIFY_OK;
}
-static void __net_exit brnf_exit_net(struct net *net)
-{
- struct brnf_net *brnet = net_generic(net, brnf_net_id);
-
- if (!brnet->enabled)
- return;
-
- nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
- brnet->enabled = false;
-}
-
-static struct pernet_operations brnf_net_ops __read_mostly = {
- .exit = brnf_exit_net,
- .id = &brnf_net_id,
- .size = sizeof(struct brnf_net),
-};
-
static struct notifier_block brnf_notifier __read_mostly = {
.notifier_call = brnf_device_event,
};
@@ -1000,9 +1140,24 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net,
return okfn(net, sk, skb);
ops = nf_hook_entries_get_hook_ops(e);
- for (i = 0; i < e->num_hook_entries &&
- ops[i]->priority <= NF_BR_PRI_BRNF; i++)
- ;
+ for (i = 0; i < e->num_hook_entries; i++) {
+ /* These hooks have already been called */
+ if (ops[i]->priority < NF_BR_PRI_BRNF)
+ continue;
+
+ /* These hooks have not been called yet, run them. */
+ if (ops[i]->priority > NF_BR_PRI_BRNF)
+ break;
+
+ /* take a closer look at NF_BR_PRI_BRNF. */
+ if (ops[i]->hook == br_nf_pre_routing) {
+ /* This hook diverted the skb to this function,
+ * hooks after this have not been run yet.
+ */
+ i++;
+ break;
+ }
+ }
nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev,
sk, net, okfn);
@@ -1016,8 +1171,8 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net,
#ifdef CONFIG_SYSCTL
static
-int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+int brnf_sysctl_call_tables(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -1031,49 +1186,124 @@ int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
static struct ctl_table brnf_table[] = {
{
.procname = "bridge-nf-call-arptables",
- .data = &brnf_call_arptables,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{
.procname = "bridge-nf-call-iptables",
- .data = &brnf_call_iptables,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{
.procname = "bridge-nf-call-ip6tables",
- .data = &brnf_call_ip6tables,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{
.procname = "bridge-nf-filter-vlan-tagged",
- .data = &brnf_filter_vlan_tagged,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{
.procname = "bridge-nf-filter-pppoe-tagged",
- .data = &brnf_filter_pppoe_tagged,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
{
.procname = "bridge-nf-pass-vlan-input-dev",
- .data = &brnf_pass_vlan_indev,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = brnf_sysctl_call_tables,
},
- { }
};
+
+static inline void br_netfilter_sysctl_default(struct brnf_net *brnf)
+{
+ brnf->call_iptables = 1;
+ brnf->call_ip6tables = 1;
+ brnf->call_arptables = 1;
+ brnf->filter_vlan_tagged = 0;
+ brnf->filter_pppoe_tagged = 0;
+ brnf->pass_vlan_indev = 0;
+}
+
+static int br_netfilter_sysctl_init_net(struct net *net)
+{
+ struct ctl_table *table = brnf_table;
+ struct brnf_net *brnet;
+
+ if (!net_eq(net, &init_net)) {
+ table = kmemdup(table, sizeof(brnf_table), GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+ }
+
+ brnet = net_generic(net, brnf_net_id);
+ table[0].data = &brnet->call_arptables;
+ table[1].data = &brnet->call_iptables;
+ table[2].data = &brnet->call_ip6tables;
+ table[3].data = &brnet->filter_vlan_tagged;
+ table[4].data = &brnet->filter_pppoe_tagged;
+ table[5].data = &brnet->pass_vlan_indev;
+
+ br_netfilter_sysctl_default(brnet);
+
+ brnet->ctl_hdr = register_net_sysctl_sz(net, "net/bridge", table,
+ ARRAY_SIZE(brnf_table));
+ if (!brnet->ctl_hdr) {
+ if (!net_eq(net, &init_net))
+ kfree(table);
+
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void br_netfilter_sysctl_exit_net(struct net *net,
+ struct brnf_net *brnet)
+{
+ const struct ctl_table *table = brnet->ctl_hdr->ctl_table_arg;
+
+ unregister_net_sysctl_table(brnet->ctl_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static int __net_init brnf_init_net(struct net *net)
+{
+ return br_netfilter_sysctl_init_net(net);
+}
+#endif
+
+static void __net_exit brnf_exit_net(struct net *net)
+{
+ struct brnf_net *brnet;
+
+ brnet = net_generic(net, brnf_net_id);
+ if (brnet->enabled) {
+ nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ brnet->enabled = false;
+ }
+
+#ifdef CONFIG_SYSCTL
+ br_netfilter_sysctl_exit_net(net, brnet);
#endif
+}
+
+static struct pernet_operations brnf_net_ops __read_mostly = {
+#ifdef CONFIG_SYSCTL
+ .init = brnf_init_net,
+#endif
+ .exit = brnf_exit_net,
+ .id = &brnf_net_id,
+ .size = sizeof(struct brnf_net),
+};
static int __init br_netfilter_init(void)
{
@@ -1089,16 +1319,6 @@ static int __init br_netfilter_init(void)
return ret;
}
-#ifdef CONFIG_SYSCTL
- brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table);
- if (brnf_sysctl_header == NULL) {
- printk(KERN_WARNING
- "br_netfilter: can't register to sysctl.\n");
- unregister_netdevice_notifier(&brnf_notifier);
- unregister_pernet_subsys(&brnf_net_ops);
- return -ENOMEM;
- }
-#endif
RCU_INIT_POINTER(nf_br_ops, &br_ops);
printk(KERN_NOTICE "Bridge firewalling registered\n");
return 0;
@@ -1109,9 +1329,6 @@ static void __exit br_netfilter_fini(void)
RCU_INIT_POINTER(nf_br_ops, NULL);
unregister_netdevice_notifier(&brnf_notifier);
unregister_pernet_subsys(&brnf_net_ops);
-#ifdef CONFIG_SYSCTL
- unregister_net_sysctl_table(brnf_sysctl_header);
-#endif
}
module_init(br_netfilter_init);
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 96c072e71ea2..e0421eaa3abc 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Handle firewalling
* Linux ethernet bridge
@@ -6,11 +7,6 @@
* Lennert Buytenhek <buytenh@gnu.org>
* Bart De Schuymer <bdschuym@pandora.be>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Lennert dedicates this file to Kerstin Wurdinger.
*/
@@ -44,62 +40,6 @@
#include <linux/sysctl.h>
#endif
-/* We only check the length. A bridge shouldn't do any hop-by-hop stuff
- * anyway
- */
-static int br_nf_check_hbh_len(struct sk_buff *skb)
-{
- unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
- u32 pkt_len;
- const unsigned char *nh = skb_network_header(skb);
- int off = raw - nh;
- int len = (raw[1] + 1) << 3;
-
- if ((raw + len) - skb->data > skb_headlen(skb))
- goto bad;
-
- off += 2;
- len -= 2;
-
- while (len > 0) {
- int optlen = nh[off + 1] + 2;
-
- switch (nh[off]) {
- case IPV6_TLV_PAD1:
- optlen = 1;
- break;
-
- case IPV6_TLV_PADN:
- break;
-
- case IPV6_TLV_JUMBO:
- if (nh[off + 1] != 4 || (off & 3) != 2)
- goto bad;
- pkt_len = ntohl(*(__be32 *)(nh + off + 2));
- if (pkt_len <= IPV6_MAXPLEN ||
- ipv6_hdr(skb)->payload_len)
- goto bad;
- if (pkt_len > skb->len - sizeof(struct ipv6hdr))
- goto bad;
- if (pskb_trim_rcsum(skb,
- pkt_len + sizeof(struct ipv6hdr)))
- goto bad;
- nh = skb_network_header(skb);
- break;
- default:
- if (optlen > len)
- goto bad;
- break;
- }
- off += optlen;
- len -= optlen;
- }
- if (len == 0)
- return 0;
-bad:
- return -1;
-}
-
int br_validate_ipv6(struct net *net, struct sk_buff *skb)
{
const struct ipv6hdr *hdr;
@@ -119,21 +59,19 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb)
goto inhdr_error;
pkt_len = ntohs(hdr->payload_len);
+ if (hdr->nexthdr == NEXTHDR_HOP && nf_ip6_check_hbh_len(skb, &pkt_len))
+ goto drop;
- if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
- if (pkt_len + ip6h_len > skb->len) {
- __IP6_INC_STATS(net, idev,
- IPSTATS_MIB_INTRUNCATEDPKTS);
- goto drop;
- }
- if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) {
- __IP6_INC_STATS(net, idev,
- IPSTATS_MIB_INDISCARDS);
- goto drop;
- }
+ if (pkt_len + ip6h_len > skb->len) {
+ __IP6_INC_STATS(net, idev,
+ IPSTATS_MIB_INTRUNCATEDPKTS);
+ goto drop;
}
- if (hdr->nexthdr == NEXTHDR_HOP && br_nf_check_hbh_len(skb))
+ if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) {
+ __IP6_INC_STATS(net, idev,
+ IPSTATS_MIB_INDISCARDS);
goto drop;
+ }
memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
/* No IP options in IPv6 header; however it should be
@@ -164,9 +102,15 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
{
struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb);
struct rtable *rt;
- struct net_device *dev = skb->dev;
+ struct net_device *dev = skb->dev, *br_indev;
const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
+ br_indev = nf_bridge_get_physindev(skb, net);
+ if (!br_indev) {
+ kfree_skb(skb);
+ return 0;
+ }
+
nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;
if (nf_bridge->pkt_otherhost) {
@@ -184,7 +128,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
}
if (skb_dst(skb)->dev == dev) {
- skb->dev = nf_bridge->physindev;
+ skb->dev = br_indev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
br_nf_hook_thresh(NF_BR_PRE_ROUTING,
@@ -195,15 +139,16 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc
ether_addr_copy(eth_hdr(skb)->h_dest, dev->dev_addr);
skb->pkt_type = PACKET_HOST;
} else {
- rt = bridge_parent_rtable(nf_bridge->physindev);
+ rt = bridge_parent_rtable(br_indev);
if (!rt) {
kfree_skb(skb);
return 0;
}
+ skb_dst_drop(skb);
skb_dst_set_noref(skb, &rt->dst);
}
- skb->dev = nf_bridge->physindev;
+ skb->dev = br_indev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
br_nf_hook_thresh(NF_BR_PRE_ROUTING, net, sk, skb,
@@ -222,18 +167,20 @@ unsigned int br_nf_pre_routing_ipv6(void *priv,
struct nf_bridge_info *nf_bridge;
if (br_validate_ipv6(state->net, skb))
- return NF_DROP;
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_IP_INHDR, 0);
- nf_bridge_put(skb->nf_bridge);
- if (!nf_bridge_alloc(skb))
- return NF_DROP;
- if (!setup_pre_routing(skb))
- return NF_DROP;
+ nf_bridge = nf_bridge_alloc(skb);
+ if (!nf_bridge)
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_NOMEM, 0);
+ if (!setup_pre_routing(skb, state->net))
+ return NF_DROP_REASON(skb, SKB_DROP_REASON_DEV_READY, 0);
nf_bridge = nf_bridge_info_get(skb);
nf_bridge->ipv6_daddr = ipv6_hdr(skb)->daddr;
skb->protocol = htons(ETH_P_IPV6);
+ skb->transport_header = skb->network_header + sizeof(struct ipv6hdr);
+
NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->net, state->sk, skb,
skb->dev, NULL,
br_nf_pre_routing_finish_ipv6);
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index ec2b58a09f76..0264730938f4 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Bridge netlink control interface
*
* Authors:
* Stephen Hemminger <shemminger@osdl.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -20,7 +16,9 @@
#include "br_private.h"
#include "br_private_stp.h"
+#include "br_private_cfm.h"
#include "br_private_tunnel.h"
+#include "br_private_mcast_eht.h"
static int __get_num_vlan_infos(struct net_bridge_vlan_group *vg,
u32 filter_mask)
@@ -97,15 +95,18 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev,
{
struct net_bridge_vlan_group *vg = NULL;
struct net_bridge_port *p = NULL;
- struct net_bridge *br;
- int num_vlan_infos;
+ struct net_bridge *br = NULL;
+ u32 num_cfm_peer_mep_infos;
+ u32 num_cfm_mep_infos;
size_t vinfo_sz = 0;
+ int num_vlan_infos;
rcu_read_lock();
- if (br_port_exists(dev)) {
- p = br_port_get_rcu(dev);
- vg = nbp_vlan_group_rcu(p);
- } else if (dev->priv_flags & IFF_EBRIDGE) {
+ if (netif_is_bridge_port(dev)) {
+ p = br_port_get_check_rcu(dev);
+ if (p)
+ vg = nbp_vlan_group_rcu(p);
+ } else if (netif_is_bridge_master(dev)) {
br = netdev_priv(dev);
vg = br_vlan_group_rcu(br);
}
@@ -118,6 +119,52 @@ static size_t br_get_link_af_size_filtered(const struct net_device *dev,
/* Each VLAN is returned in bridge_vlan_info along with flags */
vinfo_sz += num_vlan_infos * nla_total_size(sizeof(struct bridge_vlan_info));
+ if (p && vg && (filter_mask & RTEXT_FILTER_MST))
+ vinfo_sz += br_mst_info_size(vg);
+
+ if (!(filter_mask & RTEXT_FILTER_CFM_STATUS))
+ return vinfo_sz;
+
+ if (!br)
+ return vinfo_sz;
+
+ /* CFM status info must be added */
+ br_cfm_mep_count(br, &num_cfm_mep_infos);
+ br_cfm_peer_mep_count(br, &num_cfm_peer_mep_infos);
+
+ vinfo_sz += nla_total_size(0); /* IFLA_BRIDGE_CFM */
+ /* For each status struct the MEP instance (u32) is added */
+ /* MEP instance (u32) + br_cfm_mep_status */
+ vinfo_sz += num_cfm_mep_infos *
+ /* IFLA_BRIDGE_CFM_MEP_STATUS_INSTANCE */
+ (nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_MEP_STATUS_OPCODE_UNEXP_SEEN */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_MEP_STATUS_VERSION_UNEXP_SEEN */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_MEP_STATUS_RX_LEVEL_LOW_SEEN */
+ + nla_total_size(sizeof(u32)));
+ /* MEP instance (u32) + br_cfm_cc_peer_status */
+ vinfo_sz += num_cfm_peer_mep_infos *
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_INSTANCE */
+ (nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PEER_MEPID */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_CCM_DEFECT */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_RDI */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_PORT_TLV_VALUE */
+ + nla_total_size(sizeof(u8))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_IF_TLV_VALUE */
+ + nla_total_size(sizeof(u8))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEEN */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_TLV_SEEN */
+ + nla_total_size(sizeof(u32))
+ /* IFLA_BRIDGE_CFM_CC_PEER_STATUS_SEQ_UNEXP_SEEN */
+ + nla_total_size(sizeof(u32)));
+
return vinfo_sz;
}
@@ -140,6 +187,9 @@ static inline size_t br_port_info_size(void)
+ nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */
+ nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */
+ nla_total_size(1) /* IFLA_BRPORT_ISOLATED */
+ + nla_total_size(1) /* IFLA_BRPORT_LOCKED */
+ + nla_total_size(1) /* IFLA_BRPORT_MAB */
+ + nla_total_size(1) /* IFLA_BRPORT_NEIGH_VLAN_SUPPRESS */
+ nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */
+ nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */
+ nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */
@@ -153,8 +203,15 @@ static inline size_t br_port_info_size(void)
+ nla_total_size_64bit(sizeof(u64)) /* IFLA_BRPORT_HOLD_TIMER */
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */
+ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_N_GROUPS */
+ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_MAX_GROUPS */
#endif
+ nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */
+ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */
+ + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */
+ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT */
+ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_CNT */
+ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_BACKUP_NHID */
+ 0;
}
@@ -217,7 +274,15 @@ static int br_port_fill_attrs(struct sk_buff *skb,
nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) ||
nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS,
!!(p->flags & BR_NEIGH_SUPPRESS)) ||
- nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)))
+ nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags &
+ BR_MRP_LOST_CONT)) ||
+ nla_put_u8(skb, IFLA_BRPORT_MRP_IN_OPEN,
+ !!(p->flags & BR_MRP_LOST_IN_CONT)) ||
+ nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED)) ||
+ nla_put_u8(skb, IFLA_BRPORT_LOCKED, !!(p->flags & BR_PORT_LOCKED)) ||
+ nla_put_u8(skb, IFLA_BRPORT_MAB, !!(p->flags & BR_PORT_MAB)) ||
+ nla_put_u8(skb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS,
+ !!(p->flags & BR_NEIGH_VLAN_SUPPRESS)))
return -EMSGSIZE;
timerval = br_timer_value(&p->message_age_timer);
@@ -235,7 +300,15 @@ static int br_port_fill_attrs(struct sk_buff *skb,
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
if (nla_put_u8(skb, IFLA_BRPORT_MULTICAST_ROUTER,
- p->multicast_router))
+ p->multicast_ctx.multicast_router) ||
+ nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT,
+ p->multicast_eht_hosts_limit) ||
+ nla_put_u32(skb, IFLA_BRPORT_MCAST_EHT_HOSTS_CNT,
+ p->multicast_eht_hosts_cnt) ||
+ nla_put_u32(skb, IFLA_BRPORT_MCAST_N_GROUPS,
+ br_multicast_ngroups_get(&p->multicast_ctx)) ||
+ nla_put_u32(skb, IFLA_BRPORT_MCAST_MAX_GROUPS,
+ br_multicast_ngroups_get_max(&p->multicast_ctx)))
return -EMSGSIZE;
#endif
@@ -247,6 +320,10 @@ static int br_port_fill_attrs(struct sk_buff *skb,
backup_p->dev->ifindex);
rcu_read_unlock();
+ if (p->backup_nhid &&
+ nla_put_u32(skb, IFLA_BRPORT_BACKUP_NHID, p->backup_nhid))
+ return -EMSGSIZE;
+
return 0;
}
@@ -375,9 +452,12 @@ nla_put_failure:
static int br_fill_ifinfo(struct sk_buff *skb,
const struct net_bridge_port *port,
u32 pid, u32 seq, int event, unsigned int flags,
- u32 filter_mask, const struct net_device *dev)
+ u32 filter_mask, const struct net_device *dev,
+ bool getlink)
{
- u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN;
+ u8 operstate = netif_running(dev) ? READ_ONCE(dev->operstate) :
+ IF_OPER_DOWN;
+ struct nlattr *af = NULL;
struct net_bridge *br;
struct ifinfomsg *hdr;
struct nlmsghdr *nlh;
@@ -387,7 +467,7 @@ static int br_fill_ifinfo(struct sk_buff *skb,
else
br = netdev_priv(dev);
- br_debug(br, "br_fill_info event %d port %s master %s\n",
+ br_debug(br, "br_fill_ifinfo event %d port %s master %s\n",
event, dev->name, br->dev->name);
nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags);
@@ -399,7 +479,7 @@ static int br_fill_ifinfo(struct sk_buff *skb,
hdr->__ifi_pad = 0;
hdr->ifi_type = dev->type;
hdr->ifi_index = dev->ifindex;
- hdr->ifi_flags = dev_get_flags(dev);
+ hdr->ifi_flags = netif_get_flags(dev);
hdr->ifi_change = 0;
if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
@@ -413,19 +493,29 @@ static int br_fill_ifinfo(struct sk_buff *skb,
goto nla_put_failure;
if (event == RTM_NEWLINK && port) {
- struct nlattr *nest
- = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED);
+ struct nlattr *nest;
+ nest = nla_nest_start(skb, IFLA_PROTINFO);
if (nest == NULL || br_port_fill_attrs(skb, port) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
}
+ if (filter_mask & (RTEXT_FILTER_BRVLAN |
+ RTEXT_FILTER_BRVLAN_COMPRESSED |
+ RTEXT_FILTER_MRP |
+ RTEXT_FILTER_CFM_CONFIG |
+ RTEXT_FILTER_CFM_STATUS |
+ RTEXT_FILTER_MST)) {
+ af = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
+ if (!af)
+ goto nla_put_failure;
+ }
+
/* Check if the VID information is requested */
if ((filter_mask & RTEXT_FILTER_BRVLAN) ||
(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)) {
struct net_bridge_vlan_group *vg;
- struct nlattr *af;
int err;
/* RCU needed because of the VLAN locking rules (rcu || rtnl) */
@@ -439,11 +529,6 @@ static int br_fill_ifinfo(struct sk_buff *skb,
rcu_read_unlock();
goto done;
}
- af = nla_nest_start(skb, IFLA_AF_SPEC);
- if (!af) {
- rcu_read_unlock();
- goto nla_put_failure;
- }
if (filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED)
err = br_fill_ifvlaninfo_compressed(skb, vg);
else
@@ -454,10 +539,80 @@ static int br_fill_ifinfo(struct sk_buff *skb,
rcu_read_unlock();
if (err)
goto nla_put_failure;
- nla_nest_end(skb, af);
+ }
+
+ if (filter_mask & RTEXT_FILTER_MRP) {
+ int err;
+
+ if (!br_mrp_enabled(br) || port)
+ goto done;
+
+ rcu_read_lock();
+ err = br_mrp_fill_info(skb, br);
+ rcu_read_unlock();
+
+ if (err)
+ goto nla_put_failure;
+ }
+
+ if (filter_mask & (RTEXT_FILTER_CFM_CONFIG | RTEXT_FILTER_CFM_STATUS)) {
+ struct nlattr *cfm_nest = NULL;
+ int err;
+
+ if (!br_cfm_created(br) || port)
+ goto done;
+
+ cfm_nest = nla_nest_start(skb, IFLA_BRIDGE_CFM);
+ if (!cfm_nest)
+ goto nla_put_failure;
+
+ if (filter_mask & RTEXT_FILTER_CFM_CONFIG) {
+ rcu_read_lock();
+ err = br_cfm_config_fill_info(skb, br);
+ rcu_read_unlock();
+ if (err)
+ goto nla_put_failure;
+ }
+
+ if (filter_mask & RTEXT_FILTER_CFM_STATUS) {
+ rcu_read_lock();
+ err = br_cfm_status_fill_info(skb, br, getlink);
+ rcu_read_unlock();
+ if (err)
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, cfm_nest);
+ }
+
+ if ((filter_mask & RTEXT_FILTER_MST) &&
+ br_opt_get(br, BROPT_MST_ENABLED) && port) {
+ const struct net_bridge_vlan_group *vg = nbp_vlan_group(port);
+ struct nlattr *mst_nest;
+ int err;
+
+ if (!vg || !vg->num_vlans)
+ goto done;
+
+ mst_nest = nla_nest_start(skb, IFLA_BRIDGE_MST);
+ if (!mst_nest)
+ goto nla_put_failure;
+
+ err = br_mst_fill_info(skb, vg);
+ if (err)
+ goto nla_put_failure;
+
+ nla_nest_end(skb, mst_nest);
}
done:
+ if (af) {
+ if (nlmsg_get_pos(skb) - (void *)af > nla_attr_size(0))
+ nla_nest_end(skb, af);
+ else
+ nla_nest_cancel(skb, af);
+ }
+
nlmsg_end(skb, nlh);
return 0;
@@ -466,11 +621,9 @@ nla_put_failure:
return -EMSGSIZE;
}
-/* Notify listeners of a change in bridge or port information */
-void br_ifinfo_notify(int event, const struct net_bridge *br,
- const struct net_bridge_port *port)
+void br_info_notify(int event, const struct net_bridge *br,
+ const struct net_bridge_port *port, u32 filter)
{
- u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED;
struct net_device *dev;
struct sk_buff *skb;
int err = -ENOBUFS;
@@ -495,7 +648,7 @@ void br_ifinfo_notify(int event, const struct net_bridge *br,
if (skb == NULL)
goto errout;
- err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, dev);
+ err = br_fill_ifinfo(skb, port, 0, 0, event, 0, filter, dev, false);
if (err < 0) {
/* -EMSGSIZE implies BUG in br_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -508,6 +661,15 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_LINK, err);
}
+/* Notify listeners of a change in bridge or port information */
+void br_ifinfo_notify(int event, const struct net_bridge *br,
+ const struct net_bridge_port *port)
+{
+ u32 filter = RTEXT_FILTER_BRVLAN_COMPRESSED;
+
+ br_info_notify(event, br, port, filter);
+}
+
/*
* Dump information about all ports, in response to GETLINK
*/
@@ -517,15 +679,19 @@ int br_getlink(struct sk_buff *skb, u32 pid, u32 seq,
struct net_bridge_port *port = br_port_get_rtnl(dev);
if (!port && !(filter_mask & RTEXT_FILTER_BRVLAN) &&
- !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED))
+ !(filter_mask & RTEXT_FILTER_BRVLAN_COMPRESSED) &&
+ !(filter_mask & RTEXT_FILTER_MRP) &&
+ !(filter_mask & RTEXT_FILTER_CFM_CONFIG) &&
+ !(filter_mask & RTEXT_FILTER_CFM_STATUS))
return 0;
return br_fill_ifinfo(skb, port, pid, seq, RTM_NEWLINK, nlflags,
- filter_mask, dev);
+ filter_mask, dev, true);
}
static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
- int cmd, struct bridge_vlan_info *vinfo, bool *changed)
+ int cmd, struct bridge_vlan_info *vinfo, bool *changed,
+ struct netlink_ext_ack *extack)
{
bool curr_change;
int err = 0;
@@ -537,11 +703,11 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
* per-VLAN entry as well
*/
err = nbp_vlan_add(p, vinfo->vid, vinfo->flags,
- &curr_change);
+ &curr_change, extack);
} else {
vinfo->flags |= BRIDGE_VLAN_INFO_BRENTRY;
err = br_vlan_add(br, vinfo->vid, vinfo->flags,
- &curr_change);
+ &curr_change, extack);
}
if (curr_change)
*changed = true;
@@ -564,56 +730,81 @@ static int br_vlan_info(struct net_bridge *br, struct net_bridge_port *p,
return err;
}
-static int br_process_vlan_info(struct net_bridge *br,
- struct net_bridge_port *p, int cmd,
- struct bridge_vlan_info *vinfo_curr,
- struct bridge_vlan_info **vinfo_last,
- bool *changed)
+int br_process_vlan_info(struct net_bridge *br,
+ struct net_bridge_port *p, int cmd,
+ struct bridge_vlan_info *vinfo_curr,
+ struct bridge_vlan_info **vinfo_last,
+ bool *changed,
+ struct netlink_ext_ack *extack)
{
- if (!vinfo_curr->vid || vinfo_curr->vid >= VLAN_VID_MASK)
+ int err, rtm_cmd;
+
+ if (!br_vlan_valid_id(vinfo_curr->vid, extack))
return -EINVAL;
+ /* needed for vlan-only NEWVLAN/DELVLAN notifications */
+ rtm_cmd = br_afspec_cmd_to_rtm(cmd);
+
if (vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
- /* check if we are already processing a range */
- if (*vinfo_last)
+ if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack))
return -EINVAL;
*vinfo_last = vinfo_curr;
- /* don't allow range of pvids */
- if ((*vinfo_last)->flags & BRIDGE_VLAN_INFO_PVID)
- return -EINVAL;
return 0;
}
if (*vinfo_last) {
struct bridge_vlan_info tmp_vinfo;
- int v, err;
-
- if (!(vinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END))
- return -EINVAL;
+ int v, v_change_start = 0;
- if (vinfo_curr->vid <= (*vinfo_last)->vid)
+ if (!br_vlan_valid_range(vinfo_curr, *vinfo_last, extack))
return -EINVAL;
memcpy(&tmp_vinfo, *vinfo_last,
sizeof(struct bridge_vlan_info));
for (v = (*vinfo_last)->vid; v <= vinfo_curr->vid; v++) {
+ bool curr_change = false;
+
tmp_vinfo.vid = v;
- err = br_vlan_info(br, p, cmd, &tmp_vinfo, changed);
+ err = br_vlan_info(br, p, cmd, &tmp_vinfo, &curr_change,
+ extack);
if (err)
break;
+ if (curr_change) {
+ *changed = curr_change;
+ if (!v_change_start)
+ v_change_start = v;
+ } else {
+ /* nothing to notify yet */
+ if (!v_change_start)
+ continue;
+ br_vlan_notify(br, p, v_change_start,
+ v - 1, rtm_cmd);
+ v_change_start = 0;
+ }
+ cond_resched();
}
+ /* v_change_start is set only if the last/whole range changed */
+ if (v_change_start)
+ br_vlan_notify(br, p, v_change_start,
+ v - 1, rtm_cmd);
+
*vinfo_last = NULL;
return err;
}
- return br_vlan_info(br, p, cmd, vinfo_curr, changed);
+ err = br_vlan_info(br, p, cmd, vinfo_curr, changed, extack);
+ if (*changed)
+ br_vlan_notify(br, p, vinfo_curr->vid, 0, rtm_cmd);
+
+ return err;
}
static int br_afspec(struct net_bridge *br,
struct net_bridge_port *p,
struct nlattr *af_spec,
- int cmd, bool *changed)
+ int cmd, bool *changed,
+ struct netlink_ext_ack *extack)
{
struct bridge_vlan_info *vinfo_curr = NULL;
struct bridge_vlan_info *vinfo_last = NULL;
@@ -643,7 +834,35 @@ static int br_afspec(struct net_bridge *br,
return -EINVAL;
vinfo_curr = nla_data(attr);
err = br_process_vlan_info(br, p, cmd, vinfo_curr,
- &vinfo_last, changed);
+ &vinfo_last, changed,
+ extack);
+ if (err)
+ return err;
+ break;
+ case IFLA_BRIDGE_MRP:
+ err = br_mrp_parse(br, p, attr, cmd, extack);
+ if (err)
+ return err;
+ break;
+ case IFLA_BRIDGE_CFM:
+ err = br_cfm_parse(br, p, attr, cmd, extack);
+ if (err)
+ return err;
+ break;
+ case IFLA_BRIDGE_MST:
+ if (!p) {
+ NL_SET_ERR_MSG(extack,
+ "MST states can only be set on bridge ports");
+ return -EINVAL;
+ }
+
+ if (cmd != RTM_SETLINK) {
+ NL_SET_ERR_MSG(extack,
+ "MST states can only be set through RTM_SETLINK");
+ return -EINVAL;
+ }
+
+ err = br_mst_process(p, attr, extack);
if (err)
return err;
break;
@@ -654,6 +873,8 @@ static int br_afspec(struct net_bridge *br,
}
static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
+ [IFLA_BRPORT_UNSPEC] = { .strict_start_type =
+ IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT + 1 },
[IFLA_BRPORT_STATE] = { .type = NLA_U8 },
[IFLA_BRPORT_COST] = { .type = NLA_U32 },
[IFLA_BRPORT_PRIORITY] = { .type = NLA_U16 },
@@ -673,7 +894,14 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
[IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 },
[IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 },
[IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 },
+ [IFLA_BRPORT_LOCKED] = { .type = NLA_U8 },
+ [IFLA_BRPORT_MAB] = { .type = NLA_U8 },
[IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 },
+ [IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT] = { .type = NLA_U32 },
+ [IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT },
+ [IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 },
+ [IFLA_BRPORT_NEIGH_VLAN_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1),
+ [IFLA_BRPORT_BACKUP_NHID] = { .type = NLA_U32 },
};
/* Change the state of the port and notify spanning tree */
@@ -699,87 +927,78 @@ static int br_set_port_state(struct net_bridge_port *p, u8 state)
}
/* Set/clear or port flags based on attribute */
-static int br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[],
- int attrtype, unsigned long mask)
+static void br_set_port_flag(struct net_bridge_port *p, struct nlattr *tb[],
+ int attrtype, unsigned long mask)
{
- unsigned long flags;
- int err;
-
if (!tb[attrtype])
- return 0;
+ return;
if (nla_get_u8(tb[attrtype]))
- flags = p->flags | mask;
+ p->flags |= mask;
else
- flags = p->flags & ~mask;
-
- err = br_switchdev_set_port_flag(p, flags, mask);
- if (err)
- return err;
-
- p->flags = flags;
- return 0;
+ p->flags &= ~mask;
}
/* Process bridge protocol info on port */
-static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
+static int br_setport(struct net_bridge_port *p, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
{
- unsigned long old_flags = p->flags;
- bool br_vlan_tunnel_old = false;
+ unsigned long old_flags, changed_mask;
+ bool br_vlan_tunnel_old;
int err;
- err = br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE);
- if (err)
- return err;
-
- err = br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD);
- if (err)
- return err;
-
- err = br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE);
- if (err)
- return err;
-
- err = br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK);
- if (err)
- return err;
-
- err = br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
- if (err)
- return err;
-
- err = br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);
- if (err)
- return err;
-
- err = br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD);
- if (err)
- return err;
-
- err = br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, BR_MULTICAST_TO_UNICAST);
- if (err)
- return err;
-
- err = br_set_port_flag(p, tb, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD);
- if (err)
- return err;
-
- err = br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP);
- if (err)
- return err;
+ old_flags = p->flags;
+ br_vlan_tunnel_old = (old_flags & BR_VLAN_TUNNEL) ? true : false;
+
+ br_set_port_flag(p, tb, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE);
+ br_set_port_flag(p, tb, IFLA_BRPORT_GUARD, BR_BPDU_GUARD);
+ br_set_port_flag(p, tb, IFLA_BRPORT_FAST_LEAVE,
+ BR_MULTICAST_FAST_LEAVE);
+ br_set_port_flag(p, tb, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK);
+ br_set_port_flag(p, tb, IFLA_BRPORT_LEARNING, BR_LEARNING);
+ br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD);
+ br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD);
+ br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST,
+ BR_MULTICAST_TO_UNICAST);
+ br_set_port_flag(p, tb, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD);
+ br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP);
+ br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI);
+ br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL);
+ br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, BR_NEIGH_SUPPRESS);
+ br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED);
+ br_set_port_flag(p, tb, IFLA_BRPORT_LOCKED, BR_PORT_LOCKED);
+ br_set_port_flag(p, tb, IFLA_BRPORT_MAB, BR_PORT_MAB);
+ br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_VLAN_SUPPRESS,
+ BR_NEIGH_VLAN_SUPPRESS);
+
+ if ((p->flags & BR_PORT_MAB) &&
+ (!(p->flags & BR_PORT_LOCKED) || !(p->flags & BR_LEARNING))) {
+ NL_SET_ERR_MSG(extack, "Bridge port must be locked and have learning enabled when MAB is enabled");
+ p->flags = old_flags;
+ return -EINVAL;
+ } else if (!(p->flags & BR_PORT_MAB) && (old_flags & BR_PORT_MAB)) {
+ struct net_bridge_fdb_flush_desc desc = {
+ .flags = BIT(BR_FDB_LOCKED),
+ .flags_mask = BIT(BR_FDB_LOCKED),
+ .port_ifindex = p->dev->ifindex,
+ };
+
+ br_fdb_flush(p->br, &desc);
+ }
- err = br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI);
- if (err)
- return err;
+ changed_mask = old_flags ^ p->flags;
- br_vlan_tunnel_old = (p->flags & BR_VLAN_TUNNEL) ? true : false;
- err = br_set_port_flag(p, tb, IFLA_BRPORT_VLAN_TUNNEL, BR_VLAN_TUNNEL);
- if (err)
+ err = br_switchdev_set_port_flag(p, p->flags, changed_mask, extack);
+ if (err) {
+ p->flags = old_flags;
return err;
+ }
if (br_vlan_tunnel_old && !(p->flags & BR_VLAN_TUNNEL))
nbp_vlan_tunnel_info_flush(p);
+ br_port_flags_change(p, changed_mask);
+
if (tb[IFLA_BRPORT_COST]) {
err = br_stp_set_path_cost(p, nla_get_u32(tb[IFLA_BRPORT_COST]));
if (err)
@@ -805,10 +1024,27 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
if (tb[IFLA_BRPORT_MULTICAST_ROUTER]) {
u8 mcast_router = nla_get_u8(tb[IFLA_BRPORT_MULTICAST_ROUTER]);
- err = br_multicast_set_port_router(p, mcast_router);
+ err = br_multicast_set_port_router(&p->multicast_ctx,
+ mcast_router);
+ if (err)
+ return err;
+ }
+
+ if (tb[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT]) {
+ u32 hlimit;
+
+ hlimit = nla_get_u32(tb[IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT]);
+ err = br_multicast_eht_set_hosts_limit(p, hlimit);
if (err)
return err;
}
+
+ if (tb[IFLA_BRPORT_MCAST_MAX_GROUPS]) {
+ u32 max_groups;
+
+ max_groups = nla_get_u32(tb[IFLA_BRPORT_MCAST_MAX_GROUPS]);
+ br_multicast_ngroups_set_max(&p->multicast_ctx, max_groups);
+ }
#endif
if (tb[IFLA_BRPORT_GROUP_FWD_MASK]) {
@@ -819,15 +1055,6 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
p->group_fwd_mask = fwd_mask;
}
- err = br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS,
- BR_NEIGH_SUPPRESS);
- if (err)
- return err;
-
- err = br_set_port_flag(p, tb, IFLA_BRPORT_ISOLATED, BR_ISOLATED);
- if (err)
- return err;
-
if (tb[IFLA_BRPORT_BACKUP_PORT]) {
struct net_device *backup_dev = NULL;
u32 backup_ifindex;
@@ -845,12 +1072,18 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
return err;
}
- br_port_flags_change(p, old_flags ^ p->flags);
+ if (tb[IFLA_BRPORT_BACKUP_NHID]) {
+ u32 backup_nhid = nla_get_u32(tb[IFLA_BRPORT_BACKUP_NHID]);
+
+ WRITE_ONCE(p->backup_nhid, backup_nhid);
+ }
+
return 0;
}
/* Change state and parameters on port. */
-int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
+int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags,
+ struct netlink_ext_ack *extack)
{
struct net_bridge *br = (struct net_bridge *)netdev_priv(dev);
struct nlattr *tb[IFLA_BRPORT_MAX + 1];
@@ -874,13 +1107,15 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
if (p && protinfo) {
if (protinfo->nla_type & NLA_F_NESTED) {
- err = nla_parse_nested(tb, IFLA_BRPORT_MAX, protinfo,
- br_port_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, IFLA_BRPORT_MAX,
+ protinfo,
+ br_port_policy,
+ NULL);
if (err)
return err;
spin_lock_bh(&p->br->lock);
- err = br_setport(p, tb);
+ err = br_setport(p, tb, extack);
spin_unlock_bh(&p->br->lock);
} else {
/* Binary compatibility with old RSTP */
@@ -897,7 +1132,7 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
}
if (afspec)
- err = br_afspec(br, p, afspec, RTM_SETLINK, &changed);
+ err = br_afspec(br, p, afspec, RTM_SETLINK, &changed, extack);
if (changed)
br_ifinfo_notify(RTM_NEWLINK, br, p);
@@ -920,10 +1155,10 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
p = br_port_get_rtnl(dev);
/* We want to accept dev as bridge itself as well */
- if (!p && !(dev->priv_flags & IFF_EBRIDGE))
+ if (!p && !netif_is_bridge_master(dev))
return -EINVAL;
- err = br_afspec(br, p, afspec, RTM_DELLINK, &changed);
+ err = br_afspec(br, p, afspec, RTM_DELLINK, &changed, NULL);
if (changed)
/* Send RTM_NEWLINK because userspace
* expects RTM_NEWLINK for vlan dels
@@ -947,15 +1182,9 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[],
return 0;
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
- if (data[IFLA_BR_VLAN_PROTOCOL]) {
- switch (nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL])) {
- case htons(ETH_P_8021Q):
- case htons(ETH_P_8021AD):
- break;
- default:
- return -EPROTONOSUPPORT;
- }
- }
+ if (data[IFLA_BR_VLAN_PROTOCOL] &&
+ !eth_type_vlan(nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL])))
+ return -EPROTONOSUPPORT;
if (data[IFLA_BR_VLAN_DEFAULT_PVID]) {
__u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]);
@@ -981,7 +1210,7 @@ static int br_port_slave_changelink(struct net_device *brdev,
return 0;
spin_lock_bh(&br->lock);
- ret = br_setport(br_port_get_rtnl(dev), data);
+ ret = br_setport(br_port_get_rtnl(dev), data, extack);
spin_unlock_bh(&br->lock);
return ret;
@@ -1001,6 +1230,8 @@ static size_t br_port_get_slave_size(const struct net_device *brdev,
}
static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
+ [IFLA_BR_UNSPEC] = { .strict_start_type =
+ IFLA_BR_FDB_N_LEARNED },
[IFLA_BR_FORWARD_DELAY] = { .type = NLA_U32 },
[IFLA_BR_HELLO_TIME] = { .type = NLA_U32 },
[IFLA_BR_MAX_AGE] = { .type = NLA_U32 },
@@ -1034,6 +1265,11 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = {
[IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 },
[IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 },
[IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 },
+ [IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 },
+ [IFLA_BR_MULTI_BOOLOPT] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct br_boolopt_multi)),
+ [IFLA_BR_FDB_N_LEARNED] = { .type = NLA_REJECT },
+ [IFLA_BR_FDB_MAX_LEARNED] = { .type = NLA_U32 },
};
static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
@@ -1073,7 +1309,9 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
if (data[IFLA_BR_STP_STATE]) {
u32 stp_enabled = nla_get_u32(data[IFLA_BR_STP_STATE]);
- br_stp_set_enabled(br, stp_enabled);
+ err = br_stp_set_enabled(br, stp_enabled, extack);
+ if (err)
+ return err;
}
if (data[IFLA_BR_PRIORITY]) {
@@ -1085,7 +1323,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
if (data[IFLA_BR_VLAN_FILTERING]) {
u8 vlan_filter = nla_get_u8(data[IFLA_BR_VLAN_FILTERING]);
- err = __br_vlan_filter_toggle(br, vlan_filter);
+ err = br_vlan_filter_toggle(br, vlan_filter, extack);
if (err)
return err;
}
@@ -1094,7 +1332,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
if (data[IFLA_BR_VLAN_PROTOCOL]) {
__be16 vlan_proto = nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]);
- err = __br_vlan_set_proto(br, vlan_proto);
+ err = __br_vlan_set_proto(br, vlan_proto, extack);
if (err)
return err;
}
@@ -1102,7 +1340,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
if (data[IFLA_BR_VLAN_DEFAULT_PVID]) {
__u16 defpvid = nla_get_u16(data[IFLA_BR_VLAN_DEFAULT_PVID]);
- err = __br_vlan_set_default_pvid(br, defpvid);
+ err = __br_vlan_set_default_pvid(br, defpvid, extack);
if (err)
return err;
}
@@ -1114,6 +1352,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
if (err)
return err;
}
+
+ if (data[IFLA_BR_VLAN_STATS_PER_PORT]) {
+ __u8 per_port = nla_get_u8(data[IFLA_BR_VLAN_STATS_PER_PORT]);
+
+ err = br_vlan_set_stats_per_port(br, per_port);
+ if (err)
+ return err;
+ }
#endif
if (data[IFLA_BR_GROUP_FWD_MASK]) {
@@ -1139,18 +1385,24 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
spin_lock_bh(&br->lock);
memcpy(br->group_addr, new_addr, sizeof(br->group_addr));
spin_unlock_bh(&br->lock);
- br->group_addr_set = true;
+ br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true);
br_recalculate_fwd_mask(br);
}
- if (data[IFLA_BR_FDB_FLUSH])
- br_fdb_flush(br);
+ if (data[IFLA_BR_FDB_FLUSH]) {
+ struct net_bridge_fdb_flush_desc desc = {
+ .flags_mask = BIT(BR_FDB_STATIC)
+ };
+
+ br_fdb_flush(br, &desc);
+ }
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
if (data[IFLA_BR_MCAST_ROUTER]) {
u8 multicast_router = nla_get_u8(data[IFLA_BR_MCAST_ROUTER]);
- err = br_multicast_set_router(br, multicast_router);
+ err = br_multicast_set_router(&br->multicast_ctx,
+ multicast_router);
if (err)
return err;
}
@@ -1158,7 +1410,7 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
if (data[IFLA_BR_MCAST_SNOOPING]) {
u8 mcast_snooping = nla_get_u8(data[IFLA_BR_MCAST_SNOOPING]);
- err = br_multicast_toggle(br, mcast_snooping);
+ err = br_multicast_toggle(br, mcast_snooping, extack);
if (err)
return err;
}
@@ -1167,91 +1419,86 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
u8 val;
val = nla_get_u8(data[IFLA_BR_MCAST_QUERY_USE_IFADDR]);
- br->multicast_query_use_ifaddr = !!val;
+ br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val);
}
if (data[IFLA_BR_MCAST_QUERIER]) {
u8 mcast_querier = nla_get_u8(data[IFLA_BR_MCAST_QUERIER]);
- err = br_multicast_set_querier(br, mcast_querier);
+ err = br_multicast_set_querier(&br->multicast_ctx,
+ mcast_querier);
if (err)
return err;
}
- if (data[IFLA_BR_MCAST_HASH_ELASTICITY]) {
- u32 val = nla_get_u32(data[IFLA_BR_MCAST_HASH_ELASTICITY]);
-
- br->hash_elasticity = val;
- }
-
- if (data[IFLA_BR_MCAST_HASH_MAX]) {
- u32 hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]);
+ if (data[IFLA_BR_MCAST_HASH_ELASTICITY])
+ br_warn(br, "the hash_elasticity option has been deprecated and is always %u\n",
+ RHT_ELASTICITY);
- err = br_multicast_set_hash_max(br, hash_max);
- if (err)
- return err;
- }
+ if (data[IFLA_BR_MCAST_HASH_MAX])
+ br->hash_max = nla_get_u32(data[IFLA_BR_MCAST_HASH_MAX]);
if (data[IFLA_BR_MCAST_LAST_MEMBER_CNT]) {
u32 val = nla_get_u32(data[IFLA_BR_MCAST_LAST_MEMBER_CNT]);
- br->multicast_last_member_count = val;
+ br->multicast_ctx.multicast_last_member_count = val;
}
if (data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]) {
u32 val = nla_get_u32(data[IFLA_BR_MCAST_STARTUP_QUERY_CNT]);
- br->multicast_startup_query_count = val;
+ br->multicast_ctx.multicast_startup_query_count = val;
}
if (data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]) {
u64 val = nla_get_u64(data[IFLA_BR_MCAST_LAST_MEMBER_INTVL]);
- br->multicast_last_member_interval = clock_t_to_jiffies(val);
+ br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val);
}
if (data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]) {
u64 val = nla_get_u64(data[IFLA_BR_MCAST_MEMBERSHIP_INTVL]);
- br->multicast_membership_interval = clock_t_to_jiffies(val);
+ br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val);
}
if (data[IFLA_BR_MCAST_QUERIER_INTVL]) {
u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERIER_INTVL]);
- br->multicast_querier_interval = clock_t_to_jiffies(val);
+ br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val);
}
if (data[IFLA_BR_MCAST_QUERY_INTVL]) {
u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_INTVL]);
- br->multicast_query_interval = clock_t_to_jiffies(val);
+ br_multicast_set_query_intvl(&br->multicast_ctx, val);
}
if (data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]) {
u64 val = nla_get_u64(data[IFLA_BR_MCAST_QUERY_RESPONSE_INTVL]);
- br->multicast_query_response_interval = clock_t_to_jiffies(val);
+ br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val);
}
if (data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]) {
u64 val = nla_get_u64(data[IFLA_BR_MCAST_STARTUP_QUERY_INTVL]);
- br->multicast_startup_query_interval = clock_t_to_jiffies(val);
+ br_multicast_set_startup_query_intvl(&br->multicast_ctx, val);
}
if (data[IFLA_BR_MCAST_STATS_ENABLED]) {
__u8 mcast_stats;
mcast_stats = nla_get_u8(data[IFLA_BR_MCAST_STATS_ENABLED]);
- br->multicast_stats_enabled = !!mcast_stats;
+ br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!mcast_stats);
}
if (data[IFLA_BR_MCAST_IGMP_VERSION]) {
__u8 igmp_version;
igmp_version = nla_get_u8(data[IFLA_BR_MCAST_IGMP_VERSION]);
- err = br_multicast_set_igmp_version(br, igmp_version);
+ err = br_multicast_set_igmp_version(&br->multicast_ctx,
+ igmp_version);
if (err)
return err;
}
@@ -1261,7 +1508,8 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
__u8 mld_version;
mld_version = nla_get_u8(data[IFLA_BR_MCAST_MLD_VERSION]);
- err = br_multicast_set_mld_version(br, mld_version);
+ err = br_multicast_set_mld_version(&br->multicast_ctx,
+ mld_version);
if (err)
return err;
}
@@ -1271,30 +1519,47 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[],
if (data[IFLA_BR_NF_CALL_IPTABLES]) {
u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IPTABLES]);
- br->nf_call_iptables = val ? true : false;
+ br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val);
}
if (data[IFLA_BR_NF_CALL_IP6TABLES]) {
u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_IP6TABLES]);
- br->nf_call_ip6tables = val ? true : false;
+ br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val);
}
if (data[IFLA_BR_NF_CALL_ARPTABLES]) {
u8 val = nla_get_u8(data[IFLA_BR_NF_CALL_ARPTABLES]);
- br->nf_call_arptables = val ? true : false;
+ br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val);
}
#endif
+ if (data[IFLA_BR_MULTI_BOOLOPT]) {
+ struct br_boolopt_multi *bm;
+
+ bm = nla_data(data[IFLA_BR_MULTI_BOOLOPT]);
+ err = br_boolopt_multi_toggle(br, bm, extack);
+ if (err)
+ return err;
+ }
+
+ if (data[IFLA_BR_FDB_MAX_LEARNED]) {
+ u32 val = nla_get_u32(data[IFLA_BR_FDB_MAX_LEARNED]);
+
+ WRITE_ONCE(br->fdb_max_learned, val);
+ }
+
return 0;
}
-static int br_dev_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
+static int br_dev_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
struct net_bridge *br = netdev_priv(dev);
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
int err;
err = register_netdevice(dev);
@@ -1327,6 +1592,7 @@ static size_t br_get_size(const struct net_device *brdev)
nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */
nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */
nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_ENABLED */
+ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_PER_PORT */
#endif
nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */
nla_total_size(sizeof(struct ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */
@@ -1340,6 +1606,8 @@ static size_t br_get_size(const struct net_device *brdev)
nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_TOPOLOGY_CHANGE_TIMER */
nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_GC_TIMER */
nla_total_size(ETH_ALEN) + /* IFLA_BR_GROUP_ADDR */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_FDB_N_LEARNED */
+ nla_total_size(sizeof(u32)) + /* IFLA_BR_FDB_MAX_LEARNED */
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_ROUTER */
nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_SNOOPING */
@@ -1358,12 +1626,14 @@ static size_t br_get_size(const struct net_device *brdev)
nla_total_size_64bit(sizeof(u64)) + /* IFLA_BR_MCAST_STARTUP_QUERY_INTVL */
nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_IGMP_VERSION */
nla_total_size(sizeof(u8)) + /* IFLA_BR_MCAST_MLD_VERSION */
+ br_multicast_querier_state_size() + /* IFLA_BR_MCAST_QUERIER_STATE */
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IPTABLES */
nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_IP6TABLES */
nla_total_size(sizeof(u8)) + /* IFLA_BR_NF_CALL_ARPTABLES */
#endif
+ nla_total_size(sizeof(struct br_boolopt_multi)) + /* IFLA_BR_MULTI_BOOLOPT */
0;
}
@@ -1377,6 +1647,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
u32 stp_enabled = br->stp_enabled;
u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1];
u8 vlan_enabled = br_vlan_enabled(br->dev);
+ struct br_boolopt_multi bm;
u64 clockval;
clockval = br_timer_value(&br->hello_timer);
@@ -1393,6 +1664,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD))
return -EMSGSIZE;
+ br_boolopt_multi_get(br, &bm);
if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) ||
nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) ||
nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) ||
@@ -1410,70 +1682,81 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE, br->topology_change) ||
nla_put_u8(skb, IFLA_BR_TOPOLOGY_CHANGE_DETECTED,
br->topology_change_detected) ||
- nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr))
+ nla_put(skb, IFLA_BR_GROUP_ADDR, ETH_ALEN, br->group_addr) ||
+ nla_put(skb, IFLA_BR_MULTI_BOOLOPT, sizeof(bm), &bm) ||
+ nla_put_u32(skb, IFLA_BR_FDB_N_LEARNED,
+ atomic_read(&br->fdb_n_learned)) ||
+ nla_put_u32(skb, IFLA_BR_FDB_MAX_LEARNED, br->fdb_max_learned))
return -EMSGSIZE;
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) ||
nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) ||
- nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED, br->vlan_stats_enabled))
+ nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED,
+ br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) ||
+ nla_put_u8(skb, IFLA_BR_VLAN_STATS_PER_PORT,
+ br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)))
return -EMSGSIZE;
#endif
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
- if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER, br->multicast_router) ||
- nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING, !br->multicast_disabled) ||
+ if (nla_put_u8(skb, IFLA_BR_MCAST_ROUTER,
+ br->multicast_ctx.multicast_router) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_SNOOPING,
+ br_opt_get(br, BROPT_MULTICAST_ENABLED)) ||
nla_put_u8(skb, IFLA_BR_MCAST_QUERY_USE_IFADDR,
- br->multicast_query_use_ifaddr) ||
- nla_put_u8(skb, IFLA_BR_MCAST_QUERIER, br->multicast_querier) ||
+ br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)) ||
+ nla_put_u8(skb, IFLA_BR_MCAST_QUERIER,
+ br->multicast_ctx.multicast_querier) ||
nla_put_u8(skb, IFLA_BR_MCAST_STATS_ENABLED,
- br->multicast_stats_enabled) ||
- nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY,
- br->hash_elasticity) ||
+ br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)) ||
+ nla_put_u32(skb, IFLA_BR_MCAST_HASH_ELASTICITY, RHT_ELASTICITY) ||
nla_put_u32(skb, IFLA_BR_MCAST_HASH_MAX, br->hash_max) ||
nla_put_u32(skb, IFLA_BR_MCAST_LAST_MEMBER_CNT,
- br->multicast_last_member_count) ||
+ br->multicast_ctx.multicast_last_member_count) ||
nla_put_u32(skb, IFLA_BR_MCAST_STARTUP_QUERY_CNT,
- br->multicast_startup_query_count) ||
+ br->multicast_ctx.multicast_startup_query_count) ||
nla_put_u8(skb, IFLA_BR_MCAST_IGMP_VERSION,
- br->multicast_igmp_version))
+ br->multicast_ctx.multicast_igmp_version) ||
+ br_multicast_dump_querier_state(skb, &br->multicast_ctx,
+ IFLA_BR_MCAST_QUERIER_STATE))
return -EMSGSIZE;
#if IS_ENABLED(CONFIG_IPV6)
if (nla_put_u8(skb, IFLA_BR_MCAST_MLD_VERSION,
- br->multicast_mld_version))
+ br->multicast_ctx.multicast_mld_version))
return -EMSGSIZE;
#endif
- clockval = jiffies_to_clock_t(br->multicast_last_member_interval);
+ clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval);
if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_LAST_MEMBER_INTVL, clockval,
IFLA_BR_PAD))
return -EMSGSIZE;
- clockval = jiffies_to_clock_t(br->multicast_membership_interval);
+ clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval);
if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_MEMBERSHIP_INTVL, clockval,
IFLA_BR_PAD))
return -EMSGSIZE;
- clockval = jiffies_to_clock_t(br->multicast_querier_interval);
+ clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval);
if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERIER_INTVL, clockval,
IFLA_BR_PAD))
return -EMSGSIZE;
- clockval = jiffies_to_clock_t(br->multicast_query_interval);
+ clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval);
if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_INTVL, clockval,
IFLA_BR_PAD))
return -EMSGSIZE;
- clockval = jiffies_to_clock_t(br->multicast_query_response_interval);
+ clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval);
if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, clockval,
IFLA_BR_PAD))
return -EMSGSIZE;
- clockval = jiffies_to_clock_t(br->multicast_startup_query_interval);
+ clockval = jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval);
if (nla_put_u64_64bit(skb, IFLA_BR_MCAST_STARTUP_QUERY_INTVL, clockval,
IFLA_BR_PAD))
return -EMSGSIZE;
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
if (nla_put_u8(skb, IFLA_BR_NF_CALL_IPTABLES,
- br->nf_call_iptables ? 1 : 0) ||
+ br_opt_get(br, BROPT_NF_CALL_IPTABLES) ? 1 : 0) ||
nla_put_u8(skb, IFLA_BR_NF_CALL_IP6TABLES,
- br->nf_call_ip6tables ? 1 : 0) ||
+ br_opt_get(br, BROPT_NF_CALL_IP6TABLES) ? 1 : 0) ||
nla_put_u8(skb, IFLA_BR_NF_CALL_ARPTABLES,
- br->nf_call_arptables ? 1 : 0))
+ br_opt_get(br, BROPT_NF_CALL_ARPTABLES) ? 1 : 0))
return -EMSGSIZE;
#endif
@@ -1497,7 +1780,6 @@ static size_t br_get_linkxstats_size(const struct net_device *dev, int attr)
p = br_port_get_rtnl(dev);
if (!p)
return 0;
- br = p->br;
vg = nbp_vlan_group(p);
break;
default:
@@ -1511,7 +1793,8 @@ static size_t br_get_linkxstats_size(const struct net_device *dev, int attr)
}
return numvls * nla_total_size(sizeof(struct bridge_vlan_xstats)) +
- nla_total_size(sizeof(struct br_mcast_stats)) +
+ nla_total_size_64bit(sizeof(struct br_mcast_stats)) +
+ (p ? nla_total_size_64bit(sizeof(p->stp_xstats)) : 0) +
nla_total_size(0);
}
@@ -1543,7 +1826,7 @@ static int br_fill_linkxstats(struct sk_buff *skb,
return -EINVAL;
}
- nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BRIDGE);
+ nest = nla_nest_start_noflag(skb, LINK_XSTATS_TYPE_BRIDGE);
if (!nest)
return -EMSGSIZE;
@@ -1553,7 +1836,7 @@ static int br_fill_linkxstats(struct sk_buff *skb,
pvid = br_get_pvid(vg);
list_for_each_entry(v, &vg->vlan_list, vlist) {
struct bridge_vlan_xstats vxi;
- struct br_vlan_stats stats;
+ struct pcpu_sw_netstats stats;
if (++vl_idx < *prividx)
continue;
@@ -1563,10 +1846,10 @@ static int br_fill_linkxstats(struct sk_buff *skb,
if (v->vid == pvid)
vxi.flags |= BRIDGE_VLAN_INFO_PVID;
br_vlan_get_stats(v, &stats);
- vxi.rx_bytes = stats.rx_bytes;
- vxi.rx_packets = stats.rx_packets;
- vxi.tx_bytes = stats.tx_bytes;
- vxi.tx_packets = stats.tx_packets;
+ vxi.rx_bytes = u64_stats_read(&stats.rx_bytes);
+ vxi.rx_packets = u64_stats_read(&stats.rx_packets);
+ vxi.tx_bytes = u64_stats_read(&stats.tx_bytes);
+ vxi.tx_packets = u64_stats_read(&stats.tx_packets);
if (nla_put(skb, BRIDGE_XSTATS_VLAN, sizeof(vxi), &vxi))
goto nla_put_failure;
@@ -1583,6 +1866,19 @@ static int br_fill_linkxstats(struct sk_buff *skb,
br_multicast_get_stats(br, p, nla_data(nla));
}
#endif
+
+ if (p) {
+ nla = nla_reserve_64bit(skb, BRIDGE_XSTATS_STP,
+ sizeof(p->stp_xstats),
+ BRIDGE_XSTATS_PAD);
+ if (!nla)
+ goto nla_put_failure;
+
+ spin_lock_bh(&br->lock);
+ memcpy(nla_data(nla), &p->stp_xstats, sizeof(p->stp_xstats));
+ spin_unlock_bh(&br->lock);
+ }
+
nla_nest_end(skb, nest);
*prividx = 0;
@@ -1626,8 +1922,13 @@ int __init br_netlink_init(void)
{
int err;
- br_mdb_init();
- rtnl_af_register(&br_af_ops);
+ err = br_vlan_rtnl_init();
+ if (err)
+ goto out;
+
+ err = rtnl_af_register(&br_af_ops);
+ if (err)
+ goto out_vlan;
err = rtnl_link_register(&br_link_ops);
if (err)
@@ -1637,13 +1938,15 @@ int __init br_netlink_init(void)
out_af:
rtnl_af_unregister(&br_af_ops);
- br_mdb_uninit();
+out_vlan:
+ br_vlan_rtnl_uninit();
+out:
return err;
}
void br_netlink_fini(void)
{
- br_mdb_uninit();
+ br_vlan_rtnl_uninit();
rtnl_af_unregister(&br_af_ops);
rtnl_link_unregister(&br_link_ops);
}
diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c
index da8cb99fd259..71a12da30004 100644
--- a/net/bridge/br_netlink_tunnel.c
+++ b/net/bridge/br_netlink_tunnel.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Bridge per vlan tunnel port dst_metadata netlink control interface
*
* Authors:
* Roopa Prabhu <roopa@cumulusnetworks.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -30,8 +26,8 @@ static size_t __get_vlan_tinfo_size(void)
nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_VLAN_TUNNEL_FLAGS */
}
-static bool vlan_tunid_inrange(struct net_bridge_vlan *v_curr,
- struct net_bridge_vlan *v_last)
+bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *v_last)
{
__be32 tunid_curr = tunnel_id_to_key32(v_curr->tinfo.tunnel_id);
__be32 tunid_last = tunnel_id_to_key32(v_last->tinfo.tunnel_id);
@@ -97,7 +93,7 @@ static int br_fill_vlan_tinfo(struct sk_buff *skb, u16 vid,
__be32 tid = tunnel_id_to_key32(tunnel_id);
struct nlattr *tmap;
- tmap = nla_nest_start(skb, IFLA_BRIDGE_VLAN_TUNNEL_INFO);
+ tmap = nla_nest_start_noflag(skb, IFLA_BRIDGE_VLAN_TUNNEL_INFO);
if (!tmap)
return -EMSGSIZE;
if (nla_put_u32(skb, IFLA_BRIDGE_VLAN_TUNNEL_ID,
@@ -192,13 +188,16 @@ initvars:
}
static const struct nla_policy vlan_tunnel_policy[IFLA_BRIDGE_VLAN_TUNNEL_MAX + 1] = {
+ [IFLA_BRIDGE_VLAN_TUNNEL_UNSPEC] = {
+ .strict_start_type = IFLA_BRIDGE_VLAN_TUNNEL_FLAGS + 1
+ },
[IFLA_BRIDGE_VLAN_TUNNEL_ID] = { .type = NLA_U32 },
[IFLA_BRIDGE_VLAN_TUNNEL_VID] = { .type = NLA_U16 },
[IFLA_BRIDGE_VLAN_TUNNEL_FLAGS] = { .type = NLA_U16 },
};
-static int br_vlan_tunnel_info(struct net_bridge_port *p, int cmd,
- u16 vid, u32 tun_id, bool *changed)
+int br_vlan_tunnel_info(const struct net_bridge_port *p, int cmd,
+ u16 vid, u32 tun_id, bool *changed)
{
int err = 0;
@@ -230,8 +229,8 @@ int br_parse_vlan_tunnel_info(struct nlattr *attr,
memset(tinfo, 0, sizeof(*tinfo));
- err = nla_parse_nested(tb, IFLA_BRIDGE_VLAN_TUNNEL_MAX, attr,
- vlan_tunnel_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, IFLA_BRIDGE_VLAN_TUNNEL_MAX,
+ attr, vlan_tunnel_policy, NULL);
if (err < 0)
return err;
@@ -254,8 +253,38 @@ int br_parse_vlan_tunnel_info(struct nlattr *attr,
return 0;
}
-int br_process_vlan_tunnel_info(struct net_bridge *br,
- struct net_bridge_port *p, int cmd,
+/* send a notification if v_curr can't enter the range and start a new one */
+static void __vlan_tunnel_handle_range(const struct net_bridge_port *p,
+ struct net_bridge_vlan **v_start,
+ struct net_bridge_vlan **v_end,
+ int v_curr, bool curr_change)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+
+ vg = nbp_vlan_group(p);
+ if (!vg)
+ return;
+
+ v = br_vlan_find(vg, v_curr);
+
+ if (!*v_start)
+ goto out_init;
+
+ if (v && curr_change && br_vlan_can_enter_range(v, *v_end)) {
+ *v_end = v;
+ return;
+ }
+
+ br_vlan_notify(p->br, p, (*v_start)->vid, (*v_end)->vid, RTM_NEWVLAN);
+out_init:
+ /* we start a range only if there are any changes to notify about */
+ *v_start = curr_change ? v : NULL;
+ *v_end = *v_start;
+}
+
+int br_process_vlan_tunnel_info(const struct net_bridge *br,
+ const struct net_bridge_port *p, int cmd,
struct vtunnel_info *tinfo_curr,
struct vtunnel_info *tinfo_last,
bool *changed)
@@ -267,6 +296,7 @@ int br_process_vlan_tunnel_info(struct net_bridge *br,
return -EINVAL;
memcpy(tinfo_last, tinfo_curr, sizeof(struct vtunnel_info));
} else if (tinfo_curr->flags & BRIDGE_VLAN_INFO_RANGE_END) {
+ struct net_bridge_vlan *v_start = NULL, *v_end = NULL;
int t, v;
if (!(tinfo_last->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN))
@@ -276,11 +306,24 @@ int br_process_vlan_tunnel_info(struct net_bridge *br,
return -EINVAL;
t = tinfo_last->tunid;
for (v = tinfo_last->vid; v <= tinfo_curr->vid; v++) {
- err = br_vlan_tunnel_info(p, cmd, v, t, changed);
+ bool curr_change = false;
+
+ err = br_vlan_tunnel_info(p, cmd, v, t, &curr_change);
if (err)
- return err;
+ break;
t++;
+
+ if (curr_change)
+ *changed = curr_change;
+ __vlan_tunnel_handle_range(p, &v_start, &v_end, v,
+ curr_change);
}
+ if (v_start && v_end)
+ br_vlan_notify(br, p, v_start->vid, v_end->vid,
+ RTM_NEWVLAN);
+ if (err)
+ return err;
+
memset(tinfo_last, 0, sizeof(struct vtunnel_info));
memset(tinfo_curr, 0, sizeof(struct vtunnel_info));
} else {
@@ -290,6 +333,7 @@ int br_process_vlan_tunnel_info(struct net_bridge *br,
tinfo_curr->tunid, changed);
if (err)
return err;
+ br_vlan_notify(br, p, tinfo_curr->vid, 0, RTM_NEWVLAN);
memset(tinfo_last, 0, sizeof(struct vtunnel_info));
memset(tinfo_curr, 0, sizeof(struct vtunnel_info));
}
diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c
index 8e2d7cfa4e16..a8c67035e23c 100644
--- a/net/bridge/br_nf_core.c
+++ b/net/bridge/br_nf_core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Handle firewalling core
* Linux ethernet bridge
@@ -6,11 +7,6 @@
* Lennert Buytenhek <buytenh@gnu.org>
* Bart De Schuymer <bdschuym@pandora.be>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Lennert dedicates this file to Kerstin Wurdinger.
*/
@@ -26,7 +22,8 @@
#endif
static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
}
@@ -68,17 +65,14 @@ static struct dst_ops fake_dst_ops = {
* ipt_REJECT needs it. Future netfilter modules might
* require us to fill additional fields.
*/
-static const u32 br_dst_default_metrics[RTAX_MAX] = {
- [RTAX_MTU - 1] = 1500,
-};
-
void br_netfilter_rtable_init(struct net_bridge *br)
{
struct rtable *rt = &br->fake_rtable;
- atomic_set(&rt->dst.__refcnt, 1);
+ rcuref_init(&rt->dst.__rcuref, 1);
rt->dst.dev = br->dev;
- dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
+ dst_init_metrics(&rt->dst, br->metrics, false);
+ dst_metric_set(&rt->dst, RTAX_MTU, br->dev->mtu);
rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE;
rt->dst.ops = &fake_dst_ops;
}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 11ed2029985f..7280c4e9305f 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -1,13 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#ifndef _BR_PRIVATE_H
@@ -19,6 +15,7 @@
#include <linux/u64_stats_sync.h>
#include <net/route.h>
#include <net/ip6_fib.h>
+#include <net/pkt_cls.h>
#include <linux/if_vlan.h>
#include <linux/rhashtable.h>
#include <linux/refcount.h>
@@ -31,6 +28,14 @@
#define BR_PORT_BITS 10
#define BR_MAX_PORTS (1<<BR_PORT_BITS)
+#define BR_MULTICAST_DEFAULT_HASH_MAX 4096
+#define BR_MULTICAST_QUERY_INTVL_MIN msecs_to_jiffies(1000)
+#define BR_MULTICAST_STARTUP_QUERY_INTVL_MIN BR_MULTICAST_QUERY_INTVL_MIN
+#define BR_MULTICAST_QUERY_INTVL_MAX msecs_to_jiffies(86400000) /* 24 hours */
+#define BR_MULTICAST_STARTUP_QUERY_INTVL_MAX BR_MULTICAST_QUERY_INTVL_MAX
+
+#define BR_HWDOM_MAX BITS_PER_LONG
+
#define BR_VERSION "2.3"
/* Control of forwarding link local multicast */
@@ -50,18 +55,18 @@ enum {
/* Path to usermode spanning tree program */
#define BR_STP_PROG "/sbin/bridge-stp"
+#define BR_FDB_NOTIFY_SETTABLE_BITS (FDB_NOTIFY_BIT | FDB_NOTIFY_INACTIVE_BIT)
+
typedef struct bridge_id bridge_id;
typedef struct mac_addr mac_addr;
typedef __u16 port_id;
-struct bridge_id
-{
+struct bridge_id {
unsigned char prio[2];
unsigned char addr[ETH_ALEN];
};
-struct mac_addr
-{
+struct mac_addr {
unsigned char addr[ETH_ALEN];
};
@@ -75,13 +80,14 @@ struct bridge_mcast_own_query {
/* other querier */
struct bridge_mcast_other_query {
struct timer_list timer;
- unsigned long delay_time;
+ struct timer_list delay_timer;
};
/* selected querier */
struct bridge_mcast_querier {
struct br_ip addr;
- struct net_bridge_port __rcu *port;
+ int port_ifidx;
+ seqcount_spinlock_t seq;
};
/* IGMP/MLD statistics */
@@ -89,33 +95,115 @@ struct bridge_mcast_stats {
struct br_mcast_stats mstats;
struct u64_stats_sync syncp;
};
+
+struct br_mdb_src_entry {
+ struct br_ip addr;
+};
+
+struct br_mdb_config {
+ struct net_bridge *br;
+ struct net_bridge_port *p;
+ struct br_mdb_entry *entry;
+ struct br_ip group;
+ bool src_entry;
+ u8 filter_mode;
+ u16 nlflags;
+ struct br_mdb_src_entry *src_entries;
+ int num_src_entries;
+ u8 rt_protocol;
+};
#endif
-struct br_vlan_stats {
- u64 rx_bytes;
- u64 rx_packets;
- u64 tx_bytes;
- u64 tx_packets;
- struct u64_stats_sync syncp;
+/* net_bridge_mcast_port must be always defined due to forwarding stubs */
+struct net_bridge_mcast_port {
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ struct net_bridge_port *port;
+ struct net_bridge_vlan *vlan;
+
+ struct bridge_mcast_own_query ip4_own_query;
+ struct timer_list ip4_mc_router_timer;
+ struct hlist_node ip4_rlist;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct bridge_mcast_own_query ip6_own_query;
+ struct timer_list ip6_mc_router_timer;
+ struct hlist_node ip6_rlist;
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+ unsigned char multicast_router;
+ u32 mdb_n_entries;
+ u32 mdb_max_entries;
+#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */
+};
+
+/* net_bridge_mcast must be always defined due to forwarding stubs */
+struct net_bridge_mcast {
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ struct net_bridge *br;
+ struct net_bridge_vlan *vlan;
+
+ u32 multicast_last_member_count;
+ u32 multicast_startup_query_count;
+
+ u8 multicast_querier;
+ u8 multicast_igmp_version;
+ u8 multicast_router;
+#if IS_ENABLED(CONFIG_IPV6)
+ u8 multicast_mld_version;
+#endif
+ unsigned long multicast_last_member_interval;
+ unsigned long multicast_membership_interval;
+ unsigned long multicast_querier_interval;
+ unsigned long multicast_query_interval;
+ unsigned long multicast_query_response_interval;
+ unsigned long multicast_startup_query_interval;
+ struct hlist_head ip4_mc_router_list;
+ struct timer_list ip4_mc_router_timer;
+ struct bridge_mcast_other_query ip4_other_query;
+ struct bridge_mcast_own_query ip4_own_query;
+ struct bridge_mcast_querier ip4_querier;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct hlist_head ip6_mc_router_list;
+ struct timer_list ip6_mc_router_timer;
+ struct bridge_mcast_other_query ip6_other_query;
+ struct bridge_mcast_own_query ip6_own_query;
+ struct bridge_mcast_querier ip6_querier;
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */
};
struct br_tunnel_info {
- __be64 tunnel_id;
- struct metadata_dst *tunnel_dst;
+ __be64 tunnel_id;
+ struct metadata_dst __rcu *tunnel_dst;
+};
+
+/* private vlan flags */
+enum {
+ BR_VLFLAG_PER_PORT_STATS = BIT(0),
+ BR_VLFLAG_ADDED_BY_SWITCHDEV = BIT(1),
+ BR_VLFLAG_MCAST_ENABLED = BIT(2),
+ BR_VLFLAG_GLOBAL_MCAST_ENABLED = BIT(3),
+ BR_VLFLAG_NEIGH_SUPPRESS_ENABLED = BIT(4),
};
/**
* struct net_bridge_vlan - per-vlan entry
*
* @vnode: rhashtable member
+ * @tnode: rhashtable member
* @vid: VLAN id
* @flags: bridge vlan flags
+ * @priv_flags: private (in-kernel) bridge vlan flags
+ * @state: STP state (e.g. blocking, learning, forwarding)
* @stats: per-cpu VLAN statistics
* @br: if MASTER flag set, this points to a bridge struct
* @port: if MASTER flag unset, this points to a port struct
* @refcnt: if MASTER flag set, this is bumped for each port referencing it
* @brvlan: if MASTER flag unset, this points to the global per-VLAN context
* for this VLAN entry
+ * @tinfo: bridge tunnel info
+ * @br_mcast_ctx: if MASTER flag set, this is the global vlan multicast context
+ * @port_mcast_ctx: if MASTER flag unset, this is the per-port/vlan multicast
+ * context
+ * @msti: if MASTER flag set, this holds the VLAN's MST instance
* @vlist: sorted list of VLAN entries
* @rcu: used for entry destruction
*
@@ -129,7 +217,9 @@ struct net_bridge_vlan {
struct rhash_head tnode;
u16 vid;
u16 flags;
- struct br_vlan_stats __percpu *stats;
+ u16 priv_flags;
+ u8 state;
+ struct pcpu_sw_netstats __percpu *stats;
union {
struct net_bridge *br;
struct net_bridge_port *port;
@@ -141,6 +231,13 @@ struct net_bridge_vlan {
struct br_tunnel_info tinfo;
+ union {
+ struct net_bridge_mcast br_mcast_ctx;
+ struct net_bridge_mcast_port port_mcast_ctx;
+ };
+
+ u16 msti;
+
struct list_head vlist;
struct rcu_head rcu;
@@ -153,6 +250,7 @@ struct net_bridge_vlan {
* @vlan_list: sorted VLAN entry list
* @num_vlans: number of total VLAN entries
* @pvid: PVID VLAN id
+ * @pvid_state: PVID's STP state (e.g. forwarding, learning, blocking)
*
* IMPORTANT: Be careful when checking if there're VLAN entries using list
* primitives because the bridge can have entries in its list which
@@ -166,6 +264,21 @@ struct net_bridge_vlan_group {
struct list_head vlan_list;
u16 num_vlans;
u16 pvid;
+ u8 pvid_state;
+};
+
+/* bridge fdb flags */
+enum {
+ BR_FDB_LOCAL,
+ BR_FDB_STATIC,
+ BR_FDB_STICKY,
+ BR_FDB_ADDED_BY_USER,
+ BR_FDB_ADDED_BY_EXT_LEARN,
+ BR_FDB_OFFLOADED,
+ BR_FDB_NOTIFY,
+ BR_FDB_NOTIFY_INACTIVE,
+ BR_FDB_LOCKED,
+ BR_FDB_DYNAMIC_LEARNED,
};
struct net_bridge_fdb_key {
@@ -179,11 +292,7 @@ struct net_bridge_fdb_entry {
struct net_bridge_fdb_key key;
struct hlist_node fdb_node;
- unsigned char is_local:1,
- is_static:1,
- added_by_user:1,
- added_by_external_learn:1,
- offloaded:1;
+ unsigned long flags;
/* write-heavy members should not affect lookups */
unsigned long updated ____cacheline_aligned_in_smp;
@@ -192,45 +301,91 @@ struct net_bridge_fdb_entry {
struct rcu_head rcu;
};
-#define MDB_PG_FLAGS_PERMANENT BIT(0)
-#define MDB_PG_FLAGS_OFFLOAD BIT(1)
+struct net_bridge_fdb_flush_desc {
+ unsigned long flags;
+ unsigned long flags_mask;
+ int port_ifindex;
+ u16 vlan_id;
+};
-struct net_bridge_port_group {
+#define MDB_PG_FLAGS_PERMANENT BIT(0)
+#define MDB_PG_FLAGS_OFFLOAD BIT(1)
+#define MDB_PG_FLAGS_FAST_LEAVE BIT(2)
+#define MDB_PG_FLAGS_STAR_EXCL BIT(3)
+#define MDB_PG_FLAGS_BLOCKED BIT(4)
+#define MDB_PG_FLAGS_OFFLOAD_FAILED BIT(5)
+
+#define PG_SRC_ENT_LIMIT 32
+
+#define BR_SGRP_F_DELETE BIT(0)
+#define BR_SGRP_F_SEND BIT(1)
+#define BR_SGRP_F_INSTALLED BIT(2)
+#define BR_SGRP_F_USER_ADDED BIT(3)
+
+struct net_bridge_mcast_gc {
+ struct hlist_node gc_node;
+ void (*destroy)(struct net_bridge_mcast_gc *gc);
+};
+
+struct net_bridge_group_src {
+ struct hlist_node node;
+
+ struct br_ip addr;
+ struct net_bridge_port_group *pg;
+ u8 flags;
+ u8 src_query_rexmit_cnt;
+ struct timer_list timer;
+
+ struct net_bridge *br;
+ struct net_bridge_mcast_gc mcast_gc;
+ struct rcu_head rcu;
+};
+
+struct net_bridge_port_group_sg_key {
struct net_bridge_port *port;
+ struct br_ip addr;
+};
+
+struct net_bridge_port_group {
struct net_bridge_port_group __rcu *next;
+ struct net_bridge_port_group_sg_key key;
+ unsigned char eth_addr[ETH_ALEN] __aligned(2);
+ unsigned char flags;
+ unsigned char filter_mode;
+ unsigned char grp_query_rexmit_cnt;
+ unsigned char rt_protocol;
+
+ struct hlist_head src_list;
+ unsigned int src_ents;
+ struct timer_list timer;
+ struct timer_list rexmit_timer;
struct hlist_node mglist;
+ struct rb_root eht_set_tree;
+ struct rb_root eht_host_tree;
+
+ struct rhash_head rhnode;
+ struct net_bridge_mcast_gc mcast_gc;
struct rcu_head rcu;
- struct timer_list timer;
- struct br_ip addr;
- unsigned char flags;
- unsigned char eth_addr[ETH_ALEN];
};
-struct net_bridge_mdb_entry
-{
- struct hlist_node hlist[2];
+struct net_bridge_mdb_entry {
+ struct rhash_head rhnode;
struct net_bridge *br;
struct net_bridge_port_group __rcu *ports;
- struct rcu_head rcu;
- struct timer_list timer;
struct br_ip addr;
bool host_joined;
-};
-struct net_bridge_mdb_htable
-{
- struct hlist_head *mhash;
+ struct timer_list timer;
+ struct hlist_node mdb_node;
+
+ struct net_bridge_mcast_gc mcast_gc;
struct rcu_head rcu;
- struct net_bridge_mdb_htable *old;
- u32 size;
- u32 max;
- u32 secret;
- u32 ver;
};
struct net_bridge_port {
struct net_bridge *br;
struct net_device *dev;
+ netdevice_tracker dev_tracker;
struct list_head list;
unsigned long flags;
@@ -238,6 +393,7 @@ struct net_bridge_port {
struct net_bridge_vlan_group __rcu *vlgrp;
#endif
struct net_bridge_port __rcu *backup_port;
+ u32 backup_nhid;
/* STP */
u8 priority;
@@ -259,16 +415,14 @@ struct net_bridge_port {
struct kobject kobj;
struct rcu_head rcu;
+ struct net_bridge_mcast_port multicast_ctx;
+
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
- struct bridge_mcast_own_query ip4_own_query;
-#if IS_ENABLED(CONFIG_IPV6)
- struct bridge_mcast_own_query ip6_own_query;
-#endif /* IS_ENABLED(CONFIG_IPV6) */
- unsigned char multicast_router;
struct bridge_mcast_stats __percpu *mcast_stats;
- struct timer_list multicast_router_timer;
+
+ u32 multicast_eht_hosts_limit;
+ u32 multicast_eht_hosts_cnt;
struct hlist_head mglist;
- struct hlist_node rlist;
#endif
#ifdef CONFIG_SYSFS
@@ -279,10 +433,17 @@ struct net_bridge_port {
struct netpoll *np;
#endif
#ifdef CONFIG_NET_SWITCHDEV
- int offload_fwd_mark;
+ /* Identifier used to group ports that share the same switchdev
+ * hardware domain.
+ */
+ int hwdom;
+ int offload_count;
+ struct netdev_phys_item_id ppid;
#endif
u16 group_fwd_mask;
u16 backup_redirected_cnt;
+
+ struct bridge_stp_xstats stp_xstats;
};
#define kobj_to_brport(obj) container_of(obj, struct net_bridge_port, kobj)
@@ -290,8 +451,6 @@ struct net_bridge_port {
#define br_auto_port(p) ((p)->flags & BR_AUTO_MASK)
#define br_promisc_port(p) ((p)->flags & BR_PROMISC)
-#define br_port_exists(dev) (dev->priv_flags & IFF_BRIDGE_PORT)
-
static inline struct net_bridge_port *br_port_get_rcu(const struct net_device *dev)
{
return rcu_dereference(dev->rx_handler_data);
@@ -299,40 +458,59 @@ static inline struct net_bridge_port *br_port_get_rcu(const struct net_device *d
static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device *dev)
{
- return br_port_exists(dev) ?
+ return netif_is_bridge_port(dev) ?
rtnl_dereference(dev->rx_handler_data) : NULL;
}
static inline struct net_bridge_port *br_port_get_rtnl_rcu(const struct net_device *dev)
{
- return br_port_exists(dev) ?
+ return netif_is_bridge_port(dev) ?
rcu_dereference_rtnl(dev->rx_handler_data) : NULL;
}
+enum net_bridge_opts {
+ BROPT_VLAN_ENABLED,
+ BROPT_VLAN_STATS_ENABLED,
+ BROPT_NF_CALL_IPTABLES,
+ BROPT_NF_CALL_IP6TABLES,
+ BROPT_NF_CALL_ARPTABLES,
+ BROPT_GROUP_ADDR_SET,
+ BROPT_MULTICAST_ENABLED,
+ BROPT_MULTICAST_QUERY_USE_IFADDR,
+ BROPT_MULTICAST_STATS_ENABLED,
+ BROPT_HAS_IPV6_ADDR,
+ BROPT_NEIGH_SUPPRESS_ENABLED,
+ BROPT_MTU_SET_BY_USER,
+ BROPT_VLAN_STATS_PER_PORT,
+ BROPT_NO_LL_LEARN,
+ BROPT_VLAN_BRIDGE_BINDING,
+ BROPT_MCAST_VLAN_SNOOPING_ENABLED,
+ BROPT_MST_ENABLED,
+ BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION,
+ BROPT_FDB_LOCAL_VLAN_0,
+};
+
struct net_bridge {
spinlock_t lock;
spinlock_t hash_lock;
- struct list_head port_list;
+ struct hlist_head frame_type_list;
struct net_device *dev;
- struct pcpu_sw_netstats __percpu *stats;
+ unsigned long options;
/* These fields are accessed on each packet */
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
- u8 vlan_enabled;
- u8 vlan_stats_enabled;
__be16 vlan_proto;
u16 default_pvid;
struct net_bridge_vlan_group __rcu *vlgrp;
#endif
struct rhashtable fdb_hash_tbl;
+ struct list_head port_list;
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
union {
struct rtable fake_rtable;
struct rt6_info fake_rt6_info;
};
- bool nf_call_iptables;
- bool nf_call_ip6tables;
- bool nf_call_arptables;
+ u32 metrics[RTAX_MAX];
#endif
u16 group_fwd_mask;
u16 group_fwd_mask_required;
@@ -340,7 +518,6 @@ struct net_bridge {
/* STP */
bridge_id designated_root;
bridge_id bridge_id;
- u32 root_path_cost;
unsigned char topology_change;
unsigned char topology_change_detected;
u16 root_port;
@@ -352,9 +529,9 @@ struct net_bridge {
unsigned long bridge_hello_time;
unsigned long bridge_forward_delay;
unsigned long bridge_ageing_time;
+ u32 root_path_cost;
u8 group_addr[ETH_ALEN];
- bool group_addr_set;
enum {
BR_NO_STP, /* no spanning tree */
@@ -362,45 +539,22 @@ struct net_bridge {
BR_USER_STP, /* new RSTP in userspace */
} stp_enabled;
-#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
- unsigned char multicast_router;
+ struct net_bridge_mcast multicast_ctx;
- u8 multicast_disabled:1;
- u8 multicast_querier:1;
- u8 multicast_query_use_ifaddr:1;
- u8 has_ipv6_addr:1;
- u8 multicast_stats_enabled:1;
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ struct bridge_mcast_stats __percpu *mcast_stats;
- u32 hash_elasticity;
u32 hash_max;
- u32 multicast_last_member_count;
- u32 multicast_startup_query_count;
-
- u8 multicast_igmp_version;
+ spinlock_t multicast_lock;
- unsigned long multicast_last_member_interval;
- unsigned long multicast_membership_interval;
- unsigned long multicast_querier_interval;
- unsigned long multicast_query_interval;
- unsigned long multicast_query_response_interval;
- unsigned long multicast_startup_query_interval;
+ struct rhashtable mdb_hash_tbl;
+ struct rhashtable sg_port_tbl;
- spinlock_t multicast_lock;
- struct net_bridge_mdb_htable __rcu *mdb;
- struct hlist_head router_list;
+ struct hlist_head mcast_gc_list;
+ struct hlist_head mdb_list;
- struct timer_list multicast_router_timer;
- struct bridge_mcast_other_query ip4_other_query;
- struct bridge_mcast_own_query ip4_own_query;
- struct bridge_mcast_querier ip4_querier;
- struct bridge_mcast_stats __percpu *mcast_stats;
-#if IS_ENABLED(CONFIG_IPV6)
- struct bridge_mcast_other_query ip6_other_query;
- struct bridge_mcast_own_query ip6_own_query;
- struct bridge_mcast_querier ip6_querier;
- u8 multicast_mld_version;
-#endif /* IS_ENABLED(CONFIG_IPV6) */
+ struct work_struct mcast_gc_work;
#endif
struct timer_list hello_timer;
@@ -410,32 +564,63 @@ struct net_bridge {
struct kobject *ifobj;
u32 auto_cnt;
+ atomic_t fdb_n_learned;
+ u32 fdb_max_learned;
+
#ifdef CONFIG_NET_SWITCHDEV
- int offload_fwd_mark;
+ /* Counter used to make sure that hardware domains get unique
+ * identifiers in case a bridge spans multiple switchdev instances.
+ */
+ int last_hwdom;
+ /* Bit mask of hardware domain numbers in use */
+ unsigned long busy_hwdoms;
#endif
- bool neigh_suppress_enabled;
- bool mtu_set_by_user;
struct hlist_head fdb_list;
+
+#if IS_ENABLED(CONFIG_BRIDGE_MRP)
+ struct hlist_head mrp_list;
+#endif
+#if IS_ENABLED(CONFIG_BRIDGE_CFM)
+ struct hlist_head mep_list;
+#endif
};
struct br_input_skb_cb {
struct net_device *brdev;
+ u16 frag_max_size;
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
- int igmp;
- int mrouters_only;
+ u8 igmp;
+ u8 mrouters_only:1;
#endif
-
- bool proxyarp_replied;
- bool src_port_isolated;
-
+ u8 proxyarp_replied:1;
+ u8 src_port_isolated:1;
+ u8 promisc:1;
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
- bool vlan_filtered;
+ u8 vlan_filtered:1;
+#endif
+#ifdef CONFIG_NETFILTER_FAMILY_BRIDGE
+ u8 br_netfilter_broute:1;
#endif
#ifdef CONFIG_NET_SWITCHDEV
- int offload_fwd_mark;
+ /* Set if TX data plane offloading is used towards at least one
+ * hardware domain.
+ */
+ u8 tx_fwd_offload:1;
+ /* The switchdev hardware domain from which this packet was received.
+ * If skb->offload_fwd_mark was set, then this packet was already
+ * forwarded by hardware to the other ports in the source hardware
+ * domain, otherwise it wasn't.
+ */
+ int src_hwdom;
+ /* Bit mask of hardware domains towards which this packet has already
+ * been transmitted using the TX data plane offload.
+ */
+ unsigned long fwd_hwdoms;
#endif
+
+ u32 backup_nhid;
};
#define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb)
@@ -492,6 +677,126 @@ static inline bool br_vlan_should_use(const struct net_bridge_vlan *v)
return true;
}
+static inline bool nbp_state_should_learn(const struct net_bridge_port *p)
+{
+ return p->state == BR_STATE_LEARNING || p->state == BR_STATE_FORWARDING;
+}
+
+static inline bool br_vlan_valid_id(u16 vid, struct netlink_ext_ack *extack)
+{
+ bool ret = vid > 0 && vid < VLAN_VID_MASK;
+
+ if (!ret)
+ NL_SET_ERR_MSG_MOD(extack, "Vlan id is invalid");
+
+ return ret;
+}
+
+static inline bool br_vlan_valid_range(const struct bridge_vlan_info *cur,
+ const struct bridge_vlan_info *last,
+ struct netlink_ext_ack *extack)
+{
+ /* pvid flag is not allowed in ranges */
+ if (cur->flags & BRIDGE_VLAN_INFO_PVID) {
+ NL_SET_ERR_MSG_MOD(extack, "Pvid isn't allowed in a range");
+ return false;
+ }
+
+ /* when cur is the range end, check if:
+ * - it has range start flag or is missing the range end flag
+ * - range ids are invalid (end is equal to or before start)
+ */
+ if (last) {
+ if (cur->flags & BRIDGE_VLAN_INFO_RANGE_BEGIN) {
+ NL_SET_ERR_MSG_MOD(extack, "Found a new vlan range start while processing one");
+ return false;
+ } else if (!(cur->flags & BRIDGE_VLAN_INFO_RANGE_END)) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan range end flag is missing");
+ return false;
+ } else if (cur->vid <= last->vid) {
+ NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id");
+ return false;
+ }
+ }
+
+ /* check for required range flags */
+ if (!(cur->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN |
+ BRIDGE_VLAN_INFO_RANGE_END))) {
+ NL_SET_ERR_MSG_MOD(extack, "Both vlan range flags are missing");
+ return false;
+ }
+
+ return true;
+}
+
+static inline u8 br_vlan_multicast_router(const struct net_bridge_vlan *v)
+{
+ u8 mcast_router = MDB_RTR_TYPE_DISABLED;
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (!br_vlan_is_master(v))
+ mcast_router = v->port_mcast_ctx.multicast_router;
+ else
+ mcast_router = v->br_mcast_ctx.multicast_router;
+#endif
+
+ return mcast_router;
+}
+
+static inline int br_afspec_cmd_to_rtm(int cmd)
+{
+ switch (cmd) {
+ case RTM_SETLINK:
+ return RTM_NEWVLAN;
+ case RTM_DELLINK:
+ return RTM_DELVLAN;
+ }
+
+ return 0;
+}
+
+static inline int br_opt_get(const struct net_bridge *br,
+ enum net_bridge_opts opt)
+{
+ return test_bit(opt, &br->options);
+}
+
+int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on,
+ struct netlink_ext_ack *extack);
+int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt);
+int br_boolopt_multi_toggle(struct net_bridge *br,
+ struct br_boolopt_multi *bm,
+ struct netlink_ext_ack *extack);
+void br_boolopt_multi_get(const struct net_bridge *br,
+ struct br_boolopt_multi *bm);
+void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on);
+
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+static inline void br_tc_skb_miss_set(struct sk_buff *skb, bool miss)
+{
+ struct tc_skb_ext *ext;
+
+ if (!tc_skb_ext_tc_enabled())
+ return;
+
+ ext = skb_ext_find(skb, TC_SKB_EXT);
+ if (ext) {
+ ext->l2_miss = miss;
+ return;
+ }
+ if (!miss)
+ return;
+ ext = tc_skb_ext_alloc(skb);
+ if (!ext)
+ return;
+ ext->l2_miss = true;
+}
+#else
+static inline void br_tc_skb_miss_set(struct sk_buff *skb, bool miss)
+{
+}
+#endif
+
/* br_device.c */
void br_dev_setup(struct net_device *dev);
void br_dev_delete(struct net_device *dev, struct list_head *list);
@@ -500,10 +805,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev);
static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
struct sk_buff *skb)
{
- struct netpoll *np = p->np;
-
- if (np)
- netpoll_send_skb(np, skb);
+ netpoll_send_skb(p->np, skb);
}
int br_netpoll_enable(struct net_bridge_port *p);
@@ -525,17 +827,25 @@ static inline void br_netpoll_disable(struct net_bridge_port *p)
#endif
/* br_fdb.c */
+#define FDB_FLUSH_IGNORED_NDM_FLAGS (NTF_MASTER | NTF_SELF)
+#define FDB_FLUSH_ALLOWED_NDM_STATES (NUD_PERMANENT | NUD_NOARP)
+#define FDB_FLUSH_ALLOWED_NDM_FLAGS (NTF_USE | NTF_EXT_LEARNED | \
+ NTF_STICKY | NTF_OFFLOADED)
+
int br_fdb_init(void);
void br_fdb_fini(void);
int br_fdb_hash_init(struct net_bridge *br);
void br_fdb_hash_fini(struct net_bridge *br);
-void br_fdb_flush(struct net_bridge *br);
+void br_fdb_flush(struct net_bridge *br,
+ const struct net_bridge_fdb_flush_desc *desc);
void br_fdb_find_delete_local(struct net_bridge *br,
const struct net_bridge_port *p,
const unsigned char *addr, u16 vid);
void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr);
void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr);
void br_fdb_cleanup(struct work_struct *work);
+int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack);
void br_fdb_delete_by_port(struct net_bridge *br,
const struct net_bridge_port *p, u16 vid, int do_all);
struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
@@ -544,27 +854,34 @@ struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
int br_fdb_test_addr(struct net_device *dev, unsigned char *addr);
int br_fdb_fillbuf(struct net_bridge *br, void *buf, unsigned long count,
unsigned long off);
-int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source,
- const unsigned char *addr, u16 vid);
+int br_fdb_add_local(struct net_bridge *br, struct net_bridge_port *source,
+ const unsigned char *addr, u16 vid);
void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
- const unsigned char *addr, u16 vid, bool added_by_user);
+ const unsigned char *addr, u16 vid, unsigned long flags);
int br_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
- struct net_device *dev, const unsigned char *addr, u16 vid);
+ struct net_device *dev, const unsigned char *addr, u16 vid,
+ bool *notified, struct netlink_ext_ack *extack);
+int br_fdb_delete_bulk(struct nlmsghdr *nlh, struct net_device *dev,
+ struct netlink_ext_ack *extack);
int br_fdb_add(struct ndmsg *nlh, struct nlattr *tb[], struct net_device *dev,
- const unsigned char *addr, u16 vid, u16 nlh_flags);
+ const unsigned char *addr, u16 vid, u16 nlh_flags,
+ bool *notified, struct netlink_ext_ack *extack);
int br_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
struct net_device *dev, struct net_device *fdev, int *idx);
+int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev,
+ const unsigned char *addr, u16 vid, u32 portid, u32 seq,
+ struct netlink_ext_ack *extack);
int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p);
void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p);
int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p,
const unsigned char *addr, u16 vid,
- bool swdev_notify);
+ bool locked, bool swdev_notify);
int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p,
const unsigned char *addr, u16 vid,
bool swdev_notify);
void br_fdb_offloaded_set(struct net_bridge *br, struct net_bridge_port *p,
- const unsigned char *addr, u16 vid);
+ const unsigned char *addr, u16 vid, bool offloaded);
/* br_forward.c */
enum br_pkt_type {
@@ -577,7 +894,8 @@ void br_forward(const struct net_bridge_port *to, struct sk_buff *skb,
bool local_rcv, bool local_orig);
int br_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
void br_flood(struct net_bridge *br, struct sk_buff *skb,
- enum br_pkt_type pkt_type, bool local_rcv, bool local_orig);
+ enum br_pkt_type pkt_type, bool local_rcv, bool local_orig,
+ u16 vid);
/* return true if both source port and dest port are isolated */
static inline bool br_skb_isolated(const struct net_bridge_port *to,
@@ -603,16 +921,26 @@ int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev);
/* br_input.c */
int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
-rx_handler_result_t br_handle_frame(struct sk_buff **pskb);
+rx_handler_func_t *br_get_rx_handler(const struct net_device *dev);
+
+struct br_frame_type {
+ __be16 type;
+ int (*frame_handler)(struct net_bridge_port *port,
+ struct sk_buff *skb);
+ struct hlist_node list;
+};
+
+void br_add_frame(struct net_bridge *br, struct br_frame_type *ft);
+void br_del_frame(struct net_bridge *br, struct br_frame_type *ft);
static inline bool br_rx_handler_check_rcu(const struct net_device *dev)
{
- return rcu_dereference(dev->rx_handler) == br_handle_frame;
+ return rcu_dereference(dev->rx_handler) == br_get_rx_handler(dev);
}
static inline bool br_rx_handler_check_rtnl(const struct net_device *dev)
{
- return rcu_dereference_rtnl(dev->rx_handler) == br_handle_frame;
+ return rcu_dereference_rtnl(dev->rx_handler) == br_get_rx_handler(dev);
}
static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev)
@@ -627,79 +955,214 @@ br_port_get_check_rtnl(const struct net_device *dev)
}
/* br_ioctl.c */
-int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd);
-int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd,
- void __user *arg);
+int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq,
+ void __user *data, int cmd);
+int br_ioctl_stub(struct net *net, unsigned int cmd, void __user *uarg);
/* br_multicast.c */
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
-extern unsigned int br_mdb_rehash_seq;
-int br_multicast_rcv(struct net_bridge *br, struct net_bridge_port *port,
+int br_multicast_rcv(struct net_bridge_mcast **brmctx,
+ struct net_bridge_mcast_port **pmctx,
+ struct net_bridge_vlan *vlan,
struct sk_buff *skb, u16 vid);
-struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
- struct sk_buff *skb, u16 vid);
+struct net_bridge_mdb_entry *
+br_mdb_entry_skb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb,
+ u16 vid);
int br_multicast_add_port(struct net_bridge_port *port);
void br_multicast_del_port(struct net_bridge_port *port);
void br_multicast_enable_port(struct net_bridge_port *port);
void br_multicast_disable_port(struct net_bridge_port *port);
void br_multicast_init(struct net_bridge *br);
+void br_multicast_join_snoopers(struct net_bridge *br);
+void br_multicast_leave_snoopers(struct net_bridge *br);
void br_multicast_open(struct net_bridge *br);
void br_multicast_stop(struct net_bridge *br);
void br_multicast_dev_del(struct net_bridge *br);
-void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
- struct sk_buff *skb, bool local_rcv, bool local_orig);
-int br_multicast_set_router(struct net_bridge *br, unsigned long val);
-int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val);
-int br_multicast_toggle(struct net_bridge *br, unsigned long val);
-int br_multicast_set_querier(struct net_bridge *br, unsigned long val);
-int br_multicast_set_hash_max(struct net_bridge *br, unsigned long val);
-int br_multicast_set_igmp_version(struct net_bridge *br, unsigned long val);
+void br_multicast_flood(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb,
+ struct net_bridge_mcast *brmctx,
+ bool local_rcv, bool local_orig);
+int br_multicast_set_router(struct net_bridge_mcast *brmctx, unsigned long val);
+int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx,
+ unsigned long val);
+int br_multicast_set_vlan_router(struct net_bridge_vlan *v, u8 mcast_router);
+int br_multicast_toggle(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack);
+int br_multicast_set_querier(struct net_bridge_mcast *brmctx, unsigned long val);
+int br_multicast_set_igmp_version(struct net_bridge_mcast *brmctx,
+ unsigned long val);
#if IS_ENABLED(CONFIG_IPV6)
-int br_multicast_set_mld_version(struct net_bridge *br, unsigned long val);
+int br_multicast_set_mld_version(struct net_bridge_mcast *brmctx,
+ unsigned long val);
#endif
struct net_bridge_mdb_entry *
-br_mdb_ip_get(struct net_bridge_mdb_htable *mdb, struct br_ip *dst);
+br_mdb_ip_get(struct net_bridge *br, struct br_ip *dst);
struct net_bridge_mdb_entry *
-br_multicast_new_group(struct net_bridge *br, struct net_bridge_port *port,
- struct br_ip *group);
-void br_multicast_free_pg(struct rcu_head *head);
+br_multicast_new_group(struct net_bridge *br, struct br_ip *group);
struct net_bridge_port_group *
-br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
+br_multicast_new_port_group(struct net_bridge_port *port,
+ const struct br_ip *group,
struct net_bridge_port_group __rcu *next,
- unsigned char flags, const unsigned char *src);
-void br_mdb_init(void);
-void br_mdb_uninit(void);
-void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
- struct br_ip *group, int type, u8 flags);
-void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
+ unsigned char flags, const unsigned char *src,
+ u8 filter_mode, u8 rt_protocol,
+ struct netlink_ext_ack *extack);
+void br_multicast_del_port_group(struct net_bridge_port_group *p);
+int br_mdb_hash_init(struct net_bridge *br);
+void br_mdb_hash_fini(struct net_bridge *br);
+void br_mdb_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg, int type);
+void br_mdb_flag_change_notify(struct net_device *dev, struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg);
+void br_rtr_notify(struct net_device *dev, struct net_bridge_mcast_port *pmctx,
int type);
-void br_multicast_count(struct net_bridge *br, const struct net_bridge_port *p,
+void br_multicast_del_pg(struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ struct net_bridge_port_group __rcu **pp);
+void br_multicast_count(struct net_bridge *br,
+ const struct net_bridge_port *p,
const struct sk_buff *skb, u8 type, u8 dir);
int br_multicast_init_stats(struct net_bridge *br);
void br_multicast_uninit_stats(struct net_bridge *br);
void br_multicast_get_stats(const struct net_bridge *br,
const struct net_bridge_port *p,
struct br_mcast_stats *dest);
+u32 br_multicast_ngroups_get(const struct net_bridge_mcast_port *pmctx);
+void br_multicast_ngroups_set_max(struct net_bridge_mcast_port *pmctx, u32 max);
+u32 br_multicast_ngroups_get_max(const struct net_bridge_mcast_port *pmctx);
+int br_mdb_add(struct net_device *dev, struct nlattr *tb[], u16 nlmsg_flags,
+ struct netlink_ext_ack *extack);
+int br_mdb_del(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack);
+int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack);
+int br_mdb_dump(struct net_device *dev, struct sk_buff *skb,
+ struct netlink_callback *cb);
+int br_mdb_get(struct net_device *dev, struct nlattr *tb[], u32 portid, u32 seq,
+ struct netlink_ext_ack *extack);
+void br_multicast_host_join(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_mdb_entry *mp, bool notify);
+void br_multicast_host_leave(struct net_bridge_mdb_entry *mp, bool notify);
+void br_multicast_star_g_handle_mode(struct net_bridge_port_group *pg,
+ u8 filter_mode);
+void br_multicast_sg_add_exclude_ports(struct net_bridge_mdb_entry *star_mp,
+ struct net_bridge_port_group *sg);
+struct net_bridge_group_src *
+br_multicast_find_group_src(struct net_bridge_port_group *pg, struct br_ip *ip);
+struct net_bridge_group_src *
+br_multicast_new_group_src(struct net_bridge_port_group *pg,
+ struct br_ip *src_ip);
+void __br_multicast_del_group_src(struct net_bridge_group_src *src);
+void br_multicast_del_group_src(struct net_bridge_group_src *src,
+ bool fastleave);
+void br_multicast_ctx_init(struct net_bridge *br,
+ struct net_bridge_vlan *vlan,
+ struct net_bridge_mcast *brmctx);
+void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx);
+void br_multicast_port_ctx_init(struct net_bridge_port *port,
+ struct net_bridge_vlan *vlan,
+ struct net_bridge_mcast_port *pmctx);
+void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx);
+void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v, u8 state);
+void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan, bool on);
+int br_multicast_toggle_vlan_snooping(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack);
+bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan, bool on);
+
+int br_rports_fill_info(struct sk_buff *skb,
+ const struct net_bridge_mcast *brmctx);
+int br_multicast_dump_querier_state(struct sk_buff *skb,
+ const struct net_bridge_mcast *brmctx,
+ int nest_attr);
+size_t br_multicast_querier_state_size(void);
+size_t br_rports_size(const struct net_bridge_mcast *brmctx);
+void br_multicast_set_query_intvl(struct net_bridge_mcast *brmctx,
+ unsigned long val);
+void br_multicast_set_startup_query_intvl(struct net_bridge_mcast *brmctx,
+ unsigned long val);
+
+static inline bool br_group_is_l2(const struct br_ip *group) /* true for a group entry with no L3 protocol set */
+{
+ return group->proto == 0; /* IP entries in this header carry htons(ETH_P_IP)/htons(ETH_P_IPV6) in proto, so 0 is the non-IP case */
+}
#define mlock_dereference(X, br) \
rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock))
-static inline bool br_multicast_is_router(struct net_bridge *br)
+static inline struct hlist_node * /* first node of the router-port list selected by skb->protocol */
+br_multicast_get_first_rport_node(struct net_bridge_mcast *brmctx,
+ struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (skb->protocol == htons(ETH_P_IPV6)) /* IPv6 frames use the IPv6 router list */
+ return rcu_dereference(hlist_first_rcu(&brmctx->ip6_mc_router_list));
+#endif
+ return rcu_dereference(hlist_first_rcu(&brmctx->ip4_mc_router_list)); /* every other protocol uses the IPv4 list; NOTE(review): rcu_dereference() suggests an RCU read-side caller - confirm */
+}
+
+static inline struct net_bridge_port * /* maps a router-list node back to its bridge port, or NULL */
+br_multicast_rport_from_node_skb(struct hlist_node *rp, struct sk_buff *skb)
+{
+ struct net_bridge_mcast_port *mctx; /* port multicast context that embeds the list node */
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (skb->protocol == htons(ETH_P_IPV6)) /* IPv6 frame: rp is taken to be linked through ip6_rlist */
+ mctx = hlist_entry_safe(rp, struct net_bridge_mcast_port,
+ ip6_rlist);
+ else /* binds to the ip4_rlist assignment that follows the #endif */
+#endif
+ mctx = hlist_entry_safe(rp, struct net_bridge_mcast_port,
+ ip4_rlist);
+
+ if (mctx) /* NULL-checked before use; presumably hlist_entry_safe() yields NULL for a NULL rp - confirm */
+ return mctx->port;
+ else
+ return NULL;
+}
+
+static inline bool br_ip4_multicast_is_router(struct net_bridge_mcast *brmctx) /* IPv4 router state of this multicast context */
+{
+ return timer_pending(&brmctx->ip4_mc_router_timer); /* true while the IPv4 router timer is pending */
+}
+
+static inline bool br_ip6_multicast_is_router(struct net_bridge_mcast *brmctx) /* IPv6 router state of this multicast context */
{
- return br->multicast_router == 2 ||
- (br->multicast_router == 1 &&
- timer_pending(&br->multicast_router_timer));
+#if IS_ENABLED(CONFIG_IPV6)
+ return timer_pending(&brmctx->ip6_mc_router_timer); /* true while the IPv6 router timer is pending */
+#else
+ return false; /* built without CONFIG_IPV6: never reports an IPv6 router */
+#endif
}
static inline bool
-__br_multicast_querier_exists(struct net_bridge *br,
- struct bridge_mcast_other_query *querier,
- const bool is_ipv6)
+br_multicast_is_router(struct net_bridge_mcast *brmctx, struct sk_buff *skb) /* skb may be NULL (tested below) */
+{
+ switch (brmctx->multicast_router) {
+ case MDB_RTR_TYPE_PERM: /* permanent router setting: always true */
+ return true;
+ case MDB_RTR_TYPE_TEMP_QUERY: /* result depends on the per-family router timers */
+ if (skb) {
+ if (skb->protocol == htons(ETH_P_IP))
+ return br_ip4_multicast_is_router(brmctx);
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ return br_ip6_multicast_is_router(brmctx);
+ } else { /* no skb given: either family's pending timer is enough */
+ return br_ip4_multicast_is_router(brmctx) ||
+ br_ip6_multicast_is_router(brmctx);
+ }
+ fallthrough; /* only reached with an skb that is neither IPv4 nor IPv6 */
+ default:
+ return false;
+ }
+}
+
+static inline bool
+__br_multicast_querier_exists(struct net_bridge_mcast *brmctx,
+ struct bridge_mcast_other_query *querier,
+ const bool is_ipv6)
{
bool own_querier_enabled;
- if (br->multicast_querier) {
- if (is_ipv6 && !br->has_ipv6_addr)
+ if (brmctx->multicast_querier) {
+ if (is_ipv6 && !br_opt_get(brmctx->br, BROPT_HAS_IPV6_ADDR))
own_querier_enabled = false;
else
own_querier_enabled = true;
@@ -707,21 +1170,52 @@ __br_multicast_querier_exists(struct net_bridge *br,
own_querier_enabled = false;
}
- return time_is_before_jiffies(querier->delay_time) &&
+ return !timer_pending(&querier->delay_timer) &&
(own_querier_enabled || timer_pending(&querier->timer));
}
-static inline bool br_multicast_querier_exists(struct net_bridge *br,
- struct ethhdr *eth)
+static inline bool br_multicast_querier_exists(struct net_bridge_mcast *brmctx, /* result is chosen by eth->h_proto */
+ struct ethhdr *eth,
+ const struct net_bridge_mdb_entry *mdb) /* NULL is tolerated; only read in the default case */
{
switch (eth->h_proto) {
case (htons(ETH_P_IP)):
- return __br_multicast_querier_exists(br,
- &br->ip4_other_query, false);
+ return __br_multicast_querier_exists(brmctx,
+ &brmctx->ip4_other_query, false); /* IPv4 other-querier state, is_ipv6 = false */
#if IS_ENABLED(CONFIG_IPV6)
case (htons(ETH_P_IPV6)):
- return __br_multicast_querier_exists(br,
- &br->ip6_other_query, true);
+ return __br_multicast_querier_exists(brmctx,
+ &brmctx->ip6_other_query, true); /* IPv6 other-querier state, is_ipv6 = true */
+#endif
+ default:
+ return !!mdb && br_group_is_l2(&mdb->addr); /* other ethertypes: true only for a non-NULL mdb entry whose address is an L2 group */
+ }
+}
+
+static inline bool br_multicast_is_star_g(const struct br_ip *ip) /* decides from the entry's source address, per protocol */
+{
+ switch (ip->proto) {
+ case htons(ETH_P_IP):
+ return ipv4_is_zeronet(ip->src.ip4); /* IPv4: result of the zeronet test on the source address */
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ return ipv6_addr_any(&ip->src.ip6); /* IPv6: result of the "any" address test on the source */
+#endif
+ default:
+ return false; /* any other proto value is never reported as (*,G) */
+ }
+}
+
+static inline bool
+br_multicast_should_handle_mode(const struct net_bridge_mcast *brmctx,
+ __be16 proto)
+{
+ switch (proto) {
+ case htons(ETH_P_IP):
+ return !!(brmctx->multicast_igmp_version == 3);
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ return !!(brmctx->multicast_mld_version == 2);
#endif
default:
return false;
@@ -732,17 +1226,162 @@ static inline int br_multicast_igmp_type(const struct sk_buff *skb)
{
return BR_INPUT_SKB_CB(skb)->igmp;
}
+
+static inline unsigned long br_multicast_lmqt(const struct net_bridge_mcast *brmctx) /* presumably "last member query time" - confirm */
+{
+ return brmctx->multicast_last_member_interval * /* product of the two last-member settings of this context */
+ brmctx->multicast_last_member_count;
+}
+
+static inline unsigned long br_multicast_gmi(const struct net_bridge_mcast *brmctx) /* presumably "group membership interval" - confirm */
+{
+ return brmctx->multicast_membership_interval; /* plain accessor, value returned unchanged */
+}
+
+static inline bool /* true when the bridge multicast context has a vlan attached */
+br_multicast_ctx_is_vlan(const struct net_bridge_mcast *brmctx)
+{
+ return !!brmctx->vlan; /* non-NULL vlan pointer marks a per-vlan context */
+}
+
+static inline bool /* true when the port multicast context has a vlan attached */
+br_multicast_port_ctx_is_vlan(const struct net_bridge_mcast_port *pmctx)
+{
+ return !!pmctx->vlan; /* non-NULL vlan pointer marks a per-vlan port context */
+}
+
+static inline struct net_bridge_mcast * /* bridge-level context that corresponds to a port context */
+br_multicast_port_ctx_get_global(const struct net_bridge_mcast_port *pmctx)
+{
+ if (!br_multicast_port_ctx_is_vlan(pmctx)) /* no vlan: use the bridge's own multicast_ctx */
+ return &pmctx->port->br->multicast_ctx;
+ else /* per-vlan: use br_mcast_ctx of the vlan reached through ->brvlan */
+ return &pmctx->vlan->brvlan->br_mcast_ctx;
+}
+
+static inline bool /* only ever true for per-vlan contexts */
+br_multicast_ctx_vlan_global_disabled(const struct net_bridge_mcast *brmctx)
+{
+ return br_multicast_ctx_is_vlan(brmctx) && /* a non-vlan context short-circuits to false */
+ (!br_opt_get(brmctx->br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) || /* bridge option for vlan snooping is off, or ... */
+ !(brmctx->vlan->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)); /* ... the vlan lacks the global mcast-enabled flag */
+}
+
+static inline bool /* only ever true for per-vlan contexts */
+br_multicast_ctx_vlan_disabled(const struct net_bridge_mcast *brmctx)
+{
+ return br_multicast_ctx_is_vlan(brmctx) && /* guards the brmctx->vlan dereference below */
+ !(brmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED); /* vlan lacks the mcast-enabled private flag */
+}
+
+static inline bool /* port-context variant of the per-vlan mcast-enabled check */
+br_multicast_port_ctx_vlan_disabled(const struct net_bridge_mcast_port *pmctx)
+{
+ return br_multicast_port_ctx_is_vlan(pmctx) && /* guards the pmctx->vlan dereference below */
+ !(pmctx->vlan->priv_flags & BR_VLFLAG_MCAST_ENABLED); /* vlan lacks the mcast-enabled private flag */
+}
+
+static inline bool /* true if the port, or its vlan when there is one, counts as disabled */
+br_multicast_port_ctx_state_disabled(const struct net_bridge_mcast_port *pmctx)
+{
+ return pmctx->port->state == BR_STATE_DISABLED || /* port state is checked first, for every context */
+ (br_multicast_port_ctx_is_vlan(pmctx) && /* the remaining tests apply to per-vlan contexts only */
+ (br_multicast_port_ctx_vlan_disabled(pmctx) || /* vlan has multicast switched off, or ... */
+ pmctx->vlan->state == BR_STATE_DISABLED)); /* ... the vlan state itself is disabled */
+}
+
+static inline bool /* superset of the disabled check: also true for blocking states */
+br_multicast_port_ctx_state_stopped(const struct net_bridge_mcast_port *pmctx)
+{
+ return br_multicast_port_ctx_state_disabled(pmctx) || /* every disabled condition counts as stopped */
+ pmctx->port->state == BR_STATE_BLOCKING || /* port is blocking */
+ (br_multicast_port_ctx_is_vlan(pmctx) && /* vlan state is only read for per-vlan contexts */
+ pmctx->vlan->state == BR_STATE_BLOCKING);
+}
+
+static inline bool /* true when at least one router-port list of the context is non-empty */
+br_rports_have_mc_router(const struct net_bridge_mcast *brmctx)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ return !hlist_empty(&brmctx->ip4_mc_router_list) || /* with IPv6 built in, either list counts */
+ !hlist_empty(&brmctx->ip6_mc_router_list);
#else
-static inline int br_multicast_rcv(struct net_bridge *br,
- struct net_bridge_port *port,
+ return !hlist_empty(&brmctx->ip4_mc_router_list); /* without IPv6 only the IPv4 list is consulted */
+#endif
+}
+
+static inline bool /* field-by-field comparison of two contexts' multicast settings */
+br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
+ const struct net_bridge_mcast *brmctx2)
+{
+ return brmctx1->multicast_igmp_version == /* every listed option must match for a true result */
+ brmctx2->multicast_igmp_version &&
+ brmctx1->multicast_last_member_count ==
+ brmctx2->multicast_last_member_count &&
+ brmctx1->multicast_startup_query_count ==
+ brmctx2->multicast_startup_query_count &&
+ brmctx1->multicast_last_member_interval ==
+ brmctx2->multicast_last_member_interval &&
+ brmctx1->multicast_membership_interval ==
+ brmctx2->multicast_membership_interval &&
+ brmctx1->multicast_querier_interval ==
+ brmctx2->multicast_querier_interval &&
+ brmctx1->multicast_query_interval ==
+ brmctx2->multicast_query_interval &&
+ brmctx1->multicast_query_response_interval ==
+ brmctx2->multicast_query_response_interval &&
+ brmctx1->multicast_startup_query_interval ==
+ brmctx2->multicast_startup_query_interval &&
+ brmctx1->multicast_querier == brmctx2->multicast_querier &&
+ brmctx1->multicast_router == brmctx2->multicast_router &&
+ !br_rports_have_mc_router(brmctx1) && /* a context with any router port never compares equal, even to itself */
+ !br_rports_have_mc_router(brmctx2) &&
+#if IS_ENABLED(CONFIG_IPV6)
+ brmctx1->multicast_mld_version == /* MLD version is only compared when IPv6 is built in */
+ brmctx2->multicast_mld_version &&
+#endif
+ true; /* terminator so the && chain stays valid whether or not the IPv6 term is compiled */
+}
+
+static inline bool /* true when the context kind agrees with the bridge's vlan-snooping option */
+br_multicast_ctx_matches_vlan_snooping(const struct net_bridge_mcast *brmctx)
+{
+ bool vlan_snooping_enabled; /* bridge option normalised to a bool */
+
+ vlan_snooping_enabled = !!br_opt_get(brmctx->br,
+ BROPT_MCAST_VLAN_SNOOPING_ENABLED);
+
+ return !!(vlan_snooping_enabled == br_multicast_ctx_is_vlan(brmctx)); /* option on <=> per-vlan context; option off <=> non-vlan context */
+}
+
+static inline void /* records an offload result in the port group's flags */
+br_multicast_set_pg_offload_flags(struct net_bridge_port_group *p,
+ bool offloaded)
+{
+ p->flags &= ~(MDB_PG_FLAGS_OFFLOAD | MDB_PG_FLAGS_OFFLOAD_FAILED); /* clear both bits first ... */
+ p->flags |= (offloaded ? MDB_PG_FLAGS_OFFLOAD : /* ... then set exactly one of them */
+ MDB_PG_FLAGS_OFFLOAD_FAILED);
+}
+
+static inline bool /* gate for notifications about the offload-failed flag */
+br_mdb_should_notify(const struct net_bridge *br, u8 changed_flags)
+{
+ return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION) && /* bridge option must be set ... */
+ (changed_flags & MDB_PG_FLAGS_OFFLOAD_FAILED); /* ... and the offload-failed bit must be among the changed flags */
+}
+#else
+static inline int br_multicast_rcv(struct net_bridge_mcast **brmctx,
+ struct net_bridge_mcast_port **pmctx,
+ struct net_bridge_vlan *vlan,
struct sk_buff *skb,
u16 vid)
{
return 0;
}
-static inline struct net_bridge_mdb_entry *br_mdb_get(struct net_bridge *br,
- struct sk_buff *skb, u16 vid)
+static inline struct net_bridge_mdb_entry *
+br_mdb_entry_skb_get(struct net_bridge_mcast *brmctx, struct sk_buff *skb,
+ u16 vid)
{
return NULL;
}
@@ -768,6 +1407,14 @@ static inline void br_multicast_init(struct net_bridge *br)
{
}
+static inline void br_multicast_join_snoopers(struct net_bridge *br)
+{
+}
+
+static inline void br_multicast_leave_snoopers(struct net_bridge *br)
+{
+}
+
static inline void br_multicast_open(struct net_bridge *br)
{
}
@@ -782,26 +1429,61 @@ static inline void br_multicast_dev_del(struct net_bridge *br)
static inline void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
struct sk_buff *skb,
+ struct net_bridge_mcast *brmctx,
bool local_rcv, bool local_orig)
{
}
-static inline bool br_multicast_is_router(struct net_bridge *br)
+static inline bool br_multicast_is_router(struct net_bridge_mcast *brmctx,
+ struct sk_buff *skb)
{
return false;
}
-static inline bool br_multicast_querier_exists(struct net_bridge *br,
- struct ethhdr *eth)
+static inline bool br_multicast_querier_exists(struct net_bridge_mcast *brmctx,
+ struct ethhdr *eth,
+ const struct net_bridge_mdb_entry *mdb)
{
return false;
}
-static inline void br_mdb_init(void)
+static inline int br_mdb_add(struct net_device *dev, struct nlattr *tb[],
+ u16 nlmsg_flags, struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_mdb_del(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
{
+ return -EOPNOTSUPP;
}
-static inline void br_mdb_uninit(void)
+static inline int br_mdb_del_bulk(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_mdb_dump(struct net_device *dev, struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return 0;
+}
+
+static inline int br_mdb_get(struct net_device *dev, struct nlattr *tb[],
+ u32 portid, u32 seq,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_mdb_hash_init(struct net_bridge *br)
+{
+ return 0;
+}
+
+static inline void br_mdb_hash_fini(struct net_bridge *br)
{
}
@@ -825,13 +1507,64 @@ static inline int br_multicast_igmp_type(const struct sk_buff *skb)
{
return 0;
}
+
+static inline void br_multicast_ctx_init(struct net_bridge *br,
+ struct net_bridge_vlan *vlan,
+ struct net_bridge_mcast *brmctx)
+{
+}
+
+static inline void br_multicast_ctx_deinit(struct net_bridge_mcast *brmctx)
+{
+}
+
+static inline void br_multicast_port_ctx_init(struct net_bridge_port *port,
+ struct net_bridge_vlan *vlan,
+ struct net_bridge_mcast_port *pmctx)
+{
+}
+
+static inline void br_multicast_port_ctx_deinit(struct net_bridge_mcast_port *pmctx)
+{
+}
+
+static inline void br_multicast_update_vlan_mcast_ctx(struct net_bridge_vlan *v,
+ u8 state)
+{
+}
+
+static inline void br_multicast_toggle_one_vlan(struct net_bridge_vlan *vlan,
+ bool on)
+{
+}
+
+static inline int br_multicast_toggle_vlan_snooping(struct net_bridge *br,
+ bool on,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool br_multicast_toggle_global_vlan(struct net_bridge_vlan *vlan,
+ bool on)
+{
+ return false;
+}
+
+static inline bool
+br_multicast_ctx_options_equal(const struct net_bridge_mcast *brmctx1,
+ const struct net_bridge_mcast *brmctx2)
+{
+ return true;
+}
#endif
/* br_vlan.c */
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
bool br_allowed_ingress(const struct net_bridge *br,
struct net_bridge_vlan_group *vg, struct sk_buff *skb,
- u16 *vid);
+ u16 *vid, u8 *state,
+ struct net_bridge_vlan **vlan);
bool br_allowed_egress(struct net_bridge_vlan_group *vg,
const struct sk_buff *skb);
bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid);
@@ -840,27 +1573,53 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
struct net_bridge_vlan_group *vg,
struct sk_buff *skb);
int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags,
- bool *changed);
+ bool *changed, struct netlink_ext_ack *extack);
int br_vlan_delete(struct net_bridge *br, u16 vid);
void br_vlan_flush(struct net_bridge *br);
struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid);
void br_recalculate_fwd_mask(struct net_bridge *br);
-int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val);
-int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val);
-int __br_vlan_set_proto(struct net_bridge *br, __be16 proto);
-int br_vlan_set_proto(struct net_bridge *br, unsigned long val);
+int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack);
+int __br_vlan_set_proto(struct net_bridge *br, __be16 proto,
+ struct netlink_ext_ack *extack);
+int br_vlan_set_proto(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack);
int br_vlan_set_stats(struct net_bridge *br, unsigned long val);
+int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val);
int br_vlan_init(struct net_bridge *br);
-int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val);
-int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid);
+int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack);
+int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid,
+ struct netlink_ext_ack *extack);
int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
- bool *changed);
+ bool *changed, struct netlink_ext_ack *extack);
int nbp_vlan_delete(struct net_bridge_port *port, u16 vid);
void nbp_vlan_flush(struct net_bridge_port *port);
-int nbp_vlan_init(struct net_bridge_port *port);
+int nbp_vlan_init(struct net_bridge_port *port, struct netlink_ext_ack *extack);
int nbp_get_num_vlan_infos(struct net_bridge_port *p, u32 filter_mask);
void br_vlan_get_stats(const struct net_bridge_vlan *v,
- struct br_vlan_stats *stats);
+ struct pcpu_sw_netstats *stats);
+void br_vlan_port_event(struct net_bridge_port *p, unsigned long event);
+int br_vlan_bridge_event(struct net_device *dev, unsigned long event,
+ void *ptr);
+void br_vlan_vlan_upper_event(struct net_device *br_dev,
+ struct net_device *vlan_dev,
+ unsigned long event);
+int br_vlan_rtnl_init(void);
+void br_vlan_rtnl_uninit(void);
+void br_vlan_notify(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ u16 vid, u16 vid_range,
+ int cmd);
+bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end);
+
+void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
+ struct net_device_path_ctx *ctx,
+ struct net_device_path *path);
+int br_vlan_fill_forward_path_mode(struct net_bridge *br,
+ struct net_bridge_port *dst,
+ struct net_device_path *path);
static inline struct net_bridge_vlan_group *br_vlan_group(
const struct net_bridge *br)
@@ -894,7 +1653,7 @@ static inline int br_vlan_get_tag(const struct sk_buff *skb, u16 *vid)
int err = 0;
if (skb_vlan_tag_present(skb)) {
- *vid = skb_vlan_tag_get(skb) & VLAN_VID_MASK;
+ *vid = skb_vlan_tag_get_id(skb);
} else {
*vid = 0;
err = -EINVAL;
@@ -912,12 +1671,19 @@ static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg)
return vg->pvid;
}
+static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid) /* returns a flags value; v->flags itself is not modified */
+{
+ return v->vid == pvid ? v->flags | BRIDGE_VLAN_INFO_PVID : v->flags; /* PVID bit is OR-ed in only when this vlan's vid equals pvid */
+}
#else
static inline bool br_allowed_ingress(const struct net_bridge *br,
struct net_bridge_vlan_group *vg,
struct sk_buff *skb,
- u16 *vid)
+ u16 *vid, u8 *state,
+ struct net_bridge_vlan **vlan)
+
{
+ *vlan = NULL;
return true;
}
@@ -942,7 +1708,7 @@ static inline struct sk_buff *br_handle_vlan(struct net_bridge *br,
}
static inline int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags,
- bool *changed)
+ bool *changed, struct netlink_ext_ack *extack)
{
*changed = false;
return -EOPNOTSUPP;
@@ -967,7 +1733,7 @@ static inline int br_vlan_init(struct net_bridge *br)
}
static inline int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
- bool *changed)
+ bool *changed, struct netlink_ext_ack *extack)
{
*changed = false;
return -EOPNOTSUPP;
@@ -988,7 +1754,8 @@ static inline struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group
return NULL;
}
-static inline int nbp_vlan_init(struct net_bridge_port *port)
+static inline int nbp_vlan_init(struct net_bridge_port *port,
+ struct netlink_ext_ack *extack)
{
return 0;
}
@@ -1003,8 +1770,9 @@ static inline u16 br_get_pvid(const struct net_bridge_vlan_group *vg)
return 0;
}
-static inline int __br_vlan_filter_toggle(struct net_bridge *br,
- unsigned long val)
+static inline int br_vlan_filter_toggle(struct net_bridge *br,
+ unsigned long val,
+ struct netlink_ext_ack *extack)
{
return -EOPNOTSUPP;
}
@@ -1015,6 +1783,19 @@ static inline int nbp_get_num_vlan_infos(struct net_bridge_port *p,
return 0;
}
+static inline void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
+ struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+}
+
+static inline int br_vlan_fill_forward_path_mode(struct net_bridge *br,
+ struct net_bridge_port *dst,
+ struct net_device_path *path)
+{
+ return 0;
+}
+
static inline struct net_bridge_vlan_group *br_vlan_group(
const struct net_bridge *br)
{
@@ -1040,7 +1821,177 @@ static inline struct net_bridge_vlan_group *nbp_vlan_group_rcu(
}
static inline void br_vlan_get_stats(const struct net_bridge_vlan *v,
- struct br_vlan_stats *stats)
+ struct pcpu_sw_netstats *stats)
+{
+}
+
+static inline void br_vlan_port_event(struct net_bridge_port *p,
+ unsigned long event)
+{
+}
+
+static inline int br_vlan_bridge_event(struct net_device *dev,
+ unsigned long event, void *ptr)
+{
+ return 0;
+}
+
+static inline void br_vlan_vlan_upper_event(struct net_device *br_dev,
+ struct net_device *vlan_dev,
+ unsigned long event)
+{
+}
+
+static inline int br_vlan_rtnl_init(void)
+{
+ return 0;
+}
+
+static inline void br_vlan_rtnl_uninit(void)
+{
+}
+
+static inline void br_vlan_notify(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ u16 vid, u16 vid_range,
+ int cmd)
+{
+}
+
+static inline bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end)
+{
+ return true;
+}
+
+static inline u16 br_vlan_flags(const struct net_bridge_vlan *v, u16 pvid)
+{
+ return 0;
+}
+
+#endif
+
+/* br_vlan_options.c */
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end);
+bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v,
+ const struct net_bridge_port *p);
+size_t br_vlan_opts_nl_size(void);
+int br_vlan_process_options(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct net_bridge_vlan *range_start,
+ struct net_bridge_vlan *range_end,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack);
+int br_vlan_rtm_process_global_options(struct net_device *dev,
+ const struct nlattr *attr,
+ int cmd,
+ struct netlink_ext_ack *extack);
+bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *r_end);
+bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range,
+ const struct net_bridge_vlan *v_opts);
+
+/* vlan state manipulation helpers using *_ONCE to annotate lock-free access,
+ * while br_vlan_set_state() may access data protected by multicast_lock.
+ */
+static inline u8 br_vlan_get_state(const struct net_bridge_vlan *v) /* reader side of the vlan state accessors */
+{
+ return READ_ONCE(v->state); /* READ_ONCE pairs with the WRITE_ONCE in br_vlan_set_state() */
+}
+
+static inline void br_vlan_set_state(struct net_bridge_vlan *v, u8 state) /* writer side; also informs the multicast code */
+{
+ WRITE_ONCE(v->state, state); /* store the new state first */
+ br_multicast_update_vlan_mcast_ctx(v, state); /* then pass the vlan and its new state to the multicast context update */
+}
+
+static inline u8 br_vlan_get_pvid_state(const struct net_bridge_vlan_group *vg) /* reader for the group's pvid_state field */
+{
+ return READ_ONCE(vg->pvid_state); /* READ_ONCE pairs with the WRITE_ONCE in br_vlan_set_pvid_state() */
+}
+
+static inline void br_vlan_set_pvid_state(struct net_bridge_vlan_group *vg, /* writer for the group's pvid_state field */
+ u8 state)
+{
+ WRITE_ONCE(vg->pvid_state, state); /* single annotated store; nothing else is updated here */
+}
+
+/* learn_allow is true at ingress and false at egress */
+static inline bool br_vlan_state_allowed(u8 state, bool learn_allow) /* maps a vlan state to an allow/deny decision */
+{
+ switch (state) {
+ case BR_STATE_LEARNING:
+ return learn_allow; /* learning: allowed only when the caller passes learn_allow = true */
+ case BR_STATE_FORWARDING:
+ return true; /* forwarding: always allowed */
+ default:
+ return false; /* every other state is denied */
+ }
+}
+#endif
+
+/* br_mst.c */
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+DECLARE_STATIC_KEY_FALSE(br_mst_used);
+static inline bool br_mst_is_enabled(const struct net_bridge_port *p) /* all three conditions below must hold */
+{
+ /* check the port's vlan group to avoid racing with port deletion */
+ return static_branch_unlikely(&br_mst_used) && /* static key br_mst_used, declared with a false default */
+ br_opt_get(p->br, BROPT_MST_ENABLED) && /* MST option set on the port's bridge */
+ rcu_access_pointer(p->vlgrp); /* port still has a non-NULL vlan group pointer */
+}
+
+int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state,
+ struct netlink_ext_ack *extack);
+int br_mst_vlan_set_msti(struct net_bridge_vlan *v, u16 msti);
+void br_mst_vlan_init_state(struct net_bridge_vlan *v);
+int br_mst_set_enabled(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack);
+size_t br_mst_info_size(const struct net_bridge_vlan_group *vg);
+int br_mst_fill_info(struct sk_buff *skb,
+ const struct net_bridge_vlan_group *vg);
+int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr,
+ struct netlink_ext_ack *extack);
+void br_mst_uninit(struct net_bridge *br);
+#else
+static inline bool br_mst_is_enabled(const struct net_bridge_port *p)
+{
+ return false;
+}
+
+static inline int br_mst_set_state(struct net_bridge_port *p, u16 msti,
+ u8 state, struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_mst_set_enabled(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline size_t br_mst_info_size(const struct net_bridge_vlan_group *vg)
+{
+ return 0;
+}
+
+static inline int br_mst_fill_info(struct sk_buff *skb,
+ const struct net_bridge_vlan_group *vg)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_mst_process(struct net_bridge_port *p,
+ const struct nlattr *mst_attr,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void br_mst_uninit(struct net_bridge *br)
{
}
#endif
@@ -1078,7 +2029,8 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time);
/* br_stp_if.c */
void br_stp_enable_bridge(struct net_bridge *br);
void br_stp_disable_bridge(struct net_bridge *br);
-void br_stp_set_enabled(struct net_bridge *br, unsigned long val);
+int br_stp_set_enabled(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack);
void br_stp_enable_port(struct net_bridge_port *p);
void br_stp_disable_port(struct net_bridge_port *p);
bool br_stp_recalculate_bridge_id(struct net_bridge *br);
@@ -1103,16 +2055,112 @@ unsigned long br_timer_value(const struct timer_list *timer);
extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr);
#endif
+/* br_mrp.c */
+#if IS_ENABLED(CONFIG_BRIDGE_MRP)
+int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p,
+ struct nlattr *attr, int cmd, struct netlink_ext_ack *extack);
+bool br_mrp_enabled(struct net_bridge *br);
+void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p);
+int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br);
+#else
+static inline int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p,
+ struct nlattr *attr, int cmd,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool br_mrp_enabled(struct net_bridge *br)
+{
+ return false;
+}
+
+static inline void br_mrp_port_del(struct net_bridge *br,
+ struct net_bridge_port *p)
+{
+}
+
+static inline int br_mrp_fill_info(struct sk_buff *skb, struct net_bridge *br)
+{
+ return 0;
+}
+
+#endif
+
+/* br_cfm.c */
+#if IS_ENABLED(CONFIG_BRIDGE_CFM)
+int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p,
+ struct nlattr *attr, int cmd, struct netlink_ext_ack *extack);
+bool br_cfm_created(struct net_bridge *br);
+void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *p);
+int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br);
+int br_cfm_status_fill_info(struct sk_buff *skb,
+ struct net_bridge *br,
+ bool getlink);
+int br_cfm_mep_count(struct net_bridge *br, u32 *count);
+int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count);
+#else
+static inline int br_cfm_parse(struct net_bridge *br, struct net_bridge_port *p,
+ struct nlattr *attr, int cmd,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool br_cfm_created(struct net_bridge *br)
+{
+ return false;
+}
+
+static inline void br_cfm_port_del(struct net_bridge *br,
+ struct net_bridge_port *p)
+{
+}
+
+static inline int br_cfm_config_fill_info(struct sk_buff *skb, struct net_bridge *br)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_cfm_status_fill_info(struct sk_buff *skb,
+ struct net_bridge *br,
+ bool getlink)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int br_cfm_mep_count(struct net_bridge *br, u32 *count)
+{
+ *count = 0;
+ return -EOPNOTSUPP;
+}
+
+static inline int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count)
+{
+ *count = 0;
+ return -EOPNOTSUPP;
+}
+#endif
+
/* br_netlink.c */
extern struct rtnl_link_ops br_link_ops;
int br_netlink_init(void);
void br_netlink_fini(void);
void br_ifinfo_notify(int event, const struct net_bridge *br,
const struct net_bridge_port *port);
-int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
+void br_info_notify(int event, const struct net_bridge *br,
+ const struct net_bridge_port *port, u32 filter);
+int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags,
+ struct netlink_ext_ack *extack);
int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev,
u32 filter_mask, int nlflags);
+int br_process_vlan_info(struct net_bridge *br,
+ struct net_bridge_port *p, int cmd,
+ struct bridge_vlan_info *vinfo_curr,
+ struct bridge_vlan_info **vinfo_last,
+ bool *changed,
+ struct netlink_ext_ack *extack);
#ifdef CONFIG_SYSFS
/* br_sysfs_if.c */
@@ -1134,27 +2182,102 @@ static inline void br_sysfs_delbr(struct net_device *dev) { return; }
/* br_switchdev.c */
#ifdef CONFIG_NET_SWITCHDEV
-int nbp_switchdev_mark_set(struct net_bridge_port *p);
+int br_switchdev_port_offload(struct net_bridge_port *p,
+ struct net_device *dev, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb,
+ bool tx_fwd_offload,
+ struct netlink_ext_ack *extack);
+
+void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb);
+
+int br_switchdev_port_replay(struct net_bridge_port *p,
+ struct net_device *dev, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb,
+ struct netlink_ext_ack *extack);
+
+bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb);
+
+void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb);
+
+void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p,
+ struct sk_buff *skb);
+void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p,
+ struct sk_buff *skb);
void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
struct sk_buff *skb);
bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
const struct sk_buff *skb);
int br_switchdev_set_port_flag(struct net_bridge_port *p,
unsigned long flags,
- unsigned long mask);
-void br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb,
+ unsigned long mask,
+ struct netlink_ext_ack *extack);
+void br_switchdev_fdb_notify(struct net_bridge *br,
+ const struct net_bridge_fdb_entry *fdb, int type);
+void br_switchdev_mdb_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
int type);
-int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags);
+int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags,
+ bool changed, struct netlink_ext_ack *extack);
int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid);
+void br_switchdev_init(struct net_bridge *br);
static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
{
skb->offload_fwd_mark = 0;
}
#else
-static inline int nbp_switchdev_mark_set(struct net_bridge_port *p)
+static inline int
+br_switchdev_port_offload(struct net_bridge_port *p,
+ struct net_device *dev, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb,
+ bool tx_fwd_offload,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline void
+br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb)
+{
+}
+
+static inline int
+br_switchdev_port_replay(struct net_bridge_port *p,
+ struct net_device *dev, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb,
+ struct netlink_ext_ack *extack)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb)
+{
+ return false;
+}
+
+static inline void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb)
+{
+}
+
+static inline void
+nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+}
+
+static inline void
+nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p,
+ struct sk_buff *skb)
{
- return 0;
}
static inline void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
@@ -1170,13 +2293,15 @@ static inline bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
static inline int br_switchdev_set_port_flag(struct net_bridge_port *p,
unsigned long flags,
- unsigned long mask)
+ unsigned long mask,
+ struct netlink_ext_ack *extack)
{
return 0;
}
-static inline int br_switchdev_port_vlan_add(struct net_device *dev,
- u16 vid, u16 flags)
+static inline int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid,
+ u16 flags, bool changed,
+ struct netlink_ext_ack *extack)
{
return -EOPNOTSUPP;
}
@@ -1187,13 +2312,26 @@ static inline int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid)
}
static inline void
-br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
+br_switchdev_fdb_notify(struct net_bridge *br,
+ const struct net_bridge_fdb_entry *fdb, int type)
+{
+}
+
+static inline void br_switchdev_mdb_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type)
{
}
static inline void br_switchdev_frame_unmark(struct sk_buff *skb)
{
}
+
+static inline void br_switchdev_init(struct net_bridge *br)
+{
+}
+
#endif /* CONFIG_NET_SWITCHDEV */
/* br_arp_nd_proxy.c */
@@ -1202,5 +2340,6 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
u16 vid, struct net_bridge_port *p);
void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
u16 vid, struct net_bridge_port *p, struct nd_msg *msg);
-struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *m);
+struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct nd_msg *m);
+bool br_is_neigh_suppress_enabled(const struct net_bridge_port *p, u16 vid);
#endif
diff --git a/net/bridge/br_private_cfm.h b/net/bridge/br_private_cfm.h
new file mode 100644
index 000000000000..a43a5e7fa2c3
--- /dev/null
+++ b/net/bridge/br_private_cfm.h
@@ -0,0 +1,147 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _BR_PRIVATE_CFM_H_
+#define _BR_PRIVATE_CFM_H_
+
+#include "br_private.h"
+#include <uapi/linux/cfm_bridge.h>
+
+struct br_cfm_mep_create {
+ enum br_cfm_domain domain; /* Domain for this MEP */
+ enum br_cfm_mep_direction direction; /* Up or Down MEP direction */
+ u32 ifindex; /* Residence port */
+};
+
+int br_cfm_mep_create(struct net_bridge *br,
+ const u32 instance,
+ struct br_cfm_mep_create *const create,
+ struct netlink_ext_ack *extack);
+
+int br_cfm_mep_delete(struct net_bridge *br,
+ const u32 instance,
+ struct netlink_ext_ack *extack);
+
+struct br_cfm_mep_config {
+ u32 mdlevel;
+ u32 mepid; /* MEPID for this MEP */
+ struct mac_addr unicast_mac; /* The MEP unicast MAC */
+};
+
+int br_cfm_mep_config_set(struct net_bridge *br,
+ const u32 instance,
+ const struct br_cfm_mep_config *const config,
+ struct netlink_ext_ack *extack);
+
+struct br_cfm_maid {
+ u8 data[CFM_MAID_LENGTH];
+};
+
+struct br_cfm_cc_config {
+ /* Expected received CCM PDU MAID. */
+ struct br_cfm_maid exp_maid;
+
+ /* Expected received CCM PDU interval. */
+ /* Transmitting CCM PDU interval when CCM tx is enabled. */
+ enum br_cfm_ccm_interval exp_interval;
+
+ bool enable; /* Enable/disable CCM PDU handling */
+};
+
+int br_cfm_cc_config_set(struct net_bridge *br,
+ const u32 instance,
+ const struct br_cfm_cc_config *const config,
+ struct netlink_ext_ack *extack);
+
+int br_cfm_cc_peer_mep_add(struct net_bridge *br, const u32 instance,
+ u32 peer_mep_id,
+ struct netlink_ext_ack *extack);
+int br_cfm_cc_peer_mep_remove(struct net_bridge *br, const u32 instance,
+ u32 peer_mep_id,
+ struct netlink_ext_ack *extack);
+
+/* Transmitted CCM Remote Defect Indication status set.
+ * This RDI is inserted in transmitted CCM PDUs if CCM transmission is enabled.
+ * See br_cfm_cc_ccm_tx() with interval != BR_CFM_CCM_INTERVAL_NONE
+ */
+int br_cfm_cc_rdi_set(struct net_bridge *br, const u32 instance,
+ const bool rdi, struct netlink_ext_ack *extack);
+
+/* OAM PDU Tx information */
+struct br_cfm_cc_ccm_tx_info {
+ struct mac_addr dmac;
+ /* The CCM will be transmitted for this period in seconds.
+ * Call br_cfm_cc_ccm_tx before timeout to keep transmission alive.
+ * When period is zero any ongoing transmission will be stopped.
+ */
+ u32 period;
+
+ bool seq_no_update; /* Update Tx CCM sequence number */
+ bool if_tlv; /* Insert Interface Status TLV */
+ u8 if_tlv_value; /* Interface Status TLV value */
+ bool port_tlv; /* Insert Port Status TLV */
+ u8 port_tlv_value; /* Port Status TLV value */
+ /* Sender ID TLV ??
+ * Organization-Specific TLV ??
+ */
+};
+
+int br_cfm_cc_ccm_tx(struct net_bridge *br, const u32 instance,
+ const struct br_cfm_cc_ccm_tx_info *const tx_info,
+ struct netlink_ext_ack *extack);
+
+struct br_cfm_mep_status {
+ /* Indications that an OAM PDU has been seen. */
+ bool opcode_unexp_seen; /* RX of OAM PDU with unexpected opcode */
+ bool version_unexp_seen; /* RX of OAM PDU with unexpected version */
+ bool rx_level_low_seen; /* Rx of OAM PDU with level low */
+};
+
+struct br_cfm_cc_peer_status {
+ /* This CCM related status is based on the latest received CCM PDU. */
+ u8 port_tlv_value; /* Port Status TLV value */
+ u8 if_tlv_value; /* Interface Status TLV value */
+
+ /* CCM has not been received for 3.25 intervals */
+ u8 ccm_defect:1;
+
+ /* (RDI == 1) for last received CCM PDU */
+ u8 rdi:1;
+
+ /* Indications that a CCM PDU has been seen. */
+ u8 seen:1; /* CCM PDU received */
+ u8 tlv_seen:1; /* CCM PDU with TLV received */
+ /* CCM PDU with unexpected sequence number received */
+ u8 seq_unexp_seen:1;
+};
+
+struct br_cfm_mep {
+ /* list header of MEP instances */
+ struct hlist_node head;
+ u32 instance;
+ struct br_cfm_mep_create create;
+ struct br_cfm_mep_config config;
+ struct br_cfm_cc_config cc_config;
+ struct br_cfm_cc_ccm_tx_info cc_ccm_tx_info;
+ /* List of multiple peer MEPs */
+ struct hlist_head peer_mep_list;
+ struct net_bridge_port __rcu *b_port;
+ unsigned long ccm_tx_end;
+ struct delayed_work ccm_tx_dwork;
+ u32 ccm_tx_snumber;
+ u32 ccm_rx_snumber;
+ struct br_cfm_mep_status status;
+ bool rdi;
+ struct rcu_head rcu;
+};
+
+struct br_cfm_peer_mep {
+ struct hlist_node head;
+ struct br_cfm_mep *mep;
+ struct delayed_work ccm_rx_dwork;
+ u32 mepid;
+ struct br_cfm_cc_peer_status cc_status;
+ u32 ccm_rx_count_miss;
+ struct rcu_head rcu;
+};
+
+#endif /* _BR_PRIVATE_CFM_H_ */
diff --git a/net/bridge/br_private_mcast_eht.h b/net/bridge/br_private_mcast_eht.h
new file mode 100644
index 000000000000..adf82a05515a
--- /dev/null
+++ b/net/bridge/br_private_mcast_eht.h
@@ -0,0 +1,94 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later
+ * Copyright (c) 2020, Nikolay Aleksandrov <nikolay@nvidia.com>
+ */
+#ifndef _BR_PRIVATE_MCAST_EHT_H_
+#define _BR_PRIVATE_MCAST_EHT_H_
+
+#define BR_MCAST_DEFAULT_EHT_HOSTS_LIMIT 512
+
+union net_bridge_eht_addr {
+ __be32 ip4;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct in6_addr ip6;
+#endif
+};
+
+/* single host's list of set entries and filter_mode */
+struct net_bridge_group_eht_host {
+ struct rb_node rb_node;
+
+ union net_bridge_eht_addr h_addr;
+ struct hlist_head set_entries;
+ unsigned int num_entries;
+ unsigned char filter_mode;
+ struct net_bridge_port_group *pg;
+};
+
+/* (host, src entry) added to a per-src set and host's list */
+struct net_bridge_group_eht_set_entry {
+ struct rb_node rb_node;
+ struct hlist_node host_list;
+
+ union net_bridge_eht_addr h_addr;
+ struct timer_list timer;
+ struct net_bridge *br;
+ struct net_bridge_group_eht_set *eht_set;
+ struct net_bridge_group_eht_host *h_parent;
+ struct net_bridge_mcast_gc mcast_gc;
+};
+
+/* per-src set */
+struct net_bridge_group_eht_set {
+ struct rb_node rb_node;
+
+ union net_bridge_eht_addr src_addr;
+ struct rb_root entry_tree;
+ struct timer_list timer;
+ struct net_bridge_port_group *pg;
+ struct net_bridge *br;
+ struct net_bridge_mcast_gc mcast_gc;
+};
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+void br_multicast_eht_clean_sets(struct net_bridge_port_group *pg);
+bool br_multicast_eht_handle(const struct net_bridge_mcast *brmctx,
+ struct net_bridge_port_group *pg,
+ void *h_addr,
+ void *srcs,
+ u32 nsrcs,
+ size_t addr_size,
+ int grec_type);
+int br_multicast_eht_set_hosts_limit(struct net_bridge_port *p,
+ u32 eht_hosts_limit);
+
+static inline bool
+br_multicast_eht_should_del_pg(const struct net_bridge_port_group *pg)
+{
+ return !!((pg->key.port->flags & BR_MULTICAST_FAST_LEAVE) &&
+ RB_EMPTY_ROOT(&pg->eht_host_tree));
+}
+
+static inline bool
+br_multicast_eht_hosts_over_limit(const struct net_bridge_port_group *pg)
+{
+ const struct net_bridge_port *p = pg->key.port;
+
+ return !!(p->multicast_eht_hosts_cnt >= p->multicast_eht_hosts_limit);
+}
+
+static inline void br_multicast_eht_hosts_inc(struct net_bridge_port_group *pg)
+{
+ struct net_bridge_port *p = pg->key.port;
+
+ p->multicast_eht_hosts_cnt++;
+}
+
+static inline void br_multicast_eht_hosts_dec(struct net_bridge_port_group *pg)
+{
+ struct net_bridge_port *p = pg->key.port;
+
+ p->multicast_eht_hosts_cnt--;
+}
+#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */
+
+#endif /* _BR_PRIVATE_MCAST_EHT_H_ */
diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h
new file mode 100644
index 000000000000..bda8e1896712
--- /dev/null
+++ b/net/bridge/br_private_mrp.h
@@ -0,0 +1,148 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef _BR_PRIVATE_MRP_H_
+#define _BR_PRIVATE_MRP_H_
+
+#include "br_private.h"
+#include <uapi/linux/mrp_bridge.h>
+
+#define MRP_OPT_PADDING 0x2
+
+struct br_mrp {
+ /* list of mrp instances */
+ struct hlist_node list;
+
+ struct net_bridge_port __rcu *p_port;
+ struct net_bridge_port __rcu *s_port;
+ struct net_bridge_port __rcu *i_port;
+
+ u32 ring_id;
+ u16 in_id;
+ u16 prio;
+
+ enum br_mrp_ring_role_type ring_role;
+ u8 ring_role_offloaded;
+ enum br_mrp_ring_state_type ring_state;
+ u32 ring_transitions;
+
+ enum br_mrp_in_role_type in_role;
+ u8 in_role_offloaded;
+ enum br_mrp_in_state_type in_state;
+ u32 in_transitions;
+
+ struct delayed_work test_work;
+ u32 test_interval;
+ unsigned long test_end;
+ u32 test_count_miss;
+ u32 test_max_miss;
+ bool test_monitor;
+
+ struct delayed_work in_test_work;
+ u32 in_test_interval;
+ unsigned long in_test_end;
+ u32 in_test_count_miss;
+ u32 in_test_max_miss;
+
+ u32 seq_id;
+
+ struct rcu_head rcu;
+};
+
+/* This type is returned by the br_mrp_switchdev functions that can fall back
+ * to a SW implementation when the HW cannot fully implement the protocol.
+ * BR_MRP_NONE - the HW cannot run the protocol at all, so the SW stops
+ * configuring the node.
+ * BR_MRP_SW - the HW can help the SW run the protocol by redirecting MRP
+ * frames to the CPU.
+ * BR_MRP_HW - the HW can fully implement the protocol.
+ */
+enum br_mrp_hw_support {
+ BR_MRP_NONE,
+ BR_MRP_SW,
+ BR_MRP_HW,
+};
+
+/* br_mrp.c */
+int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance);
+int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance);
+int br_mrp_set_port_state(struct net_bridge_port *p,
+ enum br_mrp_port_state_type state);
+int br_mrp_set_port_role(struct net_bridge_port *p,
+ enum br_mrp_port_role_type role);
+int br_mrp_set_ring_state(struct net_bridge *br,
+ struct br_mrp_ring_state *state);
+int br_mrp_set_ring_role(struct net_bridge *br, struct br_mrp_ring_role *role);
+int br_mrp_start_test(struct net_bridge *br, struct br_mrp_start_test *test);
+int br_mrp_set_in_state(struct net_bridge *br, struct br_mrp_in_state *state);
+int br_mrp_set_in_role(struct net_bridge *br, struct br_mrp_in_role *role);
+int br_mrp_start_in_test(struct net_bridge *br,
+ struct br_mrp_start_in_test *test);
+
+/* br_mrp_switchdev.c */
+int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp);
+int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp);
+enum br_mrp_hw_support
+br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp,
+ enum br_mrp_ring_role_type role);
+int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp,
+ enum br_mrp_ring_state_type state);
+enum br_mrp_hw_support
+br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp,
+ u32 interval, u8 max_miss, u32 period,
+ bool monitor);
+int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, u32 state);
+int br_mrp_port_switchdev_set_role(struct net_bridge_port *p,
+ enum br_mrp_port_role_type role);
+enum br_mrp_hw_support
+br_mrp_switchdev_set_in_role(struct net_bridge *br, struct br_mrp *mrp,
+ u16 in_id, u32 ring_id,
+ enum br_mrp_in_role_type role);
+int br_mrp_switchdev_set_in_state(struct net_bridge *br, struct br_mrp *mrp,
+ enum br_mrp_in_state_type state);
+enum br_mrp_hw_support
+br_mrp_switchdev_send_in_test(struct net_bridge *br, struct br_mrp *mrp,
+ u32 interval, u8 max_miss, u32 period);
+
+/* br_mrp_netlink.c */
+int br_mrp_ring_port_open(struct net_device *dev, u8 loc);
+int br_mrp_in_port_open(struct net_device *dev, u8 loc);
+
+/* MRP protocol data units */
+struct br_mrp_tlv_hdr {
+ __u8 type;
+ __u8 length;
+};
+
+struct br_mrp_common_hdr {
+ __be16 seq_id;
+ __u8 domain[MRP_DOMAIN_UUID_LENGTH];
+};
+
+struct br_mrp_ring_test_hdr {
+ __be16 prio;
+ __u8 sa[ETH_ALEN];
+ __be16 port_role;
+ __be16 state;
+ __be16 transitions;
+ __be32 timestamp;
+} __attribute__((__packed__));
+
+struct br_mrp_in_test_hdr {
+ __be16 id;
+ __u8 sa[ETH_ALEN];
+ __be16 port_role;
+ __be16 state;
+ __be16 transitions;
+ __be32 timestamp;
+} __attribute__((__packed__));
+
+struct br_mrp_oui_hdr {
+ __u8 oui[MRP_OUI_LENGTH];
+};
+
+struct br_mrp_sub_option1_hdr {
+ __u8 type;
+ __u8 data[MRP_MANUFACTURE_DATA_LENGTH];
+};
+
+#endif /* _BR_PRIVATE_MRP_H_ */
diff --git a/net/bridge/br_private_stp.h b/net/bridge/br_private_stp.h
index 3f7543a29b76..814cf1364cfb 100644
--- a/net/bridge/br_private_stp.h
+++ b/net/bridge/br_private_stp.h
@@ -1,13 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#ifndef _BR_PRIVATE_STP_H
diff --git a/net/bridge/br_private_tunnel.h b/net/bridge/br_private_tunnel.h
index a259471bfd78..efb096025151 100644
--- a/net/bridge/br_private_tunnel.h
+++ b/net/bridge/br_private_tunnel.h
@@ -1,13 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Bridge per vlan tunnels
*
* Authors:
* Roopa Prabhu <roopa@cumulusnetworks.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#ifndef _BR_PRIVATE_TUNNEL_H
@@ -22,8 +18,8 @@ struct vtunnel_info {
/* br_netlink_tunnel.c */
int br_parse_vlan_tunnel_info(struct nlattr *attr,
struct vtunnel_info *tinfo);
-int br_process_vlan_tunnel_info(struct net_bridge *br,
- struct net_bridge_port *p,
+int br_process_vlan_tunnel_info(const struct net_bridge *br,
+ const struct net_bridge_port *p,
int cmd,
struct vtunnel_info *tinfo_curr,
struct vtunnel_info *tinfo_last,
@@ -31,19 +27,24 @@ int br_process_vlan_tunnel_info(struct net_bridge *br,
int br_get_vlan_tunnel_info_size(struct net_bridge_vlan_group *vg);
int br_fill_vlan_tunnel_info(struct sk_buff *skb,
struct net_bridge_vlan_group *vg);
+bool vlan_tunid_inrange(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *v_last);
+int br_vlan_tunnel_info(const struct net_bridge_port *p, int cmd,
+ u16 vid, u32 tun_id, bool *changed);
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
/* br_vlan_tunnel.c */
int vlan_tunnel_init(struct net_bridge_vlan_group *vg);
void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg);
-int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid);
-int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id);
+int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port, u16 vid);
+int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid,
+ u32 tun_id);
void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port);
void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
struct net_bridge_vlan *vlan);
-int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
- struct net_bridge_port *p,
- struct net_bridge_vlan_group *vg);
+void br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
+ struct net_bridge_port *p,
+ struct net_bridge_vlan_group *vg);
int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
struct net_bridge_vlan *vlan);
#else
@@ -52,13 +53,13 @@ static inline int vlan_tunnel_init(struct net_bridge_vlan_group *vg)
return 0;
}
-static inline int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port,
+static inline int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port,
u16 vid)
{
return 0;
}
-static inline int nbp_vlan_tunnel_info_add(struct net_bridge_port *port,
+static inline int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port,
u16 vid, u32 tun_id)
{
return 0;
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index b6941961a876..024210f95468 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Spanning tree protocol; generic parts
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/rculist.h>
@@ -40,8 +36,20 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
};
int err;
+ /* Don't change the state of the ports if they are driven by a different
+ * protocol.
+ */
+ if (p->flags & BR_MRP_AWARE)
+ return;
+
p->state = state;
- err = switchdev_port_attr_set(p->dev, &attr);
+ if (br_opt_get(p->br, BROPT_MST_ENABLED)) {
+ err = br_mst_set_state(p, 0, state, NULL);
+ if (err)
+ br_warn(p->br, "error setting MST state on port %u(%s)\n",
+ p->port_no, netdev_name(p->dev));
+ }
+ err = switchdev_port_attr_set(p->dev, &attr, NULL);
if (err && err != -EOPNOTSUPP)
br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
(unsigned int) p->port_no, p->dev->name);
@@ -49,14 +57,40 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
br_info(p->br, "port %u(%s) entered %s state\n",
(unsigned int) p->port_no, p->dev->name,
br_port_state_names[p->state]);
+
+ if (p->br->stp_enabled == BR_KERNEL_STP) {
+ switch (p->state) {
+ case BR_STATE_BLOCKING:
+ p->stp_xstats.transition_blk++;
+ break;
+ case BR_STATE_FORWARDING:
+ p->stp_xstats.transition_fwd++;
+ break;
+ }
+ }
+}
+
+u8 br_port_get_stp_state(const struct net_device *dev)
+{
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+
+ p = br_port_get_rtnl(dev);
+ if (!p)
+ return BR_STATE_DISABLED;
+
+ return p->state;
}
+EXPORT_SYMBOL_GPL(br_port_get_stp_state);
/* called under bridge lock */
struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no)
{
struct net_bridge_port *p;
- list_for_each_entry_rcu(p, &br->port_list, list) {
+ list_for_each_entry_rcu(p, &br->port_list, list,
+ lockdep_is_held(&br->lock)) {
if (p->port_no == port_no)
return p;
}
@@ -164,7 +198,7 @@ void br_become_root_bridge(struct net_bridge *br)
br->hello_time = br->bridge_hello_time;
br->forward_delay = br->bridge_forward_delay;
br_topology_change_detection(br);
- del_timer(&br->tcn_timer);
+ timer_delete(&br->tcn_timer);
if (br->dev->flags & IFF_UP) {
br_config_bpdu_generation(br);
@@ -329,7 +363,7 @@ static int br_supersedes_port_info(const struct net_bridge_port *p,
static void br_topology_change_acknowledged(struct net_bridge *br)
{
br->topology_change_detected = 0;
- del_timer(&br->tcn_timer);
+ timer_delete(&br->tcn_timer);
}
/* called under bridge lock */
@@ -405,7 +439,7 @@ static void br_make_blocking(struct net_bridge_port *p)
br_set_state(p, BR_STATE_BLOCKING);
br_ifinfo_notify(RTM_NEWLINK, NULL, p);
- del_timer(&p->forward_delay_timer);
+ timer_delete(&p->forward_delay_timer);
}
}
@@ -420,7 +454,7 @@ static void br_make_forwarding(struct net_bridge_port *p)
if (br->stp_enabled == BR_NO_STP || br->forward_delay == 0) {
br_set_state(p, BR_STATE_FORWARDING);
br_topology_change_detection(br);
- del_timer(&p->forward_delay_timer);
+ timer_delete(&p->forward_delay_timer);
} else if (br->stp_enabled == BR_KERNEL_STP)
br_set_state(p, BR_STATE_LISTENING);
else
@@ -449,7 +483,7 @@ void br_port_state_selection(struct net_bridge *br)
p->topology_change_ack = 0;
br_make_forwarding(p);
} else if (br_is_designated_port(p)) {
- del_timer(&p->message_age_timer);
+ timer_delete(&p->message_age_timer);
br_make_forwarding(p);
} else {
p->config_pending = 0;
@@ -488,6 +522,8 @@ void br_received_config_bpdu(struct net_bridge_port *p,
struct net_bridge *br;
int was_root;
+ p->stp_xstats.rx_bpdu++;
+
br = p->br;
was_root = br_is_root_bridge(br);
@@ -497,9 +533,9 @@ void br_received_config_bpdu(struct net_bridge_port *p,
br_port_state_selection(br);
if (!br_is_root_bridge(br) && was_root) {
- del_timer(&br->hello_timer);
+ timer_delete(&br->hello_timer);
if (br->topology_change_detected) {
- del_timer(&br->topology_change_timer);
+ timer_delete(&br->topology_change_timer);
br_transmit_tcn(br);
mod_timer(&br->tcn_timer,
@@ -521,6 +557,8 @@ void br_received_config_bpdu(struct net_bridge_port *p,
/* called under bridge lock */
void br_received_tcn_bpdu(struct net_bridge_port *p)
{
+ p->stp_xstats.rx_tcn++;
+
if (br_is_designated_port(p)) {
br_info(p->br, "port %u(%s) received tcn bpdu\n",
(unsigned int) p->port_no, p->dev->name);
@@ -573,7 +611,7 @@ int __set_ageing_time(struct net_device *dev, unsigned long t)
};
int err;
- err = switchdev_port_attr_set(dev, &attr);
+ err = switchdev_port_attr_set(dev, &attr, NULL);
if (err && err != -EOPNOTSUPP)
return err;
@@ -583,8 +621,8 @@ int __set_ageing_time(struct net_device *dev, unsigned long t)
/* Set time interval that dynamic forwarding entries live
* For pure software bridge, allow values outside the 802.1
* standard specification for special cases:
- * 0 - entry never ages (all permanant)
- * 1 - entry disappears (no persistance)
+ * 0 - entry never ages (all permanent)
+ * 1 - entry disappears (no persistence)
*
* Offloaded switch entries maybe more restrictive
*/
@@ -607,6 +645,19 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
return 0;
}
+clock_t br_get_ageing_time(const struct net_device *br_dev)
+{
+ const struct net_bridge *br;
+
+ if (!netif_is_bridge_master(br_dev))
+ return 0;
+
+ br = netdev_priv(br_dev);
+
+ return jiffies_to_clock_t(br->ageing_time);
+}
+EXPORT_SYMBOL_GPL(br_get_ageing_time);
+
/* called under bridge lock */
void __br_set_topology_change(struct net_bridge *br, unsigned char val)
{
diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c
index 1b75d6bf12bd..7895489ac6fe 100644
--- a/net/bridge/br_stp_bpdu.c
+++ b/net/bridge/br_stp_bpdu.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Spanning tree protocol; BPDU handling
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -21,7 +17,7 @@
#include <net/llc.h>
#include <net/llc_pdu.h>
#include <net/stp.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include "br_private.h"
#include "br_private_stp.h"
@@ -122,6 +118,8 @@ void br_send_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu)
br_set_ticks(buf+33, bpdu->forward_delay);
br_send_bpdu(p, buf, 35);
+
+ p->stp_xstats.tx_bpdu++;
}
/* called under bridge lock */
@@ -137,6 +135,8 @@ void br_send_tcn_bpdu(struct net_bridge_port *p)
buf[2] = 0;
buf[3] = BPDU_TYPE_TCN;
br_send_bpdu(p, buf, 4);
+
+ p->stp_xstats.tx_tcn++;
}
/*
@@ -147,7 +147,6 @@ void br_send_tcn_bpdu(struct net_bridge_port *p)
void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
struct net_device *dev)
{
- const unsigned char *dest = eth_hdr(skb)->h_dest;
struct net_bridge_port *p;
struct net_bridge *br;
const unsigned char *buf;
@@ -176,7 +175,7 @@ void br_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
if (p->state == BR_STATE_DISABLED)
goto out;
- if (!ether_addr_equal(dest, br->group_addr))
+ if (!ether_addr_equal(eth_hdr(skb)->h_dest, br->group_addr))
goto out;
if (p->flags & BR_BPDU_GUARD) {
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index 808e2b914015..c20a41bf253b 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Spanning tree protocol; interface code
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -85,9 +81,9 @@ void br_stp_disable_bridge(struct net_bridge *br)
br->topology_change_detected = 0;
spin_unlock_bh(&br->lock);
- del_timer_sync(&br->hello_timer);
- del_timer_sync(&br->topology_change_timer);
- del_timer_sync(&br->tcn_timer);
+ timer_delete_sync(&br->hello_timer);
+ timer_delete_sync(&br->topology_change_timer);
+ timer_delete_sync(&br->tcn_timer);
cancel_delayed_work_sync(&br->gc_work);
}
@@ -113,11 +109,12 @@ void br_stp_disable_port(struct net_bridge_port *p)
br_ifinfo_notify(RTM_NEWLINK, NULL, p);
- del_timer(&p->message_age_timer);
- del_timer(&p->forward_delay_timer);
- del_timer(&p->hold_timer);
+ timer_delete(&p->message_age_timer);
+ timer_delete(&p->forward_delay_timer);
+ timer_delete(&p->hold_timer);
- br_fdb_delete_by_port(br, p, 0, 0);
+ if (!rcu_access_pointer(p->backup_port))
+ br_fdb_delete_by_port(br, p, 0, 0);
br_multicast_disable_port(p);
br_configuration_update(br);
@@ -199,10 +196,17 @@ static void br_stp_stop(struct net_bridge *br)
br->stp_enabled = BR_NO_STP;
}
-void br_stp_set_enabled(struct net_bridge *br, unsigned long val)
+int br_stp_set_enabled(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
ASSERT_RTNL();
+ if (br_mrp_enabled(br)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "STP can't be enabled if MRP is already enabled");
+ return -EINVAL;
+ }
+
if (val) {
if (br->stp_enabled == BR_NO_STP)
br_stp_start(br);
@@ -210,6 +214,8 @@ void br_stp_set_enabled(struct net_bridge *br, unsigned long val)
if (br->stp_enabled != BR_NO_STP)
br_stp_stop(br);
}
+
+ return 0;
}
/* called under bridge lock */
@@ -227,7 +233,7 @@ void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr)
memcpy(oldaddr, br->bridge_id.addr, ETH_ALEN);
memcpy(br->bridge_id.addr, addr, ETH_ALEN);
- memcpy(br->dev->dev_addr, addr, ETH_ALEN);
+ eth_hw_addr_set(br->dev, addr);
list_for_each_entry(p, &br->port_list, list) {
if (ether_addr_equal(p->designated_bridge.addr, oldaddr))
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index e7739de5f0e1..e5d453305381 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Spanning tree protocol; timer-related code
* Linux ethernet bridge
*
* Authors:
* Lennert Buytenhek <buytenh@gnu.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -33,7 +29,7 @@ static int br_is_designated_for_some_port(const struct net_bridge *br)
static void br_hello_timer_expired(struct timer_list *t)
{
- struct net_bridge *br = from_timer(br, t, hello_timer);
+ struct net_bridge *br = timer_container_of(br, t, hello_timer);
br_debug(br, "hello timer expired\n");
spin_lock(&br->lock);
@@ -49,7 +45,8 @@ static void br_hello_timer_expired(struct timer_list *t)
static void br_message_age_timer_expired(struct timer_list *t)
{
- struct net_bridge_port *p = from_timer(p, t, message_age_timer);
+ struct net_bridge_port *p = timer_container_of(p, t,
+ message_age_timer);
struct net_bridge *br = p->br;
const bridge_id *id = &p->designated_bridge;
int was_root;
@@ -82,7 +79,8 @@ static void br_message_age_timer_expired(struct timer_list *t)
static void br_forward_delay_timer_expired(struct timer_list *t)
{
- struct net_bridge_port *p = from_timer(p, t, forward_delay_timer);
+ struct net_bridge_port *p = timer_container_of(p, t,
+ forward_delay_timer);
struct net_bridge *br = p->br;
br_debug(br, "port %u(%s) forward delay timer\n",
@@ -106,7 +104,7 @@ static void br_forward_delay_timer_expired(struct timer_list *t)
static void br_tcn_timer_expired(struct timer_list *t)
{
- struct net_bridge *br = from_timer(br, t, tcn_timer);
+ struct net_bridge *br = timer_container_of(br, t, tcn_timer);
br_debug(br, "tcn timer expired\n");
spin_lock(&br->lock);
@@ -120,7 +118,8 @@ static void br_tcn_timer_expired(struct timer_list *t)
static void br_topology_change_timer_expired(struct timer_list *t)
{
- struct net_bridge *br = from_timer(br, t, topology_change_timer);
+ struct net_bridge *br = timer_container_of(br, t,
+ topology_change_timer);
br_debug(br, "topo change timer expired\n");
spin_lock(&br->lock);
@@ -131,7 +130,7 @@ static void br_topology_change_timer_expired(struct timer_list *t)
static void br_hold_timer_expired(struct timer_list *t)
{
- struct net_bridge_port *p = from_timer(p, t, hold_timer);
+ struct net_bridge_port *p = timer_container_of(p, t, hold_timer);
br_debug(p->br, "port %u(%s) hold timer expired\n",
(unsigned int) p->port_no, p->dev->name);
diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
index d77f807420c4..fe3f7bbe86ee 100644
--- a/net/bridge/br_switchdev.c
+++ b/net/bridge/br_switchdev.c
@@ -4,150 +4,190 @@
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
+#include <net/ip.h>
#include <net/switchdev.h>
#include "br_private.h"
-static int br_switchdev_mark_get(struct net_bridge *br, struct net_device *dev)
+static struct static_key_false br_switchdev_tx_fwd_offload;
+
+static bool nbp_switchdev_can_offload_tx_fwd(const struct net_bridge_port *p,
+ const struct sk_buff *skb)
{
- struct net_bridge_port *p;
+ if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload))
+ return false;
- /* dev is yet to be added to the port list. */
- list_for_each_entry(p, &br->port_list, list) {
- if (switchdev_port_same_parent_id(dev, p->dev))
- return p->offload_fwd_mark;
- }
+ if (br_multicast_igmp_type(skb))
+ return false;
- return ++br->offload_fwd_mark;
+ return (p->flags & BR_TX_FWD_OFFLOAD) &&
+ (p->hwdom != BR_INPUT_SKB_CB(skb)->src_hwdom);
}
-int nbp_switchdev_mark_set(struct net_bridge_port *p)
+bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb)
{
- struct switchdev_attr attr = {
- .orig_dev = p->dev,
- .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
- };
- int err;
+ if (!static_branch_unlikely(&br_switchdev_tx_fwd_offload))
+ return false;
- ASSERT_RTNL();
+ return BR_INPUT_SKB_CB(skb)->tx_fwd_offload;
+}
- err = switchdev_port_attr_get(p->dev, &attr);
- if (err) {
- if (err == -EOPNOTSUPP)
- return 0;
- return err;
- }
+void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb)
+{
+ skb->offload_fwd_mark = br_switchdev_frame_uses_tx_fwd_offload(skb);
+}
- p->offload_fwd_mark = br_switchdev_mark_get(p->br, p->dev);
+/* Mark the frame for TX forwarding offload if this egress port supports it */
+void nbp_switchdev_frame_mark_tx_fwd_offload(const struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+ if (nbp_switchdev_can_offload_tx_fwd(p, skb))
+ BR_INPUT_SKB_CB(skb)->tx_fwd_offload = true;
+}
- return 0;
+/* Lazily adds the hwdom of the egress bridge port to the bit mask of hwdoms
+ * that the skb has been already forwarded to, to avoid further cloning to
+ * other ports in the same hwdom by making nbp_switchdev_allowed_egress()
+ * return false.
+ */
+void nbp_switchdev_frame_mark_tx_fwd_to_hwdom(const struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+ if (nbp_switchdev_can_offload_tx_fwd(p, skb))
+ set_bit(p->hwdom, &BR_INPUT_SKB_CB(skb)->fwd_hwdoms);
}
void nbp_switchdev_frame_mark(const struct net_bridge_port *p,
struct sk_buff *skb)
{
- if (skb->offload_fwd_mark && !WARN_ON_ONCE(!p->offload_fwd_mark))
- BR_INPUT_SKB_CB(skb)->offload_fwd_mark = p->offload_fwd_mark;
+ if (p->hwdom)
+ BR_INPUT_SKB_CB(skb)->src_hwdom = p->hwdom;
}
bool nbp_switchdev_allowed_egress(const struct net_bridge_port *p,
const struct sk_buff *skb)
{
- return !skb->offload_fwd_mark ||
- BR_INPUT_SKB_CB(skb)->offload_fwd_mark != p->offload_fwd_mark;
+ struct br_input_skb_cb *cb = BR_INPUT_SKB_CB(skb);
+
+ return !test_bit(p->hwdom, &cb->fwd_hwdoms) &&
+ (!skb->offload_fwd_mark || cb->src_hwdom != p->hwdom);
}
/* Flags that can be offloaded to hardware */
-#define BR_PORT_FLAGS_HW_OFFLOAD (BR_LEARNING | BR_FLOOD | \
- BR_MCAST_FLOOD | BR_BCAST_FLOOD)
+#define BR_PORT_FLAGS_HW_OFFLOAD (BR_LEARNING | BR_FLOOD | BR_PORT_MAB | \
+ BR_MCAST_FLOOD | BR_BCAST_FLOOD | BR_PORT_LOCKED | \
+ BR_HAIRPIN_MODE | BR_ISOLATED | BR_MULTICAST_TO_UNICAST)
int br_switchdev_set_port_flag(struct net_bridge_port *p,
unsigned long flags,
- unsigned long mask)
+ unsigned long mask,
+ struct netlink_ext_ack *extack)
{
struct switchdev_attr attr = {
.orig_dev = p->dev,
- .id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT,
+ };
+ struct switchdev_notifier_port_attr_info info = {
+ .attr = &attr,
};
int err;
- if (mask & ~BR_PORT_FLAGS_HW_OFFLOAD)
+ mask &= BR_PORT_FLAGS_HW_OFFLOAD;
+ if (!mask)
return 0;
- err = switchdev_port_attr_get(p->dev, &attr);
+ attr.id = SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS;
+ attr.u.brport_flags.val = flags;
+ attr.u.brport_flags.mask = mask;
+
+ /* We run from atomic context here */
+ err = call_switchdev_notifiers(SWITCHDEV_PORT_ATTR_SET, p->dev,
+ &info.info, extack);
+ err = notifier_to_errno(err);
if (err == -EOPNOTSUPP)
return 0;
- if (err)
- return err;
- /* Check if specific bridge flag attribute offload is supported */
- if (!(attr.u.brport_flags_support & mask)) {
- br_warn(p->br, "bridge flag offload is not supported %u(%s)\n",
- (unsigned int)p->port_no, p->dev->name);
+ if (err) {
+ NL_SET_ERR_MSG_WEAK_MOD(extack,
+ "bridge flag offload is not supported");
return -EOPNOTSUPP;
}
attr.id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS;
attr.flags = SWITCHDEV_F_DEFER;
- attr.u.brport_flags = flags;
- err = switchdev_port_attr_set(p->dev, &attr);
+
+ err = switchdev_port_attr_set(p->dev, &attr, extack);
if (err) {
- br_warn(p->br, "error setting offload flag on port %u(%s)\n",
- (unsigned int)p->port_no, p->dev->name);
+ NL_SET_ERR_MSG_WEAK_MOD(extack,
+ "error setting offload flag on port");
return err;
}
return 0;
}
-static void
-br_switchdev_fdb_call_notifiers(bool adding, const unsigned char *mac,
- u16 vid, struct net_device *dev,
- bool added_by_user)
+static void br_switchdev_fdb_populate(struct net_bridge *br,
+ struct switchdev_notifier_fdb_info *item,
+ const struct net_bridge_fdb_entry *fdb,
+ const void *ctx)
{
- struct switchdev_notifier_fdb_info info;
- unsigned long notifier_type;
+ const struct net_bridge_port *p = READ_ONCE(fdb->dst);
- info.addr = mac;
- info.vid = vid;
- info.added_by_user = added_by_user;
- notifier_type = adding ? SWITCHDEV_FDB_ADD_TO_DEVICE : SWITCHDEV_FDB_DEL_TO_DEVICE;
- call_switchdev_notifiers(notifier_type, dev, &info.info);
+ item->addr = fdb->key.addr.addr;
+ item->vid = fdb->key.vlan_id;
+ item->added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags);
+ item->offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags);
+ item->is_local = test_bit(BR_FDB_LOCAL, &fdb->flags);
+ item->locked = false;
+ item->info.dev = (!p || item->is_local) ? br->dev : p->dev;
+ item->info.ctx = ctx;
}
void
-br_switchdev_fdb_notify(const struct net_bridge_fdb_entry *fdb, int type)
+br_switchdev_fdb_notify(struct net_bridge *br,
+ const struct net_bridge_fdb_entry *fdb, int type)
{
- if (!fdb->dst)
+ struct switchdev_notifier_fdb_info item;
+
+ if (test_bit(BR_FDB_LOCKED, &fdb->flags))
+ return;
+
+ /* Entries with these flags were created using ndm_state == NUD_REACHABLE,
+ * ndm_flags == NTF_MASTER( | NTF_STICKY), ext_flags == 0 by something
+ * equivalent to 'bridge fdb add ... master dynamic (sticky)'.
+ * Drivers don't know how to deal with these, so don't notify them to
+ * avoid confusing them.
+ */
+ if (test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags) &&
+ !test_bit(BR_FDB_STATIC, &fdb->flags) &&
+ !test_bit(BR_FDB_ADDED_BY_EXT_LEARN, &fdb->flags))
return;
+ br_switchdev_fdb_populate(br, &item, fdb, NULL);
+
switch (type) {
case RTM_DELNEIGH:
- br_switchdev_fdb_call_notifiers(false, fdb->key.addr.addr,
- fdb->key.vlan_id,
- fdb->dst->dev,
- fdb->added_by_user);
+ call_switchdev_notifiers(SWITCHDEV_FDB_DEL_TO_DEVICE,
+ item.info.dev, &item.info, NULL);
break;
case RTM_NEWNEIGH:
- br_switchdev_fdb_call_notifiers(true, fdb->key.addr.addr,
- fdb->key.vlan_id,
- fdb->dst->dev,
- fdb->added_by_user);
+ call_switchdev_notifiers(SWITCHDEV_FDB_ADD_TO_DEVICE,
+ item.info.dev, &item.info, NULL);
break;
}
}
-int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags)
+int br_switchdev_port_vlan_add(struct net_device *dev, u16 vid, u16 flags,
+ bool changed, struct netlink_ext_ack *extack)
{
struct switchdev_obj_port_vlan v = {
.obj.orig_dev = dev,
.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
.flags = flags,
- .vid_begin = vid,
- .vid_end = vid,
+ .vid = vid,
+ .changed = changed,
};
- return switchdev_port_obj_add(dev, &v.obj);
+ return switchdev_port_obj_add(dev, &v.obj, extack);
}
int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid)
@@ -155,9 +195,682 @@ int br_switchdev_port_vlan_del(struct net_device *dev, u16 vid)
struct switchdev_obj_port_vlan v = {
.obj.orig_dev = dev,
.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
- .vid_begin = vid,
- .vid_end = vid,
+ .vid = vid,
};
return switchdev_port_obj_del(dev, &v.obj);
}
+
+static int nbp_switchdev_hwdom_set(struct net_bridge_port *joining)
+{
+ struct net_bridge *br = joining->br;
+ struct net_bridge_port *p;
+ int hwdom;
+
+ /* joining is yet to be added to the port list. */
+ list_for_each_entry(p, &br->port_list, list) {
+ if (netdev_phys_item_id_same(&joining->ppid, &p->ppid)) {
+ joining->hwdom = p->hwdom;
+ return 0;
+ }
+ }
+
+ hwdom = find_next_zero_bit(&br->busy_hwdoms, BR_HWDOM_MAX, 1);
+ if (hwdom >= BR_HWDOM_MAX)
+ return -EBUSY;
+
+ set_bit(hwdom, &br->busy_hwdoms);
+ joining->hwdom = hwdom;
+ return 0;
+}
+
+static void nbp_switchdev_hwdom_put(struct net_bridge_port *leaving)
+{
+ struct net_bridge *br = leaving->br;
+ struct net_bridge_port *p;
+
+ /* leaving is no longer in the port list. */
+ list_for_each_entry(p, &br->port_list, list) {
+ if (p->hwdom == leaving->hwdom)
+ return;
+ }
+
+ clear_bit(leaving->hwdom, &br->busy_hwdoms);
+}
+
+static int nbp_switchdev_add(struct net_bridge_port *p,
+ struct netdev_phys_item_id ppid,
+ bool tx_fwd_offload,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (p->offload_count) {
+ /* Prevent unsupported configurations such as a bridge port
+ * which is a bonding interface, and the member ports are from
+ * different hardware switches.
+ */
+ if (!netdev_phys_item_id_same(&p->ppid, &ppid)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Same bridge port cannot be offloaded by two physical switches");
+ return -EBUSY;
+ }
+
+ /* Tolerate drivers that call switchdev_bridge_port_offload()
+ * more than once for the same bridge port, such as when the
+ * bridge port is an offloaded bonding/team interface.
+ */
+ p->offload_count++;
+
+ return 0;
+ }
+
+ p->ppid = ppid;
+ p->offload_count = 1;
+
+ err = nbp_switchdev_hwdom_set(p);
+ if (err)
+ return err;
+
+ if (tx_fwd_offload) {
+ p->flags |= BR_TX_FWD_OFFLOAD;
+ static_branch_inc(&br_switchdev_tx_fwd_offload);
+ }
+
+ return 0;
+}
+
+static void nbp_switchdev_del(struct net_bridge_port *p)
+{
+ if (WARN_ON(!p->offload_count))
+ return;
+
+ p->offload_count--;
+
+ if (p->offload_count)
+ return;
+
+ if (p->hwdom)
+ nbp_switchdev_hwdom_put(p);
+
+ if (p->flags & BR_TX_FWD_OFFLOAD) {
+ p->flags &= ~BR_TX_FWD_OFFLOAD;
+ static_branch_dec(&br_switchdev_tx_fwd_offload);
+ }
+}
+
+static int
+br_switchdev_fdb_replay_one(struct net_bridge *br, struct notifier_block *nb,
+ const struct net_bridge_fdb_entry *fdb,
+ unsigned long action, const void *ctx)
+{
+ struct switchdev_notifier_fdb_info item;
+ int err;
+
+ br_switchdev_fdb_populate(br, &item, fdb, ctx);
+
+ err = nb->notifier_call(nb, action, &item);
+ return notifier_to_errno(err);
+}
+
+static int
+br_switchdev_fdb_replay(const struct net_device *br_dev, const void *ctx,
+ bool adding, struct notifier_block *nb)
+{
+ struct net_bridge_fdb_entry *fdb;
+ struct net_bridge *br;
+ unsigned long action;
+ int err = 0;
+
+ if (!nb)
+ return 0;
+
+ if (!netif_is_bridge_master(br_dev))
+ return -EINVAL;
+
+ br = netdev_priv(br_dev);
+
+ if (adding)
+ action = SWITCHDEV_FDB_ADD_TO_DEVICE;
+ else
+ action = SWITCHDEV_FDB_DEL_TO_DEVICE;
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(fdb, &br->fdb_list, fdb_node) {
+ err = br_switchdev_fdb_replay_one(br, nb, fdb, action, ctx);
+ if (err)
+ break;
+ }
+
+ rcu_read_unlock();
+
+ return err;
+}
+
+static int br_switchdev_vlan_attr_replay(struct net_device *br_dev,
+ const void *ctx,
+ struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ struct switchdev_notifier_port_attr_info attr_info = {
+ .info = {
+ .dev = br_dev,
+ .extack = extack,
+ .ctx = ctx,
+ },
+ };
+ struct net_bridge *br = netdev_priv(br_dev);
+ struct net_bridge_vlan_group *vg;
+ struct switchdev_attr attr;
+ struct net_bridge_vlan *v;
+ int err;
+
+ attr_info.attr = &attr;
+ attr.orig_dev = br_dev;
+
+ vg = br_vlan_group(br);
+ if (!vg)
+ return 0;
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (v->msti) {
+ attr.id = SWITCHDEV_ATTR_ID_VLAN_MSTI;
+ attr.u.vlan_msti.vid = v->vid;
+ attr.u.vlan_msti.msti = v->msti;
+
+ err = nb->notifier_call(nb, SWITCHDEV_PORT_ATTR_SET,
+ &attr_info);
+ err = notifier_to_errno(err);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int
+br_switchdev_vlan_replay_one(struct notifier_block *nb,
+ struct net_device *dev,
+ struct switchdev_obj_port_vlan *vlan,
+ const void *ctx, unsigned long action,
+ struct netlink_ext_ack *extack)
+{
+ struct switchdev_notifier_port_obj_info obj_info = {
+ .info = {
+ .dev = dev,
+ .extack = extack,
+ .ctx = ctx,
+ },
+ .obj = &vlan->obj,
+ };
+ int err;
+
+ err = nb->notifier_call(nb, action, &obj_info);
+ return notifier_to_errno(err);
+}
+
+static int br_switchdev_vlan_replay_group(struct notifier_block *nb,
+ struct net_device *dev,
+ struct net_bridge_vlan_group *vg,
+ const void *ctx, unsigned long action,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan *v;
+ int err = 0;
+ u16 pvid;
+
+ if (!vg)
+ return 0;
+
+ pvid = br_get_pvid(vg);
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ struct switchdev_obj_port_vlan vlan = {
+ .obj.orig_dev = dev,
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .flags = br_vlan_flags(v, pvid),
+ .vid = v->vid,
+ };
+
+ if (!br_vlan_should_use(v))
+ continue;
+
+ err = br_switchdev_vlan_replay_one(nb, dev, &vlan, ctx,
+ action, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int br_switchdev_vlan_replay(struct net_device *br_dev,
+ const void *ctx, bool adding,
+ struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge *br = netdev_priv(br_dev);
+ struct net_bridge_port *p;
+ unsigned long action;
+ int err;
+
+ ASSERT_RTNL();
+
+ if (!nb)
+ return 0;
+
+ if (!netif_is_bridge_master(br_dev))
+ return -EINVAL;
+
+ if (adding)
+ action = SWITCHDEV_PORT_OBJ_ADD;
+ else
+ action = SWITCHDEV_PORT_OBJ_DEL;
+
+ err = br_switchdev_vlan_replay_group(nb, br_dev, br_vlan_group(br),
+ ctx, action, extack);
+ if (err)
+ return err;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ struct net_device *dev = p->dev;
+
+ err = br_switchdev_vlan_replay_group(nb, dev,
+ nbp_vlan_group(p),
+ ctx, action, extack);
+ if (err)
+ return err;
+ }
+
+ if (adding) {
+ err = br_switchdev_vlan_attr_replay(br_dev, ctx, nb, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+struct br_switchdev_mdb_complete_info {
+ struct net_bridge_port *port;
+ struct br_ip ip;
+};
+
+static void br_switchdev_mdb_complete(struct net_device *dev, int err, void *priv)
+{
+ struct br_switchdev_mdb_complete_info *data = priv;
+ struct net_bridge_port_group __rcu **pp;
+ struct net_bridge_port_group *p;
+ struct net_bridge_mdb_entry *mp;
+ struct net_bridge_port *port = data->port;
+ struct net_bridge *br = port->br;
+ u8 old_flags;
+
+ if (err == -EOPNOTSUPP)
+ goto out_free;
+
+ spin_lock_bh(&br->multicast_lock);
+ mp = br_mdb_ip_get(br, &data->ip);
+ if (!mp)
+ goto out;
+ for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;
+ pp = &p->next) {
+ if (p->key.port != port)
+ continue;
+
+ old_flags = p->flags;
+ br_multicast_set_pg_offload_flags(p, !err);
+ if (br_mdb_should_notify(br, old_flags ^ p->flags))
+ br_mdb_flag_change_notify(br->dev, mp, p);
+ }
+out:
+ spin_unlock_bh(&br->multicast_lock);
+out_free:
+ kfree(priv);
+}
+
+static void br_switchdev_mdb_populate(struct switchdev_obj_port_mdb *mdb,
+ const struct net_bridge_mdb_entry *mp)
+{
+ if (mp->addr.proto == htons(ETH_P_IP))
+ ip_eth_mc_map(mp->addr.dst.ip4, mdb->addr);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (mp->addr.proto == htons(ETH_P_IPV6))
+ ipv6_eth_mc_map(&mp->addr.dst.ip6, mdb->addr);
+#endif
+ else
+ ether_addr_copy(mdb->addr, mp->addr.dst.mac_addr);
+
+ mdb->vid = mp->addr.vid;
+}
+
+static void br_switchdev_host_mdb_one(struct net_device *dev,
+ struct net_device *lower_dev,
+ struct net_bridge_mdb_entry *mp,
+ int type)
+{
+ struct switchdev_obj_port_mdb mdb = {
+ .obj = {
+ .id = SWITCHDEV_OBJ_ID_HOST_MDB,
+ .flags = SWITCHDEV_F_DEFER,
+ .orig_dev = dev,
+ },
+ };
+
+ br_switchdev_mdb_populate(&mdb, mp);
+
+ switch (type) {
+ case RTM_NEWMDB:
+ switchdev_port_obj_add(lower_dev, &mdb.obj, NULL);
+ break;
+ case RTM_DELMDB:
+ switchdev_port_obj_del(lower_dev, &mdb.obj);
+ break;
+ }
+}
+
+static void br_switchdev_host_mdb(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp, int type)
+{
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter)
+ br_switchdev_host_mdb_one(dev, lower_dev, mp, type);
+}
+
+static int
+br_switchdev_mdb_replay_one(struct notifier_block *nb, struct net_device *dev,
+ const struct switchdev_obj_port_mdb *mdb,
+ unsigned long action, const void *ctx,
+ struct netlink_ext_ack *extack)
+{
+ struct switchdev_notifier_port_obj_info obj_info = {
+ .info = {
+ .dev = dev,
+ .extack = extack,
+ .ctx = ctx,
+ },
+ .obj = &mdb->obj,
+ };
+ int err;
+
+ err = nb->notifier_call(nb, action, &obj_info);
+ return notifier_to_errno(err);
+}
+
+static int br_switchdev_mdb_queue_one(struct list_head *mdb_list,
+ struct net_device *dev,
+ unsigned long action,
+ enum switchdev_obj_id id,
+ const struct net_bridge_mdb_entry *mp,
+ struct net_device *orig_dev)
+{
+ struct switchdev_obj_port_mdb mdb = {
+ .obj = {
+ .id = id,
+ .orig_dev = orig_dev,
+ },
+ };
+ struct switchdev_obj_port_mdb *pmdb;
+
+ br_switchdev_mdb_populate(&mdb, mp);
+
+ if (action == SWITCHDEV_PORT_OBJ_ADD &&
+ switchdev_port_obj_act_is_deferred(dev, action, &mdb.obj)) {
+ /* This event is already in the deferred queue of
+ * events, so this replay must be elided, lest the
+ * driver receives duplicate events for it. This can
+ * only happen when replaying additions, since
+ * modifications are always immediately visible in
+ * br->mdb_list, whereas actual event delivery may be
+ * delayed.
+ */
+ return 0;
+ }
+
+ pmdb = kmemdup(&mdb, sizeof(mdb), GFP_ATOMIC);
+ if (!pmdb)
+ return -ENOMEM;
+
+ list_add_tail(&pmdb->obj.list, mdb_list);
+ return 0;
+}
+
+void br_switchdev_mdb_notify(struct net_device *dev,
+ struct net_bridge_mdb_entry *mp,
+ struct net_bridge_port_group *pg,
+ int type)
+{
+ struct br_switchdev_mdb_complete_info *complete_info;
+ struct switchdev_obj_port_mdb mdb = {
+ .obj = {
+ .id = SWITCHDEV_OBJ_ID_PORT_MDB,
+ .flags = SWITCHDEV_F_DEFER,
+ },
+ };
+
+ if (!pg)
+ return br_switchdev_host_mdb(dev, mp, type);
+
+ br_switchdev_mdb_populate(&mdb, mp);
+
+ mdb.obj.orig_dev = pg->key.port->dev;
+ switch (type) {
+ case RTM_NEWMDB:
+ complete_info = kmalloc(sizeof(*complete_info), GFP_ATOMIC);
+ if (!complete_info)
+ break;
+ complete_info->port = pg->key.port;
+ complete_info->ip = mp->addr;
+ mdb.obj.complete_priv = complete_info;
+ mdb.obj.complete = br_switchdev_mdb_complete;
+ if (switchdev_port_obj_add(pg->key.port->dev, &mdb.obj, NULL))
+ kfree(complete_info);
+ break;
+ case RTM_DELMDB:
+ switchdev_port_obj_del(pg->key.port->dev, &mdb.obj);
+ break;
+ }
+}
+#endif
+
+static int
+br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev,
+ const void *ctx, bool adding, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
+{
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ const struct net_bridge_mdb_entry *mp;
+ struct switchdev_obj *obj, *tmp;
+ struct net_bridge *br;
+ unsigned long action;
+ LIST_HEAD(mdb_list);
+ int err = 0;
+
+ ASSERT_RTNL();
+
+ if (!nb)
+ return 0;
+
+ if (!netif_is_bridge_master(br_dev) || !netif_is_bridge_port(dev))
+ return -EINVAL;
+
+ br = netdev_priv(br_dev);
+
+ if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
+ return 0;
+
+ if (adding)
+ action = SWITCHDEV_PORT_OBJ_ADD;
+ else
+ action = SWITCHDEV_PORT_OBJ_DEL;
+
+ /* br_switchdev_mdb_queue_one() will take care to not queue a
+ * replay of an event that is already pending in the switchdev
+ * deferred queue. In order to safely determine that, there
+ * must be no new deferred MDB notifications enqueued for the
+ * duration of the MDB scan. Therefore, grab the write-side
+ * lock to avoid racing with any concurrent IGMP/MLD snooping.
+ */
+ spin_lock_bh(&br->multicast_lock);
+
+ hlist_for_each_entry(mp, &br->mdb_list, mdb_node) {
+ struct net_bridge_port_group __rcu * const *pp;
+ const struct net_bridge_port_group *p;
+
+ if (mp->host_joined) {
+ err = br_switchdev_mdb_queue_one(&mdb_list, dev, action,
+ SWITCHDEV_OBJ_ID_HOST_MDB,
+ mp, br_dev);
+ if (err) {
+ spin_unlock_bh(&br->multicast_lock);
+ goto out_free_mdb;
+ }
+ }
+
+ for (pp = &mp->ports; (p = mlock_dereference(*pp, br)) != NULL;
+ pp = &p->next) {
+ if (p->key.port->dev != dev)
+ continue;
+
+ err = br_switchdev_mdb_queue_one(&mdb_list, dev, action,
+ SWITCHDEV_OBJ_ID_PORT_MDB,
+ mp, dev);
+ if (err) {
+ spin_unlock_bh(&br->multicast_lock);
+ goto out_free_mdb;
+ }
+ }
+ }
+
+ spin_unlock_bh(&br->multicast_lock);
+
+ list_for_each_entry(obj, &mdb_list, list) {
+ err = br_switchdev_mdb_replay_one(nb, dev,
+ SWITCHDEV_OBJ_PORT_MDB(obj),
+ action, ctx, extack);
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ if (err)
+ goto out_free_mdb;
+ }
+
+out_free_mdb:
+ list_for_each_entry_safe(obj, tmp, &mdb_list, list) {
+ list_del(&obj->list);
+ kfree(SWITCHDEV_OBJ_PORT_MDB(obj));
+ }
+
+ if (err)
+ return err;
+#endif
+
+ return 0;
+}
+
+static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *br_dev = p->br->dev;
+ struct net_device *dev = p->dev;
+ int err;
+
+ err = br_switchdev_vlan_replay(br_dev, ctx, true, blocking_nb, extack);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ err = br_switchdev_mdb_replay(br_dev, dev, ctx, true, blocking_nb,
+ extack);
+ if (err) {
+ /* -EOPNOTSUPP not propagated from MDB replay. */
+ return err;
+ }
+
+ err = br_switchdev_fdb_replay(br_dev, ctx, true, atomic_nb);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ return 0;
+}
+
+static void nbp_switchdev_unsync_objs(struct net_bridge_port *p,
+ const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb)
+{
+ struct net_device *br_dev = p->br->dev;
+ struct net_device *dev = p->dev;
+
+ br_switchdev_fdb_replay(br_dev, ctx, false, atomic_nb);
+
+ br_switchdev_mdb_replay(br_dev, dev, ctx, false, blocking_nb, NULL);
+
+ br_switchdev_vlan_replay(br_dev, ctx, false, blocking_nb, NULL);
+
+ /* Make sure that the device leaving this bridge has seen all
+ * relevant events before it is disassociated. In the normal
+ * case, when the device is directly attached to the bridge,
+ * this is covered by del_nbp(). If the association was indirect
+ * however, e.g. via a team or bond, and the device is leaving
+ * that intermediate device, then the bridge port remains in
+ * place.
+ */
+ switchdev_deferred_process();
+}
+
+/* Let the bridge know that this port is offloaded, so that it can assign a
+ * switchdev hardware domain to it.
+ */
+int br_switchdev_port_offload(struct net_bridge_port *p,
+ struct net_device *dev, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb,
+ bool tx_fwd_offload,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_phys_item_id ppid;
+ int err;
+
+ err = netif_get_port_parent_id(dev, &ppid, false);
+ if (err)
+ return err;
+
+ err = nbp_switchdev_add(p, ppid, tx_fwd_offload, extack);
+ if (err)
+ return err;
+
+ err = nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack);
+ if (err)
+ goto out_switchdev_del;
+
+ return 0;
+
+out_switchdev_del:
+ nbp_switchdev_del(p);
+
+ return err;
+}
+
+void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb)
+{
+ nbp_switchdev_unsync_objs(p, ctx, atomic_nb, blocking_nb);
+
+ nbp_switchdev_del(p);
+}
+
+int br_switchdev_port_replay(struct net_bridge_port *p,
+ struct net_device *dev, const void *ctx,
+ struct notifier_block *atomic_nb,
+ struct notifier_block *blocking_nb,
+ struct netlink_ext_ack *extack)
+{
+ return nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack);
+}
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 0318a69888d4..cb4855ed9500 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Sysfs attributes of bridge
* Linux ethernet bridge
*
* Authors:
* Stephen Hemminger <shemminger@osdl.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/capability.h>
@@ -23,6 +19,10 @@
#include "br_private.h"
+/* IMPORTANT: new bridge options must be added with netlink support only
+ * please do not add new sysfs entries
+ */
+
#define to_bridge(cd) ((struct net_bridge *)netdev_priv(to_net_dev(cd)))
/*
@@ -30,26 +30,33 @@
*/
static ssize_t store_bridge_parm(struct device *d,
const char *buf, size_t len,
- int (*set)(struct net_bridge *, unsigned long))
+ int (*set)(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack))
{
struct net_bridge *br = to_bridge(d);
- char *endp;
+ struct netlink_ext_ack extack = {0};
unsigned long val;
int err;
if (!ns_capable(dev_net(br->dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- val = simple_strtoul(buf, &endp, 0);
- if (endp == buf)
- return -EINVAL;
+ err = kstrtoul(buf, 0, &val);
+ if (err != 0)
+ return err;
if (!rtnl_trylock())
return restart_syscall();
- err = (*set)(br, val);
+ err = (*set)(br, val, &extack);
if (!err)
netdev_state_change(br->dev);
+ if (extack._msg) {
+ if (err)
+ br_err(br, "%s\n", extack._msg);
+ else
+ br_warn(br, "%s\n", extack._msg);
+ }
rtnl_unlock();
return err ? err : len;
@@ -63,11 +70,17 @@ static ssize_t forward_delay_show(struct device *d,
return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay));
}
+static int set_forward_delay(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_set_forward_delay(br, val);
+}
+
static ssize_t forward_delay_store(struct device *d,
struct device_attribute *attr,
const char *buf, size_t len)
{
- return store_bridge_parm(d, buf, len, br_set_forward_delay);
+ return store_bridge_parm(d, buf, len, set_forward_delay);
}
static DEVICE_ATTR_RW(forward_delay);
@@ -78,11 +91,17 @@ static ssize_t hello_time_show(struct device *d, struct device_attribute *attr,
jiffies_to_clock_t(to_bridge(d)->hello_time));
}
+static int set_hello_time(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_set_hello_time(br, val);
+}
+
static ssize_t hello_time_store(struct device *d,
struct device_attribute *attr, const char *buf,
size_t len)
{
- return store_bridge_parm(d, buf, len, br_set_hello_time);
+ return store_bridge_parm(d, buf, len, set_hello_time);
}
static DEVICE_ATTR_RW(hello_time);
@@ -93,10 +112,16 @@ static ssize_t max_age_show(struct device *d, struct device_attribute *attr,
jiffies_to_clock_t(to_bridge(d)->max_age));
}
+static int set_max_age(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_set_max_age(br, val);
+}
+
static ssize_t max_age_store(struct device *d, struct device_attribute *attr,
const char *buf, size_t len)
{
- return store_bridge_parm(d, buf, len, br_set_max_age);
+ return store_bridge_parm(d, buf, len, set_max_age);
}
static DEVICE_ATTR_RW(max_age);
@@ -107,7 +132,8 @@ static ssize_t ageing_time_show(struct device *d,
return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time));
}
-static int set_ageing_time(struct net_bridge *br, unsigned long val)
+static int set_ageing_time(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
return br_set_ageing_time(br, val);
}
@@ -128,11 +154,10 @@ static ssize_t stp_state_show(struct device *d,
}
-static int set_stp_state(struct net_bridge *br, unsigned long val)
+static int set_stp_state(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br_stp_set_enabled(br, val);
-
- return 0;
+ return br_stp_set_enabled(br, val, extack);
}
static ssize_t stp_state_store(struct device *d,
@@ -151,7 +176,8 @@ static ssize_t group_fwd_mask_show(struct device *d,
return sprintf(buf, "%#x\n", br->group_fwd_mask);
}
-static int set_group_fwd_mask(struct net_bridge *br, unsigned long val)
+static int set_group_fwd_mask(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
if (val & BR_GROUPFWD_RESTRICTED)
return -EINVAL;
@@ -178,7 +204,8 @@ static ssize_t priority_show(struct device *d, struct device_attribute *attr,
(br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]);
}
-static int set_priority(struct net_bridge *br, unsigned long val)
+static int set_priority(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
br_stp_set_bridge_priority(br, (u16) val);
return 0;
@@ -303,7 +330,7 @@ static ssize_t group_addr_store(struct device *d,
ether_addr_copy(br->group_addr, new_addr);
spin_unlock_bh(&br->lock);
- br->group_addr_set = true;
+ br_opt_toggle(br, BROPT_GROUP_ADDR_SET, true);
br_recalculate_fwd_mask(br);
netdev_state_change(br->dev);
@@ -314,9 +341,14 @@ static ssize_t group_addr_store(struct device *d,
static DEVICE_ATTR_RW(group_addr);
-static int set_flush(struct net_bridge *br, unsigned long val)
+static int set_flush(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br_fdb_flush(br);
+ struct net_bridge_fdb_flush_desc desc = {
+ .flags_mask = BIT(BR_FDB_STATIC)
+ };
+
+ br_fdb_flush(br, &desc);
return 0;
}
@@ -328,19 +360,47 @@ static ssize_t flush_store(struct device *d,
}
static DEVICE_ATTR_WO(flush);
+static ssize_t no_linklocal_learn_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN));
+}
+
+static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_boolopt_toggle(br, BR_BOOLOPT_NO_LL_LEARN, !!val, extack);
+}
+
+static ssize_t no_linklocal_learn_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_no_linklocal_learn);
+}
+static DEVICE_ATTR_RW(no_linklocal_learn);
+
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
static ssize_t multicast_router_show(struct device *d,
struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", br->multicast_router);
+ return sprintf(buf, "%d\n", br->multicast_ctx.multicast_router);
+}
+
+static int set_multicast_router(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_multicast_set_router(&br->multicast_ctx, val);
}
static ssize_t multicast_router_store(struct device *d,
struct device_attribute *attr,
const char *buf, size_t len)
{
- return store_bridge_parm(d, buf, len, br_multicast_set_router);
+ return store_bridge_parm(d, buf, len, set_multicast_router);
}
static DEVICE_ATTR_RW(multicast_router);
@@ -349,7 +409,7 @@ static ssize_t multicast_snooping_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", !br->multicast_disabled);
+ return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED));
}
static ssize_t multicast_snooping_store(struct device *d,
@@ -365,12 +425,14 @@ static ssize_t multicast_query_use_ifaddr_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", br->multicast_query_use_ifaddr);
+ return sprintf(buf, "%d\n",
+ br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR));
}
-static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val)
+static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br->multicast_query_use_ifaddr = !!val;
+ br_opt_toggle(br, BROPT_MULTICAST_QUERY_USE_IFADDR, !!val);
return 0;
}
@@ -388,27 +450,35 @@ static ssize_t multicast_querier_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", br->multicast_querier);
+ return sprintf(buf, "%d\n", br->multicast_ctx.multicast_querier);
+}
+
+static int set_multicast_querier(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_multicast_set_querier(&br->multicast_ctx, val);
}
static ssize_t multicast_querier_store(struct device *d,
struct device_attribute *attr,
const char *buf, size_t len)
{
- return store_bridge_parm(d, buf, len, br_multicast_set_querier);
+ return store_bridge_parm(d, buf, len, set_multicast_querier);
}
static DEVICE_ATTR_RW(multicast_querier);
static ssize_t hash_elasticity_show(struct device *d,
struct device_attribute *attr, char *buf)
{
- struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->hash_elasticity);
+ return sprintf(buf, "%u\n", RHT_ELASTICITY);
}
-static int set_elasticity(struct net_bridge *br, unsigned long val)
+static int set_elasticity(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br->hash_elasticity = val;
+ /* 16 is RHT_ELASTICITY */
+ NL_SET_ERR_MSG_MOD(extack,
+ "the hash_elasticity option has been deprecated and is always 16");
return 0;
}
@@ -427,10 +497,17 @@ static ssize_t hash_max_show(struct device *d, struct device_attribute *attr,
return sprintf(buf, "%u\n", br->hash_max);
}
+static int set_hash_max(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ br->hash_max = val;
+ return 0;
+}
+
static ssize_t hash_max_store(struct device *d, struct device_attribute *attr,
const char *buf, size_t len)
{
- return store_bridge_parm(d, buf, len, br_multicast_set_hash_max);
+ return store_bridge_parm(d, buf, len, set_hash_max);
}
static DEVICE_ATTR_RW(hash_max);
@@ -440,14 +517,20 @@ static ssize_t multicast_igmp_version_show(struct device *d,
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->multicast_igmp_version);
+ return sprintf(buf, "%u\n", br->multicast_ctx.multicast_igmp_version);
+}
+
+static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_multicast_set_igmp_version(&br->multicast_ctx, val);
}
static ssize_t multicast_igmp_version_store(struct device *d,
struct device_attribute *attr,
const char *buf, size_t len)
{
- return store_bridge_parm(d, buf, len, br_multicast_set_igmp_version);
+ return store_bridge_parm(d, buf, len, set_multicast_igmp_version);
}
static DEVICE_ATTR_RW(multicast_igmp_version);
@@ -456,12 +539,13 @@ static ssize_t multicast_last_member_count_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->multicast_last_member_count);
+ return sprintf(buf, "%u\n", br->multicast_ctx.multicast_last_member_count);
}
-static int set_last_member_count(struct net_bridge *br, unsigned long val)
+static int set_last_member_count(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br->multicast_last_member_count = val;
+ br->multicast_ctx.multicast_last_member_count = val;
return 0;
}
@@ -477,12 +561,13 @@ static ssize_t multicast_startup_query_count_show(
struct device *d, struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->multicast_startup_query_count);
+ return sprintf(buf, "%u\n", br->multicast_ctx.multicast_startup_query_count);
}
-static int set_startup_query_count(struct net_bridge *br, unsigned long val)
+static int set_startup_query_count(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br->multicast_startup_query_count = val;
+ br->multicast_ctx.multicast_startup_query_count = val;
return 0;
}
@@ -499,12 +584,13 @@ static ssize_t multicast_last_member_interval_show(
{
struct net_bridge *br = to_bridge(d);
return sprintf(buf, "%lu\n",
- jiffies_to_clock_t(br->multicast_last_member_interval));
+ jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval));
}
-static int set_last_member_interval(struct net_bridge *br, unsigned long val)
+static int set_last_member_interval(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br->multicast_last_member_interval = clock_t_to_jiffies(val);
+ br->multicast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val);
return 0;
}
@@ -521,12 +607,13 @@ static ssize_t multicast_membership_interval_show(
{
struct net_bridge *br = to_bridge(d);
return sprintf(buf, "%lu\n",
- jiffies_to_clock_t(br->multicast_membership_interval));
+ jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval));
}
-static int set_membership_interval(struct net_bridge *br, unsigned long val)
+static int set_membership_interval(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br->multicast_membership_interval = clock_t_to_jiffies(val);
+ br->multicast_ctx.multicast_membership_interval = clock_t_to_jiffies(val);
return 0;
}
@@ -544,12 +631,13 @@ static ssize_t multicast_querier_interval_show(struct device *d,
{
struct net_bridge *br = to_bridge(d);
return sprintf(buf, "%lu\n",
- jiffies_to_clock_t(br->multicast_querier_interval));
+ jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval));
}
-static int set_querier_interval(struct net_bridge *br, unsigned long val)
+static int set_querier_interval(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br->multicast_querier_interval = clock_t_to_jiffies(val);
+ br->multicast_ctx.multicast_querier_interval = clock_t_to_jiffies(val);
return 0;
}
@@ -567,12 +655,13 @@ static ssize_t multicast_query_interval_show(struct device *d,
{
struct net_bridge *br = to_bridge(d);
return sprintf(buf, "%lu\n",
- jiffies_to_clock_t(br->multicast_query_interval));
+ jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval));
}
-static int set_query_interval(struct net_bridge *br, unsigned long val)
+static int set_query_interval(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br->multicast_query_interval = clock_t_to_jiffies(val);
+ br_multicast_set_query_intvl(&br->multicast_ctx, val);
return 0;
}
@@ -590,12 +679,13 @@ static ssize_t multicast_query_response_interval_show(
struct net_bridge *br = to_bridge(d);
return sprintf(
buf, "%lu\n",
- jiffies_to_clock_t(br->multicast_query_response_interval));
+ jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval));
}
-static int set_query_response_interval(struct net_bridge *br, unsigned long val)
+static int set_query_response_interval(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br->multicast_query_response_interval = clock_t_to_jiffies(val);
+ br->multicast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val);
return 0;
}
@@ -613,12 +703,13 @@ static ssize_t multicast_startup_query_interval_show(
struct net_bridge *br = to_bridge(d);
return sprintf(
buf, "%lu\n",
- jiffies_to_clock_t(br->multicast_startup_query_interval));
+ jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval));
}
-static int set_startup_query_interval(struct net_bridge *br, unsigned long val)
+static int set_startup_query_interval(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br->multicast_startup_query_interval = clock_t_to_jiffies(val);
+ br_multicast_set_startup_query_intvl(&br->multicast_ctx, val);
return 0;
}
@@ -636,12 +727,14 @@ static ssize_t multicast_stats_enabled_show(struct device *d,
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->multicast_stats_enabled);
+ return sprintf(buf, "%d\n",
+ br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED));
}
-static int set_stats_enabled(struct net_bridge *br, unsigned long val)
+static int set_stats_enabled(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br->multicast_stats_enabled = !!val;
+ br_opt_toggle(br, BROPT_MULTICAST_STATS_ENABLED, !!val);
return 0;
}
@@ -661,14 +754,20 @@ static ssize_t multicast_mld_version_show(struct device *d,
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->multicast_mld_version);
+ return sprintf(buf, "%u\n", br->multicast_ctx.multicast_mld_version);
+}
+
+static int set_multicast_mld_version(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_multicast_set_mld_version(&br->multicast_ctx, val);
}
static ssize_t multicast_mld_version_store(struct device *d,
struct device_attribute *attr,
const char *buf, size_t len)
{
- return store_bridge_parm(d, buf, len, br_multicast_set_mld_version);
+ return store_bridge_parm(d, buf, len, set_multicast_mld_version);
}
static DEVICE_ATTR_RW(multicast_mld_version);
#endif
@@ -678,12 +777,13 @@ static ssize_t nf_call_iptables_show(
struct device *d, struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->nf_call_iptables);
+ return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES));
}
-static int set_nf_call_iptables(struct net_bridge *br, unsigned long val)
+static int set_nf_call_iptables(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br->nf_call_iptables = val ? true : false;
+ br_opt_toggle(br, BROPT_NF_CALL_IPTABLES, !!val);
return 0;
}
@@ -699,12 +799,13 @@ static ssize_t nf_call_ip6tables_show(
struct device *d, struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->nf_call_ip6tables);
+ return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES));
}
-static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val)
+static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br->nf_call_ip6tables = val ? true : false;
+ br_opt_toggle(br, BROPT_NF_CALL_IP6TABLES, !!val);
return 0;
}
@@ -720,12 +821,13 @@ static ssize_t nf_call_arptables_show(
struct device *d, struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->nf_call_arptables);
+ return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES));
}
-static int set_nf_call_arptables(struct net_bridge *br, unsigned long val)
+static int set_nf_call_arptables(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- br->nf_call_arptables = val ? true : false;
+ br_opt_toggle(br, BROPT_NF_CALL_ARPTABLES, !!val);
return 0;
}
@@ -743,7 +845,7 @@ static ssize_t vlan_filtering_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", br->vlan_enabled);
+ return sprintf(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED));
}
static ssize_t vlan_filtering_store(struct device *d,
@@ -791,16 +893,44 @@ static ssize_t vlan_stats_enabled_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->vlan_stats_enabled);
+ return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED));
+}
+
+static int set_vlan_stats_enabled(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_vlan_set_stats(br, val);
}
static ssize_t vlan_stats_enabled_store(struct device *d,
struct device_attribute *attr,
const char *buf, size_t len)
{
- return store_bridge_parm(d, buf, len, br_vlan_set_stats);
+ return store_bridge_parm(d, buf, len, set_vlan_stats_enabled);
}
static DEVICE_ATTR_RW(vlan_stats_enabled);
+
+static ssize_t vlan_stats_per_port_show(struct device *d,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct net_bridge *br = to_bridge(d);
+ return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT));
+}
+
+static int set_vlan_stats_per_port(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
+{
+ return br_vlan_set_stats_per_port(br, val);
+}
+
+static ssize_t vlan_stats_per_port_store(struct device *d,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_vlan_stats_per_port);
+}
+static DEVICE_ATTR_RW(vlan_stats_per_port);
#endif
static struct attribute *bridge_attrs[] = {
@@ -823,6 +953,7 @@ static struct attribute *bridge_attrs[] = {
&dev_attr_gc_timer.attr,
&dev_attr_group_addr.attr,
&dev_attr_flush.attr,
+ &dev_attr_no_linklocal_learn.attr,
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
&dev_attr_multicast_router.attr,
&dev_attr_multicast_snooping.attr,
@@ -854,6 +985,7 @@ static struct attribute *bridge_attrs[] = {
&dev_attr_vlan_protocol.attr,
&dev_attr_default_pvid.attr,
&dev_attr_vlan_stats_enabled.attr,
+ &dev_attr_vlan_stats_per_port.attr,
#endif
NULL
};
@@ -870,7 +1002,7 @@ static const struct attribute_group bridge_group = {
* Returns the number of bytes read.
*/
static ssize_t brforward_read(struct file *filp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
+ const struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t count)
{
struct device *dev = kobj_to_dev(kobj);
@@ -891,7 +1023,7 @@ static ssize_t brforward_read(struct file *filp, struct kobject *kobj,
return n;
}
-static struct bin_attribute bridge_forward = {
+static const struct bin_attribute bridge_forward = {
.attr = { .name = SYSFS_BRIDGE_FDB,
.mode = 0444, },
.read = brforward_read,
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 7c87a2fe5248..74fdd8105dca 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Sysfs attributes of bridge ports
* Linux ethernet bridge
*
* Authors:
* Stephen Hemminger <shemminger@osdl.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/capability.h>
@@ -21,6 +17,10 @@
#include "br_private.h"
+/* IMPORTANT: new bridge port options must be added with netlink support only
+ * please do not add new sysfs entries
+ */
+
struct brport_attribute {
struct attribute attr;
ssize_t (*show)(struct net_bridge_port *, char *);
@@ -59,9 +59,9 @@ static BRPORT_ATTR(_name, 0644, \
static int store_flag(struct net_bridge_port *p, unsigned long v,
unsigned long mask)
{
- unsigned long flags;
-
- flags = p->flags;
+ struct netlink_ext_ack extack = {0};
+ unsigned long flags = p->flags;
+ int err;
if (v)
flags |= mask;
@@ -69,6 +69,12 @@ static int store_flag(struct net_bridge_port *p, unsigned long v,
flags &= ~mask;
if (flags != p->flags) {
+ err = br_switchdev_set_port_flag(p, flags, mask, &extack);
+ if (err) {
+ netdev_err(p->dev, "%s\n", extack._msg);
+ return err;
+ }
+
p->flags = flags;
br_port_flags_change(p, mask);
}
@@ -238,13 +244,13 @@ BRPORT_ATTR_FLAG(isolated, BR_ISOLATED);
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
{
- return sprintf(buf, "%d\n", p->multicast_router);
+ return sprintf(buf, "%d\n", p->multicast_ctx.multicast_router);
}
static int store_multicast_router(struct net_bridge_port *p,
unsigned long v)
{
- return br_multicast_set_port_router(p, v);
+ return br_multicast_set_port_router(&p->multicast_ctx, v);
}
static BRPORT_ATTR(multicast_router, 0644, show_multicast_router,
store_multicast_router);
@@ -320,9 +326,6 @@ static ssize_t brport_store(struct kobject *kobj,
if (!rtnl_trylock())
return restart_syscall();
- if (!p->dev || !p->br)
- goto out_unlock;
-
if (brport_attr->store_raw) {
char *buf_copy;
@@ -381,7 +384,7 @@ int br_sysfs_addif(struct net_bridge_port *p)
return err;
}
- strlcpy(p->sysfs_name, p->dev->name, IFNAMSIZ);
+ strscpy(p->sysfs_name, p->dev->name, IFNAMSIZ);
return sysfs_create_link(br->ifobj, &p->kobj, p->sysfs_name);
}
@@ -403,7 +406,7 @@ int br_sysfs_renameif(struct net_bridge_port *p)
netdev_notice(br->dev, "unable to rename link %s to %s",
p->sysfs_name, p->dev->name);
else
- strlcpy(p->sysfs_name, p->dev->name, IFNAMSIZ);
+ strscpy(p->sysfs_name, p->dev->name, IFNAMSIZ);
return err;
}
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 7df269092103..ce72b837ff8e 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
@@ -7,6 +8,8 @@
#include "br_private.h"
#include "br_private_tunnel.h"
+static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid);
+
static inline int br_vlan_cmp(struct rhashtable_compare_arg *arg,
const void *ptr)
{
@@ -21,7 +24,6 @@ static const struct rhashtable_params br_vlan_rht_params = {
.key_offset = offsetof(struct net_bridge_vlan, vid),
.key_len = sizeof(u16),
.nelem_hint = 3,
- .locks_mul = 1,
.max_size = VLAN_N_VID,
.obj_cmpfn = br_vlan_cmp,
.automatic_shrinking = true,
@@ -32,64 +34,85 @@ static struct net_bridge_vlan *br_vlan_lookup(struct rhashtable *tbl, u16 vid)
return rhashtable_lookup_fast(tbl, &vid, br_vlan_rht_params);
}
-static bool __vlan_add_pvid(struct net_bridge_vlan_group *vg, u16 vid)
+static void __vlan_add_pvid(struct net_bridge_vlan_group *vg,
+ const struct net_bridge_vlan *v)
{
- if (vg->pvid == vid)
- return false;
+ if (vg->pvid == v->vid)
+ return;
smp_wmb();
- vg->pvid = vid;
-
- return true;
+ br_vlan_set_pvid_state(vg, v->state);
+ vg->pvid = v->vid;
}
-static bool __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid)
+static void __vlan_delete_pvid(struct net_bridge_vlan_group *vg, u16 vid)
{
if (vg->pvid != vid)
- return false;
+ return;
smp_wmb();
vg->pvid = 0;
-
- return true;
}
-/* return true if anything changed, false otherwise */
-static bool __vlan_add_flags(struct net_bridge_vlan *v, u16 flags)
+/* Update the BRIDGE_VLAN_INFO_PVID and BRIDGE_VLAN_INFO_UNTAGGED flags of @v.
+ * If @commit is false, return just whether the BRIDGE_VLAN_INFO_PVID and
+ * BRIDGE_VLAN_INFO_UNTAGGED bits of @flags would produce any change onto @v.
+ */
+static bool __vlan_flags_update(struct net_bridge_vlan *v, u16 flags,
+ bool commit)
{
struct net_bridge_vlan_group *vg;
- u16 old_flags = v->flags;
- bool ret;
+ bool change;
if (br_vlan_is_master(v))
vg = br_vlan_group(v->br);
else
vg = nbp_vlan_group(v->port);
+ /* check if anything would be changed on commit */
+ change = !!(flags & BRIDGE_VLAN_INFO_PVID) == !!(vg->pvid != v->vid) ||
+ ((flags ^ v->flags) & BRIDGE_VLAN_INFO_UNTAGGED);
+
+ if (!commit)
+ goto out;
+
if (flags & BRIDGE_VLAN_INFO_PVID)
- ret = __vlan_add_pvid(vg, v->vid);
+ __vlan_add_pvid(vg, v);
else
- ret = __vlan_delete_pvid(vg, v->vid);
+ __vlan_delete_pvid(vg, v->vid);
if (flags & BRIDGE_VLAN_INFO_UNTAGGED)
v->flags |= BRIDGE_VLAN_INFO_UNTAGGED;
else
v->flags &= ~BRIDGE_VLAN_INFO_UNTAGGED;
- return ret || !!(old_flags ^ v->flags);
+out:
+ return change;
+}
+
+static bool __vlan_flags_would_change(struct net_bridge_vlan *v, u16 flags)
+{
+ return __vlan_flags_update(v, flags, false);
+}
+
+static void __vlan_flags_commit(struct net_bridge_vlan *v, u16 flags)
+{
+ __vlan_flags_update(v, flags, true);
}
static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br,
- u16 vid, u16 flags)
+ struct net_bridge_vlan *v, u16 flags,
+ struct netlink_ext_ack *extack)
{
int err;
/* Try switchdev op first. In case it is not supported, fallback to
* 8021q add.
*/
- err = br_switchdev_port_vlan_add(dev, vid, flags);
+ err = br_switchdev_port_vlan_add(dev, v->vid, flags, false, extack);
if (err == -EOPNOTSUPP)
- return vlan_vid_add(dev, br->vlan_proto, vid);
+ return vlan_vid_add(dev, br->vlan_proto, v->vid);
+ v->priv_flags |= BR_VLFLAG_ADDED_BY_SWITCHDEV;
return err;
}
@@ -107,9 +130,7 @@ static void __vlan_add_list(struct net_bridge_vlan *v)
headp = &vg->vlan_list;
list_for_each_prev(hpos, headp) {
vent = list_entry(hpos, struct net_bridge_vlan, vlist);
- if (v->vid < vent->vid)
- continue;
- else
+ if (v->vid >= vent->vid)
break;
}
list_add_rcu(&v->vlist, hpos);
@@ -121,25 +142,25 @@ static void __vlan_del_list(struct net_bridge_vlan *v)
}
static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
- u16 vid)
+ const struct net_bridge_vlan *v)
{
int err;
/* Try switchdev op first. In case it is not supported, fallback to
* 8021q del.
*/
- err = br_switchdev_port_vlan_del(dev, vid);
- if (err == -EOPNOTSUPP) {
- vlan_vid_del(dev, br->vlan_proto, vid);
- return 0;
- }
- return err;
+ err = br_switchdev_port_vlan_del(dev, v->vid);
+ if (!(v->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV))
+ vlan_vid_del(dev, br->vlan_proto, v->vid);
+ return err == -EOPNOTSUPP ? 0 : err;
}
-/* Returns a master vlan, if it didn't exist it gets created. In all cases a
+/* Returns a master vlan, if it didn't exist it gets created. In all cases
* a reference is taken to the master vlan before returning.
*/
-static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid)
+static struct net_bridge_vlan *
+br_vlan_get_master(struct net_bridge *br, u16 vid,
+ struct netlink_ext_ack *extack)
{
struct net_bridge_vlan_group *vg;
struct net_bridge_vlan *masterv;
@@ -150,7 +171,7 @@ static struct net_bridge_vlan *br_vlan_get_master(struct net_bridge *br, u16 vid
bool changed;
/* missing global ctx, create it now */
- if (br_vlan_add(br, vid, 0, &changed))
+ if (br_vlan_add(br, vid, 0, &changed, extack))
return NULL;
masterv = br_vlan_find(vg, vid);
if (WARN_ON(!masterv))
@@ -186,10 +207,43 @@ static void br_vlan_put_master(struct net_bridge_vlan *masterv)
rhashtable_remove_fast(&vg->vlan_hash,
&masterv->vnode, br_vlan_rht_params);
__vlan_del_list(masterv);
+ br_multicast_toggle_one_vlan(masterv, false);
+ br_multicast_ctx_deinit(&masterv->br_mcast_ctx);
call_rcu(&masterv->rcu, br_master_vlan_rcu_free);
}
}
+static void nbp_vlan_rcu_free(struct rcu_head *rcu)
+{
+ struct net_bridge_vlan *v;
+
+ v = container_of(rcu, struct net_bridge_vlan, rcu);
+ WARN_ON(br_vlan_is_master(v));
+ /* if we had per-port stats configured then free them here */
+ if (v->priv_flags & BR_VLFLAG_PER_PORT_STATS)
+ free_percpu(v->stats);
+ v->stats = NULL;
+ kfree(v);
+}
+
+static void br_vlan_init_state(struct net_bridge_vlan *v)
+{
+ struct net_bridge *br;
+
+ if (br_vlan_is_master(v))
+ br = v->br;
+ else
+ br = v->port->br;
+
+ if (br_opt_get(br, BROPT_MST_ENABLED)) {
+ br_mst_vlan_init_state(v);
+ return;
+ }
+
+ v->state = BR_STATE_FORWARDING;
+ v->msti = 0;
+}
+
/* This is the shared VLAN add function which works for both ports and bridge
* devices. There are four possible calls to this function in terms of the
* vlan entry type:
@@ -201,7 +255,8 @@ static void br_vlan_put_master(struct net_bridge_vlan *masterv)
* 4. same as 3 but with both master and brentry flags set so the entry
* will be used for filtering in both the port and the bridge
*/
-static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
+static int __vlan_add(struct net_bridge_vlan *v, u16 flags,
+ struct netlink_ext_ack *extack)
{
struct net_bridge_vlan *masterv = NULL;
struct net_bridge_port *p = NULL;
@@ -226,7 +281,7 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
* This ensures tagged traffic enters the bridge when
* promiscuous mode is disabled by br_manage_promisc().
*/
- err = __vlan_vid_add(dev, br, v->vid, flags);
+ err = __vlan_vid_add(dev, br, v, flags, extack);
if (err)
goto out;
@@ -236,39 +291,70 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags)
err = br_vlan_add(br, v->vid,
flags | BRIDGE_VLAN_INFO_BRENTRY,
- &changed);
+ &changed, extack);
if (err)
goto out_filt;
+
+ if (changed)
+ br_vlan_notify(br, NULL, v->vid, 0,
+ RTM_NEWVLAN);
}
- masterv = br_vlan_get_master(br, v->vid);
- if (!masterv)
+ masterv = br_vlan_get_master(br, v->vid, extack);
+ if (!masterv) {
+ err = -ENOMEM;
goto out_filt;
+ }
v->brvlan = masterv;
- v->stats = masterv->stats;
+ if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) {
+ v->stats =
+ netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!v->stats) {
+ err = -ENOMEM;
+ goto out_filt;
+ }
+ v->priv_flags |= BR_VLFLAG_PER_PORT_STATS;
+ } else {
+ v->stats = masterv->stats;
+ }
+ br_multicast_port_ctx_init(p, v, &v->port_mcast_ctx);
} else {
- err = br_switchdev_port_vlan_add(dev, v->vid, flags);
- if (err && err != -EOPNOTSUPP)
- goto out;
+ if (br_vlan_should_use(v)) {
+ err = br_switchdev_port_vlan_add(dev, v->vid, flags,
+ false, extack);
+ if (err && err != -EOPNOTSUPP)
+ goto out;
+ }
+ br_multicast_ctx_init(br, v, &v->br_mcast_ctx);
+ v->priv_flags |= BR_VLFLAG_GLOBAL_MCAST_ENABLED;
}
/* Add the dev mac and count the vlan only if it's usable */
if (br_vlan_should_use(v)) {
- err = br_fdb_insert(br, p, dev->dev_addr, v->vid);
- if (err) {
- br_err(br, "failed insert local address into bridge forwarding table\n");
- goto out_filt;
+ if (!br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0)) {
+ err = br_fdb_add_local(br, p, dev->dev_addr, v->vid);
+ if (err) {
+ br_err(br, "failed insert local address into bridge forwarding table\n");
+ goto out_filt;
+ }
}
vg->num_vlans++;
}
+ /* set the state before publishing */
+ br_vlan_init_state(v);
+
err = rhashtable_lookup_insert_fast(&vg->vlan_hash, &v->vnode,
br_vlan_rht_params);
if (err)
goto out_fdb_insert;
__vlan_add_list(v);
- __vlan_add_flags(v, flags);
+ __vlan_flags_commit(v, flags);
+ br_multicast_toggle_one_vlan(v, true);
+
+ if (p)
+ nbp_vlan_set_vlan_dev_state(p, v->vid);
out:
return err;
@@ -280,8 +366,12 @@ out_fdb_insert:
out_filt:
if (p) {
- __vlan_vid_del(dev, br, v->vid);
+ __vlan_vid_del(dev, br, v);
if (masterv) {
+ if (v->stats && masterv->stats != v->stats)
+ free_percpu(v->stats);
+ v->stats = NULL;
+
br_vlan_put_master(masterv);
v->brvlan = NULL;
}
@@ -309,7 +399,7 @@ static int __vlan_del(struct net_bridge_vlan *v)
__vlan_delete_pvid(vg, v->vid);
if (p) {
- err = __vlan_vid_del(p->dev, p->br, v->vid);
+ err = __vlan_vid_del(p->dev, p->br, v);
if (err)
goto out;
} else {
@@ -329,7 +419,10 @@ static int __vlan_del(struct net_bridge_vlan *v)
rhashtable_remove_fast(&vg->vlan_hash, &v->vnode,
br_vlan_rht_params);
__vlan_del_list(v);
- kfree_rcu(v, rcu);
+ nbp_vlan_set_vlan_dev_state(p, v->vid);
+ br_multicast_toggle_one_vlan(v, false);
+ br_multicast_port_ctx_deinit(&v->port_mcast_ctx);
+ call_rcu(&v->rcu, nbp_vlan_rcu_free);
}
br_vlan_put_master(masterv);
@@ -345,13 +438,38 @@ static void __vlan_group_free(struct net_bridge_vlan_group *vg)
kfree(vg);
}
-static void __vlan_flush(struct net_bridge_vlan_group *vg)
+static void __vlan_flush(const struct net_bridge *br,
+ const struct net_bridge_port *p,
+ struct net_bridge_vlan_group *vg)
{
struct net_bridge_vlan *vlan, *tmp;
+ u16 v_start = 0, v_end = 0;
+ int err;
__vlan_delete_pvid(vg, vg->pvid);
- list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist)
- __vlan_del(vlan);
+ list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) {
+ /* take care of disjoint ranges */
+ if (!v_start) {
+ v_start = vlan->vid;
+ } else if (vlan->vid - v_end != 1) {
+ /* found range end, notify and start next one */
+ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN);
+ v_start = vlan->vid;
+ }
+ v_end = vlan->vid;
+
+ err = __vlan_del(vlan);
+ if (err) {
+ br_err(br,
+ "port %u(%s) failed to delete vlan %d: %pe\n",
+ (unsigned int) p->port_no, p->dev->name,
+ vlan->vid, ERR_PTR(err));
+ }
+ }
+
+ /* notify about the last/whole vlan range */
+ if (v_start)
+ br_vlan_notify(br, p, v_start, v_end, RTM_DELVLAN);
}
struct sk_buff *br_handle_vlan(struct net_bridge *br,
@@ -359,7 +477,7 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
struct net_bridge_vlan_group *vg,
struct sk_buff *skb)
{
- struct br_vlan_stats *stats;
+ struct pcpu_sw_netstats *stats;
struct net_bridge_vlan *v;
u16 vid;
@@ -386,16 +504,24 @@ struct sk_buff *br_handle_vlan(struct net_bridge *br,
return NULL;
}
}
- if (br->vlan_stats_enabled) {
+ if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
stats = this_cpu_ptr(v->stats);
u64_stats_update_begin(&stats->syncp);
- stats->tx_bytes += skb->len;
- stats->tx_packets++;
+ u64_stats_add(&stats->tx_bytes, skb->len);
+ u64_stats_inc(&stats->tx_packets);
u64_stats_update_end(&stats->syncp);
}
- if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED)
- skb->vlan_tci = 0;
+ /* If the skb will be sent using forwarding offload, the assumption is
+ * that the switchdev will inject the packet into hardware together
+ * with the bridge VLAN, so that it can be forwarded according to that
+ * VLAN. The switchdev should deal with popping the VLAN header in
+ * hardware on each egress port as appropriate. So only strip the VLAN
+ * header if forwarding offload is not being used.
+ */
+ if (v->flags & BRIDGE_VLAN_INFO_UNTAGGED &&
+ !br_switchdev_frame_uses_tx_fwd_offload(skb))
+ __vlan_hwaccel_clear_tag(skb);
if (p && (p->flags & BR_VLAN_TUNNEL) &&
br_handle_egress_vlan_tunnel(skb, v)) {
@@ -409,9 +535,11 @@ out:
/* Called under RCU */
static bool __allowed_ingress(const struct net_bridge *br,
struct net_bridge_vlan_group *vg,
- struct sk_buff *skb, u16 *vid)
+ struct sk_buff *skb, u16 *vid,
+ u8 *state,
+ struct net_bridge_vlan **vlan)
{
- struct br_vlan_stats *stats;
+ struct pcpu_sw_netstats *stats;
struct net_bridge_vlan *v;
bool tagged;
@@ -468,28 +596,43 @@ static bool __allowed_ingress(const struct net_bridge *br,
__vlan_hwaccel_put_tag(skb, br->vlan_proto, pvid);
else
/* Priority-tagged Frame.
- * At this point, We know that skb->vlan_tci had
- * VLAN_TAG_PRESENT bit and its VID field was 0x000.
+ * At this point, we know that skb->vlan_tci VID
+ * field was 0.
* We update only VID field and preserve PCP field.
*/
skb->vlan_tci |= pvid;
- /* if stats are disabled we can avoid the lookup */
- if (!br->vlan_stats_enabled)
+ /* if snooping and stats are disabled we can avoid the lookup */
+ if (!br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED) &&
+ !br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
+ if (*state == BR_STATE_FORWARDING) {
+ *state = br_vlan_get_pvid_state(vg);
+ if (!br_vlan_state_allowed(*state, true))
+ goto drop;
+ }
return true;
+ }
}
v = br_vlan_find(vg, *vid);
if (!v || !br_vlan_should_use(v))
goto drop;
- if (br->vlan_stats_enabled) {
+ if (*state == BR_STATE_FORWARDING) {
+ *state = br_vlan_get_state(v);
+ if (!br_vlan_state_allowed(*state, true))
+ goto drop;
+ }
+
+ if (br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) {
stats = this_cpu_ptr(v->stats);
u64_stats_update_begin(&stats->syncp);
- stats->rx_bytes += skb->len;
- stats->rx_packets++;
+ u64_stats_add(&stats->rx_bytes, skb->len);
+ u64_stats_inc(&stats->rx_packets);
u64_stats_update_end(&stats->syncp);
}
+ *vlan = v;
+
return true;
drop:
@@ -499,17 +642,19 @@ drop:
bool br_allowed_ingress(const struct net_bridge *br,
struct net_bridge_vlan_group *vg, struct sk_buff *skb,
- u16 *vid)
+ u16 *vid, u8 *state,
+ struct net_bridge_vlan **vlan)
{
/* If VLAN filtering is disabled on the bridge, all packets are
* permitted.
*/
- if (!br->vlan_enabled) {
+ *vlan = NULL;
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED)) {
BR_INPUT_SKB_CB(skb)->vlan_filtered = false;
return true;
}
- return __allowed_ingress(br, vg, skb, vid);
+ return __allowed_ingress(br, vg, skb, vid, state, vlan);
}
/* Called under RCU. */
@@ -525,7 +670,8 @@ bool br_allowed_egress(struct net_bridge_vlan_group *vg,
br_vlan_get_tag(skb, &vid);
v = br_vlan_find(vg, vid);
- if (v && br_vlan_should_use(v))
+ if (v && br_vlan_should_use(v) &&
+ br_vlan_state_allowed(br_vlan_get_state(v), false))
return true;
return false;
@@ -536,9 +682,10 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
{
struct net_bridge_vlan_group *vg;
struct net_bridge *br = p->br;
+ struct net_bridge_vlan *v;
/* If filtering was disabled at input, let it pass. */
- if (!br->vlan_enabled)
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED))
return true;
vg = nbp_vlan_group_rcu(p);
@@ -550,13 +697,15 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
if (!*vid) {
*vid = br_get_pvid(vg);
- if (!*vid)
+ if (!*vid ||
+ !br_vlan_state_allowed(br_vlan_get_pvid_state(vg), true))
return false;
return true;
}
- if (br_vlan_find(vg, *vid))
+ v = br_vlan_find(vg, *vid);
+ if (v && br_vlan_state_allowed(br_vlan_get_state(v), true))
return true;
return false;
@@ -565,23 +714,36 @@ bool br_should_learn(struct net_bridge_port *p, struct sk_buff *skb, u16 *vid)
static int br_vlan_add_existing(struct net_bridge *br,
struct net_bridge_vlan_group *vg,
struct net_bridge_vlan *vlan,
- u16 flags, bool *changed)
+ u16 flags, bool *changed,
+ struct netlink_ext_ack *extack)
{
+ bool becomes_brentry = false;
+ bool would_change = false;
int err;
- err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags);
- if (err && err != -EOPNOTSUPP)
- return err;
-
if (!br_vlan_is_brentry(vlan)) {
/* Trying to change flags of non-existent bridge vlan */
- if (!(flags & BRIDGE_VLAN_INFO_BRENTRY)) {
- err = -EINVAL;
- goto err_flags;
- }
+ if (!(flags & BRIDGE_VLAN_INFO_BRENTRY))
+ return -EINVAL;
+
+ becomes_brentry = true;
+ } else {
+ would_change = __vlan_flags_would_change(vlan, flags);
+ }
+
+ /* Master VLANs that aren't brentries weren't notified before,
+ * time to notify them now.
+ */
+ if (becomes_brentry || would_change) {
+ err = br_switchdev_port_vlan_add(br->dev, vlan->vid, flags,
+ would_change, extack);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+ }
+
+ if (becomes_brentry) {
/* It was only kept for port vlans, now make it real */
- err = br_fdb_insert(br, NULL, br->dev->dev_addr,
- vlan->vid);
+ err = br_fdb_add_local(br, NULL, br->dev->dev_addr, vlan->vid);
if (err) {
br_err(br, "failed to insert local address into bridge forwarding table\n");
goto err_fdb_insert;
@@ -591,15 +753,16 @@ static int br_vlan_add_existing(struct net_bridge *br,
vlan->flags |= BRIDGE_VLAN_INFO_BRENTRY;
vg->num_vlans++;
*changed = true;
+ br_multicast_toggle_one_vlan(vlan, true);
}
- if (__vlan_add_flags(vlan, flags))
+ __vlan_flags_commit(vlan, flags);
+ if (would_change)
*changed = true;
return 0;
err_fdb_insert:
-err_flags:
br_switchdev_port_vlan_del(br->dev, vlan->vid);
return err;
}
@@ -608,7 +771,8 @@ err_flags:
* Must be called with vid in range from 1 to 4094 inclusive.
* changed must be true only if the vlan was created or updated
*/
-int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed)
+int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed,
+ struct netlink_ext_ack *extack)
{
struct net_bridge_vlan_group *vg;
struct net_bridge_vlan *vlan;
@@ -620,13 +784,14 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed)
vg = br_vlan_group(br);
vlan = br_vlan_find(vg, vid);
if (vlan)
- return br_vlan_add_existing(br, vg, vlan, flags, changed);
+ return br_vlan_add_existing(br, vg, vlan, flags, changed,
+ extack);
vlan = kzalloc(sizeof(*vlan), GFP_KERNEL);
if (!vlan)
return -ENOMEM;
- vlan->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats);
+ vlan->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!vlan->stats) {
kfree(vlan);
return -ENOMEM;
@@ -637,7 +802,7 @@ int br_vlan_add(struct net_bridge *br, u16 vid, u16 flags, bool *changed)
vlan->br = br;
if (flags & BRIDGE_VLAN_INFO_BRENTRY)
refcount_set(&vlan->refcnt, 1);
- ret = __vlan_add(vlan, flags);
+ ret = __vlan_add(vlan, flags, extack);
if (ret) {
free_percpu(vlan->stats);
kfree(vlan);
@@ -678,9 +843,9 @@ void br_vlan_flush(struct net_bridge *br)
ASSERT_RTNL();
vg = br_vlan_group(br);
- __vlan_flush(vg);
+ __vlan_flush(br, NULL, vg);
RCU_INIT_POINTER(br->vlgrp, NULL);
- synchronize_rcu();
+ synchronize_net();
__vlan_group_free(vg);
}
@@ -695,11 +860,12 @@ struct net_bridge_vlan *br_vlan_find(struct net_bridge_vlan_group *vg, u16 vid)
/* Must be protected by RTNL. */
static void recalculate_group_addr(struct net_bridge *br)
{
- if (br->group_addr_set)
+ if (br_opt_get(br, BROPT_GROUP_ADDR_SET))
return;
spin_lock_bh(&br->lock);
- if (!br->vlan_enabled || br->vlan_proto == htons(ETH_P_8021Q)) {
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED) ||
+ br->vlan_proto == htons(ETH_P_8021Q)) {
/* Bridge Group Address */
br->group_addr[5] = 0x00;
} else { /* vlan_enabled && ETH_P_8021AD */
@@ -712,14 +878,16 @@ static void recalculate_group_addr(struct net_bridge *br)
/* Must be protected by RTNL. */
void br_recalculate_fwd_mask(struct net_bridge *br)
{
- if (!br->vlan_enabled || br->vlan_proto == htons(ETH_P_8021Q))
+ if (!br_opt_get(br, BROPT_VLAN_ENABLED) ||
+ br->vlan_proto == htons(ETH_P_8021Q))
br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT;
else /* vlan_enabled && ETH_P_8021AD */
br->group_fwd_mask_required = BR_GROUPFWD_8021AD &
~(1u << br->group_addr[5]);
}
-int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
+int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
struct switchdev_attr attr = {
.orig_dev = br->dev,
@@ -729,56 +897,80 @@ int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
};
int err;
- if (br->vlan_enabled == val)
+ if (br_opt_get(br, BROPT_VLAN_ENABLED) == !!val)
return 0;
- err = switchdev_port_attr_set(br->dev, &attr);
- if (err && err != -EOPNOTSUPP)
+ br_opt_toggle(br, BROPT_VLAN_ENABLED, !!val);
+
+ err = switchdev_port_attr_set(br->dev, &attr, extack);
+ if (err && err != -EOPNOTSUPP) {
+ br_opt_toggle(br, BROPT_VLAN_ENABLED, !val);
return err;
+ }
- br->vlan_enabled = val;
br_manage_promisc(br);
recalculate_group_addr(br);
br_recalculate_fwd_mask(br);
+ if (!val && br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED)) {
+ br_info(br, "vlan filtering disabled, automatically disabling multicast vlan snooping\n");
+ br_multicast_toggle_vlan_snooping(br, false, NULL);
+ }
return 0;
}
-int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
+bool br_vlan_enabled(const struct net_device *dev)
{
- return __br_vlan_filter_toggle(br, val);
+ struct net_bridge *br = netdev_priv(dev);
+
+ return br_opt_get(br, BROPT_VLAN_ENABLED);
}
+EXPORT_SYMBOL_GPL(br_vlan_enabled);
-bool br_vlan_enabled(const struct net_device *dev)
+int br_vlan_get_proto(const struct net_device *dev, u16 *p_proto)
{
struct net_bridge *br = netdev_priv(dev);
- return !!br->vlan_enabled;
+ *p_proto = ntohs(br->vlan_proto);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(br_vlan_enabled);
+EXPORT_SYMBOL_GPL(br_vlan_get_proto);
-int __br_vlan_set_proto(struct net_bridge *br, __be16 proto)
+int __br_vlan_set_proto(struct net_bridge *br, __be16 proto,
+ struct netlink_ext_ack *extack)
{
+ struct switchdev_attr attr = {
+ .orig_dev = br->dev,
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_PROTOCOL,
+ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
+ .u.vlan_protocol = ntohs(proto),
+ };
int err = 0;
struct net_bridge_port *p;
struct net_bridge_vlan *vlan;
struct net_bridge_vlan_group *vg;
- __be16 oldproto;
+ __be16 oldproto = br->vlan_proto;
if (br->vlan_proto == proto)
return 0;
+ err = switchdev_port_attr_set(br->dev, &attr, extack);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
/* Add VLANs for the new proto to the device filter. */
list_for_each_entry(p, &br->port_list, list) {
vg = nbp_vlan_group(p);
list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+ continue;
err = vlan_vid_add(p->dev, proto, vlan->vid);
if (err)
goto err_filt;
}
}
- oldproto = br->vlan_proto;
br->vlan_proto = proto;
recalculate_group_addr(br);
@@ -787,31 +979,44 @@ int __br_vlan_set_proto(struct net_bridge *br, __be16 proto)
/* Delete VLANs for the old proto from the device filter. */
list_for_each_entry(p, &br->port_list, list) {
vg = nbp_vlan_group(p);
- list_for_each_entry(vlan, &vg->vlan_list, vlist)
+ list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+ continue;
vlan_vid_del(p->dev, oldproto, vlan->vid);
+ }
}
return 0;
err_filt:
- list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist)
+ attr.u.vlan_protocol = ntohs(oldproto);
+ switchdev_port_attr_set(br->dev, &attr, NULL);
+
+ list_for_each_entry_continue_reverse(vlan, &vg->vlan_list, vlist) {
+ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+ continue;
vlan_vid_del(p->dev, proto, vlan->vid);
+ }
list_for_each_entry_continue_reverse(p, &br->port_list, list) {
vg = nbp_vlan_group(p);
- list_for_each_entry(vlan, &vg->vlan_list, vlist)
+ list_for_each_entry(vlan, &vg->vlan_list, vlist) {
+ if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+ continue;
vlan_vid_del(p->dev, proto, vlan->vid);
+ }
}
return err;
}
-int br_vlan_set_proto(struct net_bridge *br, unsigned long val)
+int br_vlan_set_proto(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
- if (val != ETH_P_8021Q && val != ETH_P_8021AD)
+ if (!eth_type_vlan(htons(val)))
return -EPROTONOSUPPORT;
- return __br_vlan_set_proto(br, htons(val));
+ return __br_vlan_set_proto(br, htons(val), extack);
}
int br_vlan_set_stats(struct net_bridge *br, unsigned long val)
@@ -819,7 +1024,31 @@ int br_vlan_set_stats(struct net_bridge *br, unsigned long val)
switch (val) {
case 0:
case 1:
- br->vlan_stats_enabled = val;
+ br_opt_toggle(br, BROPT_VLAN_STATS_ENABLED, !!val);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val)
+{
+ struct net_bridge_port *p;
+
+ /* allow changing the option only if there are no port vlans configured */
+ list_for_each_entry(p, &br->port_list, list) {
+ struct net_bridge_vlan_group *vg = nbp_vlan_group(p);
+
+ if (vg->num_vlans)
+ return -EBUSY;
+ }
+
+ switch (val) {
+ case 0:
+ case 1:
+ br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val);
break;
default:
return -EINVAL;
@@ -851,18 +1080,22 @@ static void br_vlan_disable_default_pvid(struct net_bridge *br)
/* Disable default_pvid on all ports where it is still
* configured.
*/
- if (vlan_default_pvid(br_vlan_group(br), pvid))
- br_vlan_delete(br, pvid);
+ if (vlan_default_pvid(br_vlan_group(br), pvid)) {
+ if (!br_vlan_delete(br, pvid))
+ br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN);
+ }
list_for_each_entry(p, &br->port_list, list) {
- if (vlan_default_pvid(nbp_vlan_group(p), pvid))
- nbp_vlan_delete(p, pvid);
+ if (vlan_default_pvid(nbp_vlan_group(p), pvid) &&
+ !nbp_vlan_delete(p, pvid))
+ br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN);
}
br->default_pvid = 0;
}
-int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid)
+int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid,
+ struct netlink_ext_ack *extack)
{
const struct net_bridge_vlan *pvent;
struct net_bridge_vlan_group *vg;
@@ -877,8 +1110,7 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid)
return 0;
}
- changed = kcalloc(BITS_TO_LONGS(BR_MAX_PORTS), sizeof(unsigned long),
- GFP_KERNEL);
+ changed = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL);
if (!changed)
return -ENOMEM;
@@ -895,11 +1127,14 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid)
BRIDGE_VLAN_INFO_PVID |
BRIDGE_VLAN_INFO_UNTAGGED |
BRIDGE_VLAN_INFO_BRENTRY,
- &vlchange);
+ &vlchange, extack);
if (err)
goto out;
- br_vlan_delete(br, old_pvid);
- set_bit(0, changed);
+
+ if (br_vlan_delete(br, old_pvid))
+ br_vlan_notify(br, NULL, old_pvid, 0, RTM_DELVLAN);
+ br_vlan_notify(br, NULL, pvid, 0, RTM_NEWVLAN);
+ __set_bit(0, changed);
}
list_for_each_entry(p, &br->port_list, list) {
@@ -915,17 +1150,19 @@ int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid)
err = nbp_vlan_add(p, pvid,
BRIDGE_VLAN_INFO_PVID |
BRIDGE_VLAN_INFO_UNTAGGED,
- &vlchange);
+ &vlchange, extack);
if (err)
goto err_port;
- nbp_vlan_delete(p, old_pvid);
- set_bit(p->port_no, changed);
+ if (nbp_vlan_delete(p, old_pvid))
+ br_vlan_notify(br, p, old_pvid, 0, RTM_DELVLAN);
+ br_vlan_notify(p->br, p, pvid, 0, RTM_NEWVLAN);
+ __set_bit(p->port_no, changed);
}
br->default_pvid = pvid;
out:
- kfree(changed);
+ bitmap_free(changed);
return err;
err_port:
@@ -933,27 +1170,34 @@ err_port:
if (!test_bit(p->port_no, changed))
continue;
- if (old_pvid)
+ if (old_pvid) {
nbp_vlan_add(p, old_pvid,
BRIDGE_VLAN_INFO_PVID |
BRIDGE_VLAN_INFO_UNTAGGED,
- &vlchange);
+ &vlchange, NULL);
+ br_vlan_notify(p->br, p, old_pvid, 0, RTM_NEWVLAN);
+ }
nbp_vlan_delete(p, pvid);
+ br_vlan_notify(br, p, pvid, 0, RTM_DELVLAN);
}
if (test_bit(0, changed)) {
- if (old_pvid)
+ if (old_pvid) {
br_vlan_add(br, old_pvid,
BRIDGE_VLAN_INFO_PVID |
BRIDGE_VLAN_INFO_UNTAGGED |
BRIDGE_VLAN_INFO_BRENTRY,
- &vlchange);
+ &vlchange, NULL);
+ br_vlan_notify(br, NULL, old_pvid, 0, RTM_NEWVLAN);
+ }
br_vlan_delete(br, pvid);
+ br_vlan_notify(br, NULL, pvid, 0, RTM_DELVLAN);
}
goto out;
}
-int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val)
+int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val,
+ struct netlink_ext_ack *extack)
{
u16 pvid = val;
int err = 0;
@@ -965,12 +1209,12 @@ int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val)
goto out;
/* Only allow default pvid change when filtering is disabled */
- if (br->vlan_enabled) {
+ if (br_opt_get(br, BROPT_VLAN_ENABLED)) {
pr_info_once("Please disable vlan filtering to change default_pvid\n");
err = -EPERM;
goto out;
}
- err = __br_vlan_set_default_pvid(br, pvid);
+ err = __br_vlan_set_default_pvid(br, pvid, extack);
out:
return err;
}
@@ -979,7 +1223,6 @@ int br_vlan_init(struct net_bridge *br)
{
struct net_bridge_vlan_group *vg;
int ret = -ENOMEM;
- bool changed;
vg = kzalloc(sizeof(*vg), GFP_KERNEL);
if (!vg)
@@ -994,17 +1237,10 @@ int br_vlan_init(struct net_bridge *br)
br->vlan_proto = htons(ETH_P_8021Q);
br->default_pvid = 1;
rcu_assign_pointer(br->vlgrp, vg);
- ret = br_vlan_add(br, 1,
- BRIDGE_VLAN_INFO_PVID | BRIDGE_VLAN_INFO_UNTAGGED |
- BRIDGE_VLAN_INFO_BRENTRY, &changed);
- if (ret)
- goto err_vlan_add;
out:
return ret;
-err_vlan_add:
- vlan_tunnel_deinit(vg);
err_tunnel_init:
rhashtable_destroy(&vg->vlan_hash);
err_rhtbl:
@@ -1013,13 +1249,13 @@ err_rhtbl:
goto out;
}
-int nbp_vlan_init(struct net_bridge_port *p)
+int nbp_vlan_init(struct net_bridge_port *p, struct netlink_ext_ack *extack)
{
struct switchdev_attr attr = {
.orig_dev = p->br->dev,
.id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
.flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
- .u.vlan_filtering = p->br->vlan_enabled,
+ .u.vlan_filtering = br_opt_get(p->br, BROPT_VLAN_ENABLED),
};
struct net_bridge_vlan_group *vg;
int ret = -ENOMEM;
@@ -1028,7 +1264,7 @@ int nbp_vlan_init(struct net_bridge_port *p)
if (!vg)
goto out;
- ret = switchdev_port_attr_set(p->dev, &attr);
+ ret = switchdev_port_attr_set(p->dev, &attr, extack);
if (ret && ret != -EOPNOTSUPP)
goto err_vlan_enabled;
@@ -1046,9 +1282,10 @@ int nbp_vlan_init(struct net_bridge_port *p)
ret = nbp_vlan_add(p, p->br->default_pvid,
BRIDGE_VLAN_INFO_PVID |
BRIDGE_VLAN_INFO_UNTAGGED,
- &changed);
+ &changed, extack);
if (ret)
goto err_vlan_add;
+ br_vlan_notify(p->br, p, p->br->default_pvid, 0, RTM_NEWVLAN);
}
out:
return ret;
@@ -1071,7 +1308,7 @@ err_vlan_enabled:
* changed must be true only if the vlan was created or updated
*/
int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
- bool *changed)
+ bool *changed, struct netlink_ext_ack *extack)
{
struct net_bridge_vlan *vlan;
int ret;
@@ -1081,11 +1318,18 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
*changed = false;
vlan = br_vlan_find(nbp_vlan_group(port), vid);
if (vlan) {
- /* Pass the flags to the hardware bridge */
- ret = br_switchdev_port_vlan_add(port->dev, vid, flags);
- if (ret && ret != -EOPNOTSUPP)
- return ret;
- *changed = __vlan_add_flags(vlan, flags);
+ bool would_change = __vlan_flags_would_change(vlan, flags);
+
+ if (would_change) {
+ /* Pass the flags to the hardware bridge */
+ ret = br_switchdev_port_vlan_add(port->dev, vid, flags,
+ true, extack);
+ if (ret && ret != -EOPNOTSUPP)
+ return ret;
+ }
+
+ __vlan_flags_commit(vlan, flags);
+ *changed = would_change;
return 0;
}
@@ -1096,7 +1340,7 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags,
vlan->vid = vid;
vlan->port = port;
- ret = __vlan_add(vlan, flags);
+ ret = __vlan_add(vlan, flags, extack);
if (ret)
kfree(vlan);
else
@@ -1130,45 +1374,49 @@ void nbp_vlan_flush(struct net_bridge_port *port)
ASSERT_RTNL();
vg = nbp_vlan_group(port);
- __vlan_flush(vg);
+ __vlan_flush(port->br, port, vg);
RCU_INIT_POINTER(port->vlgrp, NULL);
- synchronize_rcu();
+ synchronize_net();
__vlan_group_free(vg);
}
void br_vlan_get_stats(const struct net_bridge_vlan *v,
- struct br_vlan_stats *stats)
+ struct pcpu_sw_netstats *stats)
{
int i;
memset(stats, 0, sizeof(*stats));
for_each_possible_cpu(i) {
u64 rxpackets, rxbytes, txpackets, txbytes;
- struct br_vlan_stats *cpu_stats;
+ struct pcpu_sw_netstats *cpu_stats;
unsigned int start;
cpu_stats = per_cpu_ptr(v->stats, i);
do {
- start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
- rxpackets = cpu_stats->rx_packets;
- rxbytes = cpu_stats->rx_bytes;
- txbytes = cpu_stats->tx_bytes;
- txpackets = cpu_stats->tx_packets;
- } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));
-
- stats->rx_packets += rxpackets;
- stats->rx_bytes += rxbytes;
- stats->tx_bytes += txbytes;
- stats->tx_packets += txpackets;
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
+ rxpackets = u64_stats_read(&cpu_stats->rx_packets);
+ rxbytes = u64_stats_read(&cpu_stats->rx_bytes);
+ txbytes = u64_stats_read(&cpu_stats->tx_bytes);
+ txpackets = u64_stats_read(&cpu_stats->tx_packets);
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
+
+ u64_stats_add(&stats->rx_packets, rxpackets);
+ u64_stats_add(&stats->rx_bytes, rxbytes);
+ u64_stats_add(&stats->tx_bytes, txbytes);
+ u64_stats_add(&stats->tx_packets, txpackets);
}
}
int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
{
struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p;
ASSERT_RTNL();
- if (netif_is_bridge_master(dev))
+ p = br_port_get_check_rtnl(dev);
+ if (p)
+ vg = nbp_vlan_group(p);
+ else if (netif_is_bridge_master(dev))
vg = br_vlan_group(netdev_priv(dev));
else
return -EINVAL;
@@ -1178,6 +1426,79 @@ int br_vlan_get_pvid(const struct net_device *dev, u16 *p_pvid)
}
EXPORT_SYMBOL_GPL(br_vlan_get_pvid);
+/* Store the PVID of the vlan group belonging to @dev in @p_pvid, resolving
+ * the group through the _rcu accessors. @dev is first tried via
+ * br_port_get_check_rcu(), then as a bridge master.
+ *
+ * Returns 0 on success or -EINVAL when @dev matches neither case.
+ */
+int br_vlan_get_pvid_rcu(const struct net_device *dev, u16 *p_pvid)
+{
+	struct net_bridge_port *port = br_port_get_check_rcu(dev);
+	struct net_bridge_vlan_group *group;
+
+	if (port) {
+		group = nbp_vlan_group_rcu(port);
+	} else {
+		if (!netif_is_bridge_master(dev))
+			return -EINVAL;
+		group = br_vlan_group_rcu(netdev_priv(dev));
+	}
+
+	*p_pvid = br_get_pvid(group);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_pvid_rcu);
+
+/* Fill the bridge vlan fields of @path from the innermost vlan recorded in
+ * @ctx. The mode always starts out as DEV_PATH_BR_VLAN_KEEP; nothing else is
+ * written when vlan filtering is disabled on @br. Otherwise the vlan id comes
+ * from the last @ctx entry if its protocol matches the bridge's, or from the
+ * bridge PVID (with the mode switched to DEV_PATH_BR_VLAN_TAG) if it does not.
+ */
+void br_vlan_fill_forward_path_pvid(struct net_bridge *br,
+				    struct net_device_path_ctx *ctx,
+				    struct net_device_path *path)
+{
+	int last = ctx->num_vlans - 1;
+	struct net_bridge_vlan_group *group;
+	bool tag_matches;
+
+	path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP;
+	if (!br_opt_get(br, BROPT_VLAN_ENABLED))
+		return;
+
+	group = br_vlan_group_rcu(br);
+
+	tag_matches = last >= 0 && ctx->vlan[last].proto == br->vlan_proto;
+	if (tag_matches) {
+		path->bridge.vlan_id = ctx->vlan[last].id;
+	} else {
+		path->bridge.vlan_mode = DEV_PATH_BR_VLAN_TAG;
+		path->bridge.vlan_id = br_get_pvid(group);
+	}
+
+	path->bridge.vlan_proto = br->vlan_proto;
+}
+
+/* Adjust path->bridge.vlan_mode for egress through port @dst.
+ *
+ * Returns 0 when vlan filtering is disabled or the mode has been settled,
+ * -EINVAL when the vlan in @path is not found on @dst or should not be used.
+ * The mode is only touched for vlans flagged BRIDGE_VLAN_INFO_UNTAGGED.
+ */
+int br_vlan_fill_forward_path_mode(struct net_bridge *br,
+				   struct net_bridge_port *dst,
+				   struct net_device_path *path)
+{
+	struct net_bridge_vlan *vlan;
+
+	if (!br_opt_get(br, BROPT_VLAN_ENABLED))
+		return 0;
+
+	vlan = br_vlan_find(nbp_vlan_group_rcu(dst), path->bridge.vlan_id);
+	if (!vlan || !br_vlan_should_use(vlan))
+		return -EINVAL;
+
+	if (vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED) {
+		if (path->bridge.vlan_mode == DEV_PATH_BR_VLAN_TAG)
+			path->bridge.vlan_mode = DEV_PATH_BR_VLAN_KEEP;
+		else if (vlan->priv_flags & BR_VLFLAG_ADDED_BY_SWITCHDEV)
+			path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG_HW;
+		else
+			path->bridge.vlan_mode = DEV_PATH_BR_VLAN_UNTAG;
+	}
+
+	return 0;
+}
+
int br_vlan_get_info(const struct net_device *dev, u16 vid,
struct bridge_vlan_info *p_vinfo)
{
@@ -1200,6 +1521,829 @@ int br_vlan_get_info(const struct net_device *dev, u16 vid,
p_vinfo->vid = vid;
p_vinfo->flags = v->flags;
+ if (vid == br_get_pvid(vg))
+ p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID;
return 0;
}
EXPORT_SYMBOL_GPL(br_vlan_get_info);
+
+/* Report vlan @vid of @dev in @p_vinfo, resolving the vlan group through the
+ * _rcu accessors. BRIDGE_VLAN_INFO_PVID is OR-ed into the reported flags when
+ * @vid equals the group's PVID.
+ *
+ * Returns 0 on success, -EINVAL when @dev is resolved neither by
+ * br_port_get_check_rcu() nor as a bridge master, -ENOENT when the vlan is
+ * not found in the group.
+ */
+int br_vlan_get_info_rcu(const struct net_device *dev, u16 vid,
+			 struct bridge_vlan_info *p_vinfo)
+{
+	struct net_bridge_port *port = br_port_get_check_rcu(dev);
+	struct net_bridge_vlan_group *group;
+	struct net_bridge_vlan *entry;
+
+	if (port) {
+		group = nbp_vlan_group_rcu(port);
+	} else {
+		if (!netif_is_bridge_master(dev))
+			return -EINVAL;
+		group = br_vlan_group_rcu(netdev_priv(dev));
+	}
+
+	entry = br_vlan_find(group, vid);
+	if (!entry)
+		return -ENOENT;
+
+	p_vinfo->vid = vid;
+	p_vinfo->flags = entry->flags;
+	if (br_get_pvid(group) == vid)
+		p_vinfo->flags |= BRIDGE_VLAN_INFO_PVID;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(br_vlan_get_info_rcu);
+
+/* Return 1 if @dev is a vlan device with VLAN_FLAG_BRIDGE_BINDING set,
+ * 0 otherwise.
+ */
+static int br_vlan_is_bind_vlan_dev(const struct net_device *dev)
+{
+	if (!is_vlan_dev(dev))
+		return 0;
+
+	return (vlan_dev_priv(dev)->flags & VLAN_FLAG_BRIDGE_BINDING) ? 1 : 0;
+}
+
+/* Adapter giving br_vlan_is_bind_vlan_dev() the callback signature used with
+ * netdev_walk_all_upper_dev_rcu(); @priv is ignored.
+ */
+static int br_vlan_is_bind_vlan_dev_fn(struct net_device *dev,
+			__always_unused struct netdev_nested_priv *priv)
+{
+	int is_bind = br_vlan_is_bind_vlan_dev(dev);
+
+	return is_bind;
+}
+
+/* Walk the upper devices of @dev under rcu_read_lock() and report whether the
+ * walk returned non-zero, i.e. whether br_vlan_is_bind_vlan_dev_fn() matched.
+ */
+static bool br_vlan_has_upper_bind_vlan_dev(struct net_device *dev)
+{
+	bool has_bind;
+
+	rcu_read_lock();
+	has_bind = netdev_walk_all_upper_dev_rcu(dev,
+						 br_vlan_is_bind_vlan_dev_fn,
+						 NULL) != 0;
+	rcu_read_unlock();
+
+	return has_bind;
+}
+
+/* Walk context for br_vlan_match_bind_vlan_dev_fn() */
+struct br_vlan_bind_walk_data {
+	u16 vid;			/* vlan id being searched for */
+	struct net_device *result;	/* matching device, set by the callback */
+};
+
+/* Upper-device walk callback: when @dev is a bridge-binding vlan device whose
+ * vlan id equals the one in the walk data, record it as the result and
+ * return 1. Every other device returns 0.
+ */
+static int br_vlan_match_bind_vlan_dev_fn(struct net_device *dev,
+					  struct netdev_nested_priv *priv)
+{
+	struct br_vlan_bind_walk_data *walk = priv->data;
+
+	if (!br_vlan_is_bind_vlan_dev(dev))
+		return 0;
+	if (vlan_dev_priv(dev)->vlan_id != walk->vid)
+		return 0;
+
+	walk->result = dev;
+
+	return 1;
+}
+
+/* Search the upper devices of @dev, under rcu_read_lock(), for a
+ * bridge-binding vlan device with vlan id @vid. Returns the device recorded
+ * by the walk callback, or NULL if none was recorded.
+ */
+static struct net_device *
+br_vlan_get_upper_bind_vlan_dev(struct net_device *dev, u16 vid)
+{
+	struct br_vlan_bind_walk_data walk = {
+		.vid = vid,
+		.result = NULL,
+	};
+	struct netdev_nested_priv walk_priv = {
+		.data = &walk,
+	};
+
+	rcu_read_lock();
+	netdev_walk_all_upper_dev_rcu(dev, br_vlan_match_bind_vlan_dev_fn,
+				      &walk_priv);
+	rcu_read_unlock();
+
+	return walk.result;
+}
+
+/* True when @dev has IFF_UP set and netif_oper_up() also reports it up */
+static bool br_vlan_is_dev_up(const struct net_device *dev)
+{
+	if (!(dev->flags & IFF_UP))
+		return false;
+
+	return netif_oper_up(dev);
+}
+
+/* Set the carrier of @vlan_dev from the bridge and its ports: carrier goes
+ * off when the bridge device has no carrier; otherwise it goes on if at least
+ * one port that has the vlan device's vlan id configured is up, and off if no
+ * such port exists.
+ */
+static void br_vlan_set_vlan_dev_state(const struct net_bridge *br,
+				       struct net_device *vlan_dev)
+{
+	u16 vid = vlan_dev_priv(vlan_dev)->vlan_id;
+	struct net_bridge_port *port;
+
+	if (!netif_carrier_ok(br->dev)) {
+		netif_carrier_off(vlan_dev);
+		return;
+	}
+
+	list_for_each_entry(port, &br->port_list, list) {
+		if (!br_vlan_find(nbp_vlan_group(port), vid))
+			continue;
+		if (!br_vlan_is_dev_up(port->dev))
+			continue;
+
+		/* one live member port is enough */
+		netif_carrier_on(vlan_dev);
+		return;
+	}
+
+	netif_carrier_off(vlan_dev);
+}
+
+/* For every vlan configured on port @p that has a bridge-binding vlan device
+ * above the bridge, refresh that device's carrier: if the port is up, the
+ * carrier is switched on provided the bridge device has carrier; if the port
+ * is not up, the state is recomputed from all ports.
+ */
+static void br_vlan_set_all_vlan_dev_state(struct net_bridge_port *p)
+{
+	struct net_bridge_vlan_group *group = nbp_vlan_group(p);
+	struct net_bridge *br = p->br;
+	struct net_bridge_vlan *entry;
+	struct net_device *upper;
+
+	list_for_each_entry(entry, &group->vlan_list, vlist) {
+		upper = br_vlan_get_upper_bind_vlan_dev(br->dev, entry->vid);
+		if (!upper)
+			continue;
+
+		if (!br_vlan_is_dev_up(p->dev)) {
+			br_vlan_set_vlan_dev_state(br, upper);
+			continue;
+		}
+
+		if (netif_carrier_ok(br->dev))
+			netif_carrier_on(upper);
+	}
+}
+
+/* Update BROPT_VLAN_BRIDGE_BINDING on the bridge behind @br_dev: set it when
+ * @enable is true, otherwise set it according to whether any upper
+ * bridge-binding vlan device is still found.
+ */
+static void br_vlan_toggle_bridge_binding(struct net_device *br_dev,
+					  bool enable)
+{
+	struct net_bridge *br = netdev_priv(br_dev);
+	bool binding = enable || br_vlan_has_upper_bind_vlan_dev(br_dev);
+
+	br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, binding);
+}
+
+/* React to @upper_dev being linked to or unlinked from bridge device @dev.
+ * Only bridge-binding vlan uppers matter: the bridge binding option is
+ * re-evaluated and, when linking, the upper's carrier state is set.
+ */
+static void br_vlan_upper_change(struct net_device *dev,
+				 struct net_device *upper_dev,
+				 bool linking)
+{
+	if (br_vlan_is_bind_vlan_dev(upper_dev)) {
+		br_vlan_toggle_bridge_binding(dev, linking);
+		if (linking)
+			br_vlan_set_vlan_dev_state(netdev_priv(dev),
+						   upper_dev);
+	}
+}
+
+/* Walk context for br_vlan_link_state_change_fn() */
+struct br_vlan_link_state_walk_data {
+	struct net_bridge *br;	/* bridge whose upper devices are walked */
+};
+
+/* Upper-device walk callback: refresh the carrier state of every
+ * bridge-binding vlan device encountered. Always returns 0.
+ */
+static int br_vlan_link_state_change_fn(struct net_device *vlan_dev,
+					struct netdev_nested_priv *priv)
+{
+	struct br_vlan_link_state_walk_data *walk = priv->data;
+
+	if (!br_vlan_is_bind_vlan_dev(vlan_dev))
+		return 0;
+
+	br_vlan_set_vlan_dev_state(walk->br, vlan_dev);
+
+	return 0;
+}
+
+/* Walk all upper devices of @dev under rcu_read_lock() and let
+ * br_vlan_link_state_change_fn() update each bridge-binding vlan device
+ * against bridge @br.
+ */
+static void br_vlan_link_state_change(struct net_device *dev,
+				      struct net_bridge *br)
+{
+	struct br_vlan_link_state_walk_data walk = {
+		.br = br,
+	};
+	struct netdev_nested_priv walk_priv = {
+		.data = &walk,
+	};
+
+	rcu_read_lock();
+	netdev_walk_all_upper_dev_rcu(dev, br_vlan_link_state_change_fn,
+				      &walk_priv);
+	rcu_read_unlock();
+}
+
+/* Refresh the carrier of the bridge-binding vlan device for @vid, if the
+ * bridge has bridge binding enabled and such an upper device exists.
+ * Must be protected by RTNL.
+ */
+static void nbp_vlan_set_vlan_dev_state(struct net_bridge_port *p, u16 vid)
+{
+	struct net_bridge *br = p->br;
+	struct net_device *upper;
+
+	if (br_opt_get(br, BROPT_VLAN_BRIDGE_BINDING)) {
+		upper = br_vlan_get_upper_bind_vlan_dev(br->dev, vid);
+		if (upper)
+			br_vlan_set_vlan_dev_state(br, upper);
+	}
+}
+
+/* Handle netdev notifier events for bridge device @dev:
+ *  - NETDEV_REGISTER: add the default pvid vlan to the bridge and notify
+ *    if br_vlan_add() reported a change.
+ *  - NETDEV_UNREGISTER: delete the default pvid vlan and notify when the
+ *    delete returned 0.
+ *  - NETDEV_CHANGEUPPER: @ptr is the changeupper info; update bridge binding.
+ *  - NETDEV_CHANGE / NETDEV_UP: with bridge binding enabled, refresh the
+ *    carrier state of upper vlan devices.
+ *
+ * Returns the br_vlan_add() result for NETDEV_REGISTER, 0 otherwise.
+ * Must be protected by RTNL.
+ */
+int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr)
+{
+	struct netdev_notifier_changeupper_info *upper_info;
+	struct net_bridge *br = netdev_priv(dev);
+	bool vlan_changed = false;
+	int rc = 0;
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		rc = br_vlan_add(br, br->default_pvid,
+				 BRIDGE_VLAN_INFO_PVID |
+				 BRIDGE_VLAN_INFO_UNTAGGED |
+				 BRIDGE_VLAN_INFO_BRENTRY, &vlan_changed, NULL);
+		if (vlan_changed)
+			br_vlan_notify(br, NULL, br->default_pvid, 0,
+				       RTM_NEWVLAN);
+		break;
+	case NETDEV_UNREGISTER:
+		if (!br_vlan_delete(br, br->default_pvid))
+			br_vlan_notify(br, NULL, br->default_pvid, 0,
+				       RTM_DELVLAN);
+		break;
+	case NETDEV_CHANGEUPPER:
+		upper_info = ptr;
+		br_vlan_upper_change(dev, upper_info->upper_dev,
+				     upper_info->linking);
+		break;
+	case NETDEV_CHANGE:
+	case NETDEV_UP:
+		if (br_opt_get(br, BROPT_VLAN_BRIDGE_BINDING))
+			br_vlan_link_state_change(dev, br);
+		break;
+	}
+
+	return rc;
+}
+
+/* Handle NETDEV_CHANGE / NETDEV_UP on @vlan_dev, a vlan device above bridge
+ * @br_dev; all other events are ignored. The bridge binding option is
+ * re-evaluated from the vlan device's VLAN_FLAG_BRIDGE_BINDING flag. With the
+ * flag set the vlan device's carrier is recomputed from the bridge ports;
+ * without it the carrier is switched on if the bridge device has carrier.
+ */
+void br_vlan_vlan_upper_event(struct net_device *br_dev,
+			      struct net_device *vlan_dev,
+			      unsigned long event)
+{
+	struct net_bridge *br = netdev_priv(br_dev);
+	bool bound;
+
+	if (event != NETDEV_CHANGE && event != NETDEV_UP)
+		return;
+
+	bound = !!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_BRIDGE_BINDING);
+	br_vlan_toggle_bridge_binding(br_dev, bound);
+
+	if (bound) {
+		br_vlan_set_vlan_dev_state(br, vlan_dev);
+		return;
+	}
+
+	if (netif_carrier_ok(br_dev))
+		netif_carrier_on(vlan_dev);
+}
+
+/* Handle a link event on bridge port @p: on NETDEV_CHANGE, NETDEV_DOWN or
+ * NETDEV_UP, and only when the bridge has bridge binding enabled, refresh the
+ * carrier of the vlan devices bound to the port's vlans.
+ * Must be protected by RTNL.
+ */
+void br_vlan_port_event(struct net_bridge_port *p, unsigned long event)
+{
+	switch (event) {
+	case NETDEV_CHANGE:
+	case NETDEV_DOWN:
+	case NETDEV_UP:
+		if (br_opt_get(p->br, BROPT_VLAN_BRIDGE_BINDING))
+			br_vlan_set_all_vlan_dev_state(p);
+		break;
+	default:
+		break;
+	}
+}
+
+/* Append a BRIDGE_VLANDB_ENTRY_STATS nest to @skb carrying the rx/tx byte and
+ * packet counters that br_vlan_get_stats() collects for @v.
+ *
+ * Returns true on success. If the nest cannot be started false is returned;
+ * if any attribute fails to fit, the nest is cancelled and false is returned.
+ */
+static bool br_vlan_stats_fill(struct sk_buff *skb,
+			       const struct net_bridge_vlan *v)
+{
+	struct pcpu_sw_netstats totals;
+	struct nlattr *stats_nest;
+
+	stats_nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_STATS);
+	if (!stats_nest)
+		return false;
+
+	br_vlan_get_stats(v, &totals);
+
+	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_BYTES,
+			      u64_stats_read(&totals.rx_bytes),
+			      BRIDGE_VLANDB_STATS_PAD))
+		goto cancel;
+	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_RX_PACKETS,
+			      u64_stats_read(&totals.rx_packets),
+			      BRIDGE_VLANDB_STATS_PAD))
+		goto cancel;
+	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_BYTES,
+			      u64_stats_read(&totals.tx_bytes),
+			      BRIDGE_VLANDB_STATS_PAD))
+		goto cancel;
+	if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_STATS_TX_PACKETS,
+			      u64_stats_read(&totals.tx_packets),
+			      BRIDGE_VLANDB_STATS_PAD))
+		goto cancel;
+
+	nla_nest_end(skb, stats_nest);
+
+	return true;
+
+cancel:
+	nla_nest_cancel(skb, stats_nest);
+	return false;
+}
+
+/* Append one BRIDGE_VLANDB_ENTRY nest to @skb describing @vid, or the range
+ * @vid..@vid_range. Only the UNTAGGED and PVID bits of @flags are reported.
+ * The range attribute is emitted only when @vid_range is non-zero, greater
+ * than @vid, and the entry is not the PVID.
+ *
+ * v_opts is used to dump the options which must be equal in the whole range;
+ * when it is set and @dump_stats is true its statistics are dumped as well.
+ *
+ * Returns true on success; on failure after the nest was started, the nest
+ * is cancelled and false is returned.
+ */
+static bool br_vlan_fill_vids(struct sk_buff *skb, u16 vid, u16 vid_range,
+			      const struct net_bridge_vlan *v_opts,
+			      const struct net_bridge_port *p,
+			      u16 flags,
+			      bool dump_stats)
+{
+	struct bridge_vlan_info vinfo;
+	struct nlattr *entry;
+
+	entry = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY);
+	if (!entry)
+		return false;
+
+	/* zero the whole struct before it is copied into the message */
+	memset(&vinfo, 0, sizeof(vinfo));
+	vinfo.vid = vid;
+	vinfo.flags = flags & (BRIDGE_VLAN_INFO_UNTAGGED |
+			       BRIDGE_VLAN_INFO_PVID);
+
+	if (nla_put(skb, BRIDGE_VLANDB_ENTRY_INFO, sizeof(vinfo), &vinfo))
+		goto cancel;
+
+	if (vid_range && vid < vid_range &&
+	    !(flags & BRIDGE_VLAN_INFO_PVID)) {
+		if (nla_put_u16(skb, BRIDGE_VLANDB_ENTRY_RANGE, vid_range))
+			goto cancel;
+	}
+
+	if (v_opts) {
+		if (!br_vlan_opts_fill(skb, v_opts, p))
+			goto cancel;
+		if (dump_stats && !br_vlan_stats_fill(skb, v_opts))
+			goto cancel;
+	}
+
+	nla_nest_end(skb, entry);
+
+	return true;
+
+cancel:
+	nla_nest_cancel(skb, entry);
+	return false;
+}
+
+/* Size of the netlink message allocated for a single vlan notification */
+static size_t rtnl_vlan_nlmsg_size(void)
+{
+	size_t size = NLMSG_ALIGN(sizeof(struct br_vlan_msg));
+
+	/* BRIDGE_VLANDB_ENTRY */
+	size += nla_total_size(0);
+	/* BRIDGE_VLANDB_ENTRY_RANGE */
+	size += nla_total_size(sizeof(u16));
+	/* BRIDGE_VLANDB_ENTRY_INFO */
+	size += nla_total_size(sizeof(struct bridge_vlan_info));
+	/* bridge vlan options */
+	size += br_vlan_opts_nl_size();
+
+	return size;
+}
+
+void br_vlan_notify(const struct net_bridge *br,
+ const struct net_bridge_port *p, /* NULL: notify for the bridge device itself */
+ u16 vid, u16 vid_range,
+ int cmd) /* RTM_NEWVLAN or RTM_DELVLAN, anything else sends nothing */
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v = NULL;
+ struct br_vlan_msg *bvm;
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ int err = -ENOBUFS; /* reported if the skb allocation fails */
+ struct net *net;
+ u16 flags = 0;
+ int ifindex;
+
+ /* right now notifications are done only with rtnl held */
+ ASSERT_RTNL();
+
+ if (p) {
+ ifindex = p->dev->ifindex;
+ vg = nbp_vlan_group(p);
+ net = dev_net(p->dev);
+ } else {
+ ifindex = br->dev->ifindex;
+ vg = br_vlan_group(br);
+ net = dev_net(br->dev);
+ }
+
+ skb = nlmsg_new(rtnl_vlan_nlmsg_size(), GFP_KERNEL);
+ if (!skb)
+ goto out_err;
+
+ err = -EMSGSIZE; /* every later failure is reported as a size problem */
+ nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*bvm), 0);
+ if (!nlh)
+ goto out_err;
+ bvm = nlmsg_data(nlh);
+ memset(bvm, 0, sizeof(*bvm));
+ bvm->family = AF_BRIDGE;
+ bvm->ifindex = ifindex;
+
+ switch (cmd) {
+ case RTM_NEWVLAN:
+ /* need to find the vlan due to flags/options */
+ v = br_vlan_find(vg, vid);
+ if (!v || !br_vlan_should_use(v))
+ goto out_kfree; /* nothing to report: free the skb, no error is set */
+
+ flags = v->flags;
+ if (br_get_pvid(vg) == v->vid)
+ flags |= BRIDGE_VLAN_INFO_PVID;
+ break;
+ case RTM_DELVLAN:
+ break; /* no lookup: v stays NULL and flags stay 0 */
+ default:
+ goto out_kfree;
+ }
+
+ if (!br_vlan_fill_vids(skb, vid, vid_range, v, p, flags, false)) /* false: no stats */
+ goto out_err;
+
+ nlmsg_end(skb, nlh);
+ rtnl_notify(skb, net, 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL);
+ return;
+
+out_err:
+ rtnl_set_sk_err(net, RTNLGRP_BRVLAN, err); /* report the failure to the RTNLGRP_BRVLAN group */
+out_kfree:
+ kfree_skb(skb); /* skb is NULL here when nlmsg_new() failed */
+}
+
+/* check if v_curr can enter a range ending in range_end: next vid, same flags, equal options */
+bool br_vlan_can_enter_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end)
+{
+ return v_curr->vid - range_end->vid == 1 &&
+ range_end->flags == v_curr->flags &&
+ br_vlan_opts_eq_range(v_curr, range_end);
+}
+
+static int br_vlan_dump_dev(const struct net_device *dev, /* dump dev's vlans as one RTM_NEWVLAN message */
+ struct sk_buff *skb,
+ struct netlink_callback *cb, /* args[1] carries the resume index between calls */
+ u32 dump_flags) /* BRIDGE_VLANDB_DUMPF_* bits */
+{
+ struct net_bridge_vlan *v, *range_start = NULL, *range_end = NULL;
+ bool dump_global = !!(dump_flags & BRIDGE_VLANDB_DUMPF_GLOBAL);
+ bool dump_stats = !!(dump_flags & BRIDGE_VLANDB_DUMPF_STATS);
+ struct net_bridge_vlan_group *vg;
+ int idx = 0, s_idx = cb->args[1]; /* s_idx: number of vlans to skip */
+ struct nlmsghdr *nlh = NULL;
+ struct net_bridge_port *p;
+ struct br_vlan_msg *bvm;
+ struct net_bridge *br;
+ int err = 0;
+ u16 pvid;
+
+ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev))
+ return -EINVAL;
+
+ if (netif_is_bridge_master(dev)) {
+ br = netdev_priv(dev);
+ vg = br_vlan_group_rcu(br);
+ p = NULL;
+ } else {
+ /* global options are dumped only for bridge devices */
+ if (dump_global)
+ return 0;
+
+ p = br_port_get_rcu(dev);
+ if (WARN_ON(!p))
+ return -EINVAL;
+ vg = nbp_vlan_group_rcu(p);
+ br = p->br;
+ }
+
+ if (!vg)
+ return 0; /* no vlan group: nothing to dump */
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ RTM_NEWVLAN, sizeof(*bvm), NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
+ bvm = nlmsg_data(nlh);
+ memset(bvm, 0, sizeof(*bvm));
+ bvm->family = PF_BRIDGE;
+ bvm->ifindex = dev->ifindex;
+ pvid = br_get_pvid(vg);
+
+ /* idx must stay at range's beginning until it is filled in */
+ list_for_each_entry_rcu(v, &vg->vlan_list, vlist) {
+ if (!dump_global && !br_vlan_should_use(v))
+ continue;
+ if (idx < s_idx) { /* skip the first s_idx vlans */
+ idx++;
+ continue;
+ }
+
+ if (!range_start) { /* first vlan after the skipped ones opens a range */
+ range_start = v;
+ range_end = v;
+ continue;
+ }
+
+ if (dump_global) {
+ if (br_vlan_global_opts_can_enter_range(v, range_end))
+ goto update_end; /* v extends the current range */
+ if (!br_vlan_global_opts_fill(skb, range_start->vid,
+ range_end->vid,
+ range_start)) {
+ err = -EMSGSIZE;
+ break;
+ }
+ /* advance number of filled vlans */
+ idx += range_end->vid - range_start->vid + 1;
+
+ range_start = v;
+ } else if (dump_stats || v->vid == pvid || /* stats dumps and the pvid always start a new entry */
+ !br_vlan_can_enter_range(v, range_end)) {
+ u16 vlan_flags = br_vlan_flags(range_start, pvid);
+
+ if (!br_vlan_fill_vids(skb, range_start->vid,
+ range_end->vid, range_start,
+ p, vlan_flags, dump_stats)) {
+ err = -EMSGSIZE;
+ break;
+ }
+ /* advance number of filled vlans */
+ idx += range_end->vid - range_start->vid + 1;
+
+ range_start = v;
+ }
+update_end:
+ range_end = v;
+ }
+
+ /* err will be 0 and range_start will be set in 3 cases here:
+ * - first vlan (range_start == range_end)
+ * - last vlan (range_start == range_end, not in range)
+ * - last vlan range (range_start != range_end, in range)
+ */
+ if (!err && range_start) {
+ if (dump_global &&
+ !br_vlan_global_opts_fill(skb, range_start->vid,
+ range_end->vid, range_start))
+ err = -EMSGSIZE;
+ else if (!dump_global &&
+ !br_vlan_fill_vids(skb, range_start->vid,
+ range_end->vid, range_start,
+ p, br_vlan_flags(range_start, pvid),
+ dump_stats))
+ err = -EMSGSIZE;
+ }
+
+ cb->args[1] = err ? idx : 0; /* keep a resume index only when the dump was cut short */
+
+ nlmsg_end(skb, nlh);
+
+ return err;
+}
+
+static const struct nla_policy br_vlan_db_dump_pol[BRIDGE_VLANDB_DUMP_MAX + 1] = {
+ [BRIDGE_VLANDB_DUMP_FLAGS] = { .type = NLA_U32 }, /* BRIDGE_VLANDB_DUMPF_* bits read by br_vlan_rtm_dump() */
+};
+
+static int br_vlan_rtm_dump(struct sk_buff *skb, struct netlink_callback *cb) /* RTM_GETVLAN dump handler */
+{
+ struct nlattr *dtb[BRIDGE_VLANDB_DUMP_MAX + 1];
+ int idx = 0, err = 0, s_idx = cb->args[0]; /* s_idx: number of devices to skip */
+ struct net *net = sock_net(skb->sk);
+ struct br_vlan_msg *bvm;
+ struct net_device *dev;
+ u32 dump_flags = 0;
+
+ err = nlmsg_parse(cb->nlh, sizeof(*bvm), dtb, BRIDGE_VLANDB_DUMP_MAX,
+ br_vlan_db_dump_pol, cb->extack);
+ if (err < 0)
+ return err;
+
+ bvm = nlmsg_data(cb->nlh);
+ if (dtb[BRIDGE_VLANDB_DUMP_FLAGS])
+ dump_flags = nla_get_u32(dtb[BRIDGE_VLANDB_DUMP_FLAGS]);
+
+ rcu_read_lock();
+ if (bvm->ifindex) { /* a specific device was requested */
+ dev = dev_get_by_index_rcu(net, bvm->ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto out_err;
+ }
+ err = br_vlan_dump_dev(dev, skb, cb, dump_flags);
+ /* if the dump completed without an error we return 0 here */
+ if (err != -EMSGSIZE)
+ goto out_err;
+ } else { /* no ifindex: walk every device in the netns */
+ for_each_netdev_rcu(net, dev) {
+ if (idx < s_idx)
+ goto skip;
+
+ err = br_vlan_dump_dev(dev, skb, cb, dump_flags);
+ if (err == -EMSGSIZE)
+ break; /* idx is not advanced, so it still points at this device */
+skip:
+ idx++;
+ }
+ }
+ cb->args[0] = idx;
+ rcu_read_unlock();
+
+ return skb->len;
+
+out_err:
+ rcu_read_unlock();
+
+ return err;
+}
+
+static const struct nla_policy br_vlan_db_policy[BRIDGE_VLANDB_ENTRY_MAX + 1] = { /* used to parse BRIDGE_VLANDB_ENTRY */
+ [BRIDGE_VLANDB_ENTRY_INFO] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct bridge_vlan_info)),
+ [BRIDGE_VLANDB_ENTRY_RANGE] = { .type = NLA_U16 }, /* last vid of the range */
+ [BRIDGE_VLANDB_ENTRY_STATE] = { .type = NLA_U8 },
+ [BRIDGE_VLANDB_ENTRY_TUNNEL_INFO] = { .type = NLA_NESTED }, /* parsed with br_vlandb_tinfo_pol */
+ [BRIDGE_VLANDB_ENTRY_MCAST_ROUTER] = { .type = NLA_U8 },
+ [BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS] = { .type = NLA_REJECT }, /* only dumped, cannot be set */
+ [BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS] = { .type = NLA_U32 },
+ [BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1), /* 0 or 1 */
+};
+
+static int br_vlan_rtm_process_one(struct net_device *dev,
+ const struct nlattr *attr, /* one BRIDGE_VLANDB_ENTRY nest */
+ int cmd, struct netlink_ext_ack *extack) /* cmd: RTM_NEWVLAN or RTM_DELVLAN */
+{
+ struct bridge_vlan_info *vinfo, vrange_end, *vinfo_last = NULL;
+ struct nlattr *tb[BRIDGE_VLANDB_ENTRY_MAX + 1];
+ bool changed = false, skip_processing = false;
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_port *p = NULL;
+ int err = 0, cmdmap = 0; /* cmdmap: RTM_SETLINK/RTM_DELLINK counterpart of cmd */
+ struct net_bridge *br;
+
+ if (netif_is_bridge_master(dev)) {
+ br = netdev_priv(dev);
+ vg = br_vlan_group(br);
+ } else {
+ p = br_port_get_rtnl(dev);
+ if (WARN_ON(!p))
+ return -ENODEV;
+ br = p->br;
+ vg = nbp_vlan_group(p);
+ }
+
+ if (WARN_ON(!vg))
+ return -ENODEV;
+
+ err = nla_parse_nested(tb, BRIDGE_VLANDB_ENTRY_MAX, attr,
+ br_vlan_db_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[BRIDGE_VLANDB_ENTRY_INFO]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry info");
+ return -EINVAL;
+ }
+ memset(&vrange_end, 0, sizeof(vrange_end));
+
+ vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]);
+ if (vinfo->flags & (BRIDGE_VLAN_INFO_RANGE_BEGIN |
+ BRIDGE_VLAN_INFO_RANGE_END)) {
+ NL_SET_ERR_MSG_MOD(extack, "Old-style vlan ranges are not allowed when using RTM vlan calls");
+ return -EINVAL;
+ }
+ if (!br_vlan_valid_id(vinfo->vid, extack))
+ return -EINVAL;
+
+ if (tb[BRIDGE_VLANDB_ENTRY_RANGE]) {
+ vrange_end.vid = nla_get_u16(tb[BRIDGE_VLANDB_ENTRY_RANGE]);
+ /* validate user-provided flags without RANGE_BEGIN */
+ vrange_end.flags = BRIDGE_VLAN_INFO_RANGE_END | vinfo->flags;
+ vinfo->flags |= BRIDGE_VLAN_INFO_RANGE_BEGIN;
+
+ /* vinfo_last is the range start, vinfo the range end */
+ vinfo_last = vinfo;
+ vinfo = &vrange_end;
+
+ if (!br_vlan_valid_id(vinfo->vid, extack) ||
+ !br_vlan_valid_range(vinfo, vinfo_last, extack))
+ return -EINVAL;
+ }
+
+ switch (cmd) {
+ case RTM_NEWVLAN:
+ cmdmap = RTM_SETLINK;
+ skip_processing = !!(vinfo->flags & BRIDGE_VLAN_INFO_ONLY_OPTS); /* only options: skip br_process_vlan_info() */
+ break;
+ case RTM_DELVLAN:
+ cmdmap = RTM_DELLINK;
+ break;
+ }
+
+ if (!skip_processing) {
+ struct bridge_vlan_info *tmp_last = vinfo_last;
+
+ /* br_process_vlan_info may overwrite vinfo_last */
+ err = br_process_vlan_info(br, p, cmdmap, vinfo, &tmp_last,
+ &changed, extack);
+
+ /* notify first if anything changed */
+ if (changed)
+ br_ifinfo_notify(cmdmap, br, p);
+
+ if (err)
+ return err;
+ }
+
+ /* deal with options */
+ if (cmd == RTM_NEWVLAN) {
+ struct net_bridge_vlan *range_start, *range_end;
+
+ if (vinfo_last) {
+ range_start = br_vlan_find(vg, vinfo_last->vid);
+ range_end = br_vlan_find(vg, vinfo->vid);
+ } else {
+ range_start = br_vlan_find(vg, vinfo->vid);
+ range_end = range_start; /* single vlan: a range of one */
+ }
+
+ err = br_vlan_process_options(br, p, range_start, range_end,
+ tb, extack);
+ }
+
+ return err;
+}
+
+static int br_vlan_rtm_process(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack) /* handler for RTM_NEWVLAN and RTM_DELVLAN */
+{
+ struct net *net = sock_net(skb->sk);
+ struct br_vlan_msg *bvm;
+ struct net_device *dev;
+ struct nlattr *attr;
+ int err, vlans = 0; /* vlans: number of entries handed to a sub-handler */
+ int rem;
+
+ /* this should validate the header and check for remaining bytes */
+ err = nlmsg_parse(nlh, sizeof(*bvm), NULL, BRIDGE_VLANDB_MAX, NULL,
+ extack);
+ if (err < 0)
+ return err;
+
+ bvm = nlmsg_data(nlh);
+ dev = __dev_get_by_index(net, bvm->ifindex);
+ if (!dev)
+ return -ENODEV;
+
+ if (!netif_is_bridge_master(dev) && !netif_is_bridge_port(dev)) {
+ NL_SET_ERR_MSG_MOD(extack, "The device is not a valid bridge or bridge port");
+ return -EINVAL;
+ }
+
+ nlmsg_for_each_attr(attr, nlh, sizeof(*bvm), rem) {
+ switch (nla_type(attr)) {
+ case BRIDGE_VLANDB_ENTRY:
+ err = br_vlan_rtm_process_one(dev, attr,
+ nlh->nlmsg_type,
+ extack);
+ break;
+ case BRIDGE_VLANDB_GLOBAL_OPTIONS:
+ err = br_vlan_rtm_process_global_options(dev, attr,
+ nlh->nlmsg_type,
+ extack);
+ break;
+ default:
+ continue; /* other attribute types are skipped and not counted */
+ }
+
+ vlans++;
+ if (err)
+ break; /* stop at the first failing entry */
+ }
+ if (!vlans) {
+ NL_SET_ERR_MSG_MOD(extack, "No vlans found to process");
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+static const struct rtnl_msg_handler br_vlan_rtnl_msg_handlers[] = {
+ {THIS_MODULE, PF_BRIDGE, RTM_NEWVLAN, br_vlan_rtm_process, NULL, 0}, /* add vlans / set options */
+ {THIS_MODULE, PF_BRIDGE, RTM_DELVLAN, br_vlan_rtm_process, NULL, 0}, /* delete vlans */
+ {THIS_MODULE, PF_BRIDGE, RTM_GETVLAN, NULL, br_vlan_rtm_dump, 0}, /* dump only */
+};
+
+int br_vlan_rtnl_init(void) /* register br_vlan_rtnl_msg_handlers */
+{
+ return rtnl_register_many(br_vlan_rtnl_msg_handlers); /* result is passed straight to the caller */
+}
+
+void br_vlan_rtnl_uninit(void) /* counterpart of br_vlan_rtnl_init() */
+{
+ rtnl_unregister_many(br_vlan_rtnl_msg_handlers);
+}
diff --git a/net/bridge/br_vlan_options.c b/net/bridge/br_vlan_options.c
new file mode 100644
index 000000000000..8fa89b04ee94
--- /dev/null
+++ b/net/bridge/br_vlan_options.c
@@ -0,0 +1,740 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (c) 2020, Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <net/ip_tunnels.h>
+
+#include "br_private.h"
+#include "br_private_tunnel.h"
+
+static bool __vlan_tun_put(struct sk_buff *skb, const struct net_bridge_vlan *v)
+{ /* add v's tunnel id nest to skb; false only if an attribute could not be added */
+ __be32 tid = tunnel_id_to_key32(v->tinfo.tunnel_id);
+ struct nlattr *nest;
+
+ if (!v->tinfo.tunnel_dst)
+ return true; /* no tunnel_dst: nothing to add, not a failure */
+
+ nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_TUNNEL_INFO);
+ if (!nest)
+ return false;
+ if (nla_put_u32(skb, BRIDGE_VLANDB_TINFO_ID, be32_to_cpu(tid))) { /* id goes out in cpu byte order */
+ nla_nest_cancel(skb, nest);
+ return false;
+ }
+ nla_nest_end(skb, nest);
+
+ return true;
+}
+
+static bool __vlan_tun_can_enter_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end)
+{
+ return (!v_curr->tinfo.tunnel_dst && !range_end->tinfo.tunnel_dst) || /* neither vlan has a tunnel */
+ vlan_tunid_inrange(v_curr, range_end); /* or the helper accepts the pair of tunnel ids */
+}
+
+/* check if the options' state of v_curr allows it to enter the range (state, tunnel, mcast router) */
+bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *range_end)
+{
+ u8 range_mc_rtr = br_vlan_multicast_router(range_end);
+ u8 curr_mc_rtr = br_vlan_multicast_router(v_curr);
+
+ return v_curr->state == range_end->state &&
+ __vlan_tun_can_enter_range(v_curr, range_end) &&
+ curr_mc_rtr == range_mc_rtr;
+}
+
+bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v, /* add v's options to skb */
+ const struct net_bridge_port *p) /* p == NULL: the mcast group counters are not dumped */
+{
+ if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, br_vlan_get_state(v)) ||
+ !__vlan_tun_put(skb, v) ||
+ nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS,
+ !!(v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED)))
+ return false;
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_MCAST_ROUTER,
+ br_vlan_multicast_router(v)))
+ return false;
+ if (p && !br_multicast_port_ctx_vlan_disabled(&v->port_mcast_ctx) && /* counters only for enabled port contexts */
+ (nla_put_u32(skb, BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS,
+ br_multicast_ngroups_get(&v->port_mcast_ctx)) ||
+ nla_put_u32(skb, BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS,
+ br_multicast_ngroups_get_max(&v->port_mcast_ctx))))
+ return false;
+#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */
+
+ return true;
+}
+
+size_t br_vlan_opts_nl_size(void) /* space for every attribute br_vlan_opts_fill() can add */
+{
+ return nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_STATE */
+ + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY_TUNNEL_INFO */
+ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_TINFO_ID */
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_MCAST_ROUTER */
+ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS */
+ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS */
+#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */
+ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS */
+ + 0;
+}
+
+static int br_vlan_modify_state(struct net_bridge_vlan_group *vg, /* set v's state after validity checks */
+ struct net_bridge_vlan *v,
+ u8 state,
+ bool *changed, /* set to true only when the state really changes */
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge *br;
+
+ ASSERT_RTNL();
+
+ if (state > BR_STATE_BLOCKING) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid vlan state");
+ return -EINVAL;
+ }
+
+ if (br_vlan_is_brentry(v)) /* find the owning bridge directly or through the port */
+ br = v->br;
+ else
+ br = v->port->br;
+
+ if (br->stp_enabled == BR_KERNEL_STP) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't modify vlan state when using kernel STP");
+ return -EBUSY;
+ }
+
+ if (br_opt_get(br, BROPT_MST_ENABLED)) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't modify vlan state directly when MST is enabled");
+ return -EBUSY;
+ }
+
+ if (v->state == state)
+ return 0; /* already in that state: success, *changed untouched */
+
+ if (v->vid == br_get_pvid(vg)) /* v is the pvid: update the group's pvid state too */
+ br_vlan_set_pvid_state(vg, state);
+
+ br_vlan_set_state(v, state);
+ *changed = true;
+
+ return 0;
+}
+
+static const struct nla_policy br_vlandb_tinfo_pol[BRIDGE_VLANDB_TINFO_MAX + 1] = {
+ [BRIDGE_VLANDB_TINFO_ID] = { .type = NLA_U32 }, /* tunnel id; required for RTM_SETLINK */
+ [BRIDGE_VLANDB_TINFO_CMD] = { .type = NLA_U32 }, /* RTM_SETLINK or RTM_DELLINK */
+};
+
+static int br_vlan_modify_tunnel(const struct net_bridge_port *p, /* set or delete v's tunnel mapping */
+ struct net_bridge_vlan *v,
+ struct nlattr **tb, /* parsed BRIDGE_VLANDB_ENTRY_* attributes */
+ bool *changed,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tun_tb[BRIDGE_VLANDB_TINFO_MAX + 1], *attr;
+ struct bridge_vlan_info *vinfo;
+ u32 tun_id = 0;
+ int cmd, err;
+
+ if (!p) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't modify tunnel mapping of non-port vlans");
+ return -EINVAL;
+ }
+ if (!(p->flags & BR_VLAN_TUNNEL)) {
+ NL_SET_ERR_MSG_MOD(extack, "Port doesn't have tunnel flag set");
+ return -EINVAL;
+ }
+
+ attr = tb[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO];
+ err = nla_parse_nested(tun_tb, BRIDGE_VLANDB_TINFO_MAX, attr,
+ br_vlandb_tinfo_pol, extack);
+ if (err)
+ return err;
+
+ if (!tun_tb[BRIDGE_VLANDB_TINFO_CMD]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing tunnel command attribute");
+ return -ENOENT;
+ }
+ cmd = nla_get_u32(tun_tb[BRIDGE_VLANDB_TINFO_CMD]);
+ switch (cmd) {
+ case RTM_SETLINK:
+ if (!tun_tb[BRIDGE_VLANDB_TINFO_ID]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing tunnel id attribute");
+ return -ENOENT;
+ }
+ /* when working on vlan ranges this is the starting tunnel id */
+ tun_id = nla_get_u32(tun_tb[BRIDGE_VLANDB_TINFO_ID]);
+ /* vlan info attr is guaranteed by br_vlan_rtm_process_one */
+ vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]);
+ /* tunnel ids are mapped to each vlan in increasing order,
+ * the starting vlan is in BRIDGE_VLANDB_ENTRY_INFO and v is the
+ * current vlan, so we compute: tun_id + v - vinfo->vid
+ */
+ tun_id += v->vid - vinfo->vid;
+ break;
+ case RTM_DELLINK:
+ break; /* no id needed: tun_id stays 0 */
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported tunnel command");
+ return -EINVAL;
+ }
+
+ return br_vlan_tunnel_info(p, cmd, v->vid, tun_id, changed);
+}
+
+static int br_vlan_process_one_opts(const struct net_bridge *br, /* apply every option present in tb to v */
+ const struct net_bridge_port *p, /* NULL for non-port vlans */
+ struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *v,
+ struct nlattr **tb,
+ bool *changed, /* reset here, then set when an option is applied */
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ *changed = false;
+ if (tb[BRIDGE_VLANDB_ENTRY_STATE]) {
+ u8 state = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_STATE]);
+
+ err = br_vlan_modify_state(vg, v, state, changed, extack);
+ if (err)
+ return err;
+ }
+ if (tb[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO]) {
+ err = br_vlan_modify_tunnel(p, v, tb, changed, extack);
+ if (err)
+ return err;
+ }
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (tb[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]) {
+ u8 val;
+
+ val = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]);
+ err = br_multicast_set_vlan_router(v, val);
+ if (err)
+ return err;
+ *changed = true; /* set on success even if the value was the same */
+ }
+ if (tb[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]) {
+ u32 val;
+
+ if (!p) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't set mcast_max_groups for non-port vlans");
+ return -EINVAL;
+ }
+ if (br_multicast_port_ctx_vlan_disabled(&v->port_mcast_ctx)) {
+ NL_SET_ERR_MSG_MOD(extack, "Multicast snooping disabled on this VLAN");
+ return -EINVAL;
+ }
+
+ val = nla_get_u32(tb[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]);
+ br_multicast_ngroups_set_max(&v->port_mcast_ctx, val);
+ *changed = true;
+ }
+#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */
+
+ if (tb[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS]) {
+ bool enabled = v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED;
+ bool val = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS]);
+
+ if (!p) {
+ NL_SET_ERR_MSG_MOD(extack, "Can't set neigh_suppress for non-port vlans");
+ return -EINVAL;
+ }
+
+ if (val != enabled) {
+ v->priv_flags ^= BR_VLFLAG_NEIGH_SUPPRESS_ENABLED; /* val != enabled, so the xor flips the bit to val */
+ *changed = true;
+ }
+ }
+
+ return 0;
+}
+
+int br_vlan_process_options(const struct net_bridge *br, /* apply tb to every vlan in the range, notify changes */
+ const struct net_bridge_port *p, /* NULL: use the bridge's own vlan group */
+ struct net_bridge_vlan *range_start,
+ struct net_bridge_vlan *range_end,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan *v, *curr_start = NULL, *curr_end = NULL; /* curr_*: changed range not yet notified */
+ struct net_bridge_vlan_group *vg;
+ int vid, err = 0;
+ u16 pvid;
+
+ if (p)
+ vg = nbp_vlan_group(p);
+ else
+ vg = br_vlan_group(br);
+
+ if (!range_start || !br_vlan_should_use(range_start)) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan range start doesn't exist, can't process options");
+ return -ENOENT;
+ }
+ if (!range_end || !br_vlan_should_use(range_end)) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan range end doesn't exist, can't process options");
+ return -ENOENT;
+ }
+
+ pvid = br_get_pvid(vg);
+ for (vid = range_start->vid; vid <= range_end->vid; vid++) {
+ bool changed = false;
+
+ v = br_vlan_find(vg, vid);
+ if (!v || !br_vlan_should_use(v)) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process options");
+ err = -ENOENT;
+ break;
+ }
+
+ err = br_vlan_process_one_opts(br, p, vg, v, tb, &changed,
+ extack);
+ if (err)
+ break;
+
+ if (changed) {
+ /* vlan options changed, check for range */
+ if (!curr_start) {
+ curr_start = v;
+ curr_end = v;
+ continue;
+ }
+
+ if (v->vid == pvid ||
+ !br_vlan_can_enter_range(v, curr_end)) {
+ br_vlan_notify(br, p, curr_start->vid,
+ curr_end->vid, RTM_NEWVLAN);
+ curr_start = v; /* v opens the next pending range */
+ }
+ curr_end = v;
+ } else {
+ /* nothing changed and nothing to notify yet */
+ if (!curr_start)
+ continue;
+
+ br_vlan_notify(br, p, curr_start->vid, curr_end->vid,
+ RTM_NEWVLAN);
+ curr_start = NULL;
+ curr_end = NULL;
+ }
+ }
+ if (curr_start) /* flush the pending range; also reached after a break on error */
+ br_vlan_notify(br, p, curr_start->vid, curr_end->vid,
+ RTM_NEWVLAN);
+
+ return err;
+}
+
+bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr,
+ const struct net_bridge_vlan *r_end)
+{
+ return v_curr->vid - r_end->vid == 1 && /* v_curr directly follows r_end */
+ v_curr->msti == r_end->msti && /* same msti */
+ ((v_curr->priv_flags ^ r_end->priv_flags) &
+ BR_VLFLAG_GLOBAL_MCAST_ENABLED) == 0 && /* same GLOBAL_MCAST_ENABLED bit */
+ br_multicast_ctx_options_equal(&v_curr->br_mcast_ctx,
+ &r_end->br_mcast_ctx);
+}
+
+bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, /* fill one BRIDGE_VLANDB_GLOBAL_OPTIONS nest */
+ const struct net_bridge_vlan *v_opts) /* vlan whose options are dumped for the range */
+{
+ struct nlattr *nest2 __maybe_unused; /* only used with CONFIG_BRIDGE_IGMP_SNOOPING */
+ u64 clockval __maybe_unused;
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, BRIDGE_VLANDB_GLOBAL_OPTIONS);
+ if (!nest)
+ return false;
+
+ if (nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_ID, vid))
+ goto out_err;
+
+ if (vid_range && vid < vid_range && /* range attr only when the range covers more than vid */
+ nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_RANGE, vid_range))
+ goto out_err;
+
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING,
+ !!(v_opts->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) ||
+ nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION,
+ v_opts->br_mcast_ctx.multicast_igmp_version) ||
+ nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT,
+ v_opts->br_mcast_ctx.multicast_last_member_count) ||
+ nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT,
+ v_opts->br_mcast_ctx.multicast_startup_query_count) ||
+ nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER,
+ v_opts->br_mcast_ctx.multicast_querier) ||
+ br_multicast_dump_querier_state(skb, &v_opts->br_mcast_ctx,
+ BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE))
+ goto out_err;
+
+ clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_last_member_interval); /* intervals are dumped as clock_t */
+ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL,
+ clockval, BRIDGE_VLANDB_GOPTS_PAD))
+ goto out_err;
+ clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_membership_interval);
+ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL,
+ clockval, BRIDGE_VLANDB_GOPTS_PAD))
+ goto out_err;
+ clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_querier_interval);
+ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL,
+ clockval, BRIDGE_VLANDB_GOPTS_PAD))
+ goto out_err;
+ clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_query_interval);
+ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL,
+ clockval, BRIDGE_VLANDB_GOPTS_PAD))
+ goto out_err;
+ clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_query_response_interval);
+ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL,
+ clockval, BRIDGE_VLANDB_GOPTS_PAD))
+ goto out_err;
+ clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_startup_query_interval);
+ if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL,
+ clockval, BRIDGE_VLANDB_GOPTS_PAD))
+ goto out_err;
+
+ if (br_rports_have_mc_router(&v_opts->br_mcast_ctx)) { /* router ports nest only when the check is true */
+ nest2 = nla_nest_start(skb,
+ BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS);
+ if (!nest2)
+ goto out_err;
+
+ rcu_read_lock();
+ if (br_rports_fill_info(skb, &v_opts->br_mcast_ctx)) {
+ rcu_read_unlock();
+ nla_nest_cancel(skb, nest2);
+ goto out_err;
+ }
+ rcu_read_unlock();
+
+ nla_nest_end(skb, nest2);
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION,
+ v_opts->br_mcast_ctx.multicast_mld_version))
+ goto out_err;
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */
+
+ if (nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_MSTI, v_opts->msti))
+ goto out_err;
+
+ nla_nest_end(skb, nest);
+
+ return true;
+
+out_err:
+ nla_nest_cancel(skb, nest); /* drop the partially filled nest */
+ return false;
+}
+
+static size_t rtnl_vlan_global_opts_nlmsg_size(const struct net_bridge_vlan *v) /* skb size for one global options notification */
+{
+ return NLMSG_ALIGN(sizeof(struct br_vlan_msg))
+ + nla_total_size(0) /* BRIDGE_VLANDB_GLOBAL_OPTIONS */
+ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_ID */
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING */
+ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION */
+ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION */
+ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT */
+ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT */
+ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL */
+ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL */
+ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL */
+ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL */
+ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL */
+ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL */
+ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER */
+ + br_multicast_querier_state_size() /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE */
+ + nla_total_size(0) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS nest header */
+ + br_rports_size(&v->br_mcast_ctx) /* contents of BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */
+#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */
+ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_MSTI */
+ + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */
+}
+
+static void br_vlan_global_opts_notify(const struct net_bridge *br, /* send RTM_NEWVLAN with the global options of vid */
+ u16 vid, u16 vid_range)
+{
+ struct net_bridge_vlan *v;
+ struct br_vlan_msg *bvm;
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ int err = -ENOBUFS; /* reported if the skb allocation fails */
+
+ /* right now notifications are done only with rtnl held */
+ ASSERT_RTNL();
+
+ /* need to find the vlan due to flags/options */
+ v = br_vlan_find(br_vlan_group(br), vid);
+ if (!v)
+ return; /* vlan not found: nothing is sent and no error is set */
+
+ skb = nlmsg_new(rtnl_vlan_global_opts_nlmsg_size(v), GFP_KERNEL);
+ if (!skb)
+ goto out_err;
+
+ err = -EMSGSIZE; /* every later failure is reported as a size problem */
+ nlh = nlmsg_put(skb, 0, 0, RTM_NEWVLAN, sizeof(*bvm), 0);
+ if (!nlh)
+ goto out_err;
+ bvm = nlmsg_data(nlh);
+ memset(bvm, 0, sizeof(*bvm));
+ bvm->family = AF_BRIDGE;
+ bvm->ifindex = br->dev->ifindex;
+
+ if (!br_vlan_global_opts_fill(skb, vid, vid_range, v))
+ goto out_err;
+
+ nlmsg_end(skb, nlh);
+ rtnl_notify(skb, dev_net(br->dev), 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL);
+ return;
+
+out_err:
+ rtnl_set_sk_err(dev_net(br->dev), RTNLGRP_BRVLAN, err); /* report the failure to the RTNLGRP_BRVLAN group */
+ kfree_skb(skb); /* skb is NULL here when nlmsg_new() failed */
+}
+
+static int br_vlan_process_global_one_opts(const struct net_bridge *br, /* apply every global option in tb to v */
+ struct net_bridge_vlan_group *vg,
+ struct net_bridge_vlan *v,
+ struct nlattr **tb, /* parsed BRIDGE_VLANDB_GOPTS_* attributes */
+ bool *changed, /* reset here, then set when an option is applied */
+ struct netlink_ext_ack *extack)
+{
+ int err __maybe_unused;
+
+ *changed = false;
+#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]) {
+ u8 mc_snooping;
+
+ mc_snooping = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]);
+ if (br_multicast_toggle_global_vlan(v, !!mc_snooping)) /* only a true return counts as a change */
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]) {
+ u8 ver;
+
+ ver = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]);
+ err = br_multicast_set_igmp_version(&v->br_mcast_ctx, ver);
+ if (err)
+ return err;
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]) {
+ u32 cnt;
+
+ cnt = nla_get_u32(tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]);
+ v->br_mcast_ctx.multicast_last_member_count = cnt;
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]) {
+ u32 cnt;
+
+ cnt = nla_get_u32(tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]);
+ v->br_mcast_ctx.multicast_startup_query_count = cnt;
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]) {
+ u64 val;
+
+ val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]);
+ v->br_mcast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); /* stored in jiffies */
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]) {
+ u64 val;
+
+ val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]);
+ v->br_mcast_ctx.multicast_membership_interval = clock_t_to_jiffies(val);
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]) {
+ u64 val;
+
+ val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]);
+ v->br_mcast_ctx.multicast_querier_interval = clock_t_to_jiffies(val);
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]) {
+ u64 val;
+
+ val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]);
+ br_multicast_set_query_intvl(&v->br_mcast_ctx, val); /* raw value is handed to the setter */
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL]) {
+ u64 val;
+
+ val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL]);
+ v->br_mcast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val);
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]) {
+ u64 val;
+
+ val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]);
+ br_multicast_set_startup_query_intvl(&v->br_mcast_ctx, val); /* raw value is handed to the setter */
+ *changed = true;
+ }
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]) {
+ u8 val;
+
+ val = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]);
+ err = br_multicast_set_querier(&v->br_mcast_ctx, val);
+ if (err)
+ return err;
+ *changed = true;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) {
+ u8 ver;
+
+ ver = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]);
+ err = br_multicast_set_mld_version(&v->br_mcast_ctx, ver);
+ if (err)
+ return err;
+ *changed = true;
+ }
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+#endif /* CONFIG_BRIDGE_IGMP_SNOOPING */
+ if (tb[BRIDGE_VLANDB_GOPTS_MSTI]) {
+ u16 msti;
+
+ msti = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_MSTI]);
+ err = br_mst_vlan_set_msti(v, msti);
+ if (err)
+ return err;
+ *changed = true;
+ }
+
+ return 0;
+}
+
+static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { /* used to parse BRIDGE_VLANDB_GLOBAL_OPTIONS */
+ [BRIDGE_VLANDB_GOPTS_ID] = { .type = NLA_U16 }, /* first vid, mandatory */
+ [BRIDGE_VLANDB_GOPTS_RANGE] = { .type = NLA_U16 }, /* last vid, must be greater than the id */
+ [BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING] = { .type = NLA_U8 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION] = { .type = NLA_U8 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL] = { .type = NLA_U64 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER] = { .type = NLA_U8 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION] = { .type = NLA_U8 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL] = { .type = NLA_U64 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 },
+ [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 },
+ [BRIDGE_VLANDB_GOPTS_MSTI] = NLA_POLICY_MAX(NLA_U16, VLAN_N_VID - 1), /* capped at VLAN_N_VID - 1 */
+};
+
+int br_vlan_rtm_process_global_options(struct net_device *dev, /* must be a bridge master */
+ const struct nlattr *attr, /* one BRIDGE_VLANDB_GLOBAL_OPTIONS nest */
+ int cmd, /* only RTM_NEWVLAN is accepted */
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan *v, *curr_start = NULL, *curr_end = NULL; /* curr_*: changed range not yet notified */
+ struct nlattr *tb[BRIDGE_VLANDB_GOPTS_MAX + 1];
+ struct net_bridge_vlan_group *vg;
+ u16 vid, vid_range = 0;
+ struct net_bridge *br;
+ int err = 0;
+
+ if (cmd != RTM_NEWVLAN) {
+ NL_SET_ERR_MSG_MOD(extack, "Global vlan options support only set operation");
+ return -EINVAL;
+ }
+ if (!netif_is_bridge_master(dev)) {
+ NL_SET_ERR_MSG_MOD(extack, "Global vlan options can only be set on bridge device");
+ return -EINVAL;
+ }
+ br = netdev_priv(dev);
+ vg = br_vlan_group(br);
+ if (WARN_ON(!vg))
+ return -ENODEV;
+
+ err = nla_parse_nested(tb, BRIDGE_VLANDB_GOPTS_MAX, attr,
+ br_vlan_db_gpol, extack);
+ if (err)
+ return err;
+
+ if (!tb[BRIDGE_VLANDB_GOPTS_ID]) {
+ NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry id");
+ return -EINVAL;
+ }
+ vid = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_ID]);
+ if (!br_vlan_valid_id(vid, extack))
+ return -EINVAL;
+
+ if (tb[BRIDGE_VLANDB_GOPTS_RANGE]) {
+ vid_range = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_RANGE]);
+ if (!br_vlan_valid_id(vid_range, extack))
+ return -EINVAL;
+ if (vid >= vid_range) {
+ NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id");
+ return -EINVAL;
+ }
+ } else {
+ vid_range = vid; /* no range attribute: process just vid */
+ }
+
+ for (; vid <= vid_range; vid++) {
+ bool changed = false;
+
+ v = br_vlan_find(vg, vid);
+ if (!v) {
+ NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process global options");
+ err = -ENOENT;
+ break;
+ }
+
+ err = br_vlan_process_global_one_opts(br, vg, v, tb, &changed,
+ extack);
+ if (err)
+ break;
+
+ if (changed) {
+ /* vlan options changed, check for range */
+ if (!curr_start) {
+ curr_start = v;
+ curr_end = v;
+ continue;
+ }
+
+ if (!br_vlan_global_opts_can_enter_range(v, curr_end)) {
+ br_vlan_global_opts_notify(br, curr_start->vid,
+ curr_end->vid);
+ curr_start = v; /* v opens the next pending range */
+ }
+ curr_end = v;
+ } else {
+ /* nothing changed and nothing to notify yet */
+ if (!curr_start)
+ continue;
+
+ br_vlan_global_opts_notify(br, curr_start->vid,
+ curr_end->vid);
+ curr_start = NULL;
+ curr_end = NULL;
+ }
+ }
+ if (curr_start) /* flush the pending range; also reached after a break on error */
+ br_vlan_global_opts_notify(br, curr_start->vid, curr_end->vid);
+
+ return err;
+}
diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c
index 6d2c4eed2dc8..a966a6ec8263 100644
--- a/net/bridge/br_vlan_tunnel.c
+++ b/net/bridge/br_vlan_tunnel.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Bridge per vlan tunnel port dst_metadata handling code
*
* Authors:
* Roopa Prabhu <roopa@cumulusnetworks.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -34,48 +30,55 @@ static const struct rhashtable_params br_vlan_tunnel_rht_params = {
.key_offset = offsetof(struct net_bridge_vlan, tinfo.tunnel_id),
.key_len = sizeof(__be64),
.nelem_hint = 3,
- .locks_mul = 1,
.obj_cmpfn = br_vlan_tunid_cmp,
.automatic_shrinking = true,
};
+/* Find the vlan stored in @tbl under @tunnel_id. The key is passed as a
+ * __be64, the same width br_vlan_tunnel_rht_params declares for its key.
+ * Returns the result of rhashtable_lookup_fast(); the caller visible in
+ * this file treats a NULL result as "no vlan for this tunnel id".
+ */
static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl,
- u64 tunnel_id)
+ __be64 tunnel_id)
{
return rhashtable_lookup_fast(tbl, &tunnel_id,
br_vlan_tunnel_rht_params);
}
+/* Clear the tunnel binding stored in @vlan: tunnel_id is reset to 0,
+ * tunnel_dst is set to NULL, and the reference on the metadata dst that
+ * was attached is dropped last.
+ * NOTE(review): the old pointer is read with rtnl_dereference(), so
+ * callers presumably hold RTNL - confirm at the call sites.
+ */
+static void vlan_tunnel_info_release(struct net_bridge_vlan *vlan)
+{
+ struct metadata_dst *tdst = rtnl_dereference(vlan->tinfo.tunnel_dst);
+
+ WRITE_ONCE(vlan->tinfo.tunnel_id, 0);
+ RCU_INIT_POINTER(vlan->tinfo.tunnel_dst, NULL);
+ dst_release(&tdst->dst);
+}
+
+/* Remove @vlan's tunnel mapping from @vg. Does nothing when no
+ * tunnel_dst is attached; otherwise the vlan is unlinked from
+ * vg->tunnel_hash first and its tunnel id/dst are cleared afterwards.
+ */
void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg,
struct net_bridge_vlan *vlan)
{
- if (!vlan->tinfo.tunnel_dst)
+ if (!rcu_access_pointer(vlan->tinfo.tunnel_dst))
return;
rhashtable_remove_fast(&vg->tunnel_hash, &vlan->tnode,
br_vlan_tunnel_rht_params);
- vlan->tinfo.tunnel_id = 0;
- dst_release(&vlan->tinfo.tunnel_dst->dst);
- vlan->tinfo.tunnel_dst = NULL;
+ vlan_tunnel_info_release(vlan);
}
static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg,
struct net_bridge_vlan *vlan, u32 tun_id)
{
- struct metadata_dst *metadata = NULL;
+ struct metadata_dst *metadata = rtnl_dereference(vlan->tinfo.tunnel_dst);
__be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id));
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
int err;
- if (vlan->tinfo.tunnel_dst)
+ if (metadata)
return -EEXIST;
- metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY,
- key, 0);
+ __set_bit(IP_TUNNEL_KEY_BIT, flags);
+ metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, flags, key, 0);
if (!metadata)
return -EINVAL;
metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE;
- vlan->tinfo.tunnel_dst = metadata;
- vlan->tinfo.tunnel_id = key;
+ rcu_assign_pointer(vlan->tinfo.tunnel_dst, metadata);
+ WRITE_ONCE(vlan->tinfo.tunnel_id, key);
err = rhashtable_lookup_insert_fast(&vg->tunnel_hash, &vlan->tnode,
br_vlan_tunnel_rht_params);
@@ -84,9 +87,7 @@ static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg,
return 0;
out:
- dst_release(&vlan->tinfo.tunnel_dst->dst);
- vlan->tinfo.tunnel_dst = NULL;
- vlan->tinfo.tunnel_id = 0;
+ vlan_tunnel_info_release(vlan);
return err;
}
@@ -94,7 +95,8 @@ out:
/* Must be protected by RTNL.
* Must be called with vid in range from 1 to 4094 inclusive.
*/
-int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id)
+int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid,
+ u32 tun_id)
{
struct net_bridge_vlan_group *vg;
struct net_bridge_vlan *vlan;
@@ -112,7 +114,7 @@ int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id)
/* Must be protected by RTNL.
* Must be called with vid in range from 1 to 4094 inclusive.
*/
-int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid)
+int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port, u16 vid)
{
struct net_bridge_vlan_group *vg;
struct net_bridge_vlan *v;
@@ -157,41 +159,43 @@ void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg)
rhashtable_destroy(&vg->tunnel_hash);
}
-int br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
- struct net_bridge_port *p,
- struct net_bridge_vlan_group *vg)
+/* Tag an ingress skb with the vlan id mapped to its tunnel id.
+ * Leaves the skb untouched when @vg is NULL, the skb carries no tunnel
+ * info, it is already vlan tagged, or no vlan in vg->tunnel_hash matches
+ * the tunnel id. The function returns nothing: every early exit used to
+ * return 0, as did the success path.
+ */
+void br_handle_ingress_vlan_tunnel(struct sk_buff *skb,
+ struct net_bridge_port *p,
+ struct net_bridge_vlan_group *vg)
{
struct ip_tunnel_info *tinfo = skb_tunnel_info(skb);
struct net_bridge_vlan *vlan;
if (!vg || !tinfo)
- return 0;
+ return;
/* if already tagged, ignore */
if (skb_vlan_tagged(skb))
- return 0;
+ return;
/* lookup vid, given tunnel id */
vlan = br_vlan_tunnel_lookup(&vg->tunnel_hash, tinfo->key.tun_id);
if (!vlan)
- return 0;
+ return;
+ /* match: drop the skb's dst and set the tag using the bridge's vlan_proto */
skb_dst_drop(skb);
__vlan_hwaccel_put_tag(skb, p->br->vlan_proto, vlan->vid);
-
- return 0;
}
int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
struct net_bridge_vlan *vlan)
{
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+ struct metadata_dst *tunnel_dst;
+ __be64 tunnel_id;
int err;
- if (!vlan || !vlan->tinfo.tunnel_id)
+ if (!vlan)
return 0;
- if (unlikely(!skb_vlan_tag_present(skb)))
+ tunnel_id = READ_ONCE(vlan->tinfo.tunnel_id);
+ if (!tunnel_id || unlikely(!skb_vlan_tag_present(skb)))
return 0;
skb_dst_drop(skb);
@@ -199,7 +203,25 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb,
if (err)
return err;
- skb_dst_set(skb, dst_clone(&vlan->tinfo.tunnel_dst->dst));
+ if (BR_INPUT_SKB_CB(skb)->backup_nhid) {
+ __set_bit(IP_TUNNEL_KEY_BIT, flags);
+ tunnel_dst = __ip_tun_set_dst(0, 0, 0, 0, 0, flags,
+ tunnel_id, 0);
+ if (!tunnel_dst)
+ return -ENOMEM;
+
+ tunnel_dst->u.tun_info.mode |= IP_TUNNEL_INFO_TX |
+ IP_TUNNEL_INFO_BRIDGE;
+ tunnel_dst->u.tun_info.key.nhid =
+ BR_INPUT_SKB_CB(skb)->backup_nhid;
+ skb_dst_set(skb, &tunnel_dst->dst);
+
+ return 0;
+ }
+
+ tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst);
+ if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst))
+ skb_dst_set(skb, &tunnel_dst->dst);
return 0;
}
diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig
index 9a0159aebe1a..4fd5a6ea26b4 100644
--- a/net/bridge/netfilter/Kconfig
+++ b/net/bridge/netfilter/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Bridge netfilter configuration
#
@@ -5,21 +6,50 @@
menuconfig NF_TABLES_BRIDGE
depends on BRIDGE && NETFILTER && NF_TABLES
select NETFILTER_FAMILY_BRIDGE
- bool "Ethernet Bridge nf_tables support"
+ tristate "Ethernet Bridge nf_tables support"
if NF_TABLES_BRIDGE
+
+config NFT_BRIDGE_META
+ tristate "Netfilter nf_table bridge meta support"
+ help
+ Add support for bridge dedicated meta key.
+
config NFT_BRIDGE_REJECT
tristate "Netfilter nf_tables bridge reject support"
- depends on NFT_REJECT && NFT_REJECT_IPV4 && NFT_REJECT_IPV6
+ depends on NFT_REJECT
+ depends on NF_REJECT_IPV4
+ depends on NF_REJECT_IPV6
help
Add support to reject packets.
-config NF_LOG_BRIDGE
- tristate "Bridge packet logging"
- select NF_LOG_COMMON
-
endif # NF_TABLES_BRIDGE
+config NF_CONNTRACK_BRIDGE
+ tristate "IPv4/IPV6 bridge connection tracking support"
+ depends on NF_CONNTRACK
+ default n
+ help
+ Connection tracking keeps a record of what packets have passed
+ through your machine, in order to figure out how they are related
+ into connections. This is used to enhance packet filtering via
+ stateful policies. Enable this if you want native tracking from
+ the bridge. This provides a replacement for the `br_netfilter'
+ infrastructure.
+
+ To compile it as a module, choose M here. If unsure, say N.
+
+# old sockopt interface and eval loop
+config BRIDGE_NF_EBTABLES_LEGACY
+ tristate "Legacy EBTABLES support"
+ depends on BRIDGE && NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
+ default n
+ help
+ Legacy ebtables packet/frame classifier.
+ This is not needed if you are using ebtables over nftables
+ (iptables-nft).
+
menuconfig BRIDGE_NF_EBTABLES
tristate "Ethernet Bridge tables (ebtables) support"
depends on BRIDGE && NETFILTER && NETFILTER_XTABLES
@@ -36,6 +66,7 @@ if BRIDGE_NF_EBTABLES
#
config BRIDGE_EBT_BROUTE
tristate "ebt: broute table support"
+ depends on BRIDGE_NF_EBTABLES_LEGACY
help
The ebtables broute table is used to define rules that decide between
bridging and routing frames, giving Linux the functionality of a
@@ -46,6 +77,7 @@ config BRIDGE_EBT_BROUTE
config BRIDGE_EBT_T_FILTER
tristate "ebt: filter table support"
+ depends on BRIDGE_NF_EBTABLES_LEGACY
help
The ebtables filter table is used to define frame filtering rules at
local input, forwarding and local output. See the man page for
@@ -55,6 +87,7 @@ config BRIDGE_EBT_T_FILTER
config BRIDGE_EBT_T_NAT
tristate "ebt: nat table support"
+ depends on BRIDGE_NF_EBTABLES_LEGACY
help
The ebtables nat table is used to define rules that alter the MAC
source address (MAC SNAT) or the MAC destination address (MAC DNAT).
@@ -113,7 +146,7 @@ config BRIDGE_EBT_LIMIT
equivalent of the iptables limit match.
If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
config BRIDGE_EBT_MARK
tristate "ebt: mark filter support"
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile
index 9b868861f21a..b9a1303da977 100644
--- a/net/bridge/netfilter/Makefile
+++ b/net/bridge/netfilter/Makefile
@@ -3,12 +3,13 @@
# Makefile for the netfilter modules for Link Layer filtering on a bridge.
#
+obj-$(CONFIG_NFT_BRIDGE_META) += nft_meta_bridge.o
obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o
-# packet logging
-obj-$(CONFIG_NF_LOG_BRIDGE) += nf_log_bridge.o
+# connection tracking
+obj-$(CONFIG_NF_CONNTRACK_BRIDGE) += nf_conntrack_bridge.o
-obj-$(CONFIG_BRIDGE_NF_EBTABLES) += ebtables.o
+obj-$(CONFIG_BRIDGE_NF_EBTABLES_LEGACY) += ebtables.o
# tables
obj-$(CONFIG_BRIDGE_EBT_BROUTE) += ebtable_broute.o
diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c
index 5fc4affd9fdb..68c2519bdc52 100644
--- a/net/bridge/netfilter/ebt_802_3.c
+++ b/net/bridge/netfilter/ebt_802_3.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* 802_3
*
@@ -10,7 +11,13 @@
#include <linux/module.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_bridge/ebtables.h>
-#include <linux/netfilter_bridge/ebt_802_3.h>
+#include <linux/skbuff.h>
+#include <uapi/linux/netfilter_bridge/ebt_802_3.h>
+
+/* Return the skb's MAC header cast to struct ebt_802_3_hdr. */
+static struct ebt_802_3_hdr *ebt_802_3_hdr(const struct sk_buff *skb)
+{
+ return (struct ebt_802_3_hdr *)skb_mac_header(skb);
+}
static bool
ebt_802_3_mt(const struct sk_buff *skb, struct xt_action_param *par)
diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c
index 620e54f08296..96f7243b6314 100644
--- a/net/bridge/netfilter/ebt_among.c
+++ b/net/bridge/netfilter/ebt_among.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_among
*
diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c
index 227142282b45..0707cc00fe8f 100644
--- a/net/bridge/netfilter/ebt_arp.c
+++ b/net/bridge/netfilter/ebt_arp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_arp
*
diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c
index db85230e49c3..d9e77e2500cd 100644
--- a/net/bridge/netfilter/ebt_arpreply.c
+++ b/net/bridge/netfilter/ebt_arpreply.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_arpreply
*
diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c
index dfc86a0199da..3fda71a8579d 100644
--- a/net/bridge/netfilter/ebt_dnat.c
+++ b/net/bridge/netfilter/ebt_dnat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_dnat
*
@@ -19,9 +20,8 @@ static unsigned int
ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_nat_info *info = par->targinfo;
- struct net_device *dev;
- if (!skb_make_writable(skb, 0))
+ if (skb_ensure_writable(skb, 0))
return EBT_DROP;
ether_addr_copy(eth_hdr(skb)->h_dest, info->mac);
@@ -32,10 +32,22 @@ ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par)
else
skb->pkt_type = PACKET_MULTICAST;
} else {
- if (xt_hooknum(par) != NF_BR_BROUTING)
- dev = br_port_get_rcu(xt_in(par))->br->dev;
- else
+ const struct net_device *dev;
+
+ switch (xt_hooknum(par)) {
+ case NF_BR_BROUTING:
dev = xt_in(par);
+ break;
+ case NF_BR_PRE_ROUTING:
+ dev = br_port_get_rcu(xt_in(par))->br->dev;
+ break;
+ default:
+ dev = NULL;
+ break;
+ }
+
+ if (!dev) /* NF_BR_LOCAL_OUT */
+ return info->target;
if (ether_addr_equal(info->mac, dev->dev_addr))
skb->pkt_type = PACKET_HOST;
diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
index ffaa8ce2e724..df372496c1c1 100644
--- a/net/bridge/netfilter/ebt_ip.c
+++ b/net/bridge/netfilter/ebt_ip.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_ip
*
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index 2a5a52a53ec4..f3225bc31f6c 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_ip6
*
diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c
index 165b9d678cf1..e16183bd1bb8 100644
--- a/net/bridge/netfilter/ebt_limit.c
+++ b/net/bridge/netfilter/ebt_limit.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_limit
*
@@ -86,7 +87,7 @@ static int ebt_limit_mt_check(const struct xt_mtchk_param *par)
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
/*
* no conversion function needed --
* only avg/burst have meaningful values in userspace.
@@ -106,7 +107,7 @@ static struct xt_match ebt_limit_mt_reg __read_mostly = {
.checkentry = ebt_limit_mt_check,
.matchsize = sizeof(struct ebt_limit_info),
.usersize = offsetof(struct ebt_limit_info, prev),
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
.compatsize = sizeof(struct ebt_compat_limit_info),
#endif
.me = THIS_MODULE,
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 707caea39743..e2eea1daaf8b 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_log
*
diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c
index 19f0f9592d32..8cf653c72fd8 100644
--- a/net/bridge/netfilter/ebt_mark.c
+++ b/net/bridge/netfilter/ebt_mark.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_mark
*
@@ -52,7 +53,7 @@ static int ebt_mark_tg_check(const struct xt_tgchk_param *par)
return -EINVAL;
return 0;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct compat_ebt_mark_t_info {
compat_ulong_t mark;
compat_uint_t target;
@@ -86,7 +87,7 @@ static struct xt_target ebt_mark_tg_reg __read_mostly = {
.target = ebt_mark_tg,
.checkentry = ebt_mark_tg_check,
.targetsize = sizeof(struct ebt_mark_t_info),
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
.compatsize = sizeof(struct compat_ebt_mark_t_info),
.compat_from_user = mark_tg_compat_from_user,
.compat_to_user = mark_tg_compat_to_user,
diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c
index d98baefc4c7e..5872e73c741e 100644
--- a/net/bridge/netfilter/ebt_mark_m.c
+++ b/net/bridge/netfilter/ebt_mark_m.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_mark_m
*
@@ -36,7 +37,7 @@ static int ebt_mark_mt_check(const struct xt_mtchk_param *par)
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct compat_ebt_mark_m_info {
compat_ulong_t mark, mask;
uint8_t invert, bitmask;
@@ -74,7 +75,7 @@ static struct xt_match ebt_mark_mt_reg __read_mostly = {
.match = ebt_mark_mt,
.checkentry = ebt_mark_mt_check,
.matchsize = sizeof(struct ebt_mark_m_info),
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
.compatsize = sizeof(struct compat_ebt_mark_m_info),
.compat_from_user = mark_mt_compat_from_user,
.compat_to_user = mark_mt_compat_to_user,
diff --git a/net/bridge/netfilter/ebt_nflog.c b/net/bridge/netfilter/ebt_nflog.c
index da1c2fdc08c8..61bf8f4465ab 100644
--- a/net/bridge/netfilter/ebt_nflog.c
+++ b/net/bridge/netfilter/ebt_nflog.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_nflog
*
diff --git a/net/bridge/netfilter/ebt_pkttype.c b/net/bridge/netfilter/ebt_pkttype.c
index 496a56515307..c9e306119ee3 100644
--- a/net/bridge/netfilter/ebt_pkttype.c
+++ b/net/bridge/netfilter/ebt_pkttype.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_pkttype
*
diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c
index a7223eaf490b..307790562b49 100644
--- a/net/bridge/netfilter/ebt_redirect.c
+++ b/net/bridge/netfilter/ebt_redirect.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_redirect
*
@@ -20,7 +21,7 @@ ebt_redirect_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_redirect_info *info = par->targinfo;
- if (!skb_make_writable(skb, 0))
+ if (skb_ensure_writable(skb, 0))
return EBT_DROP;
if (xt_hooknum(par) != NF_BR_BROUTING)
diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c
index 11cf9e9e9222..7dfbcdfc30e5 100644
--- a/net/bridge/netfilter/ebt_snat.c
+++ b/net/bridge/netfilter/ebt_snat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_snat
*
@@ -21,7 +22,7 @@ ebt_snat_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ebt_nat_info *info = par->targinfo;
- if (!skb_make_writable(skb, 0))
+ if (skb_ensure_writable(skb, 0))
return EBT_DROP;
ether_addr_copy(eth_hdr(skb)->h_source, info->mac);
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index 46c1fe7637ea..8f68afda5f81 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebt_stp
*
@@ -14,7 +15,6 @@
#include <linux/netfilter_bridge/ebt_stp.h>
#define BPDU_TYPE_CONFIG 0
-#define BPDU_TYPE_TCN 0x80
struct stp_header {
u8 dsap;
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
index 98c221dbf059..80ede370afed 100644
--- a/net/bridge/netfilter/ebt_vlan.c
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -1,20 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Description: EBTables 802.1Q match extension kernelspace module.
* Authors: Nick Fedchik <nick@fedchik.org.ua>
* Bart De Schuymer <bdschuym@pandora.be>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/if_ether.h>
diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c
index 276b60262981..741360219552 100644
--- a/net/bridge/netfilter/ebtable_broute.c
+++ b/net/bridge/netfilter/ebtable_broute.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebtable_broute
*
@@ -15,6 +16,8 @@
#include <linux/module.h>
#include <linux/if_bridge.h>
+#include "../br_private.h"
+
/* EBT_ACCEPT means the frame will be bridged
* EBT_DROP means the frame will be routed
*/
@@ -33,72 +36,103 @@ static struct ebt_replace_kernel initial_table = {
.entries = (char *)&initial_chain,
};
-static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
-{
- if (valid_hooks & ~(1 << NF_BR_BROUTING))
- return -EINVAL;
- return 0;
-}
-
static const struct ebt_table broute_table = {
.name = "broute",
.table = &initial_table,
.valid_hooks = 1 << NF_BR_BROUTING,
- .check = check,
.me = THIS_MODULE,
};
-static int ebt_broute(struct sk_buff *skb)
+/* Netfilter hook function for the broute table.
+ * @priv is handed on unchanged to ebt_do_table(); @s supplies the input
+ * device and the netns.
+ * Returns NF_ACCEPT without running the table when skb->dev has no bridge
+ * port or the port is not in BR_STATE_FORWARDING. Otherwise any table
+ * verdict other than NF_DROP is returned as is, and NF_DROP is turned
+ * into NF_ACCEPT with the br_netfilter_broute flag set on the skb.
+ */
+static unsigned int ebt_broute(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *s)
{
+ struct net_bridge_port *p = br_port_get_rcu(skb->dev);
struct nf_hook_state state;
+ unsigned char *dest;
int ret;
+ if (!p || p->state != BR_STATE_FORWARDING)
+ return NF_ACCEPT;
+
+ /* run the rules under a local state whose hook is NF_BR_BROUTING,
+ * taking only the input device and netns from @s
+ */
nf_hook_state_init(&state, NF_BR_BROUTING,
- NFPROTO_BRIDGE, skb->dev, NULL, NULL,
- dev_net(skb->dev), NULL);
+ NFPROTO_BRIDGE, s->in, NULL, NULL,
+ s->net, NULL);
+
+ ret = ebt_do_table(priv, skb, &state);
+ if (ret != NF_DROP)
+ return ret;
+
+ /* DROP in ebtables -t broute means that the
+ * skb should be routed, not bridged.
+ * This is awkward, but can't be changed for compatibility
+ * reasons.
+ *
+ * We map DROP to ACCEPT and set the ->br_netfilter_broute flag.
+ */
+ BR_INPUT_SKB_CB(skb)->br_netfilter_broute = 1;
+
+ /* undo PACKET_HOST mangling done in br_input in case the dst
+ * address matches the logical bridge but not the port.
+ */
+ dest = eth_hdr(skb)->h_dest;
+ if (skb->pkt_type == PACKET_HOST &&
+ !ether_addr_equal(skb->dev->dev_addr, dest) &&
+ ether_addr_equal(p->br->dev->dev_addr, dest))
+ skb->pkt_type = PACKET_OTHERHOST;
- ret = ebt_do_table(skb, &state, state.net->xt.broute_table);
- if (ret == NF_DROP)
- return 1; /* route it */
- return 0; /* bridge it */
+ return NF_ACCEPT;
}
-static int __net_init broute_net_init(struct net *net)
+/* Hook descriptor for the broute table: ebt_broute() is attached at
+ * NF_BR_PRE_ROUTING with priority NF_BR_PRI_FIRST.
+ */
+static const struct nf_hook_ops ebt_ops_broute = {
+ .hook = ebt_broute,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_PRE_ROUTING,
+ .priority = NF_BR_PRI_FIRST,
+};
+
+/* Register the broute table and its hook in @net. Handed to
+ * ebt_register_template() by the module init function instead of being
+ * run from a pernet .init callback.
+ */
+static int broute_table_init(struct net *net)
+{
+ return ebt_register_table(net, &broute_table, &ebt_ops_broute);
+}
+
+/* pernet .pre_exit callback: first teardown step for the "broute" table. */
+static void __net_exit broute_net_pre_exit(struct net *net)
{
- return ebt_register_table(net, &broute_table, NULL,
- &net->xt.broute_table);
+ ebt_unregister_table_pre_exit(net, "broute");
}
+/* pernet .exit callback: the table is now identified by name, not by a
+ * pointer kept in net->xt.
+ */
static void __net_exit broute_net_exit(struct net *net)
{
- ebt_unregister_table(net, net->xt.broute_table, NULL);
+ ebt_unregister_table(net, "broute");
}
+/* No .init callback any more: only the two teardown callbacks are set. */
static struct pernet_operations broute_net_ops = {
- .init = broute_net_init,
.exit = broute_net_exit,
+ .pre_exit = broute_net_pre_exit,
};
+/* Module init: register the table template first, then the pernet
+ * operations. If the pernet registration fails the template is
+ * unregistered again before the error is returned.
+ */
static int __init ebtable_broute_init(void)
{
- int ret;
+ int ret = ebt_register_template(&broute_table, broute_table_init);
+
+ if (ret)
+ return ret;
ret = register_pernet_subsys(&broute_net_ops);
- if (ret < 0)
+ if (ret) {
+ ebt_unregister_template(&broute_table);
return ret;
- /* see br_input.c */
- RCU_INIT_POINTER(br_should_route_hook,
- (br_should_route_hook_t *)ebt_broute);
+ }
+
return 0;
}
+/* Module exit: undo ebtable_broute_init() in reverse order. */
static void __exit ebtable_broute_fini(void)
{
- RCU_INIT_POINTER(br_should_route_hook, NULL);
- synchronize_net();
unregister_pernet_subsys(&broute_net_ops);
+ ebt_unregister_template(&broute_table);
}
module_init(ebtable_broute_init);
module_exit(ebtable_broute_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Force packets to be routed instead of bridged");
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index 550324c516ee..dacd81b12e62 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebtable_filter
*
@@ -42,82 +43,77 @@ static struct ebt_replace_kernel initial_table = {
.entries = (char *)initial_chains,
};
-static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
-{
- if (valid_hooks & ~FILTER_VALID_HOOKS)
- return -EINVAL;
- return 0;
-}
-
static const struct ebt_table frame_filter = {
.name = "filter",
.table = &initial_table,
.valid_hooks = FILTER_VALID_HOOKS,
- .check = check,
.me = THIS_MODULE,
};
-static unsigned int
-ebt_in_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ebt_do_table(skb, state, state->net->xt.frame_filter);
-}
-
-static unsigned int
-ebt_out_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ebt_do_table(skb, state, state->net->xt.frame_filter);
-}
-
+/* Hook points of the filter table (LOCAL_IN, FORWARD, LOCAL_OUT). Every
+ * entry now calls ebt_do_table() directly; the per-direction wrappers
+ * that looked the table up in net->xt are gone.
+ */
static const struct nf_hook_ops ebt_ops_filter[] = {
{
- .hook = ebt_in_hook,
+ .hook = ebt_do_table,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_LOCAL_IN,
.priority = NF_BR_PRI_FILTER_BRIDGED,
},
{
- .hook = ebt_in_hook,
+ .hook = ebt_do_table,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_FORWARD,
.priority = NF_BR_PRI_FILTER_BRIDGED,
},
{
- .hook = ebt_out_hook,
+ .hook = ebt_do_table,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_LOCAL_OUT,
.priority = NF_BR_PRI_FILTER_OTHER,
},
};
-static int __net_init frame_filter_net_init(struct net *net)
+/* Register the filter table and its hooks in @net. Handed to
+ * ebt_register_template() by the module init function instead of being
+ * run from a pernet .init callback.
+ */
+static int frame_filter_table_init(struct net *net)
+{
+ return ebt_register_table(net, &frame_filter, ebt_ops_filter);
+}
+
+/* pernet .pre_exit callback: first teardown step for the "filter" table. */
+static void __net_exit frame_filter_net_pre_exit(struct net *net)
{
- return ebt_register_table(net, &frame_filter, ebt_ops_filter,
- &net->xt.frame_filter);
+ ebt_unregister_table_pre_exit(net, "filter");
}
+/* pernet .exit callback: the table is now identified by name, not by a
+ * pointer kept in net->xt.
+ */
static void __net_exit frame_filter_net_exit(struct net *net)
{
- ebt_unregister_table(net, net->xt.frame_filter, ebt_ops_filter);
+ ebt_unregister_table(net, "filter");
}
+/* No .init callback any more: only the two teardown callbacks are set. */
static struct pernet_operations frame_filter_net_ops = {
- .init = frame_filter_net_init,
.exit = frame_filter_net_exit,
+ .pre_exit = frame_filter_net_pre_exit,
};
+/* Module init: register the table template first, then the pernet
+ * operations. If the pernet registration fails the template is
+ * unregistered again before the error is returned.
+ */
static int __init ebtable_filter_init(void)
{
- return register_pernet_subsys(&frame_filter_net_ops);
+ int ret = ebt_register_template(&frame_filter, frame_filter_table_init);
+
+ if (ret)
+ return ret;
+
+ ret = register_pernet_subsys(&frame_filter_net_ops);
+ if (ret) {
+ ebt_unregister_template(&frame_filter);
+ return ret;
+ }
+
+ return 0;
}
+/* Module exit: undo ebtable_filter_init() in reverse order. */
static void __exit ebtable_filter_fini(void)
{
unregister_pernet_subsys(&frame_filter_net_ops);
+ ebt_unregister_template(&frame_filter);
}
module_init(ebtable_filter_init);
module_exit(ebtable_filter_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ebtables legacy filter table");
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index c0fb3ca518af..0f2a8c6118d4 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* ebtable_nat
*
@@ -42,82 +43,77 @@ static struct ebt_replace_kernel initial_table = {
.entries = (char *)initial_chains,
};
-static int check(const struct ebt_table_info *info, unsigned int valid_hooks)
-{
- if (valid_hooks & ~NAT_VALID_HOOKS)
- return -EINVAL;
- return 0;
-}
-
static const struct ebt_table frame_nat = {
.name = "nat",
.table = &initial_table,
.valid_hooks = NAT_VALID_HOOKS,
- .check = check,
.me = THIS_MODULE,
};
-static unsigned int
-ebt_nat_in(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ebt_do_table(skb, state, state->net->xt.frame_nat);
-}
-
-static unsigned int
-ebt_nat_out(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ebt_do_table(skb, state, state->net->xt.frame_nat);
-}
-
+/* Hook points of the nat table (LOCAL_OUT, POST_ROUTING, PRE_ROUTING).
+ * Every entry now calls ebt_do_table() directly; the per-direction
+ * wrappers that looked the table up in net->xt are gone.
+ */
static const struct nf_hook_ops ebt_ops_nat[] = {
{
- .hook = ebt_nat_out,
+ .hook = ebt_do_table,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_LOCAL_OUT,
.priority = NF_BR_PRI_NAT_DST_OTHER,
},
{
- .hook = ebt_nat_out,
+ .hook = ebt_do_table,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_POST_ROUTING,
.priority = NF_BR_PRI_NAT_SRC,
},
{
- .hook = ebt_nat_in,
+ .hook = ebt_do_table,
.pf = NFPROTO_BRIDGE,
.hooknum = NF_BR_PRE_ROUTING,
.priority = NF_BR_PRI_NAT_DST_BRIDGED,
},
};
-static int __net_init frame_nat_net_init(struct net *net)
+/* Register the nat table and its hooks in @net. Handed to
+ * ebt_register_template() by the module init function instead of being
+ * run from a pernet .init callback.
+ */
+static int frame_nat_table_init(struct net *net)
+{
+ return ebt_register_table(net, &frame_nat, ebt_ops_nat);
+}
+
+/* pernet .pre_exit callback: first teardown step for the "nat" table. */
+static void __net_exit frame_nat_net_pre_exit(struct net *net)
{
- return ebt_register_table(net, &frame_nat, ebt_ops_nat,
- &net->xt.frame_nat);
+ ebt_unregister_table_pre_exit(net, "nat");
}
+/* pernet .exit callback: the table is now identified by name, not by a
+ * pointer kept in net->xt.
+ */
static void __net_exit frame_nat_net_exit(struct net *net)
{
- ebt_unregister_table(net, net->xt.frame_nat, ebt_ops_nat);
+ ebt_unregister_table(net, "nat");
}
+/* No .init callback any more: only the two teardown callbacks are set. */
static struct pernet_operations frame_nat_net_ops = {
- .init = frame_nat_net_init,
.exit = frame_nat_net_exit,
+ .pre_exit = frame_nat_net_pre_exit,
};
+/* Module init: register the table template first, then the pernet
+ * operations. If the pernet registration fails the template is
+ * unregistered again before the error is returned.
+ */
static int __init ebtable_nat_init(void)
{
- return register_pernet_subsys(&frame_nat_net_ops);
+ int ret = ebt_register_template(&frame_nat, frame_nat_table_init);
+
+ if (ret)
+ return ret;
+
+ ret = register_pernet_subsys(&frame_nat_net_ops);
+ if (ret) {
+ ebt_unregister_template(&frame_nat);
+ return ret;
+ }
+
+ /* ret is 0 here: the failure case returned above */
+ return ret;
}
+/* Module exit: undo ebtable_nat_init() in reverse order. */
static void __exit ebtable_nat_fini(void)
{
unregister_pernet_subsys(&frame_nat_net_ops);
+ ebt_unregister_template(&frame_nat);
}
module_init(ebtable_nat_init);
module_exit(ebtable_nat_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ebtables legacy stateless nat table");
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 491828713e0b..5697e3949a36 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* ebtables
*
@@ -8,11 +9,6 @@
*
* This code is strongly inspired by the iptables code which is
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kmod.h>
@@ -28,13 +24,10 @@
#include <linux/cpumask.h>
#include <linux/audit.h>
#include <net/sock.h>
+#include <net/netns/generic.h>
/* needed for logical [in,out]-dev filtering */
#include "../br_private.h"
-#define BUGPRINT(format, args...) printk("kernel msg: ebtables bug: please "\
- "report to author: "format, ## args)
-/* #define BUGPRINT(format, args...) */
-
/* Each cpu has its own set of counters, so there is no need for write_lock in
* the softirq
* For reading or updating the counters, the user context needs to
@@ -47,11 +40,23 @@
#define COUNTER_BASE(c, n, cpu) ((struct ebt_counter *)(((char *)c) + \
COUNTER_OFFSET(n) * cpu))
+/* Per-netns ebtables state, fetched with net_generic(net, ebt_pernet_id). */
+struct ebt_pernet {
+ /* struct ebt_table entries of this netns, linked through table->list */
+ struct list_head tables;
+};
+/* Entry on the global template_tables list: names a table that can be
+ * created in a netns when it is first looked up.
+ */
+struct ebt_template {
+ struct list_head list;
+ /* compared with strcmp() against the table name being looked up */
+ char name[EBT_TABLE_MAXNAMELEN];
+ /* module pinned with try_module_get() while table_init() runs */
+ struct module *owner;
+ /* called when table is needed in the given netns */
+ int (*table_init)(struct net *net);
+};
+static unsigned int ebt_pernet_id __read_mostly;
+static LIST_HEAD(template_tables);
static DEFINE_MUTEX(ebt_mutex);
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
static void ebt_standard_compat_from_user(void *dst, const void *src)
{
int v = *(compat_int_t *)src;
@@ -77,7 +82,7 @@ static struct xt_target ebt_standard_target = {
.revision = 0,
.family = NFPROTO_BRIDGE,
.targetsize = sizeof(int),
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
.compatsize = sizeof(compat_int_t),
.compat_from_user = ebt_standard_compat_from_user,
.compat_to_user = ebt_standard_compat_to_user,
@@ -184,10 +189,10 @@ ebt_get_target_c(const struct ebt_entry *e)
}
/* Do some firewalling */
-unsigned int ebt_do_table(struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct ebt_table *table)
+unsigned int ebt_do_table(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
+ struct ebt_table *table = priv;
unsigned int hook = state->hook;
int i, nentries;
struct ebt_entry *point;
@@ -229,7 +234,7 @@ unsigned int ebt_do_table(struct sk_buff *skb,
return NF_DROP;
}
- ADD_COUNTER(*(counter_base + i), 1, skb->len);
+ ADD_COUNTER(*(counter_base + i), skb->len, 1);
/* these should only watch: not modify, nor tell us
* what to do with the packet
@@ -313,30 +318,57 @@ letscontinue:
/* If it succeeds, returns element and locks mutex */
+/* Looks @name up among the tables of @net. When it is not there but a
+ * template with that name exists, the template's table_init() is run
+ * for @net and the lookup is repeated.
+ * On failure NULL is returned with @mutex released; *error is -ENOENT,
+ * or the table_init() error when that call failed.
+ */
static inline void *
-find_inlist_lock_noload(struct list_head *head, const char *name, int *error,
+find_inlist_lock_noload(struct net *net, const char *name, int *error,
struct mutex *mutex)
{
- struct {
- struct list_head list;
- char name[EBT_FUNCTION_MAXNAMELEN];
- } *e;
+ struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id);
+ struct ebt_template *tmpl;
+ struct ebt_table *table;
mutex_lock(mutex);
- list_for_each_entry(e, head, list) {
- if (strcmp(e->name, name) == 0)
- return e;
+ list_for_each_entry(table, &ebt_net->tables, list) {
+ if (strcmp(table->name, name) == 0)
+ return table;
}
+
+ /* not present in this netns yet: look for a matching template */
+ list_for_each_entry(tmpl, &template_tables, list) {
+ if (strcmp(name, tmpl->name) == 0) {
+ struct module *owner = tmpl->owner;
+
+ if (!try_module_get(owner))
+ goto out;
+
+ /* table_init() runs without the mutex held; the module
+ * reference taken above is kept across the call
+ */
+ mutex_unlock(mutex);
+
+ *error = tmpl->table_init(net);
+ if (*error) {
+ module_put(owner);
+ return NULL;
+ }
+
+ mutex_lock(mutex);
+ module_put(owner);
+ break;
+ }
+ }
+
+ /* second pass: finds the table if table_init() added it above */
+ list_for_each_entry(table, &ebt_net->tables, list) {
+ if (strcmp(table->name, name) == 0)
+ return table;
+ }
+
+out:
*error = -ENOENT;
mutex_unlock(mutex);
return NULL;
}
static void *
-find_inlist_lock(struct list_head *head, const char *name, const char *prefix,
+find_inlist_lock(struct net *net, const char *name, const char *prefix,
int *error, struct mutex *mutex)
{
return try_then_request_module(
- find_inlist_lock_noload(head, name, error, mutex),
+ find_inlist_lock_noload(net, name, error, mutex),
"%s%s", prefix, name);
}
@@ -344,8 +376,7 @@ static inline struct ebt_table *
find_table_lock(struct net *net, const char *name, int *error,
struct mutex *mutex)
{
- return find_inlist_lock(&net->xt.tables[NFPROTO_BRIDGE], name,
- "ebtable_", error, mutex);
+ return find_inlist_lock(net, name, "ebtable_", error, mutex);
}
static inline void ebt_free_table_info(struct ebt_table_info *info)
@@ -385,7 +416,7 @@ ebt_check_match(struct ebt_entry_match *m, struct xt_mtchk_param *par,
par->match = match;
par->matchinfo = m->data;
ret = xt_check_match(par, m->match_size,
- e->ethproto, e->invflags & EBT_IPROTO);
+ ntohs(e->ethproto), e->invflags & EBT_IPROTO);
if (ret < 0) {
module_put(match->me);
return ret;
@@ -422,7 +453,7 @@ ebt_check_watcher(struct ebt_entry_watcher *w, struct xt_tgchk_param *par,
par->target = watcher;
par->targinfo = w->data;
ret = xt_check_target(par, w->watcher_size,
- e->ethproto, e->invflags & EBT_IPROTO);
+ ntohs(e->ethproto), e->invflags & EBT_IPROTO);
if (ret < 0) {
module_put(watcher->me);
return ret;
@@ -466,8 +497,6 @@ static int ebt_verify_pointers(const struct ebt_replace *repl,
/* we make userspace set this right,
* so there is no misunderstanding
*/
- BUGPRINT("EBT_ENTRY_OR_ENTRIES shouldn't be set "
- "in distinguisher\n");
return -EINVAL;
}
if (i != NF_BR_NUMHOOKS)
@@ -485,18 +514,14 @@ static int ebt_verify_pointers(const struct ebt_replace *repl,
offset += e->next_offset;
}
}
- if (offset != limit) {
- BUGPRINT("entries_size too small\n");
+ if (offset != limit)
return -EINVAL;
- }
/* check if all valid hooks have a chain */
for (i = 0; i < NF_BR_NUMHOOKS; i++) {
if (!newinfo->hook_entry[i] &&
- (valid_hooks & (1 << i))) {
- BUGPRINT("Valid hook without chain\n");
+ (valid_hooks & (1 << i)))
return -EINVAL;
- }
}
return 0;
}
@@ -523,26 +548,20 @@ ebt_check_entry_size_and_hooks(const struct ebt_entry *e,
/* this checks if the previous chain has as many entries
* as it said it has
*/
- if (*n != *cnt) {
- BUGPRINT("nentries does not equal the nr of entries "
- "in the chain\n");
+ if (*n != *cnt)
return -EINVAL;
- }
+
if (((struct ebt_entries *)e)->policy != EBT_DROP &&
((struct ebt_entries *)e)->policy != EBT_ACCEPT) {
/* only RETURN from udc */
if (i != NF_BR_NUMHOOKS ||
- ((struct ebt_entries *)e)->policy != EBT_RETURN) {
- BUGPRINT("bad policy\n");
+ ((struct ebt_entries *)e)->policy != EBT_RETURN)
return -EINVAL;
- }
}
if (i == NF_BR_NUMHOOKS) /* it's a user defined chain */
(*udc_cnt)++;
- if (((struct ebt_entries *)e)->counter_offset != *totalcnt) {
- BUGPRINT("counter_offset != totalcnt");
+ if (((struct ebt_entries *)e)->counter_offset != *totalcnt)
return -EINVAL;
- }
*n = ((struct ebt_entries *)e)->nentries;
*cnt = 0;
return 0;
@@ -550,15 +569,13 @@ ebt_check_entry_size_and_hooks(const struct ebt_entry *e,
/* a plain old entry, heh */
if (sizeof(struct ebt_entry) > e->watchers_offset ||
e->watchers_offset > e->target_offset ||
- e->target_offset >= e->next_offset) {
- BUGPRINT("entry offsets not in right order\n");
+ e->target_offset >= e->next_offset)
return -EINVAL;
- }
+
/* this is not checked anywhere else */
- if (e->next_offset - e->target_offset < sizeof(struct ebt_entry_target)) {
- BUGPRINT("target size too small\n");
+ if (e->next_offset - e->target_offset < sizeof(struct ebt_entry_target))
return -EINVAL;
- }
+
(*cnt)++;
(*totalcnt)++;
return 0;
@@ -678,18 +695,15 @@ ebt_check_entry(struct ebt_entry *e, struct net *net,
if (e->bitmask == 0)
return 0;
- if (e->bitmask & ~EBT_F_MASK) {
- BUGPRINT("Unknown flag for bitmask\n");
+ if (e->bitmask & ~EBT_F_MASK)
return -EINVAL;
- }
- if (e->invflags & ~EBT_INV_MASK) {
- BUGPRINT("Unknown flag for inv bitmask\n");
+
+ if (e->invflags & ~EBT_INV_MASK)
return -EINVAL;
- }
- if ((e->bitmask & EBT_NOPROTO) && (e->bitmask & EBT_802_3)) {
- BUGPRINT("NOPROTO & 802_3 not allowed\n");
+
+ if ((e->bitmask & EBT_NOPROTO) && (e->bitmask & EBT_802_3))
return -EINVAL;
- }
+
/* what hook do we belong to? */
for (i = 0; i < NF_BR_NUMHOOKS; i++) {
if (!newinfo->hook_entry[i])
@@ -748,13 +762,11 @@ ebt_check_entry(struct ebt_entry *e, struct net *net,
t->u.target = target;
if (t->u.target == &ebt_standard_target) {
if (gap < sizeof(struct ebt_standard_target)) {
- BUGPRINT("Standard target size too big\n");
ret = -EFAULT;
goto cleanup_watchers;
}
if (((struct ebt_standard_target *)t)->verdict <
-NUM_STANDARD_TARGETS) {
- BUGPRINT("Invalid standard target\n");
ret = -EFAULT;
goto cleanup_watchers;
}
@@ -767,7 +779,7 @@ ebt_check_entry(struct ebt_entry *e, struct net *net,
tgpar.target = target;
tgpar.targinfo = t->data;
ret = xt_check_target(&tgpar, t->target_size,
- e->ethproto, e->invflags & EBT_IPROTO);
+ ntohs(e->ethproto), e->invflags & EBT_IPROTO);
if (ret < 0) {
module_put(target->me);
goto cleanup_watchers;
@@ -813,10 +825,9 @@ static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack
if (strcmp(t->u.name, EBT_STANDARD_TARGET))
goto letscontinue;
if (e->target_offset + sizeof(struct ebt_standard_target) >
- e->next_offset) {
- BUGPRINT("Standard target size too big\n");
+ e->next_offset)
return -1;
- }
+
verdict = ((struct ebt_standard_target *)t)->verdict;
if (verdict >= 0) { /* jump to another chain */
struct ebt_entries *hlp2 =
@@ -825,14 +836,12 @@ static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack
if (hlp2 == cl_s[i].cs.chaininfo)
break;
/* bad destination or loop */
- if (i == udc_cnt) {
- BUGPRINT("bad destination\n");
+ if (i == udc_cnt)
return -1;
- }
- if (cl_s[i].cs.n) {
- BUGPRINT("loop\n");
+
+ if (cl_s[i].cs.n)
return -1;
- }
+
if (cl_s[i].hookmask & (1 << hooknr))
goto letscontinue;
/* this can't be 0, so the loop test is correct */
@@ -865,24 +874,21 @@ static int translate_table(struct net *net, const char *name,
i = 0;
while (i < NF_BR_NUMHOOKS && !newinfo->hook_entry[i])
i++;
- if (i == NF_BR_NUMHOOKS) {
- BUGPRINT("No valid hooks specified\n");
+ if (i == NF_BR_NUMHOOKS)
return -EINVAL;
- }
- if (newinfo->hook_entry[i] != (struct ebt_entries *)newinfo->entries) {
- BUGPRINT("Chains don't start at beginning\n");
+
+ if (newinfo->hook_entry[i] != (struct ebt_entries *)newinfo->entries)
return -EINVAL;
- }
+
/* make sure chains are ordered after each other in same order
* as their corresponding hooks
*/
for (j = i + 1; j < NF_BR_NUMHOOKS; j++) {
if (!newinfo->hook_entry[j])
continue;
- if (newinfo->hook_entry[j] <= newinfo->hook_entry[i]) {
- BUGPRINT("Hook order must be followed\n");
+ if (newinfo->hook_entry[j] <= newinfo->hook_entry[i])
return -EINVAL;
- }
+
i = j;
}
@@ -900,15 +906,11 @@ static int translate_table(struct net *net, const char *name,
if (ret != 0)
return ret;
- if (i != j) {
- BUGPRINT("nentries does not equal the nr of entries in the "
- "(last) chain\n");
+ if (i != j)
return -EINVAL;
- }
- if (k != newinfo->nentries) {
- BUGPRINT("Total nentries is wrong\n");
+
+ if (k != newinfo->nentries)
return -EINVAL;
- }
/* get the location of the udc, put them in an array
* while we're at it, allocate the chainstack
@@ -918,13 +920,15 @@ static int translate_table(struct net *net, const char *name,
* if an error occurs
*/
newinfo->chainstack =
- vmalloc(array_size(nr_cpu_ids,
- sizeof(*(newinfo->chainstack))));
+ vmalloc_array(nr_cpu_ids,
+ sizeof(*(newinfo->chainstack)));
if (!newinfo->chainstack)
return -ENOMEM;
for_each_possible_cpu(i) {
newinfo->chainstack[i] =
- vmalloc(array_size(udc_cnt, sizeof(*(newinfo->chainstack[0]))));
+ vmalloc_node(array_size(udc_cnt,
+ sizeof(*(newinfo->chainstack[0]))),
+ cpu_to_node(i));
if (!newinfo->chainstack[i]) {
while (i)
vfree(newinfo->chainstack[--i]);
@@ -934,7 +938,7 @@ static int translate_table(struct net *net, const char *name,
}
}
- cl_s = vmalloc(array_size(udc_cnt, sizeof(*cl_s)));
+ cl_s = vmalloc_array(udc_cnt, sizeof(*cl_s));
if (!cl_s)
return -ENOMEM;
i = 0; /* the i'th udc */
@@ -942,7 +946,6 @@ static int translate_table(struct net *net, const char *name,
ebt_get_udc_positions, newinfo, &i, cl_s);
/* sanity check */
if (i != udc_cnt) {
- BUGPRINT("i != udc_cnt\n");
vfree(cl_s);
return -EFAULT;
}
@@ -997,8 +1000,8 @@ static void get_counters(const struct ebt_counter *oldcounters,
continue;
counter_base = COUNTER_BASE(oldcounters, nentries, cpu);
for (i = 0; i < nentries; i++)
- ADD_COUNTER(counters[i], counter_base[i].pcnt,
- counter_base[i].bcnt);
+ ADD_COUNTER(counters[i], counter_base[i].bcnt,
+ counter_base[i].pcnt);
}
}
@@ -1015,8 +1018,8 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
* the check on the size is done later, when we have the lock
*/
if (repl->num_counters) {
- unsigned long size = repl->num_counters * sizeof(*counterstmp);
- counterstmp = vmalloc(size);
+ counterstmp = vmalloc_array(repl->num_counters,
+ sizeof(*counterstmp));
if (!counterstmp)
return -ENOMEM;
}
@@ -1037,12 +1040,12 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
goto free_iterate;
}
- /* the table doesn't like it */
- if (t->check && (ret = t->check(newinfo, repl->valid_hooks)))
+ if (repl->valid_hooks != t->valid_hooks) {
+ ret = -EINVAL;
goto free_unlock;
+ }
if (repl->num_counters && repl->num_counters != t->private->nentries) {
- BUGPRINT("Wrong nr. of counters requested\n");
ret = -EINVAL;
goto free_unlock;
}
@@ -1071,7 +1074,7 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
*/
if (repl->num_counters &&
copy_to_user(repl->counters, counterstmp,
- repl->num_counters * sizeof(struct ebt_counter))) {
+ array_size(repl->num_counters, sizeof(struct ebt_counter)))) {
/* Silent error, can't fail, new table is already in place */
net_warn_ratelimited("ebtables: counters copy to user failed while replacing table\n");
}
@@ -1085,15 +1088,9 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
vfree(table);
vfree(counterstmp);
-#ifdef CONFIG_AUDIT
- if (audit_enabled) {
- audit_log(audit_context(), GFP_KERNEL,
- AUDIT_NETFILTER_CFG,
- "table=%s family=%u entries=%u",
- repl->name, AF_BRIDGE, repl->nentries);
- }
-#endif
- return ret;
+ audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries,
+ AUDIT_XT_OP_REPLACE, GFP_KERNEL);
+ return 0;
free_unlock:
mutex_unlock(&ebt_mutex);
@@ -1108,25 +1105,23 @@ free_counterstmp:
}
/* replace the table */
-static int do_replace(struct net *net, const void __user *user,
- unsigned int len)
+static int do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
int ret, countersize;
struct ebt_table_info *newinfo;
struct ebt_replace tmp;
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ if (len < sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
return -EFAULT;
- if (len != sizeof(tmp) + tmp.entries_size) {
- BUGPRINT("Wrong len argument\n");
+ if (len != sizeof(tmp) + tmp.entries_size)
return -EINVAL;
- }
- if (tmp.entries_size == 0) {
- BUGPRINT("Entries_size never zero\n");
+ if (tmp.entries_size == 0)
return -EINVAL;
- }
+
/* overflow check */
if (tmp.nentries >= ((INT_MAX - sizeof(struct ebt_table_info)) /
NR_CPUS - SMP_CACHE_BYTES) / sizeof(struct ebt_counter))
@@ -1137,21 +1132,20 @@ static int do_replace(struct net *net, const void __user *user,
tmp.name[sizeof(tmp.name) - 1] = 0;
countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids;
- newinfo = vmalloc(sizeof(*newinfo) + countersize);
+ newinfo = __vmalloc(sizeof(*newinfo) + countersize, GFP_KERNEL_ACCOUNT);
if (!newinfo)
return -ENOMEM;
if (countersize)
memset(newinfo->counters, 0, countersize);
- newinfo->entries = vmalloc(tmp.entries_size);
+ newinfo->entries = __vmalloc(tmp.entries_size, GFP_KERNEL_ACCOUNT);
if (!newinfo->entries) {
ret = -ENOMEM;
goto free_newinfo;
}
if (copy_from_user(
newinfo->entries, tmp.entries, tmp.entries_size) != 0) {
- BUGPRINT("Couldn't copy entries from userspace\n");
ret = -EFAULT;
goto free_entries;
}
@@ -1171,6 +1165,8 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
mutex_lock(&ebt_mutex);
list_del(&table->list);
mutex_unlock(&ebt_mutex);
+ audit_log_nfcfg(table->name, AF_BRIDGE, table->private->nentries,
+ AUDIT_XT_OP_UNREGISTER, GFP_KERNEL);
EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size,
ebt_cleanup_entry, net, NULL);
if (table->private->nentries)
@@ -1178,24 +1174,26 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table)
vfree(table->private->entries);
ebt_free_table_info(table->private);
vfree(table->private);
+ kfree(table->ops);
kfree(table);
}
int ebt_register_table(struct net *net, const struct ebt_table *input_table,
- const struct nf_hook_ops *ops, struct ebt_table **res)
+ const struct nf_hook_ops *template_ops)
{
+ struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id);
struct ebt_table_info *newinfo;
struct ebt_table *t, *table;
+ struct nf_hook_ops *ops;
+ unsigned int num_ops;
struct ebt_replace_kernel *repl;
int ret, i, countersize;
void *p;
if (input_table == NULL || (repl = input_table->table) == NULL ||
repl->entries == NULL || repl->entries_size == 0 ||
- repl->counters != NULL || input_table->private != NULL) {
- BUGPRINT("Bad table data for ebt_register_table!!!\n");
+ repl->counters != NULL || input_table->private != NULL)
return -EINVAL;
- }
/* Don't add one table to multiple lists. */
table = kmemdup(input_table, sizeof(struct ebt_table), GFP_KERNEL);
@@ -1233,24 +1231,15 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table,
((char *)repl->hook_entry[i] - repl->entries);
}
ret = translate_table(net, repl->name, newinfo);
- if (ret != 0) {
- BUGPRINT("Translate_table failed\n");
- goto free_chainstack;
- }
-
- if (table->check && table->check(newinfo, table->valid_hooks)) {
- BUGPRINT("The table doesn't like its own initial data, lol\n");
- ret = -EINVAL;
+ if (ret != 0)
goto free_chainstack;
- }
table->private = newinfo;
rwlock_init(&table->lock);
mutex_lock(&ebt_mutex);
- list_for_each_entry(t, &net->xt.tables[NFPROTO_BRIDGE], list) {
+ list_for_each_entry(t, &ebt_net->tables, list) {
if (strcmp(t->name, table->name) == 0) {
ret = -EEXIST;
- BUGPRINT("Table name already exists\n");
goto free_unlock;
}
}
@@ -1260,20 +1249,34 @@ int ebt_register_table(struct net *net, const struct ebt_table *input_table,
ret = -ENOENT;
goto free_unlock;
}
- list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]);
- mutex_unlock(&ebt_mutex);
- WRITE_ONCE(*res, table);
+ num_ops = hweight32(table->valid_hooks);
+ if (num_ops == 0) {
+ ret = -EINVAL;
+ goto free_unlock;
+ }
- if (!ops)
- return 0;
+ ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL);
+ if (!ops) {
+ ret = -ENOMEM;
+ if (newinfo->nentries)
+ module_put(table->me);
+ goto free_unlock;
+ }
- ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
- if (ret) {
+ for (i = 0; i < num_ops; i++)
+ ops[i].priv = table;
+
+ list_add(&table->list, &ebt_net->tables);
+ mutex_unlock(&ebt_mutex);
+
+ table->ops = ops;
+ ret = nf_register_net_hooks(net, ops, num_ops);
+ if (ret)
__ebt_unregister_table(net, table);
- *res = NULL;
- }
+ audit_log_nfcfg(repl->name, AF_BRIDGE, repl->nentries,
+ AUDIT_XT_OP_REGISTER, GFP_KERNEL);
return ret;
free_unlock:
mutex_unlock(&ebt_mutex);
@@ -1288,19 +1291,93 @@ out:
return ret;
}
-void ebt_unregister_table(struct net *net, struct ebt_table *table,
- const struct nf_hook_ops *ops)
+int ebt_register_template(const struct ebt_table *t, int (*table_init)(struct net *net))
{
- if (ops)
- nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
- __ebt_unregister_table(net, table);
+ struct ebt_template *tmpl;
+
+ mutex_lock(&ebt_mutex);
+ list_for_each_entry(tmpl, &template_tables, list) {
+ if (WARN_ON_ONCE(strcmp(t->name, tmpl->name) == 0)) {
+ mutex_unlock(&ebt_mutex);
+ return -EEXIST;
+ }
+ }
+
+ tmpl = kzalloc(sizeof(*tmpl), GFP_KERNEL);
+ if (!tmpl) {
+ mutex_unlock(&ebt_mutex);
+ return -ENOMEM;
+ }
+
+ tmpl->table_init = table_init;
+ strscpy(tmpl->name, t->name, sizeof(tmpl->name));
+ tmpl->owner = t->me;
+ list_add(&tmpl->list, &template_tables);
+
+ mutex_unlock(&ebt_mutex);
+ return 0;
+}
+EXPORT_SYMBOL(ebt_register_template);
+
+void ebt_unregister_template(const struct ebt_table *t)
+{
+ struct ebt_template *tmpl;
+
+ mutex_lock(&ebt_mutex);
+ list_for_each_entry(tmpl, &template_tables, list) {
+ if (strcmp(t->name, tmpl->name))
+ continue;
+
+ list_del(&tmpl->list);
+ mutex_unlock(&ebt_mutex);
+ kfree(tmpl);
+ return;
+ }
+
+ mutex_unlock(&ebt_mutex);
+ WARN_ON_ONCE(1);
+}
+EXPORT_SYMBOL(ebt_unregister_template);
+
+static struct ebt_table *__ebt_find_table(struct net *net, const char *name)
+{
+ struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id);
+ struct ebt_table *t;
+
+ mutex_lock(&ebt_mutex);
+
+ list_for_each_entry(t, &ebt_net->tables, list) {
+ if (strcmp(t->name, name) == 0) {
+ mutex_unlock(&ebt_mutex);
+ return t;
+ }
+ }
+
+ mutex_unlock(&ebt_mutex);
+ return NULL;
+}
+
+void ebt_unregister_table_pre_exit(struct net *net, const char *name)
+{
+ struct ebt_table *table = __ebt_find_table(net, name);
+
+ if (table)
+ nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
+}
+EXPORT_SYMBOL(ebt_unregister_table_pre_exit);
+
+void ebt_unregister_table(struct net *net, const char *name)
+{
+ struct ebt_table *table = __ebt_find_table(net, name);
+
+ if (table)
+ __ebt_unregister_table(net, table);
}
/* userspace just supplied us with counters */
static int do_update_counters(struct net *net, const char *name,
- struct ebt_counter __user *counters,
- unsigned int num_counters,
- const void __user *user, unsigned int len)
+ struct ebt_counter __user *counters,
+ unsigned int num_counters, unsigned int len)
{
int i, ret;
struct ebt_counter *tmp;
@@ -1309,7 +1386,7 @@ static int do_update_counters(struct net *net, const char *name,
if (num_counters == 0)
return -EINVAL;
- tmp = vmalloc(array_size(num_counters, sizeof(*tmp)));
+ tmp = vmalloc_array(num_counters, sizeof(*tmp));
if (!tmp)
return -ENOMEM;
@@ -1318,12 +1395,12 @@ static int do_update_counters(struct net *net, const char *name,
goto free_tmp;
if (num_counters != t->private->nentries) {
- BUGPRINT("Wrong nr of counters\n");
ret = -EINVAL;
goto unlock_mutex;
}
- if (copy_from_user(tmp, counters, num_counters * sizeof(*counters))) {
+ if (copy_from_user(tmp, counters,
+ array_size(num_counters, sizeof(*counters)))) {
ret = -EFAULT;
goto unlock_mutex;
}
@@ -1333,7 +1410,7 @@ static int do_update_counters(struct net *net, const char *name,
/* we add to the counters of the first cpu */
for (i = 0; i < num_counters; i++)
- ADD_COUNTER(t->private->counters[i], tmp[i].pcnt, tmp[i].bcnt);
+ ADD_COUNTER(t->private->counters[i], tmp[i].bcnt, tmp[i].pcnt);
write_unlock_bh(&t->lock);
ret = 0;
@@ -1344,19 +1421,20 @@ free_tmp:
return ret;
}
-static int update_counters(struct net *net, const void __user *user,
- unsigned int len)
+static int update_counters(struct net *net, sockptr_t arg, unsigned int len)
{
struct ebt_replace hlp;
- if (copy_from_user(&hlp, user, sizeof(hlp)))
+ if (len < sizeof(hlp))
+ return -EINVAL;
+ if (copy_from_sockptr(&hlp, arg, sizeof(hlp)))
return -EFAULT;
if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter))
return -EINVAL;
return do_update_counters(net, hlp.name, hlp.counters,
- hlp.num_counters, user, len);
+ hlp.num_counters, len);
}
static inline int ebt_obj_to_user(char __user *um, const char *_name,
@@ -1368,7 +1446,7 @@ static inline int ebt_obj_to_user(char __user *um, const char *_name,
/* ebtables expects 31 bytes long names but xt_match names are 29 bytes
* long. Copy 29 bytes and fill remaining bytes with zeroes.
*/
- strlcpy(name, _name, sizeof(name));
+ strscpy(name, _name, sizeof(name));
if (copy_to_user(um, name, EBT_EXTENSION_MAXNAMELEN) ||
put_user(revision, (u8 __user *)(um + EBT_EXTENSION_MAXNAMELEN)) ||
put_user(datasize, (int __user *)(um + EBT_EXTENSION_MAXNAMELEN + 1)) ||
@@ -1445,12 +1523,10 @@ static int copy_counters_to_user(struct ebt_table *t,
if (num_counters == 0)
return 0;
- if (num_counters != nentries) {
- BUGPRINT("Num_counters wrong\n");
+ if (num_counters != nentries)
return -EINVAL;
- }
- counterstmp = vmalloc(array_size(nentries, sizeof(*counterstmp)));
+ counterstmp = vmalloc_array(nentries, sizeof(*counterstmp));
if (!counterstmp)
return -ENOMEM;
@@ -1459,7 +1535,7 @@ static int copy_counters_to_user(struct ebt_table *t,
write_unlock_bh(&t->lock);
if (copy_to_user(user, counterstmp,
- nentries * sizeof(struct ebt_counter)))
+ array_size(nentries, sizeof(struct ebt_counter))))
ret = -EFAULT;
vfree(counterstmp);
return ret;
@@ -1494,15 +1570,11 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user,
(tmp.num_counters ? nentries * sizeof(struct ebt_counter) : 0))
return -EINVAL;
- if (tmp.nentries != nentries) {
- BUGPRINT("Nentries wrong\n");
+ if (tmp.nentries != nentries)
return -EINVAL;
- }
- if (tmp.entries_size != entries_size) {
- BUGPRINT("Wrong size\n");
+ if (tmp.entries_size != entries_size)
return -EINVAL;
- }
ret = copy_counters_to_user(t, oldcounters, tmp.counters,
tmp.num_counters, nentries);
@@ -1514,88 +1586,7 @@ static int copy_everything_to_user(struct ebt_table *t, void __user *user,
ebt_entry_to_user, entries, tmp.entries);
}
-static int do_ebt_set_ctl(struct sock *sk,
- int cmd, void __user *user, unsigned int len)
-{
- int ret;
- struct net *net = sock_net(sk);
-
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case EBT_SO_SET_ENTRIES:
- ret = do_replace(net, user, len);
- break;
- case EBT_SO_SET_COUNTERS:
- ret = update_counters(net, user, len);
- break;
- default:
- ret = -EINVAL;
- }
- return ret;
-}
-
-static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
-{
- int ret;
- struct ebt_replace tmp;
- struct ebt_table *t;
- struct net *net = sock_net(sk);
-
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- if (copy_from_user(&tmp, user, sizeof(tmp)))
- return -EFAULT;
-
- tmp.name[sizeof(tmp.name) - 1] = '\0';
-
- t = find_table_lock(net, tmp.name, &ret, &ebt_mutex);
- if (!t)
- return ret;
-
- switch (cmd) {
- case EBT_SO_GET_INFO:
- case EBT_SO_GET_INIT_INFO:
- if (*len != sizeof(struct ebt_replace)) {
- ret = -EINVAL;
- mutex_unlock(&ebt_mutex);
- break;
- }
- if (cmd == EBT_SO_GET_INFO) {
- tmp.nentries = t->private->nentries;
- tmp.entries_size = t->private->entries_size;
- tmp.valid_hooks = t->valid_hooks;
- } else {
- tmp.nentries = t->table->nentries;
- tmp.entries_size = t->table->entries_size;
- tmp.valid_hooks = t->table->valid_hooks;
- }
- mutex_unlock(&ebt_mutex);
- if (copy_to_user(user, &tmp, *len) != 0) {
- BUGPRINT("c2u Didn't work\n");
- ret = -EFAULT;
- break;
- }
- ret = 0;
- break;
-
- case EBT_SO_GET_ENTRIES:
- case EBT_SO_GET_INIT_ENTRIES:
- ret = copy_everything_to_user(t, user, len, cmd);
- mutex_unlock(&ebt_mutex);
- break;
-
- default:
- mutex_unlock(&ebt_mutex);
- ret = -EINVAL;
- }
-
- return ret;
-}
-
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
/* 32 bit-userspace compatibility definitions. */
struct compat_ebt_replace {
char name[EBT_TABLE_MAXNAMELEN];
@@ -1621,7 +1612,7 @@ struct compat_ebt_entry_mwt {
compat_uptr_t ptr;
} u;
compat_uint_t match_size;
- compat_uint_t data[0] __attribute__ ((aligned (__alignof__(struct compat_ebt_replace))));
+ compat_uint_t data[] __aligned(__alignof__(struct compat_ebt_replace));
};
/* account for possible padding between match_size and ->data */
@@ -1830,20 +1821,28 @@ static int compat_calc_entry(const struct ebt_entry *e,
return 0;
}
+static int ebt_compat_init_offsets(unsigned int number)
+{
+ if (number > INT_MAX)
+ return -EINVAL;
+
+ /* also count the base chain policies */
+ number += NF_BR_NUMHOOKS;
+
+ return xt_compat_init_offsets(NFPROTO_BRIDGE, number);
+}
static int compat_table_info(const struct ebt_table_info *info,
struct compat_ebt_replace *newinfo)
{
unsigned int size = info->entries_size;
const void *entries = info->entries;
+ int ret;
newinfo->entries_size = size;
- if (info->nentries) {
- int ret = xt_compat_init_offsets(NFPROTO_BRIDGE,
- info->nentries);
- if (ret)
- return ret;
- }
+ ret = ebt_compat_init_offsets(info->nentries);
+ if (ret)
+ return ret;
return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info,
entries, newinfo);
@@ -1919,7 +1918,7 @@ static int ebt_buf_count(struct ebt_entries_buf_state *state, unsigned int sz)
}
static int ebt_buf_add(struct ebt_entries_buf_state *state,
- void *data, unsigned int sz)
+ const void *data, unsigned int sz)
{
if (state->buf_kern_start == NULL)
goto count_only;
@@ -1953,7 +1952,7 @@ enum compat_mwt {
EBT_COMPAT_TARGET,
};
-static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt,
+static int compat_mtw_from_user(const struct compat_ebt_entry_mwt *mwt,
enum compat_mwt compat_mwt,
struct ebt_entries_buf_state *state,
const unsigned char *base)
@@ -1991,7 +1990,7 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt,
size_kern = match_size;
module_put(match->me);
break;
- case EBT_COMPAT_WATCHER: /* fallthrough */
+ case EBT_COMPAT_WATCHER:
case EBT_COMPAT_TARGET:
wt = xt_request_find_target(NFPROTO_BRIDGE, name,
mwt->u.revision);
@@ -2031,22 +2030,23 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt,
/* return size of all matches, watchers or target, including necessary
* alignment and padding.
*/
-static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32,
+static int ebt_size_mwt(const struct compat_ebt_entry_mwt *match32,
unsigned int size_left, enum compat_mwt type,
struct ebt_entries_buf_state *state, const void *base)
{
+ const char *buf = (const char *)match32;
int growth = 0;
- char *buf;
if (size_left == 0)
return 0;
- buf = (char *) match32;
-
- while (size_left >= sizeof(*match32)) {
+ do {
struct ebt_entry_match *match_kern;
int ret;
+ if (size_left < sizeof(*match32))
+ return -EINVAL;
+
match_kern = (struct ebt_entry_match *) state->buf_kern_start;
if (match_kern) {
char *tmp;
@@ -2083,21 +2083,18 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32,
if (match_kern)
match_kern->match_size = ret;
- if (WARN_ON(type == EBT_COMPAT_TARGET && size_left))
- return -EINVAL;
-
match32 = (struct compat_ebt_entry_mwt *) buf;
- }
+ } while (size_left);
return growth;
}
/* called for all ebt_entry structures. */
-static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base,
+static int size_entry_mwt(const struct ebt_entry *entry, const unsigned char *base,
unsigned int *total,
struct ebt_entries_buf_state *state)
{
- unsigned int i, j, startoff, new_offset = 0;
+ unsigned int i, j, startoff, next_expected_off, new_offset = 0;
/* stores match/watchers/targets & offset of next struct ebt_entry: */
unsigned int offsets[4];
unsigned int *offsets_update = NULL;
@@ -2122,8 +2119,7 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base,
return ret;
offsets[0] = sizeof(struct ebt_entry); /* matches come first */
- memcpy(&offsets[1], &entry->watchers_offset,
- sizeof(offsets) - sizeof(offsets[0]));
+ memcpy(&offsets[1], &entry->offsets, sizeof(entry->offsets));
if (state->buf_kern_start) {
buf_start = state->buf_kern_start + state->buf_kern_offset;
@@ -2183,11 +2179,13 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base,
return ret;
}
- startoff = state->buf_user_offset - startoff;
+ next_expected_off = state->buf_user_offset - startoff;
+ if (next_expected_off != entry->next_offset)
+ return -EINVAL;
- if (WARN_ON(*total < startoff))
+ if (*total < entry->next_offset)
return -EINVAL;
- *total -= startoff;
+ *total -= entry->next_offset;
return 0;
}
@@ -2208,13 +2206,15 @@ static int compat_copy_entries(unsigned char *data, unsigned int size_user,
if (ret < 0)
return ret;
- WARN_ON(size_remaining);
+ if (size_remaining)
+ return -EINVAL;
+
return state->buf_kern_offset;
}
static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl,
- void __user *user, unsigned int len)
+ sockptr_t arg, unsigned int len)
{
struct compat_ebt_replace tmp;
int i;
@@ -2222,7 +2222,7 @@ static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl,
if (len < sizeof(tmp))
return -EINVAL;
- if (copy_from_user(&tmp, user, sizeof(tmp)))
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)))
return -EFAULT;
if (len != sizeof(tmp) + tmp.entries_size)
@@ -2249,8 +2249,7 @@ static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl,
return 0;
}
-static int compat_do_replace(struct net *net, void __user *user,
- unsigned int len)
+static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
int ret, i, countersize, size64;
struct ebt_table_info *newinfo;
@@ -2258,10 +2257,10 @@ static int compat_do_replace(struct net *net, void __user *user,
struct ebt_entries_buf_state state;
void *entries_tmp;
- ret = compat_copy_ebt_replace_from_user(&tmp, user, len);
+ ret = compat_copy_ebt_replace_from_user(&tmp, arg, len);
if (ret) {
/* try real handler in case userland supplied needed padding */
- if (ret == -EINVAL && do_replace(net, user, len) == 0)
+ if (ret == -EINVAL && do_replace(net, arg, len) == 0)
ret = 0;
return ret;
}
@@ -2291,9 +2290,10 @@ static int compat_do_replace(struct net *net, void __user *user,
xt_compat_lock(NFPROTO_BRIDGE);
- ret = xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries);
+ ret = ebt_compat_init_offsets(tmp.nentries);
if (ret < 0)
goto out_unlock;
+
ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state);
if (ret < 0)
goto out_unlock;
@@ -2315,8 +2315,10 @@ static int compat_do_replace(struct net *net, void __user *user,
state.buf_kern_len = size64;
ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state);
- if (WARN_ON(ret < 0))
+ if (WARN_ON(ret < 0)) {
+ vfree(entries_tmp);
goto out_unlock;
+ }
vfree(entries_tmp);
tmp.entries_size = size64;
@@ -2349,42 +2351,22 @@ out_unlock:
goto free_entries;
}
-static int compat_update_counters(struct net *net, void __user *user,
+static int compat_update_counters(struct net *net, sockptr_t arg,
unsigned int len)
{
struct compat_ebt_replace hlp;
- if (copy_from_user(&hlp, user, sizeof(hlp)))
+ if (len < sizeof(hlp))
+ return -EINVAL;
+ if (copy_from_sockptr(&hlp, arg, sizeof(hlp)))
return -EFAULT;
/* try real handler in case userland supplied needed padding */
if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter))
- return update_counters(net, user, len);
+ return update_counters(net, arg, len);
return do_update_counters(net, hlp.name, compat_ptr(hlp.counters),
- hlp.num_counters, user, len);
-}
-
-static int compat_do_ebt_set_ctl(struct sock *sk,
- int cmd, void __user *user, unsigned int len)
-{
- int ret;
- struct net *net = sock_net(sk);
-
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case EBT_SO_SET_ENTRIES:
- ret = compat_do_replace(net, user, len);
- break;
- case EBT_SO_SET_COUNTERS:
- ret = compat_update_counters(net, user, len);
- break;
- default:
- ret = -EINVAL;
- }
- return ret;
+ hlp.num_counters, len);
}
static int compat_do_ebt_get_ctl(struct sock *sk, int cmd,
@@ -2395,13 +2377,9 @@ static int compat_do_ebt_get_ctl(struct sock *sk, int cmd,
struct ebt_table *t;
struct net *net = sock_net(sk);
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- /* try real handler in case userland supplied needed padding */
- if ((cmd == EBT_SO_GET_INFO ||
- cmd == EBT_SO_GET_INIT_INFO) && *len != sizeof(tmp))
- return do_ebt_get_ctl(sk, cmd, user, len);
+ if ((cmd == EBT_SO_GET_INFO || cmd == EBT_SO_GET_INIT_INFO) &&
+ *len != sizeof(struct compat_ebt_replace))
+ return -EINVAL;
if (copy_from_user(&tmp, user, sizeof(tmp)))
return -EFAULT;
@@ -2464,23 +2442,129 @@ static int compat_do_ebt_get_ctl(struct sock *sk, int cmd,
}
#endif
+static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+ struct net *net = sock_net(sk);
+ struct ebt_replace tmp;
+ struct ebt_table *t;
+ int ret;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ /* try real handler in case userland supplied needed padding */
+ if (in_compat_syscall() &&
+ ((cmd != EBT_SO_GET_INFO && cmd != EBT_SO_GET_INIT_INFO) ||
+ *len != sizeof(tmp)))
+ return compat_do_ebt_get_ctl(sk, cmd, user, len);
+#endif
+
+ if (copy_from_user(&tmp, user, sizeof(tmp)))
+ return -EFAULT;
+
+ tmp.name[sizeof(tmp.name) - 1] = '\0';
+
+ t = find_table_lock(net, tmp.name, &ret, &ebt_mutex);
+ if (!t)
+ return ret;
+
+ switch (cmd) {
+ case EBT_SO_GET_INFO:
+ case EBT_SO_GET_INIT_INFO:
+ if (*len != sizeof(struct ebt_replace)) {
+ ret = -EINVAL;
+ mutex_unlock(&ebt_mutex);
+ break;
+ }
+ if (cmd == EBT_SO_GET_INFO) {
+ tmp.nentries = t->private->nentries;
+ tmp.entries_size = t->private->entries_size;
+ tmp.valid_hooks = t->valid_hooks;
+ } else {
+ tmp.nentries = t->table->nentries;
+ tmp.entries_size = t->table->entries_size;
+ tmp.valid_hooks = t->table->valid_hooks;
+ }
+ mutex_unlock(&ebt_mutex);
+ if (copy_to_user(user, &tmp, *len) != 0) {
+ ret = -EFAULT;
+ break;
+ }
+ ret = 0;
+ break;
+
+ case EBT_SO_GET_ENTRIES:
+ case EBT_SO_GET_INIT_ENTRIES:
+ ret = copy_everything_to_user(t, user, len, cmd);
+ mutex_unlock(&ebt_mutex);
+ break;
+
+ default:
+ mutex_unlock(&ebt_mutex);
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int do_ebt_set_ctl(struct sock *sk, int cmd, sockptr_t arg,
+ unsigned int len)
+{
+ struct net *net = sock_net(sk);
+ int ret;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case EBT_SO_SET_ENTRIES:
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_do_replace(net, arg, len);
+ else
+#endif
+ ret = do_replace(net, arg, len);
+ break;
+ case EBT_SO_SET_COUNTERS:
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_update_counters(net, arg, len);
+ else
+#endif
+ ret = update_counters(net, arg, len);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
static struct nf_sockopt_ops ebt_sockopts = {
.pf = PF_INET,
.set_optmin = EBT_BASE_CTL,
.set_optmax = EBT_SO_SET_MAX + 1,
.set = do_ebt_set_ctl,
-#ifdef CONFIG_COMPAT
- .compat_set = compat_do_ebt_set_ctl,
-#endif
.get_optmin = EBT_BASE_CTL,
.get_optmax = EBT_SO_GET_MAX + 1,
.get = do_ebt_get_ctl,
-#ifdef CONFIG_COMPAT
- .compat_get = compat_do_ebt_get_ctl,
-#endif
.owner = THIS_MODULE,
};
+static int __net_init ebt_pernet_init(struct net *net)
+{
+ struct ebt_pernet *ebt_net = net_generic(net, ebt_pernet_id);
+
+ INIT_LIST_HEAD(&ebt_net->tables);
+ return 0;
+}
+
+static struct pernet_operations ebt_net_ops = {
+ .init = ebt_pernet_init,
+ .id = &ebt_pernet_id,
+ .size = sizeof(struct ebt_pernet),
+};
+
static int __init ebtables_init(void)
{
int ret;
@@ -2494,13 +2578,21 @@ static int __init ebtables_init(void)
return ret;
}
+ ret = register_pernet_subsys(&ebt_net_ops);
+ if (ret < 0) {
+ nf_unregister_sockopt(&ebt_sockopts);
+ xt_unregister_target(&ebt_standard_target);
+ return ret;
+ }
+
return 0;
}
-static void __exit ebtables_fini(void)
+static void ebtables_fini(void)
{
nf_unregister_sockopt(&ebt_sockopts);
xt_unregister_target(&ebt_standard_target);
+ unregister_pernet_subsys(&ebt_net_ops);
}
EXPORT_SYMBOL(ebt_register_table);
@@ -2509,3 +2601,4 @@ EXPORT_SYMBOL(ebt_do_table);
module_init(ebtables_init);
module_exit(ebtables_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ebtables legacy core");
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
new file mode 100644
index 000000000000..6482de4d8750
--- /dev/null
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -0,0 +1,455 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
+
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+#include "../br_private.h"
+
+/* Best effort variant of ip_do_fragment which preserves geometry, unless skbuff
+ * has been linearized or cloned.
+ */
+static int nf_br_ip_fragment(struct net *net, struct sock *sk,
+ struct sk_buff *skb,
+ struct nf_bridge_frag_data *data,
+ int (*output)(struct net *, struct sock *sk,
+ const struct nf_bridge_frag_data *data,
+ struct sk_buff *))
+{
+ int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+ u8 tstamp_type = skb->tstamp_type;
+ unsigned int hlen, ll_rs, mtu;
+ ktime_t tstamp = skb->tstamp;
+ struct ip_frag_state state;
+ struct iphdr *iph;
+ int err = 0;
+
+ /* for offloaded checksums cleanup checksum before fragmentation */
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto blackhole;
+
+ iph = ip_hdr(skb);
+
+ /*
+ * Setup starting values
+ */
+
+ hlen = iph->ihl * 4;
+ frag_max_size -= hlen;
+ ll_rs = LL_RESERVED_SPACE(skb->dev);
+ mtu = skb->dev->mtu;
+
+ if (skb_has_frag_list(skb)) {
+ unsigned int first_len = skb_pagelen(skb);
+ struct ip_fraglist_iter iter;
+ struct sk_buff *frag;
+
+ if (first_len - hlen > mtu)
+ goto blackhole;
+
+ if (skb_cloned(skb) ||
+ skb_headroom(skb) < ll_rs)
+ goto slow_path;
+
+ skb_walk_frags(skb, frag) {
+ if (frag->len > mtu)
+ goto blackhole;
+
+ if (skb_shared(frag) ||
+ skb_headroom(frag) < hlen + ll_rs)
+ goto slow_path;
+ }
+
+ ip_fraglist_init(skb, iph, hlen, &iter);
+
+ for (;;) {
+ if (iter.frag)
+ ip_fraglist_prepare(skb, &iter);
+
+ skb_set_delivery_time(skb, tstamp, tstamp_type);
+ err = output(net, sk, data, skb);
+ if (err || !iter.frag)
+ break;
+
+ skb = ip_fraglist_next(&iter);
+ }
+
+ if (!err)
+ return 0;
+
+ kfree_skb_list(iter.frag);
+
+ return err;
+ }
+slow_path:
+ /* This is a linearized skbuff, the original geometry is lost for us.
+ * This may also be a clone skbuff, we could preserve the geometry for
+ * the copies but probably not worth the effort.
+ */
+ ip_frag_init(skb, hlen, ll_rs, frag_max_size, false, &state);
+
+ while (state.left > 0) {
+ struct sk_buff *skb2;
+
+ skb2 = ip_frag_next(skb, &state);
+ if (IS_ERR(skb2)) {
+ err = PTR_ERR(skb2);
+ goto blackhole;
+ }
+
+ skb_set_delivery_time(skb2, tstamp, tstamp_type);
+ err = output(net, sk, data, skb2);
+ if (err)
+ goto blackhole;
+ }
+ consume_skb(skb);
+ return err;
+
+blackhole:
+ kfree_skb(skb);
+ return 0;
+}
+
+/* ip_defrag() expects IPCB() in place: stash the bridge cb, zero the inet cb area. */
+static void br_skb_cb_save(struct sk_buff *skb, struct br_input_skb_cb *cb,
+ size_t inet_skb_parm_size)
+{
+ memcpy(cb, skb->cb, sizeof(*cb));
+ memset(skb->cb, 0, inet_skb_parm_size);
+}
+
+static void br_skb_cb_restore(struct sk_buff *skb,
+ const struct br_input_skb_cb *cb,
+ u16 fragsz)
+{
+ memcpy(skb->cb, cb, sizeof(*cb));
+ BR_INPUT_SKB_CB(skb)->frag_max_size = fragsz;
+}
+
+static unsigned int nf_ct_br_defrag4(struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
+ enum ip_conntrack_info ctinfo;
+ struct br_input_skb_cb cb;
+ const struct nf_conn *ct;
+ int err;
+
+ if (!ip_is_fragment(ip_hdr(skb)))
+ return NF_ACCEPT;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct)
+ zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+
+ br_skb_cb_save(skb, &cb, sizeof(struct inet_skb_parm));
+ local_bh_disable();
+ err = ip_defrag(state->net, skb,
+ IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id);
+ local_bh_enable();
+ if (!err) {
+ br_skb_cb_restore(skb, &cb, IPCB(skb)->frag_max_size);
+ skb->ignore_df = 1;
+ return NF_ACCEPT;
+ }
+
+ return NF_STOLEN;
+}
+
+static unsigned int nf_ct_br_defrag6(struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+ u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
+ enum ip_conntrack_info ctinfo;
+ struct br_input_skb_cb cb;
+ const struct nf_conn *ct;
+ int err;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct)
+ zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
+
+ br_skb_cb_save(skb, &cb, sizeof(struct inet6_skb_parm));
+
+ err = nf_ct_frag6_gather(state->net, skb,
+ IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id);
+ /* queued */
+ if (err == -EINPROGRESS)
+ return NF_STOLEN;
+
+ br_skb_cb_restore(skb, &cb, IP6CB(skb)->frag_max_size);
+ return err == 0 ? NF_ACCEPT : NF_DROP;
+#else
+ return NF_ACCEPT;
+#endif
+}
+
+static int nf_ct_br_ip_check(const struct sk_buff *skb)
+{
+ const struct iphdr *iph;
+ int nhoff, len;
+
+ nhoff = skb_network_offset(skb);
+ iph = ip_hdr(skb);
+ if (iph->ihl < 5 ||
+ iph->version != 4)
+ return -1;
+
+ len = skb_ip_totlen(skb);
+ if (skb->len < nhoff + len ||
+ len < (iph->ihl * 4))
+ return -1;
+
+ return 0;
+}
+
+static int nf_ct_br_ipv6_check(const struct sk_buff *skb)
+{
+ const struct ipv6hdr *hdr;
+ int nhoff, len;
+
+ nhoff = skb_network_offset(skb);
+ hdr = ipv6_hdr(skb);
+ if (hdr->version != 6)
+ return -1;
+
+ len = ntohs(hdr->payload_len) + sizeof(struct ipv6hdr) + nhoff;
+ if (skb->len < len)
+ return -1;
+
+ return 0;
+}
+
+static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_hook_state bridge_state = *state;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ u32 len;
+ int ret;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if ((ct && !nf_ct_is_template(ct)) ||
+ ctinfo == IP_CT_UNTRACKED)
+ return NF_ACCEPT;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ return NF_ACCEPT;
+
+ len = skb_ip_totlen(skb);
+ if (pskb_trim_rcsum(skb, len))
+ return NF_ACCEPT;
+
+ if (nf_ct_br_ip_check(skb))
+ return NF_ACCEPT;
+
+ bridge_state.pf = NFPROTO_IPV4;
+ ret = nf_ct_br_defrag4(skb, &bridge_state);
+ break;
+ case htons(ETH_P_IPV6):
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ return NF_ACCEPT;
+
+ len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
+ if (pskb_trim_rcsum(skb, len))
+ return NF_ACCEPT;
+
+ if (nf_ct_br_ipv6_check(skb))
+ return NF_ACCEPT;
+
+ bridge_state.pf = NFPROTO_IPV6;
+ ret = nf_ct_br_defrag6(skb, &bridge_state);
+ break;
+ default:
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
+ return NF_ACCEPT;
+ }
+
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ return nf_conntrack_in(skb, &bridge_state);
+}
+
+static unsigned int nf_ct_bridge_in(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ bool promisc = BR_INPUT_SKB_CB(skb)->promisc;
+ struct nf_conntrack *nfct = skb_nfct(skb);
+ struct nf_conn *ct;
+
+ if (promisc) {
+ nf_reset_ct(skb);
+ return NF_ACCEPT;
+ }
+
+ if (!nfct || skb->pkt_type == PACKET_HOST)
+ return NF_ACCEPT;
+
+ /* nf_conntrack_confirm() cannot handle concurrent clones,
+ * this happens for broad/multicast frames with e.g. macvlan on top
+ * of the bridge device.
+ */
+ ct = container_of(nfct, struct nf_conn, ct_general);
+ if (nf_ct_is_confirmed(ct) || nf_ct_is_template(ct))
+ return NF_ACCEPT;
+
+ /* let inet prerouting call conntrack again */
+ skb->_nfct = 0;
+ nf_ct_put(ct);
+
+ return NF_ACCEPT;
+}
+
+static void nf_ct_bridge_frag_save(struct sk_buff *skb,
+ struct nf_bridge_frag_data *data)
+{
+ if (skb_vlan_tag_present(skb)) {
+ data->vlan_present = true;
+ data->vlan_tci = skb->vlan_tci;
+ data->vlan_proto = skb->vlan_proto;
+ } else {
+ data->vlan_present = false;
+ }
+ skb_copy_from_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN);
+}
+
+static unsigned int
+nf_ct_bridge_refrag(struct sk_buff *skb, const struct nf_hook_state *state,
+ int (*output)(struct net *, struct sock *sk,
+ const struct nf_bridge_frag_data *data,
+ struct sk_buff *))
+{
+ struct nf_bridge_frag_data data;
+
+ if (!BR_INPUT_SKB_CB(skb)->frag_max_size)
+ return NF_ACCEPT;
+
+ nf_ct_bridge_frag_save(skb, &data);
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ nf_br_ip_fragment(state->net, state->sk, skb, &data, output);
+ break;
+ case htons(ETH_P_IPV6):
+ nf_br_ip6_fragment(state->net, state->sk, skb, &data, output);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return NF_DROP;
+ }
+
+ return NF_STOLEN;
+}
+
+/* Reapply saved VLAN tag and MAC header; only slow path refragmentation needs this. */
+static int nf_ct_bridge_frag_restore(struct sk_buff *skb,
+ const struct nf_bridge_frag_data *data)
+{
+ int err;
+
+ err = skb_cow_head(skb, ETH_HLEN);
+ if (err) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+ if (data->vlan_present)
+ __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci);
+ else if (skb_vlan_tag_present(skb))
+ __vlan_hwaccel_clear_tag(skb);
+
+ skb_copy_to_linear_data_offset(skb, -ETH_HLEN, data->mac, ETH_HLEN);
+ skb_reset_mac_header(skb);
+
+ return 0;
+}
+
+static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk,
+ const struct nf_bridge_frag_data *data,
+ struct sk_buff *skb)
+{
+ int err;
+
+ err = nf_ct_bridge_frag_restore(skb, data);
+ if (err < 0)
+ return err;
+
+ return br_dev_queue_push_xmit(net, sk, skb);
+}
+
+static unsigned int nf_ct_bridge_post(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ int ret;
+
+ ret = nf_confirm(priv, skb, state);
+ if (ret != NF_ACCEPT)
+ return ret;
+
+ return nf_ct_bridge_refrag(skb, state, nf_ct_bridge_refrag_post);
+}
+
+static struct nf_hook_ops nf_ct_bridge_hook_ops[] __read_mostly = {
+ {
+ .hook = nf_ct_bridge_pre,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_PRE_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK,
+ },
+ {
+ .hook = nf_ct_bridge_in,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+ },
+ {
+ .hook = nf_ct_bridge_post,
+ .pf = NFPROTO_BRIDGE,
+ .hooknum = NF_BR_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+ },
+};
+
+static struct nf_ct_bridge_info bridge_info = {
+ .ops = nf_ct_bridge_hook_ops,
+ .ops_size = ARRAY_SIZE(nf_ct_bridge_hook_ops),
+ .me = THIS_MODULE,
+};
+
+static int __init nf_conntrack_l3proto_bridge_init(void)
+{
+ nf_ct_bridge_register(&bridge_info);
+
+ return 0;
+}
+
+static void __exit nf_conntrack_l3proto_bridge_fini(void)
+{
+ nf_ct_bridge_unregister(&bridge_info);
+}
+
+module_init(nf_conntrack_l3proto_bridge_init);
+module_exit(nf_conntrack_l3proto_bridge_fini);
+
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_BRIDGE));
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Bridge IPv4 and IPv6 connection tracking");
diff --git a/net/bridge/netfilter/nf_log_bridge.c b/net/bridge/netfilter/nf_log_bridge.c
deleted file mode 100644
index bd2b3c78f59b..000000000000
--- a/net/bridge/netfilter/nf_log_bridge.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/if_bridge.h>
-#include <linux/ip.h>
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_log.h>
-
-static void nf_log_bridge_packet(struct net *net, u_int8_t pf,
- unsigned int hooknum,
- const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo,
- const char *prefix)
-{
- nf_log_l2packet(net, pf, eth_hdr(skb)->h_proto, hooknum, skb,
- in, out, loginfo, prefix);
-}
-
-static struct nf_logger nf_bridge_logger __read_mostly = {
- .name = "nf_log_bridge",
- .type = NF_LOG_TYPE_LOG,
- .logfn = nf_log_bridge_packet,
- .me = THIS_MODULE,
-};
-
-static int __net_init nf_log_bridge_net_init(struct net *net)
-{
- return nf_log_set(net, NFPROTO_BRIDGE, &nf_bridge_logger);
-}
-
-static void __net_exit nf_log_bridge_net_exit(struct net *net)
-{
- nf_log_unset(net, &nf_bridge_logger);
-}
-
-static struct pernet_operations nf_log_bridge_net_ops = {
- .init = nf_log_bridge_net_init,
- .exit = nf_log_bridge_net_exit,
-};
-
-static int __init nf_log_bridge_init(void)
-{
- int ret;
-
- /* Request to load the real packet loggers. */
- nf_logger_request_module(NFPROTO_IPV4, NF_LOG_TYPE_LOG);
- nf_logger_request_module(NFPROTO_IPV6, NF_LOG_TYPE_LOG);
- nf_logger_request_module(NFPROTO_ARP, NF_LOG_TYPE_LOG);
-
- ret = register_pernet_subsys(&nf_log_bridge_net_ops);
- if (ret < 0)
- return ret;
-
- nf_log_register(NFPROTO_BRIDGE, &nf_bridge_logger);
- return 0;
-}
-
-static void __exit nf_log_bridge_exit(void)
-{
- unregister_pernet_subsys(&nf_log_bridge_net_ops);
- nf_log_unregister(&nf_bridge_logger);
-}
-
-module_init(nf_log_bridge_init);
-module_exit(nf_log_bridge_exit);
-
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_DESCRIPTION("Netfilter bridge packet logging");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NF_LOGGER(AF_BRIDGE, 0);
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
new file mode 100644
index 000000000000..b7af36bbd306
--- /dev/null
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nft_meta.h>
+#include <linux/if_bridge.h>
+#include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
+
+#include "../br_private.h"
+
+static const struct net_device *
+nft_meta_get_bridge(const struct net_device *dev)
+{
+ if (dev && netif_is_bridge_port(dev))
+ return netdev_master_upper_dev_get_rcu((struct net_device *)dev);
+
+ return NULL;
+}
+
+static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+ const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
+ u32 *dest = &regs->data[priv->dreg];
+ const struct net_device *br_dev;
+
+ switch (priv->key) {
+ case NFT_META_BRI_IIFNAME:
+ br_dev = nft_meta_get_bridge(in);
+ break;
+ case NFT_META_BRI_OIFNAME:
+ br_dev = nft_meta_get_bridge(out);
+ break;
+ case NFT_META_BRI_IIFPVID: {
+ u16 p_pvid;
+
+ br_dev = nft_meta_get_bridge(in);
+ if (!br_dev || !br_vlan_enabled(br_dev))
+ goto err;
+
+ br_vlan_get_pvid_rcu(in, &p_pvid);
+ nft_reg_store16(dest, p_pvid);
+ return;
+ }
+ case NFT_META_BRI_IIFVPROTO: {
+ u16 p_proto;
+
+ br_dev = nft_meta_get_bridge(in);
+ if (!br_dev || !br_vlan_enabled(br_dev))
+ goto err;
+
+ br_vlan_get_proto(br_dev, &p_proto);
+ nft_reg_store_be16(dest, htons(p_proto));
+ return;
+ }
+ case NFT_META_BRI_IIFHWADDR:
+ br_dev = nft_meta_get_bridge(in);
+ if (!br_dev)
+ goto err;
+
+ memcpy(dest, br_dev->dev_addr, ETH_ALEN);
+ return;
+ default:
+ return nft_meta_get_eval(expr, regs, pkt);
+ }
+
+ strscpy_pad((char *)dest, br_dev ? br_dev->name : "", IFNAMSIZ);
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_meta_bridge_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_meta *priv = nft_expr_priv(expr);
+ unsigned int len;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+ switch (priv->key) {
+ case NFT_META_BRI_IIFNAME:
+ case NFT_META_BRI_OIFNAME:
+ len = IFNAMSIZ;
+ break;
+ case NFT_META_BRI_IIFPVID:
+ case NFT_META_BRI_IIFVPROTO:
+ len = sizeof(u16);
+ break;
+ case NFT_META_BRI_IIFHWADDR:
+ len = ETH_ALEN;
+ break;
+ default:
+ return nft_meta_get_init(ctx, expr, tb);
+ }
+
+ priv->len = len;
+ return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg,
+ NULL, NFT_DATA_VALUE, len);
+}
+
+static struct nft_expr_type nft_meta_bridge_type;
+static const struct nft_expr_ops nft_meta_bridge_get_ops = {
+ .type = &nft_meta_bridge_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+ .eval = nft_meta_bridge_get_eval,
+ .init = nft_meta_bridge_get_init,
+ .dump = nft_meta_get_dump,
+ .reduce = nft_meta_get_reduce,
+};
+
+static void nft_meta_bridge_set_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_meta *meta = nft_expr_priv(expr);
+ u32 *sreg = &regs->data[meta->sreg];
+ struct sk_buff *skb = pkt->skb;
+ u8 value8;
+
+ switch (meta->key) {
+ case NFT_META_BRI_BROUTE:
+ value8 = nft_reg_load8(sreg);
+ BR_INPUT_SKB_CB(skb)->br_netfilter_broute = !!value8;
+ break;
+ default:
+ nft_meta_set_eval(expr, regs, pkt);
+ }
+}
+
+static int nft_meta_bridge_set_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_meta *priv = nft_expr_priv(expr);
+ unsigned int len;
+ int err;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY]));
+ switch (priv->key) {
+ case NFT_META_BRI_BROUTE:
+ len = sizeof(u8);
+ break;
+ default:
+ return nft_meta_set_init(ctx, expr, tb);
+ }
+
+ priv->len = len;
+ err = nft_parse_register_load(ctx, tb[NFTA_META_SREG], &priv->sreg, len);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static bool nft_meta_bridge_set_reduce(struct nft_regs_track *track,
+ const struct nft_expr *expr)
+{
+ int i;
+
+ for (i = 0; i < NFT_REG32_NUM; i++) {
+ if (!track->regs[i].selector)
+ continue;
+
+ if (track->regs[i].selector->ops != &nft_meta_bridge_get_ops)
+ continue;
+
+ __nft_reg_track_cancel(track, i);
+ }
+
+ return false;
+}
+
+static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ struct nft_meta *priv = nft_expr_priv(expr);
+ unsigned int hooks;
+
+ switch (priv->key) {
+ case NFT_META_BRI_BROUTE:
+ case NFT_META_BRI_IIFHWADDR:
+ hooks = 1 << NF_BR_PRE_ROUTING;
+ break;
+ default:
+ return nft_meta_set_validate(ctx, expr);
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
+static const struct nft_expr_ops nft_meta_bridge_set_ops = {
+ .type = &nft_meta_bridge_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
+ .eval = nft_meta_bridge_set_eval,
+ .init = nft_meta_bridge_set_init,
+ .destroy = nft_meta_set_destroy,
+ .dump = nft_meta_set_dump,
+ .reduce = nft_meta_bridge_set_reduce,
+ .validate = nft_meta_bridge_set_validate,
+};
+
+static const struct nft_expr_ops *
+nft_meta_bridge_select_ops(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[])
+{
+ if (tb[NFTA_META_KEY] == NULL)
+ return ERR_PTR(-EINVAL);
+
+ if (tb[NFTA_META_DREG] && tb[NFTA_META_SREG])
+ return ERR_PTR(-EINVAL);
+
+ if (tb[NFTA_META_DREG])
+ return &nft_meta_bridge_get_ops;
+
+ if (tb[NFTA_META_SREG])
+ return &nft_meta_bridge_set_ops;
+
+ return ERR_PTR(-EINVAL);
+}
+
+static struct nft_expr_type nft_meta_bridge_type __read_mostly = {
+ .family = NFPROTO_BRIDGE,
+ .name = "meta",
+ .select_ops = nft_meta_bridge_select_ops,
+ .policy = nft_meta_policy,
+ .maxattr = NFTA_META_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_meta_bridge_module_init(void)
+{
+ return nft_register_expr(&nft_meta_bridge_type);
+}
+
+static void __exit nft_meta_bridge_module_exit(void)
+{
+ nft_unregister_expr(&nft_meta_bridge_type);
+}
+
+module_init(nft_meta_bridge_module_init);
+module_exit(nft_meta_bridge_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("wenxu <wenxu@ucloud.cn>");
+MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "meta");
+MODULE_DESCRIPTION("Support for bridge dedicated meta key");
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index 08cbed7d940e..1cb5c16e97b7 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2014 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -34,30 +31,12 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb,
ether_addr_copy(eth->h_dest, eth_hdr(oldskb)->h_source);
eth->h_proto = eth_hdr(oldskb)->h_proto;
skb_pull(nskb, ETH_HLEN);
-}
-
-static int nft_bridge_iphdr_validate(struct sk_buff *skb)
-{
- struct iphdr *iph;
- u32 len;
-
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- return 0;
-
- iph = ip_hdr(skb);
- if (iph->ihl < 5 || iph->version != 4)
- return 0;
-
- len = ntohs(iph->tot_len);
- if (skb->len < len)
- return 0;
- else if (len < (iph->ihl*4))
- return 0;
- if (!pskb_may_pull(skb, iph->ihl*4))
- return 0;
+ if (skb_vlan_tag_present(oldskb)) {
+ u16 vid = skb_vlan_tag_get(oldskb);
- return 1;
+ __vlan_hwaccel_put_tag(nskb, oldskb->vlan_proto, vid);
+ }
}
/* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT)
@@ -69,29 +48,11 @@ static void nft_reject_br_send_v4_tcp_reset(struct net *net,
int hook)
{
struct sk_buff *nskb;
- struct iphdr *niph;
- const struct tcphdr *oth;
- struct tcphdr _oth;
-
- if (!nft_bridge_iphdr_validate(oldskb))
- return;
-
- oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
- if (!oth)
- return;
- nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
- LL_MAX_HEADER, GFP_ATOMIC);
+ nskb = nf_reject_skb_v4_tcp_reset(net, oldskb, NULL, hook);
if (!nskb)
return;
- skb_reserve(nskb, LL_MAX_HEADER);
- niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
- net->ipv4.sysctl_ip_default_ttl);
- nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
- niph->tot_len = htons(nskb->len);
- ip_send_check(niph);
-
nft_reject_br_push_etherhdr(oldskb, nskb);
br_forward(br_port_get_rcu(dev), nskb, false, true);
@@ -103,138 +64,32 @@ static void nft_reject_br_send_v4_unreach(struct net *net,
int hook, u8 code)
{
struct sk_buff *nskb;
- struct iphdr *niph;
- struct icmphdr *icmph;
- unsigned int len;
- __wsum csum;
- u8 proto;
-
- if (!nft_bridge_iphdr_validate(oldskb))
- return;
-
- /* IP header checks: fragment. */
- if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
- return;
-
- /* RFC says return as much as we can without exceeding 576 bytes. */
- len = min_t(unsigned int, 536, oldskb->len);
-
- if (!pskb_may_pull(oldskb, len))
- return;
-
- if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len)))
- return;
- if (ip_hdr(oldskb)->protocol == IPPROTO_TCP ||
- ip_hdr(oldskb)->protocol == IPPROTO_UDP)
- proto = ip_hdr(oldskb)->protocol;
- else
- proto = 0;
-
- if (!skb_csum_unnecessary(oldskb) &&
- nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto))
- return;
-
- nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmphdr) +
- LL_MAX_HEADER + len, GFP_ATOMIC);
+ nskb = nf_reject_skb_v4_unreach(net, oldskb, NULL, hook, code);
if (!nskb)
return;
- skb_reserve(nskb, LL_MAX_HEADER);
- niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP,
- net->ipv4.sysctl_ip_default_ttl);
-
- skb_reset_transport_header(nskb);
- icmph = skb_put_zero(nskb, sizeof(struct icmphdr));
- icmph->type = ICMP_DEST_UNREACH;
- icmph->code = code;
-
- skb_put_data(nskb, skb_network_header(oldskb), len);
-
- csum = csum_partial((void *)icmph, len + sizeof(struct icmphdr), 0);
- icmph->checksum = csum_fold(csum);
-
- niph->tot_len = htons(nskb->len);
- ip_send_check(niph);
-
nft_reject_br_push_etherhdr(oldskb, nskb);
br_forward(br_port_get_rcu(dev), nskb, false, true);
}
-static int nft_bridge_ip6hdr_validate(struct sk_buff *skb)
-{
- struct ipv6hdr *hdr;
- u32 pkt_len;
-
- if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
- return 0;
-
- hdr = ipv6_hdr(skb);
- if (hdr->version != 6)
- return 0;
-
- pkt_len = ntohs(hdr->payload_len);
- if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
- return 0;
-
- return 1;
-}
-
static void nft_reject_br_send_v6_tcp_reset(struct net *net,
struct sk_buff *oldskb,
const struct net_device *dev,
int hook)
{
struct sk_buff *nskb;
- const struct tcphdr *oth;
- struct tcphdr _oth;
- unsigned int otcplen;
- struct ipv6hdr *nip6h;
- if (!nft_bridge_ip6hdr_validate(oldskb))
- return;
-
- oth = nf_reject_ip6_tcphdr_get(oldskb, &_oth, &otcplen, hook);
- if (!oth)
- return;
-
- nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct tcphdr) +
- LL_MAX_HEADER, GFP_ATOMIC);
+ nskb = nf_reject_skb_v6_tcp_reset(net, oldskb, NULL, hook);
if (!nskb)
return;
- skb_reserve(nskb, LL_MAX_HEADER);
- nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
- net->ipv6.devconf_all->hop_limit);
- nf_reject_ip6_tcphdr_put(nskb, oldskb, oth, otcplen);
- nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));
-
nft_reject_br_push_etherhdr(oldskb, nskb);
br_forward(br_port_get_rcu(dev), nskb, false, true);
}
-static bool reject6_br_csum_ok(struct sk_buff *skb, int hook)
-{
- const struct ipv6hdr *ip6h = ipv6_hdr(skb);
- int thoff;
- __be16 fo;
- u8 proto = ip6h->nexthdr;
-
- if (skb_csum_unnecessary(skb))
- return true;
-
- if (ip6h->payload_len &&
- pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h)))
- return false;
-
- thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo);
- if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
- return false;
-
- return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
-}
static void nft_reject_br_send_v6_unreach(struct net *net,
struct sk_buff *oldskb,
@@ -242,49 +97,11 @@ static void nft_reject_br_send_v6_unreach(struct net *net,
int hook, u8 code)
{
struct sk_buff *nskb;
- struct ipv6hdr *nip6h;
- struct icmp6hdr *icmp6h;
- unsigned int len;
-
- if (!nft_bridge_ip6hdr_validate(oldskb))
- return;
-
- /* Include "As much of invoking packet as possible without the ICMPv6
- * packet exceeding the minimum IPv6 MTU" in the ICMP payload.
- */
- len = min_t(unsigned int, 1220, oldskb->len);
-
- if (!pskb_may_pull(oldskb, len))
- return;
-
- if (!reject6_br_csum_ok(oldskb, hook))
- return;
- nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) +
- LL_MAX_HEADER + len, GFP_ATOMIC);
+ nskb = nf_reject_skb_v6_unreach(net, oldskb, NULL, hook, code);
if (!nskb)
return;
- skb_reserve(nskb, LL_MAX_HEADER);
- nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_ICMPV6,
- net->ipv6.devconf_all->hop_limit);
-
- skb_reset_transport_header(nskb);
- icmp6h = skb_put_zero(nskb, sizeof(struct icmp6hdr));
- icmp6h->icmp6_type = ICMPV6_DEST_UNREACH;
- icmp6h->icmp6_code = code;
-
- skb_put_data(nskb, skb_network_header(oldskb), len);
- nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));
-
- icmp6h->icmp6_cksum =
- csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr,
- nskb->len - sizeof(struct ipv6hdr),
- IPPROTO_ICMPV6,
- csum_partial(icmp6h,
- nskb->len - sizeof(struct ipv6hdr),
- 0));
-
nft_reject_br_push_etherhdr(oldskb, nskb);
br_forward(br_port_get_rcu(dev), nskb, false, true);
@@ -353,77 +170,21 @@ out:
}
static int nft_reject_bridge_validate(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nft_data **data)
+ const struct nft_expr *expr)
{
return nft_chain_validate_hooks(ctx->chain, (1 << NF_BR_PRE_ROUTING) |
(1 << NF_BR_LOCAL_IN));
}
-static int nft_reject_bridge_init(const struct nft_ctx *ctx,
- const struct nft_expr *expr,
- const struct nlattr * const tb[])
-{
- struct nft_reject *priv = nft_expr_priv(expr);
- int icmp_code;
-
- if (tb[NFTA_REJECT_TYPE] == NULL)
- return -EINVAL;
-
- priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE]));
- switch (priv->type) {
- case NFT_REJECT_ICMP_UNREACH:
- case NFT_REJECT_ICMPX_UNREACH:
- if (tb[NFTA_REJECT_ICMP_CODE] == NULL)
- return -EINVAL;
-
- icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]);
- if (priv->type == NFT_REJECT_ICMPX_UNREACH &&
- icmp_code > NFT_REJECT_ICMPX_MAX)
- return -EINVAL;
-
- priv->icmp_code = icmp_code;
- break;
- case NFT_REJECT_TCP_RST:
- break;
- default:
- return -EINVAL;
- }
- return 0;
-}
-
-static int nft_reject_bridge_dump(struct sk_buff *skb,
- const struct nft_expr *expr)
-{
- const struct nft_reject *priv = nft_expr_priv(expr);
-
- if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type)))
- goto nla_put_failure;
-
- switch (priv->type) {
- case NFT_REJECT_ICMP_UNREACH:
- case NFT_REJECT_ICMPX_UNREACH:
- if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code))
- goto nla_put_failure;
- break;
- default:
- break;
- }
-
- return 0;
-
-nla_put_failure:
- return -1;
-}
-
static struct nft_expr_type nft_reject_bridge_type;
static const struct nft_expr_ops nft_reject_bridge_ops = {
.type = &nft_reject_bridge_type,
.size = NFT_EXPR_SIZE(sizeof(struct nft_reject)),
.eval = nft_reject_bridge_eval,
- .init = nft_reject_bridge_init,
- .dump = nft_reject_bridge_dump,
+ .init = nft_reject_init,
+ .dump = nft_reject_dump,
.validate = nft_reject_bridge_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_reject_bridge_type __read_mostly = {
@@ -451,3 +212,4 @@ module_exit(nft_reject_bridge_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_AF_EXPR(AF_BRIDGE, "reject");
+MODULE_DESCRIPTION("Reject packets from bridge via nftables");
diff --git a/net/caif/Kconfig b/net/caif/Kconfig
index d3694953b1d7..87205251cc25 100644
--- a/net/caif/Kconfig
+++ b/net/caif/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# CAIF net configurations
#
@@ -6,17 +7,17 @@ menuconfig CAIF
tristate "CAIF support"
select CRC_CCITT
default n
- ---help---
+ help
The "Communication CPU to Application CPU Interface" (CAIF) is a packet
based connection-oriented MUX protocol developed by ST-Ericsson for use
with its modems. It is accessed from user space as sockets (PF_CAIF).
Say Y (or M) here if you build for a phone product (e.g. Android or
- MeeGo ) that uses CAIF as transport, if unsure say N.
+ MeeGo) that uses CAIF as transport. If unsure say N.
If you select to build it as module then CAIF_NETDEV also needs to be
- built as modules. You will also need to say yes to any CAIF physical
- devices that your platform requires.
+ built as a module. You will also need to say Y (or M) to any CAIF
+ physical devices that your platform requires.
See Documentation/networking/caif for a further explanation on how to
use and configure CAIF.
@@ -25,7 +26,7 @@ config CAIF_DEBUG
bool "Enable Debug"
depends on CAIF
default n
- ---help---
+ help
Enable the inclusion of debug code in the CAIF stack.
Be aware that doing this will impact performance.
If unsure say N.
@@ -34,9 +35,9 @@ config CAIF_NETDEV
tristate "CAIF GPRS Network device"
depends on CAIF
default CAIF
- ---help---
+ help
Say Y if you will be using a CAIF based GPRS network device.
- This can be either built-in or a loadable module,
+ This can be either built-in or a loadable module.
If you select to build it as a built-in then the main CAIF device must
also be a built-in.
If unsure say Y.
@@ -45,9 +46,9 @@ config CAIF_USB
tristate "CAIF USB support"
depends on CAIF
default n
- ---help---
+ help
Say Y if you are using CAIF over USB CDC NCM.
- This can be either built-in or a loadable module,
+ This can be either built-in or a loadable module.
If you select to build it as a built-in then the main CAIF device must
also be a built-in.
If unsure say N.
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index 711d7156efd8..24e85c5487ef 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -1,8 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* CAIF Interface registration.
* Copyright (C) ST-Ericsson AB 2010
* Author: Sjur Brendeland
- * License terms: GNU General Public License (GPL) version 2
*
* Borrowed heavily from file: pn_dev.c. Thanks to Remi Denis-Courmont
* and Sakari Ailus <sakari.ailus@nokia.com>
@@ -27,6 +27,7 @@
#include <net/caif/cfcnfg.h>
#include <net/caif/cfserl.h>
+MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol support");
MODULE_LICENSE("GPL");
/* Used for local tracking of the CAIF net devices */
@@ -112,7 +113,8 @@ static struct caif_device_entry *caif_get(struct net_device *dev)
caif_device_list(dev_net(dev));
struct caif_device_entry *caifd;
- list_for_each_entry_rcu(caifd, &caifdevs->list, list) {
+ list_for_each_entry_rcu(caifd, &caifdevs->list, list,
+ lockdep_rtnl_is_held()) {
if (caifd->netdev == dev)
return caifd;
}
@@ -141,7 +143,7 @@ static void caif_flow_cb(struct sk_buff *skb)
spin_lock_bh(&caifd->flow_lock);
send_xoff = caifd->xoff;
- caifd->xoff = 0;
+ caifd->xoff = false;
dtor = caifd->xoff_skb_dtor;
if (WARN_ON(caifd->xoff_skb != skb))
@@ -186,15 +188,19 @@ static int transmit(struct cflayer *layer, struct cfpkt *pkt)
goto noxoff;
if (likely(!netif_queue_stopped(caifd->netdev))) {
+ struct Qdisc *sch;
+
/* If we run with a TX queue, check if the queue is too long*/
txq = netdev_get_tx_queue(skb->dev, 0);
- qlen = qdisc_qlen(rcu_dereference_bh(txq->qdisc));
-
- if (likely(qlen == 0))
+ sch = rcu_dereference_bh(txq->qdisc);
+ if (likely(qdisc_is_empty(sch)))
goto noxoff;
+ /* can check for explicit qdisc len value only !NOLOCK,
+ * always set flow off otherwise
+ */
high = (caifd->netdev->tx_queue_len * q_high) / 100;
- if (likely(qlen < high))
+ if (!(sch->flags & TCQ_F_NOLOCK) && likely(sch->q.qlen < high))
goto noxoff;
}
@@ -215,7 +221,7 @@ static int transmit(struct cflayer *layer, struct cfpkt *pkt)
pr_debug("queue has stopped(%d) or is full (%d > %d)\n",
netif_queue_stopped(caifd->netdev),
qlen, high);
- caifd->xoff = 1;
+ caifd->xoff = true;
caifd->xoff_skb = skb;
caifd->xoff_skb_dtor = skb->destructor;
skb->destructor = caif_flow_cb;
@@ -263,7 +269,7 @@ static int receive(struct sk_buff *skb, struct net_device *dev,
err = caifd->layer.up->receive(caifd->layer.up, pkt);
- /* For -EILSEQ the packet is not freed so so it now */
+ /* For -EILSEQ the packet is not freed so free it now */
if (err == -EILSEQ)
cfpkt_destroy(pkt);
@@ -303,7 +309,7 @@ static void dev_flowctrl(struct net_device *dev, int on)
caifd_put(caifd);
}
-void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev,
+int caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev,
struct cflayer *link_support, int head_room,
struct cflayer **layer,
int (**rcv_func)(struct sk_buff *, struct net_device *,
@@ -314,11 +320,12 @@ void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev,
enum cfcnfg_phy_preference pref;
struct cfcnfg *cfg = get_cfcnfg(dev_net(dev));
struct caif_device_entry_list *caifdevs;
+ int res;
caifdevs = caif_device_list(dev_net(dev));
caifd = caif_device_alloc(dev);
if (!caifd)
- return;
+ return -ENOMEM;
*layer = &caifd->layer;
spin_lock_init(&caifd->flow_lock);
@@ -336,10 +343,10 @@ void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev,
mutex_lock(&caifdevs->lock);
list_add_rcu(&caifd->list, &caifdevs->list);
- strlcpy(caifd->layer.name, dev->name,
+ strscpy(caifd->layer.name, dev->name,
sizeof(caifd->layer.name));
caifd->layer.transmit = transmit;
- cfcnfg_add_phy_layer(cfg,
+ res = cfcnfg_add_phy_layer(cfg,
dev,
&caifd->layer,
pref,
@@ -349,6 +356,7 @@ void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev,
mutex_unlock(&caifdevs->lock);
if (rcv_func)
*rcv_func = receive;
+ return res;
}
EXPORT_SYMBOL(caif_enroll_dev);
@@ -363,6 +371,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
struct cflayer *layer, *link_support;
int head_room = 0;
struct caif_device_entry_list *caifdevs;
+ int res;
cfg = get_cfcnfg(dev_net(dev));
caifdevs = caif_device_list(dev_net(dev));
@@ -388,8 +397,10 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
break;
}
}
- caif_enroll_dev(dev, caifdev, link_support, head_room,
+ res = caif_enroll_dev(dev, caifdev, link_support, head_room,
&layer, NULL);
+ if (res)
+ cfserl_release(link_support);
caifdev->flowctrl = dev_flowctrl;
break;
@@ -402,7 +413,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
break;
}
- caifd->xoff = 0;
+ caifd->xoff = false;
cfcnfg_set_phy_state(cfg, &caifd->layer, true);
rcu_read_unlock();
@@ -437,7 +448,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
if (caifd->xoff_skb_dtor != NULL && caifd->xoff_skb != NULL)
caifd->xoff_skb->destructor = caifd->xoff_skb_dtor;
- caifd->xoff = 0;
+ caifd->xoff = false;
caifd->xoff_skb_dtor = NULL;
caifd->xoff_skb = NULL;
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index d18965f3291f..af218742af5a 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -1,11 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) ST-Ericsson AB 2010
* Author: Sjur Brendeland
- * License terms: GNU General Public License (GPL) version 2
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+#include <linux/filter.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -26,6 +27,7 @@
#include <net/caif/caif_dev.h>
#include <net/caif/cfpkt.h>
+MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol socket support (AF_CAIF)");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(AF_CAIF);
@@ -46,7 +48,7 @@ enum caif_states {
struct caifsock {
struct sock sk; /* must be first member */
struct cflayer layer;
- u32 flow_state;
+ unsigned long flow_state;
struct caif_connect_request conn_req;
struct mutex readlock;
struct dentry *debugfs_socket_dir;
@@ -55,38 +57,32 @@ struct caifsock {
static int rx_flow_is_on(struct caifsock *cf_sk)
{
- return test_bit(RX_FLOW_ON_BIT,
- (void *) &cf_sk->flow_state);
+ return test_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state);
}
static int tx_flow_is_on(struct caifsock *cf_sk)
{
- return test_bit(TX_FLOW_ON_BIT,
- (void *) &cf_sk->flow_state);
+ return test_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state);
}
static void set_rx_flow_off(struct caifsock *cf_sk)
{
- clear_bit(RX_FLOW_ON_BIT,
- (void *) &cf_sk->flow_state);
+ clear_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state);
}
static void set_rx_flow_on(struct caifsock *cf_sk)
{
- set_bit(RX_FLOW_ON_BIT,
- (void *) &cf_sk->flow_state);
+ set_bit(RX_FLOW_ON_BIT, &cf_sk->flow_state);
}
static void set_tx_flow_off(struct caifsock *cf_sk)
{
- clear_bit(TX_FLOW_ON_BIT,
- (void *) &cf_sk->flow_state);
+ clear_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state);
}
static void set_tx_flow_on(struct caifsock *cf_sk)
{
- set_bit(TX_FLOW_ON_BIT,
- (void *) &cf_sk->flow_state);
+ set_bit(TX_FLOW_ON_BIT, &cf_sk->flow_state);
}
static void caif_read_lock(struct sock *sk)
@@ -243,7 +239,7 @@ static void caif_ctrl_cb(struct cflayer *layr,
cf_sk->sk.sk_shutdown = SHUTDOWN_MASK;
cf_sk->sk.sk_err = ECONNRESET;
set_rx_flow_on(cf_sk);
- cf_sk->sk.sk_error_report(&cf_sk->sk);
+ sk_error_report(&cf_sk->sk);
break;
default:
@@ -281,7 +277,7 @@ static int caif_seqpkt_recvmsg(struct socket *sock, struct msghdr *m,
if (flags & MSG_OOB)
goto read_error;
- skb = skb_recv_datagram(sk, flags, 0 , &ret);
+ skb = skb_recv_datagram(sk, flags, &ret);
if (!skb)
goto read_error;
copylen = skb->len;
@@ -538,9 +534,6 @@ static int caif_seqpkt_sendmsg(struct socket *sock, struct msghdr *msg,
if (msg->msg_namelen)
goto err;
- ret = -EINVAL;
- if (unlikely(msg->msg_iter.iov->iov_base == NULL))
- goto err;
noblock = msg->msg_flags & MSG_DONTWAIT;
timeo = sock_sndtimeo(sk, noblock);
@@ -669,8 +662,8 @@ out_err:
return sent ? : err;
}
-static int setsockopt(struct socket *sock,
- int lvl, int opt, char __user *ov, unsigned int ol)
+static int setsockopt(struct socket *sock, int lvl, int opt, sockptr_t ov,
+ unsigned int ol)
{
struct sock *sk = sock->sk;
struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
@@ -685,7 +678,7 @@ static int setsockopt(struct socket *sock,
return -EINVAL;
if (lvl != SOL_CAIF)
goto bad_sol;
- if (copy_from_user(&linksel, ov, sizeof(int)))
+ if (copy_from_sockptr(&linksel, ov, sizeof(int)))
return -EINVAL;
lock_sock(&(cf_sk->sk));
cf_sk->conn_req.link_selector = linksel;
@@ -699,7 +692,7 @@ static int setsockopt(struct socket *sock,
return -ENOPROTOOPT;
lock_sock(&(cf_sk->sk));
if (ol > sizeof(cf_sk->conn_req.param.data) ||
- copy_from_user(&cf_sk->conn_req.param.data, ov, ol)) {
+ copy_from_sockptr(&cf_sk->conn_req.param.data, ov, ol)) {
release_sock(&cf_sk->sk);
return -EINVAL;
}
@@ -741,7 +734,7 @@ bad_sol:
* o sock->state: holds the SS_* socket state and is updated by connect and
* disconnect.
*/
-static int caif_connect(struct socket *sock, struct sockaddr *uaddr,
+static int caif_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
@@ -941,7 +934,7 @@ static __poll_t caif_poll(struct file *file,
__poll_t mask;
struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
- sock_poll_wait(file, wait);
+ sock_poll_wait(file, sock, wait);
mask = 0;
/* exceptional events? */
@@ -953,7 +946,7 @@ static __poll_t caif_poll(struct file *file,
mask |= EPOLLRDHUP;
/* readable? */
- if (!skb_queue_empty(&sk->sk_receive_queue) ||
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue) ||
(sk->sk_shutdown & RCV_SHUTDOWN))
mask |= EPOLLIN | EPOLLRDNORM;
@@ -981,11 +974,9 @@ static const struct proto_ops caif_seqpacket_ops = {
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = setsockopt,
- .getsockopt = sock_no_getsockopt,
.sendmsg = caif_seqpkt_sendmsg,
.recvmsg = caif_seqpkt_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
};
static const struct proto_ops caif_stream_ops = {
@@ -1002,11 +993,9 @@ static const struct proto_ops caif_stream_ops = {
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = setsockopt,
- .getsockopt = sock_no_getsockopt,
.sendmsg = caif_stream_sendmsg,
.recvmsg = caif_stream_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
};
/* This function is called when a socket is finally destroyed. */
@@ -1021,6 +1010,7 @@ static void caif_sock_destructor(struct sock *sk)
return;
}
sk_stream_kill_queues(&cf_sk->sk);
+ WARN_ON_ONCE(sk->sk_forward_alloc);
caif_free_client(&cf_sk->layer);
}
diff --git a/net/caif/caif_usb.c b/net/caif/caif_usb.c
index 1a082a946045..5dc05a1e3178 100644
--- a/net/caif/caif_usb.c
+++ b/net/caif/caif_usb.c
@@ -1,9 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* CAIF USB handler
* Copyright (C) ST-Ericsson AB 2011
* Author: Sjur Brendeland
- * License terms: GNU General Public License (GPL) version 2
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
@@ -21,6 +20,7 @@
#include <net/caif/cfpkt.h>
#include <net/caif/cfcnfg.h>
+MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol USB support");
MODULE_LICENSE("GPL");
#define CFUSB_PAD_DESCR_SZ 1 /* Alignment descriptor length */
@@ -63,7 +63,7 @@ static int cfusbl_transmit(struct cflayer *layr, struct cfpkt *pkt)
hpad = (info->hdr_len + CFUSB_PAD_DESCR_SZ) & (CFUSB_ALIGNMENT - 1);
if (skb_headroom(skb) < ETH_HLEN + CFUSB_PAD_DESCR_SZ + hpad) {
- pr_warn("Headroom to small\n");
+ pr_warn("Headroom too small\n");
kfree_skb(skb);
return -EIO;
}
@@ -82,7 +82,7 @@ static void cfusbl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
layr->up->ctrlcmd(layr->up, ctrl, layr->id);
}
-static struct cflayer *cfusbl_create(int phyid, u8 ethaddr[ETH_ALEN],
+static struct cflayer *cfusbl_create(int phyid, const u8 ethaddr[ETH_ALEN],
u8 braddr[ETH_ALEN])
{
struct cfusbl *this = kmalloc(sizeof(struct cfusbl), GFP_ATOMIC);
@@ -116,6 +116,11 @@ static struct cflayer *cfusbl_create(int phyid, u8 ethaddr[ETH_ALEN],
return (struct cflayer *) this;
}
+static void cfusbl_release(struct cflayer *layer)
+{
+ kfree(layer);
+}
+
static struct packet_type caif_usb_type __read_mostly = {
.type = cpu_to_be16(ETH_P_802_EX1),
};
@@ -128,6 +133,10 @@ static int cfusbl_device_notify(struct notifier_block *me, unsigned long what,
struct cflayer *layer, *link_support;
struct usbnet *usbnet;
struct usb_device *usbdev;
+ int res;
+
+ if (what == NETDEV_UNREGISTER && dev->reg_state >= NETREG_UNREGISTERED)
+ return 0;
/* Check whether we have a NCM device, and find its VID/PID. */
if (!(dev->dev.parent && dev->dev.parent->driver &&
@@ -170,15 +179,21 @@ static int cfusbl_device_notify(struct notifier_block *me, unsigned long what,
if (dev->num_tx_queues > 1)
pr_warn("USB device uses more than one tx queue\n");
- caif_enroll_dev(dev, &common, link_support, CFUSB_MAX_HEADLEN,
+ res = caif_enroll_dev(dev, &common, link_support, CFUSB_MAX_HEADLEN,
&layer, &caif_usb_type.func);
+ if (res)
+ goto err;
+
if (!pack_added)
dev_add_pack(&caif_usb_type);
pack_added = true;
- strlcpy(layer->name, dev->name, sizeof(layer->name));
+ strscpy(layer->name, dev->name, sizeof(layer->name));
return 0;
+err:
+ cfusbl_release(link_support);
+ return res;
}
static struct notifier_block caif_device_notifier = {
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
index 8f00bea093b9..52509e185960 100644
--- a/net/caif/cfcnfg.c
+++ b/net/caif/cfcnfg.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) ST-Ericsson AB 2010
* Author: Sjur Brendeland
- * License terms: GNU General Public License (GPL) version 2
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
@@ -268,14 +268,14 @@ static int caif_connect_req_to_link_param(struct cfcnfg *cnfg,
case CAIFPROTO_RFM:
l->linktype = CFCTRL_SRV_RFM;
l->u.datagram.connid = s->sockaddr.u.rfm.connection_id;
- strlcpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume,
+ strscpy(l->u.rfm.volume, s->sockaddr.u.rfm.volume,
sizeof(l->u.rfm.volume));
break;
case CAIFPROTO_UTIL:
l->linktype = CFCTRL_SRV_UTIL;
l->endpoint = 0x00;
l->chtype = 0x00;
- strlcpy(l->u.utility.name, s->sockaddr.u.util.service,
+ strscpy(l->u.utility.name, s->sockaddr.u.util.service,
sizeof(l->u.utility.name));
caif_assert(sizeof(l->u.utility.name) > 10);
l->u.utility.paramlen = s->param.size;
@@ -450,7 +450,7 @@ unlock:
rcu_read_unlock();
}
-void
+int
cfcnfg_add_phy_layer(struct cfcnfg *cnfg,
struct net_device *dev, struct cflayer *phy_layer,
enum cfcnfg_phy_preference pref,
@@ -459,7 +459,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg,
{
struct cflayer *frml;
struct cfcnfg_phyinfo *phyinfo = NULL;
- int i;
+ int i, res = 0;
u8 phyid;
mutex_lock(&cnfg->lock);
@@ -473,12 +473,15 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg,
goto got_phyid;
}
pr_warn("Too many CAIF Link Layers (max 6)\n");
+ res = -EEXIST;
goto out;
got_phyid:
phyinfo = kzalloc(sizeof(struct cfcnfg_phyinfo), GFP_ATOMIC);
- if (!phyinfo)
- goto out_err;
+ if (!phyinfo) {
+ res = -ENOMEM;
+ goto out;
+ }
phy_layer->id = phyid;
phyinfo->pref = pref;
@@ -492,8 +495,10 @@ got_phyid:
frml = cffrml_create(phyid, fcs);
- if (!frml)
+ if (!frml) {
+ res = -ENOMEM;
goto out_err;
+ }
phyinfo->frm_layer = frml;
layer_set_up(frml, cnfg->mux);
@@ -511,11 +516,12 @@ got_phyid:
list_add_rcu(&phyinfo->node, &cnfg->phys);
out:
mutex_unlock(&cnfg->lock);
- return;
+ return res;
out_err:
kfree(phyinfo);
mutex_unlock(&cnfg->lock);
+ return res;
}
EXPORT_SYMBOL(cfcnfg_add_phy_layer);
diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c
index a1e85f032108..2aa1e7d46eb2 100644
--- a/net/caif/cfctrl.c
+++ b/net/caif/cfctrl.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) ST-Ericsson AB 2010
* Author: Sjur Brendeland
- * License terms: GNU General Public License (GPL) version 2
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
@@ -201,14 +201,14 @@ int cfctrl_linkup_request(struct cflayer *layer,
struct cflayer *user_layer)
{
struct cfctrl *cfctrl = container_obj(layer);
+ struct cflayer *dn = cfctrl->serv.layer.dn;
+ char utility_name[UTILITY_NAME_LENGTH];
+ struct cfctrl_request_info *req;
+ struct cfpkt *pkt;
u32 tmp32;
u16 tmp16;
u8 tmp8;
- struct cfctrl_request_info *req;
int ret;
- char utility_name[16];
- struct cfpkt *pkt;
- struct cflayer *dn = cfctrl->serv.layer.dn;
if (!dn) {
pr_debug("not able to send linkup request\n");
@@ -257,9 +257,7 @@ int cfctrl_linkup_request(struct cflayer *layer,
cfpkt_add_body(pkt, &tmp16, 2);
tmp16 = cpu_to_le16(param->u.utility.fifosize_bufs);
cfpkt_add_body(pkt, &tmp16, 2);
- memset(utility_name, 0, sizeof(utility_name));
- strlcpy(utility_name, param->u.utility.name,
- UTILITY_NAME_LENGTH);
+ strscpy_pad(utility_name, param->u.utility.name);
cfpkt_add_body(pkt, utility_name, UTILITY_NAME_LENGTH);
tmp8 = param->u.utility.paramlen;
cfpkt_add_body(pkt, &tmp8, 1);
@@ -269,11 +267,15 @@ int cfctrl_linkup_request(struct cflayer *layer,
default:
pr_warn("Request setup of bad link type = %d\n",
param->linktype);
+ cfpkt_destroy(pkt);
return -EINVAL;
}
req = kzalloc(sizeof(*req), GFP_KERNEL);
- if (!req)
+ if (!req) {
+ cfpkt_destroy(pkt);
return -ENOMEM;
+ }
+
req->client_layer = user_layer;
req->cmd = CFCTRL_CMD_LINK_SETUP;
req->param = *param;
@@ -347,17 +349,154 @@ int cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer)
return found;
}
+static int cfctrl_link_setup(struct cfctrl *cfctrl, struct cfpkt *pkt, u8 cmdrsp)
+{
+ u8 len;
+ u8 linkid = 0;
+ enum cfctrl_srv serv;
+ enum cfctrl_srv servtype;
+ u8 endpoint;
+ u8 physlinkid;
+ u8 prio;
+ u8 tmp;
+ u8 *cp;
+ int i;
+ struct cfctrl_link_param linkparam;
+ struct cfctrl_request_info rsp, *req;
+
+ memset(&linkparam, 0, sizeof(linkparam));
+
+ tmp = cfpkt_extr_head_u8(pkt);
+
+ serv = tmp & CFCTRL_SRV_MASK;
+ linkparam.linktype = serv;
+
+ servtype = tmp >> 4;
+ linkparam.chtype = servtype;
+
+ tmp = cfpkt_extr_head_u8(pkt);
+ physlinkid = tmp & 0x07;
+ prio = tmp >> 3;
+
+ linkparam.priority = prio;
+ linkparam.phyid = physlinkid;
+ endpoint = cfpkt_extr_head_u8(pkt);
+ linkparam.endpoint = endpoint & 0x03;
+
+ switch (serv) {
+ case CFCTRL_SRV_VEI:
+ case CFCTRL_SRV_DBG:
+ if (CFCTRL_ERR_BIT & cmdrsp)
+ break;
+ /* Link ID */
+ linkid = cfpkt_extr_head_u8(pkt);
+ break;
+ case CFCTRL_SRV_VIDEO:
+ tmp = cfpkt_extr_head_u8(pkt);
+ linkparam.u.video.connid = tmp;
+ if (CFCTRL_ERR_BIT & cmdrsp)
+ break;
+ /* Link ID */
+ linkid = cfpkt_extr_head_u8(pkt);
+ break;
+
+ case CFCTRL_SRV_DATAGRAM:
+ linkparam.u.datagram.connid = cfpkt_extr_head_u32(pkt);
+ if (CFCTRL_ERR_BIT & cmdrsp)
+ break;
+ /* Link ID */
+ linkid = cfpkt_extr_head_u8(pkt);
+ break;
+ case CFCTRL_SRV_RFM:
+ /* Construct a frame, convert
+ * DatagramConnectionID
+ * to network format long and copy it out...
+ */
+ linkparam.u.rfm.connid = cfpkt_extr_head_u32(pkt);
+ cp = (u8 *) linkparam.u.rfm.volume;
+ for (tmp = cfpkt_extr_head_u8(pkt);
+ cfpkt_more(pkt) && tmp != '\0';
+ tmp = cfpkt_extr_head_u8(pkt))
+ *cp++ = tmp;
+ *cp = '\0';
+
+ if (CFCTRL_ERR_BIT & cmdrsp)
+ break;
+ /* Link ID */
+ linkid = cfpkt_extr_head_u8(pkt);
+
+ break;
+ case CFCTRL_SRV_UTIL:
+ /* Construct a frame, convert
+ * DatagramConnectionID
+ * to network format long and copy it out...
+ */
+ /* Fifosize KB */
+ linkparam.u.utility.fifosize_kb = cfpkt_extr_head_u16(pkt);
+ /* Fifosize bufs */
+ linkparam.u.utility.fifosize_bufs = cfpkt_extr_head_u16(pkt);
+ /* name */
+ cp = (u8 *) linkparam.u.utility.name;
+ caif_assert(sizeof(linkparam.u.utility.name)
+ >= UTILITY_NAME_LENGTH);
+ for (i = 0; i < UTILITY_NAME_LENGTH && cfpkt_more(pkt); i++) {
+ tmp = cfpkt_extr_head_u8(pkt);
+ *cp++ = tmp;
+ }
+ /* Length */
+ len = cfpkt_extr_head_u8(pkt);
+ linkparam.u.utility.paramlen = len;
+ /* Param Data */
+ cp = linkparam.u.utility.params;
+ while (cfpkt_more(pkt) && len--) {
+ tmp = cfpkt_extr_head_u8(pkt);
+ *cp++ = tmp;
+ }
+ if (CFCTRL_ERR_BIT & cmdrsp)
+ break;
+ /* Link ID */
+ linkid = cfpkt_extr_head_u8(pkt);
+ /* Length */
+ len = cfpkt_extr_head_u8(pkt);
+ /* Param Data */
+ cfpkt_extr_head(pkt, NULL, len);
+ break;
+ default:
+ pr_warn("Request setup, invalid type (%d)\n", serv);
+ return -1;
+ }
+
+ rsp.cmd = CFCTRL_CMD_LINK_SETUP;
+ rsp.param = linkparam;
+ spin_lock_bh(&cfctrl->info_list_lock);
+ req = cfctrl_remove_req(cfctrl, &rsp);
+
+ if (CFCTRL_ERR_BIT == (CFCTRL_ERR_BIT & cmdrsp) ||
+ cfpkt_erroneous(pkt)) {
+ pr_err("Invalid O/E bit or parse error "
+ "on CAIF control channel\n");
+ cfctrl->res.reject_rsp(cfctrl->serv.layer.up, 0,
+ req ? req->client_layer : NULL);
+ } else {
+ cfctrl->res.linksetup_rsp(cfctrl->serv.layer.up, linkid,
+ serv, physlinkid,
+ req ? req->client_layer : NULL);
+ }
+
+ kfree(req);
+
+ spin_unlock_bh(&cfctrl->info_list_lock);
+
+ return 0;
+}
+
static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
{
u8 cmdrsp;
u8 cmd;
- int ret = -1;
- u8 len;
- u8 param[255];
+ int ret = 0;
u8 linkid = 0;
struct cfctrl *cfctrl = container_obj(layer);
- struct cfctrl_request_info rsp, *req;
-
cmdrsp = cfpkt_extr_head_u8(pkt);
cmd = cmdrsp & CFCTRL_CMD_MASK;
@@ -370,150 +509,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
switch (cmd) {
case CFCTRL_CMD_LINK_SETUP:
- {
- enum cfctrl_srv serv;
- enum cfctrl_srv servtype;
- u8 endpoint;
- u8 physlinkid;
- u8 prio;
- u8 tmp;
- u8 *cp;
- int i;
- struct cfctrl_link_param linkparam;
- memset(&linkparam, 0, sizeof(linkparam));
-
- tmp = cfpkt_extr_head_u8(pkt);
-
- serv = tmp & CFCTRL_SRV_MASK;
- linkparam.linktype = serv;
-
- servtype = tmp >> 4;
- linkparam.chtype = servtype;
-
- tmp = cfpkt_extr_head_u8(pkt);
- physlinkid = tmp & 0x07;
- prio = tmp >> 3;
-
- linkparam.priority = prio;
- linkparam.phyid = physlinkid;
- endpoint = cfpkt_extr_head_u8(pkt);
- linkparam.endpoint = endpoint & 0x03;
-
- switch (serv) {
- case CFCTRL_SRV_VEI:
- case CFCTRL_SRV_DBG:
- if (CFCTRL_ERR_BIT & cmdrsp)
- break;
- /* Link ID */
- linkid = cfpkt_extr_head_u8(pkt);
- break;
- case CFCTRL_SRV_VIDEO:
- tmp = cfpkt_extr_head_u8(pkt);
- linkparam.u.video.connid = tmp;
- if (CFCTRL_ERR_BIT & cmdrsp)
- break;
- /* Link ID */
- linkid = cfpkt_extr_head_u8(pkt);
- break;
-
- case CFCTRL_SRV_DATAGRAM:
- linkparam.u.datagram.connid =
- cfpkt_extr_head_u32(pkt);
- if (CFCTRL_ERR_BIT & cmdrsp)
- break;
- /* Link ID */
- linkid = cfpkt_extr_head_u8(pkt);
- break;
- case CFCTRL_SRV_RFM:
- /* Construct a frame, convert
- * DatagramConnectionID
- * to network format long and copy it out...
- */
- linkparam.u.rfm.connid =
- cfpkt_extr_head_u32(pkt);
- cp = (u8 *) linkparam.u.rfm.volume;
- for (tmp = cfpkt_extr_head_u8(pkt);
- cfpkt_more(pkt) && tmp != '\0';
- tmp = cfpkt_extr_head_u8(pkt))
- *cp++ = tmp;
- *cp = '\0';
-
- if (CFCTRL_ERR_BIT & cmdrsp)
- break;
- /* Link ID */
- linkid = cfpkt_extr_head_u8(pkt);
-
- break;
- case CFCTRL_SRV_UTIL:
- /* Construct a frame, convert
- * DatagramConnectionID
- * to network format long and copy it out...
- */
- /* Fifosize KB */
- linkparam.u.utility.fifosize_kb =
- cfpkt_extr_head_u16(pkt);
- /* Fifosize bufs */
- linkparam.u.utility.fifosize_bufs =
- cfpkt_extr_head_u16(pkt);
- /* name */
- cp = (u8 *) linkparam.u.utility.name;
- caif_assert(sizeof(linkparam.u.utility.name)
- >= UTILITY_NAME_LENGTH);
- for (i = 0;
- i < UTILITY_NAME_LENGTH
- && cfpkt_more(pkt); i++) {
- tmp = cfpkt_extr_head_u8(pkt);
- *cp++ = tmp;
- }
- /* Length */
- len = cfpkt_extr_head_u8(pkt);
- linkparam.u.utility.paramlen = len;
- /* Param Data */
- cp = linkparam.u.utility.params;
- while (cfpkt_more(pkt) && len--) {
- tmp = cfpkt_extr_head_u8(pkt);
- *cp++ = tmp;
- }
- if (CFCTRL_ERR_BIT & cmdrsp)
- break;
- /* Link ID */
- linkid = cfpkt_extr_head_u8(pkt);
- /* Length */
- len = cfpkt_extr_head_u8(pkt);
- /* Param Data */
- cfpkt_extr_head(pkt, &param, len);
- break;
- default:
- pr_warn("Request setup, invalid type (%d)\n",
- serv);
- goto error;
- }
-
- rsp.cmd = cmd;
- rsp.param = linkparam;
- spin_lock_bh(&cfctrl->info_list_lock);
- req = cfctrl_remove_req(cfctrl, &rsp);
-
- if (CFCTRL_ERR_BIT == (CFCTRL_ERR_BIT & cmdrsp) ||
- cfpkt_erroneous(pkt)) {
- pr_err("Invalid O/E bit or parse error "
- "on CAIF control channel\n");
- cfctrl->res.reject_rsp(cfctrl->serv.layer.up,
- 0,
- req ? req->client_layer
- : NULL);
- } else {
- cfctrl->res.linksetup_rsp(cfctrl->serv.
- layer.up, linkid,
- serv, physlinkid,
- req ? req->
- client_layer : NULL);
- }
-
- kfree(req);
-
- spin_unlock_bh(&cfctrl->info_list_lock);
- }
+ ret = cfctrl_link_setup(cfctrl, pkt, cmdrsp);
break;
case CFCTRL_CMD_LINK_DESTROY:
linkid = cfpkt_extr_head_u8(pkt);
@@ -540,9 +536,9 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
break;
default:
pr_err("Unrecognized Control Frame\n");
+ ret = -1;
goto error;
}
- ret = 0;
error:
cfpkt_destroy(pkt);
return ret;
diff --git a/net/caif/cfdbgl.c b/net/caif/cfdbgl.c
index 7aae0b56829e..77f428428b47 100644
--- a/net/caif/cfdbgl.c
+++ b/net/caif/cfdbgl.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) ST-Ericsson AB 2010
* Author: Sjur Brendeland
- * License terms: GNU General Public License (GPL) version 2
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
@@ -26,7 +26,7 @@ struct cflayer *cfdbgl_create(u8 channel_id, struct dev_info *dev_info)
cfsrvl_init(dbg, channel_id, dev_info, false);
dbg->layer.receive = cfdbgl_receive;
dbg->layer.transmit = cfdbgl_transmit;
- snprintf(dbg->layer.name, CAIF_LAYER_NAME_SZ - 1, "dbg%d", channel_id);
+ snprintf(dbg->layer.name, CAIF_LAYER_NAME_SZ, "dbg%d", channel_id);
return &dbg->layer;
}
diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c
index 3bdddb32d55a..eb6f8ef47a79 100644
--- a/net/caif/cfdgml.c
+++ b/net/caif/cfdgml.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) ST-Ericsson AB 2010
* Author: Sjur Brendeland
- * License terms: GNU General Public License (GPL) version 2
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
@@ -33,8 +33,7 @@ struct cflayer *cfdgml_create(u8 channel_id, struct dev_info *dev_info)
cfsrvl_init(dgm, channel_id, dev_info, true);
dgm->layer.receive = cfdgml_receive;
dgm->layer.transmit = cfdgml_transmit;
- snprintf(dgm->layer.name, CAIF_LAYER_NAME_SZ - 1, "dgm%d", channel_id);
- dgm->layer.name[CAIF_LAYER_NAME_SZ - 1] = '\0';
+ snprintf(dgm->layer.name, CAIF_LAYER_NAME_SZ, "dgm%d", channel_id);
return &dgm->layer;
}
diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c
index 434ba8557826..6651a8dc62e0 100644
--- a/net/caif/cffrml.c
+++ b/net/caif/cffrml.c
@@ -1,9 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* CAIF Framing Layer.
*
* Copyright (C) ST-Ericsson AB 2010
* Author: Sjur Brendeland
- * License terms: GNU General Public License (GPL) version 2
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c
index 510aa5a753f0..4172b0d0db63 100644
--- a/net/caif/cfmuxl.c
+++ b/net/caif/cfmuxl.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) ST-Ericsson AB 2010
* Author: Sjur Brendeland
- * License terms: GNU General Public License (GPL) version 2
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c
index 38c2b7a890dd..96236d21b18e 100644
--- a/net/caif/cfpkt_skbuff.c
+++ b/net/caif/cfpkt_skbuff.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) ST-Ericsson AB 2010
* Author: Sjur Brendeland
- * License terms: GNU General Public License (GPL) version 2
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
@@ -21,13 +21,6 @@ do { \
pr_warn(errmsg); \
} while (0)
-struct cfpktq {
- struct sk_buff_head head;
- atomic_t count;
- /* Lock protects count updates */
- spinlock_t lock;
-};
-
/*
* net/caif/ is generic and does not
* understand SKB, so we do this typecast
@@ -305,10 +298,8 @@ struct cfpkt *cfpkt_append(struct cfpkt *dstpkt,
if (unlikely(is_erronous(dstpkt) || is_erronous(addpkt))) {
return dstpkt;
}
- if (expectlen > addlen)
- neededtailspace = expectlen;
- else
- neededtailspace = addlen;
+
+ neededtailspace = max(expectlen, addlen);
if (dst->tail + neededtailspace > dst->end) {
/* Create a dumplicate of 'dst' with more tail space */
@@ -319,16 +310,12 @@ struct cfpkt *cfpkt_append(struct cfpkt *dstpkt,
if (tmppkt == NULL)
return NULL;
tmp = pkt_to_skb(tmppkt);
- skb_set_tail_pointer(tmp, dstlen);
- tmp->len = dstlen;
- memcpy(tmp->data, dst->data, dstlen);
+ skb_put_data(tmp, dst->data, dstlen);
cfpkt_destroy(dstpkt);
dst = tmp;
}
- memcpy(skb_tail_pointer(dst), add->data, skb_headlen(add));
+ skb_put_data(dst, add->data, skb_headlen(add));
cfpkt_destroy(addpkt);
- dst->tail += addlen;
- dst->len += addlen;
return skb_to_pkt(dst);
}
@@ -359,13 +346,11 @@ struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos)
if (skb2 == NULL)
return NULL;
+ skb_put_data(skb2, split, len2nd);
+
/* Reduce the length of the original packet */
- skb_set_tail_pointer(skb, pos);
- skb->len = pos;
+ skb_trim(skb, pos);
- memcpy(skb2->data, split, len2nd);
- skb2->tail += len2nd;
- skb2->len += len2nd;
skb2->priority = skb->priority;
return skb_to_pkt(skb2);
}
diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c
index b82440e1fcb4..3c335057f255 100644
--- a/net/caif/cfrfml.c
+++ b/net/caif/cfrfml.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) ST-Ericsson AB 2010
* Author: Sjur Brendeland
- * License terms: GNU General Public License (GPL) version 2
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
@@ -9,7 +9,7 @@
#include <linux/stddef.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <net/caif/caif_layer.h>
#include <net/caif/cfsrvl.h>
#include <net/caif/cfpkt.h>
@@ -116,7 +116,7 @@ static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt)
if (segmented) {
if (rfml->incomplete_frm == NULL) {
/* Initial Segment */
- if (cfpkt_peek_head(pkt, rfml->seghead, 6) < 0)
+ if (cfpkt_peek_head(pkt, rfml->seghead, 6) != 0)
goto out;
rfml->pdu_size = get_unaligned_le16(rfml->seghead+4);
@@ -233,7 +233,7 @@ static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt)
if (cfpkt_getlen(pkt) > rfml->fragment_size + RFM_HEAD_SIZE)
err = cfpkt_peek_head(pkt, head, 6);
- if (err < 0)
+ if (err != 0)
goto out;
while (cfpkt_getlen(frontpkt) > rfml->fragment_size + RFM_HEAD_SIZE) {
@@ -264,9 +264,6 @@ static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt)
frontpkt = rearpkt;
rearpkt = NULL;
- err = -ENOMEM;
- if (frontpkt == NULL)
- goto out;
err = -EPROTO;
if (cfpkt_add_head(frontpkt, head, 6) < 0)
goto out;
diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c
index ce60f06d76de..aee11c74d3c8 100644
--- a/net/caif/cfserl.c
+++ b/net/caif/cfserl.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) ST-Ericsson AB 2010
* Author: Sjur Brendeland
- * License terms: GNU General Public License (GPL) version 2
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
@@ -31,6 +31,11 @@ static int cfserl_transmit(struct cflayer *layr, struct cfpkt *pkt);
static void cfserl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
int phyid);
+void cfserl_release(struct cflayer *layer)
+{
+ kfree(layer);
+}
+
struct cflayer *cfserl_create(int instance, bool use_stx)
{
struct cfserl *this = kzalloc(sizeof(struct cfserl), GFP_ATOMIC);
@@ -123,7 +128,6 @@ static int cfserl_receive(struct cflayer *l, struct cfpkt *newpkt)
if (pkt != NULL)
cfpkt_destroy(pkt);
layr->incomplete_frm = NULL;
- expectlen = 0;
spin_unlock(&layr->sync);
return -EPROTO;
}
diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c
index a6e115463052..171fa32ada85 100644
--- a/net/caif/cfsrvl.c
+++ b/net/caif/cfsrvl.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) ST-Ericsson AB 2010
* Author: Sjur Brendeland
- * License terms: GNU General Public License (GPL) version 2
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
@@ -21,7 +21,6 @@
#define SRVL_FLOW_OFF 0x81
#define SRVL_FLOW_ON 0x80
#define SRVL_SET_PIN 0x82
-#define SRVL_CTRL_PKT_SIZE 1
#define container_obj(layr) container_of(layr, struct cfsrvl, layer)
@@ -184,12 +183,6 @@ bool cfsrvl_ready(struct cfsrvl *service, int *err)
return true;
}
-u8 cfsrvl_getphyid(struct cflayer *layer)
-{
- struct cfsrvl *servl = container_obj(layer);
- return servl->dev_info.id;
-}
-
bool cfsrvl_phyid_match(struct cflayer *layer, int phyid)
{
struct cfsrvl *servl = container_obj(layer);
diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c
index 1728fa4471cf..b2e47ede912f 100644
--- a/net/caif/cfutill.c
+++ b/net/caif/cfutill.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) ST-Ericsson AB 2010
* Author: Sjur Brendeland
- * License terms: GNU General Public License (GPL) version 2
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
@@ -33,7 +33,7 @@ struct cflayer *cfutill_create(u8 channel_id, struct dev_info *dev_info)
cfsrvl_init(util, channel_id, dev_info, true);
util->layer.receive = cfutill_receive;
util->layer.transmit = cfutill_transmit;
- snprintf(util->layer.name, CAIF_LAYER_NAME_SZ - 1, "util1");
+ snprintf(util->layer.name, CAIF_LAYER_NAME_SZ, "util1");
return &util->layer;
}
diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c
index 262224581efa..db2274b94a5d 100644
--- a/net/caif/cfveil.c
+++ b/net/caif/cfveil.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) ST-Ericsson AB 2010
* Author: Sjur Brendeland
- * License terms: GNU General Public License (GPL) version 2
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
@@ -32,7 +32,7 @@ struct cflayer *cfvei_create(u8 channel_id, struct dev_info *dev_info)
cfsrvl_init(vei, channel_id, dev_info, true);
vei->layer.receive = cfvei_receive;
vei->layer.transmit = cfvei_transmit;
- snprintf(vei->layer.name, CAIF_LAYER_NAME_SZ - 1, "vei%d", channel_id);
+ snprintf(vei->layer.name, CAIF_LAYER_NAME_SZ, "vei%d", channel_id);
return &vei->layer;
}
diff --git a/net/caif/cfvidl.c b/net/caif/cfvidl.c
index b3b110e8a350..134bad43196c 100644
--- a/net/caif/cfvidl.c
+++ b/net/caif/cfvidl.c
@@ -1,7 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) ST-Ericsson AB 2010
* Author: Sjur Brendeland
- * License terms: GNU General Public License (GPL) version 2
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
@@ -29,7 +29,7 @@ struct cflayer *cfvidl_create(u8 channel_id, struct dev_info *dev_info)
cfsrvl_init(vid, channel_id, dev_info, false);
vid->layer.receive = cfvidl_receive;
vid->layer.transmit = cfvidl_transmit;
- snprintf(vid->layer.name, CAIF_LAYER_NAME_SZ - 1, "vid1");
+ snprintf(vid->layer.name, CAIF_LAYER_NAME_SZ, "vid1");
return &vid->layer;
}
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 13e2ae6be620..fa6a3c2634a8 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -1,8 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) ST-Ericsson AB 2010
* Authors: Sjur Brendeland
* Daniel Martensson
- * License terms: GNU General Public License (GPL) version 2
*/
#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
@@ -31,6 +31,7 @@
/*This list is protected by the rtnl lock. */
static LIST_HEAD(chnl_net_list);
+MODULE_DESCRIPTION("ST-Ericsson CAIF modem protocol GPRS network device");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("caif");
@@ -46,27 +47,12 @@ struct chnl_net {
struct caif_connect_request conn_req;
struct list_head list_field;
struct net_device *netdev;
- char name[256];
wait_queue_head_t netmgmt_wq;
/* Flow status to remember and control the transmission. */
bool flowenabled;
enum caif_states state;
};
-static void robust_list_del(struct list_head *delete_node)
-{
- struct list_head *list_node;
- struct list_head *n;
- ASSERT_RTNL();
- list_for_each_safe(list_node, n, &chnl_net_list) {
- if (list_node == delete_node) {
- list_del(list_node);
- return;
- }
- }
- WARN_ON(1);
-}
-
static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt)
{
struct sk_buff *skb;
@@ -76,8 +62,6 @@ static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt)
u8 buf;
priv = container_of(layr, struct chnl_net, chnl);
- if (!priv)
- return -EINVAL;
skb = (struct sk_buff *) cfpkt_tonative(pkt);
@@ -115,10 +99,7 @@ static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt)
else
skb->ip_summed = CHECKSUM_NONE;
- if (in_interrupt())
- netif_rx(skb);
- else
- netif_rx_ni(skb);
+ netif_rx(skb);
/* Update statistics. */
priv->netdev->stats.rx_packets++;
@@ -211,7 +192,8 @@ static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow,
}
}
-static int chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t chnl_net_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
struct chnl_net *priv;
struct cfpkt *pkt = NULL;
@@ -328,9 +310,6 @@ static int chnl_net_open(struct net_device *dev)
if (result == 0) {
pr_debug("connect timeout\n");
- caif_disconnect_client(dev_net(dev), &priv->chnl);
- priv->state = CAIF_DISCONNECTED;
- pr_debug("state disconnected\n");
result = -ETIMEDOUT;
goto error;
}
@@ -367,7 +346,7 @@ static int chnl_net_init(struct net_device *dev)
struct chnl_net *priv;
ASSERT_RTNL();
priv = netdev_priv(dev);
- strncpy(priv->name, dev->name, sizeof(priv->name));
+ INIT_LIST_HEAD(&priv->list_field);
return 0;
}
@@ -376,7 +355,7 @@ static void chnl_net_uninit(struct net_device *dev)
struct chnl_net *priv;
ASSERT_RTNL();
priv = netdev_priv(dev);
- robust_list_del(&priv->list_field);
+ list_del_init(&priv->list_field);
}
static const struct net_device_ops netdev_ops = {
@@ -459,10 +438,11 @@ static void caif_netlink_parms(struct nlattr *data[],
}
}
-static int ipcaif_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
+static int ipcaif_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
+ struct nlattr **data = params->data;
int ret;
struct chnl_net *caifdev;
ASSERT_RTNL();
@@ -541,7 +521,7 @@ static void __exit chnl_exit_module(void)
rtnl_lock();
list_for_each_safe(list_node, _tmp, &chnl_net_list) {
dev = list_entry(list_node, struct chnl_net, list_field);
- list_del(list_node);
+ list_del_init(list_node);
delete_device(dev);
}
rtnl_unlock();
diff --git a/net/can/Kconfig b/net/can/Kconfig
index a4399be54ff4..e4ccf731a24c 100644
--- a/net/can/Kconfig
+++ b/net/can/Kconfig
@@ -1,27 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Controller Area Network (CAN) network layer core configuration
#
menuconfig CAN
- depends on NET
tristate "CAN bus subsystem support"
- ---help---
+ select CAN_DEV
+ help
Controller Area Network (CAN) is a slow (up to 1Mbit/s) serial
- communications protocol which was developed by Bosch in
- 1991, mainly for automotive, but now widely used in marine
- (NMEA2000), industrial, and medical applications.
- More information on the CAN network protocol family PF_CAN
- is contained in <Documentation/networking/can.rst>.
+ communications protocol. Development of the CAN bus started in
+ 1983 at Robert Bosch GmbH, and the protocol was officially
+ released in 1986. The CAN bus was originally mainly for automotive,
+ but is now widely used in marine (NMEA2000), industrial, and medical
+ applications. More information on the CAN network protocol family
+ PF_CAN is contained in <Documentation/networking/can.rst>.
If you want CAN support you should say Y here and also to the
- specific driver for your controller(s) below.
+ specific driver for your controller(s) under the Network device
+ support section.
if CAN
config CAN_RAW
tristate "Raw CAN Protocol (raw access with CAN-ID filtering)"
default y
- ---help---
+ help
The raw CAN protocol option offers access to the CAN bus via
the BSD socket API. You probably want to use the raw socket in
most cases where no higher level protocol is being used. The raw
@@ -31,7 +34,7 @@ config CAN_RAW
config CAN_BCM
tristate "Broadcast Manager CAN Protocol (with content filtering)"
default y
- ---help---
+ help
The Broadcast Manager offers content filtering, timeout monitoring,
sending of RTR frames, and cyclic CAN messages without permanent user
interaction. The BCM can be 'programmed' via the BSD socket API and
@@ -43,7 +46,7 @@ config CAN_BCM
config CAN_GW
tristate "CAN Gateway/Router (with netlink configuration)"
default y
- ---help---
+ help
The CAN Gateway/Router is used to route (and modify) CAN frames.
It is based on the PF_CAN core infrastructure for msg filtering and
msg sending and can optionally modify routed CAN frames on the fly.
@@ -51,6 +54,20 @@ config CAN_GW
They can be modified with AND/OR/XOR/SET operations as configured
by the netlink configuration interface known e.g. from iptables.
-source "drivers/net/can/Kconfig"
+source "net/can/j1939/Kconfig"
+
+config CAN_ISOTP
+ tristate "ISO 15765-2 CAN transport protocol"
+ help
+ CAN Transport Protocols offer support for segmented Point-to-Point
+ communication between CAN nodes via two defined CAN Identifiers.
+ This protocol driver implements segmented data transfers for CAN CC
+ (aka Classical CAN, CAN 2.0B) and CAN FD frame types which were
+ introduced with ISO 15765-2:2016.
+ As CAN frames can only transport a small amount of data bytes
+ (max. 8 bytes for CAN CC and max. 64 bytes for CAN FD) this
+ segmentation is needed to transport longer Protocol Data Units (PDU)
+ as needed e.g. for vehicle diagnosis (UDS, ISO 14229) or IP-over-CAN
+ traffic.
endif
diff --git a/net/can/Makefile b/net/can/Makefile
index 1242bbbfe57f..58f2c31c1ef3 100644
--- a/net/can/Makefile
+++ b/net/can/Makefile
@@ -15,3 +15,8 @@ can-bcm-y := bcm.o
obj-$(CONFIG_CAN_GW) += can-gw.o
can-gw-y := gw.o
+
+obj-$(CONFIG_CAN_J1939) += j1939/
+
+obj-$(CONFIG_CAN_ISOTP) += can-isotp.o
+can-isotp-y := isotp.o
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 1684ba5b51eb..770173d8db42 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -1,5 +1,5 @@
-/*
- * af_can.c - Protocol family CAN core module
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/* af_can.c - Protocol family CAN core module
* (used by different CAN protocol modules)
*
* Copyright (c) 2002-2017 Volkswagen Group Electronic Research
@@ -58,6 +58,7 @@
#include <linux/can.h>
#include <linux/can/core.h>
#include <linux/can/skb.h>
+#include <linux/can/can-ml.h>
#include <linux/ratelimit.h>
#include <net/net_namespace.h>
#include <net/sock.h>
@@ -83,29 +84,14 @@ static DEFINE_MUTEX(proto_tab_lock);
static atomic_t skbcounter = ATOMIC_INIT(0);
-/*
- * af_can socket functions
- */
+/* af_can socket functions */
-int can_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
-{
- struct sock *sk = sock->sk;
-
- switch (cmd) {
-
- case SIOCGSTAMP:
- return sock_get_timestamp(sk, (struct timeval __user *)arg);
-
- default:
- return -ENOIOCTLCMD;
- }
-}
-EXPORT_SYMBOL(can_ioctl);
-
-static void can_sock_destruct(struct sock *sk)
+void can_sock_destruct(struct sock *sk)
{
skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge(&sk->sk_error_queue);
}
+EXPORT_SYMBOL(can_sock_destruct);
static const struct can_proto *can_get_proto(int protocol)
{
@@ -145,14 +131,13 @@ static int can_create(struct net *net, struct socket *sock, int protocol,
err = request_module("can-proto-%d", protocol);
- /*
- * In case of error we only print a message but don't
+ /* In case of error we only print a message but don't
* return the error code immediately. Below we will
* return -EPROTONOSUPPORT
*/
if (err)
- printk_ratelimited(KERN_ERR "can: request_module "
- "(can-proto-%d) failed.\n", protocol);
+ pr_err_ratelimited("can: request_module (can-proto-%d) failed.\n",
+ protocol);
cp = can_get_proto(protocol);
}
@@ -186,6 +171,9 @@ static int can_create(struct net *net, struct socket *sock, int protocol,
/* release sk on errors */
sock_orphan(sk);
sock_put(sk);
+ sock->sk = NULL;
+ } else {
+ sock_prot_inuse_add(net, sk->sk_prot, 1);
}
errout:
@@ -193,9 +181,7 @@ static int can_create(struct net *net, struct socket *sock, int protocol,
return err;
}
-/*
- * af_can tx path
- */
+/* af_can tx path */
/**
* can_send - transmit a CAN frame (optional with local loopback)
@@ -216,27 +202,26 @@ static int can_create(struct net *net, struct socket *sock, int protocol,
int can_send(struct sk_buff *skb, int loop)
{
struct sk_buff *newskb = NULL;
- struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
- struct s_stats *can_stats = dev_net(skb->dev)->can.can_stats;
+ struct can_pkg_stats *pkg_stats = dev_net(skb->dev)->can.pkg_stats;
int err = -EINVAL;
- if (skb->len == CAN_MTU) {
+ if (can_is_canxl_skb(skb)) {
+ skb->protocol = htons(ETH_P_CANXL);
+ } else if (can_is_can_skb(skb)) {
skb->protocol = htons(ETH_P_CAN);
- if (unlikely(cfd->len > CAN_MAX_DLEN))
- goto inval_skb;
- } else if (skb->len == CANFD_MTU) {
+ } else if (can_is_canfd_skb(skb)) {
+ struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
+
skb->protocol = htons(ETH_P_CANFD);
- if (unlikely(cfd->len > CANFD_MAX_DLEN))
- goto inval_skb;
- } else
+
+ /* set CAN FD flag for CAN FD frames by default */
+ cfd->flags |= CANFD_FDF;
+ } else {
goto inval_skb;
+ }
- /*
- * Make sure the CAN frame can pass the selected CAN netdevice.
- * As structs can_frame and canfd_frame are similar, we can provide
- * CAN FD frames to legacy CAN drivers as long as the length is <= 8
- */
- if (unlikely(skb->len > skb->dev->mtu && cfd->len > CAN_MAX_DLEN)) {
+ /* Make sure the CAN frame can pass the selected CAN netdevice. */
+ if (unlikely(skb->len > READ_ONCE(skb->dev->mtu))) {
err = -EMSGSIZE;
goto inval_skb;
}
@@ -263,8 +248,7 @@ int can_send(struct sk_buff *skb, int loop)
/* indication for the CAN driver: do loopback */
skb->pkt_type = PACKET_LOOPBACK;
- /*
- * The reference to the originating sock may be required
+ /* The reference to the originating sock may be required
* by the receiving socket to check whether the frame is
* its own. Example: can_raw sockopt CAN_RAW_RECV_OWN_MSGS
* Therefore we have to ensure that skb->sk remains the
@@ -273,8 +257,7 @@ int can_send(struct sk_buff *skb, int loop)
*/
if (!(skb->dev->flags & IFF_ECHO)) {
- /*
- * If the interface is not capable to do loopback
+ /* If the interface is not capable to do loopback
* itself, we do it here.
*/
newskb = skb_clone(skb, GFP_ATOMIC);
@@ -303,11 +286,11 @@ int can_send(struct sk_buff *skb, int loop)
}
if (newskb)
- netif_rx_ni(newskb);
+ netif_rx(newskb);
/* update statistics */
- can_stats->tx_frames++;
- can_stats->tx_frames_delta++;
+ atomic_long_inc(&pkg_stats->tx_frames);
+ atomic_long_inc(&pkg_stats->tx_frames_delta);
return 0;
@@ -317,17 +300,17 @@ inval_skb:
}
EXPORT_SYMBOL(can_send);
-/*
- * af_can rx path
- */
+/* af_can rx path */
-static struct can_dev_rcv_lists *find_dev_rcv_lists(struct net *net,
- struct net_device *dev)
+static struct can_dev_rcv_lists *can_dev_rcv_lists_find(struct net *net,
+ struct net_device *dev)
{
- if (!dev)
- return net->can.can_rx_alldev_list;
- else
- return (struct can_dev_rcv_lists *)dev->ml_priv;
+ if (dev) {
+ struct can_ml_priv *can_ml = can_get_ml_priv(dev);
+ return &can_ml->dev_rcv_lists;
+ } else {
+ return net->can.rx_alldev_list;
+ }
}
/**
@@ -354,10 +337,10 @@ static unsigned int effhash(canid_t can_id)
}
/**
- * find_rcv_list - determine optimal filterlist inside device filter struct
+ * can_rcv_list_find - determine optimal filterlist inside device filter struct
* @can_id: pointer to CAN identifier of a given can_filter
* @mask: pointer to CAN mask of a given can_filter
- * @d: pointer to the device filter struct
+ * @dev_rcv_lists: pointer to the device filter struct
*
* Description:
* Returns the optimal filterlist to reduce the filter handling in the
@@ -377,11 +360,11 @@ static unsigned int effhash(canid_t can_id)
*
* Return:
* Pointer to optimal filterlist for the given can_id/mask pair.
- * Constistency checked mask.
+ * Consistency checked mask.
* Reduced can_id to have a preprocessed filter compare value.
*/
-static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
- struct can_dev_rcv_lists *d)
+static struct hlist_head *can_rcv_list_find(canid_t *can_id, canid_t *mask,
+ struct can_dev_rcv_lists *dev_rcv_lists)
{
canid_t inv = *can_id & CAN_INV_FILTER; /* save flag before masking */
@@ -389,7 +372,7 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
if (*mask & CAN_ERR_FLAG) {
/* clear CAN_ERR_FLAG in filter entry */
*mask &= CAN_ERR_MASK;
- return &d->rx[RX_ERR];
+ return &dev_rcv_lists->rx[RX_ERR];
}
/* with cleared CAN_ERR_FLAG we have a simple mask/value filterpair */
@@ -405,32 +388,32 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask,
/* inverse can_id/can_mask filter */
if (inv)
- return &d->rx[RX_INV];
+ return &dev_rcv_lists->rx[RX_INV];
/* mask == 0 => no condition testing at receive time */
if (!(*mask))
- return &d->rx[RX_ALL];
+ return &dev_rcv_lists->rx[RX_ALL];
/* extra filterlists for the subscription of a single non-RTR can_id */
if (((*mask & CAN_EFF_RTR_FLAGS) == CAN_EFF_RTR_FLAGS) &&
!(*can_id & CAN_RTR_FLAG)) {
-
if (*can_id & CAN_EFF_FLAG) {
if (*mask == (CAN_EFF_MASK | CAN_EFF_RTR_FLAGS))
- return &d->rx_eff[effhash(*can_id)];
+ return &dev_rcv_lists->rx_eff[effhash(*can_id)];
} else {
if (*mask == (CAN_SFF_MASK | CAN_EFF_RTR_FLAGS))
- return &d->rx_sff[*can_id];
+ return &dev_rcv_lists->rx_sff[*can_id];
}
}
/* default: filter via can_id/can_mask */
- return &d->rx[RX_FIL];
+ return &dev_rcv_lists->rx[RX_FIL];
}
/**
* can_rx_register - subscribe CAN frames from a specific interface
- * @dev: pointer to netdevice (NULL => subcribe from 'all' CAN devices list)
+ * @net: the applicable net namespace
+ * @dev: pointer to netdevice (NULL => subscribe from 'all' CAN devices list)
* @can_id: CAN identifier (see description)
* @mask: CAN mask (see description)
* @func: callback function on filter match
@@ -462,70 +445,62 @@ int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id,
canid_t mask, void (*func)(struct sk_buff *, void *),
void *data, char *ident, struct sock *sk)
{
- struct receiver *r;
- struct hlist_head *rl;
- struct can_dev_rcv_lists *d;
- struct s_pstats *can_pstats = net->can.can_pstats;
- int err = 0;
+ struct receiver *rcv;
+ struct hlist_head *rcv_list;
+ struct can_dev_rcv_lists *dev_rcv_lists;
+ struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
/* insert new receiver (dev,canid,mask) -> (func,data) */
- if (dev && dev->type != ARPHRD_CAN)
+ if (dev && (dev->type != ARPHRD_CAN || !can_get_ml_priv(dev)))
return -ENODEV;
if (dev && !net_eq(net, dev_net(dev)))
return -ENODEV;
- r = kmem_cache_alloc(rcv_cache, GFP_KERNEL);
- if (!r)
+ rcv = kmem_cache_alloc(rcv_cache, GFP_KERNEL);
+ if (!rcv)
return -ENOMEM;
- spin_lock(&net->can.can_rcvlists_lock);
+ spin_lock_bh(&net->can.rcvlists_lock);
- d = find_dev_rcv_lists(net, dev);
- if (d) {
- rl = find_rcv_list(&can_id, &mask, d);
+ dev_rcv_lists = can_dev_rcv_lists_find(net, dev);
+ rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists);
- r->can_id = can_id;
- r->mask = mask;
- r->matches = 0;
- r->func = func;
- r->data = data;
- r->ident = ident;
- r->sk = sk;
+ rcv->can_id = can_id;
+ rcv->mask = mask;
+ rcv->matches = 0;
+ rcv->func = func;
+ rcv->data = data;
+ rcv->ident = ident;
+ rcv->sk = sk;
- hlist_add_head_rcu(&r->list, rl);
- d->entries++;
+ hlist_add_head_rcu(&rcv->list, rcv_list);
+ dev_rcv_lists->entries++;
- can_pstats->rcv_entries++;
- if (can_pstats->rcv_entries_max < can_pstats->rcv_entries)
- can_pstats->rcv_entries_max = can_pstats->rcv_entries;
- } else {
- kmem_cache_free(rcv_cache, r);
- err = -ENODEV;
- }
-
- spin_unlock(&net->can.can_rcvlists_lock);
+ rcv_lists_stats->rcv_entries++;
+ rcv_lists_stats->rcv_entries_max = max(rcv_lists_stats->rcv_entries_max,
+ rcv_lists_stats->rcv_entries);
+ spin_unlock_bh(&net->can.rcvlists_lock);
- return err;
+ return 0;
}
EXPORT_SYMBOL(can_rx_register);
-/*
- * can_rx_delete_receiver - rcu callback for single receiver entry removal
- */
+/* can_rx_delete_receiver - rcu callback for single receiver entry removal */
static void can_rx_delete_receiver(struct rcu_head *rp)
{
- struct receiver *r = container_of(rp, struct receiver, rcu);
- struct sock *sk = r->sk;
+ struct receiver *rcv = container_of(rp, struct receiver, rcu);
+ struct sock *sk = rcv->sk;
- kmem_cache_free(rcv_cache, r);
+ kmem_cache_free(rcv_cache, rcv);
if (sk)
sock_put(sk);
}
/**
* can_rx_unregister - unsubscribe CAN frames from a specific interface
+ * @net: the applicable net namespace
* @dev: pointer to netdevice (NULL => unsubscribe from 'all' CAN devices list)
* @can_id: CAN identifier
* @mask: CAN mask
@@ -539,10 +514,10 @@ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id,
canid_t mask, void (*func)(struct sk_buff *, void *),
void *data)
{
- struct receiver *r = NULL;
- struct hlist_head *rl;
- struct s_pstats *can_pstats = net->can.can_pstats;
- struct can_dev_rcv_lists *d;
+ struct receiver *rcv = NULL;
+ struct hlist_head *rcv_list;
+ struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
+ struct can_dev_rcv_lists *dev_rcv_lists;
if (dev && dev->type != ARPHRD_CAN)
return;
@@ -550,86 +525,72 @@ void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id,
if (dev && !net_eq(net, dev_net(dev)))
return;
- spin_lock(&net->can.can_rcvlists_lock);
+ spin_lock_bh(&net->can.rcvlists_lock);
- d = find_dev_rcv_lists(net, dev);
- if (!d) {
- pr_err("BUG: receive list not found for "
- "dev %s, id %03X, mask %03X\n",
- DNAME(dev), can_id, mask);
- goto out;
- }
+ dev_rcv_lists = can_dev_rcv_lists_find(net, dev);
+ rcv_list = can_rcv_list_find(&can_id, &mask, dev_rcv_lists);
- rl = find_rcv_list(&can_id, &mask, d);
-
- /*
- * Search the receiver list for the item to delete. This should
+ /* Search the receiver list for the item to delete. This should
* exist, since no receiver may be unregistered that hasn't
* been registered before.
*/
-
- hlist_for_each_entry_rcu(r, rl, list) {
- if (r->can_id == can_id && r->mask == mask &&
- r->func == func && r->data == data)
+ hlist_for_each_entry_rcu(rcv, rcv_list, list) {
+ if (rcv->can_id == can_id && rcv->mask == mask &&
+ rcv->func == func && rcv->data == data)
break;
}
- /*
- * Check for bugs in CAN protocol implementations using af_can.c:
- * 'r' will be NULL if no matching list item was found for removal.
+ /* Check for bugs in CAN protocol implementations using af_can.c:
+ * 'rcv' will be NULL if no matching list item was found for removal.
+ * As this case may potentially happen when closing a socket while
+ * the notifier for removing the CAN netdev is running we just print
+ * a warning here.
*/
-
- if (!r) {
- WARN(1, "BUG: receive list entry not found for dev %s, "
- "id %03X, mask %03X\n", DNAME(dev), can_id, mask);
+ if (!rcv) {
+ pr_warn("can: receive list entry not found for dev %s, id %03X, mask %03X\n",
+ DNAME(dev), can_id, mask);
goto out;
}
- hlist_del_rcu(&r->list);
- d->entries--;
-
- if (can_pstats->rcv_entries > 0)
- can_pstats->rcv_entries--;
+ hlist_del_rcu(&rcv->list);
+ dev_rcv_lists->entries--;
- /* remove device structure requested by NETDEV_UNREGISTER */
- if (d->remove_on_zero_entries && !d->entries) {
- kfree(d);
- dev->ml_priv = NULL;
- }
+ if (rcv_lists_stats->rcv_entries > 0)
+ rcv_lists_stats->rcv_entries--;
out:
- spin_unlock(&net->can.can_rcvlists_lock);
+ spin_unlock_bh(&net->can.rcvlists_lock);
/* schedule the receiver item for deletion */
- if (r) {
- if (r->sk)
- sock_hold(r->sk);
- call_rcu(&r->rcu, can_rx_delete_receiver);
+ if (rcv) {
+ if (rcv->sk)
+ sock_hold(rcv->sk);
+ call_rcu(&rcv->rcu, can_rx_delete_receiver);
}
}
EXPORT_SYMBOL(can_rx_unregister);
-static inline void deliver(struct sk_buff *skb, struct receiver *r)
+static inline void deliver(struct sk_buff *skb, struct receiver *rcv)
{
- r->func(skb, r->data);
- r->matches++;
+ rcv->func(skb, rcv->data);
+ rcv->matches++;
}
-static int can_rcv_filter(struct can_dev_rcv_lists *d, struct sk_buff *skb)
+static int can_rcv_filter(struct can_dev_rcv_lists *dev_rcv_lists, struct sk_buff *skb)
{
- struct receiver *r;
+ struct receiver *rcv;
int matches = 0;
struct can_frame *cf = (struct can_frame *)skb->data;
canid_t can_id = cf->can_id;
- if (d->entries == 0)
+ if (dev_rcv_lists->entries == 0)
return 0;
if (can_id & CAN_ERR_FLAG) {
/* check for error message frame entries only */
- hlist_for_each_entry_rcu(r, &d->rx[RX_ERR], list) {
- if (can_id & r->mask) {
- deliver(skb, r);
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ERR], list) {
+ if (can_id & rcv->mask) {
+ deliver(skb, rcv);
matches++;
}
}
@@ -637,23 +598,23 @@ static int can_rcv_filter(struct can_dev_rcv_lists *d, struct sk_buff *skb)
}
/* check for unfiltered entries */
- hlist_for_each_entry_rcu(r, &d->rx[RX_ALL], list) {
- deliver(skb, r);
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_ALL], list) {
+ deliver(skb, rcv);
matches++;
}
/* check for can_id/mask entries */
- hlist_for_each_entry_rcu(r, &d->rx[RX_FIL], list) {
- if ((can_id & r->mask) == r->can_id) {
- deliver(skb, r);
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_FIL], list) {
+ if ((can_id & rcv->mask) == rcv->can_id) {
+ deliver(skb, rcv);
matches++;
}
}
/* check for inverted can_id/mask entries */
- hlist_for_each_entry_rcu(r, &d->rx[RX_INV], list) {
- if ((can_id & r->mask) != r->can_id) {
- deliver(skb, r);
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx[RX_INV], list) {
+ if ((can_id & rcv->mask) != rcv->can_id) {
+ deliver(skb, rcv);
matches++;
}
}
@@ -663,16 +624,16 @@ static int can_rcv_filter(struct can_dev_rcv_lists *d, struct sk_buff *skb)
return matches;
if (can_id & CAN_EFF_FLAG) {
- hlist_for_each_entry_rcu(r, &d->rx_eff[effhash(can_id)], list) {
- if (r->can_id == can_id) {
- deliver(skb, r);
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_eff[effhash(can_id)], list) {
+ if (rcv->can_id == can_id) {
+ deliver(skb, rcv);
matches++;
}
}
} else {
can_id &= CAN_SFF_MASK;
- hlist_for_each_entry_rcu(r, &d->rx_sff[can_id], list) {
- deliver(skb, r);
+ hlist_for_each_entry_rcu(rcv, &dev_rcv_lists->rx_sff[can_id], list) {
+ deliver(skb, rcv);
matches++;
}
}
@@ -682,14 +643,14 @@ static int can_rcv_filter(struct can_dev_rcv_lists *d, struct sk_buff *skb)
static void can_receive(struct sk_buff *skb, struct net_device *dev)
{
- struct can_dev_rcv_lists *d;
+ struct can_dev_rcv_lists *dev_rcv_lists;
struct net *net = dev_net(dev);
- struct s_stats *can_stats = net->can.can_stats;
+ struct can_pkg_stats *pkg_stats = net->can.pkg_stats;
int matches;
/* update statistics */
- can_stats->rx_frames++;
- can_stats->rx_frames_delta++;
+ atomic_long_inc(&pkg_stats->rx_frames);
+ atomic_long_inc(&pkg_stats->rx_frames_delta);
/* create non-zero unique skb identifier together with *skb */
while (!(can_skb_prv(skb)->skbcnt))
@@ -698,12 +659,11 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev)
rcu_read_lock();
/* deliver the packet to sockets listening on all devices */
- matches = can_rcv_filter(net->can.can_rx_alldev_list, skb);
+ matches = can_rcv_filter(net->can.rx_alldev_list, skb);
/* find receive list for this device */
- d = find_dev_rcv_lists(net, dev);
- if (d)
- matches += can_rcv_filter(d, skb);
+ dev_rcv_lists = can_dev_rcv_lists_find(net, dev);
+ matches += can_rcv_filter(dev_rcv_lists, skb);
rcu_read_unlock();
@@ -711,21 +671,19 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev)
consume_skb(skb);
if (matches > 0) {
- can_stats->matches++;
- can_stats->matches_delta++;
+ atomic_long_inc(&pkg_stats->matches);
+ atomic_long_inc(&pkg_stats->matches_delta);
}
}
static int can_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
- struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
+ if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_can_skb(skb))) {
+ pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n",
+ dev->type, skb->len);
- if (unlikely(dev->type != ARPHRD_CAN || skb->len != CAN_MTU ||
- cfd->len > CAN_MAX_DLEN)) {
- pr_warn_once("PF_CAN: dropped non conform CAN skbuf: dev type %d, len %d, datalen %d\n",
- dev->type, skb->len, cfd->len);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_CAN_RX_INVALID_FRAME);
return NET_RX_DROP;
}
@@ -734,15 +692,13 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev,
}
static int canfd_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt, struct net_device *orig_dev)
+ struct packet_type *pt, struct net_device *orig_dev)
{
- struct canfd_frame *cfd = (struct canfd_frame *)skb->data;
+ if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canfd_skb(skb))) {
+ pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n",
+ dev->type, skb->len);
- if (unlikely(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU ||
- cfd->len > CANFD_MAX_DLEN)) {
- pr_warn_once("PF_CAN: dropped non conform CAN FD skbuf: dev type %d, len %d, datalen %d\n",
- dev->type, skb->len, cfd->len);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_CANFD_RX_INVALID_FRAME);
return NET_RX_DROP;
}
@@ -750,9 +706,22 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev,
return NET_RX_SUCCESS;
}
-/*
- * af_can protocol functions
- */
+static int canxl_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canxl_skb(skb))) {
+ pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n",
+ dev->type, skb->len);
+
+ kfree_skb_reason(skb, SKB_DROP_REASON_CANXL_RX_INVALID_FRAME);
+ return NET_RX_DROP;
+ }
+
+ can_receive(skb, dev);
+ return NET_RX_SUCCESS;
+}
+
+/* af_can protocol functions */
/**
* can_proto_register - register CAN transport protocol
@@ -783,8 +752,9 @@ int can_proto_register(const struct can_proto *cp)
if (rcu_access_pointer(proto_tab[proto])) {
pr_err("can: protocol %d already registered\n", proto);
err = -EBUSY;
- } else
+ } else {
RCU_INIT_POINTER(proto_tab[proto], cp);
+ }
mutex_unlock(&proto_tab_lock);
@@ -814,121 +784,56 @@ void can_proto_unregister(const struct can_proto *cp)
}
EXPORT_SYMBOL(can_proto_unregister);
-/*
- * af_can notifier to create/remove CAN netdevice specific structs
- */
-static int can_notifier(struct notifier_block *nb, unsigned long msg,
- void *ptr)
-{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct can_dev_rcv_lists *d;
-
- if (dev->type != ARPHRD_CAN)
- return NOTIFY_DONE;
-
- switch (msg) {
-
- case NETDEV_REGISTER:
-
- /* create new dev_rcv_lists for this device */
- d = kzalloc(sizeof(*d), GFP_KERNEL);
- if (!d)
- return NOTIFY_DONE;
- BUG_ON(dev->ml_priv);
- dev->ml_priv = d;
-
- break;
-
- case NETDEV_UNREGISTER:
- spin_lock(&dev_net(dev)->can.can_rcvlists_lock);
-
- d = dev->ml_priv;
- if (d) {
- if (d->entries)
- d->remove_on_zero_entries = 1;
- else {
- kfree(d);
- dev->ml_priv = NULL;
- }
- } else
- pr_err("can: notifier: receive list not found for dev "
- "%s\n", dev->name);
-
- spin_unlock(&dev_net(dev)->can.can_rcvlists_lock);
-
- break;
- }
-
- return NOTIFY_DONE;
-}
-
static int can_pernet_init(struct net *net)
{
- spin_lock_init(&net->can.can_rcvlists_lock);
- net->can.can_rx_alldev_list =
- kzalloc(sizeof(struct can_dev_rcv_lists), GFP_KERNEL);
- if (!net->can.can_rx_alldev_list)
+ spin_lock_init(&net->can.rcvlists_lock);
+ net->can.rx_alldev_list =
+ kzalloc(sizeof(*net->can.rx_alldev_list), GFP_KERNEL);
+ if (!net->can.rx_alldev_list)
goto out;
- net->can.can_stats = kzalloc(sizeof(struct s_stats), GFP_KERNEL);
- if (!net->can.can_stats)
- goto out_free_alldev_list;
- net->can.can_pstats = kzalloc(sizeof(struct s_pstats), GFP_KERNEL);
- if (!net->can.can_pstats)
- goto out_free_can_stats;
+ net->can.pkg_stats = kzalloc(sizeof(*net->can.pkg_stats), GFP_KERNEL);
+ if (!net->can.pkg_stats)
+ goto out_free_rx_alldev_list;
+ net->can.rcv_lists_stats = kzalloc(sizeof(*net->can.rcv_lists_stats), GFP_KERNEL);
+ if (!net->can.rcv_lists_stats)
+ goto out_free_pkg_stats;
if (IS_ENABLED(CONFIG_PROC_FS)) {
/* the statistics are updated every second (timer triggered) */
if (stats_timer) {
- timer_setup(&net->can.can_stattimer, can_stat_update,
+ timer_setup(&net->can.stattimer, can_stat_update,
0);
- mod_timer(&net->can.can_stattimer,
+ mod_timer(&net->can.stattimer,
round_jiffies(jiffies + HZ));
}
- net->can.can_stats->jiffies_init = jiffies;
+ net->can.pkg_stats->jiffies_init = jiffies;
can_init_proc(net);
}
return 0;
- out_free_can_stats:
- kfree(net->can.can_stats);
- out_free_alldev_list:
- kfree(net->can.can_rx_alldev_list);
+ out_free_pkg_stats:
+ kfree(net->can.pkg_stats);
+ out_free_rx_alldev_list:
+ kfree(net->can.rx_alldev_list);
out:
return -ENOMEM;
}
static void can_pernet_exit(struct net *net)
{
- struct net_device *dev;
-
if (IS_ENABLED(CONFIG_PROC_FS)) {
can_remove_proc(net);
if (stats_timer)
- del_timer_sync(&net->can.can_stattimer);
- }
-
- /* remove created dev_rcv_lists from still registered CAN devices */
- rcu_read_lock();
- for_each_netdev_rcu(net, dev) {
- if (dev->type == ARPHRD_CAN && dev->ml_priv) {
- struct can_dev_rcv_lists *d = dev->ml_priv;
-
- BUG_ON(d->entries);
- kfree(d);
- dev->ml_priv = NULL;
- }
+ timer_delete_sync(&net->can.stattimer);
}
- rcu_read_unlock();
- kfree(net->can.can_rx_alldev_list);
- kfree(net->can.can_stats);
- kfree(net->can.can_pstats);
+ kfree(net->can.rx_alldev_list);
+ kfree(net->can.pkg_stats);
+ kfree(net->can.rcv_lists_stats);
}
-/*
- * af_can module init/exit functions
- */
+/* af_can module init/exit functions */
static struct packet_type can_packet __read_mostly = {
.type = cpu_to_be16(ETH_P_CAN),
@@ -940,17 +845,17 @@ static struct packet_type canfd_packet __read_mostly = {
.func = canfd_rcv,
};
+static struct packet_type canxl_packet __read_mostly = {
+ .type = cpu_to_be16(ETH_P_CANXL),
+ .func = canxl_rcv,
+};
+
static const struct net_proto_family can_family_ops = {
.family = PF_CAN,
.create = can_create,
.owner = THIS_MODULE,
};
-/* notifier block for netdevice event */
-static struct notifier_block can_netdev_notifier __read_mostly = {
- .notifier_call = can_notifier,
-};
-
static struct pernet_operations can_pernet_ops __read_mostly = {
.init = can_pernet_init,
.exit = can_pernet_exit,
@@ -958,36 +863,52 @@ static struct pernet_operations can_pernet_ops __read_mostly = {
static __init int can_init(void)
{
+ int err;
+
/* check for correct padding to be able to use the structs similarly */
- BUILD_BUG_ON(offsetof(struct can_frame, can_dlc) !=
+ BUILD_BUG_ON(offsetof(struct can_frame, len) !=
offsetof(struct canfd_frame, len) ||
+ offsetof(struct can_frame, len) !=
+ offsetof(struct canxl_frame, flags) ||
offsetof(struct can_frame, data) !=
offsetof(struct canfd_frame, data));
- pr_info("can: controller area network core (" CAN_VERSION_STRING ")\n");
+ pr_info("can: controller area network core\n");
rcv_cache = kmem_cache_create("can_receiver", sizeof(struct receiver),
0, 0, NULL);
if (!rcv_cache)
return -ENOMEM;
- register_pernet_subsys(&can_pernet_ops);
+ err = register_pernet_subsys(&can_pernet_ops);
+ if (err)
+ goto out_pernet;
/* protocol register */
- sock_register(&can_family_ops);
- register_netdevice_notifier(&can_netdev_notifier);
+ err = sock_register(&can_family_ops);
+ if (err)
+ goto out_sock;
+
dev_add_pack(&can_packet);
dev_add_pack(&canfd_packet);
+ dev_add_pack(&canxl_packet);
return 0;
+
+out_sock:
+ unregister_pernet_subsys(&can_pernet_ops);
+out_pernet:
+ kmem_cache_destroy(rcv_cache);
+
+ return err;
}
static __exit void can_exit(void)
{
/* protocol unregister */
+ dev_remove_pack(&canxl_packet);
dev_remove_pack(&canfd_packet);
dev_remove_pack(&can_packet);
- unregister_netdevice_notifier(&can_netdev_notifier);
sock_unregister(PF_CAN);
unregister_pernet_subsys(&can_pernet_ops);
diff --git a/net/can/af_can.h b/net/can/af_can.h
index 9cb3719632bd..22f3352c77fe 100644
--- a/net/can/af_can.h
+++ b/net/can/af_can.h
@@ -1,5 +1,5 @@
-/*
- * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */
+/* Copyright (c) 2002-2007 Volkswagen Group Electronic Research
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -53,37 +53,22 @@ struct receiver {
canid_t can_id;
canid_t mask;
unsigned long matches;
- void (*func)(struct sk_buff *, void *);
+ void (*func)(struct sk_buff *skb, void *data);
void *data;
char *ident;
struct sock *sk;
struct rcu_head rcu;
};
-#define CAN_SFF_RCV_ARRAY_SZ (1 << CAN_SFF_ID_BITS)
-#define CAN_EFF_RCV_HASH_BITS 10
-#define CAN_EFF_RCV_ARRAY_SZ (1 << CAN_EFF_RCV_HASH_BITS)
-
-enum { RX_ERR, RX_ALL, RX_FIL, RX_INV, RX_MAX };
-
-/* per device receive filters linked at dev->ml_priv */
-struct can_dev_rcv_lists {
- struct hlist_head rx[RX_MAX];
- struct hlist_head rx_sff[CAN_SFF_RCV_ARRAY_SZ];
- struct hlist_head rx_eff[CAN_EFF_RCV_ARRAY_SZ];
- int remove_on_zero_entries;
- int entries;
-};
-
/* statistic structures */
/* can be reset e.g. by can_init_stats() */
-struct s_stats {
+struct can_pkg_stats {
unsigned long jiffies_init;
- unsigned long rx_frames;
- unsigned long tx_frames;
- unsigned long matches;
+ atomic_long_t rx_frames;
+ atomic_long_t tx_frames;
+ atomic_long_t matches;
unsigned long total_rx_rate;
unsigned long total_tx_rate;
@@ -97,13 +82,13 @@ struct s_stats {
unsigned long max_tx_rate;
unsigned long max_rx_match_ratio;
- unsigned long rx_frames_delta;
- unsigned long tx_frames_delta;
- unsigned long matches_delta;
+ atomic_long_t rx_frames_delta;
+ atomic_long_t tx_frames_delta;
+ atomic_long_t matches_delta;
};
/* persistent statistics */
-struct s_pstats {
+struct can_rcv_lists_stats {
unsigned long stats_reset;
unsigned long user_reset;
unsigned long rcv_entries;
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 0af8f0db892a..7eba8ae01a5b 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
* bcm.c - Broadcast Manager to filter/send (cyclic) CAN content
*
@@ -57,6 +58,7 @@
#include <linux/can/skb.h>
#include <linux/can/bcm.h>
#include <linux/slab.h>
+#include <linux/spinlock.h>
#include <net/sock.h>
#include <net/net_namespace.h>
@@ -67,23 +69,28 @@
*/
#define MAX_NFRAMES 256
+/* limit timers to 400 days for sending/timeouts */
+#define BCM_TIMER_SEC_MAX (400 * 24 * 60 * 60)
+
/* use of last_frames[index].flags */
+#define RX_LOCAL 0x10 /* frame was created on the local host */
+#define RX_OWN 0x20 /* frame was sent via the socket it was received on */
#define RX_RECV 0x40 /* received data for this element */
#define RX_THR 0x80 /* element not been sent due to throttle feature */
-#define BCM_CAN_FLAGS_MASK 0x3F /* to clean private flags after usage */
+#define BCM_CAN_FLAGS_MASK 0x0F /* to clean private flags after usage */
/* get best masking value for can_rx_register() for a given single can_id */
#define REGMASK(id) ((id & CAN_EFF_FLAG) ? \
(CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \
(CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG))
-#define CAN_BCM_VERSION "20170425"
-
MODULE_DESCRIPTION("PF_CAN broadcast manager protocol");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>");
MODULE_ALIAS("can-proto-2");
+#define BCM_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_ifindex)
+
/*
* easy access to the first 64 bit of can(fd)_frame payload. cp->data is
* 64 bit aligned so the offset has to be multiples of 8 which is ensured
@@ -96,13 +103,13 @@ static inline u64 get_u64(const struct canfd_frame *cp, int offset)
struct bcm_op {
struct list_head list;
+ struct rcu_head rcu;
int ifindex;
canid_t can_id;
u32 flags;
unsigned long frames_abs, frames_filtered;
struct bcm_timeval ival1, ival2;
struct hrtimer timer, thrtimer;
- struct tasklet_struct tsklet, thrtsklet;
ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg;
int rx_ifindex;
int cfsiz;
@@ -116,13 +123,14 @@ struct bcm_op {
struct canfd_frame last_sframe;
struct sock *sk;
struct net_device *rx_reg_dev;
+ spinlock_t bcm_tx_lock; /* protect currframe/count in runtime updates */
};
struct bcm_sock {
struct sock sk;
int bound;
int ifindex;
- struct notifier_block notifier;
+ struct list_head notifier;
struct list_head rx_ops;
struct list_head tx_ops;
unsigned long dropped_usr_msgs;
@@ -130,6 +138,20 @@ struct bcm_sock {
char procname [32]; /* inode number in decimal with \0 */
};
+static LIST_HEAD(bcm_notifier_list);
+static DEFINE_SPINLOCK(bcm_notifier_lock);
+static struct bcm_sock *bcm_busy_notifier;
+
+/* Return pointer to store the extra msg flags for bcm_recvmsg().
+ * We use the space of one unsigned int beyond the 'struct sockaddr_can'
+ * in skb->cb.
+ */
+static inline unsigned int *bcm_flags(struct sk_buff *skb)
+{
+ /* return pointer after struct sockaddr_can */
+ return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]);
+}
+
static inline struct bcm_sock *bcm_sk(const struct sock *sk)
{
return (struct bcm_sock *)sk;
@@ -140,6 +162,22 @@ static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv)
return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC);
}
+/* check limitations for timeval provided by user */
+static bool bcm_is_invalid_tv(struct bcm_msg_head *msg_head)
+{
+ if ((msg_head->ival1.tv_sec < 0) ||
+ (msg_head->ival1.tv_sec > BCM_TIMER_SEC_MAX) ||
+ (msg_head->ival1.tv_usec < 0) ||
+ (msg_head->ival1.tv_usec >= USEC_PER_SEC) ||
+ (msg_head->ival2.tv_sec < 0) ||
+ (msg_head->ival2.tv_sec > BCM_TIMER_SEC_MAX) ||
+ (msg_head->ival2.tv_usec < 0) ||
+ (msg_head->ival2.tv_usec >= USEC_PER_SEC))
+ return true;
+
+ return false;
+}
+
#define CFSIZ(flags) ((flags & CAN_FD_FRAME) ? CANFD_MTU : CAN_MTU)
#define OPSIZ sizeof(struct bcm_op)
#define MHSIZ sizeof(struct bcm_msg_head)
@@ -170,7 +208,7 @@ static int bcm_proc_show(struct seq_file *m, void *v)
{
char ifname[IFNAMSIZ];
struct net *net = m->private;
- struct sock *sk = (struct sock *)PDE_DATA(m->file->f_inode);
+ struct sock *sk = (struct sock *)pde_data(m->file->f_inode);
struct bcm_sock *bo = bcm_sk(sk);
struct bcm_op *op;
@@ -181,7 +219,9 @@ static int bcm_proc_show(struct seq_file *m, void *v)
seq_printf(m, " / bound %s", bcm_proc_getifname(net, ifname, bo->ifindex));
seq_printf(m, " <<<\n");
- list_for_each_entry(op, &bo->rx_ops, list) {
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(op, &bo->rx_ops, list) {
unsigned long reduction;
@@ -237,6 +277,9 @@ static int bcm_proc_show(struct seq_file *m, void *v)
seq_printf(m, "# sent %ld\n", op->frames_abs);
}
seq_putc(m, '\n');
+
+ rcu_read_unlock();
+
return 0;
}
#endif /* CONFIG_PROC_FS */
@@ -249,12 +292,18 @@ static void bcm_can_tx(struct bcm_op *op)
{
struct sk_buff *skb;
struct net_device *dev;
- struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe;
+ struct canfd_frame *cf;
+ int err;
/* no target device? => exit */
if (!op->ifindex)
return;
+ /* read currframe under lock protection */
+ spin_lock_bh(&op->bcm_tx_lock);
+ cf = op->frames + op->cfsiz * op->currframe;
+ spin_unlock_bh(&op->bcm_tx_lock);
+
dev = dev_get_by_index(sock_net(op->sk), op->ifindex);
if (!dev) {
/* RFC: should this bcm_op remove itself here? */
@@ -274,15 +323,24 @@ static void bcm_can_tx(struct bcm_op *op)
/* send with loopback */
skb->dev = dev;
can_skb_set_owner(skb, op->sk);
- can_send(skb, 1);
+ err = can_send(skb, 1);
+
+ /* update currframe and count under lock protection */
+ spin_lock_bh(&op->bcm_tx_lock);
+
+ if (!err)
+ op->frames_abs++;
- /* update statistics */
op->currframe++;
- op->frames_abs++;
/* reached last frame? */
if (op->currframe >= op->nframes)
op->currframe = 0;
+
+ if (op->count > 0)
+ op->count--;
+
+ spin_unlock_bh(&op->bcm_tx_lock);
out:
dev_put(dev);
}
@@ -300,6 +358,8 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
struct sock *sk = op->sk;
unsigned int datalen = head->nframes * op->cfsiz;
int err;
+ unsigned int *pflags;
+ enum skb_drop_reason reason;
skb = alloc_skb(sizeof(*head) + datalen, gfp_any());
if (!skb)
@@ -307,6 +367,14 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
skb_put_data(skb, head, sizeof(*head));
+ /* ensure space for sockaddr_can and msg flags */
+ sock_skb_cb_check_size(sizeof(struct sockaddr_can) +
+ sizeof(unsigned int));
+
+ /* initialize msg flags */
+ pflags = bcm_flags(skb);
+ *pflags = 0;
+
if (head->nframes) {
/* CAN frames starting here */
firstframe = (struct canfd_frame *)skb_tail_pointer(skb);
@@ -319,8 +387,14 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
* relevant for updates that are generated by the
* BCM, where nframes is 1
*/
- if (head->nframes == 1)
+ if (head->nframes == 1) {
+ if (firstframe->flags & RX_LOCAL)
+ *pflags |= MSG_DONTROUTE;
+ if (firstframe->flags & RX_OWN)
+ *pflags |= MSG_CONFIRM;
+
firstframe->flags &= BCM_CAN_FLAGS_MASK;
+ }
}
if (has_timestamp) {
@@ -335,45 +409,54 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head,
* containing the interface index.
*/
- sock_skb_cb_check_size(sizeof(struct sockaddr_can));
addr = (struct sockaddr_can *)skb->cb;
memset(addr, 0, sizeof(*addr));
addr->can_family = AF_CAN;
addr->can_ifindex = op->rx_ifindex;
- err = sock_queue_rcv_skb(sk, skb);
+ err = sock_queue_rcv_skb_reason(sk, skb, &reason);
if (err < 0) {
struct bcm_sock *bo = bcm_sk(sk);
- kfree_skb(skb);
+ sk_skb_reason_drop(sk, skb, reason);
/* don't care about overflows in this statistic */
bo->dropped_usr_msgs++;
}
}
-static void bcm_tx_start_timer(struct bcm_op *op)
+static bool bcm_tx_set_expiry(struct bcm_op *op, struct hrtimer *hrt)
{
+ ktime_t ival;
+
if (op->kt_ival1 && op->count)
- hrtimer_start(&op->timer,
- ktime_add(ktime_get(), op->kt_ival1),
- HRTIMER_MODE_ABS);
+ ival = op->kt_ival1;
else if (op->kt_ival2)
- hrtimer_start(&op->timer,
- ktime_add(ktime_get(), op->kt_ival2),
- HRTIMER_MODE_ABS);
+ ival = op->kt_ival2;
+ else
+ return false;
+
+ hrtimer_set_expires(hrt, ktime_add(ktime_get(), ival));
+ return true;
}
-static void bcm_tx_timeout_tsklet(unsigned long data)
+static void bcm_tx_start_timer(struct bcm_op *op)
{
- struct bcm_op *op = (struct bcm_op *)data;
+ if (bcm_tx_set_expiry(op, &op->timer))
+ hrtimer_start_expires(&op->timer, HRTIMER_MODE_ABS_SOFT);
+}
+
+/* bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */
+static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
+{
+ struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
struct bcm_msg_head msg_head;
if (op->kt_ival1 && (op->count > 0)) {
-
- op->count--;
+ bcm_can_tx(op);
if (!op->count && (op->flags & TX_COUNTEVT)) {
/* create notification to user */
+ memset(&msg_head, 0, sizeof(msg_head));
msg_head.opcode = TX_EXPIRED;
msg_head.flags = op->flags;
msg_head.count = op->count;
@@ -384,24 +467,13 @@ static void bcm_tx_timeout_tsklet(unsigned long data)
bcm_send_to_user(op, &msg_head, NULL, 0);
}
- bcm_can_tx(op);
- } else if (op->kt_ival2)
+ } else if (op->kt_ival2) {
bcm_can_tx(op);
+ }
- bcm_tx_start_timer(op);
-}
-
-/*
- * bcm_tx_timeout_handler - performs cyclic CAN frame transmissions
- */
-static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer)
-{
- struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
-
- tasklet_schedule(&op->tsklet);
-
- return HRTIMER_NORESTART;
+ return bcm_tx_set_expiry(op, &op->timer) ?
+ HRTIMER_RESTART : HRTIMER_NORESTART;
}
/*
@@ -419,8 +491,9 @@ static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data)
op->frames_filtered = op->frames_abs = 0;
/* this element is not throttled anymore */
- data->flags &= (BCM_CAN_FLAGS_MASK|RX_RECV);
+ data->flags &= ~RX_THR;
+ memset(&head, 0, sizeof(head));
head.opcode = RX_CHANGED;
head.flags = op->flags;
head.count = op->count;
@@ -439,13 +512,17 @@ static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data)
*/
static void bcm_rx_update_and_send(struct bcm_op *op,
struct canfd_frame *lastdata,
- const struct canfd_frame *rxdata)
+ const struct canfd_frame *rxdata,
+ unsigned char traffic_flags)
{
memcpy(lastdata, rxdata, op->cfsiz);
/* mark as used and throttled by default */
lastdata->flags |= (RX_RECV|RX_THR);
+ /* add own/local/remote traffic flags */
+ lastdata->flags |= traffic_flags;
+
/* throttling mode inactive ? */
if (!op->kt_ival2) {
/* send RX_CHANGED to the user immediately */
@@ -467,7 +544,7 @@ static void bcm_rx_update_and_send(struct bcm_op *op,
/* do not send the saved data - only start throttle timer */
hrtimer_start(&op->thrtimer,
ktime_add(op->kt_lastmsg, op->kt_ival2),
- HRTIMER_MODE_ABS);
+ HRTIMER_MODE_ABS_SOFT);
return;
}
@@ -482,7 +559,8 @@ rx_changed_settime:
* received data stored in op->last_frames[]
*/
static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index,
- const struct canfd_frame *rxdata)
+ const struct canfd_frame *rxdata,
+ unsigned char traffic_flags)
{
struct canfd_frame *cf = op->frames + op->cfsiz * index;
struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
@@ -495,7 +573,7 @@ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index,
if (!(lcf->flags & RX_RECV)) {
/* received data for the first time => send update to user */
- bcm_rx_update_and_send(op, lcf, rxdata);
+ bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags);
return;
}
@@ -503,7 +581,7 @@ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index,
for (i = 0; i < rxdata->len; i += 8) {
if ((get_u64(cf, i) & get_u64(rxdata, i)) !=
(get_u64(cf, i) & get_u64(lcf, i))) {
- bcm_rx_update_and_send(op, lcf, rxdata);
+ bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags);
return;
}
}
@@ -511,7 +589,7 @@ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index,
if (op->flags & RX_CHECK_DLC) {
/* do a real check in CAN frame length */
if (rxdata->len != lcf->len) {
- bcm_rx_update_and_send(op, lcf, rxdata);
+ bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags);
return;
}
}
@@ -526,15 +604,23 @@ static void bcm_rx_starttimer(struct bcm_op *op)
return;
if (op->kt_ival1)
- hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL);
+ hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL_SOFT);
}
-static void bcm_rx_timeout_tsklet(unsigned long data)
+/* bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */
+static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
{
- struct bcm_op *op = (struct bcm_op *)data;
+ struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
struct bcm_msg_head msg_head;
+ /* if user wants to be informed, when cyclic CAN-Messages come back */
+ if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
+ /* clear received CAN frames to indicate 'nothing received' */
+ memset(op->last_frames, 0, op->nframes * op->cfsiz);
+ }
+
/* create notification to user */
+ memset(&msg_head, 0, sizeof(msg_head));
msg_head.opcode = RX_TIMEOUT;
msg_head.flags = op->flags;
msg_head.count = op->count;
@@ -544,25 +630,6 @@ static void bcm_rx_timeout_tsklet(unsigned long data)
msg_head.nframes = 0;
bcm_send_to_user(op, &msg_head, NULL, 0);
-}
-
-/*
- * bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out
- */
-static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
-{
- struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer);
-
- /* schedule before NET_RX_SOFTIRQ */
- tasklet_hi_schedule(&op->tsklet);
-
- /* no restart of the timer is done here! */
-
- /* if user wants to be informed, when cyclic CAN-Messages come back */
- if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) {
- /* clear received CAN frames to indicate 'nothing received' */
- memset(op->last_frames, 0, op->nframes * op->cfsiz);
- }
return HRTIMER_NORESTART;
}
@@ -570,14 +637,12 @@ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer)
/*
* bcm_rx_do_flush - helper for bcm_rx_thr_flush
*/
-static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
- unsigned int index)
+static inline int bcm_rx_do_flush(struct bcm_op *op, unsigned int index)
{
struct canfd_frame *lcf = op->last_frames + op->cfsiz * index;
if ((op->last_frames) && (lcf->flags & RX_THR)) {
- if (update)
- bcm_rx_changed(op, lcf);
+ bcm_rx_changed(op, lcf);
return 1;
}
return 0;
@@ -585,11 +650,8 @@ static inline int bcm_rx_do_flush(struct bcm_op *op, int update,
/*
* bcm_rx_thr_flush - Check for throttled data and send it to the userspace
- *
- * update == 0 : just check if throttled data is available (any irq context)
- * update == 1 : check and send throttled data to userspace (soft_irq context)
*/
-static int bcm_rx_thr_flush(struct bcm_op *op, int update)
+static int bcm_rx_thr_flush(struct bcm_op *op)
{
int updated = 0;
@@ -598,24 +660,16 @@ static int bcm_rx_thr_flush(struct bcm_op *op, int update)
/* for MUX filter we start at index 1 */
for (i = 1; i < op->nframes; i++)
- updated += bcm_rx_do_flush(op, update, i);
+ updated += bcm_rx_do_flush(op, i);
} else {
/* for RX_FILTER_ID and simple filter */
- updated += bcm_rx_do_flush(op, update, 0);
+ updated += bcm_rx_do_flush(op, 0);
}
return updated;
}
-static void bcm_rx_thr_tsklet(unsigned long data)
-{
- struct bcm_op *op = (struct bcm_op *)data;
-
- /* push the changed data to the userspace */
- bcm_rx_thr_flush(op, 1);
-}
-
/*
* bcm_rx_thr_handler - the time for blocked content updates is over now:
* Check for throttled data and send it to the userspace
@@ -624,10 +678,8 @@ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer)
{
struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer);
- tasklet_schedule(&op->thrtsklet);
-
- if (bcm_rx_thr_flush(op, 0)) {
- hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2);
+ if (bcm_rx_thr_flush(op)) {
+ hrtimer_forward_now(hrtimer, op->kt_ival2);
return HRTIMER_RESTART;
} else {
/* rearm throttle handling */
@@ -644,13 +696,19 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data)
struct bcm_op *op = (struct bcm_op *)data;
const struct canfd_frame *rxframe = (struct canfd_frame *)skb->data;
unsigned int i;
+ unsigned char traffic_flags;
if (op->can_id != rxframe->can_id)
return;
/* make sure to handle the correct frame type (CAN / CAN FD) */
- if (skb->len != op->cfsiz)
- return;
+ if (op->flags & CAN_FD_FRAME) {
+ if (!can_is_canfd_skb(skb))
+ return;
+ } else {
+ if (!can_is_can_skb(skb))
+ return;
+ }
/* disable timeout */
hrtimer_cancel(&op->timer);
@@ -668,15 +726,24 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data)
return;
}
+ /* compute flags to distinguish between own/local/remote CAN traffic */
+ traffic_flags = 0;
+ if (skb->sk) {
+ traffic_flags |= RX_LOCAL;
+ if (skb->sk == op->sk)
+ traffic_flags |= RX_OWN;
+ }
+
if (op->flags & RX_FILTER_ID) {
/* the easiest case */
- bcm_rx_update_and_send(op, op->last_frames, rxframe);
+ bcm_rx_update_and_send(op, op->last_frames, rxframe,
+ traffic_flags);
goto rx_starttimer;
}
if (op->nframes == 1) {
/* simple compare with index 0 */
- bcm_rx_cmp_to_index(op, 0, rxframe);
+ bcm_rx_cmp_to_index(op, 0, rxframe, traffic_flags);
goto rx_starttimer;
}
@@ -693,7 +760,8 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data)
if ((get_u64(op->frames, 0) & get_u64(rxframe, 0)) ==
(get_u64(op->frames, 0) &
get_u64(op->frames + op->cfsiz * i, 0))) {
- bcm_rx_cmp_to_index(op, i, rxframe);
+ bcm_rx_cmp_to_index(op, i, rxframe,
+ traffic_flags);
break;
}
}
@@ -720,25 +788,9 @@ static struct bcm_op *bcm_find_op(struct list_head *ops,
return NULL;
}
-static void bcm_remove_op(struct bcm_op *op)
+static void bcm_free_op_rcu(struct rcu_head *rcu_head)
{
- if (op->tsklet.func) {
- while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) ||
- test_bit(TASKLET_STATE_RUN, &op->tsklet.state) ||
- hrtimer_active(&op->timer)) {
- hrtimer_cancel(&op->timer);
- tasklet_kill(&op->tsklet);
- }
- }
-
- if (op->thrtsklet.func) {
- while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) ||
- test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) ||
- hrtimer_active(&op->thrtimer)) {
- hrtimer_cancel(&op->thrtimer);
- tasklet_kill(&op->thrtsklet);
- }
- }
+ struct bcm_op *op = container_of(rcu_head, struct bcm_op, rcu);
if ((op->frames) && (op->frames != &op->sframe))
kfree(op->frames);
@@ -749,6 +801,14 @@ static void bcm_remove_op(struct bcm_op *op)
kfree(op);
}
+static void bcm_remove_op(struct bcm_op *op)
+{
+ hrtimer_cancel(&op->timer);
+ hrtimer_cancel(&op->thrtimer);
+
+ call_rcu(&op->rcu, bcm_free_op_rcu);
+}
+
static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op)
{
if (op->rx_reg_dev == dev) {
@@ -774,6 +834,9 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh,
if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) &&
(op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) {
+ /* disable automatic timer on frame reception */
+ op->flags |= RX_NO_AUTOTIMER;
+
/*
* Don't care if we're bound or not (due to netdev
* problems) can_rx_unregister() is always a save
@@ -801,7 +864,7 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh,
REGMASK(op->can_id),
bcm_rx_handler, op);
- list_del(&op->list);
+ list_del_rcu(&op->list);
bcm_remove_op(op);
return 1; /* done */
}
@@ -821,7 +884,7 @@ static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh,
list_for_each_entry_safe(op, n, ops, list) {
if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) &&
(op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) {
- list_del(&op->list);
+ list_del_rcu(&op->list);
bcm_remove_op(op);
return 1; /* done */
}
@@ -873,6 +936,10 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES)
return -EINVAL;
+ /* check timeval limitations */
+ if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head))
+ return -EINVAL;
+
/* check the given can_id */
op = bcm_find_op(&bo->tx_ops, msg_head, ifindex);
if (op) {
@@ -910,6 +977,27 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
}
op->flags = msg_head->flags;
+ /* only lock for unlikely count/nframes/currframe changes */
+ if (op->nframes != msg_head->nframes ||
+ op->flags & TX_RESET_MULTI_IDX ||
+ op->flags & SETTIMER) {
+
+ spin_lock_bh(&op->bcm_tx_lock);
+
+ if (op->nframes != msg_head->nframes ||
+ op->flags & TX_RESET_MULTI_IDX) {
+ /* potentially update changed nframes */
+ op->nframes = msg_head->nframes;
+ /* restart multiple frame transmission */
+ op->currframe = 0;
+ }
+
+ if (op->flags & SETTIMER)
+ op->count = msg_head->count;
+
+ spin_unlock_bh(&op->bcm_tx_lock);
+ }
+
} else {
/* insert new BCM operation for the given can_id */
@@ -917,9 +1005,14 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
if (!op)
return -ENOMEM;
+ spin_lock_init(&op->bcm_tx_lock);
op->can_id = msg_head->can_id;
op->cfsiz = CFSIZ(msg_head->flags);
op->flags = msg_head->flags;
+ op->nframes = msg_head->nframes;
+
+ if (op->flags & SETTIMER)
+ op->count = msg_head->count;
/* create array for CAN frames and copy the data */
if (msg_head->nframes > 1) {
@@ -937,6 +1030,8 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
cf = op->frames + op->cfsiz * i;
err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz);
+ if (err < 0)
+ goto free_op;
if (op->flags & CAN_FD_FRAME) {
if (cf->len > 64)
@@ -946,12 +1041,8 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
err = -EINVAL;
}
- if (err < 0) {
- if (op->frames != &op->sframe)
- kfree(op->frames);
- kfree(op);
- return err;
- }
+ if (err < 0)
+ goto free_op;
if (msg_head->flags & TX_CP_CAN_ID) {
/* copy can_id into frame */
@@ -967,37 +1058,20 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
op->ifindex = ifindex;
/* initialize uninitialized (kzalloc) structure */
- hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- op->timer.function = bcm_tx_timeout_handler;
-
- /* initialize tasklet for tx countevent notification */
- tasklet_init(&op->tsklet, bcm_tx_timeout_tsklet,
- (unsigned long) op);
+ hrtimer_setup(&op->timer, bcm_tx_timeout_handler, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
/* currently unused in tx_ops */
- hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_setup(&op->thrtimer, hrtimer_dummy_timeout, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
/* add this bcm_op to the list of the tx_ops */
list_add(&op->list, &bo->tx_ops);
} /* if ((op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex))) */
- if (op->nframes != msg_head->nframes) {
- op->nframes = msg_head->nframes;
- /* start multiple frame transmission with index 0 */
- op->currframe = 0;
- }
-
- /* check flags */
-
- if (op->flags & TX_RESET_MULTI_IDX) {
- /* start multiple frame transmission with index 0 */
- op->currframe = 0;
- }
-
if (op->flags & SETTIMER) {
/* set timer values */
- op->count = msg_head->count;
op->ival1 = msg_head->ival1;
op->ival2 = msg_head->ival2;
op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1);
@@ -1014,16 +1088,19 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
op->flags |= TX_ANNOUNCE;
}
- if (op->flags & TX_ANNOUNCE) {
+ if (op->flags & TX_ANNOUNCE)
bcm_can_tx(op);
- if (op->count)
- op->count--;
- }
if (op->flags & STARTTIMER)
bcm_tx_start_timer(op);
return msg_head->nframes * op->cfsiz + MHSIZ;
+
+free_op:
+ if (op->frames != &op->sframe)
+ kfree(op->frames);
+ kfree(op);
+ return err;
}
/*
@@ -1053,6 +1130,10 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
(!(msg_head->can_id & CAN_RTR_FLAG))))
return -EINVAL;
+ /* check timeval limitations */
+ if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head))
+ return -EINVAL;
+
/* check the given can_id */
op = bcm_find_op(&bo->rx_ops, msg_head, ifindex);
if (op) {
@@ -1140,19 +1221,10 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
op->rx_ifindex = ifindex;
/* initialize uninitialized (kzalloc) structure */
- hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- op->timer.function = bcm_rx_timeout_handler;
-
- /* initialize tasklet for rx timeout notification */
- tasklet_init(&op->tsklet, bcm_rx_timeout_tsklet,
- (unsigned long) op);
-
- hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- op->thrtimer.function = bcm_rx_thr_handler;
-
- /* initialize tasklet for rx throttle handling */
- tasklet_init(&op->thrtsklet, bcm_rx_thr_tsklet,
- (unsigned long) op);
+ hrtimer_setup(&op->timer, bcm_rx_timeout_handler, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
+ hrtimer_setup(&op->thrtimer, bcm_rx_thr_handler, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
/* add this bcm_op to the list of the rx_ops */
list_add(&op->list, &bo->rx_ops);
@@ -1199,12 +1271,12 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
*/
op->kt_lastmsg = 0;
hrtimer_cancel(&op->thrtimer);
- bcm_rx_thr_flush(op, 1);
+ bcm_rx_thr_flush(op);
}
if ((op->flags & STARTTIMER) && op->kt_ival1)
hrtimer_start(&op->timer, op->kt_ival1,
- HRTIMER_MODE_REL);
+ HRTIMER_MODE_REL_SOFT);
}
/* now we can register for can_ids, if we added a new bcm_op */
@@ -1230,7 +1302,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg,
bcm_rx_handler, op, "bcm", sk);
if (err) {
/* this bcm rx op is broken -> remove it */
- list_del(&op->list);
+ list_del_rcu(&op->list);
bcm_remove_op(op);
return err;
}
@@ -1318,7 +1390,7 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
/* no bound device as default => check msg_name */
DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name);
- if (msg->msg_namelen < sizeof(*addr))
+ if (msg->msg_namelen < BCM_MIN_NAMELEN)
return -EINVAL;
if (addr->can_family != AF_CAN)
@@ -1402,20 +1474,15 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
/*
* notification handler for netdevice status changes
*/
-static int bcm_notifier(struct notifier_block *nb, unsigned long msg,
- void *ptr)
+static void bcm_notify(struct bcm_sock *bo, unsigned long msg,
+ struct net_device *dev)
{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct bcm_sock *bo = container_of(nb, struct bcm_sock, notifier);
struct sock *sk = &bo->sk;
struct bcm_op *op;
int notify_enodev = 0;
if (!net_eq(dev_net(dev), sock_net(sk)))
- return NOTIFY_DONE;
-
- if (dev->type != ARPHRD_CAN)
- return NOTIFY_DONE;
+ return;
switch (msg) {
@@ -1429,6 +1496,12 @@ static int bcm_notifier(struct notifier_block *nb, unsigned long msg,
/* remove device reference, if this is our bound device */
if (bo->bound && bo->ifindex == dev->ifindex) {
+#if IS_ENABLED(CONFIG_PROC_FS)
+ if (sock_net(sk)->can.bcmproc_dir && bo->bcm_proc_read) {
+ remove_proc_entry(bo->procname, sock_net(sk)->can.bcmproc_dir);
+ bo->bcm_proc_read = NULL;
+ }
+#endif
bo->bound = 0;
bo->ifindex = 0;
notify_enodev = 1;
@@ -1439,7 +1512,7 @@ static int bcm_notifier(struct notifier_block *nb, unsigned long msg,
if (notify_enodev) {
sk->sk_err = ENODEV;
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_error_report(sk);
+ sk_error_report(sk);
}
break;
@@ -1447,10 +1520,31 @@ static int bcm_notifier(struct notifier_block *nb, unsigned long msg,
if (bo->bound && bo->ifindex == dev->ifindex) {
sk->sk_err = ENETDOWN;
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_error_report(sk);
+ sk_error_report(sk);
}
}
+}
+static int bcm_notifier(struct notifier_block *nb, unsigned long msg,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (dev->type != ARPHRD_CAN)
+ return NOTIFY_DONE;
+ if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN)
+ return NOTIFY_DONE;
+ if (unlikely(bcm_busy_notifier)) /* Check for reentrant bug. */
+ return NOTIFY_DONE;
+
+ spin_lock(&bcm_notifier_lock);
+ list_for_each_entry(bcm_busy_notifier, &bcm_notifier_list, notifier) {
+ spin_unlock(&bcm_notifier_lock);
+ bcm_notify(bcm_busy_notifier, msg, dev);
+ spin_lock(&bcm_notifier_lock);
+ }
+ bcm_busy_notifier = NULL;
+ spin_unlock(&bcm_notifier_lock);
return NOTIFY_DONE;
}
@@ -1470,9 +1564,9 @@ static int bcm_init(struct sock *sk)
INIT_LIST_HEAD(&bo->rx_ops);
/* set notifier */
- bo->notifier.notifier_call = bcm_notifier;
-
- register_netdevice_notifier(&bo->notifier);
+ spin_lock(&bcm_notifier_lock);
+ list_add_tail(&bo->notifier, &bcm_notifier_list);
+ spin_unlock(&bcm_notifier_lock);
return 0;
}
@@ -1495,10 +1589,23 @@ static int bcm_release(struct socket *sock)
/* remove bcm_ops, timer, rx_unregister(), etc. */
- unregister_netdevice_notifier(&bo->notifier);
+ spin_lock(&bcm_notifier_lock);
+ while (bcm_busy_notifier == bo) {
+ spin_unlock(&bcm_notifier_lock);
+ schedule_timeout_uninterruptible(1);
+ spin_lock(&bcm_notifier_lock);
+ }
+ list_del(&bo->notifier);
+ spin_unlock(&bcm_notifier_lock);
lock_sock(sk);
+#if IS_ENABLED(CONFIG_PROC_FS)
+ /* remove procfs entry */
+ if (net->can.bcmproc_dir && bo->bcm_proc_read)
+ remove_proc_entry(bo->procname, net->can.bcmproc_dir);
+#endif /* CONFIG_PROC_FS */
+
list_for_each_entry_safe(op, next, &bo->tx_ops, list)
bcm_remove_op(op);
@@ -1527,14 +1634,12 @@ static int bcm_release(struct socket *sock)
REGMASK(op->can_id),
bcm_rx_handler, op);
- bcm_remove_op(op);
}
-#if IS_ENABLED(CONFIG_PROC_FS)
- /* remove procfs entry */
- if (net->can.bcmproc_dir && bo->bcm_proc_read)
- remove_proc_entry(bo->procname, net->can.bcmproc_dir);
-#endif /* CONFIG_PROC_FS */
+ synchronize_rcu();
+
+ list_for_each_entry_safe(op, next, &bo->rx_ops, list)
+ bcm_remove_op(op);
/* remove device reference */
if (bo->bound) {
@@ -1546,12 +1651,13 @@ static int bcm_release(struct socket *sock)
sock->sk = NULL;
release_sock(sk);
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
sock_put(sk);
return 0;
}
-static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len,
+static int bcm_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int len,
int flags)
{
struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
@@ -1560,7 +1666,7 @@ static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len,
struct net *net = sock_net(sk);
int ret = 0;
- if (len < sizeof(*addr))
+ if (len < BCM_MIN_NAMELEN)
return -EINVAL;
lock_sock(sk);
@@ -1621,12 +1727,9 @@ static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
struct sock *sk = sock->sk;
struct sk_buff *skb;
int error = 0;
- int noblock;
int err;
- noblock = flags & MSG_DONTWAIT;
- flags &= ~MSG_DONTWAIT;
- skb = skb_recv_datagram(sk, flags, noblock, &error);
+ skb = skb_recv_datagram(sk, flags, &error);
if (!skb)
return error;
@@ -1639,19 +1742,29 @@ static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
return err;
}
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
if (msg->msg_name) {
- __sockaddr_check_size(sizeof(struct sockaddr_can));
- msg->msg_namelen = sizeof(struct sockaddr_can);
+ __sockaddr_check_size(BCM_MIN_NAMELEN);
+ msg->msg_namelen = BCM_MIN_NAMELEN;
memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
}
+ /* assign the flags that have been recorded in bcm_send_to_user() */
+ msg->msg_flags |= *(bcm_flags(skb));
+
skb_free_datagram(sk, skb);
return size;
}
+static int bcm_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ /* no ioctls for socket layer -> hand it down to NIC layer */
+ return -ENOIOCTLCMD;
+}
+
static const struct proto_ops bcm_ops = {
.family = PF_CAN,
.release = bcm_release,
@@ -1661,15 +1774,13 @@ static const struct proto_ops bcm_ops = {
.accept = sock_no_accept,
.getname = sock_no_getname,
.poll = datagram_poll,
- .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */
+ .ioctl = bcm_sock_no_ioctlcmd,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
- .setsockopt = sock_no_setsockopt,
- .getsockopt = sock_no_getsockopt,
.sendmsg = bcm_sendmsg,
.recvmsg = bcm_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
};
static struct proto bcm_proto __read_mostly = {
@@ -1710,25 +1821,43 @@ static struct pernet_operations canbcm_pernet_ops __read_mostly = {
.exit = canbcm_pernet_exit,
};
+static struct notifier_block canbcm_notifier = {
+ .notifier_call = bcm_notifier
+};
+
static int __init bcm_module_init(void)
{
int err;
- pr_info("can: broadcast manager protocol (rev " CAN_BCM_VERSION " t)\n");
+ pr_info("can: broadcast manager protocol\n");
+
+ err = register_pernet_subsys(&canbcm_pernet_ops);
+ if (err)
+ return err;
+
+ err = register_netdevice_notifier(&canbcm_notifier);
+ if (err)
+ goto register_notifier_failed;
err = can_proto_register(&bcm_can_proto);
if (err < 0) {
printk(KERN_ERR "can: registration of bcm protocol failed\n");
- return err;
+ goto register_proto_failed;
}
- register_pernet_subsys(&canbcm_pernet_ops);
return 0;
+
+register_proto_failed:
+ unregister_netdevice_notifier(&canbcm_notifier);
+register_notifier_failed:
+ unregister_pernet_subsys(&canbcm_pernet_ops);
+ return err;
}
static void __exit bcm_module_exit(void)
{
can_proto_unregister(&bcm_can_proto);
+ unregister_netdevice_notifier(&canbcm_notifier);
unregister_pernet_subsys(&canbcm_pernet_ops);
}
diff --git a/net/can/gw.c b/net/can/gw.c
index faa3da88a127..55eccb1c7620 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -1,7 +1,7 @@
-/*
- * gw.c - CAN frame Gateway/Router/Bridge with netlink interface
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/* gw.c - CAN frame Gateway/Router/Bridge with netlink interface
*
- * Copyright (c) 2017 Volkswagen Group Electronic Research
+ * Copyright (c) 2019 Volkswagen Group Electronic Research
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -59,7 +59,6 @@
#include <net/net_namespace.h>
#include <net/sock.h>
-#define CAN_GW_VERSION "20170425"
#define CAN_GW_NAME "can-gw"
MODULE_DESCRIPTION("PF_CAN netlink gateway");
@@ -85,10 +84,10 @@ static struct kmem_cache *cgw_cache __read_mostly;
/* structure that contains the (on-the-fly) CAN frame modifications */
struct cf_mod {
struct {
- struct can_frame and;
- struct can_frame or;
- struct can_frame xor;
- struct can_frame set;
+ struct canfd_frame and;
+ struct canfd_frame or;
+ struct canfd_frame xor;
+ struct canfd_frame set;
} modframe;
struct {
u8 and;
@@ -96,7 +95,7 @@ struct cf_mod {
u8 xor;
u8 set;
} modtype;
- void (*modfunc[MAX_MODFUNCTIONS])(struct can_frame *cf,
+ void (*modfunc[MAX_MODFUNCTIONS])(struct canfd_frame *cf,
struct cf_mod *mod);
/* CAN frame checksum calculation after CAN frame modifications */
@@ -105,15 +104,15 @@ struct cf_mod {
struct cgw_csum_crc8 crc8;
} csum;
struct {
- void (*xor)(struct can_frame *cf, struct cgw_csum_xor *xor);
- void (*crc8)(struct can_frame *cf, struct cgw_csum_crc8 *crc8);
+ void (*xor)(struct canfd_frame *cf,
+ struct cgw_csum_xor *xor);
+ void (*crc8)(struct canfd_frame *cf,
+ struct cgw_csum_crc8 *crc8);
} csumfunc;
u32 uid;
};
-
-/*
- * So far we just support CAN -> CAN routing and frame modifications.
+/* So far we just support CAN -> CAN routing and frame modifications.
*
* The internal can_can_gw structure contains data and attributes for
* a CAN -> CAN gateway job.
@@ -131,7 +130,7 @@ struct cgw_job {
u32 handled_frames;
u32 dropped_frames;
u32 deleted_frames;
- struct cf_mod mod;
+ struct cf_mod __rcu *cf_mod;
union {
/* CAN frame data source */
struct net_device *dev;
@@ -151,39 +150,150 @@ struct cgw_job {
/* modification functions that are invoked in the hot path in can_can_gw_rcv */
-#define MODFUNC(func, op) static void func(struct can_frame *cf, \
+#define MODFUNC(func, op) static void func(struct canfd_frame *cf, \
struct cf_mod *mod) { op ; }
MODFUNC(mod_and_id, cf->can_id &= mod->modframe.and.can_id)
-MODFUNC(mod_and_dlc, cf->can_dlc &= mod->modframe.and.can_dlc)
+MODFUNC(mod_and_len, cf->len &= mod->modframe.and.len)
+MODFUNC(mod_and_flags, cf->flags &= mod->modframe.and.flags)
MODFUNC(mod_and_data, *(u64 *)cf->data &= *(u64 *)mod->modframe.and.data)
MODFUNC(mod_or_id, cf->can_id |= mod->modframe.or.can_id)
-MODFUNC(mod_or_dlc, cf->can_dlc |= mod->modframe.or.can_dlc)
+MODFUNC(mod_or_len, cf->len |= mod->modframe.or.len)
+MODFUNC(mod_or_flags, cf->flags |= mod->modframe.or.flags)
MODFUNC(mod_or_data, *(u64 *)cf->data |= *(u64 *)mod->modframe.or.data)
MODFUNC(mod_xor_id, cf->can_id ^= mod->modframe.xor.can_id)
-MODFUNC(mod_xor_dlc, cf->can_dlc ^= mod->modframe.xor.can_dlc)
+MODFUNC(mod_xor_len, cf->len ^= mod->modframe.xor.len)
+MODFUNC(mod_xor_flags, cf->flags ^= mod->modframe.xor.flags)
MODFUNC(mod_xor_data, *(u64 *)cf->data ^= *(u64 *)mod->modframe.xor.data)
MODFUNC(mod_set_id, cf->can_id = mod->modframe.set.can_id)
-MODFUNC(mod_set_dlc, cf->can_dlc = mod->modframe.set.can_dlc)
+MODFUNC(mod_set_len, cf->len = mod->modframe.set.len)
+MODFUNC(mod_set_flags, cf->flags = mod->modframe.set.flags)
MODFUNC(mod_set_data, *(u64 *)cf->data = *(u64 *)mod->modframe.set.data)
-static inline void canframecpy(struct can_frame *dst, struct can_frame *src)
+static void mod_and_fddata(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ int i;
+
+ for (i = 0; i < CANFD_MAX_DLEN; i += 8)
+ *(u64 *)(cf->data + i) &= *(u64 *)(mod->modframe.and.data + i);
+}
+
+static void mod_or_fddata(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ int i;
+
+ for (i = 0; i < CANFD_MAX_DLEN; i += 8)
+ *(u64 *)(cf->data + i) |= *(u64 *)(mod->modframe.or.data + i);
+}
+
+static void mod_xor_fddata(struct canfd_frame *cf, struct cf_mod *mod)
{
- /*
- * Copy the struct members separately to ensure that no uninitialized
+ int i;
+
+ for (i = 0; i < CANFD_MAX_DLEN; i += 8)
+ *(u64 *)(cf->data + i) ^= *(u64 *)(mod->modframe.xor.data + i);
+}
+
+static void mod_set_fddata(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ memcpy(cf->data, mod->modframe.set.data, CANFD_MAX_DLEN);
+}
+
+/* retrieve valid CC DLC value and store it into 'len' */
+static void mod_retrieve_ccdlc(struct canfd_frame *cf)
+{
+ struct can_frame *ccf = (struct can_frame *)cf;
+
+ /* len8_dlc is only valid if len == CAN_MAX_DLEN */
+ if (ccf->len != CAN_MAX_DLEN)
+ return;
+
+ /* do we have a valid len8_dlc value from 9 .. 15 ? */
+ if (ccf->len8_dlc > CAN_MAX_DLEN && ccf->len8_dlc <= CAN_MAX_RAW_DLC)
+ ccf->len = ccf->len8_dlc;
+}
+
+/* convert valid CC DLC value in 'len' into struct can_frame elements */
+static void mod_store_ccdlc(struct canfd_frame *cf)
+{
+ struct can_frame *ccf = (struct can_frame *)cf;
+
+ /* clear potential leftovers */
+ ccf->len8_dlc = 0;
+
+ /* plain data length 0 .. 8 - that was easy */
+ if (ccf->len <= CAN_MAX_DLEN)
+ return;
+
+ /* potentially broken values are caught in can_can_gw_rcv() */
+ if (ccf->len > CAN_MAX_RAW_DLC)
+ return;
+
+ /* we have a valid dlc value from 9 .. 15 in ccf->len */
+ ccf->len8_dlc = ccf->len;
+ ccf->len = CAN_MAX_DLEN;
+}
+
+static void mod_and_ccdlc(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ mod_retrieve_ccdlc(cf);
+ mod_and_len(cf, mod);
+ mod_store_ccdlc(cf);
+}
+
+static void mod_or_ccdlc(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ mod_retrieve_ccdlc(cf);
+ mod_or_len(cf, mod);
+ mod_store_ccdlc(cf);
+}
+
+static void mod_xor_ccdlc(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ mod_retrieve_ccdlc(cf);
+ mod_xor_len(cf, mod);
+ mod_store_ccdlc(cf);
+}
+
+static void mod_set_ccdlc(struct canfd_frame *cf, struct cf_mod *mod)
+{
+ mod_set_len(cf, mod);
+ mod_store_ccdlc(cf);
+}
+
+static void canframecpy(struct canfd_frame *dst, struct can_frame *src)
+{
+ /* Copy the struct members separately to ensure that no uninitialized
* data are copied in the 3 bytes hole of the struct. This is needed
* to make easy compares of the data in the struct cf_mod.
*/
dst->can_id = src->can_id;
- dst->can_dlc = src->can_dlc;
+ dst->len = src->len;
*(u64 *)dst->data = *(u64 *)src->data;
}
-static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re)
+static void canfdframecpy(struct canfd_frame *dst, struct canfd_frame *src)
{
- /*
- * absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0]
+ /* Copy the struct members separately to ensure that no uninitialized
+ * data are copied in the 2 bytes hole of the struct. This is needed
+ * to make easy compares of the data in the struct cf_mod.
+ */
+
+ dst->can_id = src->can_id;
+ dst->flags = src->flags;
+ dst->len = src->len;
+ memcpy(dst->data, src->data, CANFD_MAX_DLEN);
+}
+
+static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re, struct rtcanmsg *r)
+{
+ s8 dlen = CAN_MAX_DLEN;
+
+ if (r->flags & CGW_FLAGS_CAN_FD)
+ dlen = CANFD_MAX_DLEN;
+
+ /* absolute dlc values 0 .. 7 => 0 .. 7, e.g. data [0]
* relative to received dlc -1 .. -8 :
* e.g. for received dlc = 8
* -1 => index = 7 (data[7])
@@ -191,27 +301,27 @@ static int cgw_chk_csum_parms(s8 fr, s8 to, s8 re)
* -8 => index = 0 (data[0])
*/
- if (fr > -9 && fr < 8 &&
- to > -9 && to < 8 &&
- re > -9 && re < 8)
+ if (fr >= -dlen && fr < dlen &&
+ to >= -dlen && to < dlen &&
+ re >= -dlen && re < dlen)
return 0;
else
return -EINVAL;
}
-static inline int calc_idx(int idx, int rx_dlc)
+static inline int calc_idx(int idx, int rx_len)
{
if (idx < 0)
- return rx_dlc + idx;
+ return rx_len + idx;
else
return idx;
}
-static void cgw_csum_xor_rel(struct can_frame *cf, struct cgw_csum_xor *xor)
+static void cgw_csum_xor_rel(struct canfd_frame *cf, struct cgw_csum_xor *xor)
{
- int from = calc_idx(xor->from_idx, cf->can_dlc);
- int to = calc_idx(xor->to_idx, cf->can_dlc);
- int res = calc_idx(xor->result_idx, cf->can_dlc);
+ int from = calc_idx(xor->from_idx, cf->len);
+ int to = calc_idx(xor->to_idx, cf->len);
+ int res = calc_idx(xor->result_idx, cf->len);
u8 val = xor->init_xor_val;
int i;
@@ -229,7 +339,7 @@ static void cgw_csum_xor_rel(struct can_frame *cf, struct cgw_csum_xor *xor)
cf->data[res] = val;
}
-static void cgw_csum_xor_pos(struct can_frame *cf, struct cgw_csum_xor *xor)
+static void cgw_csum_xor_pos(struct canfd_frame *cf, struct cgw_csum_xor *xor)
{
u8 val = xor->init_xor_val;
int i;
@@ -240,7 +350,7 @@ static void cgw_csum_xor_pos(struct can_frame *cf, struct cgw_csum_xor *xor)
cf->data[xor->result_idx] = val;
}
-static void cgw_csum_xor_neg(struct can_frame *cf, struct cgw_csum_xor *xor)
+static void cgw_csum_xor_neg(struct canfd_frame *cf, struct cgw_csum_xor *xor)
{
u8 val = xor->init_xor_val;
int i;
@@ -251,11 +361,12 @@ static void cgw_csum_xor_neg(struct can_frame *cf, struct cgw_csum_xor *xor)
cf->data[xor->result_idx] = val;
}
-static void cgw_csum_crc8_rel(struct can_frame *cf, struct cgw_csum_crc8 *crc8)
+static void cgw_csum_crc8_rel(struct canfd_frame *cf,
+ struct cgw_csum_crc8 *crc8)
{
- int from = calc_idx(crc8->from_idx, cf->can_dlc);
- int to = calc_idx(crc8->to_idx, cf->can_dlc);
- int res = calc_idx(crc8->result_idx, cf->can_dlc);
+ int from = calc_idx(crc8->from_idx, cf->len);
+ int to = calc_idx(crc8->to_idx, cf->len);
+ int res = calc_idx(crc8->result_idx, cf->len);
u8 crc = crc8->init_crc_val;
int i;
@@ -264,96 +375,103 @@ static void cgw_csum_crc8_rel(struct can_frame *cf, struct cgw_csum_crc8 *crc8)
if (from <= to) {
for (i = crc8->from_idx; i <= crc8->to_idx; i++)
- crc = crc8->crctab[crc^cf->data[i]];
+ crc = crc8->crctab[crc ^ cf->data[i]];
} else {
for (i = crc8->from_idx; i >= crc8->to_idx; i--)
- crc = crc8->crctab[crc^cf->data[i]];
+ crc = crc8->crctab[crc ^ cf->data[i]];
}
switch (crc8->profile) {
-
case CGW_CRC8PRF_1U8:
- crc = crc8->crctab[crc^crc8->profile_data[0]];
+ crc = crc8->crctab[crc ^ crc8->profile_data[0]];
break;
case CGW_CRC8PRF_16U8:
- crc = crc8->crctab[crc^crc8->profile_data[cf->data[1] & 0xF]];
+ crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]];
break;
case CGW_CRC8PRF_SFFID_XOR:
- crc = crc8->crctab[crc^(cf->can_id & 0xFF)^
+ crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^
(cf->can_id >> 8 & 0xFF)];
break;
-
}
- cf->data[crc8->result_idx] = crc^crc8->final_xor_val;
+ cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val;
}
-static void cgw_csum_crc8_pos(struct can_frame *cf, struct cgw_csum_crc8 *crc8)
+static void cgw_csum_crc8_pos(struct canfd_frame *cf,
+ struct cgw_csum_crc8 *crc8)
{
u8 crc = crc8->init_crc_val;
int i;
for (i = crc8->from_idx; i <= crc8->to_idx; i++)
- crc = crc8->crctab[crc^cf->data[i]];
+ crc = crc8->crctab[crc ^ cf->data[i]];
switch (crc8->profile) {
-
case CGW_CRC8PRF_1U8:
- crc = crc8->crctab[crc^crc8->profile_data[0]];
+ crc = crc8->crctab[crc ^ crc8->profile_data[0]];
break;
case CGW_CRC8PRF_16U8:
- crc = crc8->crctab[crc^crc8->profile_data[cf->data[1] & 0xF]];
+ crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]];
break;
case CGW_CRC8PRF_SFFID_XOR:
- crc = crc8->crctab[crc^(cf->can_id & 0xFF)^
+ crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^
(cf->can_id >> 8 & 0xFF)];
break;
}
- cf->data[crc8->result_idx] = crc^crc8->final_xor_val;
+ cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val;
}
-static void cgw_csum_crc8_neg(struct can_frame *cf, struct cgw_csum_crc8 *crc8)
+static void cgw_csum_crc8_neg(struct canfd_frame *cf,
+ struct cgw_csum_crc8 *crc8)
{
u8 crc = crc8->init_crc_val;
int i;
for (i = crc8->from_idx; i >= crc8->to_idx; i--)
- crc = crc8->crctab[crc^cf->data[i]];
+ crc = crc8->crctab[crc ^ cf->data[i]];
switch (crc8->profile) {
-
case CGW_CRC8PRF_1U8:
- crc = crc8->crctab[crc^crc8->profile_data[0]];
+ crc = crc8->crctab[crc ^ crc8->profile_data[0]];
break;
case CGW_CRC8PRF_16U8:
- crc = crc8->crctab[crc^crc8->profile_data[cf->data[1] & 0xF]];
+ crc = crc8->crctab[crc ^ crc8->profile_data[cf->data[1] & 0xF]];
break;
case CGW_CRC8PRF_SFFID_XOR:
- crc = crc8->crctab[crc^(cf->can_id & 0xFF)^
+ crc = crc8->crctab[crc ^ (cf->can_id & 0xFF) ^
(cf->can_id >> 8 & 0xFF)];
break;
}
- cf->data[crc8->result_idx] = crc^crc8->final_xor_val;
+ cf->data[crc8->result_idx] = crc ^ crc8->final_xor_val;
}
/* the receive & process & send function */
static void can_can_gw_rcv(struct sk_buff *skb, void *data)
{
struct cgw_job *gwj = (struct cgw_job *)data;
- struct can_frame *cf;
+ struct canfd_frame *cf;
struct sk_buff *nskb;
+ struct cf_mod *mod;
int modidx = 0;
- /*
- * Do not handle CAN frames routed more than 'max_hops' times.
+ /* process strictly Classic CAN or CAN FD frames */
+ if (gwj->flags & CGW_FLAGS_CAN_FD) {
+ if (!can_is_canfd_skb(skb))
+ return;
+ } else {
+ if (!can_is_can_skb(skb))
+ return;
+ }
+
+ /* Do not handle CAN frames routed more than 'max_hops' times.
* In general we should never catch this delimiter which is intended
* to cover a misconfiguration protection (e.g. circular CAN routes).
*
@@ -384,13 +502,13 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
can_skb_prv(skb)->ifindex == gwj->dst.dev->ifindex)
return;
- /*
- * clone the given skb, which has not been done in can_rcv()
+ /* clone the given skb, which has not been done in can_rcv()
*
* When there is at least one modification function activated,
* we need to copy the skb as we want to modify skb->data.
*/
- if (gwj->mod.modfunc[0])
+ mod = rcu_dereference(gwj->cf_mod);
+ if (mod->modfunc[0])
nskb = skb_copy(skb, GFP_ATOMIC);
else
nskb = skb_clone(skb, GFP_ATOMIC);
@@ -410,19 +528,31 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
nskb->dev = gwj->dst.dev;
/* pointer to modifiable CAN frame */
- cf = (struct can_frame *)nskb->data;
+ cf = (struct canfd_frame *)nskb->data;
/* perform preprocessed modification functions if there are any */
- while (modidx < MAX_MODFUNCTIONS && gwj->mod.modfunc[modidx])
- (*gwj->mod.modfunc[modidx++])(cf, &gwj->mod);
+ while (modidx < MAX_MODFUNCTIONS && mod->modfunc[modidx])
+ (*mod->modfunc[modidx++])(cf, mod);
- /* check for checksum updates when the CAN frame has been modified */
+ /* Has the CAN frame been modified? */
if (modidx) {
- if (gwj->mod.csumfunc.crc8)
- (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8);
+ /* get available space for the processed CAN frame type */
+ int max_len = nskb->len - offsetof(struct canfd_frame, data);
+
+ /* dlc may have changed, make sure it fits to the CAN frame */
+ if (cf->len > max_len) {
+ /* delete frame due to misconfiguration */
+ gwj->deleted_frames++;
+ kfree_skb(nskb);
+ return;
+ }
+
+ /* check for checksum updates */
+ if (mod->csumfunc.crc8)
+ (*mod->csumfunc.crc8)(cf, &mod->csum.crc8);
- if (gwj->mod.csumfunc.xor)
- (*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor);
+ if (mod->csumfunc.xor)
+ (*mod->csumfunc.xor)(cf, &mod->csum.xor);
}
/* clear the skb timestamp if not configured the other way */
@@ -449,6 +579,24 @@ static inline void cgw_unregister_filter(struct net *net, struct cgw_job *gwj)
gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj);
}
+static void cgw_job_free_rcu(struct rcu_head *rcu_head)
+{
+ struct cgw_job *gwj = container_of(rcu_head, struct cgw_job, rcu);
+
+ /* cgw_job::cf_mod is always accessed from the same cgw_job object within
+ * the same RCU read section. Once cgw_job is scheduled for removal,
+ * cf_mod can also be removed without mandating an additional grace period.
+ */
+ kfree(rcu_access_pointer(gwj->cf_mod));
+ kmem_cache_free(cgw_cache, gwj);
+}
+
+/* Return cgw_job::cf_mod with RTNL protected section */
+static struct cf_mod *cgw_job_cf_mod(struct cgw_job *gwj)
+{
+ return rcu_dereference_protected(gwj->cf_mod, rtnl_is_locked());
+}
+
static int cgw_notifier(struct notifier_block *nb,
unsigned long msg, void *ptr)
{
@@ -459,18 +607,16 @@ static int cgw_notifier(struct notifier_block *nb,
return NOTIFY_DONE;
if (msg == NETDEV_UNREGISTER) {
-
struct cgw_job *gwj = NULL;
struct hlist_node *nx;
ASSERT_RTNL();
hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) {
-
if (gwj->src.dev == dev || gwj->dst.dev == dev) {
hlist_del(&gwj->list);
cgw_unregister_filter(net, gwj);
- kmem_cache_free(cgw_cache, gwj);
+ call_rcu(&gwj->rcu, cgw_job_free_rcu);
}
}
}
@@ -481,9 +627,9 @@ static int cgw_notifier(struct notifier_block *nb,
static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type,
u32 pid, u32 seq, int flags)
{
- struct cgw_frame_mod mb;
struct rtcanmsg *rtcan;
struct nlmsghdr *nlh;
+ struct cf_mod *mod;
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtcan), flags);
if (!nlh)
@@ -518,53 +664,87 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type,
goto cancel;
}
- if (gwj->mod.modtype.and) {
- memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf));
- mb.modtype = gwj->mod.modtype.and;
- if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0)
- goto cancel;
- }
+ mod = cgw_job_cf_mod(gwj);
+ if (gwj->flags & CGW_FLAGS_CAN_FD) {
+ struct cgw_fdframe_mod mb;
- if (gwj->mod.modtype.or) {
- memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf));
- mb.modtype = gwj->mod.modtype.or;
- if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0)
- goto cancel;
- }
+ if (mod->modtype.and) {
+ memcpy(&mb.cf, &mod->modframe.and, sizeof(mb.cf));
+ mb.modtype = mod->modtype.and;
+ if (nla_put(skb, CGW_FDMOD_AND, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
- if (gwj->mod.modtype.xor) {
- memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf));
- mb.modtype = gwj->mod.modtype.xor;
- if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0)
- goto cancel;
- }
+ if (mod->modtype.or) {
+ memcpy(&mb.cf, &mod->modframe.or, sizeof(mb.cf));
+ mb.modtype = mod->modtype.or;
+ if (nla_put(skb, CGW_FDMOD_OR, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
- if (gwj->mod.modtype.set) {
- memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf));
- mb.modtype = gwj->mod.modtype.set;
- if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0)
- goto cancel;
+ if (mod->modtype.xor) {
+ memcpy(&mb.cf, &mod->modframe.xor, sizeof(mb.cf));
+ mb.modtype = mod->modtype.xor;
+ if (nla_put(skb, CGW_FDMOD_XOR, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (mod->modtype.set) {
+ memcpy(&mb.cf, &mod->modframe.set, sizeof(mb.cf));
+ mb.modtype = mod->modtype.set;
+ if (nla_put(skb, CGW_FDMOD_SET, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+ } else {
+ struct cgw_frame_mod mb;
+
+ if (mod->modtype.and) {
+ memcpy(&mb.cf, &mod->modframe.and, sizeof(mb.cf));
+ mb.modtype = mod->modtype.and;
+ if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (mod->modtype.or) {
+ memcpy(&mb.cf, &mod->modframe.or, sizeof(mb.cf));
+ mb.modtype = mod->modtype.or;
+ if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (mod->modtype.xor) {
+ memcpy(&mb.cf, &mod->modframe.xor, sizeof(mb.cf));
+ mb.modtype = mod->modtype.xor;
+ if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
+
+ if (mod->modtype.set) {
+ memcpy(&mb.cf, &mod->modframe.set, sizeof(mb.cf));
+ mb.modtype = mod->modtype.set;
+ if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0)
+ goto cancel;
+ }
}
- if (gwj->mod.uid) {
- if (nla_put_u32(skb, CGW_MOD_UID, gwj->mod.uid) < 0)
+ if (mod->uid) {
+ if (nla_put_u32(skb, CGW_MOD_UID, mod->uid) < 0)
goto cancel;
}
- if (gwj->mod.csumfunc.crc8) {
+ if (mod->csumfunc.crc8) {
if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN,
- &gwj->mod.csum.crc8) < 0)
+ &mod->csum.crc8) < 0)
goto cancel;
}
- if (gwj->mod.csumfunc.xor) {
+ if (mod->csumfunc.xor) {
if (nla_put(skb, CGW_CS_XOR, CGW_CS_XOR_LEN,
- &gwj->mod.csum.xor) < 0)
+ &mod->csum.xor) < 0)
goto cancel;
}
if (gwj->gwtype == CGW_TYPE_CAN_CAN) {
-
if (gwj->ccgw.filter.can_id || gwj->ccgw.filter.can_mask) {
if (nla_put(skb, CGW_FILTER, sizeof(struct can_filter),
&gwj->ccgw.filter) < 0)
@@ -599,8 +779,9 @@ static int cgw_dump_jobs(struct sk_buff *skb, struct netlink_callback *cb)
if (idx < s_idx)
goto cont;
- if (cgw_put_job(skb, gwj, RTM_NEWROUTE, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0)
+ if (cgw_put_job(skb, gwj, RTM_NEWROUTE,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI) < 0)
break;
cont:
idx++;
@@ -612,7 +793,7 @@ cont:
return skb->len;
}
-static const struct nla_policy cgw_policy[CGW_MAX+1] = {
+static const struct nla_policy cgw_policy[CGW_MAX + 1] = {
[CGW_MOD_AND] = { .len = sizeof(struct cgw_frame_mod) },
[CGW_MOD_OR] = { .len = sizeof(struct cgw_frame_mod) },
[CGW_MOD_XOR] = { .len = sizeof(struct cgw_frame_mod) },
@@ -624,22 +805,26 @@ static const struct nla_policy cgw_policy[CGW_MAX+1] = {
[CGW_FILTER] = { .len = sizeof(struct can_filter) },
[CGW_LIM_HOPS] = { .type = NLA_U8 },
[CGW_MOD_UID] = { .type = NLA_U32 },
+ [CGW_FDMOD_AND] = { .len = sizeof(struct cgw_fdframe_mod) },
+ [CGW_FDMOD_OR] = { .len = sizeof(struct cgw_fdframe_mod) },
+ [CGW_FDMOD_XOR] = { .len = sizeof(struct cgw_fdframe_mod) },
+ [CGW_FDMOD_SET] = { .len = sizeof(struct cgw_fdframe_mod) },
};
/* check for common and gwtype specific attributes */
static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
u8 gwtype, void *gwtypeattr, u8 *limhops)
{
- struct nlattr *tb[CGW_MAX+1];
- struct cgw_frame_mod mb;
+ struct nlattr *tb[CGW_MAX + 1];
+ struct rtcanmsg *r = nlmsg_data(nlh);
int modidx = 0;
int err = 0;
/* initialize modification & checksum data space */
memset(mod, 0, sizeof(*mod));
- err = nlmsg_parse(nlh, sizeof(struct rtcanmsg), tb, CGW_MAX,
- cgw_policy, NULL);
+ err = nlmsg_parse_deprecated(nlh, sizeof(struct rtcanmsg), tb,
+ CGW_MAX, cgw_policy, NULL);
if (err < 0)
return err;
@@ -651,87 +836,166 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
}
/* check for AND/OR/XOR/SET modifications */
+ if (r->flags & CGW_FLAGS_CAN_FD) {
+ struct cgw_fdframe_mod mb;
- if (tb[CGW_MOD_AND]) {
- nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN);
+ if (tb[CGW_FDMOD_AND]) {
+ nla_memcpy(&mb, tb[CGW_FDMOD_AND], CGW_FDMODATTR_LEN);
- canframecpy(&mod->modframe.and, &mb.cf);
- mod->modtype.and = mb.modtype;
+ canfdframecpy(&mod->modframe.and, &mb.cf);
+ mod->modtype.and = mb.modtype;
- if (mb.modtype & CGW_MOD_ID)
- mod->modfunc[modidx++] = mod_and_id;
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_and_id;
- if (mb.modtype & CGW_MOD_DLC)
- mod->modfunc[modidx++] = mod_and_dlc;
+ if (mb.modtype & CGW_MOD_LEN)
+ mod->modfunc[modidx++] = mod_and_len;
- if (mb.modtype & CGW_MOD_DATA)
- mod->modfunc[modidx++] = mod_and_data;
- }
+ if (mb.modtype & CGW_MOD_FLAGS)
+ mod->modfunc[modidx++] = mod_and_flags;
- if (tb[CGW_MOD_OR]) {
- nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN);
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_and_fddata;
+ }
- canframecpy(&mod->modframe.or, &mb.cf);
- mod->modtype.or = mb.modtype;
+ if (tb[CGW_FDMOD_OR]) {
+ nla_memcpy(&mb, tb[CGW_FDMOD_OR], CGW_FDMODATTR_LEN);
- if (mb.modtype & CGW_MOD_ID)
- mod->modfunc[modidx++] = mod_or_id;
+ canfdframecpy(&mod->modframe.or, &mb.cf);
+ mod->modtype.or = mb.modtype;
- if (mb.modtype & CGW_MOD_DLC)
- mod->modfunc[modidx++] = mod_or_dlc;
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_or_id;
- if (mb.modtype & CGW_MOD_DATA)
- mod->modfunc[modidx++] = mod_or_data;
- }
+ if (mb.modtype & CGW_MOD_LEN)
+ mod->modfunc[modidx++] = mod_or_len;
- if (tb[CGW_MOD_XOR]) {
- nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN);
+ if (mb.modtype & CGW_MOD_FLAGS)
+ mod->modfunc[modidx++] = mod_or_flags;
- canframecpy(&mod->modframe.xor, &mb.cf);
- mod->modtype.xor = mb.modtype;
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_or_fddata;
+ }
- if (mb.modtype & CGW_MOD_ID)
- mod->modfunc[modidx++] = mod_xor_id;
+ if (tb[CGW_FDMOD_XOR]) {
+ nla_memcpy(&mb, tb[CGW_FDMOD_XOR], CGW_FDMODATTR_LEN);
- if (mb.modtype & CGW_MOD_DLC)
- mod->modfunc[modidx++] = mod_xor_dlc;
+ canfdframecpy(&mod->modframe.xor, &mb.cf);
+ mod->modtype.xor = mb.modtype;
- if (mb.modtype & CGW_MOD_DATA)
- mod->modfunc[modidx++] = mod_xor_data;
- }
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_xor_id;
+
+ if (mb.modtype & CGW_MOD_LEN)
+ mod->modfunc[modidx++] = mod_xor_len;
+
+ if (mb.modtype & CGW_MOD_FLAGS)
+ mod->modfunc[modidx++] = mod_xor_flags;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_xor_fddata;
+ }
+
+ if (tb[CGW_FDMOD_SET]) {
+ nla_memcpy(&mb, tb[CGW_FDMOD_SET], CGW_FDMODATTR_LEN);
+
+ canfdframecpy(&mod->modframe.set, &mb.cf);
+ mod->modtype.set = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_set_id;
+
+ if (mb.modtype & CGW_MOD_LEN)
+ mod->modfunc[modidx++] = mod_set_len;
+
+ if (mb.modtype & CGW_MOD_FLAGS)
+ mod->modfunc[modidx++] = mod_set_flags;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_set_fddata;
+ }
+ } else {
+ struct cgw_frame_mod mb;
+
+ if (tb[CGW_MOD_AND]) {
+ nla_memcpy(&mb, tb[CGW_MOD_AND], CGW_MODATTR_LEN);
+
+ canframecpy(&mod->modframe.and, &mb.cf);
+ mod->modtype.and = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_and_id;
+
+ if (mb.modtype & CGW_MOD_DLC)
+ mod->modfunc[modidx++] = mod_and_ccdlc;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_and_data;
+ }
+
+ if (tb[CGW_MOD_OR]) {
+ nla_memcpy(&mb, tb[CGW_MOD_OR], CGW_MODATTR_LEN);
+
+ canframecpy(&mod->modframe.or, &mb.cf);
+ mod->modtype.or = mb.modtype;
+
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_or_id;
+
+ if (mb.modtype & CGW_MOD_DLC)
+ mod->modfunc[modidx++] = mod_or_ccdlc;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_or_data;
+ }
+
+ if (tb[CGW_MOD_XOR]) {
+ nla_memcpy(&mb, tb[CGW_MOD_XOR], CGW_MODATTR_LEN);
+
+ canframecpy(&mod->modframe.xor, &mb.cf);
+ mod->modtype.xor = mb.modtype;
- if (tb[CGW_MOD_SET]) {
- nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN);
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_xor_id;
+
+ if (mb.modtype & CGW_MOD_DLC)
+ mod->modfunc[modidx++] = mod_xor_ccdlc;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_xor_data;
+ }
- canframecpy(&mod->modframe.set, &mb.cf);
- mod->modtype.set = mb.modtype;
+ if (tb[CGW_MOD_SET]) {
+ nla_memcpy(&mb, tb[CGW_MOD_SET], CGW_MODATTR_LEN);
- if (mb.modtype & CGW_MOD_ID)
- mod->modfunc[modidx++] = mod_set_id;
+ canframecpy(&mod->modframe.set, &mb.cf);
+ mod->modtype.set = mb.modtype;
- if (mb.modtype & CGW_MOD_DLC)
- mod->modfunc[modidx++] = mod_set_dlc;
+ if (mb.modtype & CGW_MOD_ID)
+ mod->modfunc[modidx++] = mod_set_id;
- if (mb.modtype & CGW_MOD_DATA)
- mod->modfunc[modidx++] = mod_set_data;
+ if (mb.modtype & CGW_MOD_DLC)
+ mod->modfunc[modidx++] = mod_set_ccdlc;
+
+ if (mb.modtype & CGW_MOD_DATA)
+ mod->modfunc[modidx++] = mod_set_data;
+ }
}
/* check for checksum operations after CAN frame modifications */
if (modidx) {
-
if (tb[CGW_CS_CRC8]) {
struct cgw_csum_crc8 *c = nla_data(tb[CGW_CS_CRC8]);
err = cgw_chk_csum_parms(c->from_idx, c->to_idx,
- c->result_idx);
+ c->result_idx, r);
if (err)
return err;
nla_memcpy(&mod->csum.crc8, tb[CGW_CS_CRC8],
CGW_CS_CRC8_LEN);
- /*
- * select dedicated processing function to reduce
+ /* select dedicated processing function to reduce
* runtime operations in receive hot path.
*/
if (c->from_idx < 0 || c->to_idx < 0 ||
@@ -747,15 +1011,14 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
struct cgw_csum_xor *c = nla_data(tb[CGW_CS_XOR]);
err = cgw_chk_csum_parms(c->from_idx, c->to_idx,
- c->result_idx);
+ c->result_idx, r);
if (err)
return err;
nla_memcpy(&mod->csum.xor, tb[CGW_CS_XOR],
CGW_CS_XOR_LEN);
- /*
- * select dedicated processing function to reduce
+ /* select dedicated processing function to reduce
* runtime operations in receive hot path.
*/
if (c->from_idx < 0 || c->to_idx < 0 ||
@@ -767,16 +1030,14 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod,
mod->csumfunc.xor = cgw_csum_xor_neg;
}
- if (tb[CGW_MOD_UID]) {
+ if (tb[CGW_MOD_UID])
nla_memcpy(&mod->uid, tb[CGW_MOD_UID], sizeof(u32));
- }
}
if (gwtype == CGW_TYPE_CAN_CAN) {
-
/* check CGW_TYPE_CAN_CAN specific attributes */
-
struct can_can_gw *ccgw = (struct can_can_gw *)gwtypeattr;
+
memset(ccgw, 0, sizeof(*ccgw));
/* check for can_filter in attributes */
@@ -813,7 +1074,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *net = sock_net(skb->sk);
struct rtcanmsg *r;
struct cgw_job *gwj;
- struct cf_mod mod;
+ struct cf_mod *mod;
struct can_can_gw ccgw;
u8 limhops = 0;
int err = 0;
@@ -832,39 +1093,48 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh,
if (r->gwtype != CGW_TYPE_CAN_CAN)
return -EINVAL;
- err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops);
- if (err < 0)
- return err;
+ mod = kmalloc(sizeof(*mod), GFP_KERNEL);
+ if (!mod)
+ return -ENOMEM;
- if (mod.uid) {
+ err = cgw_parse_attr(nlh, mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops);
+ if (err < 0)
+ goto out_free_cf;
+ if (mod->uid) {
ASSERT_RTNL();
/* check for updating an existing job with identical uid */
hlist_for_each_entry(gwj, &net->can.cgw_list, list) {
+ struct cf_mod *old_cf;
- if (gwj->mod.uid != mod.uid)
+ old_cf = cgw_job_cf_mod(gwj);
+ if (old_cf->uid != mod->uid)
continue;
/* interfaces & filters must be identical */
- if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw)))
- return -EINVAL;
+ if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) {
+ err = -EINVAL;
+ goto out_free_cf;
+ }
- /* update modifications with disabled softirq & quit */
- local_bh_disable();
- memcpy(&gwj->mod, &mod, sizeof(mod));
- local_bh_enable();
+ rcu_assign_pointer(gwj->cf_mod, mod);
+ kfree_rcu_mightsleep(old_cf);
return 0;
}
}
/* ifindex == 0 is not allowed for job creation */
- if (!ccgw.src_idx || !ccgw.dst_idx)
- return -ENODEV;
+ if (!ccgw.src_idx || !ccgw.dst_idx) {
+ err = -ENODEV;
+ goto out_free_cf;
+ }
gwj = kmem_cache_alloc(cgw_cache, GFP_KERNEL);
- if (!gwj)
- return -ENOMEM;
+ if (!gwj) {
+ err = -ENOMEM;
+ goto out_free_cf;
+ }
gwj->handled_frames = 0;
gwj->dropped_frames = 0;
@@ -874,7 +1144,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh,
gwj->limit_hops = limhops;
/* insert already parsed information */
- memcpy(&gwj->mod, &mod, sizeof(mod));
+ RCU_INIT_POINTER(gwj->cf_mod, mod);
memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw));
err = -ENODEV;
@@ -895,15 +1165,24 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh,
if (gwj->dst.dev->type != ARPHRD_CAN)
goto out;
+ /* is sending the skb back to the incoming interface intended? */
+ if (gwj->src.dev == gwj->dst.dev &&
+ !(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK)) {
+ err = -EINVAL;
+ goto out;
+ }
+
ASSERT_RTNL();
err = cgw_register_filter(net, gwj);
if (!err)
hlist_add_head_rcu(&gwj->list, &net->can.cgw_list);
out:
- if (err)
+ if (err) {
kmem_cache_free(cgw_cache, gwj);
-
+out_free_cf:
+ kfree(mod);
+ }
return err;
}
@@ -917,7 +1196,7 @@ static void cgw_remove_all_jobs(struct net *net)
hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) {
hlist_del(&gwj->list);
cgw_unregister_filter(net, gwj);
- kmem_cache_free(cgw_cache, gwj);
+ call_rcu(&gwj->rcu, cgw_job_free_rcu);
}
}
@@ -963,6 +1242,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh,
/* remove only the first matching entry */
hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) {
+ struct cf_mod *cf_mod;
if (gwj->flags != r->flags)
continue;
@@ -970,13 +1250,14 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh,
if (gwj->limit_hops != limhops)
continue;
+ cf_mod = cgw_job_cf_mod(gwj);
/* we have a match when uid is enabled and identical */
- if (gwj->mod.uid || mod.uid) {
- if (gwj->mod.uid != mod.uid)
+ if (cf_mod->uid || mod.uid) {
+ if (cf_mod->uid != mod.uid)
continue;
} else {
/* no uid => check for identical modifications */
- if (memcmp(&gwj->mod, &mod, sizeof(mod)))
+ if (memcmp(cf_mod, &mod, sizeof(mod)))
continue;
}
@@ -986,7 +1267,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh,
hlist_del(&gwj->list);
cgw_unregister_filter(net, gwj);
- kmem_cache_free(cgw_cache, gwj);
+ call_rcu(&gwj->rcu, cgw_job_free_rcu);
err = 0;
break;
}
@@ -1000,16 +1281,28 @@ static int __net_init cangw_pernet_init(struct net *net)
return 0;
}
-static void __net_exit cangw_pernet_exit(struct net *net)
+static void __net_exit cangw_pernet_exit_batch(struct list_head *net_list)
{
+ struct net *net;
+
rtnl_lock();
- cgw_remove_all_jobs(net);
+ list_for_each_entry(net, net_list, exit_list)
+ cgw_remove_all_jobs(net);
rtnl_unlock();
}
static struct pernet_operations cangw_pernet_ops = {
.init = cangw_pernet_init,
- .exit = cangw_pernet_exit,
+ .exit_batch = cangw_pernet_exit_batch,
+};
+
+static const struct rtnl_msg_handler cgw_rtnl_msg_handlers[] __initconst_or_module = {
+ {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_NEWROUTE,
+ .doit = cgw_create_job},
+ {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_DELROUTE,
+ .doit = cgw_remove_job},
+ {.owner = THIS_MODULE, .protocol = PF_CAN, .msgtype = RTM_GETROUTE,
+ .dumpit = cgw_dump_jobs},
};
static __init int cgw_module_init(void)
@@ -1019,35 +1312,38 @@ static __init int cgw_module_init(void)
/* sanitize given module parameter */
max_hops = clamp_t(unsigned int, max_hops, CGW_MIN_HOPS, CGW_MAX_HOPS);
- pr_info("can: netlink gateway (rev " CAN_GW_VERSION ") max_hops=%d\n",
- max_hops);
+ pr_info("can: netlink gateway - max_hops=%d\n", max_hops);
+
+ ret = register_pernet_subsys(&cangw_pernet_ops);
+ if (ret)
+ return ret;
- register_pernet_subsys(&cangw_pernet_ops);
+ ret = -ENOMEM;
cgw_cache = kmem_cache_create("can_gw", sizeof(struct cgw_job),
0, 0, NULL);
-
if (!cgw_cache)
- return -ENOMEM;
+ goto out_cache_create;
/* set notifier */
notifier.notifier_call = cgw_notifier;
- register_netdevice_notifier(&notifier);
-
- ret = rtnl_register_module(THIS_MODULE, PF_CAN, RTM_GETROUTE,
- NULL, cgw_dump_jobs, 0);
- if (ret) {
- unregister_netdevice_notifier(&notifier);
- kmem_cache_destroy(cgw_cache);
- return -ENOBUFS;
- }
+ ret = register_netdevice_notifier(&notifier);
+ if (ret)
+ goto out_register_notifier;
- /* Only the first call to rtnl_register_module can fail */
- rtnl_register_module(THIS_MODULE, PF_CAN, RTM_NEWROUTE,
- cgw_create_job, NULL, 0);
- rtnl_register_module(THIS_MODULE, PF_CAN, RTM_DELROUTE,
- cgw_remove_job, NULL, 0);
+ ret = rtnl_register_many(cgw_rtnl_msg_handlers);
+ if (ret)
+ goto out_rtnl_register;
return 0;
+
+out_rtnl_register:
+ unregister_netdevice_notifier(&notifier);
+out_register_notifier:
+ kmem_cache_destroy(cgw_cache);
+out_cache_create:
+ unregister_pernet_subsys(&cangw_pernet_ops);
+
+ return ret;
}
static __exit void cgw_module_exit(void)
diff --git a/net/can/isotp.c b/net/can/isotp.c
new file mode 100644
index 000000000000..ce588b85665a
--- /dev/null
+++ b/net/can/isotp.c
@@ -0,0 +1,1739 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/* isotp.c - ISO 15765-2 CAN transport protocol for protocol family CAN
+ *
+ * This implementation does not provide ISO-TP specific return values to the
+ * userspace.
+ *
+ * - RX path timeout of data reception leads to -ETIMEDOUT
+ * - RX path SN mismatch leads to -EILSEQ
+ * - RX path data reception with wrong padding leads to -EBADMSG
+ * - TX path flowcontrol reception timeout leads to -ECOMM
+ * - TX path flowcontrol reception overflow leads to -EMSGSIZE
+ * - TX path flowcontrol reception with wrong layout/padding leads to -EBADMSG
+ * - when a transfer (tx) is in progress the next write() blocks until it's done
+ * - use CAN_ISOTP_WAIT_TX_DONE flag to block the caller until the PDU is sent
+ * - as we have static buffers the check whether the PDU fits into the buffer
+ * is done at FF reception time (no support for sending 'wait frames')
+ *
+ * Copyright (c) 2020 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of Volkswagen nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * Alternatively, provided that this notice is retained in full, this
+ * software may be distributed under the terms of the GNU General
+ * Public License ("GPL") version 2, in which case the provisions of the
+ * GPL apply INSTEAD OF those given above.
+ *
+ * The provided data structures and external interfaces from this code
+ * are not restricted to be used by modules with a GPL compatible license.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/hrtimer.h>
+#include <linux/wait.h>
+#include <linux/uio.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/socket.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/can.h>
+#include <linux/can/core.h>
+#include <linux/can/skb.h>
+#include <linux/can/isotp.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <net/net_namespace.h>
+
+MODULE_DESCRIPTION("PF_CAN ISO 15765-2 transport protocol");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Oliver Hartkopp <socketcan@hartkopp.net>");
+MODULE_ALIAS("can-proto-6");
+
+/* minimum address length, as computed by CAN_REQUIRED_SIZE for the
+ * can_addr.tp member of struct sockaddr_can
+ */
+#define ISOTP_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.tp)
+
+/* Mask matching the given CAN ID: the EFF ID mask when CAN_EFF_FLAG is set
+ * in the ID, the SFF ID mask otherwise. The EFF and RTR flag bits are always
+ * part of the mask.
+ */
+#define SINGLE_MASK(id) (((id) & CAN_EFF_FLAG) ? \
+ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \
+ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG))
+
+/* Since ISO 15765-2:2016 the CAN isotp protocol supports more than 4095
+ * bytes per ISO PDU as the FF_DL can take full 32 bit values (4 Gbyte).
+ * We would need some good concept to handle this between user space and
+ * kernel space. For now set the static buffer to something about 8 kbyte
+ * to be able to test this new functionality.
+ */
+#define DEFAULT_MAX_PDU_SIZE 8300
+
+/* maximum PDU size before ISO 15765-2:2016 extension was 4095 */
+#define MAX_12BIT_PDU_SIZE 4095
+
+/* limit the isotp pdu size from the optional module parameter to about
+ * 1 MByte (the actual value is 1025 * 1024 bytes)
+ */
+#define MAX_PDU_SIZE (1025 * 1024U)
+
+/* module parameter (permissions 0444), defaults to DEFAULT_MAX_PDU_SIZE */
+static unsigned int max_pdu_size __read_mostly = DEFAULT_MAX_PDU_SIZE;
+module_param(max_pdu_size, uint, 0444);
+MODULE_PARM_DESC(max_pdu_size, "maximum isotp pdu size (default "
+ __stringify(DEFAULT_MAX_PDU_SIZE) ")");
+
+/* N_PCI type values in bits 7-4 of N_PCI bytes */
+#define N_PCI_SF 0x00 /* single frame */
+#define N_PCI_FF 0x10 /* first frame */
+#define N_PCI_CF 0x20 /* consecutive frame */
+#define N_PCI_FC 0x30 /* flow control */
+
+#define N_PCI_SZ 1 /* size of the PCI byte #1 */
+#define SF_PCI_SZ4 1 /* size of SingleFrame PCI including 4 bit SF_DL */
+#define SF_PCI_SZ8 2 /* size of SingleFrame PCI including 8 bit SF_DL */
+#define FF_PCI_SZ12 2 /* size of FirstFrame PCI including 12 bit FF_DL */
+#define FF_PCI_SZ32 6 /* size of FirstFrame PCI including 32 bit FF_DL */
+#define FC_CONTENT_SZ 3 /* flow control content size in byte (FS/BS/STmin) */
+
+/* option flag groups: any padding check enabled / any broadcast mode enabled */
+#define ISOTP_CHECK_PADDING (CAN_ISOTP_CHK_PAD_LEN | CAN_ISOTP_CHK_PAD_DATA)
+#define ISOTP_ALL_BC_FLAGS (CAN_ISOTP_SF_BROADCAST | CAN_ISOTP_CF_BROADCAST)
+
+/* Flow Status given in FC frame */
+#define ISOTP_FC_CTS 0 /* clear to send */
+#define ISOTP_FC_WT 1 /* wait */
+#define ISOTP_FC_OVFLW 2 /* overflow */
+
+/* timeouts in seconds, passed as the seconds argument of ktime_set() */
+#define ISOTP_FC_TIMEOUT 1 /* 1 sec */
+#define ISOTP_ECHO_TIMEOUT 2 /* 2 secs */
+
+/* transfer states stored in tpcon.state (so->rx.state / so->tx.state) */
+enum {
+ ISOTP_IDLE = 0, /* no transfer in progress */
+ ISOTP_WAIT_FIRST_FC, /* tx: waiting for the first FC frame */
+ ISOTP_WAIT_FC, /* tx: waiting for a further FC frame */
+ ISOTP_WAIT_DATA, /* rx: waiting for consecutive frames */
+ ISOTP_SENDING, /* tx: sending consecutive frames after FC CTS */
+ ISOTP_SHUTDOWN, /* NOTE(review): not used in this part of the file */
+};
+
+/* per direction (rx/tx) transfer state and PDU buffer */
+struct tpcon {
+ u8 *buf; /* PDU buffer in use (presumably sbuf unless replaced) */
+ unsigned int buflen; /* size of the buffer buf points to */
+ unsigned int len; /* length of the PDU being transferred */
+ unsigned int idx; /* next byte position in buf */
+ u32 state; /* one of the ISOTP_* states */
+ u8 bs; /* frames counted in the current block */
+ u8 sn; /* sequence number of the next consecutive frame */
+ u8 ll_dl; /* link layer data length of this transfer */
+ u8 sbuf[DEFAULT_MAX_PDU_SIZE]; /* static default buffer */
+};
+
+/* ISO-TP socket; isotp_sk() casts a struct sock pointer to this type, so
+ * sk has to remain the first member.
+ */
+struct isotp_sock {
+ struct sock sk;
+ int bound;
+ int ifindex; /* interface used to send flow control frames */
+ canid_t txid; /* CAN ID used for frames sent by this socket */
+ canid_t rxid;
+ ktime_t tx_gap; /* gap between sent CFs, derived from FC STmin */
+ ktime_t lastrxcf_tstamp; /* rx timestamp of the last accepted CF */
+ struct hrtimer rxtimer, txtimer, txfrtimer;
+ struct can_isotp_options opt;
+ struct can_isotp_fc_options rxfc, txfc; /* FC params we send / we got */
+ struct can_isotp_ll_options ll;
+ u32 frame_txtime; /* added to tx_gap as nanoseconds */
+ u32 force_tx_stmin; /* added to tx_gap as nanoseconds when forced */
+ u32 force_rx_stmin; /* minimum CF rx gap in nanoseconds when forced */
+ u32 cfecho; /* consecutive frame echo tag */
+ struct tpcon rx, tx;
+ struct list_head notifier;
+ wait_queue_head_t wait; /* woken up when a tx job is stopped */
+ spinlock_t rx_lock; /* protect single thread state machine */
+};
+
+/* NOTE(review): the users of these three objects are outside this part of
+ * the file; the names suggest a list of sockets for notifier processing,
+ * its lock and the socket currently being handled - confirm at the users.
+ */
+static LIST_HEAD(isotp_notifier_list);
+static DEFINE_SPINLOCK(isotp_notifier_lock);
+static struct isotp_sock *isotp_busy_notifier;
+
+/* Convert a generic socket pointer into the ISO-TP socket it belongs to.
+ * This is a plain cast, so the pointer must refer to the sk member of a
+ * struct isotp_sock.
+ */
+static inline struct isotp_sock *isotp_sk(const struct sock *sk)
+{
+	struct isotp_sock *so = (struct isotp_sock *)sk;
+
+	return so;
+}
+
+/* Return the broadcast mode bits (ISOTP_ALL_BC_FLAGS) set in the socket
+ * options, or zero when no broadcast mode is enabled.
+ */
+static u32 isotp_bc_flags(struct isotp_sock *so)
+{
+	u32 bc_flags = so->opt.flags & ISOTP_ALL_BC_FLAGS;
+
+	return bc_flags;
+}
+
+/* Tell whether the rx_id has to be registered for FC frame reception. */
+static bool isotp_register_rxid(struct isotp_sock *so)
+{
+	/* any broadcast mode => no rx_id registration */
+	if (isotp_bc_flags(so))
+		return false;
+
+	/* no broadcast modes => register rx_id for FC frame reception */
+	return true;
+}
+
+/* rx watchdog expiry: abort a reception that is still waiting for data.
+ * The timer is never restarted from here.
+ */
+static enum hrtimer_restart isotp_rx_timer_handler(struct hrtimer *hrtimer)
+{
+	struct isotp_sock *so = container_of(hrtimer, struct isotp_sock,
+					     rxtimer);
+	struct sock *sk = &so->sk;
+
+	/* nothing to abort unless we are waiting for data frames */
+	if (so->rx.state != ISOTP_WAIT_DATA)
+		return HRTIMER_NORESTART;
+
+	/* we did not get new data frames in time:
+	 * report 'connection timed out'
+	 */
+	sk->sk_err = ETIMEDOUT;
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk_error_report(sk);
+
+	/* reset rx state */
+	so->rx.state = ISOTP_IDLE;
+
+	return HRTIMER_NORESTART;
+}
+
+/* Build and send a flow control (FC) frame with the given flow status.
+ *
+ * @sk: socket the FC frame is sent for
+ * @ae: 1 when extended addressing occupies data[0], 0 otherwise
+ * @flowstatus: flow status placed into the low nibble of the PCI byte
+ *
+ * Returns 1 when the skb allocation or the device lookup failed and 0
+ * otherwise. A failing can_send() is only logged and still returns 0.
+ * In the 0 case the block size counter and the last CF timestamp are reset
+ * and the rx watchdog is started.
+ */
+static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus)
+{
+ struct net_device *dev;
+ struct sk_buff *nskb;
+ struct canfd_frame *ncf;
+ struct isotp_sock *so = isotp_sk(sk);
+ int can_send_ret;
+
+ nskb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), gfp_any());
+ if (!nskb)
+ return 1;
+
+ dev = dev_get_by_index(sock_net(sk), so->ifindex);
+ if (!dev) {
+ kfree_skb(nskb);
+ return 1;
+ }
+
+ can_skb_reserve(nskb);
+ can_skb_prv(nskb)->ifindex = dev->ifindex;
+ can_skb_prv(nskb)->skbcnt = 0;
+
+ nskb->dev = dev;
+ can_skb_set_owner(nskb, sk);
+ ncf = (struct canfd_frame *)nskb->data;
+ skb_put_zero(nskb, so->ll.mtu);
+
+ /* create & send flow control reply */
+ ncf->can_id = so->txid;
+
+ if (so->opt.flags & CAN_ISOTP_TX_PADDING) {
+ memset(ncf->data, so->opt.txpad_content, CAN_MAX_DLEN);
+ ncf->len = CAN_MAX_DLEN;
+ } else {
+ ncf->len = ae + FC_CONTENT_SZ;
+ }
+
+ /* the FC content overwrites the start of any padding written above */
+ ncf->data[ae] = N_PCI_FC | flowstatus;
+ ncf->data[ae + 1] = so->rxfc.bs;
+ ncf->data[ae + 2] = so->rxfc.stmin;
+
+ if (ae)
+ ncf->data[0] = so->opt.ext_address;
+
+ ncf->flags = so->ll.tx_flags;
+
+ /* a send error is reported once in the log but not to the caller */
+ can_send_ret = can_send(nskb, 1);
+ if (can_send_ret)
+ pr_notice_once("can-isotp: %s: can_send_ret %pe\n",
+ __func__, ERR_PTR(can_send_ret));
+
+ dev_put(dev);
+
+ /* reset blocksize counter */
+ so->rx.bs = 0;
+
+ /* reset last CF frame rx timestamp for rx stmin enforcement */
+ so->lastrxcf_tstamp = ktime_set(0, 0);
+
+ /* start rx timeout watchdog */
+ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0),
+ HRTIMER_MODE_REL_SOFT);
+ return 0;
+}
+
+/* Queue a completed PDU to the socket receive queue. The skb control
+ * buffer is filled with a sockaddr_can that carries the index of the
+ * receiving interface. The skb is dropped when it cannot be queued.
+ */
+static void isotp_rcv_skb(struct sk_buff *skb, struct sock *sk)
+{
+	enum skb_drop_reason reason;
+	struct sockaddr_can *addr;
+
+	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct sockaddr_can));
+
+	addr = (struct sockaddr_can *)skb->cb;
+	memset(addr, 0, sizeof(*addr));
+	addr->can_ifindex = skb->dev->ifindex;
+	addr->can_family = AF_CAN;
+
+	if (sock_queue_rcv_skb_reason(sk, skb, &reason) >= 0)
+		return;
+
+	sk_skb_reason_drop(sk, skb, reason);
+}
+
+/* Round a data length up to the next frame data length out of
+ * 8, 12, 16, 20, 24, 32, 48 and 64. Lengths up to 8 always result in 8.
+ */
+static u8 padlen(u8 datalen)
+{
+	/* 0 - 8 */
+	if (datalen <= 8)
+		return 8;
+
+	/* 9 - 24: next multiple of four (12, 16, 20, 24) */
+	if (datalen <= 24)
+		return (datalen + 3) & ~3;
+
+	/* 25 - 32 */
+	if (datalen <= 32)
+		return 32;
+
+	/* 33 - 48 */
+	if (datalen <= 48)
+		return 48;
+
+	/* 49 and above */
+	return 64;
+}
+
+/* Check that the frame length is the shortest one able to carry the data
+ * that ends right before start_index. Returns 1/true when the check fails.
+ */
+static int check_optimized(struct canfd_frame *cf, int start_index)
+{
+	int optimal_len;
+
+	if (cf->len <= CAN_MAX_DLEN) {
+		/* for CAN_DL <= 8 the start_index is equal to the CAN_DL as
+		 * the padding would start at this point. E.g. if the padding
+		 * would start at cf.data[7] cf->len has to be 7 to be optimal.
+		 * Note: The data[] index starts with zero.
+		 */
+		optimal_len = start_index;
+	} else {
+		/* This relation is also valid in the non-linear DLC range,
+		 * where we need to take care of the minimal next possible
+		 * CAN_DL. The correct check would be
+		 * (padlen(cf->len) != padlen(start_index)). But as cf->len can
+		 * only take discrete values from 12, .., 64 at this point the
+		 * padlen(cf->len) is always equal to cf->len.
+		 */
+		optimal_len = padlen(start_index);
+	}
+
+	return (cf->len != optimal_len);
+}
+
+/* Check the padding of a received frame according to the socket options.
+ * start_index is the position of the first padding byte and content the
+ * expected padding byte value. Returns 1/true when the check fails.
+ */
+static int check_pad(struct isotp_sock *so, struct canfd_frame *cf,
+		     int start_index, u8 content)
+{
+	int pos;
+
+	if (so->opt.flags & CAN_ISOTP_RX_PADDING) {
+		/* check datalength of correctly padded CAN frame */
+		if ((so->opt.flags & CAN_ISOTP_CHK_PAD_LEN) &&
+		    cf->len != padlen(cf->len))
+			return 1;
+
+		/* padding content is not to be checked => frame is fine */
+		if (!(so->opt.flags & CAN_ISOTP_CHK_PAD_DATA))
+			return 0;
+
+		/* check padding content */
+		for (pos = start_index; pos < cf->len; pos++) {
+			if (cf->data[pos] != content)
+				return 1;
+		}
+
+		return 0;
+	}
+
+	/* no RX_PADDING value => check length of optimized frame length */
+	if (so->opt.flags & CAN_ISOTP_CHK_PAD_LEN)
+		return check_optimized(cf, start_index);
+
+	/* no valid test against empty value => ignore frame */
+	return 1;
+}
+
+static void isotp_send_cframe(struct isotp_sock *so);
+
+/* Process a received flow control (FC) frame for the running tx job.
+ *
+ * @so: ISO-TP socket
+ * @cf: received CAN (FD) frame
+ * @ae: 1 when extended addressing occupies data[0], 0 otherwise
+ *
+ * The frame is ignored (return 0) unless the tx state is ISOTP_WAIT_FC or
+ * ISOTP_WAIT_FIRST_FC. Returns 1 for a malformed frame (too short or failed
+ * padding check) after reporting EBADMSG and stopping the tx job, and 0 in
+ * all other cases.
+ */
+static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae)
+{
+ struct sock *sk = &so->sk;
+
+ if (so->tx.state != ISOTP_WAIT_FC &&
+ so->tx.state != ISOTP_WAIT_FIRST_FC)
+ return 0;
+
+ /* an expected FC frame arrived: stop the pending tx timer */
+ hrtimer_cancel(&so->txtimer);
+
+ if ((cf->len < ae + FC_CONTENT_SZ) ||
+ ((so->opt.flags & ISOTP_CHECK_PADDING) &&
+ check_pad(so, cf, ae + FC_CONTENT_SZ, so->opt.rxpad_content))) {
+ /* malformed PDU - report 'not a data message' */
+ sk->sk_err = EBADMSG;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+
+ so->tx.state = ISOTP_IDLE;
+ wake_up_interruptible(&so->wait);
+ return 1;
+ }
+
+ /* get static/dynamic communication params from first/every FC frame */
+ if (so->tx.state == ISOTP_WAIT_FIRST_FC ||
+ so->opt.flags & CAN_ISOTP_DYN_FC_PARMS) {
+ so->txfc.bs = cf->data[ae + 1];
+ so->txfc.stmin = cf->data[ae + 2];
+
+ /* fix wrong STmin values according to spec */
+ if (so->txfc.stmin > 0x7F &&
+ (so->txfc.stmin < 0xF1 || so->txfc.stmin > 0xF9))
+ so->txfc.stmin = 0x7F;
+
+ so->tx_gap = ktime_set(0, 0);
+ /* add transmission time for CAN frame N_As */
+ so->tx_gap = ktime_add_ns(so->tx_gap, so->frame_txtime);
+ /* add waiting time for consecutive frames N_Cs:
+ * STmin 0x00 - 0x7F counts in steps of 1000000 ns (1 ms),
+ * STmin 0xF1 - 0xF9 counts in steps of 100000 ns (100 us)
+ */
+ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN)
+ so->tx_gap = ktime_add_ns(so->tx_gap,
+ so->force_tx_stmin);
+ else if (so->txfc.stmin < 0x80)
+ so->tx_gap = ktime_add_ns(so->tx_gap,
+ so->txfc.stmin * 1000000);
+ else
+ so->tx_gap = ktime_add_ns(so->tx_gap,
+ (so->txfc.stmin - 0xF0)
+ * 100000);
+ so->tx.state = ISOTP_WAIT_FC;
+ }
+
+ /* the flow status is located in the low nibble of the PCI byte */
+ switch (cf->data[ae] & 0x0F) {
+ case ISOTP_FC_CTS:
+ so->tx.bs = 0;
+ so->tx.state = ISOTP_SENDING;
+ /* send CF frame and enable echo timeout handling */
+ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0),
+ HRTIMER_MODE_REL_SOFT);
+ isotp_send_cframe(so);
+ break;
+
+ case ISOTP_FC_WT:
+ /* start timer to wait for next FC frame */
+ hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0),
+ HRTIMER_MODE_REL_SOFT);
+ break;
+
+ case ISOTP_FC_OVFLW:
+ /* overflow on receiver side - report 'message too long' */
+ sk->sk_err = EMSGSIZE;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ fallthrough;
+
+ default:
+ /* stop this tx job */
+ so->tx.state = ISOTP_IDLE;
+ wake_up_interruptible(&so->wait);
+ }
+ return 0;
+}
+
+/* Process a received single frame (SF) and queue its payload to the socket.
+ *
+ * @sk: receiving socket
+ * @cf: received CAN (FD) frame
+ * @pcilen: number of bytes in front of the payload in cf->data
+ * @skb: received skb, source of timestamp and device for the new skb
+ * @len: payload length announced by the frame
+ *
+ * A running reception is always terminated first (rx timer cancelled,
+ * rx state set to idle). Returns 0 when the payload was handed over to
+ * isotp_rcv_skb() and 1 when the frame was rejected or no skb could be
+ * allocated.
+ */
+static int isotp_rcv_sf(struct sock *sk, struct canfd_frame *cf, int pcilen,
+ struct sk_buff *skb, int len)
+{
+ struct isotp_sock *so = isotp_sk(sk);
+ struct sk_buff *nskb;
+
+ hrtimer_cancel(&so->rxtimer);
+ so->rx.state = ISOTP_IDLE;
+
+ /* reject empty payload and payload that exceeds the frame content */
+ if (!len || len > cf->len - pcilen)
+ return 1;
+
+ if ((so->opt.flags & ISOTP_CHECK_PADDING) &&
+ check_pad(so, cf, pcilen + len, so->opt.rxpad_content)) {
+ /* malformed PDU - report 'not a data message' */
+ sk->sk_err = EBADMSG;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ return 1;
+ }
+
+ nskb = alloc_skb(len, gfp_any());
+ if (!nskb)
+ return 1;
+
+ memcpy(skb_put(nskb, len), &cf->data[pcilen], len);
+
+ nskb->tstamp = skb->tstamp;
+ nskb->dev = skb->dev;
+ isotp_rcv_skb(nskb, sk);
+ return 0;
+}
+
+/* Process a received first frame (FF) and set up a segmented reception.
+ *
+ * @sk: receiving socket
+ * @cf: received CAN (FD) frame
+ * @ae: 1 when extended addressing occupies data[0], 0 otherwise
+ *
+ * A running reception is always terminated first (rx timer cancelled,
+ * rx state set to idle). Returns 1 when the frame is rejected: frame length
+ * is not a valid LL_DL, the announced PDU would have fit into a single
+ * frame, or the PDU does not fit into the rx buffer (an FC overflow frame is
+ * sent in this case). Returns 0 when the reception was started; an FC CTS
+ * frame is sent then unless the socket is in listen mode.
+ */
+static int isotp_rcv_ff(struct sock *sk, struct canfd_frame *cf, int ae)
+{
+	struct isotp_sock *so = isotp_sk(sk);
+	int i;
+	int off;
+	int ff_pci_sz;
+
+	hrtimer_cancel(&so->rxtimer);
+	so->rx.state = ISOTP_IDLE;
+
+	/* get the used sender LL_DL from the (first) CAN frame data length */
+	so->rx.ll_dl = padlen(cf->len);
+
+	/* the first frame has to use the entire frame up to LL_DL length */
+	if (cf->len != so->rx.ll_dl)
+		return 1;
+
+	/* get the FF_DL */
+	so->rx.len = (cf->data[ae] & 0x0F) << 8;
+	so->rx.len += cf->data[ae + 1];
+
+	/* Check for FF_DL escape sequence supporting 32 bit PDU length */
+	if (so->rx.len) {
+		ff_pci_sz = FF_PCI_SZ12;
+	} else {
+		/* FF_DL = 0 => get real length from next 4 bytes.
+		 * The bytes are widened to u32 before shifting: shifting the
+		 * int-promoted u8 would move a set bit 7 into the sign bit.
+		 */
+		so->rx.len = (u32)cf->data[ae + 2] << 24;
+		so->rx.len += (u32)cf->data[ae + 3] << 16;
+		so->rx.len += (u32)cf->data[ae + 4] << 8;
+		so->rx.len += cf->data[ae + 5];
+		ff_pci_sz = FF_PCI_SZ32;
+	}
+
+	/* take care of a potential SF_DL ESC offset for TX_DL > 8 */
+	off = (so->rx.ll_dl > CAN_MAX_DLEN) ? 1 : 0;
+
+	/* a PDU this short would not have needed a first frame */
+	if (so->rx.len + ae + off + ff_pci_sz < so->rx.ll_dl)
+		return 1;
+
+	/* PDU size > default => try max_pdu_size */
+	if (so->rx.len > so->rx.buflen && so->rx.buflen < max_pdu_size) {
+		u8 *newbuf = kmalloc(max_pdu_size, GFP_ATOMIC);
+
+		/* on allocation failure the current buffer stays in use */
+		if (newbuf) {
+			so->rx.buf = newbuf;
+			so->rx.buflen = max_pdu_size;
+		}
+	}
+
+	if (so->rx.len > so->rx.buflen) {
+		/* send FC frame with overflow status */
+		isotp_send_fc(sk, ae, ISOTP_FC_OVFLW);
+		return 1;
+	}
+
+	/* copy the first received data bytes */
+	so->rx.idx = 0;
+	for (i = ae + ff_pci_sz; i < so->rx.ll_dl; i++)
+		so->rx.buf[so->rx.idx++] = cf->data[i];
+
+	/* initial setup for this pdu reception */
+	so->rx.sn = 1;
+	so->rx.state = ISOTP_WAIT_DATA;
+
+	/* no creation of flow control frames */
+	if (so->opt.flags & CAN_ISOTP_LISTEN_MODE)
+		return 0;
+
+	/* send our first FC frame */
+	isotp_send_fc(sk, ae, ISOTP_FC_CTS);
+	return 0;
+}
+
+/* Process a received consecutive frame (CF) of a segmented reception.
+ *
+ * @sk: receiving socket
+ * @cf: received CAN (FD) frame
+ * @ae: 1 when extended addressing occupies data[0], 0 otherwise
+ * @skb: received skb, source of timestamp and device for the new skb
+ *
+ * The frame is ignored (return 0) unless the rx state is ISOTP_WAIT_DATA or
+ * when it arrives earlier than the forced rx STmin. Returns 1 when the frame
+ * is rejected (bad length, wrong sequence number, failed padding check) or
+ * when no skb could be allocated for the completed PDU, and 0 otherwise.
+ */
+static int isotp_rcv_cf(struct sock *sk, struct canfd_frame *cf, int ae,
+ struct sk_buff *skb)
+{
+ struct isotp_sock *so = isotp_sk(sk);
+ struct sk_buff *nskb;
+ int i;
+
+ if (so->rx.state != ISOTP_WAIT_DATA)
+ return 0;
+
+ /* drop if timestamp gap is less than force_rx_stmin nano secs */
+ if (so->opt.flags & CAN_ISOTP_FORCE_RXSTMIN) {
+ if (ktime_to_ns(ktime_sub(skb->tstamp, so->lastrxcf_tstamp)) <
+ so->force_rx_stmin)
+ return 0;
+
+ so->lastrxcf_tstamp = skb->tstamp;
+ }
+
+ hrtimer_cancel(&so->rxtimer);
+
+ /* CFs are never longer than the FF */
+ if (cf->len > so->rx.ll_dl)
+ return 1;
+
+ /* CFs usually have the LL_DL length */
+ if (cf->len < so->rx.ll_dl) {
+ /* this is only allowed for the last CF */
+ if (so->rx.len - so->rx.idx > so->rx.ll_dl - ae - N_PCI_SZ)
+ return 1;
+ }
+
+ if ((cf->data[ae] & 0x0F) != so->rx.sn) {
+ /* wrong sn detected - report 'illegal byte sequence' */
+ sk->sk_err = EILSEQ;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+
+ /* reset rx state */
+ so->rx.state = ISOTP_IDLE;
+ return 1;
+ }
+ /* the sequence number is a 4 bit value that wraps around */
+ so->rx.sn++;
+ so->rx.sn %= 16;
+
+ /* copy the payload and stop as soon as the PDU is complete */
+ for (i = ae + N_PCI_SZ; i < cf->len; i++) {
+ so->rx.buf[so->rx.idx++] = cf->data[i];
+ if (so->rx.idx >= so->rx.len)
+ break;
+ }
+
+ if (so->rx.idx >= so->rx.len) {
+ /* we are done */
+ so->rx.state = ISOTP_IDLE;
+
+ /* i still points to the last copied byte, so the padding
+ * starts at i + 1
+ */
+ if ((so->opt.flags & ISOTP_CHECK_PADDING) &&
+ check_pad(so, cf, i + 1, so->opt.rxpad_content)) {
+ /* malformed PDU - report 'not a data message' */
+ sk->sk_err = EBADMSG;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ return 1;
+ }
+
+ nskb = alloc_skb(so->rx.len, gfp_any());
+ if (!nskb)
+ return 1;
+
+ memcpy(skb_put(nskb, so->rx.len), so->rx.buf,
+ so->rx.len);
+
+ nskb->tstamp = skb->tstamp;
+ nskb->dev = skb->dev;
+ isotp_rcv_skb(nskb, sk);
+ return 0;
+ }
+
+ /* perform blocksize handling, if enabled */
+ if (!so->rxfc.bs || ++so->rx.bs < so->rxfc.bs) {
+ /* start rx timeout watchdog */
+ hrtimer_start(&so->rxtimer, ktime_set(ISOTP_FC_TIMEOUT, 0),
+ HRTIMER_MODE_REL_SOFT);
+ return 0;
+ }
+
+ /* no creation of flow control frames */
+ if (so->opt.flags & CAN_ISOTP_LISTEN_MODE)
+ return 0;
+
+ /* we reached the specified blocksize so->rxfc.bs */
+ isotp_send_fc(sk, ae, ISOTP_FC_CTS);
+ return 0;
+}
+
+/* Receive callback for CAN frames: checks the frame against the socket
+ * configuration and dispatches it by its N_PCI type (FC, SF, FF or CF).
+ *
+ * @skb: received CAN (FD) frame
+ * @data: the struct sock this callback was registered with
+ *
+ * Frames whose skb length differs from the configured MTU, or whose first
+ * data byte does not match rx_ext_address when extended addressing is
+ * enabled, are ignored. The dispatch itself runs under so->rx_lock.
+ */
+static void isotp_rcv(struct sk_buff *skb, void *data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct isotp_sock *so = isotp_sk(sk);
+ struct canfd_frame *cf;
+ /* with extended addressing the N_PCI byte moves from data[0] to data[1] */
+ int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0;
+ u8 n_pci_type, sf_dl;
+
+ /* Strictly receive only frames with the configured MTU size
+ * => clear separation of CAN2.0 / CAN FD transport channels
+ */
+ if (skb->len != so->ll.mtu)
+ return;
+
+ cf = (struct canfd_frame *)skb->data;
+
+ /* if enabled: check reception of my configured extended address */
+ if (ae && cf->data[0] != so->opt.rx_ext_address)
+ return;
+
+ /* the frame type is encoded in the upper nibble of the N_PCI byte */
+ n_pci_type = cf->data[ae] & 0xF0;
+
+ /* Make sure the state changes and data structures stay consistent at
+ * CAN frame reception time. This locking is not needed in real world
+ * use cases but the inconsistency can be triggered with syzkaller.
+ */
+ spin_lock(&so->rx_lock);
+
+ if (so->opt.flags & CAN_ISOTP_HALF_DUPLEX) {
+ /* check rx/tx path half duplex expectations */
+ if ((so->tx.state != ISOTP_IDLE && n_pci_type != N_PCI_FC) ||
+ (so->rx.state != ISOTP_IDLE && n_pci_type == N_PCI_FC))
+ goto out_unlock;
+ }
+
+ /* the return values of the frame handlers are not evaluated here */
+ switch (n_pci_type) {
+ case N_PCI_FC:
+ /* tx path: flow control frame containing the FC parameters */
+ isotp_rcv_fc(so, cf, ae);
+ break;
+
+ case N_PCI_SF:
+ /* rx path: single frame
+ *
+ * As we do not have a rx.ll_dl configuration, we can only test
+ * if the CAN frames payload length matches the LL_DL == 8
+ * requirements - no matter if it's CAN 2.0 or CAN FD
+ */
+
+ /* get the SF_DL from the N_PCI byte */
+ sf_dl = cf->data[ae] & 0x0F;
+
+ if (cf->len <= CAN_MAX_DLEN) {
+ isotp_rcv_sf(sk, cf, SF_PCI_SZ4 + ae, skb, sf_dl);
+ } else {
+ if (can_is_canfd_skb(skb)) {
+ /* We have a CAN FD frame and CAN_DL is greater than 8:
+ * Only frames with the SF_DL == 0 ESC value are valid.
+ *
+ * If so take care of the increased SF PCI size
+ * (SF_PCI_SZ8) to point to the message content behind
+ * the extended SF PCI info and get the real SF_DL
+ * length value from the formerly first data byte.
+ */
+ if (sf_dl == 0)
+ isotp_rcv_sf(sk, cf, SF_PCI_SZ8 + ae, skb,
+ cf->data[SF_PCI_SZ4 + ae]);
+ }
+ }
+ break;
+
+ case N_PCI_FF:
+ /* rx path: first frame */
+ isotp_rcv_ff(sk, cf, ae);
+ break;
+
+ case N_PCI_CF:
+ /* rx path: consecutive frame */
+ isotp_rcv_cf(sk, cf, ae, skb);
+ break;
+ }
+
+out_unlock:
+ spin_unlock(&so->rx_lock);
+}
+
+/* Fill @cf with the next chunk of the PDU held in so->tx.buf.
+ *
+ * The first N_PCI_SZ + @ae + @off bytes are left for the PCI information
+ * written by the caller; the payload is placed behind them. A frame that
+ * is not completely filled gets padded when the user requested padding,
+ * or when its length exceeds CAN_MAX_DLEN. so->tx.idx is advanced by the
+ * number of payload bytes copied.
+ */
+static void isotp_fill_dataframe(struct canfd_frame *cf, struct isotp_sock *so,
+				 int ae, int off)
+{
+	int hdr = N_PCI_SZ + ae + off;
+	int room = so->tx.ll_dl - hdr;
+	int payload = min_t(int, so->tx.len - so->tx.idx, room);
+	bool user_pad = so->opt.flags & CAN_ISOTP_TX_PADDING;
+	int n;
+
+	cf->can_id = so->txid;
+	cf->len = payload + hdr;
+
+	/* pad first, so that the payload copied below overwrites the fill */
+	if (payload < room && (user_pad || cf->len > CAN_MAX_DLEN)) {
+		/* user content on request, default content if mandatory */
+		int fill = user_pad ? so->opt.txpad_content :
+				      CAN_ISOTP_DEFAULT_PAD_CONTENT;
+
+		cf->len = padlen(cf->len);
+		memset(cf->data, fill, cf->len);
+	}
+
+	for (n = 0; n < payload; n++)
+		cf->data[hdr + n] = so->tx.buf[so->tx.idx++];
+
+	/* the extended address occupies the first data byte */
+	if (ae)
+		cf->data[0] = so->opt.ext_address;
+}
+
+/* Build and send the next consecutive frame (CF) of the PDU in so->tx.
+ *
+ * Returns silently if the bound device cannot be found or no skb can be
+ * allocated. A can_send() failure is only logged (once); it is not
+ * propagated to the caller.
+ */
+static void isotp_send_cframe(struct isotp_sock *so)
+{
+ struct sock *sk = &so->sk;
+ struct sk_buff *skb;
+ struct net_device *dev;
+ struct canfd_frame *cf;
+ int can_send_ret;
+ int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0;
+
+ /* takes a device reference that is dropped on every exit path below */
+ dev = dev_get_by_index(sock_net(sk), so->ifindex);
+ if (!dev)
+ return;
+
+ skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), GFP_ATOMIC);
+ if (!skb) {
+ dev_put(dev);
+ return;
+ }
+
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = dev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
+
+ cf = (struct canfd_frame *)skb->data;
+ skb_put_zero(skb, so->ll.mtu);
+
+ /* create consecutive frame */
+ isotp_fill_dataframe(cf, so, ae, 0);
+
+ /* place consecutive frame N_PCI in appropriate index */
+ cf->data[ae] = N_PCI_CF | so->tx.sn++;
+ so->tx.sn %= 16;
+ /* count the CFs sent in the current block */
+ so->tx.bs++;
+
+ cf->flags = so->ll.tx_flags;
+
+ skb->dev = dev;
+ can_skb_set_owner(skb, sk);
+
+ /* cfecho should have been zero'ed by init/isotp_rcv_echo() */
+ if (so->cfecho)
+ pr_notice_once("can-isotp: cfecho is %08X != 0\n", so->cfecho);
+
+ /* set consecutive frame echo tag */
+ so->cfecho = *(u32 *)cf->data;
+
+ /* send frame with local echo enabled */
+ can_send_ret = can_send(skb, 1);
+ if (can_send_ret) {
+ pr_notice_once("can-isotp: %s: can_send_ret %pe\n",
+ __func__, ERR_PTR(can_send_ret));
+ if (can_send_ret == -ENOBUFS)
+ pr_notice_once("can-isotp: tx queue is full\n");
+ }
+ dev_put(dev);
+}
+
+/* Build the first frame (FF) of a segmented PDU in @cf.
+ *
+ * Writes the optional extended address, the FF N_PCI bytes carrying the
+ * PDU length in 12 bit or 32 bit notation, and as many data bytes from
+ * so->tx.buf as fit into tx.ll_dl. so->tx.idx is advanced accordingly and
+ * the sequence number for the first consecutive frame is set to 1.
+ */
+static void isotp_create_fframe(struct canfd_frame *cf, struct isotp_sock *so,
+				int ae)
+{
+	u8 *pci = &cf->data[ae];
+	int pos;
+
+	cf->can_id = so->txid;
+	cf->len = so->tx.ll_dl;
+	if (ae)
+		cf->data[0] = so->opt.ext_address;
+
+	if (so->tx.len <= MAX_12BIT_PDU_SIZE) {
+		/* 12 bit FF_DL: upper length bits share the N_PCI byte */
+		pci[0] = (u8)(so->tx.len >> 8) | N_PCI_FF;
+		pci[1] = (u8)so->tx.len & 0xFFU;
+		pos = ae + FF_PCI_SZ12;
+	} else {
+		/* 32 bit FF_DL: zero 12 bit length followed by 4 length bytes */
+		pci[0] = N_PCI_FF;
+		pci[1] = 0;
+		pci[2] = (u8)(so->tx.len >> 24) & 0xFFU;
+		pci[3] = (u8)(so->tx.len >> 16) & 0xFFU;
+		pci[4] = (u8)(so->tx.len >> 8) & 0xFFU;
+		pci[5] = (u8)so->tx.len & 0xFFU;
+		pos = ae + FF_PCI_SZ32;
+	}
+
+	/* fill the rest of the frame with the first data bytes */
+	while (pos < so->tx.ll_dl)
+		cf->data[pos++] = so->tx.buf[so->tx.idx++];
+
+	so->tx.sn = 1;
+}
+
+/* Receive callback for the local echo of frames sent by this socket.
+ *
+ * @skb: echoed CAN (FD) frame
+ * @data: the struct sock this callback was registered with
+ *
+ * Only an echo that belongs to this socket and whose first four data
+ * bytes match the so->cfecho tag is handled. It completes the
+ * transmission, waits for the next flow control frame, or triggers the
+ * next consecutive frame (immediately or after the tx_gap delay).
+ */
+static void isotp_rcv_echo(struct sk_buff *skb, void *data)
+{
+ struct sock *sk = (struct sock *)data;
+ struct isotp_sock *so = isotp_sk(sk);
+ struct canfd_frame *cf = (struct canfd_frame *)skb->data;
+
+ /* only handle my own local echo CF/SF skb's (no FF!) */
+ if (skb->sk != sk || so->cfecho != *(u32 *)cf->data)
+ return;
+
+ /* cancel local echo timeout */
+ hrtimer_cancel(&so->txtimer);
+
+ /* local echo skb with consecutive frame has been consumed */
+ so->cfecho = 0;
+
+ if (so->tx.idx >= so->tx.len) {
+ /* we are done */
+ so->tx.state = ISOTP_IDLE;
+ wake_up_interruptible(&so->wait);
+ return;
+ }
+
+ /* a block size is set and the current block has been sent completely */
+ if (so->txfc.bs && so->tx.bs >= so->txfc.bs) {
+ /* stop and wait for FC with timeout */
+ so->tx.state = ISOTP_WAIT_FC;
+ hrtimer_start(&so->txtimer, ktime_set(ISOTP_FC_TIMEOUT, 0),
+ HRTIMER_MODE_REL_SOFT);
+ return;
+ }
+
+ /* no gap between data frames needed => use burst mode */
+ if (!so->tx_gap) {
+ /* enable echo timeout handling */
+ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0),
+ HRTIMER_MODE_REL_SOFT);
+ isotp_send_cframe(so);
+ return;
+ }
+
+ /* start timer to send next consecutive frame with correct delay */
+ hrtimer_start(&so->txfrtimer, so->tx_gap, HRTIMER_MODE_REL_SOFT);
+}
+
+/* Timer callback for so->txtimer: the expected flow control or echo frame
+ * did not show up in time. Unless the tx path is idle or shut down, ECOMM
+ * is reported on the socket, the tx state is reset to idle and waiters on
+ * so->wait are woken up. The timer is never restarted from here.
+ */
+static enum hrtimer_restart isotp_tx_timer_handler(struct hrtimer *hrtimer)
+{
+	struct isotp_sock *so = container_of(hrtimer, struct isotp_sock,
+					     txtimer);
+
+	/* timeouts are not handled in IDLE or SHUTDOWN state */
+	if (so->tx.state != ISOTP_IDLE && so->tx.state != ISOTP_SHUTDOWN) {
+		struct sock *sk = &so->sk;
+
+		/* report 'communication error on send' */
+		sk->sk_err = ECOMM;
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk_error_report(sk);
+
+		/* give up on this PDU and release anyone waiting for it */
+		so->tx.state = ISOTP_IDLE;
+		wake_up_interruptible(&so->wait);
+	}
+
+	return HRTIMER_NORESTART;
+}
+
+/* Timer callback for so->txfrtimer: arms the echo timeout and sends the
+ * next consecutive frame, provided the socket is still sending and the
+ * echo tag of the previous frame has been cleared. The timer is never
+ * restarted from here.
+ */
+static enum hrtimer_restart isotp_txfr_timer_handler(struct hrtimer *hrtimer)
+{
+ struct isotp_sock *so = container_of(hrtimer, struct isotp_sock,
+ txfrtimer);
+
+ /* start echo timeout handling and cover below protocol error */
+ hrtimer_start(&so->txtimer, ktime_set(ISOTP_ECHO_TIMEOUT, 0),
+ HRTIMER_MODE_REL_SOFT);
+
+ /* cfecho should be consumed by isotp_rcv_echo() here */
+ if (so->tx.state == ISOTP_SENDING && !so->cfecho)
+ isotp_send_cframe(so);
+
+ return HRTIMER_NORESTART;
+}
+
+/* Send one PDU of @size bytes taken from @msg.
+ *
+ * The tx state machine is claimed with cmpxchg(); while another PDU is in
+ * flight the call waits, or fails with -EAGAIN when MSG_DONTWAIT is set.
+ * Depending on the size either a single frame or a first frame is sent;
+ * the remaining frames of a segmented PDU are not sent from this function.
+ *
+ * Returns @size on success or a negative error code:
+ * -EADDRNOTAVAIL if the socket is not bound or is shut down,
+ * -EAGAIN if a transmission is in progress and MSG_DONTWAIT is set,
+ * -EINVAL if @size is 0, exceeds the tx buffer, or does not fit into a
+ * single frame in SF_BROADCAST mode,
+ * -ENXIO if the bound device no longer exists,
+ * or the error from the wait, copy, skb allocation, can_send() or
+ * (with CAN_ISOTP_WAIT_TX_DONE) sock_error().
+ */
+static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so = isotp_sk(sk);
+ struct sk_buff *skb;
+ struct net_device *dev;
+ struct canfd_frame *cf;
+ int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0;
+ int wait_tx_done = (so->opt.flags & CAN_ISOTP_WAIT_TX_DONE) ? 1 : 0;
+ s64 hrtimer_sec = ISOTP_ECHO_TIMEOUT;
+ int off;
+ int err;
+
+ if (!so->bound || so->tx.state == ISOTP_SHUTDOWN)
+ return -EADDRNOTAVAIL;
+
+ /* atomically claim the tx path: IDLE -> SENDING */
+ while (cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SENDING) != ISOTP_IDLE) {
+ /* we do not support multiple buffers - for now */
+ if (msg->msg_flags & MSG_DONTWAIT)
+ return -EAGAIN;
+
+ if (so->tx.state == ISOTP_SHUTDOWN)
+ return -EADDRNOTAVAIL;
+
+ /* wait for complete transmission of current pdu */
+ err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
+ if (err)
+ goto err_event_drop;
+ }
+
+ /* PDU size > default => try max_pdu_size */
+ if (size > so->tx.buflen && so->tx.buflen < max_pdu_size) {
+ u8 *newbuf = kmalloc(max_pdu_size, GFP_KERNEL);
+
+ /* on allocation failure the size check below rejects the PDU */
+ if (newbuf) {
+ so->tx.buf = newbuf;
+ so->tx.buflen = max_pdu_size;
+ }
+ }
+
+ if (!size || size > so->tx.buflen) {
+ err = -EINVAL;
+ goto err_out_drop;
+ }
+
+ /* take care of a potential SF_DL ESC offset for TX_DL > 8 */
+ off = (so->tx.ll_dl > CAN_MAX_DLEN) ? 1 : 0;
+
+ /* does the given data fit into a single frame for SF_BROADCAST? */
+ if ((isotp_bc_flags(so) == CAN_ISOTP_SF_BROADCAST) &&
+ (size > so->tx.ll_dl - SF_PCI_SZ4 - ae - off)) {
+ err = -EINVAL;
+ goto err_out_drop;
+ }
+
+ err = memcpy_from_msg(so->tx.buf, msg, size);
+ if (err < 0)
+ goto err_out_drop;
+
+ /* takes a device reference that is dropped after can_send() */
+ dev = dev_get_by_index(sock_net(sk), so->ifindex);
+ if (!dev) {
+ err = -ENXIO;
+ goto err_out_drop;
+ }
+
+ skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv),
+ msg->msg_flags & MSG_DONTWAIT, &err);
+ if (!skb) {
+ dev_put(dev);
+ goto err_out_drop;
+ }
+
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = dev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
+
+ so->tx.len = size;
+ so->tx.idx = 0;
+
+ cf = (struct canfd_frame *)skb->data;
+ skb_put_zero(skb, so->ll.mtu);
+
+ /* cfecho should have been zero'ed by init / former isotp_rcv_echo() */
+ if (so->cfecho)
+ pr_notice_once("can-isotp: uninit cfecho %08X\n", so->cfecho);
+
+ /* check for single frame transmission depending on TX_DL */
+ if (size <= so->tx.ll_dl - SF_PCI_SZ4 - ae - off) {
+ /* The message size generally fits into a SingleFrame - good.
+ *
+ * SF_DL ESC offset optimization:
+ *
+ * When TX_DL is greater 8 but the message would still fit
+ * into a 8 byte CAN frame, we can omit the offset.
+ * This prevents a protocol caused length extension from
+ * CAN_DL = 8 to CAN_DL = 12 due to the SF_SL ESC handling.
+ */
+ if (size <= CAN_MAX_DLEN - SF_PCI_SZ4 - ae)
+ off = 0;
+
+ isotp_fill_dataframe(cf, so, ae, off);
+
+ /* place single frame N_PCI w/o length in appropriate index */
+ cf->data[ae] = N_PCI_SF;
+
+ /* place SF_DL size value depending on the SF_DL ESC offset */
+ if (off)
+ cf->data[SF_PCI_SZ4 + ae] = size;
+ else
+ cf->data[ae] |= size;
+
+ /* set CF echo tag for isotp_rcv_echo() (SF-mode) */
+ so->cfecho = *(u32 *)cf->data;
+ } else {
+ /* send first frame */
+
+ isotp_create_fframe(cf, so, ae);
+
+ if (isotp_bc_flags(so) == CAN_ISOTP_CF_BROADCAST) {
+ /* set timer for FC-less operation (STmin = 0) */
+ if (so->opt.flags & CAN_ISOTP_FORCE_TXSTMIN)
+ so->tx_gap = ktime_set(0, so->force_tx_stmin);
+ else
+ so->tx_gap = ktime_set(0, so->frame_txtime);
+
+ /* disable wait for FCs due to activated block size */
+ so->txfc.bs = 0;
+
+ /* set CF echo tag for isotp_rcv_echo() (CF-mode) */
+ so->cfecho = *(u32 *)cf->data;
+ } else {
+ /* standard flow control check */
+ so->tx.state = ISOTP_WAIT_FIRST_FC;
+
+ /* start timeout for FC */
+ hrtimer_sec = ISOTP_FC_TIMEOUT;
+
+ /* no CF echo tag for isotp_rcv_echo() (FF-mode) */
+ so->cfecho = 0;
+ }
+ }
+
+ /* arm the echo or FC timeout before the frame is handed to can_send() */
+ hrtimer_start(&so->txtimer, ktime_set(hrtimer_sec, 0),
+ HRTIMER_MODE_REL_SOFT);
+
+ /* send the first or only CAN frame */
+ cf->flags = so->ll.tx_flags;
+
+ skb->dev = dev;
+ skb->sk = sk;
+ err = can_send(skb, 1);
+ dev_put(dev);
+ if (err) {
+ pr_notice_once("can-isotp: %s: can_send_ret %pe\n",
+ __func__, ERR_PTR(err));
+
+ /* no transmission -> no timeout monitoring */
+ hrtimer_cancel(&so->txtimer);
+
+ /* reset consecutive frame echo tag */
+ so->cfecho = 0;
+
+ goto err_out_drop;
+ }
+
+ if (wait_tx_done) {
+ /* wait for complete transmission of current pdu */
+ err = wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE);
+ if (err)
+ goto err_event_drop;
+
+ /* report an error recorded on the socket during the transmission */
+ err = sock_error(sk);
+ if (err)
+ return err;
+ }
+
+ return size;
+
+err_event_drop:
+ /* got signal: force tx state machine to be idle */
+ so->tx.state = ISOTP_IDLE;
+ hrtimer_cancel(&so->txfrtimer);
+ hrtimer_cancel(&so->txtimer);
+err_out_drop:
+ /* drop this PDU and unlock a potential wait queue */
+ so->tx.state = ISOTP_IDLE;
+ wake_up_interruptible(&so->wait);
+
+ return err;
+}
+
+/* Receive one reassembled PDU from the socket receive queue.
+ *
+ * Only MSG_DONTWAIT, MSG_TRUNC, MSG_PEEK and MSG_CMSG_COMPAT are accepted
+ * in @flags. When the user buffer is smaller than the PDU, the data is
+ * truncated and MSG_TRUNC is set in msg->msg_flags.
+ *
+ * Returns the number of bytes copied (or the full PDU length if MSG_TRUNC
+ * was passed in @flags), -EINVAL for unsupported flags, -EADDRNOTAVAIL
+ * for an unbound socket, or the error from dequeueing or copying.
+ */
+static int isotp_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+			 int flags)
+{
+	struct sock *sk = sock->sk;
+	struct isotp_sock *so = isotp_sk(sk);
+	struct sk_buff *skb;
+	int rc = 0;
+
+	if (flags & ~(MSG_DONTWAIT | MSG_TRUNC | MSG_PEEK | MSG_CMSG_COMPAT))
+		return -EINVAL;
+
+	if (!so->bound)
+		return -EADDRNOTAVAIL;
+
+	skb = skb_recv_datagram(sk, flags, &rc);
+	if (!skb)
+		return rc;
+
+	/* never copy more than the PDU holds; flag a short user buffer */
+	if (size >= skb->len)
+		size = skb->len;
+	else
+		msg->msg_flags |= MSG_TRUNC;
+
+	rc = memcpy_to_msg(msg, skb->data, size);
+	if (rc >= 0) {
+		sock_recv_cmsgs(msg, sk, skb);
+
+		/* the source address was stored in skb->cb */
+		if (msg->msg_name) {
+			__sockaddr_check_size(ISOTP_MIN_NAMELEN);
+			msg->msg_namelen = ISOTP_MIN_NAMELEN;
+			memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
+		}
+
+		/* set length of return value */
+		if (flags & MSG_TRUNC)
+			rc = skb->len;
+		else
+			rc = size;
+	}
+
+	skb_free_datagram(sk, skb);
+
+	return rc;
+}
+
+/* Release an ISO-TP socket.
+ *
+ * Waits (interruptibly) for a running transmission to finish, moves the tx
+ * path to ISOTP_SHUTDOWN, removes the socket from the notifier list,
+ * unregisters the CAN receive filters, cancels all timers and frees tx/rx
+ * buffers that were allocated in place of the static ones.
+ *
+ * Always returns 0.
+ */
+static int isotp_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so;
+ struct net *net;
+
+ if (!sk)
+ return 0;
+
+ so = isotp_sk(sk);
+ net = sock_net(sk);
+
+ /* wait for complete transmission of current pdu */
+ while (wait_event_interruptible(so->wait, so->tx.state == ISOTP_IDLE) == 0 &&
+ cmpxchg(&so->tx.state, ISOTP_IDLE, ISOTP_SHUTDOWN) != ISOTP_IDLE)
+ ;
+
+ /* force state machines to be idle also when a signal occurred */
+ so->tx.state = ISOTP_SHUTDOWN;
+ so->rx.state = ISOTP_IDLE;
+
+ /* do not unlink this socket while isotp_notifier() is working on it */
+ spin_lock(&isotp_notifier_lock);
+ while (isotp_busy_notifier == so) {
+ spin_unlock(&isotp_notifier_lock);
+ schedule_timeout_uninterruptible(1);
+ spin_lock(&isotp_notifier_lock);
+ }
+ list_del(&so->notifier);
+ spin_unlock(&isotp_notifier_lock);
+
+ lock_sock(sk);
+
+ /* remove current filters & unregister */
+ if (so->bound) {
+ if (so->ifindex) {
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, so->ifindex);
+ if (dev) {
+ if (isotp_register_rxid(so))
+ can_rx_unregister(net, dev, so->rxid,
+ SINGLE_MASK(so->rxid),
+ isotp_rcv, sk);
+
+ can_rx_unregister(net, dev, so->txid,
+ SINGLE_MASK(so->txid),
+ isotp_rcv_echo, sk);
+ dev_put(dev);
+ synchronize_rcu();
+ }
+ }
+ }
+
+ hrtimer_cancel(&so->txfrtimer);
+ hrtimer_cancel(&so->txtimer);
+ hrtimer_cancel(&so->rxtimer);
+
+ so->ifindex = 0;
+ so->bound = 0;
+
+ /* free buffers only if they replaced the embedded static buffers */
+ if (so->rx.buf != so->rx.sbuf)
+ kfree(so->rx.buf);
+
+ if (so->tx.buf != so->tx.sbuf)
+ kfree(so->tx.buf);
+
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ release_sock(sk);
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
+ sock_put(sk);
+
+ return 0;
+}
+
+/* Bind the socket to a CAN interface and a tx/rx CAN ID pair.
+ *
+ * Validates the address, registers the receive filter for the rx CAN ID
+ * (only if isotp_register_rxid() says so) and the echo filter for the tx
+ * CAN ID, then stores the new settings in the socket.
+ *
+ * Returns 0 on success or a negative error code:
+ * -EINVAL for a short address, a wrong family, CAN IDs with bits outside
+ * the SFF/EFF masks, an already bound socket or a device MTU
+ * smaller than the configured link layer MTU,
+ * -ENODEV for a missing ifindex, an unknown device or a non-CAN device,
+ * -EADDRNOTAVAIL if the rx CAN ID is to be registered and equals the tx
+ * CAN ID.
+ * If the device is not up, the bind still succeeds and ENETDOWN is
+ * reported as a socket error.
+ */
+static int isotp_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so = isotp_sk(sk);
+ struct net *net = sock_net(sk);
+ int ifindex;
+ struct net_device *dev;
+ canid_t tx_id = addr->can_addr.tp.tx_id;
+ canid_t rx_id = addr->can_addr.tp.rx_id;
+ int err = 0;
+ int notify_enetdown = 0;
+
+ if (len < ISOTP_MIN_NAMELEN)
+ return -EINVAL;
+
+ if (addr->can_family != AF_CAN)
+ return -EINVAL;
+
+ /* sanitize tx CAN identifier */
+ if (tx_id & CAN_EFF_FLAG)
+ tx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK);
+ else
+ tx_id &= CAN_SFF_MASK;
+
+ /* give feedback on wrong CAN-ID value */
+ if (tx_id != addr->can_addr.tp.tx_id)
+ return -EINVAL;
+
+ /* sanitize rx CAN identifier (if needed) */
+ if (isotp_register_rxid(so)) {
+ if (rx_id & CAN_EFF_FLAG)
+ rx_id &= (CAN_EFF_FLAG | CAN_EFF_MASK);
+ else
+ rx_id &= CAN_SFF_MASK;
+
+ /* give feedback on wrong CAN-ID value */
+ if (rx_id != addr->can_addr.tp.rx_id)
+ return -EINVAL;
+ }
+
+ if (!addr->can_ifindex)
+ return -ENODEV;
+
+ lock_sock(sk);
+
+ /* a socket can only be bound once */
+ if (so->bound) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* ensure different CAN IDs when the rx_id is to be registered */
+ if (isotp_register_rxid(so) && rx_id == tx_id) {
+ err = -EADDRNOTAVAIL;
+ goto out;
+ }
+
+ /* takes a device reference that is dropped on every path below */
+ dev = dev_get_by_index(net, addr->can_ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto out;
+ }
+ if (dev->type != ARPHRD_CAN) {
+ dev_put(dev);
+ err = -ENODEV;
+ goto out;
+ }
+ if (READ_ONCE(dev->mtu) < so->ll.mtu) {
+ dev_put(dev);
+ err = -EINVAL;
+ goto out;
+ }
+ /* binding to a device that is down works, but is reported below */
+ if (!(dev->flags & IFF_UP))
+ notify_enetdown = 1;
+
+ ifindex = dev->ifindex;
+
+ if (isotp_register_rxid(so))
+ can_rx_register(net, dev, rx_id, SINGLE_MASK(rx_id),
+ isotp_rcv, sk, "isotp", sk);
+
+ /* no consecutive frame echo skb in flight */
+ so->cfecho = 0;
+
+ /* register for echo skb's */
+ can_rx_register(net, dev, tx_id, SINGLE_MASK(tx_id),
+ isotp_rcv_echo, sk, "isotpe", sk);
+
+ dev_put(dev);
+
+ /* switch to new settings */
+ so->ifindex = ifindex;
+ so->rxid = rx_id;
+ so->txid = tx_id;
+ so->bound = 1;
+
+out:
+ release_sock(sk);
+
+ if (notify_enetdown) {
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ }
+
+ return err;
+}
+
+/* Report the local address of the socket in @uaddr.
+ *
+ * The first ISOTP_MIN_NAMELEN bytes of @uaddr are cleared and then filled
+ * with the interface index and the rx/tx CAN IDs stored in the socket.
+ *
+ * Returns ISOTP_MIN_NAMELEN, or -EOPNOTSUPP if the peer name is requested.
+ */
+static int isotp_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
+{
+	struct sockaddr_can *can;
+	struct isotp_sock *so;
+
+	if (peer)
+		return -EOPNOTSUPP;
+
+	so = isotp_sk(sock->sk);
+	can = (struct sockaddr_can *)uaddr;
+
+	memset(can, 0, ISOTP_MIN_NAMELEN);
+	can->can_addr.tp.tx_id = so->txid;
+	can->can_addr.tp.rx_id = so->rxid;
+	can->can_ifindex = so->ifindex;
+	can->can_family = AF_CAN;
+
+	return ISOTP_MIN_NAMELEN;
+}
+
+/* Set one ISO-TP socket option; isotp_setsockopt() calls this with the
+ * socket locked. @level is not evaluated here.
+ *
+ * Options can only be changed while the socket is not bound.
+ *
+ * Returns 0 on success or a negative error code:
+ * -EISCONN if the socket is already bound,
+ * -EINVAL for a wrong option length or invalid option values,
+ * -EFAULT if the option value cannot be copied,
+ * -ENOPROTOOPT for an unknown @optname.
+ */
+static int isotp_setsockopt_locked(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct isotp_sock *so = isotp_sk(sk);
+ int ret = 0;
+
+ if (so->bound)
+ return -EISCONN;
+
+ switch (optname) {
+ case CAN_ISOTP_OPTS:
+ if (optlen != sizeof(struct can_isotp_options))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&so->opt, optval, optlen))
+ return -EFAULT;
+
+ /* no separate rx_ext_address is given => use ext_address */
+ if (!(so->opt.flags & CAN_ISOTP_RX_EXT_ADDR))
+ so->opt.rx_ext_address = so->opt.ext_address;
+
+ /* these broadcast flags are not allowed together */
+ if (isotp_bc_flags(so) == ISOTP_ALL_BC_FLAGS) {
+ /* CAN_ISOTP_SF_BROADCAST is prioritized */
+ so->opt.flags &= ~CAN_ISOTP_CF_BROADCAST;
+
+ /* give user feedback on wrong config attempt */
+ /* the options copied above stay in effect nevertheless */
+ ret = -EINVAL;
+ }
+
+ /* check for frame_txtime changes (0 => no changes) */
+ if (so->opt.frame_txtime) {
+ if (so->opt.frame_txtime == CAN_ISOTP_FRAME_TXTIME_ZERO)
+ so->frame_txtime = 0;
+ else
+ so->frame_txtime = so->opt.frame_txtime;
+ }
+ break;
+
+ case CAN_ISOTP_RECV_FC:
+ if (optlen != sizeof(struct can_isotp_fc_options))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&so->rxfc, optval, optlen))
+ return -EFAULT;
+ break;
+
+ case CAN_ISOTP_TX_STMIN:
+ if (optlen != sizeof(u32))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&so->force_tx_stmin, optval, optlen))
+ return -EFAULT;
+ break;
+
+ case CAN_ISOTP_RX_STMIN:
+ if (optlen != sizeof(u32))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&so->force_rx_stmin, optval, optlen))
+ return -EFAULT;
+ break;
+
+ case CAN_ISOTP_LL_OPTS:
+ if (optlen == sizeof(struct can_isotp_ll_options)) {
+ /* validate a local copy first, so->ll is only written on success */
+ struct can_isotp_ll_options ll;
+
+ if (copy_from_sockptr(&ll, optval, optlen))
+ return -EFAULT;
+
+ /* check for correct ISO 11898-1 DLC data length */
+ if (ll.tx_dl != padlen(ll.tx_dl))
+ return -EINVAL;
+
+ if (ll.mtu != CAN_MTU && ll.mtu != CANFD_MTU)
+ return -EINVAL;
+
+ /* with CAN_MTU neither a tx_dl > CAN_MAX_DLEN nor tx_flags are valid */
+ if (ll.mtu == CAN_MTU &&
+ (ll.tx_dl > CAN_MAX_DLEN || ll.tx_flags != 0))
+ return -EINVAL;
+
+ memcpy(&so->ll, &ll, sizeof(ll));
+
+ /* set ll_dl for tx path to similar place as for rx */
+ so->tx.ll_dl = ll.tx_dl;
+ } else {
+ return -EINVAL;
+ }
+ break;
+
+ default:
+ ret = -ENOPROTOOPT;
+ }
+
+ return ret;
+}
+
+/* setsockopt() entry point: accepts only the SOL_CAN_ISOTP level and runs
+ * isotp_setsockopt_locked() with the socket locked.
+ *
+ * Returns -EINVAL for any other level, otherwise the result of
+ * isotp_setsockopt_locked().
+ */
+static int isotp_setsockopt(struct socket *sock, int level, int optname,
+			    sockptr_t optval, unsigned int optlen)
+{
+	struct sock *sk = sock->sk;
+	int rc = -EINVAL;
+
+	if (level == SOL_CAN_ISOTP) {
+		lock_sock(sk);
+		rc = isotp_setsockopt_locked(sock, level, optname, optval,
+					     optlen);
+		release_sock(sk);
+	}
+
+	return rc;
+}
+
+/* getsockopt() entry point for the SOL_CAN_ISOTP level.
+ *
+ * Copies at most the size of the requested option to @optval and writes
+ * the number of bytes copied back to @optlen.
+ *
+ * Returns 0 on success, -EINVAL for a wrong level or a negative length,
+ * -EFAULT if user memory cannot be accessed and -ENOPROTOOPT for an
+ * unknown @optname.
+ */
+static int isotp_getsockopt(struct socket *sock, int level, int optname,
+			    char __user *optval, int __user *optlen)
+{
+	struct isotp_sock *so = isotp_sk(sock->sk);
+	const void *src;
+	int avail;
+	int len;
+
+	if (level != SOL_CAN_ISOTP)
+		return -EINVAL;
+	if (get_user(len, optlen))
+		return -EFAULT;
+	if (len < 0)
+		return -EINVAL;
+
+	/* look up where the option lives and how large it is */
+	switch (optname) {
+	case CAN_ISOTP_OPTS:
+		src = &so->opt;
+		avail = sizeof(struct can_isotp_options);
+		break;
+
+	case CAN_ISOTP_RECV_FC:
+		src = &so->rxfc;
+		avail = sizeof(struct can_isotp_fc_options);
+		break;
+
+	case CAN_ISOTP_TX_STMIN:
+		src = &so->force_tx_stmin;
+		avail = sizeof(u32);
+		break;
+
+	case CAN_ISOTP_RX_STMIN:
+		src = &so->force_rx_stmin;
+		avail = sizeof(u32);
+		break;
+
+	case CAN_ISOTP_LL_OPTS:
+		src = &so->ll;
+		avail = sizeof(struct can_isotp_ll_options);
+		break;
+
+	default:
+		return -ENOPROTOOPT;
+	}
+
+	/* never hand out more than the option holds */
+	len = min_t(int, len, avail);
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, src, len))
+		return -EFAULT;
+	return 0;
+}
+
+/* Apply a netdevice event to one socket.
+ *
+ * @so: socket to notify
+ * @msg: netdevice event; only NETDEV_UNREGISTER and NETDEV_DOWN are handled
+ * @dev: device the event refers to
+ *
+ * Events for a device in another network namespace, or for a device the
+ * socket is not bound to, are ignored. On NETDEV_UNREGISTER the receive
+ * filters are removed, the socket is unbound and ENODEV is reported; on
+ * NETDEV_DOWN only ENETDOWN is reported.
+ */
+static void isotp_notify(struct isotp_sock *so, unsigned long msg,
+ struct net_device *dev)
+{
+ struct sock *sk = &so->sk;
+
+ if (!net_eq(dev_net(dev), sock_net(sk)))
+ return;
+
+ if (so->ifindex != dev->ifindex)
+ return;
+
+ switch (msg) {
+ case NETDEV_UNREGISTER:
+ lock_sock(sk);
+ /* remove current filters & unregister */
+ if (so->bound) {
+ if (isotp_register_rxid(so))
+ can_rx_unregister(dev_net(dev), dev, so->rxid,
+ SINGLE_MASK(so->rxid),
+ isotp_rcv, sk);
+
+ can_rx_unregister(dev_net(dev), dev, so->txid,
+ SINGLE_MASK(so->txid),
+ isotp_rcv_echo, sk);
+ }
+
+ so->ifindex = 0;
+ so->bound = 0;
+ release_sock(sk);
+
+ /* the error is reported after the socket lock has been released */
+ sk->sk_err = ENODEV;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ break;
+
+ case NETDEV_DOWN:
+ sk->sk_err = ENETDOWN;
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
+ break;
+ }
+}
+
+/* Netdevice notifier callback: forwards NETDEV_UNREGISTER and NETDEV_DOWN
+ * events of CAN devices to every socket on isotp_notifier_list.
+ *
+ * The global isotp_busy_notifier serves as the list cursor. The lock is
+ * dropped while isotp_notify() runs for the current socket;
+ * isotp_release() waits until the cursor has moved on before it unlinks a
+ * socket from the list.
+ *
+ * Always returns NOTIFY_DONE.
+ */
+static int isotp_notifier(struct notifier_block *nb, unsigned long msg,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (dev->type != ARPHRD_CAN)
+ return NOTIFY_DONE;
+ if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN)
+ return NOTIFY_DONE;
+ if (unlikely(isotp_busy_notifier)) /* Check for reentrant bug. */
+ return NOTIFY_DONE;
+
+ spin_lock(&isotp_notifier_lock);
+ list_for_each_entry(isotp_busy_notifier, &isotp_notifier_list, notifier) {
+ /* isotp_notify() is called without the notifier lock held */
+ spin_unlock(&isotp_notifier_lock);
+ isotp_notify(isotp_busy_notifier, msg, dev);
+ spin_lock(&isotp_notifier_lock);
+ }
+ /* the walk is finished: no socket is being notified anymore */
+ isotp_busy_notifier = NULL;
+ spin_unlock(&isotp_notifier_lock);
+ return NOTIFY_DONE;
+}
+
+/* Initialize a new ISO-TP socket: default options, idle rx/tx state
+ * machines, the embedded static buffers, timers, wait queue and rx lock.
+ * Finally the socket is added to isotp_notifier_list.
+ *
+ * Always returns 0.
+ */
+static int isotp_init(struct sock *sk)
+{
+ struct isotp_sock *so = isotp_sk(sk);
+
+ so->ifindex = 0;
+ so->bound = 0;
+
+ so->opt.flags = CAN_ISOTP_DEFAULT_FLAGS;
+ so->opt.ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS;
+ so->opt.rx_ext_address = CAN_ISOTP_DEFAULT_EXT_ADDRESS;
+ so->opt.rxpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT;
+ so->opt.txpad_content = CAN_ISOTP_DEFAULT_PAD_CONTENT;
+ so->opt.frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME;
+ so->frame_txtime = CAN_ISOTP_DEFAULT_FRAME_TXTIME;
+ so->rxfc.bs = CAN_ISOTP_DEFAULT_RECV_BS;
+ so->rxfc.stmin = CAN_ISOTP_DEFAULT_RECV_STMIN;
+ so->rxfc.wftmax = CAN_ISOTP_DEFAULT_RECV_WFTMAX;
+ so->ll.mtu = CAN_ISOTP_DEFAULT_LL_MTU;
+ so->ll.tx_dl = CAN_ISOTP_DEFAULT_LL_TX_DL;
+ so->ll.tx_flags = CAN_ISOTP_DEFAULT_LL_TX_FLAGS;
+
+ /* set ll_dl for tx path to similar place as for rx */
+ so->tx.ll_dl = so->ll.tx_dl;
+
+ so->rx.state = ISOTP_IDLE;
+ so->tx.state = ISOTP_IDLE;
+
+ /* start with the buffers embedded in the socket structure */
+ so->rx.buf = so->rx.sbuf;
+ so->tx.buf = so->tx.sbuf;
+ so->rx.buflen = ARRAY_SIZE(so->rx.sbuf);
+ so->tx.buflen = ARRAY_SIZE(so->tx.sbuf);
+
+ hrtimer_setup(&so->rxtimer, isotp_rx_timer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+ hrtimer_setup(&so->txtimer, isotp_tx_timer_handler, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+ hrtimer_setup(&so->txfrtimer, isotp_txfr_timer_handler, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_SOFT);
+
+ init_waitqueue_head(&so->wait);
+ spin_lock_init(&so->rx_lock);
+
+ /* make the socket visible to isotp_notifier() */
+ spin_lock(&isotp_notifier_lock);
+ list_add_tail(&so->notifier, &isotp_notifier_list);
+ spin_unlock(&isotp_notifier_lock);
+
+ return 0;
+}
+
+/* poll() entry point: starts from the datagram_poll() result, registers
+ * the caller on so->wait as well and hides writability while the tx state
+ * machine is not idle.
+ */
+static __poll_t isotp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+	struct isotp_sock *so = isotp_sk(sock->sk);
+	__poll_t events = datagram_poll(file, sock, wait);
+
+	poll_wait(file, &so->wait, wait);
+
+	/* nothing to correct unless the socket is reported as writable */
+	if (!(events & EPOLLWRNORM))
+		return events;
+
+	if (so->tx.state == ISOTP_IDLE)
+		return events;
+
+	/* false positive due to TX state: a PDU is still in flight */
+	return events & ~(EPOLLOUT | EPOLLWRNORM);
+}
+
+/* ioctl() entry point: this socket type implements no ioctls itself.
+ * Unconditionally returns -ENOIOCTLCMD; @sock, @cmd and @arg are unused.
+ */
+static int isotp_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ /* no ioctls for socket layer -> hand it down to NIC layer */
+ return -ENOIOCTLCMD;
+}
+
+/* Socket operations of the ISO-TP protocol. Operations that are not
+ * implemented here use the generic sock_no_*() handlers.
+ */
+static const struct proto_ops isotp_ops = {
+ .family = PF_CAN,
+ .release = isotp_release,
+ .bind = isotp_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = isotp_getname,
+ .poll = isotp_poll,
+ .ioctl = isotp_sock_no_ioctlcmd,
+ .gettstamp = sock_gettstamp,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = isotp_setsockopt,
+ .getsockopt = isotp_getsockopt,
+ .sendmsg = isotp_sendmsg,
+ .recvmsg = isotp_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+/* Protocol description: sockets are sized as struct isotp_sock and set up
+ * by isotp_init().
+ */
+static struct proto isotp_proto __read_mostly = {
+ .name = "CAN_ISOTP",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct isotp_sock),
+ .init = isotp_init,
+};
+
+/* CAN protocol registration record: SOCK_DGRAM sockets of protocol
+ * CAN_ISOTP use isotp_ops and isotp_proto.
+ */
+static const struct can_proto isotp_can_proto = {
+ .type = SOCK_DGRAM,
+ .protocol = CAN_ISOTP,
+ .ops = &isotp_ops,
+ .prot = &isotp_proto,
+};
+
+/* netdevice notifier block; registered in isotp_module_init() */
+static struct notifier_block canisotp_notifier = {
+ .notifier_call = isotp_notifier
+};
+
+/* Module init: limits max_pdu_size to the range MAX_12BIT_PDU_SIZE ..
+ * MAX_PDU_SIZE, registers the ISO-TP protocol and, if that succeeded, the
+ * netdevice notifier.
+ *
+ * Returns the result of can_proto_register().
+ */
+static __init int isotp_module_init(void)
+{
+	int rc;
+
+	/* raise to the lower bound first, then cap at the upper bound */
+	max_pdu_size = min_t(unsigned int,
+			     max_t(unsigned int, max_pdu_size,
+				   MAX_12BIT_PDU_SIZE),
+			     MAX_PDU_SIZE);
+
+	pr_info("can: isotp protocol (max_pdu_size %d)\n", max_pdu_size);
+
+	rc = can_proto_register(&isotp_can_proto);
+	if (rc < 0) {
+		pr_err("can: registration of isotp protocol failed %pe\n", ERR_PTR(rc));
+		return rc;
+	}
+
+	register_netdevice_notifier(&canisotp_notifier);
+
+	return rc;
+}
+
+/* Module exit: unregisters the ISO-TP protocol and then the netdevice
+ * notifier.
+ */
+static __exit void isotp_module_exit(void)
+{
+ can_proto_unregister(&isotp_can_proto);
+ unregister_netdevice_notifier(&canisotp_notifier);
+}
+
+/* module entry and exit points */
+module_init(isotp_module_init);
+module_exit(isotp_module_exit);
diff --git a/net/can/j1939/Kconfig b/net/can/j1939/Kconfig
new file mode 100644
index 000000000000..2998298b71ec
--- /dev/null
+++ b/net/can/j1939/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# SAE J1939 network layer core configuration
+#
+
+config CAN_J1939
+ tristate "SAE J1939"
+ depends on CAN
+ help
+ SAE J1939
+ Say Y to have in-kernel support for j1939 socket type. This
+ allows communication according to SAE j1939.
+ The relevant parts in kernel are
+ SAE j1939-21 (datalink & transport protocol)
+ & SAE j1939-81 (network management).
diff --git a/net/can/j1939/Makefile b/net/can/j1939/Makefile
new file mode 100644
index 000000000000..19181bdae173
--- /dev/null
+++ b/net/can/j1939/Makefile
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_CAN_J1939) += can-j1939.o
+
+can-j1939-objs := \
+ address-claim.o \
+ bus.o \
+ main.o \
+ socket.o \
+ transport.o
diff --git a/net/can/j1939/address-claim.c b/net/can/j1939/address-claim.c
new file mode 100644
index 000000000000..ca4ad6cdd5cb
--- /dev/null
+++ b/net/can/j1939/address-claim.c
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2010-2011 EIA Electronics,
+// Pieter Beyens <pieter.beyens@eia.be>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+/* J1939 Address Claiming.
+ * Address Claiming in the kernel
+ * - keeps track of the AC states of ECUs,
+ * - resolves NAME<=>SA taking into account the AC states of ECUs.
+ *
+ * All Address Claim msgs (including host-originated msgs) are processed
+ * in the receive path (a sent msg is always received again via CAN echo).
+ * As such, AC msgs are processed in the order in which they are
+ * sent on the bus.
+ *
+ * This module doesn't send msgs itself (e.g. replies to Address Claims);
+ * this is the responsibility of a user space application or daemon.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+
+#include "j1939-priv.h"
+
+static inline name_t j1939_skb_to_name(const struct sk_buff *skb)
+{
+ return le64_to_cpup((__le64 *)skb->data); /* first 8 payload bytes, little endian */
+}
+
+/* Is this a request (first 3 bytes: requested PGN, LSB first) for address claimed? */
+static inline bool j1939_ac_msg_is_request(struct sk_buff *skb)
+{
+	const struct j1939_sk_buff_cb *cb = j1939_skb_to_cb(skb);
+	const u8 *dat = skb->data;
+
+	if (cb->addr.pgn != J1939_PGN_REQUEST || skb->len < 3)
+		return false;
+
+	/* assemble the requested PGN and compare */
+	return (dat[0] | (dat[1] << 8) | (dat[2] << 16)) == J1939_PGN_ADDRESS_CLAIMED;
+}
+
+/* Sanity-check an outgoing address claim; 0 if well formed, else -EPROTO. */
+static int j1939_ac_verify_outgoing(struct j1939_priv *priv, struct sk_buff *skb)
+{
+	const struct j1939_addr *addr = &j1939_skb_to_cb(skb)->addr;
+
+	if (skb->len != 8) {
+		netdev_notice(priv->ndev, "tx address claim with dlc %i\n",
+			      skb->len);
+		return -EPROTO;
+	}
+
+	if (addr->src_name != j1939_skb_to_name(skb)) {
+		netdev_notice(priv->ndev, "tx address claim with different name\n");
+		return -EPROTO;
+	}
+
+	if (addr->sa == J1939_NO_ADDR) {
+		netdev_notice(priv->ndev, "tx address claim with broadcast sa\n");
+		return -EPROTO;
+	}
+
+	/* an address claim is never addressed to a single peer */
+	if (addr->da != J1939_NO_ADDR || addr->dst_name) {
+		netdev_notice(priv->ndev, "tx address claim with dest, not broadcast\n");
+		return -EPROTO;
+	}
+	return 0;
+}
+
+int j1939_ac_fixup(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ int ret;
+ u8 addr;
+
+ /* network mgmt: address claiming msgs */
+ if (skcb->addr.pgn == J1939_PGN_ADDRESS_CLAIMED) {
+ struct j1939_ecu *ecu;
+
+ ret = j1939_ac_verify_outgoing(priv, skb);
+ /* a malformed claim is rejected here; a valid one falls through */
+ if (ret < 0)
+ return ret;
+ ecu = j1939_ecu_get_by_name(priv, skcb->addr.src_name);
+ if (!ecu)
+ return -ENODEV;
+
+ if (ecu->addr != skcb->addr.sa)
+ /* hold further traffic for ecu, remove from parent */
+ j1939_ecu_unmap(ecu);
+ j1939_ecu_put(ecu);
+ } else if (skcb->addr.src_name) {
+ /* resolve the source NAME to the SA it is currently mapped to */
+ addr = j1939_name_to_addr(priv, skcb->addr.src_name);
+ if (!j1939_address_is_unicast(addr) &&
+ !j1939_ac_msg_is_request(skb)) { /* only such a request may go out without a unicast SA */
+ netdev_notice(priv->ndev, "tx drop: invalid sa for name 0x%016llx\n",
+ skcb->addr.src_name);
+ return -EADDRNOTAVAIL;
+ }
+ skcb->addr.sa = addr;
+ }
+
+ /* resolve the destination NAME, if one was given, to its DA */
+ if (skcb->addr.dst_name) {
+ addr = j1939_name_to_addr(priv, skcb->addr.dst_name);
+ if (!j1939_address_is_unicast(addr)) {
+ netdev_notice(priv->ndev, "tx drop: invalid da for name 0x%016llx\n",
+ skcb->addr.dst_name);
+ return -EADDRNOTAVAIL;
+ }
+ skcb->addr.da = addr;
+ }
+ return 0;
+}
+
+static void j1939_ac_process(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_ecu *ecu, *prev;
+ name_t name;
+
+ if (skb->len != 8) {
+ netdev_notice(priv->ndev, "rx address claim with wrong dlc %i\n",
+ skb->len);
+ return;
+ }
+
+ name = j1939_skb_to_name(skb);
+ skcb->addr.src_name = name;
+ if (!name) {
+ netdev_notice(priv->ndev, "rx address claim without name\n");
+ return;
+ }
+
+ if (!j1939_address_is_valid(skcb->addr.sa)) {
+ netdev_notice(priv->ndev, "rx address claim with broadcast sa\n");
+ return;
+ }
+
+ write_lock_bh(&priv->lock);
+
+ /* A few words on the ECU ref counting:
+ *
+ * First we get an ECU handle, either with
+ * j1939_ecu_get_by_name_locked() (increments the ref counter)
+ * or j1939_ecu_create_locked() (initializes an ECU object
+ * with a ref counter of 1).
+ *
+ * j1939_ecu_unmap_locked() will decrement the ref counter,
+ * but only if the ECU was mapped before. So "ecu" still
+ * belongs to us.
+ *
+ * j1939_ecu_timer_start() will increment the ref counter
+ * before it starts the timer, so we can put the ecu when
+ * leaving this function.
+ */
+ ecu = j1939_ecu_get_by_name_locked(priv, name);
+
+ if (ecu && ecu->addr == skcb->addr.sa) {
+ /* The ISO 11783-5 standard, in "4.5.2 - Address claim
+ * requirements", states:
+ * d) No CF shall begin, or resume, transmission on the
+ * network until 250 ms after it has successfully claimed
+ * an address except when responding to a request for
+ * address-claimed.
+ *
+ * But "Figure 6" and "Figure 7" in "4.5.4.2 - Address-claim
+ * prioritization" show that the CF begins the transmission
+ * after 250 ms from the first AC (address-claimed) message
+ * even if it sends another AC message during that time window
+ * to resolve the address contention with another CF.
+ *
+ * As stated in "4.4.2.3 - Address-claimed message":
+ * In order to successfully claim an address, the CF sending
+ * an address claimed message shall not receive a contending
+ * claim from another CF for at least 250 ms.
+ *
+ * As stated in "4.4.3.2 - NAME management (NM) message":
+ * 1) A commanding CF can
+ * d) request that a CF with a specified NAME transmit
+ * the address-claimed message with its current NAME.
+ * 2) A target CF shall
+ * d) send an address-claimed message in response to a
+ * request for a matching NAME
+ *
+ * Taking the above arguments into account, the 250 ms wait is
+ * requested only during network initialization.
+ *
+ * Do not restart the timer on AC message if both the NAME and
+ * the address match and so if the address has already been
+ * claimed (timer has expired) or the AC message has been sent
+ * to resolve the contention with another CF (timer is still
+ * running).
+ */
+ goto out_ecu_put;
+ }
+
+ if (!ecu && j1939_address_is_unicast(skcb->addr.sa))
+ ecu = j1939_ecu_create_locked(priv, name);
+
+ if (IS_ERR_OR_NULL(ecu))
+ goto out_unlock_bh;
+
+ /* cancel pending (previous) address claim */
+ j1939_ecu_timer_cancel(ecu);
+
+ if (j1939_address_is_idle(skcb->addr.sa)) {
+ j1939_ecu_unmap_locked(ecu);
+ goto out_ecu_put;
+ }
+
+ /* save new addr, dropping any mapping of the old one first */
+ if (ecu->addr != skcb->addr.sa)
+ j1939_ecu_unmap_locked(ecu);
+ ecu->addr = skcb->addr.sa;
+
+ prev = j1939_ecu_get_by_addr_locked(priv, skcb->addr.sa);
+ if (prev) {
+ if (ecu->name > prev->name) { /* the numerically lower NAME keeps the address */
+ j1939_ecu_unmap_locked(ecu);
+ j1939_ecu_put(prev);
+ goto out_ecu_put;
+ } else {
+ /* prev's NAME is numerically >= ours: prev loses the address */
+ j1939_ecu_unmap_locked(prev);
+ j1939_ecu_put(prev);
+ }
+ }
+
+ j1939_ecu_timer_start(ecu);
+ out_ecu_put:
+ j1939_ecu_put(ecu);
+ out_unlock_bh:
+ write_unlock_bh(&priv->lock);
+}
+
+void j1939_ac_recv(struct j1939_priv *priv, struct sk_buff *skb)
+{
+	struct j1939_addr *addr = &j1939_skb_to_cb(skb)->addr;
+	struct j1939_ecu *src = NULL, *dst;
+
+	/* Address claims go to j1939_ac_process(); any other frame from a
+	 * unicast SA only gets its source NAME looked up.
+	 */
+	if (addr->pgn == J1939_PGN_ADDRESS_CLAIMED)
+		j1939_ac_process(priv, skb);
+	else if (j1939_address_is_unicast(addr->sa))
+		src = j1939_ecu_get_by_addr(priv, addr->sa);
+	if (src) {
+		addr->src_name = src->name;
+		j1939_ecu_put(src);
+	}
+
+	/* the destination NAME is looked up for every frame */
+	dst = j1939_ecu_get_by_addr(priv, addr->da);
+	if (dst) {
+		addr->dst_name = dst->name;
+		j1939_ecu_put(dst);
+	}
+}
diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c
new file mode 100644
index 000000000000..797719cb227e
--- /dev/null
+++ b/net/can/j1939/bus.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+/* bus for j1939 remote devices
+ * Since rtnetlink, no real bus is used.
+ */
+
+#include <net/sock.h>
+
+#include "j1939-priv.h"
+
+static void __j1939_ecu_release(struct kref *kref)
+{
+	struct j1939_ecu *dead = container_of(kref, struct j1939_ecu, kref);
+	struct j1939_priv *owner = dead->priv; /* fetched before dead is freed */
+
+	list_del(&dead->list);
+	kfree(dead);
+	j1939_priv_put(owner);
+}
+
+void j1939_ecu_put(struct j1939_ecu *ecu)
+{
+ kref_put(&ecu->kref, __j1939_ecu_release); /* the final put releases the ecu */
+}
+
+static void j1939_ecu_get(struct j1939_ecu *ecu)
+{
+ kref_get(&ecu->kref); /* to be balanced by j1939_ecu_put() */
+}
+
+/* true if the address table slot for ecu->addr points back at this ecu */
+static bool j1939_ecu_is_mapped_locked(struct j1939_ecu *ecu)
+{
+	struct j1939_priv *priv = ecu->priv;
+
+	lockdep_assert_held(&priv->lock);
+	return ecu == j1939_ecu_find_by_addr_locked(priv, ecu->addr);
+}
+
+/* ECU device interface */
+/* map ECU to a bus address space */
+static void j1939_ecu_map_locked(struct j1939_ecu *ecu)
+{
+	struct j1939_priv *priv = ecu->priv;
+	struct j1939_addr_ent *slot;
+
+	lockdep_assert_held(&priv->lock);
+	/* only unicast addresses are ever entered into the table */
+	if (!j1939_address_is_unicast(ecu->addr))
+		return;
+
+	slot = &priv->ents[ecu->addr];
+	if (slot->ecu) {
+		netdev_warn(priv->ndev, "Trying to map already mapped ECU, addr: 0x%02x, name: 0x%016llx. Skip it.\n",
+			    ecu->addr, ecu->name);
+		return;
+	}
+
+	/* the slot holds its own reference on the ecu */
+	j1939_ecu_get(ecu);
+	slot->ecu = ecu;
+	slot->nusers += ecu->nusers;
+}
+
+/* Release the address table slot held by @ecu, if it holds one.
+ * This also drops the reference the slot had on the ecu.
+ */
+void j1939_ecu_unmap_locked(struct j1939_ecu *ecu)
+{
+	struct j1939_priv *priv = ecu->priv;
+	struct j1939_addr_ent *slot = &priv->ents[ecu->addr];
+
+	lockdep_assert_held(&priv->lock);
+
+	/* nothing to do for a non-unicast address or an unmapped ecu */
+	if (!j1939_address_is_unicast(ecu->addr) ||
+	    !j1939_ecu_is_mapped_locked(ecu))
+		return;
+
+	slot->nusers -= ecu->nusers;
+	slot->ecu = NULL;
+	j1939_ecu_put(ecu);
+}
+
+void j1939_ecu_unmap(struct j1939_ecu *ecu)
+{
+ write_lock_bh(&ecu->priv->lock); /* lock-taking variant of j1939_ecu_unmap_locked() */
+ j1939_ecu_unmap_locked(ecu);
+ write_unlock_bh(&ecu->priv->lock);
+}
+
+void j1939_ecu_unmap_all(struct j1939_priv *priv)
+{
+	struct j1939_addr_ent *ent = priv->ents;
+
+	write_lock_bh(&priv->lock);
+	for (; ent < priv->ents + ARRAY_SIZE(priv->ents); ent++)
+		if (ent->ecu) /* slot in use: hand it to the unmap helper */
+			j1939_ecu_unmap_locked(ent->ecu);
+	write_unlock_bh(&priv->lock);
+}
+
+void j1939_ecu_timer_start(struct j1939_ecu *ecu)
+{
+ /* Take a reference for the pending timer; it is released in
+ * j1939_ecu_timer_handler() or j1939_ecu_timer_cancel().
+ */
+ j1939_ecu_get(ecu);
+
+ /* Arm the timer: the address change is committed after 250 msec. */
+ hrtimer_start(&ecu->ac_timer, ms_to_ktime(250),
+ HRTIMER_MODE_REL_SOFT);
+}
+
+void j1939_ecu_timer_cancel(struct j1939_ecu *ecu)
+{
+ if (hrtimer_cancel(&ecu->ac_timer)) /* non-zero: the timer was still active */
+ j1939_ecu_put(ecu); /* so drop the ref from j1939_ecu_timer_start() */
+}
+
+static enum hrtimer_restart j1939_ecu_timer_handler(struct hrtimer *hrtimer)
+{
+ struct j1939_ecu *ecu =
+ container_of(hrtimer, struct j1939_ecu, ac_timer);
+ struct j1939_priv *priv = ecu->priv;
+
+ write_lock_bh(&priv->lock);
+ /* TODO: can we test if ecu->addr is unicast before starting
+ * the timer?
+ */
+ j1939_ecu_map_locked(ecu); /* commit: enter the ecu into the address table */
+
+ /* Drop the timer's reference; the matching j1939_ecu_get() is in
+ * j1939_ecu_timer_start().
+ */
+ j1939_ecu_put(ecu);
+ write_unlock_bh(&priv->lock);
+
+ return HRTIMER_NORESTART;
+}
+
+struct j1939_ecu *j1939_ecu_create_locked(struct j1939_priv *priv, name_t name)
+{
+	struct j1939_ecu *new_ecu;
+
+	lockdep_assert_held(&priv->lock);
+	new_ecu = kzalloc(sizeof(*new_ecu), gfp_any());
+	if (!new_ecu)
+		return ERR_PTR(-ENOMEM);
+
+	/* starts without an address and with a ref counter of 1 */
+	new_ecu->name = name;
+	new_ecu->addr = J1939_IDLE_ADDR;
+	kref_init(&new_ecu->kref);
+	INIT_LIST_HEAD(&new_ecu->list);
+	hrtimer_setup(&new_ecu->ac_timer, j1939_ecu_timer_handler, CLOCK_MONOTONIC,
+		      HRTIMER_MODE_REL_SOFT);
+
+	/* every ecu holds a reference on priv and is linked into priv->ecus */
+	j1939_priv_get(priv);
+	new_ecu->priv = priv;
+	list_add_tail(&new_ecu->list, &priv->ecus);
+	return new_ecu;
+}
+
+/* peek at the address table slot for @addr; the ref counter is not touched */
+struct j1939_ecu *j1939_ecu_find_by_addr_locked(struct j1939_priv *priv, u8 addr)
+{
+	lockdep_assert_held(&priv->lock);
+
+	return priv->ents[addr].ecu;
+}
+
+struct j1939_ecu *j1939_ecu_get_by_addr_locked(struct j1939_priv *priv, u8 addr)
+{
+	struct j1939_ecu *found = NULL;
+
+	lockdep_assert_held(&priv->lock);
+
+	/* non-unicast addresses are never looked up */
+	if (j1939_address_is_unicast(addr))
+		found = j1939_ecu_find_by_addr_locked(priv, addr);
+
+	/* hand out a counted reference, or NULL if nothing is mapped */
+	if (found)
+		j1939_ecu_get(found);
+	return found;
+}
+
+/* as j1939_ecu_get_by_addr_locked(), but takes priv->lock for reading */
+struct j1939_ecu *j1939_ecu_get_by_addr(struct j1939_priv *priv, u8 addr)
+{
+	struct j1939_ecu *found;
+
+	read_lock_bh(&priv->lock);
+	found = j1939_ecu_get_by_addr_locked(priv, addr);
+	read_unlock_bh(&priv->lock);
+	return found;
+}
+
+/* get pointer to ecu without increasing ref counter */
+static struct j1939_ecu *j1939_ecu_find_by_name_locked(struct j1939_priv *priv,
+						       name_t name)
+{
+	struct j1939_ecu *pos, *match = NULL;
+
+	lockdep_assert_held(&priv->lock);
+	list_for_each_entry(pos, &priv->ecus, list) {
+		if (pos->name != name)
+			continue;
+		match = pos; /* first hit wins */
+		break;
+	}
+	return match;
+}
+
+struct j1939_ecu *j1939_ecu_get_by_name_locked(struct j1939_priv *priv,
+					       name_t name)
+{
+	struct j1939_ecu *found;
+
+	lockdep_assert_held(&priv->lock);
+
+	/* a zero NAME is never looked up */
+	found = name ? j1939_ecu_find_by_name_locked(priv, name) : NULL;
+	if (!found)
+		return NULL;
+
+	/* the caller receives its own reference */
+	j1939_ecu_get(found);
+	return found;
+}
+
+/* as j1939_ecu_get_by_name_locked(), but takes priv->lock for reading */
+struct j1939_ecu *j1939_ecu_get_by_name(struct j1939_priv *priv, name_t name)
+{
+	struct j1939_ecu *found;
+
+	read_lock_bh(&priv->lock);
+	found = j1939_ecu_get_by_name_locked(priv, name);
+	read_unlock_bh(&priv->lock);
+	return found;
+}
+
+/* Translate a NAME into the source address it is currently mapped to.
+ * Returns J1939_NO_ADDR for a zero NAME and J1939_IDLE_ADDR when the
+ * NAME is unknown or its ECU is not mapped.
+ */
+u8 j1939_name_to_addr(struct j1939_priv *priv, name_t name)
+{
+	struct j1939_ecu *ecu;
+	u8 sa = J1939_IDLE_ADDR;
+
+	if (!name)
+		return J1939_NO_ADDR;
+	read_lock_bh(&priv->lock);
+	ecu = j1939_ecu_find_by_name_locked(priv, name);
+	if (ecu && j1939_ecu_is_mapped_locked(ecu))
+		sa = ecu->addr;
+	read_unlock_bh(&priv->lock);
+	return sa;
+}
+
+/* TX addr/name accounting
+ * Transport protocol needs to know if a SA is local or not
+ * These functions originate from userspace manipulating sockets,
+ * so locking is straightforward
+ */
+
+int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa)
+{
+ struct j1939_ecu *ecu;
+ int err = 0;
+
+ write_lock_bh(&priv->lock);
+
+ if (j1939_address_is_unicast(sa))
+ priv->ents[sa].nusers++; /* count a local user of this SA */
+
+ if (!name)
+ goto done; /* address-only user: no ECU accounting */
+
+ ecu = j1939_ecu_get_by_name_locked(priv, name);
+ if (!ecu)
+ ecu = j1939_ecu_create_locked(priv, name);
+ err = PTR_ERR_OR_ZERO(ecu);
+ if (err) {
+ if (j1939_address_is_unicast(sa))
+ priv->ents[sa].nusers--; /* undo the increment above */
+ goto done;
+ }
+
+ ecu->nusers++; /* the ecu reference obtained above is kept on success */
+ /* TODO: do we care if ecu->addr != sa? */
+ if (j1939_ecu_is_mapped_locked(ecu))
+ /* ecu's sa is active already */
+ priv->ents[ecu->addr].nusers++;
+
+ done:
+ write_unlock_bh(&priv->lock);
+
+ return err;
+}
+
+void j1939_local_ecu_put(struct j1939_priv *priv, name_t name, u8 sa)
+{
+ struct j1939_ecu *ecu;
+
+ write_lock_bh(&priv->lock);
+
+ if (j1939_address_is_unicast(sa))
+ priv->ents[sa].nusers--; /* one local user of this SA less */
+
+ if (!name)
+ goto done; /* address-only user: no ECU accounting */
+
+ ecu = j1939_ecu_find_by_name_locked(priv, name);
+ if (WARN_ON_ONCE(!ecu))
+ goto done;
+
+ ecu->nusers--;
+ /* TODO: do we care if ecu->addr != sa? */
+ if (j1939_ecu_is_mapped_locked(ecu))
+ /* ecu's sa is active already */
+ priv->ents[ecu->addr].nusers--;
+ j1939_ecu_put(ecu); /* drops the reference kept by j1939_local_ecu_get() */
+
+ done:
+ write_unlock_bh(&priv->lock);
+}
diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h
new file mode 100644
index 000000000000..81f58924b4ac
--- /dev/null
+++ b/net/can/j1939/j1939-priv.h
@@ -0,0 +1,345 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+#ifndef _J1939_PRIV_H_
+#define _J1939_PRIV_H_
+
+#include <linux/can/j1939.h>
+#include <net/sock.h>
+
+/* Timeout to receive the abort signal over loop back. In case CAN
+ * bus is open, the timeout should be triggered.
+ */
+#define J1939_XTP_ABORT_TIMEOUT_MS 500
+#define J1939_SIMPLE_ECHO_TIMEOUT_MS (10 * 1000)
+
+struct j1939_session;
+enum j1939_sk_errqueue_type { /* the "type" argument of j1939_sk_errqueue() */
+ J1939_ERRQUEUE_TX_ACK,
+ J1939_ERRQUEUE_TX_SCHED,
+ J1939_ERRQUEUE_TX_ABORT,
+ J1939_ERRQUEUE_RX_RTS,
+ J1939_ERRQUEUE_RX_DPO,
+ J1939_ERRQUEUE_RX_ABORT,
+};
+
+/* j1939 devices */
+struct j1939_ecu {
+ struct list_head list; /* entry in priv->ecus */
+ name_t name;
+ u8 addr; /* J1939_IDLE_ADDR until an address claim has been seen */
+
+ /* address claim timer: when it fires, @addr gets mapped to this ecu */
+ struct hrtimer ac_timer;
+ struct kref kref;
+ struct j1939_priv *priv;
+
+ /* count users, to help transport protocol decide for interaction */
+ int nusers;
+};
+
+struct j1939_priv {
+ struct list_head ecus;
+ /* List of all ECUs (struct j1939_ecu) known on this device.
+ * It allows irq (& softirq) context lookups on j1939 devices.
+ * This approach (separate lists) was chosen because the other 2
+ * alternatives are not easier or are even wrong:
+ * 1) using the pure kobject methods involves mutexes, which are not
+ * allowed in irq context.
+ * 2) duplicating data structures would require a lot of synchronization
+ * code.
+ * Entries are linked in by j1939_ecu_create_locked().
+ */
+
+ /* protects the above list and the address table (ents) below */
+ rwlock_t lock;
+
+ struct net_device *ndev;
+
+ /* list of 256 ecu ptrs, that cache the claimed addresses.
+ * also protected by the above lock
+ */
+ struct j1939_addr_ent {
+ struct j1939_ecu *ecu;
+ /* count users, to help transport protocol */
+ int nusers;
+ } ents[256];
+
+ struct kref kref;
+
+ /* List of active sessions to prevent start of conflicting
+ * one.
+ *
+ * Do not start two sessions of same type, addresses and
+ * direction.
+ */
+ struct list_head active_session_list;
+
+ /* protects active_session_list */
+ spinlock_t active_session_list_lock;
+
+ unsigned int tp_max_packet_size;
+
+ /* lock for j1939_socks list */
+ rwlock_t j1939_socks_lock;
+ struct list_head j1939_socks;
+
+ struct kref rx_kref; /* taken in j1939_netdev_start(), put in j1939_netdev_stop() */
+ u32 rx_tskey;
+};
+
+void j1939_ecu_put(struct j1939_ecu *ecu);
+
+/* keep the cache of what is local */
+int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa);
+void j1939_local_ecu_put(struct j1939_priv *priv, name_t name, u8 sa);
+
+static inline bool j1939_address_is_unicast(u8 addr)
+{
+ return addr <= J1939_MAX_UNICAST_ADDR; /* the bound itself is still unicast */
+}
+
+static inline bool j1939_address_is_idle(u8 addr)
+{
+ return addr == J1939_IDLE_ADDR; /* also the addr a freshly created ecu starts with */
+}
+
+static inline bool j1939_address_is_valid(u8 addr)
+{
+ return addr != J1939_NO_ADDR; /* only J1939_NO_ADDR is rejected here */
+}
+
+static inline bool j1939_pgn_is_pdu1(pgn_t pgn)
+{
+ /* PDU1 if bits 8..15 of the PGN are below 0xf0; dp & res bits are ignored */
+ return (pgn & 0xff00) < 0xf000;
+}
+
+/* utility to correctly unmap an ECU */
+void j1939_ecu_unmap_locked(struct j1939_ecu *ecu);
+void j1939_ecu_unmap(struct j1939_ecu *ecu);
+
+u8 j1939_name_to_addr(struct j1939_priv *priv, name_t name);
+struct j1939_ecu *j1939_ecu_find_by_addr_locked(struct j1939_priv *priv,
+ u8 addr);
+struct j1939_ecu *j1939_ecu_get_by_addr(struct j1939_priv *priv, u8 addr);
+struct j1939_ecu *j1939_ecu_get_by_addr_locked(struct j1939_priv *priv,
+ u8 addr);
+struct j1939_ecu *j1939_ecu_get_by_name(struct j1939_priv *priv, name_t name);
+struct j1939_ecu *j1939_ecu_get_by_name_locked(struct j1939_priv *priv,
+ name_t name);
+
+enum j1939_transfer_type { /* stored in struct j1939_addr.type */
+ J1939_TP, /* the default that j1939_can_recv() sets on received frames */
+ J1939_ETP,
+ J1939_SIMPLE,
+};
+
+struct j1939_addr {
+ name_t src_name; /* 0 when no source NAME is set */
+ name_t dst_name; /* 0 when no destination NAME is set */
+ pgn_t pgn;
+
+ u8 sa; /* source address */
+ u8 da; /* destination address, J1939_NO_ADDR for broadcast */
+
+ u8 type; /* an enum j1939_transfer_type value */
+};
+
+/* control buffer of the sk_buff */
+struct j1939_sk_buff_cb {
+ /* Offset in bytes within one ETP session */
+ u32 offset;
+
+ /* for tx, MSG_SYN will be used to sync on sockets */
+ u32 msg_flags;
+ u32 tskey;
+
+ struct j1939_addr addr;
+
+ /* Flags for quick lookups during skb processing.
+ * These are set in the receive path only.
+ */
+#define J1939_ECU_LOCAL_SRC BIT(0)
+#define J1939_ECU_LOCAL_DST BIT(1)
+ u8 flags; /* bitmask of the J1939_ECU_LOCAL_* bits above */
+
+ priority_t priority; /* 0..7: rx masks with 0x7, tx replaces values > 7 */
+};
+
+/* view skb->cb as the j1939 control block; the size is checked at build time */
+static inline struct j1939_sk_buff_cb *j1939_skb_to_cb(const struct sk_buff *skb)
+{
+	BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct j1939_sk_buff_cb));
+
+	return (struct j1939_sk_buff_cb *)skb->cb;
+}
+
+int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb);
+void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb);
+bool j1939_sk_recv_match(struct j1939_priv *priv,
+ struct j1939_sk_buff_cb *skcb);
+void j1939_sk_send_loop_abort(struct sock *sk, int err);
+void j1939_sk_errqueue(struct j1939_session *session,
+ enum j1939_sk_errqueue_type type);
+void j1939_sk_queue_activate_next(struct j1939_session *session);
+
+/* stack entries */
+struct j1939_session *j1939_tp_send(struct j1939_priv *priv,
+ struct sk_buff *skb, size_t size);
+int j1939_tp_recv(struct j1939_priv *priv, struct sk_buff *skb);
+int j1939_ac_fixup(struct j1939_priv *priv, struct sk_buff *skb);
+void j1939_ac_recv(struct j1939_priv *priv, struct sk_buff *skb);
+void j1939_simple_recv(struct j1939_priv *priv, struct sk_buff *skb);
+
+/* network management */
+struct j1939_ecu *j1939_ecu_create_locked(struct j1939_priv *priv, name_t name);
+
+void j1939_ecu_timer_start(struct j1939_ecu *ecu);
+void j1939_ecu_timer_cancel(struct j1939_ecu *ecu);
+void j1939_ecu_unmap_all(struct j1939_priv *priv);
+
+struct j1939_priv *j1939_netdev_start(struct net_device *ndev);
+void j1939_netdev_stop(struct j1939_priv *priv);
+
+void j1939_priv_put(struct j1939_priv *priv);
+void j1939_priv_get(struct j1939_priv *priv);
+
+/* notify/alert all j1939 sockets bound to ifindex */
+void j1939_sk_netdev_event_netdown(struct j1939_priv *priv);
+void j1939_sk_netdev_event_unregister(struct j1939_priv *priv);
+int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk);
+void j1939_tp_init(struct j1939_priv *priv);
+
+/* decrement pending skb for a j1939 socket */
+void j1939_sock_pending_del(struct sock *sk);
+
+enum j1939_session_state { /* values of struct j1939_session.state */
+ J1939_SESSION_NEW,
+ J1939_SESSION_ACTIVE,
+ /* waiting for abort signal on the bus */
+ J1939_SESSION_WAITING_ABORT,
+ J1939_SESSION_ACTIVE_MAX, /* NOTE(review): looks like an upper bound marker for the active states - confirm */
+ J1939_SESSION_DONE,
+};
+
+struct j1939_session {
+ struct j1939_priv *priv;
+ struct list_head active_session_list_entry;
+ struct list_head sk_session_queue_entry;
+ struct kref kref;
+ struct sock *sk;
+
+ /* ifindex, src, dst, pgn define the session block
+ * they are _never_ modified after insertion in the list
+ * this decreases locking problems a _lot_
+ */
+ struct j1939_sk_buff_cb skcb;
+ struct sk_buff_head skb_queue;
+
+ /* all tx related stuff (last_txcmd, pkt.tx)
+ * is protected (modified only) with the txtimer hrtimer
+ * 'total' & 'block' are never changed,
+ * last_cmd, last & block are protected by ->lock
+ * this means that the tx may run after cts is received that should
+ * have stopped tx, but this time discrepancy is never avoided anyhow
+ */
+ u8 last_cmd, last_txcmd;
+ bool transmission;
+ bool extd;
+ /* Total message size, number of bytes */
+ unsigned int total_message_size;
+ /* Total number of bytes queued from socket to the session */
+ unsigned int total_queued_size;
+ unsigned int tx_retry;
+
+ int err;
+ u32 tskey;
+ enum j1939_session_state state;
+
+ /* Packet counters for a (extended) transfer session. A packet
+ * carries at most 7 bytes.
+ */
+ struct {
+ /* total - total number of packets for this session */
+ unsigned int total;
+ /* last - last packet of a transfer block after which
+ * responder should send ETP.CM_CTS and originator
+ * ETP.CM_DPO
+ */
+ unsigned int last;
+ /* tx - number of packets sent by originator node.
+ * this counter can be set back if responder node
+ * didn't receive all packets sent by originator.
+ */
+ unsigned int tx;
+ unsigned int tx_acked;
+ /* rx - number of packets received */
+ unsigned int rx;
+ /* block - amount of packets expected in one block */
+ unsigned int block;
+ /* dpo - ETP.CM_DPO, Data Packet Offset */
+ unsigned int dpo;
+ } pkt;
+ struct hrtimer txtimer, rxtimer;
+};
+
+struct j1939_sock {
+ struct sock sk; /* must be first to skip with memset */
+ struct j1939_priv *priv;
+ struct list_head list;
+
+#define J1939_SOCK_BOUND BIT(0)
+#define J1939_SOCK_CONNECTED BIT(1)
+#define J1939_SOCK_PROMISC BIT(2)
+#define J1939_SOCK_ERRQUEUE BIT(3)
+ int state; /* bitmask of the J1939_SOCK_* bits above */
+
+ int ifindex;
+ struct j1939_addr addr;
+ spinlock_t filters_lock;
+ struct j1939_filter *filters;
+ int nfilters;
+ pgn_t pgn_rx_filter;
+
+ /* j1939 may emit frames of equal PGN (not necessarily equal CAN ids)
+ * out of order once the transport protocol is involved.
+ * To allow emitting in order, keep a count of 'pending' packets.
+ */
+ atomic_t skb_pending;
+ wait_queue_head_t waitq;
+
+ /* lock for the sk_session_queue list */
+ spinlock_t sk_session_queue_lock;
+ struct list_head sk_session_queue;
+};
+
+static inline struct j1939_sock *j1939_sk(const struct sock *sk)
+{
+ return container_of(sk, struct j1939_sock, sk); /* sk is embedded in struct j1939_sock */
+}
+
+void j1939_session_get(struct j1939_session *session);
+void j1939_session_put(struct j1939_session *session);
+void j1939_session_skb_queue(struct j1939_session *session,
+ struct sk_buff *skb);
+int j1939_session_activate(struct j1939_session *session);
+void j1939_tp_schedule_txtimer(struct j1939_session *session, int msec);
+void j1939_session_timers_cancel(struct j1939_session *session);
+
+#define J1939_MIN_TP_PACKET_SIZE 9 /* bytes */
+#define J1939_MAX_TP_PACKET_SIZE (7 * 0xff) /* 0xff packets of 7 bytes each */
+#define J1939_MAX_ETP_PACKET_SIZE (7 * 0x00ffffff) /* 0xffffff packets of 7 bytes each */
+
+#define J1939_REGULAR 0
+#define J1939_EXTENDED 1
+
+/* CAN protocol */
+extern const struct can_proto j1939_can_proto;
+
+#endif /* _J1939_PRIV_H_ */
diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c
new file mode 100644
index 000000000000..a93af55df5fd
--- /dev/null
+++ b/net/can/j1939/main.c
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2010-2011 EIA Electronics,
+// Pieter Beyens <pieter.beyens@eia.be>
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2018 Protonic,
+// Robin van der Gracht <robin@protonic.nl>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+/* Core of can-j1939 that links j1939 to CAN. */
+
+#include <linux/can/can-ml.h>
+#include <linux/can/core.h>
+#include <linux/can/skb.h>
+#include <linux/if_arp.h>
+#include <linux/module.h>
+
+#include "j1939-priv.h"
+
+MODULE_DESCRIPTION("PF_CAN SAE J1939");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("EIA Electronics (Kurt Van Dijck & Pieter Beyens)");
+MODULE_ALIAS("can-proto-" __stringify(CAN_J1939));
+
+/* LOWLEVEL CAN interface */
+
+/* CAN_HDR: #bytes before can_frame data part */
+#define J1939_CAN_HDR (offsetof(struct can_frame, data))
+
+/* lowest layer */
+static void j1939_can_recv(struct sk_buff *iskb, void *data)
+{
+ struct j1939_priv *priv = data;
+ struct sk_buff *skb;
+ struct j1939_sk_buff_cb *skcb, *iskcb;
+ struct can_frame *cf;
+
+ /* make sure we only get Classical CAN frames */
+ if (!can_is_can_skb(iskb))
+ return;
+
+ /* Work on a clone of the skb.
+ * j1939 only delivers the real data bytes, the header goes into
+ * sockaddr. The pull/trim below must not be applied to the
+ * incoming skb itself.
+ */
+ skb = skb_clone(iskb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ j1939_priv_get(priv);
+ can_skb_set_owner(skb, iskb->sk);
+
+ /* get a pointer to the header of the skb
+ * the skb payload (pointer) is moved, so that the next skb_data
+ * returns the actual payload
+ */
+ cf = (void *)skb->data;
+ skb_pull(skb, J1939_CAN_HDR);
+
+ /* fix length, set to dlc, with 8 maximum */
+ skb_trim(skb, min_t(uint8_t, cf->len, 8));
+
+ /* set addr */
+ skcb = j1939_skb_to_cb(skb);
+ memset(skcb, 0, sizeof(*skcb));
+
+ iskcb = j1939_skb_to_cb(iskb);
+ skcb->tskey = iskcb->tskey;
+ skcb->priority = (cf->can_id >> 26) & 0x7;
+ skcb->addr.sa = cf->can_id; /* u8 field: keeps the low 8 bits of the CAN id */
+ skcb->addr.pgn = (cf->can_id >> 8) & J1939_PGN_MAX;
+ /* set default message type */
+ skcb->addr.type = J1939_TP;
+
+ if (!j1939_address_is_valid(skcb->addr.sa)) {
+ netdev_err_once(priv->ndev, "%s: sa is broadcast address, ignoring!\n",
+ __func__);
+ goto done;
+ }
+
+ if (j1939_pgn_is_pdu1(skcb->addr.pgn)) {
+ /* Type 1: with destination address */
+ skcb->addr.da = skcb->addr.pgn; /* u8 field: keeps the low byte of the PGN */
+ /* normalize pgn: strip dst address */
+ skcb->addr.pgn &= 0x3ff00; /* NOTE(review): presumably J1939_PGN_PDU1_MAX - confirm */
+ } else {
+ /* set broadcast address */
+ skcb->addr.da = J1939_NO_ADDR;
+ }
+
+ /* flag unicast addresses that have local users (nusers != 0) */
+ read_lock_bh(&priv->lock);
+ if (j1939_address_is_unicast(skcb->addr.sa) &&
+ priv->ents[skcb->addr.sa].nusers)
+ skcb->flags |= J1939_ECU_LOCAL_SRC;
+ if (j1939_address_is_unicast(skcb->addr.da) &&
+ priv->ents[skcb->addr.da].nusers)
+ skcb->flags |= J1939_ECU_LOCAL_DST;
+ read_unlock_bh(&priv->lock);
+
+ /* deliver into the j1939 stack ... */
+ j1939_ac_recv(priv, skb);
+
+ if (j1939_tp_recv(priv, skb))
+ /* this means the transport layer processed the message */
+ goto done;
+
+ j1939_simple_recv(priv, skb);
+ j1939_sk_recv(priv, skb);
+ done:
+ j1939_priv_put(priv); /* pairs with j1939_priv_get() above */
+ kfree_skb(skb); /* frees our clone only */
+}
+
+/* NETDEV MANAGEMENT */
+
+/* values for can_rx_(un)register */
+#define J1939_CAN_ID CAN_EFF_FLAG
+#define J1939_CAN_MASK (CAN_EFF_FLAG | CAN_RTR_FLAG)
+
+static DEFINE_MUTEX(j1939_netdev_lock);
+
+/* allocate a priv for @ndev: both krefs are initialized and the netdev is held */
+static struct j1939_priv *j1939_priv_create(struct net_device *ndev)
+{
+	struct j1939_priv *priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+
+	if (!priv)
+		return NULL;
+
+	priv->ndev = ndev;
+	dev_hold(ndev);
+	kref_init(&priv->kref);
+	kref_init(&priv->rx_kref);
+	rwlock_init(&priv->lock);
+	INIT_LIST_HEAD(&priv->ecus);
+
+	netdev_dbg(priv->ndev, "%s : 0x%p\n", __func__, priv);
+
+	return priv;
+}
+
+/* store @priv (or NULL to clear it) in the CAN ml_priv of @ndev */
+static inline void j1939_priv_set(struct net_device *ndev, struct j1939_priv *priv)
+{
+	struct can_ml_priv *ml = can_get_ml_priv(ndev);
+
+	ml->j1939_priv = priv;
+}
+
+/* release callback of priv->kref: sanity checks, then unhold netdev and free */
+static void __j1939_priv_release(struct kref *kref)
+{
+	struct j1939_priv *priv = container_of(kref, struct j1939_priv, kref);
+	struct net_device *ndev = priv->ndev;
+
+	netdev_dbg(ndev, "%s: 0x%p\n", __func__, priv);
+	WARN_ON_ONCE(!list_empty(&priv->active_session_list));
+	WARN_ON_ONCE(!list_empty(&priv->ecus));
+	WARN_ON_ONCE(!list_empty(&priv->j1939_socks));
+
+	dev_put(ndev);
+	kfree(priv);
+}
+
+void j1939_priv_put(struct j1939_priv *priv)
+{
+ kref_put(&priv->kref, __j1939_priv_release); /* the final put releases priv */
+}
+
+void j1939_priv_get(struct j1939_priv *priv)
+{
+ kref_get(&priv->kref); /* to be balanced by j1939_priv_put() */
+}
+
+static int j1939_can_rx_register(struct j1939_priv *priv)
+{
+	struct net_device *ndev = priv->ndev;
+	int ret;
+
+	/* the registered receive handler gets its own reference on priv ... */
+	j1939_priv_get(priv);
+	ret = can_rx_register(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK,
+			      j1939_can_recv, priv, "j1939", NULL);
+	if (ret >= 0)
+		return 0;
+	/* ... which is handed back when registering failed */
+	j1939_priv_put(priv);
+	return ret;
+}
+
+static void j1939_can_rx_unregister(struct j1939_priv *priv)
+{
+ struct net_device *ndev = priv->ndev;
+
+ can_rx_unregister(dev_net(ndev), ndev, J1939_CAN_ID, J1939_CAN_MASK,
+ j1939_can_recv, priv);
+
+ /* The last reference of priv is dropped by the RCU deferred
+ * j1939_sk_sock_destruct() of the last socket, so we can
+ * safely drop the j1939_can_rx_register() reference here.
+ */
+ j1939_priv_put(priv);
+}
+
+static void __j1939_rx_release(struct kref *kref)
+ __releases(&j1939_netdev_lock)
+{
+ struct j1939_priv *priv = container_of(kref, struct j1939_priv,
+ rx_kref);
+
+ j1939_can_rx_unregister(priv);
+ j1939_ecu_unmap_all(priv);
+ j1939_priv_set(priv->ndev, NULL); /* later lookups on this ndev find no priv */
+ mutex_unlock(&j1939_netdev_lock); /* taken by kref_put_mutex() in j1939_netdev_stop() */
+}
+
+/* Look up the j1939 priv stored for @ndev (NULL once it has been cleared).
+ * The ref counter is not increased.
+ */
+static inline struct j1939_priv *j1939_ndev_to_priv(struct net_device *ndev)
+{
+	return can_get_ml_priv(ndev)->j1939_priv;
+}
+
+/* like j1939_ndev_to_priv(), but a priv that is found gets its kref increased */
+static struct j1939_priv *j1939_priv_get_by_ndev_locked(struct net_device *ndev)
+{
+	struct j1939_priv *priv;
+
+	lockdep_assert_held(&j1939_netdev_lock);
+
+	priv = j1939_ndev_to_priv(ndev);
+	if (priv)
+		j1939_priv_get(priv);
+	return priv;
+}
+
+/* as j1939_priv_get_by_ndev_locked(), but takes j1939_netdev_lock itself */
+static struct j1939_priv *j1939_priv_get_by_ndev(struct net_device *ndev)
+{
+	struct j1939_priv *priv;
+
+	mutex_lock(&j1939_netdev_lock);
+	priv = j1939_priv_get_by_ndev_locked(ndev);
+	mutex_unlock(&j1939_netdev_lock);
+	return priv;
+}
+
+/* Get the j1939_priv for @ndev, creating and registering it on first use.
+ *
+ * If a priv is already attached to @ndev, a priv reference and an rx_kref
+ * reference are taken and that priv is returned. Otherwise a new priv is
+ * created outside of j1939_netdev_lock and the lookup is repeated under the
+ * lock, to catch a concurrent creator, before the new priv is published and
+ * the CAN receiver is registered.
+ *
+ * Returns the priv, ERR_PTR(-ENOMEM) if j1939_priv_create() fails, or the
+ * ERR_PTR()-encoded error of j1939_can_rx_register().
+ */
+struct j1939_priv *j1939_netdev_start(struct net_device *ndev)
+{
+ struct j1939_priv *priv, *priv_new;
+ int ret;
+
+ /* fast path: the interface already has a priv */
+ mutex_lock(&j1939_netdev_lock);
+ priv = j1939_priv_get_by_ndev_locked(ndev);
+ if (priv) {
+ kref_get(&priv->rx_kref);
+ mutex_unlock(&j1939_netdev_lock);
+ return priv;
+ }
+ mutex_unlock(&j1939_netdev_lock);
+
+ /* allocate and initialize without holding the mutex */
+ priv = j1939_priv_create(ndev);
+ if (!priv)
+ return ERR_PTR(-ENOMEM);
+
+ j1939_tp_init(priv);
+ rwlock_init(&priv->j1939_socks_lock);
+ INIT_LIST_HEAD(&priv->j1939_socks);
+
+ mutex_lock(&j1939_netdev_lock);
+ priv_new = j1939_priv_get_by_ndev_locked(ndev);
+ if (priv_new) {
+ /* Someone was faster than us, use their priv and roll
+ * back ours.
+ * NOTE(review): dev_put() presumably balances a device hold
+ * taken in j1939_priv_create() - confirm.
+ */
+ kref_get(&priv_new->rx_kref);
+ mutex_unlock(&j1939_netdev_lock);
+ dev_put(ndev);
+ kfree(priv);
+ return priv_new;
+ }
+ /* publish our priv, then start receiving for it */
+ j1939_priv_set(ndev, priv);
+
+ ret = j1939_can_rx_register(priv);
+ if (ret < 0)
+ goto out_priv_put;
+
+ mutex_unlock(&j1939_netdev_lock);
+ return priv;
+
+ out_priv_put:
+ /* unpublish before unlocking so nobody can look up the dying priv */
+ j1939_priv_set(ndev, NULL);
+ mutex_unlock(&j1939_netdev_lock);
+
+ dev_put(ndev);
+ kfree(priv);
+
+ return ERR_PTR(ret);
+}
+
+/* Counterpart of j1939_netdev_start(): drop the rx_kref reference, running
+ * __j1939_rx_release() under j1939_netdev_lock if it was the last one, and
+ * then drop the priv reference.
+ */
+void j1939_netdev_stop(struct j1939_priv *priv)
+{
+ kref_put_mutex(&priv->rx_kref, __j1939_rx_release, &j1939_netdev_lock);
+ j1939_priv_put(priv);
+}
+
+/* Turn @skb (J1939 payload plus control buffer) back into a CAN frame and
+ * hand it to can_send().
+ *
+ * The PGN and priority found in the control buffer are sanitized,
+ * j1939_ac_fixup() is applied and the CAN header is pushed in front of the
+ * payload. If j1939_ac_fixup() fails, the skb is freed here and its error
+ * is returned; otherwise the result of can_send() is returned.
+ */
+int j1939_send_one(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ int ret, dlc;
+ canid_t canid;
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct can_frame *cf;
+
+ /* apply sanity checks */
+ if (j1939_pgn_is_pdu1(skcb->addr.pgn))
+ skcb->addr.pgn &= J1939_PGN_PDU1_MAX;
+ else
+ skcb->addr.pgn &= J1939_PGN_MAX;
+
+ /* out-of-range priorities are replaced by 6 */
+ if (skcb->priority > 7)
+ skcb->priority = 6;
+
+ ret = j1939_ac_fixup(priv, skb);
+ if (unlikely(ret))
+ goto failed;
+ /* payload length before the header is pushed */
+ dlc = skb->len;
+
+ /* re-claim the CAN_HDR from the SKB */
+ cf = skb_push(skb, J1939_CAN_HDR);
+
+ /* initialize header structure */
+ memset(cf, 0, J1939_CAN_HDR);
+
+ /* make it a full can frame again
+ * NOTE(review): assumes dlc <= 8 at this point - confirm against callers
+ */
+ skb_put_zero(skb, 8 - dlc);
+
+ /* extended frame id: priority from bit 26, PGN from bit 8, source
+ * address in the low byte; PDU1 PGNs get the destination address
+ * OR-ed in from bit 8
+ */
+ canid = CAN_EFF_FLAG |
+ (skcb->priority << 26) |
+ (skcb->addr.pgn << 8) |
+ skcb->addr.sa;
+ if (j1939_pgn_is_pdu1(skcb->addr.pgn))
+ canid |= skcb->addr.da << 8;
+
+ cf->can_id = canid;
+ cf->len = dlc;
+
+ return can_send(skb, 1);
+
+ failed:
+ kfree_skb(skb);
+ return ret;
+}
+
+/* netdevice notifier callback.
+ *
+ * Only devices that carry CAN ml_priv data and have a j1939_priv attached
+ * are handled; a temporary priv reference is held while NETDEV_DOWN or
+ * NETDEV_UNREGISTER is processed. All other events are ignored.
+ *
+ * Always returns NOTIFY_DONE.
+ */
+static int j1939_netdev_notify(struct notifier_block *nb,
+ unsigned long msg, void *data)
+{
+ struct net_device *ndev = netdev_notifier_info_to_dev(data);
+ struct can_ml_priv *can_ml = can_get_ml_priv(ndev);
+ struct j1939_priv *priv;
+
+ if (!can_ml)
+ goto notify_done;
+
+ priv = j1939_priv_get_by_ndev(ndev);
+ if (!priv)
+ goto notify_done;
+
+ switch (msg) {
+ case NETDEV_DOWN:
+ j1939_cancel_active_session(priv, NULL);
+ j1939_sk_netdev_event_netdown(priv);
+ j1939_ecu_unmap_all(priv);
+ break;
+ case NETDEV_UNREGISTER:
+ j1939_cancel_active_session(priv, NULL);
+ j1939_sk_netdev_event_netdown(priv);
+ j1939_sk_netdev_event_unregister(priv);
+ break;
+ }
+
+ /* drop the reference taken by j1939_priv_get_by_ndev() */
+ j1939_priv_put(priv);
+
+notify_done:
+ return NOTIFY_DONE;
+}
+
+/* registered in j1939_module_init(), unregistered in j1939_module_exit() */
+static struct notifier_block j1939_netdev_notifier = {
+ .notifier_call = j1939_netdev_notify,
+};
+
+/* MODULE interface */
+
+/* Register the netdevice notifier first and the J1939 CAN protocol second;
+ * the notifier is unregistered again if the protocol registration fails.
+ * Returns 0 on success or the error of the step that failed.
+ */
+static __init int j1939_module_init(void)
+{
+ int ret;
+
+ pr_info("can: SAE J1939\n");
+
+ ret = register_netdevice_notifier(&j1939_netdev_notifier);
+ if (ret)
+ goto fail_notifier;
+
+ ret = can_proto_register(&j1939_can_proto);
+ if (ret < 0) {
+ pr_err("can: registration of j1939 protocol failed\n");
+ goto fail_sk;
+ }
+
+ return 0;
+
+ fail_sk:
+ unregister_netdevice_notifier(&j1939_netdev_notifier);
+ fail_notifier:
+ return ret;
+}
+
+/* Undo j1939_module_init() in reverse order: protocol first, notifier last. */
+static __exit void j1939_module_exit(void)
+{
+ can_proto_unregister(&j1939_can_proto);
+
+ unregister_netdevice_notifier(&j1939_netdev_notifier);
+}
+
+module_init(j1939_module_init);
+module_exit(j1939_module_exit);
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
new file mode 100644
index 000000000000..6272326dd614
--- /dev/null
+++ b/net/can/j1939/socket.c
@@ -0,0 +1,1393 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2010-2011 EIA Electronics,
+// Pieter Beyens <pieter.beyens@eia.be>
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2018 Protonic,
+// Robin van der Gracht <robin@protonic.nl>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/can/can-ml.h>
+#include <linux/can/core.h>
+#include <linux/can/skb.h>
+#include <linux/errqueue.h>
+#include <linux/if_arp.h>
+
+#include "j1939-priv.h"
+
+#define J1939_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.j1939)
+
+/* Map the Linux struct sock::sk_priority to the j1939 priority field.
+ *
+ * sk_priority values above 7 are treated as 7; the scale is inverted, so
+ * sk_priority 7 yields j1939 priority 0 and sk_priority 0 yields 7.
+ */
+static inline priority_t j1939_prio(u32 sk_priority)
+{
+	if (sk_priority > 7U)
+		sk_priority = 7U;
+
+	return 7 - sk_priority;
+}
+
+/* Inverse mapping of j1939_prio(): j1939 priority -> sk_priority (7 - prio).
+ * No range check is done on @prio here.
+ */
+static inline u32 j1939_to_sk_priority(priority_t prio)
+{
+ return 7 - prio;
+}
+
+/* function to see if pgn is to be evaluated: true for any value up to and
+ * including J1939_PGN_MAX
+ */
+static inline bool j1939_pgn_is_valid(pgn_t pgn)
+{
+ return pgn <= J1939_PGN_MAX;
+}
+
+/* Reject PDU1 PGNs whose low byte (the DA placeholder) is non-zero.
+ * PGNs that are not PDU1 always pass.
+ */
+static inline bool j1939_pgn_is_clean_pdu(pgn_t pgn)
+{
+	if (!j1939_pgn_is_pdu1(pgn))
+		return true;
+
+	return (pgn & 0xff) == 0;
+}
+
+/* Count one more pending item on @sk (jsk->skb_pending). */
+static inline void j1939_sock_pending_add(struct sock *sk)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ atomic_inc(&jsk->skb_pending);
+}
+
+/* Return the current value of the jsk->skb_pending counter of @sk. */
+static int j1939_sock_pending_get(struct sock *sk)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ return atomic_read(&jsk->skb_pending);
+}
+
+/* Count one pending item less on @sk and wake up waiters on jsk->waitq
+ * (e.g. j1939_sk_release()) when the counter reaches zero.
+ */
+void j1939_sock_pending_del(struct sock *sk)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ /* atomic_dec_return returns the new value */
+ if (!atomic_dec_return(&jsk->skb_pending))
+ wake_up(&jsk->waitq); /* no pending SKB's */
+}
+
+/* Mark @jsk as bound, take a priv reference for it and append it to
+ * priv->j1939_socks under the write lock.
+ */
+static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk)
+{
+ jsk->state |= J1939_SOCK_BOUND;
+ j1939_priv_get(priv);
+
+ write_lock_bh(&priv->j1939_socks_lock);
+ list_add_tail(&jsk->list, &priv->j1939_socks);
+ write_unlock_bh(&priv->j1939_socks_lock);
+}
+
+/* Undo j1939_jsk_add(): unlink @jsk from priv->j1939_socks under the write
+ * lock, drop the priv reference and clear J1939_SOCK_BOUND.
+ */
+static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk)
+{
+ write_lock_bh(&priv->j1939_socks_lock);
+ list_del_init(&jsk->list);
+ write_unlock_bh(&priv->j1939_socks_lock);
+
+ j1939_priv_put(priv);
+ jsk->state &= ~J1939_SOCK_BOUND;
+}
+
+/* Append @session to the session queue of its socket.
+ *
+ * The queue holds its own session reference, and the session is counted as
+ * pending on the socket. Returns true if the queue was empty before the
+ * insertion.
+ */
+static bool j1939_sk_queue_session(struct j1939_session *session)
+{
+ struct j1939_sock *jsk = j1939_sk(session->sk);
+ bool empty;
+
+ spin_lock_bh(&jsk->sk_session_queue_lock);
+ empty = list_empty(&jsk->sk_session_queue);
+ j1939_session_get(session);
+ list_add_tail(&session->sk_session_queue_entry, &jsk->sk_session_queue);
+ spin_unlock_bh(&jsk->sk_session_queue_lock);
+ j1939_sock_pending_add(&jsk->sk);
+
+ return empty;
+}
+
+/* Return the most recently queued session of @jsk if not all of its data
+ * has been queued yet (total_queued_size != total_message_size).
+ *
+ * A session reference is taken for the caller. Returns NULL if the queue is
+ * empty or the last session is already complete.
+ */
+static struct
+j1939_session *j1939_sk_get_incomplete_session(struct j1939_sock *jsk)
+{
+	struct j1939_session *last = NULL;
+
+	spin_lock_bh(&jsk->sk_session_queue_lock);
+	if (list_empty(&jsk->sk_session_queue))
+		goto out_unlock;
+
+	last = list_last_entry(&jsk->sk_session_queue,
+			       struct j1939_session,
+			       sk_session_queue_entry);
+	if (last->total_queued_size != last->total_message_size)
+		j1939_session_get(last);
+	else
+		last = NULL;
+
+out_unlock:
+	spin_unlock_bh(&jsk->sk_session_queue_lock);
+
+	return last;
+}
+
+/* Empty the session queue of @jsk: every queued session is unlinked, gets
+ * @err stored in session->err and loses the reference held by the queue.
+ */
+static void j1939_sk_queue_drop_all(struct j1939_priv *priv,
+ struct j1939_sock *jsk, int err)
+{
+ struct j1939_session *session, *tmp;
+
+ netdev_dbg(priv->ndev, "%s: err: %i\n", __func__, err);
+ spin_lock_bh(&jsk->sk_session_queue_lock);
+ list_for_each_entry_safe(session, tmp, &jsk->sk_session_queue,
+ sk_session_queue_entry) {
+ list_del_init(&session->sk_session_queue_entry);
+ session->err = err;
+ j1939_session_put(session);
+ }
+ spin_unlock_bh(&jsk->sk_session_queue_lock);
+}
+
+/* Remove @session from the head of its socket's session queue and activate
+ * the session that follows it.
+ *
+ * Must be called with jsk->sk_session_queue_lock held. Nothing happens if
+ * @session has no socket or is not the first queue entry. Successors that
+ * cannot be activated are marked with -EBUSY and dropped from the queue as
+ * well, until one activates or the queue is empty.
+ */
+static void j1939_sk_queue_activate_next_locked(struct j1939_session *session)
+{
+ struct j1939_sock *jsk;
+ struct j1939_session *first;
+ int err;
+
+ /* RX-Session don't have a socket (yet) */
+ if (!session->sk)
+ return;
+
+ jsk = j1939_sk(session->sk);
+ lockdep_assert_held(&jsk->sk_session_queue_lock);
+
+ /* remember how the finished session ended, see time_ms below */
+ err = session->err;
+
+ first = list_first_entry_or_null(&jsk->sk_session_queue,
+ struct j1939_session,
+ sk_session_queue_entry);
+
+ /* Someone else has already activated the next session */
+ if (first != session)
+ return;
+
+activate_next:
+ /* unlink the current head and drop the queue's reference on it */
+ list_del_init(&first->sk_session_queue_entry);
+ j1939_session_put(first);
+ first = list_first_entry_or_null(&jsk->sk_session_queue,
+ struct j1939_session,
+ sk_session_queue_entry);
+ if (!first)
+ return;
+
+ if (j1939_session_activate(first)) {
+ netdev_warn_once(first->priv->ndev,
+ "%s: 0x%p: Identical session is already activated.\n",
+ __func__, first);
+ first->err = -EBUSY;
+ goto activate_next;
+ } else {
+ /* Give receiver some time (arbitrary chosen) to recover */
+ int time_ms = 0;
+
+ /* only delay (10..25 ms) if the previous session had an error */
+ if (err)
+ time_ms = 10 + get_random_u32_below(16);
+
+ j1939_tp_schedule_txtimer(first, time_ms);
+ }
+}
+
+/* Locking wrapper around j1939_sk_queue_activate_next_locked(); a session
+ * without a socket is ignored.
+ */
+void j1939_sk_queue_activate_next(struct j1939_session *session)
+{
+ struct j1939_sock *jsk;
+
+ if (!session->sk)
+ return;
+
+ jsk = j1939_sk(session->sk);
+
+ spin_lock_bh(&jsk->sk_session_queue_lock);
+ j1939_sk_queue_activate_next_locked(session);
+ spin_unlock_bh(&jsk->sk_session_queue_lock);
+}
+
+/* Check the addressing in @skcb against the bind()/connect() state of @jsk.
+ *
+ * Promiscuous sockets accept everything. Otherwise the destination has to
+ * match the bound name/address (or be a broadcast on a SO_BROADCAST
+ * socket), a connected socket additionally requires the source to match
+ * its peer, and a valid pgn_rx_filter has to equal the frame's PGN.
+ */
+static bool j1939_sk_match_dst(struct j1939_sock *jsk,
+ const struct j1939_sk_buff_cb *skcb)
+{
+ if ((jsk->state & J1939_SOCK_PROMISC))
+ return true;
+
+ /* Destination address filter */
+ if (jsk->addr.src_name && skcb->addr.dst_name) {
+ /* both sides carry a NAME: compare NAMEs, ignore addresses */
+ if (jsk->addr.src_name != skcb->addr.dst_name)
+ return false;
+ } else {
+ /* receive (all sockets) if
+ * - all packages that match our bind() address
+ * - all broadcast on a socket if SO_BROADCAST
+ * is set
+ */
+ if (j1939_address_is_unicast(skcb->addr.da)) {
+ if (jsk->addr.sa != skcb->addr.da)
+ return false;
+ } else if (!sock_flag(&jsk->sk, SOCK_BROADCAST)) {
+ /* receiving broadcast without SO_BROADCAST
+ * flag is not allowed
+ */
+ return false;
+ }
+ }
+
+ /* Source address filter */
+ if (jsk->state & J1939_SOCK_CONNECTED) {
+ /* receive (all sockets) if
+ * - all packages that match our connect() name or address
+ */
+ if (jsk->addr.dst_name && skcb->addr.src_name) {
+ if (jsk->addr.dst_name != skcb->addr.src_name)
+ return false;
+ } else {
+ if (jsk->addr.da != skcb->addr.sa)
+ return false;
+ }
+ }
+
+ /* PGN filter */
+ if (j1939_pgn_is_valid(jsk->pgn_rx_filter) &&
+ jsk->pgn_rx_filter != skcb->addr.pgn)
+ return false;
+
+ return true;
+}
+
+/* Match the address information of an skb control buffer against the
+ * j1939 filters installed on @jsk.
+ *
+ * A socket without filters accepts everything. Otherwise one filter has to
+ * match on PGN, source address and source NAME after masking. The filter
+ * array is only walked while filters_lock is held.
+ */
+static bool j1939_sk_match_filter(struct j1939_sock *jsk,
+				  const struct j1939_sk_buff_cb *skcb)
+{
+	const struct j1939_filter *f;
+	bool matched;
+	int left;
+
+	spin_lock_bh(&jsk->filters_lock);
+
+	left = jsk->nfilters;
+	/* receive all when no filters are assigned */
+	matched = !left;
+
+	for (f = jsk->filters; !matched && left; ++f, --left)
+		matched = (skcb->addr.pgn & f->pgn_mask) == f->pgn &&
+			  (skcb->addr.sa & f->addr_mask) == f->addr &&
+			  (skcb->addr.src_name & f->name_mask) == f->name;
+
+	spin_unlock_bh(&jsk->filters_lock);
+
+	return matched;
+}
+
+/* A frame described by @skcb is of interest to @jsk if the socket is bound
+ * and both the address check and the filter check accept it. The checks
+ * run in that order and stop at the first one that fails.
+ */
+static bool j1939_sk_recv_match_one(struct j1939_sock *jsk,
+				    const struct j1939_sk_buff_cb *skcb)
+{
+	return (jsk->state & J1939_SOCK_BOUND) &&
+	       j1939_sk_match_dst(jsk, skcb) &&
+	       j1939_sk_match_filter(jsk, skcb);
+}
+
+/* Deliver a clone of @oskb to @jsk if the socket wants it.
+ *
+ * Frames whose owner is this very socket are skipped, as are frames that
+ * j1939_sk_recv_match_one() rejects. A clone that cannot be queued on the
+ * receive queue is dropped with the reason reported by the queueing call.
+ */
+static void j1939_sk_recv_one(struct j1939_sock *jsk, struct sk_buff *oskb)
+{
+ const struct j1939_sk_buff_cb *oskcb = j1939_skb_to_cb(oskb);
+ struct j1939_sk_buff_cb *skcb;
+ enum skb_drop_reason reason;
+ struct sk_buff *skb;
+
+ if (oskb->sk == &jsk->sk)
+ return;
+
+ if (!j1939_sk_recv_match_one(jsk, oskcb))
+ return;
+
+ skb = skb_clone(oskb, GFP_ATOMIC);
+ if (!skb) {
+ pr_warn("skb clone failed\n");
+ return;
+ }
+ can_skb_set_owner(skb, oskb->sk);
+
+ /* MSG_DONTROUTE is set exactly when the clone has an owning socket
+ * NOTE(review): presumably this marks locally generated frames for
+ * recvmsg() users - confirm.
+ */
+ skcb = j1939_skb_to_cb(skb);
+ skcb->msg_flags &= ~(MSG_DONTROUTE);
+ if (skb->sk)
+ skcb->msg_flags |= MSG_DONTROUTE;
+
+ if (sock_queue_rcv_skb_reason(&jsk->sk, skb, &reason) < 0)
+ sk_skb_reason_drop(&jsk->sk, skb, reason);
+}
+
+/* Tell whether at least one socket on @priv would accept a frame with the
+ * addressing described by @skcb. The socket list is walked under the read
+ * lock and the walk stops at the first match.
+ */
+bool j1939_sk_recv_match(struct j1939_priv *priv, struct j1939_sk_buff_cb *skcb)
+{
+	struct j1939_sock *jsk;
+	bool found = false;
+
+	read_lock_bh(&priv->j1939_socks_lock);
+	list_for_each_entry(jsk, &priv->j1939_socks, list) {
+		if (j1939_sk_recv_match_one(jsk, skcb)) {
+			found = true;
+			break;
+		}
+	}
+	read_unlock_bh(&priv->j1939_socks_lock);
+
+	return found;
+}
+
+/* Offer @skb to every socket on priv->j1939_socks; each interested socket
+ * gets its own clone through j1939_sk_recv_one(). @skb itself is not
+ * consumed here.
+ */
+void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sock *jsk;
+
+ read_lock_bh(&priv->j1939_socks_lock);
+ list_for_each_entry(jsk, &priv->j1939_socks, list) {
+ j1939_sk_recv_one(jsk, skb);
+ }
+ read_unlock_bh(&priv->j1939_socks_lock);
+}
+
+/* sk->sk_destruct handler, installed by j1939_sk_init(): drops the priv
+ * reference stored in jsk->priv (if any) and then chains to the generic
+ * can_sock_destruct().
+ */
+static void j1939_sk_sock_destruct(struct sock *sk)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ /* This function will be called by the generic networking code, when
+ * the socket is ultimately closed (sk->sk_destruct).
+ *
+ * The race between
+ * - processing a received CAN frame
+ * (can_receive -> j1939_can_recv)
+ * and accessing j1939_priv
+ * ... and ...
+ * - closing a socket
+ * (j1939_can_rx_unregister -> can_rx_unregister)
+ * and calling the final j1939_priv_put()
+ *
+ * is avoided by calling the final j1939_priv_put() from this
+ * RCU deferred cleanup call.
+ */
+ if (jsk->priv) {
+ j1939_priv_put(jsk->priv);
+ jsk->priv = NULL;
+ }
+
+ /* call generic CAN sock destruct */
+ can_sock_destruct(sk);
+}
+
+/* Initialize the j1939 specific part of a freshly allocated socket.
+ *
+ * Everything behind the embedded struct sock is zeroed, then the lists,
+ * locks and wait queue are set up and the defaults are applied: j1939
+ * priority 6, no source/destination address, no PGN and no PGN rx filter.
+ * Always returns 0.
+ */
+static int j1939_sk_init(struct sock *sk)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ /* Ensure that "sk" is first member in "struct j1939_sock", so that we
+ * can skip it during memset().
+ */
+ BUILD_BUG_ON(offsetof(struct j1939_sock, sk) != 0);
+ memset((void *)jsk + sizeof(jsk->sk), 0x0,
+ sizeof(*jsk) - sizeof(jsk->sk));
+
+ INIT_LIST_HEAD(&jsk->list);
+ init_waitqueue_head(&jsk->waitq);
+ jsk->sk.sk_priority = j1939_to_sk_priority(6);
+ jsk->sk.sk_reuse = 1; /* per default */
+ jsk->addr.sa = J1939_NO_ADDR;
+ jsk->addr.da = J1939_NO_ADDR;
+ jsk->addr.pgn = J1939_NO_PGN;
+ jsk->pgn_rx_filter = J1939_NO_PGN;
+ atomic_set(&jsk->skb_pending, 0);
+ spin_lock_init(&jsk->sk_session_queue_lock);
+ INIT_LIST_HEAD(&jsk->sk_session_queue);
+ spin_lock_init(&jsk->filters_lock);
+
+ /* j1939_sk_sock_destruct() depends on SOCK_RCU_FREE flag */
+ sock_set_flag(sk, SOCK_RCU_FREE);
+ sk->sk_destruct = j1939_sk_sock_destruct;
+ sk->sk_protocol = CAN_J1939;
+
+ return 0;
+}
+
+/* Validate a user supplied sockaddr_can for bind()/connect().
+ *
+ * Returns 0 if usable, -EDESTADDRREQ without an address, -EINVAL if it is
+ * shorter than J1939_MIN_NAMELEN, not AF_CAN, or carries a valid PDU1 PGN
+ * with a non-zero low byte, and -ENODEV if no interface index is given.
+ */
+static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len)
+{
+ if (!addr)
+ return -EDESTADDRREQ;
+ if (len < J1939_MIN_NAMELEN)
+ return -EINVAL;
+ if (addr->can_family != AF_CAN)
+ return -EINVAL;
+ if (!addr->can_ifindex)
+ return -ENODEV;
+ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) &&
+ !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn))
+ return -EINVAL;
+
+ return 0;
+}
+
+/* bind() handler.
+ *
+ * On the first bind the interface is looked up by addr->can_ifindex; it
+ * has to carry CAN ml_priv data and be IFF_UP. j1939_netdev_start() then
+ * provides the priv, and an additional priv reference is stored in
+ * jsk->priv. On a re-bind only the same interface is accepted, and the
+ * socket-list and local-ECU references of the old binding are dropped
+ * before the new address is applied.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int j1939_sk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct j1939_sock *jsk = j1939_sk(sock->sk);
+ struct j1939_priv *priv;
+ struct sock *sk;
+ struct net *net;
+ int ret = 0;
+
+ ret = j1939_sk_sanity_check(addr, len);
+ if (ret)
+ return ret;
+
+ lock_sock(sock->sk);
+
+ priv = jsk->priv;
+ sk = sock->sk;
+ net = sock_net(sk);
+
+ /* Already bound to an interface? */
+ if (jsk->state & J1939_SOCK_BOUND) {
+ /* A re-bind() to a different interface is not
+ * supported.
+ */
+ if (jsk->ifindex != addr->can_ifindex) {
+ ret = -EINVAL;
+ goto out_release_sock;
+ }
+
+ /* drop old references */
+ j1939_jsk_del(priv, jsk);
+ j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa);
+ } else {
+ struct can_ml_priv *can_ml;
+ struct net_device *ndev;
+
+ ndev = dev_get_by_index(net, addr->can_ifindex);
+ if (!ndev) {
+ ret = -ENODEV;
+ goto out_release_sock;
+ }
+
+ /* only devices with CAN ml_priv data can be used */
+ can_ml = can_get_ml_priv(ndev);
+ if (!can_ml) {
+ dev_put(ndev);
+ ret = -ENODEV;
+ goto out_release_sock;
+ }
+
+ if (!(ndev->flags & IFF_UP)) {
+ dev_put(ndev);
+ ret = -ENETDOWN;
+ goto out_release_sock;
+ }
+
+ /* the lookup reference on ndev is not needed past this call */
+ priv = j1939_netdev_start(ndev);
+ dev_put(ndev);
+ if (IS_ERR(priv)) {
+ ret = PTR_ERR(priv);
+ goto out_release_sock;
+ }
+
+ jsk->ifindex = addr->can_ifindex;
+
+ /* the corresponding j1939_priv_put() is called via
+ * sk->sk_destruct, which points to j1939_sk_sock_destruct()
+ */
+ j1939_priv_get(priv);
+ jsk->priv = priv;
+ }
+
+ /* a valid PGN in the bind() address becomes the receive PGN filter */
+ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn))
+ jsk->pgn_rx_filter = addr->can_addr.j1939.pgn;
+ jsk->addr.src_name = addr->can_addr.j1939.name;
+ jsk->addr.sa = addr->can_addr.j1939.addr;
+
+ /* get new references */
+ ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa);
+ if (ret) {
+ /* undo j1939_netdev_start(), then detach priv from the socket
+ * and drop the reference that was stored in jsk->priv
+ * NOTE(review): synchronize_rcu() presumably waits for RCU
+ * readers that may still see jsk->priv - confirm.
+ */
+ j1939_netdev_stop(priv);
+ jsk->priv = NULL;
+ synchronize_rcu();
+ j1939_priv_put(priv);
+ goto out_release_sock;
+ }
+
+ j1939_jsk_add(priv, jsk);
+
+ out_release_sock: /* fall through */
+ release_sock(sock->sk);
+
+ return ret;
+}
+
+/* connect() handler: record the default destination of a bound socket.
+ *
+ * The socket has to be bound already and the address has to name the same
+ * interface (-EINVAL otherwise). A destination without NAME and with
+ * J1939_NO_ADDR is only accepted on SO_BROADCAST sockets (-EACCES). A valid
+ * PGN in the address replaces jsk->addr.pgn.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int j1939_sk_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
+ int len, int flags)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct j1939_sock *jsk = j1939_sk(sock->sk);
+ int ret = 0;
+
+ ret = j1939_sk_sanity_check(addr, len);
+ if (ret)
+ return ret;
+
+ lock_sock(sock->sk);
+
+ /* bind() before connect() is mandatory */
+ if (!(jsk->state & J1939_SOCK_BOUND)) {
+ ret = -EINVAL;
+ goto out_release_sock;
+ }
+
+ /* A connect() to a different interface is not supported. */
+ if (jsk->ifindex != addr->can_ifindex) {
+ ret = -EINVAL;
+ goto out_release_sock;
+ }
+
+ if (!addr->can_addr.j1939.name &&
+ addr->can_addr.j1939.addr == J1939_NO_ADDR &&
+ !sock_flag(&jsk->sk, SOCK_BROADCAST)) {
+ /* broadcast, but SO_BROADCAST not set */
+ ret = -EACCES;
+ goto out_release_sock;
+ }
+
+ jsk->addr.dst_name = addr->can_addr.j1939.name;
+ jsk->addr.da = addr->can_addr.j1939.addr;
+
+ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn))
+ jsk->addr.pgn = addr->can_addr.j1939.pgn;
+
+ jsk->state |= J1939_SOCK_CONNECTED;
+
+ out_release_sock: /* fall through */
+ release_sock(sock->sk);
+
+ return ret;
+}
+
+/* Fill @addr from the state of @jsk: interface index and PGN always, plus
+ * either the peer (destination) or the local (source) NAME and address,
+ * selected by @peer.
+ */
+static void j1939_sk_sock2sockaddr_can(struct sockaddr_can *addr,
+				       const struct j1939_sock *jsk, int peer)
+{
+	/* There are two holes (2 bytes and 3 bytes) to clear to avoid
+	 * leaking kernel information to user space.
+	 */
+	memset(addr, 0, J1939_MIN_NAMELEN);
+
+	addr->can_family = AF_CAN;
+	addr->can_ifindex = jsk->ifindex;
+	addr->can_addr.j1939.pgn = jsk->addr.pgn;
+	addr->can_addr.j1939.name = peer ? jsk->addr.dst_name :
+					   jsk->addr.src_name;
+	addr->can_addr.j1939.addr = peer ? jsk->addr.da : jsk->addr.sa;
+}
+
+/* getsockname()/getpeername() handler.
+ *
+ * Returns J1939_MIN_NAMELEN (the length written to @uaddr) on success, or
+ * -EADDRNOTAVAIL if the peer name is requested on a socket that is not
+ * connected.
+ */
+static int j1939_sk_getname(struct socket *sock, struct sockaddr *uaddr,
+ int peer)
+{
+ struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
+ struct sock *sk = sock->sk;
+ struct j1939_sock *jsk = j1939_sk(sk);
+ int ret = 0;
+
+ lock_sock(sk);
+
+ if (peer && !(jsk->state & J1939_SOCK_CONNECTED)) {
+ ret = -EADDRNOTAVAIL;
+ goto failure;
+ }
+
+ j1939_sk_sock2sockaddr_can(addr, jsk, peer);
+ ret = J1939_MIN_NAMELEN;
+
+ failure:
+ release_sock(sk);
+
+ return ret;
+}
+
+/* release() handler, called when the socket is closed.
+ *
+ * A bound socket first waits until nothing is pending on it any more; if
+ * that wait is interrupted, its active sessions are cancelled and its
+ * session queue is dropped. Afterwards the binding is undone: socket list
+ * entry, local ECU reference and the j1939_netdev_start() reference.
+ * Finally the filters are freed and the sock is orphaned and put.
+ *
+ * Always returns 0.
+ */
+static int j1939_sk_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct j1939_sock *jsk;
+
+ if (!sk)
+ return 0;
+
+ lock_sock(sk);
+ jsk = j1939_sk(sk);
+
+ if (jsk->state & J1939_SOCK_BOUND) {
+ struct j1939_priv *priv = jsk->priv;
+
+ /* non-zero return: the wait was interrupted before the
+ * pending counter reached zero
+ */
+ if (wait_event_interruptible(jsk->waitq,
+ !j1939_sock_pending_get(&jsk->sk))) {
+ j1939_cancel_active_session(priv, sk);
+ j1939_sk_queue_drop_all(priv, jsk, ESHUTDOWN);
+ }
+
+ j1939_jsk_del(priv, jsk);
+
+ j1939_local_ecu_put(priv, jsk->addr.src_name,
+ jsk->addr.sa);
+
+ j1939_netdev_stop(priv);
+ }
+
+ kfree(jsk->filters);
+ sock_orphan(sk);
+ sock->sk = NULL;
+
+ release_sock(sk);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ sock_put(sk);
+
+ return 0;
+}
+
+/* Set or clear @flag in jsk->state from an int sized socket option value.
+ *
+ * @optval has to hold exactly sizeof(int) bytes; any non-zero value sets
+ * the flag, zero clears it. The state is updated under lock_sock().
+ *
+ * Returns 0 on success, -EINVAL on a wrong @optlen or -EFAULT if the value
+ * cannot be copied in.
+ */
+static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, sockptr_t optval,
+				    unsigned int optlen, int flag)
+{
+	int tmp;
+
+	if (optlen != sizeof(tmp))
+		return -EINVAL;
+	if (copy_from_sockptr(&tmp, optval, optlen))
+		return -EFAULT;
+	lock_sock(&jsk->sk);
+	if (tmp)
+		jsk->state |= flag;
+	else
+		jsk->state &= ~flag;
+	release_sock(&jsk->sk);
+
+	/* Do not hand the user's value back: a positive one would leave
+	 * setsockopt() as a non-zero "success", a negative one would be
+	 * taken for an errno although the flag has already been applied.
+	 */
+	return 0;
+}
+
+/* setsockopt() handler for level SOL_CAN_J1939 (-EINVAL for other levels).
+ *
+ * SO_J1939_FILTER:    replace the filter array; a NULL/empty option value
+ *                     removes all filters
+ * SO_J1939_PROMISC:   toggle J1939_SOCK_PROMISC
+ * SO_J1939_ERRQUEUE:  toggle J1939_SOCK_ERRQUEUE, purging the error queue
+ *                     when it ends up disabled
+ * SO_J1939_SEND_PRIO: set the send priority (0..7); values below 2 need
+ *                     CAP_NET_ADMIN
+ *
+ * Unknown options yield -ENOPROTOOPT.
+ */
+static int j1939_sk_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ struct sock *sk = sock->sk;
+ struct j1939_sock *jsk = j1939_sk(sk);
+ int tmp, count = 0, ret = 0;
+ struct j1939_filter *filters = NULL, *ofilters;
+
+ if (level != SOL_CAN_J1939)
+ return -EINVAL;
+
+ switch (optname) {
+ case SO_J1939_FILTER:
+ if (!sockptr_is_null(optval) && optlen != 0) {
+ struct j1939_filter *f;
+ int c;
+
+ /* only whole filters, and at most J1939_FILTER_MAX */
+ if (optlen % sizeof(*filters) != 0)
+ return -EINVAL;
+
+ if (optlen > J1939_FILTER_MAX *
+ sizeof(struct j1939_filter))
+ return -EINVAL;
+
+ count = optlen / sizeof(*filters);
+ filters = memdup_sockptr(optval, optlen);
+ if (IS_ERR(filters))
+ return PTR_ERR(filters);
+
+ /* pre-mask the values so that matching only has to
+ * mask the frame side
+ */
+ for (f = filters, c = count; c; f++, c--) {
+ f->name &= f->name_mask;
+ f->pgn &= f->pgn_mask;
+ f->addr &= f->addr_mask;
+ }
+ }
+
+ /* swap the array under filters_lock, free the old one after */
+ lock_sock(&jsk->sk);
+ spin_lock_bh(&jsk->filters_lock);
+ ofilters = jsk->filters;
+ jsk->filters = filters;
+ jsk->nfilters = count;
+ spin_unlock_bh(&jsk->filters_lock);
+ release_sock(&jsk->sk);
+ kfree(ofilters);
+ return 0;
+ case SO_J1939_PROMISC:
+ /* the helper's return value is passed through unchanged */
+ return j1939_sk_setsockopt_flag(jsk, optval, optlen,
+ J1939_SOCK_PROMISC);
+ case SO_J1939_ERRQUEUE:
+ ret = j1939_sk_setsockopt_flag(jsk, optval, optlen,
+ J1939_SOCK_ERRQUEUE);
+ if (ret < 0)
+ return ret;
+
+ if (!(jsk->state & J1939_SOCK_ERRQUEUE))
+ skb_queue_purge(&sk->sk_error_queue);
+ return ret;
+ case SO_J1939_SEND_PRIO:
+ if (optlen != sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, optval, optlen))
+ return -EFAULT;
+ if (tmp < 0 || tmp > 7)
+ return -EDOM;
+ if (tmp < 2 && !capable(CAP_NET_ADMIN))
+ return -EPERM;
+ lock_sock(&jsk->sk);
+ jsk->sk.sk_priority = j1939_to_sk_priority(tmp);
+ release_sock(&jsk->sk);
+ return 0;
+ default:
+ return -ENOPROTOOPT;
+ }
+}
+
+/* getsockopt() handler for level SOL_CAN_J1939 (-EINVAL for other levels).
+ *
+ * Supports SO_J1939_PROMISC, SO_J1939_ERRQUEUE (both reported as 0 or 1)
+ * and SO_J1939_SEND_PRIO (reported as j1939 priority); all of them are
+ * int sized. Returns 0 on success, -ENOPROTOOPT for unknown options,
+ * -EINVAL for a negative user length and -EFAULT if the user buffer is too
+ * small or cannot be accessed.
+ */
+static int j1939_sk_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = sock->sk;
+ struct j1939_sock *jsk = j1939_sk(sk);
+ int ret, ulen;
+ /* set defaults for using 'int' properties */
+ int tmp = 0;
+ int len = sizeof(tmp);
+ void *val = &tmp;
+
+ if (level != SOL_CAN_J1939)
+ return -EINVAL;
+ if (get_user(ulen, optlen))
+ return -EFAULT;
+ if (ulen < 0)
+ return -EINVAL;
+
+ lock_sock(&jsk->sk);
+ switch (optname) {
+ case SO_J1939_PROMISC:
+ tmp = (jsk->state & J1939_SOCK_PROMISC) ? 1 : 0;
+ break;
+ case SO_J1939_ERRQUEUE:
+ tmp = (jsk->state & J1939_SOCK_ERRQUEUE) ? 1 : 0;
+ break;
+ case SO_J1939_SEND_PRIO:
+ tmp = j1939_prio(jsk->sk.sk_priority);
+ break;
+ default:
+ ret = -ENOPROTOOPT;
+ goto no_copy;
+ }
+
+ /* copy to user, based on 'len' & 'val'
+ * but most sockopt's are 'int' properties, and have 'len' & 'val'
+ * left unchanged, but instead modified 'tmp'
+ */
+ if (len > ulen)
+ ret = -EFAULT;
+ else if (put_user(len, optlen))
+ ret = -EFAULT;
+ else if (copy_to_user(optval, val, len))
+ ret = -EFAULT;
+ else
+ ret = 0;
+ no_copy:
+ release_sock(&jsk->sk);
+ return ret;
+}
+
+/* recvmsg() handler.
+ *
+ * Only MSG_DONTWAIT, MSG_ERRQUEUE and MSG_CMSG_COMPAT are accepted in
+ * @flags. With MSG_ERRQUEUE the request is served from the error queue.
+ * Otherwise one datagram is dequeued and copied to @msg, truncated to
+ * @size with MSG_TRUNC set if it does not fit. Destination address,
+ * destination NAME and priority are attached as control messages, and the
+ * source is reported in msg_name if requested.
+ *
+ * Returns the number of payload bytes copied or a negative errno.
+ */
+static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct sk_buff *skb;
+ struct j1939_sk_buff_cb *skcb;
+ int ret = 0;
+
+ if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE | MSG_CMSG_COMPAT))
+ return -EINVAL;
+
+ if (flags & MSG_ERRQUEUE)
+ return sock_recv_errqueue(sock->sk, msg, size, SOL_CAN_J1939,
+ SCM_J1939_ERRQUEUE);
+
+ skb = skb_recv_datagram(sk, flags, &ret);
+ if (!skb)
+ return ret;
+
+ /* from here on 'size' is the number of bytes actually copied */
+ if (size < skb->len)
+ msg->msg_flags |= MSG_TRUNC;
+ else
+ size = skb->len;
+
+ ret = memcpy_to_msg(msg, skb->data, size);
+ if (ret < 0) {
+ skb_free_datagram(sk, skb);
+ return ret;
+ }
+
+ skcb = j1939_skb_to_cb(skb);
+ if (j1939_address_is_valid(skcb->addr.da))
+ put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_ADDR,
+ sizeof(skcb->addr.da), &skcb->addr.da);
+
+ if (skcb->addr.dst_name)
+ put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_NAME,
+ sizeof(skcb->addr.dst_name), &skcb->addr.dst_name);
+
+ put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_PRIO,
+ sizeof(skcb->priority), &skcb->priority);
+
+ if (msg->msg_name) {
+ struct sockaddr_can *paddr = msg->msg_name;
+
+ /* zero first so that padding in sockaddr_can is not leaked */
+ msg->msg_namelen = J1939_MIN_NAMELEN;
+ memset(msg->msg_name, 0, msg->msg_namelen);
+ paddr->can_family = AF_CAN;
+ paddr->can_ifindex = skb->skb_iif;
+ paddr->can_addr.j1939.name = skcb->addr.src_name;
+ paddr->can_addr.j1939.addr = skcb->addr.sa;
+ paddr->can_addr.j1939.pgn = skcb->addr.pgn;
+ }
+
+ sock_recv_cmsgs(msg, sk, skb);
+ msg->msg_flags |= skcb->msg_flags;
+ skb_free_datagram(sk, skb);
+
+ return size;
+}
+
+/* Allocate an skb for one segment of @size payload bytes taken from @msg.
+ *
+ * Room is reserved for the CAN skb private data and for the CAN frame
+ * header in front of the payload. The control buffer is initialized from
+ * the socket's address and priority; a destination and/or valid PGN given
+ * in msg_name overrides the socket defaults.
+ *
+ * Returns the skb with *errcode set to 0, or NULL with *errcode set to the
+ * negative error of the allocation or of the copy from @msg.
+ */
+static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev,
+ struct sock *sk,
+ struct msghdr *msg, size_t size,
+ int *errcode)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+ struct j1939_sk_buff_cb *skcb;
+ struct sk_buff *skb;
+ int ret;
+
+ /* payload + can_frame header (can_frame minus its data[]) +
+ * can_skb_priv
+ */
+ skb = sock_alloc_send_skb(sk,
+ size +
+ sizeof(struct can_frame) -
+ sizeof(((struct can_frame *)NULL)->data) +
+ sizeof(struct can_skb_priv),
+ msg->msg_flags & MSG_DONTWAIT, &ret);
+ if (!skb)
+ goto failure;
+
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = ndev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
+ /* leave headroom for the CAN header that is pushed back later */
+ skb_reserve(skb, offsetof(struct can_frame, data));
+
+ ret = memcpy_from_msg(skb_put(skb, size), msg, size);
+ if (ret < 0)
+ goto free_skb;
+
+ skb->dev = ndev;
+
+ skcb = j1939_skb_to_cb(skb);
+ memset(skcb, 0, sizeof(*skcb));
+ skcb->addr = jsk->addr;
+ skcb->priority = j1939_prio(READ_ONCE(sk->sk_priority));
+
+ if (msg->msg_name) {
+ struct sockaddr_can *addr = msg->msg_name;
+
+ /* an explicit destination replaces the connect() default */
+ if (addr->can_addr.j1939.name ||
+ addr->can_addr.j1939.addr != J1939_NO_ADDR) {
+ skcb->addr.dst_name = addr->can_addr.j1939.name;
+ skcb->addr.da = addr->can_addr.j1939.addr;
+ }
+ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn))
+ skcb->addr.pgn = addr->can_addr.j1939.pgn;
+ }
+
+ *errcode = ret;
+ return skb;
+
+free_skb:
+ kfree_skb(skb);
+failure:
+ *errcode = ret;
+ return NULL;
+}
+
+/* Size of the netlink attribute payload that is put into an errqueue
+ * notification of the given @type: the full set of session attributes for
+ * J1939_ERRQUEUE_RX_RTS, a single u32 for every other type.
+ */
+static size_t j1939_sk_opt_stats_get_size(enum j1939_sk_errqueue_type type)
+{
+	if (type != J1939_ERRQUEUE_RX_RTS)
+		return nla_total_size(sizeof(u32)); /* J1939_NLA_BYTES_ACKED */
+
+	return nla_total_size(sizeof(u32)) +	/* J1939_NLA_TOTAL_SIZE */
+	       nla_total_size(sizeof(u32)) +	/* J1939_NLA_PGN */
+	       nla_total_size(sizeof(u64)) +	/* J1939_NLA_SRC_NAME */
+	       nla_total_size(sizeof(u64)) +	/* J1939_NLA_DEST_NAME */
+	       nla_total_size(sizeof(u8)) +	/* J1939_NLA_SRC_ADDR */
+	       nla_total_size(sizeof(u8));	/* J1939_NLA_DEST_ADDR */
+}
+
+/* Build the skb with netlink attributes that accompanies an errqueue
+ * notification of @type for @session.
+ *
+ * J1939_ERRQUEUE_RX_RTS carries total size, PGN and both NAMEs/addresses;
+ * every other type carries J1939_NLA_BYTES_ACKED only. The skb is sized by
+ * j1939_sk_opt_stats_get_size() and allocated with GFP_ATOMIC.
+ *
+ * Returns the skb, or NULL if the allocation fails. The return values of
+ * the nla_put_*() calls are not checked.
+ */
+static struct sk_buff *
+j1939_sk_get_timestamping_opt_stats(struct j1939_session *session,
+ enum j1939_sk_errqueue_type type)
+{
+ struct sk_buff *stats;
+ u32 size;
+
+ stats = alloc_skb(j1939_sk_opt_stats_get_size(type), GFP_ATOMIC);
+ if (!stats)
+ return NULL;
+
+ /* acked bytes: everything for a simple session, otherwise 7 bytes
+ * per acked packet, capped at the message size
+ */
+ if (session->skcb.addr.type == J1939_SIMPLE)
+ size = session->total_message_size;
+ else
+ size = min(session->pkt.tx_acked * 7,
+ session->total_message_size);
+
+ switch (type) {
+ case J1939_ERRQUEUE_RX_RTS:
+ nla_put_u32(stats, J1939_NLA_TOTAL_SIZE,
+ session->total_message_size);
+ nla_put_u32(stats, J1939_NLA_PGN,
+ session->skcb.addr.pgn);
+ nla_put_u64_64bit(stats, J1939_NLA_SRC_NAME,
+ session->skcb.addr.src_name, J1939_NLA_PAD);
+ nla_put_u64_64bit(stats, J1939_NLA_DEST_NAME,
+ session->skcb.addr.dst_name, J1939_NLA_PAD);
+ nla_put_u8(stats, J1939_NLA_SRC_ADDR,
+ session->skcb.addr.sa);
+ nla_put_u8(stats, J1939_NLA_DEST_ADDR,
+ session->skcb.addr.da);
+ break;
+ default:
+ nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size);
+ }
+
+ return stats;
+}
+
+/* Queue one notification of @type about @session on the error queue of @sk.
+ *
+ * Nothing is queued unless the socket has J1939_SOCK_ERRQUEUE set. TX_ACK,
+ * TX_SCHED and the RX_* types additionally depend on the matching
+ * SOF_TIMESTAMPING_* flag in sk_tsflags; TX_ABORT has no such condition.
+ * An unknown type is logged but is not rejected.
+ */
+static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk,
+ enum j1939_sk_errqueue_type type)
+{
+ struct j1939_priv *priv = session->priv;
+ struct j1939_sock *jsk;
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb;
+ char *state = "UNK";
+ u32 tsflags;
+ int err;
+
+ jsk = j1939_sk(sk);
+
+ if (!(jsk->state & J1939_SOCK_ERRQUEUE))
+ return;
+
+ /* first pass over 'type': is this kind of report requested at all? */
+ tsflags = READ_ONCE(sk->sk_tsflags);
+ switch (type) {
+ case J1939_ERRQUEUE_TX_ACK:
+ if (!(tsflags & SOF_TIMESTAMPING_TX_ACK))
+ return;
+ break;
+ case J1939_ERRQUEUE_TX_SCHED:
+ if (!(tsflags & SOF_TIMESTAMPING_TX_SCHED))
+ return;
+ break;
+ case J1939_ERRQUEUE_TX_ABORT:
+ break;
+ case J1939_ERRQUEUE_RX_RTS:
+ fallthrough;
+ case J1939_ERRQUEUE_RX_DPO:
+ fallthrough;
+ case J1939_ERRQUEUE_RX_ABORT:
+ if (!(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE))
+ return;
+ break;
+ default:
+ netdev_err(priv->ndev, "Unknown errqueue type %i\n", type);
+ }
+
+ skb = j1939_sk_get_timestamping_opt_stats(session, type);
+ if (!skb)
+ return;
+
+ skb->tstamp = ktime_get_real();
+
+ BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb));
+
+ /* second pass over 'type': fill in the extended error for it */
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr));
+ switch (type) {
+ case J1939_ERRQUEUE_TX_ACK:
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+ serr->ee.ee_info = SCM_TSTAMP_ACK;
+ state = "TX ACK";
+ break;
+ case J1939_ERRQUEUE_TX_SCHED:
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
+ serr->ee.ee_info = SCM_TSTAMP_SCHED;
+ state = "TX SCH";
+ break;
+ case J1939_ERRQUEUE_TX_ABORT:
+ serr->ee.ee_errno = session->err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+ serr->ee.ee_info = J1939_EE_INFO_TX_ABORT;
+ state = "TX ABT";
+ break;
+ case J1939_ERRQUEUE_RX_RTS:
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+ serr->ee.ee_info = J1939_EE_INFO_RX_RTS;
+ state = "RX RTS";
+ break;
+ case J1939_ERRQUEUE_RX_DPO:
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+ serr->ee.ee_info = J1939_EE_INFO_RX_DPO;
+ state = "RX DPO";
+ break;
+ case J1939_ERRQUEUE_RX_ABORT:
+ serr->ee.ee_errno = session->err;
+ serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+ serr->ee.ee_info = J1939_EE_INFO_RX_ABORT;
+ state = "RX ABT";
+ break;
+ }
+
+ /* the skb payload carries netlink attributes; the session key is
+ * only reported if the user asked for SOF_TIMESTAMPING_OPT_ID
+ */
+ serr->opt_stats = true;
+ if (tsflags & SOF_TIMESTAMPING_OPT_ID)
+ serr->ee.ee_data = session->tskey;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n",
+ __func__, session, session->tskey, state);
+ err = sock_queue_err_skb(sk, skb);
+
+ /* queueing failed, so the skb is still ours to free */
+ if (err)
+ kfree_skb(skb);
+};
+
+/* Report an event of @type for @session on the error queue(s).
+ *
+ * A session that has a socket notifies that socket only. A session without
+ * one notifies every socket on priv->j1939_socks for which
+ * j1939_sk_recv_match_one() accepts the session's addressing.
+ */
+void j1939_sk_errqueue(struct j1939_session *session,
+ enum j1939_sk_errqueue_type type)
+{
+ struct j1939_priv *priv = session->priv;
+ struct j1939_sock *jsk;
+
+ if (session->sk) {
+ /* send TX notifications to the socket of origin */
+ __j1939_sk_errqueue(session, session->sk, type);
+ return;
+ }
+
+ /* spread RX notifications to all sockets subscribed to this session */
+ read_lock_bh(&priv->j1939_socks_lock);
+ list_for_each_entry(jsk, &priv->j1939_socks, list) {
+ if (j1939_sk_recv_match_one(jsk, &session->skcb))
+ __j1939_sk_errqueue(session, &jsk->sk, type);
+ }
+ read_unlock_bh(&priv->j1939_socks_lock);
+};
+
+/* Report @err on @sk through sk->sk_err and sk_error_report(). Sockets
+ * that have J1939_SOCK_ERRQUEUE set are left alone here.
+ */
+void j1939_sk_send_loop_abort(struct sock *sk, int err)
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+
+ if (jsk->state & J1939_SOCK_ERRQUEUE)
+ return;
+
+ sk->sk_err = err;
+
+ sk_error_report(sk);
+}
+
+/* Queue @size bytes from @msg for transmission, split into segments of at
+ * most J1939_MAX_TP_PACKET_SIZE bytes.
+ *
+ * If the last session queued on the socket is still incomplete, the data
+ * continues that session and has to complete it exactly (-EIO otherwise).
+ * If not, the first segment starts a new session through j1939_tp_send(),
+ * which is queued on the socket and activated right away if the queue was
+ * empty.
+ *
+ * Returns @size when everything was queued, the number of bytes queued so
+ * far if -EAGAIN or -ERESTARTSYS hits after some progress, or a negative
+ * errno (-ERESTARTSYS is reported as -EINTR).
+ */
+static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk,
+ struct msghdr *msg, size_t size)
+
+{
+ struct j1939_sock *jsk = j1939_sk(sk);
+ struct j1939_session *session = j1939_sk_get_incomplete_session(jsk);
+ struct sk_buff *skb;
+ size_t segment_size, todo_size;
+ int ret = 0;
+
+ /* a continuation has to supply exactly the missing bytes */
+ if (session &&
+ session->total_message_size != session->total_queued_size + size) {
+ j1939_session_put(session);
+ return -EIO;
+ }
+
+ todo_size = size;
+
+ do {
+ struct j1939_sk_buff_cb *skcb;
+
+ segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE,
+ todo_size);
+
+ /* Allocate skb for one segment */
+ skb = j1939_sk_alloc_skb(priv->ndev, sk, msg, segment_size,
+ &ret);
+ if (ret)
+ break;
+
+ skcb = j1939_skb_to_cb(skb);
+
+ if (!session) {
+ /* at this point the size should be full size
+ * of the session
+ */
+ skcb->offset = 0;
+ session = j1939_tp_send(priv, skb, size);
+ if (IS_ERR(session)) {
+ ret = PTR_ERR(session);
+ goto kfree_skb;
+ }
+ if (j1939_sk_queue_session(session)) {
+ /* try to activate session if we are
+ * first in the queue
+ */
+ if (!j1939_session_activate(session)) {
+ j1939_tp_schedule_txtimer(session, 0);
+ } else {
+ ret = -EBUSY;
+ session->err = ret;
+ j1939_sk_queue_drop_all(priv, jsk,
+ EBUSY);
+ break;
+ }
+ }
+ } else {
+ /* further segment of an existing session */
+ skcb->offset = session->total_queued_size;
+ j1939_session_skb_queue(session, skb);
+ }
+
+ todo_size -= segment_size;
+ session->total_queued_size += segment_size;
+ } while (todo_size);
+
+ switch (ret) {
+ case 0: /* OK */
+ if (todo_size)
+ netdev_warn(priv->ndev,
+ "no error found and not completely queued?! %zu\n",
+ todo_size);
+ ret = size;
+ break;
+ case -ERESTARTSYS:
+ ret = -EINTR;
+ fallthrough;
+ case -EAGAIN: /* OK */
+ /* report partial progress instead of the error */
+ if (todo_size != size)
+ ret = size - todo_size;
+ break;
+ default: /* ERROR */
+ break;
+ }
+
+ /* drop the reference held by this function */
+ if (session)
+ j1939_session_put(session);
+
+ return ret;
+
+ kfree_skb:
+ /* 'session' holds an ERR_PTR here, so there is nothing to put */
+ kfree_skb(skb);
+ return ret;
+}
+
+/* sendmsg() handler of the J1939 socket.
+ *
+ * With the socket lock held, checks that the socket is bound and has a
+ * source address, validates an optional destination address in
+ * msg->msg_name, enforces SO_BROADCAST for broadcast destinations and then
+ * hands the payload to j1939_sk_send_loop().
+ *
+ * Returns the result of j1939_sk_send_loop() or a negative errno
+ * (-EBADFD, -EINVAL or -EACCES) when a check fails.
+ */
+static int j1939_sk_sendmsg(struct socket *sock, struct msghdr *msg,
+			    size_t size)
+{
+	struct sock *sk = sock->sk;
+	struct j1939_sock *jsk = j1939_sk(sk);
+	struct j1939_priv *priv;
+	int ifindex;
+	int ret;
+
+	lock_sock(sock->sk);
+
+	/* various socket state tests */
+	ret = -EBADFD;
+	if (!(jsk->state & J1939_SOCK_BOUND))
+		goto out_release;
+
+	priv = jsk->priv;
+	ifindex = jsk->ifindex;
+
+	/* no source address assigned yet */
+	if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR)
+		goto out_release;
+
+	if (msg->msg_name) {
+		/* deal with provided destination address info */
+		struct sockaddr_can *addr = msg->msg_name;
+
+		ret = -EINVAL;
+		if (msg->msg_namelen < J1939_MIN_NAMELEN)
+			goto out_release;
+
+		if (addr->can_family != AF_CAN)
+			goto out_release;
+
+		ret = -EBADFD;
+		if (addr->can_ifindex && addr->can_ifindex != ifindex)
+			goto out_release;
+
+		ret = -EINVAL;
+		if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) &&
+		    !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn))
+			goto out_release;
+
+		/* broadcast, but SO_BROADCAST not set */
+		ret = -EACCES;
+		if (!addr->can_addr.j1939.name &&
+		    addr->can_addr.j1939.addr == J1939_NO_ADDR &&
+		    !sock_flag(sk, SOCK_BROADCAST))
+			goto out_release;
+	} else {
+		/* broadcast, but SO_BROADCAST not set */
+		ret = -EACCES;
+		if (!jsk->addr.dst_name && jsk->addr.da == J1939_NO_ADDR &&
+		    !sock_flag(sk, SOCK_BROADCAST))
+			goto out_release;
+	}
+
+	ret = j1939_sk_send_loop(priv, sk, msg, size);
+
+out_release:
+	release_sock(sock->sk);
+
+	return ret;
+}
+
+/* Flag ENETDOWN on every socket listed in priv->j1939_socks.
+ *
+ * For each socket sk_err is set, the error is reported unless the socket is
+ * marked SOCK_DEAD, and j1939_sk_queue_drop_all() is called with ENETDOWN.
+ */
+void j1939_sk_netdev_event_netdown(struct j1939_priv *priv)
+{
+	struct j1939_sock *jsk;
+
+	read_lock_bh(&priv->j1939_socks_lock);
+	list_for_each_entry(jsk, &priv->j1939_socks, list) {
+		struct sock *sk = &jsk->sk;
+
+		sk->sk_err = ENETDOWN;
+		if (!sock_flag(sk, SOCK_DEAD))
+			sk_error_report(sk);
+
+		j1939_sk_queue_drop_all(priv, jsk, ENETDOWN);
+	}
+	read_unlock_bh(&priv->j1939_socks_lock);
+}
+
+/* Detach every bound socket from "priv".
+ *
+ * The socket list is walked under priv->j1939_socks_lock, but that lock is
+ * dropped before lock_sock() is taken on a candidate socket, so the walk is
+ * restarted from the head ("rescan") after each socket has been handled.
+ * If at least one socket was detached, synchronize_rcu() is called once
+ * before returning.
+ */
+void j1939_sk_netdev_event_unregister(struct j1939_priv *priv)
+{
+ struct sock *sk;
+ struct j1939_sock *jsk;
+ bool wait_rcu = false;
+
+rescan: /* The caller is holding a ref on this "priv" via j1939_priv_get_by_ndev(). */
+ read_lock_bh(&priv->j1939_socks_lock);
+ list_for_each_entry(jsk, &priv->j1939_socks, list) {
+ /* Skip if j1939_jsk_add() is not called on this socket. */
+ if (!(jsk->state & J1939_SOCK_BOUND))
+ continue;
+ sk = &jsk->sk;
+ /* keep the socket alive while the list lock is not held */
+ sock_hold(sk);
+ read_unlock_bh(&priv->j1939_socks_lock);
+ /* Check if j1939_jsk_del() is not yet called on this socket after holding
+ * socket's lock, for both j1939_sk_bind() and j1939_sk_release() call
+ * j1939_jsk_del() with socket's lock held.
+ */
+ lock_sock(sk);
+ if (jsk->state & J1939_SOCK_BOUND) {
+ /* Neither j1939_sk_bind() nor j1939_sk_release() called j1939_jsk_del().
+ * Make this socket no longer bound, by pretending as if j1939_sk_bind()
+ * dropped old references but did not get new references.
+ */
+ j1939_jsk_del(priv, jsk);
+ j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa);
+ j1939_netdev_stop(priv);
+ /* Call j1939_priv_put() now and prevent j1939_sk_sock_destruct() from
+ * calling the corresponding j1939_priv_put().
+ *
+ * j1939_sk_sock_destruct() is supposed to call j1939_priv_put() after
+ * an RCU grace period. But since the caller is holding a ref on this
+ * "priv", we can defer synchronize_rcu() until immediately before
+ * the caller calls j1939_priv_put().
+ */
+ j1939_priv_put(priv);
+ jsk->priv = NULL;
+ wait_rcu = true;
+ }
+ release_sock(sk);
+ sock_put(sk);
+ /* the list lock was dropped above, so restart the walk */
+ goto rescan;
+ }
+ read_unlock_bh(&priv->j1939_socks_lock);
+ if (wait_rcu)
+ synchronize_rcu();
+}
+
+/* ioctl handler of the J1939 socket: implements no command of its own and
+ * always returns -ENOIOCTLCMD; "sock", "cmd" and "arg" are not used.
+ */
+static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ /* no ioctls for socket layer -> hand it down to NIC layer */
+ return -ENOIOCTLCMD;
+}
+
+/* Socket operations of the J1939 protocol. Operations without a J1939
+ * implementation are wired to the generic sock_no_*() handlers; poll uses
+ * datagram_poll().
+ */
+static const struct proto_ops j1939_ops = {
+ .family = PF_CAN,
+ .release = j1939_sk_release,
+ .bind = j1939_sk_bind,
+ .connect = j1939_sk_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = j1939_sk_getname,
+ .poll = datagram_poll,
+ .ioctl = j1939_sk_no_ioctlcmd,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = j1939_sk_setsockopt,
+ .getsockopt = j1939_sk_getsockopt,
+ .sendmsg = j1939_sk_sendmsg,
+ .recvmsg = j1939_sk_recvmsg,
+ .mmap = sock_no_mmap,
+};
+
+/* Protocol descriptor: sockets are allocated with room for a
+ * struct j1939_sock and initialised by j1939_sk_init().
+ */
+static struct proto j1939_proto __read_mostly = {
+ .name = "CAN_J1939",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct j1939_sock),
+ .init = j1939_sk_init,
+};
+
+/* CAN protocol entry for CAN_J1939: a SOCK_DGRAM protocol tying together
+ * j1939_ops and j1939_proto.
+ */
+const struct can_proto j1939_can_proto = {
+ .type = SOCK_DGRAM,
+ .protocol = CAN_J1939,
+ .ops = &j1939_ops,
+ .prot = &j1939_proto,
+};
diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c
new file mode 100644
index 000000000000..fbf5c8001c9d
--- /dev/null
+++ b/net/can/j1939/transport.c
@@ -0,0 +1,2220 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2010-2011 EIA Electronics,
+// Kurt Van Dijck <kurt.van.dijck@eia.be>
+// Copyright (c) 2018 Protonic,
+// Robin van der Gracht <robin@protonic.nl>
+// Copyright (c) 2017-2019 Pengutronix,
+// Marc Kleine-Budde <kernel@pengutronix.de>
+// Copyright (c) 2017-2019 Pengutronix,
+// Oleksij Rempel <kernel@pengutronix.de>
+
+#include <linux/can/skb.h>
+
+#include "j1939-priv.h"
+
+/* NOTE(review): presumably the maximum number of transmit retries; the
+ * user of this limit is not in the code following these defines - confirm.
+ */
+#define J1939_XTP_TX_RETRY_LIMIT 100
+
+/* PGNs put on outgoing control (CTL) and data (DAT) frames of ETP and TP
+ * sessions, see j1939_tp_tx_dat_new()
+ */
+#define J1939_ETP_PGN_CTL 0xc800
+#define J1939_ETP_PGN_DAT 0xc700
+#define J1939_TP_PGN_CTL 0xec00
+#define J1939_TP_PGN_DAT 0xeb00
+
+/* TP command codes, carried in byte 0 of a control frame */
+#define J1939_TP_CMD_RTS 0x10
+#define J1939_TP_CMD_CTS 0x11
+#define J1939_TP_CMD_EOMA 0x13
+#define J1939_TP_CMD_BAM 0x20
+#define J1939_TP_CMD_ABORT 0xff
+
+/* ETP command codes, carried in byte 0 of a control frame */
+#define J1939_ETP_CMD_RTS 0x14
+#define J1939_ETP_CMD_CTS 0x15
+#define J1939_ETP_CMD_DPO 0x16
+#define J1939_ETP_CMD_EOMA 0x17
+#define J1939_ETP_CMD_ABORT 0xff
+
+/* Connection abort reason codes, sent in byte 1 of an abort control frame
+ * (see j1939_xtp_tx_abort()).
+ *
+ * Each comment is placed below the enumerator it describes. Where an errno
+ * name is listed, it is the value j1939_xtp_abort_to_errno() returns for
+ * that reason; reasons without a listed errno are mapped to EPROTO there.
+ */
+enum j1939_xtp_abort {
+ J1939_XTP_NO_ABORT = 0,
+ J1939_XTP_ABORT_BUSY = 1,
+ /* Already in one or more connection managed sessions and
+ * cannot support another.
+ *
+ * EALREADY:
+ * Operation already in progress
+ */
+
+ J1939_XTP_ABORT_RESOURCE = 2,
+ /* System resources were needed for another task so this
+ * connection managed session was terminated.
+ *
+ * EMSGSIZE:
+ * The socket type requires that message be sent atomically,
+ * and the size of the message to be sent made this
+ * impossible.
+ */
+
+ J1939_XTP_ABORT_TIMEOUT = 3,
+ /* A timeout occurred and this is the connection abort to
+ * close the session.
+ *
+ * EHOSTUNREACH:
+ * The destination host cannot be reached (probably because
+ * the host is down or a remote router cannot reach it).
+ */
+
+ J1939_XTP_ABORT_GENERIC = 4,
+ /* CTS messages received when data transfer is in progress
+ *
+ * EBADMSG:
+ * Not a data message
+ */
+
+ J1939_XTP_ABORT_FAULT = 5,
+ /* Maximal retransmit request limit reached
+ *
+ * ENOTRECOVERABLE:
+ * State not recoverable
+ */
+
+ J1939_XTP_ABORT_UNEXPECTED_DATA = 6,
+ /* Unexpected data transfer packet
+ *
+ * ENOTCONN:
+ * Transport endpoint is not connected
+ */
+
+ J1939_XTP_ABORT_BAD_SEQ = 7,
+ /* Bad sequence number (and software is not able to recover)
+ *
+ * EILSEQ:
+ * Illegal byte sequence
+ */
+
+ J1939_XTP_ABORT_DUP_SEQ = 8,
+ /* Duplicate sequence number (and software is not able to
+ * recover)
+ */
+
+ J1939_XTP_ABORT_EDPO_UNEXPECTED = 9,
+ /* Unexpected EDPO packet (ETP) or Message size > 1785 bytes
+ * (TP)
+ */
+
+ J1939_XTP_ABORT_BAD_EDPO_PGN = 10,
+ /* Unexpected EDPO PGN (PGN in EDPO is bad) */
+
+ J1939_XTP_ABORT_EDPO_OUTOF_CTS = 11,
+ /* EDPO number of packets is greater than CTS */
+
+ J1939_XTP_ABORT_BAD_EDPO_OFFSET = 12,
+ /* Bad EDPO offset */
+
+ J1939_XTP_ABORT_OTHER_DEPRECATED = 13,
+ /* Deprecated. Use 250 instead (Any other reason) */
+
+ J1939_XTP_ABORT_ECTS_UNXPECTED_PGN = 14,
+ /* Unexpected ECTS PGN (PGN in ECTS is bad) */
+
+ J1939_XTP_ABORT_ECTS_TOO_BIG = 15,
+ /* ECTS requested packets exceeds message size */
+
+ J1939_XTP_ABORT_OTHER = 250,
+ /* Any other reason (if a Connection Abort reason is
+ * identified that is not listed in the table use code 250)
+ */
+};
+
+/* upper bound for the packet count requested by one CTS; a value of 0 is
+ * treated as 255 in j1939_session_tx_cts()
+ */
+static unsigned int j1939_tp_block = 255;
+/* delay in ms between data frames of a non-broadcast session; 0 sends the
+ * frames back to back (see j1939_session_tx_dat())
+ */
+static unsigned int j1939_tp_packet_delay;
+/* when non-zero, frames shorter than 8 bytes are filled up with 0xff
+ * (see j1939_tp_tx_dat())
+ */
+static unsigned int j1939_tp_padding = 1;
+
+/* helpers */
+/* Translate a connection abort reason into a human readable description.
+ * Codes without a dedicated text, J1939_XTP_NO_ABORT included, yield
+ * "<unknown>".
+ */
+static const char *j1939_xtp_abort_to_str(enum j1939_xtp_abort abort)
+{
+	const char *text = "<unknown>";
+
+	switch (abort) {
+	case J1939_XTP_ABORT_BUSY:
+		text = "Already in one or more connection managed sessions and cannot support another.";
+		break;
+	case J1939_XTP_ABORT_RESOURCE:
+		text = "System resources were needed for another task so this connection managed session was terminated.";
+		break;
+	case J1939_XTP_ABORT_TIMEOUT:
+		text = "A timeout occurred and this is the connection abort to close the session.";
+		break;
+	case J1939_XTP_ABORT_GENERIC:
+		text = "CTS messages received when data transfer is in progress";
+		break;
+	case J1939_XTP_ABORT_FAULT:
+		text = "Maximal retransmit request limit reached";
+		break;
+	case J1939_XTP_ABORT_UNEXPECTED_DATA:
+		text = "Unexpected data transfer packet";
+		break;
+	case J1939_XTP_ABORT_BAD_SEQ:
+		text = "Bad sequence number (and software is not able to recover)";
+		break;
+	case J1939_XTP_ABORT_DUP_SEQ:
+		text = "Duplicate sequence number (and software is not able to recover)";
+		break;
+	case J1939_XTP_ABORT_EDPO_UNEXPECTED:
+		text = "Unexpected EDPO packet (ETP) or Message size > 1785 bytes (TP)";
+		break;
+	case J1939_XTP_ABORT_BAD_EDPO_PGN:
+		text = "Unexpected EDPO PGN (PGN in EDPO is bad)";
+		break;
+	case J1939_XTP_ABORT_EDPO_OUTOF_CTS:
+		text = "EDPO number of packets is greater than CTS";
+		break;
+	case J1939_XTP_ABORT_BAD_EDPO_OFFSET:
+		text = "Bad EDPO offset";
+		break;
+	case J1939_XTP_ABORT_OTHER_DEPRECATED:
+		text = "Deprecated. Use 250 instead (Any other reason)";
+		break;
+	case J1939_XTP_ABORT_ECTS_UNXPECTED_PGN:
+		text = "Unexpected ECTS PGN (PGN in ECTS is bad)";
+		break;
+	case J1939_XTP_ABORT_ECTS_TOO_BIG:
+		text = "ECTS requested packets exceeds message size";
+		break;
+	case J1939_XTP_ABORT_OTHER:
+		text = "Any other reason (if a Connection Abort reason is identified that is not listed in the table use code 250)";
+		break;
+	default:
+		break;
+	}
+
+	return text;
+}
+
+/* Map a connection abort reason to a positive errno value.
+ *
+ * J1939_XTP_NO_ABORT is not expected here: it triggers a one-time warning
+ * and maps to 0. Reasons without a dedicated errno, as well as unknown
+ * codes (which are additionally logged on priv->ndev), map to EPROTO.
+ */
+static int j1939_xtp_abort_to_errno(struct j1939_priv *priv,
+				    enum j1939_xtp_abort abort)
+{
+	switch (abort) {
+	case J1939_XTP_NO_ABORT:
+		WARN_ON_ONCE(abort == J1939_XTP_NO_ABORT);
+		return 0;
+	case J1939_XTP_ABORT_BUSY:
+		return EALREADY;
+	case J1939_XTP_ABORT_RESOURCE:
+		return EMSGSIZE;
+	case J1939_XTP_ABORT_TIMEOUT:
+		return EHOSTUNREACH;
+	case J1939_XTP_ABORT_GENERIC:
+		return EBADMSG;
+	case J1939_XTP_ABORT_FAULT:
+		return ENOTRECOVERABLE;
+	case J1939_XTP_ABORT_UNEXPECTED_DATA:
+		return ENOTCONN;
+	case J1939_XTP_ABORT_BAD_SEQ:
+		return EILSEQ;
+	case J1939_XTP_ABORT_DUP_SEQ:
+	case J1939_XTP_ABORT_EDPO_UNEXPECTED:
+	case J1939_XTP_ABORT_BAD_EDPO_PGN:
+	case J1939_XTP_ABORT_EDPO_OUTOF_CTS:
+	case J1939_XTP_ABORT_BAD_EDPO_OFFSET:
+	case J1939_XTP_ABORT_OTHER_DEPRECATED:
+	case J1939_XTP_ABORT_ECTS_UNXPECTED_PGN:
+	case J1939_XTP_ABORT_ECTS_TOO_BIG:
+	case J1939_XTP_ABORT_OTHER:
+		return EPROTO;
+	default:
+		/* terminate the message with a newline so it is not merged
+		 * with the next log line
+		 */
+		netdev_warn(priv->ndev, "Unknown abort code %i\n", abort);
+		return EPROTO;
+	}
+}
+
+/* take priv->active_session_list_lock, bottom halves disabled */
+static inline void j1939_session_list_lock(struct j1939_priv *priv)
+{
+ spin_lock_bh(&priv->active_session_list_lock);
+}
+
+/* release priv->active_session_list_lock taken by j1939_session_list_lock() */
+static inline void j1939_session_list_unlock(struct j1939_priv *priv)
+{
+ spin_unlock_bh(&priv->active_session_list_lock);
+}
+
+/* take an additional reference on the session; paired with
+ * j1939_session_put()
+ */
+void j1939_session_get(struct j1939_session *session)
+{
+ kref_get(&session->kref);
+}
+
+/* session completion functions */
+/* For sessions marked as transmission: remove the session from the pending
+ * accounting of its socket and drop the socket reference. Sessions not
+ * marked as transmission are left untouched.
+ */
+static void __j1939_session_drop(struct j1939_session *session)
+{
+	if (session->transmission) {
+		j1939_sock_pending_del(session->sk);
+		sock_put(session->sk);
+	}
+}
+
+/* Final teardown of a session.
+ *
+ * Queues the closing error-queue notification (TX_ABORT or TX_ACK for
+ * transmissions, RX_ABORT for failed receptions), frees all skbs still
+ * queued on the session, drops the socket and priv references and frees
+ * the session itself.
+ */
+static void j1939_session_destroy(struct j1939_session *session)
+{
+	struct sk_buff *queued;
+
+	if (session->transmission)
+		j1939_sk_errqueue(session, session->err ?
+				  J1939_ERRQUEUE_TX_ABORT :
+				  J1939_ERRQUEUE_TX_ACK);
+	else if (session->err)
+		j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT);
+
+	netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+	/* the session must be off both lists by now */
+	WARN_ON_ONCE(!list_empty(&session->sk_session_queue_entry));
+	WARN_ON_ONCE(!list_empty(&session->active_session_list_entry));
+
+	for (;;) {
+		queued = skb_dequeue(&session->skb_queue);
+		if (!queued)
+			break;
+
+		/* drop ref taken in j1939_session_skb_queue() */
+		skb_unref(queued);
+		kfree_skb(queued);
+	}
+
+	__j1939_session_drop(session);
+	j1939_priv_put(session->priv);
+	kfree(session);
+}
+
+/* kref release callback: destroys the session embedding "kref" */
+static void __j1939_session_release(struct kref *kref)
+{
+	j1939_session_destroy(container_of(kref, struct j1939_session, kref));
+}
+
+/* drop one reference on the session; __j1939_session_release() is the
+ * release callback handed to kref_put()
+ */
+void j1939_session_put(struct j1939_session *session)
+{
+ kref_put(&session->kref, __j1939_session_release);
+}
+
+/* Cancel the tx timer. A non-zero result of hrtimer_cancel() leads to a
+ * j1939_session_put(); presumably this releases the reference taken when
+ * the timer was armed in j1939_tp_schedule_txtimer() - confirm.
+ */
+static void j1939_session_txtimer_cancel(struct j1939_session *session)
+{
+ if (hrtimer_cancel(&session->txtimer))
+ j1939_session_put(session);
+}
+
+/* Cancel the rx timer. A non-zero result of hrtimer_cancel() leads to a
+ * j1939_session_put(); presumably this releases the reference taken when
+ * the timer was armed in j1939_tp_set_rxtimeout() - confirm.
+ */
+static void j1939_session_rxtimer_cancel(struct j1939_session *session)
+{
+ if (hrtimer_cancel(&session->rxtimer))
+ j1939_session_put(session);
+}
+
+/* cancel the tx timer and then the rx timer of the session */
+void j1939_session_timers_cancel(struct j1939_session *session)
+{
+ j1939_session_txtimer_cancel(session);
+ j1939_session_rxtimer_cancel(session);
+}
+
+/* true if the control block addresses a broadcast: no destination name
+ * and destination address 0xff
+ */
+static inline bool j1939_cb_is_broadcast(const struct j1939_sk_buff_cb *skcb)
+{
+ return (!skcb->addr.dst_name && (skcb->addr.da == 0xff));
+}
+
+/* Release the oldest queued skb once it lies completely below the
+ * acknowledged offset (pkt.tx_acked * 7 bytes).
+ *
+ * Nothing is done while fewer than two skbs are queued. At most one skb is
+ * dropped per call.
+ */
+static void j1939_session_skb_drop_old(struct j1939_session *session)
+{
+	struct j1939_sk_buff_cb *head_cb;
+	struct sk_buff *head;
+	unsigned int acked_bytes;
+	unsigned long flags;
+	bool stale;
+
+	if (skb_queue_len(&session->skb_queue) < 2)
+		return;
+
+	acked_bytes = session->pkt.tx_acked * 7;
+
+	spin_lock_irqsave(&session->skb_queue.lock, flags);
+	head = skb_peek(&session->skb_queue);
+	head_cb = j1939_skb_to_cb(head);
+
+	stale = (head_cb->offset + head->len) < acked_bytes;
+	if (stale) {
+		__skb_unlink(head, &session->skb_queue);
+		/* drop ref taken in j1939_session_skb_queue() */
+		skb_unref(head);
+	}
+	spin_unlock_irqrestore(&session->skb_queue.lock, flags);
+
+	/* free outside of the queue lock */
+	if (stale)
+		kfree_skb(head);
+}
+
+/* Append skb to the session's skb queue.
+ *
+ * Before queueing, j1939_ac_fixup() is applied, the skb is flagged as
+ * locally sourced and, if the destination is a unicast address with users
+ * in priv->ents[], also as locally destined. An extra skb reference is
+ * taken for the queue.
+ */
+void j1939_session_skb_queue(struct j1939_session *session,
+ struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_priv *priv = session->priv;
+
+ j1939_ac_fixup(priv, skb);
+
+ if (j1939_address_is_unicast(skcb->addr.da) &&
+ priv->ents[skcb->addr.da].nusers)
+ skcb->flags |= J1939_ECU_LOCAL_DST;
+
+ skcb->flags |= J1939_ECU_LOCAL_SRC;
+
+ /* reference held by the queue, dropped again with skb_unref() */
+ skb_get(skb);
+ skb_queue_tail(&session->skb_queue, skb);
+}
+
+/* Look up the queued skb that holds the payload byte at offset_start.
+ *
+ * An skb matches when offset_start falls inside [offset, offset + len) of
+ * that skb; an empty skb at offset 0 matches offset_start 0. The whole
+ * queue is walked, so the last matching skb wins.
+ *
+ * Returns the skb with an extra reference held, or NULL if none matches.
+ */
+static struct
+sk_buff *j1939_session_skb_get_by_offset(struct j1939_session *session,
+					 unsigned int offset_start)
+{
+	struct j1939_priv *priv = session->priv;
+	struct j1939_sk_buff_cb *do_skcb;
+	struct sk_buff *skb = NULL;
+	struct sk_buff *do_skb;
+	unsigned long flags;
+
+	spin_lock_irqsave(&session->skb_queue.lock, flags);
+	skb_queue_walk(&session->skb_queue, do_skb) {
+		do_skcb = j1939_skb_to_cb(do_skb);
+
+		if ((offset_start >= do_skcb->offset &&
+		     offset_start < (do_skcb->offset + do_skb->len)) ||
+		     (offset_start == 0 && do_skcb->offset == 0 && do_skb->len == 0)) {
+			skb = do_skb;
+		}
+	}
+
+	/* take the reference while the queue lock still pins the skb */
+	if (skb)
+		skb_get(skb);
+
+	spin_unlock_irqrestore(&session->skb_queue.lock, flags);
+
+	/* both values are unsigned, so print them with %u */
+	if (!skb)
+		netdev_dbg(priv->ndev, "%s: 0x%p: no skb found for start: %u, queue size: %u\n",
+			   __func__, session, offset_start,
+			   skb_queue_len(&session->skb_queue));
+
+	return skb;
+}
+
+/* Get the queued skb covering the current data packet offset
+ * (pkt.dpo * 7 bytes); see j1939_session_skb_get_by_offset().
+ */
+static struct sk_buff *j1939_session_skb_get(struct j1939_session *session)
+{
+	return j1939_session_skb_get_by_offset(session, session->pkt.dpo * 7);
+}
+
+/* see if we are receiver, i.e. J1939_ECU_LOCAL_DST is set in the flags
+ * returns 0 for broadcasts, although we will receive them
+ */
+static inline int j1939_tp_im_receiver(const struct j1939_sk_buff_cb *skcb)
+{
+ return skcb->flags & J1939_ECU_LOCAL_DST;
+}
+
+/* see if we are sender, i.e. J1939_ECU_LOCAL_SRC is set in the flags */
+static inline int j1939_tp_im_transmitter(const struct j1939_sk_buff_cb *skcb)
+{
+ return skcb->flags & J1939_ECU_LOCAL_SRC;
+}
+
+/* see if we are involved in the given direction: as receiver when "swap"
+ * is set, as transmitter otherwise
+ */
+static int j1939_tp_im_involved(const struct j1939_sk_buff_cb *skcb, bool swap)
+{
+	return swap ? j1939_tp_im_receiver(skcb) :
+		      j1939_tp_im_transmitter(skcb);
+}
+
+/* non-zero if we are the source or the destination (or both) */
+static int j1939_tp_im_involved_anydir(struct j1939_sk_buff_cb *skcb)
+{
+ return skcb->flags & (J1939_ECU_LOCAL_SRC | J1939_ECU_LOCAL_DST);
+}
+
+/* extract pgn from flow-ctl message: bytes 5..7, least significant byte
+ * first; for PDU1 format PGNs the lowest byte is cleared
+ */
+static inline pgn_t j1939_xtp_ctl_to_pgn(const u8 *dat)
+{
+	pgn_t pgn = (dat[7] << 16) | (dat[6] << 8) | (dat[5] << 0);
+
+	return j1939_pgn_is_pdu1(pgn) ? (pgn & 0xffff00) : pgn;
+}
+
+/* extract the 16 bit message size from bytes 1..2 (low byte first) of a
+ * TP control frame
+ */
+static inline unsigned int j1939_tp_ctl_to_size(const u8 *dat)
+{
+	unsigned int lo = dat[1];
+	unsigned int hi = dat[2];
+
+	return (hi << 8) + lo;
+}
+
+/* extract the 24 bit packet number from bytes 2..4 (low byte first) of an
+ * ETP control frame
+ */
+static inline unsigned int j1939_etp_ctl_to_packet(const u8 *dat)
+{
+	unsigned int pkt = dat[2];
+
+	pkt |= dat[3] << 8;
+	pkt |= dat[4] << 16;
+
+	return pkt;
+}
+
+/* Extract the 32 bit message size from bytes 1..4 (low byte first) of an
+ * ETP control frame.
+ *
+ * Every byte is widened to unsigned int before shifting: a plain
+ * "dat[4] << 24" is evaluated as signed int and shifts into the sign bit
+ * for byte values >= 0x80.
+ */
+static inline unsigned int j1939_etp_ctl_to_size(const u8 *dat)
+{
+	return ((unsigned int)dat[4] << 24) | ((unsigned int)dat[3] << 16) |
+	       ((unsigned int)dat[2] << 8) | ((unsigned int)dat[1] << 0);
+}
+
+/* Check whether a session address matches a socket buffer address.
+ *
+ * The address types must be equal. Source and destination are each
+ * compared by name when the session has a name for that side, by address
+ * otherwise.
+ *
+ * reverse: compare the session's source with the other side's destination
+ * and vice versa. There is no problem with matching broadcasts, since
+ * broadcasts (no dst, no da) would never call this with reverse == true.
+ */
+static bool j1939_session_match(struct j1939_addr *se_addr,
+				struct j1939_addr *sk_addr, bool reverse)
+{
+	bool src_ok, dst_ok;
+
+	if (se_addr->type != sk_addr->type)
+		return false;
+
+	if (reverse) {
+		src_ok = se_addr->src_name ?
+			 se_addr->src_name == sk_addr->dst_name :
+			 se_addr->sa == sk_addr->da;
+		dst_ok = se_addr->dst_name ?
+			 se_addr->dst_name == sk_addr->src_name :
+			 se_addr->da == sk_addr->sa;
+	} else {
+		src_ok = se_addr->src_name ?
+			 se_addr->src_name == sk_addr->src_name :
+			 se_addr->sa == sk_addr->sa;
+		dst_ok = se_addr->dst_name ?
+			 se_addr->dst_name == sk_addr->dst_name :
+			 se_addr->da == sk_addr->da;
+	}
+
+	return src_ok && dst_ok;
+}
+
+/* Find the first session on list "root" whose address matches "addr" (see
+ * j1939_session_match()) and whose transmission flag equals "transmitter".
+ *
+ * Must be called with priv->active_session_list_lock held. Returns the
+ * session with an extra reference held, or NULL if nothing matches.
+ */
+static struct
+j1939_session *j1939_session_get_by_addr_locked(struct j1939_priv *priv,
+ struct list_head *root,
+ struct j1939_addr *addr,
+ bool reverse, bool transmitter)
+{
+ struct j1939_session *session;
+
+ lockdep_assert_held(&priv->active_session_list_lock);
+
+ list_for_each_entry(session, root, active_session_list_entry) {
+ /* the reference is kept on a match and dropped otherwise */
+ j1939_session_get(session);
+ if (j1939_session_match(&session->skcb.addr, addr, reverse) &&
+ session->transmission == transmitter)
+ return session;
+ j1939_session_put(session);
+ }
+
+ return NULL;
+}
+
+/* Find the active J1939_SIMPLE session that belongs to skb: same tskey
+ * as in the skb's control block and same socket as skb->sk.
+ *
+ * Must be called with priv->active_session_list_lock held. Returns the
+ * session with an extra reference held, or NULL if nothing matches.
+ */
+static struct
+j1939_session *j1939_session_get_simple(struct j1939_priv *priv,
+ struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+
+ lockdep_assert_held(&priv->active_session_list_lock);
+
+ list_for_each_entry(session, &priv->active_session_list,
+ active_session_list_entry) {
+ /* the reference is kept on a match and dropped otherwise */
+ j1939_session_get(session);
+ if (session->skcb.addr.type == J1939_SIMPLE &&
+ session->tskey == skcb->tskey && session->sk == skb->sk)
+ return session;
+ j1939_session_put(session);
+ }
+
+ return NULL;
+}
+
+/* Locked wrapper around j1939_session_get_by_addr_locked() searching
+ * priv->active_session_list; takes and releases the session list lock.
+ * Returns the session with an extra reference held, or NULL.
+ */
+static struct
+j1939_session *j1939_session_get_by_addr(struct j1939_priv *priv,
+ struct j1939_addr *addr,
+ bool reverse, bool transmitter)
+{
+ struct j1939_session *session;
+
+ j1939_session_list_lock(priv);
+ session = j1939_session_get_by_addr_locked(priv,
+ &priv->active_session_list,
+ addr, reverse, transmitter);
+ j1939_session_list_unlock(priv);
+
+ return session;
+}
+
+/* Turn a control block around: exchange source and destination name and
+ * address, and exchange the LOCAL_SRC / LOCAL_DST flags. All other flags
+ * are left untouched.
+ */
+static void j1939_skbcb_swap(struct j1939_sk_buff_cb *skcb)
+{
+	u8 mirrored = 0;
+
+	swap(skcb->addr.dst_name, skcb->addr.src_name);
+	swap(skcb->addr.da, skcb->addr.sa);
+
+	/* a former destination becomes the source and vice versa */
+	if (skcb->flags & J1939_ECU_LOCAL_DST)
+		mirrored |= J1939_ECU_LOCAL_SRC;
+	if (skcb->flags & J1939_ECU_LOCAL_SRC)
+		mirrored |= J1939_ECU_LOCAL_DST;
+
+	skcb->flags &= ~(J1939_ECU_LOCAL_SRC | J1939_ECU_LOCAL_DST);
+	skcb->flags |= mirrored;
+}
+
+/* Allocate and prepare an skb for one outgoing TP/ETP CAN frame.
+ *
+ * The control block is copied from re_skcb, optionally with source and
+ * destination swapped, and its PGN is replaced by the control or data PGN
+ * of the session type (ETP or TP). No payload is added here.
+ *
+ * Returns the skb or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+static struct
+sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv,
+ const struct j1939_sk_buff_cb *re_skcb,
+ bool ctl,
+ bool swap_src_dst)
+{
+ struct sk_buff *skb;
+ struct j1939_sk_buff_cb *skcb;
+
+ skb = alloc_skb(sizeof(struct can_frame) + sizeof(struct can_skb_priv),
+ GFP_ATOMIC);
+ if (unlikely(!skb))
+ return ERR_PTR(-ENOMEM);
+
+ skb->dev = priv->ndev;
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = priv->ndev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
+ /* reserve CAN header */
+ skb_reserve(skb, offsetof(struct can_frame, data));
+
+ /* skb->cb must be large enough to hold a j1939_sk_buff_cb structure */
+ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(*re_skcb));
+
+ memcpy(skb->cb, re_skcb, sizeof(*re_skcb));
+ skcb = j1939_skb_to_cb(skb);
+ if (swap_src_dst)
+ j1939_skbcb_swap(skcb);
+
+ /* pick the PGN by frame kind (control/data) and session type */
+ if (ctl) {
+ if (skcb->addr.type == J1939_ETP)
+ skcb->addr.pgn = J1939_ETP_PGN_CTL;
+ else
+ skcb->addr.pgn = J1939_TP_PGN_CTL;
+ } else {
+ if (skcb->addr.type == J1939_ETP)
+ skcb->addr.pgn = J1939_ETP_PGN_DAT;
+ else
+ skcb->addr.pgn = J1939_TP_PGN_DAT;
+ }
+
+ return skb;
+}
+
+/* TP transmit packet functions */
+
+/* Send one data frame carrying "len" bytes from "dat" for the session.
+ * If j1939_tp_padding is set and len is below 8, the frame is filled up to
+ * 8 bytes with 0xff. Returns the result of j1939_send_one() or the error
+ * from allocating the skb.
+ */
+static int j1939_tp_tx_dat(struct j1939_session *session,
+ const u8 *dat, int len)
+{
+ struct j1939_priv *priv = session->priv;
+ struct sk_buff *skb;
+
+ skb = j1939_tp_tx_dat_new(priv, &session->skcb,
+ false, false);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ skb_put_data(skb, dat, len);
+ if (j1939_tp_padding && len < 8)
+ memset(skb_put(skb, 8 - len), 0xff, 8 - len);
+
+ return j1939_send_one(priv, skb);
+}
+
+/* Build and send an 8 byte control frame: bytes 0..4 are copied from
+ * "dat", bytes 5..7 carry "pgn" with the least significant byte first.
+ *
+ * Nothing is sent, and 0 is returned, when we are not involved in the
+ * direction selected by swap_src_dst (see j1939_tp_im_involved()).
+ * Otherwise returns the result of j1939_send_one() or the error from
+ * allocating the skb.
+ */
+static int j1939_xtp_do_tx_ctl(struct j1939_priv *priv,
+ const struct j1939_sk_buff_cb *re_skcb,
+ bool swap_src_dst, pgn_t pgn, const u8 *dat)
+{
+ struct sk_buff *skb;
+ u8 *skdat;
+
+ if (!j1939_tp_im_involved(re_skcb, swap_src_dst))
+ return 0;
+
+ skb = j1939_tp_tx_dat_new(priv, re_skcb, true, swap_src_dst);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ skdat = skb_put(skb, 8);
+ memcpy(skdat, dat, 5);
+ skdat[5] = (pgn >> 0);
+ skdat[6] = (pgn >> 8);
+ skdat[7] = (pgn >> 16);
+
+ return j1939_send_one(priv, skb);
+}
+
+/* send a control frame for the session, using the session's own control
+ * block and PGN; see j1939_xtp_do_tx_ctl()
+ */
+static inline int j1939_tp_tx_ctl(struct j1939_session *session,
+ bool swap_src_dst, const u8 *dat)
+{
+ struct j1939_priv *priv = session->priv;
+
+ return j1939_xtp_do_tx_ctl(priv, &session->skcb,
+ swap_src_dst,
+ session->skcb.addr.pgn, dat);
+}
+
+/* Send a connection abort control frame with reason "err" for "pgn".
+ * Byte 0 holds the abort command, byte 1 the reason, bytes 2..4 are 0xff.
+ * Returns 0 without sending when we are not involved in the direction
+ * selected by swap_src_dst.
+ */
+static int j1939_xtp_tx_abort(struct j1939_priv *priv,
+ const struct j1939_sk_buff_cb *re_skcb,
+ bool swap_src_dst,
+ enum j1939_xtp_abort err,
+ pgn_t pgn)
+{
+ u8 dat[5];
+
+ if (!j1939_tp_im_involved(re_skcb, swap_src_dst))
+ return 0;
+
+ memset(dat, 0xff, sizeof(dat));
+ dat[0] = J1939_TP_CMD_ABORT;
+ dat[1] = err;
+ return j1939_xtp_do_tx_ctl(priv, re_skcb, swap_src_dst, pgn, dat);
+}
+
+/* Arm the session's tx timer to fire "msec" milliseconds from now. A
+ * session reference is taken before the timer is started.
+ */
+void j1939_tp_schedule_txtimer(struct j1939_session *session, int msec)
+{
+ j1939_session_get(session);
+ hrtimer_start(&session->txtimer, ms_to_ktime(msec),
+ HRTIMER_MODE_REL_SOFT);
+}
+
+/* (Re)start the session's rx timeout to fire "msec" milliseconds from now.
+ * The rx timer is cancelled first, then a session reference is taken
+ * before the timer is started again.
+ */
+static inline void j1939_tp_set_rxtimeout(struct j1939_session *session,
+ int msec)
+{
+ j1939_session_rxtimer_cancel(session);
+ j1939_session_get(session);
+ hrtimer_start(&session->rxtimer, ms_to_ktime(msec),
+ HRTIMER_MODE_REL_SOFT);
+}
+
+/* Send the frame that opens a transfer: ETP RTS for ETP sessions, BAM for
+ * broadcast TP sessions, TP RTS otherwise.
+ *
+ * Nothing is sent if the same command was the last one transmitted.
+ * After a BAM the tx timer is armed with 50 ms and the rx timeout set to
+ * 250 ms; after an RTS the rx timeout is set to 1250 ms.
+ *
+ * Returns 0 on success or the negative error of j1939_tp_tx_ctl().
+ */
+static int j1939_session_tx_rts(struct j1939_session *session)
+{
+ u8 dat[8];
+ int ret;
+
+ memset(dat, 0xff, sizeof(dat));
+
+ /* TP layout: 16 bit size in bytes 1..2, packet count in byte 3 */
+ dat[1] = (session->total_message_size >> 0);
+ dat[2] = (session->total_message_size >> 8);
+ dat[3] = session->pkt.total;
+
+ if (session->skcb.addr.type == J1939_ETP) {
+ /* ETP layout: 32 bit size in bytes 1..4 */
+ dat[0] = J1939_ETP_CMD_RTS;
+ dat[1] = (session->total_message_size >> 0);
+ dat[2] = (session->total_message_size >> 8);
+ dat[3] = (session->total_message_size >> 16);
+ dat[4] = (session->total_message_size >> 24);
+ } else if (j1939_cb_is_broadcast(&session->skcb)) {
+ dat[0] = J1939_TP_CMD_BAM;
+ /* fake cts for broadcast */
+ session->pkt.tx = 0;
+ } else {
+ dat[0] = J1939_TP_CMD_RTS;
+ dat[4] = dat[3];
+ }
+
+ if (dat[0] == session->last_txcmd)
+ /* done already */
+ return 0;
+
+ ret = j1939_tp_tx_ctl(session, false, dat);
+ if (ret < 0)
+ return ret;
+
+ session->last_txcmd = dat[0];
+ if (dat[0] == J1939_TP_CMD_BAM) {
+ j1939_tp_schedule_txtimer(session, 50);
+ j1939_tp_set_rxtimeout(session, 250);
+ } else {
+ j1939_tp_set_rxtimeout(session, 1250);
+ }
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ return 0;
+}
+
+/* Send an ETP data packet offset (DPO) frame.
+ *
+ * pkt.dpo is set to pkt.tx_acked. Byte 1 of the frame holds
+ * pkt.last - pkt.tx_acked, bytes 2..4 the new offset (low byte first).
+ * On success the rx timeout is set to 1250 ms and pkt.tx is reset to
+ * pkt.tx_acked.
+ *
+ * Returns 0 on success or the negative error of j1939_tp_tx_ctl().
+ */
+static int j1939_session_tx_dpo(struct j1939_session *session)
+{
+ unsigned int pkt;
+ u8 dat[8];
+ int ret;
+
+ memset(dat, 0xff, sizeof(dat));
+
+ dat[0] = J1939_ETP_CMD_DPO;
+ session->pkt.dpo = session->pkt.tx_acked;
+ pkt = session->pkt.dpo;
+ dat[1] = session->pkt.last - session->pkt.tx_acked;
+ dat[2] = (pkt >> 0);
+ dat[3] = (pkt >> 8);
+ dat[4] = (pkt >> 16);
+
+ ret = j1939_tp_tx_ctl(session, false, dat);
+ if (ret < 0)
+ return ret;
+
+ session->last_txcmd = dat[0];
+ j1939_tp_set_rxtimeout(session, 1250);
+ session->pkt.tx = session->pkt.tx_acked;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ return 0;
+}
+
+/* Send data frames starting at packet pkt.tx.
+ *
+ * Each frame carries a sequence byte (packet number relative to pkt.dpo,
+ * starting at 1) followed by up to 7 payload bytes taken from the queued
+ * skb that covers byte offset pkt.tx * 7. Sending goes on until pkt.last
+ * (pkt.total for TP broadcasts) is reached, an error occurs, or a packet
+ * delay applies, in which case the tx timer is armed and the loop is left.
+ * If at least one frame was sent the rx timeout is set to 250 ms.
+ *
+ * Returns 0 on success, -ENOBUFS if no queued skb or no data is available
+ * for the current packet, -EOVERFLOW if the request lies outside the
+ * queued buffer, or the error of j1939_tp_tx_dat().
+ */
+static int j1939_session_tx_dat(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ struct j1939_sk_buff_cb *se_skcb;
+ int offset, pkt_done, pkt_end;
+ unsigned int len, pdelay;
+ struct sk_buff *se_skb;
+ const u8 *tpdat;
+ int ret = 0;
+ u8 dat[8];
+
+ se_skb = j1939_session_skb_get_by_offset(session, session->pkt.tx * 7);
+ if (!se_skb)
+ return -ENOBUFS;
+
+ se_skcb = j1939_skb_to_cb(se_skb);
+ tpdat = se_skb->data;
+ ret = 0;
+ pkt_done = 0;
+ if (session->skcb.addr.type != J1939_ETP &&
+ j1939_cb_is_broadcast(&session->skcb))
+ pkt_end = session->pkt.total;
+ else
+ pkt_end = session->pkt.last;
+
+ while (session->pkt.tx < pkt_end) {
+ dat[0] = session->pkt.tx - session->pkt.dpo + 1;
+ /* position of this packet inside the queued skb */
+ offset = (session->pkt.tx * 7) - se_skcb->offset;
+ len = se_skb->len - offset;
+ if (len > 7)
+ len = 7;
+
+ if (offset + len > se_skb->len) {
+ netdev_err_once(priv->ndev,
+ "%s: 0x%p: requested data outside of queued buffer: offset %i, len %i, pkt.tx: %i\n",
+ __func__, session, se_skcb->offset,
+ se_skb->len , session->pkt.tx);
+ ret = -EOVERFLOW;
+ goto out_free;
+ }
+
+ if (!len) {
+ ret = -ENOBUFS;
+ break;
+ }
+
+ memcpy(&dat[1], &tpdat[offset], len);
+ ret = j1939_tp_tx_dat(session, dat, len + 1);
+ if (ret < 0) {
+ /* ENOBUFS == CAN interface TX queue is full */
+ if (ret != -ENOBUFS)
+ netdev_alert(priv->ndev,
+ "%s: 0x%p: queue data error: %i\n",
+ __func__, session, ret);
+ break;
+ }
+
+ /* 0xff marks "did some data" for the txnext state machine */
+ session->last_txcmd = 0xff;
+ pkt_done++;
+ session->pkt.tx++;
+ pdelay = j1939_cb_is_broadcast(&session->skcb) ? 50 :
+ j1939_tp_packet_delay;
+
+ /* with a packet delay, the next frame is sent from the tx timer */
+ if (session->pkt.tx < session->pkt.total && pdelay) {
+ j1939_tp_schedule_txtimer(session, pdelay);
+ break;
+ }
+ }
+
+ if (pkt_done)
+ j1939_tp_set_rxtimeout(session, 250);
+
+ /* drop the reference taken by j1939_session_skb_get_by_offset() */
+ out_free:
+ if (ret)
+ kfree_skb(se_skb);
+ else
+ consume_skb(se_skb);
+
+ return ret;
+}
+
+/* Perform the next transmit step of a session we are the sender of,
+ * selected by session->last_cmd:
+ *   0                - send RTS/BAM,
+ *   ETP CTS          - send a DPO first unless a DPO was the last command
+ *                      sent, then data,
+ *   TP CTS, 0xff, ETP DPO, BAM - send data.
+ *
+ * Returns -EINVAL if we are not the transmitter, otherwise the result of
+ * the step (0 for an unexpected last_cmd, which is only logged).
+ */
+static int j1939_xtp_txnext_transmiter(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ int ret = 0;
+
+ if (!j1939_tp_im_transmitter(&session->skcb)) {
+ netdev_alert(priv->ndev, "%s: 0x%p: called by not transmitter!\n",
+ __func__, session);
+ return -EINVAL;
+ }
+
+ switch (session->last_cmd) {
+ case 0:
+ ret = j1939_session_tx_rts(session);
+ break;
+
+ case J1939_ETP_CMD_CTS:
+ if (session->last_txcmd != J1939_ETP_CMD_DPO) {
+ ret = j1939_session_tx_dpo(session);
+ if (ret)
+ return ret;
+ }
+
+ fallthrough;
+ case J1939_TP_CMD_CTS:
+ case 0xff: /* did some data */
+ case J1939_ETP_CMD_DPO:
+ case J1939_TP_CMD_BAM:
+ ret = j1939_session_tx_dat(session);
+
+ break;
+ default:
+ netdev_alert(priv->ndev, "%s: 0x%p: unexpected last_cmd: %x\n",
+ __func__, session, session->last_cmd);
+ }
+
+ return ret;
+}
+
+/* Send a clear-to-send (CTS) frame for the next block of packets.
+ *
+ * The requested packet count is the smallest of: packets still missing
+ * (pkt.total - pkt.rx), pkt.block, and j1939_tp_block (255 if that is 0).
+ * The next expected packet number is pkt.rx + 1. Nothing is sent if a CTS
+ * was the last command transmitted. On success the rx timeout is set to
+ * 1250 ms.
+ *
+ * Returns 0 on success, -ENOENT if j1939_sk_recv_match() finds no match
+ * for the session, or the negative error of j1939_tp_tx_ctl().
+ */
+static int j1939_session_tx_cts(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ unsigned int pkt, len;
+ int ret;
+ u8 dat[8];
+
+ if (!j1939_sk_recv_match(priv, &session->skcb))
+ return -ENOENT;
+
+ len = session->pkt.total - session->pkt.rx;
+ len = min3(len, session->pkt.block, j1939_tp_block ?: 255);
+ memset(dat, 0xff, sizeof(dat));
+
+ if (session->skcb.addr.type == J1939_ETP) {
+ pkt = session->pkt.rx + 1;
+ dat[0] = J1939_ETP_CMD_CTS;
+ dat[1] = len;
+ dat[2] = (pkt >> 0);
+ dat[3] = (pkt >> 8);
+ dat[4] = (pkt >> 16);
+ } else {
+ dat[0] = J1939_TP_CMD_CTS;
+ dat[1] = len;
+ dat[2] = session->pkt.rx + 1;
+ }
+
+ if (dat[0] == session->last_txcmd)
+ /* done already */
+ return 0;
+
+ ret = j1939_tp_tx_ctl(session, true, dat);
+ if (ret < 0)
+ return ret;
+
+ if (len)
+ /* only mark cts done when len is set */
+ session->last_txcmd = dat[0];
+ j1939_tp_set_rxtimeout(session, 1250);
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ return 0;
+}
+
+/* Send the end-of-message acknowledge (EOMA) frame.
+ *
+ * ETP frames carry the 32 bit message size in bytes 1..4; TP frames carry
+ * the 16 bit size in bytes 1..2 and the packet count in byte 3. Nothing is
+ * sent if an EOMA was the last command transmitted. On success the rx
+ * timeout is set to 1250 ms.
+ *
+ * Returns 0 on success, -ENOENT if j1939_sk_recv_match() finds no match
+ * for the session, or the negative error of j1939_tp_tx_ctl().
+ */
+static int j1939_session_tx_eoma(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ u8 dat[8];
+ int ret;
+
+ if (!j1939_sk_recv_match(priv, &session->skcb))
+ return -ENOENT;
+
+ memset(dat, 0xff, sizeof(dat));
+
+ if (session->skcb.addr.type == J1939_ETP) {
+ dat[0] = J1939_ETP_CMD_EOMA;
+ dat[1] = session->total_message_size >> 0;
+ dat[2] = session->total_message_size >> 8;
+ dat[3] = session->total_message_size >> 16;
+ dat[4] = session->total_message_size >> 24;
+ } else {
+ dat[0] = J1939_TP_CMD_EOMA;
+ dat[1] = session->total_message_size;
+ dat[2] = session->total_message_size >> 8;
+ dat[3] = session->pkt.total;
+ }
+
+ if (dat[0] == session->last_txcmd)
+ /* done already */
+ return 0;
+
+ ret = j1939_tp_tx_ctl(session, true, dat);
+ if (ret < 0)
+ return ret;
+
+ session->last_txcmd = dat[0];
+
+ /* wait for the EOMA packet to come in */
+ j1939_tp_set_rxtimeout(session, 1250);
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ return 0;
+}
+
+/* Perform the next transmit step of a session we are the receiver of,
+ * selected by session->last_cmd:
+ *   RTS (TP or ETP)  - answer with a CTS,
+ *   CTS, 0xff, DPO   - nothing for TP broadcasts; otherwise send EOMA once
+ *                      all packets arrived (pkt.rx >= pkt.total), or the
+ *                      next CTS once the current block is complete
+ *                      (pkt.rx >= pkt.last).
+ *
+ * Returns -EINVAL if we are not the receiver, otherwise the result of the
+ * step (0 if nothing had to be sent or last_cmd was unexpected).
+ */
+static int j1939_xtp_txnext_receiver(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ int ret = 0;
+
+ if (!j1939_tp_im_receiver(&session->skcb)) {
+ netdev_alert(priv->ndev, "%s: 0x%p: called by not receiver!\n",
+ __func__, session);
+ return -EINVAL;
+ }
+
+ switch (session->last_cmd) {
+ case J1939_TP_CMD_RTS:
+ case J1939_ETP_CMD_RTS:
+ ret = j1939_session_tx_cts(session);
+ break;
+
+ case J1939_ETP_CMD_CTS:
+ case J1939_TP_CMD_CTS:
+ case 0xff: /* did some data */
+ case J1939_ETP_CMD_DPO:
+ if ((session->skcb.addr.type == J1939_TP &&
+ j1939_cb_is_broadcast(&session->skcb)))
+ break;
+
+ if (session->pkt.rx >= session->pkt.total) {
+ ret = j1939_session_tx_eoma(session);
+ } else if (session->pkt.rx >= session->pkt.last) {
+ /* clear last_txcmd so the "done already" check in
+ * j1939_session_tx_cts() does not suppress this CTS
+ */
+ session->last_txcmd = 0;
+ ret = j1939_session_tx_cts(session);
+ }
+ break;
+ default:
+ netdev_alert(priv->ndev, "%s: 0x%p: unexpected last_cmd: %x\n",
+ __func__, session, session->last_cmd);
+ }
+
+ return ret;
+}
+
+/* Transmit a J1939_SIMPLE session: send a clone of the session skb with
+ * j1939_send_one().
+ *
+ * Returns 0 on success or when j1939_session_skb_get() yields no skb,
+ * -ENOMEM when cloning fails, or the non-zero result of j1939_send_one().
+ */
+static int j1939_simple_txnext(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ struct sk_buff *se_skb = j1939_session_skb_get(session);
+ struct sk_buff *skb;
+ int ret;
+
+ if (!se_skb)
+ return 0;
+
+ skb = skb_clone(se_skb, GFP_ATOMIC);
+ if (!skb) {
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
+ can_skb_set_owner(skb, se_skb->sk);
+
+ /* the rx timeout is armed before the frame is handed over */
+ j1939_tp_set_rxtimeout(session, J1939_SIMPLE_ECHO_TIMEOUT_MS);
+
+ ret = j1939_send_one(priv, skb);
+ if (ret)
+ goto out_free;
+
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_TX_SCHED);
+ j1939_sk_queue_activate_next(session);
+
+ out_free:
+ /* release se_skb: kfree_skb() on error, consume_skb() on success */
+ if (ret)
+ kfree_skb(se_skb);
+ else
+ consume_skb(se_skb);
+
+ return ret;
+}
+
+/* Take @session off the active session list if its state lies in
+ * [J1939_SESSION_ACTIVE, J1939_SESSION_ACTIVE_MAX), mark it
+ * J1939_SESSION_DONE and drop one session reference.
+ *
+ * The caller must hold priv->active_session_list_lock (checked by lockdep).
+ * Returns true if the session was active and has been removed.
+ */
+static bool j1939_session_deactivate_locked(struct j1939_session *session)
+{
+ bool active = false;
+
+ lockdep_assert_held(&session->priv->active_session_list_lock);
+
+ if (session->state >= J1939_SESSION_ACTIVE &&
+ session->state < J1939_SESSION_ACTIVE_MAX) {
+ active = true;
+
+ list_del_init(&session->active_session_list_entry);
+ session->state = J1939_SESSION_DONE;
+ j1939_session_put(session);
+ }
+
+ return active;
+}
+
+/* Locking wrapper: run j1939_session_deactivate_locked() with the session
+ * list lock held. Returns true if the session was active.
+ */
+static bool j1939_session_deactivate(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ bool active;
+
+ j1939_session_list_lock(priv);
+ active = j1939_session_deactivate_locked(session);
+ j1939_session_list_unlock(priv);
+
+ return active;
+}
+
+/* Deactivate @session and, only if it really was active, call
+ * j1939_sk_queue_activate_next() for it.
+ */
+static void
+j1939_session_deactivate_activate_next(struct j1939_session *session)
+{
+ if (j1939_session_deactivate(session))
+ j1939_sk_queue_activate_next(session);
+}
+
+/* Put @session into J1939_SESSION_WAITING_ABORT with the errno derived from
+ * @err, send an abort frame unless the session is a broadcast, and report
+ * the error to the owning socket if there is one.
+ *
+ * @err must be non-zero (WARN_ON_ONCE otherwise). The caller must hold
+ * priv->active_session_list_lock (checked by lockdep).
+ */
+static void __j1939_session_cancel(struct j1939_session *session,
+ enum j1939_xtp_abort err)
+{
+ struct j1939_priv *priv = session->priv;
+
+ WARN_ON_ONCE(!err);
+ lockdep_assert_held(&session->priv->active_session_list_lock);
+
+ session->err = j1939_xtp_abort_to_errno(priv, err);
+ session->state = J1939_SESSION_WAITING_ABORT;
+ /* do not send aborts on incoming broadcasts */
+ if (!j1939_cb_is_broadcast(&session->skcb)) {
+ j1939_xtp_tx_abort(priv, &session->skcb,
+ !session->transmission,
+ err, session->skcb.addr.pgn);
+ }
+
+ if (session->sk)
+ j1939_sk_send_loop_abort(session->sk, session->err);
+}
+
+/* Cancel @session with abort reason @err.
+ *
+ * Under the session list lock, a session whose state lies in
+ * [J1939_SESSION_ACTIVE, J1939_SESSION_WAITING_ABORT) gets its rx timeout
+ * set to J1939_XTP_ABORT_TIMEOUT_MS and is cancelled via
+ * __j1939_session_cancel(). Sessions in any other state are left untouched.
+ */
+static void j1939_session_cancel(struct j1939_session *session,
+ enum j1939_xtp_abort err)
+{
+ j1939_session_list_lock(session->priv);
+
+ if (session->state >= J1939_SESSION_ACTIVE &&
+ session->state < J1939_SESSION_WAITING_ABORT) {
+ j1939_tp_set_rxtimeout(session, J1939_XTP_ABORT_TIMEOUT_MS);
+ __j1939_session_cancel(session, err);
+ }
+
+ j1939_session_list_unlock(session->priv);
+
+ /* sessions without a socket get an RX_ABORT error queue event; this
+ * runs whether or not the state check above matched
+ */
+ if (!session->sk)
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT);
+}
+
+/* hrtimer callback of session->txtimer: run the next transmit step for the
+ * session and react to its result.
+ *
+ * J1939_SIMPLE sessions use j1939_simple_txnext(); other sessions use the
+ * transmitter or receiver variant depending on session->transmission.
+ * One session reference is dropped before returning. Always returns
+ * HRTIMER_NORESTART; retries are re-armed explicitly.
+ */
+static enum hrtimer_restart j1939_tp_txtimer(struct hrtimer *hrtimer)
+{
+ struct j1939_session *session =
+ container_of(hrtimer, struct j1939_session, txtimer);
+ struct j1939_priv *priv = session->priv;
+ int ret = 0;
+
+ if (session->skcb.addr.type == J1939_SIMPLE) {
+ ret = j1939_simple_txnext(session);
+ } else {
+ if (session->transmission)
+ ret = j1939_xtp_txnext_transmiter(session);
+ else
+ ret = j1939_xtp_txnext_receiver(session);
+ }
+
+ switch (ret) {
+ case -ENOBUFS:
+ /* Retry limit is currently arbitrarily chosen */
+ if (session->tx_retry < J1939_XTP_TX_RETRY_LIMIT) {
+ session->tx_retry++;
+ /* retry after a randomized delay of 10..25 */
+ j1939_tp_schedule_txtimer(session,
+ 10 + get_random_u32_below(16));
+ } else {
+ netdev_alert(priv->ndev, "%s: 0x%p: tx retry count reached\n",
+ __func__, session);
+ session->err = -ENETUNREACH;
+ j1939_session_rxtimer_cancel(session);
+ j1939_session_deactivate_activate_next(session);
+ }
+ break;
+ case -ENETDOWN:
+ /* In this case we should get a netdev_event(), all active
+ * sessions will be cleared by j1939_cancel_active_session().
+ * So handle this as an error, but let
+ * j1939_cancel_active_session() do the cleanup including
+ * propagation of the error to user space.
+ */
+ break;
+ case -EOVERFLOW:
+ j1939_session_cancel(session, J1939_XTP_ABORT_ECTS_TOO_BIG);
+ break;
+ case 0:
+ /* success resets the retry counter */
+ session->tx_retry = 0;
+ break;
+ default:
+ netdev_alert(priv->ndev, "%s: 0x%p: tx aborted with unknown reason: %i\n",
+ __func__, session, ret);
+ /* (E)TP sessions are cancelled with an abort; simple
+ * sessions are just deactivated with the error recorded
+ */
+ if (session->skcb.addr.type != J1939_SIMPLE) {
+ j1939_session_cancel(session, J1939_XTP_ABORT_OTHER);
+ } else {
+ session->err = ret;
+ j1939_session_rxtimer_cancel(session);
+ j1939_session_deactivate_activate_next(session);
+ }
+ }
+
+ j1939_session_put(session);
+
+ return HRTIMER_NORESTART;
+}
+
+/* Finish a session: for a receiving session hand the session skb to
+ * j1939_sk_recv(), then deactivate the session and activate the next one.
+ */
+static void j1939_session_completed(struct j1939_session *session)
+{
+ struct sk_buff *se_skb;
+
+ if (!session->transmission) {
+ /* NOTE(review): se_skb is not checked for NULL here, while
+ * j1939_simple_txnext() does check the result of
+ * j1939_session_skb_get() - confirm NULL cannot happen here.
+ */
+ se_skb = j1939_session_skb_get(session);
+ /* distribute among j1939 receivers */
+ j1939_sk_recv(session->priv, se_skb);
+ consume_skb(se_skb);
+ }
+
+ j1939_session_deactivate_activate_next(session);
+}
+
+/* hrtimer callback of session->rxtimer: the session timed out.
+ *
+ * - A session already in J1939_SESSION_WAITING_ABORT is force-deactivated.
+ * - A J1939_SIMPLE session is deactivated with err = -ETIME.
+ * - Any other session that is still active is cancelled with
+ *   J1939_XTP_ABORT_TIMEOUT and the rx timer is started again for
+ *   J1939_XTP_ABORT_TIMEOUT_MS.
+ *
+ * One session reference is dropped before returning. Always returns
+ * HRTIMER_NORESTART.
+ */
+static enum hrtimer_restart j1939_tp_rxtimer(struct hrtimer *hrtimer)
+{
+ struct j1939_session *session = container_of(hrtimer,
+ struct j1939_session,
+ rxtimer);
+ struct j1939_priv *priv = session->priv;
+
+ if (session->state == J1939_SESSION_WAITING_ABORT) {
+ netdev_alert(priv->ndev, "%s: 0x%p: abort rx timeout. Force session deactivation\n",
+ __func__, session);
+
+ j1939_session_deactivate_activate_next(session);
+
+ } else if (session->skcb.addr.type == J1939_SIMPLE) {
+ netdev_alert(priv->ndev, "%s: 0x%p: Timeout. Failed to send simple message.\n",
+ __func__, session);
+
+ /* The message is probably stuck in the CAN controller and can
+ * be sent as soon as CAN bus is in working state again.
+ */
+ session->err = -ETIME;
+ j1939_session_deactivate(session);
+ } else {
+ j1939_session_list_lock(session->priv);
+ if (session->state >= J1939_SESSION_ACTIVE &&
+ session->state < J1939_SESSION_ACTIVE_MAX) {
+ netdev_alert(priv->ndev, "%s: 0x%p: rx timeout, send abort\n",
+ __func__, session);
+ /* take a reference before re-arming the timer */
+ j1939_session_get(session);
+ hrtimer_start(&session->rxtimer,
+ ms_to_ktime(J1939_XTP_ABORT_TIMEOUT_MS),
+ HRTIMER_MODE_REL_SOFT);
+ __j1939_session_cancel(session, J1939_XTP_ABORT_TIMEOUT);
+ }
+ j1939_session_list_unlock(session->priv);
+
+ if (!session->sk)
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT);
+ }
+
+ j1939_session_put(session);
+
+ return HRTIMER_NORESTART;
+}
+
+/* Check whether the PGN carried in control frame @skb differs from the PGN
+ * of the running @session.
+ *
+ * Returns false when the PGNs match. Otherwise logs a warning, sends an
+ * abort whose reason depends on the command byte (no abort is sent for BAM
+ * and ABORT commands) and returns true.
+ */
+static bool j1939_xtp_rx_cmd_bad_pgn(struct j1939_session *session,
+ const struct sk_buff *skb)
+{
+ const struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ pgn_t pgn = j1939_xtp_ctl_to_pgn(skb->data);
+ struct j1939_priv *priv = session->priv;
+ enum j1939_xtp_abort abort = J1939_XTP_NO_ABORT;
+ u8 cmd = skb->data[0];
+
+ if (session->skcb.addr.pgn == pgn)
+ return false;
+
+ /* map the received command to the abort reason to answer with */
+ switch (cmd) {
+ case J1939_TP_CMD_BAM:
+ abort = J1939_XTP_NO_ABORT;
+ break;
+
+ case J1939_ETP_CMD_RTS:
+ fallthrough;
+ case J1939_TP_CMD_RTS:
+ abort = J1939_XTP_ABORT_BUSY;
+ break;
+
+ case J1939_ETP_CMD_CTS:
+ fallthrough;
+ case J1939_TP_CMD_CTS:
+ abort = J1939_XTP_ABORT_ECTS_UNXPECTED_PGN;
+ break;
+
+ case J1939_ETP_CMD_DPO:
+ abort = J1939_XTP_ABORT_BAD_EDPO_PGN;
+ break;
+
+ case J1939_ETP_CMD_EOMA:
+ fallthrough;
+ case J1939_TP_CMD_EOMA:
+ abort = J1939_XTP_ABORT_OTHER;
+ break;
+
+ case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */
+ abort = J1939_XTP_NO_ABORT;
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ netdev_warn(priv->ndev, "%s: 0x%p: CMD 0x%02x with PGN 0x%05x for running session with different PGN 0x%05x.\n",
+ __func__, session, cmd, pgn, session->skcb.addr.pgn);
+ /* the abort is addressed using the received frame's skcb and PGN */
+ if (abort != J1939_XTP_NO_ABORT)
+ j1939_xtp_tx_abort(priv, skcb, true, abort, pgn);
+
+ return true;
+}
+
+/* Handle a received abort frame for the one session found by
+ * j1939_session_get_by_addr() with the given @reverse and @transmitter
+ * lookup flags.
+ *
+ * Nothing happens when no session matches or when the frame's PGN differs
+ * from the session's. Otherwise the session timers are cancelled, the abort
+ * reason from skb->data[1] is stored as session->err, the error is reported
+ * (to the socket if the session has one, else via the error queue) and the
+ * session is deactivated.
+ */
+static void j1939_xtp_rx_abort_one(struct j1939_priv *priv, struct sk_buff *skb,
+ bool reverse, bool transmitter)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+ u8 abort = skb->data[1];
+
+ session = j1939_session_get_by_addr(priv, &skcb->addr, reverse,
+ transmitter);
+ if (!session)
+ return;
+
+ if (j1939_xtp_rx_cmd_bad_pgn(session, skb))
+ goto abort_put;
+
+ netdev_info(priv->ndev, "%s: 0x%p: 0x%05x: (%u) %s\n", __func__,
+ session, j1939_xtp_ctl_to_pgn(skb->data), abort,
+ j1939_xtp_abort_to_str(abort));
+
+ j1939_session_timers_cancel(session);
+ session->err = j1939_xtp_abort_to_errno(priv, abort);
+ if (session->sk)
+ j1939_sk_send_loop_abort(session->sk, session->err);
+ else
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_ABORT);
+ j1939_session_deactivate_activate_next(session);
+
+abort_put:
+ /* drop the reference obtained by the lookup above */
+ j1939_session_put(session);
+}
+
+/* abort packets may come in 2 directions: try the session lookup once with
+ * the addresses as received and once reversed
+ */
+static void
+j1939_xtp_rx_abort(struct j1939_priv *priv, struct sk_buff *skb,
+ bool transmitter)
+{
+ j1939_xtp_rx_abort_one(priv, skb, false, transmitter);
+ j1939_xtp_rx_abort_one(priv, skb, true, transmitter);
+}
+
+/* Handle a received EOMA control frame for @session.
+ *
+ * The frame is ignored when its PGN does not match the session. A size that
+ * differs from session->total_message_size is only warned about (once); the
+ * session is completed in either case.
+ */
+static void
+j1939_xtp_rx_eoma_one(struct j1939_session *session, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ const u8 *dat;
+ int len;
+
+ if (j1939_xtp_rx_cmd_bad_pgn(session, skb))
+ return;
+
+ dat = skb->data;
+
+ if (skcb->addr.type == J1939_ETP)
+ len = j1939_etp_ctl_to_size(dat);
+ else
+ len = j1939_tp_ctl_to_size(dat);
+
+ if (session->total_message_size != len) {
+ netdev_warn_once(session->priv->ndev,
+ "%s: 0x%p: Incorrect size. Expected: %i; got: %i.\n",
+ __func__, session, session->total_message_size,
+ len);
+ }
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ /* mark every packet as acknowledged */
+ session->pkt.tx_acked = session->pkt.total;
+ j1939_session_timers_cancel(session);
+ /* transmitted without problems */
+ j1939_session_completed(session);
+}
+
+/* Look up the session for a received EOMA frame (reverse address lookup)
+ * and process the frame; does nothing when no session is found.
+ */
+static void
+j1939_xtp_rx_eoma(struct j1939_priv *priv, struct sk_buff *skb,
+ bool transmitter)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+
+ session = j1939_session_get_by_addr(priv, &skcb->addr, true,
+ transmitter);
+ if (!session)
+ return;
+
+ j1939_xtp_rx_eoma_one(session, skb);
+ /* drop the reference obtained by the lookup */
+ j1939_session_put(session);
+}
+
+/* Handle a received CTS control frame for @session.
+ *
+ * dat[1] is the number of packets granted and the next packet number comes
+ * from dat[2] (TP) or j1939_etp_ctl_to_packet() (ETP). The session is
+ * cancelled when the previous command was the same CTS command
+ * (J1939_XTP_ABORT_DUP_SEQ), when the packet number is 0, or when dat[1]
+ * exceeds session->pkt.block (both J1939_XTP_ABORT_FAULT). A frame whose PGN
+ * does not match the session is ignored.
+ */
+static void
+j1939_xtp_rx_cts_one(struct j1939_session *session, struct sk_buff *skb)
+{
+ enum j1939_xtp_abort err = J1939_XTP_ABORT_FAULT;
+ unsigned int pkt;
+ const u8 *dat;
+
+ dat = skb->data;
+
+ if (j1939_xtp_rx_cmd_bad_pgn(session, skb))
+ return;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ if (session->last_cmd == dat[0]) {
+ err = J1939_XTP_ABORT_DUP_SEQ;
+ goto out_session_cancel;
+ }
+
+ if (session->skcb.addr.type == J1939_ETP)
+ pkt = j1939_etp_ctl_to_packet(dat);
+ else
+ pkt = dat[2];
+
+ if (!pkt)
+ goto out_session_cancel;
+ else if (dat[1] > session->pkt.block /* 0xff for etp */)
+ goto out_session_cancel;
+
+ /* set packet counters only when not CTS(0) */
+ session->pkt.tx_acked = pkt - 1;
+ j1939_session_skb_drop_old(session);
+ session->pkt.last = session->pkt.tx_acked + dat[1];
+ if (session->pkt.last > session->pkt.total)
+ /* safety measure */
+ session->pkt.last = session->pkt.total;
+ /* TODO: do not set tx here, do it in txtimer */
+ session->pkt.tx = session->pkt.tx_acked;
+
+ session->last_cmd = dat[0];
+ if (dat[1]) {
+ j1939_tp_set_rxtimeout(session, 1250);
+ if (session->transmission) {
+ /* no TX_SCHED event for the first CTS (tx_acked == 0) */
+ if (session->pkt.tx_acked)
+ j1939_sk_errqueue(session,
+ J1939_ERRQUEUE_TX_SCHED);
+ /* restart the tx timer to send the granted packets now */
+ j1939_session_txtimer_cancel(session);
+ j1939_tp_schedule_txtimer(session, 0);
+ }
+ } else {
+ /* CTS(0) */
+ j1939_tp_set_rxtimeout(session, 550);
+ }
+ return;
+
+ out_session_cancel:
+ j1939_session_timers_cancel(session);
+ j1939_session_cancel(session, err);
+}
+
+/* Look up the session for a received CTS frame (reverse address lookup)
+ * and process the frame; does nothing when no session is found.
+ */
+static void
+j1939_xtp_rx_cts(struct j1939_priv *priv, struct sk_buff *skb, bool transmitter)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+
+ session = j1939_session_get_by_addr(priv, &skcb->addr, true,
+ transmitter);
+ if (!session)
+ return;
+ j1939_xtp_rx_cts_one(session, skb);
+ /* drop the reference obtained by the lookup */
+ j1939_session_put(session);
+}
+
+/* Allocate and initialise a session in state J1939_SESSION_NEW.
+ *
+ * The new session takes a reference on @priv, queues @skb (with an extra
+ * reference from skb_get()) on its skb_queue, copies the skb control block
+ * into session->skcb and sets up both hrtimers without starting them.
+ *
+ * Returns the session, or NULL if the allocation fails.
+ */
+static struct j1939_session *j1939_session_new(struct j1939_priv *priv,
+ struct sk_buff *skb, size_t size)
+{
+ struct j1939_session *session;
+ struct j1939_sk_buff_cb *skcb;
+
+ session = kzalloc(sizeof(*session), gfp_any());
+ if (!session)
+ return NULL;
+
+ INIT_LIST_HEAD(&session->active_session_list_entry);
+ INIT_LIST_HEAD(&session->sk_session_queue_entry);
+ kref_init(&session->kref);
+
+ j1939_priv_get(priv);
+ session->priv = priv;
+ session->total_message_size = size;
+ session->state = J1939_SESSION_NEW;
+
+ skb_queue_head_init(&session->skb_queue);
+ skb_queue_tail(&session->skb_queue, skb_get(skb));
+
+ skcb = j1939_skb_to_cb(skb);
+ memcpy(&session->skcb, skcb, sizeof(session->skcb));
+
+ hrtimer_setup(&session->txtimer, j1939_tp_txtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+ hrtimer_setup(&session->rxtimer, j1939_tp_rxtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+
+ netdev_dbg(priv->ndev, "%s: 0x%p: sa: %02x, da: %02x\n",
+ __func__, session, skcb->addr.sa, skcb->addr.da);
+
+ return session;
+}
+
+/* Create a session together with a freshly allocated skb that provides
+ * @size bytes of data area; the skb control block is copied from @rel_skcb.
+ *
+ * Returns the session, or NULL when the skb or the session cannot be
+ * allocated (the skb is freed in the latter case).
+ */
+static struct
+j1939_session *j1939_session_fresh_new(struct j1939_priv *priv,
+ int size,
+ const struct j1939_sk_buff_cb *rel_skcb)
+{
+ struct sk_buff *skb;
+ struct j1939_sk_buff_cb *skcb;
+ struct j1939_session *session;
+
+ skb = alloc_skb(size + sizeof(struct can_skb_priv), GFP_ATOMIC);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb->dev = priv->ndev;
+ can_skb_reserve(skb);
+ can_skb_prv(skb)->ifindex = priv->ndev->ifindex;
+ can_skb_prv(skb)->skbcnt = 0;
+ skcb = j1939_skb_to_cb(skb);
+ memcpy(skcb, rel_skcb, sizeof(*skcb));
+
+ session = j1939_session_new(priv, skb, size);
+ if (!session) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ /* alloc data area */
+ skb_put(skb, size);
+ /* skb is recounted in j1939_session_new() */
+ return session;
+}
+
+/* Put @session on priv->active_session_list and mark it
+ * J1939_SESSION_ACTIVE; the list holds its own session reference.
+ *
+ * For sessions other than J1939_SIMPLE the list is first searched for a
+ * session with the same address and direction; if one exists nothing is
+ * changed and -EAGAIN is returned. Returns 0 on success.
+ */
+int j1939_session_activate(struct j1939_session *session)
+{
+ struct j1939_priv *priv = session->priv;
+ struct j1939_session *active = NULL;
+ int ret = 0;
+
+ j1939_session_list_lock(priv);
+ if (session->skcb.addr.type != J1939_SIMPLE)
+ active = j1939_session_get_by_addr_locked(priv,
+ &priv->active_session_list,
+ &session->skcb.addr, false,
+ session->transmission);
+ if (active) {
+ /* only existence matters; drop the lookup reference */
+ j1939_session_put(active);
+ ret = -EAGAIN;
+ } else {
+ WARN_ON_ONCE(session->state != J1939_SESSION_NEW);
+ list_add_tail(&session->active_session_list_entry,
+ &priv->active_session_list);
+ j1939_session_get(session);
+ session->state = J1939_SESSION_ACTIVE;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n",
+ __func__, session);
+ }
+ j1939_session_list_unlock(priv);
+
+ return ret;
+}
+
+/* Create and activate a receive session for the connection-opening control
+ * frame in @skb.
+ *
+ * The announced message size is validated first; on a bad size or when the
+ * session cannot be allocated an abort frame is sent. Returns the activated
+ * session, or NULL when no socket matches, the size is rejected, allocation
+ * fails or a session with the same address is already active.
+ */
+static struct
+j1939_session *j1939_xtp_rx_rts_session_new(struct j1939_priv *priv,
+ struct sk_buff *skb)
+{
+ enum j1939_xtp_abort abort = J1939_XTP_NO_ABORT;
+ struct j1939_sk_buff_cb skcb = *j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+ const u8 *dat;
+ int len, ret;
+ pgn_t pgn;
+
+ netdev_dbg(priv->ndev, "%s\n", __func__);
+
+ dat = skb->data;
+ pgn = j1939_xtp_ctl_to_pgn(dat);
+ /* work on a local copy of the skcb carrying the PGN from the frame */
+ skcb.addr.pgn = pgn;
+
+ if (!j1939_sk_recv_match(priv, &skcb))
+ return NULL;
+
+ if (skcb.addr.type == J1939_ETP) {
+ len = j1939_etp_ctl_to_size(dat);
+ if (len > J1939_MAX_ETP_PACKET_SIZE)
+ abort = J1939_XTP_ABORT_FAULT;
+ else if (len > priv->tp_max_packet_size)
+ abort = J1939_XTP_ABORT_RESOURCE;
+ /* sizes that fit into plain TP are rejected for ETP */
+ else if (len <= J1939_MAX_TP_PACKET_SIZE)
+ abort = J1939_XTP_ABORT_FAULT;
+ } else {
+ len = j1939_tp_ctl_to_size(dat);
+ if (len > J1939_MAX_TP_PACKET_SIZE)
+ abort = J1939_XTP_ABORT_FAULT;
+ else if (len > priv->tp_max_packet_size)
+ abort = J1939_XTP_ABORT_RESOURCE;
+ else if (len < J1939_MIN_TP_PACKET_SIZE)
+ abort = J1939_XTP_ABORT_FAULT;
+ }
+
+ if (abort != J1939_XTP_NO_ABORT) {
+ j1939_xtp_tx_abort(priv, &skcb, true, abort, pgn);
+ return NULL;
+ }
+
+ session = j1939_session_fresh_new(priv, len, &skcb);
+ if (!session) {
+ j1939_xtp_tx_abort(priv, &skcb, true,
+ J1939_XTP_ABORT_RESOURCE, pgn);
+ return NULL;
+ }
+
+ /* initialize the control buffer: plain copy */
+ /* each packet carries up to 7 data bytes */
+ session->pkt.total = (len + 6) / 7;
+ session->pkt.block = 0xff;
+ if (skcb.addr.type != J1939_ETP) {
+ /* for TP the packet count announced in dat[3] is used, even
+ * if it differs from the value computed from the size
+ */
+ if (dat[3] != session->pkt.total)
+ netdev_alert(priv->ndev, "%s: 0x%p: strange total, %u != %u\n",
+ __func__, session, session->pkt.total,
+ dat[3]);
+ session->pkt.total = dat[3];
+ session->pkt.block = min(dat[3], dat[4]);
+ }
+
+ session->pkt.rx = 0;
+ session->pkt.tx = 0;
+
+ session->tskey = priv->rx_tskey++;
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_RTS);
+
+ ret = j1939_session_activate(session);
+ if (ret) {
+ /* Entering this scope indicates an issue with the J1939 bus.
+ * Possible scenarios include:
+ * - A time lapse occurred, and a new session was initiated
+ * due to another packet being sent correctly. This could
+ * have been caused by too long interrupt, debugger, or being
+ * out-scheduled by another task.
+ * - The bus is receiving numerous erroneous packets, either
+ * from a malfunctioning device or during a test scenario.
+ */
+ netdev_alert(priv->ndev, "%s: 0x%p: concurrent session with same addr (%02x %02x) is already active.\n",
+ __func__, session, skcb.addr.sa, skcb.addr.da);
+ j1939_session_put(session);
+ return NULL;
+ }
+
+ return session;
+}
+
+/* Handle a connection-opening control frame that matches an already
+ * existing @session.
+ *
+ * Returns -EBUSY when the frame's PGN does not match a receiving session or
+ * when the session has already seen a command (last_cmd != 0); in the
+ * latter case the session is cancelled with J1939_XTP_ABORT_BUSY. Returns 0
+ * after copying the frame's source and destination addresses into the
+ * session.
+ */
+static int j1939_xtp_rx_rts_session_active(struct j1939_session *session,
+ struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_priv *priv = session->priv;
+
+ if (!session->transmission) {
+ if (j1939_xtp_rx_cmd_bad_pgn(session, skb))
+ return -EBUSY;
+
+ /* RTS on active session */
+ j1939_session_timers_cancel(session);
+ j1939_session_cancel(session, J1939_XTP_ABORT_BUSY);
+ }
+
+ if (session->last_cmd != 0) {
+ /* we received a second rts on the same connection */
+ netdev_alert(priv->ndev, "%s: 0x%p: connection exists (%02x %02x). last cmd: %x\n",
+ __func__, session, skcb->addr.sa, skcb->addr.da,
+ session->last_cmd);
+
+ j1939_session_timers_cancel(session);
+ j1939_session_cancel(session, J1939_XTP_ABORT_BUSY);
+ if (session->transmission)
+ j1939_session_deactivate_activate_next(session);
+
+ return -EBUSY;
+ }
+
+ if (session->skcb.addr.sa != skcb->addr.sa ||
+ session->skcb.addr.da != skcb->addr.da)
+ netdev_warn(priv->ndev, "%s: 0x%p: session->skcb.addr.sa=0x%02x skcb->addr.sa=0x%02x session->skcb.addr.da=0x%02x skcb->addr.da=0x%02x\n",
+ __func__, session,
+ session->skcb.addr.sa, skcb->addr.sa,
+ session->skcb.addr.da, skcb->addr.da);
+ /* make sure 'sa' & 'da' are correct !
+ * They may be 'not filled in yet' for sending
+ * skb's, since they did not pass the Address Claim ever.
+ */
+ session->skcb.addr.sa = skcb->addr.sa;
+ session->skcb.addr.da = skcb->addr.da;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ return 0;
+}
+
+/* Handle a received connection-opening control frame (RTS or BAM).
+ *
+ * An existing session is checked with j1939_xtp_rx_rts_session_active().
+ * When there is none, a receive session is created unless @transmitter is
+ * set. On success the command is stored in session->last_cmd and the
+ * timers are armed: BAM only gets an rx timeout on the receiving side,
+ * while RTS makes the receiving side schedule the tx timer immediately.
+ */
+static void j1939_xtp_rx_rts(struct j1939_priv *priv, struct sk_buff *skb,
+ bool transmitter)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+ u8 cmd = skb->data[0];
+
+ session = j1939_session_get_by_addr(priv, &skcb->addr, false,
+ transmitter);
+
+ if (!session) {
+ if (transmitter) {
+ /* If we're the transmitter and this function is called,
+ * we received our own RTS. A session has already been
+ * created.
+ *
+ * For some reasons however it might have been destroyed
+ * already. So don't create a new one here (using
+ * "j1939_xtp_rx_rts_session_new()") as this will be a
+ * receiver session.
+ *
+ * The reasons the session is already destroyed might
+ * be:
+ * - user space closed the socket and the session was
+ * aborted
+ * - session was aborted due to external abort message
+ */
+ return;
+ }
+ session = j1939_xtp_rx_rts_session_new(priv, skb);
+ if (!session) {
+ if (cmd == J1939_TP_CMD_BAM && j1939_sk_recv_match(priv, skcb))
+ netdev_info(priv->ndev, "%s: failed to create TP BAM session\n",
+ __func__);
+ return;
+ }
+ } else {
+ if (j1939_xtp_rx_rts_session_active(session, skb)) {
+ j1939_session_put(session);
+ return;
+ }
+ }
+ session->last_cmd = cmd;
+
+ if (cmd == J1939_TP_CMD_BAM) {
+ if (!session->transmission)
+ j1939_tp_set_rxtimeout(session, 750);
+ } else {
+ if (!session->transmission) {
+ j1939_session_txtimer_cancel(session);
+ j1939_tp_schedule_txtimer(session, 0);
+ }
+ j1939_tp_set_rxtimeout(session, 1250);
+ }
+
+ j1939_session_put(session);
+}
+
+/* Handle a received ETP Data Packet Offset (DPO) frame for @session: store
+ * the offset in session->pkt.dpo, record the command and re-arm the rx
+ * timeout. A frame whose PGN does not match the session is ignored.
+ */
+static void j1939_xtp_rx_dpo_one(struct j1939_session *session,
+ struct sk_buff *skb)
+{
+ const u8 *dat = skb->data;
+
+ if (j1939_xtp_rx_cmd_bad_pgn(session, skb))
+ return;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session);
+
+ /* transmitted without problems */
+ session->pkt.dpo = j1939_etp_ctl_to_packet(skb->data);
+ session->last_cmd = dat[0];
+ j1939_tp_set_rxtimeout(session, 750);
+
+ /* only the receiving side reports the DPO on the error queue */
+ if (!session->transmission)
+ j1939_sk_errqueue(session, J1939_ERRQUEUE_RX_DPO);
+}
+
+/* Look up the session for a received DPO frame and process the frame; logs
+ * an info message when no session is found.
+ */
+static void j1939_xtp_rx_dpo(struct j1939_priv *priv, struct sk_buff *skb,
+ bool transmitter)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+
+ session = j1939_session_get_by_addr(priv, &skcb->addr, false,
+ transmitter);
+ if (!session) {
+ netdev_info(priv->ndev,
+ "%s: no connection found\n", __func__);
+ return;
+ }
+
+ j1939_xtp_rx_dpo_one(session, skb);
+ /* drop the reference obtained by the lookup */
+ j1939_session_put(session);
+}
+
+/* Process one received (E)TP data frame for @session.
+ *
+ * On the receiving side the up to 7 payload bytes are copied into the
+ * session skb; on the transmitting side the looped-back payload is only
+ * compared against the data that was sent. Afterwards the timers are
+ * re-armed depending on whether the transfer is complete, still running or
+ * needs a CTS/EOMA from us. Any inconsistency cancels the session.
+ *
+ * This function drops one reference on @session on every path, so callers
+ * must not put the session themselves after calling it.
+ */
+static void j1939_xtp_rx_dat_one(struct j1939_session *session,
+ struct sk_buff *skb)
+{
+ enum j1939_xtp_abort abort = J1939_XTP_ABORT_FAULT;
+ struct j1939_priv *priv = session->priv;
+ struct j1939_sk_buff_cb *skcb, *se_skcb;
+ struct sk_buff *se_skb = NULL;
+ const u8 *dat;
+ u8 *tpdat;
+ int offset;
+ int nbytes;
+ bool final = false;
+ bool remain = false;
+ bool do_cts_eoma = false;
+ int packet;
+
+ skcb = j1939_skb_to_cb(skb);
+ dat = skb->data;
+ if (skb->len != 8) {
+ /* makes no sense */
+ abort = J1939_XTP_ABORT_UNEXPECTED_DATA;
+ goto out_session_cancel;
+ }
+
+ /* Data is accepted after 0xff (previous data) or DPO for any frame
+ * type, and after BAM or TP CTS only when the frame is not ETP. Every
+ * other last_cmd cancels the session.
+ */
+ switch (session->last_cmd) {
+ case 0xff:
+ break;
+ case J1939_ETP_CMD_DPO:
+ if (skcb->addr.type == J1939_ETP)
+ break;
+ fallthrough;
+ case J1939_TP_CMD_BAM:
+ fallthrough;
+ case J1939_TP_CMD_CTS:
+ if (skcb->addr.type != J1939_ETP)
+ break;
+ fallthrough;
+ default:
+ netdev_info(priv->ndev, "%s: 0x%p: last %02x\n", __func__,
+ session, session->last_cmd);
+ goto out_session_cancel;
+ }
+
+ /* zero-based packet index: sequence number from dat[0] plus the
+ * data packet offset
+ */
+ packet = (dat[0] - 1 + session->pkt.dpo);
+ if (packet > session->pkt.total ||
+ (session->pkt.rx + 1) > session->pkt.total) {
+ netdev_info(priv->ndev, "%s: 0x%p: should have been completed\n",
+ __func__, session);
+ goto out_session_cancel;
+ }
+
+ se_skb = j1939_session_skb_get_by_offset(session, packet * 7);
+ if (!se_skb) {
+ netdev_warn(priv->ndev, "%s: 0x%p: no skb found\n", __func__,
+ session);
+ goto out_session_cancel;
+ }
+
+ se_skcb = j1939_skb_to_cb(se_skb);
+ /* byte position of this packet inside se_skb, and the number of
+ * bytes to handle (at most 7 per packet)
+ */
+ offset = packet * 7 - se_skcb->offset;
+ nbytes = se_skb->len - offset;
+ if (nbytes > 7)
+ nbytes = 7;
+ if (nbytes <= 0 || (nbytes + 1) > skb->len) {
+ netdev_info(priv->ndev, "%s: 0x%p: nbytes %i, len %i\n",
+ __func__, session, nbytes, skb->len);
+ goto out_session_cancel;
+ }
+
+ tpdat = se_skb->data;
+ if (!session->transmission) {
+ /* receiver: store the payload */
+ memcpy(&tpdat[offset], &dat[1], nbytes);
+ } else {
+ int err;
+
+ /* transmitter: only verify the looped-back payload */
+ err = memcmp(&tpdat[offset], &dat[1], nbytes);
+ if (err)
+ netdev_err_once(priv->ndev,
+ "%s: 0x%p: Data of RX-looped back packet (%*ph) doesn't match TX data (%*ph)!\n",
+ __func__, session,
+ nbytes, &dat[1],
+ nbytes, &tpdat[offset]);
+ }
+
+ /* the rx counter only advances for the packet that was expected next */
+ if (packet == session->pkt.rx)
+ session->pkt.rx++;
+
+ if (se_skcb->addr.type != J1939_ETP &&
+ j1939_cb_is_broadcast(&session->skcb)) {
+ if (session->pkt.rx >= session->pkt.total)
+ final = true;
+ else
+ remain = true;
+ } else {
+ /* never final, an EOMA must follow */
+ if (session->pkt.rx >= session->pkt.last)
+ do_cts_eoma = true;
+ }
+
+ if (final) {
+ j1939_session_timers_cancel(session);
+ j1939_session_completed(session);
+ } else if (remain) {
+ if (!session->transmission)
+ j1939_tp_set_rxtimeout(session, 750);
+ } else if (do_cts_eoma) {
+ j1939_tp_set_rxtimeout(session, 1250);
+ if (!session->transmission)
+ j1939_tp_schedule_txtimer(session, 0);
+ } else {
+ j1939_tp_set_rxtimeout(session, 750);
+ }
+ session->last_cmd = 0xff;
+ consume_skb(se_skb);
+ j1939_session_put(session);
+
+ return;
+
+ out_session_cancel:
+ /* se_skb may still be NULL here */
+ kfree_skb(se_skb);
+ j1939_session_timers_cancel(session);
+ j1939_session_cancel(session, abort);
+ j1939_session_put(session);
+}
+
+/* Dispatch a received data frame to the matching session(s): the transmit
+ * session when we are the transmitter, the receive session when we are the
+ * receiver, and the receive session again for broadcast frames.
+ *
+ * The session reference obtained by each lookup is released inside
+ * j1939_xtp_rx_dat_one(), so there is no put here.
+ */
+static void j1939_xtp_rx_dat(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb;
+ struct j1939_session *session;
+
+ skcb = j1939_skb_to_cb(skb);
+
+ if (j1939_tp_im_transmitter(skcb)) {
+ session = j1939_session_get_by_addr(priv, &skcb->addr, false,
+ true);
+ if (!session)
+ netdev_info(priv->ndev, "%s: no tx connection found\n",
+ __func__);
+ else
+ j1939_xtp_rx_dat_one(session, skb);
+ }
+
+ if (j1939_tp_im_receiver(skcb)) {
+ session = j1939_session_get_by_addr(priv, &skcb->addr, false,
+ false);
+ if (!session)
+ netdev_info(priv->ndev, "%s: no rx connection found\n",
+ __func__);
+ else
+ j1939_xtp_rx_dat_one(session, skb);
+ }
+
+ /* a missing session is not reported for broadcast frames */
+ if (j1939_cb_is_broadcast(skcb)) {
+ session = j1939_session_get_by_addr(priv, &skcb->addr, false,
+ false);
+ if (session)
+ j1939_xtp_rx_dat_one(session, skb);
+ }
+}
+
+/* j1939 main intf */
+/* Prepare a transmit session for sending @size bytes of @skb.
+ *
+ * The transfer type is chosen from @size: J1939_SIMPLE up to 8 bytes,
+ * J1939_ETP above J1939_MAX_TP_PACKET_SIZE, J1939_TP otherwise. The session
+ * is created but not activated here.
+ *
+ * Returns the new session or an ERR_PTR():
+ *   -EDOM          the PGN is one of the (E)TP control/data PGNs
+ *   -EMSGSIZE      @size exceeds priv->tp_max_packet_size
+ *   -EDESTADDRREQ  ETP transfer to a broadcast destination
+ *   -ENOMEM        session allocation failed
+ * or the error returned by j1939_ac_fixup().
+ */
+struct j1939_session *j1939_tp_send(struct j1939_priv *priv,
+ struct sk_buff *skb, size_t size)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ struct j1939_session *session;
+ int ret;
+
+ if (skcb->addr.pgn == J1939_TP_PGN_DAT ||
+ skcb->addr.pgn == J1939_TP_PGN_CTL ||
+ skcb->addr.pgn == J1939_ETP_PGN_DAT ||
+ skcb->addr.pgn == J1939_ETP_PGN_CTL)
+ /* avoid conflict */
+ return ERR_PTR(-EDOM);
+
+ if (size > priv->tp_max_packet_size)
+ return ERR_PTR(-EMSGSIZE);
+
+ if (size <= 8)
+ skcb->addr.type = J1939_SIMPLE;
+ else if (size > J1939_MAX_TP_PACKET_SIZE)
+ skcb->addr.type = J1939_ETP;
+ else
+ skcb->addr.type = J1939_TP;
+
+ if (skcb->addr.type == J1939_ETP &&
+ j1939_cb_is_broadcast(skcb))
+ return ERR_PTR(-EDESTADDRREQ);
+
+ /* fill in addresses from names */
+ ret = j1939_ac_fixup(priv, skb);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
+
+ /* fix DST flags, it may be used there soon */
+ if (j1939_address_is_unicast(skcb->addr.da) &&
+ priv->ents[skcb->addr.da].nusers)
+ skcb->flags |= J1939_ECU_LOCAL_DST;
+
+ /* src is always local, I'm sending ... */
+ skcb->flags |= J1939_ECU_LOCAL_SRC;
+
+ /* prepare new session */
+ session = j1939_session_new(priv, skb, size);
+ if (!session)
+ return ERR_PTR(-ENOMEM);
+
+ /* skb is recounted in j1939_session_new() */
+ sock_hold(skb->sk);
+ session->sk = skb->sk;
+ session->transmission = true;
+ /* each packet carries up to 7 data bytes */
+ session->pkt.total = (size + 6) / 7;
+ /* block size: 255 for ETP; for TP the j1939_tp_block setting (255
+ * when that is 0), limited to the total packet count
+ */
+ session->pkt.block = skcb->addr.type == J1939_ETP ? 255 :
+ min(j1939_tp_block ?: 255, session->pkt.total);
+
+ if (j1939_cb_is_broadcast(&session->skcb))
+ /* set the end-packet for broadcast */
+ session->pkt.last = session->pkt.total;
+
+ skcb->tskey = atomic_inc_return(&session->sk->sk_tskey) - 1;
+ session->tskey = skcb->tskey;
+
+ return session;
+}
+
+/* Dispatch a received (E)TP control frame by its command byte
+ * (skb->data[0]).
+ *
+ * Frames are dropped silently when the command does not belong to the
+ * frame's transport type (TP vs. ETP) or is unknown. Each handler is called
+ * once per role we have for the frame (transmitter and/or receiver).
+ */
+static void j1939_tp_cmd_recv(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+ /* transport type the command is valid for; ETP cases override it */
+ int extd = J1939_TP;
+ u8 cmd = skb->data[0];
+
+ switch (cmd) {
+ case J1939_ETP_CMD_RTS:
+ extd = J1939_ETP;
+ fallthrough;
+ case J1939_TP_CMD_BAM:
+ if (cmd == J1939_TP_CMD_BAM && !j1939_cb_is_broadcast(skcb)) {
+ netdev_err_once(priv->ndev, "%s: BAM to unicast (%02x), ignoring!\n",
+ __func__, skcb->addr.sa);
+ return;
+ }
+ fallthrough;
+ case J1939_TP_CMD_RTS:
+ if (skcb->addr.type != extd)
+ return;
+
+ if (cmd == J1939_TP_CMD_RTS && j1939_cb_is_broadcast(skcb)) {
+ netdev_alert(priv->ndev, "%s: rts without destination (%02x)\n",
+ __func__, skcb->addr.sa);
+ return;
+ }
+
+ if (j1939_tp_im_transmitter(skcb))
+ j1939_xtp_rx_rts(priv, skb, true);
+
+ if (j1939_tp_im_receiver(skcb) || j1939_cb_is_broadcast(skcb))
+ j1939_xtp_rx_rts(priv, skb, false);
+
+ break;
+
+ case J1939_ETP_CMD_CTS:
+ extd = J1939_ETP;
+ fallthrough;
+ case J1939_TP_CMD_CTS:
+ if (skcb->addr.type != extd)
+ return;
+
+ /* note: the transmitter flag passed on is the inverse of our
+ * role for this frame
+ */
+ if (j1939_tp_im_transmitter(skcb))
+ j1939_xtp_rx_cts(priv, skb, false);
+
+ if (j1939_tp_im_receiver(skcb))
+ j1939_xtp_rx_cts(priv, skb, true);
+
+ break;
+
+ case J1939_ETP_CMD_DPO:
+ if (skcb->addr.type != J1939_ETP)
+ return;
+
+ if (j1939_tp_im_transmitter(skcb))
+ j1939_xtp_rx_dpo(priv, skb, true);
+
+ if (j1939_tp_im_receiver(skcb))
+ j1939_xtp_rx_dpo(priv, skb, false);
+
+ break;
+
+ case J1939_ETP_CMD_EOMA:
+ extd = J1939_ETP;
+ fallthrough;
+ case J1939_TP_CMD_EOMA:
+ if (skcb->addr.type != extd)
+ return;
+
+ /* as for CTS, the transmitter flag is inverted here */
+ if (j1939_tp_im_transmitter(skcb))
+ j1939_xtp_rx_eoma(priv, skb, false);
+
+ if (j1939_tp_im_receiver(skcb))
+ j1939_xtp_rx_eoma(priv, skb, true);
+
+ break;
+
+ case J1939_ETP_CMD_ABORT: /* && J1939_TP_CMD_ABORT */
+ if (j1939_cb_is_broadcast(skcb)) {
+ netdev_err_once(priv->ndev, "%s: abort to broadcast (%02x), ignoring!\n",
+ __func__, skcb->addr.sa);
+ return;
+ }
+
+ if (j1939_tp_im_transmitter(skcb))
+ j1939_xtp_rx_abort(priv, skb, true);
+
+ if (j1939_tp_im_receiver(skcb))
+ j1939_xtp_rx_abort(priv, skb, false);
+
+ break;
+ default:
+ return;
+ }
+}
+
+/* Entry point for received frames that may belong to the (E)TP protocol.
+ *
+ * Returns 1 when the frame carried an (E)TP data or control PGN and was
+ * handed to the protocol handlers, 0 when it was not handled: we are not
+ * involved and it is not a broadcast, the PGN is something else, or a
+ * control frame is shorter than 8 bytes.
+ */
+int j1939_tp_recv(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_sk_buff_cb *skcb = j1939_skb_to_cb(skb);
+
+ if (!j1939_tp_im_involved_anydir(skcb) && !j1939_cb_is_broadcast(skcb))
+ return 0;
+
+ switch (skcb->addr.pgn) {
+ case J1939_ETP_PGN_DAT:
+ /* tag the frame as ETP before sharing the TP handling */
+ skcb->addr.type = J1939_ETP;
+ fallthrough;
+ case J1939_TP_PGN_DAT:
+ j1939_xtp_rx_dat(priv, skb);
+ break;
+
+ case J1939_ETP_PGN_CTL:
+ skcb->addr.type = J1939_ETP;
+ fallthrough;
+ case J1939_TP_PGN_CTL:
+ if (skb->len < 8)
+ return 0; /* Don't care. Nothing to extract here */
+
+ j1939_tp_cmd_recv(priv, skb);
+ break;
+ default:
+ return 0; /* no problem */
+ }
+ return 1; /* "I processed the message" */
+}
+
+/* Handle a received frame that originates from a local CAN_J1939 socket:
+ * find the matching simple session, cancel its timers and deactivate it.
+ *
+ * Frames without a socket, or whose socket is not AF_CAN/CAN_J1939, are
+ * ignored. A warning is logged when no session is found.
+ */
+void j1939_simple_recv(struct j1939_priv *priv, struct sk_buff *skb)
+{
+ struct j1939_session *session;
+
+ if (!skb->sk)
+ return;
+
+ if (skb->sk->sk_family != AF_CAN ||
+ skb->sk->sk_protocol != CAN_J1939)
+ return;
+
+ j1939_session_list_lock(priv);
+ session = j1939_session_get_simple(priv, skb);
+ j1939_session_list_unlock(priv);
+ if (!session) {
+ netdev_warn(priv->ndev,
+ "%s: Received already invalidated message\n",
+ __func__);
+ return;
+ }
+
+ j1939_session_timers_cancel(session);
+ j1939_session_deactivate(session);
+ /* drop the reference obtained by the lookup */
+ j1939_session_put(session);
+}
+
+/* Deactivate every active session that belongs to @sk, or all active
+ * sessions when @sk is NULL. Runs with the session list lock held.
+ *
+ * Always returns NOTIFY_DONE.
+ */
+int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk)
+{
+ struct j1939_session *session, *saved;
+
+ netdev_dbg(priv->ndev, "%s, sk: %p\n", __func__, sk);
+ j1939_session_list_lock(priv);
+ list_for_each_entry_safe(session, saved,
+ &priv->active_session_list,
+ active_session_list_entry) {
+ if (!sk || sk == session->sk) {
+ /* a session reference is dropped for each timer that
+ * was actually cancelled (return value 1)
+ */
+ if (hrtimer_try_to_cancel(&session->txtimer) == 1)
+ j1939_session_put(session);
+ if (hrtimer_try_to_cancel(&session->rxtimer) == 1)
+ j1939_session_put(session);
+
+ /* NOTE(review): positive ESHUTDOWN, while the timer
+ * callbacks in this file store negative values
+ * (-ETIME, -ENETUNREACH) - confirm the intended sign.
+ */
+ session->err = ESHUTDOWN;
+ j1939_session_deactivate_locked(session);
+ }
+ }
+ j1939_session_list_unlock(priv);
+ return NOTIFY_DONE;
+}
+
+/* Initialise the transport protocol state of @priv: the active session list
+ * with its lock, and the maximum message size, which defaults to
+ * J1939_MAX_ETP_PACKET_SIZE.
+ */
+void j1939_tp_init(struct j1939_priv *priv)
+{
+ spin_lock_init(&priv->active_session_list_lock);
+ INIT_LIST_HEAD(&priv->active_session_list);
+ priv->tp_max_packet_size = J1939_MAX_ETP_PACKET_SIZE;
+}
diff --git a/net/can/proc.c b/net/can/proc.c
index 70fea17bb04c..0938bf7dd646 100644
--- a/net/can/proc.c
+++ b/net/can/proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
/*
* proc.c - procfs support for Protocol family CAN core module
*
@@ -44,6 +45,7 @@
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/if_arp.h>
+#include <linux/can/can-ml.h>
#include <linux/can/core.h>
#include "af_can.h"
@@ -52,7 +54,6 @@
* proc filenames for the PF_CAN core
*/
-#define CAN_PROC_VERSION "version"
#define CAN_PROC_STATS "stats"
#define CAN_PROC_RESET_STATS "reset_stats"
#define CAN_PROC_RCVLIST_ALL "rcvlist_all"
@@ -77,29 +78,27 @@ static const char rx_list_name[][8] = {
static void can_init_stats(struct net *net)
{
- struct s_stats *can_stats = net->can.can_stats;
- struct s_pstats *can_pstats = net->can.can_pstats;
+ struct can_pkg_stats *pkg_stats = net->can.pkg_stats;
+ struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
/*
* This memset function is called from a timer context (when
* can_stattimer is active which is the default) OR in a process
* context (reading the proc_fs when can_stattimer is disabled).
*/
- memset(can_stats, 0, sizeof(struct s_stats));
- can_stats->jiffies_init = jiffies;
+ memset(pkg_stats, 0, sizeof(struct can_pkg_stats));
+ pkg_stats->jiffies_init = jiffies;
- can_pstats->stats_reset++;
+ rcv_lists_stats->stats_reset++;
if (user_reset) {
user_reset = 0;
- can_pstats->user_reset++;
+ rcv_lists_stats->user_reset++;
}
}
static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif,
unsigned long count)
{
- unsigned long rate;
-
if (oldjif == newjif)
return 0;
@@ -110,73 +109,76 @@ static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif,
return 99999999;
}
- rate = (count * HZ) / (newjif - oldjif);
-
- return rate;
+ return (count * HZ) / (newjif - oldjif);
}
void can_stat_update(struct timer_list *t)
{
- struct net *net = from_timer(net, t, can.can_stattimer);
- struct s_stats *can_stats = net->can.can_stats;
+ struct net *net = timer_container_of(net, t, can.stattimer);
+ struct can_pkg_stats *pkg_stats = net->can.pkg_stats;
unsigned long j = jiffies; /* snapshot */
+ long rx_frames = atomic_long_read(&pkg_stats->rx_frames);
+ long tx_frames = atomic_long_read(&pkg_stats->tx_frames);
+ long matches = atomic_long_read(&pkg_stats->matches);
+ long rx_frames_delta = atomic_long_read(&pkg_stats->rx_frames_delta);
+ long tx_frames_delta = atomic_long_read(&pkg_stats->tx_frames_delta);
+ long matches_delta = atomic_long_read(&pkg_stats->matches_delta);
+
/* restart counting in timer context on user request */
if (user_reset)
can_init_stats(net);
/* restart counting on jiffies overflow */
- if (j < can_stats->jiffies_init)
+ if (j < pkg_stats->jiffies_init)
can_init_stats(net);
/* prevent overflow in calc_rate() */
- if (can_stats->rx_frames > (ULONG_MAX / HZ))
+ if (rx_frames > (LONG_MAX / HZ))
can_init_stats(net);
/* prevent overflow in calc_rate() */
- if (can_stats->tx_frames > (ULONG_MAX / HZ))
+ if (tx_frames > (LONG_MAX / HZ))
can_init_stats(net);
/* matches overflow - very improbable */
- if (can_stats->matches > (ULONG_MAX / 100))
+ if (matches > (LONG_MAX / 100))
can_init_stats(net);
/* calc total values */
- if (can_stats->rx_frames)
- can_stats->total_rx_match_ratio = (can_stats->matches * 100) /
- can_stats->rx_frames;
+ if (rx_frames)
+ pkg_stats->total_rx_match_ratio = (matches * 100) / rx_frames;
- can_stats->total_tx_rate = calc_rate(can_stats->jiffies_init, j,
- can_stats->tx_frames);
- can_stats->total_rx_rate = calc_rate(can_stats->jiffies_init, j,
- can_stats->rx_frames);
+ pkg_stats->total_tx_rate = calc_rate(pkg_stats->jiffies_init, j,
+ tx_frames);
+ pkg_stats->total_rx_rate = calc_rate(pkg_stats->jiffies_init, j,
+ rx_frames);
/* calc current values */
- if (can_stats->rx_frames_delta)
- can_stats->current_rx_match_ratio =
- (can_stats->matches_delta * 100) /
- can_stats->rx_frames_delta;
+ if (rx_frames_delta)
+ pkg_stats->current_rx_match_ratio =
+ (matches_delta * 100) / rx_frames_delta;
- can_stats->current_tx_rate = calc_rate(0, HZ, can_stats->tx_frames_delta);
- can_stats->current_rx_rate = calc_rate(0, HZ, can_stats->rx_frames_delta);
+ pkg_stats->current_tx_rate = calc_rate(0, HZ, tx_frames_delta);
+ pkg_stats->current_rx_rate = calc_rate(0, HZ, rx_frames_delta);
/* check / update maximum values */
- if (can_stats->max_tx_rate < can_stats->current_tx_rate)
- can_stats->max_tx_rate = can_stats->current_tx_rate;
+ if (pkg_stats->max_tx_rate < pkg_stats->current_tx_rate)
+ pkg_stats->max_tx_rate = pkg_stats->current_tx_rate;
- if (can_stats->max_rx_rate < can_stats->current_rx_rate)
- can_stats->max_rx_rate = can_stats->current_rx_rate;
+ if (pkg_stats->max_rx_rate < pkg_stats->current_rx_rate)
+ pkg_stats->max_rx_rate = pkg_stats->current_rx_rate;
- if (can_stats->max_rx_match_ratio < can_stats->current_rx_match_ratio)
- can_stats->max_rx_match_ratio = can_stats->current_rx_match_ratio;
+ if (pkg_stats->max_rx_match_ratio < pkg_stats->current_rx_match_ratio)
+ pkg_stats->max_rx_match_ratio = pkg_stats->current_rx_match_ratio;
/* clear values for 'current rate' calculation */
- can_stats->tx_frames_delta = 0;
- can_stats->rx_frames_delta = 0;
- can_stats->matches_delta = 0;
+ atomic_long_set(&pkg_stats->tx_frames_delta, 0);
+ atomic_long_set(&pkg_stats->rx_frames_delta, 0);
+ atomic_long_set(&pkg_stats->matches_delta, 0);
/* restart timer (one second) */
- mod_timer(&net->can.can_stattimer, round_jiffies(jiffies + HZ));
+ mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ));
}
/*
@@ -204,67 +206,72 @@ static void can_print_recv_banner(struct seq_file *m)
* can1. 00000000 00000000 00000000
* ....... 0 tp20
*/
- seq_puts(m, " device can_id can_mask function"
- " userdata matches ident\n");
+ if (IS_ENABLED(CONFIG_64BIT))
+ seq_puts(m, " device can_id can_mask function userdata matches ident\n");
+ else
+ seq_puts(m, " device can_id can_mask function userdata matches ident\n");
}
static int can_stats_proc_show(struct seq_file *m, void *v)
{
struct net *net = m->private;
- struct s_stats *can_stats = net->can.can_stats;
- struct s_pstats *can_pstats = net->can.can_pstats;
+ struct can_pkg_stats *pkg_stats = net->can.pkg_stats;
+ struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
seq_putc(m, '\n');
- seq_printf(m, " %8ld transmitted frames (TXF)\n", can_stats->tx_frames);
- seq_printf(m, " %8ld received frames (RXF)\n", can_stats->rx_frames);
- seq_printf(m, " %8ld matched frames (RXMF)\n", can_stats->matches);
+ seq_printf(m, " %8ld transmitted frames (TXF)\n",
+ atomic_long_read(&pkg_stats->tx_frames));
+ seq_printf(m, " %8ld received frames (RXF)\n",
+ atomic_long_read(&pkg_stats->rx_frames));
+ seq_printf(m, " %8ld matched frames (RXMF)\n",
+ atomic_long_read(&pkg_stats->matches));
seq_putc(m, '\n');
- if (net->can.can_stattimer.function == can_stat_update) {
+ if (net->can.stattimer.function == can_stat_update) {
seq_printf(m, " %8ld %% total match ratio (RXMR)\n",
- can_stats->total_rx_match_ratio);
+ pkg_stats->total_rx_match_ratio);
seq_printf(m, " %8ld frames/s total tx rate (TXR)\n",
- can_stats->total_tx_rate);
+ pkg_stats->total_tx_rate);
seq_printf(m, " %8ld frames/s total rx rate (RXR)\n",
- can_stats->total_rx_rate);
+ pkg_stats->total_rx_rate);
seq_putc(m, '\n');
seq_printf(m, " %8ld %% current match ratio (CRXMR)\n",
- can_stats->current_rx_match_ratio);
+ pkg_stats->current_rx_match_ratio);
seq_printf(m, " %8ld frames/s current tx rate (CTXR)\n",
- can_stats->current_tx_rate);
+ pkg_stats->current_tx_rate);
seq_printf(m, " %8ld frames/s current rx rate (CRXR)\n",
- can_stats->current_rx_rate);
+ pkg_stats->current_rx_rate);
seq_putc(m, '\n');
seq_printf(m, " %8ld %% max match ratio (MRXMR)\n",
- can_stats->max_rx_match_ratio);
+ pkg_stats->max_rx_match_ratio);
seq_printf(m, " %8ld frames/s max tx rate (MTXR)\n",
- can_stats->max_tx_rate);
+ pkg_stats->max_tx_rate);
seq_printf(m, " %8ld frames/s max rx rate (MRXR)\n",
- can_stats->max_rx_rate);
+ pkg_stats->max_rx_rate);
seq_putc(m, '\n');
}
seq_printf(m, " %8ld current receive list entries (CRCV)\n",
- can_pstats->rcv_entries);
+ rcv_lists_stats->rcv_entries);
seq_printf(m, " %8ld maximum receive list entries (MRCV)\n",
- can_pstats->rcv_entries_max);
+ rcv_lists_stats->rcv_entries_max);
- if (can_pstats->stats_reset)
+ if (rcv_lists_stats->stats_reset)
seq_printf(m, "\n %8ld statistic resets (STR)\n",
- can_pstats->stats_reset);
+ rcv_lists_stats->stats_reset);
- if (can_pstats->user_reset)
+ if (rcv_lists_stats->user_reset)
seq_printf(m, " %8ld user statistic resets (USTR)\n",
- can_pstats->user_reset);
+ rcv_lists_stats->user_reset);
seq_putc(m, '\n');
return 0;
@@ -273,37 +280,31 @@ static int can_stats_proc_show(struct seq_file *m, void *v)
static int can_reset_stats_proc_show(struct seq_file *m, void *v)
{
struct net *net = m->private;
- struct s_pstats *can_pstats = net->can.can_pstats;
- struct s_stats *can_stats = net->can.can_stats;
+ struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
+ struct can_pkg_stats *pkg_stats = net->can.pkg_stats;
user_reset = 1;
- if (net->can.can_stattimer.function == can_stat_update) {
+ if (net->can.stattimer.function == can_stat_update) {
seq_printf(m, "Scheduled statistic reset #%ld.\n",
- can_pstats->stats_reset + 1);
+ rcv_lists_stats->stats_reset + 1);
} else {
- if (can_stats->jiffies_init != jiffies)
+ if (pkg_stats->jiffies_init != jiffies)
can_init_stats(net);
seq_printf(m, "Performed statistic reset #%ld.\n",
- can_pstats->stats_reset);
+ rcv_lists_stats->stats_reset);
}
return 0;
}
-static int can_version_proc_show(struct seq_file *m, void *v)
-{
- seq_printf(m, "%s\n", CAN_VERSION_STRING);
- return 0;
-}
-
static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx,
struct net_device *dev,
- struct can_dev_rcv_lists *d)
+ struct can_dev_rcv_lists *dev_rcv_lists)
{
- if (!hlist_empty(&d->rx[idx])) {
+ if (!hlist_empty(&dev_rcv_lists->rx[idx])) {
can_print_recv_banner(m);
- can_print_rcvlist(m, &d->rx[idx], dev);
+ can_print_rcvlist(m, &dev_rcv_lists->rx[idx], dev);
} else
seq_printf(m, " (%s: no entry)\n", DNAME(dev));
@@ -312,9 +313,9 @@ static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx,
static int can_rcvlist_proc_show(struct seq_file *m, void *v)
{
/* double cast to prevent GCC warning */
- int idx = (int)(long)PDE_DATA(m->file->f_inode);
+ int idx = (int)(long)pde_data(m->file->f_inode);
struct net_device *dev;
- struct can_dev_rcv_lists *d;
+ struct can_dev_rcv_lists *dev_rcv_lists;
struct net *net = m->private;
seq_printf(m, "\nreceive list '%s':\n", rx_list_name[idx]);
@@ -322,13 +323,16 @@ static int can_rcvlist_proc_show(struct seq_file *m, void *v)
rcu_read_lock();
/* receive list for 'all' CAN devices (dev == NULL) */
- d = net->can.can_rx_alldev_list;
- can_rcvlist_proc_show_one(m, idx, NULL, d);
+ dev_rcv_lists = net->can.rx_alldev_list;
+ can_rcvlist_proc_show_one(m, idx, NULL, dev_rcv_lists);
/* receive list for registered CAN devices */
for_each_netdev_rcu(net, dev) {
- if (dev->type == ARPHRD_CAN && dev->ml_priv)
- can_rcvlist_proc_show_one(m, idx, dev, dev->ml_priv);
+ struct can_ml_priv *can_ml = can_get_ml_priv(dev);
+
+ if (can_ml)
+ can_rcvlist_proc_show_one(m, idx, dev,
+ &can_ml->dev_rcv_lists);
}
rcu_read_unlock();
@@ -365,7 +369,7 @@ static inline void can_rcvlist_proc_show_array(struct seq_file *m,
static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v)
{
struct net_device *dev;
- struct can_dev_rcv_lists *d;
+ struct can_dev_rcv_lists *dev_rcv_lists;
struct net *net = m->private;
/* RX_SFF */
@@ -374,15 +378,18 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v)
rcu_read_lock();
/* sff receive list for 'all' CAN devices (dev == NULL) */
- d = net->can.can_rx_alldev_list;
- can_rcvlist_proc_show_array(m, NULL, d->rx_sff, ARRAY_SIZE(d->rx_sff));
+ dev_rcv_lists = net->can.rx_alldev_list;
+ can_rcvlist_proc_show_array(m, NULL, dev_rcv_lists->rx_sff,
+ ARRAY_SIZE(dev_rcv_lists->rx_sff));
/* sff receive list for registered CAN devices */
for_each_netdev_rcu(net, dev) {
- if (dev->type == ARPHRD_CAN && dev->ml_priv) {
- d = dev->ml_priv;
- can_rcvlist_proc_show_array(m, dev, d->rx_sff,
- ARRAY_SIZE(d->rx_sff));
+ struct can_ml_priv *can_ml = can_get_ml_priv(dev);
+
+ if (can_ml) {
+ dev_rcv_lists = &can_ml->dev_rcv_lists;
+ can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_sff,
+ ARRAY_SIZE(dev_rcv_lists->rx_sff));
}
}
@@ -395,7 +402,7 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v)
static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v)
{
struct net_device *dev;
- struct can_dev_rcv_lists *d;
+ struct can_dev_rcv_lists *dev_rcv_lists;
struct net *net = m->private;
/* RX_EFF */
@@ -404,15 +411,18 @@ static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v)
rcu_read_lock();
/* eff receive list for 'all' CAN devices (dev == NULL) */
- d = net->can.can_rx_alldev_list;
- can_rcvlist_proc_show_array(m, NULL, d->rx_eff, ARRAY_SIZE(d->rx_eff));
+ dev_rcv_lists = net->can.rx_alldev_list;
+ can_rcvlist_proc_show_array(m, NULL, dev_rcv_lists->rx_eff,
+ ARRAY_SIZE(dev_rcv_lists->rx_eff));
/* eff receive list for registered CAN devices */
for_each_netdev_rcu(net, dev) {
- if (dev->type == ARPHRD_CAN && dev->ml_priv) {
- d = dev->ml_priv;
- can_rcvlist_proc_show_array(m, dev, d->rx_eff,
- ARRAY_SIZE(d->rx_eff));
+ struct can_ml_priv *can_ml = can_get_ml_priv(dev);
+
+ if (can_ml) {
+ dev_rcv_lists = &can_ml->dev_rcv_lists;
+ can_rcvlist_proc_show_array(m, dev, dev_rcv_lists->rx_eff,
+ ARRAY_SIZE(dev_rcv_lists->rx_eff));
}
}
@@ -437,8 +447,6 @@ void can_init_proc(struct net *net)
}
/* own procfs entries from the AF_CAN core */
- net->can.pde_version = proc_create_net_single(CAN_PROC_VERSION, 0644,
- net->can.proc_dir, can_version_proc_show, NULL);
net->can.pde_stats = proc_create_net_single(CAN_PROC_STATS, 0644,
net->can.proc_dir, can_stats_proc_show, NULL);
net->can.pde_reset_stats = proc_create_net_single(CAN_PROC_RESET_STATS,
@@ -467,8 +475,8 @@ void can_init_proc(struct net *net)
*/
void can_remove_proc(struct net *net)
{
- if (net->can.pde_version)
- remove_proc_entry(CAN_PROC_VERSION, net->can.proc_dir);
+ if (!net->can.proc_dir)
+ return;
if (net->can.pde_stats)
remove_proc_entry(CAN_PROC_STATS, net->can.proc_dir);
@@ -494,6 +502,5 @@ void can_remove_proc(struct net *net)
if (net->can.pde_rcvlist_sff)
remove_proc_entry(CAN_PROC_RCVLIST_SFF, net->can.proc_dir);
- if (net->can.proc_dir)
- remove_proc_entry("can", net->proc_net);
+ remove_proc_entry("can", net->proc_net);
}
diff --git a/net/can/raw.c b/net/can/raw.c
index 1051eee82581..be1ef7cf4204 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -1,5 +1,5 @@
-/*
- * raw.c - Raw sockets for protocol family CAN
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
+/* raw.c - Raw sockets for protocol family CAN
*
* Copyright (c) 2002-2007 Volkswagen Group Electronic Research
* All rights reserved.
@@ -50,22 +50,22 @@
#include <linux/skbuff.h>
#include <linux/can.h>
#include <linux/can/core.h>
+#include <linux/can/dev.h> /* for can_is_canxl_dev_mtu() */
#include <linux/can/skb.h>
#include <linux/can/raw.h>
#include <net/sock.h>
#include <net/net_namespace.h>
-#define CAN_RAW_VERSION CAN_VERSION
-
MODULE_DESCRIPTION("PF_CAN raw protocol");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_AUTHOR("Urs Thuermann <urs.thuermann@volkswagen.de>");
MODULE_ALIAS("can-proto-1");
+#define RAW_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_ifindex)
+
#define MASK_ALL 0
-/*
- * A raw socket has a list of can_filters attached to it, each receiving
+/* A raw socket has a list of can_filters attached to it, each receiving
* the CAN frames matching that filter. If the filter list is empty,
* no CAN frames will be received by the socket. The default after
* opening the socket, is to have one filter which receives all frames.
@@ -75,29 +75,39 @@ MODULE_ALIAS("can-proto-1");
*/
struct uniqframe {
- int skbcnt;
const struct sk_buff *skb;
+ int skbcnt;
unsigned int join_rx_count;
};
struct raw_sock {
struct sock sk;
- int bound;
+ struct net_device *dev;
+ netdevice_tracker dev_tracker;
+ struct list_head notifier;
int ifindex;
- struct notifier_block notifier;
- int loopback;
- int recv_own_msgs;
- int fd_frames;
- int join_filters;
+ unsigned int bound:1;
+ unsigned int loopback:1;
+ unsigned int recv_own_msgs:1;
+ unsigned int fd_frames:1;
+ unsigned int xl_frames:1;
+ unsigned int join_filters:1;
+ struct can_raw_vcid_options raw_vcid_opts;
+ canid_t tx_vcid_shifted;
+ canid_t rx_vcid_shifted;
+ canid_t rx_vcid_mask_shifted;
+ can_err_mask_t err_mask;
int count; /* number of active filters */
struct can_filter dfilter; /* default/single filter */
struct can_filter *filter; /* pointer to filter(s) */
- can_err_mask_t err_mask;
struct uniqframe __percpu *uniq;
};
-/*
- * Return pointer to store the extra msg flags for raw_recvmsg().
+static LIST_HEAD(raw_notifier_list);
+static DEFINE_SPINLOCK(raw_notifier_lock);
+static struct raw_sock *raw_busy_notifier;
+
+/* Return pointer to store the extra msg flags for raw_recvmsg().
* We use the space of one unsigned int beyond the 'struct sockaddr_can'
* in skb->cb.
*/
@@ -119,6 +129,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
{
struct sock *sk = (struct sock *)data;
struct raw_sock *ro = raw_sk(sk);
+ enum skb_drop_reason reason;
struct sockaddr_can *addr;
struct sk_buff *skb;
unsigned int *pflags;
@@ -127,21 +138,40 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
if (!ro->recv_own_msgs && oskb->sk == sk)
return;
- /* do not pass non-CAN2.0 frames to a legacy socket */
- if (!ro->fd_frames && oskb->len != CAN_MTU)
+ /* make sure to not pass oversized frames to the socket */
+ if (!ro->fd_frames && can_is_canfd_skb(oskb))
return;
+ if (can_is_canxl_skb(oskb)) {
+ struct canxl_frame *cxl = (struct canxl_frame *)oskb->data;
+
+ /* make sure to not pass oversized frames to the socket */
+ if (!ro->xl_frames)
+ return;
+
+ /* filter CAN XL VCID content */
+ if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_RX_FILTER) {
+ /* apply VCID filter if user enabled the filter */
+ if ((cxl->prio & ro->rx_vcid_mask_shifted) !=
+ (ro->rx_vcid_shifted & ro->rx_vcid_mask_shifted))
+ return;
+ } else {
+ /* no filter => do not forward VCID tagged frames */
+ if (cxl->prio & CANXL_VCID_MASK)
+ return;
+ }
+ }
+
/* eliminate multiple filter matches for the same skb */
if (this_cpu_ptr(ro->uniq)->skb == oskb &&
this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) {
- if (ro->join_filters) {
- this_cpu_inc(ro->uniq->join_rx_count);
- /* drop frame until all enabled filters matched */
- if (this_cpu_ptr(ro->uniq)->join_rx_count < ro->count)
- return;
- } else {
+ if (!ro->join_filters)
+ return;
+
+ this_cpu_inc(ro->uniq->join_rx_count);
+ /* drop frame until all enabled filters matched */
+ if (this_cpu_ptr(ro->uniq)->join_rx_count < ro->count)
return;
- }
} else {
this_cpu_ptr(ro->uniq)->skb = oskb;
this_cpu_ptr(ro->uniq)->skbcnt = can_skb_prv(oskb)->skbcnt;
@@ -156,17 +186,16 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
if (!skb)
return;
- /*
- * Put the datagram to the queue so that raw_recvmsg() can
- * get it from there. We need to pass the interface index to
- * raw_recvmsg(). We pass a whole struct sockaddr_can in skb->cb
- * containing the interface index.
+ /* Put the datagram to the queue so that raw_recvmsg() can get
+ * it from there. We need to pass the interface index to
+ * raw_recvmsg(). We pass a whole struct sockaddr_can in
+ * skb->cb containing the interface index.
*/
sock_skb_cb_check_size(sizeof(struct sockaddr_can));
addr = (struct sockaddr_can *)skb->cb;
memset(addr, 0, sizeof(*addr));
- addr->can_family = AF_CAN;
+ addr->can_family = AF_CAN;
addr->can_ifindex = skb->dev->ifindex;
/* add CAN specific message flags for raw_recvmsg() */
@@ -177,8 +206,8 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
if (oskb->sk == sk)
*pflags |= MSG_CONFIRM;
- if (sock_queue_rcv_skb(sk, skb) < 0)
- kfree_skb(skb);
+ if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0)
+ sk_skb_reason_drop(sk, skb, reason);
}
static int raw_enable_filters(struct net *net, struct net_device *dev,
@@ -266,50 +295,68 @@ static int raw_enable_allfilters(struct net *net, struct net_device *dev,
return err;
}
-static int raw_notifier(struct notifier_block *nb,
- unsigned long msg, void *ptr)
+static void raw_notify(struct raw_sock *ro, unsigned long msg,
+ struct net_device *dev)
{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct raw_sock *ro = container_of(nb, struct raw_sock, notifier);
struct sock *sk = &ro->sk;
if (!net_eq(dev_net(dev), sock_net(sk)))
- return NOTIFY_DONE;
-
- if (dev->type != ARPHRD_CAN)
- return NOTIFY_DONE;
+ return;
- if (ro->ifindex != dev->ifindex)
- return NOTIFY_DONE;
+ if (ro->dev != dev)
+ return;
switch (msg) {
-
case NETDEV_UNREGISTER:
lock_sock(sk);
/* remove current filters & unregister */
- if (ro->bound)
+ if (ro->bound) {
raw_disable_allfilters(dev_net(dev), dev, sk);
+ netdev_put(dev, &ro->dev_tracker);
+ }
if (ro->count > 1)
kfree(ro->filter);
ro->ifindex = 0;
- ro->bound = 0;
- ro->count = 0;
+ ro->bound = 0;
+ ro->dev = NULL;
+ ro->count = 0;
release_sock(sk);
sk->sk_err = ENODEV;
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_error_report(sk);
+ sk_error_report(sk);
break;
case NETDEV_DOWN:
sk->sk_err = ENETDOWN;
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_error_report(sk);
+ sk_error_report(sk);
break;
}
+}
+
+static int raw_notifier(struct notifier_block *nb, unsigned long msg,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (dev->type != ARPHRD_CAN)
+ return NOTIFY_DONE;
+ if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN)
+ return NOTIFY_DONE;
+ if (unlikely(raw_busy_notifier)) /* Check for reentrant bug. */
+ return NOTIFY_DONE;
+ spin_lock(&raw_notifier_lock);
+ list_for_each_entry(raw_busy_notifier, &raw_notifier_list, notifier) {
+ spin_unlock(&raw_notifier_lock);
+ raw_notify(raw_busy_notifier, msg, dev);
+ spin_lock(&raw_notifier_lock);
+ }
+ raw_busy_notifier = NULL;
+ spin_unlock(&raw_notifier_lock);
return NOTIFY_DONE;
}
@@ -319,6 +366,7 @@ static int raw_init(struct sock *sk)
ro->bound = 0;
ro->ifindex = 0;
+ ro->dev = NULL;
/* set default filter to single entry dfilter */
ro->dfilter.can_id = 0;
@@ -330,6 +378,7 @@ static int raw_init(struct sock *sk)
ro->loopback = 1;
ro->recv_own_msgs = 0;
ro->fd_frames = 0;
+ ro->xl_frames = 0;
ro->join_filters = 0;
/* alloc_percpu provides zero'ed memory */
@@ -338,9 +387,9 @@ static int raw_init(struct sock *sk)
return -ENOMEM;
/* set notifier */
- ro->notifier.notifier_call = raw_notifier;
-
- register_netdevice_notifier(&ro->notifier);
+ spin_lock(&raw_notifier_lock);
+ list_add_tail(&ro->notifier, &raw_notifier_list);
+ spin_unlock(&raw_notifier_lock);
return 0;
}
@@ -349,79 +398,89 @@ static int raw_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct raw_sock *ro;
+ struct net *net;
if (!sk)
return 0;
ro = raw_sk(sk);
+ net = sock_net(sk);
- unregister_netdevice_notifier(&ro->notifier);
+ spin_lock(&raw_notifier_lock);
+ while (raw_busy_notifier == ro) {
+ spin_unlock(&raw_notifier_lock);
+ schedule_timeout_uninterruptible(1);
+ spin_lock(&raw_notifier_lock);
+ }
+ list_del(&ro->notifier);
+ spin_unlock(&raw_notifier_lock);
+ rtnl_lock();
lock_sock(sk);
/* remove current filters & unregister */
if (ro->bound) {
- if (ro->ifindex) {
- struct net_device *dev;
-
- dev = dev_get_by_index(sock_net(sk), ro->ifindex);
- if (dev) {
- raw_disable_allfilters(dev_net(dev), dev, sk);
- dev_put(dev);
- }
- } else
- raw_disable_allfilters(sock_net(sk), NULL, sk);
+ if (ro->dev) {
+ raw_disable_allfilters(dev_net(ro->dev), ro->dev, sk);
+ netdev_put(ro->dev, &ro->dev_tracker);
+ } else {
+ raw_disable_allfilters(net, NULL, sk);
+ }
}
if (ro->count > 1)
kfree(ro->filter);
ro->ifindex = 0;
- ro->bound = 0;
- ro->count = 0;
+ ro->bound = 0;
+ ro->dev = NULL;
+ ro->count = 0;
free_percpu(ro->uniq);
sock_orphan(sk);
sock->sk = NULL;
release_sock(sk);
+ rtnl_unlock();
+
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
sock_put(sk);
return 0;
}
-static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len)
+static int raw_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len)
{
struct sockaddr_can *addr = (struct sockaddr_can *)uaddr;
struct sock *sk = sock->sk;
struct raw_sock *ro = raw_sk(sk);
+ struct net_device *dev = NULL;
int ifindex;
int err = 0;
int notify_enetdown = 0;
- if (len < sizeof(*addr))
+ if (len < RAW_MIN_NAMELEN)
return -EINVAL;
if (addr->can_family != AF_CAN)
return -EINVAL;
+ rtnl_lock();
lock_sock(sk);
if (ro->bound && addr->can_ifindex == ro->ifindex)
goto out;
if (addr->can_ifindex) {
- struct net_device *dev;
-
dev = dev_get_by_index(sock_net(sk), addr->can_ifindex);
if (!dev) {
err = -ENODEV;
goto out;
}
if (dev->type != ARPHRD_CAN) {
- dev_put(dev);
err = -ENODEV;
- goto out;
+ goto out_put_dev;
}
+
if (!(dev->flags & IFF_UP))
notify_enetdown = 1;
@@ -429,7 +488,9 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len)
/* filters set by default/setsockopt */
err = raw_enable_allfilters(sock_net(sk), dev, sk);
- dev_put(dev);
+ if (err)
+ goto out_put_dev;
+
} else {
ifindex = 0;
@@ -440,30 +501,34 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len)
if (!err) {
if (ro->bound) {
/* unregister old filters */
- if (ro->ifindex) {
- struct net_device *dev;
-
- dev = dev_get_by_index(sock_net(sk),
- ro->ifindex);
- if (dev) {
- raw_disable_allfilters(dev_net(dev),
- dev, sk);
- dev_put(dev);
- }
- } else
+ if (ro->dev) {
+ raw_disable_allfilters(dev_net(ro->dev),
+ ro->dev, sk);
+ /* drop reference to old ro->dev */
+ netdev_put(ro->dev, &ro->dev_tracker);
+ } else {
raw_disable_allfilters(sock_net(sk), NULL, sk);
+ }
}
ro->ifindex = ifindex;
ro->bound = 1;
+ /* bind() ok -> hold a reference for new ro->dev */
+ ro->dev = dev;
+ if (ro->dev)
+ netdev_hold(ro->dev, &ro->dev_tracker, GFP_KERNEL);
}
- out:
+out_put_dev:
+ /* remove potential reference from dev_get_by_index() */
+ dev_put(dev);
+out:
release_sock(sk);
+ rtnl_unlock();
if (notify_enetdown) {
sk->sk_err = ENETDOWN;
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_error_report(sk);
+ sk_error_report(sk);
}
return err;
@@ -479,15 +544,15 @@ static int raw_getname(struct socket *sock, struct sockaddr *uaddr,
if (peer)
return -EOPNOTSUPP;
- memset(addr, 0, sizeof(*addr));
+ memset(addr, 0, RAW_MIN_NAMELEN);
addr->can_family = AF_CAN;
addr->can_ifindex = ro->ifindex;
- return sizeof(*addr);
+ return RAW_MIN_NAMELEN;
}
static int raw_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct raw_sock *ro = raw_sk(sk);
@@ -496,13 +561,13 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
struct net_device *dev = NULL;
can_err_mask_t err_mask = 0;
int count = 0;
+ int flag;
int err = 0;
if (level != SOL_CAN_RAW)
return -EINVAL;
switch (optname) {
-
case CAN_RAW_FILTER:
if (optlen % sizeof(struct can_filter) != 0)
return -EINVAL;
@@ -514,18 +579,26 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
if (count > 1) {
/* filter does not fit into dfilter => alloc space */
- filter = memdup_user(optval, optlen);
+ filter = memdup_sockptr(optval, optlen);
if (IS_ERR(filter))
return PTR_ERR(filter);
} else if (count == 1) {
- if (copy_from_user(&sfilter, optval, sizeof(sfilter)))
+ if (copy_from_sockptr(&sfilter, optval, sizeof(sfilter)))
return -EFAULT;
}
+ rtnl_lock();
lock_sock(sk);
- if (ro->bound && ro->ifindex)
- dev = dev_get_by_index(sock_net(sk), ro->ifindex);
+ dev = ro->dev;
+ if (ro->bound && dev) {
+ if (dev->reg_state != NETREG_REGISTERED) {
+ if (count > 1)
+ kfree(filter);
+ err = -ENODEV;
+ goto out_fil;
+ }
+ }
if (ro->bound) {
/* (try to) register the new filters */
@@ -560,10 +633,8 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
ro->count = count;
out_fil:
- if (dev)
- dev_put(dev);
-
release_sock(sk);
+ rtnl_unlock();
break;
@@ -571,15 +642,21 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
if (optlen != sizeof(err_mask))
return -EINVAL;
- if (copy_from_user(&err_mask, optval, optlen))
+ if (copy_from_sockptr(&err_mask, optval, optlen))
return -EFAULT;
err_mask &= CAN_ERR_MASK;
+ rtnl_lock();
lock_sock(sk);
- if (ro->bound && ro->ifindex)
- dev = dev_get_by_index(sock_net(sk), ro->ifindex);
+ dev = ro->dev;
+ if (ro->bound && dev) {
+ if (dev->reg_state != NETREG_REGISTERED) {
+ err = -ENODEV;
+ goto out_err;
+ }
+ }
/* remove current error mask */
if (ro->bound) {
@@ -599,47 +676,80 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
ro->err_mask = err_mask;
out_err:
- if (dev)
- dev_put(dev);
-
release_sock(sk);
+ rtnl_unlock();
break;
case CAN_RAW_LOOPBACK:
- if (optlen != sizeof(ro->loopback))
+ if (optlen != sizeof(flag))
return -EINVAL;
- if (copy_from_user(&ro->loopback, optval, optlen))
+ if (copy_from_sockptr(&flag, optval, optlen))
return -EFAULT;
+ ro->loopback = !!flag;
break;
case CAN_RAW_RECV_OWN_MSGS:
- if (optlen != sizeof(ro->recv_own_msgs))
+ if (optlen != sizeof(flag))
return -EINVAL;
- if (copy_from_user(&ro->recv_own_msgs, optval, optlen))
+ if (copy_from_sockptr(&flag, optval, optlen))
return -EFAULT;
+ ro->recv_own_msgs = !!flag;
break;
case CAN_RAW_FD_FRAMES:
- if (optlen != sizeof(ro->fd_frames))
+ if (optlen != sizeof(flag))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&flag, optval, optlen))
+ return -EFAULT;
+
+ /* Enabling CAN XL includes CAN FD */
+ if (ro->xl_frames && !flag)
+ return -EINVAL;
+
+ ro->fd_frames = !!flag;
+ break;
+
+ case CAN_RAW_XL_FRAMES:
+ if (optlen != sizeof(flag))
return -EINVAL;
- if (copy_from_user(&ro->fd_frames, optval, optlen))
+ if (copy_from_sockptr(&flag, optval, optlen))
return -EFAULT;
+ ro->xl_frames = !!flag;
+
+ /* Enabling CAN XL includes CAN FD */
+ if (ro->xl_frames)
+ ro->fd_frames = ro->xl_frames;
+ break;
+
+ case CAN_RAW_XL_VCID_OPTS:
+ if (optlen != sizeof(ro->raw_vcid_opts))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&ro->raw_vcid_opts, optval, optlen))
+ return -EFAULT;
+
+ /* prepare 32 bit values for handling in hot path */
+ ro->tx_vcid_shifted = ro->raw_vcid_opts.tx_vcid << CANXL_VCID_OFFSET;
+ ro->rx_vcid_shifted = ro->raw_vcid_opts.rx_vcid << CANXL_VCID_OFFSET;
+ ro->rx_vcid_mask_shifted = ro->raw_vcid_opts.rx_vcid_mask << CANXL_VCID_OFFSET;
break;
case CAN_RAW_JOIN_FILTERS:
- if (optlen != sizeof(ro->join_filters))
+ if (optlen != sizeof(flag))
return -EINVAL;
- if (copy_from_user(&ro->join_filters, optval, optlen))
+ if (copy_from_sockptr(&flag, optval, optlen))
return -EFAULT;
+ ro->join_filters = !!flag;
break;
default:
@@ -653,9 +763,9 @@ static int raw_getsockopt(struct socket *sock, int level, int optname,
{
struct sock *sk = sock->sk;
struct raw_sock *ro = raw_sk(sk);
+ int flag;
int len;
void *val;
- int err = 0;
if (level != SOL_CAN_RAW)
return -EINVAL;
@@ -665,23 +775,34 @@ static int raw_getsockopt(struct socket *sock, int level, int optname,
return -EINVAL;
switch (optname) {
+ case CAN_RAW_FILTER: {
+ int err = 0;
- case CAN_RAW_FILTER:
lock_sock(sk);
if (ro->count > 0) {
int fsize = ro->count * sizeof(struct can_filter);
- if (len > fsize)
- len = fsize;
- if (copy_to_user(optval, ro->filter, len))
- err = -EFAULT;
- } else
+
+ /* user space buffer too small for filter list? */
+ if (len < fsize) {
+ /* return -ERANGE and needed space in optlen */
+ err = -ERANGE;
+ if (put_user(fsize, optlen))
+ err = -EFAULT;
+ } else {
+ if (len > fsize)
+ len = fsize;
+ if (copy_to_user(optval, ro->filter, len))
+ err = -EFAULT;
+ }
+ } else {
len = 0;
+ }
release_sock(sk);
if (!err)
err = put_user(len, optlen);
return err;
-
+ }
case CAN_RAW_ERR_FILTER:
if (len > sizeof(can_err_mask_t))
len = sizeof(can_err_mask_t);
@@ -691,25 +812,55 @@ static int raw_getsockopt(struct socket *sock, int level, int optname,
case CAN_RAW_LOOPBACK:
if (len > sizeof(int))
len = sizeof(int);
- val = &ro->loopback;
+ flag = ro->loopback;
+ val = &flag;
break;
case CAN_RAW_RECV_OWN_MSGS:
if (len > sizeof(int))
len = sizeof(int);
- val = &ro->recv_own_msgs;
+ flag = ro->recv_own_msgs;
+ val = &flag;
break;
case CAN_RAW_FD_FRAMES:
if (len > sizeof(int))
len = sizeof(int);
- val = &ro->fd_frames;
+ flag = ro->fd_frames;
+ val = &flag;
+ break;
+
+ case CAN_RAW_XL_FRAMES:
+ if (len > sizeof(int))
+ len = sizeof(int);
+ flag = ro->xl_frames;
+ val = &flag;
break;
+ case CAN_RAW_XL_VCID_OPTS: {
+ int err = 0;
+
+ /* user space buffer too small for VCID opts? */
+ if (len < sizeof(ro->raw_vcid_opts)) {
+ /* return -ERANGE and needed space in optlen */
+ err = -ERANGE;
+ if (put_user(sizeof(ro->raw_vcid_opts), optlen))
+ err = -EFAULT;
+ } else {
+ if (len > sizeof(ro->raw_vcid_opts))
+ len = sizeof(ro->raw_vcid_opts);
+ if (copy_to_user(optval, &ro->raw_vcid_opts, len))
+ err = -EFAULT;
+ }
+ if (!err)
+ err = put_user(len, optlen);
+ return err;
+ }
case CAN_RAW_JOIN_FILTERS:
if (len > sizeof(int))
len = sizeof(int);
- val = &ro->join_filters;
+ flag = ro->join_filters;
+ val = &flag;
break;
default:
@@ -723,34 +874,108 @@ static int raw_getsockopt(struct socket *sock, int level, int optname,
return 0;
}
+static void raw_put_canxl_vcid(struct raw_sock *ro, struct sk_buff *skb)
+{
+ struct canxl_frame *cxl = (struct canxl_frame *)skb->data;
+
+ /* sanitize non CAN XL bits */
+ cxl->prio &= (CANXL_PRIO_MASK | CANXL_VCID_MASK);
+
+ /* clear VCID in CAN XL frame if pass through is disabled */
+ if (!(ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_PASS))
+ cxl->prio &= CANXL_PRIO_MASK;
+
+ /* set VCID in CAN XL frame if enabled */
+ if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_SET) {
+ cxl->prio &= CANXL_PRIO_MASK;
+ cxl->prio |= ro->tx_vcid_shifted;
+ }
+}
+
+static inline bool raw_dev_cc_enabled(struct net_device *dev,
+ struct can_priv *priv)
+{
+ /* The CANXL-only mode disables error-signalling on the CAN bus
+ * which is needed to send CAN CC/FD frames
+ */
+ if (priv)
+ return !can_dev_in_xl_only_mode(priv);
+
+ /* virtual CAN interfaces always support CAN CC */
+ return true;
+}
+
+static inline bool raw_dev_fd_enabled(struct net_device *dev,
+ struct can_priv *priv)
+{
+ /* check FD ctrlmode on real CAN interfaces */
+ if (priv)
+ return (priv->ctrlmode & CAN_CTRLMODE_FD);
+
+ /* check MTU for virtual CAN FD interfaces */
+ return (READ_ONCE(dev->mtu) >= CANFD_MTU);
+}
+
+static inline bool raw_dev_xl_enabled(struct net_device *dev,
+ struct can_priv *priv)
+{
+ /* check XL ctrlmode on real CAN interfaces */
+ if (priv)
+ return (priv->ctrlmode & CAN_CTRLMODE_XL);
+
+ /* check MTU for virtual CAN XL interfaces */
+ return can_is_canxl_dev_mtu(READ_ONCE(dev->mtu));
+}
+
+static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct can_priv *priv = safe_candev_priv(dev);
+
+ /* Classical CAN */
+ if (can_is_can_skb(skb) && raw_dev_cc_enabled(dev, priv))
+ return CAN_MTU;
+
+ /* CAN FD */
+ if (ro->fd_frames && can_is_canfd_skb(skb) &&
+ raw_dev_fd_enabled(dev, priv))
+ return CANFD_MTU;
+
+ /* CAN XL */
+ if (ro->xl_frames && can_is_canxl_skb(skb) &&
+ raw_dev_xl_enabled(dev, priv))
+ return CANXL_MTU;
+
+ return 0;
+}
+
static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
struct sock *sk = sock->sk;
struct raw_sock *ro = raw_sk(sk);
+ struct sockcm_cookie sockc;
struct sk_buff *skb;
struct net_device *dev;
+ unsigned int txmtu;
int ifindex;
- int err;
+ int err = -EINVAL;
+
+ /* check for valid CAN frame sizes */
+ if (size < CANXL_HDR_SIZE + CANXL_MIN_DLEN || size > CANXL_MTU)
+ return -EINVAL;
if (msg->msg_name) {
DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name);
- if (msg->msg_namelen < sizeof(*addr))
+ if (msg->msg_namelen < RAW_MIN_NAMELEN)
return -EINVAL;
if (addr->can_family != AF_CAN)
return -EINVAL;
ifindex = addr->can_ifindex;
- } else
- ifindex = ro->ifindex;
-
- if (ro->fd_frames) {
- if (unlikely(size != CANFD_MTU && size != CAN_MTU))
- return -EINVAL;
} else {
- if (unlikely(size != CAN_MTU))
- return -EINVAL;
+ ifindex = ro->ifindex;
}
dev = dev_get_by_index(sock_net(sk), ifindex);
@@ -766,15 +991,35 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
can_skb_prv(skb)->ifindex = dev->ifindex;
can_skb_prv(skb)->skbcnt = 0;
+ /* fill the skb before testing for valid CAN frames */
err = memcpy_from_msg(skb_put(skb, size), msg, size);
if (err < 0)
goto free_skb;
- sock_tx_timestamp(sk, sk->sk_tsflags, &skb_shinfo(skb)->tx_flags);
+ err = -EINVAL;
+
+ /* check for valid CAN (CC/FD/XL) frame content */
+ txmtu = raw_check_txframe(ro, skb, dev);
+ if (!txmtu)
+ goto free_skb;
+
+ /* only CANXL: clear/forward/set VCID value */
+ if (txmtu == CANXL_MTU)
+ raw_put_canxl_vcid(ro, skb);
+
+ sockcm_init(&sockc, sk);
+ if (msg->msg_controllen) {
+ err = sock_cmsg_send(sk, msg, &sockc);
+ if (unlikely(err))
+ goto free_skb;
+ }
skb->dev = dev;
- skb->sk = sk;
- skb->priority = sk->sk_priority;
+ skb->priority = sockc.priority;
+ skb->mark = sockc.mark;
+ skb->tstamp = sockc.transmit_time;
+
+ skb_setup_tx_timestamp(skb, &sockc);
err = can_send(skb, ro->loopback);
@@ -799,12 +1044,12 @@ static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
struct sock *sk = sock->sk;
struct sk_buff *skb;
int err = 0;
- int noblock;
- noblock = flags & MSG_DONTWAIT;
- flags &= ~MSG_DONTWAIT;
+ if (flags & MSG_ERRQUEUE)
+ return sock_recv_errqueue(sk, msg, size,
+ SOL_CAN_RAW, SCM_CAN_RAW_ERRQUEUE);
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
return err;
@@ -819,11 +1064,11 @@ static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
return err;
}
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
if (msg->msg_name) {
- __sockaddr_check_size(sizeof(struct sockaddr_can));
- msg->msg_namelen = sizeof(struct sockaddr_can);
+ __sockaddr_check_size(RAW_MIN_NAMELEN);
+ msg->msg_namelen = RAW_MIN_NAMELEN;
memcpy(msg->msg_name, skb->cb, msg->msg_namelen);
}
@@ -835,6 +1080,13 @@ static int raw_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
return size;
}
+static int raw_sock_no_ioctlcmd(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ /* no ioctls for socket layer -> hand it down to NIC layer */
+ return -ENOIOCTLCMD;
+}
+
static const struct proto_ops raw_ops = {
.family = PF_CAN,
.release = raw_release,
@@ -844,7 +1096,8 @@ static const struct proto_ops raw_ops = {
.accept = sock_no_accept,
.getname = raw_getname,
.poll = datagram_poll,
- .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */
+ .ioctl = raw_sock_no_ioctlcmd,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = raw_setsockopt,
@@ -852,7 +1105,6 @@ static const struct proto_ops raw_ops = {
.sendmsg = raw_sendmsg,
.recvmsg = raw_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
};
static struct proto raw_proto __read_mostly = {
@@ -869,22 +1121,37 @@ static const struct can_proto raw_can_proto = {
.prot = &raw_proto,
};
+static struct notifier_block canraw_notifier = {
+ .notifier_call = raw_notifier
+};
+
static __init int raw_module_init(void)
{
int err;
- pr_info("can: raw protocol (rev " CAN_RAW_VERSION ")\n");
+ pr_info("can: raw protocol\n");
+
+ err = register_netdevice_notifier(&canraw_notifier);
+ if (err)
+ return err;
err = can_proto_register(&raw_can_proto);
- if (err < 0)
- printk(KERN_ERR "can: registration of raw protocol failed\n");
+ if (err < 0) {
+ pr_err("can: registration of raw protocol failed\n");
+ goto register_proto_failed;
+ }
+
+ return 0;
+register_proto_failed:
+ unregister_netdevice_notifier(&canraw_notifier);
return err;
}
static __exit void raw_module_exit(void)
{
can_proto_unregister(&raw_can_proto);
+ unregister_netdevice_notifier(&canraw_notifier);
}
module_init(raw_module_init);
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
index cd2d5b9301a1..ea60e3ef0834 100644
--- a/net/ceph/Kconfig
+++ b/net/ceph/Kconfig
@@ -1,9 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
config CEPH_LIB
tristate "Ceph core library"
depends on INET
- select LIBCRC32C
+ select CRC32
select CRYPTO_AES
select CRYPTO_CBC
+ select CRYPTO_GCM
+ select CRYPTO_LIB_SHA256
select CRYPTO
select KEYS
default n
@@ -12,7 +15,7 @@ config CEPH_LIB
common functionality to both the Ceph filesystem and
to the rados block device (rbd).
- More information at http://ceph.newdream.net/.
+ More information at https://ceph.io/.
If unsure, say N.
@@ -38,6 +41,6 @@ config CEPH_LIB_USE_DNS_RESOLVER
be resolved using the CONFIG_DNS_RESOLVER facility.
For information on how to use CONFIG_DNS_RESOLVER consult
- Documentation/networking/dns_resolver.txt
+ Documentation/networking/dns_resolver.rst
If unsure, say N.
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index db09defe27d0..8802a0c0155d 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_CEPH_LIB) += libceph.o
libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
- mon_client.o \
+ mon_client.o decode.o \
cls_lock_client.o \
osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
striper.o \
@@ -13,5 +13,6 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
auth.o auth_none.o \
crypto.o armor.o \
auth_x.o \
- ceph_fs.o ceph_strings.o ceph_hash.o \
- pagevec.o snapshot.o string_table.o
+ ceph_strings.o ceph_hash.o \
+ pagevec.o snapshot.o string_table.o \
+ messenger_v1.o messenger_v2.o
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index fbeee068ea14..d38c9eadbe2f 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -21,32 +21,47 @@ static u32 supported_protocols[] = {
CEPH_AUTH_CEPHX
};
-static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
+static int init_protocol(struct ceph_auth_client *ac, int proto)
{
- switch (protocol) {
+ dout("%s proto %d\n", __func__, proto);
+
+ switch (proto) {
case CEPH_AUTH_NONE:
return ceph_auth_none_init(ac);
case CEPH_AUTH_CEPHX:
return ceph_x_init(ac);
default:
- return -ENOENT;
+ pr_err("bad auth protocol %d\n", proto);
+ return -EINVAL;
}
}
+void ceph_auth_set_global_id(struct ceph_auth_client *ac, u64 global_id)
+{
+ dout("%s global_id %llu\n", __func__, global_id);
+
+ if (!global_id)
+ pr_err("got zero global_id\n");
+
+ if (ac->global_id && global_id != ac->global_id)
+ pr_err("global_id changed from %llu to %llu\n", ac->global_id,
+ global_id);
+
+ ac->global_id = global_id;
+}
+
/*
* setup, teardown.
*/
-struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_crypto_key *key)
+struct ceph_auth_client *ceph_auth_init(const char *name,
+ const struct ceph_crypto_key *key,
+ const int *con_modes)
{
struct ceph_auth_client *ac;
- int ret;
- dout("auth_init name '%s'\n", name);
-
- ret = -ENOMEM;
ac = kzalloc(sizeof(*ac), GFP_NOFS);
if (!ac)
- goto out;
+ return ERR_PTR(-ENOMEM);
mutex_init(&ac->mutex);
ac->negotiating = true;
@@ -54,12 +69,13 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp
ac->name = name;
else
ac->name = CEPH_AUTH_NAME_DEFAULT;
- dout("auth_init name %s\n", ac->name);
ac->key = key;
- return ac;
+ ac->preferred_mode = con_modes[0];
+ ac->fallback_mode = con_modes[1];
-out:
- return ERR_PTR(ret);
+ dout("%s name '%s' preferred_mode %d fallback_mode %d\n", __func__,
+ ac->name, ac->preferred_mode, ac->fallback_mode);
+ return ac;
}
void ceph_auth_destroy(struct ceph_auth_client *ac)
@@ -145,31 +161,35 @@ bad:
goto out;
}
-static int ceph_build_auth_request(struct ceph_auth_client *ac,
- void *msg_buf, size_t msg_len)
+static int build_request(struct ceph_auth_client *ac, bool add_header,
+ void *buf, int buf_len)
{
- struct ceph_mon_request_header *monhdr = msg_buf;
- void *p = monhdr + 1;
- void *end = msg_buf + msg_len;
+ void *end = buf + buf_len;
+ void *p;
int ret;
- monhdr->have_version = 0;
- monhdr->session_mon = cpu_to_le16(-1);
- monhdr->session_mon_tid = 0;
-
- ceph_encode_32(&p, ac->protocol);
+ p = buf;
+ if (add_header) {
+ /* struct ceph_mon_request_header + protocol */
+ ceph_encode_64_safe(&p, end, 0, e_range);
+ ceph_encode_16_safe(&p, end, -1, e_range);
+ ceph_encode_64_safe(&p, end, 0, e_range);
+ ceph_encode_32_safe(&p, end, ac->protocol, e_range);
+ }
+ ceph_encode_need(&p, end, sizeof(u32), e_range);
ret = ac->ops->build_request(ac, p + sizeof(u32), end);
if (ret < 0) {
- pr_err("error %d building auth method %s request\n", ret,
- ac->ops->name);
- goto out;
+ pr_err("auth protocol '%s' building request failed: %d\n",
+ ceph_auth_proto_name(ac->protocol), ret);
+ return ret;
}
dout(" built request %d bytes\n", ret);
ceph_encode_32(&p, ret);
- ret = p + ret - msg_buf;
-out:
- return ret;
+ return p + ret - buf;
+
+e_range:
+ return -ERANGE;
}
/*
@@ -211,11 +231,6 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
payload_end = payload + payload_len;
- if (global_id && ac->global_id != global_id) {
- dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
- ac->global_id = global_id;
- }
-
if (ac->negotiating) {
/* server does not support our protocols? */
if (!protocol && result < 0) {
@@ -229,10 +244,10 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
ac->ops = NULL;
}
if (ac->protocol != protocol) {
- ret = ceph_auth_init_protocol(ac, protocol);
+ ret = init_protocol(ac, protocol);
if (ret) {
- pr_err("error %d on auth protocol %d init\n",
- ret, protocol);
+ pr_err("auth protocol '%s' init failed: %d\n",
+ ceph_auth_proto_name(protocol), ret);
goto out;
}
}
@@ -240,11 +255,20 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
ac->negotiating = false;
}
- ret = ac->ops->handle_reply(ac, result, payload, payload_end);
+ if (result) {
+ pr_err("auth protocol '%s' mauth authentication failed: %d\n",
+ ceph_auth_proto_name(ac->protocol), result);
+ ret = result;
+ goto out;
+ }
+
+ ret = ac->ops->handle_reply(ac, global_id, payload, payload_end,
+ NULL, NULL, NULL, NULL);
if (ret == -EAGAIN) {
- ret = ceph_build_auth_request(ac, reply_buf, reply_len);
+ ret = build_request(ac, true, reply_buf, reply_len);
+ goto out;
} else if (ret) {
- pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
+ goto out;
}
out:
@@ -264,7 +288,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
mutex_lock(&ac->mutex);
if (ac->ops->should_authenticate(ac))
- ret = ceph_build_auth_request(ac, msg_buf, msg_len);
+ ret = build_request(ac, true, msg_buf, msg_len);
mutex_unlock(&ac->mutex);
return ret;
}
@@ -281,19 +305,38 @@ int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
}
EXPORT_SYMBOL(ceph_auth_is_authenticated);
-int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
- int peer_type,
- struct ceph_auth_handshake *auth)
+int __ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ int peer_type, bool force_new,
+ int *proto, int *pref_mode, int *fallb_mode)
{
- int ret = 0;
+ int ret;
mutex_lock(&ac->mutex);
- if (ac->ops && ac->ops->create_authorizer)
+ if (force_new && auth->authorizer) {
+ ceph_auth_destroy_authorizer(auth->authorizer);
+ auth->authorizer = NULL;
+ }
+ if (!auth->authorizer)
ret = ac->ops->create_authorizer(ac, peer_type, auth);
+ else if (ac->ops->update_authorizer)
+ ret = ac->ops->update_authorizer(ac, peer_type, auth);
+ else
+ ret = 0;
+ if (ret)
+ goto out;
+
+ *proto = ac->protocol;
+ if (pref_mode && fallb_mode) {
+ *pref_mode = ac->preferred_mode;
+ *fallb_mode = ac->fallback_mode;
+ }
+
+out:
mutex_unlock(&ac->mutex);
return ret;
}
-EXPORT_SYMBOL(ceph_auth_create_authorizer);
+EXPORT_SYMBOL(__ceph_auth_get_authorizer);
void ceph_auth_destroy_authorizer(struct ceph_authorizer *a)
{
@@ -301,20 +344,6 @@ void ceph_auth_destroy_authorizer(struct ceph_authorizer *a)
}
EXPORT_SYMBOL(ceph_auth_destroy_authorizer);
-int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
- int peer_type,
- struct ceph_auth_handshake *a)
-{
- int ret = 0;
-
- mutex_lock(&ac->mutex);
- if (ac->ops && ac->ops->update_authorizer)
- ret = ac->ops->update_authorizer(ac, peer_type, a);
- mutex_unlock(&ac->mutex);
- return ret;
-}
-EXPORT_SYMBOL(ceph_auth_update_authorizer);
-
int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac,
struct ceph_authorizer *a,
void *challenge_buf,
@@ -332,13 +361,18 @@ int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac,
EXPORT_SYMBOL(ceph_auth_add_authorizer_challenge);
int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
- struct ceph_authorizer *a)
+ struct ceph_authorizer *a,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
{
int ret = 0;
mutex_lock(&ac->mutex);
if (ac->ops && ac->ops->verify_authorizer_reply)
- ret = ac->ops->verify_authorizer_reply(ac, a);
+ ret = ac->ops->verify_authorizer_reply(ac, a,
+ reply, reply_len, session_key, session_key_len,
+ con_secret, con_secret_len);
mutex_unlock(&ac->mutex);
return ret;
}
@@ -352,3 +386,274 @@ void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type)
mutex_unlock(&ac->mutex);
}
EXPORT_SYMBOL(ceph_auth_invalidate_authorizer);
+
+/*
+ * msgr2 authentication
+ */
+
+static bool contains(const int *arr, int cnt, int val)
+{
+ int i;
+
+ for (i = 0; i < cnt; i++) {
+ if (arr[i] == val)
+ return true;
+ }
+
+ return false;
+}
+
+static int encode_con_modes(void **p, void *end, int pref_mode, int fallb_mode)
+{
+ WARN_ON(pref_mode == CEPH_CON_MODE_UNKNOWN);
+ if (fallb_mode != CEPH_CON_MODE_UNKNOWN) {
+ ceph_encode_32_safe(p, end, 2, e_range);
+ ceph_encode_32_safe(p, end, pref_mode, e_range);
+ ceph_encode_32_safe(p, end, fallb_mode, e_range);
+ } else {
+ ceph_encode_32_safe(p, end, 1, e_range);
+ ceph_encode_32_safe(p, end, pref_mode, e_range);
+ }
+
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+
+/*
+ * Similar to ceph_auth_build_hello().
+ */
+int ceph_auth_get_request(struct ceph_auth_client *ac, void *buf, int buf_len)
+{
+ int proto = ac->key ? CEPH_AUTH_CEPHX : CEPH_AUTH_NONE;
+ void *end = buf + buf_len;
+ void *lenp;
+ void *p;
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ if (ac->protocol == CEPH_AUTH_UNKNOWN) {
+ ret = init_protocol(ac, proto);
+ if (ret) {
+ pr_err("auth protocol '%s' init failed: %d\n",
+ ceph_auth_proto_name(proto), ret);
+ goto out;
+ }
+ } else {
+ WARN_ON(ac->protocol != proto);
+ ac->ops->reset(ac);
+ }
+
+ p = buf;
+ ceph_encode_32_safe(&p, end, ac->protocol, e_range);
+ ret = encode_con_modes(&p, end, ac->preferred_mode, ac->fallback_mode);
+ if (ret)
+ goto out;
+
+ lenp = p;
+ p += 4; /* space for len */
+
+ ceph_encode_8_safe(&p, end, CEPH_AUTH_MODE_MON, e_range);
+ ret = ceph_auth_entity_name_encode(ac->name, &p, end);
+ if (ret)
+ goto out;
+
+ ceph_encode_64_safe(&p, end, ac->global_id, e_range);
+ ceph_encode_32(&lenp, p - lenp - 4);
+ ret = p - buf;
+
+out:
+ mutex_unlock(&ac->mutex);
+ return ret;
+
+e_range:
+ ret = -ERANGE;
+ goto out;
+}
+
+int ceph_auth_handle_reply_more(struct ceph_auth_client *ac, void *reply,
+ int reply_len, void *buf, int buf_len)
+{
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ ret = ac->ops->handle_reply(ac, 0, reply, reply + reply_len,
+ NULL, NULL, NULL, NULL);
+ if (ret == -EAGAIN)
+ ret = build_request(ac, false, buf, buf_len);
+ else
+ WARN_ON(ret >= 0);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+
+int ceph_auth_handle_reply_done(struct ceph_auth_client *ac,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ int ret;
+
+ mutex_lock(&ac->mutex);
+ ret = ac->ops->handle_reply(ac, global_id, reply, reply + reply_len,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
+ WARN_ON(ret == -EAGAIN || ret > 0);
+ mutex_unlock(&ac->mutex);
+ return ret;
+}
+
+bool ceph_auth_handle_bad_method(struct ceph_auth_client *ac,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ mutex_lock(&ac->mutex);
+ WARN_ON(used_proto != ac->protocol);
+
+ if (result == -EOPNOTSUPP) {
+ if (!contains(allowed_protos, proto_cnt, ac->protocol)) {
+ pr_err("auth protocol '%s' not allowed\n",
+ ceph_auth_proto_name(ac->protocol));
+ goto not_allowed;
+ }
+ if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) &&
+ (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN ||
+ !contains(allowed_modes, mode_cnt, ac->fallback_mode))) {
+ pr_err("preferred mode '%s' not allowed\n",
+ ceph_con_mode_name(ac->preferred_mode));
+ if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN)
+ pr_err("no fallback mode\n");
+ else
+ pr_err("fallback mode '%s' not allowed\n",
+ ceph_con_mode_name(ac->fallback_mode));
+ goto not_allowed;
+ }
+ }
+
+ WARN_ON(result == -EOPNOTSUPP || result >= 0);
+ pr_err("auth protocol '%s' msgr authentication failed: %d\n",
+ ceph_auth_proto_name(ac->protocol), result);
+
+ mutex_unlock(&ac->mutex);
+ return true;
+
+not_allowed:
+ mutex_unlock(&ac->mutex);
+ return false;
+}
+
+int ceph_auth_get_authorizer(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ int peer_type, void *buf, int *buf_len)
+{
+ void *end = buf + *buf_len;
+ int pref_mode, fallb_mode;
+ int proto;
+ void *p;
+ int ret;
+
+ ret = __ceph_auth_get_authorizer(ac, auth, peer_type, true, &proto,
+ &pref_mode, &fallb_mode);
+ if (ret)
+ return ret;
+
+ p = buf;
+ ceph_encode_32_safe(&p, end, proto, e_range);
+ ret = encode_con_modes(&p, end, pref_mode, fallb_mode);
+ if (ret)
+ return ret;
+
+ ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range);
+ *buf_len = p - buf;
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+EXPORT_SYMBOL(ceph_auth_get_authorizer);
+
+int ceph_auth_handle_svc_reply_more(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ void *reply, int reply_len,
+ void *buf, int *buf_len)
+{
+ void *end = buf + *buf_len;
+ void *p;
+ int ret;
+
+ ret = ceph_auth_add_authorizer_challenge(ac, auth->authorizer,
+ reply, reply_len);
+ if (ret)
+ return ret;
+
+ p = buf;
+ ceph_encode_32_safe(&p, end, auth->authorizer_buf_len, e_range);
+ *buf_len = p - buf;
+ return 0;
+
+e_range:
+ return -ERANGE;
+}
+EXPORT_SYMBOL(ceph_auth_handle_svc_reply_more);
+
+int ceph_auth_handle_svc_reply_done(struct ceph_auth_client *ac,
+ struct ceph_auth_handshake *auth,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
+ reply, reply_len, session_key, session_key_len,
+ con_secret, con_secret_len);
+}
+EXPORT_SYMBOL(ceph_auth_handle_svc_reply_done);
+
+bool ceph_auth_handle_bad_authorizer(struct ceph_auth_client *ac,
+ int peer_type, int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ mutex_lock(&ac->mutex);
+ WARN_ON(used_proto != ac->protocol);
+
+ if (result == -EOPNOTSUPP) {
+ if (!contains(allowed_protos, proto_cnt, ac->protocol)) {
+ pr_err("auth protocol '%s' not allowed by %s\n",
+ ceph_auth_proto_name(ac->protocol),
+ ceph_entity_type_name(peer_type));
+ goto not_allowed;
+ }
+ if (!contains(allowed_modes, mode_cnt, ac->preferred_mode) &&
+ (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN ||
+ !contains(allowed_modes, mode_cnt, ac->fallback_mode))) {
+ pr_err("preferred mode '%s' not allowed by %s\n",
+ ceph_con_mode_name(ac->preferred_mode),
+ ceph_entity_type_name(peer_type));
+ if (ac->fallback_mode == CEPH_CON_MODE_UNKNOWN)
+ pr_err("no fallback mode\n");
+ else
+ pr_err("fallback mode '%s' not allowed by %s\n",
+ ceph_con_mode_name(ac->fallback_mode),
+ ceph_entity_type_name(peer_type));
+ goto not_allowed;
+ }
+ }
+
+ WARN_ON(result == -EOPNOTSUPP || result >= 0);
+ pr_err("auth protocol '%s' authorization to %s failed: %d\n",
+ ceph_auth_proto_name(ac->protocol),
+ ceph_entity_type_name(peer_type), result);
+
+ if (ac->ops->invalidate_authorizer)
+ ac->ops->invalidate_authorizer(ac, peer_type);
+
+ mutex_unlock(&ac->mutex);
+ return true;
+
+not_allowed:
+ mutex_unlock(&ac->mutex);
+ return false;
+}
+EXPORT_SYMBOL(ceph_auth_handle_bad_authorizer);
diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c
index edb7042479ed..77b5519bc45f 100644
--- a/net/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -69,13 +69,16 @@ static int build_request(struct ceph_auth_client *ac, void *buf, void *end)
* the generic auth code decode the global_id, and we carry no actual
* authenticate state, so nothing happens here.
*/
-static int handle_reply(struct ceph_auth_client *ac, int result,
- void *buf, void *end)
+static int handle_reply(struct ceph_auth_client *ac, u64 global_id,
+ void *buf, void *end, u8 *session_key,
+ int *session_key_len, u8 *con_secret,
+ int *con_secret_len)
{
struct ceph_auth_none_info *xi = ac->private;
xi->starting = false;
- return result;
+ ceph_auth_set_global_id(ac, global_id);
+ return 0;
}
static void ceph_auth_none_destroy_authorizer(struct ceph_authorizer *a)
@@ -109,14 +112,13 @@ static int ceph_auth_none_create_authorizer(
auth->authorizer = (struct ceph_authorizer *) au;
auth->authorizer_buf = au->buf;
auth->authorizer_buf_len = au->buf_len;
- auth->authorizer_reply_buf = au->reply_buf;
- auth->authorizer_reply_buf_len = sizeof (au->reply_buf);
+ auth->authorizer_reply_buf = NULL;
+ auth->authorizer_reply_buf_len = 0;
return 0;
}
static const struct ceph_auth_client_ops ceph_auth_none_ops = {
- .name = "none",
.reset = reset,
.destroy = destroy,
.is_authenticated = is_authenticated,
diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h
index 4158f064302e..bb121539e796 100644
--- a/net/ceph/auth_none.h
+++ b/net/ceph/auth_none.h
@@ -16,7 +16,6 @@ struct ceph_none_authorizer {
struct ceph_authorizer base;
char buf[128];
int buf_len;
- char reply_buf[0];
};
struct ceph_auth_none_info {
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index b52732337ca6..a21c157daf7d 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -22,12 +22,15 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
{
struct ceph_x_info *xi = ac->private;
- int need;
+ int missing;
+ int need; /* missing + need renewal */
ceph_x_validate_tickets(ac, &need);
- dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
- ac->want_keys, need, xi->have_keys);
- return (ac->want_keys & xi->have_keys) == ac->want_keys;
+ missing = ac->want_keys & ~xi->have_keys;
+ WARN_ON((need & missing) != missing);
+ dout("%s want 0x%x have 0x%x missing 0x%x -> %d\n", __func__,
+ ac->want_keys, xi->have_keys, missing, !missing);
+ return !missing;
}
static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
@@ -36,9 +39,9 @@ static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
int need;
ceph_x_validate_tickets(ac, &need);
- dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
- ac->want_keys, need, xi->have_keys);
- return need != 0;
+ dout("%s want 0x%x have 0x%x need 0x%x -> %d\n", __func__,
+ ac->want_keys, xi->have_keys, need, !!need);
+ return !!need;
}
static int ceph_x_encrypt_offset(void)
@@ -197,7 +200,7 @@ static int process_one_ticket(struct ceph_auth_client *ac,
dout(" decrypted %d bytes\n", ret);
dend = dp + ret;
- tkt_struct_v = ceph_decode_8(&dp);
+ ceph_decode_8_safe(&dp, dend, tkt_struct_v, bad);
if (tkt_struct_v != 1)
goto bad;
@@ -205,6 +208,7 @@ static int process_one_ticket(struct ceph_auth_client *ac,
if (ret)
goto out;
+ ceph_decode_need(&dp, dend, sizeof(struct ceph_timespec), bad);
ceph_decode_timespec64(&validity, dp);
dp += sizeof(struct ceph_timespec);
new_expires = ktime_get_real_seconds() + validity.tv_sec;
@@ -265,22 +269,21 @@ out:
static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
struct ceph_crypto_key *secret,
- void *buf, void *end)
+ void **p, void *end)
{
- void *p = buf;
u8 reply_struct_v;
u32 num;
int ret;
- ceph_decode_8_safe(&p, end, reply_struct_v, bad);
+ ceph_decode_8_safe(p, end, reply_struct_v, bad);
if (reply_struct_v != 1)
return -EINVAL;
- ceph_decode_32_safe(&p, end, num, bad);
+ ceph_decode_32_safe(p, end, num, bad);
dout("%d tickets\n", num);
while (num--) {
- ret = process_one_ticket(ac, secret, &p, end);
+ ret = process_one_ticket(ac, secret, p, end);
if (ret)
return ret;
}
@@ -379,6 +382,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
}
}
au->service = th->service;
+ WARN_ON(!th->secret_id);
au->secret_id = th->secret_id;
msg_a = au->buf->vec.iov_base;
@@ -442,9 +446,10 @@ static bool need_key(struct ceph_x_ticket_handler *th)
static bool have_key(struct ceph_x_ticket_handler *th)
{
- if (th->have_key) {
- if (ktime_get_real_seconds() >= th->expires)
- th->have_key = false;
+ if (th->have_key && ktime_get_real_seconds() >= th->expires) {
+ dout("ticket %d (%s) secret_id %llu expired\n", th->service,
+ ceph_entity_type_name(th->service), th->secret_id);
+ th->have_key = false;
}
return th->have_key;
@@ -486,6 +491,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
struct ceph_x_info *xi = ac->private;
int need;
struct ceph_x_request_header *head = buf;
+ void *p;
int ret;
struct ceph_x_ticket_handler *th =
get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
@@ -494,18 +500,17 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
return PTR_ERR(th);
ceph_x_validate_tickets(ac, &need);
-
- dout("build_request want %x have %x need %x\n",
- ac->want_keys, xi->have_keys, need);
+ dout("%s want 0x%x have 0x%x need 0x%x\n", __func__, ac->want_keys,
+ xi->have_keys, need);
if (need & CEPH_ENTITY_TYPE_AUTH) {
struct ceph_x_authenticate *auth = (void *)(head + 1);
- void *p = auth + 1;
void *enc_buf = xi->auth_authorizer.enc_buf;
struct ceph_x_challenge_blob *blob = enc_buf +
ceph_x_encrypt_offset();
u64 *u;
+ p = auth + 1;
if (p > end)
return -ERANGE;
@@ -521,7 +526,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
if (ret < 0)
return ret;
- auth->struct_v = 1;
+ auth->struct_v = 3; /* nautilus+ */
auth->key = 0;
for (u = (u64 *)enc_buf; u + 1 <= (u64 *)(enc_buf + ret); u++)
auth->key ^= *(__le64 *)u;
@@ -534,45 +539,144 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
if (ret < 0)
return ret;
+ /* nautilus+: request service tickets at the same time */
+ need = ac->want_keys & ~CEPH_ENTITY_TYPE_AUTH;
+ WARN_ON(!need);
+ ceph_encode_32_safe(&p, end, need, e_range);
return p - buf;
}
if (need) {
- void *p = head + 1;
- struct ceph_x_service_ticket_request *req;
-
- if (p > end)
- return -ERANGE;
- head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
-
+ dout(" get_principal_session_key\n");
ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
if (ret)
return ret;
- ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
- xi->auth_authorizer.buf->vec.iov_len);
- req = p;
- req->keys = cpu_to_le32(need);
- p += sizeof(*req);
+ p = buf;
+ ceph_encode_16_safe(&p, end, CEPHX_GET_PRINCIPAL_SESSION_KEY,
+ e_range);
+ ceph_encode_copy_safe(&p, end,
+ xi->auth_authorizer.buf->vec.iov_base,
+ xi->auth_authorizer.buf->vec.iov_len, e_range);
+ ceph_encode_8_safe(&p, end, 1, e_range);
+ ceph_encode_32_safe(&p, end, need, e_range);
return p - buf;
}
return 0;
+
+e_range:
+ return -ERANGE;
+}
+
+static int decode_con_secret(void **p, void *end, u8 *con_secret,
+ int *con_secret_len)
+{
+ int len;
+
+ ceph_decode_32_safe(p, end, len, bad);
+ ceph_decode_need(p, end, len, bad);
+
+ dout("%s len %d\n", __func__, len);
+ if (con_secret) {
+ if (len > CEPH_MAX_CON_SECRET_LEN) {
+ pr_err("connection secret too big %d\n", len);
+ goto bad_memzero;
+ }
+ memcpy(con_secret, *p, len);
+ *con_secret_len = len;
+ }
+ memzero_explicit(*p, len);
+ *p += len;
+ return 0;
+
+bad_memzero:
+ memzero_explicit(*p, len);
+bad:
+ pr_err("failed to decode connection secret\n");
+ return -EINVAL;
}
-static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
- void *buf, void *end)
+static int handle_auth_session_key(struct ceph_auth_client *ac, u64 global_id,
+ void **p, void *end,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ struct ceph_x_info *xi = ac->private;
+ struct ceph_x_ticket_handler *th;
+ void *dp, *dend;
+ int len;
+ int ret;
+
+ /* AUTH ticket */
+ ret = ceph_x_proc_ticket_reply(ac, &xi->secret, p, end);
+ if (ret)
+ return ret;
+
+ ceph_auth_set_global_id(ac, global_id);
+ if (*p == end) {
+ /* pre-nautilus (or didn't request service tickets!) */
+ WARN_ON(session_key || con_secret);
+ return 0;
+ }
+
+ th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ if (session_key) {
+ memcpy(session_key, th->session_key.key, th->session_key.len);
+ *session_key_len = th->session_key.len;
+ }
+
+ /* connection secret */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ ceph_decode_need(p, end, len, e_inval);
+ dout("%s connection secret blob len %d\n", __func__, len);
+ if (len > 0) {
+ dp = *p + ceph_x_encrypt_offset();
+ ret = ceph_x_decrypt(&th->session_key, p, *p + len);
+ if (ret < 0)
+ return ret;
+
+ dout("%s decrypted %d bytes\n", __func__, ret);
+ dend = dp + ret;
+
+ ret = decode_con_secret(&dp, dend, con_secret, con_secret_len);
+ if (ret)
+ return ret;
+ }
+
+ /* service tickets */
+ ceph_decode_32_safe(p, end, len, e_inval);
+ ceph_decode_need(p, end, len, e_inval);
+ dout("%s service tickets blob len %d\n", __func__, len);
+ if (len > 0) {
+ ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
+ p, *p + len);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
+static int ceph_x_handle_reply(struct ceph_auth_client *ac, u64 global_id,
+ void *buf, void *end,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
{
struct ceph_x_info *xi = ac->private;
- struct ceph_x_reply_header *head = buf;
struct ceph_x_ticket_handler *th;
int len = end - buf;
+ int result;
+ void *p;
int op;
int ret;
- if (result)
- return result; /* XXX hmm? */
-
if (xi->starting) {
/* it's a hello */
struct ceph_x_server_challenge *sc = buf;
@@ -587,22 +691,25 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
return -EAGAIN;
}
- op = le16_to_cpu(head->op);
- result = le32_to_cpu(head->result);
+ p = buf;
+ ceph_decode_16_safe(&p, end, op, e_inval);
+ ceph_decode_32_safe(&p, end, result, e_inval);
dout("handle_reply op %d result %d\n", op, result);
switch (op) {
case CEPHX_GET_AUTH_SESSION_KEY:
- /* verify auth key */
- ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
- buf + sizeof(*head), end);
+ /* AUTH ticket + [connection secret] + service tickets */
+ ret = handle_auth_session_key(ac, global_id, &p, end,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
break;
case CEPHX_GET_PRINCIPAL_SESSION_KEY:
th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
if (IS_ERR(th))
return PTR_ERR(th);
- ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
- buf + sizeof(*head), end);
+
+ /* service tickets */
+ ret = ceph_x_proc_ticket_reply(ac, &th->session_key, &p, end);
break;
default:
@@ -613,6 +720,9 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
if (ac->want_keys == xi->have_keys)
return 0;
return -EAGAIN;
+
+e_inval:
+ return -EINVAL;
}
static void ceph_x_destroy_authorizer(struct ceph_authorizer *a)
@@ -678,40 +788,44 @@ static int ceph_x_update_authorizer(
return 0;
}
-static int decrypt_authorize_challenge(struct ceph_x_authorizer *au,
- void *challenge_buf,
- int challenge_buf_len,
- u64 *server_challenge)
+/*
+ * CephXAuthorizeChallenge
+ */
+static int decrypt_authorizer_challenge(struct ceph_crypto_key *secret,
+ void *challenge, int challenge_len,
+ u64 *server_challenge)
{
- struct ceph_x_authorize_challenge *ch =
- challenge_buf + sizeof(struct ceph_x_encrypt_header);
+ void *dp, *dend;
int ret;
/* no leading len */
- ret = __ceph_x_decrypt(&au->session_key, challenge_buf,
- challenge_buf_len);
+ ret = __ceph_x_decrypt(secret, challenge, challenge_len);
if (ret < 0)
return ret;
- if (ret < sizeof(*ch)) {
- pr_err("bad size %d for ceph_x_authorize_challenge\n", ret);
- return -EINVAL;
- }
- *server_challenge = le64_to_cpu(ch->server_challenge);
+ dout("%s decrypted %d bytes\n", __func__, ret);
+ dp = challenge + sizeof(struct ceph_x_encrypt_header);
+ dend = dp + ret;
+
+ ceph_decode_skip_8(&dp, dend, e_inval); /* struct_v */
+ ceph_decode_64_safe(&dp, dend, *server_challenge, e_inval);
+ dout("%s server_challenge %llu\n", __func__, *server_challenge);
return 0;
+
+e_inval:
+ return -EINVAL;
}
static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac,
struct ceph_authorizer *a,
- void *challenge_buf,
- int challenge_buf_len)
+ void *challenge, int challenge_len)
{
struct ceph_x_authorizer *au = (void *)a;
u64 server_challenge;
int ret;
- ret = decrypt_authorize_challenge(au, challenge_buf, challenge_buf_len,
- &server_challenge);
+ ret = decrypt_authorizer_challenge(&au->session_key, challenge,
+ challenge_len, &server_challenge);
if (ret) {
pr_err("failed to decrypt authorize challenge: %d", ret);
return ret;
@@ -726,29 +840,67 @@ static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac,
return 0;
}
+/*
+ * CephXAuthorizeReply
+ */
+static int decrypt_authorizer_reply(struct ceph_crypto_key *secret,
+ void **p, void *end, u64 *nonce_plus_one,
+ u8 *con_secret, int *con_secret_len)
+{
+ void *dp, *dend;
+ u8 struct_v;
+ int ret;
+
+ dp = *p + ceph_x_encrypt_offset();
+ ret = ceph_x_decrypt(secret, p, end);
+ if (ret < 0)
+ return ret;
+
+ dout("%s decrypted %d bytes\n", __func__, ret);
+ dend = dp + ret;
+
+ ceph_decode_8_safe(&dp, dend, struct_v, e_inval);
+ ceph_decode_64_safe(&dp, dend, *nonce_plus_one, e_inval);
+ dout("%s nonce_plus_one %llu\n", __func__, *nonce_plus_one);
+ if (struct_v >= 2) {
+ ret = decode_con_secret(&dp, dend, con_secret, con_secret_len);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
- struct ceph_authorizer *a)
+ struct ceph_authorizer *a,
+ void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
{
struct ceph_x_authorizer *au = (void *)a;
- void *p = au->enc_buf;
- struct ceph_x_authorize_reply *reply = p + ceph_x_encrypt_offset();
+ u64 nonce_plus_one;
int ret;
- ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN);
- if (ret < 0)
+ if (session_key) {
+ memcpy(session_key, au->session_key.key, au->session_key.len);
+ *session_key_len = au->session_key.len;
+ }
+
+ ret = decrypt_authorizer_reply(&au->session_key, &reply,
+ reply + reply_len, &nonce_plus_one,
+ con_secret, con_secret_len);
+ if (ret)
return ret;
- if (ret < sizeof(*reply)) {
- pr_err("bad size %d for ceph_x_authorize_reply\n", ret);
- return -EINVAL;
+
+ if (nonce_plus_one != au->nonce + 1) {
+ pr_err("failed to authenticate server\n");
+ return -EPERM;
}
- if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one))
- ret = -EPERM;
- else
- ret = 0;
- dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
- au->nonce, le64_to_cpu(reply->nonce_plus_one), ret);
- return ret;
+ return 0;
}
static void ceph_x_reset(struct ceph_auth_client *ac)
@@ -785,8 +937,15 @@ static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type)
struct ceph_x_ticket_handler *th;
th = get_ticket_handler(ac, peer_type);
- if (!IS_ERR(th))
+ if (IS_ERR(th))
+ return;
+
+ if (th->have_key) {
+ dout("ticket %d (%s) secret_id %llu invalidated\n",
+ th->service, ceph_entity_type_name(th->service),
+ th->secret_id);
th->have_key = false;
+ }
}
static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
@@ -911,7 +1070,6 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth,
}
static const struct ceph_auth_client_ops ceph_x_ops = {
- .name = "x",
.is_authenticated = ceph_x_is_authenticated,
.should_authenticate = ceph_x_should_authenticate,
.build_request = ceph_x_build_request,
diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
index 24b0b74564d0..9c60feeb1bcb 100644
--- a/net/ceph/auth_x_protocol.h
+++ b/net/ceph/auth_x_protocol.h
@@ -38,7 +38,8 @@ struct ceph_x_authenticate {
__u8 struct_v;
__le64 client_challenge;
__le64 key;
- /* ticket blob */
+ /* old_ticket blob */
+ /* nautilus+: other_keys */
} __attribute__ ((packed));
struct ceph_x_service_ticket_request {
@@ -86,7 +87,7 @@ struct ceph_x_authorize_reply {
/*
- * encyption bundle
+ * encryption bundle
*/
#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c
index 5622763ad402..7e51f128045d 100644
--- a/net/ceph/buffer.c
+++ b/net/ceph/buffer.c
@@ -7,7 +7,7 @@
#include <linux/ceph/buffer.h>
#include <linux/ceph/decode.h>
-#include <linux/ceph/libceph.h> /* for ceph_kvmalloc */
+#include <linux/ceph/libceph.h> /* for kvmalloc */
struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
{
@@ -17,7 +17,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
if (!b)
return NULL;
- b->vec.iov_base = ceph_kvmalloc(len, gfp);
+ b->vec.iov_base = kvmalloc(len, gfp);
if (!b->vec.iov_base) {
kfree(b);
return NULL;
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 87afb9ec4c68..e734e57be083 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/ceph/ceph_debug.h>
#include <linux/backing-dev.h>
@@ -10,8 +11,9 @@
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/nsproxy.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/statfs.h>
@@ -174,6 +176,10 @@ int ceph_compare_options(struct ceph_options *new_opt,
}
}
+ ret = ceph_compare_crush_locs(&opt1->crush_locs, &opt2->crush_locs);
+ if (ret)
+ return ret;
+
/* any matching mon ip implies a match */
for (i = 0; i < opt1->num_mon; i++) {
if (ceph_monmap_contains(client->monc.monmap,
@@ -184,26 +190,14 @@ int ceph_compare_options(struct ceph_options *new_opt,
}
EXPORT_SYMBOL(ceph_compare_options);
-void *ceph_kvmalloc(size_t size, gfp_t flags)
-{
- if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
- void *ptr = kmalloc(size, flags | __GFP_NOWARN);
- if (ptr)
- return ptr;
- }
-
- return __vmalloc(size, flags, PAGE_KERNEL);
-}
-
-
-static int parse_fsid(const char *str, struct ceph_fsid *fsid)
+int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid)
{
int i = 0;
char tmp[3];
int err = -EINVAL;
int d;
- dout("parse_fsid '%s'\n", str);
+ dout("%s '%s'\n", __func__, str);
tmp[2] = 0;
while (*str && i < 16) {
if (ispunct(*str)) {
@@ -223,69 +217,129 @@ static int parse_fsid(const char *str, struct ceph_fsid *fsid)
if (i == 16)
err = 0;
- dout("parse_fsid ret %d got fsid %pU\n", err, fsid);
+ dout("%s ret %d got fsid %pU\n", __func__, err, fsid);
return err;
}
+EXPORT_SYMBOL(ceph_parse_fsid);
/*
* ceph options
*/
enum {
- Opt_osdtimeout,
Opt_osdkeepalivetimeout,
Opt_mount_timeout,
Opt_osd_idle_ttl,
Opt_osd_request_timeout,
- Opt_last_int,
/* int args above */
Opt_fsid,
Opt_name,
Opt_secret,
Opt_key,
Opt_ip,
- Opt_last_string,
+ Opt_crush_location,
+ Opt_read_from_replica,
+ Opt_ms_mode,
/* string args above */
Opt_share,
- Opt_noshare,
Opt_crc,
- Opt_nocrc,
Opt_cephx_require_signatures,
- Opt_nocephx_require_signatures,
Opt_cephx_sign_messages,
- Opt_nocephx_sign_messages,
Opt_tcp_nodelay,
- Opt_notcp_nodelay,
+ Opt_abort_on_full,
+ Opt_rxbounce,
};
-static match_table_t opt_tokens = {
- {Opt_osdtimeout, "osdtimeout=%d"},
- {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
- {Opt_mount_timeout, "mount_timeout=%d"},
- {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
- {Opt_osd_request_timeout, "osd_request_timeout=%d"},
- /* int args above */
- {Opt_fsid, "fsid=%s"},
- {Opt_name, "name=%s"},
- {Opt_secret, "secret=%s"},
- {Opt_key, "key=%s"},
- {Opt_ip, "ip=%s"},
- /* string args above */
- {Opt_share, "share"},
- {Opt_noshare, "noshare"},
- {Opt_crc, "crc"},
- {Opt_nocrc, "nocrc"},
- {Opt_cephx_require_signatures, "cephx_require_signatures"},
- {Opt_nocephx_require_signatures, "nocephx_require_signatures"},
- {Opt_cephx_sign_messages, "cephx_sign_messages"},
- {Opt_nocephx_sign_messages, "nocephx_sign_messages"},
- {Opt_tcp_nodelay, "tcp_nodelay"},
- {Opt_notcp_nodelay, "notcp_nodelay"},
- {-1, NULL}
+enum {
+ Opt_read_from_replica_no,
+ Opt_read_from_replica_balance,
+ Opt_read_from_replica_localize,
+};
+
+static const struct constant_table ceph_param_read_from_replica[] = {
+ {"no", Opt_read_from_replica_no},
+ {"balance", Opt_read_from_replica_balance},
+ {"localize", Opt_read_from_replica_localize},
+ {}
+};
+
+enum ceph_ms_mode {
+ Opt_ms_mode_legacy,
+ Opt_ms_mode_crc,
+ Opt_ms_mode_secure,
+ Opt_ms_mode_prefer_crc,
+ Opt_ms_mode_prefer_secure
+};
+
+static const struct constant_table ceph_param_ms_mode[] = {
+ {"legacy", Opt_ms_mode_legacy},
+ {"crc", Opt_ms_mode_crc},
+ {"secure", Opt_ms_mode_secure},
+ {"prefer-crc", Opt_ms_mode_prefer_crc},
+ {"prefer-secure", Opt_ms_mode_prefer_secure},
+ {}
};
+static const struct fs_parameter_spec ceph_parameters[] = {
+ fsparam_flag ("abort_on_full", Opt_abort_on_full),
+ __fsparam (NULL, "cephx_require_signatures", Opt_cephx_require_signatures,
+ fs_param_neg_with_no|fs_param_deprecated, NULL),
+ fsparam_flag_no ("cephx_sign_messages", Opt_cephx_sign_messages),
+ fsparam_flag_no ("crc", Opt_crc),
+ fsparam_string ("crush_location", Opt_crush_location),
+ fsparam_string ("fsid", Opt_fsid),
+ fsparam_string ("ip", Opt_ip),
+ fsparam_string ("key", Opt_key),
+ fsparam_u32 ("mount_timeout", Opt_mount_timeout),
+ fsparam_string ("name", Opt_name),
+ fsparam_u32 ("osd_idle_ttl", Opt_osd_idle_ttl),
+ fsparam_u32 ("osd_request_timeout", Opt_osd_request_timeout),
+ fsparam_u32 ("osdkeepalive", Opt_osdkeepalivetimeout),
+ fsparam_enum ("read_from_replica", Opt_read_from_replica,
+ ceph_param_read_from_replica),
+ fsparam_flag ("rxbounce", Opt_rxbounce),
+ fsparam_enum ("ms_mode", Opt_ms_mode,
+ ceph_param_ms_mode),
+ fsparam_string ("secret", Opt_secret),
+ fsparam_flag_no ("share", Opt_share),
+ fsparam_flag_no ("tcp_nodelay", Opt_tcp_nodelay),
+ {}
+};
+
+struct ceph_options *ceph_alloc_options(void)
+{
+ struct ceph_options *opt;
+
+ opt = kzalloc(sizeof(*opt), GFP_KERNEL);
+ if (!opt)
+ return NULL;
+
+ opt->crush_locs = RB_ROOT;
+ opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
+ GFP_KERNEL);
+ if (!opt->mon_addr) {
+ kfree(opt);
+ return NULL;
+ }
+
+ opt->flags = CEPH_OPT_DEFAULT;
+ opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+ opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
+ opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
+ opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
+ opt->read_from_replica = CEPH_READ_FROM_REPLICA_DEFAULT;
+ opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN;
+ opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
+ return opt;
+}
+EXPORT_SYMBOL(ceph_alloc_options);
+
void ceph_destroy_options(struct ceph_options *opt)
{
dout("destroy_options %p\n", opt);
+ if (!opt)
+ return;
+
+ ceph_clear_crush_locs(&opt->crush_locs);
kfree(opt->name);
if (opt->key) {
ceph_crypto_key_destroy(opt->key);
@@ -297,7 +351,9 @@ void ceph_destroy_options(struct ceph_options *opt)
EXPORT_SYMBOL(ceph_destroy_options);
/* get secret from key store */
-static int get_secret(struct ceph_crypto_key *dst, const char *name) {
+static int get_secret(struct ceph_crypto_key *dst, const char *name,
+ struct p_log *log)
+{
struct key *ukey;
int key_err;
int err = 0;
@@ -310,20 +366,20 @@ static int get_secret(struct ceph_crypto_key *dst, const char *name) {
key_err = PTR_ERR(ukey);
switch (key_err) {
case -ENOKEY:
- pr_warn("ceph: Mount failed due to key not found: %s\n",
- name);
+ error_plog(log, "Failed due to key not found: %s",
+ name);
break;
case -EKEYEXPIRED:
- pr_warn("ceph: Mount failed due to expired key: %s\n",
- name);
+ error_plog(log, "Failed due to expired key: %s",
+ name);
break;
case -EKEYREVOKED:
- pr_warn("ceph: Mount failed due to revoked key: %s\n",
- name);
+ error_plog(log, "Failed due to revoked key: %s",
+ name);
break;
default:
- pr_warn("ceph: Mount failed due to unknown key error %d: %s\n",
- key_err, name);
+ error_plog(log, "Failed due to key error %d: %s",
+ key_err, name);
}
err = -EPERM;
goto out;
@@ -341,218 +397,216 @@ out:
return err;
}
-struct ceph_options *
-ceph_parse_options(char *options, const char *dev_name,
- const char *dev_name_end,
- int (*parse_extra_token)(char *c, void *private),
- void *private)
+int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt,
+ struct fc_log *l, char delim)
{
- struct ceph_options *opt;
- const char *c;
- int err = -ENOMEM;
- substring_t argstr[MAX_OPT_ARGS];
-
- opt = kzalloc(sizeof(*opt), GFP_KERNEL);
- if (!opt)
- return ERR_PTR(-ENOMEM);
- opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
- GFP_KERNEL);
- if (!opt->mon_addr)
- goto out;
+ struct p_log log = {.prefix = "libceph", .log = l};
+ int ret;
- dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
- dev_name);
+ /* ip1[:port1][<delim>ip2[:port2]...] */
+ ret = ceph_parse_ips(buf, buf + len, opt->mon_addr, CEPH_MAX_MON,
+ &opt->num_mon, delim);
+ if (ret) {
+ error_plog(&log, "Failed to parse monitor IPs: %d", ret);
+ return ret;
+ }
- /* start with defaults */
- opt->flags = CEPH_OPT_DEFAULT;
- opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
- opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
- opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
- opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
+ return 0;
+}
+EXPORT_SYMBOL(ceph_parse_mon_ips);
- /* get mon ip(s) */
- /* ip1[:port1][,ip2[:port2]...] */
- err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
- CEPH_MAX_MON, &opt->num_mon);
- if (err < 0)
- goto out;
+int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt,
+ struct fc_log *l)
+{
+ struct fs_parse_result result;
+ int token, err;
+ struct p_log log = {.prefix = "libceph", .log = l};
+
+ token = __fs_parse(&log, ceph_parameters, param, &result);
+ dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
+ if (token < 0)
+ return token;
+
+ switch (token) {
+ case Opt_ip:
+ err = ceph_parse_ips(param->string,
+ param->string + param->size,
+ &opt->my_addr, 1, NULL, ',');
+ if (err) {
+ error_plog(&log, "Failed to parse ip: %d", err);
+ return err;
+ }
+ opt->flags |= CEPH_OPT_MYIP;
+ break;
- /* parse mount options */
- while ((c = strsep(&options, ",")) != NULL) {
- int token, intval;
- if (!*c)
- continue;
- err = -EINVAL;
- token = match_token((char *)c, opt_tokens, argstr);
- if (token < 0 && parse_extra_token) {
- /* extra? */
- err = parse_extra_token((char *)c, private);
- if (err < 0) {
- pr_err("bad option at '%s'\n", c);
- goto out;
- }
- continue;
+ case Opt_fsid:
+ err = ceph_parse_fsid(param->string, &opt->fsid);
+ if (err) {
+ error_plog(&log, "Failed to parse fsid: %d", err);
+ return err;
}
- if (token < Opt_last_int) {
- err = match_int(&argstr[0], &intval);
- if (err < 0) {
- pr_err("bad option arg (not int) at '%s'\n", c);
- goto out;
- }
- dout("got int token %d val %d\n", token, intval);
- } else if (token > Opt_last_int && token < Opt_last_string) {
- dout("got string token %d val %s\n", token,
- argstr[0].from);
- } else {
- dout("got token %d\n", token);
+ opt->flags |= CEPH_OPT_FSID;
+ break;
+ case Opt_name:
+ kfree(opt->name);
+ opt->name = param->string;
+ param->string = NULL;
+ break;
+ case Opt_secret:
+ ceph_crypto_key_destroy(opt->key);
+ kfree(opt->key);
+
+ opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
+ if (!opt->key)
+ return -ENOMEM;
+ err = ceph_crypto_key_unarmor(opt->key, param->string);
+ if (err) {
+ error_plog(&log, "Failed to parse secret: %d", err);
+ return err;
}
- switch (token) {
- case Opt_ip:
- err = ceph_parse_ips(argstr[0].from,
- argstr[0].to,
- &opt->my_addr,
- 1, NULL);
- if (err < 0)
- goto out;
- opt->flags |= CEPH_OPT_MYIP;
- break;
+ break;
+ case Opt_key:
+ ceph_crypto_key_destroy(opt->key);
+ kfree(opt->key);
- case Opt_fsid:
- err = parse_fsid(argstr[0].from, &opt->fsid);
- if (err == 0)
- opt->flags |= CEPH_OPT_FSID;
- break;
- case Opt_name:
- kfree(opt->name);
- opt->name = kstrndup(argstr[0].from,
- argstr[0].to-argstr[0].from,
- GFP_KERNEL);
- if (!opt->name) {
- err = -ENOMEM;
- goto out;
- }
+ opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
+ if (!opt->key)
+ return -ENOMEM;
+ return get_secret(opt->key, param->string, &log);
+ case Opt_crush_location:
+ ceph_clear_crush_locs(&opt->crush_locs);
+ err = ceph_parse_crush_location(param->string,
+ &opt->crush_locs);
+ if (err) {
+ error_plog(&log, "Failed to parse CRUSH location: %d",
+ err);
+ return err;
+ }
+ break;
+ case Opt_read_from_replica:
+ switch (result.uint_32) {
+ case Opt_read_from_replica_no:
+ opt->read_from_replica = 0;
break;
- case Opt_secret:
- ceph_crypto_key_destroy(opt->key);
- kfree(opt->key);
-
- opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
- if (!opt->key) {
- err = -ENOMEM;
- goto out;
- }
- err = ceph_crypto_key_unarmor(opt->key, argstr[0].from);
- if (err < 0)
- goto out;
+ case Opt_read_from_replica_balance:
+ opt->read_from_replica = CEPH_OSD_FLAG_BALANCE_READS;
break;
- case Opt_key:
- ceph_crypto_key_destroy(opt->key);
- kfree(opt->key);
-
- opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL);
- if (!opt->key) {
- err = -ENOMEM;
- goto out;
- }
- err = get_secret(opt->key, argstr[0].from);
- if (err < 0)
- goto out;
+ case Opt_read_from_replica_localize:
+ opt->read_from_replica = CEPH_OSD_FLAG_LOCALIZE_READS;
break;
-
- /* misc */
- case Opt_osdtimeout:
- pr_warn("ignoring deprecated osdtimeout option\n");
+ default:
+ BUG();
+ }
+ break;
+ case Opt_ms_mode:
+ switch (result.uint_32) {
+ case Opt_ms_mode_legacy:
+ opt->con_modes[0] = CEPH_CON_MODE_UNKNOWN;
+ opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
break;
- case Opt_osdkeepalivetimeout:
- /* 0 isn't well defined right now, reject it */
- if (intval < 1 || intval > INT_MAX / 1000) {
- pr_err("osdkeepalive out of range\n");
- err = -EINVAL;
- goto out;
- }
- opt->osd_keepalive_timeout =
- msecs_to_jiffies(intval * 1000);
+ case Opt_ms_mode_crc:
+ opt->con_modes[0] = CEPH_CON_MODE_CRC;
+ opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
break;
- case Opt_osd_idle_ttl:
- /* 0 isn't well defined right now, reject it */
- if (intval < 1 || intval > INT_MAX / 1000) {
- pr_err("osd_idle_ttl out of range\n");
- err = -EINVAL;
- goto out;
- }
- opt->osd_idle_ttl = msecs_to_jiffies(intval * 1000);
+ case Opt_ms_mode_secure:
+ opt->con_modes[0] = CEPH_CON_MODE_SECURE;
+ opt->con_modes[1] = CEPH_CON_MODE_UNKNOWN;
break;
- case Opt_mount_timeout:
- /* 0 is "wait forever" (i.e. infinite timeout) */
- if (intval < 0 || intval > INT_MAX / 1000) {
- pr_err("mount_timeout out of range\n");
- err = -EINVAL;
- goto out;
- }
- opt->mount_timeout = msecs_to_jiffies(intval * 1000);
+ case Opt_ms_mode_prefer_crc:
+ opt->con_modes[0] = CEPH_CON_MODE_CRC;
+ opt->con_modes[1] = CEPH_CON_MODE_SECURE;
break;
- case Opt_osd_request_timeout:
- /* 0 is "wait forever" (i.e. infinite timeout) */
- if (intval < 0 || intval > INT_MAX / 1000) {
- pr_err("osd_request_timeout out of range\n");
- err = -EINVAL;
- goto out;
- }
- opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
+ case Opt_ms_mode_prefer_secure:
+ opt->con_modes[0] = CEPH_CON_MODE_SECURE;
+ opt->con_modes[1] = CEPH_CON_MODE_CRC;
break;
-
- case Opt_share:
+ default:
+ BUG();
+ }
+ break;
+
+ case Opt_osdkeepalivetimeout:
+ /* 0 isn't well defined right now, reject it */
+ if (result.uint_32 < 1 || result.uint_32 > INT_MAX / 1000)
+ goto out_of_range;
+ opt->osd_keepalive_timeout =
+ msecs_to_jiffies(result.uint_32 * 1000);
+ break;
+ case Opt_osd_idle_ttl:
+ /* 0 isn't well defined right now, reject it */
+ if (result.uint_32 < 1 || result.uint_32 > INT_MAX / 1000)
+ goto out_of_range;
+ opt->osd_idle_ttl = msecs_to_jiffies(result.uint_32 * 1000);
+ break;
+ case Opt_mount_timeout:
+ /* 0 is "wait forever" (i.e. infinite timeout) */
+ if (result.uint_32 > INT_MAX / 1000)
+ goto out_of_range;
+ opt->mount_timeout = msecs_to_jiffies(result.uint_32 * 1000);
+ break;
+ case Opt_osd_request_timeout:
+ /* 0 is "wait forever" (i.e. infinite timeout) */
+ if (result.uint_32 > INT_MAX / 1000)
+ goto out_of_range;
+ opt->osd_request_timeout =
+ msecs_to_jiffies(result.uint_32 * 1000);
+ break;
+
+ case Opt_share:
+ if (!result.negated)
opt->flags &= ~CEPH_OPT_NOSHARE;
- break;
- case Opt_noshare:
+ else
opt->flags |= CEPH_OPT_NOSHARE;
- break;
-
- case Opt_crc:
+ break;
+ case Opt_crc:
+ if (!result.negated)
opt->flags &= ~CEPH_OPT_NOCRC;
- break;
- case Opt_nocrc:
+ else
opt->flags |= CEPH_OPT_NOCRC;
- break;
-
- case Opt_cephx_require_signatures:
- opt->flags &= ~CEPH_OPT_NOMSGAUTH;
- break;
- case Opt_nocephx_require_signatures:
- opt->flags |= CEPH_OPT_NOMSGAUTH;
- break;
- case Opt_cephx_sign_messages:
+ break;
+ case Opt_cephx_require_signatures:
+ if (!result.negated)
+ warn_plog(&log, "Ignoring cephx_require_signatures");
+ else
+ warn_plog(&log, "Ignoring nocephx_require_signatures, use nocephx_sign_messages");
+ break;
+ case Opt_cephx_sign_messages:
+ if (!result.negated)
opt->flags &= ~CEPH_OPT_NOMSGSIGN;
- break;
- case Opt_nocephx_sign_messages:
+ else
opt->flags |= CEPH_OPT_NOMSGSIGN;
- break;
-
- case Opt_tcp_nodelay:
+ break;
+ case Opt_tcp_nodelay:
+ if (!result.negated)
opt->flags |= CEPH_OPT_TCP_NODELAY;
- break;
- case Opt_notcp_nodelay:
+ else
opt->flags &= ~CEPH_OPT_TCP_NODELAY;
- break;
+ break;
- default:
- BUG_ON(token);
- }
+ case Opt_abort_on_full:
+ opt->flags |= CEPH_OPT_ABORT_ON_FULL;
+ break;
+ case Opt_rxbounce:
+ opt->flags |= CEPH_OPT_RXBOUNCE;
+ break;
+
+ default:
+ BUG();
}
- /* success */
- return opt;
+ return 0;
-out:
- ceph_destroy_options(opt);
- return ERR_PTR(err);
+out_of_range:
+ return inval_plog(&log, "%s out of range", param->key);
}
-EXPORT_SYMBOL(ceph_parse_options);
+EXPORT_SYMBOL(ceph_parse_param);
-int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
+int ceph_print_client_options(struct seq_file *m, struct ceph_client *client,
+ bool show_all)
{
struct ceph_options *opt = client->options;
size_t pos = m->count;
+ struct rb_node *n;
if (opt->name) {
seq_puts(m, "name=");
@@ -562,18 +616,57 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
if (opt->key)
seq_puts(m, "secret=<hidden>,");
+ if (!RB_EMPTY_ROOT(&opt->crush_locs)) {
+ seq_puts(m, "crush_location=");
+ for (n = rb_first(&opt->crush_locs); ; ) {
+ struct crush_loc_node *loc =
+ rb_entry(n, struct crush_loc_node, cl_node);
+
+ seq_printf(m, "%s:%s", loc->cl_loc.cl_type_name,
+ loc->cl_loc.cl_name);
+ n = rb_next(n);
+ if (!n)
+ break;
+
+ seq_putc(m, '|');
+ }
+ seq_putc(m, ',');
+ }
+ if (opt->read_from_replica == CEPH_OSD_FLAG_BALANCE_READS) {
+ seq_puts(m, "read_from_replica=balance,");
+ } else if (opt->read_from_replica == CEPH_OSD_FLAG_LOCALIZE_READS) {
+ seq_puts(m, "read_from_replica=localize,");
+ }
+ if (opt->con_modes[0] != CEPH_CON_MODE_UNKNOWN) {
+ if (opt->con_modes[0] == CEPH_CON_MODE_CRC &&
+ opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) {
+ seq_puts(m, "ms_mode=crc,");
+ } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE &&
+ opt->con_modes[1] == CEPH_CON_MODE_UNKNOWN) {
+ seq_puts(m, "ms_mode=secure,");
+ } else if (opt->con_modes[0] == CEPH_CON_MODE_CRC &&
+ opt->con_modes[1] == CEPH_CON_MODE_SECURE) {
+ seq_puts(m, "ms_mode=prefer-crc,");
+ } else if (opt->con_modes[0] == CEPH_CON_MODE_SECURE &&
+ opt->con_modes[1] == CEPH_CON_MODE_CRC) {
+ seq_puts(m, "ms_mode=prefer-secure,");
+ }
+ }
+
if (opt->flags & CEPH_OPT_FSID)
seq_printf(m, "fsid=%pU,", &opt->fsid);
if (opt->flags & CEPH_OPT_NOSHARE)
seq_puts(m, "noshare,");
if (opt->flags & CEPH_OPT_NOCRC)
seq_puts(m, "nocrc,");
- if (opt->flags & CEPH_OPT_NOMSGAUTH)
- seq_puts(m, "nocephx_require_signatures,");
if (opt->flags & CEPH_OPT_NOMSGSIGN)
seq_puts(m, "nocephx_sign_messages,");
if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
seq_puts(m, "notcp_nodelay,");
+ if (show_all && (opt->flags & CEPH_OPT_ABORT_ON_FULL))
+ seq_puts(m, "abort_on_full,");
+ if (opt->flags & CEPH_OPT_RXBOUNCE)
+ seq_puts(m, "rxbounce,");
if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
seq_printf(m, "mount_timeout=%d,",
@@ -636,7 +729,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT;
- if (!ceph_test_opt(client, NOMSGAUTH))
+ if (!ceph_test_opt(client, NOMSGSIGN))
client->required_features |= CEPH_FEATURE_MSG_AUTH;
/* msgr */
@@ -684,42 +777,61 @@ void ceph_destroy_client(struct ceph_client *client)
}
EXPORT_SYMBOL(ceph_destroy_client);
-/*
- * true if we have the mon map (and have thus joined the cluster)
- */
-static bool have_mon_and_osd_map(struct ceph_client *client)
+void ceph_reset_client_addr(struct ceph_client *client)
{
- return client->monc.monmap && client->monc.monmap->epoch &&
- client->osdc.osdmap && client->osdc.osdmap->epoch;
+ ceph_messenger_reset_nonce(&client->msgr);
+ ceph_monc_reopen_session(&client->monc);
+ ceph_osdc_reopen_osds(&client->osdc);
}
+EXPORT_SYMBOL(ceph_reset_client_addr);
/*
* mount: join the ceph cluster, and open root directory.
*/
-int __ceph_open_session(struct ceph_client *client, unsigned long started)
+int __ceph_open_session(struct ceph_client *client)
{
- unsigned long timeout = client->options->mount_timeout;
- long err;
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ long timeout = ceph_timeout_jiffies(client->options->mount_timeout);
+ bool have_monmap, have_osdmap;
+ int err;
/* open session, and wait for mon and osd maps */
err = ceph_monc_open_session(&client->monc);
if (err < 0)
return err;
- while (!have_mon_and_osd_map(client)) {
- if (timeout && time_after_eq(jiffies, started + timeout))
- return -ETIMEDOUT;
+ add_wait_queue(&client->auth_wq, &wait);
+ for (;;) {
+ mutex_lock(&client->monc.mutex);
+ err = client->auth_err;
+ have_monmap = client->monc.monmap && client->monc.monmap->epoch;
+ mutex_unlock(&client->monc.mutex);
+
+ down_read(&client->osdc.lock);
+ have_osdmap = client->osdc.osdmap && client->osdc.osdmap->epoch;
+ up_read(&client->osdc.lock);
+
+ if (err || (have_monmap && have_osdmap))
+ break;
+
+ if (signal_pending(current)) {
+ err = -ERESTARTSYS;
+ break;
+ }
+
+ if (!timeout) {
+ err = -ETIMEDOUT;
+ break;
+ }
/* wait */
dout("mount waiting for mon_map\n");
- err = wait_event_interruptible_timeout(client->auth_wq,
- have_mon_and_osd_map(client) || (client->auth_err < 0),
- ceph_timeout_jiffies(timeout));
- if (err < 0)
- return err;
- if (client->auth_err < 0)
- return client->auth_err;
+ timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
}
+ remove_wait_queue(&client->auth_wq, &wait);
+
+ if (err)
+ return err;
pr_info("client%llu fsid %pU\n", ceph_client_gid(client),
&client->fsid);
@@ -729,30 +841,43 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
}
EXPORT_SYMBOL(__ceph_open_session);
-
int ceph_open_session(struct ceph_client *client)
{
int ret;
- unsigned long started = jiffies; /* note the start time */
dout("open_session start\n");
mutex_lock(&client->mount_mutex);
- ret = __ceph_open_session(client, started);
+ ret = __ceph_open_session(client);
mutex_unlock(&client->mount_mutex);
return ret;
}
EXPORT_SYMBOL(ceph_open_session);
+int ceph_wait_for_latest_osdmap(struct ceph_client *client,
+ unsigned long timeout)
+{
+ u64 newest_epoch;
+ int ret;
+
+ ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch);
+ if (ret)
+ return ret;
+
+ if (client->osdc.osdmap->epoch >= newest_epoch)
+ return 0;
+
+ ceph_osdc_maybe_request_map(&client->osdc);
+ return ceph_monc_wait_osdmap(&client->monc, newest_epoch, timeout);
+}
+EXPORT_SYMBOL(ceph_wait_for_latest_osdmap);
static int __init init_ceph_lib(void)
{
int ret = 0;
- ret = ceph_debugfs_init();
- if (ret < 0)
- goto out;
+ ceph_debugfs_init();
ret = ceph_crypto_init();
if (ret < 0)
@@ -777,7 +902,6 @@ out_crypto:
ceph_crypto_shutdown();
out_debugfs:
ceph_debugfs_cleanup();
-out:
return ret;
}
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
deleted file mode 100644
index 756a2dc10d27..000000000000
--- a/net/ceph/ceph_fs.c
+++ /dev/null
@@ -1,104 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Some non-inline ceph helpers
- */
-#include <linux/module.h>
-#include <linux/ceph/types.h>
-
-/*
- * return true if @layout appears to be valid
- */
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
-{
- __u32 su = layout->stripe_unit;
- __u32 sc = layout->stripe_count;
- __u32 os = layout->object_size;
-
- /* stripe unit, object size must be non-zero, 64k increment */
- if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
- return 0;
- if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
- return 0;
- /* object size must be a multiple of stripe unit */
- if (os < su || os % su)
- return 0;
- /* stripe count must be non-zero */
- if (!sc)
- return 0;
- return 1;
-}
-
-void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
- struct ceph_file_layout_legacy *legacy)
-{
- fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit);
- fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count);
- fl->object_size = le32_to_cpu(legacy->fl_object_size);
- fl->pool_id = le32_to_cpu(legacy->fl_pg_pool);
- if (fl->pool_id == 0 && fl->stripe_unit == 0 &&
- fl->stripe_count == 0 && fl->object_size == 0)
- fl->pool_id = -1;
-}
-EXPORT_SYMBOL(ceph_file_layout_from_legacy);
-
-void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
- struct ceph_file_layout_legacy *legacy)
-{
- legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit);
- legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count);
- legacy->fl_object_size = cpu_to_le32(fl->object_size);
- if (fl->pool_id >= 0)
- legacy->fl_pg_pool = cpu_to_le32(fl->pool_id);
- else
- legacy->fl_pg_pool = 0;
-}
-EXPORT_SYMBOL(ceph_file_layout_to_legacy);
-
-int ceph_flags_to_mode(int flags)
-{
- int mode;
-
-#ifdef O_DIRECTORY /* fixme */
- if ((flags & O_DIRECTORY) == O_DIRECTORY)
- return CEPH_FILE_MODE_PIN;
-#endif
-
- switch (flags & O_ACCMODE) {
- case O_WRONLY:
- mode = CEPH_FILE_MODE_WR;
- break;
- case O_RDONLY:
- mode = CEPH_FILE_MODE_RD;
- break;
- case O_RDWR:
- case O_ACCMODE: /* this is what the VFS does */
- mode = CEPH_FILE_MODE_RDWR;
- break;
- }
-#ifdef O_LAZY
- if (flags & O_LAZY)
- mode |= CEPH_FILE_MODE_LAZY;
-#endif
-
- return mode;
-}
-EXPORT_SYMBOL(ceph_flags_to_mode);
-
-int ceph_caps_for_mode(int mode)
-{
- int caps = CEPH_CAP_PIN;
-
- if (mode & CEPH_FILE_MODE_RD)
- caps |= CEPH_CAP_FILE_SHARED |
- CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
- if (mode & CEPH_FILE_MODE_WR)
- caps |= CEPH_CAP_FILE_EXCL |
- CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
- CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
- CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
- if (mode & CEPH_FILE_MODE_LAZY)
- caps |= CEPH_CAP_FILE_LAZYIO;
-
- return caps;
-}
-EXPORT_SYMBOL(ceph_caps_for_mode);
diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c
index 9a5850f264ed..16a47c0eef37 100644
--- a/net/ceph/ceph_hash.c
+++ b/net/ceph/ceph_hash.c
@@ -4,7 +4,7 @@
/*
* Robert Jenkin's hash function.
- * http://burtleburtle.net/bob/hash/evahash.html
+ * https://burtleburtle.net/bob/hash/evahash.html
* This is in the public domain.
*/
#define mix(a, b, c) \
@@ -50,35 +50,35 @@ unsigned int ceph_str_hash_rjenkins(const char *str, unsigned int length)
switch (len) {
case 11:
c = c + ((__u32)k[10] << 24);
- /* fall through */
+ fallthrough;
case 10:
c = c + ((__u32)k[9] << 16);
- /* fall through */
+ fallthrough;
case 9:
c = c + ((__u32)k[8] << 8);
/* the first byte of c is reserved for the length */
- /* fall through */
+ fallthrough;
case 8:
b = b + ((__u32)k[7] << 24);
- /* fall through */
+ fallthrough;
case 7:
b = b + ((__u32)k[6] << 16);
- /* fall through */
+ fallthrough;
case 6:
b = b + ((__u32)k[5] << 8);
- /* fall through */
+ fallthrough;
case 5:
b = b + k[4];
- /* fall through */
+ fallthrough;
case 4:
a = a + ((__u32)k[3] << 24);
- /* fall through */
+ fallthrough;
case 3:
a = a + ((__u32)k[2] << 16);
- /* fall through */
+ fallthrough;
case 2:
a = a + ((__u32)k[1] << 8);
- /* fall through */
+ fallthrough;
case 1:
a = a + k[0];
/* case 0: nothing left to add */
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
index 10e01494993c..355fea272120 100644
--- a/net/ceph/ceph_strings.c
+++ b/net/ceph/ceph_strings.c
@@ -18,6 +18,34 @@ const char *ceph_entity_type_name(int type)
}
EXPORT_SYMBOL(ceph_entity_type_name);
+const char *ceph_auth_proto_name(int proto)
+{
+ switch (proto) {
+ case CEPH_AUTH_UNKNOWN:
+ return "unknown";
+ case CEPH_AUTH_NONE:
+ return "none";
+ case CEPH_AUTH_CEPHX:
+ return "cephx";
+ default:
+ return "???";
+ }
+}
+
+const char *ceph_con_mode_name(int mode)
+{
+ switch (mode) {
+ case CEPH_CON_MODE_UNKNOWN:
+ return "unknown";
+ case CEPH_CON_MODE_CRC:
+ return "crc";
+ case CEPH_CON_MODE_SECURE:
+ return "secure";
+ default:
+ return "???";
+ }
+}
+
const char *ceph_osd_op_name(int op)
{
switch (op) {
diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c
index 2105a6eaa66c..66136a4c1ce7 100644
--- a/net/ceph/cls_lock_client.c
+++ b/net/ceph/cls_lock_client.c
@@ -6,10 +6,13 @@
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
/**
* ceph_cls_lock - grab rados lock for object
- * @oid, @oloc: object to lock
+ * @osdc: OSD client instance
+ * @oid: object to lock
+ * @oloc: object to lock
* @lock_name: the name of the lock
* @type: lock type (CEPH_CLS_LOCK_EXCLUSIVE or CEPH_CLS_LOCK_SHARED)
* @cookie: user-defined identifier for this instance of the lock
@@ -81,7 +84,9 @@ EXPORT_SYMBOL(ceph_cls_lock);
/**
* ceph_cls_unlock - release rados lock for object
- * @oid, @oloc: object to lock
+ * @osdc: OSD client instance
+ * @oid: object to lock
+ * @oloc: object to lock
* @lock_name: the name of the lock
* @cookie: user-defined identifier for this instance of the lock
*/
@@ -129,7 +134,9 @@ EXPORT_SYMBOL(ceph_cls_unlock);
/**
* ceph_cls_break_lock - release rados lock for object for specified client
- * @oid, @oloc: object to lock
+ * @osdc: OSD client instance
+ * @oid: object to lock
+ * @oloc: object to lock
* @lock_name: the name of the lock
* @cookie: user-defined identifier for this instance of the lock
* @locker: current lock owner
@@ -264,14 +271,17 @@ static int decode_locker(void **p, void *end, struct ceph_locker *locker)
return ret;
*p += sizeof(struct ceph_timespec); /* skip expiration */
- ceph_decode_copy(p, &locker->info.addr, sizeof(locker->info.addr));
- ceph_decode_addr(&locker->info.addr);
+
+ ret = ceph_decode_entity_addr(p, end, &locker->info.addr);
+ if (ret)
+ return ret;
+
len = ceph_decode_32(p);
*p += len; /* skip description */
dout("%s %s%llu cookie %s addr %s\n", __func__,
ENTITY_NAME(locker->id.name), locker->id.cookie,
- ceph_pr_addr(&locker->info.addr.in_addr));
+ ceph_pr_addr(&locker->info.addr));
return 0;
}
@@ -360,7 +370,7 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
dout("%s lock_name %s\n", __func__, lock_name);
ret = ceph_osdc_call(osdc, oid, oloc, "lock", "get_info",
CEPH_OSD_FLAG_READ, get_info_op_page,
- get_info_op_buf_size, reply_page, &reply_len);
+ get_info_op_buf_size, &reply_page, &reply_len);
dout("%s: status %d\n", __func__, ret);
if (ret >= 0) {
@@ -375,3 +385,47 @@ int ceph_cls_lock_info(struct ceph_osd_client *osdc,
return ret;
}
EXPORT_SYMBOL(ceph_cls_lock_info);
+
+int ceph_cls_assert_locked(struct ceph_osd_request *req, int which,
+ char *lock_name, u8 type, char *cookie, char *tag)
+{
+ int assert_op_buf_size;
+ int name_len = strlen(lock_name);
+ int cookie_len = strlen(cookie);
+ int tag_len = strlen(tag);
+ struct page **pages;
+ void *p, *end;
+ int ret;
+
+ assert_op_buf_size = name_len + sizeof(__le32) +
+ cookie_len + sizeof(__le32) +
+ tag_len + sizeof(__le32) +
+ sizeof(u8) + CEPH_ENCODING_START_BLK_LEN;
+ if (assert_op_buf_size > PAGE_SIZE)
+ return -E2BIG;
+
+ ret = osd_req_op_cls_init(req, which, "lock", "assert_locked");
+ if (ret)
+ return ret;
+
+ pages = ceph_alloc_page_vector(1, GFP_NOIO);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ p = page_address(pages[0]);
+ end = p + assert_op_buf_size;
+
+ /* encode cls_lock_assert_op struct */
+ ceph_start_encoding(&p, 1, 1,
+ assert_op_buf_size - CEPH_ENCODING_START_BLK_LEN);
+ ceph_encode_string(&p, end, lock_name, name_len);
+ ceph_encode_8(&p, type);
+ ceph_encode_string(&p, end, cookie, cookie_len);
+ ceph_encode_string(&p, end, tag, tag_len);
+ WARN_ON(p != end);
+
+ osd_req_op_cls_request_data_pages(req, which, pages, assert_op_buf_size,
+ 0, false, true);
+ return 0;
+}
+EXPORT_SYMBOL(ceph_cls_assert_locked);
diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c
index 3d70244bc1b6..254ded0b05f6 100644
--- a/net/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -2,7 +2,6 @@
#ifdef __KERNEL__
# include <linux/slab.h>
# include <linux/crush/crush.h>
-void clear_choose_args(struct crush_map *c);
#else
# include "crush_compat.h"
# include "crush.h"
@@ -130,6 +129,8 @@ void crush_destroy(struct crush_map *map)
#ifndef __KERNEL__
kfree(map->choose_tries);
#else
+ clear_crush_names(&map->type_names);
+ clear_crush_names(&map->names);
clear_choose_args(map);
#endif
kfree(map);
diff --git a/net/ceph/crush/hash.c b/net/ceph/crush/hash.c
index e5cc603cdb17..fe79f6d2d0db 100644
--- a/net/ceph/crush/hash.c
+++ b/net/ceph/crush/hash.c
@@ -7,7 +7,7 @@
/*
* Robert Jenkins' function for mixing 32-bit values
- * http://burtleburtle.net/bob/hash/evahash.html
+ * https://burtleburtle.net/bob/hash/evahash.html
* a, b = random bits, c = input and output
*/
#define crush_hashmix(a, b, c) do { \
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 3f323ed9df52..3a5bd1cd1e99 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -298,7 +298,7 @@ static __u64 crush_ln(unsigned int xin)
*
* for reference, see:
*
- * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
+ * https://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
*
*/
@@ -429,7 +429,10 @@ static int is_out(const struct crush_map *map,
/**
* crush_choose_firstn - choose numrep distinct items of given type
* @map: the crush_map
+ * @work: working space initialized by crush_init_workspace()
* @bucket: the bucket we are choose an item from
+ * @weight: weight vector (for map leaves)
+ * @weight_max: size of weight vector
* @x: crush input value
* @numrep: the number of items to choose
* @type: the type of item to choose
@@ -445,6 +448,7 @@ static int is_out(const struct crush_map *map,
* @vary_r: pass r to recursive calls
* @out2: second output vector for leaf items (if @recurse_to_leaf)
* @parent_r: r value passed from the parent
+ * @choose_args: weights and ids for each known bucket
*/
static int crush_choose_firstn(const struct crush_map *map,
struct crush_work *work,
@@ -636,9 +640,8 @@ reject:
}
-/**
+/*
* crush_choose_indep: alternative breadth-first positionally stable mapping
- *
*/
static void crush_choose_indep(const struct crush_map *map,
struct crush_work *work,
@@ -906,7 +909,6 @@ int crush_do_rule(const struct crush_map *map,
int recurse_to_leaf;
int wsize = 0;
int osize;
- int *tmp;
const struct crush_rule *rule;
__u32 step;
int i, j;
@@ -987,7 +989,7 @@ int crush_do_rule(const struct crush_map *map,
case CRUSH_RULE_CHOOSELEAF_FIRSTN:
case CRUSH_RULE_CHOOSE_FIRSTN:
firstn = 1;
- /* fall through */
+ fallthrough;
case CRUSH_RULE_CHOOSELEAF_INDEP:
case CRUSH_RULE_CHOOSE_INDEP:
if (wsize == 0)
@@ -1073,9 +1075,7 @@ int crush_do_rule(const struct crush_map *map,
memcpy(o, c, osize*sizeof(*o));
/* swap o and w arrays */
- tmp = o;
- o = w;
- w = tmp;
+ swap(o, w);
wsize = osize;
break;
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index 02172c408ff2..01b2ce1e8fc0 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -46,9 +46,9 @@ static int set_secret(struct ceph_crypto_key *key, void *buf)
goto fail;
}
- /* crypto_alloc_skcipher() allocates with GFP_KERNEL */
+ /* crypto_alloc_sync_skcipher() allocates with GFP_KERNEL */
noio_flag = memalloc_noio_save();
- key->tfm = crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+ key->tfm = crypto_alloc_sync_skcipher("cbc(aes)", 0, 0);
memalloc_noio_restore(noio_flag);
if (IS_ERR(key->tfm)) {
ret = PTR_ERR(key->tfm);
@@ -56,7 +56,7 @@ static int set_secret(struct ceph_crypto_key *key, void *buf)
goto fail;
}
- ret = crypto_skcipher_setkey(key->tfm, key->key, key->len);
+ ret = crypto_sync_skcipher_setkey(key->tfm, key->key, key->len);
if (ret)
goto fail;
@@ -74,18 +74,6 @@ int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
return set_secret(dst, src->key);
}
-int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
-{
- if (*p + sizeof(u16) + sizeof(key->created) +
- sizeof(u16) + key->len > end)
- return -ERANGE;
- ceph_encode_16(p, key->type);
- ceph_encode_copy(p, &key->created, sizeof(key->created));
- ceph_encode_16(p, key->len);
- ceph_encode_copy(p, key->key, key->len);
- return 0;
-}
-
int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
{
int ret;
@@ -96,6 +84,7 @@ int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
key->len = ceph_decode_16(p);
ceph_decode_need(p, end, key->len, bad);
ret = set_secret(key, *p);
+ memzero_explicit(*p, key->len);
*p += key->len;
return ret;
@@ -134,17 +123,19 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
{
if (key) {
- kfree(key->key);
+ kfree_sensitive(key->key);
key->key = NULL;
- crypto_free_skcipher(key->tfm);
- key->tfm = NULL;
+ if (key->tfm) {
+ crypto_free_sync_skcipher(key->tfm);
+ key->tfm = NULL;
+ }
}
}
static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
/*
- * Should be used for buffers allocated with ceph_kvmalloc().
+ * Should be used for buffers allocated with kvmalloc().
* Currently these are encrypt out-buffer (ceph_buffer) and decrypt
* in-buffer (msg front).
*
@@ -216,7 +207,7 @@ static void teardown_sgtable(struct sg_table *sgt)
static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt,
void *buf, int buf_len, int in_len, int *pout_len)
{
- SKCIPHER_REQUEST_ON_STACK(req, key->tfm);
+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, key->tfm);
struct sg_table sgt;
struct scatterlist prealloc_sg;
char iv[AES_BLOCK_SIZE] __aligned(8);
@@ -232,7 +223,7 @@ static int ceph_aes_crypt(const struct ceph_crypto_key *key, bool encrypt,
return ret;
memcpy(iv, aes_iv, AES_BLOCK_SIZE);
- skcipher_request_set_tfm(req, key->tfm);
+ skcipher_request_set_sync_tfm(req, key->tfm);
skcipher_request_set_callback(req, 0, NULL, NULL);
skcipher_request_set_crypt(req, sgt.sgl, sgt.sgl, crypt_len, iv);
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
index bb45c7d43739..23de29fc613c 100644
--- a/net/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -5,6 +5,9 @@
#include <linux/ceph/types.h>
#include <linux/ceph/buffer.h>
+#define CEPH_KEY_LEN 16
+#define CEPH_MAX_CON_SECRET_LEN 64
+
/*
* cryptographic secret
*/
@@ -13,12 +16,11 @@ struct ceph_crypto_key {
struct ceph_timespec created;
int len;
void *key;
- struct crypto_skcipher *tfm;
+ struct crypto_sync_skcipher *tfm;
};
int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
const struct ceph_crypto_key *src);
-int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end);
int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end);
int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
void ceph_crypto_key_destroy(struct ceph_crypto_key *key);
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 02952605d121..83c270bce63c 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -36,8 +36,9 @@ static int monmap_show(struct seq_file *s, void *p)
int i;
struct ceph_client *client = s->private;
+ mutex_lock(&client->monc.mutex);
if (client->monc.monmap == NULL)
- return 0;
+ goto out_unlock;
seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
for (i = 0; i < client->monc.monmap->num_mon; i++) {
@@ -46,8 +47,11 @@ static int monmap_show(struct seq_file *s, void *p)
seq_printf(s, "\t%s%lld\t%s\n",
ENTITY_NAME(inst->name),
- ceph_pr_addr(&inst->addr.in_addr));
+ ceph_pr_addr(&inst->addr));
}
+
+out_unlock:
+ mutex_unlock(&client->monc.mutex);
return 0;
}
@@ -56,13 +60,14 @@ static int osdmap_show(struct seq_file *s, void *p)
int i;
struct ceph_client *client = s->private;
struct ceph_osd_client *osdc = &client->osdc;
- struct ceph_osdmap *map = osdc->osdmap;
+ struct ceph_osdmap *map;
struct rb_node *n;
+ down_read(&osdc->lock);
+ map = osdc->osdmap;
if (map == NULL)
- return 0;
+ goto out_unlock;
- down_read(&osdc->lock);
seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch,
osdc->epoch_barrier, map->flags);
@@ -81,11 +86,13 @@ static int osdmap_show(struct seq_file *s, void *p)
u32 state = map->osd_state[i];
char sb[64];
- seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",
- i, ceph_pr_addr(&addr->in_addr),
+ seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\t%2d\n",
+ i, ceph_pr_addr(addr),
((map->osd_weight[i]*100) >> 16),
ceph_osdmap_state_str(sb, sizeof(sb), state),
- ((ceph_get_primary_affinity(map, i)*100) >> 16));
+ ((ceph_get_primary_affinity(map, i)*100) >> 16),
+ ceph_get_crush_locality(map, i,
+ &client->options->crush_locs));
}
for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) {
struct ceph_pg_mapping *pg =
@@ -129,6 +136,7 @@ static int osdmap_show(struct seq_file *s, void *p)
seq_printf(s, "]\n");
}
+out_unlock:
up_read(&osdc->lock);
return 0;
}
@@ -221,6 +229,9 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req)
if (op->op == CEPH_OSD_OP_WATCH)
seq_printf(s, "-%s",
ceph_osd_watch_op_name(op->watch.op));
+ else if (op->op == CEPH_OSD_OP_CALL)
+ seq_printf(s, "-%s/%s", op->cls.class_name,
+ op->cls.method_name);
}
seq_putc(s, '\n');
@@ -375,7 +386,7 @@ static int client_options_show(struct seq_file *s, void *p)
struct ceph_client *client = s->private;
int ret;
- ret = ceph_print_client_options(s, client);
+ ret = ceph_print_client_options(s, client, true);
if (ret)
return ret;
@@ -383,18 +394,15 @@ static int client_options_show(struct seq_file *s, void *p)
return 0;
}
-CEPH_DEFINE_SHOW_FUNC(monmap_show)
-CEPH_DEFINE_SHOW_FUNC(osdmap_show)
-CEPH_DEFINE_SHOW_FUNC(monc_show)
-CEPH_DEFINE_SHOW_FUNC(osdc_show)
-CEPH_DEFINE_SHOW_FUNC(client_options_show)
+DEFINE_SHOW_ATTRIBUTE(monmap);
+DEFINE_SHOW_ATTRIBUTE(osdmap);
+DEFINE_SHOW_ATTRIBUTE(monc);
+DEFINE_SHOW_ATTRIBUTE(osdc);
+DEFINE_SHOW_ATTRIBUTE(client_options);
-int __init ceph_debugfs_init(void)
+void __init ceph_debugfs_init(void)
{
ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
- if (!ceph_debugfs_dir)
- return -ENOMEM;
- return 0;
}
void ceph_debugfs_cleanup(void)
@@ -402,9 +410,8 @@ void ceph_debugfs_cleanup(void)
debugfs_remove(ceph_debugfs_dir);
}
-int ceph_debugfs_client_init(struct ceph_client *client)
+void ceph_debugfs_client_init(struct ceph_client *client)
{
- int ret = -ENOMEM;
char name[80];
snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
@@ -412,56 +419,37 @@ int ceph_debugfs_client_init(struct ceph_client *client)
dout("ceph_debugfs_client_init %p %s\n", client, name);
- BUG_ON(client->debugfs_dir);
client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
- if (!client->debugfs_dir)
- goto out;
client->monc.debugfs_file = debugfs_create_file("monc",
0400,
client->debugfs_dir,
client,
- &monc_show_fops);
- if (!client->monc.debugfs_file)
- goto out;
+ &monc_fops);
client->osdc.debugfs_file = debugfs_create_file("osdc",
0400,
client->debugfs_dir,
client,
- &osdc_show_fops);
- if (!client->osdc.debugfs_file)
- goto out;
+ &osdc_fops);
client->debugfs_monmap = debugfs_create_file("monmap",
0400,
client->debugfs_dir,
client,
- &monmap_show_fops);
- if (!client->debugfs_monmap)
- goto out;
+ &monmap_fops);
client->debugfs_osdmap = debugfs_create_file("osdmap",
0400,
client->debugfs_dir,
client,
- &osdmap_show_fops);
- if (!client->debugfs_osdmap)
- goto out;
+ &osdmap_fops);
client->debugfs_options = debugfs_create_file("client_options",
0400,
client->debugfs_dir,
client,
- &client_options_show_fops);
- if (!client->debugfs_options)
- goto out;
-
- return 0;
-
-out:
- ceph_debugfs_client_cleanup(client);
- return ret;
+ &client_options_fops);
}
void ceph_debugfs_client_cleanup(struct ceph_client *client)
@@ -477,18 +465,16 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
#else /* CONFIG_DEBUG_FS */
-int __init ceph_debugfs_init(void)
+void __init ceph_debugfs_init(void)
{
- return 0;
}
void ceph_debugfs_cleanup(void)
{
}
-int ceph_debugfs_client_init(struct ceph_client *client)
+void ceph_debugfs_client_init(struct ceph_client *client)
{
- return 0;
}
void ceph_debugfs_client_cleanup(struct ceph_client *client)
diff --git a/net/ceph/decode.c b/net/ceph/decode.c
new file mode 100644
index 000000000000..bc109a1a4616
--- /dev/null
+++ b/net/ceph/decode.c
@@ -0,0 +1,193 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/inet.h>
+
+#include <linux/ceph/decode.h>
+#include <linux/ceph/messenger.h> /* for ceph_pr_addr() */
+
+static int
+ceph_decode_entity_addr_versioned(void **p, void *end,
+ struct ceph_entity_addr *addr)
+{
+ int ret;
+ u8 struct_v;
+ u32 struct_len, addr_len;
+ void *struct_end;
+
+ ret = ceph_start_decoding(p, end, 1, "entity_addr_t", &struct_v,
+ &struct_len);
+ if (ret)
+ goto bad;
+
+ ret = -EINVAL;
+ struct_end = *p + struct_len;
+
+ ceph_decode_copy_safe(p, end, &addr->type, sizeof(addr->type), bad);
+
+ ceph_decode_copy_safe(p, end, &addr->nonce, sizeof(addr->nonce), bad);
+
+ ceph_decode_32_safe(p, end, addr_len, bad);
+ if (addr_len > sizeof(addr->in_addr))
+ goto bad;
+
+ memset(&addr->in_addr, 0, sizeof(addr->in_addr));
+ if (addr_len) {
+ ceph_decode_copy_safe(p, end, &addr->in_addr, addr_len, bad);
+
+ addr->in_addr.ss_family =
+ le16_to_cpu((__force __le16)addr->in_addr.ss_family);
+ }
+
+ /* Advance past anything the client doesn't yet understand */
+ *p = struct_end;
+ ret = 0;
+bad:
+ return ret;
+}
+
+static int
+ceph_decode_entity_addr_legacy(void **p, void *end,
+ struct ceph_entity_addr *addr)
+{
+ int ret = -EINVAL;
+
+ /* Skip rest of type field */
+ ceph_decode_skip_n(p, end, 3, bad);
+
+ /*
+ * Clients that don't support ADDR2 always send TYPE_NONE, change it
+ * to TYPE_LEGACY for forward compatibility.
+ */
+ addr->type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
+ ceph_decode_copy_safe(p, end, &addr->nonce, sizeof(addr->nonce), bad);
+ memset(&addr->in_addr, 0, sizeof(addr->in_addr));
+ ceph_decode_copy_safe(p, end, &addr->in_addr,
+ sizeof(addr->in_addr), bad);
+ addr->in_addr.ss_family =
+ be16_to_cpu((__force __be16)addr->in_addr.ss_family);
+ ret = 0;
+bad:
+ return ret;
+}
+
+int
+ceph_decode_entity_addr(void **p, void *end, struct ceph_entity_addr *addr)
+{
+ u8 marker;
+
+ ceph_decode_8_safe(p, end, marker, bad);
+ if (marker == 1)
+ return ceph_decode_entity_addr_versioned(p, end, addr);
+ else if (marker == 0)
+ return ceph_decode_entity_addr_legacy(p, end, addr);
+bad:
+ return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_decode_entity_addr);
+
+/*
+ * Return addr of desired type (MSGR2 or LEGACY) or error.
+ * Make sure there is only one match.
+ *
+ * Assume encoding with MSG_ADDR2.
+ */
+int ceph_decode_entity_addrvec(void **p, void *end, bool msgr2,
+ struct ceph_entity_addr *addr)
+{
+ __le32 my_type = msgr2 ? CEPH_ENTITY_ADDR_TYPE_MSGR2 :
+ CEPH_ENTITY_ADDR_TYPE_LEGACY;
+ struct ceph_entity_addr tmp_addr;
+ int addr_cnt;
+ bool found;
+ u8 marker;
+ int ret;
+ int i;
+
+ ceph_decode_8_safe(p, end, marker, e_inval);
+ if (marker != 2) {
+ pr_err("bad addrvec marker %d\n", marker);
+ return -EINVAL;
+ }
+
+ ceph_decode_32_safe(p, end, addr_cnt, e_inval);
+ dout("%s addr_cnt %d\n", __func__, addr_cnt);
+
+ found = false;
+ for (i = 0; i < addr_cnt; i++) {
+ ret = ceph_decode_entity_addr(p, end, &tmp_addr);
+ if (ret)
+ return ret;
+
+ dout("%s i %d addr %s\n", __func__, i, ceph_pr_addr(&tmp_addr));
+ if (tmp_addr.type == my_type) {
+ if (found) {
+ pr_err("another match of type %d in addrvec\n",
+ le32_to_cpu(my_type));
+ return -EINVAL;
+ }
+
+ memcpy(addr, &tmp_addr, sizeof(*addr));
+ found = true;
+ }
+ }
+
+ if (found)
+ return 0;
+
+ if (!addr_cnt)
+ return 0; /* normal -- e.g. unused OSD id/slot */
+
+ if (addr_cnt == 1 && !memchr_inv(&tmp_addr, 0, sizeof(tmp_addr)))
+ return 0; /* weird but effectively the same as !addr_cnt */
+
+ pr_err("no match of type %d in addrvec\n", le32_to_cpu(my_type));
+ return -ENOENT;
+
+e_inval:
+ return -EINVAL;
+}
+EXPORT_SYMBOL(ceph_decode_entity_addrvec);
+
+static int get_sockaddr_encoding_len(sa_family_t family)
+{
+ union {
+ struct sockaddr sa;
+ struct sockaddr_in sin;
+ struct sockaddr_in6 sin6;
+ } u;
+
+ switch (family) {
+ case AF_INET:
+ return sizeof(u.sin);
+ case AF_INET6:
+ return sizeof(u.sin6);
+ default:
+ return sizeof(u);
+ }
+}
+
+int ceph_entity_addr_encoding_len(const struct ceph_entity_addr *addr)
+{
+ sa_family_t family = get_unaligned(&addr->in_addr.ss_family);
+ int addr_len = get_sockaddr_encoding_len(family);
+
+ return 1 + CEPH_ENCODING_START_BLK_LEN + 4 + 4 + 4 + addr_len;
+}
+
+void ceph_encode_entity_addr(void **p, const struct ceph_entity_addr *addr)
+{
+ sa_family_t family = get_unaligned(&addr->in_addr.ss_family);
+ int addr_len = get_sockaddr_encoding_len(family);
+
+ ceph_encode_8(p, 1); /* marker */
+ ceph_start_encoding(p, 1, 1, sizeof(addr->type) +
+ sizeof(addr->nonce) +
+ sizeof(u32) + addr_len);
+ ceph_encode_copy(p, &addr->type, sizeof(addr->type));
+ ceph_encode_copy(p, &addr->nonce, sizeof(addr->nonce));
+
+ ceph_encode_32(p, addr_len);
+ ceph_encode_16(p, family);
+ ceph_encode_copy(p, addr->in_addr.__data, addr_len - sizeof(family));
+}
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 0a187196aeed..70b25f4ecba6 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -17,6 +17,7 @@
#endif /* CONFIG_BLOCK */
#include <linux/dns_resolver.h>
#include <net/tcp.h>
+#include <trace/events/sock.h>
#include <linux/ceph/ceph_features.h>
#include <linux/ceph/libceph.h>
@@ -82,71 +83,51 @@
#define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */
#define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */
-/*
- * connection states
- */
-#define CON_STATE_CLOSED 1 /* -> PREOPEN */
-#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */
-#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */
-#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */
-#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */
-#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */
-
-/*
- * ceph_connection flag bits
- */
-#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop
- * messages on errors */
-#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */
-#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */
-#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */
-#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */
-
static bool con_flag_valid(unsigned long con_flag)
{
switch (con_flag) {
- case CON_FLAG_LOSSYTX:
- case CON_FLAG_KEEPALIVE_PENDING:
- case CON_FLAG_WRITE_PENDING:
- case CON_FLAG_SOCK_CLOSED:
- case CON_FLAG_BACKOFF:
+ case CEPH_CON_F_LOSSYTX:
+ case CEPH_CON_F_KEEPALIVE_PENDING:
+ case CEPH_CON_F_WRITE_PENDING:
+ case CEPH_CON_F_SOCK_CLOSED:
+ case CEPH_CON_F_BACKOFF:
return true;
default:
return false;
}
}
-static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag)
+void ceph_con_flag_clear(struct ceph_connection *con, unsigned long con_flag)
{
BUG_ON(!con_flag_valid(con_flag));
clear_bit(con_flag, &con->flags);
}
-static void con_flag_set(struct ceph_connection *con, unsigned long con_flag)
+void ceph_con_flag_set(struct ceph_connection *con, unsigned long con_flag)
{
BUG_ON(!con_flag_valid(con_flag));
set_bit(con_flag, &con->flags);
}
-static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag)
+bool ceph_con_flag_test(struct ceph_connection *con, unsigned long con_flag)
{
BUG_ON(!con_flag_valid(con_flag));
return test_bit(con_flag, &con->flags);
}
-static bool con_flag_test_and_clear(struct ceph_connection *con,
- unsigned long con_flag)
+bool ceph_con_flag_test_and_clear(struct ceph_connection *con,
+ unsigned long con_flag)
{
BUG_ON(!con_flag_valid(con_flag));
return test_and_clear_bit(con_flag, &con->flags);
}
-static bool con_flag_test_and_set(struct ceph_connection *con,
- unsigned long con_flag)
+bool ceph_con_flag_test_and_set(struct ceph_connection *con,
+ unsigned long con_flag)
{
BUG_ON(!con_flag_valid(con_flag));
@@ -156,13 +137,6 @@ static bool con_flag_test_and_set(struct ceph_connection *con,
/* Slab caches for frequently-allocated structures */
static struct kmem_cache *ceph_msg_cache;
-static struct kmem_cache *ceph_msg_data_cache;
-
-/* static tag bytes (protocol control messages) */
-static char tag_msg = CEPH_MSGR_TAG_MSG;
-static char tag_ack = CEPH_MSGR_TAG_ACK;
-static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
-static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
#ifdef CONFIG_LOCKDEP
static struct lock_class_key socket_class;
@@ -185,42 +159,48 @@ static void con_fault(struct ceph_connection *con);
static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN];
static atomic_t addr_str_seq = ATOMIC_INIT(0);
-static struct page *zero_page; /* used in certain error cases */
+struct page *ceph_zero_page; /* used in certain error cases */
-const char *ceph_pr_addr(const struct sockaddr_storage *ss)
+const char *ceph_pr_addr(const struct ceph_entity_addr *addr)
{
int i;
char *s;
- struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
- struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
+ struct sockaddr_storage ss = addr->in_addr; /* align */
+ struct sockaddr_in *in4 = (struct sockaddr_in *)&ss;
+ struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)&ss;
i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK;
s = addr_str[i];
- switch (ss->ss_family) {
+ switch (ss.ss_family) {
case AF_INET:
- snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr,
+ snprintf(s, MAX_ADDR_STR_LEN, "(%d)%pI4:%hu",
+ le32_to_cpu(addr->type), &in4->sin_addr,
ntohs(in4->sin_port));
break;
case AF_INET6:
- snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr,
+ snprintf(s, MAX_ADDR_STR_LEN, "(%d)[%pI6c]:%hu",
+ le32_to_cpu(addr->type), &in6->sin6_addr,
ntohs(in6->sin6_port));
break;
default:
snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)",
- ss->ss_family);
+ ss.ss_family);
}
return s;
}
EXPORT_SYMBOL(ceph_pr_addr);
-static void encode_my_addr(struct ceph_messenger *msgr)
+void ceph_encode_my_addr(struct ceph_messenger *msgr)
{
- memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
- ceph_encode_addr(&msgr->my_enc_addr);
+ if (!ceph_msgr2(from_msgr(msgr))) {
+ memcpy(&msgr->my_enc_addr, &msgr->inst.addr,
+ sizeof(msgr->my_enc_addr));
+ ceph_encode_banner_addr(&msgr->my_enc_addr);
+ }
}
/*
@@ -235,23 +215,11 @@ static int ceph_msgr_slab_init(void)
if (!ceph_msg_cache)
return -ENOMEM;
- BUG_ON(ceph_msg_data_cache);
- ceph_msg_data_cache = KMEM_CACHE(ceph_msg_data, 0);
- if (ceph_msg_data_cache)
- return 0;
-
- kmem_cache_destroy(ceph_msg_cache);
- ceph_msg_cache = NULL;
-
- return -ENOMEM;
+ return 0;
}
static void ceph_msgr_slab_exit(void)
{
- BUG_ON(!ceph_msg_data_cache);
- kmem_cache_destroy(ceph_msg_data_cache);
- ceph_msg_data_cache = NULL;
-
BUG_ON(!ceph_msg_cache);
kmem_cache_destroy(ceph_msg_cache);
ceph_msg_cache = NULL;
@@ -264,9 +232,9 @@ static void _ceph_msgr_exit(void)
ceph_msgr_wq = NULL;
}
- BUG_ON(zero_page == NULL);
- put_page(zero_page);
- zero_page = NULL;
+ BUG_ON(!ceph_zero_page);
+ put_page(ceph_zero_page);
+ ceph_zero_page = NULL;
ceph_msgr_slab_exit();
}
@@ -276,15 +244,16 @@ int __init ceph_msgr_init(void)
if (ceph_msgr_slab_init())
return -ENOMEM;
- BUG_ON(zero_page != NULL);
- zero_page = ZERO_PAGE(0);
- get_page(zero_page);
+ BUG_ON(ceph_zero_page);
+ ceph_zero_page = ZERO_PAGE(0);
+ get_page(ceph_zero_page);
/*
* The number of active work items is limited by the number of
* connections, so leave @max_active at default.
*/
- ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_MEM_RECLAIM, 0);
+ ceph_msgr_wq = alloc_workqueue("ceph-msgr",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (ceph_msgr_wq)
return 0;
@@ -377,12 +346,15 @@ static void con_sock_state_closed(struct ceph_connection *con)
static void ceph_sock_data_ready(struct sock *sk)
{
struct ceph_connection *con = sk->sk_user_data;
+
+ trace_sk_data_ready(sk);
+
if (atomic_read(&con->msgr->stopping)) {
return;
}
if (sk->sk_state != TCP_CLOSE_WAIT) {
- dout("%s on %p state = %lu, queueing work\n", __func__,
+ dout("%s %p state = %d, queueing work\n", __func__,
con, con->state);
queue_con(con);
}
@@ -400,7 +372,7 @@ static void ceph_sock_write_space(struct sock *sk)
* buffer. See net/ipv4/tcp_input.c:tcp_check_space()
* and net/core/stream.c:sk_stream_write_space().
*/
- if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) {
+ if (ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)) {
if (sk_stream_is_writeable(sk)) {
dout("%s %p queueing write work\n", __func__, con);
clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
@@ -416,17 +388,17 @@ static void ceph_sock_state_change(struct sock *sk)
{
struct ceph_connection *con = sk->sk_user_data;
- dout("%s %p state = %lu sk_state = %u\n", __func__,
+ dout("%s %p state = %d sk_state = %u\n", __func__,
con, con->state, sk->sk_state);
switch (sk->sk_state) {
case TCP_CLOSE:
dout("%s TCP_CLOSE\n", __func__);
- /* fall through */
+ fallthrough;
case TCP_CLOSE_WAIT:
dout("%s TCP_CLOSE_WAIT\n", __func__);
con_sock_state_closing(con);
- con_flag_set(con, CON_FLAG_SOCK_CLOSED);
+ ceph_con_flag_set(con, CEPH_CON_F_SOCK_CLOSED);
queue_con(con);
break;
case TCP_ESTABLISHED:
@@ -460,23 +432,26 @@ static void set_sock_callbacks(struct socket *sock,
/*
* initiate connection to a remote socket.
*/
-static int ceph_tcp_connect(struct ceph_connection *con)
+int ceph_tcp_connect(struct ceph_connection *con)
{
- struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
+ struct sockaddr_storage ss = con->peer_addr.in_addr; /* align */
struct socket *sock;
unsigned int noio_flag;
int ret;
+ dout("%s con %p peer_addr %s\n", __func__, con,
+ ceph_pr_addr(&con->peer_addr));
BUG_ON(con->sock);
/* sock_create_kern() allocates with GFP_KERNEL */
noio_flag = memalloc_noio_save();
- ret = sock_create_kern(read_pnet(&con->msgr->net), paddr->ss_family,
+ ret = sock_create_kern(read_pnet(&con->msgr->net), ss.ss_family,
SOCK_STREAM, IPPROTO_TCP, &sock);
memalloc_noio_restore(noio_flag);
if (ret)
return ret;
sock->sk->sk_allocation = GFP_NOFS;
+ sock->sk->sk_use_task_frag = false;
#ifdef CONFIG_LOCKDEP
lockdep_set_class(&sock->sk->sk_lock, &socket_class);
@@ -484,145 +459,35 @@ static int ceph_tcp_connect(struct ceph_connection *con)
set_sock_callbacks(sock, con);
- dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
-
con_sock_state_connecting(con);
- ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
- O_NONBLOCK);
+ ret = kernel_connect(sock, (struct sockaddr_unsized *)&ss, sizeof(ss),
+ O_NONBLOCK);
if (ret == -EINPROGRESS) {
dout("connect %s EINPROGRESS sk_state = %u\n",
- ceph_pr_addr(&con->peer_addr.in_addr),
+ ceph_pr_addr(&con->peer_addr),
sock->sk->sk_state);
} else if (ret < 0) {
pr_err("connect %s error %d\n",
- ceph_pr_addr(&con->peer_addr.in_addr), ret);
+ ceph_pr_addr(&con->peer_addr), ret);
sock_release(sock);
return ret;
}
- if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) {
- int optval = 1;
-
- ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
- (char *)&optval, sizeof(optval));
- if (ret)
- pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d",
- ret);
- }
+ if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY))
+ tcp_sock_set_nodelay(sock->sk);
con->sock = sock;
return 0;
}
/*
- * If @buf is NULL, discard up to @len bytes.
- */
-static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
-{
- struct kvec iov = {buf, len};
- struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
- int r;
-
- if (!buf)
- msg.msg_flags |= MSG_TRUNC;
-
- iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &iov, 1, len);
- r = sock_recvmsg(sock, &msg, msg.msg_flags);
- if (r == -EAGAIN)
- r = 0;
- return r;
-}
-
-static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
- int page_offset, size_t length)
-{
- struct bio_vec bvec = {
- .bv_page = page,
- .bv_offset = page_offset,
- .bv_len = length
- };
- struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
- int r;
-
- BUG_ON(page_offset + length > PAGE_SIZE);
- iov_iter_bvec(&msg.msg_iter, READ | ITER_BVEC, &bvec, 1, length);
- r = sock_recvmsg(sock, &msg, msg.msg_flags);
- if (r == -EAGAIN)
- r = 0;
- return r;
-}
-
-/*
- * write something. @more is true if caller will be sending more data
- * shortly.
- */
-static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
- size_t kvlen, size_t len, int more)
-{
- struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
- int r;
-
- if (more)
- msg.msg_flags |= MSG_MORE;
- else
- msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
-
- r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
- if (r == -EAGAIN)
- r = 0;
- return r;
-}
-
-static int __ceph_tcp_sendpage(struct socket *sock, struct page *page,
- int offset, size_t size, bool more)
-{
- int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR);
- int ret;
-
- ret = kernel_sendpage(sock, page, offset, size, flags);
- if (ret == -EAGAIN)
- ret = 0;
-
- return ret;
-}
-
-static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
- int offset, size_t size, bool more)
-{
- struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
- struct bio_vec bvec;
- int ret;
-
- /* sendpage cannot properly handle pages with page_count == 0,
- * we need to fallback to sendmsg if that's the case */
- if (page_count(page) >= 1)
- return __ceph_tcp_sendpage(sock, page, offset, size, more);
-
- bvec.bv_page = page;
- bvec.bv_offset = offset;
- bvec.bv_len = size;
-
- if (more)
- msg.msg_flags |= MSG_MORE;
- else
- msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
-
- iov_iter_bvec(&msg.msg_iter, WRITE | ITER_BVEC, &bvec, 1, size);
- ret = sock_sendmsg(sock, &msg);
- if (ret == -EAGAIN)
- ret = 0;
-
- return ret;
-}
-
-/*
* Shutdown/close the socket for the given connection.
*/
-static int con_close_socket(struct ceph_connection *con)
+int ceph_con_close_socket(struct ceph_connection *con)
{
int rc = 0;
- dout("con_close_socket on %p sock %p\n", con, con->sock);
+ dout("%s con %p sock %p\n", __func__, con, con->sock);
if (con->sock) {
rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
sock_release(con->sock);
@@ -635,12 +500,38 @@ static int con_close_socket(struct ceph_connection *con)
* received a socket close event before we had the chance to
* shut the socket down.
*/
- con_flag_clear(con, CON_FLAG_SOCK_CLOSED);
+ ceph_con_flag_clear(con, CEPH_CON_F_SOCK_CLOSED);
con_sock_state_closed(con);
return rc;
}
+static void ceph_con_reset_protocol(struct ceph_connection *con)
+{
+ dout("%s con %p\n", __func__, con);
+
+ ceph_con_close_socket(con);
+ if (con->in_msg) {
+ WARN_ON(con->in_msg->con != con);
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ }
+ if (con->out_msg) {
+ WARN_ON(con->out_msg->con != con);
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL;
+ }
+ if (con->bounce_page) {
+ __free_page(con->bounce_page);
+ con->bounce_page = NULL;
+ }
+
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_reset_protocol(con);
+ else
+ ceph_con_v1_reset_protocol(con);
+}
+
/*
* Reset a connection. Discard all incoming and outgoing messages
* and clear *_seq state.
@@ -651,6 +542,7 @@ static void ceph_msg_remove(struct ceph_msg *msg)
ceph_msg_put(msg);
}
+
static void ceph_msg_remove_list(struct list_head *head)
{
while (!list_empty(head)) {
@@ -660,31 +552,22 @@ static void ceph_msg_remove_list(struct list_head *head)
}
}
-static void reset_connection(struct ceph_connection *con)
+void ceph_con_reset_session(struct ceph_connection *con)
{
- /* reset connection, out_queue, msg_ and connect_seq */
- /* discard existing out_queue and msg_seq */
- dout("reset_connection %p\n", con);
+ dout("%s con %p\n", __func__, con);
+
+ WARN_ON(con->in_msg);
+ WARN_ON(con->out_msg);
ceph_msg_remove_list(&con->out_queue);
ceph_msg_remove_list(&con->out_sent);
-
- if (con->in_msg) {
- BUG_ON(con->in_msg->con != con);
- ceph_msg_put(con->in_msg);
- con->in_msg = NULL;
- }
-
- con->connect_seq = 0;
con->out_seq = 0;
- if (con->out_msg) {
- BUG_ON(con->out_msg->con != con);
- ceph_msg_put(con->out_msg);
- con->out_msg = NULL;
- }
con->in_seq = 0;
con->in_seq_acked = 0;
- con->out_skip = 0;
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_reset_session(con);
+ else
+ ceph_con_v1_reset_session(con);
}
/*
@@ -693,19 +576,18 @@ static void reset_connection(struct ceph_connection *con)
void ceph_con_close(struct ceph_connection *con)
{
mutex_lock(&con->mutex);
- dout("con_close %p peer %s\n", con,
- ceph_pr_addr(&con->peer_addr.in_addr));
- con->state = CON_STATE_CLOSED;
+ dout("con_close %p peer %s\n", con, ceph_pr_addr(&con->peer_addr));
+ con->state = CEPH_CON_S_CLOSED;
- con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */
- con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING);
- con_flag_clear(con, CON_FLAG_WRITE_PENDING);
- con_flag_clear(con, CON_FLAG_BACKOFF);
+ ceph_con_flag_clear(con, CEPH_CON_F_LOSSYTX); /* so we retry next
+ connect */
+ ceph_con_flag_clear(con, CEPH_CON_F_KEEPALIVE_PENDING);
+ ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+ ceph_con_flag_clear(con, CEPH_CON_F_BACKOFF);
- reset_connection(con);
- con->peer_global_seq = 0;
+ ceph_con_reset_protocol(con);
+ ceph_con_reset_session(con);
cancel_con(con);
- con_close_socket(con);
mutex_unlock(&con->mutex);
}
EXPORT_SYMBOL(ceph_con_close);
@@ -718,10 +600,10 @@ void ceph_con_open(struct ceph_connection *con,
struct ceph_entity_addr *addr)
{
mutex_lock(&con->mutex);
- dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
+ dout("con_open %p %s\n", con, ceph_pr_addr(addr));
- WARN_ON(con->state != CON_STATE_CLOSED);
- con->state = CON_STATE_PREOPEN;
+ WARN_ON(con->state != CEPH_CON_S_CLOSED);
+ con->state = CEPH_CON_S_PREOPEN;
con->peer_name.type = (__u8) entity_type;
con->peer_name.num = cpu_to_le64(entity_num);
@@ -738,7 +620,10 @@ EXPORT_SYMBOL(ceph_con_open);
*/
bool ceph_con_opened(struct ceph_connection *con)
{
- return con->connect_seq > 0;
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ return ceph_con_v2_opened(con);
+
+ return ceph_con_v1_opened(con);
}
/*
@@ -761,16 +646,15 @@ void ceph_con_init(struct ceph_connection *con, void *private,
INIT_LIST_HEAD(&con->out_sent);
INIT_DELAYED_WORK(&con->work, ceph_con_workfn);
- con->state = CON_STATE_CLOSED;
+ con->state = CEPH_CON_S_CLOSED;
}
EXPORT_SYMBOL(ceph_con_init);
-
/*
* We maintain a global counter to order connection attempts. Get
* a unique seq greater than @gt.
*/
-static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
+u32 ceph_get_global_seq(struct ceph_messenger *msgr, u32 gt)
{
u32 ret;
@@ -782,48 +666,53 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
return ret;
}
-static void con_out_kvec_reset(struct ceph_connection *con)
-{
- BUG_ON(con->out_skip);
-
- con->out_kvec_left = 0;
- con->out_kvec_bytes = 0;
- con->out_kvec_cur = &con->out_kvec[0];
-}
-
-static void con_out_kvec_add(struct ceph_connection *con,
- size_t size, void *data)
+/*
+ * Discard messages that have been acked by the server.
+ */
+void ceph_con_discard_sent(struct ceph_connection *con, u64 ack_seq)
{
- int index = con->out_kvec_left;
+ struct ceph_msg *msg;
+ u64 seq;
- BUG_ON(con->out_skip);
- BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
+ dout("%s con %p ack_seq %llu\n", __func__, con, ack_seq);
+ while (!list_empty(&con->out_sent)) {
+ msg = list_first_entry(&con->out_sent, struct ceph_msg,
+ list_head);
+ WARN_ON(msg->needs_out_seq);
+ seq = le64_to_cpu(msg->hdr.seq);
+ if (seq > ack_seq)
+ break;
- con->out_kvec[index].iov_len = size;
- con->out_kvec[index].iov_base = data;
- con->out_kvec_left++;
- con->out_kvec_bytes += size;
+ dout("%s con %p discarding msg %p seq %llu\n", __func__, con,
+ msg, seq);
+ ceph_msg_remove(msg);
+ }
}
/*
- * Chop off a kvec from the end. Return residual number of bytes for
- * that kvec, i.e. how many bytes would have been written if the kvec
- * hadn't been nuked.
+ * Discard messages that have been requeued in con_fault(), up to
+ * reconnect_seq. This avoids gratuitously resending messages that
+ * the server had received and handled prior to reconnect.
*/
-static int con_out_kvec_skip(struct ceph_connection *con)
+void ceph_con_discard_requeued(struct ceph_connection *con, u64 reconnect_seq)
{
- int off = con->out_kvec_cur - con->out_kvec;
- int skip = 0;
+ struct ceph_msg *msg;
+ u64 seq;
- if (con->out_kvec_bytes > 0) {
- skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len;
- BUG_ON(con->out_kvec_bytes < skip);
- BUG_ON(!con->out_kvec_left);
- con->out_kvec_bytes -= skip;
- con->out_kvec_left--;
- }
+ dout("%s con %p reconnect_seq %llu\n", __func__, con, reconnect_seq);
+ while (!list_empty(&con->out_queue)) {
+ msg = list_first_entry(&con->out_queue, struct ceph_msg,
+ list_head);
+ if (msg->needs_out_seq)
+ break;
+ seq = le64_to_cpu(msg->hdr.seq);
+ if (seq > reconnect_seq)
+ break;
- return skip;
+ dout("%s con %p discarding msg %p seq %llu\n", __func__, con,
+ msg, seq);
+ ceph_msg_remove(msg);
+ }
}
#ifdef CONFIG_BLOCK
@@ -845,7 +734,6 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
it->iter.bi_size = cursor->resid;
BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
- cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
}
static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
@@ -864,18 +752,18 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
size_t bytes)
{
struct ceph_bio_iter *it = &cursor->bio_iter;
+ struct page *page = bio_iter_page(it->bio, it->iter);
BUG_ON(bytes > cursor->resid);
BUG_ON(bytes > bio_iter_len(it->bio, it->iter));
cursor->resid -= bytes;
bio_advance_iter(it->bio, &it->iter, bytes);
- if (!cursor->resid) {
- BUG_ON(!cursor->last_piece);
+ if (!cursor->resid)
return false; /* no more data */
- }
- if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done))
+ if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done &&
+ page == bio_iter_page(it->bio, it->iter)))
return false; /* more bytes to process in this segment */
if (!it->iter.bi_size) {
@@ -885,9 +773,7 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
it->iter.bi_size = cursor->resid;
}
- BUG_ON(cursor->last_piece);
BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter));
- cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter);
return true;
}
#endif /* CONFIG_BLOCK */
@@ -903,8 +789,6 @@ static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor,
cursor->bvec_iter.bi_size = cursor->resid;
BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter));
- cursor->last_piece =
- cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter);
}
static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor,
@@ -923,24 +807,21 @@ static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor,
size_t bytes)
{
struct bio_vec *bvecs = cursor->data->bvec_pos.bvecs;
+ struct page *page = bvec_iter_page(bvecs, cursor->bvec_iter);
BUG_ON(bytes > cursor->resid);
BUG_ON(bytes > bvec_iter_len(bvecs, cursor->bvec_iter));
cursor->resid -= bytes;
bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes);
- if (!cursor->resid) {
- BUG_ON(!cursor->last_piece);
+ if (!cursor->resid)
return false; /* no more data */
- }
- if (!bytes || cursor->bvec_iter.bi_bvec_done)
+ if (!bytes || (cursor->bvec_iter.bi_bvec_done &&
+ page == bvec_iter_page(bvecs, cursor->bvec_iter)))
return false; /* more bytes to process in this segment */
- BUG_ON(cursor->last_piece);
BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter));
- cursor->last_piece =
- cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter);
return true;
}
@@ -966,7 +847,6 @@ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor,
BUG_ON(page_count > (int)USHRT_MAX);
cursor->page_count = (unsigned short)page_count;
BUG_ON(length > SIZE_MAX - cursor->page_offset);
- cursor->last_piece = cursor->page_offset + cursor->resid <= PAGE_SIZE;
}
static struct page *
@@ -981,11 +861,7 @@ ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor,
BUG_ON(cursor->page_offset >= PAGE_SIZE);
*page_offset = cursor->page_offset;
- if (cursor->last_piece)
- *length = cursor->resid;
- else
- *length = PAGE_SIZE - *page_offset;
-
+ *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset);
return data->pages[cursor->page_index];
}
@@ -1010,8 +886,6 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor,
BUG_ON(cursor->page_index >= cursor->page_count);
cursor->page_index++;
- cursor->last_piece = cursor->resid <= PAGE_SIZE;
-
return true;
}
@@ -1041,7 +915,6 @@ ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor,
cursor->resid = min(length, pagelist->length);
cursor->page = page;
cursor->offset = 0;
- cursor->last_piece = cursor->resid <= PAGE_SIZE;
}
static struct page *
@@ -1061,11 +934,7 @@ ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor,
/* offset of first page in pagelist is always 0 */
*page_offset = cursor->offset & ~PAGE_MASK;
- if (cursor->last_piece)
- *length = cursor->resid;
- else
- *length = PAGE_SIZE - *page_offset;
-
+ *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset);
return cursor->page;
}
@@ -1098,11 +967,65 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
cursor->page = list_next_entry(cursor->page, lru);
- cursor->last_piece = cursor->resid <= PAGE_SIZE;
-
return true;
}
+static void ceph_msg_data_iter_cursor_init(struct ceph_msg_data_cursor *cursor,
+ size_t length)
+{
+ struct ceph_msg_data *data = cursor->data;
+
+ cursor->iov_iter = data->iter;
+ cursor->lastlen = 0;
+ iov_iter_truncate(&cursor->iov_iter, length);
+ cursor->resid = iov_iter_count(&cursor->iov_iter);
+}
+
+static struct page *ceph_msg_data_iter_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length)
+{
+ struct page *page;
+ ssize_t len;
+
+ if (cursor->lastlen)
+ iov_iter_revert(&cursor->iov_iter, cursor->lastlen);
+
+ len = iov_iter_get_pages2(&cursor->iov_iter, &page, PAGE_SIZE,
+ 1, page_offset);
+ BUG_ON(len < 0);
+
+ cursor->lastlen = len;
+
+ /*
+ * FIXME: The assumption is that the pages represented by the iov_iter
+ * are pinned, with the references held by the upper-level
+ * callers, or by virtue of being under writeback. Eventually,
+ * we'll get an iov_iter_get_pages2 variant that doesn't take
+ * page refs. Until then, just put the page ref.
+ */
+ VM_BUG_ON_PAGE(!PageWriteback(page) && page_count(page) < 2, page);
+ put_page(page);
+
+ *length = min_t(size_t, len, cursor->resid);
+ return page;
+}
+
+static bool ceph_msg_data_iter_advance(struct ceph_msg_data_cursor *cursor,
+ size_t bytes)
+{
+ BUG_ON(bytes > cursor->resid);
+ cursor->resid -= bytes;
+
+ if (bytes < cursor->lastlen) {
+ cursor->lastlen -= bytes;
+ } else {
+ iov_iter_advance(&cursor->iov_iter, bytes - cursor->lastlen);
+ cursor->lastlen = 0;
+ }
+
+ return cursor->resid;
+}
+
/*
* Message data is handled (sent or received) in pieces, where each
* piece resides on a single page. The network layer might not
@@ -1130,6 +1053,9 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
case CEPH_MSG_DATA_BVECS:
ceph_msg_data_bvecs_cursor_init(cursor, length);
break;
+ case CEPH_MSG_DATA_ITER:
+ ceph_msg_data_iter_cursor_init(cursor, length);
+ break;
case CEPH_MSG_DATA_NONE:
default:
/* BUG(); */
@@ -1138,19 +1064,16 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
cursor->need_crc = true;
}
-static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length)
+void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor,
+ struct ceph_msg *msg, size_t length)
{
- struct ceph_msg_data_cursor *cursor = &msg->cursor;
- struct ceph_msg_data *data;
-
BUG_ON(!length);
BUG_ON(length > msg->data_length);
- BUG_ON(list_empty(&msg->data));
+ BUG_ON(!msg->num_data_items);
- cursor->data_head = &msg->data;
cursor->total_resid = length;
- data = list_first_entry(&msg->data, struct ceph_msg_data, links);
- cursor->data = data;
+ cursor->data = msg->data;
+ cursor->sr_resid = 0;
__ceph_msg_data_cursor_init(cursor);
}
@@ -1160,9 +1083,8 @@ static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length)
* data item, and supply the page offset and length of that piece.
* Indicate whether this is the last piece in this data item.
*/
-static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
- size_t *page_offset, size_t *length,
- bool *last_piece)
+struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
+ size_t *page_offset, size_t *length)
{
struct page *page;
@@ -1181,6 +1103,9 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
case CEPH_MSG_DATA_BVECS:
page = ceph_msg_data_bvecs_next(cursor, page_offset, length);
break;
+ case CEPH_MSG_DATA_ITER:
+ page = ceph_msg_data_iter_next(cursor, page_offset, length);
+ break;
case CEPH_MSG_DATA_NONE:
default:
page = NULL;
@@ -1191,8 +1116,6 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
BUG_ON(*page_offset + *length > PAGE_SIZE);
BUG_ON(!*length);
BUG_ON(*length > cursor->resid);
- if (last_piece)
- *last_piece = cursor->last_piece;
return page;
}
@@ -1201,8 +1124,7 @@ static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
* Returns true if the result moves the cursor on to the next piece
* of the data item.
*/
-static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
- size_t bytes)
+void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes)
{
bool new_piece;
@@ -1222,6 +1144,9 @@ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
case CEPH_MSG_DATA_BVECS:
new_piece = ceph_msg_data_bvecs_advance(cursor, bytes);
break;
+ case CEPH_MSG_DATA_ITER:
+ new_piece = ceph_msg_data_iter_advance(cursor, bytes);
+ break;
case CEPH_MSG_DATA_NONE:
default:
BUG();
@@ -1230,340 +1155,15 @@ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
cursor->total_resid -= bytes;
if (!cursor->resid && cursor->total_resid) {
- WARN_ON(!cursor->last_piece);
- BUG_ON(list_is_last(&cursor->data->links, cursor->data_head));
- cursor->data = list_next_entry(cursor->data, links);
+ cursor->data++;
__ceph_msg_data_cursor_init(cursor);
new_piece = true;
}
cursor->need_crc = new_piece;
}
-static size_t sizeof_footer(struct ceph_connection *con)
-{
- return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
- sizeof(struct ceph_msg_footer) :
- sizeof(struct ceph_msg_footer_old);
-}
-
-static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
-{
- BUG_ON(!msg);
- BUG_ON(!data_len);
-
- /* Initialize data cursor */
-
- ceph_msg_data_cursor_init(msg, (size_t)data_len);
-}
-
-/*
- * Prepare footer for currently outgoing message, and finish things
- * off. Assumes out_kvec* are already valid.. we just add on to the end.
- */
-static void prepare_write_message_footer(struct ceph_connection *con)
-{
- struct ceph_msg *m = con->out_msg;
-
- m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
-
- dout("prepare_write_message_footer %p\n", con);
- con_out_kvec_add(con, sizeof_footer(con), &m->footer);
- if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
- if (con->ops->sign_message)
- con->ops->sign_message(m);
- else
- m->footer.sig = 0;
- } else {
- m->old_footer.flags = m->footer.flags;
- }
- con->out_more = m->more_to_follow;
- con->out_msg_done = true;
-}
-
-/*
- * Prepare headers for the next outgoing message.
- */
-static void prepare_write_message(struct ceph_connection *con)
-{
- struct ceph_msg *m;
- u32 crc;
-
- con_out_kvec_reset(con);
- con->out_msg_done = false;
-
- /* Sneak an ack in there first? If we can get it into the same
- * TCP packet that's a good thing. */
- if (con->in_seq > con->in_seq_acked) {
- con->in_seq_acked = con->in_seq;
- con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
- con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
- con_out_kvec_add(con, sizeof (con->out_temp_ack),
- &con->out_temp_ack);
- }
-
- BUG_ON(list_empty(&con->out_queue));
- m = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
- con->out_msg = m;
- BUG_ON(m->con != con);
-
- /* put message on sent list */
- ceph_msg_get(m);
- list_move_tail(&m->list_head, &con->out_sent);
-
- /*
- * only assign outgoing seq # if we haven't sent this message
- * yet. if it is requeued, resend with it's original seq.
- */
- if (m->needs_out_seq) {
- m->hdr.seq = cpu_to_le64(++con->out_seq);
- m->needs_out_seq = false;
-
- if (con->ops->reencode_message)
- con->ops->reencode_message(m);
- }
-
- dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
- m, con->out_seq, le16_to_cpu(m->hdr.type),
- le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
- m->data_length);
- WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
- WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
-
- /* tag + hdr + front + middle */
- con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
- con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr);
- con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
-
- if (m->middle)
- con_out_kvec_add(con, m->middle->vec.iov_len,
- m->middle->vec.iov_base);
-
- /* fill in hdr crc and finalize hdr */
- crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
- con->out_msg->hdr.crc = cpu_to_le32(crc);
- memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr));
-
- /* fill in front and middle crc, footer */
- crc = crc32c(0, m->front.iov_base, m->front.iov_len);
- con->out_msg->footer.front_crc = cpu_to_le32(crc);
- if (m->middle) {
- crc = crc32c(0, m->middle->vec.iov_base,
- m->middle->vec.iov_len);
- con->out_msg->footer.middle_crc = cpu_to_le32(crc);
- } else
- con->out_msg->footer.middle_crc = 0;
- dout("%s front_crc %u middle_crc %u\n", __func__,
- le32_to_cpu(con->out_msg->footer.front_crc),
- le32_to_cpu(con->out_msg->footer.middle_crc));
- con->out_msg->footer.flags = 0;
-
- /* is there a data payload? */
- con->out_msg->footer.data_crc = 0;
- if (m->data_length) {
- prepare_message_data(con->out_msg, m->data_length);
- con->out_more = 1; /* data + footer will follow */
- } else {
- /* no, queue up footer too and be done */
- prepare_write_message_footer(con);
- }
-
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-/*
- * Prepare an ack.
- */
-static void prepare_write_ack(struct ceph_connection *con)
-{
- dout("prepare_write_ack %p %llu -> %llu\n", con,
- con->in_seq_acked, con->in_seq);
- con->in_seq_acked = con->in_seq;
-
- con_out_kvec_reset(con);
-
- con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
-
- con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
- con_out_kvec_add(con, sizeof (con->out_temp_ack),
- &con->out_temp_ack);
-
- con->out_more = 1; /* more will follow.. eventually.. */
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-/*
- * Prepare to share the seq during handshake
- */
-static void prepare_write_seq(struct ceph_connection *con)
-{
- dout("prepare_write_seq %p %llu -> %llu\n", con,
- con->in_seq_acked, con->in_seq);
- con->in_seq_acked = con->in_seq;
-
- con_out_kvec_reset(con);
-
- con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
- con_out_kvec_add(con, sizeof (con->out_temp_ack),
- &con->out_temp_ack);
-
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-/*
- * Prepare to write keepalive byte.
- */
-static void prepare_write_keepalive(struct ceph_connection *con)
-{
- dout("prepare_write_keepalive %p\n", con);
- con_out_kvec_reset(con);
- if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
- struct timespec64 now;
-
- ktime_get_real_ts64(&now);
- con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
- ceph_encode_timespec64(&con->out_temp_keepalive2, &now);
- con_out_kvec_add(con, sizeof(con->out_temp_keepalive2),
- &con->out_temp_keepalive2);
- } else {
- con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
- }
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-/*
- * Connection negotiation.
- */
-
-static int get_connect_authorizer(struct ceph_connection *con)
-{
- struct ceph_auth_handshake *auth;
- int auth_proto;
-
- if (!con->ops->get_authorizer) {
- con->auth = NULL;
- con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
- con->out_connect.authorizer_len = 0;
- return 0;
- }
-
- auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry);
- if (IS_ERR(auth))
- return PTR_ERR(auth);
-
- con->auth = auth;
- con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
- con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len);
- return 0;
-}
-
-/*
- * We connected to a peer and are saying hello.
- */
-static void prepare_write_banner(struct ceph_connection *con)
-{
- con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
- con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
- &con->msgr->my_enc_addr);
-
- con->out_more = 0;
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-static void __prepare_write_connect(struct ceph_connection *con)
-{
- con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect);
- if (con->auth)
- con_out_kvec_add(con, con->auth->authorizer_buf_len,
- con->auth->authorizer_buf);
-
- con->out_more = 0;
- con_flag_set(con, CON_FLAG_WRITE_PENDING);
-}
-
-static int prepare_write_connect(struct ceph_connection *con)
-{
- unsigned int global_seq = get_global_seq(con->msgr, 0);
- int proto;
- int ret;
-
- switch (con->peer_name.type) {
- case CEPH_ENTITY_TYPE_MON:
- proto = CEPH_MONC_PROTOCOL;
- break;
- case CEPH_ENTITY_TYPE_OSD:
- proto = CEPH_OSDC_PROTOCOL;
- break;
- case CEPH_ENTITY_TYPE_MDS:
- proto = CEPH_MDSC_PROTOCOL;
- break;
- default:
- BUG();
- }
-
- dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
- con->connect_seq, global_seq, proto);
-
- con->out_connect.features =
- cpu_to_le64(from_msgr(con->msgr)->supported_features);
- con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
- con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
- con->out_connect.global_seq = cpu_to_le32(global_seq);
- con->out_connect.protocol_version = cpu_to_le32(proto);
- con->out_connect.flags = 0;
-
- ret = get_connect_authorizer(con);
- if (ret)
- return ret;
-
- __prepare_write_connect(con);
- return 0;
-}
-
-/*
- * write as much of pending kvecs to the socket as we can.
- * 1 -> done
- * 0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_kvec(struct ceph_connection *con)
-{
- int ret;
-
- dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
- while (con->out_kvec_bytes > 0) {
- ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
- con->out_kvec_left, con->out_kvec_bytes,
- con->out_more);
- if (ret <= 0)
- goto out;
- con->out_kvec_bytes -= ret;
- if (con->out_kvec_bytes == 0)
- break; /* done */
-
- /* account for full iov entries consumed */
- while (ret >= con->out_kvec_cur->iov_len) {
- BUG_ON(!con->out_kvec_left);
- ret -= con->out_kvec_cur->iov_len;
- con->out_kvec_cur++;
- con->out_kvec_left--;
- }
- /* and for a partially-consumed entry */
- if (ret) {
- con->out_kvec_cur->iov_len -= ret;
- con->out_kvec_cur->iov_base += ret;
- }
- }
- con->out_kvec_left = 0;
- ret = 1;
-out:
- dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
- con->out_kvec_bytes, con->out_kvec_left, ret);
- return ret; /* done! */
-}
-
-static u32 ceph_crc32c_page(u32 crc, struct page *page,
- unsigned int page_offset,
- unsigned int length)
+u32 ceph_crc32c_page(u32 crc, struct page *page, unsigned int page_offset,
+ unsigned int length)
{
char *kaddr;
@@ -1574,284 +1174,43 @@ static u32 ceph_crc32c_page(u32 crc, struct page *page,
return crc;
}
-/*
- * Write as much message data payload as we can. If we finish, queue
- * up the footer.
- * 1 -> done, footer is now queued in out_kvec[].
- * 0 -> socket full, but more to do
- * <0 -> error
- */
-static int write_partial_message_data(struct ceph_connection *con)
-{
- struct ceph_msg *msg = con->out_msg;
- struct ceph_msg_data_cursor *cursor = &msg->cursor;
- bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
- u32 crc;
-
- dout("%s %p msg %p\n", __func__, con, msg);
-
- if (list_empty(&msg->data))
- return -EINVAL;
-
- /*
- * Iterate through each page that contains data to be
- * written, and send as much as possible for each.
- *
- * If we are calculating the data crc (the default), we will
- * need to map the page. If we have no pages, they have
- * been revoked, so use the zero page.
- */
- crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
- while (cursor->total_resid) {
- struct page *page;
- size_t page_offset;
- size_t length;
- bool last_piece;
- int ret;
-
- if (!cursor->resid) {
- ceph_msg_data_advance(cursor, 0);
- continue;
- }
-
- page = ceph_msg_data_next(cursor, &page_offset, &length,
- &last_piece);
- ret = ceph_tcp_sendpage(con->sock, page, page_offset,
- length, !last_piece);
- if (ret <= 0) {
- if (do_datacrc)
- msg->footer.data_crc = cpu_to_le32(crc);
-
- return ret;
- }
- if (do_datacrc && cursor->need_crc)
- crc = ceph_crc32c_page(crc, page, page_offset, length);
- ceph_msg_data_advance(cursor, (size_t)ret);
- }
-
- dout("%s %p msg %p done\n", __func__, con, msg);
-
- /* prepare and queue up footer, too */
- if (do_datacrc)
- msg->footer.data_crc = cpu_to_le32(crc);
- else
- msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
- con_out_kvec_reset(con);
- prepare_write_message_footer(con);
-
- return 1; /* must return > 0 to indicate success */
-}
-
-/*
- * write some zeros
- */
-static int write_partial_skip(struct ceph_connection *con)
-{
- int ret;
-
- dout("%s %p %d left\n", __func__, con, con->out_skip);
- while (con->out_skip > 0) {
- size_t size = min(con->out_skip, (int) PAGE_SIZE);
-
- ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true);
- if (ret <= 0)
- goto out;
- con->out_skip -= ret;
- }
- ret = 1;
-out:
- return ret;
-}
-
-/*
- * Prepare to read connection handshake, or an ack.
- */
-static void prepare_read_banner(struct ceph_connection *con)
-{
- dout("prepare_read_banner %p\n", con);
- con->in_base_pos = 0;
-}
-
-static void prepare_read_connect(struct ceph_connection *con)
-{
- dout("prepare_read_connect %p\n", con);
- con->in_base_pos = 0;
-}
-
-static void prepare_read_ack(struct ceph_connection *con)
-{
- dout("prepare_read_ack %p\n", con);
- con->in_base_pos = 0;
-}
-
-static void prepare_read_seq(struct ceph_connection *con)
-{
- dout("prepare_read_seq %p\n", con);
- con->in_base_pos = 0;
- con->in_tag = CEPH_MSGR_TAG_SEQ;
-}
-
-static void prepare_read_tag(struct ceph_connection *con)
-{
- dout("prepare_read_tag %p\n", con);
- con->in_base_pos = 0;
- con->in_tag = CEPH_MSGR_TAG_READY;
-}
-
-static void prepare_read_keepalive_ack(struct ceph_connection *con)
-{
- dout("prepare_read_keepalive_ack %p\n", con);
- con->in_base_pos = 0;
-}
-
-/*
- * Prepare to read a message.
- */
-static int prepare_read_message(struct ceph_connection *con)
-{
- dout("prepare_read_message %p\n", con);
- BUG_ON(con->in_msg != NULL);
- con->in_base_pos = 0;
- con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
- return 0;
-}
-
-
-static int read_partial(struct ceph_connection *con,
- int end, int size, void *object)
-{
- while (con->in_base_pos < end) {
- int left = end - con->in_base_pos;
- int have = size - left;
- int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
- if (ret <= 0)
- return ret;
- con->in_base_pos += ret;
- }
- return 1;
-}
-
-
-/*
- * Read all or part of the connect-side handshake on a new connection
- */
-static int read_partial_banner(struct ceph_connection *con)
-{
- int size;
- int end;
- int ret;
-
- dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
-
- /* peer's banner */
- size = strlen(CEPH_BANNER);
- end = size;
- ret = read_partial(con, end, size, con->in_banner);
- if (ret <= 0)
- goto out;
-
- size = sizeof (con->actual_peer_addr);
- end += size;
- ret = read_partial(con, end, size, &con->actual_peer_addr);
- if (ret <= 0)
- goto out;
-
- size = sizeof (con->peer_addr_for_me);
- end += size;
- ret = read_partial(con, end, size, &con->peer_addr_for_me);
- if (ret <= 0)
- goto out;
-
-out:
- return ret;
-}
-static int read_partial_connect(struct ceph_connection *con)
+bool ceph_addr_is_blank(const struct ceph_entity_addr *addr)
{
- int size;
- int end;
- int ret;
-
- dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
-
- size = sizeof (con->in_reply);
- end = size;
- ret = read_partial(con, end, size, &con->in_reply);
- if (ret <= 0)
- goto out;
-
- if (con->auth) {
- size = le32_to_cpu(con->in_reply.authorizer_len);
- if (size > con->auth->authorizer_reply_buf_len) {
- pr_err("authorizer reply too big: %d > %zu\n", size,
- con->auth->authorizer_reply_buf_len);
- ret = -EINVAL;
- goto out;
- }
+ struct sockaddr_storage ss = addr->in_addr; /* align */
+ struct in_addr *addr4 = &((struct sockaddr_in *)&ss)->sin_addr;
+ struct in6_addr *addr6 = &((struct sockaddr_in6 *)&ss)->sin6_addr;
- end += size;
- ret = read_partial(con, end, size,
- con->auth->authorizer_reply_buf);
- if (ret <= 0)
- goto out;
- }
-
- dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
- con, (int)con->in_reply.tag,
- le32_to_cpu(con->in_reply.connect_seq),
- le32_to_cpu(con->in_reply.global_seq));
-out:
- return ret;
-}
-
-/*
- * Verify the hello banner looks okay.
- */
-static int verify_hello(struct ceph_connection *con)
-{
- if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
- pr_err("connect to %s got bad banner\n",
- ceph_pr_addr(&con->peer_addr.in_addr));
- con->error_msg = "protocol error, bad banner";
- return -1;
- }
- return 0;
-}
-
-static bool addr_is_blank(struct sockaddr_storage *ss)
-{
- struct in_addr *addr = &((struct sockaddr_in *)ss)->sin_addr;
- struct in6_addr *addr6 = &((struct sockaddr_in6 *)ss)->sin6_addr;
-
- switch (ss->ss_family) {
+ switch (ss.ss_family) {
case AF_INET:
- return addr->s_addr == htonl(INADDR_ANY);
+ return addr4->s_addr == htonl(INADDR_ANY);
case AF_INET6:
return ipv6_addr_any(addr6);
default:
return true;
}
}
+EXPORT_SYMBOL(ceph_addr_is_blank);
-static int addr_port(struct sockaddr_storage *ss)
+int ceph_addr_port(const struct ceph_entity_addr *addr)
{
- switch (ss->ss_family) {
+ switch (get_unaligned(&addr->in_addr.ss_family)) {
case AF_INET:
- return ntohs(((struct sockaddr_in *)ss)->sin_port);
+ return ntohs(get_unaligned(&((struct sockaddr_in *)&addr->in_addr)->sin_port));
case AF_INET6:
- return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
+ return ntohs(get_unaligned(&((struct sockaddr_in6 *)&addr->in_addr)->sin6_port));
}
return 0;
}
-static void addr_set_port(struct sockaddr_storage *ss, int p)
+void ceph_addr_set_port(struct ceph_entity_addr *addr, int p)
{
- switch (ss->ss_family) {
+ switch (get_unaligned(&addr->in_addr.ss_family)) {
case AF_INET:
- ((struct sockaddr_in *)ss)->sin_port = htons(p);
+ put_unaligned(htons(p), &((struct sockaddr_in *)&addr->in_addr)->sin_port);
break;
case AF_INET6:
- ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
+ put_unaligned(htons(p), &((struct sockaddr_in6 *)&addr->in_addr)->sin6_port);
break;
}
}
@@ -1859,21 +1218,18 @@ static void addr_set_port(struct sockaddr_storage *ss, int p)
/*
* Unlike other *_pton function semantics, zero indicates success.
*/
-static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss,
+static int ceph_pton(const char *str, size_t len, struct ceph_entity_addr *addr,
char delim, const char **ipend)
{
- struct sockaddr_in *in4 = (struct sockaddr_in *) ss;
- struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss;
-
- memset(ss, 0, sizeof(*ss));
+ memset(&addr->in_addr, 0, sizeof(addr->in_addr));
- if (in4_pton(str, len, (u8 *)&in4->sin_addr.s_addr, delim, ipend)) {
- ss->ss_family = AF_INET;
+ if (in4_pton(str, len, (u8 *)&((struct sockaddr_in *)&addr->in_addr)->sin_addr.s_addr, delim, ipend)) {
+ put_unaligned(AF_INET, &addr->in_addr.ss_family);
return 0;
}
- if (in6_pton(str, len, (u8 *)&in6->sin6_addr.s6_addr, delim, ipend)) {
- ss->ss_family = AF_INET6;
+ if (in6_pton(str, len, (u8 *)&((struct sockaddr_in6 *)&addr->in_addr)->sin6_addr.s6_addr, delim, ipend)) {
+ put_unaligned(AF_INET6, &addr->in_addr.ss_family);
return 0;
}
@@ -1885,7 +1241,7 @@ static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss,
*/
#ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER
static int ceph_dns_resolve_name(const char *name, size_t namelen,
- struct sockaddr_storage *ss, char delim, const char **ipend)
+ struct ceph_entity_addr *addr, char delim, const char **ipend)
{
const char *end, *delim_p;
char *colon_p, *ip_addr = NULL;
@@ -1899,7 +1255,7 @@ static int ceph_dns_resolve_name(const char *name, size_t namelen,
colon_p = memchr(name, ':', namelen);
if (delim_p && colon_p)
- end = delim_p < colon_p ? delim_p : colon_p;
+ end = min(delim_p, colon_p);
else if (!delim_p && colon_p)
end = colon_p;
else {
@@ -1912,9 +1268,10 @@ static int ceph_dns_resolve_name(const char *name, size_t namelen,
return -EINVAL;
/* do dns_resolve upcall */
- ip_len = dns_query(NULL, name, end - name, NULL, &ip_addr, NULL);
+ ip_len = dns_query(current->nsproxy->net_ns,
+ NULL, name, end - name, NULL, &ip_addr, NULL, false);
if (ip_len > 0)
- ret = ceph_pton(ip_addr, ip_len, ss, -1, NULL);
+ ret = ceph_pton(ip_addr, ip_len, addr, -1, NULL);
else
ret = -ESRCH;
@@ -1923,13 +1280,13 @@ static int ceph_dns_resolve_name(const char *name, size_t namelen,
*ipend = end;
pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name,
- ret, ret ? "failed" : ceph_pr_addr(ss));
+ ret, ret ? "failed" : ceph_pr_addr(addr));
return ret;
}
#else
static inline int ceph_dns_resolve_name(const char *name, size_t namelen,
- struct sockaddr_storage *ss, char delim, const char **ipend)
+ struct ceph_entity_addr *addr, char delim, const char **ipend)
{
return -EINVAL;
}
@@ -1940,13 +1297,13 @@ static inline int ceph_dns_resolve_name(const char *name, size_t namelen,
* then try to extract a hostname to resolve using userspace DNS upcall.
*/
static int ceph_parse_server_name(const char *name, size_t namelen,
- struct sockaddr_storage *ss, char delim, const char **ipend)
+ struct ceph_entity_addr *addr, char delim, const char **ipend)
{
int ret;
- ret = ceph_pton(name, namelen, ss, delim, ipend);
+ ret = ceph_pton(name, namelen, addr, delim, ipend);
if (ret)
- ret = ceph_dns_resolve_name(name, namelen, ss, delim, ipend);
+ ret = ceph_dns_resolve_name(name, namelen, addr, delim, ipend);
return ret;
}
@@ -1957,31 +1314,31 @@ static int ceph_parse_server_name(const char *name, size_t namelen,
*/
int ceph_parse_ips(const char *c, const char *end,
struct ceph_entity_addr *addr,
- int max_count, int *count)
+ int max_count, int *count, char delim)
{
int i, ret = -EINVAL;
const char *p = c;
dout("parse_ips on '%.*s'\n", (int)(end-c), c);
for (i = 0; i < max_count; i++) {
+ char cur_delim = delim;
const char *ipend;
- struct sockaddr_storage *ss = &addr[i].in_addr;
int port;
- char delim = ',';
if (*p == '[') {
- delim = ']';
+ cur_delim = ']';
p++;
}
- ret = ceph_parse_server_name(p, end - p, ss, delim, &ipend);
+ ret = ceph_parse_server_name(p, end - p, &addr[i], cur_delim,
+ &ipend);
if (ret)
goto bad;
ret = -EINVAL;
p = ipend;
- if (delim == ']') {
+ if (cur_delim == ']') {
if (*p != ']') {
dout("missing matching ']'\n");
goto bad;
@@ -2005,13 +1362,23 @@ int ceph_parse_ips(const char *c, const char *end,
port = CEPH_MON_PORT;
}
- addr_set_port(ss, port);
+ ceph_addr_set_port(&addr[i], port);
+ /*
+ * We want the type to be set according to ms_mode
+ * option, but options are normally parsed after mon
+ * addresses. Rather than complicating parsing, set
+ * to LEGACY and override in build_initial_monmap()
+ * for mon addresses and ceph_messenger_init() for
+ * ip option.
+ */
+ addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY;
+ addr[i].nonce = 0;
- dout("parse_ips got %s\n", ceph_pr_addr(ss));
+ dout("%s got %s\n", __func__, ceph_pr_addr(&addr[i]));
if (p == end)
break;
- if (*p != ',')
+ if (*p != delim)
goto bad;
p++;
}
@@ -2024,527 +1391,15 @@ int ceph_parse_ips(const char *c, const char *end,
return 0;
bad:
- pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
return ret;
}
-EXPORT_SYMBOL(ceph_parse_ips);
-
-static int process_banner(struct ceph_connection *con)
-{
- dout("process_banner on %p\n", con);
-
- if (verify_hello(con) < 0)
- return -1;
-
- ceph_decode_addr(&con->actual_peer_addr);
- ceph_decode_addr(&con->peer_addr_for_me);
-
- /*
- * Make sure the other end is who we wanted. note that the other
- * end may not yet know their ip address, so if it's 0.0.0.0, give
- * them the benefit of the doubt.
- */
- if (memcmp(&con->peer_addr, &con->actual_peer_addr,
- sizeof(con->peer_addr)) != 0 &&
- !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
- con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
- pr_warn("wrong peer, want %s/%d, got %s/%d\n",
- ceph_pr_addr(&con->peer_addr.in_addr),
- (int)le32_to_cpu(con->peer_addr.nonce),
- ceph_pr_addr(&con->actual_peer_addr.in_addr),
- (int)le32_to_cpu(con->actual_peer_addr.nonce));
- con->error_msg = "wrong peer at address";
- return -1;
- }
-
- /*
- * did we learn our address?
- */
- if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
- int port = addr_port(&con->msgr->inst.addr.in_addr);
-
- memcpy(&con->msgr->inst.addr.in_addr,
- &con->peer_addr_for_me.in_addr,
- sizeof(con->peer_addr_for_me.in_addr));
- addr_set_port(&con->msgr->inst.addr.in_addr, port);
- encode_my_addr(con->msgr);
- dout("process_banner learned my addr is %s\n",
- ceph_pr_addr(&con->msgr->inst.addr.in_addr));
- }
-
- return 0;
-}
-
-static int process_connect(struct ceph_connection *con)
-{
- u64 sup_feat = from_msgr(con->msgr)->supported_features;
- u64 req_feat = from_msgr(con->msgr)->required_features;
- u64 server_feat = le64_to_cpu(con->in_reply.features);
- int ret;
-
- dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
-
- if (con->auth) {
- /*
- * Any connection that defines ->get_authorizer()
- * should also define ->add_authorizer_challenge() and
- * ->verify_authorizer_reply().
- *
- * See get_connect_authorizer().
- */
- if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
- ret = con->ops->add_authorizer_challenge(
- con, con->auth->authorizer_reply_buf,
- le32_to_cpu(con->in_reply.authorizer_len));
- if (ret < 0)
- return ret;
-
- con_out_kvec_reset(con);
- __prepare_write_connect(con);
- prepare_read_connect(con);
- return 0;
- }
-
- ret = con->ops->verify_authorizer_reply(con);
- if (ret < 0) {
- con->error_msg = "bad authorize reply";
- return ret;
- }
- }
-
- switch (con->in_reply.tag) {
- case CEPH_MSGR_TAG_FEATURES:
- pr_err("%s%lld %s feature set mismatch,"
- " my %llx < server's %llx, missing %llx\n",
- ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr.in_addr),
- sup_feat, server_feat, server_feat & ~sup_feat);
- con->error_msg = "missing required protocol features";
- reset_connection(con);
- return -1;
-
- case CEPH_MSGR_TAG_BADPROTOVER:
- pr_err("%s%lld %s protocol version mismatch,"
- " my %d != server's %d\n",
- ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr.in_addr),
- le32_to_cpu(con->out_connect.protocol_version),
- le32_to_cpu(con->in_reply.protocol_version));
- con->error_msg = "protocol version mismatch";
- reset_connection(con);
- return -1;
-
- case CEPH_MSGR_TAG_BADAUTHORIZER:
- con->auth_retry++;
- dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
- con->auth_retry);
- if (con->auth_retry == 2) {
- con->error_msg = "connect authorization failure";
- return -1;
- }
- con_out_kvec_reset(con);
- ret = prepare_write_connect(con);
- if (ret < 0)
- return ret;
- prepare_read_connect(con);
- break;
-
- case CEPH_MSGR_TAG_RESETSESSION:
- /*
- * If we connected with a large connect_seq but the peer
- * has no record of a session with us (no connection, or
- * connect_seq == 0), they will send RESETSESION to indicate
- * that they must have reset their session, and may have
- * dropped messages.
- */
- dout("process_connect got RESET peer seq %u\n",
- le32_to_cpu(con->in_reply.connect_seq));
- pr_err("%s%lld %s connection reset\n",
- ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr.in_addr));
- reset_connection(con);
- con_out_kvec_reset(con);
- ret = prepare_write_connect(con);
- if (ret < 0)
- return ret;
- prepare_read_connect(con);
-
- /* Tell ceph about it. */
- mutex_unlock(&con->mutex);
- pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
- if (con->ops->peer_reset)
- con->ops->peer_reset(con);
- mutex_lock(&con->mutex);
- if (con->state != CON_STATE_NEGOTIATING)
- return -EAGAIN;
- break;
-
- case CEPH_MSGR_TAG_RETRY_SESSION:
- /*
- * If we sent a smaller connect_seq than the peer has, try
- * again with a larger value.
- */
- dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
- le32_to_cpu(con->out_connect.connect_seq),
- le32_to_cpu(con->in_reply.connect_seq));
- con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
- con_out_kvec_reset(con);
- ret = prepare_write_connect(con);
- if (ret < 0)
- return ret;
- prepare_read_connect(con);
- break;
-
- case CEPH_MSGR_TAG_RETRY_GLOBAL:
- /*
- * If we sent a smaller global_seq than the peer has, try
- * again with a larger value.
- */
- dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
- con->peer_global_seq,
- le32_to_cpu(con->in_reply.global_seq));
- get_global_seq(con->msgr,
- le32_to_cpu(con->in_reply.global_seq));
- con_out_kvec_reset(con);
- ret = prepare_write_connect(con);
- if (ret < 0)
- return ret;
- prepare_read_connect(con);
- break;
-
- case CEPH_MSGR_TAG_SEQ:
- case CEPH_MSGR_TAG_READY:
- if (req_feat & ~server_feat) {
- pr_err("%s%lld %s protocol feature mismatch,"
- " my required %llx > server's %llx, need %llx\n",
- ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr.in_addr),
- req_feat, server_feat, req_feat & ~server_feat);
- con->error_msg = "missing required protocol features";
- reset_connection(con);
- return -1;
- }
-
- WARN_ON(con->state != CON_STATE_NEGOTIATING);
- con->state = CON_STATE_OPEN;
- con->auth_retry = 0; /* we authenticated; clear flag */
- con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
- con->connect_seq++;
- con->peer_features = server_feat;
- dout("process_connect got READY gseq %d cseq %d (%d)\n",
- con->peer_global_seq,
- le32_to_cpu(con->in_reply.connect_seq),
- con->connect_seq);
- WARN_ON(con->connect_seq !=
- le32_to_cpu(con->in_reply.connect_seq));
-
- if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
- con_flag_set(con, CON_FLAG_LOSSYTX);
-
- con->delay = 0; /* reset backoff memory */
-
- if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) {
- prepare_write_seq(con);
- prepare_read_seq(con);
- } else {
- prepare_read_tag(con);
- }
- break;
-
- case CEPH_MSGR_TAG_WAIT:
- /*
- * If there is a connection race (we are opening
- * connections to each other), one of us may just have
- * to WAIT. This shouldn't happen if we are the
- * client.
- */
- con->error_msg = "protocol error, got WAIT as client";
- return -1;
-
- default:
- con->error_msg = "protocol error, garbage tag during connect";
- return -1;
- }
- return 0;
-}
-
-
-/*
- * read (part of) an ack
- */
-static int read_partial_ack(struct ceph_connection *con)
-{
- int size = sizeof (con->in_temp_ack);
- int end = size;
-
- return read_partial(con, end, size, &con->in_temp_ack);
-}
-
-/*
- * We can finally discard anything that's been acked.
- */
-static void process_ack(struct ceph_connection *con)
-{
- struct ceph_msg *m;
- u64 ack = le64_to_cpu(con->in_temp_ack);
- u64 seq;
- bool reconnect = (con->in_tag == CEPH_MSGR_TAG_SEQ);
- struct list_head *list = reconnect ? &con->out_queue : &con->out_sent;
-
- /*
- * In the reconnect case, con_fault() has requeued messages
- * in out_sent. We should cleanup old messages according to
- * the reconnect seq.
- */
- while (!list_empty(list)) {
- m = list_first_entry(list, struct ceph_msg, list_head);
- if (reconnect && m->needs_out_seq)
- break;
- seq = le64_to_cpu(m->hdr.seq);
- if (seq > ack)
- break;
- dout("got ack for seq %llu type %d at %p\n", seq,
- le16_to_cpu(m->hdr.type), m);
- m->ack_stamp = jiffies;
- ceph_msg_remove(m);
- }
-
- prepare_read_tag(con);
-}
-
-
-static int read_partial_message_section(struct ceph_connection *con,
- struct kvec *section,
- unsigned int sec_len, u32 *crc)
-{
- int ret, left;
-
- BUG_ON(!section);
-
- while (section->iov_len < sec_len) {
- BUG_ON(section->iov_base == NULL);
- left = sec_len - section->iov_len;
- ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
- section->iov_len, left);
- if (ret <= 0)
- return ret;
- section->iov_len += ret;
- }
- if (section->iov_len == sec_len)
- *crc = crc32c(0, section->iov_base, section->iov_len);
-
- return 1;
-}
-
-static int read_partial_msg_data(struct ceph_connection *con)
-{
- struct ceph_msg *msg = con->in_msg;
- struct ceph_msg_data_cursor *cursor = &msg->cursor;
- bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
- struct page *page;
- size_t page_offset;
- size_t length;
- u32 crc = 0;
- int ret;
-
- BUG_ON(!msg);
- if (list_empty(&msg->data))
- return -EIO;
-
- if (do_datacrc)
- crc = con->in_data_crc;
- while (cursor->total_resid) {
- if (!cursor->resid) {
- ceph_msg_data_advance(cursor, 0);
- continue;
- }
-
- page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
- ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
- if (ret <= 0) {
- if (do_datacrc)
- con->in_data_crc = crc;
-
- return ret;
- }
-
- if (do_datacrc)
- crc = ceph_crc32c_page(crc, page, page_offset, ret);
- ceph_msg_data_advance(cursor, (size_t)ret);
- }
- if (do_datacrc)
- con->in_data_crc = crc;
-
- return 1; /* must return > 0 to indicate success */
-}
-
-/*
- * read (part of) a message.
- */
-static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip);
-
-static int read_partial_message(struct ceph_connection *con)
-{
- struct ceph_msg *m = con->in_msg;
- int size;
- int end;
- int ret;
- unsigned int front_len, middle_len, data_len;
- bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
- bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH);
- u64 seq;
- u32 crc;
-
- dout("read_partial_message con %p msg %p\n", con, m);
-
- /* header */
- size = sizeof (con->in_hdr);
- end = size;
- ret = read_partial(con, end, size, &con->in_hdr);
- if (ret <= 0)
- return ret;
-
- crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc));
- if (cpu_to_le32(crc) != con->in_hdr.crc) {
- pr_err("read_partial_message bad hdr crc %u != expected %u\n",
- crc, con->in_hdr.crc);
- return -EBADMSG;
- }
-
- front_len = le32_to_cpu(con->in_hdr.front_len);
- if (front_len > CEPH_MSG_MAX_FRONT_LEN)
- return -EIO;
- middle_len = le32_to_cpu(con->in_hdr.middle_len);
- if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
- return -EIO;
- data_len = le32_to_cpu(con->in_hdr.data_len);
- if (data_len > CEPH_MSG_MAX_DATA_LEN)
- return -EIO;
-
- /* verify seq# */
- seq = le64_to_cpu(con->in_hdr.seq);
- if ((s64)seq - (s64)con->in_seq < 1) {
- pr_info("skipping %s%lld %s seq %lld expected %lld\n",
- ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr.in_addr),
- seq, con->in_seq + 1);
- con->in_base_pos = -front_len - middle_len - data_len -
- sizeof_footer(con);
- con->in_tag = CEPH_MSGR_TAG_READY;
- return 1;
- } else if ((s64)seq - (s64)con->in_seq > 1) {
- pr_err("read_partial_message bad seq %lld expected %lld\n",
- seq, con->in_seq + 1);
- con->error_msg = "bad message sequence # for incoming message";
- return -EBADE;
- }
-
- /* allocate message? */
- if (!con->in_msg) {
- int skip = 0;
-
- dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
- front_len, data_len);
- ret = ceph_con_in_msg_alloc(con, &skip);
- if (ret < 0)
- return ret;
-
- BUG_ON(!con->in_msg ^ skip);
- if (skip) {
- /* skip this message */
- dout("alloc_msg said skip message\n");
- con->in_base_pos = -front_len - middle_len - data_len -
- sizeof_footer(con);
- con->in_tag = CEPH_MSGR_TAG_READY;
- con->in_seq++;
- return 1;
- }
-
- BUG_ON(!con->in_msg);
- BUG_ON(con->in_msg->con != con);
- m = con->in_msg;
- m->front.iov_len = 0; /* haven't read it yet */
- if (m->middle)
- m->middle->vec.iov_len = 0;
-
- /* prepare for data payload, if any */
-
- if (data_len)
- prepare_message_data(con->in_msg, data_len);
- }
-
- /* front */
- ret = read_partial_message_section(con, &m->front, front_len,
- &con->in_front_crc);
- if (ret <= 0)
- return ret;
-
- /* middle */
- if (m->middle) {
- ret = read_partial_message_section(con, &m->middle->vec,
- middle_len,
- &con->in_middle_crc);
- if (ret <= 0)
- return ret;
- }
-
- /* (page) data */
- if (data_len) {
- ret = read_partial_msg_data(con);
- if (ret <= 0)
- return ret;
- }
-
- /* footer */
- size = sizeof_footer(con);
- end += size;
- ret = read_partial(con, end, size, &m->footer);
- if (ret <= 0)
- return ret;
-
- if (!need_sign) {
- m->footer.flags = m->old_footer.flags;
- m->footer.sig = 0;
- }
-
- dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
- m, front_len, m->footer.front_crc, middle_len,
- m->footer.middle_crc, data_len, m->footer.data_crc);
-
- /* crc ok? */
- if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
- pr_err("read_partial_message %p front crc %u != exp. %u\n",
- m, con->in_front_crc, m->footer.front_crc);
- return -EBADMSG;
- }
- if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
- pr_err("read_partial_message %p middle crc %u != exp %u\n",
- m, con->in_middle_crc, m->footer.middle_crc);
- return -EBADMSG;
- }
- if (do_datacrc &&
- (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
- con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
- pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
- con->in_data_crc, le32_to_cpu(m->footer.data_crc));
- return -EBADMSG;
- }
-
- if (need_sign && con->ops->check_message_signature &&
- con->ops->check_message_signature(m)) {
- pr_err("read_partial_message %p signature check failed\n", m);
- return -EBADMSG;
- }
-
- return 1; /* done! */
-}
/*
* Process message. This happens in the worker thread. The callback should
* be careful not to do anything that waits on other incoming messages or it
* may deadlock.
*/
-static void process_message(struct ceph_connection *con)
+void ceph_con_process_message(struct ceph_connection *con)
{
struct ceph_msg *msg = con->in_msg;
@@ -2558,12 +1413,13 @@ static void process_message(struct ceph_connection *con)
con->in_seq++;
mutex_unlock(&con->mutex);
- dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
+ dout("===== %p %llu from %s%lld %d=%s len %d+%d+%d (%u %u %u) =====\n",
msg, le64_to_cpu(msg->hdr.seq),
ENTITY_NAME(msg->hdr.src),
le16_to_cpu(msg->hdr.type),
ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
le32_to_cpu(msg->hdr.front_len),
+ le32_to_cpu(msg->hdr.middle_len),
le32_to_cpu(msg->hdr.data_len),
con->in_front_crc, con->in_middle_crc, con->in_data_crc);
con->ops->dispatch(con, msg);
@@ -2571,264 +1427,6 @@ static void process_message(struct ceph_connection *con)
mutex_lock(&con->mutex);
}
-static int read_keepalive_ack(struct ceph_connection *con)
-{
- struct ceph_timespec ceph_ts;
- size_t size = sizeof(ceph_ts);
- int ret = read_partial(con, size, size, &ceph_ts);
- if (ret <= 0)
- return ret;
- ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts);
- prepare_read_tag(con);
- return 1;
-}
-
-/*
- * Write something to the socket. Called in a worker thread when the
- * socket appears to be writeable and we have something ready to send.
- */
-static int try_write(struct ceph_connection *con)
-{
- int ret = 1;
-
- dout("try_write start %p state %lu\n", con, con->state);
- if (con->state != CON_STATE_PREOPEN &&
- con->state != CON_STATE_CONNECTING &&
- con->state != CON_STATE_NEGOTIATING &&
- con->state != CON_STATE_OPEN)
- return 0;
-
- /* open the socket first? */
- if (con->state == CON_STATE_PREOPEN) {
- BUG_ON(con->sock);
- con->state = CON_STATE_CONNECTING;
-
- con_out_kvec_reset(con);
- prepare_write_banner(con);
- prepare_read_banner(con);
-
- BUG_ON(con->in_msg);
- con->in_tag = CEPH_MSGR_TAG_READY;
- dout("try_write initiating connect on %p new state %lu\n",
- con, con->state);
- ret = ceph_tcp_connect(con);
- if (ret < 0) {
- con->error_msg = "connect error";
- goto out;
- }
- }
-
-more:
- dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
- BUG_ON(!con->sock);
-
- /* kvec data queued? */
- if (con->out_kvec_left) {
- ret = write_partial_kvec(con);
- if (ret <= 0)
- goto out;
- }
- if (con->out_skip) {
- ret = write_partial_skip(con);
- if (ret <= 0)
- goto out;
- }
-
- /* msg pages? */
- if (con->out_msg) {
- if (con->out_msg_done) {
- ceph_msg_put(con->out_msg);
- con->out_msg = NULL; /* we're done with this one */
- goto do_next;
- }
-
- ret = write_partial_message_data(con);
- if (ret == 1)
- goto more; /* we need to send the footer, too! */
- if (ret == 0)
- goto out;
- if (ret < 0) {
- dout("try_write write_partial_message_data err %d\n",
- ret);
- goto out;
- }
- }
-
-do_next:
- if (con->state == CON_STATE_OPEN) {
- if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
- prepare_write_keepalive(con);
- goto more;
- }
- /* is anything else pending? */
- if (!list_empty(&con->out_queue)) {
- prepare_write_message(con);
- goto more;
- }
- if (con->in_seq > con->in_seq_acked) {
- prepare_write_ack(con);
- goto more;
- }
- }
-
- /* Nothing to do! */
- con_flag_clear(con, CON_FLAG_WRITE_PENDING);
- dout("try_write nothing else to write.\n");
- ret = 0;
-out:
- dout("try_write done on %p ret %d\n", con, ret);
- return ret;
-}
-
-/*
- * Read what we can from the socket.
- */
-static int try_read(struct ceph_connection *con)
-{
- int ret = -1;
-
-more:
- dout("try_read start on %p state %lu\n", con, con->state);
- if (con->state != CON_STATE_CONNECTING &&
- con->state != CON_STATE_NEGOTIATING &&
- con->state != CON_STATE_OPEN)
- return 0;
-
- BUG_ON(!con->sock);
-
- dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
- con->in_base_pos);
-
- if (con->state == CON_STATE_CONNECTING) {
- dout("try_read connecting\n");
- ret = read_partial_banner(con);
- if (ret <= 0)
- goto out;
- ret = process_banner(con);
- if (ret < 0)
- goto out;
-
- con->state = CON_STATE_NEGOTIATING;
-
- /*
- * Received banner is good, exchange connection info.
- * Do not reset out_kvec, as sending our banner raced
- * with receiving peer banner after connect completed.
- */
- ret = prepare_write_connect(con);
- if (ret < 0)
- goto out;
- prepare_read_connect(con);
-
- /* Send connection info before awaiting response */
- goto out;
- }
-
- if (con->state == CON_STATE_NEGOTIATING) {
- dout("try_read negotiating\n");
- ret = read_partial_connect(con);
- if (ret <= 0)
- goto out;
- ret = process_connect(con);
- if (ret < 0)
- goto out;
- goto more;
- }
-
- WARN_ON(con->state != CON_STATE_OPEN);
-
- if (con->in_base_pos < 0) {
- /*
- * skipping + discarding content.
- */
- ret = ceph_tcp_recvmsg(con->sock, NULL, -con->in_base_pos);
- if (ret <= 0)
- goto out;
- dout("skipped %d / %d bytes\n", ret, -con->in_base_pos);
- con->in_base_pos += ret;
- if (con->in_base_pos)
- goto more;
- }
- if (con->in_tag == CEPH_MSGR_TAG_READY) {
- /*
- * what's next?
- */
- ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
- if (ret <= 0)
- goto out;
- dout("try_read got tag %d\n", (int)con->in_tag);
- switch (con->in_tag) {
- case CEPH_MSGR_TAG_MSG:
- prepare_read_message(con);
- break;
- case CEPH_MSGR_TAG_ACK:
- prepare_read_ack(con);
- break;
- case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
- prepare_read_keepalive_ack(con);
- break;
- case CEPH_MSGR_TAG_CLOSE:
- con_close_socket(con);
- con->state = CON_STATE_CLOSED;
- goto out;
- default:
- goto bad_tag;
- }
- }
- if (con->in_tag == CEPH_MSGR_TAG_MSG) {
- ret = read_partial_message(con);
- if (ret <= 0) {
- switch (ret) {
- case -EBADMSG:
- con->error_msg = "bad crc/signature";
- /* fall through */
- case -EBADE:
- ret = -EIO;
- break;
- case -EIO:
- con->error_msg = "io error";
- break;
- }
- goto out;
- }
- if (con->in_tag == CEPH_MSGR_TAG_READY)
- goto more;
- process_message(con);
- if (con->state == CON_STATE_OPEN)
- prepare_read_tag(con);
- goto more;
- }
- if (con->in_tag == CEPH_MSGR_TAG_ACK ||
- con->in_tag == CEPH_MSGR_TAG_SEQ) {
- /*
- * the final handshake seq exchange is semantically
- * equivalent to an ACK
- */
- ret = read_partial_ack(con);
- if (ret <= 0)
- goto out;
- process_ack(con);
- goto more;
- }
- if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
- ret = read_keepalive_ack(con);
- if (ret <= 0)
- goto out;
- goto more;
- }
-
-out:
- dout("try_read done on %p ret %d\n", con, ret);
- return ret;
-
-bad_tag:
- pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
- con->error_msg = "protocol error, garbage tag";
- ret = -1;
- goto out;
-}
-
-
/*
* Atomically queue work on a connection after the specified delay.
* Bump @con reference to avoid races with connection teardown.
@@ -2841,13 +1439,16 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
return -ENOENT;
}
+ if (delay >= HZ)
+ delay = round_jiffies_relative(delay);
+
+ dout("%s %p %lu\n", __func__, con, delay);
if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
dout("%s %p - already queued\n", __func__, con);
con->ops->put(con);
return -EBUSY;
}
- dout("%s %p %lu\n", __func__, con, delay);
return 0;
}
@@ -2866,27 +1467,30 @@ static void cancel_con(struct ceph_connection *con)
static bool con_sock_closed(struct ceph_connection *con)
{
- if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED))
+ if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_SOCK_CLOSED))
return false;
#define CASE(x) \
- case CON_STATE_ ## x: \
+ case CEPH_CON_S_ ## x: \
con->error_msg = "socket closed (con state " #x ")"; \
break;
switch (con->state) {
CASE(CLOSED);
CASE(PREOPEN);
- CASE(CONNECTING);
- CASE(NEGOTIATING);
+ CASE(V1_BANNER);
+ CASE(V1_CONNECT_MSG);
+ CASE(V2_BANNER_PREFIX);
+ CASE(V2_BANNER_PAYLOAD);
+ CASE(V2_HELLO);
+ CASE(V2_AUTH);
+ CASE(V2_AUTH_SIGNATURE);
+ CASE(V2_SESSION_CONNECT);
+ CASE(V2_SESSION_RECONNECT);
CASE(OPEN);
CASE(STANDBY);
default:
- pr_warn("%s con %p unrecognized state %lu\n",
- __func__, con, con->state);
- con->error_msg = "unrecognized con state";
BUG();
- break;
}
#undef CASE
@@ -2897,15 +1501,15 @@ static bool con_backoff(struct ceph_connection *con)
{
int ret;
- if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF))
+ if (!ceph_con_flag_test_and_clear(con, CEPH_CON_F_BACKOFF))
return false;
- ret = queue_con_delay(con, round_jiffies_relative(con->delay));
+ ret = queue_con_delay(con, con->delay);
if (ret) {
dout("%s: con %p FAILED to back off %lu\n", __func__,
con, con->delay);
BUG_ON(ret == -ENOENT);
- con_flag_set(con, CON_FLAG_BACKOFF);
+ ceph_con_flag_set(con, CEPH_CON_F_BACKOFF);
}
return true;
@@ -2921,11 +1525,11 @@ static void con_fault_finish(struct ceph_connection *con)
* in case we faulted due to authentication, invalidate our
* current tickets so that we can get new ones.
*/
- if (con->auth_retry) {
- dout("auth_retry %d, invalidating\n", con->auth_retry);
+ if (!ceph_msgr2(from_msgr(con->msgr)) && con->v1.auth_retry) {
+ dout("auth_retry %d, invalidating\n", con->v1.auth_retry);
if (con->ops->invalidate_authorizer)
con->ops->invalidate_authorizer(con);
- con->auth_retry = 0;
+ con->v1.auth_retry = 0;
}
if (con->ops->fault)
@@ -2953,21 +1557,24 @@ static void ceph_con_workfn(struct work_struct *work)
dout("%s: con %p BACKOFF\n", __func__, con);
break;
}
- if (con->state == CON_STATE_STANDBY) {
+ if (con->state == CEPH_CON_S_STANDBY) {
dout("%s: con %p STANDBY\n", __func__, con);
break;
}
- if (con->state == CON_STATE_CLOSED) {
+ if (con->state == CEPH_CON_S_CLOSED) {
dout("%s: con %p CLOSED\n", __func__, con);
BUG_ON(con->sock);
break;
}
- if (con->state == CON_STATE_PREOPEN) {
+ if (con->state == CEPH_CON_S_PREOPEN) {
dout("%s: con %p PREOPEN\n", __func__, con);
BUG_ON(con->sock);
}
- ret = try_read(con);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ret = ceph_con_v2_try_read(con);
+ else
+ ret = ceph_con_v1_try_read(con);
if (ret < 0) {
if (ret == -EAGAIN)
continue;
@@ -2977,7 +1584,10 @@ static void ceph_con_workfn(struct work_struct *work)
break;
}
- ret = try_write(con);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ret = ceph_con_v2_try_write(con);
+ else
+ ret = ceph_con_v1_try_write(con);
if (ret < 0) {
if (ret == -EAGAIN)
continue;
@@ -3004,54 +1614,55 @@ static void ceph_con_workfn(struct work_struct *work)
*/
static void con_fault(struct ceph_connection *con)
{
- dout("fault %p state %lu to peer %s\n",
- con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
+ dout("fault %p state %d to peer %s\n",
+ con, con->state, ceph_pr_addr(&con->peer_addr));
pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
- ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
+ ceph_pr_addr(&con->peer_addr), con->error_msg);
con->error_msg = NULL;
- WARN_ON(con->state != CON_STATE_CONNECTING &&
- con->state != CON_STATE_NEGOTIATING &&
- con->state != CON_STATE_OPEN);
+ WARN_ON(con->state == CEPH_CON_S_STANDBY ||
+ con->state == CEPH_CON_S_CLOSED);
- con_close_socket(con);
+ ceph_con_reset_protocol(con);
- if (con_flag_test(con, CON_FLAG_LOSSYTX)) {
+ if (ceph_con_flag_test(con, CEPH_CON_F_LOSSYTX)) {
dout("fault on LOSSYTX channel, marking CLOSED\n");
- con->state = CON_STATE_CLOSED;
+ con->state = CEPH_CON_S_CLOSED;
return;
}
- if (con->in_msg) {
- BUG_ON(con->in_msg->con != con);
- ceph_msg_put(con->in_msg);
- con->in_msg = NULL;
- }
-
/* Requeue anything that hasn't been acked */
list_splice_init(&con->out_sent, &con->out_queue);
/* If there are no messages queued or keepalive pending, place
* the connection in a STANDBY state */
if (list_empty(&con->out_queue) &&
- !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) {
+ !ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)) {
dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
- con_flag_clear(con, CON_FLAG_WRITE_PENDING);
- con->state = CON_STATE_STANDBY;
+ ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+ con->state = CEPH_CON_S_STANDBY;
} else {
/* retry after a delay. */
- con->state = CON_STATE_PREOPEN;
- if (con->delay == 0)
+ con->state = CEPH_CON_S_PREOPEN;
+ if (!con->delay) {
con->delay = BASE_DELAY_INTERVAL;
- else if (con->delay < MAX_DELAY_INTERVAL)
+ } else if (con->delay < MAX_DELAY_INTERVAL) {
con->delay *= 2;
- con_flag_set(con, CON_FLAG_BACKOFF);
+ if (con->delay > MAX_DELAY_INTERVAL)
+ con->delay = MAX_DELAY_INTERVAL;
+ }
+ ceph_con_flag_set(con, CEPH_CON_F_BACKOFF);
queue_con(con);
}
}
-
+void ceph_messenger_reset_nonce(struct ceph_messenger *msgr)
+{
+ u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000;
+ msgr->inst.addr.nonce = cpu_to_le32(nonce);
+ ceph_encode_my_addr(msgr);
+}
/*
* initialize a new messenger instance
@@ -3061,26 +1672,35 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
{
spin_lock_init(&msgr->global_seq_lock);
- if (myaddr)
- msgr->inst.addr = *myaddr;
+ if (myaddr) {
+ memcpy(&msgr->inst.addr.in_addr, &myaddr->in_addr,
+ sizeof(msgr->inst.addr.in_addr));
+ ceph_addr_set_port(&msgr->inst.addr, 0);
+ }
+
+ /*
+ * Since nautilus, clients are identified using type ANY.
+ * For msgr1, ceph_encode_banner_addr() munges it to NONE.
+ */
+ msgr->inst.addr.type = CEPH_ENTITY_ADDR_TYPE_ANY;
- /* select a random nonce */
- msgr->inst.addr.type = 0;
- get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
- encode_my_addr(msgr);
+ /* generate a random non-zero nonce */
+ do {
+ get_random_bytes(&msgr->inst.addr.nonce,
+ sizeof(msgr->inst.addr.nonce));
+ } while (!msgr->inst.addr.nonce);
+ ceph_encode_my_addr(msgr);
atomic_set(&msgr->stopping, 0);
write_pnet(&msgr->net, get_net(current->nsproxy->net_ns));
dout("%s %p\n", __func__, msgr);
}
-EXPORT_SYMBOL(ceph_messenger_init);
void ceph_messenger_fini(struct ceph_messenger *msgr)
{
put_net(read_pnet(&msgr->net));
}
-EXPORT_SYMBOL(ceph_messenger_fini);
static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con)
{
@@ -3094,17 +1714,20 @@ static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con)
static void clear_standby(struct ceph_connection *con)
{
/* come back from STANDBY? */
- if (con->state == CON_STATE_STANDBY) {
- dout("clear_standby %p and ++connect_seq\n", con);
- con->state = CON_STATE_PREOPEN;
- con->connect_seq++;
- WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING));
- WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING));
+ if (con->state == CEPH_CON_S_STANDBY) {
+ dout("clear_standby %p\n", con);
+ con->state = CEPH_CON_S_PREOPEN;
+ if (!ceph_msgr2(from_msgr(con->msgr)))
+ con->v1.connect_seq++;
+ WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING));
+ WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING));
}
}
/*
* Queue up an outgoing message on the given connection.
+ *
+ * Consumes a ref on @msg.
*/
void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
{
@@ -3115,7 +1738,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
mutex_lock(&con->mutex);
- if (con->state == CON_STATE_CLOSED) {
+ if (con->state == CEPH_CON_S_CLOSED) {
dout("con_send %p closed, dropping %p\n", con, msg);
ceph_msg_put(msg);
mutex_unlock(&con->mutex);
@@ -3138,7 +1761,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
/* if there wasn't anything waiting to send before, queue
* new work */
- if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
+ if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING))
queue_con(con);
}
EXPORT_SYMBOL(ceph_con_send);
@@ -3156,36 +1779,30 @@ void ceph_msg_revoke(struct ceph_msg *msg)
}
mutex_lock(&con->mutex);
- if (!list_empty(&msg->list_head)) {
- dout("%s %p msg %p - was on queue\n", __func__, con, msg);
- list_del_init(&msg->list_head);
- msg->hdr.seq = 0;
-
- ceph_msg_put(msg);
+ if (list_empty(&msg->list_head)) {
+ WARN_ON(con->out_msg == msg);
+ dout("%s con %p msg %p not linked\n", __func__, con, msg);
+ mutex_unlock(&con->mutex);
+ return;
}
+
+ dout("%s con %p msg %p was linked\n", __func__, con, msg);
+ msg->hdr.seq = 0;
+ ceph_msg_remove(msg);
+
if (con->out_msg == msg) {
- BUG_ON(con->out_skip);
- /* footer */
- if (con->out_msg_done) {
- con->out_skip += con_out_kvec_skip(con);
- } else {
- BUG_ON(!msg->data_length);
- con->out_skip += sizeof_footer(con);
- }
- /* data, middle, front */
- if (msg->data_length)
- con->out_skip += msg->cursor.total_resid;
- if (msg->middle)
- con->out_skip += con_out_kvec_skip(con);
- con->out_skip += con_out_kvec_skip(con);
-
- dout("%s %p msg %p - was sending, will write %d skip %d\n",
- __func__, con, msg, con->out_kvec_bytes, con->out_skip);
- msg->hdr.seq = 0;
+ WARN_ON(con->state != CEPH_CON_S_OPEN);
+ dout("%s con %p msg %p was sending\n", __func__, con, msg);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_revoke(con, msg);
+ else
+ ceph_con_v1_revoke(con, msg);
+ ceph_msg_put(con->out_msg);
con->out_msg = NULL;
- ceph_msg_put(msg);
+ } else {
+ dout("%s con %p msg %p not current, out_msg %p\n", __func__,
+ con, msg, con->out_msg);
}
-
mutex_unlock(&con->mutex);
}
@@ -3203,25 +1820,17 @@ void ceph_msg_revoke_incoming(struct ceph_msg *msg)
mutex_lock(&con->mutex);
if (con->in_msg == msg) {
- unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
- unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len);
- unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);
-
- /* skip rest of message */
- dout("%s %p msg %p revoked\n", __func__, con, msg);
- con->in_base_pos = con->in_base_pos -
- sizeof(struct ceph_msg_header) -
- front_len -
- middle_len -
- data_len -
- sizeof(struct ceph_msg_footer);
+ WARN_ON(con->state != CEPH_CON_S_OPEN);
+ dout("%s con %p msg %p was recving\n", __func__, con, msg);
+ if (ceph_msgr2(from_msgr(con->msgr)))
+ ceph_con_v2_revoke_incoming(con);
+ else
+ ceph_con_v1_revoke_incoming(con);
ceph_msg_put(con->in_msg);
con->in_msg = NULL;
- con->in_tag = CEPH_MSGR_TAG_READY;
- con->in_seq++;
} else {
- dout("%s %p in_msg %p msg %p no-op\n",
- __func__, con, con->in_msg, msg);
+ dout("%s con %p msg %p not current, in_msg %p\n", __func__,
+ con, msg, con->in_msg);
}
mutex_unlock(&con->mutex);
}
@@ -3234,9 +1843,10 @@ void ceph_con_keepalive(struct ceph_connection *con)
dout("con_keepalive %p\n", con);
mutex_lock(&con->mutex);
clear_standby(con);
+ ceph_con_flag_set(con, CEPH_CON_F_KEEPALIVE_PENDING);
mutex_unlock(&con->mutex);
- if (con_flag_test_and_set(con, CON_FLAG_KEEPALIVE_PENDING) == 0 &&
- con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
+
+ if (!ceph_con_flag_test_and_set(con, CEPH_CON_F_WRITE_PENDING))
queue_con(con);
}
EXPORT_SYMBOL(ceph_con_keepalive);
@@ -3256,49 +1866,37 @@ bool ceph_con_keepalive_expired(struct ceph_connection *con,
return false;
}
-static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type)
+static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg)
{
- struct ceph_msg_data *data;
-
- if (WARN_ON(!ceph_msg_data_type_valid(type)))
- return NULL;
-
- data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS);
- if (!data)
- return NULL;
-
- data->type = type;
- INIT_LIST_HEAD(&data->links);
-
- return data;
+ BUG_ON(msg->num_data_items >= msg->max_data_items);
+ return &msg->data[msg->num_data_items++];
}
static void ceph_msg_data_destroy(struct ceph_msg_data *data)
{
- if (!data)
- return;
-
- WARN_ON(!list_empty(&data->links));
- if (data->type == CEPH_MSG_DATA_PAGELIST)
+ if (data->type == CEPH_MSG_DATA_PAGES && data->own_pages) {
+ int num_pages = calc_pages_for(data->alignment, data->length);
+ ceph_release_page_vector(data->pages, num_pages);
+ } else if (data->type == CEPH_MSG_DATA_PAGELIST) {
ceph_pagelist_release(data->pagelist);
- kmem_cache_free(ceph_msg_data_cache, data);
+ }
}
void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
- size_t length, size_t alignment)
+ size_t length, size_t alignment, bool own_pages)
{
struct ceph_msg_data *data;
BUG_ON(!pages);
BUG_ON(!length);
- data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES);
- BUG_ON(!data);
+ data = ceph_msg_data_add(msg);
+ data->type = CEPH_MSG_DATA_PAGES;
data->pages = pages;
data->length = length;
data->alignment = alignment & ~PAGE_MASK;
+ data->own_pages = own_pages;
- list_add_tail(&data->links, &msg->data);
msg->data_length += length;
}
EXPORT_SYMBOL(ceph_msg_data_add_pages);
@@ -3311,11 +1909,11 @@ void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
BUG_ON(!pagelist);
BUG_ON(!pagelist->length);
- data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST);
- BUG_ON(!data);
+ data = ceph_msg_data_add(msg);
+ data->type = CEPH_MSG_DATA_PAGELIST;
+ refcount_inc(&pagelist->refcnt);
data->pagelist = pagelist;
- list_add_tail(&data->links, &msg->data);
msg->data_length += pagelist->length;
}
EXPORT_SYMBOL(ceph_msg_data_add_pagelist);
@@ -3326,12 +1924,11 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
{
struct ceph_msg_data *data;
- data = ceph_msg_data_create(CEPH_MSG_DATA_BIO);
- BUG_ON(!data);
+ data = ceph_msg_data_add(msg);
+ data->type = CEPH_MSG_DATA_BIO;
data->bio_pos = *bio_pos;
data->bio_length = length;
- list_add_tail(&data->links, &msg->data);
msg->data_length += length;
}
EXPORT_SYMBOL(ceph_msg_data_add_bio);
@@ -3342,21 +1939,32 @@ void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
{
struct ceph_msg_data *data;
- data = ceph_msg_data_create(CEPH_MSG_DATA_BVECS);
- BUG_ON(!data);
+ data = ceph_msg_data_add(msg);
+ data->type = CEPH_MSG_DATA_BVECS;
data->bvec_pos = *bvec_pos;
- list_add_tail(&data->links, &msg->data);
msg->data_length += bvec_pos->iter.bi_size;
}
EXPORT_SYMBOL(ceph_msg_data_add_bvecs);
+void ceph_msg_data_add_iter(struct ceph_msg *msg,
+ struct iov_iter *iter)
+{
+ struct ceph_msg_data *data;
+
+ data = ceph_msg_data_add(msg);
+ data->type = CEPH_MSG_DATA_ITER;
+ data->iter = *iter;
+
+ msg->data_length += iov_iter_count(&data->iter);
+}
+
/*
* construct a new message with given type, size
* the new msg has a ref count of 1.
*/
-struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
- bool can_fail)
+struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items,
+ gfp_t flags, bool can_fail)
{
struct ceph_msg *m;
@@ -3370,11 +1978,10 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
INIT_LIST_HEAD(&m->list_head);
kref_init(&m->kref);
- INIT_LIST_HEAD(&m->data);
/* front */
if (front_len) {
- m->front.iov_base = ceph_kvmalloc(front_len, flags);
+ m->front.iov_base = kvmalloc(front_len, flags);
if (m->front.iov_base == NULL) {
dout("ceph_msg_new can't allocate %d bytes\n",
front_len);
@@ -3385,6 +1992,15 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
}
m->front_alloc_len = m->front.iov_len = front_len;
+ if (max_data_items) {
+ m->data = kmalloc_array(max_data_items, sizeof(*m->data),
+ flags);
+ if (!m->data)
+ goto out2;
+
+ m->max_data_items = max_data_items;
+ }
+
dout("ceph_msg_new %p front %d\n", m, front_len);
return m;
@@ -3401,6 +2017,13 @@ out:
}
return NULL;
}
+EXPORT_SYMBOL(ceph_msg_new2);
+
+struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
+ bool can_fail)
+{
+ return ceph_msg_new2(type, front_len, 0, flags, can_fail);
+}
EXPORT_SYMBOL(ceph_msg_new);
/*
@@ -3441,9 +2064,9 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
* On error (ENOMEM, EAGAIN, ...),
* - con->in_msg == NULL
*/
-static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
+int ceph_con_in_msg_alloc(struct ceph_connection *con,
+ struct ceph_msg_header *hdr, int *skip)
{
- struct ceph_msg_header *hdr = &con->in_hdr;
int middle_len = le32_to_cpu(hdr->middle_len);
struct ceph_msg *msg;
int ret = 0;
@@ -3454,7 +2077,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
mutex_unlock(&con->mutex);
msg = con->ops->alloc_msg(con, hdr, skip);
mutex_lock(&con->mutex);
- if (con->state != CON_STATE_OPEN) {
+ if (con->state != CEPH_CON_S_OPEN) {
if (msg)
ceph_msg_put(msg);
return -EAGAIN;
@@ -3475,7 +2098,7 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
con->error_msg = "error allocating memory for incoming message";
return -ENOMEM;
}
- memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
+ memcpy(&con->in_msg->hdr, hdr, sizeof(*hdr));
if (middle_len && !con->in_msg->middle) {
ret = ceph_alloc_middle(con, con->in_msg);
@@ -3488,6 +2111,41 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
return ret;
}
+struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con)
+{
+ struct ceph_msg *msg;
+
+ if (list_empty(&con->out_queue))
+ return NULL;
+
+ msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
+ WARN_ON(msg->con != con);
+
+ /*
+ * Put the message on "sent" list using a ref from ceph_con_send().
+ * It is put when the message is acked or revoked.
+ */
+ list_move_tail(&msg->list_head, &con->out_sent);
+
+ /*
+ * Only assign outgoing seq # if we haven't sent this message
+ * yet. If it is requeued, resend with its original seq.
+ */
+ if (msg->needs_out_seq) {
+ msg->hdr.seq = cpu_to_le64(++con->out_seq);
+ msg->needs_out_seq = false;
+
+ if (con->ops->reencode_message)
+ con->ops->reencode_message(msg);
+ }
+
+ /*
+ * Get a ref for out_msg. It is put when we are done sending the
+ * message or in case of a fault.
+ */
+ WARN_ON(con->out_msg);
+ return con->out_msg = ceph_msg_get(msg);
+}
/*
* Free a generically kmalloc'd message.
@@ -3496,13 +2154,14 @@ static void ceph_msg_free(struct ceph_msg *m)
{
dout("%s %p\n", __func__, m);
kvfree(m->front.iov_base);
+ kfree(m->data);
kmem_cache_free(ceph_msg_cache, m);
}
static void ceph_msg_release(struct kref *kref)
{
struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
- struct ceph_msg_data *data, *next;
+ int i;
dout("%s %p\n", __func__, m);
WARN_ON(!list_empty(&m->list_head));
@@ -3515,11 +2174,8 @@ static void ceph_msg_release(struct kref *kref)
m->middle = NULL;
}
- list_for_each_entry_safe(data, next, &m->data, links) {
- list_del_init(&data->links);
- ceph_msg_data_destroy(data);
- }
- m->data_length = 0;
+ for (i = 0; i < m->num_data_items; i++)
+ ceph_msg_data_destroy(&m->data[i]);
if (m->pool)
ceph_msgpool_put(m->pool, m);
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
new file mode 100644
index 000000000000..c9e002d96319
--- /dev/null
+++ b/net/ceph/messenger_v1.c
@@ -0,0 +1,1620 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/ceph/ceph_debug.h>
+
+#include <linux/bvec.h>
+#include <linux/crc32c.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <net/sock.h>
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+
+/* static tag bytes (protocol control messages) */
+static char tag_msg = CEPH_MSGR_TAG_MSG;
+static char tag_ack = CEPH_MSGR_TAG_ACK;
+static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
+
+/*
+ * If @buf is NULL, discard up to @len bytes.
+ */
+static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
+{
+ struct kvec iov = {buf, len};
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
+
+ if (!buf)
+ msg.msg_flags |= MSG_TRUNC;
+
+ iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, len);
+ r = sock_recvmsg(sock, &msg, msg.msg_flags);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
+}
+
+static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
+ int page_offset, size_t length)
+{
+ struct bio_vec bvec;
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
+
+ BUG_ON(page_offset + length > PAGE_SIZE);
+ bvec_set_page(&bvec, page, length, page_offset);
+ iov_iter_bvec(&msg.msg_iter, ITER_DEST, &bvec, 1, length);
+ r = sock_recvmsg(sock, &msg, msg.msg_flags);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
+}
+
+/*
+ * write something. @more is true if caller will be sending more data
+ * shortly.
+ */
+static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
+ size_t kvlen, size_t len, bool more)
+{
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+ int r;
+
+ if (more)
+ msg.msg_flags |= MSG_MORE;
+ else
+ msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
+
+ r = kernel_sendmsg(sock, &msg, iov, kvlen, len);
+ if (r == -EAGAIN)
+ r = 0;
+ return r;
+}
+
+/*
+ * @more: MSG_MORE or 0.
+ */
+static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
+ int offset, size_t size, int more)
+{
+ struct msghdr msg = {
+ .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | more,
+ };
+ struct bio_vec bvec;
+ int ret;
+
+ /*
+ * MSG_SPLICE_PAGES cannot properly handle pages with page_count == 0,
+ * we need to fall back to sendmsg if that's the case.
+ *
+ * Same goes for slab pages: skb_can_coalesce() allows
+ * coalescing neighboring slab objects into a single frag which
+ * triggers one of hardened usercopy checks.
+ */
+ if (sendpage_ok(page))
+ msg.msg_flags |= MSG_SPLICE_PAGES;
+
+ bvec_set_page(&bvec, page, size, offset);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
+
+ ret = sock_sendmsg(sock, &msg);
+ if (ret == -EAGAIN)
+ ret = 0;
+
+ return ret;
+}
+
+static void con_out_kvec_reset(struct ceph_connection *con)
+{
+ BUG_ON(con->v1.out_skip);
+
+ con->v1.out_kvec_left = 0;
+ con->v1.out_kvec_bytes = 0;
+ con->v1.out_kvec_cur = &con->v1.out_kvec[0];
+}
+
+static void con_out_kvec_add(struct ceph_connection *con,
+ size_t size, void *data)
+{
+ int index = con->v1.out_kvec_left;
+
+ BUG_ON(con->v1.out_skip);
+ BUG_ON(index >= ARRAY_SIZE(con->v1.out_kvec));
+
+ con->v1.out_kvec[index].iov_len = size;
+ con->v1.out_kvec[index].iov_base = data;
+ con->v1.out_kvec_left++;
+ con->v1.out_kvec_bytes += size;
+}
+
+/*
+ * Chop off a kvec from the end. Return residual number of bytes for
+ * that kvec, i.e. how many bytes would have been written if the kvec
+ * hadn't been nuked.
+ */
+static int con_out_kvec_skip(struct ceph_connection *con)
+{
+ int skip = 0;
+
+ if (con->v1.out_kvec_bytes > 0) {
+ skip = con->v1.out_kvec_cur[con->v1.out_kvec_left - 1].iov_len;
+ BUG_ON(con->v1.out_kvec_bytes < skip);
+ BUG_ON(!con->v1.out_kvec_left);
+ con->v1.out_kvec_bytes -= skip;
+ con->v1.out_kvec_left--;
+ }
+
+ return skip;
+}
+
+static size_t sizeof_footer(struct ceph_connection *con)
+{
+ return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
+ sizeof(struct ceph_msg_footer) :
+ sizeof(struct ceph_msg_footer_old);
+}
+
+static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
+{
+ /* Initialize data cursor if it's not a sparse read */
+ u64 len = msg->sparse_read_total ? : data_len;
+
+ ceph_msg_data_cursor_init(&msg->cursor, msg, len);
+}
+
+/*
+ * Prepare footer for currently outgoing message, and finish things
+ * off. Assumes out_kvec* are already valid.. we just add on to the end.
+ */
+static void prepare_write_message_footer(struct ceph_connection *con,
+ struct ceph_msg *m)
+{
+ m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
+
+ dout("prepare_write_message_footer %p\n", con);
+ con_out_kvec_add(con, sizeof_footer(con), &m->footer);
+ if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
+ if (con->ops->sign_message)
+ con->ops->sign_message(m);
+ else
+ m->footer.sig = 0;
+ } else {
+ m->old_footer.flags = m->footer.flags;
+ }
+ con->v1.out_more = m->more_to_follow;
+ con->v1.out_msg_done = true;
+}
+
+/*
+ * Prepare headers for the next outgoing message.
+ */
+static void prepare_write_message(struct ceph_connection *con,
+ struct ceph_msg *m)
+{
+ u32 crc;
+
+ con_out_kvec_reset(con);
+ con->v1.out_msg_done = false;
+
+ /* Sneak an ack in there first? If we can get it into the same
+ * TCP packet that's a good thing. */
+ if (con->in_seq > con->in_seq_acked) {
+ con->in_seq_acked = con->in_seq;
+ con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+ con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
+ &con->v1.out_temp_ack);
+ }
+
+ dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
+ m, con->out_seq, le16_to_cpu(m->hdr.type),
+ le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
+ m->data_length);
+ WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
+ WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
+
+ /* tag + hdr + front + middle */
+ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
+ con_out_kvec_add(con, sizeof(con->v1.out_hdr), &con->v1.out_hdr);
+ con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
+
+ if (m->middle)
+ con_out_kvec_add(con, m->middle->vec.iov_len,
+ m->middle->vec.iov_base);
+
+ /* fill in hdr crc and finalize hdr */
+ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
+ m->hdr.crc = cpu_to_le32(crc);
+ memcpy(&con->v1.out_hdr, &m->hdr, sizeof(con->v1.out_hdr));
+
+ /* fill in front and middle crc, footer */
+ crc = crc32c(0, m->front.iov_base, m->front.iov_len);
+ m->footer.front_crc = cpu_to_le32(crc);
+ if (m->middle) {
+ crc = crc32c(0, m->middle->vec.iov_base,
+ m->middle->vec.iov_len);
+ m->footer.middle_crc = cpu_to_le32(crc);
+ } else
+ m->footer.middle_crc = 0;
+ dout("%s front_crc %u middle_crc %u\n", __func__,
+ le32_to_cpu(m->footer.front_crc),
+ le32_to_cpu(m->footer.middle_crc));
+ m->footer.flags = 0;
+
+ /* is there a data payload? */
+ m->footer.data_crc = 0;
+ if (m->data_length) {
+ prepare_message_data(m, m->data_length);
+ con->v1.out_more = 1; /* data + footer will follow */
+ } else {
+ /* no, queue up footer too and be done */
+ prepare_write_message_footer(con, m);
+ }
+
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Prepare an ack.
+ */
+static void prepare_write_ack(struct ceph_connection *con)
+{
+ dout("prepare_write_ack %p %llu -> %llu\n", con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ con_out_kvec_reset(con);
+
+ con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
+
+ con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
+ &con->v1.out_temp_ack);
+
+ con->v1.out_more = 1; /* more will follow.. eventually.. */
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Prepare to share the seq during handshake
+ */
+static void prepare_write_seq(struct ceph_connection *con)
+{
+ dout("prepare_write_seq %p %llu -> %llu\n", con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ con_out_kvec_reset(con);
+
+ con->v1.out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con_out_kvec_add(con, sizeof(con->v1.out_temp_ack),
+ &con->v1.out_temp_ack);
+
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Prepare to write keepalive byte.
+ */
+static void prepare_write_keepalive(struct ceph_connection *con)
+{
+ dout("prepare_write_keepalive %p\n", con);
+ con_out_kvec_reset(con);
+ if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
+ struct timespec64 now;
+
+ ktime_get_real_ts64(&now);
+ con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
+ ceph_encode_timespec64(&con->v1.out_temp_keepalive2, &now);
+ con_out_kvec_add(con, sizeof(con->v1.out_temp_keepalive2),
+ &con->v1.out_temp_keepalive2);
+ } else {
+ con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
+ }
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+/*
+ * Connection negotiation.
+ */
+
+static int get_connect_authorizer(struct ceph_connection *con)
+{
+ struct ceph_auth_handshake *auth;
+ int auth_proto;
+
+ if (!con->ops->get_authorizer) {
+ con->v1.auth = NULL;
+ con->v1.out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
+ con->v1.out_connect.authorizer_len = 0;
+ return 0;
+ }
+
+ auth = con->ops->get_authorizer(con, &auth_proto, con->v1.auth_retry);
+ if (IS_ERR(auth))
+ return PTR_ERR(auth);
+
+ con->v1.auth = auth;
+ con->v1.out_connect.authorizer_protocol = cpu_to_le32(auth_proto);
+ con->v1.out_connect.authorizer_len =
+ cpu_to_le32(auth->authorizer_buf_len);
+ return 0;
+}
+
+/*
+ * We connected to a peer and are saying hello.
+ */
+static void prepare_write_banner(struct ceph_connection *con)
+{
+ con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
+ con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
+ &con->msgr->my_enc_addr);
+
+ con->v1.out_more = 0;
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+static void __prepare_write_connect(struct ceph_connection *con)
+{
+ con_out_kvec_add(con, sizeof(con->v1.out_connect),
+ &con->v1.out_connect);
+ if (con->v1.auth)
+ con_out_kvec_add(con, con->v1.auth->authorizer_buf_len,
+ con->v1.auth->authorizer_buf);
+
+ con->v1.out_more = 0;
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+}
+
+static int prepare_write_connect(struct ceph_connection *con)
+{
+ unsigned int global_seq = ceph_get_global_seq(con->msgr, 0);
+ int proto;
+ int ret;
+
+ switch (con->peer_name.type) {
+ case CEPH_ENTITY_TYPE_MON:
+ proto = CEPH_MONC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_OSD:
+ proto = CEPH_OSDC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_MDS:
+ proto = CEPH_MDSC_PROTOCOL;
+ break;
+ default:
+ BUG();
+ }
+
+ dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
+ con->v1.connect_seq, global_seq, proto);
+
+ con->v1.out_connect.features =
+ cpu_to_le64(from_msgr(con->msgr)->supported_features);
+ con->v1.out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+ con->v1.out_connect.connect_seq = cpu_to_le32(con->v1.connect_seq);
+ con->v1.out_connect.global_seq = cpu_to_le32(global_seq);
+ con->v1.out_connect.protocol_version = cpu_to_le32(proto);
+ con->v1.out_connect.flags = 0;
+
+ ret = get_connect_authorizer(con);
+ if (ret)
+ return ret;
+
+ __prepare_write_connect(con);
+ return 0;
+}
+
+/*
+ * write as much of pending kvecs to the socket as we can.
+ * 1 -> done
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_kvec(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("write_partial_kvec %p %d left\n", con, con->v1.out_kvec_bytes);
+ while (con->v1.out_kvec_bytes > 0) {
+ ret = ceph_tcp_sendmsg(con->sock, con->v1.out_kvec_cur,
+ con->v1.out_kvec_left,
+ con->v1.out_kvec_bytes,
+ con->v1.out_more);
+ if (ret <= 0)
+ goto out;
+ con->v1.out_kvec_bytes -= ret;
+ if (!con->v1.out_kvec_bytes)
+ break; /* done */
+
+ /* account for full iov entries consumed */
+ while (ret >= con->v1.out_kvec_cur->iov_len) {
+ BUG_ON(!con->v1.out_kvec_left);
+ ret -= con->v1.out_kvec_cur->iov_len;
+ con->v1.out_kvec_cur++;
+ con->v1.out_kvec_left--;
+ }
+ /* and for a partially-consumed entry */
+ if (ret) {
+ con->v1.out_kvec_cur->iov_len -= ret;
+ con->v1.out_kvec_cur->iov_base += ret;
+ }
+ }
+ con->v1.out_kvec_left = 0;
+ ret = 1;
+out:
+ dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+ con->v1.out_kvec_bytes, con->v1.out_kvec_left, ret);
+ return ret; /* done! */
+}
+
+/*
+ * Write as much message data payload as we can.  If we finish, queue
+ * up the footer.
+ *  1 -> done, footer is now queued in out_kvec[].
+ *  0 -> socket full, but more to do
+ * <0 -> error (-EINVAL if the message has no data items)
+ */
+static int write_partial_message_data(struct ceph_connection *con,
+				      struct ceph_msg *msg)
+{
+	struct ceph_msg_data_cursor *cursor = &msg->cursor;
+	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+	u32 crc;
+
+	dout("%s %p msg %p\n", __func__, con, msg);
+
+	if (!msg->num_data_items)
+		return -EINVAL;
+
+	/*
+	 * Iterate through each page that contains data to be
+	 * written, and send as much as possible for each.
+	 *
+	 * If we are calculating the data crc (the default), we will
+	 * need to map the page.  If we have no pages, they have
+	 * been revoked, so use the zero page.
+	 */
+	crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;	/* resume running crc */
+	while (cursor->total_resid) {
+		struct page *page;
+		size_t page_offset;
+		size_t length;
+		int ret;
+
+		if (!cursor->resid) {
+			ceph_msg_data_advance(cursor, 0);
+			continue;
+		}
+
+		page = ceph_msg_data_next(cursor, &page_offset, &length);
+		ret = ceph_tcp_sendpage(con->sock, page, page_offset, length,
+					MSG_MORE);
+		if (ret <= 0) {
+			if (do_datacrc)	/* park the running crc in the footer until we are called again */
+				msg->footer.data_crc = cpu_to_le32(crc);
+
+			return ret;
+		}
+		if (do_datacrc && cursor->need_crc)	/* crc spans length, not just the ret bytes sent */
+			crc = ceph_crc32c_page(crc, page, page_offset, length);
+		ceph_msg_data_advance(cursor, (size_t)ret);
+	}
+
+	dout("%s %p msg %p done\n", __func__, con, msg);
+
+	/* prepare and queue up footer, too */
+	if (do_datacrc)
+		msg->footer.data_crc = cpu_to_le32(crc);
+	else
+		msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+	con_out_kvec_reset(con);
+	prepare_write_message_footer(con, msg);
+
+	return 1;	/* must return > 0 to indicate success */
+}
+
+/*
+ * Emit con->v1.out_skip bytes of zeros from ceph_zero_page, at most one
+ * page per send.
+ *  1  -> all requested zeros were sent
+ * <=0 -> ceph_tcp_sendpage() result, passed through unchanged
+ */
+static int write_partial_skip(struct ceph_connection *con)
+{
+	dout("%s %p %d left\n", __func__, con, con->v1.out_skip);
+	while (con->v1.out_skip > 0) {
+		size_t chunk = min(con->v1.out_skip, (int)PAGE_SIZE);
+		int sent = ceph_tcp_sendpage(con->sock, ceph_zero_page, 0, chunk,
+					     MSG_MORE);
+
+		if (sent <= 0)
+			return sent;
+		con->v1.out_skip -= sent;
+	}
+
+	return 1;
+}
+
+/*
+ * Prepare to read connection handshake, or an ack (in_base_pos restarts at 0).
+ */
+static void prepare_read_banner(struct ceph_connection *con)
+{
+	con->v1.in_base_pos = 0;
+	dout("prepare_read_banner %p\n", con);
+}
+
+static void prepare_read_connect(struct ceph_connection *con)
+{
+	con->v1.in_base_pos = 0;	/* nothing of the connect reply read yet */
+	dout("prepare_read_connect %p\n", con);
+}
+
+static void prepare_read_ack(struct ceph_connection *con)
+{
+	con->v1.in_base_pos = 0;	/* nothing of the ack read yet */
+	dout("prepare_read_ack %p\n", con);
+}
+
+static void prepare_read_seq(struct ceph_connection *con)
+{
+	con->v1.in_tag = CEPH_MSGR_TAG_SEQ;	/* next object on the wire is the seq */
+	con->v1.in_base_pos = 0;
+	dout("prepare_read_seq %p\n", con);
+}
+
+static void prepare_read_tag(struct ceph_connection *con)
+{
+	con->v1.in_tag = CEPH_MSGR_TAG_READY;	/* READY: a tag byte is expected next */
+	con->v1.in_base_pos = 0;
+	dout("prepare_read_tag %p\n", con);
+}
+
+static void prepare_read_keepalive_ack(struct ceph_connection *con)
+{
+	con->v1.in_base_pos = 0;	/* nothing of the keepalive ack read yet */
+	dout("prepare_read_keepalive_ack %p\n", con);
+}
+
+/* Prepare to read a message: none may be in flight; the three CRCs restart at 0. */
+static int prepare_read_message(struct ceph_connection *con)
+{
+	dout("prepare_read_message %p\n", con);
+	BUG_ON(con->in_msg);
+	con->v1.in_base_pos = 0;
+	con->in_data_crc = 0;
+	con->in_middle_crc = 0;
+	con->in_front_crc = 0;
+	return 0;
+}
+
+static int read_partial(struct ceph_connection *con,
+			int end, int size, void *object)
+{
+	while (con->v1.in_base_pos < end) {
+		int missing = end - con->v1.in_base_pos;	/* bytes still to read */
+		void *dst = object + size - missing;	/* resume where we stopped */
+		int nread = ceph_tcp_recvmsg(con->sock, dst, missing);
+		if (nread <= 0)
+			return nread;
+		con->v1.in_base_pos += nread;
+	}
+	return 1;	/* in_base_pos has reached end */
+}
+
+/*
+ * Read all or part of the connect-side handshake on a new connection:
+ * the peer's banner, then two addresses, back to back ("end" accumulates).
+ * NOTE(review): re-entry after a short read decodes an already-read
+ * address again -- confirm ceph_decode_banner_addr() tolerates that.
+ */
+static int read_partial_banner(struct ceph_connection *con)
+{
+	int banner_len = strlen(CEPH_BANNER);
+	int end = banner_len;
+	int ret;
+
+	dout("read_partial_banner %p at %d\n", con, con->v1.in_base_pos);
+
+	/* peer's banner */
+	ret = read_partial(con, end, banner_len, con->v1.in_banner);
+	if (ret <= 0)
+		return ret;
+
+	end += sizeof(con->v1.actual_peer_addr);
+	ret = read_partial(con, end, sizeof(con->v1.actual_peer_addr),
+			   &con->v1.actual_peer_addr);
+	if (ret <= 0)
+		return ret;
+	ceph_decode_banner_addr(&con->v1.actual_peer_addr);
+
+	end += sizeof(con->v1.peer_addr_for_me);
+	ret = read_partial(con, end, sizeof(con->v1.peer_addr_for_me),
+			   &con->v1.peer_addr_for_me);
+	if (ret <= 0)
+		return ret;
+	ceph_decode_banner_addr(&con->v1.peer_addr_for_me);
+
+	return ret;
+}
+
+static int read_partial_connect(struct ceph_connection *con)
+{
+	int end = sizeof(con->v1.in_reply);
+	int auth_len;
+	int ret;
+
+	dout("read_partial_connect %p at %d\n", con, con->v1.in_base_pos);
+
+	/* fixed-size connect reply first */
+	ret = read_partial(con, end, sizeof(con->v1.in_reply),
+			   &con->v1.in_reply);
+	if (ret <= 0)
+		return ret;
+
+	/* then the authorizer reply, bounded by the buffer set aside for it */
+	if (con->v1.auth) {
+		auth_len = le32_to_cpu(con->v1.in_reply.authorizer_len);
+		if (auth_len > con->v1.auth->authorizer_reply_buf_len) {
+			pr_err("authorizer reply too big: %d > %zu\n", auth_len,
+			       con->v1.auth->authorizer_reply_buf_len);
+			return -EINVAL;
+		}
+
+		end += auth_len;
+		ret = read_partial(con, end, auth_len,
+				   con->v1.auth->authorizer_reply_buf);
+		if (ret <= 0)
+			return ret;
+	}
+
+	dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
+	     con, con->v1.in_reply.tag,
+	     le32_to_cpu(con->v1.in_reply.connect_seq),
+	     le32_to_cpu(con->v1.in_reply.global_seq));
+
+	return ret;
+}
+
+/*
+ * Check the received banner against CEPH_BANNER; 0 if it matches, else -1
+ * with con->error_msg set.
+ */
+static int verify_hello(struct ceph_connection *con)
+{
+	if (!memcmp(con->v1.in_banner, CEPH_BANNER, strlen(CEPH_BANNER)))
+		return 0;
+
+	pr_err("connect to %s got bad banner\n", ceph_pr_addr(&con->peer_addr));
+	con->error_msg = "protocol error, bad banner";
+	return -1;
+}
+
+/*
+ * Validate the banner exchange: check the banner string, check that the
+ * peer's self-reported address is the one we connected to, and pick up
+ * our own address from the peer if we do not have one yet.
+ */
+static int process_banner(struct ceph_connection *con)
+{
+	struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+	struct ceph_entity_addr *want = &con->peer_addr;
+	struct ceph_entity_addr *got = &con->v1.actual_peer_addr;
+
+	dout("process_banner on %p\n", con);
+
+	if (verify_hello(con) < 0)
+		return -1;
+
+	/*
+	 * The peer must be who we dialed.  One exception: a peer that does
+	 * not know its own ip yet reports a blank address; accept that as
+	 * long as the nonce is the one we expect.
+	 */
+	if (memcmp(want, got, sizeof(*want)) != 0 &&
+	    (!ceph_addr_is_blank(got) || got->nonce != want->nonce)) {
+		pr_warn("wrong peer, want %s/%u, got %s/%u\n",
+			ceph_pr_addr(want), le32_to_cpu(want->nonce),
+			ceph_pr_addr(got), le32_to_cpu(got->nonce));
+		con->error_msg = "wrong peer at address";
+		return -1;
+	}
+
+	/* our own address still blank?  adopt what the peer saw, port 0 */
+	if (!ceph_addr_is_blank(my_addr))
+		return 0;
+
+	memcpy(&my_addr->in_addr, &con->v1.peer_addr_for_me.in_addr,
+	       sizeof(con->v1.peer_addr_for_me.in_addr));
+	ceph_addr_set_port(my_addr, 0);
+	ceph_encode_my_addr(con->msgr);
+	dout("process_banner learned my addr is %s\n", ceph_pr_addr(my_addr));
+
+	return 0;
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+	u64 sup_feat = from_msgr(con->msgr)->supported_features;
+	u64 req_feat = from_msgr(con->msgr)->required_features;
+	u64 server_feat = le64_to_cpu(con->v1.in_reply.features);
+	int ret;
+
+	dout("process_connect on %p tag %d\n", con, con->v1.in_tag);
+
+	if (con->v1.auth) {
+		int len = le32_to_cpu(con->v1.in_reply.authorizer_len);
+
+		/*
+		 * Any connection that defines ->get_authorizer()
+		 * should also define ->add_authorizer_challenge() and
+		 * ->verify_authorizer_reply(), so neither callback is
+		 * checked for NULL here.
+		 * See get_connect_authorizer().
+		 */
+		if (con->v1.in_reply.tag ==
+				CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) {
+			ret = con->ops->add_authorizer_challenge(
+				con, con->v1.auth->authorizer_reply_buf, len);
+			if (ret < 0)
+				return ret;
+
+			con_out_kvec_reset(con);
+			__prepare_write_connect(con);
+			prepare_read_connect(con);
+			return 0;
+		}
+
+		if (len) {	/* verify only when the reply carries an authorizer */
+			ret = con->ops->verify_authorizer_reply(con);
+			if (ret < 0) {
+				con->error_msg = "bad authorize reply";
+				return ret;
+			}
+		}
+	}
+
+	switch (con->v1.in_reply.tag) {
+	case CEPH_MSGR_TAG_FEATURES:
+		pr_err("%s%lld %s feature set mismatch,"
+		       " my %llx < server's %llx, missing %llx\n",
+		       ENTITY_NAME(con->peer_name),
+		       ceph_pr_addr(&con->peer_addr),
+		       sup_feat, server_feat, server_feat & ~sup_feat);
+		con->error_msg = "missing required protocol features";
+		return -1;
+
+	case CEPH_MSGR_TAG_BADPROTOVER:
+		pr_err("%s%lld %s protocol version mismatch,"
+		       " my %d != server's %d\n",
+		       ENTITY_NAME(con->peer_name),
+		       ceph_pr_addr(&con->peer_addr),
+		       le32_to_cpu(con->v1.out_connect.protocol_version),
+		       le32_to_cpu(con->v1.in_reply.protocol_version));
+		con->error_msg = "protocol version mismatch";
+		return -1;
+
+	case CEPH_MSGR_TAG_BADAUTHORIZER:
+		con->v1.auth_retry++;
+		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+		     con->v1.auth_retry);
+		if (con->v1.auth_retry == 2) {	/* second rejection: give up */
+			con->error_msg = "connect authorization failure";
+			return -1;
+		}
+		con_out_kvec_reset(con);
+		ret = prepare_write_connect(con);
+		if (ret < 0)
+			return ret;
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_RESETSESSION:
+		/*
+		 * If we connected with a large connect_seq but the peer
+		 * has no record of a session with us (no connection, or
+		 * connect_seq == 0), they will send RESETSESSION to indicate
+		 * that they must have reset their session, and may have
+		 * dropped messages.
+		 */
+		dout("process_connect got RESET peer seq %u\n",
+		     le32_to_cpu(con->v1.in_reply.connect_seq));
+		pr_info("%s%lld %s session reset\n",
+			ENTITY_NAME(con->peer_name),
+			ceph_pr_addr(&con->peer_addr));
+		ceph_con_reset_session(con);
+		con_out_kvec_reset(con);
+		ret = prepare_write_connect(con);
+		if (ret < 0)
+			return ret;
+		prepare_read_connect(con);
+
+		/* Tell ceph about it; con->mutex is dropped around the callback. */
+		mutex_unlock(&con->mutex);
+		if (con->ops->peer_reset)
+			con->ops->peer_reset(con);
+		mutex_lock(&con->mutex);
+		if (con->state != CEPH_CON_S_V1_CONNECT_MSG)
+			return -EAGAIN;	/* state changed while unlocked */
+		break;
+
+	case CEPH_MSGR_TAG_RETRY_SESSION:
+		/*
+		 * If we sent a smaller connect_seq than the peer has, try
+		 * again with a larger value.
+		 */
+		dout("process_connect got RETRY_SESSION my seq %u, peer %u\n",
+		     le32_to_cpu(con->v1.out_connect.connect_seq),
+		     le32_to_cpu(con->v1.in_reply.connect_seq));
+		con->v1.connect_seq = le32_to_cpu(con->v1.in_reply.connect_seq);
+		con_out_kvec_reset(con);
+		ret = prepare_write_connect(con);
+		if (ret < 0)
+			return ret;
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_RETRY_GLOBAL:
+		/*
+		 * If we sent a smaller global_seq than the peer has, try
+		 * again with a larger value.
+		 */
+		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
+		     con->v1.peer_global_seq,
+		     le32_to_cpu(con->v1.in_reply.global_seq));
+		ceph_get_global_seq(con->msgr,
+				    le32_to_cpu(con->v1.in_reply.global_seq));
+		con_out_kvec_reset(con);
+		ret = prepare_write_connect(con);
+		if (ret < 0)
+			return ret;
+		prepare_read_connect(con);
+		break;
+
+	case CEPH_MSGR_TAG_SEQ:
+	case CEPH_MSGR_TAG_READY:
+		if (req_feat & ~server_feat) {
+			pr_err("%s%lld %s protocol feature mismatch,"
+			       " my required %llx > server's %llx, need %llx\n",
+			       ENTITY_NAME(con->peer_name),
+			       ceph_pr_addr(&con->peer_addr),
+			       req_feat, server_feat, req_feat & ~server_feat);
+			con->error_msg = "missing required protocol features";
+			return -1;
+		}
+
+		WARN_ON(con->state != CEPH_CON_S_V1_CONNECT_MSG);
+		con->state = CEPH_CON_S_OPEN;
+		con->v1.auth_retry = 0;	/* we authenticated; clear retry count */
+		con->v1.peer_global_seq =
+			le32_to_cpu(con->v1.in_reply.global_seq);
+		con->v1.connect_seq++;
+		con->peer_features = server_feat;
+		dout("process_connect got READY gseq %d cseq %d (%d)\n",
+		     con->v1.peer_global_seq,
+		     le32_to_cpu(con->v1.in_reply.connect_seq),
+		     con->v1.connect_seq);
+		WARN_ON(con->v1.connect_seq !=
+			le32_to_cpu(con->v1.in_reply.connect_seq));
+
+		if (con->v1.in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+			ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX);
+
+		con->delay = 0;      /* reset backoff memory */
+
+		if (con->v1.in_reply.tag == CEPH_MSGR_TAG_SEQ) {
+			prepare_write_seq(con);
+			prepare_read_seq(con);
+		} else {
+			prepare_read_tag(con);
+		}
+		break;
+
+	case CEPH_MSGR_TAG_WAIT:
+		/*
+		 * If there is a connection race (we are opening
+		 * connections to each other), one of us may just have
+		 * to WAIT.  This shouldn't happen if we are the
+		 * client.
+		 */
+		con->error_msg = "protocol error, got WAIT as client";
+		return -1;
+
+	default:
+		con->error_msg = "protocol error, garbage tag during connect";
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Read (part of) an ack into con->v1.in_temp_ack.  The ack is the only
+ * object in this read, so its size doubles as the end offset.
+ */
+static int read_partial_ack(struct ceph_connection *con)
+{
+	const int ack_len = sizeof(con->v1.in_temp_ack);
+
+	return read_partial(con, ack_len, ack_len, &con->v1.in_temp_ack);
+}
+
+/*
+ * We can finally discard anything that's been acked: sent messages for a
+ * plain ACK tag, requeued ones otherwise.  Then go back to reading a tag.
+ */
+static void process_ack(struct ceph_connection *con)
+{
+	u64 ack = le64_to_cpu(con->v1.in_temp_ack);
+
+	if (con->v1.in_tag != CEPH_MSGR_TAG_ACK)
+		ceph_con_discard_requeued(con, ack);
+	else
+		ceph_con_discard_sent(con, ack);
+	prepare_read_tag(con);
+}
+
+static int read_partial_message_chunk(struct ceph_connection *con,
+				      struct kvec *section,
+				      unsigned int sec_len, u32 *crc)
+{
+	BUG_ON(!section);
+
+	while (section->iov_len < sec_len) {
+		char *dst;
+		int want, got;
+
+		BUG_ON(section->iov_base == NULL);
+		dst = (char *)section->iov_base + section->iov_len;
+		want = sec_len - section->iov_len;
+		got = ceph_tcp_recvmsg(con->sock, dst, want);
+		if (got <= 0)
+			return got;
+		section->iov_len += got;	/* iov_len counts bytes received so far */
+	}
+	if (section->iov_len == sec_len)	/* crc the section once it is whole */
+		*crc = crc32c(*crc, section->iov_base, section->iov_len);
+	return 1;
+}
+
+static inline int read_partial_message_section(struct ceph_connection *con,
+					       struct kvec *section,
+					       unsigned int sec_len, u32 *crc)
+{
+	*crc = 0;	/* reset on every call; the chunk reader crcs the full section */
+	return read_partial_message_chunk(con, section, sec_len, crc);
+}
+
+static int read_partial_sparse_msg_extent(struct ceph_connection *con, u32 *crc)
+{
+	struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
+	bool do_bounce = ceph_test_opt(from_msgr(con->msgr), RXBOUNCE);
+
+	if (do_bounce && unlikely(!con->bounce_page)) {	/* allocated on first use */
+		con->bounce_page = alloc_page(GFP_NOIO);
+		if (!con->bounce_page) {
+			pr_err("failed to allocate bounce page\n");
+			return -ENOMEM;
+		}
+	}
+
+	while (cursor->sr_resid > 0) {	/* sr_resid: bytes left in the current extent */
+		struct page *page, *rpage;
+		size_t off, len;
+		int ret;
+
+		page = ceph_msg_data_next(cursor, &off, &len);
+		rpage = do_bounce ? con->bounce_page : page;	/* page we receive into */
+
+		/* clamp to what remains in extent */
+		len = min_t(int, len, cursor->sr_resid);
+		ret = ceph_tcp_recvpage(con->sock, rpage, (int)off, len);
+		if (ret <= 0)
+			return ret;
+		*crc = ceph_crc32c_page(*crc, rpage, off, ret);	/* crc what was received */
+		ceph_msg_data_advance(cursor, (size_t)ret);
+		cursor->sr_resid -= ret;
+		if (do_bounce)	/* copy bounce -> destination page, same offset */
+			memcpy_page(page, off, rpage, off, ret);
+	}
+	return 1;	/* extent fully read */
+}
+
+static int read_partial_sparse_msg_data(struct ceph_connection *con)
+{
+	struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
+	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+	u32 crc = 0;
+	int ret = 1;	/* stays 1 when neither read below is pending */
+
+	if (do_datacrc)
+		crc = con->in_data_crc;	/* resume running crc */
+
+	while (cursor->total_resid) {
+		if (con->v1.in_sr_kvec.iov_base)	/* a kvec read is pending */
+			ret = read_partial_message_chunk(con,
+							 &con->v1.in_sr_kvec,
+							 con->v1.in_sr_len,
+							 &crc);
+		else if (cursor->sr_resid > 0)	/* an extent read is pending */
+			ret = read_partial_sparse_msg_extent(con, &crc);
+		if (ret <= 0)
+			break;
+
+		memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec));
+		ret = con->ops->sparse_read(con, cursor,
+				(char **)&con->v1.in_sr_kvec.iov_base);
+		if (ret <= 0) {
+			ret = ret ? ret : 1; /* must return > 0 to indicate success */
+			break;
+		}
+		con->v1.in_sr_len = ret;	/* used as sec_len by the kvec read above */
+	}
+
+	if (do_datacrc)
+		con->in_data_crc = crc;	/* saved on every exit path */
+
+	return ret;
+}
+
+/*
+ * Receive message data directly into the pages handed out by the cursor.
+ * 1 -> all data read; <= 0 -> ceph_tcp_recvpage() result, passed through.
+ */
+static int read_partial_msg_data(struct ceph_connection *con)
+{
+	struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
+	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+	u32 crc = do_datacrc ? con->in_data_crc : 0;
+	int ret = 1;
+
+	while (cursor->total_resid) {
+		struct page *page;
+		size_t page_offset, length;
+
+		if (!cursor->resid) {
+			ceph_msg_data_advance(cursor, 0);
+			continue;
+		}
+
+		page = ceph_msg_data_next(cursor, &page_offset, &length);
+		ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
+		if (ret <= 0)
+			break;
+
+		if (do_datacrc)
+			crc = ceph_crc32c_page(crc, page, page_offset, ret);
+		ceph_msg_data_advance(cursor, (size_t)ret);
+	}
+
+	/* save the running crc whether we finished or stopped early */
+	if (do_datacrc)
+		con->in_data_crc = crc;
+
+	return ret > 0 ? 1 : ret;	/* must return > 0 to indicate success */
+}
+
+static int read_partial_msg_data_bounce(struct ceph_connection *con)
+{
+	struct ceph_msg_data_cursor *cursor = &con->in_msg->cursor;
+	u32 crc;
+
+	if (unlikely(!con->bounce_page)) {	/* allocated on first use */
+		con->bounce_page = alloc_page(GFP_NOIO);
+		if (!con->bounce_page) {
+			pr_err("failed to allocate bounce page\n");
+			return -ENOMEM;
+		}
+	}
+
+	crc = con->in_data_crc;
+	while (cursor->total_resid) {
+		struct page *dst_page;
+		size_t dst_off, chunk;
+		void *bounce;
+		int got;
+
+		if (!cursor->resid) {
+			ceph_msg_data_advance(cursor, 0);
+			continue;
+		}
+		dst_page = ceph_msg_data_next(cursor, &dst_off, &chunk);
+		got = ceph_tcp_recvpage(con->sock, con->bounce_page, 0, chunk);
+		if (got <= 0) {
+			con->in_data_crc = crc;
+			return got;
+		}
+		/* checksum the private copy, then publish it to the real page */
+		bounce = page_address(con->bounce_page);
+		crc = crc32c(crc, bounce, got);
+		memcpy_to_page(dst_page, dst_off, bounce, got);
+		ceph_msg_data_advance(cursor, got);
+	}
+	con->in_data_crc = crc;
+	return 1;	/* must return > 0 to indicate success */
+}
+
+/*
+ * Read (part of) a message: header, front, middle, data, footer; then verify.
+ */
+static int read_partial_message(struct ceph_connection *con)
+{
+	struct ceph_msg *m = con->in_msg;
+	int size;
+	int end;
+	int ret;
+	unsigned int front_len, middle_len, data_len;
+	bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
+	bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH);
+	u64 seq;
+	u32 crc;
+
+	dout("read_partial_message con %p msg %p\n", con, m);
+
+	/* header */
+	size = sizeof(con->v1.in_hdr);
+	end = size;
+	ret = read_partial(con, end, size, &con->v1.in_hdr);
+	if (ret <= 0)
+		return ret;
+
+	crc = crc32c(0, &con->v1.in_hdr, offsetof(struct ceph_msg_header, crc));
+	if (cpu_to_le32(crc) != con->v1.in_hdr.crc) {
+		pr_err("read_partial_message bad hdr crc %u != expected %u\n",
+		       crc, le32_to_cpu(con->v1.in_hdr.crc));
+		return -EBADMSG;
+	}
+
+	front_len = le32_to_cpu(con->v1.in_hdr.front_len);
+	if (front_len > CEPH_MSG_MAX_FRONT_LEN)
+		return -EIO;
+	middle_len = le32_to_cpu(con->v1.in_hdr.middle_len);
+	if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
+		return -EIO;
+	data_len = le32_to_cpu(con->v1.in_hdr.data_len);
+	if (data_len > CEPH_MSG_MAX_DATA_LEN)
+		return -EIO;
+
+	/* verify seq# */
+	seq = le64_to_cpu(con->v1.in_hdr.seq);
+	if ((s64)seq - (s64)con->in_seq < 1) {	/* not newer than in_seq: skip it */
+		pr_info("skipping %s%lld %s seq %lld expected %lld\n",
+			ENTITY_NAME(con->peer_name),
+			ceph_pr_addr(&con->peer_addr),
+			seq, con->in_seq + 1);
+		con->v1.in_base_pos = -front_len - middle_len - data_len -
+				      sizeof_footer(con);
+		con->v1.in_tag = CEPH_MSGR_TAG_READY;
+		return 1;
+	} else if ((s64)seq - (s64)con->in_seq > 1) {
+		pr_err("read_partial_message bad seq %lld expected %lld\n",
+		       seq, con->in_seq + 1);
+		con->error_msg = "bad message sequence # for incoming message";
+		return -EBADE;
+	}
+
+	/* allocate message? */
+	if (!con->in_msg) {
+		int skip = 0;
+
+		dout("got hdr type %d front %d data %d\n", con->v1.in_hdr.type,
+		     front_len, data_len);
+		ret = ceph_con_in_msg_alloc(con, &con->v1.in_hdr, &skip);
+		if (ret < 0)
+			return ret;
+
+		BUG_ON((!con->in_msg) ^ skip);	/* exactly one: allocated or skip */
+		if (skip) {
+			/* skip this message */
+			dout("alloc_msg said skip message\n");
+			con->v1.in_base_pos = -front_len - middle_len -
+					      data_len - sizeof_footer(con);
+			con->v1.in_tag = CEPH_MSGR_TAG_READY;
+			con->in_seq++;
+			return 1;
+		}
+
+		BUG_ON(!con->in_msg);
+		BUG_ON(con->in_msg->con != con);
+		m = con->in_msg;
+		m->front.iov_len = 0;    /* haven't read it yet */
+		if (m->middle)
+			m->middle->vec.iov_len = 0;
+
+		/* prepare for data payload, if any */
+
+		if (data_len)
+			prepare_message_data(con->in_msg, data_len);
+	}
+
+	/* front */
+	ret = read_partial_message_section(con, &m->front, front_len,
+					   &con->in_front_crc);
+	if (ret <= 0)
+		return ret;
+
+	/* middle */
+	if (m->middle) {
+		ret = read_partial_message_section(con, &m->middle->vec,
+						   middle_len,
+						   &con->in_middle_crc);
+		if (ret <= 0)
+			return ret;
+	}
+
+	/* (page) data */
+	if (data_len) {
+		if (!m->num_data_items)
+			return -EIO;
+
+		if (m->sparse_read_total)
+			ret = read_partial_sparse_msg_data(con);
+		else if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE))
+			ret = read_partial_msg_data_bounce(con);
+		else
+			ret = read_partial_msg_data(con);
+		if (ret <= 0)
+			return ret;
+	}
+
+	/* footer */
+	size = sizeof_footer(con);
+	end += size;
+	ret = read_partial(con, end, size, &m->footer);
+	if (ret <= 0)
+		return ret;
+
+	if (!need_sign) {
+		m->footer.flags = m->old_footer.flags;
+		m->footer.sig = 0;
+	}
+
+	dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
+	     m, front_len, m->footer.front_crc, middle_len,
+	     m->footer.middle_crc, data_len, m->footer.data_crc);
+
+	/* crc ok?  footer crcs are little-endian; convert before compare/print */
+	if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
+		pr_err("read_partial_message %p front crc %u != exp. %u\n",
+		       m, con->in_front_crc, le32_to_cpu(m->footer.front_crc));
+		return -EBADMSG;
+	}
+	if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
+		pr_err("read_partial_message %p middle crc %u != exp %u\n",
+		       m, con->in_middle_crc, le32_to_cpu(m->footer.middle_crc));
+		return -EBADMSG;
+	}
+	if (do_datacrc &&
+	    (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
+	    con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
+		pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
+		       con->in_data_crc, le32_to_cpu(m->footer.data_crc));
+		return -EBADMSG;
+	}
+
+	if (need_sign && con->ops->check_message_signature &&
+	    con->ops->check_message_signature(m)) {
+		pr_err("read_partial_message %p signature check failed\n", m);
+		return -EBADMSG;
+	}
+
+	return 1; /* done! */
+}
+
+static int read_keepalive_ack(struct ceph_connection *con)
+{
+	/* NOTE(review): ts is on the stack; a short read loses the earlier bytes -- verify */
+	struct ceph_timespec ts;
+	int ret = read_partial(con, sizeof(ts), sizeof(ts), &ts);
+	if (ret <= 0)
+		return ret;
+	ceph_decode_timespec64(&con->last_keepalive_ack, &ts);
+	prepare_read_tag(con);
+	return 1;
+}
+
+/*
+ * Read what we can from the socket, dispatching on con->state and in_tag.
+ */
+int ceph_con_v1_try_read(struct ceph_connection *con)
+{
+	int ret = -1;
+
+more:
+	dout("try_read start %p state %d\n", con, con->state);
+	if (con->state != CEPH_CON_S_V1_BANNER &&
+	    con->state != CEPH_CON_S_V1_CONNECT_MSG &&
+	    con->state != CEPH_CON_S_OPEN)
+		return 0;
+
+	BUG_ON(!con->sock);
+
+	dout("try_read tag %d in_base_pos %d\n", con->v1.in_tag,
+	     con->v1.in_base_pos);
+
+	if (con->state == CEPH_CON_S_V1_BANNER) {
+		ret = read_partial_banner(con);
+		if (ret <= 0)
+			goto out;
+		ret = process_banner(con);
+		if (ret < 0)
+			goto out;
+
+		con->state = CEPH_CON_S_V1_CONNECT_MSG;
+
+		/*
+		 * Received banner is good, exchange connection info.
+		 * Do not reset out_kvec, as sending our banner raced
+		 * with receiving peer banner after connect completed.
+		 */
+		ret = prepare_write_connect(con);
+		if (ret < 0)
+			goto out;
+		prepare_read_connect(con);
+
+		/* Send connection info before awaiting response */
+		goto out;
+	}
+
+	if (con->state == CEPH_CON_S_V1_CONNECT_MSG) {
+		ret = read_partial_connect(con);
+		if (ret <= 0)
+			goto out;
+		ret = process_connect(con);
+		if (ret < 0)
+			goto out;
+		goto more;	/* re-dispatch on the (possibly new) state */
+	}
+
+	WARN_ON(con->state != CEPH_CON_S_OPEN);
+
+	if (con->v1.in_base_pos < 0) {
+		/*
+		 * skipping + discarding content (-in_base_pos bytes remain).
+		 */
+		ret = ceph_tcp_recvmsg(con->sock, NULL, -con->v1.in_base_pos);
+		if (ret <= 0)
+			goto out;
+		dout("skipped %d / %d bytes\n", ret, -con->v1.in_base_pos);
+		con->v1.in_base_pos += ret;
+		if (con->v1.in_base_pos)
+			goto more;
+	}
+	if (con->v1.in_tag == CEPH_MSGR_TAG_READY) {
+		/*
+		 * what's next?  read the one-byte tag.
+		 */
+		ret = ceph_tcp_recvmsg(con->sock, &con->v1.in_tag, 1);
+		if (ret <= 0)
+			goto out;
+		dout("try_read got tag %d\n", con->v1.in_tag);
+		switch (con->v1.in_tag) {
+		case CEPH_MSGR_TAG_MSG:
+			prepare_read_message(con);
+			break;
+		case CEPH_MSGR_TAG_ACK:
+			prepare_read_ack(con);
+			break;
+		case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
+			prepare_read_keepalive_ack(con);
+			break;
+		case CEPH_MSGR_TAG_CLOSE:
+			ceph_con_close_socket(con);
+			con->state = CEPH_CON_S_CLOSED;
+			goto out;
+		default:
+			goto bad_tag;
+		}
+	}
+	if (con->v1.in_tag == CEPH_MSGR_TAG_MSG) {
+		ret = read_partial_message(con);
+		if (ret <= 0) {
+			switch (ret) {
+			case -EBADMSG:
+				con->error_msg = "bad crc/signature";
+				fallthrough;
+			case -EBADE:
+				ret = -EIO;	/* both are reported as -EIO */
+				break;
+			case -EIO:
+				con->error_msg = "io error";
+				break;
+			}
+			goto out;
+		}
+		if (con->v1.in_tag == CEPH_MSGR_TAG_READY)	/* message is being skipped */
+			goto more;
+		ceph_con_process_message(con);
+		if (con->state == CEPH_CON_S_OPEN)
+			prepare_read_tag(con);
+		goto more;
+	}
+	if (con->v1.in_tag == CEPH_MSGR_TAG_ACK ||
+	    con->v1.in_tag == CEPH_MSGR_TAG_SEQ) {
+		/*
+		 * the final handshake seq exchange is semantically
+		 * equivalent to an ACK
+		 */
+		ret = read_partial_ack(con);
+		if (ret <= 0)
+			goto out;
+		process_ack(con);
+		goto more;
+	}
+	if (con->v1.in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
+		ret = read_keepalive_ack(con);
+		if (ret <= 0)
+			goto out;
+		goto more;
+	}
+
+out:
+	dout("try_read done on %p ret %d\n", con, ret);
+	return ret;
+
+bad_tag:
+	pr_err("try_read bad tag %d\n", con->v1.in_tag);
+	con->error_msg = "protocol error, garbage tag";
+	ret = -1;
+	goto out;
+}
+
+/*
+ * Write something to the socket.  Called in a worker thread when the
+ * socket appears to be writeable and we have something ready to send.
+ * Returns 0 when there is nothing (more) to write or the state is not writable.
+ */
+int ceph_con_v1_try_write(struct ceph_connection *con)
+{
+	struct ceph_msg *msg;
+	int ret = 1;
+
+	dout("try_write start %p state %d\n", con, con->state);
+	if (con->state != CEPH_CON_S_PREOPEN &&
+	    con->state != CEPH_CON_S_V1_BANNER &&
+	    con->state != CEPH_CON_S_V1_CONNECT_MSG &&
+	    con->state != CEPH_CON_S_OPEN)
+		return 0;
+
+	/* open the socket first? */
+	if (con->state == CEPH_CON_S_PREOPEN) {
+		BUG_ON(con->sock);
+		con->state = CEPH_CON_S_V1_BANNER;
+
+		con_out_kvec_reset(con);
+		prepare_write_banner(con);
+		prepare_read_banner(con);
+
+		BUG_ON(con->in_msg);
+		con->v1.in_tag = CEPH_MSGR_TAG_READY;
+		dout("try_write initiating connect on %p new state %d\n",
+		     con, con->state);
+		ret = ceph_tcp_connect(con);
+		if (ret < 0) {
+			con->error_msg = "connect error";
+			goto out;
+		}
+	}
+
+more:
+	dout("try_write out_kvec_bytes %d\n", con->v1.out_kvec_bytes);
+	BUG_ON(!con->sock);
+
+	/* kvec data queued? */
+	if (con->v1.out_kvec_left) {
+		ret = write_partial_kvec(con);
+		if (ret <= 0)
+			goto out;
+	}
+	if (con->v1.out_skip) {	/* zero-fill owed for a revoked message */
+		ret = write_partial_skip(con);
+		if (ret <= 0)
+			goto out;
+	}
+
+	/* msg pages? */
+	msg = con->out_msg;
+	if (msg) {
+		if (con->v1.out_msg_done) {
+			ceph_msg_put(msg);
+			con->out_msg = NULL;   /* we're done with this one */
+			goto do_next;
+		}
+
+		ret = write_partial_message_data(con, msg);
+		if (ret == 1)
+			goto more;  /* we need to send the footer, too! */
+		if (ret == 0)
+			goto out;
+		if (ret < 0) {
+			dout("try_write write_partial_message_data err %d\n",
+			     ret);
+			goto out;
+		}
+	}
+
+do_next:
+	if (con->state == CEPH_CON_S_OPEN) {
+		if (ceph_con_flag_test_and_clear(con,
+				CEPH_CON_F_KEEPALIVE_PENDING)) {
+			prepare_write_keepalive(con);
+			goto more;
+		}
+		/* is anything else pending? */
+		if ((msg = ceph_con_get_out_msg(con)) != NULL) {
+			prepare_write_message(con, msg);
+			goto more;
+		}
+		if (con->in_seq > con->in_seq_acked) {	/* received messages not yet acked */
+			prepare_write_ack(con);
+			goto more;
+		}
+	}
+
+	/* Nothing to do! */
+	ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+	dout("try_write nothing else to write.\n");
+	ret = 0;
+out:
+	dout("try_write done on %p ret %d\n", con, ret);
+	return ret;
+}
+
+void ceph_con_v1_revoke(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	WARN_ON(con->v1.out_skip);	/* no earlier zero-fill may still be owed */
+	/* footer (presumably already queued as a kvec once out_msg_done -- confirm) */
+	if (con->v1.out_msg_done) {
+		con->v1.out_skip += con_out_kvec_skip(con);
+	} else {
+		WARN_ON(!msg->data_length);
+		con->v1.out_skip += sizeof_footer(con);
+	}
+	/* data, middle, front */
+	if (msg->data_length)
+		con->v1.out_skip += msg->cursor.total_resid;	/* data not yet sent */
+	if (msg->middle)
+		con->v1.out_skip += con_out_kvec_skip(con);
+	con->v1.out_skip += con_out_kvec_skip(con);
+
+	dout("%s con %p out_kvec_bytes %d out_skip %d\n", __func__, con,
+	     con->v1.out_kvec_bytes, con->v1.out_skip);
+}
+
+void ceph_con_v1_revoke_incoming(struct ceph_connection *con)
+{
+	unsigned int front_len = le32_to_cpu(con->v1.in_hdr.front_len);
+	unsigned int middle_len = le32_to_cpu(con->v1.in_hdr.middle_len);
+	unsigned int data_len = le32_to_cpu(con->v1.in_hdr.data_len);
+
+	/* skip rest of message; footer size must match what the reader consumes */
+	con->v1.in_base_pos = con->v1.in_base_pos -
+				sizeof(struct ceph_msg_header) -
+				front_len -
+				middle_len -
+				data_len -
+				sizeof_footer(con);
+
+	con->v1.in_tag = CEPH_MSGR_TAG_READY;	/* a negative in_base_pos is drained first */
+	con->in_seq++;
+
+	dout("%s con %p in_base_pos %d\n", __func__, con, con->v1.in_base_pos);
+}
+
+bool ceph_con_v1_opened(struct ceph_connection *con)
+{
+	return con->v1.connect_seq != 0;	/* non-zero connect_seq counts as opened */
+}
+
+void ceph_con_v1_reset_session(struct ceph_connection *con)
+{
+	con->v1.peer_global_seq = 0;
+	con->v1.connect_seq = 0;	/* ceph_con_v1_opened() reports false again */
+}
+
+void ceph_con_v1_reset_protocol(struct ceph_connection *con)
+{
+	con->v1.out_skip = 0;	/* forget any zero-fill still owed to the socket */
+}
diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
new file mode 100644
index 000000000000..833e57849c1d
--- /dev/null
+++ b/net/ceph/messenger_v2.c
@@ -0,0 +1,3804 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Ceph msgr2 protocol implementation
+ *
+ * Copyright (C) 2020 Ilya Dryomov <idryomov@gmail.com>
+ */
+
+#include <linux/ceph/ceph_debug.h>
+
+#include <crypto/aead.h>
+#include <crypto/hash.h>
+#include <crypto/sha2.h>
+#include <crypto/utils.h>
+#include <linux/bvec.h>
+#include <linux/crc32c.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/socket.h>
+#include <linux/sched/mm.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#include <linux/ceph/ceph_features.h>
+#include <linux/ceph/decode.h>
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/messenger.h>
+
+#include "crypto.h" /* for CEPH_KEY_LEN and CEPH_MAX_CON_SECRET_LEN */
+
+/*
+ * Frame tag values.
+ * NOTE(review): presumably these are what ends up in fd_tag of
+ * struct ceph_frame_desc -- the users are outside this view, confirm.
+ */
+#define FRAME_TAG_HELLO 1
+#define FRAME_TAG_AUTH_REQUEST 2
+#define FRAME_TAG_AUTH_BAD_METHOD 3
+#define FRAME_TAG_AUTH_REPLY_MORE 4
+#define FRAME_TAG_AUTH_REQUEST_MORE 5
+#define FRAME_TAG_AUTH_DONE 6
+#define FRAME_TAG_AUTH_SIGNATURE 7
+#define FRAME_TAG_CLIENT_IDENT 8
+#define FRAME_TAG_SERVER_IDENT 9
+#define FRAME_TAG_IDENT_MISSING_FEATURES 10
+#define FRAME_TAG_SESSION_RECONNECT 11
+#define FRAME_TAG_SESSION_RESET 12
+#define FRAME_TAG_SESSION_RETRY 13
+#define FRAME_TAG_SESSION_RETRY_GLOBAL 14
+#define FRAME_TAG_SESSION_RECONNECT_OK 15
+#define FRAME_TAG_WAIT 16
+#define FRAME_TAG_MESSAGE 17
+#define FRAME_TAG_KEEPALIVE2 18
+#define FRAME_TAG_KEEPALIVE2_ACK 19
+#define FRAME_TAG_ACK 20
+
+/*
+ * Epilogue late_status values.  decode_epilogue() masks late_status with
+ * FRAME_LATE_STATUS_ABORTED_MASK and accepts only FRAME_LATE_STATUS_COMPLETE.
+ */
+#define FRAME_LATE_STATUS_ABORTED 0x1
+#define FRAME_LATE_STATUS_COMPLETE 0xe
+#define FRAME_LATE_STATUS_ABORTED_MASK 0xf
+
+/* NOTE(review): look like input-side state values; users are outside this view */
+#define IN_S_HANDLE_PREAMBLE 1
+#define IN_S_HANDLE_CONTROL 2
+#define IN_S_HANDLE_CONTROL_REMAINDER 3
+#define IN_S_PREPARE_READ_DATA 4
+#define IN_S_PREPARE_READ_DATA_CONT 5
+#define IN_S_PREPARE_READ_ENC_PAGE 6
+#define IN_S_PREPARE_SPARSE_DATA 7
+#define IN_S_PREPARE_SPARSE_DATA_CONT 8
+#define IN_S_HANDLE_EPILOGUE 9
+#define IN_S_FINISH_SKIP 10
+
+/* NOTE(review): look like output-side state values; users are outside this view */
+#define OUT_S_QUEUE_DATA 1
+#define OUT_S_QUEUE_DATA_CONT 2
+#define OUT_S_QUEUE_ENC_PAGE 3
+#define OUT_S_QUEUE_ZEROS 4
+#define OUT_S_FINISH_MESSAGE 5
+#define OUT_S_GET_NEXT 6
+
+/*
+ * Offsets into a buffer @p: the control body starts CEPH_PREAMBLE_LEN bytes
+ * in; the front, middle and data pad areas start CEPH_EPILOGUE_SECURE_LEN
+ * bytes in and are CEPH_GCM_BLOCK_LEN bytes apart, in that order.
+ */
+#define CTRL_BODY(p) ((void *)(p) + CEPH_PREAMBLE_LEN)
+#define FRONT_PAD(p) ((void *)(p) + CEPH_EPILOGUE_SECURE_LEN)
+#define MIDDLE_PAD(p) (FRONT_PAD(p) + CEPH_GCM_BLOCK_LEN)
+#define DATA_PAD(p) (MIDDLE_PAD(p) + CEPH_GCM_BLOCK_LEN)
+
+/* msg_flags used by the send/receive helpers below */
+#define CEPH_MSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL)
+
+/*
+ * Receive into @it until it is drained or the socket has nothing more.
+ *
+ * Return: 1 once @it is empty, 0 if sock_recvmsg() returned -EAGAIN,
+ * otherwise the <= 0 value returned by sock_recvmsg().
+ */
+static int do_recvmsg(struct socket *sock, struct iov_iter *it)
+{
+ struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+ int ret;
+
+ /* msg works on its own copy of the iterator ... */
+ msg.msg_iter = *it;
+ while (iov_iter_count(it)) {
+ ret = sock_recvmsg(sock, &msg, msg.msg_flags);
+ if (ret <= 0) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+ }
+
+ /* ... so the caller's iterator is advanced by hand */
+ iov_iter_advance(it, ret);
+ }
+
+ WARN_ON(msg_data_left(&msg));
+ return 1;
+}
+
+/*
+ * Read as much as possible into con->v2.in_iter.
+ *
+ * Return:
+ *   1 - done, nothing (else) to read
+ *   0 - socket is empty, need to wait
+ *  <0 - error
+ */
+static int ceph_tcp_recv(struct ceph_connection *con)
+{
+	struct iov_iter *in = &con->v2.in_iter;
+	int ret;
+
+	dout("%s con %p %s %zu\n", __func__, con,
+	     iov_iter_is_discard(in) ? "discard" : "need",
+	     iov_iter_count(in));
+
+	ret = do_recvmsg(con->sock, in);
+
+	dout("%s con %p ret %d left %zu\n", __func__, con, ret,
+	     iov_iter_count(in));
+	return ret;
+}
+
+/*
+ * Send from @it until it is drained or the socket stops accepting data.
+ *
+ * Return: 1 once @it is empty, 0 if sock_sendmsg() returned -EAGAIN,
+ * otherwise the <= 0 value returned by sock_sendmsg().
+ */
+static int do_sendmsg(struct socket *sock, struct iov_iter *it)
+{
+ struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+ int ret;
+
+ /* msg works on its own copy of the iterator ... */
+ msg.msg_iter = *it;
+ while (iov_iter_count(it)) {
+ ret = sock_sendmsg(sock, &msg);
+ if (ret <= 0) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+ }
+
+ /* ... so the caller's iterator is advanced by hand */
+ iov_iter_advance(it, ret);
+ }
+
+ WARN_ON(msg_data_left(&msg));
+ return 1;
+}
+
+/*
+ * Send the bvec-backed iterator @it one chunk at a time, setting
+ * MSG_SPLICE_PAGES for a chunk only when sendpage_ok() accepts its page.
+ *
+ * Each chunk is the rest of the current bio_vec, capped at what is left
+ * in @it.  @it is advanced by the number of bytes actually sent.
+ *
+ * Return: 1 once @it is empty, 0 if sock_sendmsg() returned -EAGAIN,
+ * -EINVAL if @it is not a bvec iterator, otherwise the <= 0 value returned
+ * by sock_sendmsg().
+ */
+static int do_try_sendpage(struct socket *sock, struct iov_iter *it)
+{
+ struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS };
+ struct bio_vec bv;
+ int ret;
+
+ if (WARN_ON(!iov_iter_is_bvec(it)))
+ return -EINVAL;
+
+ while (iov_iter_count(it)) {
+ /* iov_iter_iovec() for ITER_BVEC */
+ bvec_set_page(&bv, it->bvec->bv_page,
+ min(iov_iter_count(it),
+ it->bvec->bv_len - it->iov_offset),
+ it->bvec->bv_offset + it->iov_offset);
+
+ /*
+ * MSG_SPLICE_PAGES cannot properly handle pages with
+ * page_count == 0, we need to fall back to sendmsg if
+ * that's the case.
+ *
+ * Same goes for slab pages: skb_can_coalesce() allows
+ * coalescing neighboring slab objects into a single frag
+ * which triggers one of hardened usercopy checks.
+ */
+ if (sendpage_ok(bv.bv_page))
+ msg.msg_flags |= MSG_SPLICE_PAGES;
+ else
+ msg.msg_flags &= ~MSG_SPLICE_PAGES;
+
+ /* a fresh single-segment iterator over just this chunk */
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, bv.bv_len);
+ ret = sock_sendmsg(sock, &msg);
+ if (ret <= 0) {
+ if (ret == -EAGAIN)
+ ret = 0;
+ return ret;
+ }
+
+ iov_iter_advance(it, ret);
+ }
+
+ return 1;
+}
+
+/*
+ * Write as much of con->v2.out_iter as possible.  The socket is expected
+ * to be corked, so we don't bother with MSG_MORE here.
+ *
+ * Return:
+ *   1 - done, nothing (else) to write
+ *   0 - socket is full, need to wait
+ *  <0 - error
+ */
+static int ceph_tcp_send(struct ceph_connection *con)
+{
+	struct iov_iter *out = &con->v2.out_iter;
+	int ret;
+
+	dout("%s con %p have %zu try_sendpage %d\n", __func__, con,
+	     iov_iter_count(out), con->v2.out_iter_sendpage);
+
+	/* out_iter_sendpage selects the page-based path */
+	ret = con->v2.out_iter_sendpage ? do_try_sendpage(con->sock, out) :
+					  do_sendmsg(con->sock, out);
+
+	dout("%s con %p ret %d left %zu\n", __func__, con, ret,
+	     iov_iter_count(out));
+	return ret;
+}
+
+/*
+ * Append @buf/@len as the next entry of con->v2.in_kvecs and extend
+ * con->v2.in_iter by one segment of @len bytes.
+ */
+static void add_in_kvec(struct ceph_connection *con, void *buf, int len)
+{
+	int slot;
+
+	BUG_ON(con->v2.in_kvec_cnt >= ARRAY_SIZE(con->v2.in_kvecs));
+	WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
+
+	slot = con->v2.in_kvec_cnt++;
+	con->v2.in_kvecs[slot].iov_len = len;
+	con->v2.in_kvecs[slot].iov_base = buf;
+
+	con->v2.in_iter.count += len;
+	con->v2.in_iter.nr_segs++;
+}
+
+/*
+ * Start over with an empty kvec-backed in_iter.  The iterator is expected
+ * to be fully consumed at this point (WARN_ON otherwise).
+ */
+static void reset_in_kvecs(struct ceph_connection *con)
+{
+	struct iov_iter *in = &con->v2.in_iter;
+
+	WARN_ON(iov_iter_count(in));
+
+	con->v2.in_kvec_cnt = 0;
+	iov_iter_kvec(in, ITER_DEST, con->v2.in_kvecs, 0, 0);
+}
+
+/*
+ * Copy *@bv into con->v2.in_bvec and point in_iter at that single bio_vec.
+ * The iterator is expected to be fully consumed (WARN_ON otherwise).
+ */
+static void set_in_bvec(struct ceph_connection *con, const struct bio_vec *bv)
+{
+	struct iov_iter *in = &con->v2.in_iter;
+
+	WARN_ON(iov_iter_count(in));
+
+	con->v2.in_bvec = *bv;
+	iov_iter_bvec(in, ITER_DEST, &con->v2.in_bvec, 1, bv->bv_len);
+}
+
+/*
+ * Turn in_iter into a discard iterator of @len bytes.  The iterator is
+ * expected to be fully consumed (WARN_ON otherwise).
+ */
+static void set_in_skip(struct ceph_connection *con, int len)
+{
+	struct iov_iter *in = &con->v2.in_iter;
+
+	WARN_ON(iov_iter_count(in));
+
+	dout("%s con %p len %d\n", __func__, con, len);
+	iov_iter_discard(in, ITER_DEST, len);
+}
+
+/*
+ * Append @buf/@len as the next entry of con->v2.out_kvecs and extend
+ * con->v2.out_iter by one segment of @len bytes.  Not expected to be
+ * called while out_zero is non-zero (WARN_ON otherwise).
+ */
+static void add_out_kvec(struct ceph_connection *con, void *buf, int len)
+{
+	int slot;
+
+	BUG_ON(con->v2.out_kvec_cnt >= ARRAY_SIZE(con->v2.out_kvecs));
+	WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
+	WARN_ON(con->v2.out_zero);
+
+	slot = con->v2.out_kvec_cnt++;
+	con->v2.out_kvecs[slot].iov_len = len;
+	con->v2.out_kvecs[slot].iov_base = buf;
+
+	con->v2.out_iter.count += len;
+	con->v2.out_iter.nr_segs++;
+}
+
+/*
+ * Start over with an empty kvec-backed out_iter and clear
+ * out_iter_sendpage.  The iterator is expected to be fully consumed and
+ * out_zero to be zero (WARN_ON otherwise).
+ */
+static void reset_out_kvecs(struct ceph_connection *con)
+{
+	struct iov_iter *out = &con->v2.out_iter;
+
+	WARN_ON(iov_iter_count(out));
+	WARN_ON(con->v2.out_zero);
+
+	con->v2.out_iter_sendpage = false;
+	con->v2.out_kvec_cnt = 0;
+	iov_iter_kvec(out, ITER_SOURCE, con->v2.out_kvecs, 0, 0);
+}
+
+/*
+ * Copy *@bv into con->v2.out_bvec, point out_iter at that single bio_vec
+ * and record @zerocopy in out_iter_sendpage.  The iterator is expected to
+ * be fully consumed and out_zero to be zero (WARN_ON otherwise).
+ */
+static void set_out_bvec(struct ceph_connection *con, const struct bio_vec *bv,
+			 bool zerocopy)
+{
+	struct iov_iter *out = &con->v2.out_iter;
+
+	WARN_ON(iov_iter_count(out));
+	WARN_ON(con->v2.out_zero);
+
+	con->v2.out_iter_sendpage = zerocopy;
+	con->v2.out_bvec = *bv;
+	iov_iter_bvec(out, ITER_SOURCE, &con->v2.out_bvec, 1,
+		      con->v2.out_bvec.bv_len);
+}
+
+/*
+ * Point out_iter at ceph_zero_page for up to one page worth of the
+ * pending out_zero bytes, with out_iter_sendpage set.  The iterator is
+ * expected to be fully consumed and out_zero to be non-zero (WARN_ON
+ * otherwise).
+ */
+static void set_out_bvec_zero(struct ceph_connection *con)
+{
+	struct iov_iter *out = &con->v2.out_iter;
+
+	WARN_ON(iov_iter_count(out));
+	WARN_ON(!con->v2.out_zero);
+
+	bvec_set_page(&con->v2.out_bvec, ceph_zero_page,
+		      min(con->v2.out_zero, (int)PAGE_SIZE), 0);
+	con->v2.out_iter_sendpage = true;
+	iov_iter_bvec(out, ITER_SOURCE, &con->v2.out_bvec, 1,
+		      con->v2.out_bvec.bv_len);
+}
+
+/* Add @len to the con->v2.out_zero byte counter. */
+static void out_zero_add(struct ceph_connection *con, int len)
+{
+	dout("%s con %p len %d\n", __func__, con, len);
+
+	con->v2.out_zero = con->v2.out_zero + len;
+}
+
+/*
+ * Allocate a @len byte buffer with kvmalloc(GFP_NOIO) and record it in
+ * con->v2.conn_bufs so that free_conn_bufs() releases it later.
+ *
+ * Return: the buffer, or NULL if conn_bufs is full (WARN_ON) or the
+ * allocation failed.
+ */
+static void *alloc_conn_buf(struct ceph_connection *con, int len)
+{
+	void *buf = NULL;
+
+	dout("%s con %p len %d\n", __func__, con, len);
+
+	if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs)))
+		return NULL;
+
+	buf = kvmalloc(len, GFP_NOIO);
+	if (buf)
+		con->v2.conn_bufs[con->v2.conn_buf_cnt++] = buf;
+
+	return buf;
+}
+
+/* kvfree() every recorded connection buffer, last one first. */
+static void free_conn_bufs(struct ceph_connection *con)
+{
+	while (con->v2.conn_buf_cnt) {
+		con->v2.conn_buf_cnt--;
+		kvfree(con->v2.conn_bufs[con->v2.conn_buf_cnt]);
+	}
+}
+
+/* Append @buf/@len as the next entry of con->v2.in_sign_kvecs. */
+static void add_in_sign_kvec(struct ceph_connection *con, void *buf, int len)
+{
+	int slot;
+
+	BUG_ON(con->v2.in_sign_kvec_cnt >= ARRAY_SIZE(con->v2.in_sign_kvecs));
+
+	slot = con->v2.in_sign_kvec_cnt++;
+	con->v2.in_sign_kvecs[slot].iov_len = len;
+	con->v2.in_sign_kvecs[slot].iov_base = buf;
+}
+
+/* Drop all entries of con->v2.in_sign_kvecs by zeroing the count. */
+static void clear_in_sign_kvecs(struct ceph_connection *con)
+{
+	con->v2.in_sign_kvec_cnt = 0;
+}
+
+/* Append @buf/@len as the next entry of con->v2.out_sign_kvecs. */
+static void add_out_sign_kvec(struct ceph_connection *con, void *buf, int len)
+{
+	int slot;
+
+	BUG_ON(con->v2.out_sign_kvec_cnt >= ARRAY_SIZE(con->v2.out_sign_kvecs));
+
+	slot = con->v2.out_sign_kvec_cnt++;
+	con->v2.out_sign_kvecs[slot].iov_len = len;
+	con->v2.out_sign_kvecs[slot].iov_base = buf;
+}
+
+/* Drop all entries of con->v2.out_sign_kvecs by zeroing the count. */
+static void clear_out_sign_kvecs(struct ceph_connection *con)
+{
+	con->v2.out_sign_kvec_cnt = 0;
+}
+
+/* Return true iff the connection mode is CEPH_CON_MODE_SECURE. */
+static bool con_secure(struct ceph_connection *con)
+{
+	if (con->v2.con_mode == CEPH_CON_MODE_SECURE)
+		return true;
+
+	return false;
+}
+
+/* Front length from @msg's header, converted from little-endian. */
+static int front_len(const struct ceph_msg *msg)
+{
+	int len = le32_to_cpu(msg->hdr.front_len);
+
+	return len;
+}
+
+/* Middle length from @msg's header, converted from little-endian. */
+static int middle_len(const struct ceph_msg *msg)
+{
+	int len = le32_to_cpu(msg->hdr.middle_len);
+
+	return len;
+}
+
+/* Data length from @msg's header, converted from little-endian. */
+static int data_len(const struct ceph_msg *msg)
+{
+	int len = le32_to_cpu(msg->hdr.data_len);
+
+	return len;
+}
+
+/* Return true iff @len is not a multiple of CEPH_GCM_BLOCK_LEN. */
+static bool need_padding(int len)
+{
+	if (IS_ALIGNED(len, CEPH_GCM_BLOCK_LEN))
+		return false;
+
+	return true;
+}
+
+/* @len rounded up to the next multiple of CEPH_GCM_BLOCK_LEN. */
+static int padded_len(int len)
+{
+	int aligned = ALIGN(len, CEPH_GCM_BLOCK_LEN);
+
+	return aligned;
+}
+
+/* Number of bytes needed to bring @len up to padded_len(@len). */
+static int padding_len(int len)
+{
+	int padded = padded_len(len);
+
+	return padded - len;
+}
+
+/*
+ * On-wire length of the frame head: preamble + control segment.
+ *
+ * Plain: the plain preamble, plus the control body and its crc when
+ * @ctrl_len is non-zero.  Secure: the secure preamble, plus the padded
+ * control remainder and its auth tag when @ctrl_len exceeds the inline
+ * capacity.  BUGs if @ctrl_len is out of range.
+ */
+static int head_onwire_len(int ctrl_len, bool secure)
+{
+	int rem_len;
+
+	BUG_ON(ctrl_len < 0 || ctrl_len > CEPH_MSG_MAX_CONTROL_LEN);
+
+	if (!secure) {
+		if (!ctrl_len)
+			return CEPH_PREAMBLE_PLAIN_LEN;
+
+		return CEPH_PREAMBLE_PLAIN_LEN + ctrl_len + CEPH_CRC_LEN;
+	}
+
+	/* everything fits into the inline portion */
+	if (ctrl_len <= CEPH_PREAMBLE_INLINE_LEN)
+		return CEPH_PREAMBLE_SECURE_LEN;
+
+	rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+	return CEPH_PREAMBLE_SECURE_LEN + padded_len(rem_len) +
+	       CEPH_GCM_TAG_LEN;
+}
+
+/*
+ * On-wire length of the frame tail: front, middle and data segments +
+ * epilogue.  Zero when all three segments are empty.  In secure mode each
+ * segment is counted at its padded length.  BUGs if any length is out of
+ * range.
+ */
+static int __tail_onwire_len(int front_len, int middle_len, int data_len,
+			     bool secure)
+{
+	BUG_ON(front_len < 0 || front_len > CEPH_MSG_MAX_FRONT_LEN ||
+	       middle_len < 0 || middle_len > CEPH_MSG_MAX_MIDDLE_LEN ||
+	       data_len < 0 || data_len > CEPH_MSG_MAX_DATA_LEN);
+
+	if (!(front_len || middle_len || data_len))
+		return 0;
+
+	if (secure)
+		return padded_len(front_len) + padded_len(middle_len) +
+		       padded_len(data_len) + CEPH_EPILOGUE_SECURE_LEN;
+
+	return front_len + middle_len + data_len + CEPH_EPILOGUE_PLAIN_LEN;
+}
+
+/* __tail_onwire_len() for the segment lengths found in @msg's header. */
+static int tail_onwire_len(const struct ceph_msg *msg, bool secure)
+{
+	int front = front_len(msg);
+	int middle = middle_len(msg);
+	int data = data_len(msg);
+
+	return __tail_onwire_len(front, middle, data, secure);
+}
+
+/*
+ * head_onwire_len(sizeof(struct ceph_msg_header2), false)
+ *
+ * I.e. the plain preamble, a control segment holding struct
+ * ceph_msg_header2, and the control crc.
+ */
+#define MESSAGE_HEAD_PLAIN_LEN (CEPH_PREAMBLE_PLAIN_LEN + \
+ sizeof(struct ceph_msg_header2) + \
+ CEPH_CRC_LEN)
+
+/*
+ * Alignment value for each segment position; init_frame_desc() copies
+ * entry i into fd_aligns[i].
+ */
+static const int frame_aligns[] = {
+ sizeof(void *),
+ sizeof(void *),
+ sizeof(void *),
+ PAGE_SIZE
+};
+
+/*
+ * Number of segments once trailing empty ones are dropped.
+ * A frame always has at least one (possibly empty) segment, so the
+ * result is never less than 1.
+ */
+static int calc_segment_count(const int *lens, int len_cnt)
+{
+	int cnt = len_cnt;
+
+	/* walk back over trailing zero lengths */
+	while (cnt > 0 && !lens[cnt - 1])
+		cnt--;
+
+	return cnt > 0 ? cnt : 1;
+}
+
+/*
+ * Zero @desc and fill in @tag, the segment count derived from @lens and,
+ * for each counted segment, its length and alignment.  BUGs if the
+ * segment count exceeds CEPH_FRAME_MAX_SEGMENT_COUNT.
+ */
+static void init_frame_desc(struct ceph_frame_desc *desc, int tag,
+			    const int *lens, int len_cnt)
+{
+	int seg = 0;
+
+	memset(desc, 0, sizeof(*desc));
+
+	desc->fd_tag = tag;
+	desc->fd_seg_cnt = calc_segment_count(lens, len_cnt);
+	BUG_ON(desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT);
+
+	while (seg < desc->fd_seg_cnt) {
+		desc->fd_aligns[seg] = frame_aligns[seg];
+		desc->fd_lens[seg] = lens[seg];
+		seg++;
+	}
+}
+
+/*
+ * Encode @desc as a preamble at @p: tag, segment count, then a
+ * (length, alignment) pair per segment, zero fill, and a trailing crc.
+ *
+ * Preamble crc covers everything up to itself (28 bytes) and
+ * is calculated and verified irrespective of the connection mode
+ * (i.e. even if the frame is encrypted).
+ */
+static void encode_preamble(const struct ceph_frame_desc *desc, void *p)
+{
+	void *const base = p;
+	void *const crc_pos = base + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN;
+	int seg;
+
+	memset(base, 0, CEPH_PREAMBLE_LEN);
+
+	ceph_encode_8(&p, desc->fd_tag);
+	ceph_encode_8(&p, desc->fd_seg_cnt);
+	for (seg = 0; seg < desc->fd_seg_cnt; seg++) {
+		ceph_encode_32(&p, desc->fd_lens[seg]);
+		ceph_encode_16(&p, desc->fd_aligns[seg]);
+	}
+
+	put_unaligned_le32(crc32c(0, base, crc_pos - base), crc_pos);
+}
+
+/*
+ * Verify the preamble crc and decode the preamble at @p into @desc.
+ *
+ * Return: 0 on success, -EBADMSG on crc mismatch, -EINVAL if the segment
+ * count or any segment length is out of range or if the last segment is
+ * empty.
+ */
+static int decode_preamble(void *p, struct ceph_frame_desc *desc)
+{
+ void *crcp = p + CEPH_PREAMBLE_LEN - CEPH_CRC_LEN;
+ u32 crc, expected_crc;
+ int i;
+
+ /* crc covers everything in front of the crc field itself */
+ crc = crc32c(0, p, crcp - p);
+ expected_crc = get_unaligned_le32(crcp);
+ if (crc != expected_crc) {
+ pr_err("bad preamble crc, calculated %u, expected %u\n",
+ crc, expected_crc);
+ return -EBADMSG;
+ }
+
+ memset(desc, 0, sizeof(*desc));
+
+ desc->fd_tag = ceph_decode_8(&p);
+ desc->fd_seg_cnt = ceph_decode_8(&p);
+ if (desc->fd_seg_cnt < 1 ||
+ desc->fd_seg_cnt > CEPH_FRAME_MAX_SEGMENT_COUNT) {
+ pr_err("bad segment count %d\n", desc->fd_seg_cnt);
+ return -EINVAL;
+ }
+ for (i = 0; i < desc->fd_seg_cnt; i++) {
+ desc->fd_lens[i] = ceph_decode_32(&p);
+ desc->fd_aligns[i] = ceph_decode_16(&p);
+ }
+
+ /*
+ * All four lengths are checked regardless of fd_seg_cnt; entries
+ * that were not decoded are still 0 from the memset above.
+ */
+ if (desc->fd_lens[0] < 0 ||
+ desc->fd_lens[0] > CEPH_MSG_MAX_CONTROL_LEN) {
+ pr_err("bad control segment length %d\n", desc->fd_lens[0]);
+ return -EINVAL;
+ }
+ if (desc->fd_lens[1] < 0 ||
+ desc->fd_lens[1] > CEPH_MSG_MAX_FRONT_LEN) {
+ pr_err("bad front segment length %d\n", desc->fd_lens[1]);
+ return -EINVAL;
+ }
+ if (desc->fd_lens[2] < 0 ||
+ desc->fd_lens[2] > CEPH_MSG_MAX_MIDDLE_LEN) {
+ pr_err("bad middle segment length %d\n", desc->fd_lens[2]);
+ return -EINVAL;
+ }
+ if (desc->fd_lens[3] < 0 ||
+ desc->fd_lens[3] > CEPH_MSG_MAX_DATA_LEN) {
+ pr_err("bad data segment length %d\n", desc->fd_lens[3]);
+ return -EINVAL;
+ }
+
+ /*
+ * This would fire for FRAME_TAG_WAIT (it has one empty
+ * segment), but we should never get it as client.
+ */
+ if (!desc->fd_lens[desc->fd_seg_cnt - 1]) {
+ pr_err("last segment empty, segment count %d\n",
+ desc->fd_seg_cnt);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Finish the plain epilogue in con->v2.out_epil: set late_status from
+ * @aborted and convert the three crc fields to little-endian in place.
+ */
+static void encode_epilogue_plain(struct ceph_connection *con, bool aborted)
+{
+	if (aborted)
+		con->v2.out_epil.late_status = FRAME_LATE_STATUS_ABORTED;
+	else
+		con->v2.out_epil.late_status = FRAME_LATE_STATUS_COMPLETE;
+
+	cpu_to_le32s(&con->v2.out_epil.front_crc);
+	cpu_to_le32s(&con->v2.out_epil.middle_crc);
+	cpu_to_le32s(&con->v2.out_epil.data_crc);
+}
+
+/*
+ * Build the secure epilogue in con->v2.out_epil: everything zeroed
+ * except late_status, which is set from @aborted.
+ */
+static void encode_epilogue_secure(struct ceph_connection *con, bool aborted)
+{
+	memset(&con->v2.out_epil, 0, sizeof(con->v2.out_epil));
+
+	if (aborted)
+		con->v2.out_epil.late_status = FRAME_LATE_STATUS_ABORTED;
+	else
+		con->v2.out_epil.late_status = FRAME_LATE_STATUS_COMPLETE;
+}
+
+/*
+ * Decode the epilogue at @p.  The three crcs are decoded only when all
+ * of @front_crc, @middle_crc and @data_crc are non-NULL.
+ *
+ * Return: 0 on success, -EINVAL if late_status is not "complete".
+ */
+static int decode_epilogue(void *p, u32 *front_crc, u32 *middle_crc,
+			   u32 *data_crc)
+{
+	u8 late_status = ceph_decode_8(&p);
+
+	if ((late_status & FRAME_LATE_STATUS_ABORTED_MASK) !=
+	    FRAME_LATE_STATUS_COMPLETE) {
+		/* we should never get an aborted message as client */
+		pr_err("bad late_status 0x%x\n", late_status);
+		return -EINVAL;
+	}
+
+	if (!front_crc || !middle_crc || !data_crc)
+		return 0;
+
+	*front_crc = ceph_decode_32(&p);
+	*middle_crc = ceph_decode_32(&p);
+	*data_crc = ceph_decode_32(&p);
+	return 0;
+}
+
+/*
+ * Populate @hdr from @hdr2, the given segment lengths and @peer_name.
+ * reserved and crc are set to 0.
+ */
+static void fill_header(struct ceph_msg_header *hdr,
+			const struct ceph_msg_header2 *hdr2,
+			int front_len, int middle_len, int data_len,
+			const struct ceph_entity_name *peer_name)
+{
+	/* fields carried over verbatim from hdr2 */
+	hdr->seq = hdr2->seq;
+	hdr->tid = hdr2->tid;
+	hdr->type = hdr2->type;
+	hdr->priority = hdr2->priority;
+	hdr->version = hdr2->version;
+	hdr->compat_version = hdr2->compat_version;
+	hdr->data_off = hdr2->data_off;
+
+	/* segment lengths come from the caller */
+	hdr->front_len = cpu_to_le32(front_len);
+	hdr->middle_len = cpu_to_le32(middle_len);
+	hdr->data_len = cpu_to_le32(data_len);
+
+	hdr->src = *peer_name;
+
+	hdr->reserved = 0;
+	hdr->crc = 0;
+}
+
+/*
+ * Populate @hdr2 from @hdr and @ack_seq.  data_pre_padding_len, flags
+ * and reserved are set to 0.
+ */
+static void fill_header2(struct ceph_msg_header2 *hdr2,
+			 const struct ceph_msg_header *hdr, u64 ack_seq)
+{
+	/* fields carried over verbatim from hdr */
+	hdr2->seq = hdr->seq;
+	hdr2->tid = hdr->tid;
+	hdr2->type = hdr->type;
+	hdr2->priority = hdr->priority;
+	hdr2->version = hdr->version;
+	hdr2->compat_version = hdr->compat_version;
+	hdr2->data_off = hdr->data_off;
+
+	hdr2->ack_seq = cpu_to_le64(ack_seq);
+
+	hdr2->data_pre_padding_len = 0;
+	hdr2->flags = 0;
+	hdr2->reserved = 0;
+}
+
+/*
+ * Check the crc of the control segment: in_kvecs[0] holds the control
+ * body and in_kvecs[1] the received crc.
+ *
+ * Return: 0 if the crc matches, -EBADMSG otherwise.
+ */
+static int verify_control_crc(struct ceph_connection *con)
+{
+	int ctrl_len = con->v2.in_desc.fd_lens[0];
+	u32 expected_crc;
+	u32 crc;
+
+	WARN_ON(con->v2.in_kvecs[0].iov_len != ctrl_len);
+	WARN_ON(con->v2.in_kvecs[1].iov_len != CEPH_CRC_LEN);
+
+	crc = crc32c(-1, con->v2.in_kvecs[0].iov_base, ctrl_len);
+	expected_crc = get_unaligned_le32(con->v2.in_kvecs[1].iov_base);
+	if (crc == expected_crc)
+		return 0;
+
+	pr_err("bad control crc, calculated %u, expected %u\n",
+	       crc, expected_crc);
+	return -EBADMSG;
+}
+
+/*
+ * Compute the front and middle crcs of con->in_msg and compare them, and
+ * con->in_data_crc, with the values received in the epilogue.
+ *
+ * The only thing done to in_data_crc here is resetting it to 0 when the
+ * message has no data.
+ * NOTE(review): for messages with data, in_data_crc is presumably
+ * accumulated while the data is being read -- confirm.
+ *
+ * Return: 0 if all three match, -EBADMSG on the first mismatch.
+ */
+static int verify_epilogue_crcs(struct ceph_connection *con, u32 front_crc,
+ u32 middle_crc, u32 data_crc)
+{
+ if (front_len(con->in_msg)) {
+ con->in_front_crc = crc32c(-1, con->in_msg->front.iov_base,
+ front_len(con->in_msg));
+ } else {
+ /* an empty front is only expected if a later segment is not */
+ WARN_ON(!middle_len(con->in_msg) && !data_len(con->in_msg));
+ con->in_front_crc = -1;
+ }
+
+ /* empty middle: -1 if a data segment follows, 0 if it is last */
+ if (middle_len(con->in_msg))
+ con->in_middle_crc = crc32c(-1,
+ con->in_msg->middle->vec.iov_base,
+ middle_len(con->in_msg));
+ else if (data_len(con->in_msg))
+ con->in_middle_crc = -1;
+ else
+ con->in_middle_crc = 0;
+
+ if (!data_len(con->in_msg))
+ con->in_data_crc = 0;
+
+ dout("%s con %p msg %p crcs %u %u %u\n", __func__, con, con->in_msg,
+ con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+
+ if (con->in_front_crc != front_crc) {
+ pr_err("bad front crc, calculated %u, expected %u\n",
+ con->in_front_crc, front_crc);
+ return -EBADMSG;
+ }
+ if (con->in_middle_crc != middle_crc) {
+ pr_err("bad middle crc, calculated %u, expected %u\n",
+ con->in_middle_crc, middle_crc);
+ return -EBADMSG;
+ }
+ if (con->in_data_crc != data_crc) {
+ pr_err("bad data crc, calculated %u, expected %u\n",
+ con->in_data_crc, data_crc);
+ return -EBADMSG;
+ }
+
+ return 0;
+}
+
+/*
+ * Set up the HMAC key and, for secure mode, the AES-GCM transform,
+ * request and nonces from the negotiated secrets.
+ *
+ * Three outcomes return 0:
+ *   - no session key: nothing is set up (auth_none)
+ *   - session key, crc mode: only the HMAC key is prepared
+ *   - session key, secure mode: HMAC key plus GCM state; @con_secret is
+ *     consumed as key, then in nonce, then out nonce
+ *
+ * On failure a negative error is returned.  Apart from clearing gcm_tfm
+ * when its allocation fails, nothing set up so far is released here.
+ * NOTE(review): presumably the connection teardown path frees it --
+ * confirm.
+ */
+static int setup_crypto(struct ceph_connection *con,
+ const u8 *session_key, int session_key_len,
+ const u8 *con_secret, int con_secret_len)
+{
+ unsigned int noio_flag;
+ int ret;
+
+ dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n",
+ __func__, con, con->v2.con_mode, session_key_len, con_secret_len);
+ WARN_ON(con->v2.hmac_key_set || con->v2.gcm_tfm || con->v2.gcm_req);
+
+ if (con->v2.con_mode != CEPH_CON_MODE_CRC &&
+ con->v2.con_mode != CEPH_CON_MODE_SECURE) {
+ pr_err("bad con_mode %d\n", con->v2.con_mode);
+ return -EINVAL;
+ }
+
+ if (!session_key_len) {
+ WARN_ON(con->v2.con_mode != CEPH_CON_MODE_CRC);
+ WARN_ON(con_secret_len);
+ return 0; /* auth_none */
+ }
+
+ hmac_sha256_preparekey(&con->v2.hmac_key, session_key, session_key_len);
+ con->v2.hmac_key_set = true;
+
+ if (con->v2.con_mode == CEPH_CON_MODE_CRC) {
+ WARN_ON(con_secret_len);
+ return 0; /* auth_x, plain mode */
+ }
+
+ /* need room for the key and both nonces */
+ if (con_secret_len < CEPH_GCM_KEY_LEN + 2 * CEPH_GCM_IV_LEN) {
+ pr_err("con_secret too small %d\n", con_secret_len);
+ return -EINVAL;
+ }
+
+ /* the tfm allocation is done under memalloc_noio_save() */
+ noio_flag = memalloc_noio_save();
+ con->v2.gcm_tfm = crypto_alloc_aead("gcm(aes)", 0, 0);
+ memalloc_noio_restore(noio_flag);
+ if (IS_ERR(con->v2.gcm_tfm)) {
+ ret = PTR_ERR(con->v2.gcm_tfm);
+ con->v2.gcm_tfm = NULL;
+ pr_err("failed to allocate gcm tfm context: %d\n", ret);
+ return ret;
+ }
+
+ WARN_ON((unsigned long)con_secret &
+ crypto_aead_alignmask(con->v2.gcm_tfm));
+ ret = crypto_aead_setkey(con->v2.gcm_tfm, con_secret, CEPH_GCM_KEY_LEN);
+ if (ret) {
+ pr_err("failed to set gcm key: %d\n", ret);
+ return ret;
+ }
+
+ WARN_ON(crypto_aead_ivsize(con->v2.gcm_tfm) != CEPH_GCM_IV_LEN);
+ ret = crypto_aead_setauthsize(con->v2.gcm_tfm, CEPH_GCM_TAG_LEN);
+ if (ret) {
+ pr_err("failed to set gcm tag size: %d\n", ret);
+ return ret;
+ }
+
+ con->v2.gcm_req = aead_request_alloc(con->v2.gcm_tfm, GFP_NOIO);
+ if (!con->v2.gcm_req) {
+ pr_err("failed to allocate gcm request\n");
+ return -ENOMEM;
+ }
+
+ /* completion is waited for synchronously through gcm_wait */
+ crypto_init_wait(&con->v2.gcm_wait);
+ aead_request_set_callback(con->v2.gcm_req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &con->v2.gcm_wait);
+
+ /* the in nonce follows the key, the out nonce follows the in nonce */
+ memcpy(&con->v2.in_gcm_nonce, con_secret + CEPH_GCM_KEY_LEN,
+ CEPH_GCM_IV_LEN);
+ memcpy(&con->v2.out_gcm_nonce,
+ con_secret + CEPH_GCM_KEY_LEN + CEPH_GCM_IV_LEN,
+ CEPH_GCM_IV_LEN);
+ return 0; /* auth_x, secure mode */
+}
+
+/*
+ * Compute HMAC-SHA256 over the concatenation of @kvecs using the
+ * connection's HMAC key and store it in @hmac.  If no HMAC key has been
+ * set, @hmac is zero-filled instead.
+ */
+static void ceph_hmac_sha256(struct ceph_connection *con,
+			     const struct kvec *kvecs, int kvec_cnt,
+			     u8 hmac[SHA256_DIGEST_SIZE])
+{
+	struct hmac_sha256_ctx ctx;
+	int i = 0;
+
+	dout("%s con %p hmac_key_set %d kvec_cnt %d\n", __func__, con,
+	     con->v2.hmac_key_set, kvec_cnt);
+
+	if (con->v2.hmac_key_set) {
+		/* auth_x, both plain and secure modes */
+		hmac_sha256_init(&ctx, &con->v2.hmac_key);
+		while (i < kvec_cnt) {
+			hmac_sha256_update(&ctx, kvecs[i].iov_base,
+					   kvecs[i].iov_len);
+			i++;
+		}
+		hmac_sha256_final(&ctx, hmac);
+		return;
+	}
+
+	/* auth_none */
+	memset(hmac, 0, SHA256_DIGEST_SIZE);
+}
+
+/* Increment the little-endian 64-bit counter of @nonce by one. */
+static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce)
+{
+	u64 next = le64_to_cpu(nonce->counter) + 1;
+
+	nonce->counter = cpu_to_le64(next);
+}
+
+/*
+ * Run one AES-GCM operation on the connection's request and wait for it.
+ *
+ * @encrypt selects both the direction and the nonce: the out nonce for
+ * encryption, the in nonce for decryption.  The nonce is incremented only
+ * if the operation succeeds.
+ *
+ * Return: 0 on success, otherwise the error from crypto_wait_req().
+ */
+static int gcm_crypt(struct ceph_connection *con, bool encrypt,
+ struct scatterlist *src, struct scatterlist *dst,
+ int src_len)
+{
+ struct ceph_gcm_nonce *nonce;
+ int ret;
+
+ nonce = encrypt ? &con->v2.out_gcm_nonce : &con->v2.in_gcm_nonce;
+
+ aead_request_set_ad(con->v2.gcm_req, 0); /* no AAD */
+ aead_request_set_crypt(con->v2.gcm_req, src, dst, src_len, (u8 *)nonce);
+ ret = crypto_wait_req(encrypt ? crypto_aead_encrypt(con->v2.gcm_req) :
+ crypto_aead_decrypt(con->v2.gcm_req),
+ &con->v2.gcm_wait);
+ if (ret)
+ return ret;
+
+ gcm_inc_nonce(nonce);
+ return 0;
+}
+
+/*
+ * Describe the piece of data at the cursor's current position in @bv,
+ * after stepping over zero-length data items.  The cursor is not
+ * advanced past the returned piece.
+ */
+static void get_bvec_at(struct ceph_msg_data_cursor *cursor,
+			struct bio_vec *bv)
+{
+	size_t off, len;
+	struct page *page;
+
+	WARN_ON(!cursor->total_resid);
+
+	/* skip zero-length data items */
+	for (; !cursor->resid; )
+		ceph_msg_data_advance(cursor, 0);
+
+	/* get a piece of data, cursor isn't advanced */
+	page = ceph_msg_data_next(cursor, &off, &len);
+	bvec_set_page(bv, page, len, off);
+}
+
+/*
+ * Number of scatterlist entries init_sgs() uses for @buf: one per page
+ * for a vmalloc'ed buffer, a single one otherwise, plus one for padding
+ * if @buf_len needs it.  Zero for an empty buffer.
+ */
+static int calc_sg_cnt(void *buf, int buf_len)
+{
+	int sg_cnt = 0;
+
+	if (!buf_len)
+		return 0;
+
+	if (need_padding(buf_len))
+		sg_cnt++;
+
+	if (!is_vmalloc_addr(buf)) {
+		sg_cnt++;
+		return sg_cnt;
+	}
+
+	WARN_ON(offset_in_page(buf));
+	sg_cnt += PAGE_ALIGN(buf_len) >> PAGE_SHIFT;
+	return sg_cnt;
+}
+
+/*
+ * Number of scatterlist entries init_sgs_cursor() uses for the data left
+ * in @cursor: one per piece returned by get_bvec_at(), plus one for
+ * padding if the total needs it.  The cursor is consumed by the walk.
+ */
+static int calc_sg_cnt_cursor(struct ceph_msg_data_cursor *cursor)
+{
+	int total = cursor->total_resid;
+	struct bio_vec piece;
+	int sg_cnt = 0;
+
+	if (!total)
+		return 0;
+
+	if (need_padding(total))
+		sg_cnt = 1;
+
+	do {
+		get_bvec_at(cursor, &piece);
+		ceph_msg_data_advance(cursor, piece.bv_len);
+		sg_cnt++;
+	} while (cursor->total_resid);
+
+	return sg_cnt;
+}
+
+/*
+ * Fill scatterlist entries for @buf starting at *@sg and leave *@sg
+ * pointing at the entry after the last one used.
+ *
+ * A vmalloc'ed buffer gets one entry per page, any other buffer a single
+ * entry.  If @buf_len needs padding, one more entry covering
+ * padding_len(@buf_len) bytes of @pad is added.  Nothing is done for an
+ * empty buffer.
+ */
+static void init_sgs(struct scatterlist **sg, void *buf, int buf_len, u8 *pad)
+{
+ void *end = buf + buf_len;
+ struct page *page;
+ int len;
+ void *p;
+
+ if (!buf_len)
+ return;
+
+ if (is_vmalloc_addr(buf)) {
+ p = buf;
+ do {
+ page = vmalloc_to_page(p);
+ /* at most a page at a time; the last chunk may be short */
+ len = min_t(int, end - p, PAGE_SIZE);
+ WARN_ON(!page || !len || offset_in_page(p));
+ sg_set_page(*sg, page, len, 0);
+ *sg = sg_next(*sg);
+ p += len;
+ } while (p != end);
+ } else {
+ sg_set_buf(*sg, buf, buf_len);
+ *sg = sg_next(*sg);
+ }
+
+ if (need_padding(buf_len)) {
+ sg_set_buf(*sg, pad, padding_len(buf_len));
+ *sg = sg_next(*sg);
+ }
+}
+
+/*
+ * Fill scatterlist entries for the data left in @cursor starting at *@sg
+ * and leave *@sg pointing at the entry after the last one used.
+ *
+ * One entry is set per piece returned by get_bvec_at(); the cursor is
+ * consumed by the walk.  If the total length needs padding, one more
+ * entry covering that many bytes of @pad is added.
+ */
+static void init_sgs_cursor(struct scatterlist **sg,
+ struct ceph_msg_data_cursor *cursor, u8 *pad)
+{
+ /* sampled up front: total_resid is 0 once the loop is done */
+ int data_len = cursor->total_resid;
+ struct bio_vec bv;
+
+ if (!data_len)
+ return;
+
+ do {
+ get_bvec_at(cursor, &bv);
+ sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset);
+ *sg = sg_next(*sg);
+
+ ceph_msg_data_advance(cursor, bv.bv_len);
+ } while (cursor->total_resid);
+
+ if (need_padding(data_len)) {
+ sg_set_buf(*sg, pad, padding_len(data_len));
+ *sg = sg_next(*sg);
+ }
+}
+
+/**
+ * init_sgs_pages - set up scatterlist on an array of page pointers
+ * @sg: in/out pointer to the next scatterlist entry to populate
+ * @pages: pointer to page array
+ * @dpos: position in the array to start (bytes)
+ * @dlen: len to add to sg (bytes)
+ * @pad: buffer the padding entry points at, used only if @dlen needs padding
+ *
+ * Populate the scatterlist from the page array, starting at an arbitrary
+ * byte in the array and running for a specified length.  On return *@sg
+ * points at the entry after the last one used.
+ *
+ * The loop body runs at least once, so callers are expected to pass a
+ * non-zero @dlen.
+ */
+static void init_sgs_pages(struct scatterlist **sg, struct page **pages,
+ int dpos, int dlen, u8 *pad)
+{
+ int idx = dpos >> PAGE_SHIFT;
+ int off = offset_in_page(dpos);
+ int resid = dlen;
+
+ do {
+ int len = min(resid, (int)PAGE_SIZE - off);
+
+ sg_set_page(*sg, pages[idx], len, off);
+ *sg = sg_next(*sg);
+ /* only the first page can start at a non-zero offset */
+ off = 0;
+ ++idx;
+ resid -= len;
+ } while (resid);
+
+ if (need_padding(dlen)) {
+ sg_set_buf(*sg, pad, padding_len(dlen));
+ *sg = sg_next(*sg);
+ }
+}
+
+/*
+ * Allocate @sgt and populate it with the segments of @msg in on-wire
+ * order: front, middle, data (each followed by its padding entry, if
+ * needed), then a final entry for @epilogue.
+ *
+ * @front_pad, @middle_pad, @data_pad: buffers the padding entries point at
+ * @epilogue: buffer for the last entry; it covers CEPH_GCM_BLOCK_LEN
+ *            bytes, plus CEPH_GCM_TAG_LEN if @add_tag
+ * @pages, @dpos: if @pages is non-NULL, the data segment is taken from
+ *                this page array starting at byte @dpos instead of from
+ *                the message's data cursor
+ *
+ * The entry count is computed first with the calc_sg_cnt*() helpers and
+ * must match what the init_sgs*() helpers consume (checked by the
+ * WARN_ON before the epilogue entry is set).
+ *
+ * Return: 0 on success (including a message with no segments, in which
+ * case @sgt is left untouched), otherwise the sg_alloc_table() error.
+ */
+static int setup_message_sgs(struct sg_table *sgt, struct ceph_msg *msg,
+ u8 *front_pad, u8 *middle_pad, u8 *data_pad,
+ void *epilogue, struct page **pages, int dpos,
+ bool add_tag)
+{
+ struct ceph_msg_data_cursor cursor;
+ struct scatterlist *cur_sg;
+ int dlen = data_len(msg);
+ int sg_cnt;
+ int ret;
+
+ if (!front_len(msg) && !middle_len(msg) && !data_len(msg))
+ return 0;
+
+ sg_cnt = 1; /* epilogue + [auth tag] */
+ if (front_len(msg))
+ sg_cnt += calc_sg_cnt(msg->front.iov_base,
+ front_len(msg));
+ if (middle_len(msg))
+ sg_cnt += calc_sg_cnt(msg->middle->vec.iov_base,
+ middle_len(msg));
+ if (dlen) {
+ if (pages) {
+ sg_cnt += calc_pages_for(dpos, dlen);
+ if (need_padding(dlen))
+ sg_cnt++;
+ } else {
+ ceph_msg_data_cursor_init(&cursor, msg, dlen);
+ sg_cnt += calc_sg_cnt_cursor(&cursor);
+ }
+ }
+
+ ret = sg_alloc_table(sgt, sg_cnt, GFP_NOIO);
+ if (ret)
+ return ret;
+
+ cur_sg = sgt->sgl;
+ if (front_len(msg))
+ init_sgs(&cur_sg, msg->front.iov_base, front_len(msg),
+ front_pad);
+ if (middle_len(msg))
+ init_sgs(&cur_sg, msg->middle->vec.iov_base, middle_len(msg),
+ middle_pad);
+ if (dlen) {
+ if (pages) {
+ init_sgs_pages(&cur_sg, pages, dpos, dlen, data_pad);
+ } else {
+ /* counting consumed the cursor, so start it over */
+ ceph_msg_data_cursor_init(&cursor, msg, dlen);
+ init_sgs_cursor(&cur_sg, &cursor, data_pad);
+ }
+ }
+
+ WARN_ON(!sg_is_last(cur_sg));
+ sg_set_buf(cur_sg, epilogue,
+ CEPH_GCM_BLOCK_LEN + (add_tag ? CEPH_GCM_TAG_LEN : 0));
+ return 0;
+}
+
+/*
+ * Decrypt the CEPH_PREAMBLE_SECURE_LEN bytes at con->v2.in_buf in place.
+ *
+ * Return: the result of gcm_crypt().
+ */
+static int decrypt_preamble(struct ceph_connection *con)
+{
+	struct scatterlist sg;
+	int ret;
+
+	sg_init_one(&sg, con->v2.in_buf, CEPH_PREAMBLE_SECURE_LEN);
+	ret = gcm_crypt(con, false, &sg, &sg, CEPH_PREAMBLE_SECURE_LEN);
+	return ret;
+}
+
+/*
+ * Decrypt, in place, the part of the control segment that did not fit
+ * into the inline portion of the preamble.
+ *
+ * in_kvecs[0] is expected to hold the remainder itself and in_kvecs[1]
+ * its padding plus auth tag; the latter is taken from con->v2.in_buf
+ * when building the scatterlist.
+ *
+ * Return: the result of gcm_crypt().
+ */
+static int decrypt_control_remainder(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+ /* padding + tag */
+ int pt_len = padding_len(rem_len) + CEPH_GCM_TAG_LEN;
+ struct scatterlist sgs[2];
+
+ WARN_ON(con->v2.in_kvecs[0].iov_len != rem_len);
+ WARN_ON(con->v2.in_kvecs[1].iov_len != pt_len);
+
+ sg_init_table(sgs, 2);
+ sg_set_buf(&sgs[0], con->v2.in_kvecs[0].iov_base, rem_len);
+ sg_set_buf(&sgs[1], con->v2.in_buf, pt_len);
+
+ return gcm_crypt(con, false, sgs, sgs,
+ padded_len(rem_len) + CEPH_GCM_TAG_LEN);
+}
+
+/*
+ * Process sparse read data that lives in a buffer
+ *
+ * Repeatedly asks con->ops->sparse_read() how many bytes are wanted next
+ * and copies that many out of con->v2.in_enc_pages, starting at byte
+ * offset @spos: into the buffer the callback supplied, or, if it supplied
+ * none, into the message data described by the cursor.
+ *
+ * NOTE(review): @pages is never referenced; the source pages are always
+ * taken from con->v2.in_enc_pages.
+ *
+ * Return: the first <= 0 value returned by con->ops->sparse_read().
+ */
+static int process_v2_sparse_read(struct ceph_connection *con,
+ struct page **pages, int spos)
+{
+ struct ceph_msg_data_cursor cursor;
+ int ret;
+
+ ceph_msg_data_cursor_init(&cursor, con->in_msg,
+ con->in_msg->sparse_read_total);
+
+ for (;;) {
+ char *buf = NULL;
+
+ ret = con->ops->sparse_read(con, &cursor, &buf);
+ if (ret <= 0)
+ return ret;
+
+ dout("%s: sparse_read return %x buf %p\n", __func__, ret, buf);
+
+ /* copy out ret bytes, never crossing a source page in one go */
+ do {
+ int idx = spos >> PAGE_SHIFT;
+ int soff = offset_in_page(spos);
+ struct page *spage = con->v2.in_enc_pages[idx];
+ int len = min_t(int, ret, PAGE_SIZE - soff);
+
+ if (buf) {
+ memcpy_from_page(buf, spage, soff, len);
+ buf += len;
+ } else {
+ struct bio_vec bv;
+
+ /* also limited by the current destination piece */
+ get_bvec_at(&cursor, &bv);
+ len = min_t(int, len, bv.bv_len);
+ memcpy_page(bv.bv_page, bv.bv_offset,
+ spage, soff, len);
+ ceph_msg_data_advance(&cursor, len);
+ }
+ spos += len;
+ ret -= len;
+ } while (ret);
+ }
+}
+
+/*
+ * Decrypt the message tail held in con->v2.in_enc_pages.
+ *
+ * The source scatterlist covers the encrypted pages.  The destination is
+ * built by setup_message_sgs(): for a regular message the data segment
+ * goes to the message's data items; for a sparse read it goes back to the
+ * encrypted pages at offset dpos and process_v2_sparse_read() then
+ * distributes it.
+ *
+ * On success the encrypted pages are released.  On any failure they are
+ * left in place.
+ * NOTE(review): presumably a later reset path frees them -- confirm.
+ *
+ * Return: 0 on success, otherwise a negative error from the scatterlist
+ * setup, gcm_crypt() or process_v2_sparse_read().
+ */
+static int decrypt_tail(struct ceph_connection *con)
+{
+ struct sg_table enc_sgt = {};
+ struct sg_table sgt = {};
+ struct page **pages = NULL;
+ bool sparse = !!con->in_msg->sparse_read_total;
+ int dpos = 0;
+ int tail_len;
+ int ret;
+
+ tail_len = tail_onwire_len(con->in_msg, true);
+ ret = sg_alloc_table_from_pages(&enc_sgt, con->v2.in_enc_pages,
+ con->v2.in_enc_page_cnt, 0, tail_len,
+ GFP_NOIO);
+ if (ret)
+ goto out;
+
+ if (sparse) {
+ /*
+ * Offset of the data segment within the tail.  The inner
+ * padded_len() result is block-aligned, so this equals
+ * padded_len(front) + padded_len(middle).
+ */
+ dpos = padded_len(front_len(con->in_msg) + padded_len(middle_len(con->in_msg)));
+ pages = con->v2.in_enc_pages;
+ }
+
+ ret = setup_message_sgs(&sgt, con->in_msg, FRONT_PAD(con->v2.in_buf),
+ MIDDLE_PAD(con->v2.in_buf), DATA_PAD(con->v2.in_buf),
+ con->v2.in_buf, pages, dpos, true);
+ if (ret)
+ goto out;
+
+ dout("%s con %p msg %p enc_page_cnt %d sg_cnt %d\n", __func__, con,
+ con->in_msg, con->v2.in_enc_page_cnt, sgt.orig_nents);
+ ret = gcm_crypt(con, false, enc_sgt.sgl, sgt.sgl, tail_len);
+ if (ret)
+ goto out;
+
+ if (sparse && data_len(con->in_msg)) {
+ ret = process_v2_sparse_read(con, con->v2.in_enc_pages, dpos);
+ if (ret)
+ goto out;
+ }
+
+ WARN_ON(!con->v2.in_enc_page_cnt);
+ ceph_release_page_vector(con->v2.in_enc_pages,
+ con->v2.in_enc_page_cnt);
+ con->v2.in_enc_pages = NULL;
+ con->v2.in_enc_page_cnt = 0;
+
+out:
+ /* both tables start zeroed, so freeing is fine on every path */
+ sg_free_table(&sgt);
+ sg_free_table(&enc_sgt);
+ return ret;
+}
+
+/*
+ * Queue the banner for sending: the CEPH_BANNER_V2 string, a 16-bit
+ * payload length and two 64-bit feature words (supported, required).
+ * The buffer is also added to the out sign kvecs and WRITE_PENDING is
+ * set.
+ *
+ * Return: 0 on success, -ENOMEM if the buffer could not be allocated.
+ */
+static int prepare_banner(struct ceph_connection *con)
+{
+	/* banner + u16 payload length + two u64 feature words */
+	int buf_len = CEPH_BANNER_V2_LEN + 2 + 8 + 8;
+	void *buf;
+	void *pos;
+
+	buf = alloc_conn_buf(con, buf_len);
+	if (!buf)
+		return -ENOMEM;
+
+	pos = buf;
+	ceph_encode_copy(&pos, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN);
+	ceph_encode_16(&pos, sizeof(u64) + sizeof(u64));
+	ceph_encode_64(&pos, CEPH_MSGR2_SUPPORTED_FEATURES);
+	ceph_encode_64(&pos, CEPH_MSGR2_REQUIRED_FEATURES);
+	WARN_ON(pos != buf + buf_len);
+
+	add_out_kvec(con, buf, buf_len);
+	add_out_sign_kvec(con, buf, buf_len);
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+	return 0;
+}
+
+/*
+ * base:
+ * preamble
+ * control body (ctrl_len bytes)
+ * space for control crc
+ *
+ * extdata (optional):
+ * control body (extdata_len bytes)
+ *
+ * Compute control crc and gather base and extdata into:
+ *
+ * preamble
+ * control body (ctrl_len + extdata_len bytes)
+ * control crc
+ *
+ * Preamble should already be encoded at the start of base.
+ *
+ * The crc is seeded with -1 and covers the control body in base followed
+ * by extdata.  If @to_be_signed, the same pieces are also added, in the
+ * same order, to the out sign kvecs.
+ */
+static void prepare_head_plain(struct ceph_connection *con, void *base,
+ int ctrl_len, void *extdata, int extdata_len,
+ bool to_be_signed)
+{
+ int base_len = CEPH_PREAMBLE_LEN + ctrl_len + CEPH_CRC_LEN;
+ void *crcp = base + base_len - CEPH_CRC_LEN;
+ u32 crc;
+
+ crc = crc32c(-1, CTRL_BODY(base), ctrl_len);
+ if (extdata_len)
+ crc = crc32c(crc, extdata, extdata_len);
+ put_unaligned_le32(crc, crcp);
+
+ /* no extdata: base is already contiguous in on-wire order */
+ if (!extdata_len) {
+ add_out_kvec(con, base, base_len);
+ if (to_be_signed)
+ add_out_sign_kvec(con, base, base_len);
+ return;
+ }
+
+ /* extdata goes between the control body in base and the crc */
+ add_out_kvec(con, base, crcp - base);
+ add_out_kvec(con, extdata, extdata_len);
+ add_out_kvec(con, crcp, CEPH_CRC_LEN);
+ if (to_be_signed) {
+ add_out_sign_kvec(con, base, crcp - base);
+ add_out_sign_kvec(con, extdata, extdata_len);
+ add_out_sign_kvec(con, crcp, CEPH_CRC_LEN);
+ }
+}
+
+/*
+ * Encrypt and queue a secure-mode head whose control body fits entirely
+ * into the inline portion of the preamble.
+ *
+ * Any unused part of the inline area is zeroed, then the first
+ * CEPH_PREAMBLE_SECURE_LEN - CEPH_GCM_TAG_LEN bytes of @base are
+ * encrypted in place; the scatterlist covers the full
+ * CEPH_PREAMBLE_SECURE_LEN bytes, all of which are then queued.
+ *
+ * Preamble should already be encoded at the start of base.
+ *
+ * Return: 0 on success, otherwise the error from gcm_crypt().
+ */
+static int prepare_head_secure_small(struct ceph_connection *con,
+ void *base, int ctrl_len)
+{
+ struct scatterlist sg;
+ int ret;
+
+ /* inline buffer padding? */
+ if (ctrl_len < CEPH_PREAMBLE_INLINE_LEN)
+ memset(CTRL_BODY(base) + ctrl_len, 0,
+ CEPH_PREAMBLE_INLINE_LEN - ctrl_len);
+
+ sg_init_one(&sg, base, CEPH_PREAMBLE_SECURE_LEN);
+ ret = gcm_crypt(con, true, &sg, &sg,
+ CEPH_PREAMBLE_SECURE_LEN - CEPH_GCM_TAG_LEN);
+ if (ret)
+ return ret;
+
+ add_out_kvec(con, base, CEPH_PREAMBLE_SECURE_LEN);
+ return 0;
+}
+
+/*
+ * base:
+ * preamble
+ * control body (ctrl_len bytes)
+ * space for padding, if needed
+ * space for control remainder auth tag
+ * space for preamble auth tag
+ *
+ * Encrypt preamble and the inline portion, then encrypt the remainder
+ * and gather into:
+ *
+ * preamble
+ * control body (48 bytes)
+ * preamble auth tag
+ * control body (ctrl_len - 48 bytes)
+ * zero padding, if needed
+ * control remainder auth tag
+ *
+ * Preamble should already be encoded at the start of base.
+ *
+ * Returns 0 on success or the error returned by gcm_crypt().
+ */
+static int prepare_head_secure_big(struct ceph_connection *con,
+ void *base, int ctrl_len)
+{
+ /* the part of the control body that does not fit inline */
+ int rem_len = ctrl_len - CEPH_PREAMBLE_INLINE_LEN;
+ void *rem = CTRL_BODY(base) + CEPH_PREAMBLE_INLINE_LEN;
+ void *rem_tag = rem + padded_len(rem_len);
+ void *pmbl_tag = rem_tag + CEPH_GCM_TAG_LEN;
+ struct scatterlist sgs[2];
+ int ret;
+
+ /*
+ * First pass: preamble + inline portion, with the second sg entry
+ * pointing at the preamble tag slot at the very end of base.
+ */
+ sg_init_table(sgs, 2);
+ sg_set_buf(&sgs[0], base, rem - base);
+ sg_set_buf(&sgs[1], pmbl_tag, CEPH_GCM_TAG_LEN);
+ ret = gcm_crypt(con, true, sgs, sgs, rem - base);
+ if (ret)
+ return ret;
+
+ /* control remainder padding? */
+ if (need_padding(rem_len))
+ memset(rem + rem_len, 0, padding_len(rem_len));
+
+ /* second pass: remainder + padding, sg also spans the remainder tag slot */
+ sg_init_one(&sgs[0], rem, pmbl_tag - rem);
+ ret = gcm_crypt(con, true, sgs, sgs, rem_tag - rem);
+ if (ret)
+ return ret;
+
+ /* queue in on-wire order: the preamble tag goes before the remainder */
+ add_out_kvec(con, base, rem - base);
+ add_out_kvec(con, pmbl_tag, CEPH_GCM_TAG_LEN);
+ add_out_kvec(con, rem, pmbl_tag - rem);
+ return 0;
+}
+
+/*
+ * Encode the preamble for a control frame with the given tag into base
+ * and queue the frame for sending.
+ *
+ * base holds room for the preamble followed by ctrl_len bytes of control
+ * body; extdata/extdata_len describe optional additional control body
+ * bytes kept in a separate buffer.  The frame is described as a single
+ * segment of ctrl_len + extdata_len bytes.
+ *
+ * In secure mode neither extdata nor signing is supported here.
+ *
+ * Returns 0 on success, -EINVAL on invalid arguments, or the error from
+ * the secure head helpers.
+ */
+static int __prepare_control(struct ceph_connection *con, int tag,
+ void *base, int ctrl_len, void *extdata,
+ int extdata_len, bool to_be_signed)
+{
+ int total_len = ctrl_len + extdata_len;
+ struct ceph_frame_desc desc;
+ int ret;
+
+ dout("%s con %p tag %d len %d (%d+%d)\n", __func__, con, tag,
+ total_len, ctrl_len, extdata_len);
+
+ /* extdata may be vmalloc'ed but not base */
+ if (WARN_ON(is_vmalloc_addr(base) || !ctrl_len))
+ return -EINVAL;
+
+ init_frame_desc(&desc, tag, &total_len, 1);
+ encode_preamble(&desc, base);
+
+ if (con_secure(con)) {
+ if (WARN_ON(extdata_len || to_be_signed))
+ return -EINVAL;
+
+ if (ctrl_len <= CEPH_PREAMBLE_INLINE_LEN)
+ /* fully inlined, inline buffer may need padding */
+ ret = prepare_head_secure_small(con, base, ctrl_len);
+ else
+ /* partially inlined, inline buffer is full */
+ ret = prepare_head_secure_big(con, base, ctrl_len);
+ if (ret)
+ return ret;
+ } else {
+ prepare_head_plain(con, base, ctrl_len, extdata, extdata_len,
+ to_be_signed);
+ }
+
+ /* flag the connection as having output queued */
+ ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+ return 0;
+}
+
+/*
+ * Convenience wrapper around __prepare_control() for the common case of
+ * a control frame with no external data that is not to be signed.
+ */
+static int prepare_control(struct ceph_connection *con, int tag,
+			   void *base, int ctrl_len)
+{
+	const bool to_be_signed = false;
+
+	return __prepare_control(con, tag, base, ctrl_len, NULL, 0,
+				 to_be_signed);
+}
+
+/*
+ * Queue a HELLO frame: one byte of entity type (client) followed by the
+ * encoded peer address.  The frame is sized for plain mode and is added
+ * to the outgoing signature kvecs.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
+ * __prepare_control().
+ */
+static int prepare_hello(struct ceph_connection *con)
+{
+	int body_len = 1 + ceph_entity_addr_encoding_len(&con->peer_addr);
+	void *frame, *body, *pos;
+
+	frame = alloc_conn_buf(con, head_onwire_len(body_len, false));
+	if (!frame)
+		return -ENOMEM;
+
+	body = CTRL_BODY(frame);
+	pos = body;
+	ceph_encode_8(&pos, CEPH_ENTITY_TYPE_CLIENT);
+	ceph_encode_entity_addr(&pos, &con->peer_addr);
+	/* the encoder must have produced exactly body_len bytes */
+	WARN_ON(pos != body + body_len);
+
+	return __prepare_control(con, FRAME_TAG_HELLO, frame, body_len,
+				 NULL, 0, true);
+}
+
+/* so that head_onwire_len(AUTH_BUF_LEN, false) is 512 */
+#define AUTH_BUF_LEN (512 - CEPH_CRC_LEN - CEPH_PREAMBLE_PLAIN_LEN)
+
+/*
+ * Queue an AUTH_REQUEST frame.
+ *
+ * The control body is filled in by the ->get_auth_request() callout,
+ * which is given AUTH_BUF_LEN bytes and updates ctrl_len; the authorizer
+ * it returns is sent as external control data.  The frame is signed.
+ *
+ * con->mutex is dropped around the callout, so the connection state is
+ * rechecked afterwards.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EAGAIN if the
+ * connection left CEPH_CON_S_V2_HELLO during the callout, or the error
+ * from the callout / __prepare_control().
+ */
+static int prepare_auth_request(struct ceph_connection *con)
+{
+ void *authorizer, *authorizer_copy;
+ int ctrl_len, authorizer_len;
+ void *buf;
+ int ret;
+
+ ctrl_len = AUTH_BUF_LEN;
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
+ if (!buf)
+ return -ENOMEM;
+
+ mutex_unlock(&con->mutex);
+ ret = con->ops->get_auth_request(con, CTRL_BODY(buf), &ctrl_len,
+ &authorizer, &authorizer_len);
+ mutex_lock(&con->mutex);
+ /* the state check takes precedence over the callout's return value */
+ if (con->state != CEPH_CON_S_V2_HELLO) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ dout("%s con %p get_auth_request ret %d\n", __func__, con, ret);
+ if (ret)
+ return ret;
+
+ /*
+ * Send a private copy of the authorizer.
+ * NOTE(review): presumably because the original is owned by the
+ * auth layer and may change while the frame is queued -- confirm.
+ */
+ authorizer_copy = alloc_conn_buf(con, authorizer_len);
+ if (!authorizer_copy)
+ return -ENOMEM;
+
+ memcpy(authorizer_copy, authorizer, authorizer_len);
+
+ return __prepare_control(con, FRAME_TAG_AUTH_REQUEST, buf, ctrl_len,
+ authorizer_copy, authorizer_len, true);
+}
+
+/*
+ * Queue an AUTH_REQUEST_MORE frame in response to an auth_reply_more
+ * payload (reply, reply_len).
+ *
+ * The control body is filled in by the ->handle_auth_reply_more()
+ * callout, which is given AUTH_BUF_LEN bytes and updates ctrl_len; the
+ * authorizer it returns is sent as external control data, without being
+ * copied.  The frame is signed.
+ *
+ * con->mutex is dropped around the callout, so the connection state is
+ * rechecked afterwards.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EAGAIN if the
+ * connection left CEPH_CON_S_V2_AUTH during the callout, or the error
+ * from the callout / __prepare_control().
+ */
+static int prepare_auth_request_more(struct ceph_connection *con,
+ void *reply, int reply_len)
+{
+ int ctrl_len, authorizer_len;
+ void *authorizer;
+ void *buf;
+ int ret;
+
+ ctrl_len = AUTH_BUF_LEN;
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, false));
+ if (!buf)
+ return -ENOMEM;
+
+ mutex_unlock(&con->mutex);
+ ret = con->ops->handle_auth_reply_more(con, reply, reply_len,
+ CTRL_BODY(buf), &ctrl_len,
+ &authorizer, &authorizer_len);
+ mutex_lock(&con->mutex);
+ /* the state check takes precedence over the callout's return value */
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ dout("%s con %p handle_auth_reply_more ret %d\n", __func__, con, ret);
+ if (ret)
+ return ret;
+
+ return __prepare_control(con, FRAME_TAG_AUTH_REQUEST_MORE, buf,
+ ctrl_len, authorizer, authorizer_len, true);
+}
+
+/*
+ * Queue an AUTH_SIGNATURE frame whose control body is the HMAC-SHA256
+ * computed over the incoming signature kvecs.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
+ * prepare_control().
+ */
+static int prepare_auth_signature(struct ceph_connection *con)
+{
+	int frame_len = head_onwire_len(SHA256_DIGEST_SIZE, con_secure(con));
+	void *frame;
+
+	frame = alloc_conn_buf(con, frame_len);
+	if (!frame)
+		return -ENOMEM;
+
+	/* the digest is written straight into the control body */
+	ceph_hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt,
+			 CTRL_BODY(frame));
+
+	return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, frame,
+			       SHA256_DIGEST_SIZE);
+}
+
+/*
+ * Queue a CLIENT_IDENT frame for a fresh session.
+ *
+ * A non-zero client cookie is generated if the connection does not have
+ * one yet.  The control body carries our address (as a one-element
+ * address vector), the peer address, and six 64-bit fields: global_id,
+ * global_seq, supported features, required features, flags (0) and the
+ * client cookie.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
+ * prepare_control().
+ */
+static int prepare_client_ident(struct ceph_connection *con)
+{
+ struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+ struct ceph_client *client = from_msgr(con->msgr);
+ u64 global_id = ceph_client_gid(client);
+ void *buf, *p;
+ int ctrl_len;
+
+ /* a fresh session is expected to carry no state from a previous one */
+ WARN_ON(con->v2.server_cookie);
+ WARN_ON(con->v2.connect_seq);
+ WARN_ON(con->v2.peer_global_seq);
+
+ if (!con->v2.client_cookie) {
+ /* retry until the random cookie is non-zero; 0 means "unset" here */
+ do {
+ get_random_bytes(&con->v2.client_cookie,
+ sizeof(con->v2.client_cookie));
+ } while (!con->v2.client_cookie);
+ dout("%s con %p generated cookie 0x%llx\n", __func__, con,
+ con->v2.client_cookie);
+ } else {
+ dout("%s con %p cookie already set 0x%llx\n", __func__, con,
+ con->v2.client_cookie);
+ }
+
+ dout("%s con %p my_addr %s/%u peer_addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx cookie 0x%llx\n",
+ __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce),
+ ceph_pr_addr(&con->peer_addr), le32_to_cpu(con->peer_addr.nonce),
+ global_id, con->v2.global_seq, client->supported_features,
+ client->required_features, con->v2.client_cookie);
+
+ /* marker (1) + addr_cnt (4) + two addrs + six 64-bit fields */
+ ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) +
+ ceph_entity_addr_encoding_len(&con->peer_addr) + 6 * 8;
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con)));
+ if (!buf)
+ return -ENOMEM;
+
+ p = CTRL_BODY(buf);
+ ceph_encode_8(&p, 2); /* addrvec marker */
+ ceph_encode_32(&p, 1); /* addr_cnt */
+ ceph_encode_entity_addr(&p, my_addr);
+ ceph_encode_entity_addr(&p, &con->peer_addr);
+ ceph_encode_64(&p, global_id);
+ ceph_encode_64(&p, con->v2.global_seq);
+ ceph_encode_64(&p, client->supported_features);
+ ceph_encode_64(&p, client->required_features);
+ ceph_encode_64(&p, 0); /* flags */
+ ceph_encode_64(&p, con->v2.client_cookie);
+ /* the encoded size must match the ctrl_len computed above */
+ WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
+
+ return prepare_control(con, FRAME_TAG_CLIENT_IDENT, buf, ctrl_len);
+}
+
+/*
+ * Queue a SESSION_RECONNECT frame for an existing session.
+ *
+ * The control body carries our address (as a one-element address
+ * vector) and five 64-bit fields: client cookie, server cookie,
+ * global_seq, connect_seq and in_seq.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or the error from
+ * prepare_control().
+ */
+static int prepare_session_reconnect(struct ceph_connection *con)
+{
+ struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+ void *buf, *p;
+ int ctrl_len;
+
+ /* reconnecting only makes sense with established session state */
+ WARN_ON(!con->v2.client_cookie);
+ WARN_ON(!con->v2.server_cookie);
+ WARN_ON(!con->v2.connect_seq);
+ WARN_ON(!con->v2.peer_global_seq);
+
+ dout("%s con %p my_addr %s/%u client_cookie 0x%llx server_cookie 0x%llx global_seq %llu connect_seq %llu in_seq %llu\n",
+ __func__, con, ceph_pr_addr(my_addr), le32_to_cpu(my_addr->nonce),
+ con->v2.client_cookie, con->v2.server_cookie, con->v2.global_seq,
+ con->v2.connect_seq, con->in_seq);
+
+ /* marker (1) + addr count (4) + our addr + five 64-bit fields */
+ ctrl_len = 1 + 4 + ceph_entity_addr_encoding_len(my_addr) + 5 * 8;
+ buf = alloc_conn_buf(con, head_onwire_len(ctrl_len, con_secure(con)));
+ if (!buf)
+ return -ENOMEM;
+
+ p = CTRL_BODY(buf);
+ ceph_encode_8(&p, 2); /* entity_addrvec_t marker */
+ ceph_encode_32(&p, 1); /* my_addrs len */
+ ceph_encode_entity_addr(&p, my_addr);
+ ceph_encode_64(&p, con->v2.client_cookie);
+ ceph_encode_64(&p, con->v2.server_cookie);
+ ceph_encode_64(&p, con->v2.global_seq);
+ ceph_encode_64(&p, con->v2.connect_seq);
+ ceph_encode_64(&p, con->in_seq);
+ /* the encoded size must match the ctrl_len computed above */
+ WARN_ON(p != CTRL_BODY(buf) + ctrl_len);
+
+ return prepare_control(con, FRAME_TAG_SESSION_RECONNECT, buf, ctrl_len);
+}
+
+/*
+ * Queue a KEEPALIVE2 frame carrying the current real-time clock value,
+ * encoded as a struct ceph_timespec in the connection's out_buf.
+ *
+ * Returns the result of prepare_control().
+ */
+static int prepare_keepalive2(struct ceph_connection *con)
+{
+	struct timespec64 wall;
+	struct ceph_timespec *stamp;
+
+	ktime_get_real_ts64(&wall);
+	dout("%s con %p timestamp %ptSp\n", __func__, con, &wall);
+
+	stamp = CTRL_BODY(con->v2.out_buf);
+	ceph_encode_timespec64(stamp, &wall);
+
+	reset_out_kvecs(con);
+	return prepare_control(con, FRAME_TAG_KEEPALIVE2, con->v2.out_buf,
+			       sizeof(struct ceph_timespec));
+}
+
+/*
+ * Queue an ACK frame acknowledging everything received so far: in_seq
+ * is recorded as in_seq_acked and sent as the 8-byte control body.
+ *
+ * Returns the result of prepare_control().
+ */
+static int prepare_ack(struct ceph_connection *con)
+{
+	void *pos = CTRL_BODY(con->v2.out_buf);
+
+	dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con,
+	     con->in_seq_acked, con->in_seq);
+	con->in_seq_acked = con->in_seq;
+
+	ceph_encode_64(&pos, con->in_seq_acked);
+
+	reset_out_kvecs(con);
+	return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8);
+}
+
+/*
+ * Encode the plain-mode epilogue for the outgoing message and queue it
+ * for sending.  The segment crcs in con->v2.out_epil are expected to
+ * have been filled in by the caller; msg is used for logging only.
+ */
+static void prepare_epilogue_plain(struct ceph_connection *con,
+				   struct ceph_msg *msg, bool aborted)
+{
+	void *epil = &con->v2.out_epil;
+
+	dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con,
+	     msg, aborted, con->v2.out_epil.front_crc,
+	     con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc);
+
+	encode_epilogue_plain(con, aborted);
+	add_out_kvec(con, epil, CEPH_EPILOGUE_PLAIN_LEN);
+}
+
+/*
+ * Queue the head of a plain-mode message (already encoded in
+ * con->v2.out_buf) together with its front and middle segments, record
+ * their crcs in con->v2.out_epil and pick the next output state.
+ *
+ * For "used" empty segments, crc is -1. For unused (trailing)
+ * segments, crc is 0.
+ */
+static void prepare_message_plain(struct ceph_connection *con,
+ struct ceph_msg *msg)
+{
+ prepare_head_plain(con, con->v2.out_buf,
+ sizeof(struct ceph_msg_header2), NULL, 0, false);
+
+ if (!front_len(msg) && !middle_len(msg)) {
+ if (!data_len(msg)) {
+ /*
+ * Empty message: once the head is written,
+ * we are done -- there is no epilogue.
+ */
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+ return;
+ }
+
+ /* data only: front and middle are "used" but empty */
+ con->v2.out_epil.front_crc = -1;
+ con->v2.out_epil.middle_crc = -1;
+ con->v2.out_state = OUT_S_QUEUE_DATA;
+ return;
+ }
+
+ if (front_len(msg)) {
+ con->v2.out_epil.front_crc = crc32c(-1, msg->front.iov_base,
+ front_len(msg));
+ add_out_kvec(con, msg->front.iov_base, front_len(msg));
+ } else {
+ /* middle (at least) is there, checked above */
+ con->v2.out_epil.front_crc = -1;
+ }
+
+ if (middle_len(msg)) {
+ con->v2.out_epil.middle_crc =
+ crc32c(-1, msg->middle->vec.iov_base, middle_len(msg));
+ add_out_kvec(con, msg->middle->vec.iov_base, middle_len(msg));
+ } else {
+ /* empty middle is "used" only if a data segment follows it */
+ con->v2.out_epil.middle_crc = data_len(msg) ? -1 : 0;
+ }
+
+ if (data_len(msg)) {
+ /* data crc and epilogue are not handled here in this case */
+ con->v2.out_state = OUT_S_QUEUE_DATA;
+ } else {
+ con->v2.out_epil.data_crc = 0;
+ prepare_epilogue_plain(con, msg, false);
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+ }
+}
+
+/*
+ * Unfortunately the kernel crypto API doesn't support streaming
+ * (piecewise) operation for AEAD algorithms, so we can't get away
+ * with a fixed size buffer and a couple sgs. Instead, we have to
+ * allocate pages for the entire tail of the message (currently up
+ * to ~32M) and two sgs arrays (up to ~256K each)...
+ *
+ * Returns 0 on success or a negative error.  Both scatterlist tables
+ * are freed on every path that reaches "out"; the page vector is handed
+ * to con->v2.out_enc_pages as soon as it is allocated and is not freed
+ * here on later errors.
+ * NOTE(review): presumably it is released with the rest of the
+ * connection's output state -- confirm.
+ */
+static int prepare_message_secure(struct ceph_connection *con,
+ struct ceph_msg *msg)
+{
+ void *zerop = page_address(ceph_zero_page);
+ struct sg_table enc_sgt = {};
+ struct sg_table sgt = {};
+ struct page **enc_pages;
+ int enc_page_cnt;
+ int tail_len;
+ int ret;
+
+ /* the head (preamble + message header) is handled separately, in out_buf */
+ ret = prepare_head_secure_small(con, con->v2.out_buf,
+ sizeof(struct ceph_msg_header2));
+ if (ret)
+ return ret;
+
+ tail_len = tail_onwire_len(msg, true);
+ if (!tail_len) {
+ /*
+ * Empty message: once the head is written,
+ * we are done -- there is no epilogue.
+ */
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+ return 0;
+ }
+
+ encode_epilogue_secure(con, false);
+ /* source sgs: message segments plus epilogue; zerop is passed for each pad argument */
+ ret = setup_message_sgs(&sgt, msg, zerop, zerop, zerop,
+ &con->v2.out_epil, NULL, 0, false);
+ if (ret)
+ goto out;
+
+ enc_page_cnt = calc_pages_for(0, tail_len);
+ enc_pages = ceph_alloc_page_vector(enc_page_cnt, GFP_NOIO);
+ if (IS_ERR(enc_pages)) {
+ ret = PTR_ERR(enc_pages);
+ goto out;
+ }
+
+ /* a previous message's pages are expected to be gone by now */
+ WARN_ON(con->v2.out_enc_pages || con->v2.out_enc_page_cnt);
+ con->v2.out_enc_pages = enc_pages;
+ con->v2.out_enc_page_cnt = enc_page_cnt;
+ con->v2.out_enc_resid = tail_len;
+ con->v2.out_enc_i = 0;
+
+ ret = sg_alloc_table_from_pages(&enc_sgt, enc_pages, enc_page_cnt,
+ 0, tail_len, GFP_NOIO);
+ if (ret)
+ goto out;
+
+ /* length passed excludes the trailing CEPH_GCM_TAG_LEN bytes of the tail */
+ ret = gcm_crypt(con, true, sgt.sgl, enc_sgt.sgl,
+ tail_len - CEPH_GCM_TAG_LEN);
+ if (ret)
+ goto out;
+
+ dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con,
+ msg, sgt.orig_nents, enc_page_cnt);
+ con->v2.out_state = OUT_S_QUEUE_ENC_PAGE;
+
+out:
+ /* both tables start zero-initialized, so this is reached on early errors too */
+ sg_free_table(&sgt);
+ sg_free_table(&enc_sgt);
+ return ret;
+}
+
+/*
+ * Start sending msg: encode the MESSAGE frame preamble and header into
+ * con->v2.out_buf (piggybacking an ack of everything received so far)
+ * and hand off to the plain or secure variant.
+ *
+ * Returns 0 on success or the error from prepare_message_secure().
+ */
+static int prepare_message(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	struct ceph_frame_desc desc;
+	int seg_lens[] = {
+		sizeof(struct ceph_msg_header2),
+		front_len(msg),
+		middle_len(msg),
+		data_len(msg)
+	};
+	int ret;
+
+	dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con,
+	     msg, seg_lens[0], seg_lens[1], seg_lens[2], seg_lens[3]);
+
+	/* the header carries the ack, so bring in_seq_acked up to date */
+	if (con->in_seq > con->in_seq_acked) {
+		dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con,
+		     con->in_seq_acked, con->in_seq);
+		con->in_seq_acked = con->in_seq;
+	}
+
+	reset_out_kvecs(con);
+	init_frame_desc(&desc, FRAME_TAG_MESSAGE, seg_lens, 4);
+	encode_preamble(&desc, con->v2.out_buf);
+	fill_header2(CTRL_BODY(con->v2.out_buf), &msg->hdr,
+		     con->in_seq_acked);
+
+	if (!con_secure(con)) {
+		prepare_message_plain(con, msg);
+	} else {
+		ret = prepare_message_secure(con, msg);
+		if (ret)
+			return ret;
+	}
+
+	ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING);
+	return 0;
+}
+
+/*
+ * Arrange to read the fixed-size banner prefix from the peer into a
+ * freshly allocated buffer, which is also added to the incoming
+ * signature kvecs, and move to CEPH_CON_S_V2_BANNER_PREFIX.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure.
+ */
+static int prepare_read_banner_prefix(struct ceph_connection *con)
+{
+	const int len = CEPH_BANNER_V2_PREFIX_LEN;
+	void *prefix;
+
+	prefix = alloc_conn_buf(con, len);
+	if (!prefix)
+		return -ENOMEM;
+
+	reset_in_kvecs(con);
+	add_in_kvec(con, prefix, len);
+	add_in_sign_kvec(con, prefix, len);
+	con->state = CEPH_CON_S_V2_BANNER_PREFIX;
+	return 0;
+}
+
+/*
+ * Arrange to read payload_len bytes of banner payload into a freshly
+ * allocated buffer, which is also added to the incoming signature
+ * kvecs, and move to CEPH_CON_S_V2_BANNER_PAYLOAD.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure.
+ */
+static int prepare_read_banner_payload(struct ceph_connection *con,
+				       int payload_len)
+{
+	void *payload = alloc_conn_buf(con, payload_len);
+
+	if (!payload)
+		return -ENOMEM;
+
+	reset_in_kvecs(con);
+	add_in_kvec(con, payload, payload_len);
+	add_in_sign_kvec(con, payload, payload_len);
+	con->state = CEPH_CON_S_V2_BANNER_PAYLOAD;
+	return 0;
+}
+
+/*
+ * Arrange to read the next frame preamble into con->v2.in_buf.  The
+ * number of bytes to read depends on whether the connection is in
+ * secure mode.
+ */
+static void prepare_read_preamble(struct ceph_connection *con)
+{
+	int len;
+
+	reset_in_kvecs(con);
+	if (con_secure(con))
+		len = CEPH_PREAMBLE_SECURE_LEN;
+	else
+		len = CEPH_PREAMBLE_PLAIN_LEN;
+	add_in_kvec(con, con->v2.in_buf, len);
+	con->v2.in_state = IN_S_HANDLE_PREAMBLE;
+}
+
+/*
+ * Arrange to read the control body (and its crc) of a plain-mode
+ * control frame whose preamble is already in con->v2.in_buf.
+ *
+ * During the HELLO and AUTH states the whole head, preamble included,
+ * is kept in a dedicated buffer that is added to the incoming signature
+ * kvecs.  Otherwise the body goes into the inline area of in_buf when
+ * it fits, or into a separately allocated buffer when it does not.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure.
+ */
+static int prepare_read_control(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ int head_len;
+ void *buf;
+
+ reset_in_kvecs(con);
+ if (con->state == CEPH_CON_S_V2_HELLO ||
+ con->state == CEPH_CON_S_V2_AUTH) {
+ head_len = head_onwire_len(ctrl_len, false);
+ buf = alloc_conn_buf(con, head_len);
+ if (!buf)
+ return -ENOMEM;
+
+ /* preserve preamble */
+ memcpy(buf, con->v2.in_buf, CEPH_PREAMBLE_LEN);
+
+ /* body and crc land right after the copied preamble */
+ add_in_kvec(con, CTRL_BODY(buf), ctrl_len);
+ add_in_kvec(con, CTRL_BODY(buf) + ctrl_len, CEPH_CRC_LEN);
+ add_in_sign_kvec(con, buf, head_len);
+ } else {
+ if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) {
+ buf = alloc_conn_buf(con, ctrl_len);
+ if (!buf)
+ return -ENOMEM;
+
+ add_in_kvec(con, buf, ctrl_len);
+ } else {
+ add_in_kvec(con, CTRL_BODY(con->v2.in_buf), ctrl_len);
+ }
+ /* the crc is read into the start of in_buf, over the preamble */
+ add_in_kvec(con, con->v2.in_buf, CEPH_CRC_LEN);
+ }
+ con->v2.in_state = IN_S_HANDLE_CONTROL;
+ return 0;
+}
+
+/*
+ * Arrange to read the part of a control body that did not fit in the
+ * inline area.  A buffer for the whole body is allocated, the inline
+ * portion is copied to its start from con->v2.in_buf, and the remainder
+ * is read in right behind it.  Padding and the auth tag that follow the
+ * remainder are read into in_buf.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure.
+ */
+static int prepare_read_control_remainder(struct ceph_connection *con)
+{
+	int total_len = con->v2.in_desc.fd_lens[0];
+	int tail_len = total_len - CEPH_PREAMBLE_INLINE_LEN;
+	void *ctrl;
+
+	ctrl = alloc_conn_buf(con, total_len);
+	if (!ctrl)
+		return -ENOMEM;
+
+	memcpy(ctrl, CTRL_BODY(con->v2.in_buf), CEPH_PREAMBLE_INLINE_LEN);
+
+	reset_in_kvecs(con);
+	add_in_kvec(con, ctrl + CEPH_PREAMBLE_INLINE_LEN, tail_len);
+	add_in_kvec(con, con->v2.in_buf,
+		    padding_len(tail_len) + CEPH_GCM_TAG_LEN);
+	con->v2.in_state = IN_S_HANDLE_CONTROL_REMAINDER;
+	return 0;
+}
+
+/*
+ * Start reading the data segment of the incoming message: reset the
+ * running data crc, initialize the data cursor and set up the first
+ * bvec to receive into.  With the RXBOUNCE option the data is received
+ * into con->bounce_page (allocated on first use) instead of the
+ * message's own page.
+ *
+ * Returns 0 on success, -ENOMEM if the bounce page cannot be allocated.
+ */
+static int prepare_read_data(struct ceph_connection *con)
+{
+	struct bio_vec bv;
+	bool use_bounce;
+
+	con->in_data_crc = -1;
+	ceph_msg_data_cursor_init(&con->v2.in_cursor, con->in_msg,
+				  data_len(con->in_msg));
+
+	get_bvec_at(&con->v2.in_cursor, &bv);
+	use_bounce = ceph_test_opt(from_msgr(con->msgr), RXBOUNCE);
+	if (use_bounce && unlikely(!con->bounce_page)) {
+		con->bounce_page = alloc_page(GFP_NOIO);
+		if (!con->bounce_page) {
+			pr_err("failed to allocate bounce page\n");
+			return -ENOMEM;
+		}
+	}
+	if (use_bounce) {
+		/* keep the length from the cursor, redirect the target */
+		bv.bv_page = con->bounce_page;
+		bv.bv_offset = 0;
+	}
+	set_in_bvec(con, &bv);
+	con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT;
+	return 0;
+}
+
+/*
+ * Called after a piece of message data described by con->v2.in_bvec has
+ * been received: fold it into the running data crc, advance the data
+ * cursor and either set up the next piece or, once all data has been
+ * read, arrange to read the plain-mode epilogue.
+ */
+static void prepare_read_data_cont(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+ /* crc is computed on the bounce page contents before copying out */
+ con->in_data_crc = crc32c(con->in_data_crc,
+ page_address(con->bounce_page),
+ con->v2.in_bvec.bv_len);
+
+ /* cursor has not been advanced yet, so this is the real target */
+ get_bvec_at(&con->v2.in_cursor, &bv);
+ memcpy_to_page(bv.bv_page, bv.bv_offset,
+ page_address(con->bounce_page),
+ con->v2.in_bvec.bv_len);
+ } else {
+ con->in_data_crc = ceph_crc32c_page(con->in_data_crc,
+ con->v2.in_bvec.bv_page,
+ con->v2.in_bvec.bv_offset,
+ con->v2.in_bvec.bv_len);
+ }
+
+ ceph_msg_data_advance(&con->v2.in_cursor, con->v2.in_bvec.bv_len);
+ if (con->v2.in_cursor.total_resid) {
+ /* more data to go: receive the next piece, state stays the same */
+ get_bvec_at(&con->v2.in_cursor, &bv);
+ if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+ bv.bv_page = con->bounce_page;
+ bv.bv_offset = 0;
+ }
+ set_in_bvec(con, &bv);
+ WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT);
+ return;
+ }
+
+ /*
+ * We've read all data. Prepare to read epilogue.
+ */
+ reset_in_kvecs(con);
+ add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
+ con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+}
+
+/*
+ * Drive one step of a sparse read.
+ *
+ * First account for whatever was just received (a bvec of extent data
+ * or a kvec handed out by ->sparse_read()): update the running data crc
+ * and, for bvecs, advance the cursor and keep receiving while the
+ * current extent (cursor->sr_resid) is not exhausted.  Then ask
+ * ->sparse_read() what to receive next:
+ *
+ * ret < 0 - error, returned as is
+ * ret == 0 - no more extents, arrange to read the plain epilogue
+ * ret > 0, buf set - receive ret bytes into buf
+ * ret > 0, buf NULL - receive into the message data at the cursor
+ *
+ * Returns 0 or a negative error, except for the last case above where
+ * the positive ret from ->sparse_read() is returned.
+ * NOTE(review): callers presumably only test for ret < 0 -- confirm.
+ */
+static int prepare_sparse_read_cont(struct ceph_connection *con)
+{
+ int ret;
+ struct bio_vec bv;
+ char *buf = NULL;
+ struct ceph_msg_data_cursor *cursor = &con->v2.in_cursor;
+
+ WARN_ON(con->v2.in_state != IN_S_PREPARE_SPARSE_DATA_CONT);
+
+ if (iov_iter_is_bvec(&con->v2.in_iter)) {
+ if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+ /* crc the bounce page, then copy it out to the real target */
+ con->in_data_crc = crc32c(con->in_data_crc,
+ page_address(con->bounce_page),
+ con->v2.in_bvec.bv_len);
+ get_bvec_at(cursor, &bv);
+ memcpy_to_page(bv.bv_page, bv.bv_offset,
+ page_address(con->bounce_page),
+ con->v2.in_bvec.bv_len);
+ } else {
+ con->in_data_crc = ceph_crc32c_page(con->in_data_crc,
+ con->v2.in_bvec.bv_page,
+ con->v2.in_bvec.bv_offset,
+ con->v2.in_bvec.bv_len);
+ }
+
+ ceph_msg_data_advance(cursor, con->v2.in_bvec.bv_len);
+ cursor->sr_resid -= con->v2.in_bvec.bv_len;
+ dout("%s: advance by 0x%x sr_resid 0x%x\n", __func__,
+ con->v2.in_bvec.bv_len, cursor->sr_resid);
+ WARN_ON_ONCE(cursor->sr_resid > cursor->total_resid);
+ if (cursor->sr_resid) {
+ /* current extent not finished: receive its next piece */
+ get_bvec_at(cursor, &bv);
+ if (bv.bv_len > cursor->sr_resid)
+ bv.bv_len = cursor->sr_resid;
+ if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+ bv.bv_page = con->bounce_page;
+ bv.bv_offset = 0;
+ }
+ set_in_bvec(con, &bv);
+ con->v2.data_len_remain -= bv.bv_len;
+ return 0;
+ }
+ } else if (iov_iter_is_kvec(&con->v2.in_iter)) {
+ /* On first call, we have no kvec so don't compute crc */
+ if (con->v2.in_kvec_cnt) {
+ WARN_ON_ONCE(con->v2.in_kvec_cnt > 1);
+ con->in_data_crc = crc32c(con->in_data_crc,
+ con->v2.in_kvecs[0].iov_base,
+ con->v2.in_kvecs[0].iov_len);
+ }
+ } else {
+ return -EIO;
+ }
+
+ /* get next extent */
+ ret = con->ops->sparse_read(con, cursor, &buf);
+ if (ret <= 0) {
+ if (ret < 0)
+ return ret;
+
+ /* ret == 0: sparse read is complete, the epilogue comes next */
+ reset_in_kvecs(con);
+ add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
+ con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+ return 0;
+ }
+
+ if (buf) {
+ /* receive into buffer */
+ reset_in_kvecs(con);
+ add_in_kvec(con, buf, ret);
+ con->v2.data_len_remain -= ret;
+ return 0;
+ }
+
+ /* no buffer supplied: ret bytes go into the message data itself */
+ if (ret > cursor->total_resid) {
+ pr_warn("%s: ret 0x%x total_resid 0x%zx resid 0x%zx\n",
+ __func__, ret, cursor->total_resid, cursor->resid);
+ return -EIO;
+ }
+ get_bvec_at(cursor, &bv);
+ if (bv.bv_len > cursor->sr_resid)
+ bv.bv_len = cursor->sr_resid;
+ if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
+ if (unlikely(!con->bounce_page)) {
+ con->bounce_page = alloc_page(GFP_NOIO);
+ if (!con->bounce_page) {
+ pr_err("failed to allocate bounce page\n");
+ return -ENOMEM;
+ }
+ }
+
+ bv.bv_page = con->bounce_page;
+ bv.bv_offset = 0;
+ }
+ set_in_bvec(con, &bv);
+ con->v2.data_len_remain -= ret;
+ return ret;
+}
+
+/*
+ * Start a sparse read of the incoming message's data segment: set up
+ * the data cursor over sparse_read_total bytes, record how much on-wire
+ * data is expected and take the first step via
+ * prepare_sparse_read_cont().
+ *
+ * Returns -EOPNOTSUPP if the connection has no ->sparse_read() op,
+ * otherwise the result of prepare_sparse_read_cont().
+ */
+static int prepare_sparse_read_data(struct ceph_connection *con)
+{
+	struct ceph_msg *in_msg = con->in_msg;
+
+	dout("%s: starting sparse read\n", __func__);
+
+	if (WARN_ON_ONCE(!con->ops->sparse_read))
+		return -EOPNOTSUPP;
+
+	/* the running data crc is only (re)seeded in plain mode */
+	if (!con_secure(con))
+		con->in_data_crc = -1;
+
+	ceph_msg_data_cursor_init(&con->v2.in_cursor, in_msg,
+				  in_msg->sparse_read_total);
+
+	reset_in_kvecs(con);
+	con->v2.data_len_remain = data_len(in_msg);
+	con->v2.in_state = IN_S_PREPARE_SPARSE_DATA_CONT;
+	return prepare_sparse_read_cont(con);
+}
+
+/*
+ * Arrange to read the tail (front, middle, data, epilogue) of a
+ * plain-mode message.  Front and middle are read straight into the
+ * message's buffers; data, if any, is handled by a later state, and
+ * without data the epilogue is queued right behind front/middle.
+ *
+ * Returns 0 on success or the error from prepare_read_data().
+ */
+static int prepare_read_tail_plain(struct ceph_connection *con)
+{
+	struct ceph_msg *in_msg = con->in_msg;
+
+	if (!front_len(in_msg) && !middle_len(in_msg)) {
+		/* nothing but data: a tail with no segments is unexpected */
+		WARN_ON(!data_len(in_msg));
+		return prepare_read_data(con);
+	}
+
+	reset_in_kvecs(con);
+	if (front_len(in_msg)) {
+		add_in_kvec(con, in_msg->front.iov_base, front_len(in_msg));
+		WARN_ON(in_msg->front.iov_len != front_len(in_msg));
+	}
+	if (middle_len(in_msg)) {
+		add_in_kvec(con, in_msg->middle->vec.iov_base,
+			    middle_len(in_msg));
+		WARN_ON(in_msg->middle->vec.iov_len != middle_len(in_msg));
+	}
+
+	if (!data_len(in_msg)) {
+		add_in_kvec(con, con->v2.in_buf, CEPH_EPILOGUE_PLAIN_LEN);
+		con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+	} else if (in_msg->sparse_read_total) {
+		con->v2.in_state = IN_S_PREPARE_SPARSE_DATA;
+	} else {
+		con->v2.in_state = IN_S_PREPARE_READ_DATA;
+	}
+	return 0;
+}
+
+/*
+ * Arrange to receive the next page (or final partial page) of an
+ * encrypted message tail into con->v2.in_enc_pages, tracking progress
+ * in in_enc_i / in_enc_resid.  Once the last piece has been set up the
+ * input state moves on to epilogue handling.
+ */
+static void prepare_read_enc_page(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ dout("%s con %p i %d resid %d\n", __func__, con, con->v2.in_enc_i,
+ con->v2.in_enc_resid);
+ WARN_ON(!con->v2.in_enc_resid);
+
+ /* at most one page per step, starting at offset 0 of the page */
+ bvec_set_page(&bv, con->v2.in_enc_pages[con->v2.in_enc_i],
+ min(con->v2.in_enc_resid, (int)PAGE_SIZE), 0);
+
+ set_in_bvec(con, &bv);
+ con->v2.in_enc_i++;
+ con->v2.in_enc_resid -= bv.bv_len;
+
+ if (con->v2.in_enc_resid) {
+ con->v2.in_state = IN_S_PREPARE_READ_ENC_PAGE;
+ return;
+ }
+
+ /*
+ * We are set to read the last piece of ciphertext (ending
+ * with epilogue) + auth tag.
+ */
+ WARN_ON(con->v2.in_enc_i != con->v2.in_enc_page_cnt);
+ con->v2.in_state = IN_S_HANDLE_EPILOGUE;
+}
+
+/*
+ * Arrange to read the encrypted tail of the incoming message: allocate
+ * a page vector big enough for the whole on-wire tail, attach it to the
+ * connection and set up the read of the first page.
+ *
+ * Returns 0 on success or the error from ceph_alloc_page_vector().
+ */
+static int prepare_read_tail_secure(struct ceph_connection *con)
+{
+	int len = tail_onwire_len(con->in_msg, true);
+	struct page **pages;
+	int nr_pages;
+
+	WARN_ON(!len);
+
+	nr_pages = calc_pages_for(0, len);
+	pages = ceph_alloc_page_vector(nr_pages, GFP_NOIO);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	/* pages from a previous message are expected to be gone by now */
+	WARN_ON(con->v2.in_enc_pages || con->v2.in_enc_page_cnt);
+	con->v2.in_enc_pages = pages;
+	con->v2.in_enc_page_cnt = nr_pages;
+	con->v2.in_enc_resid = len;
+	con->v2.in_enc_i = 0;
+
+	prepare_read_enc_page(con);
+	return 0;
+}
+
+/*
+ * Finish skipping a message: count it as received by bumping in_seq
+ * and go back to reading the next frame preamble.
+ */
+static void __finish_skip(struct ceph_connection *con)
+{
+	con->in_seq += 1;
+	prepare_read_preamble(con);
+}
+
+/*
+ * Arrange to discard the tail of the message described by
+ * con->v2.in_desc.  If there is no tail on the wire the skip is
+ * finished immediately; otherwise the tail bytes are skipped first and
+ * the skip is finished from IN_S_FINISH_SKIP.
+ */
+static void prepare_skip_message(struct ceph_connection *con)
+{
+	struct ceph_frame_desc *fd = &con->v2.in_desc;
+	int skip_len;
+
+	dout("%s con %p %d+%d+%d\n", __func__, con, fd->fd_lens[1],
+	     fd->fd_lens[2], fd->fd_lens[3]);
+
+	skip_len = __tail_onwire_len(fd->fd_lens[1], fd->fd_lens[2],
+				     fd->fd_lens[3], con_secure(con));
+	if (!skip_len) {
+		__finish_skip(con);
+		return;
+	}
+
+	set_in_skip(con, skip_len);
+	con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+/*
+ * Validate the banner prefix received from the peer and arrange to read
+ * the banner payload whose 16-bit length follows the banner string.
+ *
+ * Returns -EINVAL (with con->error_msg set) if the banner is not the
+ * msgr2 banner, otherwise the result of prepare_read_banner_payload().
+ */
+static int process_banner_prefix(struct ceph_connection *con)
+{
+	void *pos = con->v2.in_kvecs[0].iov_base;
+	int payload_len;
+
+	WARN_ON(con->v2.in_kvecs[0].iov_len != CEPH_BANNER_V2_PREFIX_LEN);
+
+	if (memcmp(pos, CEPH_BANNER_V2, CEPH_BANNER_V2_LEN) != 0) {
+		/* tell a msgr1-only server apart from plain garbage */
+		if (memcmp(pos, CEPH_BANNER, CEPH_BANNER_LEN) == 0)
+			con->error_msg = "server is speaking msgr1 protocol";
+		else
+			con->error_msg = "protocol error, bad banner";
+		return -EINVAL;
+	}
+
+	pos += CEPH_BANNER_V2_LEN;
+	payload_len = ceph_decode_16(&pos);
+	dout("%s con %p payload_len %d\n", __func__, con, payload_len);
+
+	return prepare_read_banner_payload(con, payload_len);
+}
+
+/*
+ * Decode the peer's supported and required msgr2 feature masks from the
+ * banner payload and check them against ours.  On success a HELLO frame
+ * is queued, the connection moves to CEPH_CON_S_V2_HELLO and the next
+ * preamble read is set up.
+ *
+ * Returns 0 on success, -EINVAL on a short payload or feature mismatch,
+ * or the error from prepare_hello().
+ */
+static int process_banner_payload(struct ceph_connection *con)
+{
+ void *end = con->v2.in_kvecs[0].iov_base + con->v2.in_kvecs[0].iov_len;
+ u64 feat = CEPH_MSGR2_SUPPORTED_FEATURES;
+ u64 req_feat = CEPH_MSGR2_REQUIRED_FEATURES;
+ u64 server_feat, server_req_feat;
+ void *p;
+ int ret;
+
+ /* the _safe decoders jump to "bad" if the payload is too short */
+ p = con->v2.in_kvecs[0].iov_base;
+ ceph_decode_64_safe(&p, end, server_feat, bad);
+ ceph_decode_64_safe(&p, end, server_req_feat, bad);
+
+ dout("%s con %p server_feat 0x%llx server_req_feat 0x%llx\n",
+ __func__, con, server_feat, server_req_feat);
+
+ /* everything we require must be supported by the server ... */
+ if (req_feat & ~server_feat) {
+ pr_err("msgr2 feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n",
+ server_feat, req_feat & ~server_feat);
+ con->error_msg = "missing required protocol features";
+ return -EINVAL;
+ }
+ /* ... and everything the server requires must be supported by us */
+ if (server_req_feat & ~feat) {
+ pr_err("msgr2 feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n",
+ feat, server_req_feat & ~feat);
+ con->error_msg = "missing required protocol features";
+ return -EINVAL;
+ }
+
+ /* no reset_out_kvecs() as our banner may still be pending */
+ ret = prepare_hello(con);
+ if (ret) {
+ pr_err("prepare_hello failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_HELLO;
+ prepare_read_preamble(con);
+ return 0;
+
+bad:
+ pr_err("failed to decode banner payload\n");
+ return -EINVAL;
+}
+
+/*
+ * Handle a HELLO frame whose payload is [p, end).
+ *
+ * The payload carries the peer's entity type and our address as seen by
+ * the peer.  The entity type must match what we expect for this
+ * connection.  If our own address is still blank it is learned from the
+ * address reported by the peer.  On success an AUTH_REQUEST is queued
+ * and the connection moves to CEPH_CON_S_V2_AUTH.
+ *
+ * Returns 0 on success, -EINVAL on protocol or decoding errors, or the
+ * error from ceph_decode_entity_addr() / prepare_auth_request()
+ * (including -EAGAIN if the connection state changed underneath us).
+ */
+static int process_hello(struct ceph_connection *con, void *p, void *end)
+{
+ struct ceph_entity_addr *my_addr = &con->msgr->inst.addr;
+ struct ceph_entity_addr addr_for_me;
+ u8 entity_type;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_HELLO) {
+ con->error_msg = "protocol error, unexpected hello";
+ return -EINVAL;
+ }
+
+ ceph_decode_8_safe(&p, end, entity_type, bad);
+ ret = ceph_decode_entity_addr(&p, end, &addr_for_me);
+ if (ret) {
+ pr_err("failed to decode addr_for_me: %d\n", ret);
+ return ret;
+ }
+
+ dout("%s con %p entity_type %d addr_for_me %s\n", __func__, con,
+ entity_type, ceph_pr_addr(&addr_for_me));
+
+ if (entity_type != con->peer_name.type) {
+ pr_err("bad peer type, want %d, got %d\n",
+ con->peer_name.type, entity_type);
+ con->error_msg = "wrong peer at address";
+ return -EINVAL;
+ }
+
+ /*
+ * Set our address to the address our first peer (i.e. monitor)
+ * sees that we are connecting from. If we are behind some sort
+ * of NAT and want to be identified by some private (not NATed)
+ * address, ip option should be used.
+ */
+ if (ceph_addr_is_blank(my_addr)) {
+ /* only the in_addr part is taken over, and the port is cleared */
+ memcpy(&my_addr->in_addr, &addr_for_me.in_addr,
+ sizeof(my_addr->in_addr));
+ ceph_addr_set_port(my_addr, 0);
+ dout("%s con %p set my addr %s, as seen by peer %s\n",
+ __func__, con, ceph_pr_addr(my_addr),
+ ceph_pr_addr(&con->peer_addr));
+ } else {
+ dout("%s con %p my addr already set %s\n",
+ __func__, con, ceph_pr_addr(my_addr));
+ }
+
+ /* sanity checks on our address: set, port 0, type ANY, nonce set */
+ WARN_ON(ceph_addr_is_blank(my_addr) || ceph_addr_port(my_addr));
+ WARN_ON(my_addr->type != CEPH_ENTITY_ADDR_TYPE_ANY);
+ WARN_ON(!my_addr->nonce);
+
+ /* no reset_out_kvecs() as our hello may still be pending */
+ ret = prepare_auth_request(con);
+ if (ret) {
+ /* -EAGAIN is not logged as a failure */
+ if (ret != -EAGAIN)
+ pr_err("prepare_auth_request failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_AUTH;
+ return 0;
+
+bad:
+ pr_err("failed to decode hello\n");
+ return -EINVAL;
+}
+
+/*
+ * Handle an AUTH_BAD_METHOD frame whose payload is [p, end).
+ *
+ * The payload carries the auth protocol we used, a result code, and two
+ * counted lists: the allowed auth protocols and the allowed connection
+ * modes.  Each list is limited to the size of the local arrays (8
+ * entries).  The decoded values are passed to the
+ * ->handle_auth_bad_method() callout, which is made with con->mutex
+ * dropped.
+ *
+ * Returns the callout's result, -EINVAL on protocol or decoding errors,
+ * or -EAGAIN if the connection left CEPH_CON_S_V2_AUTH during the
+ * callout.
+ */
+static int process_auth_bad_method(struct ceph_connection *con,
+ void *p, void *end)
+{
+ int allowed_protos[8], allowed_modes[8];
+ int allowed_proto_cnt, allowed_mode_cnt;
+ int used_proto, result;
+ int ret;
+ int i;
+
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ con->error_msg = "protocol error, unexpected auth_bad_method";
+ return -EINVAL;
+ }
+
+ ceph_decode_32_safe(&p, end, used_proto, bad);
+ ceph_decode_32_safe(&p, end, result, bad);
+ dout("%s con %p used_proto %d result %d\n", __func__, con, used_proto,
+ result);
+
+ ceph_decode_32_safe(&p, end, allowed_proto_cnt, bad);
+ /*
+ * ARRAY_SIZE() is unsigned, so a negative count is compared as a
+ * large unsigned value and rejected here as well.
+ */
+ if (allowed_proto_cnt > ARRAY_SIZE(allowed_protos)) {
+ pr_err("allowed_protos too big %d\n", allowed_proto_cnt);
+ return -EINVAL;
+ }
+ for (i = 0; i < allowed_proto_cnt; i++) {
+ ceph_decode_32_safe(&p, end, allowed_protos[i], bad);
+ dout("%s con %p allowed_protos[%d] %d\n", __func__, con,
+ i, allowed_protos[i]);
+ }
+
+ ceph_decode_32_safe(&p, end, allowed_mode_cnt, bad);
+ if (allowed_mode_cnt > ARRAY_SIZE(allowed_modes)) {
+ pr_err("allowed_modes too big %d\n", allowed_mode_cnt);
+ return -EINVAL;
+ }
+ for (i = 0; i < allowed_mode_cnt; i++) {
+ ceph_decode_32_safe(&p, end, allowed_modes[i], bad);
+ dout("%s con %p allowed_modes[%d] %d\n", __func__, con,
+ i, allowed_modes[i]);
+ }
+
+ mutex_unlock(&con->mutex);
+ ret = con->ops->handle_auth_bad_method(con, used_proto, result,
+ allowed_protos,
+ allowed_proto_cnt,
+ allowed_modes,
+ allowed_mode_cnt);
+ mutex_lock(&con->mutex);
+ /* the state check takes precedence over the callout's return value */
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ dout("%s con %p handle_auth_bad_method ret %d\n", __func__, con, ret);
+ return ret;
+
+bad:
+ pr_err("failed to decode auth_bad_method\n");
+ return -EINVAL;
+}
+
+/*
+ * Handle an AUTH_REPLY_MORE frame whose payload is [p, end).
+ *
+ * The payload is a 32-bit length followed by that many bytes of auth
+ * data, which are checked to lie within [p, end) before being passed on
+ * to prepare_auth_request_more() to build our response.
+ *
+ * Returns 0 on success, -EINVAL on protocol or decoding errors, or the
+ * error from prepare_auth_request_more() (including -EAGAIN if the
+ * connection state changed underneath us).
+ */
+static int process_auth_reply_more(struct ceph_connection *con,
+ void *p, void *end)
+{
+ int payload_len;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_AUTH) {
+ con->error_msg = "protocol error, unexpected auth_reply_more";
+ return -EINVAL;
+ }
+
+ ceph_decode_32_safe(&p, end, payload_len, bad);
+ /* bounds-check the peer-supplied length before using the payload */
+ ceph_decode_need(&p, end, payload_len, bad);
+
+ dout("%s con %p payload_len %d\n", __func__, con, payload_len);
+
+ reset_out_kvecs(con);
+ ret = prepare_auth_request_more(con, p, payload_len);
+ if (ret) {
+ /* -EAGAIN is not logged as a failure */
+ if (ret != -EAGAIN)
+ pr_err("prepare_auth_request_more failed: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+
+bad:
+ pr_err("failed to decode auth_reply_more\n");
+ return -EINVAL;
+}
+
+/*
+ * Handle an AUTH_DONE frame whose payload is [p, end).
+ *
+ * The payload carries our global_id, the negotiated connection mode and
+ * a length-prefixed auth payload.  The auth payload is handed to the
+ * ->handle_auth_done() callout (made with con->mutex dropped), which
+ * fills in the session key and connection secret.  These are then used
+ * to set up crypto, an AUTH_SIGNATURE frame is queued and the
+ * connection moves to CEPH_CON_S_V2_AUTH_SIGNATURE.
+ *
+ * Returns 0 on success, -EINVAL on protocol or decoding errors, -EAGAIN
+ * if the connection left CEPH_CON_S_V2_AUTH during the callout, or the
+ * error from the callout, setup_crypto() or prepare_auth_signature().
+ *
+ * Align session_key and con_secret to avoid GFP_ATOMIC allocation
+ * inside crypto_shash_setkey() and crypto_aead_setkey() called from
+ * setup_crypto().  __aligned(16) isn't guaranteed to work for stack
+ * objects, so do it by hand.
+ */
+static int process_auth_done(struct ceph_connection *con, void *p, void *end)
+{
+	u8 session_key_buf[CEPH_KEY_LEN + 16];
+	u8 con_secret_buf[CEPH_MAX_CON_SECRET_LEN + 16];
+	u8 *session_key = PTR_ALIGN(&session_key_buf[0], 16);
+	u8 *con_secret = PTR_ALIGN(&con_secret_buf[0], 16);
+	int session_key_len, con_secret_len;
+	int payload_len;
+	u64 global_id;
+	int ret;
+
+	if (con->state != CEPH_CON_S_V2_AUTH) {
+		con->error_msg = "protocol error, unexpected auth_done";
+		return -EINVAL;
+	}
+
+	ceph_decode_64_safe(&p, end, global_id, bad);
+	ceph_decode_32_safe(&p, end, con->v2.con_mode, bad);
+	ceph_decode_32_safe(&p, end, payload_len, bad);
+	/*
+	 * payload_len comes off the wire: make sure the payload really
+	 * lies within [p, end) before handing it to the callout, the
+	 * same way process_auth_reply_more() does.
+	 */
+	ceph_decode_need(&p, end, payload_len, bad);
+
+	dout("%s con %p global_id %llu con_mode %d payload_len %d\n",
+	     __func__, con, global_id, con->v2.con_mode, payload_len);
+
+	mutex_unlock(&con->mutex);
+	session_key_len = 0;
+	con_secret_len = 0;
+	ret = con->ops->handle_auth_done(con, global_id, p, payload_len,
+					 session_key, &session_key_len,
+					 con_secret, &con_secret_len);
+	mutex_lock(&con->mutex);
+	/* the state check takes precedence over the callout's return value */
+	if (con->state != CEPH_CON_S_V2_AUTH) {
+		dout("%s con %p state changed to %d\n", __func__, con,
+		     con->state);
+		ret = -EAGAIN;
+		goto out;
+	}
+
+	dout("%s con %p handle_auth_done ret %d\n", __func__, con, ret);
+	if (ret)
+		goto out;
+
+	ret = setup_crypto(con, session_key, session_key_len, con_secret,
+			   con_secret_len);
+	if (ret)
+		goto out;
+
+	reset_out_kvecs(con);
+	ret = prepare_auth_signature(con);
+	if (ret) {
+		pr_err("prepare_auth_signature failed: %d\n", ret);
+		goto out;
+	}
+
+	con->state = CEPH_CON_S_V2_AUTH_SIGNATURE;
+
+out:
+	/* wipe key material from the stack on every path past the callout */
+	memzero_explicit(session_key_buf, sizeof(session_key_buf));
+	memzero_explicit(con_secret_buf, sizeof(con_secret_buf));
+	return ret;
+
+bad:
+	/* reached only before the callout, so no key material to wipe */
+	pr_err("failed to decode auth_done\n");
+	return -EINVAL;
+}
+
+/*
+ * Handle an AUTH_SIGNATURE frame.
+ *
+ * Runs ceph_hmac_sha256() over con->v2.out_sign_kvecs and compares the
+ * result with the SHA256_DIGEST_SIZE bytes received from the peer.  On a
+ * match, prepares client_ident (no server_cookie) or session_reconnect
+ * (server_cookie set) and updates con->state accordingly.
+ *
+ * Only accepted in CEPH_CON_S_V2_AUTH_SIGNATURE.  Returns 0 on success,
+ * -EINVAL on an unexpected state or a short frame, -EBADMSG on a
+ * signature mismatch, or the error from the prepare_*() helper.
+ */
+static int process_auth_signature(struct ceph_connection *con,
+ void *p, void *end)
+{
+ u8 hmac[SHA256_DIGEST_SIZE];
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_AUTH_SIGNATURE) {
+ con->error_msg = "protocol error, unexpected auth_signature";
+ return -EINVAL;
+ }
+
+ ceph_hmac_sha256(con, con->v2.out_sign_kvecs, con->v2.out_sign_kvec_cnt,
+ hmac);
+
+ ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad);
+ /* crypto_memneq() rather than memcmp() for the signature comparison */
+ if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) {
+ con->error_msg = "integrity error, bad auth signature";
+ return -EBADMSG;
+ }
+
+ dout("%s con %p auth signature ok\n", __func__, con);
+
+ /* no reset_out_kvecs() as our auth_signature may still be pending */
+ if (!con->v2.server_cookie) {
+ ret = prepare_client_ident(con);
+ if (ret) {
+ pr_err("prepare_client_ident failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_SESSION_CONNECT;
+ } else {
+ ret = prepare_session_reconnect(con);
+ if (ret) {
+ pr_err("prepare_session_reconnect failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_SESSION_RECONNECT;
+ }
+
+ return 0;
+
+bad:
+ pr_err("failed to decode auth_signature\n");
+ return -EINVAL;
+}
+
+/*
+ * Handle a SERVER_IDENT frame.
+ *
+ * Decodes the peer's address vector, global_id, global_seq, features,
+ * required_features, flags and cookie.  Fails if the decoded address
+ * differs from con->peer_addr or if the peer lacks features the client
+ * requires.  Otherwise records the peer's parameters in @con, drops the
+ * signing kvecs and connection buffers, and switches the connection to
+ * CEPH_CON_S_OPEN with out_state OUT_S_GET_NEXT.
+ *
+ * Only accepted in CEPH_CON_S_V2_SESSION_CONNECT.  Returns 0 on success,
+ * -EINVAL on an unexpected state, decode failure, address mismatch or
+ * missing features, or the error from ceph_decode_entity_addrvec().
+ */
+static int process_server_ident(struct ceph_connection *con,
+ void *p, void *end)
+{
+ struct ceph_client *client = from_msgr(con->msgr);
+ u64 features, required_features;
+ struct ceph_entity_addr addr;
+ u64 global_seq;
+ u64 global_id;
+ u64 cookie;
+ u64 flags;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_CONNECT) {
+ con->error_msg = "protocol error, unexpected server_ident";
+ return -EINVAL;
+ }
+
+ ret = ceph_decode_entity_addrvec(&p, end, true, &addr);
+ if (ret) {
+ pr_err("failed to decode server addrs: %d\n", ret);
+ return ret;
+ }
+
+ ceph_decode_64_safe(&p, end, global_id, bad);
+ ceph_decode_64_safe(&p, end, global_seq, bad);
+ ceph_decode_64_safe(&p, end, features, bad);
+ ceph_decode_64_safe(&p, end, required_features, bad);
+ ceph_decode_64_safe(&p, end, flags, bad);
+ ceph_decode_64_safe(&p, end, cookie, bad);
+
+ dout("%s con %p addr %s/%u global_id %llu global_seq %llu features 0x%llx required_features 0x%llx flags 0x%llx cookie 0x%llx\n",
+ __func__, con, ceph_pr_addr(&addr), le32_to_cpu(addr.nonce),
+ global_id, global_seq, features, required_features, flags, cookie);
+
+ /* is this who we intended to talk to? */
+ if (memcmp(&addr, &con->peer_addr, sizeof(con->peer_addr))) {
+ pr_err("bad peer addr/nonce, want %s/%u, got %s/%u\n",
+ ceph_pr_addr(&con->peer_addr),
+ le32_to_cpu(con->peer_addr.nonce),
+ ceph_pr_addr(&addr), le32_to_cpu(addr.nonce));
+ con->error_msg = "wrong peer at address";
+ return -EINVAL;
+ }
+
+ /* any bit we require that the server did not advertise is fatal */
+ if (client->required_features & ~features) {
+ pr_err("RADOS feature set mismatch: my required > server's supported 0x%llx, need 0x%llx\n",
+ features, client->required_features & ~features);
+ con->error_msg = "missing required protocol features";
+ return -EINVAL;
+ }
+
+ /*
+ * Both name->type and name->num are set in ceph_con_open() but
+ * name->num may be bogus in the initial monmap. name->type is
+ * verified in handle_hello().
+ */
+ WARN_ON(!con->peer_name.type);
+ con->peer_name.num = cpu_to_le64(global_id);
+ con->v2.peer_global_seq = global_seq;
+ con->peer_features = features;
+ WARN_ON(required_features & ~client->supported_features);
+ con->v2.server_cookie = cookie;
+
+ /* a lossy session is expected to come without a cookie, and vice versa */
+ if (flags & CEPH_MSG_CONNECT_LOSSY) {
+ ceph_con_flag_set(con, CEPH_CON_F_LOSSYTX);
+ WARN_ON(con->v2.server_cookie);
+ } else {
+ WARN_ON(!con->v2.server_cookie);
+ }
+
+ clear_in_sign_kvecs(con);
+ clear_out_sign_kvecs(con);
+ free_conn_bufs(con);
+ con->delay = 0; /* reset backoff memory */
+
+ con->state = CEPH_CON_S_OPEN;
+ con->v2.out_state = OUT_S_GET_NEXT;
+ return 0;
+
+bad:
+ pr_err("failed to decode server_ident\n");
+ return -EINVAL;
+}
+
+/*
+ * Handle an IDENT_MISSING_FEATURES frame.
+ *
+ * The frame carries a 64-bit mask of features the server requires and we
+ * did not offer.  It is logged together with our supported feature set.
+ * Every path through this function fails with -EINVAL; only the log
+ * output and con->error_msg differ.
+ */
+static int process_ident_missing_features(struct ceph_connection *con,
+					  void *p, void *end)
+{
+	u64 missing;
+
+	if (con->state == CEPH_CON_S_V2_SESSION_CONNECT) {
+		ceph_decode_64_safe(&p, end, missing, bad);
+		pr_err("RADOS feature set mismatch: server's required > my supported 0x%llx, missing 0x%llx\n",
+		       from_msgr(con->msgr)->supported_features, missing);
+		con->error_msg = "missing required protocol features";
+	} else {
+		con->error_msg = "protocol error, unexpected ident_missing_features";
+	}
+
+	return -EINVAL;
+
+bad:
+	pr_err("failed to decode ident_missing_features\n");
+	return -EINVAL;
+}
+
+/*
+ * Handle a SESSION_RECONNECT_OK frame.
+ *
+ * Decodes a 64-bit seq, passes it to ceph_con_discard_requeued(), drops
+ * the signing kvecs and connection buffers, and switches the connection
+ * to CEPH_CON_S_OPEN with out_state OUT_S_GET_NEXT.
+ *
+ * Only accepted in CEPH_CON_S_V2_SESSION_RECONNECT.  Returns 0 on
+ * success, -EINVAL on an unexpected state or a decode failure.
+ */
+static int process_session_reconnect_ok(struct ceph_connection *con,
+ void *p, void *end)
+{
+ u64 seq;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ con->error_msg = "protocol error, unexpected session_reconnect_ok";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, seq, bad);
+
+ dout("%s con %p seq %llu\n", __func__, con, seq);
+ ceph_con_discard_requeued(con, seq);
+
+ clear_in_sign_kvecs(con);
+ clear_out_sign_kvecs(con);
+ free_conn_bufs(con);
+ con->delay = 0; /* reset backoff memory */
+
+ con->state = CEPH_CON_S_OPEN;
+ con->v2.out_state = OUT_S_GET_NEXT;
+ return 0;
+
+bad:
+ pr_err("failed to decode session_reconnect_ok\n");
+ return -EINVAL;
+}
+
+/*
+ * Handle a SESSION_RETRY frame.
+ *
+ * Decodes the peer's connect_seq, sets con->v2.connect_seq to one past
+ * it, frees the connection buffers and prepares a new session_reconnect.
+ * con->state is left unchanged.
+ *
+ * Only accepted in CEPH_CON_S_V2_SESSION_RECONNECT.  Returns 0 on
+ * success, -EINVAL on an unexpected state or a decode failure, or the
+ * error from prepare_session_reconnect().
+ */
+static int process_session_retry(struct ceph_connection *con,
+ void *p, void *end)
+{
+ u64 connect_seq;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ con->error_msg = "protocol error, unexpected session_retry";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, connect_seq, bad);
+
+ dout("%s con %p connect_seq %llu\n", __func__, con, connect_seq);
+ /* the peer's value is expected to be ahead of ours; warn but carry on */
+ WARN_ON(connect_seq <= con->v2.connect_seq);
+ con->v2.connect_seq = connect_seq + 1;
+
+ free_conn_bufs(con);
+
+ reset_out_kvecs(con);
+ ret = prepare_session_reconnect(con);
+ if (ret) {
+ pr_err("prepare_session_reconnect (cseq) failed: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+
+bad:
+ pr_err("failed to decode session_retry\n");
+ return -EINVAL;
+}
+
+/*
+ * Handle a SESSION_RETRY_GLOBAL frame.
+ *
+ * Decodes the peer's global_seq, obtains a new value for
+ * con->v2.global_seq from ceph_get_global_seq() (passing the peer's
+ * value), frees the connection buffers and prepares a new
+ * session_reconnect.  con->state is left unchanged.
+ *
+ * Only accepted in CEPH_CON_S_V2_SESSION_RECONNECT.  Returns 0 on
+ * success, -EINVAL on an unexpected state or a decode failure, or the
+ * error from prepare_session_reconnect().
+ */
+static int process_session_retry_global(struct ceph_connection *con,
+ void *p, void *end)
+{
+ u64 global_seq;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ con->error_msg = "protocol error, unexpected session_retry_global";
+ return -EINVAL;
+ }
+
+ ceph_decode_64_safe(&p, end, global_seq, bad);
+
+ dout("%s con %p global_seq %llu\n", __func__, con, global_seq);
+ /* the peer's value is expected to be ahead of ours; warn but carry on */
+ WARN_ON(global_seq <= con->v2.global_seq);
+ con->v2.global_seq = ceph_get_global_seq(con->msgr, global_seq);
+
+ free_conn_bufs(con);
+
+ reset_out_kvecs(con);
+ ret = prepare_session_reconnect(con);
+ if (ret) {
+ pr_err("prepare_session_reconnect (gseq) failed: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+
+bad:
+ pr_err("failed to decode session_retry_global\n");
+ return -EINVAL;
+}
+
+/*
+ * Handle a SESSION_RESET frame.
+ *
+ * Decodes a one-byte "full" flag (anything but a full reset is rejected),
+ * resets the session via ceph_con_reset_session(), invokes the optional
+ * con->ops->peer_reset() callback with con->mutex dropped, and then
+ * prepares a fresh client_ident, moving the connection to
+ * CEPH_CON_S_V2_SESSION_CONNECT.
+ *
+ * Only accepted in CEPH_CON_S_V2_SESSION_RECONNECT.  Returns 0 on
+ * success, -EINVAL on an unexpected state, a decode failure or a
+ * non-full reset, -EAGAIN if con->state changed while con->mutex was
+ * dropped, or the error from prepare_client_ident().
+ */
+static int process_session_reset(struct ceph_connection *con,
+ void *p, void *end)
+{
+ bool full;
+ int ret;
+
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ con->error_msg = "protocol error, unexpected session_reset";
+ return -EINVAL;
+ }
+
+ ceph_decode_8_safe(&p, end, full, bad);
+ if (!full) {
+ con->error_msg = "protocol error, bad session_reset";
+ return -EINVAL;
+ }
+
+ pr_info("%s%lld %s session reset\n", ENTITY_NAME(con->peer_name),
+ ceph_pr_addr(&con->peer_addr));
+ ceph_con_reset_session(con);
+
+ /* the callback runs without con->mutex, so recheck state afterwards */
+ mutex_unlock(&con->mutex);
+ if (con->ops->peer_reset)
+ con->ops->peer_reset(con);
+ mutex_lock(&con->mutex);
+ if (con->state != CEPH_CON_S_V2_SESSION_RECONNECT) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ free_conn_bufs(con);
+
+ reset_out_kvecs(con);
+ ret = prepare_client_ident(con);
+ if (ret) {
+ pr_err("prepare_client_ident (rst) failed: %d\n", ret);
+ return ret;
+ }
+
+ con->state = CEPH_CON_S_V2_SESSION_CONNECT;
+ return 0;
+
+bad:
+ pr_err("failed to decode session_reset\n");
+ return -EINVAL;
+}
+
+/*
+ * Handle a KEEPALIVE2_ACK frame: store the timestamp it carries in
+ * con->last_keepalive_ack.
+ *
+ * Only accepted in CEPH_CON_S_OPEN.  Returns 0 on success, -EINVAL on an
+ * unexpected state or if the frame is too short for a ceph_timespec.
+ */
+static int process_keepalive2_ack(struct ceph_connection *con,
+				  void *p, void *end)
+{
+	if (con->state == CEPH_CON_S_OPEN) {
+		ceph_decode_need(&p, end, sizeof(struct ceph_timespec), bad);
+		ceph_decode_timespec64(&con->last_keepalive_ack, p);
+
+		dout("%s con %p timestamp %ptSp\n", __func__, con, &con->last_keepalive_ack);
+		return 0;
+	}
+
+	con->error_msg = "protocol error, unexpected keepalive2_ack";
+	return -EINVAL;
+
+bad:
+	pr_err("failed to decode keepalive2_ack\n");
+	return -EINVAL;
+}
+
+/*
+ * Handle an ACK frame: decode the 64-bit sequence number and pass it to
+ * ceph_con_discard_sent().
+ *
+ * Only accepted in CEPH_CON_S_OPEN.  Returns 0 on success, -EINVAL on an
+ * unexpected state or a decode failure.
+ */
+static int process_ack(struct ceph_connection *con, void *p, void *end)
+{
+	u64 acked_seq;
+
+	if (con->state == CEPH_CON_S_OPEN) {
+		ceph_decode_64_safe(&p, end, acked_seq, bad);
+
+		dout("%s con %p seq %llu\n", __func__, con, acked_seq);
+		ceph_con_discard_sent(con, acked_seq);
+		return 0;
+	}
+
+	con->error_msg = "protocol error, unexpected ack";
+	return -EINVAL;
+
+bad:
+	pr_err("failed to decode ack\n");
+	return -EINVAL;
+}
+
+/*
+ * Dispatch a control frame to its handler based on the tag in
+ * con->v2.in_desc.  [p, end) is the frame's control segment.
+ *
+ * If the handler succeeds, the next preamble read is prepared and 0 is
+ * returned.  A handler error is returned as is; an unknown tag yields
+ * -EINVAL with con->error_msg set.
+ */
+static int process_control(struct ceph_connection *con, void *p, void *end)
+{
+	int frame_tag = con->v2.in_desc.fd_tag;
+	int err;
+
+	dout("%s con %p tag %d len %d\n", __func__, con, frame_tag, (int)(end - p));
+
+	switch (frame_tag) {
+	case FRAME_TAG_HELLO:
+		err = process_hello(con, p, end);
+		break;
+	case FRAME_TAG_AUTH_BAD_METHOD:
+		err = process_auth_bad_method(con, p, end);
+		break;
+	case FRAME_TAG_AUTH_REPLY_MORE:
+		err = process_auth_reply_more(con, p, end);
+		break;
+	case FRAME_TAG_AUTH_DONE:
+		err = process_auth_done(con, p, end);
+		break;
+	case FRAME_TAG_AUTH_SIGNATURE:
+		err = process_auth_signature(con, p, end);
+		break;
+	case FRAME_TAG_SERVER_IDENT:
+		err = process_server_ident(con, p, end);
+		break;
+	case FRAME_TAG_IDENT_MISSING_FEATURES:
+		err = process_ident_missing_features(con, p, end);
+		break;
+	case FRAME_TAG_SESSION_RECONNECT_OK:
+		err = process_session_reconnect_ok(con, p, end);
+		break;
+	case FRAME_TAG_SESSION_RETRY:
+		err = process_session_retry(con, p, end);
+		break;
+	case FRAME_TAG_SESSION_RETRY_GLOBAL:
+		err = process_session_retry_global(con, p, end);
+		break;
+	case FRAME_TAG_SESSION_RESET:
+		err = process_session_reset(con, p, end);
+		break;
+	case FRAME_TAG_KEEPALIVE2_ACK:
+		err = process_keepalive2_ack(con, p, end);
+		break;
+	case FRAME_TAG_ACK:
+		err = process_ack(con, p, end);
+		break;
+	default:
+		pr_err("bad tag %d\n", frame_tag);
+		con->error_msg = "protocol error, bad tag";
+		return -EINVAL;
+	}
+
+	if (!err) {
+		/* frame consumed, go back to waiting for a preamble */
+		prepare_read_preamble(con);
+		return 0;
+	}
+
+	dout("%s con %p error %d\n", __func__, con, err);
+	return err;
+}
+
+/*
+ * Validate the header of an incoming message frame and allocate the
+ * message to receive into.  [p, end) is the frame's control segment,
+ * which is expected to hold a struct ceph_msg_header2.
+ *
+ * Return:
+ *   1 - con->in_msg set, read message
+ *   0 - skip message
+ *  <0 - error (-EINVAL if the control segment is too short for the
+ *       header, -EBADE on a sequence gap, or the error from
+ *       ceph_con_in_msg_alloc())
+ */
+static int process_message_header(struct ceph_connection *con,
+				  void *p, void *end)
+{
+	struct ceph_frame_desc *desc = &con->v2.in_desc;
+	struct ceph_msg_header2 *hdr2;
+	struct ceph_msg_header hdr;
+	int skip;
+	int ret;
+	u64 seq;
+
+	/*
+	 * The control segment length comes from the peer.  Make sure
+	 * there is a whole header in there before dereferencing it.
+	 */
+	ceph_decode_need(&p, end, sizeof(*hdr2), bad);
+	hdr2 = p;
+
+	/* verify seq# */
+	seq = le64_to_cpu(hdr2->seq);
+	if ((s64)seq - (s64)con->in_seq < 1) {
+		pr_info("%s%lld %s skipping old message: seq %llu, expected %llu\n",
+			ENTITY_NAME(con->peer_name),
+			ceph_pr_addr(&con->peer_addr),
+			seq, con->in_seq + 1);
+		return 0;
+	}
+	if ((s64)seq - (s64)con->in_seq > 1) {
+		pr_err("bad seq %llu, expected %llu\n", seq, con->in_seq + 1);
+		con->error_msg = "bad message sequence # for incoming message";
+		return -EBADE;
+	}
+
+	/* the header piggybacks an ack for messages we have sent */
+	ceph_con_discard_sent(con, le64_to_cpu(hdr2->ack_seq));
+
+	/* segments 1..3 carry front, middle and data respectively */
+	fill_header(&hdr, hdr2, desc->fd_lens[1], desc->fd_lens[2],
+		    desc->fd_lens[3], &con->peer_name);
+	ret = ceph_con_in_msg_alloc(con, &hdr, &skip);
+	if (ret)
+		return ret;
+
+	WARN_ON(!con->in_msg ^ skip);
+	if (skip)
+		return 0;
+
+	WARN_ON(!con->in_msg);
+	WARN_ON(con->in_msg->con != con);
+	return 1;
+
+bad:
+	pr_err("failed to decode message header\n");
+	return -EINVAL;
+}
+
+/*
+ * Deliver the fully received message via ceph_con_process_message() and
+ * prepare to read the next preamble.
+ *
+ * Returns 0 on success, or -EAGAIN if the connection is no longer in
+ * CEPH_CON_S_OPEN once the message has been processed.
+ */
+static int process_message(struct ceph_connection *con)
+{
+ ceph_con_process_message(con);
+
+ /*
+ * We could have been closed by ceph_con_close() because
+ * ceph_con_process_message() temporarily drops con->mutex.
+ */
+ if (con->state != CEPH_CON_S_OPEN) {
+ dout("%s con %p state changed to %d\n", __func__, con,
+ con->state);
+ return -EAGAIN;
+ }
+
+ prepare_read_preamble(con);
+ return 0;
+}
+
+/*
+ * Process the control segment of the current frame.  @p points at the
+ * segment, whose length is con->v2.in_desc.fd_lens[0].
+ *
+ * Frames other than FRAME_TAG_MESSAGE are handed to process_control().
+ * For a message frame the header is validated and, depending on the
+ * outcome, the message is skipped, processed right away (no front,
+ * middle or data) or a read of the remaining segments is prepared.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int __handle_control(struct ceph_connection *con, void *p)
+{
+	void *end = p + con->v2.in_desc.fd_lens[0];
+	struct ceph_msg *msg;
+	int ret;
+
+	if (con->v2.in_desc.fd_tag != FRAME_TAG_MESSAGE)
+		return process_control(con, p, end);
+
+	/*
+	 * Like every control frame handler, check that this frame is
+	 * expected in the current state: messages make sense only once
+	 * the session has been established.
+	 */
+	if (con->state != CEPH_CON_S_OPEN) {
+		con->error_msg = "protocol error, unexpected message";
+		return -EINVAL;
+	}
+
+	ret = process_message_header(con, p, end);
+	if (ret < 0)
+		return ret;
+	if (ret == 0) {
+		prepare_skip_message(con);
+		return 0;
+	}
+
+	msg = con->in_msg;  /* set in process_message_header() */
+	if (front_len(msg)) {
+		WARN_ON(front_len(msg) > msg->front_alloc_len);
+		msg->front.iov_len = front_len(msg);
+	} else {
+		msg->front.iov_len = 0;
+	}
+	if (middle_len(msg)) {
+		WARN_ON(middle_len(msg) > msg->middle->alloc_len);
+		msg->middle->vec.iov_len = middle_len(msg);
+	} else if (msg->middle) {
+		msg->middle->vec.iov_len = 0;
+	}
+
+	/* header-only message: nothing more to read */
+	if (!front_len(msg) && !middle_len(msg) && !data_len(msg))
+		return process_message(con);
+
+	if (con_secure(con))
+		return prepare_read_tail_secure(con);
+
+	return prepare_read_tail_plain(con);
+}
+
+/*
+ * Process a just-received frame preamble held in con->v2.in_buf.
+ *
+ * In secure mode the preamble is decrypted first.  The preamble is then
+ * decoded into con->v2.in_desc.  In plain mode a read of the control
+ * segment is prepared.  In secure mode the control segment is processed
+ * straight from in_buf unless it is longer than
+ * CEPH_PREAMBLE_INLINE_LEN, in which case a read of the remainder is
+ * prepared.
+ *
+ * Returns 0 on success or a negative error code; con->error_msg is set
+ * for the decode failures and for -EBADMSG from decryption.
+ */
+static int handle_preamble(struct ceph_connection *con)
+{
+ struct ceph_frame_desc *desc = &con->v2.in_desc;
+ int ret;
+
+ if (con_secure(con)) {
+ ret = decrypt_preamble(con);
+ if (ret) {
+ if (ret == -EBADMSG)
+ con->error_msg = "integrity error, bad preamble auth tag";
+ return ret;
+ }
+ }
+
+ ret = decode_preamble(con->v2.in_buf, desc);
+ if (ret) {
+ if (ret == -EBADMSG)
+ con->error_msg = "integrity error, bad crc";
+ else
+ con->error_msg = "protocol error, bad preamble";
+ return ret;
+ }
+
+ dout("%s con %p tag %d seg_cnt %d %d+%d+%d+%d\n", __func__,
+ con, desc->fd_tag, desc->fd_seg_cnt, desc->fd_lens[0],
+ desc->fd_lens[1], desc->fd_lens[2], desc->fd_lens[3]);
+
+ if (!con_secure(con))
+ return prepare_read_control(con);
+
+ if (desc->fd_lens[0] > CEPH_PREAMBLE_INLINE_LEN)
+ return prepare_read_control_remainder(con);
+
+ return __handle_control(con, CTRL_BODY(con->v2.in_buf));
+}
+
+/*
+ * Process a control segment received in plain (non-secure) mode.
+ *
+ * The segment's crc is verified first.  In CEPH_CON_S_V2_AUTH the
+ * segment is copied into a buffer obtained from alloc_conn_buf() and
+ * processed from there; in all other states it is processed in place
+ * from con->v2.in_kvecs[0].
+ *
+ * Returns 0 on success, -ENOMEM if the copy buffer cannot be allocated,
+ * or the error from verify_control_crc() / __handle_control().
+ */
+static int handle_control(struct ceph_connection *con)
+{
+ int ctrl_len = con->v2.in_desc.fd_lens[0];
+ void *buf;
+ int ret;
+
+ WARN_ON(con_secure(con));
+
+ ret = verify_control_crc(con);
+ if (ret) {
+ con->error_msg = "integrity error, bad crc";
+ return ret;
+ }
+
+ /*
+ * NOTE(review): presumably the copy is needed because the auth
+ * handlers drop con->mutex while still using the payload -- confirm.
+ */
+ if (con->state == CEPH_CON_S_V2_AUTH) {
+ buf = alloc_conn_buf(con, ctrl_len);
+ if (!buf)
+ return -ENOMEM;
+
+ memcpy(buf, con->v2.in_kvecs[0].iov_base, ctrl_len);
+ return __handle_control(con, buf);
+ }
+
+ return __handle_control(con, con->v2.in_kvecs[0].iov_base);
+}
+
+/*
+ * Process a control segment whose remainder was read separately in
+ * secure mode: decrypt the remainder, then process the whole segment.
+ *
+ * Returns 0 on success or a negative error code; con->error_msg is set
+ * for -EBADMSG from decryption.
+ */
+static int handle_control_remainder(struct ceph_connection *con)
+{
+ int ret;
+
+ WARN_ON(!con_secure(con));
+
+ ret = decrypt_control_remainder(con);
+ if (ret) {
+ if (ret == -EBADMSG)
+ con->error_msg = "integrity error, bad control remainder auth tag";
+ return ret;
+ }
+
+ /*
+ * NOTE(review): stepping back by CEPH_PREAMBLE_INLINE_LEN presumably
+ * relies on the inline part of the segment sitting right before the
+ * remainder in the same buffer -- confirm against
+ * prepare_read_control_remainder().
+ */
+ return __handle_control(con, con->v2.in_kvecs[0].iov_base -
+ CEPH_PREAMBLE_INLINE_LEN);
+}
+
+/*
+ * Process the epilogue of a message frame and, if it checks out, deliver
+ * the message via process_message().
+ *
+ * In secure mode the tail is decrypted and the epilogue is decoded
+ * without retrieving any crcs.  In plain mode the front, middle and data
+ * crcs are decoded from the epilogue and verified.
+ *
+ * Returns the result of process_message() on success or a negative
+ * error code.
+ */
+static int handle_epilogue(struct ceph_connection *con)
+{
+ u32 front_crc, middle_crc, data_crc;
+ int ret;
+
+ if (con_secure(con)) {
+ ret = decrypt_tail(con);
+ if (ret) {
+ if (ret == -EBADMSG)
+ con->error_msg = "integrity error, bad epilogue auth tag";
+ return ret;
+ }
+
+ /* just late_status */
+ ret = decode_epilogue(con->v2.in_buf, NULL, NULL, NULL);
+ if (ret) {
+ con->error_msg = "protocol error, bad epilogue";
+ return ret;
+ }
+ } else {
+ ret = decode_epilogue(con->v2.in_buf, &front_crc,
+ &middle_crc, &data_crc);
+ if (ret) {
+ con->error_msg = "protocol error, bad epilogue";
+ return ret;
+ }
+
+ ret = verify_epilogue_crcs(con, front_crc, middle_crc,
+ data_crc);
+ if (ret) {
+ con->error_msg = "integrity error, bad crc";
+ return ret;
+ }
+ }
+
+ return process_message(con);
+}
+
+/*
+ * Finish skipping over incoming data (IN_S_FINISH_SKIP).  In secure mode
+ * the incoming GCM nonce is incremented before handing over to
+ * __finish_skip().
+ */
+static void finish_skip(struct ceph_connection *con)
+{
+ dout("%s con %p\n", __func__, con);
+
+ if (con_secure(con))
+ gcm_inc_nonce(&con->v2.in_gcm_nonce);
+
+ __finish_skip(con);
+}
+
+/*
+ * Process what has just been read and set up con->v2.in_iter for the
+ * next read.  Expects in_iter to be fully consumed on entry.
+ *
+ * The banner states have dedicated handlers; for the states from
+ * CEPH_CON_S_V2_HELLO through CEPH_CON_S_V2_SESSION_RECONNECT and for
+ * CEPH_CON_S_OPEN the handler is selected by con->v2.in_state.
+ *
+ * Returns 1 if in_iter has been populated, or a negative error code:
+ * the handler's error, -EINVAL on an unexpected state / in_state, or
+ * -ENODATA if the handler succeeded but left in_iter empty.
+ */
+static int populate_in_iter(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p state %d in_state %d\n", __func__, con, con->state,
+ con->v2.in_state);
+ WARN_ON(iov_iter_count(&con->v2.in_iter));
+
+ if (con->state == CEPH_CON_S_V2_BANNER_PREFIX) {
+ ret = process_banner_prefix(con);
+ } else if (con->state == CEPH_CON_S_V2_BANNER_PAYLOAD) {
+ ret = process_banner_payload(con);
+ } else if ((con->state >= CEPH_CON_S_V2_HELLO &&
+ con->state <= CEPH_CON_S_V2_SESSION_RECONNECT) ||
+ con->state == CEPH_CON_S_OPEN) {
+ switch (con->v2.in_state) {
+ case IN_S_HANDLE_PREAMBLE:
+ ret = handle_preamble(con);
+ break;
+ case IN_S_HANDLE_CONTROL:
+ ret = handle_control(con);
+ break;
+ case IN_S_HANDLE_CONTROL_REMAINDER:
+ ret = handle_control_remainder(con);
+ break;
+ case IN_S_PREPARE_READ_DATA:
+ ret = prepare_read_data(con);
+ break;
+ case IN_S_PREPARE_READ_DATA_CONT:
+ prepare_read_data_cont(con);
+ ret = 0;
+ break;
+ case IN_S_PREPARE_READ_ENC_PAGE:
+ prepare_read_enc_page(con);
+ ret = 0;
+ break;
+ case IN_S_PREPARE_SPARSE_DATA:
+ ret = prepare_sparse_read_data(con);
+ break;
+ case IN_S_PREPARE_SPARSE_DATA_CONT:
+ ret = prepare_sparse_read_cont(con);
+ break;
+ case IN_S_HANDLE_EPILOGUE:
+ ret = handle_epilogue(con);
+ break;
+ case IN_S_FINISH_SKIP:
+ finish_skip(con);
+ ret = 0;
+ break;
+ default:
+ WARN(1, "bad in_state %d", con->v2.in_state);
+ return -EINVAL;
+ }
+ } else {
+ WARN(1, "bad state %d", con->state);
+ return -EINVAL;
+ }
+ if (ret) {
+ dout("%s con %p error %d\n", __func__, con, ret);
+ return ret;
+ }
+
+ /* every successful handler must have queued something to read */
+ if (WARN_ON(!iov_iter_count(&con->v2.in_iter)))
+ return -ENODATA;
+ dout("%s con %p populated %zu\n", __func__, con,
+ iov_iter_count(&con->v2.in_iter));
+ return 1;
+}
+
+/*
+ * Read and process as much incoming data as possible.
+ *
+ * Alternates between ceph_tcp_recv() and populate_in_iter() until one
+ * of them returns a value <= 0, which is then returned to the caller.
+ * Returns 0 straight away in CEPH_CON_S_PREOPEN and -ENODATA if there
+ * is nothing pending in con->v2.in_iter on entry.  For processing errors
+ * other than -EAGAIN, con->error_msg is set unless it already is.
+ */
+int ceph_con_v2_try_read(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p state %d need %zu\n", __func__, con, con->state,
+ iov_iter_count(&con->v2.in_iter));
+
+ if (con->state == CEPH_CON_S_PREOPEN)
+ return 0;
+
+ /*
+ * We should always have something pending here. If not,
+ * avoid calling populate_in_iter() as if we read something
+ * (ceph_tcp_recv() would immediately return 1).
+ */
+ if (WARN_ON(!iov_iter_count(&con->v2.in_iter)))
+ return -ENODATA;
+
+ for (;;) {
+ ret = ceph_tcp_recv(con);
+ if (ret <= 0)
+ return ret;
+
+ ret = populate_in_iter(con);
+ if (ret <= 0) {
+ /* provide a generic message if the handler didn't set one */
+ if (ret && ret != -EAGAIN && !con->error_msg)
+ con->error_msg = "read processing error";
+ return ret;
+ }
+ }
+}
+
+/*
+ * Start sending the data segment of @msg in plain mode: seed the data
+ * crc with -1, initialize the output cursor over data_len(msg) bytes,
+ * queue the first piece and move to OUT_S_QUEUE_DATA_CONT.
+ */
+static void queue_data(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct bio_vec bv;
+
+ con->v2.out_epil.data_crc = -1;
+ ceph_msg_data_cursor_init(&con->v2.out_cursor, msg,
+ data_len(msg));
+
+ get_bvec_at(&con->v2.out_cursor, &bv);
+ set_out_bvec(con, &bv, true);
+ con->v2.out_state = OUT_S_QUEUE_DATA_CONT;
+}
+
+/*
+ * Continue sending the data segment of @msg: fold the piece that has
+ * just been sent into the data crc, advance the output cursor and queue
+ * the next piece.  When no data is left, queue the plain epilogue and
+ * move to OUT_S_FINISH_MESSAGE.
+ */
+static void queue_data_cont(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct bio_vec bv;
+
+ con->v2.out_epil.data_crc = ceph_crc32c_page(
+ con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page,
+ con->v2.out_bvec.bv_offset, con->v2.out_bvec.bv_len);
+
+ ceph_msg_data_advance(&con->v2.out_cursor, con->v2.out_bvec.bv_len);
+ if (con->v2.out_cursor.total_resid) {
+ get_bvec_at(&con->v2.out_cursor, &bv);
+ set_out_bvec(con, &bv, true);
+ WARN_ON(con->v2.out_state != OUT_S_QUEUE_DATA_CONT);
+ return;
+ }
+
+ /*
+ * We've written all data. Queue epilogue. Once it's written,
+ * we are done.
+ */
+ reset_out_kvecs(con);
+ prepare_epilogue_plain(con, msg, false);
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+}
+
+/*
+ * Queue the next page of con->v2.out_enc_pages for sending (secure
+ * mode).  At most PAGE_SIZE bytes are queued per call; out_enc_i and
+ * out_enc_resid track the progress.  Once nothing is left, move to
+ * OUT_S_FINISH_MESSAGE.
+ */
+static void queue_enc_page(struct ceph_connection *con)
+{
+ struct bio_vec bv;
+
+ dout("%s con %p i %d resid %d\n", __func__, con, con->v2.out_enc_i,
+ con->v2.out_enc_resid);
+ WARN_ON(!con->v2.out_enc_resid);
+
+ bvec_set_page(&bv, con->v2.out_enc_pages[con->v2.out_enc_i],
+ min(con->v2.out_enc_resid, (int)PAGE_SIZE), 0);
+
+ set_out_bvec(con, &bv, false);
+ con->v2.out_enc_i++;
+ con->v2.out_enc_resid -= bv.bv_len;
+
+ if (con->v2.out_enc_resid) {
+ WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE);
+ return;
+ }
+
+ /*
+ * We've queued the last piece of ciphertext (ending with
+ * epilogue) + auth tag. Once it's written, we are done.
+ */
+ WARN_ON(con->v2.out_enc_i != con->v2.out_enc_page_cnt);
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+}
+
+/*
+ * Queue zeros in place of the unsent part of a revoked message.  While
+ * con->v2.out_zero is non-zero, another chunk of zeros is queued and the
+ * state is set to OUT_S_QUEUE_ZEROS.  After that the plain epilogue is
+ * queued with the aborted flag set and the state becomes
+ * OUT_S_FINISH_MESSAGE.
+ */
+static void queue_zeros(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero);
+
+ if (con->v2.out_zero) {
+ set_out_bvec_zero(con);
+ con->v2.out_zero -= con->v2.out_bvec.bv_len;
+ con->v2.out_state = OUT_S_QUEUE_ZEROS;
+ return;
+ }
+
+ /*
+ * We've zero-filled everything up to epilogue. Queue epilogue
+ * with late_status set to ABORTED and crcs adjusted for zeros.
+ * Once it's written, we are done patching up for the revoke.
+ */
+ reset_out_kvecs(con);
+ prepare_epilogue_plain(con, msg, true);
+ con->v2.out_state = OUT_S_FINISH_MESSAGE;
+}
+
+/*
+ * Clean up after an outgoing message has been written: release the
+ * encrypted page vector if there is one, drop the reference on
+ * con->out_msg if it is still set, and go back to OUT_S_GET_NEXT.
+ */
+static void finish_message(struct ceph_connection *con)
+{
+ dout("%s con %p msg %p\n", __func__, con, con->out_msg);
+
+ /* we end up here both plain and secure modes */
+ if (con->v2.out_enc_pages) {
+ WARN_ON(!con->v2.out_enc_page_cnt);
+ ceph_release_page_vector(con->v2.out_enc_pages,
+ con->v2.out_enc_page_cnt);
+ con->v2.out_enc_pages = NULL;
+ con->v2.out_enc_page_cnt = 0;
+ }
+ /* message may have been revoked */
+ if (con->out_msg) {
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL;
+ }
+
+ con->v2.out_state = OUT_S_GET_NEXT;
+}
+
+/*
+ * Set up con->v2.out_iter with the next thing to send.  Expects out_iter
+ * to be fully consumed on entry.
+ *
+ * Outside of CEPH_CON_S_OPEN there is never anything to add.  In
+ * CEPH_CON_S_OPEN, a message in progress is continued according to
+ * con->v2.out_state.  Otherwise the next item is picked in this order:
+ * a pending keepalive, the next outgoing message, an ack for received
+ * messages.
+ *
+ * Returns 1 if out_iter has been populated, 0 if nothing is pending (in
+ * which case CEPH_CON_F_WRITE_PENDING is cleared), or a negative error
+ * code.
+ */
+static int populate_out_iter(struct ceph_connection *con)
+{
+ struct ceph_msg *msg;
+ int ret;
+
+ dout("%s con %p state %d out_state %d\n", __func__, con, con->state,
+ con->v2.out_state);
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+
+ if (con->state != CEPH_CON_S_OPEN) {
+ WARN_ON(con->state < CEPH_CON_S_V2_BANNER_PREFIX ||
+ con->state > CEPH_CON_S_V2_SESSION_RECONNECT);
+ goto nothing_pending;
+ }
+
+ switch (con->v2.out_state) {
+ case OUT_S_QUEUE_DATA:
+ WARN_ON(!con->out_msg);
+ queue_data(con, con->out_msg);
+ goto populated;
+ case OUT_S_QUEUE_DATA_CONT:
+ WARN_ON(!con->out_msg);
+ queue_data_cont(con, con->out_msg);
+ goto populated;
+ case OUT_S_QUEUE_ENC_PAGE:
+ queue_enc_page(con);
+ goto populated;
+ case OUT_S_QUEUE_ZEROS:
+ WARN_ON(con->out_msg); /* revoked */
+ queue_zeros(con, con->out_msg);
+ goto populated;
+ case OUT_S_FINISH_MESSAGE:
+ finish_message(con);
+ break;
+ case OUT_S_GET_NEXT:
+ break;
+ default:
+ WARN(1, "bad out_state %d", con->v2.out_state);
+ return -EINVAL;
+ }
+
+ /* no message in progress: pick the next thing to send */
+ WARN_ON(con->v2.out_state != OUT_S_GET_NEXT);
+ if (ceph_con_flag_test_and_clear(con, CEPH_CON_F_KEEPALIVE_PENDING)) {
+ ret = prepare_keepalive2(con);
+ if (ret) {
+ pr_err("prepare_keepalive2 failed: %d\n", ret);
+ return ret;
+ }
+ } else if ((msg = ceph_con_get_out_msg(con)) != NULL) {
+ ret = prepare_message(con, msg);
+ if (ret) {
+ pr_err("prepare_message failed: %d\n", ret);
+ return ret;
+ }
+ } else if (con->in_seq > con->in_seq_acked) {
+ ret = prepare_ack(con);
+ if (ret) {
+ pr_err("prepare_ack failed: %d\n", ret);
+ return ret;
+ }
+ } else {
+ goto nothing_pending;
+ }
+
+populated:
+ if (WARN_ON(!iov_iter_count(&con->v2.out_iter)))
+ return -ENODATA;
+ dout("%s con %p populated %zu\n", __func__, con,
+ iov_iter_count(&con->v2.out_iter));
+ return 1;
+
+nothing_pending:
+ WARN_ON(iov_iter_count(&con->v2.out_iter));
+ dout("%s con %p nothing pending\n", __func__, con);
+ ceph_con_flag_clear(con, CEPH_CON_F_WRITE_PENDING);
+ return 0;
+}
+
+/*
+ * Write as much outgoing data as possible.
+ *
+ * In CEPH_CON_S_PREOPEN the connection attempt is kicked off first: the
+ * sequence numbers are bumped, the banner read and write are prepared
+ * and the TCP connection is initiated.  After that, with the socket
+ * corked, ceph_tcp_send() and populate_out_iter() alternate until one of
+ * them returns a value <= 0, which is then returned to the caller.
+ *
+ * For processing errors other than -EAGAIN, con->error_msg is set unless
+ * it already is.
+ */
+int ceph_con_v2_try_write(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("%s con %p state %d have %zu\n", __func__, con, con->state,
+ iov_iter_count(&con->v2.out_iter));
+
+ /* open the socket first? */
+ if (con->state == CEPH_CON_S_PREOPEN) {
+ WARN_ON(con->peer_addr.type != CEPH_ENTITY_ADDR_TYPE_MSGR2);
+
+ /*
+ * Always bump global_seq. Bump connect_seq only if
+ * there is a session (i.e. we are reconnecting and will
+ * send session_reconnect instead of client_ident).
+ */
+ con->v2.global_seq = ceph_get_global_seq(con->msgr, 0);
+ if (con->v2.server_cookie)
+ con->v2.connect_seq++;
+
+ ret = prepare_read_banner_prefix(con);
+ if (ret) {
+ pr_err("prepare_read_banner_prefix failed: %d\n", ret);
+ con->error_msg = "connect error";
+ return ret;
+ }
+
+ reset_out_kvecs(con);
+ ret = prepare_banner(con);
+ if (ret) {
+ pr_err("prepare_banner failed: %d\n", ret);
+ con->error_msg = "connect error";
+ return ret;
+ }
+
+ ret = ceph_tcp_connect(con);
+ if (ret) {
+ pr_err("ceph_tcp_connect failed: %d\n", ret);
+ con->error_msg = "connect error";
+ return ret;
+ }
+ }
+
+ /* nothing left over from the previous call: look for new work */
+ if (!iov_iter_count(&con->v2.out_iter)) {
+ ret = populate_out_iter(con);
+ if (ret <= 0) {
+ if (ret && ret != -EAGAIN && !con->error_msg)
+ con->error_msg = "write processing error";
+ return ret;
+ }
+ }
+
+ /* cork for the duration of the loop, uncork on the way out */
+ tcp_sock_set_cork(con->sock->sk, true);
+ for (;;) {
+ ret = ceph_tcp_send(con);
+ if (ret <= 0)
+ break;
+
+ ret = populate_out_iter(con);
+ if (ret <= 0) {
+ if (ret && ret != -EAGAIN && !con->error_msg)
+ con->error_msg = "write processing error";
+ break;
+ }
+ }
+
+ tcp_sock_set_cork(con->sock->sk, false);
+ return ret;
+}
+
+/*
+ * Extend @crc as if @zero_len zero bytes followed, feeding the zero page
+ * to crc32c() in chunks of at most PAGE_SIZE.  Returns the updated crc.
+ */
+static u32 crc32c_zeros(u32 crc, int zero_len)
+{
+	const void *zeros = page_address(ceph_zero_page);
+	int chunk;
+
+	for (; zero_len; zero_len -= chunk) {
+		chunk = min(zero_len, (int)PAGE_SIZE);
+		crc = crc32c(crc, zeros, chunk);
+	}
+
+	return crc;
+}
+
+/*
+ * Arrange for the last @resid bytes of the front of a revoked @msg to go
+ * out as zeros.  The front crc in the epilogue is recomputed over the
+ * bytes already sent followed by @resid zeros, the unsent bytes are
+ * dropped from con->v2.out_iter and @resid is added to the zero count.
+ */
+static void prepare_zero_front(struct ceph_connection *con,
+			       struct ceph_msg *msg, int resid)
+{
+	u32 crc = -1;
+	int sent;
+
+	WARN_ON(!resid || resid > front_len(msg));
+	sent = front_len(msg) - resid;
+	dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
+
+	/* real bytes first (if any went out), zeros for the rest */
+	if (sent)
+		crc = crc32c(crc, msg->front.iov_base, sent);
+	con->v2.out_epil.front_crc = crc32c_zeros(crc, resid);
+
+	con->v2.out_iter.count -= resid;
+	out_zero_add(con, resid);
+}
+
+/*
+ * Arrange for the last @resid bytes of the middle of a revoked @msg to
+ * go out as zeros.  The middle crc in the epilogue is recomputed over
+ * the bytes already sent followed by @resid zeros, the unsent bytes are
+ * dropped from con->v2.out_iter and @resid is added to the zero count.
+ */
+static void prepare_zero_middle(struct ceph_connection *con,
+				struct ceph_msg *msg, int resid)
+{
+	u32 crc = -1;
+	int sent;
+
+	WARN_ON(!resid || resid > middle_len(msg));
+	sent = middle_len(msg) - resid;
+	dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
+
+	/* real bytes first (if any went out), zeros for the rest */
+	if (sent)
+		crc = crc32c(crc, msg->middle->vec.iov_base, sent);
+	con->v2.out_epil.middle_crc = crc32c_zeros(crc, resid);
+
+	con->v2.out_iter.count -= resid;
+	out_zero_add(con, resid);
+}
+
+/*
+ * Arrange for the whole data segment of a revoked @msg to go out as
+ * zeros: set the data crc to that of data_len(msg) zero bytes and add
+ * data_len(msg) to the zero count.
+ */
+static void prepare_zero_data(struct ceph_connection *con,
+ struct ceph_msg *msg)
+{
+ dout("%s con %p\n", __func__, con);
+ con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(msg));
+ out_zero_add(con, data_len(msg));
+}
+
+/*
+ * Revoke @msg while in OUT_S_QUEUE_DATA, i.e. head, front and middle are
+ * queued in a kvec iterator and the data segment has not been started.
+ *
+ * The number of bytes left in con->v2.out_iter tells which part was
+ * being sent.  Whatever has not gone out of front and middle, plus the
+ * entire data segment, is replaced with zeros.
+ */
+static void revoke_at_queue_data(struct ceph_connection *con,
+ struct ceph_msg *msg)
+{
+ int boundary;
+ int resid;
+
+ WARN_ON(!data_len(msg));
+ WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
+ resid = iov_iter_count(&con->v2.out_iter);
+
+ /* more left than front + middle: part of the head is still unsent */
+ boundary = front_len(msg) + middle_len(msg);
+ if (resid > boundary) {
+ resid -= boundary;
+ WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN);
+ dout("%s con %p was sending head\n", __func__, con);
+ if (front_len(msg))
+ prepare_zero_front(con, msg, front_len(msg));
+ if (middle_len(msg))
+ prepare_zero_middle(con, msg, middle_len(msg));
+ prepare_zero_data(con, msg);
+ /* only the rest of the head remains queued in out_iter */
+ WARN_ON(iov_iter_count(&con->v2.out_iter) != resid);
+ con->v2.out_state = OUT_S_QUEUE_ZEROS;
+ return;
+ }
+
+ /* more left than middle: part of the front is still unsent */
+ boundary = middle_len(msg);
+ if (resid > boundary) {
+ resid -= boundary;
+ dout("%s con %p was sending front\n", __func__, con);
+ prepare_zero_front(con, msg, resid);
+ if (middle_len(msg))
+ prepare_zero_middle(con, msg, middle_len(msg));
+ prepare_zero_data(con, msg);
+ queue_zeros(con, msg);
+ return;
+ }
+
+ WARN_ON(!resid);
+ dout("%s con %p was sending middle\n", __func__, con);
+ prepare_zero_middle(con, msg, resid);
+ prepare_zero_data(con, msg);
+ queue_zeros(con, msg);
+}
+
+/*
+ * Revoke @msg while in OUT_S_QUEUE_DATA_CONT, i.e. a piece of the data
+ * segment is queued in a bvec iterator.
+ *
+ * The part of the current piece that has already gone out is folded
+ * into the data crc and the cursor is advanced past it.  All remaining
+ * data is then accounted for as zeros, both in the crc and in the zero
+ * count, and the unsent part of the piece is dropped from out_iter.
+ */
+static void revoke_at_queue_data_cont(struct ceph_connection *con,
+ struct ceph_msg *msg)
+{
+ int sent, resid; /* current piece of data */
+
+ WARN_ON(!data_len(msg));
+ WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter));
+ resid = iov_iter_count(&con->v2.out_iter);
+ WARN_ON(!resid || resid > con->v2.out_bvec.bv_len);
+ sent = con->v2.out_bvec.bv_len - resid;
+ dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid);
+
+ if (sent) {
+ con->v2.out_epil.data_crc = ceph_crc32c_page(
+ con->v2.out_epil.data_crc, con->v2.out_bvec.bv_page,
+ con->v2.out_bvec.bv_offset, sent);
+ ceph_msg_data_advance(&con->v2.out_cursor, sent);
+ }
+ WARN_ON(resid > con->v2.out_cursor.total_resid);
+ con->v2.out_epil.data_crc = crc32c_zeros(con->v2.out_epil.data_crc,
+ con->v2.out_cursor.total_resid);
+
+ con->v2.out_iter.count -= resid;
+ out_zero_add(con, con->v2.out_cursor.total_resid);
+ queue_zeros(con, msg);
+}
+
+/*
+ * Revoke @msg while in OUT_S_FINISH_MESSAGE in plain mode, i.e. the rest
+ * of the message up to and including the epilogue is queued in a kvec
+ * iterator.
+ *
+ * The number of bytes left in con->v2.out_iter tells which part was
+ * being sent.  Nothing is done for a message with no front, middle or
+ * data, or if only (part of) the epilogue is left.  Otherwise the unsent
+ * front / middle bytes are replaced with zeros and the queued epilogue
+ * is dropped from out_iter.
+ */
+static void revoke_at_finish_message(struct ceph_connection *con,
+ struct ceph_msg *msg)
+{
+ int boundary;
+ int resid;
+
+ WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter));
+ resid = iov_iter_count(&con->v2.out_iter);
+
+ if (!front_len(msg) && !middle_len(msg) &&
+ !data_len(msg)) {
+ WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN);
+ dout("%s con %p was sending head (empty message) - noop\n",
+ __func__, con);
+ return;
+ }
+
+ /* more left than front + middle + epilogue: head is still unsent */
+ boundary = front_len(msg) + middle_len(msg) +
+ CEPH_EPILOGUE_PLAIN_LEN;
+ if (resid > boundary) {
+ resid -= boundary;
+ WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN);
+ dout("%s con %p was sending head\n", __func__, con);
+ if (front_len(msg))
+ prepare_zero_front(con, msg, front_len(msg));
+ if (middle_len(msg))
+ prepare_zero_middle(con, msg, middle_len(msg));
+ con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
+ /* only the rest of the head remains queued in out_iter */
+ WARN_ON(iov_iter_count(&con->v2.out_iter) != resid);
+ con->v2.out_state = OUT_S_QUEUE_ZEROS;
+ return;
+ }
+
+ /* more left than middle + epilogue: front is still unsent */
+ boundary = middle_len(msg) + CEPH_EPILOGUE_PLAIN_LEN;
+ if (resid > boundary) {
+ resid -= boundary;
+ dout("%s con %p was sending front\n", __func__, con);
+ prepare_zero_front(con, msg, resid);
+ if (middle_len(msg))
+ prepare_zero_middle(con, msg, middle_len(msg));
+ con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
+ queue_zeros(con, msg);
+ return;
+ }
+
+ /* more left than the epilogue: middle is still unsent */
+ boundary = CEPH_EPILOGUE_PLAIN_LEN;
+ if (resid > boundary) {
+ resid -= boundary;
+ dout("%s con %p was sending middle\n", __func__, con);
+ prepare_zero_middle(con, msg, resid);
+ con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN;
+ queue_zeros(con, msg);
+ return;
+ }
+
+ WARN_ON(!resid);
+ dout("%s con %p was sending epilogue - noop\n", __func__, con);
+}
+
+/*
+ * Revoke the outgoing message @msg that is currently being sent on @con.
+ *
+ * Nothing is done in secure mode.  In plain mode the work is delegated
+ * to a helper selected by con->v2.out_state; any out_state other than
+ * OUT_S_QUEUE_DATA, OUT_S_QUEUE_DATA_CONT or OUT_S_FINISH_MESSAGE
+ * triggers a warning.
+ */
+void ceph_con_v2_revoke(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ WARN_ON(con->v2.out_zero);
+
+ if (con_secure(con)) {
+ WARN_ON(con->v2.out_state != OUT_S_QUEUE_ENC_PAGE &&
+ con->v2.out_state != OUT_S_FINISH_MESSAGE);
+ dout("%s con %p secure - noop\n", __func__, con);
+ return;
+ }
+
+ switch (con->v2.out_state) {
+ case OUT_S_QUEUE_DATA:
+ revoke_at_queue_data(con, msg);
+ break;
+ case OUT_S_QUEUE_DATA_CONT:
+ revoke_at_queue_data_cont(con, msg);
+ break;
+ case OUT_S_FINISH_MESSAGE:
+ revoke_at_finish_message(con, msg);
+ break;
+ default:
+ WARN(1, "bad out_state %d", con->v2.out_state);
+ break;
+ }
+}
+
+/*
+ * Revoke the incoming message in plain mode before any of its data
+ * segment has been set up for reading.  What is left in the kvec
+ * iterator is dropped and the connection is set to skip over it plus
+ * the entire data segment and the plain epilogue, finishing in
+ * IN_S_FINISH_SKIP.
+ */
+static void revoke_at_prepare_read_data(struct ceph_connection *con)
+{
+ int remaining;
+ int resid;
+
+ WARN_ON(con_secure(con));
+ WARN_ON(!data_len(con->in_msg));
+ WARN_ON(!iov_iter_is_kvec(&con->v2.in_iter));
+ resid = iov_iter_count(&con->v2.in_iter);
+ WARN_ON(!resid);
+
+ remaining = data_len(con->in_msg) + CEPH_EPILOGUE_PLAIN_LEN;
+ dout("%s con %p resid %d remaining %d\n", __func__, con, resid,
+ remaining);
+ con->v2.in_iter.count -= resid;
+ set_in_skip(con, resid + remaining);
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+static void revoke_at_prepare_read_data_cont(struct ceph_connection *con)
+{ /* revoke handler for IN_S_PREPARE_READ_DATA_CONT */
+ int recved, resid; /* current piece of data: part of in_bvec.bv_len already read / still pending */
+ int remaining; /* plain epilogue length */
+
+ WARN_ON(con_secure(con));
+ WARN_ON(!data_len(con->in_msg));
+ WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter));
+ resid = iov_iter_count(&con->v2.in_iter);
+ WARN_ON(!resid || resid > con->v2.in_bvec.bv_len);
+ recved = con->v2.in_bvec.bv_len - resid;
+ dout("%s con %p recved %d resid %d\n", __func__, con, recved, resid);
+
+ if (recved) /* advance the data cursor by what was already read */
+ ceph_msg_data_advance(&con->v2.in_cursor, recved);
+ WARN_ON(resid > con->v2.in_cursor.total_resid);
+
+ remaining = CEPH_EPILOGUE_PLAIN_LEN;
+ dout("%s con %p total_resid %zu remaining %d\n", __func__, con,
+ con->v2.in_cursor.total_resid, remaining);
+ con->v2.in_iter.count -= resid; /* drop the pending count from in_iter */
+ set_in_skip(con, con->v2.in_cursor.total_resid + remaining); /* cursor residue plus epilogue */
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+static void revoke_at_prepare_read_enc_page(struct ceph_connection *con)
+{ /* revoke handler for IN_S_PREPARE_READ_ENC_PAGE */
+ int resid; /* current enc page (not necessarily data): byte count still pending in in_iter */
+
+ WARN_ON(!con_secure(con)); /* unlike the other handlers, expects a secure connection */
+ WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter));
+ resid = iov_iter_count(&con->v2.in_iter);
+ WARN_ON(!resid || resid > con->v2.in_bvec.bv_len);
+
+ dout("%s con %p resid %d enc_resid %d\n", __func__, con, resid,
+ con->v2.in_enc_resid);
+ con->v2.in_iter.count -= resid; /* drop the pending count from in_iter */
+ set_in_skip(con, resid + con->v2.in_enc_resid); /* pending bytes plus in_enc_resid */
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+static void revoke_at_prepare_sparse_data(struct ceph_connection *con)
+{ /* revoke handler for IN_S_PREPARE_SPARSE_DATA_CONT */
+ int resid; /* current piece of data: byte count still pending in in_iter */
+ int remaining; /* plain epilogue length plus con->v2.data_len_remain */
+
+ WARN_ON(con_secure(con));
+ WARN_ON(!data_len(con->in_msg));
+ WARN_ON(!iov_iter_is_bvec(&con->v2.in_iter));
+ resid = iov_iter_count(&con->v2.in_iter); /* NOTE(review): no WARN_ON(!resid) here, unlike the sibling handlers */
+ dout("%s con %p resid %d\n", __func__, con, resid);
+
+ remaining = CEPH_EPILOGUE_PLAIN_LEN + con->v2.data_len_remain;
+ con->v2.in_iter.count -= resid; /* drop the pending count from in_iter */
+ set_in_skip(con, resid + remaining); /* hand both amounts to set_in_skip() */
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+static void revoke_at_handle_epilogue(struct ceph_connection *con)
+{ /* revoke handler for IN_S_HANDLE_EPILOGUE */
+ int resid; /* byte count still pending in in_iter */
+
+ resid = iov_iter_count(&con->v2.in_iter);
+ WARN_ON(!resid);
+
+ dout("%s con %p resid %d\n", __func__, con, resid);
+ con->v2.in_iter.count -= resid; /* drop the pending count from in_iter */
+ set_in_skip(con, resid); /* only the pending bytes are handed to set_in_skip() */
+ con->v2.in_state = IN_S_FINISH_SKIP;
+}
+
+void ceph_con_v2_revoke_incoming(struct ceph_connection *con)
+{ /* revoke on the receive side: pick the handler matching con->v2.in_state */
+ switch (con->v2.in_state) {
+ case IN_S_PREPARE_SPARSE_DATA: /* shares the handler of IN_S_PREPARE_READ_DATA */
+ case IN_S_PREPARE_READ_DATA:
+ revoke_at_prepare_read_data(con);
+ break;
+ case IN_S_PREPARE_READ_DATA_CONT:
+ revoke_at_prepare_read_data_cont(con);
+ break;
+ case IN_S_PREPARE_READ_ENC_PAGE:
+ revoke_at_prepare_read_enc_page(con);
+ break;
+ case IN_S_PREPARE_SPARSE_DATA_CONT:
+ revoke_at_prepare_sparse_data(con);
+ break;
+ case IN_S_HANDLE_EPILOGUE:
+ revoke_at_handle_epilogue(con);
+ break;
+ default: /* every other state only triggers a warning */
+ WARN(1, "bad in_state %d", con->v2.in_state);
+ break;
+ }
+}
+
+bool ceph_con_v2_opened(struct ceph_connection *con)
+{ /* true iff con->v2.peer_global_seq is non-zero */
+ return con->v2.peer_global_seq; /* implicit integer to bool conversion */
+}
+
+void ceph_con_v2_reset_session(struct ceph_connection *con)
+{ /* zero the cookies and sequence numbers kept in con->v2 */
+ con->v2.client_cookie = 0;
+ con->v2.server_cookie = 0;
+ con->v2.global_seq = 0;
+ con->v2.connect_seq = 0;
+ con->v2.peer_global_seq = 0; /* ceph_con_v2_opened() returns false after this */
+}
+
+void ceph_con_v2_reset_protocol(struct ceph_connection *con)
+{ /* drop pending I/O, buffers, enc pages and crypto state held in con->v2 */
+ iov_iter_truncate(&con->v2.in_iter, 0); /* nothing left to receive */
+ iov_iter_truncate(&con->v2.out_iter, 0); /* nothing left to send */
+ con->v2.out_zero = 0;
+
+ clear_in_sign_kvecs(con);
+ clear_out_sign_kvecs(con);
+ free_conn_bufs(con);
+
+ if (con->v2.in_enc_pages) { /* release the inbound enc page vector, if any */
+ WARN_ON(!con->v2.in_enc_page_cnt);
+ ceph_release_page_vector(con->v2.in_enc_pages,
+ con->v2.in_enc_page_cnt);
+ con->v2.in_enc_pages = NULL; /* pointer and count are reset together */
+ con->v2.in_enc_page_cnt = 0;
+ }
+ if (con->v2.out_enc_pages) { /* same for the outbound enc page vector */
+ WARN_ON(!con->v2.out_enc_page_cnt);
+ ceph_release_page_vector(con->v2.out_enc_pages,
+ con->v2.out_enc_page_cnt);
+ con->v2.out_enc_pages = NULL;
+ con->v2.out_enc_page_cnt = 0;
+ }
+
+ con->v2.con_mode = CEPH_CON_MODE_UNKNOWN;
+ memzero_explicit(&con->v2.in_gcm_nonce, CEPH_GCM_IV_LEN); /* explicit variant so the wipe is not optimized away */
+ memzero_explicit(&con->v2.out_gcm_nonce, CEPH_GCM_IV_LEN);
+
+ memzero_explicit(&con->v2.hmac_key, sizeof(con->v2.hmac_key)); /* wipe the whole key buffer */
+ con->v2.hmac_key_set = false;
+ if (con->v2.gcm_req) { /* free the AEAD request before its transform */
+ aead_request_free(con->v2.gcm_req);
+ con->v2.gcm_req = NULL;
+ }
+ if (con->v2.gcm_tfm) {
+ crypto_free_aead(con->v2.gcm_tfm);
+ con->v2.gcm_tfm = NULL; /* cleared so a repeated reset skips this branch */
+ }
+}
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 18deb3d889c4..c227ececa925 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -36,53 +36,122 @@ static const struct ceph_connection_operations mon_con_ops;
static int __validate_auth(struct ceph_mon_client *monc);
+static int decode_mon_info(void **p, void *end, bool msgr2,
+ struct ceph_entity_addr *addr)
+{ /* decode one mon_info_t at *p into @addr; returns 0 or a negative error */
+ void *mon_info_end; /* *p + struct_len, taken right after the struct header */
+ u32 struct_len;
+ u8 struct_v;
+ int ret;
+
+ ret = ceph_start_decoding(p, end, 1, "mon_info_t", &struct_v,
+ &struct_len);
+ if (ret)
+ return ret;
+
+ mon_info_end = *p + struct_len;
+ ceph_decode_skip_string(p, end, e_inval); /* skip mon name; e_inval is the label handed to the macro */
+ ret = ceph_decode_entity_addrvec(p, end, msgr2, addr);
+ if (ret)
+ return ret;
+
+ *p = mon_info_end; /* continue at the recorded struct end, whatever was consumed above */
+ return 0;
+
+e_inval:
+ return -EINVAL;
+}
+
/*
* Decode a monmap blob (e.g., during mount).
+ *
+ * Assume MonMap v3 (i.e. encoding with MONNAMES and MONENC).
*/
-struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+static struct ceph_monmap *ceph_monmap_decode(void **p, void *end, bool msgr2)
 {
- struct ceph_monmap *m = NULL;
- int i, err = -EINVAL;
+ struct ceph_monmap *monmap = NULL; /* stays NULL until kmalloc, so the kfree at fail is safe */
 struct ceph_fsid fsid;
- u32 epoch, num_mon;
- u32 len;
+ u32 struct_len;
+ int blob_len; /* leading 32-bit length of the blob */
+ int num_mon; /* NOTE(review): signed, and only checked against CEPH_MAX_MON from above - confirm negative values cannot reach kmalloc */
+ u8 struct_v;
+ u32 epoch;
+ int ret;
+ int i;
+
+ ceph_decode_32_safe(p, end, blob_len, e_inval);
+ ceph_decode_need(p, end, blob_len, e_inval);
+
+ ret = ceph_start_decoding(p, end, 6, "monmap", &struct_v, &struct_len);
+ if (ret)
+ goto fail;
+
+ dout("%s struct_v %d\n", __func__, struct_v);
+ ceph_decode_copy_safe(p, end, &fsid, sizeof(fsid), e_inval);
+ ceph_decode_32_safe(p, end, epoch, e_inval);
+ if (struct_v >= 6) { /* v6 and later: four extra fields are skipped before num_mon */
+ u32 feat_struct_len;
+ u8 feat_struct_v;
+
+ *p += sizeof(struct ceph_timespec); /* skip last_changed */
+ *p += sizeof(struct ceph_timespec); /* skip created */
- ceph_decode_32_safe(&p, end, len, bad);
- ceph_decode_need(&p, end, len, bad);
+ ret = ceph_start_decoding(p, end, 1, "mon_feature_t",
+ &feat_struct_v, &feat_struct_len);
+ if (ret)
+ goto fail;
- dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
- p += sizeof(u16); /* skip version */
+ *p += feat_struct_len; /* skip persistent_features */
- ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
- ceph_decode_copy(&p, &fsid, sizeof(fsid));
- epoch = ceph_decode_32(&p);
+ ret = ceph_start_decoding(p, end, 1, "mon_feature_t",
+ &feat_struct_v, &feat_struct_len);
+ if (ret)
+ goto fail;
- num_mon = ceph_decode_32(&p);
- ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
+ *p += feat_struct_len; /* skip optional_features */
+ }
+ ceph_decode_32_safe(p, end, num_mon, e_inval);
+ dout("%s fsid %pU epoch %u num_mon %d\n", __func__, &fsid, epoch,
+ num_mon);
 if (num_mon > CEPH_MAX_MON)
- goto bad;
- m = kmalloc(struct_size(m, mon_inst, num_mon), GFP_NOFS);
- if (m == NULL)
- return ERR_PTR(-ENOMEM);
- m->fsid = fsid;
- m->epoch = epoch;
- m->num_mon = num_mon;
- ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
- for (i = 0; i < num_mon; i++)
- ceph_decode_addr(&m->mon_inst[i].addr);
-
- dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
- m->num_mon);
- for (i = 0; i < m->num_mon; i++)
- dout("monmap_decode mon%d is %s\n", i,
- ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
- return m;
+ goto e_inval;
-bad:
- dout("monmap_decode failed with %d\n", err);
- kfree(m);
- return ERR_PTR(err);
+ monmap = kmalloc(struct_size(monmap, mon_inst, num_mon), GFP_NOIO);
+ if (!monmap) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ monmap->fsid = fsid;
+ monmap->epoch = epoch;
+ monmap->num_mon = num_mon;
+
+ /* legacy_mon_addr map or mon_info map */
+ for (i = 0; i < num_mon; i++) {
+ struct ceph_entity_inst *inst = &monmap->mon_inst[i];
+
+ ceph_decode_skip_string(p, end, e_inval); /* skip mon name */
+ inst->name.type = CEPH_ENTITY_TYPE_MON;
+ inst->name.num = cpu_to_le64(i); /* numbered by position in the map */
+
+ if (struct_v >= 6) /* v6 and later carry a mon_info_t per entry */
+ ret = decode_mon_info(p, end, msgr2, &inst->addr);
+ else
+ ret = ceph_decode_entity_addr(p, end, &inst->addr);
+ if (ret)
+ goto fail;
+
+ dout("%s mon%d addr %s\n", __func__, i,
+ ceph_pr_addr(&inst->addr));
+ }
+
+ return monmap; /* success: the kmalloc-ed map is returned to the caller */
+
+e_inval:
+ ret = -EINVAL; /* falls through into the common cleanup */
+fail:
+ kfree(monmap);
+ return ERR_PTR(ret);
 }
/*
@@ -92,9 +161,11 @@ int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
{
int i;
- for (i = 0; i < m->num_mon; i++)
- if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
+ for (i = 0; i < m->num_mon; i++) {
+ if (ceph_addr_equal_no_type(addr, &m->mon_inst[i].addr))
return 1;
+ }
+
return 0;
}
@@ -151,7 +222,7 @@ static void pick_new_mon(struct ceph_mon_client *monc)
max--;
}
- n = prandom_u32() % max;
+ n = get_random_u32_below(max);
if (o >= 0 && n >= o)
n++;
@@ -186,10 +257,16 @@ static void __open_session(struct ceph_mon_client *monc)
&monc->monmap->mon_inst[monc->cur_mon].addr);
/*
- * send an initial keepalive to ensure our timestamp is valid
- * by the time we are in an OPENED state
+ * Queue a keepalive to ensure that in case of an early fault
+ * the messenger doesn't put us into STANDBY state and instead
+ * retries. This also ensures that our timestamp is valid by
+ * the time we finish hunting and delayed_work() checks it.
*/
ceph_con_keepalive(&monc->con);
+ if (ceph_msgr2(monc->client)) {
+ monc->pending_auth = 1;
+ return;
+ }
/* initiate authentication handshake */
ret = ceph_auth_build_hello(monc->auth,
@@ -203,12 +280,19 @@ static void reopen_session(struct ceph_mon_client *monc)
{
if (!monc->hunting)
pr_info("mon%d %s session lost, hunting for new mon\n",
- monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr.in_addr));
+ monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr));
__close_session(monc);
__open_session(monc);
}
+void ceph_monc_reopen_session(struct ceph_mon_client *monc)
+{ /* non-static wrapper: runs reopen_session() under monc->mutex */
+ mutex_lock(&monc->mutex);
+ reopen_session(monc);
+ mutex_unlock(&monc->mutex);
+}
+
static void un_backoff(struct ceph_mon_client *monc)
{
monc->hunt_mult /= 2; /* reduce by 50% */
@@ -230,7 +314,7 @@ static void __schedule_delayed(struct ceph_mon_client *monc)
delay = CEPH_MONC_PING_INTERVAL;
dout("__schedule_delayed after %lu\n", delay);
- mod_delayed_work(system_wq, &monc->delayed_work,
+ mod_delayed_work(system_percpu_wq, &monc->delayed_work,
round_jiffies_relative(delay));
}
@@ -456,7 +540,7 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
struct ceph_msg *msg)
{
struct ceph_client *client = monc->client;
- struct ceph_monmap *monmap = NULL, *old = monc->monmap;
+ struct ceph_monmap *monmap;
void *p, *end;
mutex_lock(&monc->mutex);
@@ -465,20 +549,21 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
p = msg->front.iov_base;
end = p + msg->front.iov_len;
- monmap = ceph_monmap_decode(p, end);
+ monmap = ceph_monmap_decode(&p, end, ceph_msgr2(client));
if (IS_ERR(monmap)) {
pr_err("problem decoding monmap, %d\n",
(int)PTR_ERR(monmap));
+ ceph_msg_dump(msg);
goto out;
}
- if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
+ if (ceph_check_fsid(client, &monmap->fsid) < 0) {
kfree(monmap);
goto out;
}
- client->monc.monmap = monmap;
- kfree(old);
+ kfree(monc->monmap);
+ monc->monmap = monmap;
__ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch);
client->have_fsid = true;
@@ -884,8 +969,9 @@ bad:
ceph_msg_dump(msg);
}
-int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
- struct ceph_entity_addr *client_addr)
+static __printf(2, 0)
+int do_mon_command_vargs(struct ceph_mon_client *monc, const char *fmt,
+ va_list ap)
{
struct ceph_mon_generic_request *req;
struct ceph_mon_command *h;
@@ -913,10 +999,7 @@ int ceph_monc_blacklist_add(struct ceph_mon_client *monc,
h->monhdr.session_mon_tid = 0;
h->fsid = monc->monmap->fsid;
h->num_strs = cpu_to_le32(1);
- len = sprintf(h->str, "{ \"prefix\": \"osd blacklist\", \
- \"blacklistop\": \"add\", \
- \"addr\": \"%pISpc/%u\" }",
- &client_addr->in_addr, le32_to_cpu(client_addr->nonce));
+ len = vsprintf(h->str, fmt, ap);
h->str_len = cpu_to_le32(len);
send_generic_request(monc, req);
mutex_unlock(&monc->mutex);
@@ -926,7 +1009,55 @@ out:
put_generic_request(req);
return ret;
}
-EXPORT_SYMBOL(ceph_monc_blacklist_add);
+
+static __printf(2, 3)
+int do_mon_command(struct ceph_mon_client *monc, const char *fmt, ...)
+{ /* variadic front end for do_mon_command_vargs(); returns its result unchanged */
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt); /* collect the arguments that follow @fmt */
+ ret = do_mon_command_vargs(monc, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
+int ceph_monc_blocklist_add(struct ceph_mon_client *monc,
+ struct ceph_entity_addr *client_addr)
+{ /* send an osd blocklist add command for @client_addr, then wait for the latest osdmap */
+ int ret;
+
+ ret = do_mon_command(monc,
+ "{ \"prefix\": \"osd blocklist\", \
+ \"blocklistop\": \"add\", \
+ \"addr\": \"%pISpc/%u\" }",
+ &client_addr->in_addr,
+ le32_to_cpu(client_addr->nonce));
+ if (ret == -EINVAL) { /* only this error triggers the legacy retry */
+ /*
+ * The monitor returns EINVAL on an unrecognized command.
+ * Try the legacy command -- it is exactly the same except
+ * for the name.
+ */
+ ret = do_mon_command(monc,
+ "{ \"prefix\": \"osd blacklist\", \
+ \"blacklistop\": \"add\", \
+ \"addr\": \"%pISpc/%u\" }",
+ &client_addr->in_addr,
+ le32_to_cpu(client_addr->nonce));
+ }
+ if (ret) /* result of the last command that was sent */
+ return ret;
+
+ /*
+ * Make sure we have the osdmap that includes the blocklist
+ * entry. This is needed to ensure that the OSDs pick up the
+ * new blocklist before processing any future requests from
+ * this client.
+ */
+ return ceph_wait_for_latest_osdmap(monc->client, 0);
+}
+EXPORT_SYMBOL(ceph_monc_blocklist_add);
/*
* Resend pending generic requests.
@@ -954,13 +1085,19 @@ static void delayed_work(struct work_struct *work)
struct ceph_mon_client *monc =
container_of(work, struct ceph_mon_client, delayed_work.work);
- dout("monc delayed_work\n");
mutex_lock(&monc->mutex);
+ dout("%s mon%d\n", __func__, monc->cur_mon);
+ if (monc->cur_mon < 0) {
+ goto out;
+ }
+
if (monc->hunting) {
dout("%s continuing hunt\n", __func__);
reopen_session(monc);
} else {
int is_auth = ceph_auth_is_authenticated(monc->auth);
+
+ dout("%s is_authed %d\n", __func__, is_auth);
if (ceph_con_keepalive_expired(&monc->con,
CEPH_MONC_PING_TIMEOUT)) {
dout("monc keepalive timeout\n");
@@ -985,6 +1122,8 @@ static void delayed_work(struct work_struct *work)
}
}
__schedule_delayed(monc);
+
+out:
mutex_unlock(&monc->mutex);
}
@@ -994,8 +1133,9 @@ static void delayed_work(struct work_struct *work)
*/
static int build_initial_monmap(struct ceph_mon_client *monc)
{
+ __le32 my_type = ceph_msgr2(monc->client) ?
+ CEPH_ENTITY_ADDR_TYPE_MSGR2 : CEPH_ENTITY_ADDR_TYPE_LEGACY;
struct ceph_options *opt = monc->client->options;
- struct ceph_entity_addr *mon_addr = opt->mon_addr;
int num_mon = opt->num_mon;
int i;
@@ -1004,25 +1144,28 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
GFP_KERNEL);
if (!monc->monmap)
return -ENOMEM;
+ monc->monmap->num_mon = num_mon;
+
for (i = 0; i < num_mon; i++) {
- monc->monmap->mon_inst[i].addr = mon_addr[i];
- monc->monmap->mon_inst[i].addr.nonce = 0;
- monc->monmap->mon_inst[i].name.type =
- CEPH_ENTITY_TYPE_MON;
- monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
+ struct ceph_entity_inst *inst = &monc->monmap->mon_inst[i];
+
+ memcpy(&inst->addr.in_addr, &opt->mon_addr[i].in_addr,
+ sizeof(inst->addr.in_addr));
+ inst->addr.type = my_type;
+ inst->addr.nonce = 0;
+ inst->name.type = CEPH_ENTITY_TYPE_MON;
+ inst->name.num = cpu_to_le64(i);
}
- monc->monmap->num_mon = num_mon;
return 0;
}
int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
{
- int err = 0;
+ int err;
dout("init\n");
memset(monc, 0, sizeof(*monc));
monc->client = cl;
- monc->monmap = NULL;
mutex_init(&monc->mutex);
err = build_initial_monmap(monc);
@@ -1031,8 +1174,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
/* connection */
/* authentication */
- monc->auth = ceph_auth_init(cl->options->name,
- cl->options->key);
+ monc->auth = ceph_auth_init(cl->options->name, cl->options->key,
+ cl->options->con_modes);
if (IS_ERR(monc->auth)) {
err = PTR_ERR(monc->auth);
goto out_monmap;
@@ -1097,13 +1240,15 @@ EXPORT_SYMBOL(ceph_monc_init);
void ceph_monc_stop(struct ceph_mon_client *monc)
{
dout("stop\n");
- cancel_delayed_work_sync(&monc->delayed_work);
mutex_lock(&monc->mutex);
__close_session(monc);
+ monc->hunting = false;
monc->cur_mon = -1;
mutex_unlock(&monc->mutex);
+ cancel_delayed_work_sync(&monc->delayed_work);
+
/*
* flush msgr queue before we destroy ourselves to ensure that:
* - any work that references our embedded con is finished.
@@ -1136,30 +1281,22 @@ static void finish_hunting(struct ceph_mon_client *monc)
}
}
-static void handle_auth_reply(struct ceph_mon_client *monc,
- struct ceph_msg *msg)
+static void finish_auth(struct ceph_mon_client *monc, int auth_err,
+ bool was_authed)
{
- int ret;
- int was_auth = 0;
+ dout("%s auth_err %d was_authed %d\n", __func__, auth_err, was_authed);
+ WARN_ON(auth_err > 0);
- mutex_lock(&monc->mutex);
- was_auth = ceph_auth_is_authenticated(monc->auth);
monc->pending_auth = 0;
- ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
- msg->front.iov_len,
- monc->m_auth->front.iov_base,
- monc->m_auth->front_alloc_len);
- if (ret > 0) {
- __send_prepared_auth_request(monc, ret);
- goto out;
+ if (auth_err) {
+ monc->client->auth_err = auth_err;
+ wake_up_all(&monc->client->auth_wq);
+ return;
}
- finish_hunting(monc);
-
- if (ret < 0) {
- monc->client->auth_err = ret;
- } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
- dout("authenticated, starting session\n");
+ if (!was_authed && ceph_auth_is_authenticated(monc->auth)) {
+ dout("%s authenticated, starting session global_id %llu\n",
+ __func__, monc->auth->global_id);
monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
monc->client->msgr.inst.name.num =
@@ -1169,13 +1306,29 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
__resend_generic_request(monc);
pr_info("mon%d %s session established\n", monc->cur_mon,
- ceph_pr_addr(&monc->con.peer_addr.in_addr));
+ ceph_pr_addr(&monc->con.peer_addr));
}
+}
-out:
+static void handle_auth_reply(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ bool was_authed;
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ was_authed = ceph_auth_is_authenticated(monc->auth);
+ ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+ msg->front.iov_len,
+ monc->m_auth->front.iov_base,
+ monc->m_auth->front_alloc_len);
+ if (ret > 0) {
+ __send_prepared_auth_request(monc, ret);
+ } else {
+ finish_auth(monc, ret, was_authed);
+ finish_hunting(monc);
+ }
mutex_unlock(&monc->mutex);
- if (monc->client->auth_err < 0)
- wake_up_all(&monc->client->auth_wq);
}
static int __validate_auth(struct ceph_mon_client *monc)
@@ -1204,17 +1357,96 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc)
}
EXPORT_SYMBOL(ceph_monc_validate_auth);
+static int mon_get_auth_request(struct ceph_connection *con,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{ /* .get_auth_request callback of mon_con_ops */
+ struct ceph_mon_client *monc = con->private;
+ int ret;
+
+ mutex_lock(&monc->mutex); /* the auth call is made under monc->mutex */
+ ret = ceph_auth_get_request(monc->auth, buf, *buf_len);
+ mutex_unlock(&monc->mutex);
+ if (ret < 0) /* negative values are passed through as errors */
+ return ret;
+
+ *buf_len = ret; /* non-negative result is reported back as the length */
+ *authorizer = NULL; /* this callback never returns an authorizer */
+ *authorizer_len = 0;
+ return 0;
+}
+
+static int mon_handle_auth_reply_more(struct ceph_connection *con,
+ void *reply, int reply_len,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{ /* .handle_auth_reply_more callback of mon_con_ops */
+ struct ceph_mon_client *monc = con->private;
+ int ret;
+
+ mutex_lock(&monc->mutex); /* the auth call is made under monc->mutex */
+ ret = ceph_auth_handle_reply_more(monc->auth, reply, reply_len,
+ buf, *buf_len);
+ mutex_unlock(&monc->mutex);
+ if (ret < 0) /* negative values are passed through as errors */
+ return ret;
+
+ *buf_len = ret; /* non-negative result is reported back as the length */
+ *authorizer = NULL; /* this callback never returns an authorizer */
+ *authorizer_len = 0;
+ return 0;
+}
+
+static int mon_handle_auth_done(struct ceph_connection *con,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{ /* .handle_auth_done callback of mon_con_ops */
+ struct ceph_mon_client *monc = con->private;
+ bool was_authed; /* authentication status sampled before the reply is processed */
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ WARN_ON(!monc->hunting); /* expected to run only while hunting */
+ was_authed = ceph_auth_is_authenticated(monc->auth);
+ ret = ceph_auth_handle_reply_done(monc->auth, global_id,
+ reply, reply_len,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
+ finish_auth(monc, ret, was_authed); /* called on success and on failure alike */
+ if (!ret) /* hunting is finished only on success */
+ finish_hunting(monc);
+ mutex_unlock(&monc->mutex);
+ return 0; /* always 0: a failure is only reported through finish_auth() */
+}
+
+static int mon_handle_auth_bad_method(struct ceph_connection *con,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{ /* .handle_auth_bad_method callback of mon_con_ops */
+ struct ceph_mon_client *monc = con->private;
+ bool was_authed; /* authentication status sampled before the handler runs */
+
+ mutex_lock(&monc->mutex);
+ WARN_ON(!monc->hunting); /* expected to run only while hunting */
+ was_authed = ceph_auth_is_authenticated(monc->auth);
+ ceph_auth_handle_bad_method(monc->auth, used_proto, result,
+ allowed_protos, proto_cnt,
+ allowed_modes, mode_cnt);
+ finish_auth(monc, -EACCES, was_authed); /* always recorded as -EACCES, whatever @result is */
+ mutex_unlock(&monc->mutex);
+ return 0; /* always 0: the error is only reported through finish_auth() */
+}
+
/*
* handle incoming message
*/
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+static void mon_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
struct ceph_mon_client *monc = con->private;
int type = le16_to_cpu(msg->hdr.type);
- if (!monc)
- return;
-
switch (type) {
case CEPH_MSG_AUTH_REPLY:
handle_auth_reply(monc, msg);
@@ -1285,11 +1517,11 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
return get_generic_reply(con, hdr, skip);
/*
- * Older OSDs don't set reply tid even if the orignal
+ * Older OSDs don't set reply tid even if the original
* request had a non-zero tid. Work around this weirdness
* by allocating a new message.
*/
- /* fall through */
+ fallthrough;
case CEPH_MSG_MON_MAP:
case CEPH_MSG_MDS_MAP:
case CEPH_MSG_OSD_MAP:
@@ -1342,19 +1574,23 @@ static void mon_fault(struct ceph_connection *con)
* will come from the messenger workqueue, which is drained prior to
* mon_client destruction.
*/
-static struct ceph_connection *con_get(struct ceph_connection *con)
+static struct ceph_connection *mon_get_con(struct ceph_connection *con)
{
return con;
}
-static void con_put(struct ceph_connection *con)
+static void mon_put_con(struct ceph_connection *con)
{
}
static const struct ceph_connection_operations mon_con_ops = {
- .get = con_get,
- .put = con_put,
- .dispatch = dispatch,
- .fault = mon_fault,
+ .get = mon_get_con,
+ .put = mon_put_con,
.alloc_msg = mon_alloc_msg,
+ .dispatch = mon_dispatch,
+ .fault = mon_fault,
+ .get_auth_request = mon_get_auth_request,
+ .handle_auth_reply_more = mon_handle_auth_reply_more,
+ .handle_auth_done = mon_handle_auth_done,
+ .handle_auth_bad_method = mon_handle_auth_bad_method,
};
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
index 72571535883f..e3ecb80cd182 100644
--- a/net/ceph/msgpool.c
+++ b/net/ceph/msgpool.c
@@ -14,7 +14,8 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg)
struct ceph_msgpool *pool = arg;
struct ceph_msg *msg;
- msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true);
+ msg = ceph_msg_new2(pool->type, pool->front_len, pool->max_data_items,
+ gfp_mask, true);
if (!msg) {
dout("msgpool_alloc %s failed\n", pool->name);
} else {
@@ -35,11 +36,13 @@ static void msgpool_free(void *element, void *arg)
}
int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
- int front_len, int size, bool blocking, const char *name)
+ int front_len, int max_data_items, int size,
+ const char *name)
{
dout("msgpool %s init\n", name);
pool->type = type;
pool->front_len = front_len;
+ pool->max_data_items = max_data_items;
pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool);
if (!pool->pool)
return -ENOMEM;
@@ -53,18 +56,21 @@ void ceph_msgpool_destroy(struct ceph_msgpool *pool)
mempool_destroy(pool->pool);
}
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
- int front_len)
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len,
+ int max_data_items)
{
struct ceph_msg *msg;
- if (front_len > pool->front_len) {
- dout("msgpool_get %s need front %d, pool size is %d\n",
- pool->name, front_len, pool->front_len);
- WARN_ON(1);
+ if (front_len > pool->front_len ||
+ max_data_items > pool->max_data_items) {
+ pr_warn_ratelimited("%s need %d/%d, pool %s has %d/%d\n",
+ __func__, front_len, max_data_items, pool->name,
+ pool->front_len, pool->max_data_items);
+ WARN_ON_ONCE(1);
/* try to alloc a fresh message */
- return ceph_msg_new(pool->type, front_len, GFP_NOFS, false);
+ return ceph_msg_new2(pool->type, front_len, max_data_items,
+ GFP_NOFS, false);
}
msg = mempool_alloc(pool->pool, GFP_NOFS);
@@ -80,6 +86,9 @@ void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
msg->front.iov_len = pool->front_len;
msg->hdr.front_len = cpu_to_le32(pool->front_len);
+ msg->data_length = 0;
+ msg->num_data_items = 0;
+
kref_init(&msg->kref); /* retake single ref */
mempool_free(msg, pool->pool);
}
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 60934bd8796c..6664ea73ccf8 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -126,6 +126,9 @@ static void ceph_osd_data_init(struct ceph_osd_data *osd_data)
osd_data->type = CEPH_OSD_DATA_TYPE_NONE;
}
+/*
+ * Consumes @pages if @own_pages is true.
+ */
static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
struct page **pages, u64 length, u32 alignment,
bool pages_from_pool, bool own_pages)
@@ -138,6 +141,9 @@ static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
osd_data->own_pages = own_pages;
}
+/*
+ * Consumes a ref on @pagelist.
+ */
static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
struct ceph_pagelist *pagelist)
{
@@ -165,13 +171,12 @@ static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data,
osd_data->num_bvecs = num_bvecs;
}
-#define osd_req_op_data(oreq, whch, typ, fld) \
-({ \
- struct ceph_osd_request *__oreq = (oreq); \
- unsigned int __whch = (whch); \
- BUG_ON(__whch >= __oreq->r_num_ops); \
- &__oreq->r_ops[__whch].typ.fld; \
-})
+static void ceph_osd_iter_init(struct ceph_osd_data *osd_data,
+ struct iov_iter *iter)
+{ /* mark @osd_data as CEPH_OSD_DATA_TYPE_ITER and store the iterator in it */
+ osd_data->type = CEPH_OSD_DATA_TYPE_ITER;
+ osd_data->iter = *iter; /* struct copy: @iter itself is not retained */
+}
static struct ceph_osd_data *
osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
@@ -215,16 +220,6 @@ void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req,
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages);
-void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req,
- unsigned int which, struct ceph_pagelist *pagelist)
-{
- struct ceph_osd_data *osd_data;
-
- osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
- ceph_osd_data_pagelist_init(osd_data, pagelist);
-}
-EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist);
-
#ifdef CONFIG_BLOCK
void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
unsigned int which,
@@ -266,28 +261,31 @@ void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req,
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos);
-static void osd_req_op_cls_request_info_pagelist(
- struct ceph_osd_request *osd_req,
- unsigned int which, struct ceph_pagelist *pagelist)
+/**
+ * osd_req_op_extent_osd_iter - Set up an operation with an iterator buffer
+ * @osd_req: The request to set up
+ * @which: Index of the operation in which to set the iter
+ * @iter: The buffer iterator
+ */
+void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req,
+ unsigned int which, struct iov_iter *iter)
{
struct ceph_osd_data *osd_data;
- osd_data = osd_req_op_data(osd_req, which, cls, request_info);
- ceph_osd_data_pagelist_init(osd_data, pagelist);
+ osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
+ ceph_osd_iter_init(osd_data, iter);
}
+EXPORT_SYMBOL(osd_req_op_extent_osd_iter);
-void osd_req_op_cls_request_data_pagelist(
+static void osd_req_op_cls_request_info_pagelist(
struct ceph_osd_request *osd_req,
unsigned int which, struct ceph_pagelist *pagelist)
{
struct ceph_osd_data *osd_data;
- osd_data = osd_req_op_data(osd_req, which, cls, request_data);
+ osd_data = osd_req_op_data(osd_req, which, cls, request_info);
ceph_osd_data_pagelist_init(osd_data, pagelist);
- osd_req->r_ops[which].cls.indata_len += pagelist->length;
- osd_req->r_ops[which].indata_len += pagelist->length;
}
-EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
unsigned int which, struct page **pages, u64 length,
@@ -348,6 +346,8 @@ static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
#endif /* CONFIG_BLOCK */
case CEPH_OSD_DATA_TYPE_BVECS:
return osd_data->bvec_pos.iter.bi_size;
+ case CEPH_OSD_DATA_TYPE_ITER:
+ return iov_iter_count(&osd_data->iter);
default:
WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
return 0;
@@ -362,6 +362,8 @@ static void ceph_osd_data_release(struct ceph_osd_data *osd_data)
num_pages = calc_pages_for((u64)osd_data->alignment,
(u64)osd_data->length);
ceph_release_page_vector(osd_data->pages, num_pages);
+ } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
+ ceph_pagelist_release(osd_data->pagelist);
}
ceph_osd_data_init(osd_data);
}
@@ -376,8 +378,10 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
switch (op->op) {
case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_SPARSE_READ:
case CEPH_OSD_OP_WRITE:
case CEPH_OSD_OP_WRITEFULL:
+ kfree(op->extent.sparse_ext);
ceph_osd_data_release(&op->extent.osd_data);
break;
case CEPH_OSD_OP_CALL:
@@ -402,6 +406,9 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
case CEPH_OSD_OP_LIST_WATCHERS:
ceph_osd_data_release(&op->list_watchers.response_data);
break;
+ case CEPH_OSD_OP_COPY_FROM2:
+ ceph_osd_data_release(&op->copy_from.osd_data);
+ break;
default:
break;
}
@@ -442,8 +449,10 @@ static void target_copy(struct ceph_osd_request_target *dest,
dest->size = src->size;
dest->min_size = src->min_size;
dest->sort_bitwise = src->sort_bitwise;
+ dest->recovery_deletes = src->recovery_deletes;
dest->flags = src->flags;
+ dest->used_replica = src->used_replica;
dest->paused = src->paused;
dest->epoch = src->epoch;
@@ -467,7 +476,7 @@ static void request_release_checks(struct ceph_osd_request *req)
{
WARN_ON(!RB_EMPTY_NODE(&req->r_node));
WARN_ON(!RB_EMPTY_NODE(&req->r_mc_node));
- WARN_ON(!list_empty(&req->r_unsafe_item));
+ WARN_ON(!list_empty(&req->r_private_item));
WARN_ON(req->r_osd);
}
@@ -520,55 +529,18 @@ EXPORT_SYMBOL(ceph_osdc_put_request);
static void request_init(struct ceph_osd_request *req)
{
- /* req only, each op is zeroed in _osd_req_op_init() */
+ /* req only, each op is zeroed in osd_req_op_init() */
memset(req, 0, sizeof(*req));
kref_init(&req->r_kref);
init_completion(&req->r_completion);
RB_CLEAR_NODE(&req->r_node);
RB_CLEAR_NODE(&req->r_mc_node);
- INIT_LIST_HEAD(&req->r_unsafe_item);
+ INIT_LIST_HEAD(&req->r_private_item);
target_init(&req->r_t);
}
-/*
- * This is ugly, but it allows us to reuse linger registration and ping
- * requests, keeping the structure of the code around send_linger{_ping}()
- * reasonable. Setting up a min_nr=2 mempool for each linger request
- * and dealing with copying ops (this blasts req only, watch op remains
- * intact) isn't any better.
- */
-static void request_reinit(struct ceph_osd_request *req)
-{
- struct ceph_osd_client *osdc = req->r_osdc;
- bool mempool = req->r_mempool;
- unsigned int num_ops = req->r_num_ops;
- u64 snapid = req->r_snapid;
- struct ceph_snap_context *snapc = req->r_snapc;
- bool linger = req->r_linger;
- struct ceph_msg *request_msg = req->r_request;
- struct ceph_msg *reply_msg = req->r_reply;
-
- dout("%s req %p\n", __func__, req);
- WARN_ON(kref_read(&req->r_kref) != 1);
- request_release_checks(req);
-
- WARN_ON(kref_read(&request_msg->kref) != 1);
- WARN_ON(kref_read(&reply_msg->kref) != 1);
- target_destroy(&req->r_t);
-
- request_init(req);
- req->r_osdc = osdc;
- req->r_mempool = mempool;
- req->r_num_ops = num_ops;
- req->r_snapid = snapid;
- req->r_snapc = snapc;
- req->r_linger = linger;
- req->r_request = request_msg;
- req->r_reply = reply_msg;
-}
-
struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
struct ceph_snap_context *snapc,
unsigned int num_ops,
@@ -606,12 +578,15 @@ static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc)
return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
}
-int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
+static int __ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp,
+ int num_request_data_items,
+ int num_reply_data_items)
{
struct ceph_osd_client *osdc = req->r_osdc;
struct ceph_msg *msg;
int msg_size;
+ WARN_ON(req->r_request || req->r_reply);
WARN_ON(ceph_oid_empty(&req->r_base_oid));
WARN_ON(ceph_oloc_empty(&req->r_base_oloc));
@@ -633,9 +608,11 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
msg_size += 4 + 8; /* retry_attempt, features */
if (req->r_mempool)
- msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
+ msg = ceph_msgpool_get(&osdc->msgpool_op, msg_size,
+ num_request_data_items);
else
- msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true);
+ msg = ceph_msg_new2(CEPH_MSG_OSD_OP, msg_size,
+ num_request_data_items, gfp, true);
if (!msg)
return -ENOMEM;
@@ -648,9 +625,11 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
msg_size += req->r_num_ops * sizeof(struct ceph_osd_op);
if (req->r_mempool)
- msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+ msg = ceph_msgpool_get(&osdc->msgpool_op_reply, msg_size,
+ num_reply_data_items);
else
- msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true);
+ msg = ceph_msg_new2(CEPH_MSG_OSD_OPREPLY, msg_size,
+ num_reply_data_items, gfp, true);
if (!msg)
return -ENOMEM;
@@ -658,7 +637,6 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
return 0;
}
-EXPORT_SYMBOL(ceph_osdc_alloc_messages);
static bool osd_req_opcode_valid(u16 opcode)
{
@@ -671,13 +649,73 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE)
}
}
+static void get_num_data_items(struct ceph_osd_request *req,
+ int *num_request_data_items,
+ int *num_reply_data_items)
+{
+ struct ceph_osd_req_op *op;
+
+ *num_request_data_items = 0;
+ *num_reply_data_items = 0;
+
+ for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) {
+ switch (op->op) {
+ /* request */
+ case CEPH_OSD_OP_WRITE:
+ case CEPH_OSD_OP_WRITEFULL:
+ case CEPH_OSD_OP_SETXATTR:
+ case CEPH_OSD_OP_CMPXATTR:
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ case CEPH_OSD_OP_COPY_FROM2:
+ *num_request_data_items += 1;
+ break;
+
+ /* reply */
+ case CEPH_OSD_OP_STAT:
+ case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_SPARSE_READ:
+ case CEPH_OSD_OP_LIST_WATCHERS:
+ *num_reply_data_items += 1;
+ break;
+
+ /* both */
+ case CEPH_OSD_OP_NOTIFY:
+ *num_request_data_items += 1;
+ *num_reply_data_items += 1;
+ break;
+ case CEPH_OSD_OP_CALL:
+ *num_request_data_items += 2;
+ *num_reply_data_items += 1;
+ break;
+
+ default:
+ WARN_ON(!osd_req_opcode_valid(op->op));
+ break;
+ }
+ }
+}
+
+/*
+ * oid, oloc and OSD op opcode(s) must be filled in before this function
+ * is called.
+ */
+int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
+{
+ int num_request_data_items, num_reply_data_items;
+
+ get_num_data_items(req, &num_request_data_items, &num_reply_data_items);
+ return __ceph_osdc_alloc_messages(req, gfp, num_request_data_items,
+ num_reply_data_items);
+}
+EXPORT_SYMBOL(ceph_osdc_alloc_messages);
+
/*
* This is an osd op init function for opcodes that have no data or
* other information associated with them. It also serves as a
* common init routine for all the other init functions, below.
*/
-static struct ceph_osd_req_op *
-_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
+struct ceph_osd_req_op *
+osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
u16 opcode, u32 flags)
{
struct ceph_osd_req_op *op;
@@ -692,12 +730,6 @@ _osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
return op;
}
-
-void osd_req_op_init(struct ceph_osd_request *osd_req,
- unsigned int which, u16 opcode, u32 flags)
-{
- (void)_osd_req_op_init(osd_req, which, opcode, flags);
-}
EXPORT_SYMBOL(osd_req_op_init);
void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
@@ -705,13 +737,13 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
u64 offset, u64 length,
u64 truncate_size, u32 truncate_seq)
{
- struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
- opcode, 0);
+ struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which,
+ opcode, 0);
size_t payload_len = 0;
BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
opcode != CEPH_OSD_OP_WRITEFULL && opcode != CEPH_OSD_OP_ZERO &&
- opcode != CEPH_OSD_OP_TRUNCATE);
+ opcode != CEPH_OSD_OP_TRUNCATE && opcode != CEPH_OSD_OP_SPARSE_READ);
op->extent.offset = offset;
op->extent.length = length;
@@ -752,7 +784,7 @@ void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
BUG_ON(which + 1 >= osd_req->r_num_ops);
prev_op = &osd_req->r_ops[which];
- op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags);
+ op = osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags);
/* dup previous one */
op->indata_len = prev_op->indata_len;
op->outdata_len = prev_op->outdata_len;
@@ -767,40 +799,45 @@ void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
EXPORT_SYMBOL(osd_req_op_extent_dup_last);
int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
- u16 opcode, const char *class, const char *method)
+ const char *class, const char *method)
{
- struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
- opcode, 0);
+ struct ceph_osd_req_op *op;
struct ceph_pagelist *pagelist;
size_t payload_len = 0;
size_t size;
+ int ret;
- BUG_ON(opcode != CEPH_OSD_OP_CALL);
+ op = osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0);
- pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
+ pagelist = ceph_pagelist_alloc(GFP_NOFS);
if (!pagelist)
return -ENOMEM;
- ceph_pagelist_init(pagelist);
-
op->cls.class_name = class;
size = strlen(class);
BUG_ON(size > (size_t) U8_MAX);
op->cls.class_len = size;
- ceph_pagelist_append(pagelist, class, size);
+ ret = ceph_pagelist_append(pagelist, class, size);
+ if (ret)
+ goto err_pagelist_free;
payload_len += size;
op->cls.method_name = method;
size = strlen(method);
BUG_ON(size > (size_t) U8_MAX);
op->cls.method_len = size;
- ceph_pagelist_append(pagelist, method, size);
+ ret = ceph_pagelist_append(pagelist, method, size);
+ if (ret)
+ goto err_pagelist_free;
payload_len += size;
osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
-
op->indata_len = payload_len;
return 0;
+
+err_pagelist_free:
+ ceph_pagelist_release(pagelist);
+ return ret;
}
EXPORT_SYMBOL(osd_req_op_cls_init);
@@ -808,25 +845,28 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
u16 opcode, const char *name, const void *value,
size_t size, u8 cmp_op, u8 cmp_mode)
{
- struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
- opcode, 0);
+ struct ceph_osd_req_op *op = osd_req_op_init(osd_req, which,
+ opcode, 0);
struct ceph_pagelist *pagelist;
size_t payload_len;
+ int ret;
BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
- pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+ pagelist = ceph_pagelist_alloc(GFP_NOFS);
if (!pagelist)
return -ENOMEM;
- ceph_pagelist_init(pagelist);
-
payload_len = strlen(name);
op->xattr.name_len = payload_len;
- ceph_pagelist_append(pagelist, name, payload_len);
+ ret = ceph_pagelist_append(pagelist, name, payload_len);
+ if (ret)
+ goto err_pagelist_free;
op->xattr.value_len = size;
- ceph_pagelist_append(pagelist, value, size);
+ ret = ceph_pagelist_append(pagelist, value, size);
+ if (ret)
+ goto err_pagelist_free;
payload_len += size;
op->xattr.cmp_op = cmp_op;
@@ -835,6 +875,10 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
op->indata_len = payload_len;
return 0;
+
+err_pagelist_free:
+ ceph_pagelist_release(pagelist);
+ return ret;
}
EXPORT_SYMBOL(osd_req_op_xattr_init);
@@ -842,27 +886,47 @@ EXPORT_SYMBOL(osd_req_op_xattr_init);
* @watch_opcode: CEPH_OSD_WATCH_OP_*
*/
static void osd_req_op_watch_init(struct ceph_osd_request *req, int which,
- u64 cookie, u8 watch_opcode)
+ u8 watch_opcode, u64 cookie, u32 gen)
{
struct ceph_osd_req_op *op;
- op = _osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
+ op = osd_req_op_init(req, which, CEPH_OSD_OP_WATCH, 0);
op->watch.cookie = cookie;
op->watch.op = watch_opcode;
- op->watch.gen = 0;
+ op->watch.gen = gen;
}
+/*
+ * prot_ver, timeout and notify payload (may be empty) should already be
+ * encoded in @request_pl
+ */
+static void osd_req_op_notify_init(struct ceph_osd_request *req, int which,
+ u64 cookie, struct ceph_pagelist *request_pl)
+{
+ struct ceph_osd_req_op *op;
+
+ op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
+ op->notify.cookie = cookie;
+
+ ceph_osd_data_pagelist_init(&op->notify.request_data, request_pl);
+ op->indata_len = request_pl->length;
+}
+
+/*
+ * @flags: CEPH_OSD_OP_ALLOC_HINT_FLAG_*
+ */
void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
unsigned int which,
u64 expected_object_size,
- u64 expected_write_size)
+ u64 expected_write_size,
+ u32 flags)
{
- struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
- CEPH_OSD_OP_SETALLOCHINT,
- 0);
+ struct ceph_osd_req_op *op;
+ op = osd_req_op_init(osd_req, which, CEPH_OSD_OP_SETALLOCHINT, 0);
op->alloc_hint.expected_object_size = expected_object_size;
op->alloc_hint.expected_write_size = expected_write_size;
+ op->alloc_hint.flags = flags;
/*
* CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
@@ -882,7 +946,7 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
BUG_ON(length > (u64) SIZE_MAX);
if (length)
ceph_msg_data_add_pages(msg, osd_data->pages,
- length, osd_data->alignment);
+ length, osd_data->alignment, false);
} else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
BUG_ON(!length);
ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
@@ -892,6 +956,8 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
#endif
} else if (osd_data->type == CEPH_OSD_DATA_TYPE_BVECS) {
ceph_msg_data_add_bvecs(msg, &osd_data->bvec_pos);
+ } else if (osd_data->type == CEPH_OSD_DATA_TYPE_ITER) {
+ ceph_msg_data_add_iter(msg, &osd_data->iter);
} else {
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
}
@@ -900,16 +966,11 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
static u32 osd_req_encode_op(struct ceph_osd_op *dst,
const struct ceph_osd_req_op *src)
{
- if (WARN_ON(!osd_req_opcode_valid(src->op))) {
- pr_err("unrecognized osd opcode %d\n", src->op);
-
- return 0;
- }
-
switch (src->op) {
case CEPH_OSD_OP_STAT:
break;
case CEPH_OSD_OP_READ:
+ case CEPH_OSD_OP_SPARSE_READ:
case CEPH_OSD_OP_WRITE:
case CEPH_OSD_OP_WRITEFULL:
case CEPH_OSD_OP_ZERO:
@@ -944,6 +1005,7 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
cpu_to_le64(src->alloc_hint.expected_object_size);
dst->alloc_hint.expected_write_size =
cpu_to_le64(src->alloc_hint.expected_write_size);
+ dst->alloc_hint.flags = cpu_to_le32(src->alloc_hint.flags);
break;
case CEPH_OSD_OP_SETXATTR:
case CEPH_OSD_OP_CMPXATTR:
@@ -955,6 +1017,18 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
case CEPH_OSD_OP_CREATE:
case CEPH_OSD_OP_DELETE:
break;
+ case CEPH_OSD_OP_COPY_FROM2:
+ dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid);
+ dst->copy_from.src_version =
+ cpu_to_le64(src->copy_from.src_version);
+ dst->copy_from.flags = src->copy_from.flags;
+ dst->copy_from.src_fadvise_flags =
+ cpu_to_le32(src->copy_from.src_fadvise_flags);
+ break;
+ case CEPH_OSD_OP_ASSERT_VER:
+ dst->assert_ver.unused = cpu_to_le64(0);
+ dst->assert_ver.ver = cpu_to_le64(src->assert_ver.ver);
+ break;
default:
pr_err("unsupported osd opcode %s\n",
ceph_osd_op_name(src->op));
@@ -997,7 +1071,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE &&
opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE &&
- opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE);
+ opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE &&
+ opcode != CEPH_OSD_OP_SPARSE_READ);
req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
GFP_NOFS);
@@ -1029,16 +1104,39 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
truncate_size, truncate_seq);
}
- req->r_flags = flags;
req->r_base_oloc.pool = layout->pool_id;
req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
+ req->r_flags = flags | osdc->client->options->read_from_replica;
req->r_snapid = vino.snap;
if (flags & CEPH_OSD_FLAG_WRITE)
req->r_data_offset = off;
- r = ceph_osdc_alloc_messages(req, GFP_NOFS);
+ if (num_ops > 1) {
+ int num_req_ops, num_rep_ops;
+
+ /*
+ * If this is a multi-op write request, assume that each op needs a
+ * request data item. If it's a multi-op read, assume that each op
+ * needs a reply data item. Anything else is rejected with -EINVAL.
+ */
+ if (flags & CEPH_OSD_FLAG_WRITE) {
+ num_req_ops = num_ops;
+ num_rep_ops = 0;
+ } else if (flags & CEPH_OSD_FLAG_READ) {
+ num_req_ops = 0;
+ num_rep_ops = num_ops;
+ } else {
+ r = -EINVAL;
+ goto fail;
+ }
+
+ r = __ceph_osdc_alloc_messages(req, GFP_NOFS, num_req_ops,
+ num_rep_ops);
+ } else {
+ r = ceph_osdc_alloc_messages(req, GFP_NOFS);
+ }
if (r)
goto fail;
@@ -1050,6 +1148,20 @@ fail:
}
EXPORT_SYMBOL(ceph_osdc_new_request);
+int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt)
+{
+ WARN_ON(op->op != CEPH_OSD_OP_SPARSE_READ);
+
+ op->extent.sparse_ext_cnt = cnt;
+ op->extent.sparse_ext = kmalloc_array(cnt,
+ sizeof(*op->extent.sparse_ext),
+ GFP_NOFS);
+ if (!op->extent.sparse_ext)
+ return -ENOMEM;
+ return 0;
+}
+EXPORT_SYMBOL(__ceph_alloc_sparse_ext_map);
+
/*
* We keep osd requests in an rbtree, sorted by ->r_tid.
*/
@@ -1107,6 +1219,7 @@ static void osd_init(struct ceph_osd *osd)
{
refcount_set(&osd->o_ref, 1);
RB_CLEAR_NODE(&osd->o_node);
+ spin_lock_init(&osd->o_requests_lock);
osd->o_requests = RB_ROOT;
osd->o_linger_requests = RB_ROOT;
osd->o_backoff_mappings = RB_ROOT;
@@ -1117,6 +1230,13 @@ static void osd_init(struct ceph_osd *osd)
mutex_init(&osd->lock);
}
+static void ceph_init_sparse_read(struct ceph_sparse_read *sr)
+{
+ kfree(sr->sr_extent);
+ memset(sr, '\0', sizeof(*sr));
+ sr->sr_state = CEPH_SPARSE_READ_HDR;
+}
+
static void osd_cleanup(struct ceph_osd *osd)
{
WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
@@ -1127,6 +1247,8 @@ static void osd_cleanup(struct ceph_osd *osd)
WARN_ON(!list_empty(&osd->o_osd_lru));
WARN_ON(!list_empty(&osd->o_keepalive_item));
+ ceph_init_sparse_read(&osd->o_sparse_read);
+
if (osd->o_auth.authorizer) {
WARN_ON(osd_homeless(osd));
ceph_auth_destroy_authorizer(osd->o_auth.authorizer);
@@ -1146,6 +1268,9 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
osd_init(osd);
osd->o_osdc = osdc;
osd->o_osd = onum;
+ osd->o_sparse_op_idx = -1;
+
+ ceph_init_sparse_read(&osd->o_sparse_read);
ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
@@ -1336,7 +1461,9 @@ static void link_request(struct ceph_osd *osd, struct ceph_osd_request *req)
atomic_inc(&osd->o_osdc->num_homeless);
get_osd(osd);
+ spin_lock(&osd->o_requests_lock);
insert_request(&osd->o_requests, req);
+ spin_unlock(&osd->o_requests_lock);
req->r_osd = osd;
}
@@ -1348,7 +1475,9 @@ static void unlink_request(struct ceph_osd *osd, struct ceph_osd_request *req)
req, req->r_tid);
req->r_osd = NULL;
+ spin_lock(&osd->o_requests_lock);
erase_request(&osd->o_requests, req);
+ spin_unlock(&osd->o_requests_lock);
put_osd(osd);
if (!osd_homeless(osd))
@@ -1407,6 +1536,45 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc,
(osdc->osdmap->epoch < osdc->epoch_barrier);
}
+static int pick_random_replica(const struct ceph_osds *acting)
+{
+ int i = get_random_u32_below(acting->size);
+
+ dout("%s picked osd%d, primary osd%d\n", __func__,
+ acting->osds[i], acting->primary);
+ return i;
+}
+
+/*
+ * Picks the closest replica based on client's location given by
+ * crush_location option. Prefers the primary if the locality is
+ * the same.
+ */
+static int pick_closest_replica(struct ceph_osd_client *osdc,
+ const struct ceph_osds *acting)
+{
+ struct ceph_options *opt = osdc->client->options;
+ int best_i, best_locality;
+ int i = 0, locality;
+
+ do {
+ locality = ceph_get_crush_locality(osdc->osdmap,
+ acting->osds[i],
+ &opt->crush_locs);
+ if (i == 0 ||
+ (locality >= 0 && best_locality < 0) ||
+ (locality >= 0 && best_locality >= 0 &&
+ locality < best_locality)) {
+ best_i = i;
+ best_locality = locality;
+ }
+ } while (++i < acting->size);
+
+ dout("%s picked osd%d with locality %d, primary osd%d\n", __func__,
+ acting->osds[best_i], best_locality, acting->primary);
+ return best_i;
+}
+
enum calc_target_result {
CALC_TARGET_NO_ACTION = 0,
CALC_TARGET_NEED_RESEND,
@@ -1415,15 +1583,16 @@ enum calc_target_result {
static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
struct ceph_osd_request_target *t,
- struct ceph_connection *con,
bool any_change)
{
struct ceph_pg_pool_info *pi;
struct ceph_pg pgid, last_pgid;
struct ceph_osds up, acting;
+ bool is_read = t->flags & CEPH_OSD_FLAG_READ;
+ bool is_write = t->flags & CEPH_OSD_FLAG_WRITE;
bool force_resend = false;
bool unpaused = false;
- bool legacy_change;
+ bool legacy_change = false;
bool split = false;
bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
bool recovery_deletes = ceph_osdmap_flag(osdc,
@@ -1451,9 +1620,9 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
ceph_oid_copy(&t->target_oid, &t->base_oid);
ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
- if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
+ if (is_read && pi->read_tier >= 0)
t->target_oloc.pool = pi->read_tier;
- if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
+ if (is_write && pi->write_tier >= 0)
t->target_oloc.pool = pi->write_tier;
pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool);
@@ -1492,7 +1661,8 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
unpaused = true;
}
legacy_change = ceph_pg_compare(&t->pgid, &pgid) ||
- ceph_osds_changed(&t->acting, &acting, any_change);
+ ceph_osds_changed(&t->acting, &acting,
+ t->used_replica || any_change);
if (t->pg_num)
split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num);
@@ -1508,18 +1678,34 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
t->sort_bitwise = sort_bitwise;
t->recovery_deletes = recovery_deletes;
- t->osd = acting.primary;
+ if ((t->flags & (CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS)) &&
+ !is_write && pi->type == CEPH_POOL_TYPE_REP &&
+ acting.size > 1) {
+ int pos;
+
+ WARN_ON(!is_read || acting.osds[0] != acting.primary);
+ if (t->flags & CEPH_OSD_FLAG_BALANCE_READS) {
+ pos = pick_random_replica(&acting);
+ } else {
+ pos = pick_closest_replica(osdc, &acting);
+ }
+ t->osd = acting.osds[pos];
+ t->used_replica = pos > 0;
+ } else {
+ t->osd = acting.primary;
+ t->used_replica = false;
+ }
}
- if (unpaused || legacy_change || force_resend ||
- (split && con && CEPH_HAVE_FEATURE(con->peer_features,
- RESEND_ON_SPLIT)))
+ if (unpaused || legacy_change || force_resend || split)
ct_res = CALC_TARGET_NEED_RESEND;
else
ct_res = CALC_TARGET_NO_ACTION;
out:
- dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
+ dout("%s t %p -> %d%d%d%d ct_res %d osd%d\n", __func__, t, unpaused,
+ legacy_change, force_resend, split, ct_res, t->osd);
return ct_res;
}
@@ -1845,48 +2031,56 @@ static bool should_plug_request(struct ceph_osd_request *req)
return true;
}
-static void setup_request_data(struct ceph_osd_request *req,
- struct ceph_msg *msg)
+/*
+ * Keep get_num_data_items() in sync with this function.
+ */
+static void setup_request_data(struct ceph_osd_request *req)
{
- u32 data_len = 0;
- int i;
+ struct ceph_msg *request_msg = req->r_request;
+ struct ceph_msg *reply_msg = req->r_reply;
+ struct ceph_osd_req_op *op;
- if (!list_empty(&msg->data))
+ if (req->r_request->num_data_items || req->r_reply->num_data_items)
return;
- WARN_ON(msg->data_length);
- for (i = 0; i < req->r_num_ops; i++) {
- struct ceph_osd_req_op *op = &req->r_ops[i];
-
+ WARN_ON(request_msg->data_length || reply_msg->data_length);
+ for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) {
switch (op->op) {
/* request */
case CEPH_OSD_OP_WRITE:
case CEPH_OSD_OP_WRITEFULL:
WARN_ON(op->indata_len != op->extent.length);
- ceph_osdc_msg_data_add(msg, &op->extent.osd_data);
+ ceph_osdc_msg_data_add(request_msg,
+ &op->extent.osd_data);
break;
case CEPH_OSD_OP_SETXATTR:
case CEPH_OSD_OP_CMPXATTR:
WARN_ON(op->indata_len != op->xattr.name_len +
op->xattr.value_len);
- ceph_osdc_msg_data_add(msg, &op->xattr.osd_data);
+ ceph_osdc_msg_data_add(request_msg,
+ &op->xattr.osd_data);
break;
case CEPH_OSD_OP_NOTIFY_ACK:
- ceph_osdc_msg_data_add(msg,
+ ceph_osdc_msg_data_add(request_msg,
&op->notify_ack.request_data);
break;
+ case CEPH_OSD_OP_COPY_FROM2:
+ ceph_osdc_msg_data_add(request_msg,
+ &op->copy_from.osd_data);
+ break;
/* reply */
case CEPH_OSD_OP_STAT:
- ceph_osdc_msg_data_add(req->r_reply,
+ ceph_osdc_msg_data_add(reply_msg,
&op->raw_data_in);
break;
case CEPH_OSD_OP_READ:
- ceph_osdc_msg_data_add(req->r_reply,
+ case CEPH_OSD_OP_SPARSE_READ:
+ ceph_osdc_msg_data_add(reply_msg,
&op->extent.osd_data);
break;
case CEPH_OSD_OP_LIST_WATCHERS:
- ceph_osdc_msg_data_add(req->r_reply,
+ ceph_osdc_msg_data_add(reply_msg,
&op->list_watchers.response_data);
break;
@@ -1895,25 +2089,23 @@ static void setup_request_data(struct ceph_osd_request *req,
WARN_ON(op->indata_len != op->cls.class_len +
op->cls.method_len +
op->cls.indata_len);
- ceph_osdc_msg_data_add(msg, &op->cls.request_info);
+ ceph_osdc_msg_data_add(request_msg,
+ &op->cls.request_info);
/* optional, can be NONE */
- ceph_osdc_msg_data_add(msg, &op->cls.request_data);
+ ceph_osdc_msg_data_add(request_msg,
+ &op->cls.request_data);
/* optional, can be NONE */
- ceph_osdc_msg_data_add(req->r_reply,
+ ceph_osdc_msg_data_add(reply_msg,
&op->cls.response_data);
break;
case CEPH_OSD_OP_NOTIFY:
- ceph_osdc_msg_data_add(msg,
+ ceph_osdc_msg_data_add(request_msg,
&op->notify.request_data);
- ceph_osdc_msg_data_add(req->r_reply,
+ ceph_osdc_msg_data_add(reply_msg,
&op->notify.response_data);
break;
}
-
- data_len += op->indata_len;
}
-
- WARN_ON(data_len != msg->data_length);
}
static void encode_pgid(void **p, const struct ceph_pg *pgid)
@@ -1961,7 +2153,7 @@ static void encode_request_partial(struct ceph_osd_request *req,
req->r_data_offset || req->r_snapc);
}
- setup_request_data(req, msg);
+ setup_request_data(req);
encode_spgid(&p, &req->r_t.spgid); /* actual spg */
ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */
@@ -2195,7 +2387,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
again:
- ct_res = calc_target(osdc, &req->r_t, NULL, false);
+ ct_res = calc_target(osdc, &req->r_t, false);
if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
goto promote;
@@ -2229,10 +2421,14 @@ again:
(ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
pool_full(osdc, req->r_t.base_oloc.pool))) {
dout("req %p full/pool_full\n", req);
- if (osdc->abort_on_full) {
+ if (ceph_test_opt(osdc->client, ABORT_ON_FULL)) {
err = -ENOSPC;
} else {
- pr_warn_ratelimited("FULL or reached pool quota\n");
+ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL))
+ pr_warn_ratelimited("cluster is full (osdmap FULL)\n");
+ else
+ pr_warn_ratelimited("pool %lld is full or reached quota\n",
+ req->r_t.base_oloc.pool);
req->r_t.paused = true;
maybe_request_map(osdc);
}
@@ -2280,6 +2476,7 @@ static void account_request(struct ceph_osd_request *req)
atomic_inc(&req->r_osdc->num_requests);
req->r_start_stamp = jiffies;
+ req->r_start_latency = ktime_get();
}
static void submit_request(struct ceph_osd_request *req, bool wrlocked)
@@ -2296,8 +2493,12 @@ static void finish_request(struct ceph_osd_request *req)
WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
- if (req->r_osd)
+ req->r_end_latency = ktime_get();
+
+ if (req->r_osd) {
+ ceph_init_sparse_read(&req->r_osd->o_sparse_read);
unlink_request(req->r_osd, req);
+ }
atomic_dec(&osdc->num_requests);
/*
@@ -2312,7 +2513,7 @@ static void finish_request(struct ceph_osd_request *req)
static void __complete_request(struct ceph_osd_request *req)
{
- dout("%s req %p tid %llu cb %pf result %d\n", __func__, req,
+ dout("%s req %p tid %llu cb %ps result %d\n", __func__, req,
req->r_tid, req->r_callback, req->r_result);
if (req->r_callback)
@@ -2399,6 +2600,14 @@ void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err)
}
EXPORT_SYMBOL(ceph_osdc_abort_requests);
+void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc)
+{
+ down_write(&osdc->lock);
+ osdc->abort_err = 0;
+ up_write(&osdc->lock);
+}
+EXPORT_SYMBOL(ceph_osdc_clear_abort_err);
+
static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
{
if (likely(eb > osdc->epoch_barrier)) {
@@ -2459,7 +2668,7 @@ static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc)
{
bool victims = false;
- if (osdc->abort_on_full &&
+ if (ceph_test_opt(osdc->client, ABORT_ON_FULL) &&
(ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc)))
for_each_request(osdc, abort_on_full_fn, &victims);
}
@@ -2563,10 +2772,13 @@ static void linger_release(struct kref *kref)
WARN_ON(!list_empty(&lreq->pending_lworks));
WARN_ON(lreq->osd);
- if (lreq->reg_req)
- ceph_osdc_put_request(lreq->reg_req);
- if (lreq->ping_req)
- ceph_osdc_put_request(lreq->ping_req);
+ if (lreq->request_pl)
+ ceph_pagelist_release(lreq->request_pl);
+ if (lreq->notify_id_pages)
+ ceph_release_page_vector(lreq->notify_id_pages, 1);
+
+ ceph_osdc_put_request(lreq->reg_req);
+ ceph_osdc_put_request(lreq->ping_req);
target_destroy(&lreq->t);
kfree(lreq);
}
@@ -2835,6 +3047,12 @@ static void linger_commit_cb(struct ceph_osd_request *req)
struct ceph_osd_linger_request *lreq = req->r_priv;
mutex_lock(&lreq->lock);
+ if (req != lreq->reg_req) {
+ dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n",
+ __func__, lreq, lreq->linger_id, req, lreq->reg_req);
+ goto out;
+ }
+
dout("%s lreq %p linger_id %llu result %d\n", __func__, lreq,
lreq->linger_id, req->r_result);
linger_reg_commit_complete(lreq, req->r_result);
@@ -2858,6 +3076,7 @@ static void linger_commit_cb(struct ceph_osd_request *req)
}
}
+out:
mutex_unlock(&lreq->lock);
linger_put(lreq);
}
@@ -2880,6 +3099,12 @@ static void linger_reconnect_cb(struct ceph_osd_request *req)
struct ceph_osd_linger_request *lreq = req->r_priv;
mutex_lock(&lreq->lock);
+ if (req != lreq->reg_req) {
+ dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n",
+ __func__, lreq, lreq->linger_id, req, lreq->reg_req);
+ goto out;
+ }
+
dout("%s lreq %p linger_id %llu result %d last_error %d\n", __func__,
lreq, lreq->linger_id, req->r_result, lreq->last_error);
if (req->r_result < 0) {
@@ -2889,48 +3114,64 @@ static void linger_reconnect_cb(struct ceph_osd_request *req)
}
}
+out:
mutex_unlock(&lreq->lock);
linger_put(lreq);
}
static void send_linger(struct ceph_osd_linger_request *lreq)
{
- struct ceph_osd_request *req = lreq->reg_req;
- struct ceph_osd_req_op *op = &req->r_ops[0];
+ struct ceph_osd_client *osdc = lreq->osdc;
+ struct ceph_osd_request *req;
+ int ret;
- verify_osdc_wrlocked(req->r_osdc);
+ verify_osdc_wrlocked(osdc);
+ mutex_lock(&lreq->lock);
dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
- if (req->r_osd)
- cancel_linger_request(req);
+ if (lreq->reg_req) {
+ if (lreq->reg_req->r_osd)
+ cancel_linger_request(lreq->reg_req);
+ ceph_osdc_put_request(lreq->reg_req);
+ }
- request_reinit(req);
- ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
- ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
- req->r_flags = lreq->t.flags;
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, true, GFP_NOIO);
+ BUG_ON(!req);
+
+ target_copy(&req->r_t, &lreq->t);
req->r_mtime = lreq->mtime;
- mutex_lock(&lreq->lock);
if (lreq->is_watch && lreq->committed) {
- WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
- op->watch.cookie != lreq->linger_id);
- op->watch.op = CEPH_OSD_WATCH_OP_RECONNECT;
- op->watch.gen = ++lreq->register_gen;
+ osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_RECONNECT,
+ lreq->linger_id, ++lreq->register_gen);
dout("lreq %p reconnect register_gen %u\n", lreq,
- op->watch.gen);
+ req->r_ops[0].watch.gen);
req->r_callback = linger_reconnect_cb;
} else {
- if (!lreq->is_watch)
+ if (lreq->is_watch) {
+ osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_WATCH,
+ lreq->linger_id, 0);
+ } else {
lreq->notify_id = 0;
- else
- WARN_ON(op->watch.op != CEPH_OSD_WATCH_OP_WATCH);
+
+ refcount_inc(&lreq->request_pl->refcnt);
+ osd_req_op_notify_init(req, 0, lreq->linger_id,
+ lreq->request_pl);
+ ceph_osd_data_pages_init(
+ osd_req_op_data(req, 0, notify, response_data),
+ lreq->notify_id_pages, PAGE_SIZE, 0, false, false);
+ }
dout("lreq %p register\n", lreq);
req->r_callback = linger_commit_cb;
}
- mutex_unlock(&lreq->lock);
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ BUG_ON(ret);
req->r_priv = linger_get(lreq);
req->r_linger = true;
+ lreq->reg_req = req;
+ mutex_unlock(&lreq->lock);
submit_request(req, true);
}
@@ -2940,6 +3181,12 @@ static void linger_ping_cb(struct ceph_osd_request *req)
struct ceph_osd_linger_request *lreq = req->r_priv;
mutex_lock(&lreq->lock);
+ if (req != lreq->ping_req) {
+ dout("%s lreq %p linger_id %llu unknown req (%p != %p)\n",
+ __func__, lreq, lreq->linger_id, req, lreq->ping_req);
+ goto out;
+ }
+
dout("%s lreq %p linger_id %llu result %d ping_sent %lu last_error %d\n",
__func__, lreq, lreq->linger_id, req->r_result, lreq->ping_sent,
lreq->last_error);
@@ -2955,6 +3202,7 @@ static void linger_ping_cb(struct ceph_osd_request *req)
lreq->register_gen, req->r_ops[0].watch.gen);
}
+out:
mutex_unlock(&lreq->lock);
linger_put(lreq);
}
@@ -2962,8 +3210,8 @@ static void linger_ping_cb(struct ceph_osd_request *req)
static void send_linger_ping(struct ceph_osd_linger_request *lreq)
{
struct ceph_osd_client *osdc = lreq->osdc;
- struct ceph_osd_request *req = lreq->ping_req;
- struct ceph_osd_req_op *op = &req->r_ops[0];
+ struct ceph_osd_request *req;
+ int ret;
if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSERD)) {
dout("%s PAUSERD\n", __func__);
@@ -2975,19 +3223,26 @@ static void send_linger_ping(struct ceph_osd_linger_request *lreq)
__func__, lreq, lreq->linger_id, lreq->ping_sent,
lreq->register_gen);
- if (req->r_osd)
- cancel_linger_request(req);
+ if (lreq->ping_req) {
+ if (lreq->ping_req->r_osd)
+ cancel_linger_request(lreq->ping_req);
+ ceph_osdc_put_request(lreq->ping_req);
+ }
- request_reinit(req);
- target_copy(&req->r_t, &lreq->t);
+ req = ceph_osdc_alloc_request(osdc, NULL, 1, true, GFP_NOIO);
+ BUG_ON(!req);
- WARN_ON(op->op != CEPH_OSD_OP_WATCH ||
- op->watch.cookie != lreq->linger_id ||
- op->watch.op != CEPH_OSD_WATCH_OP_PING);
- op->watch.gen = lreq->register_gen;
+ target_copy(&req->r_t, &lreq->t);
+ osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_PING, lreq->linger_id,
+ lreq->register_gen);
req->r_callback = linger_ping_cb;
+
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ BUG_ON(ret);
+
req->r_priv = linger_get(lreq);
req->r_linger = true;
+ lreq->ping_req = req;
ceph_osdc_get_request(req);
account_request(req);
@@ -3001,11 +3256,15 @@ static void linger_submit(struct ceph_osd_linger_request *lreq)
struct ceph_osd_client *osdc = lreq->osdc;
struct ceph_osd *osd;
- calc_target(osdc, &lreq->t, NULL, false);
+ down_write(&osdc->lock);
+ linger_register(lreq);
+
+ calc_target(osdc, &lreq->t, false);
osd = lookup_create_osd(osdc, lreq->t.osd, true);
link_linger(osd, lreq);
send_linger(lreq);
+ up_write(&osdc->lock);
}
static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
@@ -3030,9 +3289,9 @@ static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq)
*/
static void __linger_cancel(struct ceph_osd_linger_request *lreq)
{
- if (lreq->is_watch && lreq->ping_req->r_osd)
+ if (lreq->ping_req && lreq->ping_req->r_osd)
cancel_linger_request(lreq->ping_req);
- if (lreq->reg_req->r_osd)
+ if (lreq->reg_req && lreq->reg_req->r_osd)
cancel_linger_request(lreq->reg_req);
cancel_linger_map_check(lreq);
unlink_linger(lreq->osd, lreq);
@@ -3137,17 +3396,24 @@ static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
int ret;
dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
- ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
+ ret = wait_for_completion_killable(&lreq->reg_commit_wait);
return ret ?: lreq->reg_commit_error;
}
-static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
+static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq,
+ unsigned long timeout)
{
- int ret;
+ long left;
dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
- ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
- return ret ?: lreq->notify_finish_error;
+ left = wait_for_completion_killable_timeout(&lreq->notify_finish_wait,
+ ceph_timeout_jiffies(timeout));
+ if (left <= 0)
+ left = left ?: -ETIMEDOUT;
+ else
+ left = lreq->notify_finish_error; /* completed */
+
+ return left;
}
/*
@@ -3372,9 +3638,6 @@ static int ceph_redirect_decode(void **p, void *end,
goto e_inval;
}
- len = ceph_decode_32(p);
- *p += len; /* skip osd_instructions */
-
/* skip the rest */
*p = struct_end;
out:
@@ -3541,7 +3804,29 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
* supported.
*/
req->r_t.target_oloc.pool = m.redirect.oloc.pool;
- req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
+ req->r_flags |= CEPH_OSD_FLAG_REDIRECTED |
+ CEPH_OSD_FLAG_IGNORE_OVERLAY |
+ CEPH_OSD_FLAG_IGNORE_CACHE;
+ req->r_tid = 0;
+ __submit_request(req, false);
+ goto out_unlock_osdc;
+ }
+
+ if (m.result == -EAGAIN) {
+ dout("req %p tid %llu EAGAIN\n", req, req->r_tid);
+ unlink_request(osd, req);
+ mutex_unlock(&osd->lock);
+
+ /*
+ * The object is missing on the replica or not (yet)
+ * readable. Clear pgid to force a resend to the primary
+ * via legacy_change.
+ */
+ req->r_t.pgid.pool = 0;
+ req->r_t.pgid.seed = 0;
+ WARN_ON(!req->r_t.used_replica);
+ req->r_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS |
+ CEPH_OSD_FLAG_LOCALIZE_READS);
req->r_tid = 0;
__submit_request(req, false);
goto out_unlock_osdc;
@@ -3572,6 +3857,7 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
* one (type of) reply back.
*/
WARN_ON(!(m.flags & CEPH_OSD_FLAG_ONDISK));
+ req->r_version = m.user_version;
req->r_result = m.result ?: data_len;
finish_request(req);
mutex_unlock(&osd->lock);
@@ -3617,7 +3903,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq)
struct ceph_osd_client *osdc = lreq->osdc;
enum calc_target_result ct_res;
- ct_res = calc_target(osdc, &lreq->t, NULL, true);
+ ct_res = calc_target(osdc, &lreq->t, true);
if (ct_res == CALC_TARGET_NEED_RESEND) {
struct ceph_osd *osd;
@@ -3663,7 +3949,7 @@ static void scan_requests(struct ceph_osd *osd,
if (!force_resend && !force_resend_writes)
break;
- /* fall through */
+ fallthrough;
case CALC_TARGET_NEED_RESEND:
cancel_linger_map_check(lreq);
/*
@@ -3689,8 +3975,7 @@ static void scan_requests(struct ceph_osd *osd,
n = rb_next(n); /* unlink_request(), check_pool_dne() */
dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
- ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con,
- false);
+ ct_res = calc_target(osdc, &req->r_t, false);
switch (ct_res) {
case CALC_TARGET_NO_ACTION:
force_resend_writes = cleared_full ||
@@ -3701,7 +3986,7 @@ static void scan_requests(struct ceph_osd *osd,
!force_resend_writes))
break;
- /* fall through */
+ fallthrough;
case CALC_TARGET_NEED_RESEND:
cancel_map_check(req);
unlink_request(osd, req);
@@ -3728,9 +4013,11 @@ static int handle_one_map(struct ceph_osd_client *osdc,
set_pool_was_full(osdc);
if (incremental)
- newmap = osdmap_apply_incremental(&p, end, osdc->osdmap);
+ newmap = osdmap_apply_incremental(&p, end,
+ ceph_msgr2(osdc->client),
+ osdc->osdmap);
else
- newmap = ceph_osdmap_decode(&p, end);
+ newmap = ceph_osdmap_decode(&p, end, ceph_msgr2(osdc->client));
if (IS_ERR(newmap))
return PTR_ERR(newmap);
@@ -3799,7 +4086,7 @@ static void kick_requests(struct ceph_osd_client *osdc,
n = rb_next(n);
if (req->r_t.epoch < osdc->osdmap->epoch) {
- ct_res = calc_target(osdc, &req->r_t, NULL, false);
+ ct_res = calc_target(osdc, &req->r_t, false);
if (ct_res == CALC_TARGET_POOL_DNE) {
erase_request(need_resend, req);
check_pool_dne(req);
@@ -4318,9 +4605,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc,
lreq->notify_id, notify_id);
} else if (!completion_done(&lreq->notify_finish_wait)) {
struct ceph_msg_data *data =
- list_first_entry_or_null(&msg->data,
- struct ceph_msg_data,
- links);
+ msg->num_data_items ? &msg->data[0] : NULL;
if (data) {
if (lreq->preply_pages) {
@@ -4328,9 +4613,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc,
CEPH_MSG_DATA_PAGES);
*lreq->preply_pages = data->pages;
*lreq->preply_len = data->length;
- } else {
- ceph_release_page_vector(data->pages,
- calc_pages_for(0, data->length));
+ data->own_pages = false;
}
}
lreq->notify_finish_error = return_code;
@@ -4365,21 +4648,23 @@ bad:
/*
* Register request, send initial attempt.
*/
-int ceph_osdc_start_request(struct ceph_osd_client *osdc,
- struct ceph_osd_request *req,
- bool nofail)
+void ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
{
down_read(&osdc->lock);
submit_request(req, false);
up_read(&osdc->lock);
-
- return 0;
}
EXPORT_SYMBOL(ceph_osdc_start_request);
/*
- * Unregister a registered request. The request is not completed:
- * ->r_result isn't set and __complete_request() isn't called.
+ * Unregister request. If @req was registered, it isn't completed:
+ * r_result isn't set and __complete_request() isn't invoked.
+ *
+ * If @req wasn't registered, this call may have raced with
+ * handle_reply(), in which case r_result would already be set and
+ * __complete_request() would be getting invoked, possibly even
+ * concurrently with this call.
*/
void ceph_osdc_cancel_request(struct ceph_osd_request *req)
{
@@ -4465,26 +4750,6 @@ again:
}
EXPORT_SYMBOL(ceph_osdc_sync);
-static struct ceph_osd_request *
-alloc_linger_request(struct ceph_osd_linger_request *lreq)
-{
- struct ceph_osd_request *req;
-
- req = ceph_osdc_alloc_request(lreq->osdc, NULL, 1, false, GFP_NOIO);
- if (!req)
- return NULL;
-
- ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid);
- ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
-
- if (ceph_osdc_alloc_messages(req, GFP_NOIO)) {
- ceph_osdc_put_request(req);
- return NULL;
- }
-
- return req;
-}
-
/*
* Returns a handle, caller owns a ref.
*/
@@ -4514,27 +4779,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc,
lreq->t.flags = CEPH_OSD_FLAG_WRITE;
ktime_get_real_ts64(&lreq->mtime);
- lreq->reg_req = alloc_linger_request(lreq);
- if (!lreq->reg_req) {
- ret = -ENOMEM;
- goto err_put_lreq;
- }
-
- lreq->ping_req = alloc_linger_request(lreq);
- if (!lreq->ping_req) {
- ret = -ENOMEM;
- goto err_put_lreq;
- }
-
- down_write(&osdc->lock);
- linger_register(lreq); /* before osd_req_op_* */
- osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id,
- CEPH_OSD_WATCH_OP_WATCH);
- osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id,
- CEPH_OSD_WATCH_OP_PING);
linger_submit(lreq);
- up_write(&osdc->lock);
-
ret = linger_reg_commit_wait(lreq);
if (ret) {
linger_cancel(lreq);
@@ -4571,14 +4816,14 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc,
ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc);
req->r_flags = CEPH_OSD_FLAG_WRITE;
ktime_get_real_ts64(&req->r_mtime);
- osd_req_op_watch_init(req, 0, lreq->linger_id,
- CEPH_OSD_WATCH_OP_UNWATCH);
+ osd_req_op_watch_init(req, 0, CEPH_OSD_WATCH_OP_UNWATCH,
+ lreq->linger_id, 0);
ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
if (ret)
goto out_put_req;
- ceph_osdc_start_request(osdc, req, false);
+ ceph_osdc_start_request(osdc, req);
linger_cancel(lreq);
linger_put(lreq);
ret = wait_request_timeout(req, opts->mount_timeout);
@@ -4597,13 +4842,12 @@ static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which,
struct ceph_pagelist *pl;
int ret;
- op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
+ op = osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0);
- pl = kmalloc(sizeof(*pl), GFP_NOIO);
+ pl = ceph_pagelist_alloc(GFP_NOIO);
if (!pl)
return -ENOMEM;
- ceph_pagelist_init(pl);
ret = ceph_pagelist_encode_64(pl, notify_id);
ret |= ceph_pagelist_encode_64(pl, cookie);
if (payload) {
@@ -4641,16 +4885,16 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc,
ceph_oloc_copy(&req->r_base_oloc, oloc);
req->r_flags = CEPH_OSD_FLAG_READ;
- ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
+ payload_len);
if (ret)
goto out_put_req;
- ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload,
- payload_len);
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
if (ret)
goto out_put_req;
- ceph_osdc_start_request(osdc, req, false);
+ ceph_osdc_start_request(osdc, req);
ret = ceph_osdc_wait_request(osdc, req);
out_put_req:
@@ -4659,36 +4903,6 @@ out_put_req:
}
EXPORT_SYMBOL(ceph_osdc_notify_ack);
-static int osd_req_op_notify_init(struct ceph_osd_request *req, int which,
- u64 cookie, u32 prot_ver, u32 timeout,
- void *payload, u32 payload_len)
-{
- struct ceph_osd_req_op *op;
- struct ceph_pagelist *pl;
- int ret;
-
- op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0);
- op->notify.cookie = cookie;
-
- pl = kmalloc(sizeof(*pl), GFP_NOIO);
- if (!pl)
- return -ENOMEM;
-
- ceph_pagelist_init(pl);
- ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */
- ret |= ceph_pagelist_encode_32(pl, timeout);
- ret |= ceph_pagelist_encode_32(pl, payload_len);
- ret |= ceph_pagelist_append(pl, payload, payload_len);
- if (ret) {
- ceph_pagelist_release(pl);
- return -ENOMEM;
- }
-
- ceph_osd_data_pagelist_init(&op->notify.request_data, pl);
- op->indata_len = pl->length;
- return 0;
-}
-
/*
* @timeout: in seconds
*
@@ -4707,7 +4921,6 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc,
size_t *preply_len)
{
struct ceph_osd_linger_request *lreq;
- struct page **pages;
int ret;
WARN_ON(!timeout);
@@ -4720,45 +4933,41 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc,
if (!lreq)
return -ENOMEM;
- lreq->preply_pages = preply_pages;
- lreq->preply_len = preply_len;
-
- ceph_oid_copy(&lreq->t.base_oid, oid);
- ceph_oloc_copy(&lreq->t.base_oloc, oloc);
- lreq->t.flags = CEPH_OSD_FLAG_READ;
-
- lreq->reg_req = alloc_linger_request(lreq);
- if (!lreq->reg_req) {
+ lreq->request_pl = ceph_pagelist_alloc(GFP_NOIO);
+ if (!lreq->request_pl) {
ret = -ENOMEM;
goto out_put_lreq;
}
- /* for notify_id */
- pages = ceph_alloc_page_vector(1, GFP_NOIO);
- if (IS_ERR(pages)) {
- ret = PTR_ERR(pages);
+ ret = ceph_pagelist_encode_32(lreq->request_pl, 1); /* prot_ver */
+ ret |= ceph_pagelist_encode_32(lreq->request_pl, timeout);
+ ret |= ceph_pagelist_encode_32(lreq->request_pl, payload_len);
+ ret |= ceph_pagelist_append(lreq->request_pl, payload, payload_len);
+ if (ret) {
+ ret = -ENOMEM;
goto out_put_lreq;
}
- down_write(&osdc->lock);
- linger_register(lreq); /* before osd_req_op_* */
- ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1,
- timeout, payload, payload_len);
- if (ret) {
- linger_unregister(lreq);
- up_write(&osdc->lock);
- ceph_release_page_vector(pages, 1);
+ /* for notify_id */
+ lreq->notify_id_pages = ceph_alloc_page_vector(1, GFP_NOIO);
+ if (IS_ERR(lreq->notify_id_pages)) {
+ ret = PTR_ERR(lreq->notify_id_pages);
+ lreq->notify_id_pages = NULL;
goto out_put_lreq;
}
- ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify,
- response_data),
- pages, PAGE_SIZE, 0, false, true);
- linger_submit(lreq);
- up_write(&osdc->lock);
+ lreq->preply_pages = preply_pages;
+ lreq->preply_len = preply_len;
+
+ ceph_oid_copy(&lreq->t.base_oid, oid);
+ ceph_oloc_copy(&lreq->t.base_oloc, oloc);
+ lreq->t.flags = CEPH_OSD_FLAG_READ;
+
+ linger_submit(lreq);
ret = linger_reg_commit_wait(lreq);
if (!ret)
- ret = linger_notify_finish_wait(lreq);
+ ret = linger_notify_finish_wait(lreq,
+ msecs_to_jiffies(2 * timeout * MSEC_PER_SEC));
else
dout("lreq %p failed to initiate notify %d\n", lreq, ret);
@@ -4769,40 +4978,6 @@ out_put_lreq:
}
EXPORT_SYMBOL(ceph_osdc_notify);
-/*
- * Return the number of milliseconds since the watch was last
- * confirmed, or an error. If there is an error, the watch is no
- * longer valid, and should be destroyed with ceph_osdc_unwatch().
- */
-int ceph_osdc_watch_check(struct ceph_osd_client *osdc,
- struct ceph_osd_linger_request *lreq)
-{
- unsigned long stamp, age;
- int ret;
-
- down_read(&osdc->lock);
- mutex_lock(&lreq->lock);
- stamp = lreq->watch_valid_thru;
- if (!list_empty(&lreq->pending_lworks)) {
- struct linger_work *lwork =
- list_first_entry(&lreq->pending_lworks,
- struct linger_work,
- pending_item);
-
- if (time_before(lwork->queued_stamp, stamp))
- stamp = lwork->queued_stamp;
- }
- age = jiffies - stamp;
- dout("%s lreq %p linger_id %llu age %lu last_error %d\n", __func__,
- lreq, lreq->linger_id, age, lreq->last_error);
- /* we are truncating to msecs, so return a safe upper bound */
- ret = lreq->last_error ?: 1 + jiffies_to_msecs(age);
-
- mutex_unlock(&lreq->lock);
- up_read(&osdc->lock);
- return ret;
-}
-
static int decode_watcher(void **p, void *end, struct ceph_watch_item *item)
{
u8 struct_v;
@@ -4812,20 +4987,26 @@ static int decode_watcher(void **p, void *end, struct ceph_watch_item *item)
ret = ceph_start_decoding(p, end, 2, "watch_item_t",
&struct_v, &struct_len);
if (ret)
- return ret;
+ goto bad;
+
+ ret = -EINVAL;
+ ceph_decode_copy_safe(p, end, &item->name, sizeof(item->name), bad);
+ ceph_decode_64_safe(p, end, item->cookie, bad);
+ ceph_decode_skip_32(p, end, bad); /* skip timeout seconds */
- ceph_decode_copy(p, &item->name, sizeof(item->name));
- item->cookie = ceph_decode_64(p);
- *p += 4; /* skip timeout_seconds */
if (struct_v >= 2) {
- ceph_decode_copy(p, &item->addr, sizeof(item->addr));
- ceph_decode_addr(&item->addr);
+ ret = ceph_decode_entity_addr(p, end, &item->addr);
+ if (ret)
+ goto bad;
+ } else {
+ ret = 0;
}
dout("%s %s%llu cookie %llu addr %s\n", __func__,
ENTITY_NAME(item->name), item->cookie,
- ceph_pr_addr(&item->addr.in_addr));
- return 0;
+ ceph_pr_addr(&item->addr));
+bad:
+ return ret;
}
static int decode_watchers(void **p, void *end,
@@ -4881,10 +5062,6 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
ceph_oloc_copy(&req->r_base_oloc, oloc);
req->r_flags = CEPH_OSD_FLAG_READ;
- ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
- if (ret)
- goto out_put_req;
-
pages = ceph_alloc_page_vector(1, GFP_NOIO);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
@@ -4896,7 +5073,11 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc,
response_data),
pages, PAGE_SIZE, 0, false, true);
- ceph_osdc_start_request(osdc, req, false);
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ goto out_put_req;
+
+ ceph_osdc_start_request(osdc, req);
ret = ceph_osdc_wait_request(osdc, req);
if (ret >= 0) {
void *p = page_address(pages[0]);
@@ -4942,12 +5123,12 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
const char *class, const char *method,
unsigned int flags,
struct page *req_page, size_t req_len,
- struct page *resp_page, size_t *resp_len)
+ struct page **resp_pages, size_t *resp_len)
{
struct ceph_osd_request *req;
int ret;
- if (req_len > PAGE_SIZE || (resp_page && *resp_len > PAGE_SIZE))
+ if (req_len > PAGE_SIZE)
return -E2BIG;
req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_NOIO);
@@ -4958,26 +5139,26 @@ int ceph_osdc_call(struct ceph_osd_client *osdc,
ceph_oloc_copy(&req->r_base_oloc, oloc);
req->r_flags = flags;
- ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
- if (ret)
- goto out_put_req;
-
- ret = osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method);
+ ret = osd_req_op_cls_init(req, 0, class, method);
if (ret)
goto out_put_req;
if (req_page)
osd_req_op_cls_request_data_pages(req, 0, &req_page, req_len,
0, false, false);
- if (resp_page)
- osd_req_op_cls_response_data_pages(req, 0, &resp_page,
+ if (resp_pages)
+ osd_req_op_cls_response_data_pages(req, 0, resp_pages,
*resp_len, 0, false, false);
- ceph_osdc_start_request(osdc, req, false);
+ ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+ if (ret)
+ goto out_put_req;
+
+ ceph_osdc_start_request(osdc, req);
ret = ceph_osdc_wait_request(osdc, req);
if (ret >= 0) {
ret = req->r_ops[0].rval;
- if (resp_page)
+ if (resp_pages)
*resp_len = req->r_ops[0].outdata_len;
}
@@ -4988,6 +5169,24 @@ out_put_req:
EXPORT_SYMBOL(ceph_osdc_call);
/*
+ * reset all osd connections
+ */
+void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc)
+{
+ struct rb_node *n;
+
+ down_write(&osdc->lock);
+ for (n = rb_first(&osdc->osds); n; ) {
+ struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
+
+ n = rb_next(n);
+ if (!reopen_osd(osd))
+ kick_osd_requests(osd);
+ }
+ up_write(&osdc->lock);
+}
+
+/*
* init, shutdown
*/
int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
@@ -5021,11 +5220,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
goto out_map;
err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
- PAGE_SIZE, 10, true, "osd_op");
+ PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10, "osd_op");
if (err < 0)
goto out_mempool;
err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
- PAGE_SIZE, 10, true, "osd_op_reply");
+ PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10,
+ "osd_op_reply");
if (err < 0)
goto out_msgpool;
@@ -5089,84 +5289,43 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
ceph_msgpool_destroy(&osdc->msgpool_op_reply);
}
-/*
- * Read some contiguous pages. If we cross a stripe boundary, shorten
- * *plen. Return number of bytes read, or error.
- */
-int ceph_osdc_readpages(struct ceph_osd_client *osdc,
- struct ceph_vino vino, struct ceph_file_layout *layout,
- u64 off, u64 *plen,
- u32 truncate_seq, u64 truncate_size,
- struct page **pages, int num_pages, int page_align)
-{
- struct ceph_osd_request *req;
- int rc = 0;
-
- dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
- vino.snap, off, *plen);
- req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1,
- CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
- NULL, truncate_seq, truncate_size,
- false);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- /* it may be a short read due to an object boundary */
- osd_req_op_extent_osd_data_pages(req, 0,
- pages, *plen, page_align, false, false);
-
- dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
- off, *plen, *plen, page_align);
-
- rc = ceph_osdc_start_request(osdc, req, false);
- if (!rc)
- rc = ceph_osdc_wait_request(osdc, req);
-
- ceph_osdc_put_request(req);
- dout("readpages result %d\n", rc);
- return rc;
-}
-EXPORT_SYMBOL(ceph_osdc_readpages);
-
-/*
- * do a synchronous write on N pages
- */
-int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
- struct ceph_file_layout *layout,
- struct ceph_snap_context *snapc,
- u64 off, u64 len,
- u32 truncate_seq, u64 truncate_size,
- struct timespec64 *mtime,
- struct page **pages, int num_pages)
+int osd_req_op_copy_from_init(struct ceph_osd_request *req,
+ u64 src_snapid, u64 src_version,
+ struct ceph_object_id *src_oid,
+ struct ceph_object_locator *src_oloc,
+ u32 src_fadvise_flags,
+ u32 dst_fadvise_flags,
+ u32 truncate_seq, u64 truncate_size,
+ u8 copy_from_flags)
{
- struct ceph_osd_request *req;
- int rc = 0;
- int page_align = off & ~PAGE_MASK;
-
- req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1,
- CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
- snapc, truncate_seq, truncate_size,
- true);
- if (IS_ERR(req))
- return PTR_ERR(req);
-
- /* it may be a short write due to an object boundary */
- osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
- false, false);
- dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
-
- req->r_mtime = *mtime;
- rc = ceph_osdc_start_request(osdc, req, true);
- if (!rc)
- rc = ceph_osdc_wait_request(osdc, req);
+ struct ceph_osd_req_op *op;
+ struct page **pages;
+ void *p, *end;
- ceph_osdc_put_request(req);
- if (rc == 0)
- rc = len;
- dout("writepages result %d\n", rc);
- return rc;
+ pages = ceph_alloc_page_vector(1, GFP_KERNEL);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ op = osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM2,
+ dst_fadvise_flags);
+ op->copy_from.snapid = src_snapid;
+ op->copy_from.src_version = src_version;
+ op->copy_from.flags = copy_from_flags;
+ op->copy_from.src_fadvise_flags = src_fadvise_flags;
+
+ p = page_address(pages[0]);
+ end = p + PAGE_SIZE;
+ ceph_encode_string(&p, end, src_oid->name, src_oid->name_len);
+ encode_oloc(&p, end, src_oloc);
+ ceph_encode_32(&p, truncate_seq);
+ ceph_encode_64(&p, truncate_size);
+ op->indata_len = PAGE_SIZE - (end - p);
+
+ ceph_osd_data_pages_init(&op->copy_from.osd_data, pages,
+ op->indata_len, 0, false, true);
+ return 0;
}
-EXPORT_SYMBOL(ceph_osdc_writepages);
+EXPORT_SYMBOL(osd_req_op_copy_from_init);
int __init ceph_osdc_setup(void)
{
@@ -5190,7 +5349,7 @@ void ceph_osdc_cleanup(void)
/*
* handle incoming message
*/
-static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+static void osd_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
struct ceph_osd *osd = con->private;
struct ceph_osd_client *osdc = osd->o_osdc;
@@ -5218,6 +5377,24 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
ceph_msg_put(msg);
}
+/* How much sparse data was requested? */
+static u64 sparse_data_requested(struct ceph_osd_request *req)
+{
+ u64 len = 0;
+
+ if (req->r_flags & CEPH_OSD_FLAG_READ) {
+ int i;
+
+ for (i = 0; i < req->r_num_ops; ++i) {
+ struct ceph_osd_req_op *op = &req->r_ops[i];
+
+ if (op->op == CEPH_OSD_OP_SPARSE_READ)
+ len += op->extent.length;
+ }
+ }
+ return len;
+}
+
/*
* Lookup and return message for incoming reply. Don't try to do
* anything about a larger than preallocated data portion of the
@@ -5234,6 +5411,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
int front_len = le32_to_cpu(hdr->front_len);
int data_len = le32_to_cpu(hdr->data_len);
u64 tid = le64_to_cpu(hdr->tid);
+ u64 srlen;
down_read(&osdc->lock);
if (!osd_registered(osd)) {
@@ -5266,7 +5444,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
req->r_reply = m;
}
- if (data_len > req->r_reply->data_length) {
+ srlen = sparse_data_requested(req);
+ if (!srlen && data_len > req->r_reply->data_length) {
pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
__func__, osd->o_osd, req->r_tid, data_len,
req->r_reply->data_length);
@@ -5276,6 +5455,8 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
}
m = ceph_msg_get(req->r_reply);
+ m->sparse_read_total = srlen;
+
dout("get_reply tid %lld %p\n", tid, m);
out_unlock_session:
@@ -5285,9 +5466,6 @@ out_unlock_osdc:
return m;
}
-/*
- * TODO: switch to a msg-owned pagelist
- */
static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
{
struct ceph_msg *m;
@@ -5295,13 +5473,12 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
u32 front_len = le32_to_cpu(hdr->front_len);
u32 data_len = le32_to_cpu(hdr->data_len);
- m = ceph_msg_new(type, front_len, GFP_NOIO, false);
+ m = ceph_msg_new2(type, front_len, 1, GFP_NOIO, false);
if (!m)
return NULL;
if (data_len) {
struct page **pages;
- struct ceph_osd_data osd_data;
pages = ceph_alloc_page_vector(calc_pages_for(0, data_len),
GFP_NOIO);
@@ -5310,17 +5487,15 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr)
return NULL;
}
- ceph_osd_data_pages_init(&osd_data, pages, data_len, 0, false,
- false);
- ceph_osdc_msg_data_add(m, &osd_data);
+ ceph_msg_data_add_pages(m, pages, data_len, 0, true);
}
return m;
}
-static struct ceph_msg *alloc_msg(struct ceph_connection *con,
- struct ceph_msg_header *hdr,
- int *skip)
+static struct ceph_msg *osd_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
{
struct ceph_osd *osd = con->private;
int type = le16_to_cpu(hdr->type);
@@ -5344,7 +5519,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
/*
* Wrappers to refcount containing ceph_osd struct
*/
-static struct ceph_connection *get_osd_con(struct ceph_connection *con)
+static struct ceph_connection *osd_get_con(struct ceph_connection *con)
{
struct ceph_osd *osd = con->private;
if (get_osd(osd))
@@ -5352,7 +5527,7 @@ static struct ceph_connection *get_osd_con(struct ceph_connection *con)
return NULL;
}
-static void put_osd_con(struct ceph_connection *con)
+static void osd_put_con(struct ceph_connection *con)
{
struct ceph_osd *osd = con->private;
put_osd(osd);
@@ -5361,39 +5536,29 @@ static void put_osd_con(struct ceph_connection *con)
/*
* authentication
*/
+
/*
* Note: returned pointer is the address of a structure that's
* managed separately. Caller must *not* attempt to free it.
*/
-static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
- int *proto, int force_new)
+static struct ceph_auth_handshake *
+osd_get_authorizer(struct ceph_connection *con, int *proto, int force_new)
{
struct ceph_osd *o = con->private;
struct ceph_osd_client *osdc = o->o_osdc;
struct ceph_auth_client *ac = osdc->client->monc.auth;
struct ceph_auth_handshake *auth = &o->o_auth;
+ int ret;
- if (force_new && auth->authorizer) {
- ceph_auth_destroy_authorizer(auth->authorizer);
- auth->authorizer = NULL;
- }
- if (!auth->authorizer) {
- int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
- auth);
- if (ret)
- return ERR_PTR(ret);
- } else {
- int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
- auth);
- if (ret)
- return ERR_PTR(ret);
- }
- *proto = ac->protocol;
+ ret = __ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD,
+ force_new, proto, NULL, NULL);
+ if (ret)
+ return ERR_PTR(ret);
return auth;
}
-static int add_authorizer_challenge(struct ceph_connection *con,
+static int osd_add_authorizer_challenge(struct ceph_connection *con,
void *challenge_buf, int challenge_buf_len)
{
struct ceph_osd *o = con->private;
@@ -5404,16 +5569,19 @@ static int add_authorizer_challenge(struct ceph_connection *con,
challenge_buf, challenge_buf_len);
}
-static int verify_authorizer_reply(struct ceph_connection *con)
+static int osd_verify_authorizer_reply(struct ceph_connection *con)
{
struct ceph_osd *o = con->private;
struct ceph_osd_client *osdc = o->o_osdc;
struct ceph_auth_client *ac = osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
- return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer);
+ return ceph_auth_verify_authorizer_reply(ac, auth->authorizer,
+ auth->authorizer_reply_buf, auth->authorizer_reply_buf_len,
+ NULL, NULL, NULL, NULL);
}
-static int invalidate_authorizer(struct ceph_connection *con)
+static int osd_invalidate_authorizer(struct ceph_connection *con)
{
struct ceph_osd *o = con->private;
struct ceph_osd_client *osdc = o->o_osdc;
@@ -5423,6 +5591,80 @@ static int invalidate_authorizer(struct ceph_connection *con)
return ceph_monc_validate_auth(&osdc->client->monc);
}
+static int osd_get_auth_request(struct ceph_connection *con,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+ int ret;
+
+ ret = ceph_auth_get_authorizer(ac, auth, CEPH_ENTITY_TYPE_OSD,
+ buf, buf_len);
+ if (ret)
+ return ret;
+
+ *authorizer = auth->authorizer_buf;
+ *authorizer_len = auth->authorizer_buf_len;
+ return 0;
+}
+
+static int osd_handle_auth_reply_more(struct ceph_connection *con,
+ void *reply, int reply_len,
+ void *buf, int *buf_len,
+ void **authorizer, int *authorizer_len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+ int ret;
+
+ ret = ceph_auth_handle_svc_reply_more(ac, auth, reply, reply_len,
+ buf, buf_len);
+ if (ret)
+ return ret;
+
+ *authorizer = auth->authorizer_buf;
+ *authorizer_len = auth->authorizer_buf_len;
+ return 0;
+}
+
+static int osd_handle_auth_done(struct ceph_connection *con,
+ u64 global_id, void *reply, int reply_len,
+ u8 *session_key, int *session_key_len,
+ u8 *con_secret, int *con_secret_len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_auth_client *ac = o->o_osdc->client->monc.auth;
+ struct ceph_auth_handshake *auth = &o->o_auth;
+
+ return ceph_auth_handle_svc_reply_done(ac, auth, reply, reply_len,
+ session_key, session_key_len,
+ con_secret, con_secret_len);
+}
+
+static int osd_handle_auth_bad_method(struct ceph_connection *con,
+ int used_proto, int result,
+ const int *allowed_protos, int proto_cnt,
+ const int *allowed_modes, int mode_cnt)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_mon_client *monc = &o->o_osdc->client->monc;
+ int ret;
+
+ if (ceph_auth_handle_bad_authorizer(monc->auth, CEPH_ENTITY_TYPE_OSD,
+ used_proto, result,
+ allowed_protos, proto_cnt,
+ allowed_modes, mode_cnt)) {
+ ret = ceph_monc_validate_auth(monc);
+ if (ret)
+ return ret;
+ }
+
+ return -EACCES;
+}
+
static void osd_reencode_message(struct ceph_msg *msg)
{
int type = le16_to_cpu(msg->hdr.type);
@@ -5447,17 +5689,229 @@ static int osd_check_message_signature(struct ceph_msg *msg)
return ceph_auth_check_message_signature(auth, msg);
}
+/*
+ * Move @cursor forward by @len bytes, one ceph_msg_data_next() chunk at
+ * a time.  When @zero is true every skipped range is cleared with
+ * zero_user_segment() before the cursor is advanced past it.
+ */
+static void advance_cursor(struct ceph_msg_data_cursor *cursor, size_t len,
+			   bool zero)
+{
+	size_t remaining;
+
+	for (remaining = len; remaining; ) {
+		size_t page_off, chunk;
+		struct page *pg;
+
+		pg = ceph_msg_data_next(cursor, &page_off, &chunk);
+		/* never step further than what is left to skip */
+		if (chunk > remaining)
+			chunk = remaining;
+		if (zero)
+			zero_user_segment(pg, page_off, page_off + chunk);
+		ceph_msg_data_advance(cursor, chunk);
+		remaining -= chunk;
+	}
+}
+
+/*
+ * Finish the sparse-read op that was being received (if any) and set up
+ * the next CEPH_OSD_OP_SPARSE_READ op of the request whose tid matches
+ * the incoming message.
+ *
+ * Returns 1 if another sparse-read op was found and @sr was primed for
+ * it, 0 if the request has no further sparse-read ops (o_sparse_op_idx
+ * is reset to -1), or -EBADR if no request with that tid is found in
+ * o->o_requests.
+ */
+static int prep_next_sparse_read(struct ceph_connection *con,
+ struct ceph_msg_data_cursor *cursor)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_sparse_read *sr = &o->o_sparse_read;
+ struct ceph_osd_request *req;
+ struct ceph_osd_req_op *op;
+
+ /* the request lookup and all r_ops accesses below run under o_requests_lock */
+ spin_lock(&o->o_requests_lock);
+ req = lookup_request(&o->o_requests, le64_to_cpu(con->in_msg->hdr.tid));
+ if (!req) {
+ spin_unlock(&o->o_requests_lock);
+ return -EBADR;
+ }
+
+ /* a negative index means no op of this request has been processed yet */
+ if (o->o_sparse_op_idx < 0) {
+ dout("%s: [%d] starting new sparse read req\n",
+ __func__, o->o_osd);
+ } else {
+ u64 end;
+
+ op = &req->r_ops[o->o_sparse_op_idx];
+
+ WARN_ON_ONCE(op->extent.sparse_ext);
+
+ /* hand back buffer we took earlier */
+ op->extent.sparse_ext = sr->sr_extent;
+ sr->sr_extent = NULL;
+ op->extent.sparse_ext_cnt = sr->sr_count;
+ sr->sr_ext_len = 0;
+ dout("%s: [%d] completed extent array len %d cursor->resid %zd\n",
+ __func__, o->o_osd, op->extent.sparse_ext_cnt, cursor->resid);
+ /* Advance to end of data for this operation */
+ end = ceph_sparse_ext_map_end(op);
+ if (end < sr->sr_req_len)
+ advance_cursor(cursor, sr->sr_req_len - end, false);
+ }
+
+ ceph_init_sparse_read(sr);
+
+ /* find next op in this request (if any) */
+ while (++o->o_sparse_op_idx < req->r_num_ops) {
+ op = &req->r_ops[o->o_sparse_op_idx];
+ if (op->op == CEPH_OSD_OP_SPARSE_READ)
+ goto found;
+ }
+
+ /* reset for next sparse read request */
+ spin_unlock(&o->o_requests_lock);
+ o->o_sparse_op_idx = -1;
+ return 0;
+found:
+ /* the read position starts at the op's requested offset */
+ sr->sr_req_off = op->extent.offset;
+ sr->sr_req_len = op->extent.length;
+ sr->sr_pos = sr->sr_req_off;
+ dout("%s: [%d] new sparse read op at idx %d 0x%llx~0x%llx\n", __func__,
+ o->o_osd, o->o_sparse_op_idx, sr->sr_req_off, sr->sr_req_len);
+
+ /* hand off request's sparse extent map buffer */
+ sr->sr_ext_len = op->extent.sparse_ext_cnt;
+ op->extent.sparse_ext_cnt = 0;
+ sr->sr_extent = op->extent.sparse_ext;
+ op->extent.sparse_ext = NULL;
+
+ spin_unlock(&o->o_requests_lock);
+ return 1;
+}
+
+/*
+ * Convert the first sr_count entries of sr->sr_extent from little-endian
+ * to host byte order in place.  Compiles to an empty function unless
+ * __BIG_ENDIAN is defined.
+ */
+static inline void convert_extent_map(struct ceph_sparse_read *sr)
+{
+#ifdef __BIG_ENDIAN
+	int idx;
+
+	for (idx = 0; idx < sr->sr_count; idx++) {
+		struct ceph_sparse_extent *cur = sr->sr_extent + idx;
+
+		cur->off = le64_to_cpu((__force __le64)cur->off);
+		cur->len = le64_to_cpu((__force __le64)cur->len);
+	}
+#endif
+}
+
+/*
+ * Sparse-read state machine, driven by sr->sr_state.
+ *
+ * For the header pieces of a reply (extent count, extent array, data
+ * length) a call returns the size of that piece and points *pbuf at the
+ * field or array that should receive it.  For extent data it returns the
+ * extent length, sets *pbuf to NULL and stores the same length in
+ * cursor->sr_resid.  Presumably the caller reads that many bytes into
+ * *pbuf (or into @cursor when *pbuf is NULL) before calling again --
+ * confirm against the messenger code.
+ *
+ * Returns 0 or a negative error straight from prep_next_sparse_read()
+ * when it reports no further sparse-read op or a missing request,
+ * -ENOMEM if the extent array cannot be allocated, and -EREMOTEIO if the
+ * advertised data length differs from the sum of the extent lengths or
+ * an extent is longer than INT_MAX.
+ */
+static int osd_sparse_read(struct ceph_connection *con,
+ struct ceph_msg_data_cursor *cursor,
+ char **pbuf)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_sparse_read *sr = &o->o_sparse_read;
+ /* host-endian only once the EXTENTS state has converted sr_count */
+ u32 count = sr->sr_count;
+ u64 eoff, elen, len = 0;
+ int i, ret;
+
+ switch (sr->sr_state) {
+ case CEPH_SPARSE_READ_HDR:
+next_op:
+ ret = prep_next_sparse_read(con, cursor);
+ if (ret <= 0)
+ return ret;
+
+ /* number of extents */
+ ret = sizeof(sr->sr_count);
+ *pbuf = (char *)&sr->sr_count;
+ sr->sr_state = CEPH_SPARSE_READ_EXTENTS;
+ break;
+ case CEPH_SPARSE_READ_EXTENTS:
+ /* Convert sr_count to host-endian */
+ count = le32_to_cpu((__force __le32)sr->sr_count);
+ sr->sr_count = count;
+ dout("[%d] got %u extents\n", o->o_osd, count);
+
+ if (count > 0) {
+ if (!sr->sr_extent || count > sr->sr_ext_len) {
+ /* no extent array provided, or too short */
+ kfree(sr->sr_extent);
+ sr->sr_extent = kmalloc_array(count,
+ sizeof(*sr->sr_extent),
+ GFP_NOIO);
+ if (!sr->sr_extent) {
+ pr_err("%s: failed to allocate %u extents\n",
+ __func__, count);
+ return -ENOMEM;
+ }
+ sr->sr_ext_len = count;
+ }
+ /* ask for the whole extent array next */
+ ret = count * sizeof(*sr->sr_extent);
+ *pbuf = (char *)sr->sr_extent;
+ sr->sr_state = CEPH_SPARSE_READ_DATA_LEN;
+ break;
+ }
+ /* No extents? Read data len */
+ fallthrough;
+ case CEPH_SPARSE_READ_DATA_LEN:
+ /* fix up extent byte order (no-op on little-endian builds), then ask for the data length */
+ convert_extent_map(sr);
+ ret = sizeof(sr->sr_datalen);
+ *pbuf = (char *)&sr->sr_datalen;
+ sr->sr_state = CEPH_SPARSE_READ_DATA_PRE;
+ break;
+ case CEPH_SPARSE_READ_DATA_PRE:
+ /* Convert sr_datalen to host-endian */
+ sr->sr_datalen = le32_to_cpu((__force __le32)sr->sr_datalen);
+ /* the data length must equal the sum of all extent lengths */
+ for (i = 0; i < count; i++)
+ len += sr->sr_extent[i].len;
+ if (sr->sr_datalen != len) {
+ pr_warn_ratelimited("data len %u != extent len %llu\n",
+ sr->sr_datalen, len);
+ return -EREMOTEIO;
+ }
+ sr->sr_state = CEPH_SPARSE_READ_DATA;
+ fallthrough;
+ case CEPH_SPARSE_READ_DATA:
+ /* every extent of this op has been handed out: move on to the next op */
+ if (sr->sr_index >= count) {
+ sr->sr_state = CEPH_SPARSE_READ_HDR;
+ goto next_op;
+ }
+
+ eoff = sr->sr_extent[sr->sr_index].off;
+ elen = sr->sr_extent[sr->sr_index].len;
+
+ dout("[%d] ext %d off 0x%llx len 0x%llx\n",
+ o->o_osd, sr->sr_index, eoff, elen);
+
+ /* elen is returned through an int below, so it must fit */
+ if (elen > INT_MAX) {
+ dout("Sparse read extent length too long (0x%llx)\n",
+ elen);
+ return -EREMOTEIO;
+ }
+
+ /* zero out anything from sr_pos to start of extent */
+ if (sr->sr_pos < eoff)
+ advance_cursor(cursor, eoff - sr->sr_pos, true);
+
+ /* Set position to end of extent */
+ sr->sr_pos = eoff + elen;
+
+ /* send back the new length and nullify the ptr */
+ cursor->sr_resid = elen;
+ ret = elen;
+ *pbuf = NULL;
+
+ /* Bump the array index */
+ ++sr->sr_index;
+ break;
+ }
+ return ret;
+}
+
static const struct ceph_connection_operations osd_con_ops = {
- .get = get_osd_con,
- .put = put_osd_con,
- .dispatch = dispatch,
- .get_authorizer = get_authorizer,
- .add_authorizer_challenge = add_authorizer_challenge,
- .verify_authorizer_reply = verify_authorizer_reply,
- .invalidate_authorizer = invalidate_authorizer,
- .alloc_msg = alloc_msg,
+ .get = osd_get_con,
+ .put = osd_put_con,
+ .sparse_read = osd_sparse_read,
+ .alloc_msg = osd_alloc_msg,
+ .dispatch = osd_dispatch,
+ .fault = osd_fault,
.reencode_message = osd_reencode_message,
+ .get_authorizer = osd_get_authorizer,
+ .add_authorizer_challenge = osd_add_authorizer_challenge,
+ .verify_authorizer_reply = osd_verify_authorizer_reply,
+ .invalidate_authorizer = osd_invalidate_authorizer,
.sign_message = osd_sign_message,
.check_message_signature = osd_check_message_signature,
- .fault = osd_fault,
+ .get_auth_request = osd_get_auth_request,
+ .handle_auth_reply_more = osd_handle_auth_reply_more,
+ .handle_auth_done = osd_handle_auth_done,
+ .handle_auth_bad_method = osd_handle_auth_bad_method,
};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 98c0ff3d6441..d245fa508e1c 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -11,6 +11,22 @@
#include <linux/crush/hash.h>
#include <linux/crush/mapper.h>
+/*
+ * printk(KERN_INFO ...) wrapper that prefixes the caller's message with
+ * the module name, the map's fsid and its epoch.  @fmt and the variadic
+ * arguments are forwarded through %pV.
+ */
+static __printf(2, 3)
+void osdmap_info(const struct ceph_osdmap *map, const char *fmt, ...)
+{
+	va_list ap;
+	struct va_format msg = { .fmt = fmt };
+
+	va_start(ap, fmt);
+	msg.va = &ap;
+	printk(KERN_INFO "%s (%pU e%u): %pV", KBUILD_MODNAME, &map->fsid,
+	       map->epoch, &msg);
+	va_end(ap);
+}
+
char *ceph_osdmap_state_str(char *str, int len, u32 state)
{
if (!len)
@@ -138,6 +154,79 @@ bad:
return -EINVAL;
}
+/*
+ * One id -> name entry of a CRUSH name map, kept in an rb-tree.
+ */
+struct crush_name_node {
+ struct rb_node cn_node; /* rb-tree linkage */
+ int cn_id; /* lookup key */
+ char cn_name[]; /* NUL-terminated name, allocated inline */
+};
+
+/*
+ * Allocate a crush_name_node with room for a name of @name_len bytes
+ * plus its NUL terminator.  The rb node is marked as unlinked; cn_id and
+ * cn_name are left for the caller to fill in.  Returns NULL on
+ * allocation failure.
+ */
+static struct crush_name_node *alloc_crush_name(size_t name_len)
+{
+	struct crush_name_node *node;
+
+	node = kmalloc(sizeof(*node) + name_len + 1, GFP_NOIO);
+	if (node)
+		RB_CLEAR_NODE(&node->cn_node);
+
+	return node;
+}
+
+/* Free @cn; warns if it is still linked into an rb-tree. */
+static void free_crush_name(struct crush_name_node *cn)
+{
+	struct rb_node *link = &cn->cn_node;
+
+	WARN_ON(!RB_EMPTY_NODE(link));
+	kfree(cn);
+}
+
+/*
+ * rb-tree helpers for crush_name_node, keyed by cn_id and linked through
+ * cn_node.  This file relies on __insert_crush_name(), lookup_crush_name()
+ * and erase_crush_name(), which are presumably generated here -- confirm
+ * against the DEFINE_RB_FUNCS definition.
+ */
+DEFINE_RB_FUNCS(crush_name, struct crush_name_node, cn_id, cn_node)
+
+/*
+ * Decode an id -> name map into @root.  The encoding read here is a
+ * 32-bit entry count followed, per entry, by a 32-bit id, a 32-bit name
+ * length and that many name bytes.
+ *
+ * Returns 0 on success, -EINVAL if the buffer is too short, -ENOMEM on
+ * allocation failure, or -EEXIST if __insert_crush_name() rejects an
+ * entry.  Entries inserted before a failure stay in @root.
+ */
+static int decode_crush_names(void **p, void *end, struct rb_root *root)
+{
+	u32 remaining;
+
+	ceph_decode_32_safe(p, end, remaining, e_inval);
+	for (; remaining; remaining--) {
+		struct crush_name_node *node;
+		u32 len;
+		int id;
+
+		ceph_decode_32_safe(p, end, id, e_inval);
+		ceph_decode_32_safe(p, end, len, e_inval);
+		ceph_decode_need(p, end, len, e_inval);
+
+		node = alloc_crush_name(len);
+		if (!node)
+			return -ENOMEM;
+
+		/* copy the name out of the buffer and terminate it */
+		node->cn_id = id;
+		memcpy(node->cn_name, *p, len);
+		node->cn_name[len] = '\0';
+		*p += len;
+
+		if (__insert_crush_name(root, node))
+			continue;
+
+		free_crush_name(node);
+		return -EEXIST;
+	}
+
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+/* Erase and free every crush_name_node in @root, leaving it empty. */
+void clear_crush_names(struct rb_root *root)
+{
+	struct rb_node *first;
+
+	while ((first = rb_first(root)) != NULL) {
+		struct crush_name_node *node =
+			rb_entry(first, struct crush_name_node, cn_node);
+
+		erase_crush_name(root, node);
+		free_crush_name(node);
+	}
+}
+
static struct crush_choose_arg_map *alloc_choose_arg_map(void)
{
struct crush_choose_arg_map *arg_map;
@@ -354,6 +443,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
if (c == NULL)
return ERR_PTR(-ENOMEM);
+ c->type_names = RB_ROOT;
+ c->names = RB_ROOT;
c->choose_args = RB_ROOT;
/* set tunables to default values */
@@ -495,12 +586,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
/ sizeof(struct crush_rule_step))
goto bad;
#endif
- r = c->rules[i] = kmalloc(sizeof(*r) +
- yes*sizeof(struct crush_rule_step),
- GFP_NOFS);
+ r = kmalloc(struct_size(r, steps, yes), GFP_NOFS);
if (r == NULL)
goto badmem;
dout(" rule %d is at %p\n", i, r);
+ c->rules[i] = r;
r->len = yes;
ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
@@ -511,8 +601,14 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
}
}
- ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */
- ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */
+ err = decode_crush_names(p, end, &c->type_names);
+ if (err)
+ goto fail;
+
+ err = decode_crush_names(p, end, &c->names);
+ if (err)
+ goto fail;
+
ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */
/* tunables */
@@ -637,48 +733,11 @@ DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare,
/*
* rbtree of pg pool info
*/
-static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct ceph_pg_pool_info *pi = NULL;
-
- while (*p) {
- parent = *p;
- pi = rb_entry(parent, struct ceph_pg_pool_info, node);
- if (new->id < pi->id)
- p = &(*p)->rb_left;
- else if (new->id > pi->id)
- p = &(*p)->rb_right;
- else
- return -EEXIST;
- }
-
- rb_link_node(&new->node, parent, p);
- rb_insert_color(&new->node, root);
- return 0;
-}
-
-static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id)
-{
- struct ceph_pg_pool_info *pi;
- struct rb_node *n = root->rb_node;
-
- while (n) {
- pi = rb_entry(n, struct ceph_pg_pool_info, node);
- if (id < pi->id)
- n = n->rb_left;
- else if (id > pi->id)
- n = n->rb_right;
- else
- return pi;
- }
- return NULL;
-}
+DEFINE_RB_FUNCS(pg_pool, struct ceph_pg_pool_info, id, node)
struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id)
{
- return __lookup_pg_pool(&map->pg_pools, id);
+ return lookup_pg_pool(&map->pg_pools, id);
}
const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
@@ -691,8 +750,7 @@ const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id)
if (WARN_ON_ONCE(id > (u64) INT_MAX))
return NULL;
- pi = __lookup_pg_pool(&map->pg_pools, (int) id);
-
+ pi = lookup_pg_pool(&map->pg_pools, id);
return pi ? pi->name : NULL;
}
EXPORT_SYMBOL(ceph_pg_pool_name_by_id);
@@ -711,9 +769,18 @@ int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
}
EXPORT_SYMBOL(ceph_pg_poolid_by_name);
+/*
+ * Return the flags of pool @id in @map, or 0 if lookup_pg_pool() finds
+ * no such pool.
+ */
+u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id)
+{
+	struct ceph_pg_pool_info *pool = lookup_pg_pool(&map->pg_pools, id);
+
+	if (!pool)
+		return 0;
+
+	return pool->flags;
+}
+EXPORT_SYMBOL(ceph_pg_pool_flags);
+
static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
{
- rb_erase(&pi->node, root);
+ erase_pg_pool(root, pi);
kfree(pi->name);
kfree(pi);
}
@@ -895,7 +962,7 @@ static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
ceph_decode_32_safe(p, end, len, bad);
dout(" pool %llu len %d\n", pool, len);
ceph_decode_need(p, end, len, bad);
- pi = __lookup_pg_pool(&map->pg_pools, pool);
+ pi = lookup_pg_pool(&map->pg_pools, pool);
if (pi) {
char *name = kstrndup(*p, len, GFP_NOFS);
@@ -914,6 +981,143 @@ bad:
}
/*
+ * CRUSH workspaces
+ *
+ * workspace_manager framework borrowed from fs/btrfs/compression.c.
+ * Two simplifications: there is only one type of workspace and there
+ * is always at least one workspace.
+ */
+/*
+ * Allocate and initialize a CRUSH workspace sized by
+ * crush_work_size(c, CEPH_PG_MAX_SIZE).  Warns if c->working_size is
+ * zero.  Returns NULL on allocation failure.
+ */
+static struct crush_work *alloc_workspace(const struct crush_map *c)
+{
+	struct crush_work *ws;
+	size_t nbytes;
+
+	WARN_ON(!c->working_size);
+	nbytes = crush_work_size(c, CEPH_PG_MAX_SIZE);
+	dout("%s work_size %zu bytes\n", __func__, nbytes);
+
+	ws = kvmalloc(nbytes, GFP_NOIO);
+	if (ws) {
+		INIT_LIST_HEAD(&ws->item);
+		crush_init_workspace(c, ws);
+	}
+
+	return ws;
+}
+
+/* Free a workspace; warns if it is still linked on a list. */
+static void free_workspace(struct crush_work *work)
+{
+	struct list_head *link = &work->item;
+
+	WARN_ON(!list_empty(link));
+	kvfree(work);
+}
+
+/*
+ * Put @wsm into its empty state: initialized lock and wait queue, no
+ * idle workspaces and both counters at zero.
+ */
+static void init_workspace_manager(struct workspace_manager *wsm)
+{
+	spin_lock_init(&wsm->ws_lock);
+	init_waitqueue_head(&wsm->ws_wait);
+
+	INIT_LIST_HEAD(&wsm->idle_ws);
+	wsm->free_ws = 0;
+	atomic_set(&wsm->total_ws, 0);
+}
+
+/*
+ * Seed a manager with its first workspace: @work is put on the idle
+ * list and both counters are set to 1.  Warns if the idle list is not
+ * empty on entry.
+ */
+static void add_initial_workspace(struct workspace_manager *wsm,
+				  struct crush_work *work)
+{
+	struct list_head *idle = &wsm->idle_ws;
+
+	WARN_ON(!list_empty(idle));
+
+	list_add(&work->item, idle);
+	wsm->free_ws = 1;
+	atomic_set(&wsm->total_ws, 1);
+}
+
+/*
+ * Free every workspace on the idle list and reset both counters to
+ * zero.  Takes no lock.
+ */
+static void cleanup_workspace_manager(struct workspace_manager *wsm)
+{
+	for (;;) {
+		struct crush_work *ws;
+
+		if (list_empty(&wsm->idle_ws))
+			break;
+
+		ws = list_first_entry(&wsm->idle_ws, struct crush_work,
+				      item);
+		list_del_init(&ws->item);
+		free_workspace(ws);
+	}
+
+	wsm->free_ws = 0;
+	atomic_set(&wsm->total_ws, 0);
+}
+
+/*
+ * Finds an available workspace or allocates a new one. If it's not
+ * possible to allocate a new one, waits until there is one.
+ *
+ * An idle workspace is preferred.  Otherwise, if total_ws already
+ * exceeds the number of online CPUs (sampled once on entry), the caller
+ * sleeps on ws_wait and retries; if not, a new workspace is allocated.
+ * Every path either returns a workspace or loops back to "again", so
+ * NULL is never returned.
+ */
+static struct crush_work *get_workspace(struct workspace_manager *wsm,
+ const struct crush_map *c)
+{
+ struct crush_work *work;
+ int cpus = num_online_cpus();
+
+again:
+ spin_lock(&wsm->ws_lock);
+ /* fast path: take the first idle workspace */
+ if (!list_empty(&wsm->idle_ws)) {
+ work = list_first_entry(&wsm->idle_ws, struct crush_work,
+ item);
+ list_del_init(&work->item);
+ wsm->free_ws--;
+ spin_unlock(&wsm->ws_lock);
+ return work;
+
+ }
+ if (atomic_read(&wsm->total_ws) > cpus) {
+ DEFINE_WAIT(wait);
+
+ spin_unlock(&wsm->ws_lock);
+ prepare_to_wait(&wsm->ws_wait, &wait, TASK_UNINTERRUPTIBLE);
+ /* re-check after queueing on ws_wait; note free_ws is read without ws_lock here */
+ if (atomic_read(&wsm->total_ws) > cpus && !wsm->free_ws)
+ schedule();
+ finish_wait(&wsm->ws_wait, &wait);
+ goto again;
+ }
+ /* account for the new workspace before dropping the lock; undone below if allocation fails */
+ atomic_inc(&wsm->total_ws);
+ spin_unlock(&wsm->ws_lock);
+
+ work = alloc_workspace(c);
+ if (!work) {
+ atomic_dec(&wsm->total_ws);
+ wake_up(&wsm->ws_wait);
+
+ /*
+ * Do not return the error but go back to waiting. We
+ * have the initial workspace and the CRUSH computation
+ * time is bounded so we will get it eventually.
+ */
+ WARN_ON(atomic_read(&wsm->total_ws) < 1);
+ goto again;
+ }
+ return work;
+}
+
+/*
+ * Return a workspace.  While free_ws does not exceed the number of
+ * online CPUs the workspace goes back on the idle list; otherwise it is
+ * freed and total_ws is decremented.  In both cases a sleeper on
+ * ws_wait, if there is one, is woken up.
+ */
+static void put_workspace(struct workspace_manager *wsm,
+			  struct crush_work *work)
+{
+	bool keep;
+
+	spin_lock(&wsm->ws_lock);
+	keep = wsm->free_ws <= num_online_cpus();
+	if (keep) {
+		list_add(&work->item, &wsm->idle_ws);
+		wsm->free_ws++;
+	}
+	spin_unlock(&wsm->ws_lock);
+
+	/* surplus workspaces are released outside the lock */
+	if (!keep) {
+		free_workspace(work);
+		atomic_dec(&wsm->total_ws);
+	}
+
+	if (wq_has_sleeper(&wsm->ws_wait))
+		wake_up(&wsm->ws_wait);
+}
+
+/*
* osd map
*/
struct ceph_osdmap *ceph_osdmap_alloc(void)
@@ -930,7 +1134,8 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
map->primary_temp = RB_ROOT;
map->pg_upmap = RB_ROOT;
map->pg_upmap_items = RB_ROOT;
- mutex_init(&map->crush_workspace_mutex);
+
+ init_workspace_manager(&map->crush_wsm);
return map;
}
@@ -938,8 +1143,11 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
void ceph_osdmap_destroy(struct ceph_osdmap *map)
{
dout("osdmap_destroy %p\n", map);
+
if (map->crush)
crush_destroy(map->crush);
+ cleanup_workspace_manager(&map->crush_wsm);
+
while (!RB_EMPTY_ROOT(&map->pg_temp)) {
struct ceph_pg_mapping *pg =
rb_entry(rb_first(&map->pg_temp),
@@ -974,11 +1182,10 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
struct ceph_pg_pool_info, node);
__remove_pg_pool(&map->pg_pools, pi);
}
- kfree(map->osd_state);
- kfree(map->osd_weight);
- kfree(map->osd_addr);
- kfree(map->osd_primary_affinity);
- kfree(map->crush_workspace);
+ kvfree(map->osd_state);
+ kvfree(map->osd_weight);
+ kvfree(map->osd_addr);
+ kvfree(map->osd_primary_affinity);
kfree(map);
}
@@ -987,28 +1194,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
*
* The new elements are properly initialized.
*/
-static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
{
u32 *state;
u32 *weight;
struct ceph_entity_addr *addr;
+ u32 to_copy;
int i;
- state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS);
- if (!state)
- return -ENOMEM;
- map->osd_state = state;
+ dout("%s old %u new %u\n", __func__, map->max_osd, max);
+ if (max == map->max_osd)
+ return 0;
- weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS);
- if (!weight)
+ state = kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS);
+ weight = kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS);
+ addr = kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS);
+ if (!state || !weight || !addr) {
+ kvfree(state);
+ kvfree(weight);
+ kvfree(addr);
return -ENOMEM;
- map->osd_weight = weight;
+ }
- addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS);
- if (!addr)
- return -ENOMEM;
- map->osd_addr = addr;
+ to_copy = min(map->max_osd, max);
+ if (map->osd_state) {
+ memcpy(state, map->osd_state, to_copy * sizeof(*state));
+ memcpy(weight, map->osd_weight, to_copy * sizeof(*weight));
+ memcpy(addr, map->osd_addr, to_copy * sizeof(*addr));
+ kvfree(map->osd_state);
+ kvfree(map->osd_weight);
+ kvfree(map->osd_addr);
+ }
+ map->osd_state = state;
+ map->osd_weight = weight;
+ map->osd_addr = addr;
for (i = map->max_osd; i < max; i++) {
map->osd_state[i] = 0;
map->osd_weight[i] = CEPH_OSD_OUT;
@@ -1018,12 +1238,16 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
if (map->osd_primary_affinity) {
u32 *affinity;
- affinity = krealloc(map->osd_primary_affinity,
- max*sizeof(*affinity), GFP_NOFS);
+ affinity = kvmalloc(array_size(max, sizeof(*affinity)),
+ GFP_NOFS);
if (!affinity)
return -ENOMEM;
- map->osd_primary_affinity = affinity;
+ memcpy(affinity, map->osd_primary_affinity,
+ to_copy * sizeof(*affinity));
+ kvfree(map->osd_primary_affinity);
+
+ map->osd_primary_affinity = affinity;
for (i = map->max_osd; i < max; i++)
map->osd_primary_affinity[i] =
CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
@@ -1036,26 +1260,22 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
{
- void *workspace;
- size_t work_size;
+ struct crush_work *work;
if (IS_ERR(crush))
return PTR_ERR(crush);
- work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
- dout("%s work_size %zu bytes\n", __func__, work_size);
- workspace = kmalloc(work_size, GFP_NOIO);
- if (!workspace) {
+ work = alloc_workspace(crush);
+ if (!work) {
crush_destroy(crush);
return -ENOMEM;
}
- crush_init_workspace(crush, workspace);
if (map->crush)
crush_destroy(map->crush);
- kfree(map->crush_workspace);
+ cleanup_workspace_manager(&map->crush_wsm);
map->crush = crush;
- map->crush_workspace = workspace;
+ add_initial_workspace(&map->crush_wsm, work);
return 0;
}
@@ -1105,7 +1325,7 @@ static int get_osdmap_client_data_v(void **p, void *end,
return -EINVAL;
}
- /* old osdmap enconding */
+ /* old osdmap encoding */
struct_v = 0;
}
@@ -1129,18 +1349,18 @@ static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
ceph_decode_64_safe(p, end, pool, e_inval);
- pi = __lookup_pg_pool(&map->pg_pools, pool);
+ pi = lookup_pg_pool(&map->pg_pools, pool);
if (!incremental || !pi) {
pi = kzalloc(sizeof(*pi), GFP_NOFS);
if (!pi)
return -ENOMEM;
+ RB_CLEAR_NODE(&pi->node);
pi->id = pool;
- ret = __insert_pg_pool(&map->pg_pools, pi);
- if (ret) {
+ if (!__insert_pg_pool(&map->pg_pools, pi)) {
kfree(pi);
- return ret;
+ return -EEXIST;
}
}
@@ -1284,8 +1504,6 @@ static int decode_new_primary_temp(void **p, void *end,
u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
{
- BUG_ON(osd >= map->max_osd);
-
if (!map->osd_primary_affinity)
return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
@@ -1294,14 +1512,12 @@ u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
{
- BUG_ON(osd >= map->max_osd);
-
if (!map->osd_primary_affinity) {
int i;
- map->osd_primary_affinity = kmalloc_array(map->max_osd,
- sizeof(u32),
- GFP_NOFS);
+ map->osd_primary_affinity = kvmalloc(
+ array_size(map->max_osd, sizeof(*map->osd_primary_affinity)),
+ GFP_NOFS);
if (!map->osd_primary_affinity)
return -ENOMEM;
@@ -1322,7 +1538,7 @@ static int decode_primary_affinity(void **p, void *end,
ceph_decode_32_safe(p, end, len, e_inval);
if (len == 0) {
- kfree(map->osd_primary_affinity);
+ kvfree(map->osd_primary_affinity);
map->osd_primary_affinity = NULL;
return 0;
}
@@ -1357,12 +1573,14 @@ static int decode_new_primary_affinity(void **p, void *end,
ceph_decode_32_safe(p, end, osd, e_inval);
ceph_decode_32_safe(p, end, aff, e_inval);
+ if (osd >= map->max_osd)
+ goto e_inval;
ret = set_primary_affinity(map, osd, aff);
if (ret)
return ret;
- pr_info("osd%d primary-affinity 0x%x\n", osd, aff);
+ osdmap_info(map, "osd%d primary-affinity 0x%x\n", osd, aff);
}
return 0;
@@ -1443,7 +1661,8 @@ static int decode_old_pg_upmap_items(void **p, void *end,
/*
* decode a full map.
*/
-static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
+static int osdmap_decode(void **p, void *end, bool msgr2,
+ struct ceph_osdmap *map)
{
u8 struct_v;
u32 epoch = 0;
@@ -1490,11 +1709,9 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
/* osd_state, osd_weight, osd_addrs->client_addr */
ceph_decode_need(p, end, 3*sizeof(u32) +
- map->max_osd*((struct_v >= 5 ? sizeof(u32) :
- sizeof(u8)) +
- sizeof(*map->osd_weight) +
- sizeof(*map->osd_addr)), e_inval);
-
+ map->max_osd*(struct_v >= 5 ? sizeof(u32) :
+ sizeof(u8)) +
+ sizeof(*map->osd_weight), e_inval);
if (ceph_decode_32(p) != map->max_osd)
goto e_inval;
@@ -1515,9 +1732,18 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
if (ceph_decode_32(p) != map->max_osd)
goto e_inval;
- ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
- for (i = 0; i < map->max_osd; i++)
- ceph_decode_addr(&map->osd_addr[i]);
+ for (i = 0; i < map->max_osd; i++) {
+ struct ceph_entity_addr *addr = &map->osd_addr[i];
+
+ if (struct_v >= 8)
+ err = ceph_decode_entity_addrvec(p, end, msgr2, addr);
+ else
+ err = ceph_decode_entity_addr(p, end, addr);
+ if (err)
+ goto bad;
+
+ dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr));
+ }
/* pg_temp */
err = decode_pg_temp(p, end, map);
@@ -1586,7 +1812,7 @@ bad:
/*
* Allocate and decode a full map.
*/
-struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
+struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2)
{
struct ceph_osdmap *map;
int ret;
@@ -1595,7 +1821,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
if (!map)
return ERR_PTR(-ENOMEM);
- ret = osdmap_decode(p, end, map);
+ ret = osdmap_decode(p, end, msgr2, map);
if (ret) {
ceph_osdmap_destroy(map);
return ERR_PTR(ret);
@@ -1613,18 +1839,28 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
* new_state: { osd=6, xorstate=EXISTS } # clear osd_state
*/
static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
- struct ceph_osdmap *map)
+ bool msgr2, struct ceph_osdmap *map)
{
void *new_up_client;
void *new_state;
void *new_weight_end;
u32 len;
+ int ret;
+ int i;
new_up_client = *p;
ceph_decode_32_safe(p, end, len, e_inval);
- len *= sizeof(u32) + sizeof(struct ceph_entity_addr);
- ceph_decode_need(p, end, len, e_inval);
- *p += len;
+ for (i = 0; i < len; ++i) {
+ struct ceph_entity_addr addr;
+
+ ceph_decode_skip_32(p, end, e_inval);
+ if (struct_v >= 7)
+ ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+ else
+ ret = ceph_decode_entity_addr(p, end, &addr);
+ if (ret)
+ return ret;
+ }
new_state = *p;
ceph_decode_32_safe(p, end, len, e_inval);
@@ -1641,10 +1877,12 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
ceph_decode_need(p, end, 2*sizeof(u32), e_inval);
osd = ceph_decode_32(p);
w = ceph_decode_32(p);
- BUG_ON(osd >= map->max_osd);
- pr_info("osd%d weight 0x%x %s\n", osd, w,
- w == CEPH_OSD_IN ? "(in)" :
- (w == CEPH_OSD_OUT ? "(out)" : ""));
+ if (osd >= map->max_osd)
+ goto e_inval;
+
+ osdmap_info(map, "osd%d weight 0x%x %s\n", osd, w,
+ w == CEPH_OSD_IN ? "(in)" :
+ (w == CEPH_OSD_OUT ? "(out)" : ""));
map->osd_weight[osd] = w;
/*
@@ -1665,22 +1903,23 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
while (len--) {
s32 osd;
u32 xorstate;
- int ret;
osd = ceph_decode_32(p);
+ if (osd >= map->max_osd)
+ goto e_inval;
+
if (struct_v >= 5)
xorstate = ceph_decode_32(p);
else
xorstate = ceph_decode_8(p);
if (xorstate == 0)
xorstate = CEPH_OSD_UP;
- BUG_ON(osd >= map->max_osd);
if ((map->osd_state[osd] & CEPH_OSD_UP) &&
(xorstate & CEPH_OSD_UP))
- pr_info("osd%d down\n", osd);
+ osdmap_info(map, "osd%d down\n", osd);
if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
(xorstate & CEPH_OSD_EXISTS)) {
- pr_info("osd%d does not exist\n", osd);
+ osdmap_info(map, "osd%d does not exist\n", osd);
ret = set_primary_affinity(map, osd,
CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
if (ret)
@@ -1700,10 +1939,19 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
struct ceph_entity_addr addr;
osd = ceph_decode_32(p);
- ceph_decode_copy(p, &addr, sizeof(addr));
- ceph_decode_addr(&addr);
- BUG_ON(osd >= map->max_osd);
- pr_info("osd%d up\n", osd);
+ if (osd >= map->max_osd)
+ goto e_inval;
+
+ if (struct_v >= 7)
+ ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr);
+ else
+ ret = ceph_decode_entity_addr(p, end, &addr);
+ if (ret)
+ return ret;
+
+ dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr));
+
+ osdmap_info(map, "osd%d up\n", osd);
map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
map->osd_addr[osd] = addr;
}
@@ -1718,7 +1966,7 @@ e_inval:
/*
* decode and apply an incremental map update.
*/
-struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2,
struct ceph_osdmap *map)
{
struct ceph_fsid fsid;
@@ -1753,7 +2001,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
if (len > 0) {
dout("apply_incremental full map len %d, %p to %p\n",
len, *p, end);
- return ceph_osdmap_decode(p, min(*p+len, end));
+ return ceph_osdmap_decode(p, min(*p+len, end), msgr2);
}
/* new crush? */
@@ -1799,13 +2047,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
struct ceph_pg_pool_info *pi;
ceph_decode_64_safe(p, end, pool, e_inval);
- pi = __lookup_pg_pool(&map->pg_pools, pool);
+ pi = lookup_pg_pool(&map->pg_pools, pool);
if (pi)
__remove_pg_pool(&map->pg_pools, pi);
}
/* new_up_client, new_state, new_weight */
- err = decode_new_up_state_weight(p, end, struct_v, map);
+ err = decode_new_up_state_weight(p, end, struct_v, msgr2, map);
if (err)
goto bad;
@@ -2249,6 +2497,7 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
s64 choose_args_index)
{
struct crush_choose_arg_map *arg_map;
+ struct crush_work *work;
int r;
BUG_ON(result_max > CEPH_PG_MAX_SIZE);
@@ -2259,12 +2508,11 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
arg_map = lookup_choose_arg_map(&map->crush->choose_args,
CEPH_DEFAULT_CHOOSE_ARGS);
- mutex_lock(&map->crush_workspace_mutex);
+ work = get_workspace(&map->crush_wsm, map->crush);
r = crush_do_rule(map->crush, ruleno, x, result, result_max,
- weight, weight_max, map->crush_workspace,
+ weight, weight_max, work,
arg_map ? arg_map->args : NULL);
- mutex_unlock(&map->crush_workspace_mutex);
-
+ put_workspace(&map->crush_wsm, work);
return r;
}
@@ -2642,3 +2890,221 @@ int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
return acting.primary;
}
EXPORT_SYMBOL(ceph_pg_to_acting_primary);
+
+/*
+ * Allocate a crush_loc_node with trailing space for a type name of
+ * @type_name_len bytes and a name of @name_len bytes, each with its own
+ * NUL terminator (hence the extra two bytes).  The rb node is marked as
+ * unlinked.  Returns NULL on allocation failure.
+ */
+static struct crush_loc_node *alloc_crush_loc(size_t type_name_len,
+					      size_t name_len)
+{
+	size_t data_len = type_name_len + name_len + 2;
+	struct crush_loc_node *node;
+
+	node = kmalloc(sizeof(*node) + data_len, GFP_NOIO);
+	if (node)
+		RB_CLEAR_NODE(&node->cl_node);
+
+	return node;
+}
+
+/* Free @loc; warns if it is still linked into an rb-tree. */
+static void free_crush_loc(struct crush_loc_node *loc)
+{
+	struct rb_node *link = &loc->cl_node;
+
+	WARN_ON(!RB_EMPTY_NODE(link));
+	kfree(loc);
+}
+
+/*
+ * Order two locations by type name first and by bucket name second,
+ * both with strcmp().  Returns <0, 0 or >0.
+ */
+static int crush_loc_compare(const struct crush_loc *loc1,
+			     const struct crush_loc *loc2)
+{
+	int diff = strcmp(loc1->cl_type_name, loc2->cl_type_name);
+
+	if (diff)
+		return diff;
+
+	return strcmp(loc1->cl_name, loc2->cl_name);
+}
+
+/*
+ * rb-tree helpers for crush_loc_node, keyed by cl_loc, ordered with
+ * crush_loc_compare() and linked through cl_node.  This file relies on
+ * __insert_crush_loc(), lookup_crush_loc() and erase_crush_loc(), which
+ * are presumably generated here -- confirm against the DEFINE_RB_FUNCS2
+ * definition.
+ */
+DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare,
+ RB_BYPTR, const struct crush_loc *, cl_node)
+
+/*
+ * Parses a set of <bucket type name>':'<bucket name> pairs separated
+ * by '|', e.g. "rack:foo1|rack:foo2|datacenter:bar", and inserts one
+ * crush_loc_node per pair into @locs.
+ *
+ * Note that @crush_location is modified by strsep().
+ *
+ * Returns 0 on success, -EINVAL for a pair without ':' or with an empty
+ * type name or bucket name, -ENOMEM on allocation failure, or -EEXIST if
+ * __insert_crush_loc() rejects a pair.  Nodes inserted before a failure
+ * are left in @locs.
+ */
+int ceph_parse_crush_location(char *crush_location, struct rb_root *locs)
+{
+	const char *pair;
+
+	dout("%s '%s'\n", __func__, crush_location);
+	for (pair = strsep(&crush_location, "|"); pair;
+	     pair = strsep(&crush_location, "|")) {
+		struct crush_loc_node *node;
+		const char *sep, *bucket;
+		size_t type_len, bucket_len;
+
+		/* the text before the first ':' is the bucket type name */
+		sep = strchr(pair, ':');
+		if (!sep)
+			return -EINVAL;
+
+		type_len = sep - pair;
+		if (!type_len)
+			return -EINVAL;
+
+		bucket = sep + 1;
+		bucket_len = strlen(bucket);
+		if (!bucket_len)
+			return -EINVAL;
+
+		node = alloc_crush_loc(type_len, bucket_len);
+		if (!node)
+			return -ENOMEM;
+
+		/* both strings live back to back in cl_data */
+		node->cl_loc.cl_type_name = node->cl_data;
+		memcpy(node->cl_loc.cl_type_name, pair, type_len);
+		node->cl_loc.cl_type_name[type_len] = '\0';
+
+		node->cl_loc.cl_name = node->cl_data + type_len + 1;
+		memcpy(node->cl_loc.cl_name, bucket, bucket_len);
+		node->cl_loc.cl_name[bucket_len] = '\0';
+
+		if (!__insert_crush_loc(locs, node)) {
+			free_crush_loc(node);
+			return -EEXIST;
+		}
+
+		dout("%s type_name '%s' name '%s'\n", __func__,
+		     node->cl_loc.cl_type_name, node->cl_loc.cl_name);
+	}
+
+	return 0;
+}
+
+/*
+ * Compare two location sets by walking both trees in order.  The result
+ * of the first differing crush_loc_compare() is returned.  If one set
+ * runs out first, the shorter one sorts lower (-1 if @locs1 is shorter,
+ * 1 if @locs2 is).  Returns 0 when both sets are equal.
+ */
+int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2)
+{
+	struct rb_node *a = rb_first(locs1);
+	struct rb_node *b = rb_first(locs2);
+
+	while (a && b) {
+		struct crush_loc_node *la =
+			rb_entry(a, struct crush_loc_node, cl_node);
+		struct crush_loc_node *lb =
+			rb_entry(b, struct crush_loc_node, cl_node);
+		int diff = crush_loc_compare(&la->cl_loc, &lb->cl_loc);
+
+		if (diff)
+			return diff;
+
+		a = rb_next(a);
+		b = rb_next(b);
+	}
+
+	/* at least one side is exhausted here */
+	if (a)
+		return 1;
+	if (b)
+		return -1;
+	return 0;
+}
+
+/* Erase and free every crush_loc_node in @locs, leaving it empty. */
+void ceph_clear_crush_locs(struct rb_root *locs)
+{
+	struct rb_node *first;
+
+	while ((first = rb_first(locs)) != NULL) {
+		struct crush_loc_node *node =
+			rb_entry(first, struct crush_loc_node, cl_node);
+
+		erase_crush_loc(locs, node);
+		free_crush_loc(node);
+	}
+}
+
+/*
+ * [a-zA-Z0-9-_.]+
+ *
+ * The first character is always examined, so the empty string is
+ * rejected.
+ */
+static bool is_valid_crush_name(const char *name)
+{
+	const char *p = name;
+
+	do {
+		char c = *p++;
+		bool allowed = (c >= 'a' && c <= 'z') ||
+			       (c >= 'A' && c <= 'Z') ||
+			       (c >= '0' && c <= '9') ||
+			       c == '-' || c == '_' || c == '.';
+
+		if (!allowed)
+			return false;
+	} while (*p != '\0');
+
+	return true;
+}
+
+/*
+ * Gets the parent of an item.  Returns its id (<0 because the
+ * parent is always a bucket), type id (>0 for the same reason,
+ * via @parent_type_id) and location (via @parent_loc).  If no
+ * parent, returns 0 and leaves @parent_type_id and @parent_loc
+ * untouched.
+ *
+ * Buckets without a name entry, with a name that fails
+ * is_valid_crush_name(), or whose type has no entry in the type
+ * name map are skipped.
+ *
+ * Does a linear search, as there are no parent pointers of any
+ * kind.  Note that the result is ambiguous for items that occur
+ * multiple times in the map.
+ */
+static int get_immediate_parent(struct crush_map *c, int id,
+				u16 *parent_type_id,
+				struct crush_loc *parent_loc)
+{
+	struct crush_bucket *b;
+	struct crush_name_node *type_cn, *cn;
+	int i, j;
+
+	for (i = 0; i < c->max_buckets; i++) {
+		b = c->buckets[i];
+		if (!b)
+			continue;
+
+		/* ignore per-class shadow hierarchy */
+		cn = lookup_crush_name(&c->names, b->id);
+		if (!cn || !is_valid_crush_name(cn->cn_name))
+			continue;
+
+		for (j = 0; j < b->size; j++) {
+			if (b->items[j] != id)
+				continue;
+
+			/*
+			 * The type name map is decoded separately from
+			 * the buckets, so a bucket type may have no name.
+			 * Skip such a bucket instead of dereferencing
+			 * NULL.
+			 */
+			type_cn = lookup_crush_name(&c->type_names, b->type);
+			if (!type_cn)
+				break;
+
+			*parent_type_id = b->type;
+			parent_loc->cl_type_name = type_cn->cn_name;
+			parent_loc->cl_name = cn->cn_name;
+			return b->id;
+		}
+	}
+
+	return 0;  /* no parent */
+}
+
+/*
+ * Calculates the locality/distance from an item to a client
+ * location expressed in terms of CRUSH hierarchy as a set of
+ * (bucket type name, bucket name) pairs.  Specifically, looks
+ * for the lowest-valued bucket type for which the location of
+ * @id matches one of the locations in @locs, so for standard
+ * bucket types (host = 1, rack = 3, datacenter = 8, zone = 9)
+ * a matching host is closer than a matching rack and a matching
+ * data center is closer than a matching zone.
+ *
+ * Specifying multiple locations (a "multipath" location) such
+ * as "rack:foo1|rack:foo2|datacenter:bar" is allowed -- @locs
+ * is a multimap.  The locality will be:
+ *
+ * - 3 for OSDs in racks foo1 and foo2
+ * - 8 for OSDs in data center bar
+ * - -1 for all other OSDs
+ *
+ * The lowest possible bucket type is 1, so the best locality
+ * for an OSD is 1 (i.e. a matching host).  Locality 0 would be
+ * the OSD itself.
+ */
+int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id,
+			    struct rb_root *locs)
+{
+	struct crush_loc parent_loc;
+	u16 parent_type;
+	int cur = id;
+
+	/*
+	 * Climb one parent per iteration.  A single depth-first
+	 * traversal could replace the repeated
+	 * get_immediate_parent() searches.
+	 */
+	do {
+		cur = get_immediate_parent(osdmap->crush, cur, &parent_type,
+					   &parent_loc);
+		if (cur >= 0)
+			return -1;  /* not local */
+	} while (!lookup_crush_loc(locs, &parent_loc));
+
+	return parent_type;
+}
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
index 2ea0564771d2..5a9c4be5f222 100644
--- a/net/ceph/pagelist.c
+++ b/net/ceph/pagelist.c
@@ -6,6 +6,26 @@
#include <linux/highmem.h>
#include <linux/ceph/pagelist.h>
+struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags)
+{ /* Allocate an empty pagelist with @gfp_flags; returns it with refcnt 1, or NULL if kmalloc() fails. */
+ struct ceph_pagelist *pl;
+
+ pl = kmalloc(sizeof(*pl), gfp_flags); /* not zeroed: the fields are set one by one below */
+ if (!pl)
+ return NULL;
+
+ INIT_LIST_HEAD(&pl->head); /* empty list head: nothing appended yet */
+ pl->mapped_tail = NULL;
+ pl->length = 0;
+ pl->room = 0;
+ INIT_LIST_HEAD(&pl->free_list); /* empty list head: nothing reserved yet */
+ pl->num_pages_free = 0;
+ refcount_set(&pl->refcnt, 1); /* the single initial reference goes to the caller */
+
+ return pl;
+}
+EXPORT_SYMBOL(ceph_pagelist_alloc);
+
static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
{
if (pl->mapped_tail) {
@@ -76,7 +96,7 @@ int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
EXPORT_SYMBOL(ceph_pagelist_append);
/* Allocate enough pages for a pagelist to append the given amount
- * of data without without allocating.
+ * of data without allocating.
* Returns: 0 on success, -ENOMEM on error.
*/
int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)
@@ -111,41 +131,3 @@ int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
return 0;
}
EXPORT_SYMBOL(ceph_pagelist_free_reserve);
-
-/* Create a truncation point. */
-void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
- struct ceph_pagelist_cursor *c)
-{
- c->pl = pl;
- c->page_lru = pl->head.prev;
- c->room = pl->room;
-}
-EXPORT_SYMBOL(ceph_pagelist_set_cursor);
-
-/* Truncate a pagelist to the given point. Move extra pages to reserve.
- * This won't sleep.
- * Returns: 0 on success,
- * -EINVAL if the pagelist doesn't match the trunc point pagelist
- */
-int ceph_pagelist_truncate(struct ceph_pagelist *pl,
- struct ceph_pagelist_cursor *c)
-{
- struct page *page;
-
- if (pl != c->pl)
- return -EINVAL;
- ceph_pagelist_unmap_tail(pl);
- while (pl->head.prev != c->page_lru) {
- page = list_entry(pl->head.prev, struct page, lru);
- /* move from pagelist to reserve */
- list_move_tail(&page->lru, &pl->free_list);
- ++pl->num_pages_free;
- }
- pl->room = c->room;
- if (!list_empty(&pl->head)) {
- page = list_entry(pl->head.prev, struct page, lru);
- pl->mapped_tail = kmap(page);
- }
- return 0;
-}
-EXPORT_SYMBOL(ceph_pagelist_truncate);
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index d3736f5bffec..4509757d8b3b 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -10,39 +10,6 @@
#include <linux/ceph/libceph.h>
-/*
- * build a vector of user pages
- */
-struct page **ceph_get_direct_page_vector(const void __user *data,
- int num_pages, bool write_page)
-{
- struct page **pages;
- int got = 0;
- int rc = 0;
-
- pages = kmalloc_array(num_pages, sizeof(*pages), GFP_NOFS);
- if (!pages)
- return ERR_PTR(-ENOMEM);
-
- while (got < num_pages) {
- rc = get_user_pages_fast(
- (unsigned long)data + ((unsigned long)got * PAGE_SIZE),
- num_pages - got, write_page, pages + got);
- if (rc < 0)
- break;
- BUG_ON(rc == 0);
- got += rc;
- }
- if (rc < 0)
- goto fail;
- return pages;
-
-fail:
- ceph_put_page_vector(pages, got, false);
- return ERR_PTR(rc);
-}
-EXPORT_SYMBOL(ceph_get_direct_page_vector);
-
void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty)
{
int i;
@@ -88,58 +55,6 @@ struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
}
EXPORT_SYMBOL(ceph_alloc_page_vector);
-/*
- * copy user data into a page vector
- */
-int ceph_copy_user_to_page_vector(struct page **pages,
- const void __user *data,
- loff_t off, size_t len)
-{
- int i = 0;
- int po = off & ~PAGE_MASK;
- int left = len;
- int l, bad;
-
- while (left > 0) {
- l = min_t(int, PAGE_SIZE-po, left);
- bad = copy_from_user(page_address(pages[i]) + po, data, l);
- if (bad == l)
- return -EFAULT;
- data += l - bad;
- left -= l - bad;
- po += l - bad;
- if (po == PAGE_SIZE) {
- po = 0;
- i++;
- }
- }
- return len;
-}
-EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
-
-void ceph_copy_to_page_vector(struct page **pages,
- const void *data,
- loff_t off, size_t len)
-{
- int i = 0;
- size_t po = off & ~PAGE_MASK;
- size_t left = len;
-
- while (left > 0) {
- size_t l = min_t(size_t, PAGE_SIZE-po, left);
-
- memcpy(page_address(pages[i]) + po, data, l);
- data += l;
- left -= l;
- po += l;
- if (po == PAGE_SIZE) {
- po = 0;
- i++;
- }
- }
-}
-EXPORT_SYMBOL(ceph_copy_to_page_vector);
-
void ceph_copy_from_page_vector(struct page **pages,
void *data,
loff_t off, size_t len)
diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c
index e14a5d038656..e24315937c45 100644
--- a/net/ceph/snapshot.c
+++ b/net/ceph/snapshot.c
@@ -1,21 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* snapshot.c Ceph snapshot context utility routines (part of libceph)
*
* Copyright (C) 2013 Inktank Storage, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA.
*/
#include <linux/types.h>
diff --git a/net/ceph/striper.c b/net/ceph/striper.c
index c36462dc86b7..3b3fa75d1189 100644
--- a/net/ceph/striper.c
+++ b/net/ceph/striper.c
@@ -259,3 +259,20 @@ int ceph_extent_to_file(struct ceph_file_layout *l,
return 0;
}
EXPORT_SYMBOL(ceph_extent_to_file);
+
+u64 ceph_get_num_objects(struct ceph_file_layout *l, u64 size)
+{ /* Returns stripe_count objects per period needed for @size bytes, minus the objects the final partial period does not reach. */
+ u64 period = (u64)l->stripe_count * l->object_size; /* bytes held by one full set of stripe_count objects */
+ u64 num_periods = DIV64_U64_ROUND_UP(size, period); /* rounded up, so a partial last period counts as one; NOTE(review): period == 0 would divide by zero - presumably the layout is validated elsewhere, confirm */
+ u64 remainder_bytes;
+ u64 remainder_objs = 0;
+
+ div64_u64_rem(size, period, &remainder_bytes); /* only the remainder (size modulo period) is used; the quotient is discarded */
+ if (remainder_bytes > 0 &&
+ remainder_bytes < (u64)l->stripe_count * l->stripe_unit) /* tail is shorter than one stripe_unit per object */
+ remainder_objs = l->stripe_count -
+ DIV_ROUND_UP_ULL(remainder_bytes, l->stripe_unit); /* stripe units the tail occupies, rounded up */
+
+ return num_periods * l->stripe_count - remainder_objs;
+}
+EXPORT_SYMBOL(ceph_get_num_objects);
diff --git a/net/compat.c b/net/compat.c
index 3b2105f6549d..2c9bd0edac99 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* 32bit Socket syscall emulation. Based on arch/sparc64/kernel/sys_sparc32.c.
*
@@ -32,21 +33,16 @@
#include <linux/uaccess.h>
#include <net/compat.h>
-int get_compat_msghdr(struct msghdr *kmsg,
- struct compat_msghdr __user *umsg,
- struct sockaddr __user **save_addr,
- struct iovec **iov)
+int __get_compat_msghdr(struct msghdr *kmsg,
+ struct compat_msghdr *msg,
+ struct sockaddr __user **save_addr)
{
- struct compat_msghdr msg;
ssize_t err;
- if (copy_from_user(&msg, umsg, sizeof(*umsg)))
- return -EFAULT;
-
- kmsg->msg_flags = msg.msg_flags;
- kmsg->msg_namelen = msg.msg_namelen;
+ kmsg->msg_flags = msg->msg_flags;
+ kmsg->msg_namelen = msg->msg_namelen;
- if (!msg.msg_name)
+ if (!msg->msg_name)
kmsg->msg_namelen = 0;
if (kmsg->msg_namelen < 0)
@@ -55,15 +51,17 @@ int get_compat_msghdr(struct msghdr *kmsg,
if (kmsg->msg_namelen > sizeof(struct sockaddr_storage))
kmsg->msg_namelen = sizeof(struct sockaddr_storage);
- kmsg->msg_control = compat_ptr(msg.msg_control);
- kmsg->msg_controllen = msg.msg_controllen;
+ kmsg->msg_control_is_user = true;
+ kmsg->msg_get_inq = 0;
+ kmsg->msg_control_user = compat_ptr(msg->msg_control);
+ kmsg->msg_controllen = msg->msg_controllen;
if (save_addr)
- *save_addr = compat_ptr(msg.msg_name);
+ *save_addr = compat_ptr(msg->msg_name);
- if (msg.msg_name && kmsg->msg_namelen) {
+ if (msg->msg_name && kmsg->msg_namelen) {
if (!save_addr) {
- err = move_addr_to_kernel(compat_ptr(msg.msg_name),
+ err = move_addr_to_kernel(compat_ptr(msg->msg_name),
kmsg->msg_namelen,
kmsg->msg_name);
if (err < 0)
@@ -74,14 +72,33 @@ int get_compat_msghdr(struct msghdr *kmsg,
kmsg->msg_namelen = 0;
}
- if (msg.msg_iovlen > UIO_MAXIOV)
+ if (msg->msg_iovlen > UIO_MAXIOV)
return -EMSGSIZE;
kmsg->msg_iocb = NULL;
+ kmsg->msg_ubuf = NULL;
+ return 0;
+}
- return compat_import_iovec(save_addr ? READ : WRITE,
- compat_ptr(msg.msg_iov), msg.msg_iovlen,
- UIO_FASTIOV, iov, &kmsg->msg_iter);
+int get_compat_msghdr(struct msghdr *kmsg,
+ struct compat_msghdr __user *umsg,
+ struct sockaddr __user **save_addr,
+ struct iovec **iov)
+{ /* Copy the compat header from @umsg, fill @kmsg via __get_compat_msghdr(), then import the iovec; returns 0 or a negative errno. */
+ struct compat_msghdr msg;
+ ssize_t err;
+
+ if (copy_from_user(&msg, umsg, sizeof(*umsg))) /* one copy of the whole header; only this local copy is used below */
+ return -EFAULT;
+
+ err = __get_compat_msghdr(kmsg, &msg, save_addr);
+ if (err)
+ return err;
+
+ err = import_iovec(save_addr ? ITER_DEST : ITER_SOURCE, /* a non-NULL @save_addr selects ITER_DEST; presumably that is the receive path - confirm against callers */
+ compat_ptr(msg.msg_iov), msg.msg_iovlen,
+ UIO_FASTIOV, iov, &kmsg->msg_iter);
+ return err < 0 ? err : 0; /* any non-negative import_iovec() result is reported as plain success */
}
/* Bleech... */
@@ -96,20 +113,20 @@ int get_compat_msghdr(struct msghdr *kmsg,
#define CMSG_COMPAT_FIRSTHDR(msg) \
(((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ? \
- (struct compat_cmsghdr __user *)((msg)->msg_control) : \
+ (struct compat_cmsghdr __user *)((msg)->msg_control_user) : \
(struct compat_cmsghdr __user *)NULL)
#define CMSG_COMPAT_OK(ucmlen, ucmsg, mhdr) \
((ucmlen) >= sizeof(struct compat_cmsghdr) && \
(ucmlen) <= (unsigned long) \
((mhdr)->msg_controllen - \
- ((char *)(ucmsg) - (char *)(mhdr)->msg_control)))
+ ((char __user *)(ucmsg) - (char __user *)(mhdr)->msg_control_user)))
static inline struct compat_cmsghdr __user *cmsg_compat_nxthdr(struct msghdr *msg,
struct compat_cmsghdr __user *cmsg, int cmsg_len)
{
char __user *ptr = (char __user *)cmsg + CMSG_COMPAT_ALIGN(cmsg_len);
- if ((unsigned long)(ptr + 1 - (char __user *)msg->msg_control) >
+ if ((unsigned long)(ptr + 1 - (char __user *)msg->msg_control_user) >
msg->msg_controllen)
return NULL;
return (struct compat_cmsghdr __user *)ptr;
@@ -158,31 +175,32 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
if (kcmlen > stackbuf_size)
kcmsg_base = kcmsg = sock_kmalloc(sk, kcmlen, GFP_KERNEL);
if (kcmsg == NULL)
- return -ENOBUFS;
+ return -ENOMEM;
/* Now copy them over neatly. */
memset(kcmsg, 0, kcmlen);
ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg);
while (ucmsg != NULL) {
- if (__get_user(ucmlen, &ucmsg->cmsg_len))
+ struct compat_cmsghdr cmsg;
+ if (copy_from_user(&cmsg, ucmsg, sizeof(cmsg)))
goto Efault;
- if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg))
+ if (!CMSG_COMPAT_OK(cmsg.cmsg_len, ucmsg, kmsg))
goto Einval;
- tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr));
+ tmp = ((cmsg.cmsg_len - sizeof(*ucmsg)) + sizeof(struct cmsghdr));
if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp))
goto Einval;
kcmsg->cmsg_len = tmp;
+ kcmsg->cmsg_level = cmsg.cmsg_level;
+ kcmsg->cmsg_type = cmsg.cmsg_type;
tmp = CMSG_ALIGN(tmp);
- if (__get_user(kcmsg->cmsg_level, &ucmsg->cmsg_level) ||
- __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) ||
- copy_from_user(CMSG_DATA(kcmsg),
+ if (copy_from_user(CMSG_DATA(kcmsg),
CMSG_COMPAT_DATA(ucmsg),
- (ucmlen - sizeof(*ucmsg))))
+ (cmsg.cmsg_len - sizeof(*ucmsg))))
goto Efault;
/* Advance. */
kcmsg = (struct cmsghdr *)((char *)kcmsg + tmp);
- ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen);
+ ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, cmsg.cmsg_len);
}
/*
@@ -193,6 +211,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk,
goto Einval;
/* Ok, looks like we made it. Hook it up and return success. */
+ kmsg->msg_control_is_user = false;
kmsg->msg_control = kcmsg_base;
kmsg->msg_controllen = kcmlen;
return 0;
@@ -207,10 +226,10 @@ Efault:
int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *data)
{
- struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control;
+ struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control_user;
struct compat_cmsghdr cmhdr;
- struct compat_timeval ctv;
- struct compat_timespec cts[3];
+ struct old_timeval32 ctv;
+ struct old_timespec32 cts[3];
int cmlen;
if (cm == NULL || kmsg->msg_controllen < sizeof(*cm)) {
@@ -219,18 +238,18 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *dat
}
if (!COMPAT_USE_64BIT_TIME) {
- if (level == SOL_SOCKET && type == SCM_TIMESTAMP) {
- struct timeval *tv = (struct timeval *)data;
+ if (level == SOL_SOCKET && type == SO_TIMESTAMP_OLD) {
+ struct __kernel_old_timeval *tv = (struct __kernel_old_timeval *)data;
ctv.tv_sec = tv->tv_sec;
ctv.tv_usec = tv->tv_usec;
data = &ctv;
len = sizeof(ctv);
}
if (level == SOL_SOCKET &&
- (type == SCM_TIMESTAMPNS || type == SCM_TIMESTAMPING)) {
- int count = type == SCM_TIMESTAMPNS ? 1 : 3;
+ (type == SO_TIMESTAMPNS_OLD || type == SO_TIMESTAMPING_OLD)) {
+ int count = type == SO_TIMESTAMPNS_OLD ? 1 : 3;
int i;
- struct timespec *ts = (struct timespec *)data;
+ struct __kernel_old_timespec *ts = data;
for (i = 0; i < count; i++) {
cts[i].tv_sec = ts[i].tv_sec;
cts[i].tv_nsec = ts[i].tv_nsec;
@@ -256,44 +275,36 @@ int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *dat
cmlen = CMSG_COMPAT_SPACE(len);
if (kmsg->msg_controllen < cmlen)
cmlen = kmsg->msg_controllen;
- kmsg->msg_control += cmlen;
+ kmsg->msg_control_user += cmlen;
kmsg->msg_controllen -= cmlen;
return 0;
}
-void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
+static int scm_max_fds_compat(struct msghdr *msg)
{
- struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control;
- int fdmax = (kmsg->msg_controllen - sizeof(struct compat_cmsghdr)) / sizeof(int);
- int fdnum = scm->fp->count;
- struct file **fp = scm->fp->fp;
- int __user *cmfptr;
- int err = 0, i;
+ if (msg->msg_controllen <= sizeof(struct compat_cmsghdr)) /* nothing left after the header, so no fd fits; also keeps the subtraction below from going negative */
+ return 0;
+ return (msg->msg_controllen - sizeof(struct compat_cmsghdr)) / sizeof(int); /* number of whole ints that fit in the control buffer after one compat_cmsghdr */
+}
- if (fdnum < fdmax)
- fdmax = fdnum;
+void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct compat_cmsghdr __user *cm =
+ (struct compat_cmsghdr __user *)msg->msg_control_user;
+ unsigned int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0;
+ int fdmax = min_t(int, scm_max_fds_compat(msg), scm->fp->count);
+ int __user *cmsg_data = CMSG_COMPAT_DATA(cm);
+ int err = 0, i;
- for (i = 0, cmfptr = (int __user *) CMSG_COMPAT_DATA(cm); i < fdmax; i++, cmfptr++) {
- int new_fd;
- err = security_file_receive(fp[i]);
- if (err)
- break;
- err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & kmsg->msg_flags
- ? O_CLOEXEC : 0);
+ for (i = 0; i < fdmax; i++) {
+ err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags);
if (err < 0)
break;
- new_fd = err;
- err = put_user(new_fd, cmfptr);
- if (err) {
- put_unused_fd(new_fd);
- break;
- }
- /* Bump the usage count and install the file. */
- fd_install(new_fd, get_file(fp[i]));
}
if (i > 0) {
int cmlen = CMSG_COMPAT_LEN(i * sizeof(int));
+
err = put_user(SOL_SOCKET, &cm->cmsg_level);
if (!err)
err = put_user(SCM_RIGHTS, &cm->cmsg_type);
@@ -301,442 +312,23 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
err = put_user(cmlen, &cm->cmsg_len);
if (!err) {
cmlen = CMSG_COMPAT_SPACE(i * sizeof(int));
- kmsg->msg_control += cmlen;
- kmsg->msg_controllen -= cmlen;
+ if (msg->msg_controllen < cmlen)
+ cmlen = msg->msg_controllen;
+ msg->msg_control_user += cmlen;
+ msg->msg_controllen -= cmlen;
}
}
- if (i < fdnum)
- kmsg->msg_flags |= MSG_CTRUNC;
+
+ if (i < scm->fp->count || (scm->fp->count && fdmax <= 0))
+ msg->msg_flags |= MSG_CTRUNC;
/*
- * All of the files that fit in the message have had their
- * usage counts incremented, so we just free the list.
+ * All of the files that fit in the message have had their usage counts
+ * incremented, so we just free the list.
*/
__scm_destroy(scm);
}
-/* allocate a 64-bit sock_fprog on the user stack for duration of syscall. */
-struct sock_fprog __user *get_compat_bpf_fprog(char __user *optval)
-{
- struct compat_sock_fprog __user *fprog32 = (struct compat_sock_fprog __user *)optval;
- struct sock_fprog __user *kfprog = compat_alloc_user_space(sizeof(struct sock_fprog));
- struct compat_sock_fprog f32;
- struct sock_fprog f;
-
- if (copy_from_user(&f32, fprog32, sizeof(*fprog32)))
- return NULL;
- memset(&f, 0, sizeof(f));
- f.len = f32.len;
- f.filter = compat_ptr(f32.filter);
- if (copy_to_user(kfprog, &f, sizeof(struct sock_fprog)))
- return NULL;
-
- return kfprog;
-}
-EXPORT_SYMBOL_GPL(get_compat_bpf_fprog);
-
-static int do_set_attach_filter(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- struct sock_fprog __user *kfprog;
-
- kfprog = get_compat_bpf_fprog(optval);
- if (!kfprog)
- return -EFAULT;
-
- return sock_setsockopt(sock, level, optname, (char __user *)kfprog,
- sizeof(struct sock_fprog));
-}
-
-static int do_set_sock_timeout(struct socket *sock, int level,
- int optname, char __user *optval, unsigned int optlen)
-{
- struct compat_timeval __user *up = (struct compat_timeval __user *)optval;
- struct timeval ktime;
- mm_segment_t old_fs;
- int err;
-
- if (optlen < sizeof(*up))
- return -EINVAL;
- if (!access_ok(VERIFY_READ, up, sizeof(*up)) ||
- __get_user(ktime.tv_sec, &up->tv_sec) ||
- __get_user(ktime.tv_usec, &up->tv_usec))
- return -EFAULT;
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- err = sock_setsockopt(sock, level, optname, (char *)&ktime, sizeof(ktime));
- set_fs(old_fs);
-
- return err;
-}
-
-static int compat_sock_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- if (optname == SO_ATTACH_FILTER ||
- optname == SO_ATTACH_REUSEPORT_CBPF)
- return do_set_attach_filter(sock, level, optname,
- optval, optlen);
- if (!COMPAT_USE_64BIT_TIME &&
- (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO))
- return do_set_sock_timeout(sock, level, optname, optval, optlen);
-
- return sock_setsockopt(sock, level, optname, optval, optlen);
-}
-
-static int __compat_sys_setsockopt(int fd, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- int err;
- struct socket *sock = sockfd_lookup(fd, &err);
-
- if (sock) {
- err = security_socket_setsockopt(sock, level, optname);
- if (err) {
- sockfd_put(sock);
- return err;
- }
-
- if (level == SOL_SOCKET)
- err = compat_sock_setsockopt(sock, level,
- optname, optval, optlen);
- else if (sock->ops->compat_setsockopt)
- err = sock->ops->compat_setsockopt(sock, level,
- optname, optval, optlen);
- else
- err = sock->ops->setsockopt(sock, level,
- optname, optval, optlen);
- sockfd_put(sock);
- }
- return err;
-}
-
-COMPAT_SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname,
- char __user *, optval, unsigned int, optlen)
-{
- return __compat_sys_setsockopt(fd, level, optname, optval, optlen);
-}
-
-static int do_get_sock_timeout(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- struct compat_timeval __user *up;
- struct timeval ktime;
- mm_segment_t old_fs;
- int len, err;
-
- up = (struct compat_timeval __user *) optval;
- if (get_user(len, optlen))
- return -EFAULT;
- if (len < sizeof(*up))
- return -EINVAL;
- len = sizeof(ktime);
- old_fs = get_fs();
- set_fs(KERNEL_DS);
- err = sock_getsockopt(sock, level, optname, (char *) &ktime, &len);
- set_fs(old_fs);
-
- if (!err) {
- if (put_user(sizeof(*up), optlen) ||
- !access_ok(VERIFY_WRITE, up, sizeof(*up)) ||
- __put_user(ktime.tv_sec, &up->tv_sec) ||
- __put_user(ktime.tv_usec, &up->tv_usec))
- err = -EFAULT;
- }
- return err;
-}
-
-static int compat_sock_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- if (!COMPAT_USE_64BIT_TIME &&
- (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO))
- return do_get_sock_timeout(sock, level, optname, optval, optlen);
- return sock_getsockopt(sock, level, optname, optval, optlen);
-}
-
-int compat_sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
-{
- struct compat_timeval __user *ctv;
- int err;
- struct timeval tv;
-
- if (COMPAT_USE_64BIT_TIME)
- return sock_get_timestamp(sk, userstamp);
-
- ctv = (struct compat_timeval __user *) userstamp;
- err = -ENOENT;
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- tv = ktime_to_timeval(sk->sk_stamp);
- if (tv.tv_sec == -1)
- return err;
- if (tv.tv_sec == 0) {
- sk->sk_stamp = ktime_get_real();
- tv = ktime_to_timeval(sk->sk_stamp);
- }
- err = 0;
- if (put_user(tv.tv_sec, &ctv->tv_sec) ||
- put_user(tv.tv_usec, &ctv->tv_usec))
- err = -EFAULT;
- return err;
-}
-EXPORT_SYMBOL(compat_sock_get_timestamp);
-
-int compat_sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
-{
- struct compat_timespec __user *ctv;
- int err;
- struct timespec ts;
-
- if (COMPAT_USE_64BIT_TIME)
- return sock_get_timestampns (sk, userstamp);
-
- ctv = (struct compat_timespec __user *) userstamp;
- err = -ENOENT;
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- ts = ktime_to_timespec(sk->sk_stamp);
- if (ts.tv_sec == -1)
- return err;
- if (ts.tv_sec == 0) {
- sk->sk_stamp = ktime_get_real();
- ts = ktime_to_timespec(sk->sk_stamp);
- }
- err = 0;
- if (put_user(ts.tv_sec, &ctv->tv_sec) ||
- put_user(ts.tv_nsec, &ctv->tv_nsec))
- err = -EFAULT;
- return err;
-}
-EXPORT_SYMBOL(compat_sock_get_timestampns);
-
-static int __compat_sys_getsockopt(int fd, int level, int optname,
- char __user *optval,
- int __user *optlen)
-{
- int err;
- struct socket *sock = sockfd_lookup(fd, &err);
-
- if (sock) {
- err = security_socket_getsockopt(sock, level, optname);
- if (err) {
- sockfd_put(sock);
- return err;
- }
-
- if (level == SOL_SOCKET)
- err = compat_sock_getsockopt(sock, level,
- optname, optval, optlen);
- else if (sock->ops->compat_getsockopt)
- err = sock->ops->compat_getsockopt(sock, level,
- optname, optval, optlen);
- else
- err = sock->ops->getsockopt(sock, level,
- optname, optval, optlen);
- sockfd_put(sock);
- }
- return err;
-}
-
-COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname,
- char __user *, optval, int __user *, optlen)
-{
- return __compat_sys_getsockopt(fd, level, optname, optval, optlen);
-}
-
-struct compat_group_req {
- __u32 gr_interface;
- struct __kernel_sockaddr_storage gr_group
- __aligned(4);
-} __packed;
-
-struct compat_group_source_req {
- __u32 gsr_interface;
- struct __kernel_sockaddr_storage gsr_group
- __aligned(4);
- struct __kernel_sockaddr_storage gsr_source
- __aligned(4);
-} __packed;
-
-struct compat_group_filter {
- __u32 gf_interface;
- struct __kernel_sockaddr_storage gf_group
- __aligned(4);
- __u32 gf_fmode;
- __u32 gf_numsrc;
- struct __kernel_sockaddr_storage gf_slist[1]
- __aligned(4);
-} __packed;
-
-#define __COMPAT_GF0_SIZE (sizeof(struct compat_group_filter) - \
- sizeof(struct __kernel_sockaddr_storage))
-
-
-int compat_mc_setsockopt(struct sock *sock, int level, int optname,
- char __user *optval, unsigned int optlen,
- int (*setsockopt)(struct sock *, int, int, char __user *, unsigned int))
-{
- char __user *koptval = optval;
- int koptlen = optlen;
-
- switch (optname) {
- case MCAST_JOIN_GROUP:
- case MCAST_LEAVE_GROUP:
- {
- struct compat_group_req __user *gr32 = (void *)optval;
- struct group_req __user *kgr =
- compat_alloc_user_space(sizeof(struct group_req));
- u32 interface;
-
- if (!access_ok(VERIFY_READ, gr32, sizeof(*gr32)) ||
- !access_ok(VERIFY_WRITE, kgr, sizeof(struct group_req)) ||
- __get_user(interface, &gr32->gr_interface) ||
- __put_user(interface, &kgr->gr_interface) ||
- copy_in_user(&kgr->gr_group, &gr32->gr_group,
- sizeof(kgr->gr_group)))
- return -EFAULT;
- koptval = (char __user *)kgr;
- koptlen = sizeof(struct group_req);
- break;
- }
- case MCAST_JOIN_SOURCE_GROUP:
- case MCAST_LEAVE_SOURCE_GROUP:
- case MCAST_BLOCK_SOURCE:
- case MCAST_UNBLOCK_SOURCE:
- {
- struct compat_group_source_req __user *gsr32 = (void *)optval;
- struct group_source_req __user *kgsr = compat_alloc_user_space(
- sizeof(struct group_source_req));
- u32 interface;
-
- if (!access_ok(VERIFY_READ, gsr32, sizeof(*gsr32)) ||
- !access_ok(VERIFY_WRITE, kgsr,
- sizeof(struct group_source_req)) ||
- __get_user(interface, &gsr32->gsr_interface) ||
- __put_user(interface, &kgsr->gsr_interface) ||
- copy_in_user(&kgsr->gsr_group, &gsr32->gsr_group,
- sizeof(kgsr->gsr_group)) ||
- copy_in_user(&kgsr->gsr_source, &gsr32->gsr_source,
- sizeof(kgsr->gsr_source)))
- return -EFAULT;
- koptval = (char __user *)kgsr;
- koptlen = sizeof(struct group_source_req);
- break;
- }
- case MCAST_MSFILTER:
- {
- struct compat_group_filter __user *gf32 = (void *)optval;
- struct group_filter __user *kgf;
- u32 interface, fmode, numsrc;
-
- if (!access_ok(VERIFY_READ, gf32, __COMPAT_GF0_SIZE) ||
- __get_user(interface, &gf32->gf_interface) ||
- __get_user(fmode, &gf32->gf_fmode) ||
- __get_user(numsrc, &gf32->gf_numsrc))
- return -EFAULT;
- koptlen = optlen + sizeof(struct group_filter) -
- sizeof(struct compat_group_filter);
- if (koptlen < GROUP_FILTER_SIZE(numsrc))
- return -EINVAL;
- kgf = compat_alloc_user_space(koptlen);
- if (!access_ok(VERIFY_WRITE, kgf, koptlen) ||
- __put_user(interface, &kgf->gf_interface) ||
- __put_user(fmode, &kgf->gf_fmode) ||
- __put_user(numsrc, &kgf->gf_numsrc) ||
- copy_in_user(&kgf->gf_group, &gf32->gf_group,
- sizeof(kgf->gf_group)) ||
- (numsrc && copy_in_user(kgf->gf_slist, gf32->gf_slist,
- numsrc * sizeof(kgf->gf_slist[0]))))
- return -EFAULT;
- koptval = (char __user *)kgf;
- break;
- }
-
- default:
- break;
- }
- return setsockopt(sock, level, optname, koptval, koptlen);
-}
-EXPORT_SYMBOL(compat_mc_setsockopt);
-
-int compat_mc_getsockopt(struct sock *sock, int level, int optname,
- char __user *optval, int __user *optlen,
- int (*getsockopt)(struct sock *, int, int, char __user *, int __user *))
-{
- struct compat_group_filter __user *gf32 = (void *)optval;
- struct group_filter __user *kgf;
- int __user *koptlen;
- u32 interface, fmode, numsrc;
- int klen, ulen, err;
-
- if (optname != MCAST_MSFILTER)
- return getsockopt(sock, level, optname, optval, optlen);
-
- koptlen = compat_alloc_user_space(sizeof(*koptlen));
- if (!access_ok(VERIFY_READ, optlen, sizeof(*optlen)) ||
- __get_user(ulen, optlen))
- return -EFAULT;
-
- /* adjust len for pad */
- klen = ulen + sizeof(*kgf) - sizeof(*gf32);
-
- if (klen < GROUP_FILTER_SIZE(0))
- return -EINVAL;
-
- if (!access_ok(VERIFY_WRITE, koptlen, sizeof(*koptlen)) ||
- __put_user(klen, koptlen))
- return -EFAULT;
-
- /* have to allow space for previous compat_alloc_user_space, too */
- kgf = compat_alloc_user_space(klen+sizeof(*optlen));
-
- if (!access_ok(VERIFY_READ, gf32, __COMPAT_GF0_SIZE) ||
- __get_user(interface, &gf32->gf_interface) ||
- __get_user(fmode, &gf32->gf_fmode) ||
- __get_user(numsrc, &gf32->gf_numsrc) ||
- __put_user(interface, &kgf->gf_interface) ||
- __put_user(fmode, &kgf->gf_fmode) ||
- __put_user(numsrc, &kgf->gf_numsrc) ||
- copy_in_user(&kgf->gf_group, &gf32->gf_group, sizeof(kgf->gf_group)))
- return -EFAULT;
-
- err = getsockopt(sock, level, optname, (char __user *)kgf, koptlen);
- if (err)
- return err;
-
- if (!access_ok(VERIFY_READ, koptlen, sizeof(*koptlen)) ||
- __get_user(klen, koptlen))
- return -EFAULT;
-
- ulen = klen - (sizeof(*kgf)-sizeof(*gf32));
-
- if (!access_ok(VERIFY_WRITE, optlen, sizeof(*optlen)) ||
- __put_user(ulen, optlen))
- return -EFAULT;
-
- if (!access_ok(VERIFY_READ, kgf, klen) ||
- !access_ok(VERIFY_WRITE, gf32, ulen) ||
- __get_user(interface, &kgf->gf_interface) ||
- __get_user(fmode, &kgf->gf_fmode) ||
- __get_user(numsrc, &kgf->gf_numsrc) ||
- __put_user(interface, &gf32->gf_interface) ||
- __put_user(fmode, &gf32->gf_fmode) ||
- __put_user(numsrc, &gf32->gf_numsrc))
- return -EFAULT;
- if (numsrc) {
- int copylen;
-
- klen -= GROUP_FILTER_SIZE(0);
- copylen = numsrc * sizeof(gf32->gf_slist[0]);
- if (copylen > klen)
- copylen = klen;
- if (copy_in_user(gf32->gf_slist, kgf->gf_slist, copylen))
- return -EFAULT;
- }
- return err;
-}
-EXPORT_SYMBOL(compat_mc_getsockopt);
-
-
/* Argument list sizes for compat_sys_socketcall */
#define AL(x) ((x) * sizeof(u32))
static unsigned char nas[21] = {
@@ -810,34 +402,23 @@ COMPAT_SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, buf, compat_size_t, len
return __compat_sys_recvfrom(fd, buf, len, flags, addr, addrlen);
}
-static int __compat_sys_recvmmsg(int fd, struct compat_mmsghdr __user *mmsg,
- unsigned int vlen, unsigned int flags,
- struct compat_timespec __user *timeout)
+COMPAT_SYSCALL_DEFINE5(recvmmsg_time64, int, fd, struct compat_mmsghdr __user *, mmsg,
+ unsigned int, vlen, unsigned int, flags,
+ struct __kernel_timespec __user *, timeout)
{
- int datagrams;
- struct timespec ktspec;
-
- if (timeout == NULL)
- return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
- flags | MSG_CMSG_COMPAT, NULL);
-
- if (compat_get_timespec(&ktspec, timeout))
- return -EFAULT;
-
- datagrams = __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen,
- flags | MSG_CMSG_COMPAT, &ktspec);
- if (datagrams > 0 && compat_put_timespec(&ktspec, timeout))
- datagrams = -EFAULT;
-
- return datagrams;
+ return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, /* thin wrapper: the user pointers are forwarded as-is, nothing is copied here */
+ flags | MSG_CMSG_COMPAT, timeout, NULL); /* MSG_CMSG_COMPAT is always forced on; @timeout is the fifth argument and the sixth is NULL - NOTE(review): presumably the __kernel_timespec slot, confirm against the __sys_recvmmsg() prototype */
}
-COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg,
+#ifdef CONFIG_COMPAT_32BIT_TIME
+COMPAT_SYSCALL_DEFINE5(recvmmsg_time32, int, fd, struct compat_mmsghdr __user *, mmsg,
unsigned int, vlen, unsigned int, flags,
- struct compat_timespec __user *, timeout)
+ struct old_timespec32 __user *, timeout)
{
- return __compat_sys_recvmmsg(fd, mmsg, vlen, flags, timeout);
+ return __sys_recvmmsg(fd, (struct mmsghdr __user *)mmsg, vlen, /* thin wrapper: the user pointers are forwarded as-is, nothing is copied here */
+ flags | MSG_CMSG_COMPAT, NULL, timeout); /* MSG_CMSG_COMPAT is always forced on; the fifth argument is NULL and @timeout is the sixth - NOTE(review): presumably the old_timespec32 slot, confirm against the __sys_recvmmsg() prototype */
}
+#endif
COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
{
@@ -879,10 +460,10 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0);
break;
case SYS_GETSOCKNAME:
- ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]));
+ ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]), 0);
break;
case SYS_GETPEERNAME:
- ret = __sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2]));
+ ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]), 1);
break;
case SYS_SOCKETPAIR:
ret = __sys_socketpair(a0, a1, a[2], compat_ptr(a[3]));
@@ -907,13 +488,11 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
ret = __sys_shutdown(a0, a1);
break;
case SYS_SETSOCKOPT:
- ret = __compat_sys_setsockopt(a0, a1, a[2],
- compat_ptr(a[3]), a[4]);
+ ret = __sys_setsockopt(a0, a1, a[2], compat_ptr(a[3]), a[4]);
break;
case SYS_GETSOCKOPT:
- ret = __compat_sys_getsockopt(a0, a1, a[2],
- compat_ptr(a[3]),
- compat_ptr(a[4]));
+ ret = __sys_getsockopt(a0, a1, a[2], compat_ptr(a[3]),
+ compat_ptr(a[4]));
break;
case SYS_SENDMSG:
ret = __compat_sys_sendmsg(a0, compat_ptr(a1), a[2]);
@@ -925,8 +504,9 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args)
ret = __compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
break;
case SYS_RECVMMSG:
- ret = __compat_sys_recvmmsg(a0, compat_ptr(a1), a[2], a[3],
- compat_ptr(a[4]));
+ ret = __sys_recvmmsg(a0, compat_ptr(a1), a[2],
+ a[3] | MSG_CMSG_COMPAT, NULL,
+ compat_ptr(a[4]));
break;
case SYS_ACCEPT4:
ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]);
diff --git a/net/core/Makefile b/net/core/Makefile
index 80175e6a2eb8..9ef2099c5426 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -4,23 +4,32 @@
#
obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \
- gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o
+ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \
+ flow_dissector.o
obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
-obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
+obj-y += dev.o dev_api.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \
- fib_notifier.o xdp.o
+ fib_notifier.o xdp.o flow_offload.o gro.o \
+ netdev-genl.o netdev-genl-gen.o gso.o
+
+obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
obj-y += net-sysfs.o
-obj-$(CONFIG_PAGE_POOL) += page_pool.o
+obj-y += hotdata.o
+obj-y += netdev_rx_queue.o
+obj-y += netdev_queues.o
+obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
obj-$(CONFIG_NETPOLL) += netpoll.o
obj-$(CONFIG_FIB_RULES) += fib_rules.o
obj-$(CONFIG_TRACEPOINTS) += net-traces.o
obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
+obj-$(CONFIG_NET_IEEE8021Q_HELPERS) += ieee8021q_helpers.o
+obj-$(CONFIG_NET_SELFTESTS) += selftests.o
obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
@@ -29,6 +38,13 @@ obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o
obj-$(CONFIG_DST_CACHE) += dst_cache.o
obj-$(CONFIG_HWBM) += hwbm.o
-obj-$(CONFIG_NET_DEVLINK) += devlink.o
obj-$(CONFIG_GRO_CELLS) += gro_cells.o
obj-$(CONFIG_FAILOVER) += failover.o
+obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o
+obj-$(CONFIG_BPF_SYSCALL) += sock_map.o
+obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o
+obj-$(CONFIG_OF) += of_net.o
+obj-$(CONFIG_NET_TEST) += net_test.o
+obj-$(CONFIG_NET_DEVMEM) += devmem.o
+obj-$(CONFIG_DEBUG_NET) += lock_debug.o
+obj-$(CONFIG_FAIL_SKB_REALLOC) += skb_fault_injection.o
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
new file mode 100644
index 000000000000..850dd736ccd1
--- /dev/null
+++ b/net/core/bpf_sk_storage.c
@@ -0,0 +1,914 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+#include <linux/rculist.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/bpf_local_storage.h>
+#include <net/bpf_sk_storage.h>
+#include <net/sock.h>
+#include <uapi/linux/sock_diag.h>
+#include <uapi/linux/btf.h>
+#include <linux/rcupdate_trace.h>
+
+DEFINE_BPF_STORAGE_CACHE(sk_cache);
+
+static struct bpf_local_storage_data *
+bpf_sk_storage_lookup(struct sock *sk, struct bpf_map *map, bool cacheit_lockit)
+{
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_map *smap;
+
+ sk_storage =
+ rcu_dereference_check(sk->sk_bpf_storage, bpf_rcu_lock_held());
+ if (!sk_storage)
+ return NULL;
+
+ smap = (struct bpf_local_storage_map *)map;
+ return bpf_local_storage_lookup(sk_storage, smap, cacheit_lockit);
+}
+
+static int bpf_sk_storage_del(struct sock *sk, struct bpf_map *map)
+{
+ struct bpf_local_storage_data *sdata;
+
+ sdata = bpf_sk_storage_lookup(sk, map, false);
+ if (!sdata)
+ return -ENOENT;
+
+ bpf_selem_unlink(SELEM(sdata), false);
+
+ return 0;
+}
+
+/* Called by __sk_destruct() & bpf_sk_storage_clone() */
+void bpf_sk_storage_free(struct sock *sk)
+{
+ struct bpf_local_storage *sk_storage;
+
+ rcu_read_lock_dont_migrate();
+ sk_storage = rcu_dereference(sk->sk_bpf_storage);
+ if (!sk_storage)
+ goto out;
+
+ bpf_local_storage_destroy(sk_storage);
+out:
+ rcu_read_unlock_migrate();
+}
+
+static void bpf_sk_storage_map_free(struct bpf_map *map)
+{
+ bpf_local_storage_map_free(map, &sk_cache, NULL);
+}
+
+static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
+{
+ return bpf_local_storage_map_alloc(attr, &sk_cache, false);
+}
+
+static int notsupp_get_next_key(struct bpf_map *map, void *key,
+ void *next_key)
+{
+ return -ENOTSUPP;
+}
+
+static void *bpf_fd_sk_storage_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_local_storage_data *sdata;
+ struct socket *sock;
+ int fd, err;
+
+ fd = *(int *)key;
+ sock = sockfd_lookup(fd, &err);
+ if (sock) {
+ sdata = bpf_sk_storage_lookup(sock->sk, map, true);
+ sockfd_put(sock);
+ return sdata ? sdata->data : NULL;
+ }
+
+ return ERR_PTR(err);
+}
+
+static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_local_storage_data *sdata;
+ struct socket *sock;
+ int fd, err;
+
+ fd = *(int *)key;
+ sock = sockfd_lookup(fd, &err);
+ if (sock) {
+ sdata = bpf_local_storage_update(
+ sock->sk, (struct bpf_local_storage_map *)map, value,
+ map_flags, false, GFP_ATOMIC);
+ sockfd_put(sock);
+ return PTR_ERR_OR_ZERO(sdata);
+ }
+
+ return err;
+}
+
+static long bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
+{
+ struct socket *sock;
+ int fd, err;
+
+ fd = *(int *)key;
+ sock = sockfd_lookup(fd, &err);
+ if (sock) {
+ err = bpf_sk_storage_del(sock->sk, map);
+ sockfd_put(sock);
+ return err;
+ }
+
+ return err;
+}
+
+static struct bpf_local_storage_elem *
+bpf_sk_storage_clone_elem(struct sock *newsk,
+ struct bpf_local_storage_map *smap,
+ struct bpf_local_storage_elem *selem)
+{
+ struct bpf_local_storage_elem *copy_selem;
+
+ copy_selem = bpf_selem_alloc(smap, newsk, NULL, false, GFP_ATOMIC);
+ if (!copy_selem)
+ return NULL;
+
+ if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))
+ copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
+ SDATA(selem)->data, true);
+ else
+ copy_map_value(&smap->map, SDATA(copy_selem)->data,
+ SDATA(selem)->data);
+
+ return copy_selem;
+}
+
+int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
+{
+ struct bpf_local_storage *new_sk_storage = NULL;
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
+ int ret = 0;
+
+ RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
+
+ rcu_read_lock_dont_migrate();
+ sk_storage = rcu_dereference(sk->sk_bpf_storage);
+
+ if (!sk_storage || hlist_empty(&sk_storage->list))
+ goto out;
+
+ hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
+ struct bpf_local_storage_elem *copy_selem;
+ struct bpf_local_storage_map *smap;
+ struct bpf_map *map;
+
+ smap = rcu_dereference(SDATA(selem)->smap);
+ if (!(smap->map.map_flags & BPF_F_CLONE))
+ continue;
+
+ /* Note that for lockless listeners adding new element
+ * here can race with cleanup in bpf_local_storage_map_free.
+ * Try to grab map refcnt to make sure that it's still
+ * alive and prevent concurrent removal.
+ */
+ map = bpf_map_inc_not_zero(&smap->map);
+ if (IS_ERR(map))
+ continue;
+
+ copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
+ if (!copy_selem) {
+ ret = -ENOMEM;
+ bpf_map_put(map);
+ goto out;
+ }
+
+ if (new_sk_storage) {
+ bpf_selem_link_map(smap, copy_selem);
+ bpf_selem_link_storage_nolock(new_sk_storage, copy_selem);
+ } else {
+ ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC);
+ if (ret) {
+ bpf_selem_free(copy_selem, true);
+ atomic_sub(smap->elem_size,
+ &newsk->sk_omem_alloc);
+ bpf_map_put(map);
+ goto out;
+ }
+
+ new_sk_storage =
+ rcu_dereference(copy_selem->local_storage);
+ }
+ bpf_map_put(map);
+ }
+
+out:
+ rcu_read_unlock_migrate();
+
+ /* In case of an error, don't free anything explicitly here; the
+ * caller is responsible for calling bpf_sk_storage_free().
+ */
+
+ return ret;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
+ void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ struct bpf_local_storage_data *sdata;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!sk || !sk_fullsock(sk) || flags > BPF_SK_STORAGE_GET_F_CREATE)
+ return (unsigned long)NULL;
+
+ sdata = bpf_sk_storage_lookup(sk, map, true);
+ if (sdata)
+ return (unsigned long)sdata->data;
+
+ if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
+ /* Cannot add new elem to a going away sk.
+ * Otherwise, the new elem may become a leak
+ * (and also other memory issues during map
+ * destruction).
+ */
+ refcount_inc_not_zero(&sk->sk_refcnt)) {
+ sdata = bpf_local_storage_update(
+ sk, (struct bpf_local_storage_map *)map, value,
+ BPF_NOEXIST, false, gfp_flags);
+ /* sk is a fullsock here (sk_fullsock() was checked above),
+ * so sock_gen_put() is unnecessary.
+ */
+ sock_put(sk);
+ return IS_ERR(sdata) ?
+ (unsigned long)NULL : (unsigned long)sdata->data;
+ }
+
+ return (unsigned long)NULL;
+}
+
+BPF_CALL_2(bpf_sk_storage_delete, struct bpf_map *, map, struct sock *, sk)
+{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (!sk || !sk_fullsock(sk))
+ return -EINVAL;
+
+ if (refcount_inc_not_zero(&sk->sk_refcnt)) {
+ int err;
+
+ err = bpf_sk_storage_del(sk, map);
+ sock_put(sk);
+ return err;
+ }
+
+ return -ENOENT;
+}
+
+static int bpf_sk_storage_charge(struct bpf_local_storage_map *smap,
+ void *owner, u32 size)
+{
+ struct sock *sk = (struct sock *)owner;
+ int optmem_max;
+
+ optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
+ /* same check as in sock_kmalloc() */
+ if (size <= optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
+ atomic_add(size, &sk->sk_omem_alloc);
+ return 0;
+ }
+
+ return -ENOMEM;
+}
+
+static void bpf_sk_storage_uncharge(struct bpf_local_storage_map *smap,
+ void *owner, u32 size)
+{
+ struct sock *sk = owner;
+
+ atomic_sub(size, &sk->sk_omem_alloc);
+}
+
+static struct bpf_local_storage __rcu **
+bpf_sk_storage_ptr(void *owner)
+{
+ struct sock *sk = owner;
+
+ return &sk->sk_bpf_storage;
+}
+
+const struct bpf_map_ops sk_storage_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc_check = bpf_local_storage_map_alloc_check,
+ .map_alloc = bpf_sk_storage_map_alloc,
+ .map_free = bpf_sk_storage_map_free,
+ .map_get_next_key = notsupp_get_next_key,
+ .map_lookup_elem = bpf_fd_sk_storage_lookup_elem,
+ .map_update_elem = bpf_fd_sk_storage_update_elem,
+ .map_delete_elem = bpf_fd_sk_storage_delete_elem,
+ .map_check_btf = bpf_local_storage_map_check_btf,
+ .map_btf_id = &bpf_local_storage_map_btf_id[0],
+ .map_local_storage_charge = bpf_sk_storage_charge,
+ .map_local_storage_uncharge = bpf_sk_storage_uncharge,
+ .map_owner_storage_ptr = bpf_sk_storage_ptr,
+ .map_mem_usage = bpf_local_storage_map_mem_usage,
+};
+
+const struct bpf_func_proto bpf_sk_storage_get_proto = {
+ .func = bpf_sk_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto = {
+ .func = bpf_sk_storage_get,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_CTX, /* context is 'struct sock' */
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+};
+
+const struct bpf_func_proto bpf_sk_storage_delete_proto = {
+ .func = bpf_sk_storage_delete,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+};
+
+static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
+{
+ if (prog->aux->dst_prog)
+ return false;
+
+ /* Ensure the tracing program is not tracing
+ * any bpf_sk_storage*() function while also
+ * using the bpf_sk_storage_(get|delete) helper.
+ */
+ switch (prog->expected_attach_type) {
+ case BPF_TRACE_ITER:
+ case BPF_TRACE_RAW_TP:
+ /* bpf_sk_storage has no trace point */
+ return true;
+ case BPF_TRACE_FENTRY:
+ case BPF_TRACE_FEXIT:
+ return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage",
+ strlen("bpf_sk_storage"));
+ default:
+ return false;
+ }
+
+ return false;
+}
+
+/* *gfp_flags* is a hidden argument provided by the verifier */
+BPF_CALL_5(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
+ void *, value, u64, flags, gfp_t, gfp_flags)
+{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (in_hardirq() || in_nmi())
+ return (unsigned long)NULL;
+
+ return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags,
+ gfp_flags);
+}
+
+BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
+ struct sock *, sk)
+{
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+ if (in_hardirq() || in_nmi())
+ return -EPERM;
+
+ return ____bpf_sk_storage_delete(map, sk);
+}
+
+const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = {
+ .func = bpf_sk_storage_get_tracing,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg4_type = ARG_ANYTHING,
+ .allowed = bpf_sk_storage_tracing_allowed,
+};
+
+const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = {
+ .func = bpf_sk_storage_delete_tracing,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL,
+ .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON],
+ .allowed = bpf_sk_storage_tracing_allowed,
+};
+
+struct bpf_sk_storage_diag {
+ u32 nr_maps;
+ struct bpf_map *maps[];
+};
+
+/* The reply will be like:
+ * INET_DIAG_BPF_SK_STORAGES (nla_nest)
+ * SK_DIAG_BPF_STORAGE (nla_nest)
+ * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+ * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+ * SK_DIAG_BPF_STORAGE (nla_nest)
+ * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+ * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+ * ....
+ */
+static int nla_value_size(u32 value_size)
+{
+ /* SK_DIAG_BPF_STORAGE (nla_nest)
+ * SK_DIAG_BPF_STORAGE_MAP_ID (nla_put_u32)
+ * SK_DIAG_BPF_STORAGE_MAP_VALUE (nla_reserve_64bit)
+ */
+ return nla_total_size(0) + nla_total_size(sizeof(u32)) +
+ nla_total_size_64bit(value_size);
+}
+
+void bpf_sk_storage_diag_free(struct bpf_sk_storage_diag *diag)
+{
+ u32 i;
+
+ if (!diag)
+ return;
+
+ for (i = 0; i < diag->nr_maps; i++)
+ bpf_map_put(diag->maps[i]);
+
+ kfree(diag);
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_free);
+
+static bool diag_check_dup(const struct bpf_sk_storage_diag *diag,
+ const struct bpf_map *map)
+{
+ u32 i;
+
+ for (i = 0; i < diag->nr_maps; i++) {
+ if (diag->maps[i] == map)
+ return true;
+ }
+
+ return false;
+}
+
+struct bpf_sk_storage_diag *
+bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs)
+{
+ struct bpf_sk_storage_diag *diag;
+ struct nlattr *nla;
+ u32 nr_maps = 0;
+ int rem, err;
+
+ /* bpf_local_storage_map is currently limited to CAP_SYS_ADMIN, and
+ * the map_alloc_check() side enforces the same restriction.
+ */
+ if (!bpf_capable())
+ return ERR_PTR(-EPERM);
+
+ nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD,
+ nla_stgs, rem) {
+ if (nla_len(nla) != sizeof(u32))
+ return ERR_PTR(-EINVAL);
+ nr_maps++;
+ }
+
+ diag = kzalloc(struct_size(diag, maps, nr_maps), GFP_KERNEL);
+ if (!diag)
+ return ERR_PTR(-ENOMEM);
+
+ nla_for_each_nested_type(nla, SK_DIAG_BPF_STORAGE_REQ_MAP_FD,
+ nla_stgs, rem) {
+ int map_fd = nla_get_u32(nla);
+ struct bpf_map *map = bpf_map_get(map_fd);
+
+ if (IS_ERR(map)) {
+ err = PTR_ERR(map);
+ goto err_free;
+ }
+ if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) {
+ bpf_map_put(map);
+ err = -EINVAL;
+ goto err_free;
+ }
+ if (diag_check_dup(diag, map)) {
+ bpf_map_put(map);
+ err = -EEXIST;
+ goto err_free;
+ }
+ diag->maps[diag->nr_maps++] = map;
+ }
+
+ return diag;
+
+err_free:
+ bpf_sk_storage_diag_free(diag);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_alloc);
+
+static int diag_get(struct bpf_local_storage_data *sdata, struct sk_buff *skb)
+{
+ struct nlattr *nla_stg, *nla_value;
+ struct bpf_local_storage_map *smap;
+
+ /* A map value can never exceed the max payload of one nlattr */
+ BUILD_BUG_ON(U16_MAX - NLA_HDRLEN < BPF_LOCAL_STORAGE_MAX_VALUE_SIZE);
+
+ nla_stg = nla_nest_start(skb, SK_DIAG_BPF_STORAGE);
+ if (!nla_stg)
+ return -EMSGSIZE;
+
+ smap = rcu_dereference(sdata->smap);
+ if (nla_put_u32(skb, SK_DIAG_BPF_STORAGE_MAP_ID, smap->map.id))
+ goto errout;
+
+ nla_value = nla_reserve_64bit(skb, SK_DIAG_BPF_STORAGE_MAP_VALUE,
+ smap->map.value_size,
+ SK_DIAG_BPF_STORAGE_PAD);
+ if (!nla_value)
+ goto errout;
+
+ if (btf_record_has_field(smap->map.record, BPF_SPIN_LOCK))
+ copy_map_value_locked(&smap->map, nla_data(nla_value),
+ sdata->data, true);
+ else
+ copy_map_value(&smap->map, nla_data(nla_value), sdata->data);
+
+ nla_nest_end(skb, nla_stg);
+ return 0;
+
+errout:
+ nla_nest_cancel(skb, nla_stg);
+ return -EMSGSIZE;
+}
+
+static int bpf_sk_storage_diag_put_all(struct sock *sk, struct sk_buff *skb,
+ int stg_array_type,
+ unsigned int *res_diag_size)
+{
+ /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
+ unsigned int diag_size = nla_total_size(0);
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
+ struct bpf_local_storage_map *smap;
+ struct nlattr *nla_stgs;
+ unsigned int saved_len;
+ int err = 0;
+
+ rcu_read_lock();
+
+ sk_storage = rcu_dereference(sk->sk_bpf_storage);
+ if (!sk_storage || hlist_empty(&sk_storage->list)) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ nla_stgs = nla_nest_start(skb, stg_array_type);
+ if (!nla_stgs)
+ /* Continue to learn diag_size */
+ err = -EMSGSIZE;
+
+ saved_len = skb->len;
+ hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
+ smap = rcu_dereference(SDATA(selem)->smap);
+ diag_size += nla_value_size(smap->map.value_size);
+
+ if (nla_stgs && diag_get(SDATA(selem), skb))
+ /* Continue to learn diag_size */
+ err = -EMSGSIZE;
+ }
+
+ rcu_read_unlock();
+
+ if (nla_stgs) {
+ if (saved_len == skb->len)
+ nla_nest_cancel(skb, nla_stgs);
+ else
+ nla_nest_end(skb, nla_stgs);
+ }
+
+ if (diag_size == nla_total_size(0)) {
+ *res_diag_size = 0;
+ return 0;
+ }
+
+ *res_diag_size = diag_size;
+ return err;
+}
+
+int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag,
+ struct sock *sk, struct sk_buff *skb,
+ int stg_array_type,
+ unsigned int *res_diag_size)
+{
+ /* stg_array_type (e.g. INET_DIAG_BPF_SK_STORAGES) */
+ unsigned int diag_size = nla_total_size(0);
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_data *sdata;
+ struct nlattr *nla_stgs;
+ unsigned int saved_len;
+ int err = 0;
+ u32 i;
+
+ *res_diag_size = 0;
+
+ /* No map has been specified. Dump all. */
+ if (!diag->nr_maps)
+ return bpf_sk_storage_diag_put_all(sk, skb, stg_array_type,
+ res_diag_size);
+
+ rcu_read_lock();
+ sk_storage = rcu_dereference(sk->sk_bpf_storage);
+ if (!sk_storage || hlist_empty(&sk_storage->list)) {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ nla_stgs = nla_nest_start(skb, stg_array_type);
+ if (!nla_stgs)
+ /* Continue to learn diag_size */
+ err = -EMSGSIZE;
+
+ saved_len = skb->len;
+ for (i = 0; i < diag->nr_maps; i++) {
+ sdata = bpf_local_storage_lookup(sk_storage,
+ (struct bpf_local_storage_map *)diag->maps[i],
+ false);
+
+ if (!sdata)
+ continue;
+
+ diag_size += nla_value_size(diag->maps[i]->value_size);
+
+ if (nla_stgs && diag_get(sdata, skb))
+ /* Continue to learn diag_size */
+ err = -EMSGSIZE;
+ }
+ rcu_read_unlock();
+
+ if (nla_stgs) {
+ if (saved_len == skb->len)
+ nla_nest_cancel(skb, nla_stgs);
+ else
+ nla_nest_end(skb, nla_stgs);
+ }
+
+ if (diag_size == nla_total_size(0)) {
+ *res_diag_size = 0;
+ return 0;
+ }
+
+ *res_diag_size = diag_size;
+ return err;
+}
+EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put);
+
+struct bpf_iter_seq_sk_storage_map_info {
+ struct bpf_map *map;
+ unsigned int bucket_id;
+ unsigned skip_elems;
+};
+
+static struct bpf_local_storage_elem *
+bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
+ struct bpf_local_storage_elem *prev_selem)
+ __acquires(RCU) __releases(RCU)
+{
+ struct bpf_local_storage *sk_storage;
+ struct bpf_local_storage_elem *selem;
+ u32 skip_elems = info->skip_elems;
+ struct bpf_local_storage_map *smap;
+ u32 bucket_id = info->bucket_id;
+ u32 i, count, n_buckets;
+ struct bpf_local_storage_map_bucket *b;
+
+ smap = (struct bpf_local_storage_map *)info->map;
+ n_buckets = 1U << smap->bucket_log;
+ if (bucket_id >= n_buckets)
+ return NULL;
+
+ /* try to find next selem in the same bucket */
+ selem = prev_selem;
+ count = 0;
+ while (selem) {
+ selem = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&selem->map_node)),
+ struct bpf_local_storage_elem, map_node);
+ if (!selem) {
+ /* not found, unlock and go to the next bucket */
+ b = &smap->buckets[bucket_id++];
+ rcu_read_unlock();
+ skip_elems = 0;
+ break;
+ }
+ sk_storage = rcu_dereference(selem->local_storage);
+ if (sk_storage) {
+ info->skip_elems = skip_elems + count;
+ return selem;
+ }
+ count++;
+ }
+
+ for (i = bucket_id; i < (1U << smap->bucket_log); i++) {
+ b = &smap->buckets[i];
+ rcu_read_lock();
+ count = 0;
+ hlist_for_each_entry_rcu(selem, &b->list, map_node) {
+ sk_storage = rcu_dereference(selem->local_storage);
+ if (sk_storage && count >= skip_elems) {
+ info->bucket_id = i;
+ info->skip_elems = count;
+ return selem;
+ }
+ count++;
+ }
+ rcu_read_unlock();
+ skip_elems = 0;
+ }
+
+ info->bucket_id = i;
+ info->skip_elems = 0;
+ return NULL;
+}
+
+static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ struct bpf_local_storage_elem *selem;
+
+ selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL);
+ if (!selem)
+ return NULL;
+
+ if (*pos == 0)
+ ++*pos;
+ return selem;
+}
+
+static void *bpf_sk_storage_map_seq_next(struct seq_file *seq, void *v,
+ loff_t *pos)
+{
+ struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
+
+ ++*pos;
+ ++info->skip_elems;
+ return bpf_sk_storage_map_seq_find_next(seq->private, v);
+}
+
+struct bpf_iter__bpf_sk_storage_map {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_map *, map);
+ __bpf_md_ptr(struct sock *, sk);
+ __bpf_md_ptr(void *, value);
+};
+
+DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta,
+ struct bpf_map *map, struct sock *sk,
+ void *value)
+
+static int __bpf_sk_storage_map_seq_show(struct seq_file *seq,
+ struct bpf_local_storage_elem *selem)
+{
+ struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
+ struct bpf_iter__bpf_sk_storage_map ctx = {};
+ struct bpf_local_storage *sk_storage;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, selem == NULL);
+ if (prog) {
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (selem) {
+ sk_storage = rcu_dereference(selem->local_storage);
+ ctx.sk = sk_storage->owner;
+ ctx.value = SDATA(selem)->data;
+ }
+ ret = bpf_iter_run_prog(prog, &ctx);
+ }
+
+ return ret;
+}
+
+static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v)
+{
+ return __bpf_sk_storage_map_seq_show(seq, v);
+}
+
+static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
+{
+ if (!v)
+ (void)__bpf_sk_storage_map_seq_show(seq, v);
+ else
+ rcu_read_unlock();
+}
+
+static int bpf_iter_init_sk_storage_map(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data;
+
+ bpf_map_inc_with_uref(aux->map);
+ seq_info->map = aux->map;
+ return 0;
+}
+
+static void bpf_iter_fini_sk_storage_map(void *priv_data)
+{
+ struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data;
+
+ bpf_map_put_with_uref(seq_info->map);
+}
+
+static int bpf_iter_attach_map(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_map *map;
+ int err = -EINVAL;
+
+ if (!linfo->map.map_fd)
+ return -EBADF;
+
+ map = bpf_map_get_with_uref(linfo->map.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if (map->map_type != BPF_MAP_TYPE_SK_STORAGE)
+ goto put_map;
+
+ if (prog->aux->max_rdwr_access > map->value_size) {
+ err = -EACCES;
+ goto put_map;
+ }
+
+ aux->map = map;
+ return 0;
+
+put_map:
+ bpf_map_put_with_uref(map);
+ return err;
+}
+
+static void bpf_iter_detach_map(struct bpf_iter_aux_info *aux)
+{
+ bpf_map_put_with_uref(aux->map);
+}
+
+static const struct seq_operations bpf_sk_storage_map_seq_ops = {
+ .start = bpf_sk_storage_map_seq_start,
+ .next = bpf_sk_storage_map_seq_next,
+ .stop = bpf_sk_storage_map_seq_stop,
+ .show = bpf_sk_storage_map_seq_show,
+};
+
+static const struct bpf_iter_seq_info iter_seq_info = {
+ .seq_ops = &bpf_sk_storage_map_seq_ops,
+ .init_seq_private = bpf_iter_init_sk_storage_map,
+ .fini_seq_private = bpf_iter_fini_sk_storage_map,
+ .seq_priv_size = sizeof(struct bpf_iter_seq_sk_storage_map_info),
+};
+
+static struct bpf_iter_reg bpf_sk_storage_map_reg_info = {
+ .target = "bpf_sk_storage_map",
+ .attach_target = bpf_iter_attach_map,
+ .detach_target = bpf_iter_detach_map,
+ .show_fdinfo = bpf_iter_map_show_fdinfo,
+ .fill_link_info = bpf_iter_map_fill_link_info,
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__bpf_sk_storage_map, sk),
+ PTR_TO_BTF_ID_OR_NULL },
+ { offsetof(struct bpf_iter__bpf_sk_storage_map, value),
+ PTR_TO_BUF | PTR_MAYBE_NULL },
+ },
+ .seq_info = &iter_seq_info,
+};
+
+static int __init bpf_sk_storage_map_iter_init(void)
+{
+ bpf_sk_storage_map_reg_info.ctx_arg_info[0].btf_id =
+ btf_sock_ids[BTF_SOCK_TYPE_SOCK];
+ return bpf_iter_reg_target(&bpf_sk_storage_map_reg_info);
+}
+late_initcall(bpf_sk_storage_map_iter_init);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 9aac0d63d53e..c285c6465923 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -50,7 +50,9 @@
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
-#include <linux/uio.h>
+#include <linux/iov_iter.h>
+#include <linux/indirect_call_wrapper.h>
+#include <linux/crc32.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
@@ -61,6 +63,8 @@
#include <trace/events/skb.h>
#include <net/busy_poll.h>
+#include "devmem.h"
+
/*
* Is a socket 'connection oriented' ?
*/
@@ -82,7 +86,8 @@ static int receiver_wake_function(wait_queue_entry_t *wait, unsigned int mode, i
/*
* Wait for the last received packet to be different from skb
*/
-int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
+int __skb_wait_for_more_packets(struct sock *sk, struct sk_buff_head *queue,
+ int *err, long *timeo_p,
const struct sk_buff *skb)
{
int error;
@@ -95,7 +100,7 @@ int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
if (error)
goto out_err;
- if (sk->sk_receive_queue.prev != skb)
+ if (READ_ONCE(queue->prev) != skb)
goto out;
/* Socket shut down? */
@@ -160,12 +165,9 @@ done:
return skb;
}
-struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
- struct sk_buff_head *queue,
+struct sk_buff *__skb_try_recv_from_queue(struct sk_buff_head *queue,
unsigned int flags,
- void (*destructor)(struct sock *sk,
- struct sk_buff *skb),
- int *peeked, int *off, int *err,
+ int *off, int *err,
struct sk_buff **last)
{
bool peek_at_off = false;
@@ -192,12 +194,9 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
return NULL;
}
}
- *peeked = 1;
refcount_inc(&skb->users);
} else {
__skb_unlink(skb, queue);
- if (destructor)
- destructor(sk, skb);
}
*off = _off;
return skb;
@@ -208,9 +207,8 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
/**
* __skb_try_recv_datagram - Receive a datagram skbuff
* @sk: socket
+ * @queue: socket queue from which to receive
* @flags: MSG\_ flags
- * @destructor: invoked under the receive lock on successful dequeue
- * @peeked: returns non-zero if this packet has been seen before
* @off: an offset in bytes to peek skb from. Returns an offset
* within an skb where data actually starts
* @err: error code returned
@@ -241,13 +239,11 @@ struct sk_buff *__skb_try_recv_from_queue(struct sock *sk,
* quite explicitly by POSIX 1003.1g, don't change them without having
* the standard around please.
*/
-struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
- void (*destructor)(struct sock *sk,
- struct sk_buff *skb),
- int *peeked, int *off, int *err,
+struct sk_buff *__skb_try_recv_datagram(struct sock *sk,
+ struct sk_buff_head *queue,
+ unsigned int flags, int *off, int *err,
struct sk_buff **last)
{
- struct sk_buff_head *queue = &sk->sk_receive_queue;
struct sk_buff *skb;
unsigned long cpu_flags;
/*
@@ -258,7 +254,6 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
if (error)
goto no_packet;
- *peeked = 0;
do {
/* Again only user level code calls this function, so nothing
* interrupt level will suddenly eat the receive_queue.
@@ -267,8 +262,8 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
* However, this function was correct in any case. 8)
*/
spin_lock_irqsave(&queue->lock, cpu_flags);
- skb = __skb_try_recv_from_queue(sk, queue, flags, destructor,
- peeked, off, &error, last);
+ skb = __skb_try_recv_from_queue(queue, flags, off, &error,
+ last);
spin_unlock_irqrestore(&queue->lock, cpu_flags);
if (error)
goto no_packet;
@@ -279,7 +274,7 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
break;
sk_busy_loop(sk, flags & MSG_DONTWAIT);
- } while (!skb_queue_empty(&sk->sk_receive_queue));
+ } while (READ_ONCE(queue->prev) != *last);
error = -EAGAIN;
@@ -289,10 +284,9 @@ no_packet:
}
EXPORT_SYMBOL(__skb_try_recv_datagram);
-struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
- void (*destructor)(struct sock *sk,
- struct sk_buff *skb),
- int *peeked, int *off, int *err)
+struct sk_buff *__skb_recv_datagram(struct sock *sk,
+ struct sk_buff_head *sk_queue,
+ unsigned int flags, int *off, int *err)
{
struct sk_buff *skb, *last;
long timeo;
@@ -300,57 +294,37 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
- skb = __skb_try_recv_datagram(sk, flags, destructor, peeked,
- off, err, &last);
+ skb = __skb_try_recv_datagram(sk, sk_queue, flags, off, err,
+ &last);
if (skb)
return skb;
if (*err != -EAGAIN)
break;
} while (timeo &&
- !__skb_wait_for_more_packets(sk, err, &timeo, last));
+ !__skb_wait_for_more_packets(sk, sk_queue, err,
+ &timeo, last));
return NULL;
}
EXPORT_SYMBOL(__skb_recv_datagram);
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned int flags,
- int noblock, int *err)
+ int *err)
{
- int peeked, off = 0;
+ int off = 0;
- return __skb_recv_datagram(sk, flags | (noblock ? MSG_DONTWAIT : 0),
- NULL, &peeked, &off, err);
+ return __skb_recv_datagram(sk, &sk->sk_receive_queue, flags,
+ &off, err);
}
EXPORT_SYMBOL(skb_recv_datagram);
void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
consume_skb(skb);
- sk_mem_reclaim_partial(sk);
}
EXPORT_SYMBOL(skb_free_datagram);
-void __skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb, int len)
-{
- bool slow;
-
- if (!skb_unref(skb)) {
- sk_peek_offset_bwd(sk, len);
- return;
- }
-
- slow = lock_sock_fast(sk);
- sk_peek_offset_bwd(sk, len);
- skb_orphan(skb);
- sk_mem_reclaim_partial(sk);
- unlock_sock_fast(sk, slow);
-
- /* skb is now orphaned, can be freed outside of locked section */
- __kfree_skb(skb);
-}
-EXPORT_SYMBOL(__skb_free_datagram_locked);
-
int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
struct sk_buff *skb, unsigned int flags,
void (*destructor)(struct sock *sk,
@@ -371,7 +345,7 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
spin_unlock_bh(&sk_queue->lock);
}
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
return err;
}
EXPORT_SYMBOL(__sk_queue_drop_skb);
@@ -403,32 +377,30 @@ int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
NULL);
kfree_skb(skb);
- sk_mem_reclaim_partial(sk);
return err;
}
EXPORT_SYMBOL(skb_kill_datagram);
-/**
- * skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
- * @skb: buffer to copy
- * @offset: offset in the buffer to start copying from
- * @to: iovec iterator to copy to
- * @len: amount of data to copy from buffer to iovec
- */
-int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
- struct iov_iter *to, int len)
+INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr,
+ size_t bytes,
+ void *data __always_unused,
+ struct iov_iter *i));
+
+static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len, bool fault_short,
+ size_t (*cb)(const void *, size_t, void *,
+ struct iov_iter *), void *data)
{
int start = skb_headlen(skb);
int i, copy = start - offset, start_off = offset, n;
struct sk_buff *frag_iter;
- trace_skb_copy_datagram_iovec(skb, len);
-
/* Copy header. */
if (copy > 0) {
if (copy > len)
copy = len;
- n = copy_to_iter(skb->data + offset, copy, to);
+ n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ skb->data + offset, copy, data, to);
offset += n;
if (n != copy)
goto short_copy;
@@ -436,6 +408,9 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
return 0;
}
+ if (!skb_frags_readable(skb))
+ goto short_copy;
+
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
@@ -445,11 +420,23 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
if (copy > len)
copy = len;
- n = copy_page_to_iter(skb_frag_page(frag),
- frag->page_offset + offset -
- start, copy, to);
+
+ n = 0;
+ skb_frag_foreach_page(frag,
+ skb_frag_off(frag) + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_local_page(p);
+ n += INDIRECT_CALL_1(cb, simple_copy_to_iter,
+ vaddr + p_off, p_len, data, to);
+ kunmap_local(vaddr);
+ }
+
offset += n;
if (n != copy)
goto short_copy;
@@ -468,8 +455,8 @@ int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
if ((copy = end - offset) > 0) {
if (copy > len)
copy = len;
- if (skb_copy_datagram_iter(frag_iter, offset - start,
- to, copy))
+ if (__skb_datagram_iter(frag_iter, offset - start,
+ to, copy, fault_short, cb, data))
goto fault;
if ((len -= copy) == 0)
return 0;
@@ -490,11 +477,64 @@ fault:
return -EFAULT;
short_copy:
- if (iov_iter_count(to))
+ if (fault_short || iov_iter_count(to))
goto fault;
return 0;
}
+
+#ifdef CONFIG_NET_CRC32C
+static size_t crc32c_and_copy_to_iter(const void *addr, size_t bytes,
+ void *_crcp, struct iov_iter *i)
+{
+ u32 *crcp = _crcp;
+ size_t copied;
+
+ copied = copy_to_iter(addr, bytes, i);
+ *crcp = crc32c(*crcp, addr, copied);
+ return copied;
+}
+
+/**
+ * skb_copy_and_crc32c_datagram_iter - Copy datagram to an iovec iterator
+ * and update a CRC32C value.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: iovec iterator to copy to
+ * @len: amount of data to copy from buffer to iovec
+ * @crcp: pointer to CRC32C value to update
+ *
+ * Return: 0 on success, -EFAULT if there was a fault during copy.
+ */
+int skb_copy_and_crc32c_datagram_iter(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len, u32 *crcp)
+{
+ return __skb_datagram_iter(skb, offset, to, len, true,
+ crc32c_and_copy_to_iter, crcp);
+}
+EXPORT_SYMBOL(skb_copy_and_crc32c_datagram_iter);
+#endif /* CONFIG_NET_CRC32C */
+
+static size_t simple_copy_to_iter(const void *addr, size_t bytes,
+ void *data __always_unused, struct iov_iter *i)
+{
+ return copy_to_iter(addr, bytes, i);
+}
+
+/**
+ * skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: iovec iterator to copy to
+ * @len: amount of data to copy from buffer to iovec
+ */
+int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len)
+{
+ trace_skb_copy_datagram_iovec(skb, len);
+ return __skb_datagram_iter(skb, offset, to, len, false,
+ simple_copy_to_iter, NULL);
+}
EXPORT_SYMBOL(skb_copy_datagram_iter);
/**
@@ -539,7 +579,7 @@ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
if (copy > len)
copy = len;
copied = copy_page_from_iter(skb_frag_page(frag),
- frag->page_offset + offset - start,
+ skb_frag_off(frag) + offset - start,
copy, from);
if (copied != copy)
goto fault;
@@ -578,49 +618,157 @@ fault:
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);
-int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
- struct iov_iter *from, size_t length)
+int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset,
+ struct iov_iter *from, int len)
+{
+ struct iov_iter_state state;
+ int ret;
+
+ iov_iter_save_state(from, &state);
+ ret = skb_copy_datagram_from_iter(skb, offset, from, len);
+ if (ret)
+ iov_iter_restore(from, &state);
+ return ret;
+}
+EXPORT_SYMBOL(skb_copy_datagram_from_iter_full);
+
+int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
+ struct iov_iter *from, size_t length)
{
int frag = skb_shinfo(skb)->nr_frags;
+ if (!skb_frags_readable(skb))
+ return -EFAULT;
+
while (length && iov_iter_count(from)) {
+ struct page *head, *last_head = NULL;
struct page *pages[MAX_SKB_FRAGS];
+ int refs, order, n = 0;
size_t start;
ssize_t copied;
- unsigned long truesize;
- int n = 0;
if (frag == MAX_SKB_FRAGS)
return -EMSGSIZE;
- copied = iov_iter_get_pages(from, pages, length,
+ copied = iov_iter_get_pages2(from, pages, length,
MAX_SKB_FRAGS - frag, &start);
if (copied < 0)
return -EFAULT;
- iov_iter_advance(from, copied);
length -= copied;
- truesize = PAGE_ALIGN(copied + start);
skb->data_len += copied;
skb->len += copied;
- skb->truesize += truesize;
- if (sk && sk->sk_type == SOCK_STREAM) {
- sk->sk_wmem_queued += truesize;
- sk_mem_charge(sk, truesize);
- } else {
- refcount_add(truesize, &skb->sk->sk_wmem_alloc);
- }
- while (copied) {
+ skb->truesize += PAGE_ALIGN(copied + start);
+
+ head = compound_head(pages[n]);
+ order = compound_order(head);
+
+ for (refs = 0; copied != 0; start = 0) {
int size = min_t(int, copied, PAGE_SIZE - start);
- skb_fill_page_desc(skb, frag++, pages[n], start, size);
- start = 0;
+
+ if (pages[n] - head > (1UL << order) - 1) {
+ head = compound_head(pages[n]);
+ order = compound_order(head);
+ }
+
+ start += (pages[n] - head) << PAGE_SHIFT;
copied -= size;
n++;
+ if (frag) {
+ skb_frag_t *last = &skb_shinfo(skb)->frags[frag - 1];
+
+ if (head == skb_frag_page(last) &&
+ start == skb_frag_off(last) + skb_frag_size(last)) {
+ skb_frag_size_add(last, size);
+ /* This page was merged into the previous frag, so
+ * we must release a reference. Since a compound page's
+ * refcount is shared among many pages, batch the refcount
+ * adjustments to limit false sharing.
+ */
+ last_head = head;
+ refs++;
+ continue;
+ }
+ }
+ if (refs) {
+ page_ref_sub(last_head, refs);
+ refs = 0;
+ }
+ skb_fill_page_desc_noacc(skb, frag++, head, start, size);
}
+ if (refs)
+ page_ref_sub(last_head, refs);
}
return 0;
}
+
+static int
+zerocopy_fill_skb_from_devmem(struct sk_buff *skb, struct iov_iter *from,
+ int length,
+ struct net_devmem_dmabuf_binding *binding)
+{
+ int i = skb_shinfo(skb)->nr_frags;
+ size_t virt_addr, size, off;
+ struct net_iov *niov;
+
+ /* Devmem filling works by taking an IOVEC from the user where the
+ * iov_addrs are interpreted as an offset in bytes into the dma-buf to
+ * send from. We do not support other iter types.
+ */
+ if (iov_iter_type(from) != ITER_IOVEC &&
+ iov_iter_type(from) != ITER_UBUF)
+ return -EFAULT;
+
+ while (length && iov_iter_count(from)) {
+ if (i == MAX_SKB_FRAGS)
+ return -EMSGSIZE;
+
+ virt_addr = (size_t)iter_iov_addr(from);
+ niov = net_devmem_get_niov_at(binding, virt_addr, &off, &size);
+ if (!niov)
+ return -EFAULT;
+
+ size = min_t(size_t, size, length);
+ size = min_t(size_t, size, iter_iov_len(from));
+
+ get_netmem(net_iov_to_netmem(niov));
+ skb_add_rx_frag_netmem(skb, i, net_iov_to_netmem(niov), off,
+ size, PAGE_SIZE);
+ iov_iter_advance(from, size);
+ length -= size;
+ i++;
+ }
+
+ return 0;
+}
+
+int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb, struct iov_iter *from,
+ size_t length,
+ struct net_devmem_dmabuf_binding *binding)
+{
+ unsigned long orig_size = skb->truesize;
+ unsigned long truesize;
+ int ret;
+
+ if (msg && msg->msg_ubuf && msg->sg_from_iter)
+ ret = msg->sg_from_iter(skb, from, length);
+ else if (binding)
+ ret = zerocopy_fill_skb_from_devmem(skb, from, length, binding);
+ else
+ ret = zerocopy_fill_skb_from_iter(skb, from, length);
+
+ truesize = skb->truesize - orig_size;
+ if (sk && sk->sk_type == SOCK_STREAM) {
+ sk_wmem_queued_add(sk, truesize);
+ if (!skb_zcopy_pure(skb))
+ sk_mem_charge(sk, truesize);
+ } else {
+ refcount_add(truesize, &skb->sk->sk_wmem_alloc);
+ }
+ return ret;
+}
EXPORT_SYMBOL(__zerocopy_sg_from_iter);
/**
@@ -641,135 +789,88 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
if (skb_copy_datagram_from_iter(skb, 0, from, copy))
return -EFAULT;
- return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
+ return __zerocopy_sg_from_iter(NULL, NULL, skb, from, ~0U, NULL);
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);
-static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
- struct iov_iter *to, int len,
- __wsum *csump)
+static __always_inline
+size_t copy_to_user_iter_csum(void __user *iter_to, size_t progress,
+ size_t len, void *from, void *priv2)
{
- int start = skb_headlen(skb);
- int i, copy = start - offset, start_off = offset;
- struct sk_buff *frag_iter;
- int pos = 0;
- int n;
-
- /* Copy header. */
- if (copy > 0) {
- if (copy > len)
- copy = len;
- n = csum_and_copy_to_iter(skb->data + offset, copy, csump, to);
- offset += n;
- if (n != copy)
- goto fault;
- if ((len -= copy) == 0)
- return 0;
- pos = copy;
- }
-
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- int end;
- const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ __wsum next, *csum = priv2;
- WARN_ON(start > offset + len);
+ next = csum_and_copy_to_user(from + progress, iter_to, len);
+ *csum = csum_block_add(*csum, next, progress);
+ return next ? 0 : len;
+}
- end = start + skb_frag_size(frag);
- if ((copy = end - offset) > 0) {
- __wsum csum2 = 0;
- struct page *page = skb_frag_page(frag);
- u8 *vaddr = kmap(page);
+static __always_inline
+size_t memcpy_to_iter_csum(void *iter_to, size_t progress,
+ size_t len, void *from, void *priv2)
+{
+ __wsum *csum = priv2;
+ __wsum next = csum_partial_copy_nocheck(from + progress, iter_to, len);
- if (copy > len)
- copy = len;
- n = csum_and_copy_to_iter(vaddr + frag->page_offset +
- offset - start, copy,
- &csum2, to);
- kunmap(page);
- offset += n;
- if (n != copy)
- goto fault;
- *csump = csum_block_add(*csump, csum2, pos);
- if (!(len -= copy))
- return 0;
- pos += copy;
- }
- start = end;
- }
+ *csum = csum_block_add(*csum, next, progress);
+ return 0;
+}
- skb_walk_frags(skb, frag_iter) {
- int end;
+struct csum_state {
+ __wsum csum;
+ size_t off;
+};
- WARN_ON(start > offset + len);
+static size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *_csstate,
+ struct iov_iter *i)
+{
+ struct csum_state *csstate = _csstate;
+ __wsum sum;
- end = start + frag_iter->len;
- if ((copy = end - offset) > 0) {
- __wsum csum2 = 0;
- if (copy > len)
- copy = len;
- if (skb_copy_and_csum_datagram(frag_iter,
- offset - start,
- to, copy,
- &csum2))
- goto fault;
- *csump = csum_block_add(*csump, csum2, pos);
- if ((len -= copy) == 0)
- return 0;
- offset += copy;
- pos += copy;
- }
- start = end;
- }
- if (!len)
+ if (WARN_ON_ONCE(i->data_source))
return 0;
+ if (unlikely(iov_iter_is_discard(i))) {
+ // can't use csum_memcpy() for that one - data is not copied
+ csstate->csum = csum_block_add(csstate->csum,
+ csum_partial(addr, bytes, 0),
+ csstate->off);
+ csstate->off += bytes;
+ return bytes;
+ }
-fault:
- iov_iter_revert(to, offset - start_off);
- return -EFAULT;
-}
-
-__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
-{
- __sum16 sum;
+ sum = csum_shift(csstate->csum, csstate->off);
- sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
- if (likely(!sum)) {
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
- !skb->csum_complete_sw)
- netdev_rx_csum_fault(skb->dev);
- }
- if (!skb_shared(skb))
- skb->csum_valid = !sum;
- return sum;
+ bytes = iterate_and_advance2(i, bytes, (void *)addr, &sum,
+ copy_to_user_iter_csum,
+ memcpy_to_iter_csum);
+ csstate->csum = csum_shift(sum, csstate->off);
+ csstate->off += bytes;
+ return bytes;
}
-EXPORT_SYMBOL(__skb_checksum_complete_head);
-__sum16 __skb_checksum_complete(struct sk_buff *skb)
+/**
+ * skb_copy_and_csum_datagram - Copy datagram to an iovec iterator
+ * and update a checksum.
+ * @skb: buffer to copy
+ * @offset: offset in the buffer to start copying from
+ * @to: iovec iterator to copy to
+ * @len: amount of data to copy from buffer to iovec
+ * @csump: checksum pointer
+ */
+static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
+ struct iov_iter *to, int len,
+ __wsum *csump)
{
- __wsum csum;
- __sum16 sum;
+ struct csum_state csdata = { .csum = *csump };
+ int ret;
- csum = skb_checksum(skb, 0, skb->len, 0);
+ ret = __skb_datagram_iter(skb, offset, to, len, true,
+ csum_and_copy_to_iter, &csdata);
+ if (ret)
+ return ret;
- /* skb->csum holds pseudo checksum */
- sum = csum_fold(csum_add(skb->csum, csum));
- if (likely(!sum)) {
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
- !skb->csum_complete_sw)
- netdev_rx_csum_fault(skb->dev);
- }
-
- if (!skb_shared(skb)) {
- /* Save full packet checksum */
- skb->csum = csum;
- skb->ip_summed = CHECKSUM_COMPLETE;
- skb->csum_complete_sw = 1;
- skb->csum_valid = !sum;
- }
-
- return sum;
+ *csump = csdata.csum;
+ return 0;
}
-EXPORT_SYMBOL(__skb_checksum_complete);
/**
* skb_copy_and_csum_datagram_msg - Copy and checksum skb to user iovec.
@@ -808,8 +909,9 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb,
return -EINVAL;
}
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE))
- netdev_rx_csum_fault(skb->dev);
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(NULL, skb);
}
return 0;
fault:
@@ -818,48 +920,54 @@ fault:
EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg);
/**
- * datagram_poll - generic datagram poll
+ * datagram_poll_queue - same as datagram_poll, but on a specific receive
+ * queue
* @file: file struct
* @sock: socket
* @wait: poll table
+ * @rcv_queue: receive queue to poll
*
- * Datagram poll: Again totally generic. This also handles
- * sequenced packet sockets providing the socket receive queue
- * is only ever holding data ready to receive.
+ * Performs polling on the given receive queue, handling shutdown, error,
+ * and connection state. This is useful for protocols that deliver
+ * userspace-bound packets through a custom queue instead of
+ * sk->sk_receive_queue.
*
- * Note: when you *don't* use this routine for this protocol,
- * and you use a different write policy from sock_writeable()
- * then please supply your own write_space callback.
+ * Return: poll bitmask indicating the socket's current state
*/
-__poll_t datagram_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+__poll_t datagram_poll_queue(struct file *file, struct socket *sock,
+ poll_table *wait, struct sk_buff_head *rcv_queue)
{
struct sock *sk = sock->sk;
__poll_t mask;
+ u8 shutdown;
- sock_poll_wait(file, wait);
+ sock_poll_wait(file, sock, wait);
mask = 0;
/* exceptional events? */
- if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ if (READ_ONCE(sk->sk_err) ||
+ !skb_queue_empty_lockless(&sk->sk_error_queue))
mask |= EPOLLERR |
(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
- if (sk->sk_shutdown & RCV_SHUTDOWN)
+ shutdown = READ_ONCE(sk->sk_shutdown);
+ if (shutdown & RCV_SHUTDOWN)
mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
- if (sk->sk_shutdown == SHUTDOWN_MASK)
+ if (shutdown == SHUTDOWN_MASK)
mask |= EPOLLHUP;
/* readable? */
- if (!skb_queue_empty(&sk->sk_receive_queue))
+ if (!skb_queue_empty_lockless(rcv_queue))
mask |= EPOLLIN | EPOLLRDNORM;
/* Connection-based need to check for termination and startup */
if (connection_based(sk)) {
- if (sk->sk_state == TCP_CLOSE)
+ int state = READ_ONCE(sk->sk_state);
+
+ if (state == TCP_CLOSE)
mask |= EPOLLHUP;
/* connection hasn't started yet? */
- if (sk->sk_state == TCP_SYN_SENT)
+ if (state == TCP_SYN_SENT)
return mask;
}
@@ -871,4 +979,27 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
return mask;
}
+EXPORT_SYMBOL(datagram_poll_queue);
+
+/**
+ * datagram_poll - generic datagram poll
+ * @file: file struct
+ * @sock: socket
+ * @wait: poll table
+ *
+ * Datagram poll: Again totally generic. This also handles
+ * sequenced packet sockets providing the socket receive queue
+ * is only ever holding data ready to receive.
+ *
+ * Note: when you *don't* use this routine for this protocol,
+ * and you use a different write policy from sock_writeable()
+ * then please supply your own write_space callback.
+ *
+ * Return: poll bitmask indicating the socket's current state
+ */
+__poll_t datagram_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+ return datagram_poll_queue(file, sock, wait,
+ &sock->sk->sk_receive_queue);
+}
EXPORT_SYMBOL(datagram_poll);
diff --git a/net/core/dev.c b/net/core/dev.c
index 82114e1111e6..9094c0fb8c68 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NET3 Protocol independent device support routines.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Derived from the non IP parts of dev.c 1.0.19
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -73,7 +69,7 @@
*/
#include <linux/uaccess.h>
-#include <linux/bitops.h>
+#include <linux/bitmap.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
@@ -81,8 +77,11 @@
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
+#include <linux/sched/isolation.h>
#include <linux/sched/mm.h>
+#include <linux/smpboot.h>
#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
@@ -93,7 +92,9 @@
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
+#include <linux/ethtool_netlink.h>
#include <linux/skbuff.h>
+#include <linux/kthread.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <net/net_namespace.h>
@@ -101,12 +102,16 @@
#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
+#include <net/dsa.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
+#include <net/gro.h>
+#include <net/netdev_queues.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
#include <net/xfrm.h>
+#include <net/tcx.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -131,7 +136,8 @@
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
-#include <linux/pci.h>
+#include <trace/events/qdisc.h>
+#include <trace/events/xdp.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
@@ -140,51 +146,35 @@
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
-#include <linux/netfilter_ingress.h>
+#include <linux/netfilter_netdev.h>
#include <linux/crash_dump.h>
#include <linux/sctp.h>
#include <net/udp_tunnel.h>
#include <linux/net_namespace.h>
-
+#include <linux/indirect_call_wrapper.h>
+#include <net/devlink.h>
+#include <linux/pm_runtime.h>
+#include <linux/prandom.h>
+#include <linux/once_lite.h>
+#include <net/netdev_lock.h>
+#include <net/netdev_rx_queue.h>
+#include <net/page_pool/types.h>
+#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
+#include <net/rps.h>
+#include <linux/phy_link_topology.h>
+
+#include "dev.h"
+#include "devmem.h"
#include "net-sysfs.h"
-#define MAX_GRO_SKBS 8
-
-/* This should be increased if a protocol with a bigger head is added. */
-#define GRO_MAX_HEAD (MAX_HEADER + 128)
-
static DEFINE_SPINLOCK(ptype_lock);
-static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
-struct list_head ptype_all __read_mostly; /* Taps */
-static struct list_head offload_base __read_mostly;
static int netif_rx_internal(struct sk_buff *skb);
-static int call_netdevice_notifiers_info(unsigned long val,
- struct netdev_notifier_info *info);
-static struct napi_struct *napi_by_id(unsigned int napi_id);
-
-/*
- * The @dev_base_head list is protected by @dev_base_lock and the rtnl
- * semaphore.
- *
- * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
- *
- * Writers must hold the rtnl semaphore while they loop through the
- * dev_base_head list, and hold dev_base_lock for writing when they do the
- * actual updates. This allows pure readers to access the list even
- * while a writer is preparing to update it.
- *
- * To put it another way, dev_base_lock is held for writing only to
- * protect against pure readers; the rtnl semaphore provides the
- * protection against other writers.
- *
- * See, for example usages, register_netdevice() and
- * unregister_netdevice(), which must be called with the rtnl
- * semaphore held.
- */
-DEFINE_RWLOCK(dev_base_lock);
-EXPORT_SYMBOL(dev_base_lock);
+static int call_netdevice_notifiers_extack(unsigned long val,
+ struct net_device *dev,
+ struct netlink_ext_ack *extack);
static DEFINE_MUTEX(ifalias_mutex);
@@ -194,12 +184,11 @@ static DEFINE_SPINLOCK(napi_hash_lock);
static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
-static seqcount_t devnet_rename_seq;
-
static inline void dev_base_seq_inc(struct net *net)
{
- while (++net->dev_base_seq == 0)
- ;
+ unsigned int val = net->dev_base_seq + 1;
+
+ WRITE_ONCE(net->dev_base_seq, val ?: 1);
}
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
@@ -214,33 +203,218 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}
-static inline void rps_lock(struct softnet_data *sd)
+#ifndef CONFIG_PREEMPT_RT
+
+static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
+
+static int __init setup_backlog_napi_threads(char *arg)
+{
+ static_branch_enable(&use_backlog_threads_key);
+ return 0;
+}
+early_param("thread_backlog_napi", setup_backlog_napi_threads);
+
+static bool use_backlog_threads(void)
{
-#ifdef CONFIG_RPS
- spin_lock(&sd->input_pkt_queue.lock);
-#endif
+ return static_branch_unlikely(&use_backlog_threads_key);
}
-static inline void rps_unlock(struct softnet_data *sd)
+#else
+
+static bool use_backlog_threads(void)
{
-#ifdef CONFIG_RPS
- spin_unlock(&sd->input_pkt_queue.lock);
+ return true;
+}
+
#endif
+
+static inline void backlog_lock_irq_save(struct softnet_data *sd,
+ unsigned long *flags)
+{
+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
+ spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
+ else
+ local_irq_save(*flags);
+}
+
+static inline void backlog_lock_irq_disable(struct softnet_data *sd)
+{
+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
+ spin_lock_irq(&sd->input_pkt_queue.lock);
+ else
+ local_irq_disable();
+}
+
+static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
+ unsigned long *flags)
+{
+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
+ spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
+ else
+ local_irq_restore(*flags);
+}
+
+static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
+{
+ if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
+ spin_unlock_irq(&sd->input_pkt_queue.lock);
+ else
+ local_irq_enable();
+}
+
+static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
+ const char *name)
+{
+ struct netdev_name_node *name_node;
+
+ name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
+ if (!name_node)
+ return NULL;
+ INIT_HLIST_NODE(&name_node->hlist);
+ name_node->dev = dev;
+ name_node->name = name;
+ return name_node;
+}
+
+static struct netdev_name_node *
+netdev_name_node_head_alloc(struct net_device *dev)
+{
+ struct netdev_name_node *name_node;
+
+ name_node = netdev_name_node_alloc(dev, dev->name);
+ if (!name_node)
+ return NULL;
+ INIT_LIST_HEAD(&name_node->list);
+ return name_node;
+}
+
+static void netdev_name_node_free(struct netdev_name_node *name_node)
+{
+ kfree(name_node);
+}
+
+static void netdev_name_node_add(struct net *net,
+ struct netdev_name_node *name_node)
+{
+ hlist_add_head_rcu(&name_node->hlist,
+ dev_name_hash(net, name_node->name));
+}
+
+static void netdev_name_node_del(struct netdev_name_node *name_node)
+{
+ hlist_del_rcu(&name_node->hlist);
+}
+
+static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
+ const char *name)
+{
+ struct hlist_head *head = dev_name_hash(net, name);
+ struct netdev_name_node *name_node;
+
+ hlist_for_each_entry(name_node, head, hlist)
+ if (!strcmp(name_node->name, name))
+ return name_node;
+ return NULL;
+}
+
+static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
+ const char *name)
+{
+ struct hlist_head *head = dev_name_hash(net, name);
+ struct netdev_name_node *name_node;
+
+ hlist_for_each_entry_rcu(name_node, head, hlist)
+ if (!strcmp(name_node->name, name))
+ return name_node;
+ return NULL;
+}
+
+bool netdev_name_in_use(struct net *net, const char *name)
+{
+ return netdev_name_node_lookup(net, name);
+}
+EXPORT_SYMBOL(netdev_name_in_use);
+
+int netdev_name_node_alt_create(struct net_device *dev, const char *name)
+{
+ struct netdev_name_node *name_node;
+ struct net *net = dev_net(dev);
+
+ name_node = netdev_name_node_lookup(net, name);
+ if (name_node)
+ return -EEXIST;
+ name_node = netdev_name_node_alloc(dev, name);
+ if (!name_node)
+ return -ENOMEM;
+ netdev_name_node_add(net, name_node);
+ /* The node that holds dev->name acts as a head of per-device list. */
+ list_add_tail_rcu(&name_node->list, &dev->name_node->list);
+
+ return 0;
+}
+
+static void netdev_name_node_alt_free(struct rcu_head *head)
+{
+ struct netdev_name_node *name_node =
+ container_of(head, struct netdev_name_node, rcu);
+
+ kfree(name_node->name);
+ netdev_name_node_free(name_node);
+}
+
+static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
+{
+ netdev_name_node_del(name_node);
+ list_del(&name_node->list);
+ call_rcu(&name_node->rcu, netdev_name_node_alt_free);
+}
+
+int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
+{
+ struct netdev_name_node *name_node;
+ struct net *net = dev_net(dev);
+
+ name_node = netdev_name_node_lookup(net, name);
+ if (!name_node)
+ return -ENOENT;
+ /* lookup might have found our primary name or a name belonging
+ * to another device.
+ */
+ if (name_node == dev->name_node || name_node->dev != dev)
+ return -EINVAL;
+
+ __netdev_name_node_alt_destroy(name_node);
+ return 0;
+}
+
+static void netdev_name_node_alt_flush(struct net_device *dev)
+{
+ struct netdev_name_node *name_node, *tmp;
+
+ list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) {
+ list_del(&name_node->list);
+ netdev_name_node_alt_free(&name_node->rcu);
+ }
}
/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
+ struct netdev_name_node *name_node;
struct net *net = dev_net(dev);
ASSERT_RTNL();
- write_lock_bh(&dev_base_lock);
list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
- hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
+ netdev_name_node_add(net, dev->name_node);
hlist_add_head_rcu(&dev->index_hlist,
dev_index_hash(net, dev->ifindex));
- write_unlock_bh(&dev_base_lock);
+
+ netdev_for_each_altname(dev, name_node)
+ netdev_name_node_add(net, name_node);
+
+ /* We reserved the ifindex, this can't fail */
+ WARN_ON(xa_store(&net->dev_by_index, dev->ifindex, dev, GFP_KERNEL));
dev_base_seq_inc(net);
}
@@ -250,14 +424,20 @@ static void list_netdevice(struct net_device *dev)
*/
static void unlist_netdevice(struct net_device *dev)
{
+ struct netdev_name_node *name_node;
+ struct net *net = dev_net(dev);
+
ASSERT_RTNL();
+ xa_erase(&net->dev_by_index, dev->ifindex);
+
+ netdev_for_each_altname(dev, name_node)
+ netdev_name_node_del(name_node);
+
/* Unlink dev from the device chain */
- write_lock_bh(&dev_base_lock);
list_del_rcu(&dev->dev_list);
- hlist_del_rcu(&dev->name_hlist);
+ netdev_name_node_del(dev->name_node);
hlist_del_rcu(&dev->index_hlist);
- write_unlock_bh(&dev_base_lock);
dev_base_seq_inc(dev_net(dev));
}
@@ -273,9 +453,19 @@ static RAW_NOTIFIER_HEAD(netdev_chain);
* queue in the local softnet handler.
*/
-DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
+DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data) = {
+ .process_queue_bh_lock = INIT_LOCAL_LOCK(process_queue_bh_lock),
+};
EXPORT_PER_CPU_SYMBOL(softnet_data);
+/* Page_pool has a lockless array/stack to alloc/recycle pages.
+ * PP consumers must take care to call its APIs in the appropriate context
+ * (e.g. NAPI context).
+ */
+DEFINE_PER_CPU(struct page_pool_bh, system_page_pool) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
+
#ifdef CONFIG_LOCKDEP
/*
* register_netdevice() inits txq->_xmit_lock and sets lockdep class
@@ -353,6 +543,7 @@ static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
unsigned short dev_type)
{
}
+
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
@@ -383,10 +574,18 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
static inline struct list_head *ptype_head(const struct packet_type *pt)
{
- if (pt->type == htons(ETH_P_ALL))
- return pt->dev ? &pt->dev->ptype_all : &ptype_all;
- else
- return pt->dev ? &pt->dev->ptype_specific :
+ if (pt->type == htons(ETH_P_ALL)) {
+ if (!pt->af_packet_net && !pt->dev)
+ return NULL;
+
+ return pt->dev ? &pt->dev->ptype_all :
+ &pt->af_packet_net->ptype_all;
+ }
+
+ if (pt->dev)
+ return &pt->dev->ptype_specific;
+
+ return pt->af_packet_net ? &pt->af_packet_net->ptype_specific :
&ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}
@@ -407,6 +606,9 @@ void dev_add_pack(struct packet_type *pt)
{
struct list_head *head = ptype_head(pt);
+ if (WARN_ON_ONCE(!head))
+ return;
+
spin_lock(&ptype_lock);
list_add_rcu(&pt->list, head);
spin_unlock(&ptype_lock);
@@ -431,6 +633,9 @@ void __dev_remove_pack(struct packet_type *pt)
struct list_head *head = ptype_head(pt);
struct packet_type *pt1;
+ if (!head)
+ return;
+
spin_lock(&ptype_lock);
list_for_each_entry(pt1, head, list) {
@@ -467,280 +672,197 @@ void dev_remove_pack(struct packet_type *pt)
EXPORT_SYMBOL(dev_remove_pack);
-/**
- * dev_add_offload - register offload handlers
- * @po: protocol offload declaration
+/*******************************************************************************
*
- * Add protocol offload handlers to the networking stack. The passed
- * &proto_offload is linked into kernel lists and may not be freed until
- * it has been removed from the kernel lists.
+ * Device Interface Subroutines
*
- * This call does not sleep therefore it can not
- * guarantee all CPU's that are in middle of receiving packets
- * will see the new offload handlers (until the next received packet).
- */
-void dev_add_offload(struct packet_offload *po)
-{
- struct packet_offload *elem;
-
- spin_lock(&offload_lock);
- list_for_each_entry(elem, &offload_base, list) {
- if (po->priority < elem->priority)
- break;
- }
- list_add_rcu(&po->list, elem->list.prev);
- spin_unlock(&offload_lock);
-}
-EXPORT_SYMBOL(dev_add_offload);
+ *******************************************************************************/
/**
- * __dev_remove_offload - remove offload handler
- * @po: packet offload declaration
- *
- * Remove a protocol offload handler that was previously added to the
- * kernel offload handlers by dev_add_offload(). The passed &offload_type
- * is removed from the kernel lists and can be freed or reused once this
- * function returns.
+ * dev_get_iflink - get 'iflink' value of an interface
+ * @dev: targeted interface
*
- * The packet type might still be in use by receivers
- * and must not be freed until after all the CPU's have gone
- * through a quiescent state.
+ * Indicates the ifindex the interface is linked to.
+ * Physical interfaces have the same 'ifindex' and 'iflink' values.
*/
-static void __dev_remove_offload(struct packet_offload *po)
-{
- struct list_head *head = &offload_base;
- struct packet_offload *po1;
- spin_lock(&offload_lock);
-
- list_for_each_entry(po1, head, list) {
- if (po == po1) {
- list_del_rcu(&po->list);
- goto out;
- }
- }
+int dev_get_iflink(const struct net_device *dev)
+{
+ if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
+ return dev->netdev_ops->ndo_get_iflink(dev);
- pr_warn("dev_remove_offload: %p not found\n", po);
-out:
- spin_unlock(&offload_lock);
+ return READ_ONCE(dev->ifindex);
}
+EXPORT_SYMBOL(dev_get_iflink);
/**
- * dev_remove_offload - remove packet offload handler
- * @po: packet offload declaration
- *
- * Remove a packet offload handler that was previously added to the kernel
- * offload handlers by dev_add_offload(). The passed &offload_type is
- * removed from the kernel lists and can be freed or reused once this
- * function returns.
+ * dev_fill_metadata_dst - Retrieve tunnel egress information.
+ * @dev: targeted interface
+ * @skb: The packet.
*
- * This call sleeps to guarantee that no CPU is looking at the packet
- * type after return.
+ * For better visibility of tunnel traffic OVS needs to retrieve
+ * egress tunnel information for a packet. Following API allows
+ * user to get this info.
*/
-void dev_remove_offload(struct packet_offload *po)
+int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
- __dev_remove_offload(po);
+ struct ip_tunnel_info *info;
- synchronize_net();
-}
-EXPORT_SYMBOL(dev_remove_offload);
+ if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
+ return -EINVAL;
-/******************************************************************************
- *
- * Device Boot-time Settings Routines
- *
- ******************************************************************************/
+ info = skb_tunnel_info_unclone(skb);
+ if (!info)
+ return -ENOMEM;
+ if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
+ return -EINVAL;
-/* Boot time configuration table */
-static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
+ return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
+}
+EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
-/**
- * netdev_boot_setup_add - add new setup entry
- * @name: name of the device
- * @map: configured settings for the device
- *
- * Adds new setup entry to the dev_boot_setup list. The function
- * returns 0 on error and 1 on success. This is a generic routine to
- * all netdevices.
- */
-static int netdev_boot_setup_add(char *name, struct ifmap *map)
+static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
{
- struct netdev_boot_setup *s;
- int i;
+ int k = stack->num_paths++;
- s = dev_boot_setup;
- for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
- if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
- memset(s[i].name, 0, sizeof(s[i].name));
- strlcpy(s[i].name, name, IFNAMSIZ);
- memcpy(&s[i].map, map, sizeof(s[i].map));
- break;
- }
- }
+ if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
+ return NULL;
- return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
+ return &stack->path[k];
}
-/**
- * netdev_boot_setup_check - check boot time settings
- * @dev: the netdevice
- *
- * Check boot time settings for the device.
- * The found settings are set for the device to be used
- * later in the device probing.
- * Returns 0 if no settings found, 1 if they are.
- */
-int netdev_boot_setup_check(struct net_device *dev)
+int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
+ struct net_device_path_stack *stack)
{
- struct netdev_boot_setup *s = dev_boot_setup;
- int i;
+ const struct net_device *last_dev;
+ struct net_device_path_ctx ctx = {
+ .dev = dev,
+ };
+ struct net_device_path *path;
+ int ret = 0;
- for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
- if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
- !strcmp(dev->name, s[i].name)) {
- dev->irq = s[i].map.irq;
- dev->base_addr = s[i].map.base_addr;
- dev->mem_start = s[i].map.mem_start;
- dev->mem_end = s[i].map.mem_end;
- return 1;
- }
- }
- return 0;
-}
-EXPORT_SYMBOL(netdev_boot_setup_check);
+ memcpy(ctx.daddr, daddr, sizeof(ctx.daddr));
+ stack->num_paths = 0;
+ while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
+ last_dev = ctx.dev;
+ path = dev_fwd_path(stack);
+ if (!path)
+ return -1;
+ memset(path, 0, sizeof(struct net_device_path));
+ ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
+ if (ret < 0)
+ return -1;
-/**
- * netdev_boot_base - get address from boot time settings
- * @prefix: prefix for network device
- * @unit: id for network device
- *
- * Check boot time settings for the base address of device.
- * The found settings are set for the device to be used
- * later in the device probing.
- * Returns 0 if no settings found.
- */
-unsigned long netdev_boot_base(const char *prefix, int unit)
-{
- const struct netdev_boot_setup *s = dev_boot_setup;
- char name[IFNAMSIZ];
- int i;
+ if (WARN_ON_ONCE(last_dev == ctx.dev))
+ return -1;
+ }
- sprintf(name, "%s%d", prefix, unit);
+ if (!ctx.dev)
+ return ret;
- /*
- * If device already registered then return base of 1
- * to indicate not to probe for this interface
- */
- if (__dev_get_by_name(&init_net, name))
- return 1;
+ path = dev_fwd_path(stack);
+ if (!path)
+ return -1;
+ path->type = DEV_PATH_ETHERNET;
+ path->dev = ctx.dev;
- for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
- if (!strcmp(name, s[i].name))
- return s[i].map.base_addr;
- return 0;
+ return ret;
}
+EXPORT_SYMBOL_GPL(dev_fill_forward_path);
-/*
- * Saves at boot time configured settings for any netdevice.
- */
-int __init netdev_boot_setup(char *str)
+/* must be called under rcu_read_lock(), as we don't take a reference */
+static struct napi_struct *napi_by_id(unsigned int napi_id)
{
- int ints[5];
- struct ifmap map;
-
- str = get_options(str, ARRAY_SIZE(ints), ints);
- if (!str || !*str)
- return 0;
+ unsigned int hash = napi_id % HASH_SIZE(napi_hash);
+ struct napi_struct *napi;
- /* Save settings */
- memset(&map, 0, sizeof(map));
- if (ints[0] > 0)
- map.irq = ints[1];
- if (ints[0] > 1)
- map.base_addr = ints[2];
- if (ints[0] > 2)
- map.mem_start = ints[3];
- if (ints[0] > 3)
- map.mem_end = ints[4];
+ hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
+ if (napi->napi_id == napi_id)
+ return napi;
- /* Add new entry to the list */
- return netdev_boot_setup_add(str, &map);
+ return NULL;
}
-__setup("netdev=", netdev_boot_setup);
-
-/*******************************************************************************
- *
- * Device Interface Subroutines
- *
- *******************************************************************************/
+/* must be called under rcu_read_lock(), as we don't take a reference */
+static struct napi_struct *
+netdev_napi_by_id(struct net *net, unsigned int napi_id)
+{
+ struct napi_struct *napi;
-/**
- * dev_get_iflink - get 'iflink' value of a interface
- * @dev: targeted interface
- *
- * Indicates the ifindex the interface is linked to.
- * Physical interfaces have the same 'ifindex' and 'iflink' values.
- */
+ napi = napi_by_id(napi_id);
+ if (!napi)
+ return NULL;
-int dev_get_iflink(const struct net_device *dev)
-{
- if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
- return dev->netdev_ops->ndo_get_iflink(dev);
+ if (WARN_ON_ONCE(!napi->dev))
+ return NULL;
+ if (!net_eq(net, dev_net(napi->dev)))
+ return NULL;
- return dev->ifindex;
+ return napi;
}
-EXPORT_SYMBOL(dev_get_iflink);
/**
- * dev_fill_metadata_dst - Retrieve tunnel egress information.
- * @dev: targeted interface
- * @skb: The packet.
+ * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it
+ * @net: the applicable net namespace
+ * @napi_id: ID of a NAPI of a target device
*
- * For better visibility of tunnel traffic OVS needs to retrieve
- * egress tunnel information for a packet. Following API allows
- * user to get this info.
+ * Find a NAPI instance with @napi_id. Lock its device.
+ * The device must be in %NETREG_REGISTERED state for lookup to succeed.
+ * netdev_unlock() must be called to release it.
+ *
+ * Return: pointer to the NAPI with its device's lock held, NULL if not found.
*/
-int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
+struct napi_struct *
+netdev_napi_by_id_lock(struct net *net, unsigned int napi_id)
{
- struct ip_tunnel_info *info;
+ struct napi_struct *napi;
+ struct net_device *dev;
- if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
- return -EINVAL;
+ rcu_read_lock();
+ napi = netdev_napi_by_id(net, napi_id);
+ if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) {
+ rcu_read_unlock();
+ return NULL;
+ }
- info = skb_tunnel_info_unclone(skb);
- if (!info)
- return -ENOMEM;
- if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
- return -EINVAL;
+ dev = napi->dev;
+ dev_hold(dev);
+ rcu_read_unlock();
- return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
+ dev = __netdev_put_lock(dev, net);
+ if (!dev)
+ return NULL;
+
+ rcu_read_lock();
+ napi = netdev_napi_by_id(net, napi_id);
+ if (napi && napi->dev != dev)
+ napi = NULL;
+ rcu_read_unlock();
+
+ if (!napi)
+ netdev_unlock(dev);
+ return napi;
}
-EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
/**
* __dev_get_by_name - find a device by its name
* @net: the applicable net namespace
* @name: name to find
*
- * Find an interface by name. Must be called under RTNL semaphore
- * or @dev_base_lock. If the name is found a pointer to the device
- * is returned. If the name is not found then %NULL is returned. The
+ * Find an interface by name. Must be called under RTNL semaphore.
+ * If the name is found a pointer to the device is returned.
+ * If the name is not found then %NULL is returned. The
* reference counters are not incremented so the caller must be
* careful with locks.
*/
struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
- struct net_device *dev;
- struct hlist_head *head = dev_name_hash(net, name);
+ struct netdev_name_node *node_name;
- hlist_for_each_entry(dev, head, name_hlist)
- if (!strncmp(dev->name, name, IFNAMSIZ))
- return dev;
-
- return NULL;
+ node_name = netdev_name_node_lookup(net, name);
+ return node_name ? node_name->dev : NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);
@@ -758,41 +880,50 @@ EXPORT_SYMBOL(__dev_get_by_name);
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
- struct net_device *dev;
- struct hlist_head *head = dev_name_hash(net, name);
-
- hlist_for_each_entry_rcu(dev, head, name_hlist)
- if (!strncmp(dev->name, name, IFNAMSIZ))
- return dev;
+ struct netdev_name_node *node_name;
- return NULL;
+ node_name = netdev_name_node_lookup_rcu(net, name);
+ return node_name ? node_name->dev : NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);
+/* Deprecated for new users, call netdev_get_by_name() instead */
+struct net_device *dev_get_by_name(struct net *net, const char *name)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_name_rcu(net, name);
+ dev_hold(dev);
+ rcu_read_unlock();
+ return dev;
+}
+EXPORT_SYMBOL(dev_get_by_name);
+
/**
- * dev_get_by_name - find a device by its name
+ * netdev_get_by_name() - find a device by its name
* @net: the applicable net namespace
* @name: name to find
+ * @tracker: tracking object for the acquired reference
+ * @gfp: allocation flags for the tracker
*
* Find an interface by name. This can be called from any
* context and does its own locking. The returned handle has
- * the usage count incremented and the caller must use dev_put() to
+ * the usage count incremented and the caller must use netdev_put() to
* release it when it is no longer needed. %NULL is returned if no
* matching device is found.
*/
-
-struct net_device *dev_get_by_name(struct net *net, const char *name)
+struct net_device *netdev_get_by_name(struct net *net, const char *name,
+ netdevice_tracker *tracker, gfp_t gfp)
{
struct net_device *dev;
- rcu_read_lock();
- dev = dev_get_by_name_rcu(net, name);
+ dev = dev_get_by_name(net, name);
if (dev)
- dev_hold(dev);
- rcu_read_unlock();
+ netdev_tracker_alloc(dev, tracker, gfp);
return dev;
}
-EXPORT_SYMBOL(dev_get_by_name);
+EXPORT_SYMBOL(netdev_get_by_name);
/**
* __dev_get_by_index - find a device by its ifindex
@@ -802,8 +933,7 @@ EXPORT_SYMBOL(dev_get_by_name);
* Search for an interface by index. Returns %NULL if the device
* is not found or a pointer to the device. The device has not
* had its reference counter increased so the caller must be careful
- * about locking. The caller must hold either the RTNL semaphore
- * or @dev_base_lock.
+ * about locking. The caller must hold the RTNL semaphore.
*/
struct net_device *__dev_get_by_index(struct net *net, int ifindex)
@@ -843,30 +973,42 @@ struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
}
EXPORT_SYMBOL(dev_get_by_index_rcu);
+/* Deprecated for new users, call netdev_get_by_index() instead */
+struct net_device *dev_get_by_index(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
+ dev_hold(dev);
+ rcu_read_unlock();
+ return dev;
+}
+EXPORT_SYMBOL(dev_get_by_index);
/**
- * dev_get_by_index - find a device by its ifindex
+ * netdev_get_by_index() - find a device by its ifindex
* @net: the applicable net namespace
* @ifindex: index of device
+ * @tracker: tracking object for the acquired reference
+ * @gfp: allocation flags for the tracker
*
* Search for an interface by index. Returns NULL if the device
* is not found or a pointer to the device. The device returned has
* had a reference added and the pointer is safe until the user calls
- * dev_put to indicate they have finished with it.
+ * netdev_put() to indicate they have finished with it.
*/
-
-struct net_device *dev_get_by_index(struct net *net, int ifindex)
+struct net_device *netdev_get_by_index(struct net *net, int ifindex,
+ netdevice_tracker *tracker, gfp_t gfp)
{
struct net_device *dev;
- rcu_read_lock();
- dev = dev_get_by_index_rcu(net, ifindex);
+ dev = dev_get_by_index(net, ifindex);
if (dev)
- dev_hold(dev);
- rcu_read_unlock();
+ netdev_tracker_alloc(dev, tracker, gfp);
return dev;
}
-EXPORT_SYMBOL(dev_get_by_index);
+EXPORT_SYMBOL(netdev_get_by_index);
/**
* dev_get_by_napi_id - find a device by napi_id
@@ -877,54 +1019,183 @@ EXPORT_SYMBOL(dev_get_by_index);
* its reference counter increased so the caller must be careful
* about locking. The caller must hold RCU lock.
*/
-
struct net_device *dev_get_by_napi_id(unsigned int napi_id)
{
struct napi_struct *napi;
WARN_ON_ONCE(!rcu_read_lock_held());
- if (napi_id < MIN_NAPI_ID)
+ if (!napi_id_valid(napi_id))
return NULL;
napi = napi_by_id(napi_id);
return napi ? napi->dev : NULL;
}
-EXPORT_SYMBOL(dev_get_by_napi_id);
+
+/* Release the held reference on the net_device, and if the net_device
+ * is still registered try to lock the instance lock. If device is being
+ * unregistered NULL will be returned (but the reference has been released,
+ * either way!)
+ *
+ * This helper is intended for locking net_device after it has been looked up
+ * using a lockless lookup helper. Lock prevents the instance from going away.
+ */
+struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net)
+{
+ netdev_lock(dev);
+ if (dev->reg_state > NETREG_REGISTERED ||
+ dev->moving_ns || !net_eq(dev_net(dev), net)) {
+ netdev_unlock(dev);
+ dev_put(dev);
+ return NULL;
+ }
+ dev_put(dev);
+ return dev;
+}
+
+static struct net_device *
+__netdev_put_lock_ops_compat(struct net_device *dev, struct net *net)
+{
+ netdev_lock_ops_compat(dev);
+ if (dev->reg_state > NETREG_REGISTERED ||
+ dev->moving_ns || !net_eq(dev_net(dev), net)) {
+ netdev_unlock_ops_compat(dev);
+ dev_put(dev);
+ return NULL;
+ }
+ dev_put(dev);
+ return dev;
+}
+
+/**
+ * netdev_get_by_index_lock() - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. If a valid device
+ * with @ifindex is found it will be returned with netdev->lock held.
+ * netdev_unlock() must be called to release it.
+ *
+ * Return: pointer to a device with lock held, NULL if not found.
+ */
+struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return NULL;
+
+ return __netdev_put_lock(dev, net);
+}
+
+struct net_device *
+netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return NULL;
+
+ return __netdev_put_lock_ops_compat(dev, net);
+}
+
+struct net_device *
+netdev_xa_find_lock(struct net *net, struct net_device *dev,
+ unsigned long *index)
+{
+ if (dev)
+ netdev_unlock(dev);
+
+ do {
+ rcu_read_lock();
+ dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
+ if (!dev) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock(dev, net);
+ if (dev)
+ return dev;
+
+ (*index)++;
+ } while (true);
+}
+
+struct net_device *
+netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev,
+ unsigned long *index)
+{
+ if (dev)
+ netdev_unlock_ops_compat(dev);
+
+ do {
+ rcu_read_lock();
+ dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
+ if (!dev) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock_ops_compat(dev, net);
+ if (dev)
+ return dev;
+
+ (*index)++;
+ } while (true);
+}
+
+static DEFINE_SEQLOCK(netdev_rename_lock);
+
+void netdev_copy_name(struct net_device *dev, char *name)
+{
+ unsigned int seq;
+
+ do {
+ seq = read_seqbegin(&netdev_rename_lock);
+ strscpy(name, dev->name, IFNAMSIZ);
+ } while (read_seqretry(&netdev_rename_lock, seq));
+}
+EXPORT_IPV6_MOD_GPL(netdev_copy_name);
/**
* netdev_get_name - get a netdevice name, knowing its ifindex.
* @net: network namespace
* @name: a pointer to the buffer where the name will be stored.
* @ifindex: the ifindex of the interface to get the name from.
- *
- * The use of raw_seqcount_begin() and cond_resched() before
- * retrying is required as we want to give the writers a chance
- * to complete when CONFIG_PREEMPT is not set.
*/
int netdev_get_name(struct net *net, char *name, int ifindex)
{
struct net_device *dev;
- unsigned int seq;
+ int ret;
-retry:
- seq = raw_seqcount_begin(&devnet_rename_seq);
rcu_read_lock();
+
dev = dev_get_by_index_rcu(net, ifindex);
if (!dev) {
- rcu_read_unlock();
- return -ENODEV;
+ ret = -ENODEV;
+ goto out;
}
- strcpy(name, dev->name);
+ netdev_copy_name(dev, name);
+
+ ret = 0;
+out:
rcu_read_unlock();
- if (read_seqcount_retry(&devnet_rename_seq, seq)) {
- cond_resched();
- goto retry;
- }
+ return ret;
+}
- return 0;
+static bool dev_addr_cmp(struct net_device *dev, unsigned short type,
+ const char *ha)
+{
+ return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len);
}
/**
@@ -935,7 +1206,7 @@ retry:
*
* Search for an interface by MAC address. Returns NULL if the device
* is not found or a pointer to the device.
- * The caller must hold RCU or RTNL.
+ * The caller must hold RCU.
* The returned device has not had its ref count increased
* and the caller must therefore be careful about locking
*
@@ -947,26 +1218,38 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
struct net_device *dev;
for_each_netdev_rcu(net, dev)
- if (dev->type == type &&
- !memcmp(dev->dev_addr, ha, dev->addr_len))
+ if (dev_addr_cmp(dev, type, ha))
return dev;
return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
-struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
+/**
+ * dev_getbyhwaddr() - find a device by its hardware address
+ * @net: the applicable net namespace
+ * @type: media type of device
+ * @ha: hardware address
+ *
+ * Similar to dev_getbyhwaddr_rcu(), but the caller needs to hold
+ * rtnl_lock.
+ *
+ * Context: rtnl_lock() must be held.
+ * Return: pointer to the net_device, or NULL if not found
+ */
+struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type,
+ const char *ha)
{
struct net_device *dev;
ASSERT_RTNL();
for_each_netdev(net, dev)
- if (dev->type == type)
+ if (dev_addr_cmp(dev, type, ha))
return dev;
return NULL;
}
-EXPORT_SYMBOL(__dev_getfirstbyhwtype);
+EXPORT_SYMBOL(dev_getbyhwaddr);
struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
@@ -985,40 +1268,39 @@ struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
EXPORT_SYMBOL(dev_getfirstbyhwtype);
/**
- * __dev_get_by_flags - find any device with given flags
- * @net: the applicable net namespace
- * @if_flags: IFF_* values
- * @mask: bitmask of bits in if_flags to check
+ * netdev_get_by_flags_rcu - find any device with given flags
+ * @net: the applicable net namespace
+ * @tracker: tracking object for the acquired reference
+ * @if_flags: IFF_* values
+ * @mask: bitmask of bits in if_flags to check
*
- * Search for any interface with the given flags. Returns NULL if a device
- * is not found or a pointer to the device. Must be called inside
- * rtnl_lock(), and result refcount is unchanged.
+ * Search for any interface with the given flags.
+ *
+ * Context: rcu_read_lock() must be held.
+ * Returns: NULL if not found, or the device with a reference held via @tracker.
*/
-
-struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
- unsigned short mask)
+struct net_device *netdev_get_by_flags_rcu(struct net *net, netdevice_tracker *tracker,
+ unsigned short if_flags, unsigned short mask)
{
- struct net_device *dev, *ret;
-
- ASSERT_RTNL();
+ struct net_device *dev;
- ret = NULL;
- for_each_netdev(net, dev) {
- if (((dev->flags ^ if_flags) & mask) == 0) {
- ret = dev;
- break;
+ for_each_netdev_rcu(net, dev) {
+ if (((READ_ONCE(dev->flags) ^ if_flags) & mask) == 0) {
+ netdev_hold(dev, tracker, GFP_ATOMIC);
+ return dev;
}
}
- return ret;
+
+ return NULL;
}
-EXPORT_SYMBOL(__dev_get_by_flags);
+EXPORT_IPV6_MOD(netdev_get_by_flags_rcu);
/**
* dev_valid_name - check if name is okay for network device
* @name: name string
*
* Network device names need to be valid file names to
- * to allow sysfs to work. We also disallow any kind of
+ * allow sysfs to work. We also disallow any kind of
* whitespace.
*/
bool dev_valid_name(const char *name)
@@ -1043,7 +1325,7 @@ EXPORT_SYMBOL(dev_valid_name);
* __dev_alloc_name - allocate a name for a device
* @net: network namespace to allocate the device name in
* @name: name format string
- * @buf: scratch buffer and result name string
+ * @res: result name string
*
* Passed a format string - eg "lt%d" it will try and find a suitable
* id. It scans list of devices to build up a free map, then chooses
@@ -1054,71 +1336,79 @@ EXPORT_SYMBOL(dev_valid_name);
* Returns the number of the unit assigned or a negative errno code.
*/
-static int __dev_alloc_name(struct net *net, const char *name, char *buf)
+static int __dev_alloc_name(struct net *net, const char *name, char *res)
{
int i = 0;
const char *p;
const int max_netdevices = 8*PAGE_SIZE;
unsigned long *inuse;
struct net_device *d;
+ char buf[IFNAMSIZ];
- if (!dev_valid_name(name))
+ /* Verify the string as this thing may have come from the user.
+ * There must be one "%d" and no other "%" characters.
+ */
+ p = strchr(name, '%');
+ if (!p || p[1] != 'd' || strchr(p + 2, '%'))
return -EINVAL;
- p = strchr(name, '%');
- if (p) {
- /*
- * Verify the string as this thing may have come from
- * the user. There must be either one "%d" and no other "%"
- * characters.
- */
- if (p[1] != 'd' || strchr(p + 2, '%'))
- return -EINVAL;
+ /* Use one page as a bit array of possible slots */
+ inuse = bitmap_zalloc(max_netdevices, GFP_ATOMIC);
+ if (!inuse)
+ return -ENOMEM;
- /* Use one page as a bit array of possible slots */
- inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
- if (!inuse)
- return -ENOMEM;
+ for_each_netdev(net, d) {
+ struct netdev_name_node *name_node;
- for_each_netdev(net, d) {
- if (!sscanf(d->name, name, &i))
+ netdev_for_each_altname(d, name_node) {
+ if (!sscanf(name_node->name, name, &i))
continue;
if (i < 0 || i >= max_netdevices)
continue;
- /* avoid cases where sscanf is not exact inverse of printf */
+ /* avoid cases where sscanf is not exact inverse of printf */
snprintf(buf, IFNAMSIZ, name, i);
- if (!strncmp(buf, d->name, IFNAMSIZ))
- set_bit(i, inuse);
+ if (!strncmp(buf, name_node->name, IFNAMSIZ))
+ __set_bit(i, inuse);
}
+ if (!sscanf(d->name, name, &i))
+ continue;
+ if (i < 0 || i >= max_netdevices)
+ continue;
- i = find_first_zero_bit(inuse, max_netdevices);
- free_page((unsigned long) inuse);
+ /* avoid cases where sscanf is not exact inverse of printf */
+ snprintf(buf, IFNAMSIZ, name, i);
+ if (!strncmp(buf, d->name, IFNAMSIZ))
+ __set_bit(i, inuse);
}
- snprintf(buf, IFNAMSIZ, name, i);
- if (!__dev_get_by_name(net, buf))
- return i;
+ i = find_first_zero_bit(inuse, max_netdevices);
+ bitmap_free(inuse);
+ if (i == max_netdevices)
+ return -ENFILE;
- /* It is possible to run out of possible slots
- * when the name is long and there isn't enough space left
- * for the digits, or if all bits are used.
- */
- return -ENFILE;
+ /* 'res' and 'name' could overlap, use 'buf' as an intermediate buffer */
+ strscpy(buf, name, IFNAMSIZ);
+ snprintf(res, IFNAMSIZ, buf, i);
+ return i;
}
-static int dev_alloc_name_ns(struct net *net,
- struct net_device *dev,
- const char *name)
+/* Returns negative errno or allocated unit id (see __dev_alloc_name()) */
+static int dev_prep_valid_name(struct net *net, struct net_device *dev,
+ const char *want_name, char *out_name,
+ int dup_errno)
{
- char buf[IFNAMSIZ];
- int ret;
+ if (!dev_valid_name(want_name))
+ return -EINVAL;
- BUG_ON(!net);
- ret = __dev_alloc_name(net, name, buf);
- if (ret >= 0)
- strlcpy(dev->name, buf, IFNAMSIZ);
- return ret;
+ if (strchr(want_name, '%'))
+ return __dev_alloc_name(net, want_name, out_name);
+
+ if (netdev_name_in_use(net, want_name))
+ return -dup_errno;
+ if (out_name != want_name)
+ strscpy(out_name, want_name, IFNAMSIZ);
+ return 0;
}
/**
@@ -1137,95 +1427,65 @@ static int dev_alloc_name_ns(struct net *net,
int dev_alloc_name(struct net_device *dev, const char *name)
{
- return dev_alloc_name_ns(dev_net(dev), dev, name);
+ return dev_prep_valid_name(dev_net(dev), dev, name, dev->name, ENFILE);
}
EXPORT_SYMBOL(dev_alloc_name);
-int dev_get_valid_name(struct net *net, struct net_device *dev,
- const char *name)
+static int dev_get_valid_name(struct net *net, struct net_device *dev,
+ const char *name)
{
- BUG_ON(!net);
-
- if (!dev_valid_name(name))
- return -EINVAL;
-
- if (strchr(name, '%'))
- return dev_alloc_name_ns(net, dev, name);
- else if (__dev_get_by_name(net, name))
- return -EEXIST;
- else if (dev->name != name)
- strlcpy(dev->name, name, IFNAMSIZ);
+ int ret;
- return 0;
+ ret = dev_prep_valid_name(net, dev, name, dev->name, EEXIST);
+ return ret < 0 ? ret : 0;
}
-EXPORT_SYMBOL(dev_get_valid_name);
-/**
- * dev_change_name - change name of a device
- * @dev: device
- * @newname: name (or format string) must be at least IFNAMSIZ
- *
- * Change name of a device, can pass format strings "eth%d".
- * for wildcarding.
- */
-int dev_change_name(struct net_device *dev, const char *newname)
+int netif_change_name(struct net_device *dev, const char *newname)
{
+ struct net *net = dev_net(dev);
unsigned char old_assign_type;
char oldname[IFNAMSIZ];
int err = 0;
int ret;
- struct net *net;
- ASSERT_RTNL();
- BUG_ON(!dev_net(dev));
-
- net = dev_net(dev);
- if (dev->flags & IFF_UP)
- return -EBUSY;
+ ASSERT_RTNL_NET(net);
- write_seqcount_begin(&devnet_rename_seq);
-
- if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
- write_seqcount_end(&devnet_rename_seq);
+ if (!strncmp(newname, dev->name, IFNAMSIZ))
return 0;
- }
memcpy(oldname, dev->name, IFNAMSIZ);
+ write_seqlock_bh(&netdev_rename_lock);
err = dev_get_valid_name(net, dev, newname);
- if (err < 0) {
- write_seqcount_end(&devnet_rename_seq);
+ write_sequnlock_bh(&netdev_rename_lock);
+
+ if (err < 0)
return err;
- }
if (oldname[0] && !strchr(oldname, '%'))
- netdev_info(dev, "renamed from %s\n", oldname);
+ netdev_info(dev, "renamed from %s%s\n", oldname,
+ dev->flags & IFF_UP ? " (while UP)" : "");
old_assign_type = dev->name_assign_type;
- dev->name_assign_type = NET_NAME_RENAMED;
+ WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED);
rollback:
ret = device_rename(&dev->dev, dev->name);
if (ret) {
+ write_seqlock_bh(&netdev_rename_lock);
memcpy(dev->name, oldname, IFNAMSIZ);
- dev->name_assign_type = old_assign_type;
- write_seqcount_end(&devnet_rename_seq);
+ write_sequnlock_bh(&netdev_rename_lock);
+ WRITE_ONCE(dev->name_assign_type, old_assign_type);
return ret;
}
- write_seqcount_end(&devnet_rename_seq);
-
netdev_adjacent_rename_links(dev, oldname);
- write_lock_bh(&dev_base_lock);
- hlist_del_rcu(&dev->name_hlist);
- write_unlock_bh(&dev_base_lock);
+ netdev_name_node_del(dev->name_node);
- synchronize_rcu();
+ synchronize_net();
- write_lock_bh(&dev_base_lock);
- hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
- write_unlock_bh(&dev_base_lock);
+ netdev_name_node_add(net, dev->name_node);
ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
ret = notifier_to_errno(ret);
@@ -1234,30 +1494,23 @@ rollback:
/* err >= 0 after dev_alloc_name() or stores the first errno */
if (err >= 0) {
err = ret;
- write_seqcount_begin(&devnet_rename_seq);
+ write_seqlock_bh(&netdev_rename_lock);
memcpy(dev->name, oldname, IFNAMSIZ);
+ write_sequnlock_bh(&netdev_rename_lock);
memcpy(oldname, newname, IFNAMSIZ);
- dev->name_assign_type = old_assign_type;
+ WRITE_ONCE(dev->name_assign_type, old_assign_type);
old_assign_type = NET_NAME_RENAMED;
goto rollback;
} else {
- pr_err("%s: name change rollback failed: %d\n",
- dev->name, ret);
+ netdev_err(dev, "name change rollback failed: %d\n",
+ ret);
}
}
return err;
}
-/**
- * dev_set_alias - change ifalias of a device
- * @dev: device
- * @alias: name up to IFALIASZ
- * @len: limit of bytes to copy from info
- *
- * Set ifalias for a device,
- */
-int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
+int netif_set_alias(struct net_device *dev, const char *alias, size_t len)
{
struct dev_ifalias *new_alias = NULL;
@@ -1274,8 +1527,8 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
}
mutex_lock(&ifalias_mutex);
- rcu_swap_protected(dev->ifalias, new_alias,
- mutex_is_locked(&ifalias_mutex));
+ new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
+ mutex_is_locked(&ifalias_mutex));
mutex_unlock(&ifalias_mutex);
if (new_alias)
@@ -1283,7 +1536,6 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
return len;
}
-EXPORT_SYMBOL(dev_set_alias);
/**
* dev_get_alias - get ifalias of a device
@@ -1320,16 +1572,10 @@ void netdev_features_change(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_features_change);
-/**
- * netdev_state_change - device changes state
- * @dev: device to cause notification
- *
- * Called to indicate a device has changed state. This function calls
- * the notifier chains for netdev_chain and sends a NEWLINK message
- * to the routing socket.
- */
-void netdev_state_change(struct net_device *dev)
+void netif_state_change(struct net_device *dev)
{
+ netdev_ops_assert_locked_or_invisible(dev);
+
if (dev->flags & IFF_UP) {
struct netdev_notifier_change_info change_info = {
.info.dev = dev,
@@ -1337,10 +1583,28 @@ void netdev_state_change(struct net_device *dev)
call_netdevice_notifiers_info(NETDEV_CHANGE,
&change_info.info);
- rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL);
}
}
-EXPORT_SYMBOL(netdev_state_change);
+
+/**
+ * __netdev_notify_peers - notify network peers about existence of @dev,
+ * to be called when rtnl lock is already held.
+ * @dev: network device
+ *
+ * Generate traffic such that interested network peers are aware of
+ * @dev, such as by generating a gratuitous ARP. This may be used when
+ * a device wants to inform the rest of the network about some sort of
+ * reconfiguration such as a failover event or virtual machine
+ * migration.
+ */
+void __netdev_notify_peers(struct net_device *dev)
+{
+ ASSERT_RTNL();
+ call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
+ call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
+}
+EXPORT_SYMBOL(__netdev_notify_peers);
/**
* netdev_notify_peers - notify network peers about existence of @dev
@@ -1355,21 +1619,47 @@ EXPORT_SYMBOL(netdev_state_change);
void netdev_notify_peers(struct net_device *dev)
{
rtnl_lock();
- call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
- call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
+ __netdev_notify_peers(dev);
rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
-static int __dev_open(struct net_device *dev)
+static int napi_threaded_poll(void *data);
+
+static int napi_kthread_create(struct napi_struct *n)
+{
+ int err = 0;
+
+ /* Create and wake up the kthread once to put it in
+ * TASK_INTERRUPTIBLE mode to avoid the blocked task
+ * warning and work with loadavg.
+ */
+ n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
+ n->dev->name, n->napi_id);
+ if (IS_ERR(n->thread)) {
+ err = PTR_ERR(n->thread);
+ pr_err("kthread_run failed with err %d\n", err);
+ n->thread = NULL;
+ }
+
+ return err;
+}
+
+static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
int ret;
ASSERT_RTNL();
+ dev_addr_check(dev);
- if (!netif_device_present(dev))
- return -ENODEV;
+ if (!netif_device_present(dev)) {
+ /* may be detached because parent is runtime-suspended */
+ if (dev->dev.parent)
+ pm_runtime_resume(dev->dev.parent);
+ if (!netif_device_present(dev))
+ return -ENODEV;
+ }
/* Block netpoll from trying to do any rx path servicing.
* If we don't do this there is a chance ndo_poll_controller
@@ -1377,13 +1667,15 @@ static int __dev_open(struct net_device *dev)
*/
netpoll_poll_disable(dev);
- ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
+ ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
ret = notifier_to_errno(ret);
if (ret)
return ret;
set_bit(__LINK_STATE_START, &dev->state);
+ netdev_ops_assert_locked(dev);
+
if (ops->ndo_validate_addr)
ret = ops->ndo_validate_addr(dev);
@@ -1395,7 +1687,7 @@ static int __dev_open(struct net_device *dev)
if (ret)
clear_bit(__LINK_STATE_START, &dev->state);
else {
- dev->flags |= IFF_UP;
+ netif_set_up(dev, true);
dev_set_rx_mode(dev);
dev_activate(dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
@@ -1404,35 +1696,22 @@ static int __dev_open(struct net_device *dev)
return ret;
}
-/**
- * dev_open - prepare an interface for use.
- * @dev: device to open
- *
- * Takes a device from down to up state. The device's private open
- * function is invoked and then the multicast lists are loaded. Finally
- * the device is moved into the up state and a %NETDEV_UP message is
- * sent to the netdev notifier chain.
- *
- * Calling this function on an active interface is a nop. On a failure
- * a negative errno code is returned.
- */
-int dev_open(struct net_device *dev)
+int netif_open(struct net_device *dev, struct netlink_ext_ack *extack)
{
int ret;
if (dev->flags & IFF_UP)
return 0;
- ret = __dev_open(dev);
+ ret = __dev_open(dev, extack);
if (ret < 0)
return ret;
- rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
call_netdevice_notifiers(NETDEV_UP, dev);
return ret;
}
-EXPORT_SYMBOL(dev_open);
static void __dev_close_many(struct list_head *head)
{
@@ -1470,10 +1749,13 @@ static void __dev_close_many(struct list_head *head)
* We allow it to be called even after a DETACH hot-plug
* event.
*/
+
+ netdev_ops_assert_locked(dev);
+
if (ops->ndo_stop)
ops->ndo_stop(dev);
- dev->flags &= ~IFF_UP;
+ netif_set_up(dev, false);
netpoll_poll_enable(dev);
}
}
@@ -1487,7 +1769,7 @@ static void __dev_close(struct net_device *dev)
list_del(&single);
}
-void dev_close_many(struct list_head *head, bool unlink)
+void netif_close_many(struct list_head *head, bool unlink)
{
struct net_device *dev, *tmp;
@@ -1499,45 +1781,27 @@ void dev_close_many(struct list_head *head, bool unlink)
__dev_close_many(head);
list_for_each_entry_safe(dev, tmp, head, close_list) {
- rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL);
call_netdevice_notifiers(NETDEV_DOWN, dev);
if (unlink)
list_del_init(&dev->close_list);
}
}
-EXPORT_SYMBOL(dev_close_many);
+EXPORT_SYMBOL_NS_GPL(netif_close_many, "NETDEV_INTERNAL");
-/**
- * dev_close - shutdown an interface.
- * @dev: device to shutdown
- *
- * This function moves an active device into down state. A
- * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
- * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
- * chain.
- */
-void dev_close(struct net_device *dev)
+void netif_close(struct net_device *dev)
{
if (dev->flags & IFF_UP) {
LIST_HEAD(single);
list_add(&dev->close_list, &single);
- dev_close_many(&single, true);
+ netif_close_many(&single, true);
list_del(&single);
}
}
-EXPORT_SYMBOL(dev_close);
+EXPORT_SYMBOL(netif_close);
-
-/**
- * dev_disable_lro - disable Large Receive Offload on a device
- * @dev: device
- *
- * Disable Large Receive Offload (LRO) on a net device. Must be
- * called under RTNL. This is needed if received packets may be
- * forwarded to another interface.
- */
-void dev_disable_lro(struct net_device *dev)
+void netif_disable_lro(struct net_device *dev)
{
struct net_device *lower_dev;
struct list_head *iter;
@@ -1548,10 +1812,13 @@ void dev_disable_lro(struct net_device *dev)
if (unlikely(dev->features & NETIF_F_LRO))
netdev_WARN(dev, "failed to disable LRO!\n");
- netdev_for_each_lower_dev(dev, lower_dev, iter)
- dev_disable_lro(lower_dev);
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ netdev_lock_ops(lower_dev);
+ netif_disable_lro(lower_dev);
+ netdev_unlock_ops(lower_dev);
+ }
}
-EXPORT_SYMBOL(dev_disable_lro);
+EXPORT_IPV6_MOD(netif_disable_lro);
/**
* dev_disable_gro_hw - disable HW Generic Receive Offload on a device
@@ -1579,12 +1846,15 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
- N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
- N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
- N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
- N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
+ N(POST_INIT) N(PRE_UNINIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN)
+ N(CHANGEUPPER) N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA)
+ N(BONDING_INFO) N(PRECHANGEUPPER) N(CHANGELOWERSTATE)
+ N(UDP_TUNNEL_PUSH_INFO) N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
+ N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
+ N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
+ N(XDP_FEAT_CHANGE)
}
#undef N
return "UNKNOWN_NETDEV_EVENT";
@@ -1601,6 +1871,64 @@ static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
return nb->notifier_call(nb, val, &info);
}
+static int call_netdevice_register_notifiers(struct notifier_block *nb,
+ struct net_device *dev)
+{
+ int err;
+
+ err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
+ err = notifier_to_errno(err);
+ if (err)
+ return err;
+
+ if (!(dev->flags & IFF_UP))
+ return 0;
+
+ call_netdevice_notifier(nb, NETDEV_UP, dev);
+ return 0;
+}
+
+static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
+ struct net_device *dev)
+{
+ if (dev->flags & IFF_UP) {
+ call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
+ dev);
+ call_netdevice_notifier(nb, NETDEV_DOWN, dev);
+ }
+ call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
+}
+
+static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
+ struct net *net)
+{
+ struct net_device *dev;
+ int err;
+
+ for_each_netdev(net, dev) {
+ netdev_lock_ops(dev);
+ err = call_netdevice_register_notifiers(nb, dev);
+ netdev_unlock_ops(dev);
+ if (err)
+ goto rollback;
+ }
+ return 0;
+
+rollback:
+ for_each_netdev_continue_reverse(net, dev)
+ call_netdevice_unregister_notifiers(nb, dev);
+ return err;
+}
+
+static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
+ struct net *net)
+{
+ struct net_device *dev;
+
+ for_each_netdev(net, dev)
+ call_netdevice_unregister_notifiers(nb, dev);
+}
+
static int dev_boot_phase = 1;
/**
@@ -1619,31 +1947,26 @@ static int dev_boot_phase = 1;
int register_netdevice_notifier(struct notifier_block *nb)
{
- struct net_device *dev;
- struct net_device *last;
struct net *net;
int err;
/* Close race with setup_net() and cleanup_net() */
down_write(&pernet_ops_rwsem);
+
+ /* When RTNL is removed, we need protection for netdev_chain. */
rtnl_lock();
+
err = raw_notifier_chain_register(&netdev_chain, nb);
if (err)
goto unlock;
if (dev_boot_phase)
goto unlock;
for_each_net(net) {
- for_each_netdev(net, dev) {
- err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
- err = notifier_to_errno(err);
- if (err)
- goto rollback;
-
- if (!(dev->flags & IFF_UP))
- continue;
-
- call_netdevice_notifier(nb, NETDEV_UP, dev);
- }
+ __rtnl_net_lock(net);
+ err = call_netdevice_register_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
+ if (err)
+ goto rollback;
}
unlock:
@@ -1652,22 +1975,12 @@ unlock:
return err;
rollback:
- last = dev;
- for_each_net(net) {
- for_each_netdev(net, dev) {
- if (dev == last)
- goto outroll;
-
- if (dev->flags & IFF_UP) {
- call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
- dev);
- call_netdevice_notifier(nb, NETDEV_DOWN, dev);
- }
- call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
- }
+ for_each_net_continue_reverse(net) {
+ __rtnl_net_lock(net);
+ call_netdevice_unregister_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
}
-outroll:
raw_notifier_chain_unregister(&netdev_chain, nb);
goto unlock;
}
@@ -1689,7 +2002,6 @@ EXPORT_SYMBOL(register_netdevice_notifier);
int unregister_netdevice_notifier(struct notifier_block *nb)
{
- struct net_device *dev;
struct net *net;
int err;
@@ -1701,15 +2013,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
goto unlock;
for_each_net(net) {
- for_each_netdev(net, dev) {
- if (dev->flags & IFF_UP) {
- call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
- dev);
- call_netdevice_notifier(nb, NETDEV_DOWN, dev);
- }
- call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
- }
+ __rtnl_net_lock(net);
+ call_netdevice_unregister_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
}
+
unlock:
rtnl_unlock();
up_write(&pernet_ops_rwsem);
@@ -1717,6 +2025,184 @@ unlock:
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
+static int __register_netdevice_notifier_net(struct net *net,
+ struct notifier_block *nb,
+ bool ignore_call_fail)
+{
+ int err;
+
+ err = raw_notifier_chain_register(&net->netdev_chain, nb);
+ if (err)
+ return err;
+ if (dev_boot_phase)
+ return 0;
+
+ err = call_netdevice_register_net_notifiers(nb, net);
+ if (err && !ignore_call_fail)
+ goto chain_unregister;
+
+ return 0;
+
+chain_unregister:
+ raw_notifier_chain_unregister(&net->netdev_chain, nb);
+ return err;
+}
+
+static int __unregister_netdevice_notifier_net(struct net *net,
+ struct notifier_block *nb)
+{
+ int err;
+
+ err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
+ if (err)
+ return err;
+
+ call_netdevice_unregister_net_notifiers(nb, net);
+ return 0;
+}
+
+/**
+ * register_netdevice_notifier_net - register a per-netns network notifier block
+ * @net: network namespace
+ * @nb: notifier
+ *
+ * Register a notifier to be called when network device events occur.
+ * The notifier passed is linked into the kernel structures and must
+ * not be reused until it has been unregistered. A negative errno code
+ * is returned on a failure.
+ *
+ * When registered, all registration and up events are replayed
+ * to the new notifier to allow the device to have a race-free
+ * view of the network device list.
+ */
+
+int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
+{
+ int err;
+
+ rtnl_net_lock(net);
+ err = __register_netdevice_notifier_net(net, nb, false);
+ rtnl_net_unlock(net);
+
+ return err;
+}
+EXPORT_SYMBOL(register_netdevice_notifier_net);
+
+/**
+ * unregister_netdevice_notifier_net - unregister a per-netns
+ * network notifier block
+ * @net: network namespace
+ * @nb: notifier
+ *
+ * Unregister a notifier previously registered by
+ * register_netdevice_notifier_net(). The notifier is unlinked from the
+ * kernel structures and may then be reused. A negative errno code
+ * is returned on a failure.
+ *
+ * After unregistering, unregister and down device events are synthesized
+ * for all devices on the device list to the removed notifier to remove
+ * the need for special-case cleanup code.
+ */
+
+int unregister_netdevice_notifier_net(struct net *net,
+ struct notifier_block *nb)
+{
+ int err;
+
+ rtnl_net_lock(net);
+ err = __unregister_netdevice_notifier_net(net, nb);
+ rtnl_net_unlock(net);
+
+ return err;
+}
+EXPORT_SYMBOL(unregister_netdevice_notifier_net);
+
+static void __move_netdevice_notifier_net(struct net *src_net,
+ struct net *dst_net,
+ struct notifier_block *nb)
+{
+ __unregister_netdevice_notifier_net(src_net, nb);
+ __register_netdevice_notifier_net(dst_net, nb, true);
+}
+
+static void rtnl_net_dev_lock(struct net_device *dev)
+{
+ bool again;
+
+ do {
+ struct net *net;
+
+ again = false;
+
+ /* netns might be being dismantled. */
+ rcu_read_lock();
+ net = dev_net_rcu(dev);
+ net_passive_inc(net);
+ rcu_read_unlock();
+
+ rtnl_net_lock(net);
+
+#ifdef CONFIG_NET_NS
+ /* dev might have been moved to another netns. */
+ if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) {
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+ again = true;
+ }
+#endif
+ } while (again);
+}
+
+static void rtnl_net_dev_unlock(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+}
+
+int register_netdevice_notifier_dev_net(struct net_device *dev,
+ struct notifier_block *nb,
+ struct netdev_net_notifier *nn)
+{
+ int err;
+
+ rtnl_net_dev_lock(dev);
+ err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
+ if (!err) {
+ nn->nb = nb;
+ list_add(&nn->list, &dev->net_notifier_list);
+ }
+ rtnl_net_dev_unlock(dev);
+
+ return err;
+}
+EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
+
+int unregister_netdevice_notifier_dev_net(struct net_device *dev,
+ struct notifier_block *nb,
+ struct netdev_net_notifier *nn)
+{
+ int err;
+
+ rtnl_net_dev_lock(dev);
+ list_del(&nn->list);
+ err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
+ rtnl_net_dev_unlock(dev);
+
+ return err;
+}
+EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
+
+static void move_netdevice_notifiers_dev_net(struct net_device *dev,
+ struct net *net)
+{
+ struct netdev_net_notifier *nn;
+
+ list_for_each_entry(nn, &dev->net_notifier_list, list)
+ __move_netdevice_notifier_net(dev_net(dev), net, nn->nb);
+}
+
/**
* call_netdevice_notifiers_info - call all network notifier blocks
* @val: value passed unmodified to notifier function
@@ -1726,14 +2212,63 @@ EXPORT_SYMBOL(unregister_netdevice_notifier);
* are as for raw_notifier_call_chain().
*/
-static int call_netdevice_notifiers_info(unsigned long val,
- struct netdev_notifier_info *info)
+int call_netdevice_notifiers_info(unsigned long val,
+ struct netdev_notifier_info *info)
{
+ struct net *net = dev_net(info->dev);
+ int ret;
+
ASSERT_RTNL();
+
+ /* Run per-netns notifier block chain first, then run the global one.
+ * Hopefully, one day, the global one is going to be removed after
+ * all notifier block registrants get converted to be per-netns.
+ */
+ ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
+ if (ret & NOTIFY_STOP_MASK)
+ return ret;
return raw_notifier_call_chain(&netdev_chain, val, info);
}
/**
+ * call_netdevice_notifiers_info_robust - call per-netns notifier blocks
+ * and roll back on error
+ * @val_up: value passed unmodified to notifier function
+ * @val_down: value passed unmodified to the notifier function when
+ * recovering from an error on @val_up
+ * @info: notifier information data
+ *
+ * Call all per-netns network notifier blocks, but not notifier blocks on
+ * the global notifier chain. Parameters and return value are as for
+ * raw_notifier_call_chain_robust().
+ */
+
+static int
+call_netdevice_notifiers_info_robust(unsigned long val_up,
+ unsigned long val_down,
+ struct netdev_notifier_info *info)
+{
+ struct net *net = dev_net(info->dev);
+
+ ASSERT_RTNL();
+
+ return raw_notifier_call_chain_robust(&net->netdev_chain,
+ val_up, val_down, info);
+}
+
+static int call_netdevice_notifiers_extack(unsigned long val,
+ struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_notifier_info info = {
+ .dev = dev,
+ .extack = extack,
+ };
+
+ return call_netdevice_notifiers_info(val, &info);
+}
+
+/**
* call_netdevice_notifiers - call all network notifier blocks
* @val: value passed unmodified to notifier function
* @dev: net_device pointer passed unmodified to notifier function
@@ -1744,13 +2279,31 @@ static int call_netdevice_notifiers_info(unsigned long val,
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
- struct netdev_notifier_info info = {
- .dev = dev,
+ return call_netdevice_notifiers_extack(val, dev, NULL);
+}
+EXPORT_SYMBOL(call_netdevice_notifiers);
+
+/**
+ * call_netdevice_notifiers_mtu - call all network notifier blocks
+ * @val: value passed unmodified to notifier function
+ * @dev: net_device pointer passed unmodified to notifier function
+ * @arg: additional u32 argument passed to the notifier function
+ *
+ * Call all network notifier blocks. Parameters and return value
+ * are as for raw_notifier_call_chain().
+ */
+static int call_netdevice_notifiers_mtu(unsigned long val,
+ struct net_device *dev, u32 arg)
+{
+ struct netdev_notifier_info_ext info = {
+ .info.dev = dev,
+ .ext.mtu = arg,
};
- return call_netdevice_notifiers_info(val, &info);
+ BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
+
+ return call_netdevice_notifiers_info(val, &info.info);
}
-EXPORT_SYMBOL(call_netdevice_notifiers);
#ifdef CONFIG_NET_INGRESS
static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
@@ -1784,8 +2337,14 @@ void net_dec_egress_queue(void)
EXPORT_SYMBOL_GPL(net_dec_egress_queue);
#endif
-static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
-#ifdef HAVE_JUMP_LABEL
+#ifdef CONFIG_NET_CLS_ACT
+DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key);
+EXPORT_SYMBOL(tcf_sw_enabled_key);
+#endif
+
+DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
+EXPORT_SYMBOL(netstamp_needed_key);
+#ifdef CONFIG_JUMP_LABEL
static atomic_t netstamp_needed_deferred;
static atomic_t netstamp_wanted;
static void netstamp_clear(struct work_struct *work)
@@ -1804,14 +2363,11 @@ static DECLARE_WORK(netstamp_work, netstamp_clear);
void net_enable_timestamp(void)
{
-#ifdef HAVE_JUMP_LABEL
- int wanted;
+#ifdef CONFIG_JUMP_LABEL
+ int wanted = atomic_read(&netstamp_wanted);
- while (1) {
- wanted = atomic_read(&netstamp_wanted);
- if (wanted <= 0)
- break;
- if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
+ while (wanted > 0) {
+ if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted + 1))
return;
}
atomic_inc(&netstamp_needed_deferred);
@@ -1824,14 +2380,11 @@ EXPORT_SYMBOL(net_enable_timestamp);
void net_disable_timestamp(void)
{
-#ifdef HAVE_JUMP_LABEL
- int wanted;
+#ifdef CONFIG_JUMP_LABEL
+ int wanted = atomic_read(&netstamp_wanted);
- while (1) {
- wanted = atomic_read(&netstamp_wanted);
- if (wanted <= 1)
- break;
- if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
+ while (wanted > 1) {
+ if (atomic_try_cmpxchg(&netstamp_wanted, &wanted, wanted - 1))
return;
}
atomic_dec(&netstamp_needed_deferred);
@@ -1845,40 +2398,27 @@ EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
skb->tstamp = 0;
+ skb->tstamp_type = SKB_CLOCK_REALTIME;
if (static_branch_unlikely(&netstamp_needed_key))
- __net_timestamp(skb);
+ skb->tstamp = ktime_get_real();
}
#define net_timestamp_check(COND, SKB) \
if (static_branch_unlikely(&netstamp_needed_key)) { \
if ((COND) && !(SKB)->tstamp) \
- __net_timestamp(SKB); \
+ (SKB)->tstamp = ktime_get_real(); \
} \
bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
{
- unsigned int len;
-
- if (!(dev->flags & IFF_UP))
- return false;
-
- len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
- if (skb->len <= len)
- return true;
-
- /* if TSO is enabled, we don't care about the length as the packet
- * could be forwarded without being segmented before
- */
- if (skb_is_gso(skb))
- return true;
-
- return false;
+ return __is_skb_forwardable(dev, skb, true);
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);
-int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
+ bool check_mtu)
{
- int ret = ____dev_forward_skb(dev, skb);
+ int ret = ____dev_forward_skb(dev, skb, check_mtu);
if (likely(!ret)) {
skb->protocol = eth_type_trans(skb, dev);
@@ -1887,6 +2427,11 @@ int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
return ret;
}
+
+int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
+{
+ return __dev_forward_skb2(dev, skb, true);
+}
EXPORT_SYMBOL_GPL(__dev_forward_skb);
/**
@@ -1913,9 +2458,14 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
-static inline int deliver_skb(struct sk_buff *skb,
- struct packet_type *pt_prev,
- struct net_device *orig_dev)
+int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
+{
+ return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
+}
+
+static int deliver_skb(struct sk_buff *skb,
+ struct packet_type *pt_prev,
+ struct net_device *orig_dev)
{
if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
return -ENOMEM;
@@ -1934,7 +2484,7 @@ static inline void deliver_ptype_list_skb(struct sk_buff *skb,
list_for_each_entry_rcu(ptype, ptype_list, list) {
if (ptype->type != type)
continue;
- if (pt_prev)
+ if (unlikely(pt_prev))
deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
@@ -1954,6 +2504,23 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
return false;
}
+/**
+ * dev_nit_active_rcu - return true if any network interface taps are in use
+ *
+ * The caller must hold the RCU or RCU BH read lock
+ *
+ * @dev: network device to check for the presence of taps
+ */
+bool dev_nit_active_rcu(const struct net_device *dev)
+{
+ /* Callers may hold either RCU or RCU BH lock */
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+
+ return !list_empty(&dev_net(dev)->ptype_all) ||
+ !list_empty(&dev->ptype_all);
+}
+EXPORT_SYMBOL_GPL(dev_nit_active_rcu);
+
/*
* Support routine. Sends outgoing frames to any network
* taps currently in use.
@@ -1961,21 +2528,24 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
- struct packet_type *ptype;
+ struct packet_type *ptype, *pt_prev = NULL;
+ struct list_head *ptype_list;
struct sk_buff *skb2 = NULL;
- struct packet_type *pt_prev = NULL;
- struct list_head *ptype_list = &ptype_all;
rcu_read_lock();
+ ptype_list = &dev_net_rcu(dev)->ptype_all;
again:
list_for_each_entry_rcu(ptype, ptype_list, list) {
+ if (READ_ONCE(ptype->ignore_outgoing))
+ continue;
+
/* Never send packets back to the socket
* they originated from - MvS (miquels@drinkel.ow.org)
*/
if (skb_loop_sk(ptype, skb))
continue;
- if (pt_prev) {
+ if (unlikely(pt_prev)) {
deliver_skb(skb2, pt_prev, skb->dev);
pt_prev = ptype;
continue;
@@ -2007,7 +2577,7 @@ again:
pt_prev = ptype;
}
- if (ptype_list == &ptype_all) {
+ if (ptype_list != &dev->ptype_all) {
ptype_list = &dev->ptype_all;
goto again;
}
@@ -2042,7 +2612,7 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq)
/* If TC0 is invalidated disable TC mapping */
if (tc->offset + tc->count > txq) {
- pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
+ netdev_warn(dev, "Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
dev->num_tc = 0;
return;
}
@@ -2053,8 +2623,8 @@ static void netif_setup_tc(struct net_device *dev, unsigned int txq)
tc = &dev->tc_to_txq[q];
if (tc->offset + tc->count > txq) {
- pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
- i, q);
+ netdev_warn(dev, "Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
+ i, q);
netdev_set_prio_tc_map(dev, i, 0);
}
}
@@ -2081,22 +2651,19 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
EXPORT_SYMBOL(netdev_txq_to_tc);
#ifdef CONFIG_XPS
-struct static_key xps_needed __read_mostly;
-EXPORT_SYMBOL(xps_needed);
-struct static_key xps_rxqs_needed __read_mostly;
-EXPORT_SYMBOL(xps_rxqs_needed);
+static struct static_key xps_needed __read_mostly;
+static struct static_key xps_rxqs_needed __read_mostly;
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P) \
rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
- int tci, u16 index)
+ struct xps_dev_maps *old_maps, int tci, u16 index)
{
struct xps_map *map = NULL;
int pos;
- if (dev_maps)
- map = xmap_dereference(dev_maps->attr_map[tci]);
+ map = xmap_dereference(dev_maps->attr_map[tci]);
if (!map)
return false;
@@ -2109,6 +2676,8 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
break;
}
+ if (old_maps)
+ RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
kfree_rcu(map, rcu);
return false;
@@ -2121,7 +2690,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
struct xps_dev_maps *dev_maps,
int cpu, u16 offset, u16 count)
{
- int num_tc = dev->num_tc ? : 1;
+ int num_tc = dev_maps->num_tc;
bool active = false;
int tci;
@@ -2129,7 +2698,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
int i, j;
for (i = count, j = offset; i--; j++) {
- if (!remove_xps_queue(dev_maps, tci, j))
+ if (!remove_xps_queue(dev_maps, NULL, tci, j))
break;
}
@@ -2139,69 +2708,56 @@ static bool remove_xps_queue_cpu(struct net_device *dev,
return active;
}
-static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
- struct xps_dev_maps *dev_maps, unsigned int nr_ids,
- u16 offset, u16 count, bool is_rxqs_map)
+static void reset_xps_maps(struct net_device *dev,
+ struct xps_dev_maps *dev_maps,
+ enum xps_map_type type)
+{
+ static_key_slow_dec_cpuslocked(&xps_needed);
+ if (type == XPS_RXQS)
+ static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
+
+ RCU_INIT_POINTER(dev->xps_maps[type], NULL);
+
+ kfree_rcu(dev_maps, rcu);
+}
+
+static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
+ u16 offset, u16 count)
{
+ struct xps_dev_maps *dev_maps;
bool active = false;
int i, j;
- for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
- j < nr_ids;)
- active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
- count);
- if (!active) {
- if (is_rxqs_map) {
- RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
- } else {
- RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
+ dev_maps = xmap_dereference(dev->xps_maps[type]);
+ if (!dev_maps)
+ return;
- for (i = offset + (count - 1); count--; i--)
- netdev_queue_numa_node_write(
- netdev_get_tx_queue(dev, i),
- NUMA_NO_NODE);
- }
- kfree_rcu(dev_maps, rcu);
+ for (j = 0; j < dev_maps->nr_ids; j++)
+ active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
+ if (!active)
+ reset_xps_maps(dev, dev_maps, type);
+
+ if (type == XPS_CPUS) {
+ for (i = offset + (count - 1); count--; i--)
+ netdev_queue_numa_node_write(
+ netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
}
}
static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
u16 count)
{
- const unsigned long *possible_mask = NULL;
- struct xps_dev_maps *dev_maps;
- unsigned int nr_ids;
-
if (!static_key_false(&xps_needed))
return;
cpus_read_lock();
mutex_lock(&xps_map_mutex);
- if (static_key_false(&xps_rxqs_needed)) {
- dev_maps = xmap_dereference(dev->xps_rxqs_map);
- if (dev_maps) {
- nr_ids = dev->num_rx_queues;
- clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
- offset, count, true);
- }
- }
+ if (static_key_false(&xps_rxqs_needed))
+ clean_xps_maps(dev, XPS_RXQS, offset, count);
- dev_maps = xmap_dereference(dev->xps_cpus_map);
- if (!dev_maps)
- goto out_no_maps;
-
- if (num_possible_cpus() > 1)
- possible_mask = cpumask_bits(cpu_possible_mask);
- nr_ids = nr_cpu_ids;
- clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
- false);
+ clean_xps_maps(dev, XPS_CPUS, offset, count);
-out_no_maps:
- if (static_key_enabled(&xps_rxqs_needed))
- static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
-
- static_key_slow_dec_cpuslocked(&xps_needed);
mutex_unlock(&xps_map_mutex);
cpus_read_unlock();
}
@@ -2251,18 +2807,39 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
return new_map;
}
+/* Copy xps maps at a given index */
+static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
+ struct xps_dev_maps *new_dev_maps, int index,
+ int tc, bool skip_tc)
+{
+ int i, tci = index * dev_maps->num_tc;
+ struct xps_map *map;
+
+ /* copy maps belonging to foreign traffic classes */
+ for (i = 0; i < dev_maps->num_tc; i++, tci++) {
+ if (i == tc && skip_tc)
+ continue;
+
+ /* fill in the new device map from the old device map */
+ map = xmap_dereference(dev_maps->attr_map[tci]);
+ RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
+ }
+}
+
/* Must be called under cpus_read_lock */
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
- u16 index, bool is_rxqs_map)
+ u16 index, enum xps_map_type type)
{
- const unsigned long *online_mask = NULL, *possible_mask = NULL;
- struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
+ struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
+ const unsigned long *online_mask = NULL;
+ bool active = false, copy = false;
int i, j, tci, numa_node_id = -2;
int maps_sz, num_tc = 1, tc = 0;
struct xps_map *map, *new_map;
- bool active = false;
unsigned int nr_ids;
+ WARN_ON_ONCE(index >= dev->num_tx_queues);
+
if (dev->num_tc) {
/* Do not allow XPS on subordinate device directly */
num_tc = dev->num_tc;
@@ -2278,38 +2855,48 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
}
mutex_lock(&xps_map_mutex);
- if (is_rxqs_map) {
+
+ dev_maps = xmap_dereference(dev->xps_maps[type]);
+ if (type == XPS_RXQS) {
maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
- dev_maps = xmap_dereference(dev->xps_rxqs_map);
nr_ids = dev->num_rx_queues;
} else {
maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
- if (num_possible_cpus() > 1) {
+ if (num_possible_cpus() > 1)
online_mask = cpumask_bits(cpu_online_mask);
- possible_mask = cpumask_bits(cpu_possible_mask);
- }
- dev_maps = xmap_dereference(dev->xps_cpus_map);
nr_ids = nr_cpu_ids;
}
if (maps_sz < L1_CACHE_BYTES)
maps_sz = L1_CACHE_BYTES;
+ /* The old dev_maps could be larger or smaller than the one we're
+ * setting up now, as dev->num_tc or nr_ids could have been updated in
+ * between. We could try to be smart, but let's be safe instead and only
+ * copy foreign traffic classes if the two map sizes match.
+ */
+ if (dev_maps &&
+ dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
+ copy = true;
+
/* allocate memory for queue storage */
for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
j < nr_ids;) {
- if (!new_dev_maps)
- new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
if (!new_dev_maps) {
- mutex_unlock(&xps_map_mutex);
- return -ENOMEM;
+ new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
+ if (!new_dev_maps) {
+ mutex_unlock(&xps_map_mutex);
+ return -ENOMEM;
+ }
+
+ new_dev_maps->nr_ids = nr_ids;
+ new_dev_maps->num_tc = num_tc;
}
tci = j * num_tc + tc;
- map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
- NULL;
+ map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
- map = expand_xps_map(map, j, index, is_rxqs_map);
+ map = expand_xps_map(map, j, index, type == XPS_RXQS);
if (!map)
goto error;
@@ -2319,29 +2906,24 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
if (!new_dev_maps)
goto out_no_new_maps;
- static_key_slow_inc_cpuslocked(&xps_needed);
- if (is_rxqs_map)
- static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
+ if (!dev_maps) {
+ /* Increment static keys at most once per type */
+ static_key_slow_inc_cpuslocked(&xps_needed);
+ if (type == XPS_RXQS)
+ static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
+ }
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
- /* copy maps belonging to foreign traffic classes */
- for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
- /* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->attr_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
- }
+ for (j = 0; j < nr_ids; j++) {
+ bool skip_tc = false;
- /* We need to explicitly update tci as prevous loop
- * could break out early if dev_maps is NULL.
- */
tci = j * num_tc + tc;
-
if (netif_attr_test_mask(j, mask, nr_ids) &&
netif_attr_test_online(j, online_mask, nr_ids)) {
/* add tx-queue to CPU/rx-queue maps */
int pos = 0;
+ skip_tc = true;
+
map = xmap_dereference(new_dev_maps->attr_map[tci]);
while ((pos < map->len) && (map->queues[pos] != index))
pos++;
@@ -2349,83 +2931,81 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
if (pos == map->len)
map->queues[map->len++] = index;
#ifdef CONFIG_NUMA
- if (!is_rxqs_map) {
+ if (type == XPS_CPUS) {
if (numa_node_id == -2)
numa_node_id = cpu_to_node(j);
else if (numa_node_id != cpu_to_node(j))
numa_node_id = -1;
}
#endif
- } else if (dev_maps) {
- /* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->attr_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
}
- /* copy maps belonging to foreign traffic classes */
- for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
- /* fill in the new device map from the old device map */
- map = xmap_dereference(dev_maps->attr_map[tci]);
- RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
- }
+ if (copy)
+ xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
+ skip_tc);
}
- if (is_rxqs_map)
- rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
- else
- rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
+ rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
/* Cleanup old maps */
if (!dev_maps)
goto out_no_old_maps;
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
- for (i = num_tc, tci = j * num_tc; i--; tci++) {
- new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ for (j = 0; j < dev_maps->nr_ids; j++) {
+ for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
map = xmap_dereference(dev_maps->attr_map[tci]);
- if (map && map != new_map)
- kfree_rcu(map, rcu);
+ if (!map)
+ continue;
+
+ if (copy) {
+ new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
+ if (map == new_map)
+ continue;
+ }
+
+ RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
+ kfree_rcu(map, rcu);
}
}
- kfree_rcu(dev_maps, rcu);
+ old_dev_maps = dev_maps;
out_no_old_maps:
dev_maps = new_dev_maps;
active = true;
out_no_new_maps:
- if (!is_rxqs_map) {
+ if (type == XPS_CPUS)
/* update Tx queue numa node */
netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
(numa_node_id >= 0) ?
numa_node_id : NUMA_NO_NODE);
- }
if (!dev_maps)
goto out_no_maps;
/* removes tx-queue from unused CPUs/rx-queues */
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
- for (i = tc, tci = j * num_tc; i--; tci++)
- active |= remove_xps_queue(dev_maps, tci, index);
- if (!netif_attr_test_mask(j, mask, nr_ids) ||
- !netif_attr_test_online(j, online_mask, nr_ids))
- active |= remove_xps_queue(dev_maps, tci, index);
- for (i = num_tc - tc, tci++; --i; tci++)
- active |= remove_xps_queue(dev_maps, tci, index);
+ for (j = 0; j < dev_maps->nr_ids; j++) {
+ tci = j * dev_maps->num_tc;
+
+ for (i = 0; i < dev_maps->num_tc; i++, tci++) {
+ if (i == tc &&
+ netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
+ netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
+ continue;
+
+ active |= remove_xps_queue(dev_maps,
+ copy ? old_dev_maps : NULL,
+ tci, index);
+ }
}
+ if (old_dev_maps)
+ kfree_rcu(old_dev_maps, rcu);
+
/* free map if not active */
- if (!active) {
- if (is_rxqs_map)
- RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
- else
- RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
- kfree_rcu(dev_maps, rcu);
- }
+ if (!active)
+ reset_xps_maps(dev, dev_maps, type);
out_no_maps:
mutex_unlock(&xps_map_mutex);
@@ -2433,11 +3013,10 @@ out_no_maps:
return 0;
error:
/* remove any maps that we added */
- for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
- j < nr_ids;) {
+ for (j = 0; j < nr_ids; j++) {
for (i = num_tc, tci = j * num_tc; i--; tci++) {
new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
- map = dev_maps ?
+ map = copy ?
xmap_dereference(dev_maps->attr_map[tci]) :
NULL;
if (new_map && new_map != map)
@@ -2458,7 +3037,7 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
int ret;
cpus_read_lock();
- ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
+ ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
cpus_read_unlock();
return ret;
@@ -2600,7 +3179,7 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (dev->reg_state == NETREG_REGISTERED ||
dev->reg_state == NETREG_UNREGISTERING) {
- ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
txq);
@@ -2610,6 +3189,10 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
if (dev->num_tc)
netif_setup_tc(dev, txq);
+ net_shaper_set_real_num_tx_queues(dev, txq);
+
+ dev_qdisc_change_real_num_tx(dev, txq);
+
dev->real_num_tx_queues = txq;
if (disabling) {
@@ -2627,7 +3210,6 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
-#ifdef CONFIG_SYSFS
/**
* netif_set_real_num_rx_queues - set actual number of RX queues used
* @dev: Network device
@@ -2646,7 +3228,7 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
return -EINVAL;
if (dev->reg_state == NETREG_REGISTERED) {
- ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
rxq);
@@ -2658,18 +3240,121 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
-#endif
+
+/**
+ * netif_set_real_num_queues - set actual number of RX and TX queues used
+ * @dev: Network device
+ * @txq: Actual number of TX queues
+ * @rxq: Actual number of RX queues
+ *
+ * Set the real number of both TX and RX queues.
+ * Does nothing if the number of queues is already correct.
+ */
+int netif_set_real_num_queues(struct net_device *dev,
+ unsigned int txq, unsigned int rxq)
+{
+ unsigned int old_rxq = dev->real_num_rx_queues; /* saved for the undo_rx rollback */
+ int err;
+
+ if (txq < 1 || txq > dev->num_tx_queues ||
+ rxq < 1 || rxq > dev->num_rx_queues)
+ return -EINVAL; /* each count must lie in 1..dev->num_{tx,rx}_queues */
+
+ /* Start from increases, so the error path only does decreases -
+ * decreases can't fail.
+ */
+ if (rxq > dev->real_num_rx_queues) {
+ err = netif_set_real_num_rx_queues(dev, rxq);
+ if (err)
+ return err; /* nothing was changed yet, so no rollback is needed */
+ }
+ if (txq > dev->real_num_tx_queues) {
+ err = netif_set_real_num_tx_queues(dev, txq);
+ if (err)
+ goto undo_rx; /* RX may already have been raised above */
+ }
+ if (rxq < dev->real_num_rx_queues)
+ WARN_ON(netif_set_real_num_rx_queues(dev, rxq));
+ if (txq < dev->real_num_tx_queues)
+ WARN_ON(netif_set_real_num_tx_queues(dev, txq));
+
+ return 0;
+undo_rx:
+ WARN_ON(netif_set_real_num_rx_queues(dev, old_rxq)); /* restore the RX count seen on entry */
+ return err;
+}
+EXPORT_SYMBOL(netif_set_real_num_queues);
+
+/**
+ * netif_set_tso_max_size() - set the max size of TSO frames supported
+ * @dev: netdev to update
+ * @size: max skb->len of a TSO frame
+ *
+ * Set the limit on the size of TSO super-frames the device can handle.
+ * Unless explicitly set the stack will assume the value of
+ * %GSO_LEGACY_MAX_SIZE.
+ */
+void netif_set_tso_max_size(struct net_device *dev, unsigned int size)
+{
+ dev->tso_max_size = min(GSO_MAX_SIZE, size); /* stored value is capped at GSO_MAX_SIZE */
+ if (size < READ_ONCE(dev->gso_max_size))
+ netif_set_gso_max_size(dev, size); /* current GSO size limit exceeds @size: pass @size to its setter */
+ if (size < READ_ONCE(dev->gso_ipv4_max_size))
+ netif_set_gso_ipv4_max_size(dev, size); /* same for the IPv4-specific GSO size limit */
+}
+EXPORT_SYMBOL(netif_set_tso_max_size);
+
+/**
+ * netif_set_tso_max_segs() - set the max number of segs supported for TSO
+ * @dev: netdev to update
+ * @segs: max number of TCP segments
+ *
+ * Set the limit on the number of TCP segments the device can generate from
+ * a single TSO super-frame.
+ * Unless explicitly set the stack will assume the value of %GSO_MAX_SEGS.
+ */
+void netif_set_tso_max_segs(struct net_device *dev, unsigned int segs)
+{
+ dev->tso_max_segs = segs; /* stored as given, no clamping here */
+ if (segs < READ_ONCE(dev->gso_max_segs))
+ netif_set_gso_max_segs(dev, segs); /* current GSO segment limit exceeds @segs: pass @segs to its setter */
+}
+EXPORT_SYMBOL(netif_set_tso_max_segs);
+
+/**
+ * netif_inherit_tso_max() - copy all TSO limits from a lower device to an upper
+ * @to: netdev to update
+ * @from: netdev from which to copy the limits
+ */
+void netif_inherit_tso_max(struct net_device *to, const struct net_device *from)
+{
+ netif_set_tso_max_size(to, from->tso_max_size); /* goes through the setter, which may also touch @to's GSO size limits */
+ netif_set_tso_max_segs(to, from->tso_max_segs); /* likewise for @to's GSO segment limit */
+}
+EXPORT_SYMBOL(netif_inherit_tso_max);
/**
* netif_get_num_default_rss_queues - default number of RSS queues
*
- * This routine should set an upper limit on the number of RSS queues
- * used by default by multiqueue devices.
+ * Default value is the number of physical cores if there are only 1 or 2, or
+ * half that number (rounded up) if there are more.
*/
int netif_get_num_default_rss_queues(void)
{
- return is_kdump_kernel() ?
- 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
+ cpumask_var_t cpus;
+ int cpu, count = 0;
+
+ if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
+ return 1;
+
+ cpumask_copy(cpus, cpu_online_mask);
+ for_each_cpu(cpu, cpus) {
+ ++count;
+ cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
+ }
+ free_cpumask_var(cpus);
+
+ return count > 2 ? DIV_ROUND_UP(count, 2) : count;
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);
@@ -2689,13 +3374,20 @@ static void __netif_reschedule(struct Qdisc *q)
void __netif_schedule(struct Qdisc *q)
{
+ /* If q->defer_list is not empty, at least one thread is
+ * in __dev_xmit_skb() before llist_del_all(&q->defer_list).
+ * This thread will attempt to run the queue.
+ */
+ if (!llist_empty(&q->defer_list))
+ return;
+
if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);
struct dev_kfree_skb_cb {
- enum skb_free_reason reason;
+ enum skb_drop_reason reason;
};
static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
@@ -2706,7 +3398,7 @@ static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
void netif_schedule_queue(struct netdev_queue *txq)
{
rcu_read_lock();
- if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
+ if (!netif_xmit_stopped(txq)) {
struct Qdisc *q = rcu_dereference(txq->qdisc);
__netif_schedule(q);
@@ -2728,7 +3420,7 @@ void netif_tx_wake_queue(struct netdev_queue *dev_queue)
}
EXPORT_SYMBOL(netif_tx_wake_queue);
-void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
+void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
unsigned long flags;
@@ -2748,16 +3440,16 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_restore(flags);
}
-EXPORT_SYMBOL(__dev_kfree_skb_irq);
+EXPORT_SYMBOL(dev_kfree_skb_irq_reason);
-void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
+void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
- if (in_irq() || irqs_disabled())
- __dev_kfree_skb_irq(skb, reason);
+ if (in_hardirq() || irqs_disabled())
+ dev_kfree_skb_irq_reason(skb, reason);
else
- dev_kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
}
-EXPORT_SYMBOL(__dev_kfree_skb_any);
+EXPORT_SYMBOL(dev_kfree_skb_any_reason);
/**
@@ -2786,7 +3478,7 @@ void netif_device_attach(struct net_device *dev)
if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
netif_running(dev)) {
netif_tx_wake_all_queues(dev);
- __netdev_watchdog_up(dev);
+ netdev_watchdog_up(dev);
}
}
EXPORT_SYMBOL(netif_device_attach);
@@ -2808,10 +3500,19 @@ static u16 skb_tx_hash(const struct net_device *dev,
qoffset = sb_dev->tc_to_txq[tc].offset;
qcount = sb_dev->tc_to_txq[tc].count;
+ if (unlikely(!qcount)) {
+ net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
+ sb_dev->name, qoffset, tc);
+ qoffset = 0;
+ qcount = dev->real_num_tx_queues;
+ }
}
if (skb_rx_queue_recorded(skb)) {
+ DEBUG_NET_WARN_ON_ONCE(qcount == 0);
hash = skb_get_rx_queue(skb);
+ if (hash >= qoffset)
+ hash -= qoffset;
while (unlikely(hash >= qcount))
hash -= qcount;
return hash + qoffset;
@@ -2820,7 +3521,7 @@ static u16 skb_tx_hash(const struct net_device *dev,
return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
-static void skb_warn_bad_offload(const struct sk_buff *skb)
+void skb_warn_bad_offload(const struct sk_buff *skb)
{
static const netdev_features_t null_features;
struct net_device *dev = skb->dev;
@@ -2835,12 +3536,10 @@ static void skb_warn_bad_offload(const struct sk_buff *skb)
else
name = netdev_name(dev);
}
- WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
- "gso_type=%d ip_summed=%d\n",
+ skb_dump(KERN_WARNING, skb, false);
+ WARN(1, "%s: caps=(%pNF, %pNF)\n",
name, dev ? &dev->features : &null_features,
- skb->sk ? &skb->sk->sk_route_caps : &null_features,
- skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
- skb_shinfo(skb)->gso_type, skb->ip_summed);
+ skb->sk ? &skb->sk->sk_route_caps : &null_features);
}
/*
@@ -2855,11 +3554,15 @@ int skb_checksum_help(struct sk_buff *skb)
if (skb->ip_summed == CHECKSUM_COMPLETE)
goto out_set_summed;
- if (unlikely(skb_shinfo(skb)->gso_size)) {
+ if (unlikely(skb_is_gso(skb))) {
skb_warn_bad_offload(skb);
return -EINVAL;
}
+ if (!skb_frags_readable(skb)) {
+ return -EFAULT;
+ }
+
/* Before computing a checksum, we should make sure no frag could
* be modified by an external entity : checksum could be wrong.
*/
@@ -2870,18 +3573,25 @@ int skb_checksum_help(struct sk_buff *skb)
}
offset = skb_checksum_start_offset(skb);
- BUG_ON(offset >= skb_headlen(skb));
+ ret = -EINVAL;
+ if (unlikely(offset >= skb_headlen(skb))) {
+ DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
+ WARN_ONCE(true, "offset (%d) >= skb_headlen() (%u)\n",
+ offset, skb_headlen(skb));
+ goto out;
+ }
csum = skb_checksum(skb, offset, skb->len - offset, 0);
offset += skb->csum_offset;
- BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
-
- if (skb_cloned(skb) &&
- !skb_clone_writable(skb, offset + sizeof(__sum16))) {
- ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
- if (ret)
- goto out;
+ if (unlikely(offset + sizeof(__sum16) > skb_headlen(skb))) {
+ DO_ONCE_LITE(skb_dump, KERN_ERR, skb, false);
+ WARN_ONCE(true, "offset+2 (%zu) > skb_headlen() (%u)\n",
+ offset + sizeof(__sum16), skb_headlen(skb));
+ goto out;
}
+ ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
+ if (ret)
+ goto out;
*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
@@ -2891,9 +3601,10 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
+#ifdef CONFIG_NET_CRC32C
int skb_crc32c_csum_help(struct sk_buff *skb)
{
- __le32 crc32c_csum;
+ u32 crc;
int ret = 0, offset, start;
if (skb->ip_summed != CHECKSUM_PARTIAL)
@@ -2916,21 +3627,19 @@ int skb_crc32c_csum_help(struct sk_buff *skb)
ret = -EINVAL;
goto out;
}
- if (skb_cloned(skb) &&
- !skb_clone_writable(skb, offset + sizeof(__le32))) {
- ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
- if (ret)
- goto out;
- }
- crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
- skb->len - start, ~(__u32)0,
- crc32c_csum_stub));
- *(__le32 *)(skb->data + offset) = crc32c_csum;
- skb->ip_summed = CHECKSUM_NONE;
- skb->csum_not_inet = 0;
+
+ ret = skb_ensure_writable(skb, offset + sizeof(__le32));
+ if (ret)
+ goto out;
+
+ crc = ~skb_crc32c(skb, start, skb->len - start, ~0);
+ *(__le32 *)(skb->data + offset) = cpu_to_le32(crc);
+ skb_reset_csum_not_inet(skb);
out:
return ret;
}
+EXPORT_SYMBOL(skb_crc32c_csum_help);
+#endif /* CONFIG_NET_CRC32C */
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
@@ -2947,120 +3656,22 @@ __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
type = eth->h_proto;
}
- return __vlan_get_protocol(skb, type, depth);
+ return vlan_get_protocol_and_depth(skb, type, depth);
}
-/**
- * skb_mac_gso_segment - mac layer segmentation handler.
- * @skb: buffer to segment
- * @features: features for the output path (see dev->features)
- */
-struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
- netdev_features_t features)
-{
- struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
- struct packet_offload *ptype;
- int vlan_depth = skb->mac_len;
- __be16 type = skb_network_protocol(skb, &vlan_depth);
-
- if (unlikely(!type))
- return ERR_PTR(-EINVAL);
-
- __skb_pull(skb, vlan_depth);
- rcu_read_lock();
- list_for_each_entry_rcu(ptype, &offload_base, list) {
- if (ptype->type == type && ptype->callbacks.gso_segment) {
- segs = ptype->callbacks.gso_segment(skb, features);
- break;
- }
- }
- rcu_read_unlock();
-
- __skb_push(skb, skb->data - skb_mac_header(skb));
-
- return segs;
-}
-EXPORT_SYMBOL(skb_mac_gso_segment);
-
-
-/* openvswitch calls this on rx path, so we need a different check.
- */
-static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
-{
- if (tx_path)
- return skb->ip_summed != CHECKSUM_PARTIAL &&
- skb->ip_summed != CHECKSUM_UNNECESSARY;
-
- return skb->ip_summed == CHECKSUM_NONE;
-}
-
-/**
- * __skb_gso_segment - Perform segmentation on skb.
- * @skb: buffer to segment
- * @features: features for the output path (see dev->features)
- * @tx_path: whether it is called in TX path
- *
- * This function segments the given skb and returns a list of segments.
- *
- * It may return NULL if the skb requires no segmentation. This is
- * only possible when GSO is used for verifying header integrity.
- *
- * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
- */
-struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
- netdev_features_t features, bool tx_path)
+/* Take action when hardware reception checksum errors are detected. */
+#ifdef CONFIG_BUG
+static void do_netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
- struct sk_buff *segs;
-
- if (unlikely(skb_needs_check(skb, tx_path))) {
- int err;
-
- /* We're going to init ->check field in TCP or UDP header */
- err = skb_cow_head(skb, 0);
- if (err < 0)
- return ERR_PTR(err);
- }
-
- /* Only report GSO partial support if it will enable us to
- * support segmentation on this frame without needing additional
- * work.
- */
- if (features & NETIF_F_GSO_PARTIAL) {
- netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
- struct net_device *dev = skb->dev;
-
- partial_features |= dev->features & dev->gso_partial_features;
- if (!skb_gso_ok(skb, features | partial_features))
- features &= ~NETIF_F_GSO_PARTIAL;
- }
-
- BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
- sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
-
- SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
- SKB_GSO_CB(skb)->encap_level = 0;
-
- skb_reset_mac_header(skb);
- skb_reset_mac_len(skb);
-
- segs = skb_mac_gso_segment(skb, features);
-
- if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
- skb_warn_bad_offload(skb);
-
- return segs;
+ netdev_err(dev, "hw csum failure\n");
+ skb_dump(KERN_ERR, skb, true);
+ dump_stack();
}
-EXPORT_SYMBOL(__skb_gso_segment);
-/* Take action when hardware reception checksum errors are detected. */
-#ifdef CONFIG_BUG
-void netdev_rx_csum_fault(struct net_device *dev)
+void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
- if (net_ratelimit()) {
- pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
- dump_stack();
- }
+ DO_ONCE_LITE(do_netdev_rx_csum_fault, dev, skb);
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
@@ -3074,8 +3685,9 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
if (!(dev->features & NETIF_F_HIGHDMA)) {
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct page *page = skb_frag_page(frag);
- if (PageHighMem(skb_frag_page(frag)))
+ if (page && PageHighMem(page))
return 1;
}
}
@@ -3108,10 +3720,9 @@ static netdev_features_t net_mpls_features(struct sk_buff *skb,
static netdev_features_t harmonize_features(struct sk_buff *skb,
netdev_features_t features)
{
- int tmp;
__be16 type;
- type = skb_network_protocol(skb, &tmp);
+ type = skb_network_protocol(skb, NULL);
features = net_mpls_features(skb, features, type);
if (skb->ip_summed != CHECKSUM_NONE &&
@@ -3145,9 +3756,17 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
{
u16 gso_segs = skb_shinfo(skb)->gso_segs;
- if (gso_segs > dev->gso_max_segs)
+ if (gso_segs > READ_ONCE(dev->gso_max_segs))
+ return features & ~NETIF_F_GSO_MASK;
+
+ if (unlikely(skb->len >= netif_get_gso_max_size(dev, skb)))
return features & ~NETIF_F_GSO_MASK;
+ if (!skb_shinfo(skb)->gso_type) {
+ skb_warn_bad_offload(skb);
+ return features & ~NETIF_F_GSO_MASK;
+ }
+
/* Support for GSO partial features requires software
* intervention before we can actually process the packets
* so we need to strip support for any partial features now
@@ -3157,8 +3776,14 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
features &= ~dev->gso_partial_features;
- /* Make sure to clear the IPv4 ID mangling feature if the
- * IPv4 header has the potential to be fragmented.
+ /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header
+ * has the potential to be fragmented so that TSO does not generate
+ * segments with the same ID. For encapsulated packets, the ID mangling
+ * feature is guaranteed not to use the same ID for the outer IPv4
+ * headers of the generated segments if the headers have the potential
+ * to be fragmented, so there is no need to clear the IPv4 ID mangling
+ * feature (see the section about NETIF_F_TSO_MANGLEID in
+ * segmentation-offloads.rst).
*/
if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
struct iphdr *iph = skb->encapsulation ?
@@ -3168,6 +3793,18 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
features &= ~NETIF_F_TSO_MANGLEID;
}
+ /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers,
+ * so neither does TSO that depends on it.
+ */
+ if (features & NETIF_F_IPV6_CSUM &&
+ (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 ||
+ (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
+ vlan_get_protocol(skb) == htons(ETH_P_IPV6))) &&
+ skb_transport_header_was_set(skb) &&
+ skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
+ !ipv6_has_hopopt_jumbo(skb))
+ features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4);
+
return features;
}
@@ -3208,7 +3845,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
unsigned int len;
int rc;
- if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
+ if (dev_nit_active_rcu(dev))
dev_queue_xmit_nit(skb, dev);
len = skb->len;
@@ -3228,7 +3865,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de
while (skb) {
struct sk_buff *next = skb->next;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
rc = xmit_one(skb, dev, txq, next != NULL);
if (unlikely(!dev_xmit_complete(rc))) {
skb->next = next;
@@ -3236,7 +3873,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de
}
skb = next;
- if (netif_xmit_stopped(txq) && skb) {
+ if (netif_tx_queue_stopped(txq) && skb) {
rc = NETDEV_TX_BUSY;
break;
}
@@ -3259,18 +3896,100 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
int skb_csum_hwoffload_help(struct sk_buff *skb,
const netdev_features_t features)
{
- if (unlikely(skb->csum_not_inet))
+ if (unlikely(skb_csum_is_sctp(skb)))
return !!(features & NETIF_F_SCTP_CRC) ? 0 :
skb_crc32c_csum_help(skb);
- return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
+ if (features & NETIF_F_HW_CSUM)
+ return 0;
+
+ if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
+ if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) &&
+ skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
+ !ipv6_has_hopopt_jumbo(skb))
+ goto sw_checksum;
+
+ switch (skb->csum_offset) {
+ case offsetof(struct tcphdr, check):
+ case offsetof(struct udphdr, check):
+ return 0;
+ }
+ }
+
+sw_checksum:
+ return skb_checksum_help(skb);
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);
+/* Checks if this SKB belongs to an HW offloaded socket
+ * and whether any SW fallbacks are required based on dev.
+ * Check decrypted mark in case skb_orphan() cleared socket.
+ */
+static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
+ struct net_device *dev)
+{
+#ifdef CONFIG_SOCK_VALIDATE_XMIT
+ struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev,
+ struct sk_buff *skb);
+ struct sock *sk = skb->sk;
+
+ sk_validate = NULL; /* stays NULL when no hook is found below */
+ if (sk) {
+ if (sk_fullsock(sk))
+ sk_validate = sk->sk_validate_xmit_skb; /* full socket: hook stored on the socket */
+ else if (sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT)
+ sk_validate = inet_twsk(sk)->tw_validate_xmit_skb; /* inet TIME_WAIT socket: hook stored on the timewait sock */
+ }
+
+ if (sk_validate) {
+ skb = sk_validate(sk, dev, skb); /* whatever the hook returns is what the caller gets */
+ } else if (unlikely(skb_is_decrypted(skb))) {
+ pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
+ kfree_skb(skb);
+ skb = NULL; /* freed above; caller sees NULL */
+ }
+#endif /* CONFIG_SOCK_VALIDATE_XMIT */
+
+ return skb; /* returned untouched when CONFIG_SOCK_VALIDATE_XMIT is not defined */
+}
+
+static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
+ struct net_device *dev)
+{ /* returns @skb when it passes the checks below, else frees it and returns NULL */
+ struct skb_shared_info *shinfo;
+ struct net_iov *niov;
+
+ if (likely(skb_frags_readable(skb)))
+ goto out; /* readable frags: nothing to check */
+
+ if (!dev->netmem_tx)
+ goto out_free; /* unreadable frags but dev->netmem_tx is not set */
+
+ shinfo = skb_shinfo(skb);
+
+ if (shinfo->nr_frags > 0) { /* only frags[0] is examined */
+ niov = netmem_to_net_iov(skb_frag_netmem(&shinfo->frags[0]));
+ if (net_is_devmem_iov(niov) &&
+ net_devmem_iov_binding(niov)->dev != dev)
+ goto out_free; /* devmem iov whose binding belongs to another device */
+ }
+
+out:
+ return skb;
+
+out_free:
+ kfree_skb(skb);
+ return NULL;
+}
+
static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
{
netdev_features_t features;
+ skb = validate_xmit_unreadable_skb(skb, dev);
+ if (unlikely(!skb))
+ goto out_null;
+
features = netif_skb_features(skb);
skb = validate_xmit_vlan(skb, features);
if (unlikely(!skb))
@@ -3318,7 +4037,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
out_kfree_skb:
kfree_skb(skb);
out_null:
- atomic_long_inc(&dev->tx_dropped);
+ dev_core_stats_tx_dropped_inc(dev);
return NULL;
}
@@ -3328,9 +4047,9 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
for (; skb != NULL; skb = next) {
next = skb->next;
- skb->next = NULL;
+ skb_mark_not_on_list(skb);
- /* in case skb wont be segmented, point to itself */
+ /* in case skb won't be segmented, point to itself */
skb->prev = skb;
skb = validate_xmit_skb(skb, dev, again);
@@ -3350,122 +4069,194 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d
}
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
-static void qdisc_pkt_len_init(struct sk_buff *skb)
+static void qdisc_pkt_len_segs_init(struct sk_buff *skb)
{
- const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ u16 gso_segs;
qdisc_skb_cb(skb)->pkt_len = skb->len;
+ if (!shinfo->gso_size) {
+ qdisc_skb_cb(skb)->pkt_segs = 1;
+ return;
+ }
+
+ qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs;
/* To get more precise estimation of bytes sent on wire,
* we add to pkt_len the headers size of all segments
*/
- if (shinfo->gso_size) {
+ if (skb_transport_header_was_set(skb)) {
unsigned int hdr_len;
- u16 gso_segs = shinfo->gso_segs;
/* mac layer + network layer */
- hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+ if (!skb->encapsulation)
+ hdr_len = skb_transport_offset(skb);
+ else
+ hdr_len = skb_inner_transport_offset(skb);
/* + transport layer */
if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
const struct tcphdr *th;
struct tcphdr _tcphdr;
- th = skb_header_pointer(skb, skb_transport_offset(skb),
+ th = skb_header_pointer(skb, hdr_len,
sizeof(_tcphdr), &_tcphdr);
if (likely(th))
hdr_len += __tcp_hdrlen(th);
- } else {
+ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
struct udphdr _udphdr;
- if (skb_header_pointer(skb, skb_transport_offset(skb),
+ if (skb_header_pointer(skb, hdr_len,
sizeof(_udphdr), &_udphdr))
hdr_len += sizeof(struct udphdr);
}
- if (shinfo->gso_type & SKB_GSO_DODGY)
- gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
- shinfo->gso_size);
+ if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) {
+ int payload = skb->len - hdr_len;
+ /* Malicious packet. */
+ if (payload <= 0)
+ return;
+ gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size);
+ shinfo->gso_segs = gso_segs;
+ qdisc_skb_cb(skb)->pkt_segs = gso_segs;
+ }
qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
}
}
+static int dev_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *q,
+ struct sk_buff **to_free,
+ struct netdev_queue *txq)
+{ /* enqueue @skb on @q via q->enqueue(), tracing successful enqueues; returns the masked verdict */
+ int rc;
+
+ rc = q->enqueue(skb, q, to_free) & NET_XMIT_MASK; /* keep only the NET_XMIT_MASK bits of the result */
+ if (rc == NET_XMIT_SUCCESS)
+ trace_qdisc_enqueue(q, txq, skb); /* tracepoint fires only for NET_XMIT_SUCCESS */
+ return rc;
+}
+
static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
struct net_device *dev,
struct netdev_queue *txq)
{
+ struct sk_buff *next, *to_free = NULL, *to_free2 = NULL;
spinlock_t *root_lock = qdisc_lock(q);
- struct sk_buff *to_free = NULL;
- bool contended;
+ struct llist_node *ll_list, *first_n;
+ unsigned long defer_count = 0;
int rc;
qdisc_calculate_pkt_len(skb, q);
+ tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_DROP);
+
if (q->flags & TCQ_F_NOLOCK) {
- if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
- __qdisc_drop(skb, &to_free);
- rc = NET_XMIT_DROP;
- } else {
- rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
- qdisc_run(q);
+ if (q->flags & TCQ_F_CAN_BYPASS && nolock_qdisc_is_empty(q) &&
+ qdisc_run_begin(q)) {
+ /* Retest nolock_qdisc_is_empty() within the protection
+ * of q->seqlock to protect from racing with requeuing.
+ */
+ if (unlikely(!nolock_qdisc_is_empty(q))) {
+ rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+ __qdisc_run(q);
+ to_free2 = qdisc_run_end(q);
+
+ goto free_skbs;
+ }
+
+ qdisc_bstats_cpu_update(q, skb);
+ if (sch_direct_xmit(skb, q, dev, txq, NULL, true) &&
+ !nolock_qdisc_is_empty(q))
+ __qdisc_run(q);
+
+ to_free2 = qdisc_run_end(q);
+ rc = NET_XMIT_SUCCESS;
+ goto free_skbs;
}
- if (unlikely(to_free))
- kfree_skb_list(to_free);
- return rc;
+ rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+ to_free2 = qdisc_run(q);
+ goto free_skbs;
}
- /*
- * Heuristic to force contended enqueues to serialize on a
- * separate lock before trying to get qdisc main lock.
- * This permits qdisc->running owner to get the lock more
- * often and dequeue packets faster.
+ /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit.
+ * In the try_cmpxchg() loop, we want to increment q->defer_count
+ * at most once to limit the number of skbs in defer_list.
+ * We perform the defer_count increment only if the list is not empty,
+ * because some arches have slow atomic_long_inc_return().
*/
- contended = qdisc_is_running(q);
- if (unlikely(contended))
- spin_lock(&q->busylock);
+ first_n = READ_ONCE(q->defer_list.first);
+ do {
+ if (first_n && !defer_count) {
+ defer_count = atomic_long_inc_return(&q->defer_count);
+ if (unlikely(defer_count > READ_ONCE(q->limit))) {
+ kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP);
+ return NET_XMIT_DROP;
+ }
+ }
+ skb->ll_node.next = first_n;
+ } while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node));
+
+ /* If defer_list was not empty, we know the cpu which queued
+ * the first skb will process the whole list for us.
+ */
+ if (first_n)
+ return NET_XMIT_SUCCESS;
spin_lock(root_lock);
+
+ ll_list = llist_del_all(&q->defer_list);
+ /* There is a small race because defer_count is not cleared atomically
+ * with the prior llist_del_all(). This means defer_list could grow
+ * over q->limit.
+ */
+ atomic_long_set(&q->defer_count, 0);
+
+ ll_list = llist_reverse_order(ll_list);
+
if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
- __qdisc_drop(skb, &to_free);
+ llist_for_each_entry_safe(skb, next, ll_list, ll_node)
+ __qdisc_drop(skb, &to_free);
rc = NET_XMIT_DROP;
- } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
- qdisc_run_begin(q)) {
+ goto unlock;
+ }
+ if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
+ !llist_next(ll_list) && qdisc_run_begin(q)) {
/*
* This is a work-conserving queue; there are no old skbs
* waiting to be sent out; and the qdisc is not running -
* xmit the skb directly.
*/
+ DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list,
+ struct sk_buff,
+ ll_node));
qdisc_bstats_update(q, skb);
-
- if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
- if (unlikely(contended)) {
- spin_unlock(&q->busylock);
- contended = false;
- }
+ if (sch_direct_xmit(skb, q, dev, txq, root_lock, true))
__qdisc_run(q);
- }
-
- qdisc_run_end(q);
+ to_free2 = qdisc_run_end(q);
rc = NET_XMIT_SUCCESS;
} else {
- rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
- if (qdisc_run_begin(q)) {
- if (unlikely(contended)) {
- spin_unlock(&q->busylock);
- contended = false;
- }
- __qdisc_run(q);
- qdisc_run_end(q);
+ int count = 0;
+
+ llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
+ prefetch(next);
+ prefetch(&next->priority);
+ skb_mark_not_on_list(skb);
+ rc = dev_qdisc_enqueue(skb, q, &to_free, txq);
+ count++;
}
+ to_free2 = qdisc_run(q);
+ if (count != 1)
+ rc = NET_XMIT_SUCCESS;
}
+unlock:
spin_unlock(root_lock);
- if (unlikely(to_free))
- kfree_skb_list(to_free);
- if (unlikely(contended))
- spin_unlock(&q->busylock);
+
+free_skbs:
+ tcf_kfree_skb_list(to_free);
+ tcf_kfree_skb_list(to_free2);
return rc;
}
@@ -3494,9 +4285,6 @@ static void skb_update_prio(struct sk_buff *skb)
#define skb_update_prio(skb)
#endif
-DEFINE_PER_CPU(int, xmit_recursion);
-EXPORT_SYMBOL(xmit_recursion);
-
/**
* dev_loopback_xmit - loop back @skb
* @net: network namespace this loopback is happening in
@@ -3508,67 +4296,266 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
skb_reset_mac_header(skb);
__skb_pull(skb, skb_network_offset(skb));
skb->pkt_type = PACKET_LOOPBACK;
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- WARN_ON(!skb_dst(skb));
+ if (skb->ip_summed == CHECKSUM_NONE)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ DEBUG_NET_WARN_ON_ONCE(!skb_dst(skb));
skb_dst_force(skb);
- netif_rx_ni(skb);
+ netif_rx(skb);
return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);
#ifdef CONFIG_NET_EGRESS
-static struct sk_buff *
-sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
+static struct netdev_queue *
+netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb)
+{
+ int qm = skb_get_queue_mapping(skb);
+
+ return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm));
+}
+
+#ifndef CONFIG_PREEMPT_RT
+static bool netdev_xmit_txqueue_skipped(void)
+{
+ return __this_cpu_read(softnet_data.xmit.skip_txqueue);
+}
+
+void netdev_xmit_skip_txqueue(bool skip)
{
- struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
- struct tcf_result cl_res;
+ __this_cpu_write(softnet_data.xmit.skip_txqueue, skip);
+}
+EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
+
+#else
+static bool netdev_xmit_txqueue_skipped(void)
+{
+ return current->net_xmit.skip_txqueue;
+}
+
+void netdev_xmit_skip_txqueue(bool skip)
+{
+ current->net_xmit.skip_txqueue = skip;
+}
+EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue);
+#endif
+#endif /* CONFIG_NET_EGRESS */
+
+#ifdef CONFIG_NET_XGRESS
+static int tc_run(struct tcx_entry *entry, struct sk_buff *skb,
+ enum skb_drop_reason *drop_reason)
+{
+ int ret = TC_ACT_UNSPEC;
+#ifdef CONFIG_NET_CLS_ACT
+ struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq);
+ struct tcf_result res;
if (!miniq)
- return skb;
+ return ret;
- /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
- mini_qdisc_bstats_cpu_update(miniq, skb);
+ /* Global bypass */
+ if (!static_branch_likely(&tcf_sw_enabled_key))
+ return ret;
+
+ /* Block-wise bypass */
+ if (tcf_block_bypass_sw(miniq->block))
+ return ret;
+
+ tc_skb_cb(skb)->mru = 0;
+ qdisc_skb_cb(skb)->post_ct = false;
+ tcf_set_drop_reason(skb, *drop_reason);
- switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
+ mini_qdisc_bstats_cpu_update(miniq, skb);
+ ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false);
+ /* Only tcf related quirks below. */
+ switch (ret) {
+ case TC_ACT_SHOT:
+ *drop_reason = tcf_get_drop_reason(skb);
+ mini_qdisc_qstats_cpu_drop(miniq);
+ break;
case TC_ACT_OK:
case TC_ACT_RECLASSIFY:
- skb->tc_index = TC_H_MIN(cl_res.classid);
+ skb->tc_index = TC_H_MIN(res.classid);
break;
+ }
+#endif /* CONFIG_NET_CLS_ACT */
+ return ret;
+}
+
+static DEFINE_STATIC_KEY_FALSE(tcx_needed_key);
+
+void tcx_inc(void)
+{
+ static_branch_inc(&tcx_needed_key);
+}
+
+void tcx_dec(void)
+{
+ static_branch_dec(&tcx_needed_key);
+}
+
+static __always_inline enum tcx_action_base
+tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb,
+ const bool needs_mac)
+{
+ const struct bpf_mprog_fp *fp;
+ const struct bpf_prog *prog;
+ int ret = TCX_NEXT;
+
+ if (needs_mac)
+ __skb_push(skb, skb->mac_len);
+ bpf_mprog_foreach_prog(entry, fp, prog) {
+ bpf_compute_data_pointers(skb);
+ ret = bpf_prog_run(prog, skb);
+ if (ret != TCX_NEXT)
+ break;
+ }
+ if (needs_mac)
+ __skb_pull(skb, skb->mac_len);
+ return tcx_action_code(skb, ret);
+}
+
+static __always_inline struct sk_buff *
+sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
+ struct net_device *orig_dev, bool *another)
+{
+ struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress);
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_INGRESS;
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ int sch_ret;
+
+ if (!entry)
+ return skb;
+
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+ if (unlikely(*pt_prev)) {
+ *ret = deliver_skb(skb, *pt_prev, orig_dev);
+ *pt_prev = NULL;
+ }
+
+ qdisc_pkt_len_segs_init(skb);
+ tcx_set_ingress(skb, true);
+
+ if (static_branch_unlikely(&tcx_needed_key)) {
+ sch_ret = tcx_run(entry, skb, true);
+ if (sch_ret != TC_ACT_UNSPEC)
+ goto ingress_verdict;
+ }
+ sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
+ingress_verdict:
+ switch (sch_ret) {
+ case TC_ACT_REDIRECT:
+ /* skb_mac_header check was done by BPF, so we can safely
+ * push the L2 header back before redirecting to another
+ * netdev.
+ */
+ __skb_push(skb, skb->mac_len);
+ if (skb_do_redirect(skb) == -EAGAIN) {
+ __skb_pull(skb, skb->mac_len);
+ *another = true;
+ break;
+ }
+ *ret = NET_RX_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
+ return NULL;
case TC_ACT_SHOT:
- mini_qdisc_qstats_cpu_drop(miniq);
- *ret = NET_XMIT_DROP;
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
+ *ret = NET_RX_DROP;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
+ /* used by tc_run */
case TC_ACT_STOLEN:
case TC_ACT_QUEUED:
case TC_ACT_TRAP:
- *ret = NET_XMIT_SUCCESS;
consume_skb(skb);
+ fallthrough;
+ case TC_ACT_CONSUMED:
+ *ret = NET_RX_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
+ }
+ bpf_net_ctx_clear(bpf_net_ctx);
+
+ return skb;
+}
+
+static __always_inline struct sk_buff *
+sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
+{
+ struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress);
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS;
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ int sch_ret;
+
+ if (!entry)
+ return skb;
+
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
+ /* qdisc_skb_cb(skb)->pkt_len was already set, and tcx_set_ingress()
+ * already called, by the caller.
+ */
+ if (static_branch_unlikely(&tcx_needed_key)) {
+ sch_ret = tcx_run(entry, skb, false);
+ if (sch_ret != TC_ACT_UNSPEC)
+ goto egress_verdict;
+ }
+ sch_ret = tc_run(tcx_entry(entry), skb, &drop_reason);
+egress_verdict:
+ switch (sch_ret) {
case TC_ACT_REDIRECT:
/* No need to push/pop skb's mac_header here on egress! */
skb_do_redirect(skb);
*ret = NET_XMIT_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
+ return NULL;
+ case TC_ACT_SHOT:
+ kfree_skb_reason(skb, drop_reason);
+ *ret = NET_XMIT_DROP;
+ bpf_net_ctx_clear(bpf_net_ctx);
+ return NULL;
+ /* used by tc_run */
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ consume_skb(skb);
+ fallthrough;
+ case TC_ACT_CONSUMED:
+ *ret = NET_XMIT_SUCCESS;
+ bpf_net_ctx_clear(bpf_net_ctx);
return NULL;
- default:
- break;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
return skb;
}
-#endif /* CONFIG_NET_EGRESS */
+#else
+static __always_inline struct sk_buff *
+sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
+ struct net_device *orig_dev, bool *another)
+{
+ return skb;
+}
+
+static __always_inline struct sk_buff *
+sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
+{
+ return skb;
+}
+#endif /* CONFIG_NET_XGRESS */
#ifdef CONFIG_XPS
static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
struct xps_dev_maps *dev_maps, unsigned int tci)
{
+ int tc = netdev_get_prio_tc_map(dev, skb->priority);
struct xps_map *map;
int queue_index = -1;
- if (dev->num_tc) {
- tci *= dev->num_tc;
- tci += netdev_get_prio_tc_map(dev, skb->priority);
- }
+ if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
+ return queue_index;
+
+ tci *= dev_maps->num_tc;
+ tci += tc;
map = rcu_dereference(dev_maps->attr_map[tci]);
if (map) {
@@ -3599,18 +4586,18 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
if (!static_key_false(&xps_rxqs_needed))
goto get_cpus_map;
- dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
+ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
if (dev_maps) {
int tci = sk_rx_queue_get(sk);
- if (tci >= 0 && tci < dev->num_rx_queues)
+ if (tci >= 0)
queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
tci);
}
get_cpus_map:
if (queue_index < 0) {
- dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
+ dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
if (dev_maps) {
unsigned int tci = skb->sender_cpu - 1;
@@ -3627,23 +4614,40 @@ get_cpus_map:
}
u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
- struct net_device *sb_dev,
- select_queue_fallback_t fallback)
+ struct net_device *sb_dev)
{
return 0;
}
EXPORT_SYMBOL(dev_pick_tx_zero);
-u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
- struct net_device *sb_dev,
- select_queue_fallback_t fallback)
+int sk_tx_queue_get(const struct sock *sk)
{
- return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
+ int resel, val;
+
+ if (!sk)
+ return -1;
+ /* Paired with WRITE_ONCE() in sk_tx_queue_clear()
+ * and sk_tx_queue_set().
+ */
+ val = READ_ONCE(sk->sk_tx_queue_mapping);
+
+ if (val == NO_QUEUE_MAPPING)
+ return -1;
+
+ if (!sk_fullsock(sk))
+ return val;
+
+ resel = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection);
+ if (resel && time_is_before_jiffies(
+ READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + resel))
+ return -1;
+
+ return val;
}
-EXPORT_SYMBOL(dev_pick_tx_cpu_id);
+EXPORT_SYMBOL(sk_tx_queue_get);
-static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
- struct net_device *sb_dev)
+u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
+ struct net_device *sb_dev)
{
struct sock *sk = skb->sk;
int queue_index = sk_tx_queue_get(sk);
@@ -3657,8 +4661,7 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
if (new_index < 0)
new_index = skb_tx_hash(dev, sb_dev, skb);
- if (queue_index != new_index && sk &&
- sk_fullsock(sk) &&
+ if (sk && sk_fullsock(sk) &&
rcu_access_pointer(sk->sk_dst_cache))
sk_tx_queue_set(sk, new_index);
@@ -3667,10 +4670,11 @@ static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
return queue_index;
}
+EXPORT_SYMBOL(netdev_pick_tx);
-struct netdev_queue *netdev_pick_tx(struct net_device *dev,
- struct sk_buff *skb,
- struct net_device *sb_dev)
+struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
+ struct sk_buff *skb,
+ struct net_device *sb_dev)
{
int queue_index = 0;
@@ -3685,10 +4689,9 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_select_queue)
- queue_index = ops->ndo_select_queue(dev, skb, sb_dev,
- __netdev_pick_tx);
+ queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
else
- queue_index = __netdev_pick_tx(dev, skb, sb_dev);
+ queue_index = netdev_pick_tx(dev, skb, sb_dev);
queue_index = netdev_cap_txqueue(dev, queue_index);
}
@@ -3698,43 +4701,40 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
}
/**
- * __dev_queue_xmit - transmit a buffer
- * @skb: buffer to transmit
- * @sb_dev: suboordinate device used for L2 forwarding offload
+ * __dev_queue_xmit() - transmit a buffer
+ * @skb: buffer to transmit
+ * @sb_dev: subordinate device used for L2 forwarding offload
*
- * Queue a buffer for transmission to a network device. The caller must
- * have set the device and priority and built the buffer before calling
- * this function. The function can be called from an interrupt.
+ * Queue a buffer for transmission to a network device. The caller must
+ * have set the device and priority and built the buffer before calling
+ * this function. The function can be called from an interrupt.
*
- * A negative errno code is returned on a failure. A success does not
- * guarantee the frame will be transmitted as it may be dropped due
- * to congestion or traffic shaping.
+ * When calling this method, interrupts MUST be enabled. This is because
+ * the BH enable code must have IRQs enabled so that it will not deadlock.
*
- * -----------------------------------------------------------------------------------
- * I notice this method can also return errors from the queue disciplines,
- * including NET_XMIT_DROP, which is a positive value. So, errors can also
- * be positive.
+ * Regardless of the return value, the skb is consumed, so it is currently
+ * difficult to retry a send to this method. (You can bump the ref count
+ * before sending to hold a reference for retry if you are careful.)
*
- * Regardless of the return value, the skb is consumed, so it is currently
- * difficult to retry a send to this method. (You can bump the ref count
- * before sending to hold a reference for retry if you are careful.)
- *
- * When calling this method, interrupts MUST be enabled. This is because
- * the BH enable code must have IRQs enabled so that it will not deadlock.
- * --BLG
+ * Return:
+ * * 0 - buffer accepted for transmission (it may still be dropped later)
+ * * positive qdisc return code - NET_XMIT_DROP etc.
+ * * negative errno - other errors
*/
-static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
+int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
{
struct net_device *dev = skb->dev;
- struct netdev_queue *txq;
+ struct netdev_queue *txq = NULL;
struct Qdisc *q;
int rc = -ENOMEM;
bool again = false;
skb_reset_mac_header(skb);
+ skb_assert_len(skb);
- if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
- __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
+ if (unlikely(skb_shinfo(skb)->tx_flags &
+ (SKBTX_SCHED_TSTAMP | SKBTX_BPF)))
+ __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
@@ -3743,16 +4743,27 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
skb_update_prio(skb);
- qdisc_pkt_len_init(skb);
-#ifdef CONFIG_NET_CLS_ACT
- skb->tc_at_ingress = 0;
-# ifdef CONFIG_NET_EGRESS
+ qdisc_pkt_len_segs_init(skb);
+ tcx_set_ingress(skb, false);
+#ifdef CONFIG_NET_EGRESS
if (static_branch_unlikely(&egress_needed_key)) {
+ if (nf_hook_egress_active()) {
+ skb = nf_hook_egress(skb, &rc, dev);
+ if (!skb)
+ goto out;
+ }
+
+ netdev_xmit_skip_txqueue(false);
+
+ nf_skip_egress(skb, true);
skb = sch_handle_egress(skb, &rc, dev);
if (!skb)
goto out;
+ nf_skip_egress(skb, false);
+
+ if (netdev_xmit_txqueue_skipped())
+ txq = netdev_tx_queue_mapping(dev, skb);
}
-# endif
#endif
/* If device/qdisc don't need skb->dst, release it right now while
* its hot in this cpu cache.
@@ -3762,7 +4773,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
else
skb_dst_force(skb);
- txq = netdev_pick_tx(dev, skb, sb_dev);
+ if (!txq)
+ txq = netdev_core_pick_tx(dev, skb, sb_dev);
+
q = rcu_dereference_bh(txq->qdisc);
trace_net_dev_queue(skb);
@@ -3786,9 +4799,11 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
if (dev->flags & IFF_UP) {
int cpu = smp_processor_id(); /* ok because BHs are off */
- if (txq->xmit_lock_owner != cpu) {
- if (unlikely(__this_cpu_read(xmit_recursion) >
- XMIT_RECURSION_LIMIT))
+ /* Other cpus might concurrently change txq->xmit_lock_owner
+ * to -1 or to their cpu id, but not to our id.
+ */
+ if (READ_ONCE(txq->xmit_lock_owner) != cpu) {
+ if (dev_xmit_recursion())
goto recursion_alert;
skb = validate_xmit_skb(skb, dev, &again);
@@ -3798,9 +4813,9 @@ static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_xmit_stopped(txq)) {
- __this_cpu_inc(xmit_recursion);
+ dev_xmit_recursion_inc();
skb = dev_hard_start_xmit(skb, dev, txq, &rc);
- __this_cpu_dec(xmit_recursion);
+ dev_xmit_recursion_dec();
if (dev_xmit_complete(rc)) {
HARD_TX_UNLOCK(dev, txq);
goto out;
@@ -3822,27 +4837,16 @@ recursion_alert:
rc = -ENETDOWN;
rcu_read_unlock_bh();
- atomic_long_inc(&dev->tx_dropped);
+ dev_core_stats_tx_dropped_inc(dev);
kfree_skb_list(skb);
return rc;
out:
rcu_read_unlock_bh();
return rc;
}
+EXPORT_SYMBOL(__dev_queue_xmit);
-int dev_queue_xmit(struct sk_buff *skb)
-{
- return __dev_queue_xmit(skb, NULL);
-}
-EXPORT_SYMBOL(dev_queue_xmit);
-
-int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
-{
- return __dev_queue_xmit(skb, sb_dev);
-}
-EXPORT_SYMBOL(dev_queue_xmit_accel);
-
-int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
+int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
{
struct net_device *dev = skb->dev;
struct sk_buff *orig_skb = skb;
@@ -3863,71 +4867,123 @@ int dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
local_bh_disable();
+ dev_xmit_recursion_inc();
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (!netif_xmit_frozen_or_drv_stopped(txq))
ret = netdev_start_xmit(skb, dev, txq, false);
HARD_TX_UNLOCK(dev, txq);
+ dev_xmit_recursion_dec();
local_bh_enable();
-
- if (!dev_xmit_complete(ret))
- kfree_skb(skb);
-
return ret;
drop:
- atomic_long_inc(&dev->tx_dropped);
+ dev_core_stats_tx_dropped_inc(dev);
kfree_skb_list(skb);
return NET_XMIT_DROP;
}
-EXPORT_SYMBOL(dev_direct_xmit);
+EXPORT_SYMBOL(__dev_direct_xmit);
/*************************************************************************
* Receiver routines
*************************************************************************/
+static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
-int netdev_max_backlog __read_mostly = 1000;
-EXPORT_SYMBOL(netdev_max_backlog);
-
-int netdev_tstamp_prequeue __read_mostly = 1;
-int netdev_budget __read_mostly = 300;
-unsigned int __read_mostly netdev_budget_usecs = 2000;
int weight_p __read_mostly = 64; /* old backlog weight */
int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
-int dev_rx_weight __read_mostly = 64;
-int dev_tx_weight __read_mostly = 64;
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
+ struct task_struct *thread;
+
+ lockdep_assert_irqs_disabled();
+
+ if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
+ /* Paired with smp_mb__before_atomic() in
+ * napi_enable()/netif_set_threaded().
+ * Use READ_ONCE() to guarantee a complete
+ * read on napi->thread. Only call
+ * wake_up_process() when it's not NULL.
+ */
+ thread = READ_ONCE(napi->thread);
+ if (thread) {
+ if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
+ goto use_local_napi;
+
+ set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+ wake_up_process(thread);
+ return;
+ }
+ }
+
+use_local_napi:
+ DEBUG_NET_WARN_ON_ONCE(!list_empty(&napi->poll_list));
list_add_tail(&napi->poll_list, &sd->poll_list);
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ WRITE_ONCE(napi->list_owner, smp_processor_id());
+ /* If not called from net_rx_action()
+ * we have to raise NET_RX_SOFTIRQ.
+ */
+ if (!sd->in_net_rx_action)
+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
#ifdef CONFIG_RPS
-/* One global table that all flow-based protocols share. */
-struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
-EXPORT_SYMBOL(rps_sock_flow_table);
-u32 rps_cpu_mask __read_mostly;
-EXPORT_SYMBOL(rps_cpu_mask);
-
-struct static_key rps_needed __read_mostly;
+struct static_key_false rps_needed __read_mostly;
EXPORT_SYMBOL(rps_needed);
-struct static_key rfs_needed __read_mostly;
+struct static_key_false rfs_needed __read_mostly;
EXPORT_SYMBOL(rfs_needed);
+static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
+{
+ return hash_32(hash, flow_table->log);
+}
+
+#ifdef CONFIG_RFS_ACCEL
+/**
+ * rps_flow_is_active - check whether the flow is recently active.
+ * @rflow: Specific flow to check activity.
+ * @flow_table: per-queue flowtable that @rflow belongs to.
+ * @cpu: CPU saved in @rflow.
+ *
+ * If the CPU has processed many packets since the flow's last activity
+ * (beyond 10 times the table size), the flow is considered stale.
+ *
+ * Return: true if flow was recently active.
+ */
+static bool rps_flow_is_active(struct rps_dev_flow *rflow,
+ struct rps_dev_flow_table *flow_table,
+ unsigned int cpu)
+{
+ unsigned int flow_last_active;
+ unsigned int sd_input_head;
+
+ if (cpu >= nr_cpu_ids)
+ return false;
+
+ sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head);
+ flow_last_active = READ_ONCE(rflow->last_qtail);
+
+ return (int)(sd_input_head - flow_last_active) <
+ (int)(10 << flow_table->log);
+}
+#endif
+
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
- struct rps_dev_flow *rflow, u16 next_cpu)
+ struct rps_dev_flow *rflow, u16 next_cpu, u32 hash,
+ u32 flow_id)
{
if (next_cpu < nr_cpu_ids) {
+ u32 head;
#ifdef CONFIG_RFS_ACCEL
struct netdev_rx_queue *rxqueue;
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *old_rflow;
- u32 flow_id;
+ struct rps_dev_flow *tmp_rflow;
+ unsigned int tmp_cpu;
u16 rxq_index;
int rc;
@@ -3943,23 +4999,38 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
- flow_id = skb_get_hash(skb) & flow_table->mask;
+
+ tmp_rflow = &flow_table->flows[flow_id];
+ tmp_cpu = READ_ONCE(tmp_rflow->cpu);
+
+ if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) {
+ if (rps_flow_is_active(tmp_rflow, flow_table,
+ tmp_cpu)) {
+ if (hash != READ_ONCE(tmp_rflow->hash) ||
+ next_cpu == tmp_cpu)
+ goto out;
+ }
+ }
+
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
rxq_index, flow_id);
if (rc < 0)
goto out;
+
old_rflow = rflow;
- rflow = &flow_table->flows[flow_id];
- rflow->filter = rc;
- if (old_rflow->filter == rflow->filter)
- old_rflow->filter = RPS_NO_FILTER;
+ rflow = tmp_rflow;
+ WRITE_ONCE(rflow->filter, rc);
+ WRITE_ONCE(rflow->hash, hash);
+
+ if (old_rflow->filter == rc)
+ WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
out:
#endif
- rflow->last_qtail =
- per_cpu(softnet_data, next_cpu).input_queue_head;
+ head = READ_ONCE(per_cpu(softnet_data, next_cpu).input_queue_head);
+ rps_input_queue_tail_save(&rflow->last_qtail, head);
}
- rflow->cpu = next_cpu;
+ WRITE_ONCE(rflow->cpu, next_cpu);
return rflow;
}
@@ -3976,6 +5047,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow_table *flow_table;
struct rps_map *map;
int cpu = -1;
+ u32 flow_id;
u32 tcpu;
u32 hash;
@@ -4004,23 +5076,26 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
if (!hash)
goto done;
- sock_flow_table = rcu_dereference(rps_sock_flow_table);
+ sock_flow_table = rcu_dereference(net_hotdata.rps_sock_flow_table);
if (flow_table && sock_flow_table) {
struct rps_dev_flow *rflow;
u32 next_cpu;
u32 ident;
- /* First check into global flow table if there is a match */
- ident = sock_flow_table->ents[hash & sock_flow_table->mask];
- if ((ident ^ hash) & ~rps_cpu_mask)
+ /* First check into global flow table if there is a match.
+ * This READ_ONCE() pairs with WRITE_ONCE() from rps_record_sock_flow().
+ */
+ ident = READ_ONCE(sock_flow_table->ents[hash & sock_flow_table->mask]);
+ if ((ident ^ hash) & ~net_hotdata.rps_cpu_mask)
goto try_rps;
- next_cpu = ident & rps_cpu_mask;
+ next_cpu = ident & net_hotdata.rps_cpu_mask;
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
*/
- rflow = &flow_table->flows[hash & flow_table->mask];
+ flow_id = rfs_slot(hash, flow_table);
+ rflow = &flow_table->flows[flow_id];
tcpu = rflow->cpu;
/*
@@ -4036,10 +5111,11 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
*/
if (unlikely(tcpu != next_cpu) &&
(tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
- ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
+ ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
rflow->last_qtail)) >= 0)) {
tcpu = next_cpu;
- rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+ rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash,
+ flow_id);
}
if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
@@ -4083,17 +5159,16 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *rflow;
bool expire = true;
- unsigned int cpu;
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
- if (flow_table && flow_id <= flow_table->mask) {
+ if (flow_table && flow_id < (1UL << flow_table->log)) {
+ unsigned int cpu;
+
rflow = &flow_table->flows[flow_id];
cpu = READ_ONCE(rflow->cpu);
- if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
- ((int)(per_cpu(softnet_data, cpu).input_queue_head -
- rflow->last_qtail) <
- (int)(10 * flow_table->mask)))
+ if (READ_ONCE(rflow->filter) == filter_id &&
+ rps_flow_is_active(rflow, flow_table, cpu))
expire = false;
}
rcu_read_unlock();
@@ -4109,44 +5184,87 @@ static void rps_trigger_softirq(void *data)
struct softnet_data *sd = data;
____napi_schedule(sd, &sd->backlog);
- sd->received_rps++;
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(sd->received_rps, sd->received_rps + 1);
}
#endif /* CONFIG_RPS */
+/* Called from hardirq (IPI) context */
+static void trigger_rx_softirq(void *data)
+{
+ struct softnet_data *sd = data;
+
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ smp_store_release(&sd->defer_ipi_scheduled, 0);
+}
+
/*
- * Check if this softnet_data structure is another cpu one
- * If yes, queue it to our IPI list and return 1
- * If no, return 0
+ * After we queued a packet into sd->input_pkt_queue,
+ * we need to make sure this queue is serviced soon.
+ *
+ * - If this is another cpu queue, link it to our rps_ipi_list,
+ * and make sure we will process rps_ipi_list from net_rx_action().
+ *
+ * - If this is our own queue, NAPI schedule our backlog.
+ * Note that this also raises NET_RX_SOFTIRQ.
*/
-static int rps_ipi_queued(struct softnet_data *sd)
+static void napi_schedule_rps(struct softnet_data *sd)
{
-#ifdef CONFIG_RPS
struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
+#ifdef CONFIG_RPS
if (sd != mysd) {
+ if (use_backlog_threads()) {
+ __napi_schedule_irqoff(&sd->backlog);
+ return;
+ }
+
sd->rps_ipi_next = mysd->rps_ipi_list;
mysd->rps_ipi_list = sd;
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
- return 1;
+ /* If not called from net_rx_action() or napi_threaded_poll()
+ * we have to raise NET_RX_SOFTIRQ.
+ */
+ if (!mysd->in_net_rx_action && !mysd->in_napi_threaded_poll)
+ __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ return;
}
#endif /* CONFIG_RPS */
- return 0;
+ __napi_schedule_irqoff(&mysd->backlog);
+}
+
+void kick_defer_list_purge(unsigned int cpu)
+{
+ struct softnet_data *sd = &per_cpu(softnet_data, cpu);
+ unsigned long flags;
+
+ if (use_backlog_threads()) {
+ backlog_lock_irq_save(sd, &flags);
+
+ if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
+ __napi_schedule_irqoff(&sd->backlog);
+
+ backlog_unlock_irq_restore(sd, &flags);
+
+ } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
+ smp_call_function_single_async(cpu, &sd->defer_csd);
+ }
}
#ifdef CONFIG_NET_FLOW_LIMIT
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif
-static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
+static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen,
+ int max_backlog)
{
#ifdef CONFIG_NET_FLOW_LIMIT
- struct sd_flow_limit *fl;
- struct softnet_data *sd;
unsigned int old_flow, new_flow;
+ const struct softnet_data *sd;
+ struct sd_flow_limit *fl;
- if (qlen < (netdev_max_backlog >> 1))
+ if (likely(qlen < (max_backlog >> 1)))
return false;
sd = this_cpu_ptr(&softnet_data);
@@ -4154,7 +5272,7 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
rcu_read_lock();
fl = rcu_dereference(sd->flow_limit);
if (fl) {
- new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
+ new_flow = hash_32(skb_get_hash(skb), fl->log_buckets);
old_flow = fl->history[fl->history_head];
fl->history[fl->history_head] = new_flow;
@@ -4165,7 +5283,8 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
fl->buckets[old_flow]--;
if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
- fl->count++;
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(fl->count, fl->count + 1);
rcu_read_unlock();
return true;
}
@@ -4182,46 +5301,52 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
unsigned int *qtail)
{
+ enum skb_drop_reason reason;
struct softnet_data *sd;
unsigned long flags;
unsigned int qlen;
+ int max_backlog;
+ u32 tail;
- sd = &per_cpu(softnet_data, cpu);
+ reason = SKB_DROP_REASON_DEV_READY;
+ if (unlikely(!netif_running(skb->dev)))
+ goto bad_dev;
- local_irq_save(flags);
+ sd = &per_cpu(softnet_data, cpu);
- rps_lock(sd);
- if (!netif_running(skb->dev))
- goto drop;
+ qlen = skb_queue_len_lockless(&sd->input_pkt_queue);
+ max_backlog = READ_ONCE(net_hotdata.max_backlog);
+ if (unlikely(qlen > max_backlog) ||
+ skb_flow_limit(skb, qlen, max_backlog))
+ goto cpu_backlog_drop;
+ backlog_lock_irq_save(sd, &flags);
qlen = skb_queue_len(&sd->input_pkt_queue);
- if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
- if (qlen) {
-enqueue:
- __skb_queue_tail(&sd->input_pkt_queue, skb);
- input_queue_tail_incr_save(sd, qtail);
- rps_unlock(sd);
- local_irq_restore(flags);
- return NET_RX_SUCCESS;
- }
-
- /* Schedule NAPI for backlog device
- * We can use non atomic operation since we own the queue lock
- */
- if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
- if (!rps_ipi_queued(sd))
- ____napi_schedule(sd, &sd->backlog);
+ if (likely(qlen <= max_backlog)) {
+ if (!qlen) {
+ /* Schedule NAPI for backlog device. We can use
+ * non atomic operation as we own the queue lock.
+ */
+ if (!__test_and_set_bit(NAPI_STATE_SCHED,
+ &sd->backlog.state))
+ napi_schedule_rps(sd);
}
- goto enqueue;
- }
+ __skb_queue_tail(&sd->input_pkt_queue, skb);
+ tail = rps_input_queue_tail_incr(sd);
+ backlog_unlock_irq_restore(sd, &flags);
-drop:
- sd->dropped++;
- rps_unlock(sd);
+ /* save the tail outside of the critical section */
+ rps_input_queue_tail_save(qtail, tail);
+ return NET_RX_SUCCESS;
+ }
- local_irq_restore(flags);
+ backlog_unlock_irq_restore(sd, &flags);
- atomic_long_inc(&skb->dev->rx_dropped);
- kfree_skb(skb);
+cpu_backlog_drop:
+ reason = SKB_DROP_REASON_CPU_BACKLOG;
+ numa_drop_add(&sd->drop_counters, 1);
+bad_dev:
+ dev_core_stats_rx_dropped_inc(skb->dev);
+ kfree_skb_reason(skb, reason);
return NET_RX_DROP;
}
@@ -4248,76 +5373,93 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
return rxqueue;
}
-static u32 netif_receive_generic_xdp(struct sk_buff *skb,
- struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
+ const struct bpf_prog *xdp_prog)
{
+ void *orig_data, *orig_data_end, *hard_start;
struct netdev_rx_queue *rxqueue;
- void *orig_data, *orig_data_end;
- u32 metalen, act = XDP_DROP;
- int hlen, off;
- u32 mac_len;
+ bool orig_bcast, orig_host;
+ u32 mac_len, frame_sz;
+ __be16 orig_eth_type;
+ struct ethhdr *eth;
+ u32 metalen, act;
+ int off;
- /* Reinjected packets coming from act_mirred or similar should
- * not get XDP generic processing.
+ /* The XDP program wants to see the packet starting at the MAC
+ * header.
*/
- if (skb_cloned(skb) || skb_is_tc_redirected(skb))
- return XDP_PASS;
+ mac_len = skb->data - skb_mac_header(skb);
+ hard_start = skb->data - skb_headroom(skb);
- /* XDP packets must be linear and must have sufficient headroom
- * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also
- * native XDP provides, thus we need to do it here as well.
- */
- if (skb_is_nonlinear(skb) ||
- skb_headroom(skb) < XDP_PACKET_HEADROOM) {
- int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
- int troom = skb->tail + skb->data_len - skb->end;
+ /* SKB "head" area always has tailroom for skb_shared_info */
+ frame_sz = (void *)skb_end_pointer(skb) - hard_start;
+ frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- /* In case we have to go down the path and also linearize,
- * then lets do the pskb_expand_head() work just once here.
- */
- if (pskb_expand_head(skb,
- hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
- troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
- goto do_drop;
- if (skb_linearize(skb))
- goto do_drop;
+ rxqueue = netif_get_rxqueue(skb);
+ xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
+ xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
+ skb_headlen(skb) + mac_len, true);
+ if (skb_is_nonlinear(skb)) {
+ skb_shinfo(skb)->xdp_frags_size = skb->data_len;
+ xdp_buff_set_frags_flag(xdp);
+ } else {
+ xdp_buff_clear_frags_flag(xdp);
}
- /* The XDP program wants to see the packet starting at the MAC
- * header.
- */
- mac_len = skb->data - skb_mac_header(skb);
- hlen = skb_headlen(skb) + mac_len;
- xdp->data = skb->data - mac_len;
- xdp->data_meta = xdp->data;
- xdp->data_end = xdp->data + hlen;
- xdp->data_hard_start = skb->data - skb_headroom(skb);
orig_data_end = xdp->data_end;
orig_data = xdp->data;
-
- rxqueue = netif_get_rxqueue(skb);
- xdp->rxq = &rxqueue->xdp_rxq;
+ eth = (struct ethhdr *)xdp->data;
+ orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr);
+ orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
+ orig_eth_type = eth->h_proto;
act = bpf_prog_run_xdp(xdp_prog, xdp);
+ /* check if bpf_xdp_adjust_head was used */
off = xdp->data - orig_data;
- if (off > 0)
- __skb_pull(skb, off);
- else if (off < 0)
- __skb_push(skb, -off);
- skb->mac_header += off;
-
- /* check if bpf_xdp_adjust_tail was used. it can only "shrink"
- * pckt.
- */
- off = orig_data_end - xdp->data_end;
+ if (off) {
+ if (off > 0)
+ __skb_pull(skb, off);
+ else if (off < 0)
+ __skb_push(skb, -off);
+
+ skb->mac_header += off;
+ skb_reset_network_header(skb);
+ }
+
+ /* check if bpf_xdp_adjust_tail was used */
+ off = xdp->data_end - orig_data_end;
if (off != 0) {
skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
- skb->len -= off;
+ skb->len += off; /* positive on grow, negative on shrink */
+ }
+ /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers
+ * (e.g. bpf_xdp_adjust_tail), we need to update data_len here.
+ */
+ if (xdp_buff_has_frags(xdp))
+ skb->data_len = skb_shinfo(skb)->xdp_frags_size;
+ else
+ skb->data_len = 0;
+
+ /* check if XDP changed the eth hdr such that the SKB needs an update */
+ eth = (struct ethhdr *)xdp->data;
+ if ((orig_eth_type != eth->h_proto) ||
+ (orig_host != ether_addr_equal_64bits(eth->h_dest,
+ skb->dev->dev_addr)) ||
+ (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
+ __skb_push(skb, ETH_HLEN);
+ skb->pkt_type = PACKET_HOST;
+ skb->protocol = eth_type_trans(skb, skb->dev);
}
+ /* Redirect/Tx gives L2 packet, code that will reuse skb must __skb_pull
+ * before calling us again on redirect path. We do not call do_redirect
+ * as we leave that up to the caller.
+ *
+ * Caller is responsible for managing lifetime of skb (i.e. calling
+ * kfree_skb in response to actions it cannot handle/XDP_DROP).
+ */
switch (act) {
case XDP_REDIRECT:
case XDP_TX:
@@ -4328,15 +5470,80 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
if (metalen)
skb_metadata_set(skb, metalen);
break;
+ }
+
+ return act;
+}
+
+static int
+netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog)
+{
+ struct sk_buff *skb = *pskb;
+ int err, hroom, troom;
+
+ local_lock_nested_bh(&system_page_pool.bh_lock);
+ err = skb_cow_data_for_xdp(this_cpu_read(system_page_pool.pool), pskb, prog);
+ local_unlock_nested_bh(&system_page_pool.bh_lock);
+ if (!err)
+ return 0;
+
+ /* In case we have to go down the path and also linearize,
+ * then lets do the pskb_expand_head() work just once here.
+ */
+ hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
+ troom = skb->tail + skb->data_len - skb->end;
+ err = pskb_expand_head(skb,
+ hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
+ troom > 0 ? troom + 128 : 0, GFP_ATOMIC);
+ if (err)
+ return err;
+
+ return skb_linearize(skb);
+}
+
+static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
+ struct xdp_buff *xdp,
+ const struct bpf_prog *xdp_prog)
+{
+ struct sk_buff *skb = *pskb;
+ u32 mac_len, act = XDP_DROP;
+
+ /* Reinjected packets coming from act_mirred or similar should
+ * not get XDP generic processing.
+ */
+ if (skb_is_redirected(skb))
+ return XDP_PASS;
+
+ /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM
+ * bytes. This is the guarantee that also native XDP provides,
+ * thus we need to do it here as well.
+ */
+ mac_len = skb->data - skb_mac_header(skb);
+ __skb_push(skb, mac_len);
+
+ if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
+ skb_headroom(skb) < XDP_PACKET_HEADROOM) {
+ if (netif_skb_check_for_xdp(pskb, xdp_prog))
+ goto do_drop;
+ }
+
+ __skb_pull(*pskb, mac_len);
+
+ act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog);
+ switch (act) {
+ case XDP_REDIRECT:
+ case XDP_TX:
+ case XDP_PASS:
+ break;
default:
- bpf_warn_invalid_xdp_action(act);
- /* fall through */
+ bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act);
+ fallthrough;
case XDP_ABORTED:
- trace_xdp_exception(skb->dev, xdp_prog, act);
- /* fall through */
+ trace_xdp_exception((*pskb)->dev, xdp_prog, act);
+ fallthrough;
case XDP_DROP:
do_drop:
- kfree_skb(skb);
+ kfree_skb(*pskb);
break;
}
@@ -4344,19 +5551,22 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
}
/* When doing generic XDP we have to bypass the qdisc layer and the
- * network taps in order to match in-driver-XDP behavior.
+ * network taps in order to match in-driver-XDP behavior. This also means
+ * that XDP packets are able to starve other packets going through a qdisc,
+ * and DDOS attacks will be more effective. In-driver-XDP uses dedicated TX
+ * queues, so it does not have this starvation issue.
*/
-void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
+void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
bool free_skb = true;
int cpu, rc;
- txq = netdev_pick_tx(dev, skb, NULL);
+ txq = netdev_core_pick_tx(dev, skb, NULL);
cpu = smp_processor_id();
HARD_TX_LOCK(dev, txq, cpu);
- if (!netif_xmit_stopped(txq)) {
+ if (!netif_xmit_frozen_or_drv_stopped(txq)) {
rc = netdev_start_xmit(skb, dev, txq, 0);
if (dev_xmit_complete(rc))
free_skb = false;
@@ -4364,39 +5574,45 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
HARD_TX_UNLOCK(dev, txq);
if (free_skb) {
trace_xdp_exception(dev, xdp_prog, XDP_TX);
+ dev_core_stats_tx_dropped_inc(dev);
kfree_skb(skb);
}
}
-EXPORT_SYMBOL_GPL(generic_xdp_tx);
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
-int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
+int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+
if (xdp_prog) {
struct xdp_buff xdp;
u32 act;
int err;
- act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+ act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog);
if (act != XDP_PASS) {
switch (act) {
case XDP_REDIRECT:
- err = xdp_do_generic_redirect(skb->dev, skb,
+ err = xdp_do_generic_redirect((*pskb)->dev, *pskb,
&xdp, xdp_prog);
if (err)
goto out_redir;
break;
case XDP_TX:
- generic_xdp_tx(skb, xdp_prog);
+ generic_xdp_tx(*pskb, xdp_prog);
break;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
return XDP_DROP;
}
+ bpf_net_ctx_clear(bpf_net_ctx);
}
return XDP_PASS;
out_redir:
- kfree_skb(skb);
+ bpf_net_ctx_clear(bpf_net_ctx);
+ kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP);
return XDP_DROP;
}
EXPORT_SYMBOL_GPL(do_xdp_generic);
@@ -4405,33 +5621,15 @@ static int netif_rx_internal(struct sk_buff *skb)
{
int ret;
- net_timestamp_check(netdev_tstamp_prequeue, skb);
+ net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
trace_netif_rx(skb);
- if (static_branch_unlikely(&generic_xdp_needed_key)) {
- int ret;
-
- preempt_disable();
- rcu_read_lock();
- ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
- rcu_read_unlock();
- preempt_enable();
-
- /* Consider XDP consuming the packet a success from
- * the netdev point of view we do not want to count
- * this as an error.
- */
- if (ret != XDP_PASS)
- return NET_RX_SUCCESS;
- }
-
#ifdef CONFIG_RPS
- if (static_key_false(&rps_needed)) {
+ if (static_branch_unlikely(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
- preempt_disable();
rcu_read_lock();
cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -4441,58 +5639,73 @@ static int netif_rx_internal(struct sk_buff *skb)
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
- preempt_enable();
} else
#endif
{
unsigned int qtail;
- ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
- put_cpu();
+ ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
}
return ret;
}
/**
+ * __netif_rx - Slightly optimized version of netif_rx
+ * @skb: buffer to post
+ *
+ * This behaves as netif_rx except that it does not disable bottom halves.
+ * As a result this function may only be invoked from the interrupt context
+ * (either hard or soft interrupt).
+ */
+int __netif_rx(struct sk_buff *skb)
+{
+ int ret;
+
+ lockdep_assert_once(hardirq_count() | softirq_count());
+
+ trace_netif_rx_entry(skb);
+ ret = netif_rx_internal(skb);
+ trace_netif_rx_exit(ret);
+ return ret;
+}
+EXPORT_SYMBOL(__netif_rx);
+
+/**
* netif_rx - post buffer to the network code
* @skb: buffer to post
*
* This function receives a packet from a device driver and queues it for
- * the upper (protocol) levels to process. It always succeeds. The buffer
- * may be dropped during processing for congestion control or by the
- * protocol layers.
+ * the upper (protocol) levels to process via the backlog NAPI device. It
+ * always succeeds. The buffer may be dropped during processing for
+ * congestion control or by the protocol layers.
+ * The network buffer is passed via the backlog NAPI device. Modern NIC
+ * drivers should use NAPI and GRO.
+ * This function can be used from interrupt and from process context. The
+ * caller from process context must not disable interrupts before invoking
+ * this function.
*
* return values:
* NET_RX_SUCCESS (no congestion)
* NET_RX_DROP (packet was dropped)
*
*/
-
int netif_rx(struct sk_buff *skb)
{
- trace_netif_rx_entry(skb);
+ bool need_bh_off = !(hardirq_count() | softirq_count());
+ int ret;
- return netif_rx_internal(skb);
+ if (need_bh_off)
+ local_bh_disable();
+ trace_netif_rx_entry(skb);
+ ret = netif_rx_internal(skb);
+ trace_netif_rx_exit(ret);
+ if (need_bh_off)
+ local_bh_enable();
+ return ret;
}
EXPORT_SYMBOL(netif_rx);
-int netif_rx_ni(struct sk_buff *skb)
-{
- int err;
-
- trace_netif_rx_ni_entry(skb);
-
- preempt_disable();
- err = netif_rx_internal(skb);
- if (local_softirq_pending())
- do_softirq();
- preempt_enable();
-
- return err;
-}
-EXPORT_SYMBOL(netif_rx_ni);
-
-static __latent_entropy void net_tx_action(struct softirq_action *h)
+static __latent_entropy void net_tx_action(void)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -4510,18 +5723,18 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
clist = clist->next;
WARN_ON(refcount_read(&skb->users));
- if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
- trace_consume_skb(skb);
+ if (likely(get_kfree_skb_cb(skb)->reason == SKB_CONSUMED))
+ trace_consume_skb(skb, net_tx_action);
else
- trace_kfree_skb(skb, net_tx_action);
+ trace_kfree_skb(skb, net_tx_action,
+ get_kfree_skb_cb(skb)->reason, NULL);
if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
__kfree_skb(skb);
else
- __kfree_skb_defer(skb);
+ __napi_kfree_skb(skb,
+ get_kfree_skb_cb(skb)->reason);
}
-
- __kfree_skb_flush();
}
if (sd->output_queue) {
@@ -4533,25 +5746,45 @@ static __latent_entropy void net_tx_action(struct softirq_action *h)
sd->output_queue_tailp = &sd->output_queue;
local_irq_enable();
+ rcu_read_lock();
+
while (head) {
- struct Qdisc *q = head;
spinlock_t *root_lock = NULL;
+ struct sk_buff *to_free;
+ struct Qdisc *q = head;
head = head->next_sched;
- if (!(q->flags & TCQ_F_NOLOCK)) {
- root_lock = qdisc_lock(q);
- spin_lock(root_lock);
- }
/* We need to make sure head->next_sched is read
* before clearing __QDISC_STATE_SCHED
*/
smp_mb__before_atomic();
+
+ if (!(q->flags & TCQ_F_NOLOCK)) {
+ root_lock = qdisc_lock(q);
+ spin_lock(root_lock);
+ } else if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED,
+ &q->state))) {
+ /* There is a synchronize_net() between
+ * STATE_DEACTIVATED flag being set and
+ * qdisc_reset()/some_qdisc_is_busy() in
+ * dev_deactivate(), so we can safely bail out
+ * early here to avoid data race between
+ * qdisc_deactivate() and some_qdisc_is_busy()
+ * for lockless qdisc.
+ */
+ clear_bit(__QDISC_STATE_SCHED, &q->state);
+ continue;
+ }
+
clear_bit(__QDISC_STATE_SCHED, &q->state);
- qdisc_run(q);
+ to_free = qdisc_run(q);
if (root_lock)
spin_unlock(root_lock);
+ tcf_kfree_skb_list(to_free);
}
+
+ rcu_read_unlock();
}
xfrm_dev_backlog(sd);
@@ -4564,64 +5797,6 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev,
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
-static inline struct sk_buff *
-sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
- struct net_device *orig_dev)
-{
-#ifdef CONFIG_NET_CLS_ACT
- struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
- struct tcf_result cl_res;
-
- /* If there's at least one ingress present somewhere (so
- * we get here via enabled static key), remaining devices
- * that are not configured with an ingress qdisc will bail
- * out here.
- */
- if (!miniq)
- return skb;
-
- if (*pt_prev) {
- *ret = deliver_skb(skb, *pt_prev, orig_dev);
- *pt_prev = NULL;
- }
-
- qdisc_skb_cb(skb)->pkt_len = skb->len;
- skb->tc_at_ingress = 1;
- mini_qdisc_bstats_cpu_update(miniq, skb);
-
- switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
- case TC_ACT_OK:
- case TC_ACT_RECLASSIFY:
- skb->tc_index = TC_H_MIN(cl_res.classid);
- break;
- case TC_ACT_SHOT:
- mini_qdisc_qstats_cpu_drop(miniq);
- kfree_skb(skb);
- return NULL;
- case TC_ACT_STOLEN:
- case TC_ACT_QUEUED:
- case TC_ACT_TRAP:
- consume_skb(skb);
- return NULL;
- case TC_ACT_REDIRECT:
- /* skb_mac_header check was done by cls/act_bpf, so
- * we can safely push the L2 header back before
- * redirecting to another netdev
- */
- __skb_push(skb, skb->mac_len);
- skb_do_redirect(skb);
- return NULL;
- case TC_ACT_REINSERT:
- /* this does not scrub the packet, and updates stats on error */
- skb_tc_reinsert(skb, &cl_res);
- return NULL;
- default:
- break;
- }
-#endif /* CONFIG_NET_CLS_ACT */
- return skb;
-}
-
/**
* netdev_is_rx_handler_busy - check if receive handler is registered
* @dev: device to check
@@ -4713,11 +5888,10 @@ static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
int *ret, struct net_device *orig_dev)
{
-#ifdef CONFIG_NETFILTER_INGRESS
if (nf_hook_ingress_active(skb)) {
int ingress_retval;
- if (*pt_prev) {
+ if (unlikely(*pt_prev)) {
*ret = deliver_skb(skb, *pt_prev, orig_dev);
*pt_prev = NULL;
}
@@ -4727,29 +5901,36 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
rcu_read_unlock();
return ingress_retval;
}
-#endif /* CONFIG_NETFILTER_INGRESS */
return 0;
}
-static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc,
+static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
struct packet_type **ppt_prev)
{
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO;
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
+ struct sk_buff *skb = *pskb;
struct net_device *orig_dev;
bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
- net_timestamp_check(!netdev_tstamp_prequeue, skb);
+ net_timestamp_check(!READ_ONCE(net_hotdata.tstamp_prequeue), skb);
trace_netif_receive_skb(skb);
orig_dev = skb->dev;
skb_reset_network_header(skb);
+#if !defined(CONFIG_DEBUG_NET)
+ /* We plan to no longer reset the transport header here.
+ * Give some time to fuzzers and dev build to catch bugs
+ * in network stacks.
+ */
if (!skb_transport_header_was_set(skb))
skb_reset_transport_header(skb);
+#endif
skb_reset_mac_len(skb);
pt_prev = NULL;
@@ -4759,8 +5940,21 @@ another_round:
__this_cpu_inc(softnet_data.processed);
- if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
- skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
+ if (static_branch_unlikely(&generic_xdp_needed_key)) {
+ int ret2;
+
+ migrate_disable();
+ ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog),
+ &skb);
+ migrate_enable();
+
+ if (ret2 != XDP_PASS) {
+ ret = NET_RX_DROP;
+ goto out;
+ }
+ }
+
+ if (eth_type_vlan(skb->protocol)) {
skb = skb_vlan_untag(skb);
if (unlikely(!skb))
goto out;
@@ -4772,14 +5966,15 @@ another_round:
if (pfmemalloc)
goto skip_taps;
- list_for_each_entry_rcu(ptype, &ptype_all, list) {
- if (pt_prev)
+ list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all,
+ list) {
+ if (unlikely(pt_prev))
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
- if (pt_prev)
+ if (unlikely(pt_prev))
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
@@ -4787,21 +5982,30 @@ another_round:
skip_taps:
#ifdef CONFIG_NET_INGRESS
if (static_branch_unlikely(&ingress_needed_key)) {
- skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
+ bool another = false;
+
+ nf_skip_egress(skb, true);
+ skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
+ &another);
+ if (another)
+ goto another_round;
if (!skb)
goto out;
+ nf_skip_egress(skb, false);
if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
goto out;
}
#endif
- skb_reset_tc(skb);
+ skb_reset_redirect(skb);
skip_classify:
- if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
+ if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) {
+ drop_reason = SKB_DROP_REASON_PFMEMALLOC;
goto drop;
+ }
if (skb_vlan_tag_present(skb)) {
- if (pt_prev) {
+ if (unlikely(pt_prev)) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
@@ -4813,7 +6017,7 @@ skip_classify:
rx_handler = rcu_dereference(skb->dev->rx_handler);
if (rx_handler) {
- if (pt_prev) {
+ if (unlikely(pt_prev)) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
@@ -4825,6 +6029,7 @@ skip_classify:
goto another_round;
case RX_HANDLER_EXACT:
deliver_exact = true;
+ break;
case RX_HANDLER_PASS:
break;
default:
@@ -4832,14 +6037,41 @@ skip_classify:
}
}
- if (unlikely(skb_vlan_tag_present(skb))) {
- if (skb_vlan_tag_get_id(skb))
+ if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
+check_vlan_id:
+ if (skb_vlan_tag_get_id(skb)) {
+ /* Vlan id is non 0 and vlan_do_receive() above couldn't
+ * find vlan device.
+ */
skb->pkt_type = PACKET_OTHERHOST;
+ } else if (eth_type_vlan(skb->protocol)) {
+ /* Outer header is 802.1P with vlan 0, inner header is
+ * 802.1Q or 802.1AD and vlan_do_receive() above could
+ * not find vlan dev for vlan id 0.
+ */
+ __vlan_hwaccel_clear_tag(skb);
+ skb = skb_vlan_untag(skb);
+ if (unlikely(!skb))
+ goto out;
+ if (vlan_do_receive(&skb))
+ /* After stripping off 802.1P header with vlan 0
+ * vlan dev is found for inner header.
+ */
+ goto another_round;
+ else if (unlikely(!skb))
+ goto out;
+ else
+ /* We have stripped outer 802.1P vlan 0 header.
+ * But could not find vlan dev.
+ * check again for vlan id to set OTHERHOST.
+ */
+ goto check_vlan_id;
+ }
/* Note: we might in the future use prio bits
* and set skb->priority like in vlan_do_receive()
* For the time being, just ignore Priority Code Point
*/
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
}
type = skb->protocol;
@@ -4849,6 +6081,14 @@ skip_classify:
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK]);
+
+ /* orig_dev and skb->dev could belong to different netns;
+ * Even in such case we need to traverse only the list
+ * coming from skb->dev, as the ptype owner (packet socket)
+ * will use dev_net(skb->dev) to do namespace filtering.
+ */
+ deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+ &dev_net_rcu(skb->dev)->ptype_specific);
}
deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
@@ -4860,16 +6100,15 @@ skip_classify:
}
if (pt_prev) {
- if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
- goto drop;
*ppt_prev = pt_prev;
} else {
drop:
if (!deliver_exact)
- atomic_long_inc(&skb->dev->rx_dropped);
+ dev_core_stats_rx_dropped_inc(skb->dev);
else
- atomic_long_inc(&skb->dev->rx_nohandler);
- kfree_skb(skb);
+ dev_core_stats_rx_nohandler_inc(skb->dev);
+
+ kfree_skb_reason(skb, drop_reason);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
@@ -4877,6 +6116,13 @@ drop:
}
out:
+ /* The invariant here is that if *ppt_prev is not NULL
+ * then skb should also be non-NULL.
+ *
+ * Apparently *ppt_prev assignment above holds this invariant due to
+ * skb dereferencing near it.
+ */
+ *pskb = skb;
return ret;
}
@@ -4886,9 +6132,10 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
struct packet_type *pt_prev = NULL;
int ret;
- ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+ ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
if (pt_prev)
- ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
+ skb->dev, pt_prev, orig_dev);
return ret;
}
@@ -4898,7 +6145,7 @@ static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
*
* More direct receive version of netif_receive_skb(). It should
* only be used by callers that have a need to skip RPS and Generic XDP.
- * Caller must also take care of handling if (page_is_)pfmemalloc.
+ * Caller must also take care of handling if ``(page_is_)pfmemalloc``.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
@@ -4930,10 +6177,13 @@ static inline void __netif_receive_skb_list_ptype(struct list_head *head,
if (list_empty(head))
return;
if (pt_prev->list_func != NULL)
- pt_prev->list_func(head, pt_prev, orig_dev);
+ INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
+ ip_list_rcv, head, pt_prev, orig_dev);
else
- list_for_each_entry_safe(skb, next, head, list)
+ list_for_each_entry_safe(skb, next, head, list) {
+ skb_list_del_init(skb);
pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+ }
}
static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
@@ -4951,16 +6201,15 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
struct packet_type *pt_curr = NULL;
/* Current (common) orig_dev of sublist */
struct net_device *od_curr = NULL;
- struct list_head sublist;
struct sk_buff *skb, *next;
+ LIST_HEAD(sublist);
- INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
struct net_device *orig_dev = skb->dev;
struct packet_type *pt_prev = NULL;
- list_del(&skb->list);
- __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+ skb_list_del_init(skb);
+ __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
if (!pt_prev)
continue;
if (pt_curr != pt_prev || od_curr != orig_dev) {
@@ -5049,15 +6298,11 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
static_branch_dec(&generic_xdp_needed_key);
} else if (new && !old) {
static_branch_inc(&generic_xdp_needed_key);
- dev_disable_lro(dev);
+ netif_disable_lro(dev);
dev_disable_gro_hw(dev);
}
break;
- case XDP_QUERY_PROG:
- xdp->prog_id = old ? old->aux->id : 0;
- break;
-
default:
ret = -EINVAL;
break;
@@ -5070,27 +6315,14 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
{
int ret;
- net_timestamp_check(netdev_tstamp_prequeue, skb);
+ net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue), skb);
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
- if (static_branch_unlikely(&generic_xdp_needed_key)) {
- int ret;
-
- preempt_disable();
- rcu_read_lock();
- ret = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
- rcu_read_unlock();
- preempt_enable();
-
- if (ret != XDP_PASS)
- return NET_RX_DROP;
- }
-
rcu_read_lock();
#ifdef CONFIG_RPS
- if (static_key_false(&rps_needed)) {
+ if (static_branch_unlikely(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -5106,46 +6338,30 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
return ret;
}
-static void netif_receive_skb_list_internal(struct list_head *head)
+void netif_receive_skb_list_internal(struct list_head *head)
{
- struct bpf_prog *xdp_prog = NULL;
struct sk_buff *skb, *next;
- struct list_head sublist;
+ LIST_HEAD(sublist);
- INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
- net_timestamp_check(netdev_tstamp_prequeue, skb);
- list_del(&skb->list);
+ net_timestamp_check(READ_ONCE(net_hotdata.tstamp_prequeue),
+ skb);
+ skb_list_del_init(skb);
if (!skb_defer_rx_timestamp(skb))
list_add_tail(&skb->list, &sublist);
}
list_splice_init(&sublist, head);
- if (static_branch_unlikely(&generic_xdp_needed_key)) {
- preempt_disable();
- rcu_read_lock();
- list_for_each_entry_safe(skb, next, head, list) {
- xdp_prog = rcu_dereference(skb->dev->xdp_prog);
- list_del(&skb->list);
- if (do_xdp_generic(xdp_prog, skb) == XDP_PASS)
- list_add_tail(&skb->list, &sublist);
- }
- rcu_read_unlock();
- preempt_enable();
- /* Put passed packets back on main list */
- list_splice_init(&sublist, head);
- }
-
rcu_read_lock();
#ifdef CONFIG_RPS
- if (static_key_false(&rps_needed)) {
+ if (static_branch_unlikely(&rps_needed)) {
list_for_each_entry_safe(skb, next, head, list) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu = get_rps_cpu(skb->dev, skb, &rflow);
if (cpu >= 0) {
/* Will be handled, remove from list */
- list_del(&skb->list);
+ skb_list_del_init(skb);
enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
}
}
@@ -5172,9 +6388,14 @@ static void netif_receive_skb_list_internal(struct list_head *head)
*/
int netif_receive_skb(struct sk_buff *skb)
{
+ int ret;
+
trace_netif_receive_skb_entry(skb);
- return netif_receive_skb_internal(skb);
+ ret = netif_receive_skb_internal(skb);
+ trace_netif_receive_skb_exit(ret);
+
+ return ret;
}
EXPORT_SYMBOL(netif_receive_skb);
@@ -5194,552 +6415,124 @@ void netif_receive_skb_list(struct list_head *head)
if (list_empty(head))
return;
- list_for_each_entry(skb, head, list)
- trace_netif_receive_skb_list_entry(skb);
+ if (trace_netif_receive_skb_list_entry_enabled()) {
+ list_for_each_entry(skb, head, list)
+ trace_netif_receive_skb_list_entry(skb);
+ }
netif_receive_skb_list_internal(head);
+ trace_netif_receive_skb_list_exit(0);
}
EXPORT_SYMBOL(netif_receive_skb_list);
-DEFINE_PER_CPU(struct work_struct, flush_works);
-
/* Network device is going away, flush any packets still pending */
static void flush_backlog(struct work_struct *work)
{
struct sk_buff *skb, *tmp;
+ struct sk_buff_head list;
struct softnet_data *sd;
+ __skb_queue_head_init(&list);
local_bh_disable();
sd = this_cpu_ptr(&softnet_data);
- local_irq_disable();
- rps_lock(sd);
+ backlog_lock_irq_disable(sd);
skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
- if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+ if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->input_pkt_queue);
- kfree_skb(skb);
- input_queue_head_incr(sd);
+ __skb_queue_tail(&list, skb);
+ rps_input_queue_head_incr(sd);
}
}
- rps_unlock(sd);
- local_irq_enable();
+ backlog_unlock_irq_enable(sd);
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
- if (skb->dev->reg_state == NETREG_UNREGISTERING) {
+ if (READ_ONCE(skb->dev->reg_state) == NETREG_UNREGISTERING) {
__skb_unlink(skb, &sd->process_queue);
- kfree_skb(skb);
- input_queue_head_incr(sd);
+ __skb_queue_tail(&list, skb);
+ rps_input_queue_head_incr(sd);
}
}
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
local_bh_enable();
-}
-static void flush_all_backlogs(void)
-{
- unsigned int cpu;
-
- get_online_cpus();
-
- for_each_online_cpu(cpu)
- queue_work_on(cpu, system_highpri_wq,
- per_cpu_ptr(&flush_works, cpu));
-
- for_each_online_cpu(cpu)
- flush_work(per_cpu_ptr(&flush_works, cpu));
-
- put_online_cpus();
-}
-
-static int napi_gro_complete(struct sk_buff *skb)
-{
- struct packet_offload *ptype;
- __be16 type = skb->protocol;
- struct list_head *head = &offload_base;
- int err = -ENOENT;
-
- BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
-
- if (NAPI_GRO_CB(skb)->count == 1) {
- skb_shinfo(skb)->gso_size = 0;
- goto out;
- }
-
- rcu_read_lock();
- list_for_each_entry_rcu(ptype, head, list) {
- if (ptype->type != type || !ptype->callbacks.gro_complete)
- continue;
-
- err = ptype->callbacks.gro_complete(skb, 0);
- break;
- }
- rcu_read_unlock();
-
- if (err) {
- WARN_ON(&ptype->list == head);
- kfree_skb(skb);
- return NET_RX_SUCCESS;
- }
-
-out:
- return netif_receive_skb_internal(skb);
-}
-
-static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
- bool flush_old)
-{
- struct list_head *head = &napi->gro_hash[index].list;
- struct sk_buff *skb, *p;
-
- list_for_each_entry_safe_reverse(skb, p, head, list) {
- if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
- return;
- list_del(&skb->list);
- skb->next = NULL;
- napi_gro_complete(skb);
- napi->gro_hash[index].count--;
- }
-
- if (!napi->gro_hash[index].count)
- __clear_bit(index, &napi->gro_bitmask);
-}
-
-/* napi->gro_hash[].list contains packets ordered by age.
- * youngest packets at the head of it.
- * Complete skbs in reverse order to reduce latencies.
- */
-void napi_gro_flush(struct napi_struct *napi, bool flush_old)
-{
- u32 i;
-
- for (i = 0; i < GRO_HASH_BUCKETS; i++) {
- if (test_bit(i, &napi->gro_bitmask))
- __napi_gro_flush_chain(napi, i, flush_old);
- }
-}
-EXPORT_SYMBOL(napi_gro_flush);
-
-static struct list_head *gro_list_prepare(struct napi_struct *napi,
- struct sk_buff *skb)
-{
- unsigned int maclen = skb->dev->hard_header_len;
- u32 hash = skb_get_hash_raw(skb);
- struct list_head *head;
- struct sk_buff *p;
-
- head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
- list_for_each_entry(p, head, list) {
- unsigned long diffs;
-
- NAPI_GRO_CB(p)->flush = 0;
-
- if (hash != skb_get_hash_raw(p)) {
- NAPI_GRO_CB(p)->same_flow = 0;
- continue;
- }
-
- diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
- diffs |= p->vlan_tci ^ skb->vlan_tci;
- diffs |= skb_metadata_dst_cmp(p, skb);
- diffs |= skb_metadata_differs(p, skb);
- if (maclen == ETH_HLEN)
- diffs |= compare_ether_header(skb_mac_header(p),
- skb_mac_header(skb));
- else if (!diffs)
- diffs = memcmp(skb_mac_header(p),
- skb_mac_header(skb),
- maclen);
- NAPI_GRO_CB(p)->same_flow = !diffs;
- }
-
- return head;
-}
-
-static void skb_gro_reset_offset(struct sk_buff *skb)
-{
- const struct skb_shared_info *pinfo = skb_shinfo(skb);
- const skb_frag_t *frag0 = &pinfo->frags[0];
-
- NAPI_GRO_CB(skb)->data_offset = 0;
- NAPI_GRO_CB(skb)->frag0 = NULL;
- NAPI_GRO_CB(skb)->frag0_len = 0;
-
- if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
- pinfo->nr_frags &&
- !PageHighMem(skb_frag_page(frag0))) {
- NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
- NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
- skb_frag_size(frag0),
- skb->end - skb->tail);
- }
-}
-
-static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
-{
- struct skb_shared_info *pinfo = skb_shinfo(skb);
-
- BUG_ON(skb->end - skb->tail < grow);
-
- memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
-
- skb->data_len -= grow;
- skb->tail += grow;
-
- pinfo->frags[0].page_offset += grow;
- skb_frag_size_sub(&pinfo->frags[0], grow);
-
- if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
- skb_frag_unref(skb, 0);
- memmove(pinfo->frags, pinfo->frags + 1,
- --pinfo->nr_frags * sizeof(pinfo->frags[0]));
- }
+ __skb_queue_purge_reason(&list, SKB_DROP_REASON_DEV_READY);
}
-static void gro_flush_oldest(struct list_head *head)
+static bool flush_required(int cpu)
{
- struct sk_buff *oldest;
+#if IS_ENABLED(CONFIG_RPS)
+ struct softnet_data *sd = &per_cpu(softnet_data, cpu);
+ bool do_flush;
- oldest = list_last_entry(head, struct sk_buff, list);
+ backlog_lock_irq_disable(sd);
- /* We are called with head length >= MAX_GRO_SKBS, so this is
- * impossible.
+ /* as insertion into process_queue happens with the rps lock held,
+ * process_queue access may race only with dequeue
*/
- if (WARN_ON_ONCE(!oldest))
- return;
+ do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
+ !skb_queue_empty_lockless(&sd->process_queue);
+ backlog_unlock_irq_enable(sd);
- /* Do not adjust napi->gro_hash[].count, caller is adding a new
- * SKB to the chain.
+ return do_flush;
+#endif
+ /* without RPS we can't safely check input_pkt_queue: during a
+ * concurrent remote skb_queue_splice() we can detect as empty both
+ * input_pkt_queue and process_queue even if the latter could end-up
+ * containing a lot of packets.
*/
- list_del(&oldest->list);
- napi_gro_complete(oldest);
-}
-
-static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
-{
- u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
- struct list_head *head = &offload_base;
- struct packet_offload *ptype;
- __be16 type = skb->protocol;
- struct list_head *gro_head;
- struct sk_buff *pp = NULL;
- enum gro_result ret;
- int same_flow;
- int grow;
-
- if (netif_elide_gro(skb->dev))
- goto normal;
-
- gro_head = gro_list_prepare(napi, skb);
-
- rcu_read_lock();
- list_for_each_entry_rcu(ptype, head, list) {
- if (ptype->type != type || !ptype->callbacks.gro_receive)
- continue;
-
- skb_set_network_header(skb, skb_gro_offset(skb));
- skb_reset_mac_len(skb);
- NAPI_GRO_CB(skb)->same_flow = 0;
- NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
- NAPI_GRO_CB(skb)->free = 0;
- NAPI_GRO_CB(skb)->encap_mark = 0;
- NAPI_GRO_CB(skb)->recursion_counter = 0;
- NAPI_GRO_CB(skb)->is_fou = 0;
- NAPI_GRO_CB(skb)->is_atomic = 1;
- NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
-
- /* Setup for GRO checksum validation */
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- NAPI_GRO_CB(skb)->csum = skb->csum;
- NAPI_GRO_CB(skb)->csum_valid = 1;
- NAPI_GRO_CB(skb)->csum_cnt = 0;
- break;
- case CHECKSUM_UNNECESSARY:
- NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
- NAPI_GRO_CB(skb)->csum_valid = 0;
- break;
- default:
- NAPI_GRO_CB(skb)->csum_cnt = 0;
- NAPI_GRO_CB(skb)->csum_valid = 0;
- }
-
- pp = ptype->callbacks.gro_receive(gro_head, skb);
- break;
- }
- rcu_read_unlock();
-
- if (&ptype->list == head)
- goto normal;
-
- if (IS_ERR(pp) && PTR_ERR(pp) == -EINPROGRESS) {
- ret = GRO_CONSUMED;
- goto ok;
- }
-
- same_flow = NAPI_GRO_CB(skb)->same_flow;
- ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
-
- if (pp) {
- list_del(&pp->list);
- pp->next = NULL;
- napi_gro_complete(pp);
- napi->gro_hash[hash].count--;
- }
-
- if (same_flow)
- goto ok;
-
- if (NAPI_GRO_CB(skb)->flush)
- goto normal;
-
- if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
- gro_flush_oldest(gro_head);
- } else {
- napi->gro_hash[hash].count++;
- }
- NAPI_GRO_CB(skb)->count = 1;
- NAPI_GRO_CB(skb)->age = jiffies;
- NAPI_GRO_CB(skb)->last = skb;
- skb_shinfo(skb)->gso_size = skb_gro_len(skb);
- list_add(&skb->list, gro_head);
- ret = GRO_HELD;
-
-pull:
- grow = skb_gro_offset(skb) - skb_headlen(skb);
- if (grow > 0)
- gro_pull_from_frag0(skb, grow);
-ok:
- if (napi->gro_hash[hash].count) {
- if (!test_bit(hash, &napi->gro_bitmask))
- __set_bit(hash, &napi->gro_bitmask);
- } else if (test_bit(hash, &napi->gro_bitmask)) {
- __clear_bit(hash, &napi->gro_bitmask);
- }
-
- return ret;
-
-normal:
- ret = GRO_NORMAL;
- goto pull;
-}
-
-struct packet_offload *gro_find_receive_by_type(__be16 type)
-{
- struct list_head *offload_head = &offload_base;
- struct packet_offload *ptype;
-
- list_for_each_entry_rcu(ptype, offload_head, list) {
- if (ptype->type != type || !ptype->callbacks.gro_receive)
- continue;
- return ptype;
- }
- return NULL;
-}
-EXPORT_SYMBOL(gro_find_receive_by_type);
-
-struct packet_offload *gro_find_complete_by_type(__be16 type)
-{
- struct list_head *offload_head = &offload_base;
- struct packet_offload *ptype;
-
- list_for_each_entry_rcu(ptype, offload_head, list) {
- if (ptype->type != type || !ptype->callbacks.gro_complete)
- continue;
- return ptype;
- }
- return NULL;
-}
-EXPORT_SYMBOL(gro_find_complete_by_type);
-
-static void napi_skb_free_stolen_head(struct sk_buff *skb)
-{
- skb_dst_drop(skb);
- secpath_reset(skb);
- kmem_cache_free(skbuff_head_cache, skb);
-}
-
-static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
-{
- switch (ret) {
- case GRO_NORMAL:
- if (netif_receive_skb_internal(skb))
- ret = GRO_DROP;
- break;
-
- case GRO_DROP:
- kfree_skb(skb);
- break;
-
- case GRO_MERGED_FREE:
- if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
- napi_skb_free_stolen_head(skb);
- else
- __kfree_skb(skb);
- break;
-
- case GRO_HELD:
- case GRO_MERGED:
- case GRO_CONSUMED:
- break;
- }
-
- return ret;
+ return true;
}
-gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
-{
- skb_mark_napi_id(skb, napi);
- trace_napi_gro_receive_entry(skb);
-
- skb_gro_reset_offset(skb);
-
- return napi_skb_finish(dev_gro_receive(napi, skb), skb);
-}
-EXPORT_SYMBOL(napi_gro_receive);
+struct flush_backlogs {
+ cpumask_t flush_cpus;
+ struct work_struct w[];
+};
-static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
+static struct flush_backlogs *flush_backlogs_alloc(void)
{
- if (unlikely(skb->pfmemalloc)) {
- consume_skb(skb);
- return;
- }
- __skb_pull(skb, skb_headlen(skb));
- /* restore the reserve we had after netdev_alloc_skb_ip_align() */
- skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
- skb->vlan_tci = 0;
- skb->dev = napi->dev;
- skb->skb_iif = 0;
- skb->encapsulation = 0;
- skb_shinfo(skb)->gso_type = 0;
- skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
- secpath_reset(skb);
-
- napi->skb = skb;
+ return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids),
+ GFP_KERNEL);
}
-struct sk_buff *napi_get_frags(struct napi_struct *napi)
-{
- struct sk_buff *skb = napi->skb;
+static struct flush_backlogs *flush_backlogs_fallback;
+static DEFINE_MUTEX(flush_backlogs_mutex);
- if (!skb) {
- skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
- if (skb) {
- napi->skb = skb;
- skb_mark_napi_id(skb, napi);
- }
- }
- return skb;
-}
-EXPORT_SYMBOL(napi_get_frags);
-
-static gro_result_t napi_frags_finish(struct napi_struct *napi,
- struct sk_buff *skb,
- gro_result_t ret)
+static void flush_all_backlogs(void)
{
- switch (ret) {
- case GRO_NORMAL:
- case GRO_HELD:
- __skb_push(skb, ETH_HLEN);
- skb->protocol = eth_type_trans(skb, skb->dev);
- if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
- ret = GRO_DROP;
- break;
-
- case GRO_DROP:
- napi_reuse_skb(napi, skb);
- break;
-
- case GRO_MERGED_FREE:
- if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
- napi_skb_free_stolen_head(skb);
- else
- napi_reuse_skb(napi, skb);
- break;
+ struct flush_backlogs *ptr = flush_backlogs_alloc();
+ unsigned int cpu;
- case GRO_MERGED:
- case GRO_CONSUMED:
- break;
+ if (!ptr) {
+ mutex_lock(&flush_backlogs_mutex);
+ ptr = flush_backlogs_fallback;
}
+ cpumask_clear(&ptr->flush_cpus);
- return ret;
-}
-
-/* Upper GRO stack assumes network header starts at gro_offset=0
- * Drivers could call both napi_gro_frags() and napi_gro_receive()
- * We copy ethernet header into skb->data to have a common layout.
- */
-static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
-{
- struct sk_buff *skb = napi->skb;
- const struct ethhdr *eth;
- unsigned int hlen = sizeof(*eth);
-
- napi->skb = NULL;
+ cpus_read_lock();
- skb_reset_mac_header(skb);
- skb_gro_reset_offset(skb);
-
- eth = skb_gro_header_fast(skb, 0);
- if (unlikely(skb_gro_header_hard(skb, hlen))) {
- eth = skb_gro_header_slow(skb, hlen, 0);
- if (unlikely(!eth)) {
- net_warn_ratelimited("%s: dropping impossible skb from %s\n",
- __func__, napi->dev->name);
- napi_reuse_skb(napi, skb);
- return NULL;
+ for_each_online_cpu(cpu) {
+ if (flush_required(cpu)) {
+ INIT_WORK(&ptr->w[cpu], flush_backlog);
+ queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]);
+ __cpumask_set_cpu(cpu, &ptr->flush_cpus);
}
- } else {
- gro_pull_from_frag0(skb, hlen);
- NAPI_GRO_CB(skb)->frag0 += hlen;
- NAPI_GRO_CB(skb)->frag0_len -= hlen;
}
- __skb_pull(skb, hlen);
- /*
- * This works because the only protocols we care about don't require
- * special handling.
- * We'll fix it up properly in napi_frags_finish()
+ /* we can have in flight packet[s] on the cpus we are not flushing,
+ * synchronize_net() in unregister_netdevice_many() will take care of
+ * them.
*/
- skb->protocol = eth->h_proto;
-
- return skb;
-}
-
-gro_result_t napi_gro_frags(struct napi_struct *napi)
-{
- struct sk_buff *skb = napi_frags_skb(napi);
-
- if (!skb)
- return GRO_DROP;
-
- trace_napi_gro_frags_entry(skb);
-
- return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
-}
-EXPORT_SYMBOL(napi_gro_frags);
-
-/* Compute the checksum from gro_offset and return the folded value
- * after adding in any pseudo checksum.
- */
-__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
-{
- __wsum wsum;
- __sum16 sum;
-
- wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
-
- /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
- sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
- if (likely(!sum)) {
- if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
- !skb->csum_complete_sw)
- netdev_rx_csum_fault(skb->dev);
- }
+ for_each_cpu(cpu, &ptr->flush_cpus)
+ flush_work(&ptr->w[cpu]);
- NAPI_GRO_CB(skb)->csum = wsum;
- NAPI_GRO_CB(skb)->csum_valid = 1;
+ cpus_read_unlock();
- return sum;
+ if (ptr != flush_backlogs_fallback)
+ kfree(ptr);
+ else
+ mutex_unlock(&flush_backlogs_mutex);
}
-EXPORT_SYMBOL(__skb_gro_checksum_complete);
static void net_rps_send_ipi(struct softnet_data *remsd)
{
@@ -5763,7 +6556,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
#ifdef CONFIG_RPS
struct softnet_data *remsd = sd->rps_ipi_list;
- if (remsd) {
+ if (!use_backlog_threads() && remsd) {
sd->rps_ipi_list = NULL;
local_irq_enable();
@@ -5778,7 +6571,7 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd)
static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
- return sd->rps_ipi_list != NULL;
+ return !use_backlog_threads() && sd->rps_ipi_list;
#else
return false;
#endif
@@ -5798,22 +6591,26 @@ static int process_backlog(struct napi_struct *napi, int quota)
net_rps_action_and_irq_enable(sd);
}
- napi->weight = dev_rx_weight;
+ napi->weight = READ_ONCE(net_hotdata.dev_rx_weight);
while (again) {
struct sk_buff *skb;
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
while ((skb = __skb_dequeue(&sd->process_queue))) {
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
rcu_read_lock();
__netif_receive_skb(skb);
rcu_read_unlock();
- input_queue_head_incr(sd);
- if (++work >= quota)
+ if (++work >= quota) {
+ rps_input_queue_head_add(sd, work);
return work;
+ }
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
}
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
- local_irq_disable();
- rps_lock(sd);
+ backlog_lock_irq_disable(sd);
if (skb_queue_empty(&sd->input_pkt_queue)) {
/*
* Inline a custom version of __napi_complete().
@@ -5823,16 +6620,19 @@ static int process_backlog(struct napi_struct *napi, int quota)
* We can use a plain write instead of clear_bit(),
* and we dont need an smp_mb() memory barrier.
*/
- napi->state = 0;
+ napi->state &= NAPIF_STATE_THREADED;
again = false;
} else {
+ local_lock_nested_bh(&softnet_data.process_queue_bh_lock);
skb_queue_splice_tail_init(&sd->input_pkt_queue,
&sd->process_queue);
+ local_unlock_nested_bh(&softnet_data.process_queue_bh_lock);
}
- rps_unlock(sd);
- local_irq_enable();
+ backlog_unlock_irq_enable(sd);
}
+ if (work)
+ rps_input_queue_head_add(sd, work);
return work;
}
@@ -5858,16 +6658,15 @@ EXPORT_SYMBOL(__napi_schedule);
* @n: napi context
*
* Test if NAPI routine is already running, and if not mark
- * it as running. This is used as a condition variable
+ * it as running. This is used as a condition variable to
* insure only one NAPI poll instance runs. We also make
* sure there is no pending NAPI disable.
*/
bool napi_schedule_prep(struct napi_struct *n)
{
- unsigned long val, new;
+ unsigned long new, val = READ_ONCE(n->state);
do {
- val = READ_ONCE(n->state);
if (unlikely(val & NAPIF_STATE_DISABLE))
return false;
new = val | NAPIF_STATE_SCHED;
@@ -5880,7 +6679,7 @@ bool napi_schedule_prep(struct napi_struct *n)
*/
new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
NAPIF_STATE_MISSED;
- } while (cmpxchg(&n->state, val, new) != val);
+ } while (!try_cmpxchg(&n->state, &val, new));
return !(val & NAPIF_STATE_SCHED);
}
@@ -5890,17 +6689,25 @@ EXPORT_SYMBOL(napi_schedule_prep);
* __napi_schedule_irqoff - schedule for receive
* @n: entry to schedule
*
- * Variant of __napi_schedule() assuming hard irqs are masked
+ * Variant of __napi_schedule() assuming hard irqs are masked.
+ *
+ * On PREEMPT_RT enabled kernels this maps to __napi_schedule()
+ * because the interrupt disabled assumption might not be true
+ * due to force-threaded interrupts and spinlock substitution.
*/
void __napi_schedule_irqoff(struct napi_struct *n)
{
- ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ ____napi_schedule(this_cpu_ptr(&softnet_data), n);
+ else
+ __napi_schedule(n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);
bool napi_complete_done(struct napi_struct *n, int work_done)
{
- unsigned long flags, val, new;
+ unsigned long flags, val, new, timeout = 0;
+ bool ret = true;
/*
* 1) Don't let napi dequeue from the cpu poll list
@@ -5912,31 +6719,40 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
NAPIF_STATE_IN_BUSY_POLL)))
return false;
- if (n->gro_bitmask) {
- unsigned long timeout = 0;
-
- if (work_done)
- timeout = n->dev->gro_flush_timeout;
-
+ if (work_done) {
+ if (n->gro.bitmask)
+ timeout = napi_get_gro_flush_timeout(n);
+ n->defer_hard_irqs_count = napi_get_defer_hard_irqs(n);
+ }
+ if (n->defer_hard_irqs_count > 0) {
+ n->defer_hard_irqs_count--;
+ timeout = napi_get_gro_flush_timeout(n);
if (timeout)
- hrtimer_start(&n->timer, ns_to_ktime(timeout),
- HRTIMER_MODE_REL_PINNED);
- else
- napi_gro_flush(n, false);
+ ret = false;
}
+
+ /*
+ * When the NAPI instance uses a timeout and keeps postponing
+ * it, we need to bound somehow the time packets are kept in
+ * the GRO layer.
+ */
+ gro_flush_normal(&n->gro, !!timeout);
+
if (unlikely(!list_empty(&n->poll_list))) {
/* If n->poll_list is not empty, we need to mask irqs */
local_irq_save(flags);
list_del_init(&n->poll_list);
local_irq_restore(flags);
}
+ WRITE_ONCE(n->list_owner, -1);
+ val = READ_ONCE(n->state);
do {
- val = READ_ONCE(n->state);
-
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
- new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
+ NAPIF_STATE_SCHED_THREADED |
+ NAPIF_STATE_PREFER_BUSY_POLL);
/* If STATE_MISSED was set, leave STATE_SCHED set,
* because we will call napi->poll() one more time.
@@ -5944,36 +6760,69 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
*/
new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
NAPIF_STATE_SCHED;
- } while (cmpxchg(&n->state, val, new) != val);
+ } while (!try_cmpxchg(&n->state, &val, new));
if (unlikely(val & NAPIF_STATE_MISSED)) {
__napi_schedule(n);
return false;
}
- return true;
+ if (timeout)
+ hrtimer_start(&n->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
+ return ret;
}
EXPORT_SYMBOL(napi_complete_done);
-/* must be called under rcu_read_lock(), as we dont take a reference */
-static struct napi_struct *napi_by_id(unsigned int napi_id)
+static void skb_defer_free_flush(void)
{
- unsigned int hash = napi_id % HASH_SIZE(napi_hash);
- struct napi_struct *napi;
+ struct llist_node *free_list;
+ struct sk_buff *skb, *next;
+ struct skb_defer_node *sdn;
+ int node;
- hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
- if (napi->napi_id == napi_id)
- return napi;
+ for_each_node(node) {
+ sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node;
- return NULL;
+ if (llist_empty(&sdn->defer_list))
+ continue;
+ atomic_long_set(&sdn->defer_count, 0);
+ free_list = llist_del_all(&sdn->defer_list);
+
+ llist_for_each_entry_safe(skb, next, free_list, ll_node) {
+ prefetch(next);
+ napi_consume_skb(skb, 1);
+ }
+ }
}
#if defined(CONFIG_NET_RX_BUSY_POLL)
-#define BUSY_POLL_BUDGET 8
+static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
+{
+ if (!skip_schedule) {
+ gro_normal_list(&napi->gro);
+ __napi_schedule(napi);
+ return;
+ }
+
+ /* Flush too old packets. If HZ < 1000, flush all packets */
+ gro_flush_normal(&napi->gro, HZ >= 1000);
-static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+ clear_bit(NAPI_STATE_SCHED, &napi->state);
+}
+
+enum {
+ NAPI_F_PREFER_BUSY_POLL = 1,
+ NAPI_F_END_ON_RESCHED = 2,
+};
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock,
+ unsigned flags, u16 budget)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ bool skip_schedule = false;
+ unsigned long timeout;
int rc;
/* Busy polling means there is a high chance device driver hard irq
@@ -5989,41 +6838,59 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
+ if (flags & NAPI_F_PREFER_BUSY_POLL) {
+ napi->defer_hard_irqs_count = napi_get_defer_hard_irqs(napi);
+ timeout = napi_get_gro_flush_timeout(napi);
+ if (napi->defer_hard_irqs_count && timeout) {
+ hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
+ skip_schedule = true;
+ }
+ }
/* All we really want here is to re-enable device interrupts.
* Ideally, a new ndo_busy_poll_stop() could avoid another round.
*/
- rc = napi->poll(napi, BUSY_POLL_BUDGET);
- trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
+ rc = napi->poll(napi, budget);
+ /* We can't gro_normal_list() here, because napi->poll() might have
+ * rearmed the napi (napi_complete_done()) in which case it could
+ * already be running on another CPU.
+ */
+ trace_napi_poll(napi, rc, budget);
netpoll_poll_unlock(have_poll_lock);
- if (rc == BUSY_POLL_BUDGET)
- __napi_schedule(napi);
+ if (rc == budget)
+ __busy_poll_stop(napi, skip_schedule);
+ bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
}
-void napi_busy_loop(unsigned int napi_id,
- bool (*loop_end)(void *, unsigned long),
- void *loop_end_arg)
+static void __napi_busy_loop(unsigned int napi_id,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg, unsigned flags, u16 budget)
{
unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
int (*napi_poll)(struct napi_struct *napi, int budget);
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
void *have_poll_lock = NULL;
struct napi_struct *napi;
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
restart:
napi_poll = NULL;
- rcu_read_lock();
-
napi = napi_by_id(napi_id);
if (!napi)
- goto out;
+ return;
- preempt_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_disable();
for (;;) {
int work = 0;
local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
if (!napi_poll) {
unsigned long val = READ_ONCE(napi->state);
@@ -6031,32 +6898,45 @@ restart:
* we avoid dirtying napi->state as much as we can.
*/
if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
- NAPIF_STATE_IN_BUSY_POLL))
+ NAPIF_STATE_IN_BUSY_POLL)) {
+ if (flags & NAPI_F_PREFER_BUSY_POLL)
+ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
+ }
if (cmpxchg(&napi->state, val,
val | NAPIF_STATE_IN_BUSY_POLL |
- NAPIF_STATE_SCHED) != val)
+ NAPIF_STATE_SCHED) != val) {
+ if (flags & NAPI_F_PREFER_BUSY_POLL)
+ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
+ }
have_poll_lock = netpoll_poll_lock(napi);
napi_poll = napi->poll;
}
- work = napi_poll(napi, BUSY_POLL_BUDGET);
- trace_napi_poll(napi, work, BUSY_POLL_BUDGET);
+ work = napi_poll(napi, budget);
+ trace_napi_poll(napi, work, budget);
+ gro_normal_list(&napi->gro);
count:
if (work > 0)
__NET_ADD_STATS(dev_net(napi->dev),
LINUX_MIB_BUSYPOLLRXPACKETS, work);
+ skb_defer_free_flush();
+ bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
if (!loop_end || loop_end(loop_end_arg, start_time))
break;
if (unlikely(need_resched())) {
+ if (flags & NAPI_F_END_ON_RESCHED)
+ break;
if (napi_poll)
- busy_poll_stop(napi, have_poll_lock);
- preempt_enable();
+ busy_poll_stop(napi, have_poll_lock, flags, budget);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
rcu_read_unlock();
cond_resched();
+ rcu_read_lock();
if (loop_end(loop_end_arg, start_time))
return;
goto restart;
@@ -6064,53 +6944,128 @@ count:
cpu_relax();
}
if (napi_poll)
- busy_poll_stop(napi, have_poll_lock);
- preempt_enable();
-out:
+ busy_poll_stop(napi, have_poll_lock, flags, budget);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ preempt_enable();
+}
+
+void napi_busy_loop_rcu(unsigned int napi_id,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg, bool prefer_busy_poll, u16 budget)
+{
+ unsigned flags = NAPI_F_END_ON_RESCHED;
+
+ if (prefer_busy_poll)
+ flags |= NAPI_F_PREFER_BUSY_POLL;
+
+ __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
+}
+
+void napi_busy_loop(unsigned int napi_id,
+ bool (*loop_end)(void *, unsigned long),
+ void *loop_end_arg, bool prefer_busy_poll, u16 budget)
+{
+ unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0;
+
+ rcu_read_lock();
+ __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget);
rcu_read_unlock();
}
EXPORT_SYMBOL(napi_busy_loop);
+void napi_suspend_irqs(unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ rcu_read_lock();
+ napi = napi_by_id(napi_id);
+ if (napi) {
+ unsigned long timeout = napi_get_irq_suspend_timeout(napi);
+
+ if (timeout)
+ hrtimer_start(&napi->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
+ }
+ rcu_read_unlock();
+}
+
+void napi_resume_irqs(unsigned int napi_id)
+{
+ struct napi_struct *napi;
+
+ rcu_read_lock();
+ napi = napi_by_id(napi_id);
+ if (napi) {
+ /* If irq_suspend_timeout is set to 0 between the call to
+ * napi_suspend_irqs and now, the original value still
+ * determines the safety timeout as intended and napi_watchdog
+ * will resume irq processing.
+ */
+ if (napi_get_irq_suspend_timeout(napi)) {
+ local_bh_disable();
+ napi_schedule(napi);
+ local_bh_enable();
+ }
+ }
+ rcu_read_unlock();
+}
+
#endif /* CONFIG_NET_RX_BUSY_POLL */
+static void __napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ napi->gro.cached_napi_id = napi_id;
+
+ WRITE_ONCE(napi->napi_id, napi_id);
+ hlist_add_head_rcu(&napi->napi_hash_node,
+ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+}
+
+static void napi_hash_add_with_id(struct napi_struct *napi,
+ unsigned int napi_id)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&napi_hash_lock, flags);
+ WARN_ON_ONCE(napi_by_id(napi_id));
+ __napi_hash_add_with_id(napi, napi_id);
+ spin_unlock_irqrestore(&napi_hash_lock, flags);
+}
+
static void napi_hash_add(struct napi_struct *napi)
{
- if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
- test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
+ unsigned long flags;
+
+ if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
return;
- spin_lock(&napi_hash_lock);
+ spin_lock_irqsave(&napi_hash_lock, flags);
/* 0..NR_CPUS range is reserved for sender_cpu use */
do {
- if (unlikely(++napi_gen_id < MIN_NAPI_ID))
+ if (unlikely(!napi_id_valid(++napi_gen_id)))
napi_gen_id = MIN_NAPI_ID;
} while (napi_by_id(napi_gen_id));
- napi->napi_id = napi_gen_id;
- hlist_add_head_rcu(&napi->napi_hash_node,
- &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+ __napi_hash_add_with_id(napi, napi_gen_id);
- spin_unlock(&napi_hash_lock);
+ spin_unlock_irqrestore(&napi_hash_lock, flags);
}
/* Warning : caller is responsible to make sure rcu grace period
* is respected before freeing memory containing @napi
*/
-bool napi_hash_del(struct napi_struct *napi)
+static void napi_hash_del(struct napi_struct *napi)
{
- bool rcu_sync_needed = false;
+ unsigned long flags;
- spin_lock(&napi_hash_lock);
+ spin_lock_irqsave(&napi_hash_lock, flags);
- if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
- rcu_sync_needed = true;
- hlist_del_rcu(&napi->napi_hash_node);
- }
- spin_unlock(&napi_hash_lock);
- return rcu_sync_needed;
+ hlist_del_init_rcu(&napi->napi_hash_node);
+
+ spin_unlock_irqrestore(&napi_hash_lock, flags);
}
-EXPORT_SYMBOL_GPL(napi_hash_del);
static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
@@ -6121,99 +7076,583 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
/* Note : we use a relaxed variant of napi_schedule_prep() not setting
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
- if (napi->gro_bitmask && !napi_disable_pending(napi) &&
- !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+ if (!napi_disable_pending(napi) &&
+ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
+ clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
__napi_schedule_irqoff(napi);
+ }
return HRTIMER_NORESTART;
}
-static void init_gro_hash(struct napi_struct *napi)
+static void napi_stop_kthread(struct napi_struct *napi)
{
- int i;
+ unsigned long val, new;
+
+ /* Wait until the napi STATE_THREADED is unset. */
+ while (true) {
+ val = READ_ONCE(napi->state);
+
+ /* If the napi kthread owns this napi or the napi is idle,
+ * STATE_THREADED can be unset here.
+ */
+ if ((val & NAPIF_STATE_SCHED_THREADED) ||
+ !(val & NAPIF_STATE_SCHED)) {
+ new = val & (~(NAPIF_STATE_THREADED |
+ NAPIF_STATE_THREADED_BUSY_POLL));
+ } else {
+ msleep(20);
+ continue;
+ }
+
+ if (try_cmpxchg(&napi->state, &val, new))
+ break;
+ }
+
+ /* Once STATE_THREADED is unset, wait for SCHED_THREADED to be unset by
+ * the kthread.
+ */
+ while (true) {
+ if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state))
+ break;
+
+ msleep(20);
+ }
+
+ kthread_stop(napi->thread);
+ napi->thread = NULL;
+}
+
+static void napi_set_threaded_state(struct napi_struct *napi,
+ enum netdev_napi_threaded threaded_mode)
+{
+ bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED;
+ bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL;
+
+ assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+ assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll);
+}
+
+int napi_set_threaded(struct napi_struct *napi,
+ enum netdev_napi_threaded threaded)
+{
+ if (threaded) {
+ if (!napi->thread) {
+ int err = napi_kthread_create(napi);
+
+ if (err)
+ return err;
+ }
+ }
+
+ if (napi->config)
+ napi->config->threaded = threaded;
+
+ /* Setting/unsetting threaded mode on a napi might not immediately
+ * take effect, if the current napi instance is actively being
+ * polled. In this case, the switch between threaded mode and
+ * softirq mode will happen in the next round of napi_schedule().
+ * This should not cause hiccups/stalls to the live traffic.
+ */
+ if (!threaded && napi->thread) {
+ napi_stop_kthread(napi);
+ } else {
+ /* Make sure kthread is created before THREADED bit is set. */
+ smp_mb__before_atomic();
+ napi_set_threaded_state(napi, threaded);
+ }
+
+ return 0;
+}
+
+int netif_set_threaded(struct net_device *dev,
+ enum netdev_napi_threaded threaded)
+{
+ struct napi_struct *napi;
+ int i, err = 0;
+
+ netdev_assert_locked_or_invisible(dev);
+
+ if (threaded) {
+ list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ if (!napi->thread) {
+ err = napi_kthread_create(napi);
+ if (err) {
+ threaded = NETDEV_NAPI_THREADED_DISABLED;
+ break;
+ }
+ }
+ }
+ }
+
+ WRITE_ONCE(dev->threaded, threaded);
+
+ /* The error should not occur as the kthreads are already created. */
+ list_for_each_entry(napi, &dev->napi_list, dev_list)
+ WARN_ON_ONCE(napi_set_threaded(napi, threaded));
+
+ /* Override the config for all NAPIs even if currently not listed */
+ for (i = 0; i < dev->num_napi_configs; i++)
+ dev->napi_config[i].threaded = threaded;
+
+ return err;
+}
+
+/**
+ * netif_threaded_enable() - enable threaded NAPIs
+ * @dev: net_device instance
+ *
+ * Enable threaded mode for the NAPI instances of the device. This may be useful
+ * for devices where multiple NAPI instances get scheduled by a single
+ * interrupt. Threaded NAPI allows moving the NAPI processing to cores other
+ * than the core where IRQ is mapped.
+ *
+ * This function should be called before @dev is registered.
+ */
+void netif_threaded_enable(struct net_device *dev)
+{
+ WARN_ON_ONCE(netif_set_threaded(dev, NETDEV_NAPI_THREADED_ENABLED));
+}
+EXPORT_SYMBOL(netif_threaded_enable);
+
+/**
+ * netif_queue_set_napi - Associate queue with the napi
+ * @dev: device to which NAPI and queue belong
+ * @queue_index: Index of queue
+ * @type: queue type as RX or TX
+ * @napi: NAPI context, pass NULL to clear previously set NAPI
+ *
+ * Set queue with its corresponding napi context. This should be done after
+ * registering the NAPI handler for the queue-vector and the queues have been
+ * mapped to the corresponding interrupt vector.
+ */
+void netif_queue_set_napi(struct net_device *dev, unsigned int queue_index,
+ enum netdev_queue_type type, struct napi_struct *napi)
+{
+ struct netdev_rx_queue *rxq;
+ struct netdev_queue *txq;
+
+ if (WARN_ON_ONCE(napi && !napi->dev))
+ return;
+ netdev_ops_assert_locked_or_invisible(dev);
+
+ switch (type) {
+ case NETDEV_QUEUE_TYPE_RX:
+ rxq = __netif_get_rx_queue(dev, queue_index);
+ rxq->napi = napi;
+ return;
+ case NETDEV_QUEUE_TYPE_TX:
+ txq = netdev_get_tx_queue(dev, queue_index);
+ txq->napi = napi;
+ return;
+ default:
+ return;
+ }
+}
+EXPORT_SYMBOL(netif_queue_set_napi);
+
+static void
+netif_napi_irq_notify(struct irq_affinity_notify *notify,
+ const cpumask_t *mask)
+{
+ struct napi_struct *napi =
+ container_of(notify, struct napi_struct, notify);
+#ifdef CONFIG_RFS_ACCEL
+ struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;
+ int err;
+#endif
+
+ if (napi->config && napi->dev->irq_affinity_auto)
+ cpumask_copy(&napi->config->affinity_mask, mask);
+
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask);
+ if (err)
+ netdev_warn(napi->dev, "RMAP update failed (%d)\n",
+ err);
+ }
+#endif
+}
+
+#ifdef CONFIG_RFS_ACCEL
+static void netif_napi_affinity_release(struct kref *ref)
+{
+ struct napi_struct *napi =
+ container_of(ref, struct napi_struct, notify.kref);
+ struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap;
+
+ netdev_assert_locked(napi->dev);
+ WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER,
+ &napi->state));
+
+ if (!napi->dev->rx_cpu_rmap_auto)
+ return;
+ rmap->obj[napi->napi_rmap_idx] = NULL;
+ napi->napi_rmap_idx = -1;
+ cpu_rmap_put(rmap);
+}
+
+int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
+{
+ if (dev->rx_cpu_rmap_auto)
+ return 0;
+
+ dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs);
+ if (!dev->rx_cpu_rmap)
+ return -ENOMEM;
+
+ dev->rx_cpu_rmap_auto = true;
+ return 0;
+}
+EXPORT_SYMBOL(netif_enable_cpu_rmap);
+
+static void netif_del_cpu_rmap(struct net_device *dev)
+{
+ struct cpu_rmap *rmap = dev->rx_cpu_rmap;
+
+ if (!dev->rx_cpu_rmap_auto)
+ return;
+
+ /* Free the rmap */
+ cpu_rmap_put(rmap);
+ dev->rx_cpu_rmap = NULL;
+ dev->rx_cpu_rmap_auto = false;
+}
+
+#else
+static void netif_napi_affinity_release(struct kref *ref)
+{
+}
+
+int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs)
+{
+ return 0;
+}
+EXPORT_SYMBOL(netif_enable_cpu_rmap);
+
+static void netif_del_cpu_rmap(struct net_device *dev)
+{
+}
+#endif
+
+void netif_set_affinity_auto(struct net_device *dev)
+{
+ unsigned int i, maxqs, numa;
+
+ maxqs = max(dev->num_tx_queues, dev->num_rx_queues);
+ numa = dev_to_node(&dev->dev);
+
+ for (i = 0; i < maxqs; i++)
+ cpumask_set_cpu(cpumask_local_spread(i, numa),
+ &dev->napi_config[i].affinity_mask);
+
+ dev->irq_affinity_auto = true;
+}
+EXPORT_SYMBOL(netif_set_affinity_auto);
+
+void netif_napi_set_irq_locked(struct napi_struct *napi, int irq)
+{
+ int rc;
- for (i = 0; i < GRO_HASH_BUCKETS; i++) {
- INIT_LIST_HEAD(&napi->gro_hash[i].list);
- napi->gro_hash[i].count = 0;
+ netdev_assert_locked_or_invisible(napi->dev);
+
+ if (napi->irq == irq)
+ return;
+
+ /* Remove existing resources */
+ if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
+ irq_set_affinity_notifier(napi->irq, NULL);
+
+ napi->irq = irq;
+ if (irq < 0 ||
+ (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto))
+ return;
+
+ /* Abort for buggy drivers */
+ if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config))
+ return;
+
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi);
+ if (rc < 0)
+ return;
+
+ cpu_rmap_get(napi->dev->rx_cpu_rmap);
+ napi->napi_rmap_idx = rc;
+ }
+#endif
+
+ /* Use core IRQ notifier */
+ napi->notify.notify = netif_napi_irq_notify;
+ napi->notify.release = netif_napi_affinity_release;
+ rc = irq_set_affinity_notifier(irq, &napi->notify);
+ if (rc) {
+ netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n",
+ rc);
+ goto put_rmap;
}
- napi->gro_bitmask = 0;
+
+ set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state);
+ return;
+
+put_rmap:
+#ifdef CONFIG_RFS_ACCEL
+ if (napi->dev->rx_cpu_rmap_auto) {
+ napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL;
+ cpu_rmap_put(napi->dev->rx_cpu_rmap);
+ napi->napi_rmap_idx = -1;
+ }
+#endif
+ napi->notify.notify = NULL;
+ napi->notify.release = NULL;
}
+EXPORT_SYMBOL(netif_napi_set_irq_locked);
-void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
- int (*poll)(struct napi_struct *, int), int weight)
+static void napi_restore_config(struct napi_struct *n)
{
+ n->defer_hard_irqs = n->config->defer_hard_irqs;
+ n->gro_flush_timeout = n->config->gro_flush_timeout;
+ n->irq_suspend_timeout = n->config->irq_suspend_timeout;
+
+ if (n->dev->irq_affinity_auto &&
+ test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state))
+ irq_set_affinity(n->irq, &n->config->affinity_mask);
+
+ /* A NAPI ID might be stored in the config; if so, use it. If not, use
+ * napi_hash_add() to generate one for us.
+ */
+ if (n->config->napi_id) {
+ napi_hash_add_with_id(n, n->config->napi_id);
+ } else {
+ napi_hash_add(n);
+ n->config->napi_id = n->napi_id;
+ }
+
+ WARN_ON_ONCE(napi_set_threaded(n, n->config->threaded));
+}
+
+static void napi_save_config(struct napi_struct *n)
+{
+ n->config->defer_hard_irqs = n->defer_hard_irqs;
+ n->config->gro_flush_timeout = n->gro_flush_timeout;
+ n->config->irq_suspend_timeout = n->irq_suspend_timeout;
+ napi_hash_del(n);
+}
+
+/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will
+ * inherit an existing ID try to insert it at the right position.
+ */
+static void
+netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
+{
+ unsigned int new_id, pos_id;
+ struct list_head *higher;
+ struct napi_struct *pos;
+
+ new_id = UINT_MAX;
+ if (napi->config && napi->config->napi_id)
+ new_id = napi->config->napi_id;
+
+ higher = &dev->napi_list;
+ list_for_each_entry(pos, &dev->napi_list, dev_list) {
+ if (napi_id_valid(pos->napi_id))
+ pos_id = pos->napi_id;
+ else if (pos->config)
+ pos_id = pos->config->napi_id;
+ else
+ pos_id = UINT_MAX;
+
+ if (pos_id <= new_id)
+ break;
+ higher = &pos->dev_list;
+ }
+ list_add_rcu(&napi->dev_list, higher); /* adds after higher */
+}
+
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure the bug fixed in 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+static void napi_get_frags_check(struct napi_struct *napi)
+{
+ struct sk_buff *skb;
+
+ local_bh_disable();
+ skb = napi_get_frags(napi);
+ WARN_ON_ONCE(skb && skb->head_frag);
+ napi_free_frags(napi);
+ local_bh_enable();
+}
+
+void netif_napi_add_weight_locked(struct net_device *dev,
+ struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int),
+ int weight)
+{
+ netdev_assert_locked(dev);
+ if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
+ return;
+
INIT_LIST_HEAD(&napi->poll_list);
- hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
- napi->timer.function = napi_watchdog;
- init_gro_hash(napi);
+ INIT_HLIST_NODE(&napi->napi_hash_node);
+ hrtimer_setup(&napi->timer, napi_watchdog, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ gro_init(&napi->gro);
napi->skb = NULL;
napi->poll = poll;
if (weight > NAPI_POLL_WEIGHT)
- pr_err_once("netif_napi_add() called with weight %d on device %s\n",
- weight, dev->name);
+ netdev_err_once(dev, "%s() called with weight %d\n", __func__,
+ weight);
napi->weight = weight;
- list_add(&napi->dev_list, &dev->napi_list);
napi->dev = dev;
#ifdef CONFIG_NETPOLL
napi->poll_owner = -1;
#endif
+ napi->list_owner = -1;
set_bit(NAPI_STATE_SCHED, &napi->state);
- napi_hash_add(napi);
+ set_bit(NAPI_STATE_NPSVC, &napi->state);
+ netif_napi_dev_list_add(dev, napi);
+
+ /* Default settings from sysfs are applied to all NAPIs. Any per-NAPI
+ * configuration will be loaded in napi_enable().
+ */
+ napi_set_defer_hard_irqs(napi, READ_ONCE(dev->napi_defer_hard_irqs));
+ napi_set_gro_flush_timeout(napi, READ_ONCE(dev->gro_flush_timeout));
+
+ napi_get_frags_check(napi);
+ /* Create kthread for this napi if dev->threaded is set.
+ * Clear dev->threaded if kthread creation failed so that
+ * threaded mode will not be enabled in napi_enable().
+ */
+ if (napi_get_threaded_config(dev, napi))
+ if (napi_kthread_create(napi))
+ dev->threaded = NETDEV_NAPI_THREADED_DISABLED;
+ netif_napi_set_irq_locked(napi, -1);
}
-EXPORT_SYMBOL(netif_napi_add);
+EXPORT_SYMBOL(netif_napi_add_weight_locked);
-void napi_disable(struct napi_struct *n)
+void napi_disable_locked(struct napi_struct *n)
{
+ unsigned long val, new;
+
might_sleep();
+ netdev_assert_locked(n->dev);
+
set_bit(NAPI_STATE_DISABLE, &n->state);
- while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
- msleep(1);
- while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
- msleep(1);
+ val = READ_ONCE(n->state);
+ do {
+ while (val & (NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC)) {
+ usleep_range(20, 200);
+ val = READ_ONCE(n->state);
+ }
+
+ new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
+ new &= ~(NAPIF_STATE_THREADED |
+ NAPIF_STATE_THREADED_BUSY_POLL |
+ NAPIF_STATE_PREFER_BUSY_POLL);
+ } while (!try_cmpxchg(&n->state, &val, new));
hrtimer_cancel(&n->timer);
+ if (n->config)
+ napi_save_config(n);
+ else
+ napi_hash_del(n);
+
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
+EXPORT_SYMBOL(napi_disable_locked);
+
+/**
+ * napi_disable() - prevent NAPI from scheduling
+ * @n: NAPI context
+ *
+ * Stop NAPI from being scheduled on this context.
+ * Waits till any outstanding processing completes.
+ * Takes netdev_lock() for associated net_device.
+ */
+void napi_disable(struct napi_struct *n)
+{
+ netdev_lock(n->dev);
+ napi_disable_locked(n);
+ netdev_unlock(n->dev);
+}
EXPORT_SYMBOL(napi_disable);
-static void flush_gro_hash(struct napi_struct *napi)
+void napi_enable_locked(struct napi_struct *n)
{
- int i;
+ unsigned long new, val = READ_ONCE(n->state);
- for (i = 0; i < GRO_HASH_BUCKETS; i++) {
- struct sk_buff *skb, *n;
+ if (n->config)
+ napi_restore_config(n);
+ else
+ napi_hash_add(n);
- list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
- kfree_skb(skb);
- napi->gro_hash[i].count = 0;
- }
+ do {
+ BUG_ON(!test_bit(NAPI_STATE_SCHED, &val));
+
+ new = val & ~(NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC);
+ if (n->dev->threaded && n->thread)
+ new |= NAPIF_STATE_THREADED;
+ } while (!try_cmpxchg(&n->state, &val, new));
+}
+EXPORT_SYMBOL(napi_enable_locked);
+
+/**
+ * napi_enable() - enable NAPI scheduling
+ * @n: NAPI context
+ *
+ * Enable scheduling of a NAPI instance.
+ * Must be paired with napi_disable().
+ * Takes netdev_lock() for associated net_device.
+ */
+void napi_enable(struct napi_struct *n)
+{
+ netdev_lock(n->dev);
+ napi_enable_locked(n);
+ netdev_unlock(n->dev);
}
+EXPORT_SYMBOL(napi_enable);
/* Must be called in process context */
-void netif_napi_del(struct napi_struct *napi)
+void __netif_napi_del_locked(struct napi_struct *napi)
{
- might_sleep();
- if (napi_hash_del(napi))
- synchronize_net();
- list_del_init(&napi->dev_list);
+ netdev_assert_locked(napi->dev);
+
+ if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
+ return;
+
+ /* Make sure NAPI is disabled (or was never enabled). */
+ WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state));
+
+ if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state))
+ irq_set_affinity_notifier(napi->irq, NULL);
+
+ if (napi->config) {
+ napi->index = -1;
+ napi->config = NULL;
+ }
+
+ list_del_rcu(&napi->dev_list);
napi_free_frags(napi);
- flush_gro_hash(napi);
- napi->gro_bitmask = 0;
+ gro_cleanup(&napi->gro);
+
+ if (napi->thread) {
+ kthread_stop(napi->thread);
+ napi->thread = NULL;
+ }
}
-EXPORT_SYMBOL(netif_napi_del);
+EXPORT_SYMBOL(__netif_napi_del_locked);
-static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+static int __napi_poll(struct napi_struct *n, bool *repoll)
{
- void *have;
int work, weight;
- list_del_init(&n->poll_list);
-
- have = netpoll_poll_lock(n);
-
weight = n->weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
@@ -6223,15 +7662,19 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
* accidentally calling ->poll() when NAPI is not scheduled.
*/
work = 0;
- if (test_bit(NAPI_STATE_SCHED, &n->state)) {
+ if (napi_is_scheduled(n)) {
work = n->poll(n, weight);
trace_napi_poll(n, work, weight);
+
+ xdp_do_check_flushed(n);
}
- WARN_ON_ONCE(work > weight);
+ if (unlikely(work > weight))
+ netdev_err_once(n->dev, "NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
+ n->poll, work, weight);
if (likely(work < weight))
- goto out_unlock;
+ return work;
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
@@ -6240,42 +7683,175 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
*/
if (unlikely(napi_disable_pending(n))) {
napi_complete(n);
- goto out_unlock;
+ return work;
}
- if (n->gro_bitmask) {
- /* flush too old packets
- * If HZ < 1000, flush all packets.
- */
- napi_gro_flush(n, HZ >= 1000);
+ /* The NAPI context has more processing work, but busy-polling
+ * is preferred. Exit early.
+ */
+ if (napi_prefer_busy_poll(n)) {
+ if (napi_complete_done(n, work)) {
+ /* If timeout is not set, we need to make sure
+ * that the NAPI is re-scheduled.
+ */
+ napi_schedule(n);
+ }
+ return work;
}
+ /* Flush too old packets. If HZ < 1000, flush all packets */
+ gro_flush_normal(&n->gro, HZ >= 1000);
+
/* Some drivers may have called napi_schedule
* prior to exhausting their budget.
*/
if (unlikely(!list_empty(&n->poll_list))) {
pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
n->dev ? n->dev->name : "backlog");
- goto out_unlock;
+ return work;
}
- list_add_tail(&n->poll_list, repoll);
+ *repoll = true;
-out_unlock:
+ return work;
+}
+
+static int napi_poll(struct napi_struct *n, struct list_head *repoll)
+{
+ bool do_repoll = false;
+ void *have;
+ int work;
+
+ list_del_init(&n->poll_list);
+
+ have = netpoll_poll_lock(n);
+
+ work = __napi_poll(n, &do_repoll);
+
+ if (do_repoll) {
+#if defined(CONFIG_DEBUG_NET)
+ if (unlikely(!napi_is_scheduled(n)))
+ pr_crit("repoll requested for device %s %ps but napi is not scheduled.\n",
+ n->dev->name, n->poll);
+#endif
+ list_add_tail(&n->poll_list, repoll);
+ }
netpoll_poll_unlock(have);
return work;
}
-static __latent_entropy void net_rx_action(struct softirq_action *h)
+static int napi_thread_wait(struct napi_struct *napi)
+{
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ while (!kthread_should_stop()) {
+ /* Testing SCHED_THREADED bit here to make sure the current
+ * kthread owns this napi and could poll on this napi.
+ * Testing SCHED bit is not enough because SCHED bit might be
+ * set by some other busy poll thread or by napi_disable().
+ */
+ if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
+ WARN_ON(!list_empty(&napi->poll_list));
+ __set_current_state(TASK_RUNNING);
+ return 0;
+ }
+
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
+
+ return -1;
+}
+
+static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll)
+{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ struct softnet_data *sd;
+ unsigned long last_qs = jiffies;
+
+ for (;;) {
+ bool repoll = false;
+ void *have;
+
+ local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+
+ sd = this_cpu_ptr(&softnet_data);
+ sd->in_napi_threaded_poll = true;
+
+ have = netpoll_poll_lock(napi);
+ __napi_poll(napi, &repoll);
+ netpoll_poll_unlock(have);
+
+ sd->in_napi_threaded_poll = false;
+ barrier();
+
+ if (sd_has_rps_ipi_waiting(sd)) {
+ local_irq_disable();
+ net_rps_action_and_irq_enable(sd);
+ }
+ skb_defer_free_flush();
+ bpf_net_ctx_clear(bpf_net_ctx);
+
+ /* When busy poll is enabled, the old packets are not flushed in
+ * napi_complete_done. So flush them here.
+ */
+ if (busy_poll)
+ gro_flush_normal(&napi->gro, HZ >= 1000);
+ local_bh_enable();
+
+ /* Call cond_resched here to avoid watchdog warnings. */
+ if (repoll || busy_poll) {
+ rcu_softirq_qs_periodic(last_qs);
+ cond_resched();
+ }
+
+ if (!repoll)
+ break;
+ }
+}
+
+static int napi_threaded_poll(void *data)
+{
+ struct napi_struct *napi = data;
+ bool want_busy_poll;
+ bool in_busy_poll;
+ unsigned long val;
+
+ while (!napi_thread_wait(napi)) {
+ val = READ_ONCE(napi->state);
+
+ want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL;
+ in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL;
+
+ if (unlikely(val & NAPIF_STATE_DISABLE))
+ want_busy_poll = false;
+
+ if (want_busy_poll != in_busy_poll)
+ assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state,
+ want_busy_poll);
+
+ napi_threaded_poll_loop(napi, want_busy_poll);
+ }
+
+ return 0;
+}
+
+static __latent_entropy void net_rx_action(void)
{
struct softnet_data *sd = this_cpu_ptr(&softnet_data);
unsigned long time_limit = jiffies +
- usecs_to_jiffies(netdev_budget_usecs);
- int budget = netdev_budget;
+ usecs_to_jiffies(READ_ONCE(net_hotdata.netdev_budget_usecs));
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
+ int budget = READ_ONCE(net_hotdata.netdev_budget);
LIST_HEAD(list);
LIST_HEAD(repoll);
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
+start:
+ sd->in_net_rx_action = true;
local_irq_disable();
list_splice_init(&sd->poll_list, &list);
local_irq_enable();
@@ -6283,9 +7859,21 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
for (;;) {
struct napi_struct *n;
+ skb_defer_free_flush();
+
if (list_empty(&list)) {
- if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
- goto out;
+ if (list_empty(&repoll)) {
+ sd->in_net_rx_action = false;
+ barrier();
+ /* We need to check if ____napi_schedule()
+ * had refilled poll_list while
+ * sd->in_net_rx_action was true.
+ */
+ if (!list_empty(&sd->poll_list))
+ goto start;
+ if (!sd_has_rps_ipi_waiting(sd))
+ goto end;
+ }
break;
}
@@ -6298,7 +7886,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
*/
if (unlikely(budget <= 0 ||
time_after_eq(jiffies, time_limit))) {
- sd->time_squeeze++;
+ /* Pairs with READ_ONCE() in softnet_seq_show() */
+ WRITE_ONCE(sd->time_squeeze, sd->time_squeeze + 1);
break;
}
}
@@ -6310,18 +7899,24 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
list_splice(&list, &sd->poll_list);
if (!list_empty(&sd->poll_list))
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ else
+ sd->in_net_rx_action = false;
net_rps_action_and_irq_enable(sd);
-out:
- __kfree_skb_flush();
+end:
+ bpf_net_ctx_clear(bpf_net_ctx);
}
struct netdev_adjacent {
struct net_device *dev;
+ netdevice_tracker dev_tracker;
/* upper master flag, there can only be one master device per list */
bool master;
+ /* lookup ignore flag */
+ bool ignore;
+
/* counter for the number of times this device was added to us */
u16 ref_nr;
@@ -6344,9 +7939,10 @@ static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
return NULL;
}
-static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
+static int ____netdev_has_upper_dev(struct net_device *upper_dev,
+ struct netdev_nested_priv *priv)
{
- struct net_device *dev = data;
+ struct net_device *dev = (struct net_device *)priv->data;
return upper_dev == dev;
}
@@ -6363,15 +7959,19 @@ static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
bool netdev_has_upper_dev(struct net_device *dev,
struct net_device *upper_dev)
{
+ struct netdev_nested_priv priv = {
+ .data = (void *)upper_dev,
+ };
+
ASSERT_RTNL();
- return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
- upper_dev);
+ return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
+ &priv);
}
EXPORT_SYMBOL(netdev_has_upper_dev);
/**
- * netdev_has_upper_dev_all - Check if device is linked to an upper device
+ * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
* @dev: device
* @upper_dev: upper device to check
*
@@ -6383,8 +7983,12 @@ EXPORT_SYMBOL(netdev_has_upper_dev);
bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
struct net_device *upper_dev)
{
- return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
- upper_dev);
+ struct netdev_nested_priv priv = {
+ .data = (void *)upper_dev,
+ };
+
+ return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
+ &priv);
}
EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
@@ -6427,6 +8031,22 @@ struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);
+static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
+{
+ struct netdev_adjacent *upper;
+
+ ASSERT_RTNL();
+
+ if (list_empty(&dev->adj_list.upper))
+ return NULL;
+
+ upper = list_first_entry(&dev->adj_list.upper,
+ struct netdev_adjacent, list);
+ if (likely(upper->master) && !upper->ignore)
+ return upper->dev;
+ return NULL;
+}
+
/**
* netdev_has_any_lower_dev - Check if device is linked to some device
* @dev: device
@@ -6477,6 +8097,23 @@ struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
+static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
+ struct list_head **iter,
+ bool *ignore)
+{
+ struct netdev_adjacent *upper;
+
+ upper = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+ if (&upper->list == &dev->adj_list.upper)
+ return NULL;
+
+ *iter = &upper->list;
+ *ignore = upper->ignore;
+
+ return upper->dev;
+}
+
static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
struct list_head **iter)
{
@@ -6494,34 +8131,116 @@ static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
return upper->dev;
}
+static int __netdev_walk_all_upper_dev(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
+{
+ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+ bool ignore;
+
+ now = dev;
+ iter = &dev->adj_list.upper;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, priv);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ udev = __netdev_next_upper_dev(now, &iter, &ignore);
+ if (!udev)
+ break;
+ if (ignore)
+ continue;
+
+ next = udev;
+ niter = &udev->adj_list.upper;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
+ }
+
+ return 0;
+}
+
int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
int (*fn)(struct net_device *dev,
- void *data),
- void *data)
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
{
- struct net_device *udev;
- struct list_head *iter;
- int ret;
+ struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
- for (iter = &dev->adj_list.upper,
- udev = netdev_next_upper_dev_rcu(dev, &iter);
- udev;
- udev = netdev_next_upper_dev_rcu(dev, &iter)) {
- /* first is the upper device itself */
- ret = fn(udev, data);
- if (ret)
- return ret;
+ now = dev;
+ iter = &dev->adj_list.upper;
- /* then look at all of its upper devices */
- ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
- if (ret)
- return ret;
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, priv);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ udev = netdev_next_upper_dev_rcu(now, &iter);
+ if (!udev)
+ break;
+
+ next = udev;
+ niter = &udev->adj_list.upper;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
}
return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
+static bool __netdev_has_upper_dev(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ struct netdev_nested_priv priv = {
+ .flags = 0,
+ .data = (void *)upper_dev,
+ };
+
+ ASSERT_RTNL();
+
+ return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
+ &priv);
+}
+
/**
* netdev_lower_get_next_private - Get the next ->private from the
* lower neighbour list
@@ -6564,7 +8283,7 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev,
{
struct netdev_adjacent *lower;
- WARN_ON_ONCE(!rcu_read_lock_held());
+ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
@@ -6618,36 +8337,121 @@ static struct net_device *netdev_next_lower_dev(struct net_device *dev,
return lower->dev;
}
+static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
+ struct list_head **iter,
+ bool *ignore)
+{
+ struct netdev_adjacent *lower;
+
+ lower = list_entry((*iter)->next, struct netdev_adjacent, list);
+
+ if (&lower->list == &dev->adj_list.lower)
+ return NULL;
+
+ *iter = &lower->list;
+ *ignore = lower->ignore;
+
+ return lower->dev;
+}
+
int netdev_walk_all_lower_dev(struct net_device *dev,
int (*fn)(struct net_device *dev,
- void *data),
- void *data)
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
{
- struct net_device *ldev;
- struct list_head *iter;
- int ret;
+ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
- for (iter = &dev->adj_list.lower,
- ldev = netdev_next_lower_dev(dev, &iter);
- ldev;
- ldev = netdev_next_lower_dev(dev, &iter)) {
- /* first is the lower device itself */
- ret = fn(ldev, data);
- if (ret)
- return ret;
+ now = dev;
+ iter = &dev->adj_list.lower;
- /* then look at all of its lower devices */
- ret = netdev_walk_all_lower_dev(ldev, fn, data);
- if (ret)
- return ret;
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, priv);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ ldev = netdev_next_lower_dev(now, &iter);
+ if (!ldev)
+ break;
+
+ next = ldev;
+ niter = &ldev->adj_list.lower;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
}
return 0;
}
EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
-static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
- struct list_head **iter)
+static int __netdev_walk_all_lower_dev(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
+{
+ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+ bool ignore;
+
+ now = dev;
+ iter = &dev->adj_list.lower;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, priv);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ ldev = __netdev_next_lower_dev(now, &iter, &ignore);
+ if (!ldev)
+ break;
+ if (ignore)
+ continue;
+
+ next = ldev;
+ niter = &ldev->adj_list.lower;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
+ }
+
+ return 0;
+}
+
+struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
+ struct list_head **iter)
{
struct netdev_adjacent *lower;
@@ -6659,29 +8463,123 @@ static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
return lower->dev;
}
+EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
-int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
- int (*fn)(struct net_device *dev,
- void *data),
- void *data)
+static u8 __netdev_upper_depth(struct net_device *dev)
+{ /*
+ * Return the largest upper_level among the devices yielded by
+ * __netdev_next_upper_dev() for @dev, skipping those reported with the
+ * ignore flag set; 0 when there is none.
+ */
+ struct net_device *udev;
+ struct list_head *iter;
+ u8 max_depth = 0;
+ bool ignore;
+
+ for (iter = &dev->adj_list.upper,
+ udev = __netdev_next_upper_dev(dev, &iter, &ignore);
+ udev;
+ udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
+ if (ignore)
+ continue;
+ if (max_depth < udev->upper_level)
+ max_depth = udev->upper_level;
+ }
+
+ return max_depth;
+}
+
+static u8 __netdev_lower_depth(struct net_device *dev)
{
struct net_device *ldev;
struct list_head *iter;
- int ret;
+ u8 max_depth = 0;
+ bool ignore;
for (iter = &dev->adj_list.lower,
- ldev = netdev_next_lower_dev_rcu(dev, &iter);
+ ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
ldev;
- ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
- /* first is the lower device itself */
- ret = fn(ldev, data);
- if (ret)
- return ret;
+ ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
+ if (ignore)
+ continue;
+ if (max_depth < ldev->lower_level)
+ max_depth = ldev->lower_level;
+ }
- /* then look at all of its lower devices */
- ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
- if (ret)
- return ret;
+ return max_depth;
+}
+
+static int __netdev_update_upper_level(struct net_device *dev,
+ struct netdev_nested_priv *__unused)
+{ /* Walk callback: recompute dev->upper_level; the priv argument is not used. */
+ dev->upper_level = __netdev_upper_depth(dev) + 1;
+ return 0; /* never aborts the walk */
+}
+
+#ifdef CONFIG_LOCKDEP
+static LIST_HEAD(net_unlink_list); /* devices queued by net_unlink_todo() */
+
+static void net_unlink_todo(struct net_device *dev)
+{ /* Append @dev to net_unlink_list unless its unlink_list node is already linked. */
+ if (list_empty(&dev->unlink_list))
+ list_add_tail(&dev->unlink_list, &net_unlink_list);
+}
+#endif
+
+static int __netdev_update_lower_level(struct net_device *dev,
+ struct netdev_nested_priv *priv)
+{ /*
+ * Recompute dev->lower_level. With CONFIG_LOCKDEP and a non-NULL @priv,
+ * priv->flags additionally selects whether dev->nested_level is updated
+ * right away (NESTED_SYNC_IMM) and/or @dev is queued through
+ * net_unlink_todo() (NESTED_SYNC_TODO). Always returns 0.
+ */
+ dev->lower_level = __netdev_lower_depth(dev) + 1;
+
+#ifdef CONFIG_LOCKDEP
+ if (!priv) /* callers may pass NULL to request the level update only */
+ return 0;
+
+ if (priv->flags & NESTED_SYNC_IMM)
+ dev->nested_level = dev->lower_level - 1;
+ if (priv->flags & NESTED_SYNC_TODO)
+ net_unlink_todo(dev);
+#endif
+ return 0;
+}
+
+int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
+ int (*fn)(struct net_device *dev,
+ struct netdev_nested_priv *priv),
+ struct netdev_nested_priv *priv)
+{
+ struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
+ struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
+ int ret, cur = 0;
+
+ now = dev;
+ iter = &dev->adj_list.lower;
+
+ while (1) {
+ if (now != dev) {
+ ret = fn(now, priv);
+ if (ret)
+ return ret;
+ }
+
+ next = NULL;
+ while (1) {
+ ldev = netdev_next_lower_dev_rcu(now, &iter);
+ if (!ldev)
+ break;
+
+ next = ldev;
+ niter = &ldev->adj_list.lower;
+ dev_stack[cur] = now;
+ iter_stack[cur++] = iter;
+ break;
+ }
+
+ if (!next) {
+ if (!cur)
+ return 0;
+ next = dev_stack[--cur];
+ niter = iter_stack[cur];
+ }
+
+ now = next;
+ iter = niter;
}
return 0;
@@ -6785,7 +8683,8 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
adj->master = master;
adj->ref_nr = 1;
adj->private = private;
- dev_hold(adj_dev);
+ adj->ignore = false;
+ netdev_hold(adj_dev, &adj->dev_tracker, GFP_KERNEL);
pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
@@ -6814,8 +8713,8 @@ remove_symlinks:
if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
free_adj:
+ netdev_put(adj_dev, &adj->dev_tracker);
kfree(adj);
- dev_put(adj_dev);
return ret;
}
@@ -6856,7 +8755,7 @@ static void __netdev_adjacent_dev_remove(struct net_device *dev,
list_del_rcu(&adj->list);
pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
adj_dev->name, dev->name, adj_dev->name);
- dev_put(adj_dev);
+ netdev_put(adj_dev, &adj->dev_tracker);
kfree_rcu(adj, rcu);
}
@@ -6914,6 +8813,7 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
static int __netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev, bool master,
void *upper_priv, void *upper_info,
+ struct netdev_nested_priv *priv,
struct netlink_ext_ack *extack)
{
struct netdev_notifier_changeupper_info changeupper_info = {
@@ -6935,14 +8835,17 @@ static int __netdev_upper_dev_link(struct net_device *dev,
return -EBUSY;
/* To prevent loops, check if dev is not upper device to upper_dev. */
- if (netdev_has_upper_dev(upper_dev, dev))
+ if (__netdev_has_upper_dev(upper_dev, dev))
return -EBUSY;
+ if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
+ return -EMLINK;
+
if (!master) {
- if (netdev_has_upper_dev(dev, upper_dev))
+ if (__netdev_has_upper_dev(dev, upper_dev))
return -EEXIST;
} else {
- master_dev = netdev_master_upper_dev_get(dev);
+ master_dev = __netdev_master_upper_dev_get(dev);
if (master_dev)
return master_dev == upper_dev ? -EEXIST : -EBUSY;
}
@@ -6964,6 +8867,13 @@ static int __netdev_upper_dev_link(struct net_device *dev,
if (ret)
goto rollback;
+ __netdev_update_upper_level(dev, NULL);
+ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+
+ __netdev_update_lower_level(upper_dev, priv);
+ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
+ priv);
+
return 0;
rollback:
@@ -6987,8 +8897,13 @@ int netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev,
struct netlink_ext_ack *extack)
{
+ struct netdev_nested_priv priv = {
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
+ .data = NULL,
+ };
+
return __netdev_upper_dev_link(dev, upper_dev, false,
- NULL, NULL, extack);
+ NULL, NULL, &priv, extack);
}
EXPORT_SYMBOL(netdev_upper_dev_link);
@@ -7011,21 +8926,19 @@ int netdev_master_upper_dev_link(struct net_device *dev,
void *upper_priv, void *upper_info,
struct netlink_ext_ack *extack)
{
+ struct netdev_nested_priv priv = {
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
+ .data = NULL,
+ };
+
return __netdev_upper_dev_link(dev, upper_dev, true,
- upper_priv, upper_info, extack);
+ upper_priv, upper_info, &priv, extack);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);
-/**
- * netdev_upper_dev_unlink - Removes a link to upper device
- * @dev: device
- * @upper_dev: new upper device
- *
- * Removes a link to device which is upper to this one. The caller must hold
- * the RTNL lock.
- */
-void netdev_upper_dev_unlink(struct net_device *dev,
- struct net_device *upper_dev)
+static void __netdev_upper_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev,
+ struct netdev_nested_priv *priv)
{
struct netdev_notifier_changeupper_info changeupper_info = {
.info = {
@@ -7046,9 +8959,129 @@ void netdev_upper_dev_unlink(struct net_device *dev,
call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
&changeupper_info.info);
+
+ __netdev_update_upper_level(dev, NULL);
+ __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
+
+ __netdev_update_lower_level(upper_dev, priv);
+ __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
+ priv);
+}
+
+/**
+ * netdev_upper_dev_unlink - Removes a link to upper device
+ * @dev: device
+ * @upper_dev: upper device to unlink from @dev
+ *
+ * Removes a link to device which is upper to this one. The caller must hold
+ * the RTNL lock.
+ */
+void netdev_upper_dev_unlink(struct net_device *dev,
+ struct net_device *upper_dev)
+{
+ struct netdev_nested_priv priv = {
+ .flags = NESTED_SYNC_TODO, /* NESTED_SYNC_IMM is deliberately not set here */
+ .data = NULL,
+ };
+
+ __netdev_upper_dev_unlink(dev, upper_dev, &priv);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
+static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
+ struct net_device *lower_dev,
+ bool val)
+{ /*
+ * Set the ignore flag to @val on both halves of the upper_dev/lower_dev
+ * adjacency; a half that __netdev_find_adj() does not find is left alone.
+ */
+ struct netdev_adjacent *adj;
+
+ adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower); /* upper's view of lower */
+ if (adj)
+ adj->ignore = val;
+
+ adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper); /* lower's view of upper */
+ if (adj)
+ adj->ignore = val;
+}
+
+static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
+ struct net_device *lower_dev)
+{ /* Flag the upper_dev/lower_dev adjacency as ignored (ignore = true). */
+ __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
+}
+
+static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
+ struct net_device *lower_dev)
+{ /* Clear the ignore flag on the upper_dev/lower_dev adjacency (ignore = false). */
+ __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
+}
+
+int netdev_adjacent_change_prepare(struct net_device *old_dev,
+ struct net_device *new_dev,
+ struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{ /*
+ * First step of switching @dev's lower device from @old_dev to @new_dev:
+ * link @new_dev below @dev as a non-master link. While linking, the
+ * dev/old_dev adjacency is flagged ignore (when @old_dev exists and
+ * differs from @new_dev); the flag is cleared again here only if linking
+ * fails. Returns 0 (also when @new_dev is NULL) or the linking error.
+ */
+ struct netdev_nested_priv priv = {
+ .flags = 0, /* neither NESTED_SYNC_IMM nor NESTED_SYNC_TODO */
+ .data = NULL,
+ };
+ int err;
+
+ if (!new_dev)
+ return 0;
+
+ if (old_dev && new_dev != old_dev)
+ netdev_adjacent_dev_disable(dev, old_dev); /* presumably so the old link is not counted while linking - confirm */
+ err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
+ extack);
+ if (err) {
+ if (old_dev && new_dev != old_dev)
+ netdev_adjacent_dev_enable(dev, old_dev); /* undo the disable above */
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(netdev_adjacent_change_prepare);
+
+void netdev_adjacent_change_commit(struct net_device *old_dev,
+ struct net_device *new_dev,
+ struct net_device *dev)
+{ /*
+ * Finish a lower-device change: clear the ignore flag on the dev/old_dev
+ * adjacency and unlink @old_dev from @dev. Does nothing when either
+ * device is NULL or when both are the same device.
+ */
+ struct netdev_nested_priv priv = {
+ .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
+ .data = NULL,
+ };
+
+ if (!new_dev || !old_dev)
+ return;
+
+ if (new_dev == old_dev)
+ return;
+
+ netdev_adjacent_dev_enable(dev, old_dev);
+ __netdev_upper_dev_unlink(old_dev, dev, &priv);
+}
+EXPORT_SYMBOL(netdev_adjacent_change_commit);
+
+void netdev_adjacent_change_abort(struct net_device *old_dev,
+ struct net_device *new_dev,
+ struct net_device *dev)
+{ /*
+ * Roll back a prepared lower-device change: clear the ignore flag on the
+ * dev/old_dev adjacency (when @old_dev exists and differs from @new_dev)
+ * and unlink @new_dev from @dev. Does nothing when @new_dev is NULL.
+ */
+ struct netdev_nested_priv priv = {
+ .flags = 0,
+ .data = NULL,
+ };
+
+ if (!new_dev)
+ return;
+
+ if (old_dev && new_dev != old_dev)
+ netdev_adjacent_dev_enable(dev, old_dev);
+
+ __netdev_upper_dev_unlink(new_dev, dev, &priv); /* unconditional once new_dev is non-NULL */
+}
+EXPORT_SYMBOL(netdev_adjacent_change_abort);
+
/**
* netdev_bonding_info_change - Dispatch event about slave change
* @dev: device
@@ -7071,6 +9104,298 @@ void netdev_bonding_info_change(struct net_device *dev,
}
EXPORT_SYMBOL(netdev_bonding_info_change);
+static int netdev_offload_xstats_enable_l3(struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{ /*
+ * Allocate the zeroed L3 stats buffer of @dev, then announce
+ * NETDEV_OFFLOAD_XSTATS_ENABLE through the notifier chain. If the chain
+ * reports an error the buffer is freed again and that errno is returned;
+ * -ENOMEM is returned when the allocation fails.
+ */
+ struct netdev_notifier_offload_xstats_info info = {
+ .info.dev = dev,
+ .info.extack = extack,
+ .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
+ };
+ int err;
+ int rc;
+
+ dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
+ GFP_KERNEL);
+ if (!dev->offload_xstats_l3)
+ return -ENOMEM;
+
+ rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
+ NETDEV_OFFLOAD_XSTATS_DISABLE,
+ &info.info); /* DISABLE is presumably the rollback event - confirm in the helper */
+ err = notifier_to_errno(rc);
+ if (err)
+ goto free_stats;
+
+ return 0;
+
+free_stats:
+ kfree(dev->offload_xstats_l3);
+ dev->offload_xstats_l3 = NULL; /* a NULL pointer is what marks the stats as disabled */
+ return err;
+}
+
+int netdev_offload_xstats_enable(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ struct netlink_ext_ack *extack)
+{ /*
+ * Enable offload xstats of @type on @dev; asserts that RTNL is held.
+ * Returns -EALREADY if already enabled, -EINVAL (after a WARN) for a
+ * type the switch does not handle, otherwise the per-type helper result.
+ */
+ ASSERT_RTNL();
+
+ if (netdev_offload_xstats_enabled(dev, type))
+ return -EALREADY;
+
+ switch (type) {
+ case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
+ return netdev_offload_xstats_enable_l3(dev, extack);
+ }
+
+ WARN_ON(1); /* only reached for a type value without a case above */
+ return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_offload_xstats_enable);
+
+static void netdev_offload_xstats_disable_l3(struct net_device *dev)
+{ /* Announce NETDEV_OFFLOAD_XSTATS_DISABLE for L3, then free and clear the stats buffer. */
+ struct netdev_notifier_offload_xstats_info info = {
+ .info.dev = dev,
+ .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
+ };
+
+ call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
+ &info.info); /* notifier result is not checked */
+ kfree(dev->offload_xstats_l3);
+ dev->offload_xstats_l3 = NULL;
+}
+
+int netdev_offload_xstats_disable(struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{ /*
+ * Disable offload xstats of @type on @dev; asserts that RTNL is held.
+ * Returns 0 on success, -EALREADY if the stats are not enabled, and
+ * -EINVAL (after a WARN) for a type the switch does not handle.
+ */
+ ASSERT_RTNL();
+
+ if (!netdev_offload_xstats_enabled(dev, type))
+ return -EALREADY;
+
+ switch (type) {
+ case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
+ netdev_offload_xstats_disable_l3(dev);
+ return 0;
+ }
+
+ WARN_ON(1);
+ return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_offload_xstats_disable);
+
+static void netdev_offload_xstats_disable_all(struct net_device *dev)
+{ /* Disable every known xstats type (currently only L3); the -EALREADY result is discarded. */
+ netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
+}
+
+static struct rtnl_hw_stats64 *
+netdev_offload_xstats_get_ptr(const struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{ /* Map @type to @dev's stats buffer pointer; WARN and return NULL for an unhandled type. */
+ switch (type) {
+ case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
+ return dev->offload_xstats_l3; /* NULL while L3 stats are not enabled */
+ }
+
+ WARN_ON(1);
+ return NULL;
+}
+
+bool netdev_offload_xstats_enabled(const struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{ /* True when @dev has a stats buffer for @type; asserts that RTNL is held. */
+ ASSERT_RTNL();
+
+ return netdev_offload_xstats_get_ptr(dev, type); /* pointer converted to bool */
+}
+EXPORT_SYMBOL(netdev_offload_xstats_enabled);
+
+struct netdev_notifier_offload_xstats_ru { /* result slot for NETDEV_OFFLOAD_XSTATS_REPORT_USED */
+ bool used; /* set to true by netdev_offload_xstats_report_used() */
+};
+
+struct netdev_notifier_offload_xstats_rd { /* result slot for NETDEV_OFFLOAD_XSTATS_REPORT_DELTA */
+ struct rtnl_hw_stats64 stats; /* sum of the deltas passed to netdev_offload_xstats_report_delta() */
+ bool used; /* set to true by netdev_offload_xstats_report_delta() */
+};
+
+static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
+ const struct rtnl_hw_stats64 *src)
+{ /* Add each counter of @src onto the matching counter of @dest; @src is not modified. */
+ dest->rx_packets += src->rx_packets;
+ dest->tx_packets += src->tx_packets;
+ dest->rx_bytes += src->rx_bytes;
+ dest->tx_bytes += src->tx_bytes;
+ dest->rx_errors += src->rx_errors;
+ dest->tx_errors += src->tx_errors;
+ dest->rx_dropped += src->rx_dropped;
+ dest->tx_dropped += src->tx_dropped;
+ dest->multicast += src->multicast;
+}
+
+static int netdev_offload_xstats_get_used(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ bool *p_used,
+ struct netlink_ext_ack *extack)
+{ /*
+ * Send NETDEV_OFFLOAD_XSTATS_REPORT_USED for @dev/@type and store in
+ * *@p_used whether any notifier marked the stats as used. Returns the
+ * notifier result converted to an errno. WARNs if the stats are not
+ * enabled but still sends the event.
+ */
+ struct netdev_notifier_offload_xstats_ru report_used = {};
+ struct netdev_notifier_offload_xstats_info info = {
+ .info.dev = dev,
+ .info.extack = extack,
+ .type = type,
+ .report_used = &report_used,
+ };
+ int rc;
+
+ WARN_ON(!netdev_offload_xstats_enabled(dev, type));
+ rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
+ &info.info);
+ *p_used = report_used.used; /* written even on error; p_used must not be NULL */
+ return notifier_to_errno(rc);
+}
+
+static int netdev_offload_xstats_get_stats(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ struct rtnl_hw_stats64 *p_stats,
+ bool *p_used,
+ struct netlink_ext_ack *extack)
+{ /*
+ * Send NETDEV_OFFLOAD_XSTATS_REPORT_DELTA, fold the reported delta into
+ * @dev's cached counters for @type, and copy the cached totals to
+ * *@p_stats when @p_stats is non-NULL. *@p_used receives whether any
+ * notifier reported. Returns -EINVAL (after a WARN) when no stats buffer
+ * exists for @type, otherwise the notifier result as an errno.
+ */
+ struct netdev_notifier_offload_xstats_rd report_delta = {};
+ struct netdev_notifier_offload_xstats_info info = {
+ .info.dev = dev,
+ .info.extack = extack,
+ .type = type,
+ .report_delta = &report_delta,
+ };
+ struct rtnl_hw_stats64 *stats;
+ int rc;
+
+ stats = netdev_offload_xstats_get_ptr(dev, type);
+ if (WARN_ON(!stats))
+ return -EINVAL;
+
+ rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
+ &info.info);
+
+ /* Cache whatever we got, even if there was an error, otherwise the
+ * successful stats retrievals would get lost.
+ */
+ netdev_hw_stats64_add(stats, &report_delta.stats);
+
+ if (p_stats)
+ *p_stats = *stats;
+ *p_used = report_delta.used; /* unlike p_stats, p_used is not NULL-checked */
+
+ return notifier_to_errno(rc);
+}
+
+int netdev_offload_xstats_get(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ struct rtnl_hw_stats64 *p_stats, bool *p_used,
+ struct netlink_ext_ack *extack)
+{ /*
+ * Query offload xstats of @type; asserts that RTNL is held. With a
+ * non-NULL @p_stats the counters and the used flag are fetched, with a
+ * NULL @p_stats only the used flag is. Returns the helper's result.
+ */
+ ASSERT_RTNL();
+
+ if (p_stats)
+ return netdev_offload_xstats_get_stats(dev, type, p_stats,
+ p_used, extack);
+ else
+ return netdev_offload_xstats_get_used(dev, type, p_used,
+ extack);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_get);
+
+void
+netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
+ const struct rtnl_hw_stats64 *stats)
+{ /* Mark @report_delta as used and add @stats onto its accumulated delta. */
+ report_delta->used = true;
+ netdev_hw_stats64_add(&report_delta->stats, stats);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
+
+void
+netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
+{ /* Mark @report_used as used; nothing else is recorded. */
+ report_used->used = true;
+}
+EXPORT_SYMBOL(netdev_offload_xstats_report_used);
+
+void netdev_offload_xstats_push_delta(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ const struct rtnl_hw_stats64 *p_stats)
+{ /*
+ * Add @p_stats onto @dev's cached counters for @type; asserts that RTNL
+ * is held. WARNs and does nothing when no stats buffer exists for @type.
+ */
+ struct rtnl_hw_stats64 *stats;
+
+ ASSERT_RTNL();
+
+ stats = netdev_offload_xstats_get_ptr(dev, type);
+ if (WARN_ON(!stats))
+ return;
+
+ netdev_hw_stats64_add(stats, p_stats);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
+
+/**
+ * netdev_get_xmit_slave - Get the xmit slave of master device
+ * @dev: device
+ * @skb: The packet
+ * @all_slaves: assume all the slaves are active
+ *
+ * The reference counters are not incremented so the caller must be
+ * careful with locks. The caller must hold RCU lock.
+ * %NULL is returned if no slave is found.
+ *
+ * The lookup is delegated to the device's ndo_get_xmit_slave() operation;
+ * %NULL is also returned when @dev does not implement that operation.
+ */
+
+struct net_device *netdev_get_xmit_slave(struct net_device *dev,
+ struct sk_buff *skb,
+ bool all_slaves)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_get_xmit_slave)
+ return NULL;
+ return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
+}
+EXPORT_SYMBOL(netdev_get_xmit_slave);
+
+static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
+ struct sock *sk)
+{ /* One step down: ask @dev's ndo_sk_get_lower_dev() for @sk; NULL if the op is missing. */
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!ops->ndo_sk_get_lower_dev)
+ return NULL;
+ return ops->ndo_sk_get_lower_dev(dev, sk);
+}
+
+/**
+ * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
+ * @dev: device
+ * @sk: the socket
+ *
+ * Follows netdev_sk_get_lower_dev() repeatedly, starting at @dev, until no
+ * further lower device is reported, and returns the last device reached.
+ * @dev itself is returned if it has no lower device for @sk.
+ */
+
+struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
+ struct sock *sk)
+{
+ struct net_device *lower;
+
+ lower = netdev_sk_get_lower_dev(dev, sk);
+ while (lower) {
+ dev = lower; /* descend one level and try again from there */
+ lower = netdev_sk_get_lower_dev(dev, sk);
+ }
+
+ return dev;
+}
+EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
+
static void netdev_adjacent_add_links(struct net_device *dev)
{
struct netdev_adjacent *iter;
@@ -7162,27 +9487,8 @@ void *netdev_lower_dev_get_private(struct net_device *dev,
EXPORT_SYMBOL(netdev_lower_dev_get_private);
-int dev_get_nest_level(struct net_device *dev)
-{
- struct net_device *lower = NULL;
- struct list_head *iter;
- int max_nest = -1;
- int nest;
-
- ASSERT_RTNL();
-
- netdev_for_each_lower_dev(dev, lower, iter) {
- nest = dev_get_nest_level(lower);
- if (max_nest < nest)
- max_nest = nest;
- }
-
- return max_nest + 1;
-}
-EXPORT_SYMBOL(dev_get_nest_level);
-
/**
- * netdev_lower_change - Dispatch event about lower device state change
+ * netdev_lower_state_changed - Dispatch event about lower device state change
* @lower_dev: device
* @lower_state_info: state to dispatch
*
@@ -7214,31 +9520,31 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags;
+ unsigned int promiscuity, flags;
kuid_t uid;
kgid_t gid;
ASSERT_RTNL();
- dev->flags |= IFF_PROMISC;
- dev->promiscuity += inc;
- if (dev->promiscuity == 0) {
+ promiscuity = dev->promiscuity + inc;
+ if (promiscuity == 0) {
/*
* Avoid overflow.
* If inc causes overflow, untouch promisc and return error.
*/
- if (inc < 0)
- dev->flags &= ~IFF_PROMISC;
- else {
- dev->promiscuity -= inc;
- pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
- dev->name);
+ if (unlikely(inc > 0)) {
+ netdev_warn(dev, "promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n");
return -EOVERFLOW;
}
+ flags = old_flags & ~IFF_PROMISC;
+ } else {
+ flags = old_flags | IFF_PROMISC;
}
- if (dev->flags != old_flags) {
- pr_info("device %s %s promiscuous mode\n",
- dev->name,
- dev->flags & IFF_PROMISC ? "entered" : "left");
+ WRITE_ONCE(dev->promiscuity, promiscuity);
+ if (flags != old_flags) {
+ WRITE_ONCE(dev->flags, flags);
+ netdev_info(dev, "%s promiscuous mode\n",
+ dev->flags & IFF_PROMISC ? "entered" : "left");
if (audit_enabled) {
current_uid_gid(&uid, &gid);
audit_log(audit_context(), GFP_ATOMIC,
@@ -7254,23 +9560,20 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
dev_change_rx_flags(dev, IFF_PROMISC);
}
- if (notify)
- __dev_notify_flags(dev, old_flags, IFF_PROMISC);
+ if (notify) {
+ /* The ops lock is only required to ensure consistent locking
+ * for `NETDEV_CHANGE` notifiers. This function is sometimes
+ * called without the lock, even for devices that are ops
+ * locked, such as in `dev_uc_sync_multiple` when using
+ * bonding or teaming.
+ */
+ netdev_ops_assert_locked(dev);
+ __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL);
+ }
return 0;
}
-/**
- * dev_set_promiscuity - update promiscuity count on a device
- * @dev: device
- * @inc: modifier
- *
- * Add or remove promiscuity from a device. While the count in the device
- * remains above zero the interface remains promiscuous. Once it hits zero
- * the device reverts back to normal filtering operation. A negative inc
- * value is used to drop promiscuity on the device.
- * Return 0 if successful or a negative errno code on error.
- */
-int dev_set_promiscuity(struct net_device *dev, int inc)
+int netif_set_promiscuity(struct net_device *dev, int inc)
{
unsigned int old_flags = dev->flags;
int err;
@@ -7282,59 +9585,42 @@ int dev_set_promiscuity(struct net_device *dev, int inc)
dev_set_rx_mode(dev);
return err;
}
-EXPORT_SYMBOL(dev_set_promiscuity);
-static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
+int netif_set_allmulti(struct net_device *dev, int inc, bool notify)
{
unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
+ unsigned int allmulti, flags;
ASSERT_RTNL();
- dev->flags |= IFF_ALLMULTI;
- dev->allmulti += inc;
- if (dev->allmulti == 0) {
+ allmulti = dev->allmulti + inc;
+ if (allmulti == 0) {
/*
* Avoid overflow.
* If inc causes overflow, untouch allmulti and return error.
*/
- if (inc < 0)
- dev->flags &= ~IFF_ALLMULTI;
- else {
- dev->allmulti -= inc;
- pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
- dev->name);
+ if (unlikely(inc > 0)) {
+ netdev_warn(dev, "allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n");
return -EOVERFLOW;
}
+ flags = old_flags & ~IFF_ALLMULTI;
+ } else {
+ flags = old_flags | IFF_ALLMULTI;
}
- if (dev->flags ^ old_flags) {
+ WRITE_ONCE(dev->allmulti, allmulti);
+ if (flags != old_flags) {
+ WRITE_ONCE(dev->flags, flags);
+ netdev_info(dev, "%s allmulticast mode\n",
+ dev->flags & IFF_ALLMULTI ? "entered" : "left");
dev_change_rx_flags(dev, IFF_ALLMULTI);
dev_set_rx_mode(dev);
if (notify)
__dev_notify_flags(dev, old_flags,
- dev->gflags ^ old_gflags);
+ dev->gflags ^ old_gflags, 0, NULL);
}
return 0;
}
-/**
- * dev_set_allmulti - update allmulti count on a device
- * @dev: device
- * @inc: modifier
- *
- * Add or remove reception of all multicast frames to a device. While the
- * count in the device remains above zero the interface remains listening
- * to all interfaces. Once it hits zero the device reverts back to normal
- * filtering operation. A negative @inc value is used to drop the counter
- * when releasing a resource needing all multicasts.
- * Return 0 if successful or a negative errno code on error.
- */
-
-int dev_set_allmulti(struct net_device *dev, int inc)
-{
- return __dev_set_allmulti(dev, inc, true);
-}
-EXPORT_SYMBOL(dev_set_allmulti);
-
/*
* Upload unicast and multicast address lists to device and
* configure RX filtering. When the device doesn't support unicast
@@ -7377,21 +9663,21 @@ void dev_set_rx_mode(struct net_device *dev)
}
/**
- * dev_get_flags - get flags reported to userspace
- * @dev: device
+ * netif_get_flags() - get flags reported to userspace
+ * @dev: device
*
- * Get the combination of flag bits exported through APIs to userspace.
+ * Get the combination of flag bits exported through APIs to userspace.
*/
-unsigned int dev_get_flags(const struct net_device *dev)
+unsigned int netif_get_flags(const struct net_device *dev)
{
unsigned int flags;
- flags = (dev->flags & ~(IFF_PROMISC |
+ flags = (READ_ONCE(dev->flags) & ~(IFF_PROMISC |
IFF_ALLMULTI |
IFF_RUNNING |
IFF_LOWER_UP |
IFF_DORMANT)) |
- (dev->gflags & (IFF_PROMISC |
+ (READ_ONCE(dev->gflags) & (IFF_PROMISC |
IFF_ALLMULTI));
if (netif_running(dev)) {
@@ -7405,9 +9691,10 @@ unsigned int dev_get_flags(const struct net_device *dev)
return flags;
}
-EXPORT_SYMBOL(dev_get_flags);
+EXPORT_SYMBOL(netif_get_flags);
-int __dev_change_flags(struct net_device *dev, unsigned int flags)
+int __dev_change_flags(struct net_device *dev, unsigned int flags,
+ struct netlink_ext_ack *extack)
{
unsigned int old_flags = dev->flags;
int ret;
@@ -7444,12 +9731,12 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
if (old_flags & IFF_UP)
__dev_close(dev);
else
- ret = __dev_open(dev);
+ ret = __dev_open(dev, extack);
}
if ((flags ^ dev->gflags) & IFF_PROMISC) {
int inc = (flags & IFF_PROMISC) ? 1 : -1;
- unsigned int old_flags = dev->flags;
+ old_flags = dev->flags;
dev->gflags ^= IFF_PROMISC;
@@ -7466,19 +9753,20 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags)
int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
dev->gflags ^= IFF_ALLMULTI;
- __dev_set_allmulti(dev, inc, false);
+ netif_set_allmulti(dev, inc, false);
}
return ret;
}
void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
- unsigned int gchanges)
+ unsigned int gchanges, u32 portid,
+ const struct nlmsghdr *nlh)
{
unsigned int changes = dev->flags ^ old_flags;
if (gchanges)
- rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh);
if (changes & IFF_UP) {
if (dev->flags & IFF_UP)
@@ -7500,57 +9788,37 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
}
}
-/**
- * dev_change_flags - change device settings
- * @dev: device
- * @flags: device state flags
- *
- * Change settings on device based state flags. The flags are
- * in the userspace exported format.
- */
-int dev_change_flags(struct net_device *dev, unsigned int flags)
+int netif_change_flags(struct net_device *dev, unsigned int flags,
+ struct netlink_ext_ack *extack)
{
int ret;
unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
- ret = __dev_change_flags(dev, flags);
+ ret = __dev_change_flags(dev, flags, extack);
if (ret < 0)
return ret;
changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
- __dev_notify_flags(dev, old_flags, changes);
+ __dev_notify_flags(dev, old_flags, changes, 0, NULL);
return ret;
}
-EXPORT_SYMBOL(dev_change_flags);
-int __dev_set_mtu(struct net_device *dev, int new_mtu)
+int __netif_set_mtu(struct net_device *dev, int new_mtu)
{
const struct net_device_ops *ops = dev->netdev_ops;
if (ops->ndo_change_mtu)
return ops->ndo_change_mtu(dev, new_mtu);
- dev->mtu = new_mtu;
+ /* Pairs with all the lockless reads of dev->mtu in the stack */
+ WRITE_ONCE(dev->mtu, new_mtu);
return 0;
}
-EXPORT_SYMBOL(__dev_set_mtu);
+EXPORT_SYMBOL_NS_GPL(__netif_set_mtu, "NETDEV_INTERNAL");
-/**
- * dev_set_mtu_ext - Change maximum transfer unit
- * @dev: device
- * @new_mtu: new transfer unit
- * @extack: netlink extended ack
- *
- * Change the maximum transfer size of the network device.
- */
-int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
- struct netlink_ext_ack *extack)
+int dev_validate_mtu(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack)
{
- int err, orig_mtu;
-
- if (new_mtu == dev->mtu)
- return 0;
-
/* MTU must be positive, and in range */
if (new_mtu < 0 || new_mtu < dev->min_mtu) {
NL_SET_ERR_MSG(extack, "mtu less than device minimum");
@@ -7561,6 +9829,32 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
return -EINVAL;
}
+ return 0;
+}
+
+/**
+ * netif_set_mtu_ext() - Change maximum transfer unit
+ * @dev: device
+ * @new_mtu: new transfer unit
+ * @extack: netlink extended ack
+ *
+ * Change the maximum transfer size of the network device.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack)
+{
+ int err, orig_mtu;
+
+ netdev_ops_assert_locked(dev);
+
+ if (new_mtu == dev->mtu)
+ return 0;
+
+ err = dev_validate_mtu(dev, new_mtu, extack);
+ if (err)
+ return err;
if (!netif_device_present(dev))
return -ENODEV;
@@ -7571,41 +9865,38 @@ int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
return err;
orig_mtu = dev->mtu;
- err = __dev_set_mtu(dev, new_mtu);
+ err = __netif_set_mtu(dev, new_mtu);
if (!err) {
- err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+ err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
+ orig_mtu);
err = notifier_to_errno(err);
if (err) {
/* setting mtu back and notifying everyone again,
* so that they have a chance to revert changes.
*/
- __dev_set_mtu(dev, orig_mtu);
- call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
+ __netif_set_mtu(dev, orig_mtu);
+ call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
+ new_mtu);
}
}
return err;
}
-int dev_set_mtu(struct net_device *dev, int new_mtu)
+int netif_set_mtu(struct net_device *dev, int new_mtu)
{
struct netlink_ext_ack extack;
int err;
memset(&extack, 0, sizeof(extack));
- err = dev_set_mtu_ext(dev, new_mtu, &extack);
+ err = netif_set_mtu_ext(dev, new_mtu, &extack);
if (err && extack._msg)
net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
return err;
}
-EXPORT_SYMBOL(dev_set_mtu);
+EXPORT_SYMBOL(netif_set_mtu);
-/**
- * dev_change_tx_queue_len - Change TX queue length of a netdevice
- * @dev: device
- * @new_len: new tx queue length
- */
-int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
+int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
{
unsigned int orig_len = dev->tx_queue_len;
int res;
@@ -7614,7 +9905,7 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
return -ERANGE;
if (new_len != orig_len) {
- dev->tx_queue_len = new_len;
+ WRITE_ONCE(dev->tx_queue_len, new_len);
res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
res = notifier_to_errno(res);
if (res)
@@ -7628,57 +9919,96 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
err_rollback:
netdev_err(dev, "refused to change device tx_queue_len\n");
- dev->tx_queue_len = orig_len;
+ WRITE_ONCE(dev->tx_queue_len, orig_len);
return res;
}
-/**
- * dev_set_group - Change group this device belongs to
- * @dev: device
- * @new_group: group this device should belong to
- */
-void dev_set_group(struct net_device *dev, int new_group)
+void netif_set_group(struct net_device *dev, int new_group)
{
dev->group = new_group;
}
-EXPORT_SYMBOL(dev_set_group);
/**
- * dev_set_mac_address - Change Media Access Control Address
- * @dev: device
- * @sa: new address
+ * netif_pre_changeaddr_notify() - Call NETDEV_PRE_CHANGEADDR.
+ * @dev: device
+ * @addr: new address
+ * @extack: netlink extended ack
*
- * Change the hardware (MAC) address of the device
+ * Return: 0 on success, -errno on failure.
*/
-int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
+int netif_pre_changeaddr_notify(struct net_device *dev, const char *addr,
+ struct netlink_ext_ack *extack)
+{ /* Send NETDEV_PRE_CHANGEADDR carrying @addr and return the notifier result as an errno. */
+ struct netdev_notifier_pre_changeaddr_info info = {
+ .info.dev = dev,
+ .info.extack = extack,
+ .dev_addr = addr,
+ };
+ int rc;
+
+ rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
+ return notifier_to_errno(rc);
+}
+EXPORT_SYMBOL_NS_GPL(netif_pre_changeaddr_notify, "NETDEV_INTERNAL");
+
+int netif_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
+ struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
int err;
if (!ops->ndo_set_mac_address)
return -EOPNOTSUPP;
- if (sa->sa_family != dev->type)
+ if (ss->ss_family != dev->type)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
- err = ops->ndo_set_mac_address(dev, sa);
+ err = netif_pre_changeaddr_notify(dev, ss->__data, extack);
if (err)
return err;
+ if (memcmp(dev->dev_addr, ss->__data, dev->addr_len)) {
+ err = ops->ndo_set_mac_address(dev, ss);
+ if (err)
+ return err;
+ }
dev->addr_assign_type = NET_ADDR_SET;
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
return 0;
}
-EXPORT_SYMBOL(dev_set_mac_address);
-/**
- * dev_change_carrier - Change device carrier
- * @dev: device
- * @new_carrier: new value
- *
- * Change device carrier
- */
-int dev_change_carrier(struct net_device *dev, bool new_carrier)
+DECLARE_RWSEM(dev_addr_sem);
+
+/* "sa" is a true struct sockaddr with limited "sa_data" member. */
+int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
+{
+ size_t size = sizeof(sa->sa_data);
+ struct net_device *dev;
+ int ret = 0;
+
+ down_read(&dev_addr_sem);
+ rcu_read_lock();
+
+ dev = dev_get_by_name_rcu(net, dev_name);
+ if (!dev) {
+ ret = -ENODEV;
+ goto unlock;
+ }
+ if (!dev->addr_len)
+ memset(sa->sa_data, 0, size);
+ else
+ memcpy(sa->sa_data, dev->dev_addr,
+ min_t(size_t, size, dev->addr_len));
+ sa->sa_family = dev->type;
+
+unlock:
+ rcu_read_unlock();
+ up_read(&dev_addr_sem);
+ return ret;
+}
+EXPORT_SYMBOL_NS_GPL(netif_get_mac_address, "NETDEV_INTERNAL");
+
+int netif_change_carrier(struct net_device *dev, bool new_carrier)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -7688,7 +10018,6 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier)
return -ENODEV;
return ops->ndo_change_carrier(dev, new_carrier);
}
-EXPORT_SYMBOL(dev_change_carrier);
/**
* dev_get_phys_port_id - Get device physical port ID
@@ -7706,7 +10035,6 @@ int dev_get_phys_port_id(struct net_device *dev,
return -EOPNOTSUPP;
return ops->ndo_get_phys_port_id(dev, ppid);
}
-EXPORT_SYMBOL(dev_get_phys_port_id);
/**
* dev_get_phys_port_name - Get device physical port name
@@ -7720,279 +10048,755 @@ int dev_get_phys_port_name(struct net_device *dev,
char *name, size_t len)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
- if (!ops->ndo_get_phys_port_name)
- return -EOPNOTSUPP;
- return ops->ndo_get_phys_port_name(dev, name, len);
+ if (ops->ndo_get_phys_port_name) {
+ err = ops->ndo_get_phys_port_name(dev, name, len);
+ if (err != -EOPNOTSUPP)
+ return err;
+ }
+ return devlink_compat_phys_port_name_get(dev, name, len);
}
-EXPORT_SYMBOL(dev_get_phys_port_name);
/**
- * dev_change_proto_down - update protocol port state information
- * @dev: device
- * @proto_down: new value
+ * netif_get_port_parent_id() - Get the device's port parent identifier
+ * @dev: network device
+ * @ppid: pointer to a storage for the port's parent identifier
+ * @recurse: allow/disallow recursion to lower devices
+ *
+ * Get the device's port parent identifier.
*
- * This info can be used by switch drivers to set the phys state of the
- * port.
+ * Return: 0 on success, -errno on failure.
*/
-int dev_change_proto_down(struct net_device *dev, bool proto_down)
+int netif_get_port_parent_id(struct net_device *dev,
+ struct netdev_phys_item_id *ppid, bool recurse)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ struct netdev_phys_item_id first = { };
+ struct net_device *lower_dev;
+ struct list_head *iter;
+ int err;
- if (!ops->ndo_change_proto_down)
+ if (ops->ndo_get_port_parent_id) {
+ err = ops->ndo_get_port_parent_id(dev, ppid);
+ if (err != -EOPNOTSUPP)
+ return err;
+ }
+
+ err = devlink_compat_switch_id_get(dev, ppid);
+ if (!recurse || err != -EOPNOTSUPP)
+ return err;
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ err = netif_get_port_parent_id(lower_dev, ppid, true);
+ if (err)
+ break;
+ if (!first.id_len)
+ first = *ppid;
+ else if (memcmp(&first, ppid, sizeof(*ppid)))
+ return -EOPNOTSUPP;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(netif_get_port_parent_id);
+
+/**
+ * netdev_port_same_parent_id - Indicate if two network devices have
+ * the same port parent identifier
+ * @a: first network device
+ * @b: second network device
+ */
+bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
+{
+ struct netdev_phys_item_id a_id = { };
+ struct netdev_phys_item_id b_id = { };
+
+ if (netif_get_port_parent_id(a, &a_id, true) ||
+ netif_get_port_parent_id(b, &b_id, true))
+ return false;
+
+ return netdev_phys_item_id_same(&a_id, &b_id);
+}
+EXPORT_SYMBOL(netdev_port_same_parent_id);
+
+int netif_change_proto_down(struct net_device *dev, bool proto_down)
+{
+ if (!dev->change_proto_down)
return -EOPNOTSUPP;
if (!netif_device_present(dev))
return -ENODEV;
- return ops->ndo_change_proto_down(dev, proto_down);
+ if (proto_down)
+ netif_carrier_off(dev);
+ else
+ netif_carrier_on(dev);
+ WRITE_ONCE(dev->proto_down, proto_down);
+ return 0;
}
-EXPORT_SYMBOL(dev_change_proto_down);
-u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
- enum bpf_netdev_command cmd)
+/**
+ * netdev_change_proto_down_reason_locked - proto down reason
+ *
+ * @dev: device
+ * @mask: proto down mask
+ * @value: proto down value
+ */
+void netdev_change_proto_down_reason_locked(struct net_device *dev,
+ unsigned long mask, u32 value)
{
- struct netdev_bpf xdp;
+ u32 proto_down_reason;
+ int b;
- if (!bpf_op)
- return 0;
+ if (!mask) {
+ proto_down_reason = value;
+ } else {
+ proto_down_reason = dev->proto_down_reason;
+ for_each_set_bit(b, &mask, 32) {
+ if (value & (1 << b))
+ proto_down_reason |= BIT(b);
+ else
+ proto_down_reason &= ~BIT(b);
+ }
+ }
+ WRITE_ONCE(dev->proto_down_reason, proto_down_reason);
+}
- memset(&xdp, 0, sizeof(xdp));
- xdp.command = cmd;
+struct bpf_xdp_link {
+ struct bpf_link link;
+ struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
+ int flags;
+};
+
+static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
+{
+ if (flags & XDP_FLAGS_HW_MODE)
+ return XDP_MODE_HW;
+ if (flags & XDP_FLAGS_DRV_MODE)
+ return XDP_MODE_DRV;
+ if (flags & XDP_FLAGS_SKB_MODE)
+ return XDP_MODE_SKB;
+ return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
+}
+
+static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
+{
+ switch (mode) {
+ case XDP_MODE_SKB:
+ return generic_xdp_install;
+ case XDP_MODE_DRV:
+ case XDP_MODE_HW:
+ return dev->netdev_ops->ndo_bpf;
+ default:
+ return NULL;
+ }
+}
+
+static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
+ enum bpf_xdp_mode mode)
+{
+ return dev->xdp_state[mode].link;
+}
+
+static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
+ enum bpf_xdp_mode mode)
+{
+ struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
+
+ if (link)
+ return link->link.prog;
+ return dev->xdp_state[mode].prog;
+}
+
+u8 dev_xdp_prog_count(struct net_device *dev)
+{
+ u8 count = 0;
+ int i;
+
+ for (i = 0; i < __MAX_XDP_MODE; i++)
+ if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
+ count++;
+ return count;
+}
+EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
+
+u8 dev_xdp_sb_prog_count(struct net_device *dev)
+{
+ u8 count = 0;
+ int i;
+
+ for (i = 0; i < __MAX_XDP_MODE; i++)
+ if (dev->xdp_state[i].prog &&
+ !dev->xdp_state[i].prog->aux->xdp_has_frags)
+ count++;
+ return count;
+}
+
+int netif_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
+{
+ if (!dev->netdev_ops->ndo_bpf)
+ return -EOPNOTSUPP;
- /* Query must always succeed. */
- WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
+ if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ bpf->command == XDP_SETUP_PROG &&
+ bpf->prog && !bpf->prog->aux->xdp_has_frags) {
+ NL_SET_ERR_MSG(bpf->extack,
+ "unable to propagate XDP to device using tcp-data-split");
+ return -EBUSY;
+ }
+
+ if (dev_get_min_mp_channel_count(dev)) {
+ NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider");
+ return -EBUSY;
+ }
- return xdp.prog_id;
+ return dev->netdev_ops->ndo_bpf(dev, bpf);
}
+EXPORT_SYMBOL_GPL(netif_xdp_propagate);
-static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
- struct netlink_ext_ack *extack, u32 flags,
- struct bpf_prog *prog)
+u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
+{
+ struct bpf_prog *prog = dev_xdp_prog(dev, mode);
+
+ return prog ? prog->aux->id : 0;
+}
+
+static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
+ struct bpf_xdp_link *link)
+{
+ dev->xdp_state[mode].link = link;
+ dev->xdp_state[mode].prog = NULL;
+}
+
+static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
+ struct bpf_prog *prog)
+{
+ dev->xdp_state[mode].link = NULL;
+ dev->xdp_state[mode].prog = prog;
+}
+
+static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
+ bpf_op_t bpf_op, struct netlink_ext_ack *extack,
+ u32 flags, struct bpf_prog *prog)
{
struct netdev_bpf xdp;
+ int err;
+
+ netdev_ops_assert_locked(dev);
+
+ if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ prog && !prog->aux->xdp_has_frags) {
+ NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split");
+ return -EBUSY;
+ }
+
+ if (dev_get_min_mp_channel_count(dev)) {
+ NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider");
+ return -EBUSY;
+ }
memset(&xdp, 0, sizeof(xdp));
- if (flags & XDP_FLAGS_HW_MODE)
- xdp.command = XDP_SETUP_PROG_HW;
- else
- xdp.command = XDP_SETUP_PROG;
+ xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
xdp.extack = extack;
xdp.flags = flags;
xdp.prog = prog;
- return bpf_op(dev, &xdp);
+ /* Drivers assume refcnt is already incremented (i.e., prog pointer is
+ * "moved" into driver), so they don't increment it on their own, but
+ * they do decrement refcnt when program is detached or replaced.
+ * Given net_device also owns link/prog, we need to bump refcnt here
+ * to prevent drivers from underflowing it.
+ */
+ if (prog)
+ bpf_prog_inc(prog);
+ err = bpf_op(dev, &xdp);
+ if (err) {
+ if (prog)
+ bpf_prog_put(prog);
+ return err;
+ }
+
+ if (mode != XDP_MODE_HW)
+ bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
+
+ return 0;
}
static void dev_xdp_uninstall(struct net_device *dev)
{
- struct netdev_bpf xdp;
- bpf_op_t ndo_bpf;
+ struct bpf_xdp_link *link;
+ struct bpf_prog *prog;
+ enum bpf_xdp_mode mode;
+ bpf_op_t bpf_op;
- /* Remove generic XDP */
- WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL));
+ ASSERT_RTNL();
- /* Remove from the driver */
- ndo_bpf = dev->netdev_ops->ndo_bpf;
- if (!ndo_bpf)
- return;
+ for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
+ prog = dev_xdp_prog(dev, mode);
+ if (!prog)
+ continue;
- memset(&xdp, 0, sizeof(xdp));
- xdp.command = XDP_QUERY_PROG;
- WARN_ON(ndo_bpf(dev, &xdp));
- if (xdp.prog_id)
- WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
- NULL));
+ bpf_op = dev_xdp_bpf_op(dev, mode);
+ if (!bpf_op)
+ continue;
- /* Remove HW offload */
- memset(&xdp, 0, sizeof(xdp));
- xdp.command = XDP_QUERY_PROG_HW;
- if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
- WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
- NULL));
+ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
+
+ /* auto-detach link from net device */
+ link = dev_xdp_link(dev, mode);
+ if (link)
+ link->dev = NULL;
+ else
+ bpf_prog_put(prog);
+
+ dev_xdp_set_link(dev, mode, NULL);
+ }
}
-/**
- * dev_change_xdp_fd - set or clear a bpf program for a device rx path
- * @dev: device
- * @extack: netlink extended ack
- * @fd: new program fd or negative value to clear
- * @flags: xdp-related flags
- *
- * Set or clear a bpf program for a device
- */
-int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
- int fd, u32 flags)
+static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
+ struct bpf_xdp_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog, u32 flags)
{
- const struct net_device_ops *ops = dev->netdev_ops;
- enum bpf_netdev_command query;
- struct bpf_prog *prog = NULL;
- bpf_op_t bpf_op, bpf_chk;
+ unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
+ struct bpf_prog *cur_prog;
+ struct net_device *upper;
+ struct list_head *iter;
+ enum bpf_xdp_mode mode;
+ bpf_op_t bpf_op;
int err;
ASSERT_RTNL();
- query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
+ /* either link or prog attachment, never both */
+ if (link && (new_prog || old_prog))
+ return -EINVAL;
+ /* link supports only XDP mode flags */
+ if (link && (flags & ~XDP_FLAGS_MODES)) {
+ NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
+ return -EINVAL;
+ }
+ /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
+ if (num_modes > 1) {
+ NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
+ return -EINVAL;
+ }
+ /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
+ if (!num_modes && dev_xdp_prog_count(dev) > 1) {
+ NL_SET_ERR_MSG(extack,
+ "More than one program loaded, unset mode is ambiguous");
+ return -EINVAL;
+ }
+ /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
+ if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
+ NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
+ return -EINVAL;
+ }
- bpf_op = bpf_chk = ops->ndo_bpf;
- if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
- return -EOPNOTSUPP;
- if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE))
- bpf_op = generic_xdp_install;
- if (bpf_op == bpf_chk)
- bpf_chk = generic_xdp_install;
+ mode = dev_xdp_mode(dev, flags);
+ /* can't replace attached link */
+ if (dev_xdp_link(dev, mode)) {
+ NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
+ return -EBUSY;
+ }
- if (fd >= 0) {
- if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) ||
- __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW))
+ /* don't allow if an upper device already has a program */
+ netdev_for_each_upper_dev_rcu(dev, upper, iter) {
+ if (dev_xdp_prog_count(upper) > 0) {
+ NL_SET_ERR_MSG(extack, "Cannot attach when an upper device already has a program");
return -EEXIST;
- if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
- __dev_xdp_query(dev, bpf_op, query))
- return -EBUSY;
+ }
+ }
- prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
- bpf_op == ops->ndo_bpf);
- if (IS_ERR(prog))
- return PTR_ERR(prog);
+ cur_prog = dev_xdp_prog(dev, mode);
+ /* can't replace attached prog with link */
+ if (link && cur_prog) {
+ NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
+ return -EBUSY;
+ }
+ if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
+ NL_SET_ERR_MSG(extack, "Active program does not match expected");
+ return -EEXIST;
+ }
- if (!(flags & XDP_FLAGS_HW_MODE) &&
- bpf_prog_is_dev_bound(prog->aux)) {
- NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported");
- bpf_prog_put(prog);
+ /* put effective new program into new_prog */
+ if (link)
+ new_prog = link->link.prog;
+
+ if (new_prog) {
+ bool offload = mode == XDP_MODE_HW;
+ enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
+ ? XDP_MODE_DRV : XDP_MODE_SKB;
+
+ if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
+ NL_SET_ERR_MSG(extack, "XDP program already attached");
+ return -EBUSY;
+ }
+ if (!offload && dev_xdp_prog(dev, other_mode)) {
+ NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
+ return -EEXIST;
+ }
+ if (!offload && bpf_prog_is_offloaded(new_prog->aux)) {
+ NL_SET_ERR_MSG(extack, "Using offloaded program without HW_MODE flag is not supported");
+ return -EINVAL;
+ }
+ if (bpf_prog_is_dev_bound(new_prog->aux) && !bpf_offload_dev_match(new_prog, dev)) {
+ NL_SET_ERR_MSG(extack, "Program bound to different device");
+ return -EINVAL;
+ }
+ if (bpf_prog_is_dev_bound(new_prog->aux) && mode == XDP_MODE_SKB) {
+ NL_SET_ERR_MSG(extack, "Can't attach device-bound programs in generic mode");
+ return -EINVAL;
+ }
+ if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
+ NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
+ return -EINVAL;
+ }
+ if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
+ NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
return -EINVAL;
}
}
- err = dev_xdp_install(dev, bpf_op, extack, flags, prog);
- if (err < 0 && prog)
- bpf_prog_put(prog);
+ /* don't call drivers if the effective program didn't change */
+ if (new_prog != cur_prog) {
+ bpf_op = dev_xdp_bpf_op(dev, mode);
+ if (!bpf_op) {
+ NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
+ return -EOPNOTSUPP;
+ }
- return err;
+ err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
+ if (err)
+ return err;
+ }
+
+ if (link)
+ dev_xdp_set_link(dev, mode, link);
+ else
+ dev_xdp_set_prog(dev, mode, new_prog);
+ if (cur_prog)
+ bpf_prog_put(cur_prog);
+
+ return 0;
}
-/**
- * dev_new_index - allocate an ifindex
- * @net: the applicable net namespace
- *
- * Returns a suitable unique value for a new device interface
- * number. The caller must hold the rtnl semaphore or the
- * dev_base_lock to be sure it remains unique.
- */
-static int dev_new_index(struct net *net)
+static int dev_xdp_attach_link(struct net_device *dev,
+ struct netlink_ext_ack *extack,
+ struct bpf_xdp_link *link)
{
- int ifindex = net->ifindex;
+ return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
+}
- for (;;) {
- if (++ifindex <= 0)
- ifindex = 1;
- if (!__dev_get_by_index(net, ifindex))
- return net->ifindex = ifindex;
+static int dev_xdp_detach_link(struct net_device *dev,
+ struct netlink_ext_ack *extack,
+ struct bpf_xdp_link *link)
+{
+ enum bpf_xdp_mode mode;
+ bpf_op_t bpf_op;
+
+ ASSERT_RTNL();
+
+ mode = dev_xdp_mode(dev, link->flags);
+ if (dev_xdp_link(dev, mode) != link)
+ return -EINVAL;
+
+ bpf_op = dev_xdp_bpf_op(dev, mode);
+ WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
+ dev_xdp_set_link(dev, mode, NULL);
+ return 0;
+}
+
+static void bpf_xdp_link_release(struct bpf_link *link)
+{
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+
+ rtnl_lock();
+
+ /* if racing with net_device's tear down, xdp_link->dev might be
+ * already NULL, in which case link was already auto-detached
+ */
+ if (xdp_link->dev) {
+ netdev_lock_ops(xdp_link->dev);
+ WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
+ netdev_unlock_ops(xdp_link->dev);
+ xdp_link->dev = NULL;
}
+
+ rtnl_unlock();
}
-/* Delayed registration/unregisteration */
-static LIST_HEAD(net_todo_list);
-DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
+static int bpf_xdp_link_detach(struct bpf_link *link)
+{
+ bpf_xdp_link_release(link);
+ return 0;
+}
-static void net_set_todo(struct net_device *dev)
+static void bpf_xdp_link_dealloc(struct bpf_link *link)
{
- list_add_tail(&dev->todo_list, &net_todo_list);
- dev_net(dev)->dev_unreg_count++;
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+
+ kfree(xdp_link);
}
-static void rollback_registered_many(struct list_head *head)
+static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
{
- struct net_device *dev, *tmp;
- LIST_HEAD(close_head);
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+ u32 ifindex = 0;
- BUG_ON(dev_boot_phase);
- ASSERT_RTNL();
+ rtnl_lock();
+ if (xdp_link->dev)
+ ifindex = xdp_link->dev->ifindex;
+ rtnl_unlock();
- list_for_each_entry_safe(dev, tmp, head, unreg_list) {
- /* Some devices call without registering
- * for initialization unwind. Remove those
- * devices and proceed with the remaining.
- */
- if (dev->reg_state == NETREG_UNINITIALIZED) {
- pr_debug("unregister_netdevice: device %s/%p never was registered\n",
- dev->name, dev);
+ seq_printf(seq, "ifindex:\t%u\n", ifindex);
+}
- WARN_ON(1);
- list_del(&dev->unreg_list);
- continue;
- }
- dev->dismantle = true;
- BUG_ON(dev->reg_state != NETREG_REGISTERED);
+static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+ u32 ifindex = 0;
+
+ rtnl_lock();
+ if (xdp_link->dev)
+ ifindex = xdp_link->dev->ifindex;
+ rtnl_unlock();
+
+ info->xdp.ifindex = ifindex;
+ return 0;
+}
+
+static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
+ struct bpf_prog *old_prog)
+{
+ struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
+ enum bpf_xdp_mode mode;
+ bpf_op_t bpf_op;
+ int err = 0;
+
+ rtnl_lock();
+
+ /* link might have been auto-released already, so fail */
+ if (!xdp_link->dev) {
+ err = -ENOLINK;
+ goto out_unlock;
}
- /* If device is running, close it first. */
- list_for_each_entry(dev, head, unreg_list)
- list_add_tail(&dev->close_list, &close_head);
- dev_close_many(&close_head, true);
+ if (old_prog && link->prog != old_prog) {
+ err = -EPERM;
+ goto out_unlock;
+ }
+ old_prog = link->prog;
+ if (old_prog->type != new_prog->type ||
+ old_prog->expected_attach_type != new_prog->expected_attach_type) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
- list_for_each_entry(dev, head, unreg_list) {
- /* And unlink it from device chain. */
- unlist_netdevice(dev);
+ if (old_prog == new_prog) {
+ /* no-op, don't disturb drivers */
+ bpf_prog_put(new_prog);
+ goto out_unlock;
+ }
- dev->reg_state = NETREG_UNREGISTERING;
+ netdev_lock_ops(xdp_link->dev);
+ mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
+ bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
+ err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
+ xdp_link->flags, new_prog);
+ netdev_unlock_ops(xdp_link->dev);
+ if (err)
+ goto out_unlock;
+
+ old_prog = xchg(&link->prog, new_prog);
+ bpf_prog_put(old_prog);
+
+out_unlock:
+ rtnl_unlock();
+ return err;
+}
+
+static const struct bpf_link_ops bpf_xdp_link_lops = {
+ .release = bpf_xdp_link_release,
+ .dealloc = bpf_xdp_link_dealloc,
+ .detach = bpf_xdp_link_detach,
+ .show_fdinfo = bpf_xdp_link_show_fdinfo,
+ .fill_link_info = bpf_xdp_link_fill_link_info,
+ .update_prog = bpf_xdp_link_update,
+};
+
+int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct net *net = current->nsproxy->net_ns;
+ struct bpf_link_primer link_primer;
+ struct netlink_ext_ack extack = {};
+ struct bpf_xdp_link *link;
+ struct net_device *dev;
+ int err, fd;
+
+ rtnl_lock();
+ dev = dev_get_by_index(net, attr->link_create.target_ifindex);
+ if (!dev) {
+ rtnl_unlock();
+ return -EINVAL;
}
- flush_all_backlogs();
- synchronize_net();
+ link = kzalloc(sizeof(*link), GFP_USER);
+ if (!link) {
+ err = -ENOMEM;
+ goto unlock;
+ }
- list_for_each_entry(dev, head, unreg_list) {
- struct sk_buff *skb = NULL;
+ bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog,
+ attr->link_create.attach_type);
+ link->dev = dev;
+ link->flags = attr->link_create.flags;
- /* Shutdown queueing discipline. */
- dev_shutdown(dev);
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err) {
+ kfree(link);
+ goto unlock;
+ }
- dev_xdp_uninstall(dev);
+ netdev_lock_ops(dev);
+ err = dev_xdp_attach_link(dev, &extack, link);
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
- /* Notify protocols, that we are about to destroy
- * this device. They should clean all the things.
- */
- call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+ if (err) {
+ link->dev = NULL;
+ bpf_link_cleanup(&link_primer);
+ trace_bpf_xdp_link_attach_failed(extack._msg);
+ goto out_put_dev;
+ }
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
- skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
- GFP_KERNEL, NULL, 0);
+ fd = bpf_link_settle(&link_primer);
+ /* link itself doesn't hold dev's refcnt to not complicate shutdown */
+ dev_put(dev);
+ return fd;
- /*
- * Flush the unicast and multicast chains
- */
- dev_uc_flush(dev);
- dev_mc_flush(dev);
+unlock:
+ rtnl_unlock();
- if (dev->netdev_ops->ndo_uninit)
- dev->netdev_ops->ndo_uninit(dev);
+out_put_dev:
+ dev_put(dev);
+ return err;
+}
- if (skb)
- rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
+/**
+ * dev_change_xdp_fd - set or clear a bpf program for a device rx path
+ * @dev: device
+ * @extack: netlink extended ack
+ * @fd: new program fd or negative value to clear
+ * @expected_fd: old program fd that userspace expects to replace or clear
+ * @flags: xdp-related flags
+ *
+ * Set or clear a bpf program for a device
+ */
+int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
+ int fd, int expected_fd, u32 flags)
+{
+ enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
+ struct bpf_prog *new_prog = NULL, *old_prog = NULL;
+ int err;
- /* Notifier chain MUST detach us all upper devices. */
- WARN_ON(netdev_has_any_upper_dev(dev));
- WARN_ON(netdev_has_any_lower_dev(dev));
+ ASSERT_RTNL();
- /* Remove entries from kobject tree */
- netdev_unregister_kobject(dev);
-#ifdef CONFIG_XPS
- /* Remove XPS queueing entries */
- netif_reset_xps_queues_gt(dev, 0);
-#endif
+ if (fd >= 0) {
+ new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
+ mode != XDP_MODE_SKB);
+ if (IS_ERR(new_prog))
+ return PTR_ERR(new_prog);
+ }
+
+ if (expected_fd >= 0) {
+ old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
+ mode != XDP_MODE_SKB);
+ if (IS_ERR(old_prog)) {
+ err = PTR_ERR(old_prog);
+ old_prog = NULL;
+ goto err_out;
+ }
}
- synchronize_net();
+ err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
- list_for_each_entry(dev, head, unreg_list)
- dev_put(dev);
+err_out:
+ if (err && new_prog)
+ bpf_prog_put(new_prog);
+ if (old_prog)
+ bpf_prog_put(old_prog);
+ return err;
}
-static void rollback_registered(struct net_device *dev)
+u32 dev_get_min_mp_channel_count(const struct net_device *dev)
{
- LIST_HEAD(single);
+ int i;
- list_add(&dev->unreg_list, &single);
- rollback_registered_many(&single);
- list_del(&single);
+ netdev_ops_assert_locked(dev);
+
+ for (i = dev->real_num_rx_queues - 1; i >= 0; i--)
+ if (dev->_rx[i].mp_params.mp_priv)
+ /* The channel count is the idx plus 1. */
+ return i + 1;
+
+ return 0;
+}
+
+/**
+ * dev_index_reserve() - allocate an ifindex in a namespace
+ * @net: the applicable net namespace
+ * @ifindex: requested ifindex, pass %0 to get one allocated
+ *
+ * Allocate an ifindex for a new device. Caller must either use the ifindex
+ * to store the device (via list_netdevice()) or call dev_index_release()
+ * to give the index up.
+ *
+ * Return: a suitable unique value for a new device interface number or -errno.
+ */
+static int dev_index_reserve(struct net *net, u32 ifindex)
+{
+ int err;
+
+ if (ifindex > INT_MAX) {
+ DEBUG_NET_WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+
+ if (!ifindex)
+ err = xa_alloc_cyclic(&net->dev_by_index, &ifindex, NULL,
+ xa_limit_31b, &net->ifindex, GFP_KERNEL);
+ else
+ err = xa_insert(&net->dev_by_index, ifindex, NULL, GFP_KERNEL);
+ if (err < 0)
+ return err;
+
+ return ifindex;
+}
+
+static void dev_index_release(struct net *net, int ifindex)
+{
+ /* Expect only unused indexes, unlist_netdevice() removes the used */
+ WARN_ON(xa_erase(&net->dev_by_index, ifindex));
+}
+
+static bool from_cleanup_net(void)
+{
+#ifdef CONFIG_NET_NS
+ return current == READ_ONCE(cleanup_net_task);
+#else
+ return false;
+#endif
+}
+
+/* Delayed registration/unregistration */
+LIST_HEAD(net_todo_list);
+DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
+atomic_t dev_unreg_count = ATOMIC_INIT(0);
+
+static void net_set_todo(struct net_device *dev)
+{
+ list_add_tail(&dev->todo_list, &net_todo_list);
}
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
@@ -8002,7 +10806,7 @@ static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
netdev_features_t feature;
int feature_bit;
- for_each_netdev_feature(&upper_disables, feature_bit) {
+ for_each_netdev_feature(upper_disables, feature_bit) {
feature = __NETIF_F_BIT(feature_bit);
if (!(upper->wanted_features & feature)
&& (features & feature)) {
@@ -8022,21 +10826,34 @@ static void netdev_sync_lower_features(struct net_device *upper,
netdev_features_t feature;
int feature_bit;
- for_each_netdev_feature(&upper_disables, feature_bit) {
+ for_each_netdev_feature(upper_disables, feature_bit) {
feature = __NETIF_F_BIT(feature_bit);
if (!(features & feature) && (lower->features & feature)) {
netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
&feature, lower->name);
+ netdev_lock_ops(lower);
lower->wanted_features &= ~feature;
- netdev_update_features(lower);
+ __netdev_update_features(lower);
if (unlikely(lower->features & feature))
netdev_WARN(upper, "failed to disable %pNF on %s!\n",
&feature, lower->name);
+ else
+ netdev_features_change(lower);
+ netdev_unlock_ops(lower);
}
}
}
+static bool netdev_has_ip_or_hw_csum(netdev_features_t features)
+{
+ netdev_features_t ip_csum_mask = NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+ bool ip_csum = (features & ip_csum_mask) == ip_csum_mask;
+ bool hw_csum = features & NETIF_F_HW_CSUM;
+
+ return ip_csum || hw_csum;
+}
+
static netdev_features_t netdev_fix_features(struct net_device *dev,
netdev_features_t features)
{
@@ -8113,6 +10930,26 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
}
}
+ if ((features & NETIF_F_GRO_HW) && (features & NETIF_F_LRO)) {
+ netdev_dbg(dev, "Dropping LRO feature since HW-GRO is requested.\n");
+ features &= ~NETIF_F_LRO;
+ }
+
+ if ((features & NETIF_F_HW_TLS_TX) && !netdev_has_ip_or_hw_csum(features)) {
+ netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
+ features &= ~NETIF_F_HW_TLS_TX;
+ }
+
+ if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
+ netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
+ features &= ~NETIF_F_HW_TLS_RX;
+ }
+
+ if ((features & NETIF_F_GSO_UDP_L4) && !netdev_has_ip_or_hw_csum(features)) {
+ netdev_dbg(dev, "Dropping USO feature since no CSUM feature.\n");
+ features &= ~NETIF_F_GSO_UDP_L4;
+ }
+
return features;
}
@@ -8124,6 +10961,7 @@ int __netdev_update_features(struct net_device *dev)
int err = -1;
ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
features = netdev_get_wanted_features(dev);
@@ -8133,7 +10971,7 @@ int __netdev_update_features(struct net_device *dev)
/* driver might be less strict about feature dependencies */
features = netdev_fix_features(dev, features);
- /* some features can't be enabled if they're off an an upper device */
+ /* some features can't be enabled if they're off on an upper device */
netdev_for_each_upper_dev_rcu(dev, upper, iter)
features = netdev_sync_upper_features(dev, upper, features);
@@ -8176,12 +11014,14 @@ sync_lower:
* *before* calling udp_tunnel_get_rx_info,
* but *after* calling udp_tunnel_drop_rx_info.
*/
+ udp_tunnel_nic_lock(dev);
if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
dev->features = features;
udp_tunnel_get_rx_info(dev);
} else {
udp_tunnel_drop_rx_info(dev);
}
+ udp_tunnel_nic_unlock(dev);
}
if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
@@ -8257,6 +11097,11 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev,
else
netif_dormant_off(dev);
+ if (rootdev->operstate == IF_OPER_TESTING)
+ netif_testing_on(dev);
+ else
+ netif_testing_off(dev);
+
if (netif_carrier_ok(rootdev))
netif_carrier_on(dev);
else
@@ -8273,7 +11118,7 @@ static int netif_alloc_rx_queues(struct net_device *dev)
BUG_ON(count < 1);
- rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ rx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!rx)
return -ENOMEM;
@@ -8283,7 +11128,7 @@ static int netif_alloc_rx_queues(struct net_device *dev)
rx[i].dev = dev;
/* XDP RX-queue setup */
- err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i);
+ err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
if (err < 0)
goto err_rxq_info;
}
@@ -8340,7 +11185,7 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
if (count < 1 || count > 0xffff)
return -EINVAL;
- tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ tx = kvzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
if (!tx)
return -ENOMEM;
@@ -8364,23 +11209,74 @@ void netif_tx_stop_all_queues(struct net_device *dev)
}
EXPORT_SYMBOL(netif_tx_stop_all_queues);
+static int netdev_do_alloc_pcpu_stats(struct net_device *dev)
+{
+ void __percpu *v;
+
+ /* Drivers implementing ndo_get_peer_dev must support tstat
+ * accounting, so that skb_do_redirect() can bump the dev's
+ * RX stats upon network namespace switch.
+ */
+ if (dev->netdev_ops->ndo_get_peer_dev &&
+ dev->pcpu_stat_type != NETDEV_PCPU_STAT_TSTATS)
+ return -EOPNOTSUPP;
+
+ switch (dev->pcpu_stat_type) {
+ case NETDEV_PCPU_STAT_NONE:
+ return 0;
+ case NETDEV_PCPU_STAT_LSTATS:
+ v = dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats);
+ break;
+ case NETDEV_PCPU_STAT_TSTATS:
+ v = dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ break;
+ case NETDEV_PCPU_STAT_DSTATS:
+ v = dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return v ? 0 : -ENOMEM;
+}
+
+static void netdev_do_free_pcpu_stats(struct net_device *dev)
+{
+ switch (dev->pcpu_stat_type) {
+ case NETDEV_PCPU_STAT_NONE:
+ return;
+ case NETDEV_PCPU_STAT_LSTATS:
+ free_percpu(dev->lstats);
+ break;
+ case NETDEV_PCPU_STAT_TSTATS:
+ free_percpu(dev->tstats);
+ break;
+ case NETDEV_PCPU_STAT_DSTATS:
+ free_percpu(dev->dstats);
+ break;
+ }
+}
+
+static void netdev_free_phy_link_topology(struct net_device *dev)
+{
+ struct phy_link_topology *topo = dev->link_topo;
+
+ if (IS_ENABLED(CONFIG_PHYLIB) && topo) {
+ xa_destroy(&topo->phys);
+ kfree(topo);
+ dev->link_topo = NULL;
+ }
+}
+
/**
- * register_netdevice - register a network device
- * @dev: device to register
+ * register_netdevice() - register a network device
+ * @dev: device to register
*
- * Take a completed network device structure and add it to the kernel
- * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
- * chain. 0 is returned on success. A negative errno code is returned
- * on a failure to set up the device, or if the name is a duplicate.
- *
- * Callers must hold the rtnl semaphore. You may want
- * register_netdev() instead of this.
- *
- * BUGS:
- * The locking appears insufficient to guarantee two parallel registers
- * will not get the same name.
+ * Take a prepared network device structure and make it externally accessible.
+ * A %NETDEV_REGISTER message is sent to the netdev notifier chain.
+ * Callers must hold the rtnl lock - you may want register_netdev()
+ * instead of this.
*/
-
int register_netdevice(struct net_device *dev)
{
int ret;
@@ -8397,6 +11293,14 @@ int register_netdevice(struct net_device *dev)
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
BUG_ON(!net);
+ ret = ethtool_check_ops(dev->ethtool_ops);
+ if (ret)
+ return ret;
+
+ /* rss ctx ID 0 is reserved for the default context, start from 1 */
+ xa_init_flags(&dev->ethtool->rss_ctx, XA_FLAGS_ALLOC1);
+ mutex_init(&dev->ethtool->rss_lock);
+
spin_lock_init(&dev->addr_list_lock);
netdev_set_addr_lockdep_class(dev);
@@ -8404,13 +11308,18 @@ int register_netdevice(struct net_device *dev)
if (ret < 0)
goto out;
+ ret = -ENOMEM;
+ dev->name_node = netdev_name_node_head_alloc(dev);
+ if (!dev->name_node)
+ goto out;
+
/* Init, if this function is available */
if (dev->netdev_ops->ndo_init) {
ret = dev->netdev_ops->ndo_init(dev);
if (ret) {
if (ret > 0)
ret = -EIO;
- goto out;
+ goto err_free_name;
}
}
@@ -8423,19 +11332,22 @@ int register_netdevice(struct net_device *dev)
goto err_uninit;
}
- ret = -EBUSY;
- if (!dev->ifindex)
- dev->ifindex = dev_new_index(net);
- else if (__dev_get_by_index(net, dev->ifindex))
+ ret = netdev_do_alloc_pcpu_stats(dev);
+ if (ret)
goto err_uninit;
+ ret = dev_index_reserve(net, dev->ifindex);
+ if (ret < 0)
+ goto err_free_pcpu;
+ dev->ifindex = ret;
+
/* Transfer changeable features to wanted_features and enable
* software offloads (GSO and GRO).
*/
- dev->hw_features |= NETIF_F_SOFT_FEATURES;
+ dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
dev->features |= NETIF_F_SOFT_FEATURES;
- if (dev->netdev_ops->ndo_udp_tunnel_add) {
+ if (dev->udp_tunnel_nic_info) {
dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
}
@@ -8474,14 +11386,20 @@ int register_netdevice(struct net_device *dev)
ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
ret = notifier_to_errno(ret);
if (ret)
- goto err_uninit;
+ goto err_ifindex_release;
ret = netdev_register_kobject(dev);
+
+ netdev_lock(dev);
+ WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
+ netdev_unlock(dev);
+
if (ret)
- goto err_uninit;
- dev->reg_state = NETREG_REGISTERED;
+ goto err_uninit_notify;
+ netdev_lock_ops(dev);
__netdev_update_features(dev);
+ netdev_unlock_ops(dev);
/*
* Default initial state at registry is that the
@@ -8493,8 +11411,10 @@ int register_netdevice(struct net_device *dev)
linkwatch_init_dev(dev);
dev_init_scheduler(dev);
- dev_hold(dev);
+
+ netdev_hold(dev, &dev->dev_registered_tracker, GFP_KERNEL);
list_netdevice(dev);
+
add_device_randomness(dev->dev_addr, dev->addr_len);
/* If the device has permanent device address, driver should
@@ -8505,59 +11425,54 @@ int register_netdevice(struct net_device *dev)
memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
/* Notify protocols, that a new device appeared. */
+ netdev_lock_ops(dev);
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
+ netdev_unlock_ops(dev);
ret = notifier_to_errno(ret);
if (ret) {
- rollback_registered(dev);
- dev->reg_state = NETREG_UNREGISTERED;
+ /* Expect explicit free_netdev() on failure */
+ dev->needs_free_netdev = false;
+ unregister_netdevice_queue(dev, NULL);
+ goto out;
}
/*
* Prevent userspace races by waiting until the network
* device is fully setup before sending notifications.
*/
- if (!dev->rtnl_link_ops ||
- dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
- rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
+ if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
out:
return ret;
+err_uninit_notify:
+ call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
+err_ifindex_release:
+ dev_index_release(net, dev->ifindex);
+err_free_pcpu:
+ netdev_do_free_pcpu_stats(dev);
err_uninit:
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
if (dev->priv_destructor)
dev->priv_destructor(dev);
+err_free_name:
+ netdev_name_node_free(dev->name_node);
goto out;
}
EXPORT_SYMBOL(register_netdevice);
-/**
- * init_dummy_netdev - init a dummy network device for NAPI
- * @dev: device to init
- *
- * This takes a network device structure and initialize the minimum
- * amount of fields so it can be used to schedule NAPI polls without
- * registering a full blown interface. This is to be used by drivers
- * that need to tie several hardware interfaces to a single NAPI
- * poll scheduler due to HW limitations.
+/* Initialize the core of a dummy net device.
+ * These are the setup steps that dummy netdevs need and that normal netdevs
+ * get by going through register_netdevice().
*/
-int init_dummy_netdev(struct net_device *dev)
+static void init_dummy_netdev(struct net_device *dev)
{
- /* Clear everything. Note we don't initialize spinlocks
- * are they aren't supposed to be taken by any of the
- * NAPI code and this dummy netdev is supposed to be
- * only ever used for NAPI polls
- */
- memset(dev, 0, sizeof(struct net_device));
-
/* make sure we BUG if trying to hit standard
* register/unregister code path
*/
dev->reg_state = NETREG_DUMMY;
- /* NAPI wants this */
- INIT_LIST_HEAD(&dev->napi_list);
-
/* a dummy interface is started by default */
set_bit(__LINK_STATE_PRESENT, &dev->state);
set_bit(__LINK_STATE_START, &dev->state);
@@ -8566,11 +11481,7 @@ int init_dummy_netdev(struct net_device *dev)
* because users of this 'device' dont need to change
* its refcount.
*/
-
- return 0;
}
-EXPORT_SYMBOL_GPL(init_dummy_netdev);
-
/**
* register_netdev - register a network device
@@ -8587,29 +11498,41 @@ EXPORT_SYMBOL_GPL(init_dummy_netdev);
*/
int register_netdev(struct net_device *dev)
{
+ struct net *net = dev_net(dev);
int err;
- if (rtnl_lock_killable())
+ if (rtnl_net_lock_killable(net))
return -EINTR;
+
err = register_netdevice(dev);
- rtnl_unlock();
+
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(register_netdev);
int netdev_refcnt_read(const struct net_device *dev)
{
+#ifdef CONFIG_PCPU_DEV_REFCNT
int i, refcnt = 0;
for_each_possible_cpu(i)
refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
return refcnt;
+#else
+ return refcount_read(&dev->dev_refcnt);
+#endif
}
EXPORT_SYMBOL(netdev_refcnt_read);
+int netdev_unregister_timeout_secs __read_mostly = 10;
+
+#define WAIT_REFS_MIN_MSECS 1
+#define WAIT_REFS_MAX_MSECS 250
/**
- * netdev_wait_allrefs - wait until all references are gone.
- * @dev: target net_device
+ * netdev_wait_allrefs_any - wait for any listed device to drop to one ref.
+ * @list: list of net_devices to wait on
*
* This is called when unregistering network devices.
*
@@ -8619,50 +11542,69 @@ EXPORT_SYMBOL(netdev_refcnt_read);
* We can get stuck here if buggy protocols don't correctly
* call dev_put.
*/
-static void netdev_wait_allrefs(struct net_device *dev)
+static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
{
unsigned long rebroadcast_time, warning_time;
- int refcnt;
-
- linkwatch_forget_dev(dev);
+ struct net_device *dev;
+ int wait = 0;
rebroadcast_time = warning_time = jiffies;
- refcnt = netdev_refcnt_read(dev);
- while (refcnt != 0) {
+ list_for_each_entry(dev, list, todo_list)
+ if (netdev_refcnt_read(dev) == 1)
+ return dev;
+
+ while (true) {
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock();
/* Rebroadcast unregister notification */
- call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+ list_for_each_entry(dev, list, todo_list)
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
__rtnl_unlock();
rcu_barrier();
rtnl_lock();
- if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
- &dev->state)) {
- /* We must not have linkwatch events
- * pending on unregister. If this
- * happens, we simply run the queue
- * unscheduled, resulting in a noop
- * for this device.
- */
- linkwatch_run_queue();
- }
+ list_for_each_entry(dev, list, todo_list)
+ if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
+ &dev->state)) {
+ /* We must not have linkwatch events
+ * pending on unregister. If this
+ * happens, we simply run the queue
+ * unscheduled, resulting in a noop
+ * for this device.
+ */
+ linkwatch_run_queue();
+ break;
+ }
__rtnl_unlock();
rebroadcast_time = jiffies;
}
- msleep(250);
+ rcu_barrier();
- refcnt = netdev_refcnt_read(dev);
+ if (!wait) {
+ wait = WAIT_REFS_MIN_MSECS;
+ } else {
+ msleep(wait);
+ wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
+ }
+
+ list_for_each_entry(dev, list, todo_list)
+ if (netdev_refcnt_read(dev) == 1)
+ return dev;
+
+ if (time_after(jiffies, warning_time +
+ READ_ONCE(netdev_unregister_timeout_secs) * HZ)) {
+ list_for_each_entry(dev, list, todo_list) {
+ pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
+ dev->name, netdev_refcnt_read(dev));
+ ref_tracker_dir_print(&dev->refcnt_tracker, 10);
+ }
- if (time_after(jiffies, warning_time + 10 * HZ)) {
- pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
- dev->name, refcnt);
warning_time = jiffies;
}
}
@@ -8694,57 +11636,117 @@ static void netdev_wait_allrefs(struct net_device *dev)
*/
void netdev_run_todo(void)
{
+ struct net_device *dev, *tmp;
struct list_head list;
+ int cnt;
+#ifdef CONFIG_LOCKDEP
+ struct list_head unlink_list;
+
+ list_replace_init(&net_unlink_list, &unlink_list);
+
+ while (!list_empty(&unlink_list)) {
+ dev = list_first_entry(&unlink_list, struct net_device,
+ unlink_list);
+ list_del_init(&dev->unlink_list);
+ dev->nested_level = dev->lower_level - 1;
+ }
+#endif
/* Snapshot list, allow later requests */
list_replace_init(&net_todo_list, &list);
__rtnl_unlock();
-
/* Wait for rcu callbacks to finish before next phase */
if (!list_empty(&list))
rcu_barrier();
- while (!list_empty(&list)) {
- struct net_device *dev
- = list_first_entry(&list, struct net_device, todo_list);
- list_del(&dev->todo_list);
-
+ list_for_each_entry_safe(dev, tmp, &list, todo_list) {
if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
- pr_err("network todo '%s' but state %d\n",
- dev->name, dev->reg_state);
- dump_stack();
+ netdev_WARN(dev, "run_todo but not unregistering\n");
+ list_del(&dev->todo_list);
continue;
}
- dev->reg_state = NETREG_UNREGISTERED;
+ netdev_lock(dev);
+ WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
+ netdev_unlock(dev);
+ linkwatch_sync_dev(dev);
+ }
- netdev_wait_allrefs(dev);
+ cnt = 0;
+ while (!list_empty(&list)) {
+ dev = netdev_wait_allrefs_any(&list);
+ list_del(&dev->todo_list);
/* paranoia */
- BUG_ON(netdev_refcnt_read(dev));
+ BUG_ON(netdev_refcnt_read(dev) != 1);
BUG_ON(!list_empty(&dev->ptype_all));
BUG_ON(!list_empty(&dev->ptype_specific));
WARN_ON(rcu_access_pointer(dev->ip_ptr));
WARN_ON(rcu_access_pointer(dev->ip6_ptr));
-#if IS_ENABLED(CONFIG_DECNET)
- WARN_ON(dev->dn_ptr);
-#endif
+
+ netdev_do_free_pcpu_stats(dev);
if (dev->priv_destructor)
dev->priv_destructor(dev);
if (dev->needs_free_netdev)
free_netdev(dev);
- /* Report a network device has been unregistered */
- rtnl_lock();
- dev_net(dev)->dev_unreg_count--;
- __rtnl_unlock();
- wake_up(&netdev_unregistering_wq);
+ cnt++;
/* Free network device */
kobject_put(&dev->dev.kobj);
}
+ if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count))
+ wake_up(&netdev_unregistering_wq);
+}
+
+/* Collate per-cpu network dstats statistics
+ *
+ * Read per-cpu network statistics from dev->dstats and populate the related
+ * fields in @s.
+ */
+static void dev_fetch_dstats(struct rtnl_link_stats64 *s,
+ const struct pcpu_dstats __percpu *dstats)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ u64 rx_packets, rx_bytes, rx_drops;
+ u64 tx_packets, tx_bytes, tx_drops;
+ const struct pcpu_dstats *stats;
+ unsigned int start;
+
+ stats = per_cpu_ptr(dstats, cpu);
+ do {
+ start = u64_stats_fetch_begin(&stats->syncp);
+ rx_packets = u64_stats_read(&stats->rx_packets);
+ rx_bytes = u64_stats_read(&stats->rx_bytes);
+ rx_drops = u64_stats_read(&stats->rx_drops);
+ tx_packets = u64_stats_read(&stats->tx_packets);
+ tx_bytes = u64_stats_read(&stats->tx_bytes);
+ tx_drops = u64_stats_read(&stats->tx_drops);
+ } while (u64_stats_fetch_retry(&stats->syncp, start));
+
+ s->rx_packets += rx_packets;
+ s->rx_bytes += rx_bytes;
+ s->rx_dropped += rx_drops;
+ s->tx_packets += tx_packets;
+ s->tx_bytes += tx_bytes;
+ s->tx_dropped += tx_drops;
+ }
+}
+
+/* ndo_get_stats64 implementation for dstats-based accounting.
+ *
+ * Populate @s from dev->stats and dev->dstats. This is used internally by the
+ * core for NETDEV_PCPU_STAT_DSTATS-type stats collection.
+ */
+static void dev_get_dstats64(const struct net_device *dev,
+ struct rtnl_link_stats64 *s)
+{
+ netdev_stats_to_stats64(s, &dev->stats);
+ dev_fetch_dstats(s, dev->dstats);
}
/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
@@ -8755,27 +11757,51 @@ void netdev_run_todo(void)
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
const struct net_device_stats *netdev_stats)
{
-#if BITS_PER_LONG == 64
- BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
- memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
- /* zero out counters that only exist in rtnl_link_stats64 */
- memset((char *)stats64 + sizeof(*netdev_stats), 0,
- sizeof(*stats64) - sizeof(*netdev_stats));
-#else
- size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
- const unsigned long *src = (const unsigned long *)netdev_stats;
+ size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t);
+ const atomic_long_t *src = (atomic_long_t *)netdev_stats;
u64 *dst = (u64 *)stats64;
BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
for (i = 0; i < n; i++)
- dst[i] = src[i];
+ dst[i] = (unsigned long)atomic_long_read(&src[i]);
/* zero out counters that only exist in rtnl_link_stats64 */
memset((char *)stats64 + n * sizeof(u64), 0,
sizeof(*stats64) - n * sizeof(u64));
-#endif
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
+static __cold struct net_device_core_stats __percpu *netdev_core_stats_alloc(
+ struct net_device *dev)
+{
+ struct net_device_core_stats __percpu *p;
+
+ p = alloc_percpu_gfp(struct net_device_core_stats,
+ GFP_ATOMIC | __GFP_NOWARN);
+
+ if (p && cmpxchg(&dev->core_stats, NULL, p))
+ free_percpu(p);
+
+ /* This READ_ONCE() pairs with the cmpxchg() above */
+ return READ_ONCE(dev->core_stats);
+}
+
+noinline void netdev_core_stats_inc(struct net_device *dev, u32 offset)
+{
+ /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
+ struct net_device_core_stats __percpu *p = READ_ONCE(dev->core_stats);
+ unsigned long __percpu *field;
+
+ if (unlikely(!p)) {
+ p = netdev_core_stats_alloc(dev);
+ if (!p)
+ return;
+ }
+
+ field = (unsigned long __percpu *)((void __percpu *)p + offset);
+ this_cpu_inc(*field);
+}
+EXPORT_SYMBOL_GPL(netdev_core_stats_inc);
+
/**
* dev_get_stats - get network device statistics
* @dev: device to get statistics from
@@ -8790,22 +11816,102 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
struct rtnl_link_stats64 *storage)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ const struct net_device_core_stats __percpu *p;
+
+ /*
+ * IPv{4,6} and udp tunnels share common stat helpers and use
+ * different stat type (NETDEV_PCPU_STAT_TSTATS vs
+ * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent.
+ */
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) !=
+ offsetof(struct pcpu_dstats, rx_bytes));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) !=
+ offsetof(struct pcpu_dstats, rx_packets));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) !=
+ offsetof(struct pcpu_dstats, tx_bytes));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) !=
+ offsetof(struct pcpu_dstats, tx_packets));
if (ops->ndo_get_stats64) {
memset(storage, 0, sizeof(*storage));
ops->ndo_get_stats64(dev, storage);
} else if (ops->ndo_get_stats) {
netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
+ } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_TSTATS) {
+ dev_get_tstats64(dev, storage);
+ } else if (dev->pcpu_stat_type == NETDEV_PCPU_STAT_DSTATS) {
+ dev_get_dstats64(dev, storage);
} else {
netdev_stats_to_stats64(storage, &dev->stats);
}
- storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
- storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
- storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
+
+ /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
+ p = READ_ONCE(dev->core_stats);
+ if (p) {
+ const struct net_device_core_stats *core_stats;
+ int i;
+
+ for_each_possible_cpu(i) {
+ core_stats = per_cpu_ptr(p, i);
+ storage->rx_dropped += READ_ONCE(core_stats->rx_dropped);
+ storage->tx_dropped += READ_ONCE(core_stats->tx_dropped);
+ storage->rx_nohandler += READ_ONCE(core_stats->rx_nohandler);
+ storage->rx_otherhost_dropped += READ_ONCE(core_stats->rx_otherhost_dropped);
+ }
+ }
return storage;
}
EXPORT_SYMBOL(dev_get_stats);
+/**
+ * dev_fetch_sw_netstats - get per-cpu network device statistics
+ * @s: place to store stats
+ * @netstats: per-cpu network stats to read from
+ *
+ * Read per-cpu network statistics and populate the related fields in @s.
+ */
+void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
+ const struct pcpu_sw_netstats __percpu *netstats)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+ const struct pcpu_sw_netstats *stats;
+ unsigned int start;
+
+ stats = per_cpu_ptr(netstats, cpu);
+ do {
+ start = u64_stats_fetch_begin(&stats->syncp);
+ rx_packets = u64_stats_read(&stats->rx_packets);
+ rx_bytes = u64_stats_read(&stats->rx_bytes);
+ tx_packets = u64_stats_read(&stats->tx_packets);
+ tx_bytes = u64_stats_read(&stats->tx_bytes);
+ } while (u64_stats_fetch_retry(&stats->syncp, start));
+
+ s->rx_packets += rx_packets;
+ s->rx_bytes += rx_bytes;
+ s->tx_packets += tx_packets;
+ s->tx_bytes += tx_bytes;
+ }
+}
+EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
+
+/**
+ * dev_get_tstats64 - ndo_get_stats64 implementation
+ * @dev: device to get statistics from
+ * @s: place to store stats
+ *
+ * Populate @s from dev->stats and dev->tstats. Can be used as
+ * ndo_get_stats64() callback.
+ */
+void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
+{
+ netdev_stats_to_stats64(s, &dev->stats);
+ dev_fetch_sw_netstats(s, dev->tstats);
+}
+EXPORT_SYMBOL_GPL(dev_get_tstats64);
+
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
struct netdev_queue *queue = dev_ingress_queue(dev);
@@ -8818,7 +11924,7 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
return NULL;
netdev_init_one_queue(dev, queue, NULL);
RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
- queue->qdisc_sleeping = &noop_qdisc;
+ RCU_INIT_POINTER(queue->qdisc_sleeping, &noop_qdisc);
rcu_assign_pointer(dev->ingress_queue, queue);
#endif
return queue;
@@ -8834,12 +11940,23 @@ void netdev_set_default_ethtool_ops(struct net_device *dev,
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
-void netdev_freemem(struct net_device *dev)
+/**
+ * netdev_sw_irq_coalesce_default_on() - enable SW IRQ coalescing by default
+ * @dev: netdev to enable the IRQ coalescing on
+ *
+ * Sets a conservative default for SW IRQ coalescing. Users can use
+ * sysfs attributes to override the default values.
+ */
+void netdev_sw_irq_coalesce_default_on(struct net_device *dev)
{
- char *addr = (char *)dev - dev->padded;
+ WARN_ON(dev->reg_state == NETREG_REGISTERED);
- kvfree(addr);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ netdev_set_gro_flush_timeout(dev, 20000);
+ netdev_set_defer_hard_irqs(dev, 1);
+ }
}
+EXPORT_SYMBOL_GPL(netdev_sw_irq_coalesce_default_on);
/**
* alloc_netdev_mqs - allocate network device
@@ -8860,8 +11977,8 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
- unsigned int alloc_size;
- struct net_device *p;
+ size_t napi_config_sz;
+ unsigned int maxqs;
BUG_ON(strlen(name) >= sizeof(dev->name));
@@ -8875,25 +11992,24 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
return NULL;
}
- alloc_size = sizeof(struct net_device);
- if (sizeof_priv) {
- /* ensure 32-byte alignment of private area */
- alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
- alloc_size += sizeof_priv;
- }
- /* ensure 32-byte alignment of whole construct */
- alloc_size += NETDEV_ALIGN - 1;
+ maxqs = max(txqs, rxqs);
- p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
- if (!p)
+ dev = kvzalloc(struct_size(dev, priv, sizeof_priv),
+ GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
+ if (!dev)
return NULL;
- dev = PTR_ALIGN(p, NETDEV_ALIGN);
- dev->padded = (char *)dev - (char *)p;
+ dev->priv_len = sizeof_priv;
+ ref_tracker_dir_init(&dev->refcnt_tracker, 128, "netdev");
+#ifdef CONFIG_PCPU_DEV_REFCNT
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
goto free_dev;
+ __dev_hold(dev);
+#else
+ refcount_set(&dev->dev_refcnt, 1);
+#endif
if (dev_addr_init(dev))
goto free_pcpu;
@@ -8903,8 +12019,20 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev_net_set(dev, &init_net);
- dev->gso_max_size = GSO_MAX_SIZE;
+ dev->gso_max_size = GSO_LEGACY_MAX_SIZE;
+ dev->xdp_zc_max_segs = 1;
dev->gso_max_segs = GSO_MAX_SEGS;
+ dev->gro_max_size = GRO_LEGACY_MAX_SIZE;
+ dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE;
+ dev->gro_ipv4_max_size = GRO_LEGACY_MAX_SIZE;
+ dev->tso_max_size = TSO_LEGACY_MAX_SIZE;
+ dev->tso_max_segs = TSO_MAX_SEGS;
+ dev->upper_level = 1;
+ dev->lower_level = 1;
+#ifdef CONFIG_LOCKDEP
+ dev->nested_level = 0;
+ INIT_LIST_HEAD(&dev->unlink_list);
+#endif
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
@@ -8914,9 +12042,13 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->adj_list.lower);
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
+ INIT_LIST_HEAD(&dev->net_notifier_list);
#ifdef CONFIG_NET_SCHED
hash_init(dev->qdisc_hash);
#endif
+
+ mutex_init(&dev->lock);
+
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
@@ -8934,14 +12066,28 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->real_num_rx_queues = rxqs;
if (netif_alloc_rx_queues(dev))
goto free_all;
+ dev->ethtool = kzalloc(sizeof(*dev->ethtool), GFP_KERNEL_ACCOUNT);
+ if (!dev->ethtool)
+ goto free_all;
- strcpy(dev->name, name);
+ dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT);
+ if (!dev->cfg)
+ goto free_all;
+ dev->cfg_pending = dev->cfg;
+
+ dev->num_napi_configs = maxqs;
+ napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
+ dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
+ if (!dev->napi_config)
+ goto free_all;
+
+ strscpy(dev->name, name);
dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
if (!dev->ethtool_ops)
dev->ethtool_ops = &default_ethtool_ops;
- nf_hook_ingress_init(dev);
+ nf_hook_netdev_init(dev);
return dev;
@@ -8950,13 +12096,31 @@ free_all:
return NULL;
free_pcpu:
+#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
free_dev:
- netdev_freemem(dev);
+#endif
+ kvfree(dev);
return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
+static void netdev_napi_exit(struct net_device *dev)
+{
+ if (!list_empty(&dev->napi_list)) {
+ struct napi_struct *p, *n;
+
+ netdev_lock(dev);
+ list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
+ __netif_napi_del_locked(p);
+ netdev_unlock(dev);
+
+ synchronize_net();
+ }
+
+ kvfree(dev->napi_config);
+}
+
/**
* free_netdev - free network device
* @dev: device
@@ -8968,9 +12132,21 @@ EXPORT_SYMBOL(alloc_netdev_mqs);
*/
void free_netdev(struct net_device *dev)
{
- struct napi_struct *p, *n;
-
might_sleep();
+
+ /* When called immediately after register_netdevice() failed, the unwind
+ * handling may still be dismantling the device. Handle that case by
+ * deferring the free.
+ */
+ if (dev->reg_state == NETREG_UNREGISTERING) {
+ ASSERT_RTNL();
+ dev->needs_free_netdev = true;
+ return;
+ }
+
+ WARN_ON(dev->cfg != dev->cfg_pending);
+ kfree(dev->cfg);
+ kfree(dev->ethtool);
netif_free_tx_queues(dev);
netif_free_rx_queues(dev);
@@ -8979,20 +12155,33 @@ void free_netdev(struct net_device *dev)
/* Flush device addresses */
dev_addr_flush(dev);
- list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
- netif_napi_del(p);
+ netdev_napi_exit(dev);
+ netif_del_cpu_rmap(dev);
+
+ ref_tracker_dir_exit(&dev->refcnt_tracker);
+#ifdef CONFIG_PCPU_DEV_REFCNT
free_percpu(dev->pcpu_refcnt);
dev->pcpu_refcnt = NULL;
+#endif
+ free_percpu(dev->core_stats);
+ dev->core_stats = NULL;
+ free_percpu(dev->xdp_bulkq);
+ dev->xdp_bulkq = NULL;
+
+ netdev_free_phy_link_topology(dev);
+
+ mutex_destroy(&dev->lock);
/* Compatibility with error handling in drivers */
- if (dev->reg_state == NETREG_UNINITIALIZED) {
- netdev_freemem(dev);
+ if (dev->reg_state == NETREG_UNINITIALIZED ||
+ dev->reg_state == NETREG_DUMMY) {
+ kvfree(dev);
return;
}
BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
- dev->reg_state = NETREG_RELEASED;
+ WRITE_ONCE(dev->reg_state, NETREG_RELEASED);
/* will free via device release */
put_device(&dev->dev);
@@ -9000,6 +12189,19 @@ void free_netdev(struct net_device *dev)
EXPORT_SYMBOL(free_netdev);
/**
+ * alloc_netdev_dummy - Allocate and initialize a dummy net device.
+ * @sizeof_priv: size of private data to allocate space for
+ *
+ * Return: the allocated net_device on success, NULL otherwise
+ */
+struct net_device *alloc_netdev_dummy(int sizeof_priv)
+{
+ return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
+ init_dummy_netdev);
+}
+EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
+
+/**
* synchronize_net - Synchronize with packet receive processing
*
* Wait for packets currently being received to be done.
@@ -9008,13 +12210,28 @@ EXPORT_SYMBOL(free_netdev);
void synchronize_net(void)
{
might_sleep();
- if (rtnl_is_locked())
+ if (from_cleanup_net() || rtnl_is_locked())
synchronize_rcu_expedited();
else
synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
+static void netdev_rss_contexts_free(struct net_device *dev)
+{
+ struct ethtool_rxfh_context *ctx;
+ unsigned long context;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ xa_for_each(&dev->ethtool->rss_ctx, context, ctx) {
+ xa_erase(&dev->ethtool->rss_ctx, context);
+ dev->ethtool_ops->remove_rxfh_context(dev, ctx, context, NULL);
+ kfree(ctx);
+ }
+ xa_destroy(&dev->ethtool->rss_ctx);
+ mutex_unlock(&dev->ethtool->rss_lock);
+}
+
/**
* unregister_netdevice_queue - remove device from the kernel
* @dev: device
@@ -9035,30 +12252,196 @@ void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
if (head) {
list_move_tail(&dev->unreg_list, head);
} else {
- rollback_registered(dev);
- /* Finish processing unregister after unlock */
- net_set_todo(dev);
+ LIST_HEAD(single);
+
+ list_add(&dev->unreg_list, &single);
+ unregister_netdevice_many(&single);
}
}
EXPORT_SYMBOL(unregister_netdevice_queue);
+static void dev_memory_provider_uninstall(struct net_device *dev)
+{
+ unsigned int i;
+
+ for (i = 0; i < dev->real_num_rx_queues; i++) {
+ struct netdev_rx_queue *rxq = &dev->_rx[i];
+ struct pp_memory_provider_params *p = &rxq->mp_params;
+
+ if (p->mp_ops && p->mp_ops->uninstall)
+ p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq);
+ }
+}
+
+/* devices must be UP and netdev_lock()'d */
+static void netif_close_many_and_unlock(struct list_head *close_head)
+{
+ struct net_device *dev, *tmp;
+
+ netif_close_many(close_head, false);
+
+ /* ... now unlock them */
+ list_for_each_entry_safe(dev, tmp, close_head, close_list) {
+ netdev_unlock(dev);
+ list_del_init(&dev->close_list);
+ }
+}
+
+static void netif_close_many_and_unlock_cond(struct list_head *close_head)
+{
+#ifdef CONFIG_LOCKDEP
+ /* We can only track up to MAX_LOCK_DEPTH locks per task.
+ *
+ * Reserve half the available slots for additional locks possibly
+ * taken by notifiers and (soft)irqs.
+ */
+ unsigned int limit = MAX_LOCK_DEPTH / 2;
+
+ if (lockdep_depth(current) > limit)
+ netif_close_many_and_unlock(close_head);
+#endif
+}
+
+void unregister_netdevice_many_notify(struct list_head *head,
+ u32 portid, const struct nlmsghdr *nlh)
+{
+ struct net_device *dev, *tmp;
+ LIST_HEAD(close_head);
+ int cnt = 0;
+
+ BUG_ON(dev_boot_phase);
+ ASSERT_RTNL();
+
+ if (list_empty(head))
+ return;
+
+ list_for_each_entry_safe(dev, tmp, head, unreg_list) {
+ /* Some devices call without registering
+ * for initialization unwind. Remove those
+ * devices and proceed with the remaining.
+ */
+ if (dev->reg_state == NETREG_UNINITIALIZED) {
+ pr_debug("unregister_netdevice: device %s/%p never was registered\n",
+ dev->name, dev);
+
+ WARN_ON(1);
+ list_del(&dev->unreg_list);
+ continue;
+ }
+ dev->dismantle = true;
+ BUG_ON(dev->reg_state != NETREG_REGISTERED);
+ }
+
+ /* If device is running, close it first. Start with ops locked... */
+ list_for_each_entry(dev, head, unreg_list) {
+ if (!(dev->flags & IFF_UP))
+ continue;
+ if (netdev_need_ops_lock(dev)) {
+ list_add_tail(&dev->close_list, &close_head);
+ netdev_lock(dev);
+ }
+ netif_close_many_and_unlock_cond(&close_head);
+ }
+ netif_close_many_and_unlock(&close_head);
+ /* ... now go over the rest. */
+ list_for_each_entry(dev, head, unreg_list) {
+ if (!netdev_need_ops_lock(dev))
+ list_add_tail(&dev->close_list, &close_head);
+ }
+ netif_close_many(&close_head, true);
+
+ list_for_each_entry(dev, head, unreg_list) {
+ /* And unlink it from device chain. */
+ unlist_netdevice(dev);
+ netdev_lock(dev);
+ WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
+ netdev_unlock(dev);
+ }
+ flush_all_backlogs();
+
+ synchronize_net();
+
+ list_for_each_entry(dev, head, unreg_list) {
+ struct sk_buff *skb = NULL;
+
+ /* Shutdown queueing discipline. */
+ netdev_lock_ops(dev);
+ dev_shutdown(dev);
+ dev_tcx_uninstall(dev);
+ dev_xdp_uninstall(dev);
+ dev_memory_provider_uninstall(dev);
+ netdev_unlock_ops(dev);
+ bpf_dev_bound_netdev_unregister(dev);
+
+ netdev_offload_xstats_disable_all(dev);
+
+ /* Notify protocols, that we are about to destroy
+ * this device. They should clean all the things.
+ */
+ call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+
+ if (!(dev->rtnl_link_ops && dev->rtnl_link_initializing))
+ skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
+ GFP_KERNEL, NULL, 0,
+ portid, nlh);
+
+ /*
+ * Flush the unicast and multicast chains
+ */
+ dev_uc_flush(dev);
+ dev_mc_flush(dev);
+
+ netdev_name_node_alt_flush(dev);
+ netdev_name_node_free(dev->name_node);
+
+ netdev_rss_contexts_free(dev);
+
+ call_netdevice_notifiers(NETDEV_PRE_UNINIT, dev);
+
+ if (dev->netdev_ops->ndo_uninit)
+ dev->netdev_ops->ndo_uninit(dev);
+
+ mutex_destroy(&dev->ethtool->rss_lock);
+
+ net_shaper_flush_netdev(dev);
+
+ if (skb)
+ rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, portid, nlh);
+
+ /* Notifier chain MUST detach us from all upper devices. */
+ WARN_ON(netdev_has_any_upper_dev(dev));
+ WARN_ON(netdev_has_any_lower_dev(dev));
+
+ /* Remove entries from kobject tree */
+ netdev_unregister_kobject(dev);
+#ifdef CONFIG_XPS
+ /* Remove XPS queueing entries */
+ netif_reset_xps_queues_gt(dev, 0);
+#endif
+ }
+
+ synchronize_net();
+
+ list_for_each_entry(dev, head, unreg_list) {
+ netdev_put(dev, &dev->dev_registered_tracker);
+ net_set_todo(dev);
+ cnt++;
+ }
+ atomic_add(cnt, &dev_unreg_count);
+
+ list_del(head);
+}
+
/**
* unregister_netdevice_many - unregister many devices
* @head: list of devices
*
* Note: As most callers use a stack allocated list_head,
- * we force a list_del() to make sure stack wont be corrupted later.
+ * we force a list_del() to make sure stack won't be corrupted later.
*/
void unregister_netdevice_many(struct list_head *head)
{
- struct net_device *dev;
-
- if (!list_empty(head)) {
- rollback_registered_many(head);
- list_for_each_entry(dev, head, unreg_list)
- net_set_todo(dev);
- list_del(head);
- }
+ unregister_netdevice_many_notify(head, 0, NULL);
}
EXPORT_SYMBOL(unregister_netdevice_many);
@@ -9075,73 +12458,114 @@ EXPORT_SYMBOL(unregister_netdevice_many);
*/
void unregister_netdev(struct net_device *dev)
{
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
unregister_netdevice(dev);
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
}
EXPORT_SYMBOL(unregister_netdev);
-/**
- * dev_change_net_namespace - move device to different nethost namespace
- * @dev: device
- * @net: network namespace
- * @pat: If not NULL name pattern to try if the current device name
- * is already taken in the destination network namespace.
- *
- * This function shuts down a device interface and moves it
- * to a new network namespace. On success 0 is returned, on
- * a failure a netagive errno code is returned.
- *
- * Callers must hold the rtnl semaphore.
- */
-
-int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
+int __dev_change_net_namespace(struct net_device *dev, struct net *net,
+ const char *pat, int new_ifindex,
+ struct netlink_ext_ack *extack)
{
- int err, new_nsid, new_ifindex;
+ struct netdev_name_node *name_node;
+ struct net *net_old = dev_net(dev);
+ char new_name[IFNAMSIZ] = {};
+ int err, new_nsid;
ASSERT_RTNL();
/* Don't allow namespace local devices to be moved. */
err = -EINVAL;
- if (dev->features & NETIF_F_NETNS_LOCAL)
+ if (dev->netns_immutable) {
+ NL_SET_ERR_MSG(extack, "The interface netns is immutable");
goto out;
+ }
- /* Ensure the device has been registrered */
- if (dev->reg_state != NETREG_REGISTERED)
+ /* Ensure the device has been registered */
+ if (dev->reg_state != NETREG_REGISTERED) {
+ NL_SET_ERR_MSG(extack, "The interface isn't registered");
goto out;
+ }
/* Get out if there is nothing todo */
err = 0;
- if (net_eq(dev_net(dev), net))
+ if (net_eq(net_old, net))
goto out;
/* Pick the destination device name, and ensure
* we can use it in the destination network namespace.
*/
err = -EEXIST;
- if (__dev_get_by_name(net, dev->name)) {
+ if (netdev_name_in_use(net, dev->name)) {
/* We get here if we can't use the current device name */
- if (!pat)
+ if (!pat) {
+ NL_SET_ERR_MSG(extack,
+ "An interface with the same name exists in the target netns");
goto out;
- err = dev_get_valid_name(net, dev, pat);
- if (err < 0)
+ }
+ err = dev_prep_valid_name(net, dev, pat, new_name, EEXIST);
+ if (err < 0) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "Unable to use '%s' for the new interface name in the target netns",
+ pat);
+ goto out;
+ }
+ }
+ /* Check that none of the altnames conflicts. */
+ err = -EEXIST;
+ netdev_for_each_altname(dev, name_node) {
+ if (netdev_name_in_use(net, name_node->name)) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "An interface with the altname %s exists in the target netns",
+ name_node->name);
goto out;
+ }
+ }
+
+ /* Check that new_ifindex isn't used yet. */
+ if (new_ifindex) {
+ err = dev_index_reserve(net, new_ifindex);
+ if (err < 0) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "The ifindex %d is not available in the target netns",
+ new_ifindex);
+ goto out;
+ }
+ } else {
+ /* If there is an ifindex conflict assign a new one */
+ err = dev_index_reserve(net, dev->ifindex);
+ if (err == -EBUSY)
+ err = dev_index_reserve(net, 0);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Unable to allocate a new ifindex in the target netns");
+ goto out;
+ }
+ new_ifindex = err;
}
/*
* And now a mini version of register_netdevice unregister_netdevice.
*/
+ netdev_lock_ops(dev);
/* If device is running close it first. */
- dev_close(dev);
-
+ netif_close(dev);
/* And unlink it from device chain */
unlist_netdevice(dev);
+ if (!netdev_need_ops_lock(dev))
+ netdev_lock(dev);
+ dev->moving_ns = true;
+ netdev_unlock(dev);
+
synchronize_net();
/* Shutdown queueing discipline. */
+ netdev_lock_ops(dev);
dev_shutdown(dev);
+ netdev_unlock_ops(dev);
/* Notify protocols, that we are about to destroy
* this device. They should clean all the things.
@@ -9153,12 +12577,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
rcu_barrier();
- new_nsid = peernet2id_alloc(dev_net(dev), net);
- /* If there is an ifindex conflict assign a new one */
- if (__dev_get_by_index(net, dev->ifindex))
- new_ifindex = dev_new_index(net);
- else
- new_ifindex = dev->ifindex;
+ new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
new_ifindex);
@@ -9173,36 +12592,60 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
netdev_adjacent_del_links(dev);
+ /* Move per-net netdevice notifiers that are following the netdevice */
+ move_netdevice_notifiers_dev_net(dev, net);
+
/* Actually switch the network namespace */
+ netdev_lock(dev);
dev_net_set(dev, net);
+ netdev_unlock(dev);
dev->ifindex = new_ifindex;
+ if (new_name[0]) {
+ /* Rename the netdev to prepared name */
+ write_seqlock_bh(&netdev_rename_lock);
+ strscpy(dev->name, new_name, IFNAMSIZ);
+ write_sequnlock_bh(&netdev_rename_lock);
+ }
+
+ /* Fixup kobjects */
+ dev_set_uevent_suppress(&dev->dev, 1);
+ err = device_rename(&dev->dev, dev->name);
+ dev_set_uevent_suppress(&dev->dev, 0);
+ WARN_ON(err);
+
/* Send a netdev-add uevent to the new namespace */
kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
netdev_adjacent_add_links(dev);
- /* Fixup kobjects */
- err = device_rename(&dev->dev, dev->name);
+ /* Adapt owner in case owning user namespace of target network
+ * namespace is different from the original one.
+ */
+ err = netdev_change_owner(dev, net_old, net);
WARN_ON(err);
+ netdev_lock(dev);
+ dev->moving_ns = false;
+ if (!netdev_need_ops_lock(dev))
+ netdev_unlock(dev);
+
/* Add the device back in the hashes */
list_netdevice(dev);
-
/* Notify protocols, that a new device appeared. */
call_netdevice_notifiers(NETDEV_REGISTER, dev);
+ netdev_unlock_ops(dev);
/*
* Prevent userspace races by waiting until the network
* device is fully setup before sending notifications.
*/
- rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
+ rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL);
synchronize_net();
err = 0;
out:
return err;
}
-EXPORT_SYMBOL_GPL(dev_change_net_namespace);
static int dev_cpu_dead(unsigned int oldcpu)
{
@@ -9242,7 +12685,7 @@ static int dev_cpu_dead(unsigned int oldcpu)
list_del_init(&napi->poll_list);
if (napi->poll == process_backlog)
- napi->state = 0;
+ napi->state &= NAPIF_STATE_THREADED;
else
____napi_schedule(sd, napi);
}
@@ -9250,21 +12693,23 @@ static int dev_cpu_dead(unsigned int oldcpu)
raise_softirq_irqoff(NET_TX_SOFTIRQ);
local_irq_enable();
+ if (!use_backlog_threads()) {
#ifdef CONFIG_RPS
- remsd = oldsd->rps_ipi_list;
- oldsd->rps_ipi_list = NULL;
+ remsd = oldsd->rps_ipi_list;
+ oldsd->rps_ipi_list = NULL;
#endif
- /* send out pending IPI's on offline CPU */
- net_rps_send_ipi(remsd);
+ /* send out pending IPI's on offline CPU */
+ net_rps_send_ipi(remsd);
+ }
/* Process offline CPU's input_pkt_queue */
while ((skb = __skb_dequeue(&oldsd->process_queue))) {
- netif_rx_ni(skb);
- input_queue_head_incr(oldsd);
+ netif_rx(skb);
+ rps_input_queue_head_incr(oldsd);
}
while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
- netif_rx_ni(skb);
- input_queue_head_incr(oldsd);
+ netif_rx(skb);
+ rps_input_queue_head_incr(oldsd);
}
return 0;
@@ -9298,6 +12743,94 @@ netdev_features_t netdev_increment_features(netdev_features_t all,
}
EXPORT_SYMBOL(netdev_increment_features);
+/**
+ * netdev_compute_master_upper_features - compute features from lower devices
+ * @dev: the upper device
+ * @update_header: whether to update upper device's header_len/headroom/tailroom
+ *
+ * Recompute the upper device's features based on all lower devices.
+ */
+void netdev_compute_master_upper_features(struct net_device *dev, bool update_header)
+{
+ unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
+ netdev_features_t gso_partial_features = MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES;
+ netdev_features_t xfrm_features = MASTER_UPPER_DEV_XFRM_FEATURES;
+ netdev_features_t mpls_features = MASTER_UPPER_DEV_MPLS_FEATURES;
+ netdev_features_t vlan_features = MASTER_UPPER_DEV_VLAN_FEATURES;
+ netdev_features_t enc_features = MASTER_UPPER_DEV_ENC_FEATURES;
+ unsigned short max_header_len = ETH_HLEN;
+ unsigned int tso_max_size = TSO_MAX_SIZE;
+ unsigned short max_headroom = 0;
+ unsigned short max_tailroom = 0;
+ u16 tso_max_segs = TSO_MAX_SEGS;
+ struct net_device *lower_dev;
+ struct list_head *iter;
+
+ mpls_features = netdev_base_features(mpls_features);
+ vlan_features = netdev_base_features(vlan_features);
+ enc_features = netdev_base_features(enc_features);
+
+ netdev_for_each_lower_dev(dev, lower_dev, iter) {
+ gso_partial_features = netdev_increment_features(gso_partial_features,
+ lower_dev->gso_partial_features,
+ MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES);
+
+ vlan_features = netdev_increment_features(vlan_features,
+ lower_dev->vlan_features,
+ MASTER_UPPER_DEV_VLAN_FEATURES);
+
+ enc_features = netdev_increment_features(enc_features,
+ lower_dev->hw_enc_features,
+ MASTER_UPPER_DEV_ENC_FEATURES);
+
+ if (IS_ENABLED(CONFIG_XFRM_OFFLOAD))
+ xfrm_features = netdev_increment_features(xfrm_features,
+ lower_dev->hw_enc_features,
+ MASTER_UPPER_DEV_XFRM_FEATURES);
+
+ mpls_features = netdev_increment_features(mpls_features,
+ lower_dev->mpls_features,
+ MASTER_UPPER_DEV_MPLS_FEATURES);
+
+ dst_release_flag &= lower_dev->priv_flags;
+
+ if (update_header) {
+ max_header_len = max(max_header_len, lower_dev->hard_header_len);
+ max_headroom = max(max_headroom, lower_dev->needed_headroom);
+ max_tailroom = max(max_tailroom, lower_dev->needed_tailroom);
+ }
+
+ tso_max_size = min(tso_max_size, lower_dev->tso_max_size);
+ tso_max_segs = min(tso_max_segs, lower_dev->tso_max_segs);
+ }
+
+ dev->gso_partial_features = gso_partial_features;
+ dev->vlan_features = vlan_features;
+ dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL |
+ NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_STAG_TX;
+ if (IS_ENABLED(CONFIG_XFRM_OFFLOAD))
+ dev->hw_enc_features |= xfrm_features;
+ dev->mpls_features = mpls_features;
+
+ dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
+ if ((dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) &&
+ dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
+ dev->priv_flags |= IFF_XMIT_DST_RELEASE;
+
+ if (update_header) {
+ dev->hard_header_len = max_header_len;
+ dev->needed_headroom = max_headroom;
+ dev->needed_tailroom = max_tailroom;
+ }
+
+ netif_set_tso_max_segs(dev, tso_max_segs);
+ netif_set_tso_max_size(dev, tso_max_size);
+
+ netdev_change_features(dev);
+}
+EXPORT_SYMBOL(netdev_compute_master_upper_features);
+
static struct hlist_head * __net_init netdev_create_hash(void)
{
int i;
@@ -9315,10 +12848,9 @@ static struct hlist_head * __net_init netdev_create_hash(void)
static int __net_init netdev_init(struct net *net)
{
BUILD_BUG_ON(GRO_HASH_BUCKETS >
- 8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
+ BITS_PER_BYTE * sizeof_field(struct gro_node, bitmask));
- if (net != &init_net)
- INIT_LIST_HEAD(&net->dev_base_head);
+ INIT_LIST_HEAD(&net->dev_base_head);
net->dev_name_head = netdev_create_hash();
if (net->dev_name_head == NULL)
@@ -9328,6 +12860,10 @@ static int __net_init netdev_init(struct net *net)
if (net->dev_index_head == NULL)
goto err_idx;
+ xa_init_flags(&net->dev_by_index, XA_FLAGS_ALLOC1);
+
+ RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
+
return 0;
err_idx:
@@ -9423,6 +12959,7 @@ static void __net_exit netdev_exit(struct net *net)
{
kfree(net->dev_name_head);
kfree(net->dev_index_head);
+ xa_destroy(&net->dev_by_index);
if (net != &init_net)
WARN_ON_ONCE(!list_empty(&net->dev_base_head));
}
@@ -9432,28 +12969,36 @@ static struct pernet_operations __net_initdata netdev_net_ops = {
.exit = netdev_exit,
};
-static void __net_exit default_device_exit(struct net *net)
+static void __net_exit default_device_exit_net(struct net *net)
{
+ struct netdev_name_node *name_node, *tmp;
struct net_device *dev, *aux;
/*
* Push all migratable network devices back to the
* initial network namespace
*/
- rtnl_lock();
+ ASSERT_RTNL();
for_each_netdev_safe(net, dev, aux) {
int err;
char fb_name[IFNAMSIZ];
/* Ignore unmoveable devices (i.e. loopback) */
- if (dev->features & NETIF_F_NETNS_LOCAL)
+ if (dev->netns_immutable)
continue;
/* Leave virtual devices for the generic cleanup */
- if (dev->rtnl_link_ops)
+ if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
continue;
/* Push remaining network devices to init_net */
snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
+ if (netdev_name_in_use(&init_net, fb_name))
+ snprintf(fb_name, IFNAMSIZ, "dev%%d");
+
+ netdev_for_each_altname_safe(dev, name_node, tmp)
+ if (netdev_name_in_use(&init_net, name_node->name))
+ __netdev_name_node_alt_destroy(name_node);
+
err = dev_change_net_namespace(dev, &init_net, fb_name);
if (err) {
pr_emerg("%s: failed to move %s to init_net: %d\n",
@@ -9461,35 +13006,6 @@ static void __net_exit default_device_exit(struct net *net)
BUG();
}
}
- rtnl_unlock();
-}
-
-static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
-{
- /* Return with the rtnl_lock held when there are no network
- * devices unregistering in any network namespace in net_list.
- */
- struct net *net;
- bool unregistering;
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
- add_wait_queue(&netdev_unregistering_wq, &wait);
- for (;;) {
- unregistering = false;
- rtnl_lock();
- list_for_each_entry(net, net_list, exit_list) {
- if (net->dev_unreg_count > 0) {
- unregistering = true;
- break;
- }
- }
- if (!unregistering)
- break;
- __rtnl_unlock();
-
- wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
- }
- remove_wait_queue(&netdev_unregistering_wq, &wait);
}
static void __net_exit default_device_exit_batch(struct list_head *net_list)
@@ -9503,18 +13019,12 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
struct net *net;
LIST_HEAD(dev_kill_list);
- /* To prevent network device cleanup code from dereferencing
- * loopback devices or network devices that have been freed
- * wait here for all pending unregistrations to complete,
- * before unregistring the loopback device and allowing the
- * network namespace be freed.
- *
- * The netdev todo list containing all network devices
- * unregistrations that happen in default_device_exit_batch
- * will run in the rtnl_unlock() at the end of
- * default_device_exit_batch.
- */
- rtnl_lock_unregistering(net_list);
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ default_device_exit_net(net);
+ cond_resched();
+ }
+
list_for_each_entry(net, net_list, exit_list) {
for_each_netdev_reverse(net, dev) {
if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
@@ -9528,10 +13038,64 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
}
static struct pernet_operations __net_initdata default_device_ops = {
- .exit = default_device_exit,
.exit_batch = default_device_exit_batch,
};
+static void __init net_dev_struct_check(void)
+{
+ /* TX read-mostly hotpath */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, priv_flags_fast);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, netdev_ops);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, header_ops);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, _tx);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, real_num_tx_queues);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_ipv4_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_max_segs);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, gso_partial_features);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, num_tc);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, mtu);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, needed_headroom);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tc_to_txq);
+#ifdef CONFIG_XPS
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, xps_maps);
+#endif
+#ifdef CONFIG_NETFILTER_EGRESS
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, nf_hooks_egress);
+#endif
+#ifdef CONFIG_NET_XGRESS
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_tx, tcx_egress);
+#endif
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160);
+
+ /* TXRX read-mostly hotpath */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, state);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr);
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 46);
+
+ /* RX read-mostly hotpath */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ifindex);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, real_num_rx_queues);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, _rx);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, gro_ipv4_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, rx_handler_data);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, nd_net);
+#ifdef CONFIG_NETPOLL
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, npinfo);
+#endif
+#ifdef CONFIG_NET_XGRESS
+ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, tcx_ingress);
+#endif
+ CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_rx, 92);
+}
+
/*
* Initialize the DEV module. At boot time this walks the device list and
* unhooks any devices that fail to initialise (normally hardware not
@@ -9539,6 +13103,67 @@ static struct pernet_operations __net_initdata default_device_ops = {
*
*/
+/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */
+#define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE)
+
+static int net_page_pool_create(int cpuid)
+{
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+ struct page_pool_params page_pool_params = {
+ .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE,
+ .flags = PP_FLAG_SYSTEM_POOL,
+ .nid = cpu_to_mem(cpuid),
+ };
+ struct page_pool *pp_ptr;
+ int err;
+
+ pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
+ if (IS_ERR(pp_ptr))
+ return -ENOMEM;
+
+ err = xdp_reg_page_pool(pp_ptr);
+ if (err) {
+ page_pool_destroy(pp_ptr);
+ return err;
+ }
+
+ per_cpu(system_page_pool.pool, cpuid) = pp_ptr;
+#endif
+ return 0;
+}
+
+static int backlog_napi_should_run(unsigned int cpu)
+{
+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
+ struct napi_struct *napi = &sd->backlog;
+
+ return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
+}
+
+static void run_backlog_napi(unsigned int cpu)
+{
+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
+
+ napi_threaded_poll_loop(&sd->backlog, false);
+}
+
+static void backlog_napi_setup(unsigned int cpu)
+{
+ struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
+ struct napi_struct *napi = &sd->backlog;
+
+ napi->thread = this_cpu_read(backlog_napi);
+ set_bit(NAPI_STATE_THREADED, &napi->state);
+}
+
+static struct smp_hotplug_thread backlog_threads = {
+ .store = &backlog_napi,
+ .thread_should_run = backlog_napi_should_run,
+ .thread_fn = run_backlog_napi,
+ .thread_comm = "backlog_napi/%u",
+ .setup = backlog_napi_setup,
+};
+
/*
* This is called single threaded during boot, so no need
* to take the rtnl semaphore.
@@ -9549,18 +13174,17 @@ static int __init net_dev_init(void)
BUG_ON(!dev_boot_phase);
+ net_dev_struct_check();
+
if (dev_proc_init())
goto out;
if (netdev_kobject_init())
goto out;
- INIT_LIST_HEAD(&ptype_all);
for (i = 0; i < PTYPE_HASH_SIZE; i++)
INIT_LIST_HEAD(&ptype_base[i]);
- INIT_LIST_HEAD(&offload_base);
-
if (register_pernet_subsys(&netdev_net_ops))
goto out;
@@ -9568,12 +13192,13 @@ static int __init net_dev_init(void)
* Initialise the packet receive queues.
*/
+ flush_backlogs_fallback = flush_backlogs_alloc();
+ if (!flush_backlogs_fallback)
+ goto out;
+
for_each_possible_cpu(i) {
- struct work_struct *flush = per_cpu_ptr(&flush_works, i);
struct softnet_data *sd = &per_cpu(softnet_data, i);
- INIT_WORK(flush, flush_backlog);
-
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
@@ -9582,15 +13207,26 @@ static int __init net_dev_init(void)
INIT_LIST_HEAD(&sd->poll_list);
sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
- sd->csd.func = rps_trigger_softirq;
- sd->csd.info = sd;
+ INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
sd->cpu = i;
#endif
+ INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
- init_gro_hash(&sd->backlog);
+ gro_init(&sd->backlog.gro);
sd->backlog.poll = process_backlog;
sd->backlog.weight = weight_p;
+ INIT_LIST_HEAD(&sd->backlog.poll_list);
+
+ if (net_page_pool_create(i))
+ goto out;
}
+ net_hotdata.skb_defer_nodes =
+ __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids,
+ __alignof__(struct skb_defer_node));
+ if (!net_hotdata.skb_defer_nodes)
+ goto out;
+ if (use_backlog_threads())
+ smpboot_register_percpu_thread(&backlog_threads);
dev_boot_phase = 0;
@@ -9616,7 +13252,25 @@ static int __init net_dev_init(void)
NULL, dev_cpu_dead);
WARN_ON(rc < 0);
rc = 0;
+
+ /* avoid static key IPIs to isolated CPUs */
+ if (housekeeping_enabled(HK_TYPE_MISC))
+ net_enable_timestamp();
out:
+ if (rc < 0) {
+ for_each_possible_cpu(i) {
+ struct page_pool *pp_ptr;
+
+ pp_ptr = per_cpu(system_page_pool.pool, i);
+ if (!pp_ptr)
+ continue;
+
+ xdp_unreg_page_pool(pp_ptr);
+ page_pool_destroy(pp_ptr);
+ per_cpu(system_page_pool.pool, i) = NULL;
+ }
+ }
+
return rc;
}
diff --git a/net/core/dev.h b/net/core/dev.h
new file mode 100644
index 000000000000..da18536cbd35
--- /dev/null
+++ b/net/core/dev.h
@@ -0,0 +1,406 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_CORE_DEV_H
+#define _NET_CORE_DEV_H
+
+#include <linux/cleanup.h>
+#include <linux/types.h>
+#include <linux/rwsem.h>
+#include <linux/netdevice.h>
+#include <net/netdev_lock.h>
+
+struct net;
+struct netlink_ext_ack;
+struct cpumask;
+
+/* Random bits of netdevice that don't need to be exposed */
+#define FLOW_LIMIT_HISTORY (1 << 7) /* must be a power of 2 and must not overflow buckets */
+struct sd_flow_limit {
+ struct rcu_head rcu;
+ unsigned int count;
+ u8 log_buckets;
+ unsigned int history_head;
+ u16 history[FLOW_LIMIT_HISTORY];
+ u8 buckets[];
+};
+
+extern int netdev_flow_limit_table_len;
+
+struct napi_struct *
+netdev_napi_by_id_lock(struct net *net, unsigned int napi_id);
+struct net_device *dev_get_by_napi_id(unsigned int napi_id);
+
+struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net);
+struct net_device *
+netdev_xa_find_lock(struct net *net, struct net_device *dev,
+ unsigned long *index);
+
+DEFINE_FREE(netdev_unlock, struct net_device *, if (_T) netdev_unlock(_T));
+
+#define for_each_netdev_lock_scoped(net, var_name, ifindex) \
+ for (struct net_device *var_name __free(netdev_unlock) = NULL; \
+ (var_name = netdev_xa_find_lock(net, var_name, &ifindex)); \
+ ifindex++)
+
+struct net_device *
+netdev_get_by_index_lock_ops_compat(struct net *net, int ifindex);
+struct net_device *
+netdev_xa_find_lock_ops_compat(struct net *net, struct net_device *dev,
+ unsigned long *index);
+
+DEFINE_FREE(netdev_unlock_ops_compat, struct net_device *,
+ if (_T) netdev_unlock_ops_compat(_T));
+
+#define for_each_netdev_lock_ops_compat_scoped(net, var_name, ifindex) \
+ for (struct net_device *var_name __free(netdev_unlock_ops_compat) = NULL; \
+ (var_name = netdev_xa_find_lock_ops_compat(net, var_name, \
+ &ifindex)); \
+ ifindex++)
+
+#ifdef CONFIG_PROC_FS
+int __init dev_proc_init(void);
+#else
+#define dev_proc_init() 0
+#endif
+
+void linkwatch_init_dev(struct net_device *dev);
+void linkwatch_run_queue(void);
+
+void dev_addr_flush(struct net_device *dev);
+int dev_addr_init(struct net_device *dev);
+void dev_addr_check(struct net_device *dev);
+
+#if IS_ENABLED(CONFIG_NET_SHAPER)
+void net_shaper_flush_netdev(struct net_device *dev);
+void net_shaper_set_real_num_tx_queues(struct net_device *dev,
+ unsigned int txq);
+#else
+static inline void net_shaper_flush_netdev(struct net_device *dev) {}
+static inline void net_shaper_set_real_num_tx_queues(struct net_device *dev,
+ unsigned int txq) {}
+#endif
+
+/* sysctls not referred to from outside net/core/ */
+extern int netdev_unregister_timeout_secs;
+extern int weight_p;
+extern int dev_weight_rx_bias;
+extern int dev_weight_tx_bias;
+
+extern struct rw_semaphore dev_addr_sem;
+
+/* rtnl helpers */
+extern struct list_head net_todo_list;
+void netdev_run_todo(void);
+
+/* netdev management, shared between various uAPI entry points */
+struct netdev_name_node {
+ struct hlist_node hlist;
+ struct list_head list;
+ struct net_device *dev;
+ const char *name;
+ struct rcu_head rcu;
+};
+
+int netdev_get_name(struct net *net, char *name, int ifindex);
+int netif_change_name(struct net_device *dev, const char *newname);
+int dev_change_name(struct net_device *dev, const char *newname);
+
+#define netdev_for_each_altname(dev, namenode) \
+ list_for_each_entry((namenode), &(dev)->name_node->list, list)
+#define netdev_for_each_altname_safe(dev, namenode, next) \
+ list_for_each_entry_safe((namenode), (next), &(dev)->name_node->list, \
+ list)
+
+int netdev_name_node_alt_create(struct net_device *dev, const char *name);
+int netdev_name_node_alt_destroy(struct net_device *dev, const char *name);
+
+int dev_validate_mtu(struct net_device *dev, int mtu,
+ struct netlink_ext_ack *extack);
+int netif_set_mtu_ext(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack);
+
+int dev_get_phys_port_id(struct net_device *dev,
+ struct netdev_phys_item_id *ppid);
+int dev_get_phys_port_name(struct net_device *dev,
+ char *name, size_t len);
+
+int netif_change_proto_down(struct net_device *dev, bool proto_down);
+int dev_change_proto_down(struct net_device *dev, bool proto_down);
+void netdev_change_proto_down_reason_locked(struct net_device *dev,
+ unsigned long mask, u32 value);
+
+typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf);
+int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
+ int fd, int expected_fd, u32 flags);
+
+int netif_change_tx_queue_len(struct net_device *dev, unsigned long new_len);
+int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len);
+void netif_set_group(struct net_device *dev, int new_group);
+void dev_set_group(struct net_device *dev, int new_group);
+int netif_change_carrier(struct net_device *dev, bool new_carrier);
+int dev_change_carrier(struct net_device *dev, bool new_carrier);
+
+void __dev_set_rx_mode(struct net_device *dev);
+
+void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
+ unsigned int gchanges, u32 portid,
+ const struct nlmsghdr *nlh);
+
+void unregister_netdevice_many_notify(struct list_head *head,
+ u32 portid, const struct nlmsghdr *nlh);
+
+static inline void netif_set_up(struct net_device *dev, bool value)
+{
+ if (value)
+ dev->flags |= IFF_UP;
+ else
+ dev->flags &= ~IFF_UP;
+
+ if (!netdev_need_ops_lock(dev))
+ netdev_lock(dev);
+ dev->up = value;
+ if (!netdev_need_ops_lock(dev))
+ netdev_unlock(dev);
+}
+
+static inline void netif_set_gso_max_size(struct net_device *dev,
+ unsigned int size)
+{
+ /* dev->gso_max_size is read locklessly from sk_setup_caps() */
+ WRITE_ONCE(dev->gso_max_size, size);
+ if (size <= GSO_LEGACY_MAX_SIZE)
+ WRITE_ONCE(dev->gso_ipv4_max_size, size);
+}
+
+static inline void netif_set_gso_max_segs(struct net_device *dev,
+ unsigned int segs)
+{
+ /* dev->gso_max_segs is read locklessly from sk_setup_caps() */
+ WRITE_ONCE(dev->gso_max_segs, segs);
+}
+
+static inline void netif_set_gro_max_size(struct net_device *dev,
+ unsigned int size)
+{
+ /* This pairs with the READ_ONCE() in skb_gro_receive() */
+ WRITE_ONCE(dev->gro_max_size, size);
+ if (size <= GRO_LEGACY_MAX_SIZE)
+ WRITE_ONCE(dev->gro_ipv4_max_size, size);
+}
+
+static inline void netif_set_gso_ipv4_max_size(struct net_device *dev,
+ unsigned int size)
+{
+ /* dev->gso_ipv4_max_size is read locklessly from sk_setup_caps() */
+ WRITE_ONCE(dev->gso_ipv4_max_size, size);
+}
+
+static inline void netif_set_gro_ipv4_max_size(struct net_device *dev,
+ unsigned int size)
+{
+ /* This pairs with the READ_ONCE() in skb_gro_receive() */
+ WRITE_ONCE(dev->gro_ipv4_max_size, size);
+}
+
+/**
+ * napi_get_defer_hard_irqs - get the NAPI's defer_hard_irqs
+ * @n: napi struct to get the defer_hard_irqs field from
+ *
+ * Return: the per-NAPI value of the defer_hard_irqs field.
+ */
+static inline u32 napi_get_defer_hard_irqs(const struct napi_struct *n)
+{
+ return READ_ONCE(n->defer_hard_irqs);
+}
+
+/**
+ * napi_set_defer_hard_irqs - set the defer_hard_irqs for a napi
+ * @n: napi_struct to set the defer_hard_irqs field
+ * @defer: the value the field should be set to
+ */
+static inline void napi_set_defer_hard_irqs(struct napi_struct *n, u32 defer)
+{
+ WRITE_ONCE(n->defer_hard_irqs, defer);
+}
+
+/**
+ * netdev_set_defer_hard_irqs - set defer_hard_irqs for all NAPIs of a netdev
+ * @netdev: the net_device for which all NAPIs will have defer_hard_irqs set
+ * @defer: the defer_hard_irqs value to set
+ */
+static inline void netdev_set_defer_hard_irqs(struct net_device *netdev,
+ u32 defer)
+{
+ unsigned int count = max(netdev->num_rx_queues,
+ netdev->num_tx_queues);
+ struct napi_struct *napi;
+ int i;
+
+ WRITE_ONCE(netdev->napi_defer_hard_irqs, defer);
+ list_for_each_entry(napi, &netdev->napi_list, dev_list)
+ napi_set_defer_hard_irqs(napi, defer);
+
+ for (i = 0; i < count; i++)
+ netdev->napi_config[i].defer_hard_irqs = defer;
+}
+
+/**
+ * napi_get_gro_flush_timeout - get the gro_flush_timeout
+ * @n: napi struct to get the gro_flush_timeout from
+ *
+ * Return: the per-NAPI value of the gro_flush_timeout field.
+ */
+static inline unsigned long
+napi_get_gro_flush_timeout(const struct napi_struct *n)
+{
+ return READ_ONCE(n->gro_flush_timeout);
+}
+
+/**
+ * napi_set_gro_flush_timeout - set the gro_flush_timeout for a napi
+ * @n: napi struct to set the gro_flush_timeout
+ * @timeout: timeout value to set
+ *
+ * napi_set_gro_flush_timeout sets the per-NAPI gro_flush_timeout
+ */
+static inline void napi_set_gro_flush_timeout(struct napi_struct *n,
+ unsigned long timeout)
+{
+ WRITE_ONCE(n->gro_flush_timeout, timeout);
+}
+
+/**
+ * netdev_set_gro_flush_timeout - set gro_flush_timeout of a netdev's NAPIs
+ * @netdev: the net_device for which all NAPIs will have gro_flush_timeout set
+ * @timeout: the timeout value to set
+ */
+static inline void netdev_set_gro_flush_timeout(struct net_device *netdev,
+ unsigned long timeout)
+{
+ unsigned int count = max(netdev->num_rx_queues,
+ netdev->num_tx_queues);
+ struct napi_struct *napi;
+ int i;
+
+ WRITE_ONCE(netdev->gro_flush_timeout, timeout);
+ list_for_each_entry(napi, &netdev->napi_list, dev_list)
+ napi_set_gro_flush_timeout(napi, timeout);
+
+ for (i = 0; i < count; i++)
+ netdev->napi_config[i].gro_flush_timeout = timeout;
+}
+
+/**
+ * napi_get_irq_suspend_timeout - get the irq_suspend_timeout
+ * @n: napi struct to get the irq_suspend_timeout from
+ *
+ * Return: the per-NAPI value of the irq_suspend_timeout field.
+ */
+static inline unsigned long
+napi_get_irq_suspend_timeout(const struct napi_struct *n)
+{
+ return READ_ONCE(n->irq_suspend_timeout);
+}
+
+/**
+ * napi_set_irq_suspend_timeout - set the irq_suspend_timeout for a napi
+ * @n: napi struct to set the irq_suspend_timeout
+ * @timeout: timeout value to set
+ *
+ * napi_set_irq_suspend_timeout sets the per-NAPI irq_suspend_timeout
+ */
+static inline void napi_set_irq_suspend_timeout(struct napi_struct *n,
+ unsigned long timeout)
+{
+ WRITE_ONCE(n->irq_suspend_timeout, timeout);
+}
+
+static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n)
+{
+ if (test_bit(NAPI_STATE_THREADED_BUSY_POLL, &n->state))
+ return NETDEV_NAPI_THREADED_BUSY_POLL;
+
+ if (test_bit(NAPI_STATE_THREADED, &n->state))
+ return NETDEV_NAPI_THREADED_ENABLED;
+
+ return NETDEV_NAPI_THREADED_DISABLED;
+}
+
+static inline enum netdev_napi_threaded
+napi_get_threaded_config(struct net_device *dev, struct napi_struct *n)
+{
+ if (n->config)
+ return n->config->threaded;
+ return dev->threaded;
+}
+
+int napi_set_threaded(struct napi_struct *n,
+ enum netdev_napi_threaded threaded);
+
+int netif_set_threaded(struct net_device *dev,
+ enum netdev_napi_threaded threaded);
+
+int rps_cpumask_housekeeping(struct cpumask *mask);
+
+#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
+void xdp_do_check_flushed(struct napi_struct *napi);
+#else
+static inline void xdp_do_check_flushed(struct napi_struct *napi) { }
+#endif
+
+/* Best effort check that NAPI is not idle (can't be scheduled to run) */
+static inline void napi_assert_will_not_race(const struct napi_struct *napi)
+{
+ /* uninitialized instance, can't race */
+ if (!napi->poll_list.next)
+ return;
+
+ /* SCHED bit is set on disabled instances */
+ WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state));
+ WARN_ON(READ_ONCE(napi->list_owner) != -1);
+}
+
+void kick_defer_list_purge(unsigned int cpu);
+
+#define XMIT_RECURSION_LIMIT 8
+
+#ifndef CONFIG_PREEMPT_RT
+static inline bool dev_xmit_recursion(void)
+{
+ return unlikely(__this_cpu_read(softnet_data.xmit.recursion) >
+ XMIT_RECURSION_LIMIT);
+}
+
+static inline void dev_xmit_recursion_inc(void)
+{
+ __this_cpu_inc(softnet_data.xmit.recursion);
+}
+
+static inline void dev_xmit_recursion_dec(void)
+{
+ __this_cpu_dec(softnet_data.xmit.recursion);
+}
+#else
+static inline bool dev_xmit_recursion(void)
+{
+ return unlikely(current->net_xmit.recursion > XMIT_RECURSION_LIMIT);
+}
+
+static inline void dev_xmit_recursion_inc(void)
+{
+ current->net_xmit.recursion++;
+}
+
+static inline void dev_xmit_recursion_dec(void)
+{
+ current->net_xmit.recursion--;
+}
+#endif
+
+int dev_set_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg,
+ struct netlink_ext_ack *extack);
+int dev_get_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg);
+int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg);
+
+#endif
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index d884d8f5f0e5..76c91f224886 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/core/dev_addr_lists.c - Functions for handling net device lists
* Copyright (c) 2010 Jiri Pirko <jpirko@redhat.com>
*
* This file contains functions for working with unicast, multicast and device
* addresses lists.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/netdevice.h>
@@ -16,14 +12,44 @@
#include <linux/export.h>
#include <linux/list.h>
+#include "dev.h"
+
/*
* General list handling functions
*/
-static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
- const unsigned char *addr, int addr_len,
- unsigned char addr_type, bool global,
- bool sync)
+static int __hw_addr_insert(struct netdev_hw_addr_list *list,
+ struct netdev_hw_addr *new, int addr_len)
+{
+ struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL;
+ struct netdev_hw_addr *ha;
+
+ while (*ins_point) {
+ int diff;
+
+ ha = rb_entry(*ins_point, struct netdev_hw_addr, node);
+ diff = memcmp(new->addr, ha->addr, addr_len);
+ if (diff == 0)
+ diff = memcmp(&new->type, &ha->type, sizeof(new->type));
+
+ parent = *ins_point;
+ if (diff < 0)
+ ins_point = &parent->rb_left;
+ else if (diff > 0)
+ ins_point = &parent->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node_rcu(&new->node, parent, ins_point);
+ rb_insert_color(&new->node, &list->tree);
+
+ return 0;
+}
+
+static struct netdev_hw_addr*
+__hw_addr_create(const unsigned char *addr, int addr_len,
+ unsigned char addr_type, bool global, bool sync)
{
struct netdev_hw_addr *ha;
int alloc_size;
@@ -33,32 +59,44 @@ static int __hw_addr_create_ex(struct netdev_hw_addr_list *list,
alloc_size = L1_CACHE_BYTES;
ha = kmalloc(alloc_size, GFP_ATOMIC);
if (!ha)
- return -ENOMEM;
+ return NULL;
memcpy(ha->addr, addr, addr_len);
ha->type = addr_type;
ha->refcount = 1;
ha->global_use = global;
ha->synced = sync ? 1 : 0;
ha->sync_cnt = 0;
- list_add_tail_rcu(&ha->list, &list->list);
- list->count++;
- return 0;
+ return ha;
}
static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
const unsigned char *addr, int addr_len,
unsigned char addr_type, bool global, bool sync,
- int sync_count)
+ int sync_count, bool exclusive)
{
+ struct rb_node **ins_point = &list->tree.rb_node, *parent = NULL;
struct netdev_hw_addr *ha;
if (addr_len > MAX_ADDR_LEN)
return -EINVAL;
- list_for_each_entry(ha, &list->list, list) {
- if (ha->type == addr_type &&
- !memcmp(ha->addr, addr, addr_len)) {
+ while (*ins_point) {
+ int diff;
+
+ ha = rb_entry(*ins_point, struct netdev_hw_addr, node);
+ diff = memcmp(addr, ha->addr, addr_len);
+ if (diff == 0)
+ diff = memcmp(&addr_type, &ha->type, sizeof(addr_type));
+
+ parent = *ins_point;
+ if (diff < 0) {
+ ins_point = &parent->rb_left;
+ } else if (diff > 0) {
+ ins_point = &parent->rb_right;
+ } else {
+ if (exclusive)
+ return -EEXIST;
if (global) {
/* check if addr is already used as global */
if (ha->global_use)
@@ -77,8 +115,17 @@ static int __hw_addr_add_ex(struct netdev_hw_addr_list *list,
}
}
- return __hw_addr_create_ex(list, addr, addr_len, addr_type, global,
- sync);
+ ha = __hw_addr_create(addr, addr_len, addr_type, global, sync);
+ if (!ha)
+ return -ENOMEM;
+
+ rb_link_node(&ha->node, parent, ins_point);
+ rb_insert_color(&ha->node, &list->tree);
+
+ list_add_tail_rcu(&ha->list, &list->list);
+ list->count++;
+
+ return 0;
}
static int __hw_addr_add(struct netdev_hw_addr_list *list,
@@ -86,7 +133,7 @@ static int __hw_addr_add(struct netdev_hw_addr_list *list,
unsigned char addr_type)
{
return __hw_addr_add_ex(list, addr, addr_len, addr_type, false, false,
- 0);
+ 0, false);
}
static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
@@ -107,24 +154,50 @@ static int __hw_addr_del_entry(struct netdev_hw_addr_list *list,
if (--ha->refcount)
return 0;
+
+ rb_erase(&ha->node, &list->tree);
+
list_del_rcu(&ha->list);
kfree_rcu(ha, rcu_head);
list->count--;
return 0;
}
+static struct netdev_hw_addr *__hw_addr_lookup(struct netdev_hw_addr_list *list,
+ const unsigned char *addr, int addr_len,
+ unsigned char addr_type)
+{
+ struct rb_node *node;
+
+ node = list->tree.rb_node;
+
+ while (node) {
+ struct netdev_hw_addr *ha = rb_entry(node, struct netdev_hw_addr, node);
+ int diff = memcmp(addr, ha->addr, addr_len);
+
+ if (diff == 0 && addr_type)
+ diff = memcmp(&addr_type, &ha->type, sizeof(addr_type));
+
+ if (diff < 0)
+ node = node->rb_left;
+ else if (diff > 0)
+ node = node->rb_right;
+ else
+ return ha;
+ }
+
+ return NULL;
+}
+
static int __hw_addr_del_ex(struct netdev_hw_addr_list *list,
const unsigned char *addr, int addr_len,
unsigned char addr_type, bool global, bool sync)
{
- struct netdev_hw_addr *ha;
+ struct netdev_hw_addr *ha = __hw_addr_lookup(list, addr, addr_len, addr_type);
- list_for_each_entry(ha, &list->list, list) {
- if (!memcmp(ha->addr, addr, addr_len) &&
- (ha->type == addr_type || !addr_type))
- return __hw_addr_del_entry(list, ha, global, sync);
- }
- return -ENOENT;
+ if (!ha)
+ return -ENOENT;
+ return __hw_addr_del_entry(list, ha, global, sync);
}
static int __hw_addr_del(struct netdev_hw_addr_list *list,
@@ -141,7 +214,7 @@ static int __hw_addr_sync_one(struct netdev_hw_addr_list *to_list,
int err;
err = __hw_addr_add_ex(to_list, ha->addr, addr_len, ha->type,
- false, true, ha->sync_cnt);
+ false, true, ha->sync_cnt, false);
if (err && err != -EEXIST)
return err;
@@ -169,9 +242,9 @@ static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list,
__hw_addr_del_entry(from_list, ha, false, false);
}
-static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
- struct netdev_hw_addr_list *from_list,
- int addr_len)
+int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
{
int err = 0;
struct netdev_hw_addr *ha, *tmp;
@@ -187,9 +260,10 @@ static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
}
return err;
}
+EXPORT_SYMBOL(__hw_addr_sync_multiple);
/* This function only works where there is a strict 1-1 relationship
- * between source and destionation of they synch. If you ever need to
+ * between source and destination of the sync. If you ever need to
* sync addresses to more then 1 destination, you need to use
* __hw_addr_sync_multiple().
*/
@@ -226,13 +300,13 @@ void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
EXPORT_SYMBOL(__hw_addr_unsync);
/**
- * __hw_addr_sync_dev - Synchonize device's multicast list
- * @list: address list to syncronize
+ * __hw_addr_sync_dev - Synchronize device's multicast list
+ * @list: address list to synchronize
* @dev: device to sync
* @sync: function to call if address should be added
* @unsync: function to call if address should be removed
*
- * This funciton is intended to be called from the ndo_set_rx_mode
+ * This function is intended to be called from the ndo_set_rx_mode
* function of devices that require explicit address add/remove
* notifications. The unsync function may be NULL in which case
* the addresses requiring removal will simply be removed without
@@ -278,6 +352,103 @@ int __hw_addr_sync_dev(struct netdev_hw_addr_list *list,
EXPORT_SYMBOL(__hw_addr_sync_dev);
/**
+ * __hw_addr_ref_sync_dev - Synchronize device's multicast address list taking
+ * into account references
+ * @list: address list to synchronize
+ * @dev: device to sync
+ * @sync: function to call if address or reference on it should be added
+ * @unsync: function to call if address or some reference on it should be removed
+ *
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of devices that require explicit address or references on it
+ * add/remove notifications. The unsync function may be NULL in which case
+ * the addresses or references on it requiring removal will simply be
+ * removed without any notification to the device. It is the responsibility of
+ * the driver to identify and distribute address or references on it between
+ * internal address tables.
+ **/
+int __hw_addr_ref_sync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*sync)(struct net_device *,
+ const unsigned char *, int),
+ int (*unsync)(struct net_device *,
+ const unsigned char *, int))
+{
+ struct netdev_hw_addr *ha, *tmp;
+ int err, ref_cnt;
+
+ /* first go through and flush out any unsynced/stale entries */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ /* sync if address is not used */
+ if ((ha->sync_cnt << 1) <= ha->refcount)
+ continue;
+
+ /* if fails defer unsyncing address */
+ ref_cnt = ha->refcount - ha->sync_cnt;
+ if (unsync && unsync(dev, ha->addr, ref_cnt))
+ continue;
+
+ ha->refcount = (ref_cnt << 1) + 1;
+ ha->sync_cnt = ref_cnt;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+
+ /* go through and sync updated/new entries to the list */
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ /* sync if address added or reused */
+ if ((ha->sync_cnt << 1) >= ha->refcount)
+ continue;
+
+ ref_cnt = ha->refcount - ha->sync_cnt;
+ err = sync(dev, ha->addr, ref_cnt);
+ if (err)
+ return err;
+
+ ha->refcount = ref_cnt << 1;
+ ha->sync_cnt = ref_cnt;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(__hw_addr_ref_sync_dev);
+
+/**
+ * __hw_addr_ref_unsync_dev - Remove synchronized addresses and references on
+ * it from device
+ * @list: address list to remove synchronized addresses (references on it) from
+ * @dev: device to sync
+ * @unsync: function to call if address and references on it should be removed
+ *
+ * Remove all addresses that were added to the device by
+ * __hw_addr_ref_sync_dev(). This function is intended to be called from the
+ * ndo_stop or ndo_open functions on devices that require explicit address (or
+ * references on it) add/remove notifications. If the unsync function pointer
+ * is NULL then this function can be used to just reset the sync_cnt for the
+ * addresses in the list.
+ **/
+void __hw_addr_ref_unsync_dev(struct netdev_hw_addr_list *list,
+ struct net_device *dev,
+ int (*unsync)(struct net_device *,
+ const unsigned char *, int))
+{
+ struct netdev_hw_addr *ha, *tmp;
+
+ list_for_each_entry_safe(ha, tmp, &list->list, list) {
+ if (!ha->sync_cnt)
+ continue;
+
+ /* if fails defer unsyncing address */
+ if (unsync && unsync(dev, ha->addr, ha->sync_cnt))
+ continue;
+
+ ha->refcount -= ha->sync_cnt - 1;
+ ha->sync_cnt = 0;
+ __hw_addr_del_entry(list, ha, false, false);
+ }
+}
+EXPORT_SYMBOL(__hw_addr_ref_unsync_dev);
+
+/**
* __hw_addr_unsync_dev - Remove synchronized addresses from device
* @list: address list to remove synchronized addresses from
* @dev: device to sync
@@ -314,6 +485,7 @@ static void __hw_addr_flush(struct netdev_hw_addr_list *list)
{
struct netdev_hw_addr *ha, *tmp;
+ list->tree = RB_ROOT;
list_for_each_entry_safe(ha, tmp, &list->list, list) {
list_del_rcu(&ha->list);
kfree_rcu(ha, rcu_head);
@@ -325,6 +497,7 @@ void __hw_addr_init(struct netdev_hw_addr_list *list)
{
INIT_LIST_HEAD(&list->list);
list->count = 0;
+ list->tree = RB_ROOT;
}
EXPORT_SYMBOL(__hw_addr_init);
@@ -332,6 +505,21 @@ EXPORT_SYMBOL(__hw_addr_init);
* Device addresses handling functions
*/
+/* Check that netdev->dev_addr is not written to directly as this would
+ * break the rbtree layout. All changes should go thru dev_addr_set() and co.
+ * Remove this check in mid-2024.
+ */
+void dev_addr_check(struct net_device *dev)
+{
+ if (!memcmp(dev->dev_addr, dev->dev_addr_shadow, MAX_ADDR_LEN))
+ return;
+
+ netdev_warn(dev, "Current addr: %*ph\n", MAX_ADDR_LEN, dev->dev_addr);
+ netdev_warn(dev, "Expected addr: %*ph\n",
+ MAX_ADDR_LEN, dev->dev_addr_shadow);
+ netdev_WARN(dev, "Incorrect netdev->dev_addr\n");
+}
+
/**
* dev_addr_flush - Flush device address list
* @dev: device
@@ -343,11 +531,11 @@ EXPORT_SYMBOL(__hw_addr_init);
void dev_addr_flush(struct net_device *dev)
{
/* rtnl_mutex must be held here */
+ dev_addr_check(dev);
__hw_addr_flush(&dev->dev_addrs);
dev->dev_addr = NULL;
}
-EXPORT_SYMBOL(dev_addr_flush);
/**
* dev_addr_init - Init device address list
@@ -381,7 +569,21 @@ int dev_addr_init(struct net_device *dev)
}
return err;
}
-EXPORT_SYMBOL(dev_addr_init);
+
+void dev_addr_mod(struct net_device *dev, unsigned int offset,
+ const void *addr, size_t len)
+{
+ struct netdev_hw_addr *ha;
+
+ dev_addr_check(dev);
+
+ ha = container_of(dev->dev_addr, struct netdev_hw_addr, addr[0]);
+ rb_erase(&ha->node, &dev->dev_addrs.tree);
+ memcpy(&ha->addr[offset], addr, len);
+ memcpy(&dev->dev_addr_shadow[offset], addr, len);
+ WARN_ON(__hw_addr_insert(&dev->dev_addrs, ha, dev->addr_len));
+}
+EXPORT_SYMBOL(dev_addr_mod);
/**
* dev_addr_add - Add a device address
@@ -401,6 +603,9 @@ int dev_addr_add(struct net_device *dev, const unsigned char *addr,
ASSERT_RTNL();
+ err = netif_pre_changeaddr_notify(dev, addr, NULL);
+ if (err)
+ return err;
err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
if (!err)
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
@@ -456,22 +661,14 @@ EXPORT_SYMBOL(dev_addr_del);
*/
int dev_uc_add_excl(struct net_device *dev, const unsigned char *addr)
{
- struct netdev_hw_addr *ha;
int err;
netif_addr_lock_bh(dev);
- list_for_each_entry(ha, &dev->uc.list, list) {
- if (!memcmp(ha->addr, addr, dev->addr_len) &&
- ha->type == NETDEV_HW_ADDR_T_UNICAST) {
- err = -EEXIST;
- goto out;
- }
- }
- err = __hw_addr_create_ex(&dev->uc, addr, dev->addr_len,
- NETDEV_HW_ADDR_T_UNICAST, true, false);
+ err = __hw_addr_add_ex(&dev->uc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_UNICAST, true, false,
+ 0, true);
if (!err)
__dev_set_rx_mode(dev);
-out:
netif_addr_unlock_bh(dev);
return err;
}
@@ -541,7 +738,7 @@ int dev_uc_sync(struct net_device *to, struct net_device *from)
if (to->addr_len != from->addr_len)
return -EINVAL;
- netif_addr_lock_nested(to);
+ netif_addr_lock(to);
err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
if (!err)
__dev_set_rx_mode(to);
@@ -571,7 +768,7 @@ int dev_uc_sync_multiple(struct net_device *to, struct net_device *from)
if (to->addr_len != from->addr_len)
return -EINVAL;
- netif_addr_lock_nested(to);
+ netif_addr_lock(to);
err = __hw_addr_sync_multiple(&to->uc, &from->uc, to->addr_len);
if (!err)
__dev_set_rx_mode(to);
@@ -594,8 +791,17 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from)
if (to->addr_len != from->addr_len)
return;
+ /* netif_addr_lock_bh() uses lockdep subclass 0, this is okay for two
+ * reasons:
+ * 1) This is always called without any addr_list_lock, so as the
+ * outermost one here, it must be 0.
+ * 2) This is called by some callers after unlinking the upper device,
+ * so the dev->lower_level becomes 1 again.
+ * Therefore, the subclass for 'from' is 0, for 'to' is either 1 or
+ * larger.
+ */
netif_addr_lock_bh(from);
- netif_addr_lock_nested(to);
+ netif_addr_lock(to);
__hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
__dev_set_rx_mode(to);
netif_addr_unlock(to);
@@ -618,7 +824,7 @@ void dev_uc_flush(struct net_device *dev)
EXPORT_SYMBOL(dev_uc_flush);
/**
- * dev_uc_flush - Init unicast address list
+ * dev_uc_init - Init unicast address list
* @dev: device
*
* Init unicast address list.
@@ -640,22 +846,14 @@ EXPORT_SYMBOL(dev_uc_init);
*/
int dev_mc_add_excl(struct net_device *dev, const unsigned char *addr)
{
- struct netdev_hw_addr *ha;
int err;
netif_addr_lock_bh(dev);
- list_for_each_entry(ha, &dev->mc.list, list) {
- if (!memcmp(ha->addr, addr, dev->addr_len) &&
- ha->type == NETDEV_HW_ADDR_T_MULTICAST) {
- err = -EEXIST;
- goto out;
- }
- }
- err = __hw_addr_create_ex(&dev->mc, addr, dev->addr_len,
- NETDEV_HW_ADDR_T_MULTICAST, true, false);
+ err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
+ NETDEV_HW_ADDR_T_MULTICAST, true, false,
+ 0, true);
if (!err)
__dev_set_rx_mode(dev);
-out:
netif_addr_unlock_bh(dev);
return err;
}
@@ -668,7 +866,8 @@ static int __dev_mc_add(struct net_device *dev, const unsigned char *addr,
netif_addr_lock_bh(dev);
err = __hw_addr_add_ex(&dev->mc, addr, dev->addr_len,
- NETDEV_HW_ADDR_T_MULTICAST, global, false, 0);
+ NETDEV_HW_ADDR_T_MULTICAST, global, false,
+ 0, false);
if (!err)
__dev_set_rx_mode(dev);
netif_addr_unlock_bh(dev);
@@ -762,7 +961,7 @@ int dev_mc_sync(struct net_device *to, struct net_device *from)
if (to->addr_len != from->addr_len)
return -EINVAL;
- netif_addr_lock_nested(to);
+ netif_addr_lock(to);
err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len);
if (!err)
__dev_set_rx_mode(to);
@@ -792,7 +991,7 @@ int dev_mc_sync_multiple(struct net_device *to, struct net_device *from)
if (to->addr_len != from->addr_len)
return -EINVAL;
- netif_addr_lock_nested(to);
+ netif_addr_lock(to);
err = __hw_addr_sync_multiple(&to->mc, &from->mc, to->addr_len);
if (!err)
__dev_set_rx_mode(to);
@@ -815,8 +1014,9 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from)
if (to->addr_len != from->addr_len)
return;
+ /* See the above comments inside dev_uc_unsync(). */
netif_addr_lock_bh(from);
- netif_addr_lock_nested(to);
+ netif_addr_lock(to);
__hw_addr_unsync(&to->mc, &from->mc, to->addr_len);
__dev_set_rx_mode(to);
netif_addr_unlock(to);
diff --git a/net/core/dev_addr_lists_test.c b/net/core/dev_addr_lists_test.c
new file mode 100644
index 000000000000..8e1dba825e94
--- /dev/null
+++ b/net/core/dev_addr_lists_test.c
@@ -0,0 +1,247 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <kunit/test.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+
+static const struct net_device_ops dummy_netdev_ops = {
+};
+
+struct dev_addr_test_priv {
+ u32 addr_seen;
+};
+
+static int dev_addr_test_sync(struct net_device *netdev, const unsigned char *a)
+{
+ struct dev_addr_test_priv *datp = netdev_priv(netdev);
+
+ if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN))
+ datp->addr_seen |= 1 << a[0];
+ return 0;
+}
+
+static int dev_addr_test_unsync(struct net_device *netdev,
+ const unsigned char *a)
+{
+ struct dev_addr_test_priv *datp = netdev_priv(netdev);
+
+ if (a[0] < 31 && !memchr_inv(a, a[0], ETH_ALEN))
+ datp->addr_seen &= ~(1 << a[0]);
+ return 0;
+}
+
+static int dev_addr_test_init(struct kunit *test)
+{
+ struct dev_addr_test_priv *datp;
+ struct net_device *netdev;
+ int err;
+
+ netdev = alloc_etherdev(sizeof(*datp));
+ KUNIT_ASSERT_TRUE(test, !!netdev);
+
+ test->priv = netdev;
+ netdev->netdev_ops = &dummy_netdev_ops;
+
+ err = register_netdev(netdev);
+ if (err) {
+ free_netdev(netdev);
+ KUNIT_FAIL(test, "Can't register netdev %d", err);
+ }
+
+ return 0;
+}
+
+static void dev_addr_test_exit(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+
+ unregister_netdev(netdev);
+ free_netdev(netdev);
+}
+
+static void dev_addr_test_basic(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ u8 addr[ETH_ALEN];
+
+ rtnl_lock();
+ KUNIT_EXPECT_TRUE(test, !!netdev->dev_addr);
+
+ memset(addr, 2, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+ KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr));
+
+ memset(addr, 3, sizeof(addr));
+ dev_addr_set(netdev, addr);
+ KUNIT_EXPECT_MEMEQ(test, netdev->dev_addr, addr, sizeof(addr));
+ rtnl_unlock();
+}
+
+static void dev_addr_test_sync_one(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ u8 addr[ETH_ALEN];
+
+ datp = netdev_priv(netdev);
+
+ rtnl_lock();
+ memset(addr, 1, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 2, datp->addr_seen);
+
+ memset(addr, 2, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+
+ datp->addr_seen = 0;
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ /* It's not going to sync anything because the main address is
+ * considered synced and we overwrite in place.
+ */
+ KUNIT_EXPECT_EQ(test, 0, datp->addr_seen);
+ rtnl_unlock();
+}
+
+static void dev_addr_test_add_del(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ u8 addr[ETH_ALEN];
+ int i;
+
+ datp = netdev_priv(netdev);
+
+ rtnl_lock();
+ for (i = 1; i < 4; i++) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ }
+ /* Add 3 again */
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0xf, datp->addr_seen);
+
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0xf, datp->addr_seen);
+
+ for (i = 1; i < 4; i++) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ }
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 1, datp->addr_seen);
+ rtnl_unlock();
+}
+
+static void dev_addr_test_del_main(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ u8 addr[ETH_ALEN];
+
+ rtnl_lock();
+ memset(addr, 1, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+
+ KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ KUNIT_EXPECT_EQ(test, -ENOENT, dev_addr_del(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ rtnl_unlock();
+}
+
+static void dev_addr_test_add_set(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ struct dev_addr_test_priv *datp;
+ u8 addr[ETH_ALEN];
+ int i;
+
+ datp = netdev_priv(netdev);
+
+ rtnl_lock();
+ /* There is no external API like dev_addr_add_excl(),
+ * so shuffle the tree a little bit and exploit aliasing.
+ */
+ for (i = 1; i < 16; i++) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ }
+
+ memset(addr, i, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+ KUNIT_EXPECT_EQ(test, 0, dev_addr_add(netdev, addr,
+ NETDEV_HW_ADDR_T_LAN));
+ memset(addr, 0, sizeof(addr));
+ eth_hw_addr_set(netdev, addr);
+
+ __hw_addr_sync_dev(&netdev->dev_addrs, netdev, dev_addr_test_sync,
+ dev_addr_test_unsync);
+ KUNIT_EXPECT_EQ(test, 0xffff, datp->addr_seen);
+ rtnl_unlock();
+}
+
+static void dev_addr_test_add_excl(struct kunit *test)
+{
+ struct net_device *netdev = test->priv;
+ u8 addr[ETH_ALEN];
+ int i;
+
+ rtnl_lock();
+ for (i = 0; i < 10; i++) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_add_excl(netdev, addr));
+ }
+ KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr));
+
+ for (i = 0; i < 10; i += 2) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, 0, dev_uc_del(netdev, addr));
+ }
+ for (i = 1; i < 10; i += 2) {
+ memset(addr, i, sizeof(addr));
+ KUNIT_EXPECT_EQ(test, -EEXIST, dev_uc_add_excl(netdev, addr));
+ }
+ rtnl_unlock();
+}
+
+static struct kunit_case dev_addr_test_cases[] = {
+ KUNIT_CASE(dev_addr_test_basic),
+ KUNIT_CASE(dev_addr_test_sync_one),
+ KUNIT_CASE(dev_addr_test_add_del),
+ KUNIT_CASE(dev_addr_test_del_main),
+ KUNIT_CASE(dev_addr_test_add_set),
+ KUNIT_CASE(dev_addr_test_add_excl),
+ {}
+};
+
+static struct kunit_suite dev_addr_test_suite = {
+ .name = "dev-addr-list-test",
+ .test_cases = dev_addr_test_cases,
+ .init = dev_addr_test_init,
+ .exit = dev_addr_test_exit,
+};
+kunit_test_suite(dev_addr_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests for struct netdev_hw_addr_list");
+MODULE_LICENSE("GPL");
diff --git a/net/core/dev_api.c b/net/core/dev_api.c
new file mode 100644
index 000000000000..f28852078aa6
--- /dev/null
+++ b/net/core/dev_api.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/netdevice.h>
+#include <net/netdev_lock.h>
+
+#include "dev.h"
+
+/**
+ * dev_change_name() - change name of a device
+ * @dev: device
+ * @newname: name (or format string) must be at least IFNAMSIZ
+ *
+ * Change name of a device, can pass format strings "eth%d"
+ * for wildcarding.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_name(struct net_device *dev, const char *newname)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_name(dev, newname);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+/**
+ * dev_set_alias() - change ifalias of a device
+ * @dev: device
+ * @alias: name up to IFALIASZ
+ * @len: limit of bytes to copy from @alias
+ *
+ * Set ifalias for a device.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_alias(dev, alias, len);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_alias);
+
+/**
+ * dev_change_flags() - change device settings
+ * @dev: device
+ * @flags: device state flags
+ * @extack: netlink extended ack
+ *
+ * Change settings on device based state flags. The flags are
+ * in the userspace exported format.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_flags(struct net_device *dev, unsigned int flags,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_flags(dev, flags, extack);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_change_flags);
+
+/**
+ * dev_set_group() - change group this device belongs to
+ * @dev: device
+ * @new_group: group this device should belong to
+ */
+void dev_set_group(struct net_device *dev, int new_group)
+{
+ netdev_lock_ops(dev);
+ netif_set_group(dev, new_group);
+ netdev_unlock_ops(dev);
+}
+
+int dev_set_mac_address_user(struct net_device *dev,
+ struct sockaddr_storage *ss,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ down_write(&dev_addr_sem);
+ netdev_lock_ops(dev);
+ ret = netif_set_mac_address(dev, ss, extack);
+ netdev_unlock_ops(dev);
+ up_write(&dev_addr_sem);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_mac_address_user);
+
+/**
+ * dev_change_net_namespace() - move device to different network namespace
+ * @dev: device
+ * @net: network namespace
+ * @pat: If not NULL name pattern to try if the current device name
+ * is already taken in the destination network namespace.
+ *
+ * This function shuts down a device interface and moves it
+ * to a new network namespace. On success 0 is returned, on
+ * a failure a negative errno code is returned.
+ *
+ * Callers must hold the rtnl semaphore.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_net_namespace(struct net_device *dev, struct net *net,
+ const char *pat)
+{
+ return __dev_change_net_namespace(dev, net, pat, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(dev_change_net_namespace);
+
+/**
+ * dev_change_carrier() - change device carrier
+ * @dev: device
+ * @new_carrier: new value
+ *
+ * Change device carrier
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_carrier(struct net_device *dev, bool new_carrier)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_carrier(dev, new_carrier);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+/**
+ * dev_change_tx_queue_len() - change TX queue length of a netdevice
+ * @dev: device
+ * @new_len: new tx queue length
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_tx_queue_len(dev, new_len);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+/**
+ * dev_change_proto_down() - set carrier according to proto_down
+ * @dev: device
+ * @proto_down: new value
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_change_proto_down(struct net_device *dev, bool proto_down)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_change_proto_down(dev, proto_down);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+/**
+ * dev_open() - prepare an interface for use
+ * @dev: device to open
+ * @extack: netlink extended ack
+ *
+ * Takes a device from down to up state. The device's private open
+ * function is invoked and then the multicast lists are loaded. Finally
+ * the device is moved into the up state and a %NETDEV_UP message is
+ * sent to the netdev notifier chain.
+ *
+ * Calling this function on an active interface is a nop. On a failure
+ * a negative errno code is returned.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_open(dev, extack);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_open);
+
+/**
+ * dev_close() - shutdown an interface
+ * @dev: device to shutdown
+ *
+ * This function moves an active device into down state. A
+ * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
+ * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
+ * chain.
+ */
+void dev_close(struct net_device *dev)
+{
+ netdev_lock_ops(dev);
+ netif_close(dev);
+ netdev_unlock_ops(dev);
+}
+EXPORT_SYMBOL(dev_close);
+
+int dev_eth_ioctl(struct net_device *dev,
+ struct ifreq *ifr, unsigned int cmd)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ int ret = -ENODEV;
+
+ if (!ops->ndo_eth_ioctl)
+ return -EOPNOTSUPP;
+
+ netdev_lock_ops(dev);
+ if (netif_device_present(dev))
+ ret = ops->ndo_eth_ioctl(dev, ifr, cmd);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_eth_ioctl);
+
+int dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_mtu(dev, new_mtu);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_mtu);
+
+/**
+ * dev_disable_lro() - disable Large Receive Offload on a device
+ * @dev: device
+ *
+ * Disable Large Receive Offload (LRO) on a net device. Must be
+ * called under RTNL. This is needed if received packets may be
+ * forwarded to another interface.
+ */
+void dev_disable_lro(struct net_device *dev)
+{
+ netdev_lock_ops(dev);
+ netif_disable_lro(dev);
+ netdev_unlock_ops(dev);
+}
+EXPORT_SYMBOL(dev_disable_lro);
+
+/**
+ * dev_set_promiscuity() - update promiscuity count on a device
+ * @dev: device
+ * @inc: modifier
+ *
+ * Add or remove promiscuity from a device. While the count in the device
+ * remains above zero the interface remains promiscuous. Once it hits zero
+ * the device reverts back to normal filtering operation. A negative inc
+ * value is used to drop promiscuity on the device.
+ * Return: 0 if successful or a negative errno code on error.
+ */
+int dev_set_promiscuity(struct net_device *dev, int inc)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_promiscuity(dev, inc);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_promiscuity);
+
+/**
+ * dev_set_allmulti() - update allmulti count on a device
+ * @dev: device
+ * @inc: modifier
+ *
+ * Add or remove reception of all multicast frames to a device. While the
+ * count in the device remains above zero the interface remains listening
+ * to all multicast frames. Once it hits zero the device reverts back to normal
+ * filtering operation. A negative @inc value is used to drop the counter
+ * when releasing a resource needing all multicasts.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+
+int dev_set_allmulti(struct net_device *dev, int inc)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_allmulti(dev, inc, true);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_allmulti);
+
+/**
+ * dev_set_mac_address() - change Media Access Control Address
+ * @dev: device
+ * @ss: new address
+ * @extack: netlink extended ack
+ *
+ * Change the hardware (MAC) address of the device
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int dev_set_mac_address(struct net_device *dev, struct sockaddr_storage *ss,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_set_mac_address(dev, ss, extack);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_mac_address);
+
+int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
+{
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = netif_xdp_propagate(dev, bpf);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dev_xdp_propagate);
+
+/**
+ * netdev_state_change() - device changes state
+ * @dev: device to cause notification
+ *
+ * Called to indicate a device has changed state. This function calls
+ * the notifier chains for netdev_chain and sends a NEWLINK message
+ * to the routing socket.
+ */
+void netdev_state_change(struct net_device *dev)
+{
+ netdev_lock_ops(dev);
+ netif_state_change(dev);
+ netdev_unlock_ops(dev);
+}
+EXPORT_SYMBOL(netdev_state_change);
+
+int dev_set_threaded(struct net_device *dev,
+ enum netdev_napi_threaded threaded)
+{
+ int ret;
+
+ netdev_lock(dev);
+ ret = netif_set_threaded(dev, threaded);
+ netdev_unlock(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(dev_set_threaded);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 90e8aa36881e..53a53357cfef 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -1,12 +1,20 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kmod.h>
#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/net_tstamp.h>
+#include <linux/phylib_stubs.h>
+#include <linux/ptp_clock_kernel.h>
#include <linux/wireless.h>
+#include <linux/if_bridge.h>
+#include <net/dsa_stubs.h>
+#include <net/netdev_lock.h>
#include <net/wext.h>
+#include "dev.h"
+
/*
* Map an interface index to its name (SIOCGIFNAME)
*/
@@ -24,79 +32,108 @@ static int dev_ifname(struct net *net, struct ifreq *ifr)
return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex);
}
-static gifconf_func_t *gifconf_list[NPROTO];
-
-/**
- * register_gifconf - register a SIOCGIF handler
- * @family: Address family
- * @gifconf: Function handler
- *
- * Register protocol dependent address dumping routines. The handler
- * that is passed must not be freed or reused until it has been replaced
- * by another handler.
- */
-int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
-{
- if (family >= NPROTO)
- return -EINVAL;
- gifconf_list[family] = gifconf;
- return 0;
-}
-EXPORT_SYMBOL(register_gifconf);
-
/*
* Perform a SIOCGIFCONF call. This structure will change
* size eventually, and there is nothing I can do about it.
* Thus we will need a 'compatibility mode'.
*/
-
-int dev_ifconf(struct net *net, struct ifconf *ifc, int size)
+int dev_ifconf(struct net *net, struct ifconf __user *uifc)
{
struct net_device *dev;
- char __user *pos;
- int len;
- int total;
- int i;
+ void __user *pos;
+ size_t size;
+ int len, total = 0, done;
- /*
- * Fetch the caller's info block.
- */
+ /* both the ifconf and the ifreq structures are slightly different */
+ if (in_compat_syscall()) {
+ struct compat_ifconf ifc32;
- pos = ifc->ifc_buf;
- len = ifc->ifc_len;
+ if (copy_from_user(&ifc32, uifc, sizeof(struct compat_ifconf)))
+ return -EFAULT;
- /*
- * Loop over the interfaces, and write an info block for each.
- */
+ pos = compat_ptr(ifc32.ifcbuf);
+ len = ifc32.ifc_len;
+ size = sizeof(struct compat_ifreq);
+ } else {
+ struct ifconf ifc;
+
+ if (copy_from_user(&ifc, uifc, sizeof(struct ifconf)))
+ return -EFAULT;
+
+ pos = ifc.ifc_buf;
+ len = ifc.ifc_len;
+ size = sizeof(struct ifreq);
+ }
- total = 0;
+ /* Loop over the interfaces, and write an info block for each. */
+ rtnl_net_lock(net);
for_each_netdev(net, dev) {
- for (i = 0; i < NPROTO; i++) {
- if (gifconf_list[i]) {
- int done;
- if (!pos)
- done = gifconf_list[i](dev, NULL, 0, size);
- else
- done = gifconf_list[i](dev, pos + total,
- len - total, size);
- if (done < 0)
- return -EFAULT;
- total += done;
- }
+ if (!pos)
+ done = inet_gifconf(dev, NULL, 0, size);
+ else
+ done = inet_gifconf(dev, pos + total,
+ len - total, size);
+ if (done < 0) {
+ rtnl_net_unlock(net);
+ return -EFAULT;
}
+ total += done;
}
+ rtnl_net_unlock(net);
- /*
- * All done. Write the updated control block back to the caller.
- */
- ifc->ifc_len = total;
+ return put_user(total, &uifc->ifc_len);
+}
+
+static int dev_getifmap(struct net_device *dev, struct ifreq *ifr)
+{
+ struct ifmap *ifmap = &ifr->ifr_map;
+
+ if (in_compat_syscall()) {
+ struct compat_ifmap *cifmap = (struct compat_ifmap *)ifmap;
+
+ cifmap->mem_start = dev->mem_start;
+ cifmap->mem_end = dev->mem_end;
+ cifmap->base_addr = dev->base_addr;
+ cifmap->irq = dev->irq;
+ cifmap->dma = dev->dma;
+ cifmap->port = dev->if_port;
+
+ return 0;
+ }
+
+ ifmap->mem_start = dev->mem_start;
+ ifmap->mem_end = dev->mem_end;
+ ifmap->base_addr = dev->base_addr;
+ ifmap->irq = dev->irq;
+ ifmap->dma = dev->dma;
+ ifmap->port = dev->if_port;
- /*
- * Both BSD and Solaris return 0 here, so we do too.
- */
return 0;
}
+static int netif_setifmap(struct net_device *dev, struct ifreq *ifr)
+{
+ struct compat_ifmap *cifmap = (struct compat_ifmap *)&ifr->ifr_map;
+
+ if (!dev->netdev_ops->ndo_set_config)
+ return -EOPNOTSUPP;
+
+ if (in_compat_syscall()) {
+ struct ifmap ifmap = {
+ .mem_start = cifmap->mem_start,
+ .mem_end = cifmap->mem_end,
+ .base_addr = cifmap->base_addr,
+ .irq = cifmap->irq,
+ .dma = cifmap->dma,
+ .port = cifmap->port,
+ };
+
+ return dev->netdev_ops->ndo_set_config(dev, &ifmap);
+ }
+
+ return dev->netdev_ops->ndo_set_config(dev, &ifr->ifr_map);
+}
+
/*
* Perform the SIOCxIFxxx calls, inside rcu_read_lock()
*/
@@ -110,7 +147,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
switch (cmd) {
case SIOCGIFFLAGS: /* Get interface flags */
- ifr->ifr_flags = (short) dev_get_flags(dev);
+ ifr->ifr_flags = (short)netif_get_flags(dev);
return 0;
case SIOCGIFMETRIC: /* Get the metric on the interface
@@ -122,29 +159,12 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
ifr->ifr_mtu = dev->mtu;
return 0;
- case SIOCGIFHWADDR:
- if (!dev->addr_len)
- memset(ifr->ifr_hwaddr.sa_data, 0,
- sizeof(ifr->ifr_hwaddr.sa_data));
- else
- memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
- min(sizeof(ifr->ifr_hwaddr.sa_data),
- (size_t)dev->addr_len));
- ifr->ifr_hwaddr.sa_family = dev->type;
- return 0;
-
case SIOCGIFSLAVE:
err = -EINVAL;
break;
case SIOCGIFMAP:
- ifr->ifr_map.mem_start = dev->mem_start;
- ifr->ifr_map.mem_end = dev->mem_end;
- ifr->ifr_map.base_addr = dev->base_addr;
- ifr->ifr_map.irq = dev->irq;
- ifr->ifr_map.dma = dev->dma;
- ifr->ifr_map.port = dev->if_port;
- return 0;
+ return dev_getifmap(dev, ifr);
case SIOCGIFINDEX:
ifr->ifr_ifindex = dev->ifindex;
@@ -166,29 +186,29 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
return err;
}
-static int net_hwtstamp_validate(struct ifreq *ifr)
+int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg)
{
- struct hwtstamp_config cfg;
enum hwtstamp_tx_types tx_type;
enum hwtstamp_rx_filters rx_filter;
int tx_type_valid = 0;
int rx_filter_valid = 0;
- if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
- return -EFAULT;
-
- if (cfg.flags) /* reserved for future extensions */
+ if (cfg->flags & ~HWTSTAMP_FLAG_MASK)
return -EINVAL;
- tx_type = cfg.tx_type;
- rx_filter = cfg.rx_filter;
+ tx_type = cfg->tx_type;
+ rx_filter = cfg->rx_filter;
switch (tx_type) {
case HWTSTAMP_TX_OFF:
case HWTSTAMP_TX_ON:
case HWTSTAMP_TX_ONESTEP_SYNC:
+ case HWTSTAMP_TX_ONESTEP_P2P:
tx_type_valid = 1;
break;
+ case __HWTSTAMP_TX_CNT:
+ /* not a real value */
+ break;
}
switch (rx_filter) {
@@ -210,6 +230,9 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
case HWTSTAMP_FILTER_NTP_ALL:
rx_filter_valid = 1;
break;
+ case __HWTSTAMP_FILTER_CNT:
+ /* not a real value */
+ break;
}
if (!tx_type_valid || !rx_filter_valid)
@@ -218,10 +241,333 @@ static int net_hwtstamp_validate(struct ifreq *ifr)
return 0;
}
+/**
+ * dev_get_hwtstamp_phylib() - Get hardware timestamping settings of NIC
+ * or of attached phylib PHY
+ * @dev: Network device
+ * @cfg: Timestamping configuration structure
+ *
+ * Helper for calling the default hardware provider timestamping.
+ *
+ * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), but
+ * phydev->mii_ts has both hwtstamp_get() and hwtstamp_set() methods. So this
+ * will return -EOPNOTSUPP for phylib only if hwtstamp_get() is not
+ * implemented for now, which is still more accurate than letting the netdev
+ * handle the GET request.
+ */
+int dev_get_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg)
+{
+ struct hwtstamp_provider *hwprov;
+
+ hwprov = rtnl_dereference(dev->hwprov);
+ if (hwprov) {
+ cfg->qualifier = hwprov->desc.qualifier;
+ if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB &&
+ hwprov->phydev)
+ return phy_hwtstamp_get(hwprov->phydev, cfg);
+
+ if (hwprov->source == HWTSTAMP_SOURCE_NETDEV)
+ return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg);
+
+ return -EOPNOTSUPP;
+ }
+
+ if (phy_is_default_hwtstamp(dev->phydev))
+ return phy_hwtstamp_get(dev->phydev, cfg);
+
+ return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg);
+}
+
+static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct kernel_hwtstamp_config kernel_cfg = {};
+ struct hwtstamp_config cfg;
+ int err;
+
+ if (!ops->ndo_hwtstamp_get)
+ return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ kernel_cfg.ifr = ifr;
+ netdev_lock_ops(dev);
+ err = dev_get_hwtstamp_phylib(dev, &kernel_cfg);
+ netdev_unlock_ops(dev);
+ if (err)
+ return err;
+
+ /* If the request was resolved through an unconverted driver, omit
+ * the copy_to_user(), since the implementation has already done that
+ */
+ if (!kernel_cfg.copied_to_user) {
+ hwtstamp_config_from_kernel(&cfg, &kernel_cfg);
+
+ if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/**
+ * dev_set_hwtstamp_phylib() - Change hardware timestamping of NIC
+ * or of attached phylib PHY
+ * @dev: Network device
+ * @cfg: Timestamping configuration structure
+ * @extack: Netlink extended ack message structure, for error reporting
+ *
+ * Helper for enforcing a common policy that phylib timestamping, if available,
+ * should take precedence in front of hardware timestamping provided by the
+ * netdev. If the netdev driver needs to perform specific actions even for PHY
+ * timestamping to work properly (a switch port must trap the timestamped
+ * frames and not forward them), it must set dev->see_all_hwtstamp_requests.
+ */
+int dev_set_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct kernel_hwtstamp_config old_cfg = {};
+ struct hwtstamp_provider *hwprov;
+ struct phy_device *phydev;
+ bool changed = false;
+ bool phy_ts;
+ int err;
+
+ hwprov = rtnl_dereference(dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB &&
+ hwprov->phydev) {
+ phy_ts = true;
+ phydev = hwprov->phydev;
+ } else if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) {
+ phy_ts = false;
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ cfg->qualifier = hwprov->desc.qualifier;
+ } else {
+ phy_ts = phy_is_default_hwtstamp(dev->phydev);
+ if (phy_ts)
+ phydev = dev->phydev;
+ }
+
+ cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV;
+
+ if (phy_ts && dev->see_all_hwtstamp_requests) {
+ err = ops->ndo_hwtstamp_get(dev, &old_cfg);
+ if (err)
+ return err;
+ }
+
+ if (!phy_ts || dev->see_all_hwtstamp_requests) {
+ err = ops->ndo_hwtstamp_set(dev, cfg, extack);
+ if (err) {
+ if (extack->_msg)
+ netdev_err(dev, "%s\n", extack->_msg);
+ return err;
+ }
+ }
+
+ if (phy_ts && dev->see_all_hwtstamp_requests)
+ changed = kernel_hwtstamp_config_changed(&old_cfg, cfg);
+
+ if (phy_ts) {
+ err = phy_hwtstamp_set(phydev, cfg, extack);
+ if (err) {
+ if (changed)
+ ops->ndo_hwtstamp_set(dev, &old_cfg, NULL);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+ struct kernel_hwtstamp_config kernel_cfg = {};
+ struct netlink_ext_ack extack = {};
+ struct hwtstamp_config cfg;
+ int err;
+
+ if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
+ return -EFAULT;
+
+ hwtstamp_config_to_kernel(&kernel_cfg, &cfg);
+ kernel_cfg.ifr = ifr;
+
+ err = net_hwtstamp_validate(&kernel_cfg);
+ if (err)
+ return err;
+
+ err = dsa_conduit_hwtstamp_validate(dev, &kernel_cfg, &extack);
+ if (err) {
+ if (extack._msg)
+ netdev_err(dev, "%s\n", extack._msg);
+ return err;
+ }
+
+ if (!ops->ndo_hwtstamp_set)
+ return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ netdev_lock_ops(dev);
+ err = dev_set_hwtstamp_phylib(dev, &kernel_cfg, &extack);
+ netdev_unlock_ops(dev);
+ if (err)
+ return err;
+
+ /* The driver may have modified the configuration, so copy the
+ * updated version of it back to user space
+ */
+ if (!kernel_cfg.copied_to_user) {
+ hwtstamp_config_from_kernel(&cfg, &kernel_cfg);
+
+ if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd,
+ struct kernel_hwtstamp_config *kernel_cfg)
+{
+ struct ifreq ifrr;
+ int err;
+
+ if (!kernel_cfg->ifr)
+ return -EINVAL;
+
+ strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ);
+ ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru;
+
+ err = dev_eth_ioctl(dev, &ifrr, cmd);
+ if (err)
+ return err;
+
+ kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru;
+ kernel_cfg->copied_to_user = true;
+
+ return 0;
+}
+
+int generic_hwtstamp_get_lower(struct net_device *dev,
+ struct kernel_hwtstamp_config *kernel_cfg)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ if (ops->ndo_hwtstamp_get) {
+ int err;
+
+ netdev_lock_ops(dev);
+ err = dev_get_hwtstamp_phylib(dev, kernel_cfg);
+ netdev_unlock_ops(dev);
+
+ return err;
+ }
+
+ /* Legacy path: unconverted lower driver */
+ return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg);
+}
+EXPORT_SYMBOL(generic_hwtstamp_get_lower);
+
+int generic_hwtstamp_set_lower(struct net_device *dev,
+ struct kernel_hwtstamp_config *kernel_cfg,
+ struct netlink_ext_ack *extack)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ if (ops->ndo_hwtstamp_set) {
+ int err;
+
+ netdev_lock_ops(dev);
+ err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
+ netdev_unlock_ops(dev);
+
+ return err;
+ }
+
+ /* Legacy path: unconverted lower driver */
+ return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg);
+}
+EXPORT_SYMBOL(generic_hwtstamp_set_lower);
+
+static int dev_siocbond(struct net_device *dev,
+ struct ifreq *ifr, unsigned int cmd)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_siocbond) {
+ int ret = -ENODEV;
+
+ netdev_lock_ops(dev);
+ if (netif_device_present(dev))
+ ret = ops->ndo_siocbond(dev, ifr, cmd);
+ netdev_unlock_ops(dev);
+
+ return ret;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int dev_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+ void __user *data, unsigned int cmd)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_siocdevprivate) {
+ int ret = -ENODEV;
+
+ netdev_lock_ops(dev);
+ if (netif_device_present(dev))
+ ret = ops->ndo_siocdevprivate(dev, ifr, data, cmd);
+ netdev_unlock_ops(dev);
+
+ return ret;
+ }
+
+ return -EOPNOTSUPP;
+}
+
+static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ if (ops->ndo_siocwandev) {
+ int ret = -ENODEV;
+
+ netdev_lock_ops(dev);
+ if (netif_device_present(dev))
+ ret = ops->ndo_siocwandev(dev, ifs);
+ netdev_unlock_ops(dev);
+
+ return ret;
+ }
+
+ return -EOPNOTSUPP;
+}
+
/*
- * Perform the SIOCxIFxxx calls, inside rtnl_lock()
+ * Perform the SIOCxIFxxx calls, inside rtnl_net_lock()
*/
-static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
+static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
+ unsigned int cmd)
{
int err;
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
@@ -234,7 +580,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
switch (cmd) {
case SIOCSIFFLAGS: /* Set interface flags */
- return dev_change_flags(dev, ifr->ifr_flags);
+ return dev_change_flags(dev, ifr->ifr_flags, NULL);
case SIOCSIFMETRIC: /* Set the metric on the interface
(currently unused) */
@@ -244,9 +590,11 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
return dev_set_mtu(dev, ifr->ifr_mtu);
case SIOCSIFHWADDR:
- if (dev->addr_len > sizeof(struct sockaddr))
+ if (dev->addr_len > sizeof(ifr->ifr_hwaddr))
return -EINVAL;
- return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
+ return dev_set_mac_address_user(dev,
+ (struct sockaddr_storage *)&ifr->ifr_hwaddr,
+ NULL);
case SIOCSIFHWBROADCAST:
if (ifr->ifr_hwaddr.sa_family != dev->type)
@@ -254,16 +602,16 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
min(sizeof(ifr->ifr_hwaddr.sa_data),
(size_t)dev->addr_len));
+ netdev_lock_ops(dev);
call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+ netdev_unlock_ops(dev);
return 0;
case SIOCSIFMAP:
- if (ops->ndo_set_config) {
- if (!netif_device_present(dev))
- return -ENODEV;
- return ops->ndo_set_config(dev, &ifr->ifr_map);
- }
- return -EOPNOTSUPP;
+ netdev_lock_ops(dev);
+ err = netif_setifmap(dev, ifr);
+ netdev_unlock_ops(dev);
+ return err;
case SIOCADDMULTI:
if (!ops->ndo_set_rx_mode ||
@@ -271,7 +619,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
- return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
+ netdev_lock_ops(dev);
+ err = dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
+ netdev_unlock_ops(dev);
+ return err;
case SIOCDELMULTI:
if (!ops->ndo_set_rx_mode ||
@@ -279,7 +630,10 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
return -EINVAL;
if (!netif_device_present(dev))
return -ENODEV;
- return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
+ netdev_lock_ops(dev);
+ err = dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
+ netdev_unlock_ops(dev);
+ return err;
case SIOCSIFTXQLEN:
if (ifr->ifr_qlen < 0)
@@ -290,42 +644,34 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
ifr->ifr_newname[IFNAMSIZ-1] = '\0';
return dev_change_name(dev, ifr->ifr_newname);
+ case SIOCWANDEV:
+ return dev_siocwandev(dev, &ifr->ifr_settings);
+
+ case SIOCDEVPRIVATE ... SIOCDEVPRIVATE + 15:
+ return dev_siocdevprivate(dev, ifr, data, cmd);
+
case SIOCSHWTSTAMP:
- err = net_hwtstamp_validate(ifr);
- if (err)
- return err;
- /* fall through */
+ return dev_set_hwtstamp(dev, ifr);
- /*
- * Unknown or private ioctl
- */
- default:
- if ((cmd >= SIOCDEVPRIVATE &&
- cmd <= SIOCDEVPRIVATE + 15) ||
- cmd == SIOCBONDENSLAVE ||
- cmd == SIOCBONDRELEASE ||
- cmd == SIOCBONDSETHWADDR ||
- cmd == SIOCBONDSLAVEINFOQUERY ||
- cmd == SIOCBONDINFOQUERY ||
- cmd == SIOCBONDCHANGEACTIVE ||
- cmd == SIOCGMIIPHY ||
- cmd == SIOCGMIIREG ||
- cmd == SIOCSMIIREG ||
- cmd == SIOCBRADDIF ||
- cmd == SIOCBRDELIF ||
- cmd == SIOCSHWTSTAMP ||
- cmd == SIOCGHWTSTAMP ||
- cmd == SIOCWANDEV) {
- err = -EOPNOTSUPP;
- if (ops->ndo_do_ioctl) {
- if (netif_device_present(dev))
- err = ops->ndo_do_ioctl(dev, ifr, cmd);
- else
- err = -ENODEV;
- }
- } else
- err = -EINVAL;
+ case SIOCGHWTSTAMP:
+ return dev_get_hwtstamp(dev, ifr);
+
+ case SIOCGMIIPHY:
+ case SIOCGMIIREG:
+ case SIOCSMIIREG:
+ return dev_eth_ioctl(dev, ifr, cmd);
+
+ case SIOCBONDENSLAVE:
+ case SIOCBONDRELEASE:
+ case SIOCBONDSETHWADDR:
+ case SIOCBONDSLAVEINFOQUERY:
+ case SIOCBONDINFOQUERY:
+ case SIOCBONDCHANGEACTIVE:
+ return dev_siocbond(dev, ifr, cmd);
+ /* Unknown ioctl */
+ default:
+ err = -EINVAL;
}
return err;
}
@@ -366,7 +712,9 @@ EXPORT_SYMBOL(dev_load);
* dev_ioctl - network device ioctl
* @net: the applicable net namespace
* @cmd: command to issue
- * @arg: pointer to a struct ifreq in user space
+ * @ifr: pointer to a struct ifreq in user space
+ * @data: data exchanged with userspace
+ * @need_copyout: whether or not copy_to_user() should be called
*
* Issue ioctl functions to devices. This is normally called by the
* user space syscall interfaces but can sometimes be useful for
@@ -374,7 +722,8 @@ EXPORT_SYMBOL(dev_load);
* positive or a negative errno code on error.
*/
-int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout)
+int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
+ void __user *data, bool *need_copyout)
{
int ret;
char *colon;
@@ -395,6 +744,13 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
*/
switch (cmd) {
+ case SIOCGIFHWADDR:
+ dev_load(net, ifr->ifr_name);
+ ret = netif_get_mac_address(&ifr->ifr_hwaddr, net,
+ ifr->ifr_name);
+ if (colon)
+ *colon = ':';
+ return ret;
/*
* These ioctl calls:
* - can be done by all.
@@ -404,7 +760,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
case SIOCGIFFLAGS:
case SIOCGIFMETRIC:
case SIOCGIFMTU:
- case SIOCGIFHWADDR:
case SIOCGIFSLAVE:
case SIOCGIFMAP:
case SIOCGIFINDEX:
@@ -419,9 +774,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
case SIOCETHTOOL:
dev_load(net, ifr->ifr_name);
- rtnl_lock();
- ret = dev_ethtool(net, ifr);
- rtnl_unlock();
+ ret = dev_ethtool(net, ifr, data);
if (colon)
*colon = ':';
return ret;
@@ -438,9 +791,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
dev_load(net, ifr->ifr_name);
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- rtnl_lock();
- ret = dev_ifsioc(net, ifr, cmd);
- rtnl_unlock();
+
+ rtnl_net_lock(net);
+ ret = dev_ifsioc(net, ifr, data, cmd);
+ rtnl_net_unlock(net);
+
if (colon)
*colon = ':';
return ret;
@@ -455,7 +810,7 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
case SIOCSIFTXQLEN:
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- /* fall through */
+ fallthrough;
/*
* These ioctl calls:
* - require local superuser power.
@@ -475,18 +830,18 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
case SIOCBONDRELEASE:
case SIOCBONDSETHWADDR:
case SIOCBONDCHANGEACTIVE:
- case SIOCBRADDIF:
- case SIOCBRDELIF:
case SIOCSHWTSTAMP:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- /* fall through */
+ fallthrough;
case SIOCBONDSLAVEINFOQUERY:
case SIOCBONDINFOQUERY:
dev_load(net, ifr->ifr_name);
- rtnl_lock();
- ret = dev_ifsioc(net, ifr, cmd);
- rtnl_unlock();
+
+ rtnl_net_lock(net);
+ ret = dev_ifsioc(net, ifr, data, cmd);
+ rtnl_net_unlock(net);
+
if (need_copyout)
*need_copyout = false;
return ret;
@@ -509,9 +864,10 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_c
(cmd >= SIOCDEVPRIVATE &&
cmd <= SIOCDEVPRIVATE + 15)) {
dev_load(net, ifr->ifr_name);
- rtnl_lock();
- ret = dev_ifsioc(net, ifr, cmd);
- rtnl_unlock();
+
+ rtnl_net_lock(net);
+ ret = dev_ifsioc(net, ifr, data, cmd);
+ rtnl_net_unlock(net);
return ret;
}
return -ENOTTY;
diff --git a/net/core/devlink.c b/net/core/devlink.c
deleted file mode 100644
index 6bc42933be4a..000000000000
--- a/net/core/devlink.c
+++ /dev/null
@@ -1,4805 +0,0 @@
-/*
- * net/core/devlink.c - Network physical/parent device Netlink interface
- *
- * Heavily inspired by net/wireless/
- * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
- * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/gfp.h>
-#include <linux/device.h>
-#include <linux/list.h>
-#include <linux/netdevice.h>
-#include <rdma/ib_verbs.h>
-#include <net/netlink.h>
-#include <net/genetlink.h>
-#include <net/rtnetlink.h>
-#include <net/net_namespace.h>
-#include <net/sock.h>
-#include <net/devlink.h>
-#define CREATE_TRACE_POINTS
-#include <trace/events/devlink.h>
-
-static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = {
- {
- .name = "destination mac",
- .id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC,
- .bitwidth = 48,
- },
-};
-
-struct devlink_dpipe_header devlink_dpipe_header_ethernet = {
- .name = "ethernet",
- .id = DEVLINK_DPIPE_HEADER_ETHERNET,
- .fields = devlink_dpipe_fields_ethernet,
- .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ethernet),
- .global = true,
-};
-EXPORT_SYMBOL(devlink_dpipe_header_ethernet);
-
-static struct devlink_dpipe_field devlink_dpipe_fields_ipv4[] = {
- {
- .name = "destination ip",
- .id = DEVLINK_DPIPE_FIELD_IPV4_DST_IP,
- .bitwidth = 32,
- },
-};
-
-struct devlink_dpipe_header devlink_dpipe_header_ipv4 = {
- .name = "ipv4",
- .id = DEVLINK_DPIPE_HEADER_IPV4,
- .fields = devlink_dpipe_fields_ipv4,
- .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv4),
- .global = true,
-};
-EXPORT_SYMBOL(devlink_dpipe_header_ipv4);
-
-static struct devlink_dpipe_field devlink_dpipe_fields_ipv6[] = {
- {
- .name = "destination ip",
- .id = DEVLINK_DPIPE_FIELD_IPV6_DST_IP,
- .bitwidth = 128,
- },
-};
-
-struct devlink_dpipe_header devlink_dpipe_header_ipv6 = {
- .name = "ipv6",
- .id = DEVLINK_DPIPE_HEADER_IPV6,
- .fields = devlink_dpipe_fields_ipv6,
- .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv6),
- .global = true,
-};
-EXPORT_SYMBOL(devlink_dpipe_header_ipv6);
-
-EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg);
-
-static LIST_HEAD(devlink_list);
-
-/* devlink_mutex
- *
- * An overall lock guarding every operation coming from userspace.
- * It also guards devlink devices list and it is taken when
- * driver registers/unregisters it.
- */
-static DEFINE_MUTEX(devlink_mutex);
-
-static struct net *devlink_net(const struct devlink *devlink)
-{
- return read_pnet(&devlink->_net);
-}
-
-static void devlink_net_set(struct devlink *devlink, struct net *net)
-{
- write_pnet(&devlink->_net, net);
-}
-
-static struct devlink *devlink_get_from_attrs(struct net *net,
- struct nlattr **attrs)
-{
- struct devlink *devlink;
- char *busname;
- char *devname;
-
- if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME])
- return ERR_PTR(-EINVAL);
-
- busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]);
- devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]);
-
- list_for_each_entry(devlink, &devlink_list, list) {
- if (strcmp(devlink->dev->bus->name, busname) == 0 &&
- strcmp(dev_name(devlink->dev), devname) == 0 &&
- net_eq(devlink_net(devlink), net))
- return devlink;
- }
-
- return ERR_PTR(-ENODEV);
-}
-
-static struct devlink *devlink_get_from_info(struct genl_info *info)
-{
- return devlink_get_from_attrs(genl_info_net(info), info->attrs);
-}
-
-static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
- int port_index)
-{
- struct devlink_port *devlink_port;
-
- list_for_each_entry(devlink_port, &devlink->port_list, list) {
- if (devlink_port->index == port_index)
- return devlink_port;
- }
- return NULL;
-}
-
-static bool devlink_port_index_exists(struct devlink *devlink, int port_index)
-{
- return devlink_port_get_by_index(devlink, port_index);
-}
-
-static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink,
- struct nlattr **attrs)
-{
- if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
- u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
- struct devlink_port *devlink_port;
-
- devlink_port = devlink_port_get_by_index(devlink, port_index);
- if (!devlink_port)
- return ERR_PTR(-ENODEV);
- return devlink_port;
- }
- return ERR_PTR(-EINVAL);
-}
-
-static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
- struct genl_info *info)
-{
- return devlink_port_get_from_attrs(devlink, info->attrs);
-}
-
-struct devlink_sb {
- struct list_head list;
- unsigned int index;
- u32 size;
- u16 ingress_pools_count;
- u16 egress_pools_count;
- u16 ingress_tc_count;
- u16 egress_tc_count;
-};
-
-static u16 devlink_sb_pool_count(struct devlink_sb *devlink_sb)
-{
- return devlink_sb->ingress_pools_count + devlink_sb->egress_pools_count;
-}
-
-static struct devlink_sb *devlink_sb_get_by_index(struct devlink *devlink,
- unsigned int sb_index)
-{
- struct devlink_sb *devlink_sb;
-
- list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
- if (devlink_sb->index == sb_index)
- return devlink_sb;
- }
- return NULL;
-}
-
-static bool devlink_sb_index_exists(struct devlink *devlink,
- unsigned int sb_index)
-{
- return devlink_sb_get_by_index(devlink, sb_index);
-}
-
-static struct devlink_sb *devlink_sb_get_from_attrs(struct devlink *devlink,
- struct nlattr **attrs)
-{
- if (attrs[DEVLINK_ATTR_SB_INDEX]) {
- u32 sb_index = nla_get_u32(attrs[DEVLINK_ATTR_SB_INDEX]);
- struct devlink_sb *devlink_sb;
-
- devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
- if (!devlink_sb)
- return ERR_PTR(-ENODEV);
- return devlink_sb;
- }
- return ERR_PTR(-EINVAL);
-}
-
-static struct devlink_sb *devlink_sb_get_from_info(struct devlink *devlink,
- struct genl_info *info)
-{
- return devlink_sb_get_from_attrs(devlink, info->attrs);
-}
-
-static int devlink_sb_pool_index_get_from_attrs(struct devlink_sb *devlink_sb,
- struct nlattr **attrs,
- u16 *p_pool_index)
-{
- u16 val;
-
- if (!attrs[DEVLINK_ATTR_SB_POOL_INDEX])
- return -EINVAL;
-
- val = nla_get_u16(attrs[DEVLINK_ATTR_SB_POOL_INDEX]);
- if (val >= devlink_sb_pool_count(devlink_sb))
- return -EINVAL;
- *p_pool_index = val;
- return 0;
-}
-
-static int devlink_sb_pool_index_get_from_info(struct devlink_sb *devlink_sb,
- struct genl_info *info,
- u16 *p_pool_index)
-{
- return devlink_sb_pool_index_get_from_attrs(devlink_sb, info->attrs,
- p_pool_index);
-}
-
-static int
-devlink_sb_pool_type_get_from_attrs(struct nlattr **attrs,
- enum devlink_sb_pool_type *p_pool_type)
-{
- u8 val;
-
- if (!attrs[DEVLINK_ATTR_SB_POOL_TYPE])
- return -EINVAL;
-
- val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_TYPE]);
- if (val != DEVLINK_SB_POOL_TYPE_INGRESS &&
- val != DEVLINK_SB_POOL_TYPE_EGRESS)
- return -EINVAL;
- *p_pool_type = val;
- return 0;
-}
-
-static int
-devlink_sb_pool_type_get_from_info(struct genl_info *info,
- enum devlink_sb_pool_type *p_pool_type)
-{
- return devlink_sb_pool_type_get_from_attrs(info->attrs, p_pool_type);
-}
-
-static int
-devlink_sb_th_type_get_from_attrs(struct nlattr **attrs,
- enum devlink_sb_threshold_type *p_th_type)
-{
- u8 val;
-
- if (!attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE])
- return -EINVAL;
-
- val = nla_get_u8(attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE]);
- if (val != DEVLINK_SB_THRESHOLD_TYPE_STATIC &&
- val != DEVLINK_SB_THRESHOLD_TYPE_DYNAMIC)
- return -EINVAL;
- *p_th_type = val;
- return 0;
-}
-
-static int
-devlink_sb_th_type_get_from_info(struct genl_info *info,
- enum devlink_sb_threshold_type *p_th_type)
-{
- return devlink_sb_th_type_get_from_attrs(info->attrs, p_th_type);
-}
-
-static int
-devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb,
- struct nlattr **attrs,
- enum devlink_sb_pool_type pool_type,
- u16 *p_tc_index)
-{
- u16 val;
-
- if (!attrs[DEVLINK_ATTR_SB_TC_INDEX])
- return -EINVAL;
-
- val = nla_get_u16(attrs[DEVLINK_ATTR_SB_TC_INDEX]);
- if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS &&
- val >= devlink_sb->ingress_tc_count)
- return -EINVAL;
- if (pool_type == DEVLINK_SB_POOL_TYPE_EGRESS &&
- val >= devlink_sb->egress_tc_count)
- return -EINVAL;
- *p_tc_index = val;
- return 0;
-}
-
-static int
-devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
- struct genl_info *info,
- enum devlink_sb_pool_type pool_type,
- u16 *p_tc_index)
-{
- return devlink_sb_tc_index_get_from_attrs(devlink_sb, info->attrs,
- pool_type, p_tc_index);
-}
-
-struct devlink_region {
- struct devlink *devlink;
- struct list_head list;
- const char *name;
- struct list_head snapshot_list;
- u32 max_snapshots;
- u32 cur_snapshots;
- u64 size;
-};
-
-struct devlink_snapshot {
- struct list_head list;
- struct devlink_region *region;
- devlink_snapshot_data_dest_t *data_destructor;
- u64 data_len;
- u8 *data;
- u32 id;
-};
-
-static struct devlink_region *
-devlink_region_get_by_name(struct devlink *devlink, const char *region_name)
-{
- struct devlink_region *region;
-
- list_for_each_entry(region, &devlink->region_list, list)
- if (!strcmp(region->name, region_name))
- return region;
-
- return NULL;
-}
-
-static struct devlink_snapshot *
-devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id)
-{
- struct devlink_snapshot *snapshot;
-
- list_for_each_entry(snapshot, &region->snapshot_list, list)
- if (snapshot->id == id)
- return snapshot;
-
- return NULL;
-}
-
-static void devlink_region_snapshot_del(struct devlink_snapshot *snapshot)
-{
- snapshot->region->cur_snapshots--;
- list_del(&snapshot->list);
- (*snapshot->data_destructor)(snapshot->data);
- kfree(snapshot);
-}
-
-#define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0)
-#define DEVLINK_NL_FLAG_NEED_PORT BIT(1)
-#define DEVLINK_NL_FLAG_NEED_SB BIT(2)
-
-/* The per devlink instance lock is taken by default in the pre-doit
- * operation, yet several commands do not require this. The global
- * devlink lock is taken and protects from disruption by user-calls.
- */
-#define DEVLINK_NL_FLAG_NO_LOCK BIT(3)
-
-static int devlink_nl_pre_doit(const struct genl_ops *ops,
- struct sk_buff *skb, struct genl_info *info)
-{
- struct devlink *devlink;
- int err;
-
- mutex_lock(&devlink_mutex);
- devlink = devlink_get_from_info(info);
- if (IS_ERR(devlink)) {
- mutex_unlock(&devlink_mutex);
- return PTR_ERR(devlink);
- }
- if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
- mutex_lock(&devlink->lock);
- if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) {
- info->user_ptr[0] = devlink;
- } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
- struct devlink_port *devlink_port;
-
- devlink_port = devlink_port_get_from_info(devlink, info);
- if (IS_ERR(devlink_port)) {
- err = PTR_ERR(devlink_port);
- goto unlock;
- }
- info->user_ptr[0] = devlink_port;
- }
- if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) {
- struct devlink_sb *devlink_sb;
-
- devlink_sb = devlink_sb_get_from_info(devlink, info);
- if (IS_ERR(devlink_sb)) {
- err = PTR_ERR(devlink_sb);
- goto unlock;
- }
- info->user_ptr[1] = devlink_sb;
- }
- return 0;
-
-unlock:
- if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
- mutex_unlock(&devlink->lock);
- mutex_unlock(&devlink_mutex);
- return err;
-}
-
-static void devlink_nl_post_doit(const struct genl_ops *ops,
- struct sk_buff *skb, struct genl_info *info)
-{
- struct devlink *devlink;
-
- devlink = devlink_get_from_info(info);
- if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK)
- mutex_unlock(&devlink->lock);
- mutex_unlock(&devlink_mutex);
-}
-
-static struct genl_family devlink_nl_family;
-
-enum devlink_multicast_groups {
- DEVLINK_MCGRP_CONFIG,
-};
-
-static const struct genl_multicast_group devlink_nl_mcgrps[] = {
- [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME },
-};
-
-static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
-{
- if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name))
- return -EMSGSIZE;
- if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev)))
- return -EMSGSIZE;
- return 0;
-}
-
-static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
- enum devlink_command cmd, u32 portid,
- u32 seq, int flags)
-{
- void *hdr;
-
- hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
- if (!hdr)
- return -EMSGSIZE;
-
- if (devlink_nl_put_handle(msg, devlink))
- goto nla_put_failure;
-
- genlmsg_end(msg, hdr);
- return 0;
-
-nla_put_failure:
- genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
-}
-
-static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
-{
- struct sk_buff *msg;
- int err;
-
- WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL);
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return;
-
- err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0);
- if (err) {
- nlmsg_free(msg);
- return;
- }
-
- genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
- msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
-}
-
-static int devlink_nl_port_attrs_put(struct sk_buff *msg,
- struct devlink_port *devlink_port)
-{
- struct devlink_port_attrs *attrs = &devlink_port->attrs;
-
- if (!attrs->set)
- return 0;
- if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
- return -EMSGSIZE;
- if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
- return -EMSGSIZE;
- if (!attrs->split)
- return 0;
- if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number))
- return -EMSGSIZE;
- if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
- attrs->split_subport_number))
- return -EMSGSIZE;
- return 0;
-}
-
-static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
- struct devlink_port *devlink_port,
- enum devlink_command cmd, u32 portid,
- u32 seq, int flags)
-{
- void *hdr;
-
- hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
- if (!hdr)
- return -EMSGSIZE;
-
- if (devlink_nl_put_handle(msg, devlink))
- goto nla_put_failure;
- if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
- goto nla_put_failure;
- if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
- goto nla_put_failure;
- if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET &&
- nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE,
- devlink_port->desired_type))
- goto nla_put_failure;
- if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
- struct net_device *netdev = devlink_port->type_dev;
-
- if (netdev &&
- (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
- netdev->ifindex) ||
- nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
- netdev->name)))
- goto nla_put_failure;
- }
- if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {
- struct ib_device *ibdev = devlink_port->type_dev;
-
- if (ibdev &&
- nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME,
- ibdev->name))
- goto nla_put_failure;
- }
- if (devlink_nl_port_attrs_put(msg, devlink_port))
- goto nla_put_failure;
-
- genlmsg_end(msg, hdr);
- return 0;
-
-nla_put_failure:
- genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
-}
-
-static void devlink_port_notify(struct devlink_port *devlink_port,
- enum devlink_command cmd)
-{
- struct devlink *devlink = devlink_port->devlink;
- struct sk_buff *msg;
- int err;
-
- if (!devlink_port->registered)
- return;
-
- WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL);
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return;
-
- err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0);
- if (err) {
- nlmsg_free(msg);
- return;
- }
-
- genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
- msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
-}
-
-static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- struct sk_buff *msg;
- int err;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
- err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
- info->snd_portid, info->snd_seq, 0);
- if (err) {
- nlmsg_free(msg);
- return err;
- }
-
- return genlmsg_reply(msg, info);
-}
-
-static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg,
- struct netlink_callback *cb)
-{
- struct devlink *devlink;
- int start = cb->args[0];
- int idx = 0;
- int err;
-
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
- continue;
- if (idx < start) {
- idx++;
- continue;
- }
- err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI);
- if (err)
- goto out;
- idx++;
- }
-out:
- mutex_unlock(&devlink_mutex);
-
- cb->args[0] = idx;
- return msg->len;
-}
-
-static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink_port *devlink_port = info->user_ptr[0];
- struct devlink *devlink = devlink_port->devlink;
- struct sk_buff *msg;
- int err;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
- err = devlink_nl_port_fill(msg, devlink, devlink_port,
- DEVLINK_CMD_PORT_NEW,
- info->snd_portid, info->snd_seq, 0);
- if (err) {
- nlmsg_free(msg);
- return err;
- }
-
- return genlmsg_reply(msg, info);
-}
-
-static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
- struct netlink_callback *cb)
-{
- struct devlink *devlink;
- struct devlink_port *devlink_port;
- int start = cb->args[0];
- int idx = 0;
- int err;
-
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
- continue;
- mutex_lock(&devlink->lock);
- list_for_each_entry(devlink_port, &devlink->port_list, list) {
- if (idx < start) {
- idx++;
- continue;
- }
- err = devlink_nl_port_fill(msg, devlink, devlink_port,
- DEVLINK_CMD_NEW,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NLM_F_MULTI);
- if (err) {
- mutex_unlock(&devlink->lock);
- goto out;
- }
- idx++;
- }
- mutex_unlock(&devlink->lock);
- }
-out:
- mutex_unlock(&devlink_mutex);
-
- cb->args[0] = idx;
- return msg->len;
-}
-
-static int devlink_port_type_set(struct devlink *devlink,
- struct devlink_port *devlink_port,
- enum devlink_port_type port_type)
-
-{
- int err;
-
- if (devlink->ops && devlink->ops->port_type_set) {
- if (port_type == DEVLINK_PORT_TYPE_NOTSET)
- return -EINVAL;
- if (port_type == devlink_port->type)
- return 0;
- err = devlink->ops->port_type_set(devlink_port, port_type);
- if (err)
- return err;
- devlink_port->desired_type = port_type;
- devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
- return 0;
- }
- return -EOPNOTSUPP;
-}
-
-static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink_port *devlink_port = info->user_ptr[0];
- struct devlink *devlink = devlink_port->devlink;
- int err;
-
- if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) {
- enum devlink_port_type port_type;
-
- port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]);
- err = devlink_port_type_set(devlink, devlink_port, port_type);
- if (err)
- return err;
- }
- return 0;
-}
-
-static int devlink_port_split(struct devlink *devlink, u32 port_index,
- u32 count, struct netlink_ext_ack *extack)
-
-{
- if (devlink->ops && devlink->ops->port_split)
- return devlink->ops->port_split(devlink, port_index, count,
- extack);
- return -EOPNOTSUPP;
-}
-
-static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- u32 port_index;
- u32 count;
-
- if (!info->attrs[DEVLINK_ATTR_PORT_INDEX] ||
- !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT])
- return -EINVAL;
-
- port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
- count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
- return devlink_port_split(devlink, port_index, count, info->extack);
-}
-
-static int devlink_port_unsplit(struct devlink *devlink, u32 port_index,
- struct netlink_ext_ack *extack)
-
-{
- if (devlink->ops && devlink->ops->port_unsplit)
- return devlink->ops->port_unsplit(devlink, port_index, extack);
- return -EOPNOTSUPP;
-}
-
-static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- u32 port_index;
-
- if (!info->attrs[DEVLINK_ATTR_PORT_INDEX])
- return -EINVAL;
-
- port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
- return devlink_port_unsplit(devlink, port_index, info->extack);
-}
-
-static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink,
- struct devlink_sb *devlink_sb,
- enum devlink_command cmd, u32 portid,
- u32 seq, int flags)
-{
- void *hdr;
-
- hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
- if (!hdr)
- return -EMSGSIZE;
-
- if (devlink_nl_put_handle(msg, devlink))
- goto nla_put_failure;
- if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
- goto nla_put_failure;
- if (nla_put_u32(msg, DEVLINK_ATTR_SB_SIZE, devlink_sb->size))
- goto nla_put_failure;
- if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_POOL_COUNT,
- devlink_sb->ingress_pools_count))
- goto nla_put_failure;
- if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_POOL_COUNT,
- devlink_sb->egress_pools_count))
- goto nla_put_failure;
- if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_TC_COUNT,
- devlink_sb->ingress_tc_count))
- goto nla_put_failure;
- if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_TC_COUNT,
- devlink_sb->egress_tc_count))
- goto nla_put_failure;
-
- genlmsg_end(msg, hdr);
- return 0;
-
-nla_put_failure:
- genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
-}
-
-static int devlink_nl_cmd_sb_get_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- struct devlink_sb *devlink_sb = info->user_ptr[1];
- struct sk_buff *msg;
- int err;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
- err = devlink_nl_sb_fill(msg, devlink, devlink_sb,
- DEVLINK_CMD_SB_NEW,
- info->snd_portid, info->snd_seq, 0);
- if (err) {
- nlmsg_free(msg);
- return err;
- }
-
- return genlmsg_reply(msg, info);
-}
-
-static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg,
- struct netlink_callback *cb)
-{
- struct devlink *devlink;
- struct devlink_sb *devlink_sb;
- int start = cb->args[0];
- int idx = 0;
- int err;
-
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
- continue;
- mutex_lock(&devlink->lock);
- list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
- if (idx < start) {
- idx++;
- continue;
- }
- err = devlink_nl_sb_fill(msg, devlink, devlink_sb,
- DEVLINK_CMD_SB_NEW,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NLM_F_MULTI);
- if (err) {
- mutex_unlock(&devlink->lock);
- goto out;
- }
- idx++;
- }
- mutex_unlock(&devlink->lock);
- }
-out:
- mutex_unlock(&devlink_mutex);
-
- cb->args[0] = idx;
- return msg->len;
-}
-
-static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink,
- struct devlink_sb *devlink_sb,
- u16 pool_index, enum devlink_command cmd,
- u32 portid, u32 seq, int flags)
-{
- struct devlink_sb_pool_info pool_info;
- void *hdr;
- int err;
-
- err = devlink->ops->sb_pool_get(devlink, devlink_sb->index,
- pool_index, &pool_info);
- if (err)
- return err;
-
- hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
- if (!hdr)
- return -EMSGSIZE;
-
- if (devlink_nl_put_handle(msg, devlink))
- goto nla_put_failure;
- if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
- goto nla_put_failure;
- if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
- goto nla_put_failure;
- if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_info.pool_type))
- goto nla_put_failure;
- if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_SIZE, pool_info.size))
- goto nla_put_failure;
- if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE,
- pool_info.threshold_type))
- goto nla_put_failure;
-
- genlmsg_end(msg, hdr);
- return 0;
-
-nla_put_failure:
- genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
-}
-
-static int devlink_nl_cmd_sb_pool_get_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- struct devlink_sb *devlink_sb = info->user_ptr[1];
- struct sk_buff *msg;
- u16 pool_index;
- int err;
-
- err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
- &pool_index);
- if (err)
- return err;
-
- if (!devlink->ops || !devlink->ops->sb_pool_get)
- return -EOPNOTSUPP;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
- err = devlink_nl_sb_pool_fill(msg, devlink, devlink_sb, pool_index,
- DEVLINK_CMD_SB_POOL_NEW,
- info->snd_portid, info->snd_seq, 0);
- if (err) {
- nlmsg_free(msg);
- return err;
- }
-
- return genlmsg_reply(msg, info);
-}
-
-static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx,
- struct devlink *devlink,
- struct devlink_sb *devlink_sb,
- u32 portid, u32 seq)
-{
- u16 pool_count = devlink_sb_pool_count(devlink_sb);
- u16 pool_index;
- int err;
-
- for (pool_index = 0; pool_index < pool_count; pool_index++) {
- if (*p_idx < start) {
- (*p_idx)++;
- continue;
- }
- err = devlink_nl_sb_pool_fill(msg, devlink,
- devlink_sb,
- pool_index,
- DEVLINK_CMD_SB_POOL_NEW,
- portid, seq, NLM_F_MULTI);
- if (err)
- return err;
- (*p_idx)++;
- }
- return 0;
-}
-
-static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg,
- struct netlink_callback *cb)
-{
- struct devlink *devlink;
- struct devlink_sb *devlink_sb;
- int start = cb->args[0];
- int idx = 0;
- int err;
-
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
- !devlink->ops || !devlink->ops->sb_pool_get)
- continue;
- mutex_lock(&devlink->lock);
- list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
- err = __sb_pool_get_dumpit(msg, start, &idx, devlink,
- devlink_sb,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq);
- if (err && err != -EOPNOTSUPP) {
- mutex_unlock(&devlink->lock);
- goto out;
- }
- }
- mutex_unlock(&devlink->lock);
- }
-out:
- mutex_unlock(&devlink_mutex);
-
- cb->args[0] = idx;
- return msg->len;
-}
-
-static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index,
- u16 pool_index, u32 size,
- enum devlink_sb_threshold_type threshold_type)
-
-{
- const struct devlink_ops *ops = devlink->ops;
-
- if (ops && ops->sb_pool_set)
- return ops->sb_pool_set(devlink, sb_index, pool_index,
- size, threshold_type);
- return -EOPNOTSUPP;
-}
-
-static int devlink_nl_cmd_sb_pool_set_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- struct devlink_sb *devlink_sb = info->user_ptr[1];
- enum devlink_sb_threshold_type threshold_type;
- u16 pool_index;
- u32 size;
- int err;
-
- err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
- &pool_index);
- if (err)
- return err;
-
- err = devlink_sb_th_type_get_from_info(info, &threshold_type);
- if (err)
- return err;
-
- if (!info->attrs[DEVLINK_ATTR_SB_POOL_SIZE])
- return -EINVAL;
-
- size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]);
- return devlink_sb_pool_set(devlink, devlink_sb->index,
- pool_index, size, threshold_type);
-}
-
-static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg,
- struct devlink *devlink,
- struct devlink_port *devlink_port,
- struct devlink_sb *devlink_sb,
- u16 pool_index,
- enum devlink_command cmd,
- u32 portid, u32 seq, int flags)
-{
- const struct devlink_ops *ops = devlink->ops;
- u32 threshold;
- void *hdr;
- int err;
-
- err = ops->sb_port_pool_get(devlink_port, devlink_sb->index,
- pool_index, &threshold);
- if (err)
- return err;
-
- hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
- if (!hdr)
- return -EMSGSIZE;
-
- if (devlink_nl_put_handle(msg, devlink))
- goto nla_put_failure;
- if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
- goto nla_put_failure;
- if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
- goto nla_put_failure;
- if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
- goto nla_put_failure;
- if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold))
- goto nla_put_failure;
-
- if (ops->sb_occ_port_pool_get) {
- u32 cur;
- u32 max;
-
- err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index,
- pool_index, &cur, &max);
- if (err && err != -EOPNOTSUPP)
- return err;
- if (!err) {
- if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
- goto nla_put_failure;
- if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max))
- goto nla_put_failure;
- }
- }
-
- genlmsg_end(msg, hdr);
- return 0;
-
-nla_put_failure:
- genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
-}
-
-static int devlink_nl_cmd_sb_port_pool_get_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink_port *devlink_port = info->user_ptr[0];
- struct devlink *devlink = devlink_port->devlink;
- struct devlink_sb *devlink_sb = info->user_ptr[1];
- struct sk_buff *msg;
- u16 pool_index;
- int err;
-
- err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
- &pool_index);
- if (err)
- return err;
-
- if (!devlink->ops || !devlink->ops->sb_port_pool_get)
- return -EOPNOTSUPP;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
- err = devlink_nl_sb_port_pool_fill(msg, devlink, devlink_port,
- devlink_sb, pool_index,
- DEVLINK_CMD_SB_PORT_POOL_NEW,
- info->snd_portid, info->snd_seq, 0);
- if (err) {
- nlmsg_free(msg);
- return err;
- }
-
- return genlmsg_reply(msg, info);
-}
-
-static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx,
- struct devlink *devlink,
- struct devlink_sb *devlink_sb,
- u32 portid, u32 seq)
-{
- struct devlink_port *devlink_port;
- u16 pool_count = devlink_sb_pool_count(devlink_sb);
- u16 pool_index;
- int err;
-
- list_for_each_entry(devlink_port, &devlink->port_list, list) {
- for (pool_index = 0; pool_index < pool_count; pool_index++) {
- if (*p_idx < start) {
- (*p_idx)++;
- continue;
- }
- err = devlink_nl_sb_port_pool_fill(msg, devlink,
- devlink_port,
- devlink_sb,
- pool_index,
- DEVLINK_CMD_SB_PORT_POOL_NEW,
- portid, seq,
- NLM_F_MULTI);
- if (err)
- return err;
- (*p_idx)++;
- }
- }
- return 0;
-}
-
-static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg,
- struct netlink_callback *cb)
-{
- struct devlink *devlink;
- struct devlink_sb *devlink_sb;
- int start = cb->args[0];
- int idx = 0;
- int err;
-
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
- !devlink->ops || !devlink->ops->sb_port_pool_get)
- continue;
- mutex_lock(&devlink->lock);
- list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
- err = __sb_port_pool_get_dumpit(msg, start, &idx,
- devlink, devlink_sb,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq);
- if (err && err != -EOPNOTSUPP) {
- mutex_unlock(&devlink->lock);
- goto out;
- }
- }
- mutex_unlock(&devlink->lock);
- }
-out:
- mutex_unlock(&devlink_mutex);
-
- cb->args[0] = idx;
- return msg->len;
-}
-
-static int devlink_sb_port_pool_set(struct devlink_port *devlink_port,
- unsigned int sb_index, u16 pool_index,
- u32 threshold)
-
-{
- const struct devlink_ops *ops = devlink_port->devlink->ops;
-
- if (ops && ops->sb_port_pool_set)
- return ops->sb_port_pool_set(devlink_port, sb_index,
- pool_index, threshold);
- return -EOPNOTSUPP;
-}
-
-static int devlink_nl_cmd_sb_port_pool_set_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink_port *devlink_port = info->user_ptr[0];
- struct devlink_sb *devlink_sb = info->user_ptr[1];
- u16 pool_index;
- u32 threshold;
- int err;
-
- err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
- &pool_index);
- if (err)
- return err;
-
- if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD])
- return -EINVAL;
-
- threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
- return devlink_sb_port_pool_set(devlink_port, devlink_sb->index,
- pool_index, threshold);
-}
-
-static int
-devlink_nl_sb_tc_pool_bind_fill(struct sk_buff *msg, struct devlink *devlink,
- struct devlink_port *devlink_port,
- struct devlink_sb *devlink_sb, u16 tc_index,
- enum devlink_sb_pool_type pool_type,
- enum devlink_command cmd,
- u32 portid, u32 seq, int flags)
-{
- const struct devlink_ops *ops = devlink->ops;
- u16 pool_index;
- u32 threshold;
- void *hdr;
- int err;
-
- err = ops->sb_tc_pool_bind_get(devlink_port, devlink_sb->index,
- tc_index, pool_type,
- &pool_index, &threshold);
- if (err)
- return err;
-
- hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
- if (!hdr)
- return -EMSGSIZE;
-
- if (devlink_nl_put_handle(msg, devlink))
- goto nla_put_failure;
- if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
- goto nla_put_failure;
- if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
- goto nla_put_failure;
- if (nla_put_u16(msg, DEVLINK_ATTR_SB_TC_INDEX, tc_index))
- goto nla_put_failure;
- if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_type))
- goto nla_put_failure;
- if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
- goto nla_put_failure;
- if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold))
- goto nla_put_failure;
-
- if (ops->sb_occ_tc_port_bind_get) {
- u32 cur;
- u32 max;
-
- err = ops->sb_occ_tc_port_bind_get(devlink_port,
- devlink_sb->index,
- tc_index, pool_type,
- &cur, &max);
- if (err && err != -EOPNOTSUPP)
- return err;
- if (!err) {
- if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
- goto nla_put_failure;
- if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max))
- goto nla_put_failure;
- }
- }
-
- genlmsg_end(msg, hdr);
- return 0;
-
-nla_put_failure:
- genlmsg_cancel(msg, hdr);
- return -EMSGSIZE;
-}
-
-static int devlink_nl_cmd_sb_tc_pool_bind_get_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink_port *devlink_port = info->user_ptr[0];
- struct devlink *devlink = devlink_port->devlink;
- struct devlink_sb *devlink_sb = info->user_ptr[1];
- struct sk_buff *msg;
- enum devlink_sb_pool_type pool_type;
- u16 tc_index;
- int err;
-
- err = devlink_sb_pool_type_get_from_info(info, &pool_type);
- if (err)
- return err;
-
- err = devlink_sb_tc_index_get_from_info(devlink_sb, info,
- pool_type, &tc_index);
- if (err)
- return err;
-
- if (!devlink->ops || !devlink->ops->sb_tc_pool_bind_get)
- return -EOPNOTSUPP;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
- err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, devlink_port,
- devlink_sb, tc_index, pool_type,
- DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
- info->snd_portid,
- info->snd_seq, 0);
- if (err) {
- nlmsg_free(msg);
- return err;
- }
-
- return genlmsg_reply(msg, info);
-}
-
-static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
- int start, int *p_idx,
- struct devlink *devlink,
- struct devlink_sb *devlink_sb,
- u32 portid, u32 seq)
-{
- struct devlink_port *devlink_port;
- u16 tc_index;
- int err;
-
- list_for_each_entry(devlink_port, &devlink->port_list, list) {
- for (tc_index = 0;
- tc_index < devlink_sb->ingress_tc_count; tc_index++) {
- if (*p_idx < start) {
- (*p_idx)++;
- continue;
- }
- err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink,
- devlink_port,
- devlink_sb,
- tc_index,
- DEVLINK_SB_POOL_TYPE_INGRESS,
- DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
- portid, seq,
- NLM_F_MULTI);
- if (err)
- return err;
- (*p_idx)++;
- }
- for (tc_index = 0;
- tc_index < devlink_sb->egress_tc_count; tc_index++) {
- if (*p_idx < start) {
- (*p_idx)++;
- continue;
- }
- err = devlink_nl_sb_tc_pool_bind_fill(msg, devlink,
- devlink_port,
- devlink_sb,
- tc_index,
- DEVLINK_SB_POOL_TYPE_EGRESS,
- DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
- portid, seq,
- NLM_F_MULTI);
- if (err)
- return err;
- (*p_idx)++;
- }
- }
- return 0;
-}
-
-static int
-devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
- struct netlink_callback *cb)
-{
- struct devlink *devlink;
- struct devlink_sb *devlink_sb;
- int start = cb->args[0];
- int idx = 0;
- int err;
-
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) ||
- !devlink->ops || !devlink->ops->sb_tc_pool_bind_get)
- continue;
-
- mutex_lock(&devlink->lock);
- list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
- err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx,
- devlink,
- devlink_sb,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq);
- if (err && err != -EOPNOTSUPP) {
- mutex_unlock(&devlink->lock);
- goto out;
- }
- }
- mutex_unlock(&devlink->lock);
- }
-out:
- mutex_unlock(&devlink_mutex);
-
- cb->args[0] = idx;
- return msg->len;
-}
-
-static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port,
- unsigned int sb_index, u16 tc_index,
- enum devlink_sb_pool_type pool_type,
- u16 pool_index, u32 threshold)
-
-{
- const struct devlink_ops *ops = devlink_port->devlink->ops;
-
- if (ops && ops->sb_tc_pool_bind_set)
- return ops->sb_tc_pool_bind_set(devlink_port, sb_index,
- tc_index, pool_type,
- pool_index, threshold);
- return -EOPNOTSUPP;
-}
-
-static int devlink_nl_cmd_sb_tc_pool_bind_set_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink_port *devlink_port = info->user_ptr[0];
- struct devlink_sb *devlink_sb = info->user_ptr[1];
- enum devlink_sb_pool_type pool_type;
- u16 tc_index;
- u16 pool_index;
- u32 threshold;
- int err;
-
- err = devlink_sb_pool_type_get_from_info(info, &pool_type);
- if (err)
- return err;
-
- err = devlink_sb_tc_index_get_from_info(devlink_sb, info,
- pool_type, &tc_index);
- if (err)
- return err;
-
- err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
- &pool_index);
- if (err)
- return err;
-
- if (!info->attrs[DEVLINK_ATTR_SB_THRESHOLD])
- return -EINVAL;
-
- threshold = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
- return devlink_sb_tc_pool_bind_set(devlink_port, devlink_sb->index,
- tc_index, pool_type,
- pool_index, threshold);
-}
-
-static int devlink_nl_cmd_sb_occ_snapshot_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- struct devlink_sb *devlink_sb = info->user_ptr[1];
- const struct devlink_ops *ops = devlink->ops;
-
- if (ops && ops->sb_occ_snapshot)
- return ops->sb_occ_snapshot(devlink, devlink_sb->index);
- return -EOPNOTSUPP;
-}
-
-static int devlink_nl_cmd_sb_occ_max_clear_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- struct devlink_sb *devlink_sb = info->user_ptr[1];
- const struct devlink_ops *ops = devlink->ops;
-
- if (ops && ops->sb_occ_max_clear)
- return ops->sb_occ_max_clear(devlink, devlink_sb->index);
- return -EOPNOTSUPP;
-}
-
-static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
- enum devlink_command cmd, u32 portid,
- u32 seq, int flags)
-{
- const struct devlink_ops *ops = devlink->ops;
- u8 inline_mode, encap_mode;
- void *hdr;
- int err = 0;
- u16 mode;
-
- hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
- if (!hdr)
- return -EMSGSIZE;
-
- err = devlink_nl_put_handle(msg, devlink);
- if (err)
- goto nla_put_failure;
-
- if (ops->eswitch_mode_get) {
- err = ops->eswitch_mode_get(devlink, &mode);
- if (err)
- goto nla_put_failure;
- err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
- if (err)
- goto nla_put_failure;
- }
-
- if (ops->eswitch_inline_mode_get) {
- err = ops->eswitch_inline_mode_get(devlink, &inline_mode);
- if (err)
- goto nla_put_failure;
- err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE,
- inline_mode);
- if (err)
- goto nla_put_failure;
- }
-
- if (ops->eswitch_encap_mode_get) {
- err = ops->eswitch_encap_mode_get(devlink, &encap_mode);
- if (err)
- goto nla_put_failure;
- err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_ENCAP_MODE, encap_mode);
- if (err)
- goto nla_put_failure;
- }
-
- genlmsg_end(msg, hdr);
- return 0;
-
-nla_put_failure:
- genlmsg_cancel(msg, hdr);
- return err;
-}
-
-static int devlink_nl_cmd_eswitch_get_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- const struct devlink_ops *ops = devlink->ops;
- struct sk_buff *msg;
- int err;
-
- if (!ops)
- return -EOPNOTSUPP;
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return -ENOMEM;
-
- err = devlink_nl_eswitch_fill(msg, devlink, DEVLINK_CMD_ESWITCH_GET,
- info->snd_portid, info->snd_seq, 0);
-
- if (err) {
- nlmsg_free(msg);
- return err;
- }
-
- return genlmsg_reply(msg, info);
-}
-
-static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- const struct devlink_ops *ops = devlink->ops;
- u8 inline_mode, encap_mode;
- int err = 0;
- u16 mode;
-
- if (!ops)
- return -EOPNOTSUPP;
-
- if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) {
- if (!ops->eswitch_mode_set)
- return -EOPNOTSUPP;
- mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
- err = ops->eswitch_mode_set(devlink, mode);
- if (err)
- return err;
- }
-
- if (info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]) {
- if (!ops->eswitch_inline_mode_set)
- return -EOPNOTSUPP;
- inline_mode = nla_get_u8(
- info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE]);
- err = ops->eswitch_inline_mode_set(devlink, inline_mode);
- if (err)
- return err;
- }
-
- if (info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]) {
- if (!ops->eswitch_encap_mode_set)
- return -EOPNOTSUPP;
- encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]);
- err = ops->eswitch_encap_mode_set(devlink, encap_mode);
- if (err)
- return err;
- }
-
- return 0;
-}
-
-int devlink_dpipe_match_put(struct sk_buff *skb,
- struct devlink_dpipe_match *match)
-{
- struct devlink_dpipe_header *header = match->header;
- struct devlink_dpipe_field *field = &header->fields[match->field_id];
- struct nlattr *match_attr;
-
- match_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_MATCH);
- if (!match_attr)
- return -EMSGSIZE;
-
- if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_MATCH_TYPE, match->type) ||
- nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, match->header_index) ||
- nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) ||
- nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) ||
- nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global))
- goto nla_put_failure;
-
- nla_nest_end(skb, match_attr);
- return 0;
-
-nla_put_failure:
- nla_nest_cancel(skb, match_attr);
- return -EMSGSIZE;
-}
-EXPORT_SYMBOL_GPL(devlink_dpipe_match_put);
-
-static int devlink_dpipe_matches_put(struct devlink_dpipe_table *table,
- struct sk_buff *skb)
-{
- struct nlattr *matches_attr;
-
- matches_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE_MATCHES);
- if (!matches_attr)
- return -EMSGSIZE;
-
- if (table->table_ops->matches_dump(table->priv, skb))
- goto nla_put_failure;
-
- nla_nest_end(skb, matches_attr);
- return 0;
-
-nla_put_failure:
- nla_nest_cancel(skb, matches_attr);
- return -EMSGSIZE;
-}
-
-int devlink_dpipe_action_put(struct sk_buff *skb,
- struct devlink_dpipe_action *action)
-{
- struct devlink_dpipe_header *header = action->header;
- struct devlink_dpipe_field *field = &header->fields[action->field_id];
- struct nlattr *action_attr;
-
- action_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_ACTION);
- if (!action_attr)
- return -EMSGSIZE;
-
- if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_ACTION_TYPE, action->type) ||
- nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, action->header_index) ||
- nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) ||
- nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) ||
- nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global))
- goto nla_put_failure;
-
- nla_nest_end(skb, action_attr);
- return 0;
-
-nla_put_failure:
- nla_nest_cancel(skb, action_attr);
- return -EMSGSIZE;
-}
-EXPORT_SYMBOL_GPL(devlink_dpipe_action_put);
-
-static int devlink_dpipe_actions_put(struct devlink_dpipe_table *table,
- struct sk_buff *skb)
-{
- struct nlattr *actions_attr;
-
- actions_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE_ACTIONS);
- if (!actions_attr)
- return -EMSGSIZE;
-
- if (table->table_ops->actions_dump(table->priv, skb))
- goto nla_put_failure;
-
- nla_nest_end(skb, actions_attr);
- return 0;
-
-nla_put_failure:
- nla_nest_cancel(skb, actions_attr);
- return -EMSGSIZE;
-}
-
-static int devlink_dpipe_table_put(struct sk_buff *skb,
- struct devlink_dpipe_table *table)
-{
- struct nlattr *table_attr;
- u64 table_size;
-
- table_size = table->table_ops->size_get(table->priv);
- table_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE);
- if (!table_attr)
- return -EMSGSIZE;
-
- if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_TABLE_NAME, table->name) ||
- nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_SIZE, table_size,
- DEVLINK_ATTR_PAD))
- goto nla_put_failure;
- if (nla_put_u8(skb, DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED,
- table->counters_enabled))
- goto nla_put_failure;
-
- if (table->resource_valid) {
- if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,
- table->resource_id, DEVLINK_ATTR_PAD) ||
- nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,
- table->resource_units, DEVLINK_ATTR_PAD))
- goto nla_put_failure;
- }
- if (devlink_dpipe_matches_put(table, skb))
- goto nla_put_failure;
-
- if (devlink_dpipe_actions_put(table, skb))
- goto nla_put_failure;
-
- nla_nest_end(skb, table_attr);
- return 0;
-
-nla_put_failure:
- nla_nest_cancel(skb, table_attr);
- return -EMSGSIZE;
-}
-
-static int devlink_dpipe_send_and_alloc_skb(struct sk_buff **pskb,
- struct genl_info *info)
-{
- int err;
-
- if (*pskb) {
- err = genlmsg_reply(*pskb, info);
- if (err)
- return err;
- }
- *pskb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!*pskb)
- return -ENOMEM;
- return 0;
-}
-
-static int devlink_dpipe_tables_fill(struct genl_info *info,
- enum devlink_command cmd, int flags,
- struct list_head *dpipe_tables,
- const char *table_name)
-{
- struct devlink *devlink = info->user_ptr[0];
- struct devlink_dpipe_table *table;
- struct nlattr *tables_attr;
- struct sk_buff *skb = NULL;
- struct nlmsghdr *nlh;
- bool incomplete;
- void *hdr;
- int i;
- int err;
-
- table = list_first_entry(dpipe_tables,
- struct devlink_dpipe_table, list);
-start_again:
- err = devlink_dpipe_send_and_alloc_skb(&skb, info);
- if (err)
- return err;
-
- hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
- &devlink_nl_family, NLM_F_MULTI, cmd);
- if (!hdr) {
- nlmsg_free(skb);
- return -EMSGSIZE;
- }
-
- if (devlink_nl_put_handle(skb, devlink))
- goto nla_put_failure;
- tables_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLES);
- if (!tables_attr)
- goto nla_put_failure;
-
- i = 0;
- incomplete = false;
- list_for_each_entry_from(table, dpipe_tables, list) {
- if (!table_name) {
- err = devlink_dpipe_table_put(skb, table);
- if (err) {
- if (!i)
- goto err_table_put;
- incomplete = true;
- break;
- }
- } else {
- if (!strcmp(table->name, table_name)) {
- err = devlink_dpipe_table_put(skb, table);
- if (err)
- break;
- }
- }
- i++;
- }
-
- nla_nest_end(skb, tables_attr);
- genlmsg_end(skb, hdr);
- if (incomplete)
- goto start_again;
-
-send_done:
- nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
- NLMSG_DONE, 0, flags | NLM_F_MULTI);
- if (!nlh) {
- err = devlink_dpipe_send_and_alloc_skb(&skb, info);
- if (err)
- return err;
- goto send_done;
- }
-
- return genlmsg_reply(skb, info);
-
-nla_put_failure:
- err = -EMSGSIZE;
-err_table_put:
- nlmsg_free(skb);
- return err;
-}
-
-static int devlink_nl_cmd_dpipe_table_get(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- const char *table_name = NULL;
-
- if (info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME])
- table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
-
- return devlink_dpipe_tables_fill(info, DEVLINK_CMD_DPIPE_TABLE_GET, 0,
- &devlink->dpipe_table_list,
- table_name);
-}
-
-static int devlink_dpipe_value_put(struct sk_buff *skb,
- struct devlink_dpipe_value *value)
-{
- if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE,
- value->value_size, value->value))
- return -EMSGSIZE;
- if (value->mask)
- if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE_MASK,
- value->value_size, value->mask))
- return -EMSGSIZE;
- if (value->mapping_valid)
- if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_VALUE_MAPPING,
- value->mapping_value))
- return -EMSGSIZE;
- return 0;
-}
-
-static int devlink_dpipe_action_value_put(struct sk_buff *skb,
- struct devlink_dpipe_value *value)
-{
- if (!value->action)
- return -EINVAL;
- if (devlink_dpipe_action_put(skb, value->action))
- return -EMSGSIZE;
- if (devlink_dpipe_value_put(skb, value))
- return -EMSGSIZE;
- return 0;
-}
-
-static int devlink_dpipe_action_values_put(struct sk_buff *skb,
- struct devlink_dpipe_value *values,
- unsigned int values_count)
-{
- struct nlattr *action_attr;
- int i;
- int err;
-
- for (i = 0; i < values_count; i++) {
- action_attr = nla_nest_start(skb,
- DEVLINK_ATTR_DPIPE_ACTION_VALUE);
- if (!action_attr)
- return -EMSGSIZE;
- err = devlink_dpipe_action_value_put(skb, &values[i]);
- if (err)
- goto err_action_value_put;
- nla_nest_end(skb, action_attr);
- }
- return 0;
-
-err_action_value_put:
- nla_nest_cancel(skb, action_attr);
- return err;
-}
-
-static int devlink_dpipe_match_value_put(struct sk_buff *skb,
- struct devlink_dpipe_value *value)
-{
- if (!value->match)
- return -EINVAL;
- if (devlink_dpipe_match_put(skb, value->match))
- return -EMSGSIZE;
- if (devlink_dpipe_value_put(skb, value))
- return -EMSGSIZE;
- return 0;
-}
-
-static int devlink_dpipe_match_values_put(struct sk_buff *skb,
- struct devlink_dpipe_value *values,
- unsigned int values_count)
-{
- struct nlattr *match_attr;
- int i;
- int err;
-
- for (i = 0; i < values_count; i++) {
- match_attr = nla_nest_start(skb,
- DEVLINK_ATTR_DPIPE_MATCH_VALUE);
- if (!match_attr)
- return -EMSGSIZE;
- err = devlink_dpipe_match_value_put(skb, &values[i]);
- if (err)
- goto err_match_value_put;
- nla_nest_end(skb, match_attr);
- }
- return 0;
-
-err_match_value_put:
- nla_nest_cancel(skb, match_attr);
- return err;
-}
-
-static int devlink_dpipe_entry_put(struct sk_buff *skb,
- struct devlink_dpipe_entry *entry)
-{
- struct nlattr *entry_attr, *matches_attr, *actions_attr;
- int err;
-
- entry_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_ENTRY);
- if (!entry_attr)
- return -EMSGSIZE;
-
- if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_INDEX, entry->index,
- DEVLINK_ATTR_PAD))
- goto nla_put_failure;
- if (entry->counter_valid)
- if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_COUNTER,
- entry->counter, DEVLINK_ATTR_PAD))
- goto nla_put_failure;
-
- matches_attr = nla_nest_start(skb,
- DEVLINK_ATTR_DPIPE_ENTRY_MATCH_VALUES);
- if (!matches_attr)
- goto nla_put_failure;
-
- err = devlink_dpipe_match_values_put(skb, entry->match_values,
- entry->match_values_count);
- if (err) {
- nla_nest_cancel(skb, matches_attr);
- goto err_match_values_put;
- }
- nla_nest_end(skb, matches_attr);
-
- actions_attr = nla_nest_start(skb,
- DEVLINK_ATTR_DPIPE_ENTRY_ACTION_VALUES);
- if (!actions_attr)
- goto nla_put_failure;
-
- err = devlink_dpipe_action_values_put(skb, entry->action_values,
- entry->action_values_count);
- if (err) {
- nla_nest_cancel(skb, actions_attr);
- goto err_action_values_put;
- }
- nla_nest_end(skb, actions_attr);
-
- nla_nest_end(skb, entry_attr);
- return 0;
-
-nla_put_failure:
- err = -EMSGSIZE;
-err_match_values_put:
-err_action_values_put:
- nla_nest_cancel(skb, entry_attr);
- return err;
-}
-
-static struct devlink_dpipe_table *
-devlink_dpipe_table_find(struct list_head *dpipe_tables,
- const char *table_name)
-{
- struct devlink_dpipe_table *table;
-
- list_for_each_entry_rcu(table, dpipe_tables, list) {
- if (!strcmp(table->name, table_name))
- return table;
- }
- return NULL;
-}
-
-int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx)
-{
- struct devlink *devlink;
- int err;
-
- err = devlink_dpipe_send_and_alloc_skb(&dump_ctx->skb,
- dump_ctx->info);
- if (err)
- return err;
-
- dump_ctx->hdr = genlmsg_put(dump_ctx->skb,
- dump_ctx->info->snd_portid,
- dump_ctx->info->snd_seq,
- &devlink_nl_family, NLM_F_MULTI,
- dump_ctx->cmd);
- if (!dump_ctx->hdr)
- goto nla_put_failure;
-
- devlink = dump_ctx->info->user_ptr[0];
- if (devlink_nl_put_handle(dump_ctx->skb, devlink))
- goto nla_put_failure;
- dump_ctx->nest = nla_nest_start(dump_ctx->skb,
- DEVLINK_ATTR_DPIPE_ENTRIES);
- if (!dump_ctx->nest)
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- nlmsg_free(dump_ctx->skb);
- return -EMSGSIZE;
-}
-EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_prepare);
-
-int devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx,
- struct devlink_dpipe_entry *entry)
-{
- return devlink_dpipe_entry_put(dump_ctx->skb, entry);
-}
-EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_append);
-
-int devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx)
-{
- nla_nest_end(dump_ctx->skb, dump_ctx->nest);
- genlmsg_end(dump_ctx->skb, dump_ctx->hdr);
- return 0;
-}
-EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_close);
-
-void devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry)
-
-{
- unsigned int value_count, value_index;
- struct devlink_dpipe_value *value;
-
- value = entry->action_values;
- value_count = entry->action_values_count;
- for (value_index = 0; value_index < value_count; value_index++) {
- kfree(value[value_index].value);
- kfree(value[value_index].mask);
- }
-
- value = entry->match_values;
- value_count = entry->match_values_count;
- for (value_index = 0; value_index < value_count; value_index++) {
- kfree(value[value_index].value);
- kfree(value[value_index].mask);
- }
-}
-EXPORT_SYMBOL(devlink_dpipe_entry_clear);
-
-static int devlink_dpipe_entries_fill(struct genl_info *info,
- enum devlink_command cmd, int flags,
- struct devlink_dpipe_table *table)
-{
- struct devlink_dpipe_dump_ctx dump_ctx;
- struct nlmsghdr *nlh;
- int err;
-
- dump_ctx.skb = NULL;
- dump_ctx.cmd = cmd;
- dump_ctx.info = info;
-
- err = table->table_ops->entries_dump(table->priv,
- table->counters_enabled,
- &dump_ctx);
- if (err)
- return err;
-
-send_done:
- nlh = nlmsg_put(dump_ctx.skb, info->snd_portid, info->snd_seq,
- NLMSG_DONE, 0, flags | NLM_F_MULTI);
- if (!nlh) {
- err = devlink_dpipe_send_and_alloc_skb(&dump_ctx.skb, info);
- if (err)
- return err;
- goto send_done;
- }
- return genlmsg_reply(dump_ctx.skb, info);
-}
-
-static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- struct devlink_dpipe_table *table;
- const char *table_name;
-
- if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME])
- return -EINVAL;
-
- table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
- table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
- table_name);
- if (!table)
- return -EINVAL;
-
- if (!table->table_ops->entries_dump)
- return -EINVAL;
-
- return devlink_dpipe_entries_fill(info, DEVLINK_CMD_DPIPE_ENTRIES_GET,
- 0, table);
-}
-
-static int devlink_dpipe_fields_put(struct sk_buff *skb,
- const struct devlink_dpipe_header *header)
-{
- struct devlink_dpipe_field *field;
- struct nlattr *field_attr;
- int i;
-
- for (i = 0; i < header->fields_count; i++) {
- field = &header->fields[i];
- field_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_FIELD);
- if (!field_attr)
- return -EMSGSIZE;
- if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_FIELD_NAME, field->name) ||
- nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) ||
- nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_BITWIDTH, field->bitwidth) ||
- nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_MAPPING_TYPE, field->mapping_type))
- goto nla_put_failure;
- nla_nest_end(skb, field_attr);
- }
- return 0;
-
-nla_put_failure:
- nla_nest_cancel(skb, field_attr);
- return -EMSGSIZE;
-}
-
-static int devlink_dpipe_header_put(struct sk_buff *skb,
- struct devlink_dpipe_header *header)
-{
- struct nlattr *fields_attr, *header_attr;
- int err;
-
- header_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADER);
- if (!header_attr)
- return -EMSGSIZE;
-
- if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_HEADER_NAME, header->name) ||
- nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) ||
- nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global))
- goto nla_put_failure;
-
- fields_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADER_FIELDS);
- if (!fields_attr)
- goto nla_put_failure;
-
- err = devlink_dpipe_fields_put(skb, header);
- if (err) {
- nla_nest_cancel(skb, fields_attr);
- goto nla_put_failure;
- }
- nla_nest_end(skb, fields_attr);
- nla_nest_end(skb, header_attr);
- return 0;
-
-nla_put_failure:
- err = -EMSGSIZE;
- nla_nest_cancel(skb, header_attr);
- return err;
-}
-
-static int devlink_dpipe_headers_fill(struct genl_info *info,
- enum devlink_command cmd, int flags,
- struct devlink_dpipe_headers *
- dpipe_headers)
-{
- struct devlink *devlink = info->user_ptr[0];
- struct nlattr *headers_attr;
- struct sk_buff *skb = NULL;
- struct nlmsghdr *nlh;
- void *hdr;
- int i, j;
- int err;
-
- i = 0;
-start_again:
- err = devlink_dpipe_send_and_alloc_skb(&skb, info);
- if (err)
- return err;
-
- hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
- &devlink_nl_family, NLM_F_MULTI, cmd);
- if (!hdr) {
- nlmsg_free(skb);
- return -EMSGSIZE;
- }
-
- if (devlink_nl_put_handle(skb, devlink))
- goto nla_put_failure;
- headers_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADERS);
- if (!headers_attr)
- goto nla_put_failure;
-
- j = 0;
- for (; i < dpipe_headers->headers_count; i++) {
- err = devlink_dpipe_header_put(skb, dpipe_headers->headers[i]);
- if (err) {
- if (!j)
- goto err_table_put;
- break;
- }
- j++;
- }
- nla_nest_end(skb, headers_attr);
- genlmsg_end(skb, hdr);
- if (i != dpipe_headers->headers_count)
- goto start_again;
-
-send_done:
- nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
- NLMSG_DONE, 0, flags | NLM_F_MULTI);
- if (!nlh) {
- err = devlink_dpipe_send_and_alloc_skb(&skb, info);
- if (err)
- return err;
- goto send_done;
- }
- return genlmsg_reply(skb, info);
-
-nla_put_failure:
- err = -EMSGSIZE;
-err_table_put:
- nlmsg_free(skb);
- return err;
-}
-
-static int devlink_nl_cmd_dpipe_headers_get(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
-
- if (!devlink->dpipe_headers)
- return -EOPNOTSUPP;
- return devlink_dpipe_headers_fill(info, DEVLINK_CMD_DPIPE_HEADERS_GET,
- 0, devlink->dpipe_headers);
-}
-
-static int devlink_dpipe_table_counters_set(struct devlink *devlink,
- const char *table_name,
- bool enable)
-{
- struct devlink_dpipe_table *table;
-
- table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
- table_name);
- if (!table)
- return -EINVAL;
-
- if (table->counter_control_extern)
- return -EOPNOTSUPP;
-
- if (!(table->counters_enabled ^ enable))
- return 0;
-
- table->counters_enabled = enable;
- if (table->table_ops->counters_set_update)
- table->table_ops->counters_set_update(table->priv, enable);
- return 0;
-}
-
-static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- const char *table_name;
- bool counters_enable;
-
- if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME] ||
- !info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED])
- return -EINVAL;
-
- table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
- counters_enable = !!nla_get_u8(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]);
-
- return devlink_dpipe_table_counters_set(devlink, table_name,
- counters_enable);
-}
-
-static struct devlink_resource *
-devlink_resource_find(struct devlink *devlink,
- struct devlink_resource *resource, u64 resource_id)
-{
- struct list_head *resource_list;
-
- if (resource)
- resource_list = &resource->resource_list;
- else
- resource_list = &devlink->resource_list;
-
- list_for_each_entry(resource, resource_list, list) {
- struct devlink_resource *child_resource;
-
- if (resource->id == resource_id)
- return resource;
-
- child_resource = devlink_resource_find(devlink, resource,
- resource_id);
- if (child_resource)
- return child_resource;
- }
- return NULL;
-}
-
-static void
-devlink_resource_validate_children(struct devlink_resource *resource)
-{
- struct devlink_resource *child_resource;
- bool size_valid = true;
- u64 parts_size = 0;
-
- if (list_empty(&resource->resource_list))
- goto out;
-
- list_for_each_entry(child_resource, &resource->resource_list, list)
- parts_size += child_resource->size_new;
-
- if (parts_size > resource->size_new)
- size_valid = false;
-out:
- resource->size_valid = size_valid;
-}
-
-static int
-devlink_resource_validate_size(struct devlink_resource *resource, u64 size,
- struct netlink_ext_ack *extack)
-{
- u64 reminder;
- int err = 0;
-
- if (size > resource->size_params.size_max) {
- NL_SET_ERR_MSG_MOD(extack, "Size larger than maximum");
- err = -EINVAL;
- }
-
- if (size < resource->size_params.size_min) {
- NL_SET_ERR_MSG_MOD(extack, "Size smaller than minimum");
- err = -EINVAL;
- }
-
- div64_u64_rem(size, resource->size_params.size_granularity, &reminder);
- if (reminder) {
- NL_SET_ERR_MSG_MOD(extack, "Wrong granularity");
- err = -EINVAL;
- }
-
- return err;
-}
-
-static int devlink_nl_cmd_resource_set(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- struct devlink_resource *resource;
- u64 resource_id;
- u64 size;
- int err;
-
- if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] ||
- !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE])
- return -EINVAL;
- resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]);
-
- resource = devlink_resource_find(devlink, NULL, resource_id);
- if (!resource)
- return -EINVAL;
-
- size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]);
- err = devlink_resource_validate_size(resource, size, info->extack);
- if (err)
- return err;
-
- resource->size_new = size;
- devlink_resource_validate_children(resource);
- if (resource->parent)
- devlink_resource_validate_children(resource->parent);
- return 0;
-}
-
-static int
-devlink_resource_size_params_put(struct devlink_resource *resource,
- struct sk_buff *skb)
-{
- struct devlink_resource_size_params *size_params;
-
- size_params = &resource->size_params;
- if (nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN,
- size_params->size_granularity, DEVLINK_ATTR_PAD) ||
- nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX,
- size_params->size_max, DEVLINK_ATTR_PAD) ||
- nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN,
- size_params->size_min, DEVLINK_ATTR_PAD) ||
- nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit))
- return -EMSGSIZE;
- return 0;
-}
-
-static int devlink_resource_occ_put(struct devlink_resource *resource,
- struct sk_buff *skb)
-{
- if (!resource->occ_get)
- return 0;
- return nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC,
- resource->occ_get(resource->occ_get_priv),
- DEVLINK_ATTR_PAD);
-}
-
-static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
- struct devlink_resource *resource)
-{
- struct devlink_resource *child_resource;
- struct nlattr *child_resource_attr;
- struct nlattr *resource_attr;
-
- resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE);
- if (!resource_attr)
- return -EMSGSIZE;
-
- if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) ||
- nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size,
- DEVLINK_ATTR_PAD) ||
- nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id,
- DEVLINK_ATTR_PAD))
- goto nla_put_failure;
- if (resource->size != resource->size_new)
- nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW,
- resource->size_new, DEVLINK_ATTR_PAD);
- if (devlink_resource_occ_put(resource, skb))
- goto nla_put_failure;
- if (devlink_resource_size_params_put(resource, skb))
- goto nla_put_failure;
- if (list_empty(&resource->resource_list))
- goto out;
-
- if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID,
- resource->size_valid))
- goto nla_put_failure;
-
- child_resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST);
- if (!child_resource_attr)
- goto nla_put_failure;
-
- list_for_each_entry(child_resource, &resource->resource_list, list) {
- if (devlink_resource_put(devlink, skb, child_resource))
- goto resource_put_failure;
- }
-
- nla_nest_end(skb, child_resource_attr);
-out:
- nla_nest_end(skb, resource_attr);
- return 0;
-
-resource_put_failure:
- nla_nest_cancel(skb, child_resource_attr);
-nla_put_failure:
- nla_nest_cancel(skb, resource_attr);
- return -EMSGSIZE;
-}
-
-static int devlink_resource_fill(struct genl_info *info,
- enum devlink_command cmd, int flags)
-{
- struct devlink *devlink = info->user_ptr[0];
- struct devlink_resource *resource;
- struct nlattr *resources_attr;
- struct sk_buff *skb = NULL;
- struct nlmsghdr *nlh;
- bool incomplete;
- void *hdr;
- int i;
- int err;
-
- resource = list_first_entry(&devlink->resource_list,
- struct devlink_resource, list);
-start_again:
- err = devlink_dpipe_send_and_alloc_skb(&skb, info);
- if (err)
- return err;
-
- hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
- &devlink_nl_family, NLM_F_MULTI, cmd);
- if (!hdr) {
- nlmsg_free(skb);
- return -EMSGSIZE;
- }
-
- if (devlink_nl_put_handle(skb, devlink))
- goto nla_put_failure;
-
- resources_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST);
- if (!resources_attr)
- goto nla_put_failure;
-
- incomplete = false;
- i = 0;
- list_for_each_entry_from(resource, &devlink->resource_list, list) {
- err = devlink_resource_put(devlink, skb, resource);
- if (err) {
- if (!i)
- goto err_resource_put;
- incomplete = true;
- break;
- }
- i++;
- }
- nla_nest_end(skb, resources_attr);
- genlmsg_end(skb, hdr);
- if (incomplete)
- goto start_again;
-send_done:
- nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
- NLMSG_DONE, 0, flags | NLM_F_MULTI);
- if (!nlh) {
- err = devlink_dpipe_send_and_alloc_skb(&skb, info);
- if (err)
- return err;
- goto send_done;
- }
- return genlmsg_reply(skb, info);
-
-nla_put_failure:
- err = -EMSGSIZE;
-err_resource_put:
- nlmsg_free(skb);
- return err;
-}
-
-static int devlink_nl_cmd_resource_dump(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
-
- if (list_empty(&devlink->resource_list))
- return -EOPNOTSUPP;
-
- return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0);
-}
-
-static int
-devlink_resources_validate(struct devlink *devlink,
- struct devlink_resource *resource,
- struct genl_info *info)
-{
- struct list_head *resource_list;
- int err = 0;
-
- if (resource)
- resource_list = &resource->resource_list;
- else
- resource_list = &devlink->resource_list;
-
- list_for_each_entry(resource, resource_list, list) {
- if (!resource->size_valid)
- return -EINVAL;
- err = devlink_resources_validate(devlink, resource, info);
- if (err)
- return err;
- }
- return err;
-}
-
-static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- int err;
-
- if (!devlink->ops->reload)
- return -EOPNOTSUPP;
-
- err = devlink_resources_validate(devlink, NULL, info);
- if (err) {
- NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed");
- return err;
- }
- return devlink->ops->reload(devlink, info->extack);
-}
-
-static const struct devlink_param devlink_param_generic[] = {
- {
- .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET,
- .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME,
- .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE,
- },
- {
- .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
- .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME,
- .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE,
- },
- {
- .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV,
- .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME,
- .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE,
- },
- {
- .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
- .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME,
- .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE,
- },
-};
-
-static int devlink_param_generic_verify(const struct devlink_param *param)
-{
- /* verify it match generic parameter by id and name */
- if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX)
- return -EINVAL;
- if (strcmp(param->name, devlink_param_generic[param->id].name))
- return -ENOENT;
-
- WARN_ON(param->type != devlink_param_generic[param->id].type);
-
- return 0;
-}
-
-static int devlink_param_driver_verify(const struct devlink_param *param)
-{
- int i;
-
- if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX)
- return -EINVAL;
- /* verify no such name in generic params */
- for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++)
- if (!strcmp(param->name, devlink_param_generic[i].name))
- return -EEXIST;
-
- return 0;
-}
-
-static struct devlink_param_item *
-devlink_param_find_by_name(struct list_head *param_list,
- const char *param_name)
-{
- struct devlink_param_item *param_item;
-
- list_for_each_entry(param_item, param_list, list)
- if (!strcmp(param_item->param->name, param_name))
- return param_item;
- return NULL;
-}
-
-static struct devlink_param_item *
-devlink_param_find_by_id(struct list_head *param_list, u32 param_id)
-{
- struct devlink_param_item *param_item;
-
- list_for_each_entry(param_item, param_list, list)
- if (param_item->param->id == param_id)
- return param_item;
- return NULL;
-}
-
-static bool
-devlink_param_cmode_is_supported(const struct devlink_param *param,
- enum devlink_param_cmode cmode)
-{
- return test_bit(cmode, &param->supported_cmodes);
-}
-
-static int devlink_param_get(struct devlink *devlink,
- const struct devlink_param *param,
- struct devlink_param_gset_ctx *ctx)
-{
- if (!param->get)
- return -EOPNOTSUPP;
- return param->get(devlink, param->id, ctx);
-}
-
-static int devlink_param_set(struct devlink *devlink,
- const struct devlink_param *param,
- struct devlink_param_gset_ctx *ctx)
-{
- if (!param->set)
- return -EOPNOTSUPP;
- return param->set(devlink, param->id, ctx);
-}
-
-static int
-devlink_param_type_to_nla_type(enum devlink_param_type param_type)
-{
- switch (param_type) {
- case DEVLINK_PARAM_TYPE_U8:
- return NLA_U8;
- case DEVLINK_PARAM_TYPE_U16:
- return NLA_U16;
- case DEVLINK_PARAM_TYPE_U32:
- return NLA_U32;
- case DEVLINK_PARAM_TYPE_STRING:
- return NLA_STRING;
- case DEVLINK_PARAM_TYPE_BOOL:
- return NLA_FLAG;
- default:
- return -EINVAL;
- }
-}
-
-static int
-devlink_nl_param_value_fill_one(struct sk_buff *msg,
- enum devlink_param_type type,
- enum devlink_param_cmode cmode,
- union devlink_param_value val)
-{
- struct nlattr *param_value_attr;
-
- param_value_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUE);
- if (!param_value_attr)
- goto nla_put_failure;
-
- if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode))
- goto value_nest_cancel;
-
- switch (type) {
- case DEVLINK_PARAM_TYPE_U8:
- if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8))
- goto value_nest_cancel;
- break;
- case DEVLINK_PARAM_TYPE_U16:
- if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16))
- goto value_nest_cancel;
- break;
- case DEVLINK_PARAM_TYPE_U32:
- if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32))
- goto value_nest_cancel;
- break;
- case DEVLINK_PARAM_TYPE_STRING:
- if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA,
- val.vstr))
- goto value_nest_cancel;
- break;
- case DEVLINK_PARAM_TYPE_BOOL:
- if (val.vbool &&
- nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA))
- goto value_nest_cancel;
- break;
- }
-
- nla_nest_end(msg, param_value_attr);
- return 0;
-
-value_nest_cancel:
- nla_nest_cancel(msg, param_value_attr);
-nla_put_failure:
- return -EMSGSIZE;
-}
-
-/* Fill @msg with one parameter message of type @cmd describing @param_item.
- *
- * The value for every supported configuration mode is collected before
- * anything is written to @msg: driverinit comes from the copy cached in
- * @param_item, every other mode is read through devlink_param_get().
- *
- * Returns 0 on success, -EOPNOTSUPP if the driverinit value has not been
- * set yet, the error of devlink_param_get() if a read fails, and -EMSGSIZE
- * for any failure while building the message.
- */
-static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
-				 struct devlink_param_item *param_item,
-				 enum devlink_command cmd,
-				 u32 portid, u32 seq, int flags)
-{
-	union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1];
-	const struct devlink_param *param = param_item->param;
-	struct devlink_param_gset_ctx ctx;
-	struct nlattr *param_values_list;
-	struct nlattr *param_attr;
-	int nla_type;
-	void *hdr;
-	int err;
-	int i;
-
-	/* Get value from driver part to driverinit configuration mode */
-	for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
-		if (!devlink_param_cmode_is_supported(param, i))
-			continue;
-		if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) {
-			if (!param_item->driverinit_value_valid)
-				return -EOPNOTSUPP;
-			param_value[i] = param_item->driverinit_value;
-		} else {
-			ctx.cmode = i;
-			err = devlink_param_get(devlink, param, &ctx);
-			if (err)
-				return err;
-			param_value[i] = ctx.val;
-		}
-	}
-
-	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
-	if (!hdr)
-		return -EMSGSIZE;
-
-	if (devlink_nl_put_handle(msg, devlink))
-		goto genlmsg_cancel;
-	param_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM);
-	if (!param_attr)
-		goto genlmsg_cancel;
-	if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name))
-		goto param_nest_cancel;
-	if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC))
-		goto param_nest_cancel;
-
-	nla_type = devlink_param_type_to_nla_type(param->type);
-	if (nla_type < 0)
-		goto param_nest_cancel;
-	if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type))
-		goto param_nest_cancel;
-
-	param_values_list = nla_nest_start(msg, DEVLINK_ATTR_PARAM_VALUES_LIST);
-	if (!param_values_list)
-		goto param_nest_cancel;
-
-	/* Only the modes whose values were collected above are emitted. */
-	for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
-		if (!devlink_param_cmode_is_supported(param, i))
-			continue;
-		err = devlink_nl_param_value_fill_one(msg, param->type,
-						      i, param_value[i]);
-		if (err)
-			goto values_list_nest_cancel;
-	}
-
-	nla_nest_end(msg, param_values_list);
-	nla_nest_end(msg, param_attr);
-	genlmsg_end(msg, hdr);
-	return 0;
-
-values_list_nest_cancel:
-	/* Error path: the half-built nest must be cancelled, not finalized. */
-	nla_nest_cancel(msg, param_values_list);
-param_nest_cancel:
-	nla_nest_cancel(msg, param_attr);
-genlmsg_cancel:
-	genlmsg_cancel(msg, hdr);
-	return -EMSGSIZE;
-}
-
-/* Multicast a parameter notification (@cmd is expected to be PARAM_NEW or
- * PARAM_DEL) to the DEVLINK_MCGRP_CONFIG group in the devlink's netns.
- * Allocation or fill failures are dropped silently.
- */
-static void devlink_param_notify(struct devlink *devlink,
-				 struct devlink_param_item *param_item,
-				 enum devlink_command cmd)
-{
-	struct sk_buff *skb;
-
-	WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL);
-
-	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!skb)
-		return;
-
-	if (devlink_nl_param_fill(skb, devlink, param_item, cmd, 0, 0, 0)) {
-		nlmsg_free(skb);
-		return;
-	}
-
-	genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
-				skb, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
-}
-
-/* Dump handler for DEVLINK_CMD_PARAM_GET.
- *
- * Walks devlink_list under devlink_mutex, skipping instances that are not in
- * the requesting socket's network namespace, and emits one message per entry
- * of each instance's param_list while holding that instance's lock.
- * cb->args[0] carries the number of entries already dumped so that a resumed
- * dump skips them. Returns msg->len.
- */
-static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg,
- struct netlink_callback *cb)
-{
- struct devlink_param_item *param_item;
- struct devlink *devlink;
- int start = cb->args[0];
- int idx = 0;
- int err;
-
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
- continue;
- mutex_lock(&devlink->lock);
- list_for_each_entry(param_item, &devlink->param_list, list) {
- /* Entry was already sent by an earlier pass of this dump. */
- if (idx < start) {
- idx++;
- continue;
- }
- err = devlink_nl_param_fill(msg, devlink, param_item,
- DEVLINK_CMD_PARAM_GET,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NLM_F_MULTI);
- /* Any fill error ends this pass; idx is left pointing at the
- * failed entry so the next pass retries it.
- */
- if (err) {
- mutex_unlock(&devlink->lock);
- goto out;
- }
- idx++;
- }
- mutex_unlock(&devlink->lock);
- }
-out:
- mutex_unlock(&devlink_mutex);
-
- cb->args[0] = idx;
- return msg->len;
-}
-
-/* Translate the netlink type carried in DEVLINK_ATTR_PARAM_TYPE into the
- * matching devlink parameter type, stored in *@param_type.
- * Returns 0 on success, -EINVAL if the attribute is missing or carries a
- * type that has no devlink equivalent.
- */
-static int
-devlink_param_type_get_from_info(struct genl_info *info,
-				 enum devlink_param_type *param_type)
-{
-	struct nlattr *type_attr = info->attrs[DEVLINK_ATTR_PARAM_TYPE];
-
-	if (!type_attr)
-		return -EINVAL;
-
-	switch (nla_get_u8(type_attr)) {
-	case NLA_U8:
-		*param_type = DEVLINK_PARAM_TYPE_U8;
-		return 0;
-	case NLA_U16:
-		*param_type = DEVLINK_PARAM_TYPE_U16;
-		return 0;
-	case NLA_U32:
-		*param_type = DEVLINK_PARAM_TYPE_U32;
-		return 0;
-	case NLA_STRING:
-		*param_type = DEVLINK_PARAM_TYPE_STRING;
-		return 0;
-	case NLA_FLAG:
-		*param_type = DEVLINK_PARAM_TYPE_BOOL;
-		return 0;
-	default:
-		return -EINVAL;
-	}
-}
-
-/* Extract the value carried in DEVLINK_ATTR_PARAM_VALUE_DATA into @value,
- * interpreting it according to @param->type.
- *
- * The attribute is mandatory for every type except BOOL, where its mere
- * presence means true. The payload length of the integer types is checked
- * here before it is read, and string values must be NUL-terminated within
- * the attribute and shorter than __DEVLINK_PARAM_MAX_STRING_VALUE.
- *
- * Returns 0 on success, -EINVAL on a missing or malformed attribute.
- */
-static int
-devlink_param_value_get_from_info(const struct devlink_param *param,
-				  struct genl_info *info,
-				  union devlink_param_value *value)
-{
-	struct nlattr *param_data = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA];
-	int len;
-
-	if (param->type != DEVLINK_PARAM_TYPE_BOOL && !param_data)
-		return -EINVAL;
-
-	switch (param->type) {
-	case DEVLINK_PARAM_TYPE_U8:
-		if (nla_len(param_data) != sizeof(u8))
-			return -EINVAL;
-		value->vu8 = nla_get_u8(param_data);
-		break;
-	case DEVLINK_PARAM_TYPE_U16:
-		if (nla_len(param_data) != sizeof(u16))
-			return -EINVAL;
-		value->vu16 = nla_get_u16(param_data);
-		break;
-	case DEVLINK_PARAM_TYPE_U32:
-		if (nla_len(param_data) != sizeof(u32))
-			return -EINVAL;
-		value->vu32 = nla_get_u32(param_data);
-		break;
-	case DEVLINK_PARAM_TYPE_STRING:
-		len = strnlen(nla_data(param_data), nla_len(param_data));
-		/* Reject strings that are unterminated or too long to store. */
-		if (len == nla_len(param_data) ||
-		    len >= __DEVLINK_PARAM_MAX_STRING_VALUE)
-			return -EINVAL;
-		strcpy(value->vstr, nla_data(param_data));
-		break;
-	case DEVLINK_PARAM_TYPE_BOOL:
-		value->vbool = param_data ? true : false;
-		break;
-	}
-	return 0;
-}
-
-/* Look up the parameter named by DEVLINK_ATTR_PARAM_NAME in @devlink's
- * param_list. Returns NULL when the attribute is absent; otherwise returns
- * whatever devlink_param_find_by_name() yields.
- */
-static struct devlink_param_item *
-devlink_param_get_from_info(struct devlink *devlink,
-			    struct genl_info *info)
-{
-	struct nlattr *name_attr = info->attrs[DEVLINK_ATTR_PARAM_NAME];
-
-	if (!name_attr)
-		return NULL;
-
-	return devlink_param_find_by_name(&devlink->param_list,
-					  nla_data(name_attr));
-}
-
-/* DEVLINK_CMD_PARAM_GET request handler: reply with a single message that
- * describes the parameter named in the request.
- * Returns -EINVAL if the parameter cannot be resolved, -ENOMEM if the reply
- * cannot be allocated, the fill error, or the result of genlmsg_reply().
- */
-static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb,
-					 struct genl_info *info)
-{
-	struct devlink *devlink = info->user_ptr[0];
-	struct devlink_param_item *item;
-	struct sk_buff *reply;
-	int ret;
-
-	item = devlink_param_get_from_info(devlink, info);
-	if (!item)
-		return -EINVAL;
-
-	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!reply)
-		return -ENOMEM;
-
-	ret = devlink_nl_param_fill(reply, devlink, item,
-				    DEVLINK_CMD_PARAM_GET,
-				    info->snd_portid, info->snd_seq, 0);
-	if (!ret)
-		return genlmsg_reply(reply, info);
-
-	nlmsg_free(reply);
-	return ret;
-}
-
-/* DEVLINK_CMD_PARAM_SET request handler.
- *
- * Resolves the parameter by name, checks that the type sent by user space
- * matches the registered type, parses the value and runs the parameter's
- * optional validate() callback. For the driverinit configuration mode the
- * value is only cached in the parameter item; for any other mode it is
- * passed to devlink_param_set(). A PARAM_NEW notification is sent on success.
- *
- * Returns 0 on success, -EINVAL for a missing or mismatching attribute,
- * -EOPNOTSUPP for an unsupported mode or a parameter without a set callback,
- * or the error returned by validate() / devlink_param_set().
- */
-static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb,
- struct genl_info *info)
-{
- struct devlink *devlink = info->user_ptr[0];
- enum devlink_param_type param_type;
- struct devlink_param_gset_ctx ctx;
- enum devlink_param_cmode cmode;
- struct devlink_param_item *param_item;
- const struct devlink_param *param;
- union devlink_param_value value;
- int err = 0;
-
- param_item = devlink_param_get_from_info(devlink, info);
- if (!param_item)
- return -EINVAL;
- param = param_item->param;
- err = devlink_param_type_get_from_info(info, &param_type);
- if (err)
- return err;
- if (param_type != param->type)
- return -EINVAL;
- err = devlink_param_value_get_from_info(param, info, &value);
- if (err)
- return err;
- if (param->validate) {
- err = param->validate(devlink, param->id, value, info->extack);
- if (err)
- return err;
- }
-
- if (!info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE])
- return -EINVAL;
- cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]);
- if (!devlink_param_cmode_is_supported(param, cmode))
- return -EOPNOTSUPP;
-
- if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) {
- /* Driverinit values are cached here and not pushed to the driver. */
- if (param->type == DEVLINK_PARAM_TYPE_STRING)
- strcpy(param_item->driverinit_value.vstr, value.vstr);
- else
- param_item->driverinit_value = value;
- param_item->driverinit_value_valid = true;
- } else {
- if (!param->set)
- return -EOPNOTSUPP;
- ctx.val = value;
- ctx.cmode = cmode;
- err = devlink_param_set(devlink, param, &ctx);
- if (err)
- return err;
- }
-
- devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
- return 0;
-}
-
-/* Add @param to @devlink's parameter list and announce it with PARAM_NEW.
- *
- * A parameter that supports only the driverinit mode is expected to have no
- * get/set callbacks, any other parameter is expected to have both; a
- * violation only triggers a warning.
- * Returns 0 on success, -EEXIST if the name is already registered, -ENOMEM
- * if the list item cannot be allocated.
- */
-static int devlink_param_register_one(struct devlink *devlink,
-				      const struct devlink_param *param)
-{
-	struct devlink_param_item *item;
-	bool driverinit_only;
-
-	if (devlink_param_find_by_name(&devlink->param_list, param->name))
-		return -EEXIST;
-
-	driverinit_only = param->supported_cmodes ==
-			  BIT(DEVLINK_PARAM_CMODE_DRIVERINIT);
-	if (driverinit_only)
-		WARN_ON(param->get || param->set);
-	else
-		WARN_ON(!param->get || !param->set);
-
-	item = kzalloc(sizeof(*item), GFP_KERNEL);
-	if (!item)
-		return -ENOMEM;
-
-	item->param = param;
-	list_add_tail(&item->list, &devlink->param_list);
-	devlink_param_notify(devlink, item, DEVLINK_CMD_PARAM_NEW);
-	return 0;
-}
-
-/* Remove @param from @devlink's parameter list, announcing the removal with
- * PARAM_DEL before the item is unlinked and freed.
- *
- * Unregistering a parameter that was never registered triggers a warning
- * and is otherwise ignored, instead of dereferencing the NULL item.
- */
-static void devlink_param_unregister_one(struct devlink *devlink,
-					 const struct devlink_param *param)
-{
-	struct devlink_param_item *param_item;
-
-	param_item = devlink_param_find_by_name(&devlink->param_list,
-						param->name);
-	if (WARN_ON(!param_item))
-		return;
-	devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_DEL);
-	list_del(&param_item->list);
-	kfree(param_item);
-}
-
-/* Append one DEVLINK_ATTR_REGION_SNAPSHOT nest holding @snapshot's id.
- * Returns 0 on success, -EINVAL if the nest cannot be started, or the
- * nla_put_u32() error (the nest is cancelled in that case).
- */
-static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg,
-					     struct devlink *devlink,
-					     struct devlink_snapshot *snapshot)
-{
-	struct nlattr *nest;
-	int ret;
-
-	nest = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOT);
-	if (!nest)
-		return -EINVAL;
-
-	ret = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id);
-	if (ret) {
-		nla_nest_cancel(msg, nest);
-		return ret;
-	}
-
-	nla_nest_end(msg, nest);
-	return 0;
-}
-
-/* Append a DEVLINK_ATTR_REGION_SNAPSHOTS nest that lists the id of every
- * snapshot on @region's snapshot_list.
- * Returns 0 on success, -EINVAL if the nest cannot be started, or the first
- * per-snapshot error (the whole nest is cancelled in that case).
- */
-static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg,
-					      struct devlink *devlink,
-					      struct devlink_region *region)
-{
-	struct devlink_snapshot *snap;
-	struct nlattr *nest;
-	int ret;
-
-	nest = nla_nest_start(msg, DEVLINK_ATTR_REGION_SNAPSHOTS);
-	if (!nest)
-		return -EINVAL;
-
-	list_for_each_entry(snap, &region->snapshot_list, list) {
-		ret = devlink_nl_region_snapshot_id_put(msg, devlink, snap);
-		if (ret) {
-			nla_nest_cancel(msg, nest);
-			return ret;
-		}
-	}
-
-	nla_nest_end(msg, nest);
-	return 0;
-}
-
-/* Build one region message of type @cmd into @msg: devlink handle, region
- * name, region size and the list of snapshot ids.
- * Returns 0 on success, -EMSGSIZE if the header cannot be added, or the
- * error of the first attribute that fails (the message is cancelled).
- */
-static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
-				  enum devlink_command cmd, u32 portid,
-				  u32 seq, int flags,
-				  struct devlink_region *region)
-{
-	void *hdr;
-	int err;
-
-	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
-	if (!hdr)
-		return -EMSGSIZE;
-
-	/* Each step runs only if every previous one succeeded. */
-	err = devlink_nl_put_handle(msg, devlink);
-	if (!err)
-		err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
-				     region->name);
-	if (!err)
-		err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE,
-					region->size,
-					DEVLINK_ATTR_PAD);
-	if (!err)
-		err = devlink_nl_region_snapshots_id_put(msg, devlink, region);
-
-	if (err) {
-		genlmsg_cancel(msg, hdr);
-		return err;
-	}
-
-	genlmsg_end(msg, hdr);
-	return 0;
-}
-
-/* Multicast a region notification (@cmd is expected to be REGION_NEW or
- * REGION_DEL) to the DEVLINK_MCGRP_CONFIG group.
- *
- * The message carries the devlink handle and the region name, followed by
- * the snapshot id when @snapshot is given or by the region size when it is
- * NULL. Any failure while building the message drops the notification.
- */
-static void devlink_nl_region_notify(struct devlink_region *region,
- struct devlink_snapshot *snapshot,
- enum devlink_command cmd)
-{
- struct devlink *devlink = region->devlink;
- struct sk_buff *msg;
- void *hdr;
- int err;
-
- WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL);
-
- msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (!msg)
- return;
-
- hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
- if (!hdr)
- goto out_free_msg;
-
- err = devlink_nl_put_handle(msg, devlink);
- if (err)
- goto out_cancel_msg;
-
- err = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
- region->name);
- if (err)
- goto out_cancel_msg;
-
- /* Snapshot-level events carry the snapshot id, region-level events the
- * region size.
- */
- if (snapshot) {
- err = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID,
- snapshot->id);
- if (err)
- goto out_cancel_msg;
- } else {
- err = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_SIZE,
- region->size, DEVLINK_ATTR_PAD);
- if (err)
- goto out_cancel_msg;
- }
- genlmsg_end(msg, hdr);
-
- genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
- msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
-
- return;
-
-out_cancel_msg:
- genlmsg_cancel(msg, hdr);
-out_free_msg:
- nlmsg_free(msg);
-}
-
-/* DEVLINK_CMD_REGION_GET request handler: reply with a single message that
- * describes the region named in the request.
- * Returns -EINVAL if the name attribute is missing or matches no region,
- * -ENOMEM if the reply cannot be allocated, the fill error, or the result
- * of genlmsg_reply().
- */
-static int devlink_nl_cmd_region_get_doit(struct sk_buff *skb,
-					  struct genl_info *info)
-{
-	struct nlattr *name_attr = info->attrs[DEVLINK_ATTR_REGION_NAME];
-	struct devlink *devlink = info->user_ptr[0];
-	struct devlink_region *region;
-	struct sk_buff *reply;
-	int ret;
-
-	if (!name_attr)
-		return -EINVAL;
-
-	region = devlink_region_get_by_name(devlink, nla_data(name_attr));
-	if (!region)
-		return -EINVAL;
-
-	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
-	if (!reply)
-		return -ENOMEM;
-
-	ret = devlink_nl_region_fill(reply, devlink, DEVLINK_CMD_REGION_GET,
-				     info->snd_portid, info->snd_seq, 0,
-				     region);
-	if (!ret)
-		return genlmsg_reply(reply, info);
-
-	nlmsg_free(reply);
-	return ret;
-}
-
-/* Dump handler for DEVLINK_CMD_REGION_GET.
- *
- * Walks devlink_list under devlink_mutex, skipping instances that are not in
- * the requesting socket's network namespace, and emits one message per entry
- * of each instance's region_list while holding that instance's lock.
- * cb->args[0] carries the number of regions already dumped so that a resumed
- * dump skips them. Returns msg->len.
- */
-static int devlink_nl_cmd_region_get_dumpit(struct sk_buff *msg,
- struct netlink_callback *cb)
-{
- struct devlink_region *region;
- struct devlink *devlink;
- int start = cb->args[0];
- int idx = 0;
- int err;
-
- mutex_lock(&devlink_mutex);
- list_for_each_entry(devlink, &devlink_list, list) {
- if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
- continue;
-
- mutex_lock(&devlink->lock);
- list_for_each_entry(region, &devlink->region_list, list) {
- /* Region was already sent by an earlier pass of this dump. */
- if (idx < start) {
- idx++;
- continue;
- }
- err = devlink_nl_region_fill(msg, devlink,
- DEVLINK_CMD_REGION_GET,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- NLM_F_MULTI, region);
- /* Any fill error ends this pass; idx is left pointing at the
- * failed region so the next pass retries it.
- */
- if (err) {
- mutex_unlock(&devlink->lock);
- goto out;
- }
- idx++;
- }
- mutex_unlock(&devlink->lock);
- }
-out:
- mutex_unlock(&devlink_mutex);
- cb->args[0] = idx;
- return msg->len;
-}
-
-/* DEVLINK_CMD_REGION_DEL request handler: delete the snapshot identified by
- * region name and snapshot id, sending a REGION_DEL notification first.
- * Returns 0 on success, -EINVAL if an attribute is missing or the region or
- * snapshot cannot be found.
- */
-static int devlink_nl_cmd_region_del(struct sk_buff *skb,
-				     struct genl_info *info)
-{
-	struct nlattr *name_attr = info->attrs[DEVLINK_ATTR_REGION_NAME];
-	struct nlattr *id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID];
-	struct devlink *devlink = info->user_ptr[0];
-	struct devlink_snapshot *snapshot;
-	struct devlink_region *region;
-
-	if (!name_attr || !id_attr)
-		return -EINVAL;
-
-	region = devlink_region_get_by_name(devlink, nla_data(name_attr));
-	if (!region)
-		return -EINVAL;
-
-	snapshot = devlink_region_snapshot_get_by_id(region,
-						     nla_get_u32(id_attr));
-	if (!snapshot)
-		return -EINVAL;
-
-	devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL);
-	devlink_region_snapshot_del(snapshot);
-	return 0;
-}
-
-/* Append one DEVLINK_ATTR_REGION_CHUNK nest holding @chunk_size bytes from
- * @chunk and the address @addr they were read from.
- * Returns 0 on success, -EINVAL if the nest cannot be started, or the error
- * of the failing nla_put*() call (the nest is cancelled in that case).
- */
-static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg,
-						 struct devlink *devlink,
-						 u8 *chunk, u32 chunk_size,
-						 u64 addr)
-{
-	struct nlattr *nest;
-	int ret;
-
-	nest = nla_nest_start(msg, DEVLINK_ATTR_REGION_CHUNK);
-	if (!nest)
-		return -EINVAL;
-
-	ret = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk);
-	if (!ret)
-		ret = nla_put_u64_64bit(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR,
-					addr, DEVLINK_ATTR_PAD);
-	if (ret) {
-		nla_nest_cancel(msg, nest);
-		return ret;
-	}
-
-	nla_nest_end(msg, nest);
-	return 0;
-}
-
-/* Largest number of snapshot bytes placed in a single chunk attribute. */
-#define DEVLINK_REGION_READ_CHUNK_SIZE 256
-
-/* Copy snapshot data in the range [@start_offset, @end_offset) into @skb as
- * a sequence of chunk nests of at most DEVLINK_REGION_READ_CHUNK_SIZE bytes.
- *
- * The snapshot is selected by DEVLINK_ATTR_REGION_SNAPSHOT_ID in @attrs.
- * @end_offset is clamped to the snapshot length, and replaced by it
- * entirely when @dump is true. *@new_offset receives the offset reached,
- * which equals @start_offset if nothing was written.
- *
- * Returns 0 when the whole range was written, -EINVAL if the snapshot does
- * not exist, or the error of the chunk that could not be added.
- */
-static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb,
-						struct devlink *devlink,
-						struct devlink_region *region,
-						struct nlattr **attrs,
-						u64 start_offset,
-						u64 end_offset,
-						bool dump,
-						u64 *new_offset)
-{
-	struct devlink_snapshot *snapshot;
-	u64 offset = start_offset;
-	u32 snapshot_id;
-	int err = 0;
-
-	*new_offset = start_offset;
-
-	snapshot_id = nla_get_u32(attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
-	snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
-	if (!snapshot)
-		return -EINVAL;
-
-	if (dump || end_offset > snapshot->data_len)
-		end_offset = snapshot->data_len;
-
-	while (offset < end_offset) {
-		u64 remaining = end_offset - offset;
-		u32 chunk_len;
-
-		chunk_len = remaining < DEVLINK_REGION_READ_CHUNK_SIZE ?
-			    remaining : DEVLINK_REGION_READ_CHUNK_SIZE;
-
-		err = devlink_nl_cmd_region_read_chunk_fill(skb, devlink,
-							    &snapshot->data[offset],
-							    chunk_len,
-							    offset);
-		if (err)
-			break;
-
-		offset += chunk_len;
-	}
-	*new_offset = offset;
-
-	return err;
-}
-
-/* Dump handler for DEVLINK_CMD_REGION_READ.
- *
- * Each invocation emits one message containing as many snapshot chunks as
- * fit into @skb, starting at the offset saved in cb->args[0] by the previous
- * invocation. When both CHUNK_ADDR and CHUNK_LEN are supplied only that
- * window of the snapshot is read; otherwise the whole snapshot is dumped.
- *
- * devlink_mutex is taken before the devlink instance is looked up and is
- * held until the handler is done with it, so the instance cannot be
- * unregistered between the lookup and locking devlink->lock.
- *
- * Returns skb->len while progress is being made. Every failure, as well as
- * a pass that makes no progress, returns 0, which ends the dump.
- */
-static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb,
-					     struct netlink_callback *cb)
-{
-	u64 ret_offset, start_offset, end_offset = 0;
-	struct nlattr *attrs[DEVLINK_ATTR_MAX + 1];
-	const struct genl_ops *ops = cb->data;
-	struct devlink_region *region;
-	struct nlattr *chunks_attr;
-	const char *region_name;
-	struct devlink *devlink;
-	bool dump = true;
-	void *hdr;
-	int err;
-
-	start_offset = *((u64 *)&cb->args[0]);
-
-	err = nlmsg_parse(cb->nlh, GENL_HDRLEN + devlink_nl_family.hdrsize,
-			  attrs, DEVLINK_ATTR_MAX, ops->policy, NULL);
-	if (err)
-		goto out;
-
-	mutex_lock(&devlink_mutex);
-	devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs);
-	if (IS_ERR(devlink))
-		goto out_dev;
-
-	mutex_lock(&devlink->lock);
-
-	if (!attrs[DEVLINK_ATTR_REGION_NAME] ||
-	    !attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID])
-		goto out_unlock;
-
-	region_name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]);
-	region = devlink_region_get_by_name(devlink, region_name);
-	if (!region)
-		goto out_unlock;
-
-	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
-			  &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI,
-			  DEVLINK_CMD_REGION_READ);
-	if (!hdr)
-		goto out_unlock;
-
-	err = devlink_nl_put_handle(skb, devlink);
-	if (err)
-		goto nla_put_failure;
-
-	err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name);
-	if (err)
-		goto nla_put_failure;
-
-	chunks_attr = nla_nest_start(skb, DEVLINK_ATTR_REGION_CHUNKS);
-	if (!chunks_attr)
-		goto nla_put_failure;
-
-	if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] &&
-	    attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) {
-		/* First pass of a windowed read starts at the given address. */
-		if (!start_offset)
-			start_offset =
-				nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
-
-		end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
-		end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]);
-		dump = false;
-	}
-
-	err = devlink_nl_region_read_snapshot_fill(skb, devlink,
-						   region, attrs,
-						   start_offset,
-						   end_offset, dump,
-						   &ret_offset);
-
-	/* -EMSGSIZE only means the skb is full; the next pass continues. */
-	if (err && err != -EMSGSIZE)
-		goto nla_put_failure;
-
-	/* Check if there was any progress done to prevent infinite loop */
-	if (ret_offset == start_offset)
-		goto nla_put_failure;
-
-	*((u64 *)&cb->args[0]) = ret_offset;
-
-	nla_nest_end(skb, chunks_attr);
-	genlmsg_end(skb, hdr);
-	mutex_unlock(&devlink->lock);
-	mutex_unlock(&devlink_mutex);
-
-	return skb->len;
-
-nla_put_failure:
-	genlmsg_cancel(skb, hdr);
-out_unlock:
-	mutex_unlock(&devlink->lock);
-out_dev:
-	mutex_unlock(&devlink_mutex);
-out:
-	return 0;
-}
-
-/* Netlink attribute policy referenced by the entries of devlink_nl_ops.
- * Attributes without an entry here (for example
- * DEVLINK_ATTR_PARAM_VALUE_DATA) have no type declared in this table.
- */
-static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
- [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
- [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
- [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 },
- [DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 },
- [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 },
- [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32 },
- [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16 },
- [DEVLINK_ATTR_SB_POOL_TYPE] = { .type = NLA_U8 },
- [DEVLINK_ATTR_SB_POOL_SIZE] = { .type = NLA_U32 },
- [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = { .type = NLA_U8 },
- [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32 },
- [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 },
- [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 },
- [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 },
- [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 },
- [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING },
- [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 },
- [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64},
- [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64},
- [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING },
- [DEVLINK_ATTR_PARAM_TYPE] = { .type = NLA_U8 },
- [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 },
- [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING },
- [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 },
-};
-
-/* Generic netlink operation table of the devlink family.
- *
- * Every entry uses devlink_nl_policy. Entries with GENL_ADMIN_PERM in .flags
- * are restricted to privileged users; the others carry a comment noting that
- * unprivileged users may issue them. .internal_flags holds DEVLINK_NL_FLAG_*
- * bits naming the objects each handler needs (devlink, port, sb), plus
- * DEVLINK_NL_FLAG_NO_LOCK on some entries.
- */
-static const struct genl_ops devlink_nl_ops[] = {
- {
- .cmd = DEVLINK_CMD_GET,
- .doit = devlink_nl_cmd_get_doit,
- .dumpit = devlink_nl_cmd_get_dumpit,
- .policy = devlink_nl_policy,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
- /* can be retrieved by unprivileged users */
- },
- {
- .cmd = DEVLINK_CMD_PORT_GET,
- .doit = devlink_nl_cmd_port_get_doit,
- .dumpit = devlink_nl_cmd_port_get_dumpit,
- .policy = devlink_nl_policy,
- .internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
- /* can be retrieved by unprivileged users */
- },
- {
- .cmd = DEVLINK_CMD_PORT_SET,
- .doit = devlink_nl_cmd_port_set_doit,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
- },
- {
- .cmd = DEVLINK_CMD_PORT_SPLIT,
- .doit = devlink_nl_cmd_port_split_doit,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NO_LOCK,
- },
- {
- .cmd = DEVLINK_CMD_PORT_UNSPLIT,
- .doit = devlink_nl_cmd_port_unsplit_doit,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NO_LOCK,
- },
- {
- .cmd = DEVLINK_CMD_SB_GET,
- .doit = devlink_nl_cmd_sb_get_doit,
- .dumpit = devlink_nl_cmd_sb_get_dumpit,
- .policy = devlink_nl_policy,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NEED_SB,
- /* can be retrieved by unprivileged users */
- },
- {
- .cmd = DEVLINK_CMD_SB_POOL_GET,
- .doit = devlink_nl_cmd_sb_pool_get_doit,
- .dumpit = devlink_nl_cmd_sb_pool_get_dumpit,
- .policy = devlink_nl_policy,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NEED_SB,
- /* can be retrieved by unprivileged users */
- },
- {
- .cmd = DEVLINK_CMD_SB_POOL_SET,
- .doit = devlink_nl_cmd_sb_pool_set_doit,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NEED_SB,
- },
- {
- .cmd = DEVLINK_CMD_SB_PORT_POOL_GET,
- .doit = devlink_nl_cmd_sb_port_pool_get_doit,
- .dumpit = devlink_nl_cmd_sb_port_pool_get_dumpit,
- .policy = devlink_nl_policy,
- .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
- DEVLINK_NL_FLAG_NEED_SB,
- /* can be retrieved by unprivileged users */
- },
- {
- .cmd = DEVLINK_CMD_SB_PORT_POOL_SET,
- .doit = devlink_nl_cmd_sb_port_pool_set_doit,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
- DEVLINK_NL_FLAG_NEED_SB,
- },
- {
- .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET,
- .doit = devlink_nl_cmd_sb_tc_pool_bind_get_doit,
- .dumpit = devlink_nl_cmd_sb_tc_pool_bind_get_dumpit,
- .policy = devlink_nl_policy,
- .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
- DEVLINK_NL_FLAG_NEED_SB,
- /* can be retrieved by unprivileged users */
- },
- {
- .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET,
- .doit = devlink_nl_cmd_sb_tc_pool_bind_set_doit,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_PORT |
- DEVLINK_NL_FLAG_NEED_SB,
- },
- {
- .cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT,
- .doit = devlink_nl_cmd_sb_occ_snapshot_doit,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NEED_SB,
- },
- {
- .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR,
- .doit = devlink_nl_cmd_sb_occ_max_clear_doit,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NEED_SB,
- },
- {
- .cmd = DEVLINK_CMD_ESWITCH_GET,
- .doit = devlink_nl_cmd_eswitch_get_doit,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
- },
- {
- .cmd = DEVLINK_CMD_ESWITCH_SET,
- .doit = devlink_nl_cmd_eswitch_set_doit,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NO_LOCK,
- },
- {
- .cmd = DEVLINK_CMD_DPIPE_TABLE_GET,
- .doit = devlink_nl_cmd_dpipe_table_get,
- .policy = devlink_nl_policy,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
- /* can be retrieved by unprivileged users */
- },
- {
- .cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET,
- .doit = devlink_nl_cmd_dpipe_entries_get,
- .policy = devlink_nl_policy,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
- /* can be retrieved by unprivileged users */
- },
- {
- .cmd = DEVLINK_CMD_DPIPE_HEADERS_GET,
- .doit = devlink_nl_cmd_dpipe_headers_get,
- .policy = devlink_nl_policy,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
- /* can be retrieved by unprivileged users */
- },
- {
- .cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
- .doit = devlink_nl_cmd_dpipe_table_counters_set,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
- },
- {
- .cmd = DEVLINK_CMD_RESOURCE_SET,
- .doit = devlink_nl_cmd_resource_set,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
- },
- {
- .cmd = DEVLINK_CMD_RESOURCE_DUMP,
- .doit = devlink_nl_cmd_resource_dump,
- .policy = devlink_nl_policy,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
- /* can be retrieved by unprivileged users */
- },
- {
- .cmd = DEVLINK_CMD_RELOAD,
- .doit = devlink_nl_cmd_reload,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
- DEVLINK_NL_FLAG_NO_LOCK,
- },
- {
- .cmd = DEVLINK_CMD_PARAM_GET,
- .doit = devlink_nl_cmd_param_get_doit,
- .dumpit = devlink_nl_cmd_param_get_dumpit,
- .policy = devlink_nl_policy,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
- /* can be retrieved by unprivileged users */
- },
- {
- .cmd = DEVLINK_CMD_PARAM_SET,
- .doit = devlink_nl_cmd_param_set_doit,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
- },
- {
- .cmd = DEVLINK_CMD_REGION_GET,
- .doit = devlink_nl_cmd_region_get_doit,
- .dumpit = devlink_nl_cmd_region_get_dumpit,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
- },
- {
- .cmd = DEVLINK_CMD_REGION_DEL,
- .doit = devlink_nl_cmd_region_del,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
- },
- {
- .cmd = DEVLINK_CMD_REGION_READ,
- .dumpit = devlink_nl_cmd_region_read_dumpit,
- .policy = devlink_nl_policy,
- .flags = GENL_ADMIN_PERM,
- .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK,
- },
-};
-
-/* Generic netlink family descriptor for devlink: ties together the family
- * name and version, the operation table, the multicast groups and the
- * pre/post doit hooks that run around every doit handler.
- */
-static struct genl_family devlink_nl_family __ro_after_init = {
- .name = DEVLINK_GENL_NAME,
- .version = DEVLINK_GENL_VERSION,
- .maxattr = DEVLINK_ATTR_MAX,
- .netnsok = true,
- .pre_doit = devlink_nl_pre_doit,
- .post_doit = devlink_nl_post_doit,
- .module = THIS_MODULE,
- .ops = devlink_nl_ops,
- .n_ops = ARRAY_SIZE(devlink_nl_ops),
- .mcgrps = devlink_nl_mcgrps,
- .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps),
-};
-
-/**
- * devlink_alloc - Allocate new devlink instance resources
- *
- * @ops: ops
- * @priv_size: size of user private data
- *
- * Allocate a zeroed devlink instance followed by @priv_size bytes of
- * private data, attach it to the initial network namespace and initialize
- * its object lists and lock.
- *
- * Return: the new instance, or NULL if the allocation fails.
- */
-struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
-{
- struct devlink *devlink;
-
- devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL);
- if (!devlink)
- return NULL;
- devlink->ops = ops;
- devlink_net_set(devlink, &init_net);
- INIT_LIST_HEAD(&devlink->port_list);
- INIT_LIST_HEAD(&devlink->sb_list);
- INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
- INIT_LIST_HEAD(&devlink->resource_list);
- INIT_LIST_HEAD(&devlink->param_list);
- INIT_LIST_HEAD(&devlink->region_list);
- mutex_init(&devlink->lock);
- return devlink;
-}
-EXPORT_SYMBOL_GPL(devlink_alloc);
-
-/**
- * devlink_register - Register devlink instance
- *
- * @devlink: devlink
- * @dev: parent device, stored in the instance
- *
- * Adds the instance to the global devlink list under devlink_mutex and
- * sends a DEVLINK_CMD_NEW notification.
- *
- * Return: always 0.
- */
-int devlink_register(struct devlink *devlink, struct device *dev)
-{
- mutex_lock(&devlink_mutex);
- devlink->dev = dev;
- list_add_tail(&devlink->list, &devlink_list);
- devlink_notify(devlink, DEVLINK_CMD_NEW);
- mutex_unlock(&devlink_mutex);
- return 0;
-}
-EXPORT_SYMBOL_GPL(devlink_register);
-
-/**
- * devlink_unregister - Unregister devlink instance
- *
- * @devlink: devlink
- *
- * Sends a DEVLINK_CMD_DEL notification and removes the instance from the
- * global devlink list, both under devlink_mutex. The instance itself is
- * not freed here.
- */
-void devlink_unregister(struct devlink *devlink)
-{
- mutex_lock(&devlink_mutex);
- devlink_notify(devlink, DEVLINK_CMD_DEL);
- list_del(&devlink->list);
- mutex_unlock(&devlink_mutex);
-}
-EXPORT_SYMBOL_GPL(devlink_unregister);
-
-/**
- * devlink_free - Free devlink instance resources
- *
- * @devlink: devlink
- *
- * Releases the memory of the instance with kfree(); nothing else is torn
- * down here.
- */
-void devlink_free(struct devlink *devlink)
-{
- kfree(devlink);
-}
-EXPORT_SYMBOL_GPL(devlink_free);
-
-/**
- * devlink_port_register - Register devlink port
- *
- * @devlink: devlink
- * @devlink_port: devlink port
- * @port_index: index under which the port is registered; must be unique
- * within @devlink
- *
- * Register devlink port with provided port index. User can use
- * any indexing, even hw-related one. devlink_port structure
- * is convenient to be embedded inside user driver private structure.
- * Note that the caller should take care of zeroing the devlink_port
- * structure.
- *
- * Return: 0 on success, -EEXIST if @port_index is already in use.
- */
-int devlink_port_register(struct devlink *devlink,
- struct devlink_port *devlink_port,
- unsigned int port_index)
-{
- mutex_lock(&devlink->lock);
- if (devlink_port_index_exists(devlink, port_index)) {
- mutex_unlock(&devlink->lock);
- return -EEXIST;
- }
- devlink_port->devlink = devlink;
- devlink_port->index = port_index;
- devlink_port->registered = true;
- list_add_tail(&devlink_port->list, &devlink->port_list);
- mutex_unlock(&devlink->lock);
- /* The notification is sent after the instance lock is dropped. */
- devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
- return 0;
-}
-EXPORT_SYMBOL_GPL(devlink_port_register);
-
-/**
- * devlink_port_unregister - Unregister devlink port
- *
- * @devlink_port: devlink port
- *
- * Sends a DEVLINK_CMD_PORT_DEL notification, then removes the port from
- * its devlink's port list under the instance lock.
- */
-void devlink_port_unregister(struct devlink_port *devlink_port)
-{
- struct devlink *devlink = devlink_port->devlink;
-
- devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
- mutex_lock(&devlink->lock);
- list_del(&devlink_port->list);
- mutex_unlock(&devlink->lock);
-}
-EXPORT_SYMBOL_GPL(devlink_port_unregister);
-
-/* Record the port's @type and the type-specific device pointer @type_dev,
- * then send a DEVLINK_CMD_PORT_NEW notification for the port.
- */
-static void __devlink_port_type_set(struct devlink_port *devlink_port,
- enum devlink_port_type type,
- void *type_dev)
-{
- devlink_port->type = type;
- devlink_port->type_dev = type_dev;
- devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
-}
-
-/**
- * devlink_port_type_eth_set - Set port type to Ethernet
- *
- * @devlink_port: devlink port
- * @netdev: related netdevice
- *
- * Stores DEVLINK_PORT_TYPE_ETH together with @netdev in the port and
- * notifies user space through __devlink_port_type_set().
- */
-void devlink_port_type_eth_set(struct devlink_port *devlink_port,
-			       struct net_device *netdev)
-{
-	__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, netdev);
-}
-EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
-
-/**
- * devlink_port_type_ib_set - Set port type to InfiniBand
- *
- * @devlink_port: devlink port
- * @ibdev: related IB device
- *
- * Stores DEVLINK_PORT_TYPE_IB together with @ibdev in the port and
- * notifies user space through __devlink_port_type_set().
- */
-void devlink_port_type_ib_set(struct devlink_port *devlink_port,
-			      struct ib_device *ibdev)
-{
-	__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev);
-}
-EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
-
-/**
- * devlink_port_type_clear - Clear port type
- *
- * @devlink_port: devlink port
- *
- * Resets the port to DEVLINK_PORT_TYPE_NOTSET with no type device and
- * notifies user space through __devlink_port_type_set().
- */
-void devlink_port_type_clear(struct devlink_port *devlink_port)
-{
-	__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL);
-}
-EXPORT_SYMBOL_GPL(devlink_port_type_clear);
-
-/**
- * devlink_port_attrs_set - Set port attributes
- *
- * @devlink_port: devlink port
- * @flavour: flavour of the port
- * @port_number: number of the port that is facing user, for example
- * the front panel port number
- * @split: indicates if this is split port
- * @split_subport_number: if the port is split, this is the number
- * of subport.
- *
- * Stores the attributes in the port, marks them as set and sends a
- * DEVLINK_CMD_PORT_NEW notification.
- */
-void devlink_port_attrs_set(struct devlink_port *devlink_port,
- enum devlink_port_flavour flavour,
- u32 port_number, bool split,
- u32 split_subport_number)
-{
- struct devlink_port_attrs *attrs = &devlink_port->attrs;
-
- attrs->set = true;
- attrs->flavour = flavour;
- attrs->port_number = port_number;
- attrs->split = split;
- attrs->split_subport_number = split_subport_number;
- devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
-}
-EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
-
-/* Format the physical port name of @devlink_port into @name (@len bytes).
- *
- * Physical ports are named "p<port>" or, when split, "p<port>s<subport>".
- * Returns 0 on success, -EOPNOTSUPP if the port attributes were never set,
- * and -EINVAL for CPU/DSA flavours or when the name does not fit in @len.
- */
-int devlink_port_get_phys_port_name(struct devlink_port *devlink_port,
-				    char *name, size_t len)
-{
-	struct devlink_port_attrs *attrs = &devlink_port->attrs;
-	int written = 0;
-
-	if (!attrs->set)
-		return -EOPNOTSUPP;
-
-	switch (attrs->flavour) {
-	case DEVLINK_PORT_FLAVOUR_PHYSICAL:
-		if (attrs->split)
-			written = snprintf(name, len, "p%us%u",
-					   attrs->port_number,
-					   attrs->split_subport_number);
-		else
-			written = snprintf(name, len, "p%u",
-					   attrs->port_number);
-		break;
-	case DEVLINK_PORT_FLAVOUR_CPU:
-	case DEVLINK_PORT_FLAVOUR_DSA:
-		/* As CPU and DSA ports do not have a netdevice associated
-		 * case should not ever happen.
-		 */
-		WARN_ON(1);
-		return -EINVAL;
-	}
-
-	/* snprintf() reports the untruncated length; reject truncation. */
-	return written >= len ? -EINVAL : 0;
-}
-EXPORT_SYMBOL_GPL(devlink_port_get_phys_port_name);
-
-/* Register a shared buffer with index @sb_index on @devlink, recording its
- * size and its ingress/egress pool and traffic-class counts.
- * Returns 0 on success, -EEXIST if the index is already registered, -ENOMEM
- * if the descriptor cannot be allocated.
- */
-int devlink_sb_register(struct devlink *devlink, unsigned int sb_index,
-			u32 size, u16 ingress_pools_count,
-			u16 egress_pools_count, u16 ingress_tc_count,
-			u16 egress_tc_count)
-{
-	struct devlink_sb *sb;
-	int err;
-
-	mutex_lock(&devlink->lock);
-
-	err = -EEXIST;
-	if (devlink_sb_index_exists(devlink, sb_index))
-		goto out;
-
-	err = -ENOMEM;
-	sb = kzalloc(sizeof(*sb), GFP_KERNEL);
-	if (!sb)
-		goto out;
-
-	sb->index = sb_index;
-	sb->size = size;
-	sb->ingress_pools_count = ingress_pools_count;
-	sb->egress_pools_count = egress_pools_count;
-	sb->ingress_tc_count = ingress_tc_count;
-	sb->egress_tc_count = egress_tc_count;
-	list_add_tail(&sb->list, &devlink->sb_list);
-	err = 0;
-out:
-	mutex_unlock(&devlink->lock);
-	return err;
-}
-EXPORT_SYMBOL_GPL(devlink_sb_register);
-
-void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
-{
- struct devlink_sb *devlink_sb;
-
- mutex_lock(&devlink->lock);
- devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
- WARN_ON(!devlink_sb);
- list_del(&devlink_sb->list);
- mutex_unlock(&devlink->lock);
- kfree(devlink_sb);
-}
-EXPORT_SYMBOL_GPL(devlink_sb_unregister);
-
-/**
- * devlink_dpipe_headers_register - register dpipe headers
- *
- * @devlink: devlink
- * @dpipe_headers: dpipe header array
- *
- * Register the headers supported by hardware.
- */
-int devlink_dpipe_headers_register(struct devlink *devlink,
- struct devlink_dpipe_headers *dpipe_headers)
-{
- mutex_lock(&devlink->lock);
- devlink->dpipe_headers = dpipe_headers;
- mutex_unlock(&devlink->lock);
- return 0;
-}
-EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register);
-
-/**
- * devlink_dpipe_headers_unregister - unregister dpipe headers
- *
- * @devlink: devlink
- *
- * Unregister the headers supported by hardware.
- */
-void devlink_dpipe_headers_unregister(struct devlink *devlink)
-{
- mutex_lock(&devlink->lock);
- devlink->dpipe_headers = NULL;
- mutex_unlock(&devlink->lock);
-}
-EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister);
-
-/**
- * devlink_dpipe_table_counter_enabled - check if counter allocation
- * required
- * @devlink: devlink
- * @table_name: tables name
- *
- * Used by driver to check if counter allocation is required.
- * After counter allocation is turned on the table entries
- * are updated to include counter statistics.
- *
- * After that point on the driver must respect the counter
- * state so that each entry added to the table is added
- * with a counter.
- */
-bool devlink_dpipe_table_counter_enabled(struct devlink *devlink,
- const char *table_name)
-{
- struct devlink_dpipe_table *table;
- bool enabled;
-
- rcu_read_lock();
- table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
- table_name);
- enabled = false;
- if (table)
- enabled = table->counters_enabled;
- rcu_read_unlock();
- return enabled;
-}
-EXPORT_SYMBOL_GPL(devlink_dpipe_table_counter_enabled);
-
-/**
- * devlink_dpipe_table_register - register dpipe table
- *
- * @devlink: devlink
- * @table_name: table name
- * @table_ops: table ops
- * @priv: priv
- * @counter_control_extern: external control for counters
- */
-int devlink_dpipe_table_register(struct devlink *devlink,
- const char *table_name,
- struct devlink_dpipe_table_ops *table_ops,
- void *priv, bool counter_control_extern)
-{
- struct devlink_dpipe_table *table;
-
- if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name))
- return -EEXIST;
-
- if (WARN_ON(!table_ops->size_get))
- return -EINVAL;
-
- table = kzalloc(sizeof(*table), GFP_KERNEL);
- if (!table)
- return -ENOMEM;
-
- table->name = table_name;
- table->table_ops = table_ops;
- table->priv = priv;
- table->counter_control_extern = counter_control_extern;
-
- mutex_lock(&devlink->lock);
- list_add_tail_rcu(&table->list, &devlink->dpipe_table_list);
- mutex_unlock(&devlink->lock);
- return 0;
-}
-EXPORT_SYMBOL_GPL(devlink_dpipe_table_register);
-
-/**
- * devlink_dpipe_table_unregister - unregister dpipe table
- *
- * @devlink: devlink
- * @table_name: table name
- */
-void devlink_dpipe_table_unregister(struct devlink *devlink,
- const char *table_name)
-{
- struct devlink_dpipe_table *table;
-
- mutex_lock(&devlink->lock);
- table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
- table_name);
- if (!table)
- goto unlock;
- list_del_rcu(&table->list);
- mutex_unlock(&devlink->lock);
- kfree_rcu(table, rcu);
- return;
-unlock:
- mutex_unlock(&devlink->lock);
-}
-EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister);
-
-/**
- * devlink_resource_register - devlink resource register
- *
- * @devlink: devlink
- * @resource_name: resource's name
- * @top_hierarchy: top hierarchy
- * @reload_required: reload is required for new configuration to
- * apply
- * @resource_size: resource's size
- * @resource_id: resource's id
- * @parent_reosurce_id: resource's parent id
- * @size params: size parameters
- */
-int devlink_resource_register(struct devlink *devlink,
- const char *resource_name,
- u64 resource_size,
- u64 resource_id,
- u64 parent_resource_id,
- const struct devlink_resource_size_params *size_params)
-{
- struct devlink_resource *resource;
- struct list_head *resource_list;
- bool top_hierarchy;
- int err = 0;
-
- top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP;
-
- mutex_lock(&devlink->lock);
- resource = devlink_resource_find(devlink, NULL, resource_id);
- if (resource) {
- err = -EINVAL;
- goto out;
- }
-
- resource = kzalloc(sizeof(*resource), GFP_KERNEL);
- if (!resource) {
- err = -ENOMEM;
- goto out;
- }
-
- if (top_hierarchy) {
- resource_list = &devlink->resource_list;
- } else {
- struct devlink_resource *parent_resource;
-
- parent_resource = devlink_resource_find(devlink, NULL,
- parent_resource_id);
- if (parent_resource) {
- resource_list = &parent_resource->resource_list;
- resource->parent = parent_resource;
- } else {
- kfree(resource);
- err = -EINVAL;
- goto out;
- }
- }
-
- resource->name = resource_name;
- resource->size = resource_size;
- resource->size_new = resource_size;
- resource->id = resource_id;
- resource->size_valid = true;
- memcpy(&resource->size_params, size_params,
- sizeof(resource->size_params));
- INIT_LIST_HEAD(&resource->resource_list);
- list_add_tail(&resource->list, resource_list);
-out:
- mutex_unlock(&devlink->lock);
- return err;
-}
-EXPORT_SYMBOL_GPL(devlink_resource_register);
-
-/**
- * devlink_resources_unregister - free all resources
- *
- * @devlink: devlink
- * @resource: resource
- */
-void devlink_resources_unregister(struct devlink *devlink,
- struct devlink_resource *resource)
-{
- struct devlink_resource *tmp, *child_resource;
- struct list_head *resource_list;
-
- if (resource)
- resource_list = &resource->resource_list;
- else
- resource_list = &devlink->resource_list;
-
- if (!resource)
- mutex_lock(&devlink->lock);
-
- list_for_each_entry_safe(child_resource, tmp, resource_list, list) {
- devlink_resources_unregister(devlink, child_resource);
- list_del(&child_resource->list);
- kfree(child_resource);
- }
-
- if (!resource)
- mutex_unlock(&devlink->lock);
-}
-EXPORT_SYMBOL_GPL(devlink_resources_unregister);
-
-/**
- * devlink_resource_size_get - get and update size
- *
- * @devlink: devlink
- * @resource_id: the requested resource id
- * @p_resource_size: ptr to update
- */
-int devlink_resource_size_get(struct devlink *devlink,
- u64 resource_id,
- u64 *p_resource_size)
-{
- struct devlink_resource *resource;
- int err = 0;
-
- mutex_lock(&devlink->lock);
- resource = devlink_resource_find(devlink, NULL, resource_id);
- if (!resource) {
- err = -EINVAL;
- goto out;
- }
- *p_resource_size = resource->size_new;
- resource->size = resource->size_new;
-out:
- mutex_unlock(&devlink->lock);
- return err;
-}
-EXPORT_SYMBOL_GPL(devlink_resource_size_get);
-
-/**
- * devlink_dpipe_table_resource_set - set the resource id
- *
- * @devlink: devlink
- * @table_name: table name
- * @resource_id: resource id
- * @resource_units: number of resource's units consumed per table's entry
- */
-int devlink_dpipe_table_resource_set(struct devlink *devlink,
- const char *table_name, u64 resource_id,
- u64 resource_units)
-{
- struct devlink_dpipe_table *table;
- int err = 0;
-
- mutex_lock(&devlink->lock);
- table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
- table_name);
- if (!table) {
- err = -EINVAL;
- goto out;
- }
- table->resource_id = resource_id;
- table->resource_units = resource_units;
- table->resource_valid = true;
-out:
- mutex_unlock(&devlink->lock);
- return err;
-}
-EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set);
-
-/**
- * devlink_resource_occ_get_register - register occupancy getter
- *
- * @devlink: devlink
- * @resource_id: resource id
- * @occ_get: occupancy getter callback
- * @occ_get_priv: occupancy getter callback priv
- */
-void devlink_resource_occ_get_register(struct devlink *devlink,
- u64 resource_id,
- devlink_resource_occ_get_t *occ_get,
- void *occ_get_priv)
-{
- struct devlink_resource *resource;
-
- mutex_lock(&devlink->lock);
- resource = devlink_resource_find(devlink, NULL, resource_id);
- if (WARN_ON(!resource))
- goto out;
- WARN_ON(resource->occ_get);
-
- resource->occ_get = occ_get;
- resource->occ_get_priv = occ_get_priv;
-out:
- mutex_unlock(&devlink->lock);
-}
-EXPORT_SYMBOL_GPL(devlink_resource_occ_get_register);
-
-/**
- * devlink_resource_occ_get_unregister - unregister occupancy getter
- *
- * @devlink: devlink
- * @resource_id: resource id
- */
-void devlink_resource_occ_get_unregister(struct devlink *devlink,
- u64 resource_id)
-{
- struct devlink_resource *resource;
-
- mutex_lock(&devlink->lock);
- resource = devlink_resource_find(devlink, NULL, resource_id);
- if (WARN_ON(!resource))
- goto out;
- WARN_ON(!resource->occ_get);
-
- resource->occ_get = NULL;
- resource->occ_get_priv = NULL;
-out:
- mutex_unlock(&devlink->lock);
-}
-EXPORT_SYMBOL_GPL(devlink_resource_occ_get_unregister);
-
-/**
- * devlink_params_register - register configuration parameters
- *
- * @devlink: devlink
- * @params: configuration parameters array
- * @params_count: number of parameters provided
- *
- * Register the configuration parameters supported by the driver.
- */
-int devlink_params_register(struct devlink *devlink,
- const struct devlink_param *params,
- size_t params_count)
-{
- const struct devlink_param *param = params;
- int i;
- int err;
-
- mutex_lock(&devlink->lock);
- for (i = 0; i < params_count; i++, param++) {
- if (!param || !param->name || !param->supported_cmodes) {
- err = -EINVAL;
- goto rollback;
- }
- if (param->generic) {
- err = devlink_param_generic_verify(param);
- if (err)
- goto rollback;
- } else {
- err = devlink_param_driver_verify(param);
- if (err)
- goto rollback;
- }
- err = devlink_param_register_one(devlink, param);
- if (err)
- goto rollback;
- }
-
- mutex_unlock(&devlink->lock);
- return 0;
-
-rollback:
- if (!i)
- goto unlock;
- for (param--; i > 0; i--, param--)
- devlink_param_unregister_one(devlink, param);
-unlock:
- mutex_unlock(&devlink->lock);
- return err;
-}
-EXPORT_SYMBOL_GPL(devlink_params_register);
-
-/**
- * devlink_params_unregister - unregister configuration parameters
- * @devlink: devlink
- * @params: configuration parameters to unregister
- * @params_count: number of parameters provided
- */
-void devlink_params_unregister(struct devlink *devlink,
- const struct devlink_param *params,
- size_t params_count)
-{
- const struct devlink_param *param = params;
- int i;
-
- mutex_lock(&devlink->lock);
- for (i = 0; i < params_count; i++, param++)
- devlink_param_unregister_one(devlink, param);
- mutex_unlock(&devlink->lock);
-}
-EXPORT_SYMBOL_GPL(devlink_params_unregister);
-
-/**
- * devlink_param_driverinit_value_get - get configuration parameter
- * value for driver initializing
- *
- * @devlink: devlink
- * @param_id: parameter ID
- * @init_val: value of parameter in driverinit configuration mode
- *
- * This function should be used by the driver to get driverinit
- * configuration for initialization after reload command.
- */
-int devlink_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
- union devlink_param_value *init_val)
-{
- struct devlink_param_item *param_item;
-
- if (!devlink->ops || !devlink->ops->reload)
- return -EOPNOTSUPP;
-
- param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
- if (!param_item)
- return -EINVAL;
-
- if (!param_item->driverinit_value_valid ||
- !devlink_param_cmode_is_supported(param_item->param,
- DEVLINK_PARAM_CMODE_DRIVERINIT))
- return -EOPNOTSUPP;
-
- if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING)
- strcpy(init_val->vstr, param_item->driverinit_value.vstr);
- else
- *init_val = param_item->driverinit_value;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get);
-
-/**
- * devlink_param_driverinit_value_set - set value of configuration
- * parameter for driverinit
- * configuration mode
- *
- * @devlink: devlink
- * @param_id: parameter ID
- * @init_val: value of parameter to set for driverinit configuration mode
- *
- * This function should be used by the driver to set driverinit
- * configuration mode default value.
- */
-int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
- union devlink_param_value init_val)
-{
- struct devlink_param_item *param_item;
-
- param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
- if (!param_item)
- return -EINVAL;
-
- if (!devlink_param_cmode_is_supported(param_item->param,
- DEVLINK_PARAM_CMODE_DRIVERINIT))
- return -EOPNOTSUPP;
-
- if (param_item->param->type == DEVLINK_PARAM_TYPE_STRING)
- strcpy(param_item->driverinit_value.vstr, init_val.vstr);
- else
- param_item->driverinit_value = init_val;
- param_item->driverinit_value_valid = true;
-
- devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
- return 0;
-}
-EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_set);
-
-/**
- * devlink_param_value_changed - notify devlink on a parameter's value
- * change. Should be called by the driver
- * right after the change.
- *
- * @devlink: devlink
- * @param_id: parameter ID
- *
- * This function should be used by the driver to notify devlink on value
- * change, excluding driverinit configuration mode.
- * For driverinit configuration mode driver should use the function
- * devlink_param_driverinit_value_set() instead.
- */
-void devlink_param_value_changed(struct devlink *devlink, u32 param_id)
-{
- struct devlink_param_item *param_item;
-
- param_item = devlink_param_find_by_id(&devlink->param_list, param_id);
- WARN_ON(!param_item);
-
- devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW);
-}
-EXPORT_SYMBOL_GPL(devlink_param_value_changed);
-
-/**
- * devlink_param_value_str_fill - Safely fill-up the string preventing
- * from overflow of the preallocated buffer
- *
- * @dst_val: destination devlink_param_value
- * @src: source buffer
- */
-void devlink_param_value_str_fill(union devlink_param_value *dst_val,
- const char *src)
-{
- size_t len;
-
- len = strlcpy(dst_val->vstr, src, __DEVLINK_PARAM_MAX_STRING_VALUE);
- WARN_ON(len >= __DEVLINK_PARAM_MAX_STRING_VALUE);
-}
-EXPORT_SYMBOL_GPL(devlink_param_value_str_fill);
-
-/**
- * devlink_region_create - create a new address region
- *
- * @devlink: devlink
- * @region_name: region name
- * @region_max_snapshots: Maximum supported number of snapshots for region
- * @region_size: size of region
- */
-struct devlink_region *devlink_region_create(struct devlink *devlink,
- const char *region_name,
- u32 region_max_snapshots,
- u64 region_size)
-{
- struct devlink_region *region;
- int err = 0;
-
- mutex_lock(&devlink->lock);
-
- if (devlink_region_get_by_name(devlink, region_name)) {
- err = -EEXIST;
- goto unlock;
- }
-
- region = kzalloc(sizeof(*region), GFP_KERNEL);
- if (!region) {
- err = -ENOMEM;
- goto unlock;
- }
-
- region->devlink = devlink;
- region->max_snapshots = region_max_snapshots;
- region->name = region_name;
- region->size = region_size;
- INIT_LIST_HEAD(&region->snapshot_list);
- list_add_tail(&region->list, &devlink->region_list);
- devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
-
- mutex_unlock(&devlink->lock);
- return region;
-
-unlock:
- mutex_unlock(&devlink->lock);
- return ERR_PTR(err);
-}
-EXPORT_SYMBOL_GPL(devlink_region_create);
-
-/**
- * devlink_region_destroy - destroy address region
- *
- * @region: devlink region to destroy
- */
-void devlink_region_destroy(struct devlink_region *region)
-{
- struct devlink *devlink = region->devlink;
- struct devlink_snapshot *snapshot, *ts;
-
- mutex_lock(&devlink->lock);
-
- /* Free all snapshots of region */
- list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list)
- devlink_region_snapshot_del(snapshot);
-
- list_del(&region->list);
-
- devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL);
- mutex_unlock(&devlink->lock);
- kfree(region);
-}
-EXPORT_SYMBOL_GPL(devlink_region_destroy);
-
-/**
- * devlink_region_shapshot_id_get - get snapshot ID
- *
- * This callback should be called when adding a new snapshot,
- * Driver should use the same id for multiple snapshots taken
- * on multiple regions at the same time/by the same trigger.
- *
- * @devlink: devlink
- */
-u32 devlink_region_shapshot_id_get(struct devlink *devlink)
-{
- u32 id;
-
- mutex_lock(&devlink->lock);
- id = ++devlink->snapshot_id;
- mutex_unlock(&devlink->lock);
-
- return id;
-}
-EXPORT_SYMBOL_GPL(devlink_region_shapshot_id_get);
-
-/**
- * devlink_region_snapshot_create - create a new snapshot
- * This will add a new snapshot of a region. The snapshot
- * will be stored on the region struct and can be accessed
- * from devlink. This is useful for future analyses of snapshots.
- * Multiple snapshots can be created on a region.
- * The @snapshot_id should be obtained using the getter function.
- *
- * @devlink_region: devlink region of the snapshot
- * @data_len: size of snapshot data
- * @data: snapshot data
- * @snapshot_id: snapshot id to be created
- * @data_destructor: pointer to destructor function to free data
- */
-int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len,
- u8 *data, u32 snapshot_id,
- devlink_snapshot_data_dest_t *data_destructor)
-{
- struct devlink *devlink = region->devlink;
- struct devlink_snapshot *snapshot;
- int err;
-
- mutex_lock(&devlink->lock);
-
- /* check if region can hold one more snapshot */
- if (region->cur_snapshots == region->max_snapshots) {
- err = -ENOMEM;
- goto unlock;
- }
-
- if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
- err = -EEXIST;
- goto unlock;
- }
-
- snapshot = kzalloc(sizeof(*snapshot), GFP_KERNEL);
- if (!snapshot) {
- err = -ENOMEM;
- goto unlock;
- }
-
- snapshot->id = snapshot_id;
- snapshot->region = region;
- snapshot->data = data;
- snapshot->data_len = data_len;
- snapshot->data_destructor = data_destructor;
-
- list_add_tail(&snapshot->list, &region->snapshot_list);
-
- region->cur_snapshots++;
-
- devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_NEW);
- mutex_unlock(&devlink->lock);
- return 0;
-
-unlock:
- mutex_unlock(&devlink->lock);
- return err;
-}
-EXPORT_SYMBOL_GPL(devlink_region_snapshot_create);
-
-static int __init devlink_module_init(void)
-{
- return genl_register_family(&devlink_nl_family);
-}
-
-static void __exit devlink_module_exit(void)
-{
- genl_unregister_family(&devlink_nl_family);
-}
-
-module_init(devlink_module_init);
-module_exit(devlink_module_exit);
-
-MODULE_LICENSE("GPL v2");
-MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
-MODULE_DESCRIPTION("Network physical device Netlink interface");
-MODULE_ALIAS_GENL_FAMILY(DEVLINK_GENL_NAME);
diff --git a/net/core/devmem.c b/net/core/devmem.c
new file mode 100644
index 000000000000..ec4217d6c0b4
--- /dev/null
+++ b/net/core/devmem.c
@@ -0,0 +1,522 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Devmem TCP
+ *
+ * Authors: Mina Almasry <almasrymina@google.com>
+ * Willem de Bruijn <willemdebruijn.kernel@gmail.com>
+ * Kaiyuan Zhang <kaiyuanz@google.com>
+ */
+
+#include <linux/dma-buf.h>
+#include <linux/genalloc.h>
+#include <linux/mm.h>
+#include <linux/netdevice.h>
+#include <linux/types.h>
+#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
+#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <trace/events/page_pool.h>
+
+#include "devmem.h"
+#include "mp_dmabuf_devmem.h"
+#include "page_pool_priv.h"
+
+/* Device memory support */
+
+static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1);
+
+static const struct memory_provider_ops dmabuf_devmem_ops;
+
+bool net_is_devmem_iov(struct net_iov *niov)
+{
+ return niov->type == NET_IOV_DMABUF;
+}
+
+static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
+ struct gen_pool_chunk *chunk,
+ void *not_used)
+{
+ struct dmabuf_genpool_chunk_owner *owner = chunk->owner;
+
+ kvfree(owner->area.niovs);
+ kfree(owner);
+}
+
+static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
+{
+ struct dmabuf_genpool_chunk_owner *owner;
+
+ owner = net_devmem_iov_to_chunk_owner(niov);
+ return owner->base_dma_addr +
+ ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
+}
+
+void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
+{
+ struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w);
+
+ size_t size, avail;
+
+ gen_pool_for_each_chunk(binding->chunk_pool,
+ net_devmem_dmabuf_free_chunk_owner, NULL);
+
+ size = gen_pool_size(binding->chunk_pool);
+ avail = gen_pool_avail(binding->chunk_pool);
+
+ if (!WARN(size != avail, "can't destroy genpool. size=%zu, avail=%zu",
+ size, avail))
+ gen_pool_destroy(binding->chunk_pool);
+
+ dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
+ binding->direction);
+ dma_buf_detach(binding->dmabuf, binding->attachment);
+ dma_buf_put(binding->dmabuf);
+ xa_destroy(&binding->bound_rxqs);
+ kvfree(binding->tx_vec);
+ kfree(binding);
+}
+
+struct net_iov *
+net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
+{
+ struct dmabuf_genpool_chunk_owner *owner;
+ unsigned long dma_addr;
+ struct net_iov *niov;
+ ssize_t offset;
+ ssize_t index;
+
+ dma_addr = gen_pool_alloc_owner(binding->chunk_pool, PAGE_SIZE,
+ (void **)&owner);
+ if (!dma_addr)
+ return NULL;
+
+ offset = dma_addr - owner->base_dma_addr;
+ index = offset / PAGE_SIZE;
+ niov = &owner->area.niovs[index];
+
+ niov->desc.pp_magic = 0;
+ niov->desc.pp = NULL;
+ atomic_long_set(&niov->desc.pp_ref_count, 0);
+
+ return niov;
+}
+
+void net_devmem_free_dmabuf(struct net_iov *niov)
+{
+ struct net_devmem_dmabuf_binding *binding = net_devmem_iov_binding(niov);
+ unsigned long dma_addr = net_devmem_get_dma_addr(niov);
+
+ if (WARN_ON(!gen_pool_has_addr(binding->chunk_pool, dma_addr,
+ PAGE_SIZE)))
+ return;
+
+ gen_pool_free(binding->chunk_pool, dma_addr, PAGE_SIZE);
+}
+
+void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
+{
+ struct netdev_rx_queue *rxq;
+ unsigned long xa_idx;
+ unsigned int rxq_idx;
+
+ xa_erase(&net_devmem_dmabuf_bindings, binding->id);
+
+ /* Ensure no tx net_devmem_lookup_dmabuf() are in flight after the
+ * erase.
+ */
+ synchronize_net();
+
+ if (binding->list.next)
+ list_del(&binding->list);
+
+ xa_for_each(&binding->bound_rxqs, xa_idx, rxq) {
+ const struct pp_memory_provider_params mp_params = {
+ .mp_priv = binding,
+ .mp_ops = &dmabuf_devmem_ops,
+ };
+
+ rxq_idx = get_netdev_rx_queue_index(rxq);
+
+ __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params);
+ }
+
+ net_devmem_dmabuf_binding_put(binding);
+}
+
+int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
+ struct net_devmem_dmabuf_binding *binding,
+ struct netlink_ext_ack *extack)
+{
+ struct pp_memory_provider_params mp_params = {
+ .mp_priv = binding,
+ .mp_ops = &dmabuf_devmem_ops,
+ };
+ struct netdev_rx_queue *rxq;
+ u32 xa_idx;
+ int err;
+
+ err = __net_mp_open_rxq(dev, rxq_idx, &mp_params, extack);
+ if (err)
+ return err;
+
+ rxq = __netif_get_rx_queue(dev, rxq_idx);
+ err = xa_alloc(&binding->bound_rxqs, &xa_idx, rxq, xa_limit_32b,
+ GFP_KERNEL);
+ if (err)
+ goto err_close_rxq;
+
+ return 0;
+
+err_close_rxq:
+ __net_mp_close_rxq(dev, rxq_idx, &mp_params);
+ return err;
+}
+
+struct net_devmem_dmabuf_binding *
+net_devmem_bind_dmabuf(struct net_device *dev,
+ struct device *dma_dev,
+ enum dma_data_direction direction,
+ unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
+ struct netlink_ext_ack *extack)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ static u32 id_alloc_next;
+ struct scatterlist *sg;
+ struct dma_buf *dmabuf;
+ unsigned int sg_idx, i;
+ unsigned long virtual;
+ int err;
+
+ if (!dma_dev) {
+ NL_SET_ERR_MSG(extack, "Device doesn't support DMA");
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ dmabuf = dma_buf_get(dmabuf_fd);
+ if (IS_ERR(dmabuf))
+ return ERR_CAST(dmabuf);
+
+ binding = kzalloc_node(sizeof(*binding), GFP_KERNEL,
+ dev_to_node(&dev->dev));
+ if (!binding) {
+ err = -ENOMEM;
+ goto err_put_dmabuf;
+ }
+
+ binding->dev = dev;
+ xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
+
+ refcount_set(&binding->ref, 1);
+
+ mutex_init(&binding->lock);
+
+ binding->dmabuf = dmabuf;
+ binding->direction = direction;
+
+ binding->attachment = dma_buf_attach(binding->dmabuf, dma_dev);
+ if (IS_ERR(binding->attachment)) {
+ err = PTR_ERR(binding->attachment);
+ NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
+ goto err_free_binding;
+ }
+
+ binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
+ direction);
+ if (IS_ERR(binding->sgt)) {
+ err = PTR_ERR(binding->sgt);
+ NL_SET_ERR_MSG(extack, "Failed to map dmabuf attachment");
+ goto err_detach;
+ }
+
+ if (direction == DMA_TO_DEVICE) {
+ binding->tx_vec = kvmalloc_array(dmabuf->size / PAGE_SIZE,
+ sizeof(struct net_iov *),
+ GFP_KERNEL);
+ if (!binding->tx_vec) {
+ err = -ENOMEM;
+ goto err_unmap;
+ }
+ }
+
+ /* For simplicity we expect to make PAGE_SIZE allocations, but the
+ * binding can be much more flexible than that. We may be able to
+ * allocate MTU sized chunks here. Leave that for future work...
+ */
+ binding->chunk_pool = gen_pool_create(PAGE_SHIFT,
+ dev_to_node(&dev->dev));
+ if (!binding->chunk_pool) {
+ err = -ENOMEM;
+ goto err_tx_vec;
+ }
+
+ virtual = 0;
+ for_each_sgtable_dma_sg(binding->sgt, sg, sg_idx) {
+ dma_addr_t dma_addr = sg_dma_address(sg);
+ struct dmabuf_genpool_chunk_owner *owner;
+ size_t len = sg_dma_len(sg);
+ struct net_iov *niov;
+
+ owner = kzalloc_node(sizeof(*owner), GFP_KERNEL,
+ dev_to_node(&dev->dev));
+ if (!owner) {
+ err = -ENOMEM;
+ goto err_free_chunks;
+ }
+
+ owner->area.base_virtual = virtual;
+ owner->base_dma_addr = dma_addr;
+ owner->area.num_niovs = len / PAGE_SIZE;
+ owner->binding = binding;
+
+ err = gen_pool_add_owner(binding->chunk_pool, dma_addr,
+ dma_addr, len, dev_to_node(&dev->dev),
+ owner);
+ if (err) {
+ kfree(owner);
+ err = -EINVAL;
+ goto err_free_chunks;
+ }
+
+ owner->area.niovs = kvmalloc_array(owner->area.num_niovs,
+ sizeof(*owner->area.niovs),
+ GFP_KERNEL);
+ if (!owner->area.niovs) {
+ err = -ENOMEM;
+ goto err_free_chunks;
+ }
+
+ for (i = 0; i < owner->area.num_niovs; i++) {
+ niov = &owner->area.niovs[i];
+ niov->type = NET_IOV_DMABUF;
+ niov->owner = &owner->area;
+ page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov),
+ net_devmem_get_dma_addr(niov));
+ if (direction == DMA_TO_DEVICE)
+ binding->tx_vec[owner->area.base_virtual / PAGE_SIZE + i] = niov;
+ }
+
+ virtual += len;
+ }
+
+ err = xa_alloc_cyclic(&net_devmem_dmabuf_bindings, &binding->id,
+ binding, xa_limit_32b, &id_alloc_next,
+ GFP_KERNEL);
+ if (err < 0)
+ goto err_free_chunks;
+
+ list_add(&binding->list, &priv->bindings);
+
+ return binding;
+
+err_free_chunks:
+ gen_pool_for_each_chunk(binding->chunk_pool,
+ net_devmem_dmabuf_free_chunk_owner, NULL);
+ gen_pool_destroy(binding->chunk_pool);
+err_tx_vec:
+ kvfree(binding->tx_vec);
+err_unmap:
+ dma_buf_unmap_attachment_unlocked(binding->attachment, binding->sgt,
+ direction);
+err_detach:
+ dma_buf_detach(dmabuf, binding->attachment);
+err_free_binding:
+ kfree(binding);
+err_put_dmabuf:
+ dma_buf_put(dmabuf);
+ return ERR_PTR(err);
+}
+
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+ struct net_devmem_dmabuf_binding *binding;
+
+ rcu_read_lock();
+ binding = xa_load(&net_devmem_dmabuf_bindings, id);
+ if (binding) {
+ if (!net_devmem_dmabuf_binding_get(binding))
+ binding = NULL;
+ }
+ rcu_read_unlock();
+
+ return binding;
+}
+
+void net_devmem_get_net_iov(struct net_iov *niov)
+{
+ net_devmem_dmabuf_binding_get(net_devmem_iov_binding(niov));
+}
+
+void net_devmem_put_net_iov(struct net_iov *niov)
+{
+ net_devmem_dmabuf_binding_put(net_devmem_iov_binding(niov));
+}
+
+struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk,
+ unsigned int dmabuf_id)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct net_device *dst_dev;
+ struct dst_entry *dst;
+ int err = 0;
+
+ binding = net_devmem_lookup_dmabuf(dmabuf_id);
+ if (!binding || !binding->tx_vec) {
+ err = -EINVAL;
+ goto out_err;
+ }
+
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ /* If dst is NULL (route expired), attempt to rebuild it. */
+ if (unlikely(!dst)) {
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) {
+ err = -EHOSTUNREACH;
+ goto out_unlock;
+ }
+ dst = __sk_dst_get(sk);
+ if (unlikely(!dst)) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+ }
+
+ /* The dma-addrs in this binding are only reachable to the corresponding
+ * net_device.
+ */
+ dst_dev = dst_dev_rcu(dst);
+ if (unlikely(!dst_dev) || unlikely(dst_dev != binding->dev)) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+
+ rcu_read_unlock();
+ return binding;
+
+out_unlock:
+ rcu_read_unlock();
+out_err:
+ if (binding)
+ net_devmem_dmabuf_binding_put(binding);
+
+ return ERR_PTR(err);
+}
+
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding,
+ size_t virt_addr, size_t *off, size_t *size)
+{
+ if (virt_addr >= binding->dmabuf->size)
+ return NULL;
+
+ *off = virt_addr % PAGE_SIZE;
+ *size = PAGE_SIZE - *off;
+
+ return binding->tx_vec[virt_addr / PAGE_SIZE];
+}
+
+/*** "Dmabuf devmem memory provider" ***/
+
+int mp_dmabuf_devmem_init(struct page_pool *pool)
+{
+ struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
+
+ if (!binding)
+ return -EINVAL;
+
+ /* dma-buf dma addresses do not need and should not be used with
+ * dma_sync_for_cpu/device. Force disable dma_sync.
+ */
+ pool->dma_sync = false;
+ pool->dma_sync_for_cpu = false;
+
+ if (pool->p.order != 0)
+ return -E2BIG;
+
+ net_devmem_dmabuf_binding_get(binding);
+ return 0;
+}
+
+netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp)
+{
+ struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
+ struct net_iov *niov;
+ netmem_ref netmem;
+
+ niov = net_devmem_alloc_dmabuf(binding);
+ if (!niov)
+ return 0;
+
+ netmem = net_iov_to_netmem(niov);
+
+ page_pool_set_pp_info(pool, netmem);
+
+ pool->pages_state_hold_cnt++;
+ trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt);
+ return netmem;
+}
+
+void mp_dmabuf_devmem_destroy(struct page_pool *pool)
+{
+ struct net_devmem_dmabuf_binding *binding = pool->mp_priv;
+
+ net_devmem_dmabuf_binding_put(binding);
+}
+
+bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem)
+{
+ long refcount = atomic_long_read(netmem_get_pp_ref_count_ref(netmem));
+
+ if (WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
+ return false;
+
+ if (WARN_ON_ONCE(refcount != 1))
+ return false;
+
+ page_pool_clear_pp_info(netmem);
+
+ net_devmem_free_dmabuf(netmem_to_net_iov(netmem));
+
+ /* We don't want the page pool put_page()ing our net_iovs. */
+ return false;
+}
+
+static int mp_dmabuf_devmem_nl_fill(void *mp_priv, struct sk_buff *rsp,
+ struct netdev_rx_queue *rxq)
+{
+ const struct net_devmem_dmabuf_binding *binding = mp_priv;
+ int type = rxq ? NETDEV_A_QUEUE_DMABUF : NETDEV_A_PAGE_POOL_DMABUF;
+
+ return nla_put_u32(rsp, type, binding->id);
+}
+
+static void mp_dmabuf_devmem_uninstall(void *mp_priv,
+ struct netdev_rx_queue *rxq)
+{
+ struct net_devmem_dmabuf_binding *binding = mp_priv;
+ struct netdev_rx_queue *bound_rxq;
+ unsigned long xa_idx;
+
+ xa_for_each(&binding->bound_rxqs, xa_idx, bound_rxq) {
+ if (bound_rxq == rxq) {
+ xa_erase(&binding->bound_rxqs, xa_idx);
+ if (xa_empty(&binding->bound_rxqs)) {
+ mutex_lock(&binding->lock);
+ binding->dev = NULL;
+ mutex_unlock(&binding->lock);
+ }
+ break;
+ }
+ }
+}
+
+static const struct memory_provider_ops dmabuf_devmem_ops = {
+ .init = mp_dmabuf_devmem_init,
+ .destroy = mp_dmabuf_devmem_destroy,
+ .alloc_netmems = mp_dmabuf_devmem_alloc_netmems,
+ .release_netmem = mp_dmabuf_devmem_release_page,
+ .nl_fill = mp_dmabuf_devmem_nl_fill,
+ .uninstall = mp_dmabuf_devmem_uninstall,
+};
diff --git a/net/core/devmem.h b/net/core/devmem.h
new file mode 100644
index 000000000000..0b43a648cd2e
--- /dev/null
+++ b/net/core/devmem.h
@@ -0,0 +1,246 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Device memory TCP support
+ *
+ * Authors: Mina Almasry <almasrymina@google.com>
+ * Willem de Bruijn <willemb@google.com>
+ * Kaiyuan Zhang <kaiyuanz@google.com>
+ *
+ */
+#ifndef _NET_DEVMEM_H
+#define _NET_DEVMEM_H
+
+#include <net/netmem.h>
+#include <net/netdev_netlink.h>
+
+struct netlink_ext_ack;
+
+/* State of one dma-buf binding. The binding is reference counted through
+ * 'ref'; when the last reference is dropped,
+ * net_devmem_dmabuf_binding_put() schedules 'unbind_w' to free it.
+ */
+struct net_devmem_dmabuf_binding {
+ struct dma_buf *dmabuf;
+ struct dma_buf_attachment *attachment;
+ struct sg_table *sgt;
+ struct net_device *dev;
+ struct gen_pool *chunk_pool;
+ /* Protect dev */
+ struct mutex lock;
+
+ /* The user holds a ref (via the netlink API) for as long as they want
+ * the binding to remain alive. Each page pool using this binding holds
+ * a ref to keep the binding alive. The page_pool does not release the
+ * ref until all the net_iovs allocated from this binding are released
+ * back to the page_pool.
+ *
+ * The binding undoes itself and unmaps the underlying dmabuf once all
+ * those refs are dropped and the binding is no longer desired or in
+ * use.
+ *
+ * net_devmem_get_net_iov() on dmabuf net_iovs will increment this
+ * reference, making sure that the binding remains alive until all the
+ * net_iovs are no longer used. net_iovs allocated from this binding
+ * that are stuck in the TX path for any reason (such as awaiting
+ * retransmits) hold a reference to the binding until the skb holding
+ * them is freed.
+ */
+ refcount_t ref;
+
+ /* The list of bindings currently active. Used for netlink to notify us
+ * of the user dropping the bind.
+ */
+ struct list_head list;
+
+ /* rxq's this binding is active on. */
+ struct xarray bound_rxqs;
+
+ /* ID of this binding. Globally unique to all bindings currently
+ * active.
+ */
+ u32 id;
+
+ /* DMA direction, FROM_DEVICE for Rx binding, TO_DEVICE for Tx. */
+ enum dma_data_direction direction;
+
+ /* Array of net_iov pointers for this binding, sorted by virtual
+ * address. This array is convenient to map the virtual addresses to
+ * net_iovs in the TX path.
+ */
+ struct net_iov **tx_vec;
+
+ /* Work item used to run __net_devmem_dmabuf_binding_free() once 'ref'
+ * drops to zero.
+ */
+ struct work_struct unbind_w;
+};
+
+#if defined(CONFIG_NET_DEVMEM)
+/* Owner of the dma-buf chunks inserted into the gen pool. Each scatterlist
+ * entry from the dmabuf is inserted into the genpool as a chunk, and needs
+ * this owner struct to keep track of some metadata necessary to create
+ * allocations from this chunk.
+ */
+struct dmabuf_genpool_chunk_owner {
+ /* Embedded area; net_devmem_iov_to_chunk_owner() uses container_of()
+ * on it to get back to this struct.
+ */
+ struct net_iov_area area;
+ /* Binding returned by net_devmem_iov_binding() for net_iovs of this
+ * chunk.
+ */
+ struct net_devmem_dmabuf_binding *binding;
+
+ /* dma_addr of the start of the chunk. */
+ dma_addr_t base_dma_addr;
+};
+
+void __net_devmem_dmabuf_binding_free(struct work_struct *wq);
+struct net_devmem_dmabuf_binding *
+net_devmem_bind_dmabuf(struct net_device *dev,
+ struct device *dma_dev,
+ enum dma_data_direction direction,
+ unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
+ struct netlink_ext_ack *extack);
+struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id);
+void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding);
+int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
+ struct net_devmem_dmabuf_binding *binding,
+ struct netlink_ext_ack *extack);
+
+/* Map a net_iov to the chunk owner that embeds its net_iov_area. */
+static inline struct dmabuf_genpool_chunk_owner *
+net_devmem_iov_to_chunk_owner(const struct net_iov *niov)
+{
+	return container_of(net_iov_owner(niov),
+			    struct dmabuf_genpool_chunk_owner, area);
+}
+
+/* Return the binding recorded in the chunk owner of @niov. */
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_iov_binding(const struct net_iov *niov)
+{
+	const struct dmabuf_genpool_chunk_owner *chunk_owner;
+
+	chunk_owner = net_devmem_iov_to_chunk_owner(niov);
+	return chunk_owner->binding;
+}
+
+/* Return the id of the binding that @niov belongs to. */
+static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
+{
+	const struct net_devmem_dmabuf_binding *binding;
+
+	binding = net_devmem_iov_binding(niov);
+	return binding->id;
+}
+
+/* Virtual address of @niov: the owning area's base_virtual plus the net_iov
+ * index scaled by the page size (index << PAGE_SHIFT).
+ */
+static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
+{
+	struct net_iov_area *area = net_iov_owner(niov);
+	unsigned long offset = (unsigned long)net_iov_idx(niov) << PAGE_SHIFT;
+
+	return area->base_virtual + offset;
+}
+
+/* Try to take a reference on @binding. Returns the result of
+ * refcount_inc_not_zero(): false means the count was already zero and no
+ * reference was taken.
+ */
+static inline bool
+net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding)
+{
+ return refcount_inc_not_zero(&binding->ref);
+}
+
+/* Drop one reference on @binding. The caller that drops the last reference
+ * schedules __net_devmem_dmabuf_binding_free() through the binding's
+ * unbind_w work item.
+ */
+static inline void
+net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
+{
+	if (refcount_dec_and_test(&binding->ref)) {
+		INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free);
+		schedule_work(&binding->unbind_w);
+	}
+}
+
+void net_devmem_get_net_iov(struct net_iov *niov);
+void net_devmem_put_net_iov(struct net_iov *niov);
+
+struct net_iov *
+net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding);
+void net_devmem_free_dmabuf(struct net_iov *ppiov);
+
+bool net_is_devmem_iov(struct net_iov *niov);
+struct net_devmem_dmabuf_binding *
+net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id);
+struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
+ size_t *off, size_t *size);
+
+#else
+/* Stubs used when CONFIG_NET_DEVMEM is not defined. The void helpers do
+ * nothing; net_devmem_bind_dmabuf() and net_devmem_get_binding() return
+ * ERR_PTR(-EOPNOTSUPP); net_devmem_bind_dmabuf_to_queue() returns
+ * -EOPNOTSUPP; the remaining helpers return NULL, 0 or false.
+ */
+struct net_devmem_dmabuf_binding;
+
+static inline void
+net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
+{
+}
+
+static inline void net_devmem_get_net_iov(struct net_iov *niov)
+{
+}
+
+static inline void net_devmem_put_net_iov(struct net_iov *niov)
+{
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_bind_dmabuf(struct net_device *dev,
+ struct device *dma_dev,
+ enum dma_data_direction direction,
+ unsigned int dmabuf_fd,
+ struct netdev_nl_sock *priv,
+ struct netlink_ext_ack *extack)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline struct net_devmem_dmabuf_binding *net_devmem_lookup_dmabuf(u32 id)
+{
+ return NULL;
+}
+
+static inline void
+net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
+{
+}
+
+static inline int
+net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
+ struct net_devmem_dmabuf_binding *binding,
+ struct netlink_ext_ack *extack)
+
+{
+ return -EOPNOTSUPP;
+}
+
+static inline struct net_iov *
+net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding)
+{
+ return NULL;
+}
+
+static inline void net_devmem_free_dmabuf(struct net_iov *ppiov)
+{
+}
+
+static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
+{
+ return 0;
+}
+
+static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
+{
+ return 0;
+}
+
+static inline bool net_is_devmem_iov(struct net_iov *niov)
+{
+ return false;
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline struct net_iov *
+net_devmem_get_niov_at(struct net_devmem_dmabuf_binding *binding, size_t addr,
+ size_t *off, size_t *size)
+{
+ return NULL;
+}
+
+static inline struct net_devmem_dmabuf_binding *
+net_devmem_iov_binding(const struct net_iov *niov)
+{
+ return NULL;
+}
+#endif
+
+#endif /* _NET_DEVMEM_H */
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index c7785efeea57..60d31c2feed3 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Monitoring code for network dropped packet alerts
*
@@ -20,6 +21,7 @@
#include <linux/workqueue.h>
#include <linux/netlink.h>
#include <linux/net_dropmon.h>
+#include <linux/bitfield.h>
#include <linux/percpu.h>
#include <linux/timer.h>
#include <linux/bitops.h>
@@ -27,11 +29,15 @@
#include <linux/module.h>
#include <net/genetlink.h>
#include <net/netevent.h>
+#include <net/flow_offload.h>
+#include <net/dropreason.h>
+#include <net/devlink.h>
#include <trace/events/skb.h>
#include <trace/events/napi.h>
+#include <trace/events/devlink.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#define TRACE_ON 1
#define TRACE_OFF 0
@@ -42,31 +48,87 @@
* netlink alerts
*/
static int trace_state = TRACE_OFF;
-static DEFINE_MUTEX(trace_state_mutex);
+static bool monitor_hw;
+
+/* net_dm_mutex
+ *
+ * An overall lock guarding every operation coming from userspace.
+ */
+static DEFINE_MUTEX(net_dm_mutex);
+
+/* Per-CPU statistics. 'dropped' is incremented, inside a 'syncp' update
+ * section, by the packet-mode probes when the per-CPU drop_queue already
+ * holds net_dm_queue_len packets and a new one has to be discarded.
+ */
+struct net_dm_stats {
+ u64_stats_t dropped;
+ struct u64_stats_sync syncp;
+};
+
+#define NET_DM_MAX_HW_TRAP_NAME_LEN 40
+
+/* One summary-mode record for hardware drops: a trap name and the number of
+ * times net_dm_hw_trap_summary_probe() has seen it in the current array.
+ */
+struct net_dm_hw_entry {
+ char trap_name[NET_DM_MAX_HW_TRAP_NAME_LEN];
+ u32 count;
+};
+
+/* Array of hardware trap records. 'num_entries' is the number of used slots
+ * in 'entries'; net_dm_hw_reset_per_cpu_data() allocates the flexible array
+ * with room for dm_hit_limit records.
+ */
+struct net_dm_hw_entries {
+ u32 num_entries;
+ struct net_dm_hw_entry entries[];
+};
struct per_cpu_dm_data {
- spinlock_t lock;
- struct sk_buff *skb;
+ raw_spinlock_t lock; /* Protects 'skb', 'hw_entries' and
+ * 'send_timer'
+ */
+ union {
+ struct sk_buff *skb;
+ struct net_dm_hw_entries *hw_entries;
+ };
+ struct sk_buff_head drop_queue;
struct work_struct dm_alert_work;
struct timer_list send_timer;
+ struct net_dm_stats stats;
};
struct dm_hw_stat_delta {
- struct net_device *dev;
unsigned long last_rx;
- struct list_head list;
- struct rcu_head rcu;
unsigned long last_drop_val;
+ struct rcu_head rcu;
};
static struct genl_family net_drop_monitor_family;
static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
+static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_hw_cpu_data);
static int dm_hit_limit = 64;
static int dm_delay = 1;
static unsigned long dm_hw_check_delta = 2*HZ;
-static LIST_HEAD(hw_stats_list);
+
+static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY;
+static u32 net_dm_trunc_len;
+static u32 net_dm_queue_len = 1000;
+
+/* Handlers implementing one alert mode. net_dm_alert_ops_arr[] maps each
+ * NET_DM_ALERT_MODE_* value to its set of handlers: tracepoint probes for
+ * kfree_skb, napi_poll and devlink trap reports, plus the work functions for
+ * the software and hardware per-CPU data.
+ */
+struct net_dm_alert_ops {
+ void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb,
+ void *location,
+ enum skb_drop_reason reason,
+ struct sock *rx_sk);
+ void (*napi_poll_probe)(void *ignore, struct napi_struct *napi,
+ int work, int budget);
+ void (*work_item_func)(struct work_struct *work);
+ void (*hw_work_item_func)(struct work_struct *work);
+ void (*hw_trap_probe)(void *ignore, const struct devlink *devlink,
+ struct sk_buff *skb,
+ const struct devlink_trap_metadata *metadata);
+};
+
+/* Private data stored in skb->cb (see NET_DM_SKB_CB()) of cloned packets
+ * queued for packet-mode reports. Hardware-originated drops use
+ * 'hw_metadata' (a copy of the devlink trap metadata); software drops use
+ * 'pc' (the drop location) together with 'reason'.
+ */
+struct net_dm_skb_cb {
+ union {
+ struct devlink_trap_metadata *hw_metadata;
+ void *pc;
+ };
+ enum skb_drop_reason reason;
+};
+
+#define NET_DM_SKB_CB(__skb) ((struct net_dm_skb_cb *)&((__skb)->cb[0]))
static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
{
@@ -107,9 +169,9 @@ static struct sk_buff *reset_per_cpu_data(struct per_cpu_dm_data *data)
err:
mod_timer(&data->send_timer, jiffies + HZ / 10);
out:
- spin_lock_irqsave(&data->lock, flags);
+ raw_spin_lock_irqsave(&data->lock, flags);
swap(data->skb, skb);
- spin_unlock_irqrestore(&data->lock, flags);
+ raw_spin_unlock_irqrestore(&data->lock, flags);
if (skb) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
@@ -122,7 +184,7 @@ out:
}
static const struct genl_multicast_group dropmon_mcgrps[] = {
- { .name = "events", },
+ { .name = "events", .flags = GENL_MCAST_CAP_SYS_ADMIN, },
};
static void send_dm_alert(struct work_struct *work)
@@ -146,7 +208,7 @@ static void send_dm_alert(struct work_struct *work)
*/
static void sched_send_work(struct timer_list *t)
{
- struct per_cpu_dm_data *data = from_timer(data, t, send_timer);
+ struct per_cpu_dm_data *data = timer_container_of(data, t, send_timer);
schedule_work(&data->dm_alert_work);
}
@@ -154,6 +216,7 @@ static void sched_send_work(struct timer_list *t)
static void trace_drop_common(struct sk_buff *skb, void *location)
{
struct net_dm_alert_msg *msg;
+ struct net_dm_drop_point *point;
struct nlmsghdr *nlh;
struct nlattr *nla;
int i;
@@ -163,7 +226,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
local_irq_save(flags);
data = this_cpu_ptr(&dm_cpu_data);
- spin_lock(&data->lock);
+ raw_spin_lock(&data->lock);
dskb = data->skb;
if (!dskb)
@@ -172,11 +235,13 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
nlh = (struct nlmsghdr *)dskb->data;
nla = genlmsg_data(nlmsg_data(nlh));
msg = nla_data(nla);
+ point = msg->points;
for (i = 0; i < msg->entries; i++) {
- if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
- msg->points[i].count++;
+ if (!memcmp(&location, &point->pc, sizeof(void *))) {
+ point->count++;
goto out;
}
+ point++;
}
if (msg->entries == dm_hit_limit)
goto out;
@@ -185,8 +250,8 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
*/
__nla_reserve_nohdr(dskb, sizeof(struct net_dm_drop_point));
nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
- memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
- msg->points[msg->entries].count = 1;
+ memcpy(point->pc, &location, sizeof(void *));
+ point->count = 1;
msg->entries++;
if (!timer_pending(&data->send_timer)) {
@@ -195,10 +260,13 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
}
out:
- spin_unlock_irqrestore(&data->lock, flags);
+ raw_spin_unlock_irqrestore(&data->lock, flags);
}
-static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *location)
+static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb,
+ void *location,
+ enum skb_drop_reason reason,
+ struct sock *rx_sk)
{
trace_drop_common(skb, location);
}
@@ -206,76 +274,967 @@ static void trace_kfree_skb_hit(void *ignore, struct sk_buff *skb, void *locatio
static void trace_napi_poll_hit(void *ignore, struct napi_struct *napi,
int work, int budget)
{
- struct dm_hw_stat_delta *new_stat;
-
+ struct net_device *dev = napi->dev;
+ struct dm_hw_stat_delta *stat;
/*
* Don't check napi structures with no associated device
*/
- if (!napi->dev)
+ if (!dev)
return;
rcu_read_lock();
- list_for_each_entry_rcu(new_stat, &hw_stats_list, list) {
+ stat = rcu_dereference(dev->dm_private);
+ if (stat) {
/*
* only add a note to our monitor buffer if:
- * 1) this is the dev we received on
- * 2) its after the last_rx delta
- * 3) our rx_dropped count has gone up
+ * 1) it's after the last_rx delta
+ * 2) our rx_dropped count has gone up
*/
- if ((new_stat->dev == napi->dev) &&
- (time_after(jiffies, new_stat->last_rx + dm_hw_check_delta)) &&
- (napi->dev->stats.rx_dropped != new_stat->last_drop_val)) {
+ if (time_after(jiffies, stat->last_rx + dm_hw_check_delta) &&
+ (dev->stats.rx_dropped != stat->last_drop_val)) {
trace_drop_common(NULL, NULL);
- new_stat->last_drop_val = napi->dev->stats.rx_dropped;
- new_stat->last_rx = jiffies;
- break;
+ stat->last_drop_val = dev->stats.rx_dropped;
+ stat->last_rx = jiffies;
}
}
rcu_read_unlock();
}
-static int set_all_monitor_traces(int state)
+/* Allocate a fresh, zeroed entries array sized for dm_hit_limit records and
+ * swap it into @hw_data under hw_data->lock. Returns the array that was
+ * installed before the swap (may be NULL). If the allocation fails, NULL is
+ * swapped in and send_timer is re-armed for HZ / 10 later.
+ */
+static struct net_dm_hw_entries *
+net_dm_hw_reset_per_cpu_data(struct per_cpu_dm_data *hw_data)
{
- int rc = 0;
- struct dm_hw_stat_delta *new_stat = NULL;
- struct dm_hw_stat_delta *temp;
+ struct net_dm_hw_entries *hw_entries;
+ unsigned long flags;
- mutex_lock(&trace_state_mutex);
+ hw_entries = kzalloc(struct_size(hw_entries, entries, dm_hit_limit),
+ GFP_KERNEL);
+ if (!hw_entries) {
+ /* If the memory allocation failed, we try to perform another
+ * allocation in 1/10 second. Otherwise, the probe function
+ * will constantly bail out.
+ */
+ mod_timer(&hw_data->send_timer, jiffies + HZ / 10);
+ }
- if (state == trace_state) {
- rc = -EAGAIN;
- goto out_unlock;
+ raw_spin_lock_irqsave(&hw_data->lock, flags);
+ swap(hw_data->hw_entries, hw_entries);
+ raw_spin_unlock_irqrestore(&hw_data->lock, flags);
+
+ return hw_entries;
+}
+
+/* Emit one NET_DM_ATTR_HW_ENTRY nest carrying the trap name and count of
+ * @hw_entry. Returns 0 on success; on failure the nest is cancelled and
+ * -EMSGSIZE is returned.
+ */
+static int net_dm_hw_entry_put(struct sk_buff *msg,
+			       const struct net_dm_hw_entry *hw_entry)
+{
+	struct nlattr *nest = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRY);
+
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME, hw_entry->trap_name) ||
+	    nla_put_u32(msg, NET_DM_ATTR_HW_TRAP_COUNT, hw_entry->count)) {
+		nla_nest_cancel(msg, nest);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(msg, nest);
+	return 0;
+}
+
+/* Emit a NET_DM_ATTR_HW_ENTRIES nest containing one nested entry per used
+ * slot of @hw_entries. Returns 0 on success; if the nest cannot be started
+ * or any entry fails, the nest is cancelled and -EMSGSIZE is returned.
+ */
+static int net_dm_hw_entries_put(struct sk_buff *msg,
+ const struct net_dm_hw_entries *hw_entries)
+{
+ struct nlattr *attr;
+ int i;
+
+ attr = nla_nest_start(msg, NET_DM_ATTR_HW_ENTRIES);
+ if (!attr)
+ return -EMSGSIZE;
+
+ for (i = 0; i < hw_entries->num_entries; i++) {
+ int rc;
+
+ rc = net_dm_hw_entry_put(msg, &hw_entries->entries[i]);
+ if (rc)
+ goto nla_put_failure;
}
- switch (state) {
- case TRACE_ON:
- if (!try_module_get(THIS_MODULE)) {
- rc = -ENODEV;
- break;
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+/* Build a NET_DM_CMD_ALERT message in @msg: generic netlink header, a zeroed
+ * ancillary net_dm_alert_msg header, then the nested hardware trap entries.
+ * Returns 0 on success or -EMSGSIZE, cancelling the message on failure.
+ */
+static int
+net_dm_hw_summary_report_fill(struct sk_buff *msg,
+			      const struct net_dm_hw_entries *hw_entries)
+{
+	struct net_dm_alert_msg anc_hdr = { 0 };
+	void *genl_hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0,
+				     NET_DM_CMD_ALERT);
+
+	if (!genl_hdr)
+		return -EMSGSIZE;
+
+	/* The ancillary header has to be present so that existing user space
+	 * is not broken; the trap entries follow it.
+	 */
+	if (nla_put(msg, NLA_UNSPEC, sizeof(anc_hdr), &anc_hdr) ||
+	    net_dm_hw_entries_put(msg, hw_entries)) {
+		genlmsg_cancel(msg, genl_hdr);
+		return -EMSGSIZE;
+	}
+
+	genlmsg_end(msg, genl_hdr);
+	return 0;
+}
+
+/* Work function for hardware drops in summary mode: take the per-CPU entries
+ * array (installing a fresh one), report it via multicast and free it. If
+ * the message cannot be allocated or filled, the array is freed unreported.
+ */
+static void net_dm_hw_summary_work(struct work_struct *work)
+{
+	struct per_cpu_dm_data *hw_data =
+		container_of(work, struct per_cpu_dm_data, dm_alert_work);
+	struct net_dm_hw_entries *snapshot;
+	struct sk_buff *msg;
+
+	snapshot = net_dm_hw_reset_per_cpu_data(hw_data);
+	if (!snapshot)
+		return;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (msg) {
+		if (net_dm_hw_summary_report_fill(msg, snapshot))
+			nlmsg_free(msg);
+		else
+			genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0,
+					  GFP_KERNEL);
+	}
+
+	kfree(snapshot);
+}
+
+/* Summary-mode probe for devlink trap reports. Control traps are ignored.
+ * Under the per-CPU hw_data->lock, either bumps the count of the entry whose
+ * name matches metadata->trap_name or, if none matches and the array is not
+ * yet at dm_hit_limit, appends a new entry with count 1 and arms send_timer
+ * (dm_delay seconds) when it is not already pending.
+ */
+static void
+net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,
+ struct sk_buff *skb,
+ const struct devlink_trap_metadata *metadata)
+{
+ struct net_dm_hw_entries *hw_entries;
+ struct net_dm_hw_entry *hw_entry;
+ struct per_cpu_dm_data *hw_data;
+ unsigned long flags;
+ int i;
+
+ if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL)
+ return;
+
+ hw_data = this_cpu_ptr(&dm_hw_cpu_data);
+ raw_spin_lock_irqsave(&hw_data->lock, flags);
+ hw_entries = hw_data->hw_entries;
+
+ if (!hw_entries)
+ goto out;
+
+ for (i = 0; i < hw_entries->num_entries; i++) {
+ hw_entry = &hw_entries->entries[i];
+ if (!strncmp(hw_entry->trap_name, metadata->trap_name,
+ NET_DM_MAX_HW_TRAP_NAME_LEN - 1)) {
+ hw_entry->count++;
+ goto out;
}
+ }
+ if (WARN_ON_ONCE(hw_entries->num_entries == dm_hit_limit))
+ goto out;
- rc |= register_trace_kfree_skb(trace_kfree_skb_hit, NULL);
- rc |= register_trace_napi_poll(trace_napi_poll_hit, NULL);
- break;
+ hw_entry = &hw_entries->entries[hw_entries->num_entries];
+ strscpy(hw_entry->trap_name, metadata->trap_name,
+ NET_DM_MAX_HW_TRAP_NAME_LEN - 1);
+ hw_entry->count = 1;
+ hw_entries->num_entries++;
- case TRACE_OFF:
- rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit, NULL);
- rc |= unregister_trace_napi_poll(trace_napi_poll_hit, NULL);
+ if (!timer_pending(&hw_data->send_timer)) {
+ hw_data->send_timer.expires = jiffies + dm_delay * HZ;
+ add_timer(&hw_data->send_timer);
+ }
- tracepoint_synchronize_unregister();
+out:
+ raw_spin_unlock_irqrestore(&hw_data->lock, flags);
+}
- /*
- * Clean the device list
- */
- list_for_each_entry_safe(new_stat, temp, &hw_stats_list, list) {
- if (new_stat->dev == NULL) {
- list_del_rcu(&new_stat->list);
- kfree_rcu(new_stat, rcu);
- }
+/* Handlers for NET_DM_ALERT_MODE_SUMMARY (see net_dm_alert_ops_arr[]). */
+static const struct net_dm_alert_ops net_dm_alert_summary_ops = {
+ .kfree_skb_probe = trace_kfree_skb_hit,
+ .napi_poll_probe = trace_napi_poll_hit,
+ .work_item_func = send_dm_alert,
+ .hw_work_item_func = net_dm_hw_summary_work,
+ .hw_trap_probe = net_dm_hw_trap_summary_probe,
+};
+
+/* Packet-mode kfree_skb probe. Clones @skb (skipped when no MAC header was
+ * set or the clone fails), records the drop reason, location and drop time
+ * in the clone, and appends it to this CPU's drop_queue before scheduling
+ * the alert work. If the queue already holds net_dm_queue_len packets, the
+ * clone is discarded and the per-CPU 'dropped' counter is incremented.
+ */
+static void net_dm_packet_trace_kfree_skb_hit(void *ignore,
+					      struct sk_buff *skb,
+					      void *location,
+					      enum skb_drop_reason reason,
+					      struct sock *rx_sk)
+{
+	ktime_t drop_time = ktime_get_real();
+	struct per_cpu_dm_data *data;
+	struct net_dm_skb_cb *cb;
+	struct sk_buff *clone;
+	unsigned long flags;
+
+	if (!skb_mac_header_was_set(skb))
+		return;
+
+	clone = skb_clone(skb, GFP_ATOMIC);
+	if (!clone)
+		return;
+
+	cb = NET_DM_SKB_CB(clone);
+	cb->reason = reason;
+	cb->pc = location;
+	/* Report the moment of the drop, not the original timestamp. */
+	clone->tstamp = drop_time;
+
+	data = this_cpu_ptr(&dm_cpu_data);
+
+	spin_lock_irqsave(&data->drop_queue.lock, flags);
+	if (skb_queue_len(&data->drop_queue) >= net_dm_queue_len) {
+		/* Queue is full: account for the loss and drop our clone. */
+		spin_unlock_irqrestore(&data->drop_queue.lock, flags);
+		u64_stats_update_begin(&data->stats.syncp);
+		u64_stats_inc(&data->stats.dropped);
+		u64_stats_update_end(&data->stats.syncp);
+		consume_skb(clone);
+		return;
+	}
+	__skb_queue_tail(&data->drop_queue, clone);
+	spin_unlock_irqrestore(&data->drop_queue.lock, flags);
+
+	schedule_work(&data->dm_alert_work);
+}
+
+/* Packet-mode napi_poll probe: intentionally empty, it does nothing. */
+static void net_dm_packet_trace_napi_poll_hit(void *ignore,
+ struct napi_struct *napi,
+ int work, int budget)
+{
+}
+
+/* Space reserved for a NET_DM_ATTR_IN_PORT nest with both of its nested
+ * attributes.
+ */
+static size_t net_dm_in_port_size(void)
+{
+	size_t size = 0;
+
+	/* NET_DM_ATTR_IN_PORT nest */
+	size += nla_total_size(0);
+	/* NET_DM_ATTR_PORT_NETDEV_IFINDEX */
+	size += nla_total_size(sizeof(u32));
+	/* NET_DM_ATTR_PORT_NETDEV_NAME */
+	size += nla_total_size(IFNAMSIZ + 1);
+
+	return size;
+}
+
+#define NET_DM_MAX_SYMBOL_LEN 40
+#define NET_DM_MAX_REASON_LEN 50
+
+/* Size of the netlink message to allocate for one software packet report
+ * carrying @payload_len bytes of packet data.
+ */
+static size_t net_dm_packet_report_size(size_t payload_len)
+{
+	size_t size;
+
+	size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize);
+	size = NLMSG_ALIGN(size);
+
+	/* NET_DM_ATTR_ORIGIN */
+	size += nla_total_size(sizeof(u16));
+	/* NET_DM_ATTR_PC */
+	size += nla_total_size(sizeof(u64));
+	/* NET_DM_ATTR_SYMBOL */
+	size += nla_total_size(NET_DM_MAX_SYMBOL_LEN + 1);
+	/* NET_DM_ATTR_IN_PORT */
+	size += net_dm_in_port_size();
+	/* NET_DM_ATTR_TIMESTAMP */
+	size += nla_total_size(sizeof(u64));
+	/* NET_DM_ATTR_ORIG_LEN */
+	size += nla_total_size(sizeof(u32));
+	/* NET_DM_ATTR_PROTO */
+	size += nla_total_size(sizeof(u16));
+	/* NET_DM_ATTR_REASON */
+	size += nla_total_size(NET_DM_MAX_REASON_LEN + 1);
+	/* NET_DM_ATTR_PAYLOAD */
+	size += nla_total_size(payload_len);
+
+	return size;
+}
+
+/* Emit a NET_DM_ATTR_IN_PORT nest. The ifindex attribute is added only when
+ * @ifindex is non-zero and the name attribute only when @name is non-NULL.
+ * Returns 0 on success; on failure the nest is cancelled and -EMSGSIZE is
+ * returned.
+ */
+static int net_dm_packet_report_in_port_put(struct sk_buff *msg, int ifindex,
+					    const char *name)
+{
+	struct nlattr *nest = nla_nest_start(msg, NET_DM_ATTR_IN_PORT);
+
+	if (!nest)
+		return -EMSGSIZE;
+
+	if ((ifindex &&
+	     nla_put_u32(msg, NET_DM_ATTR_PORT_NETDEV_IFINDEX, ifindex)) ||
+	    (name && nla_put_string(msg, NET_DM_ATTR_PORT_NETDEV_NAME, name))) {
+		nla_nest_cancel(msg, nest);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(msg, nest);
+	return 0;
+}
+
+/* Fill @msg with a NET_DM_CMD_PACKET_ALERT report for a software drop.
+ *
+ * @msg: netlink message being built
+ * @skb: clone of the dropped packet; its control block (NET_DM_SKB_CB())
+ *       holds the drop location and the drop reason
+ * @payload_len: number of packet bytes to copy into NET_DM_ATTR_PAYLOAD;
+ *               when 0, neither NET_DM_ATTR_PROTO nor the payload is emitted
+ *
+ * Returns 0 on success. On any failure the partially built message is
+ * cancelled and -EMSGSIZE is returned.
+ */
+static int net_dm_packet_report_fill(struct sk_buff *msg, struct sk_buff *skb,
+				     size_t payload_len)
+{
+	struct net_dm_skb_cb *cb = NET_DM_SKB_CB(skb);
+	const struct drop_reason_list *list = NULL;
+	unsigned int subsys, subsys_reason;
+	char buf[NET_DM_MAX_SYMBOL_LEN];
+	struct nlattr *attr;
+	void *hdr;
+	int rc;
+
+	hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0,
+			  NET_DM_CMD_PACKET_ALERT);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_SW))
+		goto nla_put_failure;
+
+	if (nla_put_u64_64bit(msg, NET_DM_ATTR_PC, (u64)(uintptr_t)cb->pc,
+			      NET_DM_ATTR_PAD))
+		goto nla_put_failure;
+
+	/* Translate the drop reason into its string. Fall back to the core
+	 * subsystem's SKB_DROP_REASON_NOT_SPECIFIED when the subsystem has no
+	 * list, the index is out of range, the string is missing or it is
+	 * longer than NET_DM_MAX_REASON_LEN.
+	 */
+	rcu_read_lock();
+	subsys = u32_get_bits(cb->reason, SKB_DROP_REASON_SUBSYS_MASK);
+	if (subsys < SKB_DROP_REASON_SUBSYS_NUM)
+		list = rcu_dereference(drop_reasons_by_subsys[subsys]);
+	subsys_reason = cb->reason & ~SKB_DROP_REASON_SUBSYS_MASK;
+	if (!list ||
+	    subsys_reason >= list->n_reasons ||
+	    !list->reasons[subsys_reason] ||
+	    strlen(list->reasons[subsys_reason]) > NET_DM_MAX_REASON_LEN) {
+		list = rcu_dereference(drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_CORE]);
+		subsys_reason = SKB_DROP_REASON_NOT_SPECIFIED;
+	}
+	if (nla_put_string(msg, NET_DM_ATTR_REASON,
+			   list->reasons[subsys_reason])) {
+		rcu_read_unlock();
+		goto nla_put_failure;
+	}
+	rcu_read_unlock();
+
+	snprintf(buf, sizeof(buf), "%pS", cb->pc);
+	if (nla_put_string(msg, NET_DM_ATTR_SYMBOL, buf))
+		goto nla_put_failure;
+
+	rc = net_dm_packet_report_in_port_put(msg, skb->skb_iif, NULL);
+	if (rc)
+		goto nla_put_failure;
+
+	if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP,
+			      ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD))
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len))
+		goto nla_put_failure;
+
+	if (!payload_len)
+		goto out;
+
+	if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol)))
+		goto nla_put_failure;
+
+	/* Reserve the payload attribute with nla_reserve() instead of an
+	 * open-coded skb_put(): it fails cleanly when the tailroom is too
+	 * small and zeroes the alignment padding that follows the payload,
+	 * so no uninitialised bytes of the message buffer are sent to user
+	 * space.
+	 */
+	attr = nla_reserve(msg, NET_DM_ATTR_PAYLOAD, payload_len);
+	if (!attr)
+		goto nla_put_failure;
+
+	if (skb_copy_bits(skb, 0, nla_data(attr), payload_len))
+		goto nla_put_failure;
+
+out:
+	genlmsg_end(msg, hdr);
+
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+#define NET_DM_MAX_PACKET_SIZE (0xffff - NLA_HDRLEN - NLA_ALIGNTO)
+
+/* Build and multicast one packet-mode report for the software-dropped @skb,
+ * then release @skb with consume_skb() on every path.
+ *
+ * The payload length is skb->len measured from the MAC header, capped at
+ * NET_DM_MAX_PACKET_SIZE and, when net_dm_trunc_len is non-zero, at
+ * net_dm_trunc_len. If the message cannot be allocated or filled, no report
+ * is sent.
+ */
+static void net_dm_packet_report(struct sk_buff *skb)
+{
+ struct sk_buff *msg;
+ size_t payload_len;
+ int rc;
+
+ /* Make sure we start copying the packet from the MAC header */
+ if (skb->data > skb_mac_header(skb))
+ skb_push(skb, skb->data - skb_mac_header(skb));
+ else
+ skb_pull(skb, skb_mac_header(skb) - skb->data);
+
+ /* Ensure packet fits inside a single netlink attribute */
+ payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE);
+ if (net_dm_trunc_len)
+ payload_len = min_t(size_t, net_dm_trunc_len, payload_len);
+
+ msg = nlmsg_new(net_dm_packet_report_size(payload_len), GFP_KERNEL);
+ if (!msg)
+ goto out;
+
+ rc = net_dm_packet_report_fill(msg, skb, payload_len);
+ if (rc) {
+ nlmsg_free(msg);
+ goto out;
+ }
+
+ genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0, GFP_KERNEL);
+
+out:
+ consume_skb(skb);
+}
+
+/* Work function for software drops in packet mode: move everything queued
+ * on this per-CPU drop_queue to a private list under the queue lock, then
+ * report the packets one by one outside the lock.
+ */
+static void net_dm_packet_work(struct work_struct *work)
+{
+	struct per_cpu_dm_data *data =
+		container_of(work, struct per_cpu_dm_data, dm_alert_work);
+	struct sk_buff_head pending;
+	unsigned long flags;
+	struct sk_buff *skb;
+
+	__skb_queue_head_init(&pending);
+
+	spin_lock_irqsave(&data->drop_queue.lock, flags);
+	skb_queue_splice_tail_init(&data->drop_queue, &pending);
+	spin_unlock_irqrestore(&data->drop_queue.lock, flags);
+
+	for (skb = __skb_dequeue(&pending); skb; skb = __skb_dequeue(&pending))
+		net_dm_packet_report(skb);
+}
+
+/* Attribute space needed for the flow action cookie of @hw_metadata, or 0
+ * when it carries no cookie.
+ */
+static size_t
+net_dm_flow_action_cookie_size(const struct devlink_trap_metadata *hw_metadata)
+{
+	if (!hw_metadata->fa_cookie)
+		return 0;
+
+	return nla_total_size(hw_metadata->fa_cookie->cookie_len);
+}
+
+/* Size of the netlink message to allocate for one hardware packet report
+ * carrying @payload_len bytes of packet data and the strings and cookie
+ * found in @hw_metadata.
+ */
+static size_t
+net_dm_hw_packet_report_size(size_t payload_len,
+			     const struct devlink_trap_metadata *hw_metadata)
+{
+	size_t size;
+
+	size = nlmsg_msg_size(GENL_HDRLEN + net_drop_monitor_family.hdrsize);
+	size = NLMSG_ALIGN(size);
+
+	/* NET_DM_ATTR_ORIGIN */
+	size += nla_total_size(sizeof(u16));
+	/* NET_DM_ATTR_HW_TRAP_GROUP_NAME */
+	size += nla_total_size(strlen(hw_metadata->trap_group_name) + 1);
+	/* NET_DM_ATTR_HW_TRAP_NAME */
+	size += nla_total_size(strlen(hw_metadata->trap_name) + 1);
+	/* NET_DM_ATTR_IN_PORT */
+	size += net_dm_in_port_size();
+	/* NET_DM_ATTR_FLOW_ACTION_COOKIE */
+	size += net_dm_flow_action_cookie_size(hw_metadata);
+	/* NET_DM_ATTR_TIMESTAMP */
+	size += nla_total_size(sizeof(u64));
+	/* NET_DM_ATTR_ORIG_LEN */
+	size += nla_total_size(sizeof(u32));
+	/* NET_DM_ATTR_PROTO */
+	size += nla_total_size(sizeof(u16));
+	/* NET_DM_ATTR_PAYLOAD */
+	size += nla_total_size(payload_len);
+
+	return size;
+}
+
+/* Fill @msg with a NET_DM_CMD_PACKET_ALERT report for a hardware drop.
+ *
+ * @msg: netlink message being built
+ * @skb: clone of the trapped packet; NET_DM_SKB_CB(skb)->hw_metadata holds
+ *       the copied devlink trap metadata
+ * @payload_len: number of packet bytes to copy into NET_DM_ATTR_PAYLOAD;
+ *               when 0, neither NET_DM_ATTR_PROTO nor the payload is emitted
+ *
+ * The input port nest is emitted only when the metadata has an input_dev and
+ * the flow action cookie only when one is present.
+ *
+ * Returns 0 on success. On any failure the partially built message is
+ * cancelled and -EMSGSIZE is returned.
+ */
+static int net_dm_hw_packet_report_fill(struct sk_buff *msg,
+					struct sk_buff *skb, size_t payload_len)
+{
+	struct devlink_trap_metadata *hw_metadata;
+	struct nlattr *attr;
+	void *hdr;
+
+	hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+
+	hdr = genlmsg_put(msg, 0, 0, &net_drop_monitor_family, 0,
+			  NET_DM_CMD_PACKET_ALERT);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (nla_put_u16(msg, NET_DM_ATTR_ORIGIN, NET_DM_ORIGIN_HW))
+		goto nla_put_failure;
+
+	if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_GROUP_NAME,
+			   hw_metadata->trap_group_name))
+		goto nla_put_failure;
+
+	if (nla_put_string(msg, NET_DM_ATTR_HW_TRAP_NAME,
+			   hw_metadata->trap_name))
+		goto nla_put_failure;
+
+	if (hw_metadata->input_dev) {
+		struct net_device *dev = hw_metadata->input_dev;
+		int rc;
+
+		rc = net_dm_packet_report_in_port_put(msg, dev->ifindex,
+						      dev->name);
+		if (rc)
+			goto nla_put_failure;
+	}
+
+	if (hw_metadata->fa_cookie &&
+	    nla_put(msg, NET_DM_ATTR_FLOW_ACTION_COOKIE,
+		    hw_metadata->fa_cookie->cookie_len,
+		    hw_metadata->fa_cookie->cookie))
+		goto nla_put_failure;
+
+	if (nla_put_u64_64bit(msg, NET_DM_ATTR_TIMESTAMP,
+			      ktime_to_ns(skb->tstamp), NET_DM_ATTR_PAD))
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, NET_DM_ATTR_ORIG_LEN, skb->len))
+		goto nla_put_failure;
+
+	if (!payload_len)
+		goto out;
+
+	if (nla_put_u16(msg, NET_DM_ATTR_PROTO, be16_to_cpu(skb->protocol)))
+		goto nla_put_failure;
+
+	/* Reserve the payload attribute with nla_reserve() instead of an
+	 * open-coded skb_put(): it fails cleanly when the tailroom is too
+	 * small and zeroes the alignment padding that follows the payload,
+	 * so no uninitialised bytes of the message buffer are sent to user
+	 * space.
+	 */
+	attr = nla_reserve(msg, NET_DM_ATTR_PAYLOAD, payload_len);
+	if (!attr)
+		goto nla_put_failure;
+
+	if (skb_copy_bits(skb, 0, nla_data(attr), payload_len))
+		goto nla_put_failure;
+
+out:
+	genlmsg_end(msg, hdr);
+
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/* Make a private copy of @metadata using GFP_ATOMIC allocations.
+ *
+ * The trap group name, the trap name and, when present, the flow action
+ * cookie are duplicated; input_dev is copied by pointer and pinned with
+ * netdev_hold(). Fields not set here stay zero from kzalloc().
+ *
+ * Returns the copy, to be released with net_dm_hw_metadata_free(), or NULL
+ * if any allocation fails (everything allocated so far is freed first).
+ */
+static struct devlink_trap_metadata *
+net_dm_hw_metadata_copy(const struct devlink_trap_metadata *metadata)
+{
+ const struct flow_action_cookie *fa_cookie;
+ struct devlink_trap_metadata *hw_metadata;
+ const char *trap_group_name;
+ const char *trap_name;
+
+ hw_metadata = kzalloc(sizeof(*hw_metadata), GFP_ATOMIC);
+ if (!hw_metadata)
+ return NULL;
+
+ trap_group_name = kstrdup(metadata->trap_group_name, GFP_ATOMIC);
+ if (!trap_group_name)
+ goto free_hw_metadata;
+ hw_metadata->trap_group_name = trap_group_name;
+
+ trap_name = kstrdup(metadata->trap_name, GFP_ATOMIC);
+ if (!trap_name)
+ goto free_trap_group;
+ hw_metadata->trap_name = trap_name;
+
+ if (metadata->fa_cookie) {
+ /* Header plus the cookie_len bytes of cookie data. */
+ size_t cookie_size = sizeof(*fa_cookie) +
+ metadata->fa_cookie->cookie_len;
+
+ fa_cookie = kmemdup(metadata->fa_cookie, cookie_size,
+ GFP_ATOMIC);
+ if (!fa_cookie)
+ goto free_trap_name;
+ hw_metadata->fa_cookie = fa_cookie;
+ }
+
+ hw_metadata->input_dev = metadata->input_dev;
+ netdev_hold(hw_metadata->input_dev, &hw_metadata->dev_tracker,
+ GFP_ATOMIC);
+
+ return hw_metadata;
+
+free_trap_name:
+ kfree(trap_name);
+free_trap_group:
+ kfree(trap_group_name);
+free_hw_metadata:
+ kfree(hw_metadata);
+ return NULL;
+}
+
+/* Release a copy made by net_dm_hw_metadata_copy(): drop the reference on
+ * input_dev, then free the cookie, both name strings and the structure
+ * itself.
+ */
+static void
+net_dm_hw_metadata_free(struct devlink_trap_metadata *hw_metadata)
+{
+ netdev_put(hw_metadata->input_dev, &hw_metadata->dev_tracker);
+ kfree(hw_metadata->fa_cookie);
+ kfree(hw_metadata->trap_name);
+ kfree(hw_metadata->trap_group_name);
+ kfree(hw_metadata);
+}
+
+/* Build and multicast one packet-mode report for the hardware-trapped @skb.
+ * Whether or not a report could be sent, the metadata copy attached to @skb
+ * is freed and @skb is consumed.
+ */
+static void net_dm_hw_packet_report(struct sk_buff *skb)
+{
+	struct devlink_trap_metadata *hw_metadata;
+	struct sk_buff *msg;
+	size_t payload_len;
+
+	/* Start copying from the MAC header. */
+	if (skb->data > skb_mac_header(skb))
+		skb_push(skb, skb->data - skb_mac_header(skb));
+	else
+		skb_pull(skb, skb_mac_header(skb) - skb->data);
+
+	payload_len = min_t(size_t, skb->len, NET_DM_MAX_PACKET_SIZE);
+	if (net_dm_trunc_len)
+		payload_len = min_t(size_t, net_dm_trunc_len, payload_len);
+
+	hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+	msg = nlmsg_new(net_dm_hw_packet_report_size(payload_len, hw_metadata),
+			GFP_KERNEL);
+	if (msg) {
+		if (net_dm_hw_packet_report_fill(msg, skb, payload_len))
+			nlmsg_free(msg);
+		else
+			genlmsg_multicast(&net_drop_monitor_family, msg, 0, 0,
+					  GFP_KERNEL);
+	}
+
+	net_dm_hw_metadata_free(hw_metadata);
+	consume_skb(skb);
+}
+
+/* Work function for hardware drops in packet mode: move everything queued
+ * on this per-CPU drop_queue to a private list under the queue lock, then
+ * report the packets one by one outside the lock.
+ */
+static void net_dm_hw_packet_work(struct work_struct *work)
+{
+	struct per_cpu_dm_data *hw_data =
+		container_of(work, struct per_cpu_dm_data, dm_alert_work);
+	struct sk_buff_head pending;
+	unsigned long flags;
+	struct sk_buff *skb;
+
+	__skb_queue_head_init(&pending);
+
+	spin_lock_irqsave(&hw_data->drop_queue.lock, flags);
+	skb_queue_splice_tail_init(&hw_data->drop_queue, &pending);
+	spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags);
+
+	for (skb = __skb_dequeue(&pending); skb; skb = __skb_dequeue(&pending))
+		net_dm_hw_packet_report(skb);
+}
+
+/* Packet-mode probe for devlink trap reports.
+ *
+ * Ignores control traps and packets without a MAC header. Otherwise clones
+ * @skb, attaches a private copy of @metadata and the current real time to
+ * the clone, queues it on this CPU's hardware drop_queue and schedules the
+ * alert work. If the queue already holds net_dm_queue_len packets, the
+ * per-CPU 'dropped' counter is incremented and both the metadata copy and
+ * the clone are released. A failed clone or metadata copy drops the event
+ * silently.
+ */
+static void
+net_dm_hw_trap_packet_probe(void *ignore, const struct devlink *devlink,
+ struct sk_buff *skb,
+ const struct devlink_trap_metadata *metadata)
+{
+ struct devlink_trap_metadata *n_hw_metadata;
+ ktime_t tstamp = ktime_get_real();
+ struct per_cpu_dm_data *hw_data;
+ struct sk_buff *nskb;
+ unsigned long flags;
+
+ if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL)
+ return;
+
+ if (!skb_mac_header_was_set(skb))
+ return;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return;
+
+ n_hw_metadata = net_dm_hw_metadata_copy(metadata);
+ if (!n_hw_metadata)
+ goto free;
+
+ NET_DM_SKB_CB(nskb)->hw_metadata = n_hw_metadata;
+ nskb->tstamp = tstamp;
+
+ hw_data = this_cpu_ptr(&dm_hw_cpu_data);
+
+ spin_lock_irqsave(&hw_data->drop_queue.lock, flags);
+ if (skb_queue_len(&hw_data->drop_queue) < net_dm_queue_len)
+ __skb_queue_tail(&hw_data->drop_queue, nskb);
+ else
+ goto unlock_free;
+ spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags);
+
+ schedule_work(&hw_data->dm_alert_work);
+
+ return;
+
+unlock_free:
+ spin_unlock_irqrestore(&hw_data->drop_queue.lock, flags);
+ u64_stats_update_begin(&hw_data->stats.syncp);
+ u64_stats_inc(&hw_data->stats.dropped);
+ u64_stats_update_end(&hw_data->stats.syncp);
+ net_dm_hw_metadata_free(n_hw_metadata);
+free:
+ consume_skb(nskb);
+}
+
+static const struct net_dm_alert_ops net_dm_alert_packet_ops = {
+ .kfree_skb_probe = net_dm_packet_trace_kfree_skb_hit,
+ .napi_poll_probe = net_dm_packet_trace_napi_poll_hit,
+ .work_item_func = net_dm_packet_work,
+ .hw_work_item_func = net_dm_hw_packet_work,
+ .hw_trap_probe = net_dm_hw_trap_packet_probe,
+};
+
+static const struct net_dm_alert_ops *net_dm_alert_ops_arr[] = {
+ [NET_DM_ALERT_MODE_SUMMARY] = &net_dm_alert_summary_ops,
+ [NET_DM_ALERT_MODE_PACKET] = &net_dm_alert_packet_ops,
+};
+
+#if IS_ENABLED(CONFIG_NET_DEVLINK)
+static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops)
+{
+ return register_trace_devlink_trap_report(ops->hw_trap_probe, NULL);
+}
+
+static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops)
+{
+ unregister_trace_devlink_trap_report(ops->hw_trap_probe, NULL);
+ tracepoint_synchronize_unregister();
+}
+#else
+static int net_dm_hw_probe_register(const struct net_dm_alert_ops *ops)
+{
+ return -EOPNOTSUPP;
+}
+
+static void net_dm_hw_probe_unregister(const struct net_dm_alert_ops *ops)
+{
+}
+#endif
+
+static int net_dm_hw_monitor_start(struct netlink_ext_ack *extack)
+{
+ const struct net_dm_alert_ops *ops;
+ int cpu, rc;
+
+ if (monitor_hw) {
+ NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already enabled");
+ return -EAGAIN;
+ }
+
+ ops = net_dm_alert_ops_arr[net_dm_alert_mode];
+
+ if (!try_module_get(THIS_MODULE)) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module");
+ return -ENODEV;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ struct net_dm_hw_entries *hw_entries;
+
+ INIT_WORK(&hw_data->dm_alert_work, ops->hw_work_item_func);
+ timer_setup(&hw_data->send_timer, sched_send_work, 0);
+ hw_entries = net_dm_hw_reset_per_cpu_data(hw_data);
+ kfree(hw_entries);
+ }
+
+ rc = net_dm_hw_probe_register(ops);
+ if (rc) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to devlink_trap_probe() tracepoint");
+ goto err_module_put;
+ }
+
+ monitor_hw = true;
+
+ return 0;
+
+err_module_put:
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ timer_delete_sync(&hw_data->send_timer);
+ cancel_work_sync(&hw_data->dm_alert_work);
+ while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
+ struct devlink_trap_metadata *hw_metadata;
+
+ hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+ net_dm_hw_metadata_free(hw_metadata);
+ consume_skb(skb);
}
+ }
+ module_put(THIS_MODULE);
+ return rc;
+}
+
+static void net_dm_hw_monitor_stop(struct netlink_ext_ack *extack)
+{
+ const struct net_dm_alert_ops *ops;
+ int cpu;
+
+ if (!monitor_hw) {
+ NL_SET_ERR_MSG_MOD(extack, "Hardware monitoring already disabled");
+ return;
+ }
+
+ ops = net_dm_alert_ops_arr[net_dm_alert_mode];
+
+ monitor_hw = false;
+
+ net_dm_hw_probe_unregister(ops);
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ timer_delete_sync(&hw_data->send_timer);
+ cancel_work_sync(&hw_data->dm_alert_work);
+ while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
+ struct devlink_trap_metadata *hw_metadata;
+
+ hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+ net_dm_hw_metadata_free(hw_metadata);
+ consume_skb(skb);
+ }
+ }
+
+ module_put(THIS_MODULE);
+}
+
+static int net_dm_trace_on_set(struct netlink_ext_ack *extack)
+{
+ const struct net_dm_alert_ops *ops;
+ int cpu, rc;
+
+ ops = net_dm_alert_ops_arr[net_dm_alert_mode];
+
+ if (!try_module_get(THIS_MODULE)) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to take reference on module");
+ return -ENODEV;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ INIT_WORK(&data->dm_alert_work, ops->work_item_func);
+ timer_setup(&data->send_timer, sched_send_work, 0);
+ /* Allocate a new per-CPU skb for the summary alert message and
+ * free the old one which might contain stale data from
+ * previous tracing.
+ */
+ skb = reset_per_cpu_data(data);
+ consume_skb(skb);
+ }
+
+ rc = register_trace_kfree_skb(ops->kfree_skb_probe, NULL);
+ if (rc) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to kfree_skb() tracepoint");
+ goto err_module_put;
+ }
+
+ rc = register_trace_napi_poll(ops->napi_poll_probe, NULL);
+ if (rc) {
+ NL_SET_ERR_MSG_MOD(extack, "Failed to connect probe to napi_poll() tracepoint");
+ goto err_unregister_trace;
+ }
+
+ return 0;
+
+err_unregister_trace:
+ unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL);
+err_module_put:
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ timer_delete_sync(&data->send_timer);
+ cancel_work_sync(&data->dm_alert_work);
+ while ((skb = __skb_dequeue(&data->drop_queue)))
+ consume_skb(skb);
+ }
+ module_put(THIS_MODULE);
+ return rc;
+}
+
+static void net_dm_trace_off_set(void)
+{
+ const struct net_dm_alert_ops *ops;
+ int cpu;
+
+ ops = net_dm_alert_ops_arr[net_dm_alert_mode];
+
+ unregister_trace_napi_poll(ops->napi_poll_probe, NULL);
+ unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL);
+
+ tracepoint_synchronize_unregister();
+
+ /* Make sure we do not send notifications to user space after request
+ * to stop tracing returns.
+ */
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+ struct sk_buff *skb;
+
+ timer_delete_sync(&data->send_timer);
+ cancel_work_sync(&data->dm_alert_work);
+ while ((skb = __skb_dequeue(&data->drop_queue)))
+ consume_skb(skb);
+ }
- module_put(THIS_MODULE);
+ module_put(THIS_MODULE);
+}
+
+static int set_all_monitor_traces(int state, struct netlink_ext_ack *extack)
+{
+ int rc = 0;
+
+ if (state == trace_state) {
+ NL_SET_ERR_MSG_MOD(extack, "Trace state already set to requested state");
+ return -EAGAIN;
+ }
+ switch (state) {
+ case TRACE_ON:
+ rc = net_dm_trace_on_set(extack);
+ break;
+ case TRACE_OFF:
+ net_dm_trace_off_set();
break;
default:
rc = 1;
@@ -287,93 +1246,426 @@ static int set_all_monitor_traces(int state)
else
rc = -EINPROGRESS;
-out_unlock:
- mutex_unlock(&trace_state_mutex);
-
return rc;
}
+static bool net_dm_is_monitoring(void)
+{
+ return trace_state == TRACE_ON || monitor_hw;
+}
+
+static int net_dm_alert_mode_get_from_info(struct genl_info *info,
+ enum net_dm_alert_mode *p_alert_mode)
+{
+ u8 val;
+
+ val = nla_get_u8(info->attrs[NET_DM_ATTR_ALERT_MODE]);
+
+ switch (val) {
+ case NET_DM_ALERT_MODE_SUMMARY:
+ case NET_DM_ALERT_MODE_PACKET:
+ *p_alert_mode = val;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int net_dm_alert_mode_set(struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ enum net_dm_alert_mode alert_mode;
+ int rc;
+
+ if (!info->attrs[NET_DM_ATTR_ALERT_MODE])
+ return 0;
+
+ rc = net_dm_alert_mode_get_from_info(info, &alert_mode);
+ if (rc) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid alert mode");
+ return -EINVAL;
+ }
+
+ net_dm_alert_mode = alert_mode;
+
+ return 0;
+}
+
+static void net_dm_trunc_len_set(struct genl_info *info)
+{
+ if (!info->attrs[NET_DM_ATTR_TRUNC_LEN])
+ return;
+
+ net_dm_trunc_len = nla_get_u32(info->attrs[NET_DM_ATTR_TRUNC_LEN]);
+}
+
+static void net_dm_queue_len_set(struct genl_info *info)
+{
+ if (!info->attrs[NET_DM_ATTR_QUEUE_LEN])
+ return;
+
+ net_dm_queue_len = nla_get_u32(info->attrs[NET_DM_ATTR_QUEUE_LEN]);
+}
static int net_dm_cmd_config(struct sk_buff *skb,
struct genl_info *info)
{
- return -ENOTSUPP;
+ struct netlink_ext_ack *extack = info->extack;
+ int rc;
+
+ if (net_dm_is_monitoring()) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot configure drop monitor during monitoring");
+ return -EBUSY;
+ }
+
+ rc = net_dm_alert_mode_set(info);
+ if (rc)
+ return rc;
+
+ net_dm_trunc_len_set(info);
+
+ net_dm_queue_len_set(info);
+
+ return 0;
+}
+
+static int net_dm_monitor_start(bool set_sw, bool set_hw,
+ struct netlink_ext_ack *extack)
+{
+ bool sw_set = false;
+ int rc;
+
+ if (set_sw) {
+ rc = set_all_monitor_traces(TRACE_ON, extack);
+ if (rc)
+ return rc;
+ sw_set = true;
+ }
+
+ if (set_hw) {
+ rc = net_dm_hw_monitor_start(extack);
+ if (rc)
+ goto err_monitor_hw;
+ }
+
+ return 0;
+
+err_monitor_hw:
+ if (sw_set)
+ set_all_monitor_traces(TRACE_OFF, extack);
+ return rc;
+}
+
+static void net_dm_monitor_stop(bool set_sw, bool set_hw,
+ struct netlink_ext_ack *extack)
+{
+ if (set_hw)
+ net_dm_hw_monitor_stop(extack);
+ if (set_sw)
+ set_all_monitor_traces(TRACE_OFF, extack);
}
static int net_dm_cmd_trace(struct sk_buff *skb,
struct genl_info *info)
{
+ bool set_sw = !!info->attrs[NET_DM_ATTR_SW_DROPS];
+ bool set_hw = !!info->attrs[NET_DM_ATTR_HW_DROPS];
+ struct netlink_ext_ack *extack = info->extack;
+
+ /* To maintain backward compatibility, we start / stop monitoring of
+ * software drops if no flag is specified.
+ */
+ if (!set_sw && !set_hw)
+ set_sw = true;
+
switch (info->genlhdr->cmd) {
case NET_DM_CMD_START:
- return set_all_monitor_traces(TRACE_ON);
+ return net_dm_monitor_start(set_sw, set_hw, extack);
case NET_DM_CMD_STOP:
- return set_all_monitor_traces(TRACE_OFF);
+ net_dm_monitor_stop(set_sw, set_hw, extack);
+ return 0;
}
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
+}
+
+static int net_dm_config_fill(struct sk_buff *msg, struct genl_info *info)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &net_drop_monitor_family, 0, NET_DM_CMD_CONFIG_NEW);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(msg, NET_DM_ATTR_ALERT_MODE, net_dm_alert_mode))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NET_DM_ATTR_TRUNC_LEN, net_dm_trunc_len))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NET_DM_ATTR_QUEUE_LEN, net_dm_queue_len))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int net_dm_cmd_config_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ int rc;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ rc = net_dm_config_fill(msg, info);
+ if (rc)
+ goto free_msg;
+
+ return genlmsg_reply(msg, info);
+
+free_msg:
+ nlmsg_free(msg);
+ return rc;
+}
+
+static void net_dm_stats_read(struct net_dm_stats *stats)
+{
+ int cpu;
+
+ memset(stats, 0, sizeof(*stats));
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+ struct net_dm_stats *cpu_stats = &data->stats;
+ unsigned int start;
+ u64 dropped;
+
+ do {
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
+ dropped = u64_stats_read(&cpu_stats->dropped);
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
+
+ u64_stats_add(&stats->dropped, dropped);
+ }
+}
+
+static int net_dm_stats_put(struct sk_buff *msg)
+{
+ struct net_dm_stats stats;
+ struct nlattr *attr;
+
+ net_dm_stats_read(&stats);
+
+ attr = nla_nest_start(msg, NET_DM_ATTR_STATS);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED,
+ u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static void net_dm_hw_stats_read(struct net_dm_stats *stats)
+{
+ int cpu;
+
+ memset(stats, 0, sizeof(*stats));
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ struct net_dm_stats *cpu_stats = &hw_data->stats;
+ unsigned int start;
+ u64 dropped;
+
+ do {
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
+ dropped = u64_stats_read(&cpu_stats->dropped);
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
+
+ u64_stats_add(&stats->dropped, dropped);
+ }
+}
+
+static int net_dm_hw_stats_put(struct sk_buff *msg)
+{
+ struct net_dm_stats stats;
+ struct nlattr *attr;
+
+ net_dm_hw_stats_read(&stats);
+
+ attr = nla_nest_start(msg, NET_DM_ATTR_HW_STATS);
+ if (!attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(msg, NET_DM_ATTR_STATS_DROPPED,
+ u64_stats_read(&stats.dropped), NET_DM_ATTR_PAD))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, attr);
+
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, attr);
+ return -EMSGSIZE;
+}
+
+static int net_dm_stats_fill(struct sk_buff *msg, struct genl_info *info)
+{
+ void *hdr;
+ int rc;
+
+ hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &net_drop_monitor_family, 0, NET_DM_CMD_STATS_NEW);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ rc = net_dm_stats_put(msg);
+ if (rc)
+ goto nla_put_failure;
+
+ rc = net_dm_hw_stats_put(msg);
+ if (rc)
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int net_dm_cmd_stats_get(struct sk_buff *skb, struct genl_info *info)
+{
+ struct sk_buff *msg;
+ int rc;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ rc = net_dm_stats_fill(msg, info);
+ if (rc)
+ goto free_msg;
+
+ return genlmsg_reply(msg, info);
+
+free_msg:
+ nlmsg_free(msg);
+ return rc;
}
static int dropmon_net_event(struct notifier_block *ev_block,
unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct dm_hw_stat_delta *new_stat = NULL;
- struct dm_hw_stat_delta *tmp;
+ struct dm_hw_stat_delta *stat;
switch (event) {
case NETDEV_REGISTER:
- new_stat = kzalloc(sizeof(struct dm_hw_stat_delta), GFP_KERNEL);
+ if (WARN_ON_ONCE(rtnl_dereference(dev->dm_private)))
+ break;
+ stat = kzalloc(sizeof(*stat), GFP_KERNEL);
+ if (!stat)
+ break;
- if (!new_stat)
- goto out;
+ stat->last_rx = jiffies;
+ rcu_assign_pointer(dev->dm_private, stat);
- new_stat->dev = dev;
- new_stat->last_rx = jiffies;
- mutex_lock(&trace_state_mutex);
- list_add_rcu(&new_stat->list, &hw_stats_list);
- mutex_unlock(&trace_state_mutex);
break;
case NETDEV_UNREGISTER:
- mutex_lock(&trace_state_mutex);
- list_for_each_entry_safe(new_stat, tmp, &hw_stats_list, list) {
- if (new_stat->dev == dev) {
- new_stat->dev = NULL;
- if (trace_state == TRACE_OFF) {
- list_del_rcu(&new_stat->list);
- kfree_rcu(new_stat, rcu);
- break;
- }
- }
+ stat = rtnl_dereference(dev->dm_private);
+ if (stat) {
+ rcu_assign_pointer(dev->dm_private, NULL);
+ kfree_rcu(stat, rcu);
}
- mutex_unlock(&trace_state_mutex);
break;
}
-out:
return NOTIFY_DONE;
}
-static const struct genl_ops dropmon_ops[] = {
+static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = {
+ [NET_DM_ATTR_UNSPEC] = { .strict_start_type = NET_DM_ATTR_UNSPEC + 1 },
+ [NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 },
+ [NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 },
+ [NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 },
+ [NET_DM_ATTR_SW_DROPS] = {. type = NLA_FLAG },
+ [NET_DM_ATTR_HW_DROPS] = {. type = NLA_FLAG },
+};
+
+static const struct genl_small_ops dropmon_ops[] = {
{
.cmd = NET_DM_CMD_CONFIG,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = net_dm_cmd_config,
+ .flags = GENL_ADMIN_PERM,
},
{
.cmd = NET_DM_CMD_START,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = net_dm_cmd_trace,
+ .flags = GENL_ADMIN_PERM,
},
{
.cmd = NET_DM_CMD_STOP,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = net_dm_cmd_trace,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = NET_DM_CMD_CONFIG_GET,
+ .doit = net_dm_cmd_config_get,
+ },
+ {
+ .cmd = NET_DM_CMD_STATS_GET,
+ .doit = net_dm_cmd_stats_get,
},
};
+static int net_dm_nl_pre_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ mutex_lock(&net_dm_mutex);
+
+ return 0;
+}
+
+static void net_dm_nl_post_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ mutex_unlock(&net_dm_mutex);
+}
+
static struct genl_family net_drop_monitor_family __ro_after_init = {
.hdrsize = 0,
.name = "NET_DM",
.version = 2,
+ .maxattr = NET_DM_ATTR_MAX,
+ .policy = net_dm_nl_policy,
+ .pre_doit = net_dm_nl_pre_doit,
+ .post_doit = net_dm_nl_post_doit,
.module = THIS_MODULE,
- .ops = dropmon_ops,
- .n_ops = ARRAY_SIZE(dropmon_ops),
+ .small_ops = dropmon_ops,
+ .n_small_ops = ARRAY_SIZE(dropmon_ops),
+ .resv_start_op = NET_DM_CMD_STATS_GET + 1,
.mcgrps = dropmon_mcgrps,
.n_mcgrps = ARRAY_SIZE(dropmon_mcgrps),
};
@@ -382,9 +1674,57 @@ static struct notifier_block dropmon_net_notifier = {
.notifier_call = dropmon_net_event
};
-static int __init init_net_drop_monitor(void)
+static void __net_dm_cpu_data_init(struct per_cpu_dm_data *data)
+{
+ raw_spin_lock_init(&data->lock);
+ skb_queue_head_init(&data->drop_queue);
+ u64_stats_init(&data->stats.syncp);
+}
+
+static void __net_dm_cpu_data_fini(struct per_cpu_dm_data *data)
+{
+ WARN_ON(!skb_queue_empty(&data->drop_queue));
+}
+
+static void net_dm_cpu_data_init(int cpu)
{
struct per_cpu_dm_data *data;
+
+ data = &per_cpu(dm_cpu_data, cpu);
+ __net_dm_cpu_data_init(data);
+}
+
+static void net_dm_cpu_data_fini(int cpu)
+{
+ struct per_cpu_dm_data *data;
+
+ data = &per_cpu(dm_cpu_data, cpu);
+ /* At this point, we should have exclusive access
+ * to this struct and can free the skb inside it.
+ */
+ consume_skb(data->skb);
+ __net_dm_cpu_data_fini(data);
+}
+
+static void net_dm_hw_cpu_data_init(int cpu)
+{
+ struct per_cpu_dm_data *hw_data;
+
+ hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ __net_dm_cpu_data_init(hw_data);
+}
+
+static void net_dm_hw_cpu_data_fini(int cpu)
+{
+ struct per_cpu_dm_data *hw_data;
+
+ hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+ kfree(hw_data->hw_entries);
+ __net_dm_cpu_data_fini(hw_data);
+}
+
+static int __init init_net_drop_monitor(void)
+{
int cpu, rc;
pr_info("Initializing network drop monitor service\n");
@@ -394,64 +1734,50 @@ static int __init init_net_drop_monitor(void)
return -ENOSPC;
}
- rc = genl_register_family(&net_drop_monitor_family);
- if (rc) {
- pr_err("Could not create drop monitor netlink family\n");
- return rc;
+ for_each_possible_cpu(cpu) {
+ net_dm_cpu_data_init(cpu);
+ net_dm_hw_cpu_data_init(cpu);
}
- WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);
rc = register_netdevice_notifier(&dropmon_net_notifier);
if (rc < 0) {
pr_crit("Failed to register netdevice notifier\n");
- goto out_unreg;
+ return rc;
}
- rc = 0;
-
- for_each_possible_cpu(cpu) {
- data = &per_cpu(dm_cpu_data, cpu);
- INIT_WORK(&data->dm_alert_work, send_dm_alert);
- timer_setup(&data->send_timer, sched_send_work, 0);
- spin_lock_init(&data->lock);
- reset_per_cpu_data(data);
+ rc = genl_register_family(&net_drop_monitor_family);
+ if (rc) {
+ pr_err("Could not create drop monitor netlink family\n");
+ goto out_unreg;
}
+ WARN_ON(net_drop_monitor_family.mcgrp_offset != NET_DM_GRP_ALERT);
+ rc = 0;
goto out;
out_unreg:
- genl_unregister_family(&net_drop_monitor_family);
+ WARN_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
out:
return rc;
}
static void exit_net_drop_monitor(void)
{
- struct per_cpu_dm_data *data;
int cpu;
- BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
-
/*
* Because of the module_get/put we do in the trace state change path
- * we are guarnateed not to have any current users when we get here
- * all we need to do is make sure that we don't have any running timers
- * or pending schedule calls
+ * we are guaranteed not to have any current users when we get here
*/
+ BUG_ON(genl_unregister_family(&net_drop_monitor_family));
+
+ BUG_ON(unregister_netdevice_notifier(&dropmon_net_notifier));
for_each_possible_cpu(cpu) {
- data = &per_cpu(dm_cpu_data, cpu);
- del_timer_sync(&data->send_timer);
- cancel_work_sync(&data->dm_alert_work);
- /*
- * At this point, we should have exclusive access
- * to this struct and can free the skb inside it
- */
- kfree_skb(data->skb);
+ net_dm_hw_cpu_data_fini(cpu);
+ net_dm_cpu_data_fini(cpu);
}
-
- BUG_ON(genl_unregister_family(&net_drop_monitor_family));
}
module_init(init_net_drop_monitor);
@@ -460,3 +1786,4 @@ module_exit(exit_net_drop_monitor);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Neil Horman <nhorman@tuxdriver.com>");
MODULE_ALIAS_GENL_FAMILY("NET_DM");
+MODULE_DESCRIPTION("Monitoring code for network dropped packet alerts");
diff --git a/net/core/dst.c b/net/core/dst.c
index 81ccf20e2826..e9d35f49c9e7 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/core/dst.c Protocol independent destination cache.
*
@@ -26,23 +27,6 @@
#include <net/dst.h>
#include <net/dst_metadata.h>
-/*
- * Theory of operations:
- * 1) We use a list, protected by a spinlock, to add
- * new entries from both BH and non-BH context.
- * 2) In order to keep spinlock held for a small delay,
- * we use a second list where are stored long lived
- * entries, that are handled by the garbage collect thread
- * fired by a workqueue.
- * 3) This list is guarded by a mutex,
- * so that the gc_task and dst_dev_event() can be synchronized.
- */
-
-/*
- * We want to keep lock & list close together
- * to dirty as few cache lines as possible in __dst_free().
- * As this is not a very strong hint, we dont force an alignment on SMP.
- */
int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
kfree_skb(skb);
@@ -61,12 +45,11 @@ const struct dst_metrics dst_default_metrics = {
EXPORT_SYMBOL(dst_default_metrics);
void dst_init(struct dst_entry *dst, struct dst_ops *ops,
- struct net_device *dev, int initial_ref, int initial_obsolete,
+ struct net_device *dev, int initial_obsolete,
unsigned short flags)
{
dst->dev = dev;
- if (dev)
- dev_hold(dev);
+ netdev_hold(dev, &dst->dev_tracker, GFP_ATOMIC);
dst->ops = ops;
dst_init_metrics(dst, dst_default_metrics.metrics, true);
dst->expires = 0UL;
@@ -83,7 +66,8 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
dst->tclassid = 0;
#endif
dst->lwtstate = NULL;
- atomic_set(&dst->__refcnt, initial_ref);
+ rcuref_init(&dst->__rcuref, 1);
+ INIT_LIST_HEAD(&dst->rt_uncached);
dst->__use = 0;
dst->lastuse = jiffies;
dst->flags = flags;
@@ -93,26 +77,26 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops,
EXPORT_SYMBOL(dst_init);
void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
- int initial_ref, int initial_obsolete, unsigned short flags)
+ int initial_obsolete, unsigned short flags)
{
struct dst_entry *dst;
- if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
- if (ops->gc(ops))
- return NULL;
- }
+ if (ops->gc &&
+ !(flags & DST_NOCOUNT) &&
+ dst_entries_get_fast(ops) > ops->gc_thresh)
+ ops->gc(ops);
dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
if (!dst)
return NULL;
- dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags);
+ dst_init(dst, ops, dev, initial_obsolete, flags);
return dst;
}
EXPORT_SYMBOL(dst_alloc);
-struct dst_entry *dst_destroy(struct dst_entry * dst)
+static void dst_destroy(struct dst_entry *dst)
{
struct dst_entry *child = NULL;
@@ -125,13 +109,9 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
child = xdst->child;
}
#endif
- if (!(dst->flags & DST_NOCOUNT))
- dst_entries_add(dst->ops, -1);
-
if (dst->ops->destroy)
dst->ops->destroy(dst);
- if (dst->dev)
- dev_put(dst->dev);
+ netdev_put(dst->dev, &dst->dev_tracker);
lwtstate_put(dst->lwtstate);
@@ -143,20 +123,18 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
dst = child;
if (dst)
dst_release_immediate(dst);
- return NULL;
}
-EXPORT_SYMBOL(dst_destroy);
static void dst_destroy_rcu(struct rcu_head *head)
{
struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
- dst = dst_destroy(dst);
+ dst_destroy(dst);
}
/* Operations to mark dst as DEAD and clean up the net device referenced
* by dst:
- * 1. put the dst under loopback interface and discard all tx/rx packets
+ * 1. put the dst under blackhole interface and discard all tx/rx packets
* on this route.
* 2. release the net_device
* This function should be called when removing routes from the fib tree
@@ -167,43 +145,45 @@ void dst_dev_put(struct dst_entry *dst)
{
struct net_device *dev = dst->dev;
- dst->obsolete = DST_OBSOLETE_DEAD;
+ WRITE_ONCE(dst->obsolete, DST_OBSOLETE_DEAD);
if (dst->ops->ifdown)
- dst->ops->ifdown(dst, dev, true);
- dst->input = dst_discard;
- dst->output = dst_discard_out;
- dst->dev = dev_net(dst->dev)->loopback_dev;
- dev_hold(dst->dev);
- dev_put(dev);
+ dst->ops->ifdown(dst, dev);
+ WRITE_ONCE(dst->input, dst_discard);
+ WRITE_ONCE(dst->output, dst_discard_out);
+ rcu_assign_pointer(dst->dev_rcu, blackhole_netdev);
+ netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker,
+ GFP_ATOMIC);
}
EXPORT_SYMBOL(dst_dev_put);
+static void dst_count_dec(struct dst_entry *dst)
+{
+ if (!(dst->flags & DST_NOCOUNT))
+ dst_entries_add(dst->ops, -1);
+}
+
void dst_release(struct dst_entry *dst)
{
- if (dst) {
- int newrefcnt;
-
- newrefcnt = atomic_dec_return(&dst->__refcnt);
- if (unlikely(newrefcnt < 0))
- net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
- __func__, dst, newrefcnt);
- if (!newrefcnt)
- call_rcu(&dst->rcu_head, dst_destroy_rcu);
+ if (dst && rcuref_put(&dst->__rcuref)) {
+#ifdef CONFIG_DST_CACHE
+ if (dst->flags & DST_METADATA) {
+ struct metadata_dst *md_dst = (struct metadata_dst *)dst;
+
+ if (md_dst->type == METADATA_IP_TUNNEL)
+ dst_cache_reset_now(&md_dst->u.tun_info.dst_cache);
+ }
+#endif
+ dst_count_dec(dst);
+ call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
}
}
EXPORT_SYMBOL(dst_release);
void dst_release_immediate(struct dst_entry *dst)
{
- if (dst) {
- int newrefcnt;
-
- newrefcnt = atomic_dec_return(&dst->__refcnt);
- if (unlikely(newrefcnt < 0))
- net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
- __func__, dst, newrefcnt);
- if (!newrefcnt)
- dst_destroy(dst);
+ if (dst && rcuref_put(&dst->__rcuref)) {
+ dst_count_dec(dst);
+ dst_destroy(dst);
}
}
EXPORT_SYMBOL(dst_release_immediate);
@@ -249,37 +229,62 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old)
}
EXPORT_SYMBOL(__dst_destroy_metrics_generic);
-static struct dst_ops md_dst_ops = {
- .family = AF_UNSPEC,
-};
+struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie)
+{
+ return NULL;
+}
-static int dst_md_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
+u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old)
{
- WARN_ONCE(1, "Attempting to call output on metadata dst\n");
- kfree_skb(skb);
- return 0;
+ return NULL;
}
-static int dst_md_discard(struct sk_buff *skb)
+struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst,
+ struct sk_buff *skb,
+ const void *daddr)
+{
+ return NULL;
+}
+
+void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
+{
+}
+EXPORT_SYMBOL_GPL(dst_blackhole_update_pmtu);
+
+void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb)
{
- WARN_ONCE(1, "Attempting to call input on metadata dst\n");
- kfree_skb(skb);
- return 0;
}
+EXPORT_SYMBOL_GPL(dst_blackhole_redirect);
+
+unsigned int dst_blackhole_mtu(const struct dst_entry *dst)
+{
+ unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
+
+ return mtu ? : dst_dev(dst)->mtu;
+}
+EXPORT_SYMBOL_GPL(dst_blackhole_mtu);
+
+static struct dst_ops dst_blackhole_ops = {
+ .family = AF_UNSPEC,
+ .neigh_lookup = dst_blackhole_neigh_lookup,
+ .check = dst_blackhole_check,
+ .cow_metrics = dst_blackhole_cow_metrics,
+ .update_pmtu = dst_blackhole_update_pmtu,
+ .redirect = dst_blackhole_redirect,
+ .mtu = dst_blackhole_mtu,
+};
static void __metadata_dst_init(struct metadata_dst *md_dst,
enum metadata_type type, u8 optslen)
-
{
struct dst_entry *dst;
dst = &md_dst->dst;
- dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE,
+ dst_init(dst, &dst_blackhole_ops, NULL, DST_OBSOLETE_NONE,
DST_METADATA | DST_NOCOUNT);
-
- dst->input = dst_md_discard;
- dst->output = dst_md_discard_out;
-
memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst));
md_dst->type = type;
}
@@ -289,7 +294,8 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type,
{
struct metadata_dst *md_dst;
- md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
+ md_dst = kmalloc(struct_size(md_dst, u.tun_info.options, optslen),
+ flags);
if (!md_dst)
return NULL;
@@ -305,6 +311,8 @@ void metadata_dst_free(struct metadata_dst *md_dst)
if (md_dst->type == METADATA_IP_TUNNEL)
dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
#endif
+ if (md_dst->type == METADATA_XFRM)
+ dst_release(md_dst->u.xfrm_info.dst_orig);
kfree(md_dst);
}
EXPORT_SYMBOL_GPL(metadata_dst_free);
@@ -315,7 +323,8 @@ metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
int cpu;
struct metadata_dst __percpu *md_dst;
- md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen,
+ md_dst = __alloc_percpu_gfp(struct_size(md_dst, u.tun_info.options,
+ optslen),
__alignof__(struct metadata_dst), flags);
if (!md_dst)
return NULL;
@@ -329,16 +338,18 @@ EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu);
void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst)
{
-#ifdef CONFIG_DST_CACHE
int cpu;
for_each_possible_cpu(cpu) {
struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu);
+#ifdef CONFIG_DST_CACHE
if (one_md_dst->type == METADATA_IP_TUNNEL)
dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache);
- }
#endif
+ if (one_md_dst->type == METADATA_XFRM)
+ dst_release(one_md_dst->u.xfrm_info.dst_orig);
+ }
free_percpu(md_dst);
}
EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
index 64cef977484a..9ab4902324e1 100644
--- a/net/core/dst_cache.c
+++ b/net/core/dst_cache.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/core/dst_cache.c - dst entry cache
*
* Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -21,6 +17,7 @@
struct dst_cache_pcpu {
unsigned long refresh_ts;
struct dst_entry *dst;
+ local_lock_t bh_lock;
u32 cookie;
union {
struct in_addr in_saddr;
@@ -31,6 +28,7 @@ struct dst_cache_pcpu {
static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache,
struct dst_entry *dst, u32 cookie)
{
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
dst_release(dst_cache->dst);
if (dst)
dst_hold(dst);
@@ -44,6 +42,7 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
{
struct dst_entry *dst;
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
dst = idst->dst;
if (!dst)
goto fail;
@@ -51,8 +50,9 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
/* the cache already hold a dst reference; it can't go away */
dst_hold(dst);
- if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) ||
- (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) {
+ if (unlikely(!time_after(idst->refresh_ts,
+ READ_ONCE(dst_cache->reset_ts)) ||
+ (READ_ONCE(dst->obsolete) && !dst->ops->check(dst, idst->cookie)))) {
dst_cache_per_cpu_dst_set(idst, NULL, 0);
dst_release(dst);
goto fail;
@@ -66,10 +66,15 @@ fail:
struct dst_entry *dst_cache_get(struct dst_cache *dst_cache)
{
+ struct dst_entry *dst;
+
if (!dst_cache->cache)
return NULL;
- return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+ dst = dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
+ return dst;
}
EXPORT_SYMBOL_GPL(dst_cache_get);
@@ -81,13 +86,17 @@ struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)
if (!dst_cache->cache)
return NULL;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
idst = this_cpu_ptr(dst_cache->cache);
dst = dst_cache_per_cpu_get(dst_cache, idst);
- if (!dst)
+ if (!dst) {
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return NULL;
+ }
*saddr = idst->in_saddr.s_addr;
- return container_of(dst, struct rtable, dst);
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
+ return dst_rtable(dst);
}
EXPORT_SYMBOL_GPL(dst_cache_get_ip4);
@@ -99,9 +108,11 @@ void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
if (!dst_cache->cache)
return;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
idst = this_cpu_ptr(dst_cache->cache);
dst_cache_per_cpu_dst_set(idst, dst, 0);
idst->in_saddr.s_addr = saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
}
EXPORT_SYMBOL_GPL(dst_cache_set_ip4);
@@ -114,10 +125,13 @@ void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
if (!dst_cache->cache)
return;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+
idst = this_cpu_ptr(dst_cache->cache);
- dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst,
- rt6_get_cookie((struct rt6_info *)dst));
+ dst_cache_per_cpu_dst_set(idst, dst,
+ rt6_get_cookie(dst_rt6_info(dst)));
idst->in6_saddr = *saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
}
EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
@@ -130,12 +144,17 @@ struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
if (!dst_cache->cache)
return NULL;
+ local_lock_nested_bh(&dst_cache->cache->bh_lock);
+
idst = this_cpu_ptr(dst_cache->cache);
dst = dst_cache_per_cpu_get(dst_cache, idst);
- if (!dst)
+ if (!dst) {
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return NULL;
+ }
*saddr = idst->in6_saddr;
+ local_unlock_nested_bh(&dst_cache->cache->bh_lock);
return dst;
}
EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
@@ -143,10 +162,14 @@ EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp)
{
+ unsigned int i;
+
dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu,
gfp | __GFP_ZERO);
if (!dst_cache->cache)
return -ENOMEM;
+ for_each_possible_cpu(i)
+ local_lock_init(&per_cpu_ptr(dst_cache->cache, i)->bh_lock);
dst_cache_reset(dst_cache);
return 0;
@@ -166,3 +189,22 @@ void dst_cache_destroy(struct dst_cache *dst_cache)
free_percpu(dst_cache->cache);
}
EXPORT_SYMBOL_GPL(dst_cache_destroy);
+
+void dst_cache_reset_now(struct dst_cache *dst_cache)
+{
+ int i;
+
+ if (!dst_cache->cache)
+ return;
+
+ dst_cache_reset(dst_cache);
+ for_each_possible_cpu(i) {
+ struct dst_cache_pcpu *idst = per_cpu_ptr(dst_cache->cache, i);
+ struct dst_entry *dst = idst->dst;
+
+ idst->cookie = 0;
+ idst->dst = NULL;
+ dst_release(dst);
+ }
+}
+EXPORT_SYMBOL_GPL(dst_cache_reset_now);
diff --git a/net/core/failover.c b/net/core/failover.c
index 4a92a98ccce9..2a140b3ea669 100644
--- a/net/core/failover.c
+++ b/net/core/failover.c
@@ -80,14 +80,14 @@ static int failover_slave_register(struct net_device *slave_dev)
goto err_upper_link;
}
- slave_dev->priv_flags |= IFF_FAILOVER_SLAVE;
+ slave_dev->priv_flags |= (IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF);
if (fops && fops->slave_register &&
!fops->slave_register(slave_dev, failover_dev))
return NOTIFY_OK;
netdev_upper_dev_unlink(slave_dev, failover_dev);
- slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+ slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF);
err_upper_link:
netdev_rx_handler_unregister(slave_dev);
done:
@@ -121,7 +121,7 @@ int failover_slave_unregister(struct net_device *slave_dev)
netdev_rx_handler_unregister(slave_dev);
netdev_upper_dev_unlink(slave_dev, failover_dev);
- slave_dev->priv_flags &= ~IFF_FAILOVER_SLAVE;
+ slave_dev->priv_flags &= ~(IFF_FAILOVER_SLAVE | IFF_NO_ADDRCONF);
if (fops && fops->slave_unregister &&
!fops->slave_unregister(slave_dev, failover_dev))
@@ -252,7 +252,7 @@ struct failover *failover_register(struct net_device *dev,
return ERR_PTR(-ENOMEM);
rcu_assign_pointer(failover->ops, ops);
- dev_hold(dev);
+ netdev_hold(dev, &failover->dev_tracker, GFP_KERNEL);
dev->priv_flags |= IFF_FAILOVER;
rcu_assign_pointer(failover->failover_dev, dev);
@@ -285,7 +285,7 @@ void failover_unregister(struct failover *failover)
failover_dev->name);
failover_dev->priv_flags &= ~IFF_FAILOVER;
- dev_put(failover_dev);
+ netdev_put(failover_dev, &failover->dev_tracker);
spin_lock(&failover_lock);
list_del(&failover->list);
diff --git a/net/core/fib_notifier.c b/net/core/fib_notifier.c
index 13a40b831d6d..5cdca49b1d7c 100644
--- a/net/core/fib_notifier.c
+++ b/net/core/fib_notifier.c
@@ -5,17 +5,22 @@
#include <linux/module.h>
#include <linux/init.h>
#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include <net/fib_notifier.h>
-static ATOMIC_NOTIFIER_HEAD(fib_chain);
+static unsigned int fib_notifier_net_id;
-int call_fib_notifier(struct notifier_block *nb, struct net *net,
+struct fib_notifier_net {
+ struct list_head fib_notifier_ops;
+ struct atomic_notifier_head fib_chain;
+};
+
+int call_fib_notifier(struct notifier_block *nb,
enum fib_event_type event_type,
struct fib_notifier_info *info)
{
int err;
- info->net = net;
err = nb->notifier_call(nb, event_type, info);
return notifier_to_errno(err);
}
@@ -24,115 +29,111 @@ EXPORT_SYMBOL(call_fib_notifier);
int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
struct fib_notifier_info *info)
{
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
int err;
- info->net = net;
- err = atomic_notifier_call_chain(&fib_chain, event_type, info);
+ err = atomic_notifier_call_chain(&fn_net->fib_chain, event_type, info);
return notifier_to_errno(err);
}
EXPORT_SYMBOL(call_fib_notifiers);
-static unsigned int fib_seq_sum(void)
+static unsigned int fib_seq_sum(struct net *net)
{
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
struct fib_notifier_ops *ops;
unsigned int fib_seq = 0;
- struct net *net;
-
- rtnl_lock();
- down_read(&net_rwsem);
- for_each_net(net) {
- rcu_read_lock();
- list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) {
- if (!try_module_get(ops->owner))
- continue;
- fib_seq += ops->fib_seq_read(net);
- module_put(ops->owner);
- }
- rcu_read_unlock();
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) {
+ if (!try_module_get(ops->owner))
+ continue;
+ fib_seq += ops->fib_seq_read(net);
+ module_put(ops->owner);
}
- up_read(&net_rwsem);
- rtnl_unlock();
+ rcu_read_unlock();
return fib_seq;
}
-static int fib_net_dump(struct net *net, struct notifier_block *nb)
+static int fib_net_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
struct fib_notifier_ops *ops;
+ int err = 0;
- list_for_each_entry_rcu(ops, &net->fib_notifier_ops, list) {
- int err;
-
+ rcu_read_lock();
+ list_for_each_entry_rcu(ops, &fn_net->fib_notifier_ops, list) {
if (!try_module_get(ops->owner))
continue;
- err = ops->fib_dump(net, nb);
+ err = ops->fib_dump(net, nb, extack);
module_put(ops->owner);
if (err)
- return err;
+ goto unlock;
}
- return 0;
+unlock:
+ rcu_read_unlock();
+
+ return err;
}
-static bool fib_dump_is_consistent(struct notifier_block *nb,
+static bool fib_dump_is_consistent(struct net *net, struct notifier_block *nb,
void (*cb)(struct notifier_block *nb),
unsigned int fib_seq)
{
- atomic_notifier_chain_register(&fib_chain, nb);
- if (fib_seq == fib_seq_sum())
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+
+ atomic_notifier_chain_register(&fn_net->fib_chain, nb);
+ if (fib_seq == fib_seq_sum(net))
return true;
- atomic_notifier_chain_unregister(&fib_chain, nb);
+ atomic_notifier_chain_unregister(&fn_net->fib_chain, nb);
if (cb)
cb(nb);
return false;
}
#define FIB_DUMP_MAX_RETRIES 5
-int register_fib_notifier(struct notifier_block *nb,
- void (*cb)(struct notifier_block *nb))
+int register_fib_notifier(struct net *net, struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb),
+ struct netlink_ext_ack *extack)
{
int retries = 0;
int err;
do {
- unsigned int fib_seq = fib_seq_sum();
- struct net *net;
-
- rcu_read_lock();
- for_each_net_rcu(net) {
- err = fib_net_dump(net, nb);
- if (err)
- goto err_fib_net_dump;
- }
- rcu_read_unlock();
-
- if (fib_dump_is_consistent(nb, cb, fib_seq))
+ unsigned int fib_seq = fib_seq_sum(net);
+
+ err = fib_net_dump(net, nb, extack);
+ if (err)
+ return err;
+
+ if (fib_dump_is_consistent(net, nb, cb, fib_seq))
return 0;
} while (++retries < FIB_DUMP_MAX_RETRIES);
return -EBUSY;
-
-err_fib_net_dump:
- rcu_read_unlock();
- return err;
}
EXPORT_SYMBOL(register_fib_notifier);
-int unregister_fib_notifier(struct notifier_block *nb)
+int unregister_fib_notifier(struct net *net, struct notifier_block *nb)
{
- return atomic_notifier_chain_unregister(&fib_chain, nb);
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+
+ return atomic_notifier_chain_unregister(&fn_net->fib_chain, nb);
}
EXPORT_SYMBOL(unregister_fib_notifier);
static int __fib_notifier_ops_register(struct fib_notifier_ops *ops,
struct net *net)
{
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
struct fib_notifier_ops *o;
- list_for_each_entry(o, &net->fib_notifier_ops, list)
+ list_for_each_entry(o, &fn_net->fib_notifier_ops, list)
if (ops->family == o->family)
return -EEXIST;
- list_add_tail_rcu(&ops->list, &net->fib_notifier_ops);
+ list_add_tail_rcu(&ops->list, &fn_net->fib_notifier_ops);
return 0;
}
@@ -167,18 +168,25 @@ EXPORT_SYMBOL(fib_notifier_ops_unregister);
static int __net_init fib_notifier_net_init(struct net *net)
{
- INIT_LIST_HEAD(&net->fib_notifier_ops);
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+
+ INIT_LIST_HEAD(&fn_net->fib_notifier_ops);
+ ATOMIC_INIT_NOTIFIER_HEAD(&fn_net->fib_chain);
return 0;
}
static void __net_exit fib_notifier_net_exit(struct net *net)
{
- WARN_ON_ONCE(!list_empty(&net->fib_notifier_ops));
+ struct fib_notifier_net *fn_net = net_generic(net, fib_notifier_net_id);
+
+ WARN_ON_ONCE(!list_empty(&fn_net->fib_notifier_ops));
}
static struct pernet_operations fib_notifier_net_ops = {
.init = fib_notifier_net_init,
.exit = fib_notifier_net_exit,
+ .id = &fib_notifier_net_id,
+ .size = sizeof(struct fib_notifier_net),
};
static int __init fib_notifier_init(void)
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 0ff3953f64aa..8ca634964e36 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/core/fib_rules.c Generic Routing Rules
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*/
@@ -14,9 +11,24 @@
#include <linux/list.h>
#include <linux/module.h>
#include <net/net_namespace.h>
+#include <net/inet_dscp.h>
#include <net/sock.h>
#include <net/fib_rules.h>
#include <net/ip_tunnels.h>
+#include <linux/indirect_call_wrapper.h>
+
+#if defined(CONFIG_IPV6) && defined(CONFIG_IPV6_MULTIPLE_TABLES)
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+#define INDIRECT_CALL_MT(f, f2, f1, ...) \
+ INDIRECT_CALL_INET(f, f2, f1, __VA_ARGS__)
+#else
+#define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__)
+#endif
+#elif defined(CONFIG_IP_MULTIPLE_TABLES)
+#define INDIRECT_CALL_MT(f, f2, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__)
+#else
+#define INDIRECT_CALL_MT(f, f2, f1, ...) f(__VA_ARGS__)
+#endif
static const struct fib_kuid_range fib_kuid_range_unset = {
KUIDT_INIT(0),
@@ -25,8 +37,8 @@ static const struct fib_kuid_range fib_kuid_range_unset = {
bool fib_rule_matchall(const struct fib_rule *rule)
{
- if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id ||
- rule->flags)
+ if (READ_ONCE(rule->iifindex) || READ_ONCE(rule->oifindex) ||
+ rule->mark || rule->tun_id || rule->flags)
return false;
if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1)
return false;
@@ -42,11 +54,11 @@ bool fib_rule_matchall(const struct fib_rule *rule)
EXPORT_SYMBOL_GPL(fib_rule_matchall);
int fib_default_rule_add(struct fib_rules_ops *ops,
- u32 pref, u32 table, u32 flags)
+ u32 pref, u32 table)
{
struct fib_rule *r;
- r = kzalloc(ops->rule_size, GFP_KERNEL);
+ r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT);
if (r == NULL)
return -ENOMEM;
@@ -54,7 +66,6 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->action = FR_ACT_TO_TBL;
r->pref = pref;
r->table = table;
- r->flags = flags;
r->proto = RTPROT_KERNEL;
r->fr_net = ops->fro_net;
r->uid_range = fib_kuid_range_unset;
@@ -62,7 +73,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
r->suppress_prefixlen = -1;
r->suppress_ifgroup = -1;
- /* The lock is not required here, the list in unreacheable
+ /* The lock is not required here, the list is unreachable
* at the moment this function is called */
list_add_tail(&r->list, &ops->rules_list);
return 0;
@@ -90,7 +101,8 @@ static void notify_rule_change(int event, struct fib_rule *rule,
struct fib_rules_ops *ops, struct nlmsghdr *nlh,
u32 pid);
-static struct fib_rules_ops *lookup_rules_ops(struct net *net, int family)
+static struct fib_rules_ops *lookup_rules_ops(const struct net *net,
+ int family)
{
struct fib_rules_ops *ops;
@@ -245,16 +257,36 @@ static int nla_put_port_range(struct sk_buff *skb, int attrtype,
return nla_put(skb, attrtype, sizeof(*range), range);
}
+static bool fib_rule_iif_match(const struct fib_rule *rule, int iifindex,
+ const struct flowi *fl)
+{
+ u8 iif_is_l3_master = READ_ONCE(rule->iif_is_l3_master);
+
+ return iif_is_l3_master ? l3mdev_fib_rule_iif_match(fl, iifindex) :
+ fl->flowi_iif == iifindex;
+}
+
+static bool fib_rule_oif_match(const struct fib_rule *rule, int oifindex,
+ const struct flowi *fl)
+{
+ u8 oif_is_l3_master = READ_ONCE(rule->oif_is_l3_master);
+
+ return oif_is_l3_master ? l3mdev_fib_rule_oif_match(fl, oifindex) :
+ fl->flowi_oif == oifindex;
+}
+
static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
struct flowi *fl, int flags,
struct fib_lookup_arg *arg)
{
- int ret = 0;
+ int iifindex, oifindex, ret = 0;
- if (rule->iifindex && (rule->iifindex != fl->flowi_iif))
+ iifindex = READ_ONCE(rule->iifindex);
+ if (iifindex && !fib_rule_iif_match(rule, iifindex, fl))
goto out;
- if (rule->oifindex && (rule->oifindex != fl->flowi_oif))
+ oifindex = READ_ONCE(rule->oifindex);
+ if (oifindex && !fib_rule_oif_match(rule, oifindex, fl))
goto out;
if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask)
@@ -270,7 +302,10 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
uid_gt(fl->flowi_uid, rule->uid_range.end))
goto out;
- ret = ops->match(rule, fl, flags);
+ ret = INDIRECT_CALL_MT(ops->match,
+ fib6_rule_match,
+ fib4_rule_match,
+ rule, fl, flags);
out:
return (rule->flags & FIB_RULE_INVERT) ? !ret : ret;
}
@@ -301,9 +336,15 @@ jumped:
} else if (rule->action == FR_ACT_NOP)
continue;
else
- err = ops->action(rule, fl, flags, arg);
-
- if (!err && ops->suppress && ops->suppress(rule, arg))
+ err = INDIRECT_CALL_MT(ops->action,
+ fib6_rule_action,
+ fib4_rule_action,
+ rule, fl, flags, arg);
+
+ if (!err && ops->suppress && INDIRECT_CALL_MT(ops->suppress,
+ fib6_rule_suppress,
+ fib4_rule_suppress,
+ rule, flags, arg))
continue;
if (err != -EAGAIN) {
@@ -324,16 +365,18 @@ out:
}
EXPORT_SYMBOL_GPL(fib_rules_lookup);
-static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net,
+static int call_fib_rule_notifier(struct notifier_block *nb,
enum fib_event_type event_type,
- struct fib_rule *rule, int family)
+ struct fib_rule *rule, int family,
+ struct netlink_ext_ack *extack)
{
struct fib_rule_notifier_info info = {
.info.family = family,
+ .info.extack = extack,
.rule = rule,
};
- return call_fib_notifier(nb, net, event_type, &info.info);
+ return call_fib_notifier(nb, event_type, &info.info);
}
static int call_fib_rule_notifiers(struct net *net,
@@ -348,39 +391,46 @@ static int call_fib_rule_notifiers(struct net *net,
.rule = rule,
};
- ops->fib_rules_seq++;
+ ASSERT_RTNL_NET(net);
+
+ /* Paired with READ_ONCE() in fib_rules_seq() */
+ WRITE_ONCE(ops->fib_rules_seq, ops->fib_rules_seq + 1);
return call_fib_notifiers(net, event_type, &info.info);
}
/* Called with rcu_read_lock() */
-int fib_rules_dump(struct net *net, struct notifier_block *nb, int family)
+int fib_rules_dump(struct net *net, struct notifier_block *nb, int family,
+ struct netlink_ext_ack *extack)
{
struct fib_rules_ops *ops;
struct fib_rule *rule;
+ int err = 0;
ops = lookup_rules_ops(net, family);
if (!ops)
return -EAFNOSUPPORT;
- list_for_each_entry_rcu(rule, &ops->rules_list, list)
- call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule,
- family);
+ list_for_each_entry_rcu(rule, &ops->rules_list, list) {
+ err = call_fib_rule_notifier(nb, FIB_EVENT_RULE_ADD,
+ rule, family, extack);
+ if (err)
+ break;
+ }
rules_ops_put(ops);
- return 0;
+ return err;
}
EXPORT_SYMBOL_GPL(fib_rules_dump);
-unsigned int fib_rules_seq_read(struct net *net, int family)
+unsigned int fib_rules_seq_read(const struct net *net, int family)
{
unsigned int fib_rules_seq;
struct fib_rules_ops *ops;
- ASSERT_RTNL();
-
ops = lookup_rules_ops(net, family);
if (!ops)
return 0;
- fib_rules_seq = ops->fib_rules_seq;
+ /* Paired with WRITE_ONCE() in call_fib_rule_notifiers() */
+ fib_rules_seq = READ_ONCE(ops->fib_rules_seq);
rules_ops_put(ops);
return fib_rules_seq;
@@ -430,9 +480,6 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops,
if (rule->tun_id && r->tun_id != rule->tun_id)
continue;
- if (r->fr_net != rule->fr_net)
- continue;
-
if (rule->l3mdev && r->l3mdev != rule->l3mdev)
continue;
@@ -452,11 +499,17 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops,
&rule->sport_range))
continue;
+ if (rule->sport_mask && r->sport_mask != rule->sport_mask)
+ continue;
+
if (fib_rule_port_range_set(&rule->dport_range) &&
!fib_rule_port_range_compare(&r->dport_range,
&rule->dport_range))
continue;
+ if (rule->dport_mask && r->dport_mask != rule->dport_mask)
+ continue;
+
if (!ops->compare(r, frh, tb))
continue;
return r;
@@ -486,14 +539,40 @@ static int fib_nl2rule_l3mdev(struct nlattr *nla, struct fib_rule *nlrule,
}
#endif
-static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
+static int fib_nl2rule_port_mask(const struct nlattr *mask_attr,
+ const struct fib_rule_port_range *range,
+ u16 *port_mask,
+ struct netlink_ext_ack *extack)
+{
+ if (!fib_rule_port_range_valid(range)) {
+ NL_SET_ERR_MSG_ATTR(extack, mask_attr,
+ "Cannot specify port mask without port value");
+ return -EINVAL;
+ }
+
+ if (fib_rule_port_is_range(range)) {
+ NL_SET_ERR_MSG_ATTR(extack, mask_attr,
+ "Cannot specify port mask for port range");
+ return -EINVAL;
+ }
+
+ if (range->start & ~nla_get_u16(mask_attr)) {
+ NL_SET_ERR_MSG_ATTR(extack, mask_attr, "Invalid port mask");
+ return -EINVAL;
+ }
+
+ *port_mask = nla_get_u16(mask_attr);
+
+ return 0;
+}
+
+static int fib_nl2rule(struct net *net, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack,
struct fib_rules_ops *ops,
struct nlattr *tb[],
struct fib_rule **rule,
bool *user_priority)
{
- struct net *net = sock_net(skb->sk);
struct fib_rule_hdr *frh = nlmsg_data(nlh);
struct fib_rule *nlrule = NULL;
int err = -EINVAL;
@@ -514,7 +593,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- nlrule = kzalloc(ops->rule_size, GFP_KERNEL);
+ nlrule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT);
if (!nlrule) {
err = -ENOMEM;
goto errout;
@@ -525,31 +604,18 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[FRA_PRIORITY]) {
nlrule->pref = nla_get_u32(tb[FRA_PRIORITY]);
*user_priority = true;
- } else {
- nlrule->pref = fib_default_rule_pref(ops);
}
- nlrule->proto = tb[FRA_PROTOCOL] ?
- nla_get_u8(tb[FRA_PROTOCOL]) : RTPROT_UNSPEC;
+ nlrule->proto = nla_get_u8_default(tb[FRA_PROTOCOL], RTPROT_UNSPEC);
if (tb[FRA_IIFNAME]) {
- struct net_device *dev;
-
nlrule->iifindex = -1;
- nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
- dev = __dev_get_by_name(net, nlrule->iifname);
- if (dev)
- nlrule->iifindex = dev->ifindex;
+ nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ);
}
if (tb[FRA_OIFNAME]) {
- struct net_device *dev;
-
nlrule->oifindex = -1;
- nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
- dev = __dev_get_by_name(net, nlrule->oifname);
- if (dev)
- nlrule->oifindex = dev->ifindex;
+ nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ);
}
if (tb[FRA_FWMARK]) {
@@ -567,7 +633,6 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[FRA_TUN_ID])
nlrule->tun_id = nla_get_be64(tb[FRA_TUN_ID]);
- err = -EINVAL;
if (tb[FRA_L3MDEV] &&
fib_nl2rule_l3mdev(tb[FRA_L3MDEV], nlrule, extack) < 0)
goto errout_free;
@@ -592,11 +657,6 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
}
nlrule->target = nla_get_u32(tb[FRA_GOTO]);
- /* Backward jumps are prohibited to avoid endless loops */
- if (nlrule->target <= nlrule->pref) {
- NL_SET_ERR_MSG(extack, "Backward goto not supported");
- goto errout_free;
- }
} else if (nlrule->action == FR_ACT_GOTO) {
NL_SET_ERR_MSG(extack, "Missing goto target for action goto");
goto errout_free;
@@ -635,6 +695,16 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
NL_SET_ERR_MSG(extack, "Invalid sport range");
goto errout_free;
}
+ if (!fib_rule_port_is_range(&nlrule->sport_range))
+ nlrule->sport_mask = U16_MAX;
+ }
+
+ if (tb[FRA_SPORT_MASK]) {
+ err = fib_nl2rule_port_mask(tb[FRA_SPORT_MASK],
+ &nlrule->sport_range,
+ &nlrule->sport_mask, extack);
+ if (err)
+ goto errout_free;
}
if (tb[FRA_DPORT_RANGE]) {
@@ -644,6 +714,16 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh,
NL_SET_ERR_MSG(extack, "Invalid dport range");
goto errout_free;
}
+ if (!fib_rule_port_is_range(&nlrule->dport_range))
+ nlrule->dport_mask = U16_MAX;
+ }
+
+ if (tb[FRA_DPORT_MASK]) {
+ err = fib_nl2rule_port_mask(tb[FRA_DPORT_MASK],
+ &nlrule->dport_range,
+ &nlrule->dport_mask, extack);
+ if (err)
+ goto errout_free;
}
*rule = nlrule;
@@ -656,6 +736,43 @@ errout:
return err;
}
+static int fib_nl2rule_rtnl(struct fib_rule *nlrule,
+ struct fib_rules_ops *ops,
+ struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ if (!tb[FRA_PRIORITY])
+ nlrule->pref = fib_default_rule_pref(ops);
+
+ /* Backward jumps are prohibited to avoid endless loops */
+ if (tb[FRA_GOTO] && nlrule->target <= nlrule->pref) {
+ NL_SET_ERR_MSG(extack, "Backward goto not supported");
+ return -EINVAL;
+ }
+
+ if (tb[FRA_IIFNAME]) {
+ struct net_device *dev;
+
+ dev = __dev_get_by_name(nlrule->fr_net, nlrule->iifname);
+ if (dev) {
+ nlrule->iifindex = dev->ifindex;
+ nlrule->iif_is_l3_master = netif_is_l3_master(dev);
+ }
+ }
+
+ if (tb[FRA_OIFNAME]) {
+ struct net_device *dev;
+
+ dev = __dev_get_by_name(nlrule->fr_net, nlrule->oifname);
+ if (dev) {
+ nlrule->oifindex = dev->ifindex;
+ nlrule->oif_is_l3_master = netif_is_l3_master(dev);
+ }
+ }
+
+ return 0;
+}
+
static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
struct nlattr **tb, struct fib_rule *rule)
{
@@ -692,9 +809,6 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
if (r->tun_id != rule->tun_id)
continue;
- if (r->fr_net != rule->fr_net)
- continue;
-
if (r->l3mdev != rule->l3mdev)
continue;
@@ -712,10 +826,16 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
&rule->sport_range))
continue;
+ if (r->sport_mask != rule->sport_mask)
+ continue;
+
if (!fib_rule_port_range_compare(&r->dport_range,
&rule->dport_range))
continue;
+ if (r->dport_mask != rule->dport_mask)
+ continue;
+
if (!ops->compare(r, frh, tb))
continue;
return 1;
@@ -723,18 +843,45 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh,
return 0;
}
-int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
+static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = {
+ [FRA_UNSPEC] = { .strict_start_type = FRA_DPORT_RANGE + 1 },
+ [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
+ [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
+ [FRA_PRIORITY] = { .type = NLA_U32 },
+ [FRA_FWMARK] = { .type = NLA_U32 },
+ [FRA_FLOW] = { .type = NLA_U32 },
+ [FRA_TUN_ID] = { .type = NLA_U64 },
+ [FRA_FWMASK] = { .type = NLA_U32 },
+ [FRA_TABLE] = { .type = NLA_U32 },
+ [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 },
+ [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 },
+ [FRA_GOTO] = { .type = NLA_U32 },
+ [FRA_L3MDEV] = { .type = NLA_U8 },
+ [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) },
+ [FRA_PROTOCOL] = { .type = NLA_U8 },
+ [FRA_IP_PROTO] = { .type = NLA_U8 },
+ [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
+ [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
+ [FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2),
+ [FRA_FLOWLABEL] = { .type = NLA_BE32 },
+ [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 },
+ [FRA_SPORT_MASK] = { .type = NLA_U16 },
+ [FRA_DPORT_MASK] = { .type = NLA_U16 },
+ [FRA_DSCP_MASK] = NLA_POLICY_MASK(NLA_U8, INET_DSCP_MASK >> 2),
+};
+
+int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack, bool rtnl_held)
{
- struct net *net = sock_net(skb->sk);
- struct fib_rule_hdr *frh = nlmsg_data(nlh);
- struct fib_rules_ops *ops = NULL;
struct fib_rule *rule = NULL, *r, *last = NULL;
- struct nlattr *tb[FRA_MAX + 1];
int err = -EINVAL, unresolved = 0;
+ struct fib_rules_ops *ops = NULL;
+ struct nlattr *tb[FRA_MAX + 1];
bool user_priority = false;
+ struct fib_rule_hdr *frh;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ frh = nlmsg_payload(nlh, sizeof(*frh));
+ if (!frh) {
NL_SET_ERR_MSG(extack, "Invalid msg length");
goto errout;
}
@@ -746,16 +893,24 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX,
+ fib_rule_policy, extack);
if (err < 0) {
NL_SET_ERR_MSG(extack, "Error parsing msg");
goto errout;
}
- err = fib_nl2rule(skb, nlh, extack, ops, tb, &rule, &user_priority);
+ err = fib_nl2rule(net, nlh, extack, ops, tb, &rule, &user_priority);
if (err)
goto errout;
+ if (!rtnl_held)
+ rtnl_net_lock(net);
+
+ err = fib_nl2rule_rtnl(rule, ops, tb, extack);
+ if (err)
+ goto errout_free;
+
if ((nlh->nlmsg_flags & NLM_F_EXCL) &&
rule_exists(ops, frh, tb, rule)) {
err = -EEXIST;
@@ -817,31 +972,45 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
if (rule->tun_id)
ip_tunnel_need_metadata();
+ fib_rule_get(rule);
+
+ if (!rtnl_held)
+ rtnl_net_unlock(net);
+
notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
+ fib_rule_put(rule);
flush_route_cache(ops);
rules_ops_put(ops);
return 0;
errout_free:
+ if (!rtnl_held)
+ rtnl_net_unlock(net);
kfree(rule);
errout:
rules_ops_put(ops);
return err;
}
-EXPORT_SYMBOL_GPL(fib_nl_newrule);
+EXPORT_SYMBOL_GPL(fib_newrule);
-int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
+static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
- struct net *net = sock_net(skb->sk);
- struct fib_rule_hdr *frh = nlmsg_data(nlh);
+ return fib_newrule(sock_net(skb->sk), skb, nlh, extack, false);
+}
+
+int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack, bool rtnl_held)
+{
+ struct fib_rule *rule = NULL, *nlrule = NULL;
struct fib_rules_ops *ops = NULL;
- struct fib_rule *rule = NULL, *r, *nlrule = NULL;
struct nlattr *tb[FRA_MAX+1];
- int err = -EINVAL;
bool user_priority = false;
+ struct fib_rule_hdr *frh;
+ int err = -EINVAL;
- if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh))) {
+ frh = nlmsg_payload(nlh, sizeof(*frh));
+ if (!frh) {
NL_SET_ERR_MSG(extack, "Invalid msg length");
goto errout;
}
@@ -853,31 +1022,39 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
- err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*frh), tb, FRA_MAX,
+ fib_rule_policy, extack);
if (err < 0) {
NL_SET_ERR_MSG(extack, "Error parsing msg");
goto errout;
}
- err = fib_nl2rule(skb, nlh, extack, ops, tb, &nlrule, &user_priority);
+ err = fib_nl2rule(net, nlh, extack, ops, tb, &nlrule, &user_priority);
if (err)
goto errout;
+ if (!rtnl_held)
+ rtnl_net_lock(net);
+
+ err = fib_nl2rule_rtnl(nlrule, ops, tb, extack);
+ if (err)
+ goto errout_free;
+
rule = rule_find(ops, frh, tb, nlrule, user_priority);
if (!rule) {
err = -ENOENT;
- goto errout;
+ goto errout_free;
}
if (rule->flags & FIB_RULE_PERMANENT) {
err = -EPERM;
- goto errout;
+ goto errout_free;
}
if (ops->delete) {
err = ops->delete(rule);
if (err)
- goto errout;
+ goto errout_free;
}
if (rule->tun_id)
@@ -899,7 +1076,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
* current if it is goto rule, have actually been added.
*/
if (ops->nr_goto_rules > 0) {
- struct fib_rule *n;
+ struct fib_rule *n, *r;
n = list_next_entry(rule, list);
if (&n->list == &ops->rules_list || n->pref != rule->pref)
@@ -913,22 +1090,33 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
}
}
- call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops,
- NULL);
- notify_rule_change(RTM_DELRULE, rule, ops, nlh,
- NETLINK_CB(skb).portid);
+ call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL);
+
+ if (!rtnl_held)
+ rtnl_net_unlock(net);
+
+ notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
fib_rule_put(rule);
flush_route_cache(ops);
rules_ops_put(ops);
kfree(nlrule);
return 0;
-errout:
+errout_free:
+ if (!rtnl_held)
+ rtnl_net_unlock(net);
kfree(nlrule);
+errout:
rules_ops_put(ops);
return err;
}
-EXPORT_SYMBOL_GPL(fib_nl_delrule);
+EXPORT_SYMBOL_GPL(fib_delrule);
+
+static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ return fib_delrule(sock_net(skb->sk), skb, nlh, extack, false);
+}
static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
struct fib_rule *rule)
@@ -947,7 +1135,9 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops,
+ nla_total_size(1) /* FRA_PROTOCOL */
+ nla_total_size(1) /* FRA_IP_PROTO */
+ nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_SPORT_RANGE */
- + nla_total_size(sizeof(struct fib_rule_port_range)); /* FRA_DPORT_RANGE */
+ + nla_total_size(sizeof(struct fib_rule_port_range)) /* FRA_DPORT_RANGE */
+ + nla_total_size(2) /* FRA_SPORT_MASK */
+ + nla_total_size(2); /* FRA_DPORT_MASK */
if (ops->nlmsg_payload)
payload += ops->nlmsg_payload(rule);
@@ -968,7 +1158,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
frh = nlmsg_data(nlh);
frh->family = ops->family;
- frh->table = rule->table;
+ frh->table = rule->table < 256 ? rule->table : RT_TABLE_COMPAT;
if (nla_put_u32(skb, FRA_TABLE, rule->table))
goto nla_put_failure;
if (nla_put_u32(skb, FRA_SUPPRESS_PREFIXLEN, rule->suppress_prefixlen))
@@ -988,14 +1178,14 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
if (rule->iifname[0]) {
if (nla_put_string(skb, FRA_IIFNAME, rule->iifname))
goto nla_put_failure;
- if (rule->iifindex == -1)
+ if (READ_ONCE(rule->iifindex) == -1)
frh->flags |= FIB_RULE_IIF_DETACHED;
}
if (rule->oifname[0]) {
if (nla_put_string(skb, FRA_OIFNAME, rule->oifname))
goto nla_put_failure;
- if (rule->oifindex == -1)
+ if (READ_ONCE(rule->oifindex) == -1)
frh->flags |= FIB_RULE_OIF_DETACHED;
}
@@ -1015,8 +1205,12 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
nla_put_uid_range(skb, &rule->uid_range)) ||
(fib_rule_port_range_set(&rule->sport_range) &&
nla_put_port_range(skb, FRA_SPORT_RANGE, &rule->sport_range)) ||
+ (rule->sport_mask && nla_put_u16(skb, FRA_SPORT_MASK,
+ rule->sport_mask)) ||
(fib_rule_port_range_set(&rule->dport_range) &&
nla_put_port_range(skb, FRA_DPORT_RANGE, &rule->dport_range)) ||
+ (rule->dport_mask && nla_put_u16(skb, FRA_DPORT_MASK,
+ rule->dport_mask)) ||
(rule->ip_proto && nla_put_u8(skb, FRA_IP_PROTO, rule->ip_proto)))
goto nla_put_failure;
@@ -1063,30 +1257,64 @@ skip:
return err;
}
+static int fib_valid_dumprule_req(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_rule_hdr *frh;
+
+ frh = nlmsg_payload(nlh, sizeof(*frh));
+ if (!frh) {
+ NL_SET_ERR_MSG(extack, "Invalid header for fib rule dump request");
+ return -EINVAL;
+ }
+
+ if (frh->dst_len || frh->src_len || frh->tos || frh->table ||
+ frh->res1 || frh->res2 || frh->action || frh->flags) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid values in header for fib rule dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*frh))) {
+ NL_SET_ERR_MSG(extack, "Invalid data after header in fib rule dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
struct fib_rules_ops *ops;
- int idx = 0, family;
+ int err, idx = 0, family;
+
+ if (cb->strict_check) {
+ err = fib_valid_dumprule_req(nlh, cb->extack);
- family = rtnl_msg_family(cb->nlh);
+ if (err < 0)
+ return err;
+ }
+
+ family = rtnl_msg_family(nlh);
if (family != AF_UNSPEC) {
/* Protocol specific dump request */
ops = lookup_rules_ops(net, family);
if (ops == NULL)
return -EAFNOSUPPORT;
- dump_rules(skb, cb, ops);
-
- return skb->len;
+ return dump_rules(skb, cb, ops);
}
+ err = 0;
rcu_read_lock();
list_for_each_entry_rcu(ops, &net->rules_ops, list) {
if (idx < cb->args[0] || !try_module_get(ops->owner))
goto skip;
- if (dump_rules(skb, cb, ops) < 0)
+ err = dump_rules(skb, cb, ops);
+ if (err < 0)
break;
cb->args[1] = 0;
@@ -1096,7 +1324,7 @@ skip:
rcu_read_unlock();
cb->args[0] = idx;
- return skb->len;
+ return err;
}
static void notify_rule_change(int event, struct fib_rule *rule,
@@ -1105,7 +1333,7 @@ static void notify_rule_change(int event, struct fib_rule *rule,
{
struct net *net;
struct sk_buff *skb;
- int err = -ENOBUFS;
+ int err = -ENOMEM;
net = ops->fro_net;
skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL);
@@ -1123,8 +1351,7 @@ static void notify_rule_change(int event, struct fib_rule *rule,
rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
return;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, ops->nlgroup, err);
+ rtnl_set_sk_err(net, ops->nlgroup, err);
}
static void attach_rules(struct list_head *rules, struct net_device *dev)
@@ -1133,11 +1360,17 @@ static void attach_rules(struct list_head *rules, struct net_device *dev)
list_for_each_entry(rule, rules, list) {
if (rule->iifindex == -1 &&
- strcmp(dev->name, rule->iifname) == 0)
- rule->iifindex = dev->ifindex;
+ strcmp(dev->name, rule->iifname) == 0) {
+ WRITE_ONCE(rule->iifindex, dev->ifindex);
+ WRITE_ONCE(rule->iif_is_l3_master,
+ netif_is_l3_master(dev));
+ }
if (rule->oifindex == -1 &&
- strcmp(dev->name, rule->oifname) == 0)
- rule->oifindex = dev->ifindex;
+ strcmp(dev->name, rule->oifname) == 0) {
+ WRITE_ONCE(rule->oifindex, dev->ifindex);
+ WRITE_ONCE(rule->oif_is_l3_master,
+ netif_is_l3_master(dev));
+ }
}
}
@@ -1146,10 +1379,14 @@ static void detach_rules(struct list_head *rules, struct net_device *dev)
struct fib_rule *rule;
list_for_each_entry(rule, rules, list) {
- if (rule->iifindex == dev->ifindex)
- rule->iifindex = -1;
- if (rule->oifindex == dev->ifindex)
- rule->oifindex = -1;
+ if (rule->iifindex == dev->ifindex) {
+ WRITE_ONCE(rule->iifindex, -1);
+ WRITE_ONCE(rule->iif_is_l3_master, false);
+ }
+ if (rule->oifindex == dev->ifindex) {
+ WRITE_ONCE(rule->oifindex, -1);
+ WRITE_ONCE(rule->oif_is_l3_master, false);
+ }
}
}
@@ -1206,12 +1443,20 @@ static struct pernet_operations fib_rules_net_ops = {
.exit = fib_rules_net_exit,
};
+static const struct rtnl_msg_handler fib_rules_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWRULE, .doit = fib_nl_newrule,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.msgtype = RTM_DELRULE, .doit = fib_nl_delrule,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.msgtype = RTM_GETRULE, .dumpit = fib_nl_dumprule,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED},
+};
+
static int __init fib_rules_init(void)
{
int err;
- rtnl_register(PF_UNSPEC, RTM_NEWRULE, fib_nl_newrule, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELRULE, fib_nl_delrule, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_GETRULE, NULL, fib_nl_dumprule, 0);
+
+ rtnl_register_many(fib_rules_rtnl_msg_handlers);
err = register_pernet_subsys(&fib_rules_net_ops);
if (err < 0)
@@ -1226,9 +1471,7 @@ static int __init fib_rules_init(void)
fail_unregister:
unregister_pernet_subsys(&fib_rules_net_ops);
fail:
- rtnl_unregister(PF_UNSPEC, RTM_NEWRULE);
- rtnl_unregister(PF_UNSPEC, RTM_DELRULE);
- rtnl_unregister(PF_UNSPEC, RTM_GETRULE);
+ rtnl_unregister_many(fib_rules_rtnl_msg_handlers);
return err;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 5e00f2b85a56..616e0520a0bb 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux Socket Filter - Kernel level socket filtering
*
@@ -12,15 +13,12 @@
* Alexei Starovoitov <ast@plumgrid.com>
* Daniel Borkmann <dborkman@redhat.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Andi Kleen - Fix a few bad bugs and races.
* Kris Katterjohn - Added many additional checks in bpf_check_classic()
*/
+#include <linux/atomic.h>
+#include <linux/bpf_verifier.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
@@ -38,18 +36,19 @@
#include <net/protocol.h>
#include <net/netlink.h>
#include <linux/skbuff.h>
+#include <linux/skmsg.h>
#include <net/sock.h>
#include <net/flow_dissector.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/uaccess.h>
-#include <asm/unaligned.h>
-#include <asm/cmpxchg.h>
+#include <linux/unaligned.h>
#include <linux/filter.h>
#include <linux/ratelimit.h>
#include <linux/seccomp.h>
#include <linux/if_vlan.h>
#include <linux/bpf.h>
+#include <linux/btf.h>
#include <net/sch_generic.h>
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
@@ -58,31 +57,82 @@
#include <net/busy_poll.h>
#include <net/tcp.h>
#include <net/xfrm.h>
+#include <net/udp.h>
#include <linux/bpf_trace.h>
#include <net/xdp_sock.h>
#include <linux/inetdevice.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
#include <net/flow.h>
#include <net/arp.h>
#include <net/ipv6.h>
+#include <net/net_namespace.h>
#include <linux/seg6_local.h>
#include <net/seg6.h>
#include <net/seg6_local.h>
+#include <net/lwtunnel.h>
+#include <net/ipv6_stubs.h>
+#include <net/bpf_sk_storage.h>
+#include <net/transp_v6.h>
+#include <linux/btf_ids.h>
+#include <net/tls.h>
+#include <net/xdp.h>
+#include <net/mptcp.h>
+#include <net/netfilter/nf_conntrack_bpf.h>
+#include <net/netkit.h>
+#include <linux/un.h>
+#include <net/xdp_sock_drv.h>
+#include <net/inet_dscp.h>
+
+#include "dev.h"
+
+/* Keep the struct bpf_fib_lookup small so that it fits into a cacheline */
+static_assert(sizeof(struct bpf_fib_lookup) == 64, "struct bpf_fib_lookup size check");
+
+static const struct bpf_func_proto *
+bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
+
+int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len)
+{
+ if (in_compat_syscall()) {
+ struct compat_sock_fprog f32;
+
+ if (len != sizeof(f32))
+ return -EINVAL;
+ if (copy_from_sockptr(&f32, src, sizeof(f32)))
+ return -EFAULT;
+ memset(dst, 0, sizeof(*dst));
+ dst->len = f32.len;
+ dst->filter = compat_ptr(f32.filter);
+ } else {
+ if (len != sizeof(*dst))
+ return -EINVAL;
+ if (copy_from_sockptr(dst, src, sizeof(*dst)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user);
/**
* sk_filter_trim_cap - run a packet through a socket filter
* @sk: sock associated with &sk_buff
* @skb: buffer to filter
* @cap: limit on how short the eBPF program may trim the packet
+ * @reason: record drop reason on errors (negative return value)
*
* Run the eBPF program and then cut skb->data to correct size returned by
* the program. If pkt_len is 0 we toss packet. If skb->len is smaller
* than pkt_len we keep whole skb->data. This is the socket level
- * wrapper to BPF_PROG_RUN. It returns 0 if the packet should
+ * wrapper to bpf_prog_run. It returns 0 if the packet should
* be accepted or -EPERM if the packet should be tossed.
*
*/
-int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
+int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb,
+ unsigned int cap, enum skb_drop_reason *reason)
{
int err;
struct sk_filter *filter;
@@ -94,15 +144,20 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
*/
if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
+ *reason = SKB_DROP_REASON_PFMEMALLOC;
return -ENOMEM;
}
err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb);
- if (err)
+ if (err) {
+ *reason = SKB_DROP_REASON_SOCKET_FILTER;
return err;
+ }
err = security_sock_rcv_skb(sk, skb);
- if (err)
+ if (err) {
+ *reason = SKB_DROP_REASON_SECURITY_HOOK;
return err;
+ }
rcu_read_lock();
filter = rcu_dereference(sk->sk_filter);
@@ -114,6 +169,8 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
pkt_len = bpf_prog_run_save_cb(filter->prog, skb);
skb->sk = save_sk;
err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
+ if (err)
+ *reason = SKB_DROP_REASON_SOCKET_FILTER;
}
rcu_read_unlock();
@@ -160,7 +217,7 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
return 0;
nla = (struct nlattr *) &skb->data[a];
- if (nla->nla_len > skb->len - a)
+ if (!nla_ok(nla, skb->len - a))
return 0;
nla = nla_find_nested(nla, x);
@@ -170,24 +227,36 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
return 0;
}
+static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset)
+{
+ if (likely(offset >= 0))
+ return offset;
+
+ if (offset >= SKF_NET_OFF)
+ return offset - SKF_NET_OFF + skb_network_offset(skb);
+
+ if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb))
+ return offset - SKF_LL_OFF + skb_mac_offset(skb);
+
+ return INT_MIN;
+}
+
BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
data, int, headlen, int, offset)
{
- u8 tmp, *ptr;
+ u8 tmp;
const int len = sizeof(tmp);
- if (offset >= 0) {
- if (headlen - offset >= len)
- return *(u8 *)(data + offset);
- if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
- return tmp;
- } else {
- ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
- if (likely(ptr))
- return *(u8 *)ptr;
- }
+ offset = bpf_skb_load_helper_convert_offset(skb, offset);
+ if (offset == INT_MIN)
+ return -EFAULT;
- return -EFAULT;
+ if (headlen - offset >= len)
+ return *(u8 *)(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return tmp;
+ else
+ return -EFAULT;
}
BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
@@ -200,21 +269,19 @@ BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
data, int, headlen, int, offset)
{
- u16 tmp, *ptr;
+ __be16 tmp;
const int len = sizeof(tmp);
- if (offset >= 0) {
- if (headlen - offset >= len)
- return get_unaligned_be16(data + offset);
- if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
- return be16_to_cpu(tmp);
- } else {
- ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
- if (likely(ptr))
- return get_unaligned_be16(ptr);
- }
+ offset = bpf_skb_load_helper_convert_offset(skb, offset);
+ if (offset == INT_MIN)
+ return -EFAULT;
- return -EFAULT;
+ if (headlen - offset >= len)
+ return get_unaligned_be16(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return be16_to_cpu(tmp);
+ else
+ return -EFAULT;
}
BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
@@ -227,21 +294,19 @@ BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
data, int, headlen, int, offset)
{
- u32 tmp, *ptr;
+ __be32 tmp;
const int len = sizeof(tmp);
- if (likely(offset >= 0)) {
- if (headlen - offset >= len)
- return get_unaligned_be32(data + offset);
- if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
- return be32_to_cpu(tmp);
- } else {
- ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
- if (likely(ptr))
- return get_unaligned_be32(ptr);
- }
+ offset = bpf_skb_load_helper_convert_offset(skb, offset);
+ if (offset == INT_MIN)
+ return -EFAULT;
- return -EFAULT;
+ if (headlen - offset >= len)
+ return get_unaligned_be32(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return be32_to_cpu(tmp);
+ else
+ return -EFAULT;
}
BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
@@ -251,17 +316,6 @@ BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
offset);
}
-BPF_CALL_0(bpf_get_raw_cpu_id)
-{
- return raw_smp_processor_id();
-}
-
-static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = {
- .func = bpf_get_raw_cpu_id,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
-};
-
static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
struct bpf_insn *insn_buf)
{
@@ -269,14 +323,14 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
switch (skb_field) {
case SKF_AD_MARK:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, mark) != 4);
*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
offsetof(struct sk_buff, mark));
break;
case SKF_AD_PKTTYPE:
- *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET());
+ *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET);
*insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
*insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5);
@@ -284,29 +338,25 @@ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg,
break;
case SKF_AD_QUEUE:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, queue_mapping) != 2);
*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
offsetof(struct sk_buff, queue_mapping));
break;
case SKF_AD_VLAN_TAG:
- case SKF_AD_VLAN_TAG_PRESENT:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2);
- BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_tci) != 2);
/* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */
*insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg,
offsetof(struct sk_buff, vlan_tci));
- if (skb_field == SKF_AD_VLAN_TAG) {
- *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg,
- ~VLAN_TAG_PRESENT);
- } else {
- /* dst_reg >>= 12 */
- *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12);
- /* dst_reg &= 1 */
- *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1);
- }
+ break;
+ case SKF_AD_VLAN_TAG_PRESENT:
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_all) != 4);
+ *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+ offsetof(struct sk_buff, vlan_all));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1);
+ *insn++ = BPF_ALU32_IMM(BPF_MOV, dst_reg, 1);
break;
}
@@ -321,7 +371,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
switch (fp->k) {
case SKF_AD_OFF + SKF_AD_PROTOCOL:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, protocol) != 2);
/* A = *(u16 *) (CTX + offsetof(protocol)) */
*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
@@ -337,8 +387,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
case SKF_AD_OFF + SKF_AD_IFINDEX:
case SKF_AD_OFF + SKF_AD_HATYPE:
- BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
- BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2);
+ BUILD_BUG_ON(sizeof_field(struct net_device, ifindex) != 4);
+ BUILD_BUG_ON(sizeof_field(struct net_device, type) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev),
BPF_REG_TMP, BPF_REG_CTX,
@@ -360,7 +410,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
break;
case SKF_AD_OFF + SKF_AD_RXHASH:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4);
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, hash) != 4);
*insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX,
offsetof(struct sk_buff, hash));
@@ -384,7 +434,7 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
break;
case SKF_AD_OFF + SKF_AD_VLAN_TPID:
- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, vlan_proto) != 2);
/* A = *(u16 *) (CTX + offsetof(vlan_proto)) */
*insn++ = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX,
@@ -462,7 +512,8 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
bool ldx_off_ok = offset <= S16_MAX;
*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
- *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
+ if (offset)
+ *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
*insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
size, 2 + endian + (!ldx_off_ok * 2));
if (ldx_off_ok) {
@@ -748,7 +799,7 @@ jmp_rest:
BPF_EMIT_JMP;
break;
- /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */
+ /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */
case BPF_LDX | BPF_MSH | BPF_B: {
struct sock_filter tmp = {
.code = BPF_LD | BPF_ABS | BPF_B,
@@ -774,7 +825,7 @@ jmp_rest:
*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
break;
}
- /* RET_K is remaped into 2 insns. RET_A case doesn't need an
+ /* RET_K is remapped into 2 insns. RET_A case doesn't need an
* extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
*/
case BPF_RET | BPF_A:
@@ -1190,11 +1241,12 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
*/
static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp)
{
+ int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
u32 filter_size = bpf_prog_size(fp->prog->len);
/* same check as in sock_kmalloc() */
- if (filter_size <= sysctl_optmem_max &&
- atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) {
+ if (filter_size <= optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + filter_size < optmem_max) {
atomic_add(filter_size, &sk->sk_omem_alloc);
return true;
}
@@ -1220,10 +1272,9 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
int err, new_len, old_len = fp->len;
bool seen_ld_abs = false;
- /* We are free to overwrite insns et al right here as it
- * won't be used at this point in time anymore internally
- * after the migration to the internal BPF instruction
- * representation.
+ /* We are free to overwrite insns et al right here as it won't be used at
+ * this point in time anymore internally after the migration to the eBPF
+ * instruction representation.
*/
BUILD_BUG_ON(sizeof(struct sock_filter) !=
sizeof(struct bpf_insn));
@@ -1232,8 +1283,8 @@ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp)
* so we need to keep the user BPF around until the 2nd
* pass. At this time, the user BPF is stored in fp->insns.
*/
- old_prog = kmemdup(fp->insns, old_len * sizeof(struct sock_filter),
- GFP_KERNEL | __GFP_NOWARN);
+ old_prog = kmemdup_array(fp->insns, old_len, sizeof(struct sock_filter),
+ GFP_KERNEL | __GFP_NOWARN);
if (!old_prog) {
err = -ENOMEM;
goto out_err;
@@ -1314,8 +1365,8 @@ static struct bpf_prog *bpf_prepare_filter(struct bpf_prog *fp,
*/
bpf_jit_compile(fp);
- /* JIT compiler couldn't process this filter, so do the
- * internal BPF translation for the optimized interpreter.
+ /* JIT compiler couldn't process this filter, so do the eBPF translation
+ * for the optimized interpreter.
*/
if (!fp->jited)
fp = bpf_migrate_filter(fp);
@@ -1521,12 +1572,13 @@ EXPORT_SYMBOL_GPL(sk_attach_filter);
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
struct bpf_prog *prog = __get_filter(fprog, sk);
- int err;
+ int err, optmem_max;
if (IS_ERR(prog))
return PTR_ERR(prog);
- if (bpf_prog_size(prog->len) > sysctl_optmem_max)
+ optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
+ if (bpf_prog_size(prog->len) > optmem_max)
err = -ENOMEM;
else
err = reuseport_attach_prog(sk, prog);
@@ -1565,13 +1617,13 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
struct bpf_prog *prog;
- int err;
+ int err, optmem_max;
if (sock_flag(sk, SOCK_FILTER_LOCKED))
return -EPERM;
prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER);
- if (IS_ERR(prog) && PTR_ERR(prog) == -EINVAL)
+ if (PTR_ERR(prog) == -EINVAL)
prog = bpf_prog_get_type(ufd, BPF_PROG_TYPE_SK_REUSEPORT);
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -1593,7 +1645,8 @@ int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
}
} else {
/* BPF_PROG_TYPE_SOCKET_FILTER */
- if (bpf_prog_size(prog->len) > sysctl_optmem_max) {
+ optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
+ if (bpf_prog_size(prog->len) > optmem_max) {
err = -ENOMEM;
goto err_prog_put;
}
@@ -1618,18 +1671,14 @@ void sk_reuseport_prog_free(struct bpf_prog *prog)
bpf_prog_destroy(prog);
}
-struct bpf_scratchpad {
- union {
- __be32 diff[MAX_BPF_STACK / sizeof(__be32)];
- u8 buff[MAX_BPF_STACK];
- };
-};
-
-static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
-
static inline int __bpf_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
{
+#ifdef CONFIG_DEBUG_NET
+ /* Avoid a splat in pskb_may_pull_reason() */
+ if (write_len > INT_MAX)
+ return -EINVAL;
+#endif
return skb_ensure_writable(skb, write_len);
}
@@ -1666,7 +1715,7 @@ BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset,
if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
return -EINVAL;
- if (unlikely(offset > 0xffff))
+ if (unlikely(offset > INT_MAX))
return -EFAULT;
if (unlikely(bpf_try_make_writable(skb, offset + len)))
return -EFAULT;
@@ -1691,17 +1740,23 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE,
.arg5_type = ARG_ANYTHING,
};
+int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from,
+ u32 len, u64 flags)
+{
+ return ____bpf_skb_store_bytes(skb, offset, from, len, flags);
+}
+
BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset,
void *, to, u32, len)
{
void *ptr;
- if (unlikely(offset > 0xffff))
+ if (unlikely(offset > INT_MAX))
goto err_clear;
ptr = skb_header_pointer(skb, offset, len, to);
@@ -1726,29 +1781,70 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
.arg4_type = ARG_CONST_SIZE,
};
+int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len)
+{
+ return ____bpf_skb_load_bytes(skb, offset, to, len);
+}
+
+BPF_CALL_4(bpf_flow_dissector_load_bytes,
+ const struct bpf_flow_dissector *, ctx, u32, offset,
+ void *, to, u32, len)
+{
+ void *ptr;
+
+ if (unlikely(offset > 0xffff))
+ goto err_clear;
+
+ if (unlikely(!ctx->skb))
+ goto err_clear;
+
+ ptr = skb_header_pointer(ctx->skb, offset, len, to);
+ if (unlikely(!ptr))
+ goto err_clear;
+ if (ptr != to)
+ memcpy(to, ptr, len);
+
+ return 0;
+err_clear:
+ memset(to, 0, len);
+ return -EFAULT;
+}
+
+static const struct bpf_func_proto bpf_flow_dissector_load_bytes_proto = {
+ .func = bpf_flow_dissector_load_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
u32, offset, void *, to, u32, len, u32, start_header)
{
u8 *end = skb_tail_pointer(skb);
- u8 *net = skb_network_header(skb);
- u8 *mac = skb_mac_header(skb);
- u8 *ptr;
+ u8 *start, *ptr;
- if (unlikely(offset > 0xffff || len > (end - mac)))
+ if (unlikely(offset > 0xffff))
goto err_clear;
switch (start_header) {
case BPF_HDR_START_MAC:
- ptr = mac + offset;
+ if (unlikely(!skb_mac_header_was_set(skb)))
+ goto err_clear;
+ start = skb_mac_header(skb);
break;
case BPF_HDR_START_NET:
- ptr = net + offset;
+ start = skb_network_header(skb);
break;
default:
goto err_clear;
}
- if (likely(ptr >= mac && ptr + len <= end)) {
+ ptr = start + offset;
+
+ if (likely(ptr + len <= end)) {
memcpy(to, ptr, len);
return 0;
}
@@ -1791,13 +1887,22 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = {
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
+{
+ return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL;
+}
+
+static const struct bpf_func_proto bpf_sk_fullsock_proto = {
+ .func = bpf_sk_fullsock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_SOCK_COMMON,
+};
+
static inline int sk_skb_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
{
- int err = __bpf_try_make_writable(skb, write_len);
-
- bpf_compute_data_end_sk_skb(skb);
- return err;
+ return __bpf_try_make_writable(skb, write_len);
}
BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
@@ -1872,10 +1977,11 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
bool do_mforce = flags & BPF_F_MARK_ENFORCE;
+ bool is_ipv6 = flags & BPF_F_IPV6;
__sum16 *ptr;
if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE |
- BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK)))
+ BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK | BPF_F_IPV6)))
return -EINVAL;
if (unlikely(offset > 0xffff || offset & 1))
return -EFAULT;
@@ -1891,7 +1997,7 @@ BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset,
if (unlikely(from != 0))
return -EINVAL;
- inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
+ inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo, is_ipv6);
break;
case 2:
inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
@@ -1922,10 +2028,6 @@ static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
__be32 *, to, u32, to_size, __wsum, seed)
{
- struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
- u32 diff_size = from_size + to_size;
- int i, j = 0;
-
/* This is quite flexible, some examples:
*
* from_size == 0, to_size > 0, seed := csum --> pushing data
@@ -1934,16 +2036,19 @@ BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size,
*
* Even for diffing, from_size and to_size don't need to be equal.
*/
- if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
- diff_size > sizeof(sp->diff)))
- return -EINVAL;
- for (i = 0; i < from_size / sizeof(__be32); i++, j++)
- sp->diff[j] = ~from[i];
- for (i = 0; i < to_size / sizeof(__be32); i++, j++)
- sp->diff[j] = to[i];
+ __wsum ret = seed;
- return csum_partial(sp->diff, diff_size, seed);
+ if (from_size && to_size)
+ ret = csum_sub(csum_partial(to, to_size, ret),
+ csum_partial(from, from_size, 0));
+ else if (to_size)
+ ret = csum_partial(to, to_size, ret);
+
+ else if (from_size)
+ ret = ~csum_partial(from, from_size, ~ret);
+
+ return csum_from32to16((__force unsigned int)ret);
}
static const struct bpf_func_proto bpf_csum_diff_proto = {
@@ -1951,9 +2056,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
.gpl_only = false,
.pkt_access = true,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg1_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
- .arg3_type = ARG_PTR_TO_MEM_OR_NULL,
+ .arg3_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE_OR_ZERO,
.arg5_type = ARG_ANYTHING,
};
@@ -1978,15 +2083,49 @@ static const struct bpf_func_proto bpf_csum_update_proto = {
.arg2_type = ARG_ANYTHING,
};
+BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level)
+{
+ /* The interface is to be used in combination with bpf_skb_adjust_room()
+ * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET
+ * is passed as flags, for example.
+ */
+ switch (level) {
+ case BPF_CSUM_LEVEL_INC:
+ __skb_incr_checksum_unnecessary(skb);
+ break;
+ case BPF_CSUM_LEVEL_DEC:
+ __skb_decr_checksum_unnecessary(skb);
+ break;
+ case BPF_CSUM_LEVEL_RESET:
+ __skb_reset_checksum_unnecessary(skb);
+ break;
+ case BPF_CSUM_LEVEL_QUERY:
+ return skb->ip_summed == CHECKSUM_UNNECESSARY ?
+ skb->csum_level : -EACCES;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_csum_level_proto = {
+ .func = bpf_csum_level,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb)
{
- return dev_forward_skb(dev, skb);
+ return dev_forward_skb_nomtu(dev, skb);
}
static inline int __bpf_rx_skb_no_mac(struct net_device *dev,
struct sk_buff *skb)
{
- int ret = ____dev_forward_skb(dev, skb);
+ int ret = ____dev_forward_skb(dev, skb, false);
if (likely(!ret)) {
skb->dev = dev;
@@ -2000,17 +2139,19 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
{
int ret;
- if (unlikely(__this_cpu_read(xmit_recursion) > XMIT_RECURSION_LIMIT)) {
+ if (dev_xmit_recursion()) {
net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
kfree_skb(skb);
return -ENETDOWN;
}
skb->dev = dev;
+ skb_set_redirected_noclear(skb, skb_at_tc_ingress(skb));
+ skb_clear_tstamp(skb);
- __this_cpu_inc(xmit_recursion);
+ dev_xmit_recursion_inc();
ret = dev_queue_xmit(skb);
- __this_cpu_dec(xmit_recursion);
+ dev_xmit_recursion_dec();
return ret;
}
@@ -2018,18 +2159,24 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb)
static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev,
u32 flags)
{
- /* skb->mac_len is not set on normal egress */
- unsigned int mlen = skb->network_header - skb->mac_header;
+ unsigned int mlen = skb_network_offset(skb);
- __skb_pull(skb, mlen);
+ if (unlikely(skb->len <= mlen)) {
+ kfree_skb(skb);
+ return -ERANGE;
+ }
- /* At ingress, the mac header has already been pulled once.
- * At egress, skb_pospull_rcsum has to be done in case that
- * the skb is originated from ingress (i.e. a forwarded skb)
- * to ensure that rcsum starts at net header.
- */
- if (!skb_at_tc_ingress(skb))
- skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
+ if (mlen) {
+ __skb_pull(skb, mlen);
+
+ /* At ingress, the mac header has already been pulled once.
+ * At egress, skb_pospull_rcsum has to be done in case that
+ * the skb is originated from ingress (i.e. a forwarded skb)
+ * to ensure that rcsum starts at net header.
+ */
+ if (!skb_at_tc_ingress(skb))
+ skb_postpull_rcsum(skb, skb_mac_header(skb), mlen);
+ }
skb_pop_mac_header(skb);
skb_reset_mac_len(skb);
return flags & BPF_F_INGRESS ?
@@ -2040,7 +2187,7 @@ static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev,
u32 flags)
{
/* Verify that a link layer header is carried */
- if (unlikely(skb->mac_header >= skb->network_header)) {
+ if (unlikely(skb->mac_header >= skb->network_header || skb->len == 0)) {
kfree_skb(skb);
return -ERANGE;
}
@@ -2059,15 +2206,265 @@ static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev,
return __bpf_redirect_no_mac(skb, dev, flags);
}
+#if IS_ENABLED(CONFIG_IPV6)
+static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
+ struct net_device *dev, struct bpf_nh_params *nh)
+{
+ u32 hh_len = LL_RESERVED_SPACE(dev);
+ const struct in6_addr *nexthop;
+ struct dst_entry *dst = NULL;
+ struct neighbour *neigh;
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
+ goto out_drop;
+ }
+
+ skb->dev = dev;
+ skb_clear_tstamp(skb);
+
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb)
+ return -ENOMEM;
+ }
+
+ rcu_read_lock();
+ if (!nh) {
+ dst = skb_dst(skb);
+ nexthop = rt6_nexthop(dst_rt6_info(dst),
+ &ipv6_hdr(skb)->daddr);
+ } else {
+ nexthop = &nh->ipv6_nh;
+ }
+ neigh = ip_neigh_gw6(dev, nexthop);
+ if (likely(!IS_ERR(neigh))) {
+ int ret;
+
+ sock_confirm_neigh(skb, neigh);
+ local_bh_disable();
+ dev_xmit_recursion_inc();
+ ret = neigh_output(neigh, skb, false);
+ dev_xmit_recursion_dec();
+ local_bh_enable();
+ rcu_read_unlock();
+ return ret;
+ }
+ rcu_read_unlock();
+ if (dst)
+ IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+out_drop:
+ kfree_skb(skb);
+ return -ENETDOWN;
+}
+
+static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ struct net *net = dev_net(dev);
+ int err, ret = NET_XMIT_DROP;
+
+ if (!nh) {
+ struct dst_entry *dst;
+ struct flowi6 fl6 = {
+ .flowi6_flags = FLOWI_FLAG_ANYSRC,
+ .flowi6_mark = skb->mark,
+ .flowlabel = ip6_flowinfo(ip6h),
+ .flowi6_oif = dev->ifindex,
+ .flowi6_proto = ip6h->nexthdr,
+ .daddr = ip6h->daddr,
+ .saddr = ip6h->saddr,
+ };
+
+ dst = ipv6_stub->ipv6_dst_lookup_flow(net, NULL, &fl6, NULL);
+ if (IS_ERR(dst))
+ goto out_drop;
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+ } else if (nh->nh_family != AF_INET6) {
+ goto out_drop;
+ }
+
+ err = bpf_out_neigh_v6(net, skb, dev, nh);
+ if (unlikely(net_xmit_eval(err)))
+ DEV_STATS_INC(dev, tx_errors);
+ else
+ ret = NET_XMIT_SUCCESS;
+ goto out_xmit;
+out_drop:
+ DEV_STATS_INC(dev, tx_errors);
+ kfree_skb(skb);
+out_xmit:
+ return ret;
+}
+#else
+static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+}
+#endif /* CONFIG_IPV6 */
+
+#if IS_ENABLED(CONFIG_INET)
+static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
+ struct net_device *dev, struct bpf_nh_params *nh)
+{
+ u32 hh_len = LL_RESERVED_SPACE(dev);
+ struct neighbour *neigh;
+ bool is_v6gw = false;
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("bpf: recursion limit reached on datapath, buggy bpf program?\n");
+ goto out_drop;
+ }
+
+ skb->dev = dev;
+ skb_clear_tstamp(skb);
+
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb)
+ return -ENOMEM;
+ }
+
+ rcu_read_lock();
+ if (!nh) {
+ struct rtable *rt = skb_rtable(skb);
+
+ neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
+ } else if (nh->nh_family == AF_INET6) {
+ neigh = ip_neigh_gw6(dev, &nh->ipv6_nh);
+ is_v6gw = true;
+ } else if (nh->nh_family == AF_INET) {
+ neigh = ip_neigh_gw4(dev, nh->ipv4_nh);
+ } else {
+ rcu_read_unlock();
+ goto out_drop;
+ }
+
+ if (likely(!IS_ERR(neigh))) {
+ int ret;
+
+ sock_confirm_neigh(skb, neigh);
+ local_bh_disable();
+ dev_xmit_recursion_inc();
+ ret = neigh_output(neigh, skb, is_v6gw);
+ dev_xmit_recursion_dec();
+ local_bh_enable();
+ rcu_read_unlock();
+ return ret;
+ }
+ rcu_read_unlock();
+out_drop:
+ kfree_skb(skb);
+ return -ENETDOWN;
+}
+
+static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ const struct iphdr *ip4h = ip_hdr(skb);
+ struct net *net = dev_net(dev);
+ int err, ret = NET_XMIT_DROP;
+
+ if (!nh) {
+ struct flowi4 fl4 = {
+ .flowi4_flags = FLOWI_FLAG_ANYSRC,
+ .flowi4_mark = skb->mark,
+ .flowi4_dscp = ip4h_dscp(ip4h),
+ .flowi4_oif = dev->ifindex,
+ .flowi4_proto = ip4h->protocol,
+ .daddr = ip4h->daddr,
+ .saddr = ip4h->saddr,
+ };
+ struct rtable *rt;
+
+ rt = ip_route_output_flow(net, &fl4, NULL);
+ if (IS_ERR(rt))
+ goto out_drop;
+ if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
+ ip_rt_put(rt);
+ goto out_drop;
+ }
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &rt->dst);
+ }
+
+ err = bpf_out_neigh_v4(net, skb, dev, nh);
+ if (unlikely(net_xmit_eval(err)))
+ DEV_STATS_INC(dev, tx_errors);
+ else
+ ret = NET_XMIT_SUCCESS;
+ goto out_xmit;
+out_drop:
+ DEV_STATS_INC(dev, tx_errors);
+ kfree_skb(skb);
+out_xmit:
+ return ret;
+}
+#else
+static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ kfree_skb(skb);
+ return NET_XMIT_DROP;
+}
+#endif /* CONFIG_INET */
+
+static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev,
+ struct bpf_nh_params *nh)
+{
+ struct ethhdr *ethh = eth_hdr(skb);
+
+ if (unlikely(skb->mac_header >= skb->network_header))
+ goto out;
+ bpf_push_mac_rcsum(skb);
+ if (is_multicast_ether_addr(ethh->h_dest))
+ goto out;
+
+ skb_pull(skb, sizeof(*ethh));
+ skb_unset_mac_header(skb);
+ skb_reset_network_header(skb);
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return __bpf_redirect_neigh_v4(skb, dev, nh);
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ return __bpf_redirect_neigh_v6(skb, dev, nh);
+out:
+ kfree_skb(skb);
+ return -ENOTSUPP;
+}
+
+/* Internal, non-exposed redirect flags. */
+enum {
+ BPF_F_NEIGH = (1ULL << 16),
+ BPF_F_PEER = (1ULL << 17),
+ BPF_F_NEXTHOP = (1ULL << 18),
+#define BPF_F_REDIRECT_INTERNAL (BPF_F_NEIGH | BPF_F_PEER | BPF_F_NEXTHOP)
+};
+
BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
{
struct net_device *dev;
struct sk_buff *clone;
int ret;
- if (unlikely(flags & ~(BPF_F_INGRESS)))
+ BUILD_BUG_ON(BPF_F_REDIRECT_INTERNAL & BPF_F_REDIRECT_FLAGS);
+
+ if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
return -EINVAL;
+ /* BPF test infra's convert___skb_to_skb() can create type-less
+ * GSO packets. gso_features_check() will detect this as a bad
+ * offload. However, let's not leak them out in the first place.
+ */
+ if (unlikely(skb_is_gso(skb) && !skb_shinfo(skb)->gso_type))
+ return -EBADMSG;
+
dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
if (unlikely(!dev))
return -EINVAL;
@@ -2099,162 +2496,121 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = {
.arg3_type = ARG_ANYTHING,
};
-DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info);
-EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
-
-BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
+static struct net_device *skb_get_peer_dev(struct net_device *dev)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ const struct net_device_ops *ops = dev->netdev_ops;
- if (unlikely(flags & ~(BPF_F_INGRESS)))
- return TC_ACT_SHOT;
-
- ri->ifindex = ifindex;
- ri->flags = flags;
-
- return TC_ACT_REDIRECT;
+ if (likely(ops->ndo_get_peer_dev))
+ return INDIRECT_CALL_1(ops->ndo_get_peer_dev,
+ netkit_peer_dev, dev);
+ return NULL;
}
int skb_do_redirect(struct sk_buff *skb)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
+ struct net *net = dev_net(skb->dev);
struct net_device *dev;
+ u32 flags = ri->flags;
- dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
- ri->ifindex = 0;
- if (unlikely(!dev)) {
- kfree_skb(skb);
- return -EINVAL;
+ dev = dev_get_by_index_rcu(net, ri->tgt_index);
+ ri->tgt_index = 0;
+ ri->flags = 0;
+ if (unlikely(!dev))
+ goto out_drop;
+ if (flags & BPF_F_PEER) {
+ if (unlikely(!skb_at_tc_ingress(skb)))
+ goto out_drop;
+ dev = skb_get_peer_dev(dev);
+ if (unlikely(!dev ||
+ !(dev->flags & IFF_UP) ||
+ net_eq(net, dev_net(dev))))
+ goto out_drop;
+ skb->dev = dev;
+ dev_sw_netstats_rx_add(dev, skb->len);
+ skb_scrub_packet(skb, false);
+ return -EAGAIN;
}
-
- return __bpf_redirect(skb, dev, ri->flags);
+ return flags & BPF_F_NEIGH ?
+ __bpf_redirect_neigh(skb, dev, flags & BPF_F_NEXTHOP ?
+ &ri->nh : NULL) :
+ __bpf_redirect(skb, dev, flags);
+out_drop:
+ kfree_skb(skb);
+ return -EINVAL;
}
-static const struct bpf_func_proto bpf_redirect_proto = {
- .func = bpf_redirect,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_ANYTHING,
- .arg2_type = ARG_ANYTHING,
-};
-
-BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
- struct bpf_map *, map, void *, key, u64, flags)
+BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
- /* If user passes invalid input drop the packet. */
- if (unlikely(flags & ~(BPF_F_INGRESS)))
- return SK_DROP;
+ if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL)))
+ return TC_ACT_SHOT;
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = __sock_hash_lookup_elem(map, key);
- if (!tcb->bpf.sk_redir)
- return SK_DROP;
+ ri->flags = flags;
+ ri->tgt_index = ifindex;
- return SK_PASS;
+ return TC_ACT_REDIRECT;
}
-static const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
- .func = bpf_sk_redirect_hash,
+static const struct bpf_func_proto bpf_redirect_proto = {
+ .func = bpf_redirect,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_PTR_TO_MAP_KEY,
- .arg4_type = ARG_ANYTHING,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
- struct bpf_map *, map, u32, key, u64, flags)
+BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
- /* If user passes invalid input drop the packet. */
- if (unlikely(flags & ~(BPF_F_INGRESS)))
- return SK_DROP;
-
- tcb->bpf.flags = flags;
- tcb->bpf.sk_redir = __sock_map_lookup_elem(map, key);
- if (!tcb->bpf.sk_redir)
- return SK_DROP;
-
- return SK_PASS;
-}
+ if (unlikely(flags))
+ return TC_ACT_SHOT;
-struct sock *do_sk_redirect_map(struct sk_buff *skb)
-{
- struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+ ri->flags = BPF_F_PEER;
+ ri->tgt_index = ifindex;
- return tcb->bpf.sk_redir;
+ return TC_ACT_REDIRECT;
}
-static const struct bpf_func_proto bpf_sk_redirect_map_proto = {
- .func = bpf_sk_redirect_map,
+static const struct bpf_func_proto bpf_redirect_peer_proto = {
+ .func = bpf_redirect_peer,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_ANYTHING,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_ANYTHING,
};
-BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg_buff *, msg,
- struct bpf_map *, map, void *, key, u64, flags)
+BPF_CALL_4(bpf_redirect_neigh, u32, ifindex, struct bpf_redir_neigh *, params,
+ int, plen, u64, flags)
{
- /* If user passes invalid input drop the packet. */
- if (unlikely(flags & ~(BPF_F_INGRESS)))
- return SK_DROP;
-
- msg->flags = flags;
- msg->sk_redir = __sock_hash_lookup_elem(map, key);
- if (!msg->sk_redir)
- return SK_DROP;
-
- return SK_PASS;
-}
-
-static const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
- .func = bpf_msg_redirect_hash,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_PTR_TO_MAP_KEY,
- .arg4_type = ARG_ANYTHING,
-};
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
-BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg,
- struct bpf_map *, map, u32, key, u64, flags)
-{
- /* If user passes invalid input drop the packet. */
- if (unlikely(flags & ~(BPF_F_INGRESS)))
- return SK_DROP;
+ if (unlikely((plen && plen < sizeof(*params)) || flags))
+ return TC_ACT_SHOT;
- msg->flags = flags;
- msg->sk_redir = __sock_map_lookup_elem(map, key);
- if (!msg->sk_redir)
- return SK_DROP;
+ ri->flags = BPF_F_NEIGH | (plen ? BPF_F_NEXTHOP : 0);
+ ri->tgt_index = ifindex;
- return SK_PASS;
-}
+ BUILD_BUG_ON(sizeof(struct bpf_redir_neigh) != sizeof(struct bpf_nh_params));
+ if (plen)
+ memcpy(&ri->nh, params, sizeof(ri->nh));
-struct sock *do_msg_redirect_map(struct sk_msg_buff *msg)
-{
- return msg->sk_redir;
+ return TC_ACT_REDIRECT;
}
-static const struct bpf_func_proto bpf_msg_redirect_map_proto = {
- .func = bpf_msg_redirect_map,
- .gpl_only = false,
- .ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_CONST_MAP_PTR,
- .arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_ANYTHING,
+static const struct bpf_func_proto bpf_redirect_neigh_proto = {
+ .func = bpf_redirect_neigh,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_ANYTHING,
+ .arg2_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
};
-BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg_buff *, msg, u32, bytes)
+BPF_CALL_2(bpf_msg_apply_bytes, struct sk_msg *, msg, u32, bytes)
{
msg->apply_bytes = bytes;
return 0;
@@ -2268,12 +2624,26 @@ static const struct bpf_func_proto bpf_msg_apply_bytes_proto = {
.arg2_type = ARG_ANYTHING,
};
-BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg_buff *, msg, u32, bytes)
+BPF_CALL_2(bpf_msg_cork_bytes, struct sk_msg *, msg, u32, bytes)
{
msg->cork_bytes = bytes;
return 0;
}
+static void sk_msg_reset_curr(struct sk_msg *msg)
+{
+ if (!msg->sg.size) {
+ msg->sg.curr = msg->sg.start;
+ msg->sg.copybreak = 0;
+ } else {
+ u32 i = msg->sg.end;
+
+ sk_msg_iter_var_prev(i);
+ msg->sg.curr = i;
+ msg->sg.copybreak = msg->sg.data[i].length;
+ }
+}
+
static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
.func = bpf_msg_cork_bytes,
.gpl_only = false,
@@ -2282,45 +2652,37 @@ static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
.arg2_type = ARG_ANYTHING,
};
-#define sk_msg_iter_var(var) \
- do { \
- var++; \
- if (var == MAX_SKB_FRAGS) \
- var = 0; \
- } while (0)
-
-BPF_CALL_4(bpf_msg_pull_data,
- struct sk_msg_buff *, msg, u32, start, u32, end, u64, flags)
+BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
+ u32, end, u64, flags)
{
- unsigned int len = 0, offset = 0, copy = 0, poffset = 0;
- int bytes = end - start, bytes_sg_total;
- struct scatterlist *sg = msg->sg_data;
- int first_sg, last_sg, i, shift;
- unsigned char *p, *to, *from;
+ u32 len = 0, offset = 0, copy = 0, poffset = 0, bytes = end - start;
+ u32 first_sge, last_sge, i, shift, bytes_sg_total;
+ struct scatterlist *sge;
+ u8 *raw, *to, *from;
struct page *page;
if (unlikely(flags || end <= start))
return -EINVAL;
/* First find the starting scatterlist element */
- i = msg->sg_start;
+ i = msg->sg.start;
do {
- len = sg[i].length;
+ offset += len;
+ len = sk_msg_elem(msg, i)->length;
if (start < offset + len)
break;
- offset += len;
- sk_msg_iter_var(i);
- } while (i != msg->sg_end);
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
if (unlikely(start >= offset + len))
return -EINVAL;
- first_sg = i;
+ first_sge = i;
/* The start may point into the sg element so we need to also
* account for the headroom.
*/
bytes_sg_total = start - offset + bytes;
- if (!msg->sg_copy[i] && bytes_sg_total <= len)
+ if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len)
goto out;
/* At this point we need to linearize multiple scatterlist
@@ -2334,12 +2696,12 @@ BPF_CALL_4(bpf_msg_pull_data,
* will copy the entire sg entry.
*/
do {
- copy += sg[i].length;
- sk_msg_iter_var(i);
+ copy += sk_msg_elem(msg, i)->length;
+ sk_msg_iter_var_next(i);
if (bytes_sg_total <= copy)
break;
- } while (i != msg->sg_end);
- last_sg = i;
+ } while (i != msg->sg.end);
+ last_sge = i;
if (unlikely(bytes_sg_total > copy))
return -EINVAL;
@@ -2348,63 +2710,62 @@ BPF_CALL_4(bpf_msg_pull_data,
get_order(copy));
if (unlikely(!page))
return -ENOMEM;
- p = page_address(page);
- i = first_sg;
+ raw = page_address(page);
+ i = first_sge;
do {
- from = sg_virt(&sg[i]);
- len = sg[i].length;
- to = p + poffset;
+ sge = sk_msg_elem(msg, i);
+ from = sg_virt(sge);
+ len = sge->length;
+ to = raw + poffset;
memcpy(to, from, len);
poffset += len;
- sg[i].length = 0;
- put_page(sg_page(&sg[i]));
+ sge->length = 0;
+ put_page(sg_page(sge));
- sk_msg_iter_var(i);
- } while (i != last_sg);
+ sk_msg_iter_var_next(i);
+ } while (i != last_sge);
- sg[first_sg].length = copy;
- sg_set_page(&sg[first_sg], page, copy, 0);
+ sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
/* To repair sg ring we need to shift entries. If we only
* had a single entry though we can just replace it and
* be done. Otherwise walk the ring and shift the entries.
*/
- WARN_ON_ONCE(last_sg == first_sg);
- shift = last_sg > first_sg ?
- last_sg - first_sg - 1 :
- MAX_SKB_FRAGS - first_sg + last_sg - 1;
+ WARN_ON_ONCE(last_sge == first_sge);
+ shift = last_sge > first_sge ?
+ last_sge - first_sge - 1 :
+ NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
if (!shift)
goto out;
- i = first_sg;
- sk_msg_iter_var(i);
+ i = first_sge;
+ sk_msg_iter_var_next(i);
do {
- int move_from;
+ u32 move_from;
- if (i + shift >= MAX_SKB_FRAGS)
- move_from = i + shift - MAX_SKB_FRAGS;
+ if (i + shift >= NR_MSG_FRAG_IDS)
+ move_from = i + shift - NR_MSG_FRAG_IDS;
else
move_from = i + shift;
-
- if (move_from == msg->sg_end)
+ if (move_from == msg->sg.end)
break;
- sg[i] = sg[move_from];
- sg[move_from].length = 0;
- sg[move_from].page_link = 0;
- sg[move_from].offset = 0;
-
- sk_msg_iter_var(i);
+ msg->sg.data[i] = msg->sg.data[move_from];
+ msg->sg.data[move_from].length = 0;
+ msg->sg.data[move_from].page_link = 0;
+ msg->sg.data[move_from].offset = 0;
+ sk_msg_iter_var_next(i);
} while (1);
- msg->sg_end -= shift;
- if (msg->sg_end < 0)
- msg->sg_end += MAX_SKB_FRAGS;
+
+ msg->sg.end = msg->sg.end - shift > msg->sg.end ?
+ msg->sg.end - shift + NR_MSG_FRAG_IDS :
+ msg->sg.end - shift;
out:
- msg->data = sg_virt(&sg[first_sg]) + start - offset;
+ sk_msg_reset_curr(msg);
+ msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
msg->data_end = msg->data + bytes;
-
return 0;
}
@@ -2418,6 +2779,357 @@ static const struct bpf_func_proto bpf_msg_pull_data_proto = {
.arg4_type = ARG_ANYTHING,
};
+BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
+ u32, len, u64, flags)
+{
+ struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
+ u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
+ u8 *raw, *to, *from;
+ struct page *page;
+
+ if (unlikely(flags))
+ return -EINVAL;
+
+ if (unlikely(len == 0))
+ return 0;
+
+ /* First find the starting scatterlist element */
+ i = msg->sg.start;
+ do {
+ offset += l;
+ l = sk_msg_elem(msg, i)->length;
+
+ if (start < offset + l)
+ break;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+
+ if (start > offset + l)
+ return -EINVAL;
+
+ space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
+
+ /* If no space available will fallback to copy, we need at
+ * least one scatterlist elem available to push data into
+ * when start aligns to the beginning of an element or two
+ * when it falls inside an element. We handle the start equals
+ * offset case because it's the common case for inserting a
+ * header.
+ */
+ if (!space || (space == 1 && start != offset))
+ copy = msg->sg.data[i].length;
+
+ page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
+ get_order(copy + len));
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ if (copy) {
+ int front, back;
+
+ raw = page_address(page);
+
+ if (i == msg->sg.end)
+ sk_msg_iter_var_prev(i);
+ psge = sk_msg_elem(msg, i);
+ front = start - offset;
+ back = psge->length - front;
+ from = sg_virt(psge);
+
+ if (front)
+ memcpy(raw, from, front);
+
+ if (back) {
+ from += front;
+ to = raw + front + len;
+
+ memcpy(to, from, back);
+ }
+
+ put_page(sg_page(psge));
+ new = i;
+ goto place_new;
+ }
+
+ if (start - offset) {
+ if (i == msg->sg.end)
+ sk_msg_iter_var_prev(i);
+ psge = sk_msg_elem(msg, i);
+ rsge = sk_msg_elem_cpy(msg, i);
+
+ psge->length = start - offset;
+ rsge.length -= psge->length;
+ rsge.offset += start;
+
+ sk_msg_iter_var_next(i);
+ sg_unmark_end(psge);
+ sg_unmark_end(&rsge);
+ }
+
+ /* Slot(s) to place newly allocated data */
+ sk_msg_iter_next(msg, end);
+ new = i;
+ sk_msg_iter_var_next(i);
+
+ if (i == msg->sg.end) {
+ if (!rsge.length)
+ goto place_new;
+ sk_msg_iter_next(msg, end);
+ goto place_new;
+ }
+
+ /* Shift one or two slots as needed */
+ sge = sk_msg_elem_cpy(msg, new);
+ sg_unmark_end(&sge);
+
+ nsge = sk_msg_elem_cpy(msg, i);
+ if (rsge.length) {
+ sk_msg_iter_var_next(i);
+ nnsge = sk_msg_elem_cpy(msg, i);
+ sk_msg_iter_next(msg, end);
+ }
+
+ while (i != msg->sg.end) {
+ msg->sg.data[i] = sge;
+ sge = nsge;
+ sk_msg_iter_var_next(i);
+ if (rsge.length) {
+ nsge = nnsge;
+ nnsge = sk_msg_elem_cpy(msg, i);
+ } else {
+ nsge = sk_msg_elem_cpy(msg, i);
+ }
+ }
+
+place_new:
+ /* Place newly allocated data buffer */
+ sk_mem_charge(msg->sk, len);
+ msg->sg.size += len;
+ __clear_bit(new, msg->sg.copy);
+ sg_set_page(&msg->sg.data[new], page, len + copy, 0);
+ if (rsge.length) {
+ get_page(sg_page(&rsge));
+ sk_msg_iter_var_next(new);
+ msg->sg.data[new] = rsge;
+ }
+
+ sk_msg_reset_curr(msg);
+ sk_msg_compute_data_pointers(msg);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_push_data_proto = {
+ .func = bpf_msg_push_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+static void sk_msg_shift_left(struct sk_msg *msg, int i)
+{
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+ int prev;
+
+ put_page(sg_page(sge));
+ do {
+ prev = i;
+ sk_msg_iter_var_next(i);
+ msg->sg.data[prev] = msg->sg.data[i];
+ } while (i != msg->sg.end);
+
+ sk_msg_iter_prev(msg, end);
+}
+
+static void sk_msg_shift_right(struct sk_msg *msg, int i)
+{
+ struct scatterlist tmp, sge;
+
+ sk_msg_iter_next(msg, end);
+ sge = sk_msg_elem_cpy(msg, i);
+ sk_msg_iter_var_next(i);
+ tmp = sk_msg_elem_cpy(msg, i);
+
+ while (i != msg->sg.end) {
+ msg->sg.data[i] = sge;
+ sk_msg_iter_var_next(i);
+ sge = tmp;
+ tmp = sk_msg_elem_cpy(msg, i);
+ }
+}
+
+BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
+ u32, len, u64, flags)
+{
+ u32 i = 0, l = 0, space, offset = 0;
+ u64 last = start + len;
+ int pop;
+
+ if (unlikely(flags))
+ return -EINVAL;
+
+ if (unlikely(len == 0))
+ return 0;
+
+ /* First find the starting scatterlist element */
+ i = msg->sg.start;
+ do {
+ offset += l;
+ l = sk_msg_elem(msg, i)->length;
+
+ if (start < offset + l)
+ break;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+
+ /* Bounds checks: start and pop must be inside message */
+ if (start >= offset + l || last > msg->sg.size)
+ return -EINVAL;
+
+ space = MAX_MSG_FRAGS - sk_msg_elem_used(msg);
+
+ pop = len;
+ /* --------------| offset
+ * -| start |-------- len -------|
+ *
+ * |----- a ----|-------- pop -------|----- b ----|
+ * |______________________________________________| length
+ *
+ *
+ * a: region at front of scatter element to save
+ * b: region at back of scatter element to save when length > a + pop
+ * pop: region to pop from element, initially the input 'len'; it will be
+ * decremented below per iteration.
+ *
+ * Two top-level cases to handle when start != offset, first b is non
+ * zero and second b is zero corresponding to when a pop includes more
+ * than one element.
+ *
+ * Then if b is non-zero AND there is no space allocate space and
+ * compact a, b regions into page. If there is space shift ring to
+ * the right freeing the next element in ring to place b, leaving
+ * a untouched except to reduce length.
+ */
+ if (start != offset) {
+ struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
+ int a = start - offset;
+ int b = sge->length - pop - a;
+
+ sk_msg_iter_var_next(i);
+
+ if (b > 0) {
+ if (space) {
+ sge->length = a;
+ sk_msg_shift_right(msg, i);
+ nsge = sk_msg_elem(msg, i);
+ get_page(sg_page(sge));
+ sg_set_page(nsge,
+ sg_page(sge),
+ b, sge->offset + pop + a);
+ } else {
+ struct page *page, *orig;
+ u8 *to, *from;
+
+ page = alloc_pages(__GFP_NOWARN |
+ __GFP_COMP | GFP_ATOMIC,
+ get_order(a + b));
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ orig = sg_page(sge);
+ from = sg_virt(sge);
+ to = page_address(page);
+ memcpy(to, from, a);
+ memcpy(to + a, from + a + pop, b);
+ sg_set_page(sge, page, a + b, 0);
+ put_page(orig);
+ }
+ pop = 0;
+ } else {
+ pop -= (sge->length - a);
+ sge->length = a;
+ }
+ }
+
+ /* From above the current layout _must_ be as follows,
+ *
+ * -| offset
+ * -| start
+ *
+ * |---- pop ---|---------------- b ------------|
+ * |____________________________________________| length
+ *
+ * Offset and start of the current msg elem are equal because in the
+ * previous case we handled offset != start and either consumed the
+ * entire element and advanced to the next element OR pop == 0.
+ *
+ * Two cases to handle here are first pop is less than the length
+ * leaving some remainder b above. Simply adjust the element's layout
+ * in this case. Or pop >= length of the element so that b = 0. In this
+ * case advance to next element decrementing pop.
+ */
+ while (pop) {
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+
+ if (pop < sge->length) {
+ sge->length -= pop;
+ sge->offset += pop;
+ pop = 0;
+ } else {
+ pop -= sge->length;
+ sk_msg_shift_left(msg, i);
+ }
+ }
+
+ sk_mem_uncharge(msg->sk, len - pop);
+ msg->sg.size -= (len - pop);
+ sk_msg_reset_curr(msg);
+ sk_msg_compute_data_pointers(msg);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_msg_pop_data_proto = {
+ .func = bpf_msg_pop_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+#ifdef CONFIG_CGROUP_NET_CLASSID
+BPF_CALL_0(bpf_get_cgroup_classid_curr)
+{
+ return __task_get_classid(current);
+}
+
+const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = {
+ .func = bpf_get_cgroup_classid_curr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+};
+
+BPF_CALL_1(bpf_skb_cgroup_classid, const struct sk_buff *, skb)
+{
+ struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk || !sk_fullsock(sk))
+ return 0;
+
+ return sock_cgroup_classid(&sk->sk_cgrp_data);
+}
+
+static const struct bpf_func_proto bpf_skb_cgroup_classid_proto = {
+ .func = bpf_skb_cgroup_classid,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+#endif
+
BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb)
{
return task_get_classid(skb);
@@ -2505,6 +3217,7 @@ BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto,
bpf_push_mac_rcsum(skb);
ret = skb_vlan_push(skb, vlan_proto, vlan_tci);
bpf_pull_mac_rcsum(skb);
+ skb_reset_mac_len(skb);
bpf_compute_data_pointers(skb);
return ret;
@@ -2538,13 +3251,20 @@ static const struct bpf_func_proto bpf_skb_vlan_pop_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto)
+{
+ skb->protocol = htons(proto);
+ if (skb_valid_dst(skb))
+ skb_dst_drop(skb);
+}
+
static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
{
- /* Caller already did skb_cow() with len as headroom,
+ /* Caller already did skb_cow() with meta_len+len as headroom,
* so no need to do it here.
*/
skb_push(skb, len);
- memmove(skb->data, skb->data + len, off);
+ skb_postpush_data_move(skb, len, off);
memset(skb->data + off, 0, len);
/* No skb_postpush_rcsum(skb, skb->data + off, len)
@@ -2557,15 +3277,18 @@ static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len)
static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len)
{
+ void *old_data;
+
/* skb_ensure_writable() is not needed here, as we're
* already working on an uncloned skb.
*/
if (unlikely(!pskb_may_pull(skb, off + len)))
return -ENOMEM;
- skb_postpull_rcsum(skb, skb->data + off, len);
- memmove(skb->data + len, skb->data, off);
+ old_data = skb->data;
__skb_pull(skb, len);
+ skb_postpull_rcsum(skb, old_data + off, len);
+ skb_postpull_data_move(skb, len, off);
return 0;
}
@@ -2610,14 +3333,11 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len)
static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
{
const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr);
+ const u8 meta_len = skb_metadata_len(skb);
u32 off = skb_mac_header_len(skb);
int ret;
- /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
- if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
- return -ENOTSUPP;
-
- ret = skb_cow(skb, len_diff);
+ ret = skb_cow(skb, meta_len + len_diff);
if (unlikely(ret < 0))
return ret;
@@ -2628,22 +3348,14 @@ static int bpf_skb_proto_4_to_6(struct sk_buff *skb)
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
- /* SKB_GSO_TCPV4 needs to be changed into
- * SKB_GSO_TCPV6.
- */
+ /* SKB_GSO_TCPV4 needs to be changed into SKB_GSO_TCPV6. */
if (shinfo->gso_type & SKB_GSO_TCPV4) {
shinfo->gso_type &= ~SKB_GSO_TCPV4;
shinfo->gso_type |= SKB_GSO_TCPV6;
}
-
- /* Due to IPv6 header, MSS needs to be downgraded. */
- skb_decrease_gso_size(shinfo, len_diff);
- /* Header must be checked, and gso_segs recomputed. */
- shinfo->gso_type |= SKB_GSO_DODGY;
- shinfo->gso_segs = 0;
}
- skb->protocol = htons(ETH_P_IPV6);
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
skb_clear_hash(skb);
return 0;
@@ -2655,10 +3367,6 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
u32 off = skb_mac_header_len(skb);
int ret;
- /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
- if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
- return -ENOTSUPP;
-
ret = skb_unclone(skb, GFP_ATOMIC);
if (unlikely(ret < 0))
return ret;
@@ -2670,22 +3378,14 @@ static int bpf_skb_proto_6_to_4(struct sk_buff *skb)
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
- /* SKB_GSO_TCPV6 needs to be changed into
- * SKB_GSO_TCPV4.
- */
+ /* SKB_GSO_TCPV6 needs to be changed into SKB_GSO_TCPV4. */
if (shinfo->gso_type & SKB_GSO_TCPV6) {
shinfo->gso_type &= ~SKB_GSO_TCPV6;
shinfo->gso_type |= SKB_GSO_TCPV4;
}
-
- /* Due to IPv4 header, MSS can be upgraded. */
- skb_increase_gso_size(shinfo, len_diff);
- /* Header must be checked, and gso_segs recomputed. */
- shinfo->gso_type |= SKB_GSO_DODGY;
- shinfo->gso_segs = 0;
}
- skb->protocol = htons(ETH_P_IP);
+ bpf_skb_change_protocol(skb, ETH_P_IP);
skb_clear_hash(skb);
return 0;
@@ -2776,44 +3476,151 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb)
}
}
-static int bpf_skb_net_grow(struct sk_buff *skb, u32 len_diff)
+#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \
+ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
+
+#define BPF_F_ADJ_ROOM_DECAP_L3_MASK (BPF_F_ADJ_ROOM_DECAP_L3_IPV4 | \
+ BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
+
+#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \
+ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
+ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
+ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
+ BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
+ BPF_F_ADJ_ROOM_ENCAP_L2( \
+ BPF_ADJ_ROOM_ENCAP_L2_MASK) | \
+ BPF_F_ADJ_ROOM_DECAP_L3_MASK)
+
+static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff,
+ u64 flags)
{
- u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
+ u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT;
+ bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK;
+ u16 mac_len = 0, inner_net = 0, inner_trans = 0;
+ const u8 meta_len = skb_metadata_len(skb);
+ unsigned int gso_type = SKB_GSO_DODGY;
int ret;
- /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
- if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
- return -ENOTSUPP;
+ if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
+ /* udp gso_size delineates datagrams, only allow if fixed */
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
+ !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
+ return -ENOTSUPP;
+ }
- ret = skb_cow(skb, len_diff);
+ ret = skb_cow_head(skb, meta_len + len_diff);
if (unlikely(ret < 0))
return ret;
+ if (encap) {
+ if (skb->protocol != htons(ETH_P_IP) &&
+ skb->protocol != htons(ETH_P_IPV6))
+ return -ENOTSUPP;
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 &&
+ flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
+ return -EINVAL;
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE &&
+ flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
+ return -EINVAL;
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
+ inner_mac_len < ETH_HLEN)
+ return -EINVAL;
+
+ if (skb->encapsulation)
+ return -EALREADY;
+
+ mac_len = skb->network_header - skb->mac_header;
+ inner_net = skb->network_header;
+ if (inner_mac_len > len_diff)
+ return -EINVAL;
+ inner_trans = skb->transport_header;
+ }
+
ret = bpf_skb_net_hdr_push(skb, off, len_diff);
if (unlikely(ret < 0))
return ret;
+ if (encap) {
+ skb->inner_mac_header = inner_net - inner_mac_len;
+ skb->inner_network_header = inner_net;
+ skb->inner_transport_header = inner_trans;
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
+ skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+ else
+ skb_set_inner_protocol(skb, skb->protocol);
+
+ skb->encapsulation = 1;
+ skb_set_network_header(skb, mac_len);
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
+ gso_type |= SKB_GSO_UDP_TUNNEL;
+ else if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE)
+ gso_type |= SKB_GSO_GRE;
+ else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
+ gso_type |= SKB_GSO_IPXIP6;
+ else if (flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
+ gso_type |= SKB_GSO_IPXIP4;
+
+ if (flags & BPF_F_ADJ_ROOM_ENCAP_L4_GRE ||
+ flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) {
+ int nh_len = flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 ?
+ sizeof(struct ipv6hdr) :
+ sizeof(struct iphdr);
+
+ skb_set_transport_header(skb, mac_len + nh_len);
+ }
+
+ /* Match skb->protocol to new outer l3 protocol */
+ if (skb->protocol == htons(ETH_P_IP) &&
+ flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV6)
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
+ else if (skb->protocol == htons(ETH_P_IPV6) &&
+ flags & BPF_F_ADJ_ROOM_ENCAP_L3_IPV4)
+ bpf_skb_change_protocol(skb, ETH_P_IP);
+ }
+
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
- /* Due to header grow, MSS needs to be downgraded. */
- skb_decrease_gso_size(shinfo, len_diff);
/* Header must be checked, and gso_segs recomputed. */
- shinfo->gso_type |= SKB_GSO_DODGY;
+ shinfo->gso_type |= gso_type;
shinfo->gso_segs = 0;
+
+ /* Due to header growth, MSS needs to be downgraded.
+ * There is a BUG_ON() when segmenting the frag_list with
+ * head_frag true, so linearize the skb after downgrading
+ * the MSS.
+ */
+ if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO)) {
+ skb_decrease_gso_size(shinfo, len_diff);
+ if (shinfo->frag_list)
+ return skb_linearize(skb);
+ }
}
return 0;
}
-static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
+static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff,
+ u64 flags)
{
- u32 off = skb_mac_header_len(skb) + bpf_skb_net_base_len(skb);
int ret;
- /* SCTP uses GSO_BY_FRAGS, thus cannot adjust it. */
- if (skb_is_gso(skb) && unlikely(skb_is_gso_sctp(skb)))
- return -ENOTSUPP;
+ if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO |
+ BPF_F_ADJ_ROOM_DECAP_L3_MASK |
+ BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
+ return -EINVAL;
+
+ if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) {
+ /* udp gso_size delineates datagrams, only allow if fixed */
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) ||
+ !(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
+ return -ENOTSUPP;
+ }
ret = skb_unclone(skb, GFP_ATOMIC);
if (unlikely(ret < 0))
@@ -2823,11 +3630,21 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
if (unlikely(ret < 0))
return ret;
+ /* Match skb->protocol to new outer l3 protocol */
+ if (skb->protocol == htons(ETH_P_IP) &&
+ flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV6)
+ bpf_skb_change_protocol(skb, ETH_P_IPV6);
+ else if (skb->protocol == htons(ETH_P_IPV6) &&
+ flags & BPF_F_ADJ_ROOM_DECAP_L3_IPV4)
+ bpf_skb_change_protocol(skb, ETH_P_IP);
+
if (skb_is_gso(skb)) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
/* Due to header shrink, MSS can be upgraded. */
- skb_increase_gso_size(shinfo, len_diff);
+ if (!(flags & BPF_F_ADJ_ROOM_FIXED_GSO))
+ skb_increase_gso_size(shinfo, len_diff);
+
/* Header must be checked, and gso_segs recomputed. */
shinfo->gso_type |= SKB_GSO_DODGY;
shinfo->gso_segs = 0;
@@ -2836,55 +3653,112 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
return 0;
}
-static u32 __bpf_skb_max_len(const struct sk_buff *skb)
+#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC
+
+BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
+ u32, mode, u64, flags)
{
- return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
- SKB_MAX_ALLOC;
+ u32 len_diff_abs = abs(len_diff);
+ bool shrink = len_diff < 0;
+ int ret = 0;
+
+ if (unlikely(flags || mode))
+ return -EINVAL;
+ if (unlikely(len_diff_abs > 0xfffU))
+ return -EFAULT;
+
+ if (!shrink) {
+ ret = skb_cow(skb, len_diff);
+ if (unlikely(ret < 0))
+ return ret;
+ __skb_push(skb, len_diff_abs);
+ memset(skb->data, 0, len_diff_abs);
+ } else {
+ if (unlikely(!pskb_may_pull(skb, len_diff_abs)))
+ return -ENOMEM;
+ __skb_pull(skb, len_diff_abs);
+ }
+ if (tls_sw_has_ctx_rx(skb->sk)) {
+ struct strp_msg *rxm = strp_msg(skb);
+
+ rxm->full_len += len_diff;
+ }
+ return ret;
}
-static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)
+static const struct bpf_func_proto sk_skb_adjust_room_proto = {
+ .func = sk_skb_adjust_room,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
+ u32, mode, u64, flags)
{
- bool trans_same = skb->transport_header == skb->network_header;
u32 len_cur, len_diff_abs = abs(len_diff);
u32 len_min = bpf_skb_net_base_len(skb);
- u32 len_max = __bpf_skb_max_len(skb);
+ u32 len_max = BPF_SKB_MAX_LEN;
__be16 proto = skb->protocol;
bool shrink = len_diff < 0;
+ u32 off;
int ret;
+ if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK |
+ BPF_F_ADJ_ROOM_NO_CSUM_RESET)))
+ return -EINVAL;
if (unlikely(len_diff_abs > 0xfffU))
return -EFAULT;
if (unlikely(proto != htons(ETH_P_IP) &&
proto != htons(ETH_P_IPV6)))
return -ENOTSUPP;
+ off = skb_mac_header_len(skb);
+ switch (mode) {
+ case BPF_ADJ_ROOM_NET:
+ off += bpf_skb_net_base_len(skb);
+ break;
+ case BPF_ADJ_ROOM_MAC:
+ break;
+ default:
+ return -ENOTSUPP;
+ }
+
+ if (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) {
+ if (!shrink)
+ return -EINVAL;
+
+ switch (flags & BPF_F_ADJ_ROOM_DECAP_L3_MASK) {
+ case BPF_F_ADJ_ROOM_DECAP_L3_IPV4:
+ len_min = sizeof(struct iphdr);
+ break;
+ case BPF_F_ADJ_ROOM_DECAP_L3_IPV6:
+ len_min = sizeof(struct ipv6hdr);
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
len_cur = skb->len - skb_network_offset(skb);
- if (skb_transport_header_was_set(skb) && !trans_same)
- len_cur = skb_network_header_len(skb);
if ((shrink && (len_diff_abs >= len_cur ||
len_cur - len_diff_abs < len_min)) ||
(!shrink && (skb->len + len_diff_abs > len_max &&
!skb_is_gso(skb))))
return -ENOTSUPP;
- ret = shrink ? bpf_skb_net_shrink(skb, len_diff_abs) :
- bpf_skb_net_grow(skb, len_diff_abs);
+ ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) :
+ bpf_skb_net_grow(skb, off, len_diff_abs, flags);
+ if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET))
+ __skb_reset_checksum_unnecessary(skb);
bpf_compute_data_pointers(skb);
return ret;
}
-BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff,
- u32, mode, u64, flags)
-{
- if (unlikely(flags))
- return -EINVAL;
- if (likely(mode == BPF_ADJ_ROOM_NET))
- return bpf_skb_adjust_net(skb, len_diff);
-
- return -ENOTSUPP;
-}
-
static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
.func = bpf_skb_adjust_room,
.gpl_only = false,
@@ -2897,13 +3771,22 @@ static const struct bpf_func_proto bpf_skb_adjust_room_proto = {
static u32 __bpf_skb_min_len(const struct sk_buff *skb)
{
- u32 min_len = skb_network_offset(skb);
+ int offset = skb_network_offset(skb);
+ u32 min_len = 0;
- if (skb_transport_header_was_set(skb))
- min_len = skb_transport_offset(skb);
- if (skb->ip_summed == CHECKSUM_PARTIAL)
- min_len = skb_checksum_start_offset(skb) +
- skb->csum_offset + sizeof(__sum16);
+ if (offset > 0)
+ min_len = offset;
+ if (skb_transport_header_was_set(skb)) {
+ offset = skb_transport_offset(skb);
+ if (offset > 0)
+ min_len = offset;
+ }
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ offset = skb_checksum_start_offset(skb) +
+ skb->csum_offset + sizeof(__sum16);
+ if (offset > 0)
+ min_len = offset;
+ }
return min_len;
}
@@ -2926,7 +3809,7 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
u64 flags)
{
- u32 max_len = __bpf_skb_max_len(skb);
+ u32 max_len = BPF_SKB_MAX_LEN;
u32 min_len = __bpf_skb_min_len(skb);
int ret;
@@ -2984,10 +3867,7 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
u64, flags)
{
- int ret = __bpf_skb_change_tail(skb, new_len, flags);
-
- bpf_compute_data_end_sk_skb(skb);
- return ret;
+ return __bpf_skb_change_tail(skb, new_len, flags);
}
static const struct bpf_func_proto sk_skb_change_tail_proto = {
@@ -3002,15 +3882,17 @@ static const struct bpf_func_proto sk_skb_change_tail_proto = {
static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
u64 flags)
{
- u32 max_len = __bpf_skb_max_len(skb);
+ const u8 meta_len = skb_metadata_len(skb);
+ u32 max_len = BPF_SKB_MAX_LEN;
u32 new_len = skb->len + head_room;
int ret;
- if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) ||
+ if (unlikely(flags || (int)head_room < 0 ||
+ (!skb_is_gso(skb) && new_len > max_len) ||
new_len < skb->len))
return -EINVAL;
- ret = skb_cow(skb, head_room);
+ ret = skb_cow(skb, meta_len + head_room);
if (likely(!ret)) {
/* Idea for this helper is that we currently only
* allow to expand on mac header. This means that
@@ -3022,8 +3904,10 @@ static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
* for redirection into L2 device.
*/
__skb_push(skb, head_room);
+ skb_postpush_data_move(skb, head_room, 0);
memset(skb->data, 0, head_room);
skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
}
return ret;
@@ -3050,10 +3934,7 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
u64, flags)
{
- int ret = __bpf_skb_change_head(skb, head_room, flags);
-
- bpf_compute_data_end_sk_skb(skb);
- return ret;
+ return __bpf_skb_change_head(skb, head_room, flags);
}
static const struct bpf_func_proto sk_skb_change_head_proto = {
@@ -3064,6 +3945,28 @@ static const struct bpf_func_proto sk_skb_change_head_proto = {
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
};
+
+BPF_CALL_1(bpf_xdp_get_buff_len, struct xdp_buff*, xdp)
+{
+ return xdp_get_buff_len(xdp);
+}
+
+static const struct bpf_func_proto bpf_xdp_get_buff_len_proto = {
+ .func = bpf_xdp_get_buff_len,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BTF_ID_LIST_SINGLE(bpf_xdp_get_buff_len_bpf_ids, struct, xdp_buff)
+
+const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto = {
+ .func = bpf_xdp_get_buff_len,
+ .gpl_only = false,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_xdp_get_buff_len_bpf_ids[0],
+};
+
static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
{
return xdp_data_meta_unsupported(xdp) ? 0 :
@@ -3098,17 +4001,266 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = {
.arg2_type = ARG_ANYTHING,
};
+void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off,
+ void *buf, unsigned long len, bool flush)
+{
+ unsigned long ptr_len, ptr_off = 0;
+ skb_frag_t *next_frag, *end_frag;
+ struct skb_shared_info *sinfo;
+ void *src, *dst;
+ u8 *ptr_buf;
+
+ if (likely(xdp->data_end - xdp->data >= off + len)) {
+ src = flush ? buf : xdp->data + off;
+ dst = flush ? xdp->data + off : buf;
+ memcpy(dst, src, len);
+ return;
+ }
+
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ end_frag = &sinfo->frags[sinfo->nr_frags];
+ next_frag = &sinfo->frags[0];
+
+ ptr_len = xdp->data_end - xdp->data;
+ ptr_buf = xdp->data;
+
+ while (true) {
+ if (off < ptr_off + ptr_len) {
+ unsigned long copy_off = off - ptr_off;
+ unsigned long copy_len = min(len, ptr_len - copy_off);
+
+ src = flush ? buf : ptr_buf + copy_off;
+ dst = flush ? ptr_buf + copy_off : buf;
+ memcpy(dst, src, copy_len);
+
+ off += copy_len;
+ len -= copy_len;
+ buf += copy_len;
+ }
+
+ if (!len || next_frag == end_frag)
+ break;
+
+ ptr_off += ptr_len;
+ ptr_buf = skb_frag_address(next_frag);
+ ptr_len = skb_frag_size(next_frag);
+ next_frag++;
+ }
+}
+
+void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len)
+{
+ u32 size = xdp->data_end - xdp->data;
+ struct skb_shared_info *sinfo;
+ void *addr = xdp->data;
+ int i;
+
+ if (unlikely(offset > 0xffff || len > 0xffff))
+ return ERR_PTR(-EFAULT);
+
+ if (unlikely(offset + len > xdp_get_buff_len(xdp)))
+ return ERR_PTR(-EINVAL);
+
+ if (likely(offset < size)) /* linear area */
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ offset -= size;
+ for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */
+ u32 frag_size = skb_frag_size(&sinfo->frags[i]);
+
+ if (offset < frag_size) {
+ addr = skb_frag_address(&sinfo->frags[i]);
+ size = frag_size;
+ break;
+ }
+ offset -= frag_size;
+ }
+out:
+ return offset + len <= size ? addr + offset : NULL;
+}
+
+BPF_CALL_4(bpf_xdp_load_bytes, struct xdp_buff *, xdp, u32, offset,
+ void *, buf, u32, len)
+{
+ void *ptr;
+
+ ptr = bpf_xdp_pointer(xdp, offset, len);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ if (!ptr)
+ bpf_xdp_copy_buf(xdp, offset, buf, len, false);
+ else
+ memcpy(buf, ptr, len);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_load_bytes_proto = {
+ .func = bpf_xdp_load_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
+{
+ return ____bpf_xdp_load_bytes(xdp, offset, buf, len);
+}
+
+BPF_CALL_4(bpf_xdp_store_bytes, struct xdp_buff *, xdp, u32, offset,
+ void *, buf, u32, len)
+{
+ void *ptr;
+
+ ptr = bpf_xdp_pointer(xdp, offset, len);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ if (!ptr)
+ bpf_xdp_copy_buf(xdp, offset, buf, len, true);
+ else
+ memcpy(ptr, buf, len);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_xdp_store_bytes_proto = {
+ .func = bpf_xdp_store_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg4_type = ARG_CONST_SIZE,
+};
+
+int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len)
+{
+ return ____bpf_xdp_store_bytes(xdp, offset, buf, len);
+}
+
+static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ skb_frag_t *frag = &sinfo->frags[sinfo->nr_frags - 1];
+ struct xdp_rxq_info *rxq = xdp->rxq;
+ unsigned int tailroom;
+
+ if (!rxq->frag_size || rxq->frag_size > xdp->frame_sz)
+ return -EOPNOTSUPP;
+
+ tailroom = rxq->frag_size - skb_frag_size(frag) - skb_frag_off(frag);
+ if (unlikely(offset > tailroom))
+ return -EINVAL;
+
+ memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset);
+ skb_frag_size_add(frag, offset);
+ sinfo->xdp_frags_size += offset;
+ if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
+ xsk_buff_get_tail(xdp)->data_end += offset;
+
+ return 0;
+}
+
+static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
+ bool tail, bool release)
+{
+ struct xdp_buff *zc_frag = tail ? xsk_buff_get_tail(xdp) :
+ xsk_buff_get_head(xdp);
+
+ if (release) {
+ xsk_buff_del_frag(zc_frag);
+ } else {
+ if (tail)
+ zc_frag->data_end -= shrink;
+ else
+ zc_frag->data += shrink;
+ }
+
+ return zc_frag;
+}
+
+static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
+ int shrink, bool tail)
+{
+ enum xdp_mem_type mem_type = xdp->rxq->mem.type;
+ bool release = skb_frag_size(frag) == shrink;
+ netmem_ref netmem = skb_frag_netmem(frag);
+ struct xdp_buff *zc_frag = NULL;
+
+ if (mem_type == MEM_TYPE_XSK_BUFF_POOL) {
+ netmem = 0;
+ zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, release);
+ }
+
+ if (release) {
+ __xdp_return(netmem, mem_type, false, zc_frag);
+ } else {
+ if (!tail)
+ skb_frag_off_add(frag, shrink);
+ skb_frag_size_sub(frag, shrink);
+ }
+
+ return release;
+}
+
+static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ int i, n_frags_free = 0, len_free = 0;
+
+ if (unlikely(offset > (int)xdp_get_buff_len(xdp) - ETH_HLEN))
+ return -EINVAL;
+
+ for (i = sinfo->nr_frags - 1; i >= 0 && offset > 0; i--) {
+ skb_frag_t *frag = &sinfo->frags[i];
+ int shrink = min_t(int, offset, skb_frag_size(frag));
+
+ len_free += shrink;
+ offset -= shrink;
+ if (bpf_xdp_shrink_data(xdp, frag, shrink, true))
+ n_frags_free++;
+ }
+ sinfo->nr_frags -= n_frags_free;
+ sinfo->xdp_frags_size -= len_free;
+
+ if (unlikely(!sinfo->nr_frags)) {
+ xdp_buff_clear_frags_flag(xdp);
+ xdp_buff_clear_frag_pfmemalloc(xdp);
+ xdp->data_end -= offset;
+ }
+
+ return 0;
+}
+
BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
{
+ void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
void *data_end = xdp->data_end + offset;
- /* only shrinking is allowed for now. */
- if (unlikely(offset >= 0))
+ if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */
+ if (offset < 0)
+ return bpf_xdp_frags_shrink_tail(xdp, -offset);
+
+ return bpf_xdp_frags_increase_tail(xdp, offset);
+ }
+
+ /* Notice that xdp_data_hard_end has reserved some tailroom */
+ if (unlikely(data_end > data_hard_end))
return -EINVAL;
if (unlikely(data_end < xdp->data + ETH_HLEN))
return -EINVAL;
+ /* Clear memory area on grow, can contain uninit kernel memory */
+ if (offset > 0)
+ memset(xdp->data_end, 0, offset);
+
xdp->data_end = data_end;
return 0;
@@ -3133,8 +4285,7 @@ BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
if (unlikely(meta < xdp_frame_end ||
meta > xdp->data))
return -EINVAL;
- if (unlikely((metalen & (sizeof(__u32) - 1)) ||
- (metalen > 32)))
+ if (unlikely(xdp_metalen_invalid(metalen)))
return -EACCES;
xdp->data_meta = meta;
@@ -3150,276 +4301,338 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
.arg2_type = ARG_ANYTHING,
};
-static int __bpf_tx_xdp(struct net_device *dev,
- struct bpf_map *map,
- struct xdp_buff *xdp,
- u32 index)
+/**
+ * DOC: xdp redirect
+ *
+ * XDP_REDIRECT works by a three-step process, implemented in the functions
+ * below:
+ *
+ * 1. The bpf_redirect() and bpf_redirect_map() helpers will look up the target
+ * of the redirect and store it (along with some other metadata) in a per-CPU
+ * struct bpf_redirect_info.
+ *
+ * 2. When the program returns the XDP_REDIRECT return code, the driver will
+ * call xdp_do_redirect() which will use the information in struct
+ * bpf_redirect_info to actually enqueue the frame into a map type-specific
+ * bulk queue structure.
+ *
+ * 3. Before exiting its NAPI poll loop, the driver will call
+ * xdp_do_flush(), which will flush all the different bulk queues,
+ * thus completing the redirect. Note that xdp_do_flush() must be
+ * called before napi_complete_done() in the driver, as the
+ * XDP_REDIRECT logic relies on being inside a single NAPI instance
+ * through to the xdp_do_flush() call for RCU protection of all
+ * in-kernel data structures.
+ */
+/*
+ * Pointers to the map entries will be kept around for this whole sequence of
+ * steps, protected by RCU. However, there is no top-level rcu_read_lock() in
+ * the core code; instead, the RCU protection relies on everything happening
+ * inside a single NAPI poll sequence, which means it's between a pair of calls
+ * to local_bh_disable()/local_bh_enable().
+ *
+ * The map entries are marked as __rcu and the map code makes sure to
+ * dereference those pointers with rcu_dereference_check() in a way that works
+ * for both sections that hold an rcu_read_lock() and sections that are
+ * called from NAPI without a separate rcu_read_lock(). The code below does not
+ * use RCU annotations, but relies on those in the map code.
+ */
+void xdp_do_flush(void)
{
- struct xdp_frame *xdpf;
- int err, sent;
-
- if (!dev->netdev_ops->ndo_xdp_xmit) {
- return -EOPNOTSUPP;
- }
-
- err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
- if (unlikely(err))
- return err;
-
- xdpf = convert_to_xdp_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
+ struct list_head *lh_map, *lh_dev, *lh_xsk;
- sent = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf, XDP_XMIT_FLUSH);
- if (sent <= 0)
- return sent;
- return 0;
+ bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
+ if (lh_dev)
+ __dev_flush(lh_dev);
+ if (lh_map)
+ __cpu_map_flush(lh_map);
+ if (lh_xsk)
+ __xsk_map_flush(lh_xsk);
}
+EXPORT_SYMBOL_GPL(xdp_do_flush);
-static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
- struct bpf_map *map,
- struct xdp_buff *xdp,
- u32 index)
+#if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL)
+void xdp_do_check_flushed(struct napi_struct *napi)
{
- int err;
+ struct list_head *lh_map, *lh_dev, *lh_xsk;
+ bool missed = false;
- switch (map->map_type) {
- case BPF_MAP_TYPE_DEVMAP: {
- struct bpf_dtab_netdev *dst = fwd;
-
- err = dev_map_enqueue(dst, xdp, dev_rx);
- if (err)
- return err;
- __dev_map_insert_ctx(map, index);
- break;
- }
- case BPF_MAP_TYPE_CPUMAP: {
- struct bpf_cpu_map_entry *rcpu = fwd;
-
- err = cpu_map_enqueue(rcpu, xdp, dev_rx);
- if (err)
- return err;
- __cpu_map_insert_ctx(map, index);
- break;
+ bpf_net_ctx_get_all_used_flush_lists(&lh_map, &lh_dev, &lh_xsk);
+ if (lh_dev) {
+ __dev_flush(lh_dev);
+ missed = true;
}
- case BPF_MAP_TYPE_XSKMAP: {
- struct xdp_sock *xs = fwd;
-
- err = __xsk_map_redirect(map, xdp, xs);
- return err;
+ if (lh_map) {
+ __cpu_map_flush(lh_map);
+ missed = true;
}
- default:
- break;
+ if (lh_xsk) {
+ __xsk_map_flush(lh_xsk);
+ missed = true;
}
- return 0;
-}
-void xdp_do_flush_map(void)
-{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- struct bpf_map *map = ri->map_to_flush;
-
- ri->map_to_flush = NULL;
- if (map) {
- switch (map->map_type) {
- case BPF_MAP_TYPE_DEVMAP:
- __dev_map_flush(map);
- break;
- case BPF_MAP_TYPE_CPUMAP:
- __cpu_map_flush(map);
- break;
- case BPF_MAP_TYPE_XSKMAP:
- __xsk_map_flush(map);
- break;
- default:
- break;
- }
- }
+ WARN_ONCE(missed, "Missing xdp_do_flush() invocation after NAPI by %ps\n",
+ napi->poll);
}
-EXPORT_SYMBOL_GPL(xdp_do_flush_map);
+#endif
-static void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
-{
- switch (map->map_type) {
- case BPF_MAP_TYPE_DEVMAP:
- return __dev_map_lookup_elem(map, index);
- case BPF_MAP_TYPE_CPUMAP:
- return __cpu_map_lookup_elem(map, index);
- case BPF_MAP_TYPE_XSKMAP:
- return __xsk_map_lookup_elem(map, index);
- default:
- return NULL;
- }
-}
+DEFINE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key);
+EXPORT_SYMBOL_GPL(bpf_master_redirect_enabled_key);
-void bpf_clear_redirect_map(struct bpf_map *map)
+u32 xdp_master_redirect(struct xdp_buff *xdp)
{
- struct bpf_redirect_info *ri;
- int cpu;
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
+ struct net_device *master, *slave;
- for_each_possible_cpu(cpu) {
- ri = per_cpu_ptr(&bpf_redirect_info, cpu);
- /* Avoid polluting remote cacheline due to writes if
- * not needed. Once we pass this test, we need the
- * cmpxchg() to make sure it hasn't been changed in
- * the meantime by remote CPU.
+ master = netdev_master_upper_dev_get_rcu(xdp->rxq->dev);
+ slave = master->netdev_ops->ndo_xdp_get_xmit_slave(master, xdp);
+ if (slave && slave != xdp->rxq->dev) {
+ /* The target device is different from the receiving device, so
+ * redirect it to the new device.
+ * Using XDP_REDIRECT gets the correct behaviour from XDP enabled
+ * drivers to unmap the packet from their rx ring.
*/
- if (unlikely(READ_ONCE(ri->map) == map))
- cmpxchg(&ri->map, map, NULL);
+ ri->tgt_index = slave->ifindex;
+ ri->map_id = INT_MAX;
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
+ return XDP_REDIRECT;
}
+ return XDP_TX;
}
+EXPORT_SYMBOL_GPL(xdp_master_redirect);
-static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog, struct bpf_map *map)
+static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri,
+ const struct net_device *dev,
+ struct xdp_buff *xdp,
+ const struct bpf_prog *xdp_prog)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- u32 index = ri->ifindex;
- void *fwd = NULL;
+ enum bpf_map_type map_type = ri->map_type;
+ void *fwd = ri->tgt_value;
+ u32 map_id = ri->map_id;
int err;
- ri->ifindex = 0;
- WRITE_ONCE(ri->map, NULL);
-
- fwd = __xdp_map_lookup_elem(map, index);
- if (!fwd) {
- err = -EINVAL;
- goto err;
- }
- if (ri->map_to_flush && ri->map_to_flush != map)
- xdp_do_flush_map();
+ ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
- err = __bpf_tx_xdp_map(dev, fwd, map, xdp, index);
+ err = __xsk_map_redirect(fwd, xdp);
if (unlikely(err))
goto err;
- ri->map_to_flush = map;
- _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
return 0;
err:
- _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
return err;
}
-int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+static __always_inline int
+__xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev,
+ struct xdp_frame *xdpf,
+ const struct bpf_prog *xdp_prog)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- struct bpf_map *map = READ_ONCE(ri->map);
- struct net_device *fwd;
- u32 index = ri->ifindex;
+ enum bpf_map_type map_type = ri->map_type;
+ void *fwd = ri->tgt_value;
+ u32 map_id = ri->map_id;
+ u32 flags = ri->flags;
+ struct bpf_map *map;
int err;
- if (map)
- return xdp_do_redirect_map(dev, xdp, xdp_prog, map);
+ ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->flags = 0;
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
- fwd = dev_get_by_index_rcu(dev_net(dev), index);
- ri->ifindex = 0;
- if (unlikely(!fwd)) {
- err = -EINVAL;
+ if (unlikely(!xdpf)) {
+ err = -EOVERFLOW;
goto err;
}
- err = __bpf_tx_xdp(fwd, NULL, xdp, 0);
+ switch (map_type) {
+ case BPF_MAP_TYPE_DEVMAP:
+ fallthrough;
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ if (unlikely(flags & BPF_F_BROADCAST)) {
+ map = READ_ONCE(ri->map);
+
+ /* The map pointer is cleared when the map is being torn
+ * down by dev_map_free()
+ */
+ if (unlikely(!map)) {
+ err = -ENOENT;
+ break;
+ }
+
+ WRITE_ONCE(ri->map, NULL);
+ err = dev_map_enqueue_multi(xdpf, dev, map,
+ flags & BPF_F_EXCLUDE_INGRESS);
+ } else {
+ err = dev_map_enqueue(fwd, xdpf, dev);
+ }
+ break;
+ case BPF_MAP_TYPE_CPUMAP:
+ err = cpu_map_enqueue(fwd, xdpf, dev);
+ break;
+ case BPF_MAP_TYPE_UNSPEC:
+ if (map_id == INT_MAX) {
+ fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ break;
+ }
+ err = dev_xdp_enqueue(fwd, xdpf, dev);
+ break;
+ }
+ fallthrough;
+ default:
+ err = -EBADRQC;
+ }
+
if (unlikely(err))
goto err;
- _trace_xdp_redirect(dev, xdp_prog, index);
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
return 0;
err:
- _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
return err;
}
+
+int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
+ const struct bpf_prog *xdp_prog)
+{
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
+ enum bpf_map_type map_type = ri->map_type;
+
+ if (map_type == BPF_MAP_TYPE_XSKMAP)
+ return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
+
+ return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp),
+ xdp_prog);
+}
EXPORT_SYMBOL_GPL(xdp_do_redirect);
+int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp,
+ struct xdp_frame *xdpf,
+ const struct bpf_prog *xdp_prog)
+{
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
+ enum bpf_map_type map_type = ri->map_type;
+
+ if (map_type == BPF_MAP_TYPE_XSKMAP)
+ return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog);
+
+ return __xdp_do_redirect_frame(ri, dev, xdpf, xdp_prog);
+}
+EXPORT_SYMBOL_GPL(xdp_do_redirect_frame);
+
static int xdp_do_generic_redirect_map(struct net_device *dev,
struct sk_buff *skb,
struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog,
- struct bpf_map *map)
+ const struct bpf_prog *xdp_prog,
+ void *fwd, enum bpf_map_type map_type,
+ u32 map_id, u32 flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- u32 index = ri->ifindex;
- void *fwd = NULL;
- int err = 0;
-
- ri->ifindex = 0;
- WRITE_ONCE(ri->map, NULL);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
+ struct bpf_map *map;
+ int err;
- fwd = __xdp_map_lookup_elem(map, index);
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
- }
+ switch (map_type) {
+ case BPF_MAP_TYPE_DEVMAP:
+ fallthrough;
+ case BPF_MAP_TYPE_DEVMAP_HASH:
+ if (unlikely(flags & BPF_F_BROADCAST)) {
+ map = READ_ONCE(ri->map);
- if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
- struct bpf_dtab_netdev *dst = fwd;
+ /* The map pointer is cleared when the map is being torn
+ * down by dev_map_free()
+ */
+ if (unlikely(!map)) {
+ err = -ENOENT;
+ break;
+ }
- err = dev_map_generic_redirect(dst, skb, xdp_prog);
+ WRITE_ONCE(ri->map, NULL);
+ err = dev_map_redirect_multi(dev, skb, xdp_prog, map,
+ flags & BPF_F_EXCLUDE_INGRESS);
+ } else {
+ err = dev_map_generic_redirect(fwd, skb, xdp_prog);
+ }
if (unlikely(err))
goto err;
- } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
- struct xdp_sock *xs = fwd;
-
- err = xsk_generic_rcv(xs, xdp);
+ break;
+ case BPF_MAP_TYPE_XSKMAP:
+ err = xsk_generic_rcv(fwd, xdp);
if (err)
goto err;
consume_skb(skb);
- } else {
- /* TODO: Handle BPF_MAP_TYPE_CPUMAP */
+ break;
+ case BPF_MAP_TYPE_CPUMAP:
+ err = cpu_map_generic_redirect(fwd, skb);
+ if (unlikely(err))
+ goto err;
+ break;
+ default:
err = -EBADRQC;
goto err;
}
- _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
+ _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
return 0;
err:
- _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
+ _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
return err;
}
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
- struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
-{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- struct bpf_map *map = READ_ONCE(ri->map);
- u32 index = ri->ifindex;
- struct net_device *fwd;
- int err = 0;
-
- if (map)
- return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
- map);
- ri->ifindex = 0;
- fwd = dev_get_by_index_rcu(dev_net(dev), index);
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
- }
+ struct xdp_buff *xdp,
+ const struct bpf_prog *xdp_prog)
+{
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
+ enum bpf_map_type map_type = ri->map_type;
+ void *fwd = ri->tgt_value;
+ u32 map_id = ri->map_id;
+ u32 flags = ri->flags;
+ int err;
- err = xdp_ok_fwd_dev(fwd, skb->len);
- if (unlikely(err))
- goto err;
+ ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
+ ri->flags = 0;
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
- skb->dev = fwd;
- _trace_xdp_redirect(dev, xdp_prog, index);
- generic_xdp_tx(skb, xdp_prog);
- return 0;
+ if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
+ fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
+ if (unlikely(!fwd)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ err = xdp_ok_fwd_dev(fwd, skb->len);
+ if (unlikely(err))
+ goto err;
+
+ skb->dev = fwd;
+ _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
+ generic_xdp_tx(skb, xdp_prog);
+ return 0;
+ }
+
+ return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id, flags);
err:
- _trace_xdp_redirect_err(dev, xdp_prog, index, err);
+ _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
return err;
}
-EXPORT_SYMBOL_GPL(xdp_do_generic_redirect);
BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+ struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
if (unlikely(flags))
return XDP_ABORTED;
- ri->ifindex = ifindex;
- ri->flags = flags;
- WRITE_ONCE(ri->map, NULL);
+ /* NB! Map type UNSPEC and map_id == INT_MAX (never generated
+ * by map_idr) is used for ifindex based XDP redirect.
+ */
+ ri->tgt_index = ifindex;
+ ri->map_id = INT_MAX;
+ ri->map_type = BPF_MAP_TYPE_UNSPEC;
return XDP_REDIRECT;
}
@@ -3432,19 +4645,10 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = {
.arg2_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
+BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u64, key,
u64, flags)
{
- struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
-
- if (unlikely(flags))
- return XDP_ABORTED;
-
- ri->ifindex = ifindex;
- ri->flags = flags;
- WRITE_ONCE(ri->map, map);
-
- return XDP_REDIRECT;
+ return map->ops->map_redirect(map, key, flags);
}
static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
@@ -3476,7 +4680,7 @@ BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map,
if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
return -EINVAL;
- if (unlikely(skb_size > skb->len))
+ if (unlikely(!skb || skb_size > skb->len))
return -EFAULT;
return bpf_event_output(map, flags, meta, meta_size, skb, skb_size,
@@ -3490,7 +4694,21 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BTF_ID_LIST_SINGLE(bpf_skb_output_btf_ids, struct, sk_buff)
+
+const struct bpf_func_proto bpf_skb_output_proto = {
+ .func = bpf_skb_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_skb_output_btf_ids[0],
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@@ -3507,7 +4725,8 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key
void *to_orig = to;
int err;
- if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) {
+ if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 |
+ BPF_F_TUNINFO_FLAGS)))) {
err = -EINVAL;
goto err_clear;
}
@@ -3518,6 +4737,7 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key
if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
err = -EINVAL;
switch (size) {
+ case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
case offsetof(struct bpf_tunnel_key, tunnel_label):
case offsetof(struct bpf_tunnel_key, tunnel_ext):
goto set_compat;
@@ -3538,15 +4758,22 @@ set_compat:
to->tunnel_id = be64_to_cpu(info->key.tun_id);
to->tunnel_tos = info->key.tos;
to->tunnel_ttl = info->key.ttl;
- to->tunnel_ext = 0;
+ if (flags & BPF_F_TUNINFO_FLAGS)
+ to->tunnel_flags = ip_tunnel_flags_to_be16(info->key.tun_flags);
+ else
+ to->tunnel_ext = 0;
if (flags & BPF_F_TUNINFO_IPV6) {
memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
sizeof(to->remote_ipv6));
+ memcpy(to->local_ipv6, &info->key.u.ipv6.dst,
+ sizeof(to->local_ipv6));
to->tunnel_label = be32_to_cpu(info->key.label);
} else {
to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3);
+ to->local_ipv4 = be32_to_cpu(info->key.u.ipv4.dst);
+ memset(&to->local_ipv6[1], 0, sizeof(__u32) * 3);
to->tunnel_label = 0;
}
@@ -3575,7 +4802,7 @@ BPF_CALL_3(bpf_skb_get_tunnel_opt, struct sk_buff *, skb, u8 *, to, u32, size)
int err;
if (unlikely(!info ||
- !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT))) {
+ !ip_tunnel_is_options_present(info->key.tun_flags))) {
err = -ENOENT;
goto err_clear;
}
@@ -3613,10 +4840,12 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
struct ip_tunnel_info *info;
if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
- BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER)))
+ BPF_F_DONT_FRAGMENT | BPF_F_SEQ_NUMBER |
+ BPF_F_NO_TUNNEL_KEY)))
return -EINVAL;
if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
switch (size) {
+ case offsetof(struct bpf_tunnel_key, local_ipv6[0]):
case offsetof(struct bpf_tunnel_key, tunnel_label):
case offsetof(struct bpf_tunnel_key, tunnel_ext):
case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
@@ -3643,13 +4872,15 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
memset(info, 0, sizeof(*info));
info->mode = IP_TUNNEL_INFO_TX;
- info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
- if (flags & BPF_F_DONT_FRAGMENT)
- info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
- if (flags & BPF_F_ZERO_CSUM_TX)
- info->key.tun_flags &= ~TUNNEL_CSUM;
- if (flags & BPF_F_SEQ_NUMBER)
- info->key.tun_flags |= TUNNEL_SEQ;
+ __set_bit(IP_TUNNEL_NOCACHE_BIT, info->key.tun_flags);
+ __assign_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, info->key.tun_flags,
+ flags & BPF_F_DONT_FRAGMENT);
+ __assign_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags,
+ !(flags & BPF_F_ZERO_CSUM_TX));
+ __assign_bit(IP_TUNNEL_SEQ_BIT, info->key.tun_flags,
+ flags & BPF_F_SEQ_NUMBER);
+ __assign_bit(IP_TUNNEL_KEY_BIT, info->key.tun_flags,
+ !(flags & BPF_F_NO_TUNNEL_KEY));
info->key.tun_id = cpu_to_be64(from->tunnel_id);
info->key.tos = from->tunnel_tos;
@@ -3659,10 +4890,14 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb,
info->mode |= IP_TUNNEL_INFO_IPV6;
memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
sizeof(from->remote_ipv6));
+ memcpy(&info->key.u.ipv6.src, from->local_ipv6,
+ sizeof(from->local_ipv6));
info->key.label = cpu_to_be32(from->tunnel_label) &
IPV6_FLOWLABEL_MASK;
} else {
info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
+ info->key.u.ipv4.src = cpu_to_be32(from->local_ipv4);
+ info->key.flow_flags = FLOWI_FLAG_ANYSRC;
}
return 0;
@@ -3673,7 +4908,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
.arg4_type = ARG_ANYTHING,
};
@@ -3683,13 +4918,15 @@ BPF_CALL_3(bpf_skb_set_tunnel_opt, struct sk_buff *, skb,
{
struct ip_tunnel_info *info = skb_tunnel_info(skb);
const struct metadata_dst *md = this_cpu_ptr(md_dst);
+ IP_TUNNEL_DECLARE_FLAGS(present) = { };
if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
return -EINVAL;
if (unlikely(size > IP_TUNNEL_OPTS_MAX))
return -ENOMEM;
- ip_tunnel_info_opts_set(info, from, size, TUNNEL_OPTIONS_PRESENT);
+ ip_tunnel_set_options_present(present);
+ ip_tunnel_info_opts_set(info, from, size, present);
return 0;
}
@@ -3699,7 +4936,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
@@ -3758,16 +4995,21 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = {
};
#ifdef CONFIG_SOCK_CGROUP_DATA
-BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
+static inline u64 __bpf_sk_cgroup_id(struct sock *sk)
{
- struct sock *sk = skb_to_full_sk(skb);
struct cgroup *cgrp;
+ sk = sk_to_full_sk(sk);
if (!sk || !sk_fullsock(sk))
return 0;
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- return cgrp->kn->id.id;
+ return cgroup_id(cgrp);
+}
+
+BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb)
+{
+ return __bpf_sk_cgroup_id(skb->sk);
}
static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
@@ -3777,13 +5019,13 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
- ancestor_level)
+static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk,
+ int ancestor_level)
{
- struct sock *sk = skb_to_full_sk(skb);
struct cgroup *ancestor;
struct cgroup *cgrp;
+ sk = sk_to_full_sk(sk);
if (!sk || !sk_fullsock(sk))
return 0;
@@ -3792,7 +5034,13 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
if (!ancestor)
return 0;
- return ancestor->kn->id.id;
+ return cgroup_id(ancestor);
+}
+
+BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int,
+ ancestor_level)
+{
+ return __bpf_sk_ancestor_cgroup_id(skb->sk, ancestor_level);
}
static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
@@ -3802,12 +5050,39 @@ static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
};
+
+BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk)
+{
+ return __bpf_sk_cgroup_id(sk);
+}
+
+static const struct bpf_func_proto bpf_sk_cgroup_id_proto = {
+ .func = bpf_sk_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+};
+
+BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level)
+{
+ return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level);
+}
+
+static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = {
+ .func = bpf_sk_ancestor_cgroup_id,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+};
#endif
-static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff,
+static unsigned long bpf_xdp_copy(void *dst, const void *ctx,
unsigned long off, unsigned long len)
{
- memcpy(dst_buff, src_buff + off, len);
+ struct xdp_buff *xdp = (struct xdp_buff *)ctx;
+
+ bpf_xdp_copy_buf(xdp, off, dst, len, false);
return 0;
}
@@ -3818,10 +5093,11 @@ BPF_CALL_5(bpf_xdp_event_output, struct xdp_buff *, xdp, struct bpf_map *, map,
if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK)))
return -EINVAL;
- if (unlikely(xdp_size > (unsigned long)(xdp->data_end - xdp->data)))
+
+ if (unlikely(!xdp || xdp_size > xdp_get_buff_len(xdp)))
return -EFAULT;
- return bpf_event_output(map, flags, meta, meta_size, xdp->data,
+ return bpf_event_output(map, flags, meta, meta_size, xdp,
xdp_size, bpf_xdp_copy);
}
@@ -3832,13 +5108,27 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BTF_ID_LIST_SINGLE(bpf_xdp_output_btf_ids, struct, xdp_buff)
+
+const struct bpf_func_proto bpf_xdp_output_proto = {
+ .func = bpf_xdp_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_xdp_output_btf_ids[0],
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb)
{
- return skb->sk ? sock_gen_cookie(skb->sk) : 0;
+ return skb->sk ? __sock_gen_cookie(skb->sk) : 0;
}
static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
@@ -3850,7 +5140,7 @@ static const struct bpf_func_proto bpf_get_socket_cookie_proto = {
BPF_CALL_1(bpf_get_socket_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
{
- return sock_gen_cookie(ctx->sk);
+ return __sock_gen_cookie(ctx->sk);
}
static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
@@ -3860,9 +5150,33 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_addr_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_1(bpf_get_socket_cookie_sock, struct sock *, ctx)
+{
+ return __sock_gen_cookie(ctx);
+}
+
+static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = {
+ .func = bpf_get_socket_cookie_sock,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk)
+{
+ return sk ? sock_gen_cookie(sk) : 0;
+}
+
+const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = {
+ .func = bpf_get_socket_ptr_cookie,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | PTR_MAYBE_NULL,
+};
+
BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
{
- return sock_gen_cookie(ctx->sk);
+ return __sock_gen_cookie(ctx->sk);
}
static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
@@ -3872,6 +5186,72 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
+static u64 __bpf_get_netns_cookie(struct sock *sk)
+{
+ const struct net *net = sk ? sock_net(sk) : &init_net;
+
+ return net->net_cookie;
+}
+
+BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb)
+{
+ return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_proto = {
+ .func = bpf_get_netns_cookie,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_proto = {
+ .func = bpf_get_netns_cookie_sock,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sock_addr, struct bpf_sock_addr_kern *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_addr_proto = {
+ .func = bpf_get_netns_cookie_sock_addr,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sock_ops_proto = {
+ .func = bpf_get_netns_cookie_sock_ops,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
+BPF_CALL_1(bpf_get_netns_cookie_sk_msg, struct sk_msg *, ctx)
+{
+ return __bpf_get_netns_cookie(ctx ? ctx->sk : NULL);
+}
+
+static const struct bpf_func_proto bpf_get_netns_cookie_sk_msg_proto = {
+ .func = bpf_get_netns_cookie_sk_msg,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX_OR_NULL,
+};
+
BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb)
{
struct sock *sk = sk_to_full_sk(skb->sk);
@@ -3890,213 +5270,673 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = {
.arg1_type = ARG_PTR_TO_CTX,
};
-BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
- int, level, int, optname, char *, optval, int, optlen)
+static int sk_bpf_set_get_cb_flags(struct sock *sk, char *optval, bool getopt)
{
- struct sock *sk = bpf_sock->sk;
- int ret = 0;
- int val;
+ u32 sk_bpf_cb_flags;
- if (!sk_fullsock(sk))
+ if (getopt) {
+ *(u32 *)optval = sk->sk_bpf_cb_flags;
+ return 0;
+ }
+
+ sk_bpf_cb_flags = *(u32 *)optval;
+
+ if (sk_bpf_cb_flags & ~SK_BPF_CB_MASK)
return -EINVAL;
- if (level == SOL_SOCKET) {
- if (optlen != sizeof(int))
+ sk->sk_bpf_cb_flags = sk_bpf_cb_flags;
+
+ return 0;
+}
+
+static int sol_socket_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
+{
+ switch (optname) {
+ case SO_REUSEADDR:
+ case SO_SNDBUF:
+ case SO_RCVBUF:
+ case SO_KEEPALIVE:
+ case SO_PRIORITY:
+ case SO_REUSEPORT:
+ case SO_RCVLOWAT:
+ case SO_MARK:
+ case SO_MAX_PACING_RATE:
+ case SO_BINDTOIFINDEX:
+ case SO_TXREHASH:
+ case SK_BPF_CB_FLAGS:
+ if (*optlen != sizeof(int))
return -EINVAL;
- val = *((int *)optval);
+ break;
+ case SO_BINDTODEVICE:
+ break;
+ default:
+ return -EINVAL;
+ }
- /* Only some socketops are supported */
- switch (optname) {
- case SO_RCVBUF:
- sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
- sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
- break;
- case SO_SNDBUF:
- sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
- break;
- case SO_MAX_PACING_RATE:
- sk->sk_max_pacing_rate = val;
- sk->sk_pacing_rate = min(sk->sk_pacing_rate,
- sk->sk_max_pacing_rate);
- break;
- case SO_PRIORITY:
- sk->sk_priority = val;
- break;
- case SO_RCVLOWAT:
- if (val < 0)
- val = INT_MAX;
- sk->sk_rcvlowat = val ? : 1;
- break;
- case SO_MARK:
- sk->sk_mark = val;
- break;
- default:
- ret = -EINVAL;
- }
-#ifdef CONFIG_INET
- } else if (level == SOL_IP) {
- if (optlen != sizeof(int) || sk->sk_family != AF_INET)
+ if (optname == SK_BPF_CB_FLAGS)
+ return sk_bpf_set_get_cb_flags(sk, optval, getopt);
+
+ if (getopt) {
+ if (optname == SO_BINDTODEVICE)
return -EINVAL;
+ return sk_getsockopt(sk, SOL_SOCKET, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+ }
- val = *((int *)optval);
- /* Only some options are supported */
- switch (optname) {
- case IP_TOS:
- if (val < -1 || val > 0xff) {
- ret = -EINVAL;
- } else {
- struct inet_sock *inet = inet_sk(sk);
+ return sk_setsockopt(sk, SOL_SOCKET, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
+}
- if (val == -1)
- val = 0;
- inet->tos = val;
- }
- break;
- default:
- ret = -EINVAL;
- }
-#if IS_ENABLED(CONFIG_IPV6)
- } else if (level == SOL_IPV6) {
- if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
+static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname,
+ char *optval, int optlen)
+{
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ switch (optname) {
+ case TCP_BPF_SOCK_OPS_CB_FLAGS: {
+ int cb_flags = tcp_sk(sk)->bpf_sock_ops_cb_flags;
+
+ memcpy(optval, &cb_flags, optlen);
+ break;
+ }
+ case TCP_BPF_RTO_MIN: {
+ int rto_min_us = jiffies_to_usecs(inet_csk(sk)->icsk_rto_min);
+
+ memcpy(optval, &rto_min_us, optlen);
+ break;
+ }
+ case TCP_BPF_DELACK_MAX: {
+ int delack_max_us = jiffies_to_usecs(inet_csk(sk)->icsk_delack_max);
+
+ memcpy(optval, &delack_max_us, optlen);
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
+ char *optval, int optlen)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned long timeout;
+ int val;
+
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ val = *(int *)optval;
+
+ /* Only some options are supported */
+ switch (optname) {
+ case TCP_BPF_IW:
+ if (val <= 0 || tp->data_segs_out > tp->syn_data)
+ return -EINVAL;
+ tcp_snd_cwnd_set(tp, val);
+ break;
+ case TCP_BPF_SNDCWND_CLAMP:
+ if (val <= 0)
+ return -EINVAL;
+ tp->snd_cwnd_clamp = val;
+ tp->snd_ssthresh = val;
+ break;
+ case TCP_BPF_DELACK_MAX:
+ timeout = usecs_to_jiffies(val);
+ if (timeout > TCP_DELACK_MAX ||
+ timeout < TCP_TIMEOUT_MIN)
return -EINVAL;
+ inet_csk(sk)->icsk_delack_max = timeout;
+ break;
+ case TCP_BPF_RTO_MIN:
+ timeout = usecs_to_jiffies(val);
+ if (timeout > TCP_RTO_MIN ||
+ timeout < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ inet_csk(sk)->icsk_rto_min = timeout;
+ break;
+ case TCP_BPF_SOCK_OPS_CB_FLAGS:
+ if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS))
+ return -EINVAL;
+ tp->bpf_sock_ops_cb_flags = val;
+ break;
+ default:
+ return -EINVAL;
+ }
- val = *((int *)optval);
- /* Only some options are supported */
- switch (optname) {
- case IPV6_TCLASS:
- if (val < -1 || val > 0xff) {
- ret = -EINVAL;
- } else {
- struct ipv6_pinfo *np = inet6_sk(sk);
+ return 0;
+}
- if (val == -1)
- val = 0;
- np->tclass = val;
- }
- break;
- default:
- ret = -EINVAL;
- }
-#endif
- } else if (level == SOL_TCP &&
- sk->sk_prot->setsockopt == tcp_setsockopt) {
- if (optname == TCP_CONGESTION) {
- char name[TCP_CA_NAME_MAX];
- bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN;
-
- strncpy(name, optval, min_t(long, optlen,
- TCP_CA_NAME_MAX-1));
- name[TCP_CA_NAME_MAX-1] = 0;
- ret = tcp_set_congestion_control(sk, name, false,
- reinit);
- } else {
+static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval,
+ int *optlen, bool getopt)
+{
+ struct tcp_sock *tp;
+ int ret;
+
+ if (*optlen < 2)
+ return -EINVAL;
+
+ if (getopt) {
+ if (!inet_csk(sk)->icsk_ca_ops)
+ return -EINVAL;
+ /* BPF expects a NUL-terminated tcp-cc string */
+ optval[--(*optlen)] = '\0';
+ return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+ }
+
+ /* "cdg" is the only cc that allocates a ptr
+ * in the inet_csk_ca area. The bpf-tcp-cc may
+ * overwrite this ptr after switching to cdg.
+ */
+ if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen))
+ return -ENOTSUPP;
+
+ /* This stops the following loop:
+ *
+ * .init => bpf_setsockopt(tcp_cc) => .init =>
+ * bpf_setsockopt(tcp_cc) => .init => ....
+ *
+ * The second bpf_setsockopt(tcp_cc) is not allowed
+ * in order to break the loop when both .init
+ * are the same bpf prog.
+ *
+ * This applies even if the second bpf_setsockopt(tcp_cc)
+ * would not cause a loop. As a result, only the first
+ * '.init' can call bpf_setsockopt(TCP_CONGESTION) to
+ * pick a fallback cc (e.g. peer does not support ECN),
+ * and the second '.init' cannot fall back to
+ * another.
+ */
+ tp = tcp_sk(sk);
+ if (tp->bpf_chg_cc_inprogress)
+ return -EBUSY;
+
+ tp->bpf_chg_cc_inprogress = 1;
+ ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION,
+ KERNEL_SOCKPTR(optval), *optlen);
+ tp->bpf_chg_cc_inprogress = 0;
+ return ret;
+}
+
+static int sol_tcp_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
+{
+ if (sk->sk_protocol != IPPROTO_TCP)
+ return -EINVAL;
+
+ switch (optname) {
+ case TCP_NODELAY:
+ case TCP_MAXSEG:
+ case TCP_KEEPIDLE:
+ case TCP_KEEPINTVL:
+ case TCP_KEEPCNT:
+ case TCP_SYNCNT:
+ case TCP_WINDOW_CLAMP:
+ case TCP_THIN_LINEAR_TIMEOUTS:
+ case TCP_USER_TIMEOUT:
+ case TCP_NOTSENT_LOWAT:
+ case TCP_SAVE_SYN:
+ case TCP_RTO_MAX_MS:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ case TCP_CONGESTION:
+ return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt);
+ case TCP_SAVED_SYN:
+ if (*optlen < 1)
+ return -EINVAL;
+ break;
+ default:
+ if (getopt)
+ return bpf_sol_tcp_getsockopt(sk, optname, optval, *optlen);
+ return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen);
+ }
+
+ if (getopt) {
+ if (optname == TCP_SAVED_SYN) {
struct tcp_sock *tp = tcp_sk(sk);
- if (optlen != sizeof(int))
+ if (!tp->saved_syn ||
+ *optlen > tcp_saved_syn_len(tp->saved_syn))
return -EINVAL;
-
- val = *((int *)optval);
- /* Only some options are supported */
- switch (optname) {
- case TCP_BPF_IW:
- if (val <= 0 || tp->data_segs_out > 0)
- ret = -EINVAL;
- else
- tp->snd_cwnd = val;
- break;
- case TCP_BPF_SNDCWND_CLAMP:
- if (val <= 0) {
- ret = -EINVAL;
- } else {
- tp->snd_cwnd_clamp = val;
- tp->snd_ssthresh = val;
- }
- break;
- default:
- ret = -EINVAL;
- }
+ memcpy(optval, tp->saved_syn->data, *optlen);
+ /* It cannot free tp->saved_syn here because it
+ * does not know if the user space still needs it.
+ */
+ return 0;
}
-#endif
- } else {
- ret = -EINVAL;
+
+ return do_tcp_getsockopt(sk, SOL_TCP, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
}
- return ret;
+
+ return do_tcp_setsockopt(sk, SOL_TCP, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
+}
+
+static int sol_ip_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
+{
+ if (sk->sk_family != AF_INET)
+ return -EINVAL;
+
+ switch (optname) {
+ case IP_TOS:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (getopt)
+ return do_ip_getsockopt(sk, SOL_IP, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+
+ return do_ip_setsockopt(sk, SOL_IP, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
+}
+
+static int sol_ipv6_sockopt(struct sock *sk, int optname,
+ char *optval, int *optlen,
+ bool getopt)
+{
+ if (sk->sk_family != AF_INET6)
+ return -EINVAL;
+
+ switch (optname) {
+ case IPV6_TCLASS:
+ case IPV6_AUTOFLOWLABEL:
+ if (*optlen != sizeof(int))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (getopt)
+ return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname,
+ KERNEL_SOCKPTR(optval),
+ KERNEL_SOCKPTR(optlen));
+
+ return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname,
+ KERNEL_SOCKPTR(optval), *optlen);
+}
+
+static int __bpf_setsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ if (!sk_fullsock(sk))
+ return -EINVAL;
+
+ if (level == SOL_SOCKET)
+ return sol_socket_sockopt(sk, optname, optval, &optlen, false);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
+ return sol_ip_sockopt(sk, optname, optval, &optlen, false);
+ else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
+ return sol_ipv6_sockopt(sk, optname, optval, &optlen, false);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
+ return sol_tcp_sockopt(sk, optname, optval, &optlen, false);
+
+ return -EINVAL;
+}
+
+static bool is_locked_tcp_sock_ops(struct bpf_sock_ops_kern *bpf_sock)
+{
+ return bpf_sock->op <= BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
+}
+
+static int _bpf_setsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ if (sk_fullsock(sk))
+ sock_owned_by_me(sk);
+ return __bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+static int __bpf_getsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ int err, saved_optlen = optlen;
+
+ if (!sk_fullsock(sk)) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (level == SOL_SOCKET)
+ err = sol_socket_sockopt(sk, optname, optval, &optlen, true);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP)
+ err = sol_tcp_sockopt(sk, optname, optval, &optlen, true);
+ else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP)
+ err = sol_ip_sockopt(sk, optname, optval, &optlen, true);
+ else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6)
+ err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true);
+ else
+ err = -EINVAL;
+
+done:
+ if (err)
+ optlen = 0;
+ if (optlen < saved_optlen)
+ memset(optval + optlen, 0, saved_optlen - optlen);
+ return err;
+}
+
+static int _bpf_getsockopt(struct sock *sk, int level, int optname,
+ char *optval, int optlen)
+{
+ if (sk_fullsock(sk))
+ sock_owned_by_me(sk);
+ return __bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
+BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ return _bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_sk_setsockopt_proto = {
+ .func = bpf_sk_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sk_getsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ return _bpf_getsockopt(sk, level, optname, optval, optlen);
}
-static const struct bpf_func_proto bpf_setsockopt_proto = {
- .func = bpf_setsockopt,
+const struct bpf_func_proto bpf_sk_getsockopt_proto = {
+ .func = bpf_sk_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ return __bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto = {
+ .func = bpf_unlocked_sk_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_unlocked_sk_getsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ return __bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto = {
+ .func = bpf_unlocked_sk_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
+ .func = bpf_sock_addr_setsockopt,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
- .arg4_type = ARG_PTR_TO_MEM,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE,
};
-BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx,
int, level, int, optname, char *, optval, int, optlen)
{
- struct sock *sk = bpf_sock->sk;
+ return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen);
+}
- if (!sk_fullsock(sk))
- goto err_clear;
+static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = {
+ .func = bpf_sock_addr_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
-#ifdef CONFIG_INET
- if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) {
- if (optname == TCP_CONGESTION) {
- struct inet_connection_sock *icsk = inet_csk(sk);
+static int sk_bpf_set_get_bypass_prot_mem(struct sock *sk,
+ char *optval, int optlen,
+ bool getopt)
+{
+ int val;
- if (!icsk->icsk_ca_ops || optlen <= 1)
- goto err_clear;
- strncpy(optval, icsk->icsk_ca_ops->name, optlen);
- optval[optlen - 1] = 0;
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ if (!sk_has_account(sk))
+ return -EOPNOTSUPP;
+
+ if (getopt) {
+ *(int *)optval = sk->sk_bypass_prot_mem;
+ return 0;
+ }
+
+ val = *(int *)optval;
+ if (val < 0 || val > 1)
+ return -EINVAL;
+
+ sk->sk_bypass_prot_mem = val;
+ return 0;
+}
+
+/* setsockopt helper operating directly on a struct sock. The
+ * SOL_SOCKET/SK_BPF_BYPASS_PROT_MEM option is handled locally by
+ * sk_bpf_set_get_bypass_prot_mem(); every other level/optname pair is
+ * passed on to __bpf_setsockopt().
+ */
+BPF_CALL_5(bpf_sock_create_setsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM)
+ return sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, false);
+
+ return __bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+/* Helper prototype for bpf_sock_create_setsockopt(): context pointer,
+ * two ARG_ANYTHING values (level, optname) and a MEM_RDONLY optval
+ * buffer with its size.
+ */
+static const struct bpf_func_proto bpf_sock_create_setsockopt_proto = {
+ .func = bpf_sock_create_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+/* getsockopt counterpart of bpf_sock_create_setsockopt().
+ * SOL_SOCKET/SK_BPF_BYPASS_PROT_MEM is served by
+ * sk_bpf_set_get_bypass_prot_mem(); all other options go to
+ * __bpf_getsockopt().
+ */
+BPF_CALL_5(bpf_sock_create_getsockopt, struct sock *, sk, int, level,
+ int, optname, char *, optval, int, optlen)
+{
+ if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) {
+ int err = sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, true);
+
+ /* On failure the whole output buffer is zeroed before returning */
+ if (err)
+ memset(optval, 0, optlen);
+
+ return err;
+ }
+
+ return __bpf_getsockopt(sk, level, optname, optval, optlen);
+}
+
+/* Helper prototype for bpf_sock_create_getsockopt(); the optval buffer
+ * (arg4) is ARG_PTR_TO_UNINIT_MEM, sized by arg5.
+ */
+static const struct bpf_func_proto bpf_sock_create_getsockopt_proto = {
+ .func = bpf_sock_create_getsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_UNINIT_MEM,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+/* setsockopt helper taking a struct bpf_sock_ops_kern context. Returns
+ * -EOPNOTSUPP unless is_locked_tcp_sock_ops() accepts the context;
+ * otherwise forwards to _bpf_setsockopt() on bpf_sock->sk.
+ */
+BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
+ return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen);
+}
+
+/* Helper prototype for bpf_sock_ops_setsockopt(): context pointer,
+ * two ARG_ANYTHING values (level, optname) and a MEM_RDONLY optval
+ * buffer with its size.
+ */
+static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
+ .func = bpf_sock_ops_setsockopt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+/* Locate the SYN headers requested by @optname (TCP_BPF_SYN,
+ * TCP_BPF_SYN_IP or TCP_BPF_SYN_MAC; any other value is treated as
+ * TCP_BPF_SYN_MAC by the final else branches).
+ *
+ * The headers are taken from bpf_sock->syn_skb when it is set, otherwise
+ * from the saved_syn of the request sock (sk_state == TCP_NEW_SYN_RECV)
+ * or of the tcp sock.
+ *
+ * On success *@start points at the first requested header and the return
+ * value is the total length of the requested headers. Returns -ENOENT
+ * when there is no saved_syn, or when the mac header is requested but
+ * saved_syn->mac_hdrlen is 0; *@start is left untouched in that case.
+ */
+static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock,
+ int optname, const u8 **start)
+{
+ struct sk_buff *syn_skb = bpf_sock->syn_skb;
+ const u8 *hdr_start;
+ int ret;
+
+ if (syn_skb) {
+ /* sk is a request_sock here */
+
+ if (optname == TCP_BPF_SYN) {
+ hdr_start = syn_skb->data;
+ ret = tcp_hdrlen(syn_skb);
+ } else if (optname == TCP_BPF_SYN_IP) {
+ hdr_start = skb_network_header(syn_skb);
+ ret = skb_network_header_len(syn_skb) +
+ tcp_hdrlen(syn_skb);
} else {
- goto err_clear;
+ /* optname == TCP_BPF_SYN_MAC */
+ hdr_start = skb_mac_header(syn_skb);
+ ret = skb_mac_header_len(syn_skb) +
+ skb_network_header_len(syn_skb) +
+ tcp_hdrlen(syn_skb);
}
- } else if (level == SOL_IP) {
- struct inet_sock *inet = inet_sk(sk);
+ } else {
+ struct sock *sk = bpf_sock->sk;
+ struct saved_syn *saved_syn;
- if (optlen != sizeof(int) || sk->sk_family != AF_INET)
- goto err_clear;
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ /* synack retransmit. bpf_sock->syn_skb will
+ * not be available. It has to resort to
+ * saved_syn (if it is saved).
+ */
+ saved_syn = inet_reqsk(sk)->saved_syn;
+ else
+ saved_syn = tcp_sk(sk)->saved_syn;
- /* Only some options are supported */
- switch (optname) {
- case IP_TOS:
- *((int *)optval) = (int)inet->tos;
- break;
- default:
- goto err_clear;
+ if (!saved_syn)
+ return -ENOENT;
+
+ if (optname == TCP_BPF_SYN) {
+ hdr_start = saved_syn->data +
+ saved_syn->mac_hdrlen +
+ saved_syn->network_hdrlen;
+ ret = saved_syn->tcp_hdrlen;
+ } else if (optname == TCP_BPF_SYN_IP) {
+ hdr_start = saved_syn->data +
+ saved_syn->mac_hdrlen;
+ ret = saved_syn->network_hdrlen +
+ saved_syn->tcp_hdrlen;
+ } else {
+ /* optname == TCP_BPF_SYN_MAC */
+
+ /* TCP_SAVE_SYN may not have saved the mac hdr */
+ if (!saved_syn->mac_hdrlen)
+ return -ENOENT;
+
+ hdr_start = saved_syn->data;
+ ret = saved_syn->mac_hdrlen +
+ saved_syn->network_hdrlen +
+ saved_syn->tcp_hdrlen;
}
-#if IS_ENABLED(CONFIG_IPV6)
- } else if (level == SOL_IPV6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
+ }
- if (optlen != sizeof(int) || sk->sk_family != AF_INET6)
- goto err_clear;
+ *start = hdr_start;
+ return ret;
+}
- /* Only some options are supported */
- switch (optname) {
- case IPV6_TCLASS:
- *((int *)optval) = (int)np->tclass;
- break;
- default:
- goto err_clear;
+/* getsockopt helper taking a struct bpf_sock_ops_kern context.
+ *
+ * Returns -EOPNOTSUPP unless is_locked_tcp_sock_ops() accepts the context.
+ * For SOL_TCP with optname in [TCP_BPF_SYN, TCP_BPF_SYN_MAC] (and
+ * CONFIG_INET enabled) the SYN headers found by bpf_sock_ops_get_syn()
+ * are copied into optval: the return value is the header length, or
+ * -ENOSPC when optlen is too small (optlen bytes are still copied), or
+ * the error from bpf_sock_ops_get_syn(). The unused tail of optval is
+ * zeroed in every case. All other options go to _bpf_getsockopt().
+ */
+BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
+ int, level, int, optname, char *, optval, int, optlen)
+{
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
+ if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP &&
+ optname >= TCP_BPF_SYN && optname <= TCP_BPF_SYN_MAC) {
+ int ret, copy_len = 0;
+ const u8 *start;
+
+ ret = bpf_sock_ops_get_syn(bpf_sock, optname, &start);
+ if (ret > 0) {
+ copy_len = ret;
+ if (optlen < copy_len) {
+ copy_len = optlen;
+ ret = -ENOSPC;
+ }
+
+ memcpy(optval, start, copy_len);
}
-#endif
- } else {
- goto err_clear;
+
+ /* Zero out unused buffer at the end */
+ memset(optval + copy_len, 0, optlen - copy_len);
+
+ return ret;
}
- return 0;
-#endif
-err_clear:
- memset(optval, 0, optlen);
- return -EINVAL;
+
+ return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen);
}
-static const struct bpf_func_proto bpf_getsockopt_proto = {
- .func = bpf_getsockopt,
+static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = {
+ .func = bpf_sock_ops_getsockopt,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
@@ -4112,11 +5952,13 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
struct sock *sk = bpf_sock->sk;
int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
return -EINVAL;
- if (val)
- tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+ tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
}
@@ -4137,28 +5979,29 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr,
{
#ifdef CONFIG_INET
struct sock *sk = ctx->sk;
+ u32 flags = BIND_FROM_BPF;
int err;
- /* Binding to port can be expensive so it's prohibited in the helper.
- * Only binding to IP is supported.
- */
err = -EINVAL;
+ if (addr_len < offsetofend(struct sockaddr, sa_family))
+ return err;
if (addr->sa_family == AF_INET) {
if (addr_len < sizeof(struct sockaddr_in))
return err;
- if (((struct sockaddr_in *)addr)->sin_port != htons(0))
- return err;
- return __inet_bind(sk, addr, addr_len, true, false);
+ if (((struct sockaddr_in *)addr)->sin_port == htons(0))
+ flags |= BIND_FORCE_ADDRESS_NO_PORT;
+ return __inet_bind(sk, (struct sockaddr_unsized *)addr, addr_len, flags);
#if IS_ENABLED(CONFIG_IPV6)
} else if (addr->sa_family == AF_INET6) {
if (addr_len < SIN6_LEN_RFC2133)
return err;
- if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
- return err;
+ if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0))
+ flags |= BIND_FORCE_ADDRESS_NO_PORT;
/* ipv6_bpf_stub cannot be NULL, since it's called from
* bpf_cgroup_inet6_connect hook and ipv6 is already loaded
*/
- return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false);
+ return ipv6_bpf_stub->inet6_bind(sk, (struct sockaddr_unsized *)addr,
+ addr_len, flags);
#endif /* CONFIG_IPV6 */
}
#endif /* CONFIG_INET */
@@ -4171,11 +6014,20 @@ static const struct bpf_func_proto bpf_bind_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
- .arg2_type = ARG_PTR_TO_MEM,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
#ifdef CONFIG_XFRM
+
+#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \
+ (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES))
+
+struct metadata_dst __percpu *xfrm_bpf_md_dst;
+EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst);
+
+#endif
+
BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index,
struct bpf_xfrm_state *, to, u32, size, u64, flags)
{
@@ -4222,15 +6074,12 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
#endif
#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
-static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
- const struct neighbour *neigh,
- const struct net_device *dev)
+/* Finish a successful FIB lookup: clear the VLAN fields of @params and,
+ * when @mtu is non-zero, store it in params->mtu_result. Always returns 0.
+ */
+static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, u32 mtu)
{
- memcpy(params->dmac, neigh->ha, ETH_ALEN);
- memcpy(params->smac, dev->dev_addr, ETH_ALEN);
params->h_vlan_TCI = 0;
params->h_vlan_proto = 0;
- params->ifindex = dev->ifindex;
+ if (mtu)
+ params->mtu_result = mtu; /* union with tot_len */
return 0;
}
@@ -4240,14 +6089,14 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
u32 flags, bool check_mtu)
{
+ struct fib_nh_common *nhc;
struct in_device *in_dev;
struct neighbour *neigh;
struct net_device *dev;
struct fib_result res;
- struct fib_nh *nh;
struct flowi4 fl4;
+ u32 mtu = 0;
int err;
- u32 mtu;
dev = dev_get_by_index_rcu(net, params->ifindex);
if (unlikely(!dev))
@@ -4265,7 +6114,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
fl4.flowi4_iif = params->ifindex;
fl4.flowi4_oif = 0;
}
- fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
+ fl4.flowi4_dscp = inet_dsfield_to_dscp(params->tos);
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_flags = 0;
@@ -4274,18 +6123,28 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
fl4.saddr = params->ipv4_src;
fl4.fl4_sport = params->sport;
fl4.fl4_dport = params->dport;
+ fl4.flowi4_multipath_hash = 0;
if (flags & BPF_FIB_LOOKUP_DIRECT) {
u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
struct fib_table *tb;
+ if (flags & BPF_FIB_LOOKUP_TBID) {
+ tbid = params->tbid;
+ /* zero out for vlan output */
+ params->tbid = 0;
+ }
+
tb = fib_get_table(net, tbid);
if (unlikely(!tb))
return BPF_FIB_LKUP_RET_NOT_FWDED;
err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
} else {
- fl4.flowi4_mark = 0;
+ if (flags & BPF_FIB_LOOKUP_MARK)
+ fl4.flowi4_mark = params->mark;
+ else
+ fl4.flowi4_mark = 0;
fl4.flowi4_secid = 0;
fl4.flowi4_tun_key.tun_id = 0;
fl4.flowi4_uid = sock_net_uid(net, NULL);
@@ -4308,35 +6167,60 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (res.type != RTN_UNICAST)
return BPF_FIB_LKUP_RET_NOT_FWDED;
- if (res.fi->fib_nhs > 1)
+ if (fib_info_num_path(res.fi) > 1)
fib_select_path(net, &res, &fl4, NULL);
if (check_mtu) {
mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst);
- if (params->tot_len > mtu)
+ if (params->tot_len > mtu) {
+ params->mtu_result = mtu; /* union with tot_len */
return BPF_FIB_LKUP_RET_FRAG_NEEDED;
+ }
}
- nh = &res.fi->fib_nh[res.nh_sel];
+ nhc = res.nhc;
/* do not handle lwt encaps right now */
- if (nh->nh_lwtstate)
+ if (nhc->nhc_lwtstate)
return BPF_FIB_LKUP_RET_UNSUPP_LWT;
- dev = nh->nh_dev;
- if (nh->nh_gw)
- params->ipv4_dst = nh->nh_gw;
+ dev = nhc->nhc_dev;
params->rt_metric = res.fi->fib_priority;
+ params->ifindex = dev->ifindex;
+
+ if (flags & BPF_FIB_LOOKUP_SRC)
+ params->ipv4_src = fib_result_prefsrc(net, &res);
/* xdp and cls_bpf programs are run in RCU-bh so
* rcu_read_lock_bh is not needed here
*/
- neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
- if (!neigh)
+ if (likely(nhc->nhc_gw_family != AF_INET6)) {
+ if (nhc->nhc_gw_family)
+ params->ipv4_dst = nhc->nhc_gw.ipv4;
+ } else {
+ struct in6_addr *dst = (struct in6_addr *)params->ipv6_dst;
+
+ params->family = AF_INET6;
+ *dst = nhc->nhc_gw.ipv6;
+ }
+
+ if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
+ goto set_fwd_params;
+
+ if (likely(nhc->nhc_gw_family != AF_INET6))
+ neigh = __ipv4_neigh_lookup_noref(dev,
+ (__force u32)params->ipv4_dst);
+ else
+ neigh = __ipv6_neigh_lookup_noref_stub(dev, params->ipv6_dst);
+
+ if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
return BPF_FIB_LKUP_RET_NO_NEIGH;
+ memcpy(params->dmac, neigh->ha, ETH_ALEN);
+ memcpy(params->smac, dev->dev_addr, ETH_ALEN);
- return bpf_fib_set_fwd_params(params, neigh, dev);
+set_fwd_params:
+ return bpf_fib_set_fwd_params(params, mtu);
}
#endif
@@ -4346,14 +6230,14 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
{
struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
+ struct fib6_result res = {};
struct neighbour *neigh;
struct net_device *dev;
struct inet6_dev *idev;
- struct fib6_info *f6i;
struct flowi6 fl6;
int strict = 0;
- int oif;
- u32 mtu;
+ int oif, err;
+ u32 mtu = 0;
/* link local addresses are never forwarded */
if (rt6_need_strict(dst) || rt6_need_strict(src))
@@ -4364,7 +6248,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
return -ENODEV;
idev = __in6_dev_get_safely(dev);
- if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
+ if (unlikely(!idev || !READ_ONCE(idev->cnf.forwarding)))
return BPF_FIB_LKUP_RET_FWD_DISABLED;
if (flags & BPF_FIB_LOOKUP_OUTPUT) {
@@ -4390,79 +6274,109 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
struct fib6_table *tb;
+ if (flags & BPF_FIB_LOOKUP_TBID) {
+ tbid = params->tbid;
+ /* zero out for vlan output */
+ params->tbid = 0;
+ }
+
tb = ipv6_stub->fib6_get_table(net, tbid);
if (unlikely(!tb))
return BPF_FIB_LKUP_RET_NOT_FWDED;
- f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
+ err = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, &res,
+ strict);
} else {
- fl6.flowi6_mark = 0;
+ if (flags & BPF_FIB_LOOKUP_MARK)
+ fl6.flowi6_mark = params->mark;
+ else
+ fl6.flowi6_mark = 0;
fl6.flowi6_secid = 0;
fl6.flowi6_tun_key.tun_id = 0;
fl6.flowi6_uid = sock_net_uid(net, NULL);
- f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
+ err = ipv6_stub->fib6_lookup(net, oif, &fl6, &res, strict);
}
- if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
+ if (unlikely(err || IS_ERR_OR_NULL(res.f6i) ||
+ res.f6i == net->ipv6.fib6_null_entry))
return BPF_FIB_LKUP_RET_NOT_FWDED;
- if (unlikely(f6i->fib6_flags & RTF_REJECT)) {
- switch (f6i->fib6_type) {
- case RTN_BLACKHOLE:
- return BPF_FIB_LKUP_RET_BLACKHOLE;
- case RTN_UNREACHABLE:
- return BPF_FIB_LKUP_RET_UNREACHABLE;
- case RTN_PROHIBIT:
- return BPF_FIB_LKUP_RET_PROHIBIT;
- default:
- return BPF_FIB_LKUP_RET_NOT_FWDED;
- }
- }
-
- if (f6i->fib6_type != RTN_UNICAST)
+ switch (res.fib6_type) {
+ /* only unicast is forwarded */
+ case RTN_UNICAST:
+ break;
+ case RTN_BLACKHOLE:
+ return BPF_FIB_LKUP_RET_BLACKHOLE;
+ case RTN_UNREACHABLE:
+ return BPF_FIB_LKUP_RET_UNREACHABLE;
+ case RTN_PROHIBIT:
+ return BPF_FIB_LKUP_RET_PROHIBIT;
+ default:
return BPF_FIB_LKUP_RET_NOT_FWDED;
+ }
- if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
- f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
- fl6.flowi6_oif, NULL,
- strict);
+ ipv6_stub->fib6_select_path(net, &res, &fl6, fl6.flowi6_oif,
+ fl6.flowi6_oif != 0, NULL, strict);
if (check_mtu) {
- mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src);
- if (params->tot_len > mtu)
+ mtu = ipv6_stub->ip6_mtu_from_fib6(&res, dst, src);
+ if (params->tot_len > mtu) {
+ params->mtu_result = mtu; /* union with tot_len */
return BPF_FIB_LKUP_RET_FRAG_NEEDED;
+ }
}
- if (f6i->fib6_nh.nh_lwtstate)
+ if (res.nh->fib_nh_lws)
return BPF_FIB_LKUP_RET_UNSUPP_LWT;
- if (f6i->fib6_flags & RTF_GATEWAY)
- *dst = f6i->fib6_nh.nh_gw;
+ if (res.nh->fib_nh_gw_family)
+ *dst = res.nh->fib_nh_gw6;
- dev = f6i->fib6_nh.nh_dev;
- params->rt_metric = f6i->fib6_metric;
+ dev = res.nh->fib_nh_dev;
+ params->rt_metric = res.f6i->fib6_metric;
+ params->ifindex = dev->ifindex;
+
+ if (flags & BPF_FIB_LOOKUP_SRC) {
+ if (res.f6i->fib6_prefsrc.plen) {
+ *src = res.f6i->fib6_prefsrc.addr;
+ } else {
+ err = ipv6_bpf_stub->ipv6_dev_get_saddr(net, dev,
+ &fl6.daddr, 0,
+ src);
+ if (err)
+ return BPF_FIB_LKUP_RET_NO_SRC_ADDR;
+ }
+ }
+
+ if (flags & BPF_FIB_LOOKUP_SKIP_NEIGH)
+ goto set_fwd_params;
/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
- * not needed here. Can not use __ipv6_neigh_lookup_noref here
- * because we need to get nd_tbl via the stub
+ * not needed here.
*/
- neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
- ndisc_hashfn, dst, dev);
- if (!neigh)
+ neigh = __ipv6_neigh_lookup_noref_stub(dev, dst);
+ if (!neigh || !(READ_ONCE(neigh->nud_state) & NUD_VALID))
return BPF_FIB_LKUP_RET_NO_NEIGH;
+ memcpy(params->dmac, neigh->ha, ETH_ALEN);
+ memcpy(params->smac, dev->dev_addr, ETH_ALEN);
- return bpf_fib_set_fwd_params(params, neigh, dev);
+set_fwd_params:
+ return bpf_fib_set_fwd_params(params, mtu);
}
#endif
+/* Set of flags accepted by bpf_xdp_fib_lookup() and bpf_skb_fib_lookup();
+ * both return -EINVAL when any bit outside this mask is set.
+ */
+#define BPF_FIB_LOOKUP_MASK (BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT | \
+ BPF_FIB_LOOKUP_SKIP_NEIGH | BPF_FIB_LOOKUP_TBID | \
+ BPF_FIB_LOOKUP_SRC | BPF_FIB_LOOKUP_MARK)
+
BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
if (plen < sizeof(*params))
return -EINVAL;
- if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+ if (flags & ~BPF_FIB_LOOKUP_MASK)
return -EINVAL;
switch (params->family) {
@@ -4495,32 +6409,41 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
{
struct net *net = dev_net(skb->dev);
int rc = -EAFNOSUPPORT;
+ bool check_mtu = false;
if (plen < sizeof(*params))
return -EINVAL;
- if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
+ if (flags & ~BPF_FIB_LOOKUP_MASK)
return -EINVAL;
+ if (params->tot_len)
+ check_mtu = true;
+
switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
case AF_INET:
- rc = bpf_ipv4_fib_lookup(net, params, flags, false);
+ rc = bpf_ipv4_fib_lookup(net, params, flags, check_mtu);
break;
#endif
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- rc = bpf_ipv6_fib_lookup(net, params, flags, false);
+ rc = bpf_ipv6_fib_lookup(net, params, flags, check_mtu);
break;
#endif
}
- if (!rc) {
+ if (rc == BPF_FIB_LKUP_RET_SUCCESS && !check_mtu) {
struct net_device *dev;
+ /* When tot_len isn't provided by user, check skb
+ * against MTU of FIB lookup resulting net_device
+ */
dev = dev_get_by_index_rcu(net, params->ifindex);
if (!is_skb_forwardable(dev, skb))
rc = BPF_FIB_LKUP_RET_FRAG_NEEDED;
+
+ params->mtu_result = dev->mtu; /* union with tot_len */
}
return rc;
@@ -4536,13 +6459,126 @@ static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
.arg4_type = ARG_ANYTHING,
};
+/* Resolve the device an MTU check should run against.
+ *
+ * Returns @dev_curr itself when @ifindex is 0; otherwise the result of
+ * dev_get_by_index_rcu() for @ifindex in @dev_curr's network namespace.
+ * Both callers treat a NULL result as -ENODEV.
+ */
+static struct net_device *__dev_via_ifindex(struct net_device *dev_curr,
+ u32 ifindex)
+{
+ struct net *netns = dev_net(dev_curr);
+
+ /* Non-redirect use-cases can use ifindex=0 and save ifindex lookup */
+ if (ifindex == 0)
+ return dev_curr;
+
+ return dev_get_by_index_rcu(netns, ifindex);
+}
+
+/* Check a packet length against the MTU of a device (skb variant).
+ *
+ * @ifindex: device to check against, 0 means skb->dev
+ * @mtu_len: in: if non-zero, used as the length (plus hard_header_len)
+ * instead of skb->len; out: the device MTU
+ * @len_diff: signed adjustment added to the length before comparing
+ * @flags: 0 or BPF_MTU_CHK_SEGS
+ *
+ * Returns BPF_MTU_CHK_RET_SUCCESS, BPF_MTU_CHK_RET_FRAG_NEEDED or
+ * BPF_MTU_CHK_RET_SEGS_TOOBIG with *@mtu_len updated. Returns -EINVAL for
+ * unknown flags, for BPF_MTU_CHK_SEGS combined with a non-zero @len_diff
+ * or *@mtu_len, or when BPF_MTU_CHK_SEGS is requested for a GSO skb with
+ * no transport header set. Returns -ENODEV when the device is not found.
+ * *@mtu_len is not written on the error returns.
+ */
+BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb,
+ u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
+{
+ int ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
+ struct net_device *dev = skb->dev;
+ int mtu, dev_len, skb_len;
+
+ if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
+ return -EINVAL;
+ if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len)))
+ return -EINVAL;
+
+ dev = __dev_via_ifindex(dev, ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
+
+ mtu = READ_ONCE(dev->mtu);
+ dev_len = mtu + dev->hard_header_len;
+
+ /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
+ skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len;
+
+ skb_len += len_diff; /* minus result pass check */
+ if (skb_len <= dev_len) {
+ ret = BPF_MTU_CHK_RET_SUCCESS;
+ goto out;
+ }
+ /* At this point, skb->len exceeds the MTU, but as it includes the
+ * length of all segments, each segment can still be below the MTU.
+ * The SKB can possibly get re-segmented in the transmit path (see
+ * validate_xmit_skb). Thus, the user must choose whether segments
+ * are to be MTU checked.
+ */
+ if (skb_is_gso(skb)) {
+ ret = BPF_MTU_CHK_RET_SUCCESS;
+ if (flags & BPF_MTU_CHK_SEGS) {
+ if (!skb_transport_header_was_set(skb))
+ return -EINVAL;
+ if (!skb_gso_validate_network_len(skb, mtu))
+ ret = BPF_MTU_CHK_RET_SEGS_TOOBIG;
+ }
+ }
+out:
+ *mtu_len = mtu;
+ return ret;
+}
+
+/* Check a packet length against the MTU of a device (XDP variant).
+ *
+ * @ifindex: device to check against, 0 means xdp->rxq->dev
+ * @mtu_len: in: if non-zero, used as the length (plus hard_header_len)
+ * instead of data_end - data; out: the device MTU
+ * @len_diff: signed adjustment added to the length before comparing
+ * @flags: must be 0
+ *
+ * Returns BPF_MTU_CHK_RET_SUCCESS or BPF_MTU_CHK_RET_FRAG_NEEDED with
+ * *@mtu_len updated, -EINVAL when @flags is non-zero, or -ENODEV when
+ * the device is not found.
+ */
+BPF_CALL_5(bpf_xdp_check_mtu, struct xdp_buff *, xdp,
+ u32, ifindex, u32 *, mtu_len, s32, len_diff, u64, flags)
+{
+ struct net_device *dev = xdp->rxq->dev;
+ int xdp_len = xdp->data_end - xdp->data;
+ int ret = BPF_MTU_CHK_RET_SUCCESS;
+ int mtu, dev_len;
+
+ /* XDP variant doesn't support multi-buffer segment check (yet) */
+ if (unlikely(flags))
+ return -EINVAL;
+
+ dev = __dev_via_ifindex(dev, ifindex);
+ if (unlikely(!dev))
+ return -ENODEV;
+
+ mtu = READ_ONCE(dev->mtu);
+ dev_len = mtu + dev->hard_header_len;
+
+ /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
+ if (*mtu_len)
+ xdp_len = *mtu_len + dev->hard_header_len;
+
+ xdp_len += len_diff; /* minus result pass check */
+ if (xdp_len > dev_len)
+ ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
+
+ *mtu_len = mtu;
+ return ret;
+}
+
+/* Helper prototype for bpf_skb_check_mtu() (gpl_only). arg3 (mtu_len) is
+ * a fixed-size, aligned, writable memory argument of sizeof(u32) bytes;
+ * the helper both reads and writes it.
+ */
+static const struct bpf_func_proto bpf_skb_check_mtu_proto = {
+ .func = bpf_skb_check_mtu,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
+ .arg3_size = sizeof(u32),
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+/* Helper prototype for bpf_xdp_check_mtu() (gpl_only); same argument
+ * layout as bpf_skb_check_mtu_proto, with arg3 (mtu_len) a writable,
+ * aligned sizeof(u32) memory argument.
+ */
+static const struct bpf_func_proto bpf_xdp_check_mtu_proto = {
+ .func = bpf_xdp_check_mtu,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_WRITE | MEM_ALIGNED,
+ .arg3_size = sizeof(u32),
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len)
{
int err;
struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)hdr;
- if (!seg6_validate_srh(srh, len))
+ if (!seg6_validate_srh(srh, len, false))
return -EINVAL;
switch (type) {
@@ -4565,14 +6601,21 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
if (err)
return err;
- ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
skb_set_transport_header(skb, sizeof(struct ipv6hdr));
return seg6_lookup_nexthop(skb, NULL, 0);
}
#endif /* CONFIG_IPV6_SEG6_BPF */
-BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+/* Thin wrapper: passes its arguments unchanged to bpf_lwt_push_ip_encap()
+ * and returns its result.
+ */
+static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
+ bool ingress)
+{
+ return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
+}
+#endif
+
+BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
u32, len)
{
switch (type) {
@@ -4581,18 +6624,45 @@ BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
case BPF_LWT_ENCAP_SEG6_INLINE:
return bpf_push_seg6_encap(skb, type, hdr, len);
#endif
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+ case BPF_LWT_ENCAP_IP:
+ return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
+#endif
default:
return -EINVAL;
}
}
-static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
- .func = bpf_lwt_push_encap,
+/* Push-encap helper for the xmit side. Only BPF_LWT_ENCAP_IP is handled
+ * (when CONFIG_LWTUNNEL_BPF is enabled), with ingress == false; every
+ * other @type returns -EINVAL.
+ */
+BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
+ void *, hdr, u32, len)
+{
+ switch (type) {
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+ case BPF_LWT_ENCAP_IP:
+ return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
+#endif
+ default:
+ return -EINVAL;
+ }
+}
+
+/* Helper prototype for bpf_lwt_in_push_encap(): context pointer, an
+ * ARG_ANYTHING encap type, and a MEM_RDONLY header buffer with its size.
+ */
+static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
+ .func = bpf_lwt_in_push_encap,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg4_type = ARG_CONST_SIZE
+};
+
+/* Helper prototype for bpf_lwt_xmit_push_encap(): context pointer, an
+ * ARG_ANYTHING encap type, and a MEM_RDONLY header buffer with its size.
+ */
+static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
+ .func = bpf_lwt_xmit_push_encap,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE
};
@@ -4606,6 +6676,7 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
void *srh_tlvs, *srh_end, *ptr;
int srhoff = 0;
+ lockdep_assert_held(&srh_state->bh_lock);
if (srh == NULL)
return -EINVAL;
@@ -4635,7 +6706,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE
};
@@ -4662,6 +6733,7 @@ BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
int hdroff = 0;
int err;
+ lockdep_assert_held(&srh_state->bh_lock);
switch (action) {
case SEG6_LOCAL_ACTION_END_X:
if (!seg6_bpf_has_valid_srh(skb))
@@ -4723,7 +6795,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
- .arg3_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg4_type = ARG_CONST_SIZE
};
@@ -4738,6 +6810,7 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
int srhoff = 0;
int ret;
+ lockdep_assert_held(&srh_state->bh_lock);
if (unlikely(srh == NULL))
return -EINVAL;
@@ -4787,91 +6860,1324 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
};
#endif /* CONFIG_IPV6_SEG6_BPF */
-bool bpf_helper_changes_pkt_data(void *func)
-{
- if (func == bpf_skb_vlan_push ||
- func == bpf_skb_vlan_pop ||
- func == bpf_skb_store_bytes ||
- func == bpf_skb_change_proto ||
- func == bpf_skb_change_head ||
- func == sk_skb_change_head ||
- func == bpf_skb_change_tail ||
- func == sk_skb_change_tail ||
- func == bpf_skb_adjust_room ||
- func == bpf_skb_pull_data ||
- func == sk_skb_pull_data ||
- func == bpf_clone_redirect ||
- func == bpf_l3_csum_replace ||
- func == bpf_l4_csum_replace ||
- func == bpf_xdp_adjust_head ||
- func == bpf_xdp_adjust_meta ||
- func == bpf_msg_pull_data ||
- func == bpf_xdp_adjust_tail ||
-#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
- func == bpf_lwt_seg6_store_bytes ||
- func == bpf_lwt_seg6_adjust_srh ||
- func == bpf_lwt_seg6_action ||
+#ifdef CONFIG_INET
+/* Look up a socket in @net matching @tuple.
+ *
+ * @family == AF_INET uses the ipv4 half of @tuple; any other value takes
+ * the ipv6 branch (compiled only with CONFIG_IPV6). @proto == IPPROTO_TCP
+ * selects the inet/inet6 lookup; any other @proto selects the UDP lookup
+ * (for IPv6 only if ipv6_bpf_stub is set).
+ *
+ * Only the TCP lookups are handed &refcounted, so it stays false for UDP
+ * results. A socket that is neither refcounted nor flagged SOCK_RCU_FREE
+ * triggers a one-time warning and is dropped.
+ *
+ * Returns the socket found, or NULL.
+ */
+static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
+ int dif, int sdif, u8 family, u8 proto)
+{
+ bool refcounted = false;
+ struct sock *sk = NULL;
+
+ if (family == AF_INET) {
+ __be32 src4 = tuple->ipv4.saddr;
+ __be32 dst4 = tuple->ipv4.daddr;
+
+ if (proto == IPPROTO_TCP)
+ sk = __inet_lookup(net, NULL, 0,
+ src4, tuple->ipv4.sport,
+ dst4, tuple->ipv4.dport,
+ dif, sdif, &refcounted);
+ else
+ sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport,
+ dst4, tuple->ipv4.dport,
+ dif, sdif, net->ipv4.udp_table, NULL);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr;
+ struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
+
+ if (proto == IPPROTO_TCP)
+ sk = __inet6_lookup(net, NULL, 0,
+ src6, tuple->ipv6.sport,
+ dst6, ntohs(tuple->ipv6.dport),
+ dif, sdif, &refcounted);
+ else if (likely(ipv6_bpf_stub))
+ sk = ipv6_bpf_stub->udp6_lib_lookup(net,
+ src6, tuple->ipv6.sport,
+ dst6, tuple->ipv6.dport,
+ dif, sdif,
+ net->ipv4.udp_table, NULL);
#endif
- func == bpf_lwt_push_encap)
- return true;
+ }
- return false;
+ if (unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) {
+ WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+ sk = NULL;
+ }
+ return sk;
}
-static const struct bpf_func_proto *
-bpf_base_func_proto(enum bpf_func_id func_id)
+/* bpf_skc_lookup performs the core lookup for different types of sockets,
+ * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE.
+ */
+static struct sock *
+__bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+ struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
+ u64 flags, int sdif)
+{
+ struct sock *sk = NULL;
+ struct net *net;
+ u8 family;
+
+ if (len == sizeof(tuple->ipv4))
+ family = AF_INET;
+ else if (len == sizeof(tuple->ipv6))
+ family = AF_INET6;
+ else
+ return NULL;
+
+ if (unlikely(flags || !((s32)netns_id < 0 || netns_id <= S32_MAX)))
+ goto out;
+
+ if (sdif < 0) {
+ if (family == AF_INET)
+ sdif = inet_sdif(skb);
+ else
+ sdif = inet6_sdif(skb);
+ }
+
+ if ((s32)netns_id < 0) {
+ net = caller_net;
+ sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
+ } else {
+ net = get_net_ns_by_id(caller_net, netns_id);
+ if (unlikely(!net))
+ goto out;
+ sk = sk_lookup(net, tuple, ifindex, sdif, family, proto);
+ put_net(net);
+ }
+
+out:
+ return sk;
+}
+
+static struct sock *
+__bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+ struct net *caller_net, u32 ifindex, u8 proto, u64 netns_id,
+ u64 flags, int sdif)
+{
+ struct sock *sk = __bpf_skc_lookup(skb, tuple, len, caller_net,
+ ifindex, proto, netns_id, flags,
+ sdif);
+
+ if (sk) {
+ struct sock *sk2 = sk_to_full_sk(sk);
+
+ /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
+ * sock refcnt is decremented to prevent a request_sock leak.
+ */
+ if (sk2 != sk) {
+ sock_gen_put(sk);
+ /* Ensure there is no need to bump sk2 refcnt */
+ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
+ WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+ return NULL;
+ }
+ sk = sk2;
+ }
+ }
+
+ return sk;
+}
+
+static struct sock *
+bpf_skc_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+ u8 proto, u64 netns_id, u64 flags)
+{
+ struct net *caller_net;
+ int ifindex;
+
+ if (skb->dev) {
+ caller_net = dev_net(skb->dev);
+ ifindex = skb->dev->ifindex;
+ } else {
+ caller_net = sock_net(skb->sk);
+ ifindex = 0;
+ }
+
+ return __bpf_skc_lookup(skb, tuple, len, caller_net, ifindex, proto,
+ netns_id, flags, -1);
+}
+
+static struct sock *
+bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len,
+ u8 proto, u64 netns_id, u64 flags)
+{
+ struct sock *sk = bpf_skc_lookup(skb, tuple, len, proto, netns_id,
+ flags);
+
+ if (sk) {
+ struct sock *sk2 = sk_to_full_sk(sk);
+
+ /* sk_to_full_sk() may return (sk)->rsk_listener, so make sure the original sk
+ * sock refcnt is decremented to prevent a request_sock leak.
+ */
+ if (sk2 != sk) {
+ sock_gen_put(sk);
+ /* Ensure there is no need to bump sk2 refcnt */
+ if (unlikely(sk2 && !sock_flag(sk2, SOCK_RCU_FREE))) {
+ WARN_ONCE(1, "Found non-RCU, unreferenced socket!");
+ return NULL;
+ }
+ sk = sk2;
+ }
+ }
+
+ return sk;
+}
+
+BPF_CALL_5(bpf_skc_lookup_tcp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ return (unsigned long)bpf_skc_lookup(skb, tuple, len, IPPROTO_TCP,
+ netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
+ .func = bpf_skc_lookup_tcp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP,
+ netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
+ .func = bpf_sk_lookup_tcp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ return (unsigned long)bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP,
+ netns_id, flags);
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
+ .func = bpf_sk_lookup_udp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_tc_skc_lookup_tcp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ struct net_device *dev = skb->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
+
+ return (unsigned long)__bpf_skc_lookup(skb, tuple, len, caller_net,
+ ifindex, IPPROTO_TCP, netns_id,
+ flags, sdif);
+}
+
+static const struct bpf_func_proto bpf_tc_skc_lookup_tcp_proto = {
+ .func = bpf_tc_skc_lookup_tcp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_tc_sk_lookup_tcp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ struct net_device *dev = skb->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
+
+ return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net,
+ ifindex, IPPROTO_TCP, netns_id,
+ flags, sdif);
+}
+
+static const struct bpf_func_proto bpf_tc_sk_lookup_tcp_proto = {
+ .func = bpf_tc_sk_lookup_tcp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_tc_sk_lookup_udp, struct sk_buff *, skb,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ struct net_device *dev = skb->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
+
+ return (unsigned long)__bpf_sk_lookup(skb, tuple, len, caller_net,
+ ifindex, IPPROTO_UDP, netns_id,
+ flags, sdif);
+}
+
+static const struct bpf_func_proto bpf_tc_sk_lookup_udp_proto = {
+ .func = bpf_tc_sk_lookup_udp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_1(bpf_sk_release, struct sock *, sk)
+{
+ if (sk && sk_is_refcounted(sk))
+ sock_gen_put(sk);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_release_proto = {
+ .func = bpf_sk_release,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON | OBJ_RELEASE,
+};
+
+BPF_CALL_5(bpf_xdp_sk_lookup_udp, struct xdp_buff *, ctx,
+ struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
+{
+ struct net_device *dev = ctx->rxq->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
+
+ return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
+ ifindex, IPPROTO_UDP, netns_id,
+ flags, sdif);
+}
+
+static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
+ .func = bpf_xdp_sk_lookup_udp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_xdp_skc_lookup_tcp, struct xdp_buff *, ctx,
+ struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
+{
+ struct net_device *dev = ctx->rxq->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
+
+ return (unsigned long)__bpf_skc_lookup(NULL, tuple, len, caller_net,
+ ifindex, IPPROTO_TCP, netns_id,
+ flags, sdif);
+}
+
+static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
+ .func = bpf_xdp_skc_lookup_tcp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_xdp_sk_lookup_tcp, struct xdp_buff *, ctx,
+ struct bpf_sock_tuple *, tuple, u32, len, u32, netns_id, u64, flags)
+{
+ struct net_device *dev = ctx->rxq->dev;
+ int ifindex = dev->ifindex, sdif = dev_sdif(dev);
+ struct net *caller_net = dev_net(dev);
+
+ return (unsigned long)__bpf_sk_lookup(NULL, tuple, len, caller_net,
+ ifindex, IPPROTO_TCP, netns_id,
+ flags, sdif);
+}
+
+static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
+ .func = bpf_xdp_sk_lookup_tcp,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sock_addr_skc_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ return (unsigned long)__bpf_skc_lookup(NULL, tuple, len,
+ sock_net(ctx->sk), 0,
+ IPPROTO_TCP, netns_id, flags,
+ -1);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
+ .func = bpf_sock_addr_skc_lookup_tcp,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_SOCK_COMMON_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sock_addr_sk_lookup_tcp, struct bpf_sock_addr_kern *, ctx,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
+ sock_net(ctx->sk), 0, IPPROTO_TCP,
+ netns_id, flags, -1);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
+ .func = bpf_sock_addr_sk_lookup_tcp,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+BPF_CALL_5(bpf_sock_addr_sk_lookup_udp, struct bpf_sock_addr_kern *, ctx,
+ struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags)
+{
+ return (unsigned long)__bpf_sk_lookup(NULL, tuple, len,
+ sock_net(ctx->sk), 0, IPPROTO_UDP,
+ netns_id, flags, -1);
+}
+
+static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
+ .func = bpf_sock_addr_sk_lookup_udp,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+ .arg4_type = ARG_ANYTHING,
+ .arg5_type = ARG_ANYTHING,
+};
+
+bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= offsetofend(struct bpf_tcp_sock,
+ icsk_retransmits))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ switch (off) {
+ case offsetof(struct bpf_tcp_sock, bytes_received):
+ case offsetof(struct bpf_tcp_sock, bytes_acked):
+ return size == sizeof(__u64);
+ default:
+ return size == sizeof(__u32);
+ }
+}
+
+u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+#define BPF_TCP_SOCK_GET_COMMON(FIELD) \
+ do { \
+ BUILD_BUG_ON(sizeof_field(struct tcp_sock, FIELD) > \
+ sizeof_field(struct bpf_tcp_sock, FIELD)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\
+ si->dst_reg, si->src_reg, \
+ offsetof(struct tcp_sock, FIELD)); \
+ } while (0)
+
+#define BPF_INET_SOCK_GET_COMMON(FIELD) \
+ do { \
+ BUILD_BUG_ON(sizeof_field(struct inet_connection_sock, \
+ FIELD) > \
+ sizeof_field(struct bpf_tcp_sock, FIELD)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct inet_connection_sock, \
+ FIELD), \
+ si->dst_reg, si->src_reg, \
+ offsetof( \
+ struct inet_connection_sock, \
+ FIELD)); \
+ } while (0)
+
+ BTF_TYPE_EMIT(struct bpf_tcp_sock);
+
+ switch (si->off) {
+ case offsetof(struct bpf_tcp_sock, rtt_min):
+ BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
+ sizeof(struct minmax));
+ BUILD_BUG_ON(sizeof(struct minmax) <
+ sizeof(struct minmax_sample));
+
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ offsetof(struct tcp_sock, rtt_min) +
+ offsetof(struct minmax_sample, v));
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_cwnd):
+ BPF_TCP_SOCK_GET_COMMON(snd_cwnd);
+ break;
+ case offsetof(struct bpf_tcp_sock, srtt_us):
+ BPF_TCP_SOCK_GET_COMMON(srtt_us);
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_ssthresh):
+ BPF_TCP_SOCK_GET_COMMON(snd_ssthresh);
+ break;
+ case offsetof(struct bpf_tcp_sock, rcv_nxt):
+ BPF_TCP_SOCK_GET_COMMON(rcv_nxt);
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_nxt):
+ BPF_TCP_SOCK_GET_COMMON(snd_nxt);
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_una):
+ BPF_TCP_SOCK_GET_COMMON(snd_una);
+ break;
+ case offsetof(struct bpf_tcp_sock, mss_cache):
+ BPF_TCP_SOCK_GET_COMMON(mss_cache);
+ break;
+ case offsetof(struct bpf_tcp_sock, ecn_flags):
+ BPF_TCP_SOCK_GET_COMMON(ecn_flags);
+ break;
+ case offsetof(struct bpf_tcp_sock, rate_delivered):
+ BPF_TCP_SOCK_GET_COMMON(rate_delivered);
+ break;
+ case offsetof(struct bpf_tcp_sock, rate_interval_us):
+ BPF_TCP_SOCK_GET_COMMON(rate_interval_us);
+ break;
+ case offsetof(struct bpf_tcp_sock, packets_out):
+ BPF_TCP_SOCK_GET_COMMON(packets_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, retrans_out):
+ BPF_TCP_SOCK_GET_COMMON(retrans_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, total_retrans):
+ BPF_TCP_SOCK_GET_COMMON(total_retrans);
+ break;
+ case offsetof(struct bpf_tcp_sock, segs_in):
+ BPF_TCP_SOCK_GET_COMMON(segs_in);
+ break;
+ case offsetof(struct bpf_tcp_sock, data_segs_in):
+ BPF_TCP_SOCK_GET_COMMON(data_segs_in);
+ break;
+ case offsetof(struct bpf_tcp_sock, segs_out):
+ BPF_TCP_SOCK_GET_COMMON(segs_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, data_segs_out):
+ BPF_TCP_SOCK_GET_COMMON(data_segs_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, lost_out):
+ BPF_TCP_SOCK_GET_COMMON(lost_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, sacked_out):
+ BPF_TCP_SOCK_GET_COMMON(sacked_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, bytes_received):
+ BPF_TCP_SOCK_GET_COMMON(bytes_received);
+ break;
+ case offsetof(struct bpf_tcp_sock, bytes_acked):
+ BPF_TCP_SOCK_GET_COMMON(bytes_acked);
+ break;
+ case offsetof(struct bpf_tcp_sock, dsack_dups):
+ BPF_TCP_SOCK_GET_COMMON(dsack_dups);
+ break;
+ case offsetof(struct bpf_tcp_sock, delivered):
+ BPF_TCP_SOCK_GET_COMMON(delivered);
+ break;
+ case offsetof(struct bpf_tcp_sock, delivered_ce):
+ BPF_TCP_SOCK_GET_COMMON(delivered_ce);
+ break;
+ case offsetof(struct bpf_tcp_sock, icsk_retransmits):
+ BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
+{
+ if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
+ return (unsigned long)sk;
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_tcp_sock_proto = {
+ .func = bpf_tcp_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL,
+ .arg1_type = ARG_PTR_TO_SOCK_COMMON,
+};
+
+BPF_CALL_1(bpf_get_listener_sock, struct sock *, sk)
+{
+ sk = sk_to_full_sk(sk);
+
+ if (sk && sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_RCU_FREE))
+ return (unsigned long)sk;
+
+ return (unsigned long)NULL;
+}
+
+static const struct bpf_func_proto bpf_get_listener_sock_proto = {
+ .func = bpf_get_listener_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_SOCKET_OR_NULL,
+ .arg1_type = ARG_PTR_TO_SOCK_COMMON,
+};
+
+BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
+{
+ unsigned int iphdr_len;
+
+ switch (skb_protocol(skb, true)) {
+ case cpu_to_be16(ETH_P_IP):
+ iphdr_len = sizeof(struct iphdr);
+ break;
+ case cpu_to_be16(ETH_P_IPV6):
+ iphdr_len = sizeof(struct ipv6hdr);
+ break;
+ default:
+ return 0;
+ }
+
+ if (skb_headlen(skb) < iphdr_len)
+ return 0;
+
+ if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len))
+ return 0;
+
+ return INET_ECN_set_ce(skb);
+}
+
+bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ switch (off) {
+ default:
+ return size == sizeof(__u32);
+ }
+}
+
+u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+#define BPF_XDP_SOCK_GET(FIELD) \
+ do { \
+ BUILD_BUG_ON(sizeof_field(struct xdp_sock, FIELD) > \
+ sizeof_field(struct bpf_xdp_sock, FIELD)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
+ si->dst_reg, si->src_reg, \
+ offsetof(struct xdp_sock, FIELD)); \
+ } while (0)
+
+ BTF_TYPE_EMIT(struct bpf_xdp_sock);
+
+ switch (si->off) {
+ case offsetof(struct bpf_xdp_sock, queue_id):
+ BPF_XDP_SOCK_GET(queue_id);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
+ .func = bpf_skb_ecn_set_ce,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_5(bpf_tcp_check_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
+ struct tcphdr *, th, u32, th_len)
+{
+#ifdef CONFIG_SYN_COOKIES
+ int ret;
+
+ if (unlikely(!sk || th_len < sizeof(*th)))
+ return -EINVAL;
+
+ /* sk_listener() allows TCP_NEW_SYN_RECV, which makes no sense here. */
+ if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
+ return -EINVAL;
+
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
+ return -EINVAL;
+
+ if (!th->ack || th->rst || th->syn)
+ return -ENOENT;
+
+ if (unlikely(iph_len < sizeof(struct iphdr)))
+ return -EINVAL;
+
+ if (tcp_synq_no_recent_overflow(sk))
+ return -ENOENT;
+
+ /* Both struct iphdr and struct ipv6hdr have the version field at the
+ * same offset so we can cast to the shorter header (struct iphdr).
+ */
+ switch (((struct iphdr *)iph)->version) {
+ case 4:
+ if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
+ return -EINVAL;
+
+ ret = __cookie_v4_check((struct iphdr *)iph, th);
+ break;
+
+#if IS_BUILTIN(CONFIG_IPV6)
+ case 6:
+ if (unlikely(iph_len < sizeof(struct ipv6hdr)))
+ return -EINVAL;
+
+ if (sk->sk_family != AF_INET6)
+ return -EINVAL;
+
+ ret = __cookie_v6_check((struct ipv6hdr *)iph, th);
+ break;
+#endif /* CONFIG_IPV6 */
+
+ default:
+ return -EPROTONOSUPPORT;
+ }
+
+ if (ret > 0)
+ return 0;
+
+ return -ENOENT;
+#else
+ return -ENOTSUPP;
+#endif
+}
+
+static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
+ .func = bpf_tcp_check_syncookie,
+ .gpl_only = true,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_5(bpf_tcp_gen_syncookie, struct sock *, sk, void *, iph, u32, iph_len,
+ struct tcphdr *, th, u32, th_len)
+{
+#ifdef CONFIG_SYN_COOKIES
+ u32 cookie;
+ u16 mss;
+
+ if (unlikely(!sk || th_len < sizeof(*th) || th_len != th->doff * 4))
+ return -EINVAL;
+
+ if (sk->sk_protocol != IPPROTO_TCP || sk->sk_state != TCP_LISTEN)
+ return -EINVAL;
+
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies))
+ return -ENOENT;
+
+ if (!th->syn || th->ack || th->fin || th->rst)
+ return -EINVAL;
+
+ if (unlikely(iph_len < sizeof(struct iphdr)))
+ return -EINVAL;
+
+ /* Both struct iphdr and struct ipv6hdr have the version field at the
+ * same offset so we can cast to the shorter header (struct iphdr).
+ */
+ switch (((struct iphdr *)iph)->version) {
+ case 4:
+ if (sk->sk_family == AF_INET6 && ipv6_only_sock(sk))
+ return -EINVAL;
+
+ mss = tcp_v4_get_syncookie(sk, iph, th, &cookie);
+ break;
+
+#if IS_BUILTIN(CONFIG_IPV6)
+ case 6:
+ if (unlikely(iph_len < sizeof(struct ipv6hdr)))
+ return -EINVAL;
+
+ if (sk->sk_family != AF_INET6)
+ return -EINVAL;
+
+ mss = tcp_v6_get_syncookie(sk, iph, th, &cookie);
+ break;
+#endif /* CONFIG_IPV6 */
+
+ default:
+ return -EPROTONOSUPPORT;
+ }
+ if (mss == 0)
+ return -ENOENT;
+
+ return cookie | ((u64)mss << 32);
+#else
+ return -EOPNOTSUPP;
+#endif /* CONFIG_SYN_COOKIES */
+}
+
+static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
+ .func = bpf_tcp_gen_syncookie,
+ .gpl_only = true, /* __cookie_v*_init_sequence() is GPL */
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg5_type = ARG_CONST_SIZE,
+};
+
+BPF_CALL_3(bpf_sk_assign, struct sk_buff *, skb, struct sock *, sk, u64, flags)
+{
+ if (!sk || flags != 0)
+ return -EINVAL;
+ if (!skb_at_tc_ingress(skb))
+ return -EOPNOTSUPP;
+ if (unlikely(dev_net(skb->dev) != sock_net(sk)))
+ return -ENETUNREACH;
+ if (sk_unhashed(sk))
+ return -EOPNOTSUPP;
+ if (sk_is_refcounted(sk) &&
+ unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
+ return -ENOENT;
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = sock_pfree;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_assign_proto = {
+ .func = bpf_sk_assign,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static const u8 *bpf_search_tcp_opt(const u8 *op, const u8 *opend,
+ u8 search_kind, const u8 *magic,
+ u8 magic_len, bool *eol)
+{
+ u8 kind, kind_len;
+
+ *eol = false;
+
+ while (op < opend) {
+ kind = op[0];
+
+ if (kind == TCPOPT_EOL) {
+ *eol = true;
+ return ERR_PTR(-ENOMSG);
+ } else if (kind == TCPOPT_NOP) {
+ op++;
+ continue;
+ }
+
+ if (opend - op < 2 || opend - op < op[1] || op[1] < 2)
+ /* Something is wrong in the received header.
+ * Follow the TCP stack's tcp_parse_options()
+ * and just bail here.
+ */
+ return ERR_PTR(-EFAULT);
+
+ kind_len = op[1];
+ if (search_kind == kind) {
+ if (!magic_len)
+ return op;
+
+ if (magic_len > kind_len - 2)
+ return ERR_PTR(-ENOMSG);
+
+ if (!memcmp(&op[2], magic, magic_len))
+ return op;
+ }
+
+ op += kind_len;
+ }
+
+ return ERR_PTR(-ENOMSG);
+}
+
+BPF_CALL_4(bpf_sock_ops_load_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ void *, search_res, u32, len, u64, flags)
+{
+ bool eol, load_syn = flags & BPF_LOAD_HDR_OPT_TCP_SYN;
+ const u8 *op, *opend, *magic, *search = search_res;
+ u8 search_kind, search_len, copy_len, magic_len;
+ int ret;
+
+ if (!is_locked_tcp_sock_ops(bpf_sock))
+ return -EOPNOTSUPP;
+
+ /* 2 byte is the minimal option len except TCPOPT_NOP and
+ * TCPOPT_EOL which are useless for the bpf prog to learn
+ * and this helper disallow loading them also.
+ */
+ if (len < 2 || flags & ~BPF_LOAD_HDR_OPT_TCP_SYN)
+ return -EINVAL;
+
+ search_kind = search[0];
+ search_len = search[1];
+
+ if (search_len > len || search_kind == TCPOPT_NOP ||
+ search_kind == TCPOPT_EOL)
+ return -EINVAL;
+
+ if (search_kind == TCPOPT_EXP || search_kind == 253) {
+ /* 16 or 32 bit magic. +2 for kind and kind length */
+ if (search_len != 4 && search_len != 6)
+ return -EINVAL;
+ magic = &search[2];
+ magic_len = search_len - 2;
+ } else {
+ if (search_len)
+ return -EINVAL;
+ magic = NULL;
+ magic_len = 0;
+ }
+
+ if (load_syn) {
+ ret = bpf_sock_ops_get_syn(bpf_sock, TCP_BPF_SYN, &op);
+ if (ret < 0)
+ return ret;
+
+ opend = op + ret;
+ op += sizeof(struct tcphdr);
+ } else {
+ if (!bpf_sock->skb ||
+ bpf_sock->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB)
+ /* This bpf_sock->op cannot call this helper */
+ return -EPERM;
+
+ opend = bpf_sock->skb_data_end;
+ op = bpf_sock->skb->data + sizeof(struct tcphdr);
+ }
+
+ op = bpf_search_tcp_opt(op, opend, search_kind, magic, magic_len,
+ &eol);
+ if (IS_ERR(op))
+ return PTR_ERR(op);
+
+ copy_len = op[1];
+ ret = copy_len;
+ if (copy_len > len) {
+ ret = -ENOSPC;
+ copy_len = len;
+ }
+
+ memcpy(search_res, op, copy_len);
+ return ret;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_load_hdr_opt_proto = {
+ .func = bpf_sock_ops_load_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_sock_ops_store_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ const void *, from, u32, len, u64, flags)
+{
+ u8 new_kind, new_kind_len, magic_len = 0, *opend;
+ const u8 *op, *new_op, *magic = NULL;
+ struct sk_buff *skb;
+ bool eol;
+
+ if (bpf_sock->op != BPF_SOCK_OPS_WRITE_HDR_OPT_CB)
+ return -EPERM;
+
+ if (len < 2 || flags)
+ return -EINVAL;
+
+ new_op = from;
+ new_kind = new_op[0];
+ new_kind_len = new_op[1];
+
+ if (new_kind_len > len || new_kind == TCPOPT_NOP ||
+ new_kind == TCPOPT_EOL)
+ return -EINVAL;
+
+ if (new_kind_len > bpf_sock->remaining_opt_len)
+ return -ENOSPC;
+
+ /* 253 is another experimental kind */
+ if (new_kind == TCPOPT_EXP || new_kind == 253) {
+ if (new_kind_len < 4)
+ return -EINVAL;
+ /* Match for the 2 byte magic also.
+ * RFC 6994: the magic could be 2 or 4 bytes.
+ * Hence, matching by 2 byte only is on the
+ * conservative side but it is the right
+ * thing to do for the 'search-for-duplication'
+ * purpose.
+ */
+ magic = &new_op[2];
+ magic_len = 2;
+ }
+
+ /* Check for duplication */
+ skb = bpf_sock->skb;
+ op = skb->data + sizeof(struct tcphdr);
+ opend = bpf_sock->skb_data_end;
+
+ op = bpf_search_tcp_opt(op, opend, new_kind, magic, magic_len,
+ &eol);
+ if (!IS_ERR(op))
+ return -EEXIST;
+
+ if (PTR_ERR(op) != -ENOMSG)
+ return PTR_ERR(op);
+
+ if (eol)
+ /* The option has been ended. Treat it as no more
+ * header option can be written.
+ */
+ return -ENOSPC;
+
+ /* No duplication found. Store the header option. */
+ memcpy(opend, from, new_kind_len);
+
+ bpf_sock->remaining_opt_len -= new_kind_len;
+ bpf_sock->skb_data_end += new_kind_len;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
+ .func = bpf_sock_ops_store_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
+ .arg3_type = ARG_CONST_SIZE,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_sock_ops_reserve_hdr_opt, struct bpf_sock_ops_kern *, bpf_sock,
+ u32, len, u64, flags)
+{
+ if (bpf_sock->op != BPF_SOCK_OPS_HDR_OPT_LEN_CB)
+ return -EPERM;
+
+ if (flags || len < 2)
+ return -EINVAL;
+
+ if (len > bpf_sock->remaining_opt_len)
+ return -ENOSPC;
+
+ bpf_sock->remaining_opt_len -= len;
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sock_ops_reserve_hdr_opt_proto = {
+ .func = bpf_sock_ops_reserve_hdr_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb,
+ u64, tstamp, u32, tstamp_type)
+{
+ /* skb_clear_delivery_time() is done for inet protocol */
+ if (skb->protocol != htons(ETH_P_IP) &&
+ skb->protocol != htons(ETH_P_IPV6))
+ return -EOPNOTSUPP;
+
+ switch (tstamp_type) {
+ case BPF_SKB_CLOCK_REALTIME:
+ skb->tstamp = tstamp;
+ skb->tstamp_type = SKB_CLOCK_REALTIME;
+ break;
+ case BPF_SKB_CLOCK_MONOTONIC:
+ if (!tstamp)
+ return -EINVAL;
+ skb->tstamp = tstamp;
+ skb->tstamp_type = SKB_CLOCK_MONOTONIC;
+ break;
+ case BPF_SKB_CLOCK_TAI:
+ if (!tstamp)
+ return -EINVAL;
+ skb->tstamp = tstamp;
+ skb->tstamp_type = SKB_CLOCK_TAI;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_tstamp_proto = {
+ .func = bpf_skb_set_tstamp,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+#ifdef CONFIG_SYN_COOKIES
+BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv4, struct iphdr *, iph,
+ struct tcphdr *, th, u32, th_len)
+{
+ u32 cookie;
+ u16 mss;
+
+ if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
+ return -EINVAL;
+
+ mss = tcp_parse_mss_option(th, 0) ?: TCP_MSS_DEFAULT;
+ cookie = __cookie_v4_init_sequence(iph, th, &mss);
+
+ return cookie | ((u64)mss << 32);
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv4_proto = {
+ .func = bpf_tcp_raw_gen_syncookie_ipv4,
+ .gpl_only = true, /* __cookie_v4_init_sequence() is GPL */
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg1_size = sizeof(struct iphdr),
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_3(bpf_tcp_raw_gen_syncookie_ipv6, struct ipv6hdr *, iph,
+ struct tcphdr *, th, u32, th_len)
+{
+#if IS_BUILTIN(CONFIG_IPV6)
+ const u16 mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
+ sizeof(struct ipv6hdr);
+ u32 cookie;
+ u16 mss;
+
+ if (unlikely(th_len < sizeof(*th) || th_len != th->doff * 4))
+ return -EINVAL;
+
+ mss = tcp_parse_mss_option(th, 0) ?: mss_clamp;
+ cookie = __cookie_v6_init_sequence(iph, th, &mss);
+
+ return cookie | ((u64)mss << 32);
+#else
+ return -EPROTONOSUPPORT;
+#endif
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_gen_syncookie_ipv6_proto = {
+ .func = bpf_tcp_raw_gen_syncookie_ipv6,
+ .gpl_only = true, /* __cookie_v6_init_sequence() is GPL */
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg1_size = sizeof(struct ipv6hdr),
+ .arg2_type = ARG_PTR_TO_MEM,
+ .arg3_type = ARG_CONST_SIZE_OR_ZERO,
+};
+
+BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv4, struct iphdr *, iph,
+ struct tcphdr *, th)
+{
+ if (__cookie_v4_check(iph, th) > 0)
+ return 0;
+
+ return -EACCES;
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv4_proto = {
+ .func = bpf_tcp_raw_check_syncookie_ipv4,
+ .gpl_only = true, /* __cookie_v4_check is GPL */
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg1_size = sizeof(struct iphdr),
+ .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg2_size = sizeof(struct tcphdr),
+};
+
+BPF_CALL_2(bpf_tcp_raw_check_syncookie_ipv6, struct ipv6hdr *, iph,
+ struct tcphdr *, th)
+{
+#if IS_BUILTIN(CONFIG_IPV6)
+ if (__cookie_v6_check(iph, th) > 0)
+ return 0;
+
+ return -EACCES;
+#else
+ return -EPROTONOSUPPORT;
+#endif
+}
+
+static const struct bpf_func_proto bpf_tcp_raw_check_syncookie_ipv6_proto = {
+ .func = bpf_tcp_raw_check_syncookie_ipv6,
+ .gpl_only = true, /* __cookie_v6_check is GPL */
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg1_size = sizeof(struct ipv6hdr),
+ .arg2_type = ARG_PTR_TO_FIXED_SIZE_MEM,
+ .arg2_size = sizeof(struct tcphdr),
+};
+#endif /* CONFIG_SYN_COOKIES */
+
+#endif /* CONFIG_INET */
+
+bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id)
{
switch (func_id) {
- case BPF_FUNC_map_lookup_elem:
- return &bpf_map_lookup_elem_proto;
- case BPF_FUNC_map_update_elem:
- return &bpf_map_update_elem_proto;
- case BPF_FUNC_map_delete_elem:
- return &bpf_map_delete_elem_proto;
- case BPF_FUNC_get_prandom_u32:
- return &bpf_get_prandom_u32_proto;
- case BPF_FUNC_get_smp_processor_id:
- return &bpf_get_raw_smp_processor_id_proto;
- case BPF_FUNC_get_numa_node_id:
- return &bpf_get_numa_node_id_proto;
+ case BPF_FUNC_clone_redirect:
+ case BPF_FUNC_l3_csum_replace:
+ case BPF_FUNC_l4_csum_replace:
+ case BPF_FUNC_lwt_push_encap:
+ case BPF_FUNC_lwt_seg6_action:
+ case BPF_FUNC_lwt_seg6_adjust_srh:
+ case BPF_FUNC_lwt_seg6_store_bytes:
+ case BPF_FUNC_msg_pop_data:
+ case BPF_FUNC_msg_pull_data:
+ case BPF_FUNC_msg_push_data:
+ case BPF_FUNC_skb_adjust_room:
+ case BPF_FUNC_skb_change_head:
+ case BPF_FUNC_skb_change_proto:
+ case BPF_FUNC_skb_change_tail:
+ case BPF_FUNC_skb_pull_data:
+ case BPF_FUNC_skb_store_bytes:
+ case BPF_FUNC_skb_vlan_pop:
+ case BPF_FUNC_skb_vlan_push:
+ case BPF_FUNC_store_hdr_opt:
+ case BPF_FUNC_xdp_adjust_head:
+ case BPF_FUNC_xdp_adjust_meta:
+ case BPF_FUNC_xdp_adjust_tail:
+ /* tail-called program could call any of the above */
case BPF_FUNC_tail_call:
- return &bpf_tail_call_proto;
- case BPF_FUNC_ktime_get_ns:
- return &bpf_ktime_get_ns_proto;
- case BPF_FUNC_trace_printk:
- if (capable(CAP_SYS_ADMIN))
- return bpf_get_trace_printk_proto();
- /* else: fall through */
+ return true;
default:
- return NULL;
+ return false;
}
}
+const struct bpf_func_proto bpf_event_output_data_proto __weak;
+const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak;
+
static const struct bpf_func_proto *
sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog); /* ask cgroup_common_func_proto() before the local switch */
+ if (func_proto)
+ return func_proto; /* a non-NULL answer is returned unchanged */
+
switch (func_id) {
- /* inet and inet6 sockets are created in a process
- * context so there is always a valid uid/gid
- */
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_cookie_sock_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_cg_sock_proto;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
+ case BPF_FUNC_setsockopt: /* offered only when expected_attach_type is BPF_CGROUP_INET_SOCK_CREATE */
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ return &bpf_sock_create_setsockopt_proto;
+ default:
+ return NULL; /* every other attach type gets no setsockopt helper */
+ }
+ case BPF_FUNC_getsockopt: /* same attach-type restriction as setsockopt above */
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ return &bpf_sock_create_getsockopt_proto;
+ default:
+ return NULL; /* every other attach type gets no getsockopt helper */
+ }
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog); /* anything not handled here is left to bpf_base_func_proto() */
}
}
static const struct bpf_func_proto *
sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- /* inet and inet6 sockets are created in a process
- * context so there is always a valid uid/gid
- */
- case BPF_FUNC_get_current_uid_gid:
- return &bpf_get_current_uid_gid_proto;
case BPF_FUNC_bind:
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET4_CONNECT:
@@ -4882,10 +8188,72 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
}
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_sock_addr_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_addr_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+#ifdef CONFIG_INET
+ case BPF_FUNC_sk_lookup_tcp:
+ return &bpf_sock_addr_sk_lookup_tcp_proto;
+ case BPF_FUNC_sk_lookup_udp:
+ return &bpf_sock_addr_sk_lookup_udp_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+ case BPF_FUNC_skc_lookup_tcp:
+ return &bpf_sock_addr_skc_lookup_tcp_proto;
+#endif /* CONFIG_INET */
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_setsockopt:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
+ return &bpf_sock_addr_setsockopt_proto;
+ default:
+ return NULL;
+ }
+ case BPF_FUNC_getsockopt:
+ switch (prog->expected_attach_type) {
+ case BPF_CGROUP_INET4_BIND:
+ case BPF_CGROUP_INET6_BIND:
+ case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_UNIX_CONNECT:
+ case BPF_CGROUP_UDP4_RECVMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
+ case BPF_CGROUP_UNIX_RECVMSG:
+ case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UNIX_SENDMSG:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_UNIX_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
+ case BPF_CGROUP_UNIX_GETSOCKNAME:
+ return &bpf_sock_addr_getsockopt_proto;
+ default:
+ return NULL;
+ }
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
@@ -4899,19 +8267,64 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_load_bytes_relative_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_proto;
case BPF_FUNC_get_socket_uid:
return &bpf_get_socket_uid_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
+const struct bpf_func_proto bpf_sk_storage_get_proto __weak;
+const struct bpf_func_proto bpf_sk_storage_delete_proto __weak;
+
static const struct bpf_func_proto *
cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
+ case BPF_FUNC_sk_fullsock:
+ return &bpf_sk_fullsock_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case BPF_FUNC_skb_cgroup_id:
+ return &bpf_skb_cgroup_id_proto;
+ case BPF_FUNC_skb_ancestor_cgroup_id:
+ return &bpf_skb_ancestor_cgroup_id_proto;
+ case BPF_FUNC_sk_cgroup_id:
+ return &bpf_sk_cgroup_id_proto;
+ case BPF_FUNC_sk_ancestor_cgroup_id:
+ return &bpf_sk_ancestor_cgroup_id_proto;
+#endif
+#ifdef CONFIG_INET
+ case BPF_FUNC_sk_lookup_tcp:
+ return &bpf_sk_lookup_tcp_proto;
+ case BPF_FUNC_sk_lookup_udp:
+ return &bpf_sk_lookup_udp_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+ case BPF_FUNC_skc_lookup_tcp:
+ return &bpf_skc_lookup_tcp_proto;
+ case BPF_FUNC_tcp_sock:
+ return &bpf_tcp_sock_proto;
+ case BPF_FUNC_get_listener_sock:
+ return &bpf_get_listener_sock_proto;
+ case BPF_FUNC_skb_ecn_set_ce:
+ return &bpf_skb_ecn_set_ce_proto;
+#endif
default:
return sk_filter_func_proto(func_id, prog);
}
@@ -4933,6 +8346,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_csum_diff_proto;
case BPF_FUNC_csum_update:
return &bpf_csum_update_proto;
+ case BPF_FUNC_csum_level:
+ return &bpf_csum_level_proto;
case BPF_FUNC_l3_csum_replace:
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
@@ -4953,6 +8368,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_adjust_room_proto;
case BPF_FUNC_skb_change_tail:
return &bpf_skb_change_tail_proto;
+ case BPF_FUNC_skb_change_head:
+ return &bpf_skb_change_head_proto;
case BPF_FUNC_skb_get_tunnel_key:
return &bpf_skb_get_tunnel_key_proto;
case BPF_FUNC_skb_set_tunnel_key:
@@ -4963,6 +8380,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return bpf_get_skb_set_tunnel_proto(func_id);
case BPF_FUNC_redirect:
return &bpf_redirect_proto;
+ case BPF_FUNC_redirect_neigh:
+ return &bpf_redirect_neigh_proto;
+ case BPF_FUNC_redirect_peer:
+ return &bpf_redirect_peer_proto;
case BPF_FUNC_get_route_realm:
return &bpf_get_route_realm_proto;
case BPF_FUNC_get_hash_recalc:
@@ -4979,22 +8400,70 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_under_cgroup_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_proto;
case BPF_FUNC_get_socket_uid:
return &bpf_get_socket_uid_proto;
case BPF_FUNC_fib_lookup:
return &bpf_skb_fib_lookup_proto;
+ case BPF_FUNC_check_mtu:
+ return &bpf_skb_check_mtu_proto;
+ case BPF_FUNC_sk_fullsock:
+ return &bpf_sk_fullsock_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
#ifdef CONFIG_XFRM
case BPF_FUNC_skb_get_xfrm_state:
return &bpf_skb_get_xfrm_state_proto;
#endif
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ case BPF_FUNC_skb_cgroup_classid:
+ return &bpf_skb_cgroup_classid_proto;
+#endif
#ifdef CONFIG_SOCK_CGROUP_DATA
case BPF_FUNC_skb_cgroup_id:
return &bpf_skb_cgroup_id_proto;
case BPF_FUNC_skb_ancestor_cgroup_id:
return &bpf_skb_ancestor_cgroup_id_proto;
#endif
+#ifdef CONFIG_INET
+ case BPF_FUNC_sk_lookup_tcp:
+ return &bpf_tc_sk_lookup_tcp_proto;
+ case BPF_FUNC_sk_lookup_udp:
+ return &bpf_tc_sk_lookup_udp_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+ case BPF_FUNC_tcp_sock:
+ return &bpf_tcp_sock_proto;
+ case BPF_FUNC_get_listener_sock:
+ return &bpf_get_listener_sock_proto;
+ case BPF_FUNC_skc_lookup_tcp:
+ return &bpf_tc_skc_lookup_tcp_proto;
+ case BPF_FUNC_tcp_check_syncookie:
+ return &bpf_tcp_check_syncookie_proto;
+ case BPF_FUNC_skb_ecn_set_ce:
+ return &bpf_skb_ecn_set_ce_proto;
+ case BPF_FUNC_tcp_gen_syncookie:
+ return &bpf_tcp_gen_syncookie_proto;
+ case BPF_FUNC_sk_assign:
+ return &bpf_sk_assign_proto;
+ case BPF_FUNC_skb_set_tstamp:
+ return &bpf_skb_set_tstamp_proto;
+#ifdef CONFIG_SYN_COOKIES
+ case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
+ return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
+ case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
+ return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
+ case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
+ return &bpf_tcp_raw_check_syncookie_ipv4_proto;
+ case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
+ return &bpf_tcp_raw_check_syncookie_ipv6_proto;
+#endif
+#endif
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
@@ -5018,21 +8487,75 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_xdp_redirect_map_proto;
case BPF_FUNC_xdp_adjust_tail:
return &bpf_xdp_adjust_tail_proto;
+ case BPF_FUNC_xdp_get_buff_len:
+ return &bpf_xdp_get_buff_len_proto;
+ case BPF_FUNC_xdp_load_bytes:
+ return &bpf_xdp_load_bytes_proto;
+ case BPF_FUNC_xdp_store_bytes:
+ return &bpf_xdp_store_bytes_proto;
case BPF_FUNC_fib_lookup:
return &bpf_xdp_fib_lookup_proto;
+ case BPF_FUNC_check_mtu:
+ return &bpf_xdp_check_mtu_proto;
+#ifdef CONFIG_INET
+ case BPF_FUNC_sk_lookup_udp:
+ return &bpf_xdp_sk_lookup_udp_proto;
+ case BPF_FUNC_sk_lookup_tcp:
+ return &bpf_xdp_sk_lookup_tcp_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+ case BPF_FUNC_skc_lookup_tcp:
+ return &bpf_xdp_skc_lookup_tcp_proto;
+ case BPF_FUNC_tcp_check_syncookie:
+ return &bpf_tcp_check_syncookie_proto;
+ case BPF_FUNC_tcp_gen_syncookie:
+ return &bpf_tcp_gen_syncookie_proto;
+#ifdef CONFIG_SYN_COOKIES
+ case BPF_FUNC_tcp_raw_gen_syncookie_ipv4:
+ return &bpf_tcp_raw_gen_syncookie_ipv4_proto;
+ case BPF_FUNC_tcp_raw_gen_syncookie_ipv6:
+ return &bpf_tcp_raw_gen_syncookie_ipv6_proto;
+ case BPF_FUNC_tcp_raw_check_syncookie_ipv4:
+ return &bpf_tcp_raw_check_syncookie_ipv4_proto;
+ case BPF_FUNC_tcp_raw_check_syncookie_ipv6:
+ return &bpf_tcp_raw_check_syncookie_ipv6_proto;
+#endif
+#endif
default:
- return bpf_base_func_proto(func_id);
- }
+ return bpf_sk_base_func_proto(func_id, prog);
+ }
+
+#if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)
+ /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The
+ * kfuncs are defined in two different modules, and we want to be able
+ * to use them interchangeably with the same BTF type ID. Because modules
+ * can't de-duplicate BTF IDs between each other, we need the type to be
+ * referenced in the vmlinux BTF or the verifier will get confused about
+ * the different types. So we add this dummy type reference which will
+ * be included in vmlinux BTF, allowing both modules to refer to the
+ * same type ID.
+ */
+ BTF_TYPE_EMIT(struct nf_conn___init);
+#endif
}
+const struct bpf_func_proto bpf_sock_map_update_proto __weak;
+const struct bpf_func_proto bpf_sock_hash_update_proto __weak;
+
static const struct bpf_func_proto *
sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
+ func_proto = cgroup_common_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
switch (func_id) {
case BPF_FUNC_setsockopt:
- return &bpf_setsockopt_proto;
+ return &bpf_sock_ops_setsockopt_proto;
case BPF_FUNC_getsockopt:
- return &bpf_getsockopt_proto;
+ return &bpf_sock_ops_getsockopt_proto;
case BPF_FUNC_sock_ops_cb_flags_set:
return &bpf_sock_ops_cb_flags_set_proto;
case BPF_FUNC_sock_map_update:
@@ -5041,13 +8564,32 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sock_hash_update_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_sock_ops_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sock_ops_proto;
+#ifdef CONFIG_INET
+ case BPF_FUNC_load_hdr_opt:
+ return &bpf_sock_ops_load_hdr_opt_proto;
+ case BPF_FUNC_store_hdr_opt:
+ return &bpf_sock_ops_store_hdr_opt_proto;
+ case BPF_FUNC_reserve_hdr_opt:
+ return &bpf_sock_ops_reserve_hdr_opt_proto;
+ case BPF_FUNC_tcp_sock:
+ return &bpf_tcp_sock_proto;
+#endif /* CONFIG_INET */
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
+const struct bpf_func_proto bpf_msg_redirect_map_proto __weak;
+const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak;
+
static const struct bpf_func_proto *
sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -5062,13 +8604,26 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_msg_cork_bytes_proto;
case BPF_FUNC_msg_pull_data:
return &bpf_msg_pull_data_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
+ case BPF_FUNC_msg_push_data:
+ return &bpf_msg_push_data_proto;
+ case BPF_FUNC_msg_pop_data:
+ return &bpf_msg_pop_data_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_get_netns_cookie:
+ return &bpf_get_netns_cookie_sk_msg_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
+const struct bpf_func_proto bpf_sk_redirect_map_proto __weak;
+const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak;
+
static const struct bpf_func_proto *
sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@@ -5083,6 +8638,8 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &sk_skb_change_tail_proto;
case BPF_FUNC_skb_change_head:
return &sk_skb_change_head_proto;
+ case BPF_FUNC_skb_adjust_room:
+ return &sk_skb_adjust_room_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_proto;
case BPF_FUNC_get_socket_uid:
@@ -5091,10 +8648,31 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_redirect_map_proto;
case BPF_FUNC_sk_redirect_hash:
return &bpf_sk_redirect_hash_proto;
- case BPF_FUNC_get_local_storage:
- return &bpf_get_local_storage_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_skb_event_output_proto;
+#ifdef CONFIG_INET
+ case BPF_FUNC_sk_lookup_tcp:
+ return &bpf_sk_lookup_tcp_proto;
+ case BPF_FUNC_sk_lookup_udp:
+ return &bpf_sk_lookup_udp_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+ case BPF_FUNC_skc_lookup_tcp:
+ return &bpf_skc_lookup_tcp_proto;
+#endif
+ default:
+ return bpf_sk_base_func_proto(func_id, prog);
+ }
+}
+
+static const struct bpf_func_proto *
+flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) { /* only one helper ID is special-cased in this function */
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_flow_dissector_load_bytes_proto; /* dedicated proto for skb_load_bytes */
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog); /* all other IDs are resolved by bpf_sk_base_func_proto() */
}
}
@@ -5121,7 +8699,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_skb_under_cgroup:
return &bpf_skb_under_cgroup_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_sk_base_func_proto(func_id, prog);
}
}
@@ -5130,7 +8708,7 @@ lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_lwt_push_encap:
- return &bpf_lwt_push_encap_proto;
+ return &bpf_lwt_in_push_encap_proto;
default:
return lwt_out_func_proto(func_id, prog);
}
@@ -5160,12 +8738,16 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_skb_store_bytes_proto;
case BPF_FUNC_csum_update:
return &bpf_csum_update_proto;
+ case BPF_FUNC_csum_level:
+ return &bpf_csum_level_proto;
case BPF_FUNC_l3_csum_replace:
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
return &bpf_l4_csum_replace_proto;
case BPF_FUNC_set_hash_invalid:
return &bpf_set_hash_invalid_proto;
+ case BPF_FUNC_lwt_push_encap:
+ return &bpf_lwt_xmit_push_encap_proto;
default:
return lwt_out_func_proto(func_id, prog);
}
@@ -5206,16 +8788,39 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
if (off + size > offsetofend(struct __sk_buff, cb[4]))
return false;
break;
+ case bpf_ctx_range(struct __sk_buff, data):
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ if (info->is_ldsx || size != size_default)
+ return false;
+ break;
case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]):
case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]):
case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4):
case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4):
- case bpf_ctx_range(struct __sk_buff, data):
- case bpf_ctx_range(struct __sk_buff, data_meta):
- case bpf_ctx_range(struct __sk_buff, data_end):
if (size != size_default)
return false;
break;
+ case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
+ return false;
+ case bpf_ctx_range(struct __sk_buff, hwtstamp):
+ if (type == BPF_WRITE || size != sizeof(__u64))
+ return false;
+ break;
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ if (size != sizeof(__u64))
+ return false;
+ break;
+ case bpf_ctx_range_ptr(struct __sk_buff, sk):
+ if (type == BPF_WRITE || size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
+ break;
+ case offsetof(struct __sk_buff, tstamp_type):
+ return false;
+ case offsetofend(struct __sk_buff, tstamp_type) ... offsetof(struct __sk_buff, hwtstamp) - 1:
+ /* Explicitly prohibit access to padding in __sk_buff. */
+ return false;
default:
/* Only narrow read access allowed for now. */
if (type == BPF_WRITE) {
@@ -5242,6 +8847,9 @@ static bool sk_filter_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, data_meta):
case bpf_ctx_range(struct __sk_buff, data_end):
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ case bpf_ctx_range(struct __sk_buff, wire_len):
+ case bpf_ctx_range(struct __sk_buff, hwtstamp):
return false;
}
@@ -5257,6 +8865,50 @@ static bool sk_filter_is_valid_access(int off, int size,
return bpf_skb_is_valid_access(off, size, type, prog, info);
}
+static bool cg_skb_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ switch (off) { /* step 1: fields refused outright, or gated on CAP_BPF */
+ case bpf_ctx_range(struct __sk_buff, tc_classid):
+ case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, wire_len):
+ return false; /* tc_classid, data_meta and wire_len are always refused */
+ case bpf_ctx_range(struct __sk_buff, data):
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ if (!bpf_token_capable(prog->aux->token, CAP_BPF))
+ return false; /* data / data_end need CAP_BPF, checked against prog->aux->token */
+ break;
+ }
+
+ if (type == BPF_WRITE) { /* step 2: restrict which fields may be written */
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, mark):
+ case bpf_ctx_range(struct __sk_buff, priority):
+ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ break; /* mark, priority and cb[0]..cb[4] pass this step */
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ if (!bpf_token_capable(prog->aux->token, CAP_BPF))
+ return false; /* tstamp writes additionally need CAP_BPF */
+ break;
+ default:
+ return false; /* no other field is writable */
+ }
+ }
+
+ switch (off) { /* step 3: record the register type for data / data_end */
+ case bpf_ctx_range(struct __sk_buff, data):
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ }
+
+ return bpf_skb_is_valid_access(off, size, type, prog, info); /* remaining checks are left to bpf_skb_is_valid_access() */
+}
+
static bool lwt_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
@@ -5266,6 +8918,9 @@ static bool lwt_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ case bpf_ctx_range(struct __sk_buff, wire_len):
+ case bpf_ctx_range(struct __sk_buff, hwtstamp):
return false;
}
@@ -5303,6 +8958,7 @@ static bool __sock_filter_check_attach_type(int off,
case offsetof(struct bpf_sock, priority):
switch (attach_type) {
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_INET_SOCK_RELEASE:
goto full_access;
default:
return false;
@@ -5336,16 +8992,50 @@ full_access:
return true;
}
-static bool __sock_filter_check_size(int off, int size,
+bool bpf_sock_common_is_valid_access(int off, int size,
+ enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
+ switch (off) { /* same rules as bpf_sock_is_valid_access(), minus one field range */
+ case bpf_ctx_range_till(struct bpf_sock, type, priority):
+ return false; /* offsets from 'type' through 'priority' are refused here */
+ default:
+ return bpf_sock_is_valid_access(off, size, type, info); /* everything else is decided by bpf_sock_is_valid_access() */
+ }
+}
+
+bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
const int size_default = sizeof(__u32);
+ int field_size;
+
+ if (off < 0 || off >= sizeof(struct bpf_sock))
+ return false;
+ if (off % size != 0)
+ return false;
switch (off) {
+ case offsetof(struct bpf_sock, state):
+ case offsetof(struct bpf_sock, family):
+ case offsetof(struct bpf_sock, type):
+ case offsetof(struct bpf_sock, protocol):
+ case offsetof(struct bpf_sock, src_port):
+ case offsetof(struct bpf_sock, rx_queue_mapping):
case bpf_ctx_range(struct bpf_sock, src_ip4):
case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
+ case bpf_ctx_range(struct bpf_sock, dst_ip4):
+ case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
bpf_ctx_record_field_size(info, size_default);
return bpf_ctx_narrow_access_ok(off, size, size_default);
+ case bpf_ctx_range(struct bpf_sock, dst_port):
+ field_size = size == size_default ?
+ size_default : sizeof_field(struct bpf_sock, dst_port);
+ bpf_ctx_record_field_size(info, field_size);
+ return bpf_ctx_narrow_access_ok(off, size, field_size);
+ case offsetofend(struct bpf_sock, dst_port) ...
+ offsetof(struct bpf_sock, dst_ip4) - 1:
+ return false;
}
return size == size_default;
@@ -5356,16 +9046,19 @@ static bool sock_filter_is_valid_access(int off, int size,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
- if (off < 0 || off >= sizeof(struct bpf_sock))
- return false;
- if (off % size != 0)
+ if (!bpf_sock_is_valid_access(off, size, type, info))
return false;
- if (!__sock_filter_check_attach_type(off, type,
- prog->expected_attach_type))
- return false;
- if (!__sock_filter_check_size(off, size, info))
- return false;
- return true;
+ return __sock_filter_check_attach_type(off, type,
+ prog->expected_attach_type);
+}
+
+static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write,
+ const struct bpf_prog *prog)
+{
+ /* Neither direct read nor direct write requires any preliminary
+ * action, so insn_buf is left untouched and 0 is returned.
+ */
+ return 0;
}
static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
@@ -5382,7 +9075,7 @@ static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write,
* (Fast-path, otherwise approximation that we might be
* a clone, do the rest in helper.)
*/
- *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET());
+ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET);
*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK);
*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7);
@@ -5413,8 +9106,6 @@ static int bpf_gen_ld_abs(const struct bpf_insn *orig,
bool indirect = BPF_MODE(orig->code) == BPF_IND;
struct bpf_insn *insn = insn_buf;
- /* We're guaranteed here that CTX is in R6. */
- *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
if (!indirect) {
*insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
} else {
@@ -5422,6 +9113,8 @@ static int bpf_gen_ld_abs(const struct bpf_insn *orig,
if (orig->imm)
*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
}
+ /* We're guaranteed here that CTX is in R6. */
+ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
switch (BPF_SIZE(orig->code)) {
case BPF_B:
@@ -5460,6 +9153,8 @@ static bool tc_cls_act_is_valid_access(int off, int size,
case bpf_ctx_range(struct __sk_buff, priority):
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ case bpf_ctx_range(struct __sk_buff, queue_mapping):
break;
default:
return false;
@@ -5478,11 +9173,42 @@ static bool tc_cls_act_is_valid_access(int off, int size,
break;
case bpf_ctx_range_till(struct __sk_buff, family, local_port):
return false;
+ case offsetof(struct __sk_buff, tstamp_type):
+ /* The convert_ctx_access() on reading and writing
+ * __sk_buff->tstamp depends on whether the bpf prog
+ * has used __sk_buff->tstamp_type or not.
+ * Thus, we need to set prog->tstamp_type_access
+ * earlier during is_valid_access() here.
+ */
+ ((struct bpf_prog *)prog)->tstamp_type_access = 1;
+ return size == sizeof(__u8);
}
return bpf_skb_is_valid_access(off, size, type, prog, info);
}
+DEFINE_MUTEX(nf_conn_btf_access_lock); /* held by the *_btf_struct_access() wrappers in this file while they test and call the hook below */
+EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock);
+
+int (*nfct_btf_struct_access)(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size); /* no initializer, so NULL until assigned; users in this file check for NULL first */
+EXPORT_SYMBOL_GPL(nfct_btf_struct_access);
+
+static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ int ret = -EACCES; /* result when no nfct_btf_struct_access hook is set */
+
+ mutex_lock(&nf_conn_btf_access_lock); /* lock spans both the NULL test and the call */
+ if (nfct_btf_struct_access)
+ ret = nfct_btf_struct_access(log, reg, off, size); /* forward all arguments unchanged to the hook */
+ mutex_unlock(&nf_conn_btf_access_lock);
+
+ return ret;
+}
+
static bool __is_valid_xdp_access(int off, int size)
{
if (off < 0 || off >= sizeof(struct xdp_md))
@@ -5500,14 +9226,29 @@ static bool xdp_is_valid_access(int off, int size,
const struct bpf_prog *prog,
struct bpf_insn_access_aux *info)
{
+ if (prog->expected_attach_type != BPF_XDP_DEVMAP) {
+ switch (off) {
+ case offsetof(struct xdp_md, egress_ifindex):
+ return false;
+ }
+ }
+
if (type == BPF_WRITE) {
- if (bpf_prog_is_dev_bound(prog->aux)) {
+ if (bpf_prog_is_offloaded(prog->aux)) {
switch (off) {
case offsetof(struct xdp_md, rx_queue_index):
return __is_valid_xdp_access(off, size);
}
}
return false;
+ } else {
+ switch (off) {
+ case offsetof(struct xdp_md, data_meta):
+ case offsetof(struct xdp_md, data):
+ case offsetof(struct xdp_md, data_end):
+ if (info->is_ldsx)
+ return false;
+ }
}
switch (off) {
@@ -5525,16 +9266,31 @@ static bool xdp_is_valid_access(int off, int size,
return __is_valid_xdp_access(off, size);
}
-void bpf_warn_invalid_xdp_action(u32 act)
+void bpf_warn_invalid_xdp_action(const struct net_device *dev,
+ const struct bpf_prog *prog, u32 act) /* dev may be NULL ("N/A" is printed); prog is dereferenced unconditionally */
{
const u32 act_max = XDP_REDIRECT;
- WARN_ONCE(1, "%s XDP return value %u, expect packet loss!\n",
- act > act_max ? "Illegal" : "Driver unsupported",
- act);
+ pr_warn_once("%s XDP return value %u on prog %s (id %d) dev %s, expect packet loss!\n",
+ act > act_max ? "Illegal" : "Driver unsupported", /* values above XDP_REDIRECT are reported as "Illegal" */
+ act, prog->aux->name, prog->aux->id, dev ? dev->name : "N/A");
}
EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action);
+static int xdp_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ int ret = -EACCES; /* result when no nfct_btf_struct_access hook is set */
+
+ mutex_lock(&nf_conn_btf_access_lock); /* lock spans both the NULL test and the call */
+ if (nfct_btf_struct_access)
+ ret = nfct_btf_struct_access(log, reg, off, size); /* forward all arguments unchanged to the hook */
+ mutex_unlock(&nf_conn_btf_access_lock);
+
+ return ret;
+}
+
static bool sock_addr_is_valid_access(int off, int size,
enum bpf_access_type type,
const struct bpf_prog *prog,
@@ -5547,15 +9303,18 @@ static bool sock_addr_is_valid_access(int off, int size,
if (off % size != 0)
return false;
- /* Disallow access to IPv6 fields from IPv4 contex and vise
- * versa.
+ /* Disallow access to fields not belonging to the attach type's address
+ * family.
*/
switch (off) {
case bpf_ctx_range(struct bpf_sock_addr, user_ip4):
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET4_BIND:
case BPF_CGROUP_INET4_CONNECT:
+ case BPF_CGROUP_INET4_GETPEERNAME:
+ case BPF_CGROUP_INET4_GETSOCKNAME:
case BPF_CGROUP_UDP4_SENDMSG:
+ case BPF_CGROUP_UDP4_RECVMSG:
break;
default:
return false;
@@ -5565,7 +9324,10 @@ static bool sock_addr_is_valid_access(int off, int size,
switch (prog->expected_attach_type) {
case BPF_CGROUP_INET6_BIND:
case BPF_CGROUP_INET6_CONNECT:
+ case BPF_CGROUP_INET6_GETPEERNAME:
+ case BPF_CGROUP_INET6_GETSOCKNAME:
case BPF_CGROUP_UDP6_SENDMSG:
+ case BPF_CGROUP_UDP6_RECVMSG:
break;
default:
return false;
@@ -5596,27 +9358,55 @@ static bool sock_addr_is_valid_access(int off, int size,
case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4):
case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0],
msg_src_ip6[3]):
- /* Only narrow read access allowed for now. */
+ case bpf_ctx_range(struct bpf_sock_addr, user_port):
if (type == BPF_READ) {
bpf_ctx_record_field_size(info, size_default);
+
+ if (bpf_ctx_wide_access_ok(off, size,
+ struct bpf_sock_addr,
+ user_ip6))
+ return true;
+
+ if (bpf_ctx_wide_access_ok(off, size,
+ struct bpf_sock_addr,
+ msg_src_ip6))
+ return true;
+
if (!bpf_ctx_narrow_access_ok(off, size, size_default))
return false;
} else {
+ if (bpf_ctx_wide_access_ok(off, size,
+ struct bpf_sock_addr,
+ user_ip6))
+ return true;
+
+ if (bpf_ctx_wide_access_ok(off, size,
+ struct bpf_sock_addr,
+ msg_src_ip6))
+ return true;
+
if (size != size_default)
return false;
}
break;
- case bpf_ctx_range(struct bpf_sock_addr, user_port):
+ case bpf_ctx_range_ptr(struct bpf_sock_addr, sk):
+ if (type != BPF_READ)
+ return false;
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET;
+ break;
+ case bpf_ctx_range(struct bpf_sock_addr, user_family):
+ case bpf_ctx_range(struct bpf_sock_addr, family):
+ case bpf_ctx_range(struct bpf_sock_addr, type):
+ case bpf_ctx_range(struct bpf_sock_addr, protocol):
+ if (type != BPF_READ)
+ return false;
if (size != size_default)
return false;
break;
default:
- if (type == BPF_READ) {
- if (size != size_default)
- return false;
- } else {
- return false;
- }
+ return false;
}
return true;
@@ -5653,6 +9443,29 @@ static bool sock_ops_is_valid_access(int off, int size,
if (size != sizeof(__u64))
return false;
break;
+ case bpf_ctx_range_ptr(struct bpf_sock_ops, sk):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET_OR_NULL;
+ break;
+ case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET;
+ break;
+ case bpf_ctx_range_ptr(struct bpf_sock_ops, skb_data_end):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_PACKET_END;
+ break;
+ case offsetof(struct bpf_sock_ops, skb_tcp_flags):
+ bpf_ctx_record_field_size(info, size_default);
+ return bpf_ctx_narrow_access_ok(off, size,
+ size_default);
+ case bpf_ctx_range(struct bpf_sock_ops, skb_hwtstamp):
+ if (size != sizeof(__u64))
+ return false;
+ break;
default:
if (size != size_default)
return false;
@@ -5677,6 +9490,9 @@ static bool sk_skb_is_valid_access(int off, int size,
switch (off) {
case bpf_ctx_range(struct __sk_buff, tc_classid):
case bpf_ctx_range(struct __sk_buff, data_meta):
+ case bpf_ctx_range(struct __sk_buff, tstamp):
+ case bpf_ctx_range(struct __sk_buff, wire_len):
+ case bpf_ctx_range(struct __sk_buff, hwtstamp):
return false;
}
@@ -5712,30 +9528,225 @@ static bool sk_msg_is_valid_access(int off, int size,
if (type == BPF_WRITE)
return false;
+ if (off % size != 0)
+ return false;
+
switch (off) {
- case offsetof(struct sk_msg_md, data):
+ case bpf_ctx_range_ptr(struct sk_msg_md, data):
info->reg_type = PTR_TO_PACKET;
if (size != sizeof(__u64))
return false;
break;
- case offsetof(struct sk_msg_md, data_end):
+ case bpf_ctx_range_ptr(struct sk_msg_md, data_end):
info->reg_type = PTR_TO_PACKET_END;
if (size != sizeof(__u64))
return false;
break;
- default:
+ case bpf_ctx_range_ptr(struct sk_msg_md, sk):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET;
+ break;
+ case bpf_ctx_range(struct sk_msg_md, family):
+ case bpf_ctx_range(struct sk_msg_md, remote_ip4):
+ case bpf_ctx_range(struct sk_msg_md, local_ip4):
+ case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]):
+ case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]):
+ case bpf_ctx_range(struct sk_msg_md, remote_port):
+ case bpf_ctx_range(struct sk_msg_md, local_port):
+ case bpf_ctx_range(struct sk_msg_md, size):
if (size != sizeof(__u32))
return false;
+ break;
+ default:
+ return false;
}
+ return true;
+}
+
+static bool flow_dissector_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ const int size_default = sizeof(__u32);
- if (off < 0 || off >= sizeof(struct sk_msg_md))
+ if (off < 0 || off >= sizeof(struct __sk_buff))
return false;
+
if (off % size != 0)
return false;
- return true;
+ if (type == BPF_WRITE)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range(struct __sk_buff, data):
+ if (info->is_ldsx || size != size_default)
+ return false;
+ info->reg_type = PTR_TO_PACKET;
+ return true;
+ case bpf_ctx_range(struct __sk_buff, data_end):
+ if (info->is_ldsx || size != size_default)
+ return false;
+ info->reg_type = PTR_TO_PACKET_END;
+ return true;
+ case bpf_ctx_range_ptr(struct __sk_buff, flow_keys):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_FLOW_KEYS;
+ return true;
+ default:
+ return false;
+ }
+}
+
+static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct __sk_buff, data):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_flow_dissector, data));
+ break;
+
+ case offsetof(struct __sk_buff, data_end):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, data_end),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_flow_dissector, data_end));
+ break;
+
+ case offsetof(struct __sk_buff, flow_keys):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_flow_dissector, flow_keys),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_flow_dissector, flow_keys));
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ __u8 value_reg = si->dst_reg;
+ __u8 skb_reg = si->src_reg;
+ BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI);
+ BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME);
+ BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC);
+ BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI);
+ *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
+ *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK);
+#ifdef __BIG_ENDIAN_BITFIELD
+ *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT);
+#else
+ BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1));
+#endif
+
+ return insn;
+}
+
+static struct bpf_insn *bpf_convert_shinfo_access(__u8 dst_reg, __u8 skb_reg,
+ struct bpf_insn *insn)
+{
+ /* si->dst_reg = skb_shinfo(SKB); */
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
+ BPF_REG_AX, skb_reg,
+ offsetof(struct sk_buff, end));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head),
+ dst_reg, skb_reg,
+ offsetof(struct sk_buff, head));
+ *insn++ = BPF_ALU64_REG(BPF_ADD, dst_reg, BPF_REG_AX);
+#else
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end),
+ dst_reg, skb_reg,
+ offsetof(struct sk_buff, end));
+#endif
+
+ return insn;
+}
+
+static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ __u8 value_reg = si->dst_reg;
+ __u8 skb_reg = si->src_reg;
+
+#ifdef CONFIG_NET_XGRESS
+ /* If the tstamp_type is read,
+ * the bpf prog is aware the tstamp could have delivery time.
+ * Thus, read skb->tstamp as is if tstamp_type_access is true.
+ */
+ if (!prog->tstamp_type_access) {
+ /* AX is needed because src_reg and dst_reg could be the same */
+ __u8 tmp_reg = BPF_REG_AX;
+
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
+ /* check if the ingress mask bit is set */
+ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
+ *insn++ = BPF_JMP_A(4);
+ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1);
+ *insn++ = BPF_JMP_A(2);
+ /* skb->tc_at_ingress && skb->tstamp_type,
+ * read 0 as the (rcv) timestamp.
+ */
+ *insn++ = BPF_MOV64_IMM(value_reg, 0);
+ *insn++ = BPF_JMP_A(1);
+ }
+#endif
+
+ *insn++ = BPF_LDX_MEM(BPF_DW, value_reg, skb_reg,
+ offsetof(struct sk_buff, tstamp));
+ return insn;
+}
+
+static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ __u8 value_reg = si->src_reg;
+ __u8 skb_reg = si->dst_reg;
+
+#ifdef CONFIG_NET_XGRESS
+ /* If the tstamp_type is read,
+ * the bpf prog is aware the tstamp could have delivery time.
+ * Thus, write skb->tstamp as is if tstamp_type_access is true.
+ * Otherwise, writing at ingress will have to clear the
+ * skb->tstamp_type bit also.
+ */
+ if (!prog->tstamp_type_access) {
+ __u8 tmp_reg = BPF_REG_AX;
+
+ *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET);
+ /* Writing __sk_buff->tstamp at ingress, goto <clear> */
+ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1);
+ /* goto <store> */
+ *insn++ = BPF_JMP_A(2);
+ /* <clear>: skb->tstamp_type */
+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK);
+ *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET);
+ }
+#endif
+
+ /* <store>: skb->tstamp = tstamp */
+ *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_DW | BPF_MEM,
+ skb_reg, value_reg, offsetof(struct sk_buff, tstamp), si->imm);
+ return insn;
}
+#define BPF_EMIT_STORE(size, si, off) \
+ BPF_RAW_INSN(BPF_CLASS((si)->code) | (size) | BPF_MEM, \
+ (si)->dst_reg, (si)->src_reg, (off), (si)->imm)
+
static u32 bpf_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -5765,9 +9776,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, priority):
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
- bpf_target_off(struct sk_buff, priority, 4,
- target_size));
+ *insn++ = BPF_EMIT_STORE(BPF_W, si,
+ bpf_target_off(struct sk_buff, priority, 4,
+ target_size));
else
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
bpf_target_off(struct sk_buff, priority, 4,
@@ -5798,9 +9809,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, mark):
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
- bpf_target_off(struct sk_buff, mark, 4,
- target_size));
+ *insn++ = BPF_EMIT_STORE(BPF_W, si,
+ bpf_target_off(struct sk_buff, mark, 4,
+ target_size));
else
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
bpf_target_off(struct sk_buff, mark, 4,
@@ -5810,7 +9821,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, pkt_type):
*target_size = 1;
*insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg,
- PKT_TYPE_OFFSET());
+ PKT_TYPE_OFFSET);
*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX);
#ifdef __BIG_ENDIAN_BITFIELD
*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5);
@@ -5818,30 +9829,42 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct __sk_buff, queue_mapping):
- *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
- bpf_target_off(struct sk_buff, queue_mapping, 2,
- target_size));
+ if (type == BPF_WRITE) {
+ u32 offset = bpf_target_off(struct sk_buff, queue_mapping, 2, target_size);
+
+ if (BPF_CLASS(si->code) == BPF_ST && si->imm >= NO_QUEUE_MAPPING) {
+ *insn++ = BPF_JMP_A(0); /* noop */
+ break;
+ }
+
+ if (BPF_CLASS(si->code) == BPF_STX)
+ *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, NO_QUEUE_MAPPING, 1);
+ *insn++ = BPF_EMIT_STORE(BPF_H, si, offset);
+ } else {
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff,
+ queue_mapping,
+ 2, target_size));
+ }
break;
case offsetof(struct __sk_buff, vlan_present):
- case offsetof(struct __sk_buff, vlan_tci):
- BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000);
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct sk_buff,
+ vlan_all, 4, target_size));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_ALU32_IMM(BPF_MOV, si->dst_reg, 1);
+ break;
+ case offsetof(struct __sk_buff, vlan_tci):
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
bpf_target_off(struct sk_buff, vlan_tci, 2,
target_size));
- if (si->off == offsetof(struct __sk_buff, vlan_tci)) {
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg,
- ~VLAN_TAG_PRESENT);
- } else {
- *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 12);
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1);
- }
break;
case offsetof(struct __sk_buff, cb[0]) ...
offsetofend(struct __sk_buff, cb[4]) - 1:
- BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20);
+ BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, data) < 20);
BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
offsetof(struct qdisc_skb_cb, data)) %
sizeof(__u64));
@@ -5852,15 +9875,14 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
off += offsetof(struct sk_buff, cb);
off += offsetof(struct qdisc_skb_cb, data);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg,
- si->src_reg, off);
+ *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off);
else
*insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
si->src_reg, off);
break;
case offsetof(struct __sk_buff, tc_classid):
- BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2);
+ BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, tc_classid) != 2);
off = si->off;
off -= offsetof(struct __sk_buff, tc_classid);
@@ -5868,8 +9890,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
off += offsetof(struct qdisc_skb_cb, tc_classid);
*target_size = 2;
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg,
- si->src_reg, off);
+ *insn++ = BPF_EMIT_STORE(BPF_H, si, off);
else
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg,
si->src_reg, off);
@@ -5902,9 +9923,9 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, tc_index):
#ifdef CONFIG_NET_SCHED
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg,
- bpf_target_off(struct sk_buff, tc_index, 2,
- target_size));
+ *insn++ = BPF_EMIT_STORE(BPF_H, si,
+ bpf_target_off(struct sk_buff, tc_index, 2,
+ target_size));
else
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
bpf_target_off(struct sk_buff, tc_index, 2,
@@ -5931,7 +9952,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
#endif
break;
case offsetof(struct __sk_buff, family):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
si->dst_reg, si->src_reg,
@@ -5942,7 +9963,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
2, target_size));
break;
case offsetof(struct __sk_buff, remote_ip4):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
si->dst_reg, si->src_reg,
@@ -5953,7 +9974,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
4, target_size));
break;
case offsetof(struct __sk_buff, local_ip4):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_rcv_saddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
@@ -5967,7 +9988,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, remote_ip6[0]) ...
offsetof(struct __sk_buff, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_v6_daddr.s6_addr32[0]) != 4);
off = si->off;
@@ -5987,7 +10008,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct __sk_buff, local_ip6[0]) ...
offsetof(struct __sk_buff, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_v6_rcv_saddr.s6_addr32[0]) != 4);
off = si->off;
@@ -6006,7 +10027,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct __sk_buff, remote_port):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
si->dst_reg, si->src_reg,
@@ -6021,7 +10042,7 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct __sk_buff, local_port):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
si->dst_reg, si->src_reg,
@@ -6030,83 +10051,157 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
bpf_target_off(struct sock_common,
skc_num, 2, target_size));
break;
+
+ case offsetof(struct __sk_buff, tstamp):
+ BUILD_BUG_ON(sizeof_field(struct sk_buff, tstamp) != 8);
+
+ if (type == BPF_WRITE)
+ insn = bpf_convert_tstamp_write(prog, si, insn);
+ else
+ insn = bpf_convert_tstamp_read(prog, si, insn);
+ break;
+
+ case offsetof(struct __sk_buff, tstamp_type):
+ insn = bpf_convert_tstamp_type_read(si, insn);
+ break;
+
+ case offsetof(struct __sk_buff, gso_segs):
+ insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs),
+ si->dst_reg, si->dst_reg,
+ bpf_target_off(struct skb_shared_info,
+ gso_segs, 2,
+ target_size));
+ break;
+ case offsetof(struct __sk_buff, gso_size):
+ insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_size),
+ si->dst_reg, si->dst_reg,
+ bpf_target_off(struct skb_shared_info,
+ gso_size, 2,
+ target_size));
+ break;
+ case offsetof(struct __sk_buff, wire_len):
+ BUILD_BUG_ON(sizeof_field(struct qdisc_skb_cb, pkt_len) != 4);
+
+ off = si->off;
+ off -= offsetof(struct __sk_buff, wire_len);
+ off += offsetof(struct sk_buff, cb);
+ off += offsetof(struct qdisc_skb_cb, pkt_len);
+ *target_size = 4;
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off);
+ break;
+
+ case offsetof(struct __sk_buff, sk):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_buff, sk));
+ break;
+ case offsetof(struct __sk_buff, hwtstamp):
+ BUILD_BUG_ON(sizeof_field(struct skb_shared_hwtstamps, hwtstamp) != 8);
+ BUILD_BUG_ON(offsetof(struct skb_shared_hwtstamps, hwtstamp) != 0);
+
+ insn = bpf_convert_shinfo_access(si->dst_reg, si->src_reg, insn);
+ *insn++ = BPF_LDX_MEM(BPF_DW,
+ si->dst_reg, si->dst_reg,
+ bpf_target_off(struct skb_shared_info,
+ hwtstamps, 8,
+ target_size));
+ break;
}
return insn - insn_buf;
}
-static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
- const struct bpf_insn *si,
- struct bpf_insn *insn_buf,
- struct bpf_prog *prog, u32 *target_size)
+u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
int off;
switch (si->off) {
case offsetof(struct bpf_sock, bound_dev_if):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_bound_dev_if) != 4);
+ BUILD_BUG_ON(sizeof_field(struct sock, sk_bound_dev_if) != 4);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sock, sk_bound_dev_if));
+ *insn++ = BPF_EMIT_STORE(BPF_W, si,
+ offsetof(struct sock, sk_bound_dev_if));
else
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sock, sk_bound_dev_if));
break;
case offsetof(struct bpf_sock, mark):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_mark) != 4);
+ BUILD_BUG_ON(sizeof_field(struct sock, sk_mark) != 4);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sock, sk_mark));
+ *insn++ = BPF_EMIT_STORE(BPF_W, si,
+ offsetof(struct sock, sk_mark));
else
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sock, sk_mark));
break;
case offsetof(struct bpf_sock, priority):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4);
+ BUILD_BUG_ON(sizeof_field(struct sock, sk_priority) != 4);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sock, sk_priority));
+ *insn++ = BPF_EMIT_STORE(BPF_W, si,
+ offsetof(struct sock, sk_priority));
else
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sock, sk_priority));
break;
case offsetof(struct bpf_sock, family):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
-
- *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
- offsetof(struct sock, sk_family));
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock_common, skc_family),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common,
+ skc_family,
+ sizeof_field(struct sock_common,
+ skc_family),
+ target_size));
break;
case offsetof(struct bpf_sock, type):
- *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sock, __sk_flags_offset));
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
- *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock, sk_type),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock, sk_type,
+ sizeof_field(struct sock, sk_type),
+ target_size));
break;
case offsetof(struct bpf_sock, protocol):
- *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
- offsetof(struct sock, __sk_flags_offset));
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
- *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock, sk_protocol),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock, sk_protocol,
+ sizeof_field(struct sock, sk_protocol),
+ target_size));
break;
case offsetof(struct bpf_sock, src_ip4):
*insn++ = BPF_LDX_MEM(
BPF_SIZE(si->code), si->dst_reg, si->src_reg,
bpf_target_off(struct sock_common, skc_rcv_saddr,
- FIELD_SIZEOF(struct sock_common,
+ sizeof_field(struct sock_common,
skc_rcv_saddr),
target_size));
break;
+ case offsetof(struct bpf_sock, dst_ip4):
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common, skc_daddr,
+ sizeof_field(struct sock_common,
+ skc_daddr),
+ target_size));
+ break;
+
case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
off = si->off;
@@ -6116,7 +10211,7 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
bpf_target_off(
struct sock_common,
skc_v6_rcv_saddr.s6_addr32[0],
- FIELD_SIZEOF(struct sock_common,
+ sizeof_field(struct sock_common,
skc_v6_rcv_saddr.s6_addr32[0]),
target_size) + off);
#else
@@ -6125,15 +10220,69 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type,
#endif
break;
+ case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
+#if IS_ENABLED(CONFIG_IPV6)
+ off = si->off;
+ off -= offsetof(struct bpf_sock, dst_ip6[0]);
+ *insn++ = BPF_LDX_MEM(
+ BPF_SIZE(si->code), si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common,
+ skc_v6_daddr.s6_addr32[0],
+ sizeof_field(struct sock_common,
+ skc_v6_daddr.s6_addr32[0]),
+ target_size) + off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+ *target_size = 4;
+#endif
+ break;
+
case offsetof(struct bpf_sock, src_port):
*insn++ = BPF_LDX_MEM(
BPF_FIELD_SIZEOF(struct sock_common, skc_num),
si->dst_reg, si->src_reg,
bpf_target_off(struct sock_common, skc_num,
- FIELD_SIZEOF(struct sock_common,
+ sizeof_field(struct sock_common,
skc_num),
target_size));
break;
+
+ case offsetof(struct bpf_sock, dst_port):
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock_common, skc_dport),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common, skc_dport,
+ sizeof_field(struct sock_common,
+ skc_dport),
+ target_size));
+ break;
+
+ case offsetof(struct bpf_sock, state):
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock_common, skc_state),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock_common, skc_state,
+ sizeof_field(struct sock_common,
+ skc_state),
+ target_size));
+ break;
+ case offsetof(struct bpf_sock, rx_queue_mapping):
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
+ *insn++ = BPF_LDX_MEM(
+ BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping),
+ si->dst_reg, si->src_reg,
+ bpf_target_off(struct sock, sk_rx_queue_mapping,
+ sizeof_field(struct sock,
+ sk_rx_queue_mapping),
+ target_size));
+ *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING,
+ 1);
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
+#else
+ *insn++ = BPF_MOV64_IMM(si->dst_reg, -1);
+ *target_size = 2;
+#endif
+ break;
}
return insn - insn_buf;
@@ -6204,6 +10353,16 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
offsetof(struct xdp_rxq_info,
queue_index));
break;
+ case offsetof(struct xdp_md, egress_ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq),
+ si->dst_reg, si->src_reg,
+ offsetof(struct xdp_buff, txq));
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct xdp_txq_info, dev));
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+ offsetof(struct net_device, ifindex));
+ break;
}
return insn - insn_buf;
@@ -6225,7 +10384,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
si->src_reg, offsetof(S, F)); \
*insn++ = BPF_LDX_MEM( \
SIZE, si->dst_reg, si->dst_reg, \
- bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \
+ bpf_target_off(NS, NF, sizeof_field(NS, NF), \
target_size) \
+ OFF); \
} while (0)
@@ -6237,9 +10396,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to
* SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation.
*
- * It doesn't support SIZE argument though since narrow stores are not
- * supported for now.
- *
* In addition it uses Temporary Field TF (member of struct S) as the 3rd
* "register" since two registers available in convert_ctx_access are not
* enough: we can't override neither SRC, since it contains value to store, nor
@@ -6247,7 +10403,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
* instructions. But we need a temporary place to save pointer to nested
* structure whose field we want to store to.
*/
-#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \
+#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \
do { \
int tmp_reg = BPF_REG_9; \
if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \
@@ -6258,11 +10414,12 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
offsetof(S, TF)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \
si->dst_reg, offsetof(S, F)); \
- *insn++ = BPF_STX_MEM( \
- BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \
- bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \
+ *insn++ = BPF_RAW_INSN(SIZE | BPF_MEM | BPF_CLASS(si->code), \
+ tmp_reg, si->src_reg, \
+ bpf_target_off(NS, NF, sizeof_field(NS, NF), \
target_size) \
- + OFF); \
+ + OFF, \
+ si->imm); \
*insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \
offsetof(S, TF)); \
} while (0)
@@ -6271,25 +10428,21 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
TF) \
do { \
if (type == BPF_WRITE) { \
- SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \
- TF); \
+ SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \
+ OFF, TF); \
} else { \
SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \
S, NS, F, NF, SIZE, OFF); \
} \
} while (0)
-#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \
- SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \
- S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF)
-
static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog, u32 *target_size)
{
+ int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port);
struct bpf_insn *insn = insn_buf;
- int off;
switch (si->off) {
case offsetof(struct bpf_sock_addr, user_family):
@@ -6322,11 +10475,13 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
*/
BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) !=
offsetof(struct sockaddr_in6, sin6_port));
- BUILD_BUG_ON(FIELD_SIZEOF(struct sockaddr_in, sin_port) !=
- FIELD_SIZEOF(struct sockaddr_in6, sin6_port));
- SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern,
- struct sockaddr_in6, uaddr,
- sin6_port, tmp_reg);
+ BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) !=
+ sizeof_field(struct sockaddr_in6, sin6_port));
+ /* Account for sin6_port being smaller than user_port. */
+ port_size = min(port_size, BPF_LDST_BYTES(si));
+ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(
+ struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr,
+ sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg);
break;
case offsetof(struct bpf_sock_addr, family):
@@ -6335,20 +10490,13 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_addr, type):
- SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(
- struct bpf_sock_addr_kern, struct sock, sk,
- __sk_flags_offset, BPF_W, 0);
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
- *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
+ struct sock, sk, sk_type);
break;
case offsetof(struct bpf_sock_addr, protocol):
- SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(
- struct bpf_sock_addr_kern, struct sock, sk,
- __sk_flags_offset, BPF_W, 0);
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
- *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
- SK_FL_PROTO_SHIFT);
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern,
+ struct sock, sk, sk_protocol);
break;
case offsetof(struct bpf_sock_addr, msg_src_ip4):
@@ -6367,6 +10515,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
break;
+ case offsetof(struct bpf_sock_addr, sk):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_addr_kern, sk));
+ break;
}
return insn - insn_buf;
@@ -6381,28 +10534,166 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
struct bpf_insn *insn = insn_buf;
int off;
+/* Helper macro for adding read access to tcp_sock or sock fields. */
+#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
+ do { \
+ int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2; \
+ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \
+ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ if (si->dst_reg == si->src_reg) { \
+ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ fullsock_reg = reg; \
+ jmp += 2; \
+ } \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, \
+ is_locked_tcp_sock), \
+ fullsock_reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ is_locked_tcp_sock)); \
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \
+ if (si->dst_reg == si->src_reg) \
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, sk),\
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, sk));\
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \
+ OBJ_FIELD), \
+ si->dst_reg, si->dst_reg, \
+ offsetof(OBJ, OBJ_FIELD)); \
+ if (si->dst_reg == si->src_reg) { \
+ *insn++ = BPF_JMP_A(1); \
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ } \
+ } while (0)
+
+#define SOCK_OPS_GET_SK() \
+ do { \
+ int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 1; \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ if (si->dst_reg == si->src_reg) { \
+ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ fullsock_reg = reg; \
+ jmp += 2; \
+ } \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, \
+ is_fullsock), \
+ fullsock_reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ is_fullsock)); \
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp); \
+ if (si->dst_reg == si->src_reg) \
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, sk),\
+ si->dst_reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, sk));\
+ if (si->dst_reg == si->src_reg) { \
+ *insn++ = BPF_JMP_A(1); \
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ } \
+ } while (0)
+
+#define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \
+ SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock)
+
+/* Helper macro for adding write access to tcp_sock or sock fields.
+ * The macro is called with two registers, dst_reg which contains a pointer
+ * to ctx (context) and src_reg which contains the value that should be
+ * stored. However, we need an additional register since we cannot overwrite
+ * dst_reg because it may be used later in the program.
+ * Instead we "borrow" one of the other registers. We first save its value
+ * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
+ * it at the end of the macro.
+ */
+#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
+ do { \
+ int reg = BPF_REG_9; \
+ BUILD_BUG_ON(sizeof_field(OBJ, OBJ_FIELD) > \
+ sizeof_field(struct bpf_sock_ops, BPF_FIELD)); \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ if (si->dst_reg == reg || si->src_reg == reg) \
+ reg--; \
+ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, \
+ is_locked_tcp_sock), \
+ reg, si->dst_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ is_locked_tcp_sock)); \
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct bpf_sock_ops_kern, sk),\
+ reg, si->dst_reg, \
+ offsetof(struct bpf_sock_ops_kern, sk));\
+ *insn++ = BPF_RAW_INSN(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD) | \
+ BPF_MEM | BPF_CLASS(si->code), \
+ reg, si->src_reg, \
+ offsetof(OBJ, OBJ_FIELD), \
+ si->imm); \
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \
+ offsetof(struct bpf_sock_ops_kern, \
+ temp)); \
+ } while (0)
+
+#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \
+ do { \
+ if (TYPE == BPF_WRITE) \
+ SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
+ else \
+ SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
+ } while (0)
+
switch (si->off) {
- case offsetof(struct bpf_sock_ops, op) ...
+ case offsetof(struct bpf_sock_ops, op):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ op),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, op));
+ break;
+
+ case offsetof(struct bpf_sock_ops, replylong[0]) ...
offsetof(struct bpf_sock_ops, replylong[3]):
- BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, op) !=
- FIELD_SIZEOF(struct bpf_sock_ops_kern, op));
- BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, reply) !=
- FIELD_SIZEOF(struct bpf_sock_ops_kern, reply));
- BUILD_BUG_ON(FIELD_SIZEOF(struct bpf_sock_ops, replylong) !=
- FIELD_SIZEOF(struct bpf_sock_ops_kern, replylong));
+ BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, reply) !=
+ sizeof_field(struct bpf_sock_ops_kern, reply));
+ BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, replylong) !=
+ sizeof_field(struct bpf_sock_ops_kern, replylong));
off = si->off;
- off -= offsetof(struct bpf_sock_ops, op);
- off += offsetof(struct bpf_sock_ops_kern, op);
+ off -= offsetof(struct bpf_sock_ops, replylong[0]);
+ off += offsetof(struct bpf_sock_ops_kern, replylong[0]);
if (type == BPF_WRITE)
- *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg,
- off);
+ *insn++ = BPF_EMIT_STORE(BPF_W, si, off);
else
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
off);
break;
case offsetof(struct bpf_sock_ops, family):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct bpf_sock_ops_kern, sk),
@@ -6413,7 +10704,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_ops, remote_ip4):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct bpf_sock_ops_kern, sk),
@@ -6424,7 +10715,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_ops, local_ip4):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_rcv_saddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
@@ -6439,7 +10730,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct bpf_sock_ops, remote_ip6[0]) ...
offsetof(struct bpf_sock_ops, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_v6_daddr.s6_addr32[0]) != 4);
off = si->off;
@@ -6460,7 +10751,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct bpf_sock_ops, local_ip6[0]) ...
offsetof(struct bpf_sock_ops, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_v6_rcv_saddr.s6_addr32[0]) != 4);
off = si->off;
@@ -6479,7 +10770,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_ops, remote_port):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct bpf_sock_ops_kern, sk),
@@ -6493,7 +10784,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_ops, local_port):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct bpf_sock_ops_kern, sk),
@@ -6513,7 +10804,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_ops, state):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_state) != 1);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
struct bpf_sock_ops_kern, sk),
@@ -6524,7 +10815,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock_ops, rtt_min):
- BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) !=
+ BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
sizeof(struct minmax));
BUILD_BUG_ON(sizeof(struct minmax) <
sizeof(struct minmax_sample));
@@ -6535,85 +10826,7 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
offsetof(struct bpf_sock_ops_kern, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct tcp_sock, rtt_min) +
- FIELD_SIZEOF(struct minmax_sample, t));
- break;
-
-/* Helper macro for adding read access to tcp_sock or sock fields. */
-#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
- do { \
- BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \
- FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
- struct bpf_sock_ops_kern, \
- is_fullsock), \
- si->dst_reg, si->src_reg, \
- offsetof(struct bpf_sock_ops_kern, \
- is_fullsock)); \
- *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
- struct bpf_sock_ops_kern, sk),\
- si->dst_reg, si->src_reg, \
- offsetof(struct bpf_sock_ops_kern, sk));\
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \
- OBJ_FIELD), \
- si->dst_reg, si->dst_reg, \
- offsetof(OBJ, OBJ_FIELD)); \
- } while (0)
-
-/* Helper macro for adding write access to tcp_sock or sock fields.
- * The macro is called with two registers, dst_reg which contains a pointer
- * to ctx (context) and src_reg which contains the value that should be
- * stored. However, we need an additional register since we cannot overwrite
- * dst_reg because it may be used later in the program.
- * Instead we "borrow" one of the other register. We first save its value
- * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
- * it at the end of the macro.
- */
-#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
- do { \
- int reg = BPF_REG_9; \
- BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \
- FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \
- if (si->dst_reg == reg || si->src_reg == reg) \
- reg--; \
- if (si->dst_reg == reg || si->src_reg == reg) \
- reg--; \
- *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \
- offsetof(struct bpf_sock_ops_kern, \
- temp)); \
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
- struct bpf_sock_ops_kern, \
- is_fullsock), \
- reg, si->dst_reg, \
- offsetof(struct bpf_sock_ops_kern, \
- is_fullsock)); \
- *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
- struct bpf_sock_ops_kern, sk),\
- reg, si->dst_reg, \
- offsetof(struct bpf_sock_ops_kern, sk));\
- *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \
- reg, si->src_reg, \
- offsetof(OBJ, OBJ_FIELD)); \
- *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \
- offsetof(struct bpf_sock_ops_kern, \
- temp)); \
- } while (0)
-
-#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \
- do { \
- if (TYPE == BPF_WRITE) \
- SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
- else \
- SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
- } while (0)
-
- case offsetof(struct bpf_sock_ops, snd_cwnd):
- SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock);
- break;
-
- case offsetof(struct bpf_sock_ops, srtt_us):
- SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock);
+ sizeof_field(struct minmax_sample, t));
break;
case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
@@ -6621,96 +10834,190 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
struct tcp_sock);
break;
+ case offsetof(struct bpf_sock_ops, sk_txhash):
+ SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
+ struct sock, type);
+ break;
+ case offsetof(struct bpf_sock_ops, snd_cwnd):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd);
+ break;
+ case offsetof(struct bpf_sock_ops, srtt_us):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us);
+ break;
case offsetof(struct bpf_sock_ops, snd_ssthresh):
- SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh);
break;
-
case offsetof(struct bpf_sock_ops, rcv_nxt):
- SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt);
break;
-
case offsetof(struct bpf_sock_ops, snd_nxt):
- SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt);
break;
-
case offsetof(struct bpf_sock_ops, snd_una):
- SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una);
break;
-
case offsetof(struct bpf_sock_ops, mss_cache):
- SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache);
break;
-
case offsetof(struct bpf_sock_ops, ecn_flags):
- SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags);
break;
-
case offsetof(struct bpf_sock_ops, rate_delivered):
- SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered,
- struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered);
break;
-
case offsetof(struct bpf_sock_ops, rate_interval_us):
- SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us,
- struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us);
break;
-
case offsetof(struct bpf_sock_ops, packets_out):
- SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out);
break;
-
case offsetof(struct bpf_sock_ops, retrans_out):
- SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out);
break;
-
case offsetof(struct bpf_sock_ops, total_retrans):
- SOCK_OPS_GET_FIELD(total_retrans, total_retrans,
- struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans);
break;
-
case offsetof(struct bpf_sock_ops, segs_in):
- SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in);
break;
-
case offsetof(struct bpf_sock_ops, data_segs_in):
- SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in);
break;
-
case offsetof(struct bpf_sock_ops, segs_out):
- SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out);
break;
-
case offsetof(struct bpf_sock_ops, data_segs_out):
- SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out,
- struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out);
break;
-
case offsetof(struct bpf_sock_ops, lost_out):
- SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out);
break;
-
case offsetof(struct bpf_sock_ops, sacked_out):
- SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out);
break;
-
- case offsetof(struct bpf_sock_ops, sk_txhash):
- SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
- struct sock, type);
- break;
-
case offsetof(struct bpf_sock_ops, bytes_received):
- SOCK_OPS_GET_FIELD(bytes_received, bytes_received,
- struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received);
break;
-
case offsetof(struct bpf_sock_ops, bytes_acked):
- SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock);
+ SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked);
+ break;
+ case offsetof(struct bpf_sock_ops, sk):
+ SOCK_OPS_GET_SK();
+ break;
+ case offsetof(struct bpf_sock_ops, skb_data_end):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb_data_end),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb_data_end));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_data):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct sk_buff, data));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_len):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
+ si->dst_reg, si->dst_reg,
+ offsetof(struct sk_buff, len));
+ break;
+ case offsetof(struct bpf_sock_ops, skb_tcp_flags):
+ off = offsetof(struct sk_buff, cb);
+ off += offsetof(struct tcp_skb_cb, tcp_flags);
+ *target_size = sizeof_field(struct tcp_skb_cb, tcp_flags);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_skb_cb,
+ tcp_flags),
+ si->dst_reg, si->dst_reg, off);
break;
+ case offsetof(struct bpf_sock_ops, skb_hwtstamp): {
+ struct bpf_insn *jmp_on_null_skb;
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_ops_kern,
+ skb),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ skb));
+ /* Reserve one insn to test skb == NULL */
+ jmp_on_null_skb = insn++;
+ insn = bpf_convert_shinfo_access(si->dst_reg, si->dst_reg, insn);
+ *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
+ bpf_target_off(struct skb_shared_info,
+ hwtstamps, 8,
+ target_size));
+ *jmp_on_null_skb = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0,
+ insn - jmp_on_null_skb - 1);
+ break;
+ }
}
return insn - insn_buf;
}
+/* Emit data_end = skb->data + skb_headlen(), built as data + len - data_len; returns the next free insn slot. */
+static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
+ struct bpf_insn *insn)
+{
+ int reg;
+ int temp_reg_off = offsetof(struct sk_buff, cb) +
+ offsetof(struct sk_skb_cb, temp_reg);
+
+ if (si->src_reg == si->dst_reg) {
+ /* src and dst share a register, so borrow a scratch one (BPF_REG_9, stepping down if it is the one in use) and spill it to temp_reg in skb->cb. */
+ reg = BPF_REG_9;
+ if (si->src_reg == reg || si->dst_reg == reg)
+ reg--;
+ if (si->src_reg == reg || si->dst_reg == reg)
+ reg--;
+ *insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg, temp_reg_off);
+ } else {
+ reg = si->dst_reg; /* dst is distinct from src, so the result is built in it directly */
+ }
+
+ /* reg = skb->data */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
+ reg, si->src_reg,
+ offsetof(struct sk_buff, data));
+ /* AX = skb->len */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, len));
+ /* reg = skb->data + skb->len */
+ *insn++ = BPF_ALU64_REG(BPF_ADD, reg, BPF_REG_AX);
+ /* AX = skb->data_len */
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
+ BPF_REG_AX, si->src_reg,
+ offsetof(struct sk_buff, data_len));
+
+ /* reg = skb->data + skb->len - skb->data_len */
+ *insn++ = BPF_ALU64_REG(BPF_SUB, reg, BPF_REG_AX);
+
+ if (si->src_reg == si->dst_reg) {
+ /* Keep the skb pointer in AX (dst_reg == src_reg is about to be overwritten), move the result into dst_reg, then reload the borrowed register from the spill slot. */
+ *insn++ = BPF_MOV64_REG(BPF_REG_AX, si->src_reg);
+ *insn++ = BPF_MOV64_REG(si->dst_reg, reg);
+ *insn++ = BPF_LDX_MEM(BPF_DW, reg, BPF_REG_AX, temp_reg_off);
+ }
+
+ return insn;
+}
+
static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
@@ -6721,13 +11028,28 @@ static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
switch (si->off) {
case offsetof(struct __sk_buff, data_end):
+ insn = bpf_convert_data_end_access(si, insn);
+ break;
+ case offsetof(struct __sk_buff, cb[0]) ...
+ offsetofend(struct __sk_buff, cb[4]) - 1:
+ BUILD_BUG_ON(sizeof_field(struct sk_skb_cb, data) < 20);
+ BUILD_BUG_ON((offsetof(struct sk_buff, cb) +
+ offsetof(struct sk_skb_cb, data)) %
+ sizeof(__u64));
+
+ prog->cb_access = 1;
off = si->off;
- off -= offsetof(struct __sk_buff, data_end);
+ off -= offsetof(struct __sk_buff, cb[0]);
off += offsetof(struct sk_buff, cb);
- off += offsetof(struct tcp_skb_cb, bpf.data_end);
- *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
- si->src_reg, off);
+ off += offsetof(struct sk_skb_cb, data);
+ if (type == BPF_WRITE)
+ *insn++ = BPF_EMIT_STORE(BPF_SIZE(si->code), si, off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg,
+ si->src_reg, off);
break;
+
+
default:
return bpf_convert_ctx_access(type, si, insn_buf, prog,
target_size);
@@ -6746,47 +11068,50 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
int off;
#endif
+ /* convert ctx uses the fact sg element is first in struct */
+ BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0);
+
switch (si->off) {
case offsetof(struct sk_msg_md, data):
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, data));
+ offsetof(struct sk_msg, data));
break;
case offsetof(struct sk_msg_md, data_end):
- *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_buff, data_end),
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data_end),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, data_end));
+ offsetof(struct sk_msg, data_end));
break;
case offsetof(struct sk_msg_md, family):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_family) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_family));
break;
case offsetof(struct sk_msg_md, remote_ip4):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_daddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_daddr));
break;
case offsetof(struct sk_msg_md, local_ip4):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_rcv_saddr) != 4);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct sock_common,
skc_rcv_saddr));
@@ -6795,15 +11120,15 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct sk_msg_md, remote_ip6[0]) ...
offsetof(struct sk_msg_md, remote_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_v6_daddr.s6_addr32[0]) != 4);
off = si->off;
off -= offsetof(struct sk_msg_md, remote_ip6[0]);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct sock_common,
skc_v6_daddr.s6_addr32[0]) +
@@ -6816,15 +11141,15 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct sk_msg_md, local_ip6[0]) ...
offsetof(struct sk_msg_md, local_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common,
+ BUILD_BUG_ON(sizeof_field(struct sock_common,
skc_v6_rcv_saddr.s6_addr32[0]) != 4);
off = si->off;
off -= offsetof(struct sk_msg_md, local_ip6[0]);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
offsetof(struct sock_common,
skc_v6_rcv_saddr.s6_addr32[0]) +
@@ -6835,12 +11160,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct sk_msg_md, remote_port):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_dport) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_dport));
#ifndef __BIG_ENDIAN_BITFIELD
@@ -6849,15 +11174,27 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct sk_msg_md, local_port):
- BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2);
+ BUILD_BUG_ON(sizeof_field(struct sock_common, skc_num) != 2);
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
- struct sk_msg_buff, sk),
+ struct sk_msg, sk),
si->dst_reg, si->src_reg,
- offsetof(struct sk_msg_buff, sk));
+ offsetof(struct sk_msg, sk));
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
offsetof(struct sock_common, skc_num));
break;
+
+ case offsetof(struct sk_msg_md, size):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg_sg, size),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg_sg, size));
+ break;
+
+ case offsetof(struct sk_msg_md, sk):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct sk_msg, sk));
+ break;
}
return insn - insn_buf;
@@ -6880,6 +11217,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = {
.convert_ctx_access = tc_cls_act_convert_ctx_access,
.gen_prologue = tc_cls_act_prologue,
.gen_ld_abs = bpf_gen_ld_abs,
+ .btf_struct_access = tc_cls_act_btf_struct_access,
};
const struct bpf_prog_ops tc_cls_act_prog_ops = {
@@ -6890,6 +11228,8 @@ const struct bpf_verifier_ops xdp_verifier_ops = {
.get_func_proto = xdp_func_proto,
.is_valid_access = xdp_is_valid_access,
.convert_ctx_access = xdp_convert_ctx_access,
+ .gen_prologue = bpf_noop_prologue,
+ .btf_struct_access = xdp_btf_struct_access,
};
const struct bpf_prog_ops xdp_prog_ops = {
@@ -6898,7 +11238,7 @@ const struct bpf_prog_ops xdp_prog_ops = {
const struct bpf_verifier_ops cg_skb_verifier_ops = {
.get_func_proto = cg_skb_func_proto,
- .is_valid_access = sk_filter_is_valid_access,
+ .is_valid_access = cg_skb_is_valid_access,
.convert_ctx_access = bpf_convert_ctx_access,
};
@@ -6944,13 +11284,12 @@ const struct bpf_verifier_ops lwt_seg6local_verifier_ops = {
};
const struct bpf_prog_ops lwt_seg6local_prog_ops = {
- .test_run = bpf_prog_test_run_skb,
};
const struct bpf_verifier_ops cg_sock_verifier_ops = {
.get_func_proto = sock_filter_func_proto,
.is_valid_access = sock_filter_is_valid_access,
- .convert_ctx_access = sock_filter_convert_ctx_access,
+ .convert_ctx_access = bpf_sock_convert_ctx_access,
};
const struct bpf_prog_ops cg_sock_prog_ops = {
@@ -6988,11 +11327,22 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = {
.get_func_proto = sk_msg_func_proto,
.is_valid_access = sk_msg_is_valid_access,
.convert_ctx_access = sk_msg_convert_ctx_access,
+ .gen_prologue = bpf_noop_prologue,
};
const struct bpf_prog_ops sk_msg_prog_ops = {
};
+const struct bpf_verifier_ops flow_dissector_verifier_ops = {
+ .get_func_proto = flow_dissector_func_proto,
+ .is_valid_access = flow_dissector_is_valid_access,
+ .convert_ctx_access = flow_dissector_convert_ctx_access,
+};
+
+const struct bpf_prog_ops flow_dissector_prog_ops = {
+ .test_run = bpf_prog_test_run_flow_dissector,
+};
+
int sk_detach_filter(struct sock *sk)
{
int ret = -ENOENT;
@@ -7013,14 +11363,13 @@ int sk_detach_filter(struct sock *sk)
}
EXPORT_SYMBOL_GPL(sk_detach_filter);
-int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
- unsigned int len)
+int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len)
{
struct sock_fprog_kern *fprog;
struct sk_filter *filter;
int ret = 0;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
filter = rcu_dereference_protected(sk->sk_filter,
lockdep_sock_is_held(sk));
if (!filter)
@@ -7045,7 +11394,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
goto out;
ret = -EFAULT;
- if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog)))
+ if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog)))
goto out;
/* Instead of bytes, the API requests to return the number
@@ -7053,29 +11402,21 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
*/
ret = fprog->len;
out:
- release_sock(sk);
+ sockopt_release_sock(sk);
return ret;
}
#ifdef CONFIG_INET
-struct sk_reuseport_kern {
- struct sk_buff *skb;
- struct sock *sk;
- struct sock *selected_sk;
- void *data_end;
- u32 hash;
- u32 reuseport_id;
- bool bind_inany;
-};
-
static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
struct sock_reuseport *reuse,
struct sock *sk, struct sk_buff *skb,
+ struct sock *migrating_sk,
u32 hash)
{
reuse_kern->skb = skb;
reuse_kern->sk = sk;
reuse_kern->selected_sk = NULL;
+ reuse_kern->migrating_sk = migrating_sk;
reuse_kern->data_end = skb->data + skb_headlen(skb);
reuse_kern->hash = hash;
reuse_kern->reuseport_id = reuse->reuseport_id;
@@ -7084,13 +11425,14 @@ static void bpf_init_reuseport_kern(struct sk_reuseport_kern *reuse_kern,
struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
struct bpf_prog *prog, struct sk_buff *skb,
+ struct sock *migrating_sk,
u32 hash)
{
struct sk_reuseport_kern reuse_kern;
enum sk_action action;
- bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash);
- action = BPF_PROG_RUN(prog, &reuse_kern);
+ bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, migrating_sk, hash);
+ action = bpf_prog_run(prog, &reuse_kern);
if (action == SK_PASS)
return reuse_kern.selected_sk;
@@ -7101,46 +11443,51 @@ struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern,
struct bpf_map *, map, void *, key, u32, flags)
{
+ bool is_sockarray = map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY;
struct sock_reuseport *reuse;
struct sock *selected_sk;
+ int err;
selected_sk = map->ops->map_lookup_elem(map, key);
if (!selected_sk)
return -ENOENT;
reuse = rcu_dereference(selected_sk->sk_reuseport_cb);
- if (!reuse)
- /* selected_sk is unhashed (e.g. by close()) after the
- * above map_lookup_elem(). Treat selected_sk has already
- * been removed from the map.
+ if (!reuse) {
+ /* reuseport_array has only sk with non NULL sk_reuseport_cb.
+ * The only (!reuse) case here is - the sk has already been
+ * unhashed (e.g. by close()), so treat it as -ENOENT.
+ *
+ * Other maps (e.g. sock_map) do not provide this guarantee and
+ * the sk may never be in the reuseport group to begin with.
*/
- return -ENOENT;
+ err = is_sockarray ? -ENOENT : -EINVAL;
+ goto error;
+ }
if (unlikely(reuse->reuseport_id != reuse_kern->reuseport_id)) {
- struct sock *sk;
-
- if (unlikely(!reuse_kern->reuseport_id))
- /* There is a small race between adding the
- * sk to the map and setting the
- * reuse_kern->reuseport_id.
- * Treat it as the sk has not been added to
- * the bpf map yet.
- */
- return -ENOENT;
-
- sk = reuse_kern->sk;
- if (sk->sk_protocol != selected_sk->sk_protocol)
- return -EPROTOTYPE;
- else if (sk->sk_family != selected_sk->sk_family)
- return -EAFNOSUPPORT;
+ struct sock *sk = reuse_kern->sk;
- /* Catch all. Likely bound to a different sockaddr. */
- return -EBADFD;
+ if (sk->sk_protocol != selected_sk->sk_protocol) {
+ err = -EPROTOTYPE;
+ } else if (sk->sk_family != selected_sk->sk_family) {
+ err = -EAFNOSUPPORT;
+ } else {
+ /* Catch all. Likely bound to a different sockaddr. */
+ err = -EBADFD;
+ }
+ goto error;
}
reuse_kern->selected_sk = selected_sk;
return 0;
+error:
+ /* Lookup in sock_map can return TCP ESTABLISHED sockets. */
+ if (sk_is_refcounted(selected_sk))
+ sock_put(selected_sk);
+
+ return err;
}
static const struct bpf_func_proto sk_select_reuseport_proto = {
@@ -7200,8 +11547,12 @@ sk_reuseport_func_proto(enum bpf_func_id func_id,
return &sk_reuseport_load_bytes_proto;
case BPF_FUNC_skb_load_bytes_relative:
return &sk_reuseport_load_bytes_relative_proto;
+ case BPF_FUNC_get_socket_cookie:
+ return &bpf_get_socket_ptr_cookie_proto;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
default:
- return bpf_base_func_proto(func_id);
+ return bpf_base_func_proto(func_id, prog);
}
}
@@ -7229,14 +11580,22 @@ sk_reuseport_is_valid_access(int off, int size,
case offsetof(struct sk_reuseport_md, hash):
return size == size_default;
+ case offsetof(struct sk_reuseport_md, sk):
+ info->reg_type = PTR_TO_SOCKET;
+ return size == sizeof(__u64);
+
+ case offsetof(struct sk_reuseport_md, migrating_sk):
+ info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
+ return size == sizeof(__u64);
+
/* Fields that allow narrowing */
- case offsetof(struct sk_reuseport_md, eth_protocol):
- if (size < FIELD_SIZEOF(struct sk_buff, protocol))
+ case bpf_ctx_range(struct sk_reuseport_md, eth_protocol):
+ if (size < sizeof_field(struct sk_buff, protocol))
return false;
- /* fall through */
- case offsetof(struct sk_reuseport_md, ip_protocol):
- case offsetof(struct sk_reuseport_md, bind_inany):
- case offsetof(struct sk_reuseport_md, len):
+ fallthrough;
+ case bpf_ctx_range(struct sk_reuseport_md, ip_protocol):
+ case bpf_ctx_range(struct sk_reuseport_md, bind_inany):
+ case bpf_ctx_range(struct sk_reuseport_md, len):
bpf_ctx_record_field_size(info, size_default);
return bpf_ctx_narrow_access_ok(off, size, size_default);
@@ -7249,7 +11608,7 @@ sk_reuseport_is_valid_access(int off, int size,
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_reuseport_kern, F), \
si->dst_reg, si->src_reg, \
bpf_target_off(struct sk_reuseport_kern, F, \
- FIELD_SIZEOF(struct sk_reuseport_kern, F), \
+ sizeof_field(struct sk_reuseport_kern, F), \
target_size)); \
})
@@ -7259,11 +11618,11 @@ sk_reuseport_is_valid_access(int off, int size,
skb, \
SKB_FIELD)
-#define SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(SK_FIELD, BPF_SIZE, EXTRA_OFF) \
- SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(struct sk_reuseport_kern, \
- struct sock, \
- sk, \
- SK_FIELD, BPF_SIZE, EXTRA_OFF)
+#define SK_REUSEPORT_LOAD_SK_FIELD(SK_FIELD) \
+ SOCK_ADDR_LOAD_NESTED_FIELD(struct sk_reuseport_kern, \
+ struct sock, \
+ sk, \
+ SK_FIELD)
static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
@@ -7287,16 +11646,7 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct sk_reuseport_md, ip_protocol):
- BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE);
- SK_REUSEPORT_LOAD_SK_FIELD_SIZE_OFF(__sk_flags_offset,
- BPF_W, 0);
- *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
- *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg,
- SK_FL_PROTO_SHIFT);
- /* SK_FL_PROTO_MASK and SK_FL_PROTO_SHIFT are endian
- * aware. No further narrowing or masking is needed.
- */
- *target_size = 1;
+ SK_REUSEPORT_LOAD_SK_FIELD(sk_protocol);
break;
case offsetof(struct sk_reuseport_md, data_end):
@@ -7310,6 +11660,14 @@ static u32 sk_reuseport_convert_ctx_access(enum bpf_access_type type,
case offsetof(struct sk_reuseport_md, bind_inany):
SK_REUSEPORT_LOAD_FIELD(bind_inany);
break;
+
+ case offsetof(struct sk_reuseport_md, sk):
+ SK_REUSEPORT_LOAD_FIELD(sk);
+ break;
+
+ case offsetof(struct sk_reuseport_md, migrating_sk):
+ SK_REUSEPORT_LOAD_FIELD(migrating_sk);
+ break;
}
return insn - insn_buf;
@@ -7323,4 +11681,898 @@ const struct bpf_verifier_ops sk_reuseport_verifier_ops = {
const struct bpf_prog_ops sk_reuseport_prog_ops = {
};
+
+DEFINE_STATIC_KEY_FALSE(bpf_sk_lookup_enabled);
+EXPORT_SYMBOL(bpf_sk_lookup_enabled);
+
+BPF_CALL_3(bpf_sk_lookup_assign, struct bpf_sk_lookup_kern *, ctx,
+ struct sock *, sk, u64, flags)
+{
+ if (unlikely(flags & ~(BPF_SK_LOOKUP_F_REPLACE |
+ BPF_SK_LOOKUP_F_NO_REUSEPORT)))
+ return -EINVAL;
+ if (unlikely(sk && sk_is_refcounted(sk)))
+ return -ESOCKTNOSUPPORT; /* reject non-RCU freed sockets */
+ if (unlikely(sk && sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN))
+ return -ESOCKTNOSUPPORT; /* only accept TCP socket in LISTEN */
+ if (unlikely(sk && sk_is_udp(sk) && sk->sk_state != TCP_CLOSE))
+ return -ESOCKTNOSUPPORT; /* only accept UDP socket in CLOSE */
+
+ /* Check if socket is suitable for packet L3/L4 protocol */
+ if (sk && sk->sk_protocol != ctx->protocol)
+ return -EPROTOTYPE;
+ if (sk && sk->sk_family != ctx->family &&
+ (sk->sk_family == AF_INET || ipv6_only_sock(sk)))
+ return -EAFNOSUPPORT;
+
+ if (ctx->selected_sk && !(flags & BPF_SK_LOOKUP_F_REPLACE))
+ return -EEXIST;
+
+ /* Select socket as lookup result */
+ ctx->selected_sk = sk;
+ ctx->no_reuseport = flags & BPF_SK_LOOKUP_F_NO_REUSEPORT;
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_sk_lookup_assign_proto = {
+ .func = bpf_sk_lookup_assign,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_SOCKET_OR_NULL,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static const struct bpf_func_proto *
+sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_perf_event_output:
+ return &bpf_event_output_data_proto;
+ case BPF_FUNC_sk_assign:
+ return &bpf_sk_lookup_assign_proto;
+ case BPF_FUNC_sk_release:
+ return &bpf_sk_release_proto;
+ default:
+ return bpf_sk_base_func_proto(func_id, prog);
+ }
+}
+
+static bool sk_lookup_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= sizeof(struct bpf_sk_lookup))
+ return false;
+ if (off % size != 0)
+ return false;
+ if (type != BPF_READ)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range_ptr(struct bpf_sk_lookup, sk):
+ info->reg_type = PTR_TO_SOCKET_OR_NULL;
+ return size == sizeof(__u64);
+
+ case bpf_ctx_range(struct bpf_sk_lookup, family):
+ case bpf_ctx_range(struct bpf_sk_lookup, protocol):
+ case bpf_ctx_range(struct bpf_sk_lookup, remote_ip4):
+ case bpf_ctx_range(struct bpf_sk_lookup, local_ip4):
+ case bpf_ctx_range_till(struct bpf_sk_lookup, remote_ip6[0], remote_ip6[3]):
+ case bpf_ctx_range_till(struct bpf_sk_lookup, local_ip6[0], local_ip6[3]):
+ case bpf_ctx_range(struct bpf_sk_lookup, local_port):
+ case bpf_ctx_range(struct bpf_sk_lookup, ingress_ifindex):
+ bpf_ctx_record_field_size(info, sizeof(__u32));
+ return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));
+
+ case bpf_ctx_range(struct bpf_sk_lookup, remote_port):
+ /* Allow 4-byte access to 2-byte field for backward compatibility */
+ if (size == sizeof(__u32))
+ return true;
+ bpf_ctx_record_field_size(info, sizeof(__be16));
+ return bpf_ctx_narrow_access_ok(off, size, sizeof(__be16));
+
+ case offsetofend(struct bpf_sk_lookup, remote_port) ...
+ offsetof(struct bpf_sk_lookup, local_ip4) - 1:
+ /* Allow access to zero padding for backward compatibility */
+ bpf_ctx_record_field_size(info, sizeof(__u16));
+ return bpf_ctx_narrow_access_ok(off, size, sizeof(__u16));
+
+ default:
+ return false;
+ }
+}
+
+static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog,
+ u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+ switch (si->off) {
+ case offsetof(struct bpf_sk_lookup, sk):
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sk_lookup_kern, selected_sk));
+ break;
+
+ case offsetof(struct bpf_sk_lookup, family):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ family, 2, target_size));
+ break;
+
+ case offsetof(struct bpf_sk_lookup, protocol):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ protocol, 2, target_size));
+ break;
+
+ case offsetof(struct bpf_sk_lookup, remote_ip4):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ v4.saddr, 4, target_size));
+ break;
+
+ case offsetof(struct bpf_sk_lookup, local_ip4):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ v4.daddr, 4, target_size));
+ break;
+
+ case bpf_ctx_range_till(struct bpf_sk_lookup,
+ remote_ip6[0], remote_ip6[3]): {
+#if IS_ENABLED(CONFIG_IPV6)
+ int off = si->off;
+
+ off -= offsetof(struct bpf_sk_lookup, remote_ip6[0]);
+ off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sk_lookup_kern, v6.saddr));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+ }
+ case bpf_ctx_range_till(struct bpf_sk_lookup,
+ local_ip6[0], local_ip6[3]): {
+#if IS_ENABLED(CONFIG_IPV6)
+ int off = si->off;
+
+ off -= offsetof(struct bpf_sk_lookup, local_ip6[0]);
+ off += bpf_target_off(struct in6_addr, s6_addr32[0], 4, target_size);
+ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sk_lookup_kern, v6.daddr));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, off);
+#else
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+#endif
+ break;
+ }
+ case offsetof(struct bpf_sk_lookup, remote_port):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ sport, 2, target_size));
+ break;
+
+ case offsetofend(struct bpf_sk_lookup, remote_port):
+ *target_size = 2;
+ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
+ break;
+
+ case offsetof(struct bpf_sk_lookup, local_port):
+ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ dport, 2, target_size));
+ break;
+
+ case offsetof(struct bpf_sk_lookup, ingress_ifindex):
+ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
+ bpf_target_off(struct bpf_sk_lookup_kern,
+ ingress_ifindex, 4, target_size));
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
+const struct bpf_prog_ops sk_lookup_prog_ops = {
+ .test_run = bpf_prog_test_run_sk_lookup,
+};
+
+const struct bpf_verifier_ops sk_lookup_verifier_ops = {
+ .get_func_proto = sk_lookup_func_proto,
+ .is_valid_access = sk_lookup_is_valid_access,
+ .convert_ctx_access = sk_lookup_convert_ctx_access,
+};
+
#endif /* CONFIG_INET */
+
+DEFINE_BPF_DISPATCHER(xdp)
+
+void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog)
+{
+ bpf_dispatcher_change_prog(BPF_DISPATCHER_PTR(xdp), prev_prog, prog);
+}
+
+BTF_ID_LIST_GLOBAL(btf_sock_ids, MAX_BTF_SOCK_TYPE)
+#define BTF_SOCK_TYPE(name, type) BTF_ID(struct, type)
+BTF_SOCK_TYPE_xxx
+#undef BTF_SOCK_TYPE
+
+BPF_CALL_1(bpf_skc_to_tcp6_sock, struct sock *, sk)
+{
+ /* tcp6_sock type is not generated in dwarf and hence btf,
+ * trigger an explicit type generation here.
+ */
+ BTF_TYPE_EMIT(struct tcp6_sock);
+ if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_family == AF_INET6)
+ return (unsigned long)sk;
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto = {
+ .func = bpf_skc_to_tcp6_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP6],
+};
+
+BPF_CALL_1(bpf_skc_to_tcp_sock, struct sock *, sk)
+{
+ if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
+ return (unsigned long)sk;
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_tcp_sock_proto = {
+ .func = bpf_skc_to_tcp_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP],
+};
+
+BPF_CALL_1(bpf_skc_to_tcp_timewait_sock, struct sock *, sk)
+{
+ /* BTF types for tcp_timewait_sock and inet_timewait_sock are not
+ * generated if CONFIG_INET=n. Trigger an explicit generation here.
+ */
+ BTF_TYPE_EMIT(struct inet_timewait_sock);
+ BTF_TYPE_EMIT(struct tcp_timewait_sock);
+
+#ifdef CONFIG_INET
+ if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_TIME_WAIT)
+ return (unsigned long)sk;
+#endif
+
+#if IS_BUILTIN(CONFIG_IPV6)
+ if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_TIME_WAIT)
+ return (unsigned long)sk;
+#endif
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto = {
+ .func = bpf_skc_to_tcp_timewait_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_TW],
+};
+
+BPF_CALL_1(bpf_skc_to_tcp_request_sock, struct sock *, sk)
+{
+#ifdef CONFIG_INET
+ if (sk && sk->sk_prot == &tcp_prot && sk->sk_state == TCP_NEW_SYN_RECV)
+ return (unsigned long)sk;
+#endif
+
+#if IS_BUILTIN(CONFIG_IPV6)
+ if (sk && sk->sk_prot == &tcpv6_prot && sk->sk_state == TCP_NEW_SYN_RECV)
+ return (unsigned long)sk;
+#endif
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_tcp_request_sock_proto = {
+ .func = bpf_skc_to_tcp_request_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_TCP_REQ],
+};
+
+BPF_CALL_1(bpf_skc_to_udp6_sock, struct sock *, sk)
+{
+ /* udp6_sock type is not generated in dwarf and hence btf,
+ * trigger an explicit type generation here.
+ */
+ BTF_TYPE_EMIT(struct udp6_sock);
+ if (sk && sk_fullsock(sk) && sk->sk_protocol == IPPROTO_UDP &&
+ sk->sk_type == SOCK_DGRAM && sk->sk_family == AF_INET6)
+ return (unsigned long)sk;
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = {
+ .func = bpf_skc_to_udp6_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6],
+};
+
+BPF_CALL_1(bpf_skc_to_unix_sock, struct sock *, sk)
+{
+ /* unix_sock type is not generated in dwarf and hence btf,
+ * trigger an explicit type generation here.
+ */
+ BTF_TYPE_EMIT(struct unix_sock);
+ if (sk && sk_fullsock(sk) && sk->sk_family == AF_UNIX)
+ return (unsigned long)sk;
+
+ return (unsigned long)NULL;
+}
+
+const struct bpf_func_proto bpf_skc_to_unix_sock_proto = {
+ .func = bpf_skc_to_unix_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UNIX],
+};
+
+BPF_CALL_1(bpf_skc_to_mptcp_sock, struct sock *, sk)
+{
+ BTF_TYPE_EMIT(struct mptcp_sock);
+ return (unsigned long)bpf_mptcp_sock_from_subflow(sk);
+}
+
+const struct bpf_func_proto bpf_skc_to_mptcp_sock_proto = {
+ .func = bpf_skc_to_mptcp_sock,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .arg1_type = ARG_PTR_TO_SOCK_COMMON,
+ .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_MPTCP],
+};
+
+BPF_CALL_1(bpf_sock_from_file, struct file *, file)
+{
+ return (unsigned long)sock_from_file(file);
+}
+
+BTF_ID_LIST(bpf_sock_from_file_btf_ids)
+BTF_ID(struct, socket)
+BTF_ID(struct, file)
+
+const struct bpf_func_proto bpf_sock_from_file_proto = {
+ .func = bpf_sock_from_file,
+ .gpl_only = false,
+ .ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
+ .ret_btf_id = &bpf_sock_from_file_btf_ids[0],
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &bpf_sock_from_file_btf_ids[1],
+};
+
+static const struct bpf_func_proto *
+bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *func;
+
+ switch (func_id) {
+ case BPF_FUNC_skc_to_tcp6_sock:
+ func = &bpf_skc_to_tcp6_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_tcp_sock:
+ func = &bpf_skc_to_tcp_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_tcp_timewait_sock:
+ func = &bpf_skc_to_tcp_timewait_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_tcp_request_sock:
+ func = &bpf_skc_to_tcp_request_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_udp6_sock:
+ func = &bpf_skc_to_udp6_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_unix_sock:
+ func = &bpf_skc_to_unix_sock_proto;
+ break;
+ case BPF_FUNC_skc_to_mptcp_sock:
+ func = &bpf_skc_to_mptcp_sock_proto;
+ break;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
+ default:
+ return bpf_base_func_proto(func_id, prog);
+ }
+
+ if (!bpf_token_capable(prog->aux->token, CAP_PERFMON))
+ return NULL;
+
+ return func;
+}
+
+/**
+ * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area.
+ * @skb: socket buffer carrying the metadata
+ * @offset: offset into the metadata area, must be <= skb_metadata_len()
+ */
+void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset)
+{
+ return skb_metadata_end(skb) - skb_metadata_len(skb) + offset;
+}
+
+int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset,
+ const void *from, u32 len, u64 flags)
+{
+ if (unlikely(flags))
+ return -EINVAL;
+ if (unlikely(bpf_try_make_writable(skb, 0)))
+ return -EFAULT;
+
+ memmove(bpf_skb_meta_pointer(skb, offset), from, len);
+ return 0;
+}
+
+__bpf_kfunc_start_defs();
+__bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
+ struct sk_buff *skb = (struct sk_buff *)s;
+
+ if (flags) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB, 0, skb->len);
+
+ return 0;
+}
+
+/**
+ * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area.
+ * @skb_: socket buffer carrying the metadata
+ * @flags: future use, must be zero
+ * @ptr__uninit: dynptr to initialize
+ *
+ * Set up a dynptr for access to the metadata area earlier allocated from the
+ * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to
+ * &__sk_buff->data_meta.
+ *
+ * Return:
+ * * %0 - dynptr ready to use
+ * * %-EINVAL - invalid flags, dynptr set to null
+ */
+__bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
+ struct sk_buff *skb = (struct sk_buff *)skb_;
+
+ if (flags) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb));
+
+ return 0;
+}
+
+__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
+ struct xdp_buff *xdp = (struct xdp_buff *)x;
+
+ if (flags) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ bpf_dynptr_init(ptr, xdp, BPF_DYNPTR_TYPE_XDP, 0, xdp_get_buff_len(xdp));
+
+ return 0;
+}
+
+__bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
+ const u8 *sun_path, u32 sun_path__sz)
+{
+ struct sockaddr_un *un;
+
+ if (sa_kern->sk->sk_family != AF_UNIX)
+ return -EINVAL;
+
+ /* We do not allow changing the address to unnamed or larger than the
+ * maximum allowed address size for a unix sockaddr.
+ */
+ if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX)
+ return -EINVAL;
+
+ un = (struct sockaddr_un *)sa_kern->uaddr;
+ memcpy(un->sun_path, sun_path, sun_path__sz);
+ sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz;
+
+ return 0;
+}
+
+__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct __sk_buff *s, struct sock *sk,
+ struct bpf_tcp_req_attrs *attrs, int attrs__sz)
+{
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
+ struct sk_buff *skb = (struct sk_buff *)s;
+ const struct request_sock_ops *ops;
+ struct inet_request_sock *ireq;
+ struct tcp_request_sock *treq;
+ struct request_sock *req;
+ struct net *net;
+ __u16 min_mss;
+ u32 tsoff = 0;
+
+ if (attrs__sz != sizeof(*attrs) ||
+ attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2])
+ return -EINVAL;
+
+ if (!skb_at_tc_ingress(skb))
+ return -EINVAL;
+
+ net = dev_net(skb->dev);
+ if (net != sock_net(sk))
+ return -ENETUNREACH;
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ ops = &tcp_request_sock_ops;
+ min_mss = 536;
+ break;
+#if IS_BUILTIN(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ ops = &tcp6_request_sock_ops;
+ min_mss = IPV6_MIN_MTU - 60;
+ break;
+#endif
+ default:
+ return -EINVAL;
+ }
+
+ if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN ||
+ sk_is_mptcp(sk))
+ return -EINVAL;
+
+ if (attrs->mss < min_mss)
+ return -EINVAL;
+
+ if (attrs->wscale_ok) {
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling))
+ return -EINVAL;
+
+ if (attrs->snd_wscale > TCP_MAX_WSCALE ||
+ attrs->rcv_wscale > TCP_MAX_WSCALE)
+ return -EINVAL;
+ }
+
+ if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack))
+ return -EINVAL;
+
+ if (attrs->tstamp_ok) {
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps))
+ return -EINVAL;
+
+ tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns());
+ }
+
+ req = inet_reqsk_alloc(ops, sk, false);
+ if (!req)
+ return -ENOMEM;
+
+ ireq = inet_rsk(req);
+ treq = tcp_rsk(req);
+
+ req->rsk_listener = sk;
+ req->syncookie = 1;
+ req->mss = attrs->mss;
+ req->ts_recent = attrs->rcv_tsval;
+
+ ireq->snd_wscale = attrs->snd_wscale;
+ ireq->rcv_wscale = attrs->rcv_wscale;
+ ireq->tstamp_ok = !!attrs->tstamp_ok;
+ ireq->sack_ok = !!attrs->sack_ok;
+ ireq->wscale_ok = !!attrs->wscale_ok;
+ ireq->ecn_ok = !!attrs->ecn_ok;
+
+ treq->req_usec_ts = !!attrs->usec_ts_ok;
+ treq->ts_off = tsoff;
+
+ skb_orphan(skb);
+ skb->sk = req_to_sk(req);
+ skb->destructor = sock_pfree;
+
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+
+__bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
+ u64 flags)
+{
+ struct sk_buff *skb;
+
+ if (skops->op != BPF_SOCK_OPS_TSTAMP_SENDMSG_CB)
+ return -EOPNOTSUPP;
+
+ if (flags)
+ return -EINVAL;
+
+ skb = skops->skb;
+ skb_shinfo(skb)->tx_flags |= SKBTX_BPF;
+ TCP_SKB_CB(skb)->txstamp_ack |= TSTAMP_ACK_BPF;
+ skb_shinfo(skb)->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
+
+ return 0;
+}
+
+/**
+ * bpf_xdp_pull_data() - Pull in non-linear xdp data.
+ * @x: &xdp_md associated with the XDP buffer
+ * @len: length of data to be made directly accessible in the linear part
+ *
+ * Pull in data in case the XDP buffer associated with @x is non-linear and
+ * not all @len are in the linear data area.
+ *
+ * Direct packet access allows reading and writing linear XDP data through
+ * packet pointers (i.e., &xdp_md->data + offsets). The amount of data which
+ * ends up in the linear part of the xdp_buff depends on the NIC and its
+ * configuration. When a frag-capable XDP program wants to directly access
+ * headers that may be in the non-linear area, call this kfunc to make sure
+ * the data is available in the linear area. Alternatively, use dynptr or
+ * bpf_xdp_{load,store}_bytes() to access data without pulling.
+ *
+ * This kfunc can also be used with bpf_xdp_adjust_head() to decapsulate
+ * headers in the non-linear data area.
+ *
+ * A call to this kfunc may reduce headroom. If there is not enough tailroom
+ * in the linear data area, metadata and data will be shifted down.
+ *
+ * A call to this kfunc is susceptible to change the buffer geometry.
+ * Therefore, at load time, all checks on pointers previously done by the
+ * verifier are invalidated and must be performed again, if the kfunc is used
+ * in combination with direct packet access.
+ *
+ * Return:
+ * * %0 - success
+ * * %-EINVAL - @len exceeds the buffer length, or head/tailroom is insufficient
+ */
+__bpf_kfunc int bpf_xdp_pull_data(struct xdp_md *x, u32 len)
+{
+ struct xdp_buff *xdp = (struct xdp_buff *)x;
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ int i, delta, shift, headroom, tailroom, n_frags_free = 0;
+ void *data_hard_end = xdp_data_hard_end(xdp);
+ int data_len = xdp->data_end - xdp->data;
+ void *start;
+
+ if (len <= data_len)
+ return 0;
+
+ if (unlikely(len > xdp_get_buff_len(xdp)))
+ return -EINVAL;
+
+ start = xdp_data_meta_unsupported(xdp) ? xdp->data : xdp->data_meta;
+
+ headroom = start - xdp->data_hard_start - sizeof(struct xdp_frame);
+ tailroom = data_hard_end - xdp->data_end;
+
+ delta = len - data_len;
+ if (unlikely(delta > tailroom + headroom))
+ return -EINVAL;
+
+ shift = delta - tailroom;
+ if (shift > 0) {
+ memmove(start - shift, start, xdp->data_end - start);
+
+ xdp->data_meta -= shift;
+ xdp->data -= shift;
+ xdp->data_end -= shift;
+ }
+
+ for (i = 0; i < sinfo->nr_frags && delta; i++) {
+ skb_frag_t *frag = &sinfo->frags[i];
+ u32 shrink = min_t(u32, delta, skb_frag_size(frag));
+
+ memcpy(xdp->data_end, skb_frag_address(frag), shrink);
+
+ xdp->data_end += shrink;
+ sinfo->xdp_frags_size -= shrink;
+ delta -= shrink;
+ if (bpf_xdp_shrink_data(xdp, frag, shrink, false))
+ n_frags_free++;
+ }
+
+ if (unlikely(n_frags_free)) {
+ memmove(sinfo->frags, sinfo->frags + n_frags_free,
+ (sinfo->nr_frags - n_frags_free) * sizeof(skb_frag_t));
+
+ sinfo->nr_frags -= n_frags_free;
+
+ if (!sinfo->nr_frags) {
+ xdp_buff_clear_frags_flag(xdp);
+ xdp_buff_clear_frag_pfmemalloc(xdp);
+ }
+ }
+
+ return 0;
+}
+
+__bpf_kfunc_end_defs();
+
+int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
+ int err;
+
+ err = bpf_dynptr_from_skb(skb, flags, ptr__uninit);
+ if (err)
+ return err;
+
+ bpf_dynptr_set_rdonly(ptr);
+
+ return 0;
+}
+
+BTF_KFUNCS_START(bpf_kfunc_check_set_skb)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_skb)
+
+BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta)
+
+BTF_KFUNCS_START(bpf_kfunc_check_set_xdp)
+BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
+BTF_ID_FLAGS(func, bpf_xdp_pull_data)
+BTF_KFUNCS_END(bpf_kfunc_check_set_xdp)
+
+BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr)
+BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
+BTF_KFUNCS_END(bpf_kfunc_check_set_sock_addr)
+
+BTF_KFUNCS_START(bpf_kfunc_check_set_tcp_reqsk)
+BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_tcp_reqsk)
+
+BTF_KFUNCS_START(bpf_kfunc_check_set_sock_ops)
+BTF_ID_FLAGS(func, bpf_sock_ops_enable_tx_tstamp, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_sock_ops)
+
+static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_skb,
+};
+
+static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_skb_meta,
+};
+
+static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_xdp,
+};
+
+static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_sock_addr,
+};
+
+static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_tcp_reqsk,
+};
+
+static const struct btf_kfunc_id_set bpf_kfunc_set_sock_ops = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_sock_ops,
+};
+
+static int __init bpf_kfunc_init(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SK_SKB, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCKET_FILTER, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SKB, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_OUT, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_IN, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
+ &bpf_kfunc_set_sock_addr);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk);
+ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SOCK_OPS, &bpf_kfunc_set_sock_ops);
+}
+late_initcall(bpf_kfunc_init);
+
+__bpf_kfunc_start_defs();
+
+/* bpf_sock_destroy: Destroy the given socket with ECONNABORTED error code.
+ *
+ * The function expects a non-NULL pointer to a socket, and invokes the
+ * protocol specific socket destroy handlers.
+ *
+ * The helper can only be called from BPF contexts that have acquired the socket
+ * locks.
+ *
+ * Parameters:
+ * @sock: Pointer to socket to be destroyed
+ *
+ * Return:
+ * -EOPNOTSUPP if the protocol has no diag_destroy handler or is neither
+ * TCP nor UDP. Otherwise the result of the protocol's diag_destroy
+ * handler: 0 on success, or an error such as -EINVAL.
+ */
+__bpf_kfunc int bpf_sock_destroy(struct sock_common *sock)
+{
+ struct sock *sk = (struct sock *)sock;
+
+ /* The locking semantics that allow for synchronous execution of the
+ * destroy handlers are only supported for TCP and UDP.
+ * Supporting protocols will need to acquire sock lock in the BPF context
+ * prior to invoking this kfunc.
+ */
+ if (!sk->sk_prot->diag_destroy || (sk->sk_protocol != IPPROTO_TCP &&
+ sk->sk_protocol != IPPROTO_UDP))
+ return -EOPNOTSUPP;
+
+ return sk->sk_prot->diag_destroy(sk, ECONNABORTED);
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_sk_iter_kfunc_ids)
+BTF_ID_FLAGS(func, bpf_sock_destroy, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_sk_iter_kfunc_ids)
+
+static int tracing_iter_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ if (btf_id_set8_contains(&bpf_sk_iter_kfunc_ids, kfunc_id) &&
+ prog->expected_attach_type != BPF_TRACE_ITER)
+ return -EACCES;
+ return 0;
+}
+
+static const struct btf_kfunc_id_set bpf_sk_iter_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_sk_iter_kfunc_ids,
+ .filter = tracing_iter_filter,
+};
+
+static int init_subsystem(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_sk_iter_kfunc_set);
+}
+late_initcall(init_subsystem);
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index ce9eeeb7c024..1b61bb25ba0e 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -1,9 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/export.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_vlan.h>
+#include <linux/filter.h>
#include <net/dsa.h>
#include <net/dst_metadata.h>
#include <net/ip.h>
@@ -20,16 +22,25 @@
#include <linux/ppp_defs.h>
#include <linux/stddef.h>
#include <linux/if_ether.h>
+#include <linux/if_hsr.h>
#include <linux/mpls.h>
#include <linux/tcp.h>
+#include <linux/ptp_classify.h>
#include <net/flow_dissector.h>
+#include <net/pkt_cls.h>
#include <scsi/fc/fc_fcoe.h>
#include <uapi/linux/batadv_packet.h>
+#include <linux/bpf.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#endif
+#include <linux/bpf-netns.h>
static void dissector_set_key(struct flow_dissector *flow_dissector,
enum flow_dissector_key_id key_id)
{
- flow_dissector->used_keys |= (1 << key_id);
+ flow_dissector->used_keys |= (1ULL << key_id);
}
void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
@@ -41,7 +52,7 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
memset(flow_dissector, 0, sizeof(*flow_dissector));
for (i = 0; i < key_count; i++, key++) {
- /* User should make sure that every key target offset is withing
+ /* User should make sure that every key target offset is within
* boundaries of unsigned short.
*/
BUG_ON(key->offset > USHRT_MAX);
@@ -62,30 +73,40 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
}
EXPORT_SYMBOL(skb_flow_dissector_init);
-/**
- * skb_flow_get_be16 - extract be16 entity
- * @skb: sk_buff to extract from
- * @poff: offset to extract at
- * @data: raw buffer pointer to the packet
- * @hlen: packet header length
- *
- * The function will try to retrieve a be32 entity at
- * offset poff
- */
-static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff,
- void *data, int hlen)
+#ifdef CONFIG_BPF_SYSCALL
+int flow_dissector_bpf_prog_attach_check(struct net *net,
+ struct bpf_prog *prog)
{
- __be16 *u, _u;
+ enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
- u = __skb_header_pointer(skb, poff, sizeof(_u), data, hlen, &_u);
- if (u)
- return *u;
+ if (net == &init_net) {
+ /* BPF flow dissector in the root namespace overrides
+ * any per-net-namespace one. When attaching to root,
+ * make sure we don't have any BPF program attached
+ * to the non-root namespaces.
+ */
+ struct net *ns;
+
+ for_each_net(ns) {
+ if (ns == &init_net)
+ continue;
+ if (rcu_access_pointer(ns->bpf.run_array[type]))
+ return -EEXIST;
+ }
+ } else {
+ /* Make sure root flow dissector is not attached
+ * when attaching to the non-root namespace.
+ */
+ if (rcu_access_pointer(init_net.bpf.run_array[type]))
+ return -EEXIST;
+ }
return 0;
}
+#endif /* CONFIG_BPF_SYSCALL */
/**
- * __skb_flow_get_ports - extract the upper layer ports and return them
+ * skb_flow_get_ports - extract the upper layer ports and return them
* @skb: sk_buff to extract the ports from
* @thoff: transport header offset
* @ip_proto: protocol for which to get port offset
@@ -95,8 +116,8 @@ static __be16 skb_flow_get_be16(const struct sk_buff *skb, int poff,
* The function will try to retrieve the ports at offset thoff + poff where poff
* is the protocol port offset returned from proto_ports_offset
*/
-__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
- void *data, int hlen)
+__be32 skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
+ const void *data, int hlen)
{
int poff = proto_ports_offset(ip_proto);
@@ -116,12 +137,172 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
return 0;
}
-EXPORT_SYMBOL(__skb_flow_get_ports);
+EXPORT_SYMBOL(skb_flow_get_ports);
-static void
-skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
+static bool icmp_has_id(u8 type)
+{
+ switch (type) {
+ case ICMP_ECHO:
+ case ICMP_ECHOREPLY:
+ case ICMP_TIMESTAMP:
+ case ICMP_TIMESTAMPREPLY:
+ case ICMPV6_ECHO_REQUEST:
+ case ICMPV6_ECHO_REPLY:
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * skb_flow_get_icmp_tci - extract ICMP(6) Type, Code and Identifier fields
+ * @skb: sk_buff to extract from
+ * @key_icmp: struct flow_dissector_key_icmp to fill
+ * @data: raw buffer pointer to the packet
+ * @thoff: offset to extract at
+ * @hlen: packet header length
+ */
+void skb_flow_get_icmp_tci(const struct sk_buff *skb,
+ struct flow_dissector_key_icmp *key_icmp,
+ const void *data, int thoff, int hlen)
+{
+ struct icmphdr *ih, _ih;
+
+ ih = __skb_header_pointer(skb, thoff, sizeof(_ih), data, hlen, &_ih);
+ if (!ih)
+ return;
+
+ key_icmp->type = ih->type;
+ key_icmp->code = ih->code;
+
+ /* As we use 0 to signal that the Id field is not present,
+ * avoid confusion with packets without such field
+ */
+ if (icmp_has_id(ih->type))
+ key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1;
+ else
+ key_icmp->id = 0;
+}
+EXPORT_SYMBOL(skb_flow_get_icmp_tci);
+
+/* If FLOW_DISSECTOR_KEY_ICMP is set, dissect an ICMP packet
+ * using skb_flow_get_icmp_tci().
+ */
+static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int thoff, int hlen)
+{
+ struct flow_dissector_key_icmp *key_icmp;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ICMP))
+ return;
+
+ key_icmp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ICMP,
+ target_container);
+
+ skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen);
+}
+
+static void __skb_flow_dissect_ah(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int nhoff, int hlen)
+{
+ struct flow_dissector_key_ipsec *key_ah;
+ struct ip_auth_hdr _hdr, *hdr;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC))
+ return;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
+ if (!hdr)
+ return;
+
+ key_ah = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPSEC,
+ target_container);
+
+ key_ah->spi = hdr->spi;
+}
+
+static void __skb_flow_dissect_esp(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container)
+ void *target_container, const void *data,
+ int nhoff, int hlen)
+{
+ struct flow_dissector_key_ipsec *key_esp;
+ struct ip_esp_hdr _hdr, *hdr;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPSEC))
+ return;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
+ if (!hdr)
+ return;
+
+ key_esp = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPSEC,
+ target_container);
+
+ key_esp->spi = hdr->spi;
+}
+
+static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int nhoff, int hlen)
+{
+ struct flow_dissector_key_l2tpv3 *key_l2tpv3;
+ struct {
+ __be32 session_id;
+ } *hdr, _hdr;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3))
+ return;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
+ if (!hdr)
+ return;
+
+ key_l2tpv3 = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_L2TPV3,
+ target_container);
+
+ key_l2tpv3->session_id = hdr->session_id;
+}
+
+void skb_flow_dissect_meta(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
+{
+ struct flow_dissector_key_meta *meta;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META))
+ return;
+
+ meta = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_META,
+ target_container);
+ meta->ingress_ifindex = skb->skb_iif;
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ if (tc_skb_ext_tc_enabled()) {
+ struct tc_skb_ext *ext;
+
+ ext = skb_ext_find(skb, TC_SKB_EXT);
+ if (ext)
+ meta->l2_miss = ext->l2_miss;
+ }
+#endif
+}
+EXPORT_SYMBOL(skb_flow_dissect_meta);
+
+static void
+skb_flow_dissect_set_enc_control(enum flow_dissector_key_id type,
+ u32 ctrl_flags,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
{
struct flow_dissector_key_control *ctrl;
@@ -132,15 +313,63 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
FLOW_DISSECTOR_KEY_ENC_CONTROL,
target_container);
ctrl->addr_type = type;
+ ctrl->flags = ctrl_flags;
}
void
+skb_flow_dissect_ct(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, u16 *ctinfo_map,
+ size_t mapsize, bool post_ct, u16 zone)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ struct flow_dissector_key_ct *key;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_labels *cl;
+ struct nf_conn *ct;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT))
+ return;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct && !post_ct)
+ return;
+
+ key = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_CT,
+ target_container);
+
+ if (!ct) {
+ key->ct_state = TCA_FLOWER_KEY_CT_FLAGS_TRACKED |
+ TCA_FLOWER_KEY_CT_FLAGS_INVALID;
+ key->ct_zone = zone;
+ return;
+ }
+
+ if (ctinfo < mapsize)
+ key->ct_state = ctinfo_map[ctinfo];
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)
+ key->ct_zone = ct->zone.id;
+#endif
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+ key->ct_mark = READ_ONCE(ct->mark);
+#endif
+
+ cl = nf_ct_labels_find(ct);
+ if (cl)
+ memcpy(key->ct_labels, cl->bits, sizeof(key->ct_labels));
+#endif /* CONFIG_NF_CONNTRACK */
+}
+EXPORT_SYMBOL(skb_flow_dissect_ct);
+
+void
skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container)
{
struct ip_tunnel_info *info;
struct ip_tunnel_key *key;
+ u32 ctrl_flags = 0;
/* A quick check to see if there might be something to do. */
if (!dissector_uses_key(flow_dissector,
@@ -165,11 +394,20 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
key = &info->key;
+ if (test_bit(IP_TUNNEL_CSUM_BIT, key->tun_flags))
+ ctrl_flags |= FLOW_DIS_F_TUNNEL_CSUM;
+ if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
+ ctrl_flags |= FLOW_DIS_F_TUNNEL_DONT_FRAGMENT;
+ if (test_bit(IP_TUNNEL_OAM_BIT, key->tun_flags))
+ ctrl_flags |= FLOW_DIS_F_TUNNEL_OAM;
+ if (test_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_flags))
+ ctrl_flags |= FLOW_DIS_F_TUNNEL_CRIT_OPT;
+
switch (ip_tunnel_info_af(info)) {
case AF_INET:
- skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV4_ADDRS,
- flow_dissector,
- target_container);
+ skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ ctrl_flags, flow_dissector,
+ target_container);
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS)) {
struct flow_dissector_key_ipv4_addrs *ipv4;
@@ -182,9 +420,9 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
}
break;
case AF_INET6:
- skb_flow_dissect_set_enc_addr_type(FLOW_DISSECTOR_KEY_IPV6_ADDRS,
- flow_dissector,
- target_container);
+ skb_flow_dissect_set_enc_control(FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ ctrl_flags, flow_dissector,
+ target_container);
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS)) {
struct flow_dissector_key_ipv6_addrs *ipv6;
@@ -196,6 +434,10 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
ipv6->dst = key->u.ipv6.dst;
}
break;
+ default:
+ skb_flow_dissect_set_enc_control(0, ctrl_flags, flow_dissector,
+ target_container);
+ break;
}
if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_KEYID)) {
@@ -229,71 +471,109 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_OPTS)) {
struct flow_dissector_key_enc_opts *enc_opt;
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+ u32 val;
enc_opt = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_ENC_OPTS,
target_container);
- if (info->options_len) {
- enc_opt->len = info->options_len;
- ip_tunnel_info_opts_get(enc_opt->data, info);
- enc_opt->dst_opt_type = info->key.tun_flags &
- TUNNEL_OPTIONS_PRESENT;
- }
+ if (!info->options_len)
+ return;
+
+ enc_opt->len = info->options_len;
+ ip_tunnel_info_opts_get(enc_opt->data, info);
+
+ ip_tunnel_set_options_present(flags);
+ ip_tunnel_flags_and(flags, info->key.tun_flags, flags);
+
+ val = find_next_bit(flags, __IP_TUNNEL_FLAG_NUM,
+ IP_TUNNEL_GENEVE_OPT_BIT);
+ enc_opt->dst_opt_type = val < __IP_TUNNEL_FLAG_NUM ? val : 0;
}
}
EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);
+void skb_flow_dissect_hash(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
+{
+ struct flow_dissector_key_hash *key;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_HASH))
+ return;
+
+ key = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_HASH,
+ target_container);
+
+ key->hash = skb_get_hash_raw(skb);
+}
+EXPORT_SYMBOL(skb_flow_dissect_hash);
+
static enum flow_dissect_ret
__skb_flow_dissect_mpls(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int nhoff, int hlen)
+ void *target_container, const void *data, int nhoff,
+ int hlen, int lse_index, bool *entropy_label)
{
- struct flow_dissector_key_keyid *key_keyid;
- struct mpls_label *hdr, _hdr[2];
- u32 entry, label;
+ struct mpls_label *hdr, _hdr;
+ u32 entry, label, bos;
if (!dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_MPLS_ENTROPY) &&
!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS))
return FLOW_DISSECT_RET_OUT_GOOD;
+ if (lse_index >= FLOW_DIS_MPLS_MAX)
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
hlen, &_hdr);
if (!hdr)
return FLOW_DISSECT_RET_OUT_BAD;
- entry = ntohl(hdr[0].entry);
+ entry = ntohl(hdr->entry);
label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT;
+ bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT;
if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) {
struct flow_dissector_key_mpls *key_mpls;
+ struct flow_dissector_mpls_lse *lse;
key_mpls = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_MPLS,
target_container);
- key_mpls->mpls_label = label;
- key_mpls->mpls_ttl = (entry & MPLS_LS_TTL_MASK)
- >> MPLS_LS_TTL_SHIFT;
- key_mpls->mpls_tc = (entry & MPLS_LS_TC_MASK)
- >> MPLS_LS_TC_SHIFT;
- key_mpls->mpls_bos = (entry & MPLS_LS_S_MASK)
- >> MPLS_LS_S_SHIFT;
+ lse = &key_mpls->ls[lse_index];
+
+ lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
+ lse->mpls_bos = bos;
+ lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT;
+ lse->mpls_label = label;
+ dissector_set_mpls_lse(key_mpls, lse_index);
}
- if (label == MPLS_LABEL_ENTROPY) {
+ if (*entropy_label &&
+ dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) {
+ struct flow_dissector_key_keyid *key_keyid;
+
key_keyid = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_MPLS_ENTROPY,
target_container);
- key_keyid->keyid = hdr[1].entry & htonl(MPLS_LS_LABEL_MASK);
+ key_keyid->keyid = cpu_to_be32(label);
}
- return FLOW_DISSECT_RET_OUT_GOOD;
+
+ *entropy_label = label == MPLS_LABEL_ENTROPY;
+
+ return bos ? FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN;
}
static enum flow_dissect_ret
__skb_flow_dissect_arp(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int nhoff, int hlen)
+ void *target_container, const void *data,
+ int nhoff, int hlen)
{
struct flow_dissector_key_arp *key_arp;
struct {
@@ -346,10 +626,34 @@ __skb_flow_dissect_arp(const struct sk_buff *skb,
}
static enum flow_dissect_ret
+__skb_flow_dissect_cfm(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int nhoff, int hlen)
+{
+ struct flow_dissector_key_cfm *key, *hdr, _hdr;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CFM))
+ return FLOW_DISSECT_RET_OUT_GOOD;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(*key), data, hlen, &_hdr);
+ if (!hdr)
+ return FLOW_DISSECT_RET_OUT_BAD;
+
+ key = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_CFM,
+ target_container);
+
+ key->mdl_ver = hdr->mdl_ver;
+ key->opcode = hdr->opcode;
+
+ return FLOW_DISSECT_RET_OUT_GOOD;
+}
+
+static enum flow_dissect_ret
__skb_flow_dissect_gre(const struct sk_buff *skb,
struct flow_dissector_key_control *key_control,
struct flow_dissector *flow_dissector,
- void *target_container, void *data,
+ void *target_container, const void *data,
__be16 *p_proto, int *p_nhoff, int *p_hlen,
unsigned int flags)
{
@@ -382,8 +686,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
offset += sizeof(struct gre_base_hdr);
if (hdr->flags & GRE_CSUM)
- offset += sizeof(((struct gre_full_hdr *) 0)->csum) +
- sizeof(((struct gre_full_hdr *) 0)->reserved1);
+ offset += sizeof_field(struct gre_full_hdr, csum) +
+ sizeof_field(struct gre_full_hdr, reserved1);
if (hdr->flags & GRE_KEY) {
const __be32 *keyid;
@@ -405,11 +709,11 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
else
key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK;
}
- offset += sizeof(((struct gre_full_hdr *) 0)->key);
+ offset += sizeof_field(struct gre_full_hdr, key);
}
if (hdr->flags & GRE_SEQ)
- offset += sizeof(((struct pptp_gre_header *) 0)->seq);
+ offset += sizeof_field(struct pptp_gre_header, seq);
if (gre_ver == 0) {
if (*p_proto == htons(ETH_P_TEB)) {
@@ -436,7 +740,7 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
u8 *ppp_hdr;
if (hdr->flags & GRE_ACK)
- offset += sizeof(((struct pptp_gre_header *) 0)->ack);
+ offset += sizeof_field(struct pptp_gre_header, ack);
ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset,
sizeof(_ppp_hdr),
@@ -489,8 +793,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb,
static enum flow_dissect_ret
__skb_flow_dissect_batadv(const struct sk_buff *skb,
struct flow_dissector_key_control *key_control,
- void *data, __be16 *p_proto, int *p_nhoff, int hlen,
- unsigned int flags)
+ const void *data, __be16 *p_proto, int *p_nhoff,
+ int hlen, unsigned int flags)
{
struct {
struct batadv_unicast_packet batadv_unicast;
@@ -521,7 +825,8 @@ __skb_flow_dissect_batadv(const struct sk_buff *skb,
static void
__skb_flow_dissect_tcp(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, int thoff, int hlen)
+ void *target_container, const void *data,
+ int thoff, int hlen)
{
struct flow_dissector_key_tcp *key_tcp;
struct tcphdr *th, _th;
@@ -543,9 +848,42 @@ __skb_flow_dissect_tcp(const struct sk_buff *skb,
}
static void
+__skb_flow_dissect_ports(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container, const void *data,
+ int nhoff, u8 ip_proto, int hlen)
+{
+ struct flow_dissector_key_ports_range *key_ports_range = NULL;
+ struct flow_dissector_key_ports *key_ports = NULL;
+ __be32 ports;
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS))
+ key_ports = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS,
+ target_container);
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS_RANGE))
+ key_ports_range = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE,
+ target_container);
+
+ if (!key_ports && !key_ports_range)
+ return;
+
+ ports = skb_flow_get_ports(skb, nhoff, ip_proto, data, hlen);
+
+ if (key_ports)
+ key_ports->ports = ports;
+
+ if (key_ports_range)
+ key_ports_range->tp.ports = ports;
+}
+
+static void
__skb_flow_dissect_ipv4(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, const struct iphdr *iph)
+ void *target_container, const void *data,
+ const struct iphdr *iph)
{
struct flow_dissector_key_ip *key_ip;
@@ -562,7 +900,8 @@ __skb_flow_dissect_ipv4(const struct sk_buff *skb,
static void
__skb_flow_dissect_ipv6(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container, void *data, const struct ipv6hdr *iph)
+ void *target_container, const void *data,
+ const struct ipv6hdr *iph)
{
struct flow_dissector_key_ip *key_ip;
@@ -588,8 +927,117 @@ static bool skb_flow_dissect_allowed(int *num_hdrs)
return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS);
}
+static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
+{
+ struct flow_dissector_key_ports_range *key_ports_range = NULL;
+ struct flow_dissector_key_ports *key_ports = NULL;
+ struct flow_dissector_key_control *key_control;
+ struct flow_dissector_key_basic *key_basic;
+ struct flow_dissector_key_addrs *key_addrs;
+ struct flow_dissector_key_tags *key_tags;
+
+ key_control = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_CONTROL,
+ target_container);
+ key_control->thoff = flow_keys->thoff;
+ if (flow_keys->is_frag)
+ key_control->flags |= FLOW_DIS_IS_FRAGMENT;
+ if (flow_keys->is_first_frag)
+ key_control->flags |= FLOW_DIS_FIRST_FRAG;
+ if (flow_keys->is_encap)
+ key_control->flags |= FLOW_DIS_ENCAPSULATION;
+
+ key_basic = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_BASIC,
+ target_container);
+ key_basic->n_proto = flow_keys->n_proto;
+ key_basic->ip_proto = flow_keys->ip_proto;
+
+ if (flow_keys->addr_proto == ETH_P_IP &&
+ dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ target_container);
+ key_addrs->v4addrs.src = flow_keys->ipv4_src;
+ key_addrs->v4addrs.dst = flow_keys->ipv4_dst;
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ } else if (flow_keys->addr_proto == ETH_P_IPV6 &&
+ dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ target_container);
+ memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src,
+ sizeof(key_addrs->v6addrs.src));
+ memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst,
+ sizeof(key_addrs->v6addrs.dst));
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ }
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) {
+ key_ports = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS,
+ target_container);
+ key_ports->src = flow_keys->sport;
+ key_ports->dst = flow_keys->dport;
+ }
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE)) {
+ key_ports_range = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PORTS_RANGE,
+ target_container);
+ key_ports_range->tp.src = flow_keys->sport;
+ key_ports_range->tp.dst = flow_keys->dport;
+ }
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
+ key_tags = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL,
+ target_container);
+ key_tags->flow_label = ntohl(flow_keys->flow_label);
+ }
+}
+
+u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
+ __be16 proto, int nhoff, int hlen, unsigned int flags)
+{
+ struct bpf_flow_keys *flow_keys = ctx->flow_keys;
+ u32 result;
+
+ /* Pass parameters to the BPF program */
+ memset(flow_keys, 0, sizeof(*flow_keys));
+ flow_keys->n_proto = proto;
+ flow_keys->nhoff = nhoff;
+ flow_keys->thoff = flow_keys->nhoff;
+
+ BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG !=
+ (int)FLOW_DISSECTOR_F_PARSE_1ST_FRAG);
+ BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL !=
+ (int)FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+ BUILD_BUG_ON((int)BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP !=
+ (int)FLOW_DISSECTOR_F_STOP_AT_ENCAP);
+ flow_keys->flags = flags;
+
+ result = bpf_prog_run_pin_on_cpu(prog, ctx);
+
+ flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen);
+ flow_keys->thoff = clamp_t(u16, flow_keys->thoff,
+ flow_keys->nhoff, hlen);
+
+ return result;
+}
+
+static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr)
+{
+ return hdr->ver == 1 && hdr->type == 1 && hdr->code == 0;
+}
+
/**
* __skb_flow_dissect - extract the flow_keys struct and return it
+ * @net: associated network namespace, derived from @skb if NULL
* @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
* @flow_dissector: list of keys to dissect
* @target_container: target structure to put dissected values into
@@ -597,6 +1045,8 @@ static bool skb_flow_dissect_allowed(int *num_hdrs)
* @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
* @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
* @hlen: packet header length, if @data is NULL use skb_headlen(skb)
+ * @flags: flags that control the dissection process, e.g.
+ * FLOW_DISSECTOR_F_STOP_AT_ENCAP.
*
* The function will try to retrieve individual keys into target specified
* by flow_dissector from either the skbuff or a raw buffer specified by the
@@ -604,21 +1054,21 @@ static bool skb_flow_dissect_allowed(int *num_hdrs)
*
* Caller must take care of zeroing target container memory.
*/
-bool __skb_flow_dissect(const struct sk_buff *skb,
+bool __skb_flow_dissect(const struct net *net,
+ const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
- void *target_container,
- void *data, __be16 proto, int nhoff, int hlen,
- unsigned int flags)
+ void *target_container, const void *data,
+ __be16 proto, int nhoff, int hlen, unsigned int flags)
{
struct flow_dissector_key_control *key_control;
struct flow_dissector_key_basic *key_basic;
struct flow_dissector_key_addrs *key_addrs;
- struct flow_dissector_key_ports *key_ports;
- struct flow_dissector_key_icmp *key_icmp;
struct flow_dissector_key_tags *key_tags;
struct flow_dissector_key_vlan *key_vlan;
enum flow_dissect_ret fdret;
enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX;
+ bool mpls_el = false;
+ int mpls_lse = 0;
int num_hdrs = 0;
u8 ip_proto = 0;
bool ret;
@@ -630,13 +1080,22 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
nhoff = skb_network_offset(skb);
hlen = skb_headlen(skb);
#if IS_ENABLED(CONFIG_NET_DSA)
- if (unlikely(skb->dev && netdev_uses_dsa(skb->dev))) {
+ if (unlikely(skb->dev && netdev_uses_dsa(skb->dev) &&
+ proto == htons(ETH_P_XDSA))) {
+ struct metadata_dst *md_dst = skb_metadata_dst(skb);
const struct dsa_device_ops *ops;
- int offset;
+ int offset = 0;
ops = skb->dev->dsa_ptr->tag_ops;
- if (ops->flow_dissect &&
- !ops->flow_dissect(skb, &proto, &offset)) {
+ /* Only DSA header taggers break flow dissection */
+ if (ops->needed_headroom &&
+ (!md_dst || md_dst->type != METADATA_HW_PORT_MUX)) {
+ if (ops->flow_dissect)
+ ops->flow_dissect(skb, &proto, &offset);
+ else
+ dsa_tag_generic_flow_dissect(skb,
+ &proto,
+ &offset);
hlen -= offset;
nhoff += offset;
}
@@ -658,6 +1117,60 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
FLOW_DISSECTOR_KEY_BASIC,
target_container);
+ rcu_read_lock();
+
+ if (skb) {
+ if (!net) {
+ if (skb->dev)
+ net = dev_net_rcu(skb->dev);
+ else if (skb->sk)
+ net = sock_net(skb->sk);
+ }
+ }
+
+ DEBUG_NET_WARN_ON_ONCE(!net);
+ if (net) {
+ enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR;
+ struct bpf_prog_array *run_array;
+
+ run_array = rcu_dereference(init_net.bpf.run_array[type]);
+ if (!run_array)
+ run_array = rcu_dereference(net->bpf.run_array[type]);
+
+ if (run_array) {
+ struct bpf_flow_keys flow_keys;
+ struct bpf_flow_dissector ctx = {
+ .flow_keys = &flow_keys,
+ .data = data,
+ .data_end = data + hlen,
+ };
+ __be16 n_proto = proto;
+ struct bpf_prog *prog;
+ u32 result;
+
+ if (skb) {
+ ctx.skb = skb;
+ /* we can't use 'proto' in the skb case
+ * because it might be set to skb->vlan_proto
+ * which has been pulled from the data
+ */
+ n_proto = skb->protocol;
+ }
+
+ prog = READ_ONCE(run_array->items[0].prog);
+ result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff,
+ hlen, flags);
+ if (result != BPF_FLOW_DISSECTOR_CONTINUE) {
+ __skb_flow_bpf_to_target(&flow_keys, flow_dissector,
+ target_container);
+ rcu_read_unlock();
+ return result == BPF_OK;
+ }
+ }
+ }
+
+ rcu_read_unlock();
+
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ETH_ADDRS)) {
struct ethhdr *eth = eth_hdr(skb);
@@ -666,7 +1179,17 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
key_eth_addrs = skb_flow_dissector_target(flow_dissector,
FLOW_DISSECTOR_KEY_ETH_ADDRS,
target_container);
- memcpy(key_eth_addrs, &eth->h_dest, sizeof(*key_eth_addrs));
+ memcpy(key_eth_addrs, eth, sizeof(*key_eth_addrs));
+ }
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) {
+ struct flow_dissector_key_num_of_vlans *key_num_of_vlans;
+
+ key_num_of_vlans = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_NUM_OF_VLANS,
+ target_container);
+ key_num_of_vlans->num_of_vlans = 0;
}
proto_again:
@@ -693,11 +1216,16 @@ proto_again:
FLOW_DISSECTOR_KEY_IPV4_ADDRS,
target_container);
- memcpy(&key_addrs->v4addrs, &iph->saddr,
- sizeof(key_addrs->v4addrs));
+ memcpy(&key_addrs->v4addrs.src, &iph->saddr,
+ sizeof(key_addrs->v4addrs.src));
+ memcpy(&key_addrs->v4addrs.dst, &iph->daddr,
+ sizeof(key_addrs->v4addrs.dst));
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
}
+ __skb_flow_dissect_ipv4(skb, flow_dissector,
+ target_container, data, iph);
+
if (ip_is_fragment(iph)) {
key_control->flags |= FLOW_DIS_IS_FRAGMENT;
@@ -714,14 +1242,6 @@ proto_again:
}
}
- __skb_flow_dissect_ipv4(skb, flow_dissector,
- target_container, data, iph);
-
- if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) {
- fdret = FLOW_DISSECT_RET_OUT_GOOD;
- break;
- }
-
break;
}
case htons(ETH_P_IPV6): {
@@ -743,8 +1263,10 @@ proto_again:
FLOW_DISSECTOR_KEY_IPV6_ADDRS,
target_container);
- memcpy(&key_addrs->v6addrs, &iph->saddr,
- sizeof(key_addrs->v6addrs));
+ memcpy(&key_addrs->v6addrs.src, &iph->saddr,
+ sizeof(key_addrs->v6addrs.src));
+ memcpy(&key_addrs->v6addrs.dst, &iph->daddr,
+ sizeof(key_addrs->v6addrs.dst));
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
@@ -770,9 +1292,6 @@ proto_again:
__skb_flow_dissect_ipv6(skb, flow_dissector,
target_container, data, iph);
- if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
- fdret = FLOW_DISSECT_RET_OUT_GOOD;
-
break;
}
case htons(ETH_P_8021AD):
@@ -796,6 +1315,16 @@ proto_again:
nhoff += sizeof(*vlan);
}
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) &&
+ !(key_control->flags & FLOW_DIS_ENCAPSULATION)) {
+ struct flow_dissector_key_num_of_vlans *key_nvs;
+
+ key_nvs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_NUM_OF_VLANS,
+ target_container);
+ key_nvs->num_of_vlans++;
+ }
+
if (dissector_vlan == FLOW_DISSECTOR_KEY_MAX) {
dissector_vlan = FLOW_DISSECTOR_KEY_VLAN;
} else if (dissector_vlan == FLOW_DISSECTOR_KEY_VLAN) {
@@ -812,8 +1341,7 @@ proto_again:
if (!vlan) {
key_vlan->vlan_id = skb_vlan_tag_get_id(skb);
- key_vlan->vlan_priority =
- (skb_vlan_tag_get_prio(skb) >> VLAN_PRIO_SHIFT);
+ key_vlan->vlan_priority = skb_vlan_tag_get_prio(skb);
} else {
key_vlan->vlan_id = ntohs(vlan->h_vlan_TCI) &
VLAN_VID_MASK;
@@ -822,6 +1350,7 @@ proto_again:
VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
}
key_vlan->vlan_tpid = saved_vlan_tpid;
+ key_vlan->vlan_eth_type = proto;
}
fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
@@ -832,27 +1361,60 @@ proto_again:
struct pppoe_hdr hdr;
__be16 proto;
} *hdr, _hdr;
+ u16 ppp_proto;
+
hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
if (!hdr) {
fdret = FLOW_DISSECT_RET_OUT_BAD;
break;
}
- proto = hdr->proto;
- nhoff += PPPOE_SES_HLEN;
- switch (proto) {
- case htons(PPP_IP):
+ if (!is_pppoe_ses_hdr_valid(&hdr->hdr)) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ /* least significant bit of the most significant octet
+ * indicates if protocol field was compressed
+ */
+ ppp_proto = ntohs(hdr->proto);
+ if (ppp_proto & 0x0100) {
+ ppp_proto = ppp_proto >> 8;
+ nhoff += PPPOE_SES_HLEN - 1;
+ } else {
+ nhoff += PPPOE_SES_HLEN;
+ }
+
+ if (ppp_proto == PPP_IP) {
proto = htons(ETH_P_IP);
fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
- break;
- case htons(PPP_IPV6):
+ } else if (ppp_proto == PPP_IPV6) {
proto = htons(ETH_P_IPV6);
fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
- break;
- default:
+ } else if (ppp_proto == PPP_MPLS_UC) {
+ proto = htons(ETH_P_MPLS_UC);
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ } else if (ppp_proto == PPP_MPLS_MC) {
+ proto = htons(ETH_P_MPLS_MC);
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ } else if (ppp_proto_is_valid(ppp_proto)) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ } else {
fdret = FLOW_DISSECT_RET_OUT_BAD;
break;
}
+
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_PPPOE)) {
+ struct flow_dissector_key_pppoe *key_pppoe;
+
+ key_pppoe = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_PPPOE,
+ target_container);
+ key_pppoe->session_id = hdr->hdr.sid;
+ key_pppoe->ppp_proto = htons(ppp_proto);
+ key_pppoe->type = htons(ETH_P_PPP_SES);
+ }
break;
}
case htons(ETH_P_TIPC): {
@@ -881,7 +1443,10 @@ proto_again:
case htons(ETH_P_MPLS_MC):
fdret = __skb_flow_dissect_mpls(skb, flow_dissector,
target_container, data,
- nhoff, hlen);
+ nhoff, hlen, mpls_lse,
+ &mpls_el);
+ nhoff += sizeof(struct mpls_label);
+ mpls_lse++;
break;
case htons(ETH_P_FCOE):
if ((hlen - nhoff) < FCOE_HEADER_LEN) {
@@ -905,6 +1470,44 @@ proto_again:
&proto, &nhoff, hlen, flags);
break;
+ case htons(ETH_P_1588): {
+ struct ptp_header *hdr, _hdr;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data,
+ hlen, &_hdr);
+ if (!hdr) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ nhoff += sizeof(struct ptp_header);
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
+ case htons(ETH_P_PRP):
+ case htons(ETH_P_HSR): {
+ struct hsr_tag *hdr, _hdr;
+
+ hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen,
+ &_hdr);
+ if (!hdr) {
+ fdret = FLOW_DISSECT_RET_OUT_BAD;
+ break;
+ }
+
+ proto = hdr->encap_proto;
+ nhoff += HSR_HLEN;
+ fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
+ break;
+ }
+
+ case htons(ETH_P_CFM):
+ fdret = __skb_flow_dissect_cfm(skb, flow_dissector,
+ target_container, data,
+ nhoff, hlen);
+ break;
+
default:
fdret = FLOW_DISSECT_RET_OUT_BAD;
break;
@@ -931,6 +1534,11 @@ ip_proto_again:
switch (ip_proto) {
case IPPROTO_GRE:
+ if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector,
target_container, data,
&proto, &nhoff, &hlen, flags);
@@ -988,6 +1596,11 @@ ip_proto_again:
break;
}
case IPPROTO_IPIP:
+ if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
proto = htons(ETH_P_IP);
key_control->flags |= FLOW_DIS_ENCAPSULATION;
@@ -1000,6 +1613,11 @@ ip_proto_again:
break;
case IPPROTO_IPV6:
+ if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) {
+ fdret = FLOW_DISSECT_RET_OUT_GOOD;
+ break;
+ }
+
proto = htons(ETH_P_IPV6);
key_control->flags |= FLOW_DIS_ENCAPSULATION;
@@ -1022,26 +1640,30 @@ ip_proto_again:
data, nhoff, hlen);
break;
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6:
+ __skb_flow_dissect_icmp(skb, flow_dissector, target_container,
+ data, nhoff, hlen);
+ break;
+ case IPPROTO_L2TP:
+ __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container,
+ data, nhoff, hlen);
+ break;
+ case IPPROTO_ESP:
+ __skb_flow_dissect_esp(skb, flow_dissector, target_container,
+ data, nhoff, hlen);
+ break;
+ case IPPROTO_AH:
+ __skb_flow_dissect_ah(skb, flow_dissector, target_container,
+ data, nhoff, hlen);
+ break;
default:
break;
}
- if (dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_PORTS)) {
- key_ports = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_PORTS,
- target_container);
- key_ports->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
- data, hlen);
- }
-
- if (dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_ICMP)) {
- key_icmp = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_ICMP,
- target_container);
- key_icmp->icmp = skb_flow_get_be16(skb, nhoff, data, hlen);
- }
+ if (!(key_control->flags & FLOW_DIS_IS_FRAGMENT))
+ __skb_flow_dissect_ports(skb, flow_dissector, target_container,
+ data, nhoff, ip_proto, hlen);
/* Process result of IP proto processing */
switch (fdret) {
@@ -1077,32 +1699,23 @@ out_bad:
}
EXPORT_SYMBOL(__skb_flow_dissect);
-static u32 hashrnd __read_mostly;
+static siphash_aligned_key_t hashrnd;
static __always_inline void __flow_hash_secret_init(void)
{
net_get_random_once(&hashrnd, sizeof(hashrnd));
}
-static __always_inline u32 __flow_hash_words(const u32 *words, u32 length,
- u32 keyval)
+static const void *flow_keys_hash_start(const struct flow_keys *flow)
{
- return jhash2(words, length, keyval);
-}
-
-static inline const u32 *flow_keys_hash_start(const struct flow_keys *flow)
-{
- const void *p = flow;
-
- BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32));
- return (const u32 *)(p + FLOW_KEYS_HASH_OFFSET);
+ BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % SIPHASH_ALIGNMENT);
+ return &flow->FLOW_KEYS_HASH_START_FIELD;
}
static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
{
size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs);
+
BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32));
- BUILD_BUG_ON(offsetof(typeof(*flow), addrs) !=
- sizeof(*flow) - sizeof(flow->addrs));
switch (flow->control.addr_type) {
case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
@@ -1115,7 +1728,7 @@ static inline size_t flow_keys_hash_length(const struct flow_keys *flow)
diff -= sizeof(flow->addrs.tipckey);
break;
}
- return (sizeof(*flow) - diff) / sizeof(u32);
+ return sizeof(*flow) - diff;
}
__be32 flow_get_u32_src(const struct flow_keys *flow)
@@ -1148,19 +1761,21 @@ __be32 flow_get_u32_dst(const struct flow_keys *flow)
}
EXPORT_SYMBOL(flow_get_u32_dst);
+/* Sort the source and destination IP and the ports,
+ * to have consistent hash within the two directions
+ */
static inline void __flow_hash_consistentify(struct flow_keys *keys)
{
int addr_diff, i;
switch (keys->control.addr_type) {
case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
- addr_diff = (__force u32)keys->addrs.v4addrs.dst -
- (__force u32)keys->addrs.v4addrs.src;
- if ((addr_diff < 0) ||
- (addr_diff == 0 &&
- ((__force u16)keys->ports.dst <
- (__force u16)keys->ports.src))) {
+ if ((__force u32)keys->addrs.v4addrs.dst <
+ (__force u32)keys->addrs.v4addrs.src)
swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst);
+
+ if ((__force u16)keys->ports.dst <
+ (__force u16)keys->ports.src) {
swap(keys->ports.src, keys->ports.dst);
}
break;
@@ -1168,27 +1783,28 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys)
addr_diff = memcmp(&keys->addrs.v6addrs.dst,
&keys->addrs.v6addrs.src,
sizeof(keys->addrs.v6addrs.dst));
- if ((addr_diff < 0) ||
- (addr_diff == 0 &&
- ((__force u16)keys->ports.dst <
- (__force u16)keys->ports.src))) {
+ if (addr_diff < 0) {
for (i = 0; i < 4; i++)
swap(keys->addrs.v6addrs.src.s6_addr32[i],
keys->addrs.v6addrs.dst.s6_addr32[i]);
+ }
+ if ((__force u16)keys->ports.dst <
+ (__force u16)keys->ports.src) {
swap(keys->ports.src, keys->ports.dst);
}
break;
}
}
-static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
+static inline u32 __flow_hash_from_keys(struct flow_keys *keys,
+ const siphash_key_t *keyval)
{
u32 hash;
__flow_hash_consistentify(keys);
- hash = __flow_hash_words(flow_keys_hash_start(keys),
- flow_keys_hash_length(keys), keyval);
+ hash = siphash(flow_keys_hash_start(keys),
+ flow_keys_hash_length(keys), keyval);
if (!hash)
hash = 1;
@@ -1198,12 +1814,20 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
u32 flow_hash_from_keys(struct flow_keys *keys)
{
__flow_hash_secret_init();
- return __flow_hash_from_keys(keys, hashrnd);
+ return __flow_hash_from_keys(keys, &hashrnd);
}
EXPORT_SYMBOL(flow_hash_from_keys);
+/* Hash @keys with the caller-supplied siphash key @keyval instead of the
+ * file-local hashrnd secret.
+ *
+ * @keys is not const: __flow_hash_from_keys() runs
+ * __flow_hash_consistentify() on it before hashing, which may swap the
+ * source/destination addresses and ports in place. The returned hash is
+ * never 0 (a computed 0 is replaced by 1).
+ */
+u32 flow_hash_from_keys_seed(struct flow_keys *keys,
+ const siphash_key_t *keyval)
+{
+ return __flow_hash_from_keys(keys, keyval);
+}
+EXPORT_SYMBOL(flow_hash_from_keys_seed);
+
static inline u32 ___skb_get_hash(const struct sk_buff *skb,
- struct flow_keys *keys, u32 keyval)
+ struct flow_keys *keys,
+ const siphash_key_t *keyval)
{
skb_flow_dissect_flow_keys(skb, keys,
FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
@@ -1240,23 +1864,23 @@ EXPORT_SYMBOL(make_flow_keys_digest);
static struct flow_dissector flow_keys_dissector_symmetric __read_mostly;
-u32 __skb_get_hash_symmetric(const struct sk_buff *skb)
+u32 __skb_get_hash_symmetric_net(const struct net *net, const struct sk_buff *skb)
{
struct flow_keys keys;
__flow_hash_secret_init();
memset(&keys, 0, sizeof(keys));
- __skb_flow_dissect(skb, &flow_keys_dissector_symmetric, &keys,
- NULL, 0, 0, 0,
- FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+ __skb_flow_dissect(net, skb, &flow_keys_dissector_symmetric,
+ &keys, NULL, 0, 0, 0, 0);
- return __flow_hash_from_keys(&keys, hashrnd);
+ return __flow_hash_from_keys(&keys, &hashrnd);
}
-EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric);
+EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric_net);
/**
- * __skb_get_hash: calculate a flow hash
+ * __skb_get_hash_net: calculate a flow hash
+ * @net: associated network namespace, derived from @skb if NULL
* @skb: sk_buff to calculate flow hash from
*
* This function calculates a flow hash based on src/dst addresses
@@ -1264,20 +1888,27 @@ EXPORT_SYMBOL_GPL(__skb_get_hash_symmetric);
* on success, zero indicates no valid hash. Also, sets l4_hash in skb
* if hash is a canonical 4-tuple hash over transport ports.
*/
-void __skb_get_hash(struct sk_buff *skb)
+void __skb_get_hash_net(const struct net *net, struct sk_buff *skb)
{
struct flow_keys keys;
u32 hash;
+ memset(&keys, 0, sizeof(keys));
+
+ __skb_flow_dissect(net, skb, &flow_keys_dissector,
+ &keys, NULL, 0, 0, 0,
+ FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
+
__flow_hash_secret_init();
- hash = ___skb_get_hash(skb, &keys, hashrnd);
+ hash = __flow_hash_from_keys(&keys, &hashrnd);
__skb_set_sw_hash(skb, hash, flow_keys_have_l4(&keys));
}
-EXPORT_SYMBOL(__skb_get_hash);
+EXPORT_SYMBOL(__skb_get_hash_net);
-__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
+__u32 skb_get_hash_perturb(const struct sk_buff *skb,
+ const siphash_key_t *perturb)
{
struct flow_keys keys;
@@ -1285,7 +1916,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
}
EXPORT_SYMBOL(skb_get_hash_perturb);
-u32 __skb_get_poff(const struct sk_buff *skb, void *data,
+u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
const struct flow_keys_basic *keys, int hlen)
{
u32 poff = keys->control.thoff;
@@ -1349,7 +1980,8 @@ u32 skb_get_poff(const struct sk_buff *skb)
{
struct flow_keys_basic keys;
- if (!skb_flow_dissect_flow_keys_basic(skb, &keys, NULL, 0, 0, 0, 0))
+ if (!skb_flow_dissect_flow_keys_basic(NULL, skb, &keys,
+ NULL, 0, 0, 0, 0))
return 0;
return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
@@ -1466,5 +2098,4 @@ static int __init init_default_flow_dissectors(void)
ARRAY_SIZE(flow_keys_basic_dissector_keys));
return 0;
}
-
core_initcall(init_default_flow_dissectors);
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
new file mode 100644
index 000000000000..bc5169482710
--- /dev/null
+++ b/net/core/flow_offload.c
@@ -0,0 +1,638 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <net/act_api.h>
+#include <net/flow_offload.h>
+#include <linux/rtnetlink.h>
+#include <linux/mutex.h>
+#include <linux/rhashtable.h>
+
+/* Allocate a zeroed flow_rule with room for @num_actions action entries.
+ *
+ * Every entry's hw_stats is pre-set to FLOW_ACTION_HW_STATS_DONT_CARE.
+ * Returns the new rule, or NULL if the allocation fails.
+ */
+struct flow_rule *flow_rule_alloc(unsigned int num_actions)
+{
+ struct flow_rule *rule;
+ unsigned int i; /* same type as num_actions: no signed/unsigned compare */
+
+ rule = kzalloc(struct_size(rule, action.entries, num_actions),
+ GFP_KERNEL);
+ if (!rule)
+ return NULL;
+
+ rule->action.num_entries = num_actions;
+ /* Pre-fill each action hw_stats with DONT_CARE.
+ * Caller can override this if it wants stats for a given action.
+ */
+ for (i = 0; i < num_actions; i++)
+ rule->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE;
+
+ return rule;
+}
+EXPORT_SYMBOL(flow_rule_alloc);
+
+/* Allocate a zeroed flow_offload_action with room for @num_actions
+ * action entries.
+ *
+ * Every entry's hw_stats is pre-set to FLOW_ACTION_HW_STATS_DONT_CARE.
+ * Returns the new object, or NULL if the allocation fails.
+ */
+struct flow_offload_action *offload_action_alloc(unsigned int num_actions)
+{
+ struct flow_offload_action *fl_action;
+ unsigned int i; /* same type as num_actions: no signed/unsigned compare */
+
+ fl_action = kzalloc(struct_size(fl_action, action.entries, num_actions),
+ GFP_KERNEL);
+ if (!fl_action)
+ return NULL;
+
+ fl_action->action.num_entries = num_actions;
+ /* Pre-fill each action hw_stats with DONT_CARE.
+ * Caller can override this if it wants stats for a given action.
+ */
+ for (i = 0; i < num_actions; i++)
+ fl_action->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE;
+
+ return fl_action;
+}
+
+/* Resolve the @__type entry of @__rule's match and store pointers to it
+ * in (@__out)->key and (@__out)->mask, using skb_flow_dissector_target()
+ * on the match's key and mask buffers.
+ *
+ * The expansion declares the locals __m and __d and is then followed by
+ * two assignment statements, so it is not a single expression and can
+ * only be used once per scope.
+ */
+#define FLOW_DISSECTOR_MATCH(__rule, __type, __out) \
+ const struct flow_match *__m = &(__rule)->match; \
+ struct flow_dissector *__d = (__m)->dissector; \
+ \
+ (__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key); \
+ (__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask); \
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_META entry of
+ * @rule's match key and mask.
+ */
+void flow_rule_match_meta(const struct flow_rule *rule,
+ struct flow_match_meta *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_META, out);
+}
+EXPORT_SYMBOL(flow_rule_match_meta);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_BASIC entry of
+ * @rule's match key and mask.
+ */
+void flow_rule_match_basic(const struct flow_rule *rule,
+ struct flow_match_basic *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_BASIC, out);
+}
+EXPORT_SYMBOL(flow_rule_match_basic);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_CONTROL entry
+ * of @rule's match key and mask.
+ */
+void flow_rule_match_control(const struct flow_rule *rule,
+ struct flow_match_control *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CONTROL, out);
+}
+EXPORT_SYMBOL(flow_rule_match_control);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_ETH_ADDRS entry
+ * of @rule's match key and mask.
+ */
+void flow_rule_match_eth_addrs(const struct flow_rule *rule,
+ struct flow_match_eth_addrs *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_eth_addrs);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_VLAN entry of
+ * @rule's match key and mask.
+ */
+void flow_rule_match_vlan(const struct flow_rule *rule,
+ struct flow_match_vlan *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_VLAN, out);
+}
+EXPORT_SYMBOL(flow_rule_match_vlan);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_CVLAN entry of
+ * @rule's match key and mask. Uses the same output type as
+ * flow_rule_match_vlan().
+ */
+void flow_rule_match_cvlan(const struct flow_rule *rule,
+ struct flow_match_vlan *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CVLAN, out);
+}
+EXPORT_SYMBOL(flow_rule_match_cvlan);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_ARP entry of
+ * @rule's match key and mask.
+ */
+void flow_rule_match_arp(const struct flow_rule *rule,
+ struct flow_match_arp *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ARP, out);
+}
+EXPORT_SYMBOL(flow_rule_match_arp);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_IPV4_ADDRS
+ * entry of @rule's match key and mask.
+ */
+void flow_rule_match_ipv4_addrs(const struct flow_rule *rule,
+ struct flow_match_ipv4_addrs *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV4_ADDRS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ipv4_addrs);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_IPV6_ADDRS
+ * entry of @rule's match key and mask.
+ */
+void flow_rule_match_ipv6_addrs(const struct flow_rule *rule,
+ struct flow_match_ipv6_addrs *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV6_ADDRS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ipv6_addrs);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_IP entry of
+ * @rule's match key and mask.
+ */
+void flow_rule_match_ip(const struct flow_rule *rule,
+ struct flow_match_ip *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IP, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ip);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_PORTS entry of
+ * @rule's match key and mask.
+ */
+void flow_rule_match_ports(const struct flow_rule *rule,
+ struct flow_match_ports *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ports);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_PORTS_RANGE
+ * entry of @rule's match key and mask.
+ */
+void flow_rule_match_ports_range(const struct flow_rule *rule,
+ struct flow_match_ports_range *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS_RANGE, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ports_range);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_TCP entry of
+ * @rule's match key and mask.
+ */
+void flow_rule_match_tcp(const struct flow_rule *rule,
+ struct flow_match_tcp *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_TCP, out);
+}
+EXPORT_SYMBOL(flow_rule_match_tcp);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_IPSEC entry of
+ * @rule's match key and mask.
+ */
+void flow_rule_match_ipsec(const struct flow_rule *rule,
+ struct flow_match_ipsec *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPSEC, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ipsec);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_ICMP entry of
+ * @rule's match key and mask.
+ */
+void flow_rule_match_icmp(const struct flow_rule *rule,
+ struct flow_match_icmp *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ICMP, out);
+}
+EXPORT_SYMBOL(flow_rule_match_icmp);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_MPLS entry of
+ * @rule's match key and mask.
+ */
+void flow_rule_match_mpls(const struct flow_rule *rule,
+ struct flow_match_mpls *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_MPLS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_mpls);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_ENC_CONTROL
+ * entry of @rule's match key and mask. Uses the same output type as
+ * flow_rule_match_control().
+ */
+void flow_rule_match_enc_control(const struct flow_rule *rule,
+ struct flow_match_control *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL, out);
+}
+EXPORT_SYMBOL(flow_rule_match_enc_control);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS
+ * entry of @rule's match key and mask. Uses the same output type as
+ * flow_rule_match_ipv4_addrs().
+ */
+void flow_rule_match_enc_ipv4_addrs(const struct flow_rule *rule,
+ struct flow_match_ipv4_addrs *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_enc_ipv4_addrs);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS
+ * entry of @rule's match key and mask. Uses the same output type as
+ * flow_rule_match_ipv6_addrs().
+ */
+void flow_rule_match_enc_ipv6_addrs(const struct flow_rule *rule,
+ struct flow_match_ipv6_addrs *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_enc_ipv6_addrs);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_ENC_IP entry of
+ * @rule's match key and mask. Uses the same output type as
+ * flow_rule_match_ip().
+ */
+void flow_rule_match_enc_ip(const struct flow_rule *rule,
+ struct flow_match_ip *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IP, out);
+}
+EXPORT_SYMBOL(flow_rule_match_enc_ip);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_ENC_PORTS entry
+ * of @rule's match key and mask. Uses the same output type as
+ * flow_rule_match_ports().
+ */
+void flow_rule_match_enc_ports(const struct flow_rule *rule,
+ struct flow_match_ports *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_PORTS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_enc_ports);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_ENC_KEYID entry
+ * of @rule's match key and mask.
+ */
+void flow_rule_match_enc_keyid(const struct flow_rule *rule,
+ struct flow_match_enc_keyid *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_KEYID, out);
+}
+EXPORT_SYMBOL(flow_rule_match_enc_keyid);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_ENC_OPTS entry
+ * of @rule's match key and mask.
+ */
+void flow_rule_match_enc_opts(const struct flow_rule *rule,
+ struct flow_match_enc_opts *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_OPTS, out);
+}
+EXPORT_SYMBOL(flow_rule_match_enc_opts);
+
+/* Allocate a flow_action_cookie holding a private copy of @data.
+ *
+ * @data: bytes to copy into the cookie
+ * @len: number of bytes to copy; stored in cookie_len
+ * @gfp: allocation flags passed to kmalloc()
+ *
+ * Returns the new cookie, or NULL if the allocation fails. Release it
+ * with flow_action_cookie_destroy().
+ */
+struct flow_action_cookie *flow_action_cookie_create(void *data,
+ unsigned int len,
+ gfp_t gfp)
+{
+ struct flow_action_cookie *cookie;
+
+ /* struct_size() saturates rather than wrapping if header + @len
+ * overflows size_t, so an oversized request fails the allocation
+ * instead of yielding a short buffer for the memcpy() below.
+ */
+ cookie = kmalloc(struct_size(cookie, cookie, len), gfp);
+ if (!cookie)
+ return NULL;
+ cookie->cookie_len = len;
+ memcpy(cookie->cookie, data, len);
+ return cookie;
+}
+EXPORT_SYMBOL(flow_action_cookie_create);
+
+/* Free @cookie with kfree(). */
+void flow_action_cookie_destroy(struct flow_action_cookie *cookie)
+{
+ kfree(cookie);
+}
+EXPORT_SYMBOL(flow_action_cookie_destroy);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_CT entry of
+ * @rule's match key and mask.
+ */
+void flow_rule_match_ct(const struct flow_rule *rule,
+ struct flow_match_ct *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CT, out);
+}
+EXPORT_SYMBOL(flow_rule_match_ct);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_PPPOE entry of
+ * @rule's match key and mask.
+ */
+void flow_rule_match_pppoe(const struct flow_rule *rule,
+ struct flow_match_pppoe *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PPPOE, out);
+}
+EXPORT_SYMBOL(flow_rule_match_pppoe);
+
+/* Point @out->key and @out->mask at the FLOW_DISSECTOR_KEY_L2TPV3 entry of
+ * @rule's match key and mask.
+ */
+void flow_rule_match_l2tpv3(const struct flow_rule *rule,
+ struct flow_match_l2tpv3 *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_L2TPV3, out);
+}
+EXPORT_SYMBOL(flow_rule_match_l2tpv3);
+
+/* Allocate a zeroed flow_block_cb and record the callback in it.
+ *
+ * @cb: setup callback
+ * @cb_ident: identity stored alongside @cb; lookups compare both
+ * @cb_priv: private pointer, later passed to @release
+ * @release: optional destructor invoked by flow_block_cb_free()
+ *
+ * Returns the new object, or ERR_PTR(-ENOMEM) on allocation failure
+ * (never NULL, so callers must test with IS_ERR()).
+ */
+struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb,
+ void *cb_ident, void *cb_priv,
+ void (*release)(void *cb_priv))
+{
+ struct flow_block_cb *block_cb;
+
+ block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
+ if (!block_cb)
+ return ERR_PTR(-ENOMEM);
+
+ block_cb->cb = cb;
+ block_cb->cb_ident = cb_ident;
+ block_cb->cb_priv = cb_priv;
+ block_cb->release = release;
+
+ return block_cb;
+}
+EXPORT_SYMBOL(flow_block_cb_alloc);
+
+/* Destroy @block_cb: call its release hook on cb_priv if one was set,
+ * then free the object. @block_cb must not be NULL (it is dereferenced
+ * unconditionally).
+ */
+void flow_block_cb_free(struct flow_block_cb *block_cb)
+{
+ if (block_cb->release)
+ block_cb->release(block_cb->cb_priv);
+
+ kfree(block_cb);
+}
+EXPORT_SYMBOL(flow_block_cb_free);
+
+/* Search @block's cb_list for the callback registered with the pair
+ * (@cb, @cb_ident).
+ *
+ * Returns the first matching entry, or NULL if there is none.
+ */
+struct flow_block_cb *flow_block_cb_lookup(struct flow_block *block,
+ flow_setup_cb_t *cb, void *cb_ident)
+{
+ struct flow_block_cb *entry;
+
+ list_for_each_entry(entry, &block->cb_list, list) {
+ /* Both the callback and its identity have to match. */
+ if (entry->cb != cb || entry->cb_ident != cb_ident)
+ continue;
+
+ return entry;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(flow_block_cb_lookup);
+
+/* Return the cb_priv pointer stored in @block_cb. */
+void *flow_block_cb_priv(struct flow_block_cb *block_cb)
+{
+ return block_cb->cb_priv;
+}
+EXPORT_SYMBOL(flow_block_cb_priv);
+
+/* Increment @block_cb's reference count.
+ *
+ * NOTE(review): this is a plain, non-atomic increment; presumably callers
+ * serialise access to @block_cb -- confirm against the call sites.
+ */
+void flow_block_cb_incref(struct flow_block_cb *block_cb)
+{
+ block_cb->refcnt++;
+}
+EXPORT_SYMBOL(flow_block_cb_incref);
+
+/* Decrement @block_cb's reference count and return the new value.
+ *
+ * Nothing is freed here; the caller decides what to do when the result
+ * is zero.
+ * NOTE(review): plain, non-atomic decrement with no underflow check;
+ * presumably callers serialise access -- confirm against the call sites.
+ */
+unsigned int flow_block_cb_decref(struct flow_block_cb *block_cb)
+{
+ return --block_cb->refcnt;
+}
+EXPORT_SYMBOL(flow_block_cb_decref);
+
+/* Report whether the pair (@cb, @cb_ident) is already present on the
+ * driver's own list.
+ *
+ * Walks @driver_block_list through each entry's driver_list link and
+ * returns true on the first entry whose cb and cb_ident both match,
+ * false if no entry matches.
+ */
+bool flow_block_cb_is_busy(flow_setup_cb_t *cb, void *cb_ident,
+ struct list_head *driver_block_list)
+{
+ struct flow_block_cb *entry;
+
+ list_for_each_entry(entry, driver_block_list, driver_list) {
+ if (entry->cb != cb || entry->cb_ident != cb_ident)
+ continue;
+
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL(flow_block_cb_is_busy);
+
+/* Generic bind/unbind handler for drivers with a single block callback.
+ *
+ * @f: block offload request (command, binder type, block)
+ * @driver_block_list: the driver's list of bound callbacks
+ * @cb, @cb_ident, @cb_priv: callback, its identity and its private data
+ * @ingress_only: reject any binder type other than clsact ingress
+ *
+ * FLOW_BLOCK_BIND allocates a flow_block_cb (with no release hook), adds
+ * it to @f and appends it to @driver_block_list. FLOW_BLOCK_UNBIND looks
+ * the callback up in @f->block, removes it from @f and unlinks it from
+ * the driver list.
+ *
+ * Returns 0 on success, -EOPNOTSUPP for a rejected binder type or an
+ * unknown command, -EBUSY if (@cb, @cb_ident) is already bound, -ENOENT
+ * if there is nothing to unbind, or the allocation error on bind.
+ */
+int flow_block_cb_setup_simple(struct flow_block_offload *f,
+ struct list_head *driver_block_list,
+ flow_setup_cb_t *cb,
+ void *cb_ident, void *cb_priv,
+ bool ingress_only)
+{
+ struct flow_block_cb *block_cb;
+
+ if (ingress_only &&
+ f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+ return -EOPNOTSUPP;
+
+ f->driver_block_list = driver_block_list;
+
+ switch (f->command) {
+ case FLOW_BLOCK_BIND:
+ if (flow_block_cb_is_busy(cb, cb_ident, driver_block_list))
+ return -EBUSY;
+
+ block_cb = flow_block_cb_alloc(cb, cb_ident, cb_priv, NULL);
+ if (IS_ERR(block_cb))
+ return PTR_ERR(block_cb);
+
+ flow_block_cb_add(block_cb, f);
+ list_add_tail(&block_cb->driver_list, driver_block_list);
+ return 0;
+ case FLOW_BLOCK_UNBIND:
+ block_cb = flow_block_cb_lookup(f->block, cb, cb_ident);
+ if (!block_cb)
+ return -ENOENT;
+
+ flow_block_cb_remove(block_cb, f);
+ list_del(&block_cb->driver_list);
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+EXPORT_SYMBOL(flow_block_cb_setup_simple);
+
+/* Held by flow_indr_dev_register(), flow_indr_dev_unregister() and
+ * flow_indr_dev_setup_offload() while they walk or modify the lists below.
+ */
+static DEFINE_MUTEX(flow_indr_block_lock);
+/* struct flow_block_cb entries, linked through indr.list */
+static LIST_HEAD(flow_block_indr_list);
+/* registered struct flow_indr_dev entries */
+static LIST_HEAD(flow_block_indr_dev_list);
+/* struct flow_indir_dev_info entries, one per tracked bind request */
+static LIST_HEAD(flow_indir_dev_list);
+
+/* One registered indirect block-bind callback. */
+struct flow_indr_dev {
+ struct list_head list; /* link in flow_block_indr_dev_list */
+ flow_indr_block_bind_cb_t *cb; /* callback to invoke on setup */
+ void *cb_priv; /* passed to @cb; with @cb, identifies the entry */
+ refcount_t refcnt; /* number of registrations of this (cb, cb_priv) */
+};
+
+/* Allocate a flow_indr_dev for (@cb, @cb_priv) with its refcount set to 1.
+ *
+ * The list link is left uninitialised (kmalloc, not kzalloc); the caller
+ * is expected to add the entry to a list. Returns NULL on allocation
+ * failure.
+ */
+static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb,
+ void *cb_priv)
+{
+ struct flow_indr_dev *indr_dev;
+
+ indr_dev = kmalloc(sizeof(*indr_dev), GFP_KERNEL);
+ if (!indr_dev)
+ return NULL;
+
+ indr_dev->cb = cb;
+ indr_dev->cb_priv = cb_priv;
+ refcount_set(&indr_dev->refcnt, 1);
+
+ return indr_dev;
+}
+
+/* Record of one bind request, kept so that it can be replayed to indirect
+ * callbacks that register later (see existing_qdiscs_register()).
+ */
+struct flow_indir_dev_info {
+ void *data; /* lookup key used by find_indir_dev() */
+ struct net_device *dev; /* replayed to the callback */
+ struct Qdisc *sch; /* replayed to the callback */
+ enum tc_setup_type type; /* replayed to the callback */
+ void (*cleanup)(struct flow_block_cb *block_cb); /* replayed to the callback */
+ struct list_head list; /* link in flow_indir_dev_list */
+ enum flow_block_command command; /* copied from the original request */
+ enum flow_block_binder_type binder_type; /* copied from the original request */
+ struct list_head *cb_list; /* the request's cb_list_head; replayed callbacks are spliced here */
+};
+
+/* Replay every recorded bind request to a newly registered callback.
+ *
+ * For each entry on flow_indir_dev_list a fresh flow_block_offload is
+ * built from the recorded command and binder type, @cb is invoked with
+ * the recorded arguments, and whatever @cb queued on the temporary
+ * cb_list is spliced onto the recorded list. The return value of @cb is
+ * ignored. Called from flow_indr_dev_register() with
+ * flow_indr_block_lock held.
+ */
+static void existing_qdiscs_register(flow_indr_block_bind_cb_t *cb, void *cb_priv)
+{
+ struct flow_block_offload bo;
+ struct flow_indir_dev_info *cur;
+
+ list_for_each_entry(cur, &flow_indir_dev_list, list) {
+ memset(&bo, 0, sizeof(bo));
+ bo.command = cur->command;
+ bo.binder_type = cur->binder_type;
+ INIT_LIST_HEAD(&bo.cb_list);
+ cb(cur->dev, cur->sch, cb_priv, cur->type, &bo, cur->data, cur->cleanup);
+ list_splice(&bo.cb_list, cur->cb_list);
+ }
+}
+
+/* Register an indirect block-bind callback.
+ *
+ * If (@cb, @cb_priv) is already registered, only its reference count is
+ * incremented. Otherwise a new entry is allocated and added to the list,
+ * the recorded bind requests are replayed to @cb, and after the lock is
+ * dropped tcf_action_reoffload_cb() is called with add == true.
+ *
+ * Returns 0 on success or -ENOMEM if the entry cannot be allocated.
+ */
+int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv)
+{
+ struct flow_indr_dev *indr_dev;
+
+ mutex_lock(&flow_indr_block_lock);
+ list_for_each_entry(indr_dev, &flow_block_indr_dev_list, list) {
+ if (indr_dev->cb == cb &&
+ indr_dev->cb_priv == cb_priv) {
+ refcount_inc(&indr_dev->refcnt);
+ mutex_unlock(&flow_indr_block_lock);
+ return 0;
+ }
+ }
+
+ indr_dev = flow_indr_dev_alloc(cb, cb_priv);
+ if (!indr_dev) {
+ mutex_unlock(&flow_indr_block_lock);
+ return -ENOMEM;
+ }
+
+ list_add(&indr_dev->list, &flow_block_indr_dev_list);
+ existing_qdiscs_register(cb, cb_priv);
+ mutex_unlock(&flow_indr_block_lock);
+
+ /* done outside flow_indr_block_lock */
+ tcf_action_reoffload_cb(cb, cb_priv, true);
+
+ return 0;
+}
+EXPORT_SYMBOL(flow_indr_dev_register);
+
+/* Move every entry of flow_block_indr_list whose release hook equals
+ * @release and whose indr.cb_priv equals @cb_priv onto @cleanup_list.
+ *
+ * Nothing is freed or notified here; see flow_block_indr_notify(). The
+ * only caller visible here holds flow_indr_block_lock.
+ */
+static void __flow_block_indr_cleanup(void (*release)(void *cb_priv),
+ void *cb_priv,
+ struct list_head *cleanup_list)
+{
+ struct flow_block_cb *this, *next;
+
+ list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) {
+ if (this->release == release &&
+ this->indr.cb_priv == cb_priv)
+ list_move(&this->indr.list, cleanup_list);
+ }
+}
+
+/* Unlink each entry from @cleanup_list and call its indr.cleanup hook.
+ *
+ * The hook is called unconditionally, so every entry on the list must
+ * have a non-NULL indr.cleanup.
+ */
+static void flow_block_indr_notify(struct list_head *cleanup_list)
+{
+ struct flow_block_cb *this, *next;
+
+ list_for_each_entry_safe(this, next, cleanup_list, indr.list) {
+ list_del(&this->indr.list);
+ this->indr.cleanup(this);
+ }
+}
+
+/* Drop one registration of the indirect callback (@cb, @cb_priv).
+ *
+ * If the matching entry's reference count does not reach zero, or no
+ * entry matches, the function returns without further action. On the
+ * last reference the entry is unlinked, the flow_block_cb entries whose
+ * release hook is @release and whose indr.cb_priv is @cb_priv are
+ * collected under the lock, and then, with the lock dropped,
+ * tcf_action_reoffload_cb() is called with add == false, each collected
+ * entry's cleanup hook runs, and the entry is freed.
+ */
+void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv,
+ void (*release)(void *cb_priv))
+{
+ struct flow_indr_dev *this, *next, *indr_dev = NULL;
+ LIST_HEAD(cleanup_list);
+
+ mutex_lock(&flow_indr_block_lock);
+ list_for_each_entry_safe(this, next, &flow_block_indr_dev_list, list) {
+ /* the refcount is only decremented once cb and cb_priv match */
+ if (this->cb == cb &&
+ this->cb_priv == cb_priv &&
+ refcount_dec_and_test(&this->refcnt)) {
+ indr_dev = this;
+ list_del(&indr_dev->list);
+ break;
+ }
+ }
+
+ if (!indr_dev) {
+ mutex_unlock(&flow_indr_block_lock);
+ return;
+ }
+
+ __flow_block_indr_cleanup(release, cb_priv, &cleanup_list);
+ mutex_unlock(&flow_indr_block_lock);
+
+ tcf_action_reoffload_cb(cb, cb_priv, false);
+ flow_block_indr_notify(&cleanup_list);
+ kfree(indr_dev);
+}
+EXPORT_SYMBOL(flow_indr_dev_unregister);
+
+/* Fill in the indr member of @flow_block.
+ *
+ * Only the binder type is taken from @bo; @dev, @sch, @data, @cb_priv and
+ * @cleanup are stored as given. indr.list is not touched.
+ */
+static void flow_block_indr_init(struct flow_block_cb *flow_block,
+ struct flow_block_offload *bo,
+ struct net_device *dev, struct Qdisc *sch, void *data,
+ void *cb_priv,
+ void (*cleanup)(struct flow_block_cb *block_cb))
+{
+ flow_block->indr.binder_type = bo->binder_type;
+ flow_block->indr.data = data;
+ flow_block->indr.cb_priv = cb_priv;
+ flow_block->indr.dev = dev;
+ flow_block->indr.sch = sch;
+ flow_block->indr.cleanup = cleanup;
+}
+
+/* Allocate a flow_block_cb for an indirect block and track it.
+ *
+ * The object is created with flow_block_cb_alloc(), its indr member is
+ * filled from @bo, @dev, @sch, @data, @indr_cb_priv and @cleanup, and it
+ * is added to flow_block_indr_list.
+ *
+ * Returns the new object, or the ERR_PTR() from flow_block_cb_alloc()
+ * on failure.
+ * NOTE(review): no lock is taken here; presumably the caller already
+ * holds flow_indr_block_lock -- confirm against the call sites.
+ */
+struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb,
+ void *cb_ident, void *cb_priv,
+ void (*release)(void *cb_priv),
+ struct flow_block_offload *bo,
+ struct net_device *dev,
+ struct Qdisc *sch, void *data,
+ void *indr_cb_priv,
+ void (*cleanup)(struct flow_block_cb *block_cb))
+{
+ struct flow_block_cb *new_cb;
+
+ new_cb = flow_block_cb_alloc(cb, cb_ident, cb_priv, release);
+ if (IS_ERR(new_cb))
+ return new_cb;
+
+ flow_block_indr_init(new_cb, bo, dev, sch, data, indr_cb_priv, cleanup);
+ list_add(&new_cb->indr.list, &flow_block_indr_list);
+
+ return new_cb;
+}
+EXPORT_SYMBOL(flow_indr_block_cb_alloc);
+
+/* Look up the recorded bind request whose data pointer equals @data.
+ *
+ * Returns the first matching entry of flow_indir_dev_list, or NULL if
+ * there is none.
+ */
+static struct flow_indir_dev_info *find_indir_dev(void *data)
+{
+ struct flow_indir_dev_info *entry;
+
+ list_for_each_entry(entry, &flow_indir_dev_list, list) {
+ if (entry->data != data)
+ continue;
+
+ return entry;
+ }
+
+ return NULL;
+}
+
+/* Record a bind request so that it can be replayed later.
+ *
+ * A zeroed flow_indir_dev_info is filled from the arguments (command,
+ * binder type and cb_list_head are copied from @bo) and added to
+ * flow_indir_dev_list.
+ *
+ * Returns 0 on success, -EEXIST if @data is already recorded, or -ENOMEM
+ * on allocation failure.
+ */
+static int indir_dev_add(void *data, struct net_device *dev, struct Qdisc *sch,
+ enum tc_setup_type type, void (*cleanup)(struct flow_block_cb *block_cb),
+ struct flow_block_offload *bo)
+{
+ struct flow_indir_dev_info *info;
+
+ info = find_indir_dev(data);
+ if (info)
+ return -EEXIST;
+
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ info->data = data;
+ info->dev = dev;
+ info->sch = sch;
+ info->type = type;
+ info->cleanup = cleanup;
+ info->command = bo->command;
+ info->binder_type = bo->binder_type;
+ info->cb_list = bo->cb_list_head;
+
+ list_add(&info->list, &flow_indir_dev_list);
+ return 0;
+}
+
+/* Forget the recorded bind request keyed by @data.
+ *
+ * Unlinks the entry from flow_indir_dev_list and frees it. Returns 0 on
+ * success or -ENOENT if @data was not recorded.
+ */
+static int indir_dev_remove(void *data)
+{
+ struct flow_indir_dev_info *info;
+
+ info = find_indir_dev(data);
+ if (!info)
+ return -ENOENT;
+
+ list_del(&info->list);
+
+ kfree(info);
+ return 0;
+}
+
+/* Offer a setup request to every registered indirect callback.
+ *
+ * When @bo is given, a FLOW_BLOCK_BIND request is first recorded for
+ * later replay and a FLOW_BLOCK_UNBIND request drops that record. Each
+ * registered callback is then invoked with the request, all under
+ * flow_indr_block_lock.
+ *
+ * Returns -EOPNOTSUPP if @bo is non-NULL and its cb_list is empty once
+ * the callbacks have run; otherwise the number of callbacks that
+ * returned 0.
+ */
+int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch,
+ enum tc_setup_type type, void *data,
+ struct flow_block_offload *bo,
+ void (*cleanup)(struct flow_block_cb *block_cb))
+{
+ struct flow_indr_dev *this;
+ u32 count = 0;
+ int err;
+
+ mutex_lock(&flow_indr_block_lock);
+ if (bo) {
+ /* Bookkeeping only: the results (-EEXIST, -ENOMEM, -ENOENT)
+ * are not checked here.
+ */
+ if (bo->command == FLOW_BLOCK_BIND)
+ indir_dev_add(data, dev, sch, type, cleanup, bo);
+ else if (bo->command == FLOW_BLOCK_UNBIND)
+ indir_dev_remove(data);
+ }
+
+ list_for_each_entry(this, &flow_block_indr_dev_list, list) {
+ err = this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup);
+ if (!err)
+ count++;
+ }
+
+ mutex_unlock(&flow_indr_block_lock);
+
+ return (bo && list_empty(&bo->cb_list)) ? -EOPNOTSUPP : count;
+}
+EXPORT_SYMBOL(flow_indr_dev_setup_offload);
+
+/* Return true if at least one indirect callback is registered.
+ *
+ * The list is tested without taking flow_indr_block_lock, so the answer
+ * can be stale by the time the caller acts on it.
+ */
+bool flow_indr_dev_exists(void)
+{
+ return !list_empty(&flow_block_indr_dev_list);
+}
+EXPORT_SYMBOL(flow_indr_dev_exists);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index e4e442d70c2d..f112156db587 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/sched/gen_estimator.c Simple rate estimator.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Eric Dumazet <edumazet@google.com>
*
@@ -44,15 +40,15 @@
*/
struct net_rate_estimator {
- struct gnet_stats_basic_packed *bstats;
+ struct gnet_stats_basic_sync *bstats;
spinlock_t *stats_lock;
- seqcount_t *running;
- struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+ bool running;
+ struct gnet_stats_basic_sync __percpu *cpu_bstats;
u8 ewma_log;
u8 intvl_log; /* period : (250ms << intvl_log) */
seqcount_t seq;
- u32 last_packets;
+ u64 last_packets;
u64 last_bytes;
u64 avpps;
@@ -64,13 +60,13 @@ struct net_rate_estimator {
};
static void est_fetch_counters(struct net_rate_estimator *e,
- struct gnet_stats_basic_packed *b)
+ struct gnet_stats_basic_sync *b)
{
- memset(b, 0, sizeof(*b));
+ gnet_stats_basic_sync_init(b);
if (e->stats_lock)
spin_lock(e->stats_lock);
- __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats);
+ gnet_stats_add_basic(b, e->cpu_bstats, e->bstats, e->running);
if (e->stats_lock)
spin_unlock(e->stats_lock);
@@ -79,24 +75,30 @@ static void est_fetch_counters(struct net_rate_estimator *e,
static void est_timer(struct timer_list *t)
{
- struct net_rate_estimator *est = from_timer(est, t, timer);
- struct gnet_stats_basic_packed b;
+ struct net_rate_estimator *est = timer_container_of(est, t, timer);
+ struct gnet_stats_basic_sync b;
+ u64 b_bytes, b_packets;
u64 rate, brate;
est_fetch_counters(est, &b);
- brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log);
- brate -= (est->avbps >> est->ewma_log);
+ b_bytes = u64_stats_read(&b.bytes);
+ b_packets = u64_stats_read(&b.packets);
+
+ brate = (b_bytes - est->last_bytes) << (10 - est->intvl_log);
+ brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log);
- rate = (u64)(b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log);
- rate -= (est->avpps >> est->ewma_log);
+ rate = (b_packets - est->last_packets) << (10 - est->intvl_log);
+ rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log);
+ preempt_disable_nested();
write_seqcount_begin(&est->seq);
est->avbps += brate;
est->avpps += rate;
write_seqcount_end(&est->seq);
+ preempt_enable_nested();
- est->last_bytes = b.bytes;
- est->last_packets = b.packets;
+ est->last_bytes = b_bytes;
+ est->last_packets = b_packets;
est->next_jiffies += ((HZ/4) << est->intvl_log);
@@ -113,7 +115,9 @@ static void est_timer(struct timer_list *t)
* @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
* @lock: lock for statistics and control path
- * @running: qdisc running seqcount
+ * @running: true if @bstats represents a running qdisc, thus @bstats'
+ * internal values might change during basic reads. Only used
+ * if @cpu_bstats is NULL
* @opt: rate estimator configuration TLV
*
* Creates a new rate estimator with &bstats as source and &rate_est
@@ -125,16 +129,16 @@ static void est_timer(struct timer_list *t)
* Returns 0 on success or a negative error code.
*
*/
-int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_basic_cpu __percpu *cpu_bstats,
+int gen_new_estimator(struct gnet_stats_basic_sync *bstats,
+ struct gnet_stats_basic_sync __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
spinlock_t *lock,
- seqcount_t *running,
+ bool running,
struct nlattr *opt)
{
struct gnet_estimator *parm = nla_data(opt);
struct net_rate_estimator *old, *est;
- struct gnet_stats_basic_packed b;
+ struct gnet_stats_basic_sync b;
int intvl_log;
if (nla_len(opt) < sizeof(*parm))
@@ -147,6 +151,9 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
if (parm->interval < -2 || parm->interval > 3)
return -EINVAL;
+ if (parm->ewma_log == 0 || parm->ewma_log >= 31)
+ return -EINVAL;
+
est = kzalloc(sizeof(*est), GFP_KERNEL);
if (!est)
return -ENOBUFS;
@@ -165,14 +172,14 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
est_fetch_counters(est, &b);
if (lock)
local_bh_enable();
- est->last_bytes = b.bytes;
- est->last_packets = b.packets;
+ est->last_bytes = u64_stats_read(&b.bytes);
+ est->last_packets = u64_stats_read(&b.packets);
if (lock)
spin_lock_bh(lock);
old = rcu_dereference_protected(*rate_est, 1);
if (old) {
- del_timer_sync(&old->timer);
+ timer_delete_sync(&old->timer);
est->avbps = old->avbps;
est->avpps = old->avpps;
}
@@ -201,9 +208,9 @@ void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
{
struct net_rate_estimator *est;
- est = xchg((__force struct net_rate_estimator **)rate_est, NULL);
+ est = unrcu_pointer(xchg(rate_est, NULL));
if (est) {
- del_timer_sync(&est->timer);
+ timer_shutdown_sync(&est->timer);
kfree_rcu(est, rcu);
}
}
@@ -215,7 +222,9 @@ EXPORT_SYMBOL(gen_kill_estimator);
* @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
* @lock: lock for statistics and control path
- * @running: qdisc running seqcount (might be NULL)
+ * @running: true if @bstats represents a running qdisc, thus @bstats'
+ * internal values might change during basic reads. Only used
+ * if @cpu_bstats is NULL
* @opt: rate estimator configuration TLV
*
* Replaces the configuration of a rate estimator by calling
@@ -223,11 +232,11 @@ EXPORT_SYMBOL(gen_kill_estimator);
*
* Returns 0 on success or a negative error code.
*/
-int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_basic_cpu __percpu *cpu_bstats,
+int gen_replace_estimator(struct gnet_stats_basic_sync *bstats,
+ struct gnet_stats_basic_sync __percpu *cpu_bstats,
struct net_rate_estimator __rcu **rate_est,
spinlock_t *lock,
- seqcount_t *running, struct nlattr *opt)
+ bool running, struct nlattr *opt)
{
return gen_new_estimator(bstats, cpu_bstats, rate_est,
lock, running, opt);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 188d693cb251..b71ccaec0991 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -1,16 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/core/gen_stats.c
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
* Jamal Hadi Salim
* Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
- * See Documentation/networking/gen_stats.txt
+ * See Documentation/networking/gen_stats.rst
*/
#include <linux/types.h>
@@ -22,7 +18,7 @@
#include <linux/gen_stats.h>
#include <net/netlink.h>
#include <net/gen_stats.h>
-
+#include <net/sch_generic.h>
static inline int
gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr)
@@ -118,56 +114,141 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock,
}
EXPORT_SYMBOL(gnet_stats_start_copy);
-static void
-__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_basic_cpu __percpu *cpu)
+/* Reset @b: set its bytes and packets counters to 0 and initialise its
+ * syncp.
+ *
+ * Must not be inlined, due to u64_stats seqcount_t lockdep key
+ */
+void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b)
+{
+ u64_stats_set(&b->bytes, 0);
+ u64_stats_set(&b->packets, 0);
+ u64_stats_init(&b->syncp);
+}
+EXPORT_SYMBOL(gnet_stats_basic_sync_init);
+
+static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats,
+ struct gnet_stats_basic_sync __percpu *cpu)
{
+ u64 t_bytes = 0, t_packets = 0;
int i;
for_each_possible_cpu(i) {
- struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i);
+ struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i);
unsigned int start;
- u64 bytes;
- u32 packets;
+ u64 bytes, packets;
do {
- start = u64_stats_fetch_begin_irq(&bcpu->syncp);
- bytes = bcpu->bstats.bytes;
- packets = bcpu->bstats.packets;
- } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start));
+ start = u64_stats_fetch_begin(&bcpu->syncp);
+ bytes = u64_stats_read(&bcpu->bytes);
+ packets = u64_stats_read(&bcpu->packets);
+ } while (u64_stats_fetch_retry(&bcpu->syncp, start));
+
+ t_bytes += bytes;
+ t_packets += packets;
+ }
+ _bstats_update(bstats, t_bytes, t_packets);
+}
- bstats->bytes += bytes;
- bstats->packets += packets;
+void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats,
+ struct gnet_stats_basic_sync __percpu *cpu,
+ struct gnet_stats_basic_sync *b, bool running)
+{
+ unsigned int start;
+ u64 bytes = 0;
+ u64 packets = 0;
+
+ WARN_ON_ONCE((cpu || running) && in_hardirq());
+
+ if (cpu) {
+ gnet_stats_add_basic_cpu(bstats, cpu);
+ return;
}
+ do {
+ if (running)
+ start = u64_stats_fetch_begin(&b->syncp);
+ bytes = u64_stats_read(&b->bytes);
+ packets = u64_stats_read(&b->packets);
+ } while (running && u64_stats_fetch_retry(&b->syncp, start));
+
+ _bstats_update(bstats, bytes, packets);
}
+EXPORT_SYMBOL(gnet_stats_add_basic);
-void
-__gnet_stats_copy_basic(const seqcount_t *running,
- struct gnet_stats_basic_packed *bstats,
- struct gnet_stats_basic_cpu __percpu *cpu,
- struct gnet_stats_basic_packed *b)
+static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets,
+ struct gnet_stats_basic_sync __percpu *cpu,
+ struct gnet_stats_basic_sync *b, bool running)
{
- unsigned int seq;
+ unsigned int start;
if (cpu) {
- __gnet_stats_copy_basic_cpu(bstats, cpu);
+ u64 t_bytes = 0, t_packets = 0;
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i);
+ unsigned int start;
+ u64 bytes, packets;
+
+ do {
+ start = u64_stats_fetch_begin(&bcpu->syncp);
+ bytes = u64_stats_read(&bcpu->bytes);
+ packets = u64_stats_read(&bcpu->packets);
+ } while (u64_stats_fetch_retry(&bcpu->syncp, start));
+
+ t_bytes += bytes;
+ t_packets += packets;
+ }
+ *ret_bytes = t_bytes;
+ *ret_packets = t_packets;
return;
}
do {
if (running)
- seq = read_seqcount_begin(running);
- bstats->bytes = b->bytes;
- bstats->packets = b->packets;
- } while (running && read_seqcount_retry(running, seq));
+ start = u64_stats_fetch_begin(&b->syncp);
+ *ret_bytes = u64_stats_read(&b->bytes);
+ *ret_packets = u64_stats_read(&b->packets);
+ } while (running && u64_stats_fetch_retry(&b->syncp, start));
+}
+
+static int
+___gnet_stats_copy_basic(struct gnet_dump *d,
+ struct gnet_stats_basic_sync __percpu *cpu,
+ struct gnet_stats_basic_sync *b,
+ int type, bool running)
+{
+ u64 bstats_bytes, bstats_packets;
+
+ gnet_stats_read_basic(&bstats_bytes, &bstats_packets, cpu, b, running);
+
+ if (d->compat_tc_stats && type == TCA_STATS_BASIC) {
+ d->tc_stats.bytes = bstats_bytes;
+ d->tc_stats.packets = bstats_packets;
+ }
+
+ if (d->tail) {
+ struct gnet_stats_basic sb;
+ int res;
+
+ memset(&sb, 0, sizeof(sb));
+ sb.bytes = bstats_bytes;
+ sb.packets = bstats_packets;
+ res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD);
+ if (res < 0 || sb.packets == bstats_packets)
+ return res;
+ /* emit 64bit stats only if needed */
+ return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats_packets,
+ sizeof(bstats_packets), TCA_STATS_PAD);
+ }
+ return 0;
}
-EXPORT_SYMBOL(__gnet_stats_copy_basic);
/**
* gnet_stats_copy_basic - copy basic statistics into statistic TLV
- * @running: seqcount_t pointer
* @d: dumping handle
* @cpu: copy statistic per cpu
* @b: basic statistics
+ * @running: true if @b represents a running qdisc, thus @b's
+ * internal values might change during basic reads.
+ * Only used if @cpu is NULL
+ *
+ * Context: task; must not be run from IRQ or BH contexts
*
* Appends the basic statistics to the top level TLV created by
* gnet_stats_start_copy().
@@ -176,34 +257,43 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
* if the room in the socket buffer was not sufficient.
*/
int
-gnet_stats_copy_basic(const seqcount_t *running,
- struct gnet_dump *d,
- struct gnet_stats_basic_cpu __percpu *cpu,
- struct gnet_stats_basic_packed *b)
+gnet_stats_copy_basic(struct gnet_dump *d,
+ struct gnet_stats_basic_sync __percpu *cpu,
+ struct gnet_stats_basic_sync *b,
+ bool running)
{
- struct gnet_stats_basic_packed bstats = {0};
-
- __gnet_stats_copy_basic(running, &bstats, cpu, b);
-
- if (d->compat_tc_stats) {
- d->tc_stats.bytes = bstats.bytes;
- d->tc_stats.packets = bstats.packets;
- }
-
- if (d->tail) {
- struct gnet_stats_basic sb;
-
- memset(&sb, 0, sizeof(sb));
- sb.bytes = bstats.bytes;
- sb.packets = bstats.packets;
- return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb),
- TCA_STATS_PAD);
- }
- return 0;
+ return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC, running);
}
EXPORT_SYMBOL(gnet_stats_copy_basic);
/**
+ * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV
+ * @d: dumping handle
+ * @cpu: copy statistic per cpu
+ * @b: basic statistics
+ * @running: true if @b represents a running qdisc, thus @b's
+ * internal values might change during basic reads.
+ * Only used if @cpu is NULL
+ *
+ * Context: task; must not be run from IRQ or BH contexts
+ *
+ * Appends the basic statistics to the top level TLV created by
+ * gnet_stats_start_copy().
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_basic_hw(struct gnet_dump *d,
+ struct gnet_stats_basic_sync __percpu *cpu,
+ struct gnet_stats_basic_sync *b,
+ bool running)
+{
+ return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC_HW, running);
+}
+EXPORT_SYMBOL(gnet_stats_copy_basic_hw);
+
+/**
* gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
* @d: dumping handle
* @rate_est: rate estimator
@@ -247,16 +337,15 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
}
EXPORT_SYMBOL(gnet_stats_copy_rate_est);
-static void
-__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
- const struct gnet_stats_queue __percpu *q)
+static void gnet_stats_add_queue_cpu(struct gnet_stats_queue *qstats,
+ const struct gnet_stats_queue __percpu *q)
{
int i;
for_each_possible_cpu(i) {
const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i);
- qstats->qlen = 0;
+ qstats->qlen += qcpu->qlen;
qstats->backlog += qcpu->backlog;
qstats->drops += qcpu->drops;
qstats->requeues += qcpu->requeues;
@@ -264,24 +353,21 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats,
}
}
-void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats,
- const struct gnet_stats_queue __percpu *cpu,
- const struct gnet_stats_queue *q,
- __u32 qlen)
+void gnet_stats_add_queue(struct gnet_stats_queue *qstats,
+ const struct gnet_stats_queue __percpu *cpu,
+ const struct gnet_stats_queue *q)
{
if (cpu) {
- __gnet_stats_copy_queue_cpu(qstats, cpu);
+ gnet_stats_add_queue_cpu(qstats, cpu);
} else {
- qstats->qlen = q->qlen;
- qstats->backlog = q->backlog;
- qstats->drops = q->drops;
- qstats->requeues = q->requeues;
- qstats->overlimits = q->overlimits;
+ qstats->qlen += q->qlen;
+ qstats->backlog += q->backlog;
+ qstats->drops += q->drops;
+ qstats->requeues += q->requeues;
+ qstats->overlimits += q->overlimits;
}
-
- qstats->qlen = qlen;
}
-EXPORT_SYMBOL(__gnet_stats_copy_queue);
+EXPORT_SYMBOL(gnet_stats_add_queue);
/**
* gnet_stats_copy_queue - copy queue statistics into statistics TLV
@@ -304,7 +390,8 @@ gnet_stats_copy_queue(struct gnet_dump *d,
{
struct gnet_stats_queue qstats = {0};
- __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen);
+ gnet_stats_add_queue(&qstats, cpu_q, q);
+ qstats.qlen = qlen;
if (d->compat_tc_stats) {
d->tc_stats.drops = qstats.drops;
diff --git a/net/core/gro.c b/net/core/gro.c
new file mode 100644
index 000000000000..76f9c3712422
--- /dev/null
+++ b/net/core/gro.c
@@ -0,0 +1,835 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <net/psp.h>
+#include <net/gro.h>
+#include <net/dst_metadata.h>
+#include <net/busy_poll.h>
+#include <trace/events/net.h>
+#include <linux/skbuff_ref.h>
+
+#define MAX_GRO_SKBS 8
+
+static DEFINE_SPINLOCK(offload_lock);
+
+/**
+ * dev_add_offload - register offload handlers
+ * @po: protocol offload declaration
+ *
+ * Add protocol offload handlers to the networking stack. The passed
+ * &packet_offload is linked into kernel lists and may not be freed until
+ * it has been removed from the kernel lists.
+ *
+ * This call does not sleep, therefore it cannot
+ * guarantee that all CPUs that are in the middle of receiving packets
+ * will see the new offload handlers (until the next received packet).
+ */
+void dev_add_offload(struct packet_offload *po)
+{
+ struct packet_offload *elem;
+
+ spin_lock(&offload_lock);
+ list_for_each_entry(elem, &net_hotdata.offload_base, list) {
+ if (po->priority < elem->priority)
+ break;
+ }
+ list_add_rcu(&po->list, elem->list.prev);
+ spin_unlock(&offload_lock);
+}
+EXPORT_SYMBOL(dev_add_offload);
+
+/**
+ * __dev_remove_offload - remove offload handler
+ * @po: packet offload declaration
+ *
+ * Remove a protocol offload handler that was previously added to the
+ * kernel offload handlers by dev_add_offload(). The passed &packet_offload
+ * is removed from the kernel lists, but it is not yet safe to free or
+ * reuse it when this function returns.
+ *
+ * The packet offload might still be in use by receivers
+ * and must not be freed until after all the CPUs have gone
+ * through a quiescent state.
+ */
+static void __dev_remove_offload(struct packet_offload *po)
+{
+ struct list_head *head = &net_hotdata.offload_base;
+ struct packet_offload *po1;
+
+ spin_lock(&offload_lock);
+
+ list_for_each_entry(po1, head, list) {
+ if (po == po1) {
+ list_del_rcu(&po->list);
+ goto out;
+ }
+ }
+
+ pr_warn("dev_remove_offload: %p not found\n", po);
+out:
+ spin_unlock(&offload_lock);
+}
+
+/**
+ * dev_remove_offload - remove packet offload handler
+ * @po: packet offload declaration
+ *
+ * Remove a packet offload handler that was previously added to the kernel
+ * offload handlers by dev_add_offload(). The passed &packet_offload is
+ * removed from the kernel lists and can be freed or reused once this
+ * function returns.
+ *
+ * This call sleeps to guarantee that no CPU is looking at the packet
+ * offload after return.
+ */
+void dev_remove_offload(struct packet_offload *po)
+{
+ __dev_remove_offload(po);
+
+ synchronize_net();
+}
+EXPORT_SYMBOL(dev_remove_offload);
+
+
+int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
+{
+ struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
+ unsigned int offset = skb_gro_offset(skb);
+ unsigned int headlen = skb_headlen(skb);
+ unsigned int len = skb_gro_len(skb);
+ unsigned int delta_truesize;
+ unsigned int new_truesize;
+ struct sk_buff *lp;
+ int segs;
+
+ /* Do not splice page pool based packets w/ non-page pool
+ * packets. This can result in reference count issues as page
+ * pool pages will not decrement the reference count and will
+ * instead be immediately returned to the pool or have frag
+ * count decremented.
+ */
+ if (p->pp_recycle != skb->pp_recycle)
+ return -ETOOMANYREFS;
+
+ if (unlikely(p->len + len >= netif_get_gro_max_size(p->dev, p) ||
+ NAPI_GRO_CB(skb)->flush))
+ return -E2BIG;
+
+ if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
+ if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP ||
+ (p->protocol == htons(ETH_P_IPV6) &&
+ skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) ||
+ p->encapsulation)
+ return -E2BIG;
+ }
+
+ segs = NAPI_GRO_CB(skb)->count;
+ lp = NAPI_GRO_CB(p)->last;
+ pinfo = skb_shinfo(lp);
+
+ if (headlen <= offset) {
+ skb_frag_t *frag;
+ skb_frag_t *frag2;
+ int i = skbinfo->nr_frags;
+ int nr_frags = pinfo->nr_frags + i;
+
+ if (nr_frags > MAX_SKB_FRAGS)
+ goto merge;
+
+ offset -= headlen;
+ pinfo->nr_frags = nr_frags;
+ skbinfo->nr_frags = 0;
+
+ frag = pinfo->frags + nr_frags;
+ frag2 = skbinfo->frags + i;
+ do {
+ *--frag = *--frag2;
+ } while (--i);
+
+ skb_frag_off_add(frag, offset);
+ skb_frag_size_sub(frag, offset);
+
+ /* all fragments truesize : remove (head size + sk_buff) */
+ new_truesize = SKB_TRUESIZE(skb_end_offset(skb));
+ delta_truesize = skb->truesize - new_truesize;
+
+ skb->truesize = new_truesize;
+ skb->len -= skb->data_len;
+ skb->data_len = 0;
+
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
+ goto done;
+ } else if (skb->head_frag) {
+ int nr_frags = pinfo->nr_frags;
+ skb_frag_t *frag = pinfo->frags + nr_frags;
+ struct page *page = virt_to_head_page(skb->head);
+ unsigned int first_size = headlen - offset;
+ unsigned int first_offset;
+
+ if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
+ goto merge;
+
+ first_offset = skb->data -
+ (unsigned char *)page_address(page) +
+ offset;
+
+ pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
+
+ skb_frag_fill_page_desc(frag, page, first_offset, first_size);
+
+ memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
+ /* We don't need to clear skbinfo->nr_frags here */
+
+ new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff));
+ delta_truesize = skb->truesize - new_truesize;
+ skb->truesize = new_truesize;
+ NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
+ goto done;
+ }
+
+merge:
+ /* sk ownership - if any - completely transferred to the aggregated packet */
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ delta_truesize = skb->truesize;
+ if (offset > headlen) {
+ unsigned int eat = offset - headlen;
+
+ skb_frag_off_add(&skbinfo->frags[0], eat);
+ skb_frag_size_sub(&skbinfo->frags[0], eat);
+ skb->data_len -= eat;
+ skb->len -= eat;
+ offset = headlen;
+ }
+
+ __skb_pull(skb, offset);
+
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+ NAPI_GRO_CB(p)->last = skb;
+ __skb_header_release(skb);
+ lp = p;
+
+done:
+ NAPI_GRO_CB(p)->count += segs;
+ p->data_len += len;
+ p->truesize += delta_truesize;
+ p->len += len;
+ if (lp != p) {
+ lp->data_len += len;
+ lp->truesize += delta_truesize;
+ lp->len += len;
+ }
+ NAPI_GRO_CB(skb)->same_flow = 1;
+ return 0;
+}
+
+int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb)
+{
+ if (unlikely(p->len + skb->len >= 65536))
+ return -E2BIG;
+
+ if (NAPI_GRO_CB(p)->last == p)
+ skb_shinfo(p)->frag_list = skb;
+ else
+ NAPI_GRO_CB(p)->last->next = skb;
+
+ skb_pull(skb, skb_gro_offset(skb));
+
+ NAPI_GRO_CB(p)->last = skb;
+ NAPI_GRO_CB(p)->count++;
+ p->data_len += skb->len;
+
+ /* sk ownership - if any - completely transferred to the aggregated packet */
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ p->truesize += skb->truesize;
+ p->len += skb->len;
+
+ NAPI_GRO_CB(skb)->same_flow = 1;
+
+ return 0;
+}
+
+static void gro_complete(struct gro_node *gro, struct sk_buff *skb)
+{
+ struct list_head *head = &net_hotdata.offload_base;
+ struct packet_offload *ptype;
+ __be16 type = skb->protocol;
+ int err = -ENOENT;
+
+ BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
+
+ if (NAPI_GRO_CB(skb)->count == 1) {
+ skb_shinfo(skb)->gso_size = 0;
+ goto out;
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
+ continue;
+
+ err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
+ ipv6_gro_complete, inet_gro_complete,
+ skb, 0);
+ break;
+ }
+ rcu_read_unlock();
+
+ if (err) {
+ WARN_ON(&ptype->list == head);
+ kfree_skb(skb);
+ return;
+ }
+
+out:
+ gro_normal_one(gro, skb, NAPI_GRO_CB(skb)->count);
+}
+
+static void __gro_flush_chain(struct gro_node *gro, u32 index, bool flush_old)
+{
+ struct list_head *head = &gro->hash[index].list;
+ struct sk_buff *skb, *p;
+
+ list_for_each_entry_safe_reverse(skb, p, head, list) {
+ if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
+ return;
+ skb_list_del_init(skb);
+ gro_complete(gro, skb);
+ gro->hash[index].count--;
+ }
+
+ if (!gro->hash[index].count)
+ __clear_bit(index, &gro->bitmask);
+}
+
+/*
+ * gro->hash[].list contains packets ordered by age,
+ * with the youngest packets at the head of it.
+ * Complete skbs in reverse order to reduce latencies.
+ */
+void __gro_flush(struct gro_node *gro, bool flush_old)
+{
+ unsigned long bitmask = gro->bitmask;
+ unsigned int i, base = ~0U;
+
+ while ((i = ffs(bitmask)) != 0) {
+ bitmask >>= i;
+ base += i;
+ __gro_flush_chain(gro, base, flush_old);
+ }
+}
+EXPORT_SYMBOL(__gro_flush);
+
+static unsigned long gro_list_prepare_tc_ext(const struct sk_buff *skb,
+ const struct sk_buff *p,
+ unsigned long diffs)
+{
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ struct tc_skb_ext *skb_ext;
+ struct tc_skb_ext *p_ext;
+
+ skb_ext = skb_ext_find(skb, TC_SKB_EXT);
+ p_ext = skb_ext_find(p, TC_SKB_EXT);
+
+ diffs |= (!!p_ext) ^ (!!skb_ext);
+ if (!diffs && unlikely(skb_ext))
+ diffs |= p_ext->chain ^ skb_ext->chain;
+#endif
+ return diffs;
+}
+
+static void gro_list_prepare(const struct list_head *head,
+ const struct sk_buff *skb)
+{
+ unsigned int maclen = skb->dev->hard_header_len;
+ u32 hash = skb_get_hash_raw(skb);
+ struct sk_buff *p;
+
+ list_for_each_entry(p, head, list) {
+ unsigned long diffs;
+
+ if (hash != skb_get_hash_raw(p)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+ diffs |= p->vlan_all ^ skb->vlan_all;
+ diffs |= skb_metadata_differs(p, skb);
+ if (maclen == ETH_HLEN)
+ diffs |= compare_ether_header(skb_mac_header(p),
+ skb_mac_header(skb));
+ else if (!diffs)
+ diffs = memcmp(skb_mac_header(p),
+ skb_mac_header(skb),
+ maclen);
+
+ /* In most common scenarios 'slow_gro' is 0;
+ * otherwise we are already on some slower paths.
+ * Either skip all the infrequent tests altogether or
+ * avoid trying too hard to skip each of them individually.
+ */
+ if (!diffs && unlikely(skb->slow_gro | p->slow_gro)) {
+ diffs |= p->sk != skb->sk;
+ diffs |= skb_metadata_dst_cmp(p, skb);
+ diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
+
+ diffs |= gro_list_prepare_tc_ext(skb, p, diffs);
+ diffs |= __psp_skb_coalesce_diff(skb, p, diffs);
+ }
+
+ NAPI_GRO_CB(p)->same_flow = !diffs;
+ }
+}
+
+static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff)
+{
+ const struct skb_shared_info *pinfo;
+ const skb_frag_t *frag0;
+ unsigned int headlen;
+
+ NAPI_GRO_CB(skb)->network_offset = 0;
+ NAPI_GRO_CB(skb)->data_offset = 0;
+ headlen = skb_headlen(skb);
+ NAPI_GRO_CB(skb)->frag0 = skb->data;
+ NAPI_GRO_CB(skb)->frag0_len = headlen;
+ if (headlen)
+ return;
+
+ pinfo = skb_shinfo(skb);
+ frag0 = &pinfo->frags[0];
+
+ if (pinfo->nr_frags && skb_frag_page(frag0) &&
+ !PageHighMem(skb_frag_page(frag0)) &&
+ (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) {
+ NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
+ NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
+ skb_frag_size(frag0),
+ skb->end - skb->tail);
+ }
+}
+
+static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
+{
+ struct skb_shared_info *pinfo = skb_shinfo(skb);
+
+ BUG_ON(skb->end - skb->tail < grow);
+
+ memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
+
+ skb->data_len -= grow;
+ skb->tail += grow;
+
+ skb_frag_off_add(&pinfo->frags[0], grow);
+ skb_frag_size_sub(&pinfo->frags[0], grow);
+
+ if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
+ skb_frag_unref(skb, 0);
+ memmove(pinfo->frags, pinfo->frags + 1,
+ --pinfo->nr_frags * sizeof(pinfo->frags[0]));
+ }
+}
+
+static void gro_try_pull_from_frag0(struct sk_buff *skb)
+{
+ int grow = skb_gro_offset(skb) - skb_headlen(skb);
+
+ if (grow > 0)
+ gro_pull_from_frag0(skb, grow);
+}
+
+static void gro_flush_oldest(struct gro_node *gro, struct list_head *head)
+{
+ struct sk_buff *oldest;
+
+ oldest = list_last_entry(head, struct sk_buff, list);
+
+ /* We are called with head length >= MAX_GRO_SKBS, so this is
+ * impossible.
+ */
+ if (WARN_ON_ONCE(!oldest))
+ return;
+
+ /* Do not adjust gro->hash[].count, the caller is adding a new
+ * SKB to the chain.
+ */
+ skb_list_del_init(oldest);
+ gro_complete(gro, oldest);
+}
+
+static enum gro_result dev_gro_receive(struct gro_node *gro,
+ struct sk_buff *skb)
+{
+ u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
+ struct list_head *head = &net_hotdata.offload_base;
+ struct gro_list *gro_list = &gro->hash[bucket];
+ struct packet_offload *ptype;
+ __be16 type = skb->protocol;
+ struct sk_buff *pp = NULL;
+ enum gro_result ret;
+ int same_flow;
+
+ if (netif_elide_gro(skb->dev))
+ goto normal;
+
+ gro_list_prepare(&gro_list->list, skb);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, head, list) {
+ if (ptype->type == type && ptype->callbacks.gro_receive)
+ goto found_ptype;
+ }
+ rcu_read_unlock();
+ goto normal;
+
+found_ptype:
+ skb_set_network_header(skb, skb_gro_offset(skb));
+ skb_reset_mac_len(skb);
+ BUILD_BUG_ON(sizeof_field(struct napi_gro_cb, zeroed) != sizeof(u32));
+ BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed),
+ sizeof(u32))); /* Avoid slow unaligned acc */
+ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0;
+ NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb);
+ NAPI_GRO_CB(skb)->count = 1;
+ if (unlikely(skb_is_gso(skb))) {
+ NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs;
+ /* Only support TCP and non DODGY users. */
+ if (!skb_is_gso_tcp(skb) ||
+ (skb_shinfo(skb)->gso_type & SKB_GSO_DODGY))
+ NAPI_GRO_CB(skb)->flush = 1;
+ }
+
+ /* Setup for GRO checksum validation */
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ NAPI_GRO_CB(skb)->csum = skb->csum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+ break;
+ case CHECKSUM_UNNECESSARY:
+ NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
+ break;
+ }
+
+ pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
+ ipv6_gro_receive, inet_gro_receive,
+ &gro_list->list, skb);
+
+ rcu_read_unlock();
+
+ if (PTR_ERR(pp) == -EINPROGRESS) {
+ ret = GRO_CONSUMED;
+ goto ok;
+ }
+
+ same_flow = NAPI_GRO_CB(skb)->same_flow;
+ ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
+
+ if (pp) {
+ skb_list_del_init(pp);
+ gro_complete(gro, pp);
+ gro_list->count--;
+ }
+
+ if (same_flow)
+ goto ok;
+
+ if (NAPI_GRO_CB(skb)->flush)
+ goto normal;
+
+ if (unlikely(gro_list->count >= MAX_GRO_SKBS))
+ gro_flush_oldest(gro, &gro_list->list);
+ else
+ gro_list->count++;
+
+ /* Must be called before setting NAPI_GRO_CB(skb)->{age|last} */
+ gro_try_pull_from_frag0(skb);
+ NAPI_GRO_CB(skb)->age = jiffies;
+ NAPI_GRO_CB(skb)->last = skb;
+ if (!skb_is_gso(skb))
+ skb_shinfo(skb)->gso_size = skb_gro_len(skb);
+ list_add(&skb->list, &gro_list->list);
+ ret = GRO_HELD;
+ok:
+ if (gro_list->count) {
+ if (!test_bit(bucket, &gro->bitmask))
+ __set_bit(bucket, &gro->bitmask);
+ } else if (test_bit(bucket, &gro->bitmask)) {
+ __clear_bit(bucket, &gro->bitmask);
+ }
+
+ return ret;
+
+normal:
+ ret = GRO_NORMAL;
+ gro_try_pull_from_frag0(skb);
+ goto ok;
+}
+
+struct packet_offload *gro_find_receive_by_type(__be16 type)
+{
+ struct list_head *offload_head = &net_hotdata.offload_base;
+ struct packet_offload *ptype;
+
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_receive)
+ continue;
+ return ptype;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_receive_by_type);
+
+struct packet_offload *gro_find_complete_by_type(__be16 type)
+{
+ struct list_head *offload_head = &net_hotdata.offload_base;
+ struct packet_offload *ptype;
+
+ list_for_each_entry_rcu(ptype, offload_head, list) {
+ if (ptype->type != type || !ptype->callbacks.gro_complete)
+ continue;
+ return ptype;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL(gro_find_complete_by_type);
+
+static gro_result_t gro_skb_finish(struct gro_node *gro, struct sk_buff *skb,
+ gro_result_t ret)
+{
+ switch (ret) {
+ case GRO_NORMAL:
+ gro_normal_one(gro, skb, 1);
+ break;
+
+ case GRO_MERGED_FREE:
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ napi_skb_free_stolen_head(skb);
+ else if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
+ __kfree_skb(skb);
+ else
+ __napi_kfree_skb(skb, SKB_CONSUMED);
+ break;
+
+ case GRO_HELD:
+ case GRO_MERGED:
+ case GRO_CONSUMED:
+ break;
+ }
+
+ return ret;
+}
+
+gro_result_t gro_receive_skb(struct gro_node *gro, struct sk_buff *skb)
+{
+ gro_result_t ret;
+
+ __skb_mark_napi_id(skb, gro);
+ trace_napi_gro_receive_entry(skb);
+
+ skb_gro_reset_offset(skb, 0);
+
+ ret = gro_skb_finish(gro, skb, dev_gro_receive(gro, skb));
+ trace_napi_gro_receive_exit(ret);
+
+ return ret;
+}
+EXPORT_SYMBOL(gro_receive_skb);
+
+static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
+{
+ struct skb_shared_info *shinfo;
+
+ if (unlikely(skb->pfmemalloc)) {
+ consume_skb(skb);
+ return;
+ }
+ __skb_pull(skb, skb_headlen(skb));
+ /* restore the reserve we had after netdev_alloc_skb_ip_align() */
+ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
+ __vlan_hwaccel_clear_tag(skb);
+ skb->dev = napi->dev;
+ skb->skb_iif = 0;
+
+ /* eth_type_trans() assumes pkt_type is PACKET_HOST */
+ skb->pkt_type = PACKET_HOST;
+
+ skb->encapsulation = 0;
+ skb->ip_summed = CHECKSUM_NONE;
+
+ shinfo = skb_shinfo(skb);
+ shinfo->gso_type = 0;
+ shinfo->gso_size = 0;
+ shinfo->hwtstamps.hwtstamp = 0;
+
+ if (unlikely(skb->slow_gro)) {
+ skb_orphan(skb);
+ skb_ext_reset(skb);
+ nf_reset_ct(skb);
+ skb->slow_gro = 0;
+ }
+
+ napi->skb = skb;
+}
+
+struct sk_buff *napi_get_frags(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi->skb;
+
+ if (!skb) {
+ skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
+ if (skb) {
+ napi->skb = skb;
+ skb_mark_napi_id(skb, napi);
+ }
+ }
+ return skb;
+}
+EXPORT_SYMBOL(napi_get_frags);
+
+static gro_result_t napi_frags_finish(struct napi_struct *napi,
+ struct sk_buff *skb,
+ gro_result_t ret)
+{
+ switch (ret) {
+ case GRO_NORMAL:
+ case GRO_HELD:
+ __skb_push(skb, ETH_HLEN);
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ if (ret == GRO_NORMAL)
+ gro_normal_one(&napi->gro, skb, 1);
+ break;
+
+ case GRO_MERGED_FREE:
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ napi_skb_free_stolen_head(skb);
+ else
+ napi_reuse_skb(napi, skb);
+ break;
+
+ case GRO_MERGED:
+ case GRO_CONSUMED:
+ break;
+ }
+
+ return ret;
+}
+
+/* Upper GRO stack assumes network header starts at gro_offset=0.
+ * Drivers could call both napi_gro_frags() and napi_gro_receive().
+ * We copy the ethernet header into skb->data to have a common layout.
+ */
+static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
+{
+ struct sk_buff *skb = napi->skb;
+ const struct ethhdr *eth;
+ unsigned int hlen = sizeof(*eth);
+
+ napi->skb = NULL;
+
+ skb_reset_mac_header(skb);
+ skb_gro_reset_offset(skb, hlen);
+
+ if (unlikely(!skb_gro_may_pull(skb, hlen))) {
+ eth = skb_gro_header_slow(skb, hlen, 0);
+ if (unlikely(!eth)) {
+ net_warn_ratelimited("%s: dropping impossible skb from %s\n",
+ __func__, napi->dev->name);
+ napi_reuse_skb(napi, skb);
+ return NULL;
+ }
+ } else {
+ eth = (const struct ethhdr *)skb->data;
+
+ if (NAPI_GRO_CB(skb)->frag0 != skb->data)
+ gro_pull_from_frag0(skb, hlen);
+
+ NAPI_GRO_CB(skb)->frag0 += hlen;
+ NAPI_GRO_CB(skb)->frag0_len -= hlen;
+ }
+ __skb_pull(skb, hlen);
+
+ /*
+ * This works because the only protocols we care about don't require
+ * special handling.
+ * We'll fix it up properly in napi_frags_finish()
+ */
+ skb->protocol = eth->h_proto;
+
+ return skb;
+}
+
+gro_result_t napi_gro_frags(struct napi_struct *napi)
+{
+ gro_result_t ret;
+ struct sk_buff *skb = napi_frags_skb(napi);
+
+ trace_napi_gro_frags_entry(skb);
+
+ ret = napi_frags_finish(napi, skb, dev_gro_receive(&napi->gro, skb));
+ trace_napi_gro_frags_exit(ret);
+
+ return ret;
+}
+EXPORT_SYMBOL(napi_gro_frags);
+
+/* Compute the checksum from gro_offset and return the folded value
+ * after adding in any pseudo checksum.
+ */
+__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
+{
+ __wsum wsum;
+ __sum16 sum;
+
+ wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
+
+ /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
+ sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
+ /* See comments in __skb_checksum_complete(). */
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev, skb);
+ }
+
+ NAPI_GRO_CB(skb)->csum = wsum;
+ NAPI_GRO_CB(skb)->csum_valid = 1;
+
+ return sum;
+}
+EXPORT_SYMBOL(__skb_gro_checksum_complete);
+
+void gro_init(struct gro_node *gro)
+{
+ for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) {
+ INIT_LIST_HEAD(&gro->hash[i].list);
+ gro->hash[i].count = 0;
+ }
+
+ gro->bitmask = 0;
+ gro->cached_napi_id = 0;
+
+ INIT_LIST_HEAD(&gro->rx_list);
+ gro->rx_count = 0;
+}
+
+void gro_cleanup(struct gro_node *gro)
+{
+ struct sk_buff *skb, *n;
+
+ for (u32 i = 0; i < GRO_HASH_BUCKETS; i++) {
+ list_for_each_entry_safe(skb, n, &gro->hash[i].list, list)
+ kfree_skb(skb);
+
+ gro->hash[i].count = 0;
+ }
+
+ gro->bitmask = 0;
+ gro->cached_napi_id = 0;
+
+ list_for_each_entry_safe(skb, n, &gro->rx_list, list)
+ kfree_skb(skb);
+
+ gro->rx_count = 0;
+}
diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c
index 4b54e5f107c6..a725d21159a6 100644
--- a/net/core/gro_cells.c
+++ b/net/core/gro_cells.c
@@ -3,32 +3,53 @@
#include <linux/slab.h>
#include <linux/netdevice.h>
#include <net/gro_cells.h>
+#include <net/hotdata.h>
struct gro_cell {
struct sk_buff_head napi_skbs;
struct napi_struct napi;
+ local_lock_t bh_lock;
};
int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
+ bool have_bh_lock = false;
struct gro_cell *cell;
+ int res;
- if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev))
- return netif_rx(skb);
+ rcu_read_lock();
+ if (unlikely(!(dev->flags & IFF_UP)))
+ goto drop;
+ if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) {
+ res = netif_rx(skb);
+ goto unlock;
+ }
+
+ local_lock_nested_bh(&gcells->cells->bh_lock);
+ have_bh_lock = true;
cell = this_cpu_ptr(gcells->cells);
- if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) {
- atomic_long_inc(&dev->rx_dropped);
+ if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(net_hotdata.max_backlog)) {
+drop:
+ dev_core_stats_rx_dropped_inc(dev);
kfree_skb(skb);
- return NET_RX_DROP;
+ res = NET_RX_DROP;
+ goto unlock;
}
__skb_queue_tail(&cell->napi_skbs, skb);
if (skb_queue_len(&cell->napi_skbs) == 1)
napi_schedule(&cell->napi);
- return NET_RX_SUCCESS;
+
+ res = NET_RX_SUCCESS;
+
+unlock:
+ if (have_bh_lock)
+ local_unlock_nested_bh(&gcells->cells->bh_lock);
+ rcu_read_unlock();
+ return res;
}
EXPORT_SYMBOL(gro_cells_receive);
@@ -40,7 +61,9 @@ static int gro_cell_poll(struct napi_struct *napi, int budget)
int work_done = 0;
while (work_done < budget) {
+ __local_lock_nested_bh(&cell->bh_lock);
skb = __skb_dequeue(&cell->napi_skbs);
+ __local_unlock_nested_bh(&cell->bh_lock);
if (!skb)
break;
napi_gro_receive(napi, skb);
@@ -64,19 +87,34 @@ int gro_cells_init(struct gro_cells *gcells, struct net_device *dev)
struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
__skb_queue_head_init(&cell->napi_skbs);
+ local_lock_init(&cell->bh_lock);
set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state);
- netif_napi_add(dev, &cell->napi, gro_cell_poll,
- NAPI_POLL_WEIGHT);
+ netif_napi_add(dev, &cell->napi, gro_cell_poll);
napi_enable(&cell->napi);
}
return 0;
}
EXPORT_SYMBOL(gro_cells_init);
+struct percpu_free_defer {
+ struct rcu_head rcu;
+ void __percpu *ptr;
+};
+
+static void percpu_free_defer_callback(struct rcu_head *head)
+{
+ struct percpu_free_defer *defer;
+
+ defer = container_of(head, struct percpu_free_defer, rcu);
+ free_percpu(defer->ptr);
+ kfree(defer);
+}
+
void gro_cells_destroy(struct gro_cells *gcells)
{
+ struct percpu_free_defer *defer;
int i;
if (!gcells->cells)
@@ -84,10 +122,27 @@ void gro_cells_destroy(struct gro_cells *gcells)
for_each_possible_cpu(i) {
struct gro_cell *cell = per_cpu_ptr(gcells->cells, i);
- netif_napi_del(&cell->napi);
+ napi_disable(&cell->napi);
+ __netif_napi_del(&cell->napi);
__skb_queue_purge(&cell->napi_skbs);
}
- free_percpu(gcells->cells);
+ /* We need to observe an rcu grace period before freeing ->cells,
+ * because netpoll could access dev->napi_list under rcu protection.
+ * Try hard using call_rcu() instead of synchronize_rcu(),
+ * because we might be called from cleanup_net(), and we
+ * definitely do not want to block this critical task.
+ */
+ defer = kmalloc(sizeof(*defer), GFP_KERNEL | __GFP_NOWARN);
+ if (likely(defer)) {
+ defer->ptr = gcells->cells;
+ call_rcu(&defer->rcu, percpu_free_defer_callback);
+ } else {
+ /* We do not hold RTNL at this point, synchronize_net()
+ * would not be able to expedite this sync.
+ */
+ synchronize_rcu_expedited();
+ free_percpu(gcells->cells);
+ }
gcells->cells = NULL;
}
EXPORT_SYMBOL(gro_cells_destroy);
diff --git a/net/core/gso.c b/net/core/gso.c
new file mode 100644
index 000000000000..bcd156372f4d
--- /dev/null
+++ b/net/core/gso.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/skbuff.h>
+#include <linux/sctp.h>
+#include <net/gso.h>
+#include <net/gro.h>
+
+/**
+ * skb_eth_gso_segment - segmentation handler for ethernet protocols.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ * @type: Ethernet Protocol ID
+ */
+struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb,
+ netdev_features_t features, __be16 type)
+{
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_offload *ptype;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) {
+ if (ptype->type == type && ptype->callbacks.gso_segment) {
+ segs = ptype->callbacks.gso_segment(skb, features);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return segs;
+}
+EXPORT_SYMBOL(skb_eth_gso_segment);
+
+/**
+ * skb_mac_gso_segment - mac layer segmentation handler.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ */
+struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
+ struct packet_offload *ptype;
+ int vlan_depth = skb->mac_len;
+ __be16 type = skb_network_protocol(skb, &vlan_depth);
+
+ if (unlikely(!type))
+ return ERR_PTR(-EINVAL);
+
+ __skb_pull(skb, vlan_depth);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) {
+ if (ptype->type == type && ptype->callbacks.gso_segment) {
+ segs = ptype->callbacks.gso_segment(skb, features);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ __skb_push(skb, skb->data - skb_mac_header(skb));
+
+ return segs;
+}
+EXPORT_SYMBOL(skb_mac_gso_segment);
+/* openvswitch calls this on rx path, so we need a different check.
+ */
+static bool skb_needs_check(const struct sk_buff *skb, bool tx_path)
+{
+ if (tx_path)
+ return skb->ip_summed != CHECKSUM_PARTIAL &&
+ skb->ip_summed != CHECKSUM_UNNECESSARY;
+
+ return skb->ip_summed == CHECKSUM_NONE;
+}
+
+/**
+ * __skb_gso_segment - Perform segmentation on skb.
+ * @skb: buffer to segment
+ * @features: features for the output path (see dev->features)
+ * @tx_path: whether it is called in TX path
+ *
+ * This function segments the given skb and returns a list of segments.
+ *
+ * It may return NULL if the skb requires no segmentation. This is
+ * only possible when GSO is used for verifying header integrity.
+ *
+ * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
+ */
+struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
+ netdev_features_t features, bool tx_path)
+{
+ struct sk_buff *segs;
+
+ if (unlikely(skb_needs_check(skb, tx_path))) {
+ int err;
+
+ /* We're going to init ->check field in TCP or UDP header */
+ err = skb_cow_head(skb, 0);
+ if (err < 0)
+ return ERR_PTR(err);
+ }
+
+ /* Only report GSO partial support if it will enable us to
+ * support segmentation on this frame without needing additional
+ * work.
+ */
+ if (features & NETIF_F_GSO_PARTIAL) {
+ netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
+ struct net_device *dev = skb->dev;
+
+ partial_features |= dev->features & dev->gso_partial_features;
+ if (!skb_gso_ok(skb, features | partial_features))
+ features &= ~NETIF_F_GSO_PARTIAL;
+ }
+
+ BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
+ sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
+
+ SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
+ SKB_GSO_CB(skb)->encap_level = 0;
+
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ segs = skb_mac_gso_segment(skb, features);
+
+ if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
+ skb_warn_bad_offload(skb);
+
+ return segs;
+}
+EXPORT_SYMBOL(__skb_gso_segment);
+
+/**
+ * skb_gso_transport_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_transport_seglen is used to determine the real size of the
+ * individual segments, including Layer4 headers (TCP/UDP).
+ *
+ * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
+ */
+static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
+{
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ unsigned int thlen = 0;
+
+ if (skb->encapsulation) {
+ thlen = skb_inner_transport_header(skb) -
+ skb_transport_header(skb);
+
+ if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+ thlen += inner_tcp_hdrlen(skb);
+ } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
+ thlen = tcp_hdrlen(skb);
+ } else if (unlikely(skb_is_gso_sctp(skb))) {
+ thlen = sizeof(struct sctphdr);
+ } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
+ thlen = sizeof(struct udphdr);
+ }
+ /* UFO sets gso_size to the size of the fragmentation
+ * payload, i.e. the size of the L4 (UDP) header is already
+ * accounted for.
+ */
+ return thlen + shinfo->gso_size;
+}
+
+/**
+ * skb_gso_network_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_network_seglen is used to determine the real size of the
+ * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
+ *
+ * The MAC/L2 header is not accounted for.
+ */
+static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
+{
+ unsigned int hdr_len = skb_transport_header(skb) -
+ skb_network_header(skb);
+
+ return hdr_len + skb_gso_transport_seglen(skb);
+}
+
+/**
+ * skb_gso_mac_seglen - Return length of individual segments of a gso packet
+ *
+ * @skb: GSO skb
+ *
+ * skb_gso_mac_seglen is used to determine the real size of the
+ * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
+ * headers (TCP/UDP).
+ */
+static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
+{
+ unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
+
+ return hdr_len + skb_gso_transport_seglen(skb);
+}
+
+/**
+ * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
+ *
+ * There are a couple of instances where we have a GSO skb, and we
+ * want to determine what size it would be after it is segmented.
+ *
+ * We might want to check:
+ * - L3+L4+payload size (e.g. IP forwarding)
+ * - L2+L3+L4+payload size (e.g. sanity check before passing to driver)
+ *
+ * This is a helper to do that correctly considering GSO_BY_FRAGS.
+ *
+ * @skb: GSO skb
+ *
+ * @seg_len: The segmented length (from skb_gso_*_seglen). In the
+ * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
+ *
+ * @max_len: The maximum permissible length.
+ *
+ * Returns true if the segmented length <= max length.
+ */
+static inline bool skb_gso_size_check(const struct sk_buff *skb,
+ unsigned int seg_len,
+ unsigned int max_len) {
+ const struct skb_shared_info *shinfo = skb_shinfo(skb);
+ const struct sk_buff *iter;
+
+ if (shinfo->gso_size != GSO_BY_FRAGS)
+ return seg_len <= max_len;
+
+ /* Undo this so we can re-use header sizes */
+ seg_len -= GSO_BY_FRAGS;
+
+ skb_walk_frags(skb, iter) {
+ if (seg_len + skb_headlen(iter) > max_len)
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
+ *
+ * @skb: GSO skb
+ * @mtu: MTU to validate against
+ *
+ * skb_gso_validate_network_len validates if a given skb will fit a
+ * wanted MTU once split. It considers L3 headers, L4 headers, and the
+ * payload.
+ */
+bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)
+{
+ return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu);
+}
+EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);
+
+/**
+ * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
+ *
+ * @skb: GSO skb
+ * @len: length to validate against
+ *
+ * skb_gso_validate_mac_len validates if a given skb will fit a wanted
+ * length once split, including L2, L3 and L4 headers and the payload.
+ */
+bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len)
+{
+ return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len);
+}
+EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);
+
diff --git a/net/core/hotdata.c b/net/core/hotdata.c
new file mode 100644
index 000000000000..dddd5c287cf0
--- /dev/null
+++ b/net/core/hotdata.c
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/cache.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+#include <net/aligned_data.h>
+#include <net/hotdata.h>
+#include <net/ip.h>
+#include <net/proto_memory.h>
+
+struct net_hotdata net_hotdata __cacheline_aligned = {
+ .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base),
+ .gro_normal_batch = 8,
+
+ .netdev_budget = 300,
+ /* Must be at least 2 jiffies to guarantee 1 jiffy timeout */
+ .netdev_budget_usecs = 2 * USEC_PER_SEC / HZ,
+
+ .tstamp_prequeue = 1,
+ .max_backlog = 1000,
+ .dev_tx_weight = 64,
+ .dev_rx_weight = 64,
+ .sysctl_max_skb_frags = MAX_SKB_FRAGS,
+ .sysctl_skb_defer_max = 128,
+ .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE
+};
+EXPORT_SYMBOL(net_hotdata);
+
+struct net_aligned_data net_aligned_data;
+EXPORT_IPV6_MOD(net_aligned_data);
diff --git a/net/core/hwbm.c b/net/core/hwbm.c
index 2cab489ae62e..ac1a66df9adc 100644
--- a/net/core/hwbm.c
+++ b/net/core/hwbm.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Support for hardware buffer manager.
*
* Copyright (C) 2016 Marvell
*
* Gregory CLEMENT <gregory.clement@free-electrons.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/kernel.h>
#include <linux/printk.h>
@@ -47,34 +43,33 @@ int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(hwbm_pool_refill);
-int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num)
{
int err, i;
- unsigned long flags;
- spin_lock_irqsave(&bm_pool->lock, flags);
+ mutex_lock(&bm_pool->buf_lock);
if (bm_pool->buf_num == bm_pool->size) {
pr_warn("pool already filled\n");
- spin_unlock_irqrestore(&bm_pool->lock, flags);
+ mutex_unlock(&bm_pool->buf_lock);
return bm_pool->buf_num;
}
if (buf_num + bm_pool->buf_num > bm_pool->size) {
pr_warn("cannot allocate %d buffers for pool\n",
buf_num);
- spin_unlock_irqrestore(&bm_pool->lock, flags);
+ mutex_unlock(&bm_pool->buf_lock);
return 0;
}
if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
pr_warn("Adding %d buffers to the %d current buffers will overflow\n",
buf_num, bm_pool->buf_num);
- spin_unlock_irqrestore(&bm_pool->lock, flags);
+ mutex_unlock(&bm_pool->buf_lock);
return 0;
}
for (i = 0; i < buf_num; i++) {
- err = hwbm_pool_refill(bm_pool, gfp);
+ err = hwbm_pool_refill(bm_pool, GFP_KERNEL);
if (err < 0)
break;
}
@@ -83,7 +78,7 @@ int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
bm_pool->buf_num += i;
pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num);
- spin_unlock_irqrestore(&bm_pool->lock, flags);
+ mutex_unlock(&bm_pool->buf_lock);
return i;
}
diff --git a/net/core/ieee8021q_helpers.c b/net/core/ieee8021q_helpers.c
new file mode 100644
index 000000000000..669b357b73b2
--- /dev/null
+++ b/net/core/ieee8021q_helpers.c
@@ -0,0 +1,224 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2024 Pengutronix, Oleksij Rempel <kernel@pengutronix.de>
+
+#include <linux/array_size.h>
+#include <linux/printk.h>
+#include <linux/types.h>
+#include <net/dscp.h>
+#include <net/ieee8021q.h>
+
+/* verify that table covers all 8 traffic types */
+#define TT_MAP_SIZE_OK(tbl) \
+ compiletime_assert(ARRAY_SIZE(tbl) == IEEE8021Q_TT_MAX, \
+ #tbl " size mismatch")
+
+/* The following arrays map Traffic Types (TT) to traffic classes (TC) for
+ * different number of queues as shown in the example provided by
+ * IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic class mapping" and
+ * Table I-1 "Traffic type to traffic class mapping".
+ */
+static const u8 ieee8021q_8queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0,
+ [IEEE8021Q_TT_BE] = 1,
+ [IEEE8021Q_TT_EE] = 2,
+ [IEEE8021Q_TT_CA] = 3,
+ [IEEE8021Q_TT_VI] = 4,
+ [IEEE8021Q_TT_VO] = 5,
+ [IEEE8021Q_TT_IC] = 6,
+ [IEEE8021Q_TT_NC] = 7,
+};
+
+static const u8 ieee8021q_7queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0,
+ [IEEE8021Q_TT_BE] = 1,
+ [IEEE8021Q_TT_EE] = 2,
+ [IEEE8021Q_TT_CA] = 3,
+ [IEEE8021Q_TT_VI] = 4, [IEEE8021Q_TT_VO] = 4,
+ [IEEE8021Q_TT_IC] = 5,
+ [IEEE8021Q_TT_NC] = 6,
+};
+
+static const u8 ieee8021q_6queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0,
+ [IEEE8021Q_TT_BE] = 1,
+ [IEEE8021Q_TT_EE] = 2, [IEEE8021Q_TT_CA] = 2,
+ [IEEE8021Q_TT_VI] = 3, [IEEE8021Q_TT_VO] = 3,
+ [IEEE8021Q_TT_IC] = 4,
+ [IEEE8021Q_TT_NC] = 5,
+};
+
+static const u8 ieee8021q_5queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1,
+ [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2,
+ [IEEE8021Q_TT_IC] = 3,
+ [IEEE8021Q_TT_NC] = 4,
+};
+
+static const u8 ieee8021q_4queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 1, [IEEE8021Q_TT_CA] = 1,
+ [IEEE8021Q_TT_VI] = 2, [IEEE8021Q_TT_VO] = 2,
+ [IEEE8021Q_TT_IC] = 3, [IEEE8021Q_TT_NC] = 3,
+};
+
+static const u8 ieee8021q_3queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0,
+ [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1,
+ [IEEE8021Q_TT_IC] = 2, [IEEE8021Q_TT_NC] = 2,
+};
+
+static const u8 ieee8021q_2queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0,
+ [IEEE8021Q_TT_VI] = 1, [IEEE8021Q_TT_VO] = 1,
+ [IEEE8021Q_TT_IC] = 1, [IEEE8021Q_TT_NC] = 1,
+};
+
+static const u8 ieee8021q_1queue_tt_tc_map[] = {
+ [IEEE8021Q_TT_BK] = 0, [IEEE8021Q_TT_BE] = 0,
+ [IEEE8021Q_TT_EE] = 0, [IEEE8021Q_TT_CA] = 0,
+ [IEEE8021Q_TT_VI] = 0, [IEEE8021Q_TT_VO] = 0,
+ [IEEE8021Q_TT_IC] = 0, [IEEE8021Q_TT_NC] = 0,
+};
+
+/**
+ * ieee8021q_tt_to_tc - Map IEEE 802.1Q Traffic Type to Traffic Class
+ * @tt: IEEE 802.1Q Traffic Type
+ * @num_queues: Number of queues
+ *
+ * This function maps an IEEE 802.1Q Traffic Type to a Traffic Class (TC) based
+ * on the number of queues configured on the NIC. The mapping is based on the
+ * example provided by IEEE 802.1Q-2022 in Annex I "I.3 Traffic type to traffic
+ * class mapping" and Table I-1 "Traffic type to traffic class mapping".
+ *
+ * Return: Traffic Class corresponding to the given Traffic Type or negative
+ * value in case of error.
+ */
+int ieee8021q_tt_to_tc(enum ieee8021q_traffic_type tt, unsigned int num_queues)
+{
+ if (tt < 0 || tt >= IEEE8021Q_TT_MAX) {
+ pr_err("Requested Traffic Type (%d) is out of range (%d)\n", tt,
+ IEEE8021Q_TT_MAX);
+ return -EINVAL;
+ }
+
+ switch (num_queues) {
+ case 8:
+ TT_MAP_SIZE_OK(ieee8021q_8queue_tt_tc_map);
+ return ieee8021q_8queue_tt_tc_map[tt];
+ case 7:
+ TT_MAP_SIZE_OK(ieee8021q_7queue_tt_tc_map);
+ return ieee8021q_7queue_tt_tc_map[tt];
+ case 6:
+ TT_MAP_SIZE_OK(ieee8021q_6queue_tt_tc_map);
+ return ieee8021q_6queue_tt_tc_map[tt];
+ case 5:
+ TT_MAP_SIZE_OK(ieee8021q_5queue_tt_tc_map);
+ return ieee8021q_5queue_tt_tc_map[tt];
+ case 4:
+ TT_MAP_SIZE_OK(ieee8021q_4queue_tt_tc_map);
+ return ieee8021q_4queue_tt_tc_map[tt];
+ case 3:
+ TT_MAP_SIZE_OK(ieee8021q_3queue_tt_tc_map);
+ return ieee8021q_3queue_tt_tc_map[tt];
+ case 2:
+ TT_MAP_SIZE_OK(ieee8021q_2queue_tt_tc_map);
+ return ieee8021q_2queue_tt_tc_map[tt];
+ case 1:
+ TT_MAP_SIZE_OK(ieee8021q_1queue_tt_tc_map);
+ return ieee8021q_1queue_tt_tc_map[tt];
+ }
+
+ pr_err("Invalid number of queues %d\n", num_queues);
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ieee8021q_tt_to_tc);
+
+/**
+ * ietf_dscp_to_ieee8021q_tt - Map IETF DSCP to IEEE 802.1Q Traffic Type
+ * @dscp: IETF DSCP value
+ *
+ * This function maps an IETF DSCP value to an IEEE 802.1Q Traffic Type (TT).
+ * Since there is no corresponding mapping between DSCP and IEEE 802.1Q Traffic
+ * Type, this function is inspired by the RFC8325 documentation which describes
+ * the mapping between DSCP and 802.11 User Priority (UP) values.
+ *
+ * Return: IEEE 802.1Q Traffic Type corresponding to the given DSCP value
+ */
+int ietf_dscp_to_ieee8021q_tt(u8 dscp)
+{
+ switch (dscp) {
+ case DSCP_CS0:
+ /* Comment from RFC8325:
+ * [RFC4594], Section 4.8, recommends High-Throughput Data be marked
+ * AF1x (that is, AF11, AF12, and AF13, according to the rules defined
+ * in [RFC2475]).
+ *
+ * By default (as described in Section 2.3), High-Throughput Data will
+ * map to UP 1 and, thus, to the Background Access Category (AC_BK),
+ * which is contrary to the intent expressed in [RFC4594].
+ *
+ * Unfortunately, there really is no corresponding fit for the High-
+ * Throughput Data service class within the constrained 4 Access
+ * Category [IEEE.802.11-2016] model. If the High-Throughput Data
+ * service class is assigned to the Best Effort Access Category (AC_BE),
+ * then it would contend with Low-Latency Data (while [RFC4594]
+ * recommends a distinction in servicing between these service classes)
+ * as well as with the default service class; alternatively, if it is
+ * assigned to the Background Access Category (AC_BK), then it would
+ * receive a less-than-best-effort service and contend with Low-Priority
+ * Data (as discussed in Section 4.2.10).
+ *
+ * As such, since there is no directly corresponding fit for the High-
+ * Throughput Data service class within the [IEEE.802.11-2016] model, it
+ * is generally RECOMMENDED to map High-Throughput Data to UP 0, thereby
+ * admitting it to the Best Effort Access Category (AC_BE).
+ *
+ * Note: The above text is from RFC8325 which is describing the mapping
+ * between DSCP and 802.11 User Priority (UP) values. The mapping
+ * between UP and IEEE 802.1Q Traffic Type is not defined in the RFC but
+ * the 802.11 AC_BK and AC_BE are closely related to the IEEE 802.1Q
+ * Traffic Types BK and BE.
+ */
+ case DSCP_AF11:
+ case DSCP_AF12:
+ case DSCP_AF13:
+ return IEEE8021Q_TT_BE;
+ /* Comment from RFC8325:
+ * RFC3662 and RFC4594 both recommend Low-Priority Data be marked
+ * with DSCP CS1. The Low-Priority Data service class loosely
+ * corresponds to the [IEEE.802.11-2016] Background Access Category
+ */
+ case DSCP_CS1:
+ return IEEE8021Q_TT_BK;
+ case DSCP_CS2:
+ case DSCP_AF21:
+ case DSCP_AF22:
+ case DSCP_AF23:
+ return IEEE8021Q_TT_EE;
+ case DSCP_CS3:
+ case DSCP_AF31:
+ case DSCP_AF32:
+ case DSCP_AF33:
+ return IEEE8021Q_TT_CA;
+ case DSCP_CS4:
+ case DSCP_AF41:
+ case DSCP_AF42:
+ case DSCP_AF43:
+ return IEEE8021Q_TT_VI;
+ case DSCP_CS5:
+ case DSCP_EF:
+ case DSCP_VOICE_ADMIT:
+ return IEEE8021Q_TT_VO;
+ case DSCP_CS6:
+ return IEEE8021Q_TT_IC;
+ case DSCP_CS7:
+ return IEEE8021Q_TT_NC;
+ }
+
+ return SIMPLE_IETF_DSCP_TO_IEEE8021Q_TT(dscp);
+}
+EXPORT_SYMBOL_GPL(ietf_dscp_to_ieee8021q_tt);
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index e38e641e98d5..212cde35affa 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -1,14 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux network device link state notification
*
* Author:
* Stefan Rompf <sux@loplof.de>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/module.h>
@@ -23,6 +18,7 @@
#include <linux/bitops.h>
#include <linux/types.h>
+#include "dev.h"
enum lw_bits {
LW_URGENT = 0,
@@ -37,11 +33,37 @@ static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event);
static LIST_HEAD(lweventlist);
static DEFINE_SPINLOCK(lweventlist_lock);
-static unsigned char default_operstate(const struct net_device *dev)
+static unsigned int default_operstate(const struct net_device *dev)
{
- if (!netif_carrier_ok(dev))
- return (dev->ifindex != dev_get_iflink(dev) ?
- IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN);
+ if (netif_testing(dev))
+ return IF_OPER_TESTING;
+
+ /* Some uppers (DSA) have additional sources for being down, so
+ * first check whether lower is indeed the source of its down state.
+ */
+ if (!netif_carrier_ok(dev)) {
+ struct net_device *peer;
+ int iflink;
+
+ /* If called from netdev_run_todo()/linkwatch_sync_dev(),
+ * dev_net(dev) can be already freed, and RTNL is not held.
+ */
+ if (dev->reg_state <= NETREG_REGISTERED)
+ iflink = dev_get_iflink(dev);
+ else
+ iflink = dev->ifindex;
+
+ if (iflink == dev->ifindex)
+ return IF_OPER_DOWN;
+
+ ASSERT_RTNL();
+ peer = __dev_get_by_index(dev_net(dev), iflink);
+ if (!peer)
+ return IF_OPER_DOWN;
+
+ return netif_carrier_ok(peer) ? IF_OPER_DOWN :
+ IF_OPER_LOWERLAYERDOWN;
+ }
if (netif_dormant(dev))
return IF_OPER_DORMANT;
@@ -49,37 +71,37 @@ static unsigned char default_operstate(const struct net_device *dev)
return IF_OPER_UP;
}
-
static void rfc2863_policy(struct net_device *dev)
{
- unsigned char operstate = default_operstate(dev);
+ unsigned int operstate = default_operstate(dev);
- if (operstate == dev->operstate)
+ if (operstate == READ_ONCE(dev->operstate))
return;
- write_lock_bh(&dev_base_lock);
-
switch(dev->link_mode) {
+ case IF_LINK_MODE_TESTING:
+ if (operstate == IF_OPER_UP)
+ operstate = IF_OPER_TESTING;
+ break;
+
case IF_LINK_MODE_DORMANT:
if (operstate == IF_OPER_UP)
operstate = IF_OPER_DORMANT;
break;
-
case IF_LINK_MODE_DEFAULT:
default:
break;
}
- dev->operstate = operstate;
-
- write_unlock_bh(&dev_base_lock);
+ WRITE_ONCE(dev->operstate, operstate);
}
void linkwatch_init_dev(struct net_device *dev)
{
/* Handle pre-registration link state changes */
- if (!netif_carrier_ok(dev) || netif_dormant(dev))
+ if (!netif_carrier_ok(dev) || netif_dormant(dev) ||
+ netif_testing(dev))
rfc2863_policy(dev);
}
@@ -106,7 +128,7 @@ static void linkwatch_add_event(struct net_device *dev)
spin_lock_irqsave(&lweventlist_lock, flags);
if (list_empty(&dev->link_watch_list)) {
list_add_tail(&dev->link_watch_list, &lweventlist);
- dev_hold(dev);
+ netdev_hold(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC);
}
spin_unlock_irqrestore(&lweventlist_lock, flags);
}
@@ -135,9 +157,9 @@ static void linkwatch_schedule_work(int urgent)
* override the existing timer.
*/
if (test_bit(LW_URGENT, &linkwatch_flags))
- mod_delayed_work(system_wq, &linkwatch_work, 0);
+ mod_delayed_work(system_dfl_wq, &linkwatch_work, 0);
else
- schedule_delayed_work(&linkwatch_work, delay);
+ queue_delayed_work(system_dfl_wq, &linkwatch_work, delay);
}
@@ -161,16 +183,29 @@ static void linkwatch_do_dev(struct net_device *dev)
else
dev_deactivate(dev);
- netdev_state_change(dev);
+ netif_state_change(dev);
}
- dev_put(dev);
+ /* Note: our callers are responsible for calling netdev_tracker_free().
+ * This is the reason we use __dev_put() instead of dev_put().
+ */
+ __dev_put(dev);
}
static void __linkwatch_run_queue(int urgent_only)
{
- struct net_device *dev;
+#define MAX_DO_DEV_PER_LOOP 100
+
+ int do_dev = MAX_DO_DEV_PER_LOOP;
+ /* Use a local list here since we add non-urgent
+ * events back to the global one when called with
+ * urgent_only=1.
+ */
LIST_HEAD(wrk);
+ /* Give urgent case more budget */
+ if (urgent_only)
+ do_dev += MAX_DO_DEV_PER_LOOP;
+
/*
* Limit the number of linkwatch events to one
* per second so that a runaway driver does not
@@ -189,40 +224,72 @@ static void __linkwatch_run_queue(int urgent_only)
spin_lock_irq(&lweventlist_lock);
list_splice_init(&lweventlist, &wrk);
- while (!list_empty(&wrk)) {
+ while (!list_empty(&wrk) && do_dev > 0) {
+ struct net_device *dev;
dev = list_first_entry(&wrk, struct net_device, link_watch_list);
list_del_init(&dev->link_watch_list);
- if (urgent_only && !linkwatch_urgent_event(dev)) {
+ if (!netif_device_present(dev) ||
+ (urgent_only && !linkwatch_urgent_event(dev))) {
list_add_tail(&dev->link_watch_list, &lweventlist);
continue;
}
+ /* We must free netdev tracker under
+ * the spinlock protection.
+ */
+ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
spin_unlock_irq(&lweventlist_lock);
+ netdev_lock_ops(dev);
linkwatch_do_dev(dev);
+ netdev_unlock_ops(dev);
+ do_dev--;
spin_lock_irq(&lweventlist_lock);
}
+ /* Add the remaining work back to lweventlist */
+ list_splice_init(&wrk, &lweventlist);
+
if (!list_empty(&lweventlist))
linkwatch_schedule_work(0);
spin_unlock_irq(&lweventlist_lock);
}
-void linkwatch_forget_dev(struct net_device *dev)
+static bool linkwatch_clean_dev(struct net_device *dev)
{
unsigned long flags;
- int clean = 0;
+ bool clean = false;
spin_lock_irqsave(&lweventlist_lock, flags);
if (!list_empty(&dev->link_watch_list)) {
list_del_init(&dev->link_watch_list);
- clean = 1;
+ clean = true;
+ /* We must release netdev tracker under
+ * the spinlock protection.
+ */
+ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker);
}
spin_unlock_irqrestore(&lweventlist_lock, flags);
- if (clean)
+
+ return clean;
+}
+
+void __linkwatch_sync_dev(struct net_device *dev)
+{
+ netdev_ops_assert_locked(dev);
+
+ if (linkwatch_clean_dev(dev))
linkwatch_do_dev(dev);
}
+void linkwatch_sync_dev(struct net_device *dev)
+{
+ if (linkwatch_clean_dev(dev)) {
+ netdev_lock_ops(dev);
+ linkwatch_do_dev(dev);
+ netdev_unlock_ops(dev);
+ }
+}
/* Must be called with the rtnl semaphore held */
void linkwatch_run_queue(void)
diff --git a/net/core/lock_debug.c b/net/core/lock_debug.c
new file mode 100644
index 000000000000..9e9fb25314b9
--- /dev/null
+++ b/net/core/lock_debug.c
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright Amazon.com Inc. or its affiliates. */
+
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <linux/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/netdev_lock.h>
+#include <net/netns/generic.h>
+
+int netdev_debug_event(struct notifier_block *nb, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct net *net = dev_net(dev);
+ enum netdev_cmd cmd = event;
+
+ /* Keep enum and don't add default to trigger -Werror=switch */
+ switch (cmd) {
+ case NETDEV_XDP_FEAT_CHANGE:
+ netdev_assert_locked(dev);
+ fallthrough;
+ case NETDEV_CHANGE:
+ case NETDEV_REGISTER:
+ case NETDEV_UP:
+ netdev_ops_assert_locked(dev);
+ fallthrough;
+ case NETDEV_DOWN:
+ case NETDEV_REBOOT:
+ case NETDEV_UNREGISTER:
+ case NETDEV_CHANGEMTU:
+ case NETDEV_CHANGEADDR:
+ case NETDEV_PRE_CHANGEADDR:
+ case NETDEV_GOING_DOWN:
+ case NETDEV_FEAT_CHANGE:
+ case NETDEV_BONDING_FAILOVER:
+ case NETDEV_PRE_UP:
+ case NETDEV_PRE_TYPE_CHANGE:
+ case NETDEV_POST_TYPE_CHANGE:
+ case NETDEV_POST_INIT:
+ case NETDEV_PRE_UNINIT:
+ case NETDEV_RELEASE:
+ case NETDEV_NOTIFY_PEERS:
+ case NETDEV_JOIN:
+ case NETDEV_CHANGEUPPER:
+ case NETDEV_RESEND_IGMP:
+ case NETDEV_PRECHANGEMTU:
+ case NETDEV_CHANGEINFODATA:
+ case NETDEV_BONDING_INFO:
+ case NETDEV_PRECHANGEUPPER:
+ case NETDEV_CHANGELOWERSTATE:
+ case NETDEV_UDP_TUNNEL_PUSH_INFO:
+ case NETDEV_UDP_TUNNEL_DROP_INFO:
+ case NETDEV_CHANGE_TX_QUEUE_LEN:
+ case NETDEV_CVLAN_FILTER_PUSH_INFO:
+ case NETDEV_CVLAN_FILTER_DROP_INFO:
+ case NETDEV_SVLAN_FILTER_PUSH_INFO:
+ case NETDEV_SVLAN_FILTER_DROP_INFO:
+ case NETDEV_OFFLOAD_XSTATS_ENABLE:
+ case NETDEV_OFFLOAD_XSTATS_DISABLE:
+ case NETDEV_OFFLOAD_XSTATS_REPORT_USED:
+ case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA:
+ ASSERT_RTNL();
+ break;
+
+ case NETDEV_CHANGENAME:
+ ASSERT_RTNL_NET(net);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+EXPORT_SYMBOL_NS_GPL(netdev_debug_event, "NETDEV_INTERNAL");
+
+static int rtnl_net_debug_net_id;
+
+static int __net_init rtnl_net_debug_net_init(struct net *net)
+{
+ struct notifier_block *nb;
+
+ nb = net_generic(net, rtnl_net_debug_net_id);
+ nb->notifier_call = netdev_debug_event;
+
+ return register_netdevice_notifier_net(net, nb);
+}
+
+static void __net_exit rtnl_net_debug_net_exit(struct net *net)
+{
+ struct notifier_block *nb;
+
+ nb = net_generic(net, rtnl_net_debug_net_id);
+ unregister_netdevice_notifier_net(net, nb);
+}
+
+static struct pernet_operations rtnl_net_debug_net_ops __net_initdata = {
+ .init = rtnl_net_debug_net_init,
+ .exit = rtnl_net_debug_net_exit,
+ .id = &rtnl_net_debug_net_id,
+ .size = sizeof(struct notifier_block),
+};
+
+static struct notifier_block rtnl_net_debug_block = {
+ .notifier_call = netdev_debug_event,
+};
+
+static int __init rtnl_net_debug_init(void)
+{
+ int ret;
+
+ ret = register_pernet_subsys(&rtnl_net_debug_net_ops);
+ if (ret)
+ return ret;
+
+ ret = register_netdevice_notifier(&rtnl_net_debug_block);
+ if (ret)
+ unregister_pernet_subsys(&rtnl_net_debug_net_ops);
+
+ return ret;
+}
+
+subsys_initcall(rtnl_net_debug_init);
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 3e85437f7106..9f40be0c3e71 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -1,21 +1,19 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
+#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/bpf.h>
+#include <net/flow.h>
#include <net/lwtunnel.h>
+#include <net/gre.h>
+#include <net/ip.h>
+#include <net/ip6_route.h>
+#include <net/ipv6_stubs.h>
struct bpf_lwt_prog {
struct bpf_prog *prog;
@@ -42,19 +40,20 @@ static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
struct dst_entry *dst, bool can_redirect)
{
+ struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
int ret;
- /* Preempt disable is needed to protect per-cpu redirect_info between
- * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and
- * access to maps strictly require a rcu_read_lock() for protection,
- * mixing with BH RCU lock doesn't work.
+ /* Disabling BH is needed to protect per-CPU bpf_redirect_info between
+ * BPF prog and skb_do_redirect().
*/
- preempt_disable();
+ local_bh_disable();
+ bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx);
bpf_compute_data_pointers(skb);
ret = bpf_prog_run_save_cb(lwt->prog, skb);
switch (ret) {
case BPF_OK:
+ case BPF_LWT_REROUTE:
break;
case BPF_REDIRECT:
@@ -63,9 +62,9 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
lwt->name ? : "<unknown>");
ret = BPF_OK;
} else {
- ret = skb_do_redirect(skb);
- if (ret == 0)
- ret = BPF_REDIRECT;
+ skb_reset_mac_header(skb);
+ skb_do_redirect(skb);
+ ret = BPF_REDIRECT;
}
break;
@@ -81,11 +80,43 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
break;
}
- preempt_enable();
+ bpf_net_ctx_clear(bpf_net_ctx);
+ local_bh_enable();
return ret;
}
+static int bpf_lwt_input_reroute(struct sk_buff *skb)
+{
+ enum skb_drop_reason reason;
+ int err = -EINVAL;
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ struct net_device *dev = skb_dst(skb)->dev;
+ const struct iphdr *iph = ip_hdr(skb);
+
+ dev_hold(dev);
+ skb_dst_drop(skb);
+ reason = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ ip4h_dscp(iph), dev);
+ err = reason ? -EINVAL : 0;
+ dev_put(dev);
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ skb_dst_drop(skb);
+ err = ipv6_stub->ipv6_route_input(skb);
+ } else {
+ err = -EAFNOSUPPORT;
+ }
+
+ if (err)
+ goto err;
+ return dst_input(skb);
+
+err:
+ kfree_skb(skb);
+ return err;
+}
+
static int bpf_input(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
@@ -97,11 +128,11 @@ static int bpf_input(struct sk_buff *skb)
ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
if (ret < 0)
return ret;
+ if (ret == BPF_LWT_REROUTE)
+ return bpf_lwt_input_reroute(skb);
}
if (unlikely(!dst->lwtstate->orig_input)) {
- pr_warn_once("orig_input not set on dst for prog %s\n",
- bpf->out.name);
kfree_skb(skb);
return -EINVAL;
}
@@ -132,10 +163,8 @@ static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
return dst->lwtstate->orig_output(net, sk, skb);
}
-static int xmit_check_hhlen(struct sk_buff *skb)
+static int xmit_check_hhlen(struct sk_buff *skb, int hh_len)
{
- int hh_len = skb_dst(skb)->dev->hard_header_len;
-
if (skb_headroom(skb) < hh_len) {
int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
@@ -146,6 +175,100 @@ static int xmit_check_hhlen(struct sk_buff *skb)
return 0;
}
+static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
+{
+ struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
+ int oif = l3mdev ? l3mdev->ifindex : 0;
+ struct dst_entry *dst = NULL;
+ int err = -EAFNOSUPPORT;
+ struct sock *sk;
+ struct net *net;
+ bool ipv4;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ ipv4 = true;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ ipv4 = false;
+ else
+ goto err;
+
+ sk = sk_to_full_sk(skb->sk);
+ if (sk) {
+ if (sk->sk_bound_dev_if)
+ oif = sk->sk_bound_dev_if;
+ net = sock_net(sk);
+ } else {
+ net = dev_net(skb_dst(skb)->dev);
+ }
+
+ if (ipv4) {
+ struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {};
+ struct rtable *rt;
+
+ fl4.flowi4_oif = oif;
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_uid = sock_net_uid(net, sk);
+ fl4.flowi4_dscp = ip4h_dscp(iph);
+ fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
+ fl4.flowi4_proto = iph->protocol;
+ fl4.daddr = iph->daddr;
+ fl4.saddr = iph->saddr;
+
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ goto err;
+ }
+ dst = &rt->dst;
+ } else {
+ struct ipv6hdr *iph6 = ipv6_hdr(skb);
+ struct flowi6 fl6 = {};
+
+ fl6.flowi6_oif = oif;
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_uid = sock_net_uid(net, sk);
+ fl6.flowlabel = ip6_flowinfo(iph6);
+ fl6.flowi6_proto = iph6->nexthdr;
+ fl6.daddr = iph6->daddr;
+ fl6.saddr = iph6->saddr;
+
+ dst = ipv6_stub->ipv6_dst_lookup_flow(net, skb->sk, &fl6, NULL);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ goto err;
+ }
+ }
+ if (unlikely(dst->error)) {
+ err = dst->error;
+ dst_release(dst);
+ goto err;
+ }
+
+ /* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
+ * was done for the previous dst, so we are doing it here again, in
+ * case the new dst needs much more space. The call below is a noop
+ * if there is enough header space in skb.
+ */
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
+ if (unlikely(err))
+ goto err;
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+ if (unlikely(err))
+ return net_xmit_errno(err);
+
+ /* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
+ return LWTUNNEL_XMIT_DONE;
+
+err:
+ kfree_skb(skb);
+ return err;
+}
+
static int bpf_xmit(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
@@ -153,21 +276,33 @@ static int bpf_xmit(struct sk_buff *skb)
bpf = bpf_lwt_lwtunnel(dst->lwtstate);
if (bpf->xmit.prog) {
+ int hh_len = dst->dev->hard_header_len;
+ __be16 proto = skb->protocol;
int ret;
ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
switch (ret) {
case BPF_OK:
+ /* If the header changed, e.g. via bpf_lwt_push_encap,
+ * BPF_LWT_REROUTE below should have been used if the
+ * protocol was also changed.
+ */
+ if (skb->protocol != proto) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
/* If the header was expanded, headroom might be too
* small for L2 header to come, expand as needed.
*/
- ret = xmit_check_hhlen(skb);
+ ret = xmit_check_hhlen(skb, hh_len);
if (unlikely(ret))
return ret;
return LWTUNNEL_XMIT_CONTINUE;
case BPF_REDIRECT:
return LWTUNNEL_XMIT_DONE;
+ case BPF_LWT_REROUTE:
+ return bpf_lwt_xmit_reroute(skb);
default:
return ret;
}
@@ -207,8 +342,8 @@ static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
int ret;
u32 fd;
- ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy,
- NULL);
+ ret = nla_parse_nested_deprecated(tb, LWT_BPF_PROG_MAX, attr,
+ bpf_prog_policy, NULL);
if (ret < 0)
return ret;
@@ -236,7 +371,7 @@ static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
[LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
};
-static int bpf_build_state(struct nlattr *nla,
+static int bpf_build_state(struct net *net, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
@@ -249,7 +384,8 @@ static int bpf_build_state(struct nlattr *nla,
if (family != AF_INET && family != AF_INET6)
return -EAFNOSUPPORT;
- ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack);
+ ret = nla_parse_nested_deprecated(tb, LWT_BPF_MAX, nla, bpf_nl_policy,
+ extack);
if (ret < 0)
return ret;
@@ -317,7 +453,7 @@ static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
if (!prog->prog)
return 0;
- nest = nla_nest_start(skb, attr);
+ nest = nla_nest_start_noflag(skb, attr);
if (!nest)
return -EMSGSIZE;
@@ -389,6 +525,135 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = {
.owner = THIS_MODULE,
};
+static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type,
+ int encap_len)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+ gso_type |= SKB_GSO_DODGY;
+ shinfo->gso_type |= gso_type;
+ skb_decrease_gso_size(shinfo, encap_len);
+ shinfo->gso_segs = 0;
+ return 0;
+}
+
+static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
+{
+ int next_hdr_offset;
+ void *next_hdr;
+ __u8 protocol;
+
+ /* SCTP and UDP_L4 gso need more nuanced handling than what
+ * handle_gso_type() does above: skb_decrease_gso_size() is not enough.
+ * So at the moment only TCP GSO packets are let through.
+ */
+ if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+ return -ENOTSUPP;
+
+ if (ipv4) {
+ protocol = ip_hdr(skb)->protocol;
+ next_hdr_offset = sizeof(struct iphdr);
+ next_hdr = skb_network_header(skb) + next_hdr_offset;
+ } else {
+ protocol = ipv6_hdr(skb)->nexthdr;
+ next_hdr_offset = sizeof(struct ipv6hdr);
+ next_hdr = skb_network_header(skb) + next_hdr_offset;
+ }
+
+ switch (protocol) {
+ case IPPROTO_GRE:
+ next_hdr_offset += sizeof(struct gre_base_hdr);
+ if (next_hdr_offset > encap_len)
+ return -EINVAL;
+
+ if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM)
+ return handle_gso_type(skb, SKB_GSO_GRE_CSUM,
+ encap_len);
+ return handle_gso_type(skb, SKB_GSO_GRE, encap_len);
+
+ case IPPROTO_UDP:
+ next_hdr_offset += sizeof(struct udphdr);
+ if (next_hdr_offset > encap_len)
+ return -EINVAL;
+
+ if (((struct udphdr *)next_hdr)->check)
+ return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM,
+ encap_len);
+ return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len);
+
+ case IPPROTO_IP:
+ case IPPROTO_IPV6:
+ if (ipv4)
+ return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len);
+ else
+ return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len);
+
+ default:
+ return -EPROTONOSUPPORT;
+ }
+}
+
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
+{
+ struct iphdr *iph;
+ bool ipv4;
+ int err;
+
+ if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
+ return -EINVAL;
+
+ /* validate protocol and length */
+ iph = (struct iphdr *)hdr;
+ if (iph->version == 4) {
+ ipv4 = true;
+ if (unlikely(len < iph->ihl * 4))
+ return -EINVAL;
+ } else if (iph->version == 6) {
+ ipv4 = false;
+ if (unlikely(len < sizeof(struct ipv6hdr)))
+ return -EINVAL;
+ } else {
+ return -EINVAL;
+ }
+
+ if (ingress)
+ err = skb_cow_head(skb, len + skb->mac_len);
+ else
+ err = skb_cow_head(skb,
+ len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
+ if (unlikely(err))
+ return err;
+
+ /* push the encap headers and fix pointers */
+ skb_reset_inner_headers(skb);
+ skb_reset_inner_mac_header(skb); /* mac header is not yet set */
+ skb_set_inner_protocol(skb, skb->protocol);
+ skb->encapsulation = 1;
+ skb_push(skb, len);
+ if (ingress)
+ skb_postpush_rcsum(skb, iph, len);
+ skb_reset_network_header(skb);
+ memcpy(skb_network_header(skb), hdr, len);
+ bpf_compute_data_pointers(skb);
+ skb_clear_hash(skb);
+
+ if (ipv4) {
+ skb->protocol = htons(ETH_P_IP);
+ iph = ip_hdr(skb);
+
+ if (!iph->check)
+ iph->check = ip_fast_csum((unsigned char *)iph,
+ iph->ihl);
+ } else {
+ skb->protocol = htons(ETH_P_IPV6);
+ }
+
+ if (skb_is_gso(skb))
+ return handle_gso_encap(skb, ipv4, len);
+
+ return 0;
+}
+
static int __init bpf_lwt_init(void)
{
return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 0b171756453c..f9d76d85d04f 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* lwtunnel Infrastructure for light weight tunnels like mpls
*
* Authors: Roopa Prabhu, <roopa@cumulusnetworks.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/capability.h>
@@ -26,7 +21,12 @@
#include <net/lwtunnel.h>
#include <net/rtnetlink.h>
#include <net/ip6_fib.h>
-#include <net/nexthop.h>
+#include <net/rtnh.h>
+
+#include "dev.h"
+
+DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled);
+EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled);
#ifdef CONFIG_MODULES
@@ -46,6 +46,13 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
return "BPF";
case LWTUNNEL_ENCAP_SEG6_LOCAL:
return "SEG6LOCAL";
+ case LWTUNNEL_ENCAP_RPL:
+ return "RPL";
+ case LWTUNNEL_ENCAP_IOAM6:
+ return "IOAM6";
+ case LWTUNNEL_ENCAP_XFRM:
+ /* module autoload not supported for encap type */
+ return NULL;
case LWTUNNEL_ENCAP_IP6:
case LWTUNNEL_ENCAP_IP:
case LWTUNNEL_ENCAP_NONE:
@@ -103,7 +110,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops,
}
EXPORT_SYMBOL_GPL(lwtunnel_encap_del_ops);
-int lwtunnel_build_state(u16 encap_type,
+int lwtunnel_build_state(struct net *net, u16 encap_type,
struct nlattr *encap, unsigned int family,
const void *cfg, struct lwtunnel_state **lws,
struct netlink_ext_ack *extack)
@@ -122,18 +129,18 @@ int lwtunnel_build_state(u16 encap_type,
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[encap_type]);
- if (likely(ops && ops->build_state && try_module_get(ops->owner))) {
+ if (likely(ops && ops->build_state && try_module_get(ops->owner)))
found = true;
- ret = ops->build_state(encap, family, cfg, lws, extack);
- if (ret)
- module_put(ops->owner);
- }
rcu_read_unlock();
- /* don't rely on -EOPNOTSUPP to detect match as build_state
- * handlers could return it
- */
- if (!found) {
+ if (found) {
+ ret = ops->build_state(net, encap, family, cfg, lws, extack);
+ if (ret)
+ module_put(ops->owner);
+ } else {
+ /* don't rely on -EOPNOTSUPP to detect match as build_state
+ * handlers could return it
+ */
NL_SET_ERR_MSG_ATTR(extack, encap,
"LWT encapsulation type not supported");
}
@@ -153,21 +160,14 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack)
return ret;
}
- rcu_read_lock();
- ops = rcu_dereference(lwtun_encaps[encap_type]);
- rcu_read_unlock();
+ ops = rcu_access_pointer(lwtun_encaps[encap_type]);
#ifdef CONFIG_MODULES
if (!ops) {
const char *encap_type_str = lwtunnel_encap_str(encap_type);
if (encap_type_str) {
- __rtnl_unlock();
request_module("rtnl-lwt-%s", encap_type_str);
- rtnl_lock();
-
- rcu_read_lock();
- ops = rcu_dereference(lwtun_encaps[encap_type]);
- rcu_read_unlock();
+ ops = rcu_access_pointer(lwtun_encaps[encap_type]);
}
}
#endif
@@ -195,10 +195,13 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining,
nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
if (nla_entype) {
+ if (nla_len(nla_entype) < sizeof(u16)) {
+ NL_SET_ERR_MSG(extack, "Invalid RTA_ENCAP_TYPE");
+ return -EINVAL;
+ }
encap_type = nla_get_u16(nla_entype);
- if (lwtunnel_valid_encap_type(encap_type,
- extack) != 0)
+ if (lwtunnel_valid_encap_type(encap_type, extack))
return -EOPNOTSUPP;
}
}
@@ -223,7 +226,8 @@ void lwtstate_free(struct lwtunnel_state *lws)
}
EXPORT_SYMBOL_GPL(lwtstate_free);
-int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
+int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate,
+ int encap_attr, int encap_type_attr)
{
const struct lwtunnel_encap_ops *ops;
struct nlattr *nest;
@@ -236,7 +240,7 @@ int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
lwtstate->type > LWTUNNEL_ENCAP_MAX)
return 0;
- nest = nla_nest_start(skb, RTA_ENCAP);
+ nest = nla_nest_start_noflag(skb, encap_attr);
if (!nest)
return -EMSGSIZE;
@@ -250,7 +254,7 @@ int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate)
if (ret)
goto nla_put_failure;
nla_nest_end(skb, nest);
- ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type);
+ ret = nla_put_u16(skb, encap_type_attr, lwtstate->type);
if (ret)
goto nla_put_failure;
@@ -315,82 +319,132 @@ EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap);
int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
- int ret = -EINVAL;
+ struct dst_entry *dst;
+ int ret;
+
+ local_bh_disable();
- if (!dst)
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
goto drop;
+ }
+
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
+ goto drop;
+ }
lwtstate = dst->lwtstate;
if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
- lwtstate->type > LWTUNNEL_ENCAP_MAX)
- return 0;
+ lwtstate->type > LWTUNNEL_ENCAP_MAX) {
+ ret = 0;
+ goto out;
+ }
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
- if (likely(ops && ops->output))
+ if (likely(ops && ops->output)) {
+ dev_xmit_recursion_inc();
ret = ops->output(net, sk, skb);
+ dev_xmit_recursion_dec();
+ }
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
goto drop;
- return ret;
+ goto out;
drop:
kfree_skb(skb);
+out:
+ local_bh_enable();
return ret;
}
EXPORT_SYMBOL_GPL(lwtunnel_output);
int lwtunnel_xmit(struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
- int ret = -EINVAL;
+ struct dst_entry *dst;
+ int ret;
+
+ local_bh_disable();
- if (!dst)
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
goto drop;
+ }
+
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
+ goto drop;
+ }
lwtstate = dst->lwtstate;
if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
- lwtstate->type > LWTUNNEL_ENCAP_MAX)
- return 0;
+ lwtstate->type > LWTUNNEL_ENCAP_MAX) {
+ ret = 0;
+ goto out;
+ }
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
- if (likely(ops && ops->xmit))
+ if (likely(ops && ops->xmit)) {
+ dev_xmit_recursion_inc();
ret = ops->xmit(skb);
+ dev_xmit_recursion_dec();
+ }
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
goto drop;
- return ret;
+ goto out;
drop:
kfree_skb(skb);
+out:
+ local_bh_enable();
return ret;
}
EXPORT_SYMBOL_GPL(lwtunnel_xmit);
int lwtunnel_input(struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
- int ret = -EINVAL;
+ struct dst_entry *dst;
+ int ret;
+
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
- if (!dst)
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
goto drop;
+ }
+
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
+ goto drop;
+ }
lwtstate = dst->lwtstate;
if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
@@ -400,8 +454,11 @@ int lwtunnel_input(struct sk_buff *skb)
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
- if (likely(ops && ops->input))
+ if (likely(ops && ops->input)) {
+ dev_xmit_recursion_inc();
ret = ops->input(skb);
+ dev_xmit_recursion_dec();
+ }
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
diff --git a/net/core/mp_dmabuf_devmem.h b/net/core/mp_dmabuf_devmem.h
new file mode 100644
index 000000000000..67cd0dd7319c
--- /dev/null
+++ b/net/core/mp_dmabuf_devmem.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Dmabuf device memory provider.
+ *
+ * Authors: Mina Almasry <almasrymina@google.com>
+ *
+ */
+#ifndef _NET_MP_DMABUF_DEVMEM_H
+#define _NET_MP_DMABUF_DEVMEM_H
+
+#include <net/netmem.h>
+
+#if defined(CONFIG_NET_DEVMEM)
+int mp_dmabuf_devmem_init(struct page_pool *pool);
+
+netmem_ref mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp);
+
+void mp_dmabuf_devmem_destroy(struct page_pool *pool);
+
+bool mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem);
+#else
+static inline int mp_dmabuf_devmem_init(struct page_pool *pool)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline netmem_ref
+mp_dmabuf_devmem_alloc_netmems(struct page_pool *pool, gfp_t gfp)
+{
+ return 0;
+}
+
+static inline void mp_dmabuf_devmem_destroy(struct page_pool *pool)
+{
+}
+
+static inline bool
+mp_dmabuf_devmem_release_page(struct page_pool *pool, netmem_ref netmem)
+{
+ return false;
+}
+#endif
+
+#endif /* _NET_MP_DMABUF_DEVMEM_H */
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 91592fceeaad..96a3b1a93252 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Generic address resolution entity
*
@@ -5,11 +6,6 @@
* Pedro Roque <roque@di.fc.ul.pt>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Fixes:
* Vitaly E. Lavrov releasing NULL neighbor in neigh_add.
* Harald Welte Add neighbour cache statistics like rtstat
@@ -30,7 +26,9 @@
#include <linux/times.h>
#include <net/net_namespace.h>
#include <net/neighbour.h>
+#include <net/arp.h>
#include <net/dst.h>
+#include <net/ip.h>
#include <net/sock.h>
#include <net/netevent.h>
#include <net/netlink.h>
@@ -41,7 +39,8 @@
#include <linux/inetdevice.h>
#include <net/addrconf.h>
-#define DEBUG
+#include <trace/events/neigh.h>
+
#define NEIGH_DEBUG 1
#define neigh_dbg(level, fmt, ...) \
do { \
@@ -55,15 +54,34 @@ static void neigh_timer_handler(struct timer_list *t);
static void __neigh_notify(struct neighbour *n, int type, int flags,
u32 pid);
static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
-static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
- struct net_device *dev);
+static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
+ bool skip_perm);
#ifdef CONFIG_PROC_FS
static const struct seq_operations neigh_stat_seq_ops;
#endif
+static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family)
+{
+ int i;
+
+ switch (family) {
+ default:
+ DEBUG_NET_WARN_ON_ONCE(1);
+ fallthrough; /* to avoid panic by null-ptr-deref */
+ case AF_INET:
+ i = NEIGH_ARP_TABLE;
+ break;
+ case AF_INET6:
+ i = NEIGH_ND_TABLE;
+ break;
+ }
+
+ return &dev->neighbours[i];
+}
+
/*
- Neighbour hash table buckets are protected with rwlock tbl->lock.
+ Neighbour hash table buckets are protected with tbl->lock.
- All the scans/updates to hash buckets MUST be made under this lock.
- NOTHING clever should be made under this lock: no callbacks
@@ -98,9 +116,7 @@ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
static void neigh_cleanup_and_release(struct neighbour *neigh)
{
- if (neigh->parms->neigh_cleanup)
- neigh->parms->neigh_cleanup(neigh);
-
+ trace_neigh_cleanup_and_release(neigh, 0);
__neigh_notify(neigh, RTM_DELNEIGH, 0, 0);
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
neigh_release(neigh);
@@ -114,97 +130,186 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)
unsigned long neigh_rand_reach_time(unsigned long base)
{
- return base ? (prandom_u32() % base) + (base >> 1) : 0;
+ return base ? get_random_u32_below(base) + (base >> 1) : 0;
}
EXPORT_SYMBOL(neigh_rand_reach_time);
+static void neigh_mark_dead(struct neighbour *n)
+{
+ n->dead = 1;
+ if (!list_empty(&n->gc_list)) {
+ list_del_init(&n->gc_list);
+ atomic_dec(&n->tbl->gc_entries);
+ }
+ if (!list_empty(&n->managed_list))
+ list_del_init(&n->managed_list);
+}
-static bool neigh_del(struct neighbour *n, __u8 state, __u8 flags,
- struct neighbour __rcu **np, struct neigh_table *tbl)
+static void neigh_update_gc_list(struct neighbour *n)
{
- bool retval = false;
+ bool on_gc_list, exempt_from_gc;
+ spin_lock_bh(&n->tbl->lock);
write_lock(&n->lock);
- if (refcount_read(&n->refcnt) == 1 && !(n->nud_state & state) &&
- !(n->flags & flags)) {
- struct neighbour *neigh;
+ if (n->dead)
+ goto out;
- neigh = rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock));
- rcu_assign_pointer(*np, neigh);
- n->dead = 1;
- retval = true;
+ /* remove from the gc list if new state is permanent or if neighbor is
+ * externally learned / validated; otherwise entry should be on the gc
+ * list
+ */
+ exempt_from_gc = n->nud_state & NUD_PERMANENT ||
+ n->flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED);
+ on_gc_list = !list_empty(&n->gc_list);
+
+ if (exempt_from_gc && on_gc_list) {
+ list_del_init(&n->gc_list);
+ atomic_dec(&n->tbl->gc_entries);
+ } else if (!exempt_from_gc && !on_gc_list) {
+ /* add entries to the tail; cleaning removes from the front */
+ list_add_tail(&n->gc_list, &n->tbl->gc_list);
+ atomic_inc(&n->tbl->gc_entries);
}
+out:
write_unlock(&n->lock);
- if (retval)
- neigh_cleanup_and_release(n);
- return retval;
+ spin_unlock_bh(&n->tbl->lock);
}
-bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl)
+static void neigh_update_managed_list(struct neighbour *n)
{
- struct neigh_hash_table *nht;
- void *pkey = ndel->primary_key;
- u32 hash_val;
- struct neighbour *n;
- struct neighbour __rcu **np;
+ bool on_managed_list, add_to_managed;
- nht = rcu_dereference_protected(tbl->nht,
- lockdep_is_held(&tbl->lock));
- hash_val = tbl->hash(pkey, ndel->dev, nht->hash_rnd);
- hash_val = hash_val >> (32 - nht->hash_shift);
+ spin_lock_bh(&n->tbl->lock);
+ write_lock(&n->lock);
+ if (n->dead)
+ goto out;
+
+ add_to_managed = n->flags & NTF_MANAGED;
+ on_managed_list = !list_empty(&n->managed_list);
+
+ if (!add_to_managed && on_managed_list)
+ list_del_init(&n->managed_list);
+ else if (add_to_managed && !on_managed_list)
+ list_add_tail(&n->managed_list, &n->tbl->managed_list);
+out:
+ write_unlock(&n->lock);
+ spin_unlock_bh(&n->tbl->lock);
+}
+
+static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify,
+ bool *gc_update, bool *managed_update)
+{
+ u32 ndm_flags, old_flags = neigh->flags;
+
+ if (!(flags & NEIGH_UPDATE_F_ADMIN))
+ return;
- np = &nht->hash_buckets[hash_val];
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock)))) {
- if (n == ndel)
- return neigh_del(n, 0, 0, np, tbl);
- np = &n->next;
+ ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0;
+ ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0;
+ ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_VALIDATED) ? NTF_EXT_VALIDATED : 0;
+
+ if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) {
+ if (ndm_flags & NTF_EXT_LEARNED)
+ neigh->flags |= NTF_EXT_LEARNED;
+ else
+ neigh->flags &= ~NTF_EXT_LEARNED;
+ *notify = 1;
+ *gc_update = true;
+ }
+ if ((old_flags ^ ndm_flags) & NTF_MANAGED) {
+ if (ndm_flags & NTF_MANAGED)
+ neigh->flags |= NTF_MANAGED;
+ else
+ neigh->flags &= ~NTF_MANAGED;
+ *notify = 1;
+ *managed_update = true;
+ }
+ if ((old_flags ^ ndm_flags) & NTF_EXT_VALIDATED) {
+ if (ndm_flags & NTF_EXT_VALIDATED)
+ neigh->flags |= NTF_EXT_VALIDATED;
+ else
+ neigh->flags &= ~NTF_EXT_VALIDATED;
+ *notify = 1;
+ *gc_update = true;
}
- return false;
+}
+
+bool neigh_remove_one(struct neighbour *n)
+{
+ bool retval = false;
+
+ write_lock(&n->lock);
+ if (refcount_read(&n->refcnt) == 1) {
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
+ neigh_mark_dead(n);
+ retval = true;
+ }
+ write_unlock(&n->lock);
+ if (retval)
+ neigh_cleanup_and_release(n);
+ return retval;
}
static int neigh_forced_gc(struct neigh_table *tbl)
{
+ int max_clean = atomic_read(&tbl->gc_entries) -
+ READ_ONCE(tbl->gc_thresh2);
+ u64 tmax = ktime_get_ns() + NSEC_PER_MSEC;
+ unsigned long tref = jiffies - 5 * HZ;
+ struct neighbour *n, *tmp;
int shrunk = 0;
- int i;
- struct neigh_hash_table *nht;
+ int loop = 0;
NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
- write_lock_bh(&tbl->lock);
- nht = rcu_dereference_protected(tbl->nht,
- lockdep_is_held(&tbl->lock));
- for (i = 0; i < (1 << nht->hash_shift); i++) {
- struct neighbour *n;
- struct neighbour __rcu **np;
-
- np = &nht->hash_buckets[i];
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock))) != NULL) {
- /* Neighbour record may be discarded if:
- * - nobody refers to it.
- * - it is not permanent
- */
- if (neigh_del(n, NUD_PERMANENT, NTF_EXT_LEARNED, np,
- tbl)) {
- shrunk = 1;
- continue;
+ spin_lock_bh(&tbl->lock);
+
+ list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) {
+ if (refcount_read(&n->refcnt) == 1) {
+ bool remove = false;
+
+ write_lock(&n->lock);
+ if ((n->nud_state == NUD_FAILED) ||
+ (n->nud_state == NUD_NOARP) ||
+ (tbl->is_multicast &&
+ tbl->is_multicast(n->primary_key)) ||
+ !time_in_range(n->updated, tref, jiffies))
+ remove = true;
+ write_unlock(&n->lock);
+
+ if (remove && neigh_remove_one(n))
+ shrunk++;
+ if (shrunk >= max_clean)
+ break;
+ if (++loop == 16) {
+ if (ktime_get_ns() > tmax)
+ goto unlock;
+ loop = 0;
}
- np = &n->next;
}
}
- tbl->last_flush = jiffies;
-
- write_unlock_bh(&tbl->lock);
+ WRITE_ONCE(tbl->last_flush, jiffies);
+unlock:
+ spin_unlock_bh(&tbl->lock);
return shrunk;
}
static void neigh_add_timer(struct neighbour *n, unsigned long when)
{
+ /* Use safe distance from the jiffies - LONG_MAX point while timer
+ * is running in DELAY/PROBE state but still show to user space
+ * large times in the past.
+ */
+ unsigned long mint = jiffies - (LONG_MAX - 86400 * HZ);
+
neigh_hold(n);
+ if (!time_in_range(n->confirmed, mint, jiffies))
+ n->confirmed = mint;
+ if (time_before(n->used, n->confirmed))
+ n->used = n->confirmed;
if (unlikely(mod_timer(&n->timer, when))) {
printk("NEIGH: BUG, double timer add, state is %x\n",
n->nud_state);
@@ -215,105 +320,197 @@ static void neigh_add_timer(struct neighbour *n, unsigned long when)
static int neigh_del_timer(struct neighbour *n)
{
if ((n->nud_state & NUD_IN_TIMER) &&
- del_timer(&n->timer)) {
+ timer_delete(&n->timer)) {
neigh_release(n);
return 1;
}
return 0;
}
-static void pneigh_queue_purge(struct sk_buff_head *list)
+static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev,
+ int family)
+{
+ switch (family) {
+ case AF_INET:
+ return __in_dev_arp_parms_get_rcu(dev);
+ case AF_INET6:
+ return __in6_dev_nd_parms_get_rcu(dev);
+ }
+ return NULL;
+}
+
+static void neigh_parms_qlen_dec(struct net_device *dev, int family)
{
+ struct neigh_parms *p;
+
+ rcu_read_lock();
+ p = neigh_get_dev_parms_rcu(dev, family);
+ if (p)
+ p->qlen--;
+ rcu_read_unlock();
+}
+
+static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net,
+ int family)
+{
+ struct sk_buff_head tmp;
+ unsigned long flags;
struct sk_buff *skb;
- while ((skb = skb_dequeue(list)) != NULL) {
+ skb_queue_head_init(&tmp);
+ spin_lock_irqsave(&list->lock, flags);
+ skb = skb_peek(list);
+ while (skb != NULL) {
+ struct sk_buff *skb_next = skb_peek_next(skb, list);
+ struct net_device *dev = skb->dev;
+
+ if (net == NULL || net_eq(dev_net(dev), net)) {
+ neigh_parms_qlen_dec(dev, family);
+ __skb_unlink(skb, list);
+ __skb_queue_tail(&tmp, skb);
+ }
+ skb = skb_next;
+ }
+ spin_unlock_irqrestore(&list->lock, flags);
+
+ while ((skb = __skb_dequeue(&tmp))) {
dev_put(skb->dev);
kfree_skb(skb);
}
}
-static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
+static void neigh_flush_one(struct neighbour *n)
+{
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
+
+ write_lock(&n->lock);
+
+ neigh_del_timer(n);
+ neigh_mark_dead(n);
+
+ if (refcount_read(&n->refcnt) != 1) {
+ /* The most unpleasant situation.
+ * We must destroy neighbour entry,
+ * but someone still uses it.
+ *
+ * The destroy will be delayed until
+ * the last user releases us, but
+ * we must kill timers etc. and move
+ * it to safe state.
+ */
+ __skb_queue_purge(&n->arp_queue);
+ n->arp_queue_len_bytes = 0;
+ WRITE_ONCE(n->output, neigh_blackhole);
+
+ if (n->nud_state & NUD_VALID)
+ n->nud_state = NUD_NOARP;
+ else
+ n->nud_state = NUD_NONE;
+
+ neigh_dbg(2, "neigh %p is stray\n", n);
+ }
+
+ write_unlock(&n->lock);
+
+ neigh_cleanup_and_release(n);
+}
+
+static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev,
+ bool skip_perm)
+{
+ struct hlist_head *dev_head;
+ struct hlist_node *tmp;
+ struct neighbour *n;
+
+ dev_head = neigh_get_dev_table(dev, tbl->family);
+
+ hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) {
+ if (skip_perm &&
+ (n->nud_state & NUD_PERMANENT ||
+ n->flags & NTF_EXT_VALIDATED))
+ continue;
+
+ neigh_flush_one(n);
+ }
+}
+
+static void neigh_flush_table(struct neigh_table *tbl)
{
- int i;
struct neigh_hash_table *nht;
+ int i;
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
for (i = 0; i < (1 << nht->hash_shift); i++) {
+ struct hlist_node *tmp;
struct neighbour *n;
- struct neighbour __rcu **np = &nht->hash_buckets[i];
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock))) != NULL) {
- if (dev && n->dev != dev) {
- np = &n->next;
- continue;
- }
- rcu_assign_pointer(*np,
- rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock)));
- write_lock(&n->lock);
- neigh_del_timer(n);
- n->dead = 1;
-
- if (refcount_read(&n->refcnt) != 1) {
- /* The most unpleasant situation.
- We must destroy neighbour entry,
- but someone still uses it.
-
- The destroy will be delayed until
- the last user releases us, but
- we must kill timers etc. and move
- it to safe state.
- */
- __skb_queue_purge(&n->arp_queue);
- n->arp_queue_len_bytes = 0;
- n->output = neigh_blackhole;
- if (n->nud_state & NUD_VALID)
- n->nud_state = NUD_NOARP;
- else
- n->nud_state = NUD_NONE;
- neigh_dbg(2, "neigh %p is stray\n", n);
- }
- write_unlock(&n->lock);
- neigh_cleanup_and_release(n);
- }
+ neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i])
+ neigh_flush_one(n);
}
}
void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev)
{
- write_lock_bh(&tbl->lock);
- neigh_flush_dev(tbl, dev);
- write_unlock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
+ neigh_flush_dev(tbl, dev, false);
+ spin_unlock_bh(&tbl->lock);
}
EXPORT_SYMBOL(neigh_changeaddr);
-int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
+ bool skip_perm)
{
- write_lock_bh(&tbl->lock);
- neigh_flush_dev(tbl, dev);
- pneigh_ifdown_and_unlock(tbl, dev);
+ spin_lock_bh(&tbl->lock);
+ if (likely(dev)) {
+ neigh_flush_dev(tbl, dev, skip_perm);
+ } else {
+ DEBUG_NET_WARN_ON_ONCE(skip_perm);
+ neigh_flush_table(tbl);
+ }
+ spin_unlock_bh(&tbl->lock);
+
+ pneigh_ifdown(tbl, dev, skip_perm);
+ pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL,
+ tbl->family);
+ if (skb_queue_empty_lockless(&tbl->proxy_queue))
+ timer_delete_sync(&tbl->proxy_timer);
+ return 0;
+}
- del_timer_sync(&tbl->proxy_timer);
- pneigh_queue_purge(&tbl->proxy_queue);
+int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev)
+{
+ __neigh_ifdown(tbl, dev, true);
+ return 0;
+}
+EXPORT_SYMBOL(neigh_carrier_down);
+
+int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
+{
+ __neigh_ifdown(tbl, dev, false);
return 0;
}
EXPORT_SYMBOL(neigh_ifdown);
-static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
+static struct neighbour *neigh_alloc(struct neigh_table *tbl,
+ struct net_device *dev,
+ u32 flags, bool exempt_from_gc)
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
- int entries;
-
- entries = atomic_inc_return(&tbl->entries) - 1;
- if (entries >= tbl->gc_thresh3 ||
- (entries >= tbl->gc_thresh2 &&
- time_after(now, tbl->last_flush + 5 * HZ))) {
- if (!neigh_forced_gc(tbl) &&
- entries >= tbl->gc_thresh3) {
+ int entries, gc_thresh3;
+
+ if (exempt_from_gc)
+ goto do_alloc;
+
+ entries = atomic_inc_return(&tbl->gc_entries) - 1;
+ gc_thresh3 = READ_ONCE(tbl->gc_thresh3);
+ if (entries >= gc_thresh3 ||
+ (entries >= READ_ONCE(tbl->gc_thresh2) &&
+ time_after(now, READ_ONCE(tbl->last_flush) + 5 * HZ))) {
+ if (!neigh_forced_gc(tbl) && entries >= gc_thresh3) {
net_info_ratelimited("%s: neighbor table overflow!\n",
tbl->id);
NEIGH_CACHE_STAT_INC(tbl, table_fulls);
@@ -321,6 +518,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
}
}
+do_alloc:
n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
if (!n)
goto out_entries;
@@ -331,6 +529,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
n->updated = n->used = now;
n->nud_state = NUD_NONE;
n->output = neigh_blackhole;
+ n->flags = flags;
seqlock_init(&n->hh.hh_lock);
n->parms = neigh_parms_clone(&tbl->parms);
timer_setup(&n->timer, neigh_timer_handler, 0);
@@ -339,11 +538,16 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
n->tbl = tbl;
refcount_set(&n->refcnt, 1);
n->dead = 1;
+ INIT_LIST_HEAD(&n->gc_list);
+ INIT_LIST_HEAD(&n->managed_list);
+
+ atomic_inc(&tbl->entries);
out:
return n;
out_entries:
- atomic_dec(&tbl->entries);
+ if (!exempt_from_gc)
+ atomic_dec(&tbl->gc_entries);
goto out;
}
@@ -354,25 +558,21 @@ static void neigh_get_hash_rnd(u32 *x)
static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
{
- size_t size = (1 << shift) * sizeof(struct neighbour *);
+ size_t size = (1 << shift) * sizeof(struct hlist_head);
+ struct hlist_head *hash_heads;
struct neigh_hash_table *ret;
- struct neighbour __rcu **buckets;
int i;
ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
if (!ret)
return NULL;
- if (size <= PAGE_SIZE)
- buckets = kzalloc(size, GFP_ATOMIC);
- else
- buckets = (struct neighbour __rcu **)
- __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
- get_order(size));
- if (!buckets) {
+
+ hash_heads = kzalloc(size, GFP_ATOMIC);
+ if (!hash_heads) {
kfree(ret);
return NULL;
}
- ret->hash_buckets = buckets;
+ ret->hash_heads = hash_heads;
ret->hash_shift = shift;
for (i = 0; i < NEIGH_NUM_HASH_RND; i++)
neigh_get_hash_rnd(&ret->hash_rnd[i]);
@@ -384,13 +584,8 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
struct neigh_hash_table *nht = container_of(head,
struct neigh_hash_table,
rcu);
- size_t size = (1 << nht->hash_shift) * sizeof(struct neighbour *);
- struct neighbour __rcu **buckets = nht->hash_buckets;
- if (size <= PAGE_SIZE)
- kfree(buckets);
- else
- free_pages((unsigned long)buckets, get_order(size));
+ kfree(nht->hash_heads);
kfree(nht);
}
@@ -409,24 +604,17 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
return old_nht;
for (i = 0; i < (1 << old_nht->hash_shift); i++) {
- struct neighbour *n, *next;
+ struct hlist_node *tmp;
+ struct neighbour *n;
- for (n = rcu_dereference_protected(old_nht->hash_buckets[i],
- lockdep_is_held(&tbl->lock));
- n != NULL;
- n = next) {
+ neigh_for_each_in_bucket_safe(n, tmp, &old_nht->hash_heads[i]) {
hash = tbl->hash(n->primary_key, n->dev,
new_nht->hash_rnd);
hash >>= (32 - new_nht->hash_shift);
- next = rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock));
- rcu_assign_pointer(n->next,
- rcu_dereference_protected(
- new_nht->hash_buckets[hash],
- lockdep_is_held(&tbl->lock)));
- rcu_assign_pointer(new_nht->hash_buckets[hash], n);
+ hlist_del_rcu(&n->hash);
+ hlist_add_head_rcu(&n->hash, &new_nht->hash_heads[hash]);
}
}
@@ -442,7 +630,7 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
NEIGH_CACHE_STAT_INC(tbl, lookups);
- rcu_read_lock_bh();
+ rcu_read_lock();
n = __neigh_lookup_noref(tbl, pkey, dev);
if (n) {
if (!refcount_inc_not_zero(&n->refcnt))
@@ -450,51 +638,23 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
NEIGH_CACHE_STAT_INC(tbl, hits);
}
- rcu_read_unlock_bh();
+ rcu_read_unlock();
return n;
}
EXPORT_SYMBOL(neigh_lookup);
-struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
- const void *pkey)
+static struct neighbour *
+___neigh_create(struct neigh_table *tbl, const void *pkey,
+ struct net_device *dev, u32 flags,
+ bool exempt_from_gc, bool want_ref)
{
- struct neighbour *n;
- unsigned int key_len = tbl->key_len;
- u32 hash_val;
+ u32 hash_val, key_len = tbl->key_len;
+ struct neighbour *n1, *rc, *n;
struct neigh_hash_table *nht;
-
- NEIGH_CACHE_STAT_INC(tbl, lookups);
-
- rcu_read_lock_bh();
- nht = rcu_dereference_bh(tbl->nht);
- hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) >> (32 - nht->hash_shift);
-
- for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
- n != NULL;
- n = rcu_dereference_bh(n->next)) {
- if (!memcmp(n->primary_key, pkey, key_len) &&
- net_eq(dev_net(n->dev), net)) {
- if (!refcount_inc_not_zero(&n->refcnt))
- n = NULL;
- NEIGH_CACHE_STAT_INC(tbl, hits);
- break;
- }
- }
-
- rcu_read_unlock_bh();
- return n;
-}
-EXPORT_SYMBOL(neigh_lookup_nodev);
-
-struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
- struct net_device *dev, bool want_ref)
-{
- u32 hash_val;
- unsigned int key_len = tbl->key_len;
int error;
- struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
- struct neigh_hash_table *nht;
+ n = neigh_alloc(tbl, dev, flags, exempt_from_gc);
+ trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc);
if (!n) {
rc = ERR_PTR(-ENOBUFS);
goto out;
@@ -502,7 +662,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
memcpy(n->primary_key, pkey, key_len);
n->dev = dev;
- dev_hold(dev);
+ netdev_hold(dev, &n->dev_tracker, GFP_ATOMIC);
/* Protocol specific setup. */
if (tbl->constructor && (error = tbl->constructor(n)) < 0) {
@@ -527,7 +687,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1);
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
@@ -541,11 +701,7 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
goto out_tbl_unlock;
}
- for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val],
- lockdep_is_held(&tbl->lock));
- n1 != NULL;
- n1 = rcu_dereference_protected(n1->next,
- lockdep_is_held(&tbl->lock))) {
+ neigh_for_each_in_bucket(n1, &nht->hash_heads[hash_val]) {
if (dev == n1->dev && !memcmp(n1->primary_key, n->primary_key, key_len)) {
if (want_ref)
neigh_hold(n1);
@@ -555,23 +711,38 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
}
n->dead = 0;
+ if (!exempt_from_gc)
+ list_add_tail(&n->gc_list, &n->tbl->gc_list);
+ if (n->flags & NTF_MANAGED)
+ list_add_tail(&n->managed_list, &n->tbl->managed_list);
if (want_ref)
neigh_hold(n);
- rcu_assign_pointer(n->next,
- rcu_dereference_protected(nht->hash_buckets[hash_val],
- lockdep_is_held(&tbl->lock)));
- rcu_assign_pointer(nht->hash_buckets[hash_val], n);
- write_unlock_bh(&tbl->lock);
+ hlist_add_head_rcu(&n->hash, &nht->hash_heads[hash_val]);
+
+ hlist_add_head_rcu(&n->dev_list,
+ neigh_get_dev_table(dev, tbl->family));
+
+ spin_unlock_bh(&tbl->lock);
neigh_dbg(2, "neigh %p is created\n", n);
rc = n;
out:
return rc;
out_tbl_unlock:
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
out_neigh_release:
+ if (!exempt_from_gc)
+ atomic_dec(&tbl->gc_entries);
neigh_release(n);
goto out;
}
+
+struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
+ struct net_device *dev, bool want_ref)
+{
+ bool exempt_from_gc = !!(dev->flags & IFF_LOOPBACK);
+
+ return ___neigh_create(tbl, pkey, dev, 0, exempt_from_gc, want_ref);
+}
EXPORT_SYMBOL(__neigh_create);
static u32 pneigh_hash(const void *pkey, unsigned int key_len)
@@ -584,142 +755,160 @@ static u32 pneigh_hash(const void *pkey, unsigned int key_len)
return hash_val;
}
-static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n,
- struct net *net,
- const void *pkey,
- unsigned int key_len,
- struct net_device *dev)
+struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl,
+ struct net *net, const void *pkey,
+ struct net_device *dev)
{
+ struct pneigh_entry *n;
+ unsigned int key_len;
+ u32 hash_val;
+
+ key_len = tbl->key_len;
+ hash_val = pneigh_hash(pkey, key_len);
+ n = rcu_dereference_check(tbl->phash_buckets[hash_val],
+ lockdep_is_held(&tbl->phash_lock));
+
while (n) {
if (!memcmp(n->key, pkey, key_len) &&
net_eq(pneigh_net(n), net) &&
(n->dev == dev || !n->dev))
return n;
- n = n->next;
- }
- return NULL;
-}
-struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl,
- struct net *net, const void *pkey, struct net_device *dev)
-{
- unsigned int key_len = tbl->key_len;
- u32 hash_val = pneigh_hash(pkey, key_len);
+ n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->phash_lock));
+ }
- return __pneigh_lookup_1(tbl->phash_buckets[hash_val],
- net, pkey, key_len, dev);
+ return NULL;
}
-EXPORT_SYMBOL_GPL(__pneigh_lookup);
+EXPORT_IPV6_MOD(pneigh_lookup);
-struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl,
- struct net *net, const void *pkey,
- struct net_device *dev, int creat)
+int pneigh_create(struct neigh_table *tbl, struct net *net,
+ const void *pkey, struct net_device *dev,
+ u32 flags, u8 protocol, bool permanent)
{
struct pneigh_entry *n;
- unsigned int key_len = tbl->key_len;
- u32 hash_val = pneigh_hash(pkey, key_len);
-
- read_lock_bh(&tbl->lock);
- n = __pneigh_lookup_1(tbl->phash_buckets[hash_val],
- net, pkey, key_len, dev);
- read_unlock_bh(&tbl->lock);
+ unsigned int key_len;
+ u32 hash_val;
+ int err = 0;
- if (n || !creat)
- goto out;
+ mutex_lock(&tbl->phash_lock);
- ASSERT_RTNL();
+ n = pneigh_lookup(tbl, net, pkey, dev);
+ if (n)
+ goto update;
- n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL);
- if (!n)
+ key_len = tbl->key_len;
+ n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL);
+ if (!n) {
+ err = -ENOBUFS;
goto out;
+ }
write_pnet(&n->net, net);
memcpy(n->key, pkey, key_len);
n->dev = dev;
- if (dev)
- dev_hold(dev);
+ netdev_hold(dev, &n->dev_tracker, GFP_KERNEL);
if (tbl->pconstructor && tbl->pconstructor(n)) {
- if (dev)
- dev_put(dev);
+ netdev_put(dev, &n->dev_tracker);
kfree(n);
- n = NULL;
+ err = -ENOBUFS;
goto out;
}
- write_lock_bh(&tbl->lock);
+ hash_val = pneigh_hash(pkey, key_len);
n->next = tbl->phash_buckets[hash_val];
- tbl->phash_buckets[hash_val] = n;
- write_unlock_bh(&tbl->lock);
+ rcu_assign_pointer(tbl->phash_buckets[hash_val], n);
+update:
+ WRITE_ONCE(n->flags, flags);
+ n->permanent = permanent;
+ WRITE_ONCE(n->protocol, protocol);
out:
- return n;
+ mutex_unlock(&tbl->phash_lock);
+ return err;
}
-EXPORT_SYMBOL(pneigh_lookup);
+static void pneigh_destroy(struct rcu_head *rcu)
+{
+ struct pneigh_entry *n = container_of(rcu, struct pneigh_entry, rcu);
+
+ netdev_put(n->dev, &n->dev_tracker);
+ kfree(n);
+}
int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey,
struct net_device *dev)
{
- struct pneigh_entry *n, **np;
- unsigned int key_len = tbl->key_len;
- u32 hash_val = pneigh_hash(pkey, key_len);
+ struct pneigh_entry *n, __rcu **np;
+ unsigned int key_len;
+ u32 hash_val;
- write_lock_bh(&tbl->lock);
- for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL;
+ key_len = tbl->key_len;
+ hash_val = pneigh_hash(pkey, key_len);
+
+ mutex_lock(&tbl->phash_lock);
+
+ for (np = &tbl->phash_buckets[hash_val];
+ (n = rcu_dereference_protected(*np, 1)) != NULL;
np = &n->next) {
if (!memcmp(n->key, pkey, key_len) && n->dev == dev &&
net_eq(pneigh_net(n), net)) {
- *np = n->next;
- write_unlock_bh(&tbl->lock);
+ rcu_assign_pointer(*np, n->next);
+
+ mutex_unlock(&tbl->phash_lock);
+
if (tbl->pdestructor)
tbl->pdestructor(n);
- if (n->dev)
- dev_put(n->dev);
- kfree(n);
+
+ call_rcu(&n->rcu, pneigh_destroy);
return 0;
}
}
- write_unlock_bh(&tbl->lock);
+
+ mutex_unlock(&tbl->phash_lock);
return -ENOENT;
}
-static int pneigh_ifdown_and_unlock(struct neigh_table *tbl,
- struct net_device *dev)
+static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
+ bool skip_perm)
{
- struct pneigh_entry *n, **np, *freelist = NULL;
+ struct pneigh_entry *n, __rcu **np;
+ LIST_HEAD(head);
u32 h;
+ mutex_lock(&tbl->phash_lock);
+
for (h = 0; h <= PNEIGH_HASHMASK; h++) {
np = &tbl->phash_buckets[h];
- while ((n = *np) != NULL) {
+ while ((n = rcu_dereference_protected(*np, 1)) != NULL) {
+ if (skip_perm && n->permanent)
+ goto skip;
if (!dev || n->dev == dev) {
- *np = n->next;
- n->next = freelist;
- freelist = n;
+ rcu_assign_pointer(*np, n->next);
+ list_add(&n->free_node, &head);
continue;
}
+skip:
np = &n->next;
}
}
- write_unlock_bh(&tbl->lock);
- while ((n = freelist)) {
- freelist = n->next;
- n->next = NULL;
+
+ mutex_unlock(&tbl->phash_lock);
+
+ while (!list_empty(&head)) {
+ n = list_first_entry(&head, typeof(*n), free_node);
+ list_del(&n->free_node);
+
if (tbl->pdestructor)
tbl->pdestructor(n);
- if (n->dev)
- dev_put(n->dev);
- kfree(n);
+
+ call_rcu(&n->rcu, pneigh_destroy);
}
- return -ENOENT;
}
-static void neigh_parms_destroy(struct neigh_parms *parms);
-
static inline void neigh_parms_put(struct neigh_parms *parms)
{
if (refcount_dec_and_test(&parms->refcnt))
- neigh_parms_destroy(parms);
+ kfree(parms);
}
/*
@@ -749,7 +938,7 @@ void neigh_destroy(struct neighbour *neigh)
if (dev->netdev_ops->ndo_neigh_destroy)
dev->netdev_ops->ndo_neigh_destroy(dev, neigh);
- dev_put(dev);
+ netdev_put(dev, &neigh->dev_tracker);
neigh_parms_put(neigh->parms);
neigh_dbg(2, "neigh %p is destroyed\n", neigh);
@@ -768,7 +957,7 @@ static void neigh_suspect(struct neighbour *neigh)
{
neigh_dbg(2, "neigh %p is suspected\n", neigh);
- neigh->output = neigh->ops->output;
+ WRITE_ONCE(neigh->output, neigh->ops->output);
}
/* Neighbour state is OK;
@@ -780,20 +969,20 @@ static void neigh_connect(struct neighbour *neigh)
{
neigh_dbg(2, "neigh %p is connected\n", neigh);
- neigh->output = neigh->ops->connected_output;
+ WRITE_ONCE(neigh->output, neigh->ops->connected_output);
}
static void neigh_periodic_work(struct work_struct *work)
{
struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
+ struct neigh_hash_table *nht;
+ struct hlist_node *tmp;
struct neighbour *n;
- struct neighbour __rcu **np;
unsigned int i;
- struct neigh_hash_table *nht;
NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
@@ -803,55 +992,53 @@ static void neigh_periodic_work(struct work_struct *work)
if (time_after(jiffies, tbl->last_rand + 300 * HZ)) {
struct neigh_parms *p;
- tbl->last_rand = jiffies;
+
+ WRITE_ONCE(tbl->last_rand, jiffies);
list_for_each_entry(p, &tbl->parms_list, list)
- p->reachable_time =
- neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ neigh_set_reach_time(p);
}
- if (atomic_read(&tbl->entries) < tbl->gc_thresh1)
+ if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1))
goto out;
for (i = 0 ; i < (1 << nht->hash_shift); i++) {
- np = &nht->hash_buckets[i];
-
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock))) != NULL) {
+ neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[i]) {
unsigned int state;
write_lock(&n->lock);
state = n->nud_state;
if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) ||
- (n->flags & NTF_EXT_LEARNED)) {
+ (n->flags &
+ (NTF_EXT_LEARNED | NTF_EXT_VALIDATED))) {
write_unlock(&n->lock);
- goto next_elt;
+ continue;
}
- if (time_before(n->used, n->confirmed))
+ if (time_before(n->used, n->confirmed) &&
+ time_is_before_eq_jiffies(n->confirmed))
n->used = n->confirmed;
if (refcount_read(&n->refcnt) == 1 &&
(state == NUD_FAILED ||
- time_after(jiffies, n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
- *np = n->next;
- n->dead = 1;
+ !time_in_range_open(jiffies, n->used,
+ n->used + NEIGH_VAR(n->parms, GC_STALETIME)))) {
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
+ neigh_mark_dead(n);
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
continue;
}
write_unlock(&n->lock);
-
-next_elt:
- np = &n->next;
}
/*
* It's fine to release lock here, even if hash table
* grows while we are preempted.
*/
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
cond_resched();
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
}
@@ -862,7 +1049,7 @@ out:
*/
queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1);
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
}
static __inline__ int neigh_max_probes(struct neighbour *n)
@@ -909,7 +1096,7 @@ static void neigh_probe(struct neighbour *neigh)
if (neigh->ops->solicit)
neigh->ops->solicit(neigh, skb);
atomic_inc(&neigh->probes);
- kfree_skb(skb);
+ consume_skb(skb);
}
/* Called when a timer expires for a neighbour entry. */
@@ -917,7 +1104,7 @@ static void neigh_probe(struct neighbour *neigh)
static void neigh_timer_handler(struct timer_list *t)
{
unsigned long now, next;
- struct neighbour *neigh = from_timer(neigh, t, timer);
+ struct neighbour *neigh = timer_container_of(neigh, t, timer);
unsigned int state;
int notify = 0;
@@ -939,13 +1126,13 @@ static void neigh_timer_handler(struct timer_list *t)
neigh->used +
NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
neigh_dbg(2, "neigh %p is delayed\n", neigh);
- neigh->nud_state = NUD_DELAY;
+ WRITE_ONCE(neigh->nud_state, NUD_DELAY);
neigh->updated = jiffies;
neigh_suspect(neigh);
next = now + NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME);
} else {
neigh_dbg(2, "neigh %p is suspected\n", neigh);
- neigh->nud_state = NUD_STALE;
+ WRITE_ONCE(neigh->nud_state, NUD_STALE);
neigh->updated = jiffies;
neigh_suspect(neigh);
notify = 1;
@@ -955,35 +1142,42 @@ static void neigh_timer_handler(struct timer_list *t)
neigh->confirmed +
NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME))) {
neigh_dbg(2, "neigh %p is now reachable\n", neigh);
- neigh->nud_state = NUD_REACHABLE;
+ WRITE_ONCE(neigh->nud_state, NUD_REACHABLE);
neigh->updated = jiffies;
neigh_connect(neigh);
notify = 1;
next = neigh->confirmed + neigh->parms->reachable_time;
} else {
neigh_dbg(2, "neigh %p is probed\n", neigh);
- neigh->nud_state = NUD_PROBE;
+ WRITE_ONCE(neigh->nud_state, NUD_PROBE);
neigh->updated = jiffies;
atomic_set(&neigh->probes, 0);
notify = 1;
- next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
+ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
+ HZ/100);
}
} else {
/* NUD_PROBE|NUD_INCOMPLETE */
- next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
+ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100);
}
if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) {
- neigh->nud_state = NUD_FAILED;
+ if (neigh->nud_state == NUD_PROBE &&
+ neigh->flags & NTF_EXT_VALIDATED) {
+ WRITE_ONCE(neigh->nud_state, NUD_STALE);
+ neigh->updated = jiffies;
+ } else {
+ WRITE_ONCE(neigh->nud_state, NUD_FAILED);
+ neigh_invalidate(neigh);
+ }
notify = 1;
- neigh_invalidate(neigh);
goto out;
}
if (neigh->nud_state & NUD_IN_TIMER) {
- if (time_before(next, jiffies + HZ/2))
- next = jiffies + HZ/2;
+ if (time_before(next, jiffies + HZ/100))
+ next = jiffies + HZ/100;
if (!mod_timer(&neigh->timer, next))
neigh_hold(neigh);
}
@@ -997,10 +1191,13 @@ out:
if (notify)
neigh_update_notify(neigh, 0);
+ trace_neigh_timer_handler(neigh, 0);
+
neigh_release(neigh);
}
-int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
+int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb,
+ const bool immediate_ok)
{
int rc;
bool immediate_probe = false;
@@ -1020,23 +1217,30 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
atomic_set(&neigh->probes,
NEIGH_VAR(neigh->parms, UCAST_PROBES));
- neigh->nud_state = NUD_INCOMPLETE;
+ neigh_del_timer(neigh);
+ WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE);
neigh->updated = now;
- next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
- HZ/2);
+ if (!immediate_ok) {
+ next = now + 1;
+ } else {
+ immediate_probe = true;
+ next = now + max(NEIGH_VAR(neigh->parms,
+ RETRANS_TIME),
+ HZ / 100);
+ }
neigh_add_timer(neigh, next);
- immediate_probe = true;
} else {
- neigh->nud_state = NUD_FAILED;
+ WRITE_ONCE(neigh->nud_state, NUD_FAILED);
neigh->updated = jiffies;
write_unlock_bh(&neigh->lock);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED);
return 1;
}
} else if (neigh->nud_state & NUD_STALE) {
neigh_dbg(2, "neigh %p is delayed\n", neigh);
- neigh->nud_state = NUD_DELAY;
+ neigh_del_timer(neigh);
+ WRITE_ONCE(neigh->nud_state, NUD_DELAY);
neigh->updated = jiffies;
neigh_add_timer(neigh, jiffies +
NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));
@@ -1052,7 +1256,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
if (!buff)
break;
neigh->arp_queue_len_bytes -= buff->truesize;
- kfree_skb(buff);
+ kfree_skb_reason(buff, SKB_DROP_REASON_NEIGH_QUEUEFULL);
NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
}
skb_dst_force(skb);
@@ -1067,13 +1271,15 @@ out_unlock_bh:
else
write_unlock(&neigh->lock);
local_bh_enable();
+ trace_neigh_event_send_done(neigh, rc);
return rc;
out_dead:
if (neigh->nud_state & NUD_STALE)
goto out_unlock_bh;
write_unlock_bh(&neigh->lock);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_DEAD);
+ trace_neigh_event_send_dead(neigh, 1);
return 1;
}
EXPORT_SYMBOL(__neigh_event_send);
@@ -1089,7 +1295,7 @@ static void neigh_update_hhs(struct neighbour *neigh)
if (update) {
hh = &neigh->hh;
- if (hh->hh_len) {
+ if (READ_ONCE(hh->hh_len)) {
write_seqlock_bh(&hh->hh_lock);
update(hh, neigh->dev, neigh->ha);
write_sequnlock_bh(&hh->hh_lock);
@@ -1097,8 +1303,6 @@ static void neigh_update_hhs(struct neighbour *neigh)
}
}
-
-
/* Generic update routine.
-- lladdr is new lladdr or NULL, if it is not supplied.
-- new is new state.
@@ -1109,23 +1313,28 @@ static void neigh_update_hhs(struct neighbour *neigh)
lladdr instead of overriding it
if it is different.
NEIGH_UPDATE_F_ADMIN means that the change is administrative.
-
+ NEIGH_UPDATE_F_USE means that the entry is user triggered.
+ NEIGH_UPDATE_F_MANAGED means that the entry will be auto-refreshed.
NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
NTF_ROUTER flag.
NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as
a router.
+ NEIGH_UPDATE_F_EXT_VALIDATED means that the entry will not be removed
+ or invalidated.
Caller MUST hold reference count on the entry.
*/
-
-int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
- u32 flags, u32 nlmsg_pid)
+static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
+ u8 new, u32 flags, u32 nlmsg_pid,
+ struct netlink_ext_ack *extack)
{
- u8 old;
- int err;
- int notify = 0;
- struct net_device *dev;
+ bool gc_update = false, managed_update = false;
int update_isrouter = 0;
+ struct net_device *dev;
+ int err, notify = 0;
+ u8 old;
+
+ trace_neigh_update(neigh, lladdr, new, flags, nlmsg_pid);
write_lock_bh(&neigh->lock);
@@ -1133,23 +1342,31 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
old = neigh->nud_state;
err = -EPERM;
+ if (neigh->dead) {
+ NL_SET_ERR_MSG(extack, "Neighbor entry is now dead");
+ new = old;
+ goto out;
+ }
if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
(old & (NUD_NOARP | NUD_PERMANENT)))
goto out;
- if (neigh->dead)
- goto out;
- neigh_update_ext_learned(neigh, flags, &notify);
+ neigh_update_flags(neigh, flags, &notify, &gc_update, &managed_update);
+ if (flags & (NEIGH_UPDATE_F_USE | NEIGH_UPDATE_F_MANAGED)) {
+ new = old & ~NUD_PERMANENT;
+ WRITE_ONCE(neigh->nud_state, new);
+ err = 0;
+ goto out;
+ }
if (!(new & NUD_VALID)) {
neigh_del_timer(neigh);
if (old & NUD_CONNECTED)
neigh_suspect(neigh);
- neigh->nud_state = new;
+ WRITE_ONCE(neigh->nud_state, new);
err = 0;
notify = old & NUD_VALID;
- if (((old & (NUD_INCOMPLETE | NUD_PROBE)) ||
- (flags & NEIGH_UPDATE_F_ADMIN)) &&
+ if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
(new & NUD_FAILED)) {
neigh_invalidate(neigh);
notify = 1;
@@ -1175,8 +1392,10 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
use it, otherwise discard the request.
*/
err = -EINVAL;
- if (!(old & NUD_VALID))
+ if (!(old & NUD_VALID)) {
+ NL_SET_ERR_MSG(extack, "No link layer address given");
goto out;
+ }
lladdr = neigh->ha;
}
@@ -1223,7 +1442,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
((new & NUD_REACHABLE) ?
neigh->parms->reachable_time :
0)));
- neigh->nud_state = new;
+ WRITE_ONCE(neigh->nud_state, new);
notify = 1;
}
@@ -1264,12 +1483,13 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
* we can reinject the packet there.
*/
n2 = NULL;
- if (dst) {
+ if (dst &&
+ READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) {
n2 = dst_neigh_lookup_skb(dst, skb);
if (n2)
n1 = n2;
}
- n1->output(n1, skb);
+ READ_ONCE(n1->output)(n1, skb);
if (n2)
neigh_release(n2);
rcu_read_unlock();
@@ -1280,18 +1500,24 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
neigh->arp_queue_len_bytes = 0;
}
out:
- if (update_isrouter) {
- neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
- (neigh->flags | NTF_ROUTER) :
- (neigh->flags & ~NTF_ROUTER);
- }
+ if (update_isrouter)
+ neigh_update_is_router(neigh, flags, &notify);
write_unlock_bh(&neigh->lock);
-
+ if (((new ^ old) & NUD_PERMANENT) || gc_update)
+ neigh_update_gc_list(neigh);
+ if (managed_update)
+ neigh_update_managed_list(neigh);
if (notify)
neigh_update_notify(neigh, nlmsg_pid);
-
+ trace_neigh_update_done(neigh, err);
return err;
}
+
+int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
+ u32 flags, u32 nlmsg_pid)
+{
+ return __neigh_update(neigh, lladdr, new, flags, nlmsg_pid, NULL);
+}
EXPORT_SYMBOL(neigh_update);
/* Update the neigh to listen temporarily for probe responses, even if it is
@@ -1304,10 +1530,11 @@ void __neigh_set_probe_once(struct neighbour *neigh)
neigh->updated = jiffies;
if (!(neigh->nud_state & NUD_FAILED))
return;
- neigh->nud_state = NUD_INCOMPLETE;
+ WRITE_ONCE(neigh->nud_state, NUD_INCOMPLETE);
atomic_set(&neigh->probes, neigh_max_probes(neigh));
neigh_add_timer(neigh,
- jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME));
+ jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
+ HZ/100));
}
EXPORT_SYMBOL(__neigh_set_probe_once);
@@ -1353,7 +1580,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb)
struct net_device *dev = neigh->dev;
unsigned int seq;
- if (dev->header_ops->cache && !neigh->hh.hh_len)
+ if (dev->header_ops->cache && !READ_ONCE(neigh->hh.hh_len))
neigh_hh_init(neigh);
do {
@@ -1372,7 +1599,7 @@ out:
return rc;
out_kfree_skb:
rc = -EINVAL;
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL);
goto out;
}
EXPORT_SYMBOL(neigh_resolve_output);
@@ -1396,7 +1623,7 @@ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb)
err = dev_queue_xmit(skb);
else {
err = -EINVAL;
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_HH_FILLFAIL);
}
return err;
}
@@ -1408,9 +1635,23 @@ int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb)
}
EXPORT_SYMBOL(neigh_direct_output);
+static void neigh_managed_work(struct work_struct *work)
+{
+ struct neigh_table *tbl = container_of(work, struct neigh_table,
+ managed_work.work);
+ struct neighbour *neigh;
+
+ spin_lock_bh(&tbl->lock);
+ list_for_each_entry(neigh, &tbl->managed_list, managed_list)
+ neigh_event_send_probe(neigh, NULL, false);
+ queue_delayed_work(system_power_efficient_wq, &tbl->managed_work,
+ NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS));
+ spin_unlock_bh(&tbl->lock);
+}
+
static void neigh_proxy_process(struct timer_list *t)
{
- struct neigh_table *tbl = from_timer(tbl, t, proxy_timer);
+ struct neigh_table *tbl = timer_container_of(tbl, t, proxy_timer);
long sched_next = 0;
unsigned long now = jiffies;
struct sk_buff *skb, *n;
@@ -1423,7 +1664,9 @@ static void neigh_proxy_process(struct timer_list *t)
if (tdif <= 0) {
struct net_device *dev = skb->dev;
+ neigh_parms_qlen_dec(dev, tbl->family);
__skb_unlink(skb, &tbl->proxy_queue);
+
if (tbl->proxy_redo && netif_running(dev)) {
rcu_read_lock();
tbl->proxy_redo(skb);
@@ -1436,21 +1679,29 @@ static void neigh_proxy_process(struct timer_list *t)
} else if (!sched_next || tdif < sched_next)
sched_next = tdif;
}
- del_timer(&tbl->proxy_timer);
+ timer_delete(&tbl->proxy_timer);
if (sched_next)
mod_timer(&tbl->proxy_timer, jiffies + sched_next);
spin_unlock(&tbl->proxy_queue.lock);
}
+static unsigned long neigh_proxy_delay(struct neigh_parms *p)
+{
+ /* If proxy_delay is zero, do not call get_random_u32_below()
+ * as it is undefined behavior.
+ */
+ unsigned long proxy_delay = NEIGH_VAR(p, PROXY_DELAY);
+
+ return proxy_delay ?
+ jiffies + get_random_u32_below(proxy_delay) : jiffies;
+}
+
void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
struct sk_buff *skb)
{
- unsigned long now = jiffies;
+ unsigned long sched_next = neigh_proxy_delay(p);
- unsigned long sched_next = now + (prandom_u32() %
- NEIGH_VAR(p, PROXY_DELAY));
-
- if (tbl->proxy_queue.qlen > NEIGH_VAR(p, PROXY_QLEN)) {
+ if (p->qlen > NEIGH_VAR(p, PROXY_QLEN)) {
kfree_skb(skb);
return;
}
@@ -1459,13 +1710,14 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p,
NEIGH_CB(skb)->flags |= LOCALLY_ENQUEUED;
spin_lock(&tbl->proxy_queue.lock);
- if (del_timer(&tbl->proxy_timer)) {
+ if (timer_delete(&tbl->proxy_timer)) {
if (time_before(tbl->proxy_timer.expires, sched_next))
sched_next = tbl->proxy_timer.expires;
}
skb_dst_drop(skb);
dev_hold(skb->dev);
__skb_queue_tail(&tbl->proxy_queue, skb);
+ p->qlen++;
mod_timer(&tbl->proxy_timer, sched_next);
spin_unlock(&tbl->proxy_queue.lock);
}
@@ -1496,22 +1748,22 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
if (p) {
p->tbl = tbl;
refcount_set(&p->refcnt, 1);
- p->reachable_time =
- neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
- dev_hold(dev);
+ neigh_set_reach_time(p);
+ p->qlen = 0;
+ netdev_hold(dev, &p->dev_tracker, GFP_KERNEL);
p->dev = dev;
write_pnet(&p->net, net);
p->sysctl_table = NULL;
if (ops->ndo_neigh_setup && ops->ndo_neigh_setup(dev, p)) {
- dev_put(dev);
+ netdev_put(dev, &p->dev_tracker);
kfree(p);
return NULL;
}
- write_lock_bh(&tbl->lock);
- list_add(&p->list, &tbl->parms.list);
- write_unlock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
+ list_add_rcu(&p->list, &tbl->parms.list);
+ spin_unlock_bh(&tbl->lock);
neigh_parms_data_state_cleanall(p);
}
@@ -1531,24 +1783,20 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms)
{
if (!parms || parms == &tbl->parms)
return;
- write_lock_bh(&tbl->lock);
- list_del(&parms->list);
+
+ spin_lock_bh(&tbl->lock);
+ list_del_rcu(&parms->list);
parms->dead = 1;
- write_unlock_bh(&tbl->lock);
- if (parms->dev)
- dev_put(parms->dev);
+ spin_unlock_bh(&tbl->lock);
+
+ netdev_put(parms->dev, &parms->dev_tracker);
call_rcu(&parms->rcu_head, neigh_rcu_free_parms);
}
EXPORT_SYMBOL(neigh_parms_release);
-static void neigh_parms_destroy(struct neigh_parms *parms)
-{
- kfree(parms);
-}
-
static struct lock_class_key neigh_table_proxy_queue_class;
-static struct neigh_table *neigh_tables[NEIGH_NR_TABLES] __read_mostly;
+static struct neigh_table __rcu *neigh_tables[NEIGH_NR_TABLES] __read_mostly;
void neigh_table_init(int index, struct neigh_table *tbl)
{
@@ -1556,11 +1804,14 @@ void neigh_table_init(int index, struct neigh_table *tbl)
unsigned long phsize;
INIT_LIST_HEAD(&tbl->parms_list);
+ INIT_LIST_HEAD(&tbl->gc_list);
+ INIT_LIST_HEAD(&tbl->managed_list);
+
list_add(&tbl->parms.list, &tbl->parms_list);
write_pnet(&tbl->parms.net, &init_net);
refcount_set(&tbl->parms.refcnt, 1);
- tbl->parms.reachable_time =
- neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME));
+ neigh_set_reach_time(&tbl->parms);
+ tbl->parms.qlen = 0;
tbl->stats = alloc_percpu(struct neigh_statistics);
if (!tbl->stats)
@@ -1586,10 +1837,15 @@ void neigh_table_init(int index, struct neigh_table *tbl)
else
WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN);
- rwlock_init(&tbl->lock);
+ spin_lock_init(&tbl->lock);
+ mutex_init(&tbl->phash_lock);
+
INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work);
queue_delayed_work(system_power_efficient_wq, &tbl->gc_work,
tbl->parms.reachable_time);
+ INIT_DEFERRABLE_WORK(&tbl->managed_work, neigh_managed_work);
+ queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, 0);
+
timer_setup(&tbl->proxy_timer, neigh_proxy_process, 0);
skb_queue_head_init_class(&tbl->proxy_queue,
&neigh_table_proxy_queue_class);
@@ -1597,17 +1853,24 @@ void neigh_table_init(int index, struct neigh_table *tbl)
tbl->last_flush = now;
tbl->last_rand = now + tbl->parms.reachable_time * 20;
- neigh_tables[index] = tbl;
+ rcu_assign_pointer(neigh_tables[index], tbl);
}
EXPORT_SYMBOL(neigh_table_init);
+/*
+ * Only called from ndisc_cleanup(), which means this is dead code
+ * because the IPv6 module can no longer be unloaded.
+ */
int neigh_table_clear(int index, struct neigh_table *tbl)
{
- neigh_tables[index] = NULL;
+ RCU_INIT_POINTER(neigh_tables[index], NULL);
+ synchronize_rcu();
+
/* It is not clean... Fix it to unload IPv6 module safely */
+ cancel_delayed_work_sync(&tbl->managed_work);
cancel_delayed_work_sync(&tbl->gc_work);
- del_timer_sync(&tbl->proxy_timer);
- pneigh_queue_purge(&tbl->proxy_queue);
+ timer_delete_sync(&tbl->proxy_timer);
+ pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family);
neigh_ifdown(tbl, NULL);
if (atomic_read(&tbl->entries))
pr_crit("neighbour leakage\n");
@@ -1634,19 +1897,33 @@ static struct neigh_table *neigh_find_table(int family)
switch (family) {
case AF_INET:
- tbl = neigh_tables[NEIGH_ARP_TABLE];
+ tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ARP_TABLE]);
break;
case AF_INET6:
- tbl = neigh_tables[NEIGH_ND_TABLE];
- break;
- case AF_DECnet:
- tbl = neigh_tables[NEIGH_DN_TABLE];
+ tbl = rcu_dereference_rtnl(neigh_tables[NEIGH_ND_TABLE]);
break;
}
return tbl;
}
+const struct nla_policy nda_policy[NDA_MAX+1] = {
+ [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID },
+ [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+ [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
+ [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) },
+ [NDA_PROBES] = { .type = NLA_U32 },
+ [NDA_VLAN] = { .type = NLA_U16 },
+ [NDA_PORT] = { .type = NLA_U16 },
+ [NDA_VNI] = { .type = NLA_U32 },
+ [NDA_IFINDEX] = { .type = NLA_U32 },
+ [NDA_MASTER] = { .type = NLA_U32 },
+ [NDA_PROTOCOL] = { .type = NLA_U8 },
+ [NDA_NH_ID] = { .type = NLA_U32 },
+ [NDA_FLAGS_EXT] = NLA_POLICY_MASK(NLA_U32, NTF_EXT_MASK),
+ [NDA_FDB_EXT_ATTRS] = { .type = NLA_NESTED },
+};
+
static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -1663,8 +1940,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
goto out;
dst_attr = nlmsg_find_attr(nlh, sizeof(*ndm), NDA_DST);
- if (dst_attr == NULL)
+ if (!dst_attr) {
+ NL_SET_ERR_MSG(extack, "Network address not specified");
goto out;
+ }
ndm = nlmsg_data(nlh);
if (ndm->ndm_ifindex) {
@@ -1679,8 +1958,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tbl == NULL)
return -EAFNOSUPPORT;
- if (nla_len(dst_attr) < (int)tbl->key_len)
+ if (nla_len(dst_attr) < (int)tbl->key_len) {
+ NL_SET_ERR_MSG(extack, "Invalid network address");
goto out;
+ }
if (ndm->ndm_flags & NTF_PROXY) {
err = pneigh_delete(tbl, net, nla_data(dst_attr), dev);
@@ -1696,14 +1977,13 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
goto out;
}
- err = neigh_update(neigh, NULL, NUD_FAILED,
- NEIGH_UPDATE_F_OVERRIDE |
- NEIGH_UPDATE_F_ADMIN,
- NETLINK_CB(skb).portid);
- write_lock_bh(&tbl->lock);
+ err = __neigh_update(neigh, NULL, NUD_FAILED,
+ NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN,
+ NETLINK_CB(skb).portid, extack);
+ spin_lock_bh(&tbl->lock);
neigh_release(neigh);
- neigh_remove_one(neigh, tbl);
- write_unlock_bh(&tbl->lock);
+ neigh_remove_one(neigh);
+ spin_unlock_bh(&tbl->lock);
out:
return err;
@@ -1712,7 +1992,8 @@ out:
static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
- int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE;
+ int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE |
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
struct net *net = sock_net(skb->sk);
struct ndmsg *ndm;
struct nlattr *tb[NDA_MAX+1];
@@ -1720,18 +2001,32 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net_device *dev = NULL;
struct neighbour *neigh;
void *dst, *lladdr;
+ u8 protocol = 0;
+ u32 ndm_flags;
int err;
ASSERT_RTNL();
- err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX,
+ nda_policy, extack);
if (err < 0)
goto out;
err = -EINVAL;
- if (tb[NDA_DST] == NULL)
+ if (!tb[NDA_DST]) {
+ NL_SET_ERR_MSG(extack, "Network address not specified");
goto out;
+ }
ndm = nlmsg_data(nlh);
+ ndm_flags = ndm->ndm_flags;
+ if (tb[NDA_FLAGS_EXT]) {
+ u32 ext = nla_get_u32(tb[NDA_FLAGS_EXT]);
+
+ BUILD_BUG_ON(sizeof(neigh->flags) * BITS_PER_BYTE <
+ (sizeof(ndm->ndm_flags) * BITS_PER_BYTE +
+ hweight32(NTF_EXT_MASK)));
+ ndm_flags |= (ext << NTF_EXT_SHIFT);
+ }
if (ndm->ndm_ifindex) {
dev = __dev_get_by_index(net, ndm->ndm_ifindex);
if (dev == NULL) {
@@ -1739,42 +2034,85 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
goto out;
}
- if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len)
+ if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) {
+ NL_SET_ERR_MSG(extack, "Invalid link address");
goto out;
+ }
}
tbl = neigh_find_table(ndm->ndm_family);
if (tbl == NULL)
return -EAFNOSUPPORT;
- if (nla_len(tb[NDA_DST]) < (int)tbl->key_len)
+ if (nla_len(tb[NDA_DST]) < (int)tbl->key_len) {
+ NL_SET_ERR_MSG(extack, "Invalid network address");
goto out;
+ }
+
dst = nla_data(tb[NDA_DST]);
lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL;
- if (ndm->ndm_flags & NTF_PROXY) {
- struct pneigh_entry *pn;
-
- err = -ENOBUFS;
- pn = pneigh_lookup(tbl, net, dst, dev, 1);
- if (pn) {
- pn->flags = ndm->ndm_flags;
- err = 0;
+ if (tb[NDA_PROTOCOL])
+ protocol = nla_get_u8(tb[NDA_PROTOCOL]);
+ if (ndm_flags & NTF_PROXY) {
+ if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) {
+ NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination");
+ goto out;
}
+
+ err = pneigh_create(tbl, net, dst, dev, ndm_flags, protocol,
+ !!(ndm->ndm_state & NUD_PERMANENT));
goto out;
}
- if (dev == NULL)
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Device not specified");
+ goto out;
+ }
+
+ if (tbl->allow_add && !tbl->allow_add(dev, extack)) {
+ err = -EINVAL;
goto out;
+ }
neigh = neigh_lookup(tbl, dst, dev);
if (neigh == NULL) {
+ bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT;
+ bool exempt_from_gc = ndm_permanent ||
+ ndm_flags & (NTF_EXT_LEARNED |
+ NTF_EXT_VALIDATED);
+
if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
err = -ENOENT;
goto out;
}
+ if (ndm_permanent && (ndm_flags & NTF_MANAGED)) {
+ NL_SET_ERR_MSG(extack, "Invalid NTF_* flag for permanent entry");
+ err = -EINVAL;
+ goto out;
+ }
+ if (ndm_flags & NTF_EXT_VALIDATED) {
+ u8 state = ndm->ndm_state;
+
+ /* NTF_USE and NTF_MANAGED will result in the neighbor
+ * being created with an invalid state (NUD_NONE).
+ */
+ if (ndm_flags & (NTF_USE | NTF_MANAGED))
+ state = NUD_NONE;
+
+ if (!(state & NUD_VALID)) {
+ NL_SET_ERR_MSG(extack,
+ "Cannot create externally validated neighbor with an invalid state");
+ err = -EINVAL;
+ goto out;
+ }
+ }
- neigh = __neigh_lookup_errno(tbl, dst, dev);
+ neigh = ___neigh_create(tbl, dst, dev,
+ ndm_flags &
+ (NTF_EXT_LEARNED | NTF_MANAGED |
+ NTF_EXT_VALIDATED),
+ exempt_from_gc, true);
if (IS_ERR(neigh)) {
err = PTR_ERR(neigh);
goto out;
@@ -1785,22 +2123,48 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh,
neigh_release(neigh);
goto out;
}
+ if (ndm_flags & NTF_EXT_VALIDATED) {
+ u8 state = ndm->ndm_state;
+
+ /* NTF_USE and NTF_MANAGED do not update the existing
+ * state other than clearing it if it was
+ * NUD_PERMANENT.
+ */
+ if (ndm_flags & (NTF_USE | NTF_MANAGED))
+ state = READ_ONCE(neigh->nud_state) & ~NUD_PERMANENT;
+
+ if (!(state & NUD_VALID)) {
+ NL_SET_ERR_MSG(extack,
+ "Cannot mark neighbor as externally validated with an invalid state");
+ err = -EINVAL;
+ neigh_release(neigh);
+ goto out;
+ }
+ }
if (!(nlh->nlmsg_flags & NLM_F_REPLACE))
- flags &= ~NEIGH_UPDATE_F_OVERRIDE;
+ flags &= ~(NEIGH_UPDATE_F_OVERRIDE |
+ NEIGH_UPDATE_F_OVERRIDE_ISROUTER);
}
- if (ndm->ndm_flags & NTF_EXT_LEARNED)
+ if (protocol)
+ neigh->protocol = protocol;
+ if (ndm_flags & NTF_EXT_LEARNED)
flags |= NEIGH_UPDATE_F_EXT_LEARNED;
-
- if (ndm->ndm_flags & NTF_USE) {
+ if (ndm_flags & NTF_ROUTER)
+ flags |= NEIGH_UPDATE_F_ISROUTER;
+ if (ndm_flags & NTF_MANAGED)
+ flags |= NEIGH_UPDATE_F_MANAGED;
+ if (ndm_flags & NTF_USE)
+ flags |= NEIGH_UPDATE_F_USE;
+ if (ndm_flags & NTF_EXT_VALIDATED)
+ flags |= NEIGH_UPDATE_F_EXT_VALIDATED;
+
+ err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags,
+ NETLINK_CB(skb).portid, extack);
+ if (!err && ndm_flags & (NTF_USE | NTF_MANAGED))
neigh_event_send(neigh, NULL);
- err = 0;
- } else
- err = neigh_update(neigh, lladdr, ndm->ndm_state, flags,
- NETLINK_CB(skb).portid);
neigh_release(neigh);
-
out:
return err;
}
@@ -1809,12 +2173,12 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
{
struct nlattr *nest;
- nest = nla_nest_start(skb, NDTA_PARMS);
+ nest = nla_nest_start_noflag(skb, NDTA_PARMS);
if (nest == NULL)
return -ENOBUFS;
if ((parms->dev &&
- nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) ||
+ nla_put_u32(skb, NDTPA_IFINDEX, READ_ONCE(parms->dev->ifindex))) ||
nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) ||
nla_put_u32(skb, NDTPA_QUEUE_LENBYTES,
NEIGH_VAR(parms, QUEUE_LEN_BYTES)) ||
@@ -1829,7 +2193,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
NEIGH_VAR(parms, MCAST_PROBES)) ||
nla_put_u32(skb, NDTPA_MCAST_REPROBES,
NEIGH_VAR(parms, MCAST_REPROBES)) ||
- nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time,
+ nla_put_msecs(skb, NDTPA_REACHABLE_TIME, READ_ONCE(parms->reachable_time),
NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME,
NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) ||
@@ -1844,7 +2208,9 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
nla_put_msecs(skb, NDTPA_PROXY_DELAY,
NEIGH_VAR(parms, PROXY_DELAY), NDTPA_PAD) ||
nla_put_msecs(skb, NDTPA_LOCKTIME,
- NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD))
+ NEIGH_VAR(parms, LOCKTIME), NDTPA_PAD) ||
+ nla_put_msecs(skb, NDTPA_INTERVAL_PROBE_TIME_MS,
+ NEIGH_VAR(parms, INTERVAL_PROBE_TIME_MS), NDTPA_PAD))
goto nla_put_failure;
return nla_nest_end(skb, nest);
@@ -1864,22 +2230,21 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
return -EMSGSIZE;
ndtmsg = nlmsg_data(nlh);
-
- read_lock_bh(&tbl->lock);
ndtmsg->ndtm_family = tbl->family;
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
if (nla_put_string(skb, NDTA_NAME, tbl->id) ||
- nla_put_msecs(skb, NDTA_GC_INTERVAL, tbl->gc_interval, NDTA_PAD) ||
- nla_put_u32(skb, NDTA_THRESH1, tbl->gc_thresh1) ||
- nla_put_u32(skb, NDTA_THRESH2, tbl->gc_thresh2) ||
- nla_put_u32(skb, NDTA_THRESH3, tbl->gc_thresh3))
+ nla_put_msecs(skb, NDTA_GC_INTERVAL, READ_ONCE(tbl->gc_interval),
+ NDTA_PAD) ||
+ nla_put_u32(skb, NDTA_THRESH1, READ_ONCE(tbl->gc_thresh1)) ||
+ nla_put_u32(skb, NDTA_THRESH2, READ_ONCE(tbl->gc_thresh2)) ||
+ nla_put_u32(skb, NDTA_THRESH3, READ_ONCE(tbl->gc_thresh3)))
goto nla_put_failure;
{
unsigned long now = jiffies;
- unsigned int flush_delta = now - tbl->last_flush;
- unsigned int rand_delta = now - tbl->last_rand;
+ long flush_delta = now - READ_ONCE(tbl->last_flush);
+ long rand_delta = now - READ_ONCE(tbl->last_rand);
struct neigh_hash_table *nht;
struct ndt_config ndc = {
.ndtc_key_len = tbl->key_len,
@@ -1887,14 +2252,12 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
.ndtc_entries = atomic_read(&tbl->entries),
.ndtc_last_flush = jiffies_to_msecs(flush_delta),
.ndtc_last_rand = jiffies_to_msecs(rand_delta),
- .ndtc_proxy_qlen = tbl->proxy_queue.qlen,
+ .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen),
};
- rcu_read_lock_bh();
- nht = rcu_dereference_bh(tbl->nht);
+ nht = rcu_dereference(tbl->nht);
ndc.ndtc_hash_rnd = nht->hash_rnd[0];
ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1);
- rcu_read_unlock_bh();
if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc))
goto nla_put_failure;
@@ -1910,17 +2273,17 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
struct neigh_statistics *st;
st = per_cpu_ptr(tbl->stats, cpu);
- ndst.ndts_allocs += st->allocs;
- ndst.ndts_destroys += st->destroys;
- ndst.ndts_hash_grows += st->hash_grows;
- ndst.ndts_res_failed += st->res_failed;
- ndst.ndts_lookups += st->lookups;
- ndst.ndts_hits += st->hits;
- ndst.ndts_rcv_probes_mcast += st->rcv_probes_mcast;
- ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast;
- ndst.ndts_periodic_gc_runs += st->periodic_gc_runs;
- ndst.ndts_forced_gc_runs += st->forced_gc_runs;
- ndst.ndts_table_fulls += st->table_fulls;
+ ndst.ndts_allocs += READ_ONCE(st->allocs);
+ ndst.ndts_destroys += READ_ONCE(st->destroys);
+ ndst.ndts_hash_grows += READ_ONCE(st->hash_grows);
+ ndst.ndts_res_failed += READ_ONCE(st->res_failed);
+ ndst.ndts_lookups += READ_ONCE(st->lookups);
+ ndst.ndts_hits += READ_ONCE(st->hits);
+ ndst.ndts_rcv_probes_mcast += READ_ONCE(st->rcv_probes_mcast);
+ ndst.ndts_rcv_probes_ucast += READ_ONCE(st->rcv_probes_ucast);
+ ndst.ndts_periodic_gc_runs += READ_ONCE(st->periodic_gc_runs);
+ ndst.ndts_forced_gc_runs += READ_ONCE(st->forced_gc_runs);
+ ndst.ndts_table_fulls += READ_ONCE(st->table_fulls);
}
if (nla_put_64bit(skb, NDTA_STATS, sizeof(ndst), &ndst,
@@ -1932,12 +2295,10 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
if (neightbl_fill_parms(skb, &tbl->parms) < 0)
goto nla_put_failure;
- read_unlock_bh(&tbl->lock);
nlmsg_end(skb, nlh);
return 0;
nla_put_failure:
- read_unlock_bh(&tbl->lock);
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
@@ -1956,8 +2317,6 @@ static int neightbl_fill_param_info(struct sk_buff *skb,
return -EMSGSIZE;
ndtmsg = nlmsg_data(nlh);
-
- read_lock_bh(&tbl->lock);
ndtmsg->ndtm_family = tbl->family;
ndtmsg->ndtm_pad1 = 0;
ndtmsg->ndtm_pad2 = 0;
@@ -1966,11 +2325,9 @@ static int neightbl_fill_param_info(struct sk_buff *skb,
neightbl_fill_parms(skb, parms) < 0)
goto errout;
- read_unlock_bh(&tbl->lock);
nlmsg_end(skb, nlh);
return 0;
errout:
- read_unlock_bh(&tbl->lock);
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
}
@@ -1987,6 +2344,7 @@ static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = {
static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
[NDTPA_IFINDEX] = { .type = NLA_U32 },
[NDTPA_QUEUE_LEN] = { .type = NLA_U32 },
+ [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 },
[NDTPA_PROXY_QLEN] = { .type = NLA_U32 },
[NDTPA_APP_PROBES] = { .type = NLA_U32 },
[NDTPA_UCAST_PROBES] = { .type = NLA_U32 },
@@ -1999,20 +2357,21 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
[NDTPA_ANYCAST_DELAY] = { .type = NLA_U64 },
[NDTPA_PROXY_DELAY] = { .type = NLA_U64 },
[NDTPA_LOCKTIME] = { .type = NLA_U64 },
+ [NDTPA_INTERVAL_PROBE_TIME_MS] = { .type = NLA_U64, .min = 1 },
};
static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[NDTA_MAX + 1];
struct neigh_table *tbl;
struct ndtmsg *ndtmsg;
- struct nlattr *tb[NDTA_MAX+1];
bool found = false;
int err, tidx;
- err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
- nl_neightbl_policy, extack);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
+ nl_neightbl_policy, extack);
if (err < 0)
goto errout;
@@ -2023,34 +2382,42 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
ndtmsg = nlmsg_data(nlh);
+ rcu_read_lock();
+
for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
- tbl = neigh_tables[tidx];
+ tbl = rcu_dereference(neigh_tables[tidx]);
if (!tbl)
continue;
+
if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family)
continue;
+
if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) {
found = true;
break;
}
}
- if (!found)
- return -ENOENT;
+ if (!found) {
+ rcu_read_unlock();
+ err = -ENOENT;
+ goto errout;
+ }
/*
* We acquire tbl->lock to be nice to the periodic timers and
* make sure they always see a consistent set of values.
*/
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
if (tb[NDTA_PARMS]) {
struct nlattr *tbp[NDTPA_MAX+1];
struct neigh_parms *p;
int i, ifindex = 0;
- err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS],
- nl_ntbl_parm_policy, extack);
+ err = nla_parse_nested_deprecated(tbp, NDTPA_MAX,
+ tb[NDTA_PARMS],
+ nl_ntbl_parm_policy, extack);
if (err < 0)
goto errout_tbl_lock;
@@ -2104,8 +2471,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
* only be effective after the next time neigh_periodic_work
* decides to recompute it (can be multiple minutes)
*/
- p->reachable_time =
- neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ neigh_set_reach_time(p);
break;
case NDTPA_GC_STALETIME:
NEIGH_VAR_SET(p, GC_STALETIME,
@@ -2116,6 +2482,10 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
nla_get_msecs(tbp[i]));
call_netevent_notifiers(NETEVENT_DELAY_PROBE_TIME_UPDATE, p);
break;
+ case NDTPA_INTERVAL_PROBE_TIME_MS:
+ NEIGH_VAR_SET(p, INTERVAL_PROBE_TIME_MS,
+ nla_get_msecs(tbp[i]));
+ break;
case NDTPA_RETRANS_TIME:
NEIGH_VAR_SET(p, RETRANS_TIME,
nla_get_msecs(tbp[i]));
@@ -2143,39 +2513,74 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout_tbl_lock;
if (tb[NDTA_THRESH1])
- tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]);
+ WRITE_ONCE(tbl->gc_thresh1, nla_get_u32(tb[NDTA_THRESH1]));
if (tb[NDTA_THRESH2])
- tbl->gc_thresh2 = nla_get_u32(tb[NDTA_THRESH2]);
+ WRITE_ONCE(tbl->gc_thresh2, nla_get_u32(tb[NDTA_THRESH2]));
if (tb[NDTA_THRESH3])
- tbl->gc_thresh3 = nla_get_u32(tb[NDTA_THRESH3]);
+ WRITE_ONCE(tbl->gc_thresh3, nla_get_u32(tb[NDTA_THRESH3]));
if (tb[NDTA_GC_INTERVAL])
- tbl->gc_interval = nla_get_msecs(tb[NDTA_GC_INTERVAL]);
+ WRITE_ONCE(tbl->gc_interval, nla_get_msecs(tb[NDTA_GC_INTERVAL]));
err = 0;
errout_tbl_lock:
- write_unlock_bh(&tbl->lock);
+ spin_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
errout:
return err;
}
+static int neightbl_valid_dump_info(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct ndtmsg *ndtm;
+
+ ndtm = nlmsg_payload(nlh, sizeof(*ndtm));
+ if (!ndtm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for neighbor table dump request");
+ return -EINVAL;
+ }
+
+ if (ndtm->ndtm_pad1 || ndtm->ndtm_pad2) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor table dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ndtm))) {
+ NL_SET_ERR_MSG(extack, "Invalid data after header in neighbor table dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
int family, tidx, nidx = 0;
int tbl_skip = cb->args[0];
int neigh_skip = cb->args[1];
struct neigh_table *tbl;
- family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
+ if (cb->strict_check) {
+ int err = neightbl_valid_dump_info(nlh, cb->extack);
+
+ if (err < 0)
+ return err;
+ }
+
+ family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
+
+ rcu_read_lock();
for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) {
struct neigh_parms *p;
- tbl = neigh_tables[tidx];
+ tbl = rcu_dereference(neigh_tables[tidx]);
if (!tbl)
continue;
@@ -2183,13 +2588,13 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
continue;
if (neightbl_fill_info(skb, tbl, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
+ nlh->nlmsg_seq, RTM_NEWNEIGHTBL,
NLM_F_MULTI) < 0)
break;
nidx = 0;
p = list_next_entry(&tbl->parms, list);
- list_for_each_entry_from(p, &tbl->parms_list, list) {
+ list_for_each_entry_from_rcu(p, &tbl->parms_list, list) {
if (!net_eq(neigh_parms_net(p), net))
continue;
@@ -2198,7 +2603,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
if (neightbl_fill_param_info(skb, tbl, p,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
+ nlh->nlmsg_seq,
RTM_NEWNEIGHTBL,
NLM_F_MULTI) < 0)
goto out;
@@ -2209,6 +2614,8 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
neigh_skip = 0;
}
out:
+ rcu_read_unlock();
+
cb->args[0] = tidx;
cb->args[1] = nidx;
@@ -2218,6 +2625,7 @@ out:
static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
u32 pid, u32 seq, int type, unsigned int flags)
{
+ u32 neigh_flags, neigh_flags_ext;
unsigned long now = jiffies;
struct nda_cacheinfo ci;
struct nlmsghdr *nlh;
@@ -2227,11 +2635,14 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
if (nlh == NULL)
return -EMSGSIZE;
+ neigh_flags_ext = neigh->flags >> NTF_EXT_SHIFT;
+ neigh_flags = neigh->flags & NTF_OLD_MASK;
+
ndm = nlmsg_data(nlh);
ndm->ndm_family = neigh->ops->family;
ndm->ndm_pad1 = 0;
ndm->ndm_pad2 = 0;
- ndm->ndm_flags = neigh->flags;
+ ndm->ndm_flags = neigh_flags;
ndm->ndm_type = neigh->type;
ndm->ndm_ifindex = neigh->dev->ifindex;
@@ -2260,6 +2671,11 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
goto nla_put_failure;
+ if (neigh->protocol && nla_put_u8(skb, NDA_PROTOCOL, neigh->protocol))
+ goto nla_put_failure;
+ if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -2272,18 +2688,24 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
u32 pid, u32 seq, int type, unsigned int flags,
struct neigh_table *tbl)
{
+ u32 neigh_flags, neigh_flags_ext;
struct nlmsghdr *nlh;
struct ndmsg *ndm;
+ u8 protocol;
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), flags);
if (nlh == NULL)
return -EMSGSIZE;
+ neigh_flags = READ_ONCE(pn->flags);
+ neigh_flags_ext = neigh_flags >> NTF_EXT_SHIFT;
+ neigh_flags &= NTF_OLD_MASK;
+
ndm = nlmsg_data(nlh);
ndm->ndm_family = tbl->family;
ndm->ndm_pad1 = 0;
ndm->ndm_pad2 = 0;
- ndm->ndm_flags = pn->flags | NTF_PROXY;
+ ndm->ndm_flags = neigh_flags | NTF_PROXY;
ndm->ndm_type = RTN_UNICAST;
ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0;
ndm->ndm_state = NUD_NONE;
@@ -2291,6 +2713,12 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
if (nla_put(skb, NDA_DST, tbl->key_len, pn->key))
goto nla_put_failure;
+ protocol = READ_ONCE(pn->protocol);
+ if (protocol && nla_put_u8(skb, NDA_PROTOCOL, protocol))
+ goto nla_put_failure;
+ if (neigh_flags_ext && nla_put_u32(skb, NDA_FLAGS_EXT, neigh_flags_ext))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -2312,7 +2740,14 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx)
if (!master_idx)
return false;
- master = netdev_master_upper_dev_get(dev);
+ master = dev ? netdev_master_upper_dev_get_rcu(dev) : NULL;
+
+ /* 0 already denotes that NDA_MASTER wasn't passed, so another invalid
+ * ifindex value (-1) is needed to denote "no master".
+ */
+ if (master_idx == -1)
+ return !!master;
+
if (!master || master->ifindex != master_idx)
return true;
@@ -2321,133 +2756,187 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx)
static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx)
{
- if (filter_idx && dev->ifindex != filter_idx)
+ if (filter_idx && (!dev || dev->ifindex != filter_idx))
return true;
return false;
}
+struct neigh_dump_filter {
+ int master_idx;
+ int dev_idx;
+};
+
static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
- struct netlink_callback *cb)
+ struct netlink_callback *cb,
+ struct neigh_dump_filter *filter)
{
struct net *net = sock_net(skb->sk);
- const struct nlmsghdr *nlh = cb->nlh;
- struct nlattr *tb[NDA_MAX + 1];
struct neighbour *n;
- int rc, h, s_h = cb->args[1];
+ int err = 0, h, s_h = cb->args[1];
int idx, s_idx = idx = cb->args[2];
struct neigh_hash_table *nht;
- int filter_master_idx = 0, filter_idx = 0;
unsigned int flags = NLM_F_MULTI;
- int err;
- err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, NULL);
- if (!err) {
- if (tb[NDA_IFINDEX]) {
- if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
- return -EINVAL;
- filter_idx = nla_get_u32(tb[NDA_IFINDEX]);
- }
- if (tb[NDA_MASTER]) {
- if (nla_len(tb[NDA_MASTER]) != sizeof(u32))
- return -EINVAL;
- filter_master_idx = nla_get_u32(tb[NDA_MASTER]);
- }
- if (filter_idx || filter_master_idx)
- flags |= NLM_F_DUMP_FILTERED;
- }
+ if (filter->dev_idx || filter->master_idx)
+ flags |= NLM_F_DUMP_FILTERED;
- rcu_read_lock_bh();
- nht = rcu_dereference_bh(tbl->nht);
+ nht = rcu_dereference(tbl->nht);
for (h = s_h; h < (1 << nht->hash_shift); h++) {
if (h > s_h)
s_idx = 0;
- for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0;
- n != NULL;
- n = rcu_dereference_bh(n->next)) {
+ idx = 0;
+ neigh_for_each_in_bucket_rcu(n, &nht->hash_heads[h]) {
if (idx < s_idx || !net_eq(dev_net(n->dev), net))
goto next;
- if (neigh_ifindex_filtered(n->dev, filter_idx) ||
- neigh_master_filtered(n->dev, filter_master_idx))
+ if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
+ neigh_master_filtered(n->dev, filter->master_idx))
goto next;
- if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNEIGH,
- flags) < 0) {
- rc = -1;
+ err = neigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH, flags);
+ if (err < 0)
goto out;
- }
next:
idx++;
}
}
- rc = skb->len;
out:
- rcu_read_unlock_bh();
cb->args[1] = h;
cb->args[2] = idx;
- return rc;
+ return err;
}
static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
- struct netlink_callback *cb)
+ struct netlink_callback *cb,
+ struct neigh_dump_filter *filter)
{
struct pneigh_entry *n;
struct net *net = sock_net(skb->sk);
- int rc, h, s_h = cb->args[3];
+ int err = 0, h, s_h = cb->args[3];
int idx, s_idx = idx = cb->args[4];
+ unsigned int flags = NLM_F_MULTI;
- read_lock_bh(&tbl->lock);
+ if (filter->dev_idx || filter->master_idx)
+ flags |= NLM_F_DUMP_FILTERED;
for (h = s_h; h <= PNEIGH_HASHMASK; h++) {
if (h > s_h)
s_idx = 0;
- for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
+ for (n = rcu_dereference(tbl->phash_buckets[h]), idx = 0;
+ n;
+ n = rcu_dereference(n->next)) {
if (idx < s_idx || pneigh_net(n) != net)
goto next;
- if (pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNEIGH,
- NLM_F_MULTI, tbl) < 0) {
- read_unlock_bh(&tbl->lock);
- rc = -1;
+ if (neigh_ifindex_filtered(n->dev, filter->dev_idx) ||
+ neigh_master_filtered(n->dev, filter->master_idx))
+ goto next;
+ err = pneigh_fill_info(skb, n, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH, flags, tbl);
+ if (err < 0)
goto out;
- }
next:
idx++;
}
}
- read_unlock_bh(&tbl->lock);
- rc = skb->len;
out:
cb->args[3] = h;
cb->args[4] = idx;
- return rc;
+ return err;
+}
+
+static int neigh_valid_dump_req(const struct nlmsghdr *nlh,
+ bool strict_check,
+ struct neigh_dump_filter *filter,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[NDA_MAX + 1];
+ int err, i;
+
+ if (strict_check) {
+ struct ndmsg *ndm;
+
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for neighbor dump request");
+ return -EINVAL;
+ }
+
+ if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_ifindex ||
+ ndm->ndm_state || ndm->ndm_type) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor dump request");
+ return -EINVAL;
+ }
+ if (ndm->ndm_flags & ~NTF_PROXY) {
+ NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor dump request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg),
+ tb, NDA_MAX, nda_policy,
+ extack);
+ } else {
+ err = nlmsg_parse_deprecated(nlh, sizeof(struct ndmsg), tb,
+ NDA_MAX, nda_policy, extack);
+ }
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= NDA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ /* all new attributes should require strict_check */
+ switch (i) {
+ case NDA_IFINDEX:
+ filter->dev_idx = nla_get_u32(tb[i]);
+ break;
+ case NDA_MASTER:
+ filter->master_idx = nla_get_u32(tb[i]);
+ break;
+ default:
+ if (strict_check) {
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor dump request");
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
}
static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct neigh_dump_filter filter = {};
struct neigh_table *tbl;
int t, family, s_t;
int proxy = 0;
int err;
- family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
+ family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
/* check for full ndmsg structure presence, family member is
* the same for both structures
*/
- if (nlmsg_len(cb->nlh) >= sizeof(struct ndmsg) &&
- ((struct ndmsg *) nlmsg_data(cb->nlh))->ndm_flags == NTF_PROXY)
+ if (nlmsg_len(nlh) >= sizeof(struct ndmsg) &&
+ ((struct ndmsg *)nlmsg_data(nlh))->ndm_flags == NTF_PROXY)
proxy = 1;
+ err = neigh_valid_dump_req(nlh, cb->strict_check, &filter, cb->extack);
+ if (err < 0 && cb->strict_check)
+ return err;
+ err = 0;
+
s_t = cb->args[0];
+ rcu_read_lock();
for (t = 0; t < NEIGH_NR_TABLES; t++) {
- tbl = neigh_tables[t];
+ tbl = rcu_dereference(neigh_tables[t]);
if (!tbl)
continue;
@@ -2457,15 +2946,177 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
memset(&cb->args[1], 0, sizeof(cb->args) -
sizeof(cb->args[0]));
if (proxy)
- err = pneigh_dump_table(tbl, skb, cb);
+ err = pneigh_dump_table(tbl, skb, cb, &filter);
else
- err = neigh_dump_table(tbl, skb, cb);
+ err = neigh_dump_table(tbl, skb, cb, &filter);
if (err < 0)
break;
}
+ rcu_read_unlock();
cb->args[0] = t;
- return skb->len;
+ return err;
+}
+
+static struct ndmsg *neigh_valid_get_req(const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack) /* validates a neighbour get request; returns its ndmsg header or ERR_PTR() */
+{
+ struct ndmsg *ndm;
+ int err, i;
+
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) { /* no usable ndmsg header in the message */
+ NL_SET_ERR_MSG(extack, "Invalid header for neighbor get request");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
+ ndm->ndm_type) { /* these header fields must all be zero in a get request */
+ NL_SET_ERR_MSG(extack, "Invalid values in header for neighbor get request");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (ndm->ndm_flags & ~NTF_PROXY) { /* NTF_PROXY is the only flag accepted */
+ NL_SET_ERR_MSG(extack, "Invalid flags in header for neighbor get request");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (!(ndm->ndm_flags & NTF_PROXY) && !ndm->ndm_ifindex) { /* only proxy lookups may omit the ifindex */
+ NL_SET_ERR_MSG(extack, "No device specified");
+ return ERR_PTR(-EINVAL);
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb,
+ NDA_MAX, nda_policy, extack);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ for (i = 0; i <= NDA_MAX; ++i) { /* walk every slot of the parsed attribute table */
+ switch (i) {
+ case NDA_DST: /* the one mandatory attribute */
+ if (!tb[i]) {
+ NL_SET_ERR_ATTR_MISS(extack, NULL, NDA_DST);
+ return ERR_PTR(-EINVAL);
+ }
+ break;
+ default: /* any other attribute that is present is rejected */
+ if (!tb[i])
+ continue;
+
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in neighbor get request");
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ return ndm;
+}
+
+static inline size_t neigh_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg))
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */
+ + nla_total_size(sizeof(struct nda_cacheinfo)) /* NDA_CACHEINFO */
+ + nla_total_size(4) /* NDA_PROBES */
+ + nla_total_size(4) /* NDA_FLAGS_EXT */
+ + nla_total_size(1); /* NDA_PROTOCOL */
+}
+
+static inline size_t pneigh_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg))
+ + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
+ + nla_total_size(4) /* NDA_FLAGS_EXT */
+ + nla_total_size(1); /* NDA_PROTOCOL */
+}
+
+static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack) /* RTM_GETNEIGH doit handler: looks up one entry and unicasts it to the requester */
+{
+ struct net *net = sock_net(in_skb->sk);
+ u32 pid = NETLINK_CB(in_skb).portid;
+ struct nlattr *tb[NDA_MAX + 1];
+ struct net_device *dev = NULL;
+ u32 seq = nlh->nlmsg_seq;
+ struct neigh_table *tbl;
+ struct neighbour *neigh;
+ struct sk_buff *skb;
+ struct ndmsg *ndm;
+ void *dst;
+ int err;
+
+ ndm = neigh_valid_get_req(nlh, tb, extack);
+ if (IS_ERR(ndm))
+ return PTR_ERR(ndm);
+
+ if (ndm->ndm_flags & NTF_PROXY) /* size the reply for the fill routine used below */
+ skb = nlmsg_new(pneigh_nlmsg_size(), GFP_KERNEL); /* proxy entry -> pneigh_fill_info() */
+ else
+ skb = nlmsg_new(neigh_nlmsg_size(), GFP_KERNEL); /* neighbour entry -> neigh_fill_info() */
+ if (!skb)
+ return -ENOBUFS;
+
+ rcu_read_lock(); /* held across the table, device and entry lookups and the fill */
+
+ tbl = neigh_find_table(ndm->ndm_family);
+ if (!tbl) {
+ NL_SET_ERR_MSG(extack, "Unsupported family in header for neighbor get request");
+ err = -EAFNOSUPPORT;
+ goto err_unlock;
+ }
+
+ if (nla_len(tb[NDA_DST]) != (int)tbl->key_len) { /* NDA_DST presence was already enforced by neigh_valid_get_req() */
+ NL_SET_ERR_MSG(extack, "Invalid network address in neighbor get request");
+ err = -EINVAL;
+ goto err_unlock;
+ }
+
+ dst = nla_data(tb[NDA_DST]);
+
+ if (ndm->ndm_ifindex) { /* dev stays NULL when no ifindex was given (proxy requests only) */
+ dev = dev_get_by_index_rcu(net, ndm->ndm_ifindex);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Unknown device ifindex");
+ err = -ENODEV;
+ goto err_unlock;
+ }
+ }
+
+ if (ndm->ndm_flags & NTF_PROXY) {
+ struct pneigh_entry *pn;
+
+ pn = pneigh_lookup(tbl, net, dst, dev);
+ if (!pn) {
+ NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found");
+ err = -ENOENT;
+ goto err_unlock;
+ }
+
+ err = pneigh_fill_info(skb, pn, pid, seq, RTM_NEWNEIGH, 0, tbl);
+ if (err)
+ goto err_unlock;
+ } else {
+ neigh = neigh_lookup(tbl, dst, dev);
+ if (!neigh) {
+ NL_SET_ERR_MSG(extack, "Neighbour entry not found");
+ err = -ENOENT;
+ goto err_unlock;
+ }
+
+ err = neigh_fill_info(skb, neigh, pid, seq, RTM_NEWNEIGH, 0);
+ neigh_release(neigh); /* released whether or not the fill succeeded */
+ if (err)
+ goto err_unlock;
+ }
+
+ rcu_read_unlock();
+
+ return rtnl_unicast(skb, net, pid);
+err_unlock: /* common failure exit: leave the RCU section and free the unsent reply */
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return err;
}
void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie)
@@ -2473,20 +3124,18 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void
int chain;
struct neigh_hash_table *nht;
- rcu_read_lock_bh();
- nht = rcu_dereference_bh(tbl->nht);
+ rcu_read_lock();
+ nht = rcu_dereference(tbl->nht);
- read_lock(&tbl->lock); /* avoid resizes */
+ spin_lock_bh(&tbl->lock); /* avoid resizes */
for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
struct neighbour *n;
- for (n = rcu_dereference_bh(nht->hash_buckets[chain]);
- n != NULL;
- n = rcu_dereference_bh(n->next))
+ neigh_for_each_in_bucket(n, &nht->hash_heads[chain])
cb(n, cookie);
}
- read_unlock(&tbl->lock);
- rcu_read_unlock_bh();
+ spin_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
}
EXPORT_SYMBOL(neigh_for_each);
@@ -2494,29 +3143,25 @@ EXPORT_SYMBOL(neigh_for_each);
void __neigh_for_each_release(struct neigh_table *tbl,
int (*cb)(struct neighbour *))
{
- int chain;
struct neigh_hash_table *nht;
+ int chain;
nht = rcu_dereference_protected(tbl->nht,
lockdep_is_held(&tbl->lock));
for (chain = 0; chain < (1 << nht->hash_shift); chain++) {
+ struct hlist_node *tmp;
struct neighbour *n;
- struct neighbour __rcu **np;
- np = &nht->hash_buckets[chain];
- while ((n = rcu_dereference_protected(*np,
- lockdep_is_held(&tbl->lock))) != NULL) {
+ neigh_for_each_in_bucket_safe(n, tmp, &nht->hash_heads[chain]) {
int release;
write_lock(&n->lock);
release = cb(n);
if (release) {
- rcu_assign_pointer(*np,
- rcu_dereference_protected(n->next,
- lockdep_is_held(&tbl->lock)));
- n->dead = 1;
- } else
- np = &n->next;
+ hlist_del_rcu(&n->hash);
+ hlist_del_rcu(&n->dev_list);
+ neigh_mark_dead(n);
+ }
write_unlock(&n->lock);
if (release)
neigh_cleanup_and_release(n);
@@ -2529,24 +3174,32 @@ int neigh_xmit(int index, struct net_device *dev,
const void *addr, struct sk_buff *skb)
{
int err = -EAFNOSUPPORT;
+
if (likely(index < NEIGH_NR_TABLES)) {
struct neigh_table *tbl;
struct neighbour *neigh;
- tbl = neigh_tables[index];
+ rcu_read_lock();
+ tbl = rcu_dereference(neigh_tables[index]);
if (!tbl)
- goto out;
- rcu_read_lock_bh();
- neigh = __neigh_lookup_noref(tbl, addr, dev);
+ goto out_unlock;
+ if (index == NEIGH_ARP_TABLE) {
+ u32 key = *((u32 *)addr);
+
+ neigh = __ipv4_neigh_lookup_noref(dev, key);
+ } else {
+ neigh = __neigh_lookup_noref(tbl, addr, dev);
+ }
if (!neigh)
neigh = __neigh_create(tbl, addr, dev, false);
err = PTR_ERR(neigh);
if (IS_ERR(neigh)) {
- rcu_read_unlock_bh();
+ rcu_read_unlock();
goto out_kfree_skb;
}
- err = neigh->output(neigh, skb);
- rcu_read_unlock_bh();
+ err = READ_ONCE(neigh->output)(neigh, skb);
+out_unlock:
+ rcu_read_unlock();
}
else if (index == NEIGH_LINK_TABLE) {
err = dev_hard_header(skb, dev, ntohs(skb->protocol),
@@ -2565,43 +3218,53 @@ EXPORT_SYMBOL(neigh_xmit);
#ifdef CONFIG_PROC_FS
-static struct neighbour *neigh_get_first(struct seq_file *seq)
+static struct neighbour *neigh_get_valid(struct seq_file *seq,
+ struct neighbour *n,
+ loff_t *pos) /* returns the entry to show for n, or NULL when n is filtered out */
{
struct neigh_seq_state *state = seq->private;
struct net *net = seq_file_net(seq);
+
+ if (!net_eq(dev_net(n->dev), net)) /* entry belongs to a different namespace than the seq file */
+ return NULL;
+
+ if (state->neigh_sub_iter) {
+ loff_t fakep = 0; /* scratch position for callers that pass pos == NULL */
+ void *v;
+
+ v = state->neigh_sub_iter(state, n, pos ? pos : &fakep);
+ if (!v)
+ return NULL;
+ if (pos) /* with a real position the sub-iterator's result is returned as-is */
+ return v;
+ }
+
+ if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
+ return n;
+
+ if (READ_ONCE(n->nud_state) & ~NUD_NOARP) /* any state bit other than NUD_NOARP qualifies */
+ return n;
+
+ return NULL;
+}
+
+static struct neighbour *neigh_get_first(struct seq_file *seq)
+{
+ struct neigh_seq_state *state = seq->private;
struct neigh_hash_table *nht = state->nht;
- struct neighbour *n = NULL;
- int bucket = state->bucket;
+ struct neighbour *n, *tmp;
state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
- for (bucket = 0; bucket < (1 << nht->hash_shift); bucket++) {
- n = rcu_dereference_bh(nht->hash_buckets[bucket]);
- while (n) {
- if (!net_eq(dev_net(n->dev), net))
- goto next;
- if (state->neigh_sub_iter) {
- loff_t fakep = 0;
- void *v;
-
- v = state->neigh_sub_iter(state, n, &fakep);
- if (!v)
- goto next;
- }
- if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
- break;
- if (n->nud_state & ~NUD_NOARP)
- break;
-next:
- n = rcu_dereference_bh(n->next);
+ while (++state->bucket < (1 << nht->hash_shift)) {
+ neigh_for_each_in_bucket(n, &nht->hash_heads[state->bucket]) {
+ tmp = neigh_get_valid(seq, n, NULL);
+ if (tmp)
+ return tmp;
}
-
- if (n)
- break;
}
- state->bucket = bucket;
- return n;
+ return NULL;
}
static struct neighbour *neigh_get_next(struct seq_file *seq,
@@ -2609,46 +3272,28 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
loff_t *pos)
{
struct neigh_seq_state *state = seq->private;
- struct net *net = seq_file_net(seq);
- struct neigh_hash_table *nht = state->nht;
+ struct neighbour *tmp;
if (state->neigh_sub_iter) {
void *v = state->neigh_sub_iter(state, n, pos);
+
if (v)
return n;
}
- n = rcu_dereference_bh(n->next);
-
- while (1) {
- while (n) {
- if (!net_eq(dev_net(n->dev), net))
- goto next;
- if (state->neigh_sub_iter) {
- void *v = state->neigh_sub_iter(state, n, pos);
- if (v)
- return n;
- goto next;
- }
- if (!(state->flags & NEIGH_SEQ_SKIP_NOARP))
- break;
- if (n->nud_state & ~NUD_NOARP)
- break;
-next:
- n = rcu_dereference_bh(n->next);
+ hlist_for_each_entry_continue(n, hash) {
+ tmp = neigh_get_valid(seq, n, pos);
+ if (tmp) {
+ n = tmp;
+ goto out;
}
-
- if (n)
- break;
-
- if (++state->bucket >= (1 << nht->hash_shift))
- break;
-
- n = rcu_dereference_bh(nht->hash_buckets[state->bucket]);
}
+ n = neigh_get_first(seq);
+out:
if (n && pos)
--(*pos);
+
return n;
}
@@ -2673,13 +3318,14 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq)
struct net *net = seq_file_net(seq);
struct neigh_table *tbl = state->tbl;
struct pneigh_entry *pn = NULL;
- int bucket = state->bucket;
+ int bucket;
state->flags |= NEIGH_SEQ_IS_PNEIGH;
for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) {
- pn = tbl->phash_buckets[bucket];
+ pn = rcu_dereference(tbl->phash_buckets[bucket]);
+
while (pn && !net_eq(pneigh_net(pn), net))
- pn = pn->next;
+ pn = rcu_dereference(pn->next);
if (pn)
break;
}
@@ -2697,15 +3343,17 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
struct neigh_table *tbl = state->tbl;
do {
- pn = pn->next;
+ pn = rcu_dereference(pn->next);
} while (pn && !net_eq(pneigh_net(pn), net));
while (!pn) {
if (++state->bucket > PNEIGH_HASHMASK)
break;
- pn = tbl->phash_buckets[state->bucket];
+
+ pn = rcu_dereference(tbl->phash_buckets[state->bucket]);
+
while (pn && !net_eq(pneigh_net(pn), net))
- pn = pn->next;
+ pn = rcu_dereference(pn->next);
if (pn)
break;
}
@@ -2745,16 +3393,18 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
}
void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
- __acquires(rcu_bh)
+ __acquires(tbl->lock)
+ __acquires(rcu)
{
struct neigh_seq_state *state = seq->private;
state->tbl = tbl;
- state->bucket = 0;
+ state->bucket = -1;
state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
- rcu_read_lock_bh();
- state->nht = rcu_dereference_bh(tbl->nht);
+ rcu_read_lock();
+ state->nht = rcu_dereference(tbl->nht);
+ spin_lock_bh(&tbl->lock);
return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
}
@@ -2788,9 +3438,14 @@ out:
EXPORT_SYMBOL(neigh_seq_next);
void neigh_seq_stop(struct seq_file *seq, void *v)
- __releases(rcu_bh)
+ __releases(tbl->lock)
+ __releases(rcu)
{
- rcu_read_unlock_bh();
+ struct neigh_seq_state *state = seq->private;
+ struct neigh_table *tbl = state->tbl;
+
+ spin_unlock_bh(&tbl->lock);
+ rcu_read_unlock();
}
EXPORT_SYMBOL(neigh_seq_stop);
@@ -2798,7 +3453,7 @@ EXPORT_SYMBOL(neigh_seq_stop);
static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
{
- struct neigh_table *tbl = PDE_DATA(file_inode(seq->file));
+ struct neigh_table *tbl = pde_data(file_inode(seq->file));
int cpu;
if (*pos == 0)
@@ -2815,7 +3470,7 @@ static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos)
static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct neigh_table *tbl = PDE_DATA(file_inode(seq->file));
+ struct neigh_table *tbl = pde_data(file_inode(seq->file));
int cpu;
for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
@@ -2824,6 +3479,7 @@ static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
*pos = cpu+1;
return per_cpu_ptr(tbl->stats, cpu);
}
+ (*pos)++;
return NULL;
}
@@ -2834,16 +3490,17 @@ static void neigh_stat_seq_stop(struct seq_file *seq, void *v)
static int neigh_stat_seq_show(struct seq_file *seq, void *v)
{
- struct neigh_table *tbl = PDE_DATA(file_inode(seq->file));
+ struct neigh_table *tbl = pde_data(file_inode(seq->file));
struct neigh_statistics *st = v;
if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n");
+ seq_puts(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n");
return 0;
}
- seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
- "%08lx %08lx %08lx %08lx %08lx %08lx\n",
+ seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx "
+ "%08lx %08lx %08lx "
+ "%08lx %08lx %08lx\n",
atomic_read(&tbl->entries),
st->allocs,
@@ -2875,22 +3532,15 @@ static const struct seq_operations neigh_stat_seq_ops = {
};
#endif /* CONFIG_PROC_FS */
-static inline size_t neigh_nlmsg_size(void)
-{
- return NLMSG_ALIGN(sizeof(struct ndmsg))
- + nla_total_size(MAX_ADDR_LEN) /* NDA_DST */
- + nla_total_size(MAX_ADDR_LEN) /* NDA_LLADDR */
- + nla_total_size(sizeof(struct nda_cacheinfo))
- + nla_total_size(4); /* NDA_PROBES */
-}
-
static void __neigh_notify(struct neighbour *n, int type, int flags,
u32 pid)
{
- struct net *net = dev_net(n->dev);
struct sk_buff *skb;
int err = -ENOBUFS;
+ struct net *net;
+ rcu_read_lock();
+ net = dev_net_rcu(n->dev);
skb = nlmsg_new(neigh_nlmsg_size(), GFP_ATOMIC);
if (skb == NULL)
goto errout;
@@ -2903,10 +3553,11 @@ static void __neigh_notify(struct neighbour *n, int type, int flags,
goto errout;
}
rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
- return;
+ goto out;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+out:
+ rcu_read_unlock();
}
void neigh_app_ns(struct neighbour *n)
@@ -2916,17 +3567,15 @@ void neigh_app_ns(struct neighbour *n)
EXPORT_SYMBOL(neigh_app_ns);
#ifdef CONFIG_SYSCTL
-static int zero;
-static int int_max = INT_MAX;
static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN);
-static int proc_unres_qlen(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int proc_unres_qlen(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int size, ret;
struct ctl_table tmp = *ctl;
- tmp.extra1 = &zero;
+ tmp.extra1 = SYSCTL_ZERO;
tmp.extra2 = &unres_qlen_max;
tmp.data = &size;
@@ -2938,18 +3587,6 @@ static int proc_unres_qlen(struct ctl_table *ctl, int write,
return ret;
}
-static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev,
- int family)
-{
- switch (family) {
- case AF_INET:
- return __in_dev_arp_parms_get_rcu(dev);
- case AF_INET6:
- return __in6_dev_nd_parms_get_rcu(dev);
- }
- return NULL;
-}
-
static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p,
int index)
{
@@ -2967,7 +3604,7 @@ static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p,
rcu_read_unlock();
}
-static void neigh_proc_update(struct ctl_table *ctl, int write)
+static void neigh_proc_update(const struct ctl_table *ctl, int write)
{
struct net_device *dev = ctl->extra1;
struct neigh_parms *p = ctl->extra2;
@@ -2984,23 +3621,39 @@ static void neigh_proc_update(struct ctl_table *ctl, int write)
neigh_copy_dflt_parms(net, p, index);
}
-static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int neigh_proc_dointvec_zero_intmax(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
struct ctl_table tmp = *ctl;
int ret;
- tmp.extra1 = &zero;
- tmp.extra2 = &int_max;
+ tmp.extra1 = SYSCTL_ZERO;
+ tmp.extra2 = SYSCTL_INT_MAX;
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
neigh_proc_update(ctl, write);
return ret;
}
-int neigh_proc_dointvec(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int neigh_proc_dointvec_ms_jiffies_positive(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos) /* ms/jiffies sysctl handler that enforces a lower bound */
+{
+ struct ctl_table tmp = *ctl; /* local copy so extra1/extra2 can be overridden */
+ int ret;
+
+ int min = msecs_to_jiffies(1); /* lower bound passed to the minmax helper via extra1 */
+
+ tmp.extra1 = &min;
+ tmp.extra2 = NULL; /* no upper bound supplied */
+
+ ret = proc_dointvec_ms_jiffies_minmax(&tmp, write, buffer, lenp, ppos);
+ neigh_proc_update(ctl, write); /* called with the original ctl, not the bounded copy */
+ return ret;
+}
+
+int neigh_proc_dointvec(const struct ctl_table *ctl, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
@@ -3009,8 +3662,7 @@ int neigh_proc_dointvec(struct ctl_table *ctl, int write,
}
EXPORT_SYMBOL(neigh_proc_dointvec);
-int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write,
- void __user *buffer,
+int neigh_proc_dointvec_jiffies(const struct ctl_table *ctl, int write, void *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
@@ -3020,9 +3672,9 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write,
}
EXPORT_SYMBOL(neigh_proc_dointvec_jiffies);
-static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int neigh_proc_dointvec_userhz_jiffies(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos);
@@ -3030,9 +3682,8 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write,
return ret;
}
-int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+int neigh_proc_dointvec_ms_jiffies(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos);
@@ -3041,9 +3692,9 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write,
}
EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies);
-static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int neigh_proc_dointvec_unres_qlen(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos);
@@ -3051,9 +3702,9 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write,
return ret;
}
-static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
struct neigh_parms *p = ctl->extra2;
int ret;
@@ -3070,8 +3721,7 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
* only be effective after the next time neigh_periodic_work
* decides to recompute it
*/
- p->reachable_time =
- neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME));
+ neigh_set_reach_time(p);
}
return ret;
}
@@ -3097,8 +3747,8 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
#define NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(attr, name) \
NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_userhz_jiffies)
-#define NEIGH_SYSCTL_MS_JIFFIES_ENTRY(attr, name) \
- NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
+#define NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(attr, name) \
+ NEIGH_SYSCTL_ENTRY(attr, attr, name, 0644, neigh_proc_dointvec_ms_jiffies_positive)
#define NEIGH_SYSCTL_MS_JIFFIES_REUSED_ENTRY(attr, data_attr, name) \
NEIGH_SYSCTL_ENTRY(attr, data_attr, name, 0644, neigh_proc_dointvec_ms_jiffies)
@@ -3108,7 +3758,7 @@ static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write,
static struct neigh_sysctl_table {
struct ctl_table_header *sysctl_header;
- struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
+ struct ctl_table neigh_vars[NEIGH_VAR_MAX];
} neigh_sysctl_template __read_mostly = {
.neigh_vars = {
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(MCAST_PROBES, "mcast_solicit"),
@@ -3118,6 +3768,8 @@ static struct neigh_sysctl_table {
NEIGH_SYSCTL_USERHZ_JIFFIES_ENTRY(RETRANS_TIME, "retrans_time"),
NEIGH_SYSCTL_JIFFIES_ENTRY(BASE_REACHABLE_TIME, "base_reachable_time"),
NEIGH_SYSCTL_JIFFIES_ENTRY(DELAY_PROBE_TIME, "delay_first_probe_time"),
+ NEIGH_SYSCTL_MS_JIFFIES_POSITIVE_ENTRY(INTERVAL_PROBE_TIME_MS,
+ "interval_probe_time_ms"),
NEIGH_SYSCTL_JIFFIES_ENTRY(GC_STALETIME, "gc_stale_time"),
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(QUEUE_LEN_BYTES, "unres_qlen_bytes"),
NEIGH_SYSCTL_ZERO_INTMAX_ENTRY(PROXY_QLEN, "proxy_qlen"),
@@ -3137,27 +3789,26 @@ static struct neigh_sysctl_table {
.procname = "gc_thresh1",
.maxlen = sizeof(int),
.mode = 0644,
- .extra1 = &zero,
- .extra2 = &int_max,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
.proc_handler = proc_dointvec_minmax,
},
[NEIGH_VAR_GC_THRESH2] = {
.procname = "gc_thresh2",
.maxlen = sizeof(int),
.mode = 0644,
- .extra1 = &zero,
- .extra2 = &int_max,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
.proc_handler = proc_dointvec_minmax,
},
[NEIGH_VAR_GC_THRESH3] = {
.procname = "gc_thresh3",
.maxlen = sizeof(int),
.mode = 0644,
- .extra1 = &zero,
- .extra2 = &int_max,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
.proc_handler = proc_dointvec_minmax,
},
- {},
},
};
@@ -3169,8 +3820,9 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
const char *dev_name_source;
char neigh_path[ sizeof("net//neigh/") + IFNAMSIZ + IFNAMSIZ ];
char *p_name;
+ size_t neigh_vars_size;
- t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL);
+ t = kmemdup(&neigh_sysctl_template, sizeof(*t), GFP_KERNEL_ACCOUNT);
if (!t)
goto err;
@@ -3180,11 +3832,11 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
t->neigh_vars[i].extra2 = p;
}
+ neigh_vars_size = ARRAY_SIZE(t->neigh_vars);
if (dev) {
dev_name_source = dev->name;
/* Terminate the table early */
- memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
- sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
+ neigh_vars_size = NEIGH_VAR_BASE_REACHABLE_TIME_MS + 1;
} else {
struct neigh_table *tbl = p->tbl;
dev_name_source = "default";
@@ -3218,10 +3870,6 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
neigh_proc_base_reachable_time;
}
- /* Don't export sysctls to unprivileged users */
- if (neigh_parms_net(p)->user_ns != &init_user_ns)
- t->neigh_vars[0].procname = NULL;
-
switch (neigh_parms_family(p)) {
case AF_INET:
p_name = "ipv4";
@@ -3235,8 +3883,9 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
snprintf(neigh_path, sizeof(neigh_path), "net/%s/neigh/%s",
p_name, dev_name_source);
- t->sysctl_header =
- register_net_sysctl(neigh_parms_net(p), neigh_path, t->neigh_vars);
+ t->sysctl_header = register_net_sysctl_sz(neigh_parms_net(p),
+ neigh_path, t->neigh_vars,
+ neigh_vars_size);
if (!t->sysctl_header)
goto free;
@@ -3263,16 +3912,20 @@ EXPORT_SYMBOL(neigh_sysctl_unregister);
#endif /* CONFIG_SYSCTL */
+static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWNEIGH, .doit = neigh_add},
+ {.msgtype = RTM_DELNEIGH, .doit = neigh_delete},
+ {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
+ {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED},
+ {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED},
+};
+
static int __init neigh_init(void)
{
- rtnl_register(PF_UNSPEC, RTM_NEWNEIGH, neigh_add, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELNEIGH, neigh_delete, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_GETNEIGH, NULL, neigh_dump_info, 0);
-
- rtnl_register(PF_UNSPEC, RTM_GETNEIGHTBL, NULL, neightbl_dump_info,
- 0);
- rtnl_register(PF_UNSPEC, RTM_SETNEIGHTBL, neightbl_set, NULL, 0);
-
+ rtnl_register_many(neigh_rtnl_msg_handlers);
return 0;
}
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 63881f72ef71..70e0e9a3b650 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -3,53 +3,22 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/wext.h>
+#include <net/hotdata.h>
-#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
+#include "dev.h"
-#define get_bucket(x) ((x) >> BUCKET_SPACE)
-#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
-#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
-
-extern struct list_head ptype_all __read_mostly;
-extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
-
-static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
+static void *dev_seq_from_index(struct seq_file *seq, loff_t *pos)
{
- struct net *net = seq_file_net(seq);
+ unsigned long ifindex = *pos;
struct net_device *dev;
- struct hlist_head *h;
- unsigned int count = 0, offset = get_offset(*pos);
- h = &net->dev_name_head[get_bucket(*pos)];
- hlist_for_each_entry_rcu(dev, h, name_hlist) {
- if (++count == offset)
- return dev;
+ for_each_netdev_dump(seq_file_net(seq), dev, ifindex) {
+ *pos = dev->ifindex;
+ return dev;
}
-
- return NULL;
-}
-
-static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
-{
- struct net_device *dev;
- unsigned int bucket;
-
- do {
- dev = dev_from_same_bucket(seq, pos);
- if (dev)
- return dev;
-
- bucket = get_bucket(*pos) + 1;
- *pos = set_bucket_offset(bucket, 1);
- } while (bucket < NETDEV_HASHENTRIES);
-
return NULL;
}
-/*
- * This is invoked by the /proc filesystem handler to display a device
- * in detail.
- */
static void *dev_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
@@ -57,16 +26,13 @@ static void *dev_seq_start(struct seq_file *seq, loff_t *pos)
if (!*pos)
return SEQ_START_TOKEN;
- if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
- return NULL;
-
- return dev_from_bucket(seq, pos);
+ return dev_seq_from_index(seq, pos);
}
static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
++*pos;
- return dev_from_bucket(seq, pos);
+ return dev_seq_from_index(seq, pos);
}
static void dev_seq_stop(struct seq_file *seq, void *v)
@@ -116,6 +82,16 @@ static int dev_seq_show(struct seq_file *seq, void *v)
return 0;
}
+static u32 softnet_input_pkt_queue_len(struct softnet_data *sd)
+{
+ return skb_queue_len_lockless(&sd->input_pkt_queue);
+}
+
+static u32 softnet_process_queue_len(struct softnet_data *sd)
+{
+ return skb_queue_len_lockless(&sd->process_queue);
+}
+
static struct softnet_data *softnet_get_online(loff_t *pos)
{
struct softnet_data *sd = NULL;
@@ -147,6 +123,8 @@ static void softnet_seq_stop(struct seq_file *seq, void *v)
static int softnet_seq_show(struct seq_file *seq, void *v)
{
struct softnet_data *sd = v;
+ u32 input_qlen = softnet_input_pkt_queue_len(sd);
+ u32 process_qlen = softnet_process_queue_len(sd);
unsigned int flow_limit_count = 0;
#ifdef CONFIG_NET_FLOW_LIMIT
@@ -154,17 +132,27 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
rcu_read_lock();
fl = rcu_dereference(sd->flow_limit);
+ /* Pairs with WRITE_ONCE() in skb_flow_limit() */
if (fl)
- flow_limit_count = fl->count;
+ flow_limit_count = READ_ONCE(fl->count);
rcu_read_unlock();
#endif
+ /* the index is the CPU id owning this sd. Since offline CPUs are not
+ * displayed, it would otherwise not be trivial for user-space
+ * to map the data to a specific CPU
+ */
seq_printf(seq,
- "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
- sd->processed, sd->dropped, sd->time_squeeze, 0,
+ "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x "
+ "%08x %08x\n",
+ READ_ONCE(sd->processed),
+ numa_drop_read(&sd->drop_counters),
+ READ_ONCE(sd->time_squeeze), 0,
0, 0, 0, 0, /* was fastroute */
0, /* was cpu_collision */
- sd->received_rps, flow_limit_count);
+ READ_ONCE(sd->received_rps), flow_limit_count,
+ input_qlen + process_qlen, (int)seq->index,
+ input_qlen, process_qlen);
return 0;
}
@@ -182,13 +170,30 @@ static const struct seq_operations softnet_seq_ops = {
.show = softnet_seq_show,
};
-static void *ptype_get_idx(loff_t pos)
+static void *ptype_get_idx(struct seq_file *seq, loff_t pos)
{
+ struct list_head *ptype_list = NULL;
struct packet_type *pt = NULL;
+ struct net_device *dev;
loff_t i = 0;
int t;
- list_for_each_entry_rcu(pt, &ptype_all, list) {
+ for_each_netdev_rcu(seq_file_net(seq), dev) {
+ ptype_list = &dev->ptype_all;
+ list_for_each_entry_rcu(pt, ptype_list, list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+ }
+
+ list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) {
+ if (i == pos)
+ return pt;
+ ++i;
+ }
+
+ list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_specific, list) {
if (i == pos)
return pt;
++i;
@@ -208,24 +213,50 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
rcu_read_lock();
- return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
+ return *pos ? ptype_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ struct net *net = seq_file_net(seq);
+ struct net_device *dev;
struct packet_type *pt;
struct list_head *nxt;
int hash;
++*pos;
if (v == SEQ_START_TOKEN)
- return ptype_get_idx(0);
+ return ptype_get_idx(seq, 0);
pt = v;
nxt = pt->list.next;
- if (pt->type == htons(ETH_P_ALL)) {
- if (nxt != &ptype_all)
+ if (pt->dev) {
+ if (nxt != &pt->dev->ptype_all)
+ goto found;
+
+ dev = pt->dev;
+ for_each_netdev_continue_rcu(seq_file_net(seq), dev) {
+ if (!list_empty(&dev->ptype_all)) {
+ nxt = dev->ptype_all.next;
+ goto found;
+ }
+ }
+ nxt = net->ptype_all.next;
+ goto net_ptype_all;
+ }
+
+ if (pt->af_packet_net) {
+net_ptype_all:
+ if (nxt != &net->ptype_all && nxt != &net->ptype_specific)
goto found;
+
+ if (nxt == &net->ptype_all) {
+ /* continue with ->ptype_specific if it's not empty */
+ nxt = net->ptype_specific.next;
+ if (nxt != &net->ptype_specific)
+ goto found;
+ }
+
hash = 0;
nxt = ptype_base[0].next;
} else
@@ -252,13 +283,14 @@ static int ptype_seq_show(struct seq_file *seq, void *v)
if (v == SEQ_START_TOKEN)
seq_puts(seq, "Type Device Function\n");
- else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
+ else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) &&
+ (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) {
if (pt->type == htons(ETH_P_ALL))
seq_puts(seq, "ALL ");
else
seq_printf(seq, "%04x", ntohs(pt->type));
- seq_printf(seq, " %-8s %pf\n",
+ seq_printf(seq, " %-8s %ps\n",
pt->dev ? pt->dev->name : "", pt->func);
}
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index bd67c4d0fcfd..ca878525ad7c 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1,21 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net-sysfs.c - network device class and attributes
*
* Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
-#include <net/switchdev.h>
#include <linux/if_arp.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
+#include <linux/sched/isolation.h>
#include <linux/nsproxy.h>
#include <net/sock.h>
#include <net/net_namespace.h>
@@ -27,18 +23,105 @@
#include <linux/of.h>
#include <linux/of_net.h>
#include <linux/cpu.h>
+#include <net/netdev_lock.h>
+#include <net/netdev_rx_queue.h>
+#include <net/rps.h>
+#include "dev.h"
#include "net-sysfs.h"
#ifdef CONFIG_SYSFS
static const char fmt_hex[] = "%#x\n";
static const char fmt_dec[] = "%d\n";
+static const char fmt_uint[] = "%u\n";
static const char fmt_ulong[] = "%lu\n";
static const char fmt_u64[] = "%llu\n";
+/* Caller holds RTNL, netdev->lock or RCU */
static inline int dev_isalive(const struct net_device *dev)
{
- return dev->reg_state <= NETREG_REGISTERED;
+ return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED;
+}
+
+/* There is a possible ABBA deadlock between rtnl_lock and kernfs_node->active,
+ * when unregistering a net device and accessing associated sysfs files. The
+ * potential deadlock is as follows:
+ *
+ * CPU 0 CPU 1
+ *
+ * rtnl_lock vfs_read
+ * unregister_netdevice_many kernfs_seq_start
+ * device_del / kobject_put kernfs_get_active (kn->active++)
+ * kernfs_drain sysfs_kf_seq_show
+ * wait_event( rtnl_lock
+ * kn->active == KN_DEACTIVATED_BIAS) -> waits on CPU 0 to release
+ * -> waits on CPU 1 to decrease kn->active the rtnl lock.
+ *
+ * The historical fix was to use rtnl_trylock with restart_syscall to bail out
+ * of sysfs operations when the lock couldn't be taken. This fixed the above
+ * issue as it allowed CPU 1 to bail out of the ABBA situation.
+ *
+ * But it came with performance issues, as syscalls are being restarted in
+ * loops when there was contention on the rtnl lock, with huge slow downs in
+ * specific scenarios (e.g. lots of virtual interfaces created and userspace
+ * daemons querying their attributes).
+ *
+ * The idea below is to bail out of the active kernfs_node protection
+ * (kn->active) while trying to take the rtnl lock.
+ *
+ * This replaces rtnl_lock() and still has to be used with rtnl_unlock(). The
+ * net device is guaranteed to be alive if this returns successfully.
+ */
+static int sysfs_rtnl_lock(struct kobject *kobj, struct attribute *attr,
+ struct net_device *ndev)
+{
+ struct kernfs_node *kn;
+ int ret = 0;
+
+ /* First, we hold a reference to the net device as the unregistration
+ * path might run in parallel. This will ensure the net device and the
+ * associated sysfs objects won't be freed while we try to take the rtnl
+ * lock.
+ */
+ dev_hold(ndev);
+ /* sysfs_break_active_protection was introduced to allow self-removal of
+ * devices and their associated sysfs files by bailing out of the
+ * sysfs/kernfs protection. We do this here to allow the unregistration
+ * path to complete in parallel. The following takes a reference on the
+ * kobject and the kernfs_node being accessed.
+ *
+ * This works because we hold a reference onto the net device and the
+ * unregistration path will wait for us eventually in netdev_run_todo
+ * (outside an rtnl lock section).
+ */
+ kn = sysfs_break_active_protection(kobj, attr);
+ /* We can now try to take the rtnl lock. This can't deadlock us as the
+ * unregistration path is able to drain sysfs files (kernfs_node) thanks
+ * to the above dance.
+ */
+ if (rtnl_lock_interruptible()) {
+ ret = -ERESTARTSYS;
+ goto unbreak;
+ }
+ /* Check dismantle on the device hasn't started, otherwise deny the
+ * operation.
+ */
+ if (!dev_isalive(ndev)) {
+ rtnl_unlock();
+ ret = -ENODEV;
+ goto unbreak;
+ }
+ /* We are now sure the device dismantle hasn't started nor that it can
+ * start before we exit the locking section as we hold the rtnl lock.
+ * There's no need to keep unbreaking the sysfs protection nor to hold
+ * a net device reference from that point; that was only needed to take
+ * the rtnl lock.
+ */
+unbreak:
+ sysfs_unbreak_active_protection(kn);
+ dev_put(ndev);
+
+ return ret;
}
/* use same locking rules as GIF* ioctl's */
@@ -49,10 +132,10 @@ static ssize_t netdev_show(const struct device *dev,
struct net_device *ndev = to_net_dev(dev);
ssize_t ret = -EINVAL;
- read_lock(&dev_base_lock);
+ rcu_read_lock();
if (dev_isalive(ndev))
ret = (*format)(ndev, buf);
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
return ret;
}
@@ -61,7 +144,7 @@ static ssize_t netdev_show(const struct device *dev,
#define NETDEVICE_SHOW(field, format_string) \
static ssize_t format_##field(const struct net_device *dev, char *buf) \
{ \
- return sprintf(buf, format_string, dev->field); \
+ return sysfs_emit(buf, format_string, READ_ONCE(dev->field)); \
} \
static ssize_t field##_show(struct device *dev, \
struct device_attribute *attr, char *buf) \
@@ -85,7 +168,7 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
struct net_device *netdev = to_net_dev(dev);
struct net *net = dev_net(netdev);
unsigned long new;
- int ret = -EINVAL;
+ int ret;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
@@ -94,16 +177,46 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
if (ret)
goto err;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ goto err;
+
+ ret = (*set)(netdev, new);
+ if (ret == 0)
+ ret = len;
+
+ rtnl_unlock();
+ err:
+ return ret;
+}
+
+/* Same as netdev_store() but takes netdev_lock() instead of rtnl_lock() */
+static ssize_t
+netdev_lock_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len,
+ int (*set)(struct net_device *, unsigned long))
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct net *net = dev_net(netdev);
+ unsigned long new;
+ int ret;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = kstrtoul(buf, 0, &new);
+ if (ret)
+ return ret;
+
+ netdev_lock(netdev);
if (dev_isalive(netdev)) {
ret = (*set)(netdev, new);
if (ret == 0)
ret = len;
}
- rtnl_unlock();
- err:
+ netdev_unlock(netdev);
+
return ret;
}
@@ -120,13 +233,13 @@ static ssize_t iflink_show(struct device *dev, struct device_attribute *attr,
{
struct net_device *ndev = to_net_dev(dev);
- return sprintf(buf, fmt_dec, dev_get_iflink(ndev));
+ return sysfs_emit(buf, fmt_dec, dev_get_iflink(ndev));
}
static DEVICE_ATTR_RO(iflink);
static ssize_t format_name_assign_type(const struct net_device *dev, char *buf)
{
- return sprintf(buf, fmt_dec, dev->name_assign_type);
+ return sysfs_emit(buf, fmt_dec, READ_ONCE(dev->name_assign_type));
}
static ssize_t name_assign_type_show(struct device *dev,
@@ -136,24 +249,28 @@ static ssize_t name_assign_type_show(struct device *dev,
struct net_device *ndev = to_net_dev(dev);
ssize_t ret = -EINVAL;
- if (ndev->name_assign_type != NET_NAME_UNKNOWN)
+ if (READ_ONCE(ndev->name_assign_type) != NET_NAME_UNKNOWN)
ret = netdev_show(dev, attr, buf, format_name_assign_type);
return ret;
}
static DEVICE_ATTR_RO(name_assign_type);
-/* use same locking rules as GIFHWADDR ioctl's */
+/* use same locking rules as GIFHWADDR ioctl's (netif_get_mac_address()) */
static ssize_t address_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct net_device *ndev = to_net_dev(dev);
ssize_t ret = -EINVAL;
- read_lock(&dev_base_lock);
+ down_read(&dev_addr_sem);
+
+ rcu_read_lock();
if (dev_isalive(ndev))
ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len);
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
+
+ up_read(&dev_addr_sem);
return ret;
}
static DEVICE_ATTR_RO(address);
@@ -162,10 +279,13 @@ static ssize_t broadcast_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *ndev = to_net_dev(dev);
+ int ret = -EINVAL;
+ rcu_read_lock();
if (dev_isalive(ndev))
- return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len);
- return -EINVAL;
+ ret = sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len);
+ rcu_read_unlock();
+ return ret;
}
static DEVICE_ATTR_RO(broadcast);
@@ -179,6 +299,14 @@ static int change_carrier(struct net_device *dev, unsigned long new_carrier)
static ssize_t carrier_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
+ struct net_device *netdev = to_net_dev(dev);
+
+ /* The check is also done in change_carrier; this helps returning early
+ * without hitting the locking section in netdev_store.
+ */
+ if (!netdev->netdev_ops->ndo_change_carrier)
+ return -EOPNOTSUPP;
+
return netdev_store(dev, attr, buf, len, change_carrier);
}
@@ -186,11 +314,24 @@ static ssize_t carrier_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
+ int ret;
- if (netif_running(netdev))
- return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev));
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
- return -EINVAL;
+ ret = -EINVAL;
+ if (netif_running(netdev)) {
+ /* Synchronize carrier state with link watch,
+ * see also rtnl_getlink().
+ */
+ linkwatch_sync_dev(netdev);
+
+ ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev));
+ }
+
+ rtnl_unlock();
+ return ret;
}
static DEVICE_ATTR_RW(carrier);
@@ -200,14 +341,22 @@ static ssize_t speed_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
int ret = -EINVAL;
- if (!rtnl_trylock())
- return restart_syscall();
+ /* The check is also done in __ethtool_get_link_ksettings; this helps
+ * returning early without hitting the locking section below.
+ */
+ if (!netdev->ethtool_ops->get_link_ksettings)
+ return ret;
+
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
+ ret = -EINVAL;
if (netif_running(netdev)) {
struct ethtool_link_ksettings cmd;
if (!__ethtool_get_link_ksettings(netdev, &cmd))
- ret = sprintf(buf, fmt_dec, cmd.base.speed);
+ ret = sysfs_emit(buf, fmt_dec, cmd.base.speed);
}
rtnl_unlock();
return ret;
@@ -220,9 +369,17 @@ static ssize_t duplex_show(struct device *dev,
struct net_device *netdev = to_net_dev(dev);
int ret = -EINVAL;
- if (!rtnl_trylock())
- return restart_syscall();
+ /* The check is also done in __ethtool_get_link_ksettings; this helps
+ * returning early without hitting the locking section below.
+ */
+ if (!netdev->ethtool_ops->get_link_ksettings)
+ return ret;
+
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
+ ret = -EINVAL;
if (netif_running(netdev)) {
struct ethtool_link_ksettings cmd;
@@ -240,7 +397,7 @@ static ssize_t duplex_show(struct device *dev,
duplex = "unknown";
break;
}
- ret = sprintf(buf, "%s\n", duplex);
+ ret = sysfs_emit(buf, "%s\n", duplex);
}
}
rtnl_unlock();
@@ -248,13 +405,25 @@ static ssize_t duplex_show(struct device *dev,
}
static DEVICE_ATTR_RO(duplex);
+static ssize_t testing_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+
+ if (netif_running(netdev))
+ return sysfs_emit(buf, fmt_dec, !!netif_testing(netdev));
+
+ return -EINVAL;
+}
+static DEVICE_ATTR_RO(testing);
+
static ssize_t dormant_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
if (netif_running(netdev))
- return sprintf(buf, fmt_dec, !!netif_dormant(netdev));
+ return sysfs_emit(buf, fmt_dec, !!netif_dormant(netdev));
return -EINVAL;
}
@@ -265,7 +434,7 @@ static const char *const operstates[] = {
"notpresent", /* currently unused */
"down",
"lowerlayerdown",
- "testing", /* currently unused */
+ "testing",
"dormant",
"up"
};
@@ -276,16 +445,14 @@ static ssize_t operstate_show(struct device *dev,
const struct net_device *netdev = to_net_dev(dev);
unsigned char operstate;
- read_lock(&dev_base_lock);
- operstate = netdev->operstate;
+ operstate = READ_ONCE(netdev->operstate);
if (!netif_running(netdev))
operstate = IF_OPER_DOWN;
- read_unlock(&dev_base_lock);
if (operstate >= ARRAY_SIZE(operstates))
return -EINVAL; /* should not happen */
- return sprintf(buf, "%s\n", operstates[operstate]);
+ return sysfs_emit(buf, "%s\n", operstates[operstate]);
}
static DEVICE_ATTR_RO(operstate);
@@ -295,9 +462,9 @@ static ssize_t carrier_changes_show(struct device *dev,
{
struct net_device *netdev = to_net_dev(dev);
- return sprintf(buf, fmt_dec,
- atomic_read(&netdev->carrier_up_count) +
- atomic_read(&netdev->carrier_down_count));
+ return sysfs_emit(buf, fmt_dec,
+ atomic_read(&netdev->carrier_up_count) +
+ atomic_read(&netdev->carrier_down_count));
}
static DEVICE_ATTR_RO(carrier_changes);
@@ -307,7 +474,7 @@ static ssize_t carrier_up_count_show(struct device *dev,
{
struct net_device *netdev = to_net_dev(dev);
- return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_up_count));
+ return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_up_count));
}
static DEVICE_ATTR_RO(carrier_up_count);
@@ -317,7 +484,7 @@ static ssize_t carrier_down_count_show(struct device *dev,
{
struct net_device *netdev = to_net_dev(dev);
- return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_down_count));
+ return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_down_count));
}
static DEVICE_ATTR_RO(carrier_down_count);
@@ -337,7 +504,7 @@ NETDEVICE_SHOW_RW(mtu, fmt_dec);
static int change_flags(struct net_device *dev, unsigned long new_flags)
{
- return dev_change_flags(dev, (unsigned int)new_flags);
+ return dev_change_flags(dev, (unsigned int)new_flags, NULL);
}
static ssize_t flags_store(struct device *dev, struct device_attribute *attr,
@@ -360,7 +527,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);
static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
{
- dev->gro_flush_timeout = val;
+ netdev_set_gro_flush_timeout(dev, val);
return 0;
}
@@ -371,17 +538,38 @@ static ssize_t gro_flush_timeout_store(struct device *dev,
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- return netdev_store(dev, attr, buf, len, change_gro_flush_timeout);
+ return netdev_lock_store(dev, attr, buf, len, change_gro_flush_timeout);
}
NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
+static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val)
+{
+ if (val > S32_MAX)
+ return -ERANGE;
+
+ netdev_set_defer_hard_irqs(dev, (u32)val);
+ return 0;
+}
+
+static ssize_t napi_defer_hard_irqs_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return netdev_lock_store(dev, attr, buf, len,
+ change_napi_defer_hard_irqs);
+}
+NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_uint);
+
static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
struct net_device *netdev = to_net_dev(dev);
struct net *net = dev_net(netdev);
size_t count = len;
- ssize_t ret = 0;
+ ssize_t ret;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
@@ -390,16 +578,15 @@ static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
if (len > 0 && buf[len - 1] == '\n')
--count;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
- if (dev_isalive(netdev)) {
- ret = dev_set_alias(netdev, buf, count);
- if (ret < 0)
- goto err;
- ret = len;
- netdev_state_change(netdev);
- }
+ ret = dev_set_alias(netdev, buf, count);
+ if (ret < 0)
+ goto err;
+ ret = len;
+ netdev_state_change(netdev);
err:
rtnl_unlock();
@@ -411,11 +598,11 @@ static ssize_t ifalias_show(struct device *dev,
{
const struct net_device *netdev = to_net_dev(dev);
char tmp[IFALIASZ];
- ssize_t ret = 0;
+ ssize_t ret;
ret = dev_get_alias(netdev, tmp, sizeof(tmp));
if (ret > 0)
- ret = sprintf(buf, "%s\n", tmp);
+ ret = sysfs_emit(buf, "%s\n", tmp);
return ret;
}
static DEVICE_ATTR_RW(ifalias);
@@ -451,18 +638,17 @@ static ssize_t phys_port_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
- ssize_t ret = -EINVAL;
+ struct netdev_phys_item_id ppid;
+ ssize_t ret;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
- if (dev_isalive(netdev)) {
- struct netdev_phys_item_id ppid;
+ ret = dev_get_phys_port_id(netdev, &ppid);
+ if (!ret)
+ ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
- ret = dev_get_phys_port_id(netdev, &ppid);
- if (!ret)
- ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id);
- }
rtnl_unlock();
return ret;
@@ -473,18 +659,17 @@ static ssize_t phys_port_name_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
- ssize_t ret = -EINVAL;
+ char name[IFNAMSIZ];
+ ssize_t ret;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
- if (dev_isalive(netdev)) {
- char name[IFNAMSIZ];
+ ret = dev_get_phys_port_name(netdev, name, sizeof(name));
+ if (!ret)
+ ret = sysfs_emit(buf, "%s\n", name);
- ret = dev_get_phys_port_name(netdev, name, sizeof(name));
- if (!ret)
- ret = sprintf(buf, "%s\n", name);
- }
rtnl_unlock();
return ret;
@@ -495,29 +680,96 @@ static ssize_t phys_switch_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct net_device *netdev = to_net_dev(dev);
- ssize_t ret = -EINVAL;
+ struct netdev_phys_item_id ppid = { };
+ ssize_t ret;
+
+ ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev);
+ if (ret)
+ return ret;
- if (!rtnl_trylock())
- return restart_syscall();
+ ret = netif_get_port_parent_id(netdev, &ppid, false);
+ if (!ret)
+ ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id);
- if (dev_isalive(netdev)) {
- struct switchdev_attr attr = {
- .orig_dev = netdev,
- .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
- .flags = SWITCHDEV_F_NO_RECURSE,
- };
-
- ret = switchdev_port_attr_get(netdev, &attr);
- if (!ret)
- ret = sprintf(buf, "%*phN\n", attr.u.ppid.id_len,
- attr.u.ppid.id);
- }
rtnl_unlock();
return ret;
}
static DEVICE_ATTR_RO(phys_switch_id);
+static struct attribute *netdev_phys_attrs[] __ro_after_init = {
+ &dev_attr_phys_port_id.attr,
+ &dev_attr_phys_port_name.attr,
+ &dev_attr_phys_switch_id.attr,
+ NULL,
+};
+
+static umode_t netdev_phys_is_visible(struct kobject *kobj,
+ struct attribute *attr, int index)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct net_device *netdev = to_net_dev(dev);
+
+ if (attr == &dev_attr_phys_port_id.attr) {
+ if (!netdev->netdev_ops->ndo_get_phys_port_id)
+ return 0;
+ } else if (attr == &dev_attr_phys_port_name.attr) {
+ if (!netdev->netdev_ops->ndo_get_phys_port_name &&
+ !netdev->devlink_port)
+ return 0;
+ } else if (attr == &dev_attr_phys_switch_id.attr) {
+ if (!netdev->netdev_ops->ndo_get_port_parent_id &&
+ !netdev->devlink_port)
+ return 0;
+ }
+
+ return attr->mode;
+}
+
+static const struct attribute_group netdev_phys_group = {
+ .attrs = netdev_phys_attrs,
+ .is_visible = netdev_phys_is_visible,
+};
+
+static ssize_t threaded_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct net_device *netdev = to_net_dev(dev);
+ ssize_t ret = -EINVAL;
+
+ rcu_read_lock();
+
+ if (dev_isalive(netdev))
+ ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded));
+
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int modify_napi_threaded(struct net_device *dev, unsigned long val)
+{
+ int ret;
+
+ if (list_empty(&dev->napi_list))
+ return -EOPNOTSUPP;
+
+ if (val != 0 && val != 1)
+ return -EOPNOTSUPP;
+
+ ret = netif_set_threaded(dev, val);
+
+ return ret;
+}
+
+static ssize_t threaded_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ return netdev_lock_store(dev, attr, buf, len, modify_napi_threaded);
+}
+static DEVICE_ATTR_RW(threaded);
+
static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_netdev_group.attr,
&dev_attr_type.attr,
@@ -534,6 +786,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_speed.attr,
&dev_attr_duplex.attr,
&dev_attr_dormant.attr,
+ &dev_attr_testing.attr,
&dev_attr_operstate.attr,
&dev_attr_carrier_changes.attr,
&dev_attr_ifalias.attr,
@@ -542,12 +795,11 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_flags.attr,
&dev_attr_tx_queue_len.attr,
&dev_attr_gro_flush_timeout.attr,
- &dev_attr_phys_port_id.attr,
- &dev_attr_phys_port_name.attr,
- &dev_attr_phys_switch_id.attr,
+ &dev_attr_napi_defer_hard_irqs.attr,
&dev_attr_proto_down.attr,
&dev_attr_carrier_up_count.attr,
&dev_attr_carrier_down_count.attr,
+ &dev_attr_threaded.attr,
NULL,
};
ATTRIBUTE_GROUPS(net_class);
@@ -563,14 +815,14 @@ static ssize_t netstat_show(const struct device *d,
WARN_ON(offset > sizeof(struct rtnl_link_stats64) ||
offset % sizeof(u64) != 0);
- read_lock(&dev_base_lock);
+ rcu_read_lock();
if (dev_isalive(dev)) {
struct rtnl_link_stats64 temp;
const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
- ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset));
+ ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset));
}
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
return ret;
}
@@ -642,7 +894,6 @@ static const struct attribute_group netstat_group = {
.attrs = netstat_attrs,
};
-#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
static struct attribute *wireless_attrs[] = {
NULL
};
@@ -651,7 +902,19 @@ static const struct attribute_group wireless_group = {
.name = "wireless",
.attrs = wireless_attrs,
};
+
+static bool wireless_group_needed(struct net_device *ndev)
+{
+#if IS_ENABLED(CONFIG_CFG80211)
+ if (ndev->ieee80211_ptr)
+ return true;
+#endif
+#if IS_ENABLED(CONFIG_WIRELESS_EXT)
+ if (ndev->wireless_handlers)
+ return true;
#endif
+ return false;
+}
#else /* CONFIG_SYSFS */
#define net_class_groups NULL
@@ -708,40 +971,25 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf)
for (i = 0; i < map->len; i++)
cpumask_set_cpu(map->cpus[i], mask);
- len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
+ len = sysfs_emit(buf, "%*pb\n", cpumask_pr_args(mask));
rcu_read_unlock();
free_cpumask_var(mask);
return len < PAGE_SIZE ? len : -EINVAL;
}
-static ssize_t store_rps_map(struct netdev_rx_queue *queue,
- const char *buf, size_t len)
+static int netdev_rx_queue_set_rps_mask(struct netdev_rx_queue *queue,
+ cpumask_var_t mask)
{
- struct rps_map *old_map, *map;
- cpumask_var_t mask;
- int err, cpu, i;
static DEFINE_MUTEX(rps_map_mutex);
-
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- if (!alloc_cpumask_var(&mask, GFP_KERNEL))
- return -ENOMEM;
-
- err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
- if (err) {
- free_cpumask_var(mask);
- return err;
- }
+ struct rps_map *old_map, *map;
+ int cpu, i;
map = kzalloc(max_t(unsigned int,
RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
GFP_KERNEL);
- if (!map) {
- free_cpumask_var(mask);
+ if (!map)
return -ENOMEM;
- }
i = 0;
for_each_cpu_and(cpu, mask, cpu_online_mask)
@@ -760,17 +1008,53 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
rcu_assign_pointer(queue->rps_map, map);
if (map)
- static_key_slow_inc(&rps_needed);
+ static_branch_inc(&rps_needed);
if (old_map)
- static_key_slow_dec(&rps_needed);
+ static_branch_dec(&rps_needed);
mutex_unlock(&rps_map_mutex);
if (old_map)
kfree_rcu(old_map, rcu);
+ return 0;
+}
+
+int rps_cpumask_housekeeping(struct cpumask *mask)
+{
+ if (!cpumask_empty(mask)) {
+ cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN));
+ cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ));
+ if (cpumask_empty(mask))
+ return -EINVAL;
+ }
+ return 0;
+}
+static ssize_t store_rps_map(struct netdev_rx_queue *queue,
+ const char *buf, size_t len)
+{
+ cpumask_var_t mask;
+ int err;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
+ if (err)
+ goto out;
+
+ err = rps_cpumask_housekeeping(mask);
+ if (err)
+ goto out;
+
+ err = netdev_rx_queue_set_rps_mask(queue, mask);
+
+out:
free_cpumask_var(mask);
- return len;
+ return err ? : len;
}
static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
@@ -782,10 +1066,10 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
rcu_read_lock();
flow_table = rcu_dereference(queue->rps_flow_table);
if (flow_table)
- val = (unsigned long)flow_table->mask + 1;
+ val = 1UL << flow_table->log;
rcu_read_unlock();
- return sprintf(buf, "%lu\n", val);
+ return sysfs_emit(buf, "%lu\n", val);
}
static void rps_dev_flow_table_release(struct rcu_head *rcu)
@@ -835,9 +1119,11 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
if (!table)
return -ENOMEM;
- table->mask = mask;
- for (count = 0; count <= mask; count++)
+ table->log = ilog2(mask) + 1;
+ for (count = 0; count <= mask; count++) {
table->flows[count].cpu = RPS_NO_CPU;
+ table->flows[count].filter = RPS_NO_FILTER;
+ }
} else {
table = NULL;
}
@@ -869,6 +1155,7 @@ static struct attribute *rx_queue_default_attrs[] __ro_after_init = {
#endif
NULL
};
+ATTRIBUTE_GROUPS(rx_queue_default);
static void rx_queue_release(struct kobject *kobj)
{
@@ -891,22 +1178,22 @@ static void rx_queue_release(struct kobject *kobj)
#endif
memset(kobj, 0, sizeof(*kobj));
- dev_put(queue->dev);
+ netdev_put(queue->dev, &queue->dev_tracker);
}
-static const void *rx_queue_namespace(struct kobject *kobj)
+static const void *rx_queue_namespace(const struct kobject *kobj)
{
struct netdev_rx_queue *queue = to_rx_queue(kobj);
struct device *dev = &queue->dev->dev;
const void *ns = NULL;
- if (dev->class && dev->class->ns_type)
+ if (dev->class && dev->class->namespace)
ns = dev->class->namespace(dev);
return ns;
}
-static void rx_queue_get_ownership(struct kobject *kobj,
+static void rx_queue_get_ownership(const struct kobject *kobj,
kuid_t *uid, kgid_t *gid)
{
const struct net *net = rx_queue_namespace(kobj);
@@ -914,36 +1201,107 @@ static void rx_queue_get_ownership(struct kobject *kobj,
net_ns_get_ownership(net, uid, gid);
}
-static struct kobj_type rx_queue_ktype __ro_after_init = {
+static const struct kobj_type rx_queue_ktype = {
.sysfs_ops = &rx_queue_sysfs_ops,
.release = rx_queue_release,
- .default_attrs = rx_queue_default_attrs,
.namespace = rx_queue_namespace,
.get_ownership = rx_queue_get_ownership,
};
+static int rx_queue_default_mask(struct net_device *dev,
+ struct netdev_rx_queue *queue)
+{
+#if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL)
+ struct cpumask *rps_default_mask;
+ int res = 0;
+
+ mutex_lock(&rps_default_mask_mutex);
+
+ rps_default_mask = dev_net(dev)->core.rps_default_mask;
+ if (rps_default_mask && !cpumask_empty(rps_default_mask))
+ res = netdev_rx_queue_set_rps_mask(queue, rps_default_mask);
+
+ mutex_unlock(&rps_default_mask_mutex);
+
+ return res;
+#else
+ return 0;
+#endif
+}
+
static int rx_queue_add_kobject(struct net_device *dev, int index)
{
struct netdev_rx_queue *queue = dev->_rx + index;
struct kobject *kobj = &queue->kobj;
int error = 0;
+ /* Rx queues are cleared in rx_queue_release to allow later
+ * re-registration. This is triggered when their kobj refcount is
+ * dropped.
+ *
+ * If a queue is removed while both a read (or write) operation and
+ * the re-addition of the same queue are pending (waiting on rtnl_lock)
+ * it might happen that the re-addition will execute before the read,
+ * so the initial removal never happens (the queue's kobj refcount
+ * won't drop enough because of the pending read). In such a rare case,
+ * return to allow the removal operation to complete.
+ */
+ if (unlikely(kobj->state_initialized)) {
+ netdev_warn_once(dev, "Cannot re-add rx queues before their removal completed");
+ return -EAGAIN;
+ }
+
+ /* A later kobject_put() will trigger the rx_queue_release call, which
+ * decreases the dev refcount: take that reference here
+ */
+ netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL);
+
kobj->kset = dev->queues_kset;
error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
"rx-%u", index);
if (error)
- return error;
+ goto err;
+
+ queue->groups = rx_queue_default_groups;
+ error = sysfs_create_groups(kobj, queue->groups);
+ if (error)
+ goto err;
if (dev->sysfs_rx_queue_group) {
error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group);
- if (error) {
- kobject_put(kobj);
- return error;
- }
+ if (error)
+ goto err_default_groups;
}
+ error = rx_queue_default_mask(dev, queue);
+ if (error)
+ goto err_default_groups;
+
kobject_uevent(kobj, KOBJ_ADD);
- dev_hold(queue->dev);
+
+ return error;
+
+err_default_groups:
+ sysfs_remove_groups(kobj, queue->groups);
+err:
+ kobject_put(kobj);
+ return error;
+}
+
+static int rx_queue_change_owner(struct net_device *dev, int index, kuid_t kuid,
+ kgid_t kgid)
+{
+ struct netdev_rx_queue *queue = dev->_rx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error;
+
+ error = sysfs_change_owner(kobj, kuid, kgid);
+ if (error)
+ return error;
+
+ if (dev->sysfs_rx_queue_group)
+ error = sysfs_group_change_owner(
+ kobj, dev->sysfs_rx_queue_group, kuid, kgid);
return error;
}
@@ -969,12 +1327,14 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
}
while (--i >= new_num) {
- struct kobject *kobj = &dev->_rx[i].kobj;
+ struct netdev_rx_queue *queue = &dev->_rx[i];
+ struct kobject *kobj = &queue->kobj;
- if (!refcount_read(&dev_net(dev)->count))
+ if (!check_net(dev_net(dev)))
kobj->uevent_suppress = 1;
if (dev->sysfs_rx_queue_group)
sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
+ sysfs_remove_groups(kobj, queue->groups);
kobject_put(kobj);
}
@@ -984,15 +1344,40 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
#endif
}
+static int net_rx_queue_change_owner(struct net_device *dev, int num,
+ kuid_t kuid, kgid_t kgid)
+{
+#ifdef CONFIG_SYSFS
+ int error = 0;
+ int i;
+
+#ifndef CONFIG_RPS
+ if (!dev->sysfs_rx_queue_group)
+ return 0;
+#endif
+ for (i = 0; i < num; i++) {
+ error = rx_queue_change_owner(dev, i, kuid, kgid);
+ if (error)
+ break;
+ }
+
+ return error;
+#else
+ return 0;
+#endif
+}
+
#ifdef CONFIG_SYSFS
/*
* netdev_queue sysfs structures and functions.
*/
struct netdev_queue_attribute {
struct attribute attr;
- ssize_t (*show)(struct netdev_queue *queue, char *buf);
- ssize_t (*store)(struct netdev_queue *queue,
- const char *buf, size_t len);
+ ssize_t (*show)(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf);
+ ssize_t (*store)(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len);
};
#define to_netdev_queue_attr(_attr) \
container_of(_attr, struct netdev_queue_attribute, attr)
@@ -1009,7 +1394,7 @@ static ssize_t netdev_queue_attr_show(struct kobject *kobj,
if (!attribute->show)
return -EIO;
- return attribute->show(queue, buf);
+ return attribute->show(kobj, attr, queue, buf);
}
static ssize_t netdev_queue_attr_store(struct kobject *kobj,
@@ -1023,7 +1408,7 @@ static ssize_t netdev_queue_attr_store(struct kobject *kobj,
if (!attribute->store)
return -EIO;
- return attribute->store(queue, buf, count);
+ return attribute->store(kobj, attr, queue, buf, count);
}
static const struct sysfs_ops netdev_queue_sysfs_ops = {
@@ -1031,15 +1416,12 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = {
.store = netdev_queue_attr_store,
};
-static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf)
+static ssize_t tx_timeout_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
- unsigned long trans_timeout;
+ unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout);
- spin_lock_irq(&queue->_xmit_lock);
- trans_timeout = queue->trans_timeout;
- spin_unlock_irq(&queue->_xmit_lock);
-
- return sprintf(buf, "%lu", trans_timeout);
+ return sysfs_emit(buf, fmt_ulong, trans_timeout);
}
static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
@@ -1053,22 +1435,29 @@ static unsigned int get_netdev_queue_index(struct netdev_queue *queue)
return i;
}
-static ssize_t traffic_class_show(struct netdev_queue *queue,
- char *buf)
+static ssize_t traffic_class_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct net_device *dev = queue->dev;
- int index;
- int tc;
+ int num_tc, tc, index, ret;
if (!netif_is_multiqueue(dev))
return -ENOENT;
+ ret = sysfs_rtnl_lock(kobj, attr, queue->dev);
+ if (ret)
+ return ret;
+
index = get_netdev_queue_index(queue);
/* If queue belongs to subordinate dev use its TC mapping */
dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+ num_tc = dev->num_tc;
tc = netdev_txq_to_tc(dev, index);
+
+ rtnl_unlock();
+
if (tc < 0)
return -EINVAL;
@@ -1079,43 +1468,55 @@ static ssize_t traffic_class_show(struct netdev_queue *queue,
* belongs to the root device it will be reported with just the
* traffic class, so just "0" for TC 0 for example.
*/
- return dev->num_tc < 0 ? sprintf(buf, "%u%d\n", tc, dev->num_tc) :
- sprintf(buf, "%u\n", tc);
+ return num_tc < 0 ? sysfs_emit(buf, "%d%d\n", tc, num_tc) :
+ sysfs_emit(buf, "%d\n", tc);
}
#ifdef CONFIG_XPS
-static ssize_t tx_maxrate_show(struct netdev_queue *queue,
- char *buf)
+static ssize_t tx_maxrate_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
- return sprintf(buf, "%lu\n", queue->tx_maxrate);
+ return sysfs_emit(buf, "%lu\n", queue->tx_maxrate);
}
-static ssize_t tx_maxrate_store(struct netdev_queue *queue,
- const char *buf, size_t len)
+static ssize_t tx_maxrate_store(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
{
- struct net_device *dev = queue->dev;
int err, index = get_netdev_queue_index(queue);
+ struct net_device *dev = queue->dev;
u32 rate = 0;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
+ /* The check is also done later; this helps returning early without
+ * hitting the locking section below.
+ */
+ if (!dev->netdev_ops->ndo_set_tx_maxrate)
+ return -EOPNOTSUPP;
+
err = kstrtou32(buf, 10, &rate);
if (err < 0)
return err;
- if (!rtnl_trylock())
- return restart_syscall();
+ err = sysfs_rtnl_lock(kobj, attr, dev);
+ if (err)
+ return err;
err = -EOPNOTSUPP;
+ netdev_lock_ops(dev);
if (dev->netdev_ops->ndo_set_tx_maxrate)
err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate);
+ netdev_unlock_ops(dev);
- rtnl_unlock();
if (!err) {
queue->tx_maxrate = rate;
+ rtnl_unlock();
return len;
}
+
+ rtnl_unlock();
return err;
}
@@ -1135,7 +1536,7 @@ static struct netdev_queue_attribute queue_traffic_class __ro_after_init
*/
static ssize_t bql_show(char *buf, unsigned int value)
{
- return sprintf(buf, "%u\n", value);
+ return sysfs_emit(buf, "%u\n", value);
}
static ssize_t bql_set(const char *buf, const size_t count,
@@ -1159,16 +1560,17 @@ static ssize_t bql_set(const char *buf, const size_t count,
return count;
}
-static ssize_t bql_show_hold_time(struct netdev_queue *queue,
- char *buf)
+static ssize_t bql_show_hold_time(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct dql *dql = &queue->dql;
- return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
+ return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
}
-static ssize_t bql_set_hold_time(struct netdev_queue *queue,
- const char *buf, size_t len)
+static ssize_t bql_set_hold_time(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
{
struct dql *dql = &queue->dql;
unsigned int value;
@@ -1187,25 +1589,92 @@ static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init
= __ATTR(hold_time, 0644,
bql_show_hold_time, bql_set_hold_time);
-static ssize_t bql_show_inflight(struct netdev_queue *queue,
- char *buf)
+static ssize_t bql_show_stall_thrs(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct dql *dql = &queue->dql;
- return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed);
+ return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs));
+}
+
+static ssize_t bql_set_stall_thrs(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+ struct dql *dql = &queue->dql;
+ unsigned int value;
+ int err;
+
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+
+ value = msecs_to_jiffies(value);
+ if (value && (value < 4 || value > 4 / 2 * BITS_PER_LONG))
+ return -ERANGE;
+
+ if (!dql->stall_thrs && value)
+ dql->last_reap = jiffies;
+ /* Force last_reap to be live */
+ smp_wmb();
+ dql->stall_thrs = value;
+
+ return len;
+}
+
+static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init =
+ __ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs);
+
+static ssize_t bql_show_stall_max(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max));
+}
+
+static ssize_t bql_set_stall_max(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
+{
+ WRITE_ONCE(queue->dql.stall_max, 0);
+ return len;
+}
+
+static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init =
+ __ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max);
+
+static ssize_t bql_show_stall_cnt(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sysfs_emit(buf, "%lu\n", dql->stall_cnt);
+}
+
+static struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init =
+ __ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL);
+
+static ssize_t bql_show_inflight(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sysfs_emit(buf, "%u\n", dql->num_queued - dql->num_completed);
}
static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init =
__ATTR(inflight, 0444, bql_show_inflight, NULL);
#define BQL_ATTR(NAME, FIELD) \
-static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \
- char *buf) \
+static ssize_t bql_show_ ## NAME(struct kobject *kobj, \
+ struct attribute *attr, \
+ struct netdev_queue *queue, char *buf) \
{ \
return bql_show(buf, queue->dql.FIELD); \
} \
\
-static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \
+static ssize_t bql_set_ ## NAME(struct kobject *kobj, \
+ struct attribute *attr, \
+ struct netdev_queue *queue, \
const char *buf, size_t len) \
{ \
return bql_set(buf, len, &queue->dql.FIELD); \
@@ -1225,6 +1694,9 @@ static struct attribute *dql_attrs[] __ro_after_init = {
&bql_limit_min_attribute.attr,
&bql_hold_time_attribute.attr,
&bql_inflight_attribute.attr,
+ &bql_stall_thrs_attribute.attr,
+ &bql_stall_cnt_attribute.attr,
+ &bql_stall_max_attribute.attr,
NULL
};
@@ -1232,71 +1704,105 @@ static const struct attribute_group dql_group = {
.name = "byte_queue_limits",
.attrs = dql_attrs,
};
+#else
+/* Fake declaration, all the code using it should be dead */
+static const struct attribute_group dql_group = {};
#endif /* CONFIG_BQL */
#ifdef CONFIG_XPS
-static ssize_t xps_cpus_show(struct netdev_queue *queue,
- char *buf)
+static ssize_t xps_queue_show(struct net_device *dev, unsigned int index,
+ int tc, char *buf, enum xps_map_type type)
{
- struct net_device *dev = queue->dev;
- int cpu, len, num_tc = 1, tc = 0;
struct xps_dev_maps *dev_maps;
- cpumask_var_t mask;
- unsigned long index;
+ unsigned long *mask;
+ unsigned int nr_ids;
+ int j, len;
- if (!netif_is_multiqueue(dev))
- return -ENOENT;
+ rcu_read_lock();
+ dev_maps = rcu_dereference(dev->xps_maps[type]);
- index = get_netdev_queue_index(queue);
+ /* Default to nr_cpu_ids/dev->num_rx_queues and do not just return 0
+ * when dev_maps hasn't been allocated yet, to be backward compatible.
+ */
+ nr_ids = dev_maps ? dev_maps->nr_ids :
+ (type == XPS_CPUS ? nr_cpu_ids : dev->num_rx_queues);
- if (dev->num_tc) {
- /* Do not allow XPS on subordinate device directly */
- num_tc = dev->num_tc;
- if (num_tc < 0)
- return -EINVAL;
+ mask = bitmap_zalloc(nr_ids, GFP_NOWAIT);
+ if (!mask) {
+ rcu_read_unlock();
+ return -ENOMEM;
+ }
- /* If queue belongs to subordinate dev use its map */
- dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+ if (!dev_maps || tc >= dev_maps->num_tc)
+ goto out_no_maps;
- tc = netdev_txq_to_tc(dev, index);
- if (tc < 0)
- return -EINVAL;
- }
+ for (j = 0; j < nr_ids; j++) {
+ int i, tci = j * dev_maps->num_tc + tc;
+ struct xps_map *map;
- if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
- return -ENOMEM;
+ map = rcu_dereference(dev_maps->attr_map[tci]);
+ if (!map)
+ continue;
- rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_cpus_map);
- if (dev_maps) {
- for_each_possible_cpu(cpu) {
- int i, tci = cpu * num_tc + tc;
- struct xps_map *map;
-
- map = rcu_dereference(dev_maps->attr_map[tci]);
- if (!map)
- continue;
-
- for (i = map->len; i--;) {
- if (map->queues[i] == index) {
- cpumask_set_cpu(cpu, mask);
- break;
- }
+ for (i = map->len; i--;) {
+ if (map->queues[i] == index) {
+ __set_bit(j, mask);
+ break;
}
}
}
+out_no_maps:
rcu_read_unlock();
- len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
- free_cpumask_var(mask);
+ len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids);
+ bitmap_free(mask);
+
return len < PAGE_SIZE ? len : -EINVAL;
}
-static ssize_t xps_cpus_store(struct netdev_queue *queue,
- const char *buf, size_t len)
+static ssize_t xps_cpus_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
+{
+ struct net_device *dev = queue->dev;
+ unsigned int index;
+ int len, tc, ret;
+
+ if (!netif_is_multiqueue(dev))
+ return -ENOENT;
+
+ index = get_netdev_queue_index(queue);
+
+ ret = sysfs_rtnl_lock(kobj, attr, queue->dev);
+ if (ret)
+ return ret;
+
+ /* If queue belongs to subordinate dev use its map */
+ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
+ tc = netdev_txq_to_tc(dev, index);
+ if (tc < 0) {
+ rtnl_unlock();
+ return -EINVAL;
+ }
+
+ /* Increase the net device refcnt to make sure it won't be freed while
+ * xps_queue_show is running.
+ */
+ dev_hold(dev);
+ rtnl_unlock();
+
+ len = xps_queue_show(dev, index, tc, buf, XPS_CPUS);
+
+ dev_put(dev);
+ return len;
+}
+
+static ssize_t xps_cpus_store(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
+ size_t len)
{
struct net_device *dev = queue->dev;
- unsigned long index;
+ unsigned int index;
cpumask_var_t mask;
int err;
@@ -1317,7 +1823,14 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
return err;
}
+ err = sysfs_rtnl_lock(kobj, attr, dev);
+ if (err) {
+ free_cpumask_var(mask);
+ return err;
+ }
+
err = netif_set_xps_queue(dev, mask, index);
+ rtnl_unlock();
free_cpumask_var(mask);
@@ -1327,69 +1840,46 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
= __ATTR_RW(xps_cpus);
-static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
+static ssize_t xps_rxqs_show(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, char *buf)
{
struct net_device *dev = queue->dev;
- struct xps_dev_maps *dev_maps;
- unsigned long *mask, index;
- int j, len, num_tc = 1, tc = 0;
+ unsigned int index;
+ int tc, ret;
index = get_netdev_queue_index(queue);
- if (dev->num_tc) {
- num_tc = dev->num_tc;
- tc = netdev_txq_to_tc(dev, index);
- if (tc < 0)
- return -EINVAL;
- }
- mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
- GFP_KERNEL);
- if (!mask)
- return -ENOMEM;
-
- rcu_read_lock();
- dev_maps = rcu_dereference(dev->xps_rxqs_map);
- if (!dev_maps)
- goto out_no_maps;
-
- for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues),
- j < dev->num_rx_queues;) {
- int i, tci = j * num_tc + tc;
- struct xps_map *map;
-
- map = rcu_dereference(dev_maps->attr_map[tci]);
- if (!map)
- continue;
+ ret = sysfs_rtnl_lock(kobj, attr, dev);
+ if (ret)
+ return ret;
- for (i = map->len; i--;) {
- if (map->queues[i] == index) {
- set_bit(j, mask);
- break;
- }
- }
- }
-out_no_maps:
- rcu_read_unlock();
+ tc = netdev_txq_to_tc(dev, index);
- len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
- kfree(mask);
+ /* Increase the net device refcnt to make sure it won't be freed while
+ * xps_queue_show is running.
+ */
+ dev_hold(dev);
+ rtnl_unlock();
- return len < PAGE_SIZE ? len : -EINVAL;
+ ret = tc >= 0 ? xps_queue_show(dev, index, tc, buf, XPS_RXQS) : -EINVAL;
+ dev_put(dev);
+ return ret;
}
-static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
+static ssize_t xps_rxqs_store(struct kobject *kobj, struct attribute *attr,
+ struct netdev_queue *queue, const char *buf,
size_t len)
{
struct net_device *dev = queue->dev;
struct net *net = dev_net(dev);
- unsigned long *mask, index;
+ unsigned long *mask;
+ unsigned int index;
int err;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
- GFP_KERNEL);
+ mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL);
if (!mask)
return -ENOMEM;
@@ -1397,15 +1887,23 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
err = bitmap_parse(buf, len, mask, dev->num_rx_queues);
if (err) {
- kfree(mask);
+ bitmap_free(mask);
+ return err;
+ }
+
+ err = sysfs_rtnl_lock(kobj, attr, dev);
+ if (err) {
+ bitmap_free(mask);
return err;
}
cpus_read_lock();
- err = __netif_set_xps_queue(dev, mask, index, true);
+ err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS);
cpus_read_unlock();
- kfree(mask);
+ rtnl_unlock();
+
+ bitmap_free(mask);
return err ? : len;
}
@@ -1423,28 +1921,29 @@ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
#endif
NULL
};
+ATTRIBUTE_GROUPS(netdev_queue_default);
static void netdev_queue_release(struct kobject *kobj)
{
struct netdev_queue *queue = to_netdev_queue(kobj);
memset(kobj, 0, sizeof(*kobj));
- dev_put(queue->dev);
+ netdev_put(queue->dev, &queue->dev_tracker);
}
-static const void *netdev_queue_namespace(struct kobject *kobj)
+static const void *netdev_queue_namespace(const struct kobject *kobj)
{
struct netdev_queue *queue = to_netdev_queue(kobj);
struct device *dev = &queue->dev->dev;
const void *ns = NULL;
- if (dev->class && dev->class->ns_type)
+ if (dev->class && dev->class->namespace)
ns = dev->class->namespace(dev);
return ns;
}
-static void netdev_queue_get_ownership(struct kobject *kobj,
+static void netdev_queue_get_ownership(const struct kobject *kobj,
kuid_t *uid, kgid_t *gid)
{
const struct net *net = netdev_queue_namespace(kobj);
@@ -1452,38 +1951,90 @@ static void netdev_queue_get_ownership(struct kobject *kobj,
net_ns_get_ownership(net, uid, gid);
}
-static struct kobj_type netdev_queue_ktype __ro_after_init = {
+static const struct kobj_type netdev_queue_ktype = {
.sysfs_ops = &netdev_queue_sysfs_ops,
.release = netdev_queue_release,
- .default_attrs = netdev_queue_default_attrs,
.namespace = netdev_queue_namespace,
.get_ownership = netdev_queue_get_ownership,
};
+static bool netdev_uses_bql(const struct net_device *dev)
+{
+ if (dev->lltx || (dev->priv_flags & IFF_NO_QUEUE))
+ return false;
+
+ return IS_ENABLED(CONFIG_BQL);
+}
+
static int netdev_queue_add_kobject(struct net_device *dev, int index)
{
struct netdev_queue *queue = dev->_tx + index;
struct kobject *kobj = &queue->kobj;
int error = 0;
+ /* Tx queues are cleared in netdev_queue_release to allow later
+ * re-registration. This is triggered when their kobj refcount is
+ * dropped.
+ *
+ * If a queue is removed while both a read (or write) operation and
+ * the re-addition of the same queue are pending (waiting on rtnl_lock),
+ * it might happen that the re-addition executes before the read, so
+ * the initial removal never happens (the queue's kobj refcount
+ * won't drop enough because of the pending read). In such a rare case,
+ * return to allow the removal operation to complete.
+ */
+ if (unlikely(kobj->state_initialized)) {
+ netdev_warn_once(dev, "Cannot re-add tx queues before their removal completed");
+ return -EAGAIN;
+ }
+
+ /* A later kobject_put() will trigger the netdev_queue_release() call,
+ * which drops a dev reference: take that reference here.
+ */
+ netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL);
+
kobj->kset = dev->queues_kset;
error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
"tx-%u", index);
if (error)
- return error;
+ goto err;
-#ifdef CONFIG_BQL
- error = sysfs_create_group(kobj, &dql_group);
- if (error) {
- kobject_put(kobj);
- return error;
+ queue->groups = netdev_queue_default_groups;
+ error = sysfs_create_groups(kobj, queue->groups);
+ if (error)
+ goto err;
+
+ if (netdev_uses_bql(dev)) {
+ error = sysfs_create_group(kobj, &dql_group);
+ if (error)
+ goto err_default_groups;
}
-#endif
kobject_uevent(kobj, KOBJ_ADD);
- dev_hold(queue->dev);
-
return 0;
+
+err_default_groups:
+ sysfs_remove_groups(kobj, queue->groups);
+err:
+ kobject_put(kobj);
+ return error;
+}
+
+static int tx_queue_change_owner(struct net_device *ndev, int index,
+ kuid_t kuid, kgid_t kgid)
+{
+ struct netdev_queue *queue = ndev->_tx + index;
+ struct kobject *kobj = &queue->kobj;
+ int error;
+
+ error = sysfs_change_owner(kobj, kuid, kgid);
+ if (error)
+ return error;
+
+ if (netdev_uses_bql(ndev))
+ error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid);
+
+ return error;
}
#endif /* CONFIG_SYSFS */
@@ -1494,6 +2045,13 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
int i;
int error = 0;
+ /* Tx queue kobjects are allowed to be updated when a device is being
+ * unregistered, but solely to remove queues from qdiscs. Any path
+ * adding queues should be fixed.
+ */
+ WARN(dev->reg_state == NETREG_UNREGISTERING && new_num > old_num,
+ "New queues can't be registered after device unregistration.");
+
for (i = old_num; i < new_num; i++) {
error = netdev_queue_add_kobject(dev, i);
if (error) {
@@ -1505,11 +2063,13 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
while (--i >= new_num) {
struct netdev_queue *queue = dev->_tx + i;
- if (!refcount_read(&dev_net(dev)->count))
+ if (!check_net(dev_net(dev)))
queue->kobj.uevent_suppress = 1;
-#ifdef CONFIG_BQL
- sysfs_remove_group(&queue->kobj, &dql_group);
-#endif
+
+ if (netdev_uses_bql(dev))
+ sysfs_remove_group(&queue->kobj, &dql_group);
+
+ sysfs_remove_groups(&queue->kobj, queue->groups);
kobject_put(&queue->kobj);
}
@@ -1519,6 +2079,25 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
#endif /* CONFIG_SYSFS */
}
+static int net_tx_queue_change_owner(struct net_device *dev, int num,
+ kuid_t kuid, kgid_t kgid)
+{
+#ifdef CONFIG_SYSFS
+ int error = 0;
+ int i;
+
+ for (i = 0; i < num; i++) {
+ error = tx_queue_change_owner(dev, i, kuid, kgid);
+ if (error)
+ break;
+ }
+
+ return error;
+#else
+ return 0;
+#endif /* CONFIG_SYSFS */
+}
+
static int register_queue_kobjects(struct net_device *dev)
{
int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
@@ -1547,9 +2126,37 @@ static int register_queue_kobjects(struct net_device *dev)
error:
netdev_queue_update_kobjects(dev, txq, 0);
net_rx_queue_update_kobjects(dev, rxq, 0);
+#ifdef CONFIG_SYSFS
+ kset_unregister(dev->queues_kset);
+#endif
return error;
}
+static int queue_change_owner(struct net_device *ndev, kuid_t kuid, kgid_t kgid)
+{
+ int error = 0, real_rx = 0, real_tx = 0;
+
+#ifdef CONFIG_SYSFS
+ if (ndev->queues_kset) {
+ error = sysfs_change_owner(&ndev->queues_kset->kobj, kuid, kgid);
+ if (error)
+ return error;
+ }
+ real_rx = ndev->real_num_rx_queues;
+#endif
+ real_tx = ndev->real_num_tx_queues;
+
+ error = net_rx_queue_change_owner(ndev, real_rx, kuid, kgid);
+ if (error)
+ return error;
+
+ error = net_tx_queue_change_owner(ndev, real_tx, kuid, kgid);
+ if (error)
+ return error;
+
+ return 0;
+}
+
static void remove_queue_kobjects(struct net_device *dev)
{
int real_rx = 0, real_tx = 0;
@@ -1561,6 +2168,11 @@ static void remove_queue_kobjects(struct net_device *dev)
net_rx_queue_update_kobjects(dev, real_rx, 0);
netdev_queue_update_kobjects(dev, real_tx, 0);
+
+ netdev_lock_ops(dev);
+ dev->real_num_rx_queues = 0;
+ dev->real_num_tx_queues = 0;
+ netdev_unlock_ops(dev);
#ifdef CONFIG_SYSFS
kset_unregister(dev->queues_kset);
#endif
@@ -1603,9 +2215,9 @@ const struct kobj_ns_type_operations net_ns_type_operations = {
};
EXPORT_SYMBOL_GPL(net_ns_type_operations);
-static int netdev_uevent(struct device *d, struct kobj_uevent_env *env)
+static int netdev_uevent(const struct device *d, struct kobj_uevent_env *env)
{
- struct net_device *dev = to_net_dev(d);
+ const struct net_device *dev = to_net_dev(d);
int retval;
/* pass interface to uevent. */
@@ -1637,25 +2249,25 @@ static void netdev_release(struct device *d)
* device is dead and about to be freed.
*/
kfree(rcu_access_pointer(dev->ifalias));
- netdev_freemem(dev);
+ kvfree(dev);
}
-static const void *net_namespace(struct device *d)
+static const void *net_namespace(const struct device *d)
{
- struct net_device *dev = to_net_dev(d);
+ const struct net_device *dev = to_net_dev(d);
return dev_net(dev);
}
-static void net_get_ownership(struct device *d, kuid_t *uid, kgid_t *gid)
+static void net_get_ownership(const struct device *d, kuid_t *uid, kgid_t *gid)
{
- struct net_device *dev = to_net_dev(d);
+ const struct net_device *dev = to_net_dev(d);
const struct net *net = dev_net(dev);
net_ns_get_ownership(net, uid, gid);
}
-static struct class net_class __ro_after_init = {
+static const struct class net_class = {
.name = "net",
.dev_release = netdev_release,
.dev_groups = net_class_groups,
@@ -1665,15 +2277,15 @@ static struct class net_class __ro_after_init = {
.get_ownership = net_get_ownership,
};
-#ifdef CONFIG_OF_NET
+#ifdef CONFIG_OF
static int of_dev_node_match(struct device *dev, const void *data)
{
- int ret = 0;
-
- if (dev->parent)
- ret = dev->parent->of_node == data;
+ for (; dev; dev = dev->parent) {
+ if (dev->of_node == data)
+ return 1;
+ }
- return ret == 0 ? dev->of_node == data : ret;
+ return 0;
}
/*
@@ -1705,7 +2317,7 @@ void netdev_unregister_kobject(struct net_device *ndev)
{
struct device *dev = &ndev->dev;
- if (!refcount_read(&dev_net(ndev)->count))
+ if (!check_net(dev_net(ndev)))
dev_set_uevent_suppress(dev, 1);
kobject_get(&dev->kobj);
@@ -1737,15 +2349,10 @@ int netdev_register_kobject(struct net_device *ndev)
groups++;
*groups++ = &netstat_group;
+ *groups++ = &netdev_phys_group;
-#if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211)
- if (ndev->ieee80211_ptr)
- *groups++ = &wireless_group;
-#if IS_ENABLED(CONFIG_WIRELESS_EXT)
- else if (ndev->wireless_handlers)
+ if (wireless_group_needed(ndev))
*groups++ = &wireless_group;
-#endif
-#endif
#endif /* CONFIG_SYSFS */
error = device_add(dev);
@@ -1763,6 +2370,37 @@ int netdev_register_kobject(struct net_device *ndev)
return error;
}
+/* Change owner for sysfs entries when moving network devices across network
+ * namespaces owned by different user namespaces.
+ */
+int netdev_change_owner(struct net_device *ndev, const struct net *net_old,
+ const struct net *net_new)
+{
+ kuid_t old_uid = GLOBAL_ROOT_UID, new_uid = GLOBAL_ROOT_UID;
+ kgid_t old_gid = GLOBAL_ROOT_GID, new_gid = GLOBAL_ROOT_GID;
+ struct device *dev = &ndev->dev;
+ int error;
+
+ net_ns_get_ownership(net_old, &old_uid, &old_gid);
+ net_ns_get_ownership(net_new, &new_uid, &new_gid);
+
+ /* The network namespace was changed but the owning user namespace is
+ * identical so there's no need to change the owner of sysfs entries.
+ */
+ if (uid_eq(old_uid, new_uid) && gid_eq(old_gid, new_gid))
+ return 0;
+
+ error = device_change_owner(dev, new_uid, new_gid);
+ if (error)
+ return error;
+
+ error = queue_change_owner(ndev, new_uid, new_gid);
+ if (error)
+ return error;
+
+ return 0;
+}
+
int netdev_class_create_file_ns(const struct class_attribute *class_attr,
const void *ns)
{
diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h
index 006876c7b78d..e938f25e8e86 100644
--- a/net/core/net-sysfs.h
+++ b/net/core/net-sysfs.h
@@ -8,5 +8,9 @@ void netdev_unregister_kobject(struct net_device *);
int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num);
int netdev_queue_update_kobjects(struct net_device *net,
int old_num, int new_num);
+int netdev_change_owner(struct net_device *, const struct net *net_old,
+ const struct net *net_new);
+
+extern struct mutex rps_default_mask_mutex;
#endif
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 419af6dfe29f..f2fa34b1d78d 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -23,7 +23,7 @@
#include <linux/net_dropmon.h>
#include <linux/slab.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <asm/bitops.h>
#define CREATE_TRACE_POINTS
@@ -41,10 +41,28 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add);
EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add);
EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete);
EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update);
+EXPORT_TRACEPOINT_SYMBOL_GPL(br_mdb_full);
#endif
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+#include <trace/events/page_pool.h>
+#endif
+
+#include <trace/events/neigh.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update);
+EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done);
+EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_timer_handler);
+EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_done);
+EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_dead);
+EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_cleanup_and_release);
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll);
EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset);
+EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(udp_fail_queue_rcv_skb);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(sk_data_ready);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 670c84b1bfc2..a6e6a964a287 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/workqueue.h>
@@ -18,7 +19,10 @@
#include <linux/net_namespace.h>
#include <linux/sched/task.h>
#include <linux/uidgid.h>
+#include <linux/proc_fs.h>
+#include <linux/nstree.h>
+#include <net/aligned_data.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/net_namespace.h>
@@ -38,10 +42,11 @@ EXPORT_SYMBOL_GPL(net_namespace_list);
DECLARE_RWSEM(net_rwsem);
EXPORT_SYMBOL_GPL(net_rwsem);
-struct net init_net = {
- .count = REFCOUNT_INIT(1),
- .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
-};
+#ifdef CONFIG_KEYS
+static struct key_tag init_net_key_domain = { .usage = REFCOUNT_INIT(1) };
+#endif
+
+struct net init_net;
EXPORT_SYMBOL(init_net);
static bool init_net_initialized;
@@ -52,7 +57,6 @@ static bool init_net_initialized;
* outside.
*/
DECLARE_RWSEM(pernet_ops_rwsem);
-EXPORT_SYMBOL_GPL(pernet_ops_rwsem);
#define MIN_PERNET_OPS_ID \
((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
@@ -63,12 +67,15 @@ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
static struct net_generic *net_alloc_generic(void)
{
+ unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs);
+ unsigned int generic_size;
struct net_generic *ng;
- unsigned int generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]);
+
+ generic_size = offsetof(struct net_generic, ptr[gen_ptrs]);
ng = kzalloc(generic_size, GFP_KERNEL);
if (ng)
- ng->s.len = max_gen_ptrs;
+ ng->s.len = gen_ptrs;
return ng;
}
@@ -87,7 +94,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
}
ng = net_alloc_generic();
- if (ng == NULL)
+ if (!ng)
return -ENOMEM;
/*
@@ -112,10 +119,11 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
static int ops_init(const struct pernet_operations *ops, struct net *net)
{
+ struct net_generic *ng;
int err = -ENOMEM;
void *data = NULL;
- if (ops->id && ops->size) {
+ if (ops->id) {
data = kzalloc(ops->size, GFP_KERNEL);
if (!data)
goto out;
@@ -130,6 +138,12 @@ static int ops_init(const struct pernet_operations *ops, struct net *net)
if (!err)
return 0;
+ if (ops->id) {
+ ng = rcu_dereference_protected(net->gen,
+ lockdep_is_held(&pernet_ops_rwsem));
+ ng->ptr[*ops->id] = NULL;
+ }
+
cleanup:
kfree(data);
@@ -137,21 +151,56 @@ out:
return err;
}
-static void ops_free(const struct pernet_operations *ops, struct net *net)
+static void ops_pre_exit_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
{
- if (ops->id && ops->size) {
- kfree(net_generic(net, *ops->id));
+ struct net *net;
+
+ if (ops->pre_exit) {
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ops->pre_exit(net);
}
}
+static void ops_exit_rtnl_list(const struct list_head *ops_list,
+ const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ const struct pernet_operations *saved_ops = ops;
+ LIST_HEAD(dev_kill_list);
+ struct net *net;
+
+ rtnl_lock();
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ __rtnl_net_lock(net);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, ops_list, list) {
+ if (ops->exit_rtnl)
+ ops->exit_rtnl(net, &dev_kill_list);
+ }
+
+ __rtnl_net_unlock(net);
+ }
+
+ unregister_netdevice_many(&dev_kill_list);
+
+ rtnl_unlock();
+}
+
static void ops_exit_list(const struct pernet_operations *ops,
struct list_head *net_exit_list)
{
- struct net *net;
if (ops->exit) {
- list_for_each_entry(net, net_exit_list, exit_list)
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
ops->exit(net);
+ cond_resched();
+ }
}
+
if (ops->exit_batch)
ops->exit_batch(net_exit_list);
}
@@ -160,10 +209,61 @@ static void ops_free_list(const struct pernet_operations *ops,
struct list_head *net_exit_list)
{
struct net *net;
- if (ops->size && ops->id) {
+
+ if (ops->id) {
list_for_each_entry(net, net_exit_list, exit_list)
- ops_free(ops, net);
+ kfree(net_generic(net, *ops->id));
+ }
+}
+
+static void ops_undo_list(const struct list_head *ops_list,
+ const struct pernet_operations *ops,
+ struct list_head *net_exit_list,
+ bool expedite_rcu)
+{
+ const struct pernet_operations *saved_ops;
+ bool hold_rtnl = false;
+
+ if (!ops)
+ ops = list_entry(ops_list, typeof(*ops), list);
+
+ saved_ops = ops;
+
+ list_for_each_entry_continue_reverse(ops, ops_list, list) {
+ hold_rtnl |= !!ops->exit_rtnl;
+ ops_pre_exit_list(ops, net_exit_list);
}
+
+ /* Another CPU might be rcu-iterating the list, wait for it.
+ * This needs to be before calling the exit() notifiers, so the
+ * rcu_barrier() after ops_undo_list() isn't sufficient alone.
+ * Also the pre_exit() and exit() methods need this barrier.
+ */
+ if (expedite_rcu)
+ synchronize_rcu_expedited();
+ else
+ synchronize_rcu();
+
+ if (hold_rtnl)
+ ops_exit_rtnl_list(ops_list, saved_ops, net_exit_list);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, ops_list, list)
+ ops_exit_list(ops, net_exit_list);
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, ops_list, list)
+ ops_free_list(ops, net_exit_list);
+}
+
+static void ops_undo_single(struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ LIST_HEAD(ops_list);
+
+ list_add(&ops->list, &ops_list);
+ ops_undo_list(&ops_list, NULL, net_exit_list, false);
+ list_del(&ops->list);
}
/* should be called with nsid_lock held */
@@ -192,16 +292,10 @@ static int net_eq_idr(int id, void *net, void *peer)
return 0;
}
-/* Should be called with nsid_lock held. If a new id is assigned, the bool alloc
- * is set to true, thus the caller knows that the new id must be notified via
- * rtnl.
- */
-static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc)
+/* Must be called from an RCU read-side critical section or with nsid_lock held */
+static int __peernet2id(const struct net *net, struct net *peer)
{
int id = idr_for_each(&net->netns_ids, net_eq_idr, peer);
- bool alloc_it = *alloc;
-
- *alloc = false;
/* Magic value for id 0. */
if (id == NET_ID_ZERO)
@@ -209,61 +303,60 @@ static int __peernet2id_alloc(struct net *net, struct net *peer, bool *alloc)
if (id > 0)
return id;
- if (alloc_it) {
- id = alloc_netid(net, peer, -1);
- *alloc = true;
- return id >= 0 ? id : NETNSA_NSID_NOT_ASSIGNED;
- }
-
return NETNSA_NSID_NOT_ASSIGNED;
}
-/* should be called with nsid_lock held */
-static int __peernet2id(struct net *net, struct net *peer)
-{
- bool no = false;
-
- return __peernet2id_alloc(net, peer, &no);
-}
-
-static void rtnl_net_notifyid(struct net *net, int cmd, int id);
+static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
+ struct nlmsghdr *nlh, gfp_t gfp);
/* This function returns the id of a peer netns. If no id is assigned, one will
* be allocated and returned.
*/
-int peernet2id_alloc(struct net *net, struct net *peer)
+int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
{
- bool alloc = false, alive = false;
int id;
- if (refcount_read(&net->count) == 0)
+ if (!check_net(net))
return NETNSA_NSID_NOT_ASSIGNED;
- spin_lock_bh(&net->nsid_lock);
- /*
- * When peer is obtained from RCU lists, we may race with
+
+ spin_lock(&net->nsid_lock);
+ id = __peernet2id(net, peer);
+ if (id >= 0) {
+ spin_unlock(&net->nsid_lock);
+ return id;
+ }
+
+ /* When peer is obtained from RCU lists, we may race with
* its cleanup. Check whether it's alive, and this guarantees
* we never hash a peer back to net->netns_ids, after it has
* just been idr_remove()'d from there in cleanup_net().
*/
- if (maybe_get_net(peer))
- alive = alloc = true;
- id = __peernet2id_alloc(net, peer, &alloc);
- spin_unlock_bh(&net->nsid_lock);
- if (alloc && id >= 0)
- rtnl_net_notifyid(net, RTM_NEWNSID, id);
- if (alive)
- put_net(peer);
+ if (!maybe_get_net(peer)) {
+ spin_unlock(&net->nsid_lock);
+ return NETNSA_NSID_NOT_ASSIGNED;
+ }
+
+ id = alloc_netid(net, peer, -1);
+ spin_unlock(&net->nsid_lock);
+
+ put_net(peer);
+ if (id < 0)
+ return NETNSA_NSID_NOT_ASSIGNED;
+
+ rtnl_net_notifyid(net, RTM_NEWNSID, id, 0, NULL, gfp);
+
return id;
}
EXPORT_SYMBOL_GPL(peernet2id_alloc);
/* This function returns, if assigned, the id of a peer netns. */
-int peernet2id(struct net *net, struct net *peer)
+int peernet2id(const struct net *net, struct net *peer)
{
int id;
- spin_lock_bh(&net->nsid_lock);
+ rcu_read_lock();
id = __peernet2id(net, peer);
- spin_unlock_bh(&net->nsid_lock);
+ rcu_read_unlock();
+
return id;
}
EXPORT_SYMBOL(peernet2id);
@@ -271,12 +364,12 @@ EXPORT_SYMBOL(peernet2id);
/* This function returns true is the peer netns has an id assigned into the
* current netns.
*/
-bool peernet_has_id(struct net *net, struct net *peer)
+bool peernet_has_id(const struct net *net, struct net *peer)
{
return peernet2id(net, peer) >= 0;
}
-struct net *get_net_ns_by_id(struct net *net, int id)
+struct net *get_net_ns_by_id(const struct net *net, int id)
{
struct net *peer;
@@ -291,25 +384,64 @@ struct net *get_net_ns_by_id(struct net *net, int id)
return peer;
}
+EXPORT_SYMBOL_GPL(get_net_ns_by_id);
-/*
- * setup_net runs the initializers for the network namespace object.
- */
-static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
+static __net_init void preinit_net_sysctl(struct net *net)
{
- /* Must be called with pernet_ops_rwsem held */
- const struct pernet_operations *ops, *saved_ops;
- int error = 0;
- LIST_HEAD(net_exit_list);
+ net->core.sysctl_somaxconn = SOMAXCONN;
+ /* Limits per socket sk_omem_alloc usage.
+ * TCP zerocopy regular usage needs 128 KB.
+ */
+ net->core.sysctl_optmem_max = 128 * 1024;
+ net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED;
+ net->core.sysctl_tstamp_allow_data = 1;
+ net->core.sysctl_txq_reselection = msecs_to_jiffies(1000);
+}
+
+/* init code that must occur even if setup_net() is not called. */
+static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns)
+{
+ int ret;
+
+ ret = ns_common_init(net);
+ if (ret)
+ return ret;
- refcount_set(&net->count, 1);
refcount_set(&net->passive, 1);
+ ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt");
+ ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt");
+
+ get_random_bytes(&net->hash_mix, sizeof(u32));
net->dev_base_seq = 1;
net->user_ns = user_ns;
+
idr_init(&net->netns_ids);
spin_lock_init(&net->nsid_lock);
mutex_init(&net->ipv4.ra_mutex);
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+ mutex_init(&net->rtnl_mutex);
+ lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL);
+#endif
+
+ INIT_LIST_HEAD(&net->ptype_all);
+ INIT_LIST_HEAD(&net->ptype_specific);
+ preinit_net_sysctl(net);
+ return 0;
+}
+
+/*
+ * setup_net runs the initializers for the network namespace object.
+ */
+static __net_init int setup_net(struct net *net)
+{
+ /* Must be called with pernet_ops_rwsem held */
+ const struct pernet_operations *ops;
+ LIST_HEAD(net_exit_list);
+ int error = 0;
+
+ net->net_cookie = ns_tree_gen_id(net);
+
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
if (error < 0)
@@ -318,6 +450,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
down_write(&net_rwsem);
list_add_tail_rcu(&net->list, &net_namespace_list);
up_write(&net_rwsem);
+ ns_tree_add_raw(net);
out:
return error;
@@ -326,38 +459,11 @@ out_undo:
* for the pernet modules whose init functions did not fail.
*/
list_add(&net->exit_list, &net_exit_list);
- saved_ops = ops;
- list_for_each_entry_continue_reverse(ops, &pernet_list, list)
- ops_exit_list(ops, &net_exit_list);
-
- ops = saved_ops;
- list_for_each_entry_continue_reverse(ops, &pernet_list, list)
- ops_free_list(ops, &net_exit_list);
-
+ ops_undo_list(&pernet_list, ops, &net_exit_list, false);
rcu_barrier();
goto out;
}
-static int __net_init net_defaults_init_net(struct net *net)
-{
- net->core.sysctl_somaxconn = SOMAXCONN;
- return 0;
-}
-
-static struct pernet_operations net_defaults_ops = {
- .init = net_defaults_init_net,
-};
-
-static __init int net_defaults_init(void)
-{
- if (register_pernet_subsys(&net_defaults_ops))
- panic("Cannot initialize net default settings");
-
- return 0;
-}
-
-core_initcall(net_defaults_init);
-
#ifdef CONFIG_NET_NS
static struct ucounts *inc_net_namespaces(struct user_namespace *ns)
{
@@ -385,29 +491,64 @@ static struct net *net_alloc(void)
if (!net)
goto out_free;
+#ifdef CONFIG_KEYS
+ net->key_domain = kzalloc(sizeof(struct key_tag), GFP_KERNEL);
+ if (!net->key_domain)
+ goto out_free_2;
+ refcount_set(&net->key_domain->usage, 1);
+#endif
+
rcu_assign_pointer(net->gen, ng);
out:
return net;
+#ifdef CONFIG_KEYS
+out_free_2:
+ kmem_cache_free(net_cachep, net);
+ net = NULL;
+#endif
out_free:
kfree(ng);
goto out;
}
-static void net_free(struct net *net)
+static LLIST_HEAD(defer_free_list);
+
+static void net_complete_free(void)
{
- kfree(rcu_access_pointer(net->gen));
- kmem_cache_free(net_cachep, net);
+ struct llist_node *kill_list;
+ struct net *net, *next;
+
+ /* Get the list of namespaces to free from last round. */
+ kill_list = llist_del_all(&defer_free_list);
+
+ llist_for_each_entry_safe(net, next, kill_list, defer_free_list)
+ kmem_cache_free(net_cachep, net);
+
+}
+
+/* Drop one reference on net->passive. When it was the last one, free
+ * net->gen, shut down the notrefcnt tracker and queue @net on
+ * defer_free_list; the struct itself is released later, when
+ * net_complete_free() drains that list.
+ */
+void net_passive_dec(struct net *net)
+{
+ if (refcount_dec_and_test(&net->passive)) {
+ kfree(rcu_access_pointer(net->gen));
+
+ /* There should not be any trackers left there. */
+ ref_tracker_dir_exit(&net->notrefcnt_tracker);
+
+ /* Wait for an extra rcu_barrier() before final free. */
+ llist_add(&net->defer_free_list, &defer_free_list);
+ }
}
void net_drop_ns(void *p)
{
- struct net *ns = p;
- if (ns && refcount_dec_and_test(&ns->passive))
- net_free(ns);
+ struct net *net = (struct net *)p;
+
+ if (net)
+ net_passive_dec(net);
}
-struct net *copy_net_ns(unsigned long flags,
+struct net *copy_net_ns(u64 flags,
struct user_namespace *user_ns, struct net *old_net)
{
struct ucounts *ucounts;
@@ -426,7 +567,10 @@ struct net *copy_net_ns(unsigned long flags,
rv = -ENOMEM;
goto dec_ucounts;
}
- refcount_set(&net->passive, 1);
+
+ rv = preinit_net(net, user_ns);
+ if (rv < 0)
+ goto dec_ucounts;
net->ucounts = ucounts;
get_user_ns(user_ns);
@@ -434,14 +578,18 @@ struct net *copy_net_ns(unsigned long flags,
if (rv < 0)
goto put_userns;
- rv = setup_net(net, user_ns);
+ rv = setup_net(net);
up_read(&pernet_ops_rwsem);
if (rv < 0) {
put_userns:
+ ns_common_free(net);
+#ifdef CONFIG_KEYS
+ key_remove_domain(net->key_domain);
+#endif
put_user_ns(user_ns);
- net_drop_ns(net);
+ net_passive_dec(net);
dec_ucounts:
dec_net_namespaces(ucounts);
return ERR_PTR(rv);
@@ -488,30 +636,34 @@ static void unhash_nsid(struct net *net, struct net *last)
for_each_net(tmp) {
int id;
- spin_lock_bh(&tmp->nsid_lock);
+ spin_lock(&tmp->nsid_lock);
id = __peernet2id(tmp, net);
if (id >= 0)
idr_remove(&tmp->netns_ids, id);
- spin_unlock_bh(&tmp->nsid_lock);
+ spin_unlock(&tmp->nsid_lock);
if (id >= 0)
- rtnl_net_notifyid(tmp, RTM_DELNSID, id);
+ rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL,
+ GFP_KERNEL);
if (tmp == last)
break;
}
- spin_lock_bh(&net->nsid_lock);
+ spin_lock(&net->nsid_lock);
idr_destroy(&net->netns_ids);
- spin_unlock_bh(&net->nsid_lock);
+ spin_unlock(&net->nsid_lock);
}
static LLIST_HEAD(cleanup_list);
+struct task_struct *cleanup_net_task;
+
static void cleanup_net(struct work_struct *work)
{
- const struct pernet_operations *ops;
- struct net *net, *tmp, *last;
struct llist_node *net_kill_list;
+ struct net *net, *tmp, *last;
LIST_HEAD(net_exit_list);
+ WRITE_ONCE(cleanup_net_task, current);
+
/* Atomically snapshot the list of namespaces to cleanup */
net_kill_list = llist_del_all(&cleanup_list);
@@ -519,8 +671,10 @@ static void cleanup_net(struct work_struct *work)
/* Don't let anyone else find us. */
down_write(&net_rwsem);
- llist_for_each_entry(net, net_kill_list, cleanup_list)
+ llist_for_each_entry(net, net_kill_list, cleanup_list) {
+ ns_tree_remove(net);
list_del_rcu(&net->list);
+ }
/* Cache last net. After we unlock rtnl, no one new net
* added to net_namespace_list can assign nsid pointer
* to a net from net_kill_list (see peernet2id_alloc()).
@@ -539,20 +693,7 @@ static void cleanup_net(struct work_struct *work)
list_add_tail(&net->exit_list, &net_exit_list);
}
- /*
- * Another CPU might be rcu-iterating the list, wait for it.
- * This needs to be before calling the exit() notifiers, so
- * the rcu_barrier() below isn't sufficient alone.
- */
- synchronize_rcu();
-
- /* Run all of the network namespace exit methods */
- list_for_each_entry_reverse(ops, &pernet_list, list)
- ops_exit_list(ops, &net_exit_list);
-
- /* Free the net generic variables */
- list_for_each_entry_reverse(ops, &pernet_list, list)
- ops_free_list(ops, &net_exit_list);
+ ops_undo_list(&pernet_list, NULL, &net_exit_list, true);
up_read(&pernet_ops_rwsem);
@@ -561,13 +702,20 @@ static void cleanup_net(struct work_struct *work)
*/
rcu_barrier();
+ net_complete_free();
+
/* Finally it is safe to free my network namespace structure */
list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
list_del_init(&net->exit_list);
+ ns_common_free(net);
dec_net_namespaces(net->ucounts);
+#ifdef CONFIG_KEYS
+ key_remove_domain(net->key_domain);
+#endif
put_user_ns(net->user_ns);
- net_drop_ns(net);
+ net_passive_dec(net);
}
+ WRITE_ONCE(cleanup_net_task, NULL);
}
/**
@@ -590,39 +738,47 @@ static DECLARE_WORK(net_cleanup_work, cleanup_net);
void __put_net(struct net *net)
{
+ ref_tracker_dir_exit(&net->refcnt_tracker);
/* Cleanup the network namespace in process context */
if (llist_add(&net->cleanup_list, &cleanup_list))
queue_work(netns_wq, &net_cleanup_work);
}
EXPORT_SYMBOL_GPL(__put_net);
-struct net *get_net_ns_by_fd(int fd)
+/**
+ * get_net_ns - increment the refcount of the network namespace
+ * @ns: common namespace (net)
+ *
+ * Return: the net's ns_common (with its refcount incremented), or
+ * ERR_PTR(-EINVAL) if the refcount is already zero.
+ */
+struct ns_common *get_net_ns(struct ns_common *ns)
{
- struct file *file;
- struct ns_common *ns;
struct net *net;
- file = proc_ns_fget(fd);
- if (IS_ERR(file))
- return ERR_CAST(file);
-
- ns = get_proc_ns(file_inode(file));
- if (ns->ops == &netns_operations)
- net = get_net(container_of(ns, struct net, ns));
- else
- net = ERR_PTR(-EINVAL);
-
- fput(file);
- return net;
+ net = maybe_get_net(container_of(ns, struct net, ns));
+ if (net)
+ return &net->ns;
+ return ERR_PTR(-EINVAL);
}
+EXPORT_SYMBOL_GPL(get_net_ns);
-#else
struct net *get_net_ns_by_fd(int fd)
{
+ CLASS(fd, f)(fd);
+
+ if (fd_empty(f))
+ return ERR_PTR(-EBADF);
+
+ if (proc_ns_file(fd_file(f))) {
+ struct ns_common *ns = get_proc_ns(file_inode(fd_file(f)));
+ if (ns->ops == &netns_operations)
+ return get_net(container_of(ns, struct net, ns));
+ }
+
return ERR_PTR(-EINVAL);
}
-#endif
EXPORT_SYMBOL_GPL(get_net_ns_by_fd);
+#endif
struct net *get_net_ns_by_pid(pid_t pid)
{
@@ -646,22 +802,37 @@ struct net *get_net_ns_by_pid(pid_t pid)
}
EXPORT_SYMBOL_GPL(get_net_ns_by_pid);
-static __net_init int net_ns_net_init(struct net *net)
+#ifdef CONFIG_NET_NS_REFCNT_TRACKER
+static void net_ns_net_debugfs(struct net *net)
{
-#ifdef CONFIG_NET_NS
- net->ns.ops = &netns_operations;
-#endif
- return ns_alloc_inum(&net->ns);
+ ref_tracker_dir_symlink(&net->refcnt_tracker, "netns-%llx-%u-refcnt",
+ net->net_cookie, net->ns.inum);
+ ref_tracker_dir_symlink(&net->notrefcnt_tracker, "netns-%llx-%u-notrefcnt",
+ net->net_cookie, net->ns.inum);
}
-static __net_exit void net_ns_net_exit(struct net *net)
+static int __init init_net_debugfs(void)
{
- ns_free_inum(&net->ns);
+ ref_tracker_dir_debugfs(&init_net.refcnt_tracker);
+ ref_tracker_dir_debugfs(&init_net.notrefcnt_tracker);
+ net_ns_net_debugfs(&init_net);
+ return 0;
+}
+late_initcall(init_net_debugfs);
+#else
+static void net_ns_net_debugfs(struct net *net)
+{
+}
+#endif
+
+static __net_init int net_ns_net_init(struct net *net)
+{
+ net_ns_net_debugfs(net);
+ return 0;
}
static struct pernet_operations __net_initdata net_ns_ops = {
.init = net_ns_net_init,
- .exit = net_ns_net_exit,
};
static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
@@ -669,6 +840,7 @@ static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
[NETNSA_NSID] = { .type = NLA_S32 },
[NETNSA_PID] = { .type = NLA_U32 },
[NETNSA_FD] = { .type = NLA_U32 },
+ [NETNSA_TARGET_NSID] = { .type = NLA_S32 },
};
static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -680,8 +852,8 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *peer;
int nsid, err;
- err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
- rtnl_net_policy, extack);
+ err = nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg), tb,
+ NETNSA_MAX, rtnl_net_policy, extack);
if (err < 0)
return err;
if (!tb[NETNSA_NSID]) {
@@ -706,9 +878,9 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
return PTR_ERR(peer);
}
- spin_lock_bh(&net->nsid_lock);
+ spin_lock(&net->nsid_lock);
if (__peernet2id(net, peer) >= 0) {
- spin_unlock_bh(&net->nsid_lock);
+ spin_unlock(&net->nsid_lock);
err = -EEXIST;
NL_SET_BAD_ATTR(extack, nla);
NL_SET_ERR_MSG(extack,
@@ -717,9 +889,10 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh,
}
err = alloc_netid(net, peer, nsid);
- spin_unlock_bh(&net->nsid_lock);
+ spin_unlock(&net->nsid_lock);
if (err >= 0) {
- rtnl_net_notifyid(net, RTM_NEWNSID, err);
+ rtnl_net_notifyid(net, RTM_NEWNSID, err, NETLINK_CB(skb).portid,
+ nlh, GFP_KERNEL);
err = 0;
} else if (err == -ENOSPC && nsid >= 0) {
err = -EEXIST;
@@ -735,23 +908,38 @@ static int rtnl_net_get_size(void)
{
return NLMSG_ALIGN(sizeof(struct rtgenmsg))
+ nla_total_size(sizeof(s32)) /* NETNSA_NSID */
+ + nla_total_size(sizeof(s32)) /* NETNSA_CURRENT_NSID */
;
}
-static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags,
- int cmd, struct net *net, int nsid)
+/* Arguments for rtnl_net_fill(), bundled so that callers can pre-fill
+ * the parts that stay the same across messages.
+ */
+struct net_fill_args {
+ u32 portid; /* port id passed to nlmsg_put() */
+ u32 seq; /* sequence number passed to nlmsg_put() */
+ int flags; /* netlink message flags passed to nlmsg_put() */
+ int cmd; /* message type passed to nlmsg_put() */
+ int nsid; /* value of the NETNSA_NSID attribute */
+ bool add_ref; /* also emit NETNSA_CURRENT_NSID */
+ int ref_nsid; /* value of NETNSA_CURRENT_NSID when add_ref is set */
+};
+
+static int rtnl_net_fill(struct sk_buff *skb, struct net_fill_args *args)
{
struct nlmsghdr *nlh;
struct rtgenmsg *rth;
- nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rth), flags);
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->cmd, sizeof(*rth),
+ args->flags);
if (!nlh)
return -EMSGSIZE;
rth = nlmsg_data(nlh);
rth->rtgen_family = AF_UNSPEC;
- if (nla_put_s32(skb, NETNSA_NSID, nsid))
+ if (nla_put_s32(skb, NETNSA_NSID, args->nsid))
+ goto nla_put_failure;
+
+ if (args->add_ref &&
+ nla_put_s32(skb, NETNSA_CURRENT_NSID, args->ref_nsid))
goto nla_put_failure;
nlmsg_end(skb, nlh);
@@ -762,18 +950,59 @@ nla_put_failure:
return -EMSGSIZE;
}
+/* Parse and validate the attributes of an RTM_GETNSID request into @tb.
+ *
+ * If netlink_strict_get_check() is false for @skb, the request is parsed
+ * with the non-strict parser and nothing else is checked. Otherwise it
+ * is parsed strictly and only NETNSA_PID, NETNSA_FD, NETNSA_NSID and
+ * NETNSA_TARGET_NSID are accepted.
+ *
+ * Returns 0 on success or a negative errno (-EINVAL for an unsupported
+ * attribute); details are reported through @extack.
+ */
+static int rtnl_net_valid_getid_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ int i, err;
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(struct rtgenmsg),
+ tb, NETNSA_MAX, rtnl_net_policy,
+ extack);
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb,
+ NETNSA_MAX, rtnl_net_policy,
+ extack);
+ if (err)
+ return err;
+
+ /* Strict mode: reject every attribute that getid does not handle. */
+ for (i = 0; i <= NETNSA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NETNSA_PID:
+ case NETNSA_FD:
+ case NETNSA_NSID:
+ case NETNSA_TARGET_NSID:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in peer netns getid request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tb[NETNSA_MAX + 1];
+ struct net_fill_args fillargs = {
+ .portid = NETLINK_CB(skb).portid,
+ .seq = nlh->nlmsg_seq,
+ .cmd = RTM_NEWNSID,
+ };
+ struct net *peer, *target = net;
struct nlattr *nla;
struct sk_buff *msg;
- struct net *peer;
- int err, id;
+ int err;
- err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX,
- rtnl_net_policy, extack);
+ err = rtnl_net_valid_getid_req(skb, nlh, tb, extack);
if (err < 0)
return err;
if (tb[NETNSA_PID]) {
@@ -782,6 +1011,11 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
} else if (tb[NETNSA_FD]) {
peer = get_net_ns_by_fd(nla_get_u32(tb[NETNSA_FD]));
nla = tb[NETNSA_FD];
+ } else if (tb[NETNSA_NSID]) {
+ peer = get_net_ns_by_id(net, nla_get_s32(tb[NETNSA_NSID]));
+ if (!peer)
+ peer = ERR_PTR(-ENOENT);
+ nla = tb[NETNSA_NSID];
} else {
NL_SET_ERR_MSG(extack, "Peer netns reference is missing");
return -EINVAL;
@@ -793,15 +1027,29 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
return PTR_ERR(peer);
}
+ if (tb[NETNSA_TARGET_NSID]) {
+ int id = nla_get_s32(tb[NETNSA_TARGET_NSID]);
+
+ target = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, id);
+ if (IS_ERR(target)) {
+ NL_SET_BAD_ATTR(extack, tb[NETNSA_TARGET_NSID]);
+ NL_SET_ERR_MSG(extack,
+ "Target netns reference is invalid");
+ err = PTR_ERR(target);
+ goto out;
+ }
+ fillargs.add_ref = true;
+ fillargs.ref_nsid = peernet2id(net, peer);
+ }
+
msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
if (!msg) {
err = -ENOMEM;
goto out;
}
- id = peernet2id(net, peer);
- err = rtnl_net_fill(msg, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
- RTM_NEWNSID, net, id);
+ fillargs.nsid = peernet2id(target, peer);
+ err = rtnl_net_fill(msg, &fillargs);
if (err < 0)
goto err_out;
@@ -811,18 +1059,22 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh,
err_out:
nlmsg_free(msg);
out:
+ if (fillargs.add_ref)
+ put_net(target);
put_net(peer);
return err;
}
struct rtnl_net_dump_cb {
- struct net *net;
+ struct net *tgt_net;
+ struct net *ref_net;
struct sk_buff *skb;
- struct netlink_callback *cb;
+ struct net_fill_args fillargs;
int idx;
int s_idx;
};
+/* Runs in RCU-critical section. */
static int rtnl_net_dumpid_one(int id, void *peer, void *data)
{
struct rtnl_net_dump_cb *net_cb = (struct rtnl_net_dump_cb *)data;
@@ -831,9 +1083,10 @@ static int rtnl_net_dumpid_one(int id, void *peer, void *data)
if (net_cb->idx < net_cb->s_idx)
goto cont;
- ret = rtnl_net_fill(net_cb->skb, NETLINK_CB(net_cb->cb->skb).portid,
- net_cb->cb->nlh->nlmsg_seq, NLM_F_MULTI,
- RTM_NEWNSID, net_cb->net, id);
+ net_cb->fillargs.nsid = id;
+ if (net_cb->fillargs.add_ref)
+ net_cb->fillargs.ref_nsid = __peernet2id(net_cb->ref_net, peer);
+ ret = rtnl_net_fill(net_cb->skb, &net_cb->fillargs);
if (ret < 0)
return ret;
@@ -842,39 +1095,102 @@ cont:
return 0;
}
+/* Strictly parse an RTM_GETNSID dump request and apply it to @net_cb.
+ *
+ * NETNSA_TARGET_NSID is the only attribute accepted. When present, the
+ * target netns is looked up with rtnl_get_net_ns_capable(); the previous
+ * tgt_net becomes ref_net, the looked-up netns becomes tgt_net, and
+ * fillargs.add_ref is set. rtnl_net_dumpid() releases that netns with
+ * put_net() when add_ref is set.
+ *
+ * Returns 0 on success or a negative errno; details are reported through
+ * cb->extack.
+ */
+static int rtnl_valid_dump_net_req(const struct nlmsghdr *nlh, struct sock *sk,
+ struct rtnl_net_dump_cb *net_cb,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[NETNSA_MAX + 1];
+ int err, i;
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct rtgenmsg), tb,
+ NETNSA_MAX, rtnl_net_policy,
+ extack);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= NETNSA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ if (i == NETNSA_TARGET_NSID) {
+ struct net *net;
+
+ net = rtnl_get_net_ns_capable(sk, nla_get_s32(tb[i]));
+ if (IS_ERR(net)) {
+ NL_SET_BAD_ATTR(extack, tb[i]);
+ NL_SET_ERR_MSG(extack,
+ "Invalid target network namespace id");
+ return PTR_ERR(net);
+ }
+ /* Dump the target's ids; keep the requester's netns
+ * as the reference for NETNSA_CURRENT_NSID.
+ */
+ net_cb->fillargs.add_ref = true;
+ net_cb->ref_net = net_cb->tgt_net;
+ net_cb->tgt_net = net;
+ } else {
+ NL_SET_BAD_ATTR(extack, tb[i]);
+ NL_SET_ERR_MSG(extack,
+ "Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int rtnl_net_dumpid(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct net *net = sock_net(skb->sk);
struct rtnl_net_dump_cb net_cb = {
- .net = net,
+ .tgt_net = sock_net(skb->sk),
.skb = skb,
- .cb = cb,
+ .fillargs = {
+ .portid = NETLINK_CB(cb->skb).portid,
+ .seq = cb->nlh->nlmsg_seq,
+ .flags = NLM_F_MULTI,
+ .cmd = RTM_NEWNSID,
+ },
.idx = 0,
.s_idx = cb->args[0],
};
+ int err = 0;
+
+ if (cb->strict_check) {
+ err = rtnl_valid_dump_net_req(cb->nlh, skb->sk, &net_cb, cb);
+ if (err < 0)
+ goto end;
+ }
- spin_lock_bh(&net->nsid_lock);
- idr_for_each(&net->netns_ids, rtnl_net_dumpid_one, &net_cb);
- spin_unlock_bh(&net->nsid_lock);
+ rcu_read_lock();
+ idr_for_each(&net_cb.tgt_net->netns_ids, rtnl_net_dumpid_one, &net_cb);
+ rcu_read_unlock();
cb->args[0] = net_cb.idx;
- return skb->len;
+end:
+ if (net_cb.fillargs.add_ref)
+ put_net(net_cb.tgt_net);
+ return err;
}
-static void rtnl_net_notifyid(struct net *net, int cmd, int id)
+static void rtnl_net_notifyid(struct net *net, int cmd, int id, u32 portid,
+ struct nlmsghdr *nlh, gfp_t gfp)
{
+ struct net_fill_args fillargs = {
+ .portid = portid,
+ .seq = nlh ? nlh->nlmsg_seq : 0,
+ .cmd = cmd,
+ .nsid = id,
+ };
struct sk_buff *msg;
int err = -ENOMEM;
- msg = nlmsg_new(rtnl_net_get_size(), GFP_KERNEL);
+ msg = nlmsg_new(rtnl_net_get_size(), gfp);
if (!msg)
goto out;
- err = rtnl_net_fill(msg, 0, 0, 0, cmd, net, id);
+ err = rtnl_net_fill(msg, &fillargs);
if (err < 0)
goto err_out;
- rtnl_notify(msg, net, 0, RTNLGRP_NSID, NULL, 0);
+ rtnl_notify(msg, net, portid, RTNLGRP_NSID, nlh, gfp);
return;
err_out:
@@ -883,11 +1199,63 @@ out:
rtnl_set_sk_err(net, RTNLGRP_NSID, err);
}
-static int __init net_ns_init(void)
+#ifdef CONFIG_NET_NS
+/* Layout checks for struct netns_ipv4: each CACHELINE_ASSERT_GROUP_MEMBER()
+ * below names a field that is expected to sit in the given cacheline group
+ * (netns_ipv4_read_tx or netns_ipv4_read_rx). Only built with CONFIG_NET_NS
+ * and called from net_ns_init().
+ */
+static void __init netns_ipv4_struct_check(void)
+{
+ /* TX readonly hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_early_retrans);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_tso_win_divisor);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_tso_rtt_log);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_autocorking);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_min_snd_mss);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_notsent_lowat);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_limit_output_bytes);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_min_rtt_wlen);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_tcp_wmem);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx,
+ sysctl_ip_fwd_use_pmtu);
+
+ /* RX readonly hotpath cache line */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_moderate_rcvbuf);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_rcvbuf_low_rtt);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_ip_early_demux);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_early_demux);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_l3mdev_accept);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_reordering);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx,
+ sysctl_tcp_rmem);
+}
+#endif
+
+/* rtnetlink handlers for RTM_NEWNSID and RTM_GETNSID; net_ns_init()
+ * registers the whole table with rtnl_register_many().
+ */
+static const struct rtnl_msg_handler net_ns_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWNSID, .doit = rtnl_net_newid,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED},
+ {.msgtype = RTM_GETNSID, .doit = rtnl_net_getid,
+ .dumpit = rtnl_net_dumpid,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
+};
+
+void __init net_ns_init(void)
{
struct net_generic *ng;
#ifdef CONFIG_NET_NS
+ netns_ipv4_struct_check();
net_cachep = kmem_cache_create("net_namespace", sizeof(struct net),
SMP_CACHE_BYTES,
SLAB_PANIC|SLAB_ACCOUNT, NULL);
@@ -904,35 +1272,39 @@ static int __init net_ns_init(void)
rcu_assign_pointer(init_net.gen, ng);
+#ifdef CONFIG_KEYS
+ init_net.key_domain = &init_net_key_domain;
+#endif
+ /*
+ * This currently cannot fail as the initial network namespace
+ * has a static inode number.
+ */
+ if (preinit_net(&init_net, &init_user_ns))
+ panic("Could not preinitialize the initial network namespace");
+
down_write(&pernet_ops_rwsem);
- if (setup_net(&init_net, &init_user_ns))
+ if (setup_net(&init_net))
panic("Could not setup the initial network namespace");
init_net_initialized = true;
up_write(&pernet_ops_rwsem);
- register_pernet_subsys(&net_ns_ops);
-
- rtnl_register(PF_UNSPEC, RTM_NEWNSID, rtnl_net_newid, NULL,
- RTNL_FLAG_DOIT_UNLOCKED);
- rtnl_register(PF_UNSPEC, RTM_GETNSID, rtnl_net_getid, rtnl_net_dumpid,
- RTNL_FLAG_DOIT_UNLOCKED);
+ if (register_pernet_subsys(&net_ns_ops))
+ panic("Could not register network namespace subsystems");
- return 0;
+ rtnl_register_many(net_ns_rtnl_msg_handlers);
}
-pure_initcall(net_ns_init);
-
#ifdef CONFIG_NET_NS
static int __register_pernet_operations(struct list_head *list,
struct pernet_operations *ops)
{
+ LIST_HEAD(net_exit_list);
struct net *net;
int error;
- LIST_HEAD(net_exit_list);
list_add_tail(&ops->list, list);
- if (ops->init || (ops->id && ops->size)) {
+ if (ops->init || ops->id) {
/* We held write locked pernet_ops_rwsem, and parallel
* setup_net() and cleanup_net() are not possible.
*/
@@ -948,22 +1320,21 @@ static int __register_pernet_operations(struct list_head *list,
out_undo:
/* If I have an error cleanup all namespaces I initialized */
list_del(&ops->list);
- ops_exit_list(ops, &net_exit_list);
- ops_free_list(ops, &net_exit_list);
+ ops_undo_single(ops, &net_exit_list);
return error;
}
static void __unregister_pernet_operations(struct pernet_operations *ops)
{
- struct net *net;
LIST_HEAD(net_exit_list);
+ struct net *net;
- list_del(&ops->list);
/* See comment in __register_pernet_operations() */
for_each_net(net)
list_add_tail(&net->exit_list, &net_exit_list);
- ops_exit_list(ops, &net_exit_list);
- ops_free_list(ops, &net_exit_list);
+
+ list_del(&ops->list);
+ ops_undo_single(ops, &net_exit_list);
}
#else
@@ -985,9 +1356,9 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
list_del(&ops->list);
} else {
LIST_HEAD(net_exit_list);
+
list_add(&init_net.exit_list, &net_exit_list);
- ops_exit_list(ops, &net_exit_list);
- ops_free_list(ops, &net_exit_list);
+ ops_undo_single(ops, &net_exit_list);
}
}
@@ -1000,13 +1371,20 @@ static int register_pernet_operations(struct list_head *list,
{
int error;
+ if (WARN_ON(!!ops->id ^ !!ops->size))
+ return -EINVAL;
+
if (ops->id) {
error = ida_alloc_min(&net_generic_ids, MIN_PERNET_OPS_ID,
GFP_KERNEL);
if (error < 0)
return error;
*ops->id = error;
- max_gen_ptrs = max(max_gen_ptrs, *ops->id + 1);
+ /* Reading max_gen_ptrs here does not require READ_ONCE() because
+ * writers already hold pernet_ops_rwsem. WRITE_ONCE() is still
+ * needed to protect net_alloc_generic().
+ */
+ WRITE_ONCE(max_gen_ptrs, max(max_gen_ptrs, *ops->id + 1));
}
error = __register_pernet_operations(list, ops);
if (error) {
@@ -1137,22 +1515,18 @@ static struct ns_common *netns_get(struct task_struct *task)
return net ? &net->ns : NULL;
}
-static inline struct net *to_net_ns(struct ns_common *ns)
-{
- return container_of(ns, struct net, ns);
-}
-
static void netns_put(struct ns_common *ns)
{
put_net(to_net_ns(ns));
}
-static int netns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int netns_install(struct nsset *nsset, struct ns_common *ns)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct net *net = to_net_ns(ns);
if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) ||
- !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
return -EPERM;
put_net(nsproxy->net_ns);
@@ -1167,7 +1541,6 @@ static struct user_namespace *netns_owner(struct ns_common *ns)
const struct proc_ns_operations netns_operations = {
.name = "net",
- .type = CLONE_NEWNET,
.get = netns_get,
.put = netns_put,
.install = netns_install,
diff --git a/net/core/net_test.c b/net/core/net_test.c
new file mode 100644
index 000000000000..9c3a590865d2
--- /dev/null
+++ b/net/core/net_test.c
@@ -0,0 +1,387 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <kunit/test.h>
+
+/* GSO */
+
+#include <linux/skbuff.h>
+
+static const char hdr[] = "abcdefgh";
+#define GSO_TEST_SIZE 1000
+
+static void __init_skb(struct sk_buff *skb)
+{
+ skb_reset_mac_header(skb);
+ memcpy(skb_mac_header(skb), hdr, sizeof(hdr));
+
+ /* skb_segment expects skb->data at start of payload */
+ skb_pull(skb, sizeof(hdr));
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+
+ /* proto is arbitrary, as long as not ETH_P_TEB or vlan */
+ skb->protocol = htons(ETH_P_ATALK);
+ skb_shinfo(skb)->gso_size = GSO_TEST_SIZE;
+}
+
+enum gso_test_nr {
+ GSO_TEST_LINEAR,
+ GSO_TEST_NO_GSO,
+ GSO_TEST_FRAGS,
+ GSO_TEST_FRAGS_PURE,
+ GSO_TEST_GSO_PARTIAL,
+ GSO_TEST_FRAG_LIST,
+ GSO_TEST_FRAG_LIST_PURE,
+ GSO_TEST_FRAG_LIST_NON_UNIFORM,
+ GSO_TEST_GSO_BY_FRAGS,
+};
+
+struct gso_test_case {
+ enum gso_test_nr id;
+ const char *name;
+
+ /* input */
+ unsigned int linear_len;
+ unsigned int nr_frags;
+ const unsigned int *frags;
+ unsigned int nr_frag_skbs;
+ const unsigned int *frag_skbs;
+
+ /* output as expected */
+ unsigned int nr_segs;
+ const unsigned int *segs;
+};
+
+static struct gso_test_case cases[] = {
+ {
+ .id = GSO_TEST_NO_GSO,
+ .name = "no_gso",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_segs = 1,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE },
+ },
+ {
+ .id = GSO_TEST_LINEAR,
+ .name = "linear",
+ .linear_len = GSO_TEST_SIZE + GSO_TEST_SIZE + 1,
+ .nr_segs = 3,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 1 },
+ },
+ {
+ .id = GSO_TEST_FRAGS,
+ .name = "frags",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_frags = 2,
+ .frags = (const unsigned int[]) { GSO_TEST_SIZE, 1 },
+ .nr_segs = 3,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 1 },
+ },
+ {
+ .id = GSO_TEST_FRAGS_PURE,
+ .name = "frags_pure",
+ .nr_frags = 3,
+ .frags = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 2 },
+ .nr_segs = 3,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, 2 },
+ },
+ {
+ .id = GSO_TEST_GSO_PARTIAL,
+ .name = "gso_partial",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_frags = 2,
+ .frags = (const unsigned int[]) { GSO_TEST_SIZE, 3 },
+ .nr_segs = 2,
+ .segs = (const unsigned int[]) { 2 * GSO_TEST_SIZE, 3 },
+ },
+ {
+ /* commit 89319d3801d1: frag_list on mss boundaries */
+ .id = GSO_TEST_FRAG_LIST,
+ .name = "frag_list",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_frag_skbs = 2,
+ .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE },
+ .nr_segs = 3,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, GSO_TEST_SIZE },
+ },
+ {
+ .id = GSO_TEST_FRAG_LIST_PURE,
+ .name = "frag_list_pure",
+ .nr_frag_skbs = 2,
+ .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE },
+ .nr_segs = 2,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE },
+ },
+ {
+ /* commit 43170c4e0ba7: GRO of frag_list trains */
+ .id = GSO_TEST_FRAG_LIST_NON_UNIFORM,
+ .name = "frag_list_non_uniform",
+ .linear_len = GSO_TEST_SIZE,
+ .nr_frag_skbs = 4,
+ .frag_skbs = (const unsigned int[]) { GSO_TEST_SIZE, 1, GSO_TEST_SIZE, 2 },
+ .nr_segs = 4,
+ .segs = (const unsigned int[]) { GSO_TEST_SIZE, GSO_TEST_SIZE, GSO_TEST_SIZE, 3 },
+ },
+ {
+ /* commit 3953c46c3ac7 ("sk_buff: allow segmenting based on frag sizes") and
+ * commit 90017accff61 ("sctp: Add GSO support")
+ *
+ * "there will be a cover skb with protocol headers and
+ * children ones containing the actual segments"
+ */
+ .id = GSO_TEST_GSO_BY_FRAGS,
+ .name = "gso_by_frags",
+ .nr_frag_skbs = 4,
+ .frag_skbs = (const unsigned int[]) { 100, 200, 300, 400 },
+ .nr_segs = 4,
+ .segs = (const unsigned int[]) { 100, 200, 300, 400 },
+ },
+};
+
+static void gso_test_case_to_desc(struct gso_test_case *t, char *desc)
+{
+	strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE); /* bounded, NUL-terminated */
+}
+
+KUNIT_ARRAY_PARAM(gso_test, cases, gso_test_case_to_desc);
+
+static void gso_test_func(struct kunit *test)
+{
+ const int shinfo_size = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ struct sk_buff *skb, *segs, *cur, *next, *last;
+ const struct gso_test_case *tcase;
+ netdev_features_t features;
+ struct page *page;
+ int i;
+
+ tcase = test->param_value;
+
+ page = alloc_page(GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, page);
+ skb = build_skb(page_address(page), sizeof(hdr) + tcase->linear_len + shinfo_size);
+ KUNIT_ASSERT_NOT_NULL(test, skb);
+ __skb_put(skb, sizeof(hdr) + tcase->linear_len);
+
+ __init_skb(skb);
+
+ if (tcase->nr_frags) {
+ unsigned int pg_off = 0;
+
+ page = alloc_page(GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, page);
+ page_ref_add(page, tcase->nr_frags - 1);
+
+ for (i = 0; i < tcase->nr_frags; i++) {
+ skb_fill_page_desc(skb, i, page, pg_off, tcase->frags[i]);
+ pg_off += tcase->frags[i];
+ }
+
+ KUNIT_ASSERT_LE(test, pg_off, PAGE_SIZE);
+
+ skb->data_len = pg_off;
+ skb->len += skb->data_len;
+ skb->truesize += skb->data_len;
+ }
+
+ if (tcase->frag_skbs) {
+ unsigned int total_size = 0, total_true_size = 0;
+ struct sk_buff *frag_skb, *prev = NULL;
+
+ for (i = 0; i < tcase->nr_frag_skbs; i++) {
+ unsigned int frag_size;
+
+ page = alloc_page(GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, page);
+
+ frag_size = tcase->frag_skbs[i];
+ frag_skb = build_skb(page_address(page),
+ frag_size + shinfo_size);
+ KUNIT_ASSERT_NOT_NULL(test, frag_skb);
+ __skb_put(frag_skb, frag_size);
+
+ if (prev)
+ prev->next = frag_skb;
+ else
+ skb_shinfo(skb)->frag_list = frag_skb;
+ prev = frag_skb;
+
+ total_size += frag_size;
+ total_true_size += frag_skb->truesize;
+ }
+
+ skb->len += total_size;
+ skb->data_len += total_size;
+ skb->truesize += total_true_size;
+
+ if (tcase->id == GSO_TEST_GSO_BY_FRAGS)
+ skb_shinfo(skb)->gso_size = GSO_BY_FRAGS;
+ }
+
+ features = NETIF_F_SG | NETIF_F_HW_CSUM;
+ if (tcase->id == GSO_TEST_GSO_PARTIAL)
+ features |= NETIF_F_GSO_PARTIAL;
+
+ /* TODO: this should also work with SG,
+ * rather than hit BUG_ON(i >= nfrags)
+ */
+ if (tcase->id == GSO_TEST_FRAG_LIST_NON_UNIFORM)
+ features &= ~NETIF_F_SG;
+
+ segs = skb_segment(skb, features);
+ if (IS_ERR(segs)) {
+ KUNIT_FAIL(test, "segs error %pe", segs);
+ goto free_gso_skb;
+ } else if (!segs) {
+ KUNIT_FAIL(test, "no segments");
+ goto free_gso_skb;
+ }
+
+ last = segs->prev;
+ for (cur = segs, i = 0; cur; cur = next, i++) {
+ next = cur->next;
+
+ KUNIT_ASSERT_EQ(test, cur->len, sizeof(hdr) + tcase->segs[i]);
+
+ /* segs have skb->data pointing to the mac header */
+ KUNIT_ASSERT_PTR_EQ(test, skb_mac_header(cur), cur->data);
+ KUNIT_ASSERT_PTR_EQ(test, skb_network_header(cur), cur->data + sizeof(hdr));
+
+ /* header was copied to all segs */
+ KUNIT_ASSERT_EQ(test, memcmp(skb_mac_header(cur), hdr, sizeof(hdr)), 0);
+
+ /* last seg can be found through segs->prev pointer */
+ if (!next)
+ KUNIT_ASSERT_PTR_EQ(test, cur, last);
+
+ consume_skb(cur);
+ }
+
+ KUNIT_ASSERT_EQ(test, i, tcase->nr_segs);
+
+free_gso_skb:
+ consume_skb(skb);
+}
+
+/* IP tunnel flags */
+
+#include <net/ip_tunnels.h>
+
+struct ip_tunnel_flags_test {
+ const char *name;
+
+ const u16 *src_bits;
+ const u16 *exp_bits;
+ u8 src_num;
+ u8 exp_num;
+
+ __be16 exp_val;
+ bool exp_comp;
+};
+
+#define IP_TUNNEL_FLAGS_TEST(n, src, comp, eval, exp) { \
+ .name = (n), \
+ .src_bits = (src), \
+ .src_num = ARRAY_SIZE(src), \
+ .exp_comp = (comp), \
+ .exp_val = (eval), \
+ .exp_bits = (exp), \
+ .exp_num = ARRAY_SIZE(exp), \
+}
+
+/* These are __be16-compatible and can be compared as is */
+static const u16 ip_tunnel_flags_1[] = {
+ IP_TUNNEL_KEY_BIT,
+ IP_TUNNEL_STRICT_BIT,
+ IP_TUNNEL_ERSPAN_OPT_BIT,
+};
+
+/* Due to the previous flags design limitation, setting either
+ * ``IP_TUNNEL_CSUM_BIT`` (on Big Endian) or ``IP_TUNNEL_DONT_FRAGMENT_BIT``
+ * (on Little) also sets VTI/ISATAP bit. In the bitmap implementation, they
+ * correspond to ``BIT(16)``, which is bigger than ``U16_MAX``, but still is
+ * backward-compatible.
+ */
+#ifdef __LITTLE_ENDIAN
+#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_DONT_FRAGMENT_BIT
+#else
+#define IP_TUNNEL_CONFLICT_BIT IP_TUNNEL_CSUM_BIT
+#endif
+
+static const u16 ip_tunnel_flags_2_src[] = {
+ IP_TUNNEL_CONFLICT_BIT,
+};
+
+static const u16 ip_tunnel_flags_2_exp[] = {
+ IP_TUNNEL_CONFLICT_BIT,
+ IP_TUNNEL_SIT_ISATAP_BIT,
+};
+
+/* Bits 17 and higher are not compatible with __be16 flags */
+static const u16 ip_tunnel_flags_3_src[] = {
+ IP_TUNNEL_VXLAN_OPT_BIT,
+ 17,
+ 18,
+ 20,
+};
+
+static const u16 ip_tunnel_flags_3_exp[] = {
+ IP_TUNNEL_VXLAN_OPT_BIT,
+};
+
+static const struct ip_tunnel_flags_test ip_tunnel_flags_test[] = {
+ IP_TUNNEL_FLAGS_TEST("compat", ip_tunnel_flags_1, true,
+ cpu_to_be16(BIT(IP_TUNNEL_KEY_BIT) |
+ BIT(IP_TUNNEL_STRICT_BIT) |
+ BIT(IP_TUNNEL_ERSPAN_OPT_BIT)),
+ ip_tunnel_flags_1),
+ IP_TUNNEL_FLAGS_TEST("conflict", ip_tunnel_flags_2_src, true,
+ VTI_ISVTI, ip_tunnel_flags_2_exp),
+ IP_TUNNEL_FLAGS_TEST("new", ip_tunnel_flags_3_src, false,
+ cpu_to_be16(BIT(IP_TUNNEL_VXLAN_OPT_BIT)),
+ ip_tunnel_flags_3_exp),
+};
+
+static void
+ip_tunnel_flags_test_case_to_desc(const struct ip_tunnel_flags_test *t,
+ char *desc)
+{
+ strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE);
+}
+KUNIT_ARRAY_PARAM(ip_tunnel_flags_test, ip_tunnel_flags_test,
+ ip_tunnel_flags_test_case_to_desc);
+
+static void ip_tunnel_flags_test_run(struct kunit *test)
+{
+ const struct ip_tunnel_flags_test *t = test->param_value;
+ IP_TUNNEL_DECLARE_FLAGS(src) = { };
+ IP_TUNNEL_DECLARE_FLAGS(exp) = { };
+ IP_TUNNEL_DECLARE_FLAGS(out);
+
+ for (u32 j = 0; j < t->src_num; j++)
+ __set_bit(t->src_bits[j], src);
+ for (u32 j = 0; j < t->exp_num; j++)
+ __set_bit(t->exp_bits[j], exp);
+
+ KUNIT_ASSERT_EQ(test, t->exp_comp,
+ ip_tunnel_flags_is_be16_compat(src));
+ KUNIT_ASSERT_EQ(test, (__force u16)t->exp_val,
+ (__force u16)ip_tunnel_flags_to_be16(src));
+
+ ip_tunnel_flags_from_be16(out, t->exp_val);
+ KUNIT_ASSERT_TRUE(test, __ipt_flag_op(bitmap_equal, exp, out));
+}
+
+static struct kunit_case net_test_cases[] = {
+ KUNIT_CASE_PARAM(gso_test_func, gso_test_gen_params),
+ KUNIT_CASE_PARAM(ip_tunnel_flags_test_run,
+ ip_tunnel_flags_test_gen_params),
+ { },
+};
+
+static struct kunit_suite net_test_suite = {
+ .name = "net_core",
+ .test_cases = net_test_cases,
+};
+kunit_test_suite(net_test_suite);
+
+MODULE_DESCRIPTION("KUnit tests for networking core");
+MODULE_LICENSE("GPL");
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 5e4f04004a49..dff66d8fb325 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/core/netclassid_cgroup.c Classid Cgroupfs Handling
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Thomas Graf <tgraf@suug.ch>
*/
@@ -25,7 +21,9 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state
struct cgroup_cls_state *task_cls_state(struct task_struct *p)
{
return css_cls_state(task_css_check(p, net_cls_cgrp_id,
- rcu_read_lock_bh_held()));
+ rcu_read_lock_held() ||
+ rcu_read_lock_bh_held() ||
+ rcu_read_lock_trace_held()));
}
EXPORT_SYMBOL_GPL(task_cls_state);
@@ -57,30 +55,62 @@ static void cgrp_css_free(struct cgroup_subsys_state *css)
kfree(css_cls_state(css));
}
-static int update_classid_sock(const void *v, struct file *file, unsigned n)
+/*
+ * To avoid stalling socket creation for tasks with a large number of threads
+ * and open sockets, release file_lock after every 1000 iterated descriptors.
+ * Sockets created in the meantime will already carry the new classid.
+ */
+
+struct update_classid_context {
+ u32 classid;
+ unsigned int batch;
+};
+
+#define UPDATE_CLASSID_BATCH 1000
+
+static int update_classid_sock(const void *v, struct file *file, unsigned int n)
{
- int err;
- struct socket *sock = sock_from_file(file, &err);
-
- if (sock) {
- spin_lock(&cgroup_sk_update_lock);
- sock_cgroup_set_classid(&sock->sk->sk_cgrp_data,
- (unsigned long)v);
- spin_unlock(&cgroup_sk_update_lock);
+ struct update_classid_context *ctx = (void *)v;
+ struct socket *sock = sock_from_file(file);
+
+ if (sock)
+ sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid);
+ if (--ctx->batch == 0) {
+ ctx->batch = UPDATE_CLASSID_BATCH;
+ return n + 1;
}
return 0;
}
+static void update_classid_task(struct task_struct *p, u32 classid)
+{
+	struct update_classid_context ctx = {
+		.classid = classid,
+		.batch = UPDATE_CLASSID_BATCH
+	};
+	unsigned int fd = 0;
+
+	/* Only the thread-group leader is walked; any other thread returns
+	 * early so the descriptor traversal is not repeated for each thread.
+	 */
+	if (p != p->group_leader)
+		return;
+
+	do {
+		task_lock(p);
+		fd = iterate_fd(p->files, fd, update_classid_sock, &ctx);
+		task_unlock(p);
+		cond_resched();	/* task lock is dropped between batches */
+	} while (fd);	/* non-zero: a batch ended early, resume from that fd */
+}
+
static void cgrp_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
struct task_struct *p;
cgroup_taskset_for_each(p, css, tset) {
- task_lock(p);
- iterate_fd(p->files, 0, update_classid_sock,
- (void *)(unsigned long)css_cls_state(css)->classid);
- task_unlock(p);
+ update_classid_task(p, css_cls_state(css)->classid);
}
}
@@ -96,17 +126,11 @@ static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
struct css_task_iter it;
struct task_struct *p;
- cgroup_sk_alloc_disable();
-
cs->classid = (u32)value;
css_task_iter_start(css, 0, &it);
- while ((p = css_task_iter_next(&it))) {
- task_lock(p);
- iterate_fd(p->files, 0, update_classid_sock,
- (void *)(unsigned long)cs->classid);
- task_unlock(p);
- }
+ while ((p = css_task_iter_next(&it)))
+ update_classid_task(p, cs->classid);
css_task_iter_end(&it);
return 0;
diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c
new file mode 100644
index 000000000000..ba673e81716f
--- /dev/null
+++ b/net/core/netdev-genl-gen.c
@@ -0,0 +1,238 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/netdev.yaml */
+/* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "netdev-genl-gen.h"
+
+#include <uapi/linux/netdev.h>
+#include <net/netdev_netlink.h>
+
+/* Integer value ranges */
+static const struct netlink_range_validation netdev_a_page_pool_id_range = {
+ .min = 1ULL,
+ .max = U32_MAX,
+};
+
+static const struct netlink_range_validation netdev_a_page_pool_ifindex_range = {
+ .min = 1ULL,
+ .max = S32_MAX,
+};
+
+static const struct netlink_range_validation netdev_a_napi_defer_hard_irqs_range = {
+ .max = S32_MAX,
+};
+
+/* Common nested types */
+const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = {
+ [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range),
+ [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range),
+};
+
+const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = {
+ [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, },
+ [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1),
+};
+
+/* NETDEV_CMD_DEV_GET - do */
+static const struct nla_policy netdev_dev_get_nl_policy[NETDEV_A_DEV_IFINDEX + 1] = {
+ [NETDEV_A_DEV_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+/* NETDEV_CMD_PAGE_POOL_GET - do */
+#ifdef CONFIG_PAGE_POOL
+static const struct nla_policy netdev_page_pool_get_nl_policy[NETDEV_A_PAGE_POOL_ID + 1] = {
+ [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range),
+};
+#endif /* CONFIG_PAGE_POOL */
+
+/* NETDEV_CMD_PAGE_POOL_STATS_GET - do */
+#ifdef CONFIG_PAGE_POOL_STATS
+static const struct nla_policy netdev_page_pool_stats_get_nl_policy[NETDEV_A_PAGE_POOL_STATS_INFO + 1] = {
+ [NETDEV_A_PAGE_POOL_STATS_INFO] = NLA_POLICY_NESTED(netdev_page_pool_info_nl_policy),
+};
+#endif /* CONFIG_PAGE_POOL_STATS */
+
+/* NETDEV_CMD_QUEUE_GET - do */
+static const struct nla_policy netdev_queue_get_do_nl_policy[NETDEV_A_QUEUE_TYPE + 1] = {
+ [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1),
+ [NETDEV_A_QUEUE_ID] = { .type = NLA_U32, },
+};
+
+/* NETDEV_CMD_QUEUE_GET - dump */
+static const struct nla_policy netdev_queue_get_dump_nl_policy[NETDEV_A_QUEUE_IFINDEX + 1] = {
+ [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+/* NETDEV_CMD_NAPI_GET - do */
+static const struct nla_policy netdev_napi_get_do_nl_policy[NETDEV_A_NAPI_ID + 1] = {
+ [NETDEV_A_NAPI_ID] = { .type = NLA_U32, },
+};
+
+/* NETDEV_CMD_NAPI_GET - dump */
+static const struct nla_policy netdev_napi_get_dump_nl_policy[NETDEV_A_NAPI_IFINDEX + 1] = {
+ [NETDEV_A_NAPI_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+/* NETDEV_CMD_QSTATS_GET - dump */
+static const struct nla_policy netdev_qstats_get_nl_policy[NETDEV_A_QSTATS_SCOPE + 1] = {
+ [NETDEV_A_QSTATS_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_QSTATS_SCOPE] = NLA_POLICY_MASK(NLA_UINT, 0x1),
+};
+
+/* NETDEV_CMD_BIND_RX - do */
+static const struct nla_policy netdev_bind_rx_nl_policy[NETDEV_A_DMABUF_FD + 1] = {
+ [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
+ [NETDEV_A_DMABUF_QUEUES] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy),
+};
+
+/* NETDEV_CMD_NAPI_SET - do */
+static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED + 1] = {
+ [NETDEV_A_NAPI_ID] = { .type = NLA_U32, },
+ [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range),
+ [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, },
+ [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, },
+ [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 2),
+};
+
+/* NETDEV_CMD_BIND_TX - do */
+static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = {
+ [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+ [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, },
+};
+
+/* Ops table for netdev */
+static const struct genl_split_ops netdev_nl_ops[] = {
+ {
+ .cmd = NETDEV_CMD_DEV_GET,
+ .doit = netdev_nl_dev_get_doit,
+ .policy = netdev_dev_get_nl_policy,
+ .maxattr = NETDEV_A_DEV_IFINDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_DEV_GET,
+ .dumpit = netdev_nl_dev_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+#ifdef CONFIG_PAGE_POOL
+ {
+ .cmd = NETDEV_CMD_PAGE_POOL_GET,
+ .doit = netdev_nl_page_pool_get_doit,
+ .policy = netdev_page_pool_get_nl_policy,
+ .maxattr = NETDEV_A_PAGE_POOL_ID,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_PAGE_POOL_GET,
+ .dumpit = netdev_nl_page_pool_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+#endif /* CONFIG_PAGE_POOL */
+#ifdef CONFIG_PAGE_POOL_STATS
+ {
+ .cmd = NETDEV_CMD_PAGE_POOL_STATS_GET,
+ .doit = netdev_nl_page_pool_stats_get_doit,
+ .policy = netdev_page_pool_stats_get_nl_policy,
+ .maxattr = NETDEV_A_PAGE_POOL_STATS_INFO,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_PAGE_POOL_STATS_GET,
+ .dumpit = netdev_nl_page_pool_stats_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+#endif /* CONFIG_PAGE_POOL_STATS */
+ {
+ .cmd = NETDEV_CMD_QUEUE_GET,
+ .doit = netdev_nl_queue_get_doit,
+ .policy = netdev_queue_get_do_nl_policy,
+ .maxattr = NETDEV_A_QUEUE_TYPE,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_QUEUE_GET,
+ .dumpit = netdev_nl_queue_get_dumpit,
+ .policy = netdev_queue_get_dump_nl_policy,
+ .maxattr = NETDEV_A_QUEUE_IFINDEX,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = NETDEV_CMD_NAPI_GET,
+ .doit = netdev_nl_napi_get_doit,
+ .policy = netdev_napi_get_do_nl_policy,
+ .maxattr = NETDEV_A_NAPI_ID,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_NAPI_GET,
+ .dumpit = netdev_nl_napi_get_dumpit,
+ .policy = netdev_napi_get_dump_nl_policy,
+ .maxattr = NETDEV_A_NAPI_IFINDEX,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = NETDEV_CMD_QSTATS_GET,
+ .dumpit = netdev_nl_qstats_get_dumpit,
+ .policy = netdev_qstats_get_nl_policy,
+ .maxattr = NETDEV_A_QSTATS_SCOPE,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = NETDEV_CMD_BIND_RX,
+ .doit = netdev_nl_bind_rx_doit,
+ .policy = netdev_bind_rx_nl_policy,
+ .maxattr = NETDEV_A_DMABUF_FD,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_NAPI_SET,
+ .doit = netdev_nl_napi_set_doit,
+ .policy = netdev_napi_set_nl_policy,
+ .maxattr = NETDEV_A_NAPI_THREADED,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = NETDEV_CMD_BIND_TX,
+ .doit = netdev_nl_bind_tx_doit,
+ .policy = netdev_bind_tx_nl_policy,
+ .maxattr = NETDEV_A_DMABUF_FD,
+ .flags = GENL_CMD_CAP_DO,
+ },
+};
+
+static const struct genl_multicast_group netdev_nl_mcgrps[] = {
+ [NETDEV_NLGRP_MGMT] = { "mgmt", },
+ [NETDEV_NLGRP_PAGE_POOL] = { "page-pool", },
+};
+
+static void __netdev_nl_sock_priv_init(void *priv)
+{
+ netdev_nl_sock_priv_init(priv);
+}
+
+static void __netdev_nl_sock_priv_destroy(void *priv)
+{
+ netdev_nl_sock_priv_destroy(priv);
+}
+
+struct genl_family netdev_nl_family __ro_after_init = {
+ .name = NETDEV_FAMILY_NAME,
+ .version = NETDEV_FAMILY_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .split_ops = netdev_nl_ops,
+ .n_split_ops = ARRAY_SIZE(netdev_nl_ops),
+ .mcgrps = netdev_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(netdev_nl_mcgrps),
+ .sock_priv_size = sizeof(struct netdev_nl_sock),
+ .sock_priv_init = __netdev_nl_sock_priv_init,
+ .sock_priv_destroy = __netdev_nl_sock_priv_destroy,
+};
diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h
new file mode 100644
index 000000000000..cffc08517a41
--- /dev/null
+++ b/net/core/netdev-genl-gen.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/netdev.yaml */
+/* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#ifndef _LINUX_NETDEV_GEN_H
+#define _LINUX_NETDEV_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/netdev.h>
+#include <net/netdev_netlink.h>
+
+/* Common nested types */
+extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1];
+extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1];
+
+int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_queue_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info);
+int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info);
+
+enum {
+ NETDEV_NLGRP_MGMT,
+ NETDEV_NLGRP_PAGE_POOL,
+};
+
+extern struct genl_family netdev_nl_family;
+
+void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv);
+void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv);
+
+#endif /* _LINUX_NETDEV_GEN_H */
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
new file mode 100644
index 000000000000..470fabbeacd9
--- /dev/null
+++ b/net/core/netdev-genl.c
@@ -0,0 +1,1203 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <linux/rtnetlink.h>
+#include <net/busy_poll.h>
+#include <net/net_namespace.h>
+#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
+#include <net/sock.h>
+#include <net/xdp.h>
+#include <net/xdp_sock.h>
+#include <net/page_pool/memory_provider.h>
+
+#include "dev.h"
+#include "devmem.h"
+#include "netdev-genl-gen.h"
+
+struct netdev_nl_dump_ctx {
+ unsigned long ifindex;
+ unsigned int rxq_idx;
+ unsigned int txq_idx;
+ unsigned int napi_id;
+};
+
+static struct netdev_nl_dump_ctx *netdev_dump_ctx(struct netlink_callback *cb)
+{
+ NL_ASSERT_CTX_FITS(struct netdev_nl_dump_ctx);
+
+ return (struct netdev_nl_dump_ctx *)cb->ctx;
+}
+
+static int
+netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp,
+ const struct genl_info *info)
+{
+ u64 xsk_features = 0;
+ u64 xdp_rx_meta = 0;
+ void *hdr;
+
+ netdev_assert_locked(netdev); /* note: rtnl_lock may not be held! */
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+
+#define XDP_METADATA_KFUNC(_, flag, __, xmo) \
+ if (netdev->xdp_metadata_ops && netdev->xdp_metadata_ops->xmo) \
+ xdp_rx_meta |= flag;
+XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
+ if (netdev->xsk_tx_metadata_ops) {
+ if (netdev->xsk_tx_metadata_ops->tmo_fill_timestamp)
+ xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP;
+ if (netdev->xsk_tx_metadata_ops->tmo_request_checksum)
+ xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM;
+ if (netdev->xsk_tx_metadata_ops->tmo_request_launch_time)
+ xsk_features |= NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO;
+ }
+
+ if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
+ nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_FEATURES,
+ netdev->xdp_features, NETDEV_A_DEV_PAD) ||
+ nla_put_u64_64bit(rsp, NETDEV_A_DEV_XDP_RX_METADATA_FEATURES,
+ xdp_rx_meta, NETDEV_A_DEV_PAD) ||
+ nla_put_u64_64bit(rsp, NETDEV_A_DEV_XSK_FEATURES,
+ xsk_features, NETDEV_A_DEV_PAD))
+ goto err_cancel_msg;
+
+ if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) {
+ if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS,
+ netdev->xdp_zc_max_segs))
+ goto err_cancel_msg;
+ }
+
+ genlmsg_end(rsp, hdr);
+
+ return 0;
+
+err_cancel_msg:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+static void
+netdev_genl_dev_notify(struct net_device *netdev, int cmd)
+{
+ struct genl_info info;
+ struct sk_buff *ntf;
+
+ if (!genl_has_listeners(&netdev_nl_family, dev_net(netdev),
+ NETDEV_NLGRP_MGMT))
+ return;
+
+ genl_info_init_ntf(&info, &netdev_nl_family, cmd);
+
+ ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!ntf)
+ return;
+
+ if (netdev_nl_dev_fill(netdev, ntf, &info)) {
+ nlmsg_free(ntf);
+ return;
+ }
+
+ genlmsg_multicast_netns(&netdev_nl_family, dev_net(netdev), ntf,
+ 0, NETDEV_NLGRP_MGMT, GFP_KERNEL);
+}
+
+int netdev_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *netdev;
+ struct sk_buff *rsp;
+ u32 ifindex;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX))
+ return -EINVAL;
+
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (!netdev) {
+ err = -ENODEV;
+ goto err_free_msg;
+ }
+
+ err = netdev_nl_dev_fill(netdev, rsp, info);
+ netdev_unlock(netdev);
+
+ if (err)
+ goto err_free_msg;
+
+ return genlmsg_reply(rsp, info);
+
+err_free_msg:
+ nlmsg_free(rsp);
+ return err;
+}
+
+int netdev_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
+ struct net *net = sock_net(skb->sk);
+ int err;
+
+ for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) {
+ err = netdev_nl_dev_fill(netdev, skb, genl_info_dump(cb));
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+
+static int
+netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
+			const struct genl_info *info)
+{
+	unsigned long irq_suspend_timeout;
+	unsigned long gro_flush_timeout;
+	u32 napi_defer_hard_irqs;
+	void *hdr;
+	pid_t pid;
+
+	if (!napi->dev->up)
+		return 0;	/* device not up: emit nothing, still success */
+
+	hdr = genlmsg_iput(rsp, info);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id))
+		goto nla_put_failure;
+
+	if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex))
+		goto nla_put_failure;
+
+	if (napi->irq >= 0 && nla_put_u32(rsp, NETDEV_A_NAPI_IRQ, napi->irq))
+		goto nla_put_failure;
+
+	if (nla_put_uint(rsp, NETDEV_A_NAPI_THREADED,
+			 napi_get_threaded(napi)))
+		goto nla_put_failure;
+
+	if (napi->thread) {
+		pid = task_pid_nr(napi->thread);
+		if (nla_put_u32(rsp, NETDEV_A_NAPI_PID, pid))
+			goto nla_put_failure;
+	}
+
+	napi_defer_hard_irqs = napi_get_defer_hard_irqs(napi);
+	if (nla_put_u32(rsp, NETDEV_A_NAPI_DEFER_HARD_IRQS,
+			napi_defer_hard_irqs))	/* u32, as in the set policy */
+		goto nla_put_failure;
+
+	irq_suspend_timeout = napi_get_irq_suspend_timeout(napi);
+	if (nla_put_uint(rsp, NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT,
+			 irq_suspend_timeout))
+		goto nla_put_failure;
+
+	gro_flush_timeout = napi_get_gro_flush_timeout(napi);
+	if (nla_put_uint(rsp, NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT,
+			 gro_flush_timeout))
+		goto nla_put_failure;
+
+	genlmsg_end(rsp, hdr);
+
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(rsp, hdr);	/* drop the partially built message */
+	return -EMSGSIZE;
+}
+
+int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct napi_struct *napi;
+	struct sk_buff *rsp;
+	u32 napi_id;
+	int err;
+
+	if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
+		return -EINVAL;
+
+	napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);
+
+	rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!rsp)
+		return -ENOMEM;
+
+	napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id);
+	if (napi) {
+		err = netdev_nl_napi_fill_one(rsp, napi, info);
+		netdev_unlock(napi->dev);
+	} else {
+		NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
+		err = -ENOENT;
+	}
+
+	if (err) {
+		goto err_free_msg;
+	} else if (!rsp->len) {
+		err = -ENOENT;	/* fill helper wrote nothing: device is not up */
+		goto err_free_msg;
+	}
+
+	return genlmsg_reply(rsp, info);
+
+err_free_msg:
+	nlmsg_free(rsp);
+	return err;
+}
+
+static int
+netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp,
+			const struct genl_info *info,
+			struct netdev_nl_dump_ctx *ctx)
+{
+	struct napi_struct *napi;
+	unsigned int prev_id;
+	int err = 0;
+
+	if (!netdev->up)
+		return err;
+
+	prev_id = UINT_MAX;
+	list_for_each_entry(napi, &netdev->napi_list, dev_list) {
+		if (!napi_id_valid(napi->napi_id))
+			continue;
+
+		/* Dump continuation below relies on strictly descending IDs */
+		WARN_ON_ONCE(napi->napi_id >= prev_id);
+		prev_id = napi->napi_id;
+
+		if (ctx->napi_id && napi->napi_id >= ctx->napi_id)
+			continue;	/* already sent in an earlier pass */
+
+		err = netdev_nl_napi_fill_one(rsp, napi, info);
+		if (err)
+			return err;
+		ctx->napi_id = napi->napi_id;	/* resume point for next pass */
+	}
+	return err;
+}
+
+int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
+ const struct genl_info *info = genl_info_dump(cb);
+ struct net *net = sock_net(skb->sk);
+ struct net_device *netdev;
+ u32 ifindex = 0;
+ int err = 0;
+
+ if (info->attrs[NETDEV_A_NAPI_IFINDEX])
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]);
+
+ if (ifindex) {
+ netdev = netdev_get_by_index_lock(net, ifindex);
+ if (netdev) {
+ err = netdev_nl_napi_dump_one(netdev, skb, info, ctx);
+ netdev_unlock(netdev);
+ } else {
+ err = -ENODEV;
+ }
+ } else {
+ for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) {
+ err = netdev_nl_napi_dump_one(netdev, skb, info, ctx);
+ if (err < 0)
+ break;
+ ctx->napi_id = 0;
+ }
+ }
+
+ return err;
+}
+
+/*
+ * Apply each NAPI setting that is present in the request to @napi.
+ *
+ * Attributes are processed in a fixed order: threaded mode,
+ * defer-hard-irqs, irq-suspend-timeout, gro-flush-timeout. Only the
+ * threaded-mode setter can fail; if it does, its error is returned and
+ * the remaining attributes are not applied.
+ *
+ * Return: 0 on success or the error from napi_set_threaded().
+ */
+static int
+netdev_nl_napi_set_config(struct napi_struct *napi, struct genl_info *info)
+{
+	if (info->attrs[NETDEV_A_NAPI_THREADED]) {
+		u8 mode = nla_get_uint(info->attrs[NETDEV_A_NAPI_THREADED]);
+		int rc = napi_set_threaded(napi, mode);
+
+		if (rc)
+			return rc;
+	}
+
+	if (info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]) {
+		u32 defer_cnt;
+
+		defer_cnt = nla_get_u32(info->attrs[NETDEV_A_NAPI_DEFER_HARD_IRQS]);
+		napi_set_defer_hard_irqs(napi, defer_cnt);
+	}
+
+	if (info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]) {
+		u64 suspend_tmo;
+
+		suspend_tmo = nla_get_uint(info->attrs[NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT]);
+		napi_set_irq_suspend_timeout(napi, suspend_tmo);
+	}
+
+	if (info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]) {
+		u64 flush_tmo;
+
+		flush_tmo = nla_get_uint(info->attrs[NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT]);
+		napi_set_gro_flush_timeout(napi, flush_tmo);
+	}
+
+	return 0;
+}
+
+/*
+ * doit handler that updates the configuration of one NAPI instance.
+ *
+ * NETDEV_A_NAPI_ID is mandatory. The NAPI is looked up with
+ * netdev_napi_by_id_lock(); on success the owning device's lock is
+ * released here after the settings have been applied.
+ *
+ * Return: 0 on success, -EINVAL if the ID attribute is missing, -ENOENT
+ * if no such NAPI exists, or the error from netdev_nl_napi_set_config().
+ */
+int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct napi_struct *napi;
+	unsigned int id;
+	int ret;
+
+	if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_NAPI_ID))
+		return -EINVAL;
+
+	id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);
+	napi = netdev_napi_by_id_lock(genl_info_net(info), id);
+	if (!napi) {
+		NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
+		return -ENOENT;
+	}
+
+	ret = netdev_nl_napi_set_config(napi, info);
+	netdev_unlock(napi->dev);
+
+	return ret;
+}
+
+/*
+ * Add NETDEV_A_QUEUE_NAPI_ID to @skb when @napi exists and has a valid ID.
+ * A missing NAPI or an invalid ID is not an error: nothing is added and 0
+ * is returned.
+ */
+static int nla_put_napi_id(struct sk_buff *skb, const struct napi_struct *napi)
+{
+	if (!napi || !napi_id_valid(napi->napi_id))
+		return 0;
+
+	return nla_put_u32(skb, NETDEV_A_QUEUE_NAPI_ID, napi->napi_id);
+}
+
+/*
+ * Build one queue message in @rsp for queue @q_idx of type @q_type.
+ *
+ * The queue ID, type and ifindex are always emitted. RX queues add the
+ * NAPI ID (if valid), whatever the memory provider's nl_fill() emits (if
+ * a provider is installed) and, with CONFIG_XDP_SOCKETS, an empty XSK
+ * nest when a pool is attached. TX queues add the NAPI ID and the XSK
+ * nest under the same conditions. Any other @q_type gets only the common
+ * attributes.
+ *
+ * The caller must have validated @q_idx for @q_type.
+ *
+ * Return: 0 on success; -EMSGSIZE on any failure (including a non-zero
+ * nl_fill() result), in which case the partial message is cancelled.
+ */
+static int
+netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev,
+ u32 q_idx, u32 q_type, const struct genl_info *info)
+{
+ struct pp_memory_provider_params *params;
+ struct netdev_rx_queue *rxq;
+ struct netdev_queue *txq;
+ void *hdr;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx) ||
+ nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type) ||
+ nla_put_u32(rsp, NETDEV_A_QUEUE_IFINDEX, netdev->ifindex))
+ goto nla_put_failure;
+
+ switch (q_type) {
+ case NETDEV_QUEUE_TYPE_RX:
+ rxq = __netif_get_rx_queue(netdev, q_idx);
+ if (nla_put_napi_id(rsp, rxq->napi))
+ goto nla_put_failure;
+
+ params = &rxq->mp_params;
+ if (params->mp_ops &&
+ params->mp_ops->nl_fill(params->mp_priv, rsp, rxq))
+ goto nla_put_failure;
+#ifdef CONFIG_XDP_SOCKETS
+ if (rxq->pool)
+ if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK))
+ goto nla_put_failure;
+#endif
+
+ break;
+ case NETDEV_QUEUE_TYPE_TX:
+ txq = netdev_get_tx_queue(netdev, q_idx);
+ if (nla_put_napi_id(rsp, txq->napi))
+ goto nla_put_failure;
+#ifdef CONFIG_XDP_SOCKETS
+ if (txq->pool)
+ if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK))
+ goto nla_put_failure;
+#endif
+ break;
+ }
+
+ genlmsg_end(rsp, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+/*
+ * Check that @q_id is below the device's real queue count for @q_type.
+ *
+ * Return: 0 if the index is in range, -EINVAL if it is not. Queue types
+ * other than RX and TX are not range-checked and yield 0.
+ */
+static int netdev_nl_queue_validate(struct net_device *netdev, u32 q_id,
+				    u32 q_type)
+{
+	unsigned int nr_queues;
+
+	switch (q_type) {
+	case NETDEV_QUEUE_TYPE_RX:
+		nr_queues = netdev->real_num_rx_queues;
+		break;
+	case NETDEV_QUEUE_TYPE_TX:
+		nr_queues = netdev->real_num_tx_queues;
+		break;
+	default:
+		return 0;
+	}
+
+	return q_id < nr_queues ? 0 : -EINVAL;
+}
+
+/*
+ * Validate the request and fill a single queue message.
+ *
+ * Return: -ENOENT when the device is not up, the validation error for an
+ * out-of-range queue index, otherwise the result of
+ * netdev_nl_queue_fill_one().
+ */
+static int
+netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx,
+		     u32 q_type, const struct genl_info *info)
+{
+	int rc;
+
+	rc = netdev->up ? netdev_nl_queue_validate(netdev, q_idx, q_type)
+			: -ENOENT;
+	if (rc)
+		return rc;
+
+	return netdev_nl_queue_fill_one(rsp, netdev, q_idx, q_type, info);
+}
+
+/*
+ * doit handler returning information about a single queue.
+ *
+ * NETDEV_A_QUEUE_ID, NETDEV_A_QUEUE_TYPE and NETDEV_A_QUEUE_IFINDEX are
+ * all required. The reply skb is allocated before the device lookup and
+ * freed here on every error path.
+ *
+ * Return: result of genlmsg_reply() on success; -EINVAL for a missing
+ * attribute, -ENOMEM, -ENODEV for an unknown ifindex, or the error from
+ * netdev_nl_queue_fill().
+ */
+int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ u32 q_id, q_type, ifindex;
+ struct net_device *netdev;
+ struct sk_buff *rsp;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_ID) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX))
+ return -EINVAL;
+
+ q_id = nla_get_u32(info->attrs[NETDEV_A_QUEUE_ID]);
+ q_type = nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]);
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ netdev = netdev_get_by_index_lock_ops_compat(genl_info_net(info),
+ ifindex);
+ if (netdev) {
+ err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info);
+ netdev_unlock_ops_compat(netdev);
+ } else {
+ err = -ENODEV;
+ }
+
+ if (err)
+ goto err_free_msg;
+
+ return genlmsg_reply(rsp, info);
+
+err_free_msg:
+ nlmsg_free(rsp);
+ return err;
+}
+
+/*
+ * Dump all RX queues, then all TX queues, of @netdev into @rsp.
+ *
+ * Progress is kept in ctx->rxq_idx / ctx->txq_idx: on error the indices
+ * are left at the queue that failed, so a later call resumes there. The
+ * indices are not reset here; the caller does that once the device is
+ * done. A device that is not up contributes nothing.
+ *
+ * Return: 0 on success or the error from netdev_nl_queue_fill_one().
+ */
+static int
+netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp,
+ const struct genl_info *info,
+ struct netdev_nl_dump_ctx *ctx)
+{
+ int err = 0;
+
+ if (!netdev->up)
+ return err;
+
+ for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) {
+ err = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx,
+ NETDEV_QUEUE_TYPE_RX, info);
+ if (err)
+ return err;
+ }
+ for (; ctx->txq_idx < netdev->real_num_tx_queues; ctx->txq_idx++) {
+ err = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx,
+ NETDEV_QUEUE_TYPE_TX, info);
+ if (err)
+ return err;
+ }
+
+ return err;
+}
+
+/*
+ * Dump callback for queues.
+ *
+ * With a non-zero NETDEV_A_QUEUE_IFINDEX only that device is dumped
+ * (-ENODEV if the lookup fails); otherwise every device in the socket's
+ * netns is walked, resuming from ctx->ifindex.
+ *
+ * Return: 0, or the error that interrupted the dump.
+ */
+int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
+ const struct genl_info *info = genl_info_dump(cb);
+ struct net *net = sock_net(skb->sk);
+ struct net_device *netdev;
+ u32 ifindex = 0;
+ int err = 0;
+
+ if (info->attrs[NETDEV_A_QUEUE_IFINDEX])
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]);
+
+ if (ifindex) {
+ netdev = netdev_get_by_index_lock_ops_compat(net, ifindex);
+ if (netdev) {
+ err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
+ netdev_unlock_ops_compat(netdev);
+ } else {
+ err = -ENODEV;
+ }
+ } else {
+ for_each_netdev_lock_ops_compat_scoped(net, netdev,
+ ctx->ifindex) {
+ err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
+ if (err < 0)
+ break;
+ /* Device fully dumped: restart queue cursors for the next one */
+ ctx->rxq_idx = 0;
+ ctx->txq_idx = 0;
+ }
+ }
+
+ return err;
+}
+
+#define NETDEV_STAT_NOT_SET (~0ULL)
+
+/*
+ * Accumulate one stats struct into another, treating both as flat arrays
+ * of u64 counters spanning @size bytes.
+ *
+ * A slot is added only when both the source and the destination hold a
+ * real value; if either side is NETDEV_STAT_NOT_SET the destination slot
+ * is left untouched. @size is consumed in 8-byte steps, so it must be a
+ * multiple of 8.
+ */
+static void netdev_nl_stats_add(void *_sum, const void *_add, size_t size)
+{
+	const u64 *src = _add;
+	u64 *dst = _sum;
+
+	for (; size; size -= sizeof(u64), src++, dst++) {
+		if (*src == NETDEV_STAT_NOT_SET || *dst == NETDEV_STAT_NOT_SET)
+			continue;
+		*dst += *src;
+	}
+}
+
+/*
+ * Emit counter @value as attribute @attr_id, unless it was never set.
+ * A counter still equal to NETDEV_STAT_NOT_SET is omitted from the
+ * message and reported as success.
+ */
+static int netdev_stat_put(struct sk_buff *rsp, unsigned int attr_id, u64 value)
+{
+	if (value != NETDEV_STAT_NOT_SET)
+		return nla_put_uint(rsp, attr_id, value);
+
+	return 0;
+}
+
+/*
+ * Write every RX counter of @rx that has been set into @rsp.
+ * Counters left at NETDEV_STAT_NOT_SET are skipped by netdev_stat_put().
+ *
+ * Return: 0 on success, -EMSGSIZE as soon as one attribute fails to fit.
+ */
+static int
+netdev_nl_stats_write_rx(struct sk_buff *rsp, struct netdev_queue_stats_rx *rx)
+{
+ if (netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_PACKETS, rx->packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_BYTES, rx->bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_ALLOC_FAIL, rx->alloc_fail) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROPS, rx->hw_drops) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, rx->hw_drop_overruns) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_COMPLETE, rx->csum_complete) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, rx->csum_unnecessary) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_NONE, rx->csum_none) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_CSUM_BAD, rx->csum_bad) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, rx->hw_gro_packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_BYTES, rx->hw_gro_bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, rx->hw_gro_wire_packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, rx->hw_gro_wire_bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, rx->hw_drop_ratelimits))
+ return -EMSGSIZE;
+ return 0;
+}
+
+/*
+ * Write every TX counter of @tx that has been set into @rsp.
+ * Counters left at NETDEV_STAT_NOT_SET are skipped by netdev_stat_put().
+ *
+ * Return: 0 on success, -EMSGSIZE as soon as one attribute fails to fit.
+ */
+static int
+netdev_nl_stats_write_tx(struct sk_buff *rsp, struct netdev_queue_stats_tx *tx)
+{
+ if (netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_PACKETS, tx->packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_BYTES, tx->bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROPS, tx->hw_drops) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, tx->hw_drop_errors) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_CSUM_NONE, tx->csum_none) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_NEEDS_CSUM, tx->needs_csum) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, tx->hw_gso_packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_BYTES, tx->hw_gso_bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, tx->hw_gso_wire_packets) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, tx->hw_gso_wire_bytes) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, tx->hw_drop_ratelimits) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_STOP, tx->stop) ||
+ netdev_stat_put(rsp, NETDEV_A_QSTATS_TX_WAKE, tx->wake))
+ return -EMSGSIZE;
+ return 0;
+}
+
+/*
+ * Emit the stats message for queue @i of type @q_type.
+ *
+ * The stats struct is pre-filled with 0xff bytes so that any counter the
+ * driver does not write reads back as NETDEV_STAT_NOT_SET. If the driver
+ * wrote nothing at all the message is cancelled and 0 is returned, so the
+ * queue is simply absent from the dump.
+ *
+ * The caller must ensure the matching get_queue_stats_{rx,tx} op exists.
+ *
+ * Return: 0 on success or when the queue is skipped, -EMSGSIZE when the
+ * message does not fit (the partial message is cancelled).
+ */
+static int
+netdev_nl_stats_queue(struct net_device *netdev, struct sk_buff *rsp,
+ u32 q_type, int i, const struct genl_info *info)
+{
+ const struct netdev_stat_ops *ops = netdev->stat_ops;
+ struct netdev_queue_stats_rx rx;
+ struct netdev_queue_stats_tx tx;
+ void *hdr;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+ if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex) ||
+ nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_TYPE, q_type) ||
+ nla_put_u32(rsp, NETDEV_A_QSTATS_QUEUE_ID, i))
+ goto nla_put_failure;
+
+ switch (q_type) {
+ case NETDEV_QUEUE_TYPE_RX:
+ memset(&rx, 0xff, sizeof(rx));
+ ops->get_queue_stats_rx(netdev, i, &rx);
+ /* Every byte still 0xff: driver reported nothing for this queue */
+ if (!memchr_inv(&rx, 0xff, sizeof(rx)))
+ goto nla_cancel;
+ if (netdev_nl_stats_write_rx(rsp, &rx))
+ goto nla_put_failure;
+ break;
+ case NETDEV_QUEUE_TYPE_TX:
+ memset(&tx, 0xff, sizeof(tx));
+ ops->get_queue_stats_tx(netdev, i, &tx);
+ /* Every byte still 0xff: driver reported nothing for this queue */
+ if (!memchr_inv(&tx, 0xff, sizeof(tx)))
+ goto nla_cancel;
+ if (netdev_nl_stats_write_tx(rsp, &tx))
+ goto nla_put_failure;
+ break;
+ }
+
+ genlmsg_end(rsp, hdr);
+ return 0;
+
+nla_cancel:
+ genlmsg_cancel(rsp, hdr);
+ return 0;
+nla_put_failure:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+/*
+ * Dump per-queue stats of @netdev: all RX queues first, then all TX queues.
+ *
+ * A direction is skipped when the driver lacks the corresponding
+ * get_queue_stats_{rx,tx} op. ctx->rxq_idx / ctx->txq_idx are advanced
+ * after each queue that was emitted, so an interrupted dump resumes at
+ * the queue that failed. Both cursors are reset to 0 once the device has
+ * been fully dumped. Devices without IFF_UP are skipped.
+ *
+ * Return: 0 on success or the error from netdev_nl_stats_queue().
+ */
+static int
+netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp,
+ const struct genl_info *info,
+ struct netdev_nl_dump_ctx *ctx)
+{
+ const struct netdev_stat_ops *ops = netdev->stat_ops;
+ int i, err;
+
+ if (!(netdev->flags & IFF_UP))
+ return 0;
+
+ i = ctx->rxq_idx;
+ while (ops->get_queue_stats_rx && i < netdev->real_num_rx_queues) {
+ err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_RX,
+ i, info);
+ if (err)
+ return err;
+ ctx->rxq_idx = ++i;
+ }
+ i = ctx->txq_idx;
+ while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) {
+ err = netdev_nl_stats_queue(netdev, rsp, NETDEV_QUEUE_TYPE_TX,
+ i, info);
+ if (err)
+ return err;
+ ctx->txq_idx = ++i;
+ }
+
+ ctx->rxq_idx = 0;
+ ctx->txq_idx = 0;
+ return 0;
+}
+
+/**
+ * netdev_stat_queue_sum() - add up queue stats from range of queues
+ * @netdev: net_device
+ * @rx_start: index of the first Rx queue to query
+ * @rx_end: index after the last Rx queue (first *not* to query)
+ * @rx_sum: output Rx stats, should be already initialized
+ * @tx_start: index of the first Tx queue to query
+ * @tx_end: index after the last Tx queue (first *not* to query)
+ * @tx_sum: output Tx stats, should be already initialized
+ *
+ * Add stats from [start, end) range of queue IDs to *x_sum structs.
+ * The sum structs must be already initialized. Usually this
+ * helper is invoked from the .get_base_stats callbacks of drivers
+ * to account for stats of disabled queues. In that case the ranges
+ * are usually [netdev->real_num_*x_queues, netdev->num_*x_queues).
+ */
+void netdev_stat_queue_sum(struct net_device *netdev,
+ int rx_start, int rx_end,
+ struct netdev_queue_stats_rx *rx_sum,
+ int tx_start, int tx_end,
+ struct netdev_queue_stats_tx *tx_sum)
+{
+ const struct netdev_stat_ops *ops;
+ struct netdev_queue_stats_rx rx;
+ struct netdev_queue_stats_tx tx;
+ int i;
+
+ ops = netdev->stat_ops;
+
+ for (i = rx_start; i < rx_end; i++) {
+ /* 0xff fill == NETDEV_STAT_NOT_SET in every counter the op skips */
+ memset(&rx, 0xff, sizeof(rx));
+ if (ops->get_queue_stats_rx)
+ ops->get_queue_stats_rx(netdev, i, &rx);
+ /* Unset counters (on either side) are ignored by the add helper */
+ netdev_nl_stats_add(rx_sum, &rx, sizeof(rx));
+ }
+ for (i = tx_start; i < tx_end; i++) {
+ /* 0xff fill == NETDEV_STAT_NOT_SET in every counter the op skips */
+ memset(&tx, 0xff, sizeof(tx));
+ if (ops->get_queue_stats_tx)
+ ops->get_queue_stats_tx(netdev, i, &tx);
+ /* Unset counters (on either side) are ignored by the add helper */
+ netdev_nl_stats_add(tx_sum, &tx, sizeof(tx));
+ }
+}
+EXPORT_SYMBOL(netdev_stat_queue_sum);
+
+/*
+ * Emit one device-wide stats message for @netdev.
+ *
+ * The totals start from the driver's get_base_stats() values and then add
+ * the per-queue stats of queues [0, real_num_{rx,tx}_queues). Counters
+ * that get_base_stats() leaves unset stay unset, because
+ * netdev_nl_stats_add() never adds into a NETDEV_STAT_NOT_SET slot.
+ *
+ * Nothing is emitted (return 0) when the driver has no get_base_stats()
+ * op or when that op reports no counters at all.
+ *
+ * Return: 0 on success or when skipped, -EMSGSIZE when the message does
+ * not fit (the partial message is cancelled).
+ */
+static int
+netdev_nl_stats_by_netdev(struct net_device *netdev, struct sk_buff *rsp,
+ const struct genl_info *info)
+{
+ struct netdev_queue_stats_rx rx_sum;
+ struct netdev_queue_stats_tx tx_sum;
+ void *hdr;
+
+ /* Netdev can't guarantee any complete counters */
+ if (!netdev->stat_ops->get_base_stats)
+ return 0;
+
+ memset(&rx_sum, 0xff, sizeof(rx_sum));
+ memset(&tx_sum, 0xff, sizeof(tx_sum));
+
+ netdev->stat_ops->get_base_stats(netdev, &rx_sum, &tx_sum);
+
+ /* The op was there, but nothing reported, don't bother */
+ if (!memchr_inv(&rx_sum, 0xff, sizeof(rx_sum)) &&
+ !memchr_inv(&tx_sum, 0xff, sizeof(tx_sum)))
+ return 0;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+ if (nla_put_u32(rsp, NETDEV_A_QSTATS_IFINDEX, netdev->ifindex))
+ goto nla_put_failure;
+
+ netdev_stat_queue_sum(netdev, 0, netdev->real_num_rx_queues, &rx_sum,
+ 0, netdev->real_num_tx_queues, &tx_sum);
+
+ if (netdev_nl_stats_write_rx(rsp, &rx_sum) ||
+ netdev_nl_stats_write_tx(rsp, &tx_sum))
+ goto nla_put_failure;
+
+ genlmsg_end(rsp, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+/*
+ * Dump qstats for one device at the requested @scope.
+ *
+ * Scope 0 produces a single device-wide message; NETDEV_QSTATS_SCOPE_QUEUE
+ * produces one message per queue. Devices without stat_ops are silently
+ * skipped.
+ *
+ * Return: 0 or the error from the selected dumper; -EINVAL for an unknown
+ * scope.
+ */
+static int
+netdev_nl_qstats_get_dump_one(struct net_device *netdev, unsigned int scope,
+			      struct sk_buff *skb, const struct genl_info *info,
+			      struct netdev_nl_dump_ctx *ctx)
+{
+	if (!netdev->stat_ops)
+		return 0;
+
+	if (scope == 0)
+		return netdev_nl_stats_by_netdev(netdev, skb, info);
+
+	if (scope == NETDEV_QSTATS_SCOPE_QUEUE)
+		return netdev_nl_stats_by_queue(netdev, skb, info, ctx);
+
+	return -EINVAL;	/* Should not happen, per netlink policy */
+}
+
+/*
+ * Dump callback for queue statistics.
+ *
+ * NETDEV_A_QSTATS_SCOPE selects device-wide (0, the default) or per-queue
+ * output. With a non-zero NETDEV_A_QSTATS_IFINDEX only that device is
+ * dumped, and a missing device (-ENODEV) or a device without stat_ops
+ * (-EOPNOTSUPP) is reported as an error against the ifindex attribute.
+ * Without an ifindex all devices in the netns are walked, and devices
+ * lacking stat_ops are skipped without error.
+ *
+ * Return: 0, or the error that interrupted the dump.
+ */
+int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct netdev_nl_dump_ctx *ctx = netdev_dump_ctx(cb);
+ const struct genl_info *info = genl_info_dump(cb);
+ struct net *net = sock_net(skb->sk);
+ struct net_device *netdev;
+ unsigned int ifindex;
+ unsigned int scope;
+ int err = 0;
+
+ scope = 0;
+ if (info->attrs[NETDEV_A_QSTATS_SCOPE])
+ scope = nla_get_uint(info->attrs[NETDEV_A_QSTATS_SCOPE]);
+
+ ifindex = 0;
+ if (info->attrs[NETDEV_A_QSTATS_IFINDEX])
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_QSTATS_IFINDEX]);
+
+ if (ifindex) {
+ netdev = netdev_get_by_index_lock_ops_compat(net, ifindex);
+ if (!netdev) {
+ NL_SET_BAD_ATTR(info->extack,
+ info->attrs[NETDEV_A_QSTATS_IFINDEX]);
+ return -ENODEV;
+ }
+ if (netdev->stat_ops) {
+ err = netdev_nl_qstats_get_dump_one(netdev, scope, skb,
+ info, ctx);
+ } else {
+ NL_SET_BAD_ATTR(info->extack,
+ info->attrs[NETDEV_A_QSTATS_IFINDEX]);
+ err = -EOPNOTSUPP;
+ }
+ netdev_unlock_ops_compat(netdev);
+ return err;
+ }
+
+ for_each_netdev_lock_ops_compat_scoped(net, netdev, ctx->ifindex) {
+ err = netdev_nl_qstats_get_dump_one(netdev, scope, skb,
+ info, ctx);
+ if (err < 0)
+ break;
+ }
+
+ return err;
+}
+
+/*
+ * Collect the RX queue indices listed in the request into @rxq_bitmap.
+ *
+ * Every NETDEV_A_DMABUF_QUEUES attribute in the message is parsed as a
+ * nest against netdev_queue_id_nl_policy. Each nest must carry a queue ID
+ * and a queue type, the type must be RX, and the ID must be below
+ * @rxq_bitmap_len (the number of bits in @rxq_bitmap).
+ *
+ * Return: 0 on success, the nla_parse_nested() error, or -EINVAL for a
+ * missing attribute, non-RX type or out-of-range ID (extack points at
+ * the offending attribute).
+ */
+static int netdev_nl_read_rxq_bitmap(struct genl_info *info,
+ u32 rxq_bitmap_len,
+ unsigned long *rxq_bitmap)
+{
+ const int maxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1;
+ struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
+ struct nlattr *attr;
+ int rem, err = 0;
+ u32 rxq_idx;
+
+ nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES,
+ genlmsg_data(info->genlhdr),
+ genlmsg_len(info->genlhdr), rem) {
+ err = nla_parse_nested(tb, maxtype, attr,
+ netdev_queue_id_nl_policy, info->extack);
+ if (err < 0)
+ return err;
+
+ if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) ||
+ NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE))
+ return -EINVAL;
+
+ if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
+ NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]);
+ return -EINVAL;
+ }
+
+ rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]);
+ if (rxq_idx >= rxq_bitmap_len) {
+ NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_ID]);
+ return -EINVAL;
+ }
+
+ bitmap_set(rxq_bitmap, rxq_idx, 1);
+ }
+
+ return 0;
+}
+
+/*
+ * Find the DMA device shared by all RX queues selected in @rxq_bitmap.
+ *
+ * Return: the DMA device of the last selected queue (NULL if the bitmap
+ * is empty or that queue has no usable DMA device), or
+ * ERR_PTR(-EOPNOTSUPP) with an extack message when two queues report
+ * different devices.
+ *
+ * NOTE(review): the mismatch check only fires once a non-NULL device has
+ * been seen, so a NULL result followed by a valid device is not flagged.
+ * Confirm that is intended.
+ */
+static struct device *
+netdev_nl_get_dma_dev(struct net_device *netdev, unsigned long *rxq_bitmap,
+ struct netlink_ext_ack *extack)
+{
+ struct device *dma_dev = NULL;
+ u32 rxq_idx, prev_rxq_idx;
+
+ for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) {
+ struct device *rxq_dma_dev;
+
+ rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx);
+ /* prev_rxq_idx is valid here: dma_dev != NULL implies a prior iteration */
+ if (dma_dev && rxq_dma_dev != dma_dev) {
+ NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)",
+ rxq_idx, prev_rxq_idx);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ dma_dev = rxq_dma_dev;
+ prev_rxq_idx = rxq_idx;
+ }
+
+ return dma_dev;
+}
+
+/*
+ * doit handler that binds a dma-buf to a set of RX queues.
+ *
+ * Required attributes: NETDEV_A_DEV_IFINDEX, NETDEV_A_DMABUF_FD and at
+ * least one NETDEV_A_DMABUF_QUEUES nest. The binding is owned by the
+ * requesting socket's private data (@priv), whose lock is held across
+ * the whole operation; the netdev lock is taken inside it.
+ *
+ * On success the reply carries NETDEV_A_DMABUF_ID. If anything fails
+ * after the binding was created - including building or sending the
+ * reply - the binding is torn down again.
+ *
+ * Return: 0 on success or a negative errno.
+ */
+int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net_devmem_dmabuf_binding *binding;
+	u32 ifindex, dmabuf_fd, rxq_idx;
+	struct netdev_nl_sock *priv;
+	struct net_device *netdev;
+	unsigned long *rxq_bitmap;
+	struct device *dma_dev;
+	struct sk_buff *rsp;
+	int err = 0;
+	void *hdr;
+
+	if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
+	    GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD) ||
+	    GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_QUEUES))
+		return -EINVAL;
+
+	ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+	dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
+
+	priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk);
+	if (IS_ERR(priv))
+		return PTR_ERR(priv);
+
+	/* Prepare the reply up front so allocation failure costs nothing */
+	rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!rsp)
+		return -ENOMEM;
+
+	hdr = genlmsg_iput(rsp, info);
+	if (!hdr) {
+		err = -EMSGSIZE;
+		goto err_genlmsg_free;
+	}
+
+	mutex_lock(&priv->lock);
+
+	err = 0;
+	netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+	if (!netdev) {
+		err = -ENODEV;
+		goto err_unlock_sock;
+	}
+	if (!netif_device_present(netdev))
+		err = -ENODEV;
+	else if (!netdev_need_ops_lock(netdev))
+		err = -EOPNOTSUPP;
+	if (err) {
+		NL_SET_BAD_ATTR(info->extack,
+				info->attrs[NETDEV_A_DEV_IFINDEX]);
+		goto err_unlock;
+	}
+
+	rxq_bitmap = bitmap_zalloc(netdev->real_num_rx_queues, GFP_KERNEL);
+	if (!rxq_bitmap) {
+		err = -ENOMEM;
+		goto err_unlock;
+	}
+
+	err = netdev_nl_read_rxq_bitmap(info, netdev->real_num_rx_queues,
+					rxq_bitmap);
+	if (err)
+		goto err_rxq_bitmap;
+
+	dma_dev = netdev_nl_get_dma_dev(netdev, rxq_bitmap, info->extack);
+	if (IS_ERR(dma_dev)) {
+		err = PTR_ERR(dma_dev);
+		goto err_rxq_bitmap;
+	}
+
+	binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE,
+					 dmabuf_fd, priv, info->extack);
+	if (IS_ERR(binding)) {
+		err = PTR_ERR(binding);
+		goto err_rxq_bitmap;
+	}
+
+	for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) {
+		err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding,
+						      info->extack);
+		if (err)
+			goto err_unbind;
+	}
+
+	/* A reply without the binding ID would be useless to user space */
+	if (nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id)) {
+		err = -EMSGSIZE;
+		goto err_unbind;
+	}
+	genlmsg_end(rsp, hdr);
+
+	err = genlmsg_reply(rsp, info);
+	if (err) {
+		/*
+		 * The netlink layer consumes the skb even when delivery
+		 * fails, so it must not be freed a second time below.
+		 * nlmsg_free(NULL) is a no-op.
+		 */
+		rsp = NULL;
+		goto err_unbind;
+	}
+
+	bitmap_free(rxq_bitmap);
+
+	netdev_unlock(netdev);
+
+	mutex_unlock(&priv->lock);
+
+	return 0;
+
+err_unbind:
+	net_devmem_unbind_dmabuf(binding);
+err_rxq_bitmap:
+	bitmap_free(rxq_bitmap);
+err_unlock:
+	netdev_unlock(netdev);
+err_unlock_sock:
+	mutex_unlock(&priv->lock);
+err_genlmsg_free:
+	nlmsg_free(rsp);
+	return err;
+}
+
+/*
+ * doit handler that binds a dma-buf to a device for transmission.
+ *
+ * Required attributes: NETDEV_A_DEV_IFINDEX and NETDEV_A_DMABUF_FD. The
+ * device must be present and have netmem_tx set. The DMA device is taken
+ * from queue 0. The socket-private lock is held across the operation and
+ * the netdev lock is taken inside it; both are dropped before the reply
+ * is sent.
+ *
+ * Return: result of genlmsg_reply() on success, otherwise a negative
+ * errno (the reply skb is freed on those paths).
+ *
+ * NOTE(review): the nla_put_u32() result is not checked, and the binding
+ * is not undone here if genlmsg_reply() fails - confirm both are
+ * intended.
+ */
+int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct netdev_nl_sock *priv;
+ struct net_device *netdev;
+ struct device *dma_dev;
+ u32 ifindex, dmabuf_fd;
+ struct sk_buff *rsp;
+ int err = 0;
+ void *hdr;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
+ GENL_REQ_ATTR_CHECK(info, NETDEV_A_DMABUF_FD))
+ return -EINVAL;
+
+ ifindex = nla_get_u32(info->attrs[NETDEV_A_DEV_IFINDEX]);
+ dmabuf_fd = nla_get_u32(info->attrs[NETDEV_A_DMABUF_FD]);
+
+ priv = genl_sk_priv_get(&netdev_nl_family, NETLINK_CB(skb).sk);
+ if (IS_ERR(priv))
+ return PTR_ERR(priv);
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto err_genlmsg_free;
+ }
+
+ mutex_lock(&priv->lock);
+
+ netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (!netdev) {
+ err = -ENODEV;
+ goto err_unlock_sock;
+ }
+
+ if (!netif_device_present(netdev)) {
+ err = -ENODEV;
+ goto err_unlock_netdev;
+ }
+
+ if (!netdev->netmem_tx) {
+ err = -EOPNOTSUPP;
+ NL_SET_ERR_MSG(info->extack,
+ "Driver does not support netmem TX");
+ goto err_unlock_netdev;
+ }
+
+ dma_dev = netdev_queue_get_dma_dev(netdev, 0);
+ binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE,
+ dmabuf_fd, priv, info->extack);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ goto err_unlock_netdev;
+ }
+
+ nla_put_u32(rsp, NETDEV_A_DMABUF_ID, binding->id);
+ genlmsg_end(rsp, hdr);
+
+ netdev_unlock(netdev);
+ mutex_unlock(&priv->lock);
+
+ return genlmsg_reply(rsp, info);
+
+err_unlock_netdev:
+ netdev_unlock(netdev);
+err_unlock_sock:
+ mutex_unlock(&priv->lock);
+err_genlmsg_free:
+ nlmsg_free(rsp);
+ return err;
+}
+
+/*
+ * Initialise the per-socket private state: the lock and the (empty) list
+ * of dma-buf bindings it protects.
+ */
+void netdev_nl_sock_priv_init(struct netdev_nl_sock *priv)
+{
+	mutex_init(&priv->lock);
+	INIT_LIST_HEAD(&priv->bindings);
+}
+
+/*
+ * Tear down every dma-buf binding still owned by the socket.
+ *
+ * For each binding, binding->dev is sampled under binding->lock. If the
+ * binding no longer has a device it is unbound directly. Otherwise a
+ * reference on the device is taken before binding->lock is dropped, so
+ * the device stays valid while the netdev lock is acquired and the
+ * binding is unbound under it.
+ */
+void netdev_nl_sock_priv_destroy(struct netdev_nl_sock *priv)
+{
+ struct net_devmem_dmabuf_binding *binding;
+ struct net_devmem_dmabuf_binding *temp;
+ netdevice_tracker dev_tracker;
+ struct net_device *dev;
+
+ mutex_lock(&priv->lock);
+ list_for_each_entry_safe(binding, temp, &priv->bindings, list) {
+ mutex_lock(&binding->lock);
+ dev = binding->dev;
+ if (!dev) {
+ mutex_unlock(&binding->lock);
+ net_devmem_unbind_dmabuf(binding);
+ continue;
+ }
+ /* Pin the device before dropping binding->lock */
+ netdev_hold(dev, &dev_tracker, GFP_KERNEL);
+ mutex_unlock(&binding->lock);
+
+ netdev_lock(dev);
+ net_devmem_unbind_dmabuf(binding);
+ netdev_unlock(dev);
+ netdev_put(dev, &dev_tracker);
+ }
+ mutex_unlock(&priv->lock);
+}
+
+/*
+ * Netdevice notifier: translate device lifecycle events into netdev
+ * genetlink notifications (ADD on register, DEL on unregister, CHANGE on
+ * XDP feature change). Other events are ignored.
+ *
+ * The register path upgrades the lock with netdev_lock_ops_to_full() and
+ * restores it afterwards; the unregister path takes the netdev lock
+ * itself; the XDP feature change path takes no lock here.
+ *
+ * Return: always NOTIFY_OK.
+ */
+static int netdev_genl_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ netdev_lock_ops_to_full(netdev);
+ netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_ADD_NTF);
+ netdev_unlock_full_to_ops(netdev);
+ break;
+ case NETDEV_UNREGISTER:
+ netdev_lock(netdev);
+ netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_DEL_NTF);
+ netdev_unlock(netdev);
+ break;
+ case NETDEV_XDP_FEAT_CHANGE:
+ netdev_genl_dev_notify(netdev, NETDEV_CMD_DEV_CHANGE_NTF);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+/* Notifier block that routes netdevice events to netdev_genl_netdevice_event() */
+static struct notifier_block netdev_genl_nb = {
+ .notifier_call = netdev_genl_netdevice_event,
+};
+
+/*
+ * Register the netdevice notifier and then the netdev genetlink family.
+ * If the family cannot be registered the notifier is removed again, so
+ * failure leaves nothing behind.
+ *
+ * Return: 0 on success or the error of the step that failed.
+ */
+static int __init netdev_genl_init(void)
+{
+	int rc;
+
+	rc = register_netdevice_notifier(&netdev_genl_nb);
+	if (rc)
+		return rc;
+
+	rc = genl_register_family(&netdev_nl_family);
+	if (rc)
+		unregister_netdevice_notifier(&netdev_genl_nb);
+
+	return rc;
+}
+
+subsys_initcall(netdev_genl_init);
diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c
new file mode 100644
index 000000000000..251f27a8307f
--- /dev/null
+++ b/net/core/netdev_queues.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <net/netdev_queues.h>
+
+/**
+ * netdev_queue_get_dma_dev() - get dma device for zero-copy operations
+ * @dev: net_device
+ * @idx: queue index
+ *
+ * Get dma device for zero-copy operations to be used for this queue.
+ * When such device is not available or valid, the function will return NULL.
+ *
+ * Return: Device or NULL on error
+ */
+struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx)
+{
+	const struct netdev_queue_mgmt_ops *ops = dev->queue_mgmt_ops;
+	/* Default to the parent device unless the driver knows better */
+	struct device *candidate = dev->dev.parent;
+
+	if (ops && ops->ndo_queue_get_dma_dev)
+		candidate = ops->ndo_queue_get_dma_dev(dev, idx);
+
+	/* A device without a DMA mask is not usable for DMA mappings */
+	if (!candidate || !candidate->dma_mask)
+		return NULL;
+
+	return candidate;
+}
+
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
new file mode 100644
index 000000000000..c7d9341b7630
--- /dev/null
+++ b/net/core/netdev_rx_queue.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/ethtool_netlink.h>
+#include <linux/netdevice.h>
+#include <net/netdev_lock.h>
+#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
+#include <net/page_pool/memory_provider.h>
+
+#include "page_pool_priv.h"
+
+/* See also page_pool_is_unreadable() */
+bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx)
+{
+	/* True exactly when a memory provider is installed on RX queue @idx */
+	return __netif_get_rx_queue(dev, idx)->mp_params.mp_ops != NULL;
+}
+EXPORT_SYMBOL(netif_rxq_has_unreadable_mp);
+
+/*
+ * Restart RX queue @rxq_idx of @dev with freshly allocated queue memory.
+ *
+ * Requires the driver to implement all four queue management ops and the
+ * caller to hold the netdev lock. Two scratch buffers of
+ * ndo_queue_mem_size bytes are used: new_mem receives the newly allocated
+ * queue state, and old_mem receives the state of the queue being stopped.
+ *
+ * If the device is running, the queue is stopped and restarted on
+ * new_mem, and the old memory is freed. If it is not running, the two
+ * pointers are swapped so that the freshly allocated memory is the one
+ * freed and the queue itself is left untouched.
+ *
+ * Return: 0 on success, -EOPNOTSUPP if an op is missing, -ENOMEM, or the
+ * error from the driver / memory-provider check.
+ */
+int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
+{
+ struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx);
+ const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops;
+ void *new_mem, *old_mem;
+ int err;
+
+ if (!qops || !qops->ndo_queue_stop || !qops->ndo_queue_mem_free ||
+ !qops->ndo_queue_mem_alloc || !qops->ndo_queue_start)
+ return -EOPNOTSUPP;
+
+ netdev_assert_locked(dev);
+
+ new_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL);
+ if (!new_mem)
+ return -ENOMEM;
+
+ old_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL);
+ if (!old_mem) {
+ err = -ENOMEM;
+ goto err_free_new_mem;
+ }
+
+ err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx);
+ if (err)
+ goto err_free_old_mem;
+
+ err = page_pool_check_memory_provider(dev, rxq);
+ if (err)
+ goto err_free_new_queue_mem;
+
+ if (netif_running(dev)) {
+ err = qops->ndo_queue_stop(dev, old_mem, rxq_idx);
+ if (err)
+ goto err_free_new_queue_mem;
+
+ err = qops->ndo_queue_start(dev, new_mem, rxq_idx);
+ if (err)
+ goto err_start_queue;
+ } else {
+ /* Queue not live: make the mem_free below release the new memory */
+ swap(new_mem, old_mem);
+ }
+
+ qops->ndo_queue_mem_free(dev, old_mem);
+
+ kvfree(old_mem);
+ kvfree(new_mem);
+
+ return 0;
+
+err_start_queue:
+ /* Restarting the queue with old_mem should be successful as we haven't
+ * changed any of the queue configuration, and there is not much we can
+ * do to recover from a failure here.
+ *
+ * WARN if we fail to recover the old rx queue, and at least free
+ * old_mem so we don't also leak that.
+ */
+ if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) {
+ WARN(1,
+ "Failed to restart old queue in error path. RX queue %d may be unhealthy.",
+ rxq_idx);
+ qops->ndo_queue_mem_free(dev, old_mem);
+ }
+
+err_free_new_queue_mem:
+ qops->ndo_queue_mem_free(dev, new_mem);
+
+err_free_old_mem:
+ kvfree(old_mem);
+
+err_free_new_mem:
+ kvfree(new_mem);
+
+ return err;
+}
+EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL");
+
+/*
+ * Install memory provider @p on RX queue @rxq_idx and restart the queue.
+ *
+ * Preconditions checked here, each with its own extack message: the
+ * index is in range, header-data split is enabled with a zero threshold,
+ * no XDP program is attached, and the queue is not already bound to a
+ * memory provider or (with CONFIG_XDP_SOCKETS) to an AF_XDP pool.
+ * Devices that do not use the ops lock are rejected with -EOPNOTSUPP.
+ *
+ * If the restart fails, the provider is removed from the queue again.
+ *
+ * NOTE(review): netdev_rx_queue_restart() asserts the netdev lock, so
+ * callers are presumably expected to hold it. Confirm.
+ *
+ * Return: 0 on success or a negative errno.
+ */
+int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+ const struct pp_memory_provider_params *p,
+ struct netlink_ext_ack *extack)
+{
+ struct netdev_rx_queue *rxq;
+ int ret;
+
+ if (!netdev_need_ops_lock(dev))
+ return -EOPNOTSUPP;
+
+ if (rxq_idx >= dev->real_num_rx_queues) {
+ NL_SET_ERR_MSG(extack, "rx queue index out of range");
+ return -ERANGE;
+ }
+ /* Clamp the index under speculation as well as architecturally */
+ rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues);
+
+ if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) {
+ NL_SET_ERR_MSG(extack, "tcp-data-split is disabled");
+ return -EINVAL;
+ }
+ if (dev->cfg->hds_thresh) {
+ NL_SET_ERR_MSG(extack, "hds-thresh is not zero");
+ return -EINVAL;
+ }
+ if (dev_xdp_prog_count(dev)) {
+ NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached");
+ return -EEXIST;
+ }
+
+ rxq = __netif_get_rx_queue(dev, rxq_idx);
+ if (rxq->mp_params.mp_ops) {
+ NL_SET_ERR_MSG(extack, "designated queue already memory provider bound");
+ return -EEXIST;
+ }
+#ifdef CONFIG_XDP_SOCKETS
+ if (rxq->pool) {
+ NL_SET_ERR_MSG(extack, "designated queue already in use by AF_XDP");
+ return -EBUSY;
+ }
+#endif
+
+ rxq->mp_params = *p;
+ ret = netdev_rx_queue_restart(dev, rxq_idx);
+ if (ret) {
+ /* Roll back so the queue does not appear bound */
+ rxq->mp_params.mp_ops = NULL;
+ rxq->mp_params.mp_priv = NULL;
+ }
+ return ret;
+}
+
+/*
+ * Locked wrapper around __net_mp_open_rxq(): takes the netdev lock for
+ * the duration of the call and passes no extack.
+ *
+ * Return: the result of __net_mp_open_rxq().
+ */
+int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
+ struct pp_memory_provider_params *p)
+{
+ int ret;
+
+ netdev_lock(dev);
+ ret = __net_mp_open_rxq(dev, rxq_idx, p, NULL);
+ netdev_unlock(dev);
+ return ret;
+}
+
+/*
+ * Remove memory provider @old_p from RX queue @ifq_idx and restart it.
+ *
+ * Warns and does nothing if the index is out of range or if the queue's
+ * current provider does not match @old_p. A restart failure other than
+ * -ENETDOWN triggers a WARN; the provider stays cleared either way.
+ */
+void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
+ const struct pp_memory_provider_params *old_p)
+{
+ struct netdev_rx_queue *rxq;
+ int err;
+
+ if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues))
+ return;
+
+ rxq = __netif_get_rx_queue(dev, ifq_idx);
+
+ /* Callers holding a netdev ref may get here after we already
+ * went thru shutdown via dev_memory_provider_uninstall().
+ */
+ if (dev->reg_state > NETREG_REGISTERED &&
+ !rxq->mp_params.mp_ops)
+ return;
+
+ if (WARN_ON_ONCE(rxq->mp_params.mp_ops != old_p->mp_ops ||
+ rxq->mp_params.mp_priv != old_p->mp_priv))
+ return;
+
+ rxq->mp_params.mp_ops = NULL;
+ rxq->mp_params.mp_priv = NULL;
+ err = netdev_rx_queue_restart(dev, ifq_idx);
+ WARN_ON(err && err != -ENETDOWN);
+}
+
+/*
+ * Locked wrapper around __net_mp_close_rxq(): takes the netdev lock for
+ * the duration of the call.
+ */
+void net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
+		      struct pp_memory_provider_params *old_p)
+{
+	netdev_lock(dev);
+	__net_mp_close_rxq(dev, ifq_idx, old_p);
+	netdev_unlock(dev);
+}
diff --git a/net/core/netevent.c b/net/core/netevent.c
index 8b3bc4fac613..5bb615e963cc 100644
--- a/net/core/netevent.c
+++ b/net/core/netevent.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Network event notifiers
*
@@ -5,11 +6,6 @@
* Tom Tucker <tom@opengridcomputing.com>
* Steve Wise <swise@opengridcomputing.com>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Fixes:
*/
@@ -36,7 +32,7 @@ int register_netevent_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(register_netevent_notifier);
/**
- * netevent_unregister_notifier - unregister a netevent notifier block
+ * unregister_netevent_notifier - unregister a netevent notifier block
* @nb: notifier
*
* Unregister a notifier previously registered by
diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h
new file mode 100644
index 000000000000..23175cb2bd86
--- /dev/null
+++ b/net/core/netmem_priv.h
@@ -0,0 +1,62 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __NETMEM_PRIV_H
+#define __NETMEM_PRIV_H
+
+/* Return the pp_magic word of @netmem with the DMA index bits masked out. */
+static inline unsigned long netmem_get_pp_magic(netmem_ref netmem)
+{
+ return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK;
+}
+
+/* OR @pp_magic into the pp_magic word of @netmem; existing bits are kept. */
+static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic)
+{
+ netmem_to_nmdesc(netmem)->pp_magic |= pp_magic;
+}
+
+/*
+ * Zero the pp_magic word of @netmem. Warns once if DMA index bits are
+ * still set at that point; the word is cleared regardless.
+ */
+static inline void netmem_clear_pp_magic(netmem_ref netmem)
+{
+ WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK);
+
+ netmem_to_nmdesc(netmem)->pp_magic = 0;
+}
+
+/*
+ * Tell whether @netmem carries the page pool signature, i.e. whether its
+ * pp_magic (ignoring DMA index bits) matches PP_SIGNATURE under
+ * PP_MAGIC_MASK.
+ */
+static inline bool netmem_is_pp(netmem_ref netmem)
+{
+	unsigned long signature = netmem_get_pp_magic(netmem) & PP_MAGIC_MASK;
+
+	return signature == PP_SIGNATURE;
+}
+
+/* Record @pool in the pp field of @netmem's descriptor. */
+static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool)
+{
+ netmem_to_nmdesc(netmem)->pp = pool;
+}
+
+/* Store @dma_addr in the dma_addr field of @netmem's descriptor. */
+static inline void netmem_set_dma_addr(netmem_ref netmem,
+ unsigned long dma_addr)
+{
+ netmem_to_nmdesc(netmem)->dma_addr = dma_addr;
+}
+
+/*
+ * Extract the DMA index stored in the pp_magic word of @netmem.
+ * Not valid for net_iov-backed netmem: warns once and returns 0 there.
+ */
+static inline unsigned long netmem_get_dma_index(netmem_ref netmem)
+{
+ unsigned long magic;
+
+ if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
+ return 0;
+
+ magic = netmem_to_nmdesc(netmem)->pp_magic;
+
+ return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT;
+}
+
+/*
+ * Store DMA index @id in the pp_magic word of @netmem, replacing any
+ * previous index while keeping the remaining magic bits.
+ * Not valid for net_iov-backed netmem: warns once and does nothing there.
+ */
+static inline void netmem_set_dma_index(netmem_ref netmem,
+ unsigned long id)
+{
+ unsigned long magic;
+
+ if (WARN_ON_ONCE(netmem_is_net_iov(netmem)))
+ return;
+
+ /* netmem_get_pp_magic() already strips the old index bits */
+ magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT);
+ netmem_to_nmdesc(netmem)->pp_magic = magic;
+}
+#endif
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index de1d1ba92f2d..09f72f10813c 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Common framework for low-level network console, dump, and debugger code
*
@@ -33,8 +34,9 @@
#include <net/addrconf.h>
#include <net/ndisc.h>
#include <net/ip6_checksum.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <trace/events/napi.h>
+#include <linux/kconfig.h>
/*
* We maintain a small pool of fully-sized skbs, to make sure the
@@ -43,11 +45,6 @@
#define MAX_UDP_CHUNK 1460
#define MAX_SKBS 32
-
-static struct sk_buff_head skb_pool;
-
-DEFINE_STATIC_SRCU(netpoll_srcu);
-
#define USEC_PER_POLL 50
#define MAX_SKB_SIZE \
@@ -57,22 +54,15 @@ DEFINE_STATIC_SRCU(netpoll_srcu);
MAX_UDP_CHUNK)
static void zap_completion_queue(void);
-static void netpoll_async_cleanup(struct work_struct *work);
static unsigned int carrier_timeout = 4;
module_param(carrier_timeout, uint, 0644);
-#define np_info(np, fmt, ...) \
- pr_info("%s: " fmt, np->name, ##__VA_ARGS__)
-#define np_err(np, fmt, ...) \
- pr_err("%s: " fmt, np->name, ##__VA_ARGS__)
-#define np_notice(np, fmt, ...) \
- pr_notice("%s: " fmt, np->name, ##__VA_ARGS__)
-
-static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev,
- struct netdev_queue *txq)
+static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb,
+ struct net_device *dev,
+ struct netdev_queue *txq)
{
- int status = NETDEV_TX_OK;
+ netdev_tx_t status = NETDEV_TX_OK;
netdev_features_t features;
features = netif_skb_features(skb);
@@ -122,7 +112,7 @@ static void queue_process(struct work_struct *work)
txq = netdev_get_tx_queue(dev, q_index);
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (netif_xmit_frozen_or_stopped(txq) ||
- netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) {
+ !dev_xmit_complete(netpoll_start_xmit(skb, dev, txq))) {
skb_queue_head(&npinfo->txq, skb);
HARD_TX_UNLOCK(dev, txq);
local_irq_restore(flags);
@@ -135,6 +125,20 @@ static void queue_process(struct work_struct *work)
}
}
+static int netif_local_xmit_active(struct net_device *dev)
+{
+ int i;
+
+ for (i = 0; i < dev->num_tx_queues; i++) {
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+
+ if (READ_ONCE(txq->xmit_lock_owner) == smp_processor_id())
+ return 1;
+ }
+
+ return 0;
+}
+
static void poll_one_napi(struct napi_struct *napi)
{
int work;
@@ -146,11 +150,11 @@ static void poll_one_napi(struct napi_struct *napi)
if (test_and_set_bit(NAPI_STATE_NPSVC, &napi->state))
return;
- /* We explicilty pass the polling call a budget of 0 to
+ /* We explicitly pass the polling call a budget of 0 to
* indicate that we are clearing the Tx path only.
*/
work = napi->poll(napi, 0);
- WARN_ONCE(work, "%pF exceeded budget in poll\n", napi->poll);
+ WARN_ONCE(work, "%pS exceeded budget in poll\n", napi->poll);
trace_napi_poll(napi, work, 0);
clear_bit(NAPI_STATE_NPSVC, &napi->state);
@@ -161,7 +165,7 @@ static void poll_napi(struct net_device *dev)
struct napi_struct *napi;
int cpu = smp_processor_id();
- list_for_each_entry(napi, &dev->napi_list, dev_list) {
+ list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
if (cmpxchg(&napi->poll_owner, -1, cpu) == -1) {
poll_one_napi(napi);
smp_store_release(&napi->poll_owner, -1);
@@ -181,7 +185,10 @@ void netpoll_poll_dev(struct net_device *dev)
if (!ni || down_trylock(&ni->dev_lock))
return;
- if (!netif_running(dev)) {
+ /* Some drivers will take the same locks in poll and xmit,
+ * we can't poll if local CPU is already in xmit.
+ */
+ if (!netif_running(dev) || netif_local_xmit_active(dev)) {
up(&ni->dev_lock);
return;
}
@@ -201,41 +208,36 @@ EXPORT_SYMBOL(netpoll_poll_dev);
void netpoll_poll_disable(struct net_device *dev)
{
struct netpoll_info *ni;
- int idx;
+
might_sleep();
- idx = srcu_read_lock(&netpoll_srcu);
- ni = srcu_dereference(dev->npinfo, &netpoll_srcu);
+ ni = rtnl_dereference(dev->npinfo);
if (ni)
down(&ni->dev_lock);
- srcu_read_unlock(&netpoll_srcu, idx);
}
-EXPORT_SYMBOL(netpoll_poll_disable);
void netpoll_poll_enable(struct net_device *dev)
{
struct netpoll_info *ni;
- rcu_read_lock();
- ni = rcu_dereference(dev->npinfo);
+
+ ni = rtnl_dereference(dev->npinfo);
if (ni)
up(&ni->dev_lock);
- rcu_read_unlock();
}
-EXPORT_SYMBOL(netpoll_poll_enable);
-static void refill_skbs(void)
+static void refill_skbs(struct netpoll *np)
{
+ struct sk_buff_head *skb_pool;
struct sk_buff *skb;
- unsigned long flags;
- spin_lock_irqsave(&skb_pool.lock, flags);
- while (skb_pool.qlen < MAX_SKBS) {
+ skb_pool = &np->skb_pool;
+
+ while (READ_ONCE(skb_pool->qlen) < MAX_SKBS) {
skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC);
if (!skb)
break;
- __skb_queue_tail(&skb_pool, skb);
+ skb_queue_tail(skb_pool, skb);
}
- spin_unlock_irqrestore(&skb_pool.lock, flags);
}
static void zap_completion_queue(void)
@@ -272,12 +274,13 @@ static struct sk_buff *find_skb(struct netpoll *np, int len, int reserve)
struct sk_buff *skb;
zap_completion_queue();
- refill_skbs();
repeat:
skb = alloc_skb(len, GFP_ATOMIC);
- if (!skb)
- skb = skb_dequeue(&skb_pool);
+ if (!skb) {
+ skb = skb_dequeue(&np->skb_pool);
+ schedule_work(&np->refill_wq);
+ }
if (!skb) {
if (++count < 10) {
@@ -296,36 +299,39 @@ static int netpoll_owner_active(struct net_device *dev)
{
struct napi_struct *napi;
- list_for_each_entry(napi, &dev->napi_list, dev_list) {
- if (napi->poll_owner == smp_processor_id())
+ list_for_each_entry_rcu(napi, &dev->napi_list, dev_list) {
+ if (READ_ONCE(napi->poll_owner) == smp_processor_id())
return 1;
}
return 0;
}
/* call with IRQ disabled */
-void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
- struct net_device *dev)
+static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
{
- int status = NETDEV_TX_BUSY;
+ netdev_tx_t status = NETDEV_TX_BUSY;
+ netdev_tx_t ret = NET_XMIT_DROP;
+ struct net_device *dev;
unsigned long tries;
/* It is up to the caller to keep npinfo alive. */
struct netpoll_info *npinfo;
- rcu_read_lock_bh();
lockdep_assert_irqs_disabled();
- npinfo = rcu_dereference_bh(np->dev->npinfo);
+ dev = np->dev;
+ rcu_read_lock();
+ npinfo = rcu_dereference_bh(dev->npinfo);
+
if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
dev_kfree_skb_irq(skb);
- return;
+ goto out;
}
/* don't get messages out of order, and no recursion */
if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
struct netdev_queue *txq;
- txq = netdev_pick_tx(dev, skb, NULL);
+ txq = netdev_core_pick_tx(dev, skb, NULL);
/* try until next clock tick */
for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
@@ -336,7 +342,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
HARD_TX_UNLOCK(dev, txq);
- if (status == NETDEV_TX_OK)
+ if (dev_xmit_complete(status))
break;
}
@@ -348,240 +354,198 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
}
WARN_ONCE(!irqs_disabled(),
- "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n",
+ "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pS)\n",
dev->name, dev->netdev_ops->ndo_start_xmit);
}
- if (status != NETDEV_TX_OK) {
+ if (!dev_xmit_complete(status)) {
skb_queue_tail(&npinfo->txq, skb);
schedule_delayed_work(&npinfo->tx_work,0);
}
- rcu_read_unlock_bh();
+ ret = NETDEV_TX_OK;
+out:
+ rcu_read_unlock();
+ return ret;
}
-EXPORT_SYMBOL(netpoll_send_skb_on_dev);
-void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
+static void netpoll_udp_checksum(struct netpoll *np, struct sk_buff *skb,
+ int len)
{
- int total_len, ip_len, udp_len;
- struct sk_buff *skb;
struct udphdr *udph;
- struct iphdr *iph;
- struct ethhdr *eth;
- static atomic_t ip_ident;
- struct ipv6hdr *ip6h;
+ int udp_len;
- WARN_ON_ONCE(!irqs_disabled());
+ udp_len = len + sizeof(struct udphdr);
+ udph = udp_hdr(skb);
- udp_len = len + sizeof(*udph);
+ /* check needs to be set, since it will be consumed in csum_partial */
+ udph->check = 0;
if (np->ipv6)
- ip_len = udp_len + sizeof(*ip6h);
+ udph->check = csum_ipv6_magic(&np->local_ip.in6,
+ &np->remote_ip.in6,
+ udp_len, IPPROTO_UDP,
+ csum_partial(udph, udp_len, 0));
else
- ip_len = udp_len + sizeof(*iph);
+ udph->check = csum_tcpudp_magic(np->local_ip.ip,
+ np->remote_ip.ip,
+ udp_len, IPPROTO_UDP,
+ csum_partial(udph, udp_len, 0));
+ if (udph->check == 0)
+ udph->check = CSUM_MANGLED_0;
+}
- total_len = ip_len + LL_RESERVED_SPACE(np->dev);
+netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
+{
+ unsigned long flags;
+ netdev_tx_t ret;
- skb = find_skb(np, total_len + np->dev->needed_tailroom,
- total_len - len);
- if (!skb)
- return;
+ if (unlikely(!np)) {
+ dev_kfree_skb_irq(skb);
+ ret = NET_XMIT_DROP;
+ } else {
+ local_irq_save(flags);
+ ret = __netpoll_send_skb(np, skb);
+ local_irq_restore(flags);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(netpoll_send_skb);
- skb_copy_to_linear_data(skb, msg, len);
- skb_put(skb, len);
+static void push_ipv6(struct netpoll *np, struct sk_buff *skb, int len)
+{
+ struct ipv6hdr *ip6h;
+
+ skb_push(skb, sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ ip6h = ipv6_hdr(skb);
+
+ /* ip6h->version = 6; ip6h->priority = 0; */
+ *(unsigned char *)ip6h = 0x60;
+ ip6h->flow_lbl[0] = 0;
+ ip6h->flow_lbl[1] = 0;
+ ip6h->flow_lbl[2] = 0;
+
+ ip6h->payload_len = htons(sizeof(struct udphdr) + len);
+ ip6h->nexthdr = IPPROTO_UDP;
+ ip6h->hop_limit = 32;
+ ip6h->saddr = np->local_ip.in6;
+ ip6h->daddr = np->remote_ip.in6;
- skb_push(skb, sizeof(*udph));
+ skb->protocol = htons(ETH_P_IPV6);
+}
+
+static void push_ipv4(struct netpoll *np, struct sk_buff *skb, int len)
+{
+ static atomic_t ip_ident;
+ struct iphdr *iph;
+ int ip_len;
+
+ ip_len = len + sizeof(struct udphdr) + sizeof(struct iphdr);
+
+ skb_push(skb, sizeof(struct iphdr));
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+
+ /* iph->version = 4; iph->ihl = 5; */
+ *(unsigned char *)iph = 0x45;
+ iph->tos = 0;
+ put_unaligned(htons(ip_len), &iph->tot_len);
+ iph->id = htons(atomic_inc_return(&ip_ident));
+ iph->frag_off = 0;
+ iph->ttl = 64;
+ iph->protocol = IPPROTO_UDP;
+ iph->check = 0;
+ put_unaligned(np->local_ip.ip, &iph->saddr);
+ put_unaligned(np->remote_ip.ip, &iph->daddr);
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+ skb->protocol = htons(ETH_P_IP);
+}
+
+static void push_udp(struct netpoll *np, struct sk_buff *skb, int len)
+{
+ struct udphdr *udph;
+ int udp_len;
+
+ udp_len = len + sizeof(struct udphdr);
+
+ skb_push(skb, sizeof(struct udphdr));
skb_reset_transport_header(skb);
+
udph = udp_hdr(skb);
udph->source = htons(np->local_port);
udph->dest = htons(np->remote_port);
udph->len = htons(udp_len);
- if (np->ipv6) {
- udph->check = 0;
- udph->check = csum_ipv6_magic(&np->local_ip.in6,
- &np->remote_ip.in6,
- udp_len, IPPROTO_UDP,
- csum_partial(udph, udp_len, 0));
- if (udph->check == 0)
- udph->check = CSUM_MANGLED_0;
-
- skb_push(skb, sizeof(*ip6h));
- skb_reset_network_header(skb);
- ip6h = ipv6_hdr(skb);
-
- /* ip6h->version = 6; ip6h->priority = 0; */
- put_unaligned(0x60, (unsigned char *)ip6h);
- ip6h->flow_lbl[0] = 0;
- ip6h->flow_lbl[1] = 0;
- ip6h->flow_lbl[2] = 0;
-
- ip6h->payload_len = htons(sizeof(struct udphdr) + len);
- ip6h->nexthdr = IPPROTO_UDP;
- ip6h->hop_limit = 32;
- ip6h->saddr = np->local_ip.in6;
- ip6h->daddr = np->remote_ip.in6;
-
- eth = skb_push(skb, ETH_HLEN);
- skb_reset_mac_header(skb);
- skb->protocol = eth->h_proto = htons(ETH_P_IPV6);
- } else {
- udph->check = 0;
- udph->check = csum_tcpudp_magic(np->local_ip.ip,
- np->remote_ip.ip,
- udp_len, IPPROTO_UDP,
- csum_partial(udph, udp_len, 0));
- if (udph->check == 0)
- udph->check = CSUM_MANGLED_0;
-
- skb_push(skb, sizeof(*iph));
- skb_reset_network_header(skb);
- iph = ip_hdr(skb);
-
- /* iph->version = 4; iph->ihl = 5; */
- put_unaligned(0x45, (unsigned char *)iph);
- iph->tos = 0;
- put_unaligned(htons(ip_len), &(iph->tot_len));
- iph->id = htons(atomic_inc_return(&ip_ident));
- iph->frag_off = 0;
- iph->ttl = 64;
- iph->protocol = IPPROTO_UDP;
- iph->check = 0;
- put_unaligned(np->local_ip.ip, &(iph->saddr));
- put_unaligned(np->remote_ip.ip, &(iph->daddr));
- iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
-
- eth = skb_push(skb, ETH_HLEN);
- skb_reset_mac_header(skb);
- skb->protocol = eth->h_proto = htons(ETH_P_IP);
- }
+ netpoll_udp_checksum(np, skb, len);
+}
+static void push_eth(struct netpoll *np, struct sk_buff *skb)
+{
+ struct ethhdr *eth;
+
+ eth = skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
ether_addr_copy(eth->h_source, np->dev->dev_addr);
ether_addr_copy(eth->h_dest, np->remote_mac);
-
- skb->dev = np->dev;
-
- netpoll_send_skb(np, skb);
+ if (np->ipv6)
+ eth->h_proto = htons(ETH_P_IPV6);
+ else
+ eth->h_proto = htons(ETH_P_IP);
}
-EXPORT_SYMBOL(netpoll_send_udp);
-void netpoll_print_options(struct netpoll *np)
+int netpoll_send_udp(struct netpoll *np, const char *msg, int len)
{
- np_info(np, "local port %d\n", np->local_port);
+ int total_len, ip_len, udp_len;
+ struct sk_buff *skb;
+
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ WARN_ON_ONCE(!irqs_disabled());
+
+ udp_len = len + sizeof(struct udphdr);
if (np->ipv6)
- np_info(np, "local IPv6 address %pI6c\n", &np->local_ip.in6);
+ ip_len = udp_len + sizeof(struct ipv6hdr);
else
- np_info(np, "local IPv4 address %pI4\n", &np->local_ip.ip);
- np_info(np, "interface '%s'\n", np->dev_name);
- np_info(np, "remote port %d\n", np->remote_port);
+ ip_len = udp_len + sizeof(struct iphdr);
+
+ total_len = ip_len + LL_RESERVED_SPACE(np->dev);
+
+ skb = find_skb(np, total_len + np->dev->needed_tailroom,
+ total_len - len);
+ if (!skb)
+ return -ENOMEM;
+
+ skb_copy_to_linear_data(skb, msg, len);
+ skb_put(skb, len);
+
+ push_udp(np, skb, len);
if (np->ipv6)
- np_info(np, "remote IPv6 address %pI6c\n", &np->remote_ip.in6);
+ push_ipv6(np, skb, len);
else
- np_info(np, "remote IPv4 address %pI4\n", &np->remote_ip.ip);
- np_info(np, "remote ethernet address %pM\n", np->remote_mac);
+ push_ipv4(np, skb, len);
+ push_eth(np, skb);
+ skb->dev = np->dev;
+
+ return (int)netpoll_send_skb(np, skb);
}
-EXPORT_SYMBOL(netpoll_print_options);
+EXPORT_SYMBOL(netpoll_send_udp);
-static int netpoll_parse_ip_addr(const char *str, union inet_addr *addr)
+
+static void skb_pool_flush(struct netpoll *np)
{
- const char *end;
+ struct sk_buff_head *skb_pool;
- if (!strchr(str, ':') &&
- in4_pton(str, -1, (void *)addr, -1, &end) > 0) {
- if (!*end)
- return 0;
- }
- if (in6_pton(str, -1, addr->in6.s6_addr, -1, &end) > 0) {
-#if IS_ENABLED(CONFIG_IPV6)
- if (!*end)
- return 1;
-#else
- return -1;
-#endif
- }
- return -1;
+ cancel_work_sync(&np->refill_wq);
+ skb_pool = &np->skb_pool;
+ skb_queue_purge_reason(skb_pool, SKB_CONSUMED);
}
-int netpoll_parse_options(struct netpoll *np, char *opt)
+static void refill_skbs_work_handler(struct work_struct *work)
{
- char *cur=opt, *delim;
- int ipv6;
- bool ipversion_set = false;
+ struct netpoll *np =
+ container_of(work, struct netpoll, refill_wq);
- if (*cur != '@') {
- if ((delim = strchr(cur, '@')) == NULL)
- goto parse_failed;
- *delim = 0;
- if (kstrtou16(cur, 10, &np->local_port))
- goto parse_failed;
- cur = delim;
- }
- cur++;
-
- if (*cur != '/') {
- ipversion_set = true;
- if ((delim = strchr(cur, '/')) == NULL)
- goto parse_failed;
- *delim = 0;
- ipv6 = netpoll_parse_ip_addr(cur, &np->local_ip);
- if (ipv6 < 0)
- goto parse_failed;
- else
- np->ipv6 = (bool)ipv6;
- cur = delim;
- }
- cur++;
-
- if (*cur != ',') {
- /* parse out dev name */
- if ((delim = strchr(cur, ',')) == NULL)
- goto parse_failed;
- *delim = 0;
- strlcpy(np->dev_name, cur, sizeof(np->dev_name));
- cur = delim;
- }
- cur++;
-
- if (*cur != '@') {
- /* dst port */
- if ((delim = strchr(cur, '@')) == NULL)
- goto parse_failed;
- *delim = 0;
- if (*cur == ' ' || *cur == '\t')
- np_info(np, "warning: whitespace is not allowed\n");
- if (kstrtou16(cur, 10, &np->remote_port))
- goto parse_failed;
- cur = delim;
- }
- cur++;
-
- /* dst ip */
- if ((delim = strchr(cur, '/')) == NULL)
- goto parse_failed;
- *delim = 0;
- ipv6 = netpoll_parse_ip_addr(cur, &np->remote_ip);
- if (ipv6 < 0)
- goto parse_failed;
- else if (ipversion_set && np->ipv6 != (bool)ipv6)
- goto parse_failed;
- else
- np->ipv6 = (bool)ipv6;
- cur = delim + 1;
-
- if (*cur != 0) {
- /* MAC address */
- if (!mac_pton(cur, np->remote_mac))
- goto parse_failed;
- }
-
- netpoll_print_options(np);
-
- return 0;
-
- parse_failed:
- np_info(np, "couldn't parse config at '%s'!\n", cur);
- return -1;
+ refill_skbs(np);
}
-EXPORT_SYMBOL(netpoll_parse_options);
int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
{
@@ -589,18 +553,18 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
const struct net_device_ops *ops;
int err;
- np->dev = ndev;
- strlcpy(np->dev_name, ndev->name, IFNAMSIZ);
- INIT_WORK(&np->cleanup_work, netpoll_async_cleanup);
+ skb_queue_head_init(&np->skb_pool);
+ INIT_WORK(&np->refill_wq, refill_skbs_work_handler);
if (ndev->priv_flags & IFF_DISABLE_NETPOLL) {
np_err(np, "%s doesn't support polling, aborting\n",
- np->dev_name);
+ ndev->name);
err = -ENOTSUPP;
goto out;
}
- if (!ndev->npinfo) {
+ npinfo = rtnl_dereference(ndev->npinfo);
+ if (!npinfo) {
npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
if (!npinfo) {
err = -ENOMEM;
@@ -613,18 +577,21 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
refcount_set(&npinfo->refcnt, 1);
- ops = np->dev->netdev_ops;
+ ops = ndev->netdev_ops;
if (ops->ndo_netpoll_setup) {
- err = ops->ndo_netpoll_setup(ndev, npinfo);
+ err = ops->ndo_netpoll_setup(ndev);
if (err)
goto free_npinfo;
}
} else {
- npinfo = rtnl_dereference(ndev->npinfo);
refcount_inc(&npinfo->refcnt);
}
- npinfo->netpoll = np;
+ np->dev = ndev;
+ strscpy(np->dev_name, ndev->name, IFNAMSIZ);
+
+ /* fill up the skb queue */
+ refill_skbs(np);
/* last thing to do is link it to the net device structure */
rcu_assign_pointer(ndev->npinfo, npinfo);
@@ -638,137 +605,187 @@ out:
}
EXPORT_SYMBOL_GPL(__netpoll_setup);
+/*
+ * Returns a pointer to a string representation of the identifier used
+ * to select the egress interface for the given netpoll instance. buf
+ * must be a buffer of length at least MAC_ADDR_STR_LEN + 1.
+ */
+static char *egress_dev(struct netpoll *np, char *buf)
+{
+ if (np->dev_name[0])
+ return np->dev_name;
+
+ snprintf(buf, MAC_ADDR_STR_LEN, "%pM", np->dev_mac);
+ return buf;
+}
+
+static void netpoll_wait_carrier(struct netpoll *np, struct net_device *ndev,
+ unsigned int timeout)
+{
+ unsigned long atmost;
+
+ atmost = jiffies + timeout * HZ;
+ while (!netif_carrier_ok(ndev)) {
+ if (time_after(jiffies, atmost)) {
+ np_notice(np, "timeout waiting for carrier\n");
+ break;
+ }
+ msleep(1);
+ }
+}
+
+/*
+ * Take the IPv6 from ndev and populate local_ip structure in netpoll
+ */
+static int netpoll_take_ipv6(struct netpoll *np, struct net_device *ndev)
+{
+ char buf[MAC_ADDR_STR_LEN + 1];
+ int err = -EDESTADDRREQ;
+ struct inet6_dev *idev;
+
+ if (!IS_ENABLED(CONFIG_IPV6)) {
+ np_err(np, "IPv6 is not supported %s, aborting\n",
+ egress_dev(np, buf));
+ return -EINVAL;
+ }
+
+ idev = __in6_dev_get(ndev);
+ if (idev) {
+ struct inet6_ifaddr *ifp;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ if (!!(ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL) !=
+ !!(ipv6_addr_type(&np->remote_ip.in6) & IPV6_ADDR_LINKLOCAL))
+ continue;
+ /* Got the IP, let's return */
+ np->local_ip.in6 = ifp->addr;
+ err = 0;
+ break;
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ if (err) {
+ np_err(np, "no IPv6 address for %s, aborting\n",
+ egress_dev(np, buf));
+ return err;
+ }
+
+ np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6);
+ return 0;
+}
+
+/*
+ * Take the IPv4 from ndev and populate local_ip structure in netpoll
+ */
+static int netpoll_take_ipv4(struct netpoll *np, struct net_device *ndev)
+{
+ char buf[MAC_ADDR_STR_LEN + 1];
+ const struct in_ifaddr *ifa;
+ struct in_device *in_dev;
+
+ in_dev = __in_dev_get_rtnl(ndev);
+ if (!in_dev) {
+ np_err(np, "no IP address for %s, aborting\n",
+ egress_dev(np, buf));
+ return -EDESTADDRREQ;
+ }
+
+ ifa = rtnl_dereference(in_dev->ifa_list);
+ if (!ifa) {
+ np_err(np, "no IP address for %s, aborting\n",
+ egress_dev(np, buf));
+ return -EDESTADDRREQ;
+ }
+
+ np->local_ip.ip = ifa->ifa_local;
+ np_info(np, "local IP %pI4\n", &np->local_ip.ip);
+
+ return 0;
+}
+
int netpoll_setup(struct netpoll *np)
{
+ struct net *net = current->nsproxy->net_ns;
+ char buf[MAC_ADDR_STR_LEN + 1];
struct net_device *ndev = NULL;
- struct in_device *in_dev;
+ bool ip_overwritten = false;
int err;
rtnl_lock();
- if (np->dev_name[0]) {
- struct net *net = current->nsproxy->net_ns;
+ if (np->dev_name[0])
ndev = __dev_get_by_name(net, np->dev_name);
- }
+ else if (is_valid_ether_addr(np->dev_mac))
+ ndev = dev_getbyhwaddr(net, ARPHRD_ETHER, np->dev_mac);
+
if (!ndev) {
- np_err(np, "%s doesn't exist, aborting\n", np->dev_name);
+ np_err(np, "%s doesn't exist, aborting\n", egress_dev(np, buf));
err = -ENODEV;
goto unlock;
}
- dev_hold(ndev);
+ netdev_hold(ndev, &np->dev_tracker, GFP_KERNEL);
if (netdev_master_upper_dev_get(ndev)) {
- np_err(np, "%s is a slave device, aborting\n", np->dev_name);
+ np_err(np, "%s is a slave device, aborting\n",
+ egress_dev(np, buf));
err = -EBUSY;
goto put;
}
if (!netif_running(ndev)) {
- unsigned long atmost, atleast;
-
- np_info(np, "device %s not up yet, forcing it\n", np->dev_name);
-
- err = dev_open(ndev);
+ np_info(np, "device %s not up yet, forcing it\n",
+ egress_dev(np, buf));
+ err = dev_open(ndev, NULL);
if (err) {
np_err(np, "failed to open %s\n", ndev->name);
goto put;
}
rtnl_unlock();
- atleast = jiffies + HZ/10;
- atmost = jiffies + carrier_timeout * HZ;
- while (!netif_carrier_ok(ndev)) {
- if (time_after(jiffies, atmost)) {
- np_notice(np, "timeout waiting for carrier\n");
- break;
- }
- msleep(1);
- }
-
- /* If carrier appears to come up instantly, we don't
- * trust it and pause so that we don't pump all our
- * queued console messages into the bitbucket.
- */
-
- if (time_before(jiffies, atleast)) {
- np_notice(np, "carrier detect appears untrustworthy, waiting 4 seconds\n");
- msleep(4000);
- }
+ netpoll_wait_carrier(np, ndev, carrier_timeout);
rtnl_lock();
}
if (!np->local_ip.ip) {
if (!np->ipv6) {
- in_dev = __in_dev_get_rtnl(ndev);
-
- if (!in_dev || !in_dev->ifa_list) {
- np_err(np, "no IP address for %s, aborting\n",
- np->dev_name);
- err = -EDESTADDRREQ;
+ err = netpoll_take_ipv4(np, ndev);
+ if (err)
goto put;
- }
-
- np->local_ip.ip = in_dev->ifa_list->ifa_local;
- np_info(np, "local IP %pI4\n", &np->local_ip.ip);
} else {
-#if IS_ENABLED(CONFIG_IPV6)
- struct inet6_dev *idev;
-
- err = -EDESTADDRREQ;
- idev = __in6_dev_get(ndev);
- if (idev) {
- struct inet6_ifaddr *ifp;
-
- read_lock_bh(&idev->lock);
- list_for_each_entry(ifp, &idev->addr_list, if_list) {
- if (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)
- continue;
- np->local_ip.in6 = ifp->addr;
- err = 0;
- break;
- }
- read_unlock_bh(&idev->lock);
- }
- if (err) {
- np_err(np, "no IPv6 address for %s, aborting\n",
- np->dev_name);
+ err = netpoll_take_ipv6(np, ndev);
+ if (err)
goto put;
- } else
- np_info(np, "local IPv6 %pI6c\n", &np->local_ip.in6);
-#else
- np_err(np, "IPv6 is not supported %s, aborting\n",
- np->dev_name);
- err = -EINVAL;
- goto put;
-#endif
}
+ ip_overwritten = true;
}
- /* fill up the skb queue */
- refill_skbs();
-
err = __netpoll_setup(np, ndev);
if (err)
- goto put;
-
+ goto flush;
rtnl_unlock();
+
+ /* Make sure all NAPI polls which started before dev->npinfo
+ * was visible have exited before we start calling NAPI poll.
+ * NAPI skips locking if dev->npinfo is NULL.
+ */
+ synchronize_rcu();
+
return 0;
+flush:
+ skb_pool_flush(np);
put:
- dev_put(ndev);
+ DEBUG_NET_WARN_ON_ONCE(np->dev);
+ if (ip_overwritten)
+ memset(&np->local_ip, 0, sizeof(np->local_ip));
+ netdev_put(ndev, &np->dev_tracker);
unlock:
rtnl_unlock();
return err;
}
EXPORT_SYMBOL(netpoll_setup);
-static int __init netpoll_init(void)
-{
- skb_queue_head_init(&skb_pool);
- return 0;
-}
-core_initcall(netpoll_init);
-
static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
{
struct netpoll_info *npinfo =
@@ -786,20 +803,18 @@ static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
kfree(npinfo);
}
-void __netpoll_cleanup(struct netpoll *np)
+static void __netpoll_cleanup(struct netpoll *np)
{
struct netpoll_info *npinfo;
- /* rtnl_dereference would be preferable here but
- * rcu_cleanup_netpoll path can put us in here safely without
- * holding the rtnl, so plain rcu_dereference it is
- */
npinfo = rtnl_dereference(np->dev->npinfo);
if (!npinfo)
return;
- synchronize_srcu(&netpoll_srcu);
-
+ /* At this point, there is a single npinfo instance per netdevice, and
+ * its refcnt tracks how many netpoll structures are linked to it. We
+ * only perform npinfo cleanup when the refcnt decrements to zero.
+ */
if (refcount_dec_and_test(&npinfo->refcnt)) {
const struct net_device_ops *ops;
@@ -808,36 +823,37 @@ void __netpoll_cleanup(struct netpoll *np)
ops->ndo_netpoll_cleanup(np->dev);
RCU_INIT_POINTER(np->dev->npinfo, NULL);
- call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);
- } else
- RCU_INIT_POINTER(np->dev->npinfo, NULL);
+ call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info);
+ }
+
+ skb_pool_flush(np);
}
-EXPORT_SYMBOL_GPL(__netpoll_cleanup);
-static void netpoll_async_cleanup(struct work_struct *work)
+void __netpoll_free(struct netpoll *np)
{
- struct netpoll *np = container_of(work, struct netpoll, cleanup_work);
+ ASSERT_RTNL();
- rtnl_lock();
+ /* Wait for transmitting packets to finish before freeing. */
+ synchronize_net();
__netpoll_cleanup(np);
- rtnl_unlock();
kfree(np);
}
+EXPORT_SYMBOL_GPL(__netpoll_free);
-void __netpoll_free_async(struct netpoll *np)
+void do_netpoll_cleanup(struct netpoll *np)
{
- schedule_work(&np->cleanup_work);
+ __netpoll_cleanup(np);
+ netdev_put(np->dev, &np->dev_tracker);
+ np->dev = NULL;
}
-EXPORT_SYMBOL_GPL(__netpoll_free_async);
+EXPORT_SYMBOL(do_netpoll_cleanup);
void netpoll_cleanup(struct netpoll *np)
{
rtnl_lock();
if (!np->dev)
goto out;
- __netpoll_cleanup(np);
- dev_put(np->dev);
- np->dev = NULL;
+ do_netpoll_cleanup(np);
out:
rtnl_unlock();
}
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index b9057478d69c..8456dfbe2eb4 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/core/netprio_cgroup.c Priority Control Group
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Neil Horman <nhorman@tuxdriver.com>
*/
@@ -97,7 +93,7 @@ static int extend_netdev_table(struct net_device *dev, u32 target_idx)
static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
{
struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
- int id = css->cgroup->id;
+ int id = css->id;
if (map && id < map->priomap_len)
return map->priomap[id];
@@ -117,7 +113,7 @@ static int netprio_set_prio(struct cgroup_subsys_state *css,
struct net_device *dev, u32 prio)
{
struct netprio_map *map;
- int id = css->cgroup->id;
+ int id = css->id;
int ret;
/* avoid extending priomap for zero writes */
@@ -181,7 +177,7 @@ static void cgrp_css_free(struct cgroup_subsys_state *css)
static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return css->cgroup->id;
+ return css->id;
}
static int read_priomap(struct seq_file *sf, void *v)
@@ -211,8 +207,6 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
if (!dev)
return -ENODEV;
- cgroup_sk_alloc_disable();
-
rtnl_lock();
ret = netprio_set_prio(of_css(of), dev, prio);
@@ -224,14 +218,11 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
static int update_netprio(const void *v, struct file *file, unsigned n)
{
- int err;
- struct socket *sock = sock_from_file(file, &err);
- if (sock) {
- spin_lock(&cgroup_sk_update_lock);
+ struct socket *sock = sock_from_file(file);
+
+ if (sock)
sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
(unsigned long)v);
- spin_unlock(&cgroup_sk_update_lock);
- }
return 0;
}
@@ -241,7 +232,7 @@ static void net_prio_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
cgroup_taskset_for_each(p, css, tset) {
- void *v = (void *)(unsigned long)css->cgroup->id;
+ void *v = (void *)(unsigned long)css->id;
task_lock(p);
iterate_fd(p->files, 0, update_netprio, v);
@@ -301,6 +292,4 @@ static int __init init_cgroup_netprio(void)
register_netdevice_notifier(&netprio_device_notifier);
return 0;
}
-
subsys_initcall(init_cgroup_netprio);
-MODULE_LICENSE("GPL v2");
diff --git a/net/core/of_net.c b/net/core/of_net.c
new file mode 100644
index 000000000000..93ea425b9248
--- /dev/null
+++ b/net/core/of_net.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * OF helpers for network devices.
+ *
+ * Initially copied out of arch/powerpc/kernel/prom_parse.c
+ */
+#include <linux/etherdevice.h>
+#include <linux/kernel.h>
+#include <linux/of_net.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/phy.h>
+#include <linux/export.h>
+#include <linux/device.h>
+#include <linux/nvmem-consumer.h>
+
+/**
+ * of_get_phy_mode - Get phy mode for given device_node
+ * @np: Pointer to the given device_node
+ * @interface: Pointer to the result
+ *
+ * The function gets phy interface string from property 'phy-mode' or
+ * 'phy-connection-type'. The index in phy_modes table is set in
+ * interface and 0 returned. In case of error interface is set to
+ * PHY_INTERFACE_MODE_NA and an errno is returned, e.g. -ENODEV.
+ */
+int of_get_phy_mode(struct device_node *np, phy_interface_t *interface)
+{
+ const char *pm;
+ int err, i;
+
+ *interface = PHY_INTERFACE_MODE_NA;
+
+ err = of_property_read_string(np, "phy-mode", &pm);
+ if (err < 0)
+ err = of_property_read_string(np, "phy-connection-type", &pm);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i < PHY_INTERFACE_MODE_MAX; i++)
+ if (!strcasecmp(pm, phy_modes(i))) {
+ *interface = i;
+ return 0;
+ }
+
+ return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(of_get_phy_mode);
+
+static int of_get_mac_addr(struct device_node *np, const char *name, u8 *addr)
+{
+ struct property *pp = of_find_property(np, name, NULL);
+
+ if (pp && pp->length == ETH_ALEN && is_valid_ether_addr(pp->value)) {
+ memcpy(addr, pp->value, ETH_ALEN);
+ return 0;
+ }
+ return -ENODEV;
+}
+
+int of_get_mac_address_nvmem(struct device_node *np, u8 *addr)
+{
+ struct platform_device *pdev = of_find_device_by_node(np);
+ struct nvmem_cell *cell;
+ const void *mac;
+ size_t len;
+ int ret;
+
+ /* Try lookup by device first, there might be a nvmem_cell_lookup
+ * associated with a given device.
+ */
+ if (pdev) {
+ ret = nvmem_get_mac_address(&pdev->dev, addr);
+ put_device(&pdev->dev);
+ return ret;
+ }
+
+ cell = of_nvmem_cell_get(np, "mac-address");
+ if (IS_ERR(cell))
+ return PTR_ERR(cell);
+
+ mac = nvmem_cell_read(cell, &len);
+ nvmem_cell_put(cell);
+
+ if (IS_ERR(mac))
+ return PTR_ERR(mac);
+
+ if (len != ETH_ALEN || !is_valid_ether_addr(mac)) {
+ kfree(mac);
+ return -EINVAL;
+ }
+
+ memcpy(addr, mac, ETH_ALEN);
+ kfree(mac);
+
+ return 0;
+}
+EXPORT_SYMBOL(of_get_mac_address_nvmem);
+
+/**
+ * of_get_mac_address()
+ * @np: Caller's Device Node
+ * @addr: Pointer to a six-byte array for the result
+ *
+ * Search the device tree for the best MAC address to use. 'mac-address' is
+ * checked first, because that is supposed to contain the "most recent" MAC
+ * address. If that isn't set, then 'local-mac-address' is checked next,
+ * because that is the default address. If that isn't set, then the obsolete
+ * 'address' is checked, just in case we're using an old device tree. If none
+ * of the above is set, then try to get the MAC address from nvmem cell named
+ * 'mac-address'.
+ *
+ * Note that the 'address' property is supposed to contain a virtual address of
+ * the register set, but some DTS files have redefined that property to be the
+ * MAC address.
+ *
+ * All-zero MAC addresses are rejected, because those could be properties that
+ * exist in the device tree, but were not set by U-Boot. For example, the
+ * DTS could define 'mac-address' and 'local-mac-address', with zero MAC
+ * addresses. Some older U-Boots only initialized 'local-mac-address'. In
+ * this case, the real MAC is in 'local-mac-address', and 'mac-address' exists
+ * but is all zeros.
+ *
+ * Return: 0 on success and errno in case of error.
+*/
+int of_get_mac_address(struct device_node *np, u8 *addr)
+{
+ int ret;
+
+ if (!np)
+ return -ENODEV;
+
+ ret = of_get_mac_addr(np, "mac-address", addr);
+ if (!ret)
+ return 0;
+
+ ret = of_get_mac_addr(np, "local-mac-address", addr);
+ if (!ret)
+ return 0;
+
+ ret = of_get_mac_addr(np, "address", addr);
+ if (!ret)
+ return 0;
+
+ return of_get_mac_address_nvmem(np, addr);
+}
+EXPORT_SYMBOL(of_get_mac_address);
+
+/**
+ * of_get_ethdev_address()
+ * @np: Caller's Device Node
+ * @dev: Pointer to netdevice which address will be updated
+ *
+ * Search the device tree for the best MAC address to use.
+ * If found set @dev->dev_addr to that address.
+ *
+ * See documentation of of_get_mac_address() for more information on how
+ * the best address is determined.
+ *
+ * Return: 0 on success and errno in case of error.
+ */
+int of_get_ethdev_address(struct device_node *np, struct net_device *dev)
+{
+ u8 addr[ETH_ALEN];
+ int ret;
+
+ ret = of_get_mac_address(np, addr);
+ if (!ret)
+ eth_hw_addr_set(dev, addr);
+ return ret;
+}
+EXPORT_SYMBOL(of_get_ethdev_address);
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 43a932cb609b..265a729431bb 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -4,208 +4,791 @@
* Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
* Copyright (C) 2016 Red Hat, Inc.
*/
+
+#include <linux/error-injection.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
+#include <linux/device.h>
+
+#include <net/netdev_lock.h>
+#include <net/netdev_rx_queue.h>
+#include <net/page_pool/helpers.h>
+#include <net/page_pool/memory_provider.h>
+#include <net/xdp.h>
-#include <net/page_pool.h>
#include <linux/dma-direction.h>
#include <linux/dma-mapping.h>
#include <linux/page-flags.h>
-#include <linux/mm.h> /* for __put_page() */
+#include <linux/mm.h> /* for put_page() */
+#include <linux/poison.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+
+#include <trace/events/page_pool.h>
+
+#include "dev.h"
+#include "mp_dmabuf_devmem.h"
+#include "netmem_priv.h"
+#include "page_pool_priv.h"
+
+DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers);
+
+#define DEFER_TIME (msecs_to_jiffies(1000))
+#define DEFER_WARN_INTERVAL (60 * HZ)
+
+#define BIAS_MAX (LONG_MAX >> 1)
+
+#ifdef CONFIG_PAGE_POOL_STATS
+static DEFINE_PER_CPU(struct page_pool_recycle_stats, pp_system_recycle_stats);
+
+/* alloc_stat_inc is intended to be used in softirq context.
+ * It is a plain post-increment of a counter embedded in the pool itself.
+ */
+#define alloc_stat_inc(pool, __stat) (pool->alloc_stats.__stat++)
+/* recycle_stat_inc is safe to use when preemption is possible.
+ * It bumps the counter through this_cpu_inc() on pool->recycle_stats.
+ */
+#define recycle_stat_inc(pool, __stat) \
+ do { \
+ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
+ this_cpu_inc(s->__stat); \
+ } while (0)
+
+/* recycle_stat_add: same per-CPU counter update as recycle_stat_inc, but
+ * adds @val (via this_cpu_add()) instead of incrementing by one.
+ */
+#define recycle_stat_add(pool, __stat, val) \
+ do { \
+ struct page_pool_recycle_stats __percpu *s = pool->recycle_stats; \
+ this_cpu_add(s->__stat, val); \
+ } while (0)
+
+/* ethtool string names of the page pool counters, one ETH_GSTRING_LEN-sized
+ * entry per counter. The order of the entries matches the order in which
+ * page_pool_ethtool_stats_get() writes the values (alloc counters first,
+ * then recycle counters); keep both in sync when adding a counter.
+ */
+static const char pp_stats[][ETH_GSTRING_LEN] = {
+ "rx_pp_alloc_fast",
+ "rx_pp_alloc_slow",
+ "rx_pp_alloc_slow_ho",
+ "rx_pp_alloc_empty",
+ "rx_pp_alloc_refill",
+ "rx_pp_alloc_waive",
+ "rx_pp_recycle_cached",
+ "rx_pp_recycle_cache_full",
+ "rx_pp_recycle_ring",
+ "rx_pp_recycle_ring_full",
+ "rx_pp_recycle_released_ref",
+};
+
+/**
+ * page_pool_get_stats() - fetch page pool stats
+ * @pool: pool from which page was allocated
+ * @stats: struct page_pool_stats to fill in
+ *
+ * Accumulate the statistics of @pool into the caller allocated @stats.
+ * Every counter is added to the value already present in @stats, so the
+ * caller is responsible for initializing the structure beforehand. This
+ * API is only available if the kernel has been configured with
+ * ``CONFIG_PAGE_POOL_STATS=y``. The caller can then report those stats to
+ * the user (perhaps via ethtool, debugfs, etc.).
+ *
+ * Return: false if @stats is NULL, true otherwise.
+ */
+bool page_pool_get_stats(const struct page_pool *pool,
+			 struct page_pool_stats *stats)
+{
+	int cpu;
+
+	if (!stats)
+		return false;
+
+	/* Allocation counters live directly in the pool. */
+	stats->alloc_stats.fast += pool->alloc_stats.fast;
+	stats->alloc_stats.slow += pool->alloc_stats.slow;
+	stats->alloc_stats.slow_high_order += pool->alloc_stats.slow_high_order;
+	stats->alloc_stats.empty += pool->alloc_stats.empty;
+	stats->alloc_stats.refill += pool->alloc_stats.refill;
+	stats->alloc_stats.waive += pool->alloc_stats.waive;
+
+	/* Recycle counters are per-CPU: fold in every possible CPU. */
+	for_each_possible_cpu(cpu) {
+		const struct page_pool_recycle_stats *rs;
+
+		rs = per_cpu_ptr(pool->recycle_stats, cpu);
+
+		stats->recycle_stats.cached += rs->cached;
+		stats->recycle_stats.cache_full += rs->cache_full;
+		stats->recycle_stats.ring += rs->ring;
+		stats->recycle_stats.ring_full += rs->ring_full;
+		stats->recycle_stats.released_refcnt += rs->released_refcnt;
+	}
+
+	return true;
+}
+EXPORT_SYMBOL(page_pool_get_stats);
+
+/* Copy every page pool stat name into @data as consecutive
+ * ETH_GSTRING_LEN-sized entries. Returns the position just past the last
+ * entry written, so the caller can keep appending its own strings.
+ */
+u8 *page_pool_ethtool_stats_get_strings(u8 *data)
+{
+	int idx;
+
+	for (idx = 0; idx < ARRAY_SIZE(pp_stats); idx++, data += ETH_GSTRING_LEN)
+		memcpy(data, pp_stats[idx], ETH_GSTRING_LEN);
+
+	return data;
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get_strings);
+
+/* Number of entries in pp_stats[], i.e. how many stat names
+ * page_pool_ethtool_stats_get_strings() emits.
+ */
+int page_pool_ethtool_stats_get_count(void)
+{
+ return ARRAY_SIZE(pp_stats);
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get_count);
+
+/* Write the counters of @stats (treated as a struct page_pool_stats) into
+ * @data, in the same order as the names in pp_stats[]. Returns the position
+ * just past the last value written.
+ */
+u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats)
+{
+	const struct page_pool_stats *pool_stats = stats;
+	unsigned int n = 0;
+
+	/* allocation side */
+	data[n++] = pool_stats->alloc_stats.fast;
+	data[n++] = pool_stats->alloc_stats.slow;
+	data[n++] = pool_stats->alloc_stats.slow_high_order;
+	data[n++] = pool_stats->alloc_stats.empty;
+	data[n++] = pool_stats->alloc_stats.refill;
+	data[n++] = pool_stats->alloc_stats.waive;
+
+	/* recycle side */
+	data[n++] = pool_stats->recycle_stats.cached;
+	data[n++] = pool_stats->recycle_stats.cache_full;
+	data[n++] = pool_stats->recycle_stats.ring;
+	data[n++] = pool_stats->recycle_stats.ring_full;
+	data[n++] = pool_stats->recycle_stats.released_refcnt;
+
+	return data + n;
+}
+EXPORT_SYMBOL(page_pool_ethtool_stats_get);
+
+#else
+#define alloc_stat_inc(...) do { } while (0)
+#define recycle_stat_inc(...) do { } while (0)
+#define recycle_stat_add(...) do { } while (0)
+#endif
+
+/* Take the ring producer lock. The _bh variant is used unless in_softirq()
+ * already reports true. The sampled in_softirq() value is returned and must
+ * be handed back to page_pool_producer_unlock() so that the matching unlock
+ * variant is used.
+ */
+static bool page_pool_producer_lock(struct page_pool *pool)
+	__acquires(&pool->ring.producer_lock)
+{
+	bool softirq_ctx = in_softirq();
+
+	if (!softirq_ctx)
+		spin_lock_bh(&pool->ring.producer_lock);
+	else
+		spin_lock(&pool->ring.producer_lock);
+
+	return softirq_ctx;
+}
+
+/* Drop the ring producer lock. @in_softirq is the value returned by the
+ * paired page_pool_producer_lock() call and selects the plain or the _bh
+ * unlock variant accordingly.
+ */
+static void page_pool_producer_unlock(struct page_pool *pool,
+				      bool in_softirq)
+	__releases(&pool->ring.producer_lock)
+{
+	if (!in_softirq) {
+		spin_unlock_bh(&pool->ring.producer_lock);
+		return;
+	}
+
+	spin_unlock(&pool->ring.producer_lock);
+}
+
+/* Layout assertions on struct page_pool: frag_users, frag_page and
+ * frag_offset are asserted to be members of the 'frag' cacheline group, and
+ * the size of that group is checked against PAGE_POOL_FRAG_GROUP_ALIGN.
+ * NOTE(review): presumably these expand to build-time checks only - confirm
+ * against the CACHELINE_ASSERT_* definitions.
+ */
+static void page_pool_struct_check(void)
+{
+ CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset);
+ CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag,
+ PAGE_POOL_FRAG_GROUP_ALIGN);
+}
static int page_pool_init(struct page_pool *pool,
- const struct page_pool_params *params)
+ const struct page_pool_params *params,
+ int cpuid)
{
unsigned int ring_qsize = 1024; /* Default */
+ struct netdev_rx_queue *rxq;
+ int err;
+
+ page_pool_struct_check();
- memcpy(&pool->p, params, sizeof(pool->p));
+ memcpy(&pool->p, &params->fast, sizeof(pool->p));
+ memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
+
+ pool->cpuid = cpuid;
+ pool->dma_sync_for_cpu = true;
/* Validate only known flags were used */
- if (pool->p.flags & ~(PP_FLAG_ALL))
+ if (pool->slow.flags & ~PP_FLAG_ALL)
return -EINVAL;
if (pool->p.pool_size)
- ring_qsize = pool->p.pool_size;
-
- /* Sanity limit mem that can be pinned down */
- if (ring_qsize > 32768)
- return -E2BIG;
+ ring_qsize = min(pool->p.pool_size, 16384);
/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
* DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
* which is the XDP_TX use-case.
*/
- if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
- (pool->p.dma_dir != DMA_BIDIRECTIONAL))
- return -EINVAL;
+ if (pool->slow.flags & PP_FLAG_DMA_MAP) {
+ if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
+ (pool->p.dma_dir != DMA_BIDIRECTIONAL))
+ return -EINVAL;
+
+ pool->dma_map = true;
+ }
+
+ if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) {
+ /* In order to request DMA-sync-for-device the page
+ * needs to be mapped
+ */
+ if (!(pool->slow.flags & PP_FLAG_DMA_MAP))
+ return -EINVAL;
+
+ if (!pool->p.max_len)
+ return -EINVAL;
+
+ pool->dma_sync = true;
+
+ /* pool->p.offset has to be set according to the address
+ * offset used by the DMA engine to start copying rx data
+ */
+ }
+
+ pool->has_init_callback = !!pool->slow.init_callback;
- if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
+#ifdef CONFIG_PAGE_POOL_STATS
+ if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) {
+ pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats);
+ if (!pool->recycle_stats)
+ return -ENOMEM;
+ } else {
+ /* For system page pool instance we use a singular stats object
+ * instead of allocating a separate percpu variable for each
+ * (also percpu) page pool instance.
+ */
+ pool->recycle_stats = &pp_system_recycle_stats;
+ pool->system = true;
+ }
+#endif
+
+ if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
+#ifdef CONFIG_PAGE_POOL_STATS
+ if (!pool->system)
+ free_percpu(pool->recycle_stats);
+#endif
return -ENOMEM;
+ }
+
+ atomic_set(&pool->pages_state_release_cnt, 0);
+
+ /* The driver that calls page_pool_create() also calls page_pool_destroy() */
+ refcount_set(&pool->user_cnt, 1);
+
+ xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1);
+
+ if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) {
+ netdev_assert_locked(pool->slow.netdev);
+ rxq = __netif_get_rx_queue(pool->slow.netdev,
+ pool->slow.queue_idx);
+ pool->mp_priv = rxq->mp_params.mp_priv;
+ pool->mp_ops = rxq->mp_params.mp_ops;
+ }
+
+ if (pool->mp_ops) {
+ if (!pool->dma_map || !pool->dma_sync) {
+ err = -EOPNOTSUPP;
+ goto free_ptr_ring;
+ }
+
+ if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) {
+ err = -EFAULT;
+ goto free_ptr_ring;
+ }
+
+ err = pool->mp_ops->init(pool);
+ if (err) {
+ pr_warn("%s() mem-provider init failed %d\n", __func__,
+ err);
+ goto free_ptr_ring;
+ }
+
+ static_branch_inc(&page_pool_mem_providers);
+ } else if (pool->p.order > MAX_PAGE_ORDER) {
+ err = -EINVAL;
+ goto free_ptr_ring;
+ }
return 0;
+
+free_ptr_ring:
+ ptr_ring_cleanup(&pool->ring, NULL);
+ xa_destroy(&pool->dma_mapped);
+#ifdef CONFIG_PAGE_POOL_STATS
+ if (!pool->system)
+ free_percpu(pool->recycle_stats);
+#endif
+ return err;
}
-struct page_pool *page_pool_create(const struct page_pool_params *params)
+/* Release the resources set up by page_pool_init(): the recycle ptr_ring,
+ * the dma_mapped xarray and, with CONFIG_PAGE_POOL_STATS, the per-CPU
+ * recycle stats. The pool structure itself is not freed here.
+ */
+static void page_pool_uninit(struct page_pool *pool)
+{
+ ptr_ring_cleanup(&pool->ring, NULL);
+ xa_destroy(&pool->dma_mapped);
+
+#ifdef CONFIG_PAGE_POOL_STATS
+ /* System pools point at the shared pp_system_recycle_stats object,
+ * which was not allocated per pool and therefore must not be freed.
+ */
+ if (!pool->system)
+ free_percpu(pool->recycle_stats);
+#endif
+}
+
+/**
+ * page_pool_create_percpu() - create a page pool for a given cpu.
+ * @params: parameters, see struct page_pool_params
+ * @cpuid: cpu identifier
+ */
+struct page_pool *
+page_pool_create_percpu(const struct page_pool_params *params, int cpuid)
{
struct page_pool *pool;
- int err = 0;
+ int err;
pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
if (!pool)
return ERR_PTR(-ENOMEM);
- err = page_pool_init(pool, params);
- if (err < 0) {
- pr_warn("%s() gave up with errno %d\n", __func__, err);
- kfree(pool);
- return ERR_PTR(err);
- }
+ err = page_pool_init(pool, params, cpuid);
+ if (err < 0)
+ goto err_free;
+
+ err = page_pool_list(pool);
+ if (err)
+ goto err_uninit;
+
return pool;
+
+err_uninit:
+ page_pool_uninit(pool);
+err_free:
+ pr_warn("%s() gave up with errno %d\n", __func__, err);
+ kfree(pool);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL(page_pool_create_percpu);
+
+/**
+ * page_pool_create() - create a page pool
+ * @params: parameters, see struct page_pool_params
+ */
+struct page_pool *page_pool_create(const struct page_pool_params *params)
+{
+ return page_pool_create_percpu(params, -1);
}
EXPORT_SYMBOL(page_pool_create);
-/* fast path */
-static struct page *__page_pool_get_cached(struct page_pool *pool)
+static void page_pool_return_netmem(struct page_pool *pool, netmem_ref netmem);
+
+static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool)
{
struct ptr_ring *r = &pool->ring;
- struct page *page;
+ netmem_ref netmem;
+ int pref_nid; /* preferred NUMA node */
/* Quicker fallback, avoid locks when ring is empty */
- if (__ptr_ring_empty(r))
- return NULL;
+ if (__ptr_ring_empty(r)) {
+ alloc_stat_inc(pool, empty);
+ return 0;
+ }
- /* Test for safe-context, caller should provide this guarantee */
- if (likely(in_serving_softirq())) {
- if (likely(pool->alloc.count)) {
- /* Fast-path */
- page = pool->alloc.cache[--pool->alloc.count];
- return page;
- }
- /* Slower-path: Alloc array empty, time to refill
- *
- * Open-coded bulk ptr_ring consumer.
- *
- * Discussion: the ring consumer lock is not really
- * needed due to the softirq/NAPI protection, but
- * later need the ability to reclaim pages on the
- * ring. Thus, keeping the locks.
- */
- spin_lock(&r->consumer_lock);
- while ((page = __ptr_ring_consume(r))) {
- if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
- break;
- pool->alloc.cache[pool->alloc.count++] = page;
+ /* Softirq guarantees that the CPU, and thus the NUMA node, is stable.
+ * This assumes the CPU refilling the driver RX-ring will also run RX-NAPI.
+ */
+#ifdef CONFIG_NUMA
+ pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
+#else
+ /* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
+ pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
+#endif
+
+ /* Refill alloc array, but only if NUMA match */
+ do {
+ netmem = (__force netmem_ref)__ptr_ring_consume(r);
+ if (unlikely(!netmem))
+ break;
+
+ if (likely(netmem_is_pref_nid(netmem, pref_nid))) {
+ pool->alloc.cache[pool->alloc.count++] = netmem;
+ } else {
+ /* NUMA mismatch;
+ * (1) release 1 page to page-allocator and
+ * (2) break out to fallthrough to alloc_pages_node.
+ * This limits stress on the page buddy allocator.
+ */
+ page_pool_return_netmem(pool, netmem);
+ alloc_stat_inc(pool, waive);
+ netmem = 0;
+ break;
}
- spin_unlock(&r->consumer_lock);
- return page;
+ } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL);
+
+ /* Return last page */
+ if (likely(pool->alloc.count > 0)) {
+ netmem = pool->alloc.cache[--pool->alloc.count];
+ alloc_stat_inc(pool, refill);
}
- /* Slow-path: Get page from locked ring queue */
- page = ptr_ring_consume(&pool->ring);
- return page;
+ return netmem;
}
-/* slow path */
-noinline
-static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
- gfp_t _gfp)
+/* fast path: pop a netmem from the pool's alloc cache, refilling the cache
+ * from the ptr_ring when it is empty. Returns 0 when nothing is available.
+ */
+static netmem_ref __page_pool_get_cached(struct page_pool *pool)
+{
+	netmem_ref cached;
+
+	/* Caller MUST guarantee safe non-concurrent access, e.g. softirq */
+	if (unlikely(!pool->alloc.count))
+		return page_pool_refill_alloc_cache(pool);
+
+	/* Fast-path */
+	cached = pool->alloc.cache[--pool->alloc.count];
+	alloc_stat_inc(pool, fast);
+
+	return cached;
+}
+
+/* Sync the DMA area of @netmem for the device. The region starts at the
+ * netmem's DMA address plus pool->p.offset and covers at most
+ * pool->p.max_len bytes. Compiles to nothing unless both CONFIG_HAS_DMA
+ * and CONFIG_DMA_NEED_SYNC are set.
+ */
+static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
+					    netmem_ref netmem,
+					    u32 dma_sync_size)
+{
+#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC)
+	dma_addr_t sync_start = page_pool_get_dma_addr_netmem(netmem) +
+				pool->p.offset;
+	u32 sync_len = min(dma_sync_size, pool->p.max_len);
+
+	__dma_sync_single_for_device(pool->p.dev, sync_start, sync_len,
+				     pool->p.dma_dir);
+#endif
+}
+
+/* Sync @netmem for the device, but only when the pool has DMA sync enabled
+ * and dma_dev_need_sync() reports that the device needs it. The first
+ * pool->dma_sync test is a lockless early-out; the flag is tested again
+ * inside the RCU read-side section before the sync is actually issued.
+ */
+static __always_inline void
+page_pool_dma_sync_for_device(const struct page_pool *pool,
+ netmem_ref netmem,
+ u32 dma_sync_size)
+{
+ if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) {
+ rcu_read_lock();
+ /* re-check under rcu_read_lock() to sync with page_pool_scrub() */
+ if (pool->dma_sync)
+ __page_pool_dma_sync_for_device(pool, netmem,
+ dma_sync_size);
+ rcu_read_unlock();
+ }
+}
+
+/* Record the page behind @netmem in pool->dma_mapped and store the
+ * allocated index in the netmem. Does nothing (and succeeds) when
+ * PP_DMA_INDEX_BITS is zero.
+ *
+ * Returns 0 on success or the error returned by xa_alloc()/xa_alloc_bh().
+ */
+static int page_pool_register_dma_index(struct page_pool *pool,
+					netmem_ref netmem, gfp_t gfp)
+{
+	struct page *page;
+	u32 index;
+	int ret;
+
+	if (unlikely(!PP_DMA_INDEX_BITS))
+		return 0;
+
+	page = netmem_to_page(netmem);
+
+	/* The _bh variant is only needed when in_softirq() is false */
+	if (in_softirq())
+		ret = xa_alloc(&pool->dma_mapped, &index, page,
+			       PP_DMA_INDEX_LIMIT, gfp);
+	else
+		ret = xa_alloc_bh(&pool->dma_mapped, &index, page,
+				  PP_DMA_INDEX_LIMIT, gfp);
+
+	if (ret) {
+		/* Only -ENOMEM is an expected failure here */
+		WARN_ONCE(ret != -ENOMEM, "couldn't track DMA mapping, please report to netdev@");
+		return ret;
+	}
+
+	netmem_set_dma_index(netmem, index);
+	return 0;
+}
+
+/* Remove the page behind @netmem from pool->dma_mapped and clear the index
+ * stored in the netmem.
+ *
+ * Returns 0 on success, or when PP_DMA_INDEX_BITS is zero (nothing to do).
+ * Returns -1 when the netmem carries no index, or when the xarray slot at
+ * that index did not hold this page.
+ */
+static int page_pool_release_dma_index(struct page_pool *pool,
+ netmem_ref netmem)
+{
+ struct page *old, *page = netmem_to_page(netmem);
+ unsigned long id;
+
+ if (unlikely(!PP_DMA_INDEX_BITS))
+ return 0;
+
+ id = netmem_get_dma_index(netmem);
+ if (!id)
+ return -1;
+
+ /* Only clear the slot if it still holds this very page */
+ if (in_softirq())
+ old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0);
+ else
+ old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0);
+ if (old != page)
+ return -1;
+
+ netmem_set_dma_index(netmem, 0);
+
+ return 0;
+}
+
+static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp)
{
- struct page *page;
- gfp_t gfp = _gfp;
dma_addr_t dma;
+ int err;
- /* We could always set __GFP_COMP, and avoid this branch, as
- * prep_new_page() can handle order-0 with __GFP_COMP.
+ /* Setup DMA mapping: use 'struct page' area for storing DMA-addr
+ * since dma_addr_t can be either 32 or 64 bits and does not always fit
+ * into page private data (i.e 32bit cpu with 64bit DMA caps)
+ * This mapping is kept for lifetime of page, until leaving pool.
*/
- if (pool->p.order)
- gfp |= __GFP_COMP;
+ dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0,
+ (PAGE_SIZE << pool->p.order), pool->p.dma_dir,
+ DMA_ATTR_SKIP_CPU_SYNC |
+ DMA_ATTR_WEAK_ORDERING);
+ if (dma_mapping_error(pool->p.dev, dma))
+ return false;
- /* FUTURE development:
- *
- * Current slow-path essentially falls back to single page
- * allocations, which doesn't improve performance. This code
- * need bulk allocation support from the page allocator code.
- */
+ if (page_pool_set_dma_addr_netmem(netmem, dma)) {
+ WARN_ONCE(1, "unexpected DMA address, please report to netdev@");
+ goto unmap_failed;
+ }
+
+ err = page_pool_register_dma_index(pool, netmem, gfp);
+ if (err)
+ goto unset_failed;
+
+ page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len);
+
+ return true;
- /* Cache was empty, do real allocation */
+unset_failed:
+ page_pool_set_dma_addr_netmem(netmem, 0);
+unmap_failed:
+ dma_unmap_page_attrs(pool->p.dev, dma,
+ PAGE_SIZE << pool->p.order, pool->p.dma_dir,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
+ return false;
+}
+
+static struct page *__page_pool_alloc_page_order(struct page_pool *pool,
+ gfp_t gfp)
+{
+ struct page *page;
+
+ gfp |= __GFP_COMP;
page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
- if (!page)
+ if (unlikely(!page))
return NULL;
- if (!(pool->p.flags & PP_FLAG_DMA_MAP))
- goto skip_dma_map;
-
- /* Setup DMA mapping: use page->private for DMA-addr
- * This mapping is kept for lifetime of page, until leaving pool.
- */
- dma = dma_map_page(pool->p.dev, page, 0,
- (PAGE_SIZE << pool->p.order),
- pool->p.dma_dir);
- if (dma_mapping_error(pool->p.dev, dma)) {
+ if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) {
put_page(page);
return NULL;
}
- set_page_private(page, dma); /* page->private = dma; */
-skip_dma_map:
- /* When page just alloc'ed is should/must have refcnt 1. */
+ alloc_stat_inc(pool, slow_high_order);
+ page_pool_set_pp_info(pool, page_to_netmem(page));
+
+ /* Track how many pages are held 'in-flight' */
+ pool->pages_state_hold_cnt++;
+ trace_page_pool_state_hold(pool, page_to_netmem(page),
+ pool->pages_state_hold_cnt);
return page;
}
+/* slow path: allocate fresh pages from the page allocator.
+ *
+ * For order-0 pools up to PP_ALLOC_CACHE_REFILL pages are bulk allocated
+ * straight into pool->alloc.cache, optionally DMA mapped, and the last one
+ * is handed to the caller while the rest stay cached. Pools with a non-zero
+ * order allocate a single page via __page_pool_alloc_page_order().
+ *
+ * Returns the allocated netmem, or 0 if nothing could be allocated.
+ */
+static noinline netmem_ref __page_pool_alloc_netmems_slow(struct page_pool *pool,
+ gfp_t gfp)
+{
+ const int bulk = PP_ALLOC_CACHE_REFILL;
+ unsigned int pp_order = pool->p.order;
+ bool dma_map = pool->dma_map;
+ netmem_ref netmem;
+ int i, nr_pages;
+
+ /* Unconditionally set NOWARN if allocating from NAPI.
+ * Drivers forget to set it, and OOM reports on packet Rx are useless.
+ */
+ if ((gfp & GFP_ATOMIC) == GFP_ATOMIC)
+ gfp |= __GFP_NOWARN;
+
+ /* Don't support bulk alloc for high-order pages */
+ if (unlikely(pp_order))
+ return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));
+
+ /* Unnecessary as alloc cache is empty, but guarantees zero count */
+ if (unlikely(pool->alloc.count > 0))
+ return pool->alloc.cache[--pool->alloc.count];
+
+ /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */
+ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
+
+ nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk,
+ (struct page **)pool->alloc.cache);
+ if (unlikely(!nr_pages))
+ return 0;
+
+ /* Pages have been filled into alloc.cache array, but count is zero and
+ * page element have not been (possibly) DMA mapped.
+ *
+ * The loop compacts the array in place: each page that is set up
+ * successfully is written back at index alloc.count (which never
+ * exceeds i), so pages dropped after a failed DMA mapping leave no
+ * holes in the cache.
+ */
+ for (i = 0; i < nr_pages; i++) {
+ netmem = pool->alloc.cache[i];
+ if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) {
+ put_page(netmem_to_page(netmem));
+ continue;
+ }
+
+ page_pool_set_pp_info(pool, netmem);
+ pool->alloc.cache[pool->alloc.count++] = netmem;
+ /* Track how many pages are held 'in-flight' */
+ pool->pages_state_hold_cnt++;
+ trace_page_pool_state_hold(pool, netmem,
+ pool->pages_state_hold_cnt);
+ }
+
+ /* Return last page */
+ if (likely(pool->alloc.count > 0)) {
+ netmem = pool->alloc.cache[--pool->alloc.count];
+ alloc_stat_inc(pool, slow);
+ } else {
+ /* Every page of the bulk failed DMA mapping */
+ netmem = 0;
+ }
+
+ /* When page just alloc'ed it should/must have refcnt 1. */
+ return netmem;
+}
+
/* For using page_pool replace: alloc_pages() API calls, but provide
* synchronization guarantee for allocation side.
*/
-struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
+netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp)
{
- struct page *page;
+ netmem_ref netmem;
/* Fast-path: Get a page from cache */
- page = __page_pool_get_cached(pool);
- if (page)
- return page;
+ netmem = __page_pool_get_cached(pool);
+ if (netmem)
+ return netmem;
/* Slow-path: cache empty, do real allocation */
- page = __page_pool_alloc_pages_slow(pool, gfp);
- return page;
+ if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops)
+ netmem = pool->mp_ops->alloc_netmems(pool, gfp);
+ else
+ netmem = __page_pool_alloc_netmems_slow(pool, gfp);
+ return netmem;
+}
+EXPORT_SYMBOL(page_pool_alloc_netmems);
+ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL);
+
+struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
+{
+ return netmem_to_page(page_pool_alloc_netmems(pool, gfp));
}
EXPORT_SYMBOL(page_pool_alloc_pages);
-/* Cleanup page_pool state from page */
-static void __page_pool_clean_page(struct page_pool *pool,
- struct page *page)
+/* Calculate distance between two u32 values, valid if distance is below 2^(31)
+ * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
+ */
+#define _distance(a, b) (s32)((a) - (b))
+
+/* Number of pages currently held by users of @pool, computed as the serial
+ * number distance between the hold and the release counters.
+ *
+ * With @strict set, the tracepoint is emitted and a negative result triggers
+ * a WARN but is returned unchanged. Without @strict, a negative result is
+ * clamped to 0 and nothing is reported.
+ */
+s32 page_pool_inflight(const struct page_pool *pool, bool strict)
+{
+	/* The release counter is sampled before the hold counter */
+	u32 released = atomic_read(&pool->pages_state_release_cnt);
+	u32 held = READ_ONCE(pool->pages_state_hold_cnt);
+	s32 inflight = _distance(held, released);
+
+	if (!strict)
+		return max(0, inflight);
+
+	trace_page_pool_release(pool, inflight, held, released);
+	WARN(inflight < 0, "Negative(%d) inflight packet-pages",
+	     inflight);
+
+	return inflight;
+}
+
+void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem)
{
- if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+ netmem_set_pp(netmem, pool);
+ netmem_or_pp_magic(netmem, PP_SIGNATURE);
+
+ /* Ensuring all pages have been split into one fragment initially:
+ * page_pool_set_pp_info() is only called once for every page when it
+ * is allocated from the page allocator and page_pool_fragment_page()
+ * is dirtying the same cache line as the page->pp_magic above, so
+ * the overhead is negligible.
+ */
+ page_pool_fragment_netmem(netmem, 1);
+ if (pool->has_init_callback)
+ pool->slow.init_callback(netmem, pool->slow.init_arg);
+}
+
+/* Detach @netmem from its page pool: clear the pp_magic signature and reset
+ * the pool back-pointer to NULL, i.e. the two fields that
+ * page_pool_set_pp_info() sets via netmem_or_pp_magic() and netmem_set_pp().
+ */
+void page_pool_clear_pp_info(netmem_ref netmem)
+{
+ netmem_clear_pp_magic(netmem);
+ netmem_set_pp(netmem, NULL);
+}
+
+static __always_inline void __page_pool_release_netmem_dma(struct page_pool *pool,
+ netmem_ref netmem)
+{
+ dma_addr_t dma;
+
+ if (!pool->dma_map)
+ /* Always account for inflight pages, even if we didn't
+ * map them
+ */
+ return;
+
+ if (page_pool_release_dma_index(pool, netmem))
return;
- /* DMA unmap */
- dma_unmap_page(pool->p.dev, page_private(page),
- PAGE_SIZE << pool->p.order, pool->p.dma_dir);
- set_page_private(page, 0);
+ dma = page_pool_get_dma_addr_netmem(netmem);
+
+ /* When page is unmapped, it cannot be returned to our pool */
+ dma_unmap_page_attrs(pool->p.dev, dma,
+ PAGE_SIZE << pool->p.order, pool->p.dma_dir,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING);
+ page_pool_set_dma_addr_netmem(netmem, 0);
}
-/* Return a page to the page allocator, cleaning up our state */
-static void __page_pool_return_page(struct page_pool *pool, struct page *page)
+/* Disconnects a page (from a page_pool). API users can have a need
+ * to disconnect a page (from a page_pool), to allow it to be used as
+ * a regular page (that will eventually be returned to the normal
+ * page-allocator via put_page).
+ */
+static void page_pool_return_netmem(struct page_pool *pool, netmem_ref netmem)
{
- __page_pool_clean_page(pool, page);
- put_page(page);
+ int count;
+ bool put;
+
+ put = true;
+ if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops)
+ put = pool->mp_ops->release_netmem(pool, netmem);
+ else
+ __page_pool_release_netmem_dma(pool, netmem);
+
+ /* This may be the last page returned, releasing the pool, so
+ * it is not safe to reference pool afterwards.
+ */
+ count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
+ trace_page_pool_state_release(pool, netmem, count);
+
+ if (put) {
+ page_pool_clear_pp_info(netmem);
+ put_page(netmem_to_page(netmem));
+ }
/* An optimization would be to call __free_pages(page, pool->p.order)
* knowing page is not part of page-cache (thus avoiding a
* __page_cache_release() call).
*/
}
-static bool __page_pool_recycle_into_ring(struct page_pool *pool,
- struct page *page)
+static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem)
{
- int ret;
- /* BH protection not needed if current is serving softirq */
- if (in_serving_softirq())
- ret = ptr_ring_produce(&pool->ring, page);
- else
- ret = ptr_ring_produce_bh(&pool->ring, page);
+ bool in_softirq, ret;
+
+ /* BH protection not needed if current is softirq */
+ in_softirq = page_pool_producer_lock(pool);
+ ret = !__ptr_ring_produce(&pool->ring, (__force void *)netmem);
+ if (ret)
+ recycle_stat_inc(pool, ring);
+ page_pool_producer_unlock(pool, in_softirq);
- return (ret == 0) ? true : false;
+ return ret;
}
/* Only allow direct recycling in special circumstances, into the
@@ -213,39 +796,60 @@ static bool __page_pool_recycle_into_ring(struct page_pool *pool,
*
* Caller must provide appropriate safe context.
*/
-static bool __page_pool_recycle_direct(struct page *page,
+static bool page_pool_recycle_in_cache(netmem_ref netmem,
struct page_pool *pool)
{
- if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
+ if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) {
+ recycle_stat_inc(pool, cache_full);
return false;
+ }
/* Caller MUST have verified/know (page_ref_count(page) == 1) */
- pool->alloc.cache[pool->alloc.count++] = page;
+ pool->alloc.cache[pool->alloc.count++] = netmem;
+ recycle_stat_inc(pool, cached);
return true;
}
-void __page_pool_put_page(struct page_pool *pool,
- struct page *page, bool allow_direct)
+static bool __page_pool_page_can_be_recycled(netmem_ref netmem)
{
+ return netmem_is_net_iov(netmem) ||
+ (page_ref_count(netmem_to_page(netmem)) == 1 &&
+ !page_is_pfmemalloc(netmem_to_page(netmem)));
+}
+
+/* If the page refcnt == 1, this will try to recycle the page.
+ * If pool->dma_sync is set, we'll try to sync the DMA area for
+ * the configured size min(dma_sync_size, pool->max_len).
+ * If the page refcnt != 1, then the page will be returned to memory
+ * subsystem.
+ */
+static __always_inline netmem_ref
+__page_pool_put_page(struct page_pool *pool, netmem_ref netmem,
+ unsigned int dma_sync_size, bool allow_direct)
+{
+ lockdep_assert_no_hardirq();
+
/* This allocator is optimized for the XDP mode that uses
* one-frame-per-page, but have fallbacks that act like the
* regular page allocator APIs.
*
* refcnt == 1 means page_pool owns page, and can recycle it.
+ *
+ * A page is NOT reusable if it was allocated while the system was
+ * under memory pressure (page_is_pfmemalloc).
*/
- if (likely(page_ref_count(page) == 1)) {
+ if (likely(__page_pool_page_can_be_recycled(netmem))) {
/* Read barrier done in page_ref_count / READ_ONCE */
- if (allow_direct && in_serving_softirq())
- if (__page_pool_recycle_direct(page, pool))
- return;
+ page_pool_dma_sync_for_device(pool, netmem, dma_sync_size);
- if (!__page_pool_recycle_into_ring(pool, page)) {
- /* Cache full, fallback to free pages */
- __page_pool_return_page(pool, page);
- }
- return;
+ if (allow_direct && page_pool_recycle_in_cache(netmem, pool))
+ return 0;
+
+ /* Page found as candidate for recycling */
+ return netmem;
}
+
/* Fallback/non-XDP mode: API user have elevated refcnt.
*
* Many drivers split up the page into fragments, and some
@@ -259,59 +863,492 @@ void __page_pool_put_page(struct page_pool *pool,
* doing refcnt based recycle tricks, meaning another process
* will be invoking put_page.
*/
- __page_pool_clean_page(pool, page);
- put_page(page);
+ recycle_stat_inc(pool, released_refcnt);
+ page_pool_return_netmem(pool, netmem);
+
+ return 0;
}
-EXPORT_SYMBOL(__page_pool_put_page);
-static void __page_pool_empty_ring(struct page_pool *pool)
+static bool page_pool_napi_local(const struct page_pool *pool)
{
- struct page *page;
+ const struct napi_struct *napi;
+ u32 cpuid;
+
+ /* On PREEMPT_RT the softirq can be preempted by the consumer */
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return false;
+
+ if (unlikely(!in_softirq()))
+ return false;
+
+ /* Allow direct recycle if we have reasons to believe that we are
+ * in the same context as the consumer would run, so there's
+ * no possible race.
+ * __page_pool_put_page() makes sure we're not in hardirq context
+ * and interrupts are enabled prior to accessing the cache.
+ */
+ cpuid = smp_processor_id();
+ if (READ_ONCE(pool->cpuid) == cpuid)
+ return true;
+
+ napi = READ_ONCE(pool->p.napi);
+
+ return napi && READ_ONCE(napi->list_owner) == cpuid;
+}
+
+void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem,
+ unsigned int dma_sync_size, bool allow_direct)
+{
+ if (!allow_direct)
+ allow_direct = page_pool_napi_local(pool);
+
+ netmem = __page_pool_put_page(pool, netmem, dma_sync_size,
+ allow_direct);
+ if (netmem && !page_pool_recycle_in_ring(pool, netmem)) {
+ /* Cache full, fallback to free pages */
+ recycle_stat_inc(pool, ring_full);
+ page_pool_return_netmem(pool, netmem);
+ }
+}
+EXPORT_SYMBOL(page_pool_put_unrefed_netmem);
+
+void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
+ unsigned int dma_sync_size, bool allow_direct)
+{
+ page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size,
+ allow_direct);
+}
+EXPORT_SYMBOL(page_pool_put_unrefed_page);
+
+static void page_pool_recycle_ring_bulk(struct page_pool *pool,
+ netmem_ref *bulk,
+ u32 bulk_len)
+{
+ bool in_softirq;
+ u32 i;
+
+ /* Bulk produce into ptr_ring page_pool cache */
+ in_softirq = page_pool_producer_lock(pool);
+
+ for (i = 0; i < bulk_len; i++) {
+ if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) {
+ /* ring full */
+ recycle_stat_inc(pool, ring_full);
+ break;
+ }
+ }
+
+ page_pool_producer_unlock(pool, in_softirq);
+ recycle_stat_add(pool, ring, i);
+
+ /* Hopefully all pages were returned into ptr_ring */
+ if (likely(i == bulk_len))
+ return;
+
+ /*
+ * ptr_ring cache is full, free remaining pages outside producer lock
+ * since put_page() with refcnt == 1 can be an expensive operation.
+ */
+ for (; i < bulk_len; i++)
+ page_pool_return_netmem(pool, bulk[i]);
+}
+
+/**
+ * page_pool_put_netmem_bulk() - release references on multiple netmems
+ * @data: array holding netmem references
+ * @count: number of entries in @data
+ *
+ * Tries to refill a number of netmems into the ptr_ring cache while holding
+ * the ptr_ring producer lock. If the ptr_ring is full,
+ * page_pool_put_netmem_bulk() releases leftover netmems to the memory provider.
+ * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx
+ * completion loop for the XDP_REDIRECT use case.
+ *
+ * Please note the caller must not use the @data array after running
+ * page_pool_put_netmem_bulk(), as this function overwrites it.
+ */
+void page_pool_put_netmem_bulk(netmem_ref *data, u32 count)
+{
+ u32 bulk_len = 0;
+
+ for (u32 i = 0; i < count; i++) {
+ netmem_ref netmem = netmem_compound_head(data[i]);
+
+ if (page_pool_unref_and_test(netmem))
+ data[bulk_len++] = netmem;
+ }
+
+ count = bulk_len;
+ while (count) {
+ netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
+ struct page_pool *pool = NULL;
+ bool allow_direct;
+ u32 foreign = 0;
+
+ bulk_len = 0;
+
+ for (u32 i = 0; i < count; i++) {
+ struct page_pool *netmem_pp;
+ netmem_ref netmem = data[i];
+
+ netmem_pp = netmem_get_pp(netmem);
+ if (unlikely(!pool)) {
+ pool = netmem_pp;
+ allow_direct = page_pool_napi_local(pool);
+ } else if (netmem_pp != pool) {
+ /*
+ * If the netmem belongs to a different
+ * page_pool, save it for another round.
+ */
+ data[foreign++] = netmem;
+ continue;
+ }
+
+ netmem = __page_pool_put_page(pool, netmem, -1,
+ allow_direct);
+ /* Approved for bulk recycling in ptr_ring cache */
+ if (netmem)
+ bulk[bulk_len++] = netmem;
+ }
+
+ if (bulk_len)
+ page_pool_recycle_ring_bulk(pool, bulk, bulk_len);
+
+ count = foreign;
+ }
+}
+EXPORT_SYMBOL(page_pool_put_netmem_bulk);
+
+static netmem_ref page_pool_drain_frag(struct page_pool *pool,
+ netmem_ref netmem)
+{
+ long drain_count = BIAS_MAX - pool->frag_users;
+
+ /* Some user is still using the page frag */
+ if (likely(page_pool_unref_netmem(netmem, drain_count)))
+ return 0;
+
+ if (__page_pool_page_can_be_recycled(netmem)) {
+ page_pool_dma_sync_for_device(pool, netmem, -1);
+ return netmem;
+ }
+
+ page_pool_return_netmem(pool, netmem);
+ return 0;
+}
+
+static void page_pool_free_frag(struct page_pool *pool)
+{
+ long drain_count = BIAS_MAX - pool->frag_users;
+ netmem_ref netmem = pool->frag_page;
+
+ pool->frag_page = 0;
+
+ if (!netmem || page_pool_unref_netmem(netmem, drain_count))
+ return;
+
+ page_pool_return_netmem(pool, netmem);
+}
+
+netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
+ unsigned int *offset, unsigned int size,
+ gfp_t gfp)
+{
+ unsigned int max_size = PAGE_SIZE << pool->p.order;
+ netmem_ref netmem = pool->frag_page;
+
+ if (WARN_ON(size > max_size))
+ return 0;
+
+ size = ALIGN(size, dma_get_cache_alignment());
+ *offset = pool->frag_offset;
+
+ if (netmem && *offset + size > max_size) {
+ netmem = page_pool_drain_frag(pool, netmem);
+ if (netmem) {
+ recycle_stat_inc(pool, cached);
+ alloc_stat_inc(pool, fast);
+ goto frag_reset;
+ }
+ }
+
+ if (!netmem) {
+ netmem = page_pool_alloc_netmems(pool, gfp);
+ if (unlikely(!netmem)) {
+ pool->frag_page = 0;
+ return 0;
+ }
+
+ pool->frag_page = netmem;
+
+frag_reset:
+ pool->frag_users = 1;
+ *offset = 0;
+ pool->frag_offset = size;
+ page_pool_fragment_netmem(netmem, BIAS_MAX);
+ return netmem;
+ }
+
+ pool->frag_users++;
+ pool->frag_offset = *offset + size;
+ return netmem;
+}
+EXPORT_SYMBOL(page_pool_alloc_frag_netmem);
+
+struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
+ unsigned int size, gfp_t gfp)
+{
+ return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size,
+ gfp));
+}
+EXPORT_SYMBOL(page_pool_alloc_frag);
+
+static void page_pool_empty_ring(struct page_pool *pool)
+{
+ netmem_ref netmem;
/* Empty recycle ring */
- while ((page = ptr_ring_consume_bh(&pool->ring))) {
+ while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) {
/* Verify the refcnt invariant of cached pages */
- if (!(page_ref_count(page) == 1))
+ if (!(netmem_ref_count(netmem) == 1))
pr_crit("%s() page_pool refcnt %d violation\n",
- __func__, page_ref_count(page));
+ __func__, netmem_ref_count(netmem));
- __page_pool_return_page(pool, page);
+ page_pool_return_netmem(pool, netmem);
}
}
-static void __page_pool_destroy_rcu(struct rcu_head *rcu)
+static void __page_pool_destroy(struct page_pool *pool)
{
- struct page_pool *pool;
+ if (pool->disconnect)
+ pool->disconnect(pool);
- pool = container_of(rcu, struct page_pool, rcu);
+ page_pool_unlist(pool);
+ page_pool_uninit(pool);
- WARN(pool->alloc.count, "API usage violation");
+ if (pool->mp_ops) {
+ pool->mp_ops->destroy(pool);
+ static_branch_dec(&page_pool_mem_providers);
+ }
- __page_pool_empty_ring(pool);
- ptr_ring_cleanup(&pool->ring, NULL);
kfree(pool);
}
-/* Cleanup and release resources */
-void page_pool_destroy(struct page_pool *pool)
+static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
{
- struct page *page;
+ netmem_ref netmem;
+
+ if (pool->destroy_cnt)
+ return;
/* Empty alloc cache, assume caller made sure this is
* no-longer in use, and page_pool_alloc_pages() cannot be
* call concurrently.
*/
while (pool->alloc.count) {
- page = pool->alloc.cache[--pool->alloc.count];
- __page_pool_return_page(pool, page);
+ netmem = pool->alloc.cache[--pool->alloc.count];
+ page_pool_return_netmem(pool, netmem);
+ }
+}
+
+static void page_pool_scrub(struct page_pool *pool)
+{
+ unsigned long id;
+ void *ptr;
+
+ page_pool_empty_alloc_cache_once(pool);
+ if (!pool->destroy_cnt++ && pool->dma_map) {
+ if (pool->dma_sync) {
+ /* Disable page_pool_dma_sync_for_device() */
+ pool->dma_sync = false;
+
+ /* Make sure all concurrent returns that may see the old
+ * value of dma_sync (and thus perform a sync) have
+ * finished before doing the unmapping below. Skip the
+ * wait if the device doesn't actually need syncing, or
+ * if there are no outstanding mapped pages.
+ */
+ if (dma_dev_need_sync(pool->p.dev) &&
+ !xa_empty(&pool->dma_mapped))
+ synchronize_net();
+ }
+
+ xa_for_each(&pool->dma_mapped, id, ptr)
+ __page_pool_release_netmem_dma(pool, page_to_netmem((struct page *)ptr));
}
/* No more consumers should exist, but producers could still
* be in-flight.
*/
- __page_pool_empty_ring(pool);
+ page_pool_empty_ring(pool);
+}
+
+static int page_pool_release(struct page_pool *pool)
+{
+ bool in_softirq;
+ int inflight;
+
+ page_pool_scrub(pool);
+ inflight = page_pool_inflight(pool, true);
+ /* Acquire producer lock to make sure producers have exited. */
+ in_softirq = page_pool_producer_lock(pool);
+ page_pool_producer_unlock(pool, in_softirq);
+ if (!inflight)
+ __page_pool_destroy(pool);
- /* An xdp_mem_allocator can still ref page_pool pointer */
- call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+ return inflight;
+}
+
+static void page_pool_release_retry(struct work_struct *wq)
+{
+ struct delayed_work *dwq = to_delayed_work(wq);
+ struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
+ void *netdev;
+ int inflight;
+
+ inflight = page_pool_release(pool);
+ /* In rare cases, a driver bug may cause inflight to go negative.
+ * Don't reschedule release if inflight is 0 or negative.
+ * - If 0, the page_pool has been destroyed
+ * - If negative, we will never recover
+ * In both cases no reschedule is necessary.
+ */
+ if (inflight <= 0)
+ return;
+
+ /* Periodic warning for page pools the user can't see */
+ netdev = READ_ONCE(pool->slow.netdev);
+ if (time_after_eq(jiffies, pool->defer_warn) &&
+ (!netdev || netdev == NET_PTR_POISON)) {
+ int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
+
+ pr_warn("%s() stalled pool shutdown: id %u, %d inflight %d sec\n",
+ __func__, pool->user.id, inflight, sec);
+ pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+ }
+
+ /* Still not ready to be disconnected, retry later */
+ schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+}
+
+void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *),
+ const struct xdp_mem_info *mem)
+{
+ refcount_inc(&pool->user_cnt);
+ pool->disconnect = disconnect;
+ pool->xdp_mem_id = mem->id;
+}
+
+/**
+ * page_pool_enable_direct_recycling() - mark page pool as owned by NAPI
+ * @pool: page pool to modify
+ * @napi: NAPI instance to associate the page pool with
+ *
+ * Associate a page pool with a NAPI instance for lockless page recycling.
+ * This is useful when a new page pool has to be added to a NAPI instance
+ * without disabling that NAPI instance, to mark the point at which control
+ * path "hands over" the page pool to the NAPI instance. In most cases the
+ * driver can simply set the @napi field in struct page_pool_params, and does
+ * not have to call this helper.
+ *
+ * The function is idempotent, but does not implement any refcounting.
+ * Single page_pool_disable_direct_recycling() will disable recycling,
+ * no matter how many times enable was called.
+ */
+void page_pool_enable_direct_recycling(struct page_pool *pool,
+ struct napi_struct *napi)
+{
+ if (READ_ONCE(pool->p.napi) == napi)
+ return;
+ WARN_ON(!napi || pool->p.napi);
+
+ mutex_lock(&page_pools_lock);
+ WRITE_ONCE(pool->p.napi, napi);
+ mutex_unlock(&page_pools_lock);
+}
+EXPORT_SYMBOL(page_pool_enable_direct_recycling);
+
+void page_pool_disable_direct_recycling(struct page_pool *pool)
+{
+ /* Disable direct recycling based on pool->cpuid.
+ * Paired with READ_ONCE() in page_pool_napi_local().
+ */
+ WRITE_ONCE(pool->cpuid, -1);
+
+ if (!pool->p.napi)
+ return;
+
+ napi_assert_will_not_race(pool->p.napi);
+
+ mutex_lock(&page_pools_lock);
+ WRITE_ONCE(pool->p.napi, NULL);
+ mutex_unlock(&page_pools_lock);
+}
+EXPORT_SYMBOL(page_pool_disable_direct_recycling);
+
+void page_pool_destroy(struct page_pool *pool)
+{
+ if (!pool)
+ return;
+
+ if (!page_pool_put(pool))
+ return;
+
+ page_pool_disable_direct_recycling(pool);
+ page_pool_free_frag(pool);
+
+ if (!page_pool_release(pool))
+ return;
+
+ page_pool_detached(pool);
+ pool->defer_start = jiffies;
+ pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+
+ INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
+ schedule_delayed_work(&pool->release_dw, DEFER_TIME);
}
EXPORT_SYMBOL(page_pool_destroy);
+
+/* Caller must provide appropriate safe context, e.g. NAPI. */
+void page_pool_update_nid(struct page_pool *pool, int new_nid)
+{
+ netmem_ref netmem;
+
+ trace_page_pool_update_nid(pool, new_nid);
+ pool->p.nid = new_nid;
+
+ /* Flush pool alloc cache, as refill will check NUMA node */
+ while (pool->alloc.count) {
+ netmem = pool->alloc.cache[--pool->alloc.count];
+ page_pool_return_netmem(pool, netmem);
+ }
+}
+EXPORT_SYMBOL(page_pool_update_nid);
+
+bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr)
+{
+ return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr);
+}
+
+/* Associate a niov with a page pool. Should be followed by a matching
+ * net_mp_niov_clear_page_pool()
+ */
+void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov)
+{
+ netmem_ref netmem = net_iov_to_netmem(niov);
+
+ page_pool_set_pp_info(pool, netmem);
+
+ pool->pages_state_hold_cnt++;
+ trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt);
+}
+
+/* Disassociate a niov from a page pool. Should only be used in the
+ * ->release_netmem() path.
+ */
+void net_mp_niov_clear_page_pool(struct net_iov *niov)
+{
+ netmem_ref netmem = net_iov_to_netmem(niov);
+
+ page_pool_clear_pp_info(netmem);
+}
diff --git a/net/core/page_pool_priv.h b/net/core/page_pool_priv.h
new file mode 100644
index 000000000000..2fb06d5f6d55
--- /dev/null
+++ b/net/core/page_pool_priv.h
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __PAGE_POOL_PRIV_H
+#define __PAGE_POOL_PRIV_H
+
+#include <net/page_pool/helpers.h>
+
+#include "netmem_priv.h"
+
+extern struct mutex page_pools_lock;
+
+s32 page_pool_inflight(const struct page_pool *pool, bool strict);
+
+int page_pool_list(struct page_pool *pool);
+void page_pool_detached(struct page_pool *pool);
+void page_pool_unlist(struct page_pool *pool);
+
+static inline bool
+page_pool_set_dma_addr_netmem(netmem_ref netmem, dma_addr_t addr)
+{
+ if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) {
+ netmem_set_dma_addr(netmem, addr >> PAGE_SHIFT);
+
+ /* We assume page alignment to shave off bottom bits,
+ * if this "compression" doesn't work we need to drop.
+ */
+ return addr != (dma_addr_t)netmem_get_dma_addr(netmem)
+ << PAGE_SHIFT;
+ }
+
+ netmem_set_dma_addr(netmem, addr);
+ return false;
+}
+
+static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
+{
+ return page_pool_set_dma_addr_netmem(page_to_netmem(page), addr);
+}
+
+#if defined(CONFIG_PAGE_POOL)
+void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem);
+void page_pool_clear_pp_info(netmem_ref netmem);
+int page_pool_check_memory_provider(struct net_device *dev,
+ struct netdev_rx_queue *rxq);
+#else
+static inline void page_pool_set_pp_info(struct page_pool *pool,
+ netmem_ref netmem)
+{
+}
+static inline void page_pool_clear_pp_info(netmem_ref netmem)
+{
+}
+static inline int page_pool_check_memory_provider(struct net_device *dev,
+ struct netdev_rx_queue *rxq)
+{
+ return 0;
+}
+#endif
+
+#endif
diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c
new file mode 100644
index 000000000000..c82a95beceff
--- /dev/null
+++ b/net/core/page_pool_user.c
@@ -0,0 +1,441 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/xarray.h>
+#include <net/busy_poll.h>
+#include <net/net_debug.h>
+#include <net/netdev_rx_queue.h>
+#include <net/page_pool/helpers.h>
+#include <net/page_pool/types.h>
+#include <net/page_pool/memory_provider.h>
+#include <net/sock.h>
+
+#include "page_pool_priv.h"
+#include "netdev-genl-gen.h"
+
+static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1);
+/* Protects: page_pools, netdevice->page_pools, pool->p.napi, pool->slow.netdev,
+ * pool->user.
+ * Ordering: inside rtnl_lock
+ */
+DEFINE_MUTEX(page_pools_lock);
+
+/* Page pools are only reachable from user space (via netlink) if they are
+ * linked to a netdev at creation time. The following page pool "visibility"
+ * states are possible:
+ * - normal
+ * - user.list: linked to real netdev, netdev: real netdev
+ * - orphaned - real netdev has disappeared
+ * - user.list: linked to lo, netdev: lo
+ * - invisible - either (a) created without netdev linking, (b) unlisted due
+ * to error, or (c) the entire namespace which owned this pool disappeared
+ * - user.list: unhashed, netdev: unknown
+ */
+
+typedef int (*pp_nl_fill_cb)(struct sk_buff *rsp, const struct page_pool *pool,
+ const struct genl_info *info);
+
+static int
+netdev_nl_page_pool_get_do(struct genl_info *info, u32 id, pp_nl_fill_cb fill)
+{
+ struct page_pool *pool;
+ struct sk_buff *rsp;
+ int err;
+
+ mutex_lock(&page_pools_lock);
+ pool = xa_load(&page_pools, id);
+ if (!pool || hlist_unhashed(&pool->user.list) ||
+ !net_eq(dev_net(pool->slow.netdev), genl_info_net(info))) {
+ err = -ENOENT;
+ goto err_unlock;
+ }
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp) {
+ err = -ENOMEM;
+ goto err_unlock;
+ }
+
+ err = fill(rsp, pool, info);
+ if (err)
+ goto err_free_msg;
+
+ mutex_unlock(&page_pools_lock);
+
+ return genlmsg_reply(rsp, info);
+
+err_free_msg:
+ nlmsg_free(rsp);
+err_unlock:
+ mutex_unlock(&page_pools_lock);
+ return err;
+}
+
+struct page_pool_dump_cb {
+ unsigned long ifindex;
+ u32 pp_id;
+};
+
+static int
+netdev_nl_page_pool_get_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ pp_nl_fill_cb fill)
+{
+ struct page_pool_dump_cb *state = (void *)cb->ctx;
+ const struct genl_info *info = genl_info_dump(cb);
+ struct net *net = sock_net(skb->sk);
+ struct net_device *netdev;
+ struct page_pool *pool;
+ int err = 0;
+
+ rtnl_lock();
+ mutex_lock(&page_pools_lock);
+ for_each_netdev_dump(net, netdev, state->ifindex) {
+ hlist_for_each_entry(pool, &netdev->page_pools, user.list) {
+ if (state->pp_id && state->pp_id < pool->user.id)
+ continue;
+
+ state->pp_id = pool->user.id;
+ err = fill(skb, pool, info);
+ if (err)
+ goto out;
+ }
+
+ state->pp_id = 0;
+ }
+out:
+ mutex_unlock(&page_pools_lock);
+ rtnl_unlock();
+
+ return err;
+}
+
+static int
+page_pool_nl_stats_fill(struct sk_buff *rsp, const struct page_pool *pool,
+ const struct genl_info *info)
+{
+#ifdef CONFIG_PAGE_POOL_STATS
+ struct page_pool_stats stats = {};
+ struct nlattr *nest;
+ void *hdr;
+
+ if (!page_pool_get_stats(pool, &stats))
+ return 0;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ nest = nla_nest_start(rsp, NETDEV_A_PAGE_POOL_STATS_INFO);
+
+ if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id) ||
+ (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX &&
+ nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX,
+ pool->slow.netdev->ifindex)))
+ goto err_cancel_nest;
+
+ nla_nest_end(rsp, nest);
+
+ if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_FAST,
+ stats.alloc_stats.fast) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW,
+ stats.alloc_stats.slow) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW_HIGH_ORDER,
+ stats.alloc_stats.slow_high_order) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_EMPTY,
+ stats.alloc_stats.empty) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_REFILL,
+ stats.alloc_stats.refill) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_WAIVE,
+ stats.alloc_stats.waive) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHED,
+ stats.recycle_stats.cached) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHE_FULL,
+ stats.recycle_stats.cache_full) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING,
+ stats.recycle_stats.ring) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING_FULL,
+ stats.recycle_stats.ring_full) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RELEASED_REFCNT,
+ stats.recycle_stats.released_refcnt))
+ goto err_cancel_msg;
+
+ genlmsg_end(rsp, hdr);
+
+ return 0;
+err_cancel_nest:
+ nla_nest_cancel(rsp, nest);
+err_cancel_msg:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+#else
+ GENL_SET_ERR_MSG(info, "kernel built without CONFIG_PAGE_POOL_STATS");
+ return -EOPNOTSUPP;
+#endif
+}
+
+int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct nlattr *tb[ARRAY_SIZE(netdev_page_pool_info_nl_policy)];
+ struct nlattr *nest;
+ int err;
+ u32 id;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_STATS_INFO))
+ return -EINVAL;
+
+ nest = info->attrs[NETDEV_A_PAGE_POOL_STATS_INFO];
+ err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest,
+ netdev_page_pool_info_nl_policy,
+ info->extack);
+ if (err)
+ return err;
+
+ if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, NETDEV_A_PAGE_POOL_ID))
+ return -EINVAL;
+ if (tb[NETDEV_A_PAGE_POOL_IFINDEX]) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[NETDEV_A_PAGE_POOL_IFINDEX],
+ "selecting by ifindex not supported");
+ return -EINVAL;
+ }
+
+ id = nla_get_uint(tb[NETDEV_A_PAGE_POOL_ID]);
+
+ return netdev_nl_page_pool_get_do(info, id, page_pool_nl_stats_fill);
+}
+
+int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_stats_fill);
+}
+
+static int
+page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool,
+ const struct genl_info *info)
+{
+ size_t inflight, refsz;
+ unsigned int napi_id;
+ void *hdr;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id))
+ goto err_cancel;
+
+ if (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX &&
+ nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX,
+ pool->slow.netdev->ifindex))
+ goto err_cancel;
+
+ napi_id = pool->p.napi ? READ_ONCE(pool->p.napi->napi_id) : 0;
+ if (napi_id_valid(napi_id) &&
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, napi_id))
+ goto err_cancel;
+
+ inflight = page_pool_inflight(pool, false);
+ refsz = PAGE_SIZE << pool->p.order;
+ if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT, inflight) ||
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT_MEM,
+ inflight * refsz))
+ goto err_cancel;
+ if (pool->user.detach_time &&
+ nla_put_uint(rsp, NETDEV_A_PAGE_POOL_DETACH_TIME,
+ pool->user.detach_time))
+ goto err_cancel;
+
+ if (pool->mp_ops && pool->mp_ops->nl_fill(pool->mp_priv, rsp, NULL))
+ goto err_cancel;
+
+ genlmsg_end(rsp, hdr);
+
+ return 0;
+err_cancel:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+static void netdev_nl_page_pool_event(const struct page_pool *pool, u32 cmd)
+{
+ struct genl_info info;
+ struct sk_buff *ntf;
+ struct net *net;
+
+ lockdep_assert_held(&page_pools_lock);
+
+ /* 'invisible' page pools don't matter */
+ if (hlist_unhashed(&pool->user.list))
+ return;
+ net = dev_net(pool->slow.netdev);
+
+ if (!genl_has_listeners(&netdev_nl_family, net, NETDEV_NLGRP_PAGE_POOL))
+ return;
+
+ genl_info_init_ntf(&info, &netdev_nl_family, cmd);
+
+ ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!ntf)
+ return;
+
+ if (page_pool_nl_fill(ntf, pool, &info)) {
+ nlmsg_free(ntf);
+ return;
+ }
+
+ genlmsg_multicast_netns(&netdev_nl_family, net, ntf,
+ 0, NETDEV_NLGRP_PAGE_POOL, GFP_KERNEL);
+}
+
+int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ u32 id;
+
+ if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_ID))
+ return -EINVAL;
+
+ id = nla_get_uint(info->attrs[NETDEV_A_PAGE_POOL_ID]);
+
+ return netdev_nl_page_pool_get_do(info, id, page_pool_nl_fill);
+}
+
+int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_fill);
+}
+
+int page_pool_list(struct page_pool *pool)
+{
+ static u32 id_alloc_next;
+ int err;
+
+ mutex_lock(&page_pools_lock);
+ err = xa_alloc_cyclic(&page_pools, &pool->user.id, pool, xa_limit_32b,
+ &id_alloc_next, GFP_KERNEL);
+ if (err < 0)
+ goto err_unlock;
+
+ INIT_HLIST_NODE(&pool->user.list);
+ if (pool->slow.netdev) {
+ hlist_add_head(&pool->user.list,
+ &pool->slow.netdev->page_pools);
+ netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF);
+ }
+
+ mutex_unlock(&page_pools_lock);
+ return 0;
+
+err_unlock:
+ mutex_unlock(&page_pools_lock);
+ return err;
+}
+
+void page_pool_detached(struct page_pool *pool)
+{
+ mutex_lock(&page_pools_lock);
+ pool->user.detach_time = ktime_get_boottime_seconds();
+ netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF);
+ mutex_unlock(&page_pools_lock);
+}
+
+void page_pool_unlist(struct page_pool *pool)
+{
+ mutex_lock(&page_pools_lock);
+ netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_DEL_NTF);
+ xa_erase(&page_pools, pool->user.id);
+ if (!hlist_unhashed(&pool->user.list))
+ hlist_del(&pool->user.list);
+ mutex_unlock(&page_pools_lock);
+}
+
+int page_pool_check_memory_provider(struct net_device *dev,
+ struct netdev_rx_queue *rxq)
+{
+ void *binding = rxq->mp_params.mp_priv;
+ struct page_pool *pool;
+ struct hlist_node *n;
+
+ if (!binding)
+ return 0;
+
+ mutex_lock(&page_pools_lock);
+ hlist_for_each_entry_safe(pool, n, &dev->page_pools, user.list) {
+ if (pool->mp_priv != binding)
+ continue;
+
+ if (pool->slow.queue_idx == get_netdev_rx_queue_index(rxq)) {
+ mutex_unlock(&page_pools_lock);
+ return 0;
+ }
+ }
+ mutex_unlock(&page_pools_lock);
+ return -ENODATA;
+}
+
+static void page_pool_unreg_netdev_wipe(struct net_device *netdev)
+{
+ struct page_pool *pool;
+ struct hlist_node *n;
+
+ mutex_lock(&page_pools_lock);
+ hlist_for_each_entry_safe(pool, n, &netdev->page_pools, user.list) {
+ hlist_del_init(&pool->user.list);
+ pool->slow.netdev = NET_PTR_POISON;
+ }
+ mutex_unlock(&page_pools_lock);
+}
+
+static void page_pool_unreg_netdev(struct net_device *netdev)
+{
+ struct page_pool *pool, *last;
+ struct net_device *lo;
+
+ lo = dev_net(netdev)->loopback_dev;
+
+ mutex_lock(&page_pools_lock);
+ last = NULL;
+ hlist_for_each_entry(pool, &netdev->page_pools, user.list) {
+ pool->slow.netdev = lo;
+ netdev_nl_page_pool_event(pool,
+ NETDEV_CMD_PAGE_POOL_CHANGE_NTF);
+ last = pool;
+ }
+ if (last)
+ hlist_splice_init(&netdev->page_pools, &last->user.list,
+ &lo->page_pools);
+ mutex_unlock(&page_pools_lock);
+}
+
+static int
+page_pool_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+
+ if (event != NETDEV_UNREGISTER)
+ return NOTIFY_DONE;
+
+ if (hlist_empty(&netdev->page_pools))
+ return NOTIFY_OK;
+
+ if (netdev->ifindex != LOOPBACK_IFINDEX)
+ page_pool_unreg_netdev(netdev);
+ else
+ page_pool_unreg_netdev_wipe(netdev);
+ return NOTIFY_OK;
+}
+
+static struct notifier_block page_pool_netdevice_nb = {
+ .notifier_call = page_pool_netdevice_event,
+};
+
+static int __init page_pool_user_init(void)
+{
+ return register_netdevice_notifier(&page_pool_netdevice_nb);
+}
+
+subsys_initcall(page_pool_user_init);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 7f6938405fa1..d41b03fd1f63 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Authors:
* Copyright 2001, 2002 by Robert Olsson <robert.olsson@its.uu.se>
@@ -8,12 +9,6 @@
* Ben Greear <greearb@candelatech.com>
* Jens Låås <jens.laas@data.slu.se>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- *
* A tool for loading the network with preconfigurated packets.
* The tool is implemented as a linux module. Parameters are output
* device, delay (to hard_xmit), number of packets, and whether
@@ -60,9 +55,8 @@
*
* Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br)
*
- *
* 021124 Finished major redesign and rewrite for new functionality.
- * See Documentation/networking/pktgen.txt for how to use this.
+ * See Documentation/networking/pktgen.rst for how to use this.
*
* The new operation:
* For each CPU one thread/process is created at start. This process checks
@@ -75,7 +69,7 @@
*
* By design there should only be *one* "controlling" process. In practice
* multiple write accesses gives unpredictable result. Understood by "write"
- * to /proc gives result code thats should be read be the "writer".
+ * to /proc gives result code that should be read by the "writer".
* For practical use this should be no problem.
*
* Note when adding devices to a specific CPU there good idea to also assign
@@ -114,13 +108,13 @@
*
* Fixed src_mac command to set source mac of packet to value specified in
* command by Adit Ranadive <adit.262@gmail.com>
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/sys.h>
#include <linux/types.h>
+#include <linux/minmax.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
@@ -158,15 +152,14 @@
#include <linux/etherdevice.h>
#include <linux/kthread.h>
#include <linux/prefetch.h>
+#include <linux/mmzone.h>
#include <net/net_namespace.h>
#include <net/checksum.h>
#include <net/ipv6.h>
#include <net/udp.h>
#include <net/ip6_checksum.h>
#include <net/addrconf.h>
-#ifdef CONFIG_XFRM
#include <net/xfrm.h>
-#endif
#include <net/netns/generic.h>
#include <asm/byteorder.h>
#include <linux/rcupdate.h>
@@ -181,8 +174,11 @@
#define IP_NAME_SZ 32
#define MAX_MPLS_LABELS 16 /* This is the max label stack depth */
#define MPLS_STACK_BOTTOM htonl(0x00000100)
+/* Max number of internet mix entries that can be specified in imix_weights. */
+#define MAX_IMIX_ENTRIES 20
+#define IMIX_PRECISION 100 /* Precision of IMIX distribution */
-#define func_enter() pr_debug("entering %s\n", __func__);
+#define func_enter() pr_debug("entering %s\n", __func__)
#define PKT_FLAGS \
pf(IPV6) /* Interface in IPV6 Mode */ \
@@ -203,6 +199,7 @@
pf(VID_RND) /* Random VLAN ID */ \
pf(SVID_RND) /* Random SVLAN ID */ \
pf(NODE) /* Node memory alloc*/ \
+ pf(SHARED) /* Shared SKB */ \
#define pf(flag) flag##_SHIFT,
enum pkt_flags {
@@ -231,12 +228,12 @@ static char *pkt_flag_names[] = {
/* Xmit modes */
#define M_START_XMIT 0 /* Default normal TX */
-#define M_NETIF_RECEIVE 1 /* Inject packets into stack */
+#define M_NETIF_RECEIVE 1 /* Inject packets into stack */
#define M_QUEUE_XMIT 2 /* Inject packet into qdisc */
/* If lock -- protects updating of if_list */
-#define if_lock(t) mutex_lock(&(t->if_lock));
-#define if_unlock(t) mutex_unlock(&(t->if_lock));
+#define if_lock(t) mutex_lock(&(t->if_lock))
+#define if_unlock(t) mutex_unlock(&(t->if_lock))
/* Used to help with determining the pkts on receive */
#define PKTGEN_MAGIC 0xbe9be955
@@ -248,6 +245,12 @@ static char *pkt_flag_names[] = {
#define VLAN_TAG_SIZE(x) ((x)->vlan_id == 0xffff ? 0 : 4)
#define SVLAN_TAG_SIZE(x) ((x)->svlan_id == 0xffff ? 0 : 4)
+struct imix_pkt {
+ u64 size;
+ u64 weight;
+ u64 count_so_far;
+};
+
struct flow_state {
__be32 cur_daddr;
int count;
@@ -281,7 +284,8 @@ struct pktgen_dev {
int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */
int nfrags;
int removal_mark; /* non-zero => the device is marked for
- * removal by worker thread */
+ * removal by worker thread
+ */
struct page *page;
u64 delay; /* nano-seconds */
@@ -344,10 +348,18 @@ struct pktgen_dev {
__u16 udp_dst_max; /* exclusive, dest UDP port */
/* DSCP + ECN */
- __u8 tos; /* six MSB of (former) IPv4 TOS
- are for dscp codepoint */
- __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6
- (see RFC 3260, sec. 4) */
+ __u8 tos; /* six MSB of (former) IPv4 TOS
+ * are for dscp codepoint
+ */
+ __u8 traffic_class; /* ditto for the (former) Traffic Class in IPv6
+ * (see RFC 3260, sec. 4)
+ */
+
+ /* IMIX */
+ unsigned int n_imix_entries;
+ struct imix_pkt imix_entries[MAX_IMIX_ENTRIES];
+ /* Maps the 0..IMIX_PRECISION range to an imix_entries index based on probability */
+ __u8 imix_distribution[IMIX_PRECISION];
/* MPLS */
unsigned int nr_labels; /* Depth of stack, 0 = no MPLS */
@@ -381,12 +393,12 @@ struct pktgen_dev {
__u8 hh[14];
/* = {
- 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
-
- We fill in SRC address later
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- 0x08, 0x00
- };
+ * 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB,
+ *
+ * We fill in SRC address later
+ * 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ * 0x08, 0x00
+ * };
*/
__u16 pad; /* pad out the hh struct to an even 16 bytes */
@@ -401,6 +413,7 @@ struct pktgen_dev {
* device name (not when the inject is
* started as it used to do.)
*/
+ netdevice_tracker dev_tracker;
char odevname[32];
struct flow_state *flows;
unsigned int cflows; /* Concurrent flows (config) */
@@ -449,7 +462,8 @@ struct pktgen_thread {
char result[512];
/* Field for thread to receive "posted" events terminate,
- stop ifs etc. */
+ * stop ifs etc.
+ */
u32 control;
int cpu;
@@ -463,8 +477,7 @@ struct pktgen_thread {
#define FIND 0
static const char version[] =
- "Packet Generator for packet performance testing. "
- "Version: " VERSION "\n";
+ "Packet Generator for packet performance testing. Version: " VERSION "\n";
static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *i);
static int pktgen_add_device(struct pktgen_thread *t, const char *ifname);
@@ -473,10 +486,11 @@ static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t,
static int pktgen_device_event(struct notifier_block *, unsigned long, void *);
static void pktgen_run_all_threads(struct pktgen_net *pn);
static void pktgen_reset_all_threads(struct pktgen_net *pn);
-static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn);
+static void pktgen_stop_all_threads(struct pktgen_net *pn);
static void pktgen_stop(struct pktgen_thread *t);
static void pktgen_clear_counters(struct pktgen_dev *pkt_dev);
+static void fill_imix_distribution(struct pktgen_dev *pkt_dev);
/* Module parameters, defaults. */
static int pg_count_d __read_mostly = 1000;
@@ -505,31 +519,30 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
char data[128];
+ size_t max;
struct pktgen_net *pn = net_generic(current->nsproxy->net_ns, pg_net_id);
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- if (count == 0)
+ if (count < 1)
return -EINVAL;
- if (count > sizeof(data))
- count = sizeof(data);
-
- if (copy_from_user(data, buf, count))
+ max = min(count, sizeof(data) - 1);
+ if (copy_from_user(data, buf, max))
return -EFAULT;
- data[count - 1] = 0; /* Strip trailing '\n' and terminate string */
+ if (data[max - 1] == '\n')
+ data[max - 1] = 0; /* strip trailing '\n', terminate string */
+ else
+ data[max] = 0; /* terminate string */
if (!strcmp(data, "stop"))
- pktgen_stop_all_threads_ifs(pn);
-
+ pktgen_stop_all_threads(pn);
else if (!strcmp(data, "start"))
pktgen_run_all_threads(pn);
-
else if (!strcmp(data, "reset"))
pktgen_reset_all_threads(pn);
-
else
return -EINVAL;
@@ -538,15 +551,15 @@ static ssize_t pgctrl_write(struct file *file, const char __user *buf,
static int pgctrl_open(struct inode *inode, struct file *file)
{
- return single_open(file, pgctrl_show, PDE_DATA(inode));
+ return single_open(file, pgctrl_show, pde_data(inode));
}
-static const struct file_operations pktgen_fops = {
- .open = pgctrl_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = pgctrl_write,
- .release = single_release,
+static const struct proc_ops pktgen_proc_ops = {
+ .proc_open = pgctrl_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = pgctrl_write,
+ .proc_release = single_release,
};
static int pktgen_if_show(struct seq_file *seq, void *v)
@@ -561,6 +574,16 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
(unsigned long long)pkt_dev->count, pkt_dev->min_pkt_size,
pkt_dev->max_pkt_size);
+ if (pkt_dev->n_imix_entries > 0) {
+ seq_puts(seq, " imix_weights: ");
+ for (i = 0; i < pkt_dev->n_imix_entries; i++) {
+ seq_printf(seq, "%llu,%llu ",
+ pkt_dev->imix_entries[i].size,
+ pkt_dev->imix_entries[i].weight);
+ }
+ seq_puts(seq, "\n");
+ }
+
seq_printf(seq,
" frags: %d delay: %llu clone_skb: %d ifname: %s\n",
pkt_dev->nfrags, (unsigned long long) pkt_dev->delay,
@@ -605,8 +628,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
seq_printf(seq, "%pM\n", pkt_dev->dst_mac);
seq_printf(seq,
- " udp_src_min: %d udp_src_max: %d"
- " udp_dst_min: %d udp_dst_max: %d\n",
+ " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n",
pkt_dev->udp_src_min, pkt_dev->udp_src_max,
pkt_dev->udp_dst_min, pkt_dev->udp_dst_max);
@@ -651,19 +673,19 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
seq_puts(seq, " Flags: ");
for (i = 0; i < NR_PKT_FLAGS; i++) {
- if (i == F_FLOW_SEQ)
+ if (i == FLOW_SEQ_SHIFT)
if (!pkt_dev->cflows)
continue;
- if (pkt_dev->flags & (1 << i))
+ if (pkt_dev->flags & (1 << i)) {
seq_printf(seq, "%s ", pkt_flag_names[i]);
- else if (i == F_FLOW_SEQ)
- seq_puts(seq, "FLOW_RND ");
-
#ifdef CONFIG_XFRM
- if (i == F_IPSEC && pkt_dev->spi)
- seq_printf(seq, "spi:%u", pkt_dev->spi);
+ if (i == IPSEC_SHIFT && pkt_dev->spi)
+ seq_printf(seq, "spi:%u ", pkt_dev->spi);
#endif
+ } else if (i == FLOW_SEQ_SHIFT) {
+ seq_puts(seq, "FLOW_RND ");
+ }
}
seq_puts(seq, "\n");
@@ -678,6 +700,18 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
(unsigned long long)pkt_dev->sofar,
(unsigned long long)pkt_dev->errors);
+ if (pkt_dev->n_imix_entries > 0) {
+ int i;
+
+ seq_puts(seq, " imix_size_counts: ");
+ for (i = 0; i < pkt_dev->n_imix_entries; i++) {
+ seq_printf(seq, "%llu,%llu ",
+ pkt_dev->imix_entries[i].size,
+ pkt_dev->imix_entries[i].count_so_far);
+ }
+ seq_puts(seq, "\n");
+ }
+
seq_printf(seq,
" started: %lluus stopped: %lluus idle: %lluus\n",
(unsigned long long) ktime_to_us(pkt_dev->started_at),
@@ -713,34 +747,37 @@ static int pktgen_if_show(struct seq_file *seq, void *v)
}
-static int hex32_arg(const char __user *user_buffer, unsigned long maxlen,
- __u32 *num)
+static ssize_t hex32_arg(const char __user *user_buffer, size_t maxlen,
+ __u32 *num)
{
- int i = 0;
+ size_t i = 0;
+
*num = 0;
for (; i < maxlen; i++) {
int value;
char c;
- *num <<= 4;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
value = hex_to_bin(c);
- if (value >= 0)
+ if (value >= 0) {
+ *num <<= 4;
*num |= value;
- else
+ } else {
break;
+ }
}
return i;
}
-static int count_trail_chars(const char __user * user_buffer,
- unsigned int maxlen)
+static ssize_t count_trail_chars(const char __user *user_buffer, size_t maxlen)
{
- int i;
+ size_t i;
for (i = 0; i < maxlen; i++) {
char c;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
switch (c) {
@@ -759,14 +796,15 @@ done:
return i;
}
-static long num_arg(const char __user *user_buffer, unsigned long maxlen,
- unsigned long *num)
+static ssize_t num_arg(const char __user *user_buffer, size_t maxlen,
+ unsigned long *num)
{
- int i;
+ size_t i;
*num = 0;
for (i = 0; i < maxlen; i++) {
char c;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
if ((c >= '0') && (c <= '9')) {
@@ -778,12 +816,13 @@ static long num_arg(const char __user *user_buffer, unsigned long maxlen,
return i;
}
-static int strn_len(const char __user * user_buffer, unsigned int maxlen)
+static ssize_t strn_len(const char __user *user_buffer, size_t maxlen)
{
- int i;
+ size_t i;
for (i = 0; i < maxlen; i++) {
char c;
+
if (get_user(c, &user_buffer[i]))
return -EFAULT;
switch (c) {
@@ -792,6 +831,7 @@ static int strn_len(const char __user * user_buffer, unsigned int maxlen)
case '\r':
case '\t':
case ' ':
+ case '=':
goto done_str;
default:
break;
@@ -801,29 +841,110 @@ done_str:
return i;
}
-static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev)
+/* Parses imix entries from the user buffer into pkt_dev->imix_entries.
+ * The user buffer should consist of imix entries separated by single spaces,
+ * where each entry consists of a size and a weight delimited by a comma.
+ * "size1,weight_1 size2,weight_2 ... size_n,weight_n" for example.
+ *
+ * @buffer:  user-space pointer to the first character of the entry list
+ * @maxlen:  number of bytes that may be read from @buffer
+ * @pkt_dev: device whose imix_entries[] and n_imix_entries are rewritten
+ *
+ * n_imix_entries is reset to 0 before parsing and incremented once per
+ * accepted entry. Each number is read from a window of at most 10
+ * characters. A size smaller than 14 + 20 + 8 is raised to that minimum.
+ *
+ * Returns the number of bytes consumed from @buffer (including the single
+ * separator character read after the last entry, when the buffer was not
+ * exhausted), or a negative errno:
+ *   -E2BIG  another entry follows after MAX_IMIX_ENTRIES were stored
+ *   -EINVAL the buffer ends inside an entry, the ',' is missing, or the
+ *           weight is 0
+ *   -EFAULT a byte could not be read from user space
+ * Errors returned by num_arg() are passed through unchanged.
+ *
+ * NOTE(review): on an error return n_imix_entries keeps the count of the
+ * entries stored before the failure; it is not restored to its old value.
+ */
+static ssize_t get_imix_entries(const char __user *buffer,
+ size_t maxlen,
+ struct pktgen_dev *pkt_dev)
+{
+ size_t i = 0, max;
+ ssize_t len;
+ char c;
+
+ pkt_dev->n_imix_entries = 0; /* start from an empty table */
+
+ do {
+ unsigned long weight;
+ unsigned long size;
+
+ if (pkt_dev->n_imix_entries >= MAX_IMIX_ENTRIES)
+ return -E2BIG;
+
+ if (i >= maxlen) /* a separator was seen but no entry follows */
+ return -EINVAL;
+
+ max = min(10, maxlen - i); /* never read past the caller's buffer */
+ len = num_arg(&buffer[i], max, &size);
+ if (len < 0)
+ return len;
+ i += len;
+ if (i >= maxlen)
+ return -EINVAL;
+ if (get_user(c, &buffer[i]))
+ return -EFAULT;
+ /* Check for comma between size_i and weight_i */
+ if (c != ',')
+ return -EINVAL;
+ i++;
+ if (i >= maxlen) /* nothing left to hold the weight */
+ return -EINVAL;
+
+ if (size < 14 + 20 + 8) /* presumably Ethernet + IPv4 + UDP header bytes - TODO confirm */
+ size = 14 + 20 + 8;
+
+ max = min(10, maxlen - i);
+ len = num_arg(&buffer[i], max, &weight);
+ if (len < 0)
+ return len;
+ if (weight <= 0) /* weight is unsigned, so this rejects exactly 0 */
+ return -EINVAL;
+
+ pkt_dev->imix_entries[pkt_dev->n_imix_entries].size = size;
+ pkt_dev->imix_entries[pkt_dev->n_imix_entries].weight = weight;
+
+ i += len;
+ pkt_dev->n_imix_entries++;
+
+ if (i >= maxlen) /* buffer fully consumed: done */
+ break;
+ if (get_user(c, &buffer[i]))
+ return -EFAULT;
+ i++; /* the separator counts as consumed even when it ends the list */
+ } while (c == ' ');
+
+ return i;
+}
+
+static ssize_t get_labels(const char __user *buffer,
+ size_t maxlen, struct pktgen_dev *pkt_dev)
{
unsigned int n = 0;
+ size_t i = 0, max;
+ ssize_t len;
char c;
- ssize_t i = 0;
- int len;
pkt_dev->nr_labels = 0;
do {
__u32 tmp;
- len = hex32_arg(&buffer[i], 8, &tmp);
- if (len <= 0)
+
+ if (n >= MAX_MPLS_LABELS)
+ return -E2BIG;
+
+ if (i >= maxlen)
+ return -EINVAL;
+
+ max = min(8, maxlen - i);
+ len = hex32_arg(&buffer[i], max, &tmp);
+ if (len < 0)
return len;
+
+ /* return empty list in case of invalid input or zero value */
+ if (len == 0 || tmp == 0)
+ return maxlen;
+
pkt_dev->labels[n] = htonl(tmp);
if (pkt_dev->labels[n] & MPLS_STACK_BOTTOM)
pkt_dev->flags |= F_MPLS_RND;
i += len;
+ n++;
+ if (i >= maxlen)
+ break;
if (get_user(c, &buffer[i]))
return -EFAULT;
i++;
- n++;
- if (n >= MAX_MPLS_LABELS)
- return -E2BIG;
} while (c == ',');
pkt_dev->nr_labels = n;
@@ -860,16 +981,16 @@ static __u32 pktgen_read_flag(const char *f, bool *disable)
}
static ssize_t pktgen_if_write(struct file *file,
- const char __user * user_buffer, size_t count,
- loff_t * offset)
+ const char __user *user_buffer, size_t count,
+ loff_t *offset)
{
struct seq_file *seq = file->private_data;
struct pktgen_dev *pkt_dev = seq->private;
- int i, max, len;
+ size_t i, max;
+ ssize_t len;
char name[16], valstr[32];
unsigned long value = 0;
char *pg_result = NULL;
- int tmp = 0;
char buf[128];
pg_result = &(pkt_dev->result[0]);
@@ -880,16 +1001,16 @@ static ssize_t pktgen_if_write(struct file *file,
}
max = count;
- tmp = count_trail_chars(user_buffer, max);
- if (tmp < 0) {
+ len = count_trail_chars(user_buffer, max);
+ if (len < 0) {
pr_warn("illegal format\n");
- return tmp;
+ return len;
}
- i = tmp;
+ i = len;
/* Read variable name */
-
- len = strn_len(&user_buffer[i], sizeof(name) - 1);
+ max = min(sizeof(name) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -917,35 +1038,35 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "min_pkt_size")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value < 14 + 20 + 8)
value = 14 + 20 + 8;
if (value != pkt_dev->min_pkt_size) {
pkt_dev->min_pkt_size = value;
pkt_dev->cur_pkt_size = value;
}
- sprintf(pg_result, "OK: min_pkt_size=%u",
+ sprintf(pg_result, "OK: min_pkt_size=%d",
pkt_dev->min_pkt_size);
return count;
}
if (!strcmp(name, "max_pkt_size")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value < 14 + 20 + 8)
value = 14 + 20 + 8;
if (value != pkt_dev->max_pkt_size) {
pkt_dev->max_pkt_size = value;
pkt_dev->cur_pkt_size = value;
}
- sprintf(pg_result, "OK: max_pkt_size=%u",
+ sprintf(pg_result, "OK: max_pkt_size=%d",
pkt_dev->max_pkt_size);
return count;
}
@@ -953,11 +1074,11 @@ static ssize_t pktgen_if_write(struct file *file,
/* Shortcut for min = max */
if (!strcmp(name, "pkt_size")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value < 14 + 20 + 8)
value = 14 + 20 + 8;
if (value != pkt_dev->min_pkt_size) {
@@ -965,37 +1086,51 @@ static ssize_t pktgen_if_write(struct file *file,
pkt_dev->max_pkt_size = value;
pkt_dev->cur_pkt_size = value;
}
- sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size);
+ sprintf(pg_result, "OK: pkt_size=%d", pkt_dev->min_pkt_size);
+ return count;
+ }
+
+ if (!strcmp(name, "imix_weights")) {
+ if (pkt_dev->clone_skb > 0)
+ return -EINVAL;
+
+ max = count - i;
+ len = get_imix_entries(&user_buffer[i], max, pkt_dev);
+ if (len < 0)
+ return len;
+
+ fill_imix_distribution(pkt_dev);
+
return count;
}
if (!strcmp(name, "debug")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
debug = value;
sprintf(pg_result, "OK: debug=%u", debug);
return count;
}
if (!strcmp(name, "frags")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->nfrags = value;
- sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags);
+ sprintf(pg_result, "OK: frags=%d", pkt_dev->nfrags);
return count;
}
if (!strcmp(name, "delay")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value == 0x7FFFFFFF)
pkt_dev->delay = ULLONG_MAX;
else
@@ -1006,13 +1141,13 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "rate")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (!value)
- return len;
+ return -EINVAL;
pkt_dev->delay = pkt_dev->min_pkt_size*8*NSEC_PER_USEC/value;
if (debug)
pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
@@ -1021,13 +1156,13 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "ratep")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (!value)
- return len;
+ return -EINVAL;
pkt_dev->delay = NSEC_PER_SEC/value;
if (debug)
pr_info("Delay set at: %llu ns\n", pkt_dev->delay);
@@ -1036,11 +1171,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "udp_src_min")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value != pkt_dev->udp_src_min) {
pkt_dev->udp_src_min = value;
pkt_dev->cur_udp_src = value;
@@ -1049,11 +1184,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "udp_dst_min")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value != pkt_dev->udp_dst_min) {
pkt_dev->udp_dst_min = value;
pkt_dev->cur_udp_dst = value;
@@ -1062,11 +1197,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "udp_src_max")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value != pkt_dev->udp_src_max) {
pkt_dev->udp_src_max = value;
pkt_dev->cur_udp_src = value;
@@ -1075,11 +1210,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "udp_dst_max")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value != pkt_dev->udp_dst_max) {
pkt_dev->udp_dst_max = value;
pkt_dev->cur_udp_dst = value;
@@ -1088,36 +1223,43 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "clone_skb")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
+ /* clone_skb is not supported for netif_receive xmit_mode and
+ * IMIX mode.
+ */
if ((value > 0) &&
((pkt_dev->xmit_mode == M_NETIF_RECEIVE) ||
!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))
- return -ENOTSUPP;
- i += len;
+ return -EOPNOTSUPP;
+ if (value > 0 && (pkt_dev->n_imix_entries > 0 ||
+ !(pkt_dev->flags & F_SHARED)))
+ return -EINVAL;
+
pkt_dev->clone_skb = value;
sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb);
return count;
}
if (!strcmp(name, "count")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->count = value;
sprintf(pg_result, "OK: count=%llu",
(unsigned long long)pkt_dev->count);
return count;
}
if (!strcmp(name, "src_mac_count")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (pkt_dev->src_mac_count != value) {
pkt_dev->src_mac_count = value;
pkt_dev->cur_src_mac_offset = 0;
@@ -1127,11 +1269,11 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "dst_mac_count")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (pkt_dev->dst_mac_count != value) {
pkt_dev->dst_mac_count = value;
pkt_dev->cur_dst_mac_offset = 0;
@@ -1141,27 +1283,30 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "burst")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value > 1) &&
((pkt_dev->xmit_mode == M_QUEUE_XMIT) ||
((pkt_dev->xmit_mode == M_START_XMIT) &&
(!(pkt_dev->odev->priv_flags & IFF_TX_SKB_SHARING)))))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
+
+ if (value > 1 && !(pkt_dev->flags & F_SHARED))
+ return -EINVAL;
+
pkt_dev->burst = value < 1 ? 1 : value;
- sprintf(pg_result, "OK: burst=%d", pkt_dev->burst);
+ sprintf(pg_result, "OK: burst=%u", pkt_dev->burst);
return count;
}
if (!strcmp(name, "node")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
-
if (node_possible(value)) {
pkt_dev->node = value;
sprintf(pg_result, "OK: node=%d", pkt_dev->node);
@@ -1169,29 +1314,29 @@ static ssize_t pktgen_if_write(struct file *file,
put_page(pkt_dev->page);
pkt_dev->page = NULL;
}
- }
- else
+ } else {
sprintf(pg_result, "ERROR: node not possible");
+ }
return count;
}
if (!strcmp(name, "xmit_mode")) {
char f[32];
- memset(f, 0, 32);
- len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ max = min(sizeof(f) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
+ memset(f, 0, sizeof(f));
if (copy_from_user(f, &user_buffer[i], len))
return -EFAULT;
- i += len;
if (strcmp(f, "start_xmit") == 0) {
pkt_dev->xmit_mode = M_START_XMIT;
} else if (strcmp(f, "netif_receive") == 0) {
/* clone_skb set earlier, not supported in this mode */
if (pkt_dev->clone_skb > 0)
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
pkt_dev->xmit_mode = M_NETIF_RECEIVE;
@@ -1199,11 +1344,6 @@ static ssize_t pktgen_if_write(struct file *file,
* pktgen_xmit() is called
*/
pkt_dev->last_ok = 1;
-
- /* override clone_skb if user passed default value
- * at module loading time
- */
- pkt_dev->clone_skb = 0;
} else if (strcmp(f, "queue_xmit") == 0) {
pkt_dev->xmit_mode = M_QUEUE_XMIT;
pkt_dev->last_ok = 1;
@@ -1217,46 +1357,62 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "flag")) {
+ bool disable = false;
__u32 flag;
char f[32];
- bool disable = false;
+ char *end;
- memset(f, 0, 32);
- len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ max = min(sizeof(f) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
+ memset(f, 0, 32);
if (copy_from_user(f, &user_buffer[i], len))
return -EFAULT;
- i += len;
flag = pktgen_read_flag(f, &disable);
-
if (flag) {
- if (disable)
+ if (disable) {
+ /* If the "clone_skb" or "burst" parameters are
+ * configured, the skb still needs to be
+ * referenced by pktgen after it is sent, so
+ * the skb must remain shared.
+ */
+ if (flag == F_SHARED && (pkt_dev->clone_skb ||
+ pkt_dev->burst > 1))
+ return -EINVAL;
pkt_dev->flags &= ~flag;
- else
+ } else {
pkt_dev->flags |= flag;
- } else {
- sprintf(pg_result,
- "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s",
- f,
- "IPSRC_RND, IPDST_RND, UDPSRC_RND, UDPDST_RND, "
- "MACSRC_RND, MACDST_RND, TXSIZE_RND, IPV6, "
- "MPLS_RND, VID_RND, SVID_RND, FLOW_SEQ, "
- "QUEUE_MAP_RND, QUEUE_MAP_CPU, UDPCSUM, "
- "NO_TIMESTAMP, "
-#ifdef CONFIG_XFRM
- "IPSEC, "
-#endif
- "NODE_ALLOC\n");
+ }
+
+ sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
return count;
}
- sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags);
+
+ /* Unknown flag */
+ end = pkt_dev->result + sizeof(pkt_dev->result);
+ pg_result += sprintf(pg_result,
+ "Flag -:%s:- unknown\n"
+ "Available flags, (prepend ! to un-set flag):\n", f);
+
+ for (int n = 0; n < NR_PKT_FLAGS && pg_result < end; n++) {
+ if (!IS_ENABLED(CONFIG_XFRM) && n == IPSEC_SHIFT)
+ continue;
+ pg_result += snprintf(pg_result, end - pg_result,
+ "%s, ", pkt_flag_names[n]);
+ }
+ if (!WARN_ON_ONCE(pg_result >= end)) {
+ /* Remove the comma and whitespace at the end */
+ *(pg_result - 2) = '\0';
+ }
+
return count;
}
if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) {
- len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1);
+ max = min(sizeof(pkt_dev->dst_min) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1264,19 +1420,19 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->dst_min) != 0) {
- memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min));
- strcpy(pkt_dev->dst_min, buf);
+ strscpy_pad(pkt_dev->dst_min, buf);
pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
pkt_dev->cur_daddr = pkt_dev->daddr_min;
}
if (debug)
pr_debug("dst_min set to: %s\n", pkt_dev->dst_min);
- i += len;
+
sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min);
return count;
}
if (!strcmp(name, "dst_max")) {
- len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1);
+ max = min(sizeof(pkt_dev->dst_max) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1284,19 +1440,19 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->dst_max) != 0) {
- memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max));
- strcpy(pkt_dev->dst_max, buf);
+ strscpy_pad(pkt_dev->dst_max, buf);
pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
pkt_dev->cur_daddr = pkt_dev->daddr_max;
}
if (debug)
pr_debug("dst_max set to: %s\n", pkt_dev->dst_max);
- i += len;
+
sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max);
return count;
}
if (!strcmp(name, "dst6")) {
- len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ max = min(sizeof(buf) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1314,12 +1470,12 @@ static ssize_t pktgen_if_write(struct file *file,
if (debug)
pr_debug("dst6 set to: %s\n", buf);
- i += len;
sprintf(pg_result, "OK: dst6=%s", buf);
return count;
}
if (!strcmp(name, "dst6_min")) {
- len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ max = min(sizeof(buf) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1336,12 +1492,12 @@ static ssize_t pktgen_if_write(struct file *file,
if (debug)
pr_debug("dst6_min set to: %s\n", buf);
- i += len;
sprintf(pg_result, "OK: dst6_min=%s", buf);
return count;
}
if (!strcmp(name, "dst6_max")) {
- len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ max = min(sizeof(buf) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1357,12 +1513,12 @@ static ssize_t pktgen_if_write(struct file *file,
if (debug)
pr_debug("dst6_max set to: %s\n", buf);
- i += len;
sprintf(pg_result, "OK: dst6_max=%s", buf);
return count;
}
if (!strcmp(name, "src6")) {
- len = strn_len(&user_buffer[i], sizeof(buf) - 1);
+ max = min(sizeof(buf) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1380,12 +1536,12 @@ static ssize_t pktgen_if_write(struct file *file,
if (debug)
pr_debug("src6 set to: %s\n", buf);
- i += len;
sprintf(pg_result, "OK: src6=%s", buf);
return count;
}
if (!strcmp(name, "src_min")) {
- len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1);
+ max = min(sizeof(pkt_dev->src_min) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1393,19 +1549,19 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->src_min) != 0) {
- memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min));
- strcpy(pkt_dev->src_min, buf);
+ strscpy_pad(pkt_dev->src_min, buf);
pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
pkt_dev->cur_saddr = pkt_dev->saddr_min;
}
if (debug)
pr_debug("src_min set to: %s\n", pkt_dev->src_min);
- i += len;
+
sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min);
return count;
}
if (!strcmp(name, "src_max")) {
- len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1);
+ max = min(sizeof(pkt_dev->src_max) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1413,19 +1569,19 @@ static ssize_t pktgen_if_write(struct file *file,
return -EFAULT;
buf[len] = 0;
if (strcmp(buf, pkt_dev->src_max) != 0) {
- memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max));
- strcpy(pkt_dev->src_max, buf);
+ strscpy_pad(pkt_dev->src_max, buf);
pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
pkt_dev->cur_saddr = pkt_dev->saddr_max;
}
if (debug)
pr_debug("src_max set to: %s\n", pkt_dev->src_max);
- i += len;
+
sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max);
return count;
}
if (!strcmp(name, "dst_mac")) {
- len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
+ max = min(sizeof(valstr) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1442,7 +1598,8 @@ static ssize_t pktgen_if_write(struct file *file,
return count;
}
if (!strcmp(name, "src_mac")) {
- len = strn_len(&user_buffer[i], sizeof(valstr) - 1);
+ max = min(sizeof(valstr) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1466,11 +1623,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "flows")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value > MAX_CFLOWS)
value = MAX_CFLOWS;
@@ -1480,44 +1637,44 @@ static ssize_t pktgen_if_write(struct file *file,
}
#ifdef CONFIG_XFRM
if (!strcmp(name, "spi")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->spi = value;
sprintf(pg_result, "OK: spi=%u", pkt_dev->spi);
return count;
}
#endif
if (!strcmp(name, "flowlen")) {
- len = num_arg(&user_buffer[i], 10, &value);
+ max = min(10, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->lflow = value;
sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow);
return count;
}
if (!strcmp(name, "queue_map_min")) {
- len = num_arg(&user_buffer[i], 5, &value);
+ max = min(5, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->queue_map_min = value;
sprintf(pg_result, "OK: queue_map_min=%u", pkt_dev->queue_map_min);
return count;
}
if (!strcmp(name, "queue_map_max")) {
- len = num_arg(&user_buffer[i], 5, &value);
+ max = min(5, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->queue_map_max = value;
sprintf(pg_result, "OK: queue_map_max=%u", pkt_dev->queue_map_max);
return count;
@@ -1526,10 +1683,11 @@ static ssize_t pktgen_if_write(struct file *file,
if (!strcmp(name, "mpls")) {
unsigned int n, cnt;
- len = get_labels(&user_buffer[i], pkt_dev);
+ max = count - i;
+ len = get_labels(&user_buffer[i], max, pkt_dev);
if (len < 0)
return len;
- i += len;
+
cnt = sprintf(pg_result, "OK: mpls=");
for (n = 0; n < pkt_dev->nr_labels; n++)
cnt += sprintf(pg_result + cnt,
@@ -1547,11 +1705,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "vlan_id")) {
- len = num_arg(&user_buffer[i], 4, &value);
+ max = min(4, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if (value <= 4095) {
pkt_dev->vlan_id = value; /* turn on VLAN */
@@ -1574,11 +1732,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "vlan_p")) {
- len = num_arg(&user_buffer[i], 1, &value);
+ max = min(1, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value <= 7) && (pkt_dev->vlan_id != 0xffff)) {
pkt_dev->vlan_p = value;
sprintf(pg_result, "OK: vlan_p=%u", pkt_dev->vlan_p);
@@ -1589,11 +1747,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "vlan_cfi")) {
- len = num_arg(&user_buffer[i], 1, &value);
+ max = min(1, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value <= 1) && (pkt_dev->vlan_id != 0xffff)) {
pkt_dev->vlan_cfi = value;
sprintf(pg_result, "OK: vlan_cfi=%u", pkt_dev->vlan_cfi);
@@ -1604,11 +1762,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "svlan_id")) {
- len = num_arg(&user_buffer[i], 4, &value);
+ max = min(4, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value <= 4095) && ((pkt_dev->vlan_id != 0xffff))) {
pkt_dev->svlan_id = value; /* turn on SVLAN */
@@ -1631,11 +1789,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "svlan_p")) {
- len = num_arg(&user_buffer[i], 1, &value);
+ max = min(1, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value <= 7) && (pkt_dev->svlan_id != 0xffff)) {
pkt_dev->svlan_p = value;
sprintf(pg_result, "OK: svlan_p=%u", pkt_dev->svlan_p);
@@ -1646,11 +1804,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "svlan_cfi")) {
- len = num_arg(&user_buffer[i], 1, &value);
+ max = min(1, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
if ((value <= 1) && (pkt_dev->svlan_id != 0xffff)) {
pkt_dev->svlan_cfi = value;
sprintf(pg_result, "OK: svlan_cfi=%u", pkt_dev->svlan_cfi);
@@ -1661,12 +1819,13 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "tos")) {
- __u32 tmp_value = 0;
- len = hex32_arg(&user_buffer[i], 2, &tmp_value);
+ __u32 tmp_value;
+
+ max = min(2, count - i);
+ len = hex32_arg(&user_buffer[i], max, &tmp_value);
if (len < 0)
return len;
- i += len;
if (len == 2) {
pkt_dev->tos = tmp_value;
sprintf(pg_result, "OK: tos=0x%02x", pkt_dev->tos);
@@ -1677,12 +1836,13 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "traffic_class")) {
- __u32 tmp_value = 0;
- len = hex32_arg(&user_buffer[i], 2, &tmp_value);
+ __u32 tmp_value;
+
+ max = min(2, count - i);
+ len = hex32_arg(&user_buffer[i], max, &tmp_value);
if (len < 0)
return len;
- i += len;
if (len == 2) {
pkt_dev->traffic_class = tmp_value;
sprintf(pg_result, "OK: traffic_class=0x%02x", pkt_dev->traffic_class);
@@ -1693,11 +1853,11 @@ static ssize_t pktgen_if_write(struct file *file,
}
if (!strcmp(name, "skb_priority")) {
- len = num_arg(&user_buffer[i], 9, &value);
+ max = min(9, count - i);
+ len = num_arg(&user_buffer[i], max, &value);
if (len < 0)
return len;
- i += len;
pkt_dev->skb_priority = value;
sprintf(pg_result, "OK: skb_priority=%i",
pkt_dev->skb_priority);
@@ -1710,15 +1870,15 @@ static ssize_t pktgen_if_write(struct file *file,
static int pktgen_if_open(struct inode *inode, struct file *file)
{
- return single_open(file, pktgen_if_show, PDE_DATA(inode));
+ return single_open(file, pktgen_if_show, pde_data(inode));
}
-static const struct file_operations pktgen_if_fops = {
- .open = pktgen_if_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = pktgen_if_write,
- .release = single_release,
+static const struct proc_ops pktgen_if_proc_ops = {
+ .proc_open = pktgen_if_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = pktgen_if_write,
+ .proc_release = single_release,
};
static int pktgen_thread_show(struct seq_file *seq, void *v)
@@ -1752,12 +1912,13 @@ static int pktgen_thread_show(struct seq_file *seq, void *v)
}
static ssize_t pktgen_thread_write(struct file *file,
- const char __user * user_buffer,
- size_t count, loff_t * offset)
+ const char __user *user_buffer,
+ size_t count, loff_t *offset)
{
struct seq_file *seq = file->private_data;
struct pktgen_thread *t = seq->private;
- int i, max, len, ret;
+ size_t i, max;
+ ssize_t len, ret;
char name[40];
char *pg_result;
@@ -1774,8 +1935,8 @@ static ssize_t pktgen_thread_write(struct file *file,
i = len;
/* Read variable name */
-
- len = strn_len(&user_buffer[i], sizeof(name) - 1);
+ max = min(sizeof(name) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0)
return len;
@@ -1804,15 +1965,17 @@ static ssize_t pktgen_thread_write(struct file *file,
if (!strcmp(name, "add_device")) {
char f[32];
+
memset(f, 0, 32);
- len = strn_len(&user_buffer[i], sizeof(f) - 1);
+ max = min(sizeof(f) - 1, count - i);
+ len = strn_len(&user_buffer[i], max);
if (len < 0) {
ret = len;
goto out;
}
if (copy_from_user(f, &user_buffer[i], len))
return -EFAULT;
- i += len;
+
mutex_lock(&pktgen_thread_lock);
ret = pktgen_add_device(t, f);
mutex_unlock(&pktgen_thread_lock);
@@ -1847,15 +2010,15 @@ out:
static int pktgen_thread_open(struct inode *inode, struct file *file)
{
- return single_open(file, pktgen_thread_show, PDE_DATA(inode));
+ return single_open(file, pktgen_thread_show, pde_data(inode));
}
-static const struct file_operations pktgen_thread_fops = {
- .open = pktgen_thread_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .write = pktgen_thread_write,
- .release = single_release,
+static const struct proc_ops pktgen_thread_proc_ops = {
+ .proc_open = pktgen_thread_open,
+ .proc_read = seq_read,
+ .proc_lseek = seq_lseek,
+ .proc_write = pktgen_thread_write,
+ .proc_release = single_release,
};
/* Think find or remove for NN */
@@ -1932,7 +2095,7 @@ static void pktgen_change_name(const struct pktgen_net *pn, struct net_device *d
pkt_dev->entry = proc_create_data(dev->name, 0600,
pn->proc_dir,
- &pktgen_if_fops,
+ &pktgen_if_proc_ops,
pkt_dev);
if (!pkt_dev->entry)
pr_err("can't move proc entry for '%s'\n",
@@ -1999,7 +2162,7 @@ static int pktgen_setup_dev(const struct pktgen_net *pn,
/* Clean old setups */
if (pkt_dev->odev) {
- dev_put(pkt_dev->odev);
+ netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker);
pkt_dev->odev = NULL;
}
@@ -2009,14 +2172,15 @@ static int pktgen_setup_dev(const struct pktgen_net *pn,
return -ENODEV;
}
- if (odev->type != ARPHRD_ETHER) {
- pr_err("not an ethernet device: \"%s\"\n", ifname);
+ if (odev->type != ARPHRD_ETHER && odev->type != ARPHRD_LOOPBACK) {
+ pr_err("not an ethernet or loopback device: \"%s\"\n", ifname);
err = -EINVAL;
} else if (!netif_running(odev)) {
pr_err("device is down: \"%s\"\n", ifname);
err = -ENETDOWN;
} else {
pkt_dev->odev = odev;
+ netdev_tracker_alloc(odev, &pkt_dev->dev_tracker, GFP_KERNEL);
return 0;
}
@@ -2124,9 +2288,11 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
rcu_read_lock();
in_dev = __in_dev_get_rcu(pkt_dev->odev);
if (in_dev) {
- if (in_dev->ifa_list) {
- pkt_dev->saddr_min =
- in_dev->ifa_list->ifa_address;
+ const struct in_ifaddr *ifa;
+
+ ifa = rcu_dereference(in_dev->ifa_list);
+ if (ifa) {
+ pkt_dev->saddr_min = ifa->ifa_address;
pkt_dev->saddr_max = pkt_dev->saddr_min;
}
}
@@ -2160,7 +2326,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
s64 remaining;
struct hrtimer_sleeper t;
- hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ hrtimer_setup_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
hrtimer_set_expires(&t.timer, spin_until);
remaining = ktime_to_ns(hrtimer_expires_remaining(&t.timer));
@@ -2174,11 +2340,9 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
end_time = ktime_get();
} while (ktime_compare(end_time, spin_until) < 0);
} else {
- /* see do_nanosleep */
- hrtimer_init_sleeper(&t, current);
do {
set_current_state(TASK_INTERRUPTIBLE);
- hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
+ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_ABS);
if (likely(t.task))
schedule();
@@ -2222,7 +2386,7 @@ static inline int f_pick(struct pktgen_dev *pkt_dev)
pkt_dev->curfl = 0; /*reset */
}
} else {
- flow = prandom_u32() % pkt_dev->cflows;
+ flow = get_random_u32_below(pkt_dev->cflows);
pkt_dev->curfl = flow;
if (pkt_dev->flows[flow].count > pkt_dev->lflow) {
@@ -2235,24 +2399,25 @@ static inline int f_pick(struct pktgen_dev *pkt_dev)
}
-#ifdef CONFIG_XFRM
/* If there was already an IPSEC SA, we keep it as is, else
* we go look for it ...
-*/
+ */
#define DUMMY_MARK 0
static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
{
+#ifdef CONFIG_XFRM
struct xfrm_state *x = pkt_dev->flows[flow].x;
struct pktgen_net *pn = net_generic(dev_net(pkt_dev->odev), pg_net_id);
+
if (!x) {
if (pkt_dev->spi) {
/* We need as quick as possible to find the right SA
- * Searching with minimum criteria to archieve this.
+ * Searching with minimum criteria to achieve this.
*/
x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET);
} else {
- /* slow path: we dont already have xfrm_state */
+ /* slow path: we don't already have xfrm_state */
x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0,
(xfrm_address_t *)&pkt_dev->cur_daddr,
(xfrm_address_t *)&pkt_dev->cur_saddr,
@@ -2267,21 +2432,19 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
}
}
-}
#endif
+}
static void set_cur_queue_map(struct pktgen_dev *pkt_dev)
{
-
if (pkt_dev->flags & F_QUEUE_MAP_CPU)
pkt_dev->cur_queue_map = smp_processor_id();
else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) {
__u16 t;
+
if (pkt_dev->flags & F_QUEUE_MAP_RND) {
- t = prandom_u32() %
- (pkt_dev->queue_map_max -
- pkt_dev->queue_map_min + 1)
- + pkt_dev->queue_map_min;
+ t = get_random_u32_inclusive(pkt_dev->queue_map_min,
+ pkt_dev->queue_map_max);
} else {
t = pkt_dev->cur_queue_map + 1;
if (t > pkt_dev->queue_map_max)
@@ -2310,7 +2473,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
__u32 tmp;
if (pkt_dev->flags & F_MACSRC_RND)
- mc = prandom_u32() % pkt_dev->src_mac_count;
+ mc = get_random_u32_below(pkt_dev->src_mac_count);
else {
mc = pkt_dev->cur_src_mac_offset++;
if (pkt_dev->cur_src_mac_offset >=
@@ -2336,7 +2499,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
__u32 tmp;
if (pkt_dev->flags & F_MACDST_RND)
- mc = prandom_u32() % pkt_dev->dst_mac_count;
+ mc = get_random_u32_below(pkt_dev->dst_mac_count);
else {
mc = pkt_dev->cur_dst_mac_offset++;
@@ -2360,26 +2523,26 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->flags & F_MPLS_RND) {
unsigned int i;
+
for (i = 0; i < pkt_dev->nr_labels; i++)
if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM)
pkt_dev->labels[i] = MPLS_STACK_BOTTOM |
- ((__force __be32)prandom_u32() &
+ ((__force __be32)get_random_u32() &
htonl(0x000fffff));
}
if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) {
- pkt_dev->vlan_id = prandom_u32() & (4096 - 1);
+ pkt_dev->vlan_id = get_random_u32_below(4096);
}
if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) {
- pkt_dev->svlan_id = prandom_u32() & (4096 - 1);
+ pkt_dev->svlan_id = get_random_u32_below(4096);
}
if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) {
if (pkt_dev->flags & F_UDPSRC_RND)
- pkt_dev->cur_udp_src = prandom_u32() %
- (pkt_dev->udp_src_max - pkt_dev->udp_src_min)
- + pkt_dev->udp_src_min;
+ pkt_dev->cur_udp_src = get_random_u32_inclusive(pkt_dev->udp_src_min,
+ pkt_dev->udp_src_max - 1);
else {
pkt_dev->cur_udp_src++;
@@ -2390,9 +2553,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) {
if (pkt_dev->flags & F_UDPDST_RND) {
- pkt_dev->cur_udp_dst = prandom_u32() %
- (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)
- + pkt_dev->udp_dst_min;
+ pkt_dev->cur_udp_dst = get_random_u32_inclusive(pkt_dev->udp_dst_min,
+ pkt_dev->udp_dst_max - 1);
} else {
pkt_dev->cur_udp_dst++;
if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max)
@@ -2406,8 +2568,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
imx = ntohl(pkt_dev->saddr_max);
if (imn < imx) {
__u32 t;
+
if (pkt_dev->flags & F_IPSRC_RND)
- t = prandom_u32() % (imx - imn) + imn;
+ t = get_random_u32_inclusive(imn, imx - 1);
else {
t = ntohl(pkt_dev->cur_saddr);
t++;
@@ -2426,11 +2589,11 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (imn < imx) {
__u32 t;
__be32 s;
+
if (pkt_dev->flags & F_IPDST_RND) {
do {
- t = prandom_u32() %
- (imx - imn) + imn;
+ t = get_random_u32_inclusive(imn, imx - 1);
s = htonl(t);
} while (ipv4_is_loopback(s) ||
ipv4_is_multicast(s) ||
@@ -2451,10 +2614,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
pkt_dev->flows[flow].flags |= F_INIT;
pkt_dev->flows[flow].cur_daddr =
pkt_dev->cur_daddr;
-#ifdef CONFIG_XFRM
if (pkt_dev->flags & F_IPSEC)
get_ipsec_sa(pkt_dev, flow);
-#endif
pkt_dev->nflows++;
}
}
@@ -2467,7 +2628,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
for (i = 0; i < 4; i++) {
pkt_dev->cur_in6_daddr.s6_addr32[i] =
- (((__force __be32)prandom_u32() |
+ (((__force __be32)get_random_u32() |
pkt_dev->min_in6_daddr.s6_addr32[i]) &
pkt_dev->max_in6_daddr.s6_addr32[i]);
}
@@ -2476,16 +2637,24 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) {
__u32 t;
+
if (pkt_dev->flags & F_TXSIZE_RND) {
- t = prandom_u32() %
- (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size)
- + pkt_dev->min_pkt_size;
+ t = get_random_u32_inclusive(pkt_dev->min_pkt_size,
+ pkt_dev->max_pkt_size - 1);
} else {
t = pkt_dev->cur_pkt_size + 1;
if (t > pkt_dev->max_pkt_size)
t = pkt_dev->min_pkt_size;
}
pkt_dev->cur_pkt_size = t;
+ } else if (pkt_dev->n_imix_entries > 0) {
+ struct imix_pkt *entry;
+ __u32 t = get_random_u32_below(IMIX_PRECISION);
+ __u8 entry_index = pkt_dev->imix_distribution[t];
+
+ entry = &pkt_dev->imix_entries[entry_index];
+ entry->count_so_far++;
+ pkt_dev->cur_pkt_size = entry->size;
}
set_cur_queue_map(pkt_dev);
@@ -2493,6 +2662,32 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev)
pkt_dev->flows[flow].count++;
}
+/* Build the lookup table used to pick an imix entry at random.
+ *
+ * Each of the IMIX_PRECISION slots of pkt_dev->imix_distribution is set to
+ * the index of an entry in pkt_dev->imix_entries, so that every entry owns
+ * a share of slots proportional to its weight. Shares are rounded down and
+ * the last entry takes whatever slots remain.
+ */
+static void fill_imix_distribution(struct pktgen_dev *pkt_dev)
+{
+	int cumulative_probabilities[MAX_IMIX_ENTRIES];
+	__u64 cumulative_prob = 0;
+	__u64 total_weight = 0;
+	int i, j = 0;
+
+	for (i = 0; i < pkt_dev->n_imix_entries; i++)
+		total_weight += pkt_dev->imix_entries[i].weight;
+
+	/* No entries, or only zero weights: there is nothing to normalize
+	 * against, and the division below would be a division by zero.
+	 */
+	if (!total_weight)
+		return;
+
+	/* Fill cumulative_probabilities with sum of normalized probabilities */
+	for (i = 0; i < pkt_dev->n_imix_entries - 1; i++) {
+		cumulative_prob += div64_u64(pkt_dev->imix_entries[i].weight *
+						     IMIX_PRECISION,
+					     total_weight);
+		cumulative_probabilities[i] = cumulative_prob;
+	}
+	/* The last threshold is the table size itself, so it is never reached
+	 * by the slot index below and the last entry absorbs rounding losses.
+	 */
+	cumulative_probabilities[pkt_dev->n_imix_entries - 1] = IMIX_PRECISION;
+
+	for (i = 0; i < IMIX_PRECISION; i++) {
+		/* An entry whose share rounded down to zero slots has the same
+		 * threshold as its predecessor; step over every such entry,
+		 * not just one, so later entries keep their own slots.
+		 */
+		while (i == cumulative_probabilities[j])
+			j++;
+		pkt_dev->imix_distribution[i] = j;
+	}
+}
#ifdef CONFIG_XFRM
static u32 pktgen_dst_metrics[RTAX_MAX + 1] = {
@@ -2509,7 +2704,8 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
if (!x)
return 0;
/* XXX: we dont support tunnel mode for now until
- * we resolve the dst issue */
+ * we resolve the dst issue
+ */
if ((x->props.mode != XFRM_MODE_TRANSPORT) && (pkt_dev->spi == 0))
return 0;
@@ -2520,7 +2716,7 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev)
skb->_skb_refdst = (unsigned long)&pkt_dev->xdst.u.dst | SKB_DST_NOREF;
rcu_read_lock_bh();
- err = x->outer_mode->output(x, skb);
+ err = pktgen_xfrm_outer_mode_output(x, skb);
rcu_read_unlock_bh();
if (err) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR);
@@ -2544,8 +2740,10 @@ static void free_SAs(struct pktgen_dev *pkt_dev)
if (pkt_dev->cflows) {
/* let go of the SAs if we have them */
int i;
+
for (i = 0; i < pkt_dev->cflows; i++) {
struct xfrm_state *x = pkt_dev->flows[i].x;
+
if (x) {
xfrm_state_put(x);
pkt_dev->flows[i].x = NULL;
@@ -2560,6 +2758,7 @@ static int process_ipsec(struct pktgen_dev *pkt_dev,
if (pkt_dev->flags & F_IPSEC) {
struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x;
int nhead = 0;
+
if (x) {
struct ethhdr *eth;
struct iphdr *iph;
@@ -2603,6 +2802,7 @@ err:
static void mpls_push(__be32 *mpls, struct pktgen_dev *pkt_dev)
{
unsigned int i;
+
for (i = 0; i < pkt_dev->nr_labels; i++)
*mpls++ = pkt_dev->labels[i] & ~MPLS_STACK_BOTTOM;
@@ -2642,8 +2842,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
}
i = 0;
- frag_len = (datalen/frags) < PAGE_SIZE ?
- (datalen/frags) : PAGE_SIZE;
+ frag_len = min_t(int, datalen / frags, PAGE_SIZE);
while (datalen > 0) {
if (unlikely(!pkt_dev->page)) {
int node = numa_node_id();
@@ -2655,14 +2854,16 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
break;
}
get_page(pkt_dev->page);
- skb_frag_set_page(skb, i, pkt_dev->page);
- skb_shinfo(skb)->frags[i].page_offset = 0;
+
/*last fragment, fill rest of data*/
if (i == (frags - 1))
- skb_frag_size_set(&skb_shinfo(skb)->frags[i],
- (datalen < PAGE_SIZE ? datalen : PAGE_SIZE));
+ skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i],
+ pkt_dev->page, 0,
+ min(datalen, PAGE_SIZE));
else
- skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len);
+ skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i],
+ pkt_dev->page, 0, frag_len);
+
datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]);
skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
@@ -2712,7 +2913,7 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
skb->dev = dev;
}
} else {
- skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
+ skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
}
/* the caller pre-fetches from skb->data and reserves for the mac hdr */
@@ -2793,7 +2994,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
skb->priority = pkt_dev->skb_priority;
memcpy(eth, pkt_dev->hh, 12);
- *(__be16 *) & eth[12] = protocol;
+ *(__be16 *)&eth[12] = protocol;
/* Eth + IPh + UDPh + mpls */
datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8 -
@@ -3022,31 +3223,36 @@ static void pktgen_run(struct pktgen_thread *t)
set_pkt_overhead(pkt_dev);
- strcpy(pkt_dev->result, "Starting");
+ strscpy(pkt_dev->result, "Starting");
pkt_dev->running = 1; /* Cranke yeself! */
started++;
} else
- strcpy(pkt_dev->result, "Error starting");
+ strscpy(pkt_dev->result, "Error starting");
}
rcu_read_unlock();
if (started)
t->control &= ~(T_STOP);
}
-static void pktgen_stop_all_threads_ifs(struct pktgen_net *pn)
+static void pktgen_handle_all_threads(struct pktgen_net *pn, u32 flags)
{
struct pktgen_thread *t;
- func_enter();
-
mutex_lock(&pktgen_thread_lock);
list_for_each_entry(t, &pn->pktgen_threads, th_list)
- t->control |= T_STOP;
+ t->control |= (flags);
mutex_unlock(&pktgen_thread_lock);
}
+/* Request a stop on every pktgen thread of @pn: calls func_enter(), then
+ * passes T_STOP to pktgen_handle_all_threads(), which ORs the flag into each
+ * thread's control word while holding pktgen_thread_lock.
+ */
+static void pktgen_stop_all_threads(struct pktgen_net *pn)
+{
+ func_enter();
+
+ pktgen_handle_all_threads(pn, T_STOP);
+}
+
static int thread_is_running(const struct pktgen_thread *t)
{
const struct pktgen_dev *pkt_dev;
@@ -3065,7 +3271,13 @@ static int pktgen_wait_thread_run(struct pktgen_thread *t)
{
while (thread_is_running(t)) {
+ /* note: 't' will still be around even after the unlock/lock
+ * cycle because pktgen_thread threads are only cleared at
+ * net exit
+ */
+ mutex_unlock(&pktgen_thread_lock);
msleep_interruptible(100);
+ mutex_lock(&pktgen_thread_lock);
if (signal_pending(current))
goto signal;
@@ -3080,6 +3292,10 @@ static int pktgen_wait_all_threads_run(struct pktgen_net *pn)
struct pktgen_thread *t;
int sig = 1;
+ /* prevent from racing with rmmod */
+ if (!try_module_get(THIS_MODULE))
+ return sig;
+
mutex_lock(&pktgen_thread_lock);
list_for_each_entry(t, &pn->pktgen_threads, th_list) {
@@ -3093,21 +3309,15 @@ static int pktgen_wait_all_threads_run(struct pktgen_net *pn)
t->control |= (T_STOP);
mutex_unlock(&pktgen_thread_lock);
+ module_put(THIS_MODULE);
return sig;
}
static void pktgen_run_all_threads(struct pktgen_net *pn)
{
- struct pktgen_thread *t;
-
func_enter();
- mutex_lock(&pktgen_thread_lock);
-
- list_for_each_entry(t, &pn->pktgen_threads, th_list)
- t->control |= (T_RUN);
-
- mutex_unlock(&pktgen_thread_lock);
+ pktgen_handle_all_threads(pn, T_RUN);
/* Propagate thread->control */
schedule_timeout_interruptible(msecs_to_jiffies(125));
@@ -3117,16 +3327,9 @@ static void pktgen_run_all_threads(struct pktgen_net *pn)
static void pktgen_reset_all_threads(struct pktgen_net *pn)
{
- struct pktgen_thread *t;
-
func_enter();
- mutex_lock(&pktgen_thread_lock);
-
- list_for_each_entry(t, &pn->pktgen_threads, th_list)
- t->control |= (T_REMDEVALL);
-
- mutex_unlock(&pktgen_thread_lock);
+ pktgen_handle_all_threads(pn, T_REMDEVALL);
/* Propagate thread->control */
schedule_timeout_interruptible(msecs_to_jiffies(125));
@@ -3152,7 +3355,19 @@ static void show_results(struct pktgen_dev *pkt_dev, int nr_frags)
pps = div64_u64(pkt_dev->sofar * NSEC_PER_SEC,
ktime_to_ns(elapsed));
- bps = pps * 8 * pkt_dev->cur_pkt_size;
+ if (pkt_dev->n_imix_entries > 0) {
+ int i;
+ struct imix_pkt *entry;
+
+ bps = 0;
+ for (i = 0; i < pkt_dev->n_imix_entries; i++) {
+ entry = &pkt_dev->imix_entries[i];
+ bps += entry->size * entry->count_so_far;
+ }
+ bps = div64_u64(bps * 8 * NSEC_PER_SEC, ktime_to_ns(elapsed));
+ } else {
+ bps = pps * 8 * pkt_dev->cur_pkt_size;
+ }
mbps = bps;
do_div(mbps, 1000000);
@@ -3271,6 +3486,7 @@ static void pktgen_rem_thread(struct pktgen_thread *t)
static void pktgen_resched(struct pktgen_dev *pkt_dev)
{
ktime_t idle_start = ktime_get();
+
schedule();
pkt_dev->idle_acc += ktime_to_ns(ktime_sub(ktime_get(), idle_start));
}
@@ -3293,12 +3509,24 @@ static void pktgen_wait_for_skb(struct pktgen_dev *pkt_dev)
static void pktgen_xmit(struct pktgen_dev *pkt_dev)
{
- unsigned int burst = READ_ONCE(pkt_dev->burst);
+ bool skb_shared = !!(READ_ONCE(pkt_dev->flags) & F_SHARED);
struct net_device *odev = pkt_dev->odev;
struct netdev_queue *txq;
+ unsigned int burst = 1;
struct sk_buff *skb;
+ int clone_skb = 0;
int ret;
+ /* If 'skb_shared' is false, the read of possible
+ * new values (if any) for 'burst' and 'clone_skb' will be skipped to
+ * prevent some concurrent changes from slipping in. And the stabilized
+ * config will be read in during the next run of pktgen_xmit.
+ */
+ if (skb_shared) {
+ burst = READ_ONCE(pkt_dev->burst);
+ clone_skb = READ_ONCE(pkt_dev->clone_skb);
+ }
+
/* If device is offline, then don't send */
if (unlikely(!netif_running(odev) || !netif_carrier_ok(odev))) {
pktgen_stop_device(pkt_dev);
@@ -3315,7 +3543,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
/* If no skb or clone count exhausted then get new one */
if (!pkt_dev->skb || (pkt_dev->last_ok &&
- ++pkt_dev->clone_count >= pkt_dev->clone_skb)) {
+ ++pkt_dev->clone_count >= clone_skb)) {
/* build a new pkt */
kfree_skb(pkt_dev->skb);
@@ -3336,7 +3564,8 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
if (pkt_dev->xmit_mode == M_NETIF_RECEIVE) {
skb = pkt_dev->skb;
skb->protocol = eth_type_trans(skb, skb->dev);
- refcount_add(burst, &skb->users);
+ if (skb_shared)
+ refcount_add(burst, &skb->users);
local_bh_disable();
do {
ret = netif_receive_skb(skb);
@@ -3344,6 +3573,10 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
pkt_dev->errors++;
pkt_dev->sofar++;
pkt_dev->seq_num++;
+ if (unlikely(!skb_shared)) {
+ pkt_dev->skb = NULL;
+ break;
+ }
if (refcount_read(&skb->users) != burst) {
/* skb was queued by rps/rfs or taps,
* so cannot reuse this skb
@@ -3357,14 +3590,19 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
/* skb was 'freed' by stack, so clean few
* bits and reuse it
*/
- skb_reset_tc(skb);
+ skb_reset_redirect(skb);
} while (--burst > 0);
goto out; /* Skips xmit_mode M_START_XMIT */
} else if (pkt_dev->xmit_mode == M_QUEUE_XMIT) {
local_bh_disable();
- refcount_inc(&pkt_dev->skb->users);
+ if (skb_shared)
+ refcount_inc(&pkt_dev->skb->users);
ret = dev_queue_xmit(pkt_dev->skb);
+
+ if (!skb_shared && dev_xmit_complete(ret))
+ pkt_dev->skb = NULL;
+
switch (ret) {
case NET_XMIT_SUCCESS:
pkt_dev->sofar++;
@@ -3399,15 +3637,18 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
HARD_TX_LOCK(odev, txq, smp_processor_id());
if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) {
- ret = NETDEV_TX_BUSY;
pkt_dev->last_ok = 0;
goto unlock;
}
- refcount_add(burst, &pkt_dev->skb->users);
+ if (skb_shared)
+ refcount_add(burst, &pkt_dev->skb->users);
xmit_more:
ret = netdev_start_xmit(pkt_dev->skb, odev, txq, --burst > 0);
+ if (!skb_shared && dev_xmit_complete(ret))
+ pkt_dev->skb = NULL;
+
switch (ret) {
case NETDEV_TX_OK:
pkt_dev->last_ok = 1;
@@ -3426,10 +3667,11 @@ xmit_more:
net_info_ratelimited("%s xmit error: %d\n",
pkt_dev->odevname, ret);
pkt_dev->errors++;
- /* fallthru */
+ fallthrough;
case NETDEV_TX_BUSY:
/* Retry it next time */
- refcount_dec(&(pkt_dev->skb->users));
+ if (skb_shared)
+ refcount_dec(&pkt_dev->skb->users);
pkt_dev->last_ok = 0;
}
if (unlikely(burst))
@@ -3442,7 +3684,8 @@ out:
/* If pkt_dev->count is zero, then run forever */
if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) {
- pktgen_wait_for_skb(pkt_dev);
+ if (pkt_dev->skb)
+ pktgen_wait_for_skb(pkt_dev);
/* Done with this */
pktgen_stop_device(pkt_dev);
@@ -3455,12 +3698,11 @@ out:
static int pktgen_thread_worker(void *arg)
{
- DEFINE_WAIT(wait);
struct pktgen_thread *t = arg;
struct pktgen_dev *pkt_dev = NULL;
int cpu = t->cpu;
- BUG_ON(smp_processor_id() != cpu);
+ WARN_ON_ONCE(smp_processor_id() != cpu);
init_waitqueue_head(&t->queue);
complete(&t->start_done);
@@ -3475,10 +3717,8 @@ static int pktgen_thread_worker(void *arg)
if (unlikely(!pkt_dev && t->control == 0)) {
if (t->net->pktgen_exiting)
break;
- wait_event_interruptible_timeout(t->queue,
- t->control != 0,
- HZ/10);
- try_to_freeze();
+ wait_event_freezable_timeout(t->queue,
+ t->control != 0, HZ / 10);
continue;
}
@@ -3562,7 +3802,8 @@ static int add_dev_to_thread(struct pktgen_thread *t,
* userspace on another CPU than the kthread. The if_lock()
* is used here to sync with concurrent instances of
* _rem_dev_from_if_list() invoked via kthread, which is also
- * updating the if_list */
+ * updating the if_list
+ */
if_lock(t);
if (pkt_dev->pg_thread) {
@@ -3600,7 +3841,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
if (!pkt_dev)
return -ENOMEM;
- strcpy(pkt_dev->odevname, ifname);
+ strscpy(pkt_dev->odevname, ifname);
pkt_dev->flows = vzalloc_node(array_size(MAX_CFLOWS,
sizeof(struct flow_state)),
node);
@@ -3625,7 +3866,8 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
pkt_dev->svlan_cfi = 0;
pkt_dev->svlan_id = 0xffff;
pkt_dev->burst = 1;
- pkt_dev->node = -1;
+ pkt_dev->node = NUMA_NO_NODE;
+ pkt_dev->flags = F_SHARED; /* SKB shared by default */
err = pktgen_setup_dev(t->net, pkt_dev, ifname);
if (err)
@@ -3634,7 +3876,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
pkt_dev->clone_skb = pg_clone_skb_d;
pkt_dev->entry = proc_create_data(ifname, 0600, t->net->proc_dir,
- &pktgen_if_fops, pkt_dev);
+ &pktgen_if_proc_ops, pkt_dev);
if (!pkt_dev->entry) {
pr_err("cannot create %s/%s procfs entry\n",
PG_PROC_DIR, ifname);
@@ -3645,8 +3887,8 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
pkt_dev->ipsmode = XFRM_MODE_TRANSPORT;
pkt_dev->ipsproto = IPPROTO_ESP;
- /* xfrm tunnel mode needs additional dst to extract outter
- * ip header protocol/ttl/id field, here creat a phony one.
+ /* xfrm tunnel mode needs additional dst to extract outer
+ * ip header protocol/ttl/id field, here create a phony one.
* instead of looking for a valid rt, which definitely hurting
* performance under such circumstance.
*/
@@ -3659,7 +3901,7 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname)
return add_dev_to_thread(t, pkt_dev);
out2:
- dev_put(pkt_dev->odev);
+ netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker);
out1:
#ifdef CONFIG_XFRM
free_SAs(pkt_dev);
@@ -3690,21 +3932,18 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
list_add_tail(&t->th_list, &pn->pktgen_threads);
init_completion(&t->start_done);
- p = kthread_create_on_node(pktgen_thread_worker,
- t,
- cpu_to_node(cpu),
- "kpktgend_%d", cpu);
+ p = kthread_create_on_cpu(pktgen_thread_worker, t, cpu, "kpktgend_%d");
if (IS_ERR(p)) {
- pr_err("kernel_thread() failed for cpu %d\n", t->cpu);
+ pr_err("kthread_create_on_cpu() failed for cpu %d\n", t->cpu);
list_del(&t->th_list);
kfree(t);
return PTR_ERR(p);
}
- kthread_bind(p, cpu);
+
t->tsk = p;
pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir,
- &pktgen_thread_fops, t);
+ &pktgen_thread_proc_ops, t);
if (!pe) {
pr_err("cannot create %s/%s procfs entry\n",
PG_PROC_DIR, t->tsk->comm);
@@ -3753,13 +3992,14 @@ static int pktgen_remove_device(struct pktgen_thread *t,
/* Dis-associate from the interface */
if (pkt_dev->odev) {
- dev_put(pkt_dev->odev);
+ netdev_put(pkt_dev->odev, &pkt_dev->dev_tracker);
pkt_dev->odev = NULL;
}
/* Remove proc before if_list entry, because add_device uses
* list to determine if interface already exist, avoid race
- * with proc_create_data() */
+ * with proc_create_data()
+ */
proc_remove(pkt_dev->entry);
/* And update the thread if_list */
@@ -3789,13 +4029,14 @@ static int __net_init pg_net_init(struct net *net)
pr_warn("cannot create /proc/net/%s\n", PG_PROC_DIR);
return -ENODEV;
}
- pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_fops);
+ pe = proc_create(PGCTRL, 0600, pn->proc_dir, &pktgen_proc_ops);
if (pe == NULL) {
pr_err("cannot create %s procfs entry\n", PGCTRL);
ret = -EINVAL;
goto remove;
}
+ cpus_read_lock();
for_each_online_cpu(cpu) {
int err;
@@ -3804,6 +4045,7 @@ static int __net_init pg_net_init(struct net *net)
pr_warn("Cannot create thread for cpu %d (%d)\n",
cpu, err);
}
+ cpus_read_unlock();
if (list_empty(&pn->pktgen_threads)) {
pr_err("Initialization failed for all threads\n");
@@ -3837,8 +4079,7 @@ static void __net_exit pg_net_exit(struct net *net)
list_for_each_safe(q, n, &list) {
t = list_entry(q, struct pktgen_thread, th_list);
list_del(&t->th_list);
- kthread_stop(t->tsk);
- put_task_struct(t->tsk);
+ kthread_stop_put(t->tsk);
kfree(t);
}
diff --git a/net/core/ptp_classifier.c b/net/core/ptp_classifier.c
index 703cf76aa7c2..598041b0499e 100644
--- a/net/core/ptp_classifier.c
+++ b/net/core/ptp_classifier.c
@@ -1,13 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* PTP classifier
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
/* The below program is the bpf_asm (tools/net/) representation of
@@ -111,10 +103,52 @@ static struct bpf_prog *ptp_insns __read_mostly;
unsigned int ptp_classify_raw(const struct sk_buff *skb)
{
- return BPF_PROG_RUN(ptp_insns, skb);
+ return bpf_prog_run(ptp_insns, skb);
}
EXPORT_SYMBOL_GPL(ptp_classify_raw);
+/**
+ * ptp_parse_header - locate the PTP header inside a packet
+ * @skb: packet to inspect; the walk starts at skb_mac_header(skb)
+ * @type: classification bits; only PTP_CLASS_VLAN and the PTP_CLASS_PMASK
+ *        field are consulted here
+ *
+ * Return: pointer to the PTP header, or NULL when the class selected by
+ * PTP_CLASS_PMASK is not IPv4, IPv6 or L2, or when a full struct ptp_header
+ * would extend past skb->data + skb->len.
+ */
+struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type)
+{
+ u8 *ptr = skb_mac_header(skb);
+
+ /* Skip the VLAN tag when the classifier reported one. */
+ if (type & PTP_CLASS_VLAN)
+ ptr += VLAN_HLEN;
+
+ /* Skip the network and UDP headers for the IP transports; L2 has none. */
+ switch (type & PTP_CLASS_PMASK) {
+ case PTP_CLASS_IPV4:
+ /* NOTE(review): IPV4_HLEN() is applied before ETH_HLEN is added
+ * below; presumably the macro already offsets past the Ethernet
+ * header - confirm against its definition.
+ */
+ ptr += IPV4_HLEN(ptr) + UDP_HLEN;
+ break;
+ case PTP_CLASS_IPV6:
+ ptr += IP6_HLEN + UDP_HLEN;
+ break;
+ case PTP_CLASS_L2:
+ break;
+ default:
+ return NULL;
+ }
+
+ /* Finally skip the Ethernet header that ptr started at. */
+ ptr += ETH_HLEN;
+
+ /* Ensure that the entire header is present in this packet. */
+ if (ptr + sizeof(struct ptp_header) > skb->data + skb->len)
+ return NULL;
+
+ return (struct ptp_header *)ptr;
+}
+EXPORT_SYMBOL_GPL(ptp_parse_header);
+
+/**
+ * ptp_msg_is_sync - test whether a packet carries a PTP Sync message
+ * @skb: packet to inspect
+ * @type: classification bits, forwarded to ptp_parse_header() and
+ *        ptp_get_msgtype()
+ *
+ * Return: true when ptp_parse_header() finds a header and its message type
+ * equals PTP_MSGTYPE_SYNC; false otherwise, including when no header could
+ * be located.
+ */
+bool ptp_msg_is_sync(struct sk_buff *skb, unsigned int type)
+{
+ struct ptp_header *hdr;
+
+ hdr = ptp_parse_header(skb, type);
+ if (!hdr)
+ return false;
+
+ return ptp_get_msgtype(hdr, type) == PTP_MSGTYPE_SYNC;
+}
+EXPORT_SYMBOL_GPL(ptp_msg_is_sync);
+
void __init ptp_classifier_init(void)
{
static struct sock_filter ptp_filter[] __initdata = {
@@ -185,9 +219,10 @@ void __init ptp_classifier_init(void)
{ 0x16, 0, 0, 0x00000000 },
{ 0x06, 0, 0, 0x00000000 },
};
- struct sock_fprog_kern ptp_prog = {
- .len = ARRAY_SIZE(ptp_filter), .filter = ptp_filter,
- };
+ struct sock_fprog_kern ptp_prog;
+
+ ptp_prog.len = ARRAY_SIZE(ptp_filter);
+ ptp_prog.filter = ptp_filter;
BUG_ON(bpf_prog_create(&ptp_insns, &ptp_prog));
}
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 9b8727c67b58..897a8f01a67b 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NET Generic infrastructure for Network protocols.
*
* Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
*
* From code originally in include/net/tcp.h
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
@@ -37,9 +33,6 @@
void reqsk_queue_alloc(struct request_sock_queue *queue)
{
- spin_lock_init(&queue->rskq_lock);
-
- spin_lock_init(&queue->fastopenq.lock);
queue->fastopenq.rskq_rst_head = NULL;
queue->fastopenq.rskq_rst_tail = NULL;
queue->fastopenq.qlen = 0;
@@ -84,9 +77,7 @@ void reqsk_queue_alloc(struct request_sock_queue *queue)
* a simple spin lock - one must consider sock_owned_by_user() and arrange
* to use sk_add_backlog() stuff. But what really makes it infeasible is the
* locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
- * acquire a child's lock while holding listener's socket lock. A corner
- * case might also exist in tcp_v4_hnd_req() that will trigger this locking
- * order.
+ * acquire a child's lock while holding listener's socket lock.
*
* This function also sets "treq->tfo_listener" to false.
* treq->tfo_listener is used by the listener so it is protected by the
@@ -100,7 +91,7 @@ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq;
- tcp_sk(sk)->fastopen_rsk = NULL;
+ RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL);
spin_lock_bh(&fastopenq->lock);
fastopenq->qlen--;
tcp_rsk(req)->tfo_listener = false;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 37c7936124e6..b1ed55141d8a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -7,13 +8,8 @@
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Fixes:
- * Vitaly E. Lavrov RTA_OK arithmetics was wrong.
+ * Vitaly E. Lavrov RTA_OK arithmetic was wrong.
*/
#include <linux/bitops.h>
@@ -46,7 +42,6 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
-#include <net/switchdev.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/arp.h>
@@ -58,9 +53,17 @@
#include <net/fib_rules.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
+#include <net/netdev_lock.h>
+#include <net/devlink.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/addrconf.h>
+#endif
+#include <linux/dpll.h>
-#define RTNL_MAX_TYPE 48
-#define RTNL_SLAVE_MAX_TYPE 36
+#include "dev.h"
+
+#define RTNL_MAX_TYPE 50
+#define RTNL_SLAVE_MAX_TYPE 44
struct rtnl_link {
rtnl_doit_func doit;
@@ -78,11 +81,15 @@ void rtnl_lock(void)
}
EXPORT_SYMBOL(rtnl_lock);
+int rtnl_lock_interruptible(void)
+{
+ return mutex_lock_interruptible(&rtnl_mutex);
+}
+
int rtnl_lock_killable(void)
{
return mutex_lock_killable(&rtnl_mutex);
}
-EXPORT_SYMBOL(rtnl_lock_killable);
static struct sk_buff *defer_kfree_skb_list;
void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
@@ -100,6 +107,39 @@ void __rtnl_unlock(void)
defer_kfree_skb_list = NULL;
+ /* Ensure that we didn't actually add any TODO item when __rtnl_unlock()
+ * is used. In some places, e.g. in cfg80211, we have code that will do
+ * something like
+ * rtnl_lock()
+ * wiphy_lock()
+ * ...
+ * rtnl_unlock()
+ *
+ * and because netdev_run_todo() acquires the RTNL for items on the list
+ * we could cause a situation such as this:
+ * Thread 1 Thread 2
+ * rtnl_lock()
+ * unregister_netdevice()
+ * __rtnl_unlock()
+ * rtnl_lock()
+ * wiphy_lock()
+ * rtnl_unlock()
+ * netdev_run_todo()
+ * __rtnl_unlock()
+ *
+ * // list not empty now
+ * // because of thread 2
+ * rtnl_lock()
+ * while (!list_empty(...))
+ * rtnl_lock()
+ * wiphy_lock()
+ * **** DEADLOCK ****
+ *
+ * However, usage of __rtnl_unlock() is rare, and so we can ensure that
+ * it's not used in cases where something is added to the todo list.
+ */
+ WARN_ON(!list_empty(&net_todo_list));
+
mutex_unlock(&rtnl_mutex);
while (head) {
@@ -130,6 +170,12 @@ int rtnl_is_locked(void)
}
EXPORT_SYMBOL(rtnl_is_locked);
+bool refcount_dec_and_rtnl_lock(refcount_t *r)
+{
+ return refcount_dec_and_mutex_lock(r, &rtnl_mutex);
+}
+EXPORT_SYMBOL(refcount_dec_and_rtnl_lock);
+
#ifdef CONFIG_PROVE_LOCKING
bool lockdep_rtnl_is_held(void)
{
@@ -138,7 +184,177 @@ bool lockdep_rtnl_is_held(void)
EXPORT_SYMBOL(lockdep_rtnl_is_held);
#endif /* #ifdef CONFIG_PROVE_LOCKING */
-static struct rtnl_link *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+void __rtnl_net_lock(struct net *net)
+{
+ ASSERT_RTNL();
+
+ mutex_lock(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(__rtnl_net_lock);
+
+void __rtnl_net_unlock(struct net *net)
+{
+ ASSERT_RTNL();
+
+ mutex_unlock(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(__rtnl_net_unlock);
+
+void rtnl_net_lock(struct net *net)
+{
+ rtnl_lock();
+ __rtnl_net_lock(net);
+}
+EXPORT_SYMBOL(rtnl_net_lock);
+
+void rtnl_net_unlock(struct net *net)
+{
+ __rtnl_net_unlock(net);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(rtnl_net_unlock);
+
+int rtnl_net_trylock(struct net *net)
+{
+ int ret = rtnl_trylock();
+
+ if (ret)
+ __rtnl_net_lock(net);
+
+ return ret;
+}
+EXPORT_SYMBOL(rtnl_net_trylock);
+
+int rtnl_net_lock_killable(struct net *net)
+{
+ int ret = rtnl_lock_killable();
+
+ if (!ret)
+ __rtnl_net_lock(net);
+
+ return ret;
+}
+
+static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
+{
+ if (net_eq(net_a, net_b))
+ return 0;
+
+ /* always init_net first */
+ if (net_eq(net_a, &init_net))
+ return -1;
+
+ if (net_eq(net_b, &init_net))
+ return 1;
+
+ /* otherwise lock in ascending order */
+ return net_a < net_b ? -1 : 1;
+}
+
+int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b)
+{
+ const struct net *net_a, *net_b;
+
+ net_a = container_of(a, struct net, rtnl_mutex.dep_map);
+ net_b = container_of(b, struct net, rtnl_mutex.dep_map);
+
+ return rtnl_net_cmp_locks(net_a, net_b);
+}
+
+bool rtnl_net_is_locked(struct net *net)
+{
+ return rtnl_is_locked() && mutex_is_locked(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(rtnl_net_is_locked);
+
+bool lockdep_rtnl_net_is_held(struct net *net)
+{
+ return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(lockdep_rtnl_net_is_held);
+#else
+static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
+{
+ /* No need to swap */
+ return -1;
+}
+#endif
+
+struct rtnl_nets {
+ /* ->newlink() needs to freeze 3 netns at most;
+ * 2 for the new device, 1 for its peer.
+ */
+ struct net *net[3];
+ unsigned char len;
+};
+
+static void rtnl_nets_init(struct rtnl_nets *rtnl_nets)
+{
+ memset(rtnl_nets, 0, sizeof(*rtnl_nets));
+}
+
+static void rtnl_nets_destroy(struct rtnl_nets *rtnl_nets)
+{
+ int i;
+
+ for (i = 0; i < rtnl_nets->len; i++) {
+ put_net(rtnl_nets->net[i]);
+ rtnl_nets->net[i] = NULL;
+ }
+
+ rtnl_nets->len = 0;
+}
+
+/**
+ * rtnl_nets_add - Add netns to be locked before ->newlink().
+ *
+ * @rtnl_nets: rtnl_nets pointer passed to ->get_peer_net().
+ * @net: netns pointer with an extra refcnt held.
+ *
+ * The extra refcnt is released in rtnl_nets_destroy().
+ */
+static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net)
+{
+ int i;
+
+ DEBUG_NET_WARN_ON_ONCE(rtnl_nets->len == ARRAY_SIZE(rtnl_nets->net));
+
+ for (i = 0; i < rtnl_nets->len; i++) {
+ switch (rtnl_net_cmp_locks(rtnl_nets->net[i], net)) {
+ case 0:
+ put_net(net);
+ return;
+ case 1:
+ swap(rtnl_nets->net[i], net);
+ }
+ }
+
+ rtnl_nets->net[i] = net;
+ rtnl_nets->len++;
+}
+
+static void rtnl_nets_lock(struct rtnl_nets *rtnl_nets)
+{
+ int i;
+
+ rtnl_lock();
+
+ for (i = 0; i < rtnl_nets->len; i++)
+ __rtnl_net_lock(rtnl_nets->net[i]);
+}
+
+static void rtnl_nets_unlock(struct rtnl_nets *rtnl_nets)
+{
+ int i;
+
+ for (i = 0; i < rtnl_nets->len; i++)
+ __rtnl_net_unlock(rtnl_nets->net[i]);
+
+ rtnl_unlock();
+}
+
+static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
static inline int rtm_msgindex(int msgtype)
{
@@ -156,7 +372,7 @@ static inline int rtm_msgindex(int msgtype)
static struct rtnl_link *rtnl_get_link(int protocol, int msgtype)
{
- struct rtnl_link **tab;
+ struct rtnl_link __rcu **tab;
if (protocol >= ARRAY_SIZE(rtnl_msg_handlers))
protocol = PF_UNSPEC;
@@ -165,7 +381,7 @@ static struct rtnl_link *rtnl_get_link(int protocol, int msgtype)
if (!tab)
tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]);
- return tab[msgtype];
+ return rcu_dereference_rtnl(tab[msgtype]);
}
static int rtnl_register_internal(struct module *owner,
@@ -182,7 +398,7 @@ static int rtnl_register_internal(struct module *owner,
msgindex = rtm_msgindex(msgtype);
rtnl_lock();
- tab = rtnl_msg_handlers[protocol];
+ tab = rtnl_dereference(rtnl_msg_handlers[protocol]);
if (tab == NULL) {
tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL);
if (!tab)
@@ -213,6 +429,8 @@ static int rtnl_register_internal(struct module *owner,
if (dumpit)
link->dumpit = dumpit;
+ WARN_ON(rtnl_msgtype_kind(msgtype) != RTNL_KIND_DEL &&
+ (flags & RTNL_FLAG_BULK_DEL_SUPPORTED));
link->flags |= flags;
/* publish protocol:msgtype */
@@ -226,66 +444,16 @@ unlock:
}
/**
- * rtnl_register_module - Register a rtnetlink message type
- *
- * @owner: module registering the hook (THIS_MODULE)
- * @protocol: Protocol family or PF_UNSPEC
- * @msgtype: rtnetlink message type
- * @doit: Function pointer called for each request message
- * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
- * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
- *
- * Like rtnl_register, but for use by removable modules.
- */
-int rtnl_register_module(struct module *owner,
- int protocol, int msgtype,
- rtnl_doit_func doit, rtnl_dumpit_func dumpit,
- unsigned int flags)
-{
- return rtnl_register_internal(owner, protocol, msgtype,
- doit, dumpit, flags);
-}
-EXPORT_SYMBOL_GPL(rtnl_register_module);
-
-/**
- * rtnl_register - Register a rtnetlink message type
- * @protocol: Protocol family or PF_UNSPEC
- * @msgtype: rtnetlink message type
- * @doit: Function pointer called for each request message
- * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message
- * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions
- *
- * Registers the specified function pointers (at least one of them has
- * to be non-NULL) to be called whenever a request message for the
- * specified protocol family and message type is received.
- *
- * The special protocol family PF_UNSPEC may be used to define fallback
- * function pointers for the case when no entry for the specific protocol
- * family exists.
- */
-void rtnl_register(int protocol, int msgtype,
- rtnl_doit_func doit, rtnl_dumpit_func dumpit,
- unsigned int flags)
-{
- int err;
-
- err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit,
- flags);
- if (err)
- pr_err("Unable to register rtnetlink message handler, "
- "protocol = %d, message type = %d\n", protocol, msgtype);
-}
-
-/**
* rtnl_unregister - Unregister a rtnetlink message type
* @protocol: Protocol family or PF_UNSPEC
* @msgtype: rtnetlink message type
*
* Returns 0 on success or a negative error code.
*/
-int rtnl_unregister(int protocol, int msgtype)
+static int rtnl_unregister(int protocol, int msgtype)
{
- struct rtnl_link **tab, *link;
+ struct rtnl_link __rcu **tab;
+ struct rtnl_link *link;
int msgindex;
BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
@@ -298,43 +466,37 @@ int rtnl_unregister(int protocol, int msgtype)
return -ENOENT;
}
- link = tab[msgindex];
- rcu_assign_pointer(tab[msgindex], NULL);
+ link = rcu_replace_pointer_rtnl(tab[msgindex], NULL);
rtnl_unlock();
kfree_rcu(link, rcu);
return 0;
}
-EXPORT_SYMBOL_GPL(rtnl_unregister);
/**
* rtnl_unregister_all - Unregister all rtnetlink message type of a protocol
* @protocol : Protocol family or PF_UNSPEC
*
- * Identical to calling rtnl_unregster() for all registered message types
+ * Identical to calling rtnl_unregister() for all registered message types
* of a certain protocol family.
*/
void rtnl_unregister_all(int protocol)
{
- struct rtnl_link **tab, *link;
+ struct rtnl_link __rcu **tab;
+ struct rtnl_link *link;
int msgindex;
BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX);
rtnl_lock();
- tab = rtnl_msg_handlers[protocol];
+ tab = rcu_replace_pointer_rtnl(rtnl_msg_handlers[protocol], NULL);
if (!tab) {
rtnl_unlock();
return;
}
- RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL);
for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) {
- link = tab[msgindex];
- if (!link)
- continue;
-
- rcu_assign_pointer(tab[msgindex], NULL);
+ link = rcu_replace_pointer_rtnl(tab[msgindex], NULL);
kfree_rcu(link, rcu);
}
rtnl_unlock();
@@ -345,46 +507,86 @@ void rtnl_unregister_all(int protocol)
}
EXPORT_SYMBOL_GPL(rtnl_unregister_all);
-static LIST_HEAD(link_ops);
-
-static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
+/**
+ * __rtnl_register_many - Register rtnetlink message types
+ * @handlers: Array of struct rtnl_msg_handlers
+ * @n: The length of @handlers
+ *
+ * Registers the specified function pointers (at least one of them has
+ * to be non-NULL) to be called whenever a request message for the
+ * specified protocol family and message type is received.
+ *
+ * The special protocol family PF_UNSPEC may be used to define fallback
+ * function pointers for the case when no entry for the specific protocol
+ * family exists.
+ *
+ * When one element of @handlers fails to register,
+ * 1) built-in: panics.
+ * 2) modules : the previous successful registrations are unwound
+ * and an error is returned.
+ *
+ * Use rtnl_register_many().
+ */
+int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n)
{
- const struct rtnl_link_ops *ops;
+ const struct rtnl_msg_handler *handler;
+ int i, err;
+
+ for (i = 0, handler = handlers; i < n; i++, handler++) {
+ err = rtnl_register_internal(handler->owner, handler->protocol,
+ handler->msgtype, handler->doit,
+ handler->dumpit, handler->flags);
+ if (err) {
+ if (!handler->owner)
+ panic("Unable to register rtnetlink message "
+ "handlers, %pS\n", handlers);
- list_for_each_entry(ops, &link_ops, list) {
- if (!strcmp(ops->kind, kind))
- return ops;
+ __rtnl_unregister_many(handlers, i);
+ break;
+ }
}
- return NULL;
+
+ return err;
}
+EXPORT_SYMBOL_GPL(__rtnl_register_many);
-/**
- * __rtnl_link_register - Register rtnl_link_ops with rtnetlink.
- * @ops: struct rtnl_link_ops * to register
- *
- * The caller must hold the rtnl_mutex. This function should be used
- * by drivers that create devices during module initialization. It
- * must be called before registering the devices.
- *
- * Returns 0 on success or a negative error code.
- */
-int __rtnl_link_register(struct rtnl_link_ops *ops)
+void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n)
{
- if (rtnl_link_ops_get(ops->kind))
- return -EEXIST;
+ const struct rtnl_msg_handler *handler;
+ int i;
- /* The check for setup is here because if ops
- * does not have that filled up, it is not possible
- * to use the ops for creating device. So do not
- * fill up dellink as well. That disables rtnl_dellink.
- */
- if (ops->setup && !ops->dellink)
- ops->dellink = unregister_netdevice_queue;
+ for (i = n - 1, handler = handlers + n - 1; i >= 0; i--, handler--)
+ rtnl_unregister(handler->protocol, handler->msgtype);
+}
+EXPORT_SYMBOL_GPL(__rtnl_unregister_many);
- list_add_tail(&ops->list, &link_ops);
- return 0;
+static DEFINE_MUTEX(link_ops_mutex);
+static LIST_HEAD(link_ops);
+
+static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index)
+{
+ struct rtnl_link_ops *ops;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(ops, &link_ops, list) {
+ if (!strcmp(ops->kind, kind)) {
+ *srcu_index = srcu_read_lock(&ops->srcu);
+ goto unlock;
+ }
+ }
+
+ ops = NULL;
+unlock:
+ rcu_read_unlock();
+
+ return ops;
+}
+
+static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index)
+{
+ srcu_read_unlock(&ops->srcu, srcu_index);
}
-EXPORT_SYMBOL_GPL(__rtnl_link_register);
/**
* rtnl_link_register - Register rtnl_link_ops with rtnetlink.
@@ -394,6 +596,7 @@ EXPORT_SYMBOL_GPL(__rtnl_link_register);
*/
int rtnl_link_register(struct rtnl_link_ops *ops)
{
+ struct rtnl_link_ops *tmp;
int err;
/* Sanity-check max sizes to avoid stack buffer overflow. */
@@ -401,9 +604,31 @@ int rtnl_link_register(struct rtnl_link_ops *ops)
ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE))
return -EINVAL;
- rtnl_lock();
- err = __rtnl_link_register(ops);
- rtnl_unlock();
+ /* The check for alloc/setup is here because if ops
+ * does not have them filled in, it is not possible
+ * to use the ops for creating a device. So do not
+ * fill in dellink either. That disables rtnl_dellink.
+ */
+ if ((ops->alloc || ops->setup) && !ops->dellink)
+ ops->dellink = unregister_netdevice_queue;
+
+ err = init_srcu_struct(&ops->srcu);
+ if (err)
+ return err;
+
+ mutex_lock(&link_ops_mutex);
+
+ list_for_each_entry(tmp, &link_ops, list) {
+ if (!strcmp(ops->kind, tmp->kind)) {
+ err = -EEXIST;
+ goto unlock;
+ }
+ }
+
+ list_add_tail_rcu(&ops->list, &link_ops);
+unlock:
+ mutex_unlock(&link_ops_mutex);
+
return err;
}
EXPORT_SYMBOL_GPL(rtnl_link_register);
@@ -420,48 +645,20 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
unregister_netdevice_many(&list_kill);
}
-/**
- * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
- * @ops: struct rtnl_link_ops * to unregister
- *
- * The caller must hold the rtnl_mutex and guarantee net_namespace_list
- * integrity (hold pernet_ops_rwsem for writing to close the race
- * with setup_net() and cleanup_net()).
- */
-void __rtnl_link_unregister(struct rtnl_link_ops *ops)
-{
- struct net *net;
-
- for_each_net(net) {
- __rtnl_kill_links(net, ops);
- }
- list_del(&ops->list);
-}
-EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
-
/* Return with the rtnl_lock held when there are no network
* devices unregistering in any network namespace.
*/
static void rtnl_lock_unregistering_all(void)
{
- struct net *net;
- bool unregistering;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
add_wait_queue(&netdev_unregistering_wq, &wait);
for (;;) {
- unregistering = false;
rtnl_lock();
/* We held write locked pernet_ops_rwsem, and parallel
* setup_net() and cleanup_net() are not possible.
*/
- for_each_net(net) {
- if (net->dev_unreg_count > 0) {
- unregistering = true;
- break;
- }
- }
- if (!unregistering)
+ if (!atomic_read(&dev_unreg_count))
break;
__rtnl_unlock();
@@ -476,10 +673,22 @@ static void rtnl_lock_unregistering_all(void)
*/
void rtnl_link_unregister(struct rtnl_link_ops *ops)
{
+ struct net *net;
+
+ mutex_lock(&link_ops_mutex);
+ list_del_rcu(&ops->list);
+ mutex_unlock(&link_ops_mutex);
+
+ synchronize_srcu(&ops->srcu);
+ cleanup_srcu_struct(&ops->srcu);
+
/* Close the race with setup_net() and cleanup_net() */
down_write(&pernet_ops_rwsem);
rtnl_lock_unregistering_all();
- __rtnl_link_unregister(ops);
+
+ for_each_net(net)
+ __rtnl_kill_links(net, ops);
+
rtnl_unlock();
up_write(&pernet_ops_rwsem);
}
@@ -536,29 +745,51 @@ static size_t rtnl_link_get_size(const struct net_device *dev)
static LIST_HEAD(rtnl_af_ops);
-static const struct rtnl_af_ops *rtnl_af_lookup(const int family)
+static struct rtnl_af_ops *rtnl_af_lookup(const int family, int *srcu_index)
{
- const struct rtnl_af_ops *ops;
+ struct rtnl_af_ops *ops;
+
+ ASSERT_RTNL();
+
+ rcu_read_lock();
list_for_each_entry_rcu(ops, &rtnl_af_ops, list) {
- if (ops->family == family)
- return ops;
+ if (ops->family == family) {
+ *srcu_index = srcu_read_lock(&ops->srcu);
+ goto unlock;
+ }
}
- return NULL;
+ ops = NULL;
+unlock:
+ rcu_read_unlock();
+
+ return ops;
+}
+
+static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index)
+{
+ srcu_read_unlock(&ops->srcu, srcu_index);
}
/**
* rtnl_af_register - Register rtnl_af_ops with rtnetlink.
* @ops: struct rtnl_af_ops * to register
*
- * Returns 0 on success or a negative error code.
+ * Return: 0 on success or a negative error code.
*/
-void rtnl_af_register(struct rtnl_af_ops *ops)
+int rtnl_af_register(struct rtnl_af_ops *ops)
{
+ int err = init_srcu_struct(&ops->srcu);
+
+ if (err)
+ return err;
+
rtnl_lock();
list_add_tail_rcu(&ops->list, &rtnl_af_ops);
rtnl_unlock();
+
+ return 0;
}
EXPORT_SYMBOL_GPL(rtnl_af_register);
@@ -573,6 +804,8 @@ void rtnl_af_unregister(struct rtnl_af_ops *ops)
rtnl_unlock();
synchronize_rcu();
+ synchronize_srcu(&ops->srcu);
+ cleanup_srcu_struct(&ops->srcu);
}
EXPORT_SYMBOL_GPL(rtnl_af_unregister);
@@ -629,7 +862,7 @@ static int rtnl_link_slave_info_fill(struct sk_buff *skb,
if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0)
return -EMSGSIZE;
if (ops->fill_slave_info) {
- slave_data = nla_nest_start(skb, IFLA_INFO_SLAVE_DATA);
+ slave_data = nla_nest_start_noflag(skb, IFLA_INFO_SLAVE_DATA);
if (!slave_data)
return -EMSGSIZE;
err = ops->fill_slave_info(skb, master_dev, dev);
@@ -661,7 +894,7 @@ static int rtnl_link_info_fill(struct sk_buff *skb,
return err;
}
if (ops->fill_info) {
- data = nla_nest_start(skb, IFLA_INFO_DATA);
+ data = nla_nest_start_noflag(skb, IFLA_INFO_DATA);
if (data == NULL)
return -EMSGSIZE;
err = ops->fill_info(skb, dev);
@@ -681,7 +914,7 @@ static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev)
struct nlattr *linkinfo;
int err = -EMSGSIZE;
- linkinfo = nla_nest_start(skb, IFLA_LINKINFO);
+ linkinfo = nla_nest_start_noflag(skb, IFLA_LINKINFO);
if (linkinfo == NULL)
goto out;
@@ -705,15 +938,8 @@ out:
int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo)
{
struct sock *rtnl = net->rtnl;
- int err = 0;
- NETLINK_CB(skb).dst_group = group;
- if (echo)
- refcount_inc(&skb->users);
- netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
- if (echo)
- err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
- return err;
+ return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL);
}
int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
@@ -725,15 +951,11 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
EXPORT_SYMBOL(rtnl_unicast);
void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
- struct nlmsghdr *nlh, gfp_t flags)
+ const struct nlmsghdr *nlh, gfp_t flags)
{
struct sock *rtnl = net->rtnl;
- int report = 0;
- if (nlh)
- report = nlmsg_report(nlh);
-
- nlmsg_notify(rtnl, skb, pid, group, report, flags);
+ nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags);
}
EXPORT_SYMBOL(rtnl_notify);
@@ -750,7 +972,11 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
struct nlattr *mx;
int i, valid = 0;
- mx = nla_nest_start(skb, RTA_METRICS);
+ /* nothing is dumped for dst_default_metrics, so just skip the loop */
+ if (metrics == dst_default_metrics.metrics)
+ return 0;
+
+ mx = nla_nest_start_noflag(skb, RTA_METRICS);
if (mx == NULL)
return -ENOBUFS;
@@ -800,11 +1026,13 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
.rta_error = error,
.rta_id = id,
};
+ unsigned long delta;
if (dst) {
- ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
+ delta = jiffies - READ_ONCE(dst->lastuse);
+ ci.rta_lastuse = jiffies_delta_to_clock_t(delta);
ci.rta_used = dst->__use;
- ci.rta_clntref = atomic_read(&dst->__refcnt);
+ ci.rta_clntref = rcuref_read(&dst->__rcuref);
}
if (expires) {
unsigned long clock;
@@ -817,31 +1045,44 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
}
EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);
+void netif_set_operstate(struct net_device *dev, int newstate)
+{
+ unsigned int old = READ_ONCE(dev->operstate);
+
+ do {
+ if (old == newstate)
+ return;
+ } while (!try_cmpxchg(&dev->operstate, &old, newstate));
+
+ netif_state_change(dev);
+}
+EXPORT_SYMBOL(netif_set_operstate);
+
static void set_operstate(struct net_device *dev, unsigned char transition)
{
- unsigned char operstate = dev->operstate;
+ unsigned char operstate = READ_ONCE(dev->operstate);
switch (transition) {
case IF_OPER_UP:
if ((operstate == IF_OPER_DORMANT ||
+ operstate == IF_OPER_TESTING ||
operstate == IF_OPER_UNKNOWN) &&
- !netif_dormant(dev))
+ !netif_dormant(dev) && !netif_testing(dev))
operstate = IF_OPER_UP;
break;
+ case IF_OPER_TESTING:
+ if (netif_oper_up(dev))
+ operstate = IF_OPER_TESTING;
+ break;
+
case IF_OPER_DORMANT:
- if (operstate == IF_OPER_UP ||
- operstate == IF_OPER_UNKNOWN)
+ if (netif_oper_up(dev))
operstate = IF_OPER_DORMANT;
break;
}
- if (dev->operstate != operstate) {
- write_lock_bh(&dev_base_lock);
- dev->operstate = operstate;
- write_unlock_bh(&dev_base_lock);
- netdev_state_change(dev);
- }
+ netif_set_operstate(dev, operstate);
}
static unsigned int rtnl_dev_get_flags(const struct net_device *dev)
@@ -907,6 +1148,7 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
size += num_vfs *
(nla_total_size(0) +
nla_total_size(sizeof(struct ifla_vf_mac)) +
+ nla_total_size(sizeof(struct ifla_vf_broadcast)) +
nla_total_size(sizeof(struct ifla_vf_vlan)) +
nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */
nla_total_size(MAX_VLAN_LIST_LEN *
@@ -916,24 +1158,30 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
nla_total_size(sizeof(struct ifla_vf_rate)) +
nla_total_size(sizeof(struct ifla_vf_link_state)) +
nla_total_size(sizeof(struct ifla_vf_rss_query_en)) +
- nla_total_size(0) + /* nest IFLA_VF_STATS */
- /* IFLA_VF_STATS_RX_PACKETS */
- nla_total_size_64bit(sizeof(__u64)) +
- /* IFLA_VF_STATS_TX_PACKETS */
- nla_total_size_64bit(sizeof(__u64)) +
- /* IFLA_VF_STATS_RX_BYTES */
- nla_total_size_64bit(sizeof(__u64)) +
- /* IFLA_VF_STATS_TX_BYTES */
- nla_total_size_64bit(sizeof(__u64)) +
- /* IFLA_VF_STATS_BROADCAST */
- nla_total_size_64bit(sizeof(__u64)) +
- /* IFLA_VF_STATS_MULTICAST */
- nla_total_size_64bit(sizeof(__u64)) +
- /* IFLA_VF_STATS_RX_DROPPED */
- nla_total_size_64bit(sizeof(__u64)) +
- /* IFLA_VF_STATS_TX_DROPPED */
- nla_total_size_64bit(sizeof(__u64)) +
nla_total_size(sizeof(struct ifla_vf_trust)));
+ if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) {
+ size += num_vfs *
+ (nla_total_size(0) + /* nest IFLA_VF_STATS */
+ /* IFLA_VF_STATS_RX_PACKETS */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_PACKETS */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_RX_BYTES */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_BYTES */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_BROADCAST */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_MULTICAST */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_RX_DROPPED */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* IFLA_VF_STATS_TX_DROPPED */
+ nla_total_size_64bit(sizeof(__u64)));
+ }
+ if (dev->netdev_ops->ndo_get_vf_guid)
+ size += num_vfs * 2 *
+ nla_total_size(sizeof(struct ifla_vf_guid));
return size;
} else
return 0;
@@ -974,16 +1222,61 @@ static size_t rtnl_xdp_size(void)
return xdp_size;
}
+static size_t rtnl_prop_list_size(const struct net_device *dev)
+{
+ struct netdev_name_node *name_node;
+ unsigned int cnt = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(name_node, &dev->name_node->list, list)
+ cnt++;
+ rcu_read_unlock();
+
+ if (!cnt)
+ return 0;
+
+ return nla_total_size(0) + cnt * nla_total_size(ALTIFNAMSIZ);
+}
+
+static size_t rtnl_proto_down_size(const struct net_device *dev)
+{
+ size_t size = nla_total_size(1);
+
+ /* Assume dev->proto_down_reason is not zero. */
+ size += nla_total_size(0) + nla_total_size(4);
+
+ return size;
+}
+
+static size_t rtnl_devlink_port_size(const struct net_device *dev)
+{
+ size_t size = nla_total_size(0); /* nest IFLA_DEVLINK_PORT */
+
+ if (dev->devlink_port)
+ size += devlink_nl_port_handle_size(dev->devlink_port);
+
+ return size;
+}
+
+static size_t rtnl_dpll_pin_size(const struct net_device *dev)
+{
+ size_t size = nla_total_size(0); /* nest IFLA_DPLL_PIN */
+
+ size += dpll_netdev_pin_handle_size(dev);
+
+ return size;
+}
+
static noinline size_t if_nlmsg_size(const struct net_device *dev,
u32 ext_filter_mask)
{
- return NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ size_t size;
+
+ size = NLMSG_ALIGN(sizeof(struct ifinfomsg))
+ nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */
+ nla_total_size(IFALIASZ) /* IFLA_IFALIAS */
+ nla_total_size(IFNAMSIZ) /* IFLA_QDISC */
+ nla_total_size_64bit(sizeof(struct rtnl_link_ifmap))
- + nla_total_size(sizeof(struct rtnl_link_stats))
- + nla_total_size_64bit(sizeof(struct rtnl_link_stats64))
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */
+ nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */
+ nla_total_size(4) /* IFLA_TXQLEN */
@@ -993,12 +1286,19 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_MASTER */
+ nla_total_size(1) /* IFLA_CARRIER */
+ nla_total_size(4) /* IFLA_PROMISCUITY */
+ + nla_total_size(4) /* IFLA_ALLMULTI */
+ nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
+ nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
+ nla_total_size(4) /* IFLA_GSO_MAX_SEGS */
+ nla_total_size(4) /* IFLA_GSO_MAX_SIZE */
+ + nla_total_size(4) /* IFLA_GRO_MAX_SIZE */
+ + nla_total_size(4) /* IFLA_GSO_IPV4_MAX_SIZE */
+ + nla_total_size(4) /* IFLA_GRO_IPV4_MAX_SIZE */
+ + nla_total_size(4) /* IFLA_TSO_MAX_SIZE */
+ + nla_total_size(4) /* IFLA_TSO_MAX_SEGS */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
+ + nla_total_size(1) /* IFLA_NETNS_IMMUTABLE */
+ nla_total_size(4) /* IFLA_CARRIER_CHANGES */
+ nla_total_size(4) /* IFLA_LINK_NETNSID */
+ nla_total_size(4) /* IFLA_GROUP */
@@ -1015,13 +1315,26 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_EVENT */
+ nla_total_size(4) /* IFLA_NEW_NETNSID */
+ nla_total_size(4) /* IFLA_NEW_IFINDEX */
- + nla_total_size(1) /* IFLA_PROTO_DOWN */
- + nla_total_size(4) /* IFLA_IF_NETNSID */
+ + rtnl_proto_down_size(dev) /* proto down */
+ + nla_total_size(4) /* IFLA_TARGET_NETNSID */
+ nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */
+ nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */
+ nla_total_size(4) /* IFLA_MIN_MTU */
+ nla_total_size(4) /* IFLA_MAX_MTU */
+ + rtnl_prop_list_size(dev)
+ + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */
+ + rtnl_devlink_port_size(dev)
+ + rtnl_dpll_pin_size(dev)
+ + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */
+ + nla_total_size(2) /* IFLA_HEADROOM */
+ + nla_total_size(2) /* IFLA_TAILROOM */
+ 0;
+
+ if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS))
+ size += nla_total_size(sizeof(struct rtnl_link_stats)) +
+ nla_total_size_64bit(sizeof(struct rtnl_link_stats64));
+
+ return size;
}
static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
@@ -1031,12 +1344,12 @@ static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev)
int vf;
int err;
- vf_ports = nla_nest_start(skb, IFLA_VF_PORTS);
+ vf_ports = nla_nest_start_noflag(skb, IFLA_VF_PORTS);
if (!vf_ports)
return -EMSGSIZE;
for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) {
- vf_port = nla_nest_start(skb, IFLA_VF_PORT);
+ vf_port = nla_nest_start_noflag(skb, IFLA_VF_PORT);
if (!vf_port)
goto nla_put_failure;
if (nla_put_u32(skb, IFLA_PORT_VF, vf))
@@ -1065,7 +1378,7 @@ static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev)
struct nlattr *port_self;
int err;
- port_self = nla_nest_start(skb, IFLA_PORT_SELF);
+ port_self = nla_nest_start_noflag(skb, IFLA_PORT_SELF);
if (!port_self)
return -EMSGSIZE;
@@ -1140,22 +1453,17 @@ static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev)
static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
{
+ struct netdev_phys_item_id ppid = { };
int err;
- struct switchdev_attr attr = {
- .orig_dev = dev,
- .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
- .flags = SWITCHDEV_F_NO_RECURSE,
- };
- err = switchdev_port_attr_get(dev, &attr);
+ err = netif_get_port_parent_id(dev, &ppid, false);
if (err) {
if (err == -EOPNOTSUPP)
return 0;
return err;
}
- if (nla_put(skb, IFLA_PHYS_SWITCH_ID, attr.u.ppid.id_len,
- attr.u.ppid.id))
+ if (nla_put(skb, IFLA_PHYS_SWITCH_ID, ppid.id_len, ppid.id))
return -EMSGSIZE;
return 0;
@@ -1188,7 +1496,7 @@ static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb,
static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
struct net_device *dev,
int vfs_num,
- struct nlattr *vfinfo)
+ u32 ext_filter_mask)
{
struct ifla_vf_rss_query_en vf_rss_query_en;
struct nlattr *vf, *vfstats, *vfvlanlist;
@@ -1201,7 +1509,10 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
struct ifla_vf_vlan vf_vlan;
struct ifla_vf_rate vf_rate;
struct ifla_vf_mac vf_mac;
+ struct ifla_vf_broadcast vf_broadcast;
struct ifla_vf_info ivi;
+ struct ifla_vf_guid node_guid;
+ struct ifla_vf_guid port_guid;
memset(&ivi, 0, sizeof(ivi));
@@ -1223,6 +1534,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
return 0;
memset(&vf_vlan_info, 0, sizeof(vf_vlan_info));
+ memset(&node_guid, 0, sizeof(node_guid));
+ memset(&port_guid, 0, sizeof(port_guid));
vf_mac.vf =
vf_vlan.vf =
@@ -1232,9 +1545,12 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
vf_spoofchk.vf =
vf_linkstate.vf =
vf_rss_query_en.vf =
- vf_trust.vf = ivi.vf;
+ vf_trust.vf =
+ node_guid.vf =
+ port_guid.vf = ivi.vf;
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+ memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len);
vf_vlan.vlan = ivi.vlan;
vf_vlan.qos = ivi.qos;
vf_vlan_info.vlan = ivi.vlan;
@@ -1247,10 +1563,11 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
vf_linkstate.link_state = ivi.linkstate;
vf_rss_query_en.setting = ivi.rss_query_en;
vf_trust.setting = ivi.trusted;
- vf = nla_nest_start(skb, IFLA_VF_INFO);
+ vf = nla_nest_start_noflag(skb, IFLA_VF_INFO);
if (!vf)
- goto nla_put_vfinfo_failure;
+ return -EMSGSIZE;
if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
+ nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) ||
nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
&vf_rate) ||
@@ -1266,7 +1583,17 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
nla_put(skb, IFLA_VF_TRUST,
sizeof(vf_trust), &vf_trust))
goto nla_put_vf_failure;
- vfvlanlist = nla_nest_start(skb, IFLA_VF_VLAN_LIST);
+
+ if (dev->netdev_ops->ndo_get_vf_guid &&
+ !dev->netdev_ops->ndo_get_vf_guid(dev, vfs_num, &node_guid,
+ &port_guid)) {
+ if (nla_put(skb, IFLA_VF_IB_NODE_GUID, sizeof(node_guid),
+ &node_guid) ||
+ nla_put(skb, IFLA_VF_IB_PORT_GUID, sizeof(port_guid),
+ &port_guid))
+ goto nla_put_vf_failure;
+ }
+ vfvlanlist = nla_nest_start_noflag(skb, IFLA_VF_VLAN_LIST);
if (!vfvlanlist)
goto nla_put_vf_failure;
if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info),
@@ -1275,40 +1602,40 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
goto nla_put_vf_failure;
}
nla_nest_end(skb, vfvlanlist);
- memset(&vf_stats, 0, sizeof(vf_stats));
- if (dev->netdev_ops->ndo_get_vf_stats)
- dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
- &vf_stats);
- vfstats = nla_nest_start(skb, IFLA_VF_STATS);
- if (!vfstats)
- goto nla_put_vf_failure;
- if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
- vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
- nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
- vf_stats.tx_packets, IFLA_VF_STATS_PAD) ||
- nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES,
- vf_stats.rx_bytes, IFLA_VF_STATS_PAD) ||
- nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES,
- vf_stats.tx_bytes, IFLA_VF_STATS_PAD) ||
- nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
- vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
- nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
- vf_stats.multicast, IFLA_VF_STATS_PAD) ||
- nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED,
- vf_stats.rx_dropped, IFLA_VF_STATS_PAD) ||
- nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED,
- vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) {
- nla_nest_cancel(skb, vfstats);
- goto nla_put_vf_failure;
+ if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) {
+ memset(&vf_stats, 0, sizeof(vf_stats));
+ if (dev->netdev_ops->ndo_get_vf_stats)
+ dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
+ &vf_stats);
+ vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS);
+ if (!vfstats)
+ goto nla_put_vf_failure;
+ if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS,
+ vf_stats.rx_packets, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS,
+ vf_stats.tx_packets, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES,
+ vf_stats.rx_bytes, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES,
+ vf_stats.tx_bytes, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST,
+ vf_stats.broadcast, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST,
+ vf_stats.multicast, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED,
+ vf_stats.rx_dropped, IFLA_VF_STATS_PAD) ||
+ nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED,
+ vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) {
+ nla_nest_cancel(skb, vfstats);
+ goto nla_put_vf_failure;
+ }
+ nla_nest_end(skb, vfstats);
}
- nla_nest_end(skb, vfstats);
nla_nest_end(skb, vf);
return 0;
nla_put_vf_failure:
nla_nest_cancel(skb, vf);
-nla_put_vfinfo_failure:
- nla_nest_cancel(skb, vfinfo);
return -EMSGSIZE;
}
@@ -1329,30 +1656,33 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb,
if (!dev->netdev_ops->ndo_get_vf_config)
return 0;
- vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
+ vfinfo = nla_nest_start_noflag(skb, IFLA_VFINFO_LIST);
if (!vfinfo)
return -EMSGSIZE;
for (i = 0; i < num_vfs; i++) {
- if (rtnl_fill_vfinfo(skb, dev, i, vfinfo))
+ if (rtnl_fill_vfinfo(skb, dev, i, ext_filter_mask)) {
+ nla_nest_cancel(skb, vfinfo);
return -EMSGSIZE;
+ }
}
nla_nest_end(skb, vfinfo);
return 0;
}
-static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
+static int rtnl_fill_link_ifmap(struct sk_buff *skb,
+ const struct net_device *dev)
{
struct rtnl_link_ifmap map;
memset(&map, 0, sizeof(map));
- map.mem_start = dev->mem_start;
- map.mem_end = dev->mem_end;
- map.base_addr = dev->base_addr;
- map.irq = dev->irq;
- map.dma = dev->dma;
- map.port = dev->if_port;
+ map.mem_start = READ_ONCE(dev->mem_start);
+ map.mem_end = READ_ONCE(dev->mem_end);
+ map.base_addr = READ_ONCE(dev->base_addr);
+ map.irq = READ_ONCE(dev->irq);
+ map.dma = READ_ONCE(dev->dma);
+ map.port = READ_ONCE(dev->if_port);
if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD))
return -EMSGSIZE;
@@ -1363,24 +1693,25 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
static u32 rtnl_xdp_prog_skb(struct net_device *dev)
{
const struct bpf_prog *generic_xdp_prog;
+ u32 res = 0;
- ASSERT_RTNL();
+ rcu_read_lock();
+ generic_xdp_prog = rcu_dereference(dev->xdp_prog);
+ if (generic_xdp_prog)
+ res = generic_xdp_prog->aux->id;
+ rcu_read_unlock();
- generic_xdp_prog = rtnl_dereference(dev->xdp_prog);
- if (!generic_xdp_prog)
- return 0;
- return generic_xdp_prog->aux->id;
+ return res;
}
static u32 rtnl_xdp_prog_drv(struct net_device *dev)
{
- return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);
+ return dev_xdp_prog_id(dev, XDP_MODE_DRV);
}
static u32 rtnl_xdp_prog_hw(struct net_device *dev)
{
- return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf,
- XDP_QUERY_PROG_HW);
+ return dev_xdp_prog_id(dev, XDP_MODE_HW);
}
static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev,
@@ -1414,7 +1745,7 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
int err;
u8 mode;
- xdp = nla_nest_start(skb, IFLA_XDP);
+ xdp = nla_nest_start_noflag(skb, IFLA_XDP);
if (!xdp)
return -EMSGSIZE;
@@ -1490,20 +1821,22 @@ static int put_master_ifindex(struct sk_buff *skb, struct net_device *dev)
upper_dev = netdev_master_upper_dev_get_rcu(dev);
if (upper_dev)
- ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex);
+ ret = nla_put_u32(skb, IFLA_MASTER,
+ READ_ONCE(upper_dev->ifindex));
rcu_read_unlock();
return ret;
}
-static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev)
+static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev,
+ bool force)
{
- int ifindex = dev_get_iflink(dev);
+ int iflink = dev_get_iflink(dev);
- if (dev->ifindex == ifindex)
- return 0;
+ if (force || READ_ONCE(dev->ifindex) != iflink)
+ return nla_put_u32(skb, IFLA_LINK, iflink);
- return nla_put_u32(skb, IFLA_LINK, ifindex);
+ return 0;
}
static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb,
@@ -1518,20 +1851,24 @@ static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb,
static int rtnl_fill_link_netnsid(struct sk_buff *skb,
const struct net_device *dev,
- struct net *src_net)
+ struct net *src_net, gfp_t gfp)
{
+ bool put_iflink = false;
+
if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) {
struct net *link_net = dev->rtnl_link_ops->get_link_net(dev);
if (!net_eq(dev_net(dev), link_net)) {
- int id = peernet2id_alloc(src_net, link_net);
+ int id = peernet2id_alloc(src_net, link_net, gfp);
if (nla_put_s32(skb, IFLA_LINK_NETNSID, id))
return -EMSGSIZE;
+
+ put_iflink = true;
}
}
- return 0;
+ return nla_put_iflink(skb, dev, put_iflink);
}
static int rtnl_fill_link_af(struct sk_buff *skb,
@@ -1541,7 +1878,7 @@ static int rtnl_fill_link_af(struct sk_buff *skb,
const struct rtnl_af_ops *af_ops;
struct nlattr *af_spec;
- af_spec = nla_nest_start(skb, IFLA_AF_SPEC);
+ af_spec = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
if (!af_spec)
return -EMSGSIZE;
@@ -1552,7 +1889,7 @@ static int rtnl_fill_link_af(struct sk_buff *skb,
if (!af_ops->fill_link_af)
continue;
- af = nla_nest_start(skb, af_ops->family);
+ af = nla_nest_start_noflag(skb, af_ops->family);
if (!af)
return -EMSGSIZE;
@@ -1575,15 +1912,129 @@ static int rtnl_fill_link_af(struct sk_buff *skb,
return 0;
}
+static int rtnl_fill_alt_ifnames(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct netdev_name_node *name_node;
+ int count = 0;
+
+ list_for_each_entry_rcu(name_node, &dev->name_node->list, list) {
+ if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name))
+ return -EMSGSIZE;
+ count++;
+ }
+ return count;
+}
+
+/* RCU protected. */
+static int rtnl_fill_prop_list(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct nlattr *prop_list;
+ int ret;
+
+ prop_list = nla_nest_start(skb, IFLA_PROP_LIST);
+ if (!prop_list)
+ return -EMSGSIZE;
+
+ ret = rtnl_fill_alt_ifnames(skb, dev);
+ if (ret <= 0)
+ goto nest_cancel;
+
+ nla_nest_end(skb, prop_list);
+ return 0;
+
+nest_cancel:
+ nla_nest_cancel(skb, prop_list);
+ return ret;
+}
+
+static int rtnl_fill_proto_down(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct nlattr *pr;
+ u32 preason;
+
+ if (nla_put_u8(skb, IFLA_PROTO_DOWN, READ_ONCE(dev->proto_down)))
+ goto nla_put_failure;
+
+ preason = READ_ONCE(dev->proto_down_reason);
+ if (!preason)
+ return 0;
+
+ pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON);
+ if (!pr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) {
+ nla_nest_cancel(skb, pr);
+ goto nla_put_failure;
+ }
+
+ nla_nest_end(skb, pr);
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int rtnl_fill_devlink_port(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct nlattr *devlink_port_nest;
+ int ret;
+
+ devlink_port_nest = nla_nest_start(skb, IFLA_DEVLINK_PORT);
+ if (!devlink_port_nest)
+ return -EMSGSIZE;
+
+ if (dev->devlink_port) {
+ ret = devlink_nl_port_handle_fill(skb, dev->devlink_port);
+ if (ret < 0)
+ goto nest_cancel;
+ }
+
+ nla_nest_end(skb, devlink_port_nest);
+ return 0;
+
+nest_cancel:
+ nla_nest_cancel(skb, devlink_port_nest);
+ return ret;
+}
+
+static int rtnl_fill_dpll_pin(struct sk_buff *skb,
+ const struct net_device *dev)
+{
+ struct nlattr *dpll_pin_nest;
+ int ret;
+
+ dpll_pin_nest = nla_nest_start(skb, IFLA_DPLL_PIN);
+ if (!dpll_pin_nest)
+ return -EMSGSIZE;
+
+ ret = dpll_netdev_add_pin_handle(skb, dev);
+ if (ret < 0)
+ goto nest_cancel;
+
+ nla_nest_end(skb, dpll_pin_nest);
+ return 0;
+
+nest_cancel:
+ nla_nest_cancel(skb, dpll_pin_nest);
+ return ret;
+}
+
static int rtnl_fill_ifinfo(struct sk_buff *skb,
struct net_device *dev, struct net *src_net,
int type, u32 pid, u32 seq, u32 change,
unsigned int flags, u32 ext_filter_mask,
u32 event, int *new_nsid, int new_ifindex,
- int tgt_netnsid)
+ int tgt_netnsid, gfp_t gfp)
{
+ char devname[IFNAMSIZ];
struct ifinfomsg *ifm;
struct nlmsghdr *nlh;
+ struct Qdisc *qdisc;
ASSERT_RTNL();
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
@@ -1593,44 +2044,69 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
ifm = nlmsg_data(nlh);
ifm->ifi_family = AF_UNSPEC;
ifm->__ifi_pad = 0;
- ifm->ifi_type = dev->type;
- ifm->ifi_index = dev->ifindex;
- ifm->ifi_flags = dev_get_flags(dev);
+ ifm->ifi_type = READ_ONCE(dev->type);
+ ifm->ifi_index = READ_ONCE(dev->ifindex);
+ ifm->ifi_flags = netif_get_flags(dev);
ifm->ifi_change = change;
- if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_IF_NETNSID, tgt_netnsid))
+ if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid))
goto nla_put_failure;
- if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
- nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) ||
+ netdev_copy_name(dev, devname);
+ if (nla_put_string(skb, IFLA_IFNAME, devname))
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, IFLA_TXQLEN, READ_ONCE(dev->tx_queue_len)) ||
nla_put_u8(skb, IFLA_OPERSTATE,
- netif_running(dev) ? dev->operstate : IF_OPER_DOWN) ||
- nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) ||
- nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
- nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) ||
- nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) ||
- nla_put_u32(skb, IFLA_GROUP, dev->group) ||
- nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
- nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
- nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) ||
- nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) ||
+ netif_running(dev) ? READ_ONCE(dev->operstate) :
+ IF_OPER_DOWN) ||
+ nla_put_u8(skb, IFLA_LINKMODE, READ_ONCE(dev->link_mode)) ||
+ nla_put_u8(skb, IFLA_NETNS_IMMUTABLE, dev->netns_immutable) ||
+ nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) ||
+ nla_put_u32(skb, IFLA_MIN_MTU, READ_ONCE(dev->min_mtu)) ||
+ nla_put_u32(skb, IFLA_MAX_MTU, READ_ONCE(dev->max_mtu)) ||
+ nla_put_u32(skb, IFLA_GROUP, READ_ONCE(dev->group)) ||
+ nla_put_u32(skb, IFLA_PROMISCUITY, READ_ONCE(dev->promiscuity)) ||
+ nla_put_u32(skb, IFLA_ALLMULTI, READ_ONCE(dev->allmulti)) ||
+ nla_put_u32(skb, IFLA_NUM_TX_QUEUES,
+ READ_ONCE(dev->num_tx_queues)) ||
+ nla_put_u32(skb, IFLA_GSO_MAX_SEGS,
+ READ_ONCE(dev->gso_max_segs)) ||
+ nla_put_u32(skb, IFLA_GSO_MAX_SIZE,
+ READ_ONCE(dev->gso_max_size)) ||
+ nla_put_u32(skb, IFLA_GRO_MAX_SIZE,
+ READ_ONCE(dev->gro_max_size)) ||
+ nla_put_u32(skb, IFLA_GSO_IPV4_MAX_SIZE,
+ READ_ONCE(dev->gso_ipv4_max_size)) ||
+ nla_put_u32(skb, IFLA_GRO_IPV4_MAX_SIZE,
+ READ_ONCE(dev->gro_ipv4_max_size)) ||
+ nla_put_u32(skb, IFLA_TSO_MAX_SIZE,
+ READ_ONCE(dev->tso_max_size)) ||
+ nla_put_u32(skb, IFLA_TSO_MAX_SEGS,
+ READ_ONCE(dev->tso_max_segs)) ||
+ nla_put_uint(skb, IFLA_MAX_PACING_OFFLOAD_HORIZON,
+ READ_ONCE(dev->max_pacing_offload_horizon)) ||
#ifdef CONFIG_RPS
- nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
+ nla_put_u32(skb, IFLA_NUM_RX_QUEUES,
+ READ_ONCE(dev->num_rx_queues)) ||
#endif
- nla_put_iflink(skb, dev) ||
put_master_ifindex(skb, dev) ||
nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) ||
- (dev->qdisc &&
- nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) ||
nla_put_ifalias(skb, dev) ||
nla_put_u32(skb, IFLA_CARRIER_CHANGES,
atomic_read(&dev->carrier_up_count) +
atomic_read(&dev->carrier_down_count)) ||
- nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down) ||
nla_put_u32(skb, IFLA_CARRIER_UP_COUNT,
atomic_read(&dev->carrier_up_count)) ||
nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT,
- atomic_read(&dev->carrier_down_count)))
+ atomic_read(&dev->carrier_down_count)) ||
+ nla_put_u16(skb, IFLA_HEADROOM,
+ READ_ONCE(dev->needed_headroom)) ||
+ nla_put_u16(skb, IFLA_TAILROOM,
+ READ_ONCE(dev->needed_tailroom)))
+ goto nla_put_failure;
+
+ if (rtnl_fill_proto_down(skb, dev))
goto nla_put_failure;
if (event != IFLA_EVENT_NONE) {
@@ -1638,9 +2114,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
goto nla_put_failure;
}
- if (rtnl_fill_link_ifmap(skb, dev))
- goto nla_put_failure;
-
if (dev->addr_len) {
if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) ||
nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast))
@@ -1656,7 +2129,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
if (rtnl_phys_switch_id_fill(skb, dev))
goto nla_put_failure;
- if (rtnl_fill_stats(skb, dev))
+ if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS) &&
+ rtnl_fill_stats(skb, dev))
goto nla_put_failure;
if (rtnl_fill_vf(skb, dev, ext_filter_mask))
@@ -1673,9 +2147,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
goto nla_put_failure;
}
- if (rtnl_fill_link_netnsid(skb, dev, src_net))
- goto nla_put_failure;
-
if (new_nsid &&
nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0)
goto nla_put_failure;
@@ -1683,12 +2154,40 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0)
goto nla_put_failure;
+ if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) &&
+ nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr))
+ goto nla_put_failure;
rcu_read_lock();
+ if (rtnl_fill_link_netnsid(skb, dev, src_net, GFP_ATOMIC))
+ goto nla_put_failure_rcu;
+ qdisc = rcu_dereference(dev->qdisc);
+ if (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id))
+ goto nla_put_failure_rcu;
if (rtnl_fill_link_af(skb, dev, ext_filter_mask))
goto nla_put_failure_rcu;
+ if (rtnl_fill_link_ifmap(skb, dev))
+ goto nla_put_failure_rcu;
+ if (rtnl_fill_prop_list(skb, dev))
+ goto nla_put_failure_rcu;
rcu_read_unlock();
+ if (dev->dev.parent &&
+ nla_put_string(skb, IFLA_PARENT_DEV_NAME,
+ dev_name(dev->dev.parent)))
+ goto nla_put_failure;
+
+ if (dev->dev.parent && dev->dev.parent->bus &&
+ nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME,
+ dev->dev.parent->bus->name))
+ goto nla_put_failure;
+
+ if (rtnl_fill_devlink_port(skb, dev))
+ goto nla_put_failure;
+
+ if (rtnl_fill_dpll_pin(skb, dev))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
return 0;
@@ -1700,6 +2199,7 @@ nla_put_failure:
}
static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
+ [IFLA_UNSPEC] = { .strict_start_type = IFLA_DPLL_PIN },
[IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 },
[IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
@@ -1728,7 +2228,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 },
[IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 },
[IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 },
- [IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 },
+ [IFLA_GSO_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1),
[IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
[IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */
[IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
@@ -1737,11 +2237,27 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_XDP] = { .type = NLA_NESTED },
[IFLA_EVENT] = { .type = NLA_U32 },
[IFLA_GROUP] = { .type = NLA_U32 },
- [IFLA_IF_NETNSID] = { .type = NLA_S32 },
+ [IFLA_TARGET_NETNSID] = { .type = NLA_S32 },
[IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 },
[IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
[IFLA_MIN_MTU] = { .type = NLA_U32 },
[IFLA_MAX_MTU] = { .type = NLA_U32 },
+ [IFLA_PROP_LIST] = { .type = NLA_NESTED },
+ [IFLA_ALT_IFNAME] = { .type = NLA_STRING,
+ .len = ALTIFNAMSIZ - 1 },
+ [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT },
+ [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED },
+ [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1),
+ [IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING },
+ [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 },
+ [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT },
+ [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT },
+ [IFLA_ALLMULTI] = { .type = NLA_REJECT },
+ [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1),
+ [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 },
+ [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT },
+ [IFLA_HEADROOM] = { .type = NLA_REJECT },
+ [IFLA_TAILROOM] = { .type = NLA_REJECT },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -1753,6 +2269,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
[IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) },
+ [IFLA_VF_BROADCAST] = { .type = NLA_REJECT },
[IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) },
[IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED },
[IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) },
@@ -1786,26 +2303,28 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
};
static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = {
+ [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD },
[IFLA_XDP_FD] = { .type = NLA_S32 },
+ [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 },
[IFLA_XDP_ATTACHED] = { .type = NLA_U8 },
[IFLA_XDP_FLAGS] = { .type = NLA_U32 },
[IFLA_XDP_PROG_ID] = { .type = NLA_U32 },
};
-static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
+static struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla,
+ int *ops_srcu_index)
{
- const struct rtnl_link_ops *ops = NULL;
struct nlattr *linfo[IFLA_INFO_MAX + 1];
+ struct rtnl_link_ops *ops = NULL;
- if (nla_parse_nested(linfo, IFLA_INFO_MAX, nla,
- ifla_info_policy, NULL) < 0)
+ if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0)
return NULL;
if (linfo[IFLA_INFO_KIND]) {
char kind[MODULE_NAME_LEN];
- nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind));
- ops = rtnl_link_ops_get(kind);
+ nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind));
+ ops = rtnl_link_ops_get(kind, ops_srcu_index);
}
return ops;
@@ -1819,6 +2338,13 @@ static bool link_master_filtered(struct net_device *dev, int master_idx)
return false;
master = netdev_master_upper_dev_get(dev);
+
+ /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need
+ * another invalid value for ifindex to denote "no master".
+ */
+ if (master_idx == -1)
+ return !!master;
+
if (!master || master->ifindex != master_idx)
return true;
@@ -1845,7 +2371,15 @@ static bool link_dump_filtered(struct net_device *dev,
return false;
}
-static struct net *get_target_net(struct sock *sk, int netnsid)
+/**
+ * rtnl_get_net_ns_capable - Get netns if sufficiently privileged.
+ * @sk: netlink socket
+ * @netnsid: network namespace identifier
+ *
+ * Returns the network namespace identified by netnsid on success or an error
+ * pointer on failure.
+ */
+struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid)
{
struct net *net;
@@ -1862,26 +2396,37 @@ static struct net *get_target_net(struct sock *sk, int netnsid)
}
return net;
}
+EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable);
-static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh,
+ bool strict_check, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
{
- struct net *net = sock_net(skb->sk);
- struct net *tgt_net = net;
- int h, s_h;
- int idx = 0, s_idx;
- struct net_device *dev;
- struct hlist_head *head;
- struct nlattr *tb[IFLA_MAX+1];
- u32 ext_filter_mask = 0;
- const struct rtnl_link_ops *kind_ops = NULL;
- unsigned int flags = NLM_F_MULTI;
- int master_idx = 0;
- int netnsid = -1;
- int err;
int hdrlen;
- s_h = cb->args[0];
- s_idx = cb->args[1];
+ if (strict_check) {
+ struct ifinfomsg *ifm;
+
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for link dump");
+ return -EINVAL;
+ }
+
+ if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
+ ifm->ifi_change) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request");
+ return -EINVAL;
+ }
+ if (ifm->ifi_index) {
+ NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps");
+ return -EINVAL;
+ }
+
+ return nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb,
+ IFLA_MAX, ifla_policy,
+ extack);
+ }
/* A hack to preserve kernel<->userspace interface.
* The correct header is ifinfomsg. It is consistent with rtnl_getlink.
@@ -1890,80 +2435,131 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
* We can detect the old iproute2. Even including the IFLA_EXT_MASK
* attribute, its netlink message is shorter than struct ifinfomsg.
*/
- hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ?
+ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ?
sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
- if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX,
- ifla_policy, NULL) >= 0) {
- if (tb[IFLA_IF_NETNSID]) {
- netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
- tgt_net = get_target_net(skb->sk, netnsid);
- if (IS_ERR(tgt_net))
- return PTR_ERR(tgt_net);
- }
-
- if (tb[IFLA_EXT_MASK])
- ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+ return nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy,
+ extack);
+}
- if (tb[IFLA_MASTER])
- master_idx = nla_get_u32(tb[IFLA_MASTER]);
+static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct rtnl_link_ops *kind_ops = NULL;
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct net *net = sock_net(skb->sk);
+ unsigned int flags = NLM_F_MULTI;
+ struct nlattr *tb[IFLA_MAX+1];
+ struct {
+ unsigned long ifindex;
+ } *ctx = (void *)cb->ctx;
+ struct net *tgt_net = net;
+ u32 ext_filter_mask = 0;
+ struct net_device *dev;
+ int ops_srcu_index;
+ int master_idx = 0;
+ int netnsid = -1;
+ int err, i;
- if (tb[IFLA_LINKINFO])
- kind_ops = linkinfo_to_kind_ops(tb[IFLA_LINKINFO]);
+ err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack);
+ if (err < 0) {
+ if (cb->strict_check)
+ return err;
- if (master_idx || kind_ops)
- flags |= NLM_F_DUMP_FILTERED;
+ goto walk_entries;
}
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &tgt_net->dev_index_head[h];
- hlist_for_each_entry(dev, head, index_hlist) {
- if (link_dump_filtered(dev, master_idx, kind_ops))
- goto cont;
- if (idx < s_idx)
- goto cont;
- err = rtnl_fill_ifinfo(skb, dev, net,
- RTM_NEWLINK,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, 0,
- flags,
- ext_filter_mask, 0, NULL, 0,
- netnsid);
-
- if (err < 0) {
- if (likely(skb->len))
- goto out;
+ for (i = 0; i <= IFLA_MAX; ++i) {
+ if (!tb[i])
+ continue;
- goto out_err;
+ /* new attributes should only be added with strict checking */
+ switch (i) {
+ case IFLA_TARGET_NETNSID:
+ netnsid = nla_get_s32(tb[i]);
+ tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid);
+ if (IS_ERR(tgt_net)) {
+ NL_SET_ERR_MSG(extack, "Invalid target network namespace id");
+ err = PTR_ERR(tgt_net);
+ netnsid = -1;
+ goto out;
+ }
+ break;
+ case IFLA_EXT_MASK:
+ ext_filter_mask = nla_get_u32(tb[i]);
+ break;
+ case IFLA_MASTER:
+ master_idx = nla_get_u32(tb[i]);
+ break;
+ case IFLA_LINKINFO:
+ kind_ops = linkinfo_to_kind_ops(tb[i], &ops_srcu_index);
+ break;
+ default:
+ if (cb->strict_check) {
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request");
+ err = -EINVAL;
+ goto out;
}
-cont:
- idx++;
}
}
-out:
- err = skb->len;
-out_err:
- cb->args[1] = idx;
- cb->args[0] = h;
- cb->seq = net->dev_base_seq;
+
+ if (master_idx || kind_ops)
+ flags |= NLM_F_DUMP_FILTERED;
+
+walk_entries:
+ err = 0;
+ for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
+ if (link_dump_filtered(dev, master_idx, kind_ops))
+ continue;
+ err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq, 0, flags,
+ ext_filter_mask, 0, NULL, 0,
+ netnsid, GFP_KERNEL);
+ if (err < 0)
+ break;
+ }
+
+
+ cb->seq = tgt_net->dev_base_seq;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+
+out:
+
+ if (kind_ops)
+ rtnl_link_ops_put(kind_ops, ops_srcu_index);
if (netnsid >= 0)
put_net(tgt_net);
return err;
}
-int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len,
- struct netlink_ext_ack *exterr)
+int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer,
+ struct netlink_ext_ack *exterr)
{
- return nla_parse(tb, IFLA_MAX, head, len, ifla_policy, exterr);
+ const struct ifinfomsg *ifmp;
+ const struct nlattr *attrs;
+ size_t len;
+
+ ifmp = nla_data(nla_peer);
+ attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg);
+ len = nla_len(nla_peer) - sizeof(struct ifinfomsg);
+
+ if (ifmp->ifi_index < 0) {
+ NL_SET_ERR_MSG_ATTR(exterr, nla_peer,
+ "ifindex can't be negative");
+ return -EINVAL;
+ }
+
+ return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy,
+ exterr);
}
-EXPORT_SYMBOL(rtnl_nla_parse_ifla);
+EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg);
-struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
+static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[])
{
- struct net *net;
+ struct net *net = NULL;
+
/* Examine the link attributes and figure out which
* network namespace we are talking about.
*/
@@ -1971,8 +2567,17 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
else if (tb[IFLA_NET_NS_FD])
net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
- else
+
+ return net;
+}
+
+struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
+{
+ struct net *net = rtnl_link_get_net_ifla(tb);
+
+ if (!net)
net = get_net(src_net);
+
return net;
}
EXPORT_SYMBOL(rtnl_link_get_net);
@@ -1982,7 +2587,7 @@ EXPORT_SYMBOL(rtnl_link_get_net);
*
* 1. IFLA_NET_NS_PID
* 2. IFLA_NET_NS_FD
- * 3. IFLA_IF_NETNSID
+ * 3. IFLA_TARGET_NETNSID
*/
static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net,
struct nlattr *tb[])
@@ -1992,10 +2597,10 @@ static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net,
if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])
return rtnl_link_get_net(src_net, tb);
- if (!tb[IFLA_IF_NETNSID])
+ if (!tb[IFLA_TARGET_NETNSID])
return get_net(src_net);
- net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_IF_NETNSID]));
+ net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_TARGET_NETNSID]));
if (!net)
return ERR_PTR(-EINVAL);
@@ -2036,13 +2641,13 @@ static int rtnl_ensure_unique_netns(struct nlattr *tb[],
return -EOPNOTSUPP;
}
- if (tb[IFLA_IF_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]))
+ if (tb[IFLA_TARGET_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]))
goto invalid_attr;
- if (tb[IFLA_NET_NS_PID] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_FD]))
+ if (tb[IFLA_NET_NS_PID] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_FD]))
goto invalid_attr;
- if (tb[IFLA_NET_NS_FD] && (tb[IFLA_IF_NETNSID] || tb[IFLA_NET_NS_PID]))
+ if (tb[IFLA_NET_NS_FD] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_PID]))
goto invalid_attr;
return 0;
@@ -2052,16 +2657,59 @@ invalid_attr:
return -EINVAL;
}
-static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
+static int rtnl_set_vf_rate(struct net_device *dev, int vf, int min_tx_rate,
+ int max_tx_rate)
{
- if (dev) {
- if (tb[IFLA_ADDRESS] &&
- nla_len(tb[IFLA_ADDRESS]) < dev->addr_len)
- return -EINVAL;
+ const struct net_device_ops *ops = dev->netdev_ops;
- if (tb[IFLA_BROADCAST] &&
- nla_len(tb[IFLA_BROADCAST]) < dev->addr_len)
- return -EINVAL;
+ if (!ops->ndo_set_vf_rate)
+ return -EOPNOTSUPP;
+ if (max_tx_rate && max_tx_rate < min_tx_rate)
+ return -EINVAL;
+
+ return ops->ndo_set_vf_rate(dev, vf, min_tx_rate, max_tx_rate);
+}
+
+static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
+{
+ if (tb[IFLA_ADDRESS] &&
+ nla_len(tb[IFLA_ADDRESS]) < dev->addr_len)
+ return -EINVAL;
+
+ if (tb[IFLA_BROADCAST] &&
+ nla_len(tb[IFLA_BROADCAST]) < dev->addr_len)
+ return -EINVAL;
+
+ if (tb[IFLA_GSO_MAX_SIZE] &&
+ nla_get_u32(tb[IFLA_GSO_MAX_SIZE]) > dev->tso_max_size) {
+ NL_SET_ERR_MSG(extack, "too big gso_max_size");
+ return -EINVAL;
+ }
+
+ if (tb[IFLA_GSO_MAX_SEGS] &&
+ (nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > GSO_MAX_SEGS ||
+ nla_get_u32(tb[IFLA_GSO_MAX_SEGS]) > dev->tso_max_segs)) {
+ NL_SET_ERR_MSG(extack, "too big gso_max_segs");
+ return -EINVAL;
+ }
+
+ if (tb[IFLA_GRO_MAX_SIZE] &&
+ nla_get_u32(tb[IFLA_GRO_MAX_SIZE]) > GRO_MAX_SIZE) {
+ NL_SET_ERR_MSG(extack, "too big gro_max_size");
+ return -EINVAL;
+ }
+
+ if (tb[IFLA_GSO_IPV4_MAX_SIZE] &&
+ nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]) > dev->tso_max_size) {
+ NL_SET_ERR_MSG(extack, "too big gso_ipv4_max_size");
+ return -EINVAL;
+ }
+
+ if (tb[IFLA_GRO_IPV4_MAX_SIZE] &&
+ nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]) > GRO_MAX_SIZE) {
+ NL_SET_ERR_MSG(extack, "too big gro_ipv4_max_size");
+ return -EINVAL;
}
if (tb[IFLA_AF_SPEC]) {
@@ -2069,29 +2717,24 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
int rem, err;
nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
- const struct rtnl_af_ops *af_ops;
+ struct rtnl_af_ops *af_ops;
+ int af_ops_srcu_index;
- rcu_read_lock();
- af_ops = rtnl_af_lookup(nla_type(af));
- if (!af_ops) {
- rcu_read_unlock();
+ af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index);
+ if (!af_ops)
return -EAFNOSUPPORT;
- }
- if (!af_ops->set_link_af) {
- rcu_read_unlock();
- return -EOPNOTSUPP;
- }
+ if (!af_ops->set_link_af)
+ err = -EOPNOTSUPP;
+ else if (af_ops->validate_link_af)
+ err = af_ops->validate_link_af(dev, af, extack);
+ else
+ err = 0;
- if (af_ops->validate_link_af) {
- err = af_ops->validate_link_af(dev, af);
- if (err < 0) {
- rcu_read_unlock();
- return err;
- }
- }
+ rtnl_af_put(af_ops, af_ops_srcu_index);
- rcu_read_unlock();
+ if (err < 0)
+ return err;
}
}
@@ -2122,6 +2765,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_MAC]) {
struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]);
+ if (ivm->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_mac)
err = ops->ndo_set_vf_mac(dev, ivm->vf,
@@ -2133,6 +2778,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_VLAN]) {
struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]);
+ if (ivv->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_vlan)
err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan,
@@ -2153,7 +2800,7 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) {
if (nla_type(attr) != IFLA_VF_VLAN_INFO ||
- nla_len(attr) < NLA_HDRLEN) {
+ nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) {
return -EINVAL;
}
if (len >= MAX_VLAN_LIST_LEN)
@@ -2165,6 +2812,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (len == 0)
return -EINVAL;
+ if (ivvl[0]->vf >= INT_MAX)
+ return -EINVAL;
err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan,
ivvl[0]->qos, ivvl[0]->vlan_proto);
if (err < 0)
@@ -2175,17 +2824,16 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]);
struct ifla_vf_info ivf;
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_get_vf_config)
err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf);
if (err < 0)
return err;
- err = -EOPNOTSUPP;
- if (ops->ndo_set_vf_rate)
- err = ops->ndo_set_vf_rate(dev, ivt->vf,
- ivf.min_tx_rate,
- ivt->rate);
+ err = rtnl_set_vf_rate(dev, ivt->vf,
+ ivf.min_tx_rate, ivt->rate);
if (err < 0)
return err;
}
@@ -2193,11 +2841,11 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_RATE]) {
struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]);
- err = -EOPNOTSUPP;
- if (ops->ndo_set_vf_rate)
- err = ops->ndo_set_vf_rate(dev, ivt->vf,
- ivt->min_tx_rate,
- ivt->max_tx_rate);
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
+
+ err = rtnl_set_vf_rate(dev, ivt->vf,
+ ivt->min_tx_rate, ivt->max_tx_rate);
if (err < 0)
return err;
}
@@ -2205,6 +2853,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_SPOOFCHK]) {
struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]);
+ if (ivs->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_spoofchk)
err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
@@ -2216,6 +2866,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_LINK_STATE]) {
struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]);
+ if (ivl->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_link_state)
err = ops->ndo_set_vf_link_state(dev, ivl->vf,
@@ -2229,6 +2881,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
err = -EOPNOTSUPP;
ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]);
+ if (ivrssq_en->vf >= INT_MAX)
+ return -EINVAL;
if (ops->ndo_set_vf_rss_query_en)
err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf,
ivrssq_en->setting);
@@ -2239,6 +2893,8 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_TRUST]) {
struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]);
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
err = -EOPNOTSUPP;
if (ops->ndo_set_vf_trust)
err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting);
@@ -2249,15 +2905,18 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
if (tb[IFLA_VF_IB_NODE_GUID]) {
struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]);
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
if (!ops->ndo_set_vf_guid)
return -EOPNOTSUPP;
-
return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID);
}
if (tb[IFLA_VF_IB_PORT_GUID]) {
struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]);
+ if (ivt->vf >= INT_MAX)
+ return -EINVAL;
if (!ops->ndo_set_vf_guid)
return -EOPNOTSUPP;
@@ -2274,12 +2933,19 @@ static int do_set_master(struct net_device *dev, int ifindex,
const struct net_device_ops *ops;
int err;
+ /* Release the lower lock, the upper is responsible for locking
+ * the lower if needed. None of the existing upper devices
+ * use netdev instance lock, so don't grab it.
+ */
+
if (upper_dev) {
if (upper_dev->ifindex == ifindex)
return 0;
ops = upper_dev->netdev_ops;
if (ops->ndo_del_slave) {
+ netdev_unlock_ops(dev);
err = ops->ndo_del_slave(upper_dev, dev);
+ netdev_lock_ops(dev);
if (err)
return err;
} else {
@@ -2293,7 +2959,9 @@ static int do_set_master(struct net_device *dev, int ifindex,
return -EINVAL;
ops = upper_dev->netdev_ops;
if (ops->ndo_add_slave) {
+ netdev_unlock_ops(dev);
err = ops->ndo_add_slave(upper_dev, dev, extack);
+ netdev_lock_ops(dev);
if (err)
return err;
} else {
@@ -2303,36 +2971,102 @@ static int do_set_master(struct net_device *dev, int ifindex,
return 0;
}
+static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = {
+ [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 },
+ [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 },
+};
+
+static int do_set_proto_down(struct net_device *dev,
+ struct nlattr *nl_proto_down,
+ struct nlattr *nl_proto_down_reason,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1];
+ unsigned long mask = 0;
+ u32 value;
+ bool proto_down;
+ int err;
+
+ if (!dev->change_proto_down) {
+ NL_SET_ERR_MSG(extack, "Protodown not supported by device");
+ return -EOPNOTSUPP;
+ }
+
+ if (nl_proto_down_reason) {
+ err = nla_parse_nested_deprecated(pdreason,
+ IFLA_PROTO_DOWN_REASON_MAX,
+ nl_proto_down_reason,
+ ifla_proto_down_reason_policy,
+ NULL);
+ if (err < 0)
+ return err;
+
+ if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) {
+ NL_SET_ERR_MSG(extack, "Invalid protodown reason value");
+ return -EINVAL;
+ }
+
+ value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]);
+
+ if (pdreason[IFLA_PROTO_DOWN_REASON_MASK])
+ mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]);
+
+ netdev_change_proto_down_reason_locked(dev, mask, value);
+ }
+
+ if (nl_proto_down) {
+ proto_down = nla_get_u8(nl_proto_down);
+
+ /* Don't turn off protodown if there are active reasons */
+ if (!proto_down && dev->proto_down_reason) {
+ NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons");
+ return -EBUSY;
+ }
+ err = netif_change_proto_down(dev, proto_down);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
#define DO_SETLINK_MODIFIED 0x01
/* notify flag means notify + modified. */
#define DO_SETLINK_NOTIFY 0x03
-static int do_setlink(const struct sk_buff *skb,
- struct net_device *dev, struct ifinfomsg *ifm,
+static int do_setlink(const struct sk_buff *skb, struct net_device *dev,
+ struct net *tgt_net, struct ifinfomsg *ifm,
struct netlink_ext_ack *extack,
- struct nlattr **tb, char *ifname, int status)
+ struct nlattr **tb, int status)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ char ifname[IFNAMSIZ];
int err;
- err = validate_linkmsg(dev, tb);
+ err = validate_linkmsg(dev, tb, extack);
if (err < 0)
return err;
- if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) {
- struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev),
- tb, CAP_NET_ADMIN);
- if (IS_ERR(net)) {
- err = PTR_ERR(net);
- goto errout;
- }
+ if (tb[IFLA_IFNAME])
+ nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ else
+ ifname[0] = '\0';
- err = dev_change_net_namespace(dev, net, ifname);
- put_net(net);
+ if (!net_eq(tgt_net, dev_net(dev))) {
+ const char *pat = ifname[0] ? ifname : NULL;
+ int new_ifindex;
+
+ new_ifindex = nla_get_s32_default(tb[IFLA_NEW_IFINDEX], 0);
+
+ err = __dev_change_net_namespace(dev, tgt_net, pat,
+ new_ifindex, extack);
if (err)
- goto errout;
+ return err;
+
status |= DO_SETLINK_MODIFIED;
}
+ netdev_lock_ops(dev);
+
if (tb[IFLA_MAP]) {
struct rtnl_link_ifmap *u_map;
struct ifmap k_map;
@@ -2363,35 +3097,35 @@ static int do_setlink(const struct sk_buff *skb,
}
if (tb[IFLA_ADDRESS]) {
- struct sockaddr *sa;
- int len;
-
- len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len,
- sizeof(*sa));
- sa = kmalloc(len, GFP_KERNEL);
- if (!sa) {
- err = -ENOMEM;
+ struct sockaddr_storage ss = { };
+
+ netdev_unlock_ops(dev);
+
+ /* dev_addr_sem is an outer lock, enforce proper ordering */
+ down_write(&dev_addr_sem);
+ netdev_lock_ops(dev);
+
+ ss.ss_family = dev->type;
+ memcpy(ss.__data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len);
+ err = netif_set_mac_address(dev, &ss, extack);
+ if (err) {
+ up_write(&dev_addr_sem);
goto errout;
}
- sa->sa_family = dev->type;
- memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]),
- dev->addr_len);
- err = dev_set_mac_address(dev, sa);
- kfree(sa);
- if (err)
- goto errout;
status |= DO_SETLINK_MODIFIED;
+
+ up_write(&dev_addr_sem);
}
if (tb[IFLA_MTU]) {
- err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack);
+ err = netif_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack);
if (err < 0)
goto errout;
status |= DO_SETLINK_MODIFIED;
}
if (tb[IFLA_GROUP]) {
- dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+ netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
status |= DO_SETLINK_NOTIFY;
}
@@ -2401,15 +3135,15 @@ static int do_setlink(const struct sk_buff *skb,
* requested.
*/
if (ifm->ifi_index > 0 && ifname[0]) {
- err = dev_change_name(dev, ifname);
+ err = netif_change_name(dev, ifname);
if (err < 0)
goto errout;
status |= DO_SETLINK_MODIFIED;
}
if (tb[IFLA_IFALIAS]) {
- err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]),
- nla_len(tb[IFLA_IFALIAS]));
+ err = netif_set_alias(dev, nla_data(tb[IFLA_IFALIAS]),
+ nla_len(tb[IFLA_IFALIAS]));
if (err < 0)
goto errout;
status |= DO_SETLINK_NOTIFY;
@@ -2421,7 +3155,8 @@ static int do_setlink(const struct sk_buff *skb,
}
if (ifm->ifi_flags || ifm->ifi_change) {
- err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm));
+ err = netif_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
+ extack);
if (err < 0)
goto errout;
}
@@ -2434,7 +3169,7 @@ static int do_setlink(const struct sk_buff *skb,
}
if (tb[IFLA_CARRIER]) {
- err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
+ err = netif_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER]));
if (err)
goto errout;
status |= DO_SETLINK_MODIFIED;
@@ -2443,7 +3178,7 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_TXQLEN]) {
unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]);
- err = dev_change_tx_queue_len(dev, value);
+ err = netif_change_tx_queue_len(dev, value);
if (err)
goto errout;
status |= DO_SETLINK_MODIFIED;
@@ -2452,11 +3187,6 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_GSO_MAX_SIZE]) {
u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]);
- if (max_size > GSO_MAX_SIZE) {
- err = -EINVAL;
- goto errout;
- }
-
if (dev->gso_max_size ^ max_size) {
netif_set_gso_max_size(dev, max_size);
status |= DO_SETLINK_MODIFIED;
@@ -2466,13 +3196,35 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_GSO_MAX_SEGS]) {
u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
- if (max_segs > GSO_MAX_SEGS) {
- err = -EINVAL;
- goto errout;
+ if (dev->gso_max_segs ^ max_segs) {
+ netif_set_gso_max_segs(dev, max_segs);
+ status |= DO_SETLINK_MODIFIED;
}
+ }
- if (dev->gso_max_segs ^ max_segs) {
- dev->gso_max_segs = max_segs;
+ if (tb[IFLA_GRO_MAX_SIZE]) {
+ u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_MAX_SIZE]);
+
+ if (dev->gro_max_size ^ gro_max_size) {
+ netif_set_gro_max_size(dev, gro_max_size);
+ status |= DO_SETLINK_MODIFIED;
+ }
+ }
+
+ if (tb[IFLA_GSO_IPV4_MAX_SIZE]) {
+ u32 max_size = nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]);
+
+ if (dev->gso_ipv4_max_size ^ max_size) {
+ netif_set_gso_ipv4_max_size(dev, max_size);
+ status |= DO_SETLINK_MODIFIED;
+ }
+ }
+
+ if (tb[IFLA_GRO_IPV4_MAX_SIZE]) {
+ u32 gro_max_size = nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]);
+
+ if (dev->gro_ipv4_max_size ^ gro_max_size) {
+ netif_set_gro_ipv4_max_size(dev, gro_max_size);
status |= DO_SETLINK_MODIFIED;
}
}
@@ -2483,11 +3235,9 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_LINKMODE]) {
unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]);
- write_lock_bh(&dev_base_lock);
if (dev->link_mode ^ value)
status |= DO_SETLINK_NOTIFY;
- dev->link_mode = value;
- write_unlock_bh(&dev_base_lock);
+ WRITE_ONCE(dev->link_mode, value);
}
if (tb[IFLA_VFINFO_LIST]) {
@@ -2501,8 +3251,10 @@ static int do_setlink(const struct sk_buff *skb,
err = -EINVAL;
goto errout;
}
- err = nla_parse_nested(vfinfo, IFLA_VF_MAX, attr,
- ifla_vf_policy, NULL);
+ err = nla_parse_nested_deprecated(vfinfo, IFLA_VF_MAX,
+ attr,
+ ifla_vf_policy,
+ NULL);
if (err < 0)
goto errout;
err = do_setvfinfo(dev, vfinfo);
@@ -2529,8 +3281,10 @@ static int do_setlink(const struct sk_buff *skb,
err = -EINVAL;
goto errout;
}
- err = nla_parse_nested(port, IFLA_PORT_MAX, attr,
- ifla_port_policy, NULL);
+ err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX,
+ attr,
+ ifla_port_policy,
+ NULL);
if (err < 0)
goto errout;
if (!port[IFLA_PORT_VF]) {
@@ -2549,9 +3303,9 @@ static int do_setlink(const struct sk_buff *skb,
if (tb[IFLA_PORT_SELF]) {
struct nlattr *port[IFLA_PORT_MAX+1];
- err = nla_parse_nested(port, IFLA_PORT_MAX,
- tb[IFLA_PORT_SELF], ifla_port_policy,
- NULL);
+ err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX,
+ tb[IFLA_PORT_SELF],
+ ifla_port_policy, NULL);
if (err < 0)
goto errout;
@@ -2568,27 +3322,29 @@ static int do_setlink(const struct sk_buff *skb,
int rem;
nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) {
- const struct rtnl_af_ops *af_ops;
+ struct rtnl_af_ops *af_ops;
+ int af_ops_srcu_index;
- rcu_read_lock();
+ af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index);
+ if (!af_ops) {
+ err = -EAFNOSUPPORT;
+ goto errout;
+ }
- BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af))));
+ err = af_ops->set_link_af(dev, af, extack);
+ rtnl_af_put(af_ops, af_ops_srcu_index);
- err = af_ops->set_link_af(dev, af);
- if (err < 0) {
- rcu_read_unlock();
+ if (err < 0)
goto errout;
- }
- rcu_read_unlock();
status |= DO_SETLINK_NOTIFY;
}
}
err = 0;
- if (tb[IFLA_PROTO_DOWN]) {
- err = dev_change_proto_down(dev,
- nla_get_u8(tb[IFLA_PROTO_DOWN]));
+ if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) {
+ err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN],
+ tb[IFLA_PROTO_DOWN_REASON], extack);
if (err)
goto errout;
status |= DO_SETLINK_NOTIFY;
@@ -2598,8 +3354,9 @@ static int do_setlink(const struct sk_buff *skb,
struct nlattr *xdp[IFLA_XDP_MAX + 1];
u32 xdp_flags = 0;
- err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP],
- ifla_xdp_policy, NULL);
+ err = nla_parse_nested_deprecated(xdp, IFLA_XDP_MAX,
+ tb[IFLA_XDP],
+ ifla_xdp_policy, NULL);
if (err < 0)
goto errout;
@@ -2621,8 +3378,20 @@ static int do_setlink(const struct sk_buff *skb,
}
if (xdp[IFLA_XDP_FD]) {
+ int expected_fd = -1;
+
+ if (xdp_flags & XDP_FLAGS_REPLACE) {
+ if (!xdp[IFLA_XDP_EXPECTED_FD]) {
+ err = -EINVAL;
+ goto errout;
+ }
+ expected_fd =
+ nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]);
+ }
+
err = dev_change_xdp_fd(dev, extack,
nla_get_s32(xdp[IFLA_XDP_FD]),
+ expected_fd,
xdp_flags);
if (err)
goto errout;
@@ -2633,28 +3402,46 @@ static int do_setlink(const struct sk_buff *skb,
errout:
if (status & DO_SETLINK_MODIFIED) {
if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY)
- netdev_state_change(dev);
+ netif_state_change(dev);
if (err < 0)
net_warn_ratelimited("A link change request failed with some changes committed already. Interface %s may have been left with an inconsistent configuration, please check.\n",
dev->name);
}
+ netdev_unlock_ops(dev);
+
return err;
}
+static struct net_device *rtnl_dev_get(struct net *net,
+ struct nlattr *tb[])
+{
+ char ifname[ALTIFNAMSIZ];
+
+ if (tb[IFLA_IFNAME])
+ nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ else if (tb[IFLA_ALT_IFNAME])
+ nla_strscpy(ifname, tb[IFLA_ALT_IFNAME], ALTIFNAMSIZ);
+ else
+ return NULL;
+
+ return __dev_get_by_name(net, ifname);
+}
+
static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ struct ifinfomsg *ifm = nlmsg_data(nlh);
struct net *net = sock_net(skb->sk);
- struct ifinfomsg *ifm;
- struct net_device *dev;
- int err;
struct nlattr *tb[IFLA_MAX+1];
- char ifname[IFNAMSIZ];
+ struct net_device *dev = NULL;
+ struct rtnl_nets rtnl_nets;
+ struct net *tgt_net;
+ int err;
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy,
- extack);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
+ ifla_policy, extack);
if (err < 0)
goto errout;
@@ -2662,26 +3449,32 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
goto errout;
- if (tb[IFLA_IFNAME])
- nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
- else
- ifname[0] = '\0';
+ tgt_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN);
+ if (IS_ERR(tgt_net)) {
+ err = PTR_ERR(tgt_net);
+ goto errout;
+ }
+
+ rtnl_nets_init(&rtnl_nets);
+ rtnl_nets_add(&rtnl_nets, get_net(net));
+ rtnl_nets_add(&rtnl_nets, tgt_net);
+
+ rtnl_nets_lock(&rtnl_nets);
- err = -EINVAL;
- ifm = nlmsg_data(nlh);
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(net, ifm->ifi_index);
- else if (tb[IFLA_IFNAME])
- dev = __dev_get_by_name(net, ifname);
+ else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ dev = rtnl_dev_get(net, tb);
else
- goto errout;
+ err = -EINVAL;
- if (dev == NULL) {
+ if (dev)
+ err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0);
+ else if (!err)
err = -ENODEV;
- goto errout;
- }
- err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0);
+ rtnl_nets_unlock(&rtnl_nets);
+ rtnl_nets_destroy(&rtnl_nets);
errout:
return err;
}
@@ -2722,7 +3515,7 @@ static int rtnl_group_dellink(const struct net *net, int group)
return 0;
}
-int rtnl_delete_link(struct net_device *dev)
+int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh)
{
const struct rtnl_link_ops *ops;
LIST_HEAD(list_kill);
@@ -2732,7 +3525,7 @@ int rtnl_delete_link(struct net_device *dev)
return -EOPNOTSUPP;
ops->dellink(dev, &list_kill);
- unregister_netdevice_many(&list_kill);
+ unregister_netdevice_many_notify(&list_kill, portid, nlh);
return 0;
}
@@ -2741,16 +3534,17 @@ EXPORT_SYMBOL_GPL(rtnl_delete_link);
static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ struct ifinfomsg *ifm = nlmsg_data(nlh);
struct net *net = sock_net(skb->sk);
- struct net *tgt_net = net;
- struct net_device *dev = NULL;
- struct ifinfomsg *ifm;
- char ifname[IFNAMSIZ];
+ u32 portid = NETLINK_CB(skb).portid;
struct nlattr *tb[IFLA_MAX+1];
- int err;
+ struct net_device *dev = NULL;
+ struct net *tgt_net = net;
int netnsid = -1;
+ int err;
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
+ ifla_policy, extack);
if (err < 0)
return err;
@@ -2758,72 +3552,72 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
return err;
- if (tb[IFLA_IFNAME])
- nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
-
- if (tb[IFLA_IF_NETNSID]) {
- netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
- tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid);
+ if (tb[IFLA_TARGET_NETNSID]) {
+ netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]);
+ tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
if (IS_ERR(tgt_net))
return PTR_ERR(tgt_net);
}
- err = -EINVAL;
- ifm = nlmsg_data(nlh);
+ rtnl_net_lock(tgt_net);
+
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
- else if (tb[IFLA_IFNAME])
- dev = __dev_get_by_name(tgt_net, ifname);
+ else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ dev = rtnl_dev_get(tgt_net, tb);
+
+ if (dev)
+ err = rtnl_delete_link(dev, portid, nlh);
+ else if (ifm->ifi_index > 0 || tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ err = -ENODEV;
else if (tb[IFLA_GROUP])
err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP]));
else
- goto out;
-
- if (!dev) {
- if (tb[IFLA_IFNAME] || ifm->ifi_index > 0)
- err = -ENODEV;
+ err = -EINVAL;
- goto out;
- }
+ rtnl_net_unlock(tgt_net);
- err = rtnl_delete_link(dev);
-
-out:
if (netnsid >= 0)
put_net(tgt_net);
return err;
}
-int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm)
+int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm,
+ u32 portid, const struct nlmsghdr *nlh)
{
- unsigned int old_flags;
+ unsigned int old_flags, changed;
int err;
old_flags = dev->flags;
if (ifm && (ifm->ifi_flags || ifm->ifi_change)) {
- err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm));
+ err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm),
+ NULL);
if (err < 0)
return err;
}
- if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) {
- __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags));
- } else {
- dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
- __dev_notify_flags(dev, old_flags, ~0U);
+ changed = old_flags ^ dev->flags;
+ if (dev->rtnl_link_initializing) {
+ dev->rtnl_link_initializing = false;
+ changed = ~0U;
}
+
+ __dev_notify_flags(dev, old_flags, changed, portid, nlh);
return 0;
}
EXPORT_SYMBOL(rtnl_configure_link);
-struct net_device *rtnl_create_link(struct net *net,
- const char *ifname, unsigned char name_assign_type,
- const struct rtnl_link_ops *ops, struct nlattr *tb[])
+struct net_device *rtnl_create_link(struct net *net, const char *ifname,
+ unsigned char name_assign_type,
+ const struct rtnl_link_ops *ops,
+ struct nlattr *tb[],
+ struct netlink_ext_ack *extack)
{
struct net_device *dev;
unsigned int num_tx_queues = 1;
unsigned int num_rx_queues = 1;
+ int err;
if (tb[IFLA_NUM_TX_QUEUES])
num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]);
@@ -2835,26 +3629,53 @@ struct net_device *rtnl_create_link(struct net *net,
else if (ops->get_num_rx_queues)
num_rx_queues = ops->get_num_rx_queues();
- if (num_tx_queues < 1 || num_tx_queues > 4096)
+ if (num_tx_queues < 1 || num_tx_queues > 4096) {
+ NL_SET_ERR_MSG(extack, "Invalid number of transmit queues");
return ERR_PTR(-EINVAL);
+ }
- if (num_rx_queues < 1 || num_rx_queues > 4096)
+ if (num_rx_queues < 1 || num_rx_queues > 4096) {
+ NL_SET_ERR_MSG(extack, "Invalid number of receive queues");
return ERR_PTR(-EINVAL);
+ }
+
+ if (ops->alloc) {
+ dev = ops->alloc(tb, ifname, name_assign_type,
+ num_tx_queues, num_rx_queues);
+ if (IS_ERR(dev))
+ return dev;
+ } else {
+ dev = alloc_netdev_mqs(ops->priv_size, ifname,
+ name_assign_type, ops->setup,
+ num_tx_queues, num_rx_queues);
+ }
- dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type,
- ops->setup, num_tx_queues, num_rx_queues);
if (!dev)
return ERR_PTR(-ENOMEM);
+ err = validate_linkmsg(dev, tb, extack);
+ if (err < 0) {
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
+
dev_net_set(dev, net);
dev->rtnl_link_ops = ops;
- dev->rtnl_link_state = RTNL_LINK_INITIALIZING;
+ dev->rtnl_link_initializing = true;
+
+ if (tb[IFLA_MTU]) {
+ u32 mtu = nla_get_u32(tb[IFLA_MTU]);
- if (tb[IFLA_MTU])
- dev->mtu = nla_get_u32(tb[IFLA_MTU]);
+ err = dev_validate_mtu(dev, mtu, extack);
+ if (err) {
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
+ dev->mtu = mtu;
+ }
if (tb[IFLA_ADDRESS]) {
- memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]),
- nla_len(tb[IFLA_ADDRESS]));
+ __dev_addr_set(dev, nla_data(tb[IFLA_ADDRESS]),
+ nla_len(tb[IFLA_ADDRESS]));
dev->addr_assign_type = NET_ADDR_SET;
}
if (tb[IFLA_BROADCAST])
@@ -2867,28 +3688,106 @@ struct net_device *rtnl_create_link(struct net *net,
if (tb[IFLA_LINKMODE])
dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
if (tb[IFLA_GROUP])
- dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+ netif_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
if (tb[IFLA_GSO_MAX_SIZE])
netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE]));
if (tb[IFLA_GSO_MAX_SEGS])
- dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);
+ netif_set_gso_max_segs(dev, nla_get_u32(tb[IFLA_GSO_MAX_SEGS]));
+ if (tb[IFLA_GRO_MAX_SIZE])
+ netif_set_gro_max_size(dev, nla_get_u32(tb[IFLA_GRO_MAX_SIZE]));
+ if (tb[IFLA_GSO_IPV4_MAX_SIZE])
+ netif_set_gso_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GSO_IPV4_MAX_SIZE]));
+ if (tb[IFLA_GRO_IPV4_MAX_SIZE])
+ netif_set_gro_ipv4_max_size(dev, nla_get_u32(tb[IFLA_GRO_IPV4_MAX_SIZE]));
return dev;
}
EXPORT_SYMBOL(rtnl_create_link);
+struct rtnl_newlink_tbs {
+ struct nlattr *tb[IFLA_MAX + 1];
+ struct nlattr *linkinfo[IFLA_INFO_MAX + 1];
+ struct nlattr *attr[RTNL_MAX_TYPE + 1];
+ struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
+};
+
+static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh,
+ const struct rtnl_link_ops *ops,
+ struct net_device *dev, struct net *tgt_net,
+ struct rtnl_newlink_tbs *tbs,
+ struct nlattr **data,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr ** const linkinfo = tbs->linkinfo;
+ struct nlattr ** const tb = tbs->tb;
+ int status = 0;
+ int err;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ return -EEXIST;
+
+ if (nlh->nlmsg_flags & NLM_F_REPLACE)
+ return -EOPNOTSUPP;
+
+ if (linkinfo[IFLA_INFO_DATA]) {
+ if (!ops || ops != dev->rtnl_link_ops || !ops->changelink)
+ return -EOPNOTSUPP;
+
+ err = ops->changelink(dev, tb, data, extack);
+ if (err < 0)
+ return err;
+
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
+ const struct rtnl_link_ops *m_ops = NULL;
+ struct nlattr **slave_data = NULL;
+ struct net_device *master_dev;
+
+ master_dev = netdev_master_upper_dev_get(dev);
+ if (master_dev)
+ m_ops = master_dev->rtnl_link_ops;
+
+ if (!m_ops || !m_ops->slave_changelink)
+ return -EOPNOTSUPP;
+
+ if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
+ return -EINVAL;
+
+ if (m_ops->slave_maxtype) {
+ err = nla_parse_nested_deprecated(tbs->slave_attr,
+ m_ops->slave_maxtype,
+ linkinfo[IFLA_INFO_SLAVE_DATA],
+ m_ops->slave_policy, extack);
+ if (err < 0)
+ return err;
+
+ slave_data = tbs->slave_attr;
+ }
+
+ err = m_ops->slave_changelink(master_dev, dev, tb, slave_data, extack);
+ if (err < 0)
+ return err;
+
+ status |= DO_SETLINK_NOTIFY;
+ }
+
+ return do_setlink(skb, dev, tgt_net, nlmsg_data(nlh), extack, tb, status);
+}
+
static int rtnl_group_changelink(const struct sk_buff *skb,
- struct net *net, int group,
- struct ifinfomsg *ifm,
- struct netlink_ext_ack *extack,
- struct nlattr **tb)
+ struct net *net, struct net *tgt_net,
+ int group, struct ifinfomsg *ifm,
+ struct netlink_ext_ack *extack,
+ struct nlattr **tb)
{
struct net_device *dev, *aux;
int err;
for_each_netdev_safe(net, dev, aux) {
if (dev->group == group) {
- err = do_setlink(skb, dev, ifm, extack, tb, NULL, 0);
+ err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0);
if (err < 0)
return err;
}
@@ -2897,262 +3796,338 @@ static int rtnl_group_changelink(const struct sk_buff *skb,
return 0;
}
-static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
+static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm,
+ const struct rtnl_link_ops *ops,
+ struct net *tgt_net, struct net *link_net,
+ struct net *peer_net,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb, struct nlattr **data,
+ struct netlink_ext_ack *extack)
{
- struct net *net = sock_net(skb->sk);
- const struct rtnl_link_ops *ops;
- const struct rtnl_link_ops *m_ops = NULL;
+ unsigned char name_assign_type = NET_NAME_USER;
+ struct rtnl_newlink_params params = {
+ .src_net = sock_net(skb->sk),
+ .link_net = link_net,
+ .peer_net = peer_net,
+ .tb = tb,
+ .data = data,
+ };
+ u32 portid = NETLINK_CB(skb).portid;
struct net_device *dev;
- struct net_device *master_dev = NULL;
- struct ifinfomsg *ifm;
- char kind[MODULE_NAME_LEN];
char ifname[IFNAMSIZ];
- struct nlattr *tb[IFLA_MAX+1];
- struct nlattr *linkinfo[IFLA_INFO_MAX+1];
- unsigned char name_assign_type = NET_NAME_USER;
int err;
-#ifdef CONFIG_MODULES
-replay:
-#endif
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack);
- if (err < 0)
- return err;
+ if (!ops->alloc && !ops->setup)
+ return -EOPNOTSUPP;
- err = rtnl_ensure_unique_netns(tb, extack, false);
- if (err < 0)
- return err;
+ if (tb[IFLA_IFNAME]) {
+ nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ } else {
+ snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
+ name_assign_type = NET_NAME_ENUM;
+ }
- if (tb[IFLA_IFNAME])
- nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
+ dev = rtnl_create_link(tgt_net, ifname, name_assign_type, ops, tb,
+ extack);
+ if (IS_ERR(dev)) {
+ err = PTR_ERR(dev);
+ goto out;
+ }
+
+ dev->ifindex = ifm->ifi_index;
+
+ if (ops->newlink)
+ err = ops->newlink(dev, &params, extack);
else
- ifname[0] = '\0';
+ err = register_netdevice(dev);
+ if (err < 0) {
+ free_netdev(dev);
+ goto out;
+ }
- ifm = nlmsg_data(nlh);
- if (ifm->ifi_index > 0)
- dev = __dev_get_by_index(net, ifm->ifi_index);
- else {
- if (ifname[0])
- dev = __dev_get_by_name(net, ifname);
- else
- dev = NULL;
+ netdev_lock_ops(dev);
+
+ err = rtnl_configure_link(dev, ifm, portid, nlh);
+ if (err < 0)
+ goto out_unregister;
+ if (tb[IFLA_MASTER]) {
+ err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack);
+ if (err)
+ goto out_unregister;
}
- if (dev) {
- master_dev = netdev_master_upper_dev_get(dev);
- if (master_dev)
- m_ops = master_dev->rtnl_link_ops;
+ netdev_unlock_ops(dev);
+out:
+ return err;
+out_unregister:
+ netdev_unlock_ops(dev);
+ if (ops->newlink) {
+ LIST_HEAD(list_kill);
+
+ ops->dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
+ } else {
+ unregister_netdevice(dev);
}
+ goto out;
+}
+
+static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops,
+ struct nlattr *tbp[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_MAX + 1];
+ int err;
+
+ if (!data || !data[ops->peer_type])
+ return rtnl_link_get_net_ifla(tbp);
- err = validate_linkmsg(dev, tb);
+ err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack);
if (err < 0)
- return err;
+ return ERR_PTR(err);
- if (tb[IFLA_LINKINFO]) {
- err = nla_parse_nested(linkinfo, IFLA_INFO_MAX,
- tb[IFLA_LINKINFO], ifla_info_policy,
- NULL);
+ if (ops->validate) {
+ err = ops->validate(tb, NULL, extack);
if (err < 0)
- return err;
- } else
- memset(linkinfo, 0, sizeof(linkinfo));
+ return ERR_PTR(err);
+ }
- if (linkinfo[IFLA_INFO_KIND]) {
- nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
- ops = rtnl_link_ops_get(kind);
+ return rtnl_link_get_net_ifla(tb);
+}
+
+static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ const struct rtnl_link_ops *ops,
+ struct net *tgt_net, struct net *link_net,
+ struct net *peer_net,
+ struct rtnl_newlink_tbs *tbs,
+ struct nlattr **data,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr ** const tb = tbs->tb;
+ struct net *net = sock_net(skb->sk);
+ struct net *device_net;
+ struct net_device *dev;
+ struct ifinfomsg *ifm;
+ bool link_specified;
+
+ /* When creating, lookup for existing device in target net namespace */
+ device_net = (nlh->nlmsg_flags & NLM_F_CREATE) &&
+ (nlh->nlmsg_flags & NLM_F_EXCL) ?
+ tgt_net : net;
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0) {
+ link_specified = true;
+ dev = __dev_get_by_index(device_net, ifm->ifi_index);
+ } else if (ifm->ifi_index < 0) {
+ NL_SET_ERR_MSG(extack, "ifindex can't be negative");
+ return -EINVAL;
+ } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) {
+ link_specified = true;
+ dev = rtnl_dev_get(device_net, tb);
} else {
- kind[0] = '\0';
- ops = NULL;
+ link_specified = false;
+ dev = NULL;
}
- if (1) {
- struct nlattr *attr[RTNL_MAX_TYPE + 1];
- struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1];
- struct nlattr **data = NULL;
- struct nlattr **slave_data = NULL;
- struct net *dest_net, *link_net = NULL;
+ if (dev)
+ return rtnl_changelink(skb, nlh, ops, dev, tgt_net, tbs, data, extack);
- if (ops) {
- if (ops->maxtype > RTNL_MAX_TYPE)
- return -EINVAL;
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+ /* No dev found and NLM_F_CREATE not set. Requested dev does not exist,
+ * or it's for a group
+ */
+ if (link_specified || !tb[IFLA_GROUP])
+ return -ENODEV;
- if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
- err = nla_parse_nested(attr, ops->maxtype,
- linkinfo[IFLA_INFO_DATA],
- ops->policy, NULL);
- if (err < 0)
- return err;
- data = attr;
- }
- if (ops->validate) {
- err = ops->validate(tb, data, extack);
- if (err < 0)
- return err;
- }
- }
+ return rtnl_group_changelink(skb, net, tgt_net,
+ nla_get_u32(tb[IFLA_GROUP]),
+ ifm, extack, tb);
+ }
- if (m_ops) {
- if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)
- return -EINVAL;
+ if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
+ return -EOPNOTSUPP;
- if (m_ops->slave_maxtype &&
- linkinfo[IFLA_INFO_SLAVE_DATA]) {
- err = nla_parse_nested(slave_attr,
- m_ops->slave_maxtype,
- linkinfo[IFLA_INFO_SLAVE_DATA],
- m_ops->slave_policy,
- NULL);
- if (err < 0)
- return err;
- slave_data = slave_attr;
- }
- }
+ if (!ops) {
+ NL_SET_ERR_MSG(extack, "Unknown device type");
+ return -EOPNOTSUPP;
+ }
- if (dev) {
- int status = 0;
+ return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, peer_net, nlh,
+ tb, data, extack);
+}
- if (nlh->nlmsg_flags & NLM_F_EXCL)
- return -EEXIST;
- if (nlh->nlmsg_flags & NLM_F_REPLACE)
- return -EOPNOTSUPP;
+static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *tgt_net, *link_net = NULL, *peer_net = NULL;
+ struct nlattr **tb, **linkinfo, **data = NULL;
+ struct rtnl_link_ops *ops = NULL;
+ struct rtnl_newlink_tbs *tbs;
+ struct rtnl_nets rtnl_nets;
+ int ops_srcu_index;
+ int ret;
- if (linkinfo[IFLA_INFO_DATA]) {
- if (!ops || ops != dev->rtnl_link_ops ||
- !ops->changelink)
- return -EOPNOTSUPP;
+ tbs = kmalloc(sizeof(*tbs), GFP_KERNEL);
+ if (!tbs)
+ return -ENOMEM;
- err = ops->changelink(dev, tb, data, extack);
- if (err < 0)
- return err;
- status |= DO_SETLINK_NOTIFY;
- }
+ tb = tbs->tb;
+ ret = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb,
+ IFLA_MAX, ifla_policy, extack);
+ if (ret < 0)
+ goto free;
- if (linkinfo[IFLA_INFO_SLAVE_DATA]) {
- if (!m_ops || !m_ops->slave_changelink)
- return -EOPNOTSUPP;
+ ret = rtnl_ensure_unique_netns(tb, extack, false);
+ if (ret < 0)
+ goto free;
- err = m_ops->slave_changelink(master_dev, dev,
- tb, slave_data,
- extack);
- if (err < 0)
- return err;
- status |= DO_SETLINK_NOTIFY;
- }
+ linkinfo = tbs->linkinfo;
+ if (tb[IFLA_LINKINFO]) {
+ ret = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX,
+ tb[IFLA_LINKINFO],
+ ifla_info_policy, NULL);
+ if (ret < 0)
+ goto free;
+ } else {
+ memset(linkinfo, 0, sizeof(tbs->linkinfo));
+ }
- return do_setlink(skb, dev, ifm, extack, tb, ifname,
- status);
- }
+ if (linkinfo[IFLA_INFO_KIND]) {
+ char kind[MODULE_NAME_LEN];
- if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
- if (ifm->ifi_index == 0 && tb[IFLA_GROUP])
- return rtnl_group_changelink(skb, net,
- nla_get_u32(tb[IFLA_GROUP]),
- ifm, extack, tb);
- return -ENODEV;
+ nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind));
+ ops = rtnl_link_ops_get(kind, &ops_srcu_index);
+#ifdef CONFIG_MODULES
+ if (!ops) {
+ request_module("rtnl-link-%s", kind);
+ ops = rtnl_link_ops_get(kind, &ops_srcu_index);
}
+#endif
+ }
- if (tb[IFLA_MAP] || tb[IFLA_PROTINFO])
- return -EOPNOTSUPP;
+ rtnl_nets_init(&rtnl_nets);
- if (!ops) {
-#ifdef CONFIG_MODULES
- if (kind[0]) {
- __rtnl_unlock();
- request_module("rtnl-link-%s", kind);
- rtnl_lock();
- ops = rtnl_link_ops_get(kind);
- if (ops)
- goto replay;
- }
-#endif
- return -EOPNOTSUPP;
+ if (ops) {
+ if (ops->maxtype > RTNL_MAX_TYPE) {
+ ret = -EINVAL;
+ goto put_ops;
}
- if (!ops->setup)
- return -EOPNOTSUPP;
+ if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) {
+ ret = nla_parse_nested_deprecated(tbs->attr, ops->maxtype,
+ linkinfo[IFLA_INFO_DATA],
+ ops->policy, extack);
+ if (ret < 0)
+ goto put_ops;
- if (!ifname[0]) {
- snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind);
- name_assign_type = NET_NAME_ENUM;
+ data = tbs->attr;
}
- dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN);
- if (IS_ERR(dest_net))
- return PTR_ERR(dest_net);
-
- if (tb[IFLA_LINK_NETNSID]) {
- int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
+ if (ops->validate) {
+ ret = ops->validate(tb, data, extack);
+ if (ret < 0)
+ goto put_ops;
+ }
- link_net = get_net_ns_by_id(dest_net, id);
- if (!link_net) {
- err = -EINVAL;
- goto out;
+ if (ops->peer_type) {
+ peer_net = rtnl_get_peer_net(ops, tb, data, extack);
+ if (IS_ERR(peer_net)) {
+ ret = PTR_ERR(peer_net);
+ goto put_ops;
}
- err = -EPERM;
- if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN))
- goto out;
+ if (peer_net)
+ rtnl_nets_add(&rtnl_nets, peer_net);
}
+ }
- dev = rtnl_create_link(link_net ? : dest_net, ifname,
- name_assign_type, ops, tb);
- if (IS_ERR(dev)) {
- err = PTR_ERR(dev);
- goto out;
- }
+ tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN);
+ if (IS_ERR(tgt_net)) {
+ ret = PTR_ERR(tgt_net);
+ goto put_net;
+ }
- dev->ifindex = ifm->ifi_index;
+ rtnl_nets_add(&rtnl_nets, tgt_net);
- if (ops->newlink) {
- err = ops->newlink(link_net ? : net, dev, tb, data,
- extack);
- /* Drivers should call free_netdev() in ->destructor
- * and unregister it on failure after registration
- * so that device could be finally freed in rtnl_unlock.
- */
- if (err < 0) {
- /* If device is not registered at all, free it now */
- if (dev->reg_state == NETREG_UNINITIALIZED)
- free_netdev(dev);
- goto out;
- }
- } else {
- err = register_netdevice(dev);
- if (err < 0) {
- free_netdev(dev);
- goto out;
- }
- }
- err = rtnl_configure_link(dev, ifm);
- if (err < 0)
- goto out_unregister;
- if (link_net) {
- err = dev_change_net_namespace(dev, dest_net, ifname);
- if (err < 0)
- goto out_unregister;
+ if (tb[IFLA_LINK_NETNSID]) {
+ int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
+
+ link_net = get_net_ns_by_id(tgt_net, id);
+ if (!link_net) {
+ NL_SET_ERR_MSG(extack, "Unknown network namespace id");
+ ret = -EINVAL;
+ goto put_net;
}
- if (tb[IFLA_MASTER]) {
- err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]),
- extack);
- if (err)
- goto out_unregister;
+
+ rtnl_nets_add(&rtnl_nets, link_net);
+
+ if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ goto put_net;
}
-out:
- if (link_net)
- put_net(link_net);
- put_net(dest_net);
+ }
+
+ rtnl_nets_lock(&rtnl_nets);
+ ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, peer_net, tbs, data, extack);
+ rtnl_nets_unlock(&rtnl_nets);
+
+put_net:
+ rtnl_nets_destroy(&rtnl_nets);
+put_ops:
+ if (ops)
+ rtnl_link_ops_put(ops, ops_srcu_index);
+free:
+ kfree(tbs);
+ return ret;
+}
+
+static int rtnl_valid_getlink_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct ifinfomsg *ifm;
+ int i, err;
+
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for get link");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX,
+ ifla_policy, extack);
+
+ if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
+ ifm->ifi_change) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for get link request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX,
+ ifla_policy, extack);
+ if (err)
return err;
-out_unregister:
- if (ops->newlink) {
- LIST_HEAD(list_kill);
- ops->dellink(dev, &list_kill);
- unregister_netdevice_many(&list_kill);
- } else {
- unregister_netdevice(dev);
+ for (i = 0; i <= IFLA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case IFLA_IFNAME:
+ case IFLA_ALT_IFNAME:
+ case IFLA_EXT_MASK:
+ case IFLA_TARGET_NETNSID:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in get link request");
+ return -EINVAL;
}
- goto out;
}
+
+ return 0;
}
static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -3161,7 +4136,6 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *net = sock_net(skb->sk);
struct net *tgt_net = net;
struct ifinfomsg *ifm;
- char ifname[IFNAMSIZ];
struct nlattr *tb[IFLA_MAX+1];
struct net_device *dev = NULL;
struct sk_buff *nskb;
@@ -3169,7 +4143,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
int err;
u32 ext_filter_mask = 0;
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack);
+ err = rtnl_valid_getlink_req(skb, nlh, tb, extack);
if (err < 0)
return err;
@@ -3177,16 +4151,13 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
return err;
- if (tb[IFLA_IF_NETNSID]) {
- netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]);
- tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid);
+ if (tb[IFLA_TARGET_NETNSID]) {
+ netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]);
+ tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid);
if (IS_ERR(tgt_net))
return PTR_ERR(tgt_net);
}
- if (tb[IFLA_IFNAME])
- nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
-
if (tb[IFLA_EXT_MASK])
ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
@@ -3194,8 +4165,8 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
ifm = nlmsg_data(nlh);
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
- else if (tb[IFLA_IFNAME])
- dev = __dev_get_by_name(tgt_net, ifname);
+ else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ dev = rtnl_dev_get(tgt_net, tb);
else
goto out;
@@ -3204,14 +4175,22 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
goto out;
err = -ENOBUFS;
- nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL);
+ nskb = nlmsg_new_large(if_nlmsg_size(dev, ext_filter_mask));
if (nskb == NULL)
goto out;
+ /* Synchronize the carrier state so we don't report a state
+ * that we're not actually going to honour immediately; if
+ * the driver just did a carrier off->on transition, we can
+ * only TX if link watch work has run, but without this we'd
+ * already report carrier on, even if it doesn't work yet.
+ */
+ linkwatch_sync_dev(dev);
+
err = rtnl_fill_ifinfo(nskb, dev, net,
RTM_NEWLINK, NETLINK_CB(skb).portid,
nlh->nlmsg_seq, 0, 0, ext_filter_mask,
- 0, NULL, 0, netnsid);
+ 0, NULL, 0, netnsid, GFP_KERNEL);
if (err < 0) {
/* -EMSGSIZE implies BUG in if_nlmsg_size */
WARN_ON(err == -EMSGSIZE);
@@ -3225,22 +4204,130 @@ out:
return err;
}
-static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr,
+ bool *changed, struct netlink_ext_ack *extack)
+{
+ char *alt_ifname;
+ size_t size;
+ int err;
+
+ err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack);
+ if (err)
+ return err;
+
+ if (cmd == RTM_NEWLINKPROP) {
+ size = rtnl_prop_list_size(dev);
+ size += nla_total_size(ALTIFNAMSIZ);
+ if (size >= U16_MAX) {
+ NL_SET_ERR_MSG(extack,
+ "effective property list too long");
+ return -EINVAL;
+ }
+ }
+
+ alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT);
+ if (!alt_ifname)
+ return -ENOMEM;
+
+ if (cmd == RTM_NEWLINKPROP) {
+ err = netdev_name_node_alt_create(dev, alt_ifname);
+ if (!err)
+ alt_ifname = NULL;
+ } else if (cmd == RTM_DELLINKPROP) {
+ err = netdev_name_node_alt_destroy(dev, alt_ifname);
+ } else {
+ WARN_ON_ONCE(1);
+ err = -EINVAL;
+ }
+
+ kfree(alt_ifname);
+ if (!err)
+ *changed = true;
+ return err;
+}
+
+static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[IFLA_MAX + 1];
struct net_device *dev;
- struct nlattr *tb[IFLA_MAX+1];
+ struct ifinfomsg *ifm;
+ bool changed = false;
+ struct nlattr *attr;
+ int err, rem;
+
+ err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack);
+ if (err)
+ return err;
+
+ err = rtnl_ensure_unique_netns(tb, extack, true);
+ if (err)
+ return err;
+
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifi_index > 0)
+ dev = __dev_get_by_index(net, ifm->ifi_index);
+ else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
+ dev = rtnl_dev_get(net, tb);
+ else
+ return -EINVAL;
+
+ if (!dev)
+ return -ENODEV;
+
+ if (!tb[IFLA_PROP_LIST])
+ return 0;
+
+ nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) {
+ switch (nla_type(attr)) {
+ case IFLA_ALT_IFNAME:
+ err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack);
+ if (err)
+ return err;
+ break;
+ }
+ }
+
+ if (changed)
+ netdev_state_change(dev);
+ return 0;
+}
+
+static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack);
+}
+
+static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack);
+}
+
+static noinline_for_stack u32 rtnl_calcit(struct sk_buff *skb,
+ struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ size_t min_ifinfo_dump_size = 0;
u32 ext_filter_mask = 0;
- u16 min_ifinfo_dump_size = 0;
- int hdrlen;
+ struct net_device *dev;
+ struct nlattr *nla;
+ int hdrlen, rem;
/* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */
hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ?
sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg);
- if (nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) {
- if (tb[IFLA_EXT_MASK])
- ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+ if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen))
+ return NLMSG_GOODSIZE;
+
+ nla_for_each_attr_type(nla, IFLA_EXT_MASK,
+ nlmsg_attrdata(nlh, hdrlen),
+ nlmsg_attrlen(nlh, hdrlen), rem) {
+ if (nla_len(nla) == sizeof(u32))
+ ext_filter_mask = nla_get_u32(nla);
}
if (!ext_filter_mask)
@@ -3251,9 +4338,8 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh)
*/
rcu_read_lock();
for_each_netdev_rcu(net, dev) {
- min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size,
- if_nlmsg_size(dev,
- ext_filter_mask));
+ min_ifinfo_dump_size = max(min_ifinfo_dump_size,
+ if_nlmsg_size(dev, ext_filter_mask));
}
rcu_read_unlock();
@@ -3264,13 +4350,14 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
{
int idx;
int s_idx = cb->family;
+ int type = cb->nlh->nlmsg_type - RTM_BASE;
+ int ret = 0;
if (s_idx == 0)
s_idx = 1;
for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) {
- struct rtnl_link **tab;
- int type = cb->nlh->nlmsg_type-RTM_BASE;
+ struct rtnl_link __rcu **tab;
struct rtnl_link *link;
rtnl_dumpit_func dumpit;
@@ -3284,7 +4371,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
if (!tab)
continue;
- link = tab[type];
+ link = rcu_dereference_rtnl(tab[type]);
if (!link)
continue;
@@ -3297,31 +4384,38 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
cb->prev_seq = 0;
cb->seq = 0;
}
- if (dumpit(skb, cb))
+ ret = dumpit(skb, cb);
+ if (ret)
break;
}
cb->family = idx;
- return skb->len;
+ return skb->len ? : ret;
}
struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
unsigned int change,
u32 event, gfp_t flags, int *new_nsid,
- int new_ifindex)
+ int new_ifindex, u32 portid,
+ const struct nlmsghdr *nlh)
{
struct net *net = dev_net(dev);
struct sk_buff *skb;
int err = -ENOBUFS;
- size_t if_info_size;
+ u32 seq = 0;
- skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), flags);
+ skb = nlmsg_new(if_nlmsg_size(dev, 0), flags);
if (skb == NULL)
goto errout;
+ if (nlmsg_report(nlh))
+ seq = nlmsg_seq(nlh);
+ else
+ portid = 0;
+
err = rtnl_fill_ifinfo(skb, dev, dev_net(dev),
- type, 0, 0, change, 0, 0, event,
- new_nsid, new_ifindex, -1);
+ type, portid, seq, change, 0, 0, event,
+ new_nsid, new_ifindex, -1, flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in if_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -3330,21 +4424,22 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev,
}
return skb;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+ rtnl_set_sk_err(net, RTNLGRP_LINK, err);
return NULL;
}
-void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags)
+void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags,
+ u32 portid, const struct nlmsghdr *nlh)
{
struct net *net = dev_net(dev);
- rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags);
+ rtnl_notify(skb, net, portid, RTNLGRP_LINK, nlh, flags);
}
static void rtmsg_ifinfo_event(int type, struct net_device *dev,
unsigned int change, u32 event,
- gfp_t flags, int *new_nsid, int new_ifindex)
+ gfp_t flags, int *new_nsid, int new_ifindex,
+ u32 portid, const struct nlmsghdr *nlh)
{
struct sk_buff *skb;
@@ -3352,23 +4447,23 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev,
return;
skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid,
- new_ifindex);
+ new_ifindex, portid, nlh);
if (skb)
- rtmsg_ifinfo_send(skb, dev, flags);
+ rtmsg_ifinfo_send(skb, dev, flags, portid, nlh);
}
void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
- gfp_t flags)
+ gfp_t flags, u32 portid, const struct nlmsghdr *nlh)
{
rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
- NULL, 0);
+ NULL, 0, portid, nlh);
}
void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change,
gfp_t flags, int *new_nsid, int new_ifindex)
{
rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags,
- new_nsid, new_ifindex);
+ new_nsid, new_ifindex, 0, NULL);
}
static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
@@ -3393,7 +4488,7 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
ndm->ndm_ifindex = dev->ifindex;
ndm->ndm_state = ndm_state;
- if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr))
+ if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr))
goto nla_put_failure;
if (vid)
if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid))
@@ -3407,10 +4502,10 @@ nla_put_failure:
return -EMSGSIZE;
}
-static inline size_t rtnl_fdb_nlmsg_size(void)
+static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev)
{
return NLMSG_ALIGN(sizeof(struct ndmsg)) +
- nla_total_size(ETH_ALEN) + /* NDA_LLADDR */
+ nla_total_size(dev->addr_len) + /* NDA_LLADDR */
nla_total_size(sizeof(u16)) + /* NDA_VLAN */
0;
}
@@ -3422,7 +4517,7 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type,
struct sk_buff *skb;
int err = -ENOBUFS;
- skb = nlmsg_new(rtnl_fdb_nlmsg_size(), GFP_ATOMIC);
+ skb = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC);
if (!skb)
goto errout;
@@ -3439,7 +4534,7 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
-/**
+/*
* ndo_dflt_fdb_add - default netdevice operation to add an FDB entry
*/
int ndo_dflt_fdb_add(struct ndmsg *ndm,
@@ -3454,12 +4549,17 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm,
* implement its own handler for this.
*/
if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) {
- pr_info("%s: FDB only supports static addresses\n", dev->name);
+ netdev_info(dev, "default FDB implementation only supports local addresses\n");
+ return err;
+ }
+
+ if (tb[NDA_FLAGS_EXT]) {
+ netdev_info(dev, "invalid flags given to default FDB implementation\n");
return err;
}
if (vid) {
- pr_info("%s: vlans aren't supported yet for dev_uc|mc_add()\n", dev->name);
+ netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n");
return err;
}
@@ -3509,7 +4609,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
u16 vid;
int err;
- err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL,
+ extack);
if (err < 0)
return err;
@@ -3530,6 +4631,11 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
return -EINVAL;
}
+ if (dev->type != ARPHRD_ETHER) {
+ NL_SET_ERR_MSG(extack, "FDB add only supported for Ethernet devices");
+ return -EINVAL;
+ }
+
addr = nla_data(tb[NDA_LLADDR]);
err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
@@ -3540,12 +4646,13 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Support fdb on master device the net/bridge default case */
if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
- (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ netif_is_bridge_port(dev)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
const struct net_device_ops *ops = br_dev->netdev_ops;
+ bool notified = false;
err = ops->ndo_fdb_add(ndm, tb, dev, addr, vid,
- nlh->nlmsg_flags);
+ nlh->nlmsg_flags, &notified, extack);
if (err)
goto out;
else
@@ -3554,15 +4661,18 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Embedded bridge, macvlan, and any other device support */
if ((ndm->ndm_flags & NTF_SELF)) {
+ bool notified = false;
+
if (dev->netdev_ops->ndo_fdb_add)
err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr,
vid,
- nlh->nlmsg_flags);
+ nlh->nlmsg_flags,
+ &notified, extack);
else
err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid,
nlh->nlmsg_flags);
- if (!err) {
+ if (!err && !notified) {
rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH,
ndm->ndm_state);
ndm->ndm_flags &= ~NTF_SELF;
@@ -3572,7 +4682,7 @@ out:
return err;
}
-/**
+/*
* ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry
*/
int ndo_dflt_fdb_del(struct ndmsg *ndm,
@@ -3586,7 +4696,7 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm,
* implement its own handler for this.
*/
if (!(ndm->ndm_state & NUD_PERMANENT)) {
- pr_info("%s: FDB only supports static addresses\n", dev->name);
+ netdev_info(dev, "default FDB implementation only supports local addresses\n");
return err;
}
@@ -3602,18 +4712,25 @@ EXPORT_SYMBOL(ndo_dflt_fdb_del);
static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK);
struct net *net = sock_net(skb->sk);
+ const struct net_device_ops *ops;
struct ndmsg *ndm;
struct nlattr *tb[NDA_MAX+1];
struct net_device *dev;
- int err = -EINVAL;
- __u8 *addr;
+ __u8 *addr = NULL;
+ int err;
u16 vid;
- if (!netlink_capable(skb, CAP_NET_ADMIN))
- return -EPERM;
-
- err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack);
+ if (!del_bulk) {
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX,
+ NULL, extack);
+ } else {
+ /* For bulk delete, the drivers will parse the message with
+ * policy.
+ */
+ err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack);
+ }
if (err < 0)
return err;
@@ -3629,27 +4746,40 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
return -ENODEV;
}
- if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
- NL_SET_ERR_MSG(extack, "invalid address");
- return -EINVAL;
- }
+ if (!del_bulk) {
+ if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) {
+ NL_SET_ERR_MSG(extack, "invalid address");
+ return -EINVAL;
+ }
+ addr = nla_data(tb[NDA_LLADDR]);
- addr = nla_data(tb[NDA_LLADDR]);
+ err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
+ if (err)
+ return err;
+ }
- err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack);
- if (err)
- return err;
+ if (dev->type != ARPHRD_ETHER) {
+ NL_SET_ERR_MSG(extack, "FDB delete only supported for Ethernet devices");
+ return -EINVAL;
+ }
err = -EOPNOTSUPP;
/* Support fdb on master device the net/bridge default case */
if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) &&
- (dev->priv_flags & IFF_BRIDGE_PORT)) {
+ netif_is_bridge_port(dev)) {
struct net_device *br_dev = netdev_master_upper_dev_get(dev);
- const struct net_device_ops *ops = br_dev->netdev_ops;
+ bool notified = false;
- if (ops->ndo_fdb_del)
- err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid);
+ ops = br_dev->netdev_ops;
+ if (!del_bulk) {
+ if (ops->ndo_fdb_del)
+ err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid,
+ &notified, extack);
+ } else {
+ if (ops->ndo_fdb_del_bulk)
+ err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
+ }
if (err)
goto out;
@@ -3659,15 +4789,26 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
/* Embedded bridge, macvlan, and any other device support */
if (ndm->ndm_flags & NTF_SELF) {
- if (dev->netdev_ops->ndo_fdb_del)
- err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr,
- vid);
- else
- err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
+ bool notified = false;
+
+ ops = dev->netdev_ops;
+ if (!del_bulk) {
+ if (ops->ndo_fdb_del)
+ err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid,
+ &notified, extack);
+ else
+ err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
+ } else {
+ /* in case err was cleared by NTF_MASTER call */
+ err = -EOPNOTSUPP;
+ if (ops->ndo_fdb_del_bulk)
+ err = ops->ndo_fdb_del_bulk(nlh, dev, extack);
+ }
if (!err) {
- rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH,
- ndm->ndm_state);
+ if (!del_bulk && !notified)
+ rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH,
+ ndm->ndm_state);
ndm->ndm_flags &= ~NTF_SELF;
}
}
@@ -3681,15 +4822,16 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
int *idx,
struct netdev_hw_addr_list *list)
{
+ struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
struct netdev_hw_addr *ha;
- int err;
u32 portid, seq;
+ int err;
portid = NETLINK_CB(cb->skb).portid;
seq = cb->nlh->nlmsg_seq;
list_for_each_entry(ha, &list->list, list) {
- if (*idx < cb->args[2])
+ if (*idx < ctx->fdb_idx)
goto skip;
err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0,
@@ -3706,8 +4848,11 @@ skip:
/**
* ndo_dflt_fdb_dump - default netdevice operation to dump an FDB table.
- * @nlh: netlink message header
+ * @skb: socket buffer to store message in
+ * @cb: netlink callback
* @dev: netdevice
+ * @filter_dev: ignored
+ * @idx: the number of FDB table entries dumped is added to *@idx
*
* Default netdevice operation to dump the existing unicast address list.
* Returns number of addresses from list put in skb.
@@ -3720,6 +4865,9 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,
{
int err;
+ if (dev->type != ARPHRD_ETHER)
+ return -EINVAL;
+
netif_addr_lock_bh(dev);
err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc);
if (err)
@@ -3731,22 +4879,66 @@ out:
}
EXPORT_SYMBOL(ndo_dflt_fdb_dump);
-static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+static int valid_fdb_dump_strict(const struct nlmsghdr *nlh,
+ int *br_idx, int *brport_idx,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[NDA_MAX + 1];
+ struct ndmsg *ndm;
+ int err, i;
+
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request");
+ return -EINVAL;
+ }
+
+ if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
+ ndm->ndm_flags || ndm->ndm_type) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb,
+ NDA_MAX, NULL, extack);
+ if (err < 0)
+ return err;
+
+ *brport_idx = ndm->ndm_ifindex;
+ for (i = 0; i <= NDA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NDA_IFINDEX:
+ if (nla_len(tb[i]) != sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request");
+ return -EINVAL;
+ }
+ *brport_idx = nla_get_u32(tb[NDA_IFINDEX]);
+ break;
+ case NDA_MASTER:
+ if (nla_len(tb[i]) != sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request");
+ return -EINVAL;
+ }
+ *br_idx = nla_get_u32(tb[NDA_MASTER]);
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh,
+ int *br_idx, int *brport_idx,
+ struct netlink_ext_ack *extack)
{
- struct net_device *dev;
struct nlattr *tb[IFLA_MAX+1];
- struct net_device *br_dev = NULL;
- const struct net_device_ops *ops = NULL;
- const struct net_device_ops *cops = NULL;
- struct ifinfomsg *ifm = nlmsg_data(cb->nlh);
- struct net *net = sock_net(skb->sk);
- struct hlist_head *head;
- int brport_idx = 0;
- int br_idx = 0;
- int h, s_h;
- int idx = 0, s_idx;
- int err = 0;
- int fidx = 0;
+ int err;
/* A hack to preserve kernel<->userspace interface.
* Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0.
@@ -3755,20 +4947,48 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
* Fortunately these sizes don't conflict with the size of ifinfomsg
* with an optional attribute.
*/
- if (nlmsg_len(cb->nlh) != sizeof(struct ndmsg) &&
- (nlmsg_len(cb->nlh) != sizeof(struct ndmsg) +
+ if (nlmsg_len(nlh) != sizeof(struct ndmsg) &&
+ (nlmsg_len(nlh) != sizeof(struct ndmsg) +
nla_attr_size(sizeof(u32)))) {
- err = nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb,
- IFLA_MAX, ifla_policy, NULL);
+ struct ifinfomsg *ifm;
+
+ err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg),
+ tb, IFLA_MAX, ifla_policy,
+ extack);
if (err < 0) {
return -EINVAL;
} else if (err == 0) {
if (tb[IFLA_MASTER])
- br_idx = nla_get_u32(tb[IFLA_MASTER]);
+ *br_idx = nla_get_u32(tb[IFLA_MASTER]);
}
- brport_idx = ifm->ifi_index;
+ ifm = nlmsg_data(nlh);
+ *brport_idx = ifm->ifi_index;
}
+ return 0;
+}
+
+static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const struct net_device_ops *ops = NULL, *cops = NULL;
+ struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
+ struct net_device *dev, *br_dev = NULL;
+ struct net *net = sock_net(skb->sk);
+ int brport_idx = 0;
+ int br_idx = 0;
+ int fidx = 0;
+ int err;
+
+ NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context);
+
+ if (cb->strict_check)
+ err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx,
+ cb->extack);
+ else
+ err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx,
+ cb->extack);
+ if (err < 0)
+ return err;
if (br_idx) {
br_dev = __dev_get_by_index(net, br_idx);
@@ -3778,72 +4998,212 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
ops = br_dev->netdev_ops;
}
- s_h = cb->args[0];
- s_idx = cb->args[1];
-
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &net->dev_index_head[h];
- hlist_for_each_entry(dev, head, index_hlist) {
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ if (brport_idx && (dev->ifindex != brport_idx))
+ continue;
- if (brport_idx && (dev->ifindex != brport_idx))
+ if (!br_idx) { /* user did not specify a specific bridge */
+ if (netif_is_bridge_port(dev)) {
+ br_dev = netdev_master_upper_dev_get(dev);
+ cops = br_dev->netdev_ops;
+ }
+ } else {
+ if (dev != br_dev &&
+ !netif_is_bridge_port(dev))
continue;
- if (!br_idx) { /* user did not specify a specific bridge */
- if (dev->priv_flags & IFF_BRIDGE_PORT) {
- br_dev = netdev_master_upper_dev_get(dev);
- cops = br_dev->netdev_ops;
- }
- } else {
- if (dev != br_dev &&
- !(dev->priv_flags & IFF_BRIDGE_PORT))
- continue;
+ if (br_dev != netdev_master_upper_dev_get(dev) &&
+ !netif_is_bridge_master(dev))
+ continue;
+ cops = ops;
+ }
- if (br_dev != netdev_master_upper_dev_get(dev) &&
- !(dev->priv_flags & IFF_EBRIDGE))
- continue;
- cops = ops;
+ if (netif_is_bridge_port(dev)) {
+ if (cops && cops->ndo_fdb_dump) {
+ err = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
+ &fidx);
+ if (err == -EMSGSIZE)
+ break;
}
+ }
- if (idx < s_idx)
- goto cont;
+ if (dev->netdev_ops->ndo_fdb_dump)
+ err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
+ &fidx);
+ else
+ err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx);
+ if (err == -EMSGSIZE)
+ break;
- if (dev->priv_flags & IFF_BRIDGE_PORT) {
- if (cops && cops->ndo_fdb_dump) {
- err = cops->ndo_fdb_dump(skb, cb,
- br_dev, dev,
- &fidx);
- if (err == -EMSGSIZE)
- goto out;
- }
+ cops = NULL;
+
+ /* reset fdb offset to 0 for rest of the interfaces */
+ ctx->fdb_idx = 0;
+ fidx = 0;
+ }
+
+ ctx->fdb_idx = fidx;
+
+ return skb->len;
+}
+
+static int valid_fdb_get_strict(const struct nlmsghdr *nlh,
+ struct nlattr **tb, u8 *ndm_flags,
+ int *br_idx, int *brport_idx, u8 **addr,
+ u16 *vid, struct netlink_ext_ack *extack)
+{
+ struct ndmsg *ndm;
+ int err, i;
+
+ ndm = nlmsg_payload(nlh, sizeof(*ndm));
+ if (!ndm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for fdb get request");
+ return -EINVAL;
+ }
+
+ if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state ||
+ ndm->ndm_type) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request");
+ return -EINVAL;
+ }
+
+ if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) {
+ NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb,
+ NDA_MAX, nda_policy, extack);
+ if (err < 0)
+ return err;
+
+ *ndm_flags = ndm->ndm_flags;
+ *brport_idx = ndm->ndm_ifindex;
+ for (i = 0; i <= NDA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NDA_MASTER:
+ *br_idx = nla_get_u32(tb[i]);
+ break;
+ case NDA_LLADDR:
+ if (nla_len(tb[i]) != ETH_ALEN) {
+ NL_SET_ERR_MSG(extack, "Invalid address in fdb get request");
+ return -EINVAL;
}
+ *addr = nla_data(tb[i]);
+ break;
+ case NDA_VLAN:
+ err = fdb_vid_parse(tb[i], vid, extack);
+ if (err)
+ return err;
+ break;
+ case NDA_VNI:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb get request");
+ return -EINVAL;
+ }
+ }
- if (dev->netdev_ops->ndo_fdb_dump)
- err = dev->netdev_ops->ndo_fdb_dump(skb, cb,
- dev, NULL,
- &fidx);
- else
- err = ndo_dflt_fdb_dump(skb, cb, dev, NULL,
- &fidx);
- if (err == -EMSGSIZE)
- goto out;
+ return 0;
+}
+
+static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *dev = NULL, *br_dev = NULL;
+ const struct net_device_ops *ops = NULL;
+ struct net *net = sock_net(in_skb->sk);
+ struct nlattr *tb[NDA_MAX + 1];
+ struct sk_buff *skb;
+ int brport_idx = 0;
+ u8 ndm_flags = 0;
+ int br_idx = 0;
+ u8 *addr = NULL;
+ u16 vid = 0;
+ int err;
- cops = NULL;
+ err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx,
+ &brport_idx, &addr, &vid, extack);
+ if (err < 0)
+ return err;
- /* reset fdb offset to 0 for rest of the interfaces */
- cb->args[2] = 0;
- fidx = 0;
-cont:
- idx++;
+ if (!addr) {
+ NL_SET_ERR_MSG(extack, "Missing lookup address for fdb get request");
+ return -EINVAL;
+ }
+
+ if (brport_idx) {
+ dev = __dev_get_by_index(net, brport_idx);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Unknown device ifindex");
+ return -ENODEV;
}
}
-out:
- cb->args[0] = h;
- cb->args[1] = idx;
- cb->args[2] = fidx;
+ if (br_idx) {
+ if (dev) {
+ NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive");
+ return -EINVAL;
+ }
- return skb->len;
+ br_dev = __dev_get_by_index(net, br_idx);
+ if (!br_dev) {
+ NL_SET_ERR_MSG(extack, "Invalid master ifindex");
+ return -EINVAL;
+ }
+ ops = br_dev->netdev_ops;
+ }
+
+ if (dev) {
+ if (!ndm_flags || (ndm_flags & NTF_MASTER)) {
+ if (!netif_is_bridge_port(dev)) {
+ NL_SET_ERR_MSG(extack, "Device is not a bridge port");
+ return -EINVAL;
+ }
+ br_dev = netdev_master_upper_dev_get(dev);
+ if (!br_dev) {
+ NL_SET_ERR_MSG(extack, "Master of device not found");
+ return -EINVAL;
+ }
+ ops = br_dev->netdev_ops;
+ } else {
+ if (!(ndm_flags & NTF_SELF)) {
+ NL_SET_ERR_MSG(extack, "Missing NTF_SELF");
+ return -EINVAL;
+ }
+ ops = dev->netdev_ops;
+ }
+ }
+
+ if (!br_dev && !dev) {
+ NL_SET_ERR_MSG(extack, "No device specified");
+ return -ENODEV;
+ }
+
+ if (!ops || !ops->ndo_fdb_get) {
+ NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device");
+ return -EOPNOTSUPP;
+ }
+
+ skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (br_dev)
+ dev = br_dev;
+ err = ops->ndo_fdb_get(skb, tb, dev, addr, vid,
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, extack);
+ if (err)
+ goto out;
+
+ return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+out:
+ kfree_skb(skb);
+ return err;
}
static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask,
@@ -3879,7 +5239,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
ifm->__ifi_pad = 0;
ifm->ifi_type = dev->type;
ifm->ifi_index = dev->ifindex;
- ifm->ifi_flags = dev_get_flags(dev);
+ ifm->ifi_flags = netif_get_flags(dev);
ifm->ifi_change = 0;
@@ -3894,7 +5254,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))))
goto nla_put_failure;
- br_afspec = nla_nest_start(skb, IFLA_AF_SPEC);
+ br_afspec = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
if (!br_afspec)
goto nla_put_failure;
@@ -3918,7 +5278,7 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
}
nla_nest_end(skb, br_afspec);
- protinfo = nla_nest_start(skb, IFLA_PROTINFO | NLA_F_NESTED);
+ protinfo = nla_nest_start(skb, IFLA_PROTINFO);
if (!protinfo)
goto nla_put_failure;
@@ -3938,7 +5298,11 @@ int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
brport_nla_put_flag(skb, flags, mask,
IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) ||
brport_nla_put_flag(skb, flags, mask,
- IFLA_BRPORT_PROXYARP, BR_PROXYARP)) {
+ IFLA_BRPORT_PROXYARP, BR_PROXYARP) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) ||
+ brport_nla_put_flag(skb, flags, mask,
+ IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) {
nla_nest_cancel(skb, protinfo);
goto nla_put_failure;
}
@@ -3953,28 +5317,75 @@ nla_put_failure:
}
EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink);
+static int valid_bridge_getlink_req(const struct nlmsghdr *nlh,
+ bool strict_check, u32 *filter_mask,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_MAX+1];
+ int err, i;
+
+ if (strict_check) {
+ struct ifinfomsg *ifm;
+
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump");
+ return -EINVAL;
+ }
+
+ if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
+ ifm->ifi_change || ifm->ifi_index) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh,
+ sizeof(struct ifinfomsg),
+ tb, IFLA_MAX, ifla_policy,
+ extack);
+ } else {
+ err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg),
+ tb, IFLA_MAX, ifla_policy,
+ extack);
+ }
+ if (err < 0)
+ return err;
+
+ /* new attributes should only be added with strict checking */
+ for (i = 0; i <= IFLA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case IFLA_EXT_MASK:
+ *filter_mask = nla_get_u32(tb[i]);
+ break;
+ default:
+ if (strict_check) {
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request");
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
struct net_device *dev;
int idx = 0;
u32 portid = NETLINK_CB(cb->skb).portid;
- u32 seq = cb->nlh->nlmsg_seq;
+ u32 seq = nlh->nlmsg_seq;
u32 filter_mask = 0;
int err;
- if (nlmsg_len(cb->nlh) > sizeof(struct ifinfomsg)) {
- struct nlattr *extfilt;
-
- extfilt = nlmsg_find_attr(cb->nlh, sizeof(struct ifinfomsg),
- IFLA_EXT_MASK);
- if (extfilt) {
- if (nla_len(extfilt) < sizeof(filter_mask))
- return -EINVAL;
-
- filter_mask = nla_get_u32(extfilt);
- }
- }
+ err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask,
+ cb->extack);
+ if (err < 0 && cb->strict_check)
+ return err;
rcu_read_lock();
for_each_netdev_rcu(net, dev) {
@@ -4054,6 +5465,10 @@ static int rtnl_bridge_notify(struct net_device *dev)
if (err < 0)
goto errout;
+ /* Notification info is only filled for bridge ports, not the bridge
+ * device itself. Therefore, a zero notification length is valid and
+ * should not result in an error.
+ */
if (!skb->len)
goto errout;
@@ -4073,10 +5488,9 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *net = sock_net(skb->sk);
struct ifinfomsg *ifm;
struct net_device *dev;
- struct nlattr *br_spec, *attr = NULL;
+ struct nlattr *br_spec, *attr, *br_flags_attr = NULL;
int rem, err = -EOPNOTSUPP;
u16 flags = 0;
- bool have_flags = false;
if (nlmsg_len(nlh) < sizeof(*ifm))
return -EINVAL;
@@ -4094,13 +5508,17 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
if (br_spec) {
nla_for_each_nested(attr, br_spec, rem) {
- if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
+ if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !br_flags_attr) {
if (nla_len(attr) < sizeof(flags))
return -EINVAL;
- have_flags = true;
+ br_flags_attr = attr;
flags = nla_get_u16(attr);
- break;
+ }
+
+ if (nla_type(attr) == IFLA_BRIDGE_MODE) {
+ if (nla_len(attr) < sizeof(u16))
+ return -EINVAL;
}
}
}
@@ -4113,7 +5531,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
goto out;
}
- err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags);
+ err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags,
+ extack);
if (err)
goto out;
@@ -4125,7 +5544,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
err = -EOPNOTSUPP;
else
err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh,
- flags);
+ flags,
+ extack);
if (!err) {
flags &= ~BRIDGE_FLAGS_SELF;
@@ -4136,8 +5556,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
}
}
- if (have_flags)
- memcpy(nla_data(attr), &flags, sizeof(flags));
+ if (br_flags_attr)
+ memcpy(nla_data(br_flags_attr), &flags, sizeof(flags));
out:
return err;
}
@@ -4168,15 +5588,14 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
if (br_spec) {
- nla_for_each_nested(attr, br_spec, rem) {
- if (nla_type(attr) == IFLA_BRIDGE_FLAGS) {
- if (nla_len(attr) < sizeof(flags))
- return -EINVAL;
+ nla_for_each_nested_type(attr, IFLA_BRIDGE_FLAGS, br_spec,
+ rem) {
+ if (nla_len(attr) < sizeof(flags))
+ return -EINVAL;
- have_flags = true;
- flags = nla_get_u16(attr);
- break;
- }
+ have_flags = true;
+ flags = nla_get_u16(attr);
+ break;
}
}
@@ -4224,82 +5643,257 @@ static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr)
(!idxattr || idxattr == attrid);
}
-#define IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1)
-static int rtnl_get_offload_stats_attr_size(int attr_id)
+static bool
+rtnl_offload_xstats_have_ndo(const struct net_device *dev, int attr_id)
{
- switch (attr_id) {
- case IFLA_OFFLOAD_XSTATS_CPU_HIT:
- return sizeof(struct rtnl_link_stats64);
- }
+ return dev->netdev_ops &&
+ dev->netdev_ops->ndo_has_offload_stats &&
+ dev->netdev_ops->ndo_get_offload_stats &&
+ dev->netdev_ops->ndo_has_offload_stats(dev, attr_id);
+}
- return 0;
+static unsigned int
+rtnl_offload_xstats_get_size_ndo(const struct net_device *dev, int attr_id)
+{
+ return rtnl_offload_xstats_have_ndo(dev, attr_id) ?
+ sizeof(struct rtnl_link_stats64) : 0;
}
-static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev,
- int *prividx)
+static int
+rtnl_offload_xstats_fill_ndo(struct net_device *dev, int attr_id,
+ struct sk_buff *skb)
{
+ unsigned int size = rtnl_offload_xstats_get_size_ndo(dev, attr_id);
struct nlattr *attr = NULL;
- int attr_id, size;
void *attr_data;
int err;
- if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
- dev->netdev_ops->ndo_get_offload_stats))
+ if (!size)
return -ENODATA;
- for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
- attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
- if (attr_id < *prividx)
- continue;
+ attr = nla_reserve_64bit(skb, attr_id, size,
+ IFLA_OFFLOAD_XSTATS_UNSPEC);
+ if (!attr)
+ return -EMSGSIZE;
- size = rtnl_get_offload_stats_attr_size(attr_id);
- if (!size)
- continue;
+ attr_data = nla_data(attr);
+ memset(attr_data, 0, size);
- if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
- continue;
+ err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, attr_data);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static unsigned int
+rtnl_offload_xstats_get_size_stats(const struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ bool enabled = netdev_offload_xstats_enabled(dev, type);
+
+ return enabled ? sizeof(struct rtnl_hw_stats64) : 0;
+}
+
+struct rtnl_offload_xstats_request_used {
+ bool request;
+ bool used;
+};
+
+static int
+rtnl_offload_xstats_get_stats(struct net_device *dev,
+ enum netdev_offload_xstats_type type,
+ struct rtnl_offload_xstats_request_used *ru,
+ struct rtnl_hw_stats64 *stats,
+ struct netlink_ext_ack *extack)
+{
+ bool request;
+ bool used;
+ int err;
+
+ request = netdev_offload_xstats_enabled(dev, type);
+ if (!request) {
+ used = false;
+ goto out;
+ }
+
+ err = netdev_offload_xstats_get(dev, type, stats, &used, extack);
+ if (err)
+ return err;
+
+out:
+ if (ru) {
+ ru->request = request;
+ ru->used = used;
+ }
+ return 0;
+}
+
+static int
+rtnl_offload_xstats_fill_hw_s_info_one(struct sk_buff *skb, int attr_id,
+ struct rtnl_offload_xstats_request_used *ru)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, attr_id);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, ru->request))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, ru->used))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int
+rtnl_offload_xstats_fill_hw_s_info(struct sk_buff *skb, struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+ struct rtnl_offload_xstats_request_used ru_l3;
+ struct nlattr *nest;
+ int err;
+
+ err = rtnl_offload_xstats_get_stats(dev, t_l3, &ru_l3, NULL, extack);
+ if (err)
+ return err;
- attr = nla_reserve_64bit(skb, attr_id, size,
+ nest = nla_nest_start(skb, IFLA_OFFLOAD_XSTATS_HW_S_INFO);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (rtnl_offload_xstats_fill_hw_s_info_one(skb,
+ IFLA_OFFLOAD_XSTATS_L3_STATS,
+ &ru_l3))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
+static int rtnl_offload_xstats_fill(struct sk_buff *skb, struct net_device *dev,
+ int *prividx, u32 off_filter_mask,
+ struct netlink_ext_ack *extack)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+ int attr_id_hw_s_info = IFLA_OFFLOAD_XSTATS_HW_S_INFO;
+ int attr_id_l3_stats = IFLA_OFFLOAD_XSTATS_L3_STATS;
+ int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT;
+ bool have_data = false;
+ int err;
+
+ if (*prividx <= attr_id_cpu_hit &&
+ (off_filter_mask &
+ IFLA_STATS_FILTER_BIT(attr_id_cpu_hit))) {
+ err = rtnl_offload_xstats_fill_ndo(dev, attr_id_cpu_hit, skb);
+ if (!err) {
+ have_data = true;
+ } else if (err != -ENODATA) {
+ *prividx = attr_id_cpu_hit;
+ return err;
+ }
+ }
+
+ if (*prividx <= attr_id_hw_s_info &&
+ (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_hw_s_info))) {
+ *prividx = attr_id_hw_s_info;
+
+ err = rtnl_offload_xstats_fill_hw_s_info(skb, dev, extack);
+ if (err)
+ return err;
+
+ have_data = true;
+ *prividx = 0;
+ }
+
+ if (*prividx <= attr_id_l3_stats &&
+ (off_filter_mask & IFLA_STATS_FILTER_BIT(attr_id_l3_stats))) {
+ unsigned int size_l3;
+ struct nlattr *attr;
+
+ *prividx = attr_id_l3_stats;
+
+ size_l3 = rtnl_offload_xstats_get_size_stats(dev, t_l3);
+ if (!size_l3)
+ goto skip_l3_stats;
+ attr = nla_reserve_64bit(skb, attr_id_l3_stats, size_l3,
IFLA_OFFLOAD_XSTATS_UNSPEC);
if (!attr)
- goto nla_put_failure;
+ return -EMSGSIZE;
- attr_data = nla_data(attr);
- memset(attr_data, 0, size);
- err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev,
- attr_data);
+ err = rtnl_offload_xstats_get_stats(dev, t_l3, NULL,
+ nla_data(attr), extack);
if (err)
- goto get_offload_stats_failure;
+ return err;
+
+ have_data = true;
+skip_l3_stats:
+ *prividx = 0;
}
- if (!attr)
+ if (!have_data)
return -ENODATA;
*prividx = 0;
return 0;
+}
-nla_put_failure:
- err = -EMSGSIZE;
-get_offload_stats_failure:
- *prividx = attr_id;
- return err;
+static unsigned int
+rtnl_offload_xstats_get_size_hw_s_info_one(const struct net_device *dev,
+ enum netdev_offload_xstats_type type)
+{
+ return nla_total_size(0) +
+ /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST */
+ nla_total_size(sizeof(u8)) +
+ /* IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED */
+ nla_total_size(sizeof(u8)) +
+ 0;
}
-static int rtnl_get_offload_stats_size(const struct net_device *dev)
+static unsigned int
+rtnl_offload_xstats_get_size_hw_s_info(const struct net_device *dev)
{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+
+ return nla_total_size(0) +
+ /* IFLA_OFFLOAD_XSTATS_L3_STATS */
+ rtnl_offload_xstats_get_size_hw_s_info_one(dev, t_l3) +
+ 0;
+}
+
+static int rtnl_offload_xstats_get_size(const struct net_device *dev,
+ u32 off_filter_mask)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+ int attr_id_cpu_hit = IFLA_OFFLOAD_XSTATS_CPU_HIT;
int nla_size = 0;
- int attr_id;
int size;
- if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats &&
- dev->netdev_ops->ndo_get_offload_stats))
- return 0;
+ if (off_filter_mask &
+ IFLA_STATS_FILTER_BIT(attr_id_cpu_hit)) {
+ size = rtnl_offload_xstats_get_size_ndo(dev, attr_id_cpu_hit);
+ nla_size += nla_total_size_64bit(size);
+ }
- for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST;
- attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) {
- if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id))
- continue;
- size = rtnl_get_offload_stats_attr_size(attr_id);
+ if (off_filter_mask &
+ IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO))
+ nla_size += rtnl_offload_xstats_get_size_hw_s_info(dev);
+
+ if (off_filter_mask &
+ IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_L3_STATS)) {
+ size = rtnl_offload_xstats_get_size_stats(dev, t_l3);
nla_size += nla_total_size_64bit(size);
}
@@ -4309,11 +5903,21 @@ static int rtnl_get_offload_stats_size(const struct net_device *dev)
return nla_size;
}
+struct rtnl_stats_dump_filters {
+ /* mask[0] filters outer attributes. Then individual nests have their
+ * filtering mask at the index of the nested attribute.
+ */
+ u32 mask[IFLA_STATS_MAX + 1];
+};
+
static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
int type, u32 pid, u32 seq, u32 change,
- unsigned int flags, unsigned int filter_mask,
- int *idxattr, int *prividx)
+ unsigned int flags,
+ const struct rtnl_stats_dump_filters *filters,
+ int *idxattr, int *prividx,
+ struct netlink_ext_ack *extack)
{
+ unsigned int filter_mask = filters->mask[0];
struct if_stats_msg *ifsm;
struct nlmsghdr *nlh;
struct nlattr *attr;
@@ -4339,8 +5943,10 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64,
sizeof(struct rtnl_link_stats64),
IFLA_STATS_UNSPEC);
- if (!attr)
+ if (!attr) {
+ err = -EMSGSIZE;
goto nla_put_failure;
+ }
sp = nla_data(attr);
dev_get_stats(dev, sp);
@@ -4351,10 +5957,12 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
if (ops && ops->fill_linkxstats) {
*idxattr = IFLA_STATS_LINK_XSTATS;
- attr = nla_nest_start(skb,
- IFLA_STATS_LINK_XSTATS);
- if (!attr)
+ attr = nla_nest_start_noflag(skb,
+ IFLA_STATS_LINK_XSTATS);
+ if (!attr) {
+ err = -EMSGSIZE;
goto nla_put_failure;
+ }
err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
nla_nest_end(skb, attr);
@@ -4374,10 +5982,12 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
ops = master->rtnl_link_ops;
if (ops && ops->fill_linkxstats) {
*idxattr = IFLA_STATS_LINK_XSTATS_SLAVE;
- attr = nla_nest_start(skb,
- IFLA_STATS_LINK_XSTATS_SLAVE);
- if (!attr)
+ attr = nla_nest_start_noflag(skb,
+ IFLA_STATS_LINK_XSTATS_SLAVE);
+ if (!attr) {
+ err = -EMSGSIZE;
goto nla_put_failure;
+ }
err = ops->fill_linkxstats(skb, dev, prividx, *idxattr);
nla_nest_end(skb, attr);
@@ -4389,12 +5999,19 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS,
*idxattr)) {
+ u32 off_filter_mask;
+
+ off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS];
*idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS;
- attr = nla_nest_start(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS);
- if (!attr)
+ attr = nla_nest_start_noflag(skb,
+ IFLA_STATS_LINK_OFFLOAD_XSTATS);
+ if (!attr) {
+ err = -EMSGSIZE;
goto nla_put_failure;
+ }
- err = rtnl_get_offload_stats(skb, dev, prividx);
+ err = rtnl_offload_xstats_fill(skb, dev, prividx,
+ off_filter_mask, extack);
if (err == -ENODATA)
nla_nest_cancel(skb, attr);
else
@@ -4409,19 +6026,22 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev,
struct rtnl_af_ops *af_ops;
*idxattr = IFLA_STATS_AF_SPEC;
- attr = nla_nest_start(skb, IFLA_STATS_AF_SPEC);
- if (!attr)
+ attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC);
+ if (!attr) {
+ err = -EMSGSIZE;
goto nla_put_failure;
+ }
rcu_read_lock();
list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) {
if (af_ops->fill_stats_af) {
struct nlattr *af;
- int err;
- af = nla_nest_start(skb, af_ops->family);
+ af = nla_nest_start_noflag(skb,
+ af_ops->family);
if (!af) {
rcu_read_unlock();
+ err = -EMSGSIZE;
goto nla_put_failure;
}
err = af_ops->fill_stats_af(skb, dev);
@@ -4454,13 +6074,14 @@ nla_put_failure:
else
nlmsg_end(skb, nlh);
- return -EMSGSIZE;
+ return err;
}
static size_t if_nlmsg_stats_size(const struct net_device *dev,
- u32 filter_mask)
+ const struct rtnl_stats_dump_filters *filters)
{
- size_t size = 0;
+ size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg));
+ unsigned int filter_mask = filters->mask[0];
if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0))
size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64));
@@ -4496,8 +6117,12 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
}
}
- if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
- size += rtnl_get_offload_stats_size(dev);
+ if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) {
+ u32 off_filter_mask;
+
+ off_filter_mask = filters->mask[IFLA_STATS_LINK_OFFLOAD_XSTATS];
+ size += rtnl_offload_xstats_get_size(dev, off_filter_mask);
+ }
if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) {
struct rtnl_af_ops *af_ops;
@@ -4521,19 +6146,123 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev,
return size;
}
+#define RTNL_STATS_OFFLOAD_XSTATS_VALID ((1 << __IFLA_OFFLOAD_XSTATS_MAX) - 1)
+
+static const struct nla_policy
+rtnl_stats_get_policy_filters[IFLA_STATS_MAX + 1] = {
+ [IFLA_STATS_LINK_OFFLOAD_XSTATS] =
+ NLA_POLICY_MASK(NLA_U32, RTNL_STATS_OFFLOAD_XSTATS_VALID),
+};
+
+static const struct nla_policy
+rtnl_stats_get_policy[IFLA_STATS_GETSET_MAX + 1] = {
+ [IFLA_STATS_GET_FILTERS] =
+ NLA_POLICY_NESTED(rtnl_stats_get_policy_filters),
+};
+
+static const struct nla_policy
+ifla_stats_set_policy[IFLA_STATS_GETSET_MAX + 1] = {
+ [IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS] = NLA_POLICY_MAX(NLA_U8, 1),
+};
+
+static int rtnl_stats_get_parse_filters(struct nlattr *ifla_filters,
+ struct rtnl_stats_dump_filters *filters,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_STATS_MAX + 1];
+ int err;
+ int at;
+
+ err = nla_parse_nested(tb, IFLA_STATS_MAX, ifla_filters,
+ rtnl_stats_get_policy_filters, extack);
+ if (err < 0)
+ return err;
+
+ for (at = 1; at <= IFLA_STATS_MAX; at++) {
+ if (tb[at]) {
+ if (!(filters->mask[0] & IFLA_STATS_FILTER_BIT(at))) {
+ NL_SET_ERR_MSG(extack, "Filtered attribute not enabled in filter_mask");
+ return -EINVAL;
+ }
+ filters->mask[at] = nla_get_u32(tb[at]);
+ }
+ }
+
+ return 0;
+}
+
+static int rtnl_stats_get_parse(const struct nlmsghdr *nlh,
+ u32 filter_mask,
+ struct rtnl_stats_dump_filters *filters,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1];
+ int err;
+ int i;
+
+ filters->mask[0] = filter_mask;
+ for (i = 1; i < ARRAY_SIZE(filters->mask); i++)
+ filters->mask[i] = -1U;
+
+ err = nlmsg_parse(nlh, sizeof(struct if_stats_msg), tb,
+ IFLA_STATS_GETSET_MAX, rtnl_stats_get_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_STATS_GET_FILTERS]) {
+ err = rtnl_stats_get_parse_filters(tb[IFLA_STATS_GET_FILTERS],
+ filters, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check,
+ bool is_dump, struct netlink_ext_ack *extack)
+{
+ struct if_stats_msg *ifsm;
+
+ ifsm = nlmsg_payload(nlh, sizeof(*ifsm));
+ if (!ifsm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for stats dump");
+ return -EINVAL;
+ }
+
+ if (!strict_check)
+ return 0;
+
+ /* only requests using strict checks can pass data to influence
+ * the dump. The legacy exception is filter_mask.
+ */
+ if (ifsm->pad1 || ifsm->pad2 || (is_dump && ifsm->ifindex)) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request");
+ return -EINVAL;
+ }
+ if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) {
+ NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ struct rtnl_stats_dump_filters filters;
struct net *net = sock_net(skb->sk);
struct net_device *dev = NULL;
int idxattr = 0, prividx = 0;
struct if_stats_msg *ifsm;
struct sk_buff *nskb;
- u32 filter_mask;
int err;
- if (nlmsg_len(nlh) < sizeof(*ifsm))
- return -EINVAL;
+ err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb),
+ false, extack);
+ if (err)
+ return err;
ifsm = nlmsg_data(nlh);
if (ifsm->ifindex > 0)
@@ -4544,17 +6273,22 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!dev)
return -ENODEV;
- filter_mask = ifsm->filter_mask;
- if (!filter_mask)
+ if (!ifsm->filter_mask) {
+ NL_SET_ERR_MSG(extack, "Filter mask must be set for stats get");
return -EINVAL;
+ }
- nskb = nlmsg_new(if_nlmsg_stats_size(dev, filter_mask), GFP_KERNEL);
+ err = rtnl_stats_get_parse(nlh, ifsm->filter_mask, &filters, extack);
+ if (err)
+ return err;
+
+ nskb = nlmsg_new(if_nlmsg_stats_size(dev, &filters), GFP_KERNEL);
if (!nskb)
return -ENOBUFS;
err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS,
NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0,
- 0, filter_mask, &idxattr, &prividx);
+ 0, &filters, &idxattr, &prividx, extack);
if (err < 0) {
/* -EMSGSIZE implies BUG in if_nlmsg_stats_size */
WARN_ON(err == -EMSGSIZE);
@@ -4568,76 +6302,570 @@ static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh,
static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- int h, s_h, err, s_idx, s_idxattr, s_prividx;
+ struct netlink_ext_ack *extack = cb->extack;
+ struct rtnl_stats_dump_filters filters;
struct net *net = sock_net(skb->sk);
unsigned int flags = NLM_F_MULTI;
struct if_stats_msg *ifsm;
- struct hlist_head *head;
+ struct {
+ unsigned long ifindex;
+ int idxattr;
+ int prividx;
+ } *ctx = (void *)cb->ctx;
struct net_device *dev;
- u32 filter_mask = 0;
- int idx = 0;
-
- s_h = cb->args[0];
- s_idx = cb->args[1];
- s_idxattr = cb->args[2];
- s_prividx = cb->args[3];
+ int err;
cb->seq = net->dev_base_seq;
- if (nlmsg_len(cb->nlh) < sizeof(*ifsm))
- return -EINVAL;
+ err = rtnl_valid_stats_req(cb->nlh, cb->strict_check, true, extack);
+ if (err)
+ return err;
ifsm = nlmsg_data(cb->nlh);
- filter_mask = ifsm->filter_mask;
- if (!filter_mask)
+ if (!ifsm->filter_mask) {
+ NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump");
return -EINVAL;
+ }
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &net->dev_index_head[h];
- hlist_for_each_entry(dev, head, index_hlist) {
- if (idx < s_idx)
- goto cont;
- err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, 0,
- flags, filter_mask,
- &s_idxattr, &s_prividx);
- /* If we ran out of room on the first message,
- * we're in trouble
- */
- WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
+ err = rtnl_stats_get_parse(cb->nlh, ifsm->filter_mask, &filters,
+ extack);
+ if (err)
+ return err;
- if (err < 0)
- goto out;
- s_prividx = 0;
- s_idxattr = 0;
- nl_dump_check_consistent(cb, nlmsg_hdr(skb));
-cont:
- idx++;
- }
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, 0,
+ flags, &filters,
+ &ctx->idxattr, &ctx->prividx,
+ extack);
+ /* If we ran out of room on the first message,
+ * we're in trouble.
+ */
+ WARN_ON((err == -EMSGSIZE) && (skb->len == 0));
+
+ if (err < 0)
+ break;
+ ctx->prividx = 0;
+ ctx->idxattr = 0;
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
}
-out:
- cb->args[3] = s_prividx;
- cb->args[2] = s_idxattr;
- cb->args[1] = idx;
- cb->args[0] = h;
+ return err;
+}
+/* Send an RTM_NEWSTATS message with dev's offload-xstats HW_S_INFO to RTNLGRP_STATS. */
+void rtnl_offload_xstats_notify(struct net_device *dev)
+{
+ struct rtnl_stats_dump_filters response_filters = {};
+ struct net *net = dev_net(dev);
+ int idxattr = 0, prividx = 0;
+ struct sk_buff *skb;
+ int err = -ENOBUFS; /* value reported if the skb allocation below fails */
+
+ ASSERT_RTNL();
+ /* Select LINK_OFFLOAD_XSTATS at the top level and HW_S_INFO within it. */
+ response_filters.mask[0] |=
+ IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS);
+ response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |=
+ IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO);
+
+ skb = nlmsg_new(if_nlmsg_stats_size(dev, &response_filters),
+ GFP_KERNEL);
+ if (!skb)
+ goto errout;
+
+ err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, 0, 0, 0, 0,
+ &response_filters, &idxattr, &prividx, NULL);
+ if (err < 0) {
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_STATS, NULL, GFP_KERNEL);
+ return;
+
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_STATS, err); /* both failure paths end here with err set */
+}
+EXPORT_SYMBOL(rtnl_offload_xstats_notify);
+/* RTM_SETSTATS doit handler: enable or disable L3 offload xstats on the device in ifsm->ifindex. */
+static int rtnl_stats_set(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ enum netdev_offload_xstats_type t_l3 = NETDEV_OFFLOAD_XSTATS_TYPE_L3;
+ struct rtnl_stats_dump_filters response_filters = {};
+ struct nlattr *tb[IFLA_STATS_GETSET_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev = NULL;
+ struct if_stats_msg *ifsm;
+ bool notify = false;
+ int err;
+
+ err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb),
+ false, extack);
+ if (err)
+ return err;
+
+ ifsm = nlmsg_data(nlh);
+ if (ifsm->family != AF_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Address family should be AF_UNSPEC");
+ return -EINVAL;
+ }
+
+ if (ifsm->ifindex > 0)
+ dev = __dev_get_by_index(net, ifsm->ifindex);
+ else
+ return -EINVAL;
+
+ if (!dev)
+ return -ENODEV;
+
+ if (ifsm->filter_mask) {
+ NL_SET_ERR_MSG(extack, "Filter mask must be 0 for stats set");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse(nlh, sizeof(*ifsm), tb, IFLA_STATS_GETSET_MAX,
+ ifla_stats_set_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]) {
+ u8 req = nla_get_u8(tb[IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS]);
+
+ if (req)
+ err = netdev_offload_xstats_enable(dev, t_l3, extack);
+ else
+ err = netdev_offload_xstats_disable(dev, t_l3);
+ /* -EALREADY is not a failure here, but it suppresses the notification. */
+ if (!err)
+ notify = true;
+ else if (err != -EALREADY)
+ return err;
+ /* NOTE(review): response_filters is filled in below but never read in this function. */
+ response_filters.mask[0] |=
+ IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_OFFLOAD_XSTATS);
+ response_filters.mask[IFLA_STATS_LINK_OFFLOAD_XSTATS] |=
+ IFLA_STATS_FILTER_BIT(IFLA_OFFLOAD_XSTATS_HW_S_INFO);
+ }
+
+ if (notify)
+ rtnl_offload_xstats_notify(dev);
+
+ return 0;
+}
+/* Check an MDB dump request: header present, zero ifindex, no attributes. 0 or -EINVAL. */
+static int rtnl_mdb_valid_dump_req(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct br_port_msg *bpm;
+
+ bpm = nlmsg_payload(nlh, sizeof(*bpm));
+ if (!bpm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for mdb dump request");
+ return -EINVAL;
+ }
+
+ if (bpm->ifindex) {
+ NL_SET_ERR_MSG(extack, "Filtering by device index is not supported for mdb dump request");
+ return -EINVAL;
+ }
+ if (nlmsg_attrlen(nlh, sizeof(*bpm))) {
+ NL_SET_ERR_MSG(extack, "Invalid data after header in mdb dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+/* Resume state that rtnl_mdb_dump() keeps in cb->ctx between calls. */
+struct rtnl_mdb_dump_ctx {
+ long idx; /* position in the netdev walk at which the next call resumes */
+};
+/* RTM_GETMDB dumpit handler: call ndo_mdb_dump on each netdev in the netns, resuming at ctx->idx. */
+static int rtnl_mdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct rtnl_mdb_dump_ctx *ctx = (void *)cb->ctx;
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int idx, s_idx;
+ int err;
+
+ NL_ASSERT_CTX_FITS(struct rtnl_mdb_dump_ctx);
+
+ if (cb->strict_check) {
+ err = rtnl_mdb_valid_dump_req(cb->nlh, cb->extack);
+ if (err)
+ return err;
+ }
+
+ s_idx = ctx->idx;
+ idx = 0;
+
+ for_each_netdev(net, dev) {
+ if (idx < s_idx)
+ goto skip;
+ if (!dev->netdev_ops->ndo_mdb_dump)
+ goto skip;
+ /* Only -EMSGSIZE ends this pass; any other result moves on to the next device. */
+ err = dev->netdev_ops->ndo_mdb_dump(dev, skb, cb);
+ if (err == -EMSGSIZE)
+ goto out;
+ /* Moving on to next device, reset markers and sequence
+ * counters since they are all maintained per-device.
+ */
+ memset(cb->ctx, 0, sizeof(cb->ctx));
+ cb->prev_seq = 0;
+ cb->seq = 0;
+skip:
+ idx++;
+ }
+
+out:
+ ctx->idx = idx; /* written after the memset above, so the resume point survives it */
 return skb->len;
 }
+static int rtnl_validate_mdb_entry_get(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct br_mdb_entry *entry = nla_data(attr);
+ /* Get entry: exact br_mdb_entry size, zero ifindex/state/flags, bounded vid, known or zero proto. */
+ if (nla_len(attr) != sizeof(struct br_mdb_entry)) {
+ NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length");
+ return -EINVAL;
+ }
+
+ if (entry->ifindex) {
+ NL_SET_ERR_MSG(extack, "Entry ifindex cannot be specified");
+ return -EINVAL;
+ }
+
+ if (entry->state) {
+ NL_SET_ERR_MSG(extack, "Entry state cannot be specified");
+ return -EINVAL;
+ }
+
+ if (entry->flags) {
+ NL_SET_ERR_MSG(extack, "Entry flags cannot be specified");
+ return -EINVAL;
+ }
+
+ if (entry->vid >= VLAN_VID_MASK) {
+ NL_SET_ERR_MSG(extack, "Invalid entry VLAN id");
+ return -EINVAL;
+ }
+
+ if (entry->addr.proto != htons(ETH_P_IP) &&
+ entry->addr.proto != htons(ETH_P_IPV6) &&
+ entry->addr.proto != 0) {
+ NL_SET_ERR_MSG(extack, "Unknown entry protocol");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+/* Attribute policy used by rtnl_mdb_get() to parse RTM_GETMDB requests. */
+static const struct nla_policy mdba_get_policy[MDBA_GET_ENTRY_MAX + 1] = {
+ [MDBA_GET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
+ rtnl_validate_mdb_entry_get,
+ sizeof(struct br_mdb_entry)),
+ [MDBA_GET_ENTRY_ATTRS] = { .type = NLA_NESTED },
+};
+/* RTM_GETMDB doit handler: validate the request and pass it to the device's ndo_mdb_get. */
+static int rtnl_mdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[MDBA_GET_ENTRY_MAX + 1];
+ struct net *net = sock_net(in_skb->sk);
+ struct br_port_msg *bpm;
+ struct net_device *dev;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(struct br_port_msg), tb,
+ MDBA_GET_ENTRY_MAX, mdba_get_policy, extack);
+ if (err)
+ return err;
+
+ bpm = nlmsg_data(nlh);
+ if (!bpm->ifindex) {
+ NL_SET_ERR_MSG(extack, "Invalid ifindex");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, bpm->ifindex);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Device doesn't exist");
+ return -ENODEV;
+ }
+
+ if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_GET_ENTRY)) {
+ NL_SET_ERR_MSG(extack, "Missing MDBA_GET_ENTRY attribute");
+ return -EINVAL;
+ }
+
+ if (!dev->netdev_ops->ndo_mdb_get) {
+ NL_SET_ERR_MSG(extack, "Device does not support MDB operations");
+ return -EOPNOTSUPP;
+ }
+ /* The driver callback's return value is returned to the caller unchanged. */
+ return dev->netdev_ops->ndo_mdb_get(dev, tb, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, extack);
+}
+/* Validator for MDBA_SET_ENTRY in mdba_policy (add and non-bulk delete). Returns 0 or -EINVAL. */
+static int rtnl_validate_mdb_entry(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct br_mdb_entry *entry = nla_data(attr);
+
+ if (nla_len(attr) != sizeof(struct br_mdb_entry)) {
+ NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length");
+ return -EINVAL;
+ }
+
+ if (entry->ifindex == 0) {
+ NL_SET_ERR_MSG(extack, "Zero entry ifindex is not allowed");
+ return -EINVAL;
+ }
+ /* The group address check depends on the entry's protocol (IPv4, IPv6, or 0 for L2). */
+ if (entry->addr.proto == htons(ETH_P_IP)) {
+ if (!ipv4_is_multicast(entry->addr.u.ip4) &&
+ !ipv4_is_zeronet(entry->addr.u.ip4)) {
+ NL_SET_ERR_MSG(extack, "IPv4 entry group address is not multicast or 0.0.0.0");
+ return -EINVAL;
+ }
+ if (ipv4_is_local_multicast(entry->addr.u.ip4)) {
+ NL_SET_ERR_MSG(extack, "IPv4 entry group address is local multicast");
+ return -EINVAL;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (entry->addr.proto == htons(ETH_P_IPV6)) {
+ if (ipv6_addr_is_ll_all_nodes(&entry->addr.u.ip6)) {
+ NL_SET_ERR_MSG(extack, "IPv6 entry group address is link-local all nodes");
+ return -EINVAL;
+ }
+#endif
+ } else if (entry->addr.proto == 0) {
+ /* L2 mdb */
+ if (!is_multicast_ether_addr(entry->addr.u.mac_addr)) {
+ NL_SET_ERR_MSG(extack, "L2 entry group is not multicast");
+ return -EINVAL;
+ }
+ } else { /* also reached for ETH_P_IPV6 when CONFIG_IPV6 is not enabled */
+ NL_SET_ERR_MSG(extack, "Unknown entry protocol");
+ return -EINVAL;
+ }
+
+ if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) {
+ NL_SET_ERR_MSG(extack, "Unknown entry state");
+ return -EINVAL;
+ }
+ if (entry->vid >= VLAN_VID_MASK) {
+ NL_SET_ERR_MSG(extack, "Invalid entry VLAN id");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+/* Attribute policy used by rtnl_mdb_add() and by rtnl_mdb_del() for non-bulk requests. */
+static const struct nla_policy mdba_policy[MDBA_SET_ENTRY_MAX + 1] = {
+ [MDBA_SET_ENTRY_UNSPEC] = { .strict_start_type = MDBA_SET_ENTRY_ATTRS + 1 },
+ [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
+ rtnl_validate_mdb_entry,
+ sizeof(struct br_mdb_entry)),
+ [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED },
+};
+/* RTM_NEWMDB doit handler: validate the request and pass it to the device's ndo_mdb_add. */
+static int rtnl_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+ struct br_port_msg *bpm;
+ struct net_device *dev;
+ int err;
+
+ err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb,
+ MDBA_SET_ENTRY_MAX, mdba_policy, extack);
+ if (err)
+ return err;
+
+ bpm = nlmsg_data(nlh);
+ if (!bpm->ifindex) {
+ NL_SET_ERR_MSG(extack, "Invalid ifindex");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, bpm->ifindex);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Device doesn't exist");
+ return -ENODEV;
+ }
+
+ if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY)) {
+ NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute");
+ return -EINVAL;
+ }
+
+ if (!dev->netdev_ops->ndo_mdb_add) {
+ NL_SET_ERR_MSG(extack, "Device does not support MDB operations");
+ return -EOPNOTSUPP;
+ }
+ /* The request's nlmsg_flags are forwarded to the driver callback. */
+ return dev->netdev_ops->ndo_mdb_add(dev, tb, nlh->nlmsg_flags, extack);
+}
+/* Validator for MDBA_SET_ENTRY in mdba_del_bulk_policy: flags and group address must be zero. */
+static int rtnl_validate_mdb_entry_del_bulk(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct br_mdb_entry *entry = nla_data(attr);
+ struct br_mdb_entry zero_entry = {}; /* all-zero reference for the address comparison below */
+
+ if (nla_len(attr) != sizeof(struct br_mdb_entry)) {
+ NL_SET_ERR_MSG_ATTR(extack, attr, "Invalid attribute length");
+ return -EINVAL;
+ }
+
+ if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) {
+ NL_SET_ERR_MSG(extack, "Unknown entry state");
+ return -EINVAL;
+ }
+
+ if (entry->flags) {
+ NL_SET_ERR_MSG(extack, "Entry flags cannot be set");
+ return -EINVAL;
+ }
+ /* NOTE(review): the other MDB validators write this bound as VLAN_VID_MASK — confirm both are equal. */
+ if (entry->vid >= VLAN_N_VID - 1) {
+ NL_SET_ERR_MSG(extack, "Invalid entry VLAN id");
+ return -EINVAL;
+ }
+
+ if (memcmp(&entry->addr, &zero_entry.addr, sizeof(entry->addr))) {
+ NL_SET_ERR_MSG(extack, "Entry address cannot be set");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+/* Attribute policy used by rtnl_mdb_del() when the request carries NLM_F_BULK. */
+static const struct nla_policy mdba_del_bulk_policy[MDBA_SET_ENTRY_MAX + 1] = {
+ [MDBA_SET_ENTRY] = NLA_POLICY_VALIDATE_FN(NLA_BINARY,
+ rtnl_validate_mdb_entry_del_bulk,
+ sizeof(struct br_mdb_entry)),
+ [MDBA_SET_ENTRY_ATTRS] = { .type = NLA_NESTED },
+};
+/* RTM_DELMDB doit handler: single-entry delete, or bulk delete when NLM_F_BULK is set. */
+static int rtnl_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ bool del_bulk = !!(nlh->nlmsg_flags & NLM_F_BULK);
+ struct nlattr *tb[MDBA_SET_ENTRY_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+ struct br_port_msg *bpm;
+ struct net_device *dev;
+ int err;
+ /* Bulk requests use nlmsg_parse() with the bulk policy; others use mdba_policy. */
+ if (!del_bulk)
+ err = nlmsg_parse_deprecated(nlh, sizeof(*bpm), tb,
+ MDBA_SET_ENTRY_MAX, mdba_policy,
+ extack);
+ else
+ err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX,
+ mdba_del_bulk_policy, extack);
+ if (err)
+ return err;
+
+ bpm = nlmsg_data(nlh);
+ if (!bpm->ifindex) {
+ NL_SET_ERR_MSG(extack, "Invalid ifindex");
+ return -EINVAL;
+ }
+
+ dev = __dev_get_by_index(net, bpm->ifindex);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Device doesn't exist");
+ return -ENODEV;
+ }
+
+ if (NL_REQ_ATTR_CHECK(extack, NULL, tb, MDBA_SET_ENTRY)) {
+ NL_SET_ERR_MSG(extack, "Missing MDBA_SET_ENTRY attribute");
+ return -EINVAL;
+ }
+
+ if (del_bulk) { /* bulk deletion has its own driver callback */
+ if (!dev->netdev_ops->ndo_mdb_del_bulk) {
+ NL_SET_ERR_MSG(extack, "Device does not support MDB bulk deletion");
+ return -EOPNOTSUPP;
+ }
+ return dev->netdev_ops->ndo_mdb_del_bulk(dev, tb, extack);
+ }
+
+ if (!dev->netdev_ops->ndo_mdb_del) {
+ NL_SET_ERR_MSG(extack, "Device does not support MDB operations");
+ return -EOPNOTSUPP;
+ }
+
+ return dev->netdev_ops->ndo_mdb_del(dev, tb, extack);
+}
+
/* Process one rtnetlink message. */
+static int rtnl_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const bool needs_lock = !(cb->flags & RTNL_FLAG_DUMP_UNLOCKED);
+ rtnl_dumpit_func dumpit = cb->data;
+ int err;
+ /* Dump wrapper: the real handler is kept in cb->data (set up by rtnetlink_dump_start()). */
+ /* Previous iteration has already finished, avoid calling ->dumpit()
+ * again, it may not expect to be called after it reached the end.
+ */
+ if (!dumpit)
+ return 0;
+ /* Take RTNL around the handler unless RTNL_FLAG_DUMP_UNLOCKED is set. */
+ if (needs_lock)
+ rtnl_lock();
+ err = dumpit(skb, cb);
+ if (needs_lock)
+ rtnl_unlock();
+
+ /* Old dump handlers used to send NLM_DONE in a separate recvmsg().
+ * Some applications which parse netlink manually depend on this.
+ */
+ if (cb->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE) {
+ if (err < 0 && err != -EMSGSIZE)
+ return err;
+ if (!err)
+ cb->data = NULL; /* handler finished; the next call returns 0 above */
+
+ return skb->len;
+ }
+ return err;
+}
+/* Start a dump via rtnl_dumpit() unless the handler is unlocked and needs no split NLM_DONE. */
+static int rtnetlink_dump_start(struct sock *ssk, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct netlink_dump_control *control)
+{
+ if (control->flags & RTNL_FLAG_DUMP_SPLIT_NLM_DONE ||
+ !(control->flags & RTNL_FLAG_DUMP_UNLOCKED)) {
+ WARN_ON(control->data); /* ->data is about to be overwritten with the real handler */
+ control->data = control->dump;
+ control->dump = rtnl_dumpit;
+ }
+
+ return netlink_dump_start(ssk, skb, nlh, control);
+}
+
static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
struct rtnl_link *link;
+ enum rtnl_kinds kind;
struct module *owner;
int err = -EOPNOTSUPP;
rtnl_doit_func doit;
unsigned int flags;
- int kind;
int family;
int type;
@@ -4652,16 +6880,16 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
return 0;
family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
- kind = type&3;
+ kind = rtnl_msgtype_kind(type);
- if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))
+ if (kind != RTNL_KIND_GET && !netlink_net_capable(skb, CAP_NET_ADMIN))
return -EPERM;
rcu_read_lock();
- if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
+ if (kind == RTNL_KIND_GET && (nlh->nlmsg_flags & NLM_F_DUMP)) {
struct sock *rtnl;
rtnl_dumpit_func dumpit;
- u16 min_dump_alloc = 0;
+ u32 min_dump_alloc = 0;
link = rtnl_get_link(family, type);
if (!link || !link->dumpit) {
@@ -4672,6 +6900,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
}
owner = link->owner;
dumpit = link->dumpit;
+ flags = link->flags;
if (type == RTM_GETLINK - RTM_BASE)
min_dump_alloc = rtnl_calcit(skb, nlh);
@@ -4689,8 +6918,9 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
.dump = dumpit,
.min_dump_alloc = min_dump_alloc,
.module = owner,
+ .flags = flags,
};
- err = netlink_dump_start(rtnl, skb, nlh, &c);
+ err = rtnetlink_dump_start(rtnl, skb, nlh, &c);
/* netlink_dump_start() will keep a reference on
* module if dump is still in progress.
*/
@@ -4714,6 +6944,13 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
}
flags = link->flags;
+ if (kind == RTNL_KIND_DEL && (nlh->nlmsg_flags & NLM_F_BULK) &&
+ !(flags & RTNL_FLAG_BULK_DEL_SUPPORTED)) {
+ NL_SET_ERR_MSG(extack, "Bulk delete is not supported");
+ module_put(owner);
+ goto err_unlock;
+ }
+
if (flags & RTNL_FLAG_DOIT_UNLOCKED) {
doit = link->doit;
rcu_read_unlock();
@@ -4779,7 +7016,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
case NETDEV_CHANGELOWERSTATE:
case NETDEV_CHANGE_TX_QUEUE_LEN:
rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event),
- GFP_KERNEL, NULL, 0);
+ GFP_KERNEL, NULL, 0, 0, NULL);
break;
default:
break;
@@ -4798,7 +7035,6 @@ static int __net_init rtnetlink_net_init(struct net *net)
struct netlink_kernel_cfg cfg = {
.groups = RTNLGRP_MAX,
.input = rtnetlink_rcv,
- .cb_mutex = &rtnl_mutex,
.flags = NL_CFG_F_NONROOT_RECV,
.bind = rtnetlink_bind,
};
@@ -4821,6 +7057,41 @@ static struct pernet_operations rtnetlink_net_ops = {
.exit = rtnetlink_net_exit,
};
+static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWLINK, .doit = rtnl_newlink,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.msgtype = RTM_DELLINK, .doit = rtnl_dellink,
+ .flags = RTNL_FLAG_DOIT_PERNET_WIP},
+ {.msgtype = RTM_GETLINK, .doit = rtnl_getlink,
+ .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
+ {.msgtype = RTM_SETLINK, .doit = rtnl_setlink,
+ .flags = RTNL_FLAG_DOIT_PERNET_WIP},
+ {.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all},
+ {.msgtype = RTM_GETROUTE, .dumpit = rtnl_dump_all},
+ {.msgtype = RTM_GETNETCONF, .dumpit = rtnl_dump_all},
+ {.msgtype = RTM_GETSTATS, .doit = rtnl_stats_get,
+ .dumpit = rtnl_stats_dump},
+ {.msgtype = RTM_SETSTATS, .doit = rtnl_stats_set},
+ {.msgtype = RTM_NEWLINKPROP, .doit = rtnl_newlinkprop},
+ {.msgtype = RTM_DELLINKPROP, .doit = rtnl_dellinkprop},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_GETLINK,
+ .dumpit = rtnl_bridge_getlink},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_DELLINK,
+ .doit = rtnl_bridge_dellink},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_SETLINK,
+ .doit = rtnl_bridge_setlink},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_NEWNEIGH, .doit = rtnl_fdb_add},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_DELNEIGH, .doit = rtnl_fdb_del,
+ .flags = RTNL_FLAG_BULK_DEL_SUPPORTED},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_GETNEIGH, .doit = rtnl_fdb_get,
+ .dumpit = rtnl_fdb_dump},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_NEWMDB, .doit = rtnl_mdb_add},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_DELMDB, .doit = rtnl_mdb_del,
+ .flags = RTNL_FLAG_BULK_DEL_SUPPORTED},
+ {.protocol = PF_BRIDGE, .msgtype = RTM_GETMDB, .doit = rtnl_mdb_get,
+ .dumpit = rtnl_mdb_dump},
+}; /* passed to rtnl_register_many() in rtnetlink_init() */
+
void __init rtnetlink_init(void)
{
if (register_pernet_subsys(&rtnetlink_net_ops))
@@ -4828,24 +7099,5 @@ void __init rtnetlink_init(void)
register_netdevice_notifier(&rtnetlink_dev_notifier);
- rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink,
- rtnl_dump_ifinfo, 0);
- rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0);
-
- rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0);
- rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0);
- rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0);
-
- rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0);
- rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0);
- rtnl_register(PF_BRIDGE, RTM_GETNEIGH, NULL, rtnl_fdb_dump, 0);
-
- rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0);
- rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0);
- rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0);
-
- rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump,
- 0);
+ rtnl_register_many(rtnetlink_rtnl_msg_handlers);
}
diff --git a/net/core/scm.c b/net/core/scm.c
index b1ff8a441748..cd87f66671aa 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* scm.c - Socket level control messages processing.
*
* Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Alignment and value checking mods by Craig Metz
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
@@ -27,8 +23,12 @@
#include <linux/security.h>
#include <linux/pid_namespace.h>
#include <linux/pid.h>
+#include <uapi/linux/pidfd.h>
+#include <linux/pidfs.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
+#include <linux/errqueue.h>
+#include <linux/io_uring.h>
#include <linux/uaccess.h>
@@ -38,6 +38,7 @@
#include <net/compat.h>
#include <net/scm.h>
#include <net/cls_cgroup.h>
+#include <net/af_unix.h>
/*
@@ -82,13 +83,20 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
if (!fpl)
{
- fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
+ fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_ACCOUNT);
if (!fpl)
return -ENOMEM;
*fplp = fpl;
fpl->count = 0;
+ fpl->count_unix = 0;
fpl->max = SCM_MAX_FD;
fpl->user = NULL;
+#if IS_ENABLED(CONFIG_UNIX)
+ fpl->inflight = false;
+ fpl->dead = false;
+ fpl->edges = NULL;
+ INIT_LIST_HEAD(&fpl->vertices);
+#endif
}
fpp = &fpl->fp[fpl->count];
@@ -106,6 +114,14 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
if (fd < 0 || !(file = fget_raw(fd)))
return -EBADF;
+ /* don't allow io_uring files */
+ if (io_is_uring_fops(file)) {
+ fput(file);
+ return -EINVAL;
+ }
+ if (unix_get_socket(file))
+ fpl->count_unix++;
+
*fpp++ = file;
fpl->count++;
}
@@ -131,8 +147,25 @@ void __scm_destroy(struct scm_cookie *scm)
}
EXPORT_SYMBOL(__scm_destroy);
+static inline int scm_replace_pid(struct scm_cookie *scm, struct pid *pid)
+{
+ int err;
+ /* Make pid the cookie's pid; returns 0 or the error from pidfs_register_pid(). */
+ /* drop all previous references */
+ scm_destroy_cred(scm);
+ /* On failure pid is not stored in scm; the caller in __scm_send() puts its own reference. */
+ err = pidfs_register_pid(pid);
+ if (unlikely(err))
+ return err;
+
+ scm->pid = pid;
+ scm->creds.pid = pid_vnr(pid);
+ return 0;
+}
+
int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
{
+ const struct proto_ops *ops = READ_ONCE(sock->ops);
struct cmsghdr *cmsg;
int err;
@@ -156,7 +189,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
switch (cmsg->cmsg_type)
{
case SCM_RIGHTS:
- if (!sock->ops || sock->ops->family != PF_UNIX)
+ if (!ops || ops->family != PF_UNIX)
goto error;
err=scm_fp_copy(cmsg, &p->fp);
if (err<0)
@@ -174,15 +207,21 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
if (err)
goto error;
- p->creds.pid = creds.pid;
if (!p->pid || pid_vnr(p->pid) != creds.pid) {
struct pid *pid;
err = -ESRCH;
pid = find_get_pid(creds.pid);
if (!pid)
goto error;
- put_pid(p->pid);
- p->pid = pid;
+
+ /* pass a struct pid reference from
+ * find_get_pid() to scm_replace_pid().
+ */
+ err = scm_replace_pid(p, pid);
+ if (err) {
+ put_pid(pid);
+ goto error;
+ }
}
err = -EINVAL;
@@ -215,16 +254,12 @@ EXPORT_SYMBOL(__scm_send);
int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
{
- struct cmsghdr __user *cm
- = (__force struct cmsghdr __user *)msg->msg_control;
- struct cmsghdr cmhdr;
int cmlen = CMSG_LEN(len);
- int err;
- if (MSG_CMSG_COMPAT & msg->msg_flags)
+ if (msg->msg_flags & MSG_CMSG_COMPAT)
return put_cmsg_compat(msg, level, type, len, data);
- if (cm==NULL || msg->msg_controllen < sizeof(*cm)) {
+ if (!msg->msg_control || msg->msg_controllen < sizeof(struct cmsghdr)) {
msg->msg_flags |= MSG_CTRUNC;
return 0; /* XXX: return error? check spec. */
}
@@ -232,98 +267,133 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
msg->msg_flags |= MSG_CTRUNC;
cmlen = msg->msg_controllen;
}
- cmhdr.cmsg_level = level;
- cmhdr.cmsg_type = type;
- cmhdr.cmsg_len = cmlen;
-
- err = -EFAULT;
- if (copy_to_user(cm, &cmhdr, sizeof cmhdr))
- goto out;
- if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr)))
- goto out;
- cmlen = CMSG_SPACE(len);
- if (msg->msg_controllen < cmlen)
- cmlen = msg->msg_controllen;
- msg->msg_control += cmlen;
+
+ if (msg->msg_control_is_user) {
+ struct cmsghdr __user *cm = msg->msg_control_user;
+
+ check_object_size(data, cmlen - sizeof(*cm), true);
+
+ scoped_user_write_access_size(cm, cmlen, efault) {
+ unsafe_put_user(cmlen, &cm->cmsg_len, efault);
+ unsafe_put_user(level, &cm->cmsg_level, efault);
+ unsafe_put_user(type, &cm->cmsg_type, efault);
+ unsafe_copy_to_user(CMSG_USER_DATA(cm), data,
+ cmlen - sizeof(*cm), efault);
+ }
+ } else {
+ struct cmsghdr *cm = msg->msg_control;
+
+ cm->cmsg_level = level;
+ cm->cmsg_type = type;
+ cm->cmsg_len = cmlen;
+ memcpy(CMSG_DATA(cm), data, cmlen - sizeof(*cm));
+ }
+
+ cmlen = min(CMSG_SPACE(len), msg->msg_controllen);
+ if (msg->msg_control_is_user)
+ msg->msg_control_user += cmlen;
+ else
+ msg->msg_control += cmlen;
msg->msg_controllen -= cmlen;
- err = 0;
-out:
- return err;
+ return 0;
+
+efault:
+ return -EFAULT;
}
EXPORT_SYMBOL(put_cmsg);
-void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
+int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len,
+ void *data)
{
- struct cmsghdr __user *cm
- = (__force struct cmsghdr __user*)msg->msg_control;
+ /* Don't produce truncated CMSGs */
+ if (!msg->msg_control || msg->msg_controllen < CMSG_LEN(len))
+ return -ETOOSMALL;
- int fdmax = 0;
- int fdnum = scm->fp->count;
- struct file **fp = scm->fp->fp;
- int __user *cmfptr;
+ return put_cmsg(msg, level, type, len, data);
+}
+/* Copy tss_internal into a struct scm_timestamping64 and emit it as SO_TIMESTAMPING_NEW. */
+void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal)
+{
+ struct scm_timestamping64 tss;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(tss.ts); i++) {
+ tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec;
+ tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec;
+ }
+ /* The put_cmsg() return value is not checked here. */
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_NEW, sizeof(tss), &tss);
+}
+EXPORT_SYMBOL(put_cmsg_scm_timestamping64);
+/* Copy tss_internal into a struct scm_timestamping and emit it as SO_TIMESTAMPING_OLD. */
+void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_internal *tss_internal)
+{
+ struct scm_timestamping tss;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(tss.ts); i++) {
+ tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec;
+ tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec;
+ }
+ /* The put_cmsg() return value is not checked here. */
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_OLD, sizeof(tss), &tss);
+}
+EXPORT_SYMBOL(put_cmsg_scm_timestamping);
+/* Number of ints that fit in msg_controllen after one struct cmsghdr; 0 if none fit. */
+static int scm_max_fds(struct msghdr *msg)
+{
+ if (msg->msg_controllen <= sizeof(struct cmsghdr))
+ return 0;
+ return (msg->msg_controllen - sizeof(struct cmsghdr)) / sizeof(int);
+}
+
+void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct cmsghdr __user *cm =
+ (__force struct cmsghdr __user *)msg->msg_control_user;
+ unsigned int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0;
+ int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count);
+ int __user *cmsg_data = CMSG_USER_DATA(cm);
int err = 0, i;
- if (MSG_CMSG_COMPAT & msg->msg_flags) {
+ /* no use for FD passing from kernel space callers */
+ if (WARN_ON_ONCE(!msg->msg_control_is_user))
+ return;
+
+ if (msg->msg_flags & MSG_CMSG_COMPAT) {
scm_detach_fds_compat(msg, scm);
return;
}
- if (msg->msg_controllen > sizeof(struct cmsghdr))
- fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr))
- / sizeof(int));
-
- if (fdnum < fdmax)
- fdmax = fdnum;
-
- for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
- i++, cmfptr++)
- {
- struct socket *sock;
- int new_fd;
- err = security_file_receive(fp[i]);
- if (err)
- break;
- err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags
- ? O_CLOEXEC : 0);
+ for (i = 0; i < fdmax; i++) {
+ err = scm_recv_one_fd(scm->fp->fp[i], cmsg_data + i, o_flags);
if (err < 0)
break;
- new_fd = err;
- err = put_user(new_fd, cmfptr);
- if (err) {
- put_unused_fd(new_fd);
- break;
- }
- /* Bump the usage count and install the file. */
- sock = sock_from_file(fp[i], &err);
- if (sock) {
- sock_update_netprioidx(&sock->sk->sk_cgrp_data);
- sock_update_classid(&sock->sk->sk_cgrp_data);
- }
- fd_install(new_fd, get_file(fp[i]));
}
- if (i > 0)
- {
- int cmlen = CMSG_LEN(i*sizeof(int));
+ if (i > 0) {
+ int cmlen = CMSG_LEN(i * sizeof(int));
+
err = put_user(SOL_SOCKET, &cm->cmsg_level);
if (!err)
err = put_user(SCM_RIGHTS, &cm->cmsg_type);
if (!err)
err = put_user(cmlen, &cm->cmsg_len);
if (!err) {
- cmlen = CMSG_SPACE(i*sizeof(int));
+ cmlen = CMSG_SPACE(i * sizeof(int));
if (msg->msg_controllen < cmlen)
cmlen = msg->msg_controllen;
- msg->msg_control += cmlen;
+ msg->msg_control_user += cmlen;
msg->msg_controllen -= cmlen;
}
}
- if (i < fdnum || (fdnum && fdmax <= 0))
+
+ if (i < scm->fp->count || (scm->fp->count && fdmax <= 0))
msg->msg_flags |= MSG_CTRUNC;
/*
- * All of the files that fit in the message have had their
- * usage counts incremented, so we just free the list.
+ * All of the files that fit in the message have had their usage counts
+ * incremented, so we just free the list.
*/
__scm_destroy(scm);
}
@@ -338,13 +408,141 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
return NULL;
new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (new_fpl) {
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
+
new_fpl->max = new_fpl->count;
new_fpl->user = get_uid(fpl->user);
+#if IS_ENABLED(CONFIG_UNIX)
+ new_fpl->inflight = false;
+ new_fpl->edges = NULL;
+ INIT_LIST_HEAD(&new_fpl->vertices);
+#endif
}
return new_fpl;
}
EXPORT_SYMBOL(scm_fp_dup);
+/* SCM_SECURITY helpers; without CONFIG_SECURITY_NETWORK they do nothing / return false. */
+#ifdef CONFIG_SECURITY_NETWORK
+static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct lsm_context ctx;
+ int err;
+
+ if (sk->sk_scm_security) {
+ err = security_secid_to_secctx(scm->secid, &ctx);
+ /* Emit the context only if the lookup succeeded; the put_cmsg() result is not checked. */
+ if (err >= 0) {
+ put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, ctx.len,
+ ctx.context);
+
+ security_release_secctx(&ctx);
+ }
+ }
+}
+
+static bool scm_has_secdata(struct sock *sk)
+{
+ return sk->sk_scm_security;
+}
+#else
+static void scm_passec(struct sock *sk, struct msghdr *msg, struct scm_cookie *scm)
+{
+}
+
+static bool scm_has_secdata(struct sock *sk)
+{
+ return false;
+}
+#endif
+/* Emit an SCM_PIDFD control message for scm->pid and install the new fd once it is written. */
+static void scm_pidfd_recv(struct msghdr *msg, struct scm_cookie *scm)
+{
+ struct file *pidfd_file = NULL;
+ int len, pidfd;
+
+ /* put_cmsg() doesn't return an error if CMSG is truncated,
+ * that's why we need to opencode these checks here.
+ */
+ if (msg->msg_flags & MSG_CMSG_COMPAT)
+ len = sizeof(struct compat_cmsghdr) + sizeof(int);
+ else
+ len = sizeof(struct cmsghdr) + sizeof(int);
+
+ if (msg->msg_controllen < len) {
+ msg->msg_flags |= MSG_CTRUNC;
+ return;
+ }
+
+ if (!scm->pid)
+ return;
+
+ pidfd = pidfd_prepare(scm->pid, PIDFD_STALE, &pidfd_file);
+ /* NOTE(review): pidfd is sent as-is even when pidfd_file stays NULL (presumably an error value). */
+ if (put_cmsg(msg, SOL_SOCKET, SCM_PIDFD, sizeof(int), &pidfd)) {
+ if (pidfd_file) {
+ put_unused_fd(pidfd);
+ fput(pidfd_file);
+ }
+
+ return;
+ }
+ /* The cmsg was written; install the file under the reserved fd number. */
+ if (pidfd_file)
+ fd_install(pidfd, pidfd_file);
+}
+/* Emit credentials, security context and fds from scm as cmsgs; false if msg has no control buffer. */
+static bool __scm_recv_common(struct sock *sk, struct msghdr *msg,
+ struct scm_cookie *scm, int flags)
+{
+ if (!msg->msg_control) { /* nothing can be delivered; flag truncation if something was due */
+ if (sk->sk_scm_credentials || sk->sk_scm_pidfd ||
+ scm->fp || scm_has_secdata(sk))
+ msg->msg_flags |= MSG_CTRUNC;
+
+ scm_destroy(scm);
+ return false;
+ }
+
+ if (sk->sk_scm_credentials) {
+ struct user_namespace *current_ns = current_user_ns();
+ struct ucred ucreds = {
+ .pid = scm->creds.pid,
+ .uid = from_kuid_munged(current_ns, scm->creds.uid),
+ .gid = from_kgid_munged(current_ns, scm->creds.gid),
+ };
+
+ put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds);
+ }
+ /* NOTE(review): the flags parameter is not used anywhere in this function. */
+ scm_passec(sk, msg, scm);
+
+ if (scm->fp)
+ scm_detach_fds(msg, scm);
+
+ return true;
+}
+/* Receive path: run __scm_recv_common(), then scm_destroy_cred() unless it returned false. */
+void scm_recv(struct socket *sock, struct msghdr *msg,
+ struct scm_cookie *scm, int flags)
+{
+ if (!__scm_recv_common(sock->sk, msg, scm, flags))
+ return;
+
+ scm_destroy_cred(scm);
+}
+EXPORT_SYMBOL(scm_recv);
+/* Like scm_recv(), but also emits SCM_PIDFD when sk_scm_pidfd is set on the socket. */
+void scm_recv_unix(struct socket *sock, struct msghdr *msg,
+ struct scm_cookie *scm, int flags)
+{
+ if (!__scm_recv_common(sock->sk, msg, scm, flags))
+ return;
+ /* Runs before scm_destroy_cred() below because scm_pidfd_recv() reads scm->pid. */
+ if (sock->sk->sk_scm_pidfd)
+ scm_pidfd_recv(msg, scm);
+
+ scm_destroy_cred(scm);
+}
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index af6ad467ed61..9a3965680451 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -1,10 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include <linux/kernel.h>
#include <linux/init.h>
-#include <linux/cryptohash.h>
#include <linux/module.h>
#include <linux/cache.h>
#include <linux/random.h>
@@ -19,8 +19,10 @@
#include <linux/in6.h>
#include <net/tcp.h>
-static siphash_key_t net_secret __read_mostly;
-static siphash_key_t ts_secret __read_mostly;
+static siphash_aligned_key_t net_secret;
+static siphash_aligned_key_t ts_secret;
+
+#define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ)
static __always_inline void net_secret_init(void)
{
@@ -62,14 +64,14 @@ u32 secure_tcpv6_ts_off(const struct net *net,
.daddr = *(struct in6_addr *)daddr,
};
- if (net->ipv4.sysctl_tcp_timestamps != 1)
+ if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
return 0;
ts_secret_init();
return siphash(&combined, offsetofend(typeof(combined), daddr),
&ts_secret);
}
-EXPORT_SYMBOL(secure_tcpv6_ts_off);
+EXPORT_IPV6_MOD(secure_tcpv6_ts_off);
u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
__be16 sport, __be16 dport)
@@ -94,17 +96,19 @@ u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr,
}
EXPORT_SYMBOL(secure_tcpv6_seq);
-u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
+u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr,
__be16 dport)
{
const struct {
struct in6_addr saddr;
struct in6_addr daddr;
+ unsigned int timeseed;
__be16 dport;
} __aligned(SIPHASH_ALIGNMENT) combined = {
.saddr = *(struct in6_addr *)saddr,
.daddr = *(struct in6_addr *)daddr,
- .dport = dport
+ .timeseed = jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD,
+ .dport = dport,
};
net_secret_init();
return siphash(&combined, offsetofend(typeof(combined), dport),
@@ -116,7 +120,7 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral);
#ifdef CONFIG_INET
u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr)
{
- if (net->ipv4.sysctl_tcp_timestamps != 1)
+ if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1)
return 0;
ts_secret_init();
@@ -142,53 +146,13 @@ u32 secure_tcp_seq(__be32 saddr, __be32 daddr,
}
EXPORT_SYMBOL_GPL(secure_tcp_seq);
-u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
+u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
{
net_secret_init();
- return siphash_3u32((__force u32)saddr, (__force u32)daddr,
- (__force u16)dport, &net_secret);
+ return siphash_4u32((__force u32)saddr, (__force u32)daddr,
+ (__force u16)dport,
+ jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD,
+ &net_secret);
}
EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
#endif
-
-#if IS_ENABLED(CONFIG_IP_DCCP)
-u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
- __be16 sport, __be16 dport)
-{
- u64 seq;
- net_secret_init();
- seq = siphash_3u32((__force u32)saddr, (__force u32)daddr,
- (__force u32)sport << 16 | (__force u32)dport,
- &net_secret);
- seq += ktime_get_real_ns();
- seq &= (1ull << 48) - 1;
- return seq;
-}
-EXPORT_SYMBOL(secure_dccp_sequence_number);
-
-#if IS_ENABLED(CONFIG_IPV6)
-u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
- __be16 sport, __be16 dport)
-{
- const struct {
- struct in6_addr saddr;
- struct in6_addr daddr;
- __be16 sport;
- __be16 dport;
- } __aligned(SIPHASH_ALIGNMENT) combined = {
- .saddr = *(struct in6_addr *)saddr,
- .daddr = *(struct in6_addr *)daddr,
- .sport = sport,
- .dport = dport
- };
- u64 seq;
- net_secret_init();
- seq = siphash(&combined, offsetofend(typeof(combined), dport),
- &net_secret);
- seq += ktime_get_real_ns();
- seq &= (1ull << 48) - 1;
- return seq;
-}
-EXPORT_SYMBOL(secure_dccpv6_sequence_number);
-#endif
-#endif
diff --git a/net/core/selftests.c b/net/core/selftests.c
new file mode 100644
index 000000000000..8b81feb82c4a
--- /dev/null
+++ b/net/core/selftests.c
@@ -0,0 +1,448 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Synopsys, Inc. and/or its affiliates.
+ * stmmac Selftests Support
+ *
+ * Author: Jose Abreu <joabreu@synopsys.com>
+ *
+ * Ported from stmmac by:
+ * Copyright (C) 2021 Oleksij Rempel <o.rempel@pengutronix.de>
+ */
+
+#include <linux/phy.h>
+#include <net/selftests.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+static u8 net_test_next_id;
+
+struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id,
+ struct net_packet_attrs *attr)
+{
+ struct sk_buff *skb = NULL;
+ struct udphdr *uhdr = NULL;
+ struct tcphdr *thdr = NULL;
+ struct netsfhdr *shdr;
+ struct ethhdr *ehdr;
+ struct iphdr *ihdr;
+ int iplen, size;
+
+ size = attr->size + NET_TEST_PKT_SIZE;
+
+ if (attr->tcp)
+ size += sizeof(struct tcphdr);
+ else
+ size += sizeof(struct udphdr);
+
+ if (attr->max_size && attr->max_size > size)
+ size = attr->max_size;
+
+ skb = netdev_alloc_skb(ndev, size);
+ if (!skb)
+ return NULL;
+
+ prefetchw(skb->data);
+
+ ehdr = skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+
+ skb_set_network_header(skb, skb->len);
+ ihdr = skb_put(skb, sizeof(*ihdr));
+
+ skb_set_transport_header(skb, skb->len);
+ if (attr->tcp)
+ thdr = skb_put(skb, sizeof(*thdr));
+ else
+ uhdr = skb_put(skb, sizeof(*uhdr));
+
+ eth_zero_addr(ehdr->h_dest);
+
+ if (attr->src)
+ ether_addr_copy(ehdr->h_source, attr->src);
+ if (attr->dst)
+ ether_addr_copy(ehdr->h_dest, attr->dst);
+
+ ehdr->h_proto = htons(ETH_P_IP);
+
+ if (attr->tcp) {
+ memset(thdr, 0, sizeof(*thdr));
+ thdr->source = htons(attr->sport);
+ thdr->dest = htons(attr->dport);
+ thdr->doff = sizeof(struct tcphdr) / 4;
+ } else {
+ uhdr->source = htons(attr->sport);
+ uhdr->dest = htons(attr->dport);
+ uhdr->len = htons(sizeof(*shdr) + sizeof(*uhdr) + attr->size);
+ if (attr->max_size)
+ uhdr->len = htons(attr->max_size -
+ (sizeof(*ihdr) + sizeof(*ehdr)));
+ uhdr->check = 0;
+ }
+
+ ihdr->ihl = 5;
+ ihdr->ttl = 32;
+ ihdr->version = 4;
+ if (attr->tcp)
+ ihdr->protocol = IPPROTO_TCP;
+ else
+ ihdr->protocol = IPPROTO_UDP;
+ iplen = sizeof(*ihdr) + sizeof(*shdr) + attr->size;
+ if (attr->tcp)
+ iplen += sizeof(*thdr);
+ else
+ iplen += sizeof(*uhdr);
+
+ if (attr->max_size)
+ iplen = attr->max_size - sizeof(*ehdr);
+
+ ihdr->tot_len = htons(iplen);
+ ihdr->frag_off = 0;
+ ihdr->saddr = htonl(attr->ip_src);
+ ihdr->daddr = htonl(attr->ip_dst);
+ ihdr->tos = 0;
+ ihdr->id = 0;
+ ip_send_check(ihdr);
+
+ shdr = skb_put(skb, sizeof(*shdr));
+ shdr->version = 0;
+ shdr->magic = cpu_to_be64(NET_TEST_PKT_MAGIC);
+ attr->id = id;
+ shdr->id = id;
+
+ if (attr->size) {
+ void *payload = skb_put(skb, attr->size);
+
+ memset(payload, 0, attr->size);
+ }
+
+ if (attr->max_size && attr->max_size > skb->len) {
+ size_t pad_len = attr->max_size - skb->len;
+ void *pad = skb_put(skb, pad_len);
+
+ memset(pad, 0, pad_len);
+ }
+
+ skb->csum = 0;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ if (attr->tcp) {
+ int l4len = skb->len - skb_transport_offset(skb);
+
+ thdr->check = ~tcp_v4_check(l4len, ihdr->saddr, ihdr->daddr, 0);
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct tcphdr, check);
+
+ if (attr->bad_csum) {
+ /* Force mangled checksum */
+ if (skb_checksum_help(skb)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ if (thdr->check != CSUM_MANGLED_0)
+ thdr->check = CSUM_MANGLED_0;
+ else
+ thdr->check = csum16_sub(thdr->check,
+ cpu_to_be16(1));
+ }
+ } else {
+ udp4_hwcsum(skb, ihdr->saddr, ihdr->daddr);
+ }
+
+ skb->protocol = htons(ETH_P_IP);
+ skb->pkt_type = PACKET_HOST;
+ skb->dev = ndev;
+
+ return skb;
+}
+EXPORT_SYMBOL_GPL(net_test_get_skb);
+
+static int net_test_loopback_validate(struct sk_buff *skb,
+ struct net_device *ndev,
+ struct packet_type *pt,
+ struct net_device *orig_ndev)
+{
+ struct net_test_priv *tpriv = pt->af_packet_priv;
+ const unsigned char *src = tpriv->packet->src;
+ const unsigned char *dst = tpriv->packet->dst;
+ struct netsfhdr *shdr;
+ struct ethhdr *ehdr;
+ struct udphdr *uhdr;
+ struct tcphdr *thdr;
+ struct iphdr *ihdr;
+
+ skb = skb_unshare(skb, GFP_ATOMIC);
+ if (!skb)
+ goto out;
+
+ if (skb_linearize(skb))
+ goto out;
+ if (skb_headlen(skb) < (NET_TEST_PKT_SIZE - ETH_HLEN)) /* too short for the test headers */
+ goto out;
+
+ ehdr = (struct ethhdr *)skb_mac_header(skb);
+ if (dst) { /* MAC addresses are only compared when the test set them */
+ if (!ether_addr_equal_unaligned(ehdr->h_dest, dst))
+ goto out;
+ }
+
+ if (src) {
+ if (!ether_addr_equal_unaligned(ehdr->h_source, src))
+ goto out;
+ }
+
+ ihdr = ip_hdr(skb);
+ if (tpriv->double_vlan)
+ ihdr = (struct iphdr *)(skb_network_header(skb) + 4); /* presumably skips a second VLAN tag - confirm */
+
+ if (tpriv->packet->tcp) {
+ if (ihdr->protocol != IPPROTO_TCP)
+ goto out;
+
+ thdr = (struct tcphdr *)((u8 *)ihdr + 4 * ihdr->ihl); /* L4 header follows ihl * 4 bytes of IP header */
+ if (thdr->dest != htons(tpriv->packet->dport))
+ goto out;
+
+ shdr = (struct netsfhdr *)((u8 *)thdr + sizeof(*thdr));
+ } else {
+ if (ihdr->protocol != IPPROTO_UDP)
+ goto out;
+
+ uhdr = (struct udphdr *)((u8 *)ihdr + 4 * ihdr->ihl);
+ if (uhdr->dest != htons(tpriv->packet->dport))
+ goto out;
+
+ shdr = (struct netsfhdr *)((u8 *)uhdr + sizeof(*uhdr));
+ }
+
+ if (shdr->magic != cpu_to_be64(NET_TEST_PKT_MAGIC)) /* not a selftest frame */
+ goto out;
+ if (tpriv->packet->id != shdr->id) /* id differs from the packet this test sent */
+ goto out;
+
+ if (tpriv->packet->bad_csum && skb->ip_summed == CHECKSUM_UNNECESSARY)
+ tpriv->ok = -EIO; /* broken checksum was reported as verified */
+ else
+ tpriv->ok = true;
+
+ complete(&tpriv->comp); /* wakes the waiter in __net_test_loopback() */
+out:
+ kfree_skb(skb); /* the skb is released on every path */
+ return 0;
+}
+
+static int __net_test_loopback(struct net_device *ndev,
+ struct net_packet_attrs *attr)
+{
+ struct net_test_priv *tpriv;
+ struct sk_buff *skb = NULL;
+ int ret = 0;
+
+ tpriv = kzalloc(sizeof(*tpriv), GFP_KERNEL);
+ if (!tpriv)
+ return -ENOMEM;
+
+ tpriv->ok = false; /* stays false until the validator sees our frame */
+ init_completion(&tpriv->comp);
+
+ tpriv->pt.type = htons(ETH_P_IP);
+ tpriv->pt.func = net_test_loopback_validate;
+ tpriv->pt.dev = ndev;
+ tpriv->pt.af_packet_priv = tpriv;
+ tpriv->packet = attr;
+ dev_add_pack(&tpriv->pt); /* RX hook is registered before the frame is sent */
+
+ skb = net_test_get_skb(ndev, net_test_next_id, attr);
+ if (!skb) {
+ ret = -ENOMEM;
+ goto cleanup;
+ }
+
+ net_test_next_id++;
+ ret = dev_direct_xmit(skb, attr->queue_mapping);
+ if (ret < 0) {
+ goto cleanup;
+ } else if (ret > 0) {
+ ret = -ENETUNREACH; /* positive xmit status is mapped to an errno */
+ goto cleanup;
+ }
+
+ if (!attr->timeout)
+ attr->timeout = NET_LB_TIMEOUT;
+
+ wait_for_completion_timeout(&tpriv->comp, attr->timeout); /* completed by net_test_loopback_validate() */
+ if (tpriv->ok < 0) /* validator stored a negative errno */
+ ret = tpriv->ok;
+ else if (!tpriv->ok) /* still false: no matching frame was validated */
+ ret = -ETIMEDOUT;
+ else
+ ret = 0;
+
+cleanup:
+ dev_remove_pack(&tpriv->pt); /* unhook before tpriv is freed */
+ kfree(tpriv);
+ return ret;
+}
+
+static int net_test_netif_carrier(struct net_device *ndev)
+{
+ return netif_carrier_ok(ndev) ? 0 : -ENOLINK;
+}
+
+static int net_test_phy_phydev(struct net_device *ndev)
+{
+ return ndev->phydev ? 0 : -EOPNOTSUPP;
+}
+
+static int net_test_phy_loopback_enable(struct net_device *ndev)
+{
+ if (!ndev->phydev)
+ return -EOPNOTSUPP;
+
+ return phy_loopback(ndev->phydev, true, 0);
+}
+
+static int net_test_phy_loopback_disable(struct net_device *ndev)
+{
+ if (!ndev->phydev)
+ return -EOPNOTSUPP;
+
+ return phy_loopback(ndev->phydev, false, 0);
+}
+
+static int net_test_phy_loopback_udp(struct net_device *ndev)
+{
+ struct net_packet_attrs attr = { };
+
+ attr.dst = ndev->dev_addr;
+ return __net_test_loopback(ndev, &attr);
+}
+
+static int net_test_phy_loopback_udp_mtu(struct net_device *ndev)
+{
+ struct net_packet_attrs attr = { };
+
+ attr.dst = ndev->dev_addr;
+ attr.max_size = ndev->mtu;
+ return __net_test_loopback(ndev, &attr);
+}
+
+static int net_test_phy_loopback_tcp(struct net_device *ndev)
+{
+ struct net_packet_attrs attr = { };
+
+ attr.dst = ndev->dev_addr;
+ attr.tcp = true;
+ return __net_test_loopback(ndev, &attr);
+}
+
+/**
+ * net_test_phy_loopback_tcp_bad_csum - PHY loopback test with a deliberately
+ * corrupted TCP checksum
+ * @ndev: the network device to test
+ *
+ * Builds the same Ethernet/IPv4/TCP frame as net_test_phy_loopback_tcp(), but
+ * with attr.bad_csum set: net_test_get_skb() replaces the TCP checksum with
+ * CSUM_MANGLED_0, or with check - 1 if it already equals CSUM_MANGLED_0.
+ * The frame is transmitted through the device’s internal PHY loopback path:
+ *
+ * test code -> MAC driver -> MAC HW -> xMII -> PHY ->
+ * internal PHY loopback -> xMII -> MAC HW -> MAC driver -> test code
+ *
+ * Result interpretation
+ * ---------------------
+ * 0 The frame is delivered to the stack and the driver reports
+ * ip_summed as CHECKSUM_NONE or CHECKSUM_COMPLETE - both are
+ * valid ways to indicate “bad checksum, let the stack verify.”
+ * -ETIMEDOUT The MAC/PHY silently dropped the frame; hardware checksum
+ * verification filtered it out before the driver saw it.
+ * -EIO The driver returned the frame with ip_summed ==
+ * CHECKSUM_UNNECESSARY, falsely claiming a valid checksum and
+ * indicating a serious RX-path defect.
+ *
+ * Return: 0 on success or a negative error code on failure.
+ */
+static int net_test_phy_loopback_tcp_bad_csum(struct net_device *ndev)
+{
+ struct net_packet_attrs attr = { };
+
+ attr.dst = ndev->dev_addr;
+ attr.tcp = true;
+ attr.bad_csum = true;
+ return __net_test_loopback(ndev, &attr);
+}
+
+static const struct net_test {
+ char name[ETH_GSTRING_LEN];
+ int (*fn)(struct net_device *ndev);
+} net_selftests[] = {
+ {
+ .name = "Carrier ",
+ .fn = net_test_netif_carrier,
+ }, {
+ .name = "PHY dev is present ",
+ .fn = net_test_phy_phydev,
+ }, {
+ /* This test must run before all PHY loopback tests */
+ .name = "PHY internal loopback, enable ",
+ .fn = net_test_phy_loopback_enable,
+ }, {
+ .name = "PHY internal loopback, UDP ",
+ .fn = net_test_phy_loopback_udp,
+ }, {
+ .name = "PHY internal loopback, MTU ",
+ .fn = net_test_phy_loopback_udp_mtu,
+ }, {
+ .name = "PHY internal loopback, TCP ",
+ .fn = net_test_phy_loopback_tcp,
+ }, {
+ .name = "PHY loopback, bad TCP csum ",
+ .fn = net_test_phy_loopback_tcp_bad_csum,
+ }, {
+ /* This test must run after all PHY loopback tests */
+ .name = "PHY internal loopback, disable",
+ .fn = net_test_phy_loopback_disable,
+ },
+};
+
+void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf)
+{
+ int count = net_selftest_get_count();
+ int i;
+
+ memset(buf, 0, sizeof(*buf) * count);
+ net_test_next_id = 0; /* packet ids restart with every selftest run */
+
+ if (etest->flags != ETH_TEST_FL_OFFLINE) {
+ netdev_err(ndev, "Only offline tests are supported\n");
+ etest->flags |= ETH_TEST_FL_FAILED;
+ return;
+ }
+
+ /* Run the table in order; -EOPNOTSUPP results do not set FAILED. */
+ for (i = 0; i < count; i++) {
+ buf[i] = net_selftests[i].fn(ndev);
+ if (buf[i] && (buf[i] != -EOPNOTSUPP))
+ etest->flags |= ETH_TEST_FL_FAILED;
+ }
+}
+EXPORT_SYMBOL_GPL(net_selftest);
+
+int net_selftest_get_count(void)
+{
+ return ARRAY_SIZE(net_selftests);
+}
+EXPORT_SYMBOL_GPL(net_selftest_get_count);
+
+void net_selftest_get_strings(u8 *data)
+{
+ int i;
+
+ for (i = 0; i < net_selftest_get_count(); i++)
+ ethtool_sprintf(&data, "%2d. %s", i + 1,
+ net_selftests[i].name);
+}
+EXPORT_SYMBOL_GPL(net_selftest_get_strings);
+
+MODULE_DESCRIPTION("Common library for generic PHY ethtool selftests");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Oleksij Rempel <o.rempel@pengutronix.de>");
diff --git a/net/core/skb_fault_injection.c b/net/core/skb_fault_injection.c
new file mode 100644
index 000000000000..4235db6bdfad
--- /dev/null
+++ b/net/core/skb_fault_injection.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/debugfs.h>
+#include <linux/fault-inject.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+
+static struct {
+ struct fault_attr attr;
+ char devname[IFNAMSIZ];
+ bool filtered;
+} skb_realloc = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .filtered = false,
+};
+
+static bool should_fail_net_realloc_skb(struct sk_buff *skb)
+{
+ struct net_device *net = skb->dev;
+
+ if (skb_realloc.filtered &&
+ strncmp(net->name, skb_realloc.devname, IFNAMSIZ))
+ /* device name filter set, but names do not match */
+ return false;
+
+ if (!should_fail(&skb_realloc.attr, 1))
+ return false;
+
+ return true;
+}
+ALLOW_ERROR_INJECTION(should_fail_net_realloc_skb, TRUE);
+
+void skb_might_realloc(struct sk_buff *skb)
+{
+ if (!should_fail_net_realloc_skb(skb))
+ return;
+
+ pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+}
+EXPORT_SYMBOL(skb_might_realloc);
+
+static int __init fail_skb_realloc_setup(char *str)
+{
+ return setup_fault_attr(&skb_realloc.attr, str);
+}
+__setup("fail_skb_realloc=", fail_skb_realloc_setup);
+
+static void reset_settings(void)
+{
+ skb_realloc.filtered = false;
+ memset(&skb_realloc.devname, 0, IFNAMSIZ);
+}
+
+static ssize_t devname_write(struct file *file, const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ ssize_t ret;
+
+ reset_settings(); /* drop the previous filter before reading the new name */
+ ret = simple_write_to_buffer(&skb_realloc.devname, IFNAMSIZ,
+ ppos, buffer, count);
+ if (ret < 0)
+ return ret;
+
+ skb_realloc.devname[IFNAMSIZ - 1] = '\0'; /* force NUL termination */
+ /* Remove a possible \n at the end of devname */
+ strim(skb_realloc.devname);
+
+ if (strnlen(skb_realloc.devname, IFNAMSIZ)) /* an empty name leaves filtering off */
+ skb_realloc.filtered = true;
+
+ return count; /* reports the full write size, not ret */
+}
+
+static ssize_t devname_read(struct file *file,
+ char __user *buffer,
+ size_t size, loff_t *ppos)
+{
+ if (!skb_realloc.filtered)
+ return 0;
+
+ return simple_read_from_buffer(buffer, size, ppos, &skb_realloc.devname,
+ strlen(skb_realloc.devname));
+}
+
+static const struct file_operations devname_ops = {
+ .write = devname_write,
+ .read = devname_read,
+};
+
+static int __init fail_skb_realloc_debugfs(void)
+{
+ umode_t mode = S_IFREG | 0600;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_skb_realloc", NULL,
+ &skb_realloc.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ debugfs_create_file("devname", mode, dir, NULL, &devname_ops);
+
+ return 0;
+}
+
+late_initcall(fail_skb_realloc_debugfs);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b2c807f67aba..a00808f7be6a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Routines having to do with the 'struct sk_buff' memory handlers.
*
@@ -25,11 +26,6 @@
* disabled, or you better be *real* sure that the operation is atomic
* with respect to whatever list is being frobbed (e.g. via lock_sock()
* or via disabling bottom half handlers, etc).
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/*
@@ -55,6 +51,7 @@
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
+#include <linux/skbuff_ref.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
@@ -62,25 +59,137 @@
#include <linux/scatterlist.h>
#include <linux/errqueue.h>
#include <linux/prefetch.h>
+#include <linux/bitfield.h>
#include <linux/if_vlan.h>
+#include <linux/mpls.h>
+#include <linux/kcov.h>
+#include <linux/iov_iter.h>
+#include <linux/crc32.h>
#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
+#include <net/gro.h>
+#include <net/gso.h>
+#include <net/hotdata.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
+#include <net/mpls.h>
+#include <net/mptcp.h>
+#include <net/mctp.h>
+#include <net/page_pool/helpers.h>
+#include <net/psp/types.h>
+#include <net/dropreason.h>
+#include <net/xdp_sock.h>
#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
+#include <linux/indirect_call_wrapper.h>
+#include <linux/textsearch.h>
+
+#include "dev.h"
+#include "devmem.h"
+#include "netmem_priv.h"
+#include "sock_destructor.h"
+
+#ifdef CONFIG_SKB_EXTENSIONS
+static struct kmem_cache *skbuff_ext_cache __ro_after_init;
+#endif
+
+#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN)
+#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \
+ GRO_MAX_HEAD_PAD))
+
+/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
+ * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
+ * size, and we can differentiate heads from skb_small_head_cache
+ * vs system slabs by looking at their size (skb_end_offset()).
+ */
+#define SKB_SMALL_HEAD_CACHE_SIZE \
+ (is_power_of_2(SKB_SMALL_HEAD_SIZE) ? \
+ (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) : \
+ SKB_SMALL_HEAD_SIZE)
+
+#define SKB_SMALL_HEAD_HEADROOM \
+ SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
+
+/* kcm_write_msgs() relies on casting paged frags to bio_vec to use
+ * iov_iter_bvec(). These static asserts ensure the cast is valid as long as the
+ * netmem is a page.
+ */
+static_assert(offsetof(struct bio_vec, bv_page) ==
+ offsetof(skb_frag_t, netmem));
+static_assert(sizeof_field(struct bio_vec, bv_page) ==
+ sizeof_field(skb_frag_t, netmem));
+
+static_assert(offsetof(struct bio_vec, bv_len) == offsetof(skb_frag_t, len));
+static_assert(sizeof_field(struct bio_vec, bv_len) ==
+ sizeof_field(skb_frag_t, len));
+
+static_assert(offsetof(struct bio_vec, bv_offset) ==
+ offsetof(skb_frag_t, offset));
+static_assert(sizeof_field(struct bio_vec, bv_offset) ==
+ sizeof_field(skb_frag_t, offset));
+
+#undef FN
+#define FN(reason) [SKB_DROP_REASON_##reason] = #reason,
+static const char * const drop_reasons[] = {
+ [SKB_CONSUMED] = "CONSUMED",
+ DEFINE_DROP_REASON(FN, FN)
+};
+
+static const struct drop_reason_list drop_reasons_core = {
+ .reasons = drop_reasons,
+ .n_reasons = ARRAY_SIZE(drop_reasons),
+};
+
+const struct drop_reason_list __rcu *
+drop_reasons_by_subsys[SKB_DROP_REASON_SUBSYS_NUM] = {
+ [SKB_DROP_REASON_SUBSYS_CORE] = RCU_INITIALIZER(&drop_reasons_core),
+};
+EXPORT_SYMBOL(drop_reasons_by_subsys);
+
+/**
+ * drop_reasons_register_subsys - register another drop reason subsystem
+ * @subsys: the subsystem to register, must not be the core
+ * @list: the list of drop reasons within the subsystem, must point to
+ * a statically initialized list
+ */
+void drop_reasons_register_subsys(enum skb_drop_reason_subsys subsys,
+ const struct drop_reason_list *list)
+{
+ if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
+ subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
+ "invalid subsystem %d\n", subsys))
+ return;
+
+ /* must point to statically allocated memory, so INIT is OK */
+ RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], list);
+}
+EXPORT_SYMBOL_GPL(drop_reasons_register_subsys);
-struct kmem_cache *skbuff_head_cache __ro_after_init;
-static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
-int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
-EXPORT_SYMBOL(sysctl_max_skb_frags);
+/**
+ * drop_reasons_unregister_subsys - unregister a drop reason subsystem
+ * @subsys: the subsystem to remove, must not be the core
+ *
+ * Note: This will synchronize_rcu() to ensure no users when it returns.
+ */
+void drop_reasons_unregister_subsys(enum skb_drop_reason_subsys subsys)
+{
+ if (WARN(subsys <= SKB_DROP_REASON_SUBSYS_CORE ||
+ subsys >= ARRAY_SIZE(drop_reasons_by_subsys),
+ "invalid subsystem %d\n", subsys))
+ return;
+
+ RCU_INIT_POINTER(drop_reasons_by_subsys[subsys], NULL);
+
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(drop_reasons_unregister_subsys);
/**
* skb_panic - private function for out-of-line support
@@ -97,7 +206,7 @@ EXPORT_SYMBOL(sysctl_max_skb_frags);
static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr,
const char msg[])
{
- pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n",
+ pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n",
msg, addr, skb->len, sz, skb->head, skb->data,
(unsigned long)skb->tail, (unsigned long)skb->end,
skb->dev ? skb->dev->name : "<NULL>");
@@ -114,154 +223,240 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
skb_panic(skb, sz, addr, __func__);
}
-/*
- * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
- * the caller if emergency pfmemalloc reserves are being used. If it is and
- * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
- * may be used. Otherwise, the packet data may be discarded until enough
- * memory is free
- */
-#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
- __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
+#define NAPI_SKB_CACHE_SIZE 128
+#define NAPI_SKB_CACHE_BULK 32
+#define NAPI_SKB_CACHE_FREE 32
-static void *__kmalloc_reserve(size_t size, gfp_t flags, int node,
- unsigned long ip, bool *pfmemalloc)
+struct napi_alloc_cache {
+ local_lock_t bh_lock;
+ struct page_frag_cache page;
+ unsigned int skb_count;
+ void *skb_cache[NAPI_SKB_CACHE_SIZE];
+};
+
+static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
+
+void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
- void *obj;
- bool ret_pfmemalloc = false;
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ void *data;
- /*
- * Try a regular allocation, when that fails and we're not entitled
- * to the reserves, fail.
- */
- obj = kmalloc_node_track_caller(size,
- flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
- node);
- if (obj || !(gfp_pfmemalloc_allowed(flags)))
- goto out;
+ fragsz = SKB_DATA_ALIGN(fragsz);
- /* Try again but now we are using pfmemalloc reserves */
- ret_pfmemalloc = true;
- obj = kmalloc_node_track_caller(size, flags, node);
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+ data = __page_frag_alloc_align(&nc->page, fragsz,
+ GFP_ATOMIC | __GFP_NOWARN, align_mask);
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+ return data;
-out:
- if (pfmemalloc)
- *pfmemalloc = ret_pfmemalloc;
+}
+EXPORT_SYMBOL(__napi_alloc_frag_align);
- return obj;
+void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
+{
+ void *data;
+
+ if (in_hardirq() || irqs_disabled()) {
+ struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
+
+ fragsz = SKB_DATA_ALIGN(fragsz);
+ data = __page_frag_alloc_align(nc, fragsz,
+ GFP_ATOMIC | __GFP_NOWARN,
+ align_mask);
+ } else {
+ local_bh_disable();
+ data = __napi_alloc_frag_align(fragsz, align_mask);
+ local_bh_enable();
+ }
+ return data;
}
+EXPORT_SYMBOL(__netdev_alloc_frag_align);
-/* Allocate a new skbuff. We do this ourselves so we can fill in a few
- * 'private' fields and also do memory statistics to find all the
- * [BEEP] leaks.
- *
+/* Cache kmem_cache_size(net_hotdata.skbuff_cache) to help the compiler
+ * remove dead code (and skbuff_cache_size) when CONFIG_KASAN is unset.
*/
+static u32 skbuff_cache_size __read_mostly;
+
+static struct sk_buff *napi_skb_cache_get(bool alloc)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct sk_buff *skb;
+
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+ if (unlikely(!nc->skb_count)) {
+ if (alloc)
+ nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN,
+ NAPI_SKB_CACHE_BULK,
+ nc->skb_cache);
+ if (unlikely(!nc->skb_count)) {
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+ return NULL;
+ }
+ }
+
+ skb = nc->skb_cache[--nc->skb_count];
+ if (nc->skb_count)
+ prefetch(nc->skb_cache[nc->skb_count - 1]);
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+ kasan_mempool_unpoison_object(skb, skbuff_cache_size);
+
+ return skb;
+}
/**
- * __alloc_skb - allocate a network buffer
- * @size: size to allocate
- * @gfp_mask: allocation mask
- * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
- * instead of head cache and allocate a cloned (child) skb.
- * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
- * allocations in case the data is required for writeback
- * @node: numa node to allocate memory on
+ * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
+ * @skbs: pointer to an at least @n-sized array to fill with skb pointers
+ * @n: number of entries to provide
*
- * Allocate a new &sk_buff. The returned buffer has no headroom and a
- * tail room of at least size bytes. The object has a reference count
- * of one. The return is the buffer. On a failure the return is %NULL.
+ * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes
+ * the pointers into the provided array @skbs. If there are less entries
+ * available, tries to replenish the cache and bulk-allocates the diff from
+ * the MM layer if needed.
+ * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are
+ * ready for {,__}build_skb_around() and don't have any data buffers attached.
+ * Must be called *only* from the BH context.
*
- * Buffers may only be allocated from interrupts using a @gfp_mask of
- * %GFP_ATOMIC.
+ * Return: number of successfully allocated skbs (@n if no actual allocation
+ * needed or kmem_cache_alloc_bulk() didn't fail).
*/
-struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
- int flags, int node)
+u32 napi_skb_cache_get_bulk(void **skbs, u32 n)
{
- struct kmem_cache *cache;
- struct skb_shared_info *shinfo;
- struct sk_buff *skb;
- u8 *data;
- bool pfmemalloc;
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ u32 bulk, total = n;
+
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+
+ if (nc->skb_count >= n)
+ goto get;
+
+ /* Not enough cached skbs. Try refilling the cache first */
+ bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK);
+ nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN, bulk,
+ &nc->skb_cache[nc->skb_count]);
+ if (likely(nc->skb_count >= n))
+ goto get;
+
+ /* Still not enough. Bulk-allocate the missing part directly, zeroed */
+ n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN,
+ n - nc->skb_count, &skbs[nc->skb_count]);
+ if (likely(nc->skb_count >= n))
+ goto get;
+
+ /* kmem_cache didn't allocate the number we need, limit the output */
+ total -= n - nc->skb_count;
+ n = nc->skb_count;
+
+get:
+ for (u32 base = nc->skb_count - n, i = 0; i < n; i++) {
+ skbs[i] = nc->skb_cache[base + i];
+
+ kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size);
+ memset(skbs[i], 0, offsetof(struct sk_buff, tail));
+ }
- cache = (flags & SKB_ALLOC_FCLONE)
- ? skbuff_fclone_cache : skbuff_head_cache;
+ nc->skb_count -= n;
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
- if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
- gfp_mask |= __GFP_MEMALLOC;
+ return total;
+}
+EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk);
- /* Get the HEAD */
- skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
- if (!skb)
- goto out;
- prefetchw(skb);
+static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
+ unsigned int size)
+{
+ struct skb_shared_info *shinfo;
- /* We do our best to align skb_shared_info on a separate cache
- * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
- * aligned memory blocks, unless SLUB/SLAB debug is enabled.
- * Both skb->head and skb_shared_info are cache line aligned.
- */
- size = SKB_DATA_ALIGN(size);
- size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
- if (!data)
- goto nodata;
- /* kmalloc(size) might give us more room than requested.
- * Put skb_shared_info exactly at the end of allocated zone,
- * to allow max possible filling before reallocation.
- */
- size = SKB_WITH_OVERHEAD(ksize(data));
- prefetchw(data + size);
+ size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- /*
- * Only clear those fields we need to clear, not those that we will
- * actually initialise below. Hence, don't put any more fields after
- * the tail pointer in struct sk_buff!
- */
- memset(skb, 0, offsetof(struct sk_buff, tail));
- /* Account for allocated memory : skb + skb->head */
+ /* Assumes caller memset cleared SKB */
skb->truesize = SKB_TRUESIZE(size);
- skb->pfmemalloc = pfmemalloc;
refcount_set(&skb->users, 1);
skb->head = data;
skb->data = data;
skb_reset_tail_pointer(skb);
- skb->end = skb->tail + size;
+ skb_set_end_offset(skb, size);
skb->mac_header = (typeof(skb->mac_header))~0U;
skb->transport_header = (typeof(skb->transport_header))~0U;
-
+ skb->alloc_cpu = raw_smp_processor_id();
/* make sure we initialize shinfo sequentially */
shinfo = skb_shinfo(skb);
memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
atomic_set(&shinfo->dataref, 1);
- if (flags & SKB_ALLOC_FCLONE) {
- struct sk_buff_fclones *fclones;
+ skb_set_kcov_handle(skb, kcov_common_handle());
+}
- fclones = container_of(skb, struct sk_buff_fclones, skb1);
+static inline void *__slab_build_skb(void *data, unsigned int *size)
+{
+ void *resized;
- skb->fclone = SKB_FCLONE_ORIG;
- refcount_set(&fclones->fclone_ref, 1);
+ /* Must find the allocation size (and grow it to match). */
+ *size = ksize(data);
+ /* krealloc() will immediately return "data" when
+ * "ksize(data)" is requested: it is the existing upper
+ * bounds. As a result, GFP_ATOMIC will be ignored. Note
+ * that this "new" pointer needs to be passed back to the
+ * caller for use so the __alloc_size hinting will be
+ * tracked correctly.
+ */
+ resized = krealloc(data, *size, GFP_ATOMIC);
+ WARN_ON_ONCE(resized != data);
+ return resized;
+}
+
+/* build_skb() variant which can operate on slab buffers.
+ * Note that this should be used sparingly as slab buffers
+ * cannot be combined efficiently by GRO!
+ */
+struct sk_buff *slab_build_skb(void *data)
+{
+ struct sk_buff *skb;
+ unsigned int size;
+
+ skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(!skb))
+ return NULL;
+
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ data = __slab_build_skb(data, &size);
+ __finalize_skb_around(skb, data, size);
- fclones->skb2.fclone = SKB_FCLONE_CLONE;
- }
-out:
return skb;
-nodata:
- kmem_cache_free(cache, skb);
- skb = NULL;
- goto out;
}
-EXPORT_SYMBOL(__alloc_skb);
+EXPORT_SYMBOL(slab_build_skb);
+
+/* Caller must provide SKB that is memset cleared */
+static void __build_skb_around(struct sk_buff *skb, void *data,
+ unsigned int frag_size)
+{
+ unsigned int size = frag_size;
+
+ /* frag_size == 0 is considered deprecated now. Callers
+ * using slab buffer should use slab_build_skb() instead.
+ */
+ if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
+ data = __slab_build_skb(data, &size);
+
+ __finalize_skb_around(skb, data, size);
+}
/**
* __build_skb - build a network buffer
* @data: data buffer provided by caller
- * @frag_size: size of data, or 0 if head was kmalloced
+ * @frag_size: size of data (must not be 0)
*
* Allocate a new &sk_buff. Caller provides space holding head and
- * skb_shared_info. @data must have been allocated by kmalloc() only if
- * @frag_size is 0, otherwise data should come from the page allocator
- * or vmalloc()
+ * skb_shared_info. @data must have been allocated from the page
+ * allocator or vmalloc(). (A @frag_size of 0 to indicate a kmalloc()
+ * allocation is deprecated, and callers should use slab_build_skb()
+ * instead.)
* The return is the new skb buffer.
* On a failure the return is %NULL, and @data is not freed.
* Notes :
@@ -274,101 +469,258 @@ EXPORT_SYMBOL(__alloc_skb);
*/
struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
- struct skb_shared_info *shinfo;
struct sk_buff *skb;
- unsigned int size = frag_size ? : ksize(data);
- skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
- if (!skb)
+ skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (unlikely(!skb))
return NULL;
- size -= SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-
memset(skb, 0, offsetof(struct sk_buff, tail));
- skb->truesize = SKB_TRUESIZE(size);
- refcount_set(&skb->users, 1);
- skb->head = data;
- skb->data = data;
- skb_reset_tail_pointer(skb);
- skb->end = skb->tail + size;
- skb->mac_header = (typeof(skb->mac_header))~0U;
- skb->transport_header = (typeof(skb->transport_header))~0U;
-
- /* make sure we initialize shinfo sequentially */
- shinfo = skb_shinfo(skb);
- memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
- atomic_set(&shinfo->dataref, 1);
+ __build_skb_around(skb, data, frag_size);
return skb;
}
/* build_skb() is wrapper over __build_skb(), that specifically
* takes care of skb->head and skb->pfmemalloc
- * This means that if @frag_size is not zero, then @data must be backed
- * by a page fragment, not kmalloc() or vmalloc()
*/
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
struct sk_buff *skb = __build_skb(data, frag_size);
- if (skb && frag_size) {
+ if (likely(skb && frag_size)) {
skb->head_frag = 1;
- if (page_is_pfmemalloc(virt_to_head_page(data)))
- skb->pfmemalloc = 1;
+ skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
}
return skb;
}
EXPORT_SYMBOL(build_skb);
-#define NAPI_SKB_CACHE_SIZE 64
+/**
+ * build_skb_around - build a network buffer around provided skb
+ * @skb: sk_buff provided by caller, must be memset cleared
+ * @data: data buffer provided by caller
+ * @frag_size: size of data
+ */
+struct sk_buff *build_skb_around(struct sk_buff *skb,
+ void *data, unsigned int frag_size)
+{
+ if (unlikely(!skb))
+ return NULL;
-struct napi_alloc_cache {
- struct page_frag_cache page;
- unsigned int skb_count;
- void *skb_cache[NAPI_SKB_CACHE_SIZE];
-};
+ __build_skb_around(skb, data, frag_size);
-static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
+ if (frag_size) {
+ skb->head_frag = 1;
+ skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
+ }
+ return skb;
+}
+EXPORT_SYMBOL(build_skb_around);
-static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+/**
+ * __napi_build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of data
+ *
+ * Version of __build_skb() that uses NAPI percpu caches to obtain
+ * skbuff_head instead of inplace allocation.
+ *
+ * Returns a new &sk_buff on success, %NULL on allocation failure.
+ */
+static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
- struct page_frag_cache *nc;
- unsigned long flags;
- void *data;
+ struct sk_buff *skb;
- local_irq_save(flags);
- nc = this_cpu_ptr(&netdev_alloc_cache);
- data = page_frag_alloc(nc, fragsz, gfp_mask);
- local_irq_restore(flags);
- return data;
+ skb = napi_skb_cache_get(true);
+ if (unlikely(!skb))
+ return NULL;
+
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ __build_skb_around(skb, data, frag_size);
+
+ return skb;
}
/**
- * netdev_alloc_frag - allocate a page fragment
- * @fragsz: fragment size
+ * napi_build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ * @frag_size: size of data
*
- * Allocates a frag from a page for receive buffer.
- * Uses GFP_ATOMIC allocations.
+ * Version of __napi_build_skb() that takes care of skb->head_frag
+ * and skb->pfmemalloc when the data is a page or page fragment.
+ *
+ * Returns a new &sk_buff on success, %NULL on allocation failure.
*/
-void *netdev_alloc_frag(unsigned int fragsz)
+struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
{
- return __netdev_alloc_frag(fragsz, GFP_ATOMIC);
+ struct sk_buff *skb = __napi_build_skb(data, frag_size);
+
+ if (likely(skb) && frag_size) {
+ skb->head_frag = 1;
+ skb_propagate_pfmemalloc(virt_to_head_page(data), skb);
+ }
+
+ return skb;
}
-EXPORT_SYMBOL(netdev_alloc_frag);
+EXPORT_SYMBOL(napi_build_skb);
-static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+/*
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
+ */
+static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
+ bool *pfmemalloc)
{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ bool ret_pfmemalloc = false;
+ size_t obj_size;
+ void *obj;
+
+ obj_size = SKB_HEAD_ALIGN(*size);
+ if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
+ !(flags & KMALLOC_NOT_NORMAL_BITS)) {
+ obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ *size = SKB_SMALL_HEAD_CACHE_SIZE;
+ if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ goto out;
+ /* Try again but now we are using pfmemalloc reserves */
+ ret_pfmemalloc = true;
+ obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
+ goto out;
+ }
+
+ obj_size = kmalloc_size_roundup(obj_size);
+ /* The following cast might truncate high-order bits of obj_size; this
+ * is harmless because kmalloc(obj_size >= 2^32) will fail anyway.
+ */
+ *size = (unsigned int)obj_size;
- return page_frag_alloc(&nc->page, fragsz, gfp_mask);
+ /*
+ * Try a regular allocation, when that fails and we're not entitled
+ * to the reserves, fail.
+ */
+ obj = kmalloc_node_track_caller(obj_size,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ goto out;
+
+ /* Try again but now we are using pfmemalloc reserves */
+ ret_pfmemalloc = true;
+ obj = kmalloc_node_track_caller(obj_size, flags, node);
+
+out:
+ if (pfmemalloc)
+ *pfmemalloc = ret_pfmemalloc;
+
+ return obj;
}
-void *napi_alloc_frag(unsigned int fragsz)
+/* Allocate a new skbuff. We do this ourselves so we can fill in a few
+ * 'private' fields and also do memory statistics to find all the
+ * [BEEP] leaks.
+ *
+ */
+
+/**
+ * __alloc_skb - allocate a network buffer
+ * @size: size to allocate
+ * @gfp_mask: allocation mask
+ * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ * instead of head cache and allocate a cloned (child) skb.
+ * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ * allocations in case the data is required for writeback
+ * @node: numa node to allocate memory on
+ *
+ * Allocate a new &sk_buff. The returned buffer has no headroom and a
+ * tail room of at least size bytes. The object has a reference count
+ * of one. The return is the buffer. On a failure the return is %NULL.
+ *
+ * Buffers may only be allocated from interrupts using a @gfp_mask of
+ * %GFP_ATOMIC.
+ */
+struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
+ int flags, int node)
{
- return __napi_alloc_frag(fragsz, GFP_ATOMIC);
+ struct sk_buff *skb = NULL;
+ struct kmem_cache *cache;
+ bool pfmemalloc;
+ u8 *data;
+
+ if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+ gfp_mask |= __GFP_MEMALLOC;
+
+ if (flags & SKB_ALLOC_FCLONE) {
+ cache = net_hotdata.skbuff_fclone_cache;
+ goto fallback;
+ }
+ cache = net_hotdata.skbuff_cache;
+ if (unlikely(node != NUMA_NO_NODE && node != numa_mem_id()))
+ goto fallback;
+
+ if (flags & SKB_ALLOC_NAPI) {
+ skb = napi_skb_cache_get(true);
+ if (unlikely(!skb))
+ return NULL;
+ } else if (!in_hardirq() && !irqs_disabled()) {
+ local_bh_disable();
+ skb = napi_skb_cache_get(false);
+ local_bh_enable();
+ }
+
+ if (!skb) {
+fallback:
+ skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
+ if (unlikely(!skb))
+ return NULL;
+ }
+ prefetchw(skb);
+
+ /* We do our best to align skb_shared_info on a separate cache
+ * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
+ * aligned memory blocks, unless SLUB/SLAB debug is enabled.
+ * Both skb->head and skb_shared_info are cache line aligned.
+ */
+ data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
+ if (unlikely(!data))
+ goto nodata;
+ /* kmalloc_size_roundup() might give us more room than requested.
+ * Put skb_shared_info exactly at the end of allocated zone,
+ * to allow max possible filling before reallocation.
+ */
+ prefetchw(data + SKB_WITH_OVERHEAD(size));
+
+ /*
+ * Only clear those fields we need to clear, not those that we will
+ * actually initialise below. Hence, don't put any more fields after
+ * the tail pointer in struct sk_buff!
+ */
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ __build_skb_around(skb, data, size);
+ skb->pfmemalloc = pfmemalloc;
+
+ if (flags & SKB_ALLOC_FCLONE) {
+ struct sk_buff_fclones *fclones;
+
+ fclones = container_of(skb, struct sk_buff_fclones, skb1);
+
+ skb->fclone = SKB_FCLONE_ORIG;
+ refcount_set(&fclones->fclone_ref, 1);
+ }
+
+ return skb;
+
+nodata:
+ kmem_cache_free(cache, skb);
+ return NULL;
}
-EXPORT_SYMBOL(napi_alloc_frag);
+EXPORT_SYMBOL(__alloc_skb);
/**
* __netdev_alloc_skb - allocate an skbuff for rx on a specific device
@@ -387,14 +739,17 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
gfp_t gfp_mask)
{
struct page_frag_cache *nc;
- unsigned long flags;
struct sk_buff *skb;
bool pfmemalloc;
void *data;
len += NET_SKB_PAD;
- if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+ /* If requested length is either too small or too big,
+ * we use kmalloc() for skb->head allocation.
+ */
+ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
+ len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
if (!skb)
@@ -402,19 +757,26 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
goto skb_success;
}
- len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- len = SKB_DATA_ALIGN(len);
+ len = SKB_HEAD_ALIGN(len);
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- local_irq_save(flags);
+ if (in_hardirq() || irqs_disabled()) {
+ nc = this_cpu_ptr(&netdev_alloc_cache);
+ data = page_frag_alloc(nc, len, gfp_mask);
+ pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
+ } else {
+ local_bh_disable();
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
- nc = this_cpu_ptr(&netdev_alloc_cache);
- data = page_frag_alloc(nc, len, gfp_mask);
- pfmemalloc = nc->pfmemalloc;
+ nc = this_cpu_ptr(&napi_alloc_cache.page);
+ data = page_frag_alloc(nc, len, gfp_mask);
+ pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
- local_irq_restore(flags);
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+ local_bh_enable();
+ }
if (unlikely(!data))
return NULL;
@@ -425,7 +787,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
return NULL;
}
- /* use OR instead of assignment to avoid clearing of bits in mask */
if (pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
@@ -440,10 +801,9 @@ skb_fail:
EXPORT_SYMBOL(__netdev_alloc_skb);
/**
- * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
+ * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
* @napi: napi instance this buffer was allocated for
* @len: length to allocate
- * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
*
* Allocate a new sk_buff for use in NAPI receive. This buffer will
* attempt to allocate the head from a special reserved region used
@@ -452,41 +812,52 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
*
* %NULL is returned if there is no free memory.
*/
-struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
- gfp_t gfp_mask)
+struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;
+ struct napi_alloc_cache *nc;
struct sk_buff *skb;
+ bool pfmemalloc;
void *data;
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
len += NET_SKB_PAD + NET_IP_ALIGN;
- if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+ /* If requested length is either too small or too big,
+ * we use kmalloc() for skb->head allocation.
+ */
+ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
+ len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
- skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
+ skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
+ NUMA_NO_NODE);
if (!skb)
goto skb_fail;
goto skb_success;
}
- len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- len = SKB_DATA_ALIGN(len);
+ len = SKB_HEAD_ALIGN(len);
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+ nc = this_cpu_ptr(&napi_alloc_cache);
+
data = page_frag_alloc(&nc->page, len, gfp_mask);
+ pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+
if (unlikely(!data))
return NULL;
- skb = __build_skb(data, len);
+ skb = __napi_build_skb(data, len);
if (unlikely(!skb)) {
skb_free_frag(data);
return NULL;
}
- /* use OR instead of assignment to avoid clearing of bits in mask */
- if (nc->page.pfmemalloc)
+ if (pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
@@ -497,23 +868,27 @@ skb_success:
skb_fail:
return skb;
}
-EXPORT_SYMBOL(__napi_alloc_skb);
+EXPORT_SYMBOL(napi_alloc_skb);
-void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
- int size, unsigned int truesize)
+void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
+ int off, int size, unsigned int truesize)
{
- skb_fill_page_desc(skb, i, page, off, size);
+ DEBUG_NET_WARN_ON_ONCE(size > truesize);
+
+ skb_fill_netmem_desc(skb, i, netmem, off, size);
skb->len += size;
skb->data_len += size;
skb->truesize += truesize;
}
-EXPORT_SYMBOL(skb_add_rx_frag);
+EXPORT_SYMBOL(skb_add_rx_frag_netmem);
void skb_coalesce_rx_frag(struct sk_buff *skb, int i, int size,
unsigned int truesize)
{
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ DEBUG_NET_WARN_ON_ONCE(size > truesize);
+
skb_frag_size_add(frag, size);
skb->len += size;
skb->data_len += size;
@@ -540,34 +915,207 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
+int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
+ unsigned int headroom)
+{
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+ u32 size, truesize, len, max_head_size, off;
+ struct sk_buff *skb = *pskb, *nskb;
+ int err, i, head_off;
+ void *data;
+
+ /* XDP does not support fraglist and no linearization is attempted
+ * here, so an skb carrying a frag list is rejected.
+ */
+ if (skb_has_frag_list(skb))
+ return -EOPNOTSUPP;
+
+ max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom);
+ if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE)
+ return -ENOMEM;
+
+ size = min_t(u32, skb->len, max_head_size);
+ truesize = SKB_HEAD_ALIGN(size) + headroom;
+ data = page_pool_dev_alloc_va(pool, &truesize);
+ if (!data)
+ return -ENOMEM;
+
+ nskb = napi_build_skb(data, truesize);
+ if (!nskb) {
+ page_pool_free_va(pool, data, true);
+ return -ENOMEM;
+ }
+
+ skb_reserve(nskb, headroom);
+ skb_copy_header(nskb, skb);
+ skb_mark_for_recycle(nskb);
+
+ err = skb_copy_bits(skb, 0, nskb->data, size);
+ if (err) {
+ consume_skb(nskb);
+ return err;
+ }
+ skb_put(nskb, size);
+
+ head_off = skb_headroom(nskb) - skb_headroom(skb);
+ skb_headers_offset_update(nskb, head_off);
+
+ off = size;
+ len = skb->len - off;
+ for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) {
+ struct page *page;
+ u32 page_off;
+
+ size = min_t(u32, len, PAGE_SIZE);
+ truesize = size;
+
+ page = page_pool_dev_alloc(pool, &page_off, &truesize);
+ if (!page) {
+ consume_skb(nskb);
+ return -ENOMEM;
+ }
+
+ skb_add_rx_frag(nskb, i, page, page_off, size, truesize);
+ err = skb_copy_bits(skb, off, page_address(page) + page_off,
+ size);
+ if (err) {
+ consume_skb(nskb);
+ return err;
+ }
+
+ len -= size;
+ off += size;
+ }
+
+ consume_skb(skb);
+ *pskb = nskb;
+
+ return 0;
+#else
+ return -EOPNOTSUPP;
+#endif
+}
+EXPORT_SYMBOL(skb_pp_cow_data);
+
+int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
+ const struct bpf_prog *prog)
+{
+ if (!prog->aux->xdp_has_frags)
+ return -EINVAL;
+
+ return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM);
+}
+EXPORT_SYMBOL(skb_cow_data_for_xdp);
+
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+bool napi_pp_put_page(netmem_ref netmem)
+{
+ netmem = netmem_compound_head(netmem);
+
+ if (unlikely(!netmem_is_pp(netmem)))
+ return false;
+
+ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);
+
+ return true;
+}
+EXPORT_SYMBOL(napi_pp_put_page);
+#endif
+
+static bool skb_pp_recycle(struct sk_buff *skb, void *data)
+{
+ if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
+ return false;
+ return napi_pp_put_page(page_to_netmem(virt_to_page(data)));
+}
+
+/**
+ * skb_pp_frag_ref() - Increase fragment references of a page pool aware skb
+ * @skb: page pool aware skb
+ *
+ * Increase the fragment reference count (pp_ref_count) of a skb. This is
+ * intended to gain fragment references only for page pool aware skbs,
+ * i.e. when skb->pp_recycle is true, and not for fragments in a
+ * non-pp-recycling skb. It has a fallback to increase references on normal
+ * pages, as page pool aware skbs may also have normal page fragments.
+ */
+static int skb_pp_frag_ref(struct sk_buff *skb)
+{
+ struct skb_shared_info *shinfo;
+ netmem_ref head_netmem;
+ int i;
+
+ if (!skb->pp_recycle)
+ return -EINVAL;
+
+ shinfo = skb_shinfo(skb);
+
+ for (i = 0; i < shinfo->nr_frags; i++) {
+ head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
+ if (likely(netmem_is_pp(head_netmem)))
+ page_pool_ref_netmem(head_netmem);
+ else
+ page_ref_inc(netmem_to_page(head_netmem));
+ }
+ return 0;
+}
+
+static void skb_kfree_head(void *head, unsigned int end_offset)
+{
+ if (end_offset == SKB_SMALL_HEAD_HEADROOM)
+ kmem_cache_free(net_hotdata.skb_small_head_cache, head);
+ else
+ kfree(head);
+}
+
static void skb_free_head(struct sk_buff *skb)
{
unsigned char *head = skb->head;
- if (skb->head_frag)
+ if (skb->head_frag) {
+ if (skb_pp_recycle(skb, head))
+ return;
skb_free_frag(head);
- else
- kfree(head);
+ } else {
+ skb_kfree_head(head, skb_end_offset(skb));
+ }
}
-static void skb_release_data(struct sk_buff *skb)
+static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
int i;
- if (skb->cloned &&
- atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
- &shinfo->dataref))
- return;
+ if (!skb_data_unref(skb, shinfo))
+ goto exit;
+
+ if (skb_zcopy(skb)) {
+ bool skip_unref = shinfo->flags & SKBFL_MANAGED_FRAG_REFS;
+
+ skb_zcopy_clear(skb, true);
+ if (skip_unref)
+ goto free_head;
+ }
for (i = 0; i < shinfo->nr_frags; i++)
- __skb_frag_unref(&shinfo->frags[i]);
+ __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
+free_head:
if (shinfo->frag_list)
- kfree_skb_list(shinfo->frag_list);
+ kfree_skb_list_reason(shinfo->frag_list, reason);
- skb_zcopy_clear(skb, true);
skb_free_head(skb);
+exit:
+ /* When we clone an SKB we copy the recycling bit. The pp_recycle
+ * bit is only set on the head though, so in order to avoid races
+ * while trying to recycle fragments on __skb_frag_unref() we need
+ * to make one SKB responsible for triggering the recycle path.
+ * So disable the recycling bit if an SKB is cloned and we have
+ * additional references to the fragmented part of the SKB.
+ * Eventually the last SKB will have the recycling bit set and its
+ * dataref set to 0, which will trigger the recycling.
+ */
+ skb->pp_recycle = 0;
}
/*
@@ -579,7 +1127,7 @@ static void kfree_skbmem(struct sk_buff *skb)
switch (skb->fclone) {
case SKB_FCLONE_UNAVAILABLE:
- kmem_cache_free(skbuff_head_cache, skb);
+ kmem_cache_free(net_hotdata.skbuff_cache, skb);
return;
case SKB_FCLONE_ORIG:
@@ -600,31 +1148,38 @@ static void kfree_skbmem(struct sk_buff *skb)
if (!refcount_dec_and_test(&fclones->fclone_ref))
return;
fastpath:
- kmem_cache_free(skbuff_fclone_cache, fclones);
+ kmem_cache_free(net_hotdata.skbuff_fclone_cache, fclones);
}
void skb_release_head_state(struct sk_buff *skb)
{
skb_dst_drop(skb);
- secpath_reset(skb);
if (skb->destructor) {
- WARN_ON(in_irq());
- skb->destructor(skb);
- }
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- nf_conntrack_put(skb_nfct(skb));
-#endif
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- nf_bridge_put(skb->nf_bridge);
+ DEBUG_NET_WARN_ON_ONCE(in_hardirq());
+#ifdef CONFIG_INET
+ INDIRECT_CALL_4(skb->destructor,
+ tcp_wfree, __sock_wfree, sock_wfree,
+ xsk_destruct_skb,
+ skb);
+#else
+ INDIRECT_CALL_2(skb->destructor,
+ sock_wfree, xsk_destruct_skb,
+ skb);
+
#endif
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ }
+ nf_reset_ct(skb);
+ skb_ext_reset(skb);
}
/* Free everything but the sk_buff shell. */
-static void skb_release_all(struct sk_buff *skb)
+static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
skb_release_head_state(skb);
if (likely(skb->head))
- skb_release_data(skb);
+ skb_release_data(skb, reason);
}
/**
@@ -638,38 +1193,206 @@ static void skb_release_all(struct sk_buff *skb)
void __kfree_skb(struct sk_buff *skb)
{
- skb_release_all(skb);
+ skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
+static __always_inline
+bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason reason)
+{
+ if (unlikely(!skb_unref(skb)))
+ return false;
+
+ DEBUG_NET_WARN_ON_ONCE(reason == SKB_NOT_DROPPED_YET ||
+ u32_get_bits(reason,
+ SKB_DROP_REASON_SUBSYS_MASK) >=
+ SKB_DROP_REASON_SUBSYS_NUM);
+
+ if (reason == SKB_CONSUMED)
+ trace_consume_skb(skb, __builtin_return_address(0));
+ else
+ trace_kfree_skb(skb, __builtin_return_address(0), reason, sk);
+ return true;
+}
+
/**
- * kfree_skb - free an sk_buff
+ * sk_skb_reason_drop - free an sk_buff with special reason
+ * @sk: the socket to receive @skb, or NULL if not applicable
* @skb: buffer to free
+ * @reason: reason why this skb is dropped
*
- * Drop a reference to the buffer and free it if the usage count has
- * hit zero.
+ * Drop a reference to the buffer and free it if the usage count has hit
+ * zero. Meanwhile, pass the receiving socket and drop reason to
+ * 'kfree_skb' tracepoint.
*/
-void kfree_skb(struct sk_buff *skb)
+void __fix_address
+sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
{
- if (!skb_unref(skb))
+ if (__sk_skb_reason_drop(sk, skb, reason))
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(sk_skb_reason_drop);
+
+#define KFREE_SKB_BULK_SIZE 16
+
+struct skb_free_array {
+ unsigned int skb_count;
+ void *skb_array[KFREE_SKB_BULK_SIZE];
+};
+
+static void kfree_skb_add_bulk(struct sk_buff *skb,
+ struct skb_free_array *sa,
+ enum skb_drop_reason reason)
+{
+ /* if SKB is a clone, don't handle this case */
+ if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) {
+ __kfree_skb(skb);
return;
+ }
- trace_kfree_skb(skb, __builtin_return_address(0));
- __kfree_skb(skb);
+ skb_release_all(skb, reason);
+ sa->skb_array[sa->skb_count++] = skb;
+
+ if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
+ kmem_cache_free_bulk(net_hotdata.skbuff_cache, KFREE_SKB_BULK_SIZE,
+ sa->skb_array);
+ sa->skb_count = 0;
+ }
}
-EXPORT_SYMBOL(kfree_skb);
-void kfree_skb_list(struct sk_buff *segs)
+void __fix_address
+kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
{
+ struct skb_free_array sa;
+
+ sa.skb_count = 0;
+
while (segs) {
struct sk_buff *next = segs->next;
- kfree_skb(segs);
+ if (__sk_skb_reason_drop(NULL, segs, reason)) {
+ skb_poison_list(segs);
+ kfree_skb_add_bulk(segs, &sa, reason);
+ }
+
segs = next;
}
+
+ if (sa.skb_count)
+ kmem_cache_free_bulk(net_hotdata.skbuff_cache, sa.skb_count, sa.skb_array);
+}
+EXPORT_SYMBOL(kfree_skb_list_reason);
+
+/* Dump skb information and contents.
+ *
+ * Must only be called from net_ratelimit()-ed paths.
+ *
+ * Dumps whole packets if full_pkt, only headers otherwise.
+ */
+void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
+{
+ struct skb_shared_info *sh = skb_shinfo(skb);
+ struct net_device *dev = skb->dev;
+ struct sock *sk = skb->sk;
+ struct sk_buff *list_skb;
+ bool has_mac, has_trans;
+ int headroom, tailroom;
+ int i, len, seg_len;
+
+ if (full_pkt)
+ len = skb->len;
+ else
+ len = min_t(int, skb->len, MAX_HEADER + 128);
+
+ headroom = skb_headroom(skb);
+ tailroom = skb_tailroom(skb);
+
+ has_mac = skb_mac_header_was_set(skb);
+ has_trans = skb_transport_header_was_set(skb);
+
+ printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
+ "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n"
+ "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
+ "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
+ "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n"
+ "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n"
+ "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n",
+ level, skb->len, headroom, skb_headlen(skb), tailroom,
+ has_mac ? skb->mac_header : -1,
+ has_mac ? skb_mac_header_len(skb) : -1,
+ skb->mac_len,
+ skb->network_header,
+ has_trans ? skb_network_header_len(skb) : -1,
+ has_trans ? skb->transport_header : -1,
+ sh->tx_flags, sh->nr_frags,
+ sh->gso_size, sh->gso_type, sh->gso_segs,
+ skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed,
+ skb->csum_complete_sw, skb->csum_valid, skb->csum_level,
+ skb->hash, skb->sw_hash, skb->l4_hash,
+ ntohs(skb->protocol), skb->pkt_type, skb->skb_iif,
+ skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all,
+ skb->encapsulation, skb->inner_protocol, skb->inner_mac_header,
+ skb->inner_network_header, skb->inner_transport_header);
+
+ if (dev)
+ printk("%sdev name=%s feat=%pNF\n",
+ level, dev->name, &dev->features);
+ if (sk)
+ printk("%ssk family=%hu type=%u proto=%u\n",
+ level, sk->sk_family, sk->sk_type, sk->sk_protocol);
+
+ if (full_pkt && headroom)
+ print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
+ 16, 1, skb->head, headroom, false);
+
+ seg_len = min_t(int, skb_headlen(skb), len);
+ if (seg_len)
+ print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET,
+ 16, 1, skb->data, seg_len, false);
+ len -= seg_len;
+
+ if (full_pkt && tailroom)
+ print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
+ 16, 1, skb_tail_pointer(skb), tailroom, false);
+
+ for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ if (skb_frag_is_net_iov(frag)) {
+ printk("%sskb frag %d: not readable\n", level, i);
+ len -= skb_frag_size(frag);
+ if (!len)
+ break;
+ continue;
+ }
+
+ skb_frag_foreach_page(frag, skb_frag_off(frag),
+ skb_frag_size(frag), p, p_off, p_len,
+ copied) {
+ seg_len = min_t(int, p_len, len);
+ vaddr = kmap_atomic(p);
+ print_hex_dump(level, "skb frag: ",
+ DUMP_PREFIX_OFFSET,
+ 16, 1, vaddr + p_off, seg_len, false);
+ kunmap_atomic(vaddr);
+ len -= seg_len;
+ if (!len)
+ break;
+ }
+ }
+
+ if (full_pkt && skb_has_frag_list(skb)) {
+ printk("skb fraglist:\n");
+ skb_walk_frags(skb, list_skb)
+ skb_dump(level, list_skb, true);
+ }
}
-EXPORT_SYMBOL(kfree_skb_list);
+EXPORT_SYMBOL(skb_dump);
/**
* skb_tx_error - report an sk_buff xmit error
@@ -680,10 +1403,14 @@ EXPORT_SYMBOL(kfree_skb_list);
*/
void skb_tx_error(struct sk_buff *skb)
{
- skb_zcopy_clear(skb, true);
+ if (skb) {
+ skb_zcopy_downgrade_managed(skb);
+ skb_zcopy_clear(skb, true);
+ }
}
EXPORT_SYMBOL(skb_tx_error);
+#ifdef CONFIG_TRACEPOINTS
/**
* consume_skb - free an skbuff
* @skb: buffer to free
@@ -697,13 +1424,14 @@ void consume_skb(struct sk_buff *skb)
if (!skb_unref(skb))
return;
- trace_consume_skb(skb);
+ trace_consume_skb(skb, __builtin_return_address(0));
__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
+#endif
/**
- * consume_stateless_skb - free an skbuff, assuming it is stateless
+ * __consume_stateless_skb - free an skbuff, assuming it is stateless
* @skb: buffer to free
*
* Alike consume_skb(), but this variant assumes that this is the last
@@ -711,66 +1439,74 @@ EXPORT_SYMBOL(consume_skb);
*/
void __consume_stateless_skb(struct sk_buff *skb)
{
- trace_consume_skb(skb);
- skb_release_data(skb);
+ trace_consume_skb(skb, __builtin_return_address(0));
+ skb_release_data(skb, SKB_CONSUMED);
kfree_skbmem(skb);
}
-void __kfree_skb_flush(void)
+static void napi_skb_cache_put(struct sk_buff *skb)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- /* flush skb_cache if containing objects */
- if (nc->skb_count) {
- kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
- nc->skb_cache);
- nc->skb_count = 0;
- }
-}
-
-static inline void _kfree_skb_defer(struct sk_buff *skb)
-{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
-
- /* drop skb->head and call any destructors for packet */
- skb_release_all(skb);
+ if (!kasan_mempool_poison_object(skb))
+ return;
- /* record skb to CPU local list */
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
nc->skb_cache[nc->skb_count++] = skb;
-#ifdef CONFIG_SLUB
- /* SLUB writes into objects when freeing */
- prefetchw(skb);
-#endif
-
- /* flush skb_cache if it is filled */
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
- kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
- nc->skb_cache);
- nc->skb_count = 0;
+ u32 i, remaining = NAPI_SKB_CACHE_SIZE - NAPI_SKB_CACHE_FREE;
+
+ for (i = remaining; i < NAPI_SKB_CACHE_SIZE; i++)
+ kasan_mempool_unpoison_object(nc->skb_cache[i],
+ skbuff_cache_size);
+
+ kmem_cache_free_bulk(net_hotdata.skbuff_cache,
+ NAPI_SKB_CACHE_FREE,
+ nc->skb_cache + remaining);
+ nc->skb_count = remaining;
}
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
}
-void __kfree_skb_defer(struct sk_buff *skb)
+
+void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
{
- _kfree_skb_defer(skb);
+ skb_release_all(skb, reason);
+ napi_skb_cache_put(skb);
}
-void napi_consume_skb(struct sk_buff *skb, int budget)
+void napi_skb_free_stolen_head(struct sk_buff *skb)
{
- if (unlikely(!skb))
- return;
+ if (unlikely(skb->slow_gro)) {
+ nf_reset_ct(skb);
+ skb_dst_drop(skb);
+ skb_ext_put(skb);
+ skb_orphan(skb);
+ skb->slow_gro = 0;
+ }
+ napi_skb_cache_put(skb);
+}
+void napi_consume_skb(struct sk_buff *skb, int budget)
+{
/* Zero budget indicate non-NAPI context called us, like netpoll */
- if (unlikely(!budget)) {
+ if (unlikely(!budget || !skb)) {
dev_consume_skb_any(skb);
return;
}
+ DEBUG_NET_WARN_ON_ONCE(!in_softirq());
+
+ if (skb->alloc_cpu != smp_processor_id() && !skb_shared(skb)) {
+ skb_release_head_state(skb);
+ return skb_attempt_defer_free(skb);
+ }
+
if (!skb_unref(skb))
return;
/* if reaching here SKB is ready to free */
- trace_consume_skb(skb);
+ trace_consume_skb(skb, __builtin_return_address(0));
/* if SKB is a clone, don't handle this case */
if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
@@ -778,16 +1514,15 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
- _kfree_skb_defer(skb);
+ skb_release_all(skb, SKB_CONSUMED);
+ napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
-/* Make sure a field is enclosed inside headers_start/headers_end section */
+/* Make sure a field is contained by headers group */
#define CHECK_SKB_FIELD(field) \
- BUILD_BUG_ON(offsetof(struct sk_buff, field) < \
- offsetof(struct sk_buff, headers_start)); \
- BUILD_BUG_ON(offsetof(struct sk_buff, field) > \
- offsetof(struct sk_buff, headers_end)); \
+ BUILD_BUG_ON(offsetof(struct sk_buff, field) != \
+ offsetof(struct sk_buff, headers.field)); \
static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
{
@@ -796,19 +1531,15 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->dev = old->dev;
memcpy(new->cb, old->cb, sizeof(old->cb));
skb_dst_copy(new, old);
-#ifdef CONFIG_XFRM
- new->sp = secpath_get(old->sp);
-#endif
+ __skb_ext_copy(new, old);
__nf_copy(new, old, false);
- /* Note : this field could be in headers_start/headers_end section
+ /* Note : this field could be in the headers group.
* It is not yet because we do not want to have a 16 bit hole
*/
new->queue_mapping = old->queue_mapping;
- memcpy(&new->headers_start, &old->headers_start,
- offsetof(struct sk_buff, headers_end) -
- offsetof(struct sk_buff, headers_start));
+ memcpy(&new->headers, &old->headers, sizeof(new->headers));
CHECK_SKB_FIELD(protocol);
CHECK_SKB_FIELD(csum);
CHECK_SKB_FIELD(hash);
@@ -830,6 +1561,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
#ifdef CONFIG_NET_RX_BUSY_POLL
CHECK_SKB_FIELD(napi_id);
#endif
+ CHECK_SKB_FIELD(alloc_cpu);
#ifdef CONFIG_XPS
CHECK_SKB_FIELD(sender_cpu);
#endif
@@ -859,6 +1591,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
n->nohdr = 0;
n->peeked = 0;
C(pfmemalloc);
+ C(pp_recycle);
n->destructor = NULL;
C(tail);
C(end);
@@ -876,6 +1609,31 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
}
/**
+ * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
+ * @first: first sk_buff of the msg
+ */
+struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
+{
+ struct sk_buff *n;
+
+ /* Request a zero-byte skb; the allocation uses GFP_ATOMIC. */
+ n = alloc_skb(0, GFP_ATOMIC);
+ if (!n)
+ return NULL;
+
+ /* len and data_len both take first->len; truesize is taken from
+ * @first as well.
+ */
+ n->len = first->len;
+ n->data_len = first->len;
+ n->truesize = first->truesize;
+
+ /* @first becomes the head of the new skb's frag_list. */
+ skb_shinfo(n)->frag_list = first;
+
+ __copy_skb_header(n, first);
+ /* The wrapper's destructor is explicitly cleared after the header copy. */
+ n->destructor = NULL;
+
+ return n;
+}
+EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
+
+/**
* skb_morph - morph one skb into another
* @dst: the skb to receive the contents
* @src: the skb to supply the contents
@@ -887,30 +1645,33 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
*/
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
- skb_release_all(dst);
+ skb_release_all(dst, SKB_CONSUMED);
return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);
int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
{
- unsigned long max_pg, num_pg, new_pg, old_pg;
+ unsigned long max_pg, num_pg, new_pg, old_pg, rlim;
struct user_struct *user;
if (capable(CAP_IPC_LOCK) || !size)
return 0;
+ rlim = rlimit(RLIMIT_MEMLOCK);
+ if (rlim == RLIM_INFINITY)
+ return 0;
+
num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */
- max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ max_pg = rlim >> PAGE_SHIFT;
user = mmp->user ? : current_user();
+ old_pg = atomic_long_read(&user->locked_vm);
do {
- old_pg = atomic_long_read(&user->locked_vm);
new_pg = old_pg + num_pg;
if (new_pg > max_pg)
return -ENOBUFS;
- } while (atomic_long_cmpxchg(&user->locked_vm, old_pg, new_pg) !=
- old_pg);
+ } while (!atomic_long_try_cmpxchg(&user->locked_vm, &old_pg, new_pg));
if (!mmp->user) {
mmp->user = get_uid(user);
@@ -932,9 +1693,10 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp)
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
-struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
+static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size,
+ bool devmem)
{
- struct ubuf_info *uarg;
+ struct ubuf_info_msgzc *uarg;
struct sk_buff *skb;
WARN_ON_ONCE(!in_task());
@@ -947,35 +1709,40 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
uarg = (void *)skb->cb;
uarg->mmp.user = NULL;
- if (mm_account_pinned_pages(&uarg->mmp, size)) {
+ if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) {
kfree_skb(skb);
return NULL;
}
- uarg->callback = sock_zerocopy_callback;
+ uarg->ubuf.ops = &msg_zerocopy_ubuf_ops;
uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
- refcount_set(&uarg->refcnt, 1);
+ uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN;
+ refcount_set(&uarg->ubuf.refcnt, 1);
sock_hold(sk);
- return uarg;
+ return &uarg->ubuf;
}
-EXPORT_SYMBOL_GPL(sock_zerocopy_alloc);
-static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
+static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
{
return container_of((void *)uarg, struct sk_buff, cb);
}
-struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
- struct ubuf_info *uarg)
+struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
+ struct ubuf_info *uarg, bool devmem)
{
if (uarg) {
+ struct ubuf_info_msgzc *uarg_zc;
const u32 byte_limit = 1 << 19; /* limit to a few TSO */
u32 bytelen, next;
+ /* there might be non MSG_ZEROCOPY users */
+ if (uarg->ops != &msg_zerocopy_ubuf_ops)
+ return NULL;
+
/* realloc only when socket is locked (TCP, UDP cork),
* so uarg->len and sk_zckey access is serialized
*/
@@ -984,8 +1751,9 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
return NULL;
}
- bytelen = uarg->bytelen + size;
- if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) {
+ uarg_zc = uarg_to_msgzc(uarg);
+ bytelen = uarg_zc->bytelen + size;
+ if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) {
/* TCP can create new skb to attach new uarg */
if (sk->sk_type == SOCK_STREAM)
goto new_alloc;
@@ -993,21 +1761,26 @@ struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
}
next = (u32)atomic_read(&sk->sk_zckey);
- if ((u32)(uarg->id + uarg->len) == next) {
- if (mm_account_pinned_pages(&uarg->mmp, size))
+ if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
+ if (likely(!devmem) &&
+ mm_account_pinned_pages(&uarg_zc->mmp, size))
return NULL;
- uarg->len++;
- uarg->bytelen = bytelen;
+ uarg_zc->len++;
+ uarg_zc->bytelen = bytelen;
atomic_set(&sk->sk_zckey, ++next);
- sock_zerocopy_get(uarg);
+
+ /* no extra ref when appending to datagram (MSG_MORE) */
+ if (sk->sk_type == SOCK_STREAM)
+ net_zcopy_get(uarg);
+
return uarg;
}
}
new_alloc:
- return sock_zerocopy_alloc(sk, size);
+ return msg_zerocopy_alloc(sk, size, devmem);
}
-EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);
+EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);
static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
{
@@ -1029,13 +1802,14 @@ static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
return true;
}
-void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
+static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg)
{
struct sk_buff *tail, *skb = skb_from_uarg(uarg);
struct sock_exterr_skb *serr;
struct sock *sk = skb->sk;
struct sk_buff_head *q;
unsigned long flags;
+ bool is_zerocopy;
u32 lo, hi;
u16 len;
@@ -1050,6 +1824,7 @@ void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
len = uarg->len;
lo = uarg->id;
hi = uarg->id + len - 1;
+ is_zerocopy = uarg->zerocopy;
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
@@ -1057,7 +1832,7 @@ void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
serr->ee.ee_data = hi;
serr->ee.ee_info = lo;
- if (!success)
+ if (!is_zerocopy)
serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
q = &sk->sk_error_queue;
@@ -1070,72 +1845,91 @@ void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
}
spin_unlock_irqrestore(&q->lock, flags);
- sk->sk_error_report(sk);
+ sk_error_report(sk);
release:
consume_skb(skb);
sock_put(sk);
}
-EXPORT_SYMBOL_GPL(sock_zerocopy_callback);
-void sock_zerocopy_put(struct ubuf_info *uarg)
+static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg,
+ bool success)
{
- if (uarg && refcount_dec_and_test(&uarg->refcnt)) {
- if (uarg->callback)
- uarg->callback(uarg, uarg->zerocopy);
- else
- consume_skb(skb_from_uarg(uarg));
- }
+ struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);
+
+ uarg_zc->zerocopy = uarg_zc->zerocopy & success;
+
+ if (refcount_dec_and_test(&uarg->refcnt))
+ __msg_zerocopy_callback(uarg_zc);
}
-EXPORT_SYMBOL_GPL(sock_zerocopy_put);
-void sock_zerocopy_put_abort(struct ubuf_info *uarg)
+void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
- if (uarg) {
- struct sock *sk = skb_from_uarg(uarg)->sk;
+ struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk;
- atomic_dec(&sk->sk_zckey);
- uarg->len--;
+ atomic_dec(&sk->sk_zckey);
+ uarg_to_msgzc(uarg)->len--;
- sock_zerocopy_put(uarg);
- }
+ if (have_uref)
+ msg_zerocopy_complete(NULL, uarg, true);
}
-EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
+EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);
-extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
- struct iov_iter *from, size_t length);
+const struct ubuf_info_ops msg_zerocopy_ubuf_ops = {
+ .complete = msg_zerocopy_complete,
+};
+EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
- struct ubuf_info *uarg)
+ struct ubuf_info *uarg,
+ struct net_devmem_dmabuf_binding *binding)
{
- struct ubuf_info *orig_uarg = skb_zcopy(skb);
- struct iov_iter orig_iter = msg->msg_iter;
int err, orig_len = skb->len;
- /* An skb can only point to one uarg. This edge case happens when
- * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
- */
- if (orig_uarg && uarg != orig_uarg)
- return -EEXIST;
+ if (uarg->ops->link_skb) {
+ err = uarg->ops->link_skb(skb, uarg);
+ if (err)
+ return err;
+ } else {
+ struct ubuf_info *orig_uarg = skb_zcopy(skb);
+
+ /* An skb can only point to one uarg. This edge case happens
+ * when TCP appends to an skb, but zerocopy_realloc triggered
+ * a new alloc.
+ */
+ if (orig_uarg && uarg != orig_uarg)
+ return -EEXIST;
+ }
- err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
+ err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len,
+ binding);
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
struct sock *save_sk = skb->sk;
/* Streams do not free skb on error. Reset to prev state. */
- msg->msg_iter = orig_iter;
+ iov_iter_revert(&msg->msg_iter, skb->len - orig_len);
skb->sk = sk;
___pskb_trim(skb, orig_len);
skb->sk = save_sk;
return err;
}
- skb_zcopy_set(skb, uarg);
+ skb_zcopy_set(skb, uarg, NULL);
return skb->len - orig_len;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
+/* Clear SKBFL_MANAGED_FRAG_REFS in @skb's shared info, then call
+ * skb_frag_ref() once for each of its nr_frags frags.
+ */
+void __skb_zcopy_downgrade_managed(struct sk_buff *skb)
+{
+ int i;
+
+ /* The flag is cleared before the per-frag calls below. */
+ skb_shinfo(skb)->flags &= ~SKBFL_MANAGED_FRAG_REFS;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ skb_frag_ref(skb, i);
+}
+EXPORT_SYMBOL_GPL(__skb_zcopy_downgrade_managed);
+
static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
gfp_t gfp_mask)
{
@@ -1151,7 +1945,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
if (skb_copy_ubufs(nskb, GFP_ATOMIC))
return -EIO;
}
- skb_zcopy_set(nskb, skb_uarg(orig));
+ skb_zcopy_set(nskb, skb_uarg(orig), NULL);
}
return 0;
}
@@ -1161,7 +1955,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
* @skb: the skb to modify
* @gfp_mask: allocation priority
*
- * This must be called on SKBTX_DEV_ZEROCOPY skb.
+ * This must be called on skb with SKBFL_ZEROCOPY_ENABLE.
* It will copy all frags into kernel and drop the reference
* to userspace pages.
*
@@ -1175,18 +1969,29 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
{
int num_frags = skb_shinfo(skb)->nr_frags;
struct page *page, *head = NULL;
- int i, new_frags;
+ int i, order, psize, new_frags;
u32 d_off;
if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
return -EINVAL;
+ if (!skb_frags_readable(skb))
+ return -EFAULT;
+
if (!num_frags)
goto release;
- new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ /* We might have to allocate high order pages, so compute what minimum
+ * page order is needed.
+ */
+ order = 0;
+ while ((PAGE_SIZE << order) * MAX_SKB_FRAGS < __skb_pagelen(skb))
+ order++;
+ psize = (PAGE_SIZE << order);
+
+ new_frags = (__skb_pagelen(skb) + psize - 1) >> (PAGE_SHIFT + order);
for (i = 0; i < new_frags; i++) {
- page = alloc_page(gfp_mask);
+ page = alloc_pages(gfp_mask | __GFP_COMP, order);
if (!page) {
while (head) {
struct page *next = (struct page *)page_private(head);
@@ -1207,17 +2012,17 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
struct page *p;
u8 *vaddr;
- skb_frag_foreach_page(f, f->page_offset, skb_frag_size(f),
+ skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
p, p_off, p_len, copied) {
u32 copy, done = 0;
vaddr = kmap_atomic(p);
while (done < p_len) {
- if (d_off == PAGE_SIZE) {
+ if (d_off == psize) {
d_off = 0;
page = (struct page *)page_private(page);
}
- copy = min_t(u32, PAGE_SIZE - d_off, p_len - done);
+ copy = min_t(u32, psize - d_off, p_len - done);
memcpy(page_address(page) + d_off,
vaddr + p_off + done, copy);
done += copy;
@@ -1233,10 +2038,11 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
/* skb frags point to kernel buffers */
for (i = 0; i < new_frags - 1; i++) {
- __skb_fill_page_desc(skb, i, head, 0, PAGE_SIZE);
+ __skb_fill_netmem_desc(skb, i, page_to_netmem(head), 0, psize);
head = (struct page *)page_private(head);
}
- __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
+ __skb_fill_netmem_desc(skb, new_frags - 1, page_to_netmem(head), 0,
+ d_off);
skb_shinfo(skb)->nr_frags = new_frags;
release:
@@ -1273,11 +2079,12 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
refcount_read(&fclones->fclone_ref) == 1) {
n = &fclones->skb2;
refcount_set(&fclones->fclone_ref, 2);
+ n->fclone = SKB_FCLONE_CLONE;
} else {
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+ n = kmem_cache_alloc(net_hotdata.skbuff_cache, gfp_mask);
if (!n)
return NULL;
@@ -1340,11 +2147,20 @@ static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
- int headerlen = skb_headroom(skb);
- unsigned int size = skb_end_offset(skb) + skb->data_len;
- struct sk_buff *n = __alloc_skb(size, gfp_mask,
- skb_alloc_rx_flag(skb), NUMA_NO_NODE);
+ struct sk_buff *n;
+ unsigned int size;
+ int headerlen;
+
+ if (!skb_frags_readable(skb))
+ return NULL;
+ if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
+ return NULL;
+
+ headerlen = skb_headroom(skb);
+ size = skb_end_offset(skb) + skb->data_len;
+ n = __alloc_skb(size, gfp_mask,
+ skb_alloc_rx_flag(skb), NUMA_NO_NODE);
if (!n)
return NULL;
@@ -1439,29 +2255,34 @@ EXPORT_SYMBOL(__pskb_copy_fclone);
*
* All the pointers pointing into skb header may change and must be
* reloaded after call to this function.
+ *
+ * Note: If you skb_push() the start of the buffer after reallocating the
+ * header, call skb_postpush_data_move() first to move the metadata out of
+ * the way before writing to &sk_buff->data.
*/
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
gfp_t gfp_mask)
{
- int i, osize = skb_end_offset(skb);
- int size = osize + nhead + ntail;
+ unsigned int osize = skb_end_offset(skb);
+ unsigned int size = osize + nhead + ntail;
long off;
u8 *data;
+ int i;
BUG_ON(nhead < 0);
BUG_ON(skb_shared(skb));
- size = SKB_DATA_ALIGN(size);
+ skb_zcopy_downgrade_managed(skb);
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
- gfp_mask, NUMA_NO_NODE, NULL);
+
+ data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
goto nodata;
- size = SKB_WITH_OVERHEAD(ksize(data));
+ size = SKB_WITH_OVERHEAD(size);
/* Copy only real data... and, alas, header. This should be
* optimized for the cases when header is void.
@@ -1488,7 +2309,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- skb_release_data(skb);
+ skb_release_data(skb, SKB_CONSUMED);
} else {
skb_free_head(skb);
}
@@ -1497,11 +2318,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
skb->head = data;
skb->head_frag = 0;
skb->data += off;
+
+ skb_set_end_offset(skb, size);
#ifdef NET_SKBUFF_DATA_USES_OFFSET
- skb->end = size;
off = nhead;
-#else
- skb->end = skb->head + size;
#endif
skb->tail += off;
skb_headers_offset_update(skb, nhead);
@@ -1510,8 +2330,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
skb->nohdr = 0;
atomic_set(&skb_shinfo(skb)->dataref, 1);
- skb_metadata_clear(skb);
-
/* It is not generally safe to change skb->truesize.
* For the moment, we really care of rx path, or
* when skb is orphaned (not attached to a socket).
@@ -1522,7 +2340,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
return 0;
nofrags:
- kfree(data);
+ skb_kfree_head(data, size);
nodata:
return -ENOMEM;
}
@@ -1549,6 +2367,104 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
}
EXPORT_SYMBOL(skb_realloc_headroom);
+/* Note: We plan to rework this in linux-6.4 */
+/* Reallocate @skb's head via pskb_expand_head(skb, 0, 0, @pri) while
+ * restoring the truesize and, where possible, the end offset the skb had
+ * before the call.
+ *
+ * Returns the pskb_expand_head() error code on failure, 0 otherwise.
+ */
+int __skb_unclone_keeptruesize(struct sk_buff *skb, gfp_t pri)
+{
+ unsigned int saved_end_offset, saved_truesize;
+ struct skb_shared_info *shinfo;
+ int res;
+
+ /* Snapshot the values to be restored after the reallocation. */
+ saved_end_offset = skb_end_offset(skb);
+ saved_truesize = skb->truesize;
+
+ res = pskb_expand_head(skb, 0, 0, pri);
+ if (res)
+ return res;
+
+ skb->truesize = saved_truesize;
+
+ /* Nothing more to do when the end offset did not move. */
+ if (likely(skb_end_offset(skb) == saved_end_offset))
+ return 0;
+
+ /* We can not change skb->end if the original or new value
+ * is SKB_SMALL_HEAD_HEADROOM, as it might break skb_kfree_head().
+ */
+ if (saved_end_offset == SKB_SMALL_HEAD_HEADROOM ||
+ skb_end_offset(skb) == SKB_SMALL_HEAD_HEADROOM) {
+ /* We think this path should not be taken.
+ * Add a temporary trace to warn us just in case.
+ */
+ pr_err_once("__skb_unclone_keeptruesize() skb_end_offset() %u -> %u\n",
+ saved_end_offset, skb_end_offset(skb));
+ WARN_ON_ONCE(1);
+ /* Still reported as success; the new end offset is kept. */
+ return 0;
+ }
+
+ shinfo = skb_shinfo(skb);
+
+ /* We are about to change back skb->end,
+ * we need to move skb_shinfo() to its new location.
+ * Only the part of the shared info up to the last used frag
+ * (frags[nr_frags]) is moved.
+ */
+ memmove(skb->head + saved_end_offset,
+ shinfo,
+ offsetof(struct skb_shared_info, frags[shinfo->nr_frags]));
+
+ skb_set_end_offset(skb, saved_end_offset);
+
+ return 0;
+}
+
+/**
+ * skb_expand_head - reallocate header of &sk_buff
+ * @skb: buffer to reallocate
+ * @headroom: needed headroom
+ *
+ * Unlike skb_realloc_headroom, this one does not allocate a new skb
+ * if possible; copies skb->sk to new skb as needed
+ * and frees original skb in case of failures.
+ *
+ * It expects increased headroom and generates a warning otherwise.
+ *
+ * Return: @skb unchanged (after a one-time warning) when @headroom does
+ * not exceed the current headroom; otherwise the expanded skb, which may
+ * be a clone that replaced the original; or NULL after the skb has been
+ * freed on failure.
+ */
+
+struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
+{
+ /* delta: headroom still missing; osize: end offset before expansion. */
+ int delta = headroom - skb_headroom(skb);
+ int osize = skb_end_offset(skb);
+ struct sock *sk = skb->sk;
+
+ if (WARN_ONCE(delta <= 0,
+ "%s is expecting an increase in the headroom", __func__))
+ return skb;
+
+ delta = SKB_DATA_ALIGN(delta);
+ /* pskb_expand_head() might crash, if skb is shared. */
+ if (skb_shared(skb) || !is_skb_wmem(skb)) {
+ struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+ if (unlikely(!nskb))
+ goto fail;
+
+ /* The clone takes over: it is given the socket as write owner
+ * (when there is one) and the caller's reference to the
+ * original is released.
+ */
+ if (sk)
+ skb_set_owner_w(nskb, sk);
+ consume_skb(skb);
+ skb = nskb;
+ }
+ if (pskb_expand_head(skb, delta, 0, GFP_ATOMIC))
+ goto fail;
+
+ /* Charge the growth of the end offset to the socket's sk_wmem_alloc
+ * and to the skb's truesize.
+ */
+ if (sk && is_skb_wmem(skb)) {
+ delta = skb_end_offset(skb) - osize;
+ refcount_add(delta, &sk->sk_wmem_alloc);
+ skb->truesize += delta;
+ }
+ return skb;
+
+fail:
+ /* Frees whichever skb is current here (the original or its clone). */
+ kfree_skb(skb);
+ return NULL;
+}
+EXPORT_SYMBOL(skb_expand_head);
+
/**
* skb_copy_expand - copy and expand sk_buff
* @skb: buffer to copy
@@ -1574,12 +2490,20 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
/*
* Allocate the copy buffer
*/
- struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
- gfp_mask, skb_alloc_rx_flag(skb),
- NUMA_NO_NODE);
- int oldheadroom = skb_headroom(skb);
int head_copy_len, head_copy_off;
+ struct sk_buff *n;
+ int oldheadroom;
+ if (!skb_frags_readable(skb))
+ return NULL;
+
+ if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
+ return NULL;
+
+ oldheadroom = skb_headroom(skb);
+ n = __alloc_skb(newheadroom + skb->len + newtailroom,
+ gfp_mask, skb_alloc_rx_flag(skb),
+ NUMA_NO_NODE);
if (!n)
return NULL;
@@ -1736,6 +2660,30 @@ void *skb_pull(struct sk_buff *skb, unsigned int len)
EXPORT_SYMBOL(skb_pull);
/**
+ * skb_pull_data - remove data from the start of a buffer returning its
+ * original position.
+ * @skb: buffer to use
+ * @len: amount of data to remove
+ *
+ * This function removes data from the start of a buffer, returning
+ * the memory to the headroom. A pointer to the original data in the buffer
+ * is returned after checking if there is enough data to pull. Once the
+ * data has been pulled future pushes will overwrite the old data.
+ */
+void *skb_pull_data(struct sk_buff *skb, size_t len)
+{
+ /* Remember where the data started before anything is pulled. */
+ void *data = skb->data;
+
+ /* Refuse when skb->len is smaller than @len; the skb is left untouched. */
+ if (skb->len < len)
+ return NULL;
+
+ skb_pull(skb, len);
+
+ /* Return the pre-pull start of data, not the new skb->data. */
+ return data;
+}
+EXPORT_SYMBOL(skb_pull_data);
+
+/**
* skb_trim - remove end from a buffer
* @skb: buffer to alter
* @len: new length
@@ -1846,8 +2794,15 @@ int pskb_trim_rcsum_slow(struct sk_buff *skb, unsigned int len)
if (skb->ip_summed == CHECKSUM_COMPLETE) {
int delta = skb->len - len;
- skb->csum = csum_sub(skb->csum,
- skb_checksum(skb, len, delta, 0));
+ skb->csum = csum_block_sub(skb->csum,
+ skb_checksum(skb, len, delta, 0),
+ len);
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len;
+ int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
+
+ if (offset + sizeof(__sum16) > hdlen)
+ return -EINVAL;
}
return __pskb_trim(skb, len);
}
@@ -1886,6 +2841,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
*/
int i, k, eat = (skb->tail + delta) - skb->end;
+ if (!skb_frags_readable(skb))
+ return NULL;
+
if (eat > 0 || skb_cloned(skb)) {
if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
GFP_ATOMIC))
@@ -1924,8 +2882,6 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
struct sk_buff *insp = NULL;
do {
- BUG_ON(!list);
-
if (list->len <= eat) {
/* Eaten as whole. */
eat -= list->len;
@@ -1933,6 +2889,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
insp = list;
} else {
/* Eaten partially. */
+ if (skb_is_gso(skb) && !list->head_frag &&
+ skb_headlen(list))
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
if (skb_shared(list)) {
/* Sucks! We need to fork list. :-( */
@@ -1957,7 +2916,7 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
/* Free pulled out fragments. */
while ((list = skb_shinfo(skb)->frag_list) != insp) {
skb_shinfo(skb)->frag_list = list->next;
- kfree_skb(list);
+ consume_skb(list);
}
/* And insert new clone at head. */
if (clone) {
@@ -1977,10 +2936,12 @@ pull_pages:
skb_frag_unref(skb, i);
eat -= size;
} else {
- skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[k];
+
+ *frag = skb_shinfo(skb)->frags[i];
if (eat) {
- skb_shinfo(skb)->frags[k].page_offset += eat;
- skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
+ skb_frag_off_add(frag, eat);
+ skb_frag_size_sub(frag, eat);
if (!i)
goto end;
eat = 0;
@@ -2036,6 +2997,9 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
to += copy;
}
+ if (!skb_frags_readable(skb))
+ goto fault;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
skb_frag_t *f = &skb_shinfo(skb)->frags[i];
@@ -2052,7 +3016,7 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
copy = len;
skb_frag_foreach_page(f,
- f->page_offset + offset - start,
+ skb_frag_off(f) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
memcpy(to + copied, vaddr + p_off, p_len);
@@ -2135,10 +3099,8 @@ static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
/*
* Fill page/offset/length into spd, if it can hold more pages.
*/
-static bool spd_fill_page(struct splice_pipe_desc *spd,
- struct pipe_inode_info *pipe, struct page *page,
- unsigned int *len, unsigned int offset,
- bool linear,
+static bool spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
+ unsigned int *len, unsigned int offset, bool linear,
struct sock *sk)
{
if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
@@ -2166,8 +3128,7 @@ static bool __splice_segment(struct page *page, unsigned int poff,
unsigned int plen, unsigned int *off,
unsigned int *len,
struct splice_pipe_desc *spd, bool linear,
- struct sock *sk,
- struct pipe_inode_info *pipe)
+ struct sock *sk)
{
if (!*len)
return true;
@@ -2186,13 +3147,14 @@ static bool __splice_segment(struct page *page, unsigned int poff,
do {
unsigned int flen = min(*len, plen);
- if (spd_fill_page(spd, pipe, page, &flen, poff,
- linear, sk))
+ if (spd_fill_page(spd, page, &flen, poff, linear, sk))
return true;
poff += flen;
plen -= flen;
*len -= flen;
- } while (*len && plen);
+ if (!*len)
+ return true;
+ } while (plen);
return false;
}
@@ -2205,8 +3167,8 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
unsigned int *offset, unsigned int *len,
struct splice_pipe_desc *spd, struct sock *sk)
{
- int seg;
struct sk_buff *iter;
+ int seg;
/* map the linear part :
* If skb->head_frag is set, this 'linear' part is backed by a
@@ -2218,18 +3180,24 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
skb_headlen(skb),
offset, len, spd,
skb_head_is_locked(skb),
- sk, pipe))
+ sk))
return true;
/*
* then map the fragments
*/
+ if (!skb_frags_readable(skb))
+ return false;
+
for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
+ if (WARN_ON_ONCE(!skb_frag_page(f)))
+ return false;
+
if (__splice_segment(skb_frag_page(f),
- f->page_offset, skb_frag_size(f),
- offset, len, spd, false, sk, pipe))
+ skb_frag_off(f), skb_frag_size(f),
+ offset, len, spd, false, sk))
return true;
}
@@ -2277,10 +3245,34 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
}
EXPORT_SYMBOL_GPL(skb_splice_bits);
-/* Send skb data on a socket. Socket must be locked. */
-int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
- int len)
+/* Send @msg on @sk through the socket's sendmsg_locked op, falling back to
+ * sock_no_sendmsg_locked() when the op is not provided.
+ * Returns -EINVAL when @sk has no associated struct socket, otherwise the
+ * result of the call made.
+ */
+static int sendmsg_locked(struct sock *sk, struct msghdr *msg)
+{
+ struct socket *sock = sk->sk_socket;
+ /* Size handed to the op is whatever is left in the message iterator. */
+ size_t size = msg_data_left(msg);
+
+ if (!sock)
+ return -EINVAL;
+
+ if (!sock->ops->sendmsg_locked)
+ return sock_no_sendmsg_locked(sk, msg, size);
+
+ return sock->ops->sendmsg_locked(sk, msg, size);
+}
+
+/* Send @msg on @sk via sock_sendmsg().
+ * Returns -EINVAL when @sk has no associated struct socket, otherwise the
+ * sock_sendmsg() result.
+ */
+static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (!sock)
+ return -EINVAL;
+ return sock_sendmsg(sock, msg);
+}
+
+typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);
+static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
+ int len, sendmsg_func sendmsg, int flags)
{
+ int more_hint = sk_is_tcp(sk) ? MSG_MORE : 0;
unsigned int orig_len = len;
struct sk_buff *head = skb;
unsigned short fragidx;
@@ -2297,8 +3289,13 @@ do_frag_list:
kv.iov_base = skb->data + offset;
kv.iov_len = slen;
memset(&msg, 0, sizeof(msg));
+ msg.msg_flags = MSG_DONTWAIT | flags;
+ if (slen < len)
+ msg.msg_flags |= more_hint;
- ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen);
+ ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
+ sendmsg_unlocked, sk, &msg);
if (ret <= 0)
goto error;
@@ -2317,21 +3314,33 @@ do_frag_list:
for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx];
- if (offset < frag->size)
+ if (offset < skb_frag_size(frag))
break;
- offset -= frag->size;
+ offset -= skb_frag_size(frag);
}
for (; len && fragidx < skb_shinfo(skb)->nr_frags; fragidx++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[fragidx];
- slen = min_t(size_t, len, frag->size - offset);
+ slen = min_t(size_t, len, skb_frag_size(frag) - offset);
while (slen) {
- ret = kernel_sendpage_locked(sk, frag->page.p,
- frag->page_offset + offset,
- slen, MSG_DONTWAIT);
+ struct bio_vec bvec;
+ struct msghdr msg = {
+ .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT |
+ flags,
+ };
+
+ if (slen < len)
+ msg.msg_flags |= more_hint;
+ bvec_set_page(&bvec, skb_frag_page(frag), slen,
+ skb_frag_off(frag) + offset);
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1,
+ slen);
+
+ ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
+ sendmsg_unlocked, sk, &msg);
if (ret <= 0)
goto error;
@@ -2363,20 +3372,27 @@ out:
error:
return orig_len == len ? ret : orig_len - len;
}
-EXPORT_SYMBOL_GPL(skb_send_sock_locked);
-/* Send skb data on a socket. */
-int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
+/* Send skb data on a socket. Socket must be locked. */
+int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
+ int len)
{
- int ret = 0;
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0);
+}
+EXPORT_SYMBOL_GPL(skb_send_sock_locked);
- lock_sock(sk);
- ret = skb_send_sock_locked(sk, skb, offset, len);
- release_sock(sk);
+int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb,
+ int offset, int len, int flags)
+{
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags);
+}
+EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags);
- return ret;
+/* Send skb data on a socket. Socket must be unlocked. */
+int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
+{
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0);
}
-EXPORT_SYMBOL_GPL(skb_send_sock);
/**
* skb_store_bits - store bits from kernel buffer to skb
@@ -2409,6 +3425,9 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
from += copy;
}
+ if (!skb_frags_readable(skb))
+ goto fault;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
int end;
@@ -2425,7 +3444,7 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
copy = len;
skb_frag_foreach_page(frag,
- frag->page_offset + offset - start,
+ skb_frag_off(frag) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
memcpy(vaddr + p_off, from + copied, p_len);
@@ -2468,8 +3487,7 @@ fault:
EXPORT_SYMBOL(skb_store_bits);
/* Checksum skb data. */
-__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
- __wsum csum, const struct skb_checksum_ops *ops)
+__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
@@ -2480,13 +3498,16 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
if (copy > 0) {
if (copy > len)
copy = len;
- csum = ops->update(skb->data + offset, copy, csum);
+ csum = csum_partial(skb->data + offset, copy, csum);
if ((len -= copy) == 0)
return csum;
offset += copy;
pos = copy;
}
+ if (WARN_ON_ONCE(!skb_frags_readable(skb)))
+ return 0;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
@@ -2504,12 +3525,12 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
copy = len;
skb_frag_foreach_page(frag,
- frag->page_offset + offset - start,
+ skb_frag_off(frag) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
- csum2 = ops->update(vaddr + p_off, p_len, 0);
+ csum2 = csum_partial(vaddr + p_off, p_len, 0);
kunmap_atomic(vaddr);
- csum = ops->combine(csum, csum2, pos, p_len);
+ csum = csum_block_add(csum, csum2, pos);
pos += p_len;
}
@@ -2530,9 +3551,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
__wsum csum2;
if (copy > len)
copy = len;
- csum2 = __skb_checksum(frag_iter, offset - start,
- copy, 0, ops);
- csum = ops->combine(csum, csum2, pos, copy);
+ csum2 = skb_checksum(frag_iter, offset - start, copy,
+ 0);
+ csum = csum_block_add(csum, csum2, pos);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -2544,36 +3565,25 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
return csum;
}
-EXPORT_SYMBOL(__skb_checksum);
-
-__wsum skb_checksum(const struct sk_buff *skb, int offset,
- int len, __wsum csum)
-{
- const struct skb_checksum_ops ops = {
- .update = csum_partial_ext,
- .combine = csum_block_add_ext,
- };
-
- return __skb_checksum(skb, offset, len, csum, &ops);
-}
EXPORT_SYMBOL(skb_checksum);
/* Both of above in one bottle. */
__wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
- u8 *to, int len, __wsum csum)
+ u8 *to, int len)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
struct sk_buff *frag_iter;
int pos = 0;
+ __wsum csum = 0;
/* Copy header. */
if (copy > 0) {
if (copy > len)
copy = len;
csum = csum_partial_copy_nocheck(skb->data + offset, to,
- copy, csum);
+ copy);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -2581,6 +3591,9 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
pos = copy;
}
+ if (!skb_frags_readable(skb))
+ return 0;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
@@ -2598,12 +3611,12 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
copy = len;
skb_frag_foreach_page(frag,
- frag->page_offset + offset - start,
+ skb_frag_off(frag) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
csum2 = csum_partial_copy_nocheck(vaddr + p_off,
to + copied,
- p_len, 0);
+ p_len);
kunmap_atomic(vaddr);
csum = csum_block_add(csum, csum2, pos);
pos += p_len;
@@ -2629,7 +3642,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
copy = len;
csum2 = skb_copy_and_csum_bits(frag_iter,
offset - start,
- to, copy, 0);
+ to, copy);
csum = csum_block_add(csum, csum2, pos);
if ((len -= copy) == 0)
return csum;
@@ -2644,31 +3657,136 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);
-static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
+#ifdef CONFIG_NET_CRC32C
+u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc)
{
- net_warn_ratelimited(
- "%s: attempt to compute crc32c without libcrc32c.ko\n",
- __func__);
- return 0;
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+
+ if (copy > 0) {
+ copy = min(copy, len);
+ crc = crc32c(crc, skb->data + offset, copy);
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+
+ if (WARN_ON_ONCE(!skb_frags_readable(skb)))
+ return 0;
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ copy = end - offset;
+ if (copy > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ copy = min(copy, len);
+ skb_frag_foreach_page(frag,
+ skb_frag_off(frag) + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ crc = crc32c(crc, vaddr + p_off, p_len);
+ kunmap_atomic(vaddr);
+ }
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ copy = end - offset;
+ if (copy > 0) {
+ copy = min(copy, len);
+ crc = skb_crc32c(frag_iter, offset - start, copy, crc);
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+ start = end;
+ }
+ BUG_ON(len);
+
+ return crc;
}
+EXPORT_SYMBOL(skb_crc32c);
+#endif /* CONFIG_NET_CRC32C */
-static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
- int offset, int len)
+__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
- net_warn_ratelimited(
- "%s: attempt to compute crc32c without libcrc32c.ko\n",
- __func__);
- return 0;
+ __sum16 sum;
+
+ sum = csum_fold(skb_checksum(skb, 0, len, skb->csum));
+ /* See comments in __skb_checksum_complete(). */
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev, skb);
+ }
+ if (!skb_shared(skb))
+ skb->csum_valid = !sum;
+ return sum;
}
+EXPORT_SYMBOL(__skb_checksum_complete_head);
-static const struct skb_checksum_ops default_crc32c_ops = {
- .update = warn_crc32c_csum_update,
- .combine = warn_crc32c_csum_combine,
-};
+/* This function assumes skb->csum already holds pseudo header's checksum,
+ * which has been changed from the hardware checksum, for example, by
+ * __skb_checksum_validate_complete(). And, the original skb->csum must
+ * have been validated unsuccessfully for CHECKSUM_COMPLETE case.
+ *
+ * It returns non-zero if the recomputed checksum is still invalid, otherwise
+ * zero. The new checksum is stored back into skb->csum unless the skb is
+ * shared.
+ */
+__sum16 __skb_checksum_complete(struct sk_buff *skb)
+{
+ __wsum csum;
+ __sum16 sum;
-const struct skb_checksum_ops *crc32c_csum_stub __read_mostly =
- &default_crc32c_ops;
-EXPORT_SYMBOL(crc32c_csum_stub);
+ csum = skb_checksum(skb, 0, skb->len, 0);
+
+ sum = csum_fold(csum_add(skb->csum, csum));
+ /* This check is inverted, because we already knew the hardware
+ * checksum is invalid before calling this function. So, if the
+ * re-computed checksum is valid instead, then we have a mismatch
+ * between the original skb->csum and skb_checksum(). This means either
+ * the original hardware checksum is incorrect or we screw up skb->csum
+ * when moving skb->data around.
+ */
+ if (likely(!sum)) {
+ if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
+ !skb->csum_complete_sw)
+ netdev_rx_csum_fault(skb->dev, skb);
+ }
+
+ if (!skb_shared(skb)) {
+ /* Save full packet checksum */
+ skb->csum = csum;
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ skb->csum_complete_sw = 1;
+ skb->csum_valid = !sum;
+ }
+
+ return sum;
+}
+EXPORT_SYMBOL(__skb_checksum_complete);
/**
* skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
@@ -2684,8 +3802,11 @@ skb_zerocopy_headlen(const struct sk_buff *from)
if (!from->head_frag ||
skb_headlen(from) < L1_CACHE_BYTES ||
- skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
+ skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) {
hlen = skb_headlen(from);
+ if (!hlen)
+ hlen = from->len;
+ }
if (skb_has_frag_list(from))
hlen = from->len;
@@ -2737,16 +3858,15 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
if (plen) {
page = virt_to_head_page(from->head);
offset = from->data - (unsigned char *)page_address(page);
- __skb_fill_page_desc(to, 0, page, offset, plen);
+ __skb_fill_netmem_desc(to, 0, page_to_netmem(page),
+ offset, plen);
get_page(page);
j = 1;
len -= plen;
}
}
- to->truesize += len + plen;
- to->len += len + plen;
- to->data_len += len + plen;
+ skb_len_add(to, len + plen);
if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) {
skb_tx_error(from);
@@ -2755,11 +3875,15 @@ skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
skb_zerocopy_clone(to, from, GFP_ATOMIC);
for (i = 0; i < skb_shinfo(from)->nr_frags; i++) {
+ int size;
+
if (!len)
break;
skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i];
- skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len);
- len -= skb_shinfo(to)->frags[j].size;
+ size = min_t(int, skb_frag_size(&skb_shinfo(to)->frags[j]),
+ len);
+ skb_frag_size_set(&skb_shinfo(to)->frags[j], size);
+ len -= size;
skb_frag_ref(to, j);
j++;
}
@@ -2786,7 +3910,7 @@ void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to)
csum = 0;
if (csstart != skb->len)
csum = skb_copy_and_csum_bits(skb, csstart, to + csstart,
- skb->len - csstart, 0);
+ skb->len - csstart);
if (skb->ip_summed == CHECKSUM_PARTIAL) {
long csstuff = csstart + skb->csum_offset;
@@ -2838,20 +3962,32 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
EXPORT_SYMBOL(skb_dequeue_tail);
/**
- * skb_queue_purge - empty a list
+ * skb_queue_purge_reason - empty a list
* @list: list to empty
+ * @reason: drop reason
*
* Delete all buffers on an &sk_buff list. Each buffer is removed from
* the list and one reference dropped. This function takes the list
* lock and is atomic with respect to other list locking functions.
*/
-void skb_queue_purge(struct sk_buff_head *list)
+void skb_queue_purge_reason(struct sk_buff_head *list,
+ enum skb_drop_reason reason)
{
- struct sk_buff *skb;
- while ((skb = skb_dequeue(list)) != NULL)
- kfree_skb(skb);
+ struct sk_buff_head tmp;
+ unsigned long flags;
+
+ if (skb_queue_empty_lockless(list))
+ return;
+
+ __skb_queue_head_init(&tmp);
+
+ spin_lock_irqsave(&list->lock, flags);
+ skb_queue_splice_init(list, &tmp);
+ spin_unlock_irqrestore(&list->lock, flags);
+
+ __skb_queue_purge_reason(&tmp, reason);
}
-EXPORT_SYMBOL(skb_queue_purge);
+EXPORT_SYMBOL(skb_queue_purge_reason);
/**
* skb_rbtree_purge - empty a skb rbtree
@@ -2879,6 +4015,27 @@ unsigned int skb_rbtree_purge(struct rb_root *root)
return sum;
}
+void skb_errqueue_purge(struct sk_buff_head *list)
+{
+ struct sk_buff *skb, *next;
+ struct sk_buff_head kill;
+ unsigned long flags;
+
+ __skb_queue_head_init(&kill);
+
+ spin_lock_irqsave(&list->lock, flags);
+ skb_queue_walk_safe(list, skb, next) {
+ if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY ||
+ SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING)
+ continue;
+ __skb_unlink(skb, list);
+ __skb_queue_tail(&kill, skb);
+ }
+ spin_unlock_irqrestore(&list->lock, flags);
+ __skb_queue_purge(&kill);
+}
+EXPORT_SYMBOL(skb_errqueue_purge);
+
/**
* skb_queue_head - queue a buffer at the list head
* @list: list to use
@@ -2961,28 +4118,6 @@ void skb_append(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head
}
EXPORT_SYMBOL(skb_append);
-/**
- * skb_insert - insert a buffer
- * @old: buffer to insert before
- * @newsk: buffer to insert
- * @list: list to use
- *
- * Place a packet before a given packet in a list. The list locks are
- * taken and this function is atomic with respect to other list locked
- * calls.
- *
- * A buffer cannot be placed on two lists at the same time.
- */
-void skb_insert(struct sk_buff *old, struct sk_buff *newsk, struct sk_buff_head *list)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&list->lock, flags);
- __skb_insert(newsk, old->prev, old, list);
- spin_unlock_irqrestore(&list->lock, flags);
-}
-EXPORT_SYMBOL(skb_insert);
-
static inline void skb_split_inside_header(struct sk_buff *skb,
struct sk_buff* skb1,
const u32 len, const int pos)
@@ -2996,6 +4131,7 @@ static inline void skb_split_inside_header(struct sk_buff *skb,
skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
+ skb1->unreadable = skb->unreadable;
skb_shinfo(skb)->nr_frags = 0;
skb1->data_len = skb->data_len;
skb1->len += skb1->data_len;
@@ -3032,7 +4168,7 @@ static inline void skb_split_no_header(struct sk_buff *skb,
* 2. Split is accurately. We make this.
*/
skb_frag_ref(skb, i);
- skb_shinfo(skb1)->frags[0].page_offset += len - pos;
+ skb_frag_off_add(&skb_shinfo(skb1)->frags[0], len - pos);
skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
skb_shinfo(skb)->nr_frags++;
@@ -3043,6 +4179,8 @@ static inline void skb_split_no_header(struct sk_buff *skb,
pos += size;
}
skb_shinfo(skb1)->nr_frags = k;
+
+ skb1->unreadable = skb->unreadable;
}
/**
@@ -3054,9 +4192,11 @@ static inline void skb_split_no_header(struct sk_buff *skb,
void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
{
int pos = skb_headlen(skb);
+ const int zc_flags = SKBFL_SHARED_FRAG | SKBFL_PURE_ZEROCOPY;
- skb_shinfo(skb1)->tx_flags |= skb_shinfo(skb)->tx_flags &
- SKBTX_SHARED_FRAG;
+ skb_zcopy_downgrade_managed(skb);
+
+ skb_shinfo(skb1)->flags |= skb_shinfo(skb)->flags & zc_flags;
skb_zerocopy_clone(skb1, skb, 0);
if (len < pos) /* Split line is inside header. */
skb_split_inside_header(skb, skb1, len, pos);
@@ -3071,7 +4211,7 @@ EXPORT_SYMBOL(skb_split);
*/
static int skb_prepare_for_shift(struct sk_buff *skb)
{
- return skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+ return skb_unclone_keeptruesize(skb, GFP_ATOMIC);
}
/**
@@ -3095,7 +4235,7 @@ static int skb_prepare_for_shift(struct sk_buff *skb)
int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
{
int from, to, merge, todo;
- struct skb_frag_struct *fragfrom, *fragto;
+ skb_frag_t *fragfrom, *fragto;
BUG_ON(shiftlen > skb->len);
@@ -3104,6 +4244,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
if (skb_zcopy(tgt) || skb_zcopy(skb))
return 0;
+ DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle);
+ DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb));
+
todo = shiftlen;
from = 0;
to = skb_shinfo(tgt)->nr_frags;
@@ -3112,9 +4255,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
/* Actual merge is delayed until the point when we know we can
* commit all, so that we don't have to undo partial changes
*/
- if (!to ||
- !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
- fragfrom->page_offset)) {
+ if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
+ skb_frag_off(fragfrom))) {
merge = -1;
} else {
merge = to - 1;
@@ -3131,7 +4273,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
skb_frag_size_add(fragto, shiftlen);
skb_frag_size_sub(fragfrom, shiftlen);
- fragfrom->page_offset += shiftlen;
+ skb_frag_off_add(fragfrom, shiftlen);
goto onlymerged;
}
@@ -3162,11 +4304,11 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
} else {
__skb_frag_ref(fragfrom);
- fragto->page = fragfrom->page;
- fragto->page_offset = fragfrom->page_offset;
+ skb_frag_page_copy(fragto, fragfrom);
+ skb_frag_off_copy(fragto, fragfrom);
skb_frag_size_set(fragto, todo);
- fragfrom->page_offset += todo;
+ skb_frag_off_add(fragfrom, todo);
skb_frag_size_sub(fragfrom, todo);
todo = 0;
@@ -3183,7 +4325,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
fragto = &skb_shinfo(tgt)->frags[merge];
skb_frag_size_add(fragto, skb_frag_size(fragfrom));
- __skb_frag_unref(fragfrom);
+ __skb_frag_unref(fragfrom, skb->pp_recycle);
}
/* Reposition in the original skb */
@@ -3201,13 +4343,8 @@ onlymerged:
tgt->ip_summed = CHECKSUM_PARTIAL;
skb->ip_summed = CHECKSUM_PARTIAL;
- /* Yak, is it really working this way? Some helper please? */
- skb->len -= shiftlen;
- skb->data_len -= shiftlen;
- skb->truesize -= shiftlen;
- tgt->len += shiftlen;
- tgt->data_len += shiftlen;
- tgt->truesize += shiftlen;
+ skb_len_add(skb, -shiftlen);
+ skb_len_add(tgt, shiftlen);
return shiftlen;
}
@@ -3230,6 +4367,7 @@ void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from,
st->root_skb = st->cur_skb = skb;
st->frag_idx = st->stepped_offset = 0;
st->frag_data = NULL;
+ st->frag_off = 0;
}
EXPORT_SYMBOL(skb_prepare_seq_read);
@@ -3280,18 +4418,34 @@ next_skb:
return block_limit - abs_offset;
}
+ if (!skb_frags_readable(st->cur_skb))
+ return 0;
+
if (st->frag_idx == 0 && !st->frag_data)
st->stepped_offset += skb_headlen(st->cur_skb);
while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
+ unsigned int pg_idx, pg_off, pg_sz;
+
frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
- block_limit = skb_frag_size(frag) + st->stepped_offset;
+ pg_idx = 0;
+ pg_off = skb_frag_off(frag);
+ pg_sz = skb_frag_size(frag);
+
+ if (skb_frag_must_loop(skb_frag_page(frag))) {
+ pg_idx = (pg_off + st->frag_off) >> PAGE_SHIFT;
+ pg_off = offset_in_page(pg_off + st->frag_off);
+ pg_sz = min_t(unsigned int, pg_sz - st->frag_off,
+ PAGE_SIZE - pg_off);
+ }
+
+ block_limit = pg_sz + st->stepped_offset;
if (abs_offset < block_limit) {
if (!st->frag_data)
- st->frag_data = kmap_atomic(skb_frag_page(frag));
+ st->frag_data = kmap_atomic(skb_frag_page(frag) + pg_idx);
- *data = (u8 *) st->frag_data + frag->page_offset +
+ *data = (u8 *)st->frag_data + pg_off +
(abs_offset - st->stepped_offset);
return block_limit - abs_offset;
@@ -3302,8 +4456,12 @@ next_skb:
st->frag_data = NULL;
}
- st->frag_idx++;
- st->stepped_offset += skb_frag_size(frag);
+ st->stepped_offset += pg_sz;
+ st->frag_off += pg_sz;
+ if (st->frag_off == skb_frag_size(frag)) {
+ st->frag_off = 0;
+ st->frag_idx++;
+ }
}
if (st->frag_data) {
@@ -3339,6 +4497,41 @@ void skb_abort_seq_read(struct skb_seq_state *st)
}
EXPORT_SYMBOL(skb_abort_seq_read);
+/**
+ * skb_copy_seq_read() - copy from a skb_seq_state to a buffer
+ * @st: source skb_seq_state
+ * @offset: offset in source
+ * @to: destination buffer
+ * @len: number of bytes to copy
+ *
+ * Copy @len bytes from @offset bytes into the source @st to the destination
+ * buffer @to. `offset` should increase (or be unchanged) with each subsequent
+ * call to this function. If offset needs to decrease from the previous use `st`
+ * should be reset first.
+ *
+ * Return: 0 on success or -EINVAL if the copy ended early
+ */
+int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len)
+{
+ const u8 *data;
+ u32 sqlen;
+
+ for (;;) {
+ sqlen = skb_seq_read(offset, &data, st);
+ if (sqlen == 0)
+ return -EINVAL;
+ if (sqlen >= len) {
+ memcpy(to, data, len);
+ return 0;
+ }
+ memcpy(to, data, sqlen);
+ to += sqlen;
+ offset += sqlen;
+ len -= sqlen;
+ }
+}
+EXPORT_SYMBOL(skb_copy_seq_read);
+
#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
@@ -3368,87 +4561,33 @@ static void skb_ts_finish(struct ts_config *conf, struct ts_state *state)
unsigned int skb_find_text(struct sk_buff *skb, unsigned int from,
unsigned int to, struct ts_config *config)
{
+ unsigned int patlen = config->ops->get_pattern_len(config);
struct ts_state state;
unsigned int ret;
+ BUILD_BUG_ON(sizeof(struct skb_seq_state) > sizeof(state.cb));
+
config->get_next_block = skb_ts_get_next_block;
config->finish = skb_ts_finish;
skb_prepare_seq_read(skb, from, to, TS_SKB_CB(&state));
ret = textsearch_find(config, &state);
- return (ret <= to - from ? ret : UINT_MAX);
+ return (ret + patlen <= to - from ? ret : UINT_MAX);
}
EXPORT_SYMBOL(skb_find_text);
-/**
- * skb_append_datato_frags - append the user data to a skb
- * @sk: sock structure
- * @skb: skb structure to be appended with user data.
- * @getfrag: call back function to be used for getting the user data
- * @from: pointer to user message iov
- * @length: length of the iov message
- *
- * Description: This procedure append the user data in the fragment part
- * of the skb if any page alloc fails user this procedure returns -ENOMEM
- */
-int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
- int (*getfrag)(void *from, char *to, int offset,
- int len, int odd, struct sk_buff *skb),
- void *from, int length)
-{
- int frg_cnt = skb_shinfo(skb)->nr_frags;
- int copy;
- int offset = 0;
- int ret;
- struct page_frag *pfrag = &current->task_frag;
-
- do {
- /* Return error if we don't have space for new frag */
- if (frg_cnt >= MAX_SKB_FRAGS)
- return -EMSGSIZE;
-
- if (!sk_page_frag_refill(sk, pfrag))
- return -ENOMEM;
-
- /* copy the user data to page */
- copy = min_t(int, length, pfrag->size - pfrag->offset);
-
- ret = getfrag(from, page_address(pfrag->page) + pfrag->offset,
- offset, copy, 0, skb);
- if (ret < 0)
- return -EFAULT;
-
- /* copy was successful so update the size parameters */
- skb_fill_page_desc(skb, frg_cnt, pfrag->page, pfrag->offset,
- copy);
- frg_cnt++;
- pfrag->offset += copy;
- get_page(pfrag->page);
-
- skb->truesize += copy;
- refcount_add(copy, &sk->sk_wmem_alloc);
- skb->len += copy;
- skb->data_len += copy;
- offset += copy;
- length -= copy;
-
- } while (length > 0);
-
- return 0;
-}
-EXPORT_SYMBOL(skb_append_datato_frags);
-
int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
- int offset, size_t size)
+ int offset, size_t size, size_t max_frags)
{
int i = skb_shinfo(skb)->nr_frags;
if (skb_can_coalesce(skb, i, page, offset)) {
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
- } else if (i < MAX_SKB_FRAGS) {
+ } else if (i < max_frags) {
+ skb_zcopy_downgrade_managed(skb);
get_page(page);
- skb_fill_page_desc(skb, i, page, offset, size);
+ skb_fill_page_desc_noacc(skb, i, page, offset, size);
} else {
return -EMSGSIZE;
}
@@ -3485,13 +4624,104 @@ static inline skb_frag_t skb_head_frag_to_page_desc(struct sk_buff *frag_skb)
struct page *page;
page = virt_to_head_page(frag_skb->head);
- head_frag.page.p = page;
- head_frag.page_offset = frag_skb->data -
- (unsigned char *)page_address(page);
- head_frag.size = skb_headlen(frag_skb);
+ skb_frag_fill_page_desc(&head_frag, page, frag_skb->data -
+ (unsigned char *)page_address(page),
+ skb_headlen(frag_skb));
return head_frag;
}
+struct sk_buff *skb_segment_list(struct sk_buff *skb,
+ netdev_features_t features,
+ unsigned int offset)
+{
+ struct sk_buff *list_skb = skb_shinfo(skb)->frag_list;
+ unsigned int tnl_hlen = skb_tnl_header_len(skb);
+ unsigned int delta_truesize = 0;
+ unsigned int delta_len = 0;
+ struct sk_buff *tail = NULL;
+ struct sk_buff *nskb, *tmp;
+ int len_diff, err;
+
+ skb_push(skb, -skb_network_offset(skb) + offset);
+
+ /* Ensure the head is writeable before touching the shared info */
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (err)
+ goto err_linearize;
+
+ skb_shinfo(skb)->frag_list = NULL;
+
+ while (list_skb) {
+ nskb = list_skb;
+ list_skb = list_skb->next;
+
+ err = 0;
+ delta_truesize += nskb->truesize;
+ if (skb_shared(nskb)) {
+ tmp = skb_clone(nskb, GFP_ATOMIC);
+ if (tmp) {
+ consume_skb(nskb);
+ nskb = tmp;
+ err = skb_unclone(nskb, GFP_ATOMIC);
+ } else {
+ err = -ENOMEM;
+ }
+ }
+
+ if (!tail)
+ skb->next = nskb;
+ else
+ tail->next = nskb;
+
+ if (unlikely(err)) {
+ nskb->next = list_skb;
+ goto err_linearize;
+ }
+
+ tail = nskb;
+
+ delta_len += nskb->len;
+
+ skb_push(nskb, -skb_network_offset(nskb) + offset);
+
+ skb_release_head_state(nskb);
+ len_diff = skb_network_header_len(nskb) - skb_network_header_len(skb);
+ __copy_skb_header(nskb, skb);
+
+ skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb));
+ nskb->transport_header += len_diff;
+ skb_copy_from_linear_data_offset(skb, -tnl_hlen,
+ nskb->data - tnl_hlen,
+ offset + tnl_hlen);
+
+ if (skb_needs_linearize(nskb, features) &&
+ __skb_linearize(nskb))
+ goto err_linearize;
+ }
+
+ skb->truesize = skb->truesize - delta_truesize;
+ skb->data_len = skb->data_len - delta_len;
+ skb->len = skb->len - delta_len;
+
+ skb_gso_reset(skb);
+
+ skb->prev = tail;
+
+ if (skb_needs_linearize(skb, features) &&
+ __skb_linearize(skb))
+ goto err_linearize;
+
+ skb_get(skb);
+
+ return skb;
+
+err_linearize:
+ kfree_skb_list(skb->next);
+ skb->next = NULL;
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL_GPL(skb_segment_list);
+
/**
* skb_segment - Perform protocol segmentation on skb.
* @head_skb: buffer to segment
@@ -3507,25 +4737,44 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
struct sk_buff *segs = NULL;
struct sk_buff *tail = NULL;
struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
- skb_frag_t *frag = skb_shinfo(head_skb)->frags;
unsigned int mss = skb_shinfo(head_skb)->gso_size;
unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
- struct sk_buff *frag_skb = head_skb;
unsigned int offset = doffset;
unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
unsigned int partial_segs = 0;
unsigned int headroom;
unsigned int len = head_skb->len;
+ struct sk_buff *frag_skb;
+ skb_frag_t *frag;
__be16 proto;
bool csum, sg;
- int nfrags = skb_shinfo(head_skb)->nr_frags;
int err = -ENOMEM;
int i = 0;
- int pos;
- int dummy;
+ int nfrags, pos;
+
+ if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) &&
+ mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) {
+ struct sk_buff *check_skb;
+
+ for (check_skb = list_skb; check_skb; check_skb = check_skb->next) {
+ if (skb_headlen(check_skb) && !check_skb->head_frag) {
+ /* gso_size is untrusted, and we have a frag_list with
+ * a linear non head_frag item.
+ *
+ * If head_skb's headlen does not fit requested gso_size,
+ * it means that the frag_list members do NOT terminate
+ * on exact gso_size boundaries. Hence we cannot perform
+ * skb_frag_t page sharing. Therefore we must fallback to
+ * copying the frag_list skbs; we do so by disabling SG.
+ */
+ features &= ~NETIF_F_SG;
+ break;
+ }
+ }
+ }
__skb_push(head_skb, doffset);
- proto = skb_network_protocol(head_skb, &dummy);
+ proto = skb_network_protocol(head_skb, NULL);
if (unlikely(!proto))
return ERR_PTR(-EINVAL);
@@ -3566,8 +4815,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
/* GSO partial only requires that we trim off any excess that
* doesn't fit into an MSS sized block, so take care of that
* now.
+ * Cap len to not accidentally hit GSO_BY_FRAGS.
*/
- partial_segs = len / mss;
+ partial_segs = min(len, GSO_BY_FRAGS - 1) / mss;
if (partial_segs > 1)
mss *= partial_segs;
else
@@ -3578,6 +4828,13 @@ normal:
headroom = skb_headroom(head_skb);
pos = skb_headlen(head_skb);
+ if (skb_orphan_frags(head_skb, GFP_ATOMIC))
+ return ERR_PTR(-ENOMEM);
+
+ nfrags = skb_shinfo(head_skb)->nr_frags;
+ frag = skb_shinfo(head_skb)->frags;
+ frag_skb = head_skb;
+
do {
struct sk_buff *nskb;
skb_frag_t *nskb_frag;
@@ -3593,15 +4850,15 @@ normal:
}
hsize = skb_headlen(head_skb) - offset;
- if (hsize < 0)
- hsize = 0;
- if (hsize > len || !sg)
- hsize = len;
- if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
+ if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) &&
(skb_headlen(list_skb) == len || sg)) {
BUG_ON(skb_headlen(list_skb) > len);
+ nskb = skb_clone(list_skb, GFP_ATOMIC);
+ if (unlikely(!nskb))
+ goto err;
+
i = 0;
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
@@ -3620,12 +4877,8 @@ normal:
frag++;
}
- nskb = skb_clone(list_skb, GFP_ATOMIC);
list_skb = list_skb->next;
- if (unlikely(!nskb))
- goto err;
-
if (unlikely(pskb_trim(nskb, len))) {
kfree_skb(nskb);
goto err;
@@ -3641,6 +4894,11 @@ normal:
skb_release_head_state(nskb);
__skb_push(nskb, doffset);
} else {
+ if (hsize < 0)
+ hsize = 0;
+ if (hsize > len || !sg)
+ hsize = len;
+
nskb = __alloc_skb(hsize + doffset + headroom,
GFP_ATOMIC, skb_alloc_rx_flag(head_skb),
NUMA_NO_NODE);
@@ -3671,14 +4929,20 @@ normal:
goto perform_csum_check;
if (!sg) {
- if (!nskb->remcsum_offload)
- nskb->ip_summed = CHECKSUM_NONE;
- SKB_GSO_CB(nskb)->csum =
- skb_copy_and_csum_bits(head_skb, offset,
- skb_put(nskb, len),
- len, 0);
- SKB_GSO_CB(nskb)->csum_start =
- skb_headroom(nskb) + doffset;
+ if (!csum) {
+ if (!nskb->remcsum_offload)
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum =
+ skb_copy_and_csum_bits(head_skb, offset,
+ skb_put(nskb,
+ len),
+ len);
+ SKB_GSO_CB(nskb)->csum_start =
+ skb_headroom(nskb) + doffset;
+ } else {
+ if (skb_copy_bits(head_skb, offset, skb_put(nskb, len), len))
+ goto err;
+ }
continue;
}
@@ -3687,15 +4951,19 @@ normal:
skb_copy_from_linear_data_offset(head_skb, offset,
skb_put(nskb, hsize), hsize);
- skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
- SKBTX_SHARED_FRAG;
+ skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags &
+ SKBFL_SHARED_FRAG;
- if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
- skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
+ if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
goto err;
while (pos < offset + len) {
if (i >= nfrags) {
+ if (skb_orphan_frags(list_skb, GFP_ATOMIC) ||
+ skb_zerocopy_clone(nskb, list_skb,
+ GFP_ATOMIC))
+ goto err;
+
i = 0;
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
@@ -3709,10 +4977,6 @@ normal:
i--;
frag--;
}
- if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
- skb_zerocopy_clone(nskb, frag_skb,
- GFP_ATOMIC))
- goto err;
list_skb = list_skb->next;
}
@@ -3731,7 +4995,7 @@ normal:
size = skb_frag_size(nskb_frag);
if (pos < offset) {
- nskb_frag->page_offset += offset - pos;
+ skb_frag_off_add(nskb_frag, offset - pos);
skb_frag_size_sub(nskb_frag, offset - pos);
}
@@ -3818,132 +5082,98 @@ err:
}
EXPORT_SYMBOL_GPL(skb_segment);
-int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
-{
- struct skb_shared_info *pinfo, *skbinfo = skb_shinfo(skb);
- unsigned int offset = skb_gro_offset(skb);
- unsigned int headlen = skb_headlen(skb);
- unsigned int len = skb_gro_len(skb);
- unsigned int delta_truesize;
- struct sk_buff *lp;
-
- if (unlikely(p->len + len >= 65536))
- return -E2BIG;
-
- lp = NAPI_GRO_CB(p)->last;
- pinfo = skb_shinfo(lp);
-
- if (headlen <= offset) {
- skb_frag_t *frag;
- skb_frag_t *frag2;
- int i = skbinfo->nr_frags;
- int nr_frags = pinfo->nr_frags + i;
+#ifdef CONFIG_SKB_EXTENSIONS
+#define SKB_EXT_ALIGN_VALUE 8
+#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)
- if (nr_frags > MAX_SKB_FRAGS)
- goto merge;
-
- offset -= headlen;
- pinfo->nr_frags = nr_frags;
- skbinfo->nr_frags = 0;
-
- frag = pinfo->frags + nr_frags;
- frag2 = skbinfo->frags + i;
- do {
- *--frag = *--frag2;
- } while (--i);
-
- frag->page_offset += offset;
- skb_frag_size_sub(frag, offset);
-
- /* all fragments truesize : remove (head size + sk_buff) */
- delta_truesize = skb->truesize -
- SKB_TRUESIZE(skb_end_offset(skb));
-
- skb->truesize -= skb->data_len;
- skb->len -= skb->data_len;
- skb->data_len = 0;
-
- NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE;
- goto done;
- } else if (skb->head_frag) {
- int nr_frags = pinfo->nr_frags;
- skb_frag_t *frag = pinfo->frags + nr_frags;
- struct page *page = virt_to_head_page(skb->head);
- unsigned int first_size = headlen - offset;
- unsigned int first_offset;
-
- if (nr_frags + 1 + skbinfo->nr_frags > MAX_SKB_FRAGS)
- goto merge;
-
- first_offset = skb->data -
- (unsigned char *)page_address(page) +
- offset;
-
- pinfo->nr_frags = nr_frags + 1 + skbinfo->nr_frags;
-
- frag->page.p = page;
- frag->page_offset = first_offset;
- skb_frag_size_set(frag, first_size);
-
- memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * skbinfo->nr_frags);
- /* We dont need to clear skbinfo->nr_frags here */
-
- delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct sk_buff));
- NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
- goto done;
- }
+static const u8 skb_ext_type_len[] = {
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info),
+#endif
+#ifdef CONFIG_XFRM
+ [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
+#endif
+#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
+ [TC_SKB_EXT] = SKB_EXT_CHUNKSIZEOF(struct tc_skb_ext),
+#endif
+#if IS_ENABLED(CONFIG_MPTCP)
+ [SKB_EXT_MPTCP] = SKB_EXT_CHUNKSIZEOF(struct mptcp_ext),
+#endif
+#if IS_ENABLED(CONFIG_MCTP_FLOWS)
+ [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow),
+#endif
+#if IS_ENABLED(CONFIG_INET_PSP)
+ [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext),
+#endif
+};
-merge:
- delta_truesize = skb->truesize;
- if (offset > headlen) {
- unsigned int eat = offset - headlen;
+static __always_inline unsigned int skb_ext_total_length(void)
+{
+ unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext);
+ int i;
- skbinfo->frags[0].page_offset += eat;
- skb_frag_size_sub(&skbinfo->frags[0], eat);
- skb->data_len -= eat;
- skb->len -= eat;
- offset = headlen;
- }
+ for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++)
+ l += skb_ext_type_len[i];
- __skb_pull(skb, offset);
+ return l;
+}
- if (NAPI_GRO_CB(p)->last == p)
- skb_shinfo(p)->frag_list = skb;
- else
- NAPI_GRO_CB(p)->last->next = skb;
- NAPI_GRO_CB(p)->last = skb;
- __skb_header_release(skb);
- lp = p;
+static void skb_extensions_init(void)
+{
+ BUILD_BUG_ON(SKB_EXT_NUM >= 8);
+#if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL)
+ BUILD_BUG_ON(skb_ext_total_length() > 255);
+#endif
-done:
- NAPI_GRO_CB(p)->count++;
- p->data_len += len;
- p->truesize += delta_truesize;
- p->len += len;
- if (lp != p) {
- lp->data_len += len;
- lp->truesize += delta_truesize;
- lp->len += len;
- }
- NAPI_GRO_CB(skb)->same_flow = 1;
- return 0;
+ skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache",
+ SKB_EXT_ALIGN_VALUE * skb_ext_total_length(),
+ 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ NULL);
}
-EXPORT_SYMBOL_GPL(skb_gro_receive);
+#else
+static void skb_extensions_init(void) {}
+#endif
+
+/* The SKB kmem_cache slab is critical for network performance. Never
+ * merge/alias the slab with similar sized objects. This avoids fragmentation
+ * that hurts performance of kmem_cache_{alloc,free}_bulk APIs.
+ */
+#ifndef CONFIG_SLUB_TINY
+#define FLAG_SKB_NO_MERGE SLAB_NO_MERGE
+#else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */
+#define FLAG_SKB_NO_MERGE 0
+#endif
void __init skb_init(void)
{
- skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache",
+ net_hotdata.skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache",
sizeof(struct sk_buff),
0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|
+ FLAG_SKB_NO_MERGE,
offsetof(struct sk_buff, cb),
sizeof_field(struct sk_buff, cb),
NULL);
- skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+ skbuff_cache_size = kmem_cache_size(net_hotdata.skbuff_cache);
+
+ net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
sizeof(struct sk_buff_fclones),
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
+ /* usercopy should only access first SKB_SMALL_HEAD_HEADROOM bytes.
+ * struct skb_shared_info is located at the end of skb->head,
+ * and should not be copied to/from user.
+ */
+ net_hotdata.skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head",
+ SKB_SMALL_HEAD_CACHE_SIZE,
+ 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+ 0,
+ SKB_SMALL_HEAD_HEADROOM,
+ NULL);
+ skb_extensions_init();
}
static int
@@ -3982,7 +5212,7 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
if (copy > len)
copy = len;
sg_set_page(&sg[elt], skb_frag_page(frag), copy,
- frag->page_offset+offset-start);
+ skb_frag_off(frag) + offset - start);
elt++;
if (!(len -= copy))
return elt;
@@ -4059,7 +5289,7 @@ EXPORT_SYMBOL_GPL(skb_to_sgvec);
* 3. sg_unmark_end
* 4. skb_to_sgvec(payload2)
*
- * When mapping mutilple payload conditionally, skb_to_sgvec_nomark
+ * When mapping multiple payloads conditionally, skb_to_sgvec_nomark
* is more preferable.
*/
int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
@@ -4099,7 +5329,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
* at the moment even if they are anonymous).
*/
if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) &&
- __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL)
+ !__pskb_pull_tail(skb, __skb_pagelen(skb)))
return -ENOMEM;
/* Easy case. Most of packets will go this way. */
@@ -4203,7 +5433,7 @@ static void skb_set_err_queue(struct sk_buff *skb)
int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
{
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
- (unsigned int)sk->sk_rcvbuf)
+ (unsigned int)READ_ONCE(sk->sk_rcvbuf))
return -ENOMEM;
skb_orphan(skb);
@@ -4217,7 +5447,7 @@ int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
skb_queue_tail(&sk->sk_error_queue, skb);
if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_error_report(sk);
+ sk_error_report(sk);
return 0;
}
EXPORT_SYMBOL(sock_queue_err_skb);
@@ -4235,12 +5465,15 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
bool icmp_next = false;
unsigned long flags;
+ if (skb_queue_empty_lockless(q))
+ return NULL;
+
spin_lock_irqsave(&q->lock, flags);
skb = __skb_dequeue(q);
if (skb && (skb_next = skb_peek(q))) {
icmp_next = is_icmp_err_skb(skb_next);
if (icmp_next)
- sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_origin;
+ sk->sk_err = SKB_EXT_ERR(skb_next)->ee.ee_errno;
}
spin_unlock_irqrestore(&q->lock, flags);
@@ -4248,7 +5481,7 @@ struct sk_buff *sock_dequeue_err_skb(struct sock *sk)
sk->sk_err = 0;
if (skb_next)
- sk->sk_error_report(sk);
+ sk_error_report(sk);
return skb;
}
@@ -4305,11 +5538,10 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb,
serr->ee.ee_info = tstype;
serr->opt_stats = opt_stats;
serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
- if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
+ if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
serr->ee.ee_data = skb_shinfo(skb)->tskey;
- if (sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM)
- serr->ee.ee_data -= sk->sk_tskey;
+ if (sk_is_tcp(sk))
+ serr->ee.ee_data -= atomic_read(&sk->sk_tskey);
}
err = sock_queue_err_skb(sk, skb);
@@ -4322,7 +5554,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
{
bool ret;
- if (likely(sysctl_tstamp_allow_data || tsonly))
+ if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data)))
return true;
read_lock_bh(&sk->sk_callback_lock);
@@ -4355,36 +5587,99 @@ err:
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ int tstype)
+{
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP;
+ case SCM_TSTAMP_SND:
+ return skb_shinfo(skb)->tx_flags & (hwtstamps ? SKBTX_HW_TSTAMP_NOBPF :
+ SKBTX_SW_TSTAMP);
+ case SCM_TSTAMP_ACK:
+ return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK;
+ case SCM_TSTAMP_COMPLETION:
+ return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP;
+ }
+
+ return false;
+}
+
+static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk,
+ int tstype)
+{
+ int op;
+
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ op = BPF_SOCK_OPS_TSTAMP_SCHED_CB;
+ break;
+ case SCM_TSTAMP_SND:
+ if (hwtstamps) {
+ op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB;
+ *skb_hwtstamps(skb) = *hwtstamps;
+ } else {
+ op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB;
+ }
+ break;
+ case SCM_TSTAMP_ACK:
+ op = BPF_SOCK_OPS_TSTAMP_ACK_CB;
+ break;
+ default:
+ return;
+ }
+
+ bpf_skops_tx_timestamping(sk, skb, op);
+}
+
void __skb_tstamp_tx(struct sk_buff *orig_skb,
+ const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
struct sock *sk, int tstype)
{
struct sk_buff *skb;
bool tsonly, opt_stats = false;
+ u32 tsflags;
if (!sk)
return;
- if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
+ if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF)
+ skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps,
+ sk, tstype);
+
+ if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype))
+ return;
+
+ tsflags = READ_ONCE(sk->sk_tsflags);
+ if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
return;
- tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
+ tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
if (!skb_may_tx_timestamp(sk, tsonly))
return;
if (tsonly) {
#ifdef CONFIG_INET
- if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
- sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM) {
- skb = tcp_get_timestamping_opt_stats(sk);
+ if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
+ sk_is_tcp(sk)) {
+ skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
+ ack_skb);
opt_stats = true;
} else
#endif
skb = alloc_skb(0, GFP_ATOMIC);
} else {
skb = skb_clone(orig_skb, GFP_ATOMIC);
+
+ if (skb_orphan_frags_rx(skb, GFP_ATOMIC)) {
+ kfree_skb(skb);
+ return;
+ }
}
if (!skb)
return;
@@ -4398,7 +5693,7 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (hwtstamps)
*skb_hwtstamps(skb) = *hwtstamps;
else
- skb->tstamp = ktime_get_real();
+ __net_timestamp(skb);
__skb_complete_tx_timestamp(skb, sk, tstype, opt_stats);
}
@@ -4407,11 +5702,12 @@ EXPORT_SYMBOL_GPL(__skb_tstamp_tx);
void skb_tstamp_tx(struct sk_buff *orig_skb,
struct skb_shared_hwtstamps *hwtstamps)
{
- return __skb_tstamp_tx(orig_skb, hwtstamps, orig_skb->sk,
+ return __skb_tstamp_tx(orig_skb, NULL, hwtstamps, orig_skb->sk,
SCM_TSTAMP_SND);
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);
+#ifdef CONFIG_WIRELESS
void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
{
struct sock *sk = skb->sk;
@@ -4437,6 +5733,7 @@ void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
+#endif /* CONFIG_WIRELESS */
/**
* skb_partial_csum_set - set up and verify partial csum values for packet
@@ -4452,16 +5749,18 @@ EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
*/
bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
{
- if (unlikely(start > skb_headlen(skb)) ||
- unlikely((int)start + off > skb_headlen(skb) - 2)) {
- net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n",
- start, off, skb_headlen(skb));
+ u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);
+ u32 csum_start = skb_headroom(skb) + (u32)start;
+
+ if (unlikely(csum_start >= U16_MAX || csum_end > skb_headlen(skb))) {
+ net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n",
+ start, off, skb_headroom(skb), skb_headlen(skb));
return false;
}
skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_headroom(skb) + start;
+ skb->csum_start = csum_start;
skb->csum_offset = off;
- skb_set_transport_header(skb, start);
+ skb->transport_header = csum_start;
return true;
}
EXPORT_SYMBOL_GPL(skb_partial_csum_set);
@@ -4493,9 +5792,9 @@ static __sum16 *skb_checksum_setup_ip(struct sk_buff *skb,
typeof(IPPROTO_IP) proto,
unsigned int off)
{
- switch (proto) {
- int err;
+ int err;
+ switch (proto) {
case IPPROTO_TCP:
err = skb_maybe_pull_tail(skb, off + sizeof(struct tcphdr),
off + MAX_TCP_HDR_LEN);
@@ -4538,7 +5837,7 @@ static int skb_checksum_setup_ipv4(struct sk_buff *skb, bool recalculate)
if (err < 0)
goto out;
- if (ip_hdr(skb)->frag_off & htons(IP_OFFSET | IP_MF))
+ if (ip_is_fragment(ip_hdr(skb)))
fragment = true;
off = ip_hdrlen(skb);
@@ -4795,7 +6094,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
{
if (head_stolen) {
skb_release_head_state(skb);
- kmem_cache_free(skbuff_head_cache, skb);
+ kmem_cache_free(net_hotdata.skbuff_cache, skb);
} else {
__kfree_skb(skb);
}
@@ -4820,7 +6119,19 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
if (skb_cloned(to))
return false;
- if (len <= skb_tailroom(to)) {
+ /* In general, avoid mixing page_pool and non-page_pool allocated
+ * pages within the same SKB. In theory we could take full
+ * references if @from is cloned and !@to->pp_recycle but it's
+ * tricky (due to potential race with the clone disappearing) and
+ * rare, so not worth dealing with.
+ */
+ if (to->pp_recycle != from->pp_recycle)
+ return false;
+
+ if (skb_frags_readable(from) != skb_frags_readable(to))
+ return false;
+
+ if (len <= skb_tailroom(to) && skb_frags_readable(from)) {
if (len)
BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
*delta_truesize = 0;
@@ -4874,8 +6185,10 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
/* if the skb is not cloned this does nothing
* since we set nr_frags to 0.
*/
- for (i = 0; i < from_shinfo->nr_frags; i++)
- __skb_frag_ref(&from_shinfo->frags[i]);
+ if (skb_pp_frag_ref(from)) {
+ for (i = 0; i < from_shinfo->nr_frags; i++)
+ __skb_frag_ref(&from_shinfo->frags[i]);
+ }
to->truesize += delta;
to->len += len;
@@ -4892,7 +6205,7 @@ EXPORT_SYMBOL(skb_try_coalesce);
* @skb: buffer to clean
* @xnet: packet is crossing netns
*
- * skb_scrub_packet can be used after encapsulating or decapsulting a packet
+ * skb_scrub_packet can be used after encapsulating or decapsulating a packet
* into/from a tunnel. Some information have to be cleared during these
* operations.
* skb_scrub_packet can also be used to clean a skb before injecting it in
@@ -4905,161 +6218,28 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
skb->skb_iif = 0;
skb->ignore_df = 0;
skb_dst_drop(skb);
- secpath_reset(skb);
- nf_reset(skb);
+ skb_ext_reset(skb);
+ nf_reset_ct(skb);
nf_reset_trace(skb);
+#ifdef CONFIG_NET_SWITCHDEV
+ skb->offload_fwd_mark = 0;
+ skb->offload_l3_fwd_mark = 0;
+#endif
+ ipvs_reset(skb);
+
if (!xnet)
return;
- ipvs_reset(skb);
skb->mark = 0;
- skb->tstamp = 0;
+ skb_clear_tstamp(skb);
}
EXPORT_SYMBOL_GPL(skb_scrub_packet);
-/**
- * skb_gso_transport_seglen - Return length of individual segments of a gso packet
- *
- * @skb: GSO skb
- *
- * skb_gso_transport_seglen is used to determine the real size of the
- * individual segments, including Layer4 headers (TCP/UDP).
- *
- * The MAC/L2 or network (IP, IPv6) headers are not accounted for.
- */
-static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb)
-{
- const struct skb_shared_info *shinfo = skb_shinfo(skb);
- unsigned int thlen = 0;
-
- if (skb->encapsulation) {
- thlen = skb_inner_transport_header(skb) -
- skb_transport_header(skb);
-
- if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
- thlen += inner_tcp_hdrlen(skb);
- } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
- thlen = tcp_hdrlen(skb);
- } else if (unlikely(skb_is_gso_sctp(skb))) {
- thlen = sizeof(struct sctphdr);
- } else if (shinfo->gso_type & SKB_GSO_UDP_L4) {
- thlen = sizeof(struct udphdr);
- }
- /* UFO sets gso_size to the size of the fragmentation
- * payload, i.e. the size of the L4 (UDP) header is already
- * accounted for.
- */
- return thlen + shinfo->gso_size;
-}
-
-/**
- * skb_gso_network_seglen - Return length of individual segments of a gso packet
- *
- * @skb: GSO skb
- *
- * skb_gso_network_seglen is used to determine the real size of the
- * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP).
- *
- * The MAC/L2 header is not accounted for.
- */
-static unsigned int skb_gso_network_seglen(const struct sk_buff *skb)
-{
- unsigned int hdr_len = skb_transport_header(skb) -
- skb_network_header(skb);
-
- return hdr_len + skb_gso_transport_seglen(skb);
-}
-
-/**
- * skb_gso_mac_seglen - Return length of individual segments of a gso packet
- *
- * @skb: GSO skb
- *
- * skb_gso_mac_seglen is used to determine the real size of the
- * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4
- * headers (TCP/UDP).
- */
-static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb)
-{
- unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
-
- return hdr_len + skb_gso_transport_seglen(skb);
-}
-
-/**
- * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS
- *
- * There are a couple of instances where we have a GSO skb, and we
- * want to determine what size it would be after it is segmented.
- *
- * We might want to check:
- * - L3+L4+payload size (e.g. IP forwarding)
- * - L2+L3+L4+payload size (e.g. sanity check before passing to driver)
- *
- * This is a helper to do that correctly considering GSO_BY_FRAGS.
- *
- * @seg_len: The segmented length (from skb_gso_*_seglen). In the
- * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS].
- *
- * @max_len: The maximum permissible length.
- *
- * Returns true if the segmented length <= max length.
- */
-static inline bool skb_gso_size_check(const struct sk_buff *skb,
- unsigned int seg_len,
- unsigned int max_len) {
- const struct skb_shared_info *shinfo = skb_shinfo(skb);
- const struct sk_buff *iter;
-
- if (shinfo->gso_size != GSO_BY_FRAGS)
- return seg_len <= max_len;
-
- /* Undo this so we can re-use header sizes */
- seg_len -= GSO_BY_FRAGS;
-
- skb_walk_frags(skb, iter) {
- if (seg_len + skb_headlen(iter) > max_len)
- return false;
- }
-
- return true;
-}
-
-/**
- * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU?
- *
- * @skb: GSO skb
- * @mtu: MTU to validate against
- *
- * skb_gso_validate_network_len validates if a given skb will fit a
- * wanted MTU once split. It considers L3 headers, L4 headers, and the
- * payload.
- */
-bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu)
-{
- return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu);
-}
-EXPORT_SYMBOL_GPL(skb_gso_validate_network_len);
-
-/**
- * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length?
- *
- * @skb: GSO skb
- * @len: length to validate against
- *
- * skb_gso_validate_mac_len validates if a given skb will fit a wanted
- * length once split, including L2, L3 and L4 headers and the payload.
- */
-bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len)
-{
- return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len);
-}
-EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len);
-
static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
{
- int mac_len;
+ int mac_len, meta_len;
+ void *meta;
if (skb_cow(skb, skb_headroom(skb)) < 0) {
kfree_skb(skb);
@@ -5071,6 +6251,13 @@ static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
memmove(skb_mac_header(skb) + VLAN_HLEN, skb_mac_header(skb),
mac_len - VLAN_HLEN - ETH_TLEN);
}
+
+ meta_len = skb_metadata_len(skb);
+ if (meta_len) {
+ meta = skb_metadata_end(skb) - meta_len;
+ memmove(meta + VLAN_HLEN, meta, meta_len);
+ }
+
skb->mac_header += VLAN_HLEN;
return skb;
}
@@ -5088,8 +6275,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
skb = skb_share_check(skb, GFP_ATOMIC);
if (unlikely(!skb))
goto err_free;
-
- if (unlikely(!pskb_may_pull(skb, VLAN_HLEN)))
+ /* We may access the two bytes after vlan_hdr in vlan_set_encap_proto(). */
+ if (unlikely(!pskb_may_pull(skb, VLAN_HLEN + sizeof(unsigned short))))
goto err_free;
vhdr = (struct vlan_hdr *)skb->data;
@@ -5104,7 +6291,8 @@ struct sk_buff *skb_vlan_untag(struct sk_buff *skb)
goto err_free;
skb_reset_network_header(skb);
- skb_reset_transport_header(skb);
+ if (!skb_transport_header_was_set(skb))
+ skb_reset_transport_header(skb);
skb_reset_mac_len(skb);
return skb;
@@ -5115,7 +6303,7 @@ err_free:
}
EXPORT_SYMBOL(skb_vlan_untag);
-int skb_ensure_writable(struct sk_buff *skb, int write_len)
+int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len)
{
if (!pskb_may_pull(skb, write_len))
return -ENOMEM;
@@ -5127,12 +6315,36 @@ int skb_ensure_writable(struct sk_buff *skb, int write_len)
}
EXPORT_SYMBOL(skb_ensure_writable);
+int skb_ensure_writable_head_tail(struct sk_buff *skb, struct net_device *dev)
+{
+ int needed_headroom = dev->needed_headroom;
+ int needed_tailroom = dev->needed_tailroom;
+
+ /* For tail taggers, we need to pad short frames ourselves, to ensure
+ * that the tail tag does not fail at its role of being at the end of
+ * the packet, once the conduit interface pads the frame. Account for
+ * that pad length here, and pad later.
+ */
+ if (unlikely(needed_tailroom && skb->len < ETH_ZLEN))
+ needed_tailroom += ETH_ZLEN - skb->len;
+ /* skb_headroom() returns unsigned int... */
+ needed_headroom = max_t(int, needed_headroom - skb_headroom(skb), 0);
+ needed_tailroom = max_t(int, needed_tailroom - skb_tailroom(skb), 0);
+
+ if (likely(!needed_headroom && !needed_tailroom && !skb_cloned(skb)))
+ /* No reallocation needed, yay! */
+ return 0;
+
+ return pskb_expand_head(skb, needed_headroom, needed_tailroom,
+ GFP_ATOMIC);
+}
+EXPORT_SYMBOL(skb_ensure_writable_head_tail);
+
/* remove VLAN header from packet and update csum accordingly.
* expects a non skb_vlan_tag_present skb with a vlan tag payload
*/
int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
{
- struct vlan_hdr *vhdr;
int offset = skb->data - skb_mac_header(skb);
int err;
@@ -5148,13 +6360,8 @@ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
- vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
- *vlan_tci = ntohs(vhdr->h_vlan_TCI);
+ vlan_remove_tag(skb, vlan_tci);
- memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
- __skb_pull(skb, VLAN_HLEN);
-
- vlan_set_encap_proto(skb, vhdr);
skb->mac_header += VLAN_HLEN;
if (skb_network_offset(skb) < ETH_HLEN)
@@ -5176,7 +6383,7 @@ int skb_vlan_pop(struct sk_buff *skb)
int err;
if (likely(skb_vlan_tag_present(skb))) {
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
} else {
if (unlikely(!eth_type_vlan(skb->protocol)))
return 0;
@@ -5220,7 +6427,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
return err;
skb->protocol = skb->vlan_proto;
- skb->mac_len += VLAN_HLEN;
+ skb->network_header -= VLAN_HLEN;
skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
}
@@ -5230,11 +6437,257 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
EXPORT_SYMBOL(skb_vlan_push);
/**
+ * skb_eth_pop() - Drop the Ethernet header at the head of a packet
+ *
+ * @skb: Socket buffer to modify
+ *
+ * Drop the Ethernet header of @skb.
+ *
+ * Expects that skb->data points to the mac header and that no VLAN tags are
+ * present.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_eth_pop(struct sk_buff *skb)
+{
+ if (!pskb_may_pull(skb, ETH_HLEN) || skb_vlan_tagged(skb) ||
+ skb_network_offset(skb) < ETH_HLEN)
+ return -EPROTO;
+
+ skb_pull_rcsum(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_eth_pop);
+
+/**
+ * skb_eth_push() - Add a new Ethernet header at the head of a packet
+ *
+ * @skb: Socket buffer to modify
+ * @dst: Destination MAC address of the new header
+ * @src: Source MAC address of the new header
+ *
+ * Prepend @skb with a new Ethernet header.
+ *
+ * Expects that skb->data points to the mac header, which must be empty.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_eth_push(struct sk_buff *skb, const unsigned char *dst,
+ const unsigned char *src)
+{
+ struct ethhdr *eth;
+ int err;
+
+ if (skb_network_offset(skb) || skb_vlan_tag_present(skb))
+ return -EPROTO;
+
+ err = skb_cow_head(skb, sizeof(*eth));
+ if (err < 0)
+ return err;
+
+ skb_push(skb, sizeof(*eth));
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ eth = eth_hdr(skb);
+ ether_addr_copy(eth->h_dest, dst);
+ ether_addr_copy(eth->h_source, src);
+ eth->h_proto = skb->protocol;
+
+ skb_postpush_rcsum(skb, eth, sizeof(*eth));
+
+ return 0;
+}
+EXPORT_SYMBOL(skb_eth_push);
+
+/* Update the ethertype of hdr and the skb csum value if required. */
+static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
+ __be16 ethertype)
+{
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ __be16 diff[] = { ~hdr->h_proto, ethertype };
+
+ skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
+ }
+
+ hdr->h_proto = ethertype;
+}
+
+/**
+ * skb_mpls_push() - push a new MPLS header after mac_len bytes from start of
+ * the packet
+ *
+ * @skb: buffer
+ * @mpls_lse: MPLS label stack entry to push
+ * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
+ * @mac_len: length of the MAC header
+ * @ethernet: flag to indicate if the resulting packet after skb_mpls_push is
+ * ethernet
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto,
+ int mac_len, bool ethernet)
+{
+ struct mpls_shim_hdr *lse;
+ int err;
+
+ if (unlikely(!eth_p_mpls(mpls_proto)))
+ return -EINVAL;
+
+ /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
+ if (skb->encapsulation)
+ return -EINVAL;
+
+ err = skb_cow_head(skb, MPLS_HLEN);
+ if (unlikely(err))
+ return err;
+
+ if (!skb->inner_protocol) {
+ skb_set_inner_network_header(skb, skb_network_offset(skb));
+ skb_set_inner_protocol(skb, skb->protocol);
+ }
+
+ skb_push(skb, MPLS_HLEN);
+ memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
+ mac_len);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+ skb_reset_mac_len(skb);
+
+ lse = mpls_hdr(skb);
+ lse->label_stack_entry = mpls_lse;
+ skb_postpush_rcsum(skb, lse, MPLS_HLEN);
+
+ if (ethernet && mac_len >= ETH_HLEN)
+ skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
+ skb->protocol = mpls_proto;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_push);
+
+/**
+ * skb_mpls_pop() - pop the outermost MPLS header
+ *
+ * @skb: buffer
+ * @next_proto: ethertype of header after popped MPLS header
+ * @mac_len: length of the MAC header
+ * @ethernet: flag to indicate if the packet is ethernet
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto, int mac_len,
+ bool ethernet)
+{
+ int err;
+
+ if (unlikely(!eth_p_mpls(skb->protocol)))
+ return 0;
+
+ err = skb_ensure_writable(skb, mac_len + MPLS_HLEN);
+ if (unlikely(err))
+ return err;
+
+ skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
+ memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
+ mac_len);
+
+ __skb_pull(skb, MPLS_HLEN);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, mac_len);
+
+ if (ethernet && mac_len >= ETH_HLEN) {
+ struct ethhdr *hdr;
+
+ /* use mpls_hdr() to get ethertype to account for VLANs. */
+ hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
+ skb_mod_eth_type(skb, hdr, next_proto);
+ }
+ skb->protocol = next_proto;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_pop);
+
+/**
+ * skb_mpls_update_lse() - modify outermost MPLS header and update csum
+ *
+ * @skb: buffer
+ * @mpls_lse: new MPLS label stack entry to update to
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
+{
+ int err;
+
+ if (unlikely(!eth_p_mpls(skb->protocol)))
+ return -EINVAL;
+
+ err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
+ if (unlikely(err))
+ return err;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse };
+
+ skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
+ }
+
+ mpls_hdr(skb)->label_stack_entry = mpls_lse;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_update_lse);
+
+/**
+ * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header
+ *
+ * @skb: buffer
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_dec_ttl(struct sk_buff *skb)
+{
+ u32 lse;
+ u8 ttl;
+
+ if (unlikely(!eth_p_mpls(skb->protocol)))
+ return -EINVAL;
+
+ if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN))
+ return -ENOMEM;
+
+ lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
+ ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
+ if (!--ttl)
+ return -EINVAL;
+
+ lse &= ~MPLS_LS_TTL_MASK;
+ lse |= ttl << MPLS_LS_TTL_SHIFT;
+
+ return skb_mpls_update_lse(skb, cpu_to_be32(lse));
+}
+EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
+
+/**
* alloc_skb_with_frags - allocate skb with page frags
*
* @header_len: size of linear part
* @data_len: needed length in frags
- * @max_page_order: max page order desired.
+ * @order: max page order desired.
* @errcode: pointer to error code if any
* @gfp_mask: allocation mask
*
@@ -5242,61 +6695,50 @@ EXPORT_SYMBOL(skb_vlan_push);
*/
struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
unsigned long data_len,
- int max_page_order,
+ int order,
int *errcode,
gfp_t gfp_mask)
{
- int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
unsigned long chunk;
struct sk_buff *skb;
struct page *page;
- gfp_t gfp_head;
- int i;
+ int nr_frags = 0;
*errcode = -EMSGSIZE;
- /* Note this test could be relaxed, if we succeed to allocate
- * high order pages...
- */
- if (npages > MAX_SKB_FRAGS)
+ if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order)))
return NULL;
- gfp_head = gfp_mask;
- if (gfp_head & __GFP_DIRECT_RECLAIM)
- gfp_head |= __GFP_RETRY_MAYFAIL;
-
*errcode = -ENOBUFS;
- skb = alloc_skb(header_len, gfp_head);
+ skb = alloc_skb(header_len, gfp_mask);
if (!skb)
return NULL;
- skb->truesize += npages << PAGE_SHIFT;
-
- for (i = 0; npages > 0; i++) {
- int order = max_page_order;
-
- while (order) {
- if (npages >= 1 << order) {
- page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP |
- __GFP_NOWARN,
- order);
- if (page)
- goto fill_page;
- /* Do not retry other high order allocations */
- order = 1;
- max_page_order = 0;
- }
+ while (data_len) {
+ if (nr_frags == MAX_SKB_FRAGS)
+ goto failure;
+ while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order))
order--;
+
+ if (order) {
+ page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_COMP |
+ __GFP_NOWARN,
+ order);
+ if (!page) {
+ order--;
+ continue;
+ }
+ } else {
+ page = alloc_page(gfp_mask);
+ if (!page)
+ goto failure;
}
- page = alloc_page(gfp_mask);
- if (!page)
- goto failure;
-fill_page:
chunk = min_t(unsigned long, data_len,
PAGE_SIZE << order);
- skb_fill_page_desc(skb, i, page, 0, chunk);
+ skb_fill_page_desc(skb, nr_frags, page, 0, chunk);
+ nr_frags++;
+ skb->truesize += (PAGE_SIZE << order);
data_len -= chunk;
- npages -= 1 << order;
}
return skb;
@@ -5311,21 +6753,17 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
const int headlen, gfp_t gfp_mask)
{
int i;
- int size = skb_end_offset(skb);
+ unsigned int size = skb_end_offset(skb);
int new_hlen = headlen - off;
u8 *data;
- size = SKB_DATA_ALIGN(size);
-
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- data = kmalloc_reserve(size +
- SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
- gfp_mask, NUMA_NO_NODE, NULL);
+
+ data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
return -ENOMEM;
-
- size = SKB_WITH_OVERHEAD(ksize(data));
+ size = SKB_WITH_OVERHEAD(size);
/* Copy real data, and all frags */
skb_copy_from_linear_data_offset(skb, off, data, new_hlen);
@@ -5338,14 +6776,14 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
if (skb_cloned(skb)) {
/* drop the old head gracefully */
if (skb_orphan_frags(skb, gfp_mask)) {
- kfree(data);
+ skb_kfree_head(data, size);
return -ENOMEM;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
skb_frag_ref(skb, i);
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- skb_release_data(skb);
+ skb_release_data(skb, SKB_CONSUMED);
} else {
/* we can reuse existing recount- all we did was
* relocate values
@@ -5356,11 +6794,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
skb->head = data;
skb->data = data;
skb->head_frag = 0;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- skb->end = size;
-#else
- skb->end = skb->head + size;
-#endif
+ skb_set_end_offset(skb, size);
skb_set_tail_pointer(skb, skb_headlen(skb));
skb_headers_offset_update(skb, 0);
skb->cloned = 0;
@@ -5376,8 +6810,7 @@ static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
/* carve out the first eat bytes from skb's frag_list. May recurse into
* pskb_carve()
*/
-static int pskb_carve_frag_list(struct sk_buff *skb,
- struct skb_shared_info *shinfo, int eat,
+static int pskb_carve_frag_list(struct skb_shared_info *shinfo, int eat,
gfp_t gfp_mask)
{
struct sk_buff *list = shinfo->frag_list;
@@ -5417,7 +6850,7 @@ static int pskb_carve_frag_list(struct sk_buff *skb,
/* Free pulled out fragments. */
while ((list = shinfo->frag_list) != insp) {
shinfo->frag_list = list->next;
- kfree_skb(list);
+ consume_skb(list);
}
/* And insert new clone at head. */
if (clone) {
@@ -5434,28 +6867,23 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
int pos, gfp_t gfp_mask)
{
int i, k = 0;
- int size = skb_end_offset(skb);
+ unsigned int size = skb_end_offset(skb);
u8 *data;
const int nfrags = skb_shinfo(skb)->nr_frags;
struct skb_shared_info *shinfo;
- size = SKB_DATA_ALIGN(size);
-
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- data = kmalloc_reserve(size +
- SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
- gfp_mask, NUMA_NO_NODE, NULL);
+
+ data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
return -ENOMEM;
-
- size = SKB_WITH_OVERHEAD(ksize(data));
+ size = SKB_WITH_OVERHEAD(size);
memcpy((struct skb_shared_info *)(data + size),
- skb_shinfo(skb), offsetof(struct skb_shared_info,
- frags[skb_shinfo(skb)->nr_frags]));
+ skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
if (skb_orphan_frags(skb, gfp_mask)) {
- kfree(data);
+ skb_kfree_head(data, size);
return -ENOMEM;
}
shinfo = (struct skb_shared_info *)(data + size);
@@ -5474,7 +6902,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
* where splitting is expensive.
* 2. Split is accurately. We make this.
*/
- shinfo->frags[0].page_offset += off - pos;
+ skb_frag_off_add(&shinfo->frags[0], off - pos);
skb_frag_size_sub(&shinfo->frags[0], off - pos);
}
skb_frag_ref(skb, i);
@@ -5486,20 +6914,20 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- if (k == 0) {
- /* split line is in frag list */
- pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask);
+ /* split line is in frag list */
+ if (k == 0 && pskb_carve_frag_list(shinfo, off - pos, gfp_mask)) {
+ /* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
+ if (skb_has_frag_list(skb))
+ kfree_skb_list(skb_shinfo(skb)->frag_list);
+ skb_kfree_head(data, size);
+ return -ENOMEM;
}
- skb_release_data(skb);
+ skb_release_data(skb, SKB_CONSUMED);
skb->head = data;
skb->head_frag = 0;
skb->data = data;
-#ifdef NET_SKBUFF_DATA_USES_OFFSET
- skb->end = size;
-#else
- skb->end = skb->head + size;
-#endif
+ skb_set_end_offset(skb, size);
skb_reset_tail_pointer(skb);
skb_headers_offset_update(skb, 0);
skb->cloned = 0;
@@ -5558,7 +6986,7 @@ void skb_condense(struct sk_buff *skb)
{
if (skb->data_len) {
if (skb->data_len > skb->end - skb->tail ||
- skb_cloned(skb))
+ skb_cloned(skb) || !skb_frags_readable(skb))
return;
/* Nice, we can free page frag(s) right now */
@@ -5573,3 +7001,419 @@ void skb_condense(struct sk_buff *skb)
*/
skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
}
+EXPORT_SYMBOL(skb_condense);
+
+#ifdef CONFIG_SKB_EXTENSIONS
+static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
+{ /* Address of extension @id inside @ext: offset[id] is scaled by SKB_EXT_ALIGN_VALUE bytes. */
+ return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
+}
+
+/**
+ * __skb_ext_alloc - allocate a new skb extensions storage
+ *
+ * @flags: allocation flags handed to kmem_cache_alloc(); see kmalloc().
+ *
+ * On success the offset table is zeroed and the reference count is set to 1.
+ *
+ * Returns the newly allocated pointer, or NULL if the allocation failed.
+ * The pointer can later be attached to an skb via __skb_ext_set().
+ * Note: the caller must treat the skb_ext as opaque data.
+ */
+struct skb_ext *__skb_ext_alloc(gfp_t flags)
+{
+ struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags);
+
+ if (new) {
+ memset(new->offset, 0, sizeof(new->offset));
+ refcount_set(&new->refcnt, 1);
+ }
+
+ return new;
+}
+/* Return an skb_ext the caller may modify.
+ * @old itself is returned when its refcount is 1. Otherwise a new object is
+ * allocated with GFP_ATOMIC, the first old->chunks * SKB_EXT_ALIGN_VALUE
+ * bytes of @old are copied into it, its refcount is set to 1, extra
+ * references are taken for the objects the copied SEC_PATH / MCTP data
+ * points to (when configured and flagged in @old_active), and one
+ * reference on @old is dropped.
+ * Returns NULL if the allocation fails; @old is left untouched then.
+ */
+static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
+ unsigned int old_active)
+{
+ struct skb_ext *new;
+
+ if (refcount_read(&old->refcnt) == 1)
+ return old;
+
+ new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
+ if (!new)
+ return NULL;
+
+ memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
+ refcount_set(&new->refcnt, 1);
+
+#ifdef CONFIG_XFRM
+ if (old_active & (1 << SKB_EXT_SEC_PATH)) {
+ struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
+ unsigned int i;
+
+ for (i = 0; i < sp->len; i++) /* the copy now points at these states too */
+ xfrm_state_hold(sp->xvec[i]);
+ }
+#endif
+#ifdef CONFIG_MCTP_FLOWS
+ if (old_active & (1 << SKB_EXT_MCTP)) {
+ struct mctp_flow *flow = skb_ext_get_ptr(old, SKB_EXT_MCTP);
+
+ if (flow->key) /* the copy shares the same key pointer */
+ refcount_inc(&flow->key->refs);
+ }
+#endif
+ __skb_ext_put(old);
+ return new;
+}
+
+/**
+ * __skb_ext_set - attach the specified extension storage to this skb
+ * @skb: buffer
+ * @id: extension id
+ * @ext: extension storage previously allocated via __skb_ext_alloc()
+ *
+ * Existing extensions, if any, are cleared: the skb's current storage is
+ * released with skb_ext_put() and active_extensions is overwritten so that
+ * only @id is active. The extension is placed right after the skb_ext
+ * header in @ext.
+ *
+ * Returns the pointer to the extension.
+ */
+void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
+ struct skb_ext *ext)
+{
+ unsigned int newlen, newoff = SKB_EXT_CHUNKSIZEOF(*ext);
+
+ skb_ext_put(skb);
+ newlen = newoff + skb_ext_type_len[id];
+ ext->chunks = newlen;
+ ext->offset[id] = newoff;
+ skb->extensions = ext;
+ skb->active_extensions = 1 << id;
+ return skb_ext_get_ptr(ext, id);
+}
+EXPORT_SYMBOL_NS_GPL(__skb_ext_set, "NETDEV_INTERNAL");
+
+/**
+ * skb_ext_add - allocate space for given extension, COW if needed
+ * @skb: buffer
+ * @id: extension to allocate space for
+ *
+ * Allocates enough space for the given extension.
+ * If the extension is already present, a pointer to that extension
+ * is returned.
+ *
+ * If the skb was cloned, COW applies and the returned memory can be
+ * modified without changing the extension space of cloned buffers.
+ *
+ * On success @id is marked active on @skb and skb->slow_gro is set.
+ *
+ * Returns pointer to the extension or NULL on allocation failure.
+ */
+void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
+{
+ struct skb_ext *new, *old = NULL;
+ unsigned int newlen, newoff;
+
+ if (skb->active_extensions) {
+ old = skb->extensions;
+
+ new = skb_ext_maybe_cow(old, skb->active_extensions);
+ if (!new)
+ return NULL;
+
+ if (__skb_ext_exist(new, id))
+ goto set_active;
+
+ newoff = new->chunks; /* append after the space already in use */
+ } else {
+ newoff = SKB_EXT_CHUNKSIZEOF(*new); /* presumably sizeof-based, so the unset 'new' is not read -- confirm */
+
+ new = __skb_ext_alloc(GFP_ATOMIC);
+ if (!new)
+ return NULL;
+ }
+
+ newlen = newoff + skb_ext_type_len[id];
+ new->chunks = newlen;
+ new->offset[id] = newoff;
+set_active:
+ skb->slow_gro = 1;
+ skb->extensions = new;
+ skb->active_extensions |= 1 << id;
+ return skb_ext_get_ptr(new, id);
+}
+EXPORT_SYMBOL(skb_ext_add);
+
+#ifdef CONFIG_XFRM
+static void skb_ext_put_sp(struct sec_path *sp)
+{ /* Put every xfrm state in sp->xvec[0 .. sp->len - 1]; sp->len itself is not changed here. */
+ unsigned int i;
+
+ for (i = 0; i < sp->len; i++)
+ xfrm_state_put(sp->xvec[i]);
+}
+#endif
+
+#ifdef CONFIG_MCTP_FLOWS
+static void skb_ext_put_mctp(struct mctp_flow *flow)
+{ /* Unref the flow's key, if one is set. */
+ if (flow->key)
+ mctp_key_unref(flow->key);
+}
+#endif
+/* Deactivate extension @id on @skb.
+ * When no extension remains active, the storage is detached from the skb
+ * and the skb's reference on it is dropped. Otherwise, with CONFIG_XFRM,
+ * removing SKB_EXT_SEC_PATH from storage whose refcount is 1 also puts the
+ * xfrm states and resets sp->len to 0.
+ */
+void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
+{
+ struct skb_ext *ext = skb->extensions;
+
+ skb->active_extensions &= ~(1 << id);
+ if (skb->active_extensions == 0) {
+ skb->extensions = NULL;
+ __skb_ext_put(ext);
+#ifdef CONFIG_XFRM
+ } else if (id == SKB_EXT_SEC_PATH &&
+ refcount_read(&ext->refcnt) == 1) {
+ struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);
+
+ skb_ext_put_sp(sp);
+ sp->len = 0;
+#endif
+ }
+}
+EXPORT_SYMBOL(__skb_ext_del);
+/* Drop one reference on @ext.
+ * On the last reference, release what the SEC_PATH / MCTP extensions hold
+ * (when configured and present in @ext) and return the storage to
+ * skbuff_ext_cache.
+ */
+void __skb_ext_put(struct skb_ext *ext)
+{
+ /* If this is last clone, nothing can increment
+ * it after check passes. Avoids one atomic op.
+ */
+ if (refcount_read(&ext->refcnt) == 1)
+ goto free_now;
+
+ if (!refcount_dec_and_test(&ext->refcnt))
+ return;
+free_now:
+#ifdef CONFIG_XFRM
+ if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
+ skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
+#endif
+#ifdef CONFIG_MCTP_FLOWS
+ if (__skb_ext_exist(ext, SKB_EXT_MCTP))
+ skb_ext_put_mctp(skb_ext_get_ptr(ext, SKB_EXT_MCTP));
+#endif
+
+ kmem_cache_free(skbuff_ext_cache, ext);
+}
+EXPORT_SYMBOL(__skb_ext_put);
+#endif /* CONFIG_SKB_EXTENSIONS */
+/* Free @skb: via __kfree_skb() when skb->fclone is not
+ * SKB_FCLONE_UNAVAILABLE, otherwise via __napi_kfree_skb() with bottom
+ * halves disabled around the call.
+ */
+static void kfree_skb_napi_cache(struct sk_buff *skb)
+{
+ /* if SKB is a clone, don't handle this case */
+ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
+ __kfree_skb(skb);
+ return;
+ }
+
+ local_bh_disable();
+ __napi_kfree_skb(skb, SKB_CONSUMED);
+ local_bh_enable();
+}
+
+/**
+ * skb_attempt_defer_free - queue skb for remote freeing
+ * @skb: buffer
+ *
+ * Put @skb in a per-cpu list, using the cpu which
+ * allocated the skb/pages to reduce false sharing
+ * and memory zone spinlock contention.
+ *
+ * The skb is instead freed right away through kfree_skb_napi_cache() when
+ * skb->alloc_cpu is the current cpu, is out of range or offline, or when
+ * the incremented deferred count reaches sysctl_skb_defer_max.
+ */
+void skb_attempt_defer_free(struct sk_buff *skb)
+{
+ struct skb_defer_node *sdn;
+ unsigned long defer_count;
+ int cpu = skb->alloc_cpu;
+ unsigned int defer_max;
+ bool kick;
+
+ if (cpu == raw_smp_processor_id() ||
+ WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
+ !cpu_online(cpu)) {
+nodefer: kfree_skb_napi_cache(skb);
+ return;
+ }
+
+ DEBUG_NET_WARN_ON_ONCE(skb_dst(skb));
+ DEBUG_NET_WARN_ON_ONCE(skb->destructor);
+ DEBUG_NET_WARN_ON_ONCE(skb_nfct(skb));
+
+ sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id(); /* target cpu's array, indexed by the local NUMA node */
+
+ defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);
+ defer_count = atomic_long_inc_return(&sdn->defer_count);
+
+ if (defer_count >= defer_max) /* NOTE(review): the count stays incremented on this path -- presumably reset by the purge side; confirm */
+ goto nodefer;
+
+ llist_add(&skb->ll_node, &sdn->defer_list);
+
+ /* Send an IPI every time queue reaches half capacity. */
+ kick = (defer_count - 1) == (defer_max >> 1);
+
+ /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
+ * if we are unlucky enough (this seems very unlikely).
+ */
+ if (unlikely(kick))
+ kick_defer_list_purge(cpu);
+}
+/* Checksum @len bytes at @offset within @page (mapped temporarily with
+ * kmap_local_page()) and fold the result into skb->csum as a block
+ * located at the current skb->len.
+ */
+static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
+ size_t offset, size_t len)
+{
+ const char *kaddr;
+ __wsum csum;
+
+ kaddr = kmap_local_page(page);
+ csum = csum_partial(kaddr + offset, len, 0);
+ kunmap_local(kaddr);
+ skb->csum = csum_block_add(skb->csum, csum, skb->len);
+}
+
+/**
+ * skb_splice_from_iter - Splice (or copy) pages to skbuff
+ * @skb: The buffer to add pages to
+ * @iter: Iterator representing the pages to be added
+ * @maxsize: Maximum number of bytes to be added
+ *
+ * This is a common helper function for supporting MSG_SPLICE_PAGES. It
+ * extracts pages from an iterator and adds them to the socket buffer if
+ * possible, copying them to fragments if not possible (such as if they're slab
+ * pages).
+ *
+ * skb->len is advanced by the amount spliced before returning. When the
+ * skb's ip_summed is CHECKSUM_NONE, skb->csum is updated for each part.
+ *
+ * Returns the amount of data spliced/copied or -EMSGSIZE if there's
+ * insufficient space in the buffer to transfer anything. Other negative
+ * errors (-EIO, or whatever page extraction / fragment append returned)
+ * are reported only when nothing at all was spliced.
+ */
+ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
+ ssize_t maxsize)
+{
+ size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags);
+ struct page *pages[8], **ppages = pages;
+ ssize_t spliced = 0, ret = 0;
+ unsigned int i;
+
+ while (iter->count > 0) {
+ ssize_t space, nr, len;
+ size_t off;
+
+ ret = -EMSGSIZE;
+ space = frag_limit - skb_shinfo(skb)->nr_frags;
+ if (space < 0)
+ break;
+
+ /* We might be able to coalesce without increasing nr_frags */
+ nr = clamp_t(size_t, space, 1, ARRAY_SIZE(pages));
+
+ len = iov_iter_extract_pages(iter, &ppages, maxsize, nr, 0, &off);
+ if (len <= 0) {
+ ret = len ?: -EIO;
+ break;
+ }
+
+ i = 0;
+ do {
+ struct page *page = pages[i++];
+ size_t part = min_t(size_t, PAGE_SIZE - off, len); /* bytes taken from this page */
+
+ ret = -EIO;
+ if (WARN_ON_ONCE(!sendpage_ok(page)))
+ goto out;
+
+ ret = skb_append_pagefrags(skb, page, off, part,
+ frag_limit);
+ if (ret < 0) {
+ iov_iter_revert(iter, len); /* hand back the bytes of this batch not yet appended */
+ goto out;
+ }
+
+ if (skb->ip_summed == CHECKSUM_NONE)
+ skb_splice_csum_page(skb, page, off, part);
+
+ off = 0; /* only the first page of a batch starts mid-page */
+ spliced += part;
+ maxsize -= part;
+ len -= part;
+ } while (len > 0);
+
+ if (maxsize <= 0)
+ break;
+ }
+
+out:
+ skb_len_add(skb, spliced);
+ return spliced ?: ret;
+}
+EXPORT_SYMBOL(skb_splice_from_iter);
+/* Iterator step for kernel-memory sources: copy @len bytes from @iter_from
+ * to @to + @progress while checksumming, and fold the partial sum into the
+ * __wsum that @priv2 points to at block offset @progress.
+ * Always returns 0 (presumably "no bytes left uncopied" in the iterate
+ * helper's convention -- confirm against iterate_and_advance2()).
+ */
+static __always_inline
+size_t memcpy_from_iter_csum(void *iter_from, size_t progress,
+ size_t len, void *to, void *priv2)
+{
+ __wsum *csum = priv2;
+ __wsum next = csum_partial_copy_nocheck(iter_from, to + progress, len);
+
+ *csum = csum_block_add(*csum, next, progress);
+ return 0;
+}
+/* Iterator step for user-memory sources: copy-and-checksum @len bytes from
+ * @iter_from to @to + @progress and fold the result into the __wsum that
+ * @priv2 points to at block offset @progress.
+ * Returns 0 when csum_and_copy_from_user() gave a non-zero sum, else @len
+ * (presumably a zero sum signals a faulted copy and the return value is
+ * the number of bytes not copied -- confirm).
+ */
+static __always_inline
+size_t copy_from_user_iter_csum(void __user *iter_from, size_t progress,
+ size_t len, void *to, void *priv2)
+{
+ __wsum next, *csum = priv2;
+
+ next = csum_and_copy_from_user(iter_from, to + progress, len);
+ *csum = csum_block_add(*csum, next, progress);
+ return next ? 0 : len;
+}
+/* Copy exactly @bytes from iterator @i into @addr, accumulating the
+ * checksum of the copied data in *@csum.
+ * Returns true on a complete copy. On a short copy the iterator is
+ * reverted by the amount consumed and false is returned; false (with a
+ * one-time warning) is also returned when @i is not a data source.
+ */
+bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
+ __wsum *csum, struct iov_iter *i)
+{
+ size_t copied;
+
+ if (WARN_ON_ONCE(!i->data_source))
+ return false;
+ copied = iterate_and_advance2(i, bytes, addr, csum,
+ copy_from_user_iter_csum,
+ memcpy_from_iter_csum);
+ if (likely(copied == bytes))
+ return true;
+ iov_iter_revert(i, copied);
+ return false;
+}
+EXPORT_SYMBOL(csum_and_copy_from_iter_full);
+/* Take a reference on @netmem.
+ * A net_iov that is a devmem iov goes through net_devmem_get_net_iov();
+ * any other net_iov is left untouched; a page-backed netmem gets
+ * get_page().
+ */
+void get_netmem(netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (netmem_is_net_iov(netmem)) {
+ niov = netmem_to_net_iov(netmem);
+ if (net_is_devmem_iov(niov))
+ net_devmem_get_net_iov(netmem_to_net_iov(netmem));
+ return;
+ }
+ get_page(netmem_to_page(netmem));
+}
+EXPORT_SYMBOL(get_netmem);
+/* Drop a reference on @netmem; mirror image of get_netmem().
+ * A net_iov that is a devmem iov goes through net_devmem_put_net_iov();
+ * any other net_iov is left untouched; a page-backed netmem gets
+ * put_page().
+ */
+void put_netmem(netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (netmem_is_net_iov(netmem)) {
+ niov = netmem_to_net_iov(netmem);
+ if (net_is_devmem_iov(niov))
+ net_devmem_put_net_iov(netmem_to_net_iov(netmem));
+ return;
+ }
+
+ put_page(netmem_to_page(netmem));
+}
+EXPORT_SYMBOL(put_netmem);
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
new file mode 100644
index 000000000000..2ac7731e1e0a
--- /dev/null
+++ b/net/core/skmsg.c
@@ -0,0 +1,1289 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/skmsg.h>
+#include <linux/skbuff.h>
+#include <linux/scatterlist.h>
+
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/tls.h>
+#include <trace/events/sock.h>
+/* Tell whether index @elem_first_coalesce permits coalescing in @msg.
+ * With sg.end > sg.start the index must be below sg.end. With
+ * sg.end < sg.start (presumably the wrapped-ring case) it must be above
+ * sg.start or below sg.end. Returns false when sg.end == sg.start.
+ */
+static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce)
+{
+ if (msg->sg.end > msg->sg.start &&
+ elem_first_coalesce < msg->sg.end)
+ return true;
+
+ if (msg->sg.end < msg->sg.start &&
+ (elem_first_coalesce > msg->sg.start ||
+ elem_first_coalesce < msg->sg.end))
+ return true;
+
+ return false;
+}
+/* Grow @msg until it holds @len bytes in total, taking memory from the
+ * socket's page frag.
+ * Each pass uses as much of the current page frag as is needed/available:
+ * the element before sg.end is extended when coalescing is permitted and
+ * the new bytes directly follow it in the same page; otherwise a new
+ * element is added with its own page reference. Every pass charges @sk.
+ * Returns 0 on success, -ENOMEM if the frag refill or write-memory
+ * scheduling fails (the msg is trimmed back to its original size), or
+ * -ENOSPC if the msg is full (what was already added is kept).
+ */
+int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
+ int elem_first_coalesce)
+{
+ struct page_frag *pfrag = sk_page_frag(sk);
+ u32 osize = msg->sg.size;
+ int ret = 0;
+
+ len -= msg->sg.size; /* from here on: bytes still to add */
+ while (len > 0) {
+ struct scatterlist *sge;
+ u32 orig_offset;
+ int use, i;
+
+ if (!sk_page_frag_refill(sk, pfrag)) {
+ ret = -ENOMEM;
+ goto msg_trim;
+ }
+
+ orig_offset = pfrag->offset;
+ use = min_t(int, len, pfrag->size - orig_offset);
+ if (!sk_wmem_schedule(sk, use)) {
+ ret = -ENOMEM;
+ goto msg_trim;
+ }
+
+ i = msg->sg.end;
+ sk_msg_iter_var_prev(i);
+ sge = &msg->sg.data[i]; /* element just before sg.end */
+
+ if (sk_msg_try_coalesce_ok(msg, elem_first_coalesce) &&
+ sg_page(sge) == pfrag->page &&
+ sge->offset + sge->length == orig_offset) {
+ sge->length += use;
+ } else {
+ if (sk_msg_full(msg)) {
+ ret = -ENOSPC;
+ break;
+ }
+
+ sge = &msg->sg.data[msg->sg.end];
+ sg_unmark_end(sge);
+ sg_set_page(sge, pfrag->page, use, orig_offset);
+ get_page(pfrag->page);
+ sk_msg_iter_next(msg, end);
+ }
+
+ sk_mem_charge(sk, use);
+ msg->sg.size += use;
+ pfrag->offset += use;
+ len -= use;
+ }
+
+ return ret;
+
+msg_trim:
+ sk_msg_trim(sk, msg, osize);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_alloc);
+/* Append @len bytes of @src, starting @off bytes into its data, to @dst.
+ * The first loop skips whole source elements covered by @off. The second
+ * loop either extends the last element of @dst when the source bytes
+ * directly follow it in the same page, or adds a new element through
+ * sk_msg_page_add() (presumably taking the page reference and updating
+ * dst->sg.size -- confirm). Each added span is charged to @sk.
+ * Returns 0, or -ENOSPC when @src runs out of elements or @dst is full;
+ * spans added before the failure are not undone here.
+ */
+int sk_msg_clone(struct sock *sk, struct sk_msg *dst, struct sk_msg *src,
+ u32 off, u32 len)
+{
+ int i = src->sg.start;
+ struct scatterlist *sge = sk_msg_elem(src, i);
+ struct scatterlist *sgd = NULL;
+ u32 sge_len, sge_off;
+
+ while (off) {
+ if (sge->length > off)
+ break;
+ off -= sge->length;
+ sk_msg_iter_var_next(i);
+ if (i == src->sg.end && off)
+ return -ENOSPC;
+ sge = sk_msg_elem(src, i);
+ }
+
+ while (len) {
+ sge_len = sge->length - off;
+ if (sge_len > len)
+ sge_len = len;
+
+ if (dst->sg.end)
+ sgd = sk_msg_elem(dst, dst->sg.end - 1);
+
+ if (sgd &&
+ (sg_page(sge) == sg_page(sgd)) &&
+ (sg_virt(sge) + off == sg_virt(sgd) + sgd->length)) {
+ sgd->length += sge_len;
+ dst->sg.size += sge_len;
+ } else if (!sk_msg_full(dst)) {
+ sge_off = sge->offset + off;
+ sk_msg_page_add(dst, sg_page(sge), sge_len, sge_off);
+ } else {
+ return -ENOSPC;
+ }
+
+ off = 0; /* only the first copied element starts mid-way */
+ len -= sge_len;
+ sk_mem_charge(sk, sge_len);
+ sk_msg_iter_var_next(i);
+ if (i == src->sg.end && len)
+ return -ENOSPC;
+ sge = sk_msg_elem(src, i);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sk_msg_clone);
+/* Uncharge up to @bytes from @sk, walking @msg from sg.start.
+ * Fully covered elements get length and offset zeroed; a partially
+ * covered element is shrunk from the front and ends the walk.
+ * sg.start is moved to the index where the walk stopped. Page references
+ * are not touched here.
+ */
+void sk_msg_return_zero(struct sock *sk, struct sk_msg *msg, int bytes)
+{
+ int i = msg->sg.start;
+
+ do {
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+
+ if (bytes < sge->length) {
+ sge->length -= bytes;
+ sge->offset += bytes;
+ sk_mem_uncharge(sk, bytes);
+ break;
+ }
+
+ sk_mem_uncharge(sk, sge->length);
+ bytes -= sge->length;
+ sge->length = 0;
+ sge->offset = 0;
+ sk_msg_iter_var_next(i);
+ } while (bytes && i != msg->sg.end);
+ msg->sg.start = i;
+}
+EXPORT_SYMBOL_GPL(sk_msg_return_zero);
+/* Uncharge up to @bytes from @sk, element by element from sg.start to
+ * sg.end. Unlike sk_msg_return_zero() the elements and sg.start are left
+ * unmodified; the walk always visits every element up to sg.end.
+ */
+void sk_msg_return(struct sock *sk, struct sk_msg *msg, int bytes)
+{
+ int i = msg->sg.start;
+
+ do {
+ struct scatterlist *sge = &msg->sg.data[i];
+ int uncharge = (bytes < sge->length) ? bytes : sge->length;
+
+ sk_mem_uncharge(sk, uncharge);
+ bytes -= uncharge;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+}
+EXPORT_SYMBOL_GPL(sk_msg_return);
+/* Release element @i of @msg and zero it.
+ * When the msg has no skb, the element's page reference is dropped and,
+ * if @charge is set, its length is uncharged from @sk.
+ * Returns the length the element had.
+ */
+static int sk_msg_free_elem(struct sock *sk, struct sk_msg *msg, u32 i,
+ bool charge)
+{
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+ u32 len = sge->length;
+
+ /* When the skb owns the memory we free it from consume_skb path. */
+ if (!msg->skb) {
+ if (charge)
+ sk_mem_uncharge(sk, len);
+ put_page(sg_page(sge));
+ }
+ memset(sge, 0, sizeof(*sge));
+ return len;
+}
+/* Release every element of @msg starting at index @i until sg.size drops
+ * to zero, then consume msg->skb and re-initialise the msg.
+ * @charge is forwarded to sk_msg_free_elem().
+ * Returns the number of bytes released.
+ */
+static int __sk_msg_free(struct sock *sk, struct sk_msg *msg, u32 i,
+ bool charge)
+{
+ struct scatterlist *sge = sk_msg_elem(msg, i);
+ int freed = 0;
+
+ while (msg->sg.size) {
+ msg->sg.size -= sge->length;
+ freed += sk_msg_free_elem(sk, msg, i, charge);
+ sk_msg_iter_var_next(i);
+ sk_msg_check_to_free(msg, i, msg->sg.size);
+ sge = sk_msg_elem(msg, i);
+ }
+ consume_skb(msg->skb);
+ sk_msg_init(msg);
+ return freed;
+}
+/* Free all of @msg from sg.start without uncharging @sk; returns the bytes released. */
+int sk_msg_free_nocharge(struct sock *sk, struct sk_msg *msg)
+{
+ return __sk_msg_free(sk, msg, msg->sg.start, false);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free_nocharge);
+/* Free all of @msg from sg.start with charge == true, so memory not owned by an skb is uncharged from @sk; returns the bytes released. */
+int sk_msg_free(struct sock *sk, struct sk_msg *msg)
+{
+ return __sk_msg_free(sk, msg, msg->sg.start, true);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free);
+/* Release the first @bytes of @msg, starting at sg.start.
+ * Whole elements are released through sk_msg_free_elem(); a final
+ * partially covered element is shrunk from the front (and uncharged when
+ * @charge is set). The walk also stops at an element of length 0.
+ * sg.start is moved to the index where the walk stopped.
+ */
+static void __sk_msg_free_partial(struct sock *sk, struct sk_msg *msg,
+ u32 bytes, bool charge)
+{
+ struct scatterlist *sge;
+ u32 i = msg->sg.start;
+
+ while (bytes) {
+ sge = sk_msg_elem(msg, i);
+ if (!sge->length)
+ break;
+ if (bytes < sge->length) {
+ if (charge)
+ sk_mem_uncharge(sk, bytes);
+ sge->length -= bytes;
+ sge->offset += bytes;
+ msg->sg.size -= bytes;
+ break;
+ }
+
+ msg->sg.size -= sge->length;
+ bytes -= sge->length;
+ sk_msg_free_elem(sk, msg, i, charge);
+ sk_msg_iter_var_next(i);
+ sk_msg_check_to_free(msg, i, bytes);
+ }
+ msg->sg.start = i;
+}
+/* Release the first @bytes of @msg, uncharging @sk as data is released. */
+void sk_msg_free_partial(struct sock *sk, struct sk_msg *msg, u32 bytes)
+{
+ __sk_msg_free_partial(sk, msg, bytes, true);
+}
+EXPORT_SYMBOL_GPL(sk_msg_free_partial);
+/* Release the first @bytes of @msg without uncharging @sk. */
+void sk_msg_free_partial_nocharge(struct sock *sk, struct sk_msg *msg,
+ u32 bytes)
+{
+ __sk_msg_free_partial(sk, msg, bytes, false);
+}
+/* Shrink @msg to @len bytes by releasing data from its tail.
+ * Nothing is done when the msg already holds @len bytes; holding fewer
+ * than @len triggers a WARN_ON and is otherwise ignored.
+ * Whole tail elements are released (with uncharge) while they fit in the
+ * amount to trim, then the remaining element is shortened. sg.end,
+ * sg.curr and sg.copybreak are adjusted to the new tail.
+ */
+void sk_msg_trim(struct sock *sk, struct sk_msg *msg, int len)
+{
+ int trim = msg->sg.size - len;
+ u32 i = msg->sg.end;
+
+ if (trim <= 0) {
+ WARN_ON(trim < 0);
+ return;
+ }
+
+ sk_msg_iter_var_prev(i);
+ msg->sg.size = len;
+ while (msg->sg.data[i].length &&
+ trim >= msg->sg.data[i].length) {
+ trim -= msg->sg.data[i].length;
+ sk_msg_free_elem(sk, msg, i, true);
+ sk_msg_iter_var_prev(i);
+ if (!trim)
+ goto out;
+ }
+
+ msg->sg.data[i].length -= trim;
+ sk_mem_uncharge(sk, trim);
+ /* Adjust copybreak if it falls into the trimmed part of last buf */
+ if (msg->sg.curr == i && msg->sg.copybreak > msg->sg.data[i].length)
+ msg->sg.copybreak = msg->sg.data[i].length;
+out:
+ sk_msg_iter_var_next(i);
+ msg->sg.end = i;
+
+ /* If we trimmed a full sg elem at or before the curr pointer, update
+ * copybreak and curr so that any future copy operations
+ * start at the new copy location.
+ * However, trimmed data that has not yet been used in a copy op
+ * does not require an update.
+ */
+ if (!msg->sg.size) {
+ msg->sg.curr = msg->sg.start;
+ msg->sg.copybreak = 0;
+ } else if (sk_msg_iter_dist(msg->sg.start, msg->sg.curr) >=
+ sk_msg_iter_dist(msg->sg.start, msg->sg.end)) {
+ sk_msg_iter_var_prev(i);
+ msg->sg.curr = i;
+ msg->sg.copybreak = msg->sg.data[i].length;
+ }
+}
+EXPORT_SYMBOL_GPL(sk_msg_trim);
+/* Add up to @bytes of data from @from to @msg by referencing the
+ * iterator's pages directly rather than copying.
+ * Pages are obtained in batches limited by the free element slots
+ * (MAX_MSG_FRAGS minus the elements in use); each page becomes one new
+ * element at sg.end and its bytes are charged to @sk. After each batch
+ * copybreak is cleared and sg.curr is set to sg.end.
+ * Returns 0 on success or -EFAULT when no slot is free or no pages could
+ * be obtained. On error the iterator is reverted by the bytes added so
+ * far, but @msg itself is not trimmed.
+ */
+int sk_msg_zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
+ struct sk_msg *msg, u32 bytes)
+{
+ int i, maxpages, ret = 0, num_elems = sk_msg_elem_used(msg);
+ const int to_max_pages = MAX_MSG_FRAGS;
+ struct page *pages[MAX_MSG_FRAGS];
+ ssize_t orig, copied, use, offset;
+
+ orig = msg->sg.size;
+ while (bytes > 0) {
+ i = 0;
+ maxpages = to_max_pages - num_elems;
+ if (maxpages == 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ copied = iov_iter_get_pages2(from, pages, bytes, maxpages,
+ &offset);
+ if (copied <= 0) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ bytes -= copied;
+ msg->sg.size += copied;
+
+ while (copied) {
+ use = min_t(int, copied, PAGE_SIZE - offset);
+ sg_set_page(&msg->sg.data[msg->sg.end],
+ pages[i], use, offset);
+ sg_unmark_end(&msg->sg.data[msg->sg.end]);
+ sk_mem_charge(sk, use);
+
+ offset = 0; /* only the first page of a batch starts mid-page */
+ copied -= use;
+ sk_msg_iter_next(msg, end);
+ num_elems++;
+ i++;
+ }
+ /* When zerocopy is mixed with sk_msg_*copy* operations we
+ * may have a copybreak set; in this case clear it and prefer
+ * the zerocopy remainder when possible.
+ */
+ msg->sg.copybreak = 0;
+ msg->sg.curr = msg->sg.end;
+ }
+out:
+ /* Revert iov_iter updates, msg will need to use 'trim' later if it
+ * also needs to be cleared.
+ */
+ if (ret)
+ iov_iter_revert(from, msg->sg.size - orig);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_msg_zerocopy_from_iter);
+/* Copy up to @bytes from @from into the buffers @msg already owns,
+ * starting at element sg.curr, offset sg.copybreak.
+ * The non-caching copy variant is used when the socket's route caps have
+ * NETIF_F_NOCACHE_COPY. sg.curr (and sg.copybreak) are left at the
+ * position where copying stopped.
+ * Returns the number of bytes copied, -EFAULT on a short copy, or
+ * -ENOSPC when sg.end is reached before anything could be copied.
+ */
+int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from,
+ struct sk_msg *msg, u32 bytes)
+{
+ int ret = -ENOSPC, i = msg->sg.curr;
+ u32 copy, buf_size, copied = 0;
+ struct scatterlist *sge;
+ void *to;
+
+ do {
+ sge = sk_msg_elem(msg, i);
+ /* This is possible if a trim operation shrunk the buffer */
+ if (msg->sg.copybreak >= sge->length) {
+ msg->sg.copybreak = 0;
+ sk_msg_iter_var_next(i);
+ if (i == msg->sg.end)
+ break;
+ sge = sk_msg_elem(msg, i);
+ }
+
+ buf_size = sge->length - msg->sg.copybreak;
+ copy = (buf_size > bytes) ? bytes : buf_size;
+ to = sg_virt(sge) + msg->sg.copybreak;
+ msg->sg.copybreak += copy;
+ if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
+ ret = copy_from_iter_nocache(to, copy, from);
+ else
+ ret = copy_from_iter(to, copy, from);
+ if (ret != copy) {
+ ret = -EFAULT;
+ goto out;
+ }
+ bytes -= copy;
+ copied += copy;
+ if (!bytes)
+ break;
+ msg->sg.copybreak = 0;
+ sk_msg_iter_var_next(i);
+ } while (i != msg->sg.end);
+out:
+ msg->sg.curr = i;
+ return (ret < 0) ? ret : copied;
+}
+EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter);
+
+/* Receive sk_msg from psock->ingress_msg to @msg.
+ * Copies up to @len bytes from the queued messages into msg->msg_iter.
+ * Without MSG_PEEK, consumed bytes are removed from their element; for
+ * messages without an skb they are also uncharged from @sk and
+ * sk_rmem_alloc, and the page reference is dropped once an element is
+ * empty. A fully consumed message is dequeued and freed.
+ * With MSG_PEEK the queue is walked without modifying it.
+ * Returns the number of bytes copied, or -EFAULT if the very first copy
+ * attempt copied nothing.
+ */
+int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg,
+ int len, int flags)
+{
+ struct iov_iter *iter = &msg->msg_iter;
+ int peek = flags & MSG_PEEK;
+ struct sk_msg *msg_rx;
+ int i, copied = 0;
+
+ msg_rx = sk_psock_peek_msg(psock);
+ while (copied != len) {
+ struct scatterlist *sge;
+
+ if (unlikely(!msg_rx))
+ break;
+
+ i = msg_rx->sg.start;
+ do {
+ struct page *page;
+ int copy;
+
+ sge = sk_msg_elem(msg_rx, i);
+ copy = sge->length;
+ page = sg_page(sge);
+ if (copied + copy > len)
+ copy = len - copied;
+ if (copy)
+ copy = copy_page_to_iter(page, sge->offset, copy, iter);
+ if (!copy) {
+ copied = copied ? copied : -EFAULT;
+ goto out;
+ }
+
+ copied += copy;
+ if (likely(!peek)) {
+ sge->offset += copy;
+ sge->length -= copy;
+ if (!msg_rx->skb) {
+ sk_mem_uncharge(sk, copy);
+ atomic_sub(copy, &sk->sk_rmem_alloc);
+ }
+ msg_rx->sg.size -= copy;
+
+ if (!sge->length) {
+ sk_msg_iter_var_next(i);
+ if (!msg_rx->skb)
+ put_page(page);
+ }
+ } else {
+ /* Don't optimize the peek case: if copy_page_to_iter
+ * didn't copy the entire length, just stop.
+ */
+ if (copy != sge->length)
+ goto out;
+ sk_msg_iter_var_next(i);
+ }
+
+ if (copied == len)
+ break;
+ } while ((i != msg_rx->sg.end) && !sg_is_last(sge));
+
+ if (unlikely(peek)) {
+ msg_rx = sk_psock_next_msg(psock, msg_rx);
+ if (!msg_rx)
+ break;
+ continue;
+ }
+
+ msg_rx->sg.start = i;
+ if (!sge->length && (i == msg_rx->sg.end || sg_is_last(sge))) {
+ msg_rx = sk_psock_dequeue_msg(psock);
+ kfree_sk_msg(msg_rx);
+ }
+ msg_rx = sk_psock_peek_msg(psock);
+ }
+out:
+ return copied;
+}
+EXPORT_SYMBOL_GPL(sk_msg_recvmsg);
+/* Return true when @sk has a psock whose ingress_msg list is not empty.
+ * The psock is looked up and inspected under rcu_read_lock(); a socket
+ * without a psock is reported as not readable.
+ */
+bool sk_msg_is_readable(struct sock *sk)
+{
+ struct sk_psock *psock;
+ bool empty = true;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock))
+ empty = list_empty(&psock->ingress_msg);
+ rcu_read_unlock();
+ return !empty;
+}
+EXPORT_SYMBOL_GPL(sk_msg_is_readable);
+/* Allocate a zeroed sk_msg with @gfp | __GFP_NOWARN and set the
+ * scatterlist end marker for NR_MSG_FRAG_IDS entries.
+ * Returns NULL on allocation failure.
+ */
+static struct sk_msg *alloc_sk_msg(gfp_t gfp)
+{
+ struct sk_msg *msg;
+
+ msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN);
+ if (unlikely(!msg))
+ return NULL;
+ sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS);
+ return msg;
+}
+/* Allocate an sk_msg (GFP_KERNEL) for queueing @skb on @sk's ingress path.
+ * Returns NULL when sk_rmem_alloc already exceeds sk_rcvbuf, when receive
+ * memory for skb->truesize cannot be scheduled, or when the allocation
+ * fails.
+ */
+static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+ return NULL;
+
+ if (!sk_rmem_schedule(sk, skb, skb->truesize))
+ return NULL;
+
+ return alloc_sk_msg(GFP_KERNEL);
+}
+/* Map @len bytes of @skb starting at @off into @msg's scatterlist, queue
+ * @msg on @psock and signal data-ready on @sk.
+ * @msg keeps a pointer to @skb; an extra skb reference is taken first when
+ * @take_ref is set.
+ * Returns @len on success, -EAGAIN if the skb had to be linearized and
+ * that failed, or the negative skb_to_sgvec() error. @msg is not freed on
+ * failure.
+ */
+static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
+ u32 off, u32 len,
+ struct sk_psock *psock,
+ struct sock *sk,
+ struct sk_msg *msg,
+ bool take_ref)
+{
+ int num_sge, copied;
+
+ /* skb_to_sgvec will fail when the total number of fragments in
+ * frag_list and frags exceeds MAX_MSG_FRAGS. For example, the
+ * caller may aggregate multiple skbs.
+ */
+ num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
+ if (num_sge < 0) {
+ /* skb linearize may fail with ENOMEM, but let's simply try again
+ * later if this happens. Under memory pressure we don't want to
+ * drop the skb. We need to linearize the skb so that the mapping
+ * in skb_to_sgvec can not error.
+ * Note that skb_linearize requires the skb not to be shared.
+ */
+ if (skb_linearize(skb))
+ return -EAGAIN;
+
+ num_sge = skb_to_sgvec(skb, msg->sg.data, off, len);
+ if (unlikely(num_sge < 0))
+ return num_sge;
+ }
+
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+ psock->ingress_bytes += len;
+#endif
+ copied = len;
+ msg->sg.start = 0;
+ msg->sg.size = copied;
+ msg->sg.end = num_sge;
+ msg->skb = take_ref ? skb_get(skb) : skb;
+
+ sk_psock_queue_msg(psock, msg);
+ sk_psock_data_ready(sk, psock);
+ return copied;
+}
+
+static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
+ u32 off, u32 len, bool take_ref);
+/* Queue @len bytes of @skb at @off on the ingress path of @psock's socket.
+ * An skb already owned by that socket is handed to
+ * sk_psock_skb_ingress_self(). Otherwise an sk_msg is created subject to
+ * the receive-buffer checks, the skb's owner is switched to the socket
+ * and the msg is enqueued with an extra skb reference.
+ * Returns the enqueue result, or -EAGAIN when no sk_msg could be created.
+ */
+static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb,
+ u32 off, u32 len)
+{
+ struct sock *sk = psock->sk;
+ struct sk_msg *msg;
+ int err;
+
+ /* If we are receiving on the same sock skb->sk is already assigned,
+ * skip memory accounting and owner transition seeing it already set
+ * correctly.
+ */
+ if (unlikely(skb->sk == sk))
+ return sk_psock_skb_ingress_self(psock, skb, off, len, true);
+ msg = sk_psock_create_ingress_msg(sk, skb);
+ if (!msg)
+ return -EAGAIN;
+
+ /* This will transition ownership of the data from the socket where
+ * the BPF program was run initiating the redirect to the socket
+ * we will eventually receive this data on. The data will be released
+ * from skb_consume found in __tcp_bpf_recvmsg() after it has been
+ * copied into user buffers.
+ */
+ skb_set_owner_r(skb, sk);
+ err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, true);
+ if (err < 0)
+ kfree(msg);
+ return err;
+}
+
+/* Puts an skb on the ingress queue of the socket already assigned to the
+ * skb. In this case we do not need to check memory limits or skb_set_owner_r
+ * because the skb is already accounted for here.
+ * NOTE(review): the body still calls skb_set_owner_r() -- confirm whether
+ * the sentence above is stale.
+ * The sk_msg is allocated with GFP_ATOMIC. Returns the enqueue result, or
+ * -EAGAIN if that allocation fails; the msg is freed when enqueueing fails.
+ */
+static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb,
+ u32 off, u32 len, bool take_ref)
+{
+ struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC);
+ struct sock *sk = psock->sk;
+ int err;
+
+ if (unlikely(!msg))
+ return -EAGAIN;
+ skb_set_owner_r(skb, sk);
+ err = sk_psock_skb_ingress_enqueue(skb, off, len, psock, sk, msg, take_ref);
+ if (err < 0)
+ kfree(msg);
+ return err;
+}
+/* Deliver @len bytes of @skb at @off through @psock's socket.
+ * With @ingress the data is queued on the socket's ingress path;
+ * otherwise it is sent with skb_send_sock(), or -EAGAIN is returned when
+ * the socket is not currently writeable.
+ */
+static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
+ u32 off, u32 len, bool ingress)
+{
+ if (!ingress) {
+ if (!sock_writeable(psock->sk))
+ return -EAGAIN;
+ return skb_send_sock(psock->sk, skb, off, len);
+ }
+
+ return sk_psock_skb_ingress(psock, skb, off, len);
+}
+/* Record @len / @off in @state under psock->ingress_lock, but only while
+ * the psock still has SK_PSOCK_TX_ENABLED set.
+ */
+static void sk_psock_skb_state(struct sk_psock *psock,
+ struct sk_psock_work_state *state,
+ int len, int off)
+{
+ spin_lock_bh(&psock->ingress_lock);
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+ state->len = len;
+ state->off = off;
+ }
+ spin_unlock_bh(&psock->ingress_lock);
+}
+/* Delayed-work handler that drains psock->ingress_skb.
+ * Each skb is passed to sk_psock_handle_skb() until all of its bytes are
+ * handled, then dequeued and freed. On -EAGAIN the progress is saved in
+ * psock->work_state, the skb's redirect info is restored and the work is
+ * rescheduled; any other error is reported on the psock and TX is
+ * disabled. Runs with psock->work_mutex held and a psock reference taken.
+ */
+static void sk_psock_backlog(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct sk_psock *psock = container_of(dwork, struct sk_psock, work);
+ struct sk_psock_work_state *state = &psock->work_state;
+ struct sk_buff *skb = NULL;
+ u32 len = 0, off = 0;
+ bool ingress;
+ int ret;
+
+ /* If sk is quickly removed from the map and then added back, the old
+ * psock should not be scheduled, because there are now two psocks
+ * pointing to the same sk.
+ */
+ if (!sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ return;
+
+ /* Increment the psock refcnt to synchronize with close(fd) path in
+ * sock_map_close(), ensuring we wait for backlog thread completion
+ * before sk_socket freed. If refcnt increment fails, it indicates
+ * sock_map_close() completed with sk_socket potentially already freed.
+ */
+ if (!sk_psock_get(psock->sk))
+ return;
+ mutex_lock(&psock->work_mutex);
+ while ((skb = skb_peek(&psock->ingress_skb))) {
+ len = skb->len;
+ off = 0;
+ if (skb_bpf_strparser(skb)) {
+ struct strp_msg *stm = strp_msg(skb);
+
+ off = stm->offset;
+ len = stm->full_len;
+ }
+
+ /* Resume processing from previous partial state */
+ if (unlikely(state->len)) {
+ len = state->len;
+ off = state->off;
+ }
+
+ ingress = skb_bpf_ingress(skb);
+ skb_bpf_redirect_clear(skb);
+ do {
+ ret = -EIO;
+ if (!sock_flag(psock->sk, SOCK_DEAD))
+ ret = sk_psock_handle_skb(psock, skb, off,
+ len, ingress);
+ if (ret <= 0) {
+ if (ret == -EAGAIN) {
+ sk_psock_skb_state(psock, state, len, off);
+ /* Restore redir info we cleared before */
+ skb_bpf_set_redir(skb, psock->sk, ingress);
+ /* Delay slightly to prioritize any
+ * other work that might be here.
+ */
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ schedule_delayed_work(&psock->work, 1);
+ goto end;
+ }
+ /* Hard errors break pipe and stop xmit. */
+ sk_psock_report_error(psock, ret ? -ret : EPIPE);
+ sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+ goto end;
+ }
+ off += ret;
+ len -= ret;
+ } while (len);
+
+ /* The entire skb sent, clear state */
+ sk_psock_skb_state(psock, state, 0, 0);
+ skb = skb_dequeue(&psock->ingress_skb);
+ kfree_skb(skb);
+ }
+end:
+ mutex_unlock(&psock->work_mutex);
+ sk_psock_put(psock->sk, psock);
+}
+/* Allocate a psock on NUMA node @node and attach it to @sk, all under
+ * sk->sk_callback_lock.
+ * The socket's current proto and its unhash/destroy/close/write_space
+ * callbacks are saved in the psock, its lists, locks, work item and skb
+ * queue are initialised, TX is enabled and the refcount is set to 1. The
+ * psock is published as sk_user_data and a reference on @sk is taken.
+ * Returns the psock, or ERR_PTR(-EINVAL) for an inet socket with a ULP,
+ * ERR_PTR(-EBUSY) when sk_user_data is already in use, or
+ * ERR_PTR(-ENOMEM) on allocation failure.
+ */
+struct sk_psock *sk_psock_init(struct sock *sk, int node)
+{
+ struct sk_psock *psock;
+ struct proto *prot;
+
+ write_lock_bh(&sk->sk_callback_lock);
+
+ if (sk_is_inet(sk) && inet_csk_has_ulp(sk)) {
+ psock = ERR_PTR(-EINVAL);
+ goto out;
+ }
+
+ if (sk->sk_user_data) {
+ psock = ERR_PTR(-EBUSY);
+ goto out;
+ }
+
+ psock = kzalloc_node(sizeof(*psock), GFP_ATOMIC | __GFP_NOWARN, node);
+ if (!psock) {
+ psock = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ prot = READ_ONCE(sk->sk_prot);
+ psock->sk = sk;
+ psock->eval = __SK_NONE;
+ psock->sk_proto = prot;
+ psock->saved_unhash = prot->unhash;
+ psock->saved_destroy = prot->destroy;
+ psock->saved_close = prot->close;
+ psock->saved_write_space = sk->sk_write_space;
+
+ INIT_LIST_HEAD(&psock->link);
+ spin_lock_init(&psock->link_lock);
+
+ INIT_DELAYED_WORK(&psock->work, sk_psock_backlog);
+ mutex_init(&psock->work_mutex);
+ INIT_LIST_HEAD(&psock->ingress_msg);
+ spin_lock_init(&psock->ingress_lock);
+ skb_queue_head_init(&psock->ingress_skb);
+
+ sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
+ refcount_set(&psock->refcnt, 1);
+
+ __rcu_assign_sk_user_data_with_flags(sk, psock,
+ SK_USER_DATA_NOCOPY |
+ SK_USER_DATA_PSOCK);
+ sock_hold(sk);
+
+out:
+ write_unlock_bh(&sk->sk_callback_lock);
+ return psock;
+}
+EXPORT_SYMBOL_GPL(sk_psock_init);
+/* Unlink and return the first entry of psock->link under link_lock, or
+ * NULL when the list is empty. The entry is not freed here.
+ */
+struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock)
+{
+ struct sk_psock_link *link;
+
+ spin_lock_bh(&psock->link_lock);
+ link = list_first_entry_or_null(&psock->link, struct sk_psock_link,
+ list);
+ if (link)
+ list_del(&link->list);
+ spin_unlock_bh(&psock->link_lock);
+ return link;
+}
+/* Unlink and free every sk_msg on psock->ingress_msg.
+ * For messages without an skb, the msg size is first subtracted from the
+ * socket's sk_rmem_alloc. No lock is taken here.
+ */
+static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
+{
+ struct sk_msg *msg, *tmp;
+
+ list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
+ list_del(&msg->list);
+ if (!msg->skb)
+ atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
+ sk_msg_free(psock->sk, msg);
+ kfree(msg);
+ }
+}
+/* Drop everything queued for ingress: each skb on psock->ingress_skb has
+ * its redirect info cleared and is dropped, then all ingress sk_msgs are
+ * purged.
+ */
+static void __sk_psock_zap_ingress(struct sk_psock *psock)
+{
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) {
+ skb_bpf_redirect_clear(skb);
+ sock_drop(psock->sk, skb);
+ }
+ __sk_psock_purge_ingress_msg(psock);
+}
+/* Unlink and free every entry on psock->link; link_lock is not taken here. */
+static void sk_psock_link_destroy(struct sk_psock *psock)
+{
+ struct sk_psock_link *link, *tmp;
+
+ list_for_each_entry_safe(link, tmp, &psock->link, list) {
+ list_del(&link->list);
+ sk_psock_free_link(link);
+ }
+}
+/* Under ingress_lock, clear SK_PSOCK_TX_ENABLED and free the psock's cork. */
+void sk_psock_stop(struct sk_psock *psock)
+{
+ spin_lock_bh(&psock->ingress_lock);
+ sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+ sk_psock_cork_free(psock);
+ spin_unlock_bh(&psock->ingress_lock);
+}
+
+static void sk_psock_done_strp(struct sk_psock *psock);
+
+static void sk_psock_destroy(struct work_struct *work) /* RCU-work handler, queued by sk_psock_drop(), that tears down and frees a psock. */
+{
+ struct sk_psock *psock = container_of(to_rcu_work(work),
+ struct sk_psock, rwork);
+ /* No sk_callback_lock since already detached. */
+
+ sk_psock_done_strp(psock);
+
+ cancel_delayed_work_sync(&psock->work); /* stop the delayed work before its queues are purged */
+ __sk_psock_zap_ingress(psock);
+ mutex_destroy(&psock->work_mutex);
+
+ psock_progs_drop(&psock->progs);
+
+ sk_psock_link_destroy(psock);
+ sk_psock_cork_free(psock);
+
+ if (psock->sk_redir) /* presumably pairs with sock_hold() in sk_psock_msg_verdict(), confirm */
+ sock_put(psock->sk_redir);
+ if (psock->sk_pair)
+ sock_put(psock->sk_pair);
+ sock_put(psock->sk); /* presumably pairs with sock_hold(sk) in sk_psock_init(), confirm */
+ kfree(psock);
+}
+
+void sk_psock_drop(struct sock *sk, struct sk_psock *psock) /* Detach psock from sk, stop it, and queue sk_psock_destroy() as RCU work. */
+{
+ write_lock_bh(&sk->sk_callback_lock);
+ sk_psock_restore_proto(sk, psock);
+ rcu_assign_sk_user_data(sk, NULL); /* sk no longer points at the psock */
+ if (psock->progs.stream_parser) /* parser attached: undo the strparser hooks */
+ sk_psock_stop_strp(sk, psock);
+ else if (psock->progs.stream_verdict || psock->progs.skb_verdict) /* verdict-only: undo the verdict hooks */
+ sk_psock_stop_verdict(sk, psock);
+ write_unlock_bh(&sk->sk_callback_lock);
+
+ sk_psock_stop(psock); /* clears SK_PSOCK_TX_ENABLED, which the enqueue paths test before queueing */
+
+ INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
+ queue_rcu_work(system_percpu_wq, &psock->rwork); /* the actual free happens later in sk_psock_destroy() */
+}
+EXPORT_SYMBOL_GPL(sk_psock_drop);
+
+static int sk_psock_map_verd(int verdict, bool redir) /* Map a BPF program return code to an internal __SK_* verdict. */
+{
+ switch (verdict) {
+ case SK_PASS:
+ return redir ? __SK_REDIRECT : __SK_PASS; /* SK_PASS with redir set becomes a redirect */
+ case SK_DROP:
+ default:
+ break;
+ }
+
+ return __SK_DROP; /* SK_DROP and every other code map to drop */
+}
+
+int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
+ struct sk_msg *msg) /* Run the msg_parser program on msg and return the resulting __SK_* verdict. */
+{
+ struct bpf_prog *prog;
+ int ret;
+
+ rcu_read_lock();
+ prog = READ_ONCE(psock->progs.msg_parser);
+ if (unlikely(!prog)) { /* no program attached: verdict is pass */
+ ret = __SK_PASS;
+ goto out;
+ }
+
+ sk_msg_compute_data_pointers(msg);
+ msg->sk = sk;
+ ret = bpf_prog_run_pin_on_cpu(prog, msg);
+ ret = sk_psock_map_verd(ret, msg->sk_redir);
+ psock->apply_bytes = msg->apply_bytes; /* copied from msg after the program ran */
+ if (ret == __SK_REDIRECT) {
+ if (psock->sk_redir) { /* release the reference on the previous redirect target */
+ sock_put(psock->sk_redir);
+ psock->sk_redir = NULL;
+ }
+ if (!msg->sk_redir) { /* redirect verdict without a target: drop */
+ ret = __SK_DROP;
+ goto out;
+ }
+ psock->redir_ingress = sk_msg_to_ingress(msg);
+ psock->sk_redir = msg->sk_redir;
+ sock_hold(psock->sk_redir); /* psock holds its own reference on the new target */
+ }
+out:
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
+
+static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb) /* Queue skb on the redirect target's ingress_skb list; returns 0, or -EIO after dropping skb. */
+{
+ struct sk_psock *psock_other;
+ struct sock *sk_other;
+
+ sk_other = skb_bpf_redirect_fetch(skb);
+ /* This error is a buggy BPF program, it returned a redirect
+ * return code, but then didn't set a redirect interface.
+ */
+ if (unlikely(!sk_other)) {
+ skb_bpf_redirect_clear(skb);
+ sock_drop(from->sk, skb);
+ return -EIO;
+ }
+ psock_other = sk_psock(sk_other);
+ /* This error indicates the socket is being torn down or had another
+ * error that caused the pipe to break. We can't send a packet on
+ * a socket that is in this state so we drop the skb.
+ */
+ if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
+ skb_bpf_redirect_clear(skb);
+ sock_drop(from->sk, skb);
+ return -EIO;
+ }
+ spin_lock_bh(&psock_other->ingress_lock); /* TX_ENABLED is tested and the skb queued under the same lock */
+ if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { /* target has been stopped: drop instead of queueing */
+ spin_unlock_bh(&psock_other->ingress_lock);
+ skb_bpf_redirect_clear(skb);
+ sock_drop(from->sk, skb);
+ return -EIO;
+ }
+
+ skb_queue_tail(&psock_other->ingress_skb, skb);
+ schedule_delayed_work(&psock_other->work, 0); /* schedule the target's work with zero delay */
+ spin_unlock_bh(&psock_other->ingress_lock);
+ return 0;
+}
+
+static void sk_psock_tls_verdict_apply(struct sk_buff *skb,
+ struct sk_psock *from, int verdict) /* Apply a verdict on the TLS path: only __SK_REDIRECT does anything here. */
+{
+ switch (verdict) {
+ case __SK_REDIRECT:
+ sk_psock_skb_redirect(from, skb); /* return value is ignored; the skb is dropped inside on failure */
+ break;
+ case __SK_PASS:
+ case __SK_DROP:
+ default:
+ break; /* nothing is done with skb for pass/drop in this function */
+ }
+}
+
+int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) /* Run the stream_verdict program on skb, apply the verdict, and return it. */
+{
+ struct bpf_prog *prog;
+ int ret = __SK_PASS; /* verdict when no stream_verdict program is attached */
+
+ rcu_read_lock();
+ prog = READ_ONCE(psock->progs.stream_verdict);
+ if (likely(prog)) {
+ skb->sk = psock->sk; /* skb->sk is set only for the duration of the program run */
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
+ skb->sk = NULL;
+ }
+ sk_psock_tls_verdict_apply(skb, psock, ret);
+ rcu_read_unlock();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read);
+
+static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
+ int verdict) /* Act on a __SK_* verdict for skb; returns 0 or a negative error. */
+{
+ struct sock *sk_other;
+ int err = 0;
+ u32 len, off;
+
+ switch (verdict) {
+ case __SK_PASS:
+ err = -EIO; /* stays -EIO unless the direct ingress attempt below overwrites it */
+ sk_other = psock->sk;
+ if (sock_flag(sk_other, SOCK_DEAD) ||
+ !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ goto out_free;
+
+ skb_bpf_set_ingress(skb);
+
+ /* If the queue is empty then we can submit directly
+ * into the msg queue. If it's not empty we have to
+ * queue work, otherwise we may get OOO data. Any
+ * sk_psock_skb_ingress_self() error is handled by
+ * retrying later from the workqueue.
+ */
+ if (skb_queue_empty(&psock->ingress_skb)) {
+ len = skb->len;
+ off = 0;
+ if (skb_bpf_strparser(skb)) { /* strparser skb: take offset and length from its strp_msg */
+ struct strp_msg *stm = strp_msg(skb);
+
+ off = stm->offset;
+ len = stm->full_len;
+ }
+ err = sk_psock_skb_ingress_self(psock, skb, off, len, false);
+ }
+ if (err < 0) { /* direct path skipped (err still -EIO) or failed: queue for the worker */
+ spin_lock_bh(&psock->ingress_lock);
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+ skb_queue_tail(&psock->ingress_skb, skb);
+ schedule_delayed_work(&psock->work, 0);
+ err = 0;
+ }
+ spin_unlock_bh(&psock->ingress_lock);
+ if (err < 0) /* TX was disabled meanwhile: drop the skb */
+ goto out_free;
+ }
+ break;
+ case __SK_REDIRECT:
+ tcp_eat_skb(psock->sk, skb);
+ err = sk_psock_skb_redirect(psock, skb);
+ break;
+ case __SK_DROP:
+ default:
+out_free: /* also the error exit of the __SK_PASS case above */
+ skb_bpf_redirect_clear(skb);
+ tcp_eat_skb(psock->sk, skb);
+ sock_drop(psock->sk, skb);
+ }
+
+ return err;
+}
+
+static void sk_psock_write_space(struct sock *sk) /* sk_write_space hook: schedule the psock work, then call the saved write_space callback. */
+{
+ struct sk_psock *psock;
+ void (*write_space)(struct sock *sk) = NULL;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock)) {
+ if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+ schedule_delayed_work(&psock->work, 0);
+ write_space = psock->saved_write_space; /* copied out so it can be called after rcu_read_unlock() */
+ }
+ rcu_read_unlock();
+ if (write_space)
+ write_space(sk);
+}
+
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) /* strparser rcv_msg callback: run stream_verdict on skb and apply the verdict. */
+{
+ struct sk_psock *psock;
+ struct bpf_prog *prog;
+ int ret = __SK_DROP; /* verdict when no stream_verdict program is attached */
+ struct sock *sk;
+
+ rcu_read_lock();
+ sk = strp->sk;
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) { /* no psock on the socket: drop the skb */
+ sock_drop(sk, skb);
+ goto out;
+ }
+ prog = READ_ONCE(psock->progs.stream_verdict);
+ if (likely(prog)) {
+ skb->sk = sk; /* skb->sk is set only for the duration of the program run */
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ skb_bpf_set_strparser(skb); /* sk_psock_verdict_apply() tests this via skb_bpf_strparser() */
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
+ skb->sk = NULL;
+ }
+ sk_psock_verdict_apply(psock, skb, ret);
+out:
+ rcu_read_unlock();
+}
+
+static int sk_psock_strp_read_done(struct strparser *strp, int err) /* strparser read_sock_done callback: returns err unchanged. */
+{
+ return err;
+}
+
+static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) /* strparser parse_msg callback: returns the stream_parser program's result. */
+{
+ struct sk_psock *psock = container_of(strp, struct sk_psock, strp);
+ struct bpf_prog *prog;
+ int ret = skb->len; /* returned as-is when no stream_parser program is attached */
+
+ rcu_read_lock();
+ prog = READ_ONCE(psock->progs.stream_parser);
+ if (likely(prog)) {
+ skb->sk = psock->sk; /* skb->sk is set only for the duration of the program run */
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ skb->sk = NULL;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+/* Called with socket lock held. */
+static void sk_psock_strp_data_ready(struct sock *sk) /* sk_data_ready hook installed by sk_psock_start_strp(). */
+{
+ struct sk_psock *psock;
+
+ trace_sk_data_ready(sk);
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock)) {
+ if (tls_sw_has_ctx_rx(sk)) { /* TLS RX context present: call the saved data_ready callback instead of the strparser */
+ psock->saved_data_ready(sk);
+ } else {
+ read_lock_bh(&sk->sk_callback_lock); /* strparser is driven under the read side of sk_callback_lock */
+ strp_data_ready(&psock->strp);
+ read_unlock_bh(&sk->sk_callback_lock);
+ }
+ }
+ rcu_read_unlock();
+}
+
+int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) /* Initialise psock->strp with the psock callbacks; returns strp_init()'s result. */
+{
+ int ret;
+
+ static const struct strp_callbacks cb = {
+ .rcv_msg = sk_psock_strp_read,
+ .read_sock_done = sk_psock_strp_read_done,
+ .parse_msg = sk_psock_strp_parse,
+ };
+
+ ret = strp_init(&psock->strp, sk, &cb);
+ if (!ret) /* flag is set only on success; sk_psock_done_strp() tests it */
+ sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED);
+
+ if (sk_is_tcp(sk)) { /* NOTE(review): this runs even when strp_init() failed, confirm that is intended */
+ psock->strp.cb.read_sock = tcp_bpf_strp_read_sock;
+ psock->copied_seq = tcp_sk(sk)->copied_seq;
+ }
+ return ret;
+}
+
+void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) /* Install the strparser data_ready hook and the psock write_space hook on sk. */
+{
+ if (psock->saved_data_ready) /* a data_ready callback is already saved: do nothing */
+ return;
+
+ psock->saved_data_ready = sk->sk_data_ready; /* kept so sk_psock_stop_strp() can restore it */
+ sk->sk_data_ready = sk_psock_strp_data_ready;
+ sk->sk_write_space = sk_psock_write_space;
+}
+
+void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) /* Clear the stream_parser program and, if hooked, restore sk_data_ready and stop the strparser. */
+{
+ psock_set_prog(&psock->progs.stream_parser, NULL);
+
+ if (!psock->saved_data_ready) /* hooks were never installed (or already removed) */
+ return;
+
+ sk->sk_data_ready = psock->saved_data_ready;
+ psock->saved_data_ready = NULL; /* sk_write_space is not restored in this function */
+ strp_stop(&psock->strp);
+}
+
+static void sk_psock_done_strp(struct sk_psock *psock) /* Call strp_done() if the strparser was successfully initialised. */
+{
+ /* Parser has been stopped */
+ if (sk_psock_test_state(psock, SK_PSOCK_RX_STRP_ENABLED)) /* flag set by sk_psock_init_strp() on success */
+ strp_done(&psock->strp);
+}
+#else
+static void sk_psock_done_strp(struct sk_psock *psock) /* Empty stub used when CONFIG_BPF_STREAM_PARSER is not enabled. */
+{
+}
+#endif /* CONFIG_BPF_STREAM_PARSER */
+
+static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb) /* read_skb actor: run the verdict program on skb; returns skb->len, 0 without a psock, or a negative error. */
+{
+ struct sk_psock *psock;
+ struct bpf_prog *prog;
+ int ret = __SK_DROP; /* verdict when no verdict program is attached */
+ int len = skb->len; /* sampled up front, before skb is handed on */
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) { /* no psock on the socket: drop and report 0 */
+ len = 0;
+ tcp_eat_skb(sk, skb);
+ sock_drop(sk, skb);
+ goto out;
+ }
+ prog = READ_ONCE(psock->progs.stream_verdict);
+ if (!prog) /* fall back to skb_verdict when stream_verdict is unset */
+ prog = READ_ONCE(psock->progs.skb_verdict);
+ if (likely(prog)) {
+ skb_dst_drop(skb);
+ skb_bpf_redirect_clear(skb);
+ ret = bpf_prog_run_pin_on_cpu(prog, skb);
+ ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
+ }
+ ret = sk_psock_verdict_apply(psock, skb, ret);
+ if (ret < 0) /* propagate the error instead of the length */
+ len = ret;
+out:
+ rcu_read_unlock();
+ return len;
+}
+
+static void sk_psock_verdict_data_ready(struct sock *sk) /* sk_data_ready hook installed by sk_psock_start_verdict(). */
+{
+ struct socket *sock = sk->sk_socket;
+ const struct proto_ops *ops;
+ int copied;
+
+ trace_sk_data_ready(sk);
+
+ if (unlikely(!sock)) /* sk has no struct socket attached: nothing to read from */
+ return;
+ ops = READ_ONCE(sock->ops);
+ if (!ops || !ops->read_skb) /* the socket's proto_ops provide no read_skb */
+ return;
+ copied = ops->read_skb(sk, sk_psock_verdict_recv); /* sk_psock_verdict_recv() is passed as the callback */
+ if (copied >= 0) { /* no error: call sk_psock_data_ready() if the psock is still attached */
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (psock)
+ sk_psock_data_ready(sk, psock);
+ rcu_read_unlock();
+ }
+}
+
+void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) /* Install the verdict data_ready hook and the psock write_space hook on sk. */
+{
+ if (psock->saved_data_ready) /* a data_ready callback is already saved: do nothing */
+ return;
+
+ psock->saved_data_ready = sk->sk_data_ready; /* kept so sk_psock_stop_verdict() can restore it */
+ sk->sk_data_ready = sk_psock_verdict_data_ready;
+ sk->sk_write_space = sk_psock_write_space;
+}
+
+void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) /* Clear both verdict programs and, if hooked, restore sk_data_ready. */
+{
+ psock_set_prog(&psock->progs.stream_verdict, NULL);
+ psock_set_prog(&psock->progs.skb_verdict, NULL);
+
+ if (!psock->saved_data_ready) /* hooks were never installed (or already removed) */
+ return;
+
+ sk->sk_data_ready = psock->saved_data_ready;
+ psock->saved_data_ready = NULL; /* sk_write_space is not restored in this function */
+}
diff --git a/net/core/sock.c b/net/core/sock.c
index 3730eb855095..45c98bf524b2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -6,7 +7,6 @@
* Generic socket support routines. Memory allocators, socket lock/release
* handler for protocols to use and generic option handler.
*
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Florian La Roche, <flla@stud.uni-sb.de>
@@ -81,17 +81,11 @@
* Arnaldo C. Melo : cleanups, use skb_queue_purge
*
* To Fix:
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
@@ -113,21 +107,28 @@
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
+#include <linux/udp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
+#include <linux/compat.h>
+#include <linux/mroute.h>
+#include <linux/mroute6.h>
+#include <linux/icmpv6.h>
#include <linux/uaccess.h>
#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
+#include <linux/skbuff_ref.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
+#include <net/proto_memory.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
@@ -137,16 +138,25 @@
#include <linux/filter.h>
#include <net/sock_reuseport.h>
+#include <net/bpf_sk_storage.h>
#include <trace/events/sock.h>
#include <net/tcp.h>
#include <net/busy_poll.h>
+#include <net/phonet/phonet.h>
+
+#include <linux/ethtool.h>
+
+#include <uapi/linux/pidfd.h>
+
+#include "dev.h"
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
-static void sock_inuse_add(struct net *net, int val);
+static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc);
+static void sock_def_write_space(struct sock *sk);
/**
* sk_ns_capable - General socket capability test
@@ -228,6 +238,7 @@ static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
+ x "AF_MCTP" , \
x "AF_MAX"
static const char *const af_family_key_strings[AF_MAX+1] = {
@@ -270,18 +281,12 @@ static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
/* Run time adjustable parameters. */
-__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
+__u32 sysctl_wmem_max __read_mostly = 4 << 20;
EXPORT_SYMBOL(sysctl_wmem_max);
-__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
+__u32 sysctl_rmem_max __read_mostly = 4 << 20;
EXPORT_SYMBOL(sysctl_rmem_max);
-__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
-__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
-
-/* Maximal space eaten by iovec or ancillary data plus some space */
-int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
-EXPORT_SYMBOL(sysctl_optmem_max);
-
-int sysctl_tstamp_allow_data __read_mostly = 1;
+__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT;
+__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT;
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
@@ -328,28 +333,113 @@ int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
noreclaim_flag = memalloc_noreclaim_save();
- ret = sk->sk_backlog_rcv(sk, skb);
+ ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
+ tcp_v6_do_rcv,
+ tcp_v4_do_rcv,
+ sk, skb);
memalloc_noreclaim_restore(noreclaim_flag);
return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);
-static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
+void sk_error_report(struct sock *sk)
{
- struct timeval tv;
+ sk->sk_error_report(sk);
+
+ switch (sk->sk_family) {
+ case AF_INET:
+ fallthrough;
+ case AF_INET6:
+ trace_inet_sk_error_report(sk);
+ break;
+ default:
+ break;
+ }
+}
+EXPORT_SYMBOL(sk_error_report);
+
+int sock_get_timeout(long timeo, void *optval, bool old_timeval) /* Convert a timeout in jiffies to a timeval stored at optval; returns the number of bytes stored. */
+{
+ struct __kernel_sock_timeval tv;
+
+ if (timeo == MAX_SCHEDULE_TIMEOUT) { /* MAX_SCHEDULE_TIMEOUT is reported as a zero timeval */
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ } else {
+ tv.tv_sec = timeo / HZ;
+ tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; /* sub-second remainder of jiffies, in microseconds */
+ }
+
+ if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { /* compat caller with 32-bit time: old_timeval32 layout */
+ struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
+ *(struct old_timeval32 *)optval = tv32;
+ return sizeof(tv32);
+ }
+
+ if (old_timeval) { /* native old layout: __kernel_old_timeval */
+ struct __kernel_old_timeval old_tv;
+ old_tv.tv_sec = tv.tv_sec;
+ old_tv.tv_usec = tv.tv_usec;
+ *(struct __kernel_old_timeval *)optval = old_tv;
+ return sizeof(old_tv);
+ }
+
+ *(struct __kernel_sock_timeval *)optval = tv; /* new layout; optval is written directly, so it must be a kernel buffer large enough for the chosen struct */
+ return sizeof(tv);
+}
+EXPORT_SYMBOL(sock_get_timeout);
+
+int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
+ sockptr_t optval, int optlen, bool old_timeval) /* Copy a timeval from optval into *tv; returns 0, -EINVAL if optlen is too small, or -EFAULT if the copy fails. */
+{
+ if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { /* compat caller with 32-bit time: old_timeval32 layout */
+ struct old_timeval32 tv32;
+
+ if (optlen < sizeof(tv32))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
+ return -EFAULT;
+ tv->tv_sec = tv32.tv_sec;
+ tv->tv_usec = tv32.tv_usec;
+ } else if (old_timeval) { /* native old layout: __kernel_old_timeval */
+ struct __kernel_old_timeval old_tv;
+
+ if (optlen < sizeof(old_tv))
+ return -EINVAL;
+ if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
+ return -EFAULT;
+ tv->tv_sec = old_tv.tv_sec;
+ tv->tv_usec = old_tv.tv_usec;
+ } else { /* new layout matches *tv, so copy straight into it */
+ if (optlen < sizeof(*tv))
+ return -EINVAL;
+ if (copy_from_sockptr(tv, optval, sizeof(*tv)))
+ return -EFAULT;
+ }
+
+ return 0; /* field values are not range-checked here */
+}
+EXPORT_SYMBOL(sock_copy_user_timeval);
+
+static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
+ bool old_timeval)
+{
+ struct __kernel_sock_timeval tv;
+ int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
+ long val;
+
+ if (err)
+ return err;
- if (optlen < sizeof(tv))
- return -EINVAL;
- if (copy_from_user(&tv, optval, sizeof(tv)))
- return -EFAULT;
if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
return -EDOM;
if (tv.tv_sec < 0) {
static int warned __read_mostly;
- *timeo_p = 0;
+ WRITE_ONCE(*timeo_p, 0);
if (warned < 10 && net_ratelimit()) {
warned++;
pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
@@ -357,24 +447,20 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
}
return 0;
}
- *timeo_p = MAX_SCHEDULE_TIMEOUT;
- if (tv.tv_sec == 0 && tv.tv_usec == 0)
- return 0;
- if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
- *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
+ val = MAX_SCHEDULE_TIMEOUT;
+ if ((tv.tv_sec || tv.tv_usec) &&
+ (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
+ val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
+ USEC_PER_SEC / HZ);
+ WRITE_ONCE(*timeo_p, val);
return 0;
}
-static void sock_warn_obsolete_bsdism(const char *name)
+static bool sk_set_prio_allowed(const struct sock *sk, int val)
{
- static int warned;
- static char warncomm[TASK_COMM_LEN];
- if (strcmp(warncomm, current->comm) && warned < 5) {
- strcpy(warncomm, current->comm);
- pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
- warncomm, name);
- warned++;
- }
+ return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
}
static bool sock_needs_netstamp(const struct sock *sk)
@@ -404,14 +490,14 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
unsigned long flags;
struct sk_buff_head *list = &sk->sk_receive_queue;
- if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
- atomic_inc(&sk->sk_drops);
+ if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
+ sk_drops_inc(sk);
trace_sock_rcvqueue_full(sk, skb);
return -ENOMEM;
}
if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
return -ENOBUFS;
}
@@ -434,30 +520,50 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);
-int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason *reason)
{
+ enum skb_drop_reason drop_reason;
int err;
- err = sk_filter(sk, skb);
+ err = sk_filter_reason(sk, skb, &drop_reason);
if (err)
- return err;
+ goto out;
- return __sock_queue_rcv_skb(sk, skb);
+ err = __sock_queue_rcv_skb(sk, skb);
+ switch (err) {
+ case -ENOMEM:
+ drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
+ break;
+ case -ENOBUFS:
+ drop_reason = SKB_DROP_REASON_PROTO_MEM;
+ break;
+ default:
+ drop_reason = SKB_NOT_DROPPED_YET;
+ break;
+ }
+out:
+ if (reason)
+ *reason = drop_reason;
+ return err;
}
-EXPORT_SYMBOL(sock_queue_rcv_skb);
+EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
const int nested, unsigned int trim_cap, bool refcounted)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
int rc = NET_RX_SUCCESS;
+ int err;
- if (sk_filter_trim_cap(sk, skb, trim_cap))
+ if (sk_filter_trim_cap(sk, skb, trim_cap, &reason))
goto discard_and_relse;
skb->dev = NULL;
- if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
- atomic_inc(&sk->sk_drops);
+ if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
+ sk_drops_inc(sk);
+ reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
goto discard_and_relse;
}
if (nested)
@@ -472,10 +578,14 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
rc = sk_backlog_rcv(sk, skb);
- mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
- } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
+ mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
+ } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) {
bh_unlock_sock(sk);
- atomic_inc(&sk->sk_drops);
+ if (err == -ENOMEM)
+ reason = SKB_DROP_REASON_PFMEMALLOC;
+ if (err == -ENOBUFS)
+ reason = SKB_DROP_REASON_SOCKET_BACKLOG;
+ sk_drops_inc(sk);
goto discard_and_relse;
}
@@ -485,18 +595,24 @@ out:
sock_put(sk);
return rc;
discard_and_relse:
- kfree_skb(skb);
+ sk_skb_reason_drop(sk, skb, reason);
goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
+ u32));
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
+ u32));
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
struct dst_entry *dst = __sk_dst_get(sk);
- if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ if (dst && READ_ONCE(dst->obsolete) &&
+ INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
+ dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
- sk->sk_dst_pending_confirm = 0;
+ WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
dst_release(dst);
return NULL;
@@ -510,7 +626,9 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
struct dst_entry *dst = sk_dst_get(sk);
- if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+ if (dst && READ_ONCE(dst->obsolete) &&
+ INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
+ dst, cookie) == NULL) {
sk_dst_reset(sk);
dst_release(dst);
return NULL;
@@ -520,21 +638,59 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
}
EXPORT_SYMBOL(sk_dst_check);
-static int sock_setbindtodevice(struct sock *sk, char __user *optval,
- int optlen)
+static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
struct net *net = sock_net(sk);
- char devname[IFNAMSIZ];
- int index;
/* Sorry... */
ret = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_RAW))
+ if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
goto out;
ret = -EINVAL;
+ if (ifindex < 0)
+ goto out;
+
+ /* Paired with all READ_ONCE() done locklessly. */
+ WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
+
+ if (sk->sk_prot->rehash)
+ sk->sk_prot->rehash(sk);
+ sk_dst_reset(sk);
+
+ ret = 0;
+
+out:
+#endif
+
+ return ret;
+}
+
+int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) /* Call sock_bindtoindex_locked(sk, ifindex), wrapped in lock_sock()/release_sock() when lock_sk is true. */
+{
+ int ret;
+
+ if (lock_sk) /* with lock_sk false, no socket lock is taken in this function */
+ lock_sock(sk);
+ ret = sock_bindtoindex_locked(sk, ifindex);
+ if (lock_sk)
+ release_sock(sk);
+
+ return ret;
+}
+EXPORT_SYMBOL(sock_bindtoindex);
+
+static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
+{
+ int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+ struct net *net = sock_net(sk);
+ char devname[IFNAMSIZ];
+ int index;
+
+ ret = -EINVAL;
if (optlen < 0)
goto out;
@@ -548,7 +704,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval,
memset(devname, 0, sizeof(devname));
ret = -EFAULT;
- if (copy_from_user(devname, optval, optlen))
+ if (copy_from_sockptr(devname, optval, optlen))
goto out;
index = 0;
@@ -565,28 +721,25 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval,
goto out;
}
- lock_sock(sk);
- sk->sk_bound_dev_if = index;
- sk_dst_reset(sk);
- release_sock(sk);
-
- ret = 0;
-
+ sockopt_lock_sock(sk);
+ ret = sock_bindtoindex_locked(sk, index);
+ sockopt_release_sock(sk);
out:
#endif
return ret;
}
-static int sock_getbindtodevice(struct sock *sk, char __user *optval,
- int __user *optlen, int len)
+static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
{
int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
+ int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
struct net *net = sock_net(sk);
char devname[IFNAMSIZ];
- if (sk->sk_bound_dev_if == 0) {
+ if (bound_dev_if == 0) {
len = 0;
goto zero;
}
@@ -595,19 +748,19 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval,
if (len < IFNAMSIZ)
goto out;
- ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
+ ret = netdev_get_name(net, devname, bound_dev_if);
if (ret)
goto out;
len = strlen(devname) + 1;
ret = -EFAULT;
- if (copy_to_user(optval, devname, len))
+ if (copy_to_sockptr(optval, devname, len))
goto out;
zero:
ret = -EFAULT;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
goto out;
ret = 0;
@@ -618,43 +771,431 @@ out:
return ret;
}
-static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
-{
- if (valbool)
- sock_set_flag(sk, bit);
- else
- sock_reset_flag(sk, bit);
-}
-
-bool sk_mc_loop(struct sock *sk)
+bool sk_mc_loop(const struct sock *sk)
{
if (dev_recursion_level())
return false;
if (!sk)
return true;
- switch (sk->sk_family) {
+ /* IPV6_ADDRFORM can change sk->sk_family under us. */
+ switch (READ_ONCE(sk->sk_family)) {
case AF_INET:
- return inet_sk(sk)->mc_loop;
+ return inet_test_bit(MC_LOOP, sk);
#if IS_ENABLED(CONFIG_IPV6)
case AF_INET6:
- return inet6_sk(sk)->mc_loop;
+ return inet6_test_bit(MC6_LOOP, sk);
#endif
}
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return true;
}
EXPORT_SYMBOL(sk_mc_loop);
+void sock_set_reuseaddr(struct sock *sk) /* Set sk->sk_reuse to SK_CAN_REUSE under the socket lock. */
+{
+ lock_sock(sk);
+ sk->sk_reuse = SK_CAN_REUSE;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_reuseaddr);
+
+void sock_set_reuseport(struct sock *sk) /* Set sk->sk_reuseport to true under the socket lock. */
+{
+ lock_sock(sk);
+ sk->sk_reuseport = true;
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_reuseport);
+
+void sock_no_linger(struct sock *sk) /* Set SOCK_LINGER with a linger time of 0, under the socket lock. */
+{
+ lock_sock(sk);
+ WRITE_ONCE(sk->sk_lingertime, 0); /* linger time is zeroed before the flag is set */
+ sock_set_flag(sk, SOCK_LINGER);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_no_linger);
+
+void sock_set_priority(struct sock *sk, u32 priority) /* Store priority in sk->sk_priority; no socket lock is taken here. */
+{
+ WRITE_ONCE(sk->sk_priority, priority);
+}
+EXPORT_SYMBOL(sock_set_priority);
+
+void sock_set_sndtimeo(struct sock *sk, s64 secs) /* Set sk->sk_sndtimeo from a timeout given in seconds. */
+{
+ if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
+ WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); /* seconds converted to jiffies */
+ else
+ WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); /* used for secs == 0 and for values at or above the bound */
+}
+EXPORT_SYMBOL(sock_set_sndtimeo);
+
+static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) /* Update the receive-timestamp flags: val enables, ns selects the NS flag, new selects SOCK_TSTAMP_NEW. */
+{
+ sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
+ sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
+ if (val) { /* SOCK_TSTAMP_NEW is left untouched when disabling */
+ sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
+ }
+}
+
+void sock_set_timestamp(struct sock *sk, int optname, bool valbool) /* Translate an SO_TIMESTAMP* option name into the (new, ns) arguments of __sock_set_timestamps(). */
+{
+ switch (optname) { /* there is no default case: any other optname does nothing */
+ case SO_TIMESTAMP_OLD:
+ __sock_set_timestamps(sk, valbool, false, false);
+ break;
+ case SO_TIMESTAMP_NEW:
+ __sock_set_timestamps(sk, valbool, true, false);
+ break;
+ case SO_TIMESTAMPNS_OLD:
+ __sock_set_timestamps(sk, valbool, false, true);
+ break;
+ case SO_TIMESTAMPNS_NEW:
+ __sock_set_timestamps(sk, valbool, true, true);
+ break;
+ }
+}
+
+static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) /* Store phc_index in sk->sk_bind_phc if it is one of the bound device's PHC vclocks; returns 0, -EOPNOTSUPP or -EINVAL. */
+{
+ struct net *net = sock_net(sk);
+ struct net_device *dev = NULL;
+ bool match = false;
+ int *vclock_index;
+ int i, num;
+
+ if (sk->sk_bound_dev_if)
+ dev = dev_get_by_index(net, sk->sk_bound_dev_if);
+
+ if (!dev) { /* socket not bound to a device, or the lookup found none */
+ pr_err("%s: sock not bind to device\n", __func__);
+ return -EOPNOTSUPP;
+ }
+
+ num = ethtool_get_phc_vclocks(dev, &vclock_index);
+ dev_put(dev); /* dev is not used after this point */
+
+ for (i = 0; i < num; i++) { /* loop body never runs when num <= 0 */
+ if (*(vclock_index + i) == phc_index) {
+ match = true;
+ break;
+ }
+ }
+
+ if (num > 0) /* vclock_index is freed only when entries were returned */
+ kfree(vclock_index);
+
+ if (!match)
+ return -EINVAL;
+
+ WRITE_ONCE(sk->sk_bind_phc, phc_index);
+
+ return 0;
+}
+
+int sock_set_timestamping(struct sock *sk, int optname,
+ struct so_timestamping timestamping) /* Validate timestamping.flags and store them in sk->sk_tsflags; returns 0 or a negative error. */
+{
+ int val = timestamping.flags;
+ int ret;
+
+ if (val & ~SOF_TIMESTAMPING_MASK) /* reject bits outside the mask */
+ return -EINVAL;
+
+ if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
+ !(val & SOF_TIMESTAMPING_OPT_ID)) /* OPT_ID_TCP is only accepted together with OPT_ID */
+ return -EINVAL;
+
+ if (val & SOF_TIMESTAMPING_OPT_ID &&
+ !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { /* OPT_ID newly turned on: (re)initialise sk_tskey */
+ if (sk_is_tcp(sk)) {
+ if ((1 << sk->sk_state) &
+ (TCPF_CLOSE | TCPF_LISTEN)) /* not allowed on closed or listening TCP sockets */
+ return -EINVAL;
+ if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
+ atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
+ else
+ atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
+ } else {
+ atomic_set(&sk->sk_tskey, 0);
+ }
+ }
+
+ if (val & SOF_TIMESTAMPING_OPT_STATS &&
+ !(val & SOF_TIMESTAMPING_OPT_TSONLY)) /* OPT_STATS is only accepted together with OPT_TSONLY */
+ return -EINVAL;
+
+ if (val & SOF_TIMESTAMPING_BIND_PHC) {
+ ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
+ if (ret)
+ return ret;
+ }
+
+ WRITE_ONCE(sk->sk_tsflags, val); /* flags are stored only after every check above has passed */
+ sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
+ sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));
+
+ if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
+ sock_enable_timestamp(sk,
+ SOCK_TIMESTAMPING_RX_SOFTWARE);
+ else
+ sock_disable_timestamp(sk,
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); /* note: a bit mask here, a flag number in the enable call above */
+ return 0;
+}
+
+#if defined(CONFIG_CGROUP_BPF)
+void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) /* Build a bpf_sock_ops_kern for op and run the cgroup sock_ops programs on sk. */
+{
+ struct bpf_sock_ops_kern sock_ops;
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); /* zeroes only the fields that precede 'temp' */
+ sock_ops.op = op;
+ sock_ops.is_fullsock = 1;
+ sock_ops.sk = sk;
+ bpf_skops_init_skb(&sock_ops, skb, 0);
+ __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS); /* its return value is not used */
+}
+#endif
+
+void sock_set_keepalive(struct sock *sk) /* Enable keepalive on sk under the socket lock. */
+{
+ lock_sock(sk);
+ if (sk->sk_prot->keepalive) /* the protocol's keepalive hook is optional */
+ sk->sk_prot->keepalive(sk, true);
+ sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_keepalive);
+
+static void __sock_set_rcvbuf(struct sock *sk, int val) /* Set sk->sk_rcvbuf to twice val (at least SOCK_MIN_RCVBUF) and set SOCK_RCVBUF_LOCK. */
+{
+ /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
+ * as a negative value.
+ */
+ val = min_t(int, val, INT_MAX / 2);
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+
+ /* We double it on the way in to account for "struct sk_buff" etc.
+ * overhead. Applications assume that the SO_RCVBUF setting they make
+ * will allow that much actual data to be received on that socket.
+ *
+ * Applications are unaware that "struct sk_buff" and other overheads
+ * allocate from the receive buffer during socket buffer allocation.
+ *
+ * And after considering the possible alternatives, returning the value
+ * we actually used in getsockopt is the most desirable behavior.
+ */
+ WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); /* SOCK_MIN_RCVBUF is the floor for the stored value */
+}
+
+void sock_set_rcvbuf(struct sock *sk, int val) /* Locked wrapper: calls __sock_set_rcvbuf() between lock_sock() and release_sock(). */
+{
+ lock_sock(sk);
+ __sock_set_rcvbuf(sk, val);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_rcvbuf);
+
+static void __sock_set_mark(struct sock *sk, u32 val) /* Update sk->sk_mark and reset the cached dst, but only when the mark actually changes. */
+{
+ if (val != sk->sk_mark) {
+ WRITE_ONCE(sk->sk_mark, val);
+ sk_dst_reset(sk); /* the cached dst is reset whenever the mark changes */
+ }
+}
+
+void sock_set_mark(struct sock *sk, u32 val) /* Locked wrapper: calls __sock_set_mark() between lock_sock() and release_sock(). */
+{
+ lock_sock(sk);
+ __sock_set_mark(sk, val);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sock_set_mark);
+
+static void sock_release_reserved_memory(struct sock *sk, int bytes) /* Lower sk->sk_reserved_mem by bytes (rounded down to whole pages), then reclaim. */
+{
+ /* Round down bytes to multiple of pages */
+ bytes = round_down(bytes, PAGE_SIZE);
+
+ WARN_ON(bytes > sk->sk_reserved_mem); /* warns only; the subtraction below still runs */
+ WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
+ sk_mem_reclaim(sk);
+}
+/* Add @bytes (in whole pages) to sk_reserved_mem after charging for them; returns 0 or a negative errno. */
+static int sock_reserve_memory(struct sock *sk, int bytes)
+{
+ long allocated;
+ bool charged;
+ int pages;
+
+ if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk)) /* both conditions are required */
+ return -EOPNOTSUPP;
+
+ if (!bytes) /* nothing to reserve */
+ return 0;
+
+ pages = sk_mem_pages(bytes); /* all charging below is done in pages */
+
+ /* pre-charge to memcg */
+ charged = mem_cgroup_sk_charge(sk, pages,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ if (!charged)
+ return -ENOMEM;
+
+ if (sk->sk_bypass_prot_mem) /* skip the sk_memory_allocated charge and its limit check */
+ goto success;
+
+ /* pre-charge to forward_alloc */
+ sk_memory_allocated_add(sk, pages);
+ allocated = sk_memory_allocated(sk);
+
+ /* If the system goes into memory pressure with this
+ * precharge, give up and return error.
+ */
+ if (allocated > sk_prot_mem_limits(sk, 1)) {
+ sk_memory_allocated_sub(sk, pages); /* undo both charges made above */
+ mem_cgroup_sk_uncharge(sk, pages);
+ return -ENOMEM;
+ }
+
+success:
+ sk_forward_alloc_add(sk, pages << PAGE_SHIFT); /* page count converted back to bytes */
+
+ WRITE_ONCE(sk->sk_reserved_mem,
+ sk->sk_reserved_mem + (pages << PAGE_SHIFT));
+
+ return 0;
+}
+
+#ifdef CONFIG_PAGE_POOL
+
+/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
+ * in 1 syscall. The limit exists to limit the amount of memory the kernel
+ * allocates to copy these tokens, and to prevent looping over the frags for
+ * too long.
+ */
+#define MAX_DONTNEED_TOKENS 128
+#define MAX_DONTNEED_FRAGS 1024
+/* SO_DEVMEM_DONTNEED handler: erase the given token ranges from sk_user_frags and put those netmems. */
+static noinline_for_stack int
+sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
+{
+ unsigned int num_tokens, i, j, k, netmem_num = 0;
+ struct dmabuf_token *tokens;
+ int ret = 0, num_frags = 0; /* ret counts erased frags and is the success return value */
+ netmem_ref netmems[16]; /* batch of erased entries, put after the xarray lock is dropped */
+
+ if (!sk_is_tcp(sk))
+ return -EBADF;
+
+ if (optlen % sizeof(*tokens) || /* optlen must be a whole number of tokens */
+ optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
+ return -EINVAL;
+
+ num_tokens = optlen / sizeof(*tokens);
+ tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
+ if (!tokens)
+ return -ENOMEM;
+
+ if (copy_from_sockptr(tokens, optval, optlen)) {
+ kvfree(tokens);
+ return -EFAULT;
+ }
+
+ xa_lock_bh(&sk->sk_user_frags);
+ for (i = 0; i < num_tokens; i++) {
+ for (j = 0; j < tokens[i].token_count; j++) {
+ if (++num_frags > MAX_DONTNEED_FRAGS) /* counts every index tried, found or not */
+ goto frag_limit_reached;
+
+ netmem_ref netmem = (__force netmem_ref)__xa_erase(
+ &sk->sk_user_frags, tokens[i].token_start + j);
+
+ if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem))) /* nothing usable at this index: skip */
+ continue;
+
+ netmems[netmem_num++] = netmem;
+ if (netmem_num == ARRAY_SIZE(netmems)) { /* batch full: flush it with the lock dropped */
+ xa_unlock_bh(&sk->sk_user_frags);
+ for (k = 0; k < netmem_num; k++)
+ WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
+ netmem_num = 0;
+ xa_lock_bh(&sk->sk_user_frags);
+ }
+ ret++;
+ }
+ }
+
+frag_limit_reached:
+ xa_unlock_bh(&sk->sk_user_frags);
+ for (k = 0; k < netmem_num; k++) /* flush the final, partially filled batch */
+ WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
+
+ kvfree(tokens);
+ return ret;
+}
+#endif
+/* lock_sock() for sockopt paths; a no-op when has_current_bpf_ctx() is true. */
+void sockopt_lock_sock(struct sock *sk)
+{
+ /* When current->bpf_ctx is set, the setsockopt is called from
+ * a bpf prog. bpf has ensured the sk lock has been
+ * acquired before calling setsockopt().
+ */
+ if (has_current_bpf_ctx())
+ return;
+
+ lock_sock(sk);
+}
+EXPORT_SYMBOL(sockopt_lock_sock);
+/* Counterpart of sockopt_lock_sock(): release_sock() unless has_current_bpf_ctx() is true. */
+void sockopt_release_sock(struct sock *sk)
+{
+ if (has_current_bpf_ctx()) /* same test as sockopt_lock_sock(), which took no lock in this case */
+ return;
+
+ release_sock(sk);
+}
+EXPORT_SYMBOL(sockopt_release_sock);
+/* True when has_current_bpf_ctx() is true; otherwise the result of ns_capable(@ns, @cap). */
+bool sockopt_ns_capable(struct user_namespace *ns, int cap)
+{
+ return has_current_bpf_ctx() || ns_capable(ns, cap); /* ns_capable() is not evaluated in the bpf case */
+}
+EXPORT_SYMBOL(sockopt_ns_capable);
+/* True when has_current_bpf_ctx() is true; otherwise the result of capable(@cap). */
+bool sockopt_capable(int cap)
+{
+ return has_current_bpf_ctx() || capable(cap); /* capable() is not evaluated in the bpf case */
+}
+EXPORT_SYMBOL(sockopt_capable);
+/* Return 0 for CLOCK_REALTIME, CLOCK_MONOTONIC or CLOCK_TAI, and -EINVAL for any other @value. */
+static int sockopt_validate_clockid(__kernel_clockid_t value)
+{
+ switch (value) {
+ case CLOCK_REALTIME:
+ case CLOCK_MONOTONIC:
+ case CLOCK_TAI:
+ return 0;
+ } /* no default: any other value falls through to the error return */
+ return -EINVAL;
+}
+
/*
* This is meant for all protocols to use and covers goings on
* at the socket level. Everything here is generic.
*/
-int sock_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+int sk_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
+ struct so_timestamping timestamping;
+ struct socket *sock = sk->sk_socket;
struct sock_txtime sk_txtime;
- struct sock *sk = sock->sk;
int val;
int valbool;
struct linger ling;
@@ -670,16 +1211,107 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(val, (int __user *)optval))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
valbool = val ? 1 : 0;
- lock_sock(sk);
+ /* handle options which do not require locking the socket. */
+ switch (optname) {
+ case SO_PRIORITY:
+ if (sk_set_prio_allowed(sk, val)) {
+ sock_set_priority(sk, val);
+ return 0;
+ }
+ return -EPERM;
+ case SO_TYPE:
+ case SO_PROTOCOL:
+ case SO_DOMAIN:
+ case SO_ERROR:
+ return -ENOPROTOOPT;
+#ifdef CONFIG_NET_RX_BUSY_POLL
+ case SO_BUSY_POLL:
+ if (val < 0)
+ return -EINVAL;
+ WRITE_ONCE(sk->sk_ll_usec, val);
+ return 0;
+ case SO_PREFER_BUSY_POLL:
+ if (valbool && !sockopt_capable(CAP_NET_ADMIN))
+ return -EPERM;
+ WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
+ return 0;
+ case SO_BUSY_POLL_BUDGET:
+ if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
+ !sockopt_capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (val < 0 || val > U16_MAX)
+ return -EINVAL;
+ WRITE_ONCE(sk->sk_busy_poll_budget, val);
+ return 0;
+#endif
+ case SO_MAX_PACING_RATE:
+ {
+ unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
+ unsigned long pacing_rate;
+
+ if (sizeof(ulval) != sizeof(val) &&
+ optlen >= sizeof(ulval) &&
+ copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
+ return -EFAULT;
+ }
+ if (ulval != ~0UL)
+ cmpxchg(&sk->sk_pacing_status,
+ SK_PACING_NONE,
+ SK_PACING_NEEDED);
+ /* Pairs with READ_ONCE() from sk_getsockopt() */
+ WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
+ pacing_rate = READ_ONCE(sk->sk_pacing_rate);
+ if (ulval < pacing_rate)
+ WRITE_ONCE(sk->sk_pacing_rate, ulval);
+ return 0;
+ }
+ case SO_TXREHASH:
+ if (!sk_is_tcp(sk))
+ return -EOPNOTSUPP;
+ if (val < -1 || val > 1)
+ return -EINVAL;
+ if ((u8)val == SOCK_TXREHASH_DEFAULT)
+ val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
+ /* Paired with READ_ONCE() in tcp_rtx_synack()
+ * and sk_getsockopt().
+ */
+ WRITE_ONCE(sk->sk_txrehash, (u8)val);
+ return 0;
+ case SO_PEEK_OFF:
+ {
+ int (*set_peek_off)(struct sock *sk, int val);
+
+ set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
+ if (set_peek_off)
+ ret = set_peek_off(sk, val);
+ else
+ ret = -EOPNOTSUPP;
+ return ret;
+ }
+#ifdef CONFIG_PAGE_POOL
+ case SO_DEVMEM_DONTNEED:
+ return sock_devmem_dontneed(sk, optval, optlen);
+#endif
+ case SO_SNDTIMEO_OLD:
+ case SO_SNDTIMEO_NEW:
+ return sock_set_timeout(&sk->sk_sndtimeo, optval,
+ optlen, optname == SO_SNDTIMEO_OLD);
+ case SO_RCVTIMEO_OLD:
+ case SO_RCVTIMEO_NEW:
+ return sock_set_timeout(&sk->sk_rcvtimeo, optval,
+ optlen, optname == SO_RCVTIMEO_OLD);
+ }
+
+ sockopt_lock_sock(sk);
switch (optname) {
case SO_DEBUG:
- if (val && !capable(CAP_NET_ADMIN))
+ if (val && !sockopt_capable(CAP_NET_ADMIN))
ret = -EACCES;
else
sock_valbool_flag(sk, SOCK_DBG, valbool);
@@ -688,16 +1320,14 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
break;
case SO_REUSEPORT:
- sk->sk_reuseport = valbool;
- break;
- case SO_TYPE:
- case SO_PROTOCOL:
- case SO_DOMAIN:
- case SO_ERROR:
- ret = -ENOPROTOOPT;
+ if (valbool && !sk_is_inet(sk))
+ ret = -EOPNOTSUPP;
+ else
+ sk->sk_reuseport = valbool;
break;
case SO_DONTROUTE:
sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
+ sk_dst_reset(sk);
break;
case SO_BROADCAST:
sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
@@ -708,19 +1338,30 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
* play 'guess the biggest size' games. RCVBUF/SNDBUF
* are treated in BSD as hints
*/
- val = min_t(u32, val, sysctl_wmem_max);
+ val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
set_sndbuf:
+ /* Ensure val * 2 fits into an int, to prevent max_t()
+ * from treating it as a negative value.
+ */
+ val = min_t(int, val, INT_MAX / 2);
sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
- sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
+ WRITE_ONCE(sk->sk_sndbuf,
+ max_t(int, val * 2, SOCK_MIN_SNDBUF));
/* Wake up sending tasks if we upped the value. */
sk->sk_write_space(sk);
break;
case SO_SNDBUFFORCE:
- if (!capable(CAP_NET_ADMIN)) {
+ if (!sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
+
+ /* No negative values (to prevent underflow, as val will be
+ * multiplied by 2).
+ */
+ if (val < 0)
+ val = 0;
goto set_sndbuf;
case SO_RCVBUF:
@@ -729,33 +1370,20 @@ set_sndbuf:
* play 'guess the biggest size' games. RCVBUF/SNDBUF
* are treated in BSD as hints
*/
- val = min_t(u32, val, sysctl_rmem_max);
-set_rcvbuf:
- sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
- /*
- * We double it on the way in to account for
- * "struct sk_buff" etc. overhead. Applications
- * assume that the SO_RCVBUF setting they make will
- * allow that much actual data to be received on that
- * socket.
- *
- * Applications are unaware that "struct sk_buff" and
- * other overheads allocate from the receive buffer
- * during socket buffer allocation.
- *
- * And after considering the possible alternatives,
- * returning the value we actually used in getsockopt
- * is the most desirable behavior.
- */
- sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
+ __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
break;
case SO_RCVBUFFORCE:
- if (!capable(CAP_NET_ADMIN)) {
+ if (!sockopt_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
- goto set_rcvbuf;
+
+ /* No negative values (to prevent underflow, as val will be
+ * multiplied by 2).
+ */
+ __sock_set_rcvbuf(sk, max(val, 0));
+ break;
case SO_KEEPALIVE:
if (sk->sk_prot->keepalive)
@@ -771,167 +1399,113 @@ set_rcvbuf:
sk->sk_no_check_tx = valbool;
break;
- case SO_PRIORITY:
- if ((val >= 0 && val <= 6) ||
- ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
- sk->sk_priority = val;
- else
- ret = -EPERM;
- break;
-
case SO_LINGER:
if (optlen < sizeof(ling)) {
ret = -EINVAL; /* 1003.1g */
break;
}
- if (copy_from_user(&ling, optval, sizeof(ling))) {
+ if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
ret = -EFAULT;
break;
}
- if (!ling.l_onoff)
+ if (!ling.l_onoff) {
sock_reset_flag(sk, SOCK_LINGER);
- else {
-#if (BITS_PER_LONG == 32)
- if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
- sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
+ } else {
+ unsigned long t_sec = ling.l_linger;
+
+ if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
+ WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
else
-#endif
- sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
+ WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
sock_set_flag(sk, SOCK_LINGER);
}
break;
case SO_BSDCOMPAT:
- sock_warn_obsolete_bsdism("setsockopt");
break;
- case SO_PASSCRED:
- if (valbool)
- set_bit(SOCK_PASSCRED, &sock->flags);
- else
- clear_bit(SOCK_PASSCRED, &sock->flags);
+ case SO_TIMESTAMP_OLD:
+ case SO_TIMESTAMP_NEW:
+ case SO_TIMESTAMPNS_OLD:
+ case SO_TIMESTAMPNS_NEW:
+ sock_set_timestamp(sk, optname, valbool);
break;
- case SO_TIMESTAMP:
- case SO_TIMESTAMPNS:
- if (valbool) {
- if (optname == SO_TIMESTAMP)
- sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
- else
- sock_set_flag(sk, SOCK_RCVTSTAMPNS);
- sock_set_flag(sk, SOCK_RCVTSTAMP);
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- } else {
- sock_reset_flag(sk, SOCK_RCVTSTAMP);
- sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
- }
- break;
-
- case SO_TIMESTAMPING:
- if (val & ~SOF_TIMESTAMPING_MASK) {
- ret = -EINVAL;
- break;
- }
-
- if (val & SOF_TIMESTAMPING_OPT_ID &&
- !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
- if (sk->sk_protocol == IPPROTO_TCP &&
- sk->sk_type == SOCK_STREAM) {
- if ((1 << sk->sk_state) &
- (TCPF_CLOSE | TCPF_LISTEN)) {
- ret = -EINVAL;
- break;
- }
- sk->sk_tskey = tcp_sk(sk)->snd_una;
- } else {
- sk->sk_tskey = 0;
+ case SO_TIMESTAMPING_NEW:
+ case SO_TIMESTAMPING_OLD:
+ if (optlen == sizeof(timestamping)) {
+ if (copy_from_sockptr(&timestamping, optval,
+ sizeof(timestamping))) {
+ ret = -EFAULT;
+ break;
}
+ } else {
+ memset(&timestamping, 0, sizeof(timestamping));
+ timestamping.flags = val;
}
-
- if (val & SOF_TIMESTAMPING_OPT_STATS &&
- !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
- ret = -EINVAL;
- break;
- }
-
- sk->sk_tsflags = val;
- if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
- sock_enable_timestamp(sk,
- SOCK_TIMESTAMPING_RX_SOFTWARE);
- else
- sock_disable_timestamp(sk,
- (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
+ ret = sock_set_timestamping(sk, optname, timestamping);
break;
case SO_RCVLOWAT:
+ {
+ int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
+
if (val < 0)
val = INT_MAX;
- if (sock->ops->set_rcvlowat)
- ret = sock->ops->set_rcvlowat(sk, val);
+ if (sock)
+ set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
+ if (set_rcvlowat)
+ ret = set_rcvlowat(sk, val);
else
- sk->sk_rcvlowat = val ? : 1;
- break;
-
- case SO_RCVTIMEO:
- ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
+ WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
break;
+ }
+ case SO_ATTACH_FILTER: {
+ struct sock_fprog fprog;
- case SO_SNDTIMEO:
- ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
- break;
-
- case SO_ATTACH_FILTER:
- ret = -EINVAL;
- if (optlen == sizeof(struct sock_fprog)) {
- struct sock_fprog fprog;
-
- ret = -EFAULT;
- if (copy_from_user(&fprog, optval, sizeof(fprog)))
- break;
-
+ ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
+ if (!ret)
ret = sk_attach_filter(&fprog, sk);
- }
break;
-
+ }
case SO_ATTACH_BPF:
ret = -EINVAL;
if (optlen == sizeof(u32)) {
u32 ufd;
ret = -EFAULT;
- if (copy_from_user(&ufd, optval, sizeof(ufd)))
+ if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
break;
ret = sk_attach_bpf(ufd, sk);
}
break;
- case SO_ATTACH_REUSEPORT_CBPF:
- ret = -EINVAL;
- if (optlen == sizeof(struct sock_fprog)) {
- struct sock_fprog fprog;
-
- ret = -EFAULT;
- if (copy_from_user(&fprog, optval, sizeof(fprog)))
- break;
+ case SO_ATTACH_REUSEPORT_CBPF: {
+ struct sock_fprog fprog;
+ ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
+ if (!ret)
ret = sk_reuseport_attach_filter(&fprog, sk);
- }
break;
-
+ }
case SO_ATTACH_REUSEPORT_EBPF:
ret = -EINVAL;
if (optlen == sizeof(u32)) {
u32 ufd;
ret = -EFAULT;
- if (copy_from_user(&ufd, optval, sizeof(ufd)))
+ if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
break;
ret = sk_reuseport_attach_bpf(ufd, sk);
}
break;
+ case SO_DETACH_REUSEPORT_BPF:
+ ret = reuseport_detach_prog(sk);
+ break;
+
case SO_DETACH_FILTER:
ret = sk_detach_filter(sk);
break;
@@ -943,17 +1517,21 @@ set_rcvbuf:
sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
break;
- case SO_PASSSEC:
- if (valbool)
- set_bit(SOCK_PASSSEC, &sock->flags);
- else
- clear_bit(SOCK_PASSSEC, &sock->flags);
- break;
case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM;
- else
- sk->sk_mark = val;
+ break;
+ }
+
+ __sock_set_mark(sk, val);
+ break;
+ case SO_RCVMARK:
+ sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
+ break;
+
+ case SO_RCVPRIORITY:
+ sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
break;
case SO_RXQ_OVFL:
@@ -964,13 +1542,6 @@ set_rcvbuf:
sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
break;
- case SO_PEEK_OFF:
- if (sock->ops->set_peek_off)
- ret = sock->ops->set_peek_off(sk, val);
- else
- ret = -EOPNOTSUPP;
- break;
-
case SO_NOFCS:
sock_valbool_flag(sk, SOCK_NOFCS, valbool);
break;
@@ -979,32 +1550,36 @@ set_rcvbuf:
sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
break;
-#ifdef CONFIG_NET_RX_BUSY_POLL
- case SO_BUSY_POLL:
- /* allow unprivileged users to decrease the value */
- if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
- ret = -EPERM;
- else {
- if (val < 0)
- ret = -EINVAL;
- else
- sk->sk_ll_usec = val;
- }
+ case SO_PASSCRED:
+ if (sk_may_scm_recv(sk))
+ sk->sk_scm_credentials = valbool;
+ else
+ ret = -EOPNOTSUPP;
break;
-#endif
- case SO_MAX_PACING_RATE:
- if (val != ~0U)
- cmpxchg(&sk->sk_pacing_status,
- SK_PACING_NONE,
- SK_PACING_NEEDED);
- sk->sk_max_pacing_rate = val;
- sk->sk_pacing_rate = min(sk->sk_pacing_rate,
- sk->sk_max_pacing_rate);
+ case SO_PASSSEC:
+ if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk))
+ sk->sk_scm_security = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_PASSPIDFD:
+ if (sk_is_unix(sk))
+ sk->sk_scm_pidfd = valbool;
+ else
+ ret = -EOPNOTSUPP;
+ break;
+
+ case SO_PASSRIGHTS:
+ if (sk_is_unix(sk))
+ sk->sk_scm_rights = valbool;
+ else
+ ret = -EOPNOTSUPP;
break;
case SO_INCOMING_CPU:
- sk->sk_incoming_cpu = val;
+ reuseport_update_incoming_cpu(sk, val);
break;
case SO_CNX_ADVICE:
@@ -1014,10 +1589,12 @@ set_rcvbuf:
case SO_ZEROCOPY:
if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
- if (sk->sk_protocol != IPPROTO_TCP)
- ret = -ENOTSUPP;
+ if (!(sk_is_tcp(sk) ||
+ (sk->sk_type == SOCK_DGRAM &&
+ sk->sk_protocol == IPPROTO_UDP)))
+ ret = -EOPNOTSUPP;
} else if (sk->sk_family != PF_RDS) {
- ret = -ENOTSUPP;
+ ret = -EOPNOTSUPP;
}
if (!ret) {
if (val < 0 || val > 1)
@@ -1028,34 +1605,94 @@ set_rcvbuf:
break;
case SO_TXTIME:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
- ret = -EPERM;
- } else if (optlen != sizeof(struct sock_txtime)) {
+ if (optlen != sizeof(struct sock_txtime)) {
ret = -EINVAL;
- } else if (copy_from_user(&sk_txtime, optval,
+ break;
+ } else if (copy_from_sockptr(&sk_txtime, optval,
sizeof(struct sock_txtime))) {
ret = -EFAULT;
+ break;
} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
ret = -EINVAL;
- } else {
- sock_valbool_flag(sk, SOCK_TXTIME, true);
- sk->sk_clockid = sk_txtime.clockid;
- sk->sk_txtime_deadline_mode =
- !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
- sk->sk_txtime_report_errors =
- !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
+ break;
+ }
+ /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
+ * scheduler has enough safeguards.
+ */
+ if (sk_txtime.clockid != CLOCK_MONOTONIC &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ ret = -EPERM;
+ break;
+ }
+
+ ret = sockopt_validate_clockid(sk_txtime.clockid);
+ if (ret)
+ break;
+
+ sock_valbool_flag(sk, SOCK_TXTIME, true);
+ sk->sk_clockid = sk_txtime.clockid;
+ sk->sk_txtime_deadline_mode =
+ !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
+ sk->sk_txtime_report_errors =
+ !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
+ break;
+
+ case SO_BINDTOIFINDEX:
+ ret = sock_bindtoindex_locked(sk, val);
+ break;
+
+ case SO_BUF_LOCK:
+ if (val & ~SOCK_BUF_LOCK_MASK) {
+ ret = -EINVAL;
+ break;
}
+ sk->sk_userlocks = val | (sk->sk_userlocks &
+ ~SOCK_BUF_LOCK_MASK);
break;
+ case SO_RESERVE_MEM:
+ {
+ int delta;
+
+ if (val < 0) {
+ ret = -EINVAL;
+ break;
+ }
+
+ delta = val - sk->sk_reserved_mem;
+ if (delta < 0)
+ sock_release_reserved_memory(sk, -delta);
+ else
+ ret = sock_reserve_memory(sk, delta);
+ break;
+ }
+
default:
ret = -ENOPROTOOPT;
break;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
return ret;
}
+/* struct socket entry point: forwards every argument to sk_setsockopt() using sock->sk. */
+int sock_setsockopt(struct socket *sock, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
+{
+ return sk_setsockopt(sock->sk, level, optname,
+ optval, optlen);
+}
EXPORT_SYMBOL(sock_setsockopt);
+static const struct cred *sk_get_peer_cred(struct sock *sk)
+{
+ const struct cred *cred;
+ /* Fetch sk_peer_cred through get_cred() while holding sk_peer_lock and return it. */
+ spin_lock(&sk->sk_peer_lock);
+ cred = get_cred(sk->sk_peer_cred); /* the SO_PEERGROUPS caller NULL-checks this and put_cred()s it */
+ spin_unlock(&sk->sk_peer_lock);
+
+ return cred;
+}
static void cred_to_ucred(struct pid *pid, const struct cred *cred,
struct ucred *ucred)
@@ -1070,35 +1707,42 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred,
}
}
-static int groups_to_user(gid_t __user *dst, const struct group_info *src)
+static int groups_to_user(sockptr_t dst, const struct group_info *src)
{
struct user_namespace *user_ns = current_user_ns();
int i;
- for (i = 0; i < src->ngroups; i++)
- if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
+ for (i = 0; i < src->ngroups; i++) {
+ gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
+
+ if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
return -EFAULT;
+ }
return 0;
}
-int sock_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
+int sk_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen)
{
- struct sock *sk = sock->sk;
+ struct socket *sock = sk->sk_socket;
union {
int val;
u64 val64;
+ unsigned long ulval;
struct linger ling;
- struct timeval tm;
+ struct old_timeval32 tm32;
+ struct __kernel_old_timeval tm;
+ struct __kernel_sock_timeval stm;
struct sock_txtime txtime;
+ struct so_timestamping timestamping;
} v;
int lv = sizeof(int);
int len;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
if (len < 0)
return -EINVAL;
@@ -1119,11 +1763,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_SNDBUF:
- v.val = sk->sk_sndbuf;
+ v.val = READ_ONCE(sk->sk_sndbuf);
break;
case SO_RCVBUF:
- v.val = sk->sk_rcvbuf;
+ v.val = READ_ONCE(sk->sk_rcvbuf);
break;
case SO_REUSEADDR:
@@ -1165,56 +1809,63 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_PRIORITY:
- v.val = sk->sk_priority;
+ v.val = READ_ONCE(sk->sk_priority);
break;
case SO_LINGER:
lv = sizeof(v.ling);
v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
- v.ling.l_linger = sk->sk_lingertime / HZ;
+ v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
break;
case SO_BSDCOMPAT:
- sock_warn_obsolete_bsdism("getsockopt");
break;
- case SO_TIMESTAMP:
+ case SO_TIMESTAMP_OLD:
v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
+ !sock_flag(sk, SOCK_TSTAMP_NEW) &&
!sock_flag(sk, SOCK_RCVTSTAMPNS);
break;
- case SO_TIMESTAMPNS:
- v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
+ case SO_TIMESTAMPNS_OLD:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
break;
- case SO_TIMESTAMPING:
- v.val = sk->sk_tsflags;
+ case SO_TIMESTAMP_NEW:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
break;
- case SO_RCVTIMEO:
- lv = sizeof(struct timeval);
- if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
- v.tm.tv_sec = 0;
- v.tm.tv_usec = 0;
- } else {
- v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
- v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
- }
+ case SO_TIMESTAMPNS_NEW:
+ v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
break;
- case SO_SNDTIMEO:
- lv = sizeof(struct timeval);
- if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
- v.tm.tv_sec = 0;
- v.tm.tv_usec = 0;
- } else {
- v.tm.tv_sec = sk->sk_sndtimeo / HZ;
- v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
+ case SO_TIMESTAMPING_OLD:
+ case SO_TIMESTAMPING_NEW:
+ lv = sizeof(v.timestamping);
+ /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
+ * returning the flags when they were set through the same option.
+ * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
+ */
+ if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
+ v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
+ v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
}
break;
+ case SO_RCVTIMEO_OLD:
+ case SO_RCVTIMEO_NEW:
+ lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
+ SO_RCVTIMEO_OLD == optname);
+ break;
+
+ case SO_SNDTIMEO_OLD:
+ case SO_SNDTIMEO_NEW:
+ lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
+ SO_SNDTIMEO_OLD == optname);
+ break;
+
case SO_RCVLOWAT:
- v.val = sk->sk_rcvlowat;
+ v.val = READ_ONCE(sk->sk_rcvlowat);
break;
case SO_SNDLOWAT:
@@ -1222,7 +1873,24 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_PASSCRED:
- v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
+ if (!sk_may_scm_recv(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_credentials;
+ break;
+
+ case SO_PASSPIDFD:
+ if (!sk_is_unix(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_pidfd;
+ break;
+
+ case SO_PASSRIGHTS:
+ if (!sk_is_unix(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_rights;
break;
case SO_PEERCRED:
@@ -1230,28 +1898,76 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
struct ucred peercred;
if (len > sizeof(peercred))
len = sizeof(peercred);
+
+ spin_lock(&sk->sk_peer_lock);
cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
- if (copy_to_user(optval, &peercred, len))
+ spin_unlock(&sk->sk_peer_lock);
+
+ if (copy_to_sockptr(optval, &peercred, len))
return -EFAULT;
goto lenout;
}
+ case SO_PEERPIDFD:
+ {
+ struct pid *peer_pid;
+ struct file *pidfd_file = NULL;
+ unsigned int flags = 0;
+ int pidfd;
+
+ if (len > sizeof(pidfd))
+ len = sizeof(pidfd);
+
+ spin_lock(&sk->sk_peer_lock);
+ peer_pid = get_pid(sk->sk_peer_pid);
+ spin_unlock(&sk->sk_peer_lock);
+
+ if (!peer_pid)
+ return -ENODATA;
+
+ /* The use of PIDFD_STALE requires stashing of struct pid
+ * on pidfs with pidfs_register_pid() and only AF_UNIX
+ * were prepared for this.
+ */
+ if (sk->sk_family == AF_UNIX)
+ flags = PIDFD_STALE;
+
+ pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
+ put_pid(peer_pid);
+ if (pidfd < 0)
+ return pidfd;
+
+ if (copy_to_sockptr(optval, &pidfd, len) ||
+ copy_to_sockptr(optlen, &len, sizeof(int))) {
+ put_unused_fd(pidfd);
+ fput(pidfd_file);
+
+ return -EFAULT;
+ }
+
+ fd_install(pidfd, pidfd_file);
+ return 0;
+ }
+
case SO_PEERGROUPS:
{
+ const struct cred *cred;
int ret, n;
- if (!sk->sk_peer_cred)
+ cred = sk_get_peer_cred(sk);
+ if (!cred)
return -ENODATA;
- n = sk->sk_peer_cred->group_info->ngroups;
+ n = cred->group_info->ngroups;
if (len < n * sizeof(gid_t)) {
len = n * sizeof(gid_t);
- return put_user(len, optlen) ? -EFAULT : -ERANGE;
+ put_cred(cred);
+ return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
}
len = n * sizeof(gid_t);
- ret = groups_to_user((gid_t __user *)optval,
- sk->sk_peer_cred->group_info);
+ ret = groups_to_user(optval, cred->group_info);
+ put_cred(cred);
if (ret)
return ret;
goto lenout;
@@ -1259,14 +1975,14 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
case SO_PEERNAME:
{
- char address[128];
+ struct sockaddr_storage address;
- lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
+ lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
if (lv < 0)
return -ENOTCONN;
if (lv < len)
return -EINVAL;
- if (copy_to_user(optval, address, len))
+ if (copy_to_sockptr(optval, &address, len))
return -EFAULT;
goto lenout;
}
@@ -1279,14 +1995,26 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_PASSSEC:
- v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
+ if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk))
+ return -EOPNOTSUPP;
+
+ v.val = sk->sk_scm_security;
break;
case SO_PEERSEC:
- return security_socket_getpeersec_stream(sock, optval, optlen, len);
+ return security_socket_getpeersec_stream(sock,
+ optval, optlen, len);
case SO_MARK:
- v.val = sk->sk_mark;
+ v.val = READ_ONCE(sk->sk_mark);
+ break;
+
+ case SO_RCVMARK:
+ v.val = sock_flag(sk, SOCK_RCVMARK);
+ break;
+
+ case SO_RCVPRIORITY:
+ v.val = sock_flag(sk, SOCK_RCVPRIORITY);
break;
case SO_RXQ_OVFL:
@@ -1298,10 +2026,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
break;
case SO_PEEK_OFF:
- if (!sock->ops->set_peek_off)
+ if (!READ_ONCE(sock->ops)->set_peek_off)
return -EOPNOTSUPP;
- v.val = sk->sk_peek_off;
+ v.val = READ_ONCE(sk->sk_peek_off);
break;
case SO_NOFCS:
v.val = sock_flag(sk, SOCK_NOFCS);
@@ -1311,7 +2039,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
return sock_getbindtodevice(sk, optval, optlen, len);
case SO_GET_FILTER:
- len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
+ len = sk_get_filter(sk, optval, len);
if (len < 0)
return len;
@@ -1331,29 +2059,37 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
#ifdef CONFIG_NET_RX_BUSY_POLL
case SO_BUSY_POLL:
- v.val = sk->sk_ll_usec;
+ v.val = READ_ONCE(sk->sk_ll_usec);
+ break;
+ case SO_PREFER_BUSY_POLL:
+ v.val = READ_ONCE(sk->sk_prefer_busy_poll);
break;
#endif
case SO_MAX_PACING_RATE:
- v.val = sk->sk_max_pacing_rate;
+ /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
+ if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
+ lv = sizeof(v.ulval);
+ v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
+ } else {
+ /* 32bit version */
+ v.val = min_t(unsigned long, ~0U,
+ READ_ONCE(sk->sk_max_pacing_rate));
+ }
break;
case SO_INCOMING_CPU:
- v.val = sk->sk_incoming_cpu;
+ v.val = READ_ONCE(sk->sk_incoming_cpu);
break;
case SO_MEMINFO:
{
u32 meminfo[SK_MEMINFO_VARS];
- if (get_user(len, optlen))
- return -EFAULT;
-
sk_get_meminfo(sk, meminfo);
len = min_t(unsigned int, len, sizeof(meminfo));
- if (copy_to_user(optval, &meminfo, len))
+ if (copy_to_sockptr(optval, &meminfo, len))
return -EFAULT;
goto lenout;
@@ -1364,7 +2100,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = READ_ONCE(sk->sk_napi_id);
/* aggregate non-NAPI IDs down to 0 */
- if (v.val < MIN_NAPI_ID)
+ if (!napi_id_valid(v.val))
v.val = 0;
break;
@@ -1390,6 +2126,33 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
SOF_TXTIME_REPORT_ERRORS : 0;
break;
+ case SO_BINDTOIFINDEX:
+ v.val = READ_ONCE(sk->sk_bound_dev_if);
+ break;
+
+ case SO_NETNS_COOKIE:
+ lv = sizeof(u64);
+ if (len != lv)
+ return -EINVAL;
+ v.val64 = sock_net(sk)->net_cookie;
+ break;
+
+ case SO_BUF_LOCK:
+ v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
+ break;
+
+ case SO_RESERVE_MEM:
+ v.val = READ_ONCE(sk->sk_reserved_mem);
+ break;
+
+ case SO_TXREHASH:
+ if (!sk_is_tcp(sk))
+ return -EOPNOTSUPP;
+
+ /* Paired with WRITE_ONCE() in sk_setsockopt() */
+ v.val = READ_ONCE(sk->sk_txrehash);
+ break;
+
default:
/* We implement the SO_SNDLOWAT etc to not be settable
* (1003.1g 7).
@@ -1399,10 +2162,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
if (len > lv)
len = lv;
- if (copy_to_user(optval, &v, len))
+ if (copy_to_sockptr(optval, &v, len))
return -EFAULT;
lenout:
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
return 0;
}
@@ -1414,6 +2177,8 @@ lenout:
*/
static inline void sock_lock_init(struct sock *sk)
{
+ sk_owner_clear(sk);
+
if (sk->sk_kern_sock)
sock_lock_init_class_and_name(
sk,
@@ -1432,18 +2197,30 @@ static inline void sock_lock_init(struct sock *sk)
/*
* Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
- * even temporarly, because of RCU lookups. sk_node should also be left as is.
+ * even temporarily, because of RCU lookups. sk_node should also be left as is.
* We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
*/
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
+ const struct proto *prot = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
void *sptr = nsk->sk_security;
#endif
+
+ /* If we move sk_tx_queue_mapping out of the private section,
+ * we must check if sk_tx_queue_clear() is called after
+ * sock_copy() in sk_clone_lock().
+ */
+ BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
+ offsetof(struct sock, sk_dontcopy_begin) ||
+ offsetof(struct sock, sk_tx_queue_mapping) >=
+ offsetof(struct sock, sk_dontcopy_end));
+
memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
- memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
- osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
+ unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
+ prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
+ /* alloc is larger than struct, see sk_prot_alloc() */);
#ifdef CONFIG_SECURITY_NETWORK
nsk->sk_security = sptr;
@@ -1462,7 +2239,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
if (!sk)
return sk;
- if (priority & __GFP_ZERO)
+ if (want_init_on_alloc(priority))
sk_prot_clear_nulls(sk, prot->obj_size);
} else
sk = kmalloc(prot->obj_size, priority);
@@ -1473,7 +2250,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
if (!try_module_get(prot->owner))
goto out_free_sec;
- sk_tx_queue_clear(sk);
}
return sk;
@@ -1499,6 +2275,9 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
cgroup_sk_free(&sk->sk_cgrp_data);
mem_cgroup_sk_free(sk);
security_sk_free(sk);
+
+ sk_owner_put(sk);
+
if (slab != NULL)
kmem_cache_free(slab, sk);
else
@@ -1527,21 +2306,31 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
* why we need sk_prot_creator -acme
*/
sk->sk_prot = sk->sk_prot_creator = prot;
+
+ if (READ_ONCE(net->core.sysctl_bypass_prot_mem))
+ sk->sk_bypass_prot_mem = 1;
+
sk->sk_kern_sock = kern;
sock_lock_init(sk);
+
sk->sk_net_refcnt = kern ? 0 : 1;
if (likely(sk->sk_net_refcnt)) {
- get_net(net);
+ get_net_track(net, &sk->ns_tracker, priority);
sock_inuse_add(net, 1);
+ } else {
+ net_passive_inc(net);
+ __netns_tracker_alloc(net, &sk->ns_tracker,
+ false, priority);
}
sock_net_set(sk, net);
- refcount_set(&sk->sk_wmem_alloc, 1);
+ refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
mem_cgroup_sk_alloc(sk);
cgroup_sk_alloc(&sk->sk_cgrp_data);
sock_update_classid(&sk->sk_cgrp_data);
sock_update_netprioidx(&sk->sk_cgrp_data);
+ sk_tx_queue_clear(sk);
}
return sk;
@@ -1554,6 +2343,7 @@ EXPORT_SYMBOL(sk_alloc);
static void __sk_destruct(struct rcu_head *head)
{
struct sock *sk = container_of(head, struct sock, sk_rcu);
+ struct net *net = sock_net(sk);
struct sk_filter *filter;
if (sk->sk_destruct)
@@ -1565,11 +2355,13 @@ static void __sk_destruct(struct rcu_head *head)
sk_filter_uncharge(sk, filter);
RCU_INIT_POINTER(sk->sk_filter, NULL);
}
- if (rcu_access_pointer(sk->sk_reuseport_cb))
- reuseport_detach_sock(sk);
sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
+#ifdef CONFIG_BPF_SYSCALL
+ bpf_sk_storage_free(sk);
+#endif
+
if (atomic_read(&sk->sk_omem_alloc))
pr_debug("%s: optmem leakage (%d bytes) detected\n",
__func__, atomic_read(&sk->sk_omem_alloc));
@@ -1579,17 +2371,42 @@ static void __sk_destruct(struct rcu_head *head)
sk->sk_frag.page = NULL;
}
- if (sk->sk_peer_cred)
- put_cred(sk->sk_peer_cred);
+ /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
+ put_cred(sk->sk_peer_cred);
put_pid(sk->sk_peer_pid);
- if (likely(sk->sk_net_refcnt))
- put_net(sock_net(sk));
+
+ if (likely(sk->sk_net_refcnt)) {
+ put_net_track(net, &sk->ns_tracker);
+ } else {
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+ net_passive_dec(net);
+ }
sk_prot_free(sk->sk_prot_creator, sk);
}
+void sk_net_refcnt_upgrade(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+
+ WARN_ON_ONCE(sk->sk_net_refcnt);
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+ net_passive_dec(net);
+ sk->sk_net_refcnt = 1;
+ get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
+ sock_inuse_add(net, 1);
+}
+EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
+
void sk_destruct(struct sock *sk)
{
- if (sock_flag(sk, SOCK_RCU_FREE))
+ bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
+
+ if (rcu_access_pointer(sk->sk_reuseport_cb)) {
+ reuseport_detach_sock(sk);
+ use_call_rcu = true;
+ }
+
+ if (use_call_rcu)
call_rcu(&sk->sk_rcu, __sk_destruct);
else
__sk_destruct(&sk->sk_rcu);
@@ -1634,156 +2451,209 @@ static void sk_init_common(struct sock *sk)
lockdep_set_class_and_name(&sk->sk_error_queue.lock,
af_elock_keys + sk->sk_family,
af_family_elock_key_strings[sk->sk_family]);
- lockdep_set_class_and_name(&sk->sk_callback_lock,
+ if (sk->sk_kern_sock)
+ lockdep_set_class_and_name(&sk->sk_callback_lock,
+ af_kern_callback_keys + sk->sk_family,
+ af_family_kern_clock_key_strings[sk->sk_family]);
+ else
+ lockdep_set_class_and_name(&sk->sk_callback_lock,
af_callback_keys + sk->sk_family,
af_family_clock_key_strings[sk->sk_family]);
}
/**
- * sk_clone_lock - clone a socket, and lock its clone
- * @sk: the socket to clone
- * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ * sk_clone - clone a socket
+ * @sk: the socket to clone
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ * @lock: if true, lock the cloned sk
*
- * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ * If @lock is true, the clone is locked by bh_lock_sock(), and
+ * caller must unlock socket even in error path by bh_unlock_sock().
*/
-struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
+struct sock *sk_clone(const struct sock *sk, const gfp_t priority,
+ bool lock)
{
- struct sock *newsk;
+ struct proto *prot = READ_ONCE(sk->sk_prot);
+ struct sk_filter *filter;
bool is_charged = true;
+ struct sock *newsk;
- newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
- if (newsk != NULL) {
- struct sk_filter *filter;
+ newsk = sk_prot_alloc(prot, priority, sk->sk_family);
+ if (!newsk)
+ goto out;
- sock_copy(newsk, sk);
+ sock_copy(newsk, sk);
- newsk->sk_prot_creator = sk->sk_prot;
+ newsk->sk_prot_creator = prot;
- /* SANITY */
- if (likely(newsk->sk_net_refcnt))
- get_net(sock_net(newsk));
- sk_node_init(&newsk->sk_node);
- sock_lock_init(newsk);
+ /* SANITY */
+ if (likely(newsk->sk_net_refcnt)) {
+ get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
+ sock_inuse_add(sock_net(newsk), 1);
+ } else {
+ /* Kernel sockets are not elevating the struct net refcount.
+ * Instead, use a tracker to more easily detect if a layer
+ * is not properly dismantling its kernel sockets at netns
+ * destroy time.
+ */
+ net_passive_inc(sock_net(newsk));
+ __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
+ false, priority);
+ }
+
+ sk_node_init(&newsk->sk_node);
+ sock_lock_init(newsk);
+
+ if (lock)
bh_lock_sock(newsk);
- newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
- newsk->sk_backlog.len = 0;
- atomic_set(&newsk->sk_rmem_alloc, 0);
- /*
- * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
- */
- refcount_set(&newsk->sk_wmem_alloc, 1);
- atomic_set(&newsk->sk_omem_alloc, 0);
- sk_init_common(newsk);
-
- newsk->sk_dst_cache = NULL;
- newsk->sk_dst_pending_confirm = 0;
- newsk->sk_wmem_queued = 0;
- newsk->sk_forward_alloc = 0;
- atomic_set(&newsk->sk_drops, 0);
- newsk->sk_send_head = NULL;
- newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
- atomic_set(&newsk->sk_zckey, 0);
-
- sock_reset_flag(newsk, SOCK_DONE);
- mem_cgroup_sk_alloc(newsk);
- cgroup_sk_alloc(&newsk->sk_cgrp_data);
+ newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
+ newsk->sk_backlog.len = 0;
- rcu_read_lock();
- filter = rcu_dereference(sk->sk_filter);
- if (filter != NULL)
- /* though it's an empty new sock, the charging may fail
- * if sysctl_optmem_max was changed between creation of
- * original socket and cloning
- */
- is_charged = sk_filter_charge(newsk, filter);
- RCU_INIT_POINTER(newsk->sk_filter, filter);
- rcu_read_unlock();
+ atomic_set(&newsk->sk_rmem_alloc, 0);
- if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
- /* We need to make sure that we don't uncharge the new
- * socket if we couldn't charge it in the first place
- * as otherwise we uncharge the parent's filter.
- */
- if (!is_charged)
- RCU_INIT_POINTER(newsk->sk_filter, NULL);
- sk_free_unlock_clone(newsk);
- newsk = NULL;
- goto out;
- }
- RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
+ refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);
- newsk->sk_err = 0;
- newsk->sk_err_soft = 0;
- newsk->sk_priority = 0;
- newsk->sk_incoming_cpu = raw_smp_processor_id();
- atomic64_set(&newsk->sk_cookie, 0);
- if (likely(newsk->sk_net_refcnt))
- sock_inuse_add(sock_net(newsk), 1);
+ atomic_set(&newsk->sk_omem_alloc, 0);
+ sk_init_common(newsk);
- /*
- * Before updating sk_refcnt, we must commit prior changes to memory
- * (Documentation/RCU/rculist_nulls.txt for details)
- */
- smp_wmb();
- refcount_set(&newsk->sk_refcnt, 2);
+ newsk->sk_dst_cache = NULL;
+ newsk->sk_dst_pending_confirm = 0;
+ newsk->sk_wmem_queued = 0;
+ newsk->sk_forward_alloc = 0;
+ newsk->sk_reserved_mem = 0;
+ DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters);
+ sk_drops_reset(newsk);
+ newsk->sk_send_head = NULL;
+ newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+ atomic_set(&newsk->sk_zckey, 0);
- /*
- * Increment the counter in the same struct proto as the master
- * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
- * is the same as sk->sk_prot->socks, as this field was copied
- * with memcpy).
- *
- * This _changes_ the previous behaviour, where
- * tcp_create_openreq_child always was incrementing the
- * equivalent to tcp_prot->socks (inet_sock_nr), so this have
- * to be taken into account in all callers. -acme
+ sock_reset_flag(newsk, SOCK_DONE);
+
+#ifdef CONFIG_MEMCG
+ /* sk->sk_memcg will be populated at accept() time */
+ newsk->sk_memcg = NULL;
+#endif
+
+ cgroup_sk_clone(&newsk->sk_cgrp_data);
+
+ rcu_read_lock();
+ filter = rcu_dereference(sk->sk_filter);
+ if (filter != NULL)
+ /* though it's an empty new sock, the charging may fail
+ * if sysctl_optmem_max was changed between creation of
+ * original socket and cloning
*/
- sk_refcnt_debug_inc(newsk);
- sk_set_socket(newsk, NULL);
- newsk->sk_wq = NULL;
+ is_charged = sk_filter_charge(newsk, filter);
+ RCU_INIT_POINTER(newsk->sk_filter, filter);
+ rcu_read_unlock();
- if (newsk->sk_prot->sockets_allocated)
- sk_sockets_allocated_inc(newsk);
+ if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
+ /* We need to make sure that we don't uncharge the new
+ * socket if we couldn't charge it in the first place
+ * as otherwise we uncharge the parent's filter.
+ */
+ if (!is_charged)
+ RCU_INIT_POINTER(newsk->sk_filter, NULL);
- if (sock_needs_netstamp(sk) &&
- newsk->sk_flags & SK_FLAGS_TIMESTAMP)
- net_enable_timestamp();
+ goto free;
}
+
+ RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
+
+ if (bpf_sk_storage_clone(sk, newsk))
+ goto free;
+
+ /* Clear sk_user_data if parent had the pointer tagged
+ * as not suitable for copying when cloning.
+ */
+ if (sk_user_data_is_nocopy(newsk))
+ newsk->sk_user_data = NULL;
+
+ newsk->sk_err = 0;
+ newsk->sk_err_soft = 0;
+ newsk->sk_priority = 0;
+ newsk->sk_incoming_cpu = raw_smp_processor_id();
+
+ /* Before updating sk_refcnt, we must commit prior changes to memory
+ * (Documentation/RCU/rculist_nulls.rst for details)
+ */
+ smp_wmb();
+ refcount_set(&newsk->sk_refcnt, 2);
+
+ sk_set_socket(newsk, NULL);
+ sk_tx_queue_clear(newsk);
+ RCU_INIT_POINTER(newsk->sk_wq, NULL);
+
+ if (newsk->sk_prot->sockets_allocated)
+ sk_sockets_allocated_inc(newsk);
+
+ if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+ net_enable_timestamp();
out:
return newsk;
+free:
+ /* It is still a raw copy of the parent, so invalidate
+ * the destructor and do a plain sk_free()
+ */
+ newsk->sk_destruct = NULL;
+ if (lock)
+ bh_unlock_sock(newsk);
+ sk_free(newsk);
+ newsk = NULL;
+ goto out;
}
-EXPORT_SYMBOL_GPL(sk_clone_lock);
+EXPORT_SYMBOL_GPL(sk_clone);
-void sk_free_unlock_clone(struct sock *sk)
+static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev)
{
- /* It is still raw copy of parent, so invalidate
- * destructor and make plain sk_free() */
- sk->sk_destruct = NULL;
- bh_unlock_sock(sk);
- sk_free(sk);
+ bool is_ipv6 = false;
+ u32 max_size;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ is_ipv6 = (sk->sk_family == AF_INET6 &&
+ !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
+#endif
+ /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
+ max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) :
+ READ_ONCE(dev->gso_ipv4_max_size);
+ if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
+ max_size = GSO_LEGACY_MAX_SIZE;
+
+ return max_size - (MAX_TCP_HEADER + 1);
}
-EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
+ const struct net_device *dev;
u32 max_segs = 1;
- sk_dst_set(sk, dst);
- sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
+ sk->sk_route_caps = dev->features;
+ if (sk_is_tcp(sk)) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ sk->sk_route_caps |= NETIF_F_GSO;
+ icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
+ }
if (sk->sk_route_caps & NETIF_F_GSO)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
- sk->sk_route_caps &= ~sk->sk_route_nocaps;
+ if (unlikely(sk->sk_gso_disabled))
+ sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
if (sk_can_gso(sk)) {
if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
- sk->sk_gso_max_size = dst->dev->gso_max_size;
- max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
+ sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev);
+ /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
+ max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1);
}
}
sk->sk_gso_max_segs = max_segs;
+ sk_dst_set(sk, dst);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
@@ -1797,10 +2667,24 @@ EXPORT_SYMBOL_GPL(sk_setup_caps);
*/
void sock_wfree(struct sk_buff *skb)
{
- struct sock *sk = skb->sk;
unsigned int len = skb->truesize;
+ struct sock *sk = skb->sk;
+ bool free;
+ int old;
if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
+ if (sock_flag(sk, SOCK_RCU_FREE) &&
+ sk->sk_write_space == sock_def_write_space) {
+ rcu_read_lock();
+ free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc,
+ &old);
+ sock_def_write_space_wfree(sk, old - len);
+ rcu_read_unlock();
+ if (unlikely(free))
+ __sk_free(sk);
+ return;
+ }
+
/*
* Keep a reference on sk_wmem_alloc, this will be released
* after sk_write_space() call
@@ -1831,26 +2715,45 @@ void __sock_wfree(struct sk_buff *skb)
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
+ int old_wmem;
+
skb_orphan(skb);
- skb->sk = sk;
#ifdef CONFIG_INET
- if (unlikely(!sk_fullsock(sk))) {
- skb->destructor = sock_edemux;
- sock_hold(sk);
- return;
- }
+ if (unlikely(!sk_fullsock(sk)))
+ return skb_set_owner_edemux(skb, sk);
#endif
+ skb->sk = sk;
skb->destructor = sock_wfree;
skb_set_hash_from_sk(skb, sk);
/*
* We used to take a refcount on sk, but following operation
- * is enough to guarantee sk_free() wont free this sock until
+ * is enough to guarantee sk_free() won't free this sock until
* all in-flight packets are completed
*/
- refcount_add(skb->truesize, &sk->sk_wmem_alloc);
+ __refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem);
+
+ /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket
+ * is in a host queue (qdisc, NIC queue).
+ * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue
+ * based on XPS for better performance.
+ * Otherwise clear ooo_okay to not risk Out Of Order delivery.
+ */
+ skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS);
}
EXPORT_SYMBOL(skb_set_owner_w);
+static bool can_skb_orphan_partial(const struct sk_buff *skb)
+{
+ /* Drivers depend on in-order delivery for crypto offload,
+ * partial orphan breaks out-of-order-OK logic.
+ */
+ if (skb_is_decrypted(skb))
+ return false;
+
+ return (skb->destructor == sock_wfree ||
+ (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
+}
+
/* This helper is used by netem, as it can hold packets in its
* delay queue. We want to allow the owner socket to send more
* packets, as if they were already TX completed by a typical driver.
@@ -1862,20 +2765,10 @@ void skb_orphan_partial(struct sk_buff *skb)
if (skb_is_tcp_pure_ack(skb))
return;
- if (skb->destructor == sock_wfree
-#ifdef CONFIG_INET
- || skb->destructor == tcp_wfree
-#endif
- ) {
- struct sock *sk = skb->sk;
+ if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
+ return;
- if (refcount_inc_not_zero(&sk->sk_refcnt)) {
- WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
- skb->destructor = sock_efree;
- }
- } else {
- skb_orphan(skb);
- }
+ skb_orphan(skb);
}
EXPORT_SYMBOL(skb_orphan_partial);
@@ -1902,27 +2795,27 @@ void sock_efree(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_efree);
-kuid_t sock_i_uid(struct sock *sk)
+/* Buffer destructor for prefetch/receive path where reference count may
+ * not be held, e.g. for listen sockets.
+ */
+#ifdef CONFIG_INET
+void sock_pfree(struct sk_buff *skb)
{
- kuid_t uid;
+ struct sock *sk = skb->sk;
- read_lock_bh(&sk->sk_callback_lock);
- uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
- read_unlock_bh(&sk->sk_callback_lock);
- return uid;
-}
-EXPORT_SYMBOL(sock_i_uid);
+ if (!sk_is_refcounted(sk))
+ return;
-unsigned long sock_i_ino(struct sock *sk)
-{
- unsigned long ino;
+ if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
+ inet_reqsk(sk)->rsk_listener = NULL;
+ reqsk_free(inet_reqsk(sk));
+ return;
+ }
- read_lock_bh(&sk->sk_callback_lock);
- ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
- read_unlock_bh(&sk->sk_callback_lock);
- return ino;
+ sock_gen_put(sk);
}
-EXPORT_SYMBOL(sock_i_ino);
+EXPORT_SYMBOL(sock_pfree);
+#endif /* CONFIG_INET */
/*
* Allocate a skb from the socket's send buffer.
@@ -1930,8 +2823,10 @@ EXPORT_SYMBOL(sock_i_ino);
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
gfp_t priority)
{
- if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
+ if (force ||
+ refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
struct sk_buff *skb = alloc_skb(size, priority);
+
if (skb) {
skb_set_owner_w(skb, sk);
return skb;
@@ -1955,7 +2850,7 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
- sysctl_optmem_max)
+ READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
return NULL;
skb = alloc_skb(size, priority);
@@ -1973,8 +2868,10 @@ struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
*/
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
- if ((unsigned int)size <= sysctl_optmem_max &&
- atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
+ int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
+
+ if ((unsigned int)size <= optmem_max &&
+ atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
void *mem;
/* First do the add, to avoid the race if kmalloc
* might sleep.
@@ -1989,6 +2886,22 @@ void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
}
EXPORT_SYMBOL(sock_kmalloc);
+/*
+ * Duplicate the input "src" memory block using the socket's
+ * option memory buffer.
+ */
+void *sock_kmemdup(struct sock *sk, const void *src,
+ int size, gfp_t priority)
+{
+ void *mem;
+
+ mem = sock_kmalloc(sk, size, priority);
+ if (mem)
+ memcpy(mem, src, size);
+ return mem;
+}
+EXPORT_SYMBOL(sock_kmemdup);
+
/* Free an option memory block. Note, we actually want the inline
* here as this allows gcc to detect the nullify and fold away the
* condition entirely.
@@ -1999,7 +2912,7 @@ static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
if (WARN_ON_ONCE(!mem))
return;
if (nullify)
- kzfree(mem);
+ kfree_sensitive(mem);
else
kfree(mem);
atomic_sub(size, &sk->sk_omem_alloc);
@@ -2032,11 +2945,11 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
break;
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
+ if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
break;
- if (sk->sk_shutdown & SEND_SHUTDOWN)
+ if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
break;
- if (sk->sk_err)
+ if (READ_ONCE(sk->sk_err))
break;
timeo = schedule_timeout(timeo);
}
@@ -2064,10 +2977,10 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
goto failure;
err = -EPIPE;
- if (sk->sk_shutdown & SEND_SHUTDOWN)
+ if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
goto failure;
- if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
+ if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
break;
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -2093,27 +3006,24 @@ failure:
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
-struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
- int noblock, int *errcode)
-{
- return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
-}
-EXPORT_SYMBOL(sock_alloc_send_skb);
-
-int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
+int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
struct sockcm_cookie *sockc)
{
u32 tsflags;
+ BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
+
switch (cmsg->cmsg_type) {
case SO_MARK:
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
return -EPERM;
if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
return -EINVAL;
sockc->mark = *(u32 *)CMSG_DATA(cmsg);
break;
- case SO_TIMESTAMPING:
+ case SO_TIMESTAMPING_OLD:
+ case SO_TIMESTAMPING_NEW:
if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
return -EINVAL;
@@ -2131,10 +3041,33 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
return -EINVAL;
sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
break;
+ case SCM_TS_OPT_ID:
+ if (sk_is_tcp(sk))
+ return -EINVAL;
+ tsflags = READ_ONCE(sk->sk_tsflags);
+ if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
+ return -EINVAL;
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
+ sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
+ break;
/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
case SCM_RIGHTS:
case SCM_CREDENTIALS:
break;
+ case SO_PRIORITY:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
+ return -EPERM;
+ sockc->priority = *(u32 *)CMSG_DATA(cmsg);
+ break;
+ case SCM_DEVMEM_DMABUF:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
+ break;
default:
return -EINVAL;
}
@@ -2153,7 +3086,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
return -EINVAL;
if (cmsg->cmsg_level != SOL_SOCKET)
continue;
- ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
+ ret = __sock_cmsg_send(sk, cmsg, sockc);
if (ret)
return ret;
}
@@ -2172,17 +3105,17 @@ static void sk_enter_memory_pressure(struct sock *sk)
static void sk_leave_memory_pressure(struct sock *sk)
{
if (sk->sk_prot->leave_memory_pressure) {
- sk->sk_prot->leave_memory_pressure(sk);
+ INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
+ tcp_leave_memory_pressure, sk);
} else {
unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
- if (memory_pressure && *memory_pressure)
- *memory_pressure = 0;
+ if (memory_pressure && READ_ONCE(*memory_pressure))
+ WRITE_ONCE(*memory_pressure, 0);
}
}
-/* On 32bit arches, an skb frag is limited to 2^15 */
-#define SKB_FRAG_PAGE_ORDER get_order(32768)
+DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
/**
* skb_page_frag_refill - check that a page_frag contains enough room
@@ -2207,7 +3140,8 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
}
pfrag->offset = 0;
- if (SKB_FRAG_PAGE_ORDER) {
+ if (SKB_FRAG_PAGE_ORDER &&
+ !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
/* Avoid direct reclaim but allow kswapd to wake */
pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
__GFP_COMP | __GFP_NOWARN |
@@ -2232,74 +3166,16 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
return true;
- sk_enter_memory_pressure(sk);
+ if (!sk->sk_bypass_prot_mem)
+ sk_enter_memory_pressure(sk);
+
sk_stream_moderate_sndbuf(sk);
+
return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
-int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
- int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
- int first_coalesce)
-{
- int sg_curr = *sg_curr_index, use = 0, rc = 0;
- unsigned int size = *sg_curr_size;
- struct page_frag *pfrag;
- struct scatterlist *sge;
-
- len -= size;
- pfrag = sk_page_frag(sk);
-
- while (len > 0) {
- unsigned int orig_offset;
-
- if (!sk_page_frag_refill(sk, pfrag)) {
- rc = -ENOMEM;
- goto out;
- }
-
- use = min_t(int, len, pfrag->size - pfrag->offset);
-
- if (!sk_wmem_schedule(sk, use)) {
- rc = -ENOMEM;
- goto out;
- }
-
- sk_mem_charge(sk, use);
- size += use;
- orig_offset = pfrag->offset;
- pfrag->offset += use;
-
- sge = sg + sg_curr - 1;
- if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
- sge->offset + sge->length == orig_offset) {
- sge->length += use;
- } else {
- sge = sg + sg_curr;
- sg_unmark_end(sge);
- sg_set_page(sge, pfrag->page, use, orig_offset);
- get_page(pfrag->page);
- sg_curr++;
-
- if (sg_curr == MAX_SKB_FRAGS)
- sg_curr = 0;
-
- if (sg_curr == sg_start) {
- rc = -ENOSPC;
- break;
- }
- }
-
- len -= use;
- }
-out:
- *sg_curr_size = size;
- *sg_curr_index = sg_curr;
- return rc;
-}
-EXPORT_SYMBOL(sk_alloc_sg);
-
-static void __lock_sock(struct sock *sk)
+void __lock_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
{
@@ -2317,28 +3193,32 @@ static void __lock_sock(struct sock *sk)
finish_wait(&sk->sk_lock.wq, &wait);
}
-static void __release_sock(struct sock *sk)
+void __release_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
{
struct sk_buff *skb, *next;
+ int nb = 0;
while ((skb = sk->sk_backlog.head) != NULL) {
sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
spin_unlock_bh(&sk->sk_lock.slock);
- do {
+ while (1) {
next = skb->next;
prefetch(next);
- WARN_ON_ONCE(skb_dst_is_noref(skb));
- skb->next = NULL;
+ DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
+ skb_mark_not_on_list(skb);
sk_backlog_rcv(sk, skb);
- cond_resched();
-
skb = next;
- } while (skb != NULL);
+ if (!skb)
+ break;
+
+ if (!(++nb & 15))
+ cond_resched();
+ }
spin_lock_bh(&sk->sk_lock.slock);
}
@@ -2354,8 +3234,14 @@ void __sk_flush_backlog(struct sock *sk)
{
spin_lock_bh(&sk->sk_lock.slock);
__release_sock(sk);
+
+ if (sk->sk_prot->release_cb)
+ INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
+ tcp_release_cb, sk);
+
spin_unlock_bh(&sk->sk_lock.slock);
}
+EXPORT_SYMBOL_GPL(__sk_flush_backlog);
/**
* sk_wait_data - wait for data to arrive at sk_receive_queue
@@ -2389,17 +3275,34 @@ EXPORT_SYMBOL(sk_wait_data);
* @amt: pages to allocate
* @kind: allocation type
*
- * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
+ * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
+ *
+ * Unlike the globally shared limits among the sockets under the same
+ * protocol, consuming the budget of a memcg won't have a direct effect on
+ * other memcgs. So be optimistic about memcg's tolerance, and leave it to
+ * the callers to decide, via sk_under_memory_pressure() or its variants,
+ * whether or not to raise the allocation.
*/
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
+ bool memcg_enabled = false, charged = false;
struct proto *prot = sk->sk_prot;
- long allocated = sk_memory_allocated_add(sk, amt);
- bool charged = true;
+ long allocated = 0;
- if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
- !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
- goto suppress_allocation;
+ if (!sk->sk_bypass_prot_mem) {
+ sk_memory_allocated_add(sk, amt);
+ allocated = sk_memory_allocated(sk);
+ }
+
+ if (mem_cgroup_sk_enabled(sk)) {
+ memcg_enabled = true;
+ charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
+ if (!charged)
+ goto suppress_allocation;
+ }
+
+ if (!allocated)
+ return 1;
/* Under limit. */
if (allocated <= sk_prot_mem_limits(sk, 0)) {
@@ -2415,7 +3318,14 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
if (allocated > sk_prot_mem_limits(sk, 2))
goto suppress_allocation;
- /* guarantee minimum buffer size under pressure */
+ /* Guarantee minimum buffer size under pressure (either global
+ * or memcg) to make sure features described in RFC 7323 (TCP
+ * Extensions for High Performance) work properly.
+ *
+ * This rule does NOT apply once the global or memcg hard limit
+ * is exceeded; otherwise a DoS attack could be mounted by spawning
+ * lots of sockets whose usage is under the minimum buffer size.
+ */
if (kind == SK_MEM_RECV) {
if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
return 1;
@@ -2432,10 +3342,19 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
}
if (sk_has_memory_pressure(sk)) {
- int alloc;
+ u64 alloc;
- if (!sk_under_memory_pressure(sk))
+ /* The following 'average' heuristic is within the
+ * scope of global accounting, so it only makes
+ * sense for global memory pressure.
+ */
+ if (!sk_under_global_memory_pressure(sk))
return 1;
+
+ /* Try to be fair among all the sockets under global
+ * pressure by allowing the ones with below-average
+ * usage to raise.
+ */
alloc = sk_sockets_allocated_read_positive(sk);
if (sk_prot_mem_limits(sk, 2) > alloc *
sk_mem_pages(sk->sk_wmem_queued +
@@ -2452,21 +3371,25 @@ suppress_allocation:
/* Fail only if socket is _under_ its sndbuf.
* In this case we cannot block, so that we have to fail.
*/
- if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
+ /* Force charge with __GFP_NOFAIL */
+ if (memcg_enabled && !charged)
+ mem_cgroup_sk_charge(sk, amt,
+ gfp_memcg_charge() | __GFP_NOFAIL);
return 1;
+ }
}
- if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
- trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
+ trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
- sk_memory_allocated_sub(sk, amt);
+ if (allocated)
+ sk_memory_allocated_sub(sk, amt);
- if (mem_cgroup_sockets_enabled && sk->sk_memcg)
- mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
+ if (charged)
+ mem_cgroup_sk_uncharge(sk, amt);
return 0;
}
-EXPORT_SYMBOL(__sk_mem_raise_allocated);
/**
* __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
@@ -2482,10 +3405,10 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
int ret, amt = sk_mem_pages(size);
- sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
+ sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
ret = __sk_mem_raise_allocated(sk, size, amt, kind);
if (!ret)
- sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
+ sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
return ret;
}
EXPORT_SYMBOL(__sk_mem_schedule);
@@ -2499,33 +3422,53 @@ EXPORT_SYMBOL(__sk_mem_schedule);
*/
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
- sk_memory_allocated_sub(sk, amount);
+ if (mem_cgroup_sk_enabled(sk))
+ mem_cgroup_sk_uncharge(sk, amount);
- if (mem_cgroup_sockets_enabled && sk->sk_memcg)
- mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
+ if (sk->sk_bypass_prot_mem)
+ return;
+
+ sk_memory_allocated_sub(sk, amount);
- if (sk_under_memory_pressure(sk) &&
+ if (sk_under_global_memory_pressure(sk) &&
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
sk_leave_memory_pressure(sk);
}
-EXPORT_SYMBOL(__sk_mem_reduce_allocated);
/**
* __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
* @sk: socket
- * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
+ * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
*/
void __sk_mem_reclaim(struct sock *sk, int amount)
{
- amount >>= SK_MEM_QUANTUM_SHIFT;
- sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+ amount >>= PAGE_SHIFT;
+ sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
__sk_mem_reduce_allocated(sk, amount);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
+void __sk_charge(struct sock *sk, gfp_t gfp)
+{
+ int amt;
+
+ gfp |= __GFP_NOFAIL;
+ if (mem_cgroup_from_sk(sk)) {
+ /* The socket has not been accepted yet, no need
+ * to look at sk->sk_wmem_queued.
+ */
+ amt = sk_mem_pages(sk->sk_forward_alloc +
+ atomic_read(&sk->sk_rmem_alloc));
+ if (amt)
+ mem_cgroup_sk_charge(sk, amt, gfp);
+ }
+
+ kmem_cache_charge(sk, gfp);
+}
+
int sk_set_peek_off(struct sock *sk, int val)
{
- sk->sk_peek_off = val;
+ WRITE_ONCE(sk->sk_peek_off, val);
return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);
@@ -2537,13 +3480,13 @@ EXPORT_SYMBOL_GPL(sk_set_peek_off);
* function, some default processing is provided.
*/
-int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
+int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len)
{
return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);
-int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
+int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr,
int len, int flags)
{
return -EOPNOTSUPP;
@@ -2556,8 +3499,8 @@ int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
}
EXPORT_SYMBOL(sock_no_socketpair);
-int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
- bool kern)
+int sock_no_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
return -EOPNOTSUPP;
}
@@ -2588,20 +3531,6 @@ int sock_no_shutdown(struct socket *sock, int how)
}
EXPORT_SYMBOL(sock_no_shutdown);
-int sock_no_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- return -EOPNOTSUPP;
-}
-EXPORT_SYMBOL(sock_no_setsockopt);
-
-int sock_no_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- return -EOPNOTSUPP;
-}
-EXPORT_SYMBOL(sock_no_getsockopt);
-
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
return -EOPNOTSUPP;
@@ -2628,35 +3557,20 @@ int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *
}
EXPORT_SYMBOL(sock_no_mmap);
-ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
-{
- ssize_t res;
- struct msghdr msg = {.msg_flags = flags};
- struct kvec iov;
- char *kaddr = kmap(page);
- iov.iov_base = kaddr + offset;
- iov.iov_len = size;
- res = kernel_sendmsg(sock, &msg, &iov, 1, size);
- kunmap(page);
- return res;
-}
-EXPORT_SYMBOL(sock_no_sendpage);
-
-ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
- int offset, size_t size, int flags)
+/*
+ * When a file is received (via SCM_RIGHTS, etc), we must bump the
+ * various sock-based usage counts.
+ */
+void __receive_sock(struct file *file)
{
- ssize_t res;
- struct msghdr msg = {.msg_flags = flags};
- struct kvec iov;
- char *kaddr = kmap(page);
+ struct socket *sock;
- iov.iov_base = kaddr + offset;
- iov.iov_len = size;
- res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
- kunmap(page);
- return res;
+ sock = sock_from_file(file);
+ if (sock) {
+ sock_update_netprioidx(&sock->sk->sk_cgrp_data);
+ sock_update_classid(&sock->sk->sk_cgrp_data);
+ }
}
-EXPORT_SYMBOL(sock_no_sendpage_locked);
/*
* Default Socket Callbacks
@@ -2681,20 +3595,22 @@ static void sock_def_error_report(struct sock *sk)
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_poll(&wq->wait, EPOLLERR);
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
+ sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
rcu_read_unlock();
}
-static void sock_def_readable(struct sock *sk)
+void sock_def_readable(struct sock *sk)
{
struct socket_wq *wq;
+ trace_sk_data_ready(sk);
+
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
EPOLLRDNORM | EPOLLRDBAND);
- sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
+ sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock();
}
@@ -2707,20 +3623,42 @@ static void sock_def_write_space(struct sock *sk)
/* Do not wake up a writer until he can make "significant"
* progress. --DaveM
*/
- if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+ if (sock_writeable(sk)) {
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
EPOLLWRNORM | EPOLLWRBAND);
/* Should agree with poll, otherwise some programs break */
- if (sock_writeable(sk))
- sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
rcu_read_unlock();
}
+/* An optimised version of sock_def_write_space(); should only be called
+ * for SOCK_RCU_FREE sockets, inside an RCU read-side section, and after
+ * putting ->sk_wmem_alloc (@wmem_alloc is presumably the resulting value).
+ */
+static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc)
+{
+ /* Do not wake up a writer until he can make "significant"
+ * progress. --DaveM
+ */
+ if (__sock_writeable(sk, wmem_alloc)) {
+ struct socket_wq *wq = rcu_dereference(sk->sk_wq);
+
+ /* the atomic op this barrier follows is the refcount_sub in sock_wfree() */
+ smp_mb__after_atomic();
+ if (wq && waitqueue_active(&wq->wait))
+ wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
+ EPOLLWRNORM | EPOLLWRBAND);
+
+ /* Should agree with poll, otherwise some programs break */
+ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ }
+}
+
static void sock_def_destruct(struct sock *sk)
{
}
@@ -2728,7 +3666,7 @@ static void sock_def_destruct(struct sock *sk)
void sk_send_sigurg(struct sock *sk)
{
if (sk->sk_socket && sk->sk_socket->file)
- if (send_sigurg(&sk->sk_socket->file->f_owner))
+ if (send_sigurg(sk->sk_socket->file))
sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);
@@ -2743,12 +3681,19 @@ EXPORT_SYMBOL(sk_reset_timer);
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
- if (del_timer(timer))
+ if (timer_delete(timer))
__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
-void sock_init_data(struct socket *sock, struct sock *sk)
+void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
+{
+ if (timer_delete_sync(timer))
+ __sock_put(sk);
+}
+EXPORT_SYMBOL(sk_stop_timer_sync);
+
+void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
sk_init_common(sk);
sk->sk_send_head = NULL;
@@ -2756,34 +3701,22 @@ void sock_init_data(struct socket *sock, struct sock *sk)
timer_setup(&sk->sk_timer, NULL, 0);
sk->sk_allocation = GFP_KERNEL;
- sk->sk_rcvbuf = sysctl_rmem_default;
- sk->sk_sndbuf = sysctl_wmem_default;
+ sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
+ sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
sk->sk_state = TCP_CLOSE;
+ sk->sk_use_task_frag = true;
sk_set_socket(sk, sock);
sock_set_flag(sk, SOCK_ZAPPED);
if (sock) {
sk->sk_type = sock->type;
- sk->sk_wq = sock->wq;
+ RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
sock->sk = sk;
- sk->sk_uid = SOCK_INODE(sock)->i_uid;
} else {
- sk->sk_wq = NULL;
- sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0);
+ RCU_INIT_POINTER(sk->sk_wq, NULL);
}
-
- rwlock_init(&sk->sk_callback_lock);
- if (sk->sk_kern_sock)
- lockdep_set_class_and_name(
- &sk->sk_callback_lock,
- af_kern_callback_keys + sk->sk_family,
- af_family_kern_clock_key_strings[sk->sk_family]);
- else
- lockdep_set_class_and_name(
- &sk->sk_callback_lock,
- af_callback_keys + sk->sk_family,
- af_family_clock_key_strings[sk->sk_family]);
+ sk->sk_uid = uid;
sk->sk_state_change = sock_def_wakeup;
sk->sk_data_ready = sock_def_readable;
@@ -2797,48 +3730,61 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_peer_pid = NULL;
sk->sk_peer_cred = NULL;
+ spin_lock_init(&sk->sk_peer_lock);
+
sk->sk_write_pending = 0;
sk->sk_rcvlowat = 1;
sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_stamp = SK_DEFAULT_STAMP;
+#if BITS_PER_LONG==32
+ seqlock_init(&sk->sk_stamp_seq);
+#endif
atomic_set(&sk->sk_zckey, 0);
#ifdef CONFIG_NET_RX_BUSY_POLL
sk->sk_napi_id = 0;
- sk->sk_ll_usec = sysctl_net_busy_read;
+ sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
#endif
- sk->sk_max_pacing_rate = ~0U;
- sk->sk_pacing_rate = ~0U;
- sk->sk_pacing_shift = 10;
+ sk->sk_max_pacing_rate = ~0UL;
+ sk->sk_pacing_rate = ~0UL;
+ WRITE_ONCE(sk->sk_pacing_shift, 10);
sk->sk_incoming_cpu = -1;
sk_rx_queue_clear(sk);
/*
* Before updating sk_refcnt, we must commit prior changes to memory
- * (Documentation/RCU/rculist_nulls.txt for details)
+ * (Documentation/RCU/rculist_nulls.rst for details)
*/
smp_wmb();
refcount_set(&sk->sk_refcnt, 1);
- atomic_set(&sk->sk_drops, 0);
+ sk_drops_reset(sk);
+}
+EXPORT_SYMBOL(sock_init_data_uid);
+
+void sock_init_data(struct socket *sock, struct sock *sk)
+{
+ kuid_t uid = sock ?
+ SOCK_INODE(sock)->i_uid :
+ make_kuid(sock_net(sk)->user_ns, 0);
+
+ sock_init_data_uid(sock, sk, uid);
}
EXPORT_SYMBOL(sock_init_data);
void lock_sock_nested(struct sock *sk, int subclass)
{
+ /* The sk_lock has mutex_lock() semantics here. */
+ mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
+
might_sleep();
spin_lock_bh(&sk->sk_lock.slock);
- if (sk->sk_lock.owned)
+ if (sock_owned_by_user_nocheck(sk))
__lock_sock(sk);
sk->sk_lock.owned = 1;
- spin_unlock(&sk->sk_lock.slock);
- /*
- * The sk_lock has mutex_lock() semantics here:
- */
- mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
- local_bh_enable();
+ spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(lock_sock_nested);
@@ -2848,11 +3794,9 @@ void release_sock(struct sock *sk)
if (sk->sk_backlog.tail)
__release_sock(sk);
- /* Warning : release_cb() might need to release sk ownership,
- * ie call sock_release_ownership(sk) before us.
- */
if (sk->sk_prot->release_cb)
- sk->sk_prot->release_cb(sk);
+ INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
+ tcp_release_cb, sk);
sock_release_ownership(sk);
if (waitqueue_active(&sk->sk_lock.wq))
@@ -2861,75 +3805,78 @@ void release_sock(struct sock *sk)
}
EXPORT_SYMBOL(release_sock);
-/**
- * lock_sock_fast - fast version of lock_sock
- * @sk: socket
- *
- * This version should be used for very small section, where process wont block
- * return false if fast path is taken:
- *
- * sk_lock.slock locked, owned = 0, BH disabled
- *
- * return true if slow path is taken:
- *
- * sk_lock.slock unlocked, owned = 1, BH enabled
- */
-bool lock_sock_fast(struct sock *sk)
+bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
might_sleep();
spin_lock_bh(&sk->sk_lock.slock);
- if (!sk->sk_lock.owned)
+ if (!sock_owned_by_user_nocheck(sk)) {
/*
- * Note : We must disable BH
+ * Fast path return with bottom halves disabled and
+ * sock::sk_lock.slock held.
+ *
+ * The 'mutex' is not contended and holding
+ * sock::sk_lock.slock prevents all other lockers from
+ * proceeding, so the corresponding unlock_sock_fast() can
+ * avoid the slow path of release_sock() completely and
+ * just release slock.
+ *
+ * Semantically this is equivalent to 'acquiring'
+ * the 'mutex', hence the corresponding lockdep
+ * mutex_release() has to happen in the fast path of
+ * unlock_sock_fast().
*/
return false;
+ }
__lock_sock(sk);
sk->sk_lock.owned = 1;
- spin_unlock(&sk->sk_lock.slock);
- /*
- * The sk_lock has mutex_lock() semantics here:
- */
- mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
- local_bh_enable();
+ __acquire(&sk->sk_lock.slock);
+ spin_unlock_bh(&sk->sk_lock.slock);
return true;
}
-EXPORT_SYMBOL(lock_sock_fast);
+EXPORT_SYMBOL(__lock_sock_fast);
-int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
+int sock_gettstamp(struct socket *sock, void __user *userstamp,
+ bool timeval, bool time32)
{
- struct timeval tv;
+ struct sock *sk = sock->sk;
+ struct timespec64 ts;
sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- tv = ktime_to_timeval(sk->sk_stamp);
- if (tv.tv_sec == -1)
+ ts = ktime_to_timespec64(sock_read_timestamp(sk));
+ if (ts.tv_sec == -1)
return -ENOENT;
- if (tv.tv_sec == 0) {
- sk->sk_stamp = ktime_get_real();
- tv = ktime_to_timeval(sk->sk_stamp);
+ if (ts.tv_sec == 0) {
+ ktime_t kt = ktime_get_real();
+ sock_write_timestamp(sk, kt);
+ ts = ktime_to_timespec64(kt);
}
- return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
-}
-EXPORT_SYMBOL(sock_get_timestamp);
-int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
-{
- struct timespec ts;
+ if (timeval)
+ ts.tv_nsec /= 1000; /* from here on tv_nsec carries microseconds */
- sock_enable_timestamp(sk, SOCK_TIMESTAMP);
- ts = ktime_to_timespec(sk->sk_stamp);
- if (ts.tv_sec == -1)
- return -ENOENT;
- if (ts.tv_sec == 0) {
- sk->sk_stamp = ktime_get_real();
- ts = ktime_to_timespec(sk->sk_stamp);
+#ifdef CONFIG_COMPAT_32BIT_TIME
+ if (time32)
+ return put_old_timespec32(&ts, userstamp);
+#endif
+#ifdef CONFIG_SPARC64
+ /* beware of padding in sparc64 timeval: fill a kernel-side copy, then copy it out */
+ if (timeval && !in_compat_syscall()) {
+ struct __kernel_old_timeval tv = { /* kernel stack object, hence no __user */
+ .tv_sec = ts.tv_sec,
+ .tv_usec = ts.tv_nsec,
+ };
+ if (copy_to_user(userstamp, &tv, sizeof(tv)))
+ return -EFAULT;
+ return 0;
}
- return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
+#endif
+ return put_timespec64(&ts, userstamp);
}
-EXPORT_SYMBOL(sock_get_timestampns);
+EXPORT_SYMBOL(sock_gettstamp);
-void sock_enable_timestamp(struct sock *sk, int flag)
+void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
if (!sock_flag(sk, flag)) {
unsigned long previous_flags = sk->sk_flags;
@@ -2987,31 +3934,18 @@ EXPORT_SYMBOL(sock_recv_errqueue);
*
* FIX: POSIX 1003.1g is very ambiguous here. It states that
* asynchronous errors should be reported by getsockopt. We assume
- * this means if you specify SO_ERROR (otherwise whats the point of it).
+ * this means if you specify SO_ERROR (otherwise what is the point of it).
*/
int sock_common_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
- return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);
-#ifdef CONFIG_COMPAT
-int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- struct sock *sk = sock->sk;
-
- if (sk->sk_prot->compat_getsockopt != NULL)
- return sk->sk_prot->compat_getsockopt(sk, level, optname,
- optval, optlen);
- return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
-}
-EXPORT_SYMBOL(compat_sock_common_getsockopt);
-#endif
-
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags)
{
@@ -3019,8 +3953,7 @@ int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int addr_len = 0;
int err;
- err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
- flags & ~MSG_DONTWAIT, &addr_len);
+ err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
if (err >= 0)
msg->msg_namelen = addr_len;
return err;
@@ -3031,35 +3964,22 @@ EXPORT_SYMBOL(sock_common_recvmsg);
* Set socket options on an inet socket.
*/
int sock_common_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
- return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);
-#ifdef CONFIG_COMPAT
-int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- struct sock *sk = sock->sk;
-
- if (sk->sk_prot->compat_setsockopt != NULL)
- return sk->sk_prot->compat_setsockopt(sk, level, optname,
- optval, optlen);
- return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
-}
-EXPORT_SYMBOL(compat_sock_common_setsockopt);
-#endif
-
void sk_common_release(struct sock *sk)
{
if (sk->sk_prot->destroy)
sk->sk_prot->destroy(sk);
/*
- * Observation: when sock_common_release is called, processes have
+ * Observation: when sk_common_release is called, processes have
* no access to socket. But net still has.
* Step one, detach it from networking:
*
@@ -3084,8 +4004,6 @@ void sk_common_release(struct sock *sk)
xfrm_sk_free_policy(sk);
- sk_refcnt_debug_release(sk);
-
sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
@@ -3095,30 +4013,19 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem)
memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
- mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
+ mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
- mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
- mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
- mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
+ mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
+ mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
+ mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
- mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
- mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
+ mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
+ mem[SK_MEMINFO_DROPS] = sk_drops_read(sk);
}
#ifdef CONFIG_PROC_FS
-#define PROTO_INUSE_NR 64 /* should be enough for the first time */
-struct prot_inuse {
- int val[PROTO_INUSE_NR];
-};
-
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
-void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
-{
- __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
-}
-EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
-
int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
int cpu, idx = prot->inuse_idx;
@@ -3131,17 +4038,12 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot)
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
-static void sock_inuse_add(struct net *net, int val)
-{
- this_cpu_add(*net->core.sock_inuse, val);
-}
-
int sock_inuse_get(struct net *net)
{
int cpu, res = 0;
for_each_possible_cpu(cpu)
- res += *per_cpu_ptr(net->core.sock_inuse, cpu);
+ res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
return res;
}
@@ -3153,22 +4055,12 @@ static int __net_init sock_inuse_init_net(struct net *net)
net->core.prot_inuse = alloc_percpu(struct prot_inuse);
if (net->core.prot_inuse == NULL)
return -ENOMEM;
-
- net->core.sock_inuse = alloc_percpu(int);
- if (net->core.sock_inuse == NULL)
- goto out;
-
return 0;
-
-out:
- free_percpu(net->core.prot_inuse);
- return -ENOMEM;
}
static void __net_exit sock_inuse_exit_net(struct net *net)
{
free_percpu(net->core.prot_inuse);
- free_percpu(net->core.sock_inuse);
}
static struct pernet_operations net_inuse_ops = {
@@ -3186,36 +4078,71 @@ static __init int net_inuse_init(void)
core_initcall(net_inuse_init);
-static void assign_proto_idx(struct proto *prot)
+static int assign_proto_idx(struct proto *prot)
{
prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
- if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
+ if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
pr_err("PROTO_INUSE_NR exhausted\n");
- return;
+ return -ENOSPC;
}
set_bit(prot->inuse_idx, proto_inuse_idx);
+ return 0;
}
static void release_proto_idx(struct proto *prot)
{
- if (prot->inuse_idx != PROTO_INUSE_NR - 1)
+ if (prot->inuse_idx != PROTO_INUSE_NR)
clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
-static inline void assign_proto_idx(struct proto *prot)
+static inline int assign_proto_idx(struct proto *prot)
{
+ return 0;
}
static inline void release_proto_idx(struct proto *prot)
{
}
-static void sock_inuse_add(struct net *net, int val)
+#endif
+
+static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
+ if (!twsk_prot)
+ return;
+ kfree(twsk_prot->twsk_slab_name);
+ twsk_prot->twsk_slab_name = NULL;
+ kmem_cache_destroy(twsk_prot->twsk_slab);
+ twsk_prot->twsk_slab = NULL;
+}
+
+static int tw_prot_init(const struct proto *prot)
+{
+ struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
+
+ if (!twsk_prot)
+ return 0;
+
+ twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
+ prot->name);
+ if (!twsk_prot->twsk_slab_name)
+ return -ENOMEM;
+
+ twsk_prot->twsk_slab =
+ kmem_cache_create(twsk_prot->twsk_slab_name,
+ twsk_prot->twsk_obj_size, 0,
+ SLAB_ACCOUNT | prot->slab_flags,
+ NULL);
+ if (!twsk_prot->twsk_slab) {
+ pr_crit("%s: Can't create timewait sock SLAB cache!\n",
+ prot->name);
+ return -ENOMEM;
+ }
+
+ return 0;
}
-#endif
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
@@ -3254,6 +4181,16 @@ static int req_prot_init(const struct proto *prot)
int proto_register(struct proto *prot, int alloc_slab)
{
+ int ret = -ENOBUFS;
+
+ if (prot->memory_allocated && !prot->sysctl_mem) {
+ pr_err("%s: missing sysctl_mem\n", prot->name);
+ return -EINVAL;
+ }
+ if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
+ pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
+ return -EINVAL;
+ }
if (alloc_slab) {
prot->slab = kmem_cache_create_usercopy(prot->name,
prot->obj_size, 0,
@@ -3271,39 +4208,32 @@ int proto_register(struct proto *prot, int alloc_slab)
if (req_prot_init(prot))
goto out_free_request_sock_slab;
- if (prot->twsk_prot != NULL) {
- prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
-
- if (prot->twsk_prot->twsk_slab_name == NULL)
- goto out_free_request_sock_slab;
-
- prot->twsk_prot->twsk_slab =
- kmem_cache_create(prot->twsk_prot->twsk_slab_name,
- prot->twsk_prot->twsk_obj_size,
- 0,
- SLAB_ACCOUNT |
- prot->slab_flags,
- NULL);
- if (prot->twsk_prot->twsk_slab == NULL)
- goto out_free_timewait_sock_slab_name;
- }
+ if (tw_prot_init(prot))
+ goto out_free_timewait_sock_slab;
}
mutex_lock(&proto_list_mutex);
+ ret = assign_proto_idx(prot);
+ if (ret) {
+ mutex_unlock(&proto_list_mutex);
+ goto out_free_timewait_sock_slab;
+ }
list_add(&prot->node, &proto_list);
- assign_proto_idx(prot);
mutex_unlock(&proto_list_mutex);
- return 0;
+ return ret;
-out_free_timewait_sock_slab_name:
- kfree(prot->twsk_prot->twsk_slab_name);
+out_free_timewait_sock_slab:
+ if (alloc_slab)
+ tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
- req_prot_cleanup(prot->rsk_prot);
+ if (alloc_slab) {
+ req_prot_cleanup(prot->rsk_prot);
- kmem_cache_destroy(prot->slab);
- prot->slab = NULL;
+ kmem_cache_destroy(prot->slab);
+ prot->slab = NULL;
+ }
out:
- return -ENOBUFS;
+ return ret;
}
EXPORT_SYMBOL(proto_register);
@@ -3318,12 +4248,7 @@ void proto_unregister(struct proto *prot)
prot->slab = NULL;
req_prot_cleanup(prot->rsk_prot);
-
- if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
- kmem_cache_destroy(prot->twsk_prot->twsk_slab);
- kfree(prot->twsk_prot->twsk_slab_name);
- prot->twsk_prot->twsk_slab = NULL;
- }
+ tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
@@ -3339,6 +4264,8 @@ int sock_load_diag_module(int family, int protocol)
#ifdef CONFIG_INET
if (family == AF_INET &&
+ protocol != IPPROTO_RAW &&
+ protocol < MAX_INET_PROTOS &&
!rcu_access_pointer(inet_protos[protocol]))
return -ENOENT;
#endif
@@ -3376,7 +4303,7 @@ static long sock_prot_memory_allocated(struct proto *proto)
return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}
-static char *sock_prot_memory_pressure(struct proto *proto)
+static const char *sock_prot_memory_pressure(struct proto *proto)
{
return proto->memory_pressure != NULL ?
proto_memory_pressure(proto) ? "yes" : "no" : "NI";
@@ -3386,7 +4313,7 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
- "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
+ "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
proto->name,
proto->obj_size,
sock_prot_inuse_get(seq_file_net(seq), proto),
@@ -3407,7 +4334,6 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
proto_method_implemented(proto->getsockopt),
proto_method_implemented(proto->sendmsg),
proto_method_implemented(proto->recvmsg),
- proto_method_implemented(proto->sendpage),
proto_method_implemented(proto->bind),
proto_method_implemented(proto->backlog_rcv),
proto_method_implemented(proto->hash),
@@ -3428,7 +4354,7 @@ static int proto_seq_show(struct seq_file *seq, void *v)
"maxhdr",
"slab",
"module",
- "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
+ "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
else
proto_seq_printf(seq, list_entry(v, struct proto, node));
return 0;
@@ -3475,8 +4401,149 @@ bool sk_busy_loop_end(void *p, unsigned long start_time)
{
struct sock *sk = p;
- return !skb_queue_empty(&sk->sk_receive_queue) ||
- sk_busy_loop_timeout(sk, start_time);
+ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
+ return true;
+
+ if (sk_is_udp(sk) &&
+ !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
+ return true;
+
+ return sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len)
+{
+ if (!sk->sk_prot->bind_add)
+ return -EOPNOTSUPP;
+ return sk->sk_prot->bind_add(sk, addr, addr_len);
+}
+EXPORT_SYMBOL(sock_bind_add);
+
+/* Copy 'size' bytes from @arg into @karg, run the protocol ioctl on @karg, then copy 'size' bytes back to @arg */
+int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
+ void __user *arg, void *karg, size_t size)
+{
+ int ret;
+
+ if (copy_from_user(karg, arg, size))
+ return -EFAULT;
+
+ ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
+ if (ret) /* non-zero result: nothing is copied back */
+ return ret;
+
+ if (copy_to_user(arg, karg, size))
+ return -EFAULT;
+
+ return 0;
+}
+EXPORT_SYMBOL(sock_ioctl_inout);
+
+/* This is the most common ioctl prep function, where the result (4 bytes) is
+ * copied back to userspace if the ioctl() returns successfully. No input is
+ * copied from userspace as input argument.
+ */
+static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ int ret, karg = 0;
+
+ ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
+ if (ret)
+ return ret;
+
+ return put_user(karg, (int __user *)arg);
+}
+
+/* A wrapper around sock ioctls, which copies the data from userspace
+ * (depending on the protocol/ioctl), and copies back the result to userspace.
+ * The main motivation for this function is to pass kernel memory to the
+ * protocol ioctl callbacks, instead of userspace memory.
+ */
+int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ int rc = 1; /* > 0 means "not handled": fall through to the default handler */
+
+ if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
+ rc = ipmr_sk_ioctl(sk, cmd, arg);
+ else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
+ rc = ip6mr_sk_ioctl(sk, cmd, arg);
+ else if (sk_is_phonet(sk))
+ rc = phonet_sk_ioctl(sk, cmd, arg);
+
+ /* rc <= 0: the ioctl was processed by a helper above, return its value */
+ if (rc <= 0)
+ return rc;
+
+ /* Otherwise call the default handler */
+ return sock_ioctl_out(sk, cmd, arg);
+}
+EXPORT_SYMBOL(sk_ioctl);
+
+static int __init sock_struct_check(void)
+{
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
+
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
+
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
+#ifdef CONFIG_MEMCG
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
+#endif
+
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
+
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
+
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
+ return 0;
+}
+
+core_initcall(sock_struct_check);
diff --git a/net/core/sock_destructor.h b/net/core/sock_destructor.h
new file mode 100644
index 000000000000..2f396e6bfba5
--- /dev/null
+++ b/net/core/sock_destructor.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _NET_CORE_SOCK_DESTRUCTOR_H
+#define _NET_CORE_SOCK_DESTRUCTOR_H
+#include <net/tcp.h>
+
+static inline bool is_skb_wmem(const struct sk_buff *skb)
+{
+ return skb->destructor == sock_wfree ||
+ skb->destructor == __sock_wfree ||
+ (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree);
+}
+#endif
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 3312a5849a97..026ce9bd9e5e 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -1,5 +1,6 @@
/* License: GPL */
+#include <linux/filter.h>
#include <linux/mutex.h>
#include <linux/socket.h>
#include <linux/skbuff.h>
@@ -11,25 +12,31 @@
#include <linux/tcp.h>
#include <linux/workqueue.h>
#include <linux/nospec.h>
-
+#include <linux/cookie.h>
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>
-static const struct sock_diag_handler *sock_diag_handlers[AF_MAX];
-static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
-static DEFINE_MUTEX(sock_diag_table_mutex);
+static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX];
+
+static const struct sock_diag_inet_compat __rcu *inet_rcv_compat;
+
static struct workqueue_struct *broadcast_wq;
-u64 sock_gen_cookie(struct sock *sk)
+DEFINE_COOKIE(sock_cookie);
+
+u64 __sock_gen_cookie(struct sock *sk)
{
- while (1) {
- u64 res = atomic64_read(&sk->sk_cookie);
+ u64 res = atomic64_read(&sk->sk_cookie);
- if (res)
- return res;
- res = atomic64_inc_return(&sock_net(sk)->cookie_gen);
- atomic64_cmpxchg(&sk->sk_cookie, 0, res);
+ if (!res) {
+ u64 new = gen_cookie_next(&sock_cookie);
+
+ atomic64_cmpxchg(&sk->sk_cookie, res, new);
+
+ /* Another thread might have changed sk_cookie before us. */
+ res = atomic64_read(&sk->sk_cookie);
}
+ return res;
}
int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie)
@@ -116,6 +123,24 @@ static size_t sock_diag_nlmsg_size(void)
+ nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */
}
+/* Look up the handler registered for @family and pin its owning module.
+ *
+ * The table slot is read under rcu_read_lock(); the module reference is
+ * taken before leaving the read-side section. Returns the handler with a
+ * module reference held, or NULL when no handler is registered or
+ * try_module_get() fails. A non-NULL result must be released with
+ * sock_diag_unlock_handler().
+ *
+ * @family indexes sock_diag_handlers[] directly; no bounds check is done
+ * here.
+ */
+static const struct sock_diag_handler *sock_diag_lock_handler(int family)
+{
+ const struct sock_diag_handler *handler;
+
+ rcu_read_lock();
+ handler = rcu_dereference(sock_diag_handlers[family]);
+ if (handler && !try_module_get(handler->owner))
+ handler = NULL;
+ rcu_read_unlock();
+
+ return handler;
+}
+
+/* Drop the module reference taken by sock_diag_lock_handler().
+ * @handler is dereferenced unconditionally and must not be NULL.
+ */
+static void sock_diag_unlock_handler(const struct sock_diag_handler *handler)
+{
+ module_put(handler->owner);
+}
+
static void sock_diag_broadcast_destroy_work(struct work_struct *work)
{
struct broadcast_sk *bsk =
@@ -132,12 +157,12 @@ static void sock_diag_broadcast_destroy_work(struct work_struct *work)
if (!skb)
goto out;
- mutex_lock(&sock_diag_table_mutex);
- hndl = sock_diag_handlers[sk->sk_family];
- if (hndl && hndl->get_info)
- err = hndl->get_info(skb, sk);
- mutex_unlock(&sock_diag_table_mutex);
-
+ hndl = sock_diag_lock_handler(sk->sk_family);
+ if (hndl) {
+ if (hndl->get_info)
+ err = hndl->get_info(skb, sk);
+ sock_diag_unlock_handler(hndl);
+ }
if (!err)
nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group,
GFP_KERNEL);
@@ -160,51 +185,43 @@ void sock_diag_broadcast_destroy(struct sock *sk)
queue_work(broadcast_wq, &bsk->work);
}
-void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr)
{
- mutex_lock(&sock_diag_table_mutex);
- inet_rcv_compat = fn;
- mutex_unlock(&sock_diag_table_mutex);
+ xchg(&inet_rcv_compat, RCU_INITIALIZER(ptr));
}
EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);
-void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr)
{
- mutex_lock(&sock_diag_table_mutex);
- inet_rcv_compat = NULL;
- mutex_unlock(&sock_diag_table_mutex);
+ const struct sock_diag_inet_compat *old;
+
+ old = unrcu_pointer(xchg(&inet_rcv_compat, NULL));
+ WARN_ON_ONCE(old != ptr);
}
EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);
int sock_diag_register(const struct sock_diag_handler *hndl)
{
- int err = 0;
+ int family = hndl->family;
- if (hndl->family >= AF_MAX)
+ if (family >= AF_MAX)
return -EINVAL;
- mutex_lock(&sock_diag_table_mutex);
- if (sock_diag_handlers[hndl->family])
- err = -EBUSY;
- else
- sock_diag_handlers[hndl->family] = hndl;
- mutex_unlock(&sock_diag_table_mutex);
-
- return err;
+ return !cmpxchg((const struct sock_diag_handler **)
+ &sock_diag_handlers[family],
+ NULL, hndl) ? 0 : -EBUSY;
}
EXPORT_SYMBOL_GPL(sock_diag_register);
-void sock_diag_unregister(const struct sock_diag_handler *hnld)
+void sock_diag_unregister(const struct sock_diag_handler *hndl)
{
- int family = hnld->family;
+ int family = hndl->family;
if (family >= AF_MAX)
return;
- mutex_lock(&sock_diag_table_mutex);
- BUG_ON(sock_diag_handlers[family] != hnld);
- sock_diag_handlers[family] = NULL;
- mutex_unlock(&sock_diag_table_mutex);
+ xchg((const struct sock_diag_handler **)&sock_diag_handlers[family],
+ NULL);
}
EXPORT_SYMBOL_GPL(sock_diag_unregister);
@@ -221,20 +238,20 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EINVAL;
req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX);
- if (sock_diag_handlers[req->sdiag_family] == NULL)
+ if (!rcu_access_pointer(sock_diag_handlers[req->sdiag_family]))
sock_load_diag_module(req->sdiag_family, 0);
- mutex_lock(&sock_diag_table_mutex);
- hndl = sock_diag_handlers[req->sdiag_family];
+ hndl = sock_diag_lock_handler(req->sdiag_family);
if (hndl == NULL)
- err = -ENOENT;
- else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
+ return -ENOENT;
+
+ if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
err = hndl->dump(skb, nlh);
else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy)
err = hndl->destroy(skb, nlh);
else
err = -EOPNOTSUPP;
- mutex_unlock(&sock_diag_table_mutex);
+ sock_diag_unlock_handler(hndl);
return err;
}
@@ -242,20 +259,25 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ const struct sock_diag_inet_compat *ptr;
int ret;
switch (nlh->nlmsg_type) {
case TCPDIAG_GETSOCK:
- case DCCPDIAG_GETSOCK:
- if (inet_rcv_compat == NULL)
+ if (!rcu_access_pointer(inet_rcv_compat))
sock_load_diag_module(AF_INET, 0);
- mutex_lock(&sock_diag_table_mutex);
- if (inet_rcv_compat != NULL)
- ret = inet_rcv_compat(skb, nlh);
- else
- ret = -EOPNOTSUPP;
- mutex_unlock(&sock_diag_table_mutex);
+ rcu_read_lock();
+ ptr = rcu_dereference(inet_rcv_compat);
+ if (ptr && !try_module_get(ptr->owner))
+ ptr = NULL;
+ rcu_read_unlock();
+
+ ret = -EOPNOTSUPP;
+ if (ptr) {
+ ret = ptr->fn(skb, nlh);
+ module_put(ptr->owner);
+ }
return ret;
case SOCK_DIAG_BY_FAMILY:
@@ -266,13 +288,9 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
}
}
-static DEFINE_MUTEX(sock_diag_mutex);
-
static void sock_diag_rcv(struct sk_buff *skb)
{
- mutex_lock(&sock_diag_mutex);
netlink_rcv_skb(skb, &sock_diag_rcv_msg);
- mutex_unlock(&sock_diag_mutex);
}
static int sock_diag_bind(struct net *net, int group)
@@ -280,12 +298,12 @@ static int sock_diag_bind(struct net *net, int group)
switch (group) {
case SKNLGRP_INET_TCP_DESTROY:
case SKNLGRP_INET_UDP_DESTROY:
- if (!sock_diag_handlers[AF_INET])
+ if (!rcu_access_pointer(sock_diag_handlers[AF_INET]))
sock_load_diag_module(AF_INET, 0);
break;
case SKNLGRP_INET6_TCP_DESTROY:
case SKNLGRP_INET6_UDP_DESTROY:
- if (!sock_diag_handlers[AF_INET6])
+ if (!rcu_access_pointer(sock_diag_handlers[AF_INET6]))
sock_load_diag_module(AF_INET6, 0);
break;
}
@@ -330,7 +348,7 @@ static struct pernet_operations diag_net_ops = {
static int __init sock_diag_init(void)
{
- broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0);
+ broadcast_wq = alloc_workqueue("sock_diag_events", WQ_PERCPU, 0);
BUG_ON(!broadcast_wq);
return register_pernet_subsys(&diag_net_ops);
}
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
new file mode 100644
index 000000000000..5947b38e4f8b
--- /dev/null
+++ b/net/core/sock_map.c
@@ -0,0 +1,1959 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/net.h>
+#include <linux/workqueue.h>
+#include <linux/skmsg.h>
+#include <linux/list.h>
+#include <linux/jhash.h>
+#include <linux/sock_diag.h>
+#include <net/udp.h>
+
+/* Array-backed sockmap: one socket pointer per u32 key. */
+struct bpf_stab {
+ struct bpf_map map; /* generic map; container_of() recovers the stab */
+ struct sock **sks; /* max_entries slots, indexed by key */
+ struct sk_psock_progs progs; /* programs attached to this map */
+ spinlock_t lock; /* held (BH-safe) while a slot is updated or cleared */
+};
+
+/* map_flags accepted at creation time; sock_map_alloc() rejects any bit
+ * outside this mask with -EINVAL.
+ */
+#define SOCK_CREATE_FLAG_MASK \
+ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
+/* This mutex is used to
+ * - protect against races between prog/link attach/detach and link prog
+ *   update, and
+ * - protect against races between releasing and accessing the map in
+ *   bpf_link.
+ * A single global mutex is used since contention is expected to be low.
+ */
+static DEFINE_MUTEX(sockmap_mutex);
+
+/* Forward declarations for helpers that are called before their definition. */
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+ struct bpf_prog *old, struct bpf_link *link,
+ u32 which);
+static struct sk_psock_progs *sock_map_progs(struct bpf_map *map);
+
+/* Allocate and initialise a sockmap described by @attr.
+ *
+ * The map must have at least one entry, a 4-byte key, a 4- or 8-byte value
+ * and no map_flags outside SOCK_CREATE_FLAG_MASK; otherwise ERR_PTR(-EINVAL)
+ * is returned. Allocation failures yield ERR_PTR(-ENOMEM). On success the
+ * embedded struct bpf_map is returned.
+ */
+static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_stab *stab;
+	bool value_size_ok;
+
+	value_size_ok = attr->value_size == sizeof(u32) ||
+			attr->value_size == sizeof(u64);
+	if (!attr->max_entries || attr->key_size != 4 || !value_size_ok ||
+	    (attr->map_flags & ~SOCK_CREATE_FLAG_MASK))
+		return ERR_PTR(-EINVAL);
+
+	stab = bpf_map_area_alloc(sizeof(*stab), NUMA_NO_NODE);
+	if (!stab)
+		return ERR_PTR(-ENOMEM);
+
+	bpf_map_init_from_attr(&stab->map, attr);
+	spin_lock_init(&stab->lock);
+
+	/* Widen before multiplying so the slot-array size cannot wrap in 32 bits. */
+	stab->sks = bpf_map_area_alloc((u64)stab->map.max_entries *
+				       sizeof(struct sock *),
+				       stab->map.numa_node);
+	if (stab->sks)
+		return &stab->map;
+
+	bpf_map_area_free(stab);
+	return ERR_PTR(-ENOMEM);
+}
+
+/* Attach @prog to the map referenced by attr->target_fd for the attach
+ * type given in attr->attach_type.
+ *
+ * Returns -EINVAL when attach_flags or replace_bpf_fd is set, the error
+ * encoded in the map pointer when the fd does not resolve to a map, or the
+ * result of sock_map_prog_update(), which runs under sockmap_mutex.
+ */
+int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_map *map;
+ int ret;
+
+ if (attr->attach_flags || attr->replace_bpf_fd)
+ return -EINVAL;
+
+ /* NOTE(review): no explicit fd release follows; presumably CLASS(fd, f)
+ * drops the reference when f leaves scope - confirm in linux/cleanup.h.
+ */
+ CLASS(fd, f)(attr->target_fd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+ mutex_lock(&sockmap_mutex);
+ ret = sock_map_prog_update(map, prog, NULL, NULL, attr->attach_type);
+ mutex_unlock(&sockmap_mutex);
+ return ret;
+}
+
+/* Detach the program referenced by attr->attach_bpf_fd from the map
+ * referenced by attr->target_fd.
+ *
+ * Returns -EINVAL when attach_flags or replace_bpf_fd is set or when the
+ * program's type differs from @ptype, the lookup error when either fd does
+ * not resolve, or the result of sock_map_prog_update(), which runs under
+ * sockmap_mutex. The program reference taken here is always dropped before
+ * returning.
+ */
+int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
+{
+	struct bpf_prog *prog;
+	struct bpf_map *map;
+	int err;
+
+	if (attr->attach_flags || attr->replace_bpf_fd)
+		return -EINVAL;
+
+	CLASS(fd, f)(attr->target_fd);
+	map = __bpf_map_get(f);
+	if (IS_ERR(map))
+		return PTR_ERR(map);
+
+	prog = bpf_prog_get(attr->attach_bpf_fd);
+	if (IS_ERR(prog))
+		return PTR_ERR(prog);
+
+	if (prog->type == ptype) {
+		mutex_lock(&sockmap_mutex);
+		err = sock_map_prog_update(map, NULL, prog, NULL,
+					   attr->attach_type);
+		mutex_unlock(&sockmap_mutex);
+	} else {
+		err = -EINVAL;
+	}
+
+	bpf_prog_put(prog);
+	return err;
+}
+
+/* Take the socket lock, then enter an RCU read-side critical section.
+ * Undone in reverse order by sock_map_sk_release().
+ */
+static void sock_map_sk_acquire(struct sock *sk)
+ __acquires(&sk->sk_lock.slock)
+{
+ lock_sock(sk);
+ rcu_read_lock();
+}
+
+/* Counterpart of sock_map_sk_acquire(): leave the RCU read-side critical
+ * section first, then release the socket lock.
+ */
+static void sock_map_sk_release(struct sock *sk)
+ __releases(&sk->sk_lock.slock)
+{
+ rcu_read_unlock();
+ release_sock(sk);
+}
+
+/* Record on @psock that its socket is referenced from @map.
+ *
+ * @link is filled in with @map and @link_raw and appended to psock->link
+ * under psock->link_lock. Callers in this file pass the address of the
+ * sockmap slot or the sockhash element as @link_raw; sock_map_del_link()
+ * later matches on that value.
+ */
+static void sock_map_add_link(struct sk_psock *psock,
+ struct sk_psock_link *link,
+ struct bpf_map *map, void *link_raw)
+{
+ link->link_raw = link_raw;
+ link->map = map;
+ spin_lock_bh(&psock->link_lock);
+ list_add_tail(&link->list, &psock->link);
+ spin_unlock_bh(&psock->link_lock);
+}
+
+/* Remove from @psock the link whose link_raw equals @link_raw and, if the
+ * owning map has stream parser/verdict programs while the psock has a saved
+ * data_ready callback, stop the corresponding receive hooks on @sk.
+ *
+ * At most one link is removed (the walk stops at the first match). The
+ * decision is taken under psock->link_lock; the hooks are stopped afterwards
+ * under sk->sk_callback_lock.
+ */
+static void sock_map_del_link(struct sock *sk,
+ struct sk_psock *psock, void *link_raw)
+{
+ bool strp_stop = false, verdict_stop = false;
+ struct sk_psock_link *link, *tmp;
+
+ spin_lock_bh(&psock->link_lock);
+ list_for_each_entry_safe(link, tmp, &psock->link, list) {
+ if (link->link_raw == link_raw) {
+ struct bpf_map *map = link->map;
+ struct sk_psock_progs *progs = sock_map_progs(map);
+
+ /* Both verdict flavours map onto the same stop action. */
+ if (psock->saved_data_ready && progs->stream_parser)
+ strp_stop = true;
+ if (psock->saved_data_ready && progs->stream_verdict)
+ verdict_stop = true;
+ if (psock->saved_data_ready && progs->skb_verdict)
+ verdict_stop = true;
+ list_del(&link->list);
+ sk_psock_free_link(link);
+ break;
+ }
+ }
+ spin_unlock_bh(&psock->link_lock);
+ if (strp_stop || verdict_stop) {
+ write_lock_bh(&sk->sk_callback_lock);
+ if (strp_stop)
+ sk_psock_stop_strp(sk, psock);
+ if (verdict_stop)
+ sk_psock_stop_verdict(sk, psock);
+
+ /* Let the protocol refresh its callbacks now that a hook was stopped. */
+ if (psock->psock_update_sk_prot)
+ psock->psock_update_sk_prot(sk, psock, false);
+ write_unlock_bh(&sk->sk_callback_lock);
+ }
+}
+
+/* Drop the map reference identified by @link_raw from @sk: unlink it from
+ * the socket's psock and put one psock reference. Nothing is done when the
+ * socket has no psock.
+ */
+static void sock_map_unref(struct sock *sk, void *link_raw)
+{
+	struct sk_psock *psock = sk_psock(sk);
+
+	if (unlikely(!psock))
+		return;
+
+	sock_map_del_link(sk, psock, link_raw);
+	sk_psock_put(sk, psock);
+}
+
+/* Remember the protocol's psock_update_sk_prot hook in @psock and invoke it
+ * with false as its third argument.
+ *
+ * Returns -EINVAL when the socket's protocol provides no such hook,
+ * otherwise whatever the hook returns.
+ */
+static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock)
+{
+ if (!sk->sk_prot->psock_update_sk_prot)
+ return -EINVAL;
+ psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot;
+ return sk->sk_prot->psock_update_sk_prot(sk, psock, false);
+}
+
+/* Get a reference on @sk's psock, if it has one that sockmap may use.
+ *
+ * Returns NULL when the socket has no psock, ERR_PTR(-EBUSY) when the
+ * socket's close handler is not sock_map_close or the psock refcount has
+ * already dropped to zero, and otherwise the psock with its refcount
+ * incremented. The lookup runs inside its own RCU read-side section.
+ */
+static struct sk_psock *sock_map_psock_get_checked(struct sock *sk)
+{
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (psock) {
+ /* NOTE(review): presumably a psock with a foreign close handler
+ * was not installed by sockmap - confirm.
+ */
+ if (sk->sk_prot->close != sock_map_close) {
+ psock = ERR_PTR(-EBUSY);
+ goto out;
+ }
+
+ if (!refcount_inc_not_zero(&psock->refcnt))
+ psock = ERR_PTR(-EBUSY);
+ }
+out:
+ rcu_read_unlock();
+ return psock;
+}
+
+/* Attach the programs currently installed on @map to @sk's psock, creating
+ * the psock when the socket does not have one yet.
+ *
+ * A reference is taken on each of the map's stream_verdict, stream_parser,
+ * msg_parser and skb_verdict programs that is set. When @sk already has a
+ * psock the call fails with -EBUSY if a program of the same kind (or, for
+ * the two verdict kinds, either verdict program) is already installed.
+ *
+ * Returns 0 on success or a negative errno. On success a psock reference is
+ * left held; callers in this file drop it with sk_psock_put() when they fail
+ * afterwards.
+ */
+static int sock_map_link(struct bpf_map *map, struct sock *sk)
+{
+ struct sk_psock_progs *progs = sock_map_progs(map);
+ struct bpf_prog *stream_verdict = NULL;
+ struct bpf_prog *stream_parser = NULL;
+ struct bpf_prog *skb_verdict = NULL;
+ struct bpf_prog *msg_parser = NULL;
+ struct sk_psock *psock;
+ int ret;
+
+ /* Snapshot each program pointer once and pin it; the error labels at
+ * the bottom release them in reverse order of acquisition.
+ */
+ stream_verdict = READ_ONCE(progs->stream_verdict);
+ if (stream_verdict) {
+ stream_verdict = bpf_prog_inc_not_zero(stream_verdict);
+ if (IS_ERR(stream_verdict))
+ return PTR_ERR(stream_verdict);
+ }
+
+ stream_parser = READ_ONCE(progs->stream_parser);
+ if (stream_parser) {
+ stream_parser = bpf_prog_inc_not_zero(stream_parser);
+ if (IS_ERR(stream_parser)) {
+ ret = PTR_ERR(stream_parser);
+ goto out_put_stream_verdict;
+ }
+ }
+
+ msg_parser = READ_ONCE(progs->msg_parser);
+ if (msg_parser) {
+ msg_parser = bpf_prog_inc_not_zero(msg_parser);
+ if (IS_ERR(msg_parser)) {
+ ret = PTR_ERR(msg_parser);
+ goto out_put_stream_parser;
+ }
+ }
+
+ skb_verdict = READ_ONCE(progs->skb_verdict);
+ if (skb_verdict) {
+ skb_verdict = bpf_prog_inc_not_zero(skb_verdict);
+ if (IS_ERR(skb_verdict)) {
+ ret = PTR_ERR(skb_verdict);
+ goto out_put_msg_parser;
+ }
+ }
+
+ psock = sock_map_psock_get_checked(sk);
+ if (IS_ERR(psock)) {
+ ret = PTR_ERR(psock);
+ goto out_progs;
+ }
+
+ if (psock) {
+ /* Existing psock: refuse to replace a program that is already
+ * installed. The two verdict kinds exclude each other.
+ */
+ if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) ||
+ (stream_parser && READ_ONCE(psock->progs.stream_parser)) ||
+ (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
+ (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) ||
+ (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) ||
+ (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) {
+ sk_psock_put(sk, psock);
+ ret = -EBUSY;
+ goto out_progs;
+ }
+ } else {
+ psock = sk_psock_init(sk, map->numa_node);
+ if (IS_ERR(psock)) {
+ ret = PTR_ERR(psock);
+ goto out_progs;
+ }
+ }
+
+ if (msg_parser)
+ psock_set_prog(&psock->progs.msg_parser, msg_parser);
+ if (stream_parser)
+ psock_set_prog(&psock->progs.stream_parser, stream_parser);
+ if (stream_verdict)
+ psock_set_prog(&psock->progs.stream_verdict, stream_verdict);
+ if (skb_verdict)
+ psock_set_prog(&psock->progs.skb_verdict, skb_verdict);
+
+ /* msg_* and stream_* program references are tracked in the psock after
+ * this point. Reference drop and cleanup happen through the psock
+ * destructor, which is why the failure paths below jump to "out" rather
+ * than "out_progs".
+ */
+ ret = sock_map_init_proto(sk, psock);
+ if (ret < 0) {
+ sk_psock_put(sk, psock);
+ goto out;
+ }
+
+ /* Start the receive hooks only once, i.e. while no data_ready callback
+ * has been saved on the psock yet.
+ */
+ write_lock_bh(&sk->sk_callback_lock);
+ if (stream_parser && stream_verdict && !psock->saved_data_ready) {
+ /* The parser + verdict combination is offered for TCP only. */
+ if (sk_is_tcp(sk))
+ ret = sk_psock_init_strp(sk, psock);
+ else
+ ret = -EOPNOTSUPP;
+ if (ret) {
+ write_unlock_bh(&sk->sk_callback_lock);
+ sk_psock_put(sk, psock);
+ goto out;
+ }
+ sk_psock_start_strp(sk, psock);
+ } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) {
+ sk_psock_start_verdict(sk,psock);
+ } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) {
+ sk_psock_start_verdict(sk, psock);
+ }
+ write_unlock_bh(&sk->sk_callback_lock);
+ return 0;
+out_progs:
+ if (skb_verdict)
+ bpf_prog_put(skb_verdict);
+out_put_msg_parser:
+ if (msg_parser)
+ bpf_prog_put(msg_parser);
+out_put_stream_parser:
+ if (stream_parser)
+ bpf_prog_put(stream_parser);
+out_put_stream_verdict:
+ if (stream_verdict)
+ bpf_prog_put(stream_verdict);
+out:
+ return ret;
+}
+
+/* map_free callback: empty every slot, dropping each socket's link to this
+ * map, then free the slot array and the map itself.
+ */
+static void sock_map_free(struct bpf_map *map)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ int i;
+
+ /* After the sync no updates or deletes will be in-flight so it
+ * is safe to walk map and remove entries without risking a race
+ * in EEXIST update case.
+ */
+ synchronize_rcu();
+ for (i = 0; i < stab->map.max_entries; i++) {
+ struct sock **psk = &stab->sks[i];
+ struct sock *sk;
+
+ sk = xchg(psk, NULL);
+ if (sk) {
+ /* Hold the socket across the unref so it stays valid until
+ * release_sock() has run.
+ */
+ sock_hold(sk);
+ lock_sock(sk);
+ rcu_read_lock();
+ sock_map_unref(sk, psk);
+ rcu_read_unlock();
+ release_sock(sk);
+ sock_put(sk);
+ }
+ }
+
+ /* wait for psock readers accessing its map link */
+ synchronize_rcu();
+
+ bpf_map_area_free(stab->sks);
+ bpf_map_area_free(stab);
+}
+
+/* map_release_uref callback: drop the programs attached to this sockmap. */
+static void sock_map_release_progs(struct bpf_map *map)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+	psock_progs_drop(&stab->progs);
+}
+
+/* Return the socket stored in slot @key, or NULL when the slot is empty or
+ * @key is out of range. No reference is taken on the socket. A one-time
+ * warning fires if the caller is not inside an RCU read-side section.
+ */
+static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (unlikely(key >= map->max_entries))
+ return NULL;
+ return READ_ONCE(stab->sks[key]);
+}
+
+/* map_lookup_elem callback: return the socket in slot *@key.
+ *
+ * Returns NULL when the slot is empty or when the socket is refcounted and
+ * its refcount has already reached zero. For refcounted sockets a reference
+ * on sk_refcnt is taken here.
+ * NOTE(review): nothing in this function releases that reference -
+ * presumably the caller is responsible; confirm.
+ */
+static void *sock_map_lookup(struct bpf_map *map, void *key)
+{
+ struct sock *sk;
+
+ sk = __sock_map_lookup_elem(map, *(u32 *)key);
+ if (!sk)
+ return NULL;
+ if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+ return sk;
+}
+
+/* map_lookup_elem_sys_only callback: expose the socket cookie of slot *@key.
+ *
+ * Returns ERR_PTR(-ENOSPC) when the map's value size is not 8 bytes (the
+ * 64-bit cookie would not fit), ERR_PTR(-ENOENT) when the slot is empty,
+ * and otherwise a pointer to the socket's sk_cookie after making sure a
+ * cookie has been generated.
+ */
+static void *sock_map_lookup_sys(struct bpf_map *map, void *key)
+{
+ struct sock *sk;
+
+ if (map->value_size != sizeof(u64))
+ return ERR_PTR(-ENOSPC);
+
+ sk = __sock_map_lookup_elem(map, *(u32 *)key);
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ __sock_gen_cookie(sk);
+ return &sk->sk_cookie;
+}
+
+/* Clear slot @psk under stab->lock and drop the removed socket's link.
+ *
+ * With @sk_test == NULL the slot is cleared unconditionally; otherwise only
+ * when it currently holds @sk_test. Returns 0 when a socket was removed and
+ * -EINVAL when the slot was empty or held a different socket.
+ */
+static int __sock_map_delete(struct bpf_stab *stab, struct sock *sk_test,
+ struct sock **psk)
+{
+ struct sock *sk = NULL;
+ int err = 0;
+
+ spin_lock_bh(&stab->lock);
+ if (!sk_test || sk_test == *psk)
+ sk = xchg(psk, NULL);
+
+ if (likely(sk))
+ sock_map_unref(sk, psk);
+ else
+ err = -EINVAL;
+
+ spin_unlock_bh(&stab->lock);
+ return err;
+}
+
+/* Remove @sk from the sockmap slot addressed by @link_raw, but only if that
+ * slot still holds @sk. The result of the delete is ignored.
+ */
+static void sock_map_delete_from_link(struct bpf_map *map, struct sock *sk,
+ void *link_raw)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+
+ __sock_map_delete(stab, sk, link_raw);
+}
+
+/* map_delete_elem callback: clear the slot selected by the u32 at @key.
+ * Returns -EINVAL when the index is out of range or the slot is already
+ * empty, 0 otherwise.
+ */
+static long sock_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	u32 idx = *(u32 *)key;
+
+	if (unlikely(idx >= map->max_entries))
+		return -EINVAL;
+
+	return __sock_map_delete(stab, NULL, &stab->sks[idx]);
+}
+
+/* map_get_next_key callback for the array-backed sockmap.
+ *
+ * A NULL or out-of-range @key restarts iteration at index 0. The last valid
+ * index has no successor and yields -ENOENT; any other index yields its
+ * successor. The result is stored through @next.
+ */
+static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
+{
+	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+	u32 cur = key ? *(u32 *)key : U32_MAX;
+	u32 *out = next;
+
+	if (cur == stab->map.max_entries - 1)
+		return -ENOENT;
+
+	*out = cur >= stab->map.max_entries ? 0 : cur + 1;
+	return 0;
+}
+
+/* Store @sk in slot @idx of a sockmap, honouring BPF_NOEXIST/BPF_EXIST.
+ *
+ * Returns 0 on success, -EINVAL for flags above BPF_EXIST, -E2BIG for an
+ * out-of-range index, -ENOMEM when no link can be allocated, -EEXIST or
+ * -ENOENT when the slot state contradicts @flags, or the error returned by
+ * sock_map_link(). A one-time warning fires if the caller is not inside an
+ * RCU read-side section.
+ */
+static int sock_map_update_common(struct bpf_map *map, u32 idx,
+ struct sock *sk, u64 flags)
+{
+ struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
+ struct sk_psock_link *link;
+ struct sk_psock *psock;
+ struct sock *osk;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+ if (unlikely(idx >= map->max_entries))
+ return -E2BIG;
+
+ /* Allocate the link before taking stab->lock. */
+ link = sk_psock_init_link();
+ if (!link)
+ return -ENOMEM;
+
+ ret = sock_map_link(map, sk);
+ if (ret < 0)
+ goto out_free;
+
+ psock = sk_psock(sk);
+ WARN_ON_ONCE(!psock);
+
+ spin_lock_bh(&stab->lock);
+ osk = stab->sks[idx];
+ if (osk && flags == BPF_NOEXIST) {
+ ret = -EEXIST;
+ goto out_unlock;
+ } else if (!osk && flags == BPF_EXIST) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ /* Publish the new socket, then drop the link of the one it replaced. */
+ sock_map_add_link(psock, link, map, &stab->sks[idx]);
+ stab->sks[idx] = sk;
+ if (osk)
+ sock_map_unref(osk, &stab->sks[idx]);
+ spin_unlock_bh(&stab->lock);
+ return 0;
+out_unlock:
+ spin_unlock_bh(&stab->lock);
+ /* Undo the psock reference left held by sock_map_link(). */
+ if (psock)
+ sk_psock_put(sk, psock);
+out_free:
+ sk_psock_free_link(link);
+ return ret;
+}
+
+/* Sock-ops callbacks from which a map update is accepted: passive or active
+ * establishment and TCP listen.
+ */
+static bool sock_map_op_okay(const struct bpf_sock_ops_kern *ops)
+{
+	switch (ops->op) {
+	case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
+	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
+	case BPF_SOCK_OPS_TCP_LISTEN_CB:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/* Decide whether @sk may be the target of a redirect: a TCP socket in any
+ * state except TCP_LISTEN, or a non-TCP socket in TCP_ESTABLISHED state.
+ */
+static bool sock_map_redirect_allowed(const struct sock *sk)
+{
+	if (!sk_is_tcp(sk))
+		return sk->sk_state == TCP_ESTABLISHED;
+
+	return sk->sk_state != TCP_LISTEN;
+}
+
+/* A socket can be stored in a sockmap/sockhash only when its protocol
+ * provides a psock_update_sk_prot hook.
+ */
+static bool sock_map_sk_is_suitable(const struct sock *sk)
+{
+	return sk->sk_prot->psock_update_sk_prot != NULL;
+}
+
+/* Check whether @sk is in a state in which it may be added to a map:
+ *  - TCP: ESTABLISHED or LISTEN
+ *  - stream AF_UNIX: ESTABLISHED
+ *  - vsock with SOCK_STREAM or SOCK_SEQPACKET type: ESTABLISHED
+ *  - anything else: always allowed
+ */
+static bool sock_map_sk_state_allowed(const struct sock *sk)
+{
+ if (sk_is_tcp(sk))
+ return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_LISTEN);
+ if (sk_is_stream_unix(sk))
+ return (1 << sk->sk_state) & TCPF_ESTABLISHED;
+ if (sk_is_vsock(sk) &&
+ (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET))
+ return (1 << sk->sk_state) & TCPF_ESTABLISHED;
+ return true;
+}
+
+/* Forward declaration: the update entry points below dispatch to the
+ * sockhash variant before it is defined.
+ */
+static int sock_hash_update_common(struct bpf_map *map, void *key,
+ struct sock *sk, u64 flags);
+
+/* Update a sockmap or sockhash entry where @value carries a socket file
+ * descriptor (read as u64 or u32 depending on the map's value size).
+ *
+ * Returns -EINVAL when the descriptor exceeds S32_MAX or the socket has no
+ * struct sock, the lookup error when the fd is not a socket, -EOPNOTSUPP
+ * when the socket's protocol or state is not eligible, or the result of the
+ * map-type specific update. The update runs with the socket locked and
+ * inside an RCU read-side section.
+ */
+int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
+ u64 flags)
+{
+ struct socket *sock;
+ struct sock *sk;
+ int ret;
+ u64 ufd;
+
+ if (map->value_size == sizeof(u64))
+ ufd = *(u64 *)value;
+ else
+ ufd = *(u32 *)value;
+ if (ufd > S32_MAX)
+ return -EINVAL;
+
+ sock = sockfd_lookup(ufd, &ret);
+ if (!sock)
+ return ret;
+ sk = sock->sk;
+ if (!sk) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!sock_map_sk_is_suitable(sk)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ sock_map_sk_acquire(sk);
+ if (!sock_map_sk_state_allowed(sk))
+ ret = -EOPNOTSUPP;
+ else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
+ ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
+ else
+ ret = sock_hash_update_common(map, key, sk, flags);
+ sock_map_sk_release(sk);
+out:
+ /* Balance the reference obtained by sockfd_lookup(). */
+ sockfd_put(sock);
+ return ret;
+}
+
+/* map_update_elem callback: @value is a struct sock pointer rather than a
+ * file descriptor.
+ *
+ * Returns -EINVAL when @value is NULL or not a full socket, -EOPNOTSUPP when
+ * the socket's protocol or state is not eligible, or the result of the
+ * map-type specific update. The update runs with bottom halves disabled and
+ * the socket spinlock held via bh_lock_sock().
+ */
+static long sock_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 flags)
+{
+ struct sock *sk = (struct sock *)value;
+ int ret;
+
+ if (unlikely(!sk || !sk_fullsock(sk)))
+ return -EINVAL;
+
+ if (!sock_map_sk_is_suitable(sk))
+ return -EOPNOTSUPP;
+
+ local_bh_disable();
+ bh_lock_sock(sk);
+ if (!sock_map_sk_state_allowed(sk))
+ ret = -EOPNOTSUPP;
+ else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
+ ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
+ else
+ ret = sock_hash_update_common(map, key, sk, flags);
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ return ret;
+}
+
+/* BPF helper: store the socket of the running sock_ops program (sops->sk)
+ * in @map at the u32 index read from @key.
+ *
+ * Returns -EOPNOTSUPP unless the socket's protocol is suitable and the
+ * current op passes sock_map_op_okay(); otherwise the result of
+ * sock_map_update_common().
+ */
+BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (likely(sock_map_sk_is_suitable(sops->sk) &&
+ sock_map_op_okay(sops)))
+ return sock_map_update_common(map, *(u32 *)key, sops->sk,
+ flags);
+ return -EOPNOTSUPP;
+}
+
+/* Helper prototype for bpf_sock_map_update(): (ctx, map, key, flags),
+ * returning an integer; usable by non-GPL programs.
+ */
+const struct bpf_func_proto bpf_sock_map_update_proto = {
+ .func = bpf_sock_map_update,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+/* BPF helper: mark @skb for redirection to the socket stored at index @key
+ * of sockmap @map.
+ *
+ * Returns SK_DROP when @flags has bits other than BPF_F_INGRESS, the slot is
+ * empty, the target is not in a redirectable state, or an ingress redirect
+ * targets a vsock socket. Otherwise the redirect target is recorded on the
+ * skb and SK_PASS is returned.
+ */
+BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb,
+ struct bpf_map *, map, u32, key, u64, flags)
+{
+ struct sock *sk;
+
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+
+ sk = __sock_map_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
+ return SK_DROP;
+ if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk))
+ return SK_DROP;
+
+ skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
+ return SK_PASS;
+}
+
+/* Helper prototype for bpf_sk_redirect_map(): (ctx, map, key, flags),
+ * returning an integer; usable by non-GPL programs.
+ */
+const struct bpf_func_proto bpf_sk_redirect_map_proto = {
+ .func = bpf_sk_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+/* BPF helper: mark @msg for redirection to the socket stored at index @key
+ * of sockmap @map.
+ *
+ * Returns SK_DROP when @flags has bits other than BPF_F_INGRESS, the slot is
+ * empty, the target is not in a redirectable state, a redirect without
+ * BPF_F_INGRESS targets a non-TCP socket, or the target is a vsock socket.
+ * Otherwise the flags and target are stored in @msg and SK_PASS is returned.
+ */
+BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg *, msg,
+ struct bpf_map *, map, u32, key, u64, flags)
+{
+ struct sock *sk;
+
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+
+ sk = __sock_map_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
+ return SK_DROP;
+ if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk))
+ return SK_DROP;
+ if (sk_is_vsock(sk))
+ return SK_DROP;
+
+ msg->flags = flags;
+ msg->sk_redir = sk;
+ return SK_PASS;
+}
+
+/* Helper prototype for bpf_msg_redirect_map(): (ctx, map, key, flags),
+ * returning an integer; usable by non-GPL programs.
+ */
+const struct bpf_func_proto bpf_msg_redirect_map_proto = {
+ .func = bpf_msg_redirect_map,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_ANYTHING,
+};
+
+/* Per-open iterator state for walking a sockmap. */
+struct sock_map_seq_info {
+ struct bpf_map *map; /* map being iterated; referenced while open */
+ struct sock *sk; /* socket found at @index, may be NULL */
+ u32 index; /* slot the iterator currently points at */
+};
+
+/* Context handed to a sockmap iterator program for each visited entry. */
+struct bpf_iter__sockmap {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct bpf_map *, map);
+ __bpf_md_ptr(void *, key); /* left NULL on the final (stop) invocation */
+ __bpf_md_ptr(struct sock *, sk); /* may be NULL for an empty slot */
+};
+
+/* Iterator entry for "sockmap"; the argument list mirrors the fields of
+ * struct bpf_iter__sockmap.
+ */
+DEFINE_BPF_ITER_FUNC(sockmap, struct bpf_iter_meta *meta,
+ struct bpf_map *map, void *key,
+ struct sock *sk)
+
+/* Load the socket at info->index into info->sk.
+ *
+ * Returns NULL once the index has run past the end of the map, otherwise
+ * @info itself, even when the slot is empty and info->sk is NULL.
+ */
+static void *sock_map_seq_lookup_elem(struct sock_map_seq_info *info)
+{
+ if (unlikely(info->index >= info->map->max_entries))
+ return NULL;
+
+ info->sk = __sock_map_lookup_elem(info->map, info->index);
+
+ /* can't return sk directly, since that might be NULL */
+ return info;
+}
+
+/* seq_file start callback: enter the RCU read-side section and look up the
+ * entry at the iterator's current index.
+ *
+ * NOTE(review): a position of 0 is bumped to 1 before the lookup; the slot
+ * itself is selected by info->index, not by *pos.
+ */
+static void *sock_map_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ struct sock_map_seq_info *info = seq->private;
+
+ if (*pos == 0)
+ ++*pos;
+
+ /* pairs with sock_map_seq_stop */
+ rcu_read_lock();
+ return sock_map_seq_lookup_elem(info);
+}
+
+/* seq_file next callback: advance both the seq position and the slot index,
+ * then look up the new slot. Returns NULL at the end of the map.
+ */
+static void *sock_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+ __must_hold(rcu)
+{
+ struct sock_map_seq_info *info = seq->private;
+
+ ++*pos;
+ ++info->index;
+
+ return sock_map_seq_lookup_elem(info);
+}
+
+/* seq_file show callback: run the iterator program on the current entry.
+ *
+ * With @v == NULL (as passed by sock_map_seq_stop()) the program is run with
+ * key and sk left NULL. Returns 0 when no program is available, otherwise
+ * the program's result.
+ */
+static int sock_map_seq_show(struct seq_file *seq, void *v)
+ __must_hold(rcu)
+{
+ struct sock_map_seq_info *info = seq->private;
+ struct bpf_iter__sockmap ctx = {};
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ /* NOTE(review): the second argument is true only on the stop call -
+ * confirm its meaning against bpf_iter_get_info().
+ */
+ prog = bpf_iter_get_info(&meta, !v);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (v) {
+ ctx.key = &info->index;
+ ctx.sk = info->sk;
+ }
+
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+/* seq_file stop callback: when iteration ended (@v == NULL) give the program
+ * one final invocation without an element, then leave the RCU read-side
+ * section entered in sock_map_seq_start().
+ */
+static void sock_map_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ if (!v)
+ (void)sock_map_seq_show(seq, NULL);
+
+ /* pairs with sock_map_seq_start */
+ rcu_read_unlock();
+}
+
+/* seq_file callbacks used by the sockmap iterator. */
+static const struct seq_operations sock_map_seq_ops = {
+ .start = sock_map_seq_start,
+ .next = sock_map_seq_next,
+ .stop = sock_map_seq_stop,
+ .show = sock_map_seq_show,
+};
+
+/* Iterator setup: take a map reference (including a user reference) and
+ * remember the map in the private iterator state. Always returns 0.
+ */
+static int sock_map_init_seq_private(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct sock_map_seq_info *info = priv_data;
+
+ bpf_map_inc_with_uref(aux->map);
+ info->map = aux->map;
+ return 0;
+}
+
+/* Iterator teardown: drop the map reference taken in
+ * sock_map_init_seq_private().
+ */
+static void sock_map_fini_seq_private(void *priv_data)
+{
+ struct sock_map_seq_info *info = priv_data;
+
+ bpf_map_put_with_uref(info->map);
+}
+
+/* map_mem_usage callback: size of the map header plus one socket pointer
+ * per entry, computed in 64 bits.
+ */
+static u64 sock_map_mem_usage(const struct bpf_map *map)
+{
+	u64 slots = (u64)map->max_entries * sizeof(struct sock *);
+
+	return sizeof(struct bpf_stab) + slots;
+}
+
+/* Iterator description for sockmap: seq_file callbacks, private-state
+ * constructor/destructor and the size of that private state.
+ */
+static const struct bpf_iter_seq_info sock_map_iter_seq_info = {
+ .seq_ops = &sock_map_seq_ops,
+ .init_seq_private = sock_map_init_seq_private,
+ .fini_seq_private = sock_map_fini_seq_private,
+ .seq_priv_size = sizeof(struct sock_map_seq_info),
+};
+
+/* BTF id of struct bpf_stab, referenced by .map_btf_id below. */
+BTF_ID_LIST_SINGLE(sock_map_btf_ids, struct, bpf_stab)
+/* Map operations for the array-backed sockmap. Lookups have two entry
+ * points: .map_lookup_elem returns the socket itself, while
+ * .map_lookup_elem_sys_only returns a pointer to the socket cookie.
+ */
+const struct bpf_map_ops sock_map_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = sock_map_alloc,
+ .map_free = sock_map_free,
+ .map_get_next_key = sock_map_get_next_key,
+ .map_lookup_elem_sys_only = sock_map_lookup_sys,
+ .map_update_elem = sock_map_update_elem,
+ .map_delete_elem = sock_map_delete_elem,
+ .map_lookup_elem = sock_map_lookup,
+ .map_release_uref = sock_map_release_progs,
+ .map_check_btf = map_check_no_btf,
+ .map_mem_usage = sock_map_mem_usage,
+ .map_btf_id = &sock_map_btf_ids[0],
+ .iter_seq_info = &sock_map_iter_seq_info,
+};
+
+/* One sockhash entry: a socket plus the key it is stored under. */
+struct bpf_shtab_elem {
+ struct rcu_head rcu; /* used by kfree_rcu() when the element is freed */
+ u32 hash; /* hash of @key, cached to select the bucket and speed up compares */
+ struct sock *sk; /* socket stored under @key */
+ struct hlist_node node; /* linkage in the bucket's RCU hlist */
+ u8 key[]; /* map->key_size bytes of key data */
+};
+
+/* One hash bucket: an RCU hlist of elements and the lock that serialises
+ * modifications of that list.
+ */
+struct bpf_shtab_bucket {
+ struct hlist_head head;
+ spinlock_t lock;
+};
+
+/* Hash-backed sockmap (sockhash). */
+struct bpf_shtab {
+ struct bpf_map map; /* generic map; container_of() recovers the htab */
+ struct bpf_shtab_bucket *buckets;
+ u32 buckets_num; /* used as a mask (num - 1): presumably a power of two */
+ u32 elem_size; /* allocation size of one bpf_shtab_elem */
+ struct sk_psock_progs progs; /* programs attached to this map */
+ atomic_t count; /* live elements, checked against max_entries on insert */
+};
+
+/* Hash @len bytes of @key with jhash and a zero initial value. */
+static inline u32 sock_hash_bucket_hash(const void *key, u32 len)
+{
+ return jhash(key, len, 0);
+}
+
+/* Map @hash to its bucket by masking with (buckets_num - 1). */
+static struct bpf_shtab_bucket *sock_hash_select_bucket(struct bpf_shtab *htab,
+							u32 hash)
+{
+	u32 slot = hash & (htab->buckets_num - 1);
+
+	return &htab->buckets[slot];
+}
+
+/* Walk the bucket list @head under RCU rules and return the first element
+ * whose cached hash equals @hash and whose key matches the @key_size bytes
+ * at @key, or NULL when there is none.
+ */
+static struct bpf_shtab_elem *
+sock_hash_lookup_elem_raw(struct hlist_head *head, u32 hash, void *key,
+			  u32 key_size)
+{
+	struct bpf_shtab_elem *cur;
+
+	hlist_for_each_entry_rcu(cur, head, node) {
+		/* Cheap hash comparison first; memcmp only on a hash match. */
+		if (cur->hash != hash)
+			continue;
+		if (memcmp(&cur->key, key, key_size) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/* Return the socket stored under @key in a sockhash, or NULL when the key is
+ * not present. No reference is taken on the socket. A one-time warning fires
+ * if the caller is not inside an RCU read-side section.
+ */
+static struct sock *__sock_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ u32 key_size = map->key_size, hash;
+ struct bpf_shtab_bucket *bucket;
+ struct bpf_shtab_elem *elem;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ hash = sock_hash_bucket_hash(key, key_size);
+ bucket = sock_hash_select_bucket(htab, hash);
+ elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+
+ return elem ? elem->sk : NULL;
+}
+
+/* Account for one element less and free @elem after an RCU grace period. */
+static void sock_hash_free_elem(struct bpf_shtab *htab,
+ struct bpf_shtab_elem *elem)
+{
+ atomic_dec(&htab->count);
+ kfree_rcu(elem, rcu);
+}
+
+/* Remove the sockhash element addressed by @link_raw, provided it is still
+ * linked into its bucket. The @sk argument is not used; the socket is taken
+ * from the element itself.
+ */
+static void sock_hash_delete_from_link(struct bpf_map *map, struct sock *sk,
+ void *link_raw)
+{
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ struct bpf_shtab_elem *elem_probe, *elem = link_raw;
+ struct bpf_shtab_bucket *bucket;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ bucket = sock_hash_select_bucket(htab, elem->hash);
+
+ /* elem may be deleted in parallel from the map, but access here
+ * is okay since it's going away only after RCU grace period.
+ * However, we need to check whether it's still present.
+ */
+ spin_lock_bh(&bucket->lock);
+ elem_probe = sock_hash_lookup_elem_raw(&bucket->head, elem->hash,
+ elem->key, map->key_size);
+ /* Only act when the lookup finds this very element, not merely another
+ * element stored under the same key.
+ */
+ if (elem_probe && elem_probe == elem) {
+ hlist_del_rcu(&elem->node);
+ sock_map_unref(elem->sk, elem);
+ sock_hash_free_elem(htab, elem);
+ }
+ spin_unlock_bh(&bucket->lock);
+}
+
+/* Delete the element stored under @key from a sockhash.
+ *
+ * Under the bucket lock the element is unlinked, its socket's link to the
+ * map is dropped and the element is queued for RCU freeing. Returns 0 on
+ * success and -ENOENT when the key is not present.
+ */
+static long sock_hash_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ u32 hash, key_size = map->key_size;
+ struct bpf_shtab_bucket *bucket;
+ struct bpf_shtab_elem *elem;
+ int ret = -ENOENT;
+
+ hash = sock_hash_bucket_hash(key, key_size);
+ bucket = sock_hash_select_bucket(htab, hash);
+
+ spin_lock_bh(&bucket->lock);
+ elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+ if (elem) {
+ hlist_del_rcu(&elem->node);
+ sock_map_unref(elem->sk, elem);
+ sock_hash_free_elem(htab, elem);
+ ret = 0;
+ }
+ spin_unlock_bh(&bucket->lock);
+ return ret;
+}
+
+/* Allocate and fill a sockhash element for (@key, @sk).
+ *
+ * The element count is bumped first. Exceeding max_entries is tolerated only
+ * when @old is non-NULL, i.e. when an existing element is passed in;
+ * otherwise the bump is undone and ERR_PTR(-E2BIG) is returned. An allocation
+ * failure also undoes the bump and returns ERR_PTR(-ENOMEM). The allocation
+ * uses GFP_ATOMIC.
+ */
+static struct bpf_shtab_elem *sock_hash_alloc_elem(struct bpf_shtab *htab,
+						   void *key, u32 key_size,
+						   u32 hash, struct sock *sk,
+						   struct bpf_shtab_elem *old)
+{
+	struct bpf_shtab_elem *elem_new;
+
+	if (atomic_inc_return(&htab->count) > htab->map.max_entries && !old) {
+		atomic_dec(&htab->count);
+		return ERR_PTR(-E2BIG);
+	}
+
+	elem_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
+					GFP_ATOMIC | __GFP_NOWARN,
+					htab->map.numa_node);
+	if (!elem_new) {
+		atomic_dec(&htab->count);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	memcpy(elem_new->key, key, key_size);
+	elem_new->sk = sk;
+	elem_new->hash = hash;
+	return elem_new;
+}
+
+/* Store @sk under @key in a sockhash, honouring BPF_NOEXIST/BPF_EXIST.
+ *
+ * Returns 0 on success, -EINVAL for flags above BPF_EXIST, -ENOMEM when no
+ * link can be allocated, -EEXIST or -ENOENT when the presence of the key
+ * contradicts @flags, the error from sock_hash_alloc_elem(), or the error
+ * returned by sock_map_link(). A one-time warning fires if the caller is not
+ * inside an RCU read-side section.
+ */
+static int sock_hash_update_common(struct bpf_map *map, void *key,
+ struct sock *sk, u64 flags)
+{
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ u32 key_size = map->key_size, hash;
+ struct bpf_shtab_elem *elem, *elem_new;
+ struct bpf_shtab_bucket *bucket;
+ struct sk_psock_link *link;
+ struct sk_psock *psock;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ if (unlikely(flags > BPF_EXIST))
+ return -EINVAL;
+
+ /* Allocate the link before taking the bucket lock. */
+ link = sk_psock_init_link();
+ if (!link)
+ return -ENOMEM;
+
+ ret = sock_map_link(map, sk);
+ if (ret < 0)
+ goto out_free;
+
+ psock = sk_psock(sk);
+ WARN_ON_ONCE(!psock);
+
+ hash = sock_hash_bucket_hash(key, key_size);
+ bucket = sock_hash_select_bucket(htab, hash);
+
+ spin_lock_bh(&bucket->lock);
+ elem = sock_hash_lookup_elem_raw(&bucket->head, hash, key, key_size);
+ if (elem && flags == BPF_NOEXIST) {
+ ret = -EEXIST;
+ goto out_unlock;
+ } else if (!elem && flags == BPF_EXIST) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ elem_new = sock_hash_alloc_elem(htab, key, key_size, hash, sk, elem);
+ if (IS_ERR(elem_new)) {
+ ret = PTR_ERR(elem_new);
+ goto out_unlock;
+ }
+
+ sock_map_add_link(psock, link, map, elem_new);
+ /* Add new element to the head of the list, so that
+ * concurrent search will find it before old elem.
+ */
+ hlist_add_head_rcu(&elem_new->node, &bucket->head);
+ if (elem) {
+ hlist_del_rcu(&elem->node);
+ sock_map_unref(elem->sk, elem);
+ sock_hash_free_elem(htab, elem);
+ }
+ spin_unlock_bh(&bucket->lock);
+ return 0;
+out_unlock:
+ spin_unlock_bh(&bucket->lock);
+ /* Undo the psock reference left held by sock_map_link(). */
+ sk_psock_put(sk, psock);
+out_free:
+ sk_psock_free_link(link);
+ return ret;
+}
+
+static int sock_hash_get_next_key(struct bpf_map *map, void *key,
+ void *key_next)
+{
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ struct bpf_shtab_elem *elem, *elem_next;
+ u32 hash, key_size = map->key_size;
+ struct hlist_head *head;
+ int i = 0;
+
+ if (!key)
+ goto find_first_elem;
+ hash = sock_hash_bucket_hash(key, key_size);
+ head = &sock_hash_select_bucket(htab, hash)->head;
+ elem = sock_hash_lookup_elem_raw(head, hash, key, key_size);
+ if (!elem)
+ goto find_first_elem;
+
+ elem_next = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&elem->node)),
+ struct bpf_shtab_elem, node);
+ if (elem_next) {
+ memcpy(key_next, elem_next->key, key_size);
+ return 0;
+ }
+
+ i = hash & (htab->buckets_num - 1);
+ i++;
+find_first_elem:
+ for (; i < htab->buckets_num; i++) {
+ head = &sock_hash_select_bucket(htab, i)->head;
+ elem_next = hlist_entry_safe(rcu_dereference(hlist_first_rcu(head)),
+ struct bpf_shtab_elem, node);
+ if (elem_next) {
+ memcpy(key_next, elem_next->key, key_size);
+ return 0;
+ }
+ }
+
+ return -ENOENT;
+}
+
+static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
+{
+ struct bpf_shtab *htab;
+ int i, err;
+
+ if (attr->max_entries == 0 ||
+ attr->key_size == 0 ||
+ (attr->value_size != sizeof(u32) &&
+ attr->value_size != sizeof(u64)) ||
+ attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
+ return ERR_PTR(-EINVAL);
+ if (attr->key_size > MAX_BPF_STACK)
+ return ERR_PTR(-E2BIG);
+
+ htab = bpf_map_area_alloc(sizeof(*htab), NUMA_NO_NODE);
+ if (!htab)
+ return ERR_PTR(-ENOMEM);
+
+ bpf_map_init_from_attr(&htab->map, attr);
+
+ htab->buckets_num = roundup_pow_of_two(htab->map.max_entries);
+ htab->elem_size = sizeof(struct bpf_shtab_elem) +
+ round_up(htab->map.key_size, 8);
+ if (htab->buckets_num == 0 ||
+ htab->buckets_num > U32_MAX / sizeof(struct bpf_shtab_bucket)) {
+ err = -EINVAL;
+ goto free_htab;
+ }
+
+ htab->buckets = bpf_map_area_alloc(htab->buckets_num *
+ sizeof(struct bpf_shtab_bucket),
+ htab->map.numa_node);
+ if (!htab->buckets) {
+ err = -ENOMEM;
+ goto free_htab;
+ }
+
+ for (i = 0; i < htab->buckets_num; i++) {
+ INIT_HLIST_HEAD(&htab->buckets[i].head);
+ spin_lock_init(&htab->buckets[i].lock);
+ }
+
+ return &htab->map;
+free_htab:
+ bpf_map_area_free(htab);
+ return ERR_PTR(err);
+}
+
+static void sock_hash_free(struct bpf_map *map)
+{
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ struct bpf_shtab_bucket *bucket;
+ struct hlist_head unlink_list;
+ struct bpf_shtab_elem *elem;
+ struct hlist_node *node;
+ int i;
+
+ /* After the sync no updates or deletes will be in-flight so it
+ * is safe to walk map and remove entries without risking a race
+ * in EEXIST update case.
+ */
+ synchronize_rcu();
+ for (i = 0; i < htab->buckets_num; i++) {
+ bucket = sock_hash_select_bucket(htab, i);
+
+ /* We are racing with sock_hash_delete_from_link to
+ * enter the spin-lock critical section. Every socket on
+ * the list is still linked to sockhash. Since link
+ * exists, psock exists and holds a ref to socket. That
+ * lets us grab a socket ref too.
+ */
+ spin_lock_bh(&bucket->lock);
+ hlist_for_each_entry(elem, &bucket->head, node)
+ sock_hold(elem->sk);
+ hlist_move_list(&bucket->head, &unlink_list);
+ spin_unlock_bh(&bucket->lock);
+
+ /* Process removed entries out of atomic context to
+ * block for socket lock before deleting the psock's
+ * link to sockhash.
+ */
+ hlist_for_each_entry_safe(elem, node, &unlink_list, node) {
+ hlist_del(&elem->node);
+ lock_sock(elem->sk);
+ rcu_read_lock();
+ sock_map_unref(elem->sk, elem);
+ rcu_read_unlock();
+ release_sock(elem->sk);
+ sock_put(elem->sk);
+ sock_hash_free_elem(htab, elem);
+ }
+ cond_resched();
+ }
+
+ /* wait for psock readers accessing its map link */
+ synchronize_rcu();
+
+ bpf_map_area_free(htab->buckets);
+ bpf_map_area_free(htab);
+}
+
+static void *sock_hash_lookup_sys(struct bpf_map *map, void *key)
+{
+ struct sock *sk;
+
+ if (map->value_size != sizeof(u64))
+ return ERR_PTR(-ENOSPC);
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ __sock_gen_cookie(sk);
+ return &sk->sk_cookie;
+}
+
+static void *sock_hash_lookup(struct bpf_map *map, void *key)
+{
+ struct sock *sk;
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (!sk)
+ return NULL;
+ if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt))
+ return NULL;
+ return sk;
+}
+
+static void sock_hash_release_progs(struct bpf_map *map)
+{
+ psock_progs_drop(&container_of(map, struct bpf_shtab, map)->progs);
+}
+
+BPF_CALL_4(bpf_sock_hash_update, struct bpf_sock_ops_kern *, sops,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ if (likely(sock_map_sk_is_suitable(sops->sk) &&
+ sock_map_op_okay(sops)))
+ return sock_hash_update_common(map, key, sops->sk, flags);
+ return -EOPNOTSUPP;
+}
+
+const struct bpf_func_proto bpf_sock_hash_update_proto = {
+ .func = bpf_sock_hash_update,
+ .gpl_only = false,
+ .pkt_access = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ struct sock *sk;
+
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
+ return SK_DROP;
+ if ((flags & BPF_F_INGRESS) && sk_is_vsock(sk))
+ return SK_DROP;
+
+ skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS);
+ return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_sk_redirect_hash_proto = {
+ .func = bpf_sk_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_msg_redirect_hash, struct sk_msg *, msg,
+ struct bpf_map *, map, void *, key, u64, flags)
+{
+ struct sock *sk;
+
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return SK_DROP;
+
+ sk = __sock_hash_lookup_elem(map, key);
+ if (unlikely(!sk || !sock_map_redirect_allowed(sk)))
+ return SK_DROP;
+ if (!(flags & BPF_F_INGRESS) && !sk_is_tcp(sk))
+ return SK_DROP;
+ if (sk_is_vsock(sk))
+ return SK_DROP;
+
+ msg->flags = flags;
+ msg->sk_redir = sk;
+ return SK_PASS;
+}
+
+const struct bpf_func_proto bpf_msg_redirect_hash_proto = {
+ .func = bpf_msg_redirect_hash,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_PTR_TO_MAP_KEY,
+ .arg4_type = ARG_ANYTHING,
+};
+
+struct sock_hash_seq_info {
+ struct bpf_map *map;
+ struct bpf_shtab *htab;
+ u32 bucket_id;
+};
+
+static void *sock_hash_seq_find_next(struct sock_hash_seq_info *info,
+ struct bpf_shtab_elem *prev_elem)
+{
+ const struct bpf_shtab *htab = info->htab;
+ struct bpf_shtab_bucket *bucket;
+ struct bpf_shtab_elem *elem;
+ struct hlist_node *node;
+
+ /* try to find next elem in the same bucket */
+ if (prev_elem) {
+ node = rcu_dereference(hlist_next_rcu(&prev_elem->node));
+ elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
+ if (elem)
+ return elem;
+
+ /* no more elements, continue in the next bucket */
+ info->bucket_id++;
+ }
+
+ for (; info->bucket_id < htab->buckets_num; info->bucket_id++) {
+ bucket = &htab->buckets[info->bucket_id];
+ node = rcu_dereference(hlist_first_rcu(&bucket->head));
+ elem = hlist_entry_safe(node, struct bpf_shtab_elem, node);
+ if (elem)
+ return elem;
+ }
+
+ return NULL;
+}
+
+static void *sock_hash_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ struct sock_hash_seq_info *info = seq->private;
+
+ if (*pos == 0)
+ ++*pos;
+
+ /* pairs with sock_hash_seq_stop */
+ rcu_read_lock();
+ return sock_hash_seq_find_next(info, NULL);
+}
+
+static void *sock_hash_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+ __must_hold(rcu)
+{
+ struct sock_hash_seq_info *info = seq->private;
+
+ ++*pos;
+ return sock_hash_seq_find_next(info, v);
+}
+
+static int sock_hash_seq_show(struct seq_file *seq, void *v)
+ __must_hold(rcu)
+{
+ struct sock_hash_seq_info *info = seq->private;
+ struct bpf_iter__sockmap ctx = {};
+ struct bpf_shtab_elem *elem = v;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, !elem);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.map = info->map;
+ if (elem) {
+ ctx.key = elem->key;
+ ctx.sk = elem->sk;
+ }
+
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static void sock_hash_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ if (!v)
+ (void)sock_hash_seq_show(seq, NULL);
+
+ /* pairs with sock_hash_seq_start */
+ rcu_read_unlock();
+}
+
+static const struct seq_operations sock_hash_seq_ops = {
+ .start = sock_hash_seq_start,
+ .next = sock_hash_seq_next,
+ .stop = sock_hash_seq_stop,
+ .show = sock_hash_seq_show,
+};
+
+static int sock_hash_init_seq_private(void *priv_data,
+ struct bpf_iter_aux_info *aux)
+{
+ struct sock_hash_seq_info *info = priv_data;
+
+ bpf_map_inc_with_uref(aux->map);
+ info->map = aux->map;
+ info->htab = container_of(aux->map, struct bpf_shtab, map);
+ return 0;
+}
+
+static void sock_hash_fini_seq_private(void *priv_data)
+{
+ struct sock_hash_seq_info *info = priv_data;
+
+ bpf_map_put_with_uref(info->map);
+}
+
+static u64 sock_hash_mem_usage(const struct bpf_map *map)
+{
+ struct bpf_shtab *htab = container_of(map, struct bpf_shtab, map);
+ u64 usage = sizeof(*htab);
+
+ usage += htab->buckets_num * sizeof(struct bpf_shtab_bucket);
+ usage += atomic_read(&htab->count) * (u64)htab->elem_size;
+ return usage;
+}
+
+static const struct bpf_iter_seq_info sock_hash_iter_seq_info = {
+ .seq_ops = &sock_hash_seq_ops,
+ .init_seq_private = sock_hash_init_seq_private,
+ .fini_seq_private = sock_hash_fini_seq_private,
+ .seq_priv_size = sizeof(struct sock_hash_seq_info),
+};
+
+BTF_ID_LIST_SINGLE(sock_hash_map_btf_ids, struct, bpf_shtab)
+const struct bpf_map_ops sock_hash_ops = {
+ .map_meta_equal = bpf_map_meta_equal,
+ .map_alloc = sock_hash_alloc,
+ .map_free = sock_hash_free,
+ .map_get_next_key = sock_hash_get_next_key,
+ .map_update_elem = sock_map_update_elem,
+ .map_delete_elem = sock_hash_delete_elem,
+ .map_lookup_elem = sock_hash_lookup,
+ .map_lookup_elem_sys_only = sock_hash_lookup_sys,
+ .map_release_uref = sock_hash_release_progs,
+ .map_check_btf = map_check_no_btf,
+ .map_mem_usage = sock_hash_mem_usage,
+ .map_btf_id = &sock_hash_map_btf_ids[0],
+ .iter_seq_info = &sock_hash_iter_seq_info,
+};
+
+static struct sk_psock_progs *sock_map_progs(struct bpf_map *map)
+{
+ switch (map->map_type) {
+ case BPF_MAP_TYPE_SOCKMAP:
+ return &container_of(map, struct bpf_stab, map)->progs;
+ case BPF_MAP_TYPE_SOCKHASH:
+ return &container_of(map, struct bpf_shtab, map)->progs;
+ default:
+ break;
+ }
+
+ return NULL;
+}
+
+static int sock_map_prog_link_lookup(struct bpf_map *map, struct bpf_prog ***pprog,
+ struct bpf_link ***plink, u32 which)
+{
+ struct sk_psock_progs *progs = sock_map_progs(map);
+ struct bpf_prog **cur_pprog;
+ struct bpf_link **cur_plink;
+
+ if (!progs)
+ return -EOPNOTSUPP;
+
+ switch (which) {
+ case BPF_SK_MSG_VERDICT:
+ cur_pprog = &progs->msg_parser;
+ cur_plink = &progs->msg_parser_link;
+ break;
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+ case BPF_SK_SKB_STREAM_PARSER:
+ cur_pprog = &progs->stream_parser;
+ cur_plink = &progs->stream_parser_link;
+ break;
+#endif
+ case BPF_SK_SKB_STREAM_VERDICT:
+ if (progs->skb_verdict)
+ return -EBUSY;
+ cur_pprog = &progs->stream_verdict;
+ cur_plink = &progs->stream_verdict_link;
+ break;
+ case BPF_SK_SKB_VERDICT:
+ if (progs->stream_verdict)
+ return -EBUSY;
+ cur_pprog = &progs->skb_verdict;
+ cur_plink = &progs->skb_verdict_link;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ *pprog = cur_pprog;
+ if (plink)
+ *plink = cur_plink;
+ return 0;
+}
+
+/* Handle the following four cases:
+ * prog_attach: prog != NULL, old == NULL, link == NULL
+ * prog_detach: prog == NULL, old != NULL, link == NULL
+ * link_attach: prog != NULL, old == NULL, link != NULL
+ * link_detach: prog == NULL, old != NULL, link != NULL
+ */
+static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
+ struct bpf_prog *old, struct bpf_link *link,
+ u32 which)
+{
+ struct bpf_prog **pprog;
+ struct bpf_link **plink;
+ int ret;
+
+ ret = sock_map_prog_link_lookup(map, &pprog, &plink, which);
+ if (ret)
+ return ret;
+
+ /* for prog_attach/prog_detach/link_attach, return error if a bpf_link
+ * exists for that prog.
+ */
+ if ((!link || prog) && *plink)
+ return -EBUSY;
+
+ if (old) {
+ ret = psock_replace_prog(pprog, prog, old);
+ if (!ret)
+ *plink = NULL;
+ } else {
+ psock_set_prog(pprog, prog);
+ if (link)
+ *plink = link;
+ }
+
+ return ret;
+}
+
+int sock_map_bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ u32 prog_cnt = 0, flags = 0;
+ struct bpf_prog **pprog;
+ struct bpf_prog *prog;
+ struct bpf_map *map;
+ u32 id = 0;
+ int ret;
+
+ if (attr->query.query_flags)
+ return -EINVAL;
+
+ CLASS(fd, f)(attr->target_fd);
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ rcu_read_lock();
+
+ ret = sock_map_prog_link_lookup(map, &pprog, NULL, attr->query.attach_type);
+ if (ret)
+ goto end;
+
+ prog = *pprog;
+ prog_cnt = !prog ? 0 : 1;
+
+ if (!attr->query.prog_cnt || !prog_ids || !prog_cnt)
+ goto end;
+
+ /* we do not hold the refcnt, the bpf prog may be released
+ * asynchronously and the id would be set to 0.
+ */
+ id = data_race(prog->aux->id);
+ if (id == 0)
+ prog_cnt = 0;
+
+end:
+ rcu_read_unlock();
+
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)) ||
+ (id != 0 && copy_to_user(prog_ids, &id, sizeof(u32))) ||
+ copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static void sock_map_unlink(struct sock *sk, struct sk_psock_link *link)
+{
+ switch (link->map->map_type) {
+ case BPF_MAP_TYPE_SOCKMAP:
+ return sock_map_delete_from_link(link->map, sk,
+ link->link_raw);
+ case BPF_MAP_TYPE_SOCKHASH:
+ return sock_hash_delete_from_link(link->map, sk,
+ link->link_raw);
+ default:
+ break;
+ }
+}
+
+static void sock_map_remove_links(struct sock *sk, struct sk_psock *psock)
+{
+ struct sk_psock_link *link;
+
+ while ((link = sk_psock_link_pop(psock))) {
+ sock_map_unlink(sk, link);
+ sk_psock_free_link(link);
+ }
+}
+
+void sock_map_unhash(struct sock *sk)
+{
+ void (*saved_unhash)(struct sock *sk);
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ saved_unhash = READ_ONCE(sk->sk_prot)->unhash;
+ } else {
+ saved_unhash = psock->saved_unhash;
+ sock_map_remove_links(sk, psock);
+ rcu_read_unlock();
+ }
+ if (WARN_ON_ONCE(saved_unhash == sock_map_unhash))
+ return;
+ if (saved_unhash)
+ saved_unhash(sk);
+}
+EXPORT_SYMBOL_GPL(sock_map_unhash);
+
+void sock_map_destroy(struct sock *sk)
+{
+ void (*saved_destroy)(struct sock *sk);
+ struct sk_psock *psock;
+
+ rcu_read_lock();
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock)) {
+ rcu_read_unlock();
+ saved_destroy = READ_ONCE(sk->sk_prot)->destroy;
+ } else {
+ saved_destroy = psock->saved_destroy;
+ sock_map_remove_links(sk, psock);
+ rcu_read_unlock();
+ sk_psock_stop(psock);
+ sk_psock_put(sk, psock);
+ }
+ if (WARN_ON_ONCE(saved_destroy == sock_map_destroy))
+ return;
+ if (saved_destroy)
+ saved_destroy(sk);
+}
+EXPORT_SYMBOL_GPL(sock_map_destroy);
+
+void sock_map_close(struct sock *sk, long timeout)
+{
+ void (*saved_close)(struct sock *sk, long timeout);
+ struct sk_psock *psock;
+
+ lock_sock(sk);
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (likely(psock)) {
+ saved_close = psock->saved_close;
+ sock_map_remove_links(sk, psock);
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ goto no_psock;
+ rcu_read_unlock();
+ sk_psock_stop(psock);
+ release_sock(sk);
+ cancel_delayed_work_sync(&psock->work);
+ sk_psock_put(sk, psock);
+ } else {
+ saved_close = READ_ONCE(sk->sk_prot)->close;
+no_psock:
+ rcu_read_unlock();
+ release_sock(sk);
+ }
+
+ /* Make sure we do not recurse. This is a bug.
+ * Leak the socket instead of crashing on a stack overflow.
+ */
+ if (WARN_ON_ONCE(saved_close == sock_map_close))
+ return;
+ saved_close(sk, timeout);
+}
+EXPORT_SYMBOL_GPL(sock_map_close);
+
+struct sockmap_link {
+ struct bpf_link link;
+ struct bpf_map *map;
+};
+
+static void sock_map_link_release(struct bpf_link *link)
+{
+ struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+
+ mutex_lock(&sockmap_mutex);
+ if (!sockmap_link->map)
+ goto out;
+
+ WARN_ON_ONCE(sock_map_prog_update(sockmap_link->map, NULL, link->prog, link,
+ link->attach_type));
+
+ bpf_map_put_with_uref(sockmap_link->map);
+ sockmap_link->map = NULL;
+out:
+ mutex_unlock(&sockmap_mutex);
+}
+
+static int sock_map_link_detach(struct bpf_link *link)
+{
+ sock_map_link_release(link);
+ return 0;
+}
+
+static void sock_map_link_dealloc(struct bpf_link *link)
+{
+ kfree(link);
+}
+
+/* Handle the following two cases:
+ * case 1: link != NULL, prog != NULL, old != NULL
+ * case 2: link != NULL, prog != NULL, old == NULL
+ */
+static int sock_map_link_update_prog(struct bpf_link *link,
+ struct bpf_prog *prog,
+ struct bpf_prog *old)
+{
+ const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+ struct bpf_prog **pprog, *old_link_prog;
+ struct bpf_link **plink;
+ int ret = 0;
+
+ mutex_lock(&sockmap_mutex);
+
+ /* If old prog is not NULL, ensure old prog is the same as link->prog. */
+ if (old && link->prog != old) {
+ ret = -EPERM;
+ goto out;
+ }
+ /* Ensure link->prog has the same type/attach_type as the new prog. */
+ if (link->prog->type != prog->type ||
+ link->prog->expected_attach_type != prog->expected_attach_type) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!sockmap_link->map) {
+ ret = -ENOLINK;
+ goto out;
+ }
+
+ ret = sock_map_prog_link_lookup(sockmap_link->map, &pprog, &plink,
+ link->attach_type);
+ if (ret)
+ goto out;
+
+ /* return error if the stored bpf_link does not match the incoming bpf_link. */
+ if (link != *plink) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ if (old) {
+ ret = psock_replace_prog(pprog, prog, old);
+ if (ret)
+ goto out;
+ } else {
+ psock_set_prog(pprog, prog);
+ }
+
+ bpf_prog_inc(prog);
+ old_link_prog = xchg(&link->prog, prog);
+ bpf_prog_put(old_link_prog);
+
+out:
+ mutex_unlock(&sockmap_mutex);
+ return ret;
+}
+
+static u32 sock_map_link_get_map_id(const struct sockmap_link *sockmap_link)
+{
+ u32 map_id = 0;
+
+ mutex_lock(&sockmap_mutex);
+ if (sockmap_link->map)
+ map_id = sockmap_link->map->id;
+ mutex_unlock(&sockmap_mutex);
+ return map_id;
+}
+
+static int sock_map_link_fill_info(const struct bpf_link *link,
+ struct bpf_link_info *info)
+{
+ const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+ u32 map_id = sock_map_link_get_map_id(sockmap_link);
+
+ info->sockmap.map_id = map_id;
+ info->sockmap.attach_type = link->attach_type;
+ return 0;
+}
+
+static void sock_map_link_show_fdinfo(const struct bpf_link *link,
+ struct seq_file *seq)
+{
+ const struct sockmap_link *sockmap_link = container_of(link, struct sockmap_link, link);
+ u32 map_id = sock_map_link_get_map_id(sockmap_link);
+
+ seq_printf(seq, "map_id:\t%u\n", map_id);
+ seq_printf(seq, "attach_type:\t%u\n", link->attach_type);
+}
+
+static const struct bpf_link_ops sock_map_link_ops = {
+ .release = sock_map_link_release,
+ .dealloc = sock_map_link_dealloc,
+ .detach = sock_map_link_detach,
+ .update_prog = sock_map_link_update_prog,
+ .fill_link_info = sock_map_link_fill_info,
+ .show_fdinfo = sock_map_link_show_fdinfo,
+};
+
+int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_link_primer link_primer;
+ struct sockmap_link *sockmap_link;
+ enum bpf_attach_type attach_type;
+ struct bpf_map *map;
+ int ret;
+
+ if (attr->link_create.flags)
+ return -EINVAL;
+
+ map = bpf_map_get_with_uref(attr->link_create.target_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP && map->map_type != BPF_MAP_TYPE_SOCKHASH) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ sockmap_link = kzalloc(sizeof(*sockmap_link), GFP_USER);
+ if (!sockmap_link) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ attach_type = attr->link_create.attach_type;
+ bpf_link_init(&sockmap_link->link, BPF_LINK_TYPE_SOCKMAP, &sock_map_link_ops, prog,
+ attach_type);
+ sockmap_link->map = map;
+
+ ret = bpf_link_prime(&sockmap_link->link, &link_primer);
+ if (ret) {
+ kfree(sockmap_link);
+ goto out;
+ }
+
+ mutex_lock(&sockmap_mutex);
+ ret = sock_map_prog_update(map, prog, NULL, &sockmap_link->link, attach_type);
+ mutex_unlock(&sockmap_mutex);
+ if (ret) {
+ bpf_link_cleanup(&link_primer);
+ goto out;
+ }
+
+ /* Increase refcnt for the prog since when old prog is replaced with
+ * psock_replace_prog() and psock_set_prog() its refcnt will be decreased.
+ *
+ * Actually, we do not need to increase refcnt for the prog since bpf_link
+ * will hold a reference. But in order to have less complexity w.r.t.
+ * replacing/setting prog, let us increase the refcnt to make things simpler.
+ */
+ bpf_prog_inc(prog);
+
+ return bpf_link_settle(&link_primer);
+
+out:
+ bpf_map_put_with_uref(map);
+ return ret;
+}
+
+static int sock_map_iter_attach_target(struct bpf_prog *prog,
+ union bpf_iter_link_info *linfo,
+ struct bpf_iter_aux_info *aux)
+{
+ struct bpf_map *map;
+ int err = -EINVAL;
+
+ if (!linfo->map.map_fd)
+ return -EBADF;
+
+ map = bpf_map_get_with_uref(linfo->map.map_fd);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ if (map->map_type != BPF_MAP_TYPE_SOCKMAP &&
+ map->map_type != BPF_MAP_TYPE_SOCKHASH)
+ goto put_map;
+
+ if (prog->aux->max_rdonly_access > map->key_size) {
+ err = -EACCES;
+ goto put_map;
+ }
+
+ aux->map = map;
+ return 0;
+
+put_map:
+ bpf_map_put_with_uref(map);
+ return err;
+}
+
+static void sock_map_iter_detach_target(struct bpf_iter_aux_info *aux)
+{
+ bpf_map_put_with_uref(aux->map);
+}
+
+static struct bpf_iter_reg sock_map_iter_reg = {
+ .target = "sockmap",
+ .attach_target = sock_map_iter_attach_target,
+ .detach_target = sock_map_iter_detach_target,
+ .show_fdinfo = bpf_iter_map_show_fdinfo,
+ .fill_link_info = bpf_iter_map_fill_link_info,
+ .ctx_arg_info_size = 2,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__sockmap, key),
+ PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
+ { offsetof(struct bpf_iter__sockmap, sk),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+};
+
+static int __init bpf_sockmap_iter_init(void)
+{
+ sock_map_iter_reg.ctx_arg_info[1].btf_id =
+ btf_sock_ids[BTF_SOCK_TYPE_SOCK];
+ return bpf_iter_reg_target(&sock_map_iter_reg);
+}
+late_initcall(bpf_sockmap_iter_init);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index ba5cba56f574..4211710393a8 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -6,6 +6,7 @@
* selecting the socket index from the array of available sockets.
*/
+#include <net/ip.h>
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/idr.h>
@@ -16,33 +17,165 @@
DEFINE_SPINLOCK(reuseport_lock);
-#define REUSEPORT_MIN_ID 1
static DEFINE_IDA(reuseport_ida);
+static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
+ struct sock_reuseport *reuse, bool bind_inany);
-int reuseport_get_id(struct sock_reuseport *reuse)
+void reuseport_has_conns_set(struct sock *sk)
{
- int id;
+ struct sock_reuseport *reuse;
- if (reuse->reuseport_id)
- return reuse->reuseport_id;
+ if (!rcu_access_pointer(sk->sk_reuseport_cb))
+ return;
- id = ida_simple_get(&reuseport_ida, REUSEPORT_MIN_ID, 0,
- /* Called under reuseport_lock */
- GFP_ATOMIC);
- if (id < 0)
- return id;
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ if (likely(reuse))
+ reuse->has_conns = 1;
+ spin_unlock_bh(&reuseport_lock);
+}
+EXPORT_SYMBOL(reuseport_has_conns_set);
- reuse->reuseport_id = id;
+static void __reuseport_get_incoming_cpu(struct sock_reuseport *reuse)
+{
+ /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */
+ WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu + 1);
+}
+
+static void __reuseport_put_incoming_cpu(struct sock_reuseport *reuse)
+{
+ /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */
+ WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu - 1);
+}
+
+static void reuseport_get_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse)
+{
+ if (sk->sk_incoming_cpu >= 0)
+ __reuseport_get_incoming_cpu(reuse);
+}
+
+static void reuseport_put_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse)
+{
+ if (sk->sk_incoming_cpu >= 0)
+ __reuseport_put_incoming_cpu(reuse);
+}
+
+void reuseport_update_incoming_cpu(struct sock *sk, int val)
+{
+ struct sock_reuseport *reuse;
+ int old_sk_incoming_cpu;
+
+ if (unlikely(!rcu_access_pointer(sk->sk_reuseport_cb))) {
+ /* Paired with READ_ONCE() in sk_incoming_cpu_update()
+ * and compute_score().
+ */
+ WRITE_ONCE(sk->sk_incoming_cpu, val);
+ return;
+ }
+
+ spin_lock_bh(&reuseport_lock);
+
+ /* This must be done under reuseport_lock to avoid a race with
+ * reuseport_grow(), which accesses sk->sk_incoming_cpu without
+ * lock_sock() when detaching a shutdown()ed sk.
+ *
+ * Paired with READ_ONCE() in reuseport_select_sock_by_hash().
+ */
+ old_sk_incoming_cpu = sk->sk_incoming_cpu;
+ WRITE_ONCE(sk->sk_incoming_cpu, val);
- return reuse->reuseport_id;
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+
+ /* reuseport_grow() has detached a closed sk. */
+ if (!reuse)
+ goto out;
+
+ if (old_sk_incoming_cpu < 0 && val >= 0)
+ __reuseport_get_incoming_cpu(reuse);
+ else if (old_sk_incoming_cpu >= 0 && val < 0)
+ __reuseport_put_incoming_cpu(reuse);
+
+out:
+ spin_unlock_bh(&reuseport_lock);
+}
+
+static int reuseport_sock_index(struct sock *sk,
+ const struct sock_reuseport *reuse,
+ bool closed)
+{
+ int left, right;
+
+ if (!closed) {
+ left = 0;
+ right = reuse->num_socks;
+ } else {
+ left = reuse->max_socks - reuse->num_closed_socks;
+ right = reuse->max_socks;
+ }
+
+ for (; left < right; left++)
+ if (reuse->socks[left] == sk)
+ return left;
+ return -1;
+}
+
+static void __reuseport_add_sock(struct sock *sk,
+ struct sock_reuseport *reuse)
+{
+ reuse->socks[reuse->num_socks] = sk;
+ /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */
+ smp_wmb();
+ reuse->num_socks++;
+ reuseport_get_incoming_cpu(sk, reuse);
+}
+
+static bool __reuseport_detach_sock(struct sock *sk,
+ struct sock_reuseport *reuse)
+{
+ int i = reuseport_sock_index(sk, reuse, false);
+
+ if (i == -1)
+ return false;
+
+ reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
+ reuse->num_socks--;
+ reuseport_put_incoming_cpu(sk, reuse);
+
+ return true;
+}
+
+static void __reuseport_add_closed_sock(struct sock *sk,
+ struct sock_reuseport *reuse)
+{
+ reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk;
+ /* paired with READ_ONCE() in inet_csk_bind_conflict() */
+ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1);
+ reuseport_get_incoming_cpu(sk, reuse);
+}
+
+static bool __reuseport_detach_closed_sock(struct sock *sk,
+ struct sock_reuseport *reuse)
+{
+ int i = reuseport_sock_index(sk, reuse, true);
+
+ if (i == -1)
+ return false;
+
+ reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
+ /* paired with READ_ONCE() in inet_csk_bind_conflict() */
+ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1);
+ reuseport_put_incoming_cpu(sk, reuse);
+
+ return true;
}
static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
- unsigned int size = sizeof(struct sock_reuseport) +
- sizeof(struct sock *) * max_socks;
- struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);
+ struct sock_reuseport *reuse;
+ reuse = kzalloc(struct_size(reuse, socks, max_socks), GFP_ATOMIC);
if (!reuse)
return NULL;
@@ -55,6 +188,7 @@ static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
int reuseport_alloc(struct sock *sk, bool bind_inany)
{
struct sock_reuseport *reuse;
+ int id, ret = 0;
/* bh lock used since this function call may precede hlist lock in
* soft irq of receive path or setsockopt from process context
@@ -67,6 +201,12 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
if (reuse) {
+ if (reuse->num_closed_socks) {
+ /* sk was shutdown()ed before */
+ ret = reuseport_resurrect(sk, reuse, NULL, bind_inany);
+ goto out;
+ }
+
/* Only set reuse->bind_inany if the bind_inany is true.
* Otherwise, it will overwrite the reuse->bind_inany
* which was set by the bind/hash path.
@@ -78,19 +218,28 @@ int reuseport_alloc(struct sock *sk, bool bind_inany)
reuse = __reuseport_alloc(INIT_SOCKS);
if (!reuse) {
- spin_unlock_bh(&reuseport_lock);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
+ if (id < 0) {
+ kfree(reuse);
+ ret = id;
+ goto out;
}
+ reuse->reuseport_id = id;
+ reuse->bind_inany = bind_inany;
reuse->socks[0] = sk;
reuse->num_socks = 1;
- reuse->bind_inany = bind_inany;
+ reuseport_get_incoming_cpu(sk, reuse);
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
out:
spin_unlock_bh(&reuseport_lock);
- return 0;
+ return ret;
}
EXPORT_SYMBOL(reuseport_alloc);
@@ -100,24 +249,45 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
u32 more_socks_size, i;
more_socks_size = reuse->max_socks * 2U;
- if (more_socks_size > U16_MAX)
+ if (more_socks_size > U16_MAX) {
+ if (reuse->num_closed_socks) {
+ /* Make room by removing a closed sk.
+ * The child has already been migrated.
+ * Only reqsk left at this point.
+ */
+ struct sock *sk;
+
+ sk = reuse->socks[reuse->max_socks - reuse->num_closed_socks];
+ RCU_INIT_POINTER(sk->sk_reuseport_cb, NULL);
+ __reuseport_detach_closed_sock(sk, reuse);
+
+ return reuse;
+ }
+
return NULL;
+ }
more_reuse = __reuseport_alloc(more_socks_size);
if (!more_reuse)
return NULL;
- more_reuse->max_socks = more_socks_size;
more_reuse->num_socks = reuse->num_socks;
+ more_reuse->num_closed_socks = reuse->num_closed_socks;
more_reuse->prog = reuse->prog;
more_reuse->reuseport_id = reuse->reuseport_id;
more_reuse->bind_inany = reuse->bind_inany;
+ more_reuse->has_conns = reuse->has_conns;
+ more_reuse->incoming_cpu = reuse->incoming_cpu;
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
+ memcpy(more_reuse->socks +
+ (more_reuse->max_socks - more_reuse->num_closed_socks),
+ reuse->socks + (reuse->max_socks - reuse->num_closed_socks),
+ reuse->num_closed_socks * sizeof(struct sock *));
more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
- for (i = 0; i < reuse->num_socks; ++i)
+ for (i = 0; i < reuse->max_socks; ++i)
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
more_reuse);
@@ -135,8 +305,7 @@ static void reuseport_free_rcu(struct rcu_head *head)
reuse = container_of(head, struct sock_reuseport, rcu);
sk_reuseport_prog_free(rcu_dereference_protected(reuse->prog, 1));
- if (reuse->reuseport_id)
- ida_simple_remove(&reuseport_ida, reuse->reuseport_id);
+ ida_free(&reuseport_ida, reuse->reuseport_id);
kfree(reuse);
}
@@ -144,6 +313,8 @@ static void reuseport_free_rcu(struct rcu_head *head)
* reuseport_add_sock - Add a socket to the reuseport group of another.
* @sk: New socket to add to the group.
* @sk2: Socket belonging to the existing reuseport group.
+ * @bind_inany: Whether or not the group is bound to a local INANY address.
+ *
* May return ENOMEM and not add socket to group under memory pressure.
*/
int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
@@ -161,13 +332,21 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
- lockdep_is_held(&reuseport_lock));
+ lockdep_is_held(&reuseport_lock));
+ if (old_reuse && old_reuse->num_closed_socks) {
+ /* sk was shutdown()ed before */
+ int err = reuseport_resurrect(sk, old_reuse, reuse, reuse->bind_inany);
+
+ spin_unlock_bh(&reuseport_lock);
+ return err;
+ }
+
if (old_reuse && old_reuse->num_socks != 1) {
spin_unlock_bh(&reuseport_lock);
return -EBUSY;
}
- if (reuse->num_socks == reuse->max_socks) {
+ if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
reuse = reuseport_grow(reuse);
if (!reuse) {
spin_unlock_bh(&reuseport_lock);
@@ -175,10 +354,7 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
}
}
- reuse->socks[reuse->num_socks] = sk;
- /* paired with smp_rmb() in reuseport_select_sock() */
- smp_wmb();
- reuse->num_socks++;
+ __reuseport_add_sock(sk, reuse);
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
spin_unlock_bh(&reuseport_lock);
@@ -187,38 +363,137 @@ int reuseport_add_sock(struct sock *sk, struct sock *sk2, bool bind_inany)
call_rcu(&old_reuse->rcu, reuseport_free_rcu);
return 0;
}
+EXPORT_SYMBOL(reuseport_add_sock);
+
+static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse,
+ struct sock_reuseport *reuse, bool bind_inany)
+{
+ if (old_reuse == reuse) {
+ /* If sk was in the same reuseport group, just pop sk out of
+ * the closed section and push sk into the listening section.
+ */
+ __reuseport_detach_closed_sock(sk, old_reuse);
+ __reuseport_add_sock(sk, old_reuse);
+ return 0;
+ }
+
+ if (!reuse) {
+ /* In bind()/listen() path, we cannot carry over the eBPF prog
+ * for the shutdown()ed socket. In setsockopt() path, we should
+ * not change the eBPF prog of listening sockets by attaching a
+ * prog to the shutdown()ed socket. Thus, we will allocate a new
+ * reuseport group and detach sk from the old group.
+ */
+ int id;
+
+ reuse = __reuseport_alloc(INIT_SOCKS);
+ if (!reuse)
+ return -ENOMEM;
+
+ id = ida_alloc(&reuseport_ida, GFP_ATOMIC);
+ if (id < 0) {
+ kfree(reuse);
+ return id;
+ }
+
+ reuse->reuseport_id = id;
+ reuse->bind_inany = bind_inany;
+ } else {
+ /* Move sk from the old group to the new one if
+ * - all the other listeners in the old group were close()d or
+ * shutdown()ed, and then sk2 has listen()ed on the same port
+ * OR
+ * - sk listen()ed without bind() (or with autobind), was
+ * shutdown()ed, and then listen()s on another port which
+ * sk2 listen()s on.
+ */
+ if (reuse->num_socks + reuse->num_closed_socks == reuse->max_socks) {
+ reuse = reuseport_grow(reuse);
+ if (!reuse)
+ return -ENOMEM;
+ }
+ }
+
+ __reuseport_detach_closed_sock(sk, old_reuse);
+ __reuseport_add_sock(sk, reuse);
+ rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+ if (old_reuse->num_socks + old_reuse->num_closed_socks == 0)
+ call_rcu(&old_reuse->rcu, reuseport_free_rcu);
+
+ return 0;
+}
void reuseport_detach_sock(struct sock *sk)
{
struct sock_reuseport *reuse;
- int i;
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
- /* At least one of the sk in this reuseport group is added to
- * a bpf map. Notify the bpf side. The bpf map logic will
- * remove the sk if it is indeed added to a bpf map.
+ /* reuseport_grow() has detached a closed sk */
+ if (!reuse)
+ goto out;
+
+ /* Notify the bpf side. The sk may be added to a sockarray
+ * map. If so, sockarray logic will remove it from the map.
+ *
+ * Other bpf map types that work with reuseport, like sockmap,
+ * don't need an explicit callback from here. They override sk
+ * unhash/close ops to remove the sk from the map before we
+ * get to this point.
*/
- if (reuse->reuseport_id)
- bpf_sk_reuseport_detach(sk);
+ bpf_sk_reuseport_detach(sk);
rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
- for (i = 0; i < reuse->num_socks; i++) {
- if (reuse->socks[i] == sk) {
- reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
- reuse->num_socks--;
- if (reuse->num_socks == 0)
- call_rcu(&reuse->rcu, reuseport_free_rcu);
- break;
- }
- }
+ if (!__reuseport_detach_closed_sock(sk, reuse))
+ __reuseport_detach_sock(sk, reuse);
+
+ if (reuse->num_socks + reuse->num_closed_socks == 0)
+ call_rcu(&reuse->rcu, reuseport_free_rcu);
+
+out:
spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);
+void reuseport_stop_listen_sock(struct sock *sk)
+{
+ if (sk->sk_protocol == IPPROTO_TCP) {
+ struct sock_reuseport *reuse;
+ struct bpf_prog *prog;
+
+ spin_lock_bh(&reuseport_lock);
+
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ prog = rcu_dereference_protected(reuse->prog,
+ lockdep_is_held(&reuseport_lock));
+
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req) ||
+ (prog && prog->expected_attach_type == BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)) {
+ /* Migration capable, move sk from the listening section
+ * to the closed section.
+ */
+ bpf_sk_reuseport_detach(sk);
+
+ __reuseport_detach_sock(sk, reuse);
+ __reuseport_add_closed_sock(sk, reuse);
+
+ spin_unlock_bh(&reuseport_lock);
+ return;
+ }
+
+ spin_unlock_bh(&reuseport_lock);
+ }
+
+ /* Not capable to do migration, detach immediately */
+ reuseport_detach_sock(sk);
+}
+EXPORT_SYMBOL(reuseport_stop_listen_sock);
+
static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
struct bpf_prog *prog, struct sk_buff *skb,
int hdr_len)
@@ -249,6 +524,37 @@ static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks,
return reuse->socks[index];
}
+static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse,
+ u32 hash, u16 num_socks)
+{
+ struct sock *first_valid_sk = NULL;
+ int i, j;
+
+ i = j = reciprocal_scale(hash, num_socks);
+ do {
+ struct sock *sk = reuse->socks[i];
+
+ if (sk->sk_state != TCP_ESTABLISHED) {
+ /* Paired with WRITE_ONCE() in __reuseport_(get|put)_incoming_cpu(). */
+ if (!READ_ONCE(reuse->incoming_cpu))
+ return sk;
+
+ /* Paired with WRITE_ONCE() in reuseport_update_incoming_cpu(). */
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
+ return sk;
+
+ if (!first_valid_sk)
+ first_valid_sk = sk;
+ }
+
+ i++;
+ if (i >= num_socks)
+ i = 0;
+ } while (i != j);
+
+ return first_valid_sk;
+}
+
/**
* reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
* @sk: First socket in the group.
@@ -279,21 +585,21 @@ struct sock *reuseport_select_sock(struct sock *sk,
prog = rcu_dereference(reuse->prog);
socks = READ_ONCE(reuse->num_socks);
if (likely(socks)) {
- /* paired with smp_wmb() in reuseport_add_sock() */
+ /* paired with smp_wmb() in __reuseport_add_sock() */
smp_rmb();
if (!prog || !skb)
goto select_by_hash;
if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
- sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash);
+ sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, NULL, hash);
else
sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
select_by_hash:
/* no bpf or invalid bpf result: fall back to hash usage */
if (!sk2)
- sk2 = reuse->socks[reciprocal_scale(hash, socks)];
+ sk2 = reuseport_select_sock_by_hash(reuse, hash, socks);
}
out:
@@ -302,14 +608,90 @@ out:
}
EXPORT_SYMBOL(reuseport_select_sock);
+/**
+ * reuseport_migrate_sock - Select a socket from an SO_REUSEPORT group.
+ * @sk: close()ed or shutdown()ed socket in the group.
+ * @migrating_sk: ESTABLISHED/SYN_RECV full socket in the accept queue or
+ * NEW_SYN_RECV request socket during 3WHS.
+ * @skb: skb to run through BPF filter.
+ * Returns a socket (with sk_refcnt +1) that should accept the child socket
+ * (or NULL on error).
+ */
+struct sock *reuseport_migrate_sock(struct sock *sk,
+ struct sock *migrating_sk,
+ struct sk_buff *skb)
+{
+ struct sock_reuseport *reuse;
+ struct sock *nsk = NULL;
+ bool allocated = false;
+ struct bpf_prog *prog;
+ u16 socks;
+ u32 hash;
+
+ rcu_read_lock();
+
+ reuse = rcu_dereference(sk->sk_reuseport_cb);
+ if (!reuse)
+ goto out;
+
+ socks = READ_ONCE(reuse->num_socks);
+ if (unlikely(!socks))
+ goto failure;
+
+ /* paired with smp_wmb() in __reuseport_add_sock() */
+ smp_rmb();
+
+ hash = migrating_sk->sk_hash;
+ prog = rcu_dereference(reuse->prog);
+ if (!prog || prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE) {
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_migrate_req))
+ goto select_by_hash;
+ goto failure;
+ }
+
+ if (!skb) {
+ skb = alloc_skb(0, GFP_ATOMIC);
+ if (!skb)
+ goto failure;
+ allocated = true;
+ }
+
+ nsk = bpf_run_sk_reuseport(reuse, sk, prog, skb, migrating_sk, hash);
+
+ if (allocated)
+ kfree_skb(skb);
+
+select_by_hash:
+ if (!nsk)
+ nsk = reuseport_select_sock_by_hash(reuse, hash, socks);
+
+ if (IS_ERR_OR_NULL(nsk) || unlikely(!refcount_inc_not_zero(&nsk->sk_refcnt))) {
+ nsk = NULL;
+ goto failure;
+ }
+
+out:
+ rcu_read_unlock();
+ return nsk;
+
+failure:
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
+ goto out;
+}
+EXPORT_SYMBOL(reuseport_migrate_sock);
+
int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
struct sock_reuseport *reuse;
struct bpf_prog *old_prog;
- if (sk_unhashed(sk) && sk->sk_reuseport) {
- int err = reuseport_alloc(sk, false);
+ if (sk_unhashed(sk)) {
+ int err;
+ if (!sk->sk_reuseport)
+ return -EINVAL;
+
+ err = reuseport_alloc(sk, false);
if (err)
return err;
} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
@@ -329,3 +711,38 @@ int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);
+
+int reuseport_detach_prog(struct sock *sk)
+{
+ struct sock_reuseport *reuse;
+ struct bpf_prog *old_prog;
+
+ old_prog = NULL;
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+
+ /* reuse must be checked after acquiring the reuseport_lock
+ * because reuseport_grow() can detach a closed sk.
+ */
+ if (!reuse) {
+ spin_unlock_bh(&reuseport_lock);
+ return sk->sk_reuseport ? -ENOENT : -EINVAL;
+ }
+
+ if (sk_unhashed(sk) && reuse->num_closed_socks) {
+ spin_unlock_bh(&reuseport_lock);
+ return -ENOENT;
+ }
+
+ old_prog = rcu_replace_pointer(reuse->prog, old_prog,
+ lockdep_is_held(&reuseport_lock));
+ spin_unlock_bh(&reuseport_lock);
+
+ if (!old_prog)
+ return -ENOENT;
+
+ sk_reuseport_prog_free(old_prog);
+ return 0;
+}
+EXPORT_SYMBOL(reuseport_detach_prog);
diff --git a/net/core/stream.c b/net/core/stream.c
index 7d329fb1f553..7a37e7dd2c43 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -23,16 +23,20 @@
/**
* sk_stream_write_space - stream socket write_space callback.
- * @sk: socket
+ * @sk: pointer to the socket structure
*
- * FIXME: write proper description
+ * This function is invoked when there's space available in the socket's
+ * send buffer for writing. It first checks if the socket is writable,
+ * clears the SOCK_NOSPACE flag indicating that memory for writing
+ * is now available, wakes up any processes waiting for write operations
+ * and sends asynchronous notifications if needed.
*/
void sk_stream_write_space(struct sock *sk)
{
struct socket *sock = sk->sk_socket;
struct socket_wq *wq;
- if (sk_stream_is_writeable(sk) && sock) {
+ if (__sk_stream_is_writeable(sk, 1) && sock) {
clear_bit(SOCK_NOSPACE, &sock->flags);
rcu_read_lock();
@@ -73,13 +77,13 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p)
add_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending++;
done = sk_wait_event(sk, timeo_p,
- !sk->sk_err &&
- !((1 << sk->sk_state) &
+ !READ_ONCE(sk->sk_err) &&
+ !((1 << READ_ONCE(sk->sk_state)) &
~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)), &wait);
remove_wait_queue(sk_sleep(sk), &wait);
sk->sk_write_pending--;
} while (!done);
- return 0;
+ return done < 0 ? done : 0;
}
EXPORT_SYMBOL(sk_stream_wait_connect);
@@ -87,9 +91,9 @@ EXPORT_SYMBOL(sk_stream_wait_connect);
* sk_stream_closing - Return 1 if we still have things to send in our buffers.
* @sk: socket to verify
*/
-static inline int sk_stream_closing(struct sock *sk)
+static int sk_stream_closing(const struct sock *sk)
{
- return (1 << sk->sk_state) &
+ return (1 << READ_ONCE(sk->sk_state)) &
(TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK);
}
@@ -117,14 +121,13 @@ EXPORT_SYMBOL(sk_stream_wait_close);
*/
int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
{
- int err = 0;
+ int ret, err = 0;
long vm_wait = 0;
long current_timeo = *timeo_p;
- bool noblock = (*timeo_p ? false : true);
DEFINE_WAIT_FUNC(wait, woken_wake_function);
if (sk_stream_memory_free(sk))
- current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
+ current_timeo = vm_wait = get_random_u32_below(HZ / 5) + 2;
add_wait_queue(sk_sleep(sk), &wait);
@@ -133,11 +136,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto do_error;
- if (!*timeo_p) {
- if (noblock)
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- goto do_nonblock;
- }
+ if (!*timeo_p)
+ goto do_eagain;
if (signal_pending(current))
goto do_interrupted;
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -146,11 +146,13 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
sk->sk_write_pending++;
- sk_wait_event(sk, &current_timeo, sk->sk_err ||
- (sk->sk_shutdown & SEND_SHUTDOWN) ||
- (sk_stream_memory_free(sk) &&
- !vm_wait), &wait);
+ ret = sk_wait_event(sk, &current_timeo, READ_ONCE(sk->sk_err) ||
+ (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) ||
+ (sk_stream_memory_free(sk) && !vm_wait),
+ &wait);
sk->sk_write_pending--;
+ if (ret < 0)
+ goto do_error;
if (vm_wait) {
vm_wait -= current_timeo;
@@ -163,13 +165,20 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
*timeo_p = current_timeo;
}
out:
- remove_wait_queue(sk_sleep(sk), &wait);
+ if (!sock_flag(sk, SOCK_DEAD))
+ remove_wait_queue(sk_sleep(sk), &wait);
return err;
do_error:
err = -EPIPE;
goto out;
-do_nonblock:
+do_eagain:
+ /* Make sure that whenever EAGAIN is returned, EPOLLOUT event can
+ * be generated later.
+ * When TCP receives ACK packets that make room, tcp_check_space()
+ * only calls tcp_new_space() if SOCK_NOSPACE is set.
+ */
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
err = -EAGAIN;
goto out;
do_interrupted:
@@ -193,17 +202,19 @@ void sk_stream_kill_queues(struct sock *sk)
/* First the read buffer. */
__skb_queue_purge(&sk->sk_receive_queue);
- /* Next, the error queue. */
- __skb_queue_purge(&sk->sk_error_queue);
+ /* Next, the error queue.
+ * We need to use queue lock, because other threads might
+ * add packets to the queue without socket lock being held.
+ */
+ skb_queue_purge(&sk->sk_error_queue);
/* Next, the write queue. */
- WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
+ WARN_ON_ONCE(!skb_queue_empty(&sk->sk_write_queue));
/* Account for returned memory. */
- sk_mem_reclaim(sk);
+ sk_mem_reclaim_final(sk);
- WARN_ON(sk->sk_wmem_queued);
- WARN_ON(sk->sk_forward_alloc);
+ WARN_ON_ONCE(sk->sk_wmem_queued);
/* It is _impossible_ for the backlog to contain anything
* when we get here. All user references to this socket
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index b1a2c5e38530..8d4decb2606f 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -6,6 +6,7 @@
* Added /proc/sys/net/core directory entry (empty =) ). [MS]
*/
+#include <linux/filter.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <linux/module.h>
@@ -15,28 +16,126 @@
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/sched/isolation.h>
#include <net/ip.h>
#include <net/sock.h>
#include <net/net_ratelimit.h>
#include <net/busy_poll.h>
#include <net/pkt_sched.h>
+#include <net/hotdata.h>
+#include <net/proto_memory.h>
+#include <net/rps.h>
-static int zero = 0;
-static int one = 1;
-static int two __maybe_unused = 2;
+#include "dev.h"
+#include "net-sysfs.h"
+
+static int int_3600 = 3600;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
static int max_skb_frags = MAX_SKB_FRAGS;
+static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE;
+static int netdev_budget_usecs_min = 2 * USEC_PER_SEC / HZ;
static int net_msg_warn; /* Unused, but still a sysctl */
int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;
EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
+/* 0 - Keep current behavior:
+ * IPv4: inherit all current settings from init_net
+ * IPv6: reset all settings to default
+ * 1 - Both inherit all current settings from init_net
+ * 2 - Both reset all settings to default
+ * 3 - Both inherit all settings from current netns
+ */
+int sysctl_devconf_inherit_init_net __read_mostly;
+EXPORT_SYMBOL(sysctl_devconf_inherit_init_net);
+
+#if IS_ENABLED(CONFIG_NET_FLOW_LIMIT) || IS_ENABLED(CONFIG_RPS)
+static int dump_cpumask(void *buffer, size_t *lenp, loff_t *ppos,
+ struct cpumask *mask)
+{
+ char *kbuf;
+ int len;
+
+ if (*ppos || !*lenp) {
+ *lenp = 0;
+ return 0;
+ }
+
+ /* CPUs are displayed as a hex bitmap + a comma between each groups of 8
+ * nibbles (except the last one which has a newline instead).
+ * Guesstimate the buffer size at the group granularity level.
+ */
+ len = min(DIV_ROUND_UP(nr_cpumask_bits, 32) * (8 + 1), *lenp);
+ kbuf = kmalloc(len, GFP_KERNEL);
+ if (!kbuf) {
+ *lenp = 0;
+ return -ENOMEM;
+ }
+
+ len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
+ if (!len) {
+ *lenp = 0;
+ goto free_buf;
+ }
+
+ /* scnprintf writes a trailing null char not counted in the returned
+ * length, override it with a newline.
+ */
+ kbuf[len++] = '\n';
+ memcpy(buffer, kbuf, len);
+ *lenp = len;
+ *ppos += len;
+
+free_buf:
+ kfree(kbuf);
+ return 0;
+}
+#endif
+
#ifdef CONFIG_RPS
-static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+
+DEFINE_MUTEX(rps_default_mask_mutex);
+
+static int rps_default_mask_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net = (struct net *)table->data;
+ struct cpumask *mask;
+ int err = 0;
+
+ mutex_lock(&rps_default_mask_mutex);
+ mask = net->core.rps_default_mask;
+ if (write) {
+ if (!mask) {
+ mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ net->core.rps_default_mask = mask;
+ }
+ err = -ENOMEM;
+ if (!mask)
+ goto done;
+
+ err = cpumask_parse(buffer, mask);
+ if (err)
+ goto done;
+
+ err = rps_cpumask_housekeeping(mask);
+ if (err)
+ goto done;
+ } else {
+ err = dump_cpumask(buffer, lenp, ppos,
+ mask ?: cpu_none_mask);
+ }
+
+done:
+ mutex_unlock(&rps_default_mask_mutex);
+ return err;
+}
+
+static int rps_sock_flow_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int orig_size, size;
int ret, i;
@@ -50,7 +149,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
mutex_lock(&sock_flow_mutex);
- orig_sock_table = rcu_dereference_protected(rps_sock_flow_table,
+ orig_sock_table = rcu_dereference_protected(
+ net_hotdata.rps_sock_flow_table,
lockdep_is_held(&sock_flow_mutex));
size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0;
@@ -71,7 +171,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
mutex_unlock(&sock_flow_mutex);
return -ENOMEM;
}
- rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1;
+ net_hotdata.rps_cpu_mask =
+ roundup_pow_of_two(nr_cpu_ids) - 1;
sock_table->mask = size - 1;
} else
sock_table = orig_sock_table;
@@ -82,16 +183,16 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
sock_table = NULL;
if (sock_table != orig_sock_table) {
- rcu_assign_pointer(rps_sock_flow_table, sock_table);
+ rcu_assign_pointer(net_hotdata.rps_sock_flow_table,
+ sock_table);
if (sock_table) {
- static_key_slow_inc(&rps_needed);
- static_key_slow_inc(&rfs_needed);
+ static_branch_inc(&rps_needed);
+ static_branch_inc(&rfs_needed);
}
if (orig_sock_table) {
- static_key_slow_dec(&rps_needed);
- static_key_slow_dec(&rfs_needed);
- synchronize_rcu();
- vfree(orig_sock_table);
+ static_branch_dec(&rps_needed);
+ static_branch_dec(&rfs_needed);
+ kvfree_rcu(orig_sock_table, rcu);
}
}
}
@@ -105,9 +206,8 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
#ifdef CONFIG_NET_FLOW_LIMIT
static DEFINE_MUTEX(flow_limit_update_mutex);
-static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static int flow_limit_cpu_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct sd_flow_limit *cur;
struct softnet_data *sd;
@@ -118,7 +218,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
return -ENOMEM;
if (write) {
- ret = cpumask_parse_user(buffer, *lenp, mask);
+ ret = cpumask_parse(buffer, mask);
if (ret)
goto done;
@@ -130,8 +230,7 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
lockdep_is_held(&flow_limit_update_mutex));
if (cur && !cpumask_test_cpu(i, mask)) {
RCU_INIT_POINTER(sd->flow_limit, NULL);
- synchronize_rcu();
- kfree(cur);
+ kfree_rcu(cur, rcu);
} else if (!cur && cpumask_test_cpu(i, mask)) {
cur = kzalloc_node(len, GFP_KERNEL,
cpu_to_node(i));
@@ -140,20 +239,13 @@ static int flow_limit_cpu_sysctl(struct ctl_table *table, int write,
ret = -ENOMEM;
goto write_unlock;
}
- cur->num_buckets = netdev_flow_limit_table_len;
+ cur->log_buckets = ilog2(netdev_flow_limit_table_len);
rcu_assign_pointer(sd->flow_limit, cur);
}
}
write_unlock:
mutex_unlock(&flow_limit_update_mutex);
} else {
- char kbuf[128];
-
- if (*ppos || !*lenp) {
- *lenp = 0;
- goto done;
- }
-
cpumask_clear(mask);
rcu_read_lock();
for_each_possible_cpu(i) {
@@ -163,20 +255,7 @@ write_unlock:
}
rcu_read_unlock();
- len = min(sizeof(kbuf) - 1, *lenp);
- len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask));
- if (!len) {
- *lenp = 0;
- goto done;
- }
- if (len < *lenp)
- kbuf[len++] = '\n';
- if (copy_to_user(buffer, kbuf, len)) {
- ret = -EFAULT;
- goto done;
- }
- *lenp = len;
- *ppos += len;
+ ret = dump_cpumask(buffer, lenp, ppos, mask);
}
done:
@@ -184,9 +263,8 @@ done:
return ret;
}
-static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static int flow_limit_table_len_sysctl(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int old, *ptr;
int ret;
@@ -207,8 +285,8 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write,
#endif /* CONFIG_NET_FLOW_LIMIT */
#ifdef CONFIG_NET_SCHED
-static int set_default_qdisc(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int set_default_qdisc(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
char id[IFNAMSIZ];
struct ctl_table tbl = {
@@ -226,23 +304,26 @@ static int set_default_qdisc(struct ctl_table *table, int write,
}
#endif
-static int proc_do_dev_weight(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int proc_do_dev_weight(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
- int ret;
-
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
- if (ret != 0)
- return ret;
-
- dev_rx_weight = weight_p * dev_weight_rx_bias;
- dev_tx_weight = weight_p * dev_weight_tx_bias;
+ static DEFINE_MUTEX(dev_weight_mutex);
+ int ret, weight;
+
+ mutex_lock(&dev_weight_mutex);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (!ret && write) {
+ weight = READ_ONCE(weight_p);
+ WRITE_ONCE(net_hotdata.dev_rx_weight, weight * dev_weight_rx_bias);
+ WRITE_ONCE(net_hotdata.dev_tx_weight, weight * dev_weight_tx_bias);
+ }
+ mutex_unlock(&dev_weight_mutex);
return ret;
}
-static int proc_do_rss_key(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int proc_do_rss_key(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table fake_table;
char buf[NETDEV_RSS_KEY_LEN * 3];
@@ -254,11 +335,13 @@ static int proc_do_rss_key(struct ctl_table *table, int write,
}
#ifdef CONFIG_BPF_JIT
-static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
+static int proc_dointvec_minmax_bpf_enable(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int ret, jit_enable = *(int *)table->data;
+ int min = *(int *)table->extra1;
+ int max = *(int *)table->extra2;
struct ctl_table tmp = *table;
if (write && !capable(CAP_SYS_ADMIN))
@@ -268,7 +351,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (write && !ret) {
if (jit_enable < 2 ||
- (jit_enable == 2 && bpf_dump_raw_ok())) {
+ (jit_enable == 2 && bpf_dump_raw_ok(current_cred()))) {
*(int *)table->data = jit_enable;
if (jit_enable == 2)
pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n");
@@ -276,56 +359,44 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write,
ret = -EPERM;
}
}
+
+ if (write && ret && min == max)
+ pr_info_once("CONFIG_BPF_JIT_ALWAYS_ON is enabled, bpf_jit_enable is permanently set to 1.\n");
+
return ret;
}
# ifdef CONFIG_HAVE_EBPF_JIT
static int
-proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+proc_dointvec_minmax_bpf_restricted(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
-# endif
+# endif /* CONFIG_HAVE_EBPF_JIT */
+
+static int
+proc_dolongvec_minmax_bpf_restricted(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
+}
#endif
static struct ctl_table net_core_table[] = {
-#ifdef CONFIG_NET
- {
- .procname = "wmem_max",
- .data = &sysctl_wmem_max,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_sndbuf,
- },
- {
- .procname = "rmem_max",
- .data = &sysctl_rmem_max,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_rcvbuf,
- },
{
- .procname = "wmem_default",
- .data = &sysctl_wmem_default,
+ .procname = "mem_pcpu_rsv",
+ .data = &net_hotdata.sysctl_mem_pcpu_rsv,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &min_sndbuf,
- },
- {
- .procname = "rmem_default",
- .data = &sysctl_rmem_default,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_rcvbuf,
+ .extra1 = &min_mem_pcpu_rsv,
},
{
.procname = "dev_weight",
@@ -333,6 +404,7 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_dev_weight,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "dev_weight_rx_bias",
@@ -340,6 +412,7 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_dev_weight,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "dev_weight_tx_bias",
@@ -347,10 +420,11 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_do_dev_weight,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "netdev_max_backlog",
- .data = &netdev_max_backlog,
+ .data = &net_hotdata.max_backlog,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
@@ -370,11 +444,11 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax_bpf_enable,
# ifdef CONFIG_BPF_JIT_ALWAYS_ON
- .extra1 = &one,
- .extra2 = &one,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE,
# else
- .extra1 = &zero,
- .extra2 = &two,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
# endif
},
# ifdef CONFIG_HAVE_EBPF_JIT
@@ -384,8 +458,8 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = proc_dointvec_minmax_bpf_restricted,
- .extra1 = &zero,
- .extra2 = &two,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
},
{
.procname = "bpf_jit_kallsyms",
@@ -393,14 +467,23 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(int),
.mode = 0600,
.proc_handler = proc_dointvec_minmax_bpf_restricted,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
# endif
+ {
+ .procname = "bpf_jit_limit",
+ .data = &bpf_jit_limit,
+ .maxlen = sizeof(long),
+ .mode = 0600,
+ .proc_handler = proc_dolongvec_minmax_bpf_restricted,
+ .extra1 = SYSCTL_LONG_ONE,
+ .extra2 = &bpf_jit_limit_max,
+ },
#endif
{
.procname = "netdev_tstamp_prequeue",
- .data = &netdev_tstamp_prequeue,
+ .data = &net_hotdata.tstamp_prequeue,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
@@ -419,22 +502,6 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
- .procname = "optmem_max",
- .data = &sysctl_optmem_max,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tstamp_allow_data",
- .data = &sysctl_tstamp_allow_data,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one
- },
#ifdef CONFIG_RPS
{
.procname = "rps_sock_flow_entries",
@@ -464,7 +531,7 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
{
.procname = "busy_read",
@@ -472,7 +539,7 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
#endif
#ifdef CONFIG_NET_SCHED
@@ -483,10 +550,9 @@ static struct ctl_table net_core_table[] = {
.proc_handler = set_default_qdisc
},
#endif
-#endif /* CONFIG_NET */
{
.procname = "netdev_budget",
- .data = &netdev_budget,
+ .data = &net_hotdata.netdev_budget,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
@@ -500,20 +566,20 @@ static struct ctl_table net_core_table[] = {
},
{
.procname = "max_skb_frags",
- .data = &sysctl_max_skb_frags,
+ .data = &net_hotdata.sysctl_max_skb_frags,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
+ .extra1 = SYSCTL_ONE,
.extra2 = &max_skb_frags,
},
{
.procname = "netdev_budget_usecs",
- .data = &netdev_budget_usecs,
+ .data = &net_hotdata.netdev_budget_usecs,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
+ .extra1 = &netdev_budget_usecs_min,
},
{
.procname = "fb_tunnels_only_for_init_net",
@@ -521,43 +587,184 @@ static struct ctl_table net_core_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "devconf_inherit_init_net",
+ .data = &sysctl_devconf_inherit_init_net,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_THREE,
+ },
+ {
+ .procname = "high_order_alloc_disable",
+ .data = &net_high_order_alloc_disable_key.key,
+ .maxlen = sizeof(net_high_order_alloc_disable_key),
+ .mode = 0644,
+ .proc_handler = proc_do_static_key,
+ },
+ {
+ .procname = "gro_normal_batch",
+ .data = &net_hotdata.gro_normal_batch,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
+ {
+ .procname = "netdev_unregister_timeout_secs",
+ .data = &netdev_unregister_timeout_secs,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = &int_3600,
+ },
+ {
+ .procname = "skb_defer_max",
+ .data = &net_hotdata.sysctl_skb_defer_max,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
},
- { }
};
static struct ctl_table netns_core_table[] = {
+#if IS_ENABLED(CONFIG_RPS)
+ {
+ .procname = "rps_default_mask",
+ .data = &init_net,
+ .mode = 0644,
+ .proc_handler = rps_default_mask_sysctl
+ },
+#endif
{
.procname = "somaxconn",
.data = &init_net.core.sysctl_somaxconn,
.maxlen = sizeof(int),
.mode = 0644,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
.proc_handler = proc_dointvec_minmax
},
- { }
+ {
+ .procname = "optmem_max",
+ .data = &init_net.core.sysctl_optmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .extra1 = SYSCTL_ZERO,
+ .proc_handler = proc_dointvec_minmax
+ },
+ {
+ .procname = "txrehash",
+ .data = &init_net.core.sysctl_txrehash,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "txq_reselection_ms",
+ .data = &init_net.core.sysctl_txq_reselection,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_ms_jiffies,
+ },
+ {
+ .procname = "tstamp_allow_data",
+ .data = &init_net.core.sysctl_tstamp_allow_data,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
+ },
+ {
+ .procname = "bypass_prot_mem",
+ .data = &init_net.core.sysctl_bypass_prot_mem,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
+ },
+ /* sysctl_core_net_init() will set the values after this
+ * to readonly in network namespaces
+ */
+ {
+ .procname = "wmem_max",
+ .data = &sysctl_wmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_sndbuf,
+ },
+ {
+ .procname = "rmem_max",
+ .data = &sysctl_rmem_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_rcvbuf,
+ },
+ {
+ .procname = "wmem_default",
+ .data = &sysctl_wmem_default,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_sndbuf,
+ },
+ {
+ .procname = "rmem_default",
+ .data = &sysctl_rmem_default,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &min_rcvbuf,
+ },
};
+static int __init fb_tunnels_only_for_init_net_sysctl_setup(char *str)
+{
+ /* fallback tunnels for initns only */
+ if (!strncmp(str, "initns", 6))
+ sysctl_fb_tunnels_only_for_init_net = 1;
+ /* no fallback tunnels anywhere */
+ else if (!strncmp(str, "none", 4))
+ sysctl_fb_tunnels_only_for_init_net = 2;
+
+ return 1;
+}
+__setup("fb_tunnels=", fb_tunnels_only_for_init_net_sysctl_setup);
+
static __net_init int sysctl_core_net_init(struct net *net)
{
+ size_t table_size = ARRAY_SIZE(netns_core_table);
struct ctl_table *tbl;
tbl = netns_core_table;
if (!net_eq(net, &init_net)) {
+ int i;
tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL);
if (tbl == NULL)
goto err_dup;
- tbl[0].data = &net->core.sysctl_somaxconn;
+ for (i = 0; i < table_size; ++i) {
+ if (tbl[i].data == &sysctl_wmem_max)
+ break;
- /* Don't export any sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns) {
- tbl[0].procname = NULL;
+ tbl[i].data += (char *)net - (char *)&init_net;
}
+ for (; i < table_size; ++i)
+ tbl[i].mode &= ~0222;
}
- net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl);
+ net->core.sysctl_hdr = register_net_sysctl_sz(net, "net/core", tbl, table_size);
if (net->core.sysctl_hdr == NULL)
goto err_reg;
@@ -572,11 +779,14 @@ err_dup:
static __net_exit void sysctl_core_net_exit(struct net *net)
{
- struct ctl_table *tbl;
+ const struct ctl_table *tbl;
tbl = net->core.sysctl_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->core.sysctl_hdr);
BUG_ON(tbl == netns_core_table);
+#if IS_ENABLED(CONFIG_RPS)
+ kfree(net->core.rps_default_mask);
+#endif
kfree(tbl);
}
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 42689d5c468c..a50a7ef49ae8 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -1,32 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PTP 1588 clock support - support for timestamping in PHY devices
*
* Copyright (C) 2010 OMICRON electronics GmbH
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/errqueue.h>
#include <linux/phy.h>
#include <linux/ptp_classify.h>
#include <linux/skbuff.h>
#include <linux/export.h>
+#include <linux/ptp_clock_kernel.h>
static unsigned int classify(const struct sk_buff *skb)
{
if (likely(skb->dev && skb->dev->phydev &&
- skb->dev->phydev->drv))
+ skb->dev->phydev->mii_ts))
return ptp_classify_raw(skb);
else
return PTP_CLASS_NONE;
@@ -34,35 +22,77 @@ static unsigned int classify(const struct sk_buff *skb)
void skb_clone_tx_timestamp(struct sk_buff *skb)
{
+ struct hwtstamp_provider *hwprov;
+ struct mii_timestamper *mii_ts;
struct phy_device *phydev;
struct sk_buff *clone;
unsigned int type;
- if (!skb->sk)
+ if (!skb->sk || !skb->dev)
return;
+ rcu_read_lock();
+ hwprov = rcu_dereference(skb->dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB ||
+ !hwprov->phydev) {
+ rcu_read_unlock();
+ return;
+ }
+
+ phydev = hwprov->phydev;
+ } else {
+ phydev = skb->dev->phydev;
+ if (!phy_is_default_hwtstamp(phydev)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+
type = classify(skb);
if (type == PTP_CLASS_NONE)
return;
- phydev = skb->dev->phydev;
- if (likely(phydev->drv->txtstamp)) {
+ mii_ts = phydev->mii_ts;
+ if (likely(mii_ts->txtstamp)) {
clone = skb_clone_sk(skb);
if (!clone)
return;
- phydev->drv->txtstamp(phydev, clone, type);
+ mii_ts->txtstamp(mii_ts, clone, type);
}
}
EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
+ struct hwtstamp_provider *hwprov;
+ struct mii_timestamper *mii_ts;
struct phy_device *phydev;
unsigned int type;
- if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->drv)
+ if (!skb->dev)
return false;
+ rcu_read_lock();
+ hwprov = rcu_dereference(skb->dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB ||
+ !hwprov->phydev) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ phydev = hwprov->phydev;
+ } else {
+ phydev = skb->dev->phydev;
+ if (!phy_is_default_hwtstamp(phydev)) {
+ rcu_read_unlock();
+ return false;
+ }
+ }
+ rcu_read_unlock();
+
if (skb_headroom(skb) < ETH_HLEN)
return false;
@@ -75,9 +105,9 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
if (type == PTP_CLASS_NONE)
return false;
- phydev = skb->dev->phydev;
- if (likely(phydev->drv->rxtstamp))
- return phydev->drv->rxtstamp(phydev, skb, type);
+ mii_ts = phydev->mii_ts;
+ if (likely(mii_ts->rxtstamp))
+ return mii_ts->rxtstamp(mii_ts, skb, type);
return false;
}
diff --git a/net/core/tso.c b/net/core/tso.c
index 43f4eba61933..6df997b9076e 100644
--- a/net/core/tso.c
+++ b/net/core/tso.c
@@ -3,21 +3,12 @@
#include <linux/if_vlan.h>
#include <net/ip.h>
#include <net/tso.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
-/* Calculate expected number of TX descriptors */
-int tso_count_descs(struct sk_buff *skb)
-{
- /* The Marvell Way */
- return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags;
-}
-EXPORT_SYMBOL(tso_count_descs);
-
-void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso,
+void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso,
int size, bool is_last)
{
- struct tcphdr *tcph;
- int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ int hdr_len = skb_transport_offset(skb) + tso->tlen;
int mac_hdr_len = skb_network_offset(skb);
memcpy(hdr, skb->data, hdr_len);
@@ -30,23 +21,31 @@ void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso,
} else {
struct ipv6hdr *iph = (void *)(hdr + mac_hdr_len);
- iph->payload_len = htons(size + tcp_hdrlen(skb));
+ iph->payload_len = htons(size + tso->tlen);
}
- tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb));
- put_unaligned_be32(tso->tcp_seq, &tcph->seq);
+ hdr += skb_transport_offset(skb);
+ if (tso->tlen != sizeof(struct udphdr)) {
+ struct tcphdr *tcph = (struct tcphdr *)hdr;
+
+ put_unaligned_be32(tso->tcp_seq, &tcph->seq);
+
+ if (!is_last) {
+ /* Clear all special flags for not last packet */
+ tcph->psh = 0;
+ tcph->fin = 0;
+ tcph->rst = 0;
+ }
+ } else {
+ struct udphdr *uh = (struct udphdr *)hdr;
- if (!is_last) {
- /* Clear all special flags for not last packet */
- tcph->psh = 0;
- tcph->fin = 0;
- tcph->rst = 0;
+ uh->len = htons(sizeof(*uh) + size);
}
}
EXPORT_SYMBOL(tso_build_hdr);
-void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size)
+void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size)
{
- tso->tcp_seq += size;
+ tso->tcp_seq += size; /* not worth avoiding this operation for UDP */
tso->size -= size;
tso->data += size;
@@ -55,19 +54,21 @@ void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size)
skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
/* Move to next segment */
- tso->size = frag->size;
- tso->data = page_address(frag->page.p) + frag->page_offset;
+ tso->size = skb_frag_size(frag);
+ tso->data = skb_frag_address(frag);
tso->next_frag_idx++;
}
}
EXPORT_SYMBOL(tso_build_data);
-void tso_start(struct sk_buff *skb, struct tso_t *tso)
+int tso_start(struct sk_buff *skb, struct tso_t *tso)
{
- int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ int tlen = skb_is_gso_tcp(skb) ? tcp_hdrlen(skb) : sizeof(struct udphdr);
+ int hdr_len = skb_transport_offset(skb) + tlen;
+ tso->tlen = tlen;
tso->ip_id = ntohs(ip_hdr(skb)->id);
- tso->tcp_seq = ntohl(tcp_hdr(skb)->seq);
+ tso->tcp_seq = (tlen != sizeof(struct udphdr)) ? ntohl(tcp_hdr(skb)->seq) : 0;
tso->next_frag_idx = 0;
tso->ipv6 = vlan_get_protocol(skb) == htons(ETH_P_IPV6);
@@ -79,9 +80,10 @@ void tso_start(struct sk_buff *skb, struct tso_t *tso)
skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
/* Move to next segment */
- tso->size = frag->size;
- tso->data = page_address(frag->page.p) + frag->page_offset;
+ tso->size = skb_frag_size(frag);
+ tso->data = skb_frag_address(frag);
tso->next_frag_idx++;
}
+ return hdr_len;
}
EXPORT_SYMBOL(tso_start);
diff --git a/net/core/utils.c b/net/core/utils.c
index 2a597ac7808e..5e63b0ea21f3 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * Generic address resultion entity
+ * Generic address resolution entity
*
* Authors:
* net_random Alan Cox
@@ -7,11 +8,6 @@
* in{4,6}_pton YOSHIFUJI Hideaki, Copyright (C)2006 USAGI/WIDE Project
*
* Created by Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
@@ -306,7 +302,7 @@ static int inet4_pton(const char *src, u16 port_num,
struct sockaddr_storage *addr)
{
struct sockaddr_in *addr4 = (struct sockaddr_in *)addr;
- int srclen = strlen(src);
+ size_t srclen = strlen(src);
if (srclen > INET_ADDRSTRLEN)
return -EINVAL;
@@ -326,7 +322,7 @@ static int inet6_pton(struct net *net, const char *src, u16 port_num,
{
struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr;
const char *scope_delim;
- int srclen = strlen(src);
+ size_t srclen = strlen(src);
if (srclen > INET6_ADDRSTRLEN)
return -EINVAL;
@@ -403,9 +399,9 @@ int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
}
EXPORT_SYMBOL(inet_pton_with_scope);
-bool inet_addr_is_any(struct sockaddr *addr)
+bool inet_addr_is_any(struct sockaddr_storage *addr)
{
- if (addr->sa_family == AF_INET6) {
+ if (addr->ss_family == AF_INET6) {
struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
const struct sockaddr_in6 in6_any =
{ .sin6_addr = IN6ADDR_ANY_INIT };
@@ -413,13 +409,13 @@ bool inet_addr_is_any(struct sockaddr *addr)
if (!memcmp(in6->sin6_addr.s6_addr,
in6_any.sin6_addr.s6_addr, 16))
return true;
- } else if (addr->sa_family == AF_INET) {
+ } else if (addr->ss_family == AF_INET) {
struct sockaddr_in *in = (struct sockaddr_in *)addr;
if (in->sin_addr.s_addr == htonl(INADDR_ANY))
return true;
} else {
- pr_warn("unexpected address family %u\n", addr->sa_family);
+ pr_warn("unexpected address family %u\n", addr->ss_family);
}
return false;
@@ -442,6 +438,23 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb,
}
EXPORT_SYMBOL(inet_proto_csum_replace4);
+/**
+ * inet_proto_csum_replace16 - update layer 4 header checksum field
+ * @sum: Layer 4 header checksum field
+ * @skb: sk_buff for the packet
+ * @from: old IPv6 address
+ * @to: new IPv6 address
+ * @pseudohdr: True if layer 4 header checksum includes pseudoheader
+ *
+ * Update layer 4 header as per the update in IPv6 src/dst address.
+ *
+ * There is no need to update skb->csum in this function, because update in two
+ * fields a.) IPv6 src/dst address and b.) L4 header checksum cancels each other
+ * for skb->csum calculation. Whereas inet_proto_csum_replace4 function needs to
+ * update skb->csum, because update in 3 fields a.) IPv4 src/dst address,
+ * b.) IPv4 Header checksum and c.) L4 header checksum results in same diff as
+ * L4 Header checksum for skb->csum calculation.
+ */
void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
const __be32 *from, const __be32 *to,
bool pseudohdr)
@@ -453,9 +466,6 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
if (skb->ip_summed != CHECKSUM_PARTIAL) {
*sum = csum_fold(csum_partial(diff, sizeof(diff),
~csum_unfold(*sum)));
- if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
- skb->csum = ~csum_partial(diff, sizeof(diff),
- ~skb->csum);
} else if (pseudohdr)
*sum = ~csum_fold(csum_partial(diff, sizeof(diff),
csum_unfold(*sum)));
@@ -463,12 +473,12 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb,
EXPORT_SYMBOL(inet_proto_csum_replace16);
void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb,
- __wsum diff, bool pseudohdr)
+ __wsum diff, bool pseudohdr, bool ipv6)
{
if (skb->ip_summed != CHECKSUM_PARTIAL) {
- *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum)));
- if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
- skb->csum = ~csum_add(diff, ~skb->csum);
+ csum_replace_by_diff(sum, diff);
+ if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr && !ipv6)
+ skb->csum = ~csum_sub(diff, skb->csum);
} else if (pseudohdr) {
*sum = ~csum_fold(csum_add(diff, csum_unfold(*sum)));
}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 89b6785cef2a..9100e160113a 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -1,9 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* net/core/xdp.c
*
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
- * Released under terms in GPL version 2. See COPYING.
*/
#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
#include <linux/filter.h>
#include <linux/types.h>
#include <linux/mm.h>
@@ -11,9 +13,15 @@
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/rhashtable.h>
-#include <net/page_pool.h>
+#include <linux/bug.h>
+#include <net/page_pool/helpers.h>
+#include <net/hotdata.h>
+#include <net/netdev_lock.h>
#include <net/xdp.h>
+#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
+#include <trace/events/xdp.h>
+#include <net/xdp_sock_drv.h>
#define REG_STATE_NEW 0x0
#define REG_STATE_REGISTERED 0x1
@@ -29,23 +37,12 @@ static int mem_id_next = MEM_ID_MIN;
static bool mem_id_init; /* false */
static struct rhashtable *mem_id_ht;
-struct xdp_mem_allocator {
- struct xdp_mem_info mem;
- union {
- void *allocator;
- struct page_pool *page_pool;
- struct zero_copy_allocator *zc_alloc;
- };
- struct rhash_head node;
- struct rcu_head rcu;
-};
-
static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
{
const u32 *k = data;
const u32 key = *k;
- BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id)
+ BUILD_BUG_ON(sizeof_field(struct xdp_mem_allocator, mem.id)
!= sizeof(u32));
/* Use cyclic increasing ID as direct hash key */
@@ -65,7 +62,7 @@ static const struct rhashtable_params mem_id_rht_params = {
.nelem_hint = 64,
.head_offset = offsetof(struct xdp_mem_allocator, node),
.key_offset = offsetof(struct xdp_mem_allocator, mem.id),
- .key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id),
+ .key_len = sizeof_field(struct xdp_mem_allocator, mem.id),
.max_size = MEM_ID_MAX,
.min_size = 8,
.automatic_shrinking = true,
@@ -80,36 +77,73 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
xa = container_of(rcu, struct xdp_mem_allocator, rcu);
/* Allow this ID to be reused */
- ida_simple_remove(&mem_id_pool, xa->mem.id);
+ ida_free(&mem_id_pool, xa->mem.id);
- /* Notice, driver is expected to free the *allocator,
- * e.g. page_pool, and MUST also use RCU free.
- */
+ kfree(xa);
+}
- /* Poison memory */
- xa->mem.id = 0xFFFF;
- xa->mem.type = 0xF0F0;
- xa->allocator = (void *)0xDEAD9001;
+static void mem_xa_remove(struct xdp_mem_allocator *xa)
+{
+ trace_mem_disconnect(xa);
- kfree(xa);
+ if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+ call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+}
+
+static void mem_allocator_disconnect(void *allocator)
+{
+ struct xdp_mem_allocator *xa;
+ struct rhashtable_iter iter;
+
+ mutex_lock(&mem_id_lock);
+
+ rhashtable_walk_enter(mem_id_ht, &iter);
+ do {
+ rhashtable_walk_start(&iter);
+
+ while ((xa = rhashtable_walk_next(&iter)) && !IS_ERR(xa)) {
+ if (xa->allocator == allocator)
+ mem_xa_remove(xa);
+ }
+
+ rhashtable_walk_stop(&iter);
+
+ } while (xa == ERR_PTR(-EAGAIN));
+ rhashtable_walk_exit(&iter);
+
+ mutex_unlock(&mem_id_lock);
}
-static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
+void xdp_unreg_mem_model(struct xdp_mem_info *mem)
{
struct xdp_mem_allocator *xa;
- int id = xdp_rxq->mem.id;
+ int type = mem->type;
+ int id = mem->id;
+
+ /* Reset mem info to defaults */
+ mem->id = 0;
+ mem->type = 0;
if (id == 0)
return;
- mutex_lock(&mem_id_lock);
+ if (type == MEM_TYPE_PAGE_POOL) {
+ xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
+ page_pool_destroy(xa->page_pool);
+ }
+}
+EXPORT_SYMBOL_GPL(xdp_unreg_mem_model);
- xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
- if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
- call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
+{
+ if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
+ WARN(1, "Missing register, driver bug");
+ return;
+ }
- mutex_unlock(&mem_id_lock);
+ xdp_unreg_mem_model(&xdp_rxq->mem);
}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
{
@@ -117,16 +151,10 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
if (xdp_rxq->reg_state == REG_STATE_UNUSED)
return;
- WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
-
- __xdp_rxq_info_unreg_mem_model(xdp_rxq);
+ xdp_rxq_info_unreg_mem_model(xdp_rxq);
xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
xdp_rxq->dev = NULL;
-
- /* Reset mem info to defaults */
- xdp_rxq->mem.id = 0;
- xdp_rxq->mem.type = 0;
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
@@ -136,9 +164,15 @@ static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq)
}
/* Returns 0 on success, negative on failure */
-int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
- struct net_device *dev, u32 queue_index)
+int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
+ struct net_device *dev, u32 queue_index,
+ unsigned int napi_id, u32 frag_size)
{
+ if (!dev) {
+ WARN(1, "Missing net_device from driver");
+ return -ENODEV;
+ }
+
if (xdp_rxq->reg_state == REG_STATE_UNUSED) {
WARN(1, "Driver promised not to register this");
return -EINVAL;
@@ -149,20 +183,16 @@ int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
xdp_rxq_info_unreg(xdp_rxq);
}
- if (!dev) {
- WARN(1, "Missing net_device from driver");
- return -ENODEV;
- }
-
/* State either UNREGISTERED or NEW */
xdp_rxq_info_init(xdp_rxq);
xdp_rxq->dev = dev;
xdp_rxq->queue_index = queue_index;
+ xdp_rxq->frag_size = frag_size;
xdp_rxq->reg_state = REG_STATE_REGISTERED;
return 0;
}
-EXPORT_SYMBOL_GPL(xdp_rxq_info_reg);
+EXPORT_SYMBOL_GPL(__xdp_rxq_info_reg);
void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq)
{
@@ -211,7 +241,7 @@ static int __mem_id_cyclic_get(gfp_t gfp)
int id;
again:
- id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp);
+ id = ida_alloc_range(&mem_id_pool, mem_id_next, MEM_ID_MAX - 1, gfp);
if (id < 0) {
if (id == -ENOSPC) {
/* Cyclic allocator, reset next id */
@@ -238,28 +268,24 @@ static bool __is_supported_mem_type(enum xdp_mem_type type)
return true;
}
-int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
- enum xdp_mem_type type, void *allocator)
+static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem,
+ enum xdp_mem_type type,
+ void *allocator)
{
struct xdp_mem_allocator *xdp_alloc;
gfp_t gfp = GFP_KERNEL;
int id, errno, ret;
void *ptr;
- if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
- WARN(1, "Missing register, driver bug");
- return -EFAULT;
- }
-
if (!__is_supported_mem_type(type))
- return -EOPNOTSUPP;
+ return ERR_PTR(-EOPNOTSUPP);
- xdp_rxq->mem.type = type;
+ mem->type = type;
if (!allocator) {
- if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY)
- return -EINVAL; /* Setup time check page_pool req */
- return 0;
+ if (type == MEM_TYPE_PAGE_POOL)
+ return ERR_PTR(-EINVAL); /* Setup time check page_pool req */
+ return NULL;
}
/* Delay init of rhashtable to save memory if feature isn't used */
@@ -267,15 +293,13 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
mutex_lock(&mem_id_lock);
ret = __mem_id_init_hash_table();
mutex_unlock(&mem_id_lock);
- if (ret < 0) {
- WARN_ON(1);
- return ret;
- }
+ if (ret < 0)
+ return ERR_PTR(ret);
}
xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
if (!xdp_alloc)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
mutex_lock(&mem_id_lock);
id = __mem_id_cyclic_get(gfp);
@@ -283,111 +307,261 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
errno = id;
goto err;
}
- xdp_rxq->mem.id = id;
- xdp_alloc->mem = xdp_rxq->mem;
+ mem->id = id;
+ xdp_alloc->mem = *mem;
xdp_alloc->allocator = allocator;
/* Insert allocator into ID lookup table */
ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
if (IS_ERR(ptr)) {
+ ida_free(&mem_id_pool, mem->id);
+ mem->id = 0;
errno = PTR_ERR(ptr);
goto err;
}
+ if (type == MEM_TYPE_PAGE_POOL)
+ page_pool_use_xdp_mem(allocator, mem_allocator_disconnect, mem);
+
mutex_unlock(&mem_id_lock);
- return 0;
+ return xdp_alloc;
err:
mutex_unlock(&mem_id_lock);
kfree(xdp_alloc);
- return errno;
+ return ERR_PTR(errno);
}
+
+int xdp_reg_mem_model(struct xdp_mem_info *mem,
+ enum xdp_mem_type type, void *allocator)
+{
+ struct xdp_mem_allocator *xdp_alloc;
+
+ xdp_alloc = __xdp_reg_mem_model(mem, type, allocator);
+ if (IS_ERR(xdp_alloc))
+ return PTR_ERR(xdp_alloc);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_reg_mem_model);
+
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+ enum xdp_mem_type type, void *allocator)
+{
+ struct xdp_mem_allocator *xdp_alloc;
+
+ if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
+ WARN(1, "Missing register, driver bug");
+ return -EFAULT;
+ }
+
+ xdp_alloc = __xdp_reg_mem_model(&xdp_rxq->mem, type, allocator);
+ if (IS_ERR(xdp_alloc))
+ return PTR_ERR(xdp_alloc);
+
+ if (type == MEM_TYPE_XSK_BUFF_POOL && allocator)
+ xsk_pool_set_rxq_info(allocator, xdp_rxq);
+
+ if (trace_mem_connect_enabled() && xdp_alloc)
+ trace_mem_connect(xdp_alloc, xdp_rxq);
+ return 0;
+}
+
EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
+/**
+ * xdp_reg_page_pool - register &page_pool as a memory provider for XDP
+ * @pool: &page_pool to register
+ *
+ * Can be used to register pools manually without connecting to any XDP RxQ
+ * info, so that the XDP layer will be aware of them. Then, they can be
+ * attached to an RxQ info manually via xdp_rxq_info_attach_page_pool().
+ *
+ * Return: %0 on success, -errno on error.
+ */
+int xdp_reg_page_pool(struct page_pool *pool)
+{
+ struct xdp_mem_info mem;
+
+ return xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pool);
+}
+EXPORT_SYMBOL_GPL(xdp_reg_page_pool);
+
+/**
+ * xdp_unreg_page_pool - unregister &page_pool from the memory providers list
+ * @pool: &page_pool to unregister
+ *
+ * A shorthand for manual unregistering page pools. If the pool was previously
+ * attached to an RxQ info, it must be detached first.
+ */
+void xdp_unreg_page_pool(const struct page_pool *pool)
+{
+ struct xdp_mem_info mem = {
+ .type = MEM_TYPE_PAGE_POOL,
+ .id = pool->xdp_mem_id,
+ };
+
+ xdp_unreg_mem_model(&mem);
+}
+EXPORT_SYMBOL_GPL(xdp_unreg_page_pool);
+
+/**
+ * xdp_rxq_info_attach_page_pool - attach registered pool to RxQ info
+ * @xdp_rxq: XDP RxQ info to attach the pool to
+ * @pool: pool to attach
+ *
+ * If the pool was registered manually, this function must be called instead
+ * of xdp_rxq_info_reg_mem_model() to connect it to the RxQ info.
+ */
+void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq,
+ const struct page_pool *pool)
+{
+ struct xdp_mem_info mem = {
+ .type = MEM_TYPE_PAGE_POOL,
+ .id = pool->xdp_mem_id,
+ };
+
+ xdp_rxq_info_attach_mem_model(xdp_rxq, &mem);
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool);
+
/* XDP RX runs under NAPI protection, and in different delivery error
* scenarios (e.g. queue full), it is possible to return the xdp_frame
- * while still leveraging this protection. The @napi_direct boolian
+ * while still leveraging this protection. The @napi_direct boolean
* is used for those calls sites. Thus, allowing for faster recycling
* of xdp_frames/pages in those cases.
*/
-static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
- unsigned long handle)
+void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type,
+ bool napi_direct, struct xdp_buff *xdp)
{
- struct xdp_mem_allocator *xa;
- struct page *page;
-
- switch (mem->type) {
+ switch (mem_type) {
case MEM_TYPE_PAGE_POOL:
- rcu_read_lock();
- /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
- xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- page = virt_to_head_page(data);
- if (xa) {
- napi_direct &= !xdp_return_frame_no_direct();
- page_pool_put_page(xa->page_pool, page, napi_direct);
- } else {
- put_page(page);
- }
- rcu_read_unlock();
+ netmem = netmem_compound_head(netmem);
+ if (napi_direct && xdp_return_frame_no_direct())
+ napi_direct = false;
+ /* No need to check netmem_is_pp() as mem->type knows this a
+ * page_pool page
+ */
+ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem,
+ napi_direct);
break;
case MEM_TYPE_PAGE_SHARED:
- page_frag_free(data);
+ page_frag_free(__netmem_address(netmem));
break;
case MEM_TYPE_PAGE_ORDER0:
- page = virt_to_page(data); /* Assumes order0 page*/
- put_page(page);
+ put_page(__netmem_to_page(netmem));
break;
- case MEM_TYPE_ZERO_COPY:
+ case MEM_TYPE_XSK_BUFF_POOL:
/* NB! Only valid from an xdp_buff! */
- rcu_read_lock();
- /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
- xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- xa->zc_alloc->free(xa->zc_alloc, handle);
- rcu_read_unlock();
+ xsk_buff_free(xdp);
+ break;
default:
/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
+ WARN(1, "Incorrect XDP memory type (%d) usage", mem_type);
break;
}
}
void xdp_return_frame(struct xdp_frame *xdpf)
{
- __xdp_return(xdpf->data, &xdpf->mem, false, 0);
+ struct skb_shared_info *sinfo;
+
+ if (likely(!xdp_frame_has_frags(xdpf)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
+ false, NULL);
+
+out:
+ __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
{
- __xdp_return(xdpf->data, &xdpf->mem, true, 0);
+ struct skb_shared_info *sinfo;
+
+ if (likely(!xdp_frame_has_frags(xdpf)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
+ true, NULL);
+
+out:
+ __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
-void xdp_return_buff(struct xdp_buff *xdp)
+/* XDP bulk APIs introduce a defer/flush mechanism to return
+ * pages belonging to the same xdp_mem_allocator object
+ * (identified via the mem.id field) in bulk to optimize
+ * I-cache and D-cache.
+ * The bulk queue size is set to 16 to be aligned to how
+ * XDP_REDIRECT bulking works. The bulk is flushed when
+ * it is full or when mem.id changes.
+ * xdp_frame_bulk is usually stored/allocated on the function
+ * call-stack to avoid locking penalties.
+ */
+
+/* Must be called with rcu_read_lock held */
+void xdp_return_frame_bulk(struct xdp_frame *xdpf,
+ struct xdp_frame_bulk *bq)
{
- __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
+ if (xdpf->mem_type != MEM_TYPE_PAGE_POOL) {
+ xdp_return_frame(xdpf);
+ return;
+ }
+
+ if (bq->count == XDP_BULK_QUEUE_SIZE)
+ xdp_flush_frame_bulk(bq);
+
+ if (unlikely(xdp_frame_has_frags(xdpf))) {
+ struct skb_shared_info *sinfo;
+ int i;
+
+ sinfo = xdp_get_shared_info_from_frame(xdpf);
+ for (i = 0; i < sinfo->nr_frags; i++) {
+ skb_frag_t *frag = &sinfo->frags[i];
+
+ bq->q[bq->count++] = skb_frag_netmem(frag);
+ if (bq->count == XDP_BULK_QUEUE_SIZE)
+ xdp_flush_frame_bulk(bq);
+ }
+ }
+ bq->q[bq->count++] = virt_to_netmem(xdpf->data);
}
-EXPORT_SYMBOL_GPL(xdp_return_buff);
+EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
-int xdp_attachment_query(struct xdp_attachment_info *info,
- struct netdev_bpf *bpf)
+/**
+ * xdp_return_frag -- free one XDP frag or decrement its refcount
+ * @netmem: network memory reference to release
+ * @xdp: &xdp_buff to release the frag for
+ */
+void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp)
{
- bpf->prog_id = info->prog ? info->prog->aux->id : 0;
- bpf->prog_flags = info->prog ? info->flags : 0;
- return 0;
+ __xdp_return(netmem, xdp->rxq->mem.type, true, NULL);
}
-EXPORT_SYMBOL_GPL(xdp_attachment_query);
+EXPORT_SYMBOL_GPL(xdp_return_frag);
-bool xdp_attachment_flags_ok(struct xdp_attachment_info *info,
- struct netdev_bpf *bpf)
+void xdp_return_buff(struct xdp_buff *xdp)
{
- if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) {
- NL_SET_ERR_MSG(bpf->extack,
- "program loaded with different flags");
- return false;
- }
- return true;
+ struct skb_shared_info *sinfo;
+
+ if (likely(!xdp_buff_has_frags(xdp)))
+ goto out;
+
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]),
+ xdp->rxq->mem.type, true, xdp);
+
+out:
+ __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp);
}
-EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok);
+EXPORT_SYMBOL_GPL(xdp_return_buff);
void xdp_attachment_setup(struct xdp_attachment_info *info,
struct netdev_bpf *bpf)
@@ -398,3 +572,484 @@ void xdp_attachment_setup(struct xdp_attachment_info *info,
info->flags = bpf->flags;
}
EXPORT_SYMBOL_GPL(xdp_attachment_setup);
+
+struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
+{
+ unsigned int metasize, totsize;
+ void *addr, *data_to_copy;
+ struct xdp_frame *xdpf;
+ struct page *page;
+
+ /* Clone into a MEM_TYPE_PAGE_ORDER0 xdp_frame. */
+ metasize = xdp_data_meta_unsupported(xdp) ? 0 :
+ xdp->data - xdp->data_meta;
+ totsize = xdp->data_end - xdp->data + metasize;
+
+ if (sizeof(*xdpf) + totsize > PAGE_SIZE)
+ return NULL;
+
+ page = dev_alloc_page();
+ if (!page)
+ return NULL;
+
+ addr = page_to_virt(page);
+ xdpf = addr;
+ memset(xdpf, 0, sizeof(*xdpf));
+
+ addr += sizeof(*xdpf);
+ data_to_copy = metasize ? xdp->data_meta : xdp->data;
+ memcpy(addr, data_to_copy, totsize);
+
+ xdpf->data = addr + metasize;
+ xdpf->len = totsize - metasize;
+ xdpf->headroom = 0;
+ xdpf->metasize = metasize;
+ xdpf->frame_sz = PAGE_SIZE;
+ xdpf->mem_type = MEM_TYPE_PAGE_ORDER0;
+
+ xsk_buff_free(xdp);
+ return xdpf;
+}
+EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame);
+
+/* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */
+void xdp_warn(const char *msg, const char *func, const int line)
+{
+ WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg);
+};
+EXPORT_SYMBOL_GPL(xdp_warn);
+
+/**
+ * xdp_build_skb_from_buff - create an skb from &xdp_buff
+ * @xdp: &xdp_buff to convert to an skb
+ *
+ * Perform common operations to create a new skb to pass up the stack from
+ * &xdp_buff: allocate an skb head from the NAPI percpu cache, initialize
+ * skb data pointers and offsets, set the recycle bit if the buff is
+ * PP-backed, Rx queue index, protocol and update frags info.
+ *
+ * Return: new &sk_buff on success, %NULL on error.
+ */
+struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp)
+{
+ const struct xdp_rxq_info *rxq = xdp->rxq;
+ const struct skb_shared_info *sinfo;
+ struct sk_buff *skb;
+ u32 nr_frags = 0;
+ int metalen;
+
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = sinfo->nr_frags;
+ }
+
+ skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb_reserve(skb, xdp->data - xdp->data_hard_start);
+ __skb_put(skb, xdp->data_end - xdp->data);
+
+ metalen = xdp->data - xdp->data_meta;
+ if (metalen > 0)
+ skb_metadata_set(skb, metalen);
+
+ if (rxq->mem.type == MEM_TYPE_PAGE_POOL)
+ skb_mark_for_recycle(skb);
+
+ skb_record_rx_queue(skb, rxq->queue_index);
+
+ if (unlikely(nr_frags)) {
+ u32 tsize;
+
+ tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz;
+ xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size,
+ tsize, xdp_buff_get_skb_flags(xdp));
+ }
+
+ skb->protocol = eth_type_trans(skb, rxq->dev);
+
+ return skb;
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_buff);
+
+/**
+ * xdp_copy_frags_from_zc - copy frags from XSk buff to skb
+ * @skb: skb to copy frags to
+ * @xdp: XSk &xdp_buff from which the frags will be copied
+ * @pp: &page_pool backing page allocation, if available
+ *
+ * Copy all frags from XSk &xdp_buff to the skb to pass it up the stack.
+ * Allocate a new buffer for each frag, copy it and attach to the skb.
+ *
+ * Return: true on success, false on netmem allocation failure.
+ */
+static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb,
+ const struct xdp_buff *xdp,
+ struct page_pool *pp)
+{
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+ const struct skb_shared_info *xinfo;
+ u32 nr_frags, tsize = 0;
+ u32 flags = 0;
+
+ xinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = xinfo->nr_frags;
+
+ for (u32 i = 0; i < nr_frags; i++) {
+ const skb_frag_t *frag = &xinfo->frags[i];
+ u32 len = skb_frag_size(frag);
+ u32 offset, truesize = len;
+ struct page *page;
+
+ page = page_pool_dev_alloc(pp, &offset, &truesize);
+ if (unlikely(!page)) {
+ sinfo->nr_frags = i;
+ return false;
+ }
+
+ memcpy(page_address(page) + offset, skb_frag_address(frag),
+ LARGEST_ALIGN(len));
+ __skb_fill_page_desc_noacc(sinfo, i, page, offset, len);
+
+ tsize += truesize;
+ if (page_is_pfmemalloc(page))
+ flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC;
+ }
+
+ xdp_update_skb_frags_info(skb, nr_frags, xinfo->xdp_frags_size, tsize,
+ flags);
+
+ return true;
+}
+
+/**
+ * xdp_build_skb_from_zc - create an skb from XSk &xdp_buff
+ * @xdp: source XSk buff
+ *
+ * Similar to xdp_build_skb_from_buff(), but for XSk frames. Allocate an skb
+ * head, new buffer for the head, copy the data and initialize the skb fields.
+ * If there are frags, allocate new buffers for them and copy.
+ * Buffers are allocated from the system percpu pools to try recycling them.
+ * If new skb was built successfully, @xdp is returned to XSk pool's freelist.
+ * On error, it remains untouched and the caller must take care of this.
+ *
+ * Return: new &sk_buff on success, %NULL on error.
+ */
+struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp)
+{
+ const struct xdp_rxq_info *rxq = xdp->rxq;
+ u32 len = xdp->data_end - xdp->data_meta;
+ u32 truesize = xdp->frame_sz;
+ struct sk_buff *skb = NULL;
+ struct page_pool *pp;
+ int metalen;
+ void *data;
+
+ if (!IS_ENABLED(CONFIG_PAGE_POOL))
+ return NULL;
+
+ local_lock_nested_bh(&system_page_pool.bh_lock);
+ pp = this_cpu_read(system_page_pool.pool);
+ data = page_pool_dev_alloc_va(pp, &truesize);
+ if (unlikely(!data))
+ goto out;
+
+ skb = napi_build_skb(data, truesize);
+ if (unlikely(!skb)) {
+ page_pool_free_va(pp, data, true);
+ goto out;
+ }
+
+ skb_mark_for_recycle(skb);
+ skb_reserve(skb, xdp->data_meta - xdp->data_hard_start);
+
+ memcpy(__skb_put(skb, len), xdp->data_meta, LARGEST_ALIGN(len));
+
+ metalen = xdp->data - xdp->data_meta;
+ if (metalen > 0) {
+ skb_metadata_set(skb, metalen);
+ __skb_pull(skb, metalen);
+ }
+
+ skb_record_rx_queue(skb, rxq->queue_index);
+
+ if (unlikely(xdp_buff_has_frags(xdp)) &&
+ unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) {
+ napi_consume_skb(skb, true);
+ skb = NULL;
+ goto out;
+ }
+
+ xsk_buff_free(xdp);
+
+ skb->protocol = eth_type_trans(skb, rxq->dev);
+
+out:
+ local_unlock_nested_bh(&system_page_pool.bh_lock);
+ return skb;
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc);
+
+struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
+ struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_frame(xdpf);
+ unsigned int headroom, frame_size;
+ void *hard_start;
+ u8 nr_frags;
+
+ /* xdp frags frame */
+ if (unlikely(xdp_frame_has_frags(xdpf)))
+ nr_frags = sinfo->nr_frags;
+
+ /* Part of headroom was reserved to xdpf */
+ headroom = sizeof(*xdpf) + xdpf->headroom;
+
+ /* Memory size backing xdp_frame data already has reserved
+ * room for build_skb to place skb_shared_info in tailroom.
+ */
+ frame_size = xdpf->frame_sz;
+
+ hard_start = xdpf->data - headroom;
+ skb = build_skb_around(skb, hard_start, frame_size);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb_reserve(skb, headroom);
+ __skb_put(skb, xdpf->len);
+ if (xdpf->metasize)
+ skb_metadata_set(skb, xdpf->metasize);
+
+ if (unlikely(xdp_frame_has_frags(xdpf)))
+ xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size,
+ nr_frags * xdpf->frame_sz,
+ xdp_frame_get_skb_flags(xdpf));
+
+ /* Essential SKB info: protocol and skb->dev */
+ skb->protocol = eth_type_trans(skb, dev);
+
+ /* Optional SKB info, currently missing:
+ * - HW checksum info (skb->ip_summed)
+ * - HW RX hash (skb_set_hash)
+ * - RX ring dev queue index (skb_record_rx_queue)
+ */
+
+ if (xdpf->mem_type == MEM_TYPE_PAGE_POOL)
+ skb_mark_for_recycle(skb);
+
+ /* Allow SKB to reuse area used by xdp_frame */
+ xdp_scrub_frame(xdpf);
+
+ return skb;
+}
+EXPORT_SYMBOL_GPL(__xdp_build_skb_from_frame);
+
+struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
+ struct net_device *dev)
+{
+ struct sk_buff *skb;
+
+ skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
+ if (unlikely(!skb))
+ return NULL;
+
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+
+ return __xdp_build_skb_from_frame(xdpf, skb, dev);
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_frame);
+
+struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
+{
+ unsigned int headroom, totalsize;
+ struct xdp_frame *nxdpf;
+ struct page *page;
+ void *addr;
+
+ headroom = xdpf->headroom + sizeof(*xdpf);
+ totalsize = headroom + xdpf->len;
+
+ if (unlikely(totalsize > PAGE_SIZE))
+ return NULL;
+ page = dev_alloc_page();
+ if (!page)
+ return NULL;
+ addr = page_to_virt(page);
+
+ memcpy(addr, xdpf, totalsize);
+
+ nxdpf = addr;
+ nxdpf->data = addr + headroom;
+ nxdpf->frame_sz = PAGE_SIZE;
+ nxdpf->mem_type = MEM_TYPE_PAGE_ORDER0;
+
+ return nxdpf;
+}
+
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_xdp_metadata_rx_timestamp - Read XDP frame RX timestamp.
+ * @ctx: XDP context pointer.
+ * @timestamp: Return value pointer.
+ *
+ * Return:
+ * * Returns 0 on success or ``-errno`` on error.
+ * * ``-EOPNOTSUPP`` : means device driver does not implement kfunc
+ * * ``-ENODATA`` : means no RX-timestamp available for this frame
+ */
+__bpf_kfunc int bpf_xdp_metadata_rx_timestamp(const struct xdp_md *ctx, u64 *timestamp)
+{
+ return -EOPNOTSUPP;
+}
+
+/**
+ * bpf_xdp_metadata_rx_hash - Read XDP frame RX hash.
+ * @ctx: XDP context pointer.
+ * @hash: Return value pointer.
+ * @rss_type: Return value pointer for RSS type.
+ *
+ * The RSS hash type (@rss_type) specifies what portion of packet headers NIC
+ * hardware used when calculating RSS hash value. The RSS type can be decoded
+ * via &enum xdp_rss_hash_type either matching on individual L3/L4 bits
+ * ``XDP_RSS_L*`` or by combined traditional *RSS Hashing Types*
+ * ``XDP_RSS_TYPE_L*``.
+ *
+ * Return:
+ * * Returns 0 on success or ``-errno`` on error.
+ * * ``-EOPNOTSUPP`` : means device driver doesn't implement kfunc
+ * * ``-ENODATA`` : means no RX-hash available for this frame
+ */
+__bpf_kfunc int bpf_xdp_metadata_rx_hash(const struct xdp_md *ctx, u32 *hash,
+ enum xdp_rss_hash_type *rss_type)
+{
+ return -EOPNOTSUPP;
+}
+
+/**
+ * bpf_xdp_metadata_rx_vlan_tag - Get XDP packet outermost VLAN tag
+ * @ctx: XDP context pointer.
+ * @vlan_proto: Destination pointer for VLAN Tag protocol identifier (TPID).
+ * @vlan_tci: Destination pointer for VLAN TCI (VID + DEI + PCP)
+ *
+ * In case of success, ``vlan_proto`` contains *Tag protocol identifier (TPID)*,
+ * usually ``ETH_P_8021Q`` or ``ETH_P_8021AD``, but some networks can use
+ * custom TPIDs. ``vlan_proto`` is stored in **network byte order (BE)**
+ * and should be used as follows:
+ * ``if (vlan_proto == bpf_htons(ETH_P_8021Q)) do_something();``
+ *
+ * ``vlan_tci`` contains the remaining 16 bits of a VLAN tag.
+ * Driver is expected to provide those in **host byte order (usually LE)**,
+ * so the bpf program should not perform byte conversion.
+ * According to 802.1Q standard, *VLAN TCI (Tag control information)*
+ * is a bit field that contains:
+ * *VLAN identifier (VID)* that can be read with ``vlan_tci & 0xfff``,
+ * *Drop eligible indicator (DEI)* - 1 bit,
+ * *Priority code point (PCP)* - 3 bits.
+ * For detailed meaning of DEI and PCP, please refer to other sources.
+ *
+ * Return:
+ * * Returns 0 on success or ``-errno`` on error.
+ * * ``-EOPNOTSUPP`` : device driver doesn't implement kfunc
+ * * ``-ENODATA`` : VLAN tag was not stripped or is not available
+ */
+__bpf_kfunc int bpf_xdp_metadata_rx_vlan_tag(const struct xdp_md *ctx,
+ __be16 *vlan_proto, u16 *vlan_tci)
+{
+ return -EOPNOTSUPP;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(xdp_metadata_kfunc_ids)
+#define XDP_METADATA_KFUNC(_, __, name, ___) BTF_ID_FLAGS(func, name, KF_TRUSTED_ARGS)
+XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+BTF_KFUNCS_END(xdp_metadata_kfunc_ids)
+
+static const struct btf_kfunc_id_set xdp_metadata_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &xdp_metadata_kfunc_ids,
+};
+
+BTF_ID_LIST(xdp_metadata_kfunc_ids_unsorted)
+#define XDP_METADATA_KFUNC(name, _, str, __) BTF_ID(func, str)
+XDP_METADATA_KFUNC_xxx
+#undef XDP_METADATA_KFUNC
+
+u32 bpf_xdp_metadata_kfunc_id(int id)
+{
+ /* xdp_metadata_kfunc_ids is sorted and can't be used */
+ return xdp_metadata_kfunc_ids_unsorted[id];
+}
+
+bool bpf_dev_bound_kfunc_id(u32 btf_id)
+{
+ return btf_id_set8_contains(&xdp_metadata_kfunc_ids, btf_id);
+}
+
+static int __init xdp_metadata_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &xdp_metadata_kfunc_set);
+}
+late_initcall(xdp_metadata_init);
+
+void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val)
+{
+ val &= NETDEV_XDP_ACT_MASK;
+ if (dev->xdp_features == val)
+ return;
+
+ netdev_assert_locked_or_invisible(dev);
+ dev->xdp_features = val;
+
+ if (dev->reg_state == NETREG_REGISTERED)
+ call_netdevice_notifiers(NETDEV_XDP_FEAT_CHANGE, dev);
+}
+EXPORT_SYMBOL_GPL(xdp_set_features_flag_locked);
+
+void xdp_set_features_flag(struct net_device *dev, xdp_features_t val)
+{
+ netdev_lock(dev);
+ xdp_set_features_flag_locked(dev, val);
+ netdev_unlock(dev);
+}
+EXPORT_SYMBOL_GPL(xdp_set_features_flag);
+
+void xdp_features_set_redirect_target_locked(struct net_device *dev,
+ bool support_sg)
+{
+ xdp_features_t val = (dev->xdp_features | NETDEV_XDP_ACT_NDO_XMIT);
+
+ if (support_sg)
+ val |= NETDEV_XDP_ACT_NDO_XMIT_SG;
+ xdp_set_features_flag_locked(dev, val);
+}
+EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target_locked);
+
+void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg)
+{
+ netdev_lock(dev);
+ xdp_features_set_redirect_target_locked(dev, support_sg);
+ netdev_unlock(dev);
+}
+EXPORT_SYMBOL_GPL(xdp_features_set_redirect_target);
+
+void xdp_features_clear_redirect_target_locked(struct net_device *dev)
+{
+ xdp_features_t val = dev->xdp_features;
+
+ val &= ~(NETDEV_XDP_ACT_NDO_XMIT | NETDEV_XDP_ACT_NDO_XMIT_SG);
+ xdp_set_features_flag_locked(dev, val);
+}
+EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target_locked);
+
+void xdp_features_clear_redirect_target(struct net_device *dev)
+{
+ netdev_lock(dev);
+ xdp_features_clear_redirect_target_locked(dev);
+ netdev_unlock(dev);
+}
+EXPORT_SYMBOL_GPL(xdp_features_clear_redirect_target);
diff --git a/net/dcb/Kconfig b/net/dcb/Kconfig
index 4066d59c8de5..efee8b9fe1d4 100644
--- a/net/dcb/Kconfig
+++ b/net/dcb/Kconfig
@@ -1,7 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
config DCB
bool "Data Center Bridging support"
default n
- ---help---
+ help
This enables support for configuring Data Center Bridging (DCB)
features on DCB capable Ethernet adapters via rtnetlink. Say 'Y'
if you have a DCB capable Ethernet adapter which supports this
diff --git a/net/dcb/Makefile b/net/dcb/Makefile
index c1282c9e64fa..2c0fa16ee2a9 100644
--- a/net/dcb/Makefile
+++ b/net/dcb/Makefile
@@ -1 +1,2 @@
-obj-$(CONFIG_DCB) += dcbnl.o dcbevent.o
+# SPDX-License-Identifier: GPL-2.0-only
+obj-y += dcbnl.o dcbevent.o
diff --git a/net/dcb/dcbevent.c b/net/dcb/dcbevent.c
index a520d8004d89..8620564c2b0b 100644
--- a/net/dcb/dcbevent.c
+++ b/net/dcb/dcbevent.c
@@ -1,18 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2010, Intel Corporation.
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
* Author: John Fastabend <john.r.fastabend@intel.com>
*/
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index a556cd708885..03eb1d941fca 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1,18 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2011, Intel Corporation.
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, see <http://www.gnu.org/licenses/>.
- *
* Description: Data Center Bridging netlink interface
* Author: Lucy Liu <lucy.liu@intel.com>
*/
@@ -177,6 +166,7 @@ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = {
[DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)},
[DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)},
[DCB_ATTR_DCB_BUFFER] = {.len = sizeof(struct dcbnl_buffer)},
+ [DCB_ATTR_DCB_APP_TRUST_TABLE] = {.type = NLA_NESTED},
};
/* DCB number of traffic classes nested attributes. */
@@ -188,8 +178,41 @@ static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = {
};
static LIST_HEAD(dcb_app_list);
+static LIST_HEAD(dcb_rewr_list);
static DEFINE_SPINLOCK(dcb_lock);
+static enum ieee_attrs_app dcbnl_app_attr_type_get(u8 selector)
+{
+ switch (selector) {
+ case IEEE_8021QAZ_APP_SEL_ETHERTYPE:
+ case IEEE_8021QAZ_APP_SEL_STREAM:
+ case IEEE_8021QAZ_APP_SEL_DGRAM:
+ case IEEE_8021QAZ_APP_SEL_ANY:
+ case IEEE_8021QAZ_APP_SEL_DSCP:
+ return DCB_ATTR_IEEE_APP;
+ case DCB_APP_SEL_PCP:
+ return DCB_ATTR_DCB_APP;
+ default:
+ return DCB_ATTR_IEEE_APP_UNSPEC;
+ }
+}
+
+static bool dcbnl_app_attr_type_validate(enum ieee_attrs_app type)
+{
+ switch (type) {
+ case DCB_ATTR_IEEE_APP:
+ case DCB_ATTR_DCB_APP:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool dcbnl_app_selector_validate(enum ieee_attrs_app type, u8 selector)
+{
+ return dcbnl_app_attr_type_get(selector) == type;
+}
+
static struct sk_buff *dcbnl_newmsg(int type, u8 cmd, u32 port, u32 seq,
u32 flags, struct nlmsghdr **nlhp)
{
@@ -241,12 +264,13 @@ static int dcbnl_getpfccfg(struct net_device *netdev, struct nlmsghdr *nlh,
if (!netdev->dcbnl_ops->getpfccfg)
return -EOPNOTSUPP;
- ret = nla_parse_nested(data, DCB_PFC_UP_ATTR_MAX,
- tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL);
+ ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX,
+ tb[DCB_ATTR_PFC_CFG],
+ dcbnl_pfc_up_nest, NULL);
if (ret)
return ret;
- nest = nla_nest_start(skb, DCB_ATTR_PFC_CFG);
+ nest = nla_nest_start_noflag(skb, DCB_ATTR_PFC_CFG);
if (!nest)
return -EMSGSIZE;
@@ -299,12 +323,13 @@ static int dcbnl_getcap(struct net_device *netdev, struct nlmsghdr *nlh,
if (!netdev->dcbnl_ops->getcap)
return -EOPNOTSUPP;
- ret = nla_parse_nested(data, DCB_CAP_ATTR_MAX, tb[DCB_ATTR_CAP],
- dcbnl_cap_nest, NULL);
+ ret = nla_parse_nested_deprecated(data, DCB_CAP_ATTR_MAX,
+ tb[DCB_ATTR_CAP], dcbnl_cap_nest,
+ NULL);
if (ret)
return ret;
- nest = nla_nest_start(skb, DCB_ATTR_CAP);
+ nest = nla_nest_start_noflag(skb, DCB_ATTR_CAP);
if (!nest)
return -EMSGSIZE;
@@ -343,12 +368,13 @@ static int dcbnl_getnumtcs(struct net_device *netdev, struct nlmsghdr *nlh,
if (!netdev->dcbnl_ops->getnumtcs)
return -EOPNOTSUPP;
- ret = nla_parse_nested(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS],
- dcbnl_numtcs_nest, NULL);
+ ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX,
+ tb[DCB_ATTR_NUMTCS],
+ dcbnl_numtcs_nest, NULL);
if (ret)
return ret;
- nest = nla_nest_start(skb, DCB_ATTR_NUMTCS);
+ nest = nla_nest_start_noflag(skb, DCB_ATTR_NUMTCS);
if (!nest)
return -EMSGSIZE;
@@ -388,8 +414,9 @@ static int dcbnl_setnumtcs(struct net_device *netdev, struct nlmsghdr *nlh,
if (!netdev->dcbnl_ops->setnumtcs)
return -EOPNOTSUPP;
- ret = nla_parse_nested(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS],
- dcbnl_numtcs_nest, NULL);
+ ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX,
+ tb[DCB_ATTR_NUMTCS],
+ dcbnl_numtcs_nest, NULL);
if (ret)
return ret;
@@ -447,8 +474,9 @@ static int dcbnl_getapp(struct net_device *netdev, struct nlmsghdr *nlh,
if (!tb[DCB_ATTR_APP])
return -EINVAL;
- ret = nla_parse_nested(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP],
- dcbnl_app_nest, NULL);
+ ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX,
+ tb[DCB_ATTR_APP], dcbnl_app_nest,
+ NULL);
if (ret)
return ret;
@@ -479,7 +507,7 @@ static int dcbnl_getapp(struct net_device *netdev, struct nlmsghdr *nlh,
up = dcb_getapp(netdev, &app);
}
- app_nest = nla_nest_start(skb, DCB_ATTR_APP);
+ app_nest = nla_nest_start_noflag(skb, DCB_ATTR_APP);
if (!app_nest)
return -EMSGSIZE;
@@ -515,8 +543,9 @@ static int dcbnl_setapp(struct net_device *netdev, struct nlmsghdr *nlh,
if (!tb[DCB_ATTR_APP])
return -EINVAL;
- ret = nla_parse_nested(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP],
- dcbnl_app_nest, NULL);
+ ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX,
+ tb[DCB_ATTR_APP], dcbnl_app_nest,
+ NULL);
if (ret)
return ret;
@@ -573,12 +602,13 @@ static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlmsghdr *nlh,
!netdev->dcbnl_ops->getpgbwgcfgrx)
return -EOPNOTSUPP;
- ret = nla_parse_nested(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG],
- dcbnl_pg_nest, NULL);
+ ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX,
+ tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest,
+ NULL);
if (ret)
return ret;
- pg_nest = nla_nest_start(skb, DCB_ATTR_PG_CFG);
+ pg_nest = nla_nest_start_noflag(skb, DCB_ATTR_PG_CFG);
if (!pg_nest)
return -EMSGSIZE;
@@ -593,12 +623,13 @@ static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlmsghdr *nlh,
data = pg_tb[DCB_PG_ATTR_TC_ALL];
else
data = pg_tb[i];
- ret = nla_parse_nested(param_tb, DCB_TC_ATTR_PARAM_MAX, data,
- dcbnl_tc_param_nest, NULL);
+ ret = nla_parse_nested_deprecated(param_tb,
+ DCB_TC_ATTR_PARAM_MAX, data,
+ dcbnl_tc_param_nest, NULL);
if (ret)
goto err_pg;
- param_nest = nla_nest_start(skb, i);
+ param_nest = nla_nest_start_noflag(skb, i);
if (!param_nest)
goto err_pg;
@@ -730,8 +761,9 @@ static int dcbnl_setpfccfg(struct net_device *netdev, struct nlmsghdr *nlh,
if (!netdev->dcbnl_ops->setpfccfg)
return -EOPNOTSUPP;
- ret = nla_parse_nested(data, DCB_PFC_UP_ATTR_MAX,
- tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL);
+ ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX,
+ tb[DCB_ATTR_PFC_CFG],
+ dcbnl_pfc_up_nest, NULL);
if (ret)
return ret;
@@ -786,8 +818,9 @@ static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlmsghdr *nlh,
!netdev->dcbnl_ops->setpgbwgcfgrx)
return -EOPNOTSUPP;
- ret = nla_parse_nested(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG],
- dcbnl_pg_nest, NULL);
+ ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX,
+ tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest,
+ NULL);
if (ret)
return ret;
@@ -795,8 +828,10 @@ static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlmsghdr *nlh,
if (!pg_tb[i])
continue;
- ret = nla_parse_nested(param_tb, DCB_TC_ATTR_PARAM_MAX,
- pg_tb[i], dcbnl_tc_param_nest, NULL);
+ ret = nla_parse_nested_deprecated(param_tb,
+ DCB_TC_ATTR_PARAM_MAX,
+ pg_tb[i],
+ dcbnl_tc_param_nest, NULL);
if (ret)
return ret;
@@ -884,12 +919,13 @@ static int dcbnl_bcn_getcfg(struct net_device *netdev, struct nlmsghdr *nlh,
!netdev->dcbnl_ops->getbcncfg)
return -EOPNOTSUPP;
- ret = nla_parse_nested(bcn_tb, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN],
- dcbnl_bcn_nest, NULL);
+ ret = nla_parse_nested_deprecated(bcn_tb, DCB_BCN_ATTR_MAX,
+ tb[DCB_ATTR_BCN], dcbnl_bcn_nest,
+ NULL);
if (ret)
return ret;
- bcn_nest = nla_nest_start(skb, DCB_ATTR_BCN);
+ bcn_nest = nla_nest_start_noflag(skb, DCB_ATTR_BCN);
if (!bcn_nest)
return -EMSGSIZE;
@@ -943,8 +979,9 @@ static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlmsghdr *nlh,
!netdev->dcbnl_ops->setbcnrp)
return -EOPNOTSUPP;
- ret = nla_parse_nested(data, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN],
- dcbnl_pfc_up_nest, NULL);
+ ret = nla_parse_nested_deprecated(data, DCB_BCN_ATTR_MAX,
+ tb[DCB_ATTR_BCN], dcbnl_bcn_nest,
+ NULL);
if (ret)
return ret;
@@ -1002,7 +1039,7 @@ static int dcbnl_build_peer_app(struct net_device *netdev, struct sk_buff* skb,
*/
err = -EMSGSIZE;
- app = nla_nest_start(skb, app_nested_type);
+ app = nla_nest_start_noflag(skb, app_nested_type);
if (!app)
goto nla_put_failure;
@@ -1024,19 +1061,93 @@ nla_put_failure:
return err;
}
+static int dcbnl_getapptrust(struct net_device *netdev, struct sk_buff *skb)
+{
+ const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+ enum ieee_attrs_app type;
+ struct nlattr *apptrust;
+ int nselectors, err, i;
+ u8 *selectors;
+
+ selectors = kzalloc(IEEE_8021QAZ_APP_SEL_MAX + 1, GFP_KERNEL);
+ if (!selectors)
+ return -ENOMEM;
+
+ err = ops->dcbnl_getapptrust(netdev, selectors, &nselectors);
+ if (err) {
+ err = 0;
+ goto out;
+ }
+
+ apptrust = nla_nest_start(skb, DCB_ATTR_DCB_APP_TRUST_TABLE);
+ if (!apptrust) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ for (i = 0; i < nselectors; i++) {
+ type = dcbnl_app_attr_type_get(selectors[i]);
+ err = nla_put_u8(skb, type, selectors[i]);
+ if (err) {
+ nla_nest_cancel(skb, apptrust);
+ goto out;
+ }
+ }
+ nla_nest_end(skb, apptrust);
+
+out:
+ kfree(selectors);
+ return err;
+}
+
+/* Set or delete APP table or rewrite table entries. The APP struct is validated
+ * and the appropriate callback function is called.
+ */
+static int dcbnl_app_table_setdel(struct nlattr *attr,
+ struct net_device *netdev,
+ int (*setdel)(struct net_device *dev,
+ struct dcb_app *app))
+{
+ struct dcb_app *app_data;
+ enum ieee_attrs_app type;
+ struct nlattr *attr_itr;
+ int rem, err;
+
+ nla_for_each_nested(attr_itr, attr, rem) {
+ type = nla_type(attr_itr);
+
+ if (!dcbnl_app_attr_type_validate(type))
+ continue;
+
+ if (nla_len(attr_itr) < sizeof(struct dcb_app))
+ return -ERANGE;
+
+ app_data = nla_data(attr_itr);
+
+ if (!dcbnl_app_selector_validate(type, app_data->selector))
+ return -EINVAL;
+
+ err = setdel(netdev, app_data);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
/* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. */
static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
{
- struct nlattr *ieee, *app;
- struct dcb_app_type *itr;
const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
+ struct nlattr *ieee, *app, *rewr;
+ struct dcb_app_type *itr;
int dcbx;
int err;
if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name))
return -EMSGSIZE;
- ieee = nla_nest_start(skb, DCB_ATTR_IEEE);
+ ieee = nla_nest_start_noflag(skb, DCB_ATTR_IEEE);
if (!ieee)
return -EMSGSIZE;
@@ -1106,15 +1217,16 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
return -EMSGSIZE;
}
- app = nla_nest_start(skb, DCB_ATTR_IEEE_APP_TABLE);
+ app = nla_nest_start_noflag(skb, DCB_ATTR_IEEE_APP_TABLE);
if (!app)
return -EMSGSIZE;
spin_lock_bh(&dcb_lock);
list_for_each_entry(itr, &dcb_app_list, list) {
if (itr->ifindex == netdev->ifindex) {
- err = nla_put(skb, DCB_ATTR_IEEE_APP, sizeof(itr->app),
- &itr->app);
+ enum ieee_attrs_app type =
+ dcbnl_app_attr_type_get(itr->app.selector);
+ err = nla_put(skb, type, sizeof(itr->app), &itr->app);
if (err) {
spin_unlock_bh(&dcb_lock);
return -EMSGSIZE;
@@ -1130,6 +1242,33 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev)
spin_unlock_bh(&dcb_lock);
nla_nest_end(skb, app);
+ rewr = nla_nest_start(skb, DCB_ATTR_DCB_REWR_TABLE);
+ if (!rewr)
+ return -EMSGSIZE;
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_rewr_list, list) {
+ if (itr->ifindex == netdev->ifindex) {
+ enum ieee_attrs_app type =
+ dcbnl_app_attr_type_get(itr->app.selector);
+ err = nla_put(skb, type, sizeof(itr->app), &itr->app);
+ if (err) {
+ spin_unlock_bh(&dcb_lock);
+ nla_nest_cancel(skb, rewr);
+ return -EMSGSIZE;
+ }
+ }
+ }
+
+ spin_unlock_bh(&dcb_lock);
+ nla_nest_end(skb, rewr);
+
+ if (ops->dcbnl_getapptrust) {
+ err = dcbnl_getapptrust(netdev, skb);
+ if (err)
+ return err;
+ }
+
/* get peer info if available */
if (ops->ieee_peer_getets) {
struct ieee_ets ets;
@@ -1174,13 +1313,13 @@ static int dcbnl_cee_pg_fill(struct sk_buff *skb, struct net_device *dev,
u8 pgid, up_map, prio, tc_pct;
const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops;
int i = dir ? DCB_ATTR_CEE_TX_PG : DCB_ATTR_CEE_RX_PG;
- struct nlattr *pg = nla_nest_start(skb, i);
+ struct nlattr *pg = nla_nest_start_noflag(skb, i);
if (!pg)
return -EMSGSIZE;
for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) {
- struct nlattr *tc_nest = nla_nest_start(skb, i);
+ struct nlattr *tc_nest = nla_nest_start_noflag(skb, i);
if (!tc_nest)
return -EMSGSIZE;
@@ -1231,7 +1370,7 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name))
goto nla_put_failure;
- cee = nla_nest_start(skb, DCB_ATTR_CEE);
+ cee = nla_nest_start_noflag(skb, DCB_ATTR_CEE);
if (!cee)
goto nla_put_failure;
@@ -1250,7 +1389,8 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
/* local pfc */
if (ops->getpfccfg) {
- struct nlattr *pfc_nest = nla_nest_start(skb, DCB_ATTR_CEE_PFC);
+ struct nlattr *pfc_nest = nla_nest_start_noflag(skb,
+ DCB_ATTR_CEE_PFC);
if (!pfc_nest)
goto nla_put_failure;
@@ -1265,14 +1405,14 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
/* local app */
spin_lock_bh(&dcb_lock);
- app = nla_nest_start(skb, DCB_ATTR_CEE_APP_TABLE);
+ app = nla_nest_start_noflag(skb, DCB_ATTR_CEE_APP_TABLE);
if (!app)
goto dcb_unlock;
list_for_each_entry(itr, &dcb_app_list, list) {
if (itr->ifindex == netdev->ifindex) {
- struct nlattr *app_nest = nla_nest_start(skb,
- DCB_ATTR_APP);
+ struct nlattr *app_nest = nla_nest_start_noflag(skb,
+ DCB_ATTR_APP);
if (!app_nest)
goto dcb_unlock;
@@ -1305,7 +1445,8 @@ static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev)
/* features flags */
if (ops->getfeatcfg) {
- struct nlattr *feat = nla_nest_start(skb, DCB_ATTR_CEE_FEAT);
+ struct nlattr *feat = nla_nest_start_noflag(skb,
+ DCB_ATTR_CEE_FEAT);
if (!feat)
goto nla_put_failure;
@@ -1376,7 +1517,7 @@ static int dcbnl_notify(struct net_device *dev, int event, int cmd,
skb = dcbnl_newmsg(event, cmd, portid, seq, 0, &nlh);
if (!skb)
- return -ENOBUFS;
+ return -ENOMEM;
if (dcbx_ver == DCB_CAP_DCBX_VER_IEEE)
err = dcbnl_ieee_fill(skb, dev);
@@ -1421,6 +1562,7 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh,
{
const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops;
struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1];
+ int prio;
int err;
if (!ops)
@@ -1429,8 +1571,9 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh,
if (!tb[DCB_ATTR_IEEE])
return -EINVAL;
- err = nla_parse_nested(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE],
- dcbnl_ieee_policy, NULL);
+ err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX,
+ tb[DCB_ATTR_IEEE],
+ dcbnl_ieee_policy, NULL);
if (err)
return err;
@@ -1469,34 +1612,79 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh,
struct dcbnl_buffer *buffer =
nla_data(ieee[DCB_ATTR_DCB_BUFFER]);
+ for (prio = 0; prio < ARRAY_SIZE(buffer->prio2buffer); prio++) {
+ if (buffer->prio2buffer[prio] >= DCBX_MAX_BUFFERS) {
+ err = -EINVAL;
+ goto err;
+ }
+ }
+
err = ops->dcbnl_setbuffer(netdev, buffer);
if (err)
goto err;
}
+ if (ieee[DCB_ATTR_DCB_REWR_TABLE]) {
+ err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE],
+ netdev,
+ ops->dcbnl_setrewr ?: dcb_setrewr);
+ if (err)
+ goto err;
+ }
+
if (ieee[DCB_ATTR_IEEE_APP_TABLE]) {
+ err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE],
+ netdev, ops->ieee_setapp ?:
+ dcb_ieee_setapp);
+ if (err)
+ goto err;
+ }
+
+ if (ieee[DCB_ATTR_DCB_APP_TRUST_TABLE]) {
+ u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0};
struct nlattr *attr;
+ int nselectors = 0;
int rem;
- nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) {
- struct dcb_app *app_data;
+ if (!ops->dcbnl_setapptrust) {
+ err = -EOPNOTSUPP;
+ goto err;
+ }
- if (nla_type(attr) != DCB_ATTR_IEEE_APP)
- continue;
+ nla_for_each_nested(attr, ieee[DCB_ATTR_DCB_APP_TRUST_TABLE],
+ rem) {
+ enum ieee_attrs_app type = nla_type(attr);
+ u8 selector;
+ int i;
- if (nla_len(attr) < sizeof(struct dcb_app)) {
- err = -ERANGE;
+ if (!dcbnl_app_attr_type_validate(type) ||
+ nla_len(attr) != 1 ||
+ nselectors >= sizeof(selectors)) {
+ err = -EINVAL;
goto err;
}
- app_data = nla_data(attr);
- if (ops->ieee_setapp)
- err = ops->ieee_setapp(netdev, app_data);
- else
- err = dcb_ieee_setapp(netdev, app_data);
- if (err)
+ selector = nla_get_u8(attr);
+
+ if (!dcbnl_app_selector_validate(type, selector)) {
+ err = -EINVAL;
goto err;
+ }
+
+ /* Duplicate selector ? */
+ for (i = 0; i < nselectors; i++) {
+ if (selectors[i] == selector) {
+ err = -EINVAL;
+ goto err;
+ }
+ }
+
+ selectors[nselectors++] = selector;
}
+
+ err = ops->dcbnl_setapptrust(netdev, selectors, nselectors);
+ if (err)
+ goto err;
}
err:
@@ -1529,28 +1717,26 @@ static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh,
if (!tb[DCB_ATTR_IEEE])
return -EINVAL;
- err = nla_parse_nested(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE],
- dcbnl_ieee_policy, NULL);
+ err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX,
+ tb[DCB_ATTR_IEEE],
+ dcbnl_ieee_policy, NULL);
if (err)
return err;
if (ieee[DCB_ATTR_IEEE_APP_TABLE]) {
- struct nlattr *attr;
- int rem;
-
- nla_for_each_nested(attr, ieee[DCB_ATTR_IEEE_APP_TABLE], rem) {
- struct dcb_app *app_data;
+ err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE],
+ netdev, ops->ieee_delapp ?:
+ dcb_ieee_delapp);
+ if (err)
+ goto err;
+ }
- if (nla_type(attr) != DCB_ATTR_IEEE_APP)
- continue;
- app_data = nla_data(attr);
- if (ops->ieee_delapp)
- err = ops->ieee_delapp(netdev, app_data);
- else
- err = dcb_ieee_delapp(netdev, app_data);
- if (err)
- goto err;
- }
+ if (ieee[DCB_ATTR_DCB_REWR_TABLE]) {
+ err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE],
+ netdev,
+ ops->dcbnl_delrewr ?: dcb_delrewr);
+ if (err)
+ goto err;
}
err:
@@ -1602,12 +1788,13 @@ static int dcbnl_getfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh,
if (!tb[DCB_ATTR_FEATCFG])
return -EINVAL;
- ret = nla_parse_nested(data, DCB_FEATCFG_ATTR_MAX,
- tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL);
+ ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX,
+ tb[DCB_ATTR_FEATCFG],
+ dcbnl_featcfg_nest, NULL);
if (ret)
return ret;
- nest = nla_nest_start(skb, DCB_ATTR_FEATCFG);
+ nest = nla_nest_start_noflag(skb, DCB_ATTR_FEATCFG);
if (!nest)
return -EMSGSIZE;
@@ -1646,8 +1833,9 @@ static int dcbnl_setfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh,
if (!tb[DCB_ATTR_FEATCFG])
return -EINVAL;
- ret = nla_parse_nested(data, DCB_FEATCFG_ATTR_MAX,
- tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL);
+ ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX,
+ tb[DCB_ATTR_FEATCFG],
+ dcbnl_featcfg_nest, NULL);
if (ret)
goto err;
@@ -1727,7 +1915,7 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net_device *netdev;
struct dcbmsg *dcb = nlmsg_data(nlh);
struct nlattr *tb[DCB_ATTR_MAX + 1];
- u32 portid = skb ? NETLINK_CB(skb).portid : 0;
+ u32 portid = NETLINK_CB(skb).portid;
int ret = -EINVAL;
struct sk_buff *reply_skb;
struct nlmsghdr *reply_nlh = NULL;
@@ -1736,8 +1924,8 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
if ((nlh->nlmsg_type == RTM_SETDCB) && !netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;
- ret = nlmsg_parse(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX,
- dcbnl_rtnl_policy, extack);
+ ret = nlmsg_parse_deprecated(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX,
+ dcbnl_rtnl_policy, extack);
if (ret < 0)
return ret;
@@ -1748,6 +1936,8 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
fn = &reply_funcs[dcb->cmd];
if (!fn->cb)
return -EOPNOTSUPP;
+ if (fn->type == RTM_SETDCB && !netlink_capable(skb, CAP_NET_ADMIN))
+ return -EPERM;
if (!tb[DCB_ATTR_IFNAME])
return -EINVAL;
@@ -1762,7 +1952,7 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
reply_skb = dcbnl_newmsg(fn->type, dcb->cmd, portid, nlh->nlmsg_seq,
nlh->nlmsg_flags, &reply_nlh);
if (!reply_skb)
- return -ENOBUFS;
+ return -ENOMEM;
ret = fn->cb(netdev, nlh, nlh->nlmsg_seq, tb, reply_skb);
if (ret < 0) {
@@ -1777,6 +1967,22 @@ out:
return ret;
}
+static struct dcb_app_type *dcb_rewr_lookup(const struct dcb_app *app,
+ int ifindex, int proto)
+{
+ struct dcb_app_type *itr;
+
+ list_for_each_entry(itr, &dcb_rewr_list, list) {
+ if (itr->app.selector == app->selector &&
+ itr->app.priority == app->priority &&
+ itr->ifindex == ifindex &&
+ ((proto == -1) || itr->app.protocol == proto))
+ return itr;
+ }
+
+ return NULL;
+}
+
static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app,
int ifindex, int prio)
{
@@ -1793,7 +1999,8 @@ static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app,
return NULL;
}
-static int dcb_app_add(const struct dcb_app *app, int ifindex)
+static int dcb_app_add(struct list_head *list, const struct dcb_app *app,
+ int ifindex)
{
struct dcb_app_type *entry;
@@ -1803,13 +2010,15 @@ static int dcb_app_add(const struct dcb_app *app, int ifindex)
memcpy(&entry->app, app, sizeof(*app));
entry->ifindex = ifindex;
- list_add(&entry->list, &dcb_app_list);
+ list_add(&entry->list, list);
return 0;
}
/**
* dcb_getapp - retrieve the DCBX application user priority
+ * @dev: network interface
+ * @app: application to get user priority of
*
* On success returns a non-zero 802.1p user priority bitmap
* otherwise returns 0 as the invalid user priority bitmap to
@@ -1832,6 +2041,8 @@ EXPORT_SYMBOL(dcb_getapp);
/**
* dcb_setapp - add CEE dcb application data to app list
+ * @dev: network interface
+ * @new: application data to add
*
* Priority 0 is an invalid priority in CEE spec. This routine
* removes applications from the app list if the priority is
@@ -1862,7 +2073,7 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new)
}
/* App type does not exist add new application type */
if (new->priority)
- err = dcb_app_add(new, dev->ifindex);
+ err = dcb_app_add(&dcb_app_list, new, dev->ifindex);
out:
spin_unlock_bh(&dcb_lock);
if (!err)
@@ -1873,6 +2084,8 @@ EXPORT_SYMBOL(dcb_setapp);
/**
* dcb_ieee_getapp_mask - retrieve the IEEE DCB application priority
+ * @dev: network interface
+ * @app: where to store the retrieved application data
*
* Helper routine which on success returns a non-zero 802.1Qaz user
* priority bitmap otherwise returns 0 to indicate the dcb_app was
@@ -1893,8 +2106,67 @@ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app)
}
EXPORT_SYMBOL(dcb_ieee_getapp_mask);
+/* Get protocol value from rewrite entry. */
+u16 dcb_getrewr(struct net_device *dev, struct dcb_app *app)
+{
+ struct dcb_app_type *itr;
+ u16 proto = 0;
+
+ spin_lock_bh(&dcb_lock);
+ itr = dcb_rewr_lookup(app, dev->ifindex, -1);
+ if (itr)
+ proto = itr->app.protocol;
+ spin_unlock_bh(&dcb_lock);
+
+ return proto;
+}
+EXPORT_SYMBOL(dcb_getrewr);
+
+/* Add rewrite entry to the rewrite list. */
+int dcb_setrewr(struct net_device *dev, struct dcb_app *new)
+{
+ int err;
+
+ spin_lock_bh(&dcb_lock);
+ /* Search for existing match and abort if found. */
+ if (dcb_rewr_lookup(new, dev->ifindex, new->protocol)) {
+ err = -EEXIST;
+ goto out;
+ }
+
+ err = dcb_app_add(&dcb_rewr_list, new, dev->ifindex);
+out:
+ spin_unlock_bh(&dcb_lock);
+
+ return err;
+}
+EXPORT_SYMBOL(dcb_setrewr);
+
+/* Delete rewrite entry from the rewrite list. */
+int dcb_delrewr(struct net_device *dev, struct dcb_app *del)
+{
+ struct dcb_app_type *itr;
+ int err = -ENOENT;
+
+ spin_lock_bh(&dcb_lock);
+ /* Search for existing match and remove it. */
+ itr = dcb_rewr_lookup(del, dev->ifindex, del->protocol);
+ if (itr) {
+ list_del(&itr->list);
+ kfree(itr);
+ err = 0;
+ }
+
+ spin_unlock_bh(&dcb_lock);
+
+ return err;
+}
+EXPORT_SYMBOL(dcb_delrewr);
+
/**
* dcb_ieee_setapp - add IEEE dcb application data to app list
+ * @dev: network interface
+ * @new: application data to add
*
* This adds Application data to the list. Multiple application
* entries may exists for the same selector and protocol as long
@@ -1918,7 +2190,7 @@ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new)
goto out;
}
- err = dcb_app_add(new, dev->ifindex);
+ err = dcb_app_add(&dcb_app_list, new, dev->ifindex);
out:
spin_unlock_bh(&dcb_lock);
if (!err)
@@ -1929,6 +2201,8 @@ EXPORT_SYMBOL(dcb_ieee_setapp);
/**
* dcb_ieee_delapp - delete IEEE dcb application data from list
+ * @dev: network interface
+ * @del: application data to delete
*
* This removes a matching APP data from the APP list
*/
@@ -1958,7 +2232,59 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del)
}
EXPORT_SYMBOL(dcb_ieee_delapp);
-/**
+/* dcb_getrewr_prio_pcp_mask_map - For a given device, find mapping from
+ * priorities to the PCP and DEI values assigned to that priority.
+ */
+void dcb_getrewr_prio_pcp_mask_map(const struct net_device *dev,
+ struct dcb_rewr_prio_pcp_map *p_map)
+{
+ int ifindex = dev->ifindex;
+ struct dcb_app_type *itr;
+ u8 prio;
+
+ memset(p_map->map, 0, sizeof(p_map->map));
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_rewr_list, list) {
+ if (itr->ifindex == ifindex &&
+ itr->app.selector == DCB_APP_SEL_PCP &&
+ itr->app.protocol < 16 &&
+ itr->app.priority < IEEE_8021QAZ_MAX_TCS) {
+ prio = itr->app.priority;
+ p_map->map[prio] |= 1 << itr->app.protocol;
+ }
+ }
+ spin_unlock_bh(&dcb_lock);
+}
+EXPORT_SYMBOL(dcb_getrewr_prio_pcp_mask_map);
+
+/* dcb_getrewr_prio_dscp_mask_map - For a given device, find mapping from
+ * priorities to the DSCP values assigned to that priority.
+ */
+void dcb_getrewr_prio_dscp_mask_map(const struct net_device *dev,
+ struct dcb_ieee_app_prio_map *p_map)
+{
+ int ifindex = dev->ifindex;
+ struct dcb_app_type *itr;
+ u8 prio;
+
+ memset(p_map->map, 0, sizeof(p_map->map));
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_rewr_list, list) {
+ if (itr->ifindex == ifindex &&
+ itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP &&
+ itr->app.protocol < 64 &&
+ itr->app.priority < IEEE_8021QAZ_MAX_TCS) {
+ prio = itr->app.priority;
+ p_map->map[prio] |= 1ULL << itr->app.protocol;
+ }
+ }
+ spin_unlock_bh(&dcb_lock);
+}
+EXPORT_SYMBOL(dcb_getrewr_prio_dscp_mask_map);
+
+/*
* dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from
* priorities to the DSCP values assigned to that priority. Initialize p_map
* such that each map element holds a bit mask of DSCP values configured for
@@ -1987,7 +2313,7 @@ void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev,
}
EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map);
-/**
+/*
* dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from
* DSCP values to the priorities assigned to that DSCP value. Initialize p_map
* such that each map element holds a bit mask of priorities configured for a
@@ -2014,7 +2340,7 @@ dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev,
}
EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map);
-/**
+/*
* Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet
* type, with valid PID values >= 1536. A special meaning is then assigned to
* protocol value of 0: "default priority. For use when priority is not
@@ -2044,12 +2370,58 @@ u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev)
}
EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask);
+static void dcbnl_flush_dev(struct net_device *dev)
+{
+ struct dcb_app_type *itr, *tmp;
+
+ spin_lock_bh(&dcb_lock);
+
+ list_for_each_entry_safe(itr, tmp, &dcb_app_list, list) {
+ if (itr->ifindex == dev->ifindex) {
+ list_del(&itr->list);
+ kfree(itr);
+ }
+ }
+
+ spin_unlock_bh(&dcb_lock);
+}
+
+static int dcbnl_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ switch (event) {
+ case NETDEV_UNREGISTER:
+ if (!dev->dcbnl_ops)
+ return NOTIFY_DONE;
+
+ dcbnl_flush_dev(dev);
+
+ return NOTIFY_OK;
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static struct notifier_block dcbnl_nb __read_mostly = {
+ .notifier_call = dcbnl_netdevice_event,
+};
+
+static const struct rtnl_msg_handler dcbnl_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_GETDCB, .doit = dcb_doit},
+ {.msgtype = RTM_SETDCB, .doit = dcb_doit},
+};
+
static int __init dcbnl_init(void)
{
- INIT_LIST_HEAD(&dcb_app_list);
+ int err;
+
+ err = register_netdevice_notifier(&dcbnl_nb);
+ if (err)
+ return err;
- rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL, 0);
- rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL, 0);
+ rtnl_register_many(dcbnl_rtnl_msg_handlers);
return 0;
}
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig
deleted file mode 100644
index b270e84d9c13..000000000000
--- a/net/dccp/Kconfig
+++ /dev/null
@@ -1,45 +0,0 @@
-menuconfig IP_DCCP
- tristate "The DCCP Protocol"
- depends on INET
- ---help---
- Datagram Congestion Control Protocol (RFC 4340)
-
- From http://www.ietf.org/rfc/rfc4340.txt:
-
- The Datagram Congestion Control Protocol (DCCP) is a transport
- protocol that implements bidirectional, unicast connections of
- congestion-controlled, unreliable datagrams. It should be suitable
- for use by applications such as streaming media, Internet telephony,
- and on-line games.
-
- To compile this protocol support as a module, choose M here: the
- module will be called dccp.
-
- If in doubt, say N.
-
-if IP_DCCP
-
-config INET_DCCP_DIAG
- depends on INET_DIAG
- def_tristate y if (IP_DCCP = y && INET_DIAG = y)
- def_tristate m
-
-source "net/dccp/ccids/Kconfig"
-
-menu "DCCP Kernel Hacking"
- depends on DEBUG_KERNEL=y
-
-config IP_DCCP_DEBUG
- bool "DCCP debug messages"
- ---help---
- Only use this if you're hacking DCCP.
-
- When compiling DCCP as a module, this debugging output can be toggled
- by setting the parameter dccp_debug of the `dccp' module to 0 or 1.
-
- Just say N.
-
-
-endmenu
-
-endif # IP_DDCP
diff --git a/net/dccp/Makefile b/net/dccp/Makefile
deleted file mode 100644
index 5b4ff37bc806..000000000000
--- a/net/dccp/Makefile
+++ /dev/null
@@ -1,30 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o
-
-dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o \
- qpolicy.o
-#
-# CCID algorithms to be used by dccp.ko
-#
-# CCID-2 is default (RFC 4340, p. 77) and has Ack Vectors as dependency
-dccp-y += ccids/ccid2.o ackvec.o
-dccp-$(CONFIG_IP_DCCP_CCID3) += ccids/ccid3.o
-dccp-$(CONFIG_IP_DCCP_TFRC_LIB) += ccids/lib/tfrc.o \
- ccids/lib/tfrc_equation.o \
- ccids/lib/packet_history.o \
- ccids/lib/loss_interval.o
-
-dccp_ipv4-y := ipv4.o
-
-# build dccp_ipv6 as module whenever either IPv6 or DCCP is a module
-obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o
-dccp_ipv6-y := ipv6.o
-
-obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o
-
-dccp-$(CONFIG_SYSCTL) += sysctl.o
-
-dccp_diag-y := diag.o
-
-# build with local directory for trace.h
-CFLAGS_proto.o := -I$(src)
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
deleted file mode 100644
index 2a24f7d171a5..000000000000
--- a/net/dccp/ackvec.c
+++ /dev/null
@@ -1,405 +0,0 @@
-/*
- * net/dccp/ackvec.c
- *
- * An implementation of Ack Vectors for the DCCP protocol
- * Copyright (c) 2007 University of Aberdeen, Scotland, UK
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; version 2 of the License;
- */
-#include "dccp.h"
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-
-static struct kmem_cache *dccp_ackvec_slab;
-static struct kmem_cache *dccp_ackvec_record_slab;
-
-struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority)
-{
- struct dccp_ackvec *av = kmem_cache_zalloc(dccp_ackvec_slab, priority);
-
- if (av != NULL) {
- av->av_buf_head = av->av_buf_tail = DCCPAV_MAX_ACKVEC_LEN - 1;
- INIT_LIST_HEAD(&av->av_records);
- }
- return av;
-}
-
-static void dccp_ackvec_purge_records(struct dccp_ackvec *av)
-{
- struct dccp_ackvec_record *cur, *next;
-
- list_for_each_entry_safe(cur, next, &av->av_records, avr_node)
- kmem_cache_free(dccp_ackvec_record_slab, cur);
- INIT_LIST_HEAD(&av->av_records);
-}
-
-void dccp_ackvec_free(struct dccp_ackvec *av)
-{
- if (likely(av != NULL)) {
- dccp_ackvec_purge_records(av);
- kmem_cache_free(dccp_ackvec_slab, av);
- }
-}
-
-/**
- * dccp_ackvec_update_records - Record information about sent Ack Vectors
- * @av: Ack Vector records to update
- * @seqno: Sequence number of the packet carrying the Ack Vector just sent
- * @nonce_sum: The sum of all buffer nonces contained in the Ack Vector
- */
-int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seqno, u8 nonce_sum)
-{
- struct dccp_ackvec_record *avr;
-
- avr = kmem_cache_alloc(dccp_ackvec_record_slab, GFP_ATOMIC);
- if (avr == NULL)
- return -ENOBUFS;
-
- avr->avr_ack_seqno = seqno;
- avr->avr_ack_ptr = av->av_buf_head;
- avr->avr_ack_ackno = av->av_buf_ackno;
- avr->avr_ack_nonce = nonce_sum;
- avr->avr_ack_runlen = dccp_ackvec_runlen(av->av_buf + av->av_buf_head);
- /*
- * When the buffer overflows, we keep no more than one record. This is
- * the simplest way of disambiguating sender-Acks dating from before the
- * overflow from sender-Acks which refer to after the overflow; a simple
- * solution is preferable here since we are handling an exception.
- */
- if (av->av_overflow)
- dccp_ackvec_purge_records(av);
- /*
- * Since GSS is incremented for each packet, the list is automatically
- * arranged in descending order of @ack_seqno.
- */
- list_add(&avr->avr_node, &av->av_records);
-
- dccp_pr_debug("Added Vector, ack_seqno=%llu, ack_ackno=%llu (rl=%u)\n",
- (unsigned long long)avr->avr_ack_seqno,
- (unsigned long long)avr->avr_ack_ackno,
- avr->avr_ack_runlen);
- return 0;
-}
-
-static struct dccp_ackvec_record *dccp_ackvec_lookup(struct list_head *av_list,
- const u64 ackno)
-{
- struct dccp_ackvec_record *avr;
- /*
- * Exploit that records are inserted in descending order of sequence
- * number, start with the oldest record first. If @ackno is `before'
- * the earliest ack_ackno, the packet is too old to be considered.
- */
- list_for_each_entry_reverse(avr, av_list, avr_node) {
- if (avr->avr_ack_seqno == ackno)
- return avr;
- if (before48(ackno, avr->avr_ack_seqno))
- break;
- }
- return NULL;
-}
-
-/*
- * Buffer index and length computation using modulo-buffersize arithmetic.
- * Note that, as pointers move from right to left, head is `before' tail.
- */
-static inline u16 __ackvec_idx_add(const u16 a, const u16 b)
-{
- return (a + b) % DCCPAV_MAX_ACKVEC_LEN;
-}
-
-static inline u16 __ackvec_idx_sub(const u16 a, const u16 b)
-{
- return __ackvec_idx_add(a, DCCPAV_MAX_ACKVEC_LEN - b);
-}
-
-u16 dccp_ackvec_buflen(const struct dccp_ackvec *av)
-{
- if (unlikely(av->av_overflow))
- return DCCPAV_MAX_ACKVEC_LEN;
- return __ackvec_idx_sub(av->av_buf_tail, av->av_buf_head);
-}
-
-/**
- * dccp_ackvec_update_old - Update previous state as per RFC 4340, 11.4.1
- * @av: non-empty buffer to update
- * @distance: negative or zero distance of @seqno from buf_ackno downward
- * @seqno: the (old) sequence number whose record is to be updated
- * @state: state in which packet carrying @seqno was received
- */
-static void dccp_ackvec_update_old(struct dccp_ackvec *av, s64 distance,
- u64 seqno, enum dccp_ackvec_states state)
-{
- u16 ptr = av->av_buf_head;
-
- BUG_ON(distance > 0);
- if (unlikely(dccp_ackvec_is_empty(av)))
- return;
-
- do {
- u8 runlen = dccp_ackvec_runlen(av->av_buf + ptr);
-
- if (distance + runlen >= 0) {
- /*
- * Only update the state if packet has not been received
- * yet. This is OK as per the second table in RFC 4340,
- * 11.4.1; i.e. here we are using the following table:
- * RECEIVED
- * 0 1 3
- * S +---+---+---+
- * T 0 | 0 | 0 | 0 |
- * O +---+---+---+
- * R 1 | 1 | 1 | 1 |
- * E +---+---+---+
- * D 3 | 0 | 1 | 3 |
- * +---+---+---+
- * The "Not Received" state was set by reserve_seats().
- */
- if (av->av_buf[ptr] == DCCPAV_NOT_RECEIVED)
- av->av_buf[ptr] = state;
- else
- dccp_pr_debug("Not changing %llu state to %u\n",
- (unsigned long long)seqno, state);
- break;
- }
-
- distance += runlen + 1;
- ptr = __ackvec_idx_add(ptr, 1);
-
- } while (ptr != av->av_buf_tail);
-}
-
-/* Mark @num entries after buf_head as "Not yet received". */
-static void dccp_ackvec_reserve_seats(struct dccp_ackvec *av, u16 num)
-{
- u16 start = __ackvec_idx_add(av->av_buf_head, 1),
- len = DCCPAV_MAX_ACKVEC_LEN - start;
-
- /* check for buffer wrap-around */
- if (num > len) {
- memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, len);
- start = 0;
- num -= len;
- }
- if (num)
- memset(av->av_buf + start, DCCPAV_NOT_RECEIVED, num);
-}
-
-/**
- * dccp_ackvec_add_new - Record one or more new entries in Ack Vector buffer
- * @av: container of buffer to update (can be empty or non-empty)
- * @num_packets: number of packets to register (must be >= 1)
- * @seqno: sequence number of the first packet in @num_packets
- * @state: state in which packet carrying @seqno was received
- */
-static void dccp_ackvec_add_new(struct dccp_ackvec *av, u32 num_packets,
- u64 seqno, enum dccp_ackvec_states state)
-{
- u32 num_cells = num_packets;
-
- if (num_packets > DCCPAV_BURST_THRESH) {
- u32 lost_packets = num_packets - 1;
-
- DCCP_WARN("Warning: large burst loss (%u)\n", lost_packets);
- /*
- * We received 1 packet and have a loss of size "num_packets-1"
- * which we squeeze into num_cells-1 rather than reserving an
- * entire byte for each lost packet.
- * The reason is that the vector grows in O(burst_length); when
- * it grows too large there will no room left for the payload.
- * This is a trade-off: if a few packets out of the burst show
- * up later, their state will not be changed; it is simply too
- * costly to reshuffle/reallocate/copy the buffer each time.
- * Should such problems persist, we will need to switch to a
- * different underlying data structure.
- */
- for (num_packets = num_cells = 1; lost_packets; ++num_cells) {
- u8 len = min_t(u32, lost_packets, DCCPAV_MAX_RUNLEN);
-
- av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, 1);
- av->av_buf[av->av_buf_head] = DCCPAV_NOT_RECEIVED | len;
-
- lost_packets -= len;
- }
- }
-
- if (num_cells + dccp_ackvec_buflen(av) >= DCCPAV_MAX_ACKVEC_LEN) {
- DCCP_CRIT("Ack Vector buffer overflow: dropping old entries");
- av->av_overflow = true;
- }
-
- av->av_buf_head = __ackvec_idx_sub(av->av_buf_head, num_packets);
- if (av->av_overflow)
- av->av_buf_tail = av->av_buf_head;
-
- av->av_buf[av->av_buf_head] = state;
- av->av_buf_ackno = seqno;
-
- if (num_packets > 1)
- dccp_ackvec_reserve_seats(av, num_packets - 1);
-}
-
-/**
- * dccp_ackvec_input - Register incoming packet in the buffer
- */
-void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb)
-{
- u64 seqno = DCCP_SKB_CB(skb)->dccpd_seq;
- enum dccp_ackvec_states state = DCCPAV_RECEIVED;
-
- if (dccp_ackvec_is_empty(av)) {
- dccp_ackvec_add_new(av, 1, seqno, state);
- av->av_tail_ackno = seqno;
-
- } else {
- s64 num_packets = dccp_delta_seqno(av->av_buf_ackno, seqno);
- u8 *current_head = av->av_buf + av->av_buf_head;
-
- if (num_packets == 1 &&
- dccp_ackvec_state(current_head) == state &&
- dccp_ackvec_runlen(current_head) < DCCPAV_MAX_RUNLEN) {
-
- *current_head += 1;
- av->av_buf_ackno = seqno;
-
- } else if (num_packets > 0) {
- dccp_ackvec_add_new(av, num_packets, seqno, state);
- } else {
- dccp_ackvec_update_old(av, num_packets, seqno, state);
- }
- }
-}
-
-/**
- * dccp_ackvec_clear_state - Perform house-keeping / garbage-collection
- * This routine is called when the peer acknowledges the receipt of Ack Vectors
- * up to and including @ackno. While based on on section A.3 of RFC 4340, here
- * are additional precautions to prevent corrupted buffer state. In particular,
- * we use tail_ackno to identify outdated records; it always marks the earliest
- * packet of group (2) in 11.4.2.
- */
-void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno)
-{
- struct dccp_ackvec_record *avr, *next;
- u8 runlen_now, eff_runlen;
- s64 delta;
-
- avr = dccp_ackvec_lookup(&av->av_records, ackno);
- if (avr == NULL)
- return;
- /*
- * Deal with outdated acknowledgments: this arises when e.g. there are
- * several old records and the acks from the peer come in slowly. In
- * that case we may still have records that pre-date tail_ackno.
- */
- delta = dccp_delta_seqno(av->av_tail_ackno, avr->avr_ack_ackno);
- if (delta < 0)
- goto free_records;
- /*
- * Deal with overlapping Ack Vectors: don't subtract more than the
- * number of packets between tail_ackno and ack_ackno.
- */
- eff_runlen = delta < avr->avr_ack_runlen ? delta : avr->avr_ack_runlen;
-
- runlen_now = dccp_ackvec_runlen(av->av_buf + avr->avr_ack_ptr);
- /*
- * The run length of Ack Vector cells does not decrease over time. If
- * the run length is the same as at the time the Ack Vector was sent, we
- * free the ack_ptr cell. That cell can however not be freed if the run
- * length has increased: in this case we need to move the tail pointer
- * backwards (towards higher indices), to its next-oldest neighbour.
- */
- if (runlen_now > eff_runlen) {
-
- av->av_buf[avr->avr_ack_ptr] -= eff_runlen + 1;
- av->av_buf_tail = __ackvec_idx_add(avr->avr_ack_ptr, 1);
-
- /* This move may not have cleared the overflow flag. */
- if (av->av_overflow)
- av->av_overflow = (av->av_buf_head == av->av_buf_tail);
- } else {
- av->av_buf_tail = avr->avr_ack_ptr;
- /*
- * We have made sure that avr points to a valid cell within the
- * buffer. This cell is either older than head, or equals head
- * (empty buffer): in both cases we no longer have any overflow.
- */
- av->av_overflow = 0;
- }
-
- /*
- * The peer has acknowledged up to and including ack_ackno. Hence the
- * first packet in group (2) of 11.4.2 is the successor of ack_ackno.
- */
- av->av_tail_ackno = ADD48(avr->avr_ack_ackno, 1);
-
-free_records:
- list_for_each_entry_safe_from(avr, next, &av->av_records, avr_node) {
- list_del(&avr->avr_node);
- kmem_cache_free(dccp_ackvec_record_slab, avr);
- }
-}
-
-/*
- * Routines to keep track of Ack Vectors received in an skb
- */
-int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce)
-{
- struct dccp_ackvec_parsed *new = kmalloc(sizeof(*new), GFP_ATOMIC);
-
- if (new == NULL)
- return -ENOBUFS;
- new->vec = vec;
- new->len = len;
- new->nonce = nonce;
-
- list_add_tail(&new->node, head);
- return 0;
-}
-EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_add);
-
-void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks)
-{
- struct dccp_ackvec_parsed *cur, *next;
-
- list_for_each_entry_safe(cur, next, parsed_chunks, node)
- kfree(cur);
- INIT_LIST_HEAD(parsed_chunks);
-}
-EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup);
-
-int __init dccp_ackvec_init(void)
-{
- dccp_ackvec_slab = kmem_cache_create("dccp_ackvec",
- sizeof(struct dccp_ackvec), 0,
- SLAB_HWCACHE_ALIGN, NULL);
- if (dccp_ackvec_slab == NULL)
- goto out_err;
-
- dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record",
- sizeof(struct dccp_ackvec_record),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (dccp_ackvec_record_slab == NULL)
- goto out_destroy_slab;
-
- return 0;
-
-out_destroy_slab:
- kmem_cache_destroy(dccp_ackvec_slab);
- dccp_ackvec_slab = NULL;
-out_err:
- DCCP_CRIT("Unable to create Ack Vector slab cache");
- return -ENOBUFS;
-}
-
-void dccp_ackvec_exit(void)
-{
- kmem_cache_destroy(dccp_ackvec_slab);
- dccp_ackvec_slab = NULL;
- kmem_cache_destroy(dccp_ackvec_record_slab);
- dccp_ackvec_record_slab = NULL;
-}
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
deleted file mode 100644
index 3284bfa988c0..000000000000
--- a/net/dccp/ackvec.h
+++ /dev/null
@@ -1,138 +0,0 @@
-#ifndef _ACKVEC_H
-#define _ACKVEC_H
-/*
- * net/dccp/ackvec.h
- *
- * An implementation of Ack Vectors for the DCCP protocol
- * Copyright (c) 2007 University of Aberdeen, Scotland, UK
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@mandriva.com>
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/dccp.h>
-#include <linux/compiler.h>
-#include <linux/list.h>
-#include <linux/types.h>
-
-/*
- * Ack Vector buffer space is static, in multiples of %DCCP_SINGLE_OPT_MAXLEN,
- * the maximum size of a single Ack Vector. Setting %DCCPAV_NUM_ACKVECS to 1
- * will be sufficient for most cases of low Ack Ratios, using a value of 2 gives
- * more headroom if Ack Ratio is higher or when the sender acknowledges slowly.
- * The maximum value is bounded by the u16 types for indices and functions.
- */
-#define DCCPAV_NUM_ACKVECS 2
-#define DCCPAV_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * DCCPAV_NUM_ACKVECS)
-
-/* Estimated minimum average Ack Vector length - used for updating MPS */
-#define DCCPAV_MIN_OPTLEN 16
-
-/* Threshold for coping with large bursts of losses */
-#define DCCPAV_BURST_THRESH (DCCPAV_MAX_ACKVEC_LEN / 8)
-
-enum dccp_ackvec_states {
- DCCPAV_RECEIVED = 0x00,
- DCCPAV_ECN_MARKED = 0x40,
- DCCPAV_RESERVED = 0x80,
- DCCPAV_NOT_RECEIVED = 0xC0
-};
-#define DCCPAV_MAX_RUNLEN 0x3F
-
-static inline u8 dccp_ackvec_runlen(const u8 *cell)
-{
- return *cell & DCCPAV_MAX_RUNLEN;
-}
-
-static inline u8 dccp_ackvec_state(const u8 *cell)
-{
- return *cell & ~DCCPAV_MAX_RUNLEN;
-}
-
-/**
- * struct dccp_ackvec - Ack Vector main data structure
- *
- * This implements a fixed-size circular buffer within an array and is largely
- * based on Appendix A of RFC 4340.
- *
- * @av_buf: circular buffer storage area
- * @av_buf_head: head index; begin of live portion in @av_buf
- * @av_buf_tail: tail index; first index _after_ the live portion in @av_buf
- * @av_buf_ackno: highest seqno of acknowledgeable packet recorded in @av_buf
- * @av_tail_ackno: lowest seqno of acknowledgeable packet recorded in @av_buf
- * @av_buf_nonce: ECN nonce sums, each covering subsequent segments of up to
- * %DCCP_SINGLE_OPT_MAXLEN cells in the live portion of @av_buf
- * @av_overflow: if 1 then buf_head == buf_tail indicates buffer wraparound
- * @av_records: list of %dccp_ackvec_record (Ack Vectors sent previously)
- */
-struct dccp_ackvec {
- u8 av_buf[DCCPAV_MAX_ACKVEC_LEN];
- u16 av_buf_head;
- u16 av_buf_tail;
- u64 av_buf_ackno:48;
- u64 av_tail_ackno:48;
- bool av_buf_nonce[DCCPAV_NUM_ACKVECS];
- u8 av_overflow:1;
- struct list_head av_records;
-};
-
-/**
- * struct dccp_ackvec_record - Records information about sent Ack Vectors
- *
- * These list entries define the additional information which the HC-Receiver
- * keeps about recently-sent Ack Vectors; again refer to RFC 4340, Appendix A.
- *
- * @avr_node: the list node in @av_records
- * @avr_ack_seqno: sequence number of the packet the Ack Vector was sent on
- * @avr_ack_ackno: the Ack number that this record/Ack Vector refers to
- * @avr_ack_ptr: pointer into @av_buf where this record starts
- * @avr_ack_runlen: run length of @avr_ack_ptr at the time of sending
- * @avr_ack_nonce: the sum of @av_buf_nonce's at the time this record was sent
- *
- * The list as a whole is sorted in descending order by @avr_ack_seqno.
- */
-struct dccp_ackvec_record {
- struct list_head avr_node;
- u64 avr_ack_seqno:48;
- u64 avr_ack_ackno:48;
- u16 avr_ack_ptr;
- u8 avr_ack_runlen;
- u8 avr_ack_nonce:1;
-};
-
-int dccp_ackvec_init(void);
-void dccp_ackvec_exit(void);
-
-struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority);
-void dccp_ackvec_free(struct dccp_ackvec *av);
-
-void dccp_ackvec_input(struct dccp_ackvec *av, struct sk_buff *skb);
-int dccp_ackvec_update_records(struct dccp_ackvec *av, u64 seq, u8 sum);
-void dccp_ackvec_clear_state(struct dccp_ackvec *av, const u64 ackno);
-u16 dccp_ackvec_buflen(const struct dccp_ackvec *av);
-
-static inline bool dccp_ackvec_is_empty(const struct dccp_ackvec *av)
-{
- return av->av_overflow == 0 && av->av_buf_head == av->av_buf_tail;
-}
-
-/**
- * struct dccp_ackvec_parsed - Record offsets of Ack Vectors in skb
- * @vec: start of vector (offset into skb)
- * @len: length of @vec
- * @nonce: whether @vec had an ECN nonce of 0 or 1
- * @node: FIFO - arranged in descending order of ack_ackno
- *
- * This structure is used by CCIDs to access Ack Vectors in a received skb.
- */
-struct dccp_ackvec_parsed {
- u8 *vec,
- len,
- nonce:1;
- struct list_head node;
-};
-
-int dccp_ackvec_parsed_add(struct list_head *head, u8 *vec, u8 len, u8 nonce);
-void dccp_ackvec_parsed_cleanup(struct list_head *parsed_chunks);
-#endif /* _ACKVEC_H */
diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c
deleted file mode 100644
index 90f77d08cc37..000000000000
--- a/net/dccp/ccid.c
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * net/dccp/ccid.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * CCID infrastructure
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/slab.h>
-
-#include "ccid.h"
-#include "ccids/lib/tfrc.h"
-
-static struct ccid_operations *ccids[] = {
- &ccid2_ops,
-#ifdef CONFIG_IP_DCCP_CCID3
- &ccid3_ops,
-#endif
-};
-
-static struct ccid_operations *ccid_by_number(const u8 id)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(ccids); i++)
- if (ccids[i]->ccid_id == id)
- return ccids[i];
- return NULL;
-}
-
-/* check that up to @array_len members in @ccid_array are supported */
-bool ccid_support_check(u8 const *ccid_array, u8 array_len)
-{
- while (array_len > 0)
- if (ccid_by_number(ccid_array[--array_len]) == NULL)
- return false;
- return true;
-}
-
-/**
- * ccid_get_builtin_ccids - Populate a list of built-in CCIDs
- * @ccid_array: pointer to copy into
- * @array_len: value to return length into
- *
- * This function allocates memory - caller must see that it is freed after use.
- */
-int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len)
-{
- *ccid_array = kmalloc(ARRAY_SIZE(ccids), gfp_any());
- if (*ccid_array == NULL)
- return -ENOBUFS;
-
- for (*array_len = 0; *array_len < ARRAY_SIZE(ccids); *array_len += 1)
- (*ccid_array)[*array_len] = ccids[*array_len]->ccid_id;
- return 0;
-}
-
-int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
- char __user *optval, int __user *optlen)
-{
- u8 *ccid_array, array_len;
- int err = 0;
-
- if (ccid_get_builtin_ccids(&ccid_array, &array_len))
- return -ENOBUFS;
-
- if (put_user(array_len, optlen))
- err = -EFAULT;
- else if (len > 0 && copy_to_user(optval, ccid_array,
- len > array_len ? array_len : len))
- err = -EFAULT;
-
- kfree(ccid_array);
- return err;
-}
-
-static struct kmem_cache *ccid_kmem_cache_create(int obj_size, char *slab_name_fmt, const char *fmt,...)
-{
- struct kmem_cache *slab;
- va_list args;
-
- va_start(args, fmt);
- vsnprintf(slab_name_fmt, CCID_SLAB_NAME_LENGTH, fmt, args);
- va_end(args);
-
- slab = kmem_cache_create(slab_name_fmt, sizeof(struct ccid) + obj_size, 0,
- SLAB_HWCACHE_ALIGN, NULL);
- return slab;
-}
-
-static void ccid_kmem_cache_destroy(struct kmem_cache *slab)
-{
- kmem_cache_destroy(slab);
-}
-
-static int __init ccid_activate(struct ccid_operations *ccid_ops)
-{
- int err = -ENOBUFS;
-
- ccid_ops->ccid_hc_rx_slab =
- ccid_kmem_cache_create(ccid_ops->ccid_hc_rx_obj_size,
- ccid_ops->ccid_hc_rx_slab_name,
- "ccid%u_hc_rx_sock",
- ccid_ops->ccid_id);
- if (ccid_ops->ccid_hc_rx_slab == NULL)
- goto out;
-
- ccid_ops->ccid_hc_tx_slab =
- ccid_kmem_cache_create(ccid_ops->ccid_hc_tx_obj_size,
- ccid_ops->ccid_hc_tx_slab_name,
- "ccid%u_hc_tx_sock",
- ccid_ops->ccid_id);
- if (ccid_ops->ccid_hc_tx_slab == NULL)
- goto out_free_rx_slab;
-
- pr_info("DCCP: Activated CCID %d (%s)\n",
- ccid_ops->ccid_id, ccid_ops->ccid_name);
- err = 0;
-out:
- return err;
-out_free_rx_slab:
- ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
- ccid_ops->ccid_hc_rx_slab = NULL;
- goto out;
-}
-
-static void ccid_deactivate(struct ccid_operations *ccid_ops)
-{
- ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab);
- ccid_ops->ccid_hc_tx_slab = NULL;
- ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab);
- ccid_ops->ccid_hc_rx_slab = NULL;
-
- pr_info("DCCP: Deactivated CCID %d (%s)\n",
- ccid_ops->ccid_id, ccid_ops->ccid_name);
-}
-
-struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx)
-{
- struct ccid_operations *ccid_ops = ccid_by_number(id);
- struct ccid *ccid = NULL;
-
- if (ccid_ops == NULL)
- goto out;
-
- ccid = kmem_cache_alloc(rx ? ccid_ops->ccid_hc_rx_slab :
- ccid_ops->ccid_hc_tx_slab, gfp_any());
- if (ccid == NULL)
- goto out;
- ccid->ccid_ops = ccid_ops;
- if (rx) {
- memset(ccid + 1, 0, ccid_ops->ccid_hc_rx_obj_size);
- if (ccid->ccid_ops->ccid_hc_rx_init != NULL &&
- ccid->ccid_ops->ccid_hc_rx_init(ccid, sk) != 0)
- goto out_free_ccid;
- } else {
- memset(ccid + 1, 0, ccid_ops->ccid_hc_tx_obj_size);
- if (ccid->ccid_ops->ccid_hc_tx_init != NULL &&
- ccid->ccid_ops->ccid_hc_tx_init(ccid, sk) != 0)
- goto out_free_ccid;
- }
-out:
- return ccid;
-out_free_ccid:
- kmem_cache_free(rx ? ccid_ops->ccid_hc_rx_slab :
- ccid_ops->ccid_hc_tx_slab, ccid);
- ccid = NULL;
- goto out;
-}
-
-void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk)
-{
- if (ccid != NULL) {
- if (ccid->ccid_ops->ccid_hc_rx_exit != NULL)
- ccid->ccid_ops->ccid_hc_rx_exit(sk);
- kmem_cache_free(ccid->ccid_ops->ccid_hc_rx_slab, ccid);
- }
-}
-
-void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk)
-{
- if (ccid != NULL) {
- if (ccid->ccid_ops->ccid_hc_tx_exit != NULL)
- ccid->ccid_ops->ccid_hc_tx_exit(sk);
- kmem_cache_free(ccid->ccid_ops->ccid_hc_tx_slab, ccid);
- }
-}
-
-int __init ccid_initialize_builtins(void)
-{
- int i, err = tfrc_lib_init();
-
- if (err)
- return err;
-
- for (i = 0; i < ARRAY_SIZE(ccids); i++) {
- err = ccid_activate(ccids[i]);
- if (err)
- goto unwind_registrations;
- }
- return 0;
-
-unwind_registrations:
- while(--i >= 0)
- ccid_deactivate(ccids[i]);
- tfrc_lib_exit();
- return err;
-}
-
-void ccid_cleanup_builtins(void)
-{
- int i;
-
- for (i = 0; i < ARRAY_SIZE(ccids); i++)
- ccid_deactivate(ccids[i]);
- tfrc_lib_exit();
-}
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
deleted file mode 100644
index 6eb837a47b5c..000000000000
--- a/net/dccp/ccid.h
+++ /dev/null
@@ -1,265 +0,0 @@
-#ifndef _CCID_H
-#define _CCID_H
-/*
- * net/dccp/ccid.h
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * CCID infrastructure
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <net/sock.h>
-#include <linux/compiler.h>
-#include <linux/dccp.h>
-#include <linux/list.h>
-#include <linux/module.h>
-
-/* maximum value for a CCID (RFC 4340, 19.5) */
-#define CCID_MAX 255
-#define CCID_SLAB_NAME_LENGTH 32
-
-struct tcp_info;
-
-/**
- * struct ccid_operations - Interface to Congestion-Control Infrastructure
- *
- * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.)
- * @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled)
- * @ccid_name: alphabetical identifier string for @ccid_id
- * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection
- * @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket
- *
- * @ccid_hc_{r,t}x_init: CCID-specific initialisation routine (before startup)
- * @ccid_hc_{r,t}x_exit: CCID-specific cleanup routine (before destruction)
- * @ccid_hc_rx_packet_recv: implements the HC-receiver side
- * @ccid_hc_{r,t}x_parse_options: parsing routine for CCID/HC-specific options
- * @ccid_hc_{r,t}x_insert_options: insert routine for CCID/HC-specific options
- * @ccid_hc_tx_packet_recv: implements feedback processing for the HC-sender
- * @ccid_hc_tx_send_packet: implements the sending part of the HC-sender
- * @ccid_hc_tx_packet_sent: does accounting for packets in flight by HC-sender
- * @ccid_hc_{r,t}x_get_info: INET_DIAG information for HC-receiver/sender
- * @ccid_hc_{r,t}x_getsockopt: socket options specific to HC-receiver/sender
- */
-struct ccid_operations {
- unsigned char ccid_id;
- __u32 ccid_ccmps;
- const char *ccid_name;
- struct kmem_cache *ccid_hc_rx_slab,
- *ccid_hc_tx_slab;
- char ccid_hc_rx_slab_name[CCID_SLAB_NAME_LENGTH];
- char ccid_hc_tx_slab_name[CCID_SLAB_NAME_LENGTH];
- __u32 ccid_hc_rx_obj_size,
- ccid_hc_tx_obj_size;
- /* Interface Routines */
- int (*ccid_hc_rx_init)(struct ccid *ccid, struct sock *sk);
- int (*ccid_hc_tx_init)(struct ccid *ccid, struct sock *sk);
- void (*ccid_hc_rx_exit)(struct sock *sk);
- void (*ccid_hc_tx_exit)(struct sock *sk);
- void (*ccid_hc_rx_packet_recv)(struct sock *sk,
- struct sk_buff *skb);
- int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt,
- u8 opt, u8 *val, u8 len);
- int (*ccid_hc_rx_insert_options)(struct sock *sk,
- struct sk_buff *skb);
- void (*ccid_hc_tx_packet_recv)(struct sock *sk,
- struct sk_buff *skb);
- int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt,
- u8 opt, u8 *val, u8 len);
- int (*ccid_hc_tx_send_packet)(struct sock *sk,
- struct sk_buff *skb);
- void (*ccid_hc_tx_packet_sent)(struct sock *sk,
- unsigned int len);
- void (*ccid_hc_rx_get_info)(struct sock *sk,
- struct tcp_info *info);
- void (*ccid_hc_tx_get_info)(struct sock *sk,
- struct tcp_info *info);
- int (*ccid_hc_rx_getsockopt)(struct sock *sk,
- const int optname, int len,
- u32 __user *optval,
- int __user *optlen);
- int (*ccid_hc_tx_getsockopt)(struct sock *sk,
- const int optname, int len,
- u32 __user *optval,
- int __user *optlen);
-};
-
-extern struct ccid_operations ccid2_ops;
-#ifdef CONFIG_IP_DCCP_CCID3
-extern struct ccid_operations ccid3_ops;
-#endif
-
-int ccid_initialize_builtins(void);
-void ccid_cleanup_builtins(void);
-
-struct ccid {
- struct ccid_operations *ccid_ops;
- char ccid_priv[0];
-};
-
-static inline void *ccid_priv(const struct ccid *ccid)
-{
- return (void *)ccid->ccid_priv;
-}
-
-bool ccid_support_check(u8 const *ccid_array, u8 array_len);
-int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len);
-int ccid_getsockopt_builtin_ccids(struct sock *sk, int len,
- char __user *, int __user *);
-
-struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx);
-
-static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp)
-{
- struct ccid *ccid = dp->dccps_hc_rx_ccid;
-
- if (ccid == NULL || ccid->ccid_ops == NULL)
- return -1;
- return ccid->ccid_ops->ccid_id;
-}
-
-static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp)
-{
- struct ccid *ccid = dp->dccps_hc_tx_ccid;
-
- if (ccid == NULL || ccid->ccid_ops == NULL)
- return -1;
- return ccid->ccid_ops->ccid_id;
-}
-
-void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk);
-void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk);
-
-/*
- * Congestion control of queued data packets via CCID decision.
- *
- * The TX CCID performs its congestion-control by indicating whether and when a
- * queued packet may be sent, using the return code of ccid_hc_tx_send_packet().
- * The following modes are supported via the symbolic constants below:
- * - timer-based pacing (CCID returns a delay value in milliseconds);
- * - autonomous dequeueing (CCID internally schedules dccps_xmitlet).
- */
-
-enum ccid_dequeueing_decision {
- CCID_PACKET_SEND_AT_ONCE = 0x00000, /* "green light": no delay */
- CCID_PACKET_DELAY_MAX = 0x0FFFF, /* maximum delay in msecs */
- CCID_PACKET_DELAY = 0x10000, /* CCID msec-delay mode */
- CCID_PACKET_WILL_DEQUEUE_LATER = 0x20000, /* CCID autonomous mode */
- CCID_PACKET_ERR = 0xF0000, /* error condition */
-};
-
-static inline int ccid_packet_dequeue_eval(const int return_code)
-{
- if (return_code < 0)
- return CCID_PACKET_ERR;
- if (return_code == 0)
- return CCID_PACKET_SEND_AT_ONCE;
- if (return_code <= CCID_PACKET_DELAY_MAX)
- return CCID_PACKET_DELAY;
- return return_code;
-}
-
-static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk,
- struct sk_buff *skb)
-{
- if (ccid->ccid_ops->ccid_hc_tx_send_packet != NULL)
- return ccid->ccid_ops->ccid_hc_tx_send_packet(sk, skb);
- return CCID_PACKET_SEND_AT_ONCE;
-}
-
-static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk,
- unsigned int len)
-{
- if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL)
- ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len);
-}
-
-static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk,
- struct sk_buff *skb)
-{
- if (ccid->ccid_ops->ccid_hc_rx_packet_recv != NULL)
- ccid->ccid_ops->ccid_hc_rx_packet_recv(sk, skb);
-}
-
-static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk,
- struct sk_buff *skb)
-{
- if (ccid->ccid_ops->ccid_hc_tx_packet_recv != NULL)
- ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb);
-}
-
-/**
- * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver
- * @pkt: type of packet that @opt appears on (RFC 4340, 5.1)
- * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3)
- * @val: value of @opt
- * @len: length of @val in bytes
- */
-static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk,
- u8 pkt, u8 opt, u8 *val, u8 len)
-{
- if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL)
- return 0;
- return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len);
-}
-
-/**
- * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender
- * Arguments are analogous to ccid_hc_tx_parse_options()
- */
-static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk,
- u8 pkt, u8 opt, u8 *val, u8 len)
-{
- if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL)
- return 0;
- return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len);
-}
-
-static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk,
- struct sk_buff *skb)
-{
- if (ccid->ccid_ops->ccid_hc_rx_insert_options != NULL)
- return ccid->ccid_ops->ccid_hc_rx_insert_options(sk, skb);
- return 0;
-}
-
-static inline void ccid_hc_rx_get_info(struct ccid *ccid, struct sock *sk,
- struct tcp_info *info)
-{
- if (ccid->ccid_ops->ccid_hc_rx_get_info != NULL)
- ccid->ccid_ops->ccid_hc_rx_get_info(sk, info);
-}
-
-static inline void ccid_hc_tx_get_info(struct ccid *ccid, struct sock *sk,
- struct tcp_info *info)
-{
- if (ccid->ccid_ops->ccid_hc_tx_get_info != NULL)
- ccid->ccid_ops->ccid_hc_tx_get_info(sk, info);
-}
-
-static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk,
- const int optname, int len,
- u32 __user *optval, int __user *optlen)
-{
- int rc = -ENOPROTOOPT;
- if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL)
- rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len,
- optval, optlen);
- return rc;
-}
-
-static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk,
- const int optname, int len,
- u32 __user *optval, int __user *optlen)
-{
- int rc = -ENOPROTOOPT;
- if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL)
- rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len,
- optval, optlen);
- return rc;
-}
-#endif /* _CCID_H */
diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig
deleted file mode 100644
index 8ba3fc9d6d16..000000000000
--- a/net/dccp/ccids/Kconfig
+++ /dev/null
@@ -1,54 +0,0 @@
-menu "DCCP CCIDs Configuration"
-
-config IP_DCCP_CCID2_DEBUG
- bool "CCID-2 debugging messages"
- ---help---
- Enable CCID-2 specific debugging messages.
-
- The debugging output can additionally be toggled by setting the
- ccid2_debug parameter to 0 or 1.
-
- If in doubt, say N.
-
-config IP_DCCP_CCID3
- bool "CCID-3 (TCP-Friendly)"
- def_bool y if (IP_DCCP = y || IP_DCCP = m)
- ---help---
- CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based
- rate-controlled congestion control mechanism. TFRC is designed to
- be reasonably fair when competing for bandwidth with TCP-like flows,
- where a flow is "reasonably fair" if its sending rate is generally
- within a factor of two of the sending rate of a TCP flow under the
- same conditions. However, TFRC has a much lower variation of
- throughput over time compared with TCP, which makes CCID-3 more
- suitable than CCID-2 for applications such streaming media where a
- relatively smooth sending rate is of importance.
-
- CCID-3 is further described in RFC 4342,
- http://www.ietf.org/rfc/rfc4342.txt
-
- The TFRC congestion control algorithms were initially described in
- RFC 5348.
-
- This text was extracted from RFC 4340 (sec. 10.2),
- http://www.ietf.org/rfc/rfc4340.txt
-
- If in doubt, say N.
-
-config IP_DCCP_CCID3_DEBUG
- bool "CCID-3 debugging messages"
- depends on IP_DCCP_CCID3
- ---help---
- Enable CCID-3 specific debugging messages.
-
- The debugging output can additionally be toggled by setting the
- ccid3_debug parameter to 0 or 1.
-
- If in doubt, say N.
-
-config IP_DCCP_TFRC_LIB
- def_bool y if IP_DCCP_CCID3
-
-config IP_DCCP_TFRC_DEBUG
- def_bool y if IP_DCCP_CCID3_DEBUG
-endmenu
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
deleted file mode 100644
index 842a9c7c73a3..000000000000
--- a/net/dccp/ccids/ccid2.c
+++ /dev/null
@@ -1,801 +0,0 @@
-/*
- * Copyright (c) 2005, 2006 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
- *
- * Changes to meet Linux coding standards, and DCCP infrastructure fixes.
- *
- * Copyright (c) 2006 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/*
- * This implementation should follow RFC 4341
- */
-#include <linux/slab.h>
-#include "../feat.h"
-#include "ccid2.h"
-
-
-#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
-static bool ccid2_debug;
-#define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a)
-#else
-#define ccid2_pr_debug(format, a...)
-#endif
-
-static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc)
-{
- struct ccid2_seq *seqp;
- int i;
-
- /* check if we have space to preserve the pointer to the buffer */
- if (hc->tx_seqbufc >= (sizeof(hc->tx_seqbuf) /
- sizeof(struct ccid2_seq *)))
- return -ENOMEM;
-
- /* allocate buffer and initialize linked list */
- seqp = kmalloc_array(CCID2_SEQBUF_LEN, sizeof(struct ccid2_seq),
- gfp_any());
- if (seqp == NULL)
- return -ENOMEM;
-
- for (i = 0; i < (CCID2_SEQBUF_LEN - 1); i++) {
- seqp[i].ccid2s_next = &seqp[i + 1];
- seqp[i + 1].ccid2s_prev = &seqp[i];
- }
- seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = seqp;
- seqp->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
-
- /* This is the first allocation. Initiate the head and tail. */
- if (hc->tx_seqbufc == 0)
- hc->tx_seqh = hc->tx_seqt = seqp;
- else {
- /* link the existing list with the one we just created */
- hc->tx_seqh->ccid2s_next = seqp;
- seqp->ccid2s_prev = hc->tx_seqh;
-
- hc->tx_seqt->ccid2s_prev = &seqp[CCID2_SEQBUF_LEN - 1];
- seqp[CCID2_SEQBUF_LEN - 1].ccid2s_next = hc->tx_seqt;
- }
-
- /* store the original pointer to the buffer so we can free it */
- hc->tx_seqbuf[hc->tx_seqbufc] = seqp;
- hc->tx_seqbufc++;
-
- return 0;
-}
-
-static int ccid2_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
-{
- if (ccid2_cwnd_network_limited(ccid2_hc_tx_sk(sk)))
- return CCID_PACKET_WILL_DEQUEUE_LATER;
- return CCID_PACKET_SEND_AT_ONCE;
-}
-
-static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val)
-{
- u32 max_ratio = DIV_ROUND_UP(ccid2_hc_tx_sk(sk)->tx_cwnd, 2);
-
- /*
- * Ensure that Ack Ratio does not exceed ceil(cwnd/2), which is (2) from
- * RFC 4341, 6.1.2. We ignore the statement that Ack Ratio 2 is always
- * acceptable since this causes starvation/deadlock whenever cwnd < 2.
- * The same problem arises when Ack Ratio is 0 (ie. Ack Ratio disabled).
- */
- if (val == 0 || val > max_ratio) {
- DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio);
- val = max_ratio;
- }
- dccp_feat_signal_nn_change(sk, DCCPF_ACK_RATIO,
- min_t(u32, val, DCCPF_ACK_RATIO_MAX));
-}
-
-static void ccid2_check_l_ack_ratio(struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
- /*
- * After a loss, idle period, application limited period, or RTO we
- * need to check that the ack ratio is still less than the congestion
- * window. Otherwise, we will send an entire congestion window of
- * packets and got no response because we haven't sent ack ratio
- * packets yet.
- * If the ack ratio does need to be reduced, we reduce it to half of
- * the congestion window (or 1 if that's zero) instead of to the
- * congestion window. This prevents problems if one ack is lost.
- */
- if (dccp_feat_nn_get(sk, DCCPF_ACK_RATIO) > hc->tx_cwnd)
- ccid2_change_l_ack_ratio(sk, hc->tx_cwnd/2 ? : 1U);
-}
-
-static void ccid2_change_l_seq_window(struct sock *sk, u64 val)
-{
- dccp_feat_signal_nn_change(sk, DCCPF_SEQUENCE_WINDOW,
- clamp_val(val, DCCPF_SEQ_WMIN,
- DCCPF_SEQ_WMAX));
-}
-
-static void dccp_tasklet_schedule(struct sock *sk)
-{
- struct tasklet_struct *t = &dccp_sk(sk)->dccps_xmitlet;
-
- if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
- sock_hold(sk);
- __tasklet_schedule(t);
- }
-}
-
-static void ccid2_hc_tx_rto_expire(struct timer_list *t)
-{
- struct ccid2_hc_tx_sock *hc = from_timer(hc, t, tx_rtotimer);
- struct sock *sk = hc->sk;
- const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + HZ / 5);
- goto out;
- }
-
- ccid2_pr_debug("RTO_EXPIRE\n");
-
- if (sk->sk_state == DCCP_CLOSED)
- goto out;
-
- /* back-off timer */
- hc->tx_rto <<= 1;
- if (hc->tx_rto > DCCP_RTO_MAX)
- hc->tx_rto = DCCP_RTO_MAX;
-
- /* adjust pipe, cwnd etc */
- hc->tx_ssthresh = hc->tx_cwnd / 2;
- if (hc->tx_ssthresh < 2)
- hc->tx_ssthresh = 2;
- hc->tx_cwnd = 1;
- hc->tx_pipe = 0;
-
- /* clear state about stuff we sent */
- hc->tx_seqt = hc->tx_seqh;
- hc->tx_packets_acked = 0;
-
- /* clear ack ratio state. */
- hc->tx_rpseq = 0;
- hc->tx_rpdupack = -1;
- ccid2_change_l_ack_ratio(sk, 1);
-
- /* if we were blocked before, we may now send cwnd=1 packet */
- if (sender_was_blocked)
- dccp_tasklet_schedule(sk);
- /* restart backed-off timer */
- sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-/*
- * Congestion window validation (RFC 2861).
- */
-static bool ccid2_do_cwv = true;
-module_param(ccid2_do_cwv, bool, 0644);
-MODULE_PARM_DESC(ccid2_do_cwv, "Perform RFC2861 Congestion Window Validation");
-
-/**
- * ccid2_update_used_window - Track how much of cwnd is actually used
- * This is done in addition to CWV. The sender needs to have an idea of how many
- * packets may be in flight, to set the local Sequence Window value accordingly
- * (RFC 4340, 7.5.2). The CWV mechanism is exploited to keep track of the
- * maximum-used window. We use an EWMA low-pass filter to filter out noise.
- */
-static void ccid2_update_used_window(struct ccid2_hc_tx_sock *hc, u32 new_wnd)
-{
- hc->tx_expected_wnd = (3 * hc->tx_expected_wnd + new_wnd) / 4;
-}
-
-/* This borrows the code of tcp_cwnd_application_limited() */
-static void ccid2_cwnd_application_limited(struct sock *sk, const u32 now)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- /* don't reduce cwnd below the initial window (IW) */
- u32 init_win = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache),
- win_used = max(hc->tx_cwnd_used, init_win);
-
- if (win_used < hc->tx_cwnd) {
- hc->tx_ssthresh = max(hc->tx_ssthresh,
- (hc->tx_cwnd >> 1) + (hc->tx_cwnd >> 2));
- hc->tx_cwnd = (hc->tx_cwnd + win_used) >> 1;
- }
- hc->tx_cwnd_used = 0;
- hc->tx_cwnd_stamp = now;
-
- ccid2_check_l_ack_ratio(sk);
-}
-
-/* This borrows the code of tcp_cwnd_restart() */
-static void ccid2_cwnd_restart(struct sock *sk, const u32 now)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- u32 cwnd = hc->tx_cwnd, restart_cwnd,
- iwnd = rfc3390_bytes_to_packets(dccp_sk(sk)->dccps_mss_cache);
- s32 delta = now - hc->tx_lsndtime;
-
- hc->tx_ssthresh = max(hc->tx_ssthresh, (cwnd >> 1) + (cwnd >> 2));
-
- /* don't reduce cwnd below the initial window (IW) */
- restart_cwnd = min(cwnd, iwnd);
-
- while ((delta -= hc->tx_rto) >= 0 && cwnd > restart_cwnd)
- cwnd >>= 1;
- hc->tx_cwnd = max(cwnd, restart_cwnd);
- hc->tx_cwnd_stamp = now;
- hc->tx_cwnd_used = 0;
-
- ccid2_check_l_ack_ratio(sk);
-}
-
-static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- const u32 now = ccid2_jiffies32;
- struct ccid2_seq *next;
-
- /* slow-start after idle periods (RFC 2581, RFC 2861) */
- if (ccid2_do_cwv && !hc->tx_pipe &&
- (s32)(now - hc->tx_lsndtime) >= hc->tx_rto)
- ccid2_cwnd_restart(sk, now);
-
- hc->tx_lsndtime = now;
- hc->tx_pipe += 1;
-
- /* see whether cwnd was fully used (RFC 2861), update expected window */
- if (ccid2_cwnd_network_limited(hc)) {
- ccid2_update_used_window(hc, hc->tx_cwnd);
- hc->tx_cwnd_used = 0;
- hc->tx_cwnd_stamp = now;
- } else {
- if (hc->tx_pipe > hc->tx_cwnd_used)
- hc->tx_cwnd_used = hc->tx_pipe;
-
- ccid2_update_used_window(hc, hc->tx_cwnd_used);
-
- if (ccid2_do_cwv && (s32)(now - hc->tx_cwnd_stamp) >= hc->tx_rto)
- ccid2_cwnd_application_limited(sk, now);
- }
-
- hc->tx_seqh->ccid2s_seq = dp->dccps_gss;
- hc->tx_seqh->ccid2s_acked = 0;
- hc->tx_seqh->ccid2s_sent = now;
-
- next = hc->tx_seqh->ccid2s_next;
- /* check if we need to alloc more space */
- if (next == hc->tx_seqt) {
- if (ccid2_hc_tx_alloc_seq(hc)) {
- DCCP_CRIT("packet history - out of memory!");
- /* FIXME: find a more graceful way to bail out */
- return;
- }
- next = hc->tx_seqh->ccid2s_next;
- BUG_ON(next == hc->tx_seqt);
- }
- hc->tx_seqh = next;
-
- ccid2_pr_debug("cwnd=%d pipe=%d\n", hc->tx_cwnd, hc->tx_pipe);
-
- /*
- * FIXME: The code below is broken and the variables have been removed
- * from the socket struct. The `ackloss' variable was always set to 0,
- * and with arsent there are several problems:
- * (i) it doesn't just count the number of Acks, but all sent packets;
- * (ii) it is expressed in # of packets, not # of windows, so the
- * comparison below uses the wrong formula: Appendix A of RFC 4341
- * comes up with the number K = cwnd / (R^2 - R) of consecutive windows
- * of data with no lost or marked Ack packets. If arsent were the # of
- * consecutive Acks received without loss, then Ack Ratio needs to be
- * decreased by 1 when
- * arsent >= K * cwnd / R = cwnd^2 / (R^3 - R^2)
- * where cwnd / R is the number of Acks received per window of data
- * (cf. RFC 4341, App. A). The problems are that
- * - arsent counts other packets as well;
- * - the comparison uses a formula different from RFC 4341;
- * - computing a cubic/quadratic equation each time is too complicated.
- * Hence a different algorithm is needed.
- */
-#if 0
- /* Ack Ratio. Need to maintain a concept of how many windows we sent */
- hc->tx_arsent++;
- /* We had an ack loss in this window... */
- if (hc->tx_ackloss) {
- if (hc->tx_arsent >= hc->tx_cwnd) {
- hc->tx_arsent = 0;
- hc->tx_ackloss = 0;
- }
- } else {
- /* No acks lost up to now... */
- /* decrease ack ratio if enough packets were sent */
- if (dp->dccps_l_ack_ratio > 1) {
- /* XXX don't calculate denominator each time */
- int denom = dp->dccps_l_ack_ratio * dp->dccps_l_ack_ratio -
- dp->dccps_l_ack_ratio;
-
- denom = hc->tx_cwnd * hc->tx_cwnd / denom;
-
- if (hc->tx_arsent >= denom) {
- ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio - 1);
- hc->tx_arsent = 0;
- }
- } else {
- /* we can't increase ack ratio further [1] */
- hc->tx_arsent = 0; /* or maybe set it to cwnd*/
- }
- }
-#endif
-
- sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
-
-#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
- do {
- struct ccid2_seq *seqp = hc->tx_seqt;
-
- while (seqp != hc->tx_seqh) {
- ccid2_pr_debug("out seq=%llu acked=%d time=%u\n",
- (unsigned long long)seqp->ccid2s_seq,
- seqp->ccid2s_acked, seqp->ccid2s_sent);
- seqp = seqp->ccid2s_next;
- }
- } while (0);
- ccid2_pr_debug("=========\n");
-#endif
-}
-
-/**
- * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm
- * This code is almost identical with TCP's tcp_rtt_estimator(), since
- * - it has a higher sampling frequency (recommended by RFC 1323),
- * - the RTO does not collapse into RTT due to RTTVAR going towards zero,
- * - it is simple (cf. more complex proposals such as Eifel timer or research
- * which suggests that the gain should be set according to window size),
- * - in tests it was found to work well with CCID2 [gerrit].
- */
-static void ccid2_rtt_estimator(struct sock *sk, const long mrtt)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- long m = mrtt ? : 1;
-
- if (hc->tx_srtt == 0) {
- /* First measurement m */
- hc->tx_srtt = m << 3;
- hc->tx_mdev = m << 1;
-
- hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk));
- hc->tx_rttvar = hc->tx_mdev_max;
-
- hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
- } else {
- /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */
- m -= (hc->tx_srtt >> 3);
- hc->tx_srtt += m;
-
- /* Similarly, update scaled mdev with regard to |m| */
- if (m < 0) {
- m = -m;
- m -= (hc->tx_mdev >> 2);
- /*
- * This neutralises RTO increase when RTT < SRTT - mdev
- * (see P. Sarolahti, A. Kuznetsov,"Congestion Control
- * in Linux TCP", USENIX 2002, pp. 49-62).
- */
- if (m > 0)
- m >>= 3;
- } else {
- m -= (hc->tx_mdev >> 2);
- }
- hc->tx_mdev += m;
-
- if (hc->tx_mdev > hc->tx_mdev_max) {
- hc->tx_mdev_max = hc->tx_mdev;
- if (hc->tx_mdev_max > hc->tx_rttvar)
- hc->tx_rttvar = hc->tx_mdev_max;
- }
-
- /*
- * Decay RTTVAR at most once per flight, exploiting that
- * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2)
- * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1)
- * GAR is a useful bound for FlightSize = pipe.
- * AWL is probably too low here, as it over-estimates pipe.
- */
- if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) {
- if (hc->tx_mdev_max < hc->tx_rttvar)
- hc->tx_rttvar -= (hc->tx_rttvar -
- hc->tx_mdev_max) >> 2;
- hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss;
- hc->tx_mdev_max = tcp_rto_min(sk);
- }
- }
-
- /*
- * Set RTO from SRTT and RTTVAR
- * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms.
- * This agrees with RFC 4341, 5:
- * "Because DCCP does not retransmit data, DCCP does not require
- * TCP's recommended minimum timeout of one second".
- */
- hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar;
-
- if (hc->tx_rto > DCCP_RTO_MAX)
- hc->tx_rto = DCCP_RTO_MAX;
-}
-
-static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp,
- unsigned int *maxincr)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- int r_seq_used = hc->tx_cwnd / dp->dccps_l_ack_ratio;
-
- if (hc->tx_cwnd < dp->dccps_l_seq_win &&
- r_seq_used < dp->dccps_r_seq_win) {
- if (hc->tx_cwnd < hc->tx_ssthresh) {
- if (*maxincr > 0 && ++hc->tx_packets_acked >= 2) {
- hc->tx_cwnd += 1;
- *maxincr -= 1;
- hc->tx_packets_acked = 0;
- }
- } else if (++hc->tx_packets_acked >= hc->tx_cwnd) {
- hc->tx_cwnd += 1;
- hc->tx_packets_acked = 0;
- }
- }
-
- /*
- * Adjust the local sequence window and the ack ratio to allow about
- * 5 times the number of packets in the network (RFC 4340 7.5.2)
- */
- if (r_seq_used * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_r_seq_win)
- ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio * 2);
- else if (r_seq_used * CCID2_WIN_CHANGE_FACTOR < dp->dccps_r_seq_win/2)
- ccid2_change_l_ack_ratio(sk, dp->dccps_l_ack_ratio / 2 ? : 1U);
-
- if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR >= dp->dccps_l_seq_win)
- ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win * 2);
- else if (hc->tx_cwnd * CCID2_WIN_CHANGE_FACTOR < dp->dccps_l_seq_win/2)
- ccid2_change_l_seq_window(sk, dp->dccps_l_seq_win / 2);
-
- /*
- * FIXME: RTT is sampled several times per acknowledgment (for each
- * entry in the Ack Vector), instead of once per Ack (as in TCP SACK).
- * This causes the RTT to be over-estimated, since the older entries
- * in the Ack Vector have earlier sending times.
- * The cleanest solution is to not use the ccid2s_sent field at all
- * and instead use DCCP timestamps: requires changes in other places.
- */
- ccid2_rtt_estimator(sk, ccid2_jiffies32 - seqp->ccid2s_sent);
-}
-
-static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
- if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) {
- ccid2_pr_debug("Multiple losses in an RTT---treating as one\n");
- return;
- }
-
- hc->tx_last_cong = ccid2_jiffies32;
-
- hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U;
- hc->tx_ssthresh = max(hc->tx_cwnd, 2U);
-
- ccid2_check_l_ack_ratio(sk);
-}
-
-static int ccid2_hc_tx_parse_options(struct sock *sk, u8 packet_type,
- u8 option, u8 *optval, u8 optlen)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
-
- switch (option) {
- case DCCPO_ACK_VECTOR_0:
- case DCCPO_ACK_VECTOR_1:
- return dccp_ackvec_parsed_add(&hc->tx_av_chunks, optval, optlen,
- option - DCCPO_ACK_VECTOR_0);
- }
- return 0;
-}
-
-static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- const bool sender_was_blocked = ccid2_cwnd_network_limited(hc);
- struct dccp_ackvec_parsed *avp;
- u64 ackno, seqno;
- struct ccid2_seq *seqp;
- int done = 0;
- unsigned int maxincr = 0;
-
- /* check reverse path congestion */
- seqno = DCCP_SKB_CB(skb)->dccpd_seq;
-
- /* XXX this whole "algorithm" is broken. Need to fix it to keep track
- * of the seqnos of the dupacks so that rpseq and rpdupack are correct
- * -sorbo.
- */
- /* need to bootstrap */
- if (hc->tx_rpdupack == -1) {
- hc->tx_rpdupack = 0;
- hc->tx_rpseq = seqno;
- } else {
- /* check if packet is consecutive */
- if (dccp_delta_seqno(hc->tx_rpseq, seqno) == 1)
- hc->tx_rpseq = seqno;
- /* it's a later packet */
- else if (after48(seqno, hc->tx_rpseq)) {
- hc->tx_rpdupack++;
-
- /* check if we got enough dupacks */
- if (hc->tx_rpdupack >= NUMDUPACK) {
- hc->tx_rpdupack = -1; /* XXX lame */
- hc->tx_rpseq = 0;
-#ifdef __CCID2_COPES_GRACEFULLY_WITH_ACK_CONGESTION_CONTROL__
- /*
- * FIXME: Ack Congestion Control is broken; in
- * the current state instabilities occurred with
- * Ack Ratios greater than 1; causing hang-ups
- * and long RTO timeouts. This needs to be fixed
- * before opening up dynamic changes. -- gerrit
- */
- ccid2_change_l_ack_ratio(sk, 2 * dp->dccps_l_ack_ratio);
-#endif
- }
- }
- }
-
- /* check forward path congestion */
- if (dccp_packet_without_ack(skb))
- return;
-
- /* still didn't send out new data packets */
- if (hc->tx_seqh == hc->tx_seqt)
- goto done;
-
- ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
- if (after48(ackno, hc->tx_high_ack))
- hc->tx_high_ack = ackno;
-
- seqp = hc->tx_seqt;
- while (before48(seqp->ccid2s_seq, ackno)) {
- seqp = seqp->ccid2s_next;
- if (seqp == hc->tx_seqh) {
- seqp = hc->tx_seqh->ccid2s_prev;
- break;
- }
- }
-
- /*
- * In slow-start, cwnd can increase up to a maximum of Ack Ratio/2
- * packets per acknowledgement. Rounding up avoids that cwnd is not
- * advanced when Ack Ratio is 1 and gives a slight edge otherwise.
- */
- if (hc->tx_cwnd < hc->tx_ssthresh)
- maxincr = DIV_ROUND_UP(dp->dccps_l_ack_ratio, 2);
-
- /* go through all ack vectors */
- list_for_each_entry(avp, &hc->tx_av_chunks, node) {
- /* go through this ack vector */
- for (; avp->len--; avp->vec++) {
- u64 ackno_end_rl = SUB48(ackno,
- dccp_ackvec_runlen(avp->vec));
-
- ccid2_pr_debug("ackvec %llu |%u,%u|\n",
- (unsigned long long)ackno,
- dccp_ackvec_state(avp->vec) >> 6,
- dccp_ackvec_runlen(avp->vec));
- /* if the seqno we are analyzing is larger than the
- * current ackno, then move towards the tail of our
- * seqnos.
- */
- while (after48(seqp->ccid2s_seq, ackno)) {
- if (seqp == hc->tx_seqt) {
- done = 1;
- break;
- }
- seqp = seqp->ccid2s_prev;
- }
- if (done)
- break;
-
- /* check all seqnos in the range of the vector
- * run length
- */
- while (between48(seqp->ccid2s_seq,ackno_end_rl,ackno)) {
- const u8 state = dccp_ackvec_state(avp->vec);
-
- /* new packet received or marked */
- if (state != DCCPAV_NOT_RECEIVED &&
- !seqp->ccid2s_acked) {
- if (state == DCCPAV_ECN_MARKED)
- ccid2_congestion_event(sk,
- seqp);
- else
- ccid2_new_ack(sk, seqp,
- &maxincr);
-
- seqp->ccid2s_acked = 1;
- ccid2_pr_debug("Got ack for %llu\n",
- (unsigned long long)seqp->ccid2s_seq);
- hc->tx_pipe--;
- }
- if (seqp == hc->tx_seqt) {
- done = 1;
- break;
- }
- seqp = seqp->ccid2s_prev;
- }
- if (done)
- break;
-
- ackno = SUB48(ackno_end_rl, 1);
- }
- if (done)
- break;
- }
-
- /* The state about what is acked should be correct now
- * Check for NUMDUPACK
- */
- seqp = hc->tx_seqt;
- while (before48(seqp->ccid2s_seq, hc->tx_high_ack)) {
- seqp = seqp->ccid2s_next;
- if (seqp == hc->tx_seqh) {
- seqp = hc->tx_seqh->ccid2s_prev;
- break;
- }
- }
- done = 0;
- while (1) {
- if (seqp->ccid2s_acked) {
- done++;
- if (done == NUMDUPACK)
- break;
- }
- if (seqp == hc->tx_seqt)
- break;
- seqp = seqp->ccid2s_prev;
- }
-
- /* If there are at least 3 acknowledgements, anything unacknowledged
- * below the last sequence number is considered lost
- */
- if (done == NUMDUPACK) {
- struct ccid2_seq *last_acked = seqp;
-
- /* check for lost packets */
- while (1) {
- if (!seqp->ccid2s_acked) {
- ccid2_pr_debug("Packet lost: %llu\n",
- (unsigned long long)seqp->ccid2s_seq);
- /* XXX need to traverse from tail -> head in
- * order to detect multiple congestion events in
- * one ack vector.
- */
- ccid2_congestion_event(sk, seqp);
- hc->tx_pipe--;
- }
- if (seqp == hc->tx_seqt)
- break;
- seqp = seqp->ccid2s_prev;
- }
-
- hc->tx_seqt = last_acked;
- }
-
- /* trim acked packets in tail */
- while (hc->tx_seqt != hc->tx_seqh) {
- if (!hc->tx_seqt->ccid2s_acked)
- break;
-
- hc->tx_seqt = hc->tx_seqt->ccid2s_next;
- }
-
- /* restart RTO timer if not all outstanding data has been acked */
- if (hc->tx_pipe == 0)
- sk_stop_timer(sk, &hc->tx_rtotimer);
- else
- sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto);
-done:
- /* check if incoming Acks allow pending packets to be sent */
- if (sender_was_blocked && !ccid2_cwnd_network_limited(hc))
- dccp_tasklet_schedule(sk);
- dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
-}
-
-static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hc = ccid_priv(ccid);
- struct dccp_sock *dp = dccp_sk(sk);
- u32 max_ratio;
-
- /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
- hc->tx_ssthresh = ~0U;
-
- /* Use larger initial windows (RFC 4341, section 5). */
- hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
- hc->tx_expected_wnd = hc->tx_cwnd;
-
- /* Make sure that Ack Ratio is enabled and within bounds. */
- max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2);
- if (dp->dccps_l_ack_ratio == 0 || dp->dccps_l_ack_ratio > max_ratio)
- dp->dccps_l_ack_ratio = max_ratio;
-
- /* XXX init ~ to window size... */
- if (ccid2_hc_tx_alloc_seq(hc))
- return -ENOMEM;
-
- hc->tx_rto = DCCP_TIMEOUT_INIT;
- hc->tx_rpdupack = -1;
- hc->tx_last_cong = hc->tx_lsndtime = hc->tx_cwnd_stamp = ccid2_jiffies32;
- hc->tx_cwnd_used = 0;
- hc->sk = sk;
- timer_setup(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, 0);
- INIT_LIST_HEAD(&hc->tx_av_chunks);
- return 0;
-}
-
-static void ccid2_hc_tx_exit(struct sock *sk)
-{
- struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk);
- int i;
-
- sk_stop_timer(sk, &hc->tx_rtotimer);
-
- for (i = 0; i < hc->tx_seqbufc; i++)
- kfree(hc->tx_seqbuf[i]);
- hc->tx_seqbufc = 0;
- dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
-}
-
-static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
-{
- struct ccid2_hc_rx_sock *hc = ccid2_hc_rx_sk(sk);
-
- if (!dccp_data_packet(skb))
- return;
-
- if (++hc->rx_num_data_pkts >= dccp_sk(sk)->dccps_r_ack_ratio) {
- dccp_send_ack(sk);
- hc->rx_num_data_pkts = 0;
- }
-}
-
-struct ccid_operations ccid2_ops = {
- .ccid_id = DCCPC_CCID2,
- .ccid_name = "TCP-like",
- .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock),
- .ccid_hc_tx_init = ccid2_hc_tx_init,
- .ccid_hc_tx_exit = ccid2_hc_tx_exit,
- .ccid_hc_tx_send_packet = ccid2_hc_tx_send_packet,
- .ccid_hc_tx_packet_sent = ccid2_hc_tx_packet_sent,
- .ccid_hc_tx_parse_options = ccid2_hc_tx_parse_options,
- .ccid_hc_tx_packet_recv = ccid2_hc_tx_packet_recv,
- .ccid_hc_rx_obj_size = sizeof(struct ccid2_hc_rx_sock),
- .ccid_hc_rx_packet_recv = ccid2_hc_rx_packet_recv,
-};
-
-#ifdef CONFIG_IP_DCCP_CCID2_DEBUG
-module_param(ccid2_debug, bool, 0644);
-MODULE_PARM_DESC(ccid2_debug, "Enable CCID-2 debug messages");
-#endif
diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h
deleted file mode 100644
index 1af0116dc6ce..000000000000
--- a/net/dccp/ccids/ccid2.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#ifndef _DCCP_CCID2_H_
-#define _DCCP_CCID2_H_
-
-#include <linux/timer.h>
-#include <linux/types.h>
-#include "../ccid.h"
-#include "../dccp.h"
-
-/*
- * CCID-2 timestamping faces the same issues as TCP timestamping.
- * Hence we reuse/share as much of the code as possible.
- */
-#define ccid2_jiffies32 ((u32)jiffies)
-
-/* NUMDUPACK parameter from RFC 4341, p. 6 */
-#define NUMDUPACK 3
-
-struct ccid2_seq {
- u64 ccid2s_seq;
- u32 ccid2s_sent;
- int ccid2s_acked;
- struct ccid2_seq *ccid2s_prev;
- struct ccid2_seq *ccid2s_next;
-};
-
-#define CCID2_SEQBUF_LEN 1024
-#define CCID2_SEQBUF_MAX 128
-
-/*
- * Multiple of congestion window to keep the sequence window at
- * (RFC 4340 7.5.2)
- */
-#define CCID2_WIN_CHANGE_FACTOR 5
-
-/**
- * struct ccid2_hc_tx_sock - CCID2 TX half connection
- * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5
- * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465)
- * @tx_srtt: smoothed RTT estimate, scaled by 2^3
- * @tx_mdev: smoothed RTT variation, scaled by 2^2
- * @tx_mdev_max: maximum of @mdev during one flight
- * @tx_rttvar: moving average/maximum of @mdev_max
- * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988)
- * @tx_rtt_seq: to decay RTTVAR at most once per flight
- * @tx_cwnd_used: actually used cwnd, W_used of RFC 2861
- * @tx_expected_wnd: moving average of @tx_cwnd_used
- * @tx_cwnd_stamp: to track idle periods in CWV
- * @tx_lsndtime: last time (in jiffies) a data packet was sent
- * @tx_rpseq: last consecutive seqno
- * @tx_rpdupack: dupacks since rpseq
- * @tx_av_chunks: list of Ack Vectors received on current skb
- */
-struct ccid2_hc_tx_sock {
- u32 tx_cwnd;
- u32 tx_ssthresh;
- u32 tx_pipe;
- u32 tx_packets_acked;
- struct ccid2_seq *tx_seqbuf[CCID2_SEQBUF_MAX];
- int tx_seqbufc;
- struct ccid2_seq *tx_seqh;
- struct ccid2_seq *tx_seqt;
-
- /* RTT measurement: variables/principles are the same as in TCP */
- u32 tx_srtt,
- tx_mdev,
- tx_mdev_max,
- tx_rttvar,
- tx_rto;
- u64 tx_rtt_seq:48;
- struct timer_list tx_rtotimer;
- struct sock *sk;
-
- /* Congestion Window validation (optional, RFC 2861) */
- u32 tx_cwnd_used,
- tx_expected_wnd,
- tx_cwnd_stamp,
- tx_lsndtime;
-
- u64 tx_rpseq;
- int tx_rpdupack;
- u32 tx_last_cong;
- u64 tx_high_ack;
- struct list_head tx_av_chunks;
-};
-
-static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc)
-{
- return hc->tx_pipe >= hc->tx_cwnd;
-}
-
-/*
- * Convert RFC 3390 larger initial window into an equivalent number of packets.
- * This is based on the numbers specified in RFC 5681, 3.1.
- */
-static inline u32 rfc3390_bytes_to_packets(const u32 smss)
-{
- return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3);
-}
-
-/**
- * struct ccid2_hc_rx_sock - Receiving end of CCID-2 half-connection
- * @rx_num_data_pkts: number of data packets received since last feedback
- */
-struct ccid2_hc_rx_sock {
- u32 rx_num_data_pkts;
-};
-
-static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk)
-{
- return ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);
-}
-
-static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk)
-{
- return ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);
-}
-#endif /* _DCCP_CCID2_H_ */
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
deleted file mode 100644
index 12877a1514e7..000000000000
--- a/net/dccp/ccids/ccid3.c
+++ /dev/null
@@ -1,873 +0,0 @@
-/*
- * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
- * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
- *
- * An implementation of the DCCP protocol
- *
- * This code has been developed by the University of Waikato WAND
- * research group. For further information please see http://www.wand.net.nz/
- *
- * This code also uses code from Lulea University, rereleased as GPL by its
- * authors:
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- *
- * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
- * and to make it work as a loadable module in the DCCP stack written by
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
- *
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#include "../dccp.h"
-#include "ccid3.h"
-
-#include <asm/unaligned.h>
-
-#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-static bool ccid3_debug;
-#define ccid3_pr_debug(format, a...) DCCP_PR_DEBUG(ccid3_debug, format, ##a)
-#else
-#define ccid3_pr_debug(format, a...)
-#endif
-
-/*
- * Transmitter Half-Connection Routines
- */
-#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state)
-{
- static const char *const ccid3_state_names[] = {
- [TFRC_SSTATE_NO_SENT] = "NO_SENT",
- [TFRC_SSTATE_NO_FBACK] = "NO_FBACK",
- [TFRC_SSTATE_FBACK] = "FBACK",
- };
-
- return ccid3_state_names[state];
-}
-#endif
-
-static void ccid3_hc_tx_set_state(struct sock *sk,
- enum ccid3_hc_tx_states state)
-{
- struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- enum ccid3_hc_tx_states oldstate = hc->tx_state;
-
- ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
- dccp_role(sk), sk, ccid3_tx_state_name(oldstate),
- ccid3_tx_state_name(state));
- WARN_ON(state == oldstate);
- hc->tx_state = state;
-}
-
-/*
- * Compute the initial sending rate X_init in the manner of RFC 3390:
- *
- * X_init = min(4 * s, max(2 * s, 4380 bytes)) / RTT
- *
- * Note that RFC 3390 uses MSS, RFC 4342 refers to RFC 3390, and rfc3448bis
- * (rev-02) clarifies the use of RFC 3390 with regard to the above formula.
- * For consistency with other parts of the code, X_init is scaled by 2^6.
- */
-static inline u64 rfc3390_initial_rate(struct sock *sk)
-{
- const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- const __u32 w_init = clamp_t(__u32, 4380U, 2 * hc->tx_s, 4 * hc->tx_s);
-
- return scaled_div(w_init << 6, hc->tx_rtt);
-}
-
-/**
- * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst
- * This respects the granularity of X_inst (64 * bytes/second).
- */
-static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc)
-{
- hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x);
-
- DCCP_BUG_ON(hc->tx_t_ipi == 0);
- ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi,
- hc->tx_s, (unsigned int)(hc->tx_x >> 6));
-}
-
-static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now)
-{
- u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count);
-
- return delta / hc->tx_rtt;
-}
-
-/**
- * ccid3_hc_tx_update_x - Update allowed sending rate X
- * @stamp: most recent time if available - can be left NULL.
- *
- * This function tracks draft rfc3448bis, check there for latest details.
- *
- * Note: X and X_recv are both stored in units of 64 * bytes/second, to support
- * fine-grained resolution of sending rates. This requires scaling by 2^6
- * throughout the code. Only X_calc is unscaled (in bytes/second).
- *
- */
-static void ccid3_hc_tx_update_x(struct sock *sk, ktime_t *stamp)
-{
- struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- __u64 min_rate = 2 * hc->tx_x_recv;
- const __u64 old_x = hc->tx_x;
- ktime_t now = stamp ? *stamp : ktime_get_real();
-
- /*
- * Handle IDLE periods: do not reduce below RFC3390 initial sending rate
- * when idling [RFC 4342, 5.1]. Definition of idling is from rfc3448bis:
- * a sender is idle if it has not sent anything over a 2-RTT-period.
- * For consistency with X and X_recv, min_rate is also scaled by 2^6.
- */
- if (ccid3_hc_tx_idle_rtt(hc, now) >= 2) {
- min_rate = rfc3390_initial_rate(sk);
- min_rate = max(min_rate, 2 * hc->tx_x_recv);
- }
-
- if (hc->tx_p > 0) {
-
- hc->tx_x = min(((__u64)hc->tx_x_calc) << 6, min_rate);
- hc->tx_x = max(hc->tx_x, (((__u64)hc->tx_s) << 6) / TFRC_T_MBI);
-
- } else if (ktime_us_delta(now, hc->tx_t_ld) - (s64)hc->tx_rtt >= 0) {
-
- hc->tx_x = min(2 * hc->tx_x, min_rate);
- hc->tx_x = max(hc->tx_x,
- scaled_div(((__u64)hc->tx_s) << 6, hc->tx_rtt));
- hc->tx_t_ld = now;
- }
-
- if (hc->tx_x != old_x) {
- ccid3_pr_debug("X_prev=%u, X_now=%u, X_calc=%u, "
- "X_recv=%u\n", (unsigned int)(old_x >> 6),
- (unsigned int)(hc->tx_x >> 6), hc->tx_x_calc,
- (unsigned int)(hc->tx_x_recv >> 6));
-
- ccid3_update_send_interval(hc);
- }
-}
-
-/**
- * ccid3_hc_tx_update_s - Track the mean packet size `s'
- * @len: DCCP packet payload size in bytes
- *
- * cf. RFC 4342, 5.3 and RFC 3448, 4.1
- */
-static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hc, int len)
-{
- const u16 old_s = hc->tx_s;
-
- hc->tx_s = tfrc_ewma(hc->tx_s, len, 9);
-
- if (hc->tx_s != old_s)
- ccid3_update_send_interval(hc);
-}
-
-/*
- * Update Window Counter using the algorithm from [RFC 4342, 8.1].
- * As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt().
- */
-static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hc,
- ktime_t now)
-{
- u32 delta = ktime_us_delta(now, hc->tx_t_last_win_count),
- quarter_rtts = (4 * delta) / hc->tx_rtt;
-
- if (quarter_rtts > 0) {
- hc->tx_t_last_win_count = now;
- hc->tx_last_win_count += min(quarter_rtts, 5U);
- hc->tx_last_win_count &= 0xF; /* mod 16 */
- }
-}
-
-static void ccid3_hc_tx_no_feedback_timer(struct timer_list *t)
-{
- struct ccid3_hc_tx_sock *hc = from_timer(hc, t, tx_no_feedback_timer);
- struct sock *sk = hc->sk;
- unsigned long t_nfb = USEC_PER_SEC / 5;
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later. */
- /* XXX: set some sensible MIB */
- goto restart_timer;
- }
-
- ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk,
- ccid3_tx_state_name(hc->tx_state));
-
- /* Ignore and do not restart after leaving the established state */
- if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
- goto out;
-
- /* Reset feedback state to "no feedback received" */
- if (hc->tx_state == TFRC_SSTATE_FBACK)
- ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
-
- /*
- * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4
- * RTO is 0 if and only if no feedback has been received yet.
- */
- if (hc->tx_t_rto == 0 || hc->tx_p == 0) {
-
- /* halve send rate directly */
- hc->tx_x = max(hc->tx_x / 2,
- (((__u64)hc->tx_s) << 6) / TFRC_T_MBI);
- ccid3_update_send_interval(hc);
- } else {
- /*
- * Modify the cached value of X_recv
- *
- * If (X_calc > 2 * X_recv)
- * X_recv = max(X_recv / 2, s / (2 * t_mbi));
- * Else
- * X_recv = X_calc / 4;
- *
- * Note that X_recv is scaled by 2^6 while X_calc is not
- */
- if (hc->tx_x_calc > (hc->tx_x_recv >> 5))
- hc->tx_x_recv =
- max(hc->tx_x_recv / 2,
- (((__u64)hc->tx_s) << 6) / (2*TFRC_T_MBI));
- else {
- hc->tx_x_recv = hc->tx_x_calc;
- hc->tx_x_recv <<= 4;
- }
- ccid3_hc_tx_update_x(sk, NULL);
- }
- ccid3_pr_debug("Reduced X to %llu/64 bytes/sec\n",
- (unsigned long long)hc->tx_x);
-
- /*
- * Set new timeout for the nofeedback timer.
- * See comments in packet_recv() regarding the value of t_RTO.
- */
- if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */
- t_nfb = TFRC_INITIAL_TIMEOUT;
- else
- t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
-
-restart_timer:
- sk_reset_timer(sk, &hc->tx_no_feedback_timer,
- jiffies + usecs_to_jiffies(t_nfb));
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-/**
- * ccid3_hc_tx_send_packet - Delay-based dequeueing of TX packets
- * @skb: next packet candidate to send on @sk
- *
- * This function uses the convention of ccid_packet_dequeue_eval() and
- * returns a millisecond-delay value between 0 and t_mbi = 64000 msec.
- */
-static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- ktime_t now = ktime_get_real();
- s64 delay;
-
- /*
- * This function is called only for Data and DataAck packets. Sending
- * zero-sized Data(Ack)s is theoretically possible, but for congestion
- * control this case is pathological - ignore it.
- */
- if (unlikely(skb->len == 0))
- return -EBADMSG;
-
- if (hc->tx_state == TFRC_SSTATE_NO_SENT) {
- sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies +
- usecs_to_jiffies(TFRC_INITIAL_TIMEOUT)));
- hc->tx_last_win_count = 0;
- hc->tx_t_last_win_count = now;
-
- /* Set t_0 for initial packet */
- hc->tx_t_nom = now;
-
- hc->tx_s = skb->len;
-
- /*
- * Use initial RTT sample when available: recommended by erratum
- * to RFC 4342. This implements the initialisation procedure of
- * draft rfc3448bis, section 4.2. Remember, X is scaled by 2^6.
- */
- if (dp->dccps_syn_rtt) {
- ccid3_pr_debug("SYN RTT = %uus\n", dp->dccps_syn_rtt);
- hc->tx_rtt = dp->dccps_syn_rtt;
- hc->tx_x = rfc3390_initial_rate(sk);
- hc->tx_t_ld = now;
- } else {
- /*
- * Sender does not have RTT sample:
- * - set fallback RTT (RFC 4340, 3.4) since a RTT value
- * is needed in several parts (e.g. window counter);
- * - set sending rate X_pps = 1pps as per RFC 3448, 4.2.
- */
- hc->tx_rtt = DCCP_FALLBACK_RTT;
- hc->tx_x = hc->tx_s;
- hc->tx_x <<= 6;
- }
- ccid3_update_send_interval(hc);
-
- ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK);
-
- } else {
- delay = ktime_us_delta(hc->tx_t_nom, now);
- ccid3_pr_debug("delay=%ld\n", (long)delay);
- /*
- * Scheduling of packet transmissions (RFC 5348, 8.3)
- *
- * if (t_now > t_nom - delta)
- * // send the packet now
- * else
- * // send the packet in (t_nom - t_now) milliseconds.
- */
- if (delay >= TFRC_T_DELTA)
- return (u32)delay / USEC_PER_MSEC;
-
- ccid3_hc_tx_update_win_count(hc, now);
- }
-
- /* prepare to send now (add options etc.) */
- dp->dccps_hc_tx_insert_options = 1;
- DCCP_SKB_CB(skb)->dccpd_ccval = hc->tx_last_win_count;
-
- /* set the nominal send time for the next following packet */
- hc->tx_t_nom = ktime_add_us(hc->tx_t_nom, hc->tx_t_ipi);
- return CCID_PACKET_SEND_AT_ONCE;
-}
-
-static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len)
-{
- struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
-
- ccid3_hc_tx_update_s(hc, len);
-
- if (tfrc_tx_hist_add(&hc->tx_hist, dccp_sk(sk)->dccps_gss))
- DCCP_CRIT("packet history - out of memory!");
-}
-
-static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb)
-{
- struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- struct tfrc_tx_hist_entry *acked;
- ktime_t now;
- unsigned long t_nfb;
- u32 r_sample;
-
- /* we are only interested in ACKs */
- if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK ||
- DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK))
- return;
- /*
- * Locate the acknowledged packet in the TX history.
- *
- * Returning "entry not found" here can for instance happen when
- * - the host has not sent out anything (e.g. a passive server),
- * - the Ack is outdated (packet with higher Ack number was received),
- * - it is a bogus Ack (for a packet not sent on this connection).
- */
- acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb));
- if (acked == NULL)
- return;
- /* For the sake of RTT sampling, ignore/remove all older entries */
- tfrc_tx_hist_purge(&acked->next);
-
- /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */
- now = ktime_get_real();
- r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp));
- hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9);
-
- /*
- * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3
- */
- if (hc->tx_state == TFRC_SSTATE_NO_FBACK) {
- ccid3_hc_tx_set_state(sk, TFRC_SSTATE_FBACK);
-
- if (hc->tx_t_rto == 0) {
- /*
- * Initial feedback packet: Larger Initial Windows (4.2)
- */
- hc->tx_x = rfc3390_initial_rate(sk);
- hc->tx_t_ld = now;
-
- ccid3_update_send_interval(hc);
-
- goto done_computing_x;
- } else if (hc->tx_p == 0) {
- /*
- * First feedback after nofeedback timer expiry (4.3)
- */
- goto done_computing_x;
- }
- }
-
- /* Update sending rate (step 4 of [RFC 3448, 4.3]) */
- if (hc->tx_p > 0)
- hc->tx_x_calc = tfrc_calc_x(hc->tx_s, hc->tx_rtt, hc->tx_p);
- ccid3_hc_tx_update_x(sk, &now);
-
-done_computing_x:
- ccid3_pr_debug("%s(%p), RTT=%uus (sample=%uus), s=%u, "
- "p=%u, X_calc=%u, X_recv=%u, X=%u\n",
- dccp_role(sk), sk, hc->tx_rtt, r_sample,
- hc->tx_s, hc->tx_p, hc->tx_x_calc,
- (unsigned int)(hc->tx_x_recv >> 6),
- (unsigned int)(hc->tx_x >> 6));
-
- /* unschedule no feedback timer */
- sk_stop_timer(sk, &hc->tx_no_feedback_timer);
-
- /*
- * As we have calculated new ipi, delta, t_nom it is possible
- * that we now can send a packet, so wake up dccp_wait_for_ccid
- */
- sk->sk_write_space(sk);
-
- /*
- * Update timeout interval for the nofeedback timer. In order to control
- * rate halving on networks with very low RTTs (<= 1 ms), use per-route
- * tunable RTAX_RTO_MIN value as the lower bound.
- */
- hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt,
- USEC_PER_SEC/HZ * tcp_rto_min(sk));
- /*
- * Schedule no feedback timer to expire in
- * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi)
- */
- t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi);
-
- ccid3_pr_debug("%s(%p), Scheduled no feedback timer to "
- "expire in %lu jiffies (%luus)\n",
- dccp_role(sk), sk, usecs_to_jiffies(t_nfb), t_nfb);
-
- sk_reset_timer(sk, &hc->tx_no_feedback_timer,
- jiffies + usecs_to_jiffies(t_nfb));
-}
-
-static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type,
- u8 option, u8 *optval, u8 optlen)
-{
- struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- __be32 opt_val;
-
- switch (option) {
- case TFRC_OPT_RECEIVE_RATE:
- case TFRC_OPT_LOSS_EVENT_RATE:
- /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */
- if (packet_type == DCCP_PKT_DATA)
- break;
- if (unlikely(optlen != 4)) {
- DCCP_WARN("%s(%p), invalid len %d for %u\n",
- dccp_role(sk), sk, optlen, option);
- return -EINVAL;
- }
- opt_val = ntohl(get_unaligned((__be32 *)optval));
-
- if (option == TFRC_OPT_RECEIVE_RATE) {
- /* Receive Rate is kept in units of 64 bytes/second */
- hc->tx_x_recv = opt_val;
- hc->tx_x_recv <<= 6;
-
- ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n",
- dccp_role(sk), sk, opt_val);
- } else {
- /* Update the fixpoint Loss Event Rate fraction */
- hc->tx_p = tfrc_invert_loss_event_rate(opt_val);
-
- ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n",
- dccp_role(sk), sk, opt_val);
- }
- }
- return 0;
-}
-
-static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk)
-{
- struct ccid3_hc_tx_sock *hc = ccid_priv(ccid);
-
- hc->tx_state = TFRC_SSTATE_NO_SENT;
- hc->tx_hist = NULL;
- hc->sk = sk;
- timer_setup(&hc->tx_no_feedback_timer,
- ccid3_hc_tx_no_feedback_timer, 0);
- return 0;
-}
-
-static void ccid3_hc_tx_exit(struct sock *sk)
-{
- struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
-
- sk_stop_timer(sk, &hc->tx_no_feedback_timer);
- tfrc_tx_hist_purge(&hc->tx_hist);
-}
-
-static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info)
-{
- info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto;
- info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt;
-}
-
-static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
- u32 __user *optval, int __user *optlen)
-{
- const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk);
- struct tfrc_tx_info tfrc;
- const void *val;
-
- switch (optname) {
- case DCCP_SOCKOPT_CCID_TX_INFO:
- if (len < sizeof(tfrc))
- return -EINVAL;
- memset(&tfrc, 0, sizeof(tfrc));
- tfrc.tfrctx_x = hc->tx_x;
- tfrc.tfrctx_x_recv = hc->tx_x_recv;
- tfrc.tfrctx_x_calc = hc->tx_x_calc;
- tfrc.tfrctx_rtt = hc->tx_rtt;
- tfrc.tfrctx_p = hc->tx_p;
- tfrc.tfrctx_rto = hc->tx_t_rto;
- tfrc.tfrctx_ipi = hc->tx_t_ipi;
- len = sizeof(tfrc);
- val = &tfrc;
- break;
- default:
- return -ENOPROTOOPT;
- }
-
- if (put_user(len, optlen) || copy_to_user(optval, val, len))
- return -EFAULT;
-
- return 0;
-}
-
-/*
- * Receiver Half-Connection Routines
- */
-
-/* CCID3 feedback types */
-enum ccid3_fback_type {
- CCID3_FBACK_NONE = 0,
- CCID3_FBACK_INITIAL,
- CCID3_FBACK_PERIODIC,
- CCID3_FBACK_PARAM_CHANGE
-};
-
-#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state)
-{
- static const char *const ccid3_rx_state_names[] = {
- [TFRC_RSTATE_NO_DATA] = "NO_DATA",
- [TFRC_RSTATE_DATA] = "DATA",
- };
-
- return ccid3_rx_state_names[state];
-}
-#endif
-
-static void ccid3_hc_rx_set_state(struct sock *sk,
- enum ccid3_hc_rx_states state)
-{
- struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
- enum ccid3_hc_rx_states oldstate = hc->rx_state;
-
- ccid3_pr_debug("%s(%p) %-8.8s -> %s\n",
- dccp_role(sk), sk, ccid3_rx_state_name(oldstate),
- ccid3_rx_state_name(state));
- WARN_ON(state == oldstate);
- hc->rx_state = state;
-}
-
-static void ccid3_hc_rx_send_feedback(struct sock *sk,
- const struct sk_buff *skb,
- enum ccid3_fback_type fbtype)
-{
- struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- ktime_t now = ktime_get();
- s64 delta = 0;
-
- switch (fbtype) {
- case CCID3_FBACK_INITIAL:
- hc->rx_x_recv = 0;
- hc->rx_pinv = ~0U; /* see RFC 4342, 8.5 */
- break;
- case CCID3_FBACK_PARAM_CHANGE:
- /*
- * When parameters change (new loss or p > p_prev), we do not
- * have a reliable estimate for R_m of [RFC 3448, 6.2] and so
- * need to reuse the previous value of X_recv. However, when
- * X_recv was 0 (due to early loss), this would kill X down to
- * s/t_mbi (i.e. one packet in 64 seconds).
- * To avoid such drastic reduction, we approximate X_recv as
- * the number of bytes since last feedback.
- * This is a safe fallback, since X is bounded above by X_calc.
- */
- if (hc->rx_x_recv > 0)
- break;
- /* fall through */
- case CCID3_FBACK_PERIODIC:
- delta = ktime_us_delta(now, hc->rx_tstamp_last_feedback);
- if (delta <= 0)
- delta = 1;
- hc->rx_x_recv = scaled_div32(hc->rx_bytes_recv, delta);
- break;
- default:
- return;
- }
-
- ccid3_pr_debug("Interval %lldusec, X_recv=%u, 1/p=%u\n", delta,
- hc->rx_x_recv, hc->rx_pinv);
-
- hc->rx_tstamp_last_feedback = now;
- hc->rx_last_counter = dccp_hdr(skb)->dccph_ccval;
- hc->rx_bytes_recv = 0;
-
- dp->dccps_hc_rx_insert_options = 1;
- dccp_send_ack(sk);
-}
-
-static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb)
-{
- const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
- __be32 x_recv, pinv;
-
- if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN))
- return 0;
-
- if (dccp_packet_without_ack(skb))
- return 0;
-
- x_recv = htonl(hc->rx_x_recv);
- pinv = htonl(hc->rx_pinv);
-
- if (dccp_insert_option(skb, TFRC_OPT_LOSS_EVENT_RATE,
- &pinv, sizeof(pinv)) ||
- dccp_insert_option(skb, TFRC_OPT_RECEIVE_RATE,
- &x_recv, sizeof(x_recv)))
- return -1;
-
- return 0;
-}
-
-/**
- * ccid3_first_li - Implements [RFC 5348, 6.3.1]
- *
- * Determine the length of the first loss interval via inverse lookup.
- * Assume that X_recv can be computed by the throughput equation
- * s
- * X_recv = --------
- * R * fval
- * Find some p such that f(p) = fval; return 1/p (scaled).
- */
-static u32 ccid3_first_li(struct sock *sk)
-{
- struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
- u32 x_recv, p;
- s64 delta;
- u64 fval;
-
- if (hc->rx_rtt == 0) {
- DCCP_WARN("No RTT estimate available, using fallback RTT\n");
- hc->rx_rtt = DCCP_FALLBACK_RTT;
- }
-
- delta = ktime_us_delta(ktime_get(), hc->rx_tstamp_last_feedback);
- if (delta <= 0)
- delta = 1;
- x_recv = scaled_div32(hc->rx_bytes_recv, delta);
- if (x_recv == 0) { /* would also trigger divide-by-zero */
- DCCP_WARN("X_recv==0\n");
- if (hc->rx_x_recv == 0) {
- DCCP_BUG("stored value of X_recv is zero");
- return ~0U;
- }
- x_recv = hc->rx_x_recv;
- }
-
- fval = scaled_div(hc->rx_s, hc->rx_rtt);
- fval = scaled_div32(fval, x_recv);
- p = tfrc_calc_x_reverse_lookup(fval);
-
- ccid3_pr_debug("%s(%p), receive rate=%u bytes/s, implied "
- "loss rate=%u\n", dccp_role(sk), sk, x_recv, p);
-
- return p == 0 ? ~0U : scaled_div(1, p);
-}
-
-static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
-{
- struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
- enum ccid3_fback_type do_feedback = CCID3_FBACK_NONE;
- const u64 ndp = dccp_sk(sk)->dccps_options_received.dccpor_ndp;
- const bool is_data_packet = dccp_data_packet(skb);
-
- if (unlikely(hc->rx_state == TFRC_RSTATE_NO_DATA)) {
- if (is_data_packet) {
- const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
- do_feedback = CCID3_FBACK_INITIAL;
- ccid3_hc_rx_set_state(sk, TFRC_RSTATE_DATA);
- hc->rx_s = payload;
- /*
- * Not necessary to update rx_bytes_recv here,
- * since X_recv = 0 for the first feedback packet (cf.
- * RFC 3448, 6.3) -- gerrit
- */
- }
- goto update_records;
- }
-
- if (tfrc_rx_hist_duplicate(&hc->rx_hist, skb))
- return; /* done receiving */
-
- if (is_data_packet) {
- const u32 payload = skb->len - dccp_hdr(skb)->dccph_doff * 4;
- /*
- * Update moving-average of s and the sum of received payload bytes
- */
- hc->rx_s = tfrc_ewma(hc->rx_s, payload, 9);
- hc->rx_bytes_recv += payload;
- }
-
- /*
- * Perform loss detection and handle pending losses
- */
- if (tfrc_rx_handle_loss(&hc->rx_hist, &hc->rx_li_hist,
- skb, ndp, ccid3_first_li, sk)) {
- do_feedback = CCID3_FBACK_PARAM_CHANGE;
- goto done_receiving;
- }
-
- if (tfrc_rx_hist_loss_pending(&hc->rx_hist))
- return; /* done receiving */
-
- /*
- * Handle data packets: RTT sampling and monitoring p
- */
- if (unlikely(!is_data_packet))
- goto update_records;
-
- if (!tfrc_lh_is_initialised(&hc->rx_li_hist)) {
- const u32 sample = tfrc_rx_hist_sample_rtt(&hc->rx_hist, skb);
- /*
- * Empty loss history: no loss so far, hence p stays 0.
- * Sample RTT values, since an RTT estimate is required for the
- * computation of p when the first loss occurs; RFC 3448, 6.3.1.
- */
- if (sample != 0)
- hc->rx_rtt = tfrc_ewma(hc->rx_rtt, sample, 9);
-
- } else if (tfrc_lh_update_i_mean(&hc->rx_li_hist, skb)) {
- /*
- * Step (3) of [RFC 3448, 6.1]: Recompute I_mean and, if I_mean
- * has decreased (resp. p has increased), send feedback now.
- */
- do_feedback = CCID3_FBACK_PARAM_CHANGE;
- }
-
- /*
- * Check if the periodic once-per-RTT feedback is due; RFC 4342, 10.3
- */
- if (SUB16(dccp_hdr(skb)->dccph_ccval, hc->rx_last_counter) > 3)
- do_feedback = CCID3_FBACK_PERIODIC;
-
-update_records:
- tfrc_rx_hist_add_packet(&hc->rx_hist, skb, ndp);
-
-done_receiving:
- if (do_feedback)
- ccid3_hc_rx_send_feedback(sk, skb, do_feedback);
-}
-
-static int ccid3_hc_rx_init(struct ccid *ccid, struct sock *sk)
-{
- struct ccid3_hc_rx_sock *hc = ccid_priv(ccid);
-
- hc->rx_state = TFRC_RSTATE_NO_DATA;
- tfrc_lh_init(&hc->rx_li_hist);
- return tfrc_rx_hist_alloc(&hc->rx_hist);
-}
-
-static void ccid3_hc_rx_exit(struct sock *sk)
-{
- struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
-
- tfrc_rx_hist_purge(&hc->rx_hist);
- tfrc_lh_cleanup(&hc->rx_li_hist);
-}
-
-static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info)
-{
- info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state;
- info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
- info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt;
-}
-
-static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len,
- u32 __user *optval, int __user *optlen)
-{
- const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk);
- struct tfrc_rx_info rx_info;
- const void *val;
-
- switch (optname) {
- case DCCP_SOCKOPT_CCID_RX_INFO:
- if (len < sizeof(rx_info))
- return -EINVAL;
- rx_info.tfrcrx_x_recv = hc->rx_x_recv;
- rx_info.tfrcrx_rtt = hc->rx_rtt;
- rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv);
- len = sizeof(rx_info);
- val = &rx_info;
- break;
- default:
- return -ENOPROTOOPT;
- }
-
- if (put_user(len, optlen) || copy_to_user(optval, val, len))
- return -EFAULT;
-
- return 0;
-}
-
-struct ccid_operations ccid3_ops = {
- .ccid_id = DCCPC_CCID3,
- .ccid_name = "TCP-Friendly Rate Control",
- .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock),
- .ccid_hc_tx_init = ccid3_hc_tx_init,
- .ccid_hc_tx_exit = ccid3_hc_tx_exit,
- .ccid_hc_tx_send_packet = ccid3_hc_tx_send_packet,
- .ccid_hc_tx_packet_sent = ccid3_hc_tx_packet_sent,
- .ccid_hc_tx_packet_recv = ccid3_hc_tx_packet_recv,
- .ccid_hc_tx_parse_options = ccid3_hc_tx_parse_options,
- .ccid_hc_rx_obj_size = sizeof(struct ccid3_hc_rx_sock),
- .ccid_hc_rx_init = ccid3_hc_rx_init,
- .ccid_hc_rx_exit = ccid3_hc_rx_exit,
- .ccid_hc_rx_insert_options = ccid3_hc_rx_insert_options,
- .ccid_hc_rx_packet_recv = ccid3_hc_rx_packet_recv,
- .ccid_hc_rx_get_info = ccid3_hc_rx_get_info,
- .ccid_hc_tx_get_info = ccid3_hc_tx_get_info,
- .ccid_hc_rx_getsockopt = ccid3_hc_rx_getsockopt,
- .ccid_hc_tx_getsockopt = ccid3_hc_tx_getsockopt,
-};
-
-#ifdef CONFIG_IP_DCCP_CCID3_DEBUG
-module_param(ccid3_debug, bool, 0644);
-MODULE_PARM_DESC(ccid3_debug, "Enable CCID-3 debug messages");
-#endif
diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h
deleted file mode 100644
index 813d91c6e1e2..000000000000
--- a/net/dccp/ccids/ccid3.h
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
- *
- * An implementation of the DCCP protocol
- *
- * This code has been developed by the University of Waikato WAND
- * research group. For further information please see http://www.wand.net.nz/
- * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
- *
- * This code also uses code from Lulea University, rereleased as GPL by its
- * authors:
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- *
- * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
- * and to make it work as a loadable module in the DCCP stack written by
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
- *
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-#ifndef _DCCP_CCID3_H_
-#define _DCCP_CCID3_H_
-
-#include <linux/ktime.h>
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/tfrc.h>
-#include "lib/tfrc.h"
-#include "../ccid.h"
-
-/* Two seconds as per RFC 5348, 4.2 */
-#define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC)
-
-/* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */
-#define TFRC_T_MBI 64
-
-/*
- * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are
- * rounded down to 0, since sk_reset_timer() here uses millisecond granularity.
- * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse
- * resolution of HZ < 500 means that the error is below one timer tick (t_gran)
- * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ).
- */
-#if (HZ >= 500)
-# define TFRC_T_DELTA USEC_PER_MSEC
-#else
-# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ))
-#endif
-
-enum ccid3_options {
- TFRC_OPT_LOSS_EVENT_RATE = 192,
- TFRC_OPT_LOSS_INTERVALS = 193,
- TFRC_OPT_RECEIVE_RATE = 194,
-};
-
-/* TFRC sender states */
-enum ccid3_hc_tx_states {
- TFRC_SSTATE_NO_SENT = 1,
- TFRC_SSTATE_NO_FBACK,
- TFRC_SSTATE_FBACK,
-};
-
-/**
- * struct ccid3_hc_tx_sock - CCID3 sender half-connection socket
- * @tx_x: Current sending rate in 64 * bytes per second
- * @tx_x_recv: Receive rate in 64 * bytes per second
- * @tx_x_calc: Calculated rate in bytes per second
- * @tx_rtt: Estimate of current round trip time in usecs
- * @tx_p: Current loss event rate (0-1) scaled by 1000000
- * @tx_s: Packet size in bytes
- * @tx_t_rto: Nofeedback Timer setting in usecs
- * @tx_t_ipi: Interpacket (send) interval (RFC 3448, 4.6) in usecs
- * @tx_state: Sender state, one of %ccid3_hc_tx_states
- * @tx_last_win_count: Last window counter sent
- * @tx_t_last_win_count: Timestamp of earliest packet
- * with last_win_count value sent
- * @tx_no_feedback_timer: Handle to no feedback timer
- * @tx_t_ld: Time last doubled during slow start
- * @tx_t_nom: Nominal send time of next packet
- * @tx_hist: Packet history
- */
-struct ccid3_hc_tx_sock {
- u64 tx_x;
- u64 tx_x_recv;
- u32 tx_x_calc;
- u32 tx_rtt;
- u32 tx_p;
- u32 tx_t_rto;
- u32 tx_t_ipi;
- u16 tx_s;
- enum ccid3_hc_tx_states tx_state:8;
- u8 tx_last_win_count;
- ktime_t tx_t_last_win_count;
- struct timer_list tx_no_feedback_timer;
- struct sock *sk;
- ktime_t tx_t_ld;
- ktime_t tx_t_nom;
- struct tfrc_tx_hist_entry *tx_hist;
-};
-
-static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk)
-{
- struct ccid3_hc_tx_sock *hctx = ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid);
- BUG_ON(hctx == NULL);
- return hctx;
-}
-
-/* TFRC receiver states */
-enum ccid3_hc_rx_states {
- TFRC_RSTATE_NO_DATA = 1,
- TFRC_RSTATE_DATA,
-};
-
-/**
- * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket
- * @rx_last_counter: Tracks window counter (RFC 4342, 8.1)
- * @rx_state: Receiver state, one of %ccid3_hc_rx_states
- * @rx_bytes_recv: Total sum of DCCP payload bytes
- * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3)
- * @rx_rtt: Receiver estimate of RTT
- * @rx_tstamp_last_feedback: Time at which last feedback was sent
- * @rx_hist: Packet history (loss detection + RTT sampling)
- * @rx_li_hist: Loss Interval database
- * @rx_s: Received packet size in bytes
- * @rx_pinv: Inverse of Loss Event Rate (RFC 4342, sec. 8.5)
- */
-struct ccid3_hc_rx_sock {
- u8 rx_last_counter:4;
- enum ccid3_hc_rx_states rx_state:8;
- u32 rx_bytes_recv;
- u32 rx_x_recv;
- u32 rx_rtt;
- ktime_t rx_tstamp_last_feedback;
- struct tfrc_rx_hist rx_hist;
- struct tfrc_loss_hist rx_li_hist;
- u16 rx_s;
-#define rx_pinv rx_li_hist.i_mean
-};
-
-static inline struct ccid3_hc_rx_sock *ccid3_hc_rx_sk(const struct sock *sk)
-{
- struct ccid3_hc_rx_sock *hcrx = ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid);
- BUG_ON(hcrx == NULL);
- return hcrx;
-}
-
-#endif /* _DCCP_CCID3_H_ */
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
deleted file mode 100644
index 57f9fd78c4df..000000000000
--- a/net/dccp/ccids/lib/loss_interval.c
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
- * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-#include <net/sock.h>
-#include "tfrc.h"
-
-static struct kmem_cache *tfrc_lh_slab __read_mostly;
-/* Loss Interval weights from [RFC 3448, 5.4], scaled by 10 */
-static const int tfrc_lh_weights[NINTERVAL] = { 10, 10, 10, 10, 8, 6, 4, 2 };
-
-/* implements LIFO semantics on the array */
-static inline u8 LIH_INDEX(const u8 ctr)
-{
- return LIH_SIZE - 1 - (ctr % LIH_SIZE);
-}
-
-/* the `counter' index always points at the next entry to be populated */
-static inline struct tfrc_loss_interval *tfrc_lh_peek(struct tfrc_loss_hist *lh)
-{
- return lh->counter ? lh->ring[LIH_INDEX(lh->counter - 1)] : NULL;
-}
-
-/* given i with 0 <= i <= k, return I_i as per the rfc3448bis notation */
-static inline u32 tfrc_lh_get_interval(struct tfrc_loss_hist *lh, const u8 i)
-{
- BUG_ON(i >= lh->counter);
- return lh->ring[LIH_INDEX(lh->counter - i - 1)]->li_length;
-}
-
-/*
- * On-demand allocation and de-allocation of entries
- */
-static struct tfrc_loss_interval *tfrc_lh_demand_next(struct tfrc_loss_hist *lh)
-{
- if (lh->ring[LIH_INDEX(lh->counter)] == NULL)
- lh->ring[LIH_INDEX(lh->counter)] = kmem_cache_alloc(tfrc_lh_slab,
- GFP_ATOMIC);
- return lh->ring[LIH_INDEX(lh->counter)];
-}
-
-void tfrc_lh_cleanup(struct tfrc_loss_hist *lh)
-{
- if (!tfrc_lh_is_initialised(lh))
- return;
-
- for (lh->counter = 0; lh->counter < LIH_SIZE; lh->counter++)
- if (lh->ring[LIH_INDEX(lh->counter)] != NULL) {
- kmem_cache_free(tfrc_lh_slab,
- lh->ring[LIH_INDEX(lh->counter)]);
- lh->ring[LIH_INDEX(lh->counter)] = NULL;
- }
-}
-
-static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh)
-{
- u32 i_i, i_tot0 = 0, i_tot1 = 0, w_tot = 0;
- int i, k = tfrc_lh_length(lh) - 1; /* k is as in rfc3448bis, 5.4 */
-
- if (k <= 0)
- return;
-
- for (i = 0; i <= k; i++) {
- i_i = tfrc_lh_get_interval(lh, i);
-
- if (i < k) {
- i_tot0 += i_i * tfrc_lh_weights[i];
- w_tot += tfrc_lh_weights[i];
- }
- if (i > 0)
- i_tot1 += i_i * tfrc_lh_weights[i-1];
- }
-
- lh->i_mean = max(i_tot0, i_tot1) / w_tot;
-}
-
-/**
- * tfrc_lh_update_i_mean - Update the `open' loss interval I_0
- * For recomputing p: returns `true' if p > p_prev <=> 1/p < 1/p_prev
- */
-u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
-{
- struct tfrc_loss_interval *cur = tfrc_lh_peek(lh);
- u32 old_i_mean = lh->i_mean;
- s64 len;
-
- if (cur == NULL) /* not initialised */
- return 0;
-
- len = dccp_delta_seqno(cur->li_seqno, DCCP_SKB_CB(skb)->dccpd_seq) + 1;
-
- if (len - (s64)cur->li_length <= 0) /* duplicate or reordered */
- return 0;
-
- if (SUB16(dccp_hdr(skb)->dccph_ccval, cur->li_ccval) > 4)
- /*
- * Implements RFC 4342, 10.2:
- * If a packet S (skb) exists whose seqno comes `after' the one
- * starting the current loss interval (cur) and if the modulo-16
- * distance from C(cur) to C(S) is greater than 4, consider all
- * subsequent packets as belonging to a new loss interval. This
- * test is necessary since CCVal may wrap between intervals.
- */
- cur->li_is_closed = 1;
-
- if (tfrc_lh_length(lh) == 1) /* due to RFC 3448, 6.3.1 */
- return 0;
-
- cur->li_length = len;
- tfrc_lh_calc_i_mean(lh);
-
- return lh->i_mean < old_i_mean;
-}
-
-/* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
-static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur,
- struct tfrc_rx_hist_entry *new_loss)
-{
- return dccp_delta_seqno(cur->li_seqno, new_loss->tfrchrx_seqno) > 0 &&
- (cur->li_is_closed || SUB16(new_loss->tfrchrx_ccval, cur->li_ccval) > 4);
-}
-
-/**
- * tfrc_lh_interval_add - Insert new record into the Loss Interval database
- * @lh: Loss Interval database
- * @rh: Receive history containing a fresh loss event
- * @calc_first_li: Caller-dependent routine to compute length of first interval
- * @sk: Used by @calc_first_li in caller-specific way (subtyping)
- *
- * Updates I_mean and returns 1 if a new interval has in fact been added to @lh.
- */
-int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh,
- u32 (*calc_first_li)(struct sock *), struct sock *sk)
-{
- struct tfrc_loss_interval *cur = tfrc_lh_peek(lh), *new;
-
- if (cur != NULL && !tfrc_lh_is_new_loss(cur, tfrc_rx_hist_loss_prev(rh)))
- return 0;
-
- new = tfrc_lh_demand_next(lh);
- if (unlikely(new == NULL)) {
- DCCP_CRIT("Cannot allocate/add loss record.");
- return 0;
- }
-
- new->li_seqno = tfrc_rx_hist_loss_prev(rh)->tfrchrx_seqno;
- new->li_ccval = tfrc_rx_hist_loss_prev(rh)->tfrchrx_ccval;
- new->li_is_closed = 0;
-
- if (++lh->counter == 1)
- lh->i_mean = new->li_length = (*calc_first_li)(sk);
- else {
- cur->li_length = dccp_delta_seqno(cur->li_seqno, new->li_seqno);
- new->li_length = dccp_delta_seqno(new->li_seqno,
- tfrc_rx_hist_last_rcv(rh)->tfrchrx_seqno) + 1;
- if (lh->counter > (2*LIH_SIZE))
- lh->counter -= LIH_SIZE;
-
- tfrc_lh_calc_i_mean(lh);
- }
- return 1;
-}
-
-int __init tfrc_li_init(void)
-{
- tfrc_lh_slab = kmem_cache_create("tfrc_li_hist",
- sizeof(struct tfrc_loss_interval), 0,
- SLAB_HWCACHE_ALIGN, NULL);
- return tfrc_lh_slab == NULL ? -ENOBUFS : 0;
-}
-
-void tfrc_li_exit(void)
-{
- if (tfrc_lh_slab != NULL) {
- kmem_cache_destroy(tfrc_lh_slab);
- tfrc_lh_slab = NULL;
- }
-}
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
deleted file mode 100644
index 57f631a86ccd..000000000000
--- a/net/dccp/ccids/lib/loss_interval.h
+++ /dev/null
@@ -1,73 +0,0 @@
-#ifndef _DCCP_LI_HIST_
-#define _DCCP_LI_HIST_
-/*
- * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
- * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005-7 Ian McDonald <ian.mcdonald@jandi.co.nz>
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- */
-#include <linux/ktime.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-
-/*
- * Number of loss intervals (RFC 4342, 8.6.1). The history size is one more than
- * NINTERVAL, since the `open' interval I_0 is always stored as the first entry.
- */
-#define NINTERVAL 8
-#define LIH_SIZE (NINTERVAL + 1)
-
-/**
- * tfrc_loss_interval - Loss history record for TFRC-based protocols
- * @li_seqno: Highest received seqno before the start of loss
- * @li_ccval: The CCVal belonging to @li_seqno
- * @li_is_closed: Whether @li_seqno is older than 1 RTT
- * @li_length: Loss interval sequence length
- */
-struct tfrc_loss_interval {
- u64 li_seqno:48,
- li_ccval:4,
- li_is_closed:1;
- u32 li_length;
-};
-
-/**
- * tfrc_loss_hist - Loss record database
- * @ring: Circular queue managed in LIFO manner
- * @counter: Current count of entries (can be more than %LIH_SIZE)
- * @i_mean: Current Average Loss Interval [RFC 3448, 5.4]
- */
-struct tfrc_loss_hist {
- struct tfrc_loss_interval *ring[LIH_SIZE];
- u8 counter;
- u32 i_mean;
-};
-
-static inline void tfrc_lh_init(struct tfrc_loss_hist *lh)
-{
- memset(lh, 0, sizeof(struct tfrc_loss_hist));
-}
-
-static inline u8 tfrc_lh_is_initialised(struct tfrc_loss_hist *lh)
-{
- return lh->counter > 0;
-}
-
-static inline u8 tfrc_lh_length(struct tfrc_loss_hist *lh)
-{
- return min(lh->counter, (u8)LIH_SIZE);
-}
-
-struct tfrc_rx_hist;
-
-int tfrc_lh_interval_add(struct tfrc_loss_hist *, struct tfrc_rx_hist *,
- u32 (*first_li)(struct sock *), struct sock *);
-u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *);
-void tfrc_lh_cleanup(struct tfrc_loss_hist *lh);
-
-#endif /* _DCCP_LI_HIST_ */
diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c
deleted file mode 100644
index 876e18592d71..000000000000
--- a/net/dccp/ccids/lib/packet_history.c
+++ /dev/null
@@ -1,447 +0,0 @@
-/*
- * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
- * Copyright (c) 2005-7 The University of Waikato, Hamilton, New Zealand.
- *
- * An implementation of the DCCP protocol
- *
- * This code has been developed by the University of Waikato WAND
- * research group. For further information please see http://www.wand.net.nz/
- * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
- *
- * This code also uses code from Lulea University, rereleased as GPL by its
- * authors:
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- *
- * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
- * and to make it work as a loadable module in the DCCP stack written by
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
- *
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/string.h>
-#include <linux/slab.h>
-#include "packet_history.h"
-#include "../../dccp.h"
-
-/*
- * Transmitter History Routines
- */
-static struct kmem_cache *tfrc_tx_hist_slab;
-
-int __init tfrc_tx_packet_history_init(void)
-{
- tfrc_tx_hist_slab = kmem_cache_create("tfrc_tx_hist",
- sizeof(struct tfrc_tx_hist_entry),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- return tfrc_tx_hist_slab == NULL ? -ENOBUFS : 0;
-}
-
-void tfrc_tx_packet_history_exit(void)
-{
- if (tfrc_tx_hist_slab != NULL) {
- kmem_cache_destroy(tfrc_tx_hist_slab);
- tfrc_tx_hist_slab = NULL;
- }
-}
-
-int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno)
-{
- struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any());
-
- if (entry == NULL)
- return -ENOBUFS;
- entry->seqno = seqno;
- entry->stamp = ktime_get_real();
- entry->next = *headp;
- *headp = entry;
- return 0;
-}
-
-void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp)
-{
- struct tfrc_tx_hist_entry *head = *headp;
-
- while (head != NULL) {
- struct tfrc_tx_hist_entry *next = head->next;
-
- kmem_cache_free(tfrc_tx_hist_slab, head);
- head = next;
- }
-
- *headp = NULL;
-}
-
-/*
- * Receiver History Routines
- */
-static struct kmem_cache *tfrc_rx_hist_slab;
-
-int __init tfrc_rx_packet_history_init(void)
-{
- tfrc_rx_hist_slab = kmem_cache_create("tfrc_rxh_cache",
- sizeof(struct tfrc_rx_hist_entry),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- return tfrc_rx_hist_slab == NULL ? -ENOBUFS : 0;
-}
-
-void tfrc_rx_packet_history_exit(void)
-{
- if (tfrc_rx_hist_slab != NULL) {
- kmem_cache_destroy(tfrc_rx_hist_slab);
- tfrc_rx_hist_slab = NULL;
- }
-}
-
-static inline void tfrc_rx_hist_entry_from_skb(struct tfrc_rx_hist_entry *entry,
- const struct sk_buff *skb,
- const u64 ndp)
-{
- const struct dccp_hdr *dh = dccp_hdr(skb);
-
- entry->tfrchrx_seqno = DCCP_SKB_CB(skb)->dccpd_seq;
- entry->tfrchrx_ccval = dh->dccph_ccval;
- entry->tfrchrx_type = dh->dccph_type;
- entry->tfrchrx_ndp = ndp;
- entry->tfrchrx_tstamp = ktime_get_real();
-}
-
-void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h,
- const struct sk_buff *skb,
- const u64 ndp)
-{
- struct tfrc_rx_hist_entry *entry = tfrc_rx_hist_last_rcv(h);
-
- tfrc_rx_hist_entry_from_skb(entry, skb, ndp);
-}
-
-/* has the packet contained in skb been seen before? */
-int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb)
-{
- const u64 seq = DCCP_SKB_CB(skb)->dccpd_seq;
- int i;
-
- if (dccp_delta_seqno(tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno, seq) <= 0)
- return 1;
-
- for (i = 1; i <= h->loss_count; i++)
- if (tfrc_rx_hist_entry(h, i)->tfrchrx_seqno == seq)
- return 1;
-
- return 0;
-}
-
-static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b)
-{
- const u8 idx_a = tfrc_rx_hist_index(h, a),
- idx_b = tfrc_rx_hist_index(h, b);
-
- swap(h->ring[idx_a], h->ring[idx_b]);
-}
-
-/*
- * Private helper functions for loss detection.
- *
- * In the descriptions, `Si' refers to the sequence number of entry number i,
- * whose NDP count is `Ni' (lower case is used for variables).
- * Note: All __xxx_loss functions expect that a test against duplicates has been
- * performed already: the seqno of the skb must not be less than the seqno
- * of loss_prev; and it must not equal that of any valid history entry.
- */
-static void __do_track_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u64 n1)
-{
- u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
- s1 = DCCP_SKB_CB(skb)->dccpd_seq;
-
- if (!dccp_loss_free(s0, s1, n1)) { /* gap between S0 and S1 */
- h->loss_count = 1;
- tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n1);
- }
-}
-
-static void __one_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n2)
-{
- u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
- s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
- s2 = DCCP_SKB_CB(skb)->dccpd_seq;
-
- if (likely(dccp_delta_seqno(s1, s2) > 0)) { /* S1 < S2 */
- h->loss_count = 2;
- tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n2);
- return;
- }
-
- /* S0 < S2 < S1 */
-
- if (dccp_loss_free(s0, s2, n2)) {
- u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp;
-
- if (dccp_loss_free(s2, s1, n1)) {
- /* hole is filled: S0, S2, and S1 are consecutive */
- h->loss_count = 0;
- h->loss_start = tfrc_rx_hist_index(h, 1);
- } else
- /* gap between S2 and S1: just update loss_prev */
- tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n2);
-
- } else { /* gap between S0 and S2 */
- /*
- * Reorder history to insert S2 between S0 and S1
- */
- tfrc_rx_hist_swap(h, 0, 3);
- h->loss_start = tfrc_rx_hist_index(h, 3);
- tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n2);
- h->loss_count = 2;
- }
-}
-
-/* return 1 if a new loss event has been identified */
-static int __two_after_loss(struct tfrc_rx_hist *h, struct sk_buff *skb, u32 n3)
-{
- u64 s0 = tfrc_rx_hist_loss_prev(h)->tfrchrx_seqno,
- s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
- s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno,
- s3 = DCCP_SKB_CB(skb)->dccpd_seq;
-
- if (likely(dccp_delta_seqno(s2, s3) > 0)) { /* S2 < S3 */
- h->loss_count = 3;
- tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 3), skb, n3);
- return 1;
- }
-
- /* S3 < S2 */
-
- if (dccp_delta_seqno(s1, s3) > 0) { /* S1 < S3 < S2 */
- /*
- * Reorder history to insert S3 between S1 and S2
- */
- tfrc_rx_hist_swap(h, 2, 3);
- tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 2), skb, n3);
- h->loss_count = 3;
- return 1;
- }
-
- /* S0 < S3 < S1 */
-
- if (dccp_loss_free(s0, s3, n3)) {
- u64 n1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_ndp;
-
- if (dccp_loss_free(s3, s1, n1)) {
- /* hole between S0 and S1 filled by S3 */
- u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp;
-
- if (dccp_loss_free(s1, s2, n2)) {
- /* entire hole filled by S0, S3, S1, S2 */
- h->loss_start = tfrc_rx_hist_index(h, 2);
- h->loss_count = 0;
- } else {
- /* gap remains between S1 and S2 */
- h->loss_start = tfrc_rx_hist_index(h, 1);
- h->loss_count = 1;
- }
-
- } else /* gap exists between S3 and S1, loss_count stays at 2 */
- tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_loss_prev(h), skb, n3);
-
- return 0;
- }
-
- /*
- * The remaining case: S0 < S3 < S1 < S2; gap between S0 and S3
- * Reorder history to insert S3 between S0 and S1.
- */
- tfrc_rx_hist_swap(h, 0, 3);
- h->loss_start = tfrc_rx_hist_index(h, 3);
- tfrc_rx_hist_entry_from_skb(tfrc_rx_hist_entry(h, 1), skb, n3);
- h->loss_count = 3;
-
- return 1;
-}
-
-/* recycle RX history records to continue loss detection if necessary */
-static void __three_after_loss(struct tfrc_rx_hist *h)
-{
- /*
- * At this stage we know already that there is a gap between S0 and S1
- * (since S0 was the highest sequence number received before detecting
- * the loss). To recycle the loss record, it is thus only necessary to
- * check for other possible gaps between S1/S2 and between S2/S3.
- */
- u64 s1 = tfrc_rx_hist_entry(h, 1)->tfrchrx_seqno,
- s2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_seqno,
- s3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_seqno;
- u64 n2 = tfrc_rx_hist_entry(h, 2)->tfrchrx_ndp,
- n3 = tfrc_rx_hist_entry(h, 3)->tfrchrx_ndp;
-
- if (dccp_loss_free(s1, s2, n2)) {
-
- if (dccp_loss_free(s2, s3, n3)) {
- /* no gap between S2 and S3: entire hole is filled */
- h->loss_start = tfrc_rx_hist_index(h, 3);
- h->loss_count = 0;
- } else {
- /* gap between S2 and S3 */
- h->loss_start = tfrc_rx_hist_index(h, 2);
- h->loss_count = 1;
- }
-
- } else { /* gap between S1 and S2 */
- h->loss_start = tfrc_rx_hist_index(h, 1);
- h->loss_count = 2;
- }
-}
-
-/**
- * tfrc_rx_handle_loss - Loss detection and further processing
- * @h: The non-empty RX history object
- * @lh: Loss Intervals database to update
- * @skb: Currently received packet
- * @ndp: The NDP count belonging to @skb
- * @calc_first_li: Caller-dependent computation of first loss interval in @lh
- * @sk: Used by @calc_first_li (see tfrc_lh_interval_add)
- *
- * Chooses action according to pending loss, updates LI database when a new
- * loss was detected, and does required post-processing. Returns 1 when caller
- * should send feedback, 0 otherwise.
- * Since it also takes care of reordering during loss detection and updates the
- * records accordingly, the caller should not perform any more RX history
- * operations when loss_count is greater than 0 after calling this function.
- */
-int tfrc_rx_handle_loss(struct tfrc_rx_hist *h,
- struct tfrc_loss_hist *lh,
- struct sk_buff *skb, const u64 ndp,
- u32 (*calc_first_li)(struct sock *), struct sock *sk)
-{
- int is_new_loss = 0;
-
- if (h->loss_count == 0) {
- __do_track_loss(h, skb, ndp);
- } else if (h->loss_count == 1) {
- __one_after_loss(h, skb, ndp);
- } else if (h->loss_count != 2) {
- DCCP_BUG("invalid loss_count %d", h->loss_count);
- } else if (__two_after_loss(h, skb, ndp)) {
- /*
- * Update Loss Interval database and recycle RX records
- */
- is_new_loss = tfrc_lh_interval_add(lh, h, calc_first_li, sk);
- __three_after_loss(h);
- }
- return is_new_loss;
-}
-
-int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h)
-{
- int i;
-
- for (i = 0; i <= TFRC_NDUPACK; i++) {
- h->ring[i] = kmem_cache_alloc(tfrc_rx_hist_slab, GFP_ATOMIC);
- if (h->ring[i] == NULL)
- goto out_free;
- }
-
- h->loss_count = h->loss_start = 0;
- return 0;
-
-out_free:
- while (i-- != 0) {
- kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
- h->ring[i] = NULL;
- }
- return -ENOBUFS;
-}
-
-void tfrc_rx_hist_purge(struct tfrc_rx_hist *h)
-{
- int i;
-
- for (i = 0; i <= TFRC_NDUPACK; ++i)
- if (h->ring[i] != NULL) {
- kmem_cache_free(tfrc_rx_hist_slab, h->ring[i]);
- h->ring[i] = NULL;
- }
-}
-
-/**
- * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against
- */
-static inline struct tfrc_rx_hist_entry *
- tfrc_rx_hist_rtt_last_s(const struct tfrc_rx_hist *h)
-{
- return h->ring[0];
-}
-
-/**
- * tfrc_rx_hist_rtt_prev_s - previously suitable (wrt rtt_last_s) RTT-sampling entry
- */
-static inline struct tfrc_rx_hist_entry *
- tfrc_rx_hist_rtt_prev_s(const struct tfrc_rx_hist *h)
-{
- return h->ring[h->rtt_sample_prev];
-}
-
-/**
- * tfrc_rx_hist_sample_rtt - Sample RTT from timestamp / CCVal
- * Based on ideas presented in RFC 4342, 8.1. Returns 0 if it was not able
- * to compute a sample with given data - calling function should check this.
- */
-u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb)
-{
- u32 sample = 0,
- delta_v = SUB16(dccp_hdr(skb)->dccph_ccval,
- tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
-
- if (delta_v < 1 || delta_v > 4) { /* unsuitable CCVal delta */
- if (h->rtt_sample_prev == 2) { /* previous candidate stored */
- sample = SUB16(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
- tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
- if (sample)
- sample = 4 / sample *
- ktime_us_delta(tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_tstamp,
- tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp);
- else /*
- * FIXME: This condition is in principle not
- * possible but occurs when CCID is used for
- * two-way data traffic. I have tried to trace
- * it, but the cause does not seem to be here.
- */
- DCCP_BUG("please report to dccp@vger.kernel.org"
- " => prev = %u, last = %u",
- tfrc_rx_hist_rtt_prev_s(h)->tfrchrx_ccval,
- tfrc_rx_hist_rtt_last_s(h)->tfrchrx_ccval);
- } else if (delta_v < 1) {
- h->rtt_sample_prev = 1;
- goto keep_ref_for_next_time;
- }
-
- } else if (delta_v == 4) /* optimal match */
- sample = ktime_to_us(net_timedelta(tfrc_rx_hist_rtt_last_s(h)->tfrchrx_tstamp));
- else { /* suboptimal match */
- h->rtt_sample_prev = 2;
- goto keep_ref_for_next_time;
- }
-
- if (unlikely(sample > DCCP_SANE_RTT_MAX)) {
- DCCP_WARN("RTT sample %u too large, using max\n", sample);
- sample = DCCP_SANE_RTT_MAX;
- }
-
- h->rtt_sample_prev = 0; /* use current entry as next reference */
-keep_ref_for_next_time:
-
- return sample;
-}
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
deleted file mode 100644
index ee362b0b630d..000000000000
--- a/net/dccp/ccids/lib/packet_history.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Packet RX/TX history data structures and routines for TFRC-based protocols.
- *
- * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
- * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
- *
- * This code has been developed by the University of Waikato WAND
- * research group. For further information please see http://www.wand.net.nz/
- * or e-mail Ian McDonald - ian.mcdonald@jandi.co.nz
- *
- * This code also uses code from Lulea University, rereleased as GPL by its
- * authors:
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- *
- * Changes to meet Linux coding standards, to make it meet latest ccid3 draft
- * and to make it work as a loadable module in the DCCP stack written by
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>.
- *
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#ifndef _DCCP_PKT_HIST_
-#define _DCCP_PKT_HIST_
-
-#include <linux/list.h>
-#include <linux/slab.h>
-#include "tfrc.h"
-
-/**
- * tfrc_tx_hist_entry - Simple singly-linked TX history list
- * @next: next oldest entry (LIFO order)
- * @seqno: sequence number of this entry
- * @stamp: send time of packet with sequence number @seqno
- */
-struct tfrc_tx_hist_entry {
- struct tfrc_tx_hist_entry *next;
- u64 seqno;
- ktime_t stamp;
-};
-
-static inline struct tfrc_tx_hist_entry *
- tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno)
-{
- while (head != NULL && head->seqno != seqno)
- head = head->next;
- return head;
-}
-
-int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno);
-void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp);
-
-/* Subtraction a-b modulo-16, respects circular wrap-around */
-#define SUB16(a, b) (((a) + 16 - (b)) & 0xF)
-
-/* Number of packets to wait after a missing packet (RFC 4342, 6.1) */
-#define TFRC_NDUPACK 3
-
-/**
- * tfrc_rx_hist_entry - Store information about a single received packet
- * @tfrchrx_seqno: DCCP packet sequence number
- * @tfrchrx_ccval: window counter value of packet (RFC 4342, 8.1)
- * @tfrchrx_ndp: the NDP count (if any) of the packet
- * @tfrchrx_tstamp: actual receive time of packet
- */
-struct tfrc_rx_hist_entry {
- u64 tfrchrx_seqno:48,
- tfrchrx_ccval:4,
- tfrchrx_type:4;
- u64 tfrchrx_ndp:48;
- ktime_t tfrchrx_tstamp;
-};
-
-/**
- * tfrc_rx_hist - RX history structure for TFRC-based protocols
- * @ring: Packet history for RTT sampling and loss detection
- * @loss_count: Number of entries in circular history
- * @loss_start: Movable index (for loss detection)
- * @rtt_sample_prev: Used during RTT sampling, points to candidate entry
- */
-struct tfrc_rx_hist {
- struct tfrc_rx_hist_entry *ring[TFRC_NDUPACK + 1];
- u8 loss_count:2,
- loss_start:2;
-#define rtt_sample_prev loss_start
-};
-
-/**
- * tfrc_rx_hist_index - index to reach n-th entry after loss_start
- */
-static inline u8 tfrc_rx_hist_index(const struct tfrc_rx_hist *h, const u8 n)
-{
- return (h->loss_start + n) & TFRC_NDUPACK;
-}
-
-/**
- * tfrc_rx_hist_last_rcv - entry with highest-received-seqno so far
- */
-static inline struct tfrc_rx_hist_entry *
- tfrc_rx_hist_last_rcv(const struct tfrc_rx_hist *h)
-{
- return h->ring[tfrc_rx_hist_index(h, h->loss_count)];
-}
-
-/**
- * tfrc_rx_hist_entry - return the n-th history entry after loss_start
- */
-static inline struct tfrc_rx_hist_entry *
- tfrc_rx_hist_entry(const struct tfrc_rx_hist *h, const u8 n)
-{
- return h->ring[tfrc_rx_hist_index(h, n)];
-}
-
-/**
- * tfrc_rx_hist_loss_prev - entry with highest-received-seqno before loss was detected
- */
-static inline struct tfrc_rx_hist_entry *
- tfrc_rx_hist_loss_prev(const struct tfrc_rx_hist *h)
-{
- return h->ring[h->loss_start];
-}
-
-/* indicate whether previously a packet was detected missing */
-static inline bool tfrc_rx_hist_loss_pending(const struct tfrc_rx_hist *h)
-{
- return h->loss_count > 0;
-}
-
-void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, const struct sk_buff *skb,
- const u64 ndp);
-
-int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb);
-
-struct tfrc_loss_hist;
-int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, struct tfrc_loss_hist *lh,
- struct sk_buff *skb, const u64 ndp,
- u32 (*first_li)(struct sock *sk), struct sock *sk);
-u32 tfrc_rx_hist_sample_rtt(struct tfrc_rx_hist *h, const struct sk_buff *skb);
-int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h);
-void tfrc_rx_hist_purge(struct tfrc_rx_hist *h);
-
-#endif /* _DCCP_PKT_HIST_ */
diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c
deleted file mode 100644
index d7f265e1f50c..000000000000
--- a/net/dccp/ccids/lib/tfrc.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * TFRC library initialisation
- *
- * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
- * Copyright (c) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
- */
-#include <linux/moduleparam.h>
-#include "tfrc.h"
-
-#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
-bool tfrc_debug;
-module_param(tfrc_debug, bool, 0644);
-MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages");
-#endif
-
-int __init tfrc_lib_init(void)
-{
- int rc = tfrc_li_init();
-
- if (rc)
- goto out;
-
- rc = tfrc_tx_packet_history_init();
- if (rc)
- goto out_free_loss_intervals;
-
- rc = tfrc_rx_packet_history_init();
- if (rc)
- goto out_free_tx_history;
- return 0;
-
-out_free_tx_history:
- tfrc_tx_packet_history_exit();
-out_free_loss_intervals:
- tfrc_li_exit();
-out:
- return rc;
-}
-
-void tfrc_lib_exit(void)
-{
- tfrc_rx_packet_history_exit();
- tfrc_tx_packet_history_exit();
- tfrc_li_exit();
-}
diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h
deleted file mode 100644
index 40ee7d62b652..000000000000
--- a/net/dccp/ccids/lib/tfrc.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef _TFRC_H_
-#define _TFRC_H_
-/*
- * Copyright (c) 2007 The University of Aberdeen, Scotland, UK
- * Copyright (c) 2005-6 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-#include <linux/types.h>
-#include <linux/math64.h>
-#include "../../dccp.h"
-
-/* internal includes that this library exports: */
-#include "loss_interval.h"
-#include "packet_history.h"
-
-#ifdef CONFIG_IP_DCCP_TFRC_DEBUG
-extern bool tfrc_debug;
-#define tfrc_pr_debug(format, a...) DCCP_PR_DEBUG(tfrc_debug, format, ##a)
-#else
-#define tfrc_pr_debug(format, a...)
-#endif
-
-/* integer-arithmetic divisions of type (a * 1000000)/b */
-static inline u64 scaled_div(u64 a, u64 b)
-{
- BUG_ON(b == 0);
- return div64_u64(a * 1000000, b);
-}
-
-static inline u32 scaled_div32(u64 a, u64 b)
-{
- u64 result = scaled_div(a, b);
-
- if (result > UINT_MAX) {
- DCCP_CRIT("Overflow: %llu/%llu > UINT_MAX",
- (unsigned long long)a, (unsigned long long)b);
- return UINT_MAX;
- }
- return result;
-}
-
-/**
- * tfrc_ewma - Exponentially weighted moving average
- * @weight: Weight to be used as damping factor, in units of 1/10
- */
-static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight)
-{
- return avg ? (weight * avg + (10 - weight) * newval) / 10 : newval;
-}
-
-u32 tfrc_calc_x(u16 s, u32 R, u32 p);
-u32 tfrc_calc_x_reverse_lookup(u32 fvalue);
-u32 tfrc_invert_loss_event_rate(u32 loss_event_rate);
-
-int tfrc_tx_packet_history_init(void);
-void tfrc_tx_packet_history_exit(void);
-int tfrc_rx_packet_history_init(void);
-void tfrc_rx_packet_history_exit(void);
-
-int tfrc_li_init(void);
-void tfrc_li_exit(void);
-
-#ifdef CONFIG_IP_DCCP_TFRC_LIB
-int tfrc_lib_init(void);
-void tfrc_lib_exit(void);
-#else
-#define tfrc_lib_init() (0)
-#define tfrc_lib_exit()
-#endif
-#endif /* _TFRC_H_ */
diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c
deleted file mode 100644
index 88ef98285bec..000000000000
--- a/net/dccp/ccids/lib/tfrc_equation.c
+++ /dev/null
@@ -1,705 +0,0 @@
-/*
- * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand.
- * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include "../../dccp.h"
-#include "tfrc.h"
-
-#define TFRC_CALC_X_ARRSIZE 500
-#define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */
-#define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE)
-
-/*
- TFRC TCP Reno Throughput Equation Lookup Table for f(p)
-
- The following two-column lookup table implements a part of the TCP throughput
- equation from [RFC 3448, sec. 3.1]:
-
- s
- X_calc = --------------------------------------------------------------
- R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3))
-
- Where:
- X is the transmit rate in bytes/second
- s is the packet size in bytes
- R is the round trip time in seconds
- p is the loss event rate, between 0 and 1.0, of the number of loss
- events as a fraction of the number of packets transmitted
- t_RTO is the TCP retransmission timeout value in seconds
- b is the number of packets acknowledged by a single TCP ACK
-
- We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes:
-
- s
- X_calc = -------------------------------------------------------
- R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3))
-
- which we can break down into:
-
- s
- X_calc = ---------
- R * f(p)
-
- where f(p) is given for 0 < p <= 1 by:
-
- f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3)
-
- Since this is kernel code, floating-point arithmetic is avoided in favour of
- integer arithmetic. This means that nearly all fractional parameters are
- scaled by 1000000:
- * the parameters p and R
- * the return result f(p)
- The lookup table therefore actually tabulates the following function g(q):
-
- g(q) = 1000000 * f(q/1000000)
-
- Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer
- granularity for the practically more relevant case of small values of p (up to
- 5%), the second column is used; the first one ranges up to 100%. This split
- corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also
- determines the smallest resolution possible with this lookup table:
-
- TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE
-
- The entire table is generated by:
- for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) {
- lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE);
- lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE);
- }
-
- With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1,
- lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%)
- lookup[M][0] = g(1000000) = 1000000 * f(100%)
- lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%)
- lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%)
-
- In summary, the two columns represent f(p) for the following ranges:
- * The first column is for 0.002 <= p <= 1.0
- * The second column is for 0.0001 <= p <= 0.05
- Where the columns overlap, the second (finer-grained) is given preference,
- i.e. the first column is used only for p >= 0.05.
- */
-static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = {
- { 37172, 8172 },
- { 53499, 11567 },
- { 66664, 14180 },
- { 78298, 16388 },
- { 89021, 18339 },
- { 99147, 20108 },
- { 108858, 21738 },
- { 118273, 23260 },
- { 127474, 24693 },
- { 136520, 26052 },
- { 145456, 27348 },
- { 154316, 28589 },
- { 163130, 29783 },
- { 171919, 30935 },
- { 180704, 32049 },
- { 189502, 33130 },
- { 198328, 34180 },
- { 207194, 35202 },
- { 216114, 36198 },
- { 225097, 37172 },
- { 234153, 38123 },
- { 243294, 39055 },
- { 252527, 39968 },
- { 261861, 40864 },
- { 271305, 41743 },
- { 280866, 42607 },
- { 290553, 43457 },
- { 300372, 44293 },
- { 310333, 45117 },
- { 320441, 45929 },
- { 330705, 46729 },
- { 341131, 47518 },
- { 351728, 48297 },
- { 362501, 49066 },
- { 373460, 49826 },
- { 384609, 50577 },
- { 395958, 51320 },
- { 407513, 52054 },
- { 419281, 52780 },
- { 431270, 53499 },
- { 443487, 54211 },
- { 455940, 54916 },
- { 468635, 55614 },
- { 481581, 56306 },
- { 494785, 56991 },
- { 508254, 57671 },
- { 521996, 58345 },
- { 536019, 59014 },
- { 550331, 59677 },
- { 564939, 60335 },
- { 579851, 60988 },
- { 595075, 61636 },
- { 610619, 62279 },
- { 626491, 62918 },
- { 642700, 63553 },
- { 659253, 64183 },
- { 676158, 64809 },
- { 693424, 65431 },
- { 711060, 66050 },
- { 729073, 66664 },
- { 747472, 67275 },
- { 766266, 67882 },
- { 785464, 68486 },
- { 805073, 69087 },
- { 825103, 69684 },
- { 845562, 70278 },
- { 866460, 70868 },
- { 887805, 71456 },
- { 909606, 72041 },
- { 931873, 72623 },
- { 954614, 73202 },
- { 977839, 73778 },
- { 1001557, 74352 },
- { 1025777, 74923 },
- { 1050508, 75492 },
- { 1075761, 76058 },
- { 1101544, 76621 },
- { 1127867, 77183 },
- { 1154739, 77741 },
- { 1182172, 78298 },
- { 1210173, 78852 },
- { 1238753, 79405 },
- { 1267922, 79955 },
- { 1297689, 80503 },
- { 1328066, 81049 },
- { 1359060, 81593 },
- { 1390684, 82135 },
- { 1422947, 82675 },
- { 1455859, 83213 },
- { 1489430, 83750 },
- { 1523671, 84284 },
- { 1558593, 84817 },
- { 1594205, 85348 },
- { 1630518, 85878 },
- { 1667543, 86406 },
- { 1705290, 86932 },
- { 1743770, 87457 },
- { 1782994, 87980 },
- { 1822973, 88501 },
- { 1863717, 89021 },
- { 1905237, 89540 },
- { 1947545, 90057 },
- { 1990650, 90573 },
- { 2034566, 91087 },
- { 2079301, 91600 },
- { 2124869, 92111 },
- { 2171279, 92622 },
- { 2218543, 93131 },
- { 2266673, 93639 },
- { 2315680, 94145 },
- { 2365575, 94650 },
- { 2416371, 95154 },
- { 2468077, 95657 },
- { 2520707, 96159 },
- { 2574271, 96660 },
- { 2628782, 97159 },
- { 2684250, 97658 },
- { 2740689, 98155 },
- { 2798110, 98651 },
- { 2856524, 99147 },
- { 2915944, 99641 },
- { 2976382, 100134 },
- { 3037850, 100626 },
- { 3100360, 101117 },
- { 3163924, 101608 },
- { 3228554, 102097 },
- { 3294263, 102586 },
- { 3361063, 103073 },
- { 3428966, 103560 },
- { 3497984, 104045 },
- { 3568131, 104530 },
- { 3639419, 105014 },
- { 3711860, 105498 },
- { 3785467, 105980 },
- { 3860253, 106462 },
- { 3936229, 106942 },
- { 4013410, 107422 },
- { 4091808, 107902 },
- { 4171435, 108380 },
- { 4252306, 108858 },
- { 4334431, 109335 },
- { 4417825, 109811 },
- { 4502501, 110287 },
- { 4588472, 110762 },
- { 4675750, 111236 },
- { 4764349, 111709 },
- { 4854283, 112182 },
- { 4945564, 112654 },
- { 5038206, 113126 },
- { 5132223, 113597 },
- { 5227627, 114067 },
- { 5324432, 114537 },
- { 5422652, 115006 },
- { 5522299, 115474 },
- { 5623389, 115942 },
- { 5725934, 116409 },
- { 5829948, 116876 },
- { 5935446, 117342 },
- { 6042439, 117808 },
- { 6150943, 118273 },
- { 6260972, 118738 },
- { 6372538, 119202 },
- { 6485657, 119665 },
- { 6600342, 120128 },
- { 6716607, 120591 },
- { 6834467, 121053 },
- { 6953935, 121514 },
- { 7075025, 121976 },
- { 7197752, 122436 },
- { 7322131, 122896 },
- { 7448175, 123356 },
- { 7575898, 123815 },
- { 7705316, 124274 },
- { 7836442, 124733 },
- { 7969291, 125191 },
- { 8103877, 125648 },
- { 8240216, 126105 },
- { 8378321, 126562 },
- { 8518208, 127018 },
- { 8659890, 127474 },
- { 8803384, 127930 },
- { 8948702, 128385 },
- { 9095861, 128840 },
- { 9244875, 129294 },
- { 9395760, 129748 },
- { 9548529, 130202 },
- { 9703198, 130655 },
- { 9859782, 131108 },
- { 10018296, 131561 },
- { 10178755, 132014 },
- { 10341174, 132466 },
- { 10505569, 132917 },
- { 10671954, 133369 },
- { 10840345, 133820 },
- { 11010757, 134271 },
- { 11183206, 134721 },
- { 11357706, 135171 },
- { 11534274, 135621 },
- { 11712924, 136071 },
- { 11893673, 136520 },
- { 12076536, 136969 },
- { 12261527, 137418 },
- { 12448664, 137867 },
- { 12637961, 138315 },
- { 12829435, 138763 },
- { 13023101, 139211 },
- { 13218974, 139658 },
- { 13417071, 140106 },
- { 13617407, 140553 },
- { 13819999, 140999 },
- { 14024862, 141446 },
- { 14232012, 141892 },
- { 14441465, 142339 },
- { 14653238, 142785 },
- { 14867346, 143230 },
- { 15083805, 143676 },
- { 15302632, 144121 },
- { 15523842, 144566 },
- { 15747453, 145011 },
- { 15973479, 145456 },
- { 16201939, 145900 },
- { 16432847, 146345 },
- { 16666221, 146789 },
- { 16902076, 147233 },
- { 17140429, 147677 },
- { 17381297, 148121 },
- { 17624696, 148564 },
- { 17870643, 149007 },
- { 18119154, 149451 },
- { 18370247, 149894 },
- { 18623936, 150336 },
- { 18880241, 150779 },
- { 19139176, 151222 },
- { 19400759, 151664 },
- { 19665007, 152107 },
- { 19931936, 152549 },
- { 20201564, 152991 },
- { 20473907, 153433 },
- { 20748982, 153875 },
- { 21026807, 154316 },
- { 21307399, 154758 },
- { 21590773, 155199 },
- { 21876949, 155641 },
- { 22165941, 156082 },
- { 22457769, 156523 },
- { 22752449, 156964 },
- { 23049999, 157405 },
- { 23350435, 157846 },
- { 23653774, 158287 },
- { 23960036, 158727 },
- { 24269236, 159168 },
- { 24581392, 159608 },
- { 24896521, 160049 },
- { 25214642, 160489 },
- { 25535772, 160929 },
- { 25859927, 161370 },
- { 26187127, 161810 },
- { 26517388, 162250 },
- { 26850728, 162690 },
- { 27187165, 163130 },
- { 27526716, 163569 },
- { 27869400, 164009 },
- { 28215234, 164449 },
- { 28564236, 164889 },
- { 28916423, 165328 },
- { 29271815, 165768 },
- { 29630428, 166208 },
- { 29992281, 166647 },
- { 30357392, 167087 },
- { 30725779, 167526 },
- { 31097459, 167965 },
- { 31472452, 168405 },
- { 31850774, 168844 },
- { 32232445, 169283 },
- { 32617482, 169723 },
- { 33005904, 170162 },
- { 33397730, 170601 },
- { 33792976, 171041 },
- { 34191663, 171480 },
- { 34593807, 171919 },
- { 34999428, 172358 },
- { 35408544, 172797 },
- { 35821174, 173237 },
- { 36237335, 173676 },
- { 36657047, 174115 },
- { 37080329, 174554 },
- { 37507197, 174993 },
- { 37937673, 175433 },
- { 38371773, 175872 },
- { 38809517, 176311 },
- { 39250924, 176750 },
- { 39696012, 177190 },
- { 40144800, 177629 },
- { 40597308, 178068 },
- { 41053553, 178507 },
- { 41513554, 178947 },
- { 41977332, 179386 },
- { 42444904, 179825 },
- { 42916290, 180265 },
- { 43391509, 180704 },
- { 43870579, 181144 },
- { 44353520, 181583 },
- { 44840352, 182023 },
- { 45331092, 182462 },
- { 45825761, 182902 },
- { 46324378, 183342 },
- { 46826961, 183781 },
- { 47333531, 184221 },
- { 47844106, 184661 },
- { 48358706, 185101 },
- { 48877350, 185541 },
- { 49400058, 185981 },
- { 49926849, 186421 },
- { 50457743, 186861 },
- { 50992759, 187301 },
- { 51531916, 187741 },
- { 52075235, 188181 },
- { 52622735, 188622 },
- { 53174435, 189062 },
- { 53730355, 189502 },
- { 54290515, 189943 },
- { 54854935, 190383 },
- { 55423634, 190824 },
- { 55996633, 191265 },
- { 56573950, 191706 },
- { 57155606, 192146 },
- { 57741621, 192587 },
- { 58332014, 193028 },
- { 58926806, 193470 },
- { 59526017, 193911 },
- { 60129666, 194352 },
- { 60737774, 194793 },
- { 61350361, 195235 },
- { 61967446, 195677 },
- { 62589050, 196118 },
- { 63215194, 196560 },
- { 63845897, 197002 },
- { 64481179, 197444 },
- { 65121061, 197886 },
- { 65765563, 198328 },
- { 66414705, 198770 },
- { 67068508, 199213 },
- { 67726992, 199655 },
- { 68390177, 200098 },
- { 69058085, 200540 },
- { 69730735, 200983 },
- { 70408147, 201426 },
- { 71090343, 201869 },
- { 71777343, 202312 },
- { 72469168, 202755 },
- { 73165837, 203199 },
- { 73867373, 203642 },
- { 74573795, 204086 },
- { 75285124, 204529 },
- { 76001380, 204973 },
- { 76722586, 205417 },
- { 77448761, 205861 },
- { 78179926, 206306 },
- { 78916102, 206750 },
- { 79657310, 207194 },
- { 80403571, 207639 },
- { 81154906, 208084 },
- { 81911335, 208529 },
- { 82672880, 208974 },
- { 83439562, 209419 },
- { 84211402, 209864 },
- { 84988421, 210309 },
- { 85770640, 210755 },
- { 86558080, 211201 },
- { 87350762, 211647 },
- { 88148708, 212093 },
- { 88951938, 212539 },
- { 89760475, 212985 },
- { 90574339, 213432 },
- { 91393551, 213878 },
- { 92218133, 214325 },
- { 93048107, 214772 },
- { 93883493, 215219 },
- { 94724314, 215666 },
- { 95570590, 216114 },
- { 96422343, 216561 },
- { 97279594, 217009 },
- { 98142366, 217457 },
- { 99010679, 217905 },
- { 99884556, 218353 },
- { 100764018, 218801 },
- { 101649086, 219250 },
- { 102539782, 219698 },
- { 103436128, 220147 },
- { 104338146, 220596 },
- { 105245857, 221046 },
- { 106159284, 221495 },
- { 107078448, 221945 },
- { 108003370, 222394 },
- { 108934074, 222844 },
- { 109870580, 223294 },
- { 110812910, 223745 },
- { 111761087, 224195 },
- { 112715133, 224646 },
- { 113675069, 225097 },
- { 114640918, 225548 },
- { 115612702, 225999 },
- { 116590442, 226450 },
- { 117574162, 226902 },
- { 118563882, 227353 },
- { 119559626, 227805 },
- { 120561415, 228258 },
- { 121569272, 228710 },
- { 122583219, 229162 },
- { 123603278, 229615 },
- { 124629471, 230068 },
- { 125661822, 230521 },
- { 126700352, 230974 },
- { 127745083, 231428 },
- { 128796039, 231882 },
- { 129853241, 232336 },
- { 130916713, 232790 },
- { 131986475, 233244 },
- { 133062553, 233699 },
- { 134144966, 234153 },
- { 135233739, 234608 },
- { 136328894, 235064 },
- { 137430453, 235519 },
- { 138538440, 235975 },
- { 139652876, 236430 },
- { 140773786, 236886 },
- { 141901190, 237343 },
- { 143035113, 237799 },
- { 144175576, 238256 },
- { 145322604, 238713 },
- { 146476218, 239170 },
- { 147636442, 239627 },
- { 148803298, 240085 },
- { 149976809, 240542 },
- { 151156999, 241000 },
- { 152343890, 241459 },
- { 153537506, 241917 },
- { 154737869, 242376 },
- { 155945002, 242835 },
- { 157158929, 243294 },
- { 158379673, 243753 },
- { 159607257, 244213 },
- { 160841704, 244673 },
- { 162083037, 245133 },
- { 163331279, 245593 },
- { 164586455, 246054 },
- { 165848586, 246514 },
- { 167117696, 246975 },
- { 168393810, 247437 },
- { 169676949, 247898 },
- { 170967138, 248360 },
- { 172264399, 248822 },
- { 173568757, 249284 },
- { 174880235, 249747 },
- { 176198856, 250209 },
- { 177524643, 250672 },
- { 178857621, 251136 },
- { 180197813, 251599 },
- { 181545242, 252063 },
- { 182899933, 252527 },
- { 184261908, 252991 },
- { 185631191, 253456 },
- { 187007807, 253920 },
- { 188391778, 254385 },
- { 189783129, 254851 },
- { 191181884, 255316 },
- { 192588065, 255782 },
- { 194001698, 256248 },
- { 195422805, 256714 },
- { 196851411, 257181 },
- { 198287540, 257648 },
- { 199731215, 258115 },
- { 201182461, 258582 },
- { 202641302, 259050 },
- { 204107760, 259518 },
- { 205581862, 259986 },
- { 207063630, 260454 },
- { 208553088, 260923 },
- { 210050262, 261392 },
- { 211555174, 261861 },
- { 213067849, 262331 },
- { 214588312, 262800 },
- { 216116586, 263270 },
- { 217652696, 263741 },
- { 219196666, 264211 },
- { 220748520, 264682 },
- { 222308282, 265153 },
- { 223875978, 265625 },
- { 225451630, 266097 },
- { 227035265, 266569 },
- { 228626905, 267041 },
- { 230226576, 267514 },
- { 231834302, 267986 },
- { 233450107, 268460 },
- { 235074016, 268933 },
- { 236706054, 269407 },
- { 238346244, 269881 },
- { 239994613, 270355 },
- { 241651183, 270830 },
- { 243315981, 271305 }
-};
-
-/* return largest index i such that fval <= lookup[i][small] */
-static inline u32 tfrc_binsearch(u32 fval, u8 small)
-{
- u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1;
-
- while (low < high) {
- try = (low + high) / 2;
- if (fval <= tfrc_calc_x_lookup[try][small])
- high = try;
- else
- low = try + 1;
- }
- return high;
-}
-
-/**
- * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448
- * @s: packet size in bytes
- * @R: RTT scaled by 1000000 (i.e., microseconds)
- * @p: loss ratio estimate scaled by 1000000
- *
- * Returns X_calc in bytes per second (not scaled).
- */
-u32 tfrc_calc_x(u16 s, u32 R, u32 p)
-{
- u16 index;
- u32 f;
- u64 result;
-
- /* check against invalid parameters and divide-by-zero */
- BUG_ON(p > 1000000); /* p must not exceed 100% */
- BUG_ON(p == 0); /* f(0) = 0, divide by zero */
- if (R == 0) { /* possible divide by zero */
- DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc.");
- return ~0U;
- }
-
- if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */
- if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */
- DCCP_WARN("Value of p (%d) below resolution. "
- "Substituting %d\n", p, TFRC_SMALLEST_P);
- index = 0;
- } else /* 0.0001 <= p <= 0.05 */
- index = p/TFRC_SMALLEST_P - 1;
-
- f = tfrc_calc_x_lookup[index][1];
-
- } else { /* 0.05 < p <= 1.00 */
- index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1;
-
- f = tfrc_calc_x_lookup[index][0];
- }
-
- /*
- * Compute X = s/(R*f(p)) in bytes per second.
- * Since f(p) and R are both scaled by 1000000, we need to multiply by
- * 1000000^2. To avoid overflow, the result is computed in two stages.
- * This works under almost all reasonable operational conditions, for a
- * wide range of parameters. Yet, should some strange combination of
- * parameters result in overflow, the use of scaled_div32 will catch
- * this and return UINT_MAX - which is a logically adequate consequence.
- */
- result = scaled_div(s, R);
- return scaled_div32(result, f);
-}
-
-/**
- * tfrc_calc_x_reverse_lookup - try to find p given f(p)
- * @fvalue: function value to match, scaled by 1000000
- *
- * Returns closest match for p, also scaled by 1000000
- */
-u32 tfrc_calc_x_reverse_lookup(u32 fvalue)
-{
- int index;
-
- if (fvalue == 0) /* f(p) = 0 whenever p = 0 */
- return 0;
-
- /* Error cases. */
- if (fvalue < tfrc_calc_x_lookup[0][1]) {
- DCCP_WARN("fvalue %u smaller than resolution\n", fvalue);
- return TFRC_SMALLEST_P;
- }
- if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) {
- DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue);
- return 1000000;
- }
-
- if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) {
- index = tfrc_binsearch(fvalue, 1);
- return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE;
- }
-
- /* else ... it must be in the coarse-grained column */
- index = tfrc_binsearch(fvalue, 0);
- return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE;
-}
-
-/**
- * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100%
- * When @loss_event_rate is large, there is a chance that p is truncated to 0.
- * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0.
- */
-u32 tfrc_invert_loss_event_rate(u32 loss_event_rate)
-{
- if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */
- return 0;
- if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */
- return 1000000;
- return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P);
-}
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
deleted file mode 100644
index f91e3816806b..000000000000
--- a/net/dccp/dccp.h
+++ /dev/null
@@ -1,501 +0,0 @@
-#ifndef _DCCP_H
-#define _DCCP_H
-/*
- * net/dccp/dccp.h
- *
- * An implementation of the DCCP protocol
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- * Copyright (c) 2005-6 Ian McDonald <ian.mcdonald@jandi.co.nz>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/dccp.h>
-#include <linux/ktime.h>
-#include <net/snmp.h>
-#include <net/sock.h>
-#include <net/tcp.h>
-#include "ackvec.h"
-
-/*
- * DCCP - specific warning and debugging macros.
- */
-#define DCCP_WARN(fmt, ...) \
- net_warn_ratelimited("%s: " fmt, __func__, ##__VA_ARGS__)
-#define DCCP_CRIT(fmt, a...) printk(KERN_CRIT fmt " at %s:%d/%s()\n", ##a, \
- __FILE__, __LINE__, __func__)
-#define DCCP_BUG(a...) do { DCCP_CRIT("BUG: " a); dump_stack(); } while(0)
-#define DCCP_BUG_ON(cond) do { if (unlikely((cond) != 0)) \
- DCCP_BUG("\"%s\" holds (exception!)", \
- __stringify(cond)); \
- } while (0)
-
-#define DCCP_PRINTK(enable, fmt, args...) do { if (enable) \
- printk(fmt, ##args); \
- } while(0)
-#define DCCP_PR_DEBUG(enable, fmt, a...) DCCP_PRINTK(enable, KERN_DEBUG \
- "%s: " fmt, __func__, ##a)
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-extern bool dccp_debug;
-#define dccp_pr_debug(format, a...) DCCP_PR_DEBUG(dccp_debug, format, ##a)
-#define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a)
-#define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a)
-#else
-#define dccp_pr_debug(format, a...)
-#define dccp_pr_debug_cat(format, a...)
-#define dccp_debug(format, a...)
-#endif
-
-extern struct inet_hashinfo dccp_hashinfo;
-
-extern struct percpu_counter dccp_orphan_count;
-
-void dccp_time_wait(struct sock *sk, int state, int timeo);
-
-/*
- * Set safe upper bounds for header and option length. Since Data Offset is 8
- * bits (RFC 4340, sec. 5.1), the total header length can never be more than
- * 4 * 255 = 1020 bytes. The largest possible header length is 28 bytes (X=1):
- * - DCCP-Response with ACK Subheader and 4 bytes of Service code OR
- * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields
- * Hence a safe upper bound for the maximum option length is 1020-28 = 992
- */
-#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t))
-#define DCCP_MAX_PACKET_HDR 28
-#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR)
-#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER)
-
-/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */
-#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t))
-
-#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
- * state, about 60 seconds */
-
-/* RFC 1122, 4.2.3.1 initial RTO value */
-#define DCCP_TIMEOUT_INIT ((unsigned int)(3 * HZ))
-
-/*
- * The maximum back-off value for retransmissions. This is needed for
- * - retransmitting client-Requests (sec. 8.1.1),
- * - retransmitting Close/CloseReq when closing (sec. 8.3),
- * - feature-negotiation retransmission (sec. 6.6.3),
- * - Acks in client-PARTOPEN state (sec. 8.1.5).
- */
-#define DCCP_RTO_MAX ((unsigned int)(64 * HZ))
-
-/*
- * RTT sampling: sanity bounds and fallback RTT value from RFC 4340, section 3.4
- */
-#define DCCP_SANE_RTT_MIN 100
-#define DCCP_FALLBACK_RTT (USEC_PER_SEC / 5)
-#define DCCP_SANE_RTT_MAX (3 * USEC_PER_SEC)
-
-/* sysctl variables for DCCP */
-extern int sysctl_dccp_request_retries;
-extern int sysctl_dccp_retries1;
-extern int sysctl_dccp_retries2;
-extern int sysctl_dccp_tx_qlen;
-extern int sysctl_dccp_sync_ratelimit;
-
-/*
- * 48-bit sequence number arithmetic (signed and unsigned)
- */
-#define INT48_MIN 0x800000000000LL /* 2^47 */
-#define UINT48_MAX 0xFFFFFFFFFFFFLL /* 2^48 - 1 */
-#define COMPLEMENT48(x) (0x1000000000000LL - (x)) /* 2^48 - x */
-#define TO_SIGNED48(x) (((x) < INT48_MIN)? (x) : -COMPLEMENT48( (x)))
-#define TO_UNSIGNED48(x) (((x) >= 0)? (x) : COMPLEMENT48(-(x)))
-#define ADD48(a, b) (((a) + (b)) & UINT48_MAX)
-#define SUB48(a, b) ADD48((a), COMPLEMENT48(b))
-
-static inline void dccp_set_seqno(u64 *seqno, u64 value)
-{
- *seqno = value & UINT48_MAX;
-}
-
-static inline void dccp_inc_seqno(u64 *seqno)
-{
- *seqno = ADD48(*seqno, 1);
-}
-
-/* signed mod-2^48 distance: pos. if seqno1 < seqno2, neg. if seqno1 > seqno2 */
-static inline s64 dccp_delta_seqno(const u64 seqno1, const u64 seqno2)
-{
- u64 delta = SUB48(seqno2, seqno1);
-
- return TO_SIGNED48(delta);
-}
-
-/* is seq1 < seq2 ? */
-static inline int before48(const u64 seq1, const u64 seq2)
-{
- return (s64)((seq2 << 16) - (seq1 << 16)) > 0;
-}
-
-/* is seq1 > seq2 ? */
-#define after48(seq1, seq2) before48(seq2, seq1)
-
-/* is seq2 <= seq1 <= seq3 ? */
-static inline int between48(const u64 seq1, const u64 seq2, const u64 seq3)
-{
- return (seq3 << 16) - (seq2 << 16) >= (seq1 << 16) - (seq2 << 16);
-}
-
-static inline u64 max48(const u64 seq1, const u64 seq2)
-{
- return after48(seq1, seq2) ? seq1 : seq2;
-}
-
-/**
- * dccp_loss_count - Approximate the number of lost data packets in a burst loss
- * @s1: last known sequence number before the loss ('hole')
- * @s2: first sequence number seen after the 'hole'
- * @ndp: NDP count on packet with sequence number @s2
- */
-static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp)
-{
- s64 delta = dccp_delta_seqno(s1, s2);
-
- WARN_ON(delta < 0);
- delta -= ndp + 1;
-
- return delta > 0 ? delta : 0;
-}
-
-/**
- * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1
- */
-static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp)
-{
- return dccp_loss_count(s1, s2, ndp) == 0;
-}
-
-enum {
- DCCP_MIB_NUM = 0,
- DCCP_MIB_ACTIVEOPENS, /* ActiveOpens */
- DCCP_MIB_ESTABRESETS, /* EstabResets */
- DCCP_MIB_CURRESTAB, /* CurrEstab */
- DCCP_MIB_OUTSEGS, /* OutSegs */
- DCCP_MIB_OUTRSTS,
- DCCP_MIB_ABORTONTIMEOUT,
- DCCP_MIB_TIMEOUTS,
- DCCP_MIB_ABORTFAILED,
- DCCP_MIB_PASSIVEOPENS,
- DCCP_MIB_ATTEMPTFAILS,
- DCCP_MIB_OUTDATAGRAMS,
- DCCP_MIB_INERRS,
- DCCP_MIB_OPTMANDATORYERROR,
- DCCP_MIB_INVALIDOPT,
- __DCCP_MIB_MAX
-};
-
-#define DCCP_MIB_MAX __DCCP_MIB_MAX
-struct dccp_mib {
- unsigned long mibs[DCCP_MIB_MAX];
-};
-
-DECLARE_SNMP_STAT(struct dccp_mib, dccp_statistics);
-#define DCCP_INC_STATS(field) SNMP_INC_STATS(dccp_statistics, field)
-#define __DCCP_INC_STATS(field) __SNMP_INC_STATS(dccp_statistics, field)
-#define DCCP_DEC_STATS(field) SNMP_DEC_STATS(dccp_statistics, field)
-
-/*
- * Checksumming routines
- */
-static inline unsigned int dccp_csum_coverage(const struct sk_buff *skb)
-{
- const struct dccp_hdr* dh = dccp_hdr(skb);
-
- if (dh->dccph_cscov == 0)
- return skb->len;
- return (dh->dccph_doff + dh->dccph_cscov - 1) * sizeof(u32);
-}
-
-static inline void dccp_csum_outgoing(struct sk_buff *skb)
-{
- unsigned int cov = dccp_csum_coverage(skb);
-
- if (cov >= skb->len)
- dccp_hdr(skb)->dccph_cscov = 0;
-
- skb->csum = skb_checksum(skb, 0, (cov > skb->len)? skb->len : cov, 0);
-}
-
-void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb);
-
-int dccp_retransmit_skb(struct sock *sk);
-
-void dccp_send_ack(struct sock *sk);
-void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
- struct request_sock *rsk);
-
-void dccp_send_sync(struct sock *sk, const u64 seq,
- const enum dccp_pkt_type pkt_type);
-
-/*
- * TX Packet Dequeueing Interface
- */
-void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb);
-bool dccp_qpolicy_full(struct sock *sk);
-void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb);
-struct sk_buff *dccp_qpolicy_top(struct sock *sk);
-struct sk_buff *dccp_qpolicy_pop(struct sock *sk);
-bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param);
-
-/*
- * TX Packet Output and TX Timers
- */
-void dccp_write_xmit(struct sock *sk);
-void dccp_write_space(struct sock *sk);
-void dccp_flush_write_queue(struct sock *sk, long *time_budget);
-
-void dccp_init_xmit_timers(struct sock *sk);
-static inline void dccp_clear_xmit_timers(struct sock *sk)
-{
- inet_csk_clear_xmit_timers(sk);
-}
-
-unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu);
-
-const char *dccp_packet_name(const int type);
-
-void dccp_set_state(struct sock *sk, const int state);
-void dccp_done(struct sock *sk);
-
-int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp,
- struct sk_buff const *skb);
-
-int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
-
-struct sock *dccp_create_openreq_child(const struct sock *sk,
- const struct request_sock *req,
- const struct sk_buff *skb);
-
-int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
-
-struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst,
- struct request_sock *req_unhash,
- bool *own_req);
-struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req);
-
-int dccp_child_process(struct sock *parent, struct sock *child,
- struct sk_buff *skb);
-int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct dccp_hdr *dh, unsigned int len);
-int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned int len);
-
-int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized);
-void dccp_destroy_sock(struct sock *sk);
-
-void dccp_close(struct sock *sk, long timeout);
-struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst,
- struct request_sock *req);
-
-int dccp_connect(struct sock *sk);
-int dccp_disconnect(struct sock *sk, int flags);
-int dccp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
-int dccp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen);
-#ifdef CONFIG_COMPAT
-int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
-int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen);
-#endif
-int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg);
-int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
-int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
- int flags, int *addr_len);
-void dccp_shutdown(struct sock *sk, int how);
-int inet_dccp_listen(struct socket *sock, int backlog);
-__poll_t dccp_poll(struct file *file, struct socket *sock,
- poll_table *wait);
-int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
-void dccp_req_err(struct sock *sk, u64 seq);
-
-struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *skb);
-int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code);
-void dccp_send_close(struct sock *sk, const int active);
-int dccp_invalid_packet(struct sk_buff *skb);
-u32 dccp_sample_rtt(struct sock *sk, long delta);
-
-static inline bool dccp_bad_service_code(const struct sock *sk,
- const __be32 service)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
-
- if (dp->dccps_service == service)
- return false;
- return !dccp_list_has_service(dp->dccps_service_list, service);
-}
-
-/**
- * dccp_skb_cb - DCCP per-packet control information
- * @dccpd_type: one of %dccp_pkt_type (or unknown)
- * @dccpd_ccval: CCVal field (5.1), see e.g. RFC 4342, 8.1
- * @dccpd_reset_code: one of %dccp_reset_codes
- * @dccpd_reset_data: Data1..3 fields (depend on @dccpd_reset_code)
- * @dccpd_opt_len: total length of all options (5.8) in the packet
- * @dccpd_seq: sequence number
- * @dccpd_ack_seq: acknowledgment number subheader field value
- *
- * This is used for transmission as well as for reception.
- */
-struct dccp_skb_cb {
- union {
- struct inet_skb_parm h4;
-#if IS_ENABLED(CONFIG_IPV6)
- struct inet6_skb_parm h6;
-#endif
- } header;
- __u8 dccpd_type:4;
- __u8 dccpd_ccval:4;
- __u8 dccpd_reset_code,
- dccpd_reset_data[3];
- __u16 dccpd_opt_len;
- __u64 dccpd_seq;
- __u64 dccpd_ack_seq;
-};
-
-#define DCCP_SKB_CB(__skb) ((struct dccp_skb_cb *)&((__skb)->cb[0]))
-
-/* RFC 4340, sec. 7.7 */
-static inline int dccp_non_data_packet(const struct sk_buff *skb)
-{
- const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
-
- return type == DCCP_PKT_ACK ||
- type == DCCP_PKT_CLOSE ||
- type == DCCP_PKT_CLOSEREQ ||
- type == DCCP_PKT_RESET ||
- type == DCCP_PKT_SYNC ||
- type == DCCP_PKT_SYNCACK;
-}
-
-/* RFC 4340, sec. 7.7 */
-static inline int dccp_data_packet(const struct sk_buff *skb)
-{
- const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
-
- return type == DCCP_PKT_DATA ||
- type == DCCP_PKT_DATAACK ||
- type == DCCP_PKT_REQUEST ||
- type == DCCP_PKT_RESPONSE;
-}
-
-static inline int dccp_packet_without_ack(const struct sk_buff *skb)
-{
- const __u8 type = DCCP_SKB_CB(skb)->dccpd_type;
-
- return type == DCCP_PKT_DATA || type == DCCP_PKT_REQUEST;
-}
-
-#define DCCP_PKT_WITHOUT_ACK_SEQ (UINT48_MAX << 2)
-
-static inline void dccp_hdr_set_seq(struct dccp_hdr *dh, const u64 gss)
-{
- struct dccp_hdr_ext *dhx = (struct dccp_hdr_ext *)((void *)dh +
- sizeof(*dh));
- dh->dccph_seq2 = 0;
- dh->dccph_seq = htons((gss >> 32) & 0xfffff);
- dhx->dccph_seq_low = htonl(gss & 0xffffffff);
-}
-
-static inline void dccp_hdr_set_ack(struct dccp_hdr_ack_bits *dhack,
- const u64 gsr)
-{
- dhack->dccph_reserved1 = 0;
- dhack->dccph_ack_nr_high = htons(gsr >> 32);
- dhack->dccph_ack_nr_low = htonl(gsr & 0xffffffff);
-}
-
-static inline void dccp_update_gsr(struct sock *sk, u64 seq)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- if (after48(seq, dp->dccps_gsr))
- dp->dccps_gsr = seq;
- /* Sequence validity window depends on remote Sequence Window (7.5.1) */
- dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4);
- /*
- * Adjust SWL so that it is not below ISR. In contrast to RFC 4340,
- * 7.5.1 we perform this check beyond the initial handshake: W/W' are
- * always > 32, so for the first W/W' packets in the lifetime of a
- * connection we always have to adjust SWL.
- * A second reason why we are doing this is that the window depends on
- * the feature-remote value of Sequence Window: nothing stops the peer
- * from updating this value while we are busy adjusting SWL for the
- * first W packets (we would have to count from scratch again then).
- * Therefore it is safer to always make sure that the Sequence Window
- * is not artificially extended by a peer who grows SWL downwards by
- * continually updating the feature-remote Sequence-Window.
- * If sequence numbers wrap it is bad luck. But that will take a while
- * (48 bit), and this measure prevents Sequence-number attacks.
- */
- if (before48(dp->dccps_swl, dp->dccps_isr))
- dp->dccps_swl = dp->dccps_isr;
- dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4);
-}
-
-static inline void dccp_update_gss(struct sock *sk, u64 seq)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- dp->dccps_gss = seq;
- /* Ack validity window depends on local Sequence Window value (7.5.1) */
- dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win);
- /* Adjust AWL so that it is not below ISS - see comment above for SWL */
- if (before48(dp->dccps_awl, dp->dccps_iss))
- dp->dccps_awl = dp->dccps_iss;
- dp->dccps_awh = dp->dccps_gss;
-}
-
-static inline int dccp_ackvec_pending(const struct sock *sk)
-{
- return dccp_sk(sk)->dccps_hc_rx_ackvec != NULL &&
- !dccp_ackvec_is_empty(dccp_sk(sk)->dccps_hc_rx_ackvec);
-}
-
-static inline int dccp_ack_pending(const struct sock *sk)
-{
- return dccp_ackvec_pending(sk) || inet_csk_ack_scheduled(sk);
-}
-
-int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val);
-int dccp_feat_finalise_settings(struct dccp_sock *dp);
-int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq);
-int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*,
- struct sk_buff *skb);
-int dccp_feat_activate_values(struct sock *sk, struct list_head *fn);
-void dccp_feat_list_purge(struct list_head *fn_list);
-
-int dccp_insert_options(struct sock *sk, struct sk_buff *skb);
-int dccp_insert_options_rsk(struct dccp_request_sock *, struct sk_buff *);
-u32 dccp_timestamp(void);
-void dccp_timestamping_init(void);
-int dccp_insert_option(struct sk_buff *skb, unsigned char option,
- const void *value, unsigned char len);
-
-#ifdef CONFIG_SYSCTL
-int dccp_sysctl_init(void);
-void dccp_sysctl_exit(void);
-#else
-static inline int dccp_sysctl_init(void)
-{
- return 0;
-}
-
-static inline void dccp_sysctl_exit(void)
-{
-}
-#endif
-
-#endif /* _DCCP_H */
diff --git a/net/dccp/diag.c b/net/dccp/diag.c
deleted file mode 100644
index 2d84303ea6bf..000000000000
--- a/net/dccp/diag.c
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * net/dccp/diag.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@mandriva.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-
-#include <linux/module.h>
-#include <linux/inet_diag.h>
-
-#include "ccid.h"
-#include "dccp.h"
-
-static void dccp_get_info(struct sock *sk, struct tcp_info *info)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- const struct inet_connection_sock *icsk = inet_csk(sk);
-
- memset(info, 0, sizeof(*info));
-
- info->tcpi_state = sk->sk_state;
- info->tcpi_retransmits = icsk->icsk_retransmits;
- info->tcpi_probes = icsk->icsk_probes_out;
- info->tcpi_backoff = icsk->icsk_backoff;
- info->tcpi_pmtu = icsk->icsk_pmtu_cookie;
-
- if (dp->dccps_hc_rx_ackvec != NULL)
- info->tcpi_options |= TCPI_OPT_SACK;
-
- if (dp->dccps_hc_rx_ccid != NULL)
- ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info);
-
- if (dp->dccps_hc_tx_ccid != NULL)
- ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info);
-}
-
-static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
- void *_info)
-{
- r->idiag_rqueue = r->idiag_wqueue = 0;
-
- if (_info != NULL)
- dccp_get_info(sk, _info);
-}
-
-static void dccp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
-{
- inet_diag_dump_icsk(&dccp_hashinfo, skb, cb, r, bc);
-}
-
-static int dccp_diag_dump_one(struct sk_buff *in_skb,
- const struct nlmsghdr *nlh,
- const struct inet_diag_req_v2 *req)
-{
- return inet_diag_dump_one_icsk(&dccp_hashinfo, in_skb, nlh, req);
-}
-
-static const struct inet_diag_handler dccp_diag_handler = {
- .dump = dccp_diag_dump,
- .dump_one = dccp_diag_dump_one,
- .idiag_get_info = dccp_diag_get_info,
- .idiag_type = IPPROTO_DCCP,
- .idiag_info_size = sizeof(struct tcp_info),
-};
-
-static int __init dccp_diag_init(void)
-{
- return inet_diag_register(&dccp_diag_handler);
-}
-
-static void __exit dccp_diag_fini(void)
-{
- inet_diag_unregister(&dccp_diag_handler);
-}
-
-module_init(dccp_diag_init);
-module_exit(dccp_diag_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
-MODULE_DESCRIPTION("DCCP inet_diag handler");
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-33 /* AF_INET - IPPROTO_DCCP */);
diff --git a/net/dccp/feat.c b/net/dccp/feat.c
deleted file mode 100644
index f227f002c73d..000000000000
--- a/net/dccp/feat.c
+++ /dev/null
@@ -1,1564 +0,0 @@
-/*
- * net/dccp/feat.c
- *
- * Feature negotiation for the DCCP protocol (RFC 4340, section 6)
- *
- * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
- * Rewrote from scratch, some bits from earlier code by
- * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
- *
- *
- * ASSUMPTIONS
- * -----------
- * o Feature negotiation is coordinated with connection setup (as in TCP), wild
- * changes of parameters of an established connection are not supported.
- * o Changing non-negotiable (NN) values is supported in state OPEN/PARTOPEN.
- * o All currently known SP features have 1-byte quantities. If in the future
- * extensions of RFCs 4340..42 define features with item lengths larger than
- * one byte, a feature-specific extension of the code will be required.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/module.h>
-#include <linux/slab.h>
-#include "ccid.h"
-#include "feat.h"
-
-/* feature-specific sysctls - initialised to the defaults from RFC 4340, 6.4 */
-unsigned long sysctl_dccp_sequence_window __read_mostly = 100;
-int sysctl_dccp_rx_ccid __read_mostly = 2,
- sysctl_dccp_tx_ccid __read_mostly = 2;
-
-/*
- * Feature activation handlers.
- *
- * These all use an u64 argument, to provide enough room for NN/SP features. At
- * this stage the negotiated values have been checked to be within their range.
- */
-static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct ccid *new_ccid = ccid_new(ccid, sk, rx);
-
- if (new_ccid == NULL)
- return -ENOMEM;
-
- if (rx) {
- ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
- dp->dccps_hc_rx_ccid = new_ccid;
- } else {
- ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
- dp->dccps_hc_tx_ccid = new_ccid;
- }
- return 0;
-}
-
-static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- if (rx) {
- dp->dccps_r_seq_win = seq_win;
- /* propagate changes to update SWL/SWH */
- dccp_update_gsr(sk, dp->dccps_gsr);
- } else {
- dp->dccps_l_seq_win = seq_win;
- /* propagate changes to update AWL */
- dccp_update_gss(sk, dp->dccps_gss);
- }
- return 0;
-}
-
-static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx)
-{
- if (rx)
- dccp_sk(sk)->dccps_r_ack_ratio = ratio;
- else
- dccp_sk(sk)->dccps_l_ack_ratio = ratio;
- return 0;
-}
-
-static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- if (rx) {
- if (enable && dp->dccps_hc_rx_ackvec == NULL) {
- dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any());
- if (dp->dccps_hc_rx_ackvec == NULL)
- return -ENOMEM;
- } else if (!enable) {
- dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
- dp->dccps_hc_rx_ackvec = NULL;
- }
- }
- return 0;
-}
-
-static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx)
-{
- if (!rx)
- dccp_sk(sk)->dccps_send_ndp_count = (enable > 0);
- return 0;
-}
-
-/*
- * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that
- * `rx' holds when the sending peer informs about his partial coverage via a
- * ChangeR() option. In the other case, we are the sender and the receiver
- * announces its coverage via ChangeL() options. The policy here is to honour
- * such communication by enabling the corresponding partial coverage - but only
- * if it has not been set manually before; the warning here means that all
- * packets will be dropped.
- */
-static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- if (rx)
- dp->dccps_pcrlen = cscov;
- else {
- if (dp->dccps_pcslen == 0)
- dp->dccps_pcslen = cscov;
- else if (cscov > dp->dccps_pcslen)
- DCCP_WARN("CsCov %u too small, peer requires >= %u\n",
- dp->dccps_pcslen, (u8)cscov);
- }
- return 0;
-}
-
-static const struct {
- u8 feat_num; /* DCCPF_xxx */
- enum dccp_feat_type rxtx; /* RX or TX */
- enum dccp_feat_type reconciliation; /* SP or NN */
- u8 default_value; /* as in 6.4 */
- int (*activation_hdlr)(struct sock *sk, u64 val, bool rx);
-/*
- * Lookup table for location and type of features (from RFC 4340/4342)
- * +--------------------------+----+-----+----+----+---------+-----------+
- * | Feature | Location | Reconc. | Initial | Section |
- * | | RX | TX | SP | NN | Value | Reference |
- * +--------------------------+----+-----+----+----+---------+-----------+
- * | DCCPF_CCID | | X | X | | 2 | 10 |
- * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 |
- * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 |
- * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 |
- * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 |
- * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 |
- * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 |
- * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 |
- * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 |
- * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 |
- * +--------------------------+----+-----+----+----+---------+-----------+
- */
-} dccp_feat_table[] = {
- { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid },
- { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL },
- { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win },
- { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL },
- { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio},
- { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec },
- { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp },
- { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov},
- { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL },
- { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL },
-};
-#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table)
-
-/**
- * dccp_feat_index - Hash function to map feature number into array position
- * Returns consecutive array index or -1 if the feature is not understood.
- */
-static int dccp_feat_index(u8 feat_num)
-{
- /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */
- if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM)
- return feat_num - 1;
-
- /*
- * Other features: add cases for new feature types here after adding
- * them to the above table.
- */
- switch (feat_num) {
- case DCCPF_SEND_LEV_RATE:
- return DCCP_FEAT_SUPPORTED_MAX - 1;
- }
- return -1;
-}
-
-static u8 dccp_feat_type(u8 feat_num)
-{
- int idx = dccp_feat_index(feat_num);
-
- if (idx < 0)
- return FEAT_UNKNOWN;
- return dccp_feat_table[idx].reconciliation;
-}
-
-static int dccp_feat_default_value(u8 feat_num)
-{
- int idx = dccp_feat_index(feat_num);
- /*
- * There are no default values for unknown features, so encountering a
- * negative index here indicates a serious problem somewhere else.
- */
- DCCP_BUG_ON(idx < 0);
-
- return idx < 0 ? 0 : dccp_feat_table[idx].default_value;
-}
-
-/*
- * Debugging and verbose-printing section
- */
-static const char *dccp_feat_fname(const u8 feat)
-{
- static const char *const feature_names[] = {
- [DCCPF_RESERVED] = "Reserved",
- [DCCPF_CCID] = "CCID",
- [DCCPF_SHORT_SEQNOS] = "Allow Short Seqnos",
- [DCCPF_SEQUENCE_WINDOW] = "Sequence Window",
- [DCCPF_ECN_INCAPABLE] = "ECN Incapable",
- [DCCPF_ACK_RATIO] = "Ack Ratio",
- [DCCPF_SEND_ACK_VECTOR] = "Send ACK Vector",
- [DCCPF_SEND_NDP_COUNT] = "Send NDP Count",
- [DCCPF_MIN_CSUM_COVER] = "Min. Csum Coverage",
- [DCCPF_DATA_CHECKSUM] = "Send Data Checksum",
- };
- if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC)
- return feature_names[DCCPF_RESERVED];
-
- if (feat == DCCPF_SEND_LEV_RATE)
- return "Send Loss Event Rate";
- if (feat >= DCCPF_MIN_CCID_SPECIFIC)
- return "CCID-specific";
-
- return feature_names[feat];
-}
-
-static const char *const dccp_feat_sname[] = {
- "DEFAULT", "INITIALISING", "CHANGING", "UNSTABLE", "STABLE",
-};
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-static const char *dccp_feat_oname(const u8 opt)
-{
- switch (opt) {
- case DCCPO_CHANGE_L: return "Change_L";
- case DCCPO_CONFIRM_L: return "Confirm_L";
- case DCCPO_CHANGE_R: return "Change_R";
- case DCCPO_CONFIRM_R: return "Confirm_R";
- }
- return NULL;
-}
-
-static void dccp_feat_printval(u8 feat_num, dccp_feat_val const *val)
-{
- u8 i, type = dccp_feat_type(feat_num);
-
- if (val == NULL || (type == FEAT_SP && val->sp.vec == NULL))
- dccp_pr_debug_cat("(NULL)");
- else if (type == FEAT_SP)
- for (i = 0; i < val->sp.len; i++)
- dccp_pr_debug_cat("%s%u", i ? " " : "", val->sp.vec[i]);
- else if (type == FEAT_NN)
- dccp_pr_debug_cat("%llu", (unsigned long long)val->nn);
- else
- dccp_pr_debug_cat("unknown type %u", type);
-}
-
-static void dccp_feat_printvals(u8 feat_num, u8 *list, u8 len)
-{
- u8 type = dccp_feat_type(feat_num);
- dccp_feat_val fval = { .sp.vec = list, .sp.len = len };
-
- if (type == FEAT_NN)
- fval.nn = dccp_decode_value_var(list, len);
- dccp_feat_printval(feat_num, &fval);
-}
-
-static void dccp_feat_print_entry(struct dccp_feat_entry const *entry)
-{
- dccp_debug(" * %s %s = ", entry->is_local ? "local" : "remote",
- dccp_feat_fname(entry->feat_num));
- dccp_feat_printval(entry->feat_num, &entry->val);
- dccp_pr_debug_cat(", state=%s %s\n", dccp_feat_sname[entry->state],
- entry->needs_confirm ? "(Confirm pending)" : "");
-}
-
-#define dccp_feat_print_opt(opt, feat, val, len, mandatory) do { \
- dccp_pr_debug("%s(%s, ", dccp_feat_oname(opt), dccp_feat_fname(feat));\
- dccp_feat_printvals(feat, val, len); \
- dccp_pr_debug_cat(") %s\n", mandatory ? "!" : ""); } while (0)
-
-#define dccp_feat_print_fnlist(fn_list) { \
- const struct dccp_feat_entry *___entry; \
- \
- dccp_pr_debug("List Dump:\n"); \
- list_for_each_entry(___entry, fn_list, node) \
- dccp_feat_print_entry(___entry); \
-}
-#else /* ! CONFIG_IP_DCCP_DEBUG */
-#define dccp_feat_print_opt(opt, feat, val, len, mandatory)
-#define dccp_feat_print_fnlist(fn_list)
-#endif
-
-static int __dccp_feat_activate(struct sock *sk, const int idx,
- const bool is_local, dccp_feat_val const *fval)
-{
- bool rx;
- u64 val;
-
- if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX)
- return -1;
- if (dccp_feat_table[idx].activation_hdlr == NULL)
- return 0;
-
- if (fval == NULL) {
- val = dccp_feat_table[idx].default_value;
- } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) {
- if (fval->sp.vec == NULL) {
- /*
- * This can happen when an empty Confirm is sent
- * for an SP (i.e. known) feature. In this case
- * we would be using the default anyway.
- */
- DCCP_CRIT("Feature #%d undefined: using default", idx);
- val = dccp_feat_table[idx].default_value;
- } else {
- val = fval->sp.vec[0];
- }
- } else {
- val = fval->nn;
- }
-
- /* Location is RX if this is a local-RX or remote-TX feature */
- rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX));
-
- dccp_debug(" -> activating %s %s, %sval=%llu\n", rx ? "RX" : "TX",
- dccp_feat_fname(dccp_feat_table[idx].feat_num),
- fval ? "" : "default ", (unsigned long long)val);
-
- return dccp_feat_table[idx].activation_hdlr(sk, val, rx);
-}
-
-/**
- * dccp_feat_activate - Activate feature value on socket
- * @sk: fully connected DCCP socket (after handshake is complete)
- * @feat_num: feature to activate, one of %dccp_feature_numbers
- * @local: whether local (1) or remote (0) @feat_num is meant
- * @fval: the value (SP or NN) to activate, or NULL to use the default value
- *
- * For general use this function is preferable over __dccp_feat_activate().
- */
-static int dccp_feat_activate(struct sock *sk, u8 feat_num, bool local,
- dccp_feat_val const *fval)
-{
- return __dccp_feat_activate(sk, dccp_feat_index(feat_num), local, fval);
-}
-
-/* Test for "Req'd" feature (RFC 4340, 6.4) */
-static inline int dccp_feat_must_be_understood(u8 feat_num)
-{
- return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS ||
- feat_num == DCCPF_SEQUENCE_WINDOW;
-}
-
-/* copy constructor, fval must not already contain allocated memory */
-static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len)
-{
- fval->sp.len = len;
- if (fval->sp.len > 0) {
- fval->sp.vec = kmemdup(val, len, gfp_any());
- if (fval->sp.vec == NULL) {
- fval->sp.len = 0;
- return -ENOBUFS;
- }
- }
- return 0;
-}
-
-static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val)
-{
- if (unlikely(val == NULL))
- return;
- if (dccp_feat_type(feat_num) == FEAT_SP)
- kfree(val->sp.vec);
- memset(val, 0, sizeof(*val));
-}
-
-static struct dccp_feat_entry *
- dccp_feat_clone_entry(struct dccp_feat_entry const *original)
-{
- struct dccp_feat_entry *new;
- u8 type = dccp_feat_type(original->feat_num);
-
- if (type == FEAT_UNKNOWN)
- return NULL;
-
- new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any());
- if (new == NULL)
- return NULL;
-
- if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val,
- original->val.sp.vec,
- original->val.sp.len)) {
- kfree(new);
- return NULL;
- }
- return new;
-}
-
-static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry)
-{
- if (entry != NULL) {
- dccp_feat_val_destructor(entry->feat_num, &entry->val);
- kfree(entry);
- }
-}
-
-/*
- * List management functions
- *
- * Feature negotiation lists rely on and maintain the following invariants:
- * - each feat_num in the list is known, i.e. we know its type and default value
- * - each feat_num/is_local combination is unique (old entries are overwritten)
- * - SP values are always freshly allocated
- * - list is sorted in increasing order of feature number (faster lookup)
- */
-static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list,
- u8 feat_num, bool is_local)
-{
- struct dccp_feat_entry *entry;
-
- list_for_each_entry(entry, fn_list, node) {
- if (entry->feat_num == feat_num && entry->is_local == is_local)
- return entry;
- else if (entry->feat_num > feat_num)
- break;
- }
- return NULL;
-}
-
-/**
- * dccp_feat_entry_new - Central list update routine (called by all others)
- * @head: list to add to
- * @feat: feature number
- * @local: whether the local (1) or remote feature with number @feat is meant
- *
- * This is the only constructor and serves to ensure the above invariants.
- */
-static struct dccp_feat_entry *
- dccp_feat_entry_new(struct list_head *head, u8 feat, bool local)
-{
- struct dccp_feat_entry *entry;
-
- list_for_each_entry(entry, head, node)
- if (entry->feat_num == feat && entry->is_local == local) {
- dccp_feat_val_destructor(entry->feat_num, &entry->val);
- return entry;
- } else if (entry->feat_num > feat) {
- head = &entry->node;
- break;
- }
-
- entry = kmalloc(sizeof(*entry), gfp_any());
- if (entry != NULL) {
- entry->feat_num = feat;
- entry->is_local = local;
- list_add_tail(&entry->node, head);
- }
- return entry;
-}
-
-/**
- * dccp_feat_push_change - Add/overwrite a Change option in the list
- * @fn_list: feature-negotiation list to update
- * @feat: one of %dccp_feature_numbers
- * @local: whether local (1) or remote (0) @feat_num is meant
- * @mandatory: whether to use Mandatory feature negotiation options
- * @fval: pointer to NN/SP value to be inserted (will be copied)
- */
-static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local,
- u8 mandatory, dccp_feat_val *fval)
-{
- struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
-
- if (new == NULL)
- return -ENOMEM;
-
- new->feat_num = feat;
- new->is_local = local;
- new->state = FEAT_INITIALISING;
- new->needs_confirm = false;
- new->empty_confirm = false;
- new->val = *fval;
- new->needs_mandatory = mandatory;
-
- return 0;
-}
-
-/**
- * dccp_feat_push_confirm - Add a Confirm entry to the FN list
- * @fn_list: feature-negotiation list to add to
- * @feat: one of %dccp_feature_numbers
- * @local: whether local (1) or remote (0) @feat_num is being confirmed
- * @fval: pointer to NN/SP value to be inserted or NULL
- *
- * Returns 0 on success, a Reset code for further processing otherwise.
- */
-static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local,
- dccp_feat_val *fval)
-{
- struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local);
-
- if (new == NULL)
- return DCCP_RESET_CODE_TOO_BUSY;
-
- new->feat_num = feat;
- new->is_local = local;
- new->state = FEAT_STABLE; /* transition in 6.6.2 */
- new->needs_confirm = true;
- new->empty_confirm = (fval == NULL);
- new->val.nn = 0; /* zeroes the whole structure */
- if (!new->empty_confirm)
- new->val = *fval;
- new->needs_mandatory = false;
-
- return 0;
-}
-
-static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local)
-{
- return dccp_feat_push_confirm(fn_list, feat, local, NULL);
-}
-
-static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry)
-{
- list_del(&entry->node);
- dccp_feat_entry_destructor(entry);
-}
-
-void dccp_feat_list_purge(struct list_head *fn_list)
-{
- struct dccp_feat_entry *entry, *next;
-
- list_for_each_entry_safe(entry, next, fn_list, node)
- dccp_feat_entry_destructor(entry);
- INIT_LIST_HEAD(fn_list);
-}
-EXPORT_SYMBOL_GPL(dccp_feat_list_purge);
-
-/* generate @to as full clone of @from - @to must not contain any nodes */
-int dccp_feat_clone_list(struct list_head const *from, struct list_head *to)
-{
- struct dccp_feat_entry *entry, *new;
-
- INIT_LIST_HEAD(to);
- list_for_each_entry(entry, from, node) {
- new = dccp_feat_clone_entry(entry);
- if (new == NULL)
- goto cloning_failed;
- list_add_tail(&new->node, to);
- }
- return 0;
-
-cloning_failed:
- dccp_feat_list_purge(to);
- return -ENOMEM;
-}
-
-/**
- * dccp_feat_valid_nn_length - Enforce length constraints on NN options
- * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only,
- * incoming options are accepted as long as their values are valid.
- */
-static u8 dccp_feat_valid_nn_length(u8 feat_num)
-{
- if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */
- return 2;
- if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */
- return 6;
- return 0;
-}
-
-static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val)
-{
- switch (feat_num) {
- case DCCPF_ACK_RATIO:
- return val <= DCCPF_ACK_RATIO_MAX;
- case DCCPF_SEQUENCE_WINDOW:
- return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX;
- }
- return 0; /* feature unknown - so we can't tell */
-}
-
-/* check that SP values are within the ranges defined in RFC 4340 */
-static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val)
-{
- switch (feat_num) {
- case DCCPF_CCID:
- return val == DCCPC_CCID2 || val == DCCPC_CCID3;
- /* Type-check Boolean feature values: */
- case DCCPF_SHORT_SEQNOS:
- case DCCPF_ECN_INCAPABLE:
- case DCCPF_SEND_ACK_VECTOR:
- case DCCPF_SEND_NDP_COUNT:
- case DCCPF_DATA_CHECKSUM:
- case DCCPF_SEND_LEV_RATE:
- return val < 2;
- case DCCPF_MIN_CSUM_COVER:
- return val < 16;
- }
- return 0; /* feature unknown */
-}
-
-static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len)
-{
- if (sp_list == NULL || sp_len < 1)
- return 0;
- while (sp_len--)
- if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++))
- return 0;
- return 1;
-}
-
-/**
- * dccp_feat_insert_opts - Generate FN options from current list state
- * @skb: next sk_buff to be sent to the peer
- * @dp: for client during handshake and general negotiation
- * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND)
- */
-int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq,
- struct sk_buff *skb)
-{
- struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
- struct dccp_feat_entry *pos, *next;
- u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN];
- bool rpt;
-
- /* put entries into @skb in the order they appear in the list */
- list_for_each_entry_safe_reverse(pos, next, fn, node) {
- opt = dccp_feat_genopt(pos);
- type = dccp_feat_type(pos->feat_num);
- rpt = false;
-
- if (pos->empty_confirm) {
- len = 0;
- ptr = NULL;
- } else {
- if (type == FEAT_SP) {
- len = pos->val.sp.len;
- ptr = pos->val.sp.vec;
- rpt = pos->needs_confirm;
- } else if (type == FEAT_NN) {
- len = dccp_feat_valid_nn_length(pos->feat_num);
- ptr = nn_in_nbo;
- dccp_encode_value_var(pos->val.nn, ptr, len);
- } else {
- DCCP_BUG("unknown feature %u", pos->feat_num);
- return -1;
- }
- }
- dccp_feat_print_opt(opt, pos->feat_num, ptr, len, 0);
-
- if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt))
- return -1;
- if (pos->needs_mandatory && dccp_insert_option_mandatory(skb))
- return -1;
-
- if (skb->sk->sk_state == DCCP_OPEN &&
- (opt == DCCPO_CONFIRM_R || opt == DCCPO_CONFIRM_L)) {
- /*
- * Confirms don't get retransmitted (6.6.3) once the
- * connection is in state OPEN
- */
- dccp_feat_list_pop(pos);
- } else {
- /*
- * Enter CHANGING after transmitting the Change
- * option (6.6.2).
- */
- if (pos->state == FEAT_INITIALISING)
- pos->state = FEAT_CHANGING;
- }
- }
- return 0;
-}
-
-/**
- * __feat_register_nn - Register new NN value on socket
- * @fn: feature-negotiation list to register with
- * @feat: an NN feature from %dccp_feature_numbers
- * @mandatory: use Mandatory option if 1
- * @nn_val: value to register (restricted to 4 bytes)
- *
- * Note that NN features are local by definition (RFC 4340, 6.3.2).
- */
-static int __feat_register_nn(struct list_head *fn, u8 feat,
- u8 mandatory, u64 nn_val)
-{
- dccp_feat_val fval = { .nn = nn_val };
-
- if (dccp_feat_type(feat) != FEAT_NN ||
- !dccp_feat_is_valid_nn_val(feat, nn_val))
- return -EINVAL;
-
- /* Don't bother with default values, they will be activated anyway. */
- if (nn_val - (u64)dccp_feat_default_value(feat) == 0)
- return 0;
-
- return dccp_feat_push_change(fn, feat, 1, mandatory, &fval);
-}
-
-/**
- * __feat_register_sp - Register new SP value/list on socket
- * @fn: feature-negotiation list to register with
- * @feat: an SP feature from %dccp_feature_numbers
- * @is_local: whether the local (1) or the remote (0) @feat is meant
- * @mandatory: use Mandatory option if 1
- * @sp_val: SP value followed by optional preference list
- * @sp_len: length of @sp_val in bytes
- */
-static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local,
- u8 mandatory, u8 const *sp_val, u8 sp_len)
-{
- dccp_feat_val fval;
-
- if (dccp_feat_type(feat) != FEAT_SP ||
- !dccp_feat_sp_list_ok(feat, sp_val, sp_len))
- return -EINVAL;
-
- /* Avoid negotiating alien CCIDs by only advertising supported ones */
- if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len))
- return -EOPNOTSUPP;
-
- if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len))
- return -ENOMEM;
-
- return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval);
-}
-
-/**
- * dccp_feat_register_sp - Register requests to change SP feature values
- * @sk: client or listening socket
- * @feat: one of %dccp_feature_numbers
- * @is_local: whether the local (1) or remote (0) @feat is meant
- * @list: array of preferred values, in descending order of preference
- * @len: length of @list in bytes
- */
-int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
- u8 const *list, u8 len)
-{ /* any changes must be registered before establishing the connection */
- if (sk->sk_state != DCCP_CLOSED)
- return -EISCONN;
- if (dccp_feat_type(feat) != FEAT_SP)
- return -EINVAL;
- return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local,
- 0, list, len);
-}
-
-/**
- * dccp_feat_nn_get - Query current/pending value of NN feature
- * @sk: DCCP socket of an established connection
- * @feat: NN feature number from %dccp_feature_numbers
- *
- * For a known NN feature, returns value currently being negotiated, or
- * current (confirmed) value if no negotiation is going on.
- */
-u64 dccp_feat_nn_get(struct sock *sk, u8 feat)
-{
- if (dccp_feat_type(feat) == FEAT_NN) {
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_feat_entry *entry;
-
- entry = dccp_feat_list_lookup(&dp->dccps_featneg, feat, 1);
- if (entry != NULL)
- return entry->val.nn;
-
- switch (feat) {
- case DCCPF_ACK_RATIO:
- return dp->dccps_l_ack_ratio;
- case DCCPF_SEQUENCE_WINDOW:
- return dp->dccps_l_seq_win;
- }
- }
- DCCP_BUG("attempt to look up unsupported feature %u", feat);
- return 0;
-}
-EXPORT_SYMBOL_GPL(dccp_feat_nn_get);
-
-/**
- * dccp_feat_signal_nn_change - Update NN values for an established connection
- * @sk: DCCP socket of an established connection
- * @feat: NN feature number from %dccp_feature_numbers
- * @nn_val: the new value to use
- *
- * This function is used to communicate NN updates out-of-band.
- */
-int dccp_feat_signal_nn_change(struct sock *sk, u8 feat, u64 nn_val)
-{
- struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
- dccp_feat_val fval = { .nn = nn_val };
- struct dccp_feat_entry *entry;
-
- if (sk->sk_state != DCCP_OPEN && sk->sk_state != DCCP_PARTOPEN)
- return 0;
-
- if (dccp_feat_type(feat) != FEAT_NN ||
- !dccp_feat_is_valid_nn_val(feat, nn_val))
- return -EINVAL;
-
- if (nn_val == dccp_feat_nn_get(sk, feat))
- return 0; /* already set or negotiation under way */
-
- entry = dccp_feat_list_lookup(fn, feat, 1);
- if (entry != NULL) {
- dccp_pr_debug("Clobbering existing NN entry %llu -> %llu\n",
- (unsigned long long)entry->val.nn,
- (unsigned long long)nn_val);
- dccp_feat_list_pop(entry);
- }
-
- inet_csk_schedule_ack(sk);
- return dccp_feat_push_change(fn, feat, 1, 0, &fval);
-}
-EXPORT_SYMBOL_GPL(dccp_feat_signal_nn_change);
-
-/*
- * Tracking features whose value depend on the choice of CCID
- *
- * This is designed with an extension in mind so that a list walk could be done
- * before activating any features. However, the existing framework was found to
- * work satisfactorily up until now, the automatic verification is left open.
- * When adding new CCIDs, add a corresponding dependency table here.
- */
-static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local)
-{
- static const struct ccid_dependency ccid2_dependencies[2][2] = {
- /*
- * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX
- * feature and Send Ack Vector is an RX feature, `is_local'
- * needs to be reversed.
- */
- { /* Dependencies of the receiver-side (remote) CCID2 */
- {
- .dependent_feat = DCCPF_SEND_ACK_VECTOR,
- .is_local = true,
- .is_mandatory = true,
- .val = 1
- },
- { 0, 0, 0, 0 }
- },
- { /* Dependencies of the sender-side (local) CCID2 */
- {
- .dependent_feat = DCCPF_SEND_ACK_VECTOR,
- .is_local = false,
- .is_mandatory = true,
- .val = 1
- },
- { 0, 0, 0, 0 }
- }
- };
- static const struct ccid_dependency ccid3_dependencies[2][5] = {
- { /*
- * Dependencies of the receiver-side CCID3
- */
- { /* locally disable Ack Vectors */
- .dependent_feat = DCCPF_SEND_ACK_VECTOR,
- .is_local = true,
- .is_mandatory = false,
- .val = 0
- },
- { /* see below why Send Loss Event Rate is on */
- .dependent_feat = DCCPF_SEND_LEV_RATE,
- .is_local = true,
- .is_mandatory = true,
- .val = 1
- },
- { /* NDP Count is needed as per RFC 4342, 6.1.1 */
- .dependent_feat = DCCPF_SEND_NDP_COUNT,
- .is_local = false,
- .is_mandatory = true,
- .val = 1
- },
- { 0, 0, 0, 0 },
- },
- { /*
- * CCID3 at the TX side: we request that the HC-receiver
- * will not send Ack Vectors (they will be ignored, so
- * Mandatory is not set); we enable Send Loss Event Rate
- * (Mandatory since the implementation does not support
- * the Loss Intervals option of RFC 4342, 8.6).
- * The last two options are for peer's information only.
- */
- {
- .dependent_feat = DCCPF_SEND_ACK_VECTOR,
- .is_local = false,
- .is_mandatory = false,
- .val = 0
- },
- {
- .dependent_feat = DCCPF_SEND_LEV_RATE,
- .is_local = false,
- .is_mandatory = true,
- .val = 1
- },
- { /* this CCID does not support Ack Ratio */
- .dependent_feat = DCCPF_ACK_RATIO,
- .is_local = true,
- .is_mandatory = false,
- .val = 0
- },
- { /* tell receiver we are sending NDP counts */
- .dependent_feat = DCCPF_SEND_NDP_COUNT,
- .is_local = true,
- .is_mandatory = false,
- .val = 1
- },
- { 0, 0, 0, 0 }
- }
- };
- switch (ccid) {
- case DCCPC_CCID2:
- return ccid2_dependencies[is_local];
- case DCCPC_CCID3:
- return ccid3_dependencies[is_local];
- default:
- return NULL;
- }
-}
-
-/**
- * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID
- * @fn: feature-negotiation list to update
- * @id: CCID number to track
- * @is_local: whether TX CCID (1) or RX CCID (0) is meant
- *
- * This function needs to be called after registering all other features.
- */
-static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local)
-{
- const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local);
- int i, rc = (table == NULL);
-
- for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++)
- if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP)
- rc = __feat_register_sp(fn, table[i].dependent_feat,
- table[i].is_local,
- table[i].is_mandatory,
- &table[i].val, 1);
- else
- rc = __feat_register_nn(fn, table[i].dependent_feat,
- table[i].is_mandatory,
- table[i].val);
- return rc;
-}
-
-/**
- * dccp_feat_finalise_settings - Finalise settings before starting negotiation
- * @dp: client or listening socket (settings will be inherited)
- *
- * This is called after all registrations (socket initialisation, sysctls, and
- * sockopt calls), and before sending the first packet containing Change options
- * (ie. client-Request or server-Response), to ensure internal consistency.
- */
-int dccp_feat_finalise_settings(struct dccp_sock *dp)
-{
- struct list_head *fn = &dp->dccps_featneg;
- struct dccp_feat_entry *entry;
- int i = 2, ccids[2] = { -1, -1 };
-
- /*
- * Propagating CCIDs:
- * 1) not useful to propagate CCID settings if this host advertises more
- * than one CCID: the choice of CCID may still change - if this is
- * the client, or if this is the server and the client sends
- * singleton CCID values.
- * 2) since is that propagate_ccid changes the list, we defer changing
- * the sorted list until after the traversal.
- */
- list_for_each_entry(entry, fn, node)
- if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1)
- ccids[entry->is_local] = entry->val.sp.vec[0];
- while (i--)
- if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i))
- return -1;
- dccp_feat_print_fnlist(fn);
- return 0;
-}
-
-/**
- * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features
- * It is the server which resolves the dependencies once the CCID has been
- * fully negotiated. If no CCID has been negotiated, it uses the default CCID.
- */
-int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq)
-{
- struct list_head *fn = &dreq->dreq_featneg;
- struct dccp_feat_entry *entry;
- u8 is_local, ccid;
-
- for (is_local = 0; is_local <= 1; is_local++) {
- entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local);
-
- if (entry != NULL && !entry->empty_confirm)
- ccid = entry->val.sp.vec[0];
- else
- ccid = dccp_feat_default_value(DCCPF_CCID);
-
- if (dccp_feat_propagate_ccid(fn, ccid, is_local))
- return -1;
- }
- return 0;
-}
-
-/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */
-static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen)
-{
- u8 c, s;
-
- for (s = 0; s < slen; s++)
- for (c = 0; c < clen; c++)
- if (servlist[s] == clilist[c])
- return servlist[s];
- return -1;
-}
-
-/**
- * dccp_feat_prefer - Move preferred entry to the start of array
- * Reorder the @array_len elements in @array so that @preferred_value comes
- * first. Returns >0 to indicate that @preferred_value does occur in @array.
- */
-static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len)
-{
- u8 i, does_occur = 0;
-
- if (array != NULL) {
- for (i = 0; i < array_len; i++)
- if (array[i] == preferred_value) {
- array[i] = array[0];
- does_occur++;
- }
- if (does_occur)
- array[0] = preferred_value;
- }
- return does_occur;
-}
-
-/**
- * dccp_feat_reconcile - Reconcile SP preference lists
- * @fv: SP list to reconcile into
- * @arr: received SP preference list
- * @len: length of @arr in bytes
- * @is_server: whether this side is the server (and @fv is the server's list)
- * @reorder: whether to reorder the list in @fv after reconciling with @arr
- * When successful, > 0 is returned and the reconciled list is in @fval.
- * A value of 0 means that negotiation failed (no shared entry).
- */
-static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len,
- bool is_server, bool reorder)
-{
- int rc;
-
- if (!fv->sp.vec || !arr) {
- DCCP_CRIT("NULL feature value or array");
- return 0;
- }
-
- if (is_server)
- rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len);
- else
- rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len);
-
- if (!reorder)
- return rc;
- if (rc < 0)
- return 0;
-
- /*
- * Reorder list: used for activating features and in dccp_insert_fn_opt.
- */
- return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len);
-}
-
-/**
- * dccp_feat_change_recv - Process incoming ChangeL/R options
- * @fn: feature-negotiation list to update
- * @is_mandatory: whether the Change was preceded by a Mandatory option
- * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R
- * @feat: one of %dccp_feature_numbers
- * @val: NN value or SP value/preference list
- * @len: length of @val in bytes
- * @server: whether this node is the server (1) or the client (0)
- */
-static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
- u8 feat, u8 *val, u8 len, const bool server)
-{
- u8 defval, type = dccp_feat_type(feat);
- const bool local = (opt == DCCPO_CHANGE_R);
- struct dccp_feat_entry *entry;
- dccp_feat_val fval;
-
- if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */
- goto unknown_feature_or_value;
-
- dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
-
- /*
- * Negotiation of NN features: Change R is invalid, so there is no
- * simultaneous negotiation; hence we do not look up in the list.
- */
- if (type == FEAT_NN) {
- if (local || len > sizeof(fval.nn))
- goto unknown_feature_or_value;
-
- /* 6.3.2: "The feature remote MUST accept any valid value..." */
- fval.nn = dccp_decode_value_var(val, len);
- if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
- goto unknown_feature_or_value;
-
- return dccp_feat_push_confirm(fn, feat, local, &fval);
- }
-
- /*
- * Unidirectional/simultaneous negotiation of SP features (6.3.1)
- */
- entry = dccp_feat_list_lookup(fn, feat, local);
- if (entry == NULL) {
- /*
- * No particular preferences have been registered. We deal with
- * this situation by assuming that all valid values are equally
- * acceptable, and apply the following checks:
- * - if the peer's list is a singleton, we accept a valid value;
- * - if we are the server, we first try to see if the peer (the
- * client) advertises the default value. If yes, we use it,
- * otherwise we accept the preferred value;
- * - else if we are the client, we use the first list element.
- */
- if (dccp_feat_clone_sp_val(&fval, val, 1))
- return DCCP_RESET_CODE_TOO_BUSY;
-
- if (len > 1 && server) {
- defval = dccp_feat_default_value(feat);
- if (dccp_feat_preflist_match(&defval, 1, val, len) > -1)
- fval.sp.vec[0] = defval;
- } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) {
- kfree(fval.sp.vec);
- goto unknown_feature_or_value;
- }
-
- /* Treat unsupported CCIDs like invalid values */
- if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) {
- kfree(fval.sp.vec);
- goto not_valid_or_not_known;
- }
-
- return dccp_feat_push_confirm(fn, feat, local, &fval);
-
- } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */
- return 0;
- }
-
- if (dccp_feat_reconcile(&entry->val, val, len, server, true)) {
- entry->empty_confirm = false;
- } else if (is_mandatory) {
- return DCCP_RESET_CODE_MANDATORY_ERROR;
- } else if (entry->state == FEAT_INITIALISING) {
- /*
- * Failed simultaneous negotiation (server only): try to `save'
- * the connection by checking whether entry contains the default
- * value for @feat. If yes, send an empty Confirm to signal that
- * the received Change was not understood - which implies using
- * the default value.
- * If this also fails, we use Reset as the last resort.
- */
- WARN_ON(!server);
- defval = dccp_feat_default_value(feat);
- if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true))
- return DCCP_RESET_CODE_OPTION_ERROR;
- entry->empty_confirm = true;
- }
- entry->needs_confirm = true;
- entry->needs_mandatory = false;
- entry->state = FEAT_STABLE;
- return 0;
-
-unknown_feature_or_value:
- if (!is_mandatory)
- return dccp_push_empty_confirm(fn, feat, local);
-
-not_valid_or_not_known:
- return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
- : DCCP_RESET_CODE_OPTION_ERROR;
-}
-
-/**
- * dccp_feat_confirm_recv - Process received Confirm options
- * @fn: feature-negotiation list to update
- * @is_mandatory: whether @opt was preceded by a Mandatory option
- * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R
- * @feat: one of %dccp_feature_numbers
- * @val: NN value or SP value/preference list
- * @len: length of @val in bytes
- * @server: whether this node is server (1) or client (0)
- */
-static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt,
- u8 feat, u8 *val, u8 len, const bool server)
-{
- u8 *plist, plen, type = dccp_feat_type(feat);
- const bool local = (opt == DCCPO_CONFIRM_R);
- struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local);
-
- dccp_feat_print_opt(opt, feat, val, len, is_mandatory);
-
- if (entry == NULL) { /* nothing queued: ignore or handle error */
- if (is_mandatory && type == FEAT_UNKNOWN)
- return DCCP_RESET_CODE_MANDATORY_ERROR;
-
- if (!local && type == FEAT_NN) /* 6.3.2 */
- goto confirmation_failed;
- return 0;
- }
-
- if (entry->state != FEAT_CHANGING) /* 6.6.2 */
- return 0;
-
- if (len == 0) {
- if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */
- goto confirmation_failed;
- /*
- * Empty Confirm during connection setup: this means reverting
- * to the `old' value, which in this case is the default. Since
- * we handle default values automatically when no other values
- * have been set, we revert to the old value by removing this
- * entry from the list.
- */
- dccp_feat_list_pop(entry);
- return 0;
- }
-
- if (type == FEAT_NN) {
- if (len > sizeof(entry->val.nn))
- goto confirmation_failed;
-
- if (entry->val.nn == dccp_decode_value_var(val, len))
- goto confirmation_succeeded;
-
- DCCP_WARN("Bogus Confirm for non-existing value\n");
- goto confirmation_failed;
- }
-
- /*
- * Parsing SP Confirms: the first element of @val is the preferred
- * SP value which the peer confirms, the remainder depends on @len.
- * Note that only the confirmed value need to be a valid SP value.
- */
- if (!dccp_feat_is_valid_sp_val(feat, *val))
- goto confirmation_failed;
-
- if (len == 1) { /* peer didn't supply a preference list */
- plist = val;
- plen = len;
- } else { /* preferred value + preference list */
- plist = val + 1;
- plen = len - 1;
- }
-
- /* Check whether the peer got the reconciliation right (6.6.8) */
- if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) {
- DCCP_WARN("Confirm selected the wrong value %u\n", *val);
- return DCCP_RESET_CODE_OPTION_ERROR;
- }
- entry->val.sp.vec[0] = *val;
-
-confirmation_succeeded:
- entry->state = FEAT_STABLE;
- return 0;
-
-confirmation_failed:
- DCCP_WARN("Confirmation failed\n");
- return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
- : DCCP_RESET_CODE_OPTION_ERROR;
-}
-
-/**
- * dccp_feat_handle_nn_established - Fast-path reception of NN options
- * @sk: socket of an established DCCP connection
- * @mandatory: whether @opt was preceded by a Mandatory option
- * @opt: %DCCPO_CHANGE_L | %DCCPO_CONFIRM_R (NN only)
- * @feat: NN number, one of %dccp_feature_numbers
- * @val: NN value
- * @len: length of @val in bytes
- *
- * This function combines the functionality of change_recv/confirm_recv, with
- * the following differences (reset codes are the same):
- * - cleanup after receiving the Confirm;
- * - values are directly activated after successful parsing;
- * - deliberately restricted to NN features.
- * The restriction to NN features is essential since SP features can have non-
- * predictable outcomes (depending on the remote configuration), and are inter-
- * dependent (CCIDs for instance cause further dependencies).
- */
-static u8 dccp_feat_handle_nn_established(struct sock *sk, u8 mandatory, u8 opt,
- u8 feat, u8 *val, u8 len)
-{
- struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
- const bool local = (opt == DCCPO_CONFIRM_R);
- struct dccp_feat_entry *entry;
- u8 type = dccp_feat_type(feat);
- dccp_feat_val fval;
-
- dccp_feat_print_opt(opt, feat, val, len, mandatory);
-
- /* Ignore non-mandatory unknown and non-NN features */
- if (type == FEAT_UNKNOWN) {
- if (local && !mandatory)
- return 0;
- goto fast_path_unknown;
- } else if (type != FEAT_NN) {
- return 0;
- }
-
- /*
- * We don't accept empty Confirms, since in fast-path feature
- * negotiation the values are enabled immediately after sending
- * the Change option.
- * Empty Changes on the other hand are invalid (RFC 4340, 6.1).
- */
- if (len == 0 || len > sizeof(fval.nn))
- goto fast_path_unknown;
-
- if (opt == DCCPO_CHANGE_L) {
- fval.nn = dccp_decode_value_var(val, len);
- if (!dccp_feat_is_valid_nn_val(feat, fval.nn))
- goto fast_path_unknown;
-
- if (dccp_feat_push_confirm(fn, feat, local, &fval) ||
- dccp_feat_activate(sk, feat, local, &fval))
- return DCCP_RESET_CODE_TOO_BUSY;
-
- /* set the `Ack Pending' flag to piggyback a Confirm */
- inet_csk_schedule_ack(sk);
-
- } else if (opt == DCCPO_CONFIRM_R) {
- entry = dccp_feat_list_lookup(fn, feat, local);
- if (entry == NULL || entry->state != FEAT_CHANGING)
- return 0;
-
- fval.nn = dccp_decode_value_var(val, len);
- /*
- * Just ignore a value that doesn't match our current value.
- * If the option changes twice within two RTTs, then at least
- * one CONFIRM will be received for the old value after a
- * new CHANGE was sent.
- */
- if (fval.nn != entry->val.nn)
- return 0;
-
- /* Only activate after receiving the Confirm option (6.6.1). */
- dccp_feat_activate(sk, feat, local, &fval);
-
- /* It has been confirmed - so remove the entry */
- dccp_feat_list_pop(entry);
-
- } else {
- DCCP_WARN("Received illegal option %u\n", opt);
- goto fast_path_failed;
- }
- return 0;
-
-fast_path_unknown:
- if (!mandatory)
- return dccp_push_empty_confirm(fn, feat, local);
-
-fast_path_failed:
- return mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR
- : DCCP_RESET_CODE_OPTION_ERROR;
-}
-
-/**
- * dccp_feat_parse_options - Process Feature-Negotiation Options
- * @sk: for general use and used by the client during connection setup
- * @dreq: used by the server during connection setup
- * @mandatory: whether @opt was preceded by a Mandatory option
- * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R
- * @feat: one of %dccp_feature_numbers
- * @val: value contents of @opt
- * @len: length of @val in bytes
- *
- * Returns 0 on success, a Reset code for ending the connection otherwise.
- */
-int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
- u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg;
- bool server = false;
-
- switch (sk->sk_state) {
- /*
- * Negotiation during connection setup
- */
- case DCCP_LISTEN:
- server = true; /* fall through */
- case DCCP_REQUESTING:
- switch (opt) {
- case DCCPO_CHANGE_L:
- case DCCPO_CHANGE_R:
- return dccp_feat_change_recv(fn, mandatory, opt, feat,
- val, len, server);
- case DCCPO_CONFIRM_R:
- case DCCPO_CONFIRM_L:
- return dccp_feat_confirm_recv(fn, mandatory, opt, feat,
- val, len, server);
- }
- break;
- /*
- * Support for exchanging NN options on an established connection.
- */
- case DCCP_OPEN:
- case DCCP_PARTOPEN:
- return dccp_feat_handle_nn_established(sk, mandatory, opt, feat,
- val, len);
- }
- return 0; /* ignore FN options in all other states */
-}
-
-/**
- * dccp_feat_init - Seed feature negotiation with host-specific defaults
- * This initialises global defaults, depending on the value of the sysctls.
- * These can later be overridden by registering changes via setsockopt calls.
- * The last link in the chain is finalise_settings, to make sure that between
- * here and the start of actual feature negotiation no inconsistencies enter.
- *
- * All features not appearing below use either defaults or are otherwise
- * later adjusted through dccp_feat_finalise_settings().
- */
-int dccp_feat_init(struct sock *sk)
-{
- struct list_head *fn = &dccp_sk(sk)->dccps_featneg;
- u8 on = 1, off = 0;
- int rc;
- struct {
- u8 *val;
- u8 len;
- } tx, rx;
-
- /* Non-negotiable (NN) features */
- rc = __feat_register_nn(fn, DCCPF_SEQUENCE_WINDOW, 0,
- sysctl_dccp_sequence_window);
- if (rc)
- return rc;
-
- /* Server-priority (SP) features */
-
- /* Advertise that short seqnos are not supported (7.6.1) */
- rc = __feat_register_sp(fn, DCCPF_SHORT_SEQNOS, true, true, &off, 1);
- if (rc)
- return rc;
-
- /* RFC 4340 12.1: "If a DCCP is not ECN capable, ..." */
- rc = __feat_register_sp(fn, DCCPF_ECN_INCAPABLE, true, true, &on, 1);
- if (rc)
- return rc;
-
- /*
- * We advertise the available list of CCIDs and reorder according to
- * preferences, to avoid failure resulting from negotiating different
- * singleton values (which always leads to failure).
- * These settings can still (later) be overridden via sockopts.
- */
- if (ccid_get_builtin_ccids(&tx.val, &tx.len))
- return -ENOBUFS;
- if (ccid_get_builtin_ccids(&rx.val, &rx.len)) {
- kfree(tx.val);
- return -ENOBUFS;
- }
-
- if (!dccp_feat_prefer(sysctl_dccp_tx_ccid, tx.val, tx.len) ||
- !dccp_feat_prefer(sysctl_dccp_rx_ccid, rx.val, rx.len))
- goto free_ccid_lists;
-
- rc = __feat_register_sp(fn, DCCPF_CCID, true, false, tx.val, tx.len);
- if (rc)
- goto free_ccid_lists;
-
- rc = __feat_register_sp(fn, DCCPF_CCID, false, false, rx.val, rx.len);
-
-free_ccid_lists:
- kfree(tx.val);
- kfree(rx.val);
- return rc;
-}
-
-int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_feat_entry *cur, *next;
- int idx;
- dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = {
- [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL }
- };
-
- list_for_each_entry(cur, fn_list, node) {
- /*
- * An empty Confirm means that either an unknown feature type
- * or an invalid value was present. In the first case there is
- * nothing to activate, in the other the default value is used.
- */
- if (cur->empty_confirm)
- continue;
-
- idx = dccp_feat_index(cur->feat_num);
- if (idx < 0) {
- DCCP_BUG("Unknown feature %u", cur->feat_num);
- goto activation_failed;
- }
- if (cur->state != FEAT_STABLE) {
- DCCP_CRIT("Negotiation of %s %s failed in state %s",
- cur->is_local ? "local" : "remote",
- dccp_feat_fname(cur->feat_num),
- dccp_feat_sname[cur->state]);
- goto activation_failed;
- }
- fvals[idx][cur->is_local] = &cur->val;
- }
-
- /*
- * Activate in decreasing order of index, so that the CCIDs are always
- * activated as the last feature. This avoids the case where a CCID
- * relies on the initialisation of one or more features that it depends
- * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features).
- */
- for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;)
- if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) ||
- __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) {
- DCCP_CRIT("Could not activate %d", idx);
- goto activation_failed;
- }
-
- /* Clean up Change options which have been confirmed already */
- list_for_each_entry_safe(cur, next, fn_list, node)
- if (!cur->needs_confirm)
- dccp_feat_list_pop(cur);
-
- dccp_pr_debug("Activation OK\n");
- return 0;
-
-activation_failed:
- /*
- * We clean up everything that may have been allocated, since
- * it is difficult to track at which stage negotiation failed.
- * This is ok, since all allocation functions below are robust
- * against NULL arguments.
- */
- ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
- ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
- dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
- dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
- dp->dccps_hc_rx_ackvec = NULL;
- return -1;
-}
diff --git a/net/dccp/feat.h b/net/dccp/feat.h
deleted file mode 100644
index 0e75cebb2187..000000000000
--- a/net/dccp/feat.h
+++ /dev/null
@@ -1,137 +0,0 @@
-#ifndef _DCCP_FEAT_H
-#define _DCCP_FEAT_H
-/*
- * net/dccp/feat.h
- *
- * Feature negotiation for the DCCP protocol (RFC 4340, section 6)
- * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk>
- * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/types.h>
-#include "dccp.h"
-
-/*
- * Known limit values
- */
-/* Ack Ratio takes 2-byte integer values (11.3) */
-#define DCCPF_ACK_RATIO_MAX 0xFFFF
-/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */
-#define DCCPF_SEQ_WMIN 32
-#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull
-/* Maximum number of SP values that fit in a single (Confirm) option */
-#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2)
-
-enum dccp_feat_type {
- FEAT_AT_RX = 1, /* located at RX side of half-connection */
- FEAT_AT_TX = 2, /* located at TX side of half-connection */
- FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */
- FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */
- FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */
-};
-
-enum dccp_feat_state {
- FEAT_DEFAULT = 0, /* using default values from 6.4 */
- FEAT_INITIALISING, /* feature is being initialised */
- FEAT_CHANGING, /* Change sent but not confirmed yet */
- FEAT_UNSTABLE, /* local modification in state CHANGING */
- FEAT_STABLE /* both ends (think they) agree */
-};
-
-/**
- * dccp_feat_val - Container for SP or NN feature values
- * @nn: single NN value
- * @sp.vec: single SP value plus optional preference list
- * @sp.len: length of @sp.vec in bytes
- */
-typedef union {
- u64 nn;
- struct {
- u8 *vec;
- u8 len;
- } sp;
-} dccp_feat_val;
-
-/**
- * struct feat_entry - Data structure to perform feature negotiation
- * @val: feature's current value (SP features may have preference list)
- * @state: feature's current state
- * @feat_num: one of %dccp_feature_numbers
- * @needs_mandatory: whether Mandatory options should be sent
- * @needs_confirm: whether to send a Confirm instead of a Change
- * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm)
- * @is_local: feature location (1) or feature-remote (0)
- * @node: list pointers, entries arranged in FIFO order
- */
-struct dccp_feat_entry {
- dccp_feat_val val;
- enum dccp_feat_state state:8;
- u8 feat_num;
-
- bool needs_mandatory,
- needs_confirm,
- empty_confirm,
- is_local;
-
- struct list_head node;
-};
-
-static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry)
-{
- if (entry->needs_confirm)
- return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R;
- return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R;
-}
-
-/**
- * struct ccid_dependency - Track changes resulting from choosing a CCID
- * @dependent_feat: one of %dccp_feature_numbers
- * @is_local: local (1) or remote (0) @dependent_feat
- * @is_mandatory: whether presence of @dependent_feat is mission-critical or not
- * @val: corresponding default value for @dependent_feat (u8 is sufficient here)
- */
-struct ccid_dependency {
- u8 dependent_feat;
- bool is_local:1,
- is_mandatory:1;
- u8 val;
-};
-
-/*
- * Sysctls to seed defaults for feature negotiation
- */
-extern unsigned long sysctl_dccp_sequence_window;
-extern int sysctl_dccp_rx_ccid;
-extern int sysctl_dccp_tx_ccid;
-
-int dccp_feat_init(struct sock *sk);
-void dccp_feat_initialise_sysctls(void);
-int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local,
- u8 const *list, u8 len);
-int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *,
- u8 mand, u8 opt, u8 feat, u8 *val, u8 len);
-int dccp_feat_clone_list(struct list_head const *, struct list_head *);
-
-/*
- * Encoding variable-length options and their maximum length.
- *
- * This affects NN options (SP options are all u8) and other variable-length
- * options (see table 3 in RFC 4340). The limit is currently given the Sequence
- * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other
- * options consume less than 6 bytes (timestamps are 4 bytes).
- * When updating this constant (e.g. due to new internet drafts / RFCs), make
- * sure that you also update all code which refers to it.
- */
-#define DCCP_OPTVAL_MAXLEN 6
-
-void dccp_encode_value_var(const u64 value, u8 *to, const u8 len);
-u64 dccp_decode_value_var(const u8 *bf, const u8 len);
-u64 dccp_feat_nn_get(struct sock *sk, u8 feat);
-
-int dccp_insert_option_mandatory(struct sk_buff *skb);
-int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, u8 *val, u8 len,
- bool repeat_first);
-#endif /* _DCCP_FEAT_H */
diff --git a/net/dccp/input.c b/net/dccp/input.c
deleted file mode 100644
index 85d6c879383d..000000000000
--- a/net/dccp/input.c
+++ /dev/null
@@ -1,742 +0,0 @@
-/*
- * net/dccp/input.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/dccp.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-
-#include <net/sock.h>
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-
-/* rate-limit for syncs in reply to sequence-invalid packets; RFC 4340, 7.5.4 */
-int sysctl_dccp_sync_ratelimit __read_mostly = HZ / 8;
-
-static void dccp_enqueue_skb(struct sock *sk, struct sk_buff *skb)
-{
- __skb_pull(skb, dccp_hdr(skb)->dccph_doff * 4);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- skb_set_owner_r(skb, sk);
- sk->sk_data_ready(sk);
-}
-
-static void dccp_fin(struct sock *sk, struct sk_buff *skb)
-{
- /*
- * On receiving Close/CloseReq, both RD/WR shutdown are performed.
- * RFC 4340, 8.3 says that we MAY send further Data/DataAcks after
- * receiving the closing segment, but there is no guarantee that such
- * data will be processed at all.
- */
- sk->sk_shutdown = SHUTDOWN_MASK;
- sock_set_flag(sk, SOCK_DONE);
- dccp_enqueue_skb(sk, skb);
-}
-
-static int dccp_rcv_close(struct sock *sk, struct sk_buff *skb)
-{
- int queued = 0;
-
- switch (sk->sk_state) {
- /*
- * We ignore Close when received in one of the following states:
- * - CLOSED (may be a late or duplicate packet)
- * - PASSIVE_CLOSEREQ (the peer has sent a CloseReq earlier)
- * - RESPOND (already handled by dccp_check_req)
- */
- case DCCP_CLOSING:
- /*
- * Simultaneous-close: receiving a Close after sending one. This
- * can happen if both client and server perform active-close and
- * will result in an endless ping-pong of crossing and retrans-
- * mitted Close packets, which only terminates when one of the
- * nodes times out (min. 64 seconds). Quicker convergence can be
- * achieved when one of the nodes acts as tie-breaker.
- * This is ok as both ends are done with data transfer and each
- * end is just waiting for the other to acknowledge termination.
- */
- if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT)
- break;
- /* fall through */
- case DCCP_REQUESTING:
- case DCCP_ACTIVE_CLOSEREQ:
- dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
- dccp_done(sk);
- break;
- case DCCP_OPEN:
- case DCCP_PARTOPEN:
- /* Give waiting application a chance to read pending data */
- queued = 1;
- dccp_fin(sk, skb);
- dccp_set_state(sk, DCCP_PASSIVE_CLOSE);
- /* fall through */
- case DCCP_PASSIVE_CLOSE:
- /*
- * Retransmitted Close: we have already enqueued the first one.
- */
- sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
- }
- return queued;
-}
-
-static int dccp_rcv_closereq(struct sock *sk, struct sk_buff *skb)
-{
- int queued = 0;
-
- /*
- * Step 7: Check for unexpected packet types
- * If (S.is_server and P.type == CloseReq)
- * Send Sync packet acknowledging P.seqno
- * Drop packet and return
- */
- if (dccp_sk(sk)->dccps_role != DCCP_ROLE_CLIENT) {
- dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_PKT_SYNC);
- return queued;
- }
-
- /* Step 13: process relevant Client states < CLOSEREQ */
- switch (sk->sk_state) {
- case DCCP_REQUESTING:
- dccp_send_close(sk, 0);
- dccp_set_state(sk, DCCP_CLOSING);
- break;
- case DCCP_OPEN:
- case DCCP_PARTOPEN:
- /* Give waiting application a chance to read pending data */
- queued = 1;
- dccp_fin(sk, skb);
- dccp_set_state(sk, DCCP_PASSIVE_CLOSEREQ);
- /* fall through */
- case DCCP_PASSIVE_CLOSEREQ:
- sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP);
- }
- return queued;
-}
-
-static u16 dccp_reset_code_convert(const u8 code)
-{
- static const u16 error_code[] = {
- [DCCP_RESET_CODE_CLOSED] = 0, /* normal termination */
- [DCCP_RESET_CODE_UNSPECIFIED] = 0, /* nothing known */
- [DCCP_RESET_CODE_ABORTED] = ECONNRESET,
-
- [DCCP_RESET_CODE_NO_CONNECTION] = ECONNREFUSED,
- [DCCP_RESET_CODE_CONNECTION_REFUSED] = ECONNREFUSED,
- [DCCP_RESET_CODE_TOO_BUSY] = EUSERS,
- [DCCP_RESET_CODE_AGGRESSION_PENALTY] = EDQUOT,
-
- [DCCP_RESET_CODE_PACKET_ERROR] = ENOMSG,
- [DCCP_RESET_CODE_BAD_INIT_COOKIE] = EBADR,
- [DCCP_RESET_CODE_BAD_SERVICE_CODE] = EBADRQC,
- [DCCP_RESET_CODE_OPTION_ERROR] = EILSEQ,
- [DCCP_RESET_CODE_MANDATORY_ERROR] = EOPNOTSUPP,
- };
-
- return code >= DCCP_MAX_RESET_CODES ? 0 : error_code[code];
-}
-
-static void dccp_rcv_reset(struct sock *sk, struct sk_buff *skb)
-{
- u16 err = dccp_reset_code_convert(dccp_hdr_reset(skb)->dccph_reset_code);
-
- sk->sk_err = err;
-
- /* Queue the equivalent of TCP fin so that dccp_recvmsg exits the loop */
- dccp_fin(sk, skb);
-
- if (err && !sock_flag(sk, SOCK_DEAD))
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
- dccp_time_wait(sk, DCCP_TIME_WAIT, 0);
-}
-
-static void dccp_handle_ackvec_processing(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_ackvec *av = dccp_sk(sk)->dccps_hc_rx_ackvec;
-
- if (av == NULL)
- return;
- if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_ackvec_clear_state(av, DCCP_SKB_CB(skb)->dccpd_ack_seq);
- dccp_ackvec_input(av, skb);
-}
-
-static void dccp_deliver_input_to_ccids(struct sock *sk, struct sk_buff *skb)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
-
- /* Don't deliver to RX CCID when node has shut down read end. */
- if (!(sk->sk_shutdown & RCV_SHUTDOWN))
- ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb);
- /*
- * Until the TX queue has been drained, we can not honour SHUT_WR, since
- * we need received feedback as input to adjust congestion control.
- */
- if (sk->sk_write_queue.qlen > 0 || !(sk->sk_shutdown & SEND_SHUTDOWN))
- ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb);
-}
-
-static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb)
-{
- const struct dccp_hdr *dh = dccp_hdr(skb);
- struct dccp_sock *dp = dccp_sk(sk);
- u64 lswl, lawl, seqno = DCCP_SKB_CB(skb)->dccpd_seq,
- ackno = DCCP_SKB_CB(skb)->dccpd_ack_seq;
-
- /*
- * Step 5: Prepare sequence numbers for Sync
- * If P.type == Sync or P.type == SyncAck,
- * If S.AWL <= P.ackno <= S.AWH and P.seqno >= S.SWL,
- * / * P is valid, so update sequence number variables
- * accordingly. After this update, P will pass the tests
- * in Step 6. A SyncAck is generated if necessary in
- * Step 15 * /
- * Update S.GSR, S.SWL, S.SWH
- * Otherwise,
- * Drop packet and return
- */
- if (dh->dccph_type == DCCP_PKT_SYNC ||
- dh->dccph_type == DCCP_PKT_SYNCACK) {
- if (between48(ackno, dp->dccps_awl, dp->dccps_awh) &&
- dccp_delta_seqno(dp->dccps_swl, seqno) >= 0)
- dccp_update_gsr(sk, seqno);
- else
- return -1;
- }
-
- /*
- * Step 6: Check sequence numbers
- * Let LSWL = S.SWL and LAWL = S.AWL
- * If P.type == CloseReq or P.type == Close or P.type == Reset,
- * LSWL := S.GSR + 1, LAWL := S.GAR
- * If LSWL <= P.seqno <= S.SWH
- * and (P.ackno does not exist or LAWL <= P.ackno <= S.AWH),
- * Update S.GSR, S.SWL, S.SWH
- * If P.type != Sync,
- * Update S.GAR
- */
- lswl = dp->dccps_swl;
- lawl = dp->dccps_awl;
-
- if (dh->dccph_type == DCCP_PKT_CLOSEREQ ||
- dh->dccph_type == DCCP_PKT_CLOSE ||
- dh->dccph_type == DCCP_PKT_RESET) {
- lswl = ADD48(dp->dccps_gsr, 1);
- lawl = dp->dccps_gar;
- }
-
- if (between48(seqno, lswl, dp->dccps_swh) &&
- (ackno == DCCP_PKT_WITHOUT_ACK_SEQ ||
- between48(ackno, lawl, dp->dccps_awh))) {
- dccp_update_gsr(sk, seqno);
-
- if (dh->dccph_type != DCCP_PKT_SYNC &&
- ackno != DCCP_PKT_WITHOUT_ACK_SEQ &&
- after48(ackno, dp->dccps_gar))
- dp->dccps_gar = ackno;
- } else {
- unsigned long now = jiffies;
- /*
- * Step 6: Check sequence numbers
- * Otherwise,
- * If P.type == Reset,
- * Send Sync packet acknowledging S.GSR
- * Otherwise,
- * Send Sync packet acknowledging P.seqno
- * Drop packet and return
- *
- * These Syncs are rate-limited as per RFC 4340, 7.5.4:
- * at most 1 / (dccp_sync_rate_limit * HZ) Syncs per second.
- */
- if (time_before(now, (dp->dccps_rate_last +
- sysctl_dccp_sync_ratelimit)))
- return -1;
-
- DCCP_WARN("Step 6 failed for %s packet, "
- "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and "
- "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), "
- "sending SYNC...\n", dccp_packet_name(dh->dccph_type),
- (unsigned long long) lswl, (unsigned long long) seqno,
- (unsigned long long) dp->dccps_swh,
- (ackno == DCCP_PKT_WITHOUT_ACK_SEQ) ? "doesn't exist"
- : "exists",
- (unsigned long long) lawl, (unsigned long long) ackno,
- (unsigned long long) dp->dccps_awh);
-
- dp->dccps_rate_last = now;
-
- if (dh->dccph_type == DCCP_PKT_RESET)
- seqno = dp->dccps_gsr;
- dccp_send_sync(sk, seqno, DCCP_PKT_SYNC);
- return -1;
- }
-
- return 0;
-}
-
-static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned int len)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- switch (dccp_hdr(skb)->dccph_type) {
- case DCCP_PKT_DATAACK:
- case DCCP_PKT_DATA:
- /*
- * FIXME: schedule DATA_DROPPED (RFC 4340, 11.7.2) if and when
- * - sk_shutdown == RCV_SHUTDOWN, use Code 1, "Not Listening"
- * - sk_receive_queue is full, use Code 2, "Receive Buffer"
- */
- dccp_enqueue_skb(sk, skb);
- return 0;
- case DCCP_PKT_ACK:
- goto discard;
- case DCCP_PKT_RESET:
- /*
- * Step 9: Process Reset
- * If P.type == Reset,
- * Tear down connection
- * S.state := TIMEWAIT
- * Set TIMEWAIT timer
- * Drop packet and return
- */
- dccp_rcv_reset(sk, skb);
- return 0;
- case DCCP_PKT_CLOSEREQ:
- if (dccp_rcv_closereq(sk, skb))
- return 0;
- goto discard;
- case DCCP_PKT_CLOSE:
- if (dccp_rcv_close(sk, skb))
- return 0;
- goto discard;
- case DCCP_PKT_REQUEST:
- /* Step 7
- * or (S.is_server and P.type == Response)
- * or (S.is_client and P.type == Request)
- * or (S.state >= OPEN and P.type == Request
- * and P.seqno >= S.OSR)
- * or (S.state >= OPEN and P.type == Response
- * and P.seqno >= S.OSR)
- * or (S.state == RESPOND and P.type == Data),
- * Send Sync packet acknowledging P.seqno
- * Drop packet and return
- */
- if (dp->dccps_role != DCCP_ROLE_LISTEN)
- goto send_sync;
- goto check_seq;
- case DCCP_PKT_RESPONSE:
- if (dp->dccps_role != DCCP_ROLE_CLIENT)
- goto send_sync;
-check_seq:
- if (dccp_delta_seqno(dp->dccps_osr,
- DCCP_SKB_CB(skb)->dccpd_seq) >= 0) {
-send_sync:
- dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_PKT_SYNC);
- }
- break;
- case DCCP_PKT_SYNC:
- dccp_send_sync(sk, DCCP_SKB_CB(skb)->dccpd_seq,
- DCCP_PKT_SYNCACK);
- /*
- * From RFC 4340, sec. 5.7
- *
- * As with DCCP-Ack packets, DCCP-Sync and DCCP-SyncAck packets
- * MAY have non-zero-length application data areas, whose
- * contents receivers MUST ignore.
- */
- goto discard;
- }
-
- DCCP_INC_STATS(DCCP_MIB_INERRS);
-discard:
- __kfree_skb(skb);
- return 0;
-}
-
-int dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
- const struct dccp_hdr *dh, const unsigned int len)
-{
- if (dccp_check_seqno(sk, skb))
- goto discard;
-
- if (dccp_parse_options(sk, NULL, skb))
- return 1;
-
- dccp_handle_ackvec_processing(sk, skb);
- dccp_deliver_input_to_ccids(sk, skb);
-
- return __dccp_rcv_established(sk, skb, dh, len);
-discard:
- __kfree_skb(skb);
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_rcv_established);
-
-static int dccp_rcv_request_sent_state_process(struct sock *sk,
- struct sk_buff *skb,
- const struct dccp_hdr *dh,
- const unsigned int len)
-{
- /*
- * Step 4: Prepare sequence numbers in REQUEST
- * If S.state == REQUEST,
- * If (P.type == Response or P.type == Reset)
- * and S.AWL <= P.ackno <= S.AWH,
- * / * Set sequence number variables corresponding to the
- * other endpoint, so P will pass the tests in Step 6 * /
- * Set S.GSR, S.ISR, S.SWL, S.SWH
- * / * Response processing continues in Step 10; Reset
- * processing continues in Step 9 * /
- */
- if (dh->dccph_type == DCCP_PKT_RESPONSE) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- long tstamp = dccp_timestamp();
-
- if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
- dp->dccps_awl, dp->dccps_awh)) {
- dccp_pr_debug("invalid ackno: S.AWL=%llu, "
- "P.ackno=%llu, S.AWH=%llu\n",
- (unsigned long long)dp->dccps_awl,
- (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq,
- (unsigned long long)dp->dccps_awh);
- goto out_invalid_packet;
- }
-
- /*
- * If option processing (Step 8) failed, return 1 here so that
- * dccp_v4_do_rcv() sends a Reset. The Reset code depends on
- * the option type and is set in dccp_parse_options().
- */
- if (dccp_parse_options(sk, NULL, skb))
- return 1;
-
- /* Obtain usec RTT sample from SYN exchange (used by TFRC). */
- if (likely(dp->dccps_options_received.dccpor_timestamp_echo))
- dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp -
- dp->dccps_options_received.dccpor_timestamp_echo));
-
- /* Stop the REQUEST timer */
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
- WARN_ON(sk->sk_send_head == NULL);
- kfree_skb(sk->sk_send_head);
- sk->sk_send_head = NULL;
-
- /*
- * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect
- * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH
- * is done as part of activating the feature values below, since
- * these settings depend on the local/remote Sequence Window
- * features, which were undefined or not confirmed until now.
- */
- dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq;
-
- dccp_sync_mss(sk, icsk->icsk_pmtu_cookie);
-
- /*
- * Step 10: Process REQUEST state (second part)
- * If S.state == REQUEST,
- * / * If we get here, P is a valid Response from the
- * server (see Step 4), and we should move to
- * PARTOPEN state. PARTOPEN means send an Ack,
- * don't send Data packets, retransmit Acks
- * periodically, and always include any Init Cookie
- * from the Response * /
- * S.state := PARTOPEN
- * Set PARTOPEN timer
- * Continue with S.state == PARTOPEN
- * / * Step 12 will send the Ack completing the
- * three-way handshake * /
- */
- dccp_set_state(sk, DCCP_PARTOPEN);
-
- /*
- * If feature negotiation was successful, activate features now;
- * an activation failure means that this host could not activate
- * one ore more features (e.g. insufficient memory), which would
- * leave at least one feature in an undefined state.
- */
- if (dccp_feat_activate_values(sk, &dp->dccps_featneg))
- goto unable_to_proceed;
-
- /* Make sure socket is routed, for correct metrics. */
- icsk->icsk_af_ops->rebuild_header(sk);
-
- if (!sock_flag(sk, SOCK_DEAD)) {
- sk->sk_state_change(sk);
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
- }
-
- if (sk->sk_write_pending || icsk->icsk_ack.pingpong ||
- icsk->icsk_accept_queue.rskq_defer_accept) {
- /* Save one ACK. Data will be ready after
- * several ticks, if write_pending is set.
- *
- * It may be deleted, but with this feature tcpdumps
- * look so _wonderfully_ clever, that I was not able
- * to stand against the temptation 8) --ANK
- */
- /*
- * OK, in DCCP we can as well do a similar trick, its
- * even in the draft, but there is no need for us to
- * schedule an ack here, as dccp_sendmsg does this for
- * us, also stated in the draft. -acme
- */
- __kfree_skb(skb);
- return 0;
- }
- dccp_send_ack(sk);
- return -1;
- }
-
-out_invalid_packet:
- /* dccp_v4_do_rcv will send a reset */
- DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
- return 1;
-
-unable_to_proceed:
- DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED;
- /*
- * We mark this socket as no longer usable, so that the loop in
- * dccp_sendmsg() terminates and the application gets notified.
- */
- dccp_set_state(sk, DCCP_CLOSED);
- sk->sk_err = ECOMM;
- return 1;
-}
-
-static int dccp_rcv_respond_partopen_state_process(struct sock *sk,
- struct sk_buff *skb,
- const struct dccp_hdr *dh,
- const unsigned int len)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- u32 sample = dp->dccps_options_received.dccpor_timestamp_echo;
- int queued = 0;
-
- switch (dh->dccph_type) {
- case DCCP_PKT_RESET:
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
- break;
- case DCCP_PKT_DATA:
- if (sk->sk_state == DCCP_RESPOND)
- break;
- /* fall through */
- case DCCP_PKT_DATAACK:
- case DCCP_PKT_ACK:
- /*
- * FIXME: we should be resetting the PARTOPEN (DELACK) timer
- * here but only if we haven't used the DELACK timer for
- * something else, like sending a delayed ack for a TIMESTAMP
- * echo, etc, for now were not clearing it, sending an extra
- * ACK when there is nothing else to do in DELACK is not a big
- * deal after all.
- */
-
- /* Stop the PARTOPEN timer */
- if (sk->sk_state == DCCP_PARTOPEN)
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
-
- /* Obtain usec RTT sample from SYN exchange (used by TFRC). */
- if (likely(sample)) {
- long delta = dccp_timestamp() - sample;
-
- dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * delta);
- }
-
- dp->dccps_osr = DCCP_SKB_CB(skb)->dccpd_seq;
- dccp_set_state(sk, DCCP_OPEN);
-
- if (dh->dccph_type == DCCP_PKT_DATAACK ||
- dh->dccph_type == DCCP_PKT_DATA) {
- __dccp_rcv_established(sk, skb, dh, len);
- queued = 1; /* packet was queued
- (by __dccp_rcv_established) */
- }
- break;
- }
-
- return queued;
-}
-
-int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
- struct dccp_hdr *dh, unsigned int len)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
- const int old_state = sk->sk_state;
- bool acceptable;
- int queued = 0;
-
- /*
- * Step 3: Process LISTEN state
- *
- * If S.state == LISTEN,
- * If P.type == Request or P contains a valid Init Cookie option,
- * (* Must scan the packet's options to check for Init
- * Cookies. Only Init Cookies are processed here,
- * however; other options are processed in Step 8. This
- * scan need only be performed if the endpoint uses Init
- * Cookies *)
- * (* Generate a new socket and switch to that socket *)
- * Set S := new socket for this port pair
- * S.state = RESPOND
- * Choose S.ISS (initial seqno) or set from Init Cookies
- * Initialize S.GAR := S.ISS
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init
- * Cookies Continue with S.state == RESPOND
- * (* A Response packet will be generated in Step 11 *)
- * Otherwise,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- */
- if (sk->sk_state == DCCP_LISTEN) {
- if (dh->dccph_type == DCCP_PKT_REQUEST) {
- /* It is possible that we process SYN packets from backlog,
- * so we need to make sure to disable BH and RCU right there.
- */
- rcu_read_lock();
- local_bh_disable();
- acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0;
- local_bh_enable();
- rcu_read_unlock();
- if (!acceptable)
- return 1;
- consume_skb(skb);
- return 0;
- }
- if (dh->dccph_type == DCCP_PKT_RESET)
- goto discard;
-
- /* Caller (dccp_v4_do_rcv) will send Reset */
- dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
- return 1;
- } else if (sk->sk_state == DCCP_CLOSED) {
- dcb->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION;
- return 1;
- }
-
- /* Step 6: Check sequence numbers (omitted in LISTEN/REQUEST state) */
- if (sk->sk_state != DCCP_REQUESTING && dccp_check_seqno(sk, skb))
- goto discard;
-
- /*
- * Step 7: Check for unexpected packet types
- * If (S.is_server and P.type == Response)
- * or (S.is_client and P.type == Request)
- * or (S.state == RESPOND and P.type == Data),
- * Send Sync packet acknowledging P.seqno
- * Drop packet and return
- */
- if ((dp->dccps_role != DCCP_ROLE_CLIENT &&
- dh->dccph_type == DCCP_PKT_RESPONSE) ||
- (dp->dccps_role == DCCP_ROLE_CLIENT &&
- dh->dccph_type == DCCP_PKT_REQUEST) ||
- (sk->sk_state == DCCP_RESPOND && dh->dccph_type == DCCP_PKT_DATA)) {
- dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNC);
- goto discard;
- }
-
- /* Step 8: Process options */
- if (dccp_parse_options(sk, NULL, skb))
- return 1;
-
- /*
- * Step 9: Process Reset
- * If P.type == Reset,
- * Tear down connection
- * S.state := TIMEWAIT
- * Set TIMEWAIT timer
- * Drop packet and return
- */
- if (dh->dccph_type == DCCP_PKT_RESET) {
- dccp_rcv_reset(sk, skb);
- return 0;
- } else if (dh->dccph_type == DCCP_PKT_CLOSEREQ) { /* Step 13 */
- if (dccp_rcv_closereq(sk, skb))
- return 0;
- goto discard;
- } else if (dh->dccph_type == DCCP_PKT_CLOSE) { /* Step 14 */
- if (dccp_rcv_close(sk, skb))
- return 0;
- goto discard;
- }
-
- switch (sk->sk_state) {
- case DCCP_REQUESTING:
- queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len);
- if (queued >= 0)
- return queued;
-
- __kfree_skb(skb);
- return 0;
-
- case DCCP_PARTOPEN:
- /* Step 8: if using Ack Vectors, mark packet acknowledgeable */
- dccp_handle_ackvec_processing(sk, skb);
- dccp_deliver_input_to_ccids(sk, skb);
- /* fall through */
- case DCCP_RESPOND:
- queued = dccp_rcv_respond_partopen_state_process(sk, skb,
- dh, len);
- break;
- }
-
- if (dh->dccph_type == DCCP_PKT_ACK ||
- dh->dccph_type == DCCP_PKT_DATAACK) {
- switch (old_state) {
- case DCCP_PARTOPEN:
- sk->sk_state_change(sk);
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
- break;
- }
- } else if (unlikely(dh->dccph_type == DCCP_PKT_SYNC)) {
- dccp_send_sync(sk, dcb->dccpd_seq, DCCP_PKT_SYNCACK);
- goto discard;
- }
-
- if (!queued) {
-discard:
- __kfree_skb(skb);
- }
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_rcv_state_process);
-
-/**
- * dccp_sample_rtt - Validate and finalise computation of RTT sample
- * @delta: number of microseconds between packet and acknowledgment
- *
- * The routine is kept generic to work in different contexts. It should be
- * called immediately when the ACK used for the RTT sample arrives.
- */
-u32 dccp_sample_rtt(struct sock *sk, long delta)
-{
- /* dccpor_elapsed_time is either zeroed out or set and > 0 */
- delta -= dccp_sk(sk)->dccps_options_received.dccpor_elapsed_time * 10;
-
- if (unlikely(delta <= 0)) {
- DCCP_WARN("unusable RTT sample %ld, using min\n", delta);
- return DCCP_SANE_RTT_MIN;
- }
- if (unlikely(delta > DCCP_SANE_RTT_MAX)) {
- DCCP_WARN("RTT sample %ld too large, using max\n", delta);
- return DCCP_SANE_RTT_MAX;
- }
-
- return delta;
-}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
deleted file mode 100644
index 8e08cea6f178..000000000000
--- a/net/dccp/ipv4.c
+++ /dev/null
@@ -1,1086 +0,0 @@
-/*
- * net/dccp/ipv4.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/dccp.h>
-#include <linux/icmp.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/random.h>
-
-#include <net/icmp.h>
-#include <net/inet_common.h>
-#include <net/inet_hashtables.h>
-#include <net/inet_sock.h>
-#include <net/protocol.h>
-#include <net/sock.h>
-#include <net/timewait_sock.h>
-#include <net/tcp_states.h>
-#include <net/xfrm.h>
-#include <net/secure_seq.h>
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-#include "feat.h"
-
-/*
- * The per-net dccp.v4_ctl_sk socket is used for responding to
- * the Out-of-the-blue (OOTB) packets. A control sock will be created
- * for this socket at the initialization time.
- */
-
-int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
-{
- const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
- struct inet_sock *inet = inet_sk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- __be16 orig_sport, orig_dport;
- __be32 daddr, nexthop;
- struct flowi4 *fl4;
- struct rtable *rt;
- int err;
- struct ip_options_rcu *inet_opt;
-
- dp->dccps_role = DCCP_ROLE_CLIENT;
-
- if (addr_len < sizeof(struct sockaddr_in))
- return -EINVAL;
-
- if (usin->sin_family != AF_INET)
- return -EAFNOSUPPORT;
-
- nexthop = daddr = usin->sin_addr.s_addr;
-
- inet_opt = rcu_dereference_protected(inet->inet_opt,
- lockdep_sock_is_held(sk));
- if (inet_opt != NULL && inet_opt->opt.srr) {
- if (daddr == 0)
- return -EINVAL;
- nexthop = inet_opt->opt.faddr;
- }
-
- orig_sport = inet->inet_sport;
- orig_dport = usin->sin_port;
- fl4 = &inet->cork.fl.u.ip4;
- rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
- RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
- IPPROTO_DCCP,
- orig_sport, orig_dport, sk);
- if (IS_ERR(rt))
- return PTR_ERR(rt);
-
- if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
- ip_rt_put(rt);
- return -ENETUNREACH;
- }
-
- if (inet_opt == NULL || !inet_opt->opt.srr)
- daddr = fl4->daddr;
-
- if (inet->inet_saddr == 0)
- inet->inet_saddr = fl4->saddr;
- sk_rcv_saddr_set(sk, inet->inet_saddr);
- inet->inet_dport = usin->sin_port;
- sk_daddr_set(sk, daddr);
-
- inet_csk(sk)->icsk_ext_hdr_len = 0;
- if (inet_opt)
- inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
- /*
- * Socket identity is still unknown (sport may be zero).
- * However we set state to DCCP_REQUESTING and not releasing socket
- * lock select source port, enter ourselves into the hash tables and
- * complete initialization after this.
- */
- dccp_set_state(sk, DCCP_REQUESTING);
- err = inet_hash_connect(&dccp_death_row, sk);
- if (err != 0)
- goto failure;
-
- rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
- inet->inet_sport, inet->inet_dport, sk);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- rt = NULL;
- goto failure;
- }
- /* OK, now commit destination to socket. */
- sk_setup_caps(sk, &rt->dst);
-
- dp->dccps_iss = secure_dccp_sequence_number(inet->inet_saddr,
- inet->inet_daddr,
- inet->inet_sport,
- inet->inet_dport);
- inet->inet_id = dp->dccps_iss ^ jiffies;
-
- err = dccp_connect(sk);
- rt = NULL;
- if (err != 0)
- goto failure;
-out:
- return err;
-failure:
- /*
- * This unhashes the socket and releases the local port, if necessary.
- */
- dccp_set_state(sk, DCCP_CLOSED);
- ip_rt_put(rt);
- sk->sk_route_caps = 0;
- inet->inet_dport = 0;
- goto out;
-}
-EXPORT_SYMBOL_GPL(dccp_v4_connect);
-
-/*
- * This routine does path mtu discovery as defined in RFC1191.
- */
-static inline void dccp_do_pmtu_discovery(struct sock *sk,
- const struct iphdr *iph,
- u32 mtu)
-{
- struct dst_entry *dst;
- const struct inet_sock *inet = inet_sk(sk);
- const struct dccp_sock *dp = dccp_sk(sk);
-
- /* We are not interested in DCCP_LISTEN and request_socks (RESPONSEs
- * send out by Linux are always < 576bytes so they should go through
- * unfragmented).
- */
- if (sk->sk_state == DCCP_LISTEN)
- return;
-
- dst = inet_csk_update_pmtu(sk, mtu);
- if (!dst)
- return;
-
- /* Something is about to be wrong... Remember soft error
- * for the case, if this connection will not able to recover.
- */
- if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
- sk->sk_err_soft = EMSGSIZE;
-
- mtu = dst_mtu(dst);
-
- if (inet->pmtudisc != IP_PMTUDISC_DONT &&
- ip_sk_accept_pmtu(sk) &&
- inet_csk(sk)->icsk_pmtu_cookie > mtu) {
- dccp_sync_mss(sk, mtu);
-
- /*
- * From RFC 4340, sec. 14.1:
- *
- * DCCP-Sync packets are the best choice for upward
- * probing, since DCCP-Sync probes do not risk application
- * data loss.
- */
- dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
- } /* else let the usual retransmit timer handle it */
-}
-
-static void dccp_do_redirect(struct sk_buff *skb, struct sock *sk)
-{
- struct dst_entry *dst = __sk_dst_check(sk, 0);
-
- if (dst)
- dst->ops->redirect(dst, sk, skb);
-}
-
-void dccp_req_err(struct sock *sk, u64 seq)
- {
- struct request_sock *req = inet_reqsk(sk);
- struct net *net = sock_net(sk);
-
- /*
- * ICMPs are not backlogged, hence we cannot get an established
- * socket here.
- */
- if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) {
- __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
- } else {
- /*
- * Still in RESPOND, just remove it silently.
- * There is no good way to pass the error to the newly
- * created socket, and POSIX does not want network
- * errors returned from accept().
- */
- inet_csk_reqsk_queue_drop(req->rsk_listener, req);
- }
- reqsk_put(req);
-}
-EXPORT_SYMBOL(dccp_req_err);
-
-/*
- * This routine is called by the ICMP module when it gets some sort of error
- * condition. If err < 0 then the socket should be closed and the error
- * returned to the user. If err > 0 it's just the icmp type << 8 | icmp code.
- * After adjustment header points to the first 8 bytes of the tcp header. We
- * need to find the appropriate port.
- *
- * The locking strategy used here is very "optimistic". When someone else
- * accesses the socket the ICMP is just dropped and for some paths there is no
- * check at all. A more general error queue to queue errors for later handling
- * is probably better.
- */
-static void dccp_v4_err(struct sk_buff *skb, u32 info)
-{
- const struct iphdr *iph = (struct iphdr *)skb->data;
- const u8 offset = iph->ihl << 2;
- const struct dccp_hdr *dh;
- struct dccp_sock *dp;
- struct inet_sock *inet;
- const int type = icmp_hdr(skb)->type;
- const int code = icmp_hdr(skb)->code;
- struct sock *sk;
- __u64 seq;
- int err;
- struct net *net = dev_net(skb->dev);
-
- /* Only need dccph_dport & dccph_sport which are the first
- * 4 bytes in dccp header.
- * Our caller (icmp_socket_deliver()) already pulled 8 bytes for us.
- */
- BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_sport) > 8);
- BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_dport) > 8);
- dh = (struct dccp_hdr *)(skb->data + offset);
-
- sk = __inet_lookup_established(net, &dccp_hashinfo,
- iph->daddr, dh->dccph_dport,
- iph->saddr, ntohs(dh->dccph_sport),
- inet_iif(skb), 0);
- if (!sk) {
- __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return;
- }
-
- if (sk->sk_state == DCCP_TIME_WAIT) {
- inet_twsk_put(inet_twsk(sk));
- return;
- }
- seq = dccp_hdr_seq(dh);
- if (sk->sk_state == DCCP_NEW_SYN_RECV)
- return dccp_req_err(sk, seq);
-
- bh_lock_sock(sk);
- /* If too many ICMPs get dropped on busy
- * servers this needs to be solved differently.
- */
- if (sock_owned_by_user(sk))
- __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
-
- if (sk->sk_state == DCCP_CLOSED)
- goto out;
-
- dp = dccp_sk(sk);
- if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
- !between48(seq, dp->dccps_awl, dp->dccps_awh)) {
- __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
- goto out;
- }
-
- switch (type) {
- case ICMP_REDIRECT:
- if (!sock_owned_by_user(sk))
- dccp_do_redirect(skb, sk);
- goto out;
- case ICMP_SOURCE_QUENCH:
- /* Just silently ignore these. */
- goto out;
- case ICMP_PARAMETERPROB:
- err = EPROTO;
- break;
- case ICMP_DEST_UNREACH:
- if (code > NR_ICMP_UNREACH)
- goto out;
-
- if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
- if (!sock_owned_by_user(sk))
- dccp_do_pmtu_discovery(sk, iph, info);
- goto out;
- }
-
- err = icmp_err_convert[code].errno;
- break;
- case ICMP_TIME_EXCEEDED:
- err = EHOSTUNREACH;
- break;
- default:
- goto out;
- }
-
- switch (sk->sk_state) {
- case DCCP_REQUESTING:
- case DCCP_RESPOND:
- if (!sock_owned_by_user(sk)) {
- __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
- sk->sk_err = err;
-
- sk->sk_error_report(sk);
-
- dccp_done(sk);
- } else
- sk->sk_err_soft = err;
- goto out;
- }
-
- /* If we've already connected we will keep trying
- * until we time out, or the user gives up.
- *
- * rfc1122 4.2.3.9 allows to consider as hard errors
- * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
- * but it is obsoleted by pmtu discovery).
- *
- * Note, that in modern internet, where routing is unreliable
- * and in each dark corner broken firewalls sit, sending random
- * errors ordered by their masters even this two messages finally lose
- * their original sense (even Linux sends invalid PORT_UNREACHs)
- *
- * Now we are in compliance with RFCs.
- * --ANK (980905)
- */
-
- inet = inet_sk(sk);
- if (!sock_owned_by_user(sk) && inet->recverr) {
- sk->sk_err = err;
- sk->sk_error_report(sk);
- } else /* Only an error on timeout */
- sk->sk_err_soft = err;
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-static inline __sum16 dccp_v4_csum_finish(struct sk_buff *skb,
- __be32 src, __be32 dst)
-{
- return csum_tcpudp_magic(src, dst, skb->len, IPPROTO_DCCP, skb->csum);
-}
-
-void dccp_v4_send_check(struct sock *sk, struct sk_buff *skb)
-{
- const struct inet_sock *inet = inet_sk(sk);
- struct dccp_hdr *dh = dccp_hdr(skb);
-
- dccp_csum_outgoing(skb);
- dh->dccph_checksum = dccp_v4_csum_finish(skb,
- inet->inet_saddr,
- inet->inet_daddr);
-}
-EXPORT_SYMBOL_GPL(dccp_v4_send_check);
-
-static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb)
-{
- return secure_dccp_sequence_number(ip_hdr(skb)->daddr,
- ip_hdr(skb)->saddr,
- dccp_hdr(skb)->dccph_dport,
- dccp_hdr(skb)->dccph_sport);
-}
-
-/*
- * The three way handshake has completed - we got a valid ACK or DATAACK -
- * now create the new socket.
- *
- * This is the equivalent of TCP's tcp_v4_syn_recv_sock
- */
-struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
- struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst,
- struct request_sock *req_unhash,
- bool *own_req)
-{
- struct inet_request_sock *ireq;
- struct inet_sock *newinet;
- struct sock *newsk;
-
- if (sk_acceptq_is_full(sk))
- goto exit_overflow;
-
- newsk = dccp_create_openreq_child(sk, req, skb);
- if (newsk == NULL)
- goto exit_nonewsk;
-
- newinet = inet_sk(newsk);
- ireq = inet_rsk(req);
- sk_daddr_set(newsk, ireq->ir_rmt_addr);
- sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
- newinet->inet_saddr = ireq->ir_loc_addr;
- RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt));
- newinet->mc_index = inet_iif(skb);
- newinet->mc_ttl = ip_hdr(skb)->ttl;
- newinet->inet_id = jiffies;
-
- if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL)
- goto put_and_exit;
-
- sk_setup_caps(newsk, dst);
-
- dccp_sync_mss(newsk, dst_mtu(dst));
-
- if (__inet_inherit_port(sk, newsk) < 0)
- goto put_and_exit;
- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
- if (*own_req)
- ireq->ireq_opt = NULL;
- else
- newinet->inet_opt = NULL;
- return newsk;
-
-exit_overflow:
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
-exit_nonewsk:
- dst_release(dst);
-exit:
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
- return NULL;
-put_and_exit:
- newinet->inet_opt = NULL;
- inet_csk_prepare_forced_close(newsk);
- dccp_done(newsk);
- goto exit;
-}
-EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
-
-static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
- struct sk_buff *skb)
-{
- struct rtable *rt;
- const struct iphdr *iph = ip_hdr(skb);
- struct flowi4 fl4 = {
- .flowi4_oif = inet_iif(skb),
- .daddr = iph->saddr,
- .saddr = iph->daddr,
- .flowi4_tos = RT_CONN_FLAGS(sk),
- .flowi4_proto = sk->sk_protocol,
- .fl4_sport = dccp_hdr(skb)->dccph_dport,
- .fl4_dport = dccp_hdr(skb)->dccph_sport,
- };
-
- security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
- rt = ip_route_output_flow(net, &fl4, sk);
- if (IS_ERR(rt)) {
- IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
- return NULL;
- }
-
- return &rt->dst;
-}
-
-static int dccp_v4_send_response(const struct sock *sk, struct request_sock *req)
-{
- int err = -1;
- struct sk_buff *skb;
- struct dst_entry *dst;
- struct flowi4 fl4;
-
- dst = inet_csk_route_req(sk, &fl4, req);
- if (dst == NULL)
- goto out;
-
- skb = dccp_make_response(sk, dst, req);
- if (skb != NULL) {
- const struct inet_request_sock *ireq = inet_rsk(req);
- struct dccp_hdr *dh = dccp_hdr(skb);
-
- dh->dccph_checksum = dccp_v4_csum_finish(skb, ireq->ir_loc_addr,
- ireq->ir_rmt_addr);
- rcu_read_lock();
- err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
- ireq->ir_rmt_addr,
- rcu_dereference(ireq->ireq_opt));
- rcu_read_unlock();
- err = net_xmit_eval(err);
- }
-
-out:
- dst_release(dst);
- return err;
-}
-
-static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
-{
- int err;
- const struct iphdr *rxiph;
- struct sk_buff *skb;
- struct dst_entry *dst;
- struct net *net = dev_net(skb_dst(rxskb)->dev);
- struct sock *ctl_sk = net->dccp.v4_ctl_sk;
-
- /* Never send a reset in response to a reset. */
- if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET)
- return;
-
- if (skb_rtable(rxskb)->rt_type != RTN_LOCAL)
- return;
-
- dst = dccp_v4_route_skb(net, ctl_sk, rxskb);
- if (dst == NULL)
- return;
-
- skb = dccp_ctl_make_reset(ctl_sk, rxskb);
- if (skb == NULL)
- goto out;
-
- rxiph = ip_hdr(rxskb);
- dccp_hdr(skb)->dccph_checksum = dccp_v4_csum_finish(skb, rxiph->saddr,
- rxiph->daddr);
- skb_dst_set(skb, dst_clone(dst));
-
- local_bh_disable();
- bh_lock_sock(ctl_sk);
- err = ip_build_and_send_pkt(skb, ctl_sk,
- rxiph->daddr, rxiph->saddr, NULL);
- bh_unlock_sock(ctl_sk);
-
- if (net_xmit_eval(err) == 0) {
- __DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
- __DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
- }
- local_bh_enable();
-out:
- dst_release(dst);
-}
-
-static void dccp_v4_reqsk_destructor(struct request_sock *req)
-{
- dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
- kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
-}
-
-void dccp_syn_ack_timeout(const struct request_sock *req)
-{
-}
-EXPORT_SYMBOL(dccp_syn_ack_timeout);
-
-static struct request_sock_ops dccp_request_sock_ops __read_mostly = {
- .family = PF_INET,
- .obj_size = sizeof(struct dccp_request_sock),
- .rtx_syn_ack = dccp_v4_send_response,
- .send_ack = dccp_reqsk_send_ack,
- .destructor = dccp_v4_reqsk_destructor,
- .send_reset = dccp_v4_ctl_send_reset,
- .syn_ack_timeout = dccp_syn_ack_timeout,
-};
-
-int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
-{
- struct inet_request_sock *ireq;
- struct request_sock *req;
- struct dccp_request_sock *dreq;
- const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
-
- /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */
- if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
- return 0; /* discard, don't send a reset here */
-
- if (dccp_bad_service_code(sk, service)) {
- dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
- goto drop;
- }
- /*
- * TW buckets are converted to open requests without
- * limitations, they conserve resources and peer is
- * evidently real one.
- */
- dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
- if (inet_csk_reqsk_queue_is_full(sk))
- goto drop;
-
- if (sk_acceptq_is_full(sk))
- goto drop;
-
- req = inet_reqsk_alloc(&dccp_request_sock_ops, sk, true);
- if (req == NULL)
- goto drop;
-
- if (dccp_reqsk_init(req, dccp_sk(sk), skb))
- goto drop_and_free;
-
- dreq = dccp_rsk(req);
- if (dccp_parse_options(sk, dreq, skb))
- goto drop_and_free;
-
- if (security_inet_conn_request(sk, skb, req))
- goto drop_and_free;
-
- ireq = inet_rsk(req);
- sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
- sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
- ireq->ir_mark = inet_request_mark(sk, skb);
- ireq->ireq_family = AF_INET;
- ireq->ir_iif = sk->sk_bound_dev_if;
-
- /*
- * Step 3: Process LISTEN state
- *
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
- *
- * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child().
- */
- dreq->dreq_isr = dcb->dccpd_seq;
- dreq->dreq_gsr = dreq->dreq_isr;
- dreq->dreq_iss = dccp_v4_init_sequence(skb);
- dreq->dreq_gss = dreq->dreq_iss;
- dreq->dreq_service = service;
-
- if (dccp_v4_send_response(sk, req))
- goto drop_and_free;
-
- inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
- reqsk_put(req);
- return 0;
-
-drop_and_free:
- reqsk_free(req);
-drop:
- __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
- return -1;
-}
-EXPORT_SYMBOL_GPL(dccp_v4_conn_request);
-
-int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_hdr *dh = dccp_hdr(skb);
-
- if (sk->sk_state == DCCP_OPEN) { /* Fast path */
- if (dccp_rcv_established(sk, skb, dh, skb->len))
- goto reset;
- return 0;
- }
-
- /*
- * Step 3: Process LISTEN state
- * If P.type == Request or P contains a valid Init Cookie option,
- * (* Must scan the packet's options to check for Init
- * Cookies. Only Init Cookies are processed here,
- * however; other options are processed in Step 8. This
- * scan need only be performed if the endpoint uses Init
- * Cookies *)
- * (* Generate a new socket and switch to that socket *)
- * Set S := new socket for this port pair
- * S.state = RESPOND
- * Choose S.ISS (initial seqno) or set from Init Cookies
- * Initialize S.GAR := S.ISS
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
- * Continue with S.state == RESPOND
- * (* A Response packet will be generated in Step 11 *)
- * Otherwise,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- *
- * NOTE: the check for the packet types is done in
- * dccp_rcv_state_process
- */
-
- if (dccp_rcv_state_process(sk, skb, dh, skb->len))
- goto reset;
- return 0;
-
-reset:
- dccp_v4_ctl_send_reset(sk, skb);
- kfree_skb(skb);
- return 0;
-}
-EXPORT_SYMBOL_GPL(dccp_v4_do_rcv);
-
-/**
- * dccp_invalid_packet - check for malformed packets
- * Implements RFC 4340, 8.5: Step 1: Check header basics
- * Packets that fail these checks are ignored and do not receive Resets.
- */
-int dccp_invalid_packet(struct sk_buff *skb)
-{
- const struct dccp_hdr *dh;
- unsigned int cscov;
- u8 dccph_doff;
-
- if (skb->pkt_type != PACKET_HOST)
- return 1;
-
- /* If the packet is shorter than 12 bytes, drop packet and return */
- if (!pskb_may_pull(skb, sizeof(struct dccp_hdr))) {
- DCCP_WARN("pskb_may_pull failed\n");
- return 1;
- }
-
- dh = dccp_hdr(skb);
-
- /* If P.type is not understood, drop packet and return */
- if (dh->dccph_type >= DCCP_PKT_INVALID) {
- DCCP_WARN("invalid packet type\n");
- return 1;
- }
-
- /*
- * If P.Data Offset is too small for packet type, drop packet and return
- */
- dccph_doff = dh->dccph_doff;
- if (dccph_doff < dccp_hdr_len(skb) / sizeof(u32)) {
- DCCP_WARN("P.Data Offset(%u) too small\n", dccph_doff);
- return 1;
- }
- /*
- * If P.Data Offset is too too large for packet, drop packet and return
- */
- if (!pskb_may_pull(skb, dccph_doff * sizeof(u32))) {
- DCCP_WARN("P.Data Offset(%u) too large\n", dccph_doff);
- return 1;
- }
- dh = dccp_hdr(skb);
- /*
- * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet
- * has short sequence numbers), drop packet and return
- */
- if ((dh->dccph_type < DCCP_PKT_DATA ||
- dh->dccph_type > DCCP_PKT_DATAACK) && dh->dccph_x == 0) {
- DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n",
- dccp_packet_name(dh->dccph_type));
- return 1;
- }
-
- /*
- * If P.CsCov is too large for the packet size, drop packet and return.
- * This must come _before_ checksumming (not as RFC 4340 suggests).
- */
- cscov = dccp_csum_coverage(skb);
- if (cscov > skb->len) {
- DCCP_WARN("P.CsCov %u exceeds packet length %d\n",
- dh->dccph_cscov, skb->len);
- return 1;
- }
-
- /* If header checksum is incorrect, drop packet and return.
- * (This step is completed in the AF-dependent functions.) */
- skb->csum = skb_checksum(skb, 0, cscov, 0);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(dccp_invalid_packet);
-
-/* this is called when real data arrives */
-static int dccp_v4_rcv(struct sk_buff *skb)
-{
- const struct dccp_hdr *dh;
- const struct iphdr *iph;
- bool refcounted;
- struct sock *sk;
- int min_cov;
-
- /* Step 1: Check header basics */
-
- if (dccp_invalid_packet(skb))
- goto discard_it;
-
- iph = ip_hdr(skb);
- /* Step 1: If header checksum is incorrect, drop packet and return */
- if (dccp_v4_csum_finish(skb, iph->saddr, iph->daddr)) {
- DCCP_WARN("dropped packet with invalid checksum\n");
- goto discard_it;
- }
-
- dh = dccp_hdr(skb);
-
- DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh);
- DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
-
- dccp_pr_debug("%8.8s src=%pI4@%-5d dst=%pI4@%-5d seq=%llu",
- dccp_packet_name(dh->dccph_type),
- &iph->saddr, ntohs(dh->dccph_sport),
- &iph->daddr, ntohs(dh->dccph_dport),
- (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq);
-
- if (dccp_packet_without_ack(skb)) {
- DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
- dccp_pr_debug_cat("\n");
- } else {
- DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
- dccp_pr_debug_cat(", ack=%llu\n", (unsigned long long)
- DCCP_SKB_CB(skb)->dccpd_ack_seq);
- }
-
-lookup:
- sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh),
- dh->dccph_sport, dh->dccph_dport, 0, &refcounted);
- if (!sk) {
- dccp_pr_debug("failed to look up flow ID in table and "
- "get corresponding socket\n");
- goto no_dccp_socket;
- }
-
- /*
- * Step 2:
- * ... or S.state == TIMEWAIT,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- */
- if (sk->sk_state == DCCP_TIME_WAIT) {
- dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n");
- inet_twsk_put(inet_twsk(sk));
- goto no_dccp_socket;
- }
-
- if (sk->sk_state == DCCP_NEW_SYN_RECV) {
- struct request_sock *req = inet_reqsk(sk);
- struct sock *nsk;
-
- sk = req->rsk_listener;
- if (unlikely(sk->sk_state != DCCP_LISTEN)) {
- inet_csk_reqsk_queue_drop_and_put(sk, req);
- goto lookup;
- }
- sock_hold(sk);
- refcounted = true;
- nsk = dccp_check_req(sk, skb, req);
- if (!nsk) {
- reqsk_put(req);
- goto discard_and_relse;
- }
- if (nsk == sk) {
- reqsk_put(req);
- } else if (dccp_child_process(sk, nsk, skb)) {
- dccp_v4_ctl_send_reset(sk, skb);
- goto discard_and_relse;
- } else {
- sock_put(sk);
- return 0;
- }
- }
- /*
- * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
- * o if MinCsCov = 0, only packets with CsCov = 0 are accepted
- * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov
- */
- min_cov = dccp_sk(sk)->dccps_pcrlen;
- if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) {
- dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n",
- dh->dccph_cscov, min_cov);
- /* FIXME: "Such packets SHOULD be reported using Data Dropped
- * options (Section 11.7) with Drop Code 0, Protocol
- * Constraints." */
- goto discard_and_relse;
- }
-
- if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
- goto discard_and_relse;
- nf_reset(skb);
-
- return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted);
-
-no_dccp_socket:
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
- goto discard_it;
- /*
- * Step 2:
- * If no socket ...
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- */
- if (dh->dccph_type != DCCP_PKT_RESET) {
- DCCP_SKB_CB(skb)->dccpd_reset_code =
- DCCP_RESET_CODE_NO_CONNECTION;
- dccp_v4_ctl_send_reset(sk, skb);
- }
-
-discard_it:
- kfree_skb(skb);
- return 0;
-
-discard_and_relse:
- if (refcounted)
- sock_put(sk);
- goto discard_it;
-}
-
-static const struct inet_connection_sock_af_ops dccp_ipv4_af_ops = {
- .queue_xmit = ip_queue_xmit,
- .send_check = dccp_v4_send_check,
- .rebuild_header = inet_sk_rebuild_header,
- .conn_request = dccp_v4_conn_request,
- .syn_recv_sock = dccp_v4_request_recv_sock,
- .net_header_len = sizeof(struct iphdr),
- .setsockopt = ip_setsockopt,
- .getsockopt = ip_getsockopt,
- .addr2sockaddr = inet_csk_addr2sockaddr,
- .sockaddr_len = sizeof(struct sockaddr_in),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ip_setsockopt,
- .compat_getsockopt = compat_ip_getsockopt,
-#endif
-};
-
-static int dccp_v4_init_sock(struct sock *sk)
-{
- static __u8 dccp_v4_ctl_sock_initialized;
- int err = dccp_init_sock(sk, dccp_v4_ctl_sock_initialized);
-
- if (err == 0) {
- if (unlikely(!dccp_v4_ctl_sock_initialized))
- dccp_v4_ctl_sock_initialized = 1;
- inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops;
- }
-
- return err;
-}
-
-static struct timewait_sock_ops dccp_timewait_sock_ops = {
- .twsk_obj_size = sizeof(struct inet_timewait_sock),
-};
-
-static struct proto dccp_v4_prot = {
- .name = "DCCP",
- .owner = THIS_MODULE,
- .close = dccp_close,
- .connect = dccp_v4_connect,
- .disconnect = dccp_disconnect,
- .ioctl = dccp_ioctl,
- .init = dccp_v4_init_sock,
- .setsockopt = dccp_setsockopt,
- .getsockopt = dccp_getsockopt,
- .sendmsg = dccp_sendmsg,
- .recvmsg = dccp_recvmsg,
- .backlog_rcv = dccp_v4_do_rcv,
- .hash = inet_hash,
- .unhash = inet_unhash,
- .accept = inet_csk_accept,
- .get_port = inet_csk_get_port,
- .shutdown = dccp_shutdown,
- .destroy = dccp_destroy_sock,
- .orphan_count = &dccp_orphan_count,
- .max_header = MAX_DCCP_HEADER,
- .obj_size = sizeof(struct dccp_sock),
- .slab_flags = SLAB_TYPESAFE_BY_RCU,
- .rsk_prot = &dccp_request_sock_ops,
- .twsk_prot = &dccp_timewait_sock_ops,
- .h.hashinfo = &dccp_hashinfo,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_dccp_setsockopt,
- .compat_getsockopt = compat_dccp_getsockopt,
-#endif
-};
-
-static const struct net_protocol dccp_v4_protocol = {
- .handler = dccp_v4_rcv,
- .err_handler = dccp_v4_err,
- .no_policy = 1,
- .netns_ok = 1,
- .icmp_strict_tag_validation = 1,
-};
-
-static const struct proto_ops inet_dccp_ops = {
- .family = PF_INET,
- .owner = THIS_MODULE,
- .release = inet_release,
- .bind = inet_bind,
- .connect = inet_stream_connect,
- .socketpair = sock_no_socketpair,
- .accept = inet_accept,
- .getname = inet_getname,
- /* FIXME: work on tcp_poll to rename it to inet_csk_poll */
- .poll = dccp_poll,
- .ioctl = inet_ioctl,
- /* FIXME: work on inet_listen to rename it to sock_common_listen */
- .listen = inet_dccp_listen,
- .shutdown = inet_shutdown,
- .setsockopt = sock_common_setsockopt,
- .getsockopt = sock_common_getsockopt,
- .sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
- .mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
-#endif
-};
-
-static struct inet_protosw dccp_v4_protosw = {
- .type = SOCK_DCCP,
- .protocol = IPPROTO_DCCP,
- .prot = &dccp_v4_prot,
- .ops = &inet_dccp_ops,
- .flags = INET_PROTOSW_ICSK,
-};
-
-static int __net_init dccp_v4_init_net(struct net *net)
-{
- if (dccp_hashinfo.bhash == NULL)
- return -ESOCKTNOSUPPORT;
-
- return inet_ctl_sock_create(&net->dccp.v4_ctl_sk, PF_INET,
- SOCK_DCCP, IPPROTO_DCCP, net);
-}
-
-static void __net_exit dccp_v4_exit_net(struct net *net)
-{
- inet_ctl_sock_destroy(net->dccp.v4_ctl_sk);
-}
-
-static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list)
-{
- inet_twsk_purge(&dccp_hashinfo, AF_INET);
-}
-
-static struct pernet_operations dccp_v4_ops = {
- .init = dccp_v4_init_net,
- .exit = dccp_v4_exit_net,
- .exit_batch = dccp_v4_exit_batch,
-};
-
-static int __init dccp_v4_init(void)
-{
- int err = proto_register(&dccp_v4_prot, 1);
-
- if (err)
- goto out;
-
- inet_register_protosw(&dccp_v4_protosw);
-
- err = register_pernet_subsys(&dccp_v4_ops);
- if (err)
- goto out_destroy_ctl_sock;
-
- err = inet_add_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
- if (err)
- goto out_proto_unregister;
-
-out:
- return err;
-out_proto_unregister:
- unregister_pernet_subsys(&dccp_v4_ops);
-out_destroy_ctl_sock:
- inet_unregister_protosw(&dccp_v4_protosw);
- proto_unregister(&dccp_v4_prot);
- goto out;
-}
-
-static void __exit dccp_v4_exit(void)
-{
- inet_del_protocol(&dccp_v4_protocol, IPPROTO_DCCP);
- unregister_pernet_subsys(&dccp_v4_ops);
- inet_unregister_protosw(&dccp_v4_protosw);
- proto_unregister(&dccp_v4_prot);
-}
-
-module_init(dccp_v4_init);
-module_exit(dccp_v4_exit);
-
-/*
- * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
- * values directly, Also cover the case where the protocol is not specified,
- * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
- */
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 33, 6);
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 0, 6);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
-MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
deleted file mode 100644
index 6344f1b18a6a..000000000000
--- a/net/dccp/ipv6.c
+++ /dev/null
@@ -1,1169 +0,0 @@
-/*
- * DCCP over IPv6
- * Linux INET6 implementation
- *
- * Based on net/dccp6/ipv6.c
- *
- * Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include <linux/xfrm.h>
-#include <linux/string.h>
-
-#include <net/addrconf.h>
-#include <net/inet_common.h>
-#include <net/inet_hashtables.h>
-#include <net/inet_sock.h>
-#include <net/inet6_connection_sock.h>
-#include <net/inet6_hashtables.h>
-#include <net/ip6_route.h>
-#include <net/ipv6.h>
-#include <net/protocol.h>
-#include <net/transp_v6.h>
-#include <net/ip6_checksum.h>
-#include <net/xfrm.h>
-#include <net/secure_seq.h>
-#include <net/sock.h>
-
-#include "dccp.h"
-#include "ipv6.h"
-#include "feat.h"
-
-/* The per-net dccp.v6_ctl_sk is used for sending RSTs and ACKs */
-
-static const struct inet_connection_sock_af_ops dccp_ipv6_mapped;
-static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops;
-
-/* add pseudo-header to DCCP checksum stored in skb->csum */
-static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb,
- const struct in6_addr *saddr,
- const struct in6_addr *daddr)
-{
- return csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_DCCP, skb->csum);
-}
-
-static inline void dccp_v6_send_check(struct sock *sk, struct sk_buff *skb)
-{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct dccp_hdr *dh = dccp_hdr(skb);
-
- dccp_csum_outgoing(skb);
- dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &sk->sk_v6_daddr);
-}
-
-static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb)
-{
- return secure_dccpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32,
- ipv6_hdr(skb)->saddr.s6_addr32,
- dccp_hdr(skb)->dccph_dport,
- dccp_hdr(skb)->dccph_sport );
-
-}
-
-static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- u8 type, u8 code, int offset, __be32 info)
-{
- const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
- const struct dccp_hdr *dh;
- struct dccp_sock *dp;
- struct ipv6_pinfo *np;
- struct sock *sk;
- int err;
- __u64 seq;
- struct net *net = dev_net(skb->dev);
-
- /* Only need dccph_dport & dccph_sport which are the first
- * 4 bytes in dccp header.
- * Our caller (icmpv6_notify()) already pulled 8 bytes for us.
- */
- BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_sport) > 8);
- BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_dport) > 8);
- dh = (struct dccp_hdr *)(skb->data + offset);
-
- sk = __inet6_lookup_established(net, &dccp_hashinfo,
- &hdr->daddr, dh->dccph_dport,
- &hdr->saddr, ntohs(dh->dccph_sport),
- inet6_iif(skb), 0);
-
- if (!sk) {
- __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
- ICMP6_MIB_INERRORS);
- return;
- }
-
- if (sk->sk_state == DCCP_TIME_WAIT) {
- inet_twsk_put(inet_twsk(sk));
- return;
- }
- seq = dccp_hdr_seq(dh);
- if (sk->sk_state == DCCP_NEW_SYN_RECV)
- return dccp_req_err(sk, seq);
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk))
- __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
-
- if (sk->sk_state == DCCP_CLOSED)
- goto out;
-
- dp = dccp_sk(sk);
- if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) &&
- !between48(seq, dp->dccps_awl, dp->dccps_awh)) {
- __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
- goto out;
- }
-
- np = inet6_sk(sk);
-
- if (type == NDISC_REDIRECT) {
- if (!sock_owned_by_user(sk)) {
- struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
-
- if (dst)
- dst->ops->redirect(dst, sk, skb);
- }
- goto out;
- }
-
- if (type == ICMPV6_PKT_TOOBIG) {
- struct dst_entry *dst = NULL;
-
- if (!ip6_sk_accept_pmtu(sk))
- goto out;
-
- if (sock_owned_by_user(sk))
- goto out;
- if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED))
- goto out;
-
- dst = inet6_csk_update_pmtu(sk, ntohl(info));
- if (!dst)
- goto out;
-
- if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst))
- dccp_sync_mss(sk, dst_mtu(dst));
- goto out;
- }
-
- icmpv6_err_convert(type, code, &err);
-
- /* Might be for an request_sock */
- switch (sk->sk_state) {
- case DCCP_REQUESTING:
- case DCCP_RESPOND: /* Cannot happen.
- It can, it SYNs are crossed. --ANK */
- if (!sock_owned_by_user(sk)) {
- __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
- sk->sk_err = err;
- /*
- * Wake people up to see the error
- * (see connect in sock.c)
- */
- sk->sk_error_report(sk);
- dccp_done(sk);
- } else
- sk->sk_err_soft = err;
- goto out;
- }
-
- if (!sock_owned_by_user(sk) && np->recverr) {
- sk->sk_err = err;
- sk->sk_error_report(sk);
- } else
- sk->sk_err_soft = err;
-
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-
-static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req)
-{
- struct inet_request_sock *ireq = inet_rsk(req);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct sk_buff *skb;
- struct in6_addr *final_p, final;
- struct flowi6 fl6;
- int err = -1;
- struct dst_entry *dst;
-
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_DCCP;
- fl6.daddr = ireq->ir_v6_rmt_addr;
- fl6.saddr = ireq->ir_v6_loc_addr;
- fl6.flowlabel = 0;
- fl6.flowi6_oif = ireq->ir_iif;
- fl6.fl6_dport = ireq->ir_rmt_port;
- fl6.fl6_sport = htons(ireq->ir_num);
- security_req_classify_flow(req, flowi6_to_flowi(&fl6));
-
-
- rcu_read_lock();
- final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final);
- rcu_read_unlock();
-
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
- if (IS_ERR(dst)) {
- err = PTR_ERR(dst);
- dst = NULL;
- goto done;
- }
-
- skb = dccp_make_response(sk, dst, req);
- if (skb != NULL) {
- struct dccp_hdr *dh = dccp_hdr(skb);
- struct ipv6_txoptions *opt;
-
- dh->dccph_checksum = dccp_v6_csum_finish(skb,
- &ireq->ir_v6_loc_addr,
- &ireq->ir_v6_rmt_addr);
- fl6.daddr = ireq->ir_v6_rmt_addr;
- rcu_read_lock();
- opt = ireq->ipv6_opt;
- if (!opt)
- opt = rcu_dereference(np->opt);
- err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass);
- rcu_read_unlock();
- err = net_xmit_eval(err);
- }
-
-done:
- dst_release(dst);
- return err;
-}
-
-static void dccp_v6_reqsk_destructor(struct request_sock *req)
-{
- dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
- kfree(inet_rsk(req)->ipv6_opt);
- kfree_skb(inet_rsk(req)->pktopts);
-}
-
-static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
-{
- const struct ipv6hdr *rxip6h;
- struct sk_buff *skb;
- struct flowi6 fl6;
- struct net *net = dev_net(skb_dst(rxskb)->dev);
- struct sock *ctl_sk = net->dccp.v6_ctl_sk;
- struct dst_entry *dst;
-
- if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET)
- return;
-
- if (!ipv6_unicast_destination(rxskb))
- return;
-
- skb = dccp_ctl_make_reset(ctl_sk, rxskb);
- if (skb == NULL)
- return;
-
- rxip6h = ipv6_hdr(rxskb);
- dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr,
- &rxip6h->daddr);
-
- memset(&fl6, 0, sizeof(fl6));
- fl6.daddr = rxip6h->saddr;
- fl6.saddr = rxip6h->daddr;
-
- fl6.flowi6_proto = IPPROTO_DCCP;
- fl6.flowi6_oif = inet6_iif(rxskb);
- fl6.fl6_dport = dccp_hdr(skb)->dccph_dport;
- fl6.fl6_sport = dccp_hdr(skb)->dccph_sport;
- security_skb_classify_flow(rxskb, flowi6_to_flowi(&fl6));
-
- /* sk = NULL, but it is safe for now. RST socket required. */
- dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
- if (!IS_ERR(dst)) {
- skb_dst_set(skb, dst);
- ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0);
- DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
- DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
- return;
- }
-
- kfree_skb(skb);
-}
-
-static struct request_sock_ops dccp6_request_sock_ops = {
- .family = AF_INET6,
- .obj_size = sizeof(struct dccp6_request_sock),
- .rtx_syn_ack = dccp_v6_send_response,
- .send_ack = dccp_reqsk_send_ack,
- .destructor = dccp_v6_reqsk_destructor,
- .send_reset = dccp_v6_ctl_send_reset,
- .syn_ack_timeout = dccp_syn_ack_timeout,
-};
-
-static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
-{
- struct request_sock *req;
- struct dccp_request_sock *dreq;
- struct inet_request_sock *ireq;
- struct ipv6_pinfo *np = inet6_sk(sk);
- const __be32 service = dccp_hdr_request(skb)->dccph_req_service;
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
-
- if (skb->protocol == htons(ETH_P_IP))
- return dccp_v4_conn_request(sk, skb);
-
- if (!ipv6_unicast_destination(skb))
- return 0; /* discard, don't send a reset here */
-
- if (dccp_bad_service_code(sk, service)) {
- dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE;
- goto drop;
- }
- /*
- * There are no SYN attacks on IPv6, yet...
- */
- dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
- if (inet_csk_reqsk_queue_is_full(sk))
- goto drop;
-
- if (sk_acceptq_is_full(sk))
- goto drop;
-
- req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true);
- if (req == NULL)
- goto drop;
-
- if (dccp_reqsk_init(req, dccp_sk(sk), skb))
- goto drop_and_free;
-
- dreq = dccp_rsk(req);
- if (dccp_parse_options(sk, dreq, skb))
- goto drop_and_free;
-
- if (security_inet_conn_request(sk, skb, req))
- goto drop_and_free;
-
- ireq = inet_rsk(req);
- ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
- ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
- ireq->ireq_family = AF_INET6;
- ireq->ir_mark = inet_request_mark(sk, skb);
-
- if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) ||
- np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
- np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
- refcount_inc(&skb->users);
- ireq->pktopts = skb;
- }
- ireq->ir_iif = sk->sk_bound_dev_if;
-
- /* So that link locals have meaning */
- if (!sk->sk_bound_dev_if &&
- ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
- ireq->ir_iif = inet6_iif(skb);
-
- /*
- * Step 3: Process LISTEN state
- *
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie
- *
- * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child().
- */
- dreq->dreq_isr = dcb->dccpd_seq;
- dreq->dreq_gsr = dreq->dreq_isr;
- dreq->dreq_iss = dccp_v6_init_sequence(skb);
- dreq->dreq_gss = dreq->dreq_iss;
- dreq->dreq_service = service;
-
- if (dccp_v6_send_response(sk, req))
- goto drop_and_free;
-
- inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
- reqsk_put(req);
- return 0;
-
-drop_and_free:
- reqsk_free(req);
-drop:
- __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS);
- return -1;
-}
-
-static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
- struct sk_buff *skb,
- struct request_sock *req,
- struct dst_entry *dst,
- struct request_sock *req_unhash,
- bool *own_req)
-{
- struct inet_request_sock *ireq = inet_rsk(req);
- struct ipv6_pinfo *newnp;
- const struct ipv6_pinfo *np = inet6_sk(sk);
- struct ipv6_txoptions *opt;
- struct inet_sock *newinet;
- struct dccp6_sock *newdp6;
- struct sock *newsk;
-
- if (skb->protocol == htons(ETH_P_IP)) {
- /*
- * v6 mapped
- */
- newsk = dccp_v4_request_recv_sock(sk, skb, req, dst,
- req_unhash, own_req);
- if (newsk == NULL)
- return NULL;
-
- newdp6 = (struct dccp6_sock *)newsk;
- newinet = inet_sk(newsk);
- newinet->pinet6 = &newdp6->inet6;
- newnp = inet6_sk(newsk);
-
- memcpy(newnp, np, sizeof(struct ipv6_pinfo));
-
- newnp->saddr = newsk->sk_v6_rcv_saddr;
-
- inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped;
- newsk->sk_backlog_rcv = dccp_v4_do_rcv;
- newnp->pktoptions = NULL;
- newnp->opt = NULL;
- newnp->ipv6_mc_list = NULL;
- newnp->ipv6_ac_list = NULL;
- newnp->ipv6_fl_list = NULL;
- newnp->mcast_oif = inet6_iif(skb);
- newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
-
- /*
- * No need to charge this sock to the relevant IPv6 refcnt debug socks count
- * here, dccp_create_openreq_child now does this for us, see the comment in
- * that function for the gory details. -acme
- */
-
- /* It is tricky place. Until this moment IPv4 tcp
- worked with IPv6 icsk.icsk_af_ops.
- Sync it now.
- */
- dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie);
-
- return newsk;
- }
-
-
- if (sk_acceptq_is_full(sk))
- goto out_overflow;
-
- if (!dst) {
- struct flowi6 fl6;
-
- dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_DCCP);
- if (!dst)
- goto out;
- }
-
- newsk = dccp_create_openreq_child(sk, req, skb);
- if (newsk == NULL)
- goto out_nonewsk;
-
- /*
- * No need to charge this sock to the relevant IPv6 refcnt debug socks
- * count here, dccp_create_openreq_child now does this for us, see the
- * comment in that function for the gory details. -acme
- */
-
- ip6_dst_store(newsk, dst, NULL, NULL);
- newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
- NETIF_F_TSO);
- newdp6 = (struct dccp6_sock *)newsk;
- newinet = inet_sk(newsk);
- newinet->pinet6 = &newdp6->inet6;
- newnp = inet6_sk(newsk);
-
- memcpy(newnp, np, sizeof(struct ipv6_pinfo));
-
- newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr;
- newnp->saddr = ireq->ir_v6_loc_addr;
- newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
- newsk->sk_bound_dev_if = ireq->ir_iif;
-
- /* Now IPv6 options...
-
- First: no IPv4 options.
- */
- newinet->inet_opt = NULL;
-
- /* Clone RX bits */
- newnp->rxopt.all = np->rxopt.all;
-
- newnp->ipv6_mc_list = NULL;
- newnp->ipv6_ac_list = NULL;
- newnp->ipv6_fl_list = NULL;
- newnp->pktoptions = NULL;
- newnp->opt = NULL;
- newnp->mcast_oif = inet6_iif(skb);
- newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
-
- /*
- * Clone native IPv6 options from listening socket (if any)
- *
- * Yes, keeping reference count would be much more clever, but we make
- * one more one thing there: reattach optmem to newsk.
- */
- opt = ireq->ipv6_opt;
- if (!opt)
- opt = rcu_dereference(np->opt);
- if (opt) {
- opt = ipv6_dup_options(newsk, opt);
- RCU_INIT_POINTER(newnp->opt, opt);
- }
- inet_csk(newsk)->icsk_ext_hdr_len = 0;
- if (opt)
- inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen +
- opt->opt_flen;
-
- dccp_sync_mss(newsk, dst_mtu(dst));
-
- newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
- newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
-
- if (__inet_inherit_port(sk, newsk) < 0) {
- inet_csk_prepare_forced_close(newsk);
- dccp_done(newsk);
- goto out;
- }
- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
- /* Clone pktoptions received with SYN, if we own the req */
- if (*own_req && ireq->pktopts) {
- newnp->pktoptions = skb_clone(ireq->pktopts, GFP_ATOMIC);
- consume_skb(ireq->pktopts);
- ireq->pktopts = NULL;
- if (newnp->pktoptions)
- skb_set_owner_r(newnp->pktoptions, newsk);
- }
-
- return newsk;
-
-out_overflow:
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
-out_nonewsk:
- dst_release(dst);
-out:
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
- return NULL;
-}
-
-/* The socket must have it's spinlock held when we get
- * here.
- *
- * We have a potential double-lock case here, so even when
- * doing backlog processing we use the BH locking scheme.
- * This is because we cannot sleep with the original spinlock
- * held.
- */
-static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
-{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct sk_buff *opt_skb = NULL;
-
- /* Imagine: socket is IPv6. IPv4 packet arrives,
- goes to IPv4 receive handler and backlogged.
- From backlog it always goes here. Kerboom...
- Fortunately, dccp_rcv_established and rcv_established
- handle them correctly, but it is not case with
- dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK
- */
-
- if (skb->protocol == htons(ETH_P_IP))
- return dccp_v4_do_rcv(sk, skb);
-
- if (sk_filter(sk, skb))
- goto discard;
-
- /*
- * socket locking is here for SMP purposes as backlog rcv is currently
- * called with bh processing disabled.
- */
-
- /* Do Stevens' IPV6_PKTOPTIONS.
-
- Yes, guys, it is the only place in our code, where we
- may make it not affecting IPv4.
- The rest of code is protocol independent,
- and I do not like idea to uglify IPv4.
-
- Actually, all the idea behind IPV6_PKTOPTIONS
- looks not very well thought. For now we latch
- options, received in the last packet, enqueued
- by tcp. Feel free to propose better solution.
- --ANK (980728)
- */
- if (np->rxopt.all)
- opt_skb = skb_clone(skb, GFP_ATOMIC);
-
- if (sk->sk_state == DCCP_OPEN) { /* Fast path */
- if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len))
- goto reset;
- if (opt_skb)
- goto ipv6_pktoptions;
- return 0;
- }
-
- /*
- * Step 3: Process LISTEN state
- * If S.state == LISTEN,
- * If P.type == Request or P contains a valid Init Cookie option,
- * (* Must scan the packet's options to check for Init
- * Cookies. Only Init Cookies are processed here,
- * however; other options are processed in Step 8. This
- * scan need only be performed if the endpoint uses Init
- * Cookies *)
- * (* Generate a new socket and switch to that socket *)
- * Set S := new socket for this port pair
- * S.state = RESPOND
- * Choose S.ISS (initial seqno) or set from Init Cookies
- * Initialize S.GAR := S.ISS
- * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies
- * Continue with S.state == RESPOND
- * (* A Response packet will be generated in Step 11 *)
- * Otherwise,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- *
- * NOTE: the check for the packet types is done in
- * dccp_rcv_state_process
- */
-
- if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len))
- goto reset;
- if (opt_skb)
- goto ipv6_pktoptions;
- return 0;
-
-reset:
- dccp_v6_ctl_send_reset(sk, skb);
-discard:
- if (opt_skb != NULL)
- __kfree_skb(opt_skb);
- kfree_skb(skb);
- return 0;
-
-/* Handling IPV6_PKTOPTIONS skb the similar
- * way it's done for net/ipv6/tcp_ipv6.c
- */
-ipv6_pktoptions:
- if (!((1 << sk->sk_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) {
- if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
- np->mcast_oif = inet6_iif(opt_skb);
- if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
- np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit;
- if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
- np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
- if (np->repflow)
- np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
- if (ipv6_opt_accepted(sk, opt_skb,
- &DCCP_SKB_CB(opt_skb)->header.h6)) {
- skb_set_owner_r(opt_skb, sk);
- memmove(IP6CB(opt_skb),
- &DCCP_SKB_CB(opt_skb)->header.h6,
- sizeof(struct inet6_skb_parm));
- opt_skb = xchg(&np->pktoptions, opt_skb);
- } else {
- __kfree_skb(opt_skb);
- opt_skb = xchg(&np->pktoptions, NULL);
- }
- }
-
- kfree_skb(opt_skb);
- return 0;
-}
-
-static int dccp_v6_rcv(struct sk_buff *skb)
-{
- const struct dccp_hdr *dh;
- bool refcounted;
- struct sock *sk;
- int min_cov;
-
- /* Step 1: Check header basics */
-
- if (dccp_invalid_packet(skb))
- goto discard_it;
-
- /* Step 1: If header checksum is incorrect, drop packet and return. */
- if (dccp_v6_csum_finish(skb, &ipv6_hdr(skb)->saddr,
- &ipv6_hdr(skb)->daddr)) {
- DCCP_WARN("dropped packet with invalid checksum\n");
- goto discard_it;
- }
-
- dh = dccp_hdr(skb);
-
- DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh);
- DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type;
-
- if (dccp_packet_without_ack(skb))
- DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ;
- else
- DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
-
-lookup:
- sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh),
- dh->dccph_sport, dh->dccph_dport,
- inet6_iif(skb), 0, &refcounted);
- if (!sk) {
- dccp_pr_debug("failed to look up flow ID in table and "
- "get corresponding socket\n");
- goto no_dccp_socket;
- }
-
- /*
- * Step 2:
- * ... or S.state == TIMEWAIT,
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- */
- if (sk->sk_state == DCCP_TIME_WAIT) {
- dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n");
- inet_twsk_put(inet_twsk(sk));
- goto no_dccp_socket;
- }
-
- if (sk->sk_state == DCCP_NEW_SYN_RECV) {
- struct request_sock *req = inet_reqsk(sk);
- struct sock *nsk;
-
- sk = req->rsk_listener;
- if (unlikely(sk->sk_state != DCCP_LISTEN)) {
- inet_csk_reqsk_queue_drop_and_put(sk, req);
- goto lookup;
- }
- sock_hold(sk);
- refcounted = true;
- nsk = dccp_check_req(sk, skb, req);
- if (!nsk) {
- reqsk_put(req);
- goto discard_and_relse;
- }
- if (nsk == sk) {
- reqsk_put(req);
- } else if (dccp_child_process(sk, nsk, skb)) {
- dccp_v6_ctl_send_reset(sk, skb);
- goto discard_and_relse;
- } else {
- sock_put(sk);
- return 0;
- }
- }
- /*
- * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
- * o if MinCsCov = 0, only packets with CsCov = 0 are accepted
- * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov
- */
- min_cov = dccp_sk(sk)->dccps_pcrlen;
- if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) {
- dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n",
- dh->dccph_cscov, min_cov);
- /* FIXME: send Data Dropped option (see also dccp_v4_rcv) */
- goto discard_and_relse;
- }
-
- if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
- goto discard_and_relse;
-
- return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4,
- refcounted) ? -1 : 0;
-
-no_dccp_socket:
- if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
- goto discard_it;
- /*
- * Step 2:
- * If no socket ...
- * Generate Reset(No Connection) unless P.type == Reset
- * Drop packet and return
- */
- if (dh->dccph_type != DCCP_PKT_RESET) {
- DCCP_SKB_CB(skb)->dccpd_reset_code =
- DCCP_RESET_CODE_NO_CONNECTION;
- dccp_v6_ctl_send_reset(sk, skb);
- }
-
-discard_it:
- kfree_skb(skb);
- return 0;
-
-discard_and_relse:
- if (refcounted)
- sock_put(sk);
- goto discard_it;
-}
-
-static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
- int addr_len)
-{
- struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr;
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_sock *inet = inet_sk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- struct in6_addr *saddr = NULL, *final_p, final;
- struct ipv6_txoptions *opt;
- struct flowi6 fl6;
- struct dst_entry *dst;
- int addr_type;
- int err;
-
- dp->dccps_role = DCCP_ROLE_CLIENT;
-
- if (addr_len < SIN6_LEN_RFC2133)
- return -EINVAL;
-
- if (usin->sin6_family != AF_INET6)
- return -EAFNOSUPPORT;
-
- memset(&fl6, 0, sizeof(fl6));
-
- if (np->sndflow) {
- fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
- IP6_ECN_flow_init(fl6.flowlabel);
- if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
- struct ip6_flowlabel *flowlabel;
- flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (flowlabel == NULL)
- return -EINVAL;
- fl6_sock_release(flowlabel);
- }
- }
- /*
- * connect() to INADDR_ANY means loopback (BSD'ism).
- */
- if (ipv6_addr_any(&usin->sin6_addr))
- usin->sin6_addr.s6_addr[15] = 1;
-
- addr_type = ipv6_addr_type(&usin->sin6_addr);
-
- if (addr_type & IPV6_ADDR_MULTICAST)
- return -ENETUNREACH;
-
- if (addr_type & IPV6_ADDR_LINKLOCAL) {
- if (addr_len >= sizeof(struct sockaddr_in6) &&
- usin->sin6_scope_id) {
- /* If interface is set while binding, indices
- * must coincide.
- */
- if (sk->sk_bound_dev_if &&
- sk->sk_bound_dev_if != usin->sin6_scope_id)
- return -EINVAL;
-
- sk->sk_bound_dev_if = usin->sin6_scope_id;
- }
-
- /* Connect to link-local address requires an interface */
- if (!sk->sk_bound_dev_if)
- return -EINVAL;
- }
-
- sk->sk_v6_daddr = usin->sin6_addr;
- np->flow_label = fl6.flowlabel;
-
- /*
- * DCCP over IPv4
- */
- if (addr_type == IPV6_ADDR_MAPPED) {
- u32 exthdrlen = icsk->icsk_ext_hdr_len;
- struct sockaddr_in sin;
-
- SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
-
- if (__ipv6_only_sock(sk))
- return -ENETUNREACH;
-
- sin.sin_family = AF_INET;
- sin.sin_port = usin->sin6_port;
- sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
-
- icsk->icsk_af_ops = &dccp_ipv6_mapped;
- sk->sk_backlog_rcv = dccp_v4_do_rcv;
-
- err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
- if (err) {
- icsk->icsk_ext_hdr_len = exthdrlen;
- icsk->icsk_af_ops = &dccp_ipv6_af_ops;
- sk->sk_backlog_rcv = dccp_v6_do_rcv;
- goto failure;
- }
- np->saddr = sk->sk_v6_rcv_saddr;
- return err;
- }
-
- if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr))
- saddr = &sk->sk_v6_rcv_saddr;
-
- fl6.flowi6_proto = IPPROTO_DCCP;
- fl6.daddr = sk->sk_v6_daddr;
- fl6.saddr = saddr ? *saddr : np->saddr;
- fl6.flowi6_oif = sk->sk_bound_dev_if;
- fl6.fl6_dport = usin->sin6_port;
- fl6.fl6_sport = inet->inet_sport;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
-
- opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
- final_p = fl6_update_dst(&fl6, opt, &final);
-
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
- if (IS_ERR(dst)) {
- err = PTR_ERR(dst);
- goto failure;
- }
-
- if (saddr == NULL) {
- saddr = &fl6.saddr;
- sk->sk_v6_rcv_saddr = *saddr;
- }
-
- /* set the source address */
- np->saddr = *saddr;
- inet->inet_rcv_saddr = LOOPBACK4_IPV6;
-
- ip6_dst_store(sk, dst, NULL, NULL);
-
- icsk->icsk_ext_hdr_len = 0;
- if (opt)
- icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen;
-
- inet->inet_dport = usin->sin6_port;
-
- dccp_set_state(sk, DCCP_REQUESTING);
- err = inet6_hash_connect(&dccp_death_row, sk);
- if (err)
- goto late_failure;
-
- dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32,
- sk->sk_v6_daddr.s6_addr32,
- inet->inet_sport,
- inet->inet_dport);
- err = dccp_connect(sk);
- if (err)
- goto late_failure;
-
- return 0;
-
-late_failure:
- dccp_set_state(sk, DCCP_CLOSED);
- __sk_dst_reset(sk);
-failure:
- inet->inet_dport = 0;
- sk->sk_route_caps = 0;
- return err;
-}
-
-static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = {
- .queue_xmit = inet6_csk_xmit,
- .send_check = dccp_v6_send_check,
- .rebuild_header = inet6_sk_rebuild_header,
- .conn_request = dccp_v6_conn_request,
- .syn_recv_sock = dccp_v6_request_recv_sock,
- .net_header_len = sizeof(struct ipv6hdr),
- .setsockopt = ipv6_setsockopt,
- .getsockopt = ipv6_getsockopt,
- .addr2sockaddr = inet6_csk_addr2sockaddr,
- .sockaddr_len = sizeof(struct sockaddr_in6),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ipv6_setsockopt,
- .compat_getsockopt = compat_ipv6_getsockopt,
-#endif
-};
-
-/*
- * DCCP over IPv4 via INET6 API
- */
-static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = {
- .queue_xmit = ip_queue_xmit,
- .send_check = dccp_v4_send_check,
- .rebuild_header = inet_sk_rebuild_header,
- .conn_request = dccp_v6_conn_request,
- .syn_recv_sock = dccp_v6_request_recv_sock,
- .net_header_len = sizeof(struct iphdr),
- .setsockopt = ipv6_setsockopt,
- .getsockopt = ipv6_getsockopt,
- .addr2sockaddr = inet6_csk_addr2sockaddr,
- .sockaddr_len = sizeof(struct sockaddr_in6),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ipv6_setsockopt,
- .compat_getsockopt = compat_ipv6_getsockopt,
-#endif
-};
-
-/* NOTE: A lot of things set to zero explicitly by call to
- * sk_alloc() so need not be done here.
- */
-static int dccp_v6_init_sock(struct sock *sk)
-{
- static __u8 dccp_v6_ctl_sock_initialized;
- int err = dccp_init_sock(sk, dccp_v6_ctl_sock_initialized);
-
- if (err == 0) {
- if (unlikely(!dccp_v6_ctl_sock_initialized))
- dccp_v6_ctl_sock_initialized = 1;
- inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops;
- }
-
- return err;
-}
-
-static void dccp_v6_destroy_sock(struct sock *sk)
-{
- dccp_destroy_sock(sk);
- inet6_destroy_sock(sk);
-}
-
-static struct timewait_sock_ops dccp6_timewait_sock_ops = {
- .twsk_obj_size = sizeof(struct dccp6_timewait_sock),
-};
-
-static struct proto dccp_v6_prot = {
- .name = "DCCPv6",
- .owner = THIS_MODULE,
- .close = dccp_close,
- .connect = dccp_v6_connect,
- .disconnect = dccp_disconnect,
- .ioctl = dccp_ioctl,
- .init = dccp_v6_init_sock,
- .setsockopt = dccp_setsockopt,
- .getsockopt = dccp_getsockopt,
- .sendmsg = dccp_sendmsg,
- .recvmsg = dccp_recvmsg,
- .backlog_rcv = dccp_v6_do_rcv,
- .hash = inet6_hash,
- .unhash = inet_unhash,
- .accept = inet_csk_accept,
- .get_port = inet_csk_get_port,
- .shutdown = dccp_shutdown,
- .destroy = dccp_v6_destroy_sock,
- .orphan_count = &dccp_orphan_count,
- .max_header = MAX_DCCP_HEADER,
- .obj_size = sizeof(struct dccp6_sock),
- .slab_flags = SLAB_TYPESAFE_BY_RCU,
- .rsk_prot = &dccp6_request_sock_ops,
- .twsk_prot = &dccp6_timewait_sock_ops,
- .h.hashinfo = &dccp_hashinfo,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_dccp_setsockopt,
- .compat_getsockopt = compat_dccp_getsockopt,
-#endif
-};
-
-static const struct inet6_protocol dccp_v6_protocol = {
- .handler = dccp_v6_rcv,
- .err_handler = dccp_v6_err,
- .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
-};
-
-static const struct proto_ops inet6_dccp_ops = {
- .family = PF_INET6,
- .owner = THIS_MODULE,
- .release = inet6_release,
- .bind = inet6_bind,
- .connect = inet_stream_connect,
- .socketpair = sock_no_socketpair,
- .accept = inet_accept,
- .getname = inet6_getname,
- .poll = dccp_poll,
- .ioctl = inet6_ioctl,
- .listen = inet_dccp_listen,
- .shutdown = inet_shutdown,
- .setsockopt = sock_common_setsockopt,
- .getsockopt = sock_common_getsockopt,
- .sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
- .mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
-#endif
-};
-
-static struct inet_protosw dccp_v6_protosw = {
- .type = SOCK_DCCP,
- .protocol = IPPROTO_DCCP,
- .prot = &dccp_v6_prot,
- .ops = &inet6_dccp_ops,
- .flags = INET_PROTOSW_ICSK,
-};
-
-static int __net_init dccp_v6_init_net(struct net *net)
-{
- if (dccp_hashinfo.bhash == NULL)
- return -ESOCKTNOSUPPORT;
-
- return inet_ctl_sock_create(&net->dccp.v6_ctl_sk, PF_INET6,
- SOCK_DCCP, IPPROTO_DCCP, net);
-}
-
-static void __net_exit dccp_v6_exit_net(struct net *net)
-{
- inet_ctl_sock_destroy(net->dccp.v6_ctl_sk);
-}
-
-static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list)
-{
- inet_twsk_purge(&dccp_hashinfo, AF_INET6);
-}
-
-static struct pernet_operations dccp_v6_ops = {
- .init = dccp_v6_init_net,
- .exit = dccp_v6_exit_net,
- .exit_batch = dccp_v6_exit_batch,
-};
-
-static int __init dccp_v6_init(void)
-{
- int err = proto_register(&dccp_v6_prot, 1);
-
- if (err)
- goto out;
-
- inet6_register_protosw(&dccp_v6_protosw);
-
- err = register_pernet_subsys(&dccp_v6_ops);
- if (err)
- goto out_destroy_ctl_sock;
-
- err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
- if (err)
- goto out_unregister_proto;
-
-out:
- return err;
-out_unregister_proto:
- unregister_pernet_subsys(&dccp_v6_ops);
-out_destroy_ctl_sock:
- inet6_unregister_protosw(&dccp_v6_protosw);
- proto_unregister(&dccp_v6_prot);
- goto out;
-}
-
-static void __exit dccp_v6_exit(void)
-{
- inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP);
- unregister_pernet_subsys(&dccp_v6_ops);
- inet6_unregister_protosw(&dccp_v6_protosw);
- proto_unregister(&dccp_v6_prot);
-}
-
-module_init(dccp_v6_init);
-module_exit(dccp_v6_exit);
-
-/*
- * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
- * values directly, Also cover the case where the protocol is not specified,
- * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP
- */
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 33, 6);
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 0, 6);
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
-MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h
deleted file mode 100644
index af259e15e7f0..000000000000
--- a/net/dccp/ipv6.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef _DCCP_IPV6_H
-#define _DCCP_IPV6_H
-/*
- * net/dccp/ipv6.h
- *
- * An implementation of the DCCP protocol
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/dccp.h>
-#include <linux/ipv6.h>
-
-struct dccp6_sock {
- struct dccp_sock dccp;
- /*
- * ipv6_pinfo has to be the last member of dccp6_sock,
- * see inet6_sk_generic.
- */
- struct ipv6_pinfo inet6;
-};
-
-struct dccp6_request_sock {
- struct dccp_request_sock dccp;
-};
-
-struct dccp6_timewait_sock {
- struct inet_timewait_sock inet;
-};
-
-#endif /* _DCCP_IPV6_H */
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
deleted file mode 100644
index ba6fc3c1186b..000000000000
--- a/net/dccp/minisocks.c
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * net/dccp/minisocks.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/dccp.h>
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/timer.h>
-
-#include <net/sock.h>
-#include <net/xfrm.h>
-#include <net/inet_timewait_sock.h>
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-#include "feat.h"
-
-struct inet_timewait_death_row dccp_death_row = {
- .sysctl_max_tw_buckets = NR_FILE * 2,
- .hashinfo = &dccp_hashinfo,
-};
-
-EXPORT_SYMBOL_GPL(dccp_death_row);
-
-void dccp_time_wait(struct sock *sk, int state, int timeo)
-{
- struct inet_timewait_sock *tw;
-
- tw = inet_twsk_alloc(sk, &dccp_death_row, state);
-
- if (tw != NULL) {
- const struct inet_connection_sock *icsk = inet_csk(sk);
- const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
-#if IS_ENABLED(CONFIG_IPV6)
- if (tw->tw_family == PF_INET6) {
- tw->tw_v6_daddr = sk->sk_v6_daddr;
- tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
- tw->tw_ipv6only = sk->sk_ipv6only;
- }
-#endif
-
- /* Get the TIME_WAIT timeout firing. */
- if (timeo < rto)
- timeo = rto;
-
- if (state == DCCP_TIME_WAIT)
- timeo = DCCP_TIMEWAIT_LEN;
-
- /* tw_timer is pinned, so we need to make sure BH are disabled
- * in following section, otherwise timer handler could run before
- * we complete the initialization.
- */
- local_bh_disable();
- inet_twsk_schedule(tw, timeo);
- /* Linkage updates.
- * Note that access to tw after this point is illegal.
- */
- inet_twsk_hashdance(tw, sk, &dccp_hashinfo);
- local_bh_enable();
- } else {
- /* Sorry, if we're out of memory, just CLOSE this
- * socket up. We've got bigger problems than
- * non-graceful socket closings.
- */
- DCCP_WARN("time wait bucket table overflow\n");
- }
-
- dccp_done(sk);
-}
-
-struct sock *dccp_create_openreq_child(const struct sock *sk,
- const struct request_sock *req,
- const struct sk_buff *skb)
-{
- /*
- * Step 3: Process LISTEN state
- *
- * (* Generate a new socket and switch to that socket *)
- * Set S := new socket for this port pair
- */
- struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
-
- if (newsk != NULL) {
- struct dccp_request_sock *dreq = dccp_rsk(req);
- struct inet_connection_sock *newicsk = inet_csk(newsk);
- struct dccp_sock *newdp = dccp_sk(newsk);
-
- newdp->dccps_role = DCCP_ROLE_SERVER;
- newdp->dccps_hc_rx_ackvec = NULL;
- newdp->dccps_service_list = NULL;
- newdp->dccps_service = dreq->dreq_service;
- newdp->dccps_timestamp_echo = dreq->dreq_timestamp_echo;
- newdp->dccps_timestamp_time = dreq->dreq_timestamp_time;
- newicsk->icsk_rto = DCCP_TIMEOUT_INIT;
-
- INIT_LIST_HEAD(&newdp->dccps_featneg);
- /*
- * Step 3: Process LISTEN state
- *
- * Choose S.ISS (initial seqno) or set from Init Cookies
- * Initialize S.GAR := S.ISS
- * Set S.ISR, S.GSR from packet (or Init Cookies)
- *
- * Setting AWL/AWH and SWL/SWH happens as part of the feature
- * activation below, as these windows all depend on the local
- * and remote Sequence Window feature values (7.5.2).
- */
- newdp->dccps_iss = dreq->dreq_iss;
- newdp->dccps_gss = dreq->dreq_gss;
- newdp->dccps_gar = newdp->dccps_iss;
- newdp->dccps_isr = dreq->dreq_isr;
- newdp->dccps_gsr = dreq->dreq_gsr;
-
- /*
- * Activate features: initialise CCIDs, sequence windows etc.
- */
- if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) {
- sk_free_unlock_clone(newsk);
- return NULL;
- }
- dccp_init_xmit_timers(newsk);
-
- __DCCP_INC_STATS(DCCP_MIB_PASSIVEOPENS);
- }
- return newsk;
-}
-
-EXPORT_SYMBOL_GPL(dccp_create_openreq_child);
-
-/*
- * Process an incoming packet for RESPOND sockets represented
- * as an request_sock.
- */
-struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
- struct request_sock *req)
-{
- struct sock *child = NULL;
- struct dccp_request_sock *dreq = dccp_rsk(req);
- bool own_req;
-
- /* TCP/DCCP listeners became lockless.
- * DCCP stores complex state in its request_sock, so we need
- * a protection for them, now this code runs without being protected
- * by the parent (listener) lock.
- */
- spin_lock_bh(&dreq->dreq_lock);
-
- /* Check for retransmitted REQUEST */
- if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
-
- if (after48(DCCP_SKB_CB(skb)->dccpd_seq, dreq->dreq_gsr)) {
- dccp_pr_debug("Retransmitted REQUEST\n");
- dreq->dreq_gsr = DCCP_SKB_CB(skb)->dccpd_seq;
- /*
- * Send another RESPONSE packet
- * To protect against Request floods, increment retrans
- * counter (backoff, monitored by dccp_response_timer).
- */
- inet_rtx_syn_ack(sk, req);
- }
- /* Network Duplicate, discard packet */
- goto out;
- }
-
- DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
-
- if (dccp_hdr(skb)->dccph_type != DCCP_PKT_ACK &&
- dccp_hdr(skb)->dccph_type != DCCP_PKT_DATAACK)
- goto drop;
-
- /* Invalid ACK */
- if (!between48(DCCP_SKB_CB(skb)->dccpd_ack_seq,
- dreq->dreq_iss, dreq->dreq_gss)) {
- dccp_pr_debug("Invalid ACK number: ack_seq=%llu, "
- "dreq_iss=%llu, dreq_gss=%llu\n",
- (unsigned long long)
- DCCP_SKB_CB(skb)->dccpd_ack_seq,
- (unsigned long long) dreq->dreq_iss,
- (unsigned long long) dreq->dreq_gss);
- goto drop;
- }
-
- if (dccp_parse_options(sk, dreq, skb))
- goto drop;
-
- child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
- req, &own_req);
- if (child) {
- child = inet_csk_complete_hashdance(sk, child, req, own_req);
- goto out;
- }
-
- DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
-drop:
- if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
- req->rsk_ops->send_reset(sk, skb);
-
- inet_csk_reqsk_queue_drop(sk, req);
-out:
- spin_unlock_bh(&dreq->dreq_lock);
- return child;
-}
-
-EXPORT_SYMBOL_GPL(dccp_check_req);
-
-/*
- * Queue segment on the new socket if the new socket is active,
- * otherwise we just shortcircuit this and continue with
- * the new socket.
- */
-int dccp_child_process(struct sock *parent, struct sock *child,
- struct sk_buff *skb)
-{
- int ret = 0;
- const int state = child->sk_state;
-
- if (!sock_owned_by_user(child)) {
- ret = dccp_rcv_state_process(child, skb, dccp_hdr(skb),
- skb->len);
-
- /* Wakeup parent, send SIGIO */
- if (state == DCCP_RESPOND && child->sk_state != state)
- parent->sk_data_ready(parent);
- } else {
- /* Alas, it is possible again, because we do lookup
- * in main socket hash table and lock on listening
- * socket does not protect us more.
- */
- __sk_add_backlog(child, skb);
- }
-
- bh_unlock_sock(child);
- sock_put(child);
- return ret;
-}
-
-EXPORT_SYMBOL_GPL(dccp_child_process);
-
-void dccp_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
- struct request_sock *rsk)
-{
- DCCP_BUG("DCCP-ACK packets are never sent in LISTEN/RESPOND state");
-}
-
-EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack);
-
-int dccp_reqsk_init(struct request_sock *req,
- struct dccp_sock const *dp, struct sk_buff const *skb)
-{
- struct dccp_request_sock *dreq = dccp_rsk(req);
-
- spin_lock_init(&dreq->dreq_lock);
- inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport;
- inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport);
- inet_rsk(req)->acked = 0;
- dreq->dreq_timestamp_echo = 0;
-
- /* inherit feature negotiation options from listening socket */
- return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg);
-}
-
-EXPORT_SYMBOL_GPL(dccp_reqsk_init);
diff --git a/net/dccp/options.c b/net/dccp/options.c
deleted file mode 100644
index 4e40db017e19..000000000000
--- a/net/dccp/options.c
+++ /dev/null
@@ -1,609 +0,0 @@
-/*
- * net/dccp/options.c
- *
- * An implementation of the DCCP protocol
- * Copyright (c) 2005 Aristeu Sergio Rozanski Filho <aris@cathedrallabs.org>
- * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
- * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-#include <linux/dccp.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <asm/unaligned.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-#include "feat.h"
-
-u64 dccp_decode_value_var(const u8 *bf, const u8 len)
-{
- u64 value = 0;
-
- if (len >= DCCP_OPTVAL_MAXLEN)
- value += ((u64)*bf++) << 40;
- if (len > 4)
- value += ((u64)*bf++) << 32;
- if (len > 3)
- value += ((u64)*bf++) << 24;
- if (len > 2)
- value += ((u64)*bf++) << 16;
- if (len > 1)
- value += ((u64)*bf++) << 8;
- if (len > 0)
- value += *bf;
-
- return value;
-}
-
-/**
- * dccp_parse_options - Parse DCCP options present in @skb
- * @sk: client|server|listening dccp socket (when @dreq != NULL)
- * @dreq: request socket to use during connection setup, or NULL
- */
-int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
- struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- const struct dccp_hdr *dh = dccp_hdr(skb);
- const u8 pkt_type = DCCP_SKB_CB(skb)->dccpd_type;
- unsigned char *options = (unsigned char *)dh + dccp_hdr_len(skb);
- unsigned char *opt_ptr = options;
- const unsigned char *opt_end = (unsigned char *)dh +
- (dh->dccph_doff * 4);
- struct dccp_options_received *opt_recv = &dp->dccps_options_received;
- unsigned char opt, len;
- unsigned char *uninitialized_var(value);
- u32 elapsed_time;
- __be32 opt_val;
- int rc;
- int mandatory = 0;
-
- memset(opt_recv, 0, sizeof(*opt_recv));
-
- opt = len = 0;
- while (opt_ptr != opt_end) {
- opt = *opt_ptr++;
- len = 0;
- value = NULL;
-
- /* Check if this isn't a single byte option */
- if (opt > DCCPO_MAX_RESERVED) {
- if (opt_ptr == opt_end)
- goto out_nonsensical_length;
-
- len = *opt_ptr++;
- if (len < 2)
- goto out_nonsensical_length;
- /*
- * Remove the type and len fields, leaving
- * just the value size
- */
- len -= 2;
- value = opt_ptr;
- opt_ptr += len;
-
- if (opt_ptr > opt_end)
- goto out_nonsensical_length;
- }
-
- /*
- * CCID-specific options are ignored during connection setup, as
- * negotiation may still be in progress (see RFC 4340, 10.3).
- * The same applies to Ack Vectors, as these depend on the CCID.
- */
- if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC ||
- opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1))
- goto ignore_option;
-
- switch (opt) {
- case DCCPO_PADDING:
- break;
- case DCCPO_MANDATORY:
- if (mandatory)
- goto out_invalid_option;
- if (pkt_type != DCCP_PKT_DATA)
- mandatory = 1;
- break;
- case DCCPO_NDP_COUNT:
- if (len > 6)
- goto out_invalid_option;
-
- opt_recv->dccpor_ndp = dccp_decode_value_var(value, len);
- dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk),
- (unsigned long long)opt_recv->dccpor_ndp);
- break;
- case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R:
- if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */
- break;
- if (len == 0)
- goto out_invalid_option;
- rc = dccp_feat_parse_options(sk, dreq, mandatory, opt,
- *value, value + 1, len - 1);
- if (rc)
- goto out_featneg_failed;
- break;
- case DCCPO_TIMESTAMP:
- if (len != 4)
- goto out_invalid_option;
- /*
- * RFC 4340 13.1: "The precise time corresponding to
- * Timestamp Value zero is not specified". We use
- * zero to indicate absence of a meaningful timestamp.
- */
- opt_val = get_unaligned((__be32 *)value);
- if (unlikely(opt_val == 0)) {
- DCCP_WARN("Timestamp with zero value\n");
- break;
- }
-
- if (dreq != NULL) {
- dreq->dreq_timestamp_echo = ntohl(opt_val);
- dreq->dreq_timestamp_time = dccp_timestamp();
- } else {
- opt_recv->dccpor_timestamp =
- dp->dccps_timestamp_echo = ntohl(opt_val);
- dp->dccps_timestamp_time = dccp_timestamp();
- }
- dccp_pr_debug("%s rx opt: TIMESTAMP=%u, ackno=%llu\n",
- dccp_role(sk), ntohl(opt_val),
- (unsigned long long)
- DCCP_SKB_CB(skb)->dccpd_ack_seq);
- /* schedule an Ack in case this sender is quiescent */
- inet_csk_schedule_ack(sk);
- break;
- case DCCPO_TIMESTAMP_ECHO:
- if (len != 4 && len != 6 && len != 8)
- goto out_invalid_option;
-
- opt_val = get_unaligned((__be32 *)value);
- opt_recv->dccpor_timestamp_echo = ntohl(opt_val);
-
- dccp_pr_debug("%s rx opt: TIMESTAMP_ECHO=%u, len=%d, "
- "ackno=%llu", dccp_role(sk),
- opt_recv->dccpor_timestamp_echo,
- len + 2,
- (unsigned long long)
- DCCP_SKB_CB(skb)->dccpd_ack_seq);
-
- value += 4;
-
- if (len == 4) { /* no elapsed time included */
- dccp_pr_debug_cat("\n");
- break;
- }
-
- if (len == 6) { /* 2-byte elapsed time */
- __be16 opt_val2 = get_unaligned((__be16 *)value);
- elapsed_time = ntohs(opt_val2);
- } else { /* 4-byte elapsed time */
- opt_val = get_unaligned((__be32 *)value);
- elapsed_time = ntohl(opt_val);
- }
-
- dccp_pr_debug_cat(", ELAPSED_TIME=%u\n", elapsed_time);
-
- /* Give precedence to the biggest ELAPSED_TIME */
- if (elapsed_time > opt_recv->dccpor_elapsed_time)
- opt_recv->dccpor_elapsed_time = elapsed_time;
- break;
- case DCCPO_ELAPSED_TIME:
- if (dccp_packet_without_ack(skb)) /* RFC 4340, 13.2 */
- break;
-
- if (len == 2) {
- __be16 opt_val2 = get_unaligned((__be16 *)value);
- elapsed_time = ntohs(opt_val2);
- } else if (len == 4) {
- opt_val = get_unaligned((__be32 *)value);
- elapsed_time = ntohl(opt_val);
- } else {
- goto out_invalid_option;
- }
-
- if (elapsed_time > opt_recv->dccpor_elapsed_time)
- opt_recv->dccpor_elapsed_time = elapsed_time;
-
- dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n",
- dccp_role(sk), elapsed_time);
- break;
- case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC:
- if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk,
- pkt_type, opt, value, len))
- goto out_invalid_option;
- break;
- case DCCPO_ACK_VECTOR_0:
- case DCCPO_ACK_VECTOR_1:
- if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */
- break;
- /*
- * Ack vectors are processed by the TX CCID if it is
- * interested. The RX CCID need not parse Ack Vectors,
- * since it is only interested in clearing old state.
- */
- /* fall through */
- case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC:
- if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk,
- pkt_type, opt, value, len))
- goto out_invalid_option;
- break;
- default:
- DCCP_CRIT("DCCP(%p): option %d(len=%d) not "
- "implemented, ignoring", sk, opt, len);
- break;
- }
-ignore_option:
- if (opt != DCCPO_MANDATORY)
- mandatory = 0;
- }
-
- /* mandatory was the last byte in option list -> reset connection */
- if (mandatory)
- goto out_invalid_option;
-
-out_nonsensical_length:
- /* RFC 4340, 5.8: ignore option and all remaining option space */
- return 0;
-
-out_invalid_option:
- DCCP_INC_STATS(DCCP_MIB_INVALIDOPT);
- rc = DCCP_RESET_CODE_OPTION_ERROR;
-out_featneg_failed:
- DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc);
- DCCP_SKB_CB(skb)->dccpd_reset_code = rc;
- DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt;
- DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0;
- DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0;
- return -1;
-}
-
-EXPORT_SYMBOL_GPL(dccp_parse_options);
-
-void dccp_encode_value_var(const u64 value, u8 *to, const u8 len)
-{
- if (len >= DCCP_OPTVAL_MAXLEN)
- *to++ = (value & 0xFF0000000000ull) >> 40;
- if (len > 4)
- *to++ = (value & 0xFF00000000ull) >> 32;
- if (len > 3)
- *to++ = (value & 0xFF000000) >> 24;
- if (len > 2)
- *to++ = (value & 0xFF0000) >> 16;
- if (len > 1)
- *to++ = (value & 0xFF00) >> 8;
- if (len > 0)
- *to++ = (value & 0xFF);
-}
-
-static inline u8 dccp_ndp_len(const u64 ndp)
-{
- if (likely(ndp <= 0xFF))
- return 1;
- return likely(ndp <= USHRT_MAX) ? 2 : (ndp <= UINT_MAX ? 4 : 6);
-}
-
-int dccp_insert_option(struct sk_buff *skb, const unsigned char option,
- const void *value, const unsigned char len)
-{
- unsigned char *to;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 2 > DCCP_MAX_OPT_LEN)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len + 2;
-
- to = skb_push(skb, len + 2);
- *to++ = option;
- *to++ = len + 2;
-
- memcpy(to, value, len);
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_insert_option);
-
-static int dccp_insert_option_ndp(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- u64 ndp = dp->dccps_ndp_count;
-
- if (dccp_non_data_packet(skb))
- ++dp->dccps_ndp_count;
- else
- dp->dccps_ndp_count = 0;
-
- if (ndp > 0) {
- unsigned char *ptr;
- const int ndp_len = dccp_ndp_len(ndp);
- const int len = ndp_len + 2;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len;
-
- ptr = skb_push(skb, len);
- *ptr++ = DCCPO_NDP_COUNT;
- *ptr++ = len;
- dccp_encode_value_var(ndp, ptr, ndp_len);
- }
-
- return 0;
-}
-
-static inline int dccp_elapsed_time_len(const u32 elapsed_time)
-{
- return elapsed_time == 0 ? 0 : elapsed_time <= 0xFFFF ? 2 : 4;
-}
-
-static int dccp_insert_option_timestamp(struct sk_buff *skb)
-{
- __be32 now = htonl(dccp_timestamp());
- /* yes this will overflow but that is the point as we want a
- * 10 usec 32 bit timer which mean it wraps every 11.9 hours */
-
- return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now));
-}
-
-static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp,
- struct dccp_request_sock *dreq,
- struct sk_buff *skb)
-{
- __be32 tstamp_echo;
- unsigned char *to;
- u32 elapsed_time, elapsed_time_len, len;
-
- if (dreq != NULL) {
- elapsed_time = dccp_timestamp() - dreq->dreq_timestamp_time;
- tstamp_echo = htonl(dreq->dreq_timestamp_echo);
- dreq->dreq_timestamp_echo = 0;
- } else {
- elapsed_time = dccp_timestamp() - dp->dccps_timestamp_time;
- tstamp_echo = htonl(dp->dccps_timestamp_echo);
- dp->dccps_timestamp_echo = 0;
- }
-
- elapsed_time_len = dccp_elapsed_time_len(elapsed_time);
- len = 6 + elapsed_time_len;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + len > DCCP_MAX_OPT_LEN)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len += len;
-
- to = skb_push(skb, len);
- *to++ = DCCPO_TIMESTAMP_ECHO;
- *to++ = len;
-
- memcpy(to, &tstamp_echo, 4);
- to += 4;
-
- if (elapsed_time_len == 2) {
- const __be16 var16 = htons((u16)elapsed_time);
- memcpy(to, &var16, 2);
- } else if (elapsed_time_len == 4) {
- const __be32 var32 = htonl(elapsed_time);
- memcpy(to, &var32, 4);
- }
-
- return 0;
-}
-
-static int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec;
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
- const u16 buflen = dccp_ackvec_buflen(av);
- /* Figure out how many options do we need to represent the ackvec */
- const u8 nr_opts = DIV_ROUND_UP(buflen, DCCP_SINGLE_OPT_MAXLEN);
- u16 len = buflen + 2 * nr_opts;
- u8 i, nonce = 0;
- const unsigned char *tail, *from;
- unsigned char *to;
-
- if (dcb->dccpd_opt_len + len > DCCP_MAX_OPT_LEN) {
- DCCP_WARN("Lacking space for %u bytes on %s packet\n", len,
- dccp_packet_name(dcb->dccpd_type));
- return -1;
- }
- /*
- * Since Ack Vectors are variable-length, we can not always predict
- * their size. To catch exception cases where the space is running out
- * on the skb, a separate Sync is scheduled to carry the Ack Vector.
- */
- if (len > DCCPAV_MIN_OPTLEN &&
- len + dcb->dccpd_opt_len + skb->len > dp->dccps_mss_cache) {
- DCCP_WARN("No space left for Ack Vector (%u) on skb (%u+%u), "
- "MPS=%u ==> reduce payload size?\n", len, skb->len,
- dcb->dccpd_opt_len, dp->dccps_mss_cache);
- dp->dccps_sync_scheduled = 1;
- return 0;
- }
- dcb->dccpd_opt_len += len;
-
- to = skb_push(skb, len);
- len = buflen;
- from = av->av_buf + av->av_buf_head;
- tail = av->av_buf + DCCPAV_MAX_ACKVEC_LEN;
-
- for (i = 0; i < nr_opts; ++i) {
- int copylen = len;
-
- if (len > DCCP_SINGLE_OPT_MAXLEN)
- copylen = DCCP_SINGLE_OPT_MAXLEN;
-
- /*
- * RFC 4340, 12.2: Encode the Nonce Echo for this Ack Vector via
- * its type; ack_nonce is the sum of all individual buf_nonce's.
- */
- nonce ^= av->av_buf_nonce[i];
-
- *to++ = DCCPO_ACK_VECTOR_0 + av->av_buf_nonce[i];
- *to++ = copylen + 2;
-
- /* Check if buf_head wraps */
- if (from + copylen > tail) {
- const u16 tailsize = tail - from;
-
- memcpy(to, from, tailsize);
- to += tailsize;
- len -= tailsize;
- copylen -= tailsize;
- from = av->av_buf;
- }
-
- memcpy(to, from, copylen);
- from += copylen;
- to += copylen;
- len -= copylen;
- }
- /*
- * Each sent Ack Vector is recorded in the list, as per A.2 of RFC 4340.
- */
- if (dccp_ackvec_update_records(av, dcb->dccpd_seq, nonce))
- return -ENOBUFS;
- return 0;
-}
-
-/**
- * dccp_insert_option_mandatory - Mandatory option (5.8.2)
- * Note that since we are using skb_push, this function needs to be called
- * _after_ inserting the option it is supposed to influence (stack order).
- */
-int dccp_insert_option_mandatory(struct sk_buff *skb)
-{
- if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN)
- return -1;
-
- DCCP_SKB_CB(skb)->dccpd_opt_len++;
- *(u8 *)skb_push(skb, 1) = DCCPO_MANDATORY;
- return 0;
-}
-
-/**
- * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb
- * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R
- * @feat: one out of %dccp_feature_numbers
- * @val: NN value or SP array (preferred element first) to copy
- * @len: true length of @val in bytes (excluding first element repetition)
- * @repeat_first: whether to copy the first element of @val twice
- *
- * The last argument is used to construct Confirm options, where the preferred
- * value and the preference list appear separately (RFC 4340, 6.3.1). Preference
- * lists are kept such that the preferred entry is always first, so we only need
- * to copy twice, and avoid the overhead of cloning into a bigger array.
- */
-int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat,
- u8 *val, u8 len, bool repeat_first)
-{
- u8 tot_len, *to;
-
- /* take the `Feature' field and possible repetition into account */
- if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) {
- DCCP_WARN("length %u for feature %u too large\n", len, feat);
- return -1;
- }
-
- if (unlikely(val == NULL || len == 0))
- len = repeat_first = false;
- tot_len = 3 + repeat_first + len;
-
- if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) {
- DCCP_WARN("packet too small for feature %d option!\n", feat);
- return -1;
- }
- DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len;
-
- to = skb_push(skb, tot_len);
- *to++ = type;
- *to++ = tot_len;
- *to++ = feat;
-
- if (repeat_first)
- *to++ = *val;
- if (len)
- memcpy(to, val, len);
- return 0;
-}
-
-/* The length of all options needs to be a multiple of 4 (5.8) */
-static void dccp_insert_option_padding(struct sk_buff *skb)
-{
- int padding = DCCP_SKB_CB(skb)->dccpd_opt_len % 4;
-
- if (padding != 0) {
- padding = 4 - padding;
- memset(skb_push(skb, padding), 0, padding);
- DCCP_SKB_CB(skb)->dccpd_opt_len += padding;
- }
-}
-
-int dccp_insert_options(struct sock *sk, struct sk_buff *skb)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
-
- if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb))
- return -1;
-
- if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) {
-
- /* Feature Negotiation */
- if (dccp_feat_insert_opts(dp, NULL, skb))
- return -1;
-
- if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) {
- /*
- * Obtain RTT sample from Request/Response exchange.
- * This is currently used for TFRC initialisation.
- */
- if (dccp_insert_option_timestamp(skb))
- return -1;
-
- } else if (dccp_ackvec_pending(sk) &&
- dccp_insert_option_ackvec(sk, skb)) {
- return -1;
- }
- }
-
- if (dp->dccps_hc_rx_insert_options) {
- if (ccid_hc_rx_insert_options(dp->dccps_hc_rx_ccid, sk, skb))
- return -1;
- dp->dccps_hc_rx_insert_options = 0;
- }
-
- if (dp->dccps_timestamp_echo != 0 &&
- dccp_insert_option_timestamp_echo(dp, NULL, skb))
- return -1;
-
- dccp_insert_option_padding(skb);
- return 0;
-}
-
-int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb)
-{
- DCCP_SKB_CB(skb)->dccpd_opt_len = 0;
-
- if (dccp_feat_insert_opts(NULL, dreq, skb))
- return -1;
-
- /* Obtain RTT sample from Response/Ack exchange (used by TFRC). */
- if (dccp_insert_option_timestamp(skb))
- return -1;
-
- if (dreq->dreq_timestamp_echo != 0 &&
- dccp_insert_option_timestamp_echo(NULL, dreq, skb))
- return -1;
-
- dccp_insert_option_padding(skb);
- return 0;
-}
diff --git a/net/dccp/output.c b/net/dccp/output.c
deleted file mode 100644
index 91a15b3c4915..000000000000
--- a/net/dccp/output.c
+++ /dev/null
@@ -1,704 +0,0 @@
-/*
- * net/dccp/output.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/dccp.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <linux/sched/signal.h>
-
-#include <net/inet_sock.h>
-#include <net/sock.h>
-
-#include "ackvec.h"
-#include "ccid.h"
-#include "dccp.h"
-
-static inline void dccp_event_ack_sent(struct sock *sk)
-{
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
-}
-
-/* enqueue @skb on sk_send_head for retransmission, return clone to send now */
-static struct sk_buff *dccp_skb_entail(struct sock *sk, struct sk_buff *skb)
-{
- skb_set_owner_w(skb, sk);
- WARN_ON(sk->sk_send_head);
- sk->sk_send_head = skb;
- return skb_clone(sk->sk_send_head, gfp_any());
-}
-
-/*
- * All SKB's seen here are completely headerless. It is our
- * job to build the DCCP header, and pass the packet down to
- * IP so it can do the same plus pass the packet off to the
- * device.
- */
-static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb)
-{
- if (likely(skb != NULL)) {
- struct inet_sock *inet = inet_sk(sk);
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
- struct dccp_hdr *dh;
- /* XXX For now we're using only 48 bits sequence numbers */
- const u32 dccp_header_size = sizeof(*dh) +
- sizeof(struct dccp_hdr_ext) +
- dccp_packet_hdr_len(dcb->dccpd_type);
- int err, set_ack = 1;
- u64 ackno = dp->dccps_gsr;
- /*
- * Increment GSS here already in case the option code needs it.
- * Update GSS for real only if option processing below succeeds.
- */
- dcb->dccpd_seq = ADD48(dp->dccps_gss, 1);
-
- switch (dcb->dccpd_type) {
- case DCCP_PKT_DATA:
- set_ack = 0;
- /* fall through */
- case DCCP_PKT_DATAACK:
- case DCCP_PKT_RESET:
- break;
-
- case DCCP_PKT_REQUEST:
- set_ack = 0;
- /* Use ISS on the first (non-retransmitted) Request. */
- if (icsk->icsk_retransmits == 0)
- dcb->dccpd_seq = dp->dccps_iss;
- /* fall through */
-
- case DCCP_PKT_SYNC:
- case DCCP_PKT_SYNCACK:
- ackno = dcb->dccpd_ack_seq;
- /* fall through */
- default:
- /*
- * Set owner/destructor: some skbs are allocated via
- * alloc_skb (e.g. when retransmission may happen).
- * Only Data, DataAck, and Reset packets should come
- * through here with skb->sk set.
- */
- WARN_ON(skb->sk);
- skb_set_owner_w(skb, sk);
- break;
- }
-
- if (dccp_insert_options(sk, skb)) {
- kfree_skb(skb);
- return -EPROTO;
- }
-
-
- /* Build DCCP header and checksum it. */
- dh = dccp_zeroed_hdr(skb, dccp_header_size);
- dh->dccph_type = dcb->dccpd_type;
- dh->dccph_sport = inet->inet_sport;
- dh->dccph_dport = inet->inet_dport;
- dh->dccph_doff = (dccp_header_size + dcb->dccpd_opt_len) / 4;
- dh->dccph_ccval = dcb->dccpd_ccval;
- dh->dccph_cscov = dp->dccps_pcslen;
- /* XXX For now we're using only 48 bits sequence numbers */
- dh->dccph_x = 1;
-
- dccp_update_gss(sk, dcb->dccpd_seq);
- dccp_hdr_set_seq(dh, dp->dccps_gss);
- if (set_ack)
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), ackno);
-
- switch (dcb->dccpd_type) {
- case DCCP_PKT_REQUEST:
- dccp_hdr_request(skb)->dccph_req_service =
- dp->dccps_service;
- /*
- * Limit Ack window to ISS <= P.ackno <= GSS, so that
- * only Responses to Requests we sent are considered.
- */
- dp->dccps_awl = dp->dccps_iss;
- break;
- case DCCP_PKT_RESET:
- dccp_hdr_reset(skb)->dccph_reset_code =
- dcb->dccpd_reset_code;
- break;
- }
-
- icsk->icsk_af_ops->send_check(sk, skb);
-
- if (set_ack)
- dccp_event_ack_sent(sk);
-
- DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
-
- err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
- return net_xmit_eval(err);
- }
- return -ENOBUFS;
-}
-
-/**
- * dccp_determine_ccmps - Find out about CCID-specific packet-size limits
- * We only consider the HC-sender CCID for setting the CCMPS (RFC 4340, 14.),
- * since the RX CCID is restricted to feedback packets (Acks), which are small
- * in comparison with the data traffic. A value of 0 means "no current CCMPS".
- */
-static u32 dccp_determine_ccmps(const struct dccp_sock *dp)
-{
- const struct ccid *tx_ccid = dp->dccps_hc_tx_ccid;
-
- if (tx_ccid == NULL || tx_ccid->ccid_ops == NULL)
- return 0;
- return tx_ccid->ccid_ops->ccid_ccmps;
-}
-
-unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- u32 ccmps = dccp_determine_ccmps(dp);
- u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
-
- /* Account for header lengths and IPv4/v6 option overhead */
- cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len +
- sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext));
-
- /*
- * Leave enough headroom for common DCCP header options.
- * This only considers options which may appear on DCCP-Data packets, as
- * per table 3 in RFC 4340, 5.8. When running out of space for other
- * options (eg. Ack Vector which can take up to 255 bytes), it is better
- * to schedule a separate Ack. Thus we leave headroom for the following:
- * - 1 byte for Slow Receiver (11.6)
- * - 6 bytes for Timestamp (13.1)
- * - 10 bytes for Timestamp Echo (13.3)
- * - 8 bytes for NDP count (7.7, when activated)
- * - 6 bytes for Data Checksum (9.3)
- * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled)
- */
- cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 +
- (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4);
-
- /* And store cached results */
- icsk->icsk_pmtu_cookie = pmtu;
- dp->dccps_mss_cache = cur_mps;
-
- return cur_mps;
-}
-
-EXPORT_SYMBOL_GPL(dccp_sync_mss);
-
-void dccp_write_space(struct sock *sk)
-{
- struct socket_wq *wq;
-
- rcu_read_lock();
- wq = rcu_dereference(sk->sk_wq);
- if (skwq_has_sleeper(wq))
- wake_up_interruptible(&wq->wait);
- /* Should agree with poll, otherwise some programs break */
- if (sock_writeable(sk))
- sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
-
- rcu_read_unlock();
-}
-
-/**
- * dccp_wait_for_ccid - Await CCID send permission
- * @sk: socket to wait for
- * @delay: timeout in jiffies
- *
- * This is used by CCIDs which need to delay the send time in process context.
- */
-static int dccp_wait_for_ccid(struct sock *sk, unsigned long delay)
-{
- DEFINE_WAIT(wait);
- long remaining;
-
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- sk->sk_write_pending++;
- release_sock(sk);
-
- remaining = schedule_timeout(delay);
-
- lock_sock(sk);
- sk->sk_write_pending--;
- finish_wait(sk_sleep(sk), &wait);
-
- if (signal_pending(current) || sk->sk_err)
- return -1;
- return remaining;
-}
-
-/**
- * dccp_xmit_packet - Send data packet under control of CCID
- * Transmits next-queued payload and informs CCID to account for the packet.
- */
-static void dccp_xmit_packet(struct sock *sk)
-{
- int err, len;
- struct dccp_sock *dp = dccp_sk(sk);
- struct sk_buff *skb = dccp_qpolicy_pop(sk);
-
- if (unlikely(skb == NULL))
- return;
- len = skb->len;
-
- if (sk->sk_state == DCCP_PARTOPEN) {
- const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
- /*
- * See 8.1.5 - Handshake Completion.
- *
- * For robustness we resend Confirm options until the client has
- * entered OPEN. During the initial feature negotiation, the MPS
- * is smaller than usual, reduced by the Change/Confirm options.
- */
- if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
- DCCP_WARN("Payload too large (%d) for featneg.\n", len);
- dccp_send_ack(sk);
- dccp_feat_list_purge(&dp->dccps_featneg);
- }
-
- inet_csk_schedule_ack(sk);
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- inet_csk(sk)->icsk_rto,
- DCCP_RTO_MAX);
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
- } else if (dccp_ack_pending(sk)) {
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATAACK;
- } else {
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_DATA;
- }
-
- err = dccp_transmit_skb(sk, skb);
- if (err)
- dccp_pr_debug("transmit_skb() returned err=%d\n", err);
- /*
- * Register this one as sent even if an error occurred. To the remote
- * end a local packet drop is indistinguishable from network loss, i.e.
- * any local drop will eventually be reported via receiver feedback.
- */
- ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len);
-
- /*
- * If the CCID needs to transfer additional header options out-of-band
- * (e.g. Ack Vectors or feature-negotiation options), it activates this
- * flag to schedule a Sync. The Sync will automatically incorporate all
- * currently pending header options, thus clearing the backlog.
- */
- if (dp->dccps_sync_scheduled)
- dccp_send_sync(sk, dp->dccps_gsr, DCCP_PKT_SYNC);
-}
-
-/**
- * dccp_flush_write_queue - Drain queue at end of connection
- * Since dccp_sendmsg queues packets without waiting for them to be sent, it may
- * happen that the TX queue is not empty at the end of a connection. We give the
- * HC-sender CCID a grace period of up to @time_budget jiffies. If this function
- * returns with a non-empty write queue, it will be purged later.
- */
-void dccp_flush_write_queue(struct sock *sk, long *time_budget)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct sk_buff *skb;
- long delay, rc;
-
- while (*time_budget > 0 && (skb = skb_peek(&sk->sk_write_queue))) {
- rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
-
- switch (ccid_packet_dequeue_eval(rc)) {
- case CCID_PACKET_WILL_DEQUEUE_LATER:
- /*
- * If the CCID determines when to send, the next sending
- * time is unknown or the CCID may not even send again
- * (e.g. remote host crashes or lost Ack packets).
- */
- DCCP_WARN("CCID did not manage to send all packets\n");
- return;
- case CCID_PACKET_DELAY:
- delay = msecs_to_jiffies(rc);
- if (delay > *time_budget)
- return;
- rc = dccp_wait_for_ccid(sk, delay);
- if (rc < 0)
- return;
- *time_budget -= (delay - rc);
- /* check again if we can send now */
- break;
- case CCID_PACKET_SEND_AT_ONCE:
- dccp_xmit_packet(sk);
- break;
- case CCID_PACKET_ERR:
- skb_dequeue(&sk->sk_write_queue);
- kfree_skb(skb);
- dccp_pr_debug("packet discarded due to err=%ld\n", rc);
- }
- }
-}
-
-void dccp_write_xmit(struct sock *sk)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct sk_buff *skb;
-
- while ((skb = dccp_qpolicy_top(sk))) {
- int rc = ccid_hc_tx_send_packet(dp->dccps_hc_tx_ccid, sk, skb);
-
- switch (ccid_packet_dequeue_eval(rc)) {
- case CCID_PACKET_WILL_DEQUEUE_LATER:
- return;
- case CCID_PACKET_DELAY:
- sk_reset_timer(sk, &dp->dccps_xmit_timer,
- jiffies + msecs_to_jiffies(rc));
- return;
- case CCID_PACKET_SEND_AT_ONCE:
- dccp_xmit_packet(sk);
- break;
- case CCID_PACKET_ERR:
- dccp_qpolicy_drop(sk, skb);
- dccp_pr_debug("packet discarded due to err=%d\n", rc);
- }
- }
-}
-
-/**
- * dccp_retransmit_skb - Retransmit Request, Close, or CloseReq packets
- * There are only four retransmittable packet types in DCCP:
- * - Request in client-REQUEST state (sec. 8.1.1),
- * - CloseReq in server-CLOSEREQ state (sec. 8.3),
- * - Close in node-CLOSING state (sec. 8.3),
- * - Acks in client-PARTOPEN state (sec. 8.1.5, handled by dccp_delack_timer()).
- * This function expects sk->sk_send_head to contain the original skb.
- */
-int dccp_retransmit_skb(struct sock *sk)
-{
- WARN_ON(sk->sk_send_head == NULL);
-
- if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0)
- return -EHOSTUNREACH; /* Routing failure or similar. */
-
- /* this count is used to distinguish original and retransmitted skb */
- inet_csk(sk)->icsk_retransmits++;
-
- return dccp_transmit_skb(sk, skb_clone(sk->sk_send_head, GFP_ATOMIC));
-}
-
-struct sk_buff *dccp_make_response(const struct sock *sk, struct dst_entry *dst,
- struct request_sock *req)
-{
- struct dccp_hdr *dh;
- struct dccp_request_sock *dreq;
- const u32 dccp_header_size = sizeof(struct dccp_hdr) +
- sizeof(struct dccp_hdr_ext) +
- sizeof(struct dccp_hdr_response);
- struct sk_buff *skb;
-
- /* sk is marked const to clearly express we dont hold socket lock.
- * sock_wmalloc() will atomically change sk->sk_wmem_alloc,
- * it is safe to promote sk to non const.
- */
- skb = sock_wmalloc((struct sock *)sk, MAX_DCCP_HEADER, 1,
- GFP_ATOMIC);
- if (!skb)
- return NULL;
-
- skb_reserve(skb, MAX_DCCP_HEADER);
-
- skb_dst_set(skb, dst_clone(dst));
-
- dreq = dccp_rsk(req);
- if (inet_rsk(req)->acked) /* increase GSS upon retransmission */
- dccp_inc_seqno(&dreq->dreq_gss);
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE;
- DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_gss;
-
- /* Resolve feature dependencies resulting from choice of CCID */
- if (dccp_feat_server_ccid_dependencies(dreq))
- goto response_failed;
-
- if (dccp_insert_options_rsk(dreq, skb))
- goto response_failed;
-
- /* Build and checksum header */
- dh = dccp_zeroed_hdr(skb, dccp_header_size);
-
- dh->dccph_sport = htons(inet_rsk(req)->ir_num);
- dh->dccph_dport = inet_rsk(req)->ir_rmt_port;
- dh->dccph_doff = (dccp_header_size +
- DCCP_SKB_CB(skb)->dccpd_opt_len) / 4;
- dh->dccph_type = DCCP_PKT_RESPONSE;
- dh->dccph_x = 1;
- dccp_hdr_set_seq(dh, dreq->dreq_gss);
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dreq->dreq_gsr);
- dccp_hdr_response(skb)->dccph_resp_service = dreq->dreq_service;
-
- dccp_csum_outgoing(skb);
-
- /* We use `acked' to remember that a Response was already sent. */
- inet_rsk(req)->acked = 1;
- DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
- return skb;
-response_failed:
- kfree_skb(skb);
- return NULL;
-}
-
-EXPORT_SYMBOL_GPL(dccp_make_response);
-
-/* answer offending packet in @rcv_skb with Reset from control socket @ctl */
-struct sk_buff *dccp_ctl_make_reset(struct sock *sk, struct sk_buff *rcv_skb)
-{
- struct dccp_hdr *rxdh = dccp_hdr(rcv_skb), *dh;
- struct dccp_skb_cb *dcb = DCCP_SKB_CB(rcv_skb);
- const u32 dccp_hdr_reset_len = sizeof(struct dccp_hdr) +
- sizeof(struct dccp_hdr_ext) +
- sizeof(struct dccp_hdr_reset);
- struct dccp_hdr_reset *dhr;
- struct sk_buff *skb;
-
- skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC);
- if (skb == NULL)
- return NULL;
-
- skb_reserve(skb, sk->sk_prot->max_header);
-
- /* Swap the send and the receive. */
- dh = dccp_zeroed_hdr(skb, dccp_hdr_reset_len);
- dh->dccph_type = DCCP_PKT_RESET;
- dh->dccph_sport = rxdh->dccph_dport;
- dh->dccph_dport = rxdh->dccph_sport;
- dh->dccph_doff = dccp_hdr_reset_len / 4;
- dh->dccph_x = 1;
-
- dhr = dccp_hdr_reset(skb);
- dhr->dccph_reset_code = dcb->dccpd_reset_code;
-
- switch (dcb->dccpd_reset_code) {
- case DCCP_RESET_CODE_PACKET_ERROR:
- dhr->dccph_reset_data[0] = rxdh->dccph_type;
- break;
- case DCCP_RESET_CODE_OPTION_ERROR: /* fall through */
- case DCCP_RESET_CODE_MANDATORY_ERROR:
- memcpy(dhr->dccph_reset_data, dcb->dccpd_reset_data, 3);
- break;
- }
- /*
- * From RFC 4340, 8.3.1:
- * If P.ackno exists, set R.seqno := P.ackno + 1.
- * Else set R.seqno := 0.
- */
- if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ)
- dccp_hdr_set_seq(dh, ADD48(dcb->dccpd_ack_seq, 1));
- dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), dcb->dccpd_seq);
-
- dccp_csum_outgoing(skb);
- return skb;
-}
-
-EXPORT_SYMBOL_GPL(dccp_ctl_make_reset);
-
-/* send Reset on established socket, to close or abort the connection */
-int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code)
-{
- struct sk_buff *skb;
- /*
- * FIXME: what if rebuild_header fails?
- * Should we be doing a rebuild_header here?
- */
- int err = inet_csk(sk)->icsk_af_ops->rebuild_header(sk);
-
- if (err != 0)
- return err;
-
- skb = sock_wmalloc(sk, sk->sk_prot->max_header, 1, GFP_ATOMIC);
- if (skb == NULL)
- return -ENOBUFS;
-
- /* Reserve space for headers and prepare control bits. */
- skb_reserve(skb, sk->sk_prot->max_header);
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESET;
- DCCP_SKB_CB(skb)->dccpd_reset_code = code;
-
- return dccp_transmit_skb(sk, skb);
-}
-
-/*
- * Do all connect socket setups that can be done AF independent.
- */
-int dccp_connect(struct sock *sk)
-{
- struct sk_buff *skb;
- struct dccp_sock *dp = dccp_sk(sk);
- struct dst_entry *dst = __sk_dst_get(sk);
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- sk->sk_err = 0;
- sock_reset_flag(sk, SOCK_DONE);
-
- dccp_sync_mss(sk, dst_mtu(dst));
-
- /* do not connect if feature negotiation setup fails */
- if (dccp_feat_finalise_settings(dccp_sk(sk)))
- return -EPROTO;
-
- /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */
- dp->dccps_gar = dp->dccps_iss;
-
- skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation);
- if (unlikely(skb == NULL))
- return -ENOBUFS;
-
- /* Reserve space for headers. */
- skb_reserve(skb, sk->sk_prot->max_header);
-
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_REQUEST;
-
- dccp_transmit_skb(sk, dccp_skb_entail(sk, skb));
- DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS);
-
- /* Timer for repeating the REQUEST until an answer. */
- icsk->icsk_retransmits = 0;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- icsk->icsk_rto, DCCP_RTO_MAX);
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_connect);
-
-void dccp_send_ack(struct sock *sk)
-{
- /* If we have been reset, we may not send again. */
- if (sk->sk_state != DCCP_CLOSED) {
- struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header,
- GFP_ATOMIC);
-
- if (skb == NULL) {
- inet_csk_schedule_ack(sk);
- inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- TCP_DELACK_MAX,
- DCCP_RTO_MAX);
- return;
- }
-
- /* Reserve space for headers */
- skb_reserve(skb, sk->sk_prot->max_header);
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_ACK;
- dccp_transmit_skb(sk, skb);
- }
-}
-
-EXPORT_SYMBOL_GPL(dccp_send_ack);
-
-#if 0
-/* FIXME: Is this still necessary (11.3) - currently nowhere used by DCCP. */
-void dccp_send_delayed_ack(struct sock *sk)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- /*
- * FIXME: tune this timer. elapsed time fixes the skew, so no problem
- * with using 2s, and active senders also piggyback the ACK into a
- * DATAACK packet, so this is really for quiescent senders.
- */
- unsigned long timeout = jiffies + 2 * HZ;
-
- /* Use new timeout only if there wasn't a older one earlier. */
- if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
- /* If delack timer was blocked or is about to expire,
- * send ACK now.
- *
- * FIXME: check the "about to expire" part
- */
- if (icsk->icsk_ack.blocked) {
- dccp_send_ack(sk);
- return;
- }
-
- if (!time_before(timeout, icsk->icsk_ack.timeout))
- timeout = icsk->icsk_ack.timeout;
- }
- icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
- icsk->icsk_ack.timeout = timeout;
- sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
-}
-#endif
-
-void dccp_send_sync(struct sock *sk, const u64 ackno,
- const enum dccp_pkt_type pkt_type)
-{
- /*
- * We are not putting this on the write queue, so
- * dccp_transmit_skb() will set the ownership to this
- * sock.
- */
- struct sk_buff *skb = alloc_skb(sk->sk_prot->max_header, GFP_ATOMIC);
-
- if (skb == NULL) {
- /* FIXME: how to make sure the sync is sent? */
- DCCP_CRIT("could not send %s", dccp_packet_name(pkt_type));
- return;
- }
-
- /* Reserve space for headers and prepare control bits. */
- skb_reserve(skb, sk->sk_prot->max_header);
- DCCP_SKB_CB(skb)->dccpd_type = pkt_type;
- DCCP_SKB_CB(skb)->dccpd_ack_seq = ackno;
-
- /*
- * Clear the flag in case the Sync was scheduled for out-of-band data,
- * such as carrying a long Ack Vector.
- */
- dccp_sk(sk)->dccps_sync_scheduled = 0;
-
- dccp_transmit_skb(sk, skb);
-}
-
-EXPORT_SYMBOL_GPL(dccp_send_sync);
-
-/*
- * Send a DCCP_PKT_CLOSE/CLOSEREQ. The caller locks the socket for us. This
- * cannot be allowed to fail queueing a DCCP_PKT_CLOSE/CLOSEREQ frame under
- * any circumstances.
- */
-void dccp_send_close(struct sock *sk, const int active)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct sk_buff *skb;
- const gfp_t prio = active ? GFP_KERNEL : GFP_ATOMIC;
-
- skb = alloc_skb(sk->sk_prot->max_header, prio);
- if (skb == NULL)
- return;
-
- /* Reserve space for headers and prepare control bits. */
- skb_reserve(skb, sk->sk_prot->max_header);
- if (dp->dccps_role == DCCP_ROLE_SERVER && !dp->dccps_server_timewait)
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSEREQ;
- else
- DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_CLOSE;
-
- if (active) {
- skb = dccp_skb_entail(sk, skb);
- /*
- * Retransmission timer for active-close: RFC 4340, 8.3 requires
- * to retransmit the Close/CloseReq until the CLOSING/CLOSEREQ
- * state can be left. The initial timeout is 2 RTTs.
- * Since RTT measurement is done by the CCIDs, there is no easy
- * way to get an RTT sample. The fallback RTT from RFC 4340, 3.4
- * is too low (200ms); we use a high value to avoid unnecessary
- * retransmissions when the link RTT is > 0.2 seconds.
- * FIXME: Let main module sample RTTs and use that instead.
- */
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- DCCP_TIMEOUT_INIT, DCCP_RTO_MAX);
- }
- dccp_transmit_skb(sk, skb);
-}
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
deleted file mode 100644
index 875858c8b059..000000000000
--- a/net/dccp/proto.c
+++ /dev/null
@@ -1,1276 +0,0 @@
-/*
- * net/dccp/proto.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/dccp.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/in.h>
-#include <linux/if_arp.h>
-#include <linux/init.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include <net/checksum.h>
-
-#include <net/inet_sock.h>
-#include <net/inet_common.h>
-#include <net/sock.h>
-#include <net/xfrm.h>
-
-#include <asm/ioctls.h>
-#include <linux/spinlock.h>
-#include <linux/timer.h>
-#include <linux/delay.h>
-#include <linux/poll.h>
-
-#include "ccid.h"
-#include "dccp.h"
-#include "feat.h"
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
-
-EXPORT_SYMBOL_GPL(dccp_statistics);
-
-struct percpu_counter dccp_orphan_count;
-EXPORT_SYMBOL_GPL(dccp_orphan_count);
-
-struct inet_hashinfo dccp_hashinfo;
-EXPORT_SYMBOL_GPL(dccp_hashinfo);
-
-/* the maximum queue length for tx in packets. 0 is no limit */
-int sysctl_dccp_tx_qlen __read_mostly = 5;
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-static const char *dccp_state_name(const int state)
-{
- static const char *const dccp_state_names[] = {
- [DCCP_OPEN] = "OPEN",
- [DCCP_REQUESTING] = "REQUESTING",
- [DCCP_PARTOPEN] = "PARTOPEN",
- [DCCP_LISTEN] = "LISTEN",
- [DCCP_RESPOND] = "RESPOND",
- [DCCP_CLOSING] = "CLOSING",
- [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ",
- [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE",
- [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
- [DCCP_TIME_WAIT] = "TIME_WAIT",
- [DCCP_CLOSED] = "CLOSED",
- };
-
- if (state >= DCCP_MAX_STATES)
- return "INVALID STATE!";
- else
- return dccp_state_names[state];
-}
-#endif
-
-void dccp_set_state(struct sock *sk, const int state)
-{
- const int oldstate = sk->sk_state;
-
- dccp_pr_debug("%s(%p) %s --> %s\n", dccp_role(sk), sk,
- dccp_state_name(oldstate), dccp_state_name(state));
- WARN_ON(state == oldstate);
-
- switch (state) {
- case DCCP_OPEN:
- if (oldstate != DCCP_OPEN)
- DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
- /* Client retransmits all Confirm options until entering OPEN */
- if (oldstate == DCCP_PARTOPEN)
- dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg);
- break;
-
- case DCCP_CLOSED:
- if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ ||
- oldstate == DCCP_CLOSING)
- DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);
-
- sk->sk_prot->unhash(sk);
- if (inet_csk(sk)->icsk_bind_hash != NULL &&
- !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
- inet_put_port(sk);
- /* fall through */
- default:
- if (oldstate == DCCP_OPEN)
- DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
- }
-
- /* Change state AFTER socket is unhashed to avoid closed
- * socket sitting in hash tables.
- */
- inet_sk_set_state(sk, state);
-}
-
-EXPORT_SYMBOL_GPL(dccp_set_state);
-
-static void dccp_finish_passive_close(struct sock *sk)
-{
- switch (sk->sk_state) {
- case DCCP_PASSIVE_CLOSE:
- /* Node (client or server) has received Close packet. */
- dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
- dccp_set_state(sk, DCCP_CLOSED);
- break;
- case DCCP_PASSIVE_CLOSEREQ:
- /*
- * Client received CloseReq. We set the `active' flag so that
- * dccp_send_close() retransmits the Close as per RFC 4340, 8.3.
- */
- dccp_send_close(sk, 1);
- dccp_set_state(sk, DCCP_CLOSING);
- }
-}
-
-void dccp_done(struct sock *sk)
-{
- dccp_set_state(sk, DCCP_CLOSED);
- dccp_clear_xmit_timers(sk);
-
- sk->sk_shutdown = SHUTDOWN_MASK;
-
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_state_change(sk);
- else
- inet_csk_destroy_sock(sk);
-}
-
-EXPORT_SYMBOL_GPL(dccp_done);
-
-const char *dccp_packet_name(const int type)
-{
- static const char *const dccp_packet_names[] = {
- [DCCP_PKT_REQUEST] = "REQUEST",
- [DCCP_PKT_RESPONSE] = "RESPONSE",
- [DCCP_PKT_DATA] = "DATA",
- [DCCP_PKT_ACK] = "ACK",
- [DCCP_PKT_DATAACK] = "DATAACK",
- [DCCP_PKT_CLOSEREQ] = "CLOSEREQ",
- [DCCP_PKT_CLOSE] = "CLOSE",
- [DCCP_PKT_RESET] = "RESET",
- [DCCP_PKT_SYNC] = "SYNC",
- [DCCP_PKT_SYNCACK] = "SYNCACK",
- };
-
- if (type >= DCCP_NR_PKT_TYPES)
- return "INVALID";
- else
- return dccp_packet_names[type];
-}
-
-EXPORT_SYMBOL_GPL(dccp_packet_name);
-
-static void dccp_sk_destruct(struct sock *sk)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
- dp->dccps_hc_tx_ccid = NULL;
- inet_sock_destruct(sk);
-}
-
-int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- icsk->icsk_rto = DCCP_TIMEOUT_INIT;
- icsk->icsk_syn_retries = sysctl_dccp_request_retries;
- sk->sk_state = DCCP_CLOSED;
- sk->sk_write_space = dccp_write_space;
- sk->sk_destruct = dccp_sk_destruct;
- icsk->icsk_sync_mss = dccp_sync_mss;
- dp->dccps_mss_cache = 536;
- dp->dccps_rate_last = jiffies;
- dp->dccps_role = DCCP_ROLE_UNDEFINED;
- dp->dccps_service = DCCP_SERVICE_CODE_IS_ABSENT;
- dp->dccps_tx_qlen = sysctl_dccp_tx_qlen;
-
- dccp_init_xmit_timers(sk);
-
- INIT_LIST_HEAD(&dp->dccps_featneg);
- /* control socket doesn't need feat nego */
- if (likely(ctl_sock_initialized))
- return dccp_feat_init(sk);
- return 0;
-}
-
-EXPORT_SYMBOL_GPL(dccp_init_sock);
-
-void dccp_destroy_sock(struct sock *sk)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- __skb_queue_purge(&sk->sk_write_queue);
- if (sk->sk_send_head != NULL) {
- kfree_skb(sk->sk_send_head);
- sk->sk_send_head = NULL;
- }
-
- /* Clean up a referenced DCCP bind bucket. */
- if (inet_csk(sk)->icsk_bind_hash != NULL)
- inet_put_port(sk);
-
- kfree(dp->dccps_service_list);
- dp->dccps_service_list = NULL;
-
- if (dp->dccps_hc_rx_ackvec != NULL) {
- dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
- dp->dccps_hc_rx_ackvec = NULL;
- }
- ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
- dp->dccps_hc_rx_ccid = NULL;
-
- /* clean up feature negotiation state */
- dccp_feat_list_purge(&dp->dccps_featneg);
-}
-
-EXPORT_SYMBOL_GPL(dccp_destroy_sock);
-
-static inline int dccp_listen_start(struct sock *sk, int backlog)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- dp->dccps_role = DCCP_ROLE_LISTEN;
- /* do not start to listen if feature negotiation setup fails */
- if (dccp_feat_finalise_settings(dp))
- return -EPROTO;
- return inet_csk_listen_start(sk, backlog);
-}
-
-static inline int dccp_need_reset(int state)
-{
- return state != DCCP_CLOSED && state != DCCP_LISTEN &&
- state != DCCP_REQUESTING;
-}
-
-int dccp_disconnect(struct sock *sk, int flags)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_sock *inet = inet_sk(sk);
- struct dccp_sock *dp = dccp_sk(sk);
- int err = 0;
- const int old_state = sk->sk_state;
-
- if (old_state != DCCP_CLOSED)
- dccp_set_state(sk, DCCP_CLOSED);
-
- /*
- * This corresponds to the ABORT function of RFC793, sec. 3.8
- * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted".
- */
- if (old_state == DCCP_LISTEN) {
- inet_csk_listen_stop(sk);
- } else if (dccp_need_reset(old_state)) {
- dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
- sk->sk_err = ECONNRESET;
- } else if (old_state == DCCP_REQUESTING)
- sk->sk_err = ECONNRESET;
-
- dccp_clear_xmit_timers(sk);
- ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
- dp->dccps_hc_rx_ccid = NULL;
-
- __skb_queue_purge(&sk->sk_receive_queue);
- __skb_queue_purge(&sk->sk_write_queue);
- if (sk->sk_send_head != NULL) {
- __kfree_skb(sk->sk_send_head);
- sk->sk_send_head = NULL;
- }
-
- inet->inet_dport = 0;
-
- if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
- inet_reset_saddr(sk);
-
- sk->sk_shutdown = 0;
- sock_reset_flag(sk, SOCK_DONE);
-
- icsk->icsk_backoff = 0;
- inet_csk_delack_init(sk);
- __sk_dst_reset(sk);
-
- WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
-
- sk->sk_error_report(sk);
- return err;
-}
-
-EXPORT_SYMBOL_GPL(dccp_disconnect);
-
-/*
- * Wait for a DCCP event.
- *
- * Note that we don't need to lock the socket, as the upper poll layers
- * take care of normal races (between the test and the event) and we don't
- * go look at any of the socket buffers directly.
- */
-__poll_t dccp_poll(struct file *file, struct socket *sock,
- poll_table *wait)
-{
- __poll_t mask;
- struct sock *sk = sock->sk;
-
- sock_poll_wait(file, wait);
- if (sk->sk_state == DCCP_LISTEN)
- return inet_csk_listen_poll(sk);
-
- /* Socket is not locked. We are protected from async events
- by poll logic and correct handling of state changes
- made by another threads is impossible in any case.
- */
-
- mask = 0;
- if (sk->sk_err)
- mask = EPOLLERR;
-
- if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
- mask |= EPOLLHUP;
- if (sk->sk_shutdown & RCV_SHUTDOWN)
- mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
-
- /* Connected? */
- if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
- if (atomic_read(&sk->sk_rmem_alloc) > 0)
- mask |= EPOLLIN | EPOLLRDNORM;
-
- if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
- if (sk_stream_is_writeable(sk)) {
- mask |= EPOLLOUT | EPOLLWRNORM;
- } else { /* send SIGIO later */
- sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-
- /* Race breaker. If space is freed after
- * wspace test but before the flags are set,
- * IO signal will be lost.
- */
- if (sk_stream_is_writeable(sk))
- mask |= EPOLLOUT | EPOLLWRNORM;
- }
- }
- }
- return mask;
-}
-
-EXPORT_SYMBOL_GPL(dccp_poll);
-
-int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
-{
- int rc = -ENOTCONN;
-
- lock_sock(sk);
-
- if (sk->sk_state == DCCP_LISTEN)
- goto out;
-
- switch (cmd) {
- case SIOCINQ: {
- struct sk_buff *skb;
- unsigned long amount = 0;
-
- skb = skb_peek(&sk->sk_receive_queue);
- if (skb != NULL) {
- /*
- * We will only return the amount of this packet since
- * that is all that will be read.
- */
- amount = skb->len;
- }
- rc = put_user(amount, (int __user *)arg);
- }
- break;
- default:
- rc = -ENOIOCTLCMD;
- break;
- }
-out:
- release_sock(sk);
- return rc;
-}
-
-EXPORT_SYMBOL_GPL(dccp_ioctl);
-
-static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
- char __user *optval, unsigned int optlen)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct dccp_service_list *sl = NULL;
-
- if (service == DCCP_SERVICE_INVALID_VALUE ||
- optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32))
- return -EINVAL;
-
- if (optlen > sizeof(service)) {
- sl = kmalloc(optlen, GFP_KERNEL);
- if (sl == NULL)
- return -ENOMEM;
-
- sl->dccpsl_nr = optlen / sizeof(u32) - 1;
- if (copy_from_user(sl->dccpsl_list,
- optval + sizeof(service),
- optlen - sizeof(service)) ||
- dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) {
- kfree(sl);
- return -EFAULT;
- }
- }
-
- lock_sock(sk);
- dp->dccps_service = service;
-
- kfree(dp->dccps_service_list);
-
- dp->dccps_service_list = sl;
- release_sock(sk);
- return 0;
-}
-
-static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
-{
- u8 *list, len;
- int i, rc;
-
- if (cscov < 0 || cscov > 15)
- return -EINVAL;
- /*
- * Populate a list of permissible values, in the range cscov...15. This
- * is necessary since feature negotiation of single values only works if
- * both sides incidentally choose the same value. Since the list starts
- * lowest-value first, negotiation will pick the smallest shared value.
- */
- if (cscov == 0)
- return 0;
- len = 16 - cscov;
-
- list = kmalloc(len, GFP_KERNEL);
- if (list == NULL)
- return -ENOBUFS;
-
- for (i = 0; i < len; i++)
- list[i] = cscov++;
-
- rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);
-
- if (rc == 0) {
- if (rx)
- dccp_sk(sk)->dccps_pcrlen = cscov;
- else
- dccp_sk(sk)->dccps_pcslen = cscov;
- }
- kfree(list);
- return rc;
-}
-
-static int dccp_setsockopt_ccid(struct sock *sk, int type,
- char __user *optval, unsigned int optlen)
-{
- u8 *val;
- int rc = 0;
-
- if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS)
- return -EINVAL;
-
- val = memdup_user(optval, optlen);
- if (IS_ERR(val))
- return PTR_ERR(val);
-
- lock_sock(sk);
- if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID)
- rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen);
-
- if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID))
- rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen);
- release_sock(sk);
-
- kfree(val);
- return rc;
-}
-
-static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- int val, err = 0;
-
- switch (optname) {
- case DCCP_SOCKOPT_PACKET_SIZE:
- DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
- return 0;
- case DCCP_SOCKOPT_CHANGE_L:
- case DCCP_SOCKOPT_CHANGE_R:
- DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
- return 0;
- case DCCP_SOCKOPT_CCID:
- case DCCP_SOCKOPT_RX_CCID:
- case DCCP_SOCKOPT_TX_CCID:
- return dccp_setsockopt_ccid(sk, optname, optval, optlen);
- }
-
- if (optlen < (int)sizeof(int))
- return -EINVAL;
-
- if (get_user(val, (int __user *)optval))
- return -EFAULT;
-
- if (optname == DCCP_SOCKOPT_SERVICE)
- return dccp_setsockopt_service(sk, val, optval, optlen);
-
- lock_sock(sk);
- switch (optname) {
- case DCCP_SOCKOPT_SERVER_TIMEWAIT:
- if (dp->dccps_role != DCCP_ROLE_SERVER)
- err = -EOPNOTSUPP;
- else
- dp->dccps_server_timewait = (val != 0);
- break;
- case DCCP_SOCKOPT_SEND_CSCOV:
- err = dccp_setsockopt_cscov(sk, val, false);
- break;
- case DCCP_SOCKOPT_RECV_CSCOV:
- err = dccp_setsockopt_cscov(sk, val, true);
- break;
- case DCCP_SOCKOPT_QPOLICY_ID:
- if (sk->sk_state != DCCP_CLOSED)
- err = -EISCONN;
- else if (val < 0 || val >= DCCPQ_POLICY_MAX)
- err = -EINVAL;
- else
- dp->dccps_qpolicy = val;
- break;
- case DCCP_SOCKOPT_QPOLICY_TXQLEN:
- if (val < 0)
- err = -EINVAL;
- else
- dp->dccps_tx_qlen = val;
- break;
- default:
- err = -ENOPROTOOPT;
- break;
- }
- release_sock(sk);
-
- return err;
-}
-
-int dccp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- if (level != SOL_DCCP)
- return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
- optname, optval,
- optlen);
- return do_dccp_setsockopt(sk, level, optname, optval, optlen);
-}
-
-EXPORT_SYMBOL_GPL(dccp_setsockopt);
-
-#ifdef CONFIG_COMPAT
-int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- if (level != SOL_DCCP)
- return inet_csk_compat_setsockopt(sk, level, optname,
- optval, optlen);
- return do_dccp_setsockopt(sk, level, optname, optval, optlen);
-}
-
-EXPORT_SYMBOL_GPL(compat_dccp_setsockopt);
-#endif
-
-static int dccp_getsockopt_service(struct sock *sk, int len,
- __be32 __user *optval,
- int __user *optlen)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
- const struct dccp_service_list *sl;
- int err = -ENOENT, slen = 0, total_len = sizeof(u32);
-
- lock_sock(sk);
- if ((sl = dp->dccps_service_list) != NULL) {
- slen = sl->dccpsl_nr * sizeof(u32);
- total_len += slen;
- }
-
- err = -EINVAL;
- if (total_len > len)
- goto out;
-
- err = 0;
- if (put_user(total_len, optlen) ||
- put_user(dp->dccps_service, optval) ||
- (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen)))
- err = -EFAULT;
-out:
- release_sock(sk);
- return err;
-}
-
-static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- struct dccp_sock *dp;
- int val, len;
-
- if (get_user(len, optlen))
- return -EFAULT;
-
- if (len < (int)sizeof(int))
- return -EINVAL;
-
- dp = dccp_sk(sk);
-
- switch (optname) {
- case DCCP_SOCKOPT_PACKET_SIZE:
- DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
- return 0;
- case DCCP_SOCKOPT_SERVICE:
- return dccp_getsockopt_service(sk, len,
- (__be32 __user *)optval, optlen);
- case DCCP_SOCKOPT_GET_CUR_MPS:
- val = dp->dccps_mss_cache;
- break;
- case DCCP_SOCKOPT_AVAILABLE_CCIDS:
- return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
- case DCCP_SOCKOPT_TX_CCID:
- val = ccid_get_current_tx_ccid(dp);
- if (val < 0)
- return -ENOPROTOOPT;
- break;
- case DCCP_SOCKOPT_RX_CCID:
- val = ccid_get_current_rx_ccid(dp);
- if (val < 0)
- return -ENOPROTOOPT;
- break;
- case DCCP_SOCKOPT_SERVER_TIMEWAIT:
- val = dp->dccps_server_timewait;
- break;
- case DCCP_SOCKOPT_SEND_CSCOV:
- val = dp->dccps_pcslen;
- break;
- case DCCP_SOCKOPT_RECV_CSCOV:
- val = dp->dccps_pcrlen;
- break;
- case DCCP_SOCKOPT_QPOLICY_ID:
- val = dp->dccps_qpolicy;
- break;
- case DCCP_SOCKOPT_QPOLICY_TXQLEN:
- val = dp->dccps_tx_qlen;
- break;
- case 128 ... 191:
- return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
- len, (u32 __user *)optval, optlen);
- case 192 ... 255:
- return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname,
- len, (u32 __user *)optval, optlen);
- default:
- return -ENOPROTOOPT;
- }
-
- len = sizeof(val);
- if (put_user(len, optlen) || copy_to_user(optval, &val, len))
- return -EFAULT;
-
- return 0;
-}
-
-int dccp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- if (level != SOL_DCCP)
- return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
- optname, optval,
- optlen);
- return do_dccp_getsockopt(sk, level, optname, optval, optlen);
-}
-
-EXPORT_SYMBOL_GPL(dccp_getsockopt);
-
-#ifdef CONFIG_COMPAT
-int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- if (level != SOL_DCCP)
- return inet_csk_compat_getsockopt(sk, level, optname,
- optval, optlen);
- return do_dccp_getsockopt(sk, level, optname, optval, optlen);
-}
-
-EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
-#endif
-
-static int dccp_msghdr_parse(struct msghdr *msg, struct sk_buff *skb)
-{
- struct cmsghdr *cmsg;
-
- /*
- * Assign an (opaque) qpolicy priority value to skb->priority.
- *
- * We are overloading this skb field for use with the qpolicy subystem.
- * The skb->priority is normally used for the SO_PRIORITY option, which
- * is initialised from sk_priority. Since the assignment of sk_priority
- * to skb->priority happens later (on layer 3), we overload this field
- * for use with queueing priorities as long as the skb is on layer 4.
- * The default priority value (if nothing is set) is 0.
- */
- skb->priority = 0;
-
- for_each_cmsghdr(cmsg, msg) {
- if (!CMSG_OK(msg, cmsg))
- return -EINVAL;
-
- if (cmsg->cmsg_level != SOL_DCCP)
- continue;
-
- if (cmsg->cmsg_type <= DCCP_SCM_QPOLICY_MAX &&
- !dccp_qpolicy_param_ok(skb->sk, cmsg->cmsg_type))
- return -EINVAL;
-
- switch (cmsg->cmsg_type) {
- case DCCP_SCM_PRIORITY:
- if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u32)))
- return -EINVAL;
- skb->priority = *(__u32 *)CMSG_DATA(cmsg);
- break;
- default:
- return -EINVAL;
- }
- }
- return 0;
-}
-
-int dccp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
-{
- const struct dccp_sock *dp = dccp_sk(sk);
- const int flags = msg->msg_flags;
- const int noblock = flags & MSG_DONTWAIT;
- struct sk_buff *skb;
- int rc, size;
- long timeo;
-
- trace_dccp_probe(sk, len);
-
- if (len > dp->dccps_mss_cache)
- return -EMSGSIZE;
-
- lock_sock(sk);
-
- if (dccp_qpolicy_full(sk)) {
- rc = -EAGAIN;
- goto out_release;
- }
-
- timeo = sock_sndtimeo(sk, noblock);
-
- /*
- * We have to use sk_stream_wait_connect here to set sk_write_pending,
- * so that the trick in dccp_rcv_request_sent_state_process.
- */
- /* Wait for a connection to finish. */
- if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
- if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
- goto out_release;
-
- size = sk->sk_prot->max_header + len;
- release_sock(sk);
- skb = sock_alloc_send_skb(sk, size, noblock, &rc);
- lock_sock(sk);
- if (skb == NULL)
- goto out_release;
-
- if (sk->sk_state == DCCP_CLOSED) {
- rc = -ENOTCONN;
- goto out_discard;
- }
-
- skb_reserve(skb, sk->sk_prot->max_header);
- rc = memcpy_from_msg(skb_put(skb, len), msg, len);
- if (rc != 0)
- goto out_discard;
-
- rc = dccp_msghdr_parse(msg, skb);
- if (rc != 0)
- goto out_discard;
-
- dccp_qpolicy_push(sk, skb);
- /*
- * The xmit_timer is set if the TX CCID is rate-based and will expire
- * when congestion control permits to release further packets into the
- * network. Window-based CCIDs do not use this timer.
- */
- if (!timer_pending(&dp->dccps_xmit_timer))
- dccp_write_xmit(sk);
-out_release:
- release_sock(sk);
- return rc ? : len;
-out_discard:
- kfree_skb(skb);
- goto out_release;
-}
-
-EXPORT_SYMBOL_GPL(dccp_sendmsg);
-
-int dccp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
- int flags, int *addr_len)
-{
- const struct dccp_hdr *dh;
- long timeo;
-
- lock_sock(sk);
-
- if (sk->sk_state == DCCP_LISTEN) {
- len = -ENOTCONN;
- goto out;
- }
-
- timeo = sock_rcvtimeo(sk, nonblock);
-
- do {
- struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
-
- if (skb == NULL)
- goto verify_sock_status;
-
- dh = dccp_hdr(skb);
-
- switch (dh->dccph_type) {
- case DCCP_PKT_DATA:
- case DCCP_PKT_DATAACK:
- goto found_ok_skb;
-
- case DCCP_PKT_CLOSE:
- case DCCP_PKT_CLOSEREQ:
- if (!(flags & MSG_PEEK))
- dccp_finish_passive_close(sk);
- /* fall through */
- case DCCP_PKT_RESET:
- dccp_pr_debug("found fin (%s) ok!\n",
- dccp_packet_name(dh->dccph_type));
- len = 0;
- goto found_fin_ok;
- default:
- dccp_pr_debug("packet_type=%s\n",
- dccp_packet_name(dh->dccph_type));
- sk_eat_skb(sk, skb);
- }
-verify_sock_status:
- if (sock_flag(sk, SOCK_DONE)) {
- len = 0;
- break;
- }
-
- if (sk->sk_err) {
- len = sock_error(sk);
- break;
- }
-
- if (sk->sk_shutdown & RCV_SHUTDOWN) {
- len = 0;
- break;
- }
-
- if (sk->sk_state == DCCP_CLOSED) {
- if (!sock_flag(sk, SOCK_DONE)) {
- /* This occurs when user tries to read
- * from never connected socket.
- */
- len = -ENOTCONN;
- break;
- }
- len = 0;
- break;
- }
-
- if (!timeo) {
- len = -EAGAIN;
- break;
- }
-
- if (signal_pending(current)) {
- len = sock_intr_errno(timeo);
- break;
- }
-
- sk_wait_data(sk, &timeo, NULL);
- continue;
- found_ok_skb:
- if (len > skb->len)
- len = skb->len;
- else if (len < skb->len)
- msg->msg_flags |= MSG_TRUNC;
-
- if (skb_copy_datagram_msg(skb, 0, msg, len)) {
- /* Exception. Bailout! */
- len = -EFAULT;
- break;
- }
- if (flags & MSG_TRUNC)
- len = skb->len;
- found_fin_ok:
- if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
- break;
- } while (1);
-out:
- release_sock(sk);
- return len;
-}
-
-EXPORT_SYMBOL_GPL(dccp_recvmsg);
-
-int inet_dccp_listen(struct socket *sock, int backlog)
-{
- struct sock *sk = sock->sk;
- unsigned char old_state;
- int err;
-
- lock_sock(sk);
-
- err = -EINVAL;
- if (sock->state != SS_UNCONNECTED || sock->type != SOCK_DCCP)
- goto out;
-
- old_state = sk->sk_state;
- if (!((1 << old_state) & (DCCPF_CLOSED | DCCPF_LISTEN)))
- goto out;
-
- /* Really, if the socket is already in listen state
- * we can only allow the backlog to be adjusted.
- */
- if (old_state != DCCP_LISTEN) {
- /*
- * FIXME: here it probably should be sk->sk_prot->listen_start
- * see tcp_listen_start
- */
- err = dccp_listen_start(sk, backlog);
- if (err)
- goto out;
- }
- sk->sk_max_ack_backlog = backlog;
- err = 0;
-
-out:
- release_sock(sk);
- return err;
-}
-
-EXPORT_SYMBOL_GPL(inet_dccp_listen);
-
-static void dccp_terminate_connection(struct sock *sk)
-{
- u8 next_state = DCCP_CLOSED;
-
- switch (sk->sk_state) {
- case DCCP_PASSIVE_CLOSE:
- case DCCP_PASSIVE_CLOSEREQ:
- dccp_finish_passive_close(sk);
- break;
- case DCCP_PARTOPEN:
- dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk);
- inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
- /* fall through */
- case DCCP_OPEN:
- dccp_send_close(sk, 1);
-
- if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER &&
- !dccp_sk(sk)->dccps_server_timewait)
- next_state = DCCP_ACTIVE_CLOSEREQ;
- else
- next_state = DCCP_CLOSING;
- /* fall through */
- default:
- dccp_set_state(sk, next_state);
- }
-}
-
-void dccp_close(struct sock *sk, long timeout)
-{
- struct dccp_sock *dp = dccp_sk(sk);
- struct sk_buff *skb;
- u32 data_was_unread = 0;
- int state;
-
- lock_sock(sk);
-
- sk->sk_shutdown = SHUTDOWN_MASK;
-
- if (sk->sk_state == DCCP_LISTEN) {
- dccp_set_state(sk, DCCP_CLOSED);
-
- /* Special case. */
- inet_csk_listen_stop(sk);
-
- goto adjudge_to_death;
- }
-
- sk_stop_timer(sk, &dp->dccps_xmit_timer);
-
- /*
- * We need to flush the recv. buffs. We do this only on the
- * descriptor close, not protocol-sourced closes, because the
- *reader process may not have drained the data yet!
- */
- while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
- data_was_unread += skb->len;
- __kfree_skb(skb);
- }
-
- /* If socket has been already reset kill it. */
- if (sk->sk_state == DCCP_CLOSED)
- goto adjudge_to_death;
-
- if (data_was_unread) {
- /* Unread data was tossed, send an appropriate Reset Code */
- DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread);
- dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
- dccp_set_state(sk, DCCP_CLOSED);
- } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
- /* Check zero linger _after_ checking for unread data. */
- sk->sk_prot->disconnect(sk, 0);
- } else if (sk->sk_state != DCCP_CLOSED) {
- /*
- * Normal connection termination. May need to wait if there are
- * still packets in the TX queue that are delayed by the CCID.
- */
- dccp_flush_write_queue(sk, &timeout);
- dccp_terminate_connection(sk);
- }
-
- /*
- * Flush write queue. This may be necessary in several cases:
- * - we have been closed by the peer but still have application data;
- * - abortive termination (unread data or zero linger time),
- * - normal termination but queue could not be flushed within time limit
- */
- __skb_queue_purge(&sk->sk_write_queue);
-
- sk_stream_wait_close(sk, timeout);
-
-adjudge_to_death:
- state = sk->sk_state;
- sock_hold(sk);
- sock_orphan(sk);
-
- /*
- * It is the last release_sock in its life. It will remove backlog.
- */
- release_sock(sk);
- /*
- * Now socket is owned by kernel and we acquire BH lock
- * to finish close. No need to check for user refs.
- */
- local_bh_disable();
- bh_lock_sock(sk);
- WARN_ON(sock_owned_by_user(sk));
-
- percpu_counter_inc(sk->sk_prot->orphan_count);
-
- /* Have we already been destroyed by a softirq or backlog? */
- if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
- goto out;
-
- if (sk->sk_state == DCCP_CLOSED)
- inet_csk_destroy_sock(sk);
-
- /* Otherwise, socket is reprieved until protocol close. */
-
-out:
- bh_unlock_sock(sk);
- local_bh_enable();
- sock_put(sk);
-}
-
-EXPORT_SYMBOL_GPL(dccp_close);
-
-void dccp_shutdown(struct sock *sk, int how)
-{
- dccp_pr_debug("called shutdown(%x)\n", how);
-}
-
-EXPORT_SYMBOL_GPL(dccp_shutdown);
-
-static inline int __init dccp_mib_init(void)
-{
- dccp_statistics = alloc_percpu(struct dccp_mib);
- if (!dccp_statistics)
- return -ENOMEM;
- return 0;
-}
-
-static inline void dccp_mib_exit(void)
-{
- free_percpu(dccp_statistics);
-}
-
-static int thash_entries;
-module_param(thash_entries, int, 0444);
-MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
-
-#ifdef CONFIG_IP_DCCP_DEBUG
-bool dccp_debug;
-module_param(dccp_debug, bool, 0644);
-MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
-
-EXPORT_SYMBOL_GPL(dccp_debug);
-#endif
-
-static int __init dccp_init(void)
-{
- unsigned long goal;
- int ehash_order, bhash_order, i;
- int rc;
-
- BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
- FIELD_SIZEOF(struct sk_buff, cb));
- rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL);
- if (rc)
- goto out_fail;
- rc = -ENOBUFS;
- inet_hashinfo_init(&dccp_hashinfo);
- dccp_hashinfo.bind_bucket_cachep =
- kmem_cache_create("dccp_bind_bucket",
- sizeof(struct inet_bind_bucket), 0,
- SLAB_HWCACHE_ALIGN, NULL);
- if (!dccp_hashinfo.bind_bucket_cachep)
- goto out_free_percpu;
-
- /*
- * Size and allocate the main established and bind bucket
- * hash tables.
- *
- * The methodology is similar to that of the buffer cache.
- */
- if (totalram_pages >= (128 * 1024))
- goal = totalram_pages >> (21 - PAGE_SHIFT);
- else
- goal = totalram_pages >> (23 - PAGE_SHIFT);
-
- if (thash_entries)
- goal = (thash_entries *
- sizeof(struct inet_ehash_bucket)) >> PAGE_SHIFT;
- for (ehash_order = 0; (1UL << ehash_order) < goal; ehash_order++)
- ;
- do {
- unsigned long hash_size = (1UL << ehash_order) * PAGE_SIZE /
- sizeof(struct inet_ehash_bucket);
-
- while (hash_size & (hash_size - 1))
- hash_size--;
- dccp_hashinfo.ehash_mask = hash_size - 1;
- dccp_hashinfo.ehash = (struct inet_ehash_bucket *)
- __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, ehash_order);
- } while (!dccp_hashinfo.ehash && --ehash_order > 0);
-
- if (!dccp_hashinfo.ehash) {
- DCCP_CRIT("Failed to allocate DCCP established hash table");
- goto out_free_bind_bucket_cachep;
- }
-
- for (i = 0; i <= dccp_hashinfo.ehash_mask; i++)
- INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i);
-
- if (inet_ehash_locks_alloc(&dccp_hashinfo))
- goto out_free_dccp_ehash;
-
- bhash_order = ehash_order;
-
- do {
- dccp_hashinfo.bhash_size = (1UL << bhash_order) * PAGE_SIZE /
- sizeof(struct inet_bind_hashbucket);
- if ((dccp_hashinfo.bhash_size > (64 * 1024)) &&
- bhash_order > 0)
- continue;
- dccp_hashinfo.bhash = (struct inet_bind_hashbucket *)
- __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, bhash_order);
- } while (!dccp_hashinfo.bhash && --bhash_order >= 0);
-
- if (!dccp_hashinfo.bhash) {
- DCCP_CRIT("Failed to allocate DCCP bind hash table");
- goto out_free_dccp_locks;
- }
-
- for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
- spin_lock_init(&dccp_hashinfo.bhash[i].lock);
- INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
- }
-
- rc = dccp_mib_init();
- if (rc)
- goto out_free_dccp_bhash;
-
- rc = dccp_ackvec_init();
- if (rc)
- goto out_free_dccp_mib;
-
- rc = dccp_sysctl_init();
- if (rc)
- goto out_ackvec_exit;
-
- rc = ccid_initialize_builtins();
- if (rc)
- goto out_sysctl_exit;
-
- dccp_timestamping_init();
-
- return 0;
-
-out_sysctl_exit:
- dccp_sysctl_exit();
-out_ackvec_exit:
- dccp_ackvec_exit();
-out_free_dccp_mib:
- dccp_mib_exit();
-out_free_dccp_bhash:
- free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
-out_free_dccp_locks:
- inet_ehash_locks_free(&dccp_hashinfo);
-out_free_dccp_ehash:
- free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
-out_free_bind_bucket_cachep:
- kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
-out_free_percpu:
- percpu_counter_destroy(&dccp_orphan_count);
-out_fail:
- dccp_hashinfo.bhash = NULL;
- dccp_hashinfo.ehash = NULL;
- dccp_hashinfo.bind_bucket_cachep = NULL;
- return rc;
-}
-
-static void __exit dccp_fini(void)
-{
- ccid_cleanup_builtins();
- dccp_mib_exit();
- free_pages((unsigned long)dccp_hashinfo.bhash,
- get_order(dccp_hashinfo.bhash_size *
- sizeof(struct inet_bind_hashbucket)));
- free_pages((unsigned long)dccp_hashinfo.ehash,
- get_order((dccp_hashinfo.ehash_mask + 1) *
- sizeof(struct inet_ehash_bucket)));
- inet_ehash_locks_free(&dccp_hashinfo);
- kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
- dccp_ackvec_exit();
- dccp_sysctl_exit();
- percpu_counter_destroy(&dccp_orphan_count);
-}
-
-module_init(dccp_init);
-module_exit(dccp_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
-MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");
diff --git a/net/dccp/qpolicy.c b/net/dccp/qpolicy.c
deleted file mode 100644
index 63c30bfa4703..000000000000
--- a/net/dccp/qpolicy.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * net/dccp/qpolicy.c
- *
- * Policy-based packet dequeueing interface for DCCP.
- *
- * Copyright (c) 2008 Tomasz Grobelny <tomasz@grobelny.oswiecenia.net>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License v2
- * as published by the Free Software Foundation.
- */
-#include "dccp.h"
-
-/*
- * Simple Dequeueing Policy:
- * If tx_qlen is different from 0, enqueue up to tx_qlen elements.
- */
-static void qpolicy_simple_push(struct sock *sk, struct sk_buff *skb)
-{
- skb_queue_tail(&sk->sk_write_queue, skb);
-}
-
-static bool qpolicy_simple_full(struct sock *sk)
-{
- return dccp_sk(sk)->dccps_tx_qlen &&
- sk->sk_write_queue.qlen >= dccp_sk(sk)->dccps_tx_qlen;
-}
-
-static struct sk_buff *qpolicy_simple_top(struct sock *sk)
-{
- return skb_peek(&sk->sk_write_queue);
-}
-
-/*
- * Priority-based Dequeueing Policy:
- * If tx_qlen is different from 0 and the queue has reached its upper bound
- * of tx_qlen elements, replace older packets lowest-priority-first.
- */
-static struct sk_buff *qpolicy_prio_best_skb(struct sock *sk)
-{
- struct sk_buff *skb, *best = NULL;
-
- skb_queue_walk(&sk->sk_write_queue, skb)
- if (best == NULL || skb->priority > best->priority)
- best = skb;
- return best;
-}
-
-static struct sk_buff *qpolicy_prio_worst_skb(struct sock *sk)
-{
- struct sk_buff *skb, *worst = NULL;
-
- skb_queue_walk(&sk->sk_write_queue, skb)
- if (worst == NULL || skb->priority < worst->priority)
- worst = skb;
- return worst;
-}
-
-static bool qpolicy_prio_full(struct sock *sk)
-{
- if (qpolicy_simple_full(sk))
- dccp_qpolicy_drop(sk, qpolicy_prio_worst_skb(sk));
- return false;
-}
-
-/**
- * struct dccp_qpolicy_operations - TX Packet Dequeueing Interface
- * @push: add a new @skb to the write queue
- * @full: indicates that no more packets will be admitted
- * @top: peeks at whatever the queueing policy defines as its `top'
- */
-static struct dccp_qpolicy_operations {
- void (*push) (struct sock *sk, struct sk_buff *skb);
- bool (*full) (struct sock *sk);
- struct sk_buff* (*top) (struct sock *sk);
- __be32 params;
-
-} qpol_table[DCCPQ_POLICY_MAX] = {
- [DCCPQ_POLICY_SIMPLE] = {
- .push = qpolicy_simple_push,
- .full = qpolicy_simple_full,
- .top = qpolicy_simple_top,
- .params = 0,
- },
- [DCCPQ_POLICY_PRIO] = {
- .push = qpolicy_simple_push,
- .full = qpolicy_prio_full,
- .top = qpolicy_prio_best_skb,
- .params = DCCP_SCM_PRIORITY,
- },
-};
-
-/*
- * Externally visible interface
- */
-void dccp_qpolicy_push(struct sock *sk, struct sk_buff *skb)
-{
- qpol_table[dccp_sk(sk)->dccps_qpolicy].push(sk, skb);
-}
-
-bool dccp_qpolicy_full(struct sock *sk)
-{
- return qpol_table[dccp_sk(sk)->dccps_qpolicy].full(sk);
-}
-
-void dccp_qpolicy_drop(struct sock *sk, struct sk_buff *skb)
-{
- if (skb != NULL) {
- skb_unlink(skb, &sk->sk_write_queue);
- kfree_skb(skb);
- }
-}
-
-struct sk_buff *dccp_qpolicy_top(struct sock *sk)
-{
- return qpol_table[dccp_sk(sk)->dccps_qpolicy].top(sk);
-}
-
-struct sk_buff *dccp_qpolicy_pop(struct sock *sk)
-{
- struct sk_buff *skb = dccp_qpolicy_top(sk);
-
- if (skb != NULL) {
- /* Clear any skb fields that we used internally */
- skb->priority = 0;
- skb_unlink(skb, &sk->sk_write_queue);
- }
- return skb;
-}
-
-bool dccp_qpolicy_param_ok(struct sock *sk, __be32 param)
-{
- /* check if exactly one bit is set */
- if (!param || (param & (param - 1)))
- return false;
- return (qpol_table[dccp_sk(sk)->dccps_qpolicy].params & param) == param;
-}
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
deleted file mode 100644
index 53731e45403c..000000000000
--- a/net/dccp/sysctl.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * net/dccp/sysctl.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@mandriva.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License v2
- * as published by the Free Software Foundation.
- */
-
-#include <linux/mm.h>
-#include <linux/sysctl.h>
-#include "dccp.h"
-#include "feat.h"
-
-#ifndef CONFIG_SYSCTL
-#error This file should not be compiled without CONFIG_SYSCTL defined
-#endif
-
-/* Boundary values */
-static int zero = 0,
- one = 1,
- u8_max = 0xFF;
-static unsigned long seqw_min = DCCPF_SEQ_WMIN,
- seqw_max = 0xFFFFFFFF; /* maximum on 32 bit */
-
-static struct ctl_table dccp_default_table[] = {
- {
- .procname = "seq_window",
- .data = &sysctl_dccp_sequence_window,
- .maxlen = sizeof(sysctl_dccp_sequence_window),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- .extra1 = &seqw_min, /* RFC 4340, 7.5.2 */
- .extra2 = &seqw_max,
- },
- {
- .procname = "rx_ccid",
- .data = &sysctl_dccp_rx_ccid,
- .maxlen = sizeof(sysctl_dccp_rx_ccid),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &u8_max, /* RFC 4340, 10. */
- },
- {
- .procname = "tx_ccid",
- .data = &sysctl_dccp_tx_ccid,
- .maxlen = sizeof(sysctl_dccp_tx_ccid),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &u8_max, /* RFC 4340, 10. */
- },
- {
- .procname = "request_retries",
- .data = &sysctl_dccp_request_retries,
- .maxlen = sizeof(sysctl_dccp_request_retries),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
- .extra2 = &u8_max,
- },
- {
- .procname = "retries1",
- .data = &sysctl_dccp_retries1,
- .maxlen = sizeof(sysctl_dccp_retries1),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &u8_max,
- },
- {
- .procname = "retries2",
- .data = &sysctl_dccp_retries2,
- .maxlen = sizeof(sysctl_dccp_retries2),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &u8_max,
- },
- {
- .procname = "tx_qlen",
- .data = &sysctl_dccp_tx_qlen,
- .maxlen = sizeof(sysctl_dccp_tx_qlen),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- },
- {
- .procname = "sync_ratelimit",
- .data = &sysctl_dccp_sync_ratelimit,
- .maxlen = sizeof(sysctl_dccp_sync_ratelimit),
- .mode = 0644,
- .proc_handler = proc_dointvec_ms_jiffies,
- },
-
- { }
-};
-
-static struct ctl_table_header *dccp_table_header;
-
-int __init dccp_sysctl_init(void)
-{
- dccp_table_header = register_net_sysctl(&init_net, "net/dccp/default",
- dccp_default_table);
-
- return dccp_table_header != NULL ? 0 : -ENOMEM;
-}
-
-void dccp_sysctl_exit(void)
-{
- if (dccp_table_header != NULL) {
- unregister_net_sysctl_table(dccp_table_header);
- dccp_table_header = NULL;
- }
-}
diff --git a/net/dccp/timer.c b/net/dccp/timer.c
deleted file mode 100644
index 1501a20a94ca..000000000000
--- a/net/dccp/timer.c
+++ /dev/null
@@ -1,275 +0,0 @@
-/*
- * net/dccp/timer.c
- *
- * An implementation of the DCCP protocol
- * Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/dccp.h>
-#include <linux/skbuff.h>
-#include <linux/export.h>
-
-#include "dccp.h"
-
-/* sysctl variables governing numbers of retransmission attempts */
-int sysctl_dccp_request_retries __read_mostly = TCP_SYN_RETRIES;
-int sysctl_dccp_retries1 __read_mostly = TCP_RETR1;
-int sysctl_dccp_retries2 __read_mostly = TCP_RETR2;
-
-static void dccp_write_err(struct sock *sk)
-{
- sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
- sk->sk_error_report(sk);
-
- dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
- dccp_done(sk);
- __DCCP_INC_STATS(DCCP_MIB_ABORTONTIMEOUT);
-}
-
-/* A write timeout has occurred. Process the after effects. */
-static int dccp_write_timeout(struct sock *sk)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- int retry_until;
-
- if (sk->sk_state == DCCP_REQUESTING || sk->sk_state == DCCP_PARTOPEN) {
- if (icsk->icsk_retransmits != 0)
- dst_negative_advice(sk);
- retry_until = icsk->icsk_syn_retries ?
- : sysctl_dccp_request_retries;
- } else {
- if (icsk->icsk_retransmits >= sysctl_dccp_retries1) {
- /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu
- black hole detection. :-(
-
- It is place to make it. It is not made. I do not want
- to make it. It is disguisting. It does not work in any
- case. Let me to cite the same draft, which requires for
- us to implement this:
-
- "The one security concern raised by this memo is that ICMP black holes
- are often caused by over-zealous security administrators who block
- all ICMP messages. It is vitally important that those who design and
- deploy security systems understand the impact of strict filtering on
- upper-layer protocols. The safest web site in the world is worthless
- if most TCP implementations cannot transfer data from it. It would
- be far nicer to have all of the black holes fixed rather than fixing
- all of the TCP implementations."
-
- Golden words :-).
- */
-
- dst_negative_advice(sk);
- }
-
- retry_until = sysctl_dccp_retries2;
- /*
- * FIXME: see tcp_write_timout and tcp_out_of_resources
- */
- }
-
- if (icsk->icsk_retransmits >= retry_until) {
- /* Has it gone just too far? */
- dccp_write_err(sk);
- return 1;
- }
- return 0;
-}
-
-/*
- * The DCCP retransmit timer.
- */
-static void dccp_retransmit_timer(struct sock *sk)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- /*
- * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was
- * sent, no need to retransmit, this sock is dead.
- */
- if (dccp_write_timeout(sk))
- return;
-
- /*
- * We want to know the number of packets retransmitted, not the
- * total number of retransmissions of clones of original packets.
- */
- if (icsk->icsk_retransmits == 0)
- __DCCP_INC_STATS(DCCP_MIB_TIMEOUTS);
-
- if (dccp_retransmit_skb(sk) != 0) {
- /*
- * Retransmission failed because of local congestion,
- * do not backoff.
- */
- if (--icsk->icsk_retransmits == 0)
- icsk->icsk_retransmits = 1;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- min(icsk->icsk_rto,
- TCP_RESOURCE_PROBE_INTERVAL),
- DCCP_RTO_MAX);
- return;
- }
-
- icsk->icsk_backoff++;
-
- icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX);
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto,
- DCCP_RTO_MAX);
- if (icsk->icsk_retransmits > sysctl_dccp_retries1)
- __sk_dst_reset(sk);
-}
-
-static void dccp_write_timer(struct timer_list *t)
-{
- struct inet_connection_sock *icsk =
- from_timer(icsk, t, icsk_retransmit_timer);
- struct sock *sk = &icsk->icsk_inet.sk;
- int event = 0;
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later */
- sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
- jiffies + (HZ / 20));
- goto out;
- }
-
- if (sk->sk_state == DCCP_CLOSED || !icsk->icsk_pending)
- goto out;
-
- if (time_after(icsk->icsk_timeout, jiffies)) {
- sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
- icsk->icsk_timeout);
- goto out;
- }
-
- event = icsk->icsk_pending;
- icsk->icsk_pending = 0;
-
- switch (event) {
- case ICSK_TIME_RETRANS:
- dccp_retransmit_timer(sk);
- break;
- }
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-static void dccp_keepalive_timer(struct timer_list *t)
-{
- struct sock *sk = from_timer(sk, t, sk_timer);
-
- pr_err("dccp should not use a keepalive timer !\n");
- sock_put(sk);
-}
-
-/* This is the same as tcp_delack_timer, sans prequeue & mem_reclaim stuff */
-static void dccp_delack_timer(struct timer_list *t)
-{
- struct inet_connection_sock *icsk =
- from_timer(icsk, t, icsk_delack_timer);
- struct sock *sk = &icsk->icsk_inet.sk;
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk)) {
- /* Try again later. */
- icsk->icsk_ack.blocked = 1;
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
- sk_reset_timer(sk, &icsk->icsk_delack_timer,
- jiffies + TCP_DELACK_MIN);
- goto out;
- }
-
- if (sk->sk_state == DCCP_CLOSED ||
- !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
- goto out;
- if (time_after(icsk->icsk_ack.timeout, jiffies)) {
- sk_reset_timer(sk, &icsk->icsk_delack_timer,
- icsk->icsk_ack.timeout);
- goto out;
- }
-
- icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
-
- if (inet_csk_ack_scheduled(sk)) {
- if (!icsk->icsk_ack.pingpong) {
- /* Delayed ACK missed: inflate ATO. */
- icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1,
- icsk->icsk_rto);
- } else {
- /* Delayed ACK missed: leave pingpong mode and
- * deflate ATO.
- */
- icsk->icsk_ack.pingpong = 0;
- icsk->icsk_ack.ato = TCP_ATO_MIN;
- }
- dccp_send_ack(sk);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
- }
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-/**
- * dccp_write_xmitlet - Workhorse for CCID packet dequeueing interface
- * See the comments above %ccid_dequeueing_decision for supported modes.
- */
-static void dccp_write_xmitlet(unsigned long data)
-{
- struct sock *sk = (struct sock *)data;
-
- bh_lock_sock(sk);
- if (sock_owned_by_user(sk))
- sk_reset_timer(sk, &dccp_sk(sk)->dccps_xmit_timer, jiffies + 1);
- else
- dccp_write_xmit(sk);
- bh_unlock_sock(sk);
- sock_put(sk);
-}
-
-static void dccp_write_xmit_timer(struct timer_list *t)
-{
- struct dccp_sock *dp = from_timer(dp, t, dccps_xmit_timer);
- struct sock *sk = &dp->dccps_inet_connection.icsk_inet.sk;
-
- dccp_write_xmitlet((unsigned long)sk);
-}
-
-void dccp_init_xmit_timers(struct sock *sk)
-{
- struct dccp_sock *dp = dccp_sk(sk);
-
- tasklet_init(&dp->dccps_xmitlet, dccp_write_xmitlet, (unsigned long)sk);
- timer_setup(&dp->dccps_xmit_timer, dccp_write_xmit_timer, 0);
- inet_csk_init_xmit_timers(sk, &dccp_write_timer, &dccp_delack_timer,
- &dccp_keepalive_timer);
-}
-
-static ktime_t dccp_timestamp_seed;
-/**
- * dccp_timestamp - 10s of microseconds time source
- * Returns the number of 10s of microseconds since loading DCCP. This is native
- * DCCP time difference format (RFC 4340, sec. 13).
- * Please note: This will wrap around about circa every 11.9 hours.
- */
-u32 dccp_timestamp(void)
-{
- u64 delta = (u64)ktime_us_delta(ktime_get_real(), dccp_timestamp_seed);
-
- do_div(delta, 10);
- return delta;
-}
-EXPORT_SYMBOL_GPL(dccp_timestamp);
-
-void __init dccp_timestamping_init(void)
-{
- dccp_timestamp_seed = ktime_get_real();
-}
diff --git a/net/dccp/trace.h b/net/dccp/trace.h
deleted file mode 100644
index 5062421beee9..000000000000
--- a/net/dccp/trace.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM dccp
-
-#if !defined(_TRACE_DCCP_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_DCCP_H
-
-#include <net/sock.h>
-#include "dccp.h"
-#include "ccids/ccid3.h"
-#include <linux/tracepoint.h>
-#include <trace/events/net_probe_common.h>
-
-TRACE_EVENT(dccp_probe,
-
- TP_PROTO(struct sock *sk, size_t size),
-
- TP_ARGS(sk, size),
-
- TP_STRUCT__entry(
- /* sockaddr_in6 is always bigger than sockaddr_in */
- __array(__u8, saddr, sizeof(struct sockaddr_in6))
- __array(__u8, daddr, sizeof(struct sockaddr_in6))
- __field(__u16, sport)
- __field(__u16, dport)
- __field(__u16, size)
- __field(__u16, tx_s)
- __field(__u32, tx_rtt)
- __field(__u32, tx_p)
- __field(__u32, tx_x_calc)
- __field(__u64, tx_x_recv)
- __field(__u64, tx_x)
- __field(__u32, tx_t_ipi)
- ),
-
- TP_fast_assign(
- const struct inet_sock *inet = inet_sk(sk);
- struct ccid3_hc_tx_sock *hc = NULL;
-
- if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3)
- hc = ccid3_hc_tx_sk(sk);
-
- memset(__entry->saddr, 0, sizeof(struct sockaddr_in6));
- memset(__entry->daddr, 0, sizeof(struct sockaddr_in6));
-
- TP_STORE_ADDR_PORTS(__entry, inet, sk);
-
- /* For filtering use */
- __entry->sport = ntohs(inet->inet_sport);
- __entry->dport = ntohs(inet->inet_dport);
-
- __entry->size = size;
- if (hc) {
- __entry->tx_s = hc->tx_s;
- __entry->tx_rtt = hc->tx_rtt;
- __entry->tx_p = hc->tx_p;
- __entry->tx_x_calc = hc->tx_x_calc;
- __entry->tx_x_recv = hc->tx_x_recv >> 6;
- __entry->tx_x = hc->tx_x >> 6;
- __entry->tx_t_ipi = hc->tx_t_ipi;
- } else {
- __entry->tx_s = 0;
- memset(&__entry->tx_rtt, 0, (void *)&__entry->tx_t_ipi -
- (void *)&__entry->tx_rtt +
- sizeof(__entry->tx_t_ipi));
- }
- ),
-
- TP_printk("src=%pISpc dest=%pISpc size=%d tx_s=%d tx_rtt=%d "
- "tx_p=%d tx_x_calc=%u tx_x_recv=%llu tx_x=%llu tx_t_ipi=%d",
- __entry->saddr, __entry->daddr, __entry->size,
- __entry->tx_s, __entry->tx_rtt, __entry->tx_p,
- __entry->tx_x_calc, __entry->tx_x_recv, __entry->tx_x,
- __entry->tx_t_ipi)
-);
-
-#endif /* _TRACE_TCP_H */
-
-/* This part must be outside protection */
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH .
-#undef TRACE_INCLUDE_FILE
-#define TRACE_INCLUDE_FILE trace
-#include <trace/define_trace.h>
diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig
deleted file mode 100644
index dcc74956badd..000000000000
--- a/net/decnet/Kconfig
+++ /dev/null
@@ -1,42 +0,0 @@
-#
-# DECnet configuration
-#
-config DECNET
- tristate "DECnet Support"
- ---help---
- The DECnet networking protocol was used in many products made by
- Digital (now Compaq). It provides reliable stream and sequenced
- packet communications over which run a variety of services similar
- to those which run over TCP/IP.
-
- To find some tools to use with the kernel layer support, please
- look at Patrick Caulfield's web site:
- <http://linux-decnet.sourceforge.net/>.
-
- More detailed documentation is available in
- <file:Documentation/networking/decnet.txt>.
-
- Be sure to say Y to "/proc file system support" and "Sysctl support"
- below when using DECnet, since you will need sysctl support to aid
- in configuration at run time.
-
- The DECnet code is also available as a module ( = code which can be
- inserted in and removed from the running kernel whenever you want).
- The module is called decnet.
-
-config DECNET_ROUTER
- bool "DECnet: router support"
- depends on DECNET
- select FIB_RULES
- ---help---
- Add support for turning your DECnet Endnode into a level 1 or 2
- router. This is an experimental, but functional option. If you
- do say Y here, then make sure that you also say Y to "Kernel/User
- network link driver", "Routing messages" and "Network packet
- filtering". The first two are required to allow configuration via
- rtnetlink (you will need Alexey Kuznetsov's iproute2 package
- from <ftp://ftp.tux.org/pub/net/ip-routing/>). The "Network packet
- filtering" option will be required for the forthcoming routing daemon
- to work.
-
- See <file:Documentation/networking/decnet.txt> for more information.
diff --git a/net/decnet/Makefile b/net/decnet/Makefile
deleted file mode 100644
index 07b38e441b2d..000000000000
--- a/net/decnet/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-obj-$(CONFIG_DECNET) += decnet.o
-
-decnet-y := af_decnet.o dn_nsp_in.o dn_nsp_out.o \
- dn_route.o dn_dev.o dn_neigh.o dn_timer.o
-decnet-$(CONFIG_DECNET_ROUTER) += dn_fib.o dn_rules.o dn_table.o
-decnet-y += sysctl_net_decnet.o
-
-obj-$(CONFIG_NETFILTER) += netfilter/
diff --git a/net/decnet/README b/net/decnet/README
deleted file mode 100644
index 60e7ec88c81f..000000000000
--- a/net/decnet/README
+++ /dev/null
@@ -1,8 +0,0 @@
- Linux DECnet Project
- ======================
-
-The documentation for this kernel subsystem is available in the
-Documentation/networking subdirectory of this distribution and also
-on line at http://www.chygwyn.com/DECnet/
-
-Steve Whitehouse <SteveW@ACM.org>
diff --git a/net/decnet/TODO b/net/decnet/TODO
deleted file mode 100644
index 358e9eb49016..000000000000
--- a/net/decnet/TODO
+++ /dev/null
@@ -1,40 +0,0 @@
-Steve's quick list of things that need finishing off:
-[they are in no particular order and range from the trivial to the long winded]
-
- o Proper timeouts on each neighbour (in routing mode) rather than
- just the 60 second On-Ethernet cache value.
-
- o Support for X.25 linklayer
-
- o Support for DDCMP link layer
-
- o The DDCMP device itself
-
- o PPP support (rfc1762)
-
- o Lots of testing with real applications
-
- o Verify errors etc. against POSIX 1003.1g (draft)
-
- o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g)
- [maybe this should be done at socket level... the control data in the
- send/recvmsg() calls should simply be a vector of set/getsockopt()
- calls]
-
- o check MSG_CTRUNC is set where it should be.
-
- o Find all the commonality between DECnet and IPv4 routing code and extract
- it into a small library of routines. [probably a project for 2.7.xx]
-
- o Add perfect socket hashing - an idea suggested by Paul Koning. Currently
- we have a half-way house scheme which seems to work reasonably well, but
- the full scheme is still worth implementing, its not not top of my list
- right now.
-
- o Add session control message flow control
-
- o Add NSP message flow control
-
- o DECnet sendpages() function
-
- o AIO for DECnet
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
deleted file mode 100644
index 7d6ff983ba2c..000000000000
--- a/net/decnet/af_decnet.c
+++ /dev/null
@@ -1,2411 +0,0 @@
-
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Socket Layer Interface
- *
- * Authors: Eduardo Marcelo Serrat <emserrat@geocities.com>
- * Patrick Caulfield <patrick@pandh.demon.co.uk>
- *
- * Changes:
- * Steve Whitehouse: Copied from Eduardo Serrat and Patrick Caulfield's
- * version of the code. Original copyright preserved
- * below.
- * Steve Whitehouse: Some bug fixes, cleaning up some code to make it
- * compatible with my routing layer.
- * Steve Whitehouse: Merging changes from Eduardo Serrat and Patrick
- * Caulfield.
- * Steve Whitehouse: Further bug fixes, checking module code still works
- * with new routing layer.
- * Steve Whitehouse: Additional set/get_sockopt() calls.
- * Steve Whitehouse: Fixed TIOCINQ ioctl to be same as Eduardo's new
- * code.
- * Steve Whitehouse: recvmsg() changed to try and behave in a POSIX like
- * way. Didn't manage it entirely, but its better.
- * Steve Whitehouse: ditto for sendmsg().
- * Steve Whitehouse: A selection of bug fixes to various things.
- * Steve Whitehouse: Added TIOCOUTQ ioctl.
- * Steve Whitehouse: Fixes to username2sockaddr & sockaddr2username.
- * Steve Whitehouse: Fixes to connect() error returns.
- * Patrick Caulfield: Fixes to delayed acceptance logic.
- * David S. Miller: New socket locking
- * Steve Whitehouse: Socket list hashing/locking
- * Arnaldo C. Melo: use capable, not suser
- * Steve Whitehouse: Removed unused code. Fix to use sk->allocation
- * when required.
- * Patrick Caulfield: /proc/net/decnet now has object name/number
- * Steve Whitehouse: Fixed local port allocation, hashed sk list
- * Matthew Wilcox: Fixes for dn_ioctl()
- * Steve Whitehouse: New connect/accept logic to allow timeouts and
- * prepare for sendpage etc.
- */
-
-
-/******************************************************************************
- (c) 1995-1998 E.M. Serrat emserrat@geocities.com
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
-HISTORY:
-
-Version Kernel Date Author/Comments
-------- ------ ---- ---------------
-Version 0.0.1 2.0.30 01-dic-97 Eduardo Marcelo Serrat
- (emserrat@geocities.com)
-
- First Development of DECnet Socket La-
- yer for Linux. Only supports outgoing
- connections.
-
-Version 0.0.2 2.1.105 20-jun-98 Patrick J. Caulfield
- (patrick@pandh.demon.co.uk)
-
- Port to new kernel development version.
-
-Version 0.0.3 2.1.106 25-jun-98 Eduardo Marcelo Serrat
- (emserrat@geocities.com)
- _
- Added support for incoming connections
- so we can start developing server apps
- on Linux.
- -
- Module Support
-Version 0.0.4 2.1.109 21-jul-98 Eduardo Marcelo Serrat
- (emserrat@geocities.com)
- _
- Added support for X11R6.4. Now we can
- use DECnet transport for X on Linux!!!
- -
-Version 0.0.5 2.1.110 01-aug-98 Eduardo Marcelo Serrat
- (emserrat@geocities.com)
- Removed bugs on flow control
- Removed bugs on incoming accessdata
- order
- -
-Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat
- dn_recvmsg fixes
-
- Patrick J. Caulfield
- dn_bind fixes
-*******************************************************************************/
-
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/kernel.h>
-#include <linux/sched/signal.h>
-#include <linux/timer.h>
-#include <linux/string.h>
-#include <linux/sockios.h>
-#include <linux/net.h>
-#include <linux/netdevice.h>
-#include <linux/inet.h>
-#include <linux/route.h>
-#include <linux/netfilter.h>
-#include <linux/seq_file.h>
-#include <net/sock.h>
-#include <net/tcp_states.h>
-#include <net/flow.h>
-#include <asm/ioctls.h>
-#include <linux/capability.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
-#include <linux/init.h>
-#include <linux/poll.h>
-#include <linux/jiffies.h>
-#include <net/net_namespace.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/fib_rules.h>
-#include <net/tcp.h>
-#include <net/dn.h>
-#include <net/dn_nsp.h>
-#include <net/dn_dev.h>
-#include <net/dn_route.h>
-#include <net/dn_fib.h>
-#include <net/dn_neigh.h>
-
-struct dn_sock {
- struct sock sk;
- struct dn_scp scp;
-};
-
-static void dn_keepalive(struct sock *sk);
-
-#define DN_SK_HASH_SHIFT 8
-#define DN_SK_HASH_SIZE (1 << DN_SK_HASH_SHIFT)
-#define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1)
-
-
-static const struct proto_ops dn_proto_ops;
-static DEFINE_RWLOCK(dn_hash_lock);
-static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
-static struct hlist_head dn_wild_sk;
-static atomic_long_t decnet_memory_allocated;
-
-static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen, int flags);
-static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags);
-
-static struct hlist_head *dn_find_list(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- if (scp->addr.sdn_flags & SDF_WILD)
- return hlist_empty(&dn_wild_sk) ? &dn_wild_sk : NULL;
-
- return &dn_sk_hash[le16_to_cpu(scp->addrloc) & DN_SK_HASH_MASK];
-}
-
-/*
- * Valid ports are those greater than zero and not already in use.
- */
-static int check_port(__le16 port)
-{
- struct sock *sk;
-
- if (port == 0)
- return -1;
-
- sk_for_each(sk, &dn_sk_hash[le16_to_cpu(port) & DN_SK_HASH_MASK]) {
- struct dn_scp *scp = DN_SK(sk);
- if (scp->addrloc == port)
- return -1;
- }
- return 0;
-}
-
-static unsigned short port_alloc(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-static unsigned short port = 0x2000;
- unsigned short i_port = port;
-
- while(check_port(cpu_to_le16(++port)) != 0) {
- if (port == i_port)
- return 0;
- }
-
- scp->addrloc = cpu_to_le16(port);
-
- return 1;
-}
-
-/*
- * Since this is only ever called from user
- * level, we don't need a write_lock() version
- * of this.
- */
-static int dn_hash_sock(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct hlist_head *list;
- int rv = -EUSERS;
-
- BUG_ON(sk_hashed(sk));
-
- write_lock_bh(&dn_hash_lock);
-
- if (!scp->addrloc && !port_alloc(sk))
- goto out;
-
- rv = -EADDRINUSE;
- if ((list = dn_find_list(sk)) == NULL)
- goto out;
-
- sk_add_node(sk, list);
- rv = 0;
-out:
- write_unlock_bh(&dn_hash_lock);
- return rv;
-}
-
-static void dn_unhash_sock(struct sock *sk)
-{
- write_lock(&dn_hash_lock);
- sk_del_node_init(sk);
- write_unlock(&dn_hash_lock);
-}
-
-static void dn_unhash_sock_bh(struct sock *sk)
-{
- write_lock_bh(&dn_hash_lock);
- sk_del_node_init(sk);
- write_unlock_bh(&dn_hash_lock);
-}
-
-static struct hlist_head *listen_hash(struct sockaddr_dn *addr)
-{
- int i;
- unsigned int hash = addr->sdn_objnum;
-
- if (hash == 0) {
- hash = addr->sdn_objnamel;
- for(i = 0; i < le16_to_cpu(addr->sdn_objnamel); i++) {
- hash ^= addr->sdn_objname[i];
- hash ^= (hash << 3);
- }
- }
-
- return &dn_sk_hash[hash & DN_SK_HASH_MASK];
-}
-
-/*
- * Called to transform a socket from bound (i.e. with a local address)
- * into a listening socket (doesn't need a local port number) and rehashes
- * based upon the object name/number.
- */
-static void dn_rehash_sock(struct sock *sk)
-{
- struct hlist_head *list;
- struct dn_scp *scp = DN_SK(sk);
-
- if (scp->addr.sdn_flags & SDF_WILD)
- return;
-
- write_lock_bh(&dn_hash_lock);
- sk_del_node_init(sk);
- DN_SK(sk)->addrloc = 0;
- list = listen_hash(&DN_SK(sk)->addr);
- sk_add_node(sk, list);
- write_unlock_bh(&dn_hash_lock);
-}
-
-int dn_sockaddr2username(struct sockaddr_dn *sdn, unsigned char *buf, unsigned char type)
-{
- int len = 2;
-
- *buf++ = type;
-
- switch (type) {
- case 0:
- *buf++ = sdn->sdn_objnum;
- break;
- case 1:
- *buf++ = 0;
- *buf++ = le16_to_cpu(sdn->sdn_objnamel);
- memcpy(buf, sdn->sdn_objname, le16_to_cpu(sdn->sdn_objnamel));
- len = 3 + le16_to_cpu(sdn->sdn_objnamel);
- break;
- case 2:
- memset(buf, 0, 5);
- buf += 5;
- *buf++ = le16_to_cpu(sdn->sdn_objnamel);
- memcpy(buf, sdn->sdn_objname, le16_to_cpu(sdn->sdn_objnamel));
- len = 7 + le16_to_cpu(sdn->sdn_objnamel);
- break;
- }
-
- return len;
-}
-
-/*
- * On reception of usernames, we handle types 1 and 0 for destination
- * addresses only. Types 2 and 4 are used for source addresses, but the
- * UIC, GIC are ignored and they are both treated the same way. Type 3
- * is never used as I've no idea what its purpose might be or what its
- * format is.
- */
-int dn_username2sockaddr(unsigned char *data, int len, struct sockaddr_dn *sdn, unsigned char *fmt)
-{
- unsigned char type;
- int size = len;
- int namel = 12;
-
- sdn->sdn_objnum = 0;
- sdn->sdn_objnamel = cpu_to_le16(0);
- memset(sdn->sdn_objname, 0, DN_MAXOBJL);
-
- if (len < 2)
- return -1;
-
- len -= 2;
- *fmt = *data++;
- type = *data++;
-
- switch (*fmt) {
- case 0:
- sdn->sdn_objnum = type;
- return 2;
- case 1:
- namel = 16;
- break;
- case 2:
- len -= 4;
- data += 4;
- break;
- case 4:
- len -= 8;
- data += 8;
- break;
- default:
- return -1;
- }
-
- len -= 1;
-
- if (len < 0)
- return -1;
-
- sdn->sdn_objnamel = cpu_to_le16(*data++);
- len -= le16_to_cpu(sdn->sdn_objnamel);
-
- if ((len < 0) || (le16_to_cpu(sdn->sdn_objnamel) > namel))
- return -1;
-
- memcpy(sdn->sdn_objname, data, le16_to_cpu(sdn->sdn_objnamel));
-
- return size - len;
-}
-
-struct sock *dn_sklist_find_listener(struct sockaddr_dn *addr)
-{
- struct hlist_head *list = listen_hash(addr);
- struct sock *sk;
-
- read_lock(&dn_hash_lock);
- sk_for_each(sk, list) {
- struct dn_scp *scp = DN_SK(sk);
- if (sk->sk_state != TCP_LISTEN)
- continue;
- if (scp->addr.sdn_objnum) {
- if (scp->addr.sdn_objnum != addr->sdn_objnum)
- continue;
- } else {
- if (addr->sdn_objnum)
- continue;
- if (scp->addr.sdn_objnamel != addr->sdn_objnamel)
- continue;
- if (memcmp(scp->addr.sdn_objname, addr->sdn_objname, le16_to_cpu(addr->sdn_objnamel)) != 0)
- continue;
- }
- sock_hold(sk);
- read_unlock(&dn_hash_lock);
- return sk;
- }
-
- sk = sk_head(&dn_wild_sk);
- if (sk) {
- if (sk->sk_state == TCP_LISTEN)
- sock_hold(sk);
- else
- sk = NULL;
- }
-
- read_unlock(&dn_hash_lock);
- return sk;
-}
-
-struct sock *dn_find_by_skb(struct sk_buff *skb)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct sock *sk;
- struct dn_scp *scp;
-
- read_lock(&dn_hash_lock);
- sk_for_each(sk, &dn_sk_hash[le16_to_cpu(cb->dst_port) & DN_SK_HASH_MASK]) {
- scp = DN_SK(sk);
- if (cb->src != dn_saddr2dn(&scp->peer))
- continue;
- if (cb->dst_port != scp->addrloc)
- continue;
- if (scp->addrrem && (cb->src_port != scp->addrrem))
- continue;
- sock_hold(sk);
- goto found;
- }
- sk = NULL;
-found:
- read_unlock(&dn_hash_lock);
- return sk;
-}
-
-
-
-static void dn_destruct(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- skb_queue_purge(&scp->data_xmit_queue);
- skb_queue_purge(&scp->other_xmit_queue);
- skb_queue_purge(&scp->other_receive_queue);
-
- dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
-}
-
-static unsigned long dn_memory_pressure;
-
-static void dn_enter_memory_pressure(struct sock *sk)
-{
- if (!dn_memory_pressure) {
- dn_memory_pressure = 1;
- }
-}
-
-static struct proto dn_proto = {
- .name = "NSP",
- .owner = THIS_MODULE,
- .enter_memory_pressure = dn_enter_memory_pressure,
- .memory_pressure = &dn_memory_pressure,
- .memory_allocated = &decnet_memory_allocated,
- .sysctl_mem = sysctl_decnet_mem,
- .sysctl_wmem = sysctl_decnet_wmem,
- .sysctl_rmem = sysctl_decnet_rmem,
- .max_header = DN_MAX_NSP_DATA_HEADER + 64,
- .obj_size = sizeof(struct dn_sock),
-};
-
-static struct sock *dn_alloc_sock(struct net *net, struct socket *sock, gfp_t gfp, int kern)
-{
- struct dn_scp *scp;
- struct sock *sk = sk_alloc(net, PF_DECnet, gfp, &dn_proto, kern);
-
- if (!sk)
- goto out;
-
- if (sock)
- sock->ops = &dn_proto_ops;
- sock_init_data(sock, sk);
-
- sk->sk_backlog_rcv = dn_nsp_backlog_rcv;
- sk->sk_destruct = dn_destruct;
- sk->sk_no_check_tx = 1;
- sk->sk_family = PF_DECnet;
- sk->sk_protocol = 0;
- sk->sk_allocation = gfp;
- sk->sk_sndbuf = sysctl_decnet_wmem[1];
- sk->sk_rcvbuf = sysctl_decnet_rmem[1];
-
- /* Initialization of DECnet Session Control Port */
- scp = DN_SK(sk);
- scp->state = DN_O; /* Open */
- scp->numdat = 1; /* Next data seg to tx */
- scp->numoth = 1; /* Next oth data to tx */
- scp->ackxmt_dat = 0; /* Last data seg ack'ed */
- scp->ackxmt_oth = 0; /* Last oth data ack'ed */
- scp->ackrcv_dat = 0; /* Highest data ack recv*/
- scp->ackrcv_oth = 0; /* Last oth data ack rec*/
- scp->flowrem_sw = DN_SEND;
- scp->flowloc_sw = DN_SEND;
- scp->flowrem_dat = 0;
- scp->flowrem_oth = 1;
- scp->flowloc_dat = 0;
- scp->flowloc_oth = 1;
- scp->services_rem = 0;
- scp->services_loc = 1 | NSP_FC_NONE;
- scp->info_rem = 0;
- scp->info_loc = 0x03; /* NSP version 4.1 */
- scp->segsize_rem = 230 - DN_MAX_NSP_DATA_HEADER; /* Default: Updated by remote segsize */
- scp->nonagle = 0;
- scp->multi_ireq = 1;
- scp->accept_mode = ACC_IMMED;
- scp->addr.sdn_family = AF_DECnet;
- scp->peer.sdn_family = AF_DECnet;
- scp->accessdata.acc_accl = 5;
- memcpy(scp->accessdata.acc_acc, "LINUX", 5);
-
- scp->max_window = NSP_MAX_WINDOW;
- scp->snd_window = NSP_MIN_WINDOW;
- scp->nsp_srtt = NSP_INITIAL_SRTT;
- scp->nsp_rttvar = NSP_INITIAL_RTTVAR;
- scp->nsp_rxtshift = 0;
-
- skb_queue_head_init(&scp->data_xmit_queue);
- skb_queue_head_init(&scp->other_xmit_queue);
- skb_queue_head_init(&scp->other_receive_queue);
-
- scp->persist = 0;
- scp->persist_fxn = NULL;
- scp->keepalive = 10 * HZ;
- scp->keepalive_fxn = dn_keepalive;
-
- dn_start_slow_timer(sk);
-out:
- return sk;
-}
-
-/*
- * Keepalive timer.
- * FIXME: Should respond to SO_KEEPALIVE etc.
- */
-static void dn_keepalive(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- /*
- * By checking the other_data transmit queue is empty
- * we are double checking that we are not sending too
- * many of these keepalive frames.
- */
- if (skb_queue_empty(&scp->other_xmit_queue))
- dn_nsp_send_link(sk, DN_NOCHANGE, 0);
-}
-
-
-/*
- * Timer for shutdown/destroyed sockets.
- * When socket is dead & no packets have been sent for a
- * certain amount of time, they are removed by this
- * routine. Also takes care of sending out DI & DC
- * frames at correct times.
- */
-int dn_destroy_timer(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- scp->persist = dn_nsp_persist(sk);
-
- switch (scp->state) {
- case DN_DI:
- dn_nsp_send_disc(sk, NSP_DISCINIT, 0, GFP_ATOMIC);
- if (scp->nsp_rxtshift >= decnet_di_count)
- scp->state = DN_CN;
- return 0;
-
- case DN_DR:
- dn_nsp_send_disc(sk, NSP_DISCINIT, 0, GFP_ATOMIC);
- if (scp->nsp_rxtshift >= decnet_dr_count)
- scp->state = DN_DRC;
- return 0;
-
- case DN_DN:
- if (scp->nsp_rxtshift < decnet_dn_count) {
- /* printk(KERN_DEBUG "dn_destroy_timer: DN\n"); */
- dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC,
- GFP_ATOMIC);
- return 0;
- }
- }
-
- scp->persist = (HZ * decnet_time_wait);
-
- if (sk->sk_socket)
- return 0;
-
- if (time_after_eq(jiffies, scp->stamp + HZ * decnet_time_wait)) {
- dn_unhash_sock(sk);
- sock_put(sk);
- return 1;
- }
-
- return 0;
-}
-
-static void dn_destroy_sock(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- scp->nsp_rxtshift = 0; /* reset back off */
-
- if (sk->sk_socket) {
- if (sk->sk_socket->state != SS_UNCONNECTED)
- sk->sk_socket->state = SS_DISCONNECTING;
- }
-
- sk->sk_state = TCP_CLOSE;
-
- switch (scp->state) {
- case DN_DN:
- dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC,
- sk->sk_allocation);
- scp->persist_fxn = dn_destroy_timer;
- scp->persist = dn_nsp_persist(sk);
- break;
- case DN_CR:
- scp->state = DN_DR;
- goto disc_reject;
- case DN_RUN:
- scp->state = DN_DI;
- /* fall through */
- case DN_DI:
- case DN_DR:
-disc_reject:
- dn_nsp_send_disc(sk, NSP_DISCINIT, 0, sk->sk_allocation);
- /* fall through */
- case DN_NC:
- case DN_NR:
- case DN_RJ:
- case DN_DIC:
- case DN_CN:
- case DN_DRC:
- case DN_CI:
- case DN_CD:
- scp->persist_fxn = dn_destroy_timer;
- scp->persist = dn_nsp_persist(sk);
- break;
- default:
- printk(KERN_DEBUG "DECnet: dn_destroy_sock passed socket in invalid state\n");
- /* fall through */
- case DN_O:
- dn_stop_slow_timer(sk);
-
- dn_unhash_sock_bh(sk);
- sock_put(sk);
-
- break;
- }
-}
-
-char *dn_addr2asc(__u16 addr, char *buf)
-{
- unsigned short node, area;
-
- node = addr & 0x03ff;
- area = addr >> 10;
- sprintf(buf, "%hd.%hd", area, node);
-
- return buf;
-}
-
-
-
-static int dn_create(struct net *net, struct socket *sock, int protocol,
- int kern)
-{
- struct sock *sk;
-
- if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
- return -EINVAL;
-
- if (!net_eq(net, &init_net))
- return -EAFNOSUPPORT;
-
- switch (sock->type) {
- case SOCK_SEQPACKET:
- if (protocol != DNPROTO_NSP)
- return -EPROTONOSUPPORT;
- break;
- case SOCK_STREAM:
- break;
- default:
- return -ESOCKTNOSUPPORT;
- }
-
-
- if ((sk = dn_alloc_sock(net, sock, GFP_KERNEL, kern)) == NULL)
- return -ENOBUFS;
-
- sk->sk_protocol = protocol;
-
- return 0;
-}
-
-
-static int
-dn_release(struct socket *sock)
-{
- struct sock *sk = sock->sk;
-
- if (sk) {
- sock_orphan(sk);
- sock_hold(sk);
- lock_sock(sk);
- dn_destroy_sock(sk);
- release_sock(sk);
- sock_put(sk);
- }
-
- return 0;
-}
-
-static int dn_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- struct sockaddr_dn *saddr = (struct sockaddr_dn *)uaddr;
- struct net_device *dev, *ldev;
- int rv;
-
- if (addr_len != sizeof(struct sockaddr_dn))
- return -EINVAL;
-
- if (saddr->sdn_family != AF_DECnet)
- return -EINVAL;
-
- if (le16_to_cpu(saddr->sdn_nodeaddrl) && (le16_to_cpu(saddr->sdn_nodeaddrl) != 2))
- return -EINVAL;
-
- if (le16_to_cpu(saddr->sdn_objnamel) > DN_MAXOBJL)
- return -EINVAL;
-
- if (saddr->sdn_flags & ~SDF_WILD)
- return -EINVAL;
-
- if (!capable(CAP_NET_BIND_SERVICE) && (saddr->sdn_objnum ||
- (saddr->sdn_flags & SDF_WILD)))
- return -EACCES;
-
- if (!(saddr->sdn_flags & SDF_WILD)) {
- if (le16_to_cpu(saddr->sdn_nodeaddrl)) {
- rcu_read_lock();
- ldev = NULL;
- for_each_netdev_rcu(&init_net, dev) {
- if (!dev->dn_ptr)
- continue;
- if (dn_dev_islocal(dev, dn_saddr2dn(saddr))) {
- ldev = dev;
- break;
- }
- }
- rcu_read_unlock();
- if (ldev == NULL)
- return -EADDRNOTAVAIL;
- }
- }
-
- rv = -EINVAL;
- lock_sock(sk);
- if (sock_flag(sk, SOCK_ZAPPED)) {
- memcpy(&scp->addr, saddr, addr_len);
- sock_reset_flag(sk, SOCK_ZAPPED);
-
- rv = dn_hash_sock(sk);
- if (rv)
- sock_set_flag(sk, SOCK_ZAPPED);
- }
- release_sock(sk);
-
- return rv;
-}
-
-
-static int dn_auto_bind(struct socket *sock)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- int rv;
-
- sock_reset_flag(sk, SOCK_ZAPPED);
-
- scp->addr.sdn_flags = 0;
- scp->addr.sdn_objnum = 0;
-
- /*
- * This stuff is to keep compatibility with Eduardo's
- * patch. I hope I can dispense with it shortly...
- */
- if ((scp->accessdata.acc_accl != 0) &&
- (scp->accessdata.acc_accl <= 12)) {
-
- scp->addr.sdn_objnamel = cpu_to_le16(scp->accessdata.acc_accl);
- memcpy(scp->addr.sdn_objname, scp->accessdata.acc_acc, le16_to_cpu(scp->addr.sdn_objnamel));
-
- scp->accessdata.acc_accl = 0;
- memset(scp->accessdata.acc_acc, 0, 40);
- }
- /* End of compatibility stuff */
-
- scp->addr.sdn_add.a_len = cpu_to_le16(2);
- rv = dn_dev_bind_default((__le16 *)scp->addr.sdn_add.a_addr);
- if (rv == 0) {
- rv = dn_hash_sock(sk);
- if (rv)
- sock_set_flag(sk, SOCK_ZAPPED);
- }
-
- return rv;
-}
-
-static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation)
-{
- struct dn_scp *scp = DN_SK(sk);
- DEFINE_WAIT(wait);
- int err;
-
- if (scp->state != DN_CR)
- return -EINVAL;
-
- scp->state = DN_CC;
- scp->segsize_loc = dst_metric_advmss(__sk_dst_get(sk));
- dn_send_conn_conf(sk, allocation);
-
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- for(;;) {
- release_sock(sk);
- if (scp->state == DN_CC)
- *timeo = schedule_timeout(*timeo);
- lock_sock(sk);
- err = 0;
- if (scp->state == DN_RUN)
- break;
- err = sock_error(sk);
- if (err)
- break;
- err = sock_intr_errno(*timeo);
- if (signal_pending(current))
- break;
- err = -EAGAIN;
- if (!*timeo)
- break;
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- }
- finish_wait(sk_sleep(sk), &wait);
- if (err == 0) {
- sk->sk_socket->state = SS_CONNECTED;
- } else if (scp->state != DN_CC) {
- sk->sk_socket->state = SS_UNCONNECTED;
- }
- return err;
-}
-
-static int dn_wait_run(struct sock *sk, long *timeo)
-{
- struct dn_scp *scp = DN_SK(sk);
- DEFINE_WAIT(wait);
- int err = 0;
-
- if (scp->state == DN_RUN)
- goto out;
-
- if (!*timeo)
- return -EALREADY;
-
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- for(;;) {
- release_sock(sk);
- if (scp->state == DN_CI || scp->state == DN_CC)
- *timeo = schedule_timeout(*timeo);
- lock_sock(sk);
- err = 0;
- if (scp->state == DN_RUN)
- break;
- err = sock_error(sk);
- if (err)
- break;
- err = sock_intr_errno(*timeo);
- if (signal_pending(current))
- break;
- err = -ETIMEDOUT;
- if (!*timeo)
- break;
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- }
- finish_wait(sk_sleep(sk), &wait);
-out:
- if (err == 0) {
- sk->sk_socket->state = SS_CONNECTED;
- } else if (scp->state != DN_CI && scp->state != DN_CC) {
- sk->sk_socket->state = SS_UNCONNECTED;
- }
- return err;
-}
-
-static int __dn_connect(struct sock *sk, struct sockaddr_dn *addr, int addrlen, long *timeo, int flags)
-{
- struct socket *sock = sk->sk_socket;
- struct dn_scp *scp = DN_SK(sk);
- int err = -EISCONN;
- struct flowidn fld;
- struct dst_entry *dst;
-
- if (sock->state == SS_CONNECTED)
- goto out;
-
- if (sock->state == SS_CONNECTING) {
- err = 0;
- if (scp->state == DN_RUN) {
- sock->state = SS_CONNECTED;
- goto out;
- }
- err = -ECONNREFUSED;
- if (scp->state != DN_CI && scp->state != DN_CC) {
- sock->state = SS_UNCONNECTED;
- goto out;
- }
- return dn_wait_run(sk, timeo);
- }
-
- err = -EINVAL;
- if (scp->state != DN_O)
- goto out;
-
- if (addr == NULL || addrlen != sizeof(struct sockaddr_dn))
- goto out;
- if (addr->sdn_family != AF_DECnet)
- goto out;
- if (addr->sdn_flags & SDF_WILD)
- goto out;
-
- if (sock_flag(sk, SOCK_ZAPPED)) {
- err = dn_auto_bind(sk->sk_socket);
- if (err)
- goto out;
- }
-
- memcpy(&scp->peer, addr, sizeof(struct sockaddr_dn));
-
- err = -EHOSTUNREACH;
- memset(&fld, 0, sizeof(fld));
- fld.flowidn_oif = sk->sk_bound_dev_if;
- fld.daddr = dn_saddr2dn(&scp->peer);
- fld.saddr = dn_saddr2dn(&scp->addr);
- dn_sk_ports_copy(&fld, scp);
- fld.flowidn_proto = DNPROTO_NSP;
- if (dn_route_output_sock(&sk->sk_dst_cache, &fld, sk, flags) < 0)
- goto out;
- dst = __sk_dst_get(sk);
- sk->sk_route_caps = dst->dev->features;
- sock->state = SS_CONNECTING;
- scp->state = DN_CI;
- scp->segsize_loc = dst_metric_advmss(dst);
-
- dn_nsp_send_conninit(sk, NSP_CI);
- err = -EINPROGRESS;
- if (*timeo) {
- err = dn_wait_run(sk, timeo);
- }
-out:
- return err;
-}
-
-static int dn_connect(struct socket *sock, struct sockaddr *uaddr, int addrlen, int flags)
-{
- struct sockaddr_dn *addr = (struct sockaddr_dn *)uaddr;
- struct sock *sk = sock->sk;
- int err;
- long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
-
- lock_sock(sk);
- err = __dn_connect(sk, addr, addrlen, &timeo, 0);
- release_sock(sk);
-
- return err;
-}
-
-static inline int dn_check_state(struct sock *sk, struct sockaddr_dn *addr, int addrlen, long *timeo, int flags)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- switch (scp->state) {
- case DN_RUN:
- return 0;
- case DN_CR:
- return dn_confirm_accept(sk, timeo, sk->sk_allocation);
- case DN_CI:
- case DN_CC:
- return dn_wait_run(sk, timeo);
- case DN_O:
- return __dn_connect(sk, addr, addrlen, timeo, flags);
- }
-
- return -EINVAL;
-}
-
-
-static void dn_access_copy(struct sk_buff *skb, struct accessdata_dn *acc)
-{
- unsigned char *ptr = skb->data;
-
- acc->acc_userl = *ptr++;
- memcpy(&acc->acc_user, ptr, acc->acc_userl);
- ptr += acc->acc_userl;
-
- acc->acc_passl = *ptr++;
- memcpy(&acc->acc_pass, ptr, acc->acc_passl);
- ptr += acc->acc_passl;
-
- acc->acc_accl = *ptr++;
- memcpy(&acc->acc_acc, ptr, acc->acc_accl);
-
- skb_pull(skb, acc->acc_accl + acc->acc_passl + acc->acc_userl + 3);
-
-}
-
-static void dn_user_copy(struct sk_buff *skb, struct optdata_dn *opt)
-{
- unsigned char *ptr = skb->data;
- u16 len = *ptr++; /* yes, it's 8bit on the wire */
-
- BUG_ON(len > 16); /* we've checked the contents earlier */
- opt->opt_optl = cpu_to_le16(len);
- opt->opt_status = 0;
- memcpy(opt->opt_data, ptr, len);
- skb_pull(skb, len + 1);
-}
-
-static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo)
-{
- DEFINE_WAIT(wait);
- struct sk_buff *skb = NULL;
- int err = 0;
-
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- for(;;) {
- release_sock(sk);
- skb = skb_dequeue(&sk->sk_receive_queue);
- if (skb == NULL) {
- *timeo = schedule_timeout(*timeo);
- skb = skb_dequeue(&sk->sk_receive_queue);
- }
- lock_sock(sk);
- if (skb != NULL)
- break;
- err = -EINVAL;
- if (sk->sk_state != TCP_LISTEN)
- break;
- err = sock_intr_errno(*timeo);
- if (signal_pending(current))
- break;
- err = -EAGAIN;
- if (!*timeo)
- break;
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- }
- finish_wait(sk_sleep(sk), &wait);
-
- return skb == NULL ? ERR_PTR(err) : skb;
-}
-
-static int dn_accept(struct socket *sock, struct socket *newsock, int flags,
- bool kern)
-{
- struct sock *sk = sock->sk, *newsk;
- struct sk_buff *skb = NULL;
- struct dn_skb_cb *cb;
- unsigned char menuver;
- int err = 0;
- unsigned char type;
- long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
- struct dst_entry *dst;
-
- lock_sock(sk);
-
- if (sk->sk_state != TCP_LISTEN || DN_SK(sk)->state != DN_O) {
- release_sock(sk);
- return -EINVAL;
- }
-
- skb = skb_dequeue(&sk->sk_receive_queue);
- if (skb == NULL) {
- skb = dn_wait_for_connect(sk, &timeo);
- if (IS_ERR(skb)) {
- release_sock(sk);
- return PTR_ERR(skb);
- }
- }
-
- cb = DN_SKB_CB(skb);
- sk->sk_ack_backlog--;
- newsk = dn_alloc_sock(sock_net(sk), newsock, sk->sk_allocation, kern);
- if (newsk == NULL) {
- release_sock(sk);
- kfree_skb(skb);
- return -ENOBUFS;
- }
- release_sock(sk);
-
- dst = skb_dst(skb);
- sk_dst_set(newsk, dst);
- skb_dst_set(skb, NULL);
-
- DN_SK(newsk)->state = DN_CR;
- DN_SK(newsk)->addrrem = cb->src_port;
- DN_SK(newsk)->services_rem = cb->services;
- DN_SK(newsk)->info_rem = cb->info;
- DN_SK(newsk)->segsize_rem = cb->segsize;
- DN_SK(newsk)->accept_mode = DN_SK(sk)->accept_mode;
-
- if (DN_SK(newsk)->segsize_rem < 230)
- DN_SK(newsk)->segsize_rem = 230;
-
- if ((DN_SK(newsk)->services_rem & NSP_FC_MASK) == NSP_FC_NONE)
- DN_SK(newsk)->max_window = decnet_no_fc_max_cwnd;
-
- newsk->sk_state = TCP_LISTEN;
- memcpy(&(DN_SK(newsk)->addr), &(DN_SK(sk)->addr), sizeof(struct sockaddr_dn));
-
- /*
- * If we are listening on a wild socket, we don't want
- * the newly created socket on the wrong hash queue.
- */
- DN_SK(newsk)->addr.sdn_flags &= ~SDF_WILD;
-
- skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->addr), &type));
- skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->peer), &type));
- *(__le16 *)(DN_SK(newsk)->peer.sdn_add.a_addr) = cb->src;
- *(__le16 *)(DN_SK(newsk)->addr.sdn_add.a_addr) = cb->dst;
-
- menuver = *skb->data;
- skb_pull(skb, 1);
-
- if (menuver & DN_MENUVER_ACC)
- dn_access_copy(skb, &(DN_SK(newsk)->accessdata));
-
- if (menuver & DN_MENUVER_USR)
- dn_user_copy(skb, &(DN_SK(newsk)->conndata_in));
-
- if (menuver & DN_MENUVER_PRX)
- DN_SK(newsk)->peer.sdn_flags |= SDF_PROXY;
-
- if (menuver & DN_MENUVER_UIC)
- DN_SK(newsk)->peer.sdn_flags |= SDF_UICPROXY;
-
- kfree_skb(skb);
-
- memcpy(&(DN_SK(newsk)->conndata_out), &(DN_SK(sk)->conndata_out),
- sizeof(struct optdata_dn));
- memcpy(&(DN_SK(newsk)->discdata_out), &(DN_SK(sk)->discdata_out),
- sizeof(struct optdata_dn));
-
- lock_sock(newsk);
- err = dn_hash_sock(newsk);
- if (err == 0) {
- sock_reset_flag(newsk, SOCK_ZAPPED);
- dn_send_conn_ack(newsk);
-
- /*
- * Here we use sk->sk_allocation since although the conn conf is
- * for the newsk, the context is the old socket.
- */
- if (DN_SK(newsk)->accept_mode == ACC_IMMED)
- err = dn_confirm_accept(newsk, &timeo,
- sk->sk_allocation);
- }
- release_sock(newsk);
- return err;
-}
-
-
-static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int peer)
-{
- struct sockaddr_dn *sa = (struct sockaddr_dn *)uaddr;
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
-
- lock_sock(sk);
-
- if (peer) {
- if ((sock->state != SS_CONNECTED &&
- sock->state != SS_CONNECTING) &&
- scp->accept_mode == ACC_IMMED) {
- release_sock(sk);
- return -ENOTCONN;
- }
-
- memcpy(sa, &scp->peer, sizeof(struct sockaddr_dn));
- } else {
- memcpy(sa, &scp->addr, sizeof(struct sockaddr_dn));
- }
-
- release_sock(sk);
-
- return sizeof(struct sockaddr_dn);
-}
-
-
-static __poll_t dn_poll(struct file *file, struct socket *sock, poll_table *wait)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- __poll_t mask = datagram_poll(file, sock, wait);
-
- if (!skb_queue_empty(&scp->other_receive_queue))
- mask |= EPOLLRDBAND;
-
- return mask;
-}
-
-static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- int err = -EOPNOTSUPP;
- long amount = 0;
- struct sk_buff *skb;
- int val;
-
- switch(cmd)
- {
- case SIOCGIFADDR:
- case SIOCSIFADDR:
- return dn_dev_ioctl(cmd, (void __user *)arg);
-
- case SIOCATMARK:
- lock_sock(sk);
- val = !skb_queue_empty(&scp->other_receive_queue);
- if (scp->state != DN_RUN)
- val = -ENOTCONN;
- release_sock(sk);
- return val;
-
- case TIOCOUTQ:
- amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk);
- if (amount < 0)
- amount = 0;
- err = put_user(amount, (int __user *)arg);
- break;
-
- case TIOCINQ:
- lock_sock(sk);
- skb = skb_peek(&scp->other_receive_queue);
- if (skb) {
- amount = skb->len;
- } else {
- skb_queue_walk(&sk->sk_receive_queue, skb)
- amount += skb->len;
- }
- release_sock(sk);
- err = put_user(amount, (int __user *)arg);
- break;
-
- default:
- err = -ENOIOCTLCMD;
- break;
- }
-
- return err;
-}
-
-static int dn_listen(struct socket *sock, int backlog)
-{
- struct sock *sk = sock->sk;
- int err = -EINVAL;
-
- lock_sock(sk);
-
- if (sock_flag(sk, SOCK_ZAPPED))
- goto out;
-
- if ((DN_SK(sk)->state != DN_O) || (sk->sk_state == TCP_LISTEN))
- goto out;
-
- sk->sk_max_ack_backlog = backlog;
- sk->sk_ack_backlog = 0;
- sk->sk_state = TCP_LISTEN;
- err = 0;
- dn_rehash_sock(sk);
-
-out:
- release_sock(sk);
-
- return err;
-}
-
-
-static int dn_shutdown(struct socket *sock, int how)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- int err = -ENOTCONN;
-
- lock_sock(sk);
-
- if (sock->state == SS_UNCONNECTED)
- goto out;
-
- err = 0;
- if (sock->state == SS_DISCONNECTING)
- goto out;
-
- err = -EINVAL;
- if (scp->state == DN_O)
- goto out;
-
- if (how != SHUT_RDWR)
- goto out;
-
- sk->sk_shutdown = SHUTDOWN_MASK;
- dn_destroy_sock(sk);
- err = 0;
-
-out:
- release_sock(sk);
-
- return err;
-}
-
-static int dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
-{
- struct sock *sk = sock->sk;
- int err;
-
- lock_sock(sk);
- err = __dn_setsockopt(sock, level, optname, optval, optlen, 0);
- release_sock(sk);
-#ifdef CONFIG_NETFILTER
- /* we need to exclude all possible ENOPROTOOPTs except default case */
- if (err == -ENOPROTOOPT && optname != DSO_LINKINFO &&
- optname != DSO_STREAM && optname != DSO_SEQPACKET)
- err = nf_setsockopt(sk, PF_DECnet, optname, optval, optlen);
-#endif
-
- return err;
-}
-
-static int __dn_setsockopt(struct socket *sock, int level,int optname, char __user *optval, unsigned int optlen, int flags)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- long timeo;
- union {
- struct optdata_dn opt;
- struct accessdata_dn acc;
- int mode;
- unsigned long win;
- int val;
- unsigned char services;
- unsigned char info;
- } u;
- int err;
-
- if (optlen && !optval)
- return -EINVAL;
-
- if (optlen > sizeof(u))
- return -EINVAL;
-
- if (copy_from_user(&u, optval, optlen))
- return -EFAULT;
-
- switch (optname) {
- case DSO_CONDATA:
- if (sock->state == SS_CONNECTED)
- return -EISCONN;
- if ((scp->state != DN_O) && (scp->state != DN_CR))
- return -EINVAL;
-
- if (optlen != sizeof(struct optdata_dn))
- return -EINVAL;
-
- if (le16_to_cpu(u.opt.opt_optl) > 16)
- return -EINVAL;
-
- memcpy(&scp->conndata_out, &u.opt, optlen);
- break;
-
- case DSO_DISDATA:
- if (sock->state != SS_CONNECTED &&
- scp->accept_mode == ACC_IMMED)
- return -ENOTCONN;
-
- if (optlen != sizeof(struct optdata_dn))
- return -EINVAL;
-
- if (le16_to_cpu(u.opt.opt_optl) > 16)
- return -EINVAL;
-
- memcpy(&scp->discdata_out, &u.opt, optlen);
- break;
-
- case DSO_CONACCESS:
- if (sock->state == SS_CONNECTED)
- return -EISCONN;
- if (scp->state != DN_O)
- return -EINVAL;
-
- if (optlen != sizeof(struct accessdata_dn))
- return -EINVAL;
-
- if ((u.acc.acc_accl > DN_MAXACCL) ||
- (u.acc.acc_passl > DN_MAXACCL) ||
- (u.acc.acc_userl > DN_MAXACCL))
- return -EINVAL;
-
- memcpy(&scp->accessdata, &u.acc, optlen);
- break;
-
- case DSO_ACCEPTMODE:
- if (sock->state == SS_CONNECTED)
- return -EISCONN;
- if (scp->state != DN_O)
- return -EINVAL;
-
- if (optlen != sizeof(int))
- return -EINVAL;
-
- if ((u.mode != ACC_IMMED) && (u.mode != ACC_DEFER))
- return -EINVAL;
-
- scp->accept_mode = (unsigned char)u.mode;
- break;
-
- case DSO_CONACCEPT:
- if (scp->state != DN_CR)
- return -EINVAL;
- timeo = sock_rcvtimeo(sk, 0);
- err = dn_confirm_accept(sk, &timeo, sk->sk_allocation);
- return err;
-
- case DSO_CONREJECT:
- if (scp->state != DN_CR)
- return -EINVAL;
-
- scp->state = DN_DR;
- sk->sk_shutdown = SHUTDOWN_MASK;
- dn_nsp_send_disc(sk, 0x38, 0, sk->sk_allocation);
- break;
-
- case DSO_MAXWINDOW:
- if (optlen != sizeof(unsigned long))
- return -EINVAL;
- if (u.win > NSP_MAX_WINDOW)
- u.win = NSP_MAX_WINDOW;
- if (u.win == 0)
- return -EINVAL;
- scp->max_window = u.win;
- if (scp->snd_window > u.win)
- scp->snd_window = u.win;
- break;
-
- case DSO_NODELAY:
- if (optlen != sizeof(int))
- return -EINVAL;
- if (scp->nonagle == TCP_NAGLE_CORK)
- return -EINVAL;
- scp->nonagle = (u.val == 0) ? 0 : TCP_NAGLE_OFF;
- /* if (scp->nonagle == 1) { Push pending frames } */
- break;
-
- case DSO_CORK:
- if (optlen != sizeof(int))
- return -EINVAL;
- if (scp->nonagle == TCP_NAGLE_OFF)
- return -EINVAL;
- scp->nonagle = (u.val == 0) ? 0 : TCP_NAGLE_CORK;
- /* if (scp->nonagle == 0) { Push pending frames } */
- break;
-
- case DSO_SERVICES:
- if (optlen != sizeof(unsigned char))
- return -EINVAL;
- if ((u.services & ~NSP_FC_MASK) != 0x01)
- return -EINVAL;
- if ((u.services & NSP_FC_MASK) == NSP_FC_MASK)
- return -EINVAL;
- scp->services_loc = u.services;
- break;
-
- case DSO_INFO:
- if (optlen != sizeof(unsigned char))
- return -EINVAL;
- if (u.info & 0xfc)
- return -EINVAL;
- scp->info_loc = u.info;
- break;
-
- case DSO_LINKINFO:
- case DSO_STREAM:
- case DSO_SEQPACKET:
- default:
- return -ENOPROTOOPT;
- }
-
- return 0;
-}
-
-static int dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
-{
- struct sock *sk = sock->sk;
- int err;
-
- lock_sock(sk);
- err = __dn_getsockopt(sock, level, optname, optval, optlen, 0);
- release_sock(sk);
-#ifdef CONFIG_NETFILTER
- if (err == -ENOPROTOOPT && optname != DSO_STREAM &&
- optname != DSO_SEQPACKET && optname != DSO_CONACCEPT &&
- optname != DSO_CONREJECT) {
- int len;
-
- if (get_user(len, optlen))
- return -EFAULT;
-
- err = nf_getsockopt(sk, PF_DECnet, optname, optval, &len);
- if (err >= 0)
- err = put_user(len, optlen);
- }
-#endif
-
- return err;
-}
-
-static int __dn_getsockopt(struct socket *sock, int level,int optname, char __user *optval,int __user *optlen, int flags)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- struct linkinfo_dn link;
- unsigned int r_len;
- void *r_data = NULL;
- unsigned int val;
-
- if(get_user(r_len , optlen))
- return -EFAULT;
-
- switch (optname) {
- case DSO_CONDATA:
- if (r_len > sizeof(struct optdata_dn))
- r_len = sizeof(struct optdata_dn);
- r_data = &scp->conndata_in;
- break;
-
- case DSO_DISDATA:
- if (r_len > sizeof(struct optdata_dn))
- r_len = sizeof(struct optdata_dn);
- r_data = &scp->discdata_in;
- break;
-
- case DSO_CONACCESS:
- if (r_len > sizeof(struct accessdata_dn))
- r_len = sizeof(struct accessdata_dn);
- r_data = &scp->accessdata;
- break;
-
- case DSO_ACCEPTMODE:
- if (r_len > sizeof(unsigned char))
- r_len = sizeof(unsigned char);
- r_data = &scp->accept_mode;
- break;
-
- case DSO_LINKINFO:
- if (r_len > sizeof(struct linkinfo_dn))
- r_len = sizeof(struct linkinfo_dn);
-
- memset(&link, 0, sizeof(link));
-
- switch (sock->state) {
- case SS_CONNECTING:
- link.idn_linkstate = LL_CONNECTING;
- break;
- case SS_DISCONNECTING:
- link.idn_linkstate = LL_DISCONNECTING;
- break;
- case SS_CONNECTED:
- link.idn_linkstate = LL_RUNNING;
- break;
- default:
- link.idn_linkstate = LL_INACTIVE;
- }
-
- link.idn_segsize = scp->segsize_rem;
- r_data = &link;
- break;
-
- case DSO_MAXWINDOW:
- if (r_len > sizeof(unsigned long))
- r_len = sizeof(unsigned long);
- r_data = &scp->max_window;
- break;
-
- case DSO_NODELAY:
- if (r_len > sizeof(int))
- r_len = sizeof(int);
- val = (scp->nonagle == TCP_NAGLE_OFF);
- r_data = &val;
- break;
-
- case DSO_CORK:
- if (r_len > sizeof(int))
- r_len = sizeof(int);
- val = (scp->nonagle == TCP_NAGLE_CORK);
- r_data = &val;
- break;
-
- case DSO_SERVICES:
- if (r_len > sizeof(unsigned char))
- r_len = sizeof(unsigned char);
- r_data = &scp->services_rem;
- break;
-
- case DSO_INFO:
- if (r_len > sizeof(unsigned char))
- r_len = sizeof(unsigned char);
- r_data = &scp->info_rem;
- break;
-
- case DSO_STREAM:
- case DSO_SEQPACKET:
- case DSO_CONACCEPT:
- case DSO_CONREJECT:
- default:
- return -ENOPROTOOPT;
- }
-
- if (r_data) {
- if (copy_to_user(optval, r_data, r_len))
- return -EFAULT;
- if (put_user(r_len, optlen))
- return -EFAULT;
- }
-
- return 0;
-}
-
-
-static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int target)
-{
- struct sk_buff *skb;
- int len = 0;
-
- if (flags & MSG_OOB)
- return !skb_queue_empty(q) ? 1 : 0;
-
- skb_queue_walk(q, skb) {
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- len += skb->len;
-
- if (cb->nsp_flags & 0x40) {
- /* SOCK_SEQPACKET reads to EOM */
- if (sk->sk_type == SOCK_SEQPACKET)
- return 1;
- /* so does SOCK_STREAM unless WAITALL is specified */
- if (!(flags & MSG_WAITALL))
- return 1;
- }
-
- /* minimum data length for read exceeded */
- if (len >= target)
- return 1;
- }
-
- return 0;
-}
-
-
-static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
- int flags)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- struct sk_buff_head *queue = &sk->sk_receive_queue;
- size_t target = size > 1 ? 1 : 0;
- size_t copied = 0;
- int rv = 0;
- struct sk_buff *skb, *n;
- struct dn_skb_cb *cb = NULL;
- unsigned char eor = 0;
- long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
-
- lock_sock(sk);
-
- if (sock_flag(sk, SOCK_ZAPPED)) {
- rv = -EADDRNOTAVAIL;
- goto out;
- }
-
- if (sk->sk_shutdown & RCV_SHUTDOWN) {
- rv = 0;
- goto out;
- }
-
- rv = dn_check_state(sk, NULL, 0, &timeo, flags);
- if (rv)
- goto out;
-
- if (flags & ~(MSG_CMSG_COMPAT|MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) {
- rv = -EOPNOTSUPP;
- goto out;
- }
-
- if (flags & MSG_OOB)
- queue = &scp->other_receive_queue;
-
- if (flags & MSG_WAITALL)
- target = size;
-
-
- /*
- * See if there is data ready to read, sleep if there isn't
- */
- for(;;) {
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
- if (sk->sk_err)
- goto out;
-
- if (!skb_queue_empty(&scp->other_receive_queue)) {
- if (!(flags & MSG_OOB)) {
- msg->msg_flags |= MSG_OOB;
- if (!scp->other_report) {
- scp->other_report = 1;
- goto out;
- }
- }
- }
-
- if (scp->state != DN_RUN)
- goto out;
-
- if (signal_pending(current)) {
- rv = sock_intr_errno(timeo);
- goto out;
- }
-
- if (dn_data_ready(sk, queue, flags, target))
- break;
-
- if (flags & MSG_DONTWAIT) {
- rv = -EWOULDBLOCK;
- goto out;
- }
-
- add_wait_queue(sk_sleep(sk), &wait);
- sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target), &wait);
- sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- remove_wait_queue(sk_sleep(sk), &wait);
- }
-
- skb_queue_walk_safe(queue, skb, n) {
- unsigned int chunk = skb->len;
- cb = DN_SKB_CB(skb);
-
- if ((chunk + copied) > size)
- chunk = size - copied;
-
- if (memcpy_to_msg(msg, skb->data, chunk)) {
- rv = -EFAULT;
- break;
- }
- copied += chunk;
-
- if (!(flags & MSG_PEEK))
- skb_pull(skb, chunk);
-
- eor = cb->nsp_flags & 0x40;
-
- if (skb->len == 0) {
- skb_unlink(skb, queue);
- kfree_skb(skb);
- /*
- * N.B. Don't refer to skb or cb after this point
- * in loop.
- */
- if ((scp->flowloc_sw == DN_DONTSEND) && !dn_congested(sk)) {
- scp->flowloc_sw = DN_SEND;
- dn_nsp_send_link(sk, DN_SEND, 0);
- }
- }
-
- if (eor) {
- if (sk->sk_type == SOCK_SEQPACKET)
- break;
- if (!(flags & MSG_WAITALL))
- break;
- }
-
- if (flags & MSG_OOB)
- break;
-
- if (copied >= target)
- break;
- }
-
- rv = copied;
-
-
- if (eor && (sk->sk_type == SOCK_SEQPACKET))
- msg->msg_flags |= MSG_EOR;
-
-out:
- if (rv == 0)
- rv = (flags & MSG_PEEK) ? -sk->sk_err : sock_error(sk);
-
- if ((rv >= 0) && msg->msg_name) {
- __sockaddr_check_size(sizeof(struct sockaddr_dn));
- memcpy(msg->msg_name, &scp->peer, sizeof(struct sockaddr_dn));
- msg->msg_namelen = sizeof(struct sockaddr_dn);
- }
-
- release_sock(sk);
-
- return rv;
-}
-
-
-static inline int dn_queue_too_long(struct dn_scp *scp, struct sk_buff_head *queue, int flags)
-{
- unsigned char fctype = scp->services_rem & NSP_FC_MASK;
- if (skb_queue_len(queue) >= scp->snd_window)
- return 1;
- if (fctype != NSP_FC_NONE) {
- if (flags & MSG_OOB) {
- if (scp->flowrem_oth == 0)
- return 1;
- } else {
- if (scp->flowrem_dat == 0)
- return 1;
- }
- }
- return 0;
-}
-
-/*
- * The DECnet spec requires that the "routing layer" accepts packets which
- * are at least 230 bytes in size. This excludes any headers which the NSP
- * layer might add, so we always assume that we'll be using the maximal
- * length header on data packets. The variation in length is due to the
- * inclusion (or not) of the two 16 bit acknowledgement fields so it doesn't
- * make much practical difference.
- */
-unsigned int dn_mss_from_pmtu(struct net_device *dev, int mtu)
-{
- unsigned int mss = 230 - DN_MAX_NSP_DATA_HEADER;
- if (dev) {
- struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
- mtu -= LL_RESERVED_SPACE(dev);
- if (dn_db->use_long)
- mtu -= 21;
- else
- mtu -= 6;
- mtu -= DN_MAX_NSP_DATA_HEADER;
- } else {
- /*
- * 21 = long header, 16 = guess at MAC header length
- */
- mtu -= (21 + DN_MAX_NSP_DATA_HEADER + 16);
- }
- if (mtu > mss)
- mss = mtu;
- return mss;
-}
-
-static inline unsigned int dn_current_mss(struct sock *sk, int flags)
-{
- struct dst_entry *dst = __sk_dst_get(sk);
- struct dn_scp *scp = DN_SK(sk);
- int mss_now = min_t(int, scp->segsize_loc, scp->segsize_rem);
-
- /* Other data messages are limited to 16 bytes per packet */
- if (flags & MSG_OOB)
- return 16;
-
- /* This works out the maximum size of segment we can send out */
- if (dst) {
- u32 mtu = dst_mtu(dst);
- mss_now = min_t(int, dn_mss_from_pmtu(dst->dev, mtu), mss_now);
- }
-
- return mss_now;
-}
-
-/*
- * N.B. We get the timeout wrong here, but then we always did get it
- * wrong before and this is another step along the road to correcting
- * it. It ought to get updated each time we pass through the routine,
- * but in practise it probably doesn't matter too much for now.
- */
-static inline struct sk_buff *dn_alloc_send_pskb(struct sock *sk,
- unsigned long datalen, int noblock,
- int *errcode)
-{
- struct sk_buff *skb = sock_alloc_send_skb(sk, datalen,
- noblock, errcode);
- if (skb) {
- skb->protocol = htons(ETH_P_DNA_RT);
- skb->pkt_type = PACKET_OUTGOING;
- }
- return skb;
-}
-
-static int dn_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
-{
- struct sock *sk = sock->sk;
- struct dn_scp *scp = DN_SK(sk);
- size_t mss;
- struct sk_buff_head *queue = &scp->data_xmit_queue;
- int flags = msg->msg_flags;
- int err = 0;
- size_t sent = 0;
- int addr_len = msg->msg_namelen;
- DECLARE_SOCKADDR(struct sockaddr_dn *, addr, msg->msg_name);
- struct sk_buff *skb = NULL;
- struct dn_skb_cb *cb;
- size_t len;
- unsigned char fctype;
- long timeo;
-
- if (flags & ~(MSG_TRYHARD|MSG_OOB|MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|MSG_MORE|MSG_CMSG_COMPAT))
- return -EOPNOTSUPP;
-
- if (addr_len && (addr_len != sizeof(struct sockaddr_dn)))
- return -EINVAL;
-
- lock_sock(sk);
- timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
- /*
- * The only difference between stream sockets and sequenced packet
- * sockets is that the stream sockets always behave as if MSG_EOR
- * has been set.
- */
- if (sock->type == SOCK_STREAM) {
- if (flags & MSG_EOR) {
- err = -EINVAL;
- goto out;
- }
- flags |= MSG_EOR;
- }
-
-
- err = dn_check_state(sk, addr, addr_len, &timeo, flags);
- if (err)
- goto out_err;
-
- if (sk->sk_shutdown & SEND_SHUTDOWN) {
- err = -EPIPE;
- if (!(flags & MSG_NOSIGNAL))
- send_sig(SIGPIPE, current, 0);
- goto out_err;
- }
-
- if ((flags & MSG_TRYHARD) && sk->sk_dst_cache)
- dst_negative_advice(sk);
-
- mss = scp->segsize_rem;
- fctype = scp->services_rem & NSP_FC_MASK;
-
- mss = dn_current_mss(sk, flags);
-
- if (flags & MSG_OOB) {
- queue = &scp->other_xmit_queue;
- if (size > mss) {
- err = -EMSGSIZE;
- goto out;
- }
- }
-
- scp->persist_fxn = dn_nsp_xmit_timeout;
-
- while(sent < size) {
- err = sock_error(sk);
- if (err)
- goto out;
-
- if (signal_pending(current)) {
- err = sock_intr_errno(timeo);
- goto out;
- }
-
- /*
- * Calculate size that we wish to send.
- */
- len = size - sent;
-
- if (len > mss)
- len = mss;
-
- /*
- * Wait for queue size to go down below the window
- * size.
- */
- if (dn_queue_too_long(scp, queue, flags)) {
- DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
- if (flags & MSG_DONTWAIT) {
- err = -EWOULDBLOCK;
- goto out;
- }
-
- add_wait_queue(sk_sleep(sk), &wait);
- sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- sk_wait_event(sk, &timeo,
- !dn_queue_too_long(scp, queue, flags), &wait);
- sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
- remove_wait_queue(sk_sleep(sk), &wait);
- continue;
- }
-
- /*
- * Get a suitably sized skb.
- * 64 is a bit of a hack really, but its larger than any
- * link-layer headers and has served us well as a good
- * guess as to their real length.
- */
- skb = dn_alloc_send_pskb(sk, len + 64 + DN_MAX_NSP_DATA_HEADER,
- flags & MSG_DONTWAIT, &err);
-
- if (err)
- break;
-
- if (!skb)
- continue;
-
- cb = DN_SKB_CB(skb);
-
- skb_reserve(skb, 64 + DN_MAX_NSP_DATA_HEADER);
-
- if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
- err = -EFAULT;
- goto out;
- }
-
- if (flags & MSG_OOB) {
- cb->nsp_flags = 0x30;
- if (fctype != NSP_FC_NONE)
- scp->flowrem_oth--;
- } else {
- cb->nsp_flags = 0x00;
- if (scp->seg_total == 0)
- cb->nsp_flags |= 0x20;
-
- scp->seg_total += len;
-
- if (((sent + len) == size) && (flags & MSG_EOR)) {
- cb->nsp_flags |= 0x40;
- scp->seg_total = 0;
- if (fctype == NSP_FC_SCMC)
- scp->flowrem_dat--;
- }
- if (fctype == NSP_FC_SRC)
- scp->flowrem_dat--;
- }
-
- sent += len;
- dn_nsp_queue_xmit(sk, skb, sk->sk_allocation, flags & MSG_OOB);
- skb = NULL;
-
- scp->persist = dn_nsp_persist(sk);
-
- }
-out:
-
- kfree_skb(skb);
-
- release_sock(sk);
-
- return sent ? sent : err;
-
-out_err:
- err = sk_stream_error(sk, flags, err);
- release_sock(sk);
- return err;
-}
-
-static int dn_device_event(struct notifier_block *this, unsigned long event,
- void *ptr)
-{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-
- if (!net_eq(dev_net(dev), &init_net))
- return NOTIFY_DONE;
-
- switch (event) {
- case NETDEV_UP:
- dn_dev_up(dev);
- break;
- case NETDEV_DOWN:
- dn_dev_down(dev);
- break;
- default:
- break;
- }
-
- return NOTIFY_DONE;
-}
-
-static struct notifier_block dn_dev_notifier = {
- .notifier_call = dn_device_event,
-};
-
-static struct packet_type dn_dix_packet_type __read_mostly = {
- .type = cpu_to_be16(ETH_P_DNA_RT),
- .func = dn_route_rcv,
-};
-
-#ifdef CONFIG_PROC_FS
-struct dn_iter_state {
- int bucket;
-};
-
-static struct sock *dn_socket_get_first(struct seq_file *seq)
-{
- struct dn_iter_state *state = seq->private;
- struct sock *n = NULL;
-
- for(state->bucket = 0;
- state->bucket < DN_SK_HASH_SIZE;
- ++state->bucket) {
- n = sk_head(&dn_sk_hash[state->bucket]);
- if (n)
- break;
- }
-
- return n;
-}
-
-static struct sock *dn_socket_get_next(struct seq_file *seq,
- struct sock *n)
-{
- struct dn_iter_state *state = seq->private;
-
- n = sk_next(n);
-try_again:
- if (n)
- goto out;
- if (++state->bucket >= DN_SK_HASH_SIZE)
- goto out;
- n = sk_head(&dn_sk_hash[state->bucket]);
- goto try_again;
-out:
- return n;
-}
-
-static struct sock *socket_get_idx(struct seq_file *seq, loff_t *pos)
-{
- struct sock *sk = dn_socket_get_first(seq);
-
- if (sk) {
- while(*pos && (sk = dn_socket_get_next(seq, sk)))
- --*pos;
- }
- return *pos ? NULL : sk;
-}
-
-static void *dn_socket_get_idx(struct seq_file *seq, loff_t pos)
-{
- void *rc;
- read_lock_bh(&dn_hash_lock);
- rc = socket_get_idx(seq, &pos);
- if (!rc) {
- read_unlock_bh(&dn_hash_lock);
- }
- return rc;
-}
-
-static void *dn_socket_seq_start(struct seq_file *seq, loff_t *pos)
-{
- return *pos ? dn_socket_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
-}
-
-static void *dn_socket_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- void *rc;
-
- if (v == SEQ_START_TOKEN) {
- rc = dn_socket_get_idx(seq, 0);
- goto out;
- }
-
- rc = dn_socket_get_next(seq, v);
- if (rc)
- goto out;
- read_unlock_bh(&dn_hash_lock);
-out:
- ++*pos;
- return rc;
-}
-
-static void dn_socket_seq_stop(struct seq_file *seq, void *v)
-{
- if (v && v != SEQ_START_TOKEN)
- read_unlock_bh(&dn_hash_lock);
-}
-
-#define IS_NOT_PRINTABLE(x) ((x) < 32 || (x) > 126)
-
-static void dn_printable_object(struct sockaddr_dn *dn, unsigned char *buf)
-{
- int i;
-
- switch (le16_to_cpu(dn->sdn_objnamel)) {
- case 0:
- sprintf(buf, "%d", dn->sdn_objnum);
- break;
- default:
- for (i = 0; i < le16_to_cpu(dn->sdn_objnamel); i++) {
- buf[i] = dn->sdn_objname[i];
- if (IS_NOT_PRINTABLE(buf[i]))
- buf[i] = '.';
- }
- buf[i] = 0;
- }
-}
-
-static char *dn_state2asc(unsigned char state)
-{
- switch (state) {
- case DN_O:
- return "OPEN";
- case DN_CR:
- return " CR";
- case DN_DR:
- return " DR";
- case DN_DRC:
- return " DRC";
- case DN_CC:
- return " CC";
- case DN_CI:
- return " CI";
- case DN_NR:
- return " NR";
- case DN_NC:
- return " NC";
- case DN_CD:
- return " CD";
- case DN_RJ:
- return " RJ";
- case DN_RUN:
- return " RUN";
- case DN_DI:
- return " DI";
- case DN_DIC:
- return " DIC";
- case DN_DN:
- return " DN";
- case DN_CL:
- return " CL";
- case DN_CN:
- return " CN";
- }
-
- return "????";
-}
-
-static inline void dn_socket_format_entry(struct seq_file *seq, struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
- char buf1[DN_ASCBUF_LEN];
- char buf2[DN_ASCBUF_LEN];
- char local_object[DN_MAXOBJL+3];
- char remote_object[DN_MAXOBJL+3];
-
- dn_printable_object(&scp->addr, local_object);
- dn_printable_object(&scp->peer, remote_object);
-
- seq_printf(seq,
- "%6s/%04X %04d:%04d %04d:%04d %01d %-16s "
- "%6s/%04X %04d:%04d %04d:%04d %01d %-16s %4s %s\n",
- dn_addr2asc(le16_to_cpu(dn_saddr2dn(&scp->addr)), buf1),
- scp->addrloc,
- scp->numdat,
- scp->numoth,
- scp->ackxmt_dat,
- scp->ackxmt_oth,
- scp->flowloc_sw,
- local_object,
- dn_addr2asc(le16_to_cpu(dn_saddr2dn(&scp->peer)), buf2),
- scp->addrrem,
- scp->numdat_rcv,
- scp->numoth_rcv,
- scp->ackrcv_dat,
- scp->ackrcv_oth,
- scp->flowrem_sw,
- remote_object,
- dn_state2asc(scp->state),
- ((scp->accept_mode == ACC_IMMED) ? "IMMED" : "DEFER"));
-}
-
-static int dn_socket_seq_show(struct seq_file *seq, void *v)
-{
- if (v == SEQ_START_TOKEN) {
- seq_puts(seq, "Local Remote\n");
- } else {
- dn_socket_format_entry(seq, v);
- }
- return 0;
-}
-
-static const struct seq_operations dn_socket_seq_ops = {
- .start = dn_socket_seq_start,
- .next = dn_socket_seq_next,
- .stop = dn_socket_seq_stop,
- .show = dn_socket_seq_show,
-};
-#endif
-
-static const struct net_proto_family dn_family_ops = {
- .family = AF_DECnet,
- .create = dn_create,
- .owner = THIS_MODULE,
-};
-
-static const struct proto_ops dn_proto_ops = {
- .family = AF_DECnet,
- .owner = THIS_MODULE,
- .release = dn_release,
- .bind = dn_bind,
- .connect = dn_connect,
- .socketpair = sock_no_socketpair,
- .accept = dn_accept,
- .getname = dn_getname,
- .poll = dn_poll,
- .ioctl = dn_ioctl,
- .listen = dn_listen,
- .shutdown = dn_shutdown,
- .setsockopt = dn_setsockopt,
- .getsockopt = dn_getsockopt,
- .sendmsg = dn_sendmsg,
- .recvmsg = dn_recvmsg,
- .mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
-};
-
-MODULE_DESCRIPTION("The Linux DECnet Network Protocol");
-MODULE_AUTHOR("Linux DECnet Project Team");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NETPROTO(PF_DECnet);
-
-static const char banner[] __initconst = KERN_INFO
-"NET4: DECnet for Linux: V.2.5.68s (C) 1995-2003 Linux DECnet Project Team\n";
-
-static int __init decnet_init(void)
-{
- int rc;
-
- printk(banner);
-
- rc = proto_register(&dn_proto, 1);
- if (rc != 0)
- goto out;
-
- dn_neigh_init();
- dn_dev_init();
- dn_route_init();
- dn_fib_init();
-
- sock_register(&dn_family_ops);
- dev_add_pack(&dn_dix_packet_type);
- register_netdevice_notifier(&dn_dev_notifier);
-
- proc_create_seq_private("decnet", 0444, init_net.proc_net,
- &dn_socket_seq_ops, sizeof(struct dn_iter_state),
- NULL);
- dn_register_sysctl();
-out:
- return rc;
-
-}
-module_init(decnet_init);
-
-/*
- * Prevent DECnet module unloading until its fixed properly.
- * Requires an audit of the code to check for memory leaks and
- * initialisation problems etc.
- */
-#if 0
-static void __exit decnet_exit(void)
-{
- sock_unregister(AF_DECnet);
- rtnl_unregister_all(PF_DECnet);
- dev_remove_pack(&dn_dix_packet_type);
-
- dn_unregister_sysctl();
-
- unregister_netdevice_notifier(&dn_dev_notifier);
-
- dn_route_cleanup();
- dn_dev_cleanup();
- dn_neigh_cleanup();
- dn_fib_cleanup();
-
- remove_proc_entry("decnet", init_net.proc_net);
-
- proto_unregister(&dn_proto);
-
- rcu_barrier_bh(); /* Wait for completion of call_rcu_bh()'s */
-}
-module_exit(decnet_exit);
-#endif
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
deleted file mode 100644
index bfd43e8f2c06..000000000000
--- a/net/decnet/dn_dev.c
+++ /dev/null
@@ -1,1438 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Device Layer
- *
- * Authors: Steve Whitehouse <SteveW@ACM.org>
- * Eduardo Marcelo Serrat <emserrat@geocities.com>
- *
- * Changes:
- * Steve Whitehouse : Devices now see incoming frames so they
- * can mark on who it came from.
- * Steve Whitehouse : Fixed bug in creating neighbours. Each neighbour
- * can now have a device specific setup func.
- * Steve Whitehouse : Added /proc/sys/net/decnet/conf/<dev>/
- * Steve Whitehouse : Fixed bug which sometimes killed timer
- * Steve Whitehouse : Multiple ifaddr support
- * Steve Whitehouse : SIOCGIFCONF is now a compile time option
- * Steve Whitehouse : /proc/sys/net/decnet/conf/<sys>/forwarding
- * Steve Whitehouse : Removed timer1 - it's a user space issue now
- * Patrick Caulfield : Fixed router hello message format
- * Steve Whitehouse : Got rid of constant sizes for blksize for
- * devices. All mtu based now.
- */
-
-#include <linux/capability.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/net.h>
-#include <linux/netdevice.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/timer.h>
-#include <linux/string.h>
-#include <linux/if_addr.h>
-#include <linux/if_arp.h>
-#include <linux/if_ether.h>
-#include <linux/skbuff.h>
-#include <linux/sysctl.h>
-#include <linux/notifier.h>
-#include <linux/slab.h>
-#include <linux/jiffies.h>
-#include <linux/uaccess.h>
-#include <net/net_namespace.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/flow.h>
-#include <net/fib_rules.h>
-#include <net/netlink.h>
-#include <net/dn.h>
-#include <net/dn_dev.h>
-#include <net/dn_route.h>
-#include <net/dn_neigh.h>
-#include <net/dn_fib.h>
-
-#define DN_IFREQ_SIZE (sizeof(struct ifreq) - sizeof(struct sockaddr) + sizeof(struct sockaddr_dn))
-
-static char dn_rt_all_end_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x04,0x00,0x00};
-static char dn_rt_all_rt_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x03,0x00,0x00};
-static char dn_hiord[ETH_ALEN] = {0xAA,0x00,0x04,0x00,0x00,0x00};
-static unsigned char dn_eco_version[3] = {0x02,0x00,0x00};
-
-extern struct neigh_table dn_neigh_table;
-
-/*
- * decnet_address is kept in network order.
- */
-__le16 decnet_address = 0;
-
-static DEFINE_SPINLOCK(dndev_lock);
-static struct net_device *decnet_default_device;
-static BLOCKING_NOTIFIER_HEAD(dnaddr_chain);
-
-static struct dn_dev *dn_dev_create(struct net_device *dev, int *err);
-static void dn_dev_delete(struct net_device *dev);
-static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa);
-
-static int dn_eth_up(struct net_device *);
-static void dn_eth_down(struct net_device *);
-static void dn_send_brd_hello(struct net_device *dev, struct dn_ifaddr *ifa);
-static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa);
-
-static struct dn_dev_parms dn_dev_list[] = {
-{
- .type = ARPHRD_ETHER, /* Ethernet */
- .mode = DN_DEV_BCAST,
- .state = DN_DEV_S_RU,
- .t2 = 1,
- .t3 = 10,
- .name = "ethernet",
- .up = dn_eth_up,
- .down = dn_eth_down,
- .timer3 = dn_send_brd_hello,
-},
-{
- .type = ARPHRD_IPGRE, /* DECnet tunneled over GRE in IP */
- .mode = DN_DEV_BCAST,
- .state = DN_DEV_S_RU,
- .t2 = 1,
- .t3 = 10,
- .name = "ipgre",
- .timer3 = dn_send_brd_hello,
-},
-#if 0
-{
- .type = ARPHRD_X25, /* Bog standard X.25 */
- .mode = DN_DEV_UCAST,
- .state = DN_DEV_S_DS,
- .t2 = 1,
- .t3 = 120,
- .name = "x25",
- .timer3 = dn_send_ptp_hello,
-},
-#endif
-#if 0
-{
- .type = ARPHRD_PPP, /* DECnet over PPP */
- .mode = DN_DEV_BCAST,
- .state = DN_DEV_S_RU,
- .t2 = 1,
- .t3 = 10,
- .name = "ppp",
- .timer3 = dn_send_brd_hello,
-},
-#endif
-{
- .type = ARPHRD_DDCMP, /* DECnet over DDCMP */
- .mode = DN_DEV_UCAST,
- .state = DN_DEV_S_DS,
- .t2 = 1,
- .t3 = 120,
- .name = "ddcmp",
- .timer3 = dn_send_ptp_hello,
-},
-{
- .type = ARPHRD_LOOPBACK, /* Loopback interface - always last */
- .mode = DN_DEV_BCAST,
- .state = DN_DEV_S_RU,
- .t2 = 1,
- .t3 = 10,
- .name = "loopback",
- .timer3 = dn_send_brd_hello,
-}
-};
-
-#define DN_DEV_LIST_SIZE ARRAY_SIZE(dn_dev_list)
-
-#define DN_DEV_PARMS_OFFSET(x) offsetof(struct dn_dev_parms, x)
-
-#ifdef CONFIG_SYSCTL
-
-static int min_t2[] = { 1 };
-static int max_t2[] = { 60 }; /* No max specified, but this seems sensible */
-static int min_t3[] = { 1 };
-static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MULT or T3MULT */
-
-static int min_priority[1];
-static int max_priority[] = { 127 }; /* From DECnet spec */
-
-static int dn_forwarding_proc(struct ctl_table *, int,
- void __user *, size_t *, loff_t *);
-static struct dn_dev_sysctl_table {
- struct ctl_table_header *sysctl_header;
- struct ctl_table dn_dev_vars[5];
-} dn_dev_sysctl = {
- NULL,
- {
- {
- .procname = "forwarding",
- .data = (void *)DN_DEV_PARMS_OFFSET(forwarding),
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = dn_forwarding_proc,
- },
- {
- .procname = "priority",
- .data = (void *)DN_DEV_PARMS_OFFSET(priority),
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_priority,
- .extra2 = &max_priority
- },
- {
- .procname = "t2",
- .data = (void *)DN_DEV_PARMS_OFFSET(t2),
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_t2,
- .extra2 = &max_t2
- },
- {
- .procname = "t3",
- .data = (void *)DN_DEV_PARMS_OFFSET(t3),
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_t3,
- .extra2 = &max_t3
- },
- { }
- },
-};
-
-static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms)
-{
- struct dn_dev_sysctl_table *t;
- int i;
-
- char path[sizeof("net/decnet/conf/") + IFNAMSIZ];
-
- t = kmemdup(&dn_dev_sysctl, sizeof(*t), GFP_KERNEL);
- if (t == NULL)
- return;
-
- for(i = 0; i < ARRAY_SIZE(t->dn_dev_vars) - 1; i++) {
- long offset = (long)t->dn_dev_vars[i].data;
- t->dn_dev_vars[i].data = ((char *)parms) + offset;
- }
-
- snprintf(path, sizeof(path), "net/decnet/conf/%s",
- dev? dev->name : parms->name);
-
- t->dn_dev_vars[0].extra1 = (void *)dev;
-
- t->sysctl_header = register_net_sysctl(&init_net, path, t->dn_dev_vars);
- if (t->sysctl_header == NULL)
- kfree(t);
- else
- parms->sysctl = t;
-}
-
-static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms)
-{
- if (parms->sysctl) {
- struct dn_dev_sysctl_table *t = parms->sysctl;
- parms->sysctl = NULL;
- unregister_net_sysctl_table(t->sysctl_header);
- kfree(t);
- }
-}
-
-static int dn_forwarding_proc(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
-#ifdef CONFIG_DECNET_ROUTER
- struct net_device *dev = table->extra1;
- struct dn_dev *dn_db;
- int err;
- int tmp, old;
-
- if (table->extra1 == NULL)
- return -EINVAL;
-
- dn_db = rcu_dereference_raw(dev->dn_ptr);
- old = dn_db->parms.forwarding;
-
- err = proc_dointvec(table, write, buffer, lenp, ppos);
-
- if ((err >= 0) && write) {
- if (dn_db->parms.forwarding < 0)
- dn_db->parms.forwarding = 0;
- if (dn_db->parms.forwarding > 2)
- dn_db->parms.forwarding = 2;
- /*
- * What an ugly hack this is... its works, just. It
- * would be nice if sysctl/proc were just that little
- * bit more flexible so I don't have to write a special
- * routine, or suffer hacks like this - SJW
- */
- tmp = dn_db->parms.forwarding;
- dn_db->parms.forwarding = old;
- if (dn_db->parms.down)
- dn_db->parms.down(dev);
- dn_db->parms.forwarding = tmp;
- if (dn_db->parms.up)
- dn_db->parms.up(dev);
- }
-
- return err;
-#else
- return -EINVAL;
-#endif
-}
-
-#else /* CONFIG_SYSCTL */
-static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms)
-{
-}
-static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms)
-{
-}
-
-#endif /* CONFIG_SYSCTL */
-
-static inline __u16 mtu2blksize(struct net_device *dev)
-{
- u32 blksize = dev->mtu;
- if (blksize > 0xffff)
- blksize = 0xffff;
-
- if (dev->type == ARPHRD_ETHER ||
- dev->type == ARPHRD_PPP ||
- dev->type == ARPHRD_IPGRE ||
- dev->type == ARPHRD_LOOPBACK)
- blksize -= 2;
-
- return (__u16)blksize;
-}
-
-static struct dn_ifaddr *dn_dev_alloc_ifa(void)
-{
- struct dn_ifaddr *ifa;
-
- ifa = kzalloc(sizeof(*ifa), GFP_KERNEL);
-
- return ifa;
-}
-
-static void dn_dev_free_ifa(struct dn_ifaddr *ifa)
-{
- kfree_rcu(ifa, rcu);
-}
-
-static void dn_dev_del_ifa(struct dn_dev *dn_db, struct dn_ifaddr __rcu **ifap, int destroy)
-{
- struct dn_ifaddr *ifa1 = rtnl_dereference(*ifap);
- unsigned char mac_addr[6];
- struct net_device *dev = dn_db->dev;
-
- ASSERT_RTNL();
-
- *ifap = ifa1->ifa_next;
-
- if (dn_db->dev->type == ARPHRD_ETHER) {
- if (ifa1->ifa_local != dn_eth2dn(dev->dev_addr)) {
- dn_dn2eth(mac_addr, ifa1->ifa_local);
- dev_mc_del(dev, mac_addr);
- }
- }
-
- dn_ifaddr_notify(RTM_DELADDR, ifa1);
- blocking_notifier_call_chain(&dnaddr_chain, NETDEV_DOWN, ifa1);
- if (destroy) {
- dn_dev_free_ifa(ifa1);
-
- if (dn_db->ifa_list == NULL)
- dn_dev_delete(dn_db->dev);
- }
-}
-
-static int dn_dev_insert_ifa(struct dn_dev *dn_db, struct dn_ifaddr *ifa)
-{
- struct net_device *dev = dn_db->dev;
- struct dn_ifaddr *ifa1;
- unsigned char mac_addr[6];
-
- ASSERT_RTNL();
-
- /* Check for duplicates */
- for (ifa1 = rtnl_dereference(dn_db->ifa_list);
- ifa1 != NULL;
- ifa1 = rtnl_dereference(ifa1->ifa_next)) {
- if (ifa1->ifa_local == ifa->ifa_local)
- return -EEXIST;
- }
-
- if (dev->type == ARPHRD_ETHER) {
- if (ifa->ifa_local != dn_eth2dn(dev->dev_addr)) {
- dn_dn2eth(mac_addr, ifa->ifa_local);
- dev_mc_add(dev, mac_addr);
- }
- }
-
- ifa->ifa_next = dn_db->ifa_list;
- rcu_assign_pointer(dn_db->ifa_list, ifa);
-
- dn_ifaddr_notify(RTM_NEWADDR, ifa);
- blocking_notifier_call_chain(&dnaddr_chain, NETDEV_UP, ifa);
-
- return 0;
-}
-
-static int dn_dev_set_ifa(struct net_device *dev, struct dn_ifaddr *ifa)
-{
- struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr);
- int rv;
-
- if (dn_db == NULL) {
- int err;
- dn_db = dn_dev_create(dev, &err);
- if (dn_db == NULL)
- return err;
- }
-
- ifa->ifa_dev = dn_db;
-
- if (dev->flags & IFF_LOOPBACK)
- ifa->ifa_scope = RT_SCOPE_HOST;
-
- rv = dn_dev_insert_ifa(dn_db, ifa);
- if (rv)
- dn_dev_free_ifa(ifa);
- return rv;
-}
-
-
-int dn_dev_ioctl(unsigned int cmd, void __user *arg)
-{
- char buffer[DN_IFREQ_SIZE];
- struct ifreq *ifr = (struct ifreq *)buffer;
- struct sockaddr_dn *sdn = (struct sockaddr_dn *)&ifr->ifr_addr;
- struct dn_dev *dn_db;
- struct net_device *dev;
- struct dn_ifaddr *ifa = NULL;
- struct dn_ifaddr __rcu **ifap = NULL;
- int ret = 0;
-
- if (copy_from_user(ifr, arg, DN_IFREQ_SIZE))
- return -EFAULT;
- ifr->ifr_name[IFNAMSIZ-1] = 0;
-
- dev_load(&init_net, ifr->ifr_name);
-
- switch (cmd) {
- case SIOCGIFADDR:
- break;
- case SIOCSIFADDR:
- if (!capable(CAP_NET_ADMIN))
- return -EACCES;
- if (sdn->sdn_family != AF_DECnet)
- return -EINVAL;
- break;
- default:
- return -EINVAL;
- }
-
- rtnl_lock();
-
- if ((dev = __dev_get_by_name(&init_net, ifr->ifr_name)) == NULL) {
- ret = -ENODEV;
- goto done;
- }
-
- if ((dn_db = rtnl_dereference(dev->dn_ptr)) != NULL) {
- for (ifap = &dn_db->ifa_list;
- (ifa = rtnl_dereference(*ifap)) != NULL;
- ifap = &ifa->ifa_next)
- if (strcmp(ifr->ifr_name, ifa->ifa_label) == 0)
- break;
- }
-
- if (ifa == NULL && cmd != SIOCSIFADDR) {
- ret = -EADDRNOTAVAIL;
- goto done;
- }
-
- switch (cmd) {
- case SIOCGIFADDR:
- *((__le16 *)sdn->sdn_nodeaddr) = ifa->ifa_local;
- goto rarok;
-
- case SIOCSIFADDR:
- if (!ifa) {
- if ((ifa = dn_dev_alloc_ifa()) == NULL) {
- ret = -ENOBUFS;
- break;
- }
- memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
- } else {
- if (ifa->ifa_local == dn_saddr2dn(sdn))
- break;
- dn_dev_del_ifa(dn_db, ifap, 0);
- }
-
- ifa->ifa_local = ifa->ifa_address = dn_saddr2dn(sdn);
-
- ret = dn_dev_set_ifa(dev, ifa);
- }
-done:
- rtnl_unlock();
-
- return ret;
-rarok:
- if (copy_to_user(arg, ifr, DN_IFREQ_SIZE))
- ret = -EFAULT;
- goto done;
-}
-
-struct net_device *dn_dev_get_default(void)
-{
- struct net_device *dev;
-
- spin_lock(&dndev_lock);
- dev = decnet_default_device;
- if (dev) {
- if (dev->dn_ptr)
- dev_hold(dev);
- else
- dev = NULL;
- }
- spin_unlock(&dndev_lock);
-
- return dev;
-}
-
-int dn_dev_set_default(struct net_device *dev, int force)
-{
- struct net_device *old = NULL;
- int rv = -EBUSY;
- if (!dev->dn_ptr)
- return -ENODEV;
-
- spin_lock(&dndev_lock);
- if (force || decnet_default_device == NULL) {
- old = decnet_default_device;
- decnet_default_device = dev;
- rv = 0;
- }
- spin_unlock(&dndev_lock);
-
- if (old)
- dev_put(old);
- return rv;
-}
-
-static void dn_dev_check_default(struct net_device *dev)
-{
- spin_lock(&dndev_lock);
- if (dev == decnet_default_device) {
- decnet_default_device = NULL;
- } else {
- dev = NULL;
- }
- spin_unlock(&dndev_lock);
-
- if (dev)
- dev_put(dev);
-}
-
-/*
- * Called with RTNL
- */
-static struct dn_dev *dn_dev_by_index(int ifindex)
-{
- struct net_device *dev;
- struct dn_dev *dn_dev = NULL;
-
- dev = __dev_get_by_index(&init_net, ifindex);
- if (dev)
- dn_dev = rtnl_dereference(dev->dn_ptr);
-
- return dn_dev;
-}
-
-static const struct nla_policy dn_ifa_policy[IFA_MAX+1] = {
- [IFA_ADDRESS] = { .type = NLA_U16 },
- [IFA_LOCAL] = { .type = NLA_U16 },
- [IFA_LABEL] = { .type = NLA_STRING,
- .len = IFNAMSIZ - 1 },
- [IFA_FLAGS] = { .type = NLA_U32 },
-};
-
-static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
-{
- struct net *net = sock_net(skb->sk);
- struct nlattr *tb[IFA_MAX+1];
- struct dn_dev *dn_db;
- struct ifaddrmsg *ifm;
- struct dn_ifaddr *ifa;
- struct dn_ifaddr __rcu **ifap;
- int err = -EINVAL;
-
- if (!netlink_capable(skb, CAP_NET_ADMIN))
- return -EPERM;
-
- if (!net_eq(net, &init_net))
- goto errout;
-
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy,
- extack);
- if (err < 0)
- goto errout;
-
- err = -ENODEV;
- ifm = nlmsg_data(nlh);
- if ((dn_db = dn_dev_by_index(ifm->ifa_index)) == NULL)
- goto errout;
-
- err = -EADDRNOTAVAIL;
- for (ifap = &dn_db->ifa_list;
- (ifa = rtnl_dereference(*ifap)) != NULL;
- ifap = &ifa->ifa_next) {
- if (tb[IFA_LOCAL] &&
- nla_memcmp(tb[IFA_LOCAL], &ifa->ifa_local, 2))
- continue;
-
- if (tb[IFA_LABEL] && nla_strcmp(tb[IFA_LABEL], ifa->ifa_label))
- continue;
-
- dn_dev_del_ifa(dn_db, ifap, 1);
- return 0;
- }
-
-errout:
- return err;
-}
-
-static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
-{
- struct net *net = sock_net(skb->sk);
- struct nlattr *tb[IFA_MAX+1];
- struct net_device *dev;
- struct dn_dev *dn_db;
- struct ifaddrmsg *ifm;
- struct dn_ifaddr *ifa;
- int err;
-
- if (!netlink_capable(skb, CAP_NET_ADMIN))
- return -EPERM;
-
- if (!net_eq(net, &init_net))
- return -EINVAL;
-
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy,
- extack);
- if (err < 0)
- return err;
-
- if (tb[IFA_LOCAL] == NULL)
- return -EINVAL;
-
- ifm = nlmsg_data(nlh);
- if ((dev = __dev_get_by_index(&init_net, ifm->ifa_index)) == NULL)
- return -ENODEV;
-
- if ((dn_db = rtnl_dereference(dev->dn_ptr)) == NULL) {
- dn_db = dn_dev_create(dev, &err);
- if (!dn_db)
- return err;
- }
-
- if ((ifa = dn_dev_alloc_ifa()) == NULL)
- return -ENOBUFS;
-
- if (tb[IFA_ADDRESS] == NULL)
- tb[IFA_ADDRESS] = tb[IFA_LOCAL];
-
- ifa->ifa_local = nla_get_le16(tb[IFA_LOCAL]);
- ifa->ifa_address = nla_get_le16(tb[IFA_ADDRESS]);
- ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) :
- ifm->ifa_flags;
- ifa->ifa_scope = ifm->ifa_scope;
- ifa->ifa_dev = dn_db;
-
- if (tb[IFA_LABEL])
- nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
- else
- memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
-
- err = dn_dev_insert_ifa(dn_db, ifa);
- if (err)
- dn_dev_free_ifa(ifa);
-
- return err;
-}
-
-static inline size_t dn_ifaddr_nlmsg_size(void)
-{
- return NLMSG_ALIGN(sizeof(struct ifaddrmsg))
- + nla_total_size(IFNAMSIZ) /* IFA_LABEL */
- + nla_total_size(2) /* IFA_ADDRESS */
- + nla_total_size(2) /* IFA_LOCAL */
- + nla_total_size(4); /* IFA_FLAGS */
-}
-
-static int dn_nl_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa,
- u32 portid, u32 seq, int event, unsigned int flags)
-{
- struct ifaddrmsg *ifm;
- struct nlmsghdr *nlh;
- u32 ifa_flags = ifa->ifa_flags | IFA_F_PERMANENT;
-
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
- if (nlh == NULL)
- return -EMSGSIZE;
-
- ifm = nlmsg_data(nlh);
- ifm->ifa_family = AF_DECnet;
- ifm->ifa_prefixlen = 16;
- ifm->ifa_flags = ifa_flags;
- ifm->ifa_scope = ifa->ifa_scope;
- ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
-
- if ((ifa->ifa_address &&
- nla_put_le16(skb, IFA_ADDRESS, ifa->ifa_address)) ||
- (ifa->ifa_local &&
- nla_put_le16(skb, IFA_LOCAL, ifa->ifa_local)) ||
- (ifa->ifa_label[0] &&
- nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
- nla_put_u32(skb, IFA_FLAGS, ifa_flags))
- goto nla_put_failure;
- nlmsg_end(skb, nlh);
- return 0;
-
-nla_put_failure:
- nlmsg_cancel(skb, nlh);
- return -EMSGSIZE;
-}
-
-static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa)
-{
- struct sk_buff *skb;
- int err = -ENOBUFS;
-
- skb = alloc_skb(dn_ifaddr_nlmsg_size(), GFP_KERNEL);
- if (skb == NULL)
- goto errout;
-
- err = dn_nl_fill_ifaddr(skb, ifa, 0, 0, event, 0);
- if (err < 0) {
- /* -EMSGSIZE implies BUG in dn_ifaddr_nlmsg_size() */
- WARN_ON(err == -EMSGSIZE);
- kfree_skb(skb);
- goto errout;
- }
- rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL);
- return;
-errout:
- if (err < 0)
- rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_IFADDR, err);
-}
-
-static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct net *net = sock_net(skb->sk);
- int idx, dn_idx = 0, skip_ndevs, skip_naddr;
- struct net_device *dev;
- struct dn_dev *dn_db;
- struct dn_ifaddr *ifa;
-
- if (!net_eq(net, &init_net))
- return 0;
-
- skip_ndevs = cb->args[0];
- skip_naddr = cb->args[1];
-
- idx = 0;
- rcu_read_lock();
- for_each_netdev_rcu(&init_net, dev) {
- if (idx < skip_ndevs)
- goto cont;
- else if (idx > skip_ndevs) {
- /* Only skip over addresses for first dev dumped
- * in this iteration (idx == skip_ndevs) */
- skip_naddr = 0;
- }
-
- if ((dn_db = rcu_dereference(dev->dn_ptr)) == NULL)
- goto cont;
-
- for (ifa = rcu_dereference(dn_db->ifa_list), dn_idx = 0; ifa;
- ifa = rcu_dereference(ifa->ifa_next), dn_idx++) {
- if (dn_idx < skip_naddr)
- continue;
-
- if (dn_nl_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, RTM_NEWADDR,
- NLM_F_MULTI) < 0)
- goto done;
- }
-cont:
- idx++;
- }
-done:
- rcu_read_unlock();
- cb->args[0] = idx;
- cb->args[1] = dn_idx;
-
- return skb->len;
-}
-
-static int dn_dev_get_first(struct net_device *dev, __le16 *addr)
-{
- struct dn_dev *dn_db;
- struct dn_ifaddr *ifa;
- int rv = -ENODEV;
-
- rcu_read_lock();
- dn_db = rcu_dereference(dev->dn_ptr);
- if (dn_db == NULL)
- goto out;
-
- ifa = rcu_dereference(dn_db->ifa_list);
- if (ifa != NULL) {
- *addr = ifa->ifa_local;
- rv = 0;
- }
-out:
- rcu_read_unlock();
- return rv;
-}
-
-/*
- * Find a default address to bind to.
- *
- * This is one of those areas where the initial VMS concepts don't really
- * map onto the Linux concepts, and since we introduced multiple addresses
- * per interface we have to cope with slightly odd ways of finding out what
- * "our address" really is. Mostly it's not a problem; for this we just guess
- * a sensible default. Eventually the routing code will take care of all the
- * nasties for us I hope.
- */
-int dn_dev_bind_default(__le16 *addr)
-{
- struct net_device *dev;
- int rv;
- dev = dn_dev_get_default();
-last_chance:
- if (dev) {
- rv = dn_dev_get_first(dev, addr);
- dev_put(dev);
- if (rv == 0 || dev == init_net.loopback_dev)
- return rv;
- }
- dev = init_net.loopback_dev;
- dev_hold(dev);
- goto last_chance;
-}
-
-static void dn_send_endnode_hello(struct net_device *dev, struct dn_ifaddr *ifa)
-{
- struct endnode_hello_message *msg;
- struct sk_buff *skb = NULL;
- __le16 *pktlen;
- struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
-
- if ((skb = dn_alloc_skb(NULL, sizeof(*msg), GFP_ATOMIC)) == NULL)
- return;
-
- skb->dev = dev;
-
- msg = skb_put(skb, sizeof(*msg));
-
- msg->msgflg = 0x0D;
- memcpy(msg->tiver, dn_eco_version, 3);
- dn_dn2eth(msg->id, ifa->ifa_local);
- msg->iinfo = DN_RT_INFO_ENDN;
- msg->blksize = cpu_to_le16(mtu2blksize(dev));
- msg->area = 0x00;
- memset(msg->seed, 0, 8);
- memcpy(msg->neighbor, dn_hiord, ETH_ALEN);
-
- if (dn_db->router) {
- struct dn_neigh *dn = (struct dn_neigh *)dn_db->router;
- dn_dn2eth(msg->neighbor, dn->addr);
- }
-
- msg->timer = cpu_to_le16((unsigned short)dn_db->parms.t3);
- msg->mpd = 0x00;
- msg->datalen = 0x02;
- memset(msg->data, 0xAA, 2);
-
- pktlen = skb_push(skb, 2);
- *pktlen = cpu_to_le16(skb->len - 2);
-
- skb_reset_network_header(skb);
-
- dn_rt_finish_output(skb, dn_rt_all_rt_mcast, msg->id);
-}
-
-
-#define DRDELAY (5 * HZ)
-
-static int dn_am_i_a_router(struct dn_neigh *dn, struct dn_dev *dn_db, struct dn_ifaddr *ifa)
-{
- /* First check time since device went up */
- if (time_before(jiffies, dn_db->uptime + DRDELAY))
- return 0;
-
- /* If there is no router, then yes... */
- if (!dn_db->router)
- return 1;
-
- /* otherwise only if we have a higher priority or.. */
- if (dn->priority < dn_db->parms.priority)
- return 1;
-
- /* if we have equal priority and a higher node number */
- if (dn->priority != dn_db->parms.priority)
- return 0;
-
- if (le16_to_cpu(dn->addr) < le16_to_cpu(ifa->ifa_local))
- return 1;
-
- return 0;
-}
-
-static void dn_send_router_hello(struct net_device *dev, struct dn_ifaddr *ifa)
-{
- int n;
- struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
- struct dn_neigh *dn = (struct dn_neigh *)dn_db->router;
- struct sk_buff *skb;
- size_t size;
- unsigned char *ptr;
- unsigned char *i1, *i2;
- __le16 *pktlen;
- char *src;
-
- if (mtu2blksize(dev) < (26 + 7))
- return;
-
- n = mtu2blksize(dev) - 26;
- n /= 7;
-
- if (n > 32)
- n = 32;
-
- size = 2 + 26 + 7 * n;
-
- if ((skb = dn_alloc_skb(NULL, size, GFP_ATOMIC)) == NULL)
- return;
-
- skb->dev = dev;
- ptr = skb_put(skb, size);
-
- *ptr++ = DN_RT_PKT_CNTL | DN_RT_PKT_ERTH;
- *ptr++ = 2; /* ECO */
- *ptr++ = 0;
- *ptr++ = 0;
- dn_dn2eth(ptr, ifa->ifa_local);
- src = ptr;
- ptr += ETH_ALEN;
- *ptr++ = dn_db->parms.forwarding == 1 ?
- DN_RT_INFO_L1RT : DN_RT_INFO_L2RT;
- *((__le16 *)ptr) = cpu_to_le16(mtu2blksize(dev));
- ptr += 2;
- *ptr++ = dn_db->parms.priority; /* Priority */
- *ptr++ = 0; /* Area: Reserved */
- *((__le16 *)ptr) = cpu_to_le16((unsigned short)dn_db->parms.t3);
- ptr += 2;
- *ptr++ = 0; /* MPD: Reserved */
- i1 = ptr++;
- memset(ptr, 0, 7); /* Name: Reserved */
- ptr += 7;
- i2 = ptr++;
-
- n = dn_neigh_elist(dev, ptr, n);
-
- *i2 = 7 * n;
- *i1 = 8 + *i2;
-
- skb_trim(skb, (27 + *i2));
-
- pktlen = skb_push(skb, 2);
- *pktlen = cpu_to_le16(skb->len - 2);
-
- skb_reset_network_header(skb);
-
- if (dn_am_i_a_router(dn, dn_db, ifa)) {
- struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC);
- if (skb2) {
- dn_rt_finish_output(skb2, dn_rt_all_end_mcast, src);
- }
- }
-
- dn_rt_finish_output(skb, dn_rt_all_rt_mcast, src);
-}
-
-static void dn_send_brd_hello(struct net_device *dev, struct dn_ifaddr *ifa)
-{
- struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
-
- if (dn_db->parms.forwarding == 0)
- dn_send_endnode_hello(dev, ifa);
- else
- dn_send_router_hello(dev, ifa);
-}
-
-static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa)
-{
- int tdlen = 16;
- int size = dev->hard_header_len + 2 + 4 + tdlen;
- struct sk_buff *skb = dn_alloc_skb(NULL, size, GFP_ATOMIC);
- int i;
- unsigned char *ptr;
- char src[ETH_ALEN];
-
- if (skb == NULL)
- return ;
-
- skb->dev = dev;
- skb_push(skb, dev->hard_header_len);
- ptr = skb_put(skb, 2 + 4 + tdlen);
-
- *ptr++ = DN_RT_PKT_HELO;
- *((__le16 *)ptr) = ifa->ifa_local;
- ptr += 2;
- *ptr++ = tdlen;
-
- for(i = 0; i < tdlen; i++)
- *ptr++ = 0252;
-
- dn_dn2eth(src, ifa->ifa_local);
- dn_rt_finish_output(skb, dn_rt_all_rt_mcast, src);
-}
-
-static int dn_eth_up(struct net_device *dev)
-{
- struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
-
- if (dn_db->parms.forwarding == 0)
- dev_mc_add(dev, dn_rt_all_end_mcast);
- else
- dev_mc_add(dev, dn_rt_all_rt_mcast);
-
- dn_db->use_long = 1;
-
- return 0;
-}
-
-static void dn_eth_down(struct net_device *dev)
-{
- struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
-
- if (dn_db->parms.forwarding == 0)
- dev_mc_del(dev, dn_rt_all_end_mcast);
- else
- dev_mc_del(dev, dn_rt_all_rt_mcast);
-}
-
-static void dn_dev_set_timer(struct net_device *dev);
-
-static void dn_dev_timer_func(struct timer_list *t)
-{
- struct dn_dev *dn_db = from_timer(dn_db, t, timer);
- struct net_device *dev;
- struct dn_ifaddr *ifa;
-
- rcu_read_lock();
- dev = dn_db->dev;
- if (dn_db->t3 <= dn_db->parms.t2) {
- if (dn_db->parms.timer3) {
- for (ifa = rcu_dereference(dn_db->ifa_list);
- ifa;
- ifa = rcu_dereference(ifa->ifa_next)) {
- if (!(ifa->ifa_flags & IFA_F_SECONDARY))
- dn_db->parms.timer3(dev, ifa);
- }
- }
- dn_db->t3 = dn_db->parms.t3;
- } else {
- dn_db->t3 -= dn_db->parms.t2;
- }
- rcu_read_unlock();
- dn_dev_set_timer(dev);
-}
-
-static void dn_dev_set_timer(struct net_device *dev)
-{
- struct dn_dev *dn_db = rcu_dereference_raw(dev->dn_ptr);
-
- if (dn_db->parms.t2 > dn_db->parms.t3)
- dn_db->parms.t2 = dn_db->parms.t3;
-
- dn_db->timer.expires = jiffies + (dn_db->parms.t2 * HZ);
-
- add_timer(&dn_db->timer);
-}
-
-static struct dn_dev *dn_dev_create(struct net_device *dev, int *err)
-{
- int i;
- struct dn_dev_parms *p = dn_dev_list;
- struct dn_dev *dn_db;
-
- for(i = 0; i < DN_DEV_LIST_SIZE; i++, p++) {
- if (p->type == dev->type)
- break;
- }
-
- *err = -ENODEV;
- if (i == DN_DEV_LIST_SIZE)
- return NULL;
-
- *err = -ENOBUFS;
- if ((dn_db = kzalloc(sizeof(struct dn_dev), GFP_ATOMIC)) == NULL)
- return NULL;
-
- memcpy(&dn_db->parms, p, sizeof(struct dn_dev_parms));
-
- rcu_assign_pointer(dev->dn_ptr, dn_db);
- dn_db->dev = dev;
- timer_setup(&dn_db->timer, dn_dev_timer_func, 0);
-
- dn_db->uptime = jiffies;
-
- dn_db->neigh_parms = neigh_parms_alloc(dev, &dn_neigh_table);
- if (!dn_db->neigh_parms) {
- RCU_INIT_POINTER(dev->dn_ptr, NULL);
- kfree(dn_db);
- return NULL;
- }
-
- if (dn_db->parms.up) {
- if (dn_db->parms.up(dev) < 0) {
- neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms);
- dev->dn_ptr = NULL;
- kfree(dn_db);
- return NULL;
- }
- }
-
- dn_dev_sysctl_register(dev, &dn_db->parms);
-
- dn_dev_set_timer(dev);
-
- *err = 0;
- return dn_db;
-}
-
-
-/*
- * This processes a device up event. We only start up
- * the loopback device & ethernet devices with correct
- * MAC addresses automatically. Others must be started
- * specifically.
- *
- * FIXME: How should we configure the loopback address ? If we could dispense
- * with using decnet_address here and for autobind, it will be one less thing
- * for users to worry about setting up.
- */
-
-void dn_dev_up(struct net_device *dev)
-{
- struct dn_ifaddr *ifa;
- __le16 addr = decnet_address;
- int maybe_default = 0;
- struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr);
-
- if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK))
- return;
-
- /*
- * Need to ensure that loopback device has a dn_db attached to it
- * to allow creation of neighbours against it, even though it might
- * not have a local address of its own. Might as well do the same for
- * all autoconfigured interfaces.
- */
- if (dn_db == NULL) {
- int err;
- dn_db = dn_dev_create(dev, &err);
- if (dn_db == NULL)
- return;
- }
-
- if (dev->type == ARPHRD_ETHER) {
- if (memcmp(dev->dev_addr, dn_hiord, 4) != 0)
- return;
- addr = dn_eth2dn(dev->dev_addr);
- maybe_default = 1;
- }
-
- if (addr == 0)
- return;
-
- if ((ifa = dn_dev_alloc_ifa()) == NULL)
- return;
-
- ifa->ifa_local = ifa->ifa_address = addr;
- ifa->ifa_flags = 0;
- ifa->ifa_scope = RT_SCOPE_UNIVERSE;
- strcpy(ifa->ifa_label, dev->name);
-
- dn_dev_set_ifa(dev, ifa);
-
- /*
- * Automagically set the default device to the first automatically
- * configured ethernet card in the system.
- */
- if (maybe_default) {
- dev_hold(dev);
- if (dn_dev_set_default(dev, 0))
- dev_put(dev);
- }
-}
-
-static void dn_dev_delete(struct net_device *dev)
-{
- struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr);
-
- if (dn_db == NULL)
- return;
-
- del_timer_sync(&dn_db->timer);
- dn_dev_sysctl_unregister(&dn_db->parms);
- dn_dev_check_default(dev);
- neigh_ifdown(&dn_neigh_table, dev);
-
- if (dn_db->parms.down)
- dn_db->parms.down(dev);
-
- dev->dn_ptr = NULL;
-
- neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms);
- neigh_ifdown(&dn_neigh_table, dev);
-
- if (dn_db->router)
- neigh_release(dn_db->router);
- if (dn_db->peer)
- neigh_release(dn_db->peer);
-
- kfree(dn_db);
-}
-
-void dn_dev_down(struct net_device *dev)
-{
- struct dn_dev *dn_db = rtnl_dereference(dev->dn_ptr);
- struct dn_ifaddr *ifa;
-
- if (dn_db == NULL)
- return;
-
- while ((ifa = rtnl_dereference(dn_db->ifa_list)) != NULL) {
- dn_dev_del_ifa(dn_db, &dn_db->ifa_list, 0);
- dn_dev_free_ifa(ifa);
- }
-
- dn_dev_delete(dev);
-}
-
-void dn_dev_init_pkt(struct sk_buff *skb)
-{
-}
-
-void dn_dev_veri_pkt(struct sk_buff *skb)
-{
-}
-
-void dn_dev_hello(struct sk_buff *skb)
-{
-}
-
-void dn_dev_devices_off(void)
-{
- struct net_device *dev;
-
- rtnl_lock();
- for_each_netdev(&init_net, dev)
- dn_dev_down(dev);
- rtnl_unlock();
-
-}
-
-void dn_dev_devices_on(void)
-{
- struct net_device *dev;
-
- rtnl_lock();
- for_each_netdev(&init_net, dev) {
- if (dev->flags & IFF_UP)
- dn_dev_up(dev);
- }
- rtnl_unlock();
-}
-
-int register_dnaddr_notifier(struct notifier_block *nb)
-{
- return blocking_notifier_chain_register(&dnaddr_chain, nb);
-}
-
-int unregister_dnaddr_notifier(struct notifier_block *nb)
-{
- return blocking_notifier_chain_unregister(&dnaddr_chain, nb);
-}
-
-#ifdef CONFIG_PROC_FS
-static inline int is_dn_dev(struct net_device *dev)
-{
- return dev->dn_ptr != NULL;
-}
-
-static void *dn_dev_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(RCU)
-{
- int i;
- struct net_device *dev;
-
- rcu_read_lock();
-
- if (*pos == 0)
- return SEQ_START_TOKEN;
-
- i = 1;
- for_each_netdev_rcu(&init_net, dev) {
- if (!is_dn_dev(dev))
- continue;
-
- if (i++ == *pos)
- return dev;
- }
-
- return NULL;
-}
-
-static void *dn_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- struct net_device *dev;
-
- ++*pos;
-
- dev = v;
- if (v == SEQ_START_TOKEN)
- dev = net_device_entry(&init_net.dev_base_head);
-
- for_each_netdev_continue_rcu(&init_net, dev) {
- if (!is_dn_dev(dev))
- continue;
-
- return dev;
- }
-
- return NULL;
-}
-
-static void dn_dev_seq_stop(struct seq_file *seq, void *v)
- __releases(RCU)
-{
- rcu_read_unlock();
-}
-
-static char *dn_type2asc(char type)
-{
- switch (type) {
- case DN_DEV_BCAST:
- return "B";
- case DN_DEV_UCAST:
- return "U";
- case DN_DEV_MPOINT:
- return "M";
- }
-
- return "?";
-}
-
-static int dn_dev_seq_show(struct seq_file *seq, void *v)
-{
- if (v == SEQ_START_TOKEN)
- seq_puts(seq, "Name Flags T1 Timer1 T3 Timer3 BlkSize Pri State DevType Router Peer\n");
- else {
- struct net_device *dev = v;
- char peer_buf[DN_ASCBUF_LEN];
- char router_buf[DN_ASCBUF_LEN];
- struct dn_dev *dn_db = rcu_dereference(dev->dn_ptr);
-
- seq_printf(seq, "%-8s %1s %04u %04u %04lu %04lu"
- " %04hu %03d %02x %-10s %-7s %-7s\n",
- dev->name ? dev->name : "???",
- dn_type2asc(dn_db->parms.mode),
- 0, 0,
- dn_db->t3, dn_db->parms.t3,
- mtu2blksize(dev),
- dn_db->parms.priority,
- dn_db->parms.state, dn_db->parms.name,
- dn_db->router ? dn_addr2asc(le16_to_cpu(*(__le16 *)dn_db->router->primary_key), router_buf) : "",
- dn_db->peer ? dn_addr2asc(le16_to_cpu(*(__le16 *)dn_db->peer->primary_key), peer_buf) : "");
- }
- return 0;
-}
-
-static const struct seq_operations dn_dev_seq_ops = {
- .start = dn_dev_seq_start,
- .next = dn_dev_seq_next,
- .stop = dn_dev_seq_stop,
- .show = dn_dev_seq_show,
-};
-#endif /* CONFIG_PROC_FS */
-
-static int addr[2];
-module_param_array(addr, int, NULL, 0444);
-MODULE_PARM_DESC(addr, "The DECnet address of this machine: area,node");
-
-void __init dn_dev_init(void)
-{
- if (addr[0] > 63 || addr[0] < 0) {
- printk(KERN_ERR "DECnet: Area must be between 0 and 63");
- return;
- }
-
- if (addr[1] > 1023 || addr[1] < 0) {
- printk(KERN_ERR "DECnet: Node must be between 0 and 1023");
- return;
- }
-
- decnet_address = cpu_to_le16((addr[0] << 10) | addr[1]);
-
- dn_dev_devices_on();
-
- rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_NEWADDR,
- dn_nl_newaddr, NULL, 0);
- rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELADDR,
- dn_nl_deladdr, NULL, 0);
- rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETADDR,
- NULL, dn_nl_dump_ifaddr, 0);
-
- proc_create_seq("decnet_dev", 0444, init_net.proc_net, &dn_dev_seq_ops);
-
-#ifdef CONFIG_SYSCTL
- {
- int i;
- for(i = 0; i < DN_DEV_LIST_SIZE; i++)
- dn_dev_sysctl_register(NULL, &dn_dev_list[i]);
- }
-#endif /* CONFIG_SYSCTL */
-}
-
-void __exit dn_dev_cleanup(void)
-{
-#ifdef CONFIG_SYSCTL
- {
- int i;
- for(i = 0; i < DN_DEV_LIST_SIZE; i++)
- dn_dev_sysctl_unregister(&dn_dev_list[i]);
- }
-#endif /* CONFIG_SYSCTL */
-
- remove_proc_entry("decnet_dev", init_net.proc_net);
-
- dn_dev_devices_off();
-}
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
deleted file mode 100644
index f78fe58eafc8..000000000000
--- a/net/decnet/dn_fib.c
+++ /dev/null
@@ -1,799 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Routing Forwarding Information Base (Glue/Info List)
- *
- * Author: Steve Whitehouse <SteveW@ACM.org>
- *
- *
- * Changes:
- * Alexey Kuznetsov : SMP locking changes
- * Steve Whitehouse : Rewrote it... Well to be more correct, I
- * copied most of it from the ipv4 fib code.
- * Steve Whitehouse : Updated it in style and fixed a few bugs
- * which were fixed in the ipv4 code since
- * this code was copied from it.
- *
- */
-#include <linux/string.h>
-#include <linux/net.h>
-#include <linux/socket.h>
-#include <linux/slab.h>
-#include <linux/sockios.h>
-#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
-#include <linux/proc_fs.h>
-#include <linux/netdevice.h>
-#include <linux/timer.h>
-#include <linux/spinlock.h>
-#include <linux/atomic.h>
-#include <linux/uaccess.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/flow.h>
-#include <net/fib_rules.h>
-#include <net/dn.h>
-#include <net/dn_route.h>
-#include <net/dn_fib.h>
-#include <net/dn_neigh.h>
-#include <net/dn_dev.h>
-#include <net/nexthop.h>
-
-#define RT_MIN_TABLE 1
-
-#define for_fib_info() { struct dn_fib_info *fi;\
- for(fi = dn_fib_info_list; fi; fi = fi->fib_next)
-#define endfor_fib_info() }
-
-#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\
- for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
-
-#define change_nexthops(fi) { int nhsel; struct dn_fib_nh *nh;\
- for(nhsel = 0, nh = (struct dn_fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
-
-#define endfor_nexthops(fi) }
-
-static DEFINE_SPINLOCK(dn_fib_multipath_lock);
-static struct dn_fib_info *dn_fib_info_list;
-static DEFINE_SPINLOCK(dn_fib_info_lock);
-
-static struct
-{
- int error;
- u8 scope;
-} dn_fib_props[RTN_MAX+1] = {
- [RTN_UNSPEC] = { .error = 0, .scope = RT_SCOPE_NOWHERE },
- [RTN_UNICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE },
- [RTN_LOCAL] = { .error = 0, .scope = RT_SCOPE_HOST },
- [RTN_BROADCAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
- [RTN_ANYCAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
- [RTN_MULTICAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
- [RTN_BLACKHOLE] = { .error = -EINVAL, .scope = RT_SCOPE_UNIVERSE },
- [RTN_UNREACHABLE] = { .error = -EHOSTUNREACH, .scope = RT_SCOPE_UNIVERSE },
- [RTN_PROHIBIT] = { .error = -EACCES, .scope = RT_SCOPE_UNIVERSE },
- [RTN_THROW] = { .error = -EAGAIN, .scope = RT_SCOPE_UNIVERSE },
- [RTN_NAT] = { .error = 0, .scope = RT_SCOPE_NOWHERE },
- [RTN_XRESOLVE] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE },
-};
-
-static int dn_fib_sync_down(__le16 local, struct net_device *dev, int force);
-static int dn_fib_sync_up(struct net_device *dev);
-
-void dn_fib_free_info(struct dn_fib_info *fi)
-{
- if (fi->fib_dead == 0) {
- printk(KERN_DEBUG "DECnet: BUG! Attempt to free alive dn_fib_info\n");
- return;
- }
-
- change_nexthops(fi) {
- if (nh->nh_dev)
- dev_put(nh->nh_dev);
- nh->nh_dev = NULL;
- } endfor_nexthops(fi);
- kfree(fi);
-}
-
-void dn_fib_release_info(struct dn_fib_info *fi)
-{
- spin_lock(&dn_fib_info_lock);
- if (fi && --fi->fib_treeref == 0) {
- if (fi->fib_next)
- fi->fib_next->fib_prev = fi->fib_prev;
- if (fi->fib_prev)
- fi->fib_prev->fib_next = fi->fib_next;
- if (fi == dn_fib_info_list)
- dn_fib_info_list = fi->fib_next;
- fi->fib_dead = 1;
- dn_fib_info_put(fi);
- }
- spin_unlock(&dn_fib_info_lock);
-}
-
-static inline int dn_fib_nh_comp(const struct dn_fib_info *fi, const struct dn_fib_info *ofi)
-{
- const struct dn_fib_nh *onh = ofi->fib_nh;
-
- for_nexthops(fi) {
- if (nh->nh_oif != onh->nh_oif ||
- nh->nh_gw != onh->nh_gw ||
- nh->nh_scope != onh->nh_scope ||
- nh->nh_weight != onh->nh_weight ||
- ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
- return -1;
- onh++;
- } endfor_nexthops(fi);
- return 0;
-}
-
-static inline struct dn_fib_info *dn_fib_find_info(const struct dn_fib_info *nfi)
-{
- for_fib_info() {
- if (fi->fib_nhs != nfi->fib_nhs)
- continue;
- if (nfi->fib_protocol == fi->fib_protocol &&
- nfi->fib_prefsrc == fi->fib_prefsrc &&
- nfi->fib_priority == fi->fib_priority &&
- memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(fi->fib_metrics)) == 0 &&
- ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
- (nfi->fib_nhs == 0 || dn_fib_nh_comp(fi, nfi) == 0))
- return fi;
- } endfor_fib_info();
- return NULL;
-}
-
-static int dn_fib_count_nhs(const struct nlattr *attr)
-{
- struct rtnexthop *nhp = nla_data(attr);
- int nhs = 0, nhlen = nla_len(attr);
-
- while (rtnh_ok(nhp, nhlen)) {
- nhs++;
- nhp = rtnh_next(nhp, &nhlen);
- }
-
- /* leftover implies invalid nexthop configuration, discard it */
- return nhlen > 0 ? 0 : nhs;
-}
-
-static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct nlattr *attr,
- const struct rtmsg *r)
-{
- struct rtnexthop *nhp = nla_data(attr);
- int nhlen = nla_len(attr);
-
- change_nexthops(fi) {
- int attrlen;
-
- if (!rtnh_ok(nhp, nhlen))
- return -EINVAL;
-
- nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
- nh->nh_oif = nhp->rtnh_ifindex;
- nh->nh_weight = nhp->rtnh_hops + 1;
-
- attrlen = rtnh_attrlen(nhp);
- if (attrlen > 0) {
- struct nlattr *gw_attr;
-
- gw_attr = nla_find((struct nlattr *) (nhp + 1), attrlen, RTA_GATEWAY);
- nh->nh_gw = gw_attr ? nla_get_le16(gw_attr) : 0;
- }
-
- nhp = rtnh_next(nhp, &nhlen);
- } endfor_nexthops(fi);
-
- return 0;
-}
-
-
-static int dn_fib_check_nh(const struct rtmsg *r, struct dn_fib_info *fi, struct dn_fib_nh *nh)
-{
- int err;
-
- if (nh->nh_gw) {
- struct flowidn fld;
- struct dn_fib_res res;
-
- if (nh->nh_flags&RTNH_F_ONLINK) {
- struct net_device *dev;
-
- if (r->rtm_scope >= RT_SCOPE_LINK)
- return -EINVAL;
- if (dnet_addr_type(nh->nh_gw) != RTN_UNICAST)
- return -EINVAL;
- if ((dev = __dev_get_by_index(&init_net, nh->nh_oif)) == NULL)
- return -ENODEV;
- if (!(dev->flags&IFF_UP))
- return -ENETDOWN;
- nh->nh_dev = dev;
- dev_hold(dev);
- nh->nh_scope = RT_SCOPE_LINK;
- return 0;
- }
-
- memset(&fld, 0, sizeof(fld));
- fld.daddr = nh->nh_gw;
- fld.flowidn_oif = nh->nh_oif;
- fld.flowidn_scope = r->rtm_scope + 1;
-
- if (fld.flowidn_scope < RT_SCOPE_LINK)
- fld.flowidn_scope = RT_SCOPE_LINK;
-
- if ((err = dn_fib_lookup(&fld, &res)) != 0)
- return err;
-
- err = -EINVAL;
- if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
- goto out;
- nh->nh_scope = res.scope;
- nh->nh_oif = DN_FIB_RES_OIF(res);
- nh->nh_dev = DN_FIB_RES_DEV(res);
- if (nh->nh_dev == NULL)
- goto out;
- dev_hold(nh->nh_dev);
- err = -ENETDOWN;
- if (!(nh->nh_dev->flags & IFF_UP))
- goto out;
- err = 0;
-out:
- dn_fib_res_put(&res);
- return err;
- } else {
- struct net_device *dev;
-
- if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
- return -EINVAL;
-
- dev = __dev_get_by_index(&init_net, nh->nh_oif);
- if (dev == NULL || dev->dn_ptr == NULL)
- return -ENODEV;
- if (!(dev->flags&IFF_UP))
- return -ENETDOWN;
- nh->nh_dev = dev;
- dev_hold(nh->nh_dev);
- nh->nh_scope = RT_SCOPE_HOST;
- }
-
- return 0;
-}
-
-
-struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct nlattr *attrs[],
- const struct nlmsghdr *nlh, int *errp)
-{
- int err;
- struct dn_fib_info *fi = NULL;
- struct dn_fib_info *ofi;
- int nhs = 1;
-
- if (r->rtm_type > RTN_MAX)
- goto err_inval;
-
- if (dn_fib_props[r->rtm_type].scope > r->rtm_scope)
- goto err_inval;
-
- if (attrs[RTA_MULTIPATH] &&
- (nhs = dn_fib_count_nhs(attrs[RTA_MULTIPATH])) == 0)
- goto err_inval;
-
- fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct dn_fib_nh), GFP_KERNEL);
- err = -ENOBUFS;
- if (fi == NULL)
- goto failure;
-
- fi->fib_protocol = r->rtm_protocol;
- fi->fib_nhs = nhs;
- fi->fib_flags = r->rtm_flags;
-
- if (attrs[RTA_PRIORITY])
- fi->fib_priority = nla_get_u32(attrs[RTA_PRIORITY]);
-
- if (attrs[RTA_METRICS]) {
- struct nlattr *attr;
- int rem;
-
- nla_for_each_nested(attr, attrs[RTA_METRICS], rem) {
- int type = nla_type(attr);
-
- if (type) {
- if (type > RTAX_MAX || type == RTAX_CC_ALGO ||
- nla_len(attr) < 4)
- goto err_inval;
-
- fi->fib_metrics[type-1] = nla_get_u32(attr);
- }
- }
- }
-
- if (attrs[RTA_PREFSRC])
- fi->fib_prefsrc = nla_get_le16(attrs[RTA_PREFSRC]);
-
- if (attrs[RTA_MULTIPATH]) {
- if ((err = dn_fib_get_nhs(fi, attrs[RTA_MULTIPATH], r)) != 0)
- goto failure;
-
- if (attrs[RTA_OIF] &&
- fi->fib_nh->nh_oif != nla_get_u32(attrs[RTA_OIF]))
- goto err_inval;
-
- if (attrs[RTA_GATEWAY] &&
- fi->fib_nh->nh_gw != nla_get_le16(attrs[RTA_GATEWAY]))
- goto err_inval;
- } else {
- struct dn_fib_nh *nh = fi->fib_nh;
-
- if (attrs[RTA_OIF])
- nh->nh_oif = nla_get_u32(attrs[RTA_OIF]);
-
- if (attrs[RTA_GATEWAY])
- nh->nh_gw = nla_get_le16(attrs[RTA_GATEWAY]);
-
- nh->nh_flags = r->rtm_flags;
- nh->nh_weight = 1;
- }
-
- if (r->rtm_type == RTN_NAT) {
- if (!attrs[RTA_GATEWAY] || nhs != 1 || attrs[RTA_OIF])
- goto err_inval;
-
- fi->fib_nh->nh_gw = nla_get_le16(attrs[RTA_GATEWAY]);
- goto link_it;
- }
-
- if (dn_fib_props[r->rtm_type].error) {
- if (attrs[RTA_GATEWAY] || attrs[RTA_OIF] || attrs[RTA_MULTIPATH])
- goto err_inval;
-
- goto link_it;
- }
-
- if (r->rtm_scope > RT_SCOPE_HOST)
- goto err_inval;
-
- if (r->rtm_scope == RT_SCOPE_HOST) {
- struct dn_fib_nh *nh = fi->fib_nh;
-
- /* Local address is added */
- if (nhs != 1 || nh->nh_gw)
- goto err_inval;
- nh->nh_scope = RT_SCOPE_NOWHERE;
- nh->nh_dev = dev_get_by_index(&init_net, fi->fib_nh->nh_oif);
- err = -ENODEV;
- if (nh->nh_dev == NULL)
- goto failure;
- } else {
- change_nexthops(fi) {
- if ((err = dn_fib_check_nh(r, fi, nh)) != 0)
- goto failure;
- } endfor_nexthops(fi)
- }
-
- if (fi->fib_prefsrc) {
- if (r->rtm_type != RTN_LOCAL || !attrs[RTA_DST] ||
- fi->fib_prefsrc != nla_get_le16(attrs[RTA_DST]))
- if (dnet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
- goto err_inval;
- }
-
-link_it:
- if ((ofi = dn_fib_find_info(fi)) != NULL) {
- fi->fib_dead = 1;
- dn_fib_free_info(fi);
- ofi->fib_treeref++;
- return ofi;
- }
-
- fi->fib_treeref++;
- refcount_set(&fi->fib_clntref, 1);
- spin_lock(&dn_fib_info_lock);
- fi->fib_next = dn_fib_info_list;
- fi->fib_prev = NULL;
- if (dn_fib_info_list)
- dn_fib_info_list->fib_prev = fi;
- dn_fib_info_list = fi;
- spin_unlock(&dn_fib_info_lock);
- return fi;
-
-err_inval:
- err = -EINVAL;
-
-failure:
- *errp = err;
- if (fi) {
- fi->fib_dead = 1;
- dn_fib_free_info(fi);
- }
-
- return NULL;
-}
-
-int dn_fib_semantic_match(int type, struct dn_fib_info *fi, const struct flowidn *fld, struct dn_fib_res *res)
-{
- int err = dn_fib_props[type].error;
-
- if (err == 0) {
- if (fi->fib_flags & RTNH_F_DEAD)
- return 1;
-
- res->fi = fi;
-
- switch (type) {
- case RTN_NAT:
- DN_FIB_RES_RESET(*res);
- refcount_inc(&fi->fib_clntref);
- return 0;
- case RTN_UNICAST:
- case RTN_LOCAL:
- for_nexthops(fi) {
- if (nh->nh_flags & RTNH_F_DEAD)
- continue;
- if (!fld->flowidn_oif ||
- fld->flowidn_oif == nh->nh_oif)
- break;
- }
- if (nhsel < fi->fib_nhs) {
- res->nh_sel = nhsel;
- refcount_inc(&fi->fib_clntref);
- return 0;
- }
- endfor_nexthops(fi);
- res->fi = NULL;
- return 1;
- default:
- net_err_ratelimited("DECnet: impossible routing event : dn_fib_semantic_match type=%d\n",
- type);
- res->fi = NULL;
- return -EINVAL;
- }
- }
- return err;
-}
-
-void dn_fib_select_multipath(const struct flowidn *fld, struct dn_fib_res *res)
-{
- struct dn_fib_info *fi = res->fi;
- int w;
-
- spin_lock_bh(&dn_fib_multipath_lock);
- if (fi->fib_power <= 0) {
- int power = 0;
- change_nexthops(fi) {
- if (!(nh->nh_flags&RTNH_F_DEAD)) {
- power += nh->nh_weight;
- nh->nh_power = nh->nh_weight;
- }
- } endfor_nexthops(fi);
- fi->fib_power = power;
- if (power < 0) {
- spin_unlock_bh(&dn_fib_multipath_lock);
- res->nh_sel = 0;
- return;
- }
- }
-
- w = jiffies % fi->fib_power;
-
- change_nexthops(fi) {
- if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
- if ((w -= nh->nh_power) <= 0) {
- nh->nh_power--;
- fi->fib_power--;
- res->nh_sel = nhsel;
- spin_unlock_bh(&dn_fib_multipath_lock);
- return;
- }
- }
- } endfor_nexthops(fi);
- res->nh_sel = 0;
- spin_unlock_bh(&dn_fib_multipath_lock);
-}
-
-static inline u32 rtm_get_table(struct nlattr *attrs[], u8 table)
-{
- if (attrs[RTA_TABLE])
- table = nla_get_u32(attrs[RTA_TABLE]);
-
- return table;
-}
-
-static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
-{
- struct net *net = sock_net(skb->sk);
- struct dn_fib_table *tb;
- struct rtmsg *r = nlmsg_data(nlh);
- struct nlattr *attrs[RTA_MAX+1];
- int err;
-
- if (!netlink_capable(skb, CAP_NET_ADMIN))
- return -EPERM;
-
- if (!net_eq(net, &init_net))
- return -EINVAL;
-
- err = nlmsg_parse(nlh, sizeof(*r), attrs, RTA_MAX, rtm_dn_policy,
- extack);
- if (err < 0)
- return err;
-
- tb = dn_fib_get_table(rtm_get_table(attrs, r->rtm_table), 0);
- if (!tb)
- return -ESRCH;
-
- return tb->delete(tb, r, attrs, nlh, &NETLINK_CB(skb));
-}
-
-static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
-{
- struct net *net = sock_net(skb->sk);
- struct dn_fib_table *tb;
- struct rtmsg *r = nlmsg_data(nlh);
- struct nlattr *attrs[RTA_MAX+1];
- int err;
-
- if (!netlink_capable(skb, CAP_NET_ADMIN))
- return -EPERM;
-
- if (!net_eq(net, &init_net))
- return -EINVAL;
-
- err = nlmsg_parse(nlh, sizeof(*r), attrs, RTA_MAX, rtm_dn_policy,
- extack);
- if (err < 0)
- return err;
-
- tb = dn_fib_get_table(rtm_get_table(attrs, r->rtm_table), 1);
- if (!tb)
- return -ENOBUFS;
-
- return tb->insert(tb, r, attrs, nlh, &NETLINK_CB(skb));
-}
-
-static void fib_magic(int cmd, int type, __le16 dst, int dst_len, struct dn_ifaddr *ifa)
-{
- struct dn_fib_table *tb;
- struct {
- struct nlmsghdr nlh;
- struct rtmsg rtm;
- } req;
- struct {
- struct nlattr hdr;
- __le16 dst;
- } dst_attr = {
- .dst = dst,
- };
- struct {
- struct nlattr hdr;
- __le16 prefsrc;
- } prefsrc_attr = {
- .prefsrc = ifa->ifa_local,
- };
- struct {
- struct nlattr hdr;
- u32 oif;
- } oif_attr = {
- .oif = ifa->ifa_dev->dev->ifindex,
- };
- struct nlattr *attrs[RTA_MAX+1] = {
- [RTA_DST] = (struct nlattr *) &dst_attr,
- [RTA_PREFSRC] = (struct nlattr * ) &prefsrc_attr,
- [RTA_OIF] = (struct nlattr *) &oif_attr,
- };
-
- memset(&req.rtm, 0, sizeof(req.rtm));
-
- if (type == RTN_UNICAST)
- tb = dn_fib_get_table(RT_MIN_TABLE, 1);
- else
- tb = dn_fib_get_table(RT_TABLE_LOCAL, 1);
-
- if (tb == NULL)
- return;
-
- req.nlh.nlmsg_len = sizeof(req);
- req.nlh.nlmsg_type = cmd;
- req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND;
- req.nlh.nlmsg_pid = 0;
- req.nlh.nlmsg_seq = 0;
-
- req.rtm.rtm_dst_len = dst_len;
- req.rtm.rtm_table = tb->n;
- req.rtm.rtm_protocol = RTPROT_KERNEL;
- req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST);
- req.rtm.rtm_type = type;
-
- if (cmd == RTM_NEWROUTE)
- tb->insert(tb, &req.rtm, attrs, &req.nlh, NULL);
- else
- tb->delete(tb, &req.rtm, attrs, &req.nlh, NULL);
-}
-
-static void dn_fib_add_ifaddr(struct dn_ifaddr *ifa)
-{
-
- fib_magic(RTM_NEWROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa);
-
-#if 0
- if (!(dev->flags&IFF_UP))
- return;
- /* In the future, we will want to add default routes here */
-
-#endif
-}
-
-static void dn_fib_del_ifaddr(struct dn_ifaddr *ifa)
-{
- int found_it = 0;
- struct net_device *dev;
- struct dn_dev *dn_db;
- struct dn_ifaddr *ifa2;
-
- ASSERT_RTNL();
-
- /* Scan device list */
- rcu_read_lock();
- for_each_netdev_rcu(&init_net, dev) {
- dn_db = rcu_dereference(dev->dn_ptr);
- if (dn_db == NULL)
- continue;
- for (ifa2 = rcu_dereference(dn_db->ifa_list);
- ifa2 != NULL;
- ifa2 = rcu_dereference(ifa2->ifa_next)) {
- if (ifa2->ifa_local == ifa->ifa_local) {
- found_it = 1;
- break;
- }
- }
- }
- rcu_read_unlock();
-
- if (found_it == 0) {
- fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa);
-
- if (dnet_addr_type(ifa->ifa_local) != RTN_LOCAL) {
- if (dn_fib_sync_down(ifa->ifa_local, NULL, 0))
- dn_fib_flush();
- }
- }
-}
-
-static void dn_fib_disable_addr(struct net_device *dev, int force)
-{
- if (dn_fib_sync_down(0, dev, force))
- dn_fib_flush();
- dn_rt_cache_flush(0);
- neigh_ifdown(&dn_neigh_table, dev);
-}
-
-static int dn_fib_dnaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
-{
- struct dn_ifaddr *ifa = (struct dn_ifaddr *)ptr;
-
- switch (event) {
- case NETDEV_UP:
- dn_fib_add_ifaddr(ifa);
- dn_fib_sync_up(ifa->ifa_dev->dev);
- dn_rt_cache_flush(-1);
- break;
- case NETDEV_DOWN:
- dn_fib_del_ifaddr(ifa);
- if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) {
- dn_fib_disable_addr(ifa->ifa_dev->dev, 1);
- } else {
- dn_rt_cache_flush(-1);
- }
- break;
- }
- return NOTIFY_DONE;
-}
-
-static int dn_fib_sync_down(__le16 local, struct net_device *dev, int force)
-{
- int ret = 0;
- int scope = RT_SCOPE_NOWHERE;
-
- if (force)
- scope = -1;
-
- for_fib_info() {
- /*
- * This makes no sense for DECnet.... we will almost
- * certainly have more than one local address the same
- * over all our interfaces. It needs thinking about
- * some more.
- */
- if (local && fi->fib_prefsrc == local) {
- fi->fib_flags |= RTNH_F_DEAD;
- ret++;
- } else if (dev && fi->fib_nhs) {
- int dead = 0;
-
- change_nexthops(fi) {
- if (nh->nh_flags&RTNH_F_DEAD)
- dead++;
- else if (nh->nh_dev == dev &&
- nh->nh_scope != scope) {
- spin_lock_bh(&dn_fib_multipath_lock);
- nh->nh_flags |= RTNH_F_DEAD;
- fi->fib_power -= nh->nh_power;
- nh->nh_power = 0;
- spin_unlock_bh(&dn_fib_multipath_lock);
- dead++;
- }
- } endfor_nexthops(fi)
- if (dead == fi->fib_nhs) {
- fi->fib_flags |= RTNH_F_DEAD;
- ret++;
- }
- }
- } endfor_fib_info();
- return ret;
-}
-
-
-static int dn_fib_sync_up(struct net_device *dev)
-{
- int ret = 0;
-
- if (!(dev->flags&IFF_UP))
- return 0;
-
- for_fib_info() {
- int alive = 0;
-
- change_nexthops(fi) {
- if (!(nh->nh_flags&RTNH_F_DEAD)) {
- alive++;
- continue;
- }
- if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
- continue;
- if (nh->nh_dev != dev || dev->dn_ptr == NULL)
- continue;
- alive++;
- spin_lock_bh(&dn_fib_multipath_lock);
- nh->nh_power = 0;
- nh->nh_flags &= ~RTNH_F_DEAD;
- spin_unlock_bh(&dn_fib_multipath_lock);
- } endfor_nexthops(fi);
-
- if (alive > 0) {
- fi->fib_flags &= ~RTNH_F_DEAD;
- ret++;
- }
- } endfor_fib_info();
- return ret;
-}
-
-static struct notifier_block dn_fib_dnaddr_notifier = {
- .notifier_call = dn_fib_dnaddr_event,
-};
-
-void __exit dn_fib_cleanup(void)
-{
- dn_fib_table_cleanup();
- dn_fib_rules_cleanup();
-
- unregister_dnaddr_notifier(&dn_fib_dnaddr_notifier);
-}
-
-
-void __init dn_fib_init(void)
-{
- dn_fib_table_init();
- dn_fib_rules_init();
-
- register_dnaddr_notifier(&dn_fib_dnaddr_notifier);
-
- rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_NEWROUTE,
- dn_fib_rtm_newroute, NULL, 0);
- rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELROUTE,
- dn_fib_rtm_delroute, NULL, 0);
-}
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
deleted file mode 100644
index 94b306f6d551..000000000000
--- a/net/decnet/dn_neigh.c
+++ /dev/null
@@ -1,605 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Neighbour Functions (Adjacency Database and
- * On-Ethernet Cache)
- *
- * Author: Steve Whitehouse <SteveW@ACM.org>
- *
- *
- * Changes:
- * Steve Whitehouse : Fixed router listing routine
- * Steve Whitehouse : Added error_report functions
- * Steve Whitehouse : Added default router detection
- * Steve Whitehouse : Hop counts in outgoing messages
- * Steve Whitehouse : Fixed src/dst in outgoing messages so
- * forwarding now stands a good chance of
- * working.
- * Steve Whitehouse : Fixed neighbour states (for now anyway).
- * Steve Whitehouse : Made error_report functions dummies. This
- * is not the right place to return skbs.
- * Steve Whitehouse : Convert to seq_file
- *
- */
-
-#include <linux/net.h>
-#include <linux/module.h>
-#include <linux/socket.h>
-#include <linux/if_arp.h>
-#include <linux/slab.h>
-#include <linux/if_ether.h>
-#include <linux/init.h>
-#include <linux/proc_fs.h>
-#include <linux/string.h>
-#include <linux/netfilter_decnet.h>
-#include <linux/spinlock.h>
-#include <linux/seq_file.h>
-#include <linux/rcupdate.h>
-#include <linux/jhash.h>
-#include <linux/atomic.h>
-#include <net/net_namespace.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/flow.h>
-#include <net/dn.h>
-#include <net/dn_dev.h>
-#include <net/dn_neigh.h>
-#include <net/dn_route.h>
-
-static int dn_neigh_construct(struct neighbour *);
-static void dn_neigh_error_report(struct neighbour *, struct sk_buff *);
-static int dn_neigh_output(struct neighbour *neigh, struct sk_buff *skb);
-
-/*
- * Operations for adding the link layer header.
- */
-static const struct neigh_ops dn_neigh_ops = {
- .family = AF_DECnet,
- .error_report = dn_neigh_error_report,
- .output = dn_neigh_output,
- .connected_output = dn_neigh_output,
-};
-
-static u32 dn_neigh_hash(const void *pkey,
- const struct net_device *dev,
- __u32 *hash_rnd)
-{
- return jhash_2words(*(__u16 *)pkey, 0, hash_rnd[0]);
-}
-
-static bool dn_key_eq(const struct neighbour *neigh, const void *pkey)
-{
- return neigh_key_eq16(neigh, pkey);
-}
-
-struct neigh_table dn_neigh_table = {
- .family = PF_DECnet,
- .entry_size = NEIGH_ENTRY_SIZE(sizeof(struct dn_neigh)),
- .key_len = sizeof(__le16),
- .protocol = cpu_to_be16(ETH_P_DNA_RT),
- .hash = dn_neigh_hash,
- .key_eq = dn_key_eq,
- .constructor = dn_neigh_construct,
- .id = "dn_neigh_cache",
- .parms ={
- .tbl = &dn_neigh_table,
- .reachable_time = 30 * HZ,
- .data = {
- [NEIGH_VAR_MCAST_PROBES] = 0,
- [NEIGH_VAR_UCAST_PROBES] = 0,
- [NEIGH_VAR_APP_PROBES] = 0,
- [NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
- [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
- [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
- [NEIGH_VAR_GC_STALETIME] = 60 * HZ,
- [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
- [NEIGH_VAR_PROXY_QLEN] = 0,
- [NEIGH_VAR_ANYCAST_DELAY] = 0,
- [NEIGH_VAR_PROXY_DELAY] = 0,
- [NEIGH_VAR_LOCKTIME] = 1 * HZ,
- },
- },
- .gc_interval = 30 * HZ,
- .gc_thresh1 = 128,
- .gc_thresh2 = 512,
- .gc_thresh3 = 1024,
-};
-
-static int dn_neigh_construct(struct neighbour *neigh)
-{
- struct net_device *dev = neigh->dev;
- struct dn_neigh *dn = container_of(neigh, struct dn_neigh, n);
- struct dn_dev *dn_db;
- struct neigh_parms *parms;
-
- rcu_read_lock();
- dn_db = rcu_dereference(dev->dn_ptr);
- if (dn_db == NULL) {
- rcu_read_unlock();
- return -EINVAL;
- }
-
- parms = dn_db->neigh_parms;
- if (!parms) {
- rcu_read_unlock();
- return -EINVAL;
- }
-
- __neigh_parms_put(neigh->parms);
- neigh->parms = neigh_parms_clone(parms);
- rcu_read_unlock();
-
- neigh->ops = &dn_neigh_ops;
- neigh->nud_state = NUD_NOARP;
- neigh->output = neigh->ops->connected_output;
-
- if ((dev->type == ARPHRD_IPGRE) || (dev->flags & IFF_POINTOPOINT))
- memcpy(neigh->ha, dev->broadcast, dev->addr_len);
- else if ((dev->type == ARPHRD_ETHER) || (dev->type == ARPHRD_LOOPBACK))
- dn_dn2eth(neigh->ha, dn->addr);
- else {
- net_dbg_ratelimited("Trying to create neigh for hw %d\n",
- dev->type);
- return -EINVAL;
- }
-
- /*
- * Make an estimate of the remote block size by assuming that its
- * two less then the device mtu, which it true for ethernet (and
- * other things which support long format headers) since there is
- * an extra length field (of 16 bits) which isn't part of the
- * ethernet headers and which the DECnet specs won't admit is part
- * of the DECnet routing headers either.
- *
- * If we over estimate here its no big deal, the NSP negotiations
- * will prevent us from sending packets which are too large for the
- * remote node to handle. In any case this figure is normally updated
- * by a hello message in most cases.
- */
- dn->blksize = dev->mtu - 2;
-
- return 0;
-}
-
-static void dn_neigh_error_report(struct neighbour *neigh, struct sk_buff *skb)
-{
- printk(KERN_DEBUG "dn_neigh_error_report: called\n");
- kfree_skb(skb);
-}
-
-static int dn_neigh_output(struct neighbour *neigh, struct sk_buff *skb)
-{
- struct dst_entry *dst = skb_dst(skb);
- struct dn_route *rt = (struct dn_route *)dst;
- struct net_device *dev = neigh->dev;
- char mac_addr[ETH_ALEN];
- unsigned int seq;
- int err;
-
- dn_dn2eth(mac_addr, rt->rt_local_src);
- do {
- seq = read_seqbegin(&neigh->ha_lock);
- err = dev_hard_header(skb, dev, ntohs(skb->protocol),
- neigh->ha, mac_addr, skb->len);
- } while (read_seqretry(&neigh->ha_lock, seq));
-
- if (err >= 0)
- err = dev_queue_xmit(skb);
- else {
- kfree_skb(skb);
- err = -EINVAL;
- }
- return err;
-}
-
-static int dn_neigh_output_packet(struct net *net, struct sock *sk, struct sk_buff *skb)
-{
- struct dst_entry *dst = skb_dst(skb);
- struct dn_route *rt = (struct dn_route *)dst;
- struct neighbour *neigh = rt->n;
-
- return neigh->output(neigh, skb);
-}
-
-/*
- * For talking to broadcast devices: Ethernet & PPP
- */
-static int dn_long_output(struct neighbour *neigh, struct sock *sk,
- struct sk_buff *skb)
-{
- struct net_device *dev = neigh->dev;
- int headroom = dev->hard_header_len + sizeof(struct dn_long_packet) + 3;
- unsigned char *data;
- struct dn_long_packet *lp;
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
-
-
- if (skb_headroom(skb) < headroom) {
- struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom);
- if (skb2 == NULL) {
- net_crit_ratelimited("dn_long_output: no memory\n");
- kfree_skb(skb);
- return -ENOBUFS;
- }
- consume_skb(skb);
- skb = skb2;
- net_info_ratelimited("dn_long_output: Increasing headroom\n");
- }
-
- data = skb_push(skb, sizeof(struct dn_long_packet) + 3);
- lp = (struct dn_long_packet *)(data+3);
-
- *((__le16 *)data) = cpu_to_le16(skb->len - 2);
- *(data + 2) = 1 | DN_RT_F_PF; /* Padding */
-
- lp->msgflg = DN_RT_PKT_LONG|(cb->rt_flags&(DN_RT_F_IE|DN_RT_F_RQR|DN_RT_F_RTS));
- lp->d_area = lp->d_subarea = 0;
- dn_dn2eth(lp->d_id, cb->dst);
- lp->s_area = lp->s_subarea = 0;
- dn_dn2eth(lp->s_id, cb->src);
- lp->nl2 = 0;
- lp->visit_ct = cb->hops & 0x3f;
- lp->s_class = 0;
- lp->pt = 0;
-
- skb_reset_network_header(skb);
-
- return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING,
- &init_net, sk, skb, NULL, neigh->dev,
- dn_neigh_output_packet);
-}
-
-/*
- * For talking to pointopoint and multidrop devices: DDCMP and X.25
- */
-static int dn_short_output(struct neighbour *neigh, struct sock *sk,
- struct sk_buff *skb)
-{
- struct net_device *dev = neigh->dev;
- int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2;
- struct dn_short_packet *sp;
- unsigned char *data;
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
-
-
- if (skb_headroom(skb) < headroom) {
- struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom);
- if (skb2 == NULL) {
- net_crit_ratelimited("dn_short_output: no memory\n");
- kfree_skb(skb);
- return -ENOBUFS;
- }
- consume_skb(skb);
- skb = skb2;
- net_info_ratelimited("dn_short_output: Increasing headroom\n");
- }
-
- data = skb_push(skb, sizeof(struct dn_short_packet) + 2);
- *((__le16 *)data) = cpu_to_le16(skb->len - 2);
- sp = (struct dn_short_packet *)(data+2);
-
- sp->msgflg = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS));
- sp->dstnode = cb->dst;
- sp->srcnode = cb->src;
- sp->forward = cb->hops & 0x3f;
-
- skb_reset_network_header(skb);
-
- return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING,
- &init_net, sk, skb, NULL, neigh->dev,
- dn_neigh_output_packet);
-}
-
-/*
- * For talking to DECnet phase III nodes
- * Phase 3 output is the same as short output, execpt that
- * it clears the area bits before transmission.
- */
-static int dn_phase3_output(struct neighbour *neigh, struct sock *sk,
- struct sk_buff *skb)
-{
- struct net_device *dev = neigh->dev;
- int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2;
- struct dn_short_packet *sp;
- unsigned char *data;
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
-
- if (skb_headroom(skb) < headroom) {
- struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom);
- if (skb2 == NULL) {
- net_crit_ratelimited("dn_phase3_output: no memory\n");
- kfree_skb(skb);
- return -ENOBUFS;
- }
- consume_skb(skb);
- skb = skb2;
- net_info_ratelimited("dn_phase3_output: Increasing headroom\n");
- }
-
- data = skb_push(skb, sizeof(struct dn_short_packet) + 2);
- *((__le16 *)data) = cpu_to_le16(skb->len - 2);
- sp = (struct dn_short_packet *)(data + 2);
-
- sp->msgflg = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS));
- sp->dstnode = cb->dst & cpu_to_le16(0x03ff);
- sp->srcnode = cb->src & cpu_to_le16(0x03ff);
- sp->forward = cb->hops & 0x3f;
-
- skb_reset_network_header(skb);
-
- return NF_HOOK(NFPROTO_DECNET, NF_DN_POST_ROUTING,
- &init_net, sk, skb, NULL, neigh->dev,
- dn_neigh_output_packet);
-}
-
-int dn_to_neigh_output(struct net *net, struct sock *sk, struct sk_buff *skb)
-{
- struct dst_entry *dst = skb_dst(skb);
- struct dn_route *rt = (struct dn_route *) dst;
- struct neighbour *neigh = rt->n;
- struct dn_neigh *dn = container_of(neigh, struct dn_neigh, n);
- struct dn_dev *dn_db;
- bool use_long;
-
- rcu_read_lock();
- dn_db = rcu_dereference(neigh->dev->dn_ptr);
- if (dn_db == NULL) {
- rcu_read_unlock();
- return -EINVAL;
- }
- use_long = dn_db->use_long;
- rcu_read_unlock();
-
- if (dn->flags & DN_NDFLAG_P3)
- return dn_phase3_output(neigh, sk, skb);
- if (use_long)
- return dn_long_output(neigh, sk, skb);
- else
- return dn_short_output(neigh, sk, skb);
-}
-
-/*
- * Unfortunately, the neighbour code uses the device in its hash
- * function, so we don't get any advantage from it. This function
- * basically does a neigh_lookup(), but without comparing the device
- * field. This is required for the On-Ethernet cache
- */
-
-/*
- * Pointopoint link receives a hello message
- */
-void dn_neigh_pointopoint_hello(struct sk_buff *skb)
-{
- kfree_skb(skb);
-}
-
-/*
- * Ethernet router hello message received
- */
-int dn_neigh_router_hello(struct net *net, struct sock *sk, struct sk_buff *skb)
-{
- struct rtnode_hello_message *msg = (struct rtnode_hello_message *)skb->data;
-
- struct neighbour *neigh;
- struct dn_neigh *dn;
- struct dn_dev *dn_db;
- __le16 src;
-
- src = dn_eth2dn(msg->id);
-
- neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1);
-
- dn = container_of(neigh, struct dn_neigh, n);
-
- if (neigh) {
- write_lock(&neigh->lock);
-
- neigh->used = jiffies;
- dn_db = rcu_dereference(neigh->dev->dn_ptr);
-
- if (!(neigh->nud_state & NUD_PERMANENT)) {
- neigh->updated = jiffies;
-
- if (neigh->dev->type == ARPHRD_ETHER)
- memcpy(neigh->ha, &eth_hdr(skb)->h_source, ETH_ALEN);
-
- dn->blksize = le16_to_cpu(msg->blksize);
- dn->priority = msg->priority;
-
- dn->flags &= ~DN_NDFLAG_P3;
-
- switch (msg->iinfo & DN_RT_INFO_TYPE) {
- case DN_RT_INFO_L1RT:
- dn->flags &=~DN_NDFLAG_R2;
- dn->flags |= DN_NDFLAG_R1;
- break;
- case DN_RT_INFO_L2RT:
- dn->flags |= DN_NDFLAG_R2;
- }
- }
-
- /* Only use routers in our area */
- if ((le16_to_cpu(src)>>10) == (le16_to_cpu((decnet_address))>>10)) {
- if (!dn_db->router) {
- dn_db->router = neigh_clone(neigh);
- } else {
- if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority)
- neigh_release(xchg(&dn_db->router, neigh_clone(neigh)));
- }
- }
- write_unlock(&neigh->lock);
- neigh_release(neigh);
- }
-
- kfree_skb(skb);
- return 0;
-}
-
-/*
- * Endnode hello message received
- */
-int dn_neigh_endnode_hello(struct net *net, struct sock *sk, struct sk_buff *skb)
-{
- struct endnode_hello_message *msg = (struct endnode_hello_message *)skb->data;
- struct neighbour *neigh;
- struct dn_neigh *dn;
- __le16 src;
-
- src = dn_eth2dn(msg->id);
-
- neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1);
-
- dn = container_of(neigh, struct dn_neigh, n);
-
- if (neigh) {
- write_lock(&neigh->lock);
-
- neigh->used = jiffies;
-
- if (!(neigh->nud_state & NUD_PERMANENT)) {
- neigh->updated = jiffies;
-
- if (neigh->dev->type == ARPHRD_ETHER)
- memcpy(neigh->ha, &eth_hdr(skb)->h_source, ETH_ALEN);
- dn->flags &= ~(DN_NDFLAG_R1 | DN_NDFLAG_R2);
- dn->blksize = le16_to_cpu(msg->blksize);
- dn->priority = 0;
- }
-
- write_unlock(&neigh->lock);
- neigh_release(neigh);
- }
-
- kfree_skb(skb);
- return 0;
-}
-
-static char *dn_find_slot(char *base, int max, int priority)
-{
- int i;
- unsigned char *min = NULL;
-
- base += 6; /* skip first id */
-
- for(i = 0; i < max; i++) {
- if (!min || (*base < *min))
- min = base;
- base += 7; /* find next priority */
- }
-
- if (!min)
- return NULL;
-
- return (*min < priority) ? (min - 6) : NULL;
-}
-
-struct elist_cb_state {
- struct net_device *dev;
- unsigned char *ptr;
- unsigned char *rs;
- int t, n;
-};
-
-static void neigh_elist_cb(struct neighbour *neigh, void *_info)
-{
- struct elist_cb_state *s = _info;
- struct dn_neigh *dn;
-
- if (neigh->dev != s->dev)
- return;
-
- dn = container_of(neigh, struct dn_neigh, n);
- if (!(dn->flags & (DN_NDFLAG_R1|DN_NDFLAG_R2)))
- return;
-
- if (s->t == s->n)
- s->rs = dn_find_slot(s->ptr, s->n, dn->priority);
- else
- s->t++;
- if (s->rs == NULL)
- return;
-
- dn_dn2eth(s->rs, dn->addr);
- s->rs += 6;
- *(s->rs) = neigh->nud_state & NUD_CONNECTED ? 0x80 : 0x0;
- *(s->rs) |= dn->priority;
- s->rs++;
-}
-
-int dn_neigh_elist(struct net_device *dev, unsigned char *ptr, int n)
-{
- struct elist_cb_state state;
-
- state.dev = dev;
- state.t = 0;
- state.n = n;
- state.ptr = ptr;
- state.rs = ptr;
-
- neigh_for_each(&dn_neigh_table, neigh_elist_cb, &state);
-
- return state.t;
-}
-
-
-#ifdef CONFIG_PROC_FS
-
-static inline void dn_neigh_format_entry(struct seq_file *seq,
- struct neighbour *n)
-{
- struct dn_neigh *dn = container_of(n, struct dn_neigh, n);
- char buf[DN_ASCBUF_LEN];
-
- read_lock(&n->lock);
- seq_printf(seq, "%-7s %s%s%s %02x %02d %07ld %-8s\n",
- dn_addr2asc(le16_to_cpu(dn->addr), buf),
- (dn->flags&DN_NDFLAG_R1) ? "1" : "-",
- (dn->flags&DN_NDFLAG_R2) ? "2" : "-",
- (dn->flags&DN_NDFLAG_P3) ? "3" : "-",
- dn->n.nud_state,
- refcount_read(&dn->n.refcnt),
- dn->blksize,
- (dn->n.dev) ? dn->n.dev->name : "?");
- read_unlock(&n->lock);
-}
-
-static int dn_neigh_seq_show(struct seq_file *seq, void *v)
-{
- if (v == SEQ_START_TOKEN) {
- seq_puts(seq, "Addr Flags State Use Blksize Dev\n");
- } else {
- dn_neigh_format_entry(seq, v);
- }
-
- return 0;
-}
-
-static void *dn_neigh_seq_start(struct seq_file *seq, loff_t *pos)
-{
- return neigh_seq_start(seq, pos, &dn_neigh_table,
- NEIGH_SEQ_NEIGH_ONLY);
-}
-
-static const struct seq_operations dn_neigh_seq_ops = {
- .start = dn_neigh_seq_start,
- .next = neigh_seq_next,
- .stop = neigh_seq_stop,
- .show = dn_neigh_seq_show,
-};
-#endif
-
-void __init dn_neigh_init(void)
-{
- neigh_table_init(NEIGH_DN_TABLE, &dn_neigh_table);
- proc_create_net("decnet_neigh", 0444, init_net.proc_net,
- &dn_neigh_seq_ops, sizeof(struct neigh_seq_state));
-}
-
-void __exit dn_neigh_cleanup(void)
-{
- remove_proc_entry("decnet_neigh", init_net.proc_net);
- neigh_table_clear(NEIGH_DN_TABLE, &dn_neigh_table);
-}
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
deleted file mode 100644
index 2fb5e055ba25..000000000000
--- a/net/decnet/dn_nsp_in.c
+++ /dev/null
@@ -1,914 +0,0 @@
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Network Services Protocol (Input)
- *
- * Author: Eduardo Marcelo Serrat <emserrat@geocities.com>
- *
- * Changes:
- *
- * Steve Whitehouse: Split into dn_nsp_in.c and dn_nsp_out.c from
- * original dn_nsp.c.
- * Steve Whitehouse: Updated to work with my new routing architecture.
- * Steve Whitehouse: Add changes from Eduardo Serrat's patches.
- * Steve Whitehouse: Put all ack handling code in a common routine.
- * Steve Whitehouse: Put other common bits into dn_nsp_rx()
- * Steve Whitehouse: More checks on skb->len to catch bogus packets
- * Fixed various race conditions and possible nasties.
- * Steve Whitehouse: Now handles returned conninit frames.
- * David S. Miller: New socket locking
- * Steve Whitehouse: Fixed lockup when socket filtering was enabled.
- * Paul Koning: Fix to push CC sockets into RUN when acks are
- * received.
- * Steve Whitehouse:
- * Patrick Caulfield: Checking conninits for correctness & sending of error
- * responses.
- * Steve Whitehouse: Added backlog congestion level return codes.
- * Patrick Caulfield:
- * Steve Whitehouse: Added flow control support (outbound)
- * Steve Whitehouse: Prepare for nonlinear skbs
- */
-
-/******************************************************************************
- (c) 1995-1998 E.M. Serrat emserrat@geocities.com
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-*******************************************************************************/
-
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <linux/string.h>
-#include <linux/sockios.h>
-#include <linux/net.h>
-#include <linux/netdevice.h>
-#include <linux/inet.h>
-#include <linux/route.h>
-#include <linux/slab.h>
-#include <net/sock.h>
-#include <net/tcp_states.h>
-#include <linux/fcntl.h>
-#include <linux/mm.h>
-#include <linux/termios.h>
-#include <linux/interrupt.h>
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
-#include <linux/init.h>
-#include <linux/poll.h>
-#include <linux/netfilter_decnet.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/dn.h>
-#include <net/dn_nsp.h>
-#include <net/dn_dev.h>
-#include <net/dn_route.h>
-
-extern int decnet_log_martians;
-
-static void dn_log_martian(struct sk_buff *skb, const char *msg)
-{
- if (decnet_log_martians) {
- char *devname = skb->dev ? skb->dev->name : "???";
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- net_info_ratelimited("DECnet: Martian packet (%s) dev=%s src=0x%04hx dst=0x%04hx srcport=0x%04hx dstport=0x%04hx\n",
- msg, devname,
- le16_to_cpu(cb->src),
- le16_to_cpu(cb->dst),
- le16_to_cpu(cb->src_port),
- le16_to_cpu(cb->dst_port));
- }
-}
-
-/*
- * For this function we've flipped the cross-subchannel bit
- * if the message is an otherdata or linkservice message. Thus
- * we can use it to work out what to update.
- */
-static void dn_ack(struct sock *sk, struct sk_buff *skb, unsigned short ack)
-{
- struct dn_scp *scp = DN_SK(sk);
- unsigned short type = ((ack >> 12) & 0x0003);
- int wakeup = 0;
-
- switch (type) {
- case 0: /* ACK - Data */
- if (dn_after(ack, scp->ackrcv_dat)) {
- scp->ackrcv_dat = ack & 0x0fff;
- wakeup |= dn_nsp_check_xmit_queue(sk, skb,
- &scp->data_xmit_queue,
- ack);
- }
- break;
- case 1: /* NAK - Data */
- break;
- case 2: /* ACK - OtherData */
- if (dn_after(ack, scp->ackrcv_oth)) {
- scp->ackrcv_oth = ack & 0x0fff;
- wakeup |= dn_nsp_check_xmit_queue(sk, skb,
- &scp->other_xmit_queue,
- ack);
- }
- break;
- case 3: /* NAK - OtherData */
- break;
- }
-
- if (wakeup && !sock_flag(sk, SOCK_DEAD))
- sk->sk_state_change(sk);
-}
-
-/*
- * This function is a universal ack processor.
- */
-static int dn_process_ack(struct sock *sk, struct sk_buff *skb, int oth)
-{
- __le16 *ptr = (__le16 *)skb->data;
- int len = 0;
- unsigned short ack;
-
- if (skb->len < 2)
- return len;
-
- if ((ack = le16_to_cpu(*ptr)) & 0x8000) {
- skb_pull(skb, 2);
- ptr++;
- len += 2;
- if ((ack & 0x4000) == 0) {
- if (oth)
- ack ^= 0x2000;
- dn_ack(sk, skb, ack);
- }
- }
-
- if (skb->len < 2)
- return len;
-
- if ((ack = le16_to_cpu(*ptr)) & 0x8000) {
- skb_pull(skb, 2);
- len += 2;
- if ((ack & 0x4000) == 0) {
- if (oth)
- ack ^= 0x2000;
- dn_ack(sk, skb, ack);
- }
- }
-
- return len;
-}
-
-
-/**
- * dn_check_idf - Check an image data field format is correct.
- * @pptr: Pointer to pointer to image data
- * @len: Pointer to length of image data
- * @max: The maximum allowed length of the data in the image data field
- * @follow_on: Check that this many bytes exist beyond the end of the image data
- *
- * Returns: 0 if ok, -1 on error
- */
-static inline int dn_check_idf(unsigned char **pptr, int *len, unsigned char max, unsigned char follow_on)
-{
- unsigned char *ptr = *pptr;
- unsigned char flen = *ptr++;
-
- (*len)--;
- if (flen > max)
- return -1;
- if ((flen + follow_on) > *len)
- return -1;
-
- *len -= flen;
- *pptr = ptr + flen;
- return 0;
-}
-
-/*
- * Table of reason codes to pass back to node which sent us a badly
- * formed message, plus text messages for the log. A zero entry in
- * the reason field means "don't reply" otherwise a disc init is sent with
- * the specified reason code.
- */
-static struct {
- unsigned short reason;
- const char *text;
-} ci_err_table[] = {
- { 0, "CI: Truncated message" },
- { NSP_REASON_ID, "CI: Destination username error" },
- { NSP_REASON_ID, "CI: Destination username type" },
- { NSP_REASON_US, "CI: Source username error" },
- { 0, "CI: Truncated at menuver" },
- { 0, "CI: Truncated before access or user data" },
- { NSP_REASON_IO, "CI: Access data format error" },
- { NSP_REASON_IO, "CI: User data format error" }
-};
-
-/*
- * This function uses a slightly different lookup method
- * to find its sockets, since it searches on object name/number
- * rather than port numbers. Various tests are done to ensure that
- * the incoming data is in the correct format before it is queued to
- * a socket.
- */
-static struct sock *dn_find_listener(struct sk_buff *skb, unsigned short *reason)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct nsp_conn_init_msg *msg = (struct nsp_conn_init_msg *)skb->data;
- struct sockaddr_dn dstaddr;
- struct sockaddr_dn srcaddr;
- unsigned char type = 0;
- int dstlen;
- int srclen;
- unsigned char *ptr;
- int len;
- int err = 0;
- unsigned char menuver;
-
- memset(&dstaddr, 0, sizeof(struct sockaddr_dn));
- memset(&srcaddr, 0, sizeof(struct sockaddr_dn));
-
- /*
- * 1. Decode & remove message header
- */
- cb->src_port = msg->srcaddr;
- cb->dst_port = msg->dstaddr;
- cb->services = msg->services;
- cb->info = msg->info;
- cb->segsize = le16_to_cpu(msg->segsize);
-
- if (!pskb_may_pull(skb, sizeof(*msg)))
- goto err_out;
-
- skb_pull(skb, sizeof(*msg));
-
- len = skb->len;
- ptr = skb->data;
-
- /*
- * 2. Check destination end username format
- */
- dstlen = dn_username2sockaddr(ptr, len, &dstaddr, &type);
- err++;
- if (dstlen < 0)
- goto err_out;
-
- err++;
- if (type > 1)
- goto err_out;
-
- len -= dstlen;
- ptr += dstlen;
-
- /*
- * 3. Check source end username format
- */
- srclen = dn_username2sockaddr(ptr, len, &srcaddr, &type);
- err++;
- if (srclen < 0)
- goto err_out;
-
- len -= srclen;
- ptr += srclen;
- err++;
- if (len < 1)
- goto err_out;
-
- menuver = *ptr;
- ptr++;
- len--;
-
- /*
- * 4. Check that optional data actually exists if menuver says it does
- */
- err++;
- if ((menuver & (DN_MENUVER_ACC | DN_MENUVER_USR)) && (len < 1))
- goto err_out;
-
- /*
- * 5. Check optional access data format
- */
- err++;
- if (menuver & DN_MENUVER_ACC) {
- if (dn_check_idf(&ptr, &len, 39, 1))
- goto err_out;
- if (dn_check_idf(&ptr, &len, 39, 1))
- goto err_out;
- if (dn_check_idf(&ptr, &len, 39, (menuver & DN_MENUVER_USR) ? 1 : 0))
- goto err_out;
- }
-
- /*
- * 6. Check optional user data format
- */
- err++;
- if (menuver & DN_MENUVER_USR) {
- if (dn_check_idf(&ptr, &len, 16, 0))
- goto err_out;
- }
-
- /*
- * 7. Look up socket based on destination end username
- */
- return dn_sklist_find_listener(&dstaddr);
-err_out:
- dn_log_martian(skb, ci_err_table[err].text);
- *reason = ci_err_table[err].reason;
- return NULL;
-}
-
-
-static void dn_nsp_conn_init(struct sock *sk, struct sk_buff *skb)
-{
- if (sk_acceptq_is_full(sk)) {
- kfree_skb(skb);
- return;
- }
-
- sk->sk_ack_backlog++;
- skb_queue_tail(&sk->sk_receive_queue, skb);
- sk->sk_state_change(sk);
-}
-
-static void dn_nsp_conn_conf(struct sock *sk, struct sk_buff *skb)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct dn_scp *scp = DN_SK(sk);
- unsigned char *ptr;
-
- if (skb->len < 4)
- goto out;
-
- ptr = skb->data;
- cb->services = *ptr++;
- cb->info = *ptr++;
- cb->segsize = le16_to_cpu(*(__le16 *)ptr);
-
- if ((scp->state == DN_CI) || (scp->state == DN_CD)) {
- scp->persist = 0;
- scp->addrrem = cb->src_port;
- sk->sk_state = TCP_ESTABLISHED;
- scp->state = DN_RUN;
- scp->services_rem = cb->services;
- scp->info_rem = cb->info;
- scp->segsize_rem = cb->segsize;
-
- if ((scp->services_rem & NSP_FC_MASK) == NSP_FC_NONE)
- scp->max_window = decnet_no_fc_max_cwnd;
-
- if (skb->len > 0) {
- u16 dlen = *skb->data;
- if ((dlen <= 16) && (dlen <= skb->len)) {
- scp->conndata_in.opt_optl = cpu_to_le16(dlen);
- skb_copy_from_linear_data_offset(skb, 1,
- scp->conndata_in.opt_data, dlen);
- }
- }
- dn_nsp_send_link(sk, DN_NOCHANGE, 0);
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_state_change(sk);
- }
-
-out:
- kfree_skb(skb);
-}
-
-static void dn_nsp_conn_ack(struct sock *sk, struct sk_buff *skb)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- if (scp->state == DN_CI) {
- scp->state = DN_CD;
- scp->persist = 0;
- }
-
- kfree_skb(skb);
-}
-
-static void dn_nsp_disc_init(struct sock *sk, struct sk_buff *skb)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- unsigned short reason;
-
- if (skb->len < 2)
- goto out;
-
- reason = le16_to_cpu(*(__le16 *)skb->data);
- skb_pull(skb, 2);
-
- scp->discdata_in.opt_status = cpu_to_le16(reason);
- scp->discdata_in.opt_optl = 0;
- memset(scp->discdata_in.opt_data, 0, 16);
-
- if (skb->len > 0) {
- u16 dlen = *skb->data;
- if ((dlen <= 16) && (dlen <= skb->len)) {
- scp->discdata_in.opt_optl = cpu_to_le16(dlen);
- skb_copy_from_linear_data_offset(skb, 1, scp->discdata_in.opt_data, dlen);
- }
- }
-
- scp->addrrem = cb->src_port;
- sk->sk_state = TCP_CLOSE;
-
- switch (scp->state) {
- case DN_CI:
- case DN_CD:
- scp->state = DN_RJ;
- sk->sk_err = ECONNREFUSED;
- break;
- case DN_RUN:
- sk->sk_shutdown |= SHUTDOWN_MASK;
- scp->state = DN_DN;
- break;
- case DN_DI:
- scp->state = DN_DIC;
- break;
- }
-
- if (!sock_flag(sk, SOCK_DEAD)) {
- if (sk->sk_socket->state != SS_UNCONNECTED)
- sk->sk_socket->state = SS_DISCONNECTING;
- sk->sk_state_change(sk);
- }
-
- /*
- * It appears that its possible for remote machines to send disc
- * init messages with no port identifier if we are in the CI and
- * possibly also the CD state. Obviously we shouldn't reply with
- * a message if we don't know what the end point is.
- */
- if (scp->addrrem) {
- dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, GFP_ATOMIC);
- }
- scp->persist_fxn = dn_destroy_timer;
- scp->persist = dn_nsp_persist(sk);
-
-out:
- kfree_skb(skb);
-}
-
-/*
- * disc_conf messages are also called no_resources or no_link
- * messages depending upon the "reason" field.
- */
-static void dn_nsp_disc_conf(struct sock *sk, struct sk_buff *skb)
-{
- struct dn_scp *scp = DN_SK(sk);
- unsigned short reason;
-
- if (skb->len != 2)
- goto out;
-
- reason = le16_to_cpu(*(__le16 *)skb->data);
-
- sk->sk_state = TCP_CLOSE;
-
- switch (scp->state) {
- case DN_CI:
- scp->state = DN_NR;
- break;
- case DN_DR:
- if (reason == NSP_REASON_DC)
- scp->state = DN_DRC;
- if (reason == NSP_REASON_NL)
- scp->state = DN_CN;
- break;
- case DN_DI:
- scp->state = DN_DIC;
- break;
- case DN_RUN:
- sk->sk_shutdown |= SHUTDOWN_MASK;
- /* fall through */
- case DN_CC:
- scp->state = DN_CN;
- }
-
- if (!sock_flag(sk, SOCK_DEAD)) {
- if (sk->sk_socket->state != SS_UNCONNECTED)
- sk->sk_socket->state = SS_DISCONNECTING;
- sk->sk_state_change(sk);
- }
-
- scp->persist_fxn = dn_destroy_timer;
- scp->persist = dn_nsp_persist(sk);
-
-out:
- kfree_skb(skb);
-}
-
-static void dn_nsp_linkservice(struct sock *sk, struct sk_buff *skb)
-{
- struct dn_scp *scp = DN_SK(sk);
- unsigned short segnum;
- unsigned char lsflags;
- signed char fcval;
- int wake_up = 0;
- char *ptr = skb->data;
- unsigned char fctype = scp->services_rem & NSP_FC_MASK;
-
- if (skb->len != 4)
- goto out;
-
- segnum = le16_to_cpu(*(__le16 *)ptr);
- ptr += 2;
- lsflags = *(unsigned char *)ptr++;
- fcval = *ptr;
-
- /*
- * Here we ignore erronous packets which should really
- * should cause a connection abort. It is not critical
- * for now though.
- */
- if (lsflags & 0xf8)
- goto out;
-
- if (seq_next(scp->numoth_rcv, segnum)) {
- seq_add(&scp->numoth_rcv, 1);
- switch(lsflags & 0x04) { /* FCVAL INT */
- case 0x00: /* Normal Request */
- switch(lsflags & 0x03) { /* FCVAL MOD */
- case 0x00: /* Request count */
- if (fcval < 0) {
- unsigned char p_fcval = -fcval;
- if ((scp->flowrem_dat > p_fcval) &&
- (fctype == NSP_FC_SCMC)) {
- scp->flowrem_dat -= p_fcval;
- }
- } else if (fcval > 0) {
- scp->flowrem_dat += fcval;
- wake_up = 1;
- }
- break;
- case 0x01: /* Stop outgoing data */
- scp->flowrem_sw = DN_DONTSEND;
- break;
- case 0x02: /* Ok to start again */
- scp->flowrem_sw = DN_SEND;
- dn_nsp_output(sk);
- wake_up = 1;
- }
- break;
- case 0x04: /* Interrupt Request */
- if (fcval > 0) {
- scp->flowrem_oth += fcval;
- wake_up = 1;
- }
- break;
- }
- if (wake_up && !sock_flag(sk, SOCK_DEAD))
- sk->sk_state_change(sk);
- }
-
- dn_nsp_send_oth_ack(sk);
-
-out:
- kfree_skb(skb);
-}
-
-/*
- * Copy of sock_queue_rcv_skb (from sock.h) without
- * bh_lock_sock() (its already held when this is called) which
- * also allows data and other data to be queued to a socket.
- */
-static __inline__ int dn_queue_skb(struct sock *sk, struct sk_buff *skb, int sig, struct sk_buff_head *queue)
-{
- int err;
-
- /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
- number of warnings when compiling with -W --ANK
- */
- if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
- (unsigned int)sk->sk_rcvbuf) {
- err = -ENOMEM;
- goto out;
- }
-
- err = sk_filter(sk, skb);
- if (err)
- goto out;
-
- skb_set_owner_r(skb, sk);
- skb_queue_tail(queue, skb);
-
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk);
-out:
- return err;
-}
-
-static void dn_nsp_otherdata(struct sock *sk, struct sk_buff *skb)
-{
- struct dn_scp *scp = DN_SK(sk);
- unsigned short segnum;
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- int queued = 0;
-
- if (skb->len < 2)
- goto out;
-
- cb->segnum = segnum = le16_to_cpu(*(__le16 *)skb->data);
- skb_pull(skb, 2);
-
- if (seq_next(scp->numoth_rcv, segnum)) {
-
- if (dn_queue_skb(sk, skb, SIGURG, &scp->other_receive_queue) == 0) {
- seq_add(&scp->numoth_rcv, 1);
- scp->other_report = 0;
- queued = 1;
- }
- }
-
- dn_nsp_send_oth_ack(sk);
-out:
- if (!queued)
- kfree_skb(skb);
-}
-
-static void dn_nsp_data(struct sock *sk, struct sk_buff *skb)
-{
- int queued = 0;
- unsigned short segnum;
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct dn_scp *scp = DN_SK(sk);
-
- if (skb->len < 2)
- goto out;
-
- cb->segnum = segnum = le16_to_cpu(*(__le16 *)skb->data);
- skb_pull(skb, 2);
-
- if (seq_next(scp->numdat_rcv, segnum)) {
- if (dn_queue_skb(sk, skb, SIGIO, &sk->sk_receive_queue) == 0) {
- seq_add(&scp->numdat_rcv, 1);
- queued = 1;
- }
-
- if ((scp->flowloc_sw == DN_SEND) && dn_congested(sk)) {
- scp->flowloc_sw = DN_DONTSEND;
- dn_nsp_send_link(sk, DN_DONTSEND, 0);
- }
- }
-
- dn_nsp_send_data_ack(sk);
-out:
- if (!queued)
- kfree_skb(skb);
-}
-
-/*
- * If one of our conninit messages is returned, this function
- * deals with it. It puts the socket into the NO_COMMUNICATION
- * state.
- */
-static void dn_returned_conn_init(struct sock *sk, struct sk_buff *skb)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- if (scp->state == DN_CI) {
- scp->state = DN_NC;
- sk->sk_state = TCP_CLOSE;
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_state_change(sk);
- }
-
- kfree_skb(skb);
-}
-
-static int dn_nsp_no_socket(struct sk_buff *skb, unsigned short reason)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- int ret = NET_RX_DROP;
-
- /* Must not reply to returned packets */
- if (cb->rt_flags & DN_RT_F_RTS)
- goto out;
-
- if ((reason != NSP_REASON_OK) && ((cb->nsp_flags & 0x0c) == 0x08)) {
- switch (cb->nsp_flags & 0x70) {
- case 0x10:
- case 0x60: /* (Retransmitted) Connect Init */
- dn_nsp_return_disc(skb, NSP_DISCINIT, reason);
- ret = NET_RX_SUCCESS;
- break;
- case 0x20: /* Connect Confirm */
- dn_nsp_return_disc(skb, NSP_DISCCONF, reason);
- ret = NET_RX_SUCCESS;
- break;
- }
- }
-
-out:
- kfree_skb(skb);
- return ret;
-}
-
-static int dn_nsp_rx_packet(struct net *net, struct sock *sk2,
- struct sk_buff *skb)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct sock *sk = NULL;
- unsigned char *ptr = (unsigned char *)skb->data;
- unsigned short reason = NSP_REASON_NL;
-
- if (!pskb_may_pull(skb, 2))
- goto free_out;
-
- skb_reset_transport_header(skb);
- cb->nsp_flags = *ptr++;
-
- if (decnet_debug_level & 2)
- printk(KERN_DEBUG "dn_nsp_rx: Message type 0x%02x\n", (int)cb->nsp_flags);
-
- if (cb->nsp_flags & 0x83)
- goto free_out;
-
- /*
- * Filter out conninits and useless packet types
- */
- if ((cb->nsp_flags & 0x0c) == 0x08) {
- switch (cb->nsp_flags & 0x70) {
- case 0x00: /* NOP */
- case 0x70: /* Reserved */
- case 0x50: /* Reserved, Phase II node init */
- goto free_out;
- case 0x10:
- case 0x60:
- if (unlikely(cb->rt_flags & DN_RT_F_RTS))
- goto free_out;
- sk = dn_find_listener(skb, &reason);
- goto got_it;
- }
- }
-
- if (!pskb_may_pull(skb, 3))
- goto free_out;
-
- /*
- * Grab the destination address.
- */
- cb->dst_port = *(__le16 *)ptr;
- cb->src_port = 0;
- ptr += 2;
-
- /*
- * If not a connack, grab the source address too.
- */
- if (pskb_may_pull(skb, 5)) {
- cb->src_port = *(__le16 *)ptr;
- ptr += 2;
- skb_pull(skb, 5);
- }
-
- /*
- * Returned packets...
- * Swap src & dst and look up in the normal way.
- */
- if (unlikely(cb->rt_flags & DN_RT_F_RTS)) {
- swap(cb->dst_port, cb->src_port);
- swap(cb->dst, cb->src);
- }
-
- /*
- * Find the socket to which this skb is destined.
- */
- sk = dn_find_by_skb(skb);
-got_it:
- if (sk != NULL) {
- struct dn_scp *scp = DN_SK(sk);
-
- /* Reset backoff */
- scp->nsp_rxtshift = 0;
-
- /*
- * We linearize everything except data segments here.
- */
- if (cb->nsp_flags & ~0x60) {
- if (unlikely(skb_linearize(skb)))
- goto free_out;
- }
-
- return sk_receive_skb(sk, skb, 0);
- }
-
- return dn_nsp_no_socket(skb, reason);
-
-free_out:
- kfree_skb(skb);
- return NET_RX_DROP;
-}
-
-int dn_nsp_rx(struct sk_buff *skb)
-{
- return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_IN,
- &init_net, NULL, skb, skb->dev, NULL,
- dn_nsp_rx_packet);
-}
-
-/*
- * This is the main receive routine for sockets. It is called
- * from the above when the socket is not busy, and also from
- * sock_release() when there is a backlog queued up.
- */
-int dn_nsp_backlog_rcv(struct sock *sk, struct sk_buff *skb)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
-
- if (cb->rt_flags & DN_RT_F_RTS) {
- if (cb->nsp_flags == 0x18 || cb->nsp_flags == 0x68)
- dn_returned_conn_init(sk, skb);
- else
- kfree_skb(skb);
- return NET_RX_SUCCESS;
- }
-
- /*
- * Control packet.
- */
- if ((cb->nsp_flags & 0x0c) == 0x08) {
- switch (cb->nsp_flags & 0x70) {
- case 0x10:
- case 0x60:
- dn_nsp_conn_init(sk, skb);
- break;
- case 0x20:
- dn_nsp_conn_conf(sk, skb);
- break;
- case 0x30:
- dn_nsp_disc_init(sk, skb);
- break;
- case 0x40:
- dn_nsp_disc_conf(sk, skb);
- break;
- }
-
- } else if (cb->nsp_flags == 0x24) {
- /*
- * Special for connacks, 'cos they don't have
- * ack data or ack otherdata info.
- */
- dn_nsp_conn_ack(sk, skb);
- } else {
- int other = 1;
-
- /* both data and ack frames can kick a CC socket into RUN */
- if ((scp->state == DN_CC) && !sock_flag(sk, SOCK_DEAD)) {
- scp->state = DN_RUN;
- sk->sk_state = TCP_ESTABLISHED;
- sk->sk_state_change(sk);
- }
-
- if ((cb->nsp_flags & 0x1c) == 0)
- other = 0;
- if (cb->nsp_flags == 0x04)
- other = 0;
-
- /*
- * Read out ack data here, this applies equally
- * to data, other data, link serivce and both
- * ack data and ack otherdata.
- */
- dn_process_ack(sk, skb, other);
-
- /*
- * If we've some sort of data here then call a
- * suitable routine for dealing with it, otherwise
- * the packet is an ack and can be discarded.
- */
- if ((cb->nsp_flags & 0x0c) == 0) {
-
- if (scp->state != DN_RUN)
- goto free_out;
-
- switch (cb->nsp_flags) {
- case 0x10: /* LS */
- dn_nsp_linkservice(sk, skb);
- break;
- case 0x30: /* OD */
- dn_nsp_otherdata(sk, skb);
- break;
- default:
- dn_nsp_data(sk, skb);
- }
-
- } else { /* Ack, chuck it out here */
-free_out:
- kfree_skb(skb);
- }
- }
-
- return NET_RX_SUCCESS;
-}
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
deleted file mode 100644
index a1779de6bd9c..000000000000
--- a/net/decnet/dn_nsp_out.c
+++ /dev/null
@@ -1,703 +0,0 @@
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Network Services Protocol (Output)
- *
- * Author: Eduardo Marcelo Serrat <emserrat@geocities.com>
- *
- * Changes:
- *
- * Steve Whitehouse: Split into dn_nsp_in.c and dn_nsp_out.c from
- * original dn_nsp.c.
- * Steve Whitehouse: Updated to work with my new routing architecture.
- * Steve Whitehouse: Added changes from Eduardo Serrat's patches.
- * Steve Whitehouse: Now conninits have the "return" bit set.
- * Steve Whitehouse: Fixes to check alloc'd skbs are non NULL!
- * Moved output state machine into one function
- * Steve Whitehouse: New output state machine
- * Paul Koning: Connect Confirm message fix.
- * Eduardo Serrat: Fix to stop dn_nsp_do_disc() sending malformed packets.
- * Steve Whitehouse: dn_nsp_output() and friends needed a spring clean
- * Steve Whitehouse: Moved dn_nsp_send() in here from route.h
- */
-
-/******************************************************************************
- (c) 1995-1998 E.M. Serrat emserrat@geocities.com
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-*******************************************************************************/
-
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/kernel.h>
-#include <linux/timer.h>
-#include <linux/string.h>
-#include <linux/sockios.h>
-#include <linux/net.h>
-#include <linux/netdevice.h>
-#include <linux/inet.h>
-#include <linux/route.h>
-#include <linux/slab.h>
-#include <net/sock.h>
-#include <linux/fcntl.h>
-#include <linux/mm.h>
-#include <linux/termios.h>
-#include <linux/interrupt.h>
-#include <linux/proc_fs.h>
-#include <linux/stat.h>
-#include <linux/init.h>
-#include <linux/poll.h>
-#include <linux/if_packet.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/flow.h>
-#include <net/dn.h>
-#include <net/dn_nsp.h>
-#include <net/dn_dev.h>
-#include <net/dn_route.h>
-
-
-static int nsp_backoff[NSP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
-
-static void dn_nsp_send(struct sk_buff *skb)
-{
- struct sock *sk = skb->sk;
- struct dn_scp *scp = DN_SK(sk);
- struct dst_entry *dst;
- struct flowidn fld;
-
- skb_reset_transport_header(skb);
- scp->stamp = jiffies;
-
- dst = sk_dst_check(sk, 0);
- if (dst) {
-try_again:
- skb_dst_set(skb, dst);
- dst_output(&init_net, skb->sk, skb);
- return;
- }
-
- memset(&fld, 0, sizeof(fld));
- fld.flowidn_oif = sk->sk_bound_dev_if;
- fld.saddr = dn_saddr2dn(&scp->addr);
- fld.daddr = dn_saddr2dn(&scp->peer);
- dn_sk_ports_copy(&fld, scp);
- fld.flowidn_proto = DNPROTO_NSP;
- if (dn_route_output_sock(&sk->sk_dst_cache, &fld, sk, 0) == 0) {
- dst = sk_dst_get(sk);
- sk->sk_route_caps = dst->dev->features;
- goto try_again;
- }
-
- sk->sk_err = EHOSTUNREACH;
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_state_change(sk);
-}
-
-
-/*
- * If sk == NULL, then we assume that we are supposed to be making
- * a routing layer skb. If sk != NULL, then we are supposed to be
- * creating an skb for the NSP layer.
- *
- * The eventual aim is for each socket to have a cached header size
- * for its outgoing packets, and to set hdr from this when sk != NULL.
- */
-struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri)
-{
- struct sk_buff *skb;
- int hdr = 64;
-
- if ((skb = alloc_skb(size + hdr, pri)) == NULL)
- return NULL;
-
- skb->protocol = htons(ETH_P_DNA_RT);
- skb->pkt_type = PACKET_OUTGOING;
-
- if (sk)
- skb_set_owner_w(skb, sk);
-
- skb_reserve(skb, hdr);
-
- return skb;
-}
-
-/*
- * Calculate persist timer based upon the smoothed round
- * trip time and the variance. Backoff according to the
- * nsp_backoff[] array.
- */
-unsigned long dn_nsp_persist(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- unsigned long t = ((scp->nsp_srtt >> 2) + scp->nsp_rttvar) >> 1;
-
- t *= nsp_backoff[scp->nsp_rxtshift];
-
- if (t < HZ) t = HZ;
- if (t > (600*HZ)) t = (600*HZ);
-
- if (scp->nsp_rxtshift < NSP_MAXRXTSHIFT)
- scp->nsp_rxtshift++;
-
- /* printk(KERN_DEBUG "rxtshift %lu, t=%lu\n", scp->nsp_rxtshift, t); */
-
- return t;
-}
-
-/*
- * This is called each time we get an estimate for the rtt
- * on the link.
- */
-static void dn_nsp_rtt(struct sock *sk, long rtt)
-{
- struct dn_scp *scp = DN_SK(sk);
- long srtt = (long)scp->nsp_srtt;
- long rttvar = (long)scp->nsp_rttvar;
- long delta;
-
- /*
- * If the jiffies clock flips over in the middle of timestamp
- * gathering this value might turn out negative, so we make sure
- * that is it always positive here.
- */
- if (rtt < 0)
- rtt = -rtt;
- /*
- * Add new rtt to smoothed average
- */
- delta = ((rtt << 3) - srtt);
- srtt += (delta >> 3);
- if (srtt >= 1)
- scp->nsp_srtt = (unsigned long)srtt;
- else
- scp->nsp_srtt = 1;
-
- /*
- * Add new rtt varience to smoothed varience
- */
- delta >>= 1;
- rttvar += ((((delta>0)?(delta):(-delta)) - rttvar) >> 2);
- if (rttvar >= 1)
- scp->nsp_rttvar = (unsigned long)rttvar;
- else
- scp->nsp_rttvar = 1;
-
- /* printk(KERN_DEBUG "srtt=%lu rttvar=%lu\n", scp->nsp_srtt, scp->nsp_rttvar); */
-}
-
-/**
- * dn_nsp_clone_and_send - Send a data packet by cloning it
- * @skb: The packet to clone and transmit
- * @gfp: memory allocation flag
- *
- * Clone a queued data or other data packet and transmit it.
- *
- * Returns: The number of times the packet has been sent previously
- */
-static inline unsigned int dn_nsp_clone_and_send(struct sk_buff *skb,
- gfp_t gfp)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct sk_buff *skb2;
- int ret = 0;
-
- if ((skb2 = skb_clone(skb, gfp)) != NULL) {
- ret = cb->xmit_count;
- cb->xmit_count++;
- cb->stamp = jiffies;
- skb2->sk = skb->sk;
- dn_nsp_send(skb2);
- }
-
- return ret;
-}
-
-/**
- * dn_nsp_output - Try and send something from socket queues
- * @sk: The socket whose queues are to be investigated
- *
- * Try and send the packet on the end of the data and other data queues.
- * Other data gets priority over data, and if we retransmit a packet we
- * reduce the window by dividing it in two.
- *
- */
-void dn_nsp_output(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct sk_buff *skb;
- unsigned int reduce_win = 0;
-
- /*
- * First we check for otherdata/linkservice messages
- */
- if ((skb = skb_peek(&scp->other_xmit_queue)) != NULL)
- reduce_win = dn_nsp_clone_and_send(skb, GFP_ATOMIC);
-
- /*
- * If we may not send any data, we don't.
- * If we are still trying to get some other data down the
- * channel, we don't try and send any data.
- */
- if (reduce_win || (scp->flowrem_sw != DN_SEND))
- goto recalc_window;
-
- if ((skb = skb_peek(&scp->data_xmit_queue)) != NULL)
- reduce_win = dn_nsp_clone_and_send(skb, GFP_ATOMIC);
-
- /*
- * If we've sent any frame more than once, we cut the
- * send window size in half. There is always a minimum
- * window size of one available.
- */
-recalc_window:
- if (reduce_win) {
- scp->snd_window >>= 1;
- if (scp->snd_window < NSP_MIN_WINDOW)
- scp->snd_window = NSP_MIN_WINDOW;
- }
-}
-
-int dn_nsp_xmit_timeout(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- dn_nsp_output(sk);
-
- if (!skb_queue_empty(&scp->data_xmit_queue) ||
- !skb_queue_empty(&scp->other_xmit_queue))
- scp->persist = dn_nsp_persist(sk);
-
- return 0;
-}
-
-static inline __le16 *dn_mk_common_header(struct dn_scp *scp, struct sk_buff *skb, unsigned char msgflag, int len)
-{
- unsigned char *ptr = skb_push(skb, len);
-
- BUG_ON(len < 5);
-
- *ptr++ = msgflag;
- *((__le16 *)ptr) = scp->addrrem;
- ptr += 2;
- *((__le16 *)ptr) = scp->addrloc;
- ptr += 2;
- return (__le16 __force *)ptr;
-}
-
-static __le16 *dn_mk_ack_header(struct sock *sk, struct sk_buff *skb, unsigned char msgflag, int hlen, int other)
-{
- struct dn_scp *scp = DN_SK(sk);
- unsigned short acknum = scp->numdat_rcv & 0x0FFF;
- unsigned short ackcrs = scp->numoth_rcv & 0x0FFF;
- __le16 *ptr;
-
- BUG_ON(hlen < 9);
-
- scp->ackxmt_dat = acknum;
- scp->ackxmt_oth = ackcrs;
- acknum |= 0x8000;
- ackcrs |= 0x8000;
-
- /* If this is an "other data/ack" message, swap acknum and ackcrs */
- if (other)
- swap(acknum, ackcrs);
-
- /* Set "cross subchannel" bit in ackcrs */
- ackcrs |= 0x2000;
-
- ptr = dn_mk_common_header(scp, skb, msgflag, hlen);
-
- *ptr++ = cpu_to_le16(acknum);
- *ptr++ = cpu_to_le16(ackcrs);
-
- return ptr;
-}
-
-static __le16 *dn_nsp_mk_data_header(struct sock *sk, struct sk_buff *skb, int oth)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- __le16 *ptr = dn_mk_ack_header(sk, skb, cb->nsp_flags, 11, oth);
-
- if (unlikely(oth)) {
- cb->segnum = scp->numoth;
- seq_add(&scp->numoth, 1);
- } else {
- cb->segnum = scp->numdat;
- seq_add(&scp->numdat, 1);
- }
- *(ptr++) = cpu_to_le16(cb->segnum);
-
- return ptr;
-}
-
-void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb,
- gfp_t gfp, int oth)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- unsigned long t = ((scp->nsp_srtt >> 2) + scp->nsp_rttvar) >> 1;
-
- cb->xmit_count = 0;
- dn_nsp_mk_data_header(sk, skb, oth);
-
- /*
- * Slow start: If we have been idle for more than
- * one RTT, then reset window to min size.
- */
- if ((jiffies - scp->stamp) > t)
- scp->snd_window = NSP_MIN_WINDOW;
-
- if (oth)
- skb_queue_tail(&scp->other_xmit_queue, skb);
- else
- skb_queue_tail(&scp->data_xmit_queue, skb);
-
- if (scp->flowrem_sw != DN_SEND)
- return;
-
- dn_nsp_clone_and_send(skb, gfp);
-}
-
-
-int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *q, unsigned short acknum)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct dn_scp *scp = DN_SK(sk);
- struct sk_buff *skb2, *n, *ack = NULL;
- int wakeup = 0;
- int try_retrans = 0;
- unsigned long reftime = cb->stamp;
- unsigned long pkttime;
- unsigned short xmit_count;
- unsigned short segnum;
-
- skb_queue_walk_safe(q, skb2, n) {
- struct dn_skb_cb *cb2 = DN_SKB_CB(skb2);
-
- if (dn_before_or_equal(cb2->segnum, acknum))
- ack = skb2;
-
- /* printk(KERN_DEBUG "ack: %s %04x %04x\n", ack ? "ACK" : "SKIP", (int)cb2->segnum, (int)acknum); */
-
- if (ack == NULL)
- continue;
-
- /* printk(KERN_DEBUG "check_xmit_queue: %04x, %d\n", acknum, cb2->xmit_count); */
-
- /* Does _last_ packet acked have xmit_count > 1 */
- try_retrans = 0;
- /* Remember to wake up the sending process */
- wakeup = 1;
- /* Keep various statistics */
- pkttime = cb2->stamp;
- xmit_count = cb2->xmit_count;
- segnum = cb2->segnum;
- /* Remove and drop ack'ed packet */
- skb_unlink(ack, q);
- kfree_skb(ack);
- ack = NULL;
-
- /*
- * We don't expect to see acknowledgements for packets we
- * haven't sent yet.
- */
- WARN_ON(xmit_count == 0);
-
- /*
- * If the packet has only been sent once, we can use it
- * to calculate the RTT and also open the window a little
- * further.
- */
- if (xmit_count == 1) {
- if (dn_equal(segnum, acknum))
- dn_nsp_rtt(sk, (long)(pkttime - reftime));
-
- if (scp->snd_window < scp->max_window)
- scp->snd_window++;
- }
-
- /*
- * Packet has been sent more than once. If this is the last
- * packet to be acknowledged then we want to send the next
- * packet in the send queue again (assumes the remote host does
- * go-back-N error control).
- */
- if (xmit_count > 1)
- try_retrans = 1;
- }
-
- if (try_retrans)
- dn_nsp_output(sk);
-
- return wakeup;
-}
-
-void dn_nsp_send_data_ack(struct sock *sk)
-{
- struct sk_buff *skb = NULL;
-
- if ((skb = dn_alloc_skb(sk, 9, GFP_ATOMIC)) == NULL)
- return;
-
- skb_reserve(skb, 9);
- dn_mk_ack_header(sk, skb, 0x04, 9, 0);
- dn_nsp_send(skb);
-}
-
-void dn_nsp_send_oth_ack(struct sock *sk)
-{
- struct sk_buff *skb = NULL;
-
- if ((skb = dn_alloc_skb(sk, 9, GFP_ATOMIC)) == NULL)
- return;
-
- skb_reserve(skb, 9);
- dn_mk_ack_header(sk, skb, 0x14, 9, 1);
- dn_nsp_send(skb);
-}
-
-
-void dn_send_conn_ack (struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct sk_buff *skb = NULL;
- struct nsp_conn_ack_msg *msg;
-
- if ((skb = dn_alloc_skb(sk, 3, sk->sk_allocation)) == NULL)
- return;
-
- msg = skb_put(skb, 3);
- msg->msgflg = 0x24;
- msg->dstaddr = scp->addrrem;
-
- dn_nsp_send(skb);
-}
-
-static int dn_nsp_retrans_conn_conf(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- if (scp->state == DN_CC)
- dn_send_conn_conf(sk, GFP_ATOMIC);
-
- return 0;
-}
-
-void dn_send_conn_conf(struct sock *sk, gfp_t gfp)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct sk_buff *skb = NULL;
- struct nsp_conn_init_msg *msg;
- __u8 len = (__u8)le16_to_cpu(scp->conndata_out.opt_optl);
-
- if ((skb = dn_alloc_skb(sk, 50 + len, gfp)) == NULL)
- return;
-
- msg = skb_put(skb, sizeof(*msg));
- msg->msgflg = 0x28;
- msg->dstaddr = scp->addrrem;
- msg->srcaddr = scp->addrloc;
- msg->services = scp->services_loc;
- msg->info = scp->info_loc;
- msg->segsize = cpu_to_le16(scp->segsize_loc);
-
- skb_put_u8(skb, len);
-
- if (len > 0)
- skb_put_data(skb, scp->conndata_out.opt_data, len);
-
-
- dn_nsp_send(skb);
-
- scp->persist = dn_nsp_persist(sk);
- scp->persist_fxn = dn_nsp_retrans_conn_conf;
-}
-
-
-static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg,
- unsigned short reason, gfp_t gfp,
- struct dst_entry *dst,
- int ddl, unsigned char *dd, __le16 rem, __le16 loc)
-{
- struct sk_buff *skb = NULL;
- int size = 7 + ddl + ((msgflg == NSP_DISCINIT) ? 1 : 0);
- unsigned char *msg;
-
- if ((dst == NULL) || (rem == 0)) {
- net_dbg_ratelimited("DECnet: dn_nsp_do_disc: BUG! Please report this to SteveW@ACM.org rem=%u dst=%p\n",
- le16_to_cpu(rem), dst);
- return;
- }
-
- if ((skb = dn_alloc_skb(sk, size, gfp)) == NULL)
- return;
-
- msg = skb_put(skb, size);
- *msg++ = msgflg;
- *(__le16 *)msg = rem;
- msg += 2;
- *(__le16 *)msg = loc;
- msg += 2;
- *(__le16 *)msg = cpu_to_le16(reason);
- msg += 2;
- if (msgflg == NSP_DISCINIT)
- *msg++ = ddl;
-
- if (ddl) {
- memcpy(msg, dd, ddl);
- }
-
- /*
- * This doesn't go via the dn_nsp_send() function since we need
- * to be able to send disc packets out which have no socket
- * associations.
- */
- skb_dst_set(skb, dst_clone(dst));
- dst_output(&init_net, skb->sk, skb);
-}
-
-
-void dn_nsp_send_disc(struct sock *sk, unsigned char msgflg,
- unsigned short reason, gfp_t gfp)
-{
- struct dn_scp *scp = DN_SK(sk);
- int ddl = 0;
-
- if (msgflg == NSP_DISCINIT)
- ddl = le16_to_cpu(scp->discdata_out.opt_optl);
-
- if (reason == 0)
- reason = le16_to_cpu(scp->discdata_out.opt_status);
-
- dn_nsp_do_disc(sk, msgflg, reason, gfp, __sk_dst_get(sk), ddl,
- scp->discdata_out.opt_data, scp->addrrem, scp->addrloc);
-}
-
-
-void dn_nsp_return_disc(struct sk_buff *skb, unsigned char msgflg,
- unsigned short reason)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- int ddl = 0;
- gfp_t gfp = GFP_ATOMIC;
-
- dn_nsp_do_disc(NULL, msgflg, reason, gfp, skb_dst(skb), ddl,
- NULL, cb->src_port, cb->dst_port);
-}
-
-
-void dn_nsp_send_link(struct sock *sk, unsigned char lsflags, char fcval)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct sk_buff *skb;
- unsigned char *ptr;
- gfp_t gfp = GFP_ATOMIC;
-
- if ((skb = dn_alloc_skb(sk, DN_MAX_NSP_DATA_HEADER + 2, gfp)) == NULL)
- return;
-
- skb_reserve(skb, DN_MAX_NSP_DATA_HEADER);
- ptr = skb_put(skb, 2);
- DN_SKB_CB(skb)->nsp_flags = 0x10;
- *ptr++ = lsflags;
- *ptr = fcval;
-
- dn_nsp_queue_xmit(sk, skb, gfp, 1);
-
- scp->persist = dn_nsp_persist(sk);
- scp->persist_fxn = dn_nsp_xmit_timeout;
-}
-
-static int dn_nsp_retrans_conninit(struct sock *sk)
-{
- struct dn_scp *scp = DN_SK(sk);
-
- if (scp->state == DN_CI)
- dn_nsp_send_conninit(sk, NSP_RCI);
-
- return 0;
-}
-
-void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg)
-{
- struct dn_scp *scp = DN_SK(sk);
- struct nsp_conn_init_msg *msg;
- unsigned char aux;
- unsigned char menuver;
- struct dn_skb_cb *cb;
- unsigned char type = 1;
- gfp_t allocation = (msgflg == NSP_CI) ? sk->sk_allocation : GFP_ATOMIC;
- struct sk_buff *skb = dn_alloc_skb(sk, 200, allocation);
-
- if (!skb)
- return;
-
- cb = DN_SKB_CB(skb);
- msg = skb_put(skb, sizeof(*msg));
-
- msg->msgflg = msgflg;
- msg->dstaddr = 0x0000; /* Remote Node will assign it*/
-
- msg->srcaddr = scp->addrloc;
- msg->services = scp->services_loc; /* Requested flow control */
- msg->info = scp->info_loc; /* Version Number */
- msg->segsize = cpu_to_le16(scp->segsize_loc); /* Max segment size */
-
- if (scp->peer.sdn_objnum)
- type = 0;
-
- skb_put(skb, dn_sockaddr2username(&scp->peer,
- skb_tail_pointer(skb), type));
- skb_put(skb, dn_sockaddr2username(&scp->addr,
- skb_tail_pointer(skb), 2));
-
- menuver = DN_MENUVER_ACC | DN_MENUVER_USR;
- if (scp->peer.sdn_flags & SDF_PROXY)
- menuver |= DN_MENUVER_PRX;
- if (scp->peer.sdn_flags & SDF_UICPROXY)
- menuver |= DN_MENUVER_UIC;
-
- skb_put_u8(skb, menuver); /* Menu Version */
-
- aux = scp->accessdata.acc_userl;
- skb_put_u8(skb, aux);
- if (aux > 0)
- skb_put_data(skb, scp->accessdata.acc_user, aux);
-
- aux = scp->accessdata.acc_passl;
- skb_put_u8(skb, aux);
- if (aux > 0)
- skb_put_data(skb, scp->accessdata.acc_pass, aux);
-
- aux = scp->accessdata.acc_accl;
- skb_put_u8(skb, aux);
- if (aux > 0)
- skb_put_data(skb, scp->accessdata.acc_acc, aux);
-
- aux = (__u8)le16_to_cpu(scp->conndata_out.opt_optl);
- skb_put_u8(skb, aux);
- if (aux > 0)
- skb_put_data(skb, scp->conndata_out.opt_data, aux);
-
- scp->persist = dn_nsp_persist(sk);
- scp->persist_fxn = dn_nsp_retrans_conninit;
-
- cb->rt_flags = DN_RT_F_RQR;
-
- dn_nsp_send(skb);
-}
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
deleted file mode 100644
index 1c002c0fb712..000000000000
--- a/net/decnet/dn_route.c
+++ /dev/null
@@ -1,1927 +0,0 @@
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Routing Functions (Endnode and Router)
- *
- * Authors: Steve Whitehouse <SteveW@ACM.org>
- * Eduardo Marcelo Serrat <emserrat@geocities.com>
- *
- * Changes:
- * Steve Whitehouse : Fixes to allow "intra-ethernet" and
- * "return-to-sender" bits on outgoing
- * packets.
- * Steve Whitehouse : Timeouts for cached routes.
- * Steve Whitehouse : Use dst cache for input routes too.
- * Steve Whitehouse : Fixed error values in dn_send_skb.
- * Steve Whitehouse : Rework routing functions to better fit
- * DECnet routing design
- * Alexey Kuznetsov : New SMP locking
- * Steve Whitehouse : More SMP locking changes & dn_cache_dump()
- * Steve Whitehouse : Prerouting NF hook, now really is prerouting.
- * Fixed possible skb leak in rtnetlink funcs.
- * Steve Whitehouse : Dave Miller's dynamic hash table sizing and
- * Alexey Kuznetsov's finer grained locking
- * from ipv4/route.c.
- * Steve Whitehouse : Routing is now starting to look like a
- * sensible set of code now, mainly due to
- * my copying the IPv4 routing code. The
- * hooks here are modified and will continue
- * to evolve for a while.
- * Steve Whitehouse : Real SMP at last :-) Also new netfilter
- * stuff. Look out raw sockets your days
- * are numbered!
- * Steve Whitehouse : Added return-to-sender functions. Added
- * backlog congestion level return codes.
- * Steve Whitehouse : Fixed bug where routes were set up with
- * no ref count on net devices.
- * Steve Whitehouse : RCU for the route cache
- * Steve Whitehouse : Preparations for the flow cache
- * Steve Whitehouse : Prepare for nonlinear skbs
- */
-
-/******************************************************************************
- (c) 1995-1998 E.M. Serrat emserrat@geocities.com
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-*******************************************************************************/
-
-#include <linux/errno.h>
-#include <linux/types.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/kernel.h>
-#include <linux/sockios.h>
-#include <linux/net.h>
-#include <linux/netdevice.h>
-#include <linux/inet.h>
-#include <linux/route.h>
-#include <linux/in_route.h>
-#include <linux/slab.h>
-#include <net/sock.h>
-#include <linux/mm.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/init.h>
-#include <linux/rtnetlink.h>
-#include <linux/string.h>
-#include <linux/netfilter_decnet.h>
-#include <linux/rcupdate.h>
-#include <linux/times.h>
-#include <linux/export.h>
-#include <asm/errno.h>
-#include <net/net_namespace.h>
-#include <net/netlink.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/flow.h>
-#include <net/fib_rules.h>
-#include <net/dn.h>
-#include <net/dn_dev.h>
-#include <net/dn_nsp.h>
-#include <net/dn_route.h>
-#include <net/dn_neigh.h>
-#include <net/dn_fib.h>
-
-struct dn_rt_hash_bucket
-{
- struct dn_route __rcu *chain;
- spinlock_t lock;
-};
-
-extern struct neigh_table dn_neigh_table;
-
-
-static unsigned char dn_hiord_addr[6] = {0xAA,0x00,0x04,0x00,0x00,0x00};
-
-static const int dn_rt_min_delay = 2 * HZ;
-static const int dn_rt_max_delay = 10 * HZ;
-static const int dn_rt_mtu_expires = 10 * 60 * HZ;
-
-static unsigned long dn_rt_deadline;
-
-static int dn_dst_gc(struct dst_ops *ops);
-static struct dst_entry *dn_dst_check(struct dst_entry *, __u32);
-static unsigned int dn_dst_default_advmss(const struct dst_entry *dst);
-static unsigned int dn_dst_mtu(const struct dst_entry *dst);
-static void dn_dst_destroy(struct dst_entry *);
-static void dn_dst_ifdown(struct dst_entry *, struct net_device *dev, int how);
-static struct dst_entry *dn_dst_negative_advice(struct dst_entry *);
-static void dn_dst_link_failure(struct sk_buff *);
-static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb , u32 mtu);
-static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb);
-static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst,
- struct sk_buff *skb,
- const void *daddr);
-static int dn_route_input(struct sk_buff *);
-static void dn_run_flush(struct timer_list *unused);
-
-static struct dn_rt_hash_bucket *dn_rt_hash_table;
-static unsigned int dn_rt_hash_mask;
-
-static struct timer_list dn_route_timer;
-static DEFINE_TIMER(dn_rt_flush_timer, dn_run_flush);
-int decnet_dst_gc_interval = 2;
-
-static struct dst_ops dn_dst_ops = {
- .family = PF_DECnet,
- .gc_thresh = 128,
- .gc = dn_dst_gc,
- .check = dn_dst_check,
- .default_advmss = dn_dst_default_advmss,
- .mtu = dn_dst_mtu,
- .cow_metrics = dst_cow_metrics_generic,
- .destroy = dn_dst_destroy,
- .ifdown = dn_dst_ifdown,
- .negative_advice = dn_dst_negative_advice,
- .link_failure = dn_dst_link_failure,
- .update_pmtu = dn_dst_update_pmtu,
- .redirect = dn_dst_redirect,
- .neigh_lookup = dn_dst_neigh_lookup,
-};
-
-static void dn_dst_destroy(struct dst_entry *dst)
-{
- struct dn_route *rt = (struct dn_route *) dst;
-
- if (rt->n)
- neigh_release(rt->n);
- dst_destroy_metrics_generic(dst);
-}
-
-static void dn_dst_ifdown(struct dst_entry *dst, struct net_device *dev, int how)
-{
- if (how) {
- struct dn_route *rt = (struct dn_route *) dst;
- struct neighbour *n = rt->n;
-
- if (n && n->dev == dev) {
- n->dev = dev_net(dev)->loopback_dev;
- dev_hold(n->dev);
- dev_put(dev);
- }
- }
-}
-
-static __inline__ unsigned int dn_hash(__le16 src, __le16 dst)
-{
- __u16 tmp = (__u16 __force)(src ^ dst);
- tmp ^= (tmp >> 3);
- tmp ^= (tmp >> 5);
- tmp ^= (tmp >> 10);
- return dn_rt_hash_mask & (unsigned int)tmp;
-}
-
-static void dn_dst_check_expire(struct timer_list *unused)
-{
- int i;
- struct dn_route *rt;
- struct dn_route __rcu **rtp;
- unsigned long now = jiffies;
- unsigned long expire = 120 * HZ;
-
- for (i = 0; i <= dn_rt_hash_mask; i++) {
- rtp = &dn_rt_hash_table[i].chain;
-
- spin_lock(&dn_rt_hash_table[i].lock);
- while ((rt = rcu_dereference_protected(*rtp,
- lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) {
- if (atomic_read(&rt->dst.__refcnt) > 1 ||
- (now - rt->dst.lastuse) < expire) {
- rtp = &rt->dn_next;
- continue;
- }
- *rtp = rt->dn_next;
- rt->dn_next = NULL;
- dst_dev_put(&rt->dst);
- dst_release(&rt->dst);
- }
- spin_unlock(&dn_rt_hash_table[i].lock);
-
- if ((jiffies - now) > 0)
- break;
- }
-
- mod_timer(&dn_route_timer, now + decnet_dst_gc_interval * HZ);
-}
-
-static int dn_dst_gc(struct dst_ops *ops)
-{
- struct dn_route *rt;
- struct dn_route __rcu **rtp;
- int i;
- unsigned long now = jiffies;
- unsigned long expire = 10 * HZ;
-
- for (i = 0; i <= dn_rt_hash_mask; i++) {
-
- spin_lock_bh(&dn_rt_hash_table[i].lock);
- rtp = &dn_rt_hash_table[i].chain;
-
- while ((rt = rcu_dereference_protected(*rtp,
- lockdep_is_held(&dn_rt_hash_table[i].lock))) != NULL) {
- if (atomic_read(&rt->dst.__refcnt) > 1 ||
- (now - rt->dst.lastuse) < expire) {
- rtp = &rt->dn_next;
- continue;
- }
- *rtp = rt->dn_next;
- rt->dn_next = NULL;
- dst_dev_put(&rt->dst);
- dst_release(&rt->dst);
- break;
- }
- spin_unlock_bh(&dn_rt_hash_table[i].lock);
- }
-
- return 0;
-}
-
-/*
- * The decnet standards don't impose a particular minimum mtu, what they
- * do insist on is that the routing layer accepts a datagram of at least
- * 230 bytes long. Here we have to subtract the routing header length from
- * 230 to get the minimum acceptable mtu. If there is no neighbour, then we
- * assume the worst and use a long header size.
- *
- * We update both the mtu and the advertised mss (i.e. the segment size we
- * advertise to the other end).
- */
-static void dn_dst_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
-{
- struct dn_route *rt = (struct dn_route *) dst;
- struct neighbour *n = rt->n;
- u32 min_mtu = 230;
- struct dn_dev *dn;
-
- dn = n ? rcu_dereference_raw(n->dev->dn_ptr) : NULL;
-
- if (dn && dn->use_long == 0)
- min_mtu -= 6;
- else
- min_mtu -= 21;
-
- if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= min_mtu) {
- if (!(dst_metric_locked(dst, RTAX_MTU))) {
- dst_metric_set(dst, RTAX_MTU, mtu);
- dst_set_expires(dst, dn_rt_mtu_expires);
- }
- if (!(dst_metric_locked(dst, RTAX_ADVMSS))) {
- u32 mss = mtu - DN_MAX_NSP_DATA_HEADER;
- u32 existing_mss = dst_metric_raw(dst, RTAX_ADVMSS);
- if (!existing_mss || existing_mss > mss)
- dst_metric_set(dst, RTAX_ADVMSS, mss);
- }
- }
-}
-
-static void dn_dst_redirect(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb)
-{
-}
-
-/*
- * When a route has been marked obsolete. (e.g. routing cache flush)
- */
-static struct dst_entry *dn_dst_check(struct dst_entry *dst, __u32 cookie)
-{
- return NULL;
-}
-
-static struct dst_entry *dn_dst_negative_advice(struct dst_entry *dst)
-{
- dst_release(dst);
- return NULL;
-}
-
-static void dn_dst_link_failure(struct sk_buff *skb)
-{
-}
-
-static inline int compare_keys(struct flowidn *fl1, struct flowidn *fl2)
-{
- return ((fl1->daddr ^ fl2->daddr) |
- (fl1->saddr ^ fl2->saddr) |
- (fl1->flowidn_mark ^ fl2->flowidn_mark) |
- (fl1->flowidn_scope ^ fl2->flowidn_scope) |
- (fl1->flowidn_oif ^ fl2->flowidn_oif) |
- (fl1->flowidn_iif ^ fl2->flowidn_iif)) == 0;
-}
-
-static int dn_insert_route(struct dn_route *rt, unsigned int hash, struct dn_route **rp)
-{
- struct dn_route *rth;
- struct dn_route __rcu **rthp;
- unsigned long now = jiffies;
-
- rthp = &dn_rt_hash_table[hash].chain;
-
- spin_lock_bh(&dn_rt_hash_table[hash].lock);
- while ((rth = rcu_dereference_protected(*rthp,
- lockdep_is_held(&dn_rt_hash_table[hash].lock))) != NULL) {
- if (compare_keys(&rth->fld, &rt->fld)) {
- /* Put it first */
- *rthp = rth->dn_next;
- rcu_assign_pointer(rth->dn_next,
- dn_rt_hash_table[hash].chain);
- rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth);
-
- dst_hold_and_use(&rth->dst, now);
- spin_unlock_bh(&dn_rt_hash_table[hash].lock);
-
- dst_release_immediate(&rt->dst);
- *rp = rth;
- return 0;
- }
- rthp = &rth->dn_next;
- }
-
- rcu_assign_pointer(rt->dn_next, dn_rt_hash_table[hash].chain);
- rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt);
-
- dst_hold_and_use(&rt->dst, now);
- spin_unlock_bh(&dn_rt_hash_table[hash].lock);
- *rp = rt;
- return 0;
-}
-
-static void dn_run_flush(struct timer_list *unused)
-{
- int i;
- struct dn_route *rt, *next;
-
- for (i = 0; i < dn_rt_hash_mask; i++) {
- spin_lock_bh(&dn_rt_hash_table[i].lock);
-
- if ((rt = xchg((struct dn_route **)&dn_rt_hash_table[i].chain, NULL)) == NULL)
- goto nothing_to_declare;
-
- for(; rt; rt = next) {
- next = rcu_dereference_raw(rt->dn_next);
- RCU_INIT_POINTER(rt->dn_next, NULL);
- dst_dev_put(&rt->dst);
- dst_release(&rt->dst);
- }
-
-nothing_to_declare:
- spin_unlock_bh(&dn_rt_hash_table[i].lock);
- }
-}
-
-static DEFINE_SPINLOCK(dn_rt_flush_lock);
-
-void dn_rt_cache_flush(int delay)
-{
- unsigned long now = jiffies;
- int user_mode = !in_interrupt();
-
- if (delay < 0)
- delay = dn_rt_min_delay;
-
- spin_lock_bh(&dn_rt_flush_lock);
-
- if (del_timer(&dn_rt_flush_timer) && delay > 0 && dn_rt_deadline) {
- long tmo = (long)(dn_rt_deadline - now);
-
- if (user_mode && tmo < dn_rt_max_delay - dn_rt_min_delay)
- tmo = 0;
-
- if (delay > tmo)
- delay = tmo;
- }
-
- if (delay <= 0) {
- spin_unlock_bh(&dn_rt_flush_lock);
- dn_run_flush(NULL);
- return;
- }
-
- if (dn_rt_deadline == 0)
- dn_rt_deadline = now + dn_rt_max_delay;
-
- dn_rt_flush_timer.expires = now + delay;
- add_timer(&dn_rt_flush_timer);
- spin_unlock_bh(&dn_rt_flush_lock);
-}
-
-/**
- * dn_return_short - Return a short packet to its sender
- * @skb: The packet to return
- *
- */
-static int dn_return_short(struct sk_buff *skb)
-{
- struct dn_skb_cb *cb;
- unsigned char *ptr;
- __le16 *src;
- __le16 *dst;
-
- /* Add back headers */
- skb_push(skb, skb->data - skb_network_header(skb));
-
- if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL)
- return NET_RX_DROP;
-
- cb = DN_SKB_CB(skb);
- /* Skip packet length and point to flags */
- ptr = skb->data + 2;
- *ptr++ = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS;
-
- dst = (__le16 *)ptr;
- ptr += 2;
- src = (__le16 *)ptr;
- ptr += 2;
- *ptr = 0; /* Zero hop count */
-
- swap(*src, *dst);
-
- skb->pkt_type = PACKET_OUTGOING;
- dn_rt_finish_output(skb, NULL, NULL);
- return NET_RX_SUCCESS;
-}
-
-/**
- * dn_return_long - Return a long packet to its sender
- * @skb: The long format packet to return
- *
- */
-static int dn_return_long(struct sk_buff *skb)
-{
- struct dn_skb_cb *cb;
- unsigned char *ptr;
- unsigned char *src_addr, *dst_addr;
- unsigned char tmp[ETH_ALEN];
-
- /* Add back all headers */
- skb_push(skb, skb->data - skb_network_header(skb));
-
- if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL)
- return NET_RX_DROP;
-
- cb = DN_SKB_CB(skb);
- /* Ignore packet length and point to flags */
- ptr = skb->data + 2;
-
- /* Skip padding */
- if (*ptr & DN_RT_F_PF) {
- char padlen = (*ptr & ~DN_RT_F_PF);
- ptr += padlen;
- }
-
- *ptr++ = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS;
- ptr += 2;
- dst_addr = ptr;
- ptr += 8;
- src_addr = ptr;
- ptr += 6;
- *ptr = 0; /* Zero hop count */
-
- /* Swap source and destination */
- memcpy(tmp, src_addr, ETH_ALEN);
- memcpy(src_addr, dst_addr, ETH_ALEN);
- memcpy(dst_addr, tmp, ETH_ALEN);
-
- skb->pkt_type = PACKET_OUTGOING;
- dn_rt_finish_output(skb, dst_addr, src_addr);
- return NET_RX_SUCCESS;
-}
-
-/**
- * dn_route_rx_packet - Try and find a route for an incoming packet
- * @skb: The packet to find a route for
- *
- * Returns: result of input function if route is found, error code otherwise
- */
-static int dn_route_rx_packet(struct net *net, struct sock *sk, struct sk_buff *skb)
-{
- struct dn_skb_cb *cb;
- int err;
-
- if ((err = dn_route_input(skb)) == 0)
- return dst_input(skb);
-
- cb = DN_SKB_CB(skb);
- if (decnet_debug_level & 4) {
- char *devname = skb->dev ? skb->dev->name : "???";
-
- printk(KERN_DEBUG
- "DECnet: dn_route_rx_packet: rt_flags=0x%02x dev=%s len=%d src=0x%04hx dst=0x%04hx err=%d type=%d\n",
- (int)cb->rt_flags, devname, skb->len,
- le16_to_cpu(cb->src), le16_to_cpu(cb->dst),
- err, skb->pkt_type);
- }
-
- if ((skb->pkt_type == PACKET_HOST) && (cb->rt_flags & DN_RT_F_RQR)) {
- switch (cb->rt_flags & DN_RT_PKT_MSK) {
- case DN_RT_PKT_SHORT:
- return dn_return_short(skb);
- case DN_RT_PKT_LONG:
- return dn_return_long(skb);
- }
- }
-
- kfree_skb(skb);
- return NET_RX_DROP;
-}
-
-static int dn_route_rx_long(struct sk_buff *skb)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- unsigned char *ptr = skb->data;
-
- if (!pskb_may_pull(skb, 21)) /* 20 for long header, 1 for shortest nsp */
- goto drop_it;
-
- skb_pull(skb, 20);
- skb_reset_transport_header(skb);
-
- /* Destination info */
- ptr += 2;
- cb->dst = dn_eth2dn(ptr);
- if (memcmp(ptr, dn_hiord_addr, 4) != 0)
- goto drop_it;
- ptr += 6;
-
-
- /* Source info */
- ptr += 2;
- cb->src = dn_eth2dn(ptr);
- if (memcmp(ptr, dn_hiord_addr, 4) != 0)
- goto drop_it;
- ptr += 6;
- /* Other junk */
- ptr++;
- cb->hops = *ptr++; /* Visit Count */
-
- return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING,
- &init_net, NULL, skb, skb->dev, NULL,
- dn_route_rx_packet);
-
-drop_it:
- kfree_skb(skb);
- return NET_RX_DROP;
-}
-
-
-
-static int dn_route_rx_short(struct sk_buff *skb)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- unsigned char *ptr = skb->data;
-
- if (!pskb_may_pull(skb, 6)) /* 5 for short header + 1 for shortest nsp */
- goto drop_it;
-
- skb_pull(skb, 5);
- skb_reset_transport_header(skb);
-
- cb->dst = *(__le16 *)ptr;
- ptr += 2;
- cb->src = *(__le16 *)ptr;
- ptr += 2;
- cb->hops = *ptr & 0x3f;
-
- return NF_HOOK(NFPROTO_DECNET, NF_DN_PRE_ROUTING,
- &init_net, NULL, skb, skb->dev, NULL,
- dn_route_rx_packet);
-
-drop_it:
- kfree_skb(skb);
- return NET_RX_DROP;
-}
-
-static int dn_route_discard(struct net *net, struct sock *sk, struct sk_buff *skb)
-{
- /*
- * I know we drop the packet here, but thats considered success in
- * this case
- */
- kfree_skb(skb);
- return NET_RX_SUCCESS;
-}
-
-static int dn_route_ptp_hello(struct net *net, struct sock *sk, struct sk_buff *skb)
-{
- dn_dev_hello(skb);
- dn_neigh_pointopoint_hello(skb);
- return NET_RX_SUCCESS;
-}
-
-int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
-{
- struct dn_skb_cb *cb;
- unsigned char flags = 0;
- __u16 len = le16_to_cpu(*(__le16 *)skb->data);
- struct dn_dev *dn = rcu_dereference(dev->dn_ptr);
- unsigned char padlen = 0;
-
- if (!net_eq(dev_net(dev), &init_net))
- goto dump_it;
-
- if (dn == NULL)
- goto dump_it;
-
- if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
- goto out;
-
- if (!pskb_may_pull(skb, 3))
- goto dump_it;
-
- skb_pull(skb, 2);
-
- if (len > skb->len)
- goto dump_it;
-
- skb_trim(skb, len);
-
- flags = *skb->data;
-
- cb = DN_SKB_CB(skb);
- cb->stamp = jiffies;
- cb->iif = dev->ifindex;
-
- /*
- * If we have padding, remove it.
- */
- if (flags & DN_RT_F_PF) {
- padlen = flags & ~DN_RT_F_PF;
- if (!pskb_may_pull(skb, padlen + 1))
- goto dump_it;
- skb_pull(skb, padlen);
- flags = *skb->data;
- }
-
- skb_reset_network_header(skb);
-
- /*
- * Weed out future version DECnet
- */
- if (flags & DN_RT_F_VER)
- goto dump_it;
-
- cb->rt_flags = flags;
-
- if (decnet_debug_level & 1)
- printk(KERN_DEBUG
- "dn_route_rcv: got 0x%02x from %s [%d %d %d]\n",
- (int)flags, (dev) ? dev->name : "???", len, skb->len,
- padlen);
-
- if (flags & DN_RT_PKT_CNTL) {
- if (unlikely(skb_linearize(skb)))
- goto dump_it;
-
- switch (flags & DN_RT_CNTL_MSK) {
- case DN_RT_PKT_INIT:
- dn_dev_init_pkt(skb);
- break;
- case DN_RT_PKT_VERI:
- dn_dev_veri_pkt(skb);
- break;
- }
-
- if (dn->parms.state != DN_DEV_S_RU)
- goto dump_it;
-
- switch (flags & DN_RT_CNTL_MSK) {
- case DN_RT_PKT_HELO:
- return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO,
- &init_net, NULL, skb, skb->dev, NULL,
- dn_route_ptp_hello);
-
- case DN_RT_PKT_L1RT:
- case DN_RT_PKT_L2RT:
- return NF_HOOK(NFPROTO_DECNET, NF_DN_ROUTE,
- &init_net, NULL, skb, skb->dev, NULL,
- dn_route_discard);
- case DN_RT_PKT_ERTH:
- return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO,
- &init_net, NULL, skb, skb->dev, NULL,
- dn_neigh_router_hello);
-
- case DN_RT_PKT_EEDH:
- return NF_HOOK(NFPROTO_DECNET, NF_DN_HELLO,
- &init_net, NULL, skb, skb->dev, NULL,
- dn_neigh_endnode_hello);
- }
- } else {
- if (dn->parms.state != DN_DEV_S_RU)
- goto dump_it;
-
- skb_pull(skb, 1); /* Pull flags */
-
- switch (flags & DN_RT_PKT_MSK) {
- case DN_RT_PKT_LONG:
- return dn_route_rx_long(skb);
- case DN_RT_PKT_SHORT:
- return dn_route_rx_short(skb);
- }
- }
-
-dump_it:
- kfree_skb(skb);
-out:
- return NET_RX_DROP;
-}
-
-static int dn_output(struct net *net, struct sock *sk, struct sk_buff *skb)
-{
- struct dst_entry *dst = skb_dst(skb);
- struct dn_route *rt = (struct dn_route *)dst;
- struct net_device *dev = dst->dev;
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
-
- int err = -EINVAL;
-
- if (rt->n == NULL)
- goto error;
-
- skb->dev = dev;
-
- cb->src = rt->rt_saddr;
- cb->dst = rt->rt_daddr;
-
- /*
- * Always set the Intra-Ethernet bit on all outgoing packets
- * originated on this node. Only valid flag from upper layers
- * is return-to-sender-requested. Set hop count to 0 too.
- */
- cb->rt_flags &= ~DN_RT_F_RQR;
- cb->rt_flags |= DN_RT_F_IE;
- cb->hops = 0;
-
- return NF_HOOK(NFPROTO_DECNET, NF_DN_LOCAL_OUT,
- &init_net, sk, skb, NULL, dev,
- dn_to_neigh_output);
-
-error:
- net_dbg_ratelimited("dn_output: This should not happen\n");
-
- kfree_skb(skb);
-
- return err;
-}
-
-static int dn_forward(struct sk_buff *skb)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct dst_entry *dst = skb_dst(skb);
- struct dn_dev *dn_db = rcu_dereference(dst->dev->dn_ptr);
- struct dn_route *rt;
- int header_len;
- struct net_device *dev = skb->dev;
-
- if (skb->pkt_type != PACKET_HOST)
- goto drop;
-
- /* Ensure that we have enough space for headers */
- rt = (struct dn_route *)skb_dst(skb);
- header_len = dn_db->use_long ? 21 : 6;
- if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+header_len))
- goto drop;
-
- /*
- * Hop count exceeded.
- */
- if (++cb->hops > 30)
- goto drop;
-
- skb->dev = rt->dst.dev;
-
- /*
- * If packet goes out same interface it came in on, then set
- * the Intra-Ethernet bit. This has no effect for short
- * packets, so we don't need to test for them here.
- */
- cb->rt_flags &= ~DN_RT_F_IE;
- if (rt->rt_flags & RTCF_DOREDIRECT)
- cb->rt_flags |= DN_RT_F_IE;
-
- return NF_HOOK(NFPROTO_DECNET, NF_DN_FORWARD,
- &init_net, NULL, skb, dev, skb->dev,
- dn_to_neigh_output);
-
-drop:
- kfree_skb(skb);
- return NET_RX_DROP;
-}
-
-/*
- * Used to catch bugs. This should never normally get
- * called.
- */
-static int dn_rt_bug_out(struct net *net, struct sock *sk, struct sk_buff *skb)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
-
- net_dbg_ratelimited("dn_rt_bug: skb from:%04x to:%04x\n",
- le16_to_cpu(cb->src), le16_to_cpu(cb->dst));
-
- kfree_skb(skb);
-
- return NET_RX_DROP;
-}
-
-static int dn_rt_bug(struct sk_buff *skb)
-{
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
-
- net_dbg_ratelimited("dn_rt_bug: skb from:%04x to:%04x\n",
- le16_to_cpu(cb->src), le16_to_cpu(cb->dst));
-
- kfree_skb(skb);
-
- return NET_RX_DROP;
-}
-
-static unsigned int dn_dst_default_advmss(const struct dst_entry *dst)
-{
- return dn_mss_from_pmtu(dst->dev, dst_mtu(dst));
-}
-
-static unsigned int dn_dst_mtu(const struct dst_entry *dst)
-{
- unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
-
- return mtu ? : dst->dev->mtu;
-}
-
-static struct neighbour *dn_dst_neigh_lookup(const struct dst_entry *dst,
- struct sk_buff *skb,
- const void *daddr)
-{
- return __neigh_lookup_errno(&dn_neigh_table, daddr, dst->dev);
-}
-
-static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res)
-{
- struct dn_fib_info *fi = res->fi;
- struct net_device *dev = rt->dst.dev;
- unsigned int mss_metric;
- struct neighbour *n;
-
- if (fi) {
- if (DN_FIB_RES_GW(*res) &&
- DN_FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
- rt->rt_gateway = DN_FIB_RES_GW(*res);
- dst_init_metrics(&rt->dst, fi->fib_metrics, true);
- }
- rt->rt_type = res->type;
-
- if (dev != NULL && rt->n == NULL) {
- n = __neigh_lookup_errno(&dn_neigh_table, &rt->rt_gateway, dev);
- if (IS_ERR(n))
- return PTR_ERR(n);
- rt->n = n;
- }
-
- if (dst_metric(&rt->dst, RTAX_MTU) > rt->dst.dev->mtu)
- dst_metric_set(&rt->dst, RTAX_MTU, rt->dst.dev->mtu);
- mss_metric = dst_metric_raw(&rt->dst, RTAX_ADVMSS);
- if (mss_metric) {
- unsigned int mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->dst));
- if (mss_metric > mss)
- dst_metric_set(&rt->dst, RTAX_ADVMSS, mss);
- }
- return 0;
-}
-
-static inline int dn_match_addr(__le16 addr1, __le16 addr2)
-{
- __u16 tmp = le16_to_cpu(addr1) ^ le16_to_cpu(addr2);
- int match = 16;
- while(tmp) {
- tmp >>= 1;
- match--;
- }
- return match;
-}
-
-static __le16 dnet_select_source(const struct net_device *dev, __le16 daddr, int scope)
-{
- __le16 saddr = 0;
- struct dn_dev *dn_db;
- struct dn_ifaddr *ifa;
- int best_match = 0;
- int ret;
-
- rcu_read_lock();
- dn_db = rcu_dereference(dev->dn_ptr);
- for (ifa = rcu_dereference(dn_db->ifa_list);
- ifa != NULL;
- ifa = rcu_dereference(ifa->ifa_next)) {
- if (ifa->ifa_scope > scope)
- continue;
- if (!daddr) {
- saddr = ifa->ifa_local;
- break;
- }
- ret = dn_match_addr(daddr, ifa->ifa_local);
- if (ret > best_match)
- saddr = ifa->ifa_local;
- if (best_match == 0)
- saddr = ifa->ifa_local;
- }
- rcu_read_unlock();
-
- return saddr;
-}
-
-static inline __le16 __dn_fib_res_prefsrc(struct dn_fib_res *res)
-{
- return dnet_select_source(DN_FIB_RES_DEV(*res), DN_FIB_RES_GW(*res), res->scope);
-}
-
-static inline __le16 dn_fib_rules_map_destination(__le16 daddr, struct dn_fib_res *res)
-{
- __le16 mask = dnet_make_mask(res->prefixlen);
- return (daddr&~mask)|res->fi->fib_nh->nh_gw;
-}
-
-static int dn_route_output_slow(struct dst_entry **pprt, const struct flowidn *oldflp, int try_hard)
-{
- struct flowidn fld = {
- .daddr = oldflp->daddr,
- .saddr = oldflp->saddr,
- .flowidn_scope = RT_SCOPE_UNIVERSE,
- .flowidn_mark = oldflp->flowidn_mark,
- .flowidn_iif = LOOPBACK_IFINDEX,
- .flowidn_oif = oldflp->flowidn_oif,
- };
- struct dn_route *rt = NULL;
- struct net_device *dev_out = NULL, *dev;
- struct neighbour *neigh = NULL;
- unsigned int hash;
- unsigned int flags = 0;
- struct dn_fib_res res = { .fi = NULL, .type = RTN_UNICAST };
- int err;
- int free_res = 0;
- __le16 gateway = 0;
-
- if (decnet_debug_level & 16)
- printk(KERN_DEBUG
- "dn_route_output_slow: dst=%04x src=%04x mark=%d"
- " iif=%d oif=%d\n", le16_to_cpu(oldflp->daddr),
- le16_to_cpu(oldflp->saddr),
- oldflp->flowidn_mark, LOOPBACK_IFINDEX,
- oldflp->flowidn_oif);
-
- /* If we have an output interface, verify its a DECnet device */
- if (oldflp->flowidn_oif) {
- dev_out = dev_get_by_index(&init_net, oldflp->flowidn_oif);
- err = -ENODEV;
- if (dev_out && dev_out->dn_ptr == NULL) {
- dev_put(dev_out);
- dev_out = NULL;
- }
- if (dev_out == NULL)
- goto out;
- }
-
- /* If we have a source address, verify that its a local address */
- if (oldflp->saddr) {
- err = -EADDRNOTAVAIL;
-
- if (dev_out) {
- if (dn_dev_islocal(dev_out, oldflp->saddr))
- goto source_ok;
- dev_put(dev_out);
- goto out;
- }
- rcu_read_lock();
- for_each_netdev_rcu(&init_net, dev) {
- if (!dev->dn_ptr)
- continue;
- if (!dn_dev_islocal(dev, oldflp->saddr))
- continue;
- if ((dev->flags & IFF_LOOPBACK) &&
- oldflp->daddr &&
- !dn_dev_islocal(dev, oldflp->daddr))
- continue;
-
- dev_out = dev;
- break;
- }
- rcu_read_unlock();
- if (dev_out == NULL)
- goto out;
- dev_hold(dev_out);
-source_ok:
- ;
- }
-
- /* No destination? Assume its local */
- if (!fld.daddr) {
- fld.daddr = fld.saddr;
-
- if (dev_out)
- dev_put(dev_out);
- err = -EINVAL;
- dev_out = init_net.loopback_dev;
- if (!dev_out->dn_ptr)
- goto out;
- err = -EADDRNOTAVAIL;
- dev_hold(dev_out);
- if (!fld.daddr) {
- fld.daddr =
- fld.saddr = dnet_select_source(dev_out, 0,
- RT_SCOPE_HOST);
- if (!fld.daddr)
- goto out;
- }
- fld.flowidn_oif = LOOPBACK_IFINDEX;
- res.type = RTN_LOCAL;
- goto make_route;
- }
-
- if (decnet_debug_level & 16)
- printk(KERN_DEBUG
- "dn_route_output_slow: initial checks complete."
- " dst=%04x src=%04x oif=%d try_hard=%d\n",
- le16_to_cpu(fld.daddr), le16_to_cpu(fld.saddr),
- fld.flowidn_oif, try_hard);
-
- /*
- * N.B. If the kernel is compiled without router support then
- * dn_fib_lookup() will evaluate to non-zero so this if () block
- * will always be executed.
- */
- err = -ESRCH;
- if (try_hard || (err = dn_fib_lookup(&fld, &res)) != 0) {
- struct dn_dev *dn_db;
- if (err != -ESRCH)
- goto out;
- /*
- * Here the fallback is basically the standard algorithm for
- * routing in endnodes which is described in the DECnet routing
- * docs
- *
- * If we are not trying hard, look in neighbour cache.
- * The result is tested to ensure that if a specific output
- * device/source address was requested, then we honour that
- * here
- */
- if (!try_hard) {
- neigh = neigh_lookup_nodev(&dn_neigh_table, &init_net, &fld.daddr);
- if (neigh) {
- if ((oldflp->flowidn_oif &&
- (neigh->dev->ifindex != oldflp->flowidn_oif)) ||
- (oldflp->saddr &&
- (!dn_dev_islocal(neigh->dev,
- oldflp->saddr)))) {
- neigh_release(neigh);
- neigh = NULL;
- } else {
- if (dev_out)
- dev_put(dev_out);
- if (dn_dev_islocal(neigh->dev, fld.daddr)) {
- dev_out = init_net.loopback_dev;
- res.type = RTN_LOCAL;
- } else {
- dev_out = neigh->dev;
- }
- dev_hold(dev_out);
- goto select_source;
- }
- }
- }
-
- /* Not there? Perhaps its a local address */
- if (dev_out == NULL)
- dev_out = dn_dev_get_default();
- err = -ENODEV;
- if (dev_out == NULL)
- goto out;
- dn_db = rcu_dereference_raw(dev_out->dn_ptr);
- if (!dn_db)
- goto e_inval;
- /* Possible improvement - check all devices for local addr */
- if (dn_dev_islocal(dev_out, fld.daddr)) {
- dev_put(dev_out);
- dev_out = init_net.loopback_dev;
- dev_hold(dev_out);
- res.type = RTN_LOCAL;
- goto select_source;
- }
- /* Not local either.... try sending it to the default router */
- neigh = neigh_clone(dn_db->router);
- BUG_ON(neigh && neigh->dev != dev_out);
-
- /* Ok then, we assume its directly connected and move on */
-select_source:
- if (neigh)
- gateway = ((struct dn_neigh *)neigh)->addr;
- if (gateway == 0)
- gateway = fld.daddr;
- if (fld.saddr == 0) {
- fld.saddr = dnet_select_source(dev_out, gateway,
- res.type == RTN_LOCAL ?
- RT_SCOPE_HOST :
- RT_SCOPE_LINK);
- if (fld.saddr == 0 && res.type != RTN_LOCAL)
- goto e_addr;
- }
- fld.flowidn_oif = dev_out->ifindex;
- goto make_route;
- }
- free_res = 1;
-
- if (res.type == RTN_NAT)
- goto e_inval;
-
- if (res.type == RTN_LOCAL) {
- if (!fld.saddr)
- fld.saddr = fld.daddr;
- if (dev_out)
- dev_put(dev_out);
- dev_out = init_net.loopback_dev;
- dev_hold(dev_out);
- if (!dev_out->dn_ptr)
- goto e_inval;
- fld.flowidn_oif = dev_out->ifindex;
- if (res.fi)
- dn_fib_info_put(res.fi);
- res.fi = NULL;
- goto make_route;
- }
-
- if (res.fi->fib_nhs > 1 && fld.flowidn_oif == 0)
- dn_fib_select_multipath(&fld, &res);
-
- /*
- * We could add some logic to deal with default routes here and
- * get rid of some of the special casing above.
- */
-
- if (!fld.saddr)
- fld.saddr = DN_FIB_RES_PREFSRC(res);
-
- if (dev_out)
- dev_put(dev_out);
- dev_out = DN_FIB_RES_DEV(res);
- dev_hold(dev_out);
- fld.flowidn_oif = dev_out->ifindex;
- gateway = DN_FIB_RES_GW(res);
-
-make_route:
- if (dev_out->flags & IFF_LOOPBACK)
- flags |= RTCF_LOCAL;
-
- rt = dst_alloc(&dn_dst_ops, dev_out, 0, DST_OBSOLETE_NONE, DST_HOST);
- if (rt == NULL)
- goto e_nobufs;
-
- rt->dn_next = NULL;
- memset(&rt->fld, 0, sizeof(rt->fld));
- rt->fld.saddr = oldflp->saddr;
- rt->fld.daddr = oldflp->daddr;
- rt->fld.flowidn_oif = oldflp->flowidn_oif;
- rt->fld.flowidn_iif = 0;
- rt->fld.flowidn_mark = oldflp->flowidn_mark;
-
- rt->rt_saddr = fld.saddr;
- rt->rt_daddr = fld.daddr;
- rt->rt_gateway = gateway ? gateway : fld.daddr;
- rt->rt_local_src = fld.saddr;
-
- rt->rt_dst_map = fld.daddr;
- rt->rt_src_map = fld.saddr;
-
- rt->n = neigh;
- neigh = NULL;
-
- rt->dst.lastuse = jiffies;
- rt->dst.output = dn_output;
- rt->dst.input = dn_rt_bug;
- rt->rt_flags = flags;
- if (flags & RTCF_LOCAL)
- rt->dst.input = dn_nsp_rx;
-
- err = dn_rt_set_next_hop(rt, &res);
- if (err)
- goto e_neighbour;
-
- hash = dn_hash(rt->fld.saddr, rt->fld.daddr);
- /* dn_insert_route() increments dst->__refcnt */
- dn_insert_route(rt, hash, (struct dn_route **)pprt);
-
-done:
- if (neigh)
- neigh_release(neigh);
- if (free_res)
- dn_fib_res_put(&res);
- if (dev_out)
- dev_put(dev_out);
-out:
- return err;
-
-e_addr:
- err = -EADDRNOTAVAIL;
- goto done;
-e_inval:
- err = -EINVAL;
- goto done;
-e_nobufs:
- err = -ENOBUFS;
- goto done;
-e_neighbour:
- dst_release_immediate(&rt->dst);
- goto e_nobufs;
-}
-
-
-/*
- * N.B. The flags may be moved into the flowi at some future stage.
- */
-static int __dn_route_output_key(struct dst_entry **pprt, const struct flowidn *flp, int flags)
-{
- unsigned int hash = dn_hash(flp->saddr, flp->daddr);
- struct dn_route *rt = NULL;
-
- if (!(flags & MSG_TRYHARD)) {
- rcu_read_lock_bh();
- for (rt = rcu_dereference_bh(dn_rt_hash_table[hash].chain); rt;
- rt = rcu_dereference_bh(rt->dn_next)) {
- if ((flp->daddr == rt->fld.daddr) &&
- (flp->saddr == rt->fld.saddr) &&
- (flp->flowidn_mark == rt->fld.flowidn_mark) &&
- dn_is_output_route(rt) &&
- (rt->fld.flowidn_oif == flp->flowidn_oif)) {
- dst_hold_and_use(&rt->dst, jiffies);
- rcu_read_unlock_bh();
- *pprt = &rt->dst;
- return 0;
- }
- }
- rcu_read_unlock_bh();
- }
-
- return dn_route_output_slow(pprt, flp, flags);
-}
-
-static int dn_route_output_key(struct dst_entry **pprt, struct flowidn *flp, int flags)
-{
- int err;
-
- err = __dn_route_output_key(pprt, flp, flags);
- if (err == 0 && flp->flowidn_proto) {
- *pprt = xfrm_lookup(&init_net, *pprt,
- flowidn_to_flowi(flp), NULL, 0);
- if (IS_ERR(*pprt)) {
- err = PTR_ERR(*pprt);
- *pprt = NULL;
- }
- }
- return err;
-}
-
-int dn_route_output_sock(struct dst_entry __rcu **pprt, struct flowidn *fl, struct sock *sk, int flags)
-{
- int err;
-
- err = __dn_route_output_key(pprt, fl, flags & MSG_TRYHARD);
- if (err == 0 && fl->flowidn_proto) {
- *pprt = xfrm_lookup(&init_net, *pprt,
- flowidn_to_flowi(fl), sk, 0);
- if (IS_ERR(*pprt)) {
- err = PTR_ERR(*pprt);
- *pprt = NULL;
- }
- }
- return err;
-}
-
-static int dn_route_input_slow(struct sk_buff *skb)
-{
- struct dn_route *rt = NULL;
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- struct net_device *in_dev = skb->dev;
- struct net_device *out_dev = NULL;
- struct dn_dev *dn_db;
- struct neighbour *neigh = NULL;
- unsigned int hash;
- int flags = 0;
- __le16 gateway = 0;
- __le16 local_src = 0;
- struct flowidn fld = {
- .daddr = cb->dst,
- .saddr = cb->src,
- .flowidn_scope = RT_SCOPE_UNIVERSE,
- .flowidn_mark = skb->mark,
- .flowidn_iif = skb->dev->ifindex,
- };
- struct dn_fib_res res = { .fi = NULL, .type = RTN_UNREACHABLE };
- int err = -EINVAL;
- int free_res = 0;
-
- dev_hold(in_dev);
-
- if ((dn_db = rcu_dereference(in_dev->dn_ptr)) == NULL)
- goto out;
-
- /* Zero source addresses are not allowed */
- if (fld.saddr == 0)
- goto out;
-
- /*
- * In this case we've just received a packet from a source
- * outside ourselves pretending to come from us. We don't
- * allow it any further to prevent routing loops, spoofing and
- * other nasties. Loopback packets already have the dst attached
- * so this only affects packets which have originated elsewhere.
- */
- err = -ENOTUNIQ;
- if (dn_dev_islocal(in_dev, cb->src))
- goto out;
-
- err = dn_fib_lookup(&fld, &res);
- if (err) {
- if (err != -ESRCH)
- goto out;
- /*
- * Is the destination us ?
- */
- if (!dn_dev_islocal(in_dev, cb->dst))
- goto e_inval;
-
- res.type = RTN_LOCAL;
- } else {
- __le16 src_map = fld.saddr;
- free_res = 1;
-
- out_dev = DN_FIB_RES_DEV(res);
- if (out_dev == NULL) {
- net_crit_ratelimited("Bug in dn_route_input_slow() No output device\n");
- goto e_inval;
- }
- dev_hold(out_dev);
-
- if (res.r)
- src_map = fld.saddr; /* no NAT support for now */
-
- gateway = DN_FIB_RES_GW(res);
- if (res.type == RTN_NAT) {
- fld.daddr = dn_fib_rules_map_destination(fld.daddr, &res);
- dn_fib_res_put(&res);
- free_res = 0;
- if (dn_fib_lookup(&fld, &res))
- goto e_inval;
- free_res = 1;
- if (res.type != RTN_UNICAST)
- goto e_inval;
- flags |= RTCF_DNAT;
- gateway = fld.daddr;
- }
- fld.saddr = src_map;
- }
-
- switch(res.type) {
- case RTN_UNICAST:
- /*
- * Forwarding check here, we only check for forwarding
- * being turned off, if you want to only forward intra
- * area, its up to you to set the routing tables up
- * correctly.
- */
- if (dn_db->parms.forwarding == 0)
- goto e_inval;
-
- if (res.fi->fib_nhs > 1 && fld.flowidn_oif == 0)
- dn_fib_select_multipath(&fld, &res);
-
- /*
- * Check for out_dev == in_dev. We use the RTCF_DOREDIRECT
- * flag as a hint to set the intra-ethernet bit when
- * forwarding. If we've got NAT in operation, we don't do
- * this optimisation.
- */
- if (out_dev == in_dev && !(flags & RTCF_NAT))
- flags |= RTCF_DOREDIRECT;
-
- local_src = DN_FIB_RES_PREFSRC(res);
-
- case RTN_BLACKHOLE:
- case RTN_UNREACHABLE:
- break;
- case RTN_LOCAL:
- flags |= RTCF_LOCAL;
- fld.saddr = cb->dst;
- fld.daddr = cb->src;
-
- /* Routing tables gave us a gateway */
- if (gateway)
- goto make_route;
-
- /* Packet was intra-ethernet, so we know its on-link */
- if (cb->rt_flags & DN_RT_F_IE) {
- gateway = cb->src;
- goto make_route;
- }
-
- /* Use the default router if there is one */
- neigh = neigh_clone(dn_db->router);
- if (neigh) {
- gateway = ((struct dn_neigh *)neigh)->addr;
- goto make_route;
- }
-
- /* Close eyes and pray */
- gateway = cb->src;
- goto make_route;
- default:
- goto e_inval;
- }
-
-make_route:
- rt = dst_alloc(&dn_dst_ops, out_dev, 1, DST_OBSOLETE_NONE, DST_HOST);
- if (rt == NULL)
- goto e_nobufs;
-
- rt->dn_next = NULL;
- memset(&rt->fld, 0, sizeof(rt->fld));
- rt->rt_saddr = fld.saddr;
- rt->rt_daddr = fld.daddr;
- rt->rt_gateway = fld.daddr;
- if (gateway)
- rt->rt_gateway = gateway;
- rt->rt_local_src = local_src ? local_src : rt->rt_saddr;
-
- rt->rt_dst_map = fld.daddr;
- rt->rt_src_map = fld.saddr;
-
- rt->fld.saddr = cb->src;
- rt->fld.daddr = cb->dst;
- rt->fld.flowidn_oif = 0;
- rt->fld.flowidn_iif = in_dev->ifindex;
- rt->fld.flowidn_mark = fld.flowidn_mark;
-
- rt->n = neigh;
- rt->dst.lastuse = jiffies;
- rt->dst.output = dn_rt_bug_out;
- switch (res.type) {
- case RTN_UNICAST:
- rt->dst.input = dn_forward;
- break;
- case RTN_LOCAL:
- rt->dst.output = dn_output;
- rt->dst.input = dn_nsp_rx;
- rt->dst.dev = in_dev;
- flags |= RTCF_LOCAL;
- break;
- default:
- case RTN_UNREACHABLE:
- case RTN_BLACKHOLE:
- rt->dst.input = dst_discard;
- }
- rt->rt_flags = flags;
-
- err = dn_rt_set_next_hop(rt, &res);
- if (err)
- goto e_neighbour;
-
- hash = dn_hash(rt->fld.saddr, rt->fld.daddr);
- /* dn_insert_route() increments dst->__refcnt */
- dn_insert_route(rt, hash, &rt);
- skb_dst_set(skb, &rt->dst);
-
-done:
- if (neigh)
- neigh_release(neigh);
- if (free_res)
- dn_fib_res_put(&res);
- dev_put(in_dev);
- if (out_dev)
- dev_put(out_dev);
-out:
- return err;
-
-e_inval:
- err = -EINVAL;
- goto done;
-
-e_nobufs:
- err = -ENOBUFS;
- goto done;
-
-e_neighbour:
- dst_release_immediate(&rt->dst);
- goto done;
-}
-
-static int dn_route_input(struct sk_buff *skb)
-{
- struct dn_route *rt;
- struct dn_skb_cb *cb = DN_SKB_CB(skb);
- unsigned int hash = dn_hash(cb->src, cb->dst);
-
- if (skb_dst(skb))
- return 0;
-
- rcu_read_lock();
- for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL;
- rt = rcu_dereference(rt->dn_next)) {
- if ((rt->fld.saddr == cb->src) &&
- (rt->fld.daddr == cb->dst) &&
- (rt->fld.flowidn_oif == 0) &&
- (rt->fld.flowidn_mark == skb->mark) &&
- (rt->fld.flowidn_iif == cb->iif)) {
- dst_hold_and_use(&rt->dst, jiffies);
- rcu_read_unlock();
- skb_dst_set(skb, (struct dst_entry *)rt);
- return 0;
- }
- }
- rcu_read_unlock();
-
- return dn_route_input_slow(skb);
-}
-
-static int dn_rt_fill_info(struct sk_buff *skb, u32 portid, u32 seq,
- int event, int nowait, unsigned int flags)
-{
- struct dn_route *rt = (struct dn_route *)skb_dst(skb);
- struct rtmsg *r;
- struct nlmsghdr *nlh;
- long expires;
-
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
- if (!nlh)
- return -EMSGSIZE;
-
- r = nlmsg_data(nlh);
- r->rtm_family = AF_DECnet;
- r->rtm_dst_len = 16;
- r->rtm_src_len = 0;
- r->rtm_tos = 0;
- r->rtm_table = RT_TABLE_MAIN;
- r->rtm_type = rt->rt_type;
- r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
- r->rtm_scope = RT_SCOPE_UNIVERSE;
- r->rtm_protocol = RTPROT_UNSPEC;
-
- if (rt->rt_flags & RTCF_NOTIFY)
- r->rtm_flags |= RTM_F_NOTIFY;
-
- if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN) < 0 ||
- nla_put_le16(skb, RTA_DST, rt->rt_daddr) < 0)
- goto errout;
-
- if (rt->fld.saddr) {
- r->rtm_src_len = 16;
- if (nla_put_le16(skb, RTA_SRC, rt->fld.saddr) < 0)
- goto errout;
- }
- if (rt->dst.dev &&
- nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex) < 0)
- goto errout;
-
- /*
- * Note to self - change this if input routes reverse direction when
- * they deal only with inputs and not with replies like they do
- * currently.
- */
- if (nla_put_le16(skb, RTA_PREFSRC, rt->rt_local_src) < 0)
- goto errout;
-
- if (rt->rt_daddr != rt->rt_gateway &&
- nla_put_le16(skb, RTA_GATEWAY, rt->rt_gateway) < 0)
- goto errout;
-
- if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
- goto errout;
-
- expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
- if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires,
- rt->dst.error) < 0)
- goto errout;
-
- if (dn_is_input_route(rt) &&
- nla_put_u32(skb, RTA_IIF, rt->fld.flowidn_iif) < 0)
- goto errout;
-
- nlmsg_end(skb, nlh);
- return 0;
-
-errout:
- nlmsg_cancel(skb, nlh);
- return -EMSGSIZE;
-}
-
-const struct nla_policy rtm_dn_policy[RTA_MAX + 1] = {
- [RTA_DST] = { .type = NLA_U16 },
- [RTA_SRC] = { .type = NLA_U16 },
- [RTA_IIF] = { .type = NLA_U32 },
- [RTA_OIF] = { .type = NLA_U32 },
- [RTA_GATEWAY] = { .type = NLA_U16 },
- [RTA_PRIORITY] = { .type = NLA_U32 },
- [RTA_PREFSRC] = { .type = NLA_U16 },
- [RTA_METRICS] = { .type = NLA_NESTED },
- [RTA_MULTIPATH] = { .type = NLA_NESTED },
- [RTA_TABLE] = { .type = NLA_U32 },
- [RTA_MARK] = { .type = NLA_U32 },
-};
-
-/*
- * This is called by both endnodes and routers now.
- */
-static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
- struct netlink_ext_ack *extack)
-{
- struct net *net = sock_net(in_skb->sk);
- struct rtmsg *rtm = nlmsg_data(nlh);
- struct dn_route *rt = NULL;
- struct dn_skb_cb *cb;
- int err;
- struct sk_buff *skb;
- struct flowidn fld;
- struct nlattr *tb[RTA_MAX+1];
-
- if (!net_eq(net, &init_net))
- return -EINVAL;
-
- err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_dn_policy,
- extack);
- if (err < 0)
- return err;
-
- memset(&fld, 0, sizeof(fld));
- fld.flowidn_proto = DNPROTO_NSP;
-
- skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
- if (skb == NULL)
- return -ENOBUFS;
- skb_reset_mac_header(skb);
- cb = DN_SKB_CB(skb);
-
- if (tb[RTA_SRC])
- fld.saddr = nla_get_le16(tb[RTA_SRC]);
-
- if (tb[RTA_DST])
- fld.daddr = nla_get_le16(tb[RTA_DST]);
-
- if (tb[RTA_IIF])
- fld.flowidn_iif = nla_get_u32(tb[RTA_IIF]);
-
- if (fld.flowidn_iif) {
- struct net_device *dev;
- dev = __dev_get_by_index(&init_net, fld.flowidn_iif);
- if (!dev || !dev->dn_ptr) {
- kfree_skb(skb);
- return -ENODEV;
- }
- skb->protocol = htons(ETH_P_DNA_RT);
- skb->dev = dev;
- cb->src = fld.saddr;
- cb->dst = fld.daddr;
- local_bh_disable();
- err = dn_route_input(skb);
- local_bh_enable();
- memset(cb, 0, sizeof(struct dn_skb_cb));
- rt = (struct dn_route *)skb_dst(skb);
- if (!err && -rt->dst.error)
- err = rt->dst.error;
- } else {
- if (tb[RTA_OIF])
- fld.flowidn_oif = nla_get_u32(tb[RTA_OIF]);
-
- err = dn_route_output_key((struct dst_entry **)&rt, &fld, 0);
- }
-
- skb->dev = NULL;
- if (err)
- goto out_free;
- skb_dst_set(skb, &rt->dst);
- if (rtm->rtm_flags & RTM_F_NOTIFY)
- rt->rt_flags |= RTCF_NOTIFY;
-
- err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0);
- if (err < 0) {
- err = -EMSGSIZE;
- goto out_free;
- }
-
- return rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).portid);
-
-out_free:
- kfree_skb(skb);
- return err;
-}
-
-/*
- * For routers, this is called from dn_fib_dump, but for endnodes its
- * called directly from the rtnetlink dispatch table.
- */
-int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct net *net = sock_net(skb->sk);
- struct dn_route *rt;
- int h, s_h;
- int idx, s_idx;
- struct rtmsg *rtm;
-
- if (!net_eq(net, &init_net))
- return 0;
-
- if (nlmsg_len(cb->nlh) < sizeof(struct rtmsg))
- return -EINVAL;
-
- rtm = nlmsg_data(cb->nlh);
- if (!(rtm->rtm_flags & RTM_F_CLONED))
- return 0;
-
- s_h = cb->args[0];
- s_idx = idx = cb->args[1];
- for(h = 0; h <= dn_rt_hash_mask; h++) {
- if (h < s_h)
- continue;
- if (h > s_h)
- s_idx = 0;
- rcu_read_lock_bh();
- for(rt = rcu_dereference_bh(dn_rt_hash_table[h].chain), idx = 0;
- rt;
- rt = rcu_dereference_bh(rt->dn_next), idx++) {
- if (idx < s_idx)
- continue;
- skb_dst_set(skb, dst_clone(&rt->dst));
- if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, RTM_NEWROUTE,
- 1, NLM_F_MULTI) < 0) {
- skb_dst_drop(skb);
- rcu_read_unlock_bh();
- goto done;
- }
- skb_dst_drop(skb);
- }
- rcu_read_unlock_bh();
- }
-
-done:
- cb->args[0] = h;
- cb->args[1] = idx;
- return skb->len;
-}
-
-#ifdef CONFIG_PROC_FS
-struct dn_rt_cache_iter_state {
- int bucket;
-};
-
-static struct dn_route *dn_rt_cache_get_first(struct seq_file *seq)
-{
- struct dn_route *rt = NULL;
- struct dn_rt_cache_iter_state *s = seq->private;
-
- for(s->bucket = dn_rt_hash_mask; s->bucket >= 0; --s->bucket) {
- rcu_read_lock_bh();
- rt = rcu_dereference_bh(dn_rt_hash_table[s->bucket].chain);
- if (rt)
- break;
- rcu_read_unlock_bh();
- }
- return rt;
-}
-
-static struct dn_route *dn_rt_cache_get_next(struct seq_file *seq, struct dn_route *rt)
-{
- struct dn_rt_cache_iter_state *s = seq->private;
-
- rt = rcu_dereference_bh(rt->dn_next);
- while (!rt) {
- rcu_read_unlock_bh();
- if (--s->bucket < 0)
- break;
- rcu_read_lock_bh();
- rt = rcu_dereference_bh(dn_rt_hash_table[s->bucket].chain);
- }
- return rt;
-}
-
-static void *dn_rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
-{
- struct dn_route *rt = dn_rt_cache_get_first(seq);
-
- if (rt) {
- while(*pos && (rt = dn_rt_cache_get_next(seq, rt)))
- --*pos;
- }
- return *pos ? NULL : rt;
-}
-
-static void *dn_rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- struct dn_route *rt = dn_rt_cache_get_next(seq, v);
- ++*pos;
- return rt;
-}
-
-static void dn_rt_cache_seq_stop(struct seq_file *seq, void *v)
-{
- if (v)
- rcu_read_unlock_bh();
-}
-
-static int dn_rt_cache_seq_show(struct seq_file *seq, void *v)
-{
- struct dn_route *rt = v;
- char buf1[DN_ASCBUF_LEN], buf2[DN_ASCBUF_LEN];
-
- seq_printf(seq, "%-8s %-7s %-7s %04d %04d %04d\n",
- rt->dst.dev ? rt->dst.dev->name : "*",
- dn_addr2asc(le16_to_cpu(rt->rt_daddr), buf1),
- dn_addr2asc(le16_to_cpu(rt->rt_saddr), buf2),
- atomic_read(&rt->dst.__refcnt),
- rt->dst.__use, 0);
- return 0;
-}
-
-static const struct seq_operations dn_rt_cache_seq_ops = {
- .start = dn_rt_cache_seq_start,
- .next = dn_rt_cache_seq_next,
- .stop = dn_rt_cache_seq_stop,
- .show = dn_rt_cache_seq_show,
-};
-#endif /* CONFIG_PROC_FS */
-
-void __init dn_route_init(void)
-{
- int i, goal, order;
-
- dn_dst_ops.kmem_cachep =
- kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
- dst_entries_init(&dn_dst_ops);
- timer_setup(&dn_route_timer, dn_dst_check_expire, 0);
- dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ;
- add_timer(&dn_route_timer);
-
- goal = totalram_pages >> (26 - PAGE_SHIFT);
-
- for(order = 0; (1UL << order) < goal; order++)
- /* NOTHING */;
-
- /*
- * Only want 1024 entries max, since the table is very, very unlikely
- * to be larger than that.
- */
- while(order && ((((1UL << order) * PAGE_SIZE) /
- sizeof(struct dn_rt_hash_bucket)) >= 2048))
- order--;
-
- do {
- dn_rt_hash_mask = (1UL << order) * PAGE_SIZE /
- sizeof(struct dn_rt_hash_bucket);
- while(dn_rt_hash_mask & (dn_rt_hash_mask - 1))
- dn_rt_hash_mask--;
- dn_rt_hash_table = (struct dn_rt_hash_bucket *)
- __get_free_pages(GFP_ATOMIC, order);
- } while (dn_rt_hash_table == NULL && --order > 0);
-
- if (!dn_rt_hash_table)
- panic("Failed to allocate DECnet route cache hash table\n");
-
- printk(KERN_INFO
- "DECnet: Routing cache hash table of %u buckets, %ldKbytes\n",
- dn_rt_hash_mask,
- (long)(dn_rt_hash_mask*sizeof(struct dn_rt_hash_bucket))/1024);
-
- dn_rt_hash_mask--;
- for(i = 0; i <= dn_rt_hash_mask; i++) {
- spin_lock_init(&dn_rt_hash_table[i].lock);
- dn_rt_hash_table[i].chain = NULL;
- }
-
- dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1);
-
- proc_create_seq_private("decnet_cache", 0444, init_net.proc_net,
- &dn_rt_cache_seq_ops,
- sizeof(struct dn_rt_cache_iter_state), NULL);
-
-#ifdef CONFIG_DECNET_ROUTER
- rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE,
- dn_cache_getroute, dn_fib_dump, 0);
-#else
- rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_GETROUTE,
- dn_cache_getroute, dn_cache_dump, 0);
-#endif
-}
-
-void __exit dn_route_cleanup(void)
-{
- del_timer(&dn_route_timer);
- dn_run_flush(NULL);
-
- remove_proc_entry("decnet_cache", init_net.proc_net);
- dst_entries_destroy(&dn_dst_ops);
-}
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
deleted file mode 100644
index 4a4e3c17740c..000000000000
--- a/net/decnet/dn_rules.c
+++ /dev/null
@@ -1,258 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Routing Forwarding Information Base (Rules)
- *
- * Author: Steve Whitehouse <SteveW@ACM.org>
- * Mostly copied from Alexey Kuznetsov's ipv4/fib_rules.c
- *
- *
- * Changes:
- * Steve Whitehouse <steve@chygwyn.com>
- * Updated for Thomas Graf's generic rules
- *
- */
-#include <linux/net.h>
-#include <linux/init.h>
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
-#include <linux/netdevice.h>
-#include <linux/spinlock.h>
-#include <linux/list.h>
-#include <linux/rcupdate.h>
-#include <linux/export.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/flow.h>
-#include <net/fib_rules.h>
-#include <net/dn.h>
-#include <net/dn_fib.h>
-#include <net/dn_neigh.h>
-#include <net/dn_dev.h>
-#include <net/dn_route.h>
-
-static struct fib_rules_ops *dn_fib_rules_ops;
-
-struct dn_fib_rule
-{
- struct fib_rule common;
- unsigned char dst_len;
- unsigned char src_len;
- __le16 src;
- __le16 srcmask;
- __le16 dst;
- __le16 dstmask;
- __le16 srcmap;
- u8 flags;
-};
-
-
-int dn_fib_lookup(struct flowidn *flp, struct dn_fib_res *res)
-{
- struct fib_lookup_arg arg = {
- .result = res,
- };
- int err;
-
- err = fib_rules_lookup(dn_fib_rules_ops,
- flowidn_to_flowi(flp), 0, &arg);
- res->r = arg.rule;
-
- return err;
-}
-
-static int dn_fib_rule_action(struct fib_rule *rule, struct flowi *flp,
- int flags, struct fib_lookup_arg *arg)
-{
- struct flowidn *fld = &flp->u.dn;
- int err = -EAGAIN;
- struct dn_fib_table *tbl;
-
- switch(rule->action) {
- case FR_ACT_TO_TBL:
- break;
-
- case FR_ACT_UNREACHABLE:
- err = -ENETUNREACH;
- goto errout;
-
- case FR_ACT_PROHIBIT:
- err = -EACCES;
- goto errout;
-
- case FR_ACT_BLACKHOLE:
- default:
- err = -EINVAL;
- goto errout;
- }
-
- tbl = dn_fib_get_table(rule->table, 0);
- if (tbl == NULL)
- goto errout;
-
- err = tbl->lookup(tbl, fld, (struct dn_fib_res *)arg->result);
- if (err > 0)
- err = -EAGAIN;
-errout:
- return err;
-}
-
-static const struct nla_policy dn_fib_rule_policy[FRA_MAX+1] = {
- FRA_GENERIC_POLICY,
-};
-
-static int dn_fib_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
-{
- struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
- struct flowidn *fld = &fl->u.dn;
- __le16 daddr = fld->daddr;
- __le16 saddr = fld->saddr;
-
- if (((saddr ^ r->src) & r->srcmask) ||
- ((daddr ^ r->dst) & r->dstmask))
- return 0;
-
- return 1;
-}
-
-static int dn_fib_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
- struct fib_rule_hdr *frh,
- struct nlattr **tb,
- struct netlink_ext_ack *extack)
-{
- int err = -EINVAL;
- struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
-
- if (frh->tos) {
- NL_SET_ERR_MSG(extack, "Invalid tos value");
- goto errout;
- }
-
- if (rule->table == RT_TABLE_UNSPEC) {
- if (rule->action == FR_ACT_TO_TBL) {
- struct dn_fib_table *table;
-
- table = dn_fib_empty_table();
- if (table == NULL) {
- err = -ENOBUFS;
- goto errout;
- }
-
- rule->table = table->n;
- }
- }
-
- if (frh->src_len)
- r->src = nla_get_le16(tb[FRA_SRC]);
-
- if (frh->dst_len)
- r->dst = nla_get_le16(tb[FRA_DST]);
-
- r->src_len = frh->src_len;
- r->srcmask = dnet_make_mask(r->src_len);
- r->dst_len = frh->dst_len;
- r->dstmask = dnet_make_mask(r->dst_len);
- err = 0;
-errout:
- return err;
-}
-
-static int dn_fib_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
- struct nlattr **tb)
-{
- struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
-
- if (frh->src_len && (r->src_len != frh->src_len))
- return 0;
-
- if (frh->dst_len && (r->dst_len != frh->dst_len))
- return 0;
-
- if (frh->src_len && (r->src != nla_get_le16(tb[FRA_SRC])))
- return 0;
-
- if (frh->dst_len && (r->dst != nla_get_le16(tb[FRA_DST])))
- return 0;
-
- return 1;
-}
-
-unsigned int dnet_addr_type(__le16 addr)
-{
- struct flowidn fld = { .daddr = addr };
- struct dn_fib_res res;
- unsigned int ret = RTN_UNICAST;
- struct dn_fib_table *tb = dn_fib_get_table(RT_TABLE_LOCAL, 0);
-
- res.r = NULL;
-
- if (tb) {
- if (!tb->lookup(tb, &fld, &res)) {
- ret = res.type;
- dn_fib_res_put(&res);
- }
- }
- return ret;
-}
-
-static int dn_fib_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
- struct fib_rule_hdr *frh)
-{
- struct dn_fib_rule *r = (struct dn_fib_rule *)rule;
-
- frh->dst_len = r->dst_len;
- frh->src_len = r->src_len;
- frh->tos = 0;
-
- if ((r->dst_len &&
- nla_put_le16(skb, FRA_DST, r->dst)) ||
- (r->src_len &&
- nla_put_le16(skb, FRA_SRC, r->src)))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return -ENOBUFS;
-}
-
-static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops)
-{
- dn_rt_cache_flush(-1);
-}
-
-static const struct fib_rules_ops __net_initconst dn_fib_rules_ops_template = {
- .family = AF_DECnet,
- .rule_size = sizeof(struct dn_fib_rule),
- .addr_size = sizeof(u16),
- .action = dn_fib_rule_action,
- .match = dn_fib_rule_match,
- .configure = dn_fib_rule_configure,
- .compare = dn_fib_rule_compare,
- .fill = dn_fib_rule_fill,
- .flush_cache = dn_fib_rule_flush_cache,
- .nlgroup = RTNLGRP_DECnet_RULE,
- .policy = dn_fib_rule_policy,
- .owner = THIS_MODULE,
- .fro_net = &init_net,
-};
-
-void __init dn_fib_rules_init(void)
-{
- dn_fib_rules_ops =
- fib_rules_register(&dn_fib_rules_ops_template, &init_net);
- BUG_ON(IS_ERR(dn_fib_rules_ops));
- BUG_ON(fib_default_rule_add(dn_fib_rules_ops, 0x7fff,
- RT_TABLE_MAIN, 0));
-}
-
-void __exit dn_fib_rules_cleanup(void)
-{
- rtnl_lock();
- fib_rules_unregister(dn_fib_rules_ops);
- rtnl_unlock();
- rcu_barrier();
-}
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
deleted file mode 100644
index f0710b5d037d..000000000000
--- a/net/decnet/dn_table.c
+++ /dev/null
@@ -1,928 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Routing Forwarding Information Base (Routing Tables)
- *
- * Author: Steve Whitehouse <SteveW@ACM.org>
- * Mostly copied from the IPv4 routing code
- *
- *
- * Changes:
- *
- */
-#include <linux/string.h>
-#include <linux/net.h>
-#include <linux/socket.h>
-#include <linux/slab.h>
-#include <linux/sockios.h>
-#include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/rtnetlink.h>
-#include <linux/proc_fs.h>
-#include <linux/netdevice.h>
-#include <linux/timer.h>
-#include <linux/spinlock.h>
-#include <linux/atomic.h>
-#include <linux/uaccess.h>
-#include <linux/route.h> /* RTF_xxx */
-#include <net/neighbour.h>
-#include <net/netlink.h>
-#include <net/tcp.h>
-#include <net/dst.h>
-#include <net/flow.h>
-#include <net/fib_rules.h>
-#include <net/dn.h>
-#include <net/dn_route.h>
-#include <net/dn_fib.h>
-#include <net/dn_neigh.h>
-#include <net/dn_dev.h>
-
-struct dn_zone
-{
- struct dn_zone *dz_next;
- struct dn_fib_node **dz_hash;
- int dz_nent;
- int dz_divisor;
- u32 dz_hashmask;
-#define DZ_HASHMASK(dz) ((dz)->dz_hashmask)
- int dz_order;
- __le16 dz_mask;
-#define DZ_MASK(dz) ((dz)->dz_mask)
-};
-
-struct dn_hash
-{
- struct dn_zone *dh_zones[17];
- struct dn_zone *dh_zone_list;
-};
-
-#define dz_key_0(key) ((key).datum = 0)
-
-#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\
- for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
-
-#define endfor_nexthops(fi) }
-
-#define DN_MAX_DIVISOR 1024
-#define DN_S_ZOMBIE 1
-#define DN_S_ACCESSED 2
-
-#define DN_FIB_SCAN(f, fp) \
-for( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next)
-
-#define DN_FIB_SCAN_KEY(f, fp, key) \
-for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_next)
-
-#define RT_TABLE_MIN 1
-#define DN_FIB_TABLE_HASHSZ 256
-static struct hlist_head dn_fib_table_hash[DN_FIB_TABLE_HASHSZ];
-static DEFINE_RWLOCK(dn_fib_tables_lock);
-
-static struct kmem_cache *dn_hash_kmem __read_mostly;
-static int dn_fib_hash_zombies;
-
-static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz)
-{
- u16 h = le16_to_cpu(key.datum)>>(16 - dz->dz_order);
- h ^= (h >> 10);
- h ^= (h >> 6);
- h &= DZ_HASHMASK(dz);
- return *(dn_fib_idx_t *)&h;
-}
-
-static inline dn_fib_key_t dz_key(__le16 dst, struct dn_zone *dz)
-{
- dn_fib_key_t k;
- k.datum = dst & DZ_MASK(dz);
- return k;
-}
-
-static inline struct dn_fib_node **dn_chain_p(dn_fib_key_t key, struct dn_zone *dz)
-{
- return &dz->dz_hash[dn_hash(key, dz).datum];
-}
-
-static inline struct dn_fib_node *dz_chain(dn_fib_key_t key, struct dn_zone *dz)
-{
- return dz->dz_hash[dn_hash(key, dz).datum];
-}
-
-static inline int dn_key_eq(dn_fib_key_t a, dn_fib_key_t b)
-{
- return a.datum == b.datum;
-}
-
-static inline int dn_key_leq(dn_fib_key_t a, dn_fib_key_t b)
-{
- return a.datum <= b.datum;
-}
-
-static inline void dn_rebuild_zone(struct dn_zone *dz,
- struct dn_fib_node **old_ht,
- int old_divisor)
-{
- struct dn_fib_node *f, **fp, *next;
- int i;
-
- for(i = 0; i < old_divisor; i++) {
- for(f = old_ht[i]; f; f = next) {
- next = f->fn_next;
- for(fp = dn_chain_p(f->fn_key, dz);
- *fp && dn_key_leq((*fp)->fn_key, f->fn_key);
- fp = &(*fp)->fn_next)
- /* NOTHING */;
- f->fn_next = *fp;
- *fp = f;
- }
- }
-}
-
-static void dn_rehash_zone(struct dn_zone *dz)
-{
- struct dn_fib_node **ht, **old_ht;
- int old_divisor, new_divisor;
- u32 new_hashmask;
-
- old_divisor = dz->dz_divisor;
-
- switch (old_divisor) {
- case 16:
- new_divisor = 256;
- new_hashmask = 0xFF;
- break;
- default:
- printk(KERN_DEBUG "DECnet: dn_rehash_zone: BUG! %d\n",
- old_divisor);
- /* fall through */
- case 256:
- new_divisor = 1024;
- new_hashmask = 0x3FF;
- break;
- }
-
- ht = kcalloc(new_divisor, sizeof(struct dn_fib_node*), GFP_KERNEL);
- if (ht == NULL)
- return;
-
- write_lock_bh(&dn_fib_tables_lock);
- old_ht = dz->dz_hash;
- dz->dz_hash = ht;
- dz->dz_hashmask = new_hashmask;
- dz->dz_divisor = new_divisor;
- dn_rebuild_zone(dz, old_ht, old_divisor);
- write_unlock_bh(&dn_fib_tables_lock);
- kfree(old_ht);
-}
-
-static void dn_free_node(struct dn_fib_node *f)
-{
- dn_fib_release_info(DN_FIB_INFO(f));
- kmem_cache_free(dn_hash_kmem, f);
-}
-
-
-static struct dn_zone *dn_new_zone(struct dn_hash *table, int z)
-{
- int i;
- struct dn_zone *dz = kzalloc(sizeof(struct dn_zone), GFP_KERNEL);
- if (!dz)
- return NULL;
-
- if (z) {
- dz->dz_divisor = 16;
- dz->dz_hashmask = 0x0F;
- } else {
- dz->dz_divisor = 1;
- dz->dz_hashmask = 0;
- }
-
- dz->dz_hash = kcalloc(dz->dz_divisor, sizeof(struct dn_fib_node *), GFP_KERNEL);
- if (!dz->dz_hash) {
- kfree(dz);
- return NULL;
- }
-
- dz->dz_order = z;
- dz->dz_mask = dnet_make_mask(z);
-
- for(i = z + 1; i <= 16; i++)
- if (table->dh_zones[i])
- break;
-
- write_lock_bh(&dn_fib_tables_lock);
- if (i>16) {
- dz->dz_next = table->dh_zone_list;
- table->dh_zone_list = dz;
- } else {
- dz->dz_next = table->dh_zones[i]->dz_next;
- table->dh_zones[i]->dz_next = dz;
- }
- table->dh_zones[z] = dz;
- write_unlock_bh(&dn_fib_tables_lock);
- return dz;
-}
-
-
-static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct nlattr *attrs[], struct dn_fib_info *fi)
-{
- struct rtnexthop *nhp;
- int nhlen;
-
- if (attrs[RTA_PRIORITY] &&
- nla_get_u32(attrs[RTA_PRIORITY]) != fi->fib_priority)
- return 1;
-
- if (attrs[RTA_OIF] || attrs[RTA_GATEWAY]) {
- if ((!attrs[RTA_OIF] || nla_get_u32(attrs[RTA_OIF]) == fi->fib_nh->nh_oif) &&
- (!attrs[RTA_GATEWAY] || nla_get_le16(attrs[RTA_GATEWAY]) != fi->fib_nh->nh_gw))
- return 0;
- return 1;
- }
-
- if (!attrs[RTA_MULTIPATH])
- return 0;
-
- nhp = nla_data(attrs[RTA_MULTIPATH]);
- nhlen = nla_len(attrs[RTA_MULTIPATH]);
-
- for_nexthops(fi) {
- int attrlen = nhlen - sizeof(struct rtnexthop);
- __le16 gw;
-
- if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
- return -EINVAL;
- if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
- return 1;
- if (attrlen) {
- struct nlattr *gw_attr;
-
- gw_attr = nla_find((struct nlattr *) (nhp + 1), attrlen, RTA_GATEWAY);
- gw = gw_attr ? nla_get_le16(gw_attr) : 0;
-
- if (gw && gw != nh->nh_gw)
- return 1;
- }
- nhp = RTNH_NEXT(nhp);
- } endfor_nexthops(fi);
-
- return 0;
-}
-
-static inline size_t dn_fib_nlmsg_size(struct dn_fib_info *fi)
-{
- size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
- + nla_total_size(4) /* RTA_TABLE */
- + nla_total_size(2) /* RTA_DST */
- + nla_total_size(4) /* RTA_PRIORITY */
- + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
-
- /* space for nested metrics */
- payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
-
- if (fi->fib_nhs) {
- /* Also handles the special case fib_nhs == 1 */
-
- /* each nexthop is packed in an attribute */
- size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
-
- /* may contain a gateway attribute */
- nhsize += nla_total_size(4);
-
- /* all nexthops are packed in a nested attribute */
- payload += nla_total_size(fi->fib_nhs * nhsize);
- }
-
- return payload;
-}
-
-static int dn_fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
- u32 tb_id, u8 type, u8 scope, void *dst, int dst_len,
- struct dn_fib_info *fi, unsigned int flags)
-{
- struct rtmsg *rtm;
- struct nlmsghdr *nlh;
-
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
- if (!nlh)
- return -EMSGSIZE;
-
- rtm = nlmsg_data(nlh);
- rtm->rtm_family = AF_DECnet;
- rtm->rtm_dst_len = dst_len;
- rtm->rtm_src_len = 0;
- rtm->rtm_tos = 0;
- rtm->rtm_table = tb_id;
- rtm->rtm_flags = fi->fib_flags;
- rtm->rtm_scope = scope;
- rtm->rtm_type = type;
- rtm->rtm_protocol = fi->fib_protocol;
-
- if (nla_put_u32(skb, RTA_TABLE, tb_id) < 0)
- goto errout;
-
- if (rtm->rtm_dst_len &&
- nla_put(skb, RTA_DST, 2, dst) < 0)
- goto errout;
-
- if (fi->fib_priority &&
- nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority) < 0)
- goto errout;
-
- if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
- goto errout;
-
- if (fi->fib_nhs == 1) {
- if (fi->fib_nh->nh_gw &&
- nla_put_le16(skb, RTA_GATEWAY, fi->fib_nh->nh_gw) < 0)
- goto errout;
-
- if (fi->fib_nh->nh_oif &&
- nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif) < 0)
- goto errout;
- }
-
- if (fi->fib_nhs > 1) {
- struct rtnexthop *nhp;
- struct nlattr *mp_head;
-
- if (!(mp_head = nla_nest_start(skb, RTA_MULTIPATH)))
- goto errout;
-
- for_nexthops(fi) {
- if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp))))
- goto errout;
-
- nhp->rtnh_flags = nh->nh_flags & 0xFF;
- nhp->rtnh_hops = nh->nh_weight - 1;
- nhp->rtnh_ifindex = nh->nh_oif;
-
- if (nh->nh_gw &&
- nla_put_le16(skb, RTA_GATEWAY, nh->nh_gw) < 0)
- goto errout;
-
- nhp->rtnh_len = skb_tail_pointer(skb) - (unsigned char *)nhp;
- } endfor_nexthops(fi);
-
- nla_nest_end(skb, mp_head);
- }
-
- nlmsg_end(skb, nlh);
- return 0;
-
-errout:
- nlmsg_cancel(skb, nlh);
- return -EMSGSIZE;
-}
-
-
-static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, u32 tb_id,
- struct nlmsghdr *nlh, struct netlink_skb_parms *req)
-{
- struct sk_buff *skb;
- u32 portid = req ? req->portid : 0;
- int err = -ENOBUFS;
-
- skb = nlmsg_new(dn_fib_nlmsg_size(DN_FIB_INFO(f)), GFP_KERNEL);
- if (skb == NULL)
- goto errout;
-
- err = dn_fib_dump_info(skb, portid, nlh->nlmsg_seq, event, tb_id,
- f->fn_type, f->fn_scope, &f->fn_key, z,
- DN_FIB_INFO(f), 0);
- if (err < 0) {
- /* -EMSGSIZE implies BUG in dn_fib_nlmsg_size() */
- WARN_ON(err == -EMSGSIZE);
- kfree_skb(skb);
- goto errout;
- }
- rtnl_notify(skb, &init_net, portid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL);
- return;
-errout:
- if (err < 0)
- rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_ROUTE, err);
-}
-
-static __inline__ int dn_hash_dump_bucket(struct sk_buff *skb,
- struct netlink_callback *cb,
- struct dn_fib_table *tb,
- struct dn_zone *dz,
- struct dn_fib_node *f)
-{
- int i, s_i;
-
- s_i = cb->args[4];
- for(i = 0; f; i++, f = f->fn_next) {
- if (i < s_i)
- continue;
- if (f->fn_state & DN_S_ZOMBIE)
- continue;
- if (dn_fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWROUTE,
- tb->n,
- (f->fn_state & DN_S_ZOMBIE) ? 0 : f->fn_type,
- f->fn_scope, &f->fn_key, dz->dz_order,
- f->fn_info, NLM_F_MULTI) < 0) {
- cb->args[4] = i;
- return -1;
- }
- }
- cb->args[4] = i;
- return skb->len;
-}
-
-static __inline__ int dn_hash_dump_zone(struct sk_buff *skb,
- struct netlink_callback *cb,
- struct dn_fib_table *tb,
- struct dn_zone *dz)
-{
- int h, s_h;
-
- s_h = cb->args[3];
- for(h = 0; h < dz->dz_divisor; h++) {
- if (h < s_h)
- continue;
- if (h > s_h)
- memset(&cb->args[4], 0, sizeof(cb->args) - 4*sizeof(cb->args[0]));
- if (dz->dz_hash == NULL || dz->dz_hash[h] == NULL)
- continue;
- if (dn_hash_dump_bucket(skb, cb, tb, dz, dz->dz_hash[h]) < 0) {
- cb->args[3] = h;
- return -1;
- }
- }
- cb->args[3] = h;
- return skb->len;
-}
-
-static int dn_fib_table_dump(struct dn_fib_table *tb, struct sk_buff *skb,
- struct netlink_callback *cb)
-{
- int m, s_m;
- struct dn_zone *dz;
- struct dn_hash *table = (struct dn_hash *)tb->data;
-
- s_m = cb->args[2];
- read_lock(&dn_fib_tables_lock);
- for(dz = table->dh_zone_list, m = 0; dz; dz = dz->dz_next, m++) {
- if (m < s_m)
- continue;
- if (m > s_m)
- memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0]));
-
- if (dn_hash_dump_zone(skb, cb, tb, dz) < 0) {
- cb->args[2] = m;
- read_unlock(&dn_fib_tables_lock);
- return -1;
- }
- }
- read_unlock(&dn_fib_tables_lock);
- cb->args[2] = m;
-
- return skb->len;
-}
-
-int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct net *net = sock_net(skb->sk);
- unsigned int h, s_h;
- unsigned int e = 0, s_e;
- struct dn_fib_table *tb;
- int dumped = 0;
-
- if (!net_eq(net, &init_net))
- return 0;
-
- if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
- ((struct rtmsg *)nlmsg_data(cb->nlh))->rtm_flags&RTM_F_CLONED)
- return dn_cache_dump(skb, cb);
-
- s_h = cb->args[0];
- s_e = cb->args[1];
-
- for (h = s_h; h < DN_FIB_TABLE_HASHSZ; h++, s_h = 0) {
- e = 0;
- hlist_for_each_entry(tb, &dn_fib_table_hash[h], hlist) {
- if (e < s_e)
- goto next;
- if (dumped)
- memset(&cb->args[2], 0, sizeof(cb->args) -
- 2 * sizeof(cb->args[0]));
- if (tb->dump(tb, skb, cb) < 0)
- goto out;
- dumped = 1;
-next:
- e++;
- }
- }
-out:
- cb->args[1] = e;
- cb->args[0] = h;
-
- return skb->len;
-}
-
-static int dn_fib_table_insert(struct dn_fib_table *tb, struct rtmsg *r, struct nlattr *attrs[],
- struct nlmsghdr *n, struct netlink_skb_parms *req)
-{
- struct dn_hash *table = (struct dn_hash *)tb->data;
- struct dn_fib_node *new_f, *f, **fp, **del_fp;
- struct dn_zone *dz;
- struct dn_fib_info *fi;
- int z = r->rtm_dst_len;
- int type = r->rtm_type;
- dn_fib_key_t key;
- int err;
-
- if (z > 16)
- return -EINVAL;
-
- dz = table->dh_zones[z];
- if (!dz && !(dz = dn_new_zone(table, z)))
- return -ENOBUFS;
-
- dz_key_0(key);
- if (attrs[RTA_DST]) {
- __le16 dst = nla_get_le16(attrs[RTA_DST]);
- if (dst & ~DZ_MASK(dz))
- return -EINVAL;
- key = dz_key(dst, dz);
- }
-
- if ((fi = dn_fib_create_info(r, attrs, n, &err)) == NULL)
- return err;
-
- if (dz->dz_nent > (dz->dz_divisor << 2) &&
- dz->dz_divisor > DN_MAX_DIVISOR &&
- (z==16 || (1<<z) > dz->dz_divisor))
- dn_rehash_zone(dz);
-
- fp = dn_chain_p(key, dz);
-
- DN_FIB_SCAN(f, fp) {
- if (dn_key_leq(key, f->fn_key))
- break;
- }
-
- del_fp = NULL;
-
- if (f && (f->fn_state & DN_S_ZOMBIE) &&
- dn_key_eq(f->fn_key, key)) {
- del_fp = fp;
- fp = &f->fn_next;
- f = *fp;
- goto create;
- }
-
- DN_FIB_SCAN_KEY(f, fp, key) {
- if (fi->fib_priority <= DN_FIB_INFO(f)->fib_priority)
- break;
- }
-
- if (f && dn_key_eq(f->fn_key, key) &&
- fi->fib_priority == DN_FIB_INFO(f)->fib_priority) {
- struct dn_fib_node **ins_fp;
-
- err = -EEXIST;
- if (n->nlmsg_flags & NLM_F_EXCL)
- goto out;
-
- if (n->nlmsg_flags & NLM_F_REPLACE) {
- del_fp = fp;
- fp = &f->fn_next;
- f = *fp;
- goto replace;
- }
-
- ins_fp = fp;
- err = -EEXIST;
-
- DN_FIB_SCAN_KEY(f, fp, key) {
- if (fi->fib_priority != DN_FIB_INFO(f)->fib_priority)
- break;
- if (f->fn_type == type &&
- f->fn_scope == r->rtm_scope &&
- DN_FIB_INFO(f) == fi)
- goto out;
- }
-
- if (!(n->nlmsg_flags & NLM_F_APPEND)) {
- fp = ins_fp;
- f = *fp;
- }
- }
-
-create:
- err = -ENOENT;
- if (!(n->nlmsg_flags & NLM_F_CREATE))
- goto out;
-
-replace:
- err = -ENOBUFS;
- new_f = kmem_cache_zalloc(dn_hash_kmem, GFP_KERNEL);
- if (new_f == NULL)
- goto out;
-
- new_f->fn_key = key;
- new_f->fn_type = type;
- new_f->fn_scope = r->rtm_scope;
- DN_FIB_INFO(new_f) = fi;
-
- new_f->fn_next = f;
- write_lock_bh(&dn_fib_tables_lock);
- *fp = new_f;
- write_unlock_bh(&dn_fib_tables_lock);
- dz->dz_nent++;
-
- if (del_fp) {
- f = *del_fp;
- write_lock_bh(&dn_fib_tables_lock);
- *del_fp = f->fn_next;
- write_unlock_bh(&dn_fib_tables_lock);
-
- if (!(f->fn_state & DN_S_ZOMBIE))
- dn_rtmsg_fib(RTM_DELROUTE, f, z, tb->n, n, req);
- if (f->fn_state & DN_S_ACCESSED)
- dn_rt_cache_flush(-1);
- dn_free_node(f);
- dz->dz_nent--;
- } else {
- dn_rt_cache_flush(-1);
- }
-
- dn_rtmsg_fib(RTM_NEWROUTE, new_f, z, tb->n, n, req);
-
- return 0;
-out:
- dn_fib_release_info(fi);
- return err;
-}
-
-
-static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct nlattr *attrs[],
- struct nlmsghdr *n, struct netlink_skb_parms *req)
-{
- struct dn_hash *table = (struct dn_hash*)tb->data;
- struct dn_fib_node **fp, **del_fp, *f;
- int z = r->rtm_dst_len;
- struct dn_zone *dz;
- dn_fib_key_t key;
- int matched;
-
-
- if (z > 16)
- return -EINVAL;
-
- if ((dz = table->dh_zones[z]) == NULL)
- return -ESRCH;
-
- dz_key_0(key);
- if (attrs[RTA_DST]) {
- __le16 dst = nla_get_le16(attrs[RTA_DST]);
- if (dst & ~DZ_MASK(dz))
- return -EINVAL;
- key = dz_key(dst, dz);
- }
-
- fp = dn_chain_p(key, dz);
-
- DN_FIB_SCAN(f, fp) {
- if (dn_key_eq(f->fn_key, key))
- break;
- if (dn_key_leq(key, f->fn_key))
- return -ESRCH;
- }
-
- matched = 0;
- del_fp = NULL;
- DN_FIB_SCAN_KEY(f, fp, key) {
- struct dn_fib_info *fi = DN_FIB_INFO(f);
-
- if (f->fn_state & DN_S_ZOMBIE)
- return -ESRCH;
-
- matched++;
-
- if (del_fp == NULL &&
- (!r->rtm_type || f->fn_type == r->rtm_type) &&
- (r->rtm_scope == RT_SCOPE_NOWHERE || f->fn_scope == r->rtm_scope) &&
- (!r->rtm_protocol ||
- fi->fib_protocol == r->rtm_protocol) &&
- dn_fib_nh_match(r, n, attrs, fi) == 0)
- del_fp = fp;
- }
-
- if (del_fp) {
- f = *del_fp;
- dn_rtmsg_fib(RTM_DELROUTE, f, z, tb->n, n, req);
-
- if (matched != 1) {
- write_lock_bh(&dn_fib_tables_lock);
- *del_fp = f->fn_next;
- write_unlock_bh(&dn_fib_tables_lock);
-
- if (f->fn_state & DN_S_ACCESSED)
- dn_rt_cache_flush(-1);
- dn_free_node(f);
- dz->dz_nent--;
- } else {
- f->fn_state |= DN_S_ZOMBIE;
- if (f->fn_state & DN_S_ACCESSED) {
- f->fn_state &= ~DN_S_ACCESSED;
- dn_rt_cache_flush(-1);
- }
- if (++dn_fib_hash_zombies > 128)
- dn_fib_flush();
- }
-
- return 0;
- }
-
- return -ESRCH;
-}
-
-static inline int dn_flush_list(struct dn_fib_node **fp, int z, struct dn_hash *table)
-{
- int found = 0;
- struct dn_fib_node *f;
-
- while((f = *fp) != NULL) {
- struct dn_fib_info *fi = DN_FIB_INFO(f);
-
- if (fi && ((f->fn_state & DN_S_ZOMBIE) || (fi->fib_flags & RTNH_F_DEAD))) {
- write_lock_bh(&dn_fib_tables_lock);
- *fp = f->fn_next;
- write_unlock_bh(&dn_fib_tables_lock);
-
- dn_free_node(f);
- found++;
- continue;
- }
- fp = &f->fn_next;
- }
-
- return found;
-}
-
-static int dn_fib_table_flush(struct dn_fib_table *tb)
-{
- struct dn_hash *table = (struct dn_hash *)tb->data;
- struct dn_zone *dz;
- int found = 0;
-
- dn_fib_hash_zombies = 0;
- for(dz = table->dh_zone_list; dz; dz = dz->dz_next) {
- int i;
- int tmp = 0;
- for(i = dz->dz_divisor-1; i >= 0; i--)
- tmp += dn_flush_list(&dz->dz_hash[i], dz->dz_order, table);
- dz->dz_nent -= tmp;
- found += tmp;
- }
-
- return found;
-}
-
-static int dn_fib_table_lookup(struct dn_fib_table *tb, const struct flowidn *flp, struct dn_fib_res *res)
-{
- int err;
- struct dn_zone *dz;
- struct dn_hash *t = (struct dn_hash *)tb->data;
-
- read_lock(&dn_fib_tables_lock);
- for(dz = t->dh_zone_list; dz; dz = dz->dz_next) {
- struct dn_fib_node *f;
- dn_fib_key_t k = dz_key(flp->daddr, dz);
-
- for(f = dz_chain(k, dz); f; f = f->fn_next) {
- if (!dn_key_eq(k, f->fn_key)) {
- if (dn_key_leq(k, f->fn_key))
- break;
- else
- continue;
- }
-
- f->fn_state |= DN_S_ACCESSED;
-
- if (f->fn_state&DN_S_ZOMBIE)
- continue;
-
- if (f->fn_scope < flp->flowidn_scope)
- continue;
-
- err = dn_fib_semantic_match(f->fn_type, DN_FIB_INFO(f), flp, res);
-
- if (err == 0) {
- res->type = f->fn_type;
- res->scope = f->fn_scope;
- res->prefixlen = dz->dz_order;
- goto out;
- }
- if (err < 0)
- goto out;
- }
- }
- err = 1;
-out:
- read_unlock(&dn_fib_tables_lock);
- return err;
-}
-
-
-struct dn_fib_table *dn_fib_get_table(u32 n, int create)
-{
- struct dn_fib_table *t;
- unsigned int h;
-
- if (n < RT_TABLE_MIN)
- return NULL;
-
- if (n > RT_TABLE_MAX)
- return NULL;
-
- h = n & (DN_FIB_TABLE_HASHSZ - 1);
- rcu_read_lock();
- hlist_for_each_entry_rcu(t, &dn_fib_table_hash[h], hlist) {
- if (t->n == n) {
- rcu_read_unlock();
- return t;
- }
- }
- rcu_read_unlock();
-
- if (!create)
- return NULL;
-
- if (in_interrupt()) {
- net_dbg_ratelimited("DECnet: BUG! Attempt to create routing table from interrupt\n");
- return NULL;
- }
-
- t = kzalloc(sizeof(struct dn_fib_table) + sizeof(struct dn_hash),
- GFP_KERNEL);
- if (t == NULL)
- return NULL;
-
- t->n = n;
- t->insert = dn_fib_table_insert;
- t->delete = dn_fib_table_delete;
- t->lookup = dn_fib_table_lookup;
- t->flush = dn_fib_table_flush;
- t->dump = dn_fib_table_dump;
- hlist_add_head_rcu(&t->hlist, &dn_fib_table_hash[h]);
-
- return t;
-}
-
-struct dn_fib_table *dn_fib_empty_table(void)
-{
- u32 id;
-
- for(id = RT_TABLE_MIN; id <= RT_TABLE_MAX; id++)
- if (dn_fib_get_table(id, 0) == NULL)
- return dn_fib_get_table(id, 1);
- return NULL;
-}
-
-void dn_fib_flush(void)
-{
- int flushed = 0;
- struct dn_fib_table *tb;
- unsigned int h;
-
- for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) {
- hlist_for_each_entry(tb, &dn_fib_table_hash[h], hlist)
- flushed += tb->flush(tb);
- }
-
- if (flushed)
- dn_rt_cache_flush(-1);
-}
-
-void __init dn_fib_table_init(void)
-{
- dn_hash_kmem = kmem_cache_create("dn_fib_info_cache",
- sizeof(struct dn_fib_info),
- 0, SLAB_HWCACHE_ALIGN,
- NULL);
-}
-
-void __exit dn_fib_table_cleanup(void)
-{
- struct dn_fib_table *t;
- struct hlist_node *next;
- unsigned int h;
-
- write_lock(&dn_fib_tables_lock);
- for (h = 0; h < DN_FIB_TABLE_HASHSZ; h++) {
- hlist_for_each_entry_safe(t, next, &dn_fib_table_hash[h],
- hlist) {
- hlist_del(&t->hlist);
- kfree(t);
- }
- }
- write_unlock(&dn_fib_tables_lock);
-}
diff --git a/net/decnet/dn_timer.c b/net/decnet/dn_timer.c
deleted file mode 100644
index aa4155875ca8..000000000000
--- a/net/decnet/dn_timer.c
+++ /dev/null
@@ -1,104 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Socket Timer Functions
- *
- * Author: Steve Whitehouse <SteveW@ACM.org>
- *
- *
- * Changes:
- * Steve Whitehouse : Made keepalive timer part of the same
- * timer idea.
- * Steve Whitehouse : Added checks for sk->sock_readers
- * David S. Miller : New socket locking
- * Steve Whitehouse : Timer grabs socket ref.
- */
-#include <linux/net.h>
-#include <linux/socket.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/timer.h>
-#include <linux/spinlock.h>
-#include <net/sock.h>
-#include <linux/atomic.h>
-#include <linux/jiffies.h>
-#include <net/flow.h>
-#include <net/dn.h>
-
-/*
- * Slow timer is for everything else (n * 500mS)
- */
-
-#define SLOW_INTERVAL (HZ/2)
-
-static void dn_slow_timer(struct timer_list *t);
-
-void dn_start_slow_timer(struct sock *sk)
-{
- timer_setup(&sk->sk_timer, dn_slow_timer, 0);
- sk_reset_timer(sk, &sk->sk_timer, jiffies + SLOW_INTERVAL);
-}
-
-void dn_stop_slow_timer(struct sock *sk)
-{
- sk_stop_timer(sk, &sk->sk_timer);
-}
-
-static void dn_slow_timer(struct timer_list *t)
-{
- struct sock *sk = from_timer(sk, t, sk_timer);
- struct dn_scp *scp = DN_SK(sk);
-
- bh_lock_sock(sk);
-
- if (sock_owned_by_user(sk)) {
- sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 10);
- goto out;
- }
-
- /*
- * The persist timer is the standard slow timer used for retransmits
- * in both connection establishment and disconnection as well as
- * in the RUN state. The different states are catered for by changing
- * the function pointer in the socket. Setting the timer to a value
- * of zero turns it off. We allow the persist_fxn to turn the
- * timer off in a permant way by returning non-zero, so that
- * timer based routines may remove sockets. This is why we have a
- * sock_hold()/sock_put() around the timer to prevent the socket
- * going away in the middle.
- */
- if (scp->persist && scp->persist_fxn) {
- if (scp->persist <= SLOW_INTERVAL) {
- scp->persist = 0;
-
- if (scp->persist_fxn(sk))
- goto out;
- } else {
- scp->persist -= SLOW_INTERVAL;
- }
- }
-
- /*
- * Check for keepalive timeout. After the other timer 'cos if
- * the previous timer caused a retransmit, we don't need to
- * do this. scp->stamp is the last time that we sent a packet.
- * The keepalive function sends a link service packet to the
- * other end. If it remains unacknowledged, the standard
- * socket timers will eventually shut the socket down. Each
- * time we do this, scp->stamp will be updated, thus
- * we won't try and send another until scp->keepalive has passed
- * since the last successful transmission.
- */
- if (scp->keepalive && scp->keepalive_fxn && (scp->state == DN_RUN)) {
- if (time_after_eq(jiffies, scp->stamp + scp->keepalive))
- scp->keepalive_fxn(sk);
- }
-
- sk_reset_timer(sk, &sk->sk_timer, jiffies + SLOW_INTERVAL);
-out:
- bh_unlock_sock(sk);
- sock_put(sk);
-}
diff --git a/net/decnet/netfilter/Kconfig b/net/decnet/netfilter/Kconfig
deleted file mode 100644
index 8d7c109d5109..000000000000
--- a/net/decnet/netfilter/Kconfig
+++ /dev/null
@@ -1,16 +0,0 @@
-#
-# DECnet netfilter configuration
-#
-
-menu "DECnet: Netfilter Configuration"
- depends on DECNET && NETFILTER
- depends on NETFILTER_ADVANCED
-
-config DECNET_NF_GRABULATOR
- tristate "Routing message grabulator (for userland routing daemon)"
- help
- Enable this module if you want to use the userland DECnet routing
- daemon. You will also need to enable routing support for DECnet
- unless you just want to monitor routing messages from other nodes.
-
-endmenu
diff --git a/net/decnet/netfilter/Makefile b/net/decnet/netfilter/Makefile
deleted file mode 100644
index b579e52130aa..000000000000
--- a/net/decnet/netfilter/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-#
-# Makefile for DECnet netfilter modules
-#
-
-obj-$(CONFIG_DECNET_NF_GRABULATOR) += dn_rtmsg.o
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
deleted file mode 100644
index a4faacadd8a8..000000000000
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet Routing Message Grabulator
- *
- * (C) 2000 ChyGwyn Limited - http://www.chygwyn.com/
- * This code may be copied under the GPL v.2 or at your option
- * any later version.
- *
- * Author: Steven Whitehouse <steve@chygwyn.com>
- *
- */
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/netdevice.h>
-#include <linux/netfilter.h>
-#include <linux/spinlock.h>
-#include <net/netlink.h>
-#include <linux/netfilter_decnet.h>
-
-#include <net/sock.h>
-#include <net/flow.h>
-#include <net/dn.h>
-#include <net/dn_route.h>
-
-static struct sock *dnrmg = NULL;
-
-
-static struct sk_buff *dnrmg_build_message(struct sk_buff *rt_skb, int *errp)
-{
- struct sk_buff *skb = NULL;
- size_t size;
- sk_buff_data_t old_tail;
- struct nlmsghdr *nlh;
- unsigned char *ptr;
- struct nf_dn_rtmsg *rtm;
-
- size = NLMSG_ALIGN(rt_skb->len) +
- NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg));
- skb = nlmsg_new(size, GFP_ATOMIC);
- if (!skb) {
- *errp = -ENOMEM;
- return NULL;
- }
- old_tail = skb->tail;
- nlh = nlmsg_put(skb, 0, 0, 0, size, 0);
- if (!nlh) {
- kfree_skb(skb);
- *errp = -ENOMEM;
- return NULL;
- }
- rtm = (struct nf_dn_rtmsg *)nlmsg_data(nlh);
- rtm->nfdn_ifindex = rt_skb->dev->ifindex;
- ptr = NFDN_RTMSG(rtm);
- skb_copy_from_linear_data(rt_skb, ptr, rt_skb->len);
- nlh->nlmsg_len = skb->tail - old_tail;
- return skb;
-}
-
-static void dnrmg_send_peer(struct sk_buff *skb)
-{
- struct sk_buff *skb2;
- int status = 0;
- int group = 0;
- unsigned char flags = *skb->data;
-
- switch (flags & DN_RT_CNTL_MSK) {
- case DN_RT_PKT_L1RT:
- group = DNRNG_NLGRP_L1;
- break;
- case DN_RT_PKT_L2RT:
- group = DNRNG_NLGRP_L2;
- break;
- default:
- return;
- }
-
- skb2 = dnrmg_build_message(skb, &status);
- if (skb2 == NULL)
- return;
- NETLINK_CB(skb2).dst_group = group;
- netlink_broadcast(dnrmg, skb2, 0, group, GFP_ATOMIC);
-}
-
-
-static unsigned int dnrmg_hook(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- dnrmg_send_peer(skb);
- return NF_ACCEPT;
-}
-
-
-#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err), NULL); return; } while (0)
-
-static inline void dnrmg_receive_user_skb(struct sk_buff *skb)
-{
- struct nlmsghdr *nlh = nlmsg_hdr(skb);
-
- if (skb->len < sizeof(*nlh) ||
- nlh->nlmsg_len < sizeof(*nlh) ||
- skb->len < nlh->nlmsg_len)
- return;
-
- if (!netlink_capable(skb, CAP_NET_ADMIN))
- RCV_SKB_FAIL(-EPERM);
-
- /* Eventually we might send routing messages too */
-
- RCV_SKB_FAIL(-EINVAL);
-}
-
-static const struct nf_hook_ops dnrmg_ops = {
- .hook = dnrmg_hook,
- .pf = NFPROTO_DECNET,
- .hooknum = NF_DN_ROUTE,
- .priority = NF_DN_PRI_DNRTMSG,
-};
-
-static int __init dn_rtmsg_init(void)
-{
- int rv = 0;
- struct netlink_kernel_cfg cfg = {
- .groups = DNRNG_NLGRP_MAX,
- .input = dnrmg_receive_user_skb,
- };
-
- dnrmg = netlink_kernel_create(&init_net, NETLINK_DNRTMSG, &cfg);
- if (dnrmg == NULL) {
- printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket");
- return -ENOMEM;
- }
-
- rv = nf_register_net_hook(&init_net, &dnrmg_ops);
- if (rv) {
- netlink_kernel_release(dnrmg);
- }
-
- return rv;
-}
-
-static void __exit dn_rtmsg_fini(void)
-{
- nf_unregister_net_hook(&init_net, &dnrmg_ops);
- netlink_kernel_release(dnrmg);
-}
-
-
-MODULE_DESCRIPTION("DECnet Routing Message Grabulator");
-MODULE_AUTHOR("Steven Whitehouse <steve@chygwyn.com>");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG);
-
-module_init(dn_rtmsg_init);
-module_exit(dn_rtmsg_fini);
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
deleted file mode 100644
index 55bf64a22b59..000000000000
--- a/net/decnet/sysctl_net_decnet.c
+++ /dev/null
@@ -1,373 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * DECnet An implementation of the DECnet protocol suite for the LINUX
- * operating system. DECnet is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * DECnet sysctl support functions
- *
- * Author: Steve Whitehouse <SteveW@ACM.org>
- *
- *
- * Changes:
- * Steve Whitehouse - C99 changes and default device handling
- * Steve Whitehouse - Memory buffer settings, like the tcp ones
- *
- */
-#include <linux/mm.h>
-#include <linux/sysctl.h>
-#include <linux/fs.h>
-#include <linux/netdevice.h>
-#include <linux/string.h>
-#include <net/neighbour.h>
-#include <net/dst.h>
-#include <net/flow.h>
-
-#include <linux/uaccess.h>
-
-#include <net/dn.h>
-#include <net/dn_dev.h>
-#include <net/dn_route.h>
-
-
-int decnet_debug_level;
-int decnet_time_wait = 30;
-int decnet_dn_count = 1;
-int decnet_di_count = 3;
-int decnet_dr_count = 3;
-int decnet_log_martians = 1;
-int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW;
-
-/* Reasonable defaults, I hope, based on tcp's defaults */
-long sysctl_decnet_mem[3] = { 768 << 3, 1024 << 3, 1536 << 3 };
-int sysctl_decnet_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
-int sysctl_decnet_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
-
-#ifdef CONFIG_SYSCTL
-extern int decnet_dst_gc_interval;
-static int min_decnet_time_wait[] = { 5 };
-static int max_decnet_time_wait[] = { 600 };
-static int min_state_count[] = { 1 };
-static int max_state_count[] = { NSP_MAXRXTSHIFT };
-static int min_decnet_dst_gc_interval[] = { 1 };
-static int max_decnet_dst_gc_interval[] = { 60 };
-static int min_decnet_no_fc_max_cwnd[] = { NSP_MIN_WINDOW };
-static int max_decnet_no_fc_max_cwnd[] = { NSP_MAX_WINDOW };
-static char node_name[7] = "???";
-
-static struct ctl_table_header *dn_table_header = NULL;
-
-/*
- * ctype.h :-)
- */
-#define ISNUM(x) (((x) >= '0') && ((x) <= '9'))
-#define ISLOWER(x) (((x) >= 'a') && ((x) <= 'z'))
-#define ISUPPER(x) (((x) >= 'A') && ((x) <= 'Z'))
-#define ISALPHA(x) (ISLOWER(x) || ISUPPER(x))
-#define INVALID_END_CHAR(x) (ISNUM(x) || ISALPHA(x))
-
-static void strip_it(char *str)
-{
- for(;;) {
- switch (*str) {
- case ' ':
- case '\n':
- case '\r':
- case ':':
- *str = 0;
- /* Fallthrough */
- case 0:
- return;
- }
- str++;
- }
-}
-
-/*
- * Simple routine to parse an ascii DECnet address
- * into a network order address.
- */
-static int parse_addr(__le16 *addr, char *str)
-{
- __u16 area, node;
-
- while(*str && !ISNUM(*str)) str++;
-
- if (*str == 0)
- return -1;
-
- area = (*str++ - '0');
- if (ISNUM(*str)) {
- area *= 10;
- area += (*str++ - '0');
- }
-
- if (*str++ != '.')
- return -1;
-
- if (!ISNUM(*str))
- return -1;
-
- node = *str++ - '0';
- if (ISNUM(*str)) {
- node *= 10;
- node += (*str++ - '0');
- }
- if (ISNUM(*str)) {
- node *= 10;
- node += (*str++ - '0');
- }
- if (ISNUM(*str)) {
- node *= 10;
- node += (*str++ - '0');
- }
-
- if ((node > 1023) || (area > 63))
- return -1;
-
- if (INVALID_END_CHAR(*str))
- return -1;
-
- *addr = cpu_to_le16((area << 10) | node);
-
- return 0;
-}
-
-static int dn_node_address_handler(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- char addr[DN_ASCBUF_LEN];
- size_t len;
- __le16 dnaddr;
-
- if (!*lenp || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- if (write) {
- len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1);
-
- if (copy_from_user(addr, buffer, len))
- return -EFAULT;
-
- addr[len] = 0;
- strip_it(addr);
-
- if (parse_addr(&dnaddr, addr))
- return -EINVAL;
-
- dn_dev_devices_off();
-
- decnet_address = dnaddr;
-
- dn_dev_devices_on();
-
- *ppos += len;
-
- return 0;
- }
-
- dn_addr2asc(le16_to_cpu(decnet_address), addr);
- len = strlen(addr);
- addr[len++] = '\n';
-
- if (len > *lenp) len = *lenp;
-
- if (copy_to_user(buffer, addr, len))
- return -EFAULT;
-
- *lenp = len;
- *ppos += len;
-
- return 0;
-}
-
-static int dn_def_dev_handler(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- size_t len;
- struct net_device *dev;
- char devname[17];
-
- if (!*lenp || (*ppos && !write)) {
- *lenp = 0;
- return 0;
- }
-
- if (write) {
- if (*lenp > 16)
- return -E2BIG;
-
- if (copy_from_user(devname, buffer, *lenp))
- return -EFAULT;
-
- devname[*lenp] = 0;
- strip_it(devname);
-
- dev = dev_get_by_name(&init_net, devname);
- if (dev == NULL)
- return -ENODEV;
-
- if (dev->dn_ptr == NULL) {
- dev_put(dev);
- return -ENODEV;
- }
-
- if (dn_dev_set_default(dev, 1)) {
- dev_put(dev);
- return -ENODEV;
- }
- *ppos += *lenp;
-
- return 0;
- }
-
- dev = dn_dev_get_default();
- if (dev == NULL) {
- *lenp = 0;
- return 0;
- }
-
- strcpy(devname, dev->name);
- dev_put(dev);
- len = strlen(devname);
- devname[len++] = '\n';
-
- if (len > *lenp) len = *lenp;
-
- if (copy_to_user(buffer, devname, len))
- return -EFAULT;
-
- *lenp = len;
- *ppos += len;
-
- return 0;
-}
-
-static struct ctl_table dn_table[] = {
- {
- .procname = "node_address",
- .maxlen = 7,
- .mode = 0644,
- .proc_handler = dn_node_address_handler,
- },
- {
- .procname = "node_name",
- .data = node_name,
- .maxlen = 7,
- .mode = 0644,
- .proc_handler = proc_dostring,
- },
- {
- .procname = "default_device",
- .maxlen = 16,
- .mode = 0644,
- .proc_handler = dn_def_dev_handler,
- },
- {
- .procname = "time_wait",
- .data = &decnet_time_wait,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_decnet_time_wait,
- .extra2 = &max_decnet_time_wait
- },
- {
- .procname = "dn_count",
- .data = &decnet_dn_count,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_state_count,
- .extra2 = &max_state_count
- },
- {
- .procname = "di_count",
- .data = &decnet_di_count,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_state_count,
- .extra2 = &max_state_count
- },
- {
- .procname = "dr_count",
- .data = &decnet_dr_count,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_state_count,
- .extra2 = &max_state_count
- },
- {
- .procname = "dst_gc_interval",
- .data = &decnet_dst_gc_interval,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_decnet_dst_gc_interval,
- .extra2 = &max_decnet_dst_gc_interval
- },
- {
- .procname = "no_fc_max_cwnd",
- .data = &decnet_no_fc_max_cwnd,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &min_decnet_no_fc_max_cwnd,
- .extra2 = &max_decnet_no_fc_max_cwnd
- },
- {
- .procname = "decnet_mem",
- .data = &sysctl_decnet_mem,
- .maxlen = sizeof(sysctl_decnet_mem),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax
- },
- {
- .procname = "decnet_rmem",
- .data = &sysctl_decnet_rmem,
- .maxlen = sizeof(sysctl_decnet_rmem),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "decnet_wmem",
- .data = &sysctl_decnet_wmem,
- .maxlen = sizeof(sysctl_decnet_wmem),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
- .procname = "debug",
- .data = &decnet_debug_level,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- { }
-};
-
-void dn_register_sysctl(void)
-{
- dn_table_header = register_net_sysctl(&init_net, "net/decnet", dn_table);
-}
-
-void dn_unregister_sysctl(void)
-{
- unregister_net_sysctl_table(dn_table_header);
-}
-
-#else /* CONFIG_SYSCTL */
-void dn_unregister_sysctl(void)
-{
-}
-void dn_register_sysctl(void)
-{
-}
-
-#endif
diff --git a/net/devlink/Makefile b/net/devlink/Makefile
new file mode 100644
index 000000000000..000da622116a
--- /dev/null
+++ b/net/devlink/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-y := core.o netlink.o netlink_gen.o dev.o port.o sb.o dpipe.o \
+ resource.o param.o region.o health.o trap.o rate.o linecard.o
diff --git a/net/devlink/core.c b/net/devlink/core.c
new file mode 100644
index 000000000000..58093f49c090
--- /dev/null
+++ b/net/devlink/core.c
@@ -0,0 +1,551 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include <net/genetlink.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/devlink.h>
+
+#include "devl_internal.h"
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg);
+EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr);
+EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report);
+
+DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC);
+
+static struct devlink *devlinks_xa_get(unsigned long index)
+{
+ struct devlink *devlink;
+
+ rcu_read_lock();
+ devlink = xa_find(&devlinks, &index, index, DEVLINK_REGISTERED);
+ if (!devlink || !devlink_try_get(devlink))
+ devlink = NULL;
+ rcu_read_unlock();
+ return devlink;
+}
+
+/* devlink_rels xarray contains 1:1 relationships between
+ * devlink object and related nested devlink instance.
+ * The xarray index is used to get the nested object from
+ * the nested-in object code.
+ */
+static DEFINE_XARRAY_FLAGS(devlink_rels, XA_FLAGS_ALLOC1);
+
+#define DEVLINK_REL_IN_USE XA_MARK_0
+
+struct devlink_rel {
+ u32 index;
+ refcount_t refcount;
+ u32 devlink_index;
+ struct {
+ u32 devlink_index;
+ u32 obj_index;
+ devlink_rel_notify_cb_t *notify_cb;
+ devlink_rel_cleanup_cb_t *cleanup_cb;
+ struct delayed_work notify_work;
+ } nested_in;
+};
+
+static void devlink_rel_free(struct devlink_rel *rel)
+{
+ xa_erase(&devlink_rels, rel->index);
+ kfree(rel);
+}
+
+static void __devlink_rel_get(struct devlink_rel *rel)
+{
+ refcount_inc(&rel->refcount);
+}
+
+static void __devlink_rel_put(struct devlink_rel *rel)
+{
+ if (refcount_dec_and_test(&rel->refcount))
+ devlink_rel_free(rel);
+}
+
+static void devlink_rel_nested_in_notify_work(struct work_struct *work)
+{
+ struct devlink_rel *rel = container_of(work, struct devlink_rel,
+ nested_in.notify_work.work);
+ struct devlink *devlink;
+
+ devlink = devlinks_xa_get(rel->nested_in.devlink_index);
+ if (!devlink)
+ goto rel_put;
+ if (!devl_trylock(devlink)) {
+ devlink_put(devlink);
+ goto reschedule_work;
+ }
+ if (!devl_is_registered(devlink)) {
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ goto rel_put;
+ }
+ if (!xa_get_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE))
+ rel->nested_in.cleanup_cb(devlink, rel->nested_in.obj_index, rel->index);
+ rel->nested_in.notify_cb(devlink, rel->nested_in.obj_index);
+ devl_unlock(devlink);
+ devlink_put(devlink);
+
+rel_put:
+ __devlink_rel_put(rel);
+ return;
+
+reschedule_work:
+ schedule_delayed_work(&rel->nested_in.notify_work, 1);
+}
+
+static void devlink_rel_nested_in_notify_work_schedule(struct devlink_rel *rel)
+{
+ __devlink_rel_get(rel);
+ schedule_delayed_work(&rel->nested_in.notify_work, 0);
+}
+
+static struct devlink_rel *devlink_rel_alloc(void)
+{
+ struct devlink_rel *rel;
+ static u32 next;
+ int err;
+
+ rel = kzalloc(sizeof(*rel), GFP_KERNEL);
+ if (!rel)
+ return ERR_PTR(-ENOMEM);
+
+ err = xa_alloc_cyclic(&devlink_rels, &rel->index, rel,
+ xa_limit_32b, &next, GFP_KERNEL);
+ if (err < 0) {
+ kfree(rel);
+ return ERR_PTR(err);
+ }
+
+ refcount_set(&rel->refcount, 1);
+ INIT_DELAYED_WORK(&rel->nested_in.notify_work,
+ &devlink_rel_nested_in_notify_work);
+ return rel;
+}
+
+static void devlink_rel_put(struct devlink *devlink)
+{
+ struct devlink_rel *rel = devlink->rel;
+
+ if (!rel)
+ return;
+ xa_clear_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE);
+ devlink_rel_nested_in_notify_work_schedule(rel);
+ __devlink_rel_put(rel);
+ devlink->rel = NULL;
+}
+
+void devlink_rel_nested_in_clear(u32 rel_index)
+{
+ xa_clear_mark(&devlink_rels, rel_index, DEVLINK_REL_IN_USE);
+}
+
+int devlink_rel_nested_in_add(u32 *rel_index, u32 devlink_index,
+ u32 obj_index, devlink_rel_notify_cb_t *notify_cb,
+ devlink_rel_cleanup_cb_t *cleanup_cb,
+ struct devlink *devlink)
+{
+ struct devlink_rel *rel = devlink_rel_alloc();
+
+ ASSERT_DEVLINK_NOT_REGISTERED(devlink);
+
+ if (IS_ERR(rel))
+ return PTR_ERR(rel);
+
+ rel->devlink_index = devlink->index;
+ rel->nested_in.devlink_index = devlink_index;
+ rel->nested_in.obj_index = obj_index;
+ rel->nested_in.notify_cb = notify_cb;
+ rel->nested_in.cleanup_cb = cleanup_cb;
+ *rel_index = rel->index;
+ xa_set_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE);
+ devlink->rel = rel;
+ return 0;
+}
+
+/**
+ * devlink_rel_nested_in_notify - Notify the object this devlink
+ * instance is nested in.
+ * @devlink: devlink
+ *
+ * This is called upon network namespace change of devlink instance.
+ * In case this devlink instance is nested in another devlink object,
+ * a notification of a change of this object should be sent
+ * over netlink. The parent devlink instance lock needs to be
+ * taken during the notification preparation.
+ * However, since the devlink lock of nested instance is held here,
+ * we would end with wrong devlink instance lock ordering and
+ * deadlock. Therefore the work is utilized to avoid that.
+ */
+void devlink_rel_nested_in_notify(struct devlink *devlink)
+{
+ struct devlink_rel *rel = devlink->rel;
+
+ if (!rel)
+ return;
+ devlink_rel_nested_in_notify_work_schedule(rel);
+}
+
+static struct devlink_rel *devlink_rel_find(unsigned long rel_index)
+{
+ return xa_find(&devlink_rels, &rel_index, rel_index,
+ DEVLINK_REL_IN_USE);
+}
+
+static struct devlink *devlink_rel_devlink_get(u32 rel_index)
+{
+ struct devlink_rel *rel;
+ u32 devlink_index;
+
+ if (!rel_index)
+ return NULL;
+ xa_lock(&devlink_rels);
+ rel = devlink_rel_find(rel_index);
+ if (rel)
+ devlink_index = rel->devlink_index;
+ xa_unlock(&devlink_rels);
+ if (!rel)
+ return NULL;
+ return devlinks_xa_get(devlink_index);
+}
+
+int devlink_rel_devlink_handle_put(struct sk_buff *msg, struct devlink *devlink,
+ u32 rel_index, int attrtype,
+ bool *msg_updated)
+{
+ struct net *net = devlink_net(devlink);
+ struct devlink *rel_devlink;
+ int err;
+
+ rel_devlink = devlink_rel_devlink_get(rel_index);
+ if (!rel_devlink)
+ return 0;
+ err = devlink_nl_put_nested_handle(msg, net, rel_devlink, attrtype);
+ devlink_put(rel_devlink);
+ if (!err && msg_updated)
+ *msg_updated = true;
+ return err;
+}
+
+void *devlink_priv(struct devlink *devlink)
+{
+ return &devlink->priv;
+}
+EXPORT_SYMBOL_GPL(devlink_priv);
+
+struct devlink *priv_to_devlink(void *priv)
+{
+ return container_of(priv, struct devlink, priv);
+}
+EXPORT_SYMBOL_GPL(priv_to_devlink);
+
+struct device *devlink_to_dev(const struct devlink *devlink)
+{
+ return devlink->dev;
+}
+EXPORT_SYMBOL_GPL(devlink_to_dev);
+
+struct net *devlink_net(const struct devlink *devlink)
+{
+ return read_pnet(&devlink->_net);
+}
+EXPORT_SYMBOL_GPL(devlink_net);
+
+void devl_assert_locked(struct devlink *devlink)
+{
+ lockdep_assert_held(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_assert_locked);
+
+#ifdef CONFIG_LOCKDEP
+/* For use in conjunction with LOCKDEP only e.g. rcu_dereference_protected() */
+bool devl_lock_is_held(struct devlink *devlink)
+{
+ return lockdep_is_held(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_lock_is_held);
+#endif
+
+void devl_lock(struct devlink *devlink)
+{
+ mutex_lock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_lock);
+
+int devl_trylock(struct devlink *devlink)
+{
+ return mutex_trylock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_trylock);
+
+void devl_unlock(struct devlink *devlink)
+{
+ mutex_unlock(&devlink->lock);
+}
+EXPORT_SYMBOL_GPL(devl_unlock);
+
+/**
+ * devlink_try_get() - try to obtain a reference on a devlink instance
+ * @devlink: instance to reference
+ *
+ * Obtain a reference on a devlink instance. A reference on a devlink instance
+ * only implies that it's safe to take the instance lock. It does not imply
+ * that the instance is registered, use devl_is_registered() after taking
+ * the instance lock to check registration status.
+ */
+struct devlink *__must_check devlink_try_get(struct devlink *devlink)
+{
+ if (refcount_inc_not_zero(&devlink->refcount))
+ return devlink;
+ return NULL;
+}
+
+static void devlink_release(struct work_struct *work)
+{
+ struct devlink *devlink;
+
+ devlink = container_of(to_rcu_work(work), struct devlink, rwork);
+
+ mutex_destroy(&devlink->lock);
+ lockdep_unregister_key(&devlink->lock_key);
+ put_device(devlink->dev);
+ kvfree(devlink);
+}
+
+void devlink_put(struct devlink *devlink)
+{
+ if (refcount_dec_and_test(&devlink->refcount))
+ queue_rcu_work(system_percpu_wq, &devlink->rwork);
+}
+
+struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp)
+{
+ struct devlink *devlink = NULL;
+
+ rcu_read_lock();
+retry:
+ devlink = xa_find(&devlinks, indexp, ULONG_MAX, DEVLINK_REGISTERED);
+ if (!devlink)
+ goto unlock;
+
+ if (!devlink_try_get(devlink))
+ goto next;
+ if (!net_eq(devlink_net(devlink), net)) {
+ devlink_put(devlink);
+ goto next;
+ }
+unlock:
+ rcu_read_unlock();
+ return devlink;
+
+next:
+ (*indexp)++;
+ goto retry;
+}
+
+/**
+ * devl_register - Register devlink instance
+ * @devlink: devlink
+ */
+int devl_register(struct devlink *devlink)
+{
+ ASSERT_DEVLINK_NOT_REGISTERED(devlink);
+ devl_assert_locked(devlink);
+
+ xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
+ devlink_notify_register(devlink);
+ devlink_rel_nested_in_notify(devlink);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_register);
+
+void devlink_register(struct devlink *devlink)
+{
+ devl_lock(devlink);
+ devl_register(devlink);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_register);
+
+/**
+ * devl_unregister - Unregister devlink instance
+ * @devlink: devlink
+ */
+void devl_unregister(struct devlink *devlink)
+{
+ ASSERT_DEVLINK_REGISTERED(devlink);
+ devl_assert_locked(devlink);
+
+ devlink_notify_unregister(devlink);
+ xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
+ devlink_rel_put(devlink);
+}
+EXPORT_SYMBOL_GPL(devl_unregister);
+
+void devlink_unregister(struct devlink *devlink)
+{
+ devl_lock(devlink);
+ devl_unregister(devlink);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_unregister);
+
+/**
+ * devlink_alloc_ns - Allocate new devlink instance resources
+ * in specific namespace
+ *
+ * @ops: ops
+ * @priv_size: size of user private data
+ * @net: net namespace
+ * @dev: parent device
+ *
+ * Allocate new devlink instance resources, including devlink index
+ * and name.
+ */
+struct devlink *devlink_alloc_ns(const struct devlink_ops *ops,
+ size_t priv_size, struct net *net,
+ struct device *dev)
+{
+ struct devlink *devlink;
+ static u32 last_id;
+ int ret;
+
+ WARN_ON(!ops || !dev);
+ if (!devlink_reload_actions_valid(ops))
+ return NULL;
+
+ devlink = kvzalloc(struct_size(devlink, priv, priv_size), GFP_KERNEL);
+ if (!devlink)
+ return NULL;
+
+ ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b,
+ &last_id, GFP_KERNEL);
+ if (ret < 0)
+ goto err_xa_alloc;
+
+ devlink->dev = get_device(dev);
+ devlink->ops = ops;
+ xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC);
+ xa_init_flags(&devlink->params, XA_FLAGS_ALLOC);
+ xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC);
+ xa_init_flags(&devlink->nested_rels, XA_FLAGS_ALLOC);
+ write_pnet(&devlink->_net, net);
+ INIT_LIST_HEAD(&devlink->rate_list);
+ INIT_LIST_HEAD(&devlink->linecard_list);
+ INIT_LIST_HEAD(&devlink->sb_list);
+ INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
+ INIT_LIST_HEAD(&devlink->resource_list);
+ INIT_LIST_HEAD(&devlink->region_list);
+ INIT_LIST_HEAD(&devlink->reporter_list);
+ INIT_LIST_HEAD(&devlink->trap_list);
+ INIT_LIST_HEAD(&devlink->trap_group_list);
+ INIT_LIST_HEAD(&devlink->trap_policer_list);
+ INIT_RCU_WORK(&devlink->rwork, devlink_release);
+ lockdep_register_key(&devlink->lock_key);
+ mutex_init(&devlink->lock);
+ lockdep_set_class(&devlink->lock, &devlink->lock_key);
+ refcount_set(&devlink->refcount, 1);
+
+ return devlink;
+
+err_xa_alloc:
+ kvfree(devlink);
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(devlink_alloc_ns);
+
+/**
+ * devlink_free - Free devlink instance resources
+ *
+ * @devlink: devlink
+ */
+void devlink_free(struct devlink *devlink)
+{
+ ASSERT_DEVLINK_NOT_REGISTERED(devlink);
+
+ WARN_ON(!list_empty(&devlink->trap_policer_list));
+ WARN_ON(!list_empty(&devlink->trap_group_list));
+ WARN_ON(!list_empty(&devlink->trap_list));
+ WARN_ON(!list_empty(&devlink->reporter_list));
+ WARN_ON(!list_empty(&devlink->region_list));
+ WARN_ON(!list_empty(&devlink->resource_list));
+ WARN_ON(!list_empty(&devlink->dpipe_table_list));
+ WARN_ON(!list_empty(&devlink->sb_list));
+ WARN_ON(!list_empty(&devlink->rate_list));
+ WARN_ON(!list_empty(&devlink->linecard_list));
+ WARN_ON(!xa_empty(&devlink->ports));
+
+ xa_destroy(&devlink->nested_rels);
+ xa_destroy(&devlink->snapshot_ids);
+ xa_destroy(&devlink->params);
+ xa_destroy(&devlink->ports);
+
+ xa_erase(&devlinks, devlink->index);
+
+ devlink_put(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_free);
+
+static void __net_exit devlink_pernet_pre_exit(struct net *net)
+{
+ struct devlink *devlink;
+ u32 actions_performed;
+ unsigned long index;
+ int err;
+
+ /* In case network namespace is getting destroyed, reload
+ * all devlink instances from this namespace into init_net.
+ */
+ devlinks_xa_for_each_registered_get(net, index, devlink) {
+ devl_dev_lock(devlink, true);
+ err = 0;
+ if (devl_is_registered(devlink))
+ err = devlink_reload(devlink, &init_net,
+ DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
+ DEVLINK_RELOAD_LIMIT_UNSPEC,
+ &actions_performed, NULL);
+ devl_dev_unlock(devlink, true);
+ devlink_put(devlink);
+ if (err && err != -EOPNOTSUPP)
+ pr_warn("Failed to reload devlink instance into init_net\n");
+ }
+}
+
+static struct pernet_operations devlink_pernet_ops __net_initdata = {
+ .pre_exit = devlink_pernet_pre_exit,
+};
+
+static struct notifier_block devlink_port_netdevice_nb = {
+ .notifier_call = devlink_port_netdevice_event,
+};
+
+static int __init devlink_init(void)
+{
+ int err;
+
+ err = register_pernet_subsys(&devlink_pernet_ops);
+ if (err)
+ goto out;
+ err = genl_register_family(&devlink_nl_family);
+ if (err)
+ goto out_unreg_pernet_subsys;
+ err = register_netdevice_notifier(&devlink_port_netdevice_nb);
+ if (!err)
+ return 0;
+
+ genl_unregister_family(&devlink_nl_family);
+
+out_unreg_pernet_subsys:
+ unregister_pernet_subsys(&devlink_pernet_ops);
+out:
+ WARN_ON(err);
+ return err;
+}
+
+subsys_initcall(devlink_init);
diff --git a/net/devlink/dev.c b/net/devlink/dev.c
new file mode 100644
index 000000000000..02602704bdea
--- /dev/null
+++ b/net/devlink/dev.c
@@ -0,0 +1,1442 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include <linux/device.h>
+#include <net/genetlink.h>
+#include <net/sock.h>
+#include "devl_internal.h"
+
+struct devlink_info_req {
+ struct sk_buff *msg;
+ void (*version_cb)(const char *version_name,
+ enum devlink_info_version_type version_type,
+ void *version_cb_priv);
+ void *version_cb_priv;
+};
+
+struct devlink_reload_combination {
+ enum devlink_reload_action action;
+ enum devlink_reload_limit limit;
+};
+
+static const struct devlink_reload_combination devlink_reload_invalid_combinations[] = {
+ {
+ /* can't reinitialize driver with no down time */
+ .action = DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
+ .limit = DEVLINK_RELOAD_LIMIT_NO_RESET,
+ },
+};
+
+static bool
+devlink_reload_combination_is_invalid(enum devlink_reload_action action,
+ enum devlink_reload_limit limit)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(devlink_reload_invalid_combinations); i++)
+ if (devlink_reload_invalid_combinations[i].action == action &&
+ devlink_reload_invalid_combinations[i].limit == limit)
+ return true;
+ return false;
+}
+
+static bool
+devlink_reload_action_is_supported(struct devlink *devlink, enum devlink_reload_action action)
+{
+ return test_bit(action, &devlink->ops->reload_actions);
+}
+
+static bool
+devlink_reload_limit_is_supported(struct devlink *devlink, enum devlink_reload_limit limit)
+{
+ return test_bit(limit, &devlink->ops->reload_limits);
+}
+
+static int devlink_reload_stat_put(struct sk_buff *msg,
+ enum devlink_reload_limit limit, u32 value)
+{
+ struct nlattr *reload_stats_entry;
+
+ reload_stats_entry = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS_ENTRY);
+ if (!reload_stats_entry)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_STATS_LIMIT, limit) ||
+ nla_put_u32(msg, DEVLINK_ATTR_RELOAD_STATS_VALUE, value))
+ goto nla_put_failure;
+ nla_nest_end(msg, reload_stats_entry);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(msg, reload_stats_entry);
+ return -EMSGSIZE;
+}
+
+static int
+devlink_reload_stats_put(struct sk_buff *msg, struct devlink *devlink, bool is_remote)
+{
+ struct nlattr *reload_stats_attr, *act_info, *act_stats;
+ int i, j, stat_idx;
+ u32 value;
+
+ if (!is_remote)
+ reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_STATS);
+ else
+ reload_stats_attr = nla_nest_start(msg, DEVLINK_ATTR_REMOTE_RELOAD_STATS);
+
+ if (!reload_stats_attr)
+ return -EMSGSIZE;
+
+ for (i = 0; i <= DEVLINK_RELOAD_ACTION_MAX; i++) {
+ if ((!is_remote &&
+ !devlink_reload_action_is_supported(devlink, i)) ||
+ i == DEVLINK_RELOAD_ACTION_UNSPEC)
+ continue;
+ act_info = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_INFO);
+ if (!act_info)
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_ACTION, i))
+ goto action_info_nest_cancel;
+ act_stats = nla_nest_start(msg, DEVLINK_ATTR_RELOAD_ACTION_STATS);
+ if (!act_stats)
+ goto action_info_nest_cancel;
+
+ for (j = 0; j <= DEVLINK_RELOAD_LIMIT_MAX; j++) {
+ /* Remote stats are shown even if not locally supported.
+ * Stats of actions with unspecified limit are shown
+ * though drivers don't need to register unspecified
+ * limit.
+ */
+ if ((!is_remote && j != DEVLINK_RELOAD_LIMIT_UNSPEC &&
+ !devlink_reload_limit_is_supported(devlink, j)) ||
+ devlink_reload_combination_is_invalid(i, j))
+ continue;
+
+ stat_idx = j * __DEVLINK_RELOAD_ACTION_MAX + i;
+ if (!is_remote)
+ value = devlink->stats.reload_stats[stat_idx];
+ else
+ value = devlink->stats.remote_reload_stats[stat_idx];
+ if (devlink_reload_stat_put(msg, j, value))
+ goto action_stats_nest_cancel;
+ }
+ nla_nest_end(msg, act_stats);
+ nla_nest_end(msg, act_info);
+ }
+ nla_nest_end(msg, reload_stats_attr);
+ return 0;
+
+action_stats_nest_cancel:
+ nla_nest_cancel(msg, act_stats);
+action_info_nest_cancel:
+ nla_nest_cancel(msg, act_info);
+nla_put_failure:
+ nla_nest_cancel(msg, reload_stats_attr);
+ return -EMSGSIZE;
+}
+
+static int devlink_nl_nested_fill(struct sk_buff *msg, struct devlink *devlink)
+{
+ unsigned long rel_index;
+ void *unused;
+ int err;
+
+ xa_for_each(&devlink->nested_rels, rel_index, unused) {
+ err = devlink_rel_devlink_handle_put(msg, devlink,
+ rel_index,
+ DEVLINK_ATTR_NESTED_DEVLINK,
+ NULL);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ struct nlattr *dev_stats;
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_RELOAD_FAILED, devlink->reload_failed))
+ goto nla_put_failure;
+
+ dev_stats = nla_nest_start(msg, DEVLINK_ATTR_DEV_STATS);
+ if (!dev_stats)
+ goto nla_put_failure;
+
+ if (devlink_reload_stats_put(msg, devlink, false))
+ goto dev_stats_nest_cancel;
+ if (devlink_reload_stats_put(msg, devlink, true))
+ goto dev_stats_nest_cancel;
+
+ nla_nest_end(msg, dev_stats);
+
+ if (devlink_nl_nested_fill(msg, devlink))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+dev_stats_nest_cancel:
+ nla_nest_cancel(msg, dev_stats);
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL);
+ WARN_ON(!devl_is_registered(devlink));
+
+ if (!devlink_nl_notify_need(devlink))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ devlink_nl_notify_send(devlink, msg);
+}
+
+int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int
+devlink_nl_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
+ struct netlink_callback *cb, int flags)
+{
+ return devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags);
+}
+
+int devlink_nl_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(msg, cb, devlink_nl_get_dump_one);
+}
+
+static void devlink_rel_notify_cb(struct devlink *devlink, u32 obj_index)
+{
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+}
+
+static void devlink_rel_cleanup_cb(struct devlink *devlink, u32 obj_index,
+ u32 rel_index)
+{
+ xa_erase(&devlink->nested_rels, rel_index);
+}
+
+int devl_nested_devlink_set(struct devlink *devlink,
+ struct devlink *nested_devlink)
+{
+ u32 rel_index;
+ int err;
+
+ err = devlink_rel_nested_in_add(&rel_index, devlink->index, 0,
+ devlink_rel_notify_cb,
+ devlink_rel_cleanup_cb,
+ nested_devlink);
+ if (err)
+ return err;
+ return xa_insert(&devlink->nested_rels, rel_index,
+ xa_mk_value(0), GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(devl_nested_devlink_set);
+
+void devlink_notify_register(struct devlink *devlink)
+{
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+ devlink_linecards_notify_register(devlink);
+ devlink_ports_notify_register(devlink);
+ devlink_trap_policers_notify_register(devlink);
+ devlink_trap_groups_notify_register(devlink);
+ devlink_traps_notify_register(devlink);
+ devlink_rates_notify_register(devlink);
+ devlink_regions_notify_register(devlink);
+ devlink_params_notify_register(devlink);
+}
+
+void devlink_notify_unregister(struct devlink *devlink)
+{
+ devlink_params_notify_unregister(devlink);
+ devlink_regions_notify_unregister(devlink);
+ devlink_rates_notify_unregister(devlink);
+ devlink_traps_notify_unregister(devlink);
+ devlink_trap_groups_notify_unregister(devlink);
+ devlink_trap_policers_notify_unregister(devlink);
+ devlink_ports_notify_unregister(devlink);
+ devlink_linecards_notify_unregister(devlink);
+ devlink_notify(devlink, DEVLINK_CMD_DEL);
+}
+
+static void devlink_reload_failed_set(struct devlink *devlink,
+ bool reload_failed)
+{
+ if (devlink->reload_failed == reload_failed)
+ return;
+ devlink->reload_failed = reload_failed;
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+}
+
+bool devlink_is_reload_failed(const struct devlink *devlink)
+{
+ return devlink->reload_failed;
+}
+EXPORT_SYMBOL_GPL(devlink_is_reload_failed);
+
+static void
+__devlink_reload_stats_update(struct devlink *devlink, u32 *reload_stats,
+ enum devlink_reload_limit limit, u32 actions_performed)
+{
+ unsigned long actions = actions_performed;
+ int stat_idx;
+ int action;
+
+ for_each_set_bit(action, &actions, __DEVLINK_RELOAD_ACTION_MAX) {
+ stat_idx = limit * __DEVLINK_RELOAD_ACTION_MAX + action;
+ reload_stats[stat_idx]++;
+ }
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+}
+
+static void
+devlink_reload_stats_update(struct devlink *devlink, enum devlink_reload_limit limit,
+ u32 actions_performed)
+{
+ __devlink_reload_stats_update(devlink, devlink->stats.reload_stats, limit,
+ actions_performed);
+}
+
+/**
+ *	devlink_remote_reload_actions_performed - Update devlink on reload actions
+ *	  performed which are not a direct result of devlink reload call.
+ *
+ *	Drivers call this after reload actions that were not triggered by a
+ *	devlink reload on this instance, e.g. a fw_activate that happened
+ *	because a devlink reload on another host requested it. This keeps the
+ *	per-function record of reload actions complete regardless of who
+ *	triggered them.
+ *
+ *	Invalid arguments (empty mask, UNSPEC action bit, bits at or above
+ *	__DEVLINK_RELOAD_ACTION_MAX, or a limit above DEVLINK_RELOAD_LIMIT_MAX)
+ *	trigger a WARN_ON and are ignored.
+ *
+ *	@devlink: devlink
+ *	@limit: reload limit
+ *	@actions_performed: bitmask of actions performed
+ */
+void devlink_remote_reload_actions_performed(struct devlink *devlink,
+					     enum devlink_reload_limit limit,
+					     u32 actions_performed)
+{
+	bool bad_args = !actions_performed ||
+			(actions_performed & BIT(DEVLINK_RELOAD_ACTION_UNSPEC)) ||
+			actions_performed >= BIT(__DEVLINK_RELOAD_ACTION_MAX) ||
+			limit > DEVLINK_RELOAD_LIMIT_MAX;
+
+	if (WARN_ON(bad_args))
+		return;
+
+	__devlink_reload_stats_update(devlink, devlink->stats.remote_reload_stats,
+				      limit, actions_performed);
+}
+EXPORT_SYMBOL_GPL(devlink_remote_reload_actions_performed);
+
+/*
+ * Resolve the network namespace selected by the request through one of the
+ * DEVLINK_ATTR_NETNS_PID, DEVLINK_ATTR_NETNS_FD or DEVLINK_ATTR_NETNS_ID
+ * attributes.
+ *
+ * On success the returned namespace is referenced and the caller must
+ * release it with put_net(). Returns ERR_PTR(-EINVAL) when more than one
+ * selector is present or the namespace cannot be resolved, and
+ * ERR_PTR(-EPERM) when the sender lacks CAP_NET_ADMIN over the target.
+ */
+static struct net *devlink_netns_get(struct sk_buff *skb,
+				     struct genl_info *info)
+{
+	struct nlattr *pid_attr = info->attrs[DEVLINK_ATTR_NETNS_PID];
+	struct nlattr *fd_attr = info->attrs[DEVLINK_ATTR_NETNS_FD];
+	struct nlattr *id_attr = info->attrs[DEVLINK_ATTR_NETNS_ID];
+	int nr_selectors = !!pid_attr + !!fd_attr + !!id_attr;
+	struct net *target;
+
+	if (nr_selectors > 1) {
+		NL_SET_ERR_MSG(info->extack, "multiple netns identifying attributes specified");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (pid_attr) {
+		target = get_net_ns_by_pid(nla_get_u32(pid_attr));
+	} else if (fd_attr) {
+		target = get_net_ns_by_fd(nla_get_u32(fd_attr));
+	} else if (id_attr) {
+		/* The by-id lookup reports failure as NULL, not ERR_PTR. */
+		target = get_net_ns_by_id(sock_net(skb->sk),
+					  nla_get_u32(id_attr));
+		if (!target)
+			target = ERR_PTR(-EINVAL);
+	} else {
+		/* Callers only get here with at least one selector set. */
+		WARN_ON(1);
+		target = ERR_PTR(-EINVAL);
+	}
+
+	if (IS_ERR(target)) {
+		NL_SET_ERR_MSG(info->extack, "Unknown network namespace");
+		return ERR_PTR(-EINVAL);
+	}
+
+	if (!netlink_ns_capable(skb, target->user_ns, CAP_NET_ADMIN)) {
+		put_net(target);
+		return ERR_PTR(-EPERM);
+	}
+
+	return target;
+}
+
+static void devlink_reload_netns_change(struct devlink *devlink,
+ struct net *curr_net,
+ struct net *dest_net)
+{
+ /* Userspace needs to be notified about devlink objects
+ * removed from original and entering new network namespace.
+ * The rest of the devlink objects are re-created during
+ * reload process so the notifications are generated separately.
+ *
+ * The unregister notifications are issued before _net is switched
+ * to @dest_net and the register notifications after it, so the
+ * statement order below must be preserved. @curr_net is not
+ * referenced by this function body.
+ */
+ devlink_notify_unregister(devlink);
+ write_pnet(&devlink->_net, dest_net);
+ devlink_notify_register(devlink);
+ devlink_rel_nested_in_notify(devlink);
+}
+
+/*
+ * Emit a WARN_ON for every per-instance object container listed below that
+ * is not empty. devlink_reload() calls this for
+ * DEVLINK_RELOAD_ACTION_DRIVER_REINIT between reload_down() and reload_up().
+ */
+static void devlink_reload_reinit_sanity_check(struct devlink *devlink)
+{
+ WARN_ON(!list_empty(&devlink->trap_policer_list));
+ WARN_ON(!list_empty(&devlink->trap_group_list));
+ WARN_ON(!list_empty(&devlink->trap_list));
+ WARN_ON(!list_empty(&devlink->dpipe_table_list));
+ WARN_ON(!list_empty(&devlink->sb_list));
+ WARN_ON(!list_empty(&devlink->rate_list));
+ WARN_ON(!list_empty(&devlink->linecard_list));
+ WARN_ON(!xa_empty(&devlink->ports));
+}
+
+/*
+ * Reload @devlink: call ops->reload_down(), optionally move the instance to
+ * @dest_net, then call ops->reload_up().
+ *
+ * @dest_net: destination namespace or NULL; the namespace switch only runs
+ *            when it differs from the instance's current namespace.
+ * @actions_performed: filled in by ops->reload_up(); on success it is
+ *                     expected to include BIT(@action).
+ *
+ * Returns 0 on success or the error returned by reload_down()/reload_up().
+ * A reload_down() failure returns before the reload_failed flag is touched;
+ * the result of reload_up() is always recorded through
+ * devlink_reload_failed_set(). Reload statistics are updated on success only.
+ */
+int devlink_reload(struct devlink *devlink, struct net *dest_net,
+ enum devlink_reload_action action,
+ enum devlink_reload_limit limit,
+ u32 *actions_performed, struct netlink_ext_ack *extack)
+{
+ u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE];
+ struct net *curr_net;
+ int err;
+
+ /* Make sure the reload operations are invoked with the device lock
+ * held to allow drivers to trigger functionality that expects it
+ * (e.g., PCI reset) and to close possible races between these
+ * operations and probe/remove.
+ */
+ device_lock_assert(devlink->dev);
+
+ /* Snapshot the remote counters so a change made by the driver during
+ * this reload can be detected below.
+ */
+ memcpy(remote_reload_stats, devlink->stats.remote_reload_stats,
+ sizeof(remote_reload_stats));
+
+ err = devlink->ops->reload_down(devlink, !!dest_net, action, limit, extack);
+ if (err)
+ return err;
+
+ curr_net = devlink_net(devlink);
+ if (dest_net && !net_eq(dest_net, curr_net))
+ devlink_reload_netns_change(devlink, curr_net, dest_net);
+
+ if (action == DEVLINK_RELOAD_ACTION_DRIVER_REINIT) {
+ devlink_params_driverinit_load_new(devlink);
+ devlink_reload_reinit_sanity_check(devlink);
+ }
+
+ err = devlink->ops->reload_up(devlink, action, limit, actions_performed, extack);
+ devlink_reload_failed_set(devlink, !!err);
+ if (err)
+ return err;
+
+ WARN_ON(!(*actions_performed & BIT(action)));
+ /* Catch driver on updating the remote action within devlink reload */
+ WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats,
+ sizeof(remote_reload_stats)));
+ devlink_reload_stats_update(devlink, limit, *actions_performed);
+ return 0;
+}
+
+/*
+ * Reply to the requester with the reload actions that were performed.
+ * The bitfield32 attribute carries @actions_performed as both value and
+ * selector.
+ *
+ * Returns the genlmsg_reply() result, -ENOMEM when the reply skb cannot be
+ * allocated, or -EMSGSIZE when the message cannot be built.
+ */
+static int
+devlink_nl_reload_actions_performed_snd(struct devlink *devlink, u32 actions_performed,
+					enum devlink_command cmd, struct genl_info *info)
+{
+	struct sk_buff *reply;
+	void *hdr;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	hdr = genlmsg_put(reply, info->snd_portid, info->snd_seq, &devlink_nl_family, 0, cmd);
+	if (!hdr)
+		goto err_free;
+
+	if (devlink_nl_put_handle(reply, devlink) ||
+	    nla_put_bitfield32(reply, DEVLINK_ATTR_RELOAD_ACTIONS_PERFORMED,
+			       actions_performed, actions_performed))
+		goto err_cancel;
+
+	genlmsg_end(reply, hdr);
+	return genlmsg_reply(reply, info);
+
+err_cancel:
+	genlmsg_cancel(reply, hdr);
+err_free:
+	nlmsg_free(reply);
+	return -EMSGSIZE;
+}
+
+/*
+ * Netlink doit handler for DEVLINK_CMD_RELOAD.
+ *
+ * Validates resources, the requested action (driver reinit when the
+ * attribute is absent) and the optional limit, resolves an optional
+ * destination network namespace and then calls devlink_reload().
+ *
+ * A reply carrying the performed actions is generated only when the request
+ * used DEVLINK_ATTR_RELOAD_ACTION or DEVLINK_ATTR_RELOAD_LIMITS.
+ *
+ * devlink_netns_get() hands back a referenced namespace; every exit taken
+ * after that call must drop the reference with put_net().
+ *
+ * Returns 0 or a negative errno.
+ */
+int devlink_nl_reload_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	enum devlink_reload_action action;
+	enum devlink_reload_limit limit;
+	struct net *dest_net = NULL;
+	u32 actions_performed;
+	int err;
+
+	err = devlink_resources_validate(devlink, NULL, info);
+	if (err) {
+		NL_SET_ERR_MSG(info->extack, "resources size validation failed");
+		return err;
+	}
+
+	action = nla_get_u8_default(info->attrs[DEVLINK_ATTR_RELOAD_ACTION],
+				    DEVLINK_RELOAD_ACTION_DRIVER_REINIT);
+
+	if (!devlink_reload_action_is_supported(devlink, action)) {
+		NL_SET_ERR_MSG(info->extack, "Requested reload action is not supported by the driver");
+		return -EOPNOTSUPP;
+	}
+
+	limit = DEVLINK_RELOAD_LIMIT_UNSPEC;
+	if (info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]) {
+		struct nla_bitfield32 limits;
+		u32 limits_selected;
+
+		limits = nla_get_bitfield32(info->attrs[DEVLINK_ATTR_RELOAD_LIMITS]);
+		limits_selected = limits.value & limits.selector;
+		if (!limits_selected) {
+			NL_SET_ERR_MSG(info->extack, "Invalid limit selected");
+			return -EINVAL;
+		}
+		/* Pick the lowest selected limit bit. */
+		for (limit = 0 ; limit <= DEVLINK_RELOAD_LIMIT_MAX ; limit++)
+			if (limits_selected & BIT(limit))
+				break;
+		/* UAPI enables multiselection, but currently it is not used */
+		if (limits_selected != BIT(limit)) {
+			NL_SET_ERR_MSG(info->extack, "Multiselection of limit is not supported");
+			return -EOPNOTSUPP;
+		}
+		if (!devlink_reload_limit_is_supported(devlink, limit)) {
+			NL_SET_ERR_MSG(info->extack, "Requested limit is not supported by the driver");
+			return -EOPNOTSUPP;
+		}
+		if (devlink_reload_combination_is_invalid(action, limit)) {
+			NL_SET_ERR_MSG(info->extack, "Requested limit is invalid for this action");
+			return -EINVAL;
+		}
+	}
+	if (info->attrs[DEVLINK_ATTR_NETNS_PID] ||
+	    info->attrs[DEVLINK_ATTR_NETNS_FD] ||
+	    info->attrs[DEVLINK_ATTR_NETNS_ID]) {
+		dest_net = devlink_netns_get(skb, info);
+		if (IS_ERR(dest_net))
+			return PTR_ERR(dest_net);
+		if (!net_eq(dest_net, devlink_net(devlink)) &&
+		    action != DEVLINK_RELOAD_ACTION_DRIVER_REINIT) {
+			NL_SET_ERR_MSG_MOD(info->extack,
+					   "Changing namespace is only supported for reinit action");
+			/* Drop the reference taken by devlink_netns_get(). */
+			put_net(dest_net);
+			return -EOPNOTSUPP;
+		}
+	}
+
+	err = devlink_reload(devlink, dest_net, action, limit, &actions_performed, info->extack);
+
+	if (dest_net)
+		put_net(dest_net);
+
+	if (err)
+		return err;
+	/* For backward compatibility generate reply only if attributes used by user */
+	if (!info->attrs[DEVLINK_ATTR_RELOAD_ACTION] && !info->attrs[DEVLINK_ATTR_RELOAD_LIMITS])
+		return 0;
+
+	return devlink_nl_reload_actions_performed_snd(devlink, actions_performed,
+						       DEVLINK_CMD_RELOAD, info);
+}
+
+/*
+ * Check that the reload-related fields of @ops are consistent.
+ *
+ * Without reload support no actions may be advertised. With reload
+ * support the action mask must be non-empty and within range, the limit
+ * mask must be within range, and the exact (actions, limits) pair must not
+ * match an entry of devlink_reload_invalid_combinations.
+ *
+ * Returns true when @ops passes all checks. Range violations also WARN.
+ */
+bool devlink_reload_actions_valid(const struct devlink_ops *ops)
+{
+	const struct devlink_reload_combination *bad;
+	int idx;
+
+	if (!devlink_reload_supported(ops))
+		return !WARN_ON(ops->reload_actions);
+
+	if (WARN_ON(!ops->reload_actions ||
+		    ops->reload_actions & BIT(DEVLINK_RELOAD_ACTION_UNSPEC) ||
+		    ops->reload_actions >= BIT(__DEVLINK_RELOAD_ACTION_MAX)))
+		return false;
+
+	if (WARN_ON(ops->reload_limits & BIT(DEVLINK_RELOAD_LIMIT_UNSPEC) ||
+		    ops->reload_limits >= BIT(__DEVLINK_RELOAD_LIMIT_MAX)))
+		return false;
+
+	for (idx = 0; idx < ARRAY_SIZE(devlink_reload_invalid_combinations); idx++) {
+		bad = &devlink_reload_invalid_combinations[idx];
+		if (ops->reload_actions == BIT(bad->action) &&
+		    ops->reload_limits == BIT(bad->limit))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Build one eswitch message for @devlink into @msg.
+ *
+ * The mode, inline-mode and encap-mode attributes are each emitted only
+ * when the driver implements the matching getter. A getter or nla_put
+ * failure cancels the message and is returned to the caller.
+ */
+static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
+				   enum devlink_command cmd, u32 portid,
+				   u32 seq, int flags)
+{
+	const struct devlink_ops *ops = devlink->ops;
+	enum devlink_eswitch_encap_mode encap_mode;
+	u8 inline_mode;
+	void *hdr;
+	u16 mode;
+	int err;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	err = devlink_nl_put_handle(msg, devlink);
+	if (err)
+		goto cancel;
+
+	if (ops->eswitch_mode_get) {
+		err = ops->eswitch_mode_get(devlink, &mode);
+		if (!err)
+			err = nla_put_u16(msg, DEVLINK_ATTR_ESWITCH_MODE, mode);
+		if (err)
+			goto cancel;
+	}
+
+	if (ops->eswitch_inline_mode_get) {
+		err = ops->eswitch_inline_mode_get(devlink, &inline_mode);
+		if (!err)
+			err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_INLINE_MODE,
+					 inline_mode);
+		if (err)
+			goto cancel;
+	}
+
+	if (ops->eswitch_encap_mode_get) {
+		err = ops->eswitch_encap_mode_get(devlink, &encap_mode);
+		if (!err)
+			err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_ENCAP_MODE,
+					 encap_mode);
+		if (err)
+			goto cancel;
+	}
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+cancel:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+/*
+ * Netlink doit handler for DEVLINK_CMD_ESWITCH_GET: build the eswitch
+ * message for the instance in info->user_ptr[0] and send it as the reply.
+ */
+int devlink_nl_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct sk_buff *reply;
+	int err;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	err = devlink_nl_eswitch_fill(reply, devlink, DEVLINK_CMD_ESWITCH_GET,
+				      info->snd_portid, info->snd_seq, 0);
+	if (!err)
+		return genlmsg_reply(reply, info);
+
+	nlmsg_free(reply);
+	return err;
+}
+
+/*
+ * Netlink doit handler that applies the eswitch settings present in the
+ * request, in the order mode, inline mode, encap mode.
+ *
+ * Each attribute requires the matching driver setter (-EOPNOTSUPP
+ * otherwise). The first failure is returned as-is; settings already applied
+ * by earlier steps are not rolled back here.
+ */
+int devlink_nl_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *mode_attr = info->attrs[DEVLINK_ATTR_ESWITCH_MODE];
+	struct nlattr *inline_attr = info->attrs[DEVLINK_ATTR_ESWITCH_INLINE_MODE];
+	struct nlattr *encap_attr = info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE];
+	struct devlink *devlink = info->user_ptr[0];
+	const struct devlink_ops *ops = devlink->ops;
+	int err;
+
+	if (mode_attr) {
+		u16 mode;
+
+		if (!ops->eswitch_mode_set)
+			return -EOPNOTSUPP;
+		mode = nla_get_u16(mode_attr);
+		err = devlink_rate_nodes_check(devlink, mode, info->extack);
+		if (err)
+			return err;
+		err = ops->eswitch_mode_set(devlink, mode, info->extack);
+		if (err)
+			return err;
+	}
+
+	if (inline_attr) {
+		u8 inline_mode;
+
+		if (!ops->eswitch_inline_mode_set)
+			return -EOPNOTSUPP;
+		inline_mode = nla_get_u8(inline_attr);
+		err = ops->eswitch_inline_mode_set(devlink, inline_mode,
+						   info->extack);
+		if (err)
+			return err;
+	}
+
+	if (encap_attr) {
+		enum devlink_eswitch_encap_mode encap_mode;
+
+		if (!ops->eswitch_encap_mode_set)
+			return -EOPNOTSUPP;
+		encap_mode = nla_get_u8(encap_attr);
+		err = ops->eswitch_encap_mode_set(devlink, encap_mode,
+						  info->extack);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/**
+ * devlink_info_serial_number_put - Add the serial number to an info request
+ * @req: info request being filled
+ * @sn: serial number string
+ *
+ * Does nothing and returns 0 when @req has no message attached.
+ *
+ * Return: 0 on success or the nla_put_string() error.
+ */
+int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn)
+{
+	if (req->msg)
+		return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_info_serial_number_put);
+
+/**
+ * devlink_info_board_serial_number_put - Add the board serial number to an
+ *	info request
+ * @req: info request being filled
+ * @bsn: board serial number string
+ *
+ * Does nothing and returns 0 when @req has no message attached.
+ *
+ * Return: 0 on success or the nla_put_string() error.
+ */
+int devlink_info_board_serial_number_put(struct devlink_info_req *req,
+					 const char *bsn)
+{
+	if (req->msg)
+		return nla_put_string(req->msg,
+				      DEVLINK_ATTR_INFO_BOARD_SERIAL_NUMBER, bsn);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_info_board_serial_number_put);
+
+/*
+ * Common backend for the devlink_info_version_*_put() helpers.
+ *
+ * The optional req->version_cb is invoked first, for every version, even
+ * when no message is being built. The nested @attr (name + value) is added
+ * only when @req carries a message and @version_value is not empty.
+ *
+ * Returns 0, -EMSGSIZE when the nest cannot be started, or the
+ * nla_put_string() error (the partial nest is cancelled in that case).
+ */
+static int devlink_info_version_put(struct devlink_info_req *req, int attr,
+				    const char *version_name,
+				    const char *version_value,
+				    enum devlink_info_version_type version_type)
+{
+	struct nlattr *entry;
+	int err;
+
+	if (req->version_cb)
+		req->version_cb(version_name, version_type,
+				req->version_cb_priv);
+
+	if (!req->msg || !*version_value)
+		return 0;
+
+	entry = nla_nest_start_noflag(req->msg, attr);
+	if (!entry)
+		return -EMSGSIZE;
+
+	err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_NAME,
+			     version_name);
+	if (!err)
+		err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_VALUE,
+				     version_value);
+	if (err) {
+		nla_nest_cancel(req->msg, entry);
+		return err;
+	}
+
+	nla_nest_end(req->msg, entry);
+	return 0;
+}
+
+/**
+ * devlink_info_version_fixed_put - Report a version under
+ *	DEVLINK_ATTR_INFO_VERSION_FIXED
+ * @req: info request being filled
+ * @version_name: name of the version entry
+ * @version_value: version string
+ *
+ * Return: result of devlink_info_version_put().
+ */
+int devlink_info_version_fixed_put(struct devlink_info_req *req,
+				   const char *version_name,
+				   const char *version_value)
+{
+	const enum devlink_info_version_type type = DEVLINK_INFO_VERSION_TYPE_NONE;
+
+	return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED,
+					version_name, version_value, type);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put);
+
+/**
+ * devlink_info_version_stored_put - Report a version under
+ *	DEVLINK_ATTR_INFO_VERSION_STORED
+ * @req: info request being filled
+ * @version_name: name of the version entry
+ * @version_value: version string
+ *
+ * Return: result of devlink_info_version_put().
+ */
+int devlink_info_version_stored_put(struct devlink_info_req *req,
+				    const char *version_name,
+				    const char *version_value)
+{
+	const enum devlink_info_version_type type = DEVLINK_INFO_VERSION_TYPE_NONE;
+
+	return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED,
+					version_name, version_value, type);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_stored_put);
+
+/**
+ * devlink_info_version_stored_put_ext - Report a version under
+ *	DEVLINK_ATTR_INFO_VERSION_STORED with an explicit version type
+ * @req: info request being filled
+ * @version_name: name of the version entry
+ * @version_value: version string
+ * @version_type: type passed on to the request's version callback
+ *
+ * Return: result of devlink_info_version_put().
+ */
+int devlink_info_version_stored_put_ext(struct devlink_info_req *req,
+					const char *version_name,
+					const char *version_value,
+					enum devlink_info_version_type version_type)
+{
+	const int attr = DEVLINK_ATTR_INFO_VERSION_STORED;
+
+	return devlink_info_version_put(req, attr, version_name, version_value,
+					version_type);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_stored_put_ext);
+
+/**
+ * devlink_info_version_running_put - Report a version under
+ *	DEVLINK_ATTR_INFO_VERSION_RUNNING
+ * @req: info request being filled
+ * @version_name: name of the version entry
+ * @version_value: version string
+ *
+ * Return: result of devlink_info_version_put().
+ */
+int devlink_info_version_running_put(struct devlink_info_req *req,
+				     const char *version_name,
+				     const char *version_value)
+{
+	const enum devlink_info_version_type type = DEVLINK_INFO_VERSION_TYPE_NONE;
+
+	return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING,
+					version_name, version_value, type);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_running_put);
+
+/**
+ * devlink_info_version_running_put_ext - Report a version under
+ *	DEVLINK_ATTR_INFO_VERSION_RUNNING with an explicit version type
+ * @req: info request being filled
+ * @version_name: name of the version entry
+ * @version_value: version string
+ * @version_type: type passed on to the request's version callback
+ *
+ * Return: result of devlink_info_version_put().
+ */
+int devlink_info_version_running_put_ext(struct devlink_info_req *req,
+					 const char *version_name,
+					 const char *version_value,
+					 enum devlink_info_version_type version_type)
+{
+	const int attr = DEVLINK_ATTR_INFO_VERSION_RUNNING;
+
+	return devlink_info_version_put(req, attr, version_name, version_value,
+					version_type);
+}
+EXPORT_SYMBOL_GPL(devlink_info_version_running_put_ext);
+
+/*
+ * Add DEVLINK_ATTR_INFO_DRIVER_NAME to the request's message when @drv is
+ * set and has a non-empty name; otherwise do nothing and return 0.
+ */
+static int devlink_nl_driver_info_get(struct device_driver *drv,
+				      struct devlink_info_req *req)
+{
+	if (!drv || !drv->name[0])
+		return 0;
+
+	return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME,
+			      drv->name);
+}
+
+/*
+ * Build one info message for @devlink into @msg: the handle, whatever the
+ * driver's optional info_get() callback reports, and the driver name.
+ *
+ * Returns 0 on success. On failure the message is cancelled and the error
+ * is returned (-EMSGSIZE for header/handle problems, otherwise the
+ * callback's or helper's error).
+ */
+static int
+devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink,
+		     enum devlink_command cmd, u32 portid,
+		     u32 seq, int flags, struct netlink_ext_ack *extack)
+{
+	struct device *dev = devlink_to_dev(devlink);
+	struct devlink_info_req req = {};
+	void *hdr;
+	int err;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink)) {
+		err = -EMSGSIZE;
+		goto cancel;
+	}
+
+	req.msg = msg;
+	if (devlink->ops->info_get) {
+		err = devlink->ops->info_get(devlink, &req, extack);
+		if (err)
+			goto cancel;
+	}
+
+	err = devlink_nl_driver_info_get(dev->driver, &req);
+	if (err)
+		goto cancel;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+cancel:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+/*
+ * Netlink doit handler for DEVLINK_CMD_INFO_GET: build the info message for
+ * the instance in info->user_ptr[0] and send it as the reply.
+ */
+int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct sk_buff *reply;
+	int err;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	err = devlink_nl_info_fill(reply, devlink, DEVLINK_CMD_INFO_GET,
+				   info->snd_portid, info->snd_seq, 0,
+				   info->extack);
+	if (!err)
+		return genlmsg_reply(reply, info);
+
+	nlmsg_free(reply);
+	return err;
+}
+
+/*
+ * Per-instance callback for the info dump. -EOPNOTSUPP from the fill
+ * helper is turned into 0; every other result is passed through.
+ */
+static int
+devlink_nl_info_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
+			     struct netlink_callback *cb, int flags)
+{
+	int err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET,
+				       NETLINK_CB(cb->skb).portid,
+				       cb->nlh->nlmsg_seq, flags,
+				       cb->extack);
+
+	return err == -EOPNOTSUPP ? 0 : err;
+}
+
+/* Netlink dumpit handler for info: hands devlink_nl_info_get_dump_one() to
+ * the generic devlink_nl_dumpit() helper as the per-instance callback.
+ */
+int devlink_nl_info_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(msg, cb, devlink_nl_info_get_dump_one);
+}
+
+/*
+ * Build a flash update notification for @devlink into @msg.
+ *
+ * Only DEVLINK_CMD_FLASH_UPDATE_STATUS messages carry the status fields
+ * from @params (optional status message and component, then done, total
+ * and timeout); the other commands carry the handle only.
+ *
+ * Returns 0 or -EMSGSIZE (message cancelled).
+ */
+static int devlink_nl_flash_update_fill(struct sk_buff *msg,
+					struct devlink *devlink,
+					enum devlink_command cmd,
+					struct devlink_flash_notify *params)
+{
+	void *hdr;
+
+	hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto cancel;
+
+	if (cmd == DEVLINK_CMD_FLASH_UPDATE_STATUS) {
+		if (params->status_msg &&
+		    nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG,
+				   params->status_msg))
+			goto cancel;
+		if (params->component &&
+		    nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,
+				   params->component))
+			goto cancel;
+		if (devlink_nl_put_u64(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,
+				       params->done) ||
+		    devlink_nl_put_u64(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,
+				       params->total) ||
+		    devlink_nl_put_u64(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TIMEOUT,
+				       params->timeout))
+			goto cancel;
+	}
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+cancel:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/*
+ * Send a flash update notification of type @cmd for @devlink.
+ *
+ * Nothing is sent when the instance is not registered or when
+ * devlink_nl_notify_need() says no notification is needed. Allocation and
+ * fill failures are dropped silently, since the function returns void.
+ */
+static void __devlink_flash_update_notify(struct devlink *devlink,
+					  enum devlink_command cmd,
+					  struct devlink_flash_notify *params)
+{
+	struct sk_buff *msg;
+
+	WARN_ON(!(cmd == DEVLINK_CMD_FLASH_UPDATE ||
+		  cmd == DEVLINK_CMD_FLASH_UPDATE_END ||
+		  cmd == DEVLINK_CMD_FLASH_UPDATE_STATUS));
+
+	if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink))
+		return;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	if (devlink_nl_flash_update_fill(msg, devlink, cmd, params)) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	devlink_nl_notify_send(devlink, msg);
+}
+
+/* Send DEVLINK_CMD_FLASH_UPDATE with zeroed status fields. */
+static void devlink_flash_update_begin_notify(struct devlink *devlink)
+{
+	struct devlink_flash_notify no_status = {};
+
+	__devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE,
+				      &no_status);
+}
+
+/* Send DEVLINK_CMD_FLASH_UPDATE_END with zeroed status fields. */
+static void devlink_flash_update_end_notify(struct devlink *devlink)
+{
+	struct devlink_flash_notify no_status = {};
+
+	__devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_END,
+				      &no_status);
+}
+
+/**
+ * devlink_flash_update_status_notify - Send a flash update progress
+ *	notification
+ * @devlink: devlink
+ * @status_msg: optional status text (attribute omitted when NULL)
+ * @component: optional component name (attribute omitted when NULL)
+ * @done: progress value reported as the DONE attribute
+ * @total: progress value reported as the TOTAL attribute
+ *
+ * The timeout field of the notification is left at zero.
+ */
+void devlink_flash_update_status_notify(struct devlink *devlink,
+					const char *status_msg,
+					const char *component,
+					unsigned long done,
+					unsigned long total)
+{
+	struct devlink_flash_notify status = {};
+
+	status.status_msg = status_msg;
+	status.component = component;
+	status.done = done;
+	status.total = total;
+
+	__devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_STATUS,
+				      &status);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify);
+
+/**
+ * devlink_flash_update_timeout_notify - Send a flash update status
+ *	notification carrying a timeout
+ * @devlink: devlink
+ * @status_msg: optional status text (attribute omitted when NULL)
+ * @component: optional component name (attribute omitted when NULL)
+ * @timeout: value reported as the TIMEOUT attribute
+ *
+ * The done and total fields of the notification are left at zero.
+ */
+void devlink_flash_update_timeout_notify(struct devlink *devlink,
+					 const char *status_msg,
+					 const char *component,
+					 unsigned long timeout)
+{
+	struct devlink_flash_notify status = {};
+
+	status.status_msg = status_msg;
+	status.component = component;
+	status.timeout = timeout;
+
+	__devlink_flash_update_notify(devlink, DEVLINK_CMD_FLASH_UPDATE_STATUS,
+				      &status);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_timeout_notify);
+
+/* Context handed to devlink_flash_component_lookup_cb() through
+ * devlink_info_req.version_cb_priv while searching the versions reported
+ * by the driver's info_get() for a flash component name.
+ */
+struct devlink_flash_component_lookup_ctx {
+ const char *lookup_name; /* component name being searched for */
+ bool lookup_name_found; /* set once a matching component version was seen */
+};
+
+/*
+ * Version callback used for component lookup: marks the context as found
+ * when a version of type DEVLINK_INFO_VERSION_TYPE_COMPONENT has the name
+ * being searched for. Once found, later versions are ignored.
+ */
+static void
+devlink_flash_component_lookup_cb(const char *version_name,
+				  enum devlink_info_version_type version_type,
+				  void *version_cb_priv)
+{
+	struct devlink_flash_component_lookup_ctx *ctx = version_cb_priv;
+
+	if (ctx->lookup_name_found)
+		return;
+	if (version_type != DEVLINK_INFO_VERSION_TYPE_COMPONENT)
+		return;
+
+	if (!strcmp(ctx->lookup_name, version_name))
+		ctx->lookup_name_found = true;
+}
+
+/*
+ * Validate the optional flash component attribute.
+ *
+ * With no attribute this is a no-op returning 0 and *@p_component is left
+ * untouched. Otherwise the driver's info_get() is run with a lookup
+ * callback and no message; the component is accepted only if the driver
+ * reports a component-type version with that name.
+ *
+ * Returns 0 (with *@p_component pointing into the attribute payload),
+ * -EOPNOTSUPP when the driver has no info_get(), -EINVAL when the
+ * component is unknown, or the info_get() error.
+ */
+static int devlink_flash_component_get(struct devlink *devlink,
+				       struct nlattr *nla_component,
+				       const char **p_component,
+				       struct netlink_ext_ack *extack)
+{
+	struct devlink_flash_component_lookup_ctx ctx = {};
+	struct devlink_info_req req = {};
+	const char *name;
+	int err;
+
+	if (!nla_component)
+		return 0;
+
+	name = nla_data(nla_component);
+
+	if (!devlink->ops->info_get) {
+		NL_SET_ERR_MSG_ATTR(extack, nla_component,
+				    "component update is not supported by this device");
+		return -EOPNOTSUPP;
+	}
+
+	ctx.lookup_name = name;
+	req.version_cb = devlink_flash_component_lookup_cb;
+	req.version_cb_priv = &ctx;
+
+	err = devlink->ops->info_get(devlink, &req, NULL);
+	if (err)
+		return err;
+
+	if (ctx.lookup_name_found) {
+		*p_component = name;
+		return 0;
+	}
+
+	NL_SET_ERR_MSG_ATTR(extack, nla_component,
+			    "selected component is not supported by this device");
+	return -EINVAL;
+}
+
+/*
+ * Netlink doit handler for flash update.
+ *
+ * Requires the driver's flash_update() callback and the file name
+ * attribute. The optional component is validated against the driver's
+ * info, the optional overwrite mask against supported_flash_update_params.
+ * The firmware is then requested, flashed between begin/end notifications,
+ * and released.
+ *
+ * Returns the flash_update() result or an earlier validation/firmware error.
+ */
+int devlink_nl_flash_update_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink_flash_update_params params = {};
+	struct devlink *devlink = info->user_ptr[0];
+	struct nlattr *mask_attr, *name_attr;
+	int err;
+
+	if (!devlink->ops->flash_update)
+		return -EOPNOTSUPP;
+
+	if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME))
+		return -EINVAL;
+
+	err = devlink_flash_component_get(devlink,
+					  info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT],
+					  &params.component, info->extack);
+	if (err)
+		return err;
+
+	mask_attr = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK];
+	if (mask_attr) {
+		struct nla_bitfield32 sections;
+
+		if (!(devlink->ops->supported_flash_update_params &
+		      DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK)) {
+			NL_SET_ERR_MSG_ATTR(info->extack, mask_attr,
+					    "overwrite settings are not supported by this device");
+			return -EOPNOTSUPP;
+		}
+		sections = nla_get_bitfield32(mask_attr);
+		params.overwrite_mask = sections.value & sections.selector;
+	}
+
+	name_attr = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME];
+	err = request_firmware(&params.fw, nla_data(name_attr), devlink->dev);
+	if (err) {
+		NL_SET_ERR_MSG_ATTR(info->extack, name_attr,
+				    "failed to locate the requested firmware file");
+		return err;
+	}
+
+	devlink_flash_update_begin_notify(devlink);
+	err = devlink->ops->flash_update(devlink, &params, info->extack);
+	devlink_flash_update_end_notify(devlink);
+
+	release_firmware(params.fw);
+
+	return err;
+}
+
+/*
+ * Append every running version value reported by the driver's info_get()
+ * to @buf, each followed by a single space.
+ *
+ * The values are collected by letting the driver fill a scratch netlink
+ * message and walking its DEVLINK_ATTR_INFO_VERSION_RUNNING nests. @buf is
+ * only ever appended to with strlcat(), so it must already hold a
+ * NUL-terminated string. Allocation and info_get() failures leave @buf
+ * unchanged. The caller must have checked that ops->info_get is set.
+ */
+static void __devlink_compat_running_version(struct devlink *devlink,
+					     char *buf, size_t len)
+{
+	struct devlink_info_req req = {};
+	const struct nlattr *ver;
+	struct sk_buff *msg;
+	int rem;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	req.msg = msg;
+	if (!devlink->ops->info_get(devlink, &req, NULL)) {
+		nla_for_each_attr_type(ver, DEVLINK_ATTR_INFO_VERSION_RUNNING,
+				       (void *)msg->data, msg->len, rem) {
+			const struct nlattr *val;
+			int rem_val;
+
+			nla_for_each_nested_type(val, DEVLINK_ATTR_INFO_VERSION_VALUE,
+						 ver, rem_val) {
+				strlcat(buf, nla_data(val), len);
+				strlcat(buf, " ", len);
+			}
+		}
+	}
+
+	nlmsg_consume(msg);
+}
+
+/*
+ * Collect the running versions of @devlink into @buf (see
+ * __devlink_compat_running_version()). Does nothing when the driver has no
+ * info_get() callback; the instance lock is taken and the work is done
+ * only while the instance is registered.
+ */
+void devlink_compat_running_version(struct devlink *devlink,
+				    char *buf, size_t len)
+{
+	if (devlink->ops->info_get) {
+		devl_lock(devlink);
+		if (devl_is_registered(devlink))
+			__devlink_compat_running_version(devlink, buf, len);
+		devl_unlock(devlink);
+	}
+}
+
+/*
+ * Flash @file_name to the device behind @devlink with default parameters,
+ * under the instance lock.
+ *
+ * Returns -ENODEV when the instance is not registered, -EOPNOTSUPP when the
+ * driver has no flash_update() callback, the request_firmware() error, or
+ * the flash_update() result. Begin/end notifications bracket the flash.
+ */
+int devlink_compat_flash_update(struct devlink *devlink, const char *file_name)
+{
+	struct devlink_flash_update_params params = {};
+	int ret;
+
+	devl_lock(devlink);
+
+	if (!devl_is_registered(devlink)) {
+		ret = -ENODEV;
+	} else if (!devlink->ops->flash_update) {
+		ret = -EOPNOTSUPP;
+	} else {
+		ret = request_firmware(&params.fw, file_name, devlink->dev);
+		if (!ret) {
+			devlink_flash_update_begin_notify(devlink);
+			ret = devlink->ops->flash_update(devlink, &params, NULL);
+			devlink_flash_update_end_notify(devlink);
+
+			release_firmware(params.fw);
+		}
+	}
+
+	devl_unlock(devlink);
+
+	return ret;
+}
+
+/*
+ * Build a DEVLINK_CMD_SELFTESTS_GET message for @devlink into @msg.
+ *
+ * Inside the DEVLINK_ATTR_SELFTESTS nest, a flag attribute is added for
+ * every selftest id for which the driver's selftest_check() returns true.
+ * The caller must have verified that ops->selftest_check is set.
+ *
+ * Returns 0, or a negative errno with the message cancelled.
+ */
+static int
+devlink_nl_selftests_fill(struct sk_buff *msg, struct devlink *devlink,
+			  u32 portid, u32 seq, int flags,
+			  struct netlink_ext_ack *extack)
+{
+	struct nlattr *nest;
+	int err = -EMSGSIZE;
+	void *hdr;
+	int id;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags,
+			  DEVLINK_CMD_SELFTESTS_GET);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto cancel;
+
+	nest = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS);
+	if (!nest)
+		goto cancel;
+
+	for (id = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1;
+	     id <= DEVLINK_ATTR_SELFTEST_ID_MAX; id++) {
+		if (!devlink->ops->selftest_check(devlink, id, extack))
+			continue;
+
+		err = nla_put_flag(msg, id);
+		if (err)
+			goto cancel;
+	}
+
+	nla_nest_end(msg, nest);
+	genlmsg_end(msg, hdr);
+	return 0;
+
+cancel:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+/*
+ * Netlink doit handler for DEVLINK_CMD_SELFTESTS_GET. Returns -EOPNOTSUPP
+ * when the driver has no selftest_check() callback; otherwise builds the
+ * selftests message and sends it as the reply.
+ */
+int devlink_nl_selftests_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct sk_buff *reply;
+	int err;
+
+	if (!devlink->ops->selftest_check)
+		return -EOPNOTSUPP;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	err = devlink_nl_selftests_fill(reply, devlink, info->snd_portid,
+					info->snd_seq, 0, info->extack);
+	if (!err)
+		return genlmsg_reply(reply, info);
+
+	nlmsg_free(reply);
+	return err;
+}
+
+/*
+ * Per-instance callback for the selftests dump. Instances whose driver has
+ * no selftest_check() callback contribute nothing and return 0.
+ */
+static int devlink_nl_selftests_get_dump_one(struct sk_buff *msg,
+					     struct devlink *devlink,
+					     struct netlink_callback *cb,
+					     int flags)
+{
+	if (devlink->ops->selftest_check)
+		return devlink_nl_selftests_fill(msg, devlink,
+						 NETLINK_CB(cb->skb).portid,
+						 cb->nlh->nlmsg_seq, flags,
+						 cb->extack);
+
+	return 0;
+}
+
+/* Netlink dumpit handler for selftests: hands
+ * devlink_nl_selftests_get_dump_one() to the generic devlink_nl_dumpit()
+ * helper as the per-instance callback.
+ */
+int devlink_nl_selftests_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_selftests_get_dump_one);
+}
+
+/*
+ * Append one DEVLINK_ATTR_SELFTEST_RESULT nest (test id + status) to @skb.
+ * Returns 0, or -EMSGSIZE with the partial nest cancelled.
+ */
+static int devlink_selftest_result_put(struct sk_buff *skb, unsigned int id,
+				       enum devlink_selftest_status test_status)
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, DEVLINK_ATTR_SELFTEST_RESULT);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (!nla_put_u32(skb, DEVLINK_ATTR_SELFTEST_RESULT_ID, id) &&
+	    !nla_put_u8(skb, DEVLINK_ATTR_SELFTEST_RESULT_STATUS, test_status)) {
+		nla_nest_end(skb, nest);
+		return 0;
+	}
+
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+/* Policy for the attributes nested inside DEVLINK_ATTR_SELFTESTS, used by
+ * devlink_nl_selftests_run_doit() when parsing the nest. Only
+ * DEVLINK_ATTR_SELFTEST_ID_FLASH is listed, as a flag.
+ */
+static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = {
+ [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG },
+};
+
+/*
+ * Netlink doit handler for DEVLINK_CMD_SELFTESTS_RUN.
+ *
+ * For every selftest id flagged in the request's DEVLINK_ATTR_SELFTESTS
+ * nest, one result entry is added to the reply: the status returned by the
+ * driver's selftest_run(), or DEVLINK_SELFTEST_STATUS_SKIP when
+ * selftest_check() rejects the id.
+ *
+ * Returns -EOPNOTSUPP when the driver lacks selftest_run() or
+ * selftest_check(), -EINVAL when the nest is missing, the parse error,
+ * -ENOMEM, -EMSGSIZE when the reply cannot be built, or the
+ * genlmsg_reply() result.
+ */
+int devlink_nl_selftests_run_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *tb[DEVLINK_ATTR_SELFTEST_ID_MAX + 1];
+	struct devlink *devlink = info->user_ptr[0];
+	struct nlattr *request, *results;
+	struct sk_buff *msg;
+	void *hdr;
+	int err;
+	int id;
+
+	if (!devlink->ops->selftest_run || !devlink->ops->selftest_check)
+		return -EOPNOTSUPP;
+
+	if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SELFTESTS))
+		return -EINVAL;
+
+	request = info->attrs[DEVLINK_ATTR_SELFTESTS];
+
+	err = nla_parse_nested(tb, DEVLINK_ATTR_SELFTEST_ID_MAX, request,
+			       devlink_selftest_nl_policy, info->extack);
+	if (err < 0)
+		return err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg)
+		return -ENOMEM;
+
+	/* Every failure from here on is reported as -EMSGSIZE. */
+	err = -EMSGSIZE;
+	hdr = genlmsg_put(msg, info->snd_portid, info->snd_seq,
+			  &devlink_nl_family, 0, DEVLINK_CMD_SELFTESTS_RUN);
+	if (!hdr)
+		goto err_free;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto err_cancel_msg;
+
+	results = nla_nest_start(msg, DEVLINK_ATTR_SELFTESTS);
+	if (!results)
+		goto err_cancel_msg;
+
+	for (id = DEVLINK_ATTR_SELFTEST_ID_UNSPEC + 1;
+	     id <= DEVLINK_ATTR_SELFTEST_ID_MAX; id++) {
+		enum devlink_selftest_status status;
+
+		if (!nla_get_flag(tb[id]))
+			continue;
+
+		if (devlink->ops->selftest_check(devlink, id, info->extack))
+			status = devlink->ops->selftest_run(devlink, id,
+							    info->extack);
+		else
+			status = DEVLINK_SELFTEST_STATUS_SKIP;
+
+		if (devlink_selftest_result_put(msg, id, status))
+			goto err_cancel_nest;
+	}
+
+	nla_nest_end(msg, results);
+	genlmsg_end(msg, hdr);
+	return genlmsg_reply(msg, info);
+
+err_cancel_nest:
+	nla_nest_cancel(msg, results);
+err_cancel_msg:
+	genlmsg_cancel(msg, hdr);
+err_free:
+	nlmsg_free(msg);
+	return err;
+}
diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h
new file mode 100644
index 000000000000..14eaad9cfe35
--- /dev/null
+++ b/net/devlink/devl_internal.h
@@ -0,0 +1,304 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include <linux/device.h>
+#include <linux/etherdevice.h>
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/notifier.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/xarray.h>
+#include <net/devlink.h>
+#include <net/net_namespace.h>
+#include <net/rtnetlink.h>
+#include <rdma/ib_verbs.h>
+
+#include "netlink_gen.h"
+
+struct devlink_rel;
+
+#define DEVLINK_REGISTERED XA_MARK_1
+
+#define DEVLINK_RELOAD_STATS_ARRAY_SIZE \
+ (__DEVLINK_RELOAD_LIMIT_MAX * __DEVLINK_RELOAD_ACTION_MAX)
+
+struct devlink_dev_stats { /* reload counters; NOTE(review): presumably indexed by (limit, action) - confirm */
+ u32 reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; /* limit-count x action-count slots */
+ u32 remote_reload_stats[DEVLINK_RELOAD_STATS_ARRAY_SIZE]; /* same size as reload_stats */
+};
+
+struct devlink {
+ u32 index; /* index of this instance in the global devlinks xarray */
+ struct xarray ports;
+ struct list_head rate_list;
+ struct list_head sb_list;
+ struct list_head dpipe_table_list; /* dpipe tables; entries are added and removed with the RCU list helpers */
+ struct list_head resource_list;
+ struct xarray params;
+ struct list_head region_list;
+ struct list_head reporter_list;
+ struct devlink_dpipe_headers *dpipe_headers; /* NULL until headers are registered */
+ struct list_head trap_list;
+ struct list_head trap_group_list;
+ struct list_head trap_policer_list;
+ struct list_head linecard_list;
+ const struct devlink_ops *ops;
+ struct xarray snapshot_ids;
+ struct devlink_dev_stats stats;
+ struct device *dev; /* supplies the bus and device names put into netlink handles */
+ possible_net_t _net;
+ /* Serializes access to devlink instance specific objects such as
+ * port, sb, dpipe, resource, params, region, traps and more.
+ */
+ struct mutex lock;
+ struct lock_class_key lock_key;
+ u8 reload_failed:1;
+ refcount_t refcount;
+ struct rcu_work rwork;
+ struct devlink_rel *rel;
+ struct xarray nested_rels;
+ char priv[] __aligned(NETDEV_ALIGN); /* flexible array member; NOTE(review): presumably the driver-private area - confirm */
+};
+
+extern struct xarray devlinks;
+extern struct genl_family devlink_nl_family;
+
+/* devlink instances become accessible from user space after the
+ * devlink_register() call. This logical barrier allows us to have certain
+ * expectations related to locking.
+ *
+ * Before *_register() - we are in the initialization stage and no parallel
+ * access to the devlink instance is possible. All drivers perform that phase
+ * by implicitly holding device_lock.
+ *
+ * After *_register() - users and the driver can access the devlink instance
+ * at the same time.
+ */
+#define ASSERT_DEVLINK_REGISTERED(d) \
+ WARN_ON_ONCE(!xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED))
+#define ASSERT_DEVLINK_NOT_REGISTERED(d) \
+ WARN_ON_ONCE(xa_get_mark(&devlinks, (d)->index, DEVLINK_REGISTERED))
+
+/* Iterate over the devlink instances returned by devlinks_xa_find_get(),
+ * i.e. those a reference could be taken on. devlink_put() needs to be called
+ * for each iterated devlink pointer in the loop body to release the reference.
+ */
+#define devlinks_xa_for_each_registered_get(net, index, devlink) \
+ for (index = 0; (devlink = devlinks_xa_find_get(net, &index)); index++)
+
+struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp);
+
+static inline bool __devl_is_registered(struct devlink *devlink)
+{
+ return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); /* mark test only; no lock assertion here */
+}
+
+static inline bool devl_is_registered(struct devlink *devlink)
+{
+ devl_assert_locked(devlink); /* asserts the instance lock before testing the mark */
+ return __devl_is_registered(devlink);
+}
+
+static inline void devl_dev_lock(struct devlink *devlink, bool dev_lock)
+{
+ if (dev_lock) /* optionally take the device lock first ... */
+ device_lock(devlink->dev);
+ devl_lock(devlink); /* ... then always the devlink instance lock */
+}
+
+static inline void devl_dev_unlock(struct devlink *devlink, bool dev_lock)
+{
+ devl_unlock(devlink); /* release in the reverse order of devl_dev_lock() */
+ if (dev_lock) /* must match the value passed to devl_dev_lock() */
+ device_unlock(devlink->dev);
+}
+
+typedef void devlink_rel_notify_cb_t(struct devlink *devlink, u32 obj_index);
+typedef void devlink_rel_cleanup_cb_t(struct devlink *devlink, u32 obj_index,
+ u32 rel_index);
+
+void devlink_rel_nested_in_clear(u32 rel_index);
+int devlink_rel_nested_in_add(u32 *rel_index, u32 devlink_index,
+ u32 obj_index, devlink_rel_notify_cb_t *notify_cb,
+ devlink_rel_cleanup_cb_t *cleanup_cb,
+ struct devlink *devlink);
+void devlink_rel_nested_in_notify(struct devlink *devlink);
+int devlink_rel_devlink_handle_put(struct sk_buff *msg, struct devlink *devlink,
+ u32 rel_index, int attrtype,
+ bool *msg_updated);
+
+/* Netlink */
+enum devlink_multicast_groups {
+ DEVLINK_MCGRP_CONFIG, /* group used by devlink_nl_notify_need() and the notify senders */
+};
+
+/* state held across netlink dumps */
+struct devlink_nl_dump_state { /* overlaid on netlink_callback ctx by devlink_dump_state() */
+ unsigned long instance;
+ int idx;
+ union { /* the per-command members below share storage */
+ /* DEVLINK_CMD_REGION_READ */
+ struct {
+ u64 start_offset;
+ };
+ /* DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET */
+ struct {
+ u64 dump_ts;
+ };
+ };
+};
+
+typedef int devlink_nl_dump_one_func_t(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct netlink_callback *cb,
+ int flags);
+
+struct devlink *
+devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs,
+ bool dev_lock);
+
+int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb,
+ devlink_nl_dump_one_func_t *dump_one);
+
+static inline struct devlink_nl_dump_state *
+devlink_dump_state(struct netlink_callback *cb)
+{
+ NL_ASSERT_CTX_FITS(struct devlink_nl_dump_state); /* NOTE(review): presumably a size check against cb->ctx - confirm */
+
+ return (struct devlink_nl_dump_state *)cb->ctx; /* the dump state lives in place inside cb->ctx */
+}
+
+/* Identify @devlink in @msg by bus name and device name; -EMSGSIZE if full. */
+static inline int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
+{
+	struct device *dev = devlink->dev;
+	bool failed = nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, dev->bus->name) ||
+		      nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(dev));
+
+	return failed ? -EMSGSIZE : 0;
+}
+
+static inline int devlink_nl_put_u64(struct sk_buff *msg, int attrtype, u64 val)
+{
+ return nla_put_u64_64bit(msg, attrtype, val, DEVLINK_ATTR_PAD); /* DEVLINK_ATTR_PAD is passed as the pad attribute type */
+}
+
+int devlink_nl_put_nested_handle(struct sk_buff *msg, struct net *net,
+ struct devlink *devlink, int attrtype);
+int devlink_nl_msg_reply_and_new(struct sk_buff **msg, struct genl_info *info);
+
+static inline bool devlink_nl_notify_need(struct devlink *devlink)
+{
+ return genl_has_listeners(&devlink_nl_family, devlink_net(devlink), /* listeners in the instance's netns only */
+ DEVLINK_MCGRP_CONFIG);
+}
+
+struct devlink_obj_desc { /* passed as the data argument of devlink_nl_notify_filter() */
+ struct rcu_head rcu;
+ const char *bus_name; /* borrowed pointer, set by devlink_nl_obj_desc_init() */
+ const char *dev_name; /* borrowed pointer, set by devlink_nl_obj_desc_init() */
+ unsigned int port_index; /* meaningful only when port_index_valid is true */
+ bool port_index_valid;
+ long data[]; /* flexible array member */
+};
+
+static inline void devlink_nl_obj_desc_init(struct devlink_obj_desc *desc,
+ struct devlink *devlink)
+{
+ memset(desc, 0, sizeof(*desc)); /* also leaves port_index_valid false */
+ desc->bus_name = devlink->dev->bus->name; /* pointers are stored, the strings are not copied */
+ desc->dev_name = dev_name(devlink->dev);
+}
+
+static inline void devlink_nl_obj_desc_port_set(struct devlink_obj_desc *desc,
+ struct devlink_port *devlink_port)
+{
+ desc->port_index = devlink_port->index; /* record which port the descriptor refers to */
+ desc->port_index_valid = true; /* marks port_index as set */
+}
+
+int devlink_nl_notify_filter(struct sock *dsk, struct sk_buff *skb, void *data);
+
+static inline void devlink_nl_notify_send_desc(struct devlink *devlink,
+ struct sk_buff *msg,
+ struct devlink_obj_desc *desc)
+{
+ genlmsg_multicast_netns_filtered(&devlink_nl_family, /* return value is ignored */
+ devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG,
+ GFP_KERNEL,
+ devlink_nl_notify_filter, desc); /* filter callback and the data it receives */
+}
+
+static inline void devlink_nl_notify_send(struct devlink *devlink,
+ struct sk_buff *msg)
+{
+ struct devlink_obj_desc desc; /* on-stack descriptor carrying only bus and device name */
+
+ devlink_nl_obj_desc_init(&desc, devlink);
+ devlink_nl_notify_send_desc(devlink, msg, &desc);
+}
+
+/* Notify */
+void devlink_notify_register(struct devlink *devlink);
+void devlink_notify_unregister(struct devlink *devlink);
+void devlink_ports_notify_register(struct devlink *devlink);
+void devlink_ports_notify_unregister(struct devlink *devlink);
+void devlink_params_notify_register(struct devlink *devlink);
+void devlink_params_notify_unregister(struct devlink *devlink);
+void devlink_regions_notify_register(struct devlink *devlink);
+void devlink_regions_notify_unregister(struct devlink *devlink);
+void devlink_trap_policers_notify_register(struct devlink *devlink);
+void devlink_trap_policers_notify_unregister(struct devlink *devlink);
+void devlink_trap_groups_notify_register(struct devlink *devlink);
+void devlink_trap_groups_notify_unregister(struct devlink *devlink);
+void devlink_traps_notify_register(struct devlink *devlink);
+void devlink_traps_notify_unregister(struct devlink *devlink);
+void devlink_rates_notify_register(struct devlink *devlink);
+void devlink_rates_notify_unregister(struct devlink *devlink);
+void devlink_linecards_notify_register(struct devlink *devlink);
+void devlink_linecards_notify_unregister(struct devlink *devlink);
+
+/* Ports */
+#define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port) \
+ WARN_ON_ONCE(!(devlink_port)->initialized)
+
+struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
+ unsigned int port_index);
+int devlink_port_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr);
+struct devlink_port *
+devlink_port_get_from_info(struct devlink *devlink, struct genl_info *info);
+struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink,
+ struct nlattr **attrs);
+
+/* Reload */
+bool devlink_reload_actions_valid(const struct devlink_ops *ops);
+int devlink_reload(struct devlink *devlink, struct net *dest_net,
+ enum devlink_reload_action action,
+ enum devlink_reload_limit limit,
+ u32 *actions_performed, struct netlink_ext_ack *extack);
+
+static inline bool devlink_reload_supported(const struct devlink_ops *ops)
+{
+ return ops->reload_down && ops->reload_up; /* both callbacks must be provided */
+}
+
+/* Params */
+void devlink_params_driverinit_load_new(struct devlink *devlink);
+
+/* Resources */
+struct devlink_resource;
+int devlink_resources_validate(struct devlink *devlink,
+ struct devlink_resource *resource,
+ struct genl_info *info);
+
+/* Rates */
+int devlink_rate_nodes_check(struct devlink *devlink, u16 mode,
+ struct netlink_ext_ack *extack);
+
+/* Linecards */
+unsigned int devlink_linecard_index(struct devlink_linecard *linecard);
diff --git a/net/devlink/dpipe.c b/net/devlink/dpipe.c
new file mode 100644
index 000000000000..e55701b007f0
--- /dev/null
+++ b/net/devlink/dpipe.c
@@ -0,0 +1,915 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include "devl_internal.h"
+
+static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = { /* referenced only by the header below */
+ {
+ .name = "destination mac",
+ .id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC,
+ .bitwidth = 48, /* field width in bits */
+ },
+};
+
+struct devlink_dpipe_header devlink_dpipe_header_ethernet = {
+ .name = "ethernet",
+ .id = DEVLINK_DPIPE_HEADER_ETHERNET,
+ .fields = devlink_dpipe_fields_ethernet,
+ .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ethernet), /* follows the array automatically */
+ .global = true,
+};
+EXPORT_SYMBOL_GPL(devlink_dpipe_header_ethernet);
+
+static struct devlink_dpipe_field devlink_dpipe_fields_ipv4[] = { /* referenced only by the header below */
+ {
+ .name = "destination ip",
+ .id = DEVLINK_DPIPE_FIELD_IPV4_DST_IP,
+ .bitwidth = 32, /* field width in bits */
+ },
+};
+
+struct devlink_dpipe_header devlink_dpipe_header_ipv4 = {
+ .name = "ipv4",
+ .id = DEVLINK_DPIPE_HEADER_IPV4,
+ .fields = devlink_dpipe_fields_ipv4,
+ .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv4), /* follows the array automatically */
+ .global = true,
+};
+EXPORT_SYMBOL_GPL(devlink_dpipe_header_ipv4);
+
+static struct devlink_dpipe_field devlink_dpipe_fields_ipv6[] = { /* referenced only by the header below */
+ {
+ .name = "destination ip",
+ .id = DEVLINK_DPIPE_FIELD_IPV6_DST_IP,
+ .bitwidth = 128, /* field width in bits */
+ },
+};
+
+struct devlink_dpipe_header devlink_dpipe_header_ipv6 = {
+ .name = "ipv6",
+ .id = DEVLINK_DPIPE_HEADER_IPV6,
+ .fields = devlink_dpipe_fields_ipv6,
+ .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv6), /* follows the array automatically */
+ .global = true,
+};
+EXPORT_SYMBOL_GPL(devlink_dpipe_header_ipv6);
+
+/* Emit one DEVLINK_ATTR_DPIPE_MATCH nest describing @match into @skb. */
+int devlink_dpipe_match_put(struct sk_buff *skb,
+			    struct devlink_dpipe_match *match)
+{
+	struct devlink_dpipe_header *hdr = match->header;
+	struct devlink_dpipe_field *fld = &hdr->fields[match->field_id];
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_MATCH);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_MATCH_TYPE, match->type) ||
+	    nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, match->header_index) ||
+	    nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, hdr->id) ||
+	    nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, fld->id) ||
+	    nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, hdr->global)) {
+		/* Drop the partially filled nest again. */
+		nla_nest_cancel(skb, nest);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_match_put);
+
+/* Wrap the output of the driver's matches_dump() for @table in a
+ * DEVLINK_ATTR_DPIPE_TABLE_MATCHES nest; the nest is cancelled on failure.
+ */
+static int devlink_dpipe_matches_put(struct devlink_dpipe_table *table,
+				     struct sk_buff *skb)
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_TABLE_MATCHES);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (table->table_ops->matches_dump(table->priv, skb)) {
+		nla_nest_cancel(skb, nest);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
+/* Emit one DEVLINK_ATTR_DPIPE_ACTION nest describing @action into @skb. */
+int devlink_dpipe_action_put(struct sk_buff *skb,
+			     struct devlink_dpipe_action *action)
+{
+	struct devlink_dpipe_header *hdr = action->header;
+	struct devlink_dpipe_field *fld = &hdr->fields[action->field_id];
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_ACTION);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_ACTION_TYPE, action->type) ||
+	    nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, action->header_index) ||
+	    nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, hdr->id) ||
+	    nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, fld->id) ||
+	    nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, hdr->global)) {
+		/* Drop the partially filled nest again. */
+		nla_nest_cancel(skb, nest);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_action_put);
+
+/* Wrap the output of the driver's actions_dump() for @table in a
+ * DEVLINK_ATTR_DPIPE_TABLE_ACTIONS nest; the nest is cancelled on failure.
+ */
+static int devlink_dpipe_actions_put(struct devlink_dpipe_table *table,
+				     struct sk_buff *skb)
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_TABLE_ACTIONS);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (table->table_ops->actions_dump(table->priv, skb)) {
+		nla_nest_cancel(skb, nest);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
+/* Emit one DEVLINK_ATTR_DPIPE_TABLE nest for @table: name, size, counter
+ * state, the resource binding if one was set, then matches and actions.
+ */
+static int devlink_dpipe_table_put(struct sk_buff *skb,
+				   struct devlink_dpipe_table *table)
+{
+	struct nlattr *nest;
+	u64 size;
+
+	size = table->table_ops->size_get(table->priv);
+	nest = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_TABLE);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_TABLE_NAME, table->name) ||
+	    devlink_nl_put_u64(skb, DEVLINK_ATTR_DPIPE_TABLE_SIZE, size) ||
+	    nla_put_u8(skb, DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED,
+		       table->counters_enabled))
+		goto cancel;
+
+	if (table->resource_valid &&
+	    (devlink_nl_put_u64(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID,
+				table->resource_id) ||
+	     devlink_nl_put_u64(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS,
+				table->resource_units)))
+		goto cancel;
+
+	if (devlink_dpipe_matches_put(table, skb) ||
+	    devlink_dpipe_actions_put(table, skb))
+		goto cancel;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+cancel:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+/* Send the message in *@pskb, if any, then replace it with a fresh one. */
+static int devlink_dpipe_send_and_alloc_skb(struct sk_buff **pskb,
+					    struct genl_info *info)
+{
+	struct sk_buff *pending = *pskb;
+
+	if (pending) {
+		int err = genlmsg_reply(pending, info);
+
+		if (err)
+			return err;
+	}
+	*pskb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	return *pskb ? 0 : -ENOMEM;
+}
+
+/* Reply with the dpipe tables in @dpipe_tables, or only the one named
+ * @table_name, as NLM_F_MULTI messages followed by NLMSG_DONE. A table that
+ * does not fit even into an otherwise empty message fails the request.
+ */
+static int devlink_dpipe_tables_fill(struct genl_info *info,
+				     enum devlink_command cmd, int flags,
+				     struct list_head *dpipe_tables, const char *table_name)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_dpipe_table *table;
+	struct nlattr *tables_attr;
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh;
+	bool incomplete;
+	void *hdr;
+	int i, err;
+
+	table = list_first_entry(dpipe_tables, struct devlink_dpipe_table, list);
+start_again:
+	err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+	if (err)
+		return err;
+
+	hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+			  &devlink_nl_family, NLM_F_MULTI, cmd);
+	if (!hdr) {
+		nlmsg_free(skb);
+		return -EMSGSIZE;
+	}
+
+	if (devlink_nl_put_handle(skb, devlink))
+		goto nla_put_failure;
+	tables_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_TABLES);
+	if (!tables_attr)
+		goto nla_put_failure;
+
+	i = 0;
+	incomplete = false;
+	list_for_each_entry_from(table, dpipe_tables, list) {
+		if (!table_name) {
+			err = devlink_dpipe_table_put(skb, table);
+			if (err) {
+				if (!i)
+					goto err_table_put;
+				/* Retry this table in the next message. */
+				incomplete = true;
+				break;
+			}
+		} else if (!strcmp(table->name, table_name)) {
+			err = devlink_dpipe_table_put(skb, table);
+			if (err)
+				goto err_table_put;
+		}
+		i++;
+	}
+
+	nla_nest_end(skb, tables_attr);
+	genlmsg_end(skb, hdr);
+	if (incomplete)
+		goto start_again;
+
+send_done:
+	nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+			NLMSG_DONE, 0, flags | NLM_F_MULTI);
+	if (!nlh) {
+		err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+		if (err)
+			return err;
+		goto send_done;
+	}
+
+	return genlmsg_reply(skb, info);
+
+nla_put_failure:
+	err = -EMSGSIZE;
+err_table_put:
+	nlmsg_free(skb);
+	return err;
+}
+
+/* DEVLINK_CMD_DPIPE_TABLE_GET: dump all tables, or only the named one. */
+int devlink_nl_dpipe_table_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *name_attr = info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME];
+	struct devlink *devlink = info->user_ptr[0];
+	const char *table_name = name_attr ? nla_data(name_attr) : NULL;
+
+	/* Without the attribute the NULL name selects every table. */
+	return devlink_dpipe_tables_fill(info, DEVLINK_CMD_DPIPE_TABLE_GET, 0,
+					 &devlink->dpipe_table_list,
+					 table_name);
+}
+
+/* Emit @value's data, plus its mask and mapping when they are present. */
+static int devlink_dpipe_value_put(struct sk_buff *skb,
+				   struct devlink_dpipe_value *value)
+{
+	if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE,
+		    value->value_size, value->value))
+		return -EMSGSIZE;
+	if (value->mask &&
+	    nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE_MASK, value->value_size, value->mask))
+		return -EMSGSIZE;
+	if (value->mapping_valid &&
+	    nla_put_u32(skb, DEVLINK_ATTR_DPIPE_VALUE_MAPPING,
+			value->mapping_value))
+		return -EMSGSIZE;
+	return 0;
+}
+
+/* Emit the action descriptor of @value followed by the value itself. */
+static int devlink_dpipe_action_value_put(struct sk_buff *skb,
+					  struct devlink_dpipe_value *value)
+{
+	if (!value->action)
+		return -EINVAL;
+	if (devlink_dpipe_action_put(skb, value->action) ||
+	    devlink_dpipe_value_put(skb, value))
+		return -EMSGSIZE;
+	return 0;
+}
+
+/* Emit each of the @values_count action values in its own
+ * DEVLINK_ATTR_DPIPE_ACTION_VALUE nest; a nest that fails is cancelled.
+ */
+static int devlink_dpipe_action_values_put(struct sk_buff *skb,
+					   struct devlink_dpipe_value *values,
+					   unsigned int values_count)
+{
+	struct nlattr *action_attr;
+	unsigned int i; /* same type as @values_count: no mixed-sign compare */
+	int err;
+
+	for (i = 0; i < values_count; i++) {
+		action_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_ACTION_VALUE);
+		if (!action_attr)
+			return -EMSGSIZE;
+		err = devlink_dpipe_action_value_put(skb, &values[i]);
+		if (err) {
+			nla_nest_cancel(skb, action_attr);
+			return err;
+		}
+		nla_nest_end(skb, action_attr);
+	}
+	return 0;
+}
+
+/* Emit the match descriptor of @value followed by the value itself. */
+static int devlink_dpipe_match_value_put(struct sk_buff *skb,
+					 struct devlink_dpipe_value *value)
+{
+	if (!value->match)
+		return -EINVAL;
+	if (devlink_dpipe_match_put(skb, value->match) ||
+	    devlink_dpipe_value_put(skb, value))
+		return -EMSGSIZE;
+	return 0;
+}
+
+/* Emit each of the @values_count match values in its own
+ * DEVLINK_ATTR_DPIPE_MATCH_VALUE nest; a nest that fails is cancelled.
+ */
+static int devlink_dpipe_match_values_put(struct sk_buff *skb,
+					  struct devlink_dpipe_value *values,
+					  unsigned int values_count)
+{
+	struct nlattr *match_attr;
+	unsigned int i; /* same type as @values_count: no mixed-sign compare */
+	int err;
+
+	for (i = 0; i < values_count; i++) {
+		match_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_MATCH_VALUE);
+		if (!match_attr)
+			return -EMSGSIZE;
+		err = devlink_dpipe_match_value_put(skb, &values[i]);
+		if (err) {
+			nla_nest_cancel(skb, match_attr);
+			return err;
+		}
+		nla_nest_end(skb, match_attr);
+	}
+	return 0;
+}
+
+/* Emit one DEVLINK_ATTR_DPIPE_ENTRY nest for @entry: index, optional
+ * counter, then the match values and the action values, each in a nest.
+ */
+static int devlink_dpipe_entry_put(struct sk_buff *skb,
+				   struct devlink_dpipe_entry *entry)
+{
+	struct nlattr *entry_nest, *values_nest;
+	int err = -EMSGSIZE;
+
+	entry_nest = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_ENTRY);
+	if (!entry_nest)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_u64(skb, DEVLINK_ATTR_DPIPE_ENTRY_INDEX, entry->index))
+		goto cancel_entry;
+	if (entry->counter_valid &&
+	    devlink_nl_put_u64(skb, DEVLINK_ATTR_DPIPE_ENTRY_COUNTER,
+			       entry->counter))
+		goto cancel_entry;
+
+	values_nest = nla_nest_start_noflag(skb,
+					    DEVLINK_ATTR_DPIPE_ENTRY_MATCH_VALUES);
+	if (!values_nest)
+		goto cancel_entry;
+
+	err = devlink_dpipe_match_values_put(skb, entry->match_values,
+					     entry->match_values_count);
+	if (err)
+		goto cancel_values;
+	nla_nest_end(skb, values_nest);
+
+	err = -EMSGSIZE;
+	values_nest = nla_nest_start_noflag(skb,
+					    DEVLINK_ATTR_DPIPE_ENTRY_ACTION_VALUES);
+	if (!values_nest)
+		goto cancel_entry;
+
+	err = devlink_dpipe_action_values_put(skb, entry->action_values,
+					      entry->action_values_count);
+	if (err)
+		goto cancel_values;
+	nla_nest_end(skb, values_nest);
+
+	nla_nest_end(skb, entry_nest);
+	return 0;
+
+cancel_values:
+	/* The error code of the values helper is reported unchanged. */
+	nla_nest_cancel(skb, values_nest);
+cancel_entry:
+	nla_nest_cancel(skb, entry_nest);
+	return err;
+}
+
+/* Look up a dpipe table by name; NULL when @dpipe_tables has no such table. */
+static struct devlink_dpipe_table *
+devlink_dpipe_table_find(struct list_head *dpipe_tables,
+			 const char *table_name, struct devlink *devlink)
+{
+	struct devlink_dpipe_table *pos;
+
+	list_for_each_entry_rcu(pos, dpipe_tables, list,
+				lockdep_is_held(&devlink->lock))
+		if (strcmp(pos->name, table_name) == 0)
+			return pos;
+	return NULL;
+}
+
+/* Send the message held in @dump_ctx, if any, and open a new one with a
+ * DEVLINK_ATTR_DPIPE_ENTRIES nest. On -EMSGSIZE the new message is freed and
+ * @dump_ctx->skb is cleared so that no pointer to freed memory is left behind.
+ */
+int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx)
+{
+	struct genl_info *info = dump_ctx->info;
+	struct devlink *devlink = info->user_ptr[0];
+	int err;
+
+	err = devlink_dpipe_send_and_alloc_skb(&dump_ctx->skb, info);
+	if (err)
+		return err;
+
+	dump_ctx->hdr = genlmsg_put(dump_ctx->skb, info->snd_portid,
+				    info->snd_seq, &devlink_nl_family,
+				    NLM_F_MULTI, dump_ctx->cmd);
+	if (!dump_ctx->hdr || devlink_nl_put_handle(dump_ctx->skb, devlink))
+		goto nla_put_failure;
+
+	dump_ctx->nest = nla_nest_start_noflag(dump_ctx->skb,
+					       DEVLINK_ATTR_DPIPE_ENTRIES);
+	if (!dump_ctx->nest)
+		goto nla_put_failure;
+	return 0;
+
+nla_put_failure:
+	nlmsg_free(dump_ctx->skb);
+	dump_ctx->skb = NULL;
+	return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_prepare);
+
+int devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx,
+ struct devlink_dpipe_entry *entry)
+{
+ return devlink_dpipe_entry_put(dump_ctx->skb, entry); /* adds @entry to the message currently held in @dump_ctx */
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_append);
+
+int devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx)
+{
+ nla_nest_end(dump_ctx->skb, dump_ctx->nest); /* closes the nest opened by devlink_dpipe_entry_ctx_prepare() */
+ genlmsg_end(dump_ctx->skb, dump_ctx->hdr); /* finalizes the message; it is not sent here */
+ return 0; /* always succeeds */
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_close);
+
+/* Free the value and mask buffers of every action and match value of @entry.
+ * The value arrays themselves and @entry are not freed here.
+ */
+void devlink_dpipe_entry_clear(struct devlink_dpipe_entry *entry)
+{
+	struct devlink_dpipe_value *values;
+	unsigned int i;
+
+	values = entry->action_values;
+	for (i = 0; i < entry->action_values_count; i++) {
+		kfree(values[i].value);
+		kfree(values[i].mask);
+	}
+
+	values = entry->match_values;
+	for (i = 0; i < entry->match_values_count; i++) {
+		kfree(values[i].value);
+		kfree(values[i].mask);
+	}
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_entry_clear);
+
+static int devlink_dpipe_entries_fill(struct genl_info *info,
+ enum devlink_command cmd, int flags,
+ struct devlink_dpipe_table *table)
+{
+ struct devlink_dpipe_dump_ctx dump_ctx;
+ struct nlmsghdr *nlh;
+ int err;
+
+ dump_ctx.skb = NULL; /* no message yet; devlink_dpipe_entry_ctx_prepare() allocates when it sees NULL */
+ dump_ctx.cmd = cmd;
+ dump_ctx.info = info;
+
+ err = table->table_ops->entries_dump(table->priv, /* the driver callback produces the entry messages */
+ table->counters_enabled,
+ &dump_ctx);
+ if (err)
+ return err; /* NOTE(review): dump_ctx.skb is not released here - confirm the driver cannot fail with a message pending */
+
+send_done:
+ nlh = nlmsg_put(dump_ctx.skb, info->snd_portid, info->snd_seq,
+ NLMSG_DONE, 0, flags | NLM_F_MULTI);
+ if (!nlh) { /* no room for NLMSG_DONE: send the current message and retry in a fresh one */
+ err = devlink_dpipe_send_and_alloc_skb(&dump_ctx.skb, info);
+ if (err)
+ return err;
+ goto send_done;
+ }
+ return genlmsg_reply(dump_ctx.skb, info);
+}
+
+/* DEVLINK_CMD_DPIPE_ENTRIES_GET: dump the entries of the named table. */
+int devlink_nl_dpipe_entries_get_doit(struct sk_buff *skb,
+				      struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_dpipe_table *table;
+	const char *name;
+
+	if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME))
+		return -EINVAL;
+
+	name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]);
+	table = devlink_dpipe_table_find(&devlink->dpipe_table_list, name,
+					 devlink);
+
+	/* Unknown table, or a table without an entries_dump callback. */
+	if (!table || !table->table_ops->entries_dump)
+		return -EINVAL;
+
+	return devlink_dpipe_entries_fill(info, DEVLINK_CMD_DPIPE_ENTRIES_GET,
+					  0, table);
+}
+
+/* Emit every field of @header in its own DEVLINK_ATTR_DPIPE_FIELD nest. */
+static int devlink_dpipe_fields_put(struct sk_buff *skb,
+				    const struct devlink_dpipe_header *header)
+{
+	struct devlink_dpipe_field *field;
+	struct nlattr *field_attr;
+	unsigned int i; /* array index, never negative: avoid a mixed-sign compare */
+
+	for (i = 0; i < header->fields_count; i++) {
+		field = &header->fields[i];
+		field_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_FIELD);
+		if (!field_attr)
+			return -EMSGSIZE;
+		if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_FIELD_NAME, field->name) ||
+		    nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) ||
+		    nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_BITWIDTH, field->bitwidth) ||
+		    nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_MAPPING_TYPE, field->mapping_type))
+			goto nla_put_failure;
+		nla_nest_end(skb, field_attr);
+	}
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, field_attr);
+	return -EMSGSIZE;
+}
+
+/* Emit one DEVLINK_ATTR_DPIPE_HEADER nest for @header: name, id, global
+ * flag and a nested list of its fields. Any failure yields -EMSGSIZE.
+ */
+static int devlink_dpipe_header_put(struct sk_buff *skb,
+				    struct devlink_dpipe_header *header)
+{
+	struct nlattr *hdr_nest, *fields_nest;
+
+	hdr_nest = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_HEADER);
+	if (!hdr_nest)
+		return -EMSGSIZE;
+
+	if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_HEADER_NAME, header->name) ||
+	    nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) ||
+	    nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global))
+		goto cancel_header;
+
+	fields_nest = nla_nest_start_noflag(skb,
+					    DEVLINK_ATTR_DPIPE_HEADER_FIELDS);
+	if (!fields_nest)
+		goto cancel_header;
+
+	if (devlink_dpipe_fields_put(skb, header)) {
+		nla_nest_cancel(skb, fields_nest);
+		goto cancel_header;
+	}
+	nla_nest_end(skb, fields_nest);
+	nla_nest_end(skb, hdr_nest);
+	return 0;
+
+cancel_header:
+	nla_nest_cancel(skb, hdr_nest);
+	return -EMSGSIZE;
+}
+
+static int devlink_dpipe_headers_fill(struct genl_info *info,
+ enum devlink_command cmd, int flags,
+ struct devlink_dpipe_headers *
+ dpipe_headers)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct nlattr *headers_attr;
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ void *hdr;
+ int i, j;
+ int err;
+
+ i = 0; /* index of the next header to emit; survives the start_again retries */
+start_again:
+ err = devlink_dpipe_send_and_alloc_skb(&skb, info); /* sends the previous message, if any, and allocates a new one */
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+ &devlink_nl_family, NLM_F_MULTI, cmd);
+ if (!hdr) {
+ nlmsg_free(skb);
+ return -EMSGSIZE;
+ }
+
+ if (devlink_nl_put_handle(skb, devlink))
+ goto nla_put_failure;
+ headers_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_DPIPE_HEADERS);
+ if (!headers_attr)
+ goto nla_put_failure;
+
+ j = 0; /* number of headers placed in the current message */
+ for (; i < dpipe_headers->headers_count; i++) {
+ err = devlink_dpipe_header_put(skb, dpipe_headers->headers[i]);
+ if (err) {
+ if (!j) /* not even the first header of this message fits: give up */
+ goto err_table_put;
+ break; /* header i is retried in the next message */
+ }
+ j++;
+ }
+ nla_nest_end(skb, headers_attr);
+ genlmsg_end(skb, hdr);
+ if (i != dpipe_headers->headers_count) /* some headers are still pending */
+ goto start_again;
+
+send_done:
+ nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+ NLMSG_DONE, 0, flags | NLM_F_MULTI);
+ if (!nlh) { /* no room for NLMSG_DONE: send the current message and retry in a fresh one */
+ err = devlink_dpipe_send_and_alloc_skb(&skb, info);
+ if (err)
+ return err;
+ goto send_done;
+ }
+ return genlmsg_reply(skb, info);
+
+nla_put_failure:
+ err = -EMSGSIZE;
+err_table_put:
+ nlmsg_free(skb); /* the unsent message is dropped on every error exit through these labels */
+ return err;
+}
+
+/* DEVLINK_CMD_DPIPE_HEADERS_GET: dump the registered dpipe headers. */
+int devlink_nl_dpipe_headers_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_dpipe_headers *headers = devlink->dpipe_headers;
+
+	if (!headers)
+		return -EOPNOTSUPP;
+	return devlink_dpipe_headers_fill(info, DEVLINK_CMD_DPIPE_HEADERS_GET, 0, headers);
+}
+
+/* Switch the counters of the named table on or off; the driver callback
+ * runs only when the state actually changes.
+ */
+static int devlink_dpipe_table_counters_set(struct devlink *devlink,
+					    const char *table_name,
+					    bool enable)
+{
+	struct devlink_dpipe_table *table;
+
+	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+					 table_name, devlink);
+	if (!table)
+		return -EINVAL;
+	if (table->counter_control_extern)
+		return -EOPNOTSUPP;
+	if (table->counters_enabled != enable) {
+		table->counters_enabled = enable;
+		if (table->table_ops->counters_set_update)
+			table->table_ops->counters_set_update(table->priv, enable);
+	}
+	return 0;
+}
+
+/* DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET: needs table name and enable flag. */
+int devlink_nl_dpipe_table_counters_set_doit(struct sk_buff *skb,
+					     struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct nlattr **attrs = info->attrs;
+	u8 enable;
+
+	if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_NAME) ||
+	    GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED))
+		return -EINVAL;
+
+	/* Any non-zero value enables the counters. */
+	enable = nla_get_u8(attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]);
+	return devlink_dpipe_table_counters_set(devlink,
+						nla_data(attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]),
+						enable != 0);
+}
+
+/**
+ * devl_dpipe_headers_register - register dpipe headers
+ * @devlink: devlink
+ * @dpipe_headers: dpipe header array; the pointer is stored, not copied
+ *
+ * Register the headers supported by hardware. The devlink instance lock
+ * must be held by the caller.
+ */
+void devl_dpipe_headers_register(struct devlink *devlink,
+ struct devlink_dpipe_headers *dpipe_headers)
+{
+ lockdep_assert_held(&devlink->lock);
+
+ devlink->dpipe_headers = dpipe_headers;
+}
+EXPORT_SYMBOL_GPL(devl_dpipe_headers_register);
+
+/**
+ * devl_dpipe_headers_unregister - unregister dpipe headers
+ * @devlink: devlink
+ *
+ * Unregister the headers supported by hardware by clearing the stored
+ * pointer. The devlink instance lock must be held by the caller.
+ */
+void devl_dpipe_headers_unregister(struct devlink *devlink)
+{
+ lockdep_assert_held(&devlink->lock);
+
+ devlink->dpipe_headers = NULL;
+}
+EXPORT_SYMBOL_GPL(devl_dpipe_headers_unregister);
+
+/**
+ * devlink_dpipe_table_counter_enabled - check if counter allocation is required
+ * @devlink: devlink
+ * @table_name: table's name
+ *
+ * Used by the driver to check whether counter allocation is required.
+ * Once counter allocation is turned on, the table entries are updated
+ * to include counter statistics.
+ *
+ * From that point on the driver must respect the counter state, so that
+ * each entry added to the table is added with a counter.
+ *
+ * Return: the table's counter state, or false if no such table exists.
+ */
+bool devlink_dpipe_table_counter_enabled(struct devlink *devlink,
+					 const char *table_name)
+{
+	struct devlink_dpipe_table *table;
+	bool enabled;
+
+	/* The table is looked up and read inside one RCU read section. */
+	rcu_read_lock();
+	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+					 table_name, devlink);
+	enabled = table ? table->counters_enabled : false;
+	rcu_read_unlock();
+
+	return enabled;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_table_counter_enabled);
+
+/**
+ * devl_dpipe_table_register - register dpipe table
+ * @devlink: devlink
+ * @table_name: table name; stored by reference, not copied
+ * @table_ops: table ops; size_get is mandatory
+ * @priv: priv
+ * @counter_control_extern: external control for counters
+ *
+ * Return: 0, -EINVAL without size_get, -EEXIST for a duplicate name, -ENOMEM.
+ */
+int devl_dpipe_table_register(struct devlink *devlink,
+			      const char *table_name,
+			      const struct devlink_dpipe_table_ops *table_ops,
+			      void *priv, bool counter_control_extern)
+{
+	struct list_head *tables = &devlink->dpipe_table_list;
+	struct devlink_dpipe_table *new_table;
+
+	lockdep_assert_held(&devlink->lock);
+
+	if (WARN_ON(!table_ops->size_get))
+		return -EINVAL;
+	if (devlink_dpipe_table_find(tables, table_name, devlink))
+		return -EEXIST;
+
+	new_table = kzalloc(sizeof(*new_table), GFP_KERNEL);
+	if (!new_table)
+		return -ENOMEM;
+
+	new_table->name = table_name;
+	new_table->table_ops = table_ops;
+	new_table->priv = priv;
+	new_table->counter_control_extern = counter_control_extern;
+
+	/* Added to the list only after every field has been filled in. */
+	list_add_tail_rcu(&new_table->list, tables);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devl_dpipe_table_register);
+
+/**
+ * devl_dpipe_table_unregister - unregister dpipe table
+ * @devlink: devlink
+ * @table_name: table name; an unknown name is silently ignored
+ *
+ * The table is unlinked with list_del_rcu() and freed with kfree_rcu().
+ */
+void devl_dpipe_table_unregister(struct devlink *devlink, const char *table_name)
+{
+	struct devlink_dpipe_table *victim;
+
+	lockdep_assert_held(&devlink->lock);
+
+	victim = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+					  table_name, devlink);
+	if (victim) {
+		list_del_rcu(&victim->list);
+		kfree_rcu(victim, rcu);
+	}
+}
+EXPORT_SYMBOL_GPL(devl_dpipe_table_unregister);
+
+/**
+ * devl_dpipe_table_resource_set - set the resource id
+ * @devlink: devlink
+ * @table_name: table name
+ * @resource_id: resource id
+ * @resource_units: number of resource's units consumed per table's entry
+ *
+ * Return: 0 on success, -EINVAL when no table is named @table_name.
+ */
+int devl_dpipe_table_resource_set(struct devlink *devlink,
+				  const char *table_name, u64 resource_id,
+				  u64 resource_units)
+{
+	struct list_head *tables = &devlink->dpipe_table_list;
+	struct devlink_dpipe_table *table;
+
+	table = devlink_dpipe_table_find(tables, table_name, devlink);
+	if (!table)
+		return -EINVAL;
+	table->resource_id = resource_id;
+	table->resource_units = resource_units;
+	table->resource_valid = true;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devl_dpipe_table_resource_set);
diff --git a/net/devlink/health.c b/net/devlink/health.c
new file mode 100644
index 000000000000..136a67c36a20
--- /dev/null
+++ b/net/devlink/health.c
@@ -0,0 +1,1350 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include <net/genetlink.h>
+#include <net/sock.h>
+#include <trace/events/devlink.h>
+#include "devl_internal.h"
+/* One queued element of a formatted message, linked on devlink_fmsg.item_list. */
+struct devlink_fmsg_item {
+ struct list_head list; /* linkage on devlink_fmsg.item_list */
+ int attrtype; /* DEVLINK_ATTR_FMSG_* attribute this item is emitted as */
+ u8 nla_type; /* DEVLINK_VAR_ATTR_TYPE_* of the payload; left zero for nest markers */
+ u16 len; /* number of payload bytes stored in value[] */
+ int value[]; /* payload bytes (name string or value data) copied in with memcpy() */
+};
+/* Formatted message: a list of devlink_fmsg_item entries plus sticky error state. */
+struct devlink_fmsg {
+ struct list_head item_list; /* items in the order they were appended (list_add_tail) */
+ int err; /* first error encountered on some devlink_fmsg_XXX() call */
+ bool putting_binary; /* This flag forces enclosing of binary data
+ * in an array brackets. It forces using
+ * of designated API:
+ * devlink_fmsg_binary_pair_nest_start()
+ * devlink_fmsg_binary_pair_nest_end()
+ */
+};
+
+/*
+ * Allocate a zero-filled formatted message whose item list is empty.
+ * Returns NULL when the allocation fails.
+ */
+static struct devlink_fmsg *devlink_fmsg_alloc(void)
+{
+	struct devlink_fmsg *msg = kzalloc(sizeof(*msg), GFP_KERNEL);
+
+	if (msg)
+		INIT_LIST_HEAD(&msg->item_list);
+
+	return msg;
+}
+
+/* Release every item queued on @fmsg and then @fmsg itself. */
+static void devlink_fmsg_free(struct devlink_fmsg *fmsg)
+{
+	struct devlink_fmsg_item *head;
+
+	/* Pop from the front until nothing is left on the list. */
+	while (!list_empty(&fmsg->item_list)) {
+		head = list_first_entry(&fmsg->item_list,
+					struct devlink_fmsg_item, list);
+		list_del(&head->list);
+		kfree(head);
+	}
+
+	kfree(fmsg);
+}
+/* State of one health reporter, attached either to a devlink instance or to one of its ports. */
+struct devlink_health_reporter {
+ struct list_head list; /* entry on devlink->reporter_list or port->reporter_list */
+ void *priv; /* driver pointer, returned by devlink_health_reporter_priv() */
+ const struct devlink_health_reporter_ops *ops; /* driver callbacks, name and defaults */
+ struct devlink *devlink; /* owning devlink instance */
+ struct devlink_port *devlink_port; /* set only for port reporters, otherwise NULL */
+ struct devlink_fmsg *dump_fmsg; /* stored dump, NULL when no dump is held */
+ u64 graceful_period; /* fed to msecs_to_jiffies() when deciding whether to abort recovery */
+ u64 burst_period; /* fed to msecs_to_jiffies() to size the burst window */
+ bool auto_recover; /* run ops->recover from devlink_health_report() */
+ bool auto_dump; /* take a dump from devlink_health_report() */
+ u8 health_state; /* DEVLINK_HEALTH_REPORTER_STATE_* value */
+ u64 dump_ts; /* jiffies when the stored dump was taken */
+ u64 dump_real_ts; /* ktime_get_real_ns() when the stored dump was taken */
+ u64 error_count; /* incremented by every devlink_health_report() call */
+ u64 recovery_count; /* incremented by devlink_health_reporter_recovery_done() */
+ u64 last_recovery_ts; /* jiffies of the recovery that opened the current burst window */
+};
+/* Return the driver private pointer that was supplied when @reporter was created. */
+void *
+devlink_health_reporter_priv(struct devlink_health_reporter *reporter)
+{
+ return reporter->priv;
+}
+EXPORT_SYMBOL_GPL(devlink_health_reporter_priv);
+
+/*
+ * Walk @reporter_list and return the first reporter whose ops->name equals
+ * @reporter_name, or NULL when none matches.
+ */
+static struct devlink_health_reporter *
+__devlink_health_reporter_find_by_name(struct list_head *reporter_list,
+				       const char *reporter_name)
+{
+	struct devlink_health_reporter *pos;
+
+	list_for_each_entry(pos, reporter_list, list) {
+		if (strcmp(pos->ops->name, reporter_name) == 0)
+			return pos;
+	}
+
+	return NULL;
+}
+/* Look up @reporter_name among the reporters on @devlink's own reporter_list; NULL if absent. */
+static struct devlink_health_reporter *
+devlink_health_reporter_find_by_name(struct devlink *devlink,
+ const char *reporter_name)
+{
+ return __devlink_health_reporter_find_by_name(&devlink->reporter_list,
+ reporter_name);
+}
+/* Look up @reporter_name among the reporters on @devlink_port's reporter_list; NULL if absent. */
+static struct devlink_health_reporter *
+devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port,
+ const char *reporter_name)
+{
+ return __devlink_health_reporter_find_by_name(&devlink_port->reporter_list,
+ reporter_name);
+}
+
+/*
+ * Allocate a reporter for @devlink and seed it from the defaults in @ops.
+ *
+ * The ops are rejected (with a WARN) when they carry a default graceful
+ * period but no recover callback, or a default burst period but no default
+ * graceful period. The new reporter is not linked on any list here.
+ *
+ * Returns the reporter, ERR_PTR(-EINVAL) for rejected ops, or
+ * ERR_PTR(-ENOMEM) when the allocation fails.
+ */
+static struct devlink_health_reporter *
+__devlink_health_reporter_create(struct devlink *devlink,
+				 const struct devlink_health_reporter_ops *ops,
+				 void *priv)
+{
+	struct devlink_health_reporter *rep;
+
+	if (WARN_ON(!ops->recover && ops->default_graceful_period))
+		return ERR_PTR(-EINVAL);
+
+	if (WARN_ON(!ops->default_graceful_period && ops->default_burst_period))
+		return ERR_PTR(-EINVAL);
+
+	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
+	if (!rep)
+		return ERR_PTR(-ENOMEM);
+
+	rep->devlink = devlink;
+	rep->ops = ops;
+	rep->priv = priv;
+
+	/* Periods start at the driver defaults. */
+	rep->graceful_period = ops->default_graceful_period;
+	rep->burst_period = ops->default_burst_period;
+
+	/* Automatic actions are on exactly when the callback exists. */
+	rep->auto_recover = !!ops->recover;
+	rep->auto_dump = !!ops->dump;
+
+	return rep;
+}
+
+/**
+ * devl_port_health_reporter_create() - create a health reporter bound to
+ *					 a devlink port
+ *
+ * @port: devlink_port to which health reports will relate
+ * @ops: devlink health reporter ops
+ * @priv: driver priv pointer
+ *
+ * Asserts that the devlink instance lock of @port is held.
+ *
+ * Return: the new reporter, already linked on the port's reporter list;
+ * ERR_PTR(-EEXIST) when the port already has a reporter named ops->name;
+ * otherwise the error pointer from __devlink_health_reporter_create().
+ */
+struct devlink_health_reporter *
+devl_port_health_reporter_create(struct devlink_port *port,
+				 const struct devlink_health_reporter_ops *ops,
+				 void *priv)
+{
+	struct devlink_health_reporter *rep;
+
+	devl_assert_locked(port->devlink);
+
+	rep = __devlink_health_reporter_find_by_name(&port->reporter_list,
+						     ops->name);
+	if (rep)
+		return ERR_PTR(-EEXIST);
+
+	rep = __devlink_health_reporter_create(port->devlink, ops, priv);
+	if (!IS_ERR(rep)) {
+		rep->devlink_port = port;
+		list_add_tail(&rep->list, &port->reporter_list);
+	}
+
+	return rep;
+}
+EXPORT_SYMBOL_GPL(devl_port_health_reporter_create);
+/* Same as devl_port_health_reporter_create(), but takes and drops the devlink instance lock itself. */
+struct devlink_health_reporter *
+devlink_port_health_reporter_create(struct devlink_port *port,
+ const struct devlink_health_reporter_ops *ops,
+ void *priv)
+{
+ struct devlink_health_reporter *reporter;
+ struct devlink *devlink = port->devlink;
+
+ devl_lock(devlink);
+ reporter = devl_port_health_reporter_create(port, ops, priv);
+ devl_unlock(devlink);
+ return reporter; /* may be an ERR_PTR, passed through unchanged */
+}
+EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create);
+
+/**
+ * devl_health_reporter_create - create a health reporter bound to a
+ *				  devlink instance
+ *
+ * @devlink: devlink instance which the health reports will relate
+ * @ops: devlink health reporter ops
+ * @priv: driver priv pointer
+ *
+ * Asserts that the devlink instance lock is held.
+ *
+ * Return: the new reporter, already linked on the instance's reporter
+ * list; ERR_PTR(-EEXIST) when a reporter named ops->name already exists
+ * on the instance; otherwise the error pointer from
+ * __devlink_health_reporter_create().
+ */
+struct devlink_health_reporter *
+devl_health_reporter_create(struct devlink *devlink,
+			    const struct devlink_health_reporter_ops *ops,
+			    void *priv)
+{
+	struct devlink_health_reporter *rep;
+
+	devl_assert_locked(devlink);
+
+	rep = devlink_health_reporter_find_by_name(devlink, ops->name);
+	if (rep)
+		return ERR_PTR(-EEXIST);
+
+	rep = __devlink_health_reporter_create(devlink, ops, priv);
+	if (!IS_ERR(rep))
+		list_add_tail(&rep->list, &devlink->reporter_list);
+
+	return rep;
+}
+EXPORT_SYMBOL_GPL(devl_health_reporter_create);
+/* Same as devl_health_reporter_create(), but takes and drops the devlink instance lock itself. */
+struct devlink_health_reporter *
+devlink_health_reporter_create(struct devlink *devlink,
+ const struct devlink_health_reporter_ops *ops,
+ void *priv)
+{
+ struct devlink_health_reporter *reporter;
+
+ devl_lock(devlink);
+ reporter = devl_health_reporter_create(devlink, ops, priv);
+ devl_unlock(devlink);
+ return reporter; /* may be an ERR_PTR, passed through unchanged */
+}
+EXPORT_SYMBOL_GPL(devlink_health_reporter_create);
+
+/* Free @reporter together with its stored dump, if it holds one. */
+static void
+devlink_health_reporter_free(struct devlink_health_reporter *reporter)
+{
+	struct devlink_fmsg *stored = reporter->dump_fmsg;
+
+	if (stored)
+		devlink_fmsg_free(stored);
+
+	kfree(reporter);
+}
+
+/**
+ * devl_health_reporter_destroy() - destroy devlink health reporter
+ *
+ * @reporter: devlink health reporter to destroy
+ *
+ * Asserts that the devlink instance lock is held, unlinks @reporter from
+ * the list it is on and frees it (including any stored dump).
+ */
+void
+devl_health_reporter_destroy(struct devlink_health_reporter *reporter)
+{
+ devl_assert_locked(reporter->devlink);
+
+ list_del(&reporter->list);
+ devlink_health_reporter_free(reporter);
+}
+EXPORT_SYMBOL_GPL(devl_health_reporter_destroy);
+/* Same as devl_health_reporter_destroy(), but takes and drops the devlink instance lock itself. */
+void
+devlink_health_reporter_destroy(struct devlink_health_reporter *reporter)
+{
+ struct devlink *devlink = reporter->devlink; /* read before @reporter is freed */
+
+ devl_lock(devlink);
+ devl_health_reporter_destroy(reporter);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy);
+/*
+ * Fill @msg with one generic netlink message for command @cmd describing
+ * @reporter: the devlink handle, the port index for port reporters, and a
+ * DEVLINK_ATTR_HEALTH_REPORTER nest with the reporter's name, state,
+ * counters and settings.
+ *
+ * Returns 0 on success. Any failure to add the header or an attribute
+ * cancels what was written and returns -EMSGSIZE.
+ */
+static int
+devlink_nl_health_reporter_fill(struct sk_buff *msg,
+ struct devlink_health_reporter *reporter,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ struct devlink *devlink = reporter->devlink;
+ struct nlattr *reporter_attr;
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto genlmsg_cancel;
+
+ if (reporter->devlink_port) { /* only port reporters carry a port index */
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, reporter->devlink_port->index))
+ goto genlmsg_cancel;
+ }
+ reporter_attr = nla_nest_start_noflag(msg,
+ DEVLINK_ATTR_HEALTH_REPORTER);
+ if (!reporter_attr)
+ goto genlmsg_cancel;
+ if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+ reporter->ops->name))
+ goto reporter_nest_cancel;
+ if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE,
+ reporter->health_state))
+ goto reporter_nest_cancel;
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT,
+ reporter->error_count))
+ goto reporter_nest_cancel;
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT,
+ reporter->recovery_count))
+ goto reporter_nest_cancel;
+ if (reporter->ops->recover && /* recovery settings only when a recover op exists */
+ devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD,
+ reporter->graceful_period))
+ goto reporter_nest_cancel;
+ if (reporter->ops->recover &&
+ devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD,
+ reporter->burst_period))
+ goto reporter_nest_cancel;
+ if (reporter->ops->recover &&
+ nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
+ reporter->auto_recover))
+ goto reporter_nest_cancel;
+ if (reporter->dump_fmsg && /* dump timestamps only while a dump is stored */
+ devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS,
+ jiffies_to_msecs(reporter->dump_ts)))
+ goto reporter_nest_cancel;
+ if (reporter->dump_fmsg &&
+ devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS_NS,
+ reporter->dump_real_ts))
+ goto reporter_nest_cancel;
+ if (reporter->ops->dump && /* auto-dump setting only when a dump op exists */
+ nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP,
+ reporter->auto_dump))
+ goto reporter_nest_cancel;
+
+ nla_nest_end(msg, reporter_attr);
+ genlmsg_end(msg, hdr);
+ return 0;
+
+reporter_nest_cancel:
+ nla_nest_cancel(msg, reporter_attr);
+genlmsg_cancel:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/*
+ * Resolve the reporter named by DEVLINK_ATTR_HEALTH_REPORTER_NAME in @attrs.
+ *
+ * When devlink_port_get_from_attrs() yields a port, the name is looked up
+ * among that port's reporters; when it yields an error pointer, the lookup
+ * falls back to the reporters of @devlink itself. Returns NULL when the
+ * name attribute is missing or no reporter matches.
+ */
+static struct devlink_health_reporter *
+devlink_health_reporter_get_from_attrs(struct devlink *devlink,
+				       struct nlattr **attrs)
+{
+	struct nlattr *name_attr = attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME];
+	struct devlink_port *port;
+	char *name;
+
+	if (!name_attr)
+		return NULL;
+
+	name = nla_data(name_attr);
+	port = devlink_port_get_from_attrs(devlink, attrs);
+	if (!IS_ERR(port))
+		return devlink_port_health_reporter_find_by_name(port, name);
+
+	return devlink_health_reporter_find_by_name(devlink, name);
+}
+/* Resolve the reporter addressed by the attributes of a genetlink request; NULL if none. */
+static struct devlink_health_reporter *
+devlink_health_reporter_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ return devlink_health_reporter_get_from_attrs(devlink, info->attrs);
+}
+
+/*
+ * Netlink doit handler: reply with a single
+ * DEVLINK_CMD_HEALTH_REPORTER_GET message describing the reporter
+ * addressed by the request.
+ *
+ * Returns -EINVAL when no reporter matches, -ENOMEM when the reply cannot
+ * be allocated, the fill error when the message cannot be built, and
+ * otherwise the result of genlmsg_reply().
+ */
+int devlink_nl_health_reporter_get_doit(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_health_reporter *reporter;
+	struct sk_buff *reply;
+	int rc;
+
+	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	if (!reporter)
+		return -EINVAL;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	rc = devlink_nl_health_reporter_fill(reply, reporter,
+					     DEVLINK_CMD_HEALTH_REPORTER_GET,
+					     info->snd_portid, info->snd_seq,
+					     0);
+	if (!rc)
+		return genlmsg_reply(reply, info);
+
+	/* The reply was never handed off, so drop it here. */
+	nlmsg_free(reply);
+	return rc;
+}
+/*
+ * Dump callback for one devlink instance: emit a
+ * DEVLINK_CMD_HEALTH_REPORTER_GET message for each device-level reporter
+ * and then for each reporter of each port.
+ *
+ * idx counts reporters across both loops; entries below state->idx are
+ * skipped. When a fill fails, the failing position is saved in state->idx
+ * and the fill error is returned. Returns 0 once every reporter has been
+ * emitted.
+ */
+static int devlink_nl_health_reporter_get_dump_one(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct netlink_callback *cb,
+ int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ const struct genl_info *info = genl_info_dump(cb);
+ struct devlink_health_reporter *reporter;
+ unsigned long port_index_end = ULONG_MAX;
+ struct nlattr **attrs = info->attrs;
+ unsigned long port_index_start = 0;
+ struct devlink_port *port;
+ unsigned long port_index;
+ int idx = 0;
+ int err;
+
+ if (attrs && attrs[DEVLINK_ATTR_PORT_INDEX]) { /* filter: one port only, no device-level reporters */
+ port_index_start = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+ port_index_end = port_index_start;
+ flags |= NLM_F_DUMP_FILTERED;
+ goto per_port_dump;
+ }
+
+ list_for_each_entry(reporter, &devlink->reporter_list, list) {
+ if (idx < state->idx) { /* below the saved resume position */
+ idx++;
+ continue;
+ }
+ err = devlink_nl_health_reporter_fill(msg, reporter,
+ DEVLINK_CMD_HEALTH_REPORTER_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ flags);
+ if (err) {
+ state->idx = idx; /* resume at the reporter that did not fit */
+ return err;
+ }
+ idx++;
+ }
+per_port_dump:
+ xa_for_each_range(&devlink->ports, port_index, port,
+ port_index_start, port_index_end) {
+ list_for_each_entry(reporter, &port->reporter_list, list) {
+ if (idx < state->idx) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_health_reporter_fill(msg, reporter,
+ DEVLINK_CMD_HEALTH_REPORTER_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ flags);
+ if (err) {
+ state->idx = idx;
+ return err;
+ }
+ idx++;
+ }
+ }
+
+ return 0;
+}
+/* Netlink dumpit handler: hand the per-instance reporter dump callback to devlink_nl_dumpit(). */
+int devlink_nl_health_reporter_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb,
+ devlink_nl_health_reporter_get_dump_one);
+}
+
+/*
+ * Netlink doit handler: update the settings of the reporter addressed by
+ * the request from whichever of the graceful period, burst period,
+ * auto-recover and auto-dump attributes are present.
+ *
+ * A graceful period of zero also clears the burst period, and a non-zero
+ * burst period is refused while the (new) graceful period is zero.
+ *
+ * The period values are staged in locals and written to the reporter only
+ * after all checks have passed, so a request that fails with -EINVAL
+ * leaves the reporter exactly as it was.
+ *
+ * Returns 0 on success, -EINVAL when no reporter matches or the burst
+ * period is refused, -EOPNOTSUPP when a recovery attribute is given for a
+ * reporter without a recover op or the auto-dump attribute is given for a
+ * reporter without a dump op.
+ */
+int devlink_nl_health_reporter_set_doit(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_health_reporter *reporter;
+	u64 graceful_period;
+	u64 burst_period;
+
+	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	if (!reporter)
+		return -EINVAL;
+
+	if (!reporter->ops->recover &&
+	    (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] ||
+	     info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] ||
+	     info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]))
+		return -EOPNOTSUPP;
+
+	if (!reporter->ops->dump &&
+	    info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
+		return -EOPNOTSUPP;
+
+	/* Start from the current values; attributes override them below. */
+	graceful_period = reporter->graceful_period;
+	burst_period = reporter->burst_period;
+
+	if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) {
+		graceful_period =
+			nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]);
+		/* A burst window is meaningless without a grace period. */
+		if (!graceful_period)
+			burst_period = 0;
+	}
+
+	if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]) {
+		burst_period =
+			nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]);
+
+		if (!graceful_period && burst_period) {
+			NL_SET_ERR_MSG_MOD(info->extack,
+					   "Cannot set burst period without a grace period.");
+			return -EINVAL;
+		}
+	}
+
+	/* Nothing can fail past this point: commit the staged values. */
+	reporter->graceful_period = graceful_period;
+	reporter->burst_period = burst_period;
+
+	if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])
+		reporter->auto_recover =
+			nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]);
+
+	if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
+		reporter->auto_dump =
+			nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]);
+
+	return 0;
+}
+
+/*
+ * Send a notification describing @reporter.
+ *
+ * Warns when @cmd is not DEVLINK_CMD_HEALTH_REPORTER_RECOVER. Nothing is
+ * sent when devlink_nl_notify_need() says no, when the message cannot be
+ * allocated, or when it cannot be filled. For port reporters the object
+ * descriptor also carries the port.
+ */
+static void devlink_recover_notify(struct devlink_health_reporter *reporter,
+				   enum devlink_command cmd)
+{
+	struct devlink *devlink = reporter->devlink;
+	struct devlink_obj_desc desc;
+	struct sk_buff *note;
+
+	WARN_ON(cmd != DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
+	ASSERT_DEVLINK_REGISTERED(devlink);
+
+	if (!devlink_nl_notify_need(devlink))
+		return;
+
+	note = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!note)
+		return;
+
+	if (devlink_nl_health_reporter_fill(note, reporter, cmd, 0, 0, 0)) {
+		nlmsg_free(note);
+		return;
+	}
+
+	devlink_nl_obj_desc_init(&desc, devlink);
+	if (reporter->devlink_port)
+		devlink_nl_obj_desc_port_set(&desc, reporter->devlink_port);
+	devlink_nl_notify_send_desc(devlink, note, &desc);
+}
+
+/*
+ * True while jiffies is still before last_recovery_ts plus the burst
+ * period converted with msecs_to_jiffies().
+ */
+static bool
+devlink_health_reporter_in_burst(struct devlink_health_reporter *reporter)
+{
+	unsigned long window = msecs_to_jiffies(reporter->burst_period);
+	unsigned long burst_end = reporter->last_recovery_ts + window;
+
+	return time_is_after_jiffies(burst_end);
+}
+
+/*
+ * Account for a completed recovery: bump recovery_count and, unless the
+ * reporter is still inside its burst window, restart the window at the
+ * current jiffies.
+ */
+void
+devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter)
+{
+	reporter->recovery_count++;
+
+	/* With a burst period set, last_recovery_ts records the first
+	 * recovery of the burst, so later recoveries inside the same
+	 * window must not move it.
+	 */
+	if (devlink_health_reporter_in_burst(reporter))
+		return;
+
+	reporter->last_recovery_ts = jiffies;
+}
+EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done);
+
+/*
+ * Run the reporter's recover op unless the reporter is already healthy.
+ *
+ * On success the recovery is accounted for, the state becomes
+ * DEVLINK_HEALTH_REPORTER_STATE_HEALTHY and a notification is sent.
+ *
+ * Returns 0 when already healthy or recovered, -EOPNOTSUPP when there is
+ * no recover op, otherwise the error returned by the op.
+ */
+static int
+devlink_health_reporter_recover(struct devlink_health_reporter *reporter,
+				void *priv_ctx, struct netlink_ext_ack *extack)
+{
+	int rc;
+
+	if (reporter->health_state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY)
+		return 0;
+
+	if (!reporter->ops->recover)
+		return -EOPNOTSUPP;
+
+	rc = reporter->ops->recover(reporter, priv_ctx, extack);
+	if (!rc) {
+		devlink_health_reporter_recovery_done(reporter);
+		reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY;
+		devlink_recover_notify(reporter,
+				       DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
+	}
+
+	return rc;
+}
+
+/* Drop the stored dump of @reporter, if any, and reset the pointer to NULL. */
+static void
+devlink_health_dump_clear(struct devlink_health_reporter *reporter)
+{
+	if (reporter->dump_fmsg) {
+		devlink_fmsg_free(reporter->dump_fmsg);
+		reporter->dump_fmsg = NULL;
+	}
+}
+/*
+ * Take a dump through ops->dump and store it in reporter->dump_fmsg.
+ *
+ * Returns 0 without doing anything when the reporter has no dump op or
+ * already holds a dump. Returns -ENOMEM when the fmsg cannot be allocated.
+ * When the dump op fails, or the fmsg recorded an error, the partial dump
+ * is discarded and that error is returned. On success the jiffies and
+ * wall-clock timestamps of the dump are recorded.
+ */
+static int devlink_health_do_dump(struct devlink_health_reporter *reporter,
+ void *priv_ctx,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (!reporter->ops->dump)
+ return 0;
+
+ if (reporter->dump_fmsg) /* keep the dump that is already stored */
+ return 0;
+
+ reporter->dump_fmsg = devlink_fmsg_alloc();
+ if (!reporter->dump_fmsg)
+ return -ENOMEM;
+
+ devlink_fmsg_obj_nest_start(reporter->dump_fmsg);
+
+ err = reporter->ops->dump(reporter, reporter->dump_fmsg,
+ priv_ctx, extack);
+ if (err)
+ goto dump_err;
+
+ devlink_fmsg_obj_nest_end(reporter->dump_fmsg);
+ err = reporter->dump_fmsg->err; /* fmsg helpers report failures only through this field */
+ if (err)
+ goto dump_err;
+
+ reporter->dump_ts = jiffies;
+ reporter->dump_real_ts = ktime_get_real_ns();
+
+ return 0;
+
+dump_err:
+ devlink_health_dump_clear(reporter);
+ return err;
+}
+
+/*
+ * Decide whether the handling of a newly reported error must be aborted.
+ *
+ * Never aborts when auto-recover is off. Aborts when the previous state
+ * was not healthy. Does not abort while the reporter is inside its burst
+ * window. Otherwise aborts when a recovery has been recorded and jiffies
+ * is still before last_recovery_ts + burst period + graceful period.
+ */
+static bool
+devlink_health_recover_abort(struct devlink_health_reporter *reporter,
+			     enum devlink_health_reporter_state prev_state)
+{
+	unsigned long grace_end;
+
+	if (!reporter->auto_recover)
+		return false;
+
+	/* the previous error was never recovered */
+	if (prev_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY)
+		return true;
+
+	if (devlink_health_reporter_in_burst(reporter))
+		return false;
+
+	grace_end = reporter->last_recovery_ts +
+		    msecs_to_jiffies(reporter->burst_period) +
+		    msecs_to_jiffies(reporter->graceful_period);
+
+	return reporter->last_recovery_ts && reporter->recovery_count &&
+	       time_is_after_jiffies(grace_end);
+}
+/*
+ * Report an error on @reporter: trace it, bump error_count, move the
+ * reporter to the error state and send a notification.
+ *
+ * Returns -ECANCELED when devlink_health_recover_abort() says to abort.
+ * Otherwise a dump is taken if auto_dump is set (its result is ignored),
+ * and, if auto_recover is set, recovery is attempted and its result is
+ * returned. Returns 0 when auto_recover is off.
+ */
+int devlink_health_report(struct devlink_health_reporter *reporter,
+ const char *msg, void *priv_ctx)
+{
+ enum devlink_health_reporter_state prev_health_state;
+ struct devlink *devlink = reporter->devlink;
+ int ret;
+
+ /* write a log message of the current error */
+ WARN_ON(!msg);
+ trace_devlink_health_report(devlink, reporter->ops->name, msg);
+ reporter->error_count++;
+ prev_health_state = reporter->health_state; /* saved before it is overwritten below */
+ reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
+ devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
+
+ if (devlink_health_recover_abort(reporter, prev_health_state)) {
+ trace_devlink_health_recover_aborted(devlink,
+ reporter->ops->name,
+ reporter->health_state,
+ jiffies -
+ reporter->last_recovery_ts);
+ return -ECANCELED;
+ }
+
+ if (reporter->auto_dump) {
+ devl_lock(devlink);
+ /* store current dump of current error, for later analysis */
+ devlink_health_do_dump(reporter, priv_ctx, NULL);
+ devl_unlock(devlink);
+ }
+
+ if (!reporter->auto_recover)
+ return 0;
+
+ devl_lock(devlink);
+ ret = devlink_health_reporter_recover(reporter, priv_ctx, NULL);
+ devl_unlock(devlink);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(devlink_health_report);
+
+/*
+ * Set the health state of @reporter to @state.
+ *
+ * Warns and returns when @state is neither the healthy nor the error
+ * state. Does nothing when the state is unchanged; otherwise stores it,
+ * emits the state-update trace event and sends a notification.
+ */
+void
+devlink_health_reporter_state_update(struct devlink_health_reporter *reporter,
+				     enum devlink_health_reporter_state state)
+{
+	bool known = state == DEVLINK_HEALTH_REPORTER_STATE_HEALTHY ||
+		     state == DEVLINK_HEALTH_REPORTER_STATE_ERROR;
+
+	if (WARN_ON(!known))
+		return;
+
+	if (state == reporter->health_state)
+		return;
+
+	reporter->health_state = state;
+	trace_devlink_health_reporter_state_update(reporter->devlink,
+						   reporter->ops->name, state);
+	devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
+}
+EXPORT_SYMBOL_GPL(devlink_health_reporter_state_update);
+/* Netlink doit handler: run recovery on the addressed reporter; -EINVAL when no reporter matches. */
+int devlink_nl_health_reporter_recover_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_health_reporter *reporter;
+
+ reporter = devlink_health_reporter_get_from_info(devlink, info);
+ if (!reporter)
+ return -EINVAL;
+
+ return devlink_health_reporter_recover(reporter, NULL, info->extack); /* no driver context for user requests */
+}
+
+/*
+ * Record -EINVAL on @fmsg when it is in binary-putting mode, unless an
+ * earlier error is already recorded.
+ */
+static void devlink_fmsg_err_if_binary(struct devlink_fmsg *fmsg)
+{
+	if (fmsg->err)
+		return;
+
+	if (fmsg->putting_binary)
+		fmsg->err = -EINVAL;
+}
+
+/*
+ * Append a payload-less marker item of type @attrtype to @fmsg.
+ *
+ * Does nothing when @fmsg already carries an error; records -ENOMEM on
+ * @fmsg when the item cannot be allocated.
+ */
+static void devlink_fmsg_nest_common(struct devlink_fmsg *fmsg, int attrtype)
+{
+	struct devlink_fmsg_item *marker;
+
+	if (fmsg->err)
+		return;
+
+	marker = kzalloc(sizeof(*marker), GFP_KERNEL);
+	if (marker) {
+		marker->attrtype = attrtype;
+		list_add_tail(&marker->list, &fmsg->item_list);
+	} else {
+		fmsg->err = -ENOMEM;
+	}
+}
+/* Append an object-nest-start marker to @fmsg; records -EINVAL instead if binary data is being put. */
+void devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg)
+{
+ devlink_fmsg_err_if_binary(fmsg);
+ devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start);
+/* Append a nest-end marker to @fmsg; records -EINVAL instead if binary data is being put. */
+static void devlink_fmsg_nest_end(struct devlink_fmsg *fmsg)
+{
+ devlink_fmsg_err_if_binary(fmsg);
+ devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END);
+}
+/* Close an object nest; object, pair and array nests all end with the same nest-end marker. */
+void devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg)
+{
+ devlink_fmsg_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end);
+/* Largest name or value payload a single fmsg item may carry; larger ones are refused with -EMSGSIZE. */
+#define DEVLINK_FMSG_MAX_SIZE (GENLMSG_DEFAULT_SIZE - GENL_HDRLEN - NLA_HDRLEN)
+
+/*
+ * Append a name item holding a copy of @name (terminator included).
+ *
+ * Does nothing when @fmsg already carries an error, which includes being
+ * in binary-putting mode. Records -EMSGSIZE when the name with its
+ * terminator exceeds DEVLINK_FMSG_MAX_SIZE, and -ENOMEM when the item
+ * cannot be allocated.
+ */
+static void devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name)
+{
+	struct devlink_fmsg_item *entry;
+	size_t name_size;
+
+	devlink_fmsg_err_if_binary(fmsg);
+	if (fmsg->err)
+		return;
+
+	/* Measure once: string length plus the NUL terminator. */
+	name_size = strlen(name) + 1;
+	if (name_size > DEVLINK_FMSG_MAX_SIZE) {
+		fmsg->err = -EMSGSIZE;
+		return;
+	}
+
+	entry = kzalloc(sizeof(*entry) + name_size, GFP_KERNEL);
+	if (!entry) {
+		fmsg->err = -ENOMEM;
+		return;
+	}
+
+	entry->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME;
+	entry->nla_type = DEVLINK_VAR_ATTR_TYPE_NUL_STRING;
+	entry->len = name_size;
+	memcpy(&entry->value, name, entry->len);
+	list_add_tail(&entry->list, &fmsg->item_list);
+}
+/* Open a name/value pair on @fmsg: a pair-nest-start marker followed by a name item for @name. */
+void devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name)
+{
+ devlink_fmsg_err_if_binary(fmsg);
+ devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START);
+ devlink_fmsg_put_name(fmsg, name);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start);
+/* Close a name/value pair by appending a nest-end marker. */
+void devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg)
+{
+ devlink_fmsg_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end);
+/* Open a pair named @name whose value is an array: pair start, name, then an array-nest-start marker. */
+void devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg,
+ const char *name)
+{
+ devlink_fmsg_pair_nest_start(fmsg, name);
+ devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_ARR_NEST_START);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_start);
+/* Close an array-valued pair: one nest-end for the array, one for the enclosing pair. */
+void devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg)
+{
+ devlink_fmsg_nest_end(fmsg); /* closes the array nest */
+ devlink_fmsg_nest_end(fmsg); /* closes the pair nest */
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end);
+/* Open an array-valued pair named @name and switch @fmsg into binary-putting mode. */
+void devlink_fmsg_binary_pair_nest_start(struct devlink_fmsg *fmsg,
+ const char *name)
+{
+ devlink_fmsg_arr_pair_nest_start(fmsg, name);
+ fmsg->putting_binary = true; /* set after the markers, which are refused in binary mode */
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_start);
+/* Leave binary-putting mode and close the array-valued pair; records -EINVAL if not in binary mode. */
+void devlink_fmsg_binary_pair_nest_end(struct devlink_fmsg *fmsg)
+{
+ if (fmsg->err) /* an earlier error leaves putting_binary untouched here */
+ return;
+
+ if (!fmsg->putting_binary)
+ fmsg->err = -EINVAL;
+
+ fmsg->putting_binary = false; /* cleared first: the nest-end helpers refuse to run in binary mode */
+ devlink_fmsg_arr_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_nest_end);
+
+/*
+ * Append a value item holding a copy of @value_len bytes from @value,
+ * tagged with @value_nla_type.
+ *
+ * Does nothing when @fmsg already carries an error. Records -EMSGSIZE
+ * when @value_len exceeds DEVLINK_FMSG_MAX_SIZE and -ENOMEM when the item
+ * cannot be allocated.
+ */
+static void devlink_fmsg_put_value(struct devlink_fmsg *fmsg,
+				   const void *value, u16 value_len,
+				   u8 value_nla_type)
+{
+	struct devlink_fmsg_item *entry;
+
+	if (fmsg->err)
+		return;
+
+	if (value_len > DEVLINK_FMSG_MAX_SIZE) {
+		fmsg->err = -EMSGSIZE;
+		return;
+	}
+
+	entry = kzalloc(sizeof(*entry) + value_len, GFP_KERNEL);
+	if (!entry) {
+		fmsg->err = -ENOMEM;
+		return;
+	}
+
+	entry->attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA;
+	entry->nla_type = value_nla_type;
+	entry->len = value_len;
+	memcpy(&entry->value, value, entry->len);
+	list_add_tail(&entry->list, &fmsg->item_list);
+}
+/* Append a bool value item (type DEVLINK_VAR_ATTR_TYPE_FLAG); refused with -EINVAL in binary mode. */
+static void devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value)
+{
+ devlink_fmsg_err_if_binary(fmsg);
+ devlink_fmsg_put_value(fmsg, &value, sizeof(value),
+ DEVLINK_VAR_ATTR_TYPE_FLAG);
+}
+/* Append a u8 value item; refused with -EINVAL in binary mode. */
+static void devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value)
+{
+ devlink_fmsg_err_if_binary(fmsg);
+ devlink_fmsg_put_value(fmsg, &value, sizeof(value),
+ DEVLINK_VAR_ATTR_TYPE_U8);
+}
+/* Append a u32 value item; refused with -EINVAL in binary mode. */
+void devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value)
+{
+ devlink_fmsg_err_if_binary(fmsg);
+ devlink_fmsg_put_value(fmsg, &value, sizeof(value),
+ DEVLINK_VAR_ATTR_TYPE_U32);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put);
+/* Append a u64 value item; refused with -EINVAL in binary mode. */
+static void devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value)
+{
+ devlink_fmsg_err_if_binary(fmsg);
+ devlink_fmsg_put_value(fmsg, &value, sizeof(value),
+ DEVLINK_VAR_ATTR_TYPE_U64);
+}
+
+/*
+ * Append a NUL-terminated string value item holding a copy of @value.
+ *
+ * Refused with -EINVAL while @fmsg is in binary-putting mode. Records
+ * -EMSGSIZE when the string with its terminator exceeds
+ * DEVLINK_FMSG_MAX_SIZE.
+ *
+ * The size check is done here on the full size_t length because
+ * devlink_fmsg_put_value() takes a u16 length: a very long string would
+ * otherwise wrap to a small value, pass the callee's size check and be
+ * stored truncated without its terminator.
+ */
+void devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value)
+{
+	size_t value_size = strlen(value) + 1;
+
+	devlink_fmsg_err_if_binary(fmsg);
+	if (fmsg->err)
+		return;
+
+	if (value_size > DEVLINK_FMSG_MAX_SIZE) {
+		fmsg->err = -EMSGSIZE;
+		return;
+	}
+
+	devlink_fmsg_put_value(fmsg, value, value_size,
+			       DEVLINK_VAR_ATTR_TYPE_NUL_STRING);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_string_put);
+/* Append one chunk of binary data; records -EINVAL unless @fmsg is in binary-putting mode. */
+void devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value,
+ u16 value_len)
+{
+ if (!fmsg->err && !fmsg->putting_binary) /* inverse of the check the typed put helpers make */
+ fmsg->err = -EINVAL;
+
+ devlink_fmsg_put_value(fmsg, value, value_len,
+ DEVLINK_VAR_ATTR_TYPE_BINARY);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put);
+/* Append a complete pair to @fmsg: pair start with @name, the bool @value, pair end. */
+void devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ bool value)
+{
+ devlink_fmsg_pair_nest_start(fmsg, name);
+ devlink_fmsg_bool_put(fmsg, value);
+ devlink_fmsg_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_bool_pair_put);
+/* Append a complete pair to @fmsg: pair start with @name, the u8 @value, pair end. */
+void devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ u8 value)
+{
+ devlink_fmsg_pair_nest_start(fmsg, name);
+ devlink_fmsg_u8_put(fmsg, value);
+ devlink_fmsg_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_u8_pair_put);
+/* Append a complete pair to @fmsg: pair start with @name, the u32 @value, pair end. */
+void devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ u32 value)
+{
+ devlink_fmsg_pair_nest_start(fmsg, name);
+ devlink_fmsg_u32_put(fmsg, value);
+ devlink_fmsg_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_u32_pair_put);
+/* Append a complete pair to @fmsg: pair start with @name, the u64 @value, pair end. */
+void devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ u64 value)
+{
+ devlink_fmsg_pair_nest_start(fmsg, name);
+ devlink_fmsg_u64_put(fmsg, value);
+ devlink_fmsg_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_u64_pair_put);
+/* Append a complete pair to @fmsg: pair start with @name, the string @value, pair end. */
+void devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name,
+ const char *value)
+{
+ devlink_fmsg_pair_nest_start(fmsg, name);
+ devlink_fmsg_string_put(fmsg, value);
+ devlink_fmsg_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put);
+
+/*
+ * Append a complete binary pair named @name to @fmsg.
+ *
+ * The @value_len bytes at @value are split into chunks of at most
+ * DEVLINK_FMSG_MAX_SIZE bytes, each appended as its own binary item
+ * between the binary pair start and end markers.
+ */
+void devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name,
+				  const void *value, u32 value_len)
+{
+	u32 done = 0;
+
+	devlink_fmsg_binary_pair_nest_start(fmsg, name);
+
+	while (done < value_len) {
+		u32 chunk = value_len - done;
+
+		if (chunk > DEVLINK_FMSG_MAX_SIZE)
+			chunk = DEVLINK_FMSG_MAX_SIZE;
+
+		devlink_fmsg_binary_put(fmsg, value + done, chunk);
+		done += chunk;
+	}
+
+	devlink_fmsg_binary_pair_nest_end(fmsg);
+	/* nest_end returns early on error, so clear the mode explicitly */
+	fmsg->putting_binary = false;
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put);
+/*
+ * Put the payload of value item @msg into @skb as
+ * DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA, using the netlink put helper that
+ * matches msg->nla_type. Returns the helper's result, or -EINVAL for an
+ * unknown nla_type.
+ */
+static int
+devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb)
+{
+ int attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA;
+ u8 tmp;
+
+ switch (msg->nla_type) {
+ case DEVLINK_VAR_ATTR_TYPE_FLAG:
+ /* Always provide flag data, regardless of its value */
+ tmp = *(bool *)msg->value;
+
+ return nla_put_u8(skb, attrtype, tmp);
+ case DEVLINK_VAR_ATTR_TYPE_U8:
+ return nla_put_u8(skb, attrtype, *(u8 *)msg->value);
+ case DEVLINK_VAR_ATTR_TYPE_U32:
+ return nla_put_u32(skb, attrtype, *(u32 *)msg->value);
+ case DEVLINK_VAR_ATTR_TYPE_U64:
+ return devlink_nl_put_u64(skb, attrtype, *(u64 *)msg->value);
+ case DEVLINK_VAR_ATTR_TYPE_NUL_STRING:
+ return nla_put_string(skb, attrtype, (char *)&msg->value);
+ case DEVLINK_VAR_ATTR_TYPE_BINARY:
+ return nla_put(skb, attrtype, msg->len, (void *)&msg->value);
+ default:
+ return -EINVAL;
+ }
+}
+/*
+ * Copy items of @fmsg into a DEVLINK_ATTR_FMSG nest in @skb, starting at
+ * item index *@start.
+ *
+ * *@start is advanced past every item that was added, so a later call can
+ * continue where this one stopped. Returns -EMSGSIZE when the nest cannot
+ * be opened, 0 when the end of the list was reached, or the error of the
+ * first item that could not be added (-EINVAL for an unknown attrtype).
+ * The nest is closed in all cases once it has been opened.
+ */
+static int
+devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb,
+ int *start)
+{
+ struct devlink_fmsg_item *item;
+ struct nlattr *fmsg_nlattr;
+ int err = 0;
+ int i = 0;
+
+ fmsg_nlattr = nla_nest_start_noflag(skb, DEVLINK_ATTR_FMSG);
+ if (!fmsg_nlattr)
+ return -EMSGSIZE;
+
+ list_for_each_entry(item, &fmsg->item_list, list) {
+ if (i < *start) { /* already sent by an earlier call */
+ i++;
+ continue;
+ }
+
+ switch (item->attrtype) {
+ case DEVLINK_ATTR_FMSG_OBJ_NEST_START:
+ case DEVLINK_ATTR_FMSG_PAIR_NEST_START:
+ case DEVLINK_ATTR_FMSG_ARR_NEST_START:
+ case DEVLINK_ATTR_FMSG_NEST_END:
+ err = nla_put_flag(skb, item->attrtype);
+ break;
+ case DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA:
+ err = nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE,
+ item->nla_type);
+ if (err)
+ break;
+ err = devlink_fmsg_item_fill_data(item, skb);
+ break;
+ case DEVLINK_ATTR_FMSG_OBJ_NAME:
+ err = nla_put_string(skb, item->attrtype,
+ (char *)&item->value);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+ if (!err)
+ *start = ++i; /* this item is in the skb; resume after it */
+ else
+ break;
+ }
+
+ nla_nest_end(skb, fmsg_nlattr);
+ return err;
+}
+/*
+ * Send @fmsg to the requester as a series of NLM_F_MULTI replies for
+ * command @cmd, followed by an NLMSG_DONE message.
+ *
+ * Returns fmsg->err when the fmsg already carries an error, -ENOMEM when
+ * an skb cannot be allocated, -EMSGSIZE when a header cannot be added or
+ * an item does not fit even into an empty message, any other error from
+ * filling or replying, and otherwise the result of the final
+ * genlmsg_reply().
+ */
+static int devlink_fmsg_snd(struct devlink_fmsg *fmsg,
+ struct genl_info *info,
+ enum devlink_command cmd, int flags)
+{
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ bool last = false;
+ int index = 0;
+ void *hdr;
+ int err;
+
+ if (fmsg->err)
+ return fmsg->err;
+
+ while (!last) {
+ int tmp_index = index; /* position before this message, to detect lack of progress */
+
+ skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+ &devlink_nl_family, flags | NLM_F_MULTI, cmd);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto nla_put_failure;
+ }
+
+ err = devlink_fmsg_prepare_skb(fmsg, skb, &index);
+ if (!err)
+ last = true;
+ else if (err != -EMSGSIZE || tmp_index == index) /* -EMSGSIZE with progress just means "message full" */
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ err = genlmsg_reply(skb, info);
+ if (err)
+ return err;
+ }
+
+ skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+ nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+ NLMSG_DONE, 0, flags | NLM_F_MULTI);
+ if (!nlh) {
+ err = -EMSGSIZE;
+ goto nla_put_failure;
+ }
+
+ return genlmsg_reply(skb, info);
+
+nla_put_failure:
+ nlmsg_free(skb);
+ return err;
+}
+/*
+ * Add the next part of @fmsg to dump skb @skb as one message for @cmd,
+ * starting at the item index saved in the dump state.
+ *
+ * When at least one item was added, the new index is saved and skb->len is
+ * returned. Otherwise the message is cancelled and err is returned; err is
+ * 0 when no items were left to add.
+ */
+static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ enum devlink_command cmd)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ int index = state->idx;
+ int tmp_index = index;
+ void *hdr;
+ int err;
+
+ if (fmsg->err)
+ return fmsg->err;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto nla_put_failure;
+ }
+
+ err = devlink_fmsg_prepare_skb(fmsg, skb, &index);
+ if ((err && err != -EMSGSIZE) || tmp_index == index) /* hard error, or nothing was added */
+ goto nla_put_failure;
+
+ state->idx = index;
+ genlmsg_end(skb, hdr);
+ return skb->len;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return err;
+}
+
+/*
+ * Netlink doit handler: run the reporter's diagnose op into a temporary
+ * fmsg and send the result back to the requester.
+ *
+ * Returns -EINVAL when no reporter matches, -EOPNOTSUPP when the reporter
+ * has no diagnose op, -ENOMEM when the fmsg cannot be allocated, the
+ * error of the diagnose op when it fails, and otherwise the result of
+ * devlink_fmsg_snd(). The fmsg is freed on every path that allocated it.
+ */
+int devlink_nl_health_reporter_diagnose_doit(struct sk_buff *skb,
+					     struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_health_reporter *reporter;
+	struct devlink_fmsg *diag;
+	int rc;
+
+	reporter = devlink_health_reporter_get_from_info(devlink, info);
+	if (!reporter)
+		return -EINVAL;
+
+	if (!reporter->ops->diagnose)
+		return -EOPNOTSUPP;
+
+	diag = devlink_fmsg_alloc();
+	if (!diag)
+		return -ENOMEM;
+
+	devlink_fmsg_obj_nest_start(diag);
+
+	rc = reporter->ops->diagnose(reporter, diag, info->extack);
+	if (!rc) {
+		devlink_fmsg_obj_nest_end(diag);
+		rc = devlink_fmsg_snd(diag, info,
+				      DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0);
+	}
+
+	devlink_fmsg_free(diag);
+	return rc;
+}
+
+/*
+ * Resolve the reporter addressed by the attributes of dump request @cb.
+ *
+ * On success the reporter is returned and the caller is responsible for
+ * calling devl_unlock() and devlink_put() on reporter->devlink. When the
+ * devlink instance cannot be obtained, or no reporter matches, NULL is
+ * returned and nothing is left for the caller to release.
+ */
+static struct devlink_health_reporter *
+devlink_health_reporter_get_from_cb_lock(struct netlink_callback *cb)
+{
+	const struct genl_info *info = genl_info_dump(cb);
+	struct nlattr **attrs = info->attrs;
+	struct devlink_health_reporter *found;
+	struct devlink *devlink;
+
+	devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs,
+					      false);
+	if (IS_ERR(devlink))
+		return NULL;
+
+	found = devlink_health_reporter_get_from_attrs(devlink, attrs);
+	if (found)
+		return found;
+
+	devl_unlock(devlink);
+	devlink_put(devlink);
+	return NULL;
+}
+/*
+ * Netlink dumpit handler that streams the stored dump of the addressed
+ * reporter, one part per call.
+ *
+ * On the first call (state->idx == 0) a dump is taken if none is stored,
+ * and its dump_ts is remembered in the dump state. On any call, -EAGAIN is
+ * returned when no dump is stored or its dump_ts differs from the
+ * remembered one. Returns -EINVAL when no reporter matches and -EOPNOTSUPP
+ * when the reporter has no dump op. The devlink lock and reference taken
+ * by the lookup are released on every path.
+ */
+int devlink_nl_health_reporter_dump_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_health_reporter *reporter;
+ struct devlink *devlink;
+ int err;
+
+ reporter = devlink_health_reporter_get_from_cb_lock(cb);
+ if (!reporter)
+ return -EINVAL;
+
+ devlink = reporter->devlink;
+ if (!reporter->ops->dump) {
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ return -EOPNOTSUPP;
+ }
+
+ if (!state->idx) { /* first part of this dump request */
+ err = devlink_health_do_dump(reporter, NULL, cb->extack);
+ if (err)
+ goto unlock;
+ state->dump_ts = reporter->dump_ts;
+ }
+ if (!reporter->dump_fmsg || state->dump_ts != reporter->dump_ts) { /* dump was cleared or replaced between parts */
+ NL_SET_ERR_MSG(cb->extack, "Dump trampled, please retry");
+ err = -EAGAIN;
+ goto unlock;
+ }
+
+ err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb,
+ DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET);
+unlock:
+ devl_unlock(devlink);
+ devlink_put(devlink);
+ return err;
+}
+
+int devlink_nl_health_reporter_dump_clear_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_health_reporter *reporter;
+
+ reporter = devlink_health_reporter_get_from_info(devlink, info);
+ if (!reporter)
+ return -EINVAL;
+
+ if (!reporter->ops->dump)
+ return -EOPNOTSUPP;
+
+ devlink_health_dump_clear(reporter);
+ return 0;
+}
+
+int devlink_nl_health_reporter_test_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_health_reporter *reporter;
+
+ reporter = devlink_health_reporter_get_from_info(devlink, info);
+ if (!reporter)
+ return -EINVAL;
+
+ if (!reporter->ops->test)
+ return -EOPNOTSUPP;
+
+ return reporter->ops->test(reporter, info->extack);
+}
+
+/**
+ * devlink_fmsg_dump_skb - Dump sk_buff structure
+ * @fmsg: devlink formatted message pointer
+ * @skb: pointer to skb
+ *
+ * Dump diagnostic information about an sk_buff: lengths, tailroom, header
+ * offsets, checksum/hash state, the owning socket (if any) and shared info.
+ */
+void devlink_fmsg_dump_skb(struct devlink_fmsg *fmsg, const struct sk_buff *skb)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	struct sock *sk = skb->sk;
+	bool has_mac, has_trans;
+
+	has_mac = skb_mac_header_was_set(skb);
+	has_trans = skb_transport_header_was_set(skb);
+
+	devlink_fmsg_pair_nest_start(fmsg, "skb");
+	devlink_fmsg_obj_nest_start(fmsg);
+	devlink_fmsg_put(fmsg, "actual len", skb->len);
+	devlink_fmsg_put(fmsg, "head len", skb_headlen(skb));
+	devlink_fmsg_put(fmsg, "data len", skb->data_len);
+	devlink_fmsg_put(fmsg, "tail len", skb_tailroom(skb));
+	devlink_fmsg_put(fmsg, "MAC", has_mac ? skb->mac_header : -1);
+	devlink_fmsg_put(fmsg, "MAC len",
+			 has_mac ? skb_mac_header_len(skb) : -1);
+	devlink_fmsg_put(fmsg, "network hdr", skb->network_header);
+	devlink_fmsg_put(fmsg, "network hdr len",
+			 has_trans ? skb_network_header_len(skb) : -1);
+	devlink_fmsg_put(fmsg, "transport hdr",
+			 has_trans ? skb->transport_header : -1);
+	devlink_fmsg_put(fmsg, "csum", (__force u32)skb->csum);
+	devlink_fmsg_put(fmsg, "csum_ip_summed", (u8)skb->ip_summed);
+	devlink_fmsg_put(fmsg, "csum_complete_sw", !!skb->csum_complete_sw);
+	devlink_fmsg_put(fmsg, "csum_valid", !!skb->csum_valid);
+	devlink_fmsg_put(fmsg, "csum_level", (u8)skb->csum_level);
+	devlink_fmsg_put(fmsg, "sw_hash", !!skb->sw_hash);
+	devlink_fmsg_put(fmsg, "l4_hash", !!skb->l4_hash);
+	devlink_fmsg_put(fmsg, "proto", ntohs(skb->protocol));
+	devlink_fmsg_put(fmsg, "pkt_type", (u8)skb->pkt_type);
+	devlink_fmsg_put(fmsg, "iif", skb->skb_iif);
+
+	if (sk) {
+		devlink_fmsg_pair_nest_start(fmsg, "sk");
+		devlink_fmsg_obj_nest_start(fmsg);
+		devlink_fmsg_put(fmsg, "family", sk->sk_family);
+		devlink_fmsg_put(fmsg, "type", sk->sk_type);
+		devlink_fmsg_put(fmsg, "proto", sk->sk_protocol);
+		devlink_fmsg_obj_nest_end(fmsg);
+		devlink_fmsg_pair_nest_end(fmsg);
+	}
+
+	devlink_fmsg_obj_nest_end(fmsg);
+	devlink_fmsg_pair_nest_end(fmsg);
+
+	devlink_fmsg_pair_nest_start(fmsg, "shinfo");
+	devlink_fmsg_obj_nest_start(fmsg);
+	devlink_fmsg_put(fmsg, "tx_flags", sh->tx_flags);
+	devlink_fmsg_put(fmsg, "nr_frags", sh->nr_frags);
+	devlink_fmsg_put(fmsg, "gso_size", sh->gso_size);
+	devlink_fmsg_put(fmsg, "gso_type", sh->gso_type);
+	devlink_fmsg_put(fmsg, "gso_segs", sh->gso_segs);
+	devlink_fmsg_obj_nest_end(fmsg);
+	devlink_fmsg_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_dump_skb);
diff --git a/net/devlink/linecard.c b/net/devlink/linecard.c
new file mode 100644
index 000000000000..67f70a621d27
--- /dev/null
+++ b/net/devlink/linecard.c
@@ -0,0 +1,628 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include "devl_internal.h"
+
+struct devlink_linecard {
+ struct list_head list;
+ struct devlink *devlink;
+ unsigned int index;
+ const struct devlink_linecard_ops *ops;
+ void *priv;
+ enum devlink_linecard_state state;
+ struct mutex state_lock; /* Protects state */
+ const char *type;
+ struct devlink_linecard_type *types;
+ unsigned int types_count;
+ u32 rel_index;
+};
+
+unsigned int devlink_linecard_index(struct devlink_linecard *linecard)
+{
+ return linecard->index;
+}
+
+static struct devlink_linecard *
+devlink_linecard_get_by_index(struct devlink *devlink,
+ unsigned int linecard_index)
+{
+ struct devlink_linecard *devlink_linecard;
+
+ list_for_each_entry(devlink_linecard, &devlink->linecard_list, list) {
+ if (devlink_linecard->index == linecard_index)
+ return devlink_linecard;
+ }
+ return NULL;
+}
+
+static bool devlink_linecard_index_exists(struct devlink *devlink,
+ unsigned int linecard_index)
+{
+ return devlink_linecard_get_by_index(devlink, linecard_index);
+}
+
+static struct devlink_linecard *
+devlink_linecard_get_from_attrs(struct devlink *devlink, struct nlattr **attrs)
+{
+	struct nlattr *index_attr = attrs[DEVLINK_ATTR_LINECARD_INDEX];
+	struct devlink_linecard *linecard;
+
+	/* Without an index attribute there is nothing to look up. */
+	if (!index_attr)
+		return ERR_PTR(-EINVAL);
+
+	linecard = devlink_linecard_get_by_index(devlink,
+						 nla_get_u32(index_attr));
+	return linecard ? linecard : ERR_PTR(-ENODEV);
+}
+
+static struct devlink_linecard *
+devlink_linecard_get_from_info(struct devlink *devlink, struct genl_info *info)
+{
+ return devlink_linecard_get_from_attrs(devlink, info->attrs);
+}
+
+struct devlink_linecard_type {
+ const char *type;
+ const void *priv;
+};
+
+static int devlink_nl_linecard_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct devlink_linecard *linecard,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags,
+ struct netlink_ext_ack *extack)
+{
+ struct devlink_linecard_type *linecard_type;
+ struct nlattr *attr;
+ void *hdr;
+ int i;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX, linecard->index))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_LINECARD_STATE, linecard->state))
+ goto nla_put_failure;
+ if (linecard->type &&
+ nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE, linecard->type))
+ goto nla_put_failure;
+
+ if (linecard->types_count) {
+ attr = nla_nest_start(msg,
+ DEVLINK_ATTR_LINECARD_SUPPORTED_TYPES);
+ if (!attr)
+ goto nla_put_failure;
+ for (i = 0; i < linecard->types_count; i++) {
+ linecard_type = &linecard->types[i];
+ if (nla_put_string(msg, DEVLINK_ATTR_LINECARD_TYPE,
+ linecard_type->type)) {
+ nla_nest_cancel(msg, attr);
+ goto nla_put_failure;
+ }
+ }
+ nla_nest_end(msg, attr);
+ }
+
+ if (devlink_rel_devlink_handle_put(msg, devlink,
+ linecard->rel_index,
+ DEVLINK_ATTR_NESTED_DEVLINK,
+ NULL))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_linecard_notify(struct devlink_linecard *linecard,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = linecard->devlink;
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_LINECARD_NEW &&
+ cmd != DEVLINK_CMD_LINECARD_DEL);
+
+ if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_linecard_fill(msg, devlink, linecard, cmd, 0, 0, 0,
+ NULL);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ devlink_nl_notify_send(devlink, msg);
+}
+
+void devlink_linecards_notify_register(struct devlink *devlink)
+{
+ struct devlink_linecard *linecard;
+
+ list_for_each_entry(linecard, &devlink->linecard_list, list)
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+}
+
+void devlink_linecards_notify_unregister(struct devlink *devlink)
+{
+ struct devlink_linecard *linecard;
+
+ list_for_each_entry_reverse(linecard, &devlink->linecard_list, list)
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL);
+}
+
+int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_linecard *linecard;
+ struct sk_buff *msg;
+ int err;
+
+ linecard = devlink_linecard_get_from_info(devlink, info);
+ if (IS_ERR(linecard))
+ return PTR_ERR(linecard);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ mutex_lock(&linecard->state_lock);
+ err = devlink_nl_linecard_fill(msg, devlink, linecard,
+ DEVLINK_CMD_LINECARD_NEW,
+ info->snd_portid, info->snd_seq, 0,
+ info->extack);
+ mutex_unlock(&linecard->state_lock);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_linecard_get_dump_one(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct netlink_callback *cb,
+ int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_linecard *linecard;
+ int idx = 0;
+ int err = 0;
+
+ list_for_each_entry(linecard, &devlink->linecard_list, list) {
+ if (idx < state->idx) {
+ idx++;
+ continue;
+ }
+ mutex_lock(&linecard->state_lock);
+ err = devlink_nl_linecard_fill(msg, devlink, linecard,
+ DEVLINK_CMD_LINECARD_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags,
+ cb->extack);
+ mutex_unlock(&linecard->state_lock);
+ if (err) {
+ state->idx = idx;
+ break;
+ }
+ idx++;
+ }
+
+ return err;
+}
+
+int devlink_nl_linecard_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_linecard_get_dump_one);
+}
+
+static struct devlink_linecard_type *
+devlink_linecard_type_lookup(struct devlink_linecard *linecard,
+ const char *type)
+{
+ struct devlink_linecard_type *linecard_type;
+ int i;
+
+ for (i = 0; i < linecard->types_count; i++) {
+ linecard_type = &linecard->types[i];
+ if (!strcmp(type, linecard_type->type))
+ return linecard_type;
+ }
+ return NULL;
+}
+
+static int devlink_linecard_type_set(struct devlink_linecard *linecard,
+ const char *type,
+ struct netlink_ext_ack *extack)
+{
+ const struct devlink_linecard_ops *ops = linecard->ops;
+ struct devlink_linecard_type *linecard_type;
+ int err;
+
+ mutex_lock(&linecard->state_lock);
+ if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) {
+ NL_SET_ERR_MSG(extack, "Line card is currently being provisioned");
+ err = -EBUSY;
+ goto out;
+ }
+ if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) {
+ NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned");
+ err = -EBUSY;
+ goto out;
+ }
+
+ linecard_type = devlink_linecard_type_lookup(linecard, type);
+ if (!linecard_type) {
+ NL_SET_ERR_MSG(extack, "Unsupported line card type provided");
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (linecard->state != DEVLINK_LINECARD_STATE_UNPROVISIONED &&
+ linecard->state != DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) {
+ NL_SET_ERR_MSG(extack, "Line card already provisioned");
+ err = -EBUSY;
+ /* Check if the line card is provisioned in the same
+ * way the user asks. In case it is, make the operation
+ * to return success.
+ */
+ if (ops->same_provision &&
+ ops->same_provision(linecard, linecard->priv,
+ linecard_type->type,
+ linecard_type->priv))
+ err = 0;
+ goto out;
+ }
+
+ linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING;
+ linecard->type = linecard_type->type;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+ err = ops->provision(linecard, linecard->priv, linecard_type->type,
+ linecard_type->priv, extack);
+ if (err) {
+ /* Provisioning failed. Assume the linecard is unprovisioned
+ * for future operations.
+ */
+ mutex_lock(&linecard->state_lock);
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
+ linecard->type = NULL;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+ }
+ return err;
+
+out:
+ mutex_unlock(&linecard->state_lock);
+ return err;
+}
+
+static int devlink_linecard_type_unset(struct devlink_linecard *linecard,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ mutex_lock(&linecard->state_lock);
+ if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING) {
+ NL_SET_ERR_MSG(extack, "Line card is currently being provisioned");
+ err = -EBUSY;
+ goto out;
+ }
+ if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONING) {
+ NL_SET_ERR_MSG(extack, "Line card is currently being unprovisioned");
+ err = -EBUSY;
+ goto out;
+ }
+ if (linecard->state == DEVLINK_LINECARD_STATE_PROVISIONING_FAILED) {
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
+ linecard->type = NULL;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ err = 0;
+ goto out;
+ }
+
+ if (linecard->state == DEVLINK_LINECARD_STATE_UNPROVISIONED) {
+ NL_SET_ERR_MSG(extack, "Line card is not provisioned");
+ err = 0;
+ goto out;
+ }
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONING;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+ err = linecard->ops->unprovision(linecard, linecard->priv,
+ extack);
+ if (err) {
+ /* Unprovisioning failed. Assume the linecard is unprovisioned
+ * for future operations.
+ */
+ mutex_lock(&linecard->state_lock);
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
+ linecard->type = NULL;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+ }
+ return err;
+
+out:
+ mutex_unlock(&linecard->state_lock);
+ return err;
+}
+
+int devlink_nl_linecard_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_linecard *linecard;
+ int err;
+
+ linecard = devlink_linecard_get_from_info(devlink, info);
+ if (IS_ERR(linecard))
+ return PTR_ERR(linecard);
+
+ if (info->attrs[DEVLINK_ATTR_LINECARD_TYPE]) {
+ const char *type;
+
+ type = nla_data(info->attrs[DEVLINK_ATTR_LINECARD_TYPE]);
+ if (strcmp(type, "")) {
+ err = devlink_linecard_type_set(linecard, type, extack);
+ if (err)
+ return err;
+ } else {
+ err = devlink_linecard_type_unset(linecard, extack);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int devlink_linecard_types_init(struct devlink_linecard *linecard)
+{
+	struct devlink_linecard_type *linecard_type;
+	unsigned int count;
+	unsigned int i;	/* unsigned to match count */
+
+	count = linecard->ops->types_count(linecard, linecard->priv);
+	linecard->types = kmalloc_array(count, sizeof(*linecard_type),
+					GFP_KERNEL);
+	if (!linecard->types)
+		return -ENOMEM;
+	linecard->types_count = count;
+
+	for (i = 0; i < count; i++) {
+		linecard_type = &linecard->types[i];
+		linecard->ops->types_get(linecard, linecard->priv, i,
+					 &linecard_type->type,
+					 &linecard_type->priv);
+	}
+	return 0;
+}
+
+static void devlink_linecard_types_fini(struct devlink_linecard *linecard)
+{
+ kfree(linecard->types);
+}
+
+/**
+ * devl_linecard_create - Create devlink linecard
+ *
+ * @devlink: devlink
+ * @linecard_index: driver-specific numerical identifier of the linecard
+ * @ops: linecards ops
+ * @priv: user priv pointer
+ *
+ * Create devlink linecard instance with provided linecard index.
+ * Caller can use any indexing, even hw-related one.
+ *
+ * Return: Line card structure or an ERR_PTR() encoded error code.
+ */
+struct devlink_linecard *
+devl_linecard_create(struct devlink *devlink, unsigned int linecard_index,
+ const struct devlink_linecard_ops *ops, void *priv)
+{
+ struct devlink_linecard *linecard;
+ int err;
+
+ if (WARN_ON(!ops || !ops->provision || !ops->unprovision ||
+ !ops->types_count || !ops->types_get))
+ return ERR_PTR(-EINVAL);
+
+ if (devlink_linecard_index_exists(devlink, linecard_index))
+ return ERR_PTR(-EEXIST);
+
+ linecard = kzalloc(sizeof(*linecard), GFP_KERNEL);
+ if (!linecard)
+ return ERR_PTR(-ENOMEM);
+
+ linecard->devlink = devlink;
+ linecard->index = linecard_index;
+ linecard->ops = ops;
+ linecard->priv = priv;
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
+ mutex_init(&linecard->state_lock);
+
+ err = devlink_linecard_types_init(linecard);
+ if (err) {
+ mutex_destroy(&linecard->state_lock);
+ kfree(linecard);
+ return ERR_PTR(err);
+ }
+
+ list_add_tail(&linecard->list, &devlink->linecard_list);
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ return linecard;
+}
+EXPORT_SYMBOL_GPL(devl_linecard_create);
+
+/**
+ * devl_linecard_destroy - Destroy devlink linecard
+ *
+ * @linecard: devlink linecard
+ */
+void devl_linecard_destroy(struct devlink_linecard *linecard)
+{
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_DEL);
+ list_del(&linecard->list);
+ devlink_linecard_types_fini(linecard);
+ mutex_destroy(&linecard->state_lock);
+ kfree(linecard);
+}
+EXPORT_SYMBOL_GPL(devl_linecard_destroy);
+
+/**
+ * devlink_linecard_provision_set - Set provisioning on linecard
+ *
+ * @linecard: devlink linecard
+ * @type: linecard type
+ *
+ * This is either called directly from the provision() op call or
+ * as a result of the provision() op call asynchronously.
+ */
+void devlink_linecard_provision_set(struct devlink_linecard *linecard,
+ const char *type)
+{
+ mutex_lock(&linecard->state_lock);
+ WARN_ON(linecard->type && strcmp(linecard->type, type));
+ linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED;
+ linecard->type = type;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_provision_set);
+
+/**
+ * devlink_linecard_provision_clear - Clear provisioning on linecard
+ *
+ * @linecard: devlink linecard
+ *
+ * This is either called directly from the unprovision() op call or
+ * as a result of the unprovision() op call asynchronously.
+ */
+void devlink_linecard_provision_clear(struct devlink_linecard *linecard)
+{
+ mutex_lock(&linecard->state_lock);
+ linecard->state = DEVLINK_LINECARD_STATE_UNPROVISIONED;
+ linecard->type = NULL;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_provision_clear);
+
+/**
+ * devlink_linecard_provision_fail - Fail provisioning on linecard
+ *
+ * @linecard: devlink linecard
+ *
+ * This is either called directly from the provision() op call or
+ * as a result of the provision() op call asynchronously.
+ */
+void devlink_linecard_provision_fail(struct devlink_linecard *linecard)
+{
+ mutex_lock(&linecard->state_lock);
+ linecard->state = DEVLINK_LINECARD_STATE_PROVISIONING_FAILED;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_provision_fail);
+
+/**
+ * devlink_linecard_activate - Set linecard active
+ *
+ * @linecard: devlink linecard
+ */
+void devlink_linecard_activate(struct devlink_linecard *linecard)
+{
+ mutex_lock(&linecard->state_lock);
+ WARN_ON(linecard->state != DEVLINK_LINECARD_STATE_PROVISIONED);
+ linecard->state = DEVLINK_LINECARD_STATE_ACTIVE;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_activate);
+
+/**
+ * devlink_linecard_deactivate - Set linecard inactive
+ *
+ * @linecard: devlink linecard
+ */
+void devlink_linecard_deactivate(struct devlink_linecard *linecard)
+{
+ mutex_lock(&linecard->state_lock);
+ switch (linecard->state) {
+ case DEVLINK_LINECARD_STATE_ACTIVE:
+ linecard->state = DEVLINK_LINECARD_STATE_PROVISIONED;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+ break;
+ case DEVLINK_LINECARD_STATE_UNPROVISIONING:
+ /* Line card is being deactivated as part
+ * of unprovisioning flow.
+ */
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
+ mutex_unlock(&linecard->state_lock);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_deactivate);
+
+static void devlink_linecard_rel_notify_cb(struct devlink *devlink,
+ u32 linecard_index)
+{
+ struct devlink_linecard *linecard;
+
+ linecard = devlink_linecard_get_by_index(devlink, linecard_index);
+ if (!linecard)
+ return;
+ devlink_linecard_notify(linecard, DEVLINK_CMD_LINECARD_NEW);
+}
+
+static void devlink_linecard_rel_cleanup_cb(struct devlink *devlink,
+ u32 linecard_index, u32 rel_index)
+{
+ struct devlink_linecard *linecard;
+
+ linecard = devlink_linecard_get_by_index(devlink, linecard_index);
+ if (linecard && linecard->rel_index == rel_index)
+ linecard->rel_index = 0;
+}
+
+/**
+ * devlink_linecard_nested_dl_set - Attach/detach nested devlink
+ * instance to linecard.
+ *
+ * @linecard: devlink linecard
+ * @nested_devlink: devlink instance to attach or NULL to detach
+ */
+int devlink_linecard_nested_dl_set(struct devlink_linecard *linecard,
+ struct devlink *nested_devlink)
+{
+ return devlink_rel_nested_in_add(&linecard->rel_index,
+ linecard->devlink->index,
+ linecard->index,
+ devlink_linecard_rel_notify_cb,
+ devlink_linecard_rel_cleanup_cb,
+ nested_devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_linecard_nested_dl_set);
diff --git a/net/devlink/netlink.c b/net/devlink/netlink.c
new file mode 100644
index 000000000000..593605c1b1ef
--- /dev/null
+++ b/net/devlink/netlink.c
@@ -0,0 +1,376 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include <net/genetlink.h>
+#include <net/sock.h>
+
+#include "devl_internal.h"
+
+#define DEVLINK_NL_FLAG_NEED_PORT BIT(0)
+#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1)
+#define DEVLINK_NL_FLAG_NEED_DEV_LOCK BIT(2)
+
+static const struct genl_multicast_group devlink_nl_mcgrps[] = {
+ [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME },
+};
+
+struct devlink_nl_sock_priv {
+ struct devlink_obj_desc __rcu *flt;
+ spinlock_t flt_lock; /* Protects flt. */
+};
+
+static void devlink_nl_sock_priv_init(void *priv)
+{
+ struct devlink_nl_sock_priv *sk_priv = priv;
+
+ spin_lock_init(&sk_priv->flt_lock);
+}
+
+static void devlink_nl_sock_priv_destroy(void *priv)
+{
+ struct devlink_nl_sock_priv *sk_priv = priv;
+ struct devlink_obj_desc *flt;
+
+ flt = rcu_dereference_protected(sk_priv->flt, true);
+ kfree_rcu(flt, rcu);
+}
+
+int devlink_nl_notify_filter_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink_nl_sock_priv *sk_priv;
+ struct nlattr **attrs = info->attrs;
+ struct devlink_obj_desc *flt;
+ size_t data_offset = 0;
+ size_t data_size = 0;
+ char *pos;
+
+ if (attrs[DEVLINK_ATTR_BUS_NAME])
+ data_size = size_add(data_size,
+ nla_len(attrs[DEVLINK_ATTR_BUS_NAME]) + 1);
+ if (attrs[DEVLINK_ATTR_DEV_NAME])
+ data_size = size_add(data_size,
+ nla_len(attrs[DEVLINK_ATTR_DEV_NAME]) + 1);
+
+ flt = kzalloc(size_add(sizeof(*flt), data_size), GFP_KERNEL);
+ if (!flt)
+ return -ENOMEM;
+
+ pos = (char *) flt->data;
+ if (attrs[DEVLINK_ATTR_BUS_NAME]) {
+ data_offset += nla_strscpy(pos,
+ attrs[DEVLINK_ATTR_BUS_NAME],
+ data_size) + 1;
+ flt->bus_name = pos;
+ pos += data_offset;
+ }
+ if (attrs[DEVLINK_ATTR_DEV_NAME]) {
+ nla_strscpy(pos, attrs[DEVLINK_ATTR_DEV_NAME],
+ data_size - data_offset);
+ flt->dev_name = pos;
+ }
+
+ if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ flt->port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+ flt->port_index_valid = true;
+ }
+
+ /* Don't attach empty filter. */
+ if (!flt->bus_name && !flt->dev_name && !flt->port_index_valid) {
+ kfree(flt);
+ flt = NULL;
+ }
+
+ sk_priv = genl_sk_priv_get(&devlink_nl_family, NETLINK_CB(skb).sk);
+ if (IS_ERR(sk_priv)) {
+ kfree(flt);
+ return PTR_ERR(sk_priv);
+ }
+ spin_lock(&sk_priv->flt_lock);
+ flt = rcu_replace_pointer(sk_priv->flt, flt,
+ lockdep_is_held(&sk_priv->flt_lock));
+ spin_unlock(&sk_priv->flt_lock);
+ kfree_rcu(flt, rcu);
+ return 0;
+}
+
+static bool devlink_obj_desc_match(const struct devlink_obj_desc *desc,
+ const struct devlink_obj_desc *flt)
+{
+ if (desc->bus_name && flt->bus_name &&
+ strcmp(desc->bus_name, flt->bus_name))
+ return false;
+ if (desc->dev_name && flt->dev_name &&
+ strcmp(desc->dev_name, flt->dev_name))
+ return false;
+ if (desc->port_index_valid && flt->port_index_valid &&
+ desc->port_index != flt->port_index)
+ return false;
+ return true;
+}
+
+int devlink_nl_notify_filter(struct sock *dsk, struct sk_buff *skb, void *data)
+{
+ struct devlink_obj_desc *desc = data;
+ struct devlink_nl_sock_priv *sk_priv;
+ struct devlink_obj_desc *flt;
+ int ret = 0;
+
+ rcu_read_lock();
+ sk_priv = __genl_sk_priv_get(&devlink_nl_family, dsk);
+ if (!IS_ERR_OR_NULL(sk_priv)) {
+ flt = rcu_dereference(sk_priv->flt);
+ if (flt)
+ ret = !devlink_obj_desc_match(desc, flt);
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+int devlink_nl_put_nested_handle(struct sk_buff *msg, struct net *net,
+				 struct devlink *devlink, int attrtype)
+{
+	struct nlattr *nested_attr;
+	struct net *devl_net;
+
+	nested_attr = nla_nest_start(msg, attrtype);
+	if (!nested_attr)
+		return -EMSGSIZE;
+	if (devlink_nl_put_handle(msg, devlink))
+		goto nla_put_failure;
+
+	rcu_read_lock();
+	devl_net = read_pnet_rcu(&devlink->_net);
+	if (!net_eq(net, devl_net)) {
+		int id = peernet2id_alloc(net, devl_net, GFP_ATOMIC);
+
+		rcu_read_unlock();
+		if (nla_put_s32(msg, DEVLINK_ATTR_NETNS_ID, id))
+			goto nla_put_failure;	/* cancel the open nest */
+	} else {
+		rcu_read_unlock();
+	}
+
+	nla_nest_end(msg, nested_attr);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(msg, nested_attr);
+	return -EMSGSIZE;
+}
+
+int devlink_nl_msg_reply_and_new(struct sk_buff **msg, struct genl_info *info)
+{
+ int err;
+
+ if (*msg) {
+ err = genlmsg_reply(*msg, info);
+ if (err)
+ return err;
+ }
+ *msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!*msg)
+ return -ENOMEM;
+ return 0;
+}
+
+struct devlink *
+devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs,
+ bool dev_lock)
+{
+ struct devlink *devlink;
+ unsigned long index;
+ char *busname;
+ char *devname;
+
+ if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME])
+ return ERR_PTR(-EINVAL);
+
+ busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]);
+ devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]);
+
+ devlinks_xa_for_each_registered_get(net, index, devlink) {
+ if (strcmp(devlink->dev->bus->name, busname) == 0 &&
+ strcmp(dev_name(devlink->dev), devname) == 0) {
+ devl_dev_lock(devlink, dev_lock);
+ if (devl_is_registered(devlink))
+ return devlink;
+ devl_dev_unlock(devlink, dev_lock);
+ }
+ devlink_put(devlink);
+ }
+
+ return ERR_PTR(-ENODEV);
+}
+
+static int __devlink_nl_pre_doit(struct sk_buff *skb, struct genl_info *info,
+ u8 flags)
+{
+ bool dev_lock = flags & DEVLINK_NL_FLAG_NEED_DEV_LOCK;
+ struct devlink_port *devlink_port;
+ struct devlink *devlink;
+ int err;
+
+ devlink = devlink_get_from_attrs_lock(genl_info_net(info), info->attrs,
+ dev_lock);
+ if (IS_ERR(devlink))
+ return PTR_ERR(devlink);
+
+ info->user_ptr[0] = devlink;
+ if (flags & DEVLINK_NL_FLAG_NEED_PORT) {
+ devlink_port = devlink_port_get_from_info(devlink, info);
+ if (IS_ERR(devlink_port)) {
+ err = PTR_ERR(devlink_port);
+ goto unlock;
+ }
+ info->user_ptr[1] = devlink_port;
+ } else if (flags & DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT) {
+ devlink_port = devlink_port_get_from_info(devlink, info);
+ if (!IS_ERR(devlink_port))
+ info->user_ptr[1] = devlink_port;
+ }
+ return 0;
+
+unlock:
+ devl_dev_unlock(devlink, dev_lock);
+ devlink_put(devlink);
+ return err;
+}
+
+int devlink_nl_pre_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ return __devlink_nl_pre_doit(skb, info, 0);
+}
+
+int devlink_nl_pre_doit_port(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_PORT);
+}
+
+int devlink_nl_pre_doit_dev_lock(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEV_LOCK);
+}
+
+int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
+ struct genl_info *info)
+{
+ return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT);
+}
+
+static void __devlink_nl_post_doit(struct sk_buff *skb, struct genl_info *info,
+ u8 flags)
+{
+ bool dev_lock = flags & DEVLINK_NL_FLAG_NEED_DEV_LOCK;
+ struct devlink *devlink;
+
+ devlink = info->user_ptr[0];
+ devl_dev_unlock(devlink, dev_lock);
+ devlink_put(devlink);
+}
+
+void devlink_nl_post_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ __devlink_nl_post_doit(skb, info, 0);
+}
+
+void
+devlink_nl_post_doit_dev_lock(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ __devlink_nl_post_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEV_LOCK);
+}
+
+static int devlink_nl_inst_single_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb, int flags,
+ devlink_nl_dump_one_func_t *dump_one,
+ struct nlattr **attrs)
+{
+ struct devlink *devlink;
+ int err;
+
+ devlink = devlink_get_from_attrs_lock(sock_net(msg->sk), attrs, false);
+ if (IS_ERR(devlink))
+ return PTR_ERR(devlink);
+ err = dump_one(msg, devlink, cb, flags | NLM_F_DUMP_FILTERED);
+
+ devl_unlock(devlink);
+ devlink_put(devlink);
+
+ if (err != -EMSGSIZE)
+ return err;
+ return msg->len;
+}
+
+static int devlink_nl_inst_iter_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb, int flags,
+ devlink_nl_dump_one_func_t *dump_one)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink *devlink;
+ int err = 0;
+
+ while ((devlink = devlinks_xa_find_get(sock_net(msg->sk),
+ &state->instance))) {
+ devl_lock(devlink);
+
+ if (devl_is_registered(devlink))
+ err = dump_one(msg, devlink, cb, flags);
+ else
+ err = 0;
+
+ devl_unlock(devlink);
+ devlink_put(devlink);
+
+ if (err)
+ break;
+
+ state->instance++;
+
+ /* restart sub-object walk for the next instance */
+ state->idx = 0;
+ }
+
+ if (err != -EMSGSIZE)
+ return err;
+ return msg->len;
+}
+
+int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb,
+		      devlink_nl_dump_one_func_t *dump_one)
+{
+	const struct genl_info *info = genl_info_dump(cb);
+	struct nlattr **attrs = info->attrs;
+	int flags = NLM_F_MULTI;
+
+	/* No bus/device name in the request: iterate over the instances. */
+	if (!attrs ||
+	    (!attrs[DEVLINK_ATTR_BUS_NAME] && !attrs[DEVLINK_ATTR_DEV_NAME]))
+		return devlink_nl_inst_iter_dumpit(msg, cb, flags, dump_one);
+
+	return devlink_nl_inst_single_dumpit(msg, cb, flags, dump_one, attrs);
+}
+
+struct genl_family devlink_nl_family __ro_after_init = {
+ .name = DEVLINK_GENL_NAME,
+ .version = DEVLINK_GENL_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .split_ops = devlink_nl_ops,
+ .n_split_ops = ARRAY_SIZE(devlink_nl_ops),
+ .resv_start_op = DEVLINK_CMD_SELFTESTS_RUN + 1,
+ .mcgrps = devlink_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(devlink_nl_mcgrps),
+ .sock_priv_size = sizeof(struct devlink_nl_sock_priv),
+ .sock_priv_init = devlink_nl_sock_priv_init,
+ .sock_priv_destroy = devlink_nl_sock_priv_destroy,
+};
diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c
new file mode 100644
index 000000000000..f4c61c2b4f22
--- /dev/null
+++ b/net/devlink/netlink_gen.c
@@ -0,0 +1,1287 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/devlink.yaml */
+/* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "netlink_gen.h"
+
+#include <uapi/linux/devlink.h>
+
+/* Sparse enums validation callbacks */
+static int
+devlink_attr_param_type_validate(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ switch (nla_get_u8(attr)) {
+ case DEVLINK_VAR_ATTR_TYPE_U8:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_U16:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_U32:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_U64:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_STRING:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_FLAG:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_NUL_STRING:
+ fallthrough;
+ case DEVLINK_VAR_ATTR_TYPE_BINARY:
+ return 0;
+ }
+ NL_SET_ERR_MSG_ATTR(extack, attr, "invalid enum value");
+ return -EINVAL;
+}
+
+/* Common nested types */
+const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1] = {
+ [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY, },
+ [DEVLINK_PORT_FN_ATTR_STATE] = NLA_POLICY_MAX(NLA_U8, 1),
+ [DEVLINK_PORT_FN_ATTR_OPSTATE] = NLA_POLICY_MAX(NLA_U8, 1),
+ [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15),
+};
+
+const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_RATE_TC_ATTR_BW + 1] = {
+ [DEVLINK_RATE_TC_ATTR_INDEX] = NLA_POLICY_MAX(NLA_U8, DEVLINK_RATE_TC_INDEX_MAX),
+ [DEVLINK_RATE_TC_ATTR_BW] = { .type = NLA_U32, },
+};
+
+const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = {
+ [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG, },
+};
+
+/* DEVLINK_CMD_GET - do */
+static const struct nla_policy devlink_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_PORT_GET - do */
+static const struct nla_policy devlink_port_get_do_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_PORT_GET - dump */
+static const struct nla_policy devlink_port_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_PORT_SET - do */
+static const struct nla_policy devlink_port_set_nl_policy[DEVLINK_ATTR_PORT_FUNCTION + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_PORT_TYPE] = NLA_POLICY_MAX(NLA_U16, 3),
+ [DEVLINK_ATTR_PORT_FUNCTION] = NLA_POLICY_NESTED(devlink_dl_port_function_nl_policy),
+};
+
+/* DEVLINK_CMD_PORT_NEW - do */
+static const struct nla_policy devlink_port_new_nl_policy[DEVLINK_ATTR_PORT_PCI_SF_NUMBER + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_PORT_FLAVOUR] = NLA_POLICY_MAX(NLA_U16, 7),
+ [DEVLINK_ATTR_PORT_PCI_PF_NUMBER] = { .type = NLA_U16, },
+ [DEVLINK_ATTR_PORT_PCI_SF_NUMBER] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_PORT_CONTROLLER_NUMBER] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_PORT_DEL - do */
+static const struct nla_policy devlink_port_del_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_PORT_SPLIT - do */
+static const struct nla_policy devlink_port_split_nl_policy[DEVLINK_ATTR_PORT_SPLIT_COUNT + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_PORT_UNSPLIT - do */
+static const struct nla_policy devlink_port_unsplit_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_SB_GET - do */
+static const struct nla_policy devlink_sb_get_do_nl_policy[DEVLINK_ATTR_SB_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_SB_GET - dump */
+static const struct nla_policy devlink_sb_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_SB_POOL_GET - do */
+static const struct nla_policy devlink_sb_pool_get_do_nl_policy[DEVLINK_ATTR_SB_POOL_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16, },
+};
+
+/* DEVLINK_CMD_SB_POOL_GET - dump */
+static const struct nla_policy devlink_sb_pool_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_SB_POOL_SET - do */
+static const struct nla_policy devlink_sb_pool_set_nl_policy[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16, },
+ [DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE] = NLA_POLICY_MAX(NLA_U8, 1),
+ [DEVLINK_ATTR_SB_POOL_SIZE] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_SB_PORT_POOL_GET - do */
+static const struct nla_policy devlink_sb_port_pool_get_do_nl_policy[DEVLINK_ATTR_SB_POOL_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16, },
+};
+
+/* DEVLINK_CMD_SB_PORT_POOL_GET - dump */
+static const struct nla_policy devlink_sb_port_pool_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_SB_PORT_POOL_SET - do */
+static const struct nla_policy devlink_sb_port_pool_set_nl_policy[DEVLINK_ATTR_SB_THRESHOLD + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16, },
+ [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_SB_TC_POOL_BIND_GET - do */
+static const struct nla_policy devlink_sb_tc_pool_bind_get_do_nl_policy[DEVLINK_ATTR_SB_TC_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_POOL_TYPE] = NLA_POLICY_MAX(NLA_U8, 1),
+ [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16, },
+};
+
+/* DEVLINK_CMD_SB_TC_POOL_BIND_GET - dump */
+static const struct nla_policy devlink_sb_tc_pool_bind_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_SB_TC_POOL_BIND_SET - do */
+static const struct nla_policy devlink_sb_tc_pool_bind_set_nl_policy[DEVLINK_ATTR_SB_TC_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_SB_POOL_INDEX] = { .type = NLA_U16, },
+ [DEVLINK_ATTR_SB_POOL_TYPE] = NLA_POLICY_MAX(NLA_U8, 1),
+ [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16, },
+ [DEVLINK_ATTR_SB_THRESHOLD] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_SB_OCC_SNAPSHOT - do */
+static const struct nla_policy devlink_sb_occ_snapshot_nl_policy[DEVLINK_ATTR_SB_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_SB_OCC_MAX_CLEAR - do */
+static const struct nla_policy devlink_sb_occ_max_clear_nl_policy[DEVLINK_ATTR_SB_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_SB_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_ESWITCH_GET - do */
+static const struct nla_policy devlink_eswitch_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_ESWITCH_SET - do */
+static const struct nla_policy devlink_eswitch_set_nl_policy[DEVLINK_ATTR_ESWITCH_ENCAP_MODE + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_MAX(NLA_U16, 2),
+ [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = NLA_POLICY_MAX(NLA_U8, 3),
+ [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = NLA_POLICY_MAX(NLA_U8, 1),
+};
+
+/* DEVLINK_CMD_DPIPE_TABLE_GET - do */
+static const struct nla_policy devlink_dpipe_table_get_nl_policy[DEVLINK_ATTR_DPIPE_TABLE_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_DPIPE_ENTRIES_GET - do */
+static const struct nla_policy devlink_dpipe_entries_get_nl_policy[DEVLINK_ATTR_DPIPE_TABLE_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_DPIPE_HEADERS_GET - do */
+static const struct nla_policy devlink_dpipe_headers_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET - do */
+static const struct nla_policy devlink_dpipe_table_counters_set_nl_policy[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8, },
+};
+
+/* DEVLINK_CMD_RESOURCE_SET - do */
+static const struct nla_policy devlink_resource_set_nl_policy[DEVLINK_ATTR_RESOURCE_SIZE + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64, },
+ [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64, },
+};
+
+/* DEVLINK_CMD_RESOURCE_DUMP - do */
+static const struct nla_policy devlink_resource_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_RELOAD - do */
+static const struct nla_policy devlink_reload_nl_policy[DEVLINK_ATTR_RELOAD_LIMITS + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RELOAD_ACTION] = NLA_POLICY_RANGE(NLA_U8, 1, 2),
+ [DEVLINK_ATTR_RELOAD_LIMITS] = NLA_POLICY_BITFIELD32(6),
+ [DEVLINK_ATTR_NETNS_PID] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_NETNS_FD] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_NETNS_ID] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_PARAM_GET - do */
+static const struct nla_policy devlink_param_get_do_nl_policy[DEVLINK_ATTR_PARAM_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_PARAM_GET - dump */
+static const struct nla_policy devlink_param_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_PARAM_SET - do */
+static const struct nla_policy devlink_param_set_nl_policy[DEVLINK_ATTR_PARAM_RESET_DEFAULT + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PARAM_TYPE] = NLA_POLICY_VALIDATE_FN(NLA_U8, &devlink_attr_param_type_validate),
+ [DEVLINK_ATTR_PARAM_VALUE_CMODE] = NLA_POLICY_MAX(NLA_U8, 2),
+ [DEVLINK_ATTR_PARAM_RESET_DEFAULT] = { .type = NLA_FLAG, },
+};
+
+/* DEVLINK_CMD_REGION_GET - do */
+static const struct nla_policy devlink_region_get_do_nl_policy[DEVLINK_ATTR_REGION_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_REGION_GET - dump */
+static const struct nla_policy devlink_region_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_REGION_NEW - do */
+static const struct nla_policy devlink_region_new_nl_policy[DEVLINK_ATTR_REGION_SNAPSHOT_ID + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_REGION_DEL - do */
+static const struct nla_policy devlink_region_del_nl_policy[DEVLINK_ATTR_REGION_SNAPSHOT_ID + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_REGION_READ - dump */
+static const struct nla_policy devlink_region_read_nl_policy[DEVLINK_ATTR_REGION_DIRECT + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_REGION_DIRECT] = { .type = NLA_FLAG, },
+ [DEVLINK_ATTR_REGION_CHUNK_ADDR] = { .type = NLA_U64, },
+ [DEVLINK_ATTR_REGION_CHUNK_LEN] = { .type = NLA_U64, },
+};
+
+/* DEVLINK_CMD_PORT_PARAM_GET - do */
+static const struct nla_policy devlink_port_param_get_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_PORT_PARAM_SET - do */
+static const struct nla_policy devlink_port_param_set_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_INFO_GET - do */
+static const struct nla_policy devlink_info_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_HEALTH_REPORTER_GET - do */
+static const struct nla_policy devlink_health_reporter_get_do_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_HEALTH_REPORTER_GET - dump */
+static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_HEALTH_REPORTER_SET - do */
+static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD] = { .type = NLA_U64, },
+};
+
+/* DEVLINK_CMD_HEALTH_REPORTER_RECOVER - do */
+static const struct nla_policy devlink_health_reporter_recover_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE - do */
+static const struct nla_policy devlink_health_reporter_diagnose_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET - dump */
+static const struct nla_policy devlink_health_reporter_dump_get_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR - do */
+static const struct nla_policy devlink_health_reporter_dump_clear_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_FLASH_UPDATE - do */
+static const struct nla_policy devlink_flash_update_nl_policy[DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK] = NLA_POLICY_BITFIELD32(3),
+};
+
+/* DEVLINK_CMD_TRAP_GET - do */
+static const struct nla_policy devlink_trap_get_do_nl_policy[DEVLINK_ATTR_TRAP_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_TRAP_GET - dump */
+static const struct nla_policy devlink_trap_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_TRAP_SET - do */
+static const struct nla_policy devlink_trap_set_nl_policy[DEVLINK_ATTR_TRAP_ACTION + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_TRAP_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_TRAP_ACTION] = NLA_POLICY_MAX(NLA_U8, 2),
+};
+
+/* DEVLINK_CMD_TRAP_GROUP_GET - do */
+static const struct nla_policy devlink_trap_group_get_do_nl_policy[DEVLINK_ATTR_TRAP_GROUP_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_TRAP_GROUP_GET - dump */
+static const struct nla_policy devlink_trap_group_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_TRAP_GROUP_SET - do */
+static const struct nla_policy devlink_trap_group_set_nl_policy[DEVLINK_ATTR_TRAP_POLICER_ID + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_TRAP_GROUP_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_TRAP_ACTION] = NLA_POLICY_MAX(NLA_U8, 2),
+ [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_TRAP_POLICER_GET - do */
+static const struct nla_policy devlink_trap_policer_get_do_nl_policy[DEVLINK_ATTR_TRAP_POLICER_ID + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_TRAP_POLICER_GET - dump */
+static const struct nla_policy devlink_trap_policer_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_TRAP_POLICER_SET - do */
+static const struct nla_policy devlink_trap_policer_set_nl_policy[DEVLINK_ATTR_TRAP_POLICER_BURST + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_TRAP_POLICER_ID] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_TRAP_POLICER_RATE] = { .type = NLA_U64, },
+ [DEVLINK_ATTR_TRAP_POLICER_BURST] = { .type = NLA_U64, },
+};
+
+/* DEVLINK_CMD_HEALTH_REPORTER_TEST - do */
+static const struct nla_policy devlink_health_reporter_test_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_RATE_GET - do */
+static const struct nla_policy devlink_rate_get_do_nl_policy[DEVLINK_ATTR_RATE_NODE_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_RATE_GET - dump */
+static const struct nla_policy devlink_rate_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_RATE_SET - do */
+static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64, },
+ [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64, },
+ [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy),
+};
+
+/* DEVLINK_CMD_RATE_NEW - do */
+static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_TX_SHARE] = { .type = NLA_U64, },
+ [DEVLINK_ATTR_RATE_TX_MAX] = { .type = NLA_U64, },
+ [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy),
+};
+
+/* DEVLINK_CMD_RATE_DEL - do */
+static const struct nla_policy devlink_rate_del_nl_policy[DEVLINK_ATTR_RATE_NODE_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_LINECARD_GET - do */
+static const struct nla_policy devlink_linecard_get_do_nl_policy[DEVLINK_ATTR_LINECARD_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32, },
+};
+
+/* DEVLINK_CMD_LINECARD_GET - dump */
+static const struct nla_policy devlink_linecard_get_dump_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_LINECARD_SET - do */
+static const struct nla_policy devlink_linecard_set_nl_policy[DEVLINK_ATTR_LINECARD_TYPE + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32, },
+ [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_SELFTESTS_GET - do */
+static const struct nla_policy devlink_selftests_get_nl_policy[DEVLINK_ATTR_DEV_NAME + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+};
+
+/* DEVLINK_CMD_SELFTESTS_RUN - do */
+static const struct nla_policy devlink_selftests_run_nl_policy[DEVLINK_ATTR_SELFTESTS + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_SELFTESTS] = NLA_POLICY_NESTED(devlink_dl_selftest_id_nl_policy),
+};
+
+/* DEVLINK_CMD_NOTIFY_FILTER_SET - do */
+static const struct nla_policy devlink_notify_filter_set_nl_policy[DEVLINK_ATTR_PORT_INDEX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
+};
+
+/* Ops table for devlink */
+const struct genl_split_ops devlink_nl_ops[74] = {
+ {
+ .cmd = DEVLINK_CMD_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_GET,
+ .validate = GENL_DONT_VALIDATE_DUMP,
+ .dumpit = devlink_nl_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_port_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_port_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_INDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_GET,
+ .dumpit = devlink_nl_port_get_dumpit,
+ .policy = devlink_port_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_port_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_port_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_FUNCTION,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_NEW,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_port_new_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_port_new_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_PCI_SF_NUMBER,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_DEL,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_port_del_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_port_del_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_INDEX,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_SPLIT,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_port_split_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_port_split_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_SPLIT_COUNT,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_UNSPLIT,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_port_unsplit_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_port_unsplit_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_INDEX,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_sb_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_INDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_GET,
+ .dumpit = devlink_nl_sb_get_dumpit,
+ .policy = devlink_sb_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_POOL_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_sb_pool_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_pool_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_POOL_INDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_POOL_GET,
+ .dumpit = devlink_nl_sb_pool_get_dumpit,
+ .policy = devlink_sb_pool_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_POOL_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_sb_pool_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_pool_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_PORT_POOL_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_sb_port_pool_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_port_pool_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_POOL_INDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_PORT_POOL_GET,
+ .dumpit = devlink_nl_sb_port_pool_get_dumpit,
+ .policy = devlink_sb_port_pool_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_PORT_POOL_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_sb_port_pool_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_port_pool_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_THRESHOLD,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_sb_tc_pool_bind_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_tc_pool_bind_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_TC_INDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_GET,
+ .dumpit = devlink_nl_sb_tc_pool_bind_get_dumpit,
+ .policy = devlink_sb_tc_pool_bind_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_TC_POOL_BIND_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_sb_tc_pool_bind_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_tc_pool_bind_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_TC_INDEX,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_OCC_SNAPSHOT,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_sb_occ_snapshot_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_occ_snapshot_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_INDEX,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_sb_occ_max_clear_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_sb_occ_max_clear_nl_policy,
+ .maxattr = DEVLINK_ATTR_SB_INDEX,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_ESWITCH_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_eswitch_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_eswitch_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_ESWITCH_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_eswitch_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_eswitch_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_ESWITCH_ENCAP_MODE,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_DPIPE_TABLE_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_dpipe_table_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_dpipe_table_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_DPIPE_TABLE_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_dpipe_entries_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_dpipe_entries_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_DPIPE_TABLE_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_DPIPE_HEADERS_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_dpipe_headers_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_dpipe_headers_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_dpipe_table_counters_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_dpipe_table_counters_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_RESOURCE_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_resource_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_resource_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_RESOURCE_SIZE,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_RESOURCE_DUMP,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_resource_dump_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_resource_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_RELOAD,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_dev_lock,
+ .doit = devlink_nl_reload_doit,
+ .post_doit = devlink_nl_post_doit_dev_lock,
+ .policy = devlink_reload_nl_policy,
+ .maxattr = DEVLINK_ATTR_RELOAD_LIMITS,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_PARAM_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_param_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_param_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_PARAM_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_PARAM_GET,
+ .dumpit = devlink_nl_param_get_dumpit,
+ .policy = devlink_param_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_PARAM_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_param_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_param_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_PARAM_RESET_DEFAULT,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_region_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_region_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_REGION_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_GET,
+ .dumpit = devlink_nl_region_get_dumpit,
+ .policy = devlink_region_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_NEW,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_region_new_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_region_new_nl_policy,
+ .maxattr = DEVLINK_ATTR_REGION_SNAPSHOT_ID,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_DEL,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_region_del_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_region_del_nl_policy,
+ .maxattr = DEVLINK_ATTR_REGION_SNAPSHOT_ID,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_REGION_READ,
+ .validate = GENL_DONT_VALIDATE_DUMP_STRICT,
+ .dumpit = devlink_nl_region_read_dumpit,
+ .policy = devlink_region_read_nl_policy,
+ .maxattr = DEVLINK_ATTR_REGION_DIRECT,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_PARAM_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_port_param_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_port_param_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_INDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_PARAM_GET,
+ .validate = GENL_DONT_VALIDATE_DUMP_STRICT,
+ .dumpit = devlink_nl_port_param_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_PARAM_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port,
+ .doit = devlink_nl_port_param_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_port_param_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_INDEX,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_INFO_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_info_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_info_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_INFO_GET,
+ .validate = GENL_DONT_VALIDATE_DUMP,
+ .dumpit = devlink_nl_info_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_health_reporter_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_health_reporter_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET,
+ .dumpit = devlink_nl_health_reporter_get_dumpit,
+ .policy = devlink_health_reporter_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_INDEX,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_health_reporter_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_health_reporter_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_health_reporter_recover_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_health_reporter_recover_nl_policy,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_health_reporter_diagnose_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_health_reporter_diagnose_nl_policy,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
+ .validate = GENL_DONT_VALIDATE_DUMP_STRICT,
+ .dumpit = devlink_nl_health_reporter_dump_get_dumpit,
+ .policy = devlink_health_reporter_dump_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_health_reporter_dump_clear_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_health_reporter_dump_clear_nl_policy,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_FLASH_UPDATE,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_flash_update_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_flash_update_nl_policy,
+ .maxattr = DEVLINK_ATTR_FLASH_UPDATE_OVERWRITE_MASK,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_trap_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_trap_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_TRAP_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_GET,
+ .dumpit = devlink_nl_trap_get_dumpit,
+ .policy = devlink_trap_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_trap_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_trap_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_TRAP_ACTION,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_GROUP_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_trap_group_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_trap_group_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_TRAP_GROUP_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_GROUP_GET,
+ .dumpit = devlink_nl_trap_group_get_dumpit,
+ .policy = devlink_trap_group_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_GROUP_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_trap_group_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_trap_group_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_TRAP_POLICER_ID,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_POLICER_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_trap_policer_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_trap_policer_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_TRAP_POLICER_ID,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_POLICER_GET,
+ .dumpit = devlink_nl_trap_policer_get_dumpit,
+ .policy = devlink_trap_policer_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_TRAP_POLICER_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_trap_policer_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_trap_policer_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_TRAP_POLICER_BURST,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_HEALTH_REPORTER_TEST,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit_port_optional,
+ .doit = devlink_nl_health_reporter_test_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_health_reporter_test_nl_policy,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_NAME,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_RATE_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_rate_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_rate_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_RATE_NODE_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_RATE_GET,
+ .dumpit = devlink_nl_rate_get_dumpit,
+ .policy = devlink_rate_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_RATE_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_rate_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_rate_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_RATE_TC_BWS,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_RATE_NEW,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_rate_new_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_rate_new_nl_policy,
+ .maxattr = DEVLINK_ATTR_RATE_TC_BWS,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_RATE_DEL,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_rate_del_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_rate_del_nl_policy,
+ .maxattr = DEVLINK_ATTR_RATE_NODE_NAME,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_LINECARD_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_linecard_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_linecard_get_do_nl_policy,
+ .maxattr = DEVLINK_ATTR_LINECARD_INDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_LINECARD_GET,
+ .dumpit = devlink_nl_linecard_get_dumpit,
+ .policy = devlink_linecard_get_dump_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_LINECARD_SET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_linecard_set_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_linecard_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_LINECARD_TYPE,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SELFTESTS_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_selftests_get_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_selftests_get_nl_policy,
+ .maxattr = DEVLINK_ATTR_DEV_NAME,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_SELFTESTS_GET,
+ .validate = GENL_DONT_VALIDATE_DUMP,
+ .dumpit = devlink_nl_selftests_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = DEVLINK_CMD_SELFTESTS_RUN,
+ .validate = GENL_DONT_VALIDATE_STRICT,
+ .pre_doit = devlink_nl_pre_doit,
+ .doit = devlink_nl_selftests_run_doit,
+ .post_doit = devlink_nl_post_doit,
+ .policy = devlink_selftests_run_nl_policy,
+ .maxattr = DEVLINK_ATTR_SELFTESTS,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = DEVLINK_CMD_NOTIFY_FILTER_SET,
+ .doit = devlink_nl_notify_filter_set_doit,
+ .policy = devlink_notify_filter_set_nl_policy,
+ .maxattr = DEVLINK_ATTR_PORT_INDEX,
+ .flags = GENL_CMD_CAP_DO,
+ },
+};
diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h
new file mode 100644
index 000000000000..2817d53a0eba
--- /dev/null
+++ b/net/devlink/netlink_gen.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/devlink.yaml */
+/* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#ifndef _LINUX_DEVLINK_GEN_H
+#define _LINUX_DEVLINK_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/devlink.h>
+
+/* Common nested types: nla_policy tables, each sized <attr constant> + 1 */
+extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1];
+extern const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_RATE_TC_ATTR_BW + 1];
+extern const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1];
+
+/* Ops table for devlink: 74 genl_split_ops entries, defined elsewhere */
+extern const struct genl_split_ops devlink_nl_ops[74];
+/* pre_doit/post_doit hooks that the ops table entries point at */
+int devlink_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_pre_doit_port(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_pre_doit_dev_lock(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
+ struct genl_info *info);
+void
+devlink_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+ struct genl_info *info);
+void
+devlink_nl_post_doit_dev_lock(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info);
+/* Command handlers: doit takes genl_info, dumpit takes netlink_callback */
+int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_port_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_port_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_port_new_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_port_del_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_port_split_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_port_unsplit_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_sb_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_sb_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int devlink_nl_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_sb_pool_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_sb_pool_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_sb_port_pool_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_sb_port_pool_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_sb_port_pool_set_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_sb_tc_pool_bind_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_sb_tc_pool_bind_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_sb_tc_pool_bind_set_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_sb_occ_snapshot_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_sb_occ_max_clear_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_dpipe_table_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_dpipe_entries_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_dpipe_headers_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_dpipe_table_counters_set_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_resource_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_resource_dump_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_reload_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_param_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_param_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_param_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_region_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_region_new_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_region_del_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_region_read_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_port_param_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_port_param_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_port_param_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_info_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_info_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_health_reporter_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_health_reporter_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_health_reporter_set_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_health_reporter_recover_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_health_reporter_diagnose_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_health_reporter_dump_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_health_reporter_dump_clear_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_flash_update_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_trap_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_trap_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_trap_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_trap_group_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_trap_group_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_trap_group_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_trap_policer_get_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_trap_policer_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_trap_policer_set_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_health_reporter_test_doit(struct sk_buff *skb,
+ struct genl_info *info);
+int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_rate_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_rate_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_rate_new_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_rate_del_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_linecard_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_linecard_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_linecard_set_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_selftests_get_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_selftests_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb);
+int devlink_nl_selftests_run_doit(struct sk_buff *skb, struct genl_info *info);
+int devlink_nl_notify_filter_set_doit(struct sk_buff *skb,
+ struct genl_info *info);
+
+#endif /* _LINUX_DEVLINK_GEN_H */
diff --git a/net/devlink/param.c b/net/devlink/param.c
new file mode 100644
index 000000000000..e0ea93eded43
--- /dev/null
+++ b/net/devlink/param.c
@@ -0,0 +1,950 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include "devl_internal.h"
+
+static const struct devlink_param devlink_param_generic[] = { /* positional entries; lookups below index this table by param id */
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET,
+ .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME,
+ .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS,
+ .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME,
+ .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT,
+ .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME,
+ .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI,
+ .name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME,
+ .type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX,
+ .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME,
+ .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN,
+ .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME,
+ .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY,
+ .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME,
+ .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE,
+ .name = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME,
+ .type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE,
+ .name = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_NAME,
+ .type = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE,
+ .name = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME,
+ .type = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC,
+ .name = DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME,
+ .type = DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_CLOCK_ID,
+ .name = DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME,
+ .type = DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS,
+ .name = DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME,
+ .type = DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS,
+ .name = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME,
+ .type = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_MAX_MAC_PER_VF,
+ .name = DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_NAME,
+ .type = DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_TYPE,
+ },
+};
+
+static int devlink_param_generic_verify(const struct devlink_param *param)
+{
+ /* verify it matches the generic parameter table entry by id and name */
+ if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX)
+ return -EINVAL; /* id is outside the generic id range */
+ if (strcmp(param->name, devlink_param_generic[param->id].name))
+ return -ENOENT; /* name differs from the table entry for this id */
+
+ WARN_ON(param->type != devlink_param_generic[param->id].type); /* type mismatch warns but is not an error */
+
+ return 0;
+}
+
+static int devlink_param_driver_verify(const struct devlink_param *param)
+{ /* Check that a driver-specific param does not clash with the generic ones. */
+ int i;
+
+ if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX)
+ return -EINVAL; /* id falls inside the generic id range */
+ /* verify the name is not already used by a generic param */
+ for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++)
+ if (!strcmp(param->name, devlink_param_generic[i].name))
+ return -EEXIST;
+
+ return 0;
+}
+
+static struct devlink_param_item *
+devlink_param_find_by_name(struct xarray *params, const char *param_name)
+{ /* Return the item whose param name equals param_name, or NULL. */
+ struct devlink_param_item *param_item;
+ unsigned long param_id;
+
+ xa_for_each(params, param_id, param_item) { /* visits every stored item */
+ if (!strcmp(param_item->param->name, param_name))
+ return param_item;
+ }
+ return NULL; /* no item carries that name */
+}
+
+static struct devlink_param_item *
+devlink_param_find_by_id(struct xarray *params, u32 param_id)
+{
+ return xa_load(params, param_id); /* items are stored under their param id */
+}
+
+static bool
+devlink_param_cmode_is_supported(const struct devlink_param *param,
+ enum devlink_param_cmode cmode)
+{
+ return test_bit(cmode, &param->supported_cmodes); /* supported_cmodes holds one bit per cmode */
+}
+
+static int devlink_param_get(struct devlink *devlink,
+ const struct devlink_param *param,
+ struct devlink_param_gset_ctx *ctx,
+ struct netlink_ext_ack *extack)
+{
+ if (!param->get)
+ return -EOPNOTSUPP; /* param has no get callback */
+ return param->get(devlink, param->id, ctx, extack); /* callback result is returned unchanged */
+}
+
+static int devlink_param_set(struct devlink *devlink,
+ const struct devlink_param *param,
+ struct devlink_param_gset_ctx *ctx,
+ struct netlink_ext_ack *extack)
+{
+ if (!param->set)
+ return -EOPNOTSUPP; /* param has no set callback */
+ return param->set(devlink, param->id, ctx, extack); /* callback result is returned unchanged */
+}
+
+static int devlink_param_get_default(struct devlink *devlink,
+ const struct devlink_param *param,
+ struct devlink_param_gset_ctx *ctx,
+ struct netlink_ext_ack *extack)
+{
+ if (!param->get_default)
+ return -EOPNOTSUPP; /* param has no get_default callback */
+
+ return param->get_default(devlink, param->id, ctx, extack); /* callback result is returned unchanged */
+}
+
+static int devlink_param_reset_default(struct devlink *devlink,
+ const struct devlink_param *param,
+ enum devlink_param_cmode cmode,
+ struct netlink_ext_ack *extack)
+{
+ if (!param->reset_default)
+ return -EOPNOTSUPP; /* param has no reset_default callback */
+
+ return param->reset_default(devlink, param->id, cmode, extack); /* callback result is returned unchanged */
+}
+
+static int
+devlink_nl_param_value_put(struct sk_buff *msg, enum devlink_param_type type,
+ int nla_type, union devlink_param_value val,
+ bool flag_as_u8)
+{ /* Append val to msg as attribute nla_type, encoded according to type. */
+ switch (type) {
+ case DEVLINK_PARAM_TYPE_U8:
+ if (nla_put_u8(msg, nla_type, val.vu8))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PARAM_TYPE_U16:
+ if (nla_put_u16(msg, nla_type, val.vu16))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PARAM_TYPE_U32:
+ if (nla_put_u32(msg, nla_type, val.vu32))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PARAM_TYPE_U64:
+ if (devlink_nl_put_u64(msg, nla_type, val.vu64))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PARAM_TYPE_STRING:
+ if (nla_put_string(msg, nla_type, val.vstr))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PARAM_TYPE_BOOL:
+ /* default values of type bool are encoded with u8, so that
+ * false can be distinguished from not present
+ */
+ if (flag_as_u8) {
+ if (nla_put_u8(msg, nla_type, val.vbool))
+ return -EMSGSIZE;
+ } else {
+ if (val.vbool && nla_put_flag(msg, nla_type)) /* false: the attribute is simply omitted */
+ return -EMSGSIZE;
+ }
+ break;
+ }
+ return 0; /* also reached for a type not listed above, with nothing written */
+}
+
+static int
+devlink_nl_param_value_fill_one(struct sk_buff *msg,
+ enum devlink_param_type type,
+ enum devlink_param_cmode cmode,
+ union devlink_param_value val,
+ union devlink_param_value default_val,
+ bool has_default)
+{ /* Emit one DEVLINK_ATTR_PARAM_VALUE nest: cmode, value and optional default. */
+ struct nlattr *param_value_attr;
+ int err = -EMSGSIZE; /* returned when the cmode attribute does not fit */
+
+ param_value_attr = nla_nest_start_noflag(msg,
+ DEVLINK_ATTR_PARAM_VALUE);
+ if (!param_value_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode))
+ goto value_nest_cancel;
+
+ err = devlink_nl_param_value_put(msg, type,
+ DEVLINK_ATTR_PARAM_VALUE_DATA,
+ val, false); /* current value: bool goes out as a flag */
+ if (err)
+ goto value_nest_cancel;
+
+ if (has_default) {
+ err = devlink_nl_param_value_put(msg, type,
+ DEVLINK_ATTR_PARAM_VALUE_DEFAULT,
+ default_val, true); /* default value: bool goes out as u8 */
+ if (err)
+ goto value_nest_cancel;
+ }
+
+ nla_nest_end(msg, param_value_attr);
+ return 0;
+
+value_nest_cancel:
+ nla_nest_cancel(msg, param_value_attr); /* drop the partially built nest */
+ return err;
+}
+
+static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink,
+ unsigned int port_index,
+ struct devlink_param_item *param_item,
+ enum devlink_command cmd,
+ u32 portid, u32 seq, int flags,
+ struct netlink_ext_ack *extack)
+{ /* Build one param message in msg: header, handle, name/type and a value nest per cmode. */
+ union devlink_param_value default_value[DEVLINK_PARAM_CMODE_MAX + 1];
+ union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1];
+ bool default_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {};
+ bool param_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {};
+ const struct devlink_param *param = param_item->param;
+ struct devlink_param_gset_ctx ctx;
+ struct nlattr *param_values_list;
+ struct nlattr *param_attr;
+ void *hdr;
+ int err;
+ int i;
+
+ /* Collect the current and, when available, default value of every supported cmode */
+ for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
+ if (!devlink_param_cmode_is_supported(param, i))
+ continue;
+ if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) { /* driverinit values live in param_item, not in the driver */
+ if (param_item->driverinit_value_new_valid)
+ param_value[i] = param_item->driverinit_value_new;
+ else if (param_item->driverinit_value_valid)
+ param_value[i] = param_item->driverinit_value;
+ else
+ return -EOPNOTSUPP; /* no driverinit value to report yet */
+
+ if (param_item->driverinit_value_valid) {
+ default_value[i] = param_item->driverinit_default;
+ default_value_set[i] = true;
+ }
+ } else {
+ ctx.cmode = i;
+ err = devlink_param_get(devlink, param, &ctx, extack);
+ if (err)
+ return err;
+ param_value[i] = ctx.val;
+
+ err = devlink_param_get_default(devlink, param, &ctx,
+ extack);
+ if (!err) {
+ default_value[i] = ctx.val;
+ default_value_set[i] = true;
+ } else if (err != -EOPNOTSUPP) { /* -EOPNOTSUPP just means no default is reported */
+ return err;
+ }
+ }
+ param_value_set[i] = true;
+ }
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto genlmsg_cancel;
+
+ if (cmd == DEVLINK_CMD_PORT_PARAM_GET ||
+ cmd == DEVLINK_CMD_PORT_PARAM_NEW ||
+ cmd == DEVLINK_CMD_PORT_PARAM_DEL)
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, port_index)) /* port index only for port param commands */
+ goto genlmsg_cancel;
+
+ param_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PARAM);
+ if (!param_attr)
+ goto genlmsg_cancel;
+ if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name))
+ goto param_nest_cancel;
+ if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC))
+ goto param_nest_cancel;
+ if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, param->type))
+ goto param_nest_cancel;
+
+ param_values_list = nla_nest_start_noflag(msg,
+ DEVLINK_ATTR_PARAM_VALUES_LIST);
+ if (!param_values_list)
+ goto param_nest_cancel;
+
+ for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) {
+ if (!param_value_set[i])
+ continue;
+ err = devlink_nl_param_value_fill_one(msg, param->type,
+ i, param_value[i],
+ default_value[i],
+ default_value_set[i]);
+ if (err)
+ goto values_list_nest_cancel;
+ }
+
+ nla_nest_end(msg, param_values_list);
+ nla_nest_end(msg, param_attr);
+ genlmsg_end(msg, hdr);
+ return 0;
+
+values_list_nest_cancel:
+ nla_nest_cancel(msg, param_values_list); /* cancel, not end: this is the error unwind path */
+param_nest_cancel:
+ nla_nest_cancel(msg, param_attr);
+genlmsg_cancel:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE; /* every failure after genlmsg_put() is reported as lack of room */
+}
+
+static void devlink_param_notify(struct devlink *devlink,
+ unsigned int port_index,
+ struct devlink_param_item *param_item,
+ enum devlink_command cmd)
+{ /* Send a NEW/DEL notification describing param_item; failures are silent. */
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL &&
+ cmd != DEVLINK_CMD_PORT_PARAM_NEW &&
+ cmd != DEVLINK_CMD_PORT_PARAM_DEL);
+
+ /* devlink_notify_register() / devlink_notify_unregister()
+ * will replay the notifications if the params are added/removed
+ * outside of the lifetime of the instance.
+ */
+ if (!devl_is_registered(devlink) || !devlink_nl_notify_need(devlink))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return; /* allocation failed: the notification is dropped */
+ err = devlink_nl_param_fill(msg, devlink, port_index, param_item, cmd,
+ 0, 0, 0, NULL);
+ if (err) {
+ nlmsg_free(msg); /* fill failed: the notification is dropped */
+ return;
+ }
+
+ devlink_nl_notify_send(devlink, msg);
+}
+
+static void devlink_params_notify(struct devlink *devlink,
+ enum devlink_command cmd)
+{
+ struct devlink_param_item *param_item;
+ unsigned long param_id;
+
+ xa_for_each(&devlink->params, param_id, param_item) /* one notification per stored param */
+ devlink_param_notify(devlink, 0, param_item, cmd);
+}
+
+void devlink_params_notify_register(struct devlink *devlink)
+{
+ devlink_params_notify(devlink, DEVLINK_CMD_PARAM_NEW); /* announce every stored param as NEW */
+}
+
+void devlink_params_notify_unregister(struct devlink *devlink)
+{
+ devlink_params_notify(devlink, DEVLINK_CMD_PARAM_DEL); /* announce every stored param as DEL */
+}
+
+static int devlink_nl_param_get_dump_one(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct netlink_callback *cb,
+ int flags)
+{ /* Add a PARAM_GET message to msg for each param of devlink, starting at state->idx. */
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_param_item *param_item;
+ unsigned long param_id;
+ int err = 0;
+
+ xa_for_each_start(&devlink->params, param_id, param_item, state->idx) {
+ err = devlink_nl_param_fill(msg, devlink, 0, param_item,
+ DEVLINK_CMD_PARAM_GET,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags,
+ cb->extack);
+ if (err == -EOPNOTSUPP) {
+ err = 0; /* this param cannot be reported: leave it out and go on */
+ } else if (err) {
+ state->idx = param_id; /* the loop above starts from state->idx */
+ break;
+ }
+ }
+
+ return err;
+}
+
+int devlink_nl_param_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_param_get_dump_one); /* per-devlink work is done by the callback */
+}
+
+static int
+devlink_param_type_get_from_info(struct genl_info *info,
+ enum devlink_param_type *param_type)
+{ /* Read the DEVLINK_ATTR_PARAM_TYPE attribute of the request into *param_type. */
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE))
+ return -EINVAL; /* the attribute is required */
+
+ *param_type = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE]);
+
+ return 0;
+}
+
+static int
+devlink_param_value_get_from_info(const struct devlink_param *param,
+ struct genl_info *info,
+ union devlink_param_value *value)
+{ /* Parse DEVLINK_ATTR_PARAM_VALUE_DATA of the request into *value according to param->type. */
+ struct nlattr *param_data;
+ int len;
+
+ param_data = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA];
+
+ if (param->type != DEVLINK_PARAM_TYPE_BOOL && !param_data)
+ return -EINVAL; /* only a bool may come without the data attribute */
+
+ switch (param->type) {
+ case DEVLINK_PARAM_TYPE_U8:
+ if (nla_len(param_data) != sizeof(u8))
+ return -EINVAL;
+ value->vu8 = nla_get_u8(param_data);
+ break;
+ case DEVLINK_PARAM_TYPE_U16:
+ if (nla_len(param_data) != sizeof(u16))
+ return -EINVAL;
+ value->vu16 = nla_get_u16(param_data);
+ break;
+ case DEVLINK_PARAM_TYPE_U32:
+ if (nla_len(param_data) != sizeof(u32))
+ return -EINVAL;
+ value->vu32 = nla_get_u32(param_data);
+ break;
+ case DEVLINK_PARAM_TYPE_U64:
+ if (nla_len(param_data) != sizeof(u64))
+ return -EINVAL;
+ value->vu64 = nla_get_u64(param_data);
+ break;
+ case DEVLINK_PARAM_TYPE_STRING:
+ len = strnlen(nla_data(param_data), nla_len(param_data));
+ if (len == nla_len(param_data) || /* no NUL byte inside the attribute */
+ len >= __DEVLINK_PARAM_MAX_STRING_VALUE)
+ return -EINVAL;
+ strcpy(value->vstr, nla_data(param_data)); /* NUL-terminated and length-checked above */
+ break;
+ case DEVLINK_PARAM_TYPE_BOOL:
+ if (param_data && nla_len(param_data))
+ return -EINVAL; /* a present attribute must carry no payload */
+ value->vbool = nla_get_flag(param_data);
+ break;
+ }
+ return 0;
+}
+
+static struct devlink_param_item *
+devlink_param_get_from_info(struct xarray *params, struct genl_info *info)
+{ /* Find the item named by DEVLINK_ATTR_PARAM_NAME; NULL if the attribute or the param is missing. */
+ char *param_name;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_NAME))
+ return NULL;
+
+ param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]);
+ return devlink_param_find_by_name(params, param_name);
+}
+
+int devlink_nl_param_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{ /* Reply to a param get request with a single PARAM_GET message. */
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_param_item *param_item;
+ struct sk_buff *msg;
+ int err;
+
+ param_item = devlink_param_get_from_info(&devlink->params, info);
+ if (!param_item)
+ return -EINVAL; /* name attribute missing or no such param */
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_param_fill(msg, devlink, 0, param_item,
+ DEVLINK_CMD_PARAM_GET, info->snd_portid,
+ info->snd_seq, 0, info->extack);
+ if (err) {
+ nlmsg_free(msg); /* nothing is sent on failure */
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink,
+ unsigned int port_index,
+ struct xarray *params,
+ struct genl_info *info,
+ enum devlink_command cmd)
+{ /* Apply a set or reset-to-default request to one param, then notify with cmd. */
+ enum devlink_param_type param_type;
+ struct devlink_param_gset_ctx ctx;
+ enum devlink_param_cmode cmode;
+ struct devlink_param_item *param_item;
+ const struct devlink_param *param;
+ union devlink_param_value value = {}; /* zeroed: it is copied into ctx.val even when resetting */
+ bool reset_default;
+ int err = 0;
+
+ param_item = devlink_param_get_from_info(params, info);
+ if (!param_item)
+ return -EINVAL;
+ param = param_item->param;
+ err = devlink_param_type_get_from_info(info, &param_type);
+ if (err)
+ return err;
+ if (param_type != param->type)
+ return -EINVAL; /* request type must match the param's type */
+
+ reset_default = info->attrs[DEVLINK_ATTR_PARAM_RESET_DEFAULT];
+ if (!reset_default) { /* a value is parsed and validated only for a plain set */
+ err = devlink_param_value_get_from_info(param, info, &value);
+ if (err)
+ return err;
+ if (param->validate) {
+ err = param->validate(devlink, param->id, value,
+ info->extack);
+ if (err)
+ return err;
+ }
+ }
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_VALUE_CMODE))
+ return -EINVAL;
+ cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]);
+ if (!devlink_param_cmode_is_supported(param, cmode))
+ return -EOPNOTSUPP;
+
+ if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) {
+ if (reset_default) {
+ if (!param_item->driverinit_value_valid) {
+ NL_SET_ERR_MSG(info->extack,
+ "Default value not available");
+ return -EOPNOTSUPP;
+ }
+ value = param_item->driverinit_default;
+ }
+
+ param_item->driverinit_value_new = value; /* stored as pending; no driver callback is invoked here */
+ param_item->driverinit_value_new_valid = true;
+ } else {
+ if (!param->set)
+ return -EOPNOTSUPP; /* note: this also gates the reset_default path below */
+ ctx.val = value;
+ ctx.cmode = cmode;
+ if (reset_default)
+ err = devlink_param_reset_default(devlink, param, cmode,
+ info->extack);
+ else
+ err = devlink_param_set(devlink, param, &ctx,
+ info->extack);
+ if (err)
+ return err;
+ }
+
+ devlink_param_notify(devlink, port_index, param_item, cmd);
+ return 0;
+}
+
+int devlink_nl_param_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+
+ return __devlink_nl_cmd_param_set_doit(devlink, 0, &devlink->params, /* device-level params, port index 0 */
+ info, DEVLINK_CMD_PARAM_NEW); /* command used for the resulting notification */
+}
+
+int devlink_nl_port_param_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ NL_SET_ERR_MSG(cb->extack, "Port params are not supported");
+ return msg->len; /* nothing is added to msg here */
+}
+
+int devlink_nl_port_param_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ NL_SET_ERR_MSG(info->extack, "Port params are not supported");
+ return -EINVAL; /* always fails: the request is never processed */
+}
+
+int devlink_nl_port_param_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ NL_SET_ERR_MSG(info->extack, "Port params are not supported");
+ return -EINVAL; /* always fails: the request is never processed */
+}
+
+static int devlink_param_verify(const struct devlink_param *param)
+{
+ if (!param || !param->name || !param->supported_cmodes)
+ return -EINVAL; /* a param needs a name and at least one supported cmode */
+ if (param->generic)
+ return devlink_param_generic_verify(param); /* must match the generic table */
+ else
+ return devlink_param_driver_verify(param); /* must not clash with the generic table */
+}
+
+static int devlink_param_register(struct devlink *devlink,
+ const struct devlink_param *param)
+{ /* Allocate an item for param, store it under param->id and send a NEW notification. */
+ struct devlink_param_item *param_item;
+ int err;
+
+ WARN_ON(devlink_param_verify(param)); /* warns only; registration still proceeds */
+ WARN_ON(devlink_param_find_by_name(&devlink->params, param->name)); /* name already stored */
+
+ if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT))
+ WARN_ON(param->get || param->set); /* driverinit-only params are expected to have neither callback */
+ else
+ WARN_ON(!param->get || !param->set); /* all other params are expected to have both */
+
+ param_item = kzalloc(sizeof(*param_item), GFP_KERNEL);
+ if (!param_item)
+ return -ENOMEM;
+
+ param_item->param = param; /* the pointer is kept, the param is not copied */
+
+ err = xa_insert(&devlink->params, param->id, param_item, GFP_KERNEL);
+ if (err)
+ goto err_xa_insert;
+
+ devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW);
+ return 0;
+
+err_xa_insert:
+ kfree(param_item);
+ return err;
+}
+
+static void devlink_param_unregister(struct devlink *devlink,
+ const struct devlink_param *param)
+{
+ struct devlink_param_item *param_item;
+
+ param_item = devlink_param_find_by_id(&devlink->params, param->id);
+ if (WARN_ON(!param_item))
+ return; /* nothing is stored under this id */
+ devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_DEL); /* sent while the item still exists */
+ xa_erase(&devlink->params, param->id);
+ kfree(param_item);
+}
+
+/**
+ * devl_params_register - register configuration parameters
+ *
+ * @devlink: devlink
+ * @params: configuration parameters array
+ * @params_count: number of parameters provided
+ *
+ * Register the driver's parameters; on failure the earlier ones are unregistered.
+ */
+int devl_params_register(struct devlink *devlink,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ const struct devlink_param *param = params;
+ int i, err;
+
+ lockdep_assert_held(&devlink->lock); /* caller must hold devlink->lock */
+
+ for (i = 0; i < params_count; i++, param++) {
+ err = devlink_param_register(devlink, param);
+ if (err)
+ goto rollback;
+ }
+ return 0;
+
+rollback:
+ if (!i)
+ return err; /* the first param failed: nothing to undo */
+
+ for (param--; i > 0; i--, param--) /* undo in reverse order, skipping the one that failed */
+ devlink_param_unregister(devlink, param);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devl_params_register);
+
+int devlink_params_register(struct devlink *devlink,
+ const struct devlink_param *params,
+ size_t params_count)
+{ /* Same as devl_params_register(), but takes and releases the devlink lock itself. */
+ int err;
+
+ devl_lock(devlink);
+ err = devl_params_register(devlink, params, params_count);
+ devl_unlock(devlink);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_params_register);
+
+/**
+ * devl_params_unregister - unregister configuration parameters
+ * @devlink: devlink
+ * @params: configuration parameters to unregister
+ * @params_count: number of parameters provided
+ */
+void devl_params_unregister(struct devlink *devlink,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ const struct devlink_param *param = params;
+ int i;
+
+ lockdep_assert_held(&devlink->lock); /* caller must hold devlink->lock */
+
+ for (i = 0; i < params_count; i++, param++) /* array order, one param at a time */
+ devlink_param_unregister(devlink, param);
+}
+EXPORT_SYMBOL_GPL(devl_params_unregister);
+
+void devlink_params_unregister(struct devlink *devlink,
+ const struct devlink_param *params,
+ size_t params_count)
+{ /* Same as devl_params_unregister(), but takes and releases the devlink lock itself. */
+ devl_lock(devlink);
+ devl_params_unregister(devlink, params, params_count);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_params_unregister);
+
+/**
+ * devl_param_driverinit_value_get - get configuration parameter
+ * value for driver initializing
+ *
+ * @devlink: devlink
+ * @param_id: parameter ID
+ * @val: pointer to store the value of parameter in driverinit
+ * configuration mode
+ *
+ * This function should be used by the driver to get driverinit
+ * configuration for initialization after reload command.
+ *
+ * Note that a lockless call of this function relies on the
+ * driver to maintain the following basic sane behavior:
+ * 1) Driver ensures a call to this function cannot race with
+ * registering/unregistering the parameter with the same parameter ID.
+ * 2) Driver ensures a call to this function cannot race with
+ * devl_param_driverinit_value_set() call with the same parameter ID.
+ * 3) Driver ensures a call to this function cannot race with
+ * reload operation.
+ * If the driver is not able to comply, it has to take the devlink->lock
+ * while calling this.
+ */
+int devl_param_driverinit_value_get(struct devlink *devlink, u32 param_id,
+ union devlink_param_value *val)
+{
+ struct devlink_param_item *param_item;
+
+ if (WARN_ON(!devlink_reload_supported(devlink->ops)))
+ return -EOPNOTSUPP;
+
+ param_item = devlink_param_find_by_id(&devlink->params, param_id);
+ if (!param_item)
+ return -EINVAL; /* nothing is stored under param_id */
+
+ if (!param_item->driverinit_value_valid)
+ return -EOPNOTSUPP; /* no driverinit value has been set or loaded yet */
+
+ if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param,
+ DEVLINK_PARAM_CMODE_DRIVERINIT)))
+ return -EOPNOTSUPP;
+
+ *val = param_item->driverinit_value; /* *val is written only on success */
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_param_driverinit_value_get);
+
+/**
+ * devl_param_driverinit_value_set - set value of configuration
+ * parameter for driverinit
+ * configuration mode
+ *
+ * @devlink: devlink
+ * @param_id: parameter ID
+ * @init_val: value of parameter to set for driverinit configuration mode
+ *
+ * This function should be used by the driver to set the driverinit
+ * configuration mode value, which is also recorded as its default.
+ */
+void devl_param_driverinit_value_set(struct devlink *devlink, u32 param_id,
+ union devlink_param_value init_val)
+{
+ struct devlink_param_item *param_item;
+
+ devl_assert_locked(devlink);
+
+ param_item = devlink_param_find_by_id(&devlink->params, param_id);
+ if (WARN_ON(!param_item))
+ return; /* nothing is stored under param_id */
+
+ if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param,
+ DEVLINK_PARAM_CMODE_DRIVERINIT)))
+ return;
+
+ param_item->driverinit_value = init_val;
+ param_item->driverinit_value_valid = true;
+ param_item->driverinit_default = init_val; /* same value doubles as the driverinit default */
+
+ devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW);
+}
+EXPORT_SYMBOL_GPL(devl_param_driverinit_value_set);
+
+void devlink_params_driverinit_load_new(struct devlink *devlink)
+{ /* Promote every pending driverinit_value_new to the active driverinit_value. */
+ struct devlink_param_item *param_item;
+ unsigned long param_id;
+
+ xa_for_each(&devlink->params, param_id, param_item) {
+ if (!devlink_param_cmode_is_supported(param_item->param,
+ DEVLINK_PARAM_CMODE_DRIVERINIT) ||
+ !param_item->driverinit_value_new_valid)
+ continue; /* not a driverinit param, or nothing pending */
+ param_item->driverinit_value = param_item->driverinit_value_new;
+ param_item->driverinit_value_valid = true;
+ param_item->driverinit_value_new_valid = false; /* the pending value has been consumed */
+ }
+}
+
+/**
+ * devl_param_value_changed - notify devlink on a parameter's value
+ * change. Should be called by the driver
+ * right after the change.
+ *
+ * @devlink: devlink
+ * @param_id: parameter ID
+ *
+ * This function should be used by the driver to notify devlink on value
+ * change, excluding driverinit configuration mode. For driverinit mode use
+ * devl_param_driverinit_value_set(), which sends the notification itself.
+ */
+void devl_param_value_changed(struct devlink *devlink, u32 param_id)
+{
+ struct devlink_param_item *param_item;
+
+ param_item = devlink_param_find_by_id(&devlink->params, param_id);
+ if (WARN_ON(!param_item)) /* unknown id: warn and stop here instead of */
+ return; /* handing a NULL item to devlink_param_notify() */
+ devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW);
+}
+EXPORT_SYMBOL_GPL(devl_param_value_changed);
diff --git a/net/devlink/port.c b/net/devlink/port.c
new file mode 100644
index 000000000000..93d8a25bb920
--- /dev/null
+++ b/net/devlink/port.c
@@ -0,0 +1,1604 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include "devl_internal.h"
+
+#define DEVLINK_PORT_FN_CAPS_VALID_MASK \
+ (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1)
+
+static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = {
+ [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY },
+ [DEVLINK_PORT_FN_ATTR_STATE] =
+ NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE,
+ DEVLINK_PORT_FN_STATE_ACTIVE),
+ [DEVLINK_PORT_FN_ATTR_CAPS] =
+ NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK),
+ [DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] = { .type = NLA_U32 },
+};
+
+#define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \
+ WARN_ON_ONCE(!(devlink_port)->registered)
+#define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \
+ WARN_ON_ONCE((devlink_port)->registered)
+
+struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
+ unsigned int port_index)
+{
+ return xa_load(&devlink->ports, port_index);
+}
+
+struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink,
+ struct nlattr **attrs)
+{
+ if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+ struct devlink_port *devlink_port;
+
+ devlink_port = devlink_port_get_by_index(devlink, port_index);
+ if (!devlink_port)
+ return ERR_PTR(-ENODEV);
+ return devlink_port;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ return devlink_port_get_from_attrs(devlink, info->attrs);
+}
+
+static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps,
+ u32 cap, bool is_enable)
+{
+ caps->selector |= cap;
+ if (is_enable)
+ caps->value |= cap;
+}
+
+static int devlink_port_fn_roce_fill(struct devlink_port *devlink_port,
+ struct nla_bitfield32 *caps,
+ struct netlink_ext_ack *extack)
+{
+ bool is_enable;
+ int err;
+
+ if (!devlink_port->ops->port_fn_roce_get)
+ return 0;
+
+ err = devlink_port->ops->port_fn_roce_get(devlink_port, &is_enable,
+ extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable);
+ return 0;
+}
+
+static int devlink_port_fn_migratable_fill(struct devlink_port *devlink_port,
+ struct nla_bitfield32 *caps,
+ struct netlink_ext_ack *extack)
+{
+ bool is_enable;
+ int err;
+
+ if (!devlink_port->ops->port_fn_migratable_get ||
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF)
+ return 0;
+
+ err = devlink_port->ops->port_fn_migratable_get(devlink_port,
+ &is_enable, extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable);
+ return 0;
+}
+
+static int devlink_port_fn_ipsec_crypto_fill(struct devlink_port *devlink_port,
+ struct nla_bitfield32 *caps,
+ struct netlink_ext_ack *extack)
+{
+ bool is_enable;
+ int err;
+
+ if (!devlink_port->ops->port_fn_ipsec_crypto_get ||
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF)
+ return 0;
+
+ err = devlink_port->ops->port_fn_ipsec_crypto_get(devlink_port, &is_enable, extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO, is_enable);
+ return 0;
+}
+
+static int devlink_port_fn_ipsec_packet_fill(struct devlink_port *devlink_port,
+ struct nla_bitfield32 *caps,
+ struct netlink_ext_ack *extack)
+{
+ bool is_enable;
+ int err;
+
+ if (!devlink_port->ops->port_fn_ipsec_packet_get ||
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF)
+ return 0;
+
+ err = devlink_port->ops->port_fn_ipsec_packet_get(devlink_port, &is_enable, extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+
+ devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_IPSEC_PACKET, is_enable);
+ return 0;
+}
+
+static int devlink_port_fn_caps_fill(struct devlink_port *devlink_port,
+ struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ bool *msg_updated)
+{
+ struct nla_bitfield32 caps = {};
+ int err;
+
+ err = devlink_port_fn_roce_fill(devlink_port, &caps, extack);
+ if (err)
+ return err;
+
+ err = devlink_port_fn_migratable_fill(devlink_port, &caps, extack);
+ if (err)
+ return err;
+
+ err = devlink_port_fn_ipsec_crypto_fill(devlink_port, &caps, extack);
+ if (err)
+ return err;
+
+ err = devlink_port_fn_ipsec_packet_fill(devlink_port, &caps, extack);
+ if (err)
+ return err;
+
+ if (!caps.selector)
+ return 0;
+ err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value,
+ caps.selector);
+ if (err)
+ return err;
+
+ *msg_updated = true;
+ return 0;
+}
+
+static int devlink_port_fn_max_io_eqs_fill(struct devlink_port *port,
+ struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ bool *msg_updated)
+{
+ u32 max_io_eqs;
+ int err;
+
+ if (!port->ops->port_fn_max_io_eqs_get)
+ return 0;
+
+ err = port->ops->port_fn_max_io_eqs_get(port, &max_io_eqs, extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+ err = nla_put_u32(msg, DEVLINK_PORT_FN_ATTR_MAX_IO_EQS, max_io_eqs);
+ if (err)
+ return err;
+ *msg_updated = true;
+ return 0;
+}
+
+int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port)
+{
+ if (devlink_nl_put_handle(msg, devlink_port->devlink))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ return -EMSGSIZE;
+ return 0;
+}
+
+size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port)
+{
+ struct devlink *devlink = devlink_port->devlink;
+
+ return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */
+ + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */
+ + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */
+}
+
+static int devlink_nl_port_attrs_put(struct sk_buff *msg,
+ struct devlink_port *devlink_port)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ if (!devlink_port->attrs_set)
+ return 0;
+ if (attrs->lanes) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_LANES, attrs->lanes))
+ return -EMSGSIZE;
+ }
+ if (nla_put_u8(msg, DEVLINK_ATTR_PORT_SPLITTABLE, attrs->splittable))
+ return -EMSGSIZE;
+ if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
+ return -EMSGSIZE;
+ switch (devlink_port->attrs.flavour) {
+ case DEVLINK_PORT_FLAVOUR_PCI_PF:
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
+ attrs->pci_pf.controller) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_pf.pf))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_pf.external))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PORT_FLAVOUR_PCI_VF:
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
+ attrs->pci_vf.controller) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER, attrs->pci_vf.pf) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER, attrs->pci_vf.vf))
+ return -EMSGSIZE;
+ if (nla_put_u8(msg, DEVLINK_ATTR_PORT_EXTERNAL, attrs->pci_vf.external))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PORT_FLAVOUR_PCI_SF:
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_CONTROLLER_NUMBER,
+ attrs->pci_sf.controller) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
+ attrs->pci_sf.pf) ||
+ nla_put_u32(msg, DEVLINK_ATTR_PORT_PCI_SF_NUMBER,
+ attrs->pci_sf.sf))
+ return -EMSGSIZE;
+ break;
+ case DEVLINK_PORT_FLAVOUR_PHYSICAL:
+ case DEVLINK_PORT_FLAVOUR_CPU:
+ case DEVLINK_PORT_FLAVOUR_DSA:
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER,
+ attrs->phys.port_number))
+ return -EMSGSIZE;
+ if (!attrs->split)
+ return 0;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
+ attrs->phys.port_number))
+ return -EMSGSIZE;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
+ attrs->phys.split_subport_number))
+ return -EMSGSIZE;
+ break;
+ default:
+ break;
+ }
+ return 0;
+}
+
+static int devlink_port_fn_hw_addr_fill(struct devlink_port *port,
+ struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ bool *msg_updated)
+{
+ u8 hw_addr[MAX_ADDR_LEN];
+ int hw_addr_len;
+ int err;
+
+ if (!port->ops->port_fn_hw_addr_get)
+ return 0;
+
+ err = port->ops->port_fn_hw_addr_get(port, hw_addr, &hw_addr_len,
+ extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+ err = nla_put(msg, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, hw_addr_len, hw_addr);
+ if (err)
+ return err;
+ *msg_updated = true;
+ return 0;
+}
+
+static bool
+devlink_port_fn_state_valid(enum devlink_port_fn_state state)
+{
+ return state == DEVLINK_PORT_FN_STATE_INACTIVE ||
+ state == DEVLINK_PORT_FN_STATE_ACTIVE;
+}
+
+static bool
+devlink_port_fn_opstate_valid(enum devlink_port_fn_opstate opstate)
+{
+ return opstate == DEVLINK_PORT_FN_OPSTATE_DETACHED ||
+ opstate == DEVLINK_PORT_FN_OPSTATE_ATTACHED;
+}
+
+static int devlink_port_fn_state_fill(struct devlink_port *port,
+ struct sk_buff *msg,
+ struct netlink_ext_ack *extack,
+ bool *msg_updated)
+{
+ enum devlink_port_fn_opstate opstate;
+ enum devlink_port_fn_state state;
+ int err;
+
+ if (!port->ops->port_fn_state_get)
+ return 0;
+
+ err = port->ops->port_fn_state_get(port, &state, &opstate, extack);
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ return 0;
+ return err;
+ }
+ if (!devlink_port_fn_state_valid(state)) {
+ WARN_ON_ONCE(1);
+ NL_SET_ERR_MSG(extack, "Invalid state read from driver");
+ return -EINVAL;
+ }
+ if (!devlink_port_fn_opstate_valid(opstate)) {
+ WARN_ON_ONCE(1);
+ NL_SET_ERR_MSG(extack, "Invalid operational state read from driver");
+ return -EINVAL;
+ }
+ if (nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_STATE, state) ||
+ nla_put_u8(msg, DEVLINK_PORT_FN_ATTR_OPSTATE, opstate))
+ return -EMSGSIZE;
+ *msg_updated = true;
+ return 0;
+}
+
+static int
+devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable,
+ struct netlink_ext_ack *extack)
+{
+ return devlink_port->ops->port_fn_migratable_set(devlink_port, enable,
+ extack);
+}
+
+static int
+devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable,
+ struct netlink_ext_ack *extack)
+{
+ return devlink_port->ops->port_fn_roce_set(devlink_port, enable,
+ extack);
+}
+
+static int
+devlink_port_fn_ipsec_crypto_set(struct devlink_port *devlink_port, bool enable,
+ struct netlink_ext_ack *extack)
+{
+ return devlink_port->ops->port_fn_ipsec_crypto_set(devlink_port, enable, extack);
+}
+
+static int
+devlink_port_fn_ipsec_packet_set(struct devlink_port *devlink_port, bool enable,
+ struct netlink_ext_ack *extack)
+{
+ return devlink_port->ops->port_fn_ipsec_packet_set(devlink_port, enable, extack);
+}
+
+static int devlink_port_fn_caps_set(struct devlink_port *devlink_port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nla_bitfield32 caps;
+ u32 caps_value;
+ int err;
+
+ caps = nla_get_bitfield32(attr);
+ caps_value = caps.value & caps.selector;
+ if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) {
+ err = devlink_port_fn_roce_set(devlink_port,
+ caps_value & DEVLINK_PORT_FN_CAP_ROCE,
+ extack);
+ if (err)
+ return err;
+ }
+ if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) {
+ err = devlink_port_fn_mig_set(devlink_port, caps_value &
+ DEVLINK_PORT_FN_CAP_MIGRATABLE,
+ extack);
+ if (err)
+ return err;
+ }
+ if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) {
+ err = devlink_port_fn_ipsec_crypto_set(devlink_port, caps_value &
+ DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO,
+ extack);
+ if (err)
+ return err;
+ }
+ if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) {
+ err = devlink_port_fn_ipsec_packet_set(devlink_port, caps_value &
+ DEVLINK_PORT_FN_CAP_IPSEC_PACKET,
+ extack);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int
+devlink_port_fn_max_io_eqs_set(struct devlink_port *devlink_port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ u32 max_io_eqs;
+
+ max_io_eqs = nla_get_u32(attr);
+ return devlink_port->ops->port_fn_max_io_eqs_set(devlink_port,
+ max_io_eqs, extack);
+}
+
+static int
+devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *function_attr;
+ bool msg_updated = false;
+ int err;
+
+ function_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PORT_FUNCTION);
+ if (!function_attr)
+ return -EMSGSIZE;
+
+ err = devlink_port_fn_hw_addr_fill(port, msg, extack, &msg_updated);
+ if (err)
+ goto out;
+ err = devlink_port_fn_caps_fill(port, msg, extack, &msg_updated);
+ if (err)
+ goto out;
+ err = devlink_port_fn_state_fill(port, msg, extack, &msg_updated);
+ if (err)
+ goto out;
+ err = devlink_port_fn_max_io_eqs_fill(port, msg, extack, &msg_updated);
+ if (err)
+ goto out;
+ err = devlink_rel_devlink_handle_put(msg, port->devlink,
+ port->rel_index,
+ DEVLINK_PORT_FN_ATTR_DEVLINK,
+ &msg_updated);
+
+out:
+ if (err || !msg_updated)
+ nla_nest_cancel(msg, function_attr);
+ else
+ nla_nest_end(msg, function_attr);
+ return err;
+}
+
+static int devlink_nl_port_fill(struct sk_buff *msg,
+ struct devlink_port *devlink_port,
+ enum devlink_command cmd, u32 portid, u32 seq,
+ int flags, struct netlink_ext_ack *extack)
+{
+ struct devlink *devlink = devlink_port->devlink;
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ goto nla_put_failure;
+
+ spin_lock_bh(&devlink_port->type_lock);
+ if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
+ goto nla_put_failure_type_locked;
+ if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET &&
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE,
+ devlink_port->desired_type))
+ goto nla_put_failure_type_locked;
+ if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
+ if (devlink_port->type_eth.netdev &&
+ (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
+ devlink_port->type_eth.ifindex) ||
+ nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
+ devlink_port->type_eth.ifname)))
+ goto nla_put_failure_type_locked;
+ }
+ if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {
+ struct ib_device *ibdev = devlink_port->type_ib.ibdev;
+
+ if (ibdev &&
+ nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME,
+ ibdev->name))
+ goto nla_put_failure_type_locked;
+ }
+ spin_unlock_bh(&devlink_port->type_lock);
+ if (devlink_nl_port_attrs_put(msg, devlink_port))
+ goto nla_put_failure;
+ if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack))
+ goto nla_put_failure;
+ if (devlink_port->linecard &&
+ nla_put_u32(msg, DEVLINK_ATTR_LINECARD_INDEX,
+ devlink_linecard_index(devlink_port->linecard)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure_type_locked:
+ spin_unlock_bh(&devlink_port->type_lock);
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_port_notify(struct devlink_port *devlink_port,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = devlink_port->devlink;
+ struct devlink_obj_desc desc;
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL);
+
+ if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink))
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ devlink_nl_obj_desc_init(&desc, devlink);
+ devlink_nl_obj_desc_port_set(&desc, devlink_port);
+ devlink_nl_notify_send_desc(devlink, msg, &desc);
+}
+
+static void devlink_ports_notify(struct devlink *devlink,
+ enum devlink_command cmd)
+{
+ struct devlink_port *devlink_port;
+ unsigned long port_index;
+
+ xa_for_each(&devlink->ports, port_index, devlink_port)
+ devlink_port_notify(devlink_port, cmd);
+}
+
+void devlink_ports_notify_register(struct devlink *devlink)
+{
+ devlink_ports_notify(devlink, DEVLINK_CMD_PORT_NEW);
+}
+
+void devlink_ports_notify_unregister(struct devlink *devlink)
+{
+ devlink_ports_notify(devlink, DEVLINK_CMD_PORT_DEL);
+}
+
+int devlink_nl_port_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW,
+ info->snd_portid, info->snd_seq, 0,
+ info->extack);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int
+devlink_nl_port_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
+ struct netlink_callback *cb, int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_port *devlink_port;
+ unsigned long port_index;
+ int err = 0;
+
+ xa_for_each_start(&devlink->ports, port_index, devlink_port, state->idx) {
+ err = devlink_nl_port_fill(msg, devlink_port,
+ DEVLINK_CMD_PORT_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags,
+ cb->extack);
+ if (err) {
+ state->idx = port_index;
+ break;
+ }
+ }
+
+ return err;
+}
+
+int devlink_nl_port_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_port_get_dump_one);
+}
+
+static int devlink_port_type_set(struct devlink_port *devlink_port,
+ enum devlink_port_type port_type)
+
+{
+ int err;
+
+ if (!devlink_port->ops->port_type_set)
+ return -EOPNOTSUPP;
+
+ if (port_type == devlink_port->type)
+ return 0;
+
+ err = devlink_port->ops->port_type_set(devlink_port, port_type);
+ if (err)
+ return err;
+
+ devlink_port->desired_type = port_type;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+ return 0;
+}
+
+static int devlink_port_function_hw_addr_set(struct devlink_port *port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const u8 *hw_addr;
+ int hw_addr_len;
+
+ hw_addr = nla_data(attr);
+ hw_addr_len = nla_len(attr);
+ if (hw_addr_len > MAX_ADDR_LEN) {
+ NL_SET_ERR_MSG(extack, "Port function hardware address too long");
+ return -EINVAL;
+ }
+ if (port->type == DEVLINK_PORT_TYPE_ETH) {
+ if (hw_addr_len != ETH_ALEN) {
+ NL_SET_ERR_MSG(extack, "Address must be 6 bytes for Ethernet device");
+ return -EINVAL;
+ }
+ if (!is_unicast_ether_addr(hw_addr)) {
+ NL_SET_ERR_MSG(extack, "Non-unicast hardware address unsupported");
+ return -EINVAL;
+ }
+ }
+
+ return port->ops->port_fn_hw_addr_set(port, hw_addr, hw_addr_len,
+ extack);
+}
+
+static int devlink_port_fn_state_set(struct devlink_port *port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ enum devlink_port_fn_state state;
+
+ state = nla_get_u8(attr);
+ return port->ops->port_fn_state_set(port, state, extack);
+}
+
+static int devlink_port_function_validate(struct devlink_port *devlink_port,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ const struct devlink_port_ops *ops = devlink_port->ops;
+ struct nlattr *attr;
+
+ if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] &&
+ !ops->port_fn_hw_addr_set) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR],
+ "Port doesn't support function attributes");
+ return -EOPNOTSUPP;
+ }
+ if (tb[DEVLINK_PORT_FN_ATTR_STATE] && !ops->port_fn_state_set) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_STATE],
+ "Function does not support state setting");
+ return -EOPNOTSUPP;
+ }
+ attr = tb[DEVLINK_PORT_FN_ATTR_CAPS];
+ if (attr) {
+ struct nla_bitfield32 caps;
+
+ caps = nla_get_bitfield32(attr);
+ if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE &&
+ !ops->port_fn_roce_set) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "Port doesn't support RoCE function attribute");
+ return -EOPNOTSUPP;
+ }
+ if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) {
+ if (!ops->port_fn_migratable_set) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "Port doesn't support migratable function attribute");
+ return -EOPNOTSUPP;
+ }
+ if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "migratable function attribute supported for VFs only");
+ return -EOPNOTSUPP;
+ }
+ }
+ if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_CRYPTO) {
+ if (!ops->port_fn_ipsec_crypto_set) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "Port doesn't support ipsec_crypto function attribute");
+ return -EOPNOTSUPP;
+ }
+ if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "ipsec_crypto function attribute supported for VFs only");
+ return -EOPNOTSUPP;
+ }
+ }
+ if (caps.selector & DEVLINK_PORT_FN_CAP_IPSEC_PACKET) {
+ if (!ops->port_fn_ipsec_packet_set) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "Port doesn't support ipsec_packet function attribute");
+ return -EOPNOTSUPP;
+ }
+ if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) {
+ NL_SET_ERR_MSG_ATTR(extack, attr,
+ "ipsec_packet function attribute supported for VFs only");
+ return -EOPNOTSUPP;
+ }
+ }
+ }
+ if (tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS] &&
+ !ops->port_fn_max_io_eqs_set) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS],
+ "Function does not support max_io_eqs setting");
+ return -EOPNOTSUPP;
+ }
+ return 0;
+}
+
+static int devlink_port_function_set(struct devlink_port *port,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, DEVLINK_PORT_FUNCTION_ATTR_MAX, attr,
+ devlink_function_nl_policy, extack);
+ if (err < 0) {
+ NL_SET_ERR_MSG(extack, "Fail to parse port function attributes");
+ return err;
+ }
+
+ err = devlink_port_function_validate(port, tb, extack);
+ if (err)
+ return err;
+
+ attr = tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR];
+ if (attr) {
+ err = devlink_port_function_hw_addr_set(port, attr, extack);
+ if (err)
+ return err;
+ }
+
+ attr = tb[DEVLINK_PORT_FN_ATTR_CAPS];
+ if (attr) {
+ err = devlink_port_fn_caps_set(port, attr, extack);
+ if (err)
+ return err;
+ }
+
+ attr = tb[DEVLINK_PORT_FN_ATTR_MAX_IO_EQS];
+ if (attr) {
+ err = devlink_port_fn_max_io_eqs_set(port, attr, extack);
+ if (err)
+ return err;
+ }
+
+ /* Keep this as the last function attribute set, so that when
+ * multiple port function attributes are set along with state,
+ * those can be applied first before activating the state.
+ */
+ attr = tb[DEVLINK_PORT_FN_ATTR_STATE];
+ if (attr)
+ err = devlink_port_fn_state_set(port, attr, extack);
+
+ if (!err)
+ devlink_port_notify(port, DEVLINK_CMD_PORT_NEW);
+ return err;
+}
+
+int devlink_nl_port_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ int err;
+
+ if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) {
+ enum devlink_port_type port_type;
+
+ port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]);
+ err = devlink_port_type_set(devlink_port, port_type);
+ if (err)
+ return err;
+ }
+
+ if (info->attrs[DEVLINK_ATTR_PORT_FUNCTION]) {
+ struct nlattr *attr = info->attrs[DEVLINK_ATTR_PORT_FUNCTION];
+ struct netlink_ext_ack *extack = info->extack;
+
+ err = devlink_port_function_set(devlink_port, attr, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+int devlink_nl_port_split_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct devlink *devlink = info->user_ptr[0];
+ u32 count;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PORT_SPLIT_COUNT))
+ return -EINVAL;
+ if (!devlink_port->ops->port_split)
+ return -EOPNOTSUPP;
+
+ count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
+
+ if (!devlink_port->attrs.splittable) {
+ /* Split ports cannot be split. */
+ if (devlink_port->attrs.split)
+ NL_SET_ERR_MSG(info->extack, "Port cannot be split further");
+ else
+ NL_SET_ERR_MSG(info->extack, "Port cannot be split");
+ return -EINVAL;
+ }
+
+ if (count < 2 || !is_power_of_2(count) || count > devlink_port->attrs.lanes) {
+ NL_SET_ERR_MSG(info->extack, "Invalid split count");
+ return -EINVAL;
+ }
+
+ return devlink_port->ops->port_split(devlink, devlink_port, count,
+ info->extack);
+}
+
+int devlink_nl_port_unsplit_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct devlink *devlink = info->user_ptr[0];
+
+ if (!devlink_port->ops->port_unsplit)
+ return -EOPNOTSUPP;
+ return devlink_port->ops->port_unsplit(devlink, devlink_port, info->extack);
+}
+
+/* Create a port via ops->port_new() from the request attrs and reply with it. */
+int devlink_nl_port_new_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct netlink_ext_ack *extack = info->extack;
+	struct devlink_port_new_attrs new_attrs = {};
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_port *devlink_port;
+	struct sk_buff *msg;
+	int err;
+
+	if (!devlink->ops->port_new)
+		return -EOPNOTSUPP;
+
+	if (!info->attrs[DEVLINK_ATTR_PORT_FLAVOUR] ||
+	    !info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]) {
+		NL_SET_ERR_MSG(extack, "Port flavour or PCI PF are not specified");
+		return -EINVAL;
+	}
+	new_attrs.flavour = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_FLAVOUR]);
+	new_attrs.pfnum = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_PCI_PF_NUMBER]);
+
+	if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+		/* Port index of the new port being created by driver. */
+		new_attrs.port_index =
+			nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+		new_attrs.port_index_valid = true;
+	}
+	if (info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]) {
+		/* Emitted with nla_put_u32() by this file, so read all 32 bits. */
+		new_attrs.controller =
+			nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_CONTROLLER_NUMBER]);
+		new_attrs.controller_valid = true;
+	}
+	if (new_attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_SF &&
+	    info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]) {
+		new_attrs.sfnum = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_PCI_SF_NUMBER]);
+		new_attrs.sfnum_valid = true;
+	}
+
+	err = devlink->ops->port_new(devlink, &new_attrs, extack, &devlink_port);
+	if (err)
+		return err;
+
+	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!msg) {
+		err = -ENOMEM;
+		goto err_out_port_del;
+	}
+	err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW,
+				   info->snd_portid, info->snd_seq, 0, NULL);
+	if (WARN_ON_ONCE(err))
+		goto err_out_msg_free;
+	err = genlmsg_reply(msg, info);
+	if (err)
+		goto err_out_port_del;
+	return 0;
+
+err_out_msg_free:
+	nlmsg_free(msg);
+err_out_port_del:
+	devlink_port->ops->port_del(devlink, devlink_port, NULL);
+	return err;
+}
+
+int devlink_nl_port_del_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct netlink_ext_ack *extack = info->extack;
+ struct devlink *devlink = info->user_ptr[0];
+
+ if (!devlink_port->ops->port_del)
+ return -EOPNOTSUPP;
+
+ return devlink_port->ops->port_del(devlink, devlink_port, extack);
+}
+
+static void devlink_port_type_warn(struct work_struct *work)
+{
+	/* Delayed-work callback: recover the port that embeds type_warn_dw. */
+	struct devlink_port *port = container_of(to_delayed_work(work),
+						 struct devlink_port, type_warn_dw);
+	dev_warn(port->devlink->dev, "Type was not set for devlink port.\n");
+}
+
+static bool devlink_port_type_should_warn(struct devlink_port *devlink_port)
+{
+ /* Ignore CPU, DSA and unused flavours. */
+ return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA &&
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_UNUSED;
+}
+
+#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 3600)
+
+static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port)
+{
+ if (!devlink_port_type_should_warn(devlink_port))
+ return;
+ /* Schedule a work to WARN in case driver does not set port
+ * type within timeout.
+ */
+ schedule_delayed_work(&devlink_port->type_warn_dw,
+ DEVLINK_PORT_TYPE_WARN_TIMEOUT);
+}
+
+static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port)
+{
+ if (!devlink_port_type_should_warn(devlink_port))
+ return;
+ cancel_delayed_work_sync(&devlink_port->type_warn_dw);
+}
+
+/**
+ * devlink_port_init() - Init devlink port
+ *
+ * @devlink: devlink
+ * @devlink_port: devlink port
+ *
+ * Initialize essential stuff that is needed for functions
+ * that may be called before devlink port registration.
+ * Call to this function is optional and not needed
+ * in case the driver does not use such functions.
+ */
+void devlink_port_init(struct devlink *devlink,
+ struct devlink_port *devlink_port)
+{
+ if (devlink_port->initialized)
+ return;
+ devlink_port->devlink = devlink;
+ INIT_LIST_HEAD(&devlink_port->region_list);
+ devlink_port->initialized = true;
+}
+EXPORT_SYMBOL_GPL(devlink_port_init);
+
+/**
+ * devlink_port_fini() - Deinitialize devlink port
+ *
+ * @devlink_port: devlink port
+ *
+ * Deinitialize essential stuff that is in use for functions
+ * that may be called after devlink port unregistration.
+ * Call to this function is optional and not needed
+ * in case the driver does not use such functions.
+ */
+void devlink_port_fini(struct devlink_port *devlink_port)
+{
+ WARN_ON(!list_empty(&devlink_port->region_list));
+}
+EXPORT_SYMBOL_GPL(devlink_port_fini);
+
+static const struct devlink_port_ops devlink_port_dummy_ops = {};
+
+/**
+ * devl_port_register_with_ops() - Register devlink port
+ *
+ * @devlink: devlink
+ * @devlink_port: devlink port
+ * @port_index: driver-specific numerical identifier of the port
+ * @ops: port ops
+ *
+ * Register devlink port with provided port index. User can use
+ * any indexing, even hw-related one. devlink_port structure
+ * is convenient to be embedded inside user driver private structure.
+ * Note that the caller should take care of zeroing the devlink_port
+ * structure.
+ */
+int devl_port_register_with_ops(struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ unsigned int port_index,
+ const struct devlink_port_ops *ops)
+{
+ int err;
+
+ devl_assert_locked(devlink);
+
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
+ devlink_port_init(devlink, devlink_port);
+ devlink_port->registered = true;
+ devlink_port->index = port_index;
+ devlink_port->ops = ops ? ops : &devlink_port_dummy_ops;
+ spin_lock_init(&devlink_port->type_lock);
+ INIT_LIST_HEAD(&devlink_port->reporter_list);
+ err = xa_insert(&devlink->ports, port_index, devlink_port, GFP_KERNEL);
+ if (err) {
+ devlink_port->registered = false;
+ return err;
+ }
+
+ INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn);
+ devlink_port_type_warn_schedule(devlink_port);
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_port_register_with_ops);
+
+/**
+ * devlink_port_register_with_ops - Register devlink port
+ *
+ * @devlink: devlink
+ * @devlink_port: devlink port
+ * @port_index: driver-specific numerical identifier of the port
+ * @ops: port ops
+ *
+ * Register devlink port with provided port index. User can use
+ * any indexing, even hw-related one. devlink_port structure
+ * is convenient to be embedded inside user driver private structure.
+ * Note that the caller should take care of zeroing the devlink_port
+ * structure.
+ *
+ * Context: Takes and releases devlink->lock <mutex>.
+ */
+int devlink_port_register_with_ops(struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ unsigned int port_index,
+ const struct devlink_port_ops *ops)
+{
+ int err;
+
+ devl_lock(devlink);
+ err = devl_port_register_with_ops(devlink, devlink_port,
+ port_index, ops);
+ devl_unlock(devlink);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_port_register_with_ops);
+
+/**
+ * devl_port_unregister() - Unregister devlink port
+ *
+ * @devlink_port: devlink port
+ *
+ * Sends a DEVLINK_CMD_PORT_DEL notification, erases the port from the
+ * devlink instance's ports xarray and clears its registered flag.
+ *
+ * Context: Caller must hold devlink->lock (asserted through lockdep).
+ */
+void devl_port_unregister(struct devlink_port *devlink_port)
+{
+ lockdep_assert_held(&devlink_port->devlink->lock);
+ /* Warn if the port still has a type set at unregister time. */
+ WARN_ON(devlink_port->type != DEVLINK_PORT_TYPE_NOTSET);
+
+ devlink_port_type_warn_cancel(devlink_port);
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
+ xa_erase(&devlink_port->devlink->ports, devlink_port->index);
+ /* Warn if entries are still linked on the port's reporter list. */
+ WARN_ON(!list_empty(&devlink_port->reporter_list));
+ devlink_port->registered = false;
+}
+EXPORT_SYMBOL_GPL(devl_port_unregister);
+
+/**
+ * devlink_port_unregister - Unregister devlink port
+ *
+ * @devlink_port: devlink port
+ *
+ * Locked wrapper around devl_port_unregister().
+ *
+ * Context: Takes and releases devlink->lock <mutex>.
+ */
+void devlink_port_unregister(struct devlink_port *devlink_port)
+{
+ /* Cache the devlink pointer so lock and unlock use the same instance. */
+ struct devlink *devlink = devlink_port->devlink;
+
+ devl_lock(devlink);
+ devl_port_unregister(devlink_port);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_port_unregister);
+
+/* Sanity-check the ndos of a netdev that backs a devlink port.
+ *
+ * If driver registers devlink port, it should set devlink port
+ * attributes accordingly so the compat functions are called and the
+ * original ops are not used. Some drivers share one set of ndos between
+ * netdevs that have a devlink_port registered and those that do not, so
+ * ndo_get_phys_port_name and ndo_get_port_parent_id are each invoked
+ * here, when defined, and a warning is raised unless the call returns
+ * -EOPNOTSUPP.
+ */
+static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port,
+					    struct net_device *netdev)
+{
+	const struct net_device_ops *ndo = netdev->netdev_ops;
+	struct netdev_phys_item_id parent_id;
+	char port_name[IFNAMSIZ];
+	int ret;
+
+	if (ndo->ndo_get_phys_port_name) {
+		ret = ndo->ndo_get_phys_port_name(netdev, port_name,
+						  sizeof(port_name));
+		WARN_ON(ret != -EOPNOTSUPP);
+	}
+
+	if (ndo->ndo_get_port_parent_id) {
+		ret = ndo->ndo_get_port_parent_id(netdev, &parent_id);
+		WARN_ON(ret != -EOPNOTSUPP);
+	}
+}
+
+/* Set the port type and its type-specific device, then notify.
+ *
+ * @type_dev is stored as the netdev for DEVLINK_PORT_TYPE_ETH and as the
+ * ibdev for DEVLINK_PORT_TYPE_IB; nothing is stored for other types.
+ * devlink_port_type_warn_schedule() is called when the new type is
+ * NOTSET and devlink_port_type_warn_cancel() otherwise. The type fields
+ * are written under type_lock and a DEVLINK_CMD_PORT_NEW notification is
+ * sent once the lock has been released.
+ */
+static void __devlink_port_type_set(struct devlink_port *devlink_port,
+ enum devlink_port_type type,
+ void *type_dev)
+{
+ struct net_device *netdev = type_dev;
+
+ ASSERT_DEVLINK_PORT_REGISTERED(devlink_port);
+
+ if (type == DEVLINK_PORT_TYPE_NOTSET) {
+ devlink_port_type_warn_schedule(devlink_port);
+ } else {
+ devlink_port_type_warn_cancel(devlink_port);
+ /* The ndo sanity checks only apply when a netdev is supplied. */
+ if (type == DEVLINK_PORT_TYPE_ETH && netdev)
+ devlink_port_type_netdev_checks(devlink_port, netdev);
+ }
+
+ spin_lock_bh(&devlink_port->type_lock);
+ devlink_port->type = type;
+ switch (type) {
+ case DEVLINK_PORT_TYPE_ETH:
+ devlink_port->type_eth.netdev = netdev;
+ if (netdev) {
+ /* RTNL is asserted before the netdev's ifindex and
+ * name are copied into the port.
+ */
+ ASSERT_RTNL();
+ devlink_port->type_eth.ifindex = netdev->ifindex;
+ /* The unbounded strcpy() below relies on both name
+ * buffers having the same size.
+ */
+ BUILD_BUG_ON(sizeof(devlink_port->type_eth.ifname) !=
+ sizeof(netdev->name));
+ strcpy(devlink_port->type_eth.ifname, netdev->name);
+ }
+ break;
+ case DEVLINK_PORT_TYPE_IB:
+ devlink_port->type_ib.ibdev = type_dev;
+ break;
+ default:
+ break;
+ }
+ spin_unlock_bh(&devlink_port->type_lock);
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+
+/**
+ * devlink_port_type_eth_set - Set port type to Ethernet
+ *
+ * @devlink_port: devlink port
+ *
+ * Sets the type to Ethernet without any netdev reference and logs a
+ * warning about it. If driver is calling this, most likely it is doing
+ * something wrong.
+ */
+void devlink_port_type_eth_set(struct devlink_port *devlink_port)
+{
+	struct devlink *devlink = devlink_port->devlink;
+
+	dev_warn(devlink->dev,
+		 "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n",
+		 devlink_port->index);
+
+	__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
+
+/**
+ * devlink_port_type_ib_set - Set port type to InfiniBand
+ *
+ * @devlink_port: devlink port
+ * @ibdev: related IB device
+ *
+ * Thin wrapper: passes @ibdev as the type-specific device to
+ * __devlink_port_type_set() with type DEVLINK_PORT_TYPE_IB.
+ */
+void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+ struct ib_device *ibdev)
+{
+ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
+
+/**
+ * devlink_port_type_clear - Clear port type
+ *
+ * @devlink_port: devlink port
+ *
+ * Resets the port type to DEVLINK_PORT_TYPE_NOTSET. A warning is logged
+ * when the type being cleared is Ethernet: if driver is calling this for
+ * clearing Ethernet type, most likely it is doing something wrong.
+ */
+void devlink_port_type_clear(struct devlink_port *devlink_port)
+{
+	struct devlink *devlink = devlink_port->devlink;
+
+	if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
+		dev_warn(devlink->dev,
+			 "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n",
+			 devlink_port->index);
+	}
+
+	__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_clear);
+
+/* Netdevice notifier handler that updates the devlink port type and
+ * netdev pointer in response to netdev events.
+ *
+ * Netdevs without an attached devlink_port are ignored. For
+ * NETDEV_REGISTER, NETDEV_CHANGENAME and NETDEV_UNREGISTER the event is
+ * also ignored when the netdev's net namespace differs from the devlink
+ * instance's. Always returns NOTIFY_OK.
+ */
+int devlink_port_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+ struct devlink_port *devlink_port = netdev->devlink_port;
+ struct devlink *devlink;
+
+ if (!devlink_port)
+ return NOTIFY_OK;
+ devlink = devlink_port->devlink;
+
+ switch (event) {
+ case NETDEV_POST_INIT:
+ /* Set the type but not netdev pointer. It is going to be set
+ * later on by NETDEV_REGISTER event. Happens once during
+ * netdevice register
+ */
+ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH,
+ NULL);
+ break;
+ case NETDEV_REGISTER:
+ case NETDEV_CHANGENAME:
+ if (devlink_net(devlink) != dev_net(netdev))
+ return NOTIFY_OK;
+ /* Set the netdev on top of previously set type. Note this
+ * event happens also during net namespace change so here
+ * we take into account netdev pointer appearing in this
+ * namespace.
+ */
+ __devlink_port_type_set(devlink_port, devlink_port->type,
+ netdev);
+ break;
+ case NETDEV_UNREGISTER:
+ if (devlink_net(devlink) != dev_net(netdev))
+ return NOTIFY_OK;
+ /* Clear netdev pointer, but not the type. This event happens
+ * also during net namespace change so we need to clear
+ * pointer to netdev that is going to another net namespace.
+ */
+ __devlink_port_type_set(devlink_port, devlink_port->type,
+ NULL);
+ break;
+ case NETDEV_PRE_UNINIT:
+ /* Clear the type and the netdev pointer. Happens once during
+ * netdevice unregister.
+ */
+ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET,
+ NULL);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+/* Mark the port attributes as set and record the flavour.
+ *
+ * switch_port reflects whether the attributes carry a switch id (a
+ * non-zero id_len). A switch id longer than MAX_PHYS_ITEM_ID_LEN raises
+ * a warning and its length is clamped to that maximum.
+ */
+static void __devlink_port_attrs_set(struct devlink_port *devlink_port,
+				     enum devlink_port_flavour flavour)
+{
+	struct devlink_port_attrs *port_attrs = &devlink_port->attrs;
+
+	devlink_port->attrs_set = true;
+	port_attrs->flavour = flavour;
+
+	if (!port_attrs->switch_id.id_len) {
+		devlink_port->switch_port = false;
+		return;
+	}
+
+	devlink_port->switch_port = true;
+	if (WARN_ON(port_attrs->switch_id.id_len > MAX_PHYS_ITEM_ID_LEN))
+		port_attrs->switch_id.id_len = MAX_PHYS_ITEM_ID_LEN;
+}
+
+/**
+ * devlink_port_attrs_set - Set port attributes
+ *
+ * @devlink_port: devlink port
+ * @attrs: devlink port attrs
+ *
+ * Copies @attrs into the port and finalizes them through
+ * __devlink_port_attrs_set(). Warns when @attrs has both splittable and
+ * split set. ASSERT_DEVLINK_PORT_NOT_REGISTERED() is applied first, so
+ * this is expected to be called before the port is registered.
+ */
+void devlink_port_attrs_set(struct devlink_port *devlink_port,
+ const struct devlink_port_attrs *attrs)
+{
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+ WARN_ON(attrs->splittable && attrs->split);
+
+ /* Struct copy: the port keeps its own copy of the attributes. */
+ devlink_port->attrs = *attrs;
+ __devlink_port_attrs_set(devlink_port, attrs->flavour);
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
+
+/**
+ * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes
+ *
+ * @devlink_port: devlink port
+ * @controller: associated controller number for the devlink port instance
+ * @pf: associated PCI function number for the devlink port instance
+ * @external: indicates if the port is for an external controller
+ *
+ * Sets the flavour to DEVLINK_PORT_FLAVOUR_PCI_PF and stores the PF
+ * identification in the port attributes.
+ * ASSERT_DEVLINK_PORT_NOT_REGISTERED() is applied first, so this is
+ * expected to be called before the port is registered.
+ */
+void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller,
+ u16 pf, bool external)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
+ __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF);
+ attrs->pci_pf.controller = controller;
+ attrs->pci_pf.pf = pf;
+ attrs->pci_pf.external = external;
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set);
+
+/**
+ * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes
+ *
+ * @devlink_port: devlink port
+ * @controller: associated controller number for the devlink port instance
+ * @pf: associated PCI function number for the devlink port instance
+ * @vf: associated PCI VF number of a PF for the devlink port instance;
+ * VF number starts from 0 for the first PCI virtual function
+ * @external: indicates if the port is for an external controller
+ *
+ * Sets the flavour to DEVLINK_PORT_FLAVOUR_PCI_VF and stores the VF
+ * identification in the port attributes.
+ * ASSERT_DEVLINK_PORT_NOT_REGISTERED() is applied first, so this is
+ * expected to be called before the port is registered.
+ */
+void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller,
+ u16 pf, u16 vf, bool external)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
+ __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF);
+ attrs->pci_vf.controller = controller;
+ attrs->pci_vf.pf = pf;
+ attrs->pci_vf.vf = vf;
+ attrs->pci_vf.external = external;
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set);
+
+/**
+ * devlink_port_attrs_pci_sf_set - Set PCI SF port attributes
+ *
+ * @devlink_port: devlink port
+ * @controller: associated controller number for the devlink port instance
+ * @pf: associated PCI function number for the devlink port instance
+ * @sf: associated SF number of a PF for the devlink port instance
+ * @external: indicates if the port is for an external controller
+ *
+ * Sets the flavour to DEVLINK_PORT_FLAVOUR_PCI_SF and stores the SF
+ * identification in the port attributes.
+ * ASSERT_DEVLINK_PORT_NOT_REGISTERED() is applied first, so this is
+ * expected to be called before the port is registered.
+ */
+void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller,
+ u16 pf, u32 sf, bool external)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
+ __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF);
+ attrs->pci_sf.controller = controller;
+ attrs->pci_sf.pf = pf;
+ attrs->pci_sf.sf = sf;
+ attrs->pci_sf.external = external;
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set);
+
+/* Relation notify callback: look the port up by index and, when it is
+ * found, send a DEVLINK_CMD_PORT_NEW notification for it.
+ */
+static void devlink_port_rel_notify_cb(struct devlink *devlink, u32 port_index)
+{
+	struct devlink_port *port = devlink_port_get_by_index(devlink,
+							      port_index);
+
+	if (port)
+		devlink_port_notify(port, DEVLINK_CMD_PORT_NEW);
+}
+
+/* Relation cleanup callback: reset the port's rel_index to 0, but only
+ * when the port exists and still refers to @rel_index.
+ */
+static void devlink_port_rel_cleanup_cb(struct devlink *devlink, u32 port_index,
+					u32 rel_index)
+{
+	struct devlink_port *port;
+
+	port = devlink_port_get_by_index(devlink, port_index);
+	if (!port)
+		return;
+	if (port->rel_index != rel_index)
+		return;
+
+	port->rel_index = 0;
+}
+
+/**
+ * devl_port_fn_devlink_set - Attach peer devlink
+ * instance to port function.
+ * @devlink_port: devlink port
+ * @fn_devlink: devlink instance to attach
+ *
+ * Only accepted for ports whose flavour is DEVLINK_PORT_FLAVOUR_PCI_SF
+ * and that are not marked external; any other port triggers a warning.
+ *
+ * Return: -EINVAL for an unsupported port, otherwise the result of
+ * devlink_rel_nested_in_add().
+ */
+int devl_port_fn_devlink_set(struct devlink_port *devlink_port,
+ struct devlink *fn_devlink)
+{
+ ASSERT_DEVLINK_PORT_REGISTERED(devlink_port);
+
+ if (WARN_ON(devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_SF ||
+ devlink_port->attrs.pci_sf.external))
+ return -EINVAL;
+
+ /* The relation index is written into devlink_port->rel_index; the
+ * two callbacks passed here are the notify and cleanup handlers
+ * defined for ports in this file.
+ */
+ return devlink_rel_nested_in_add(&devlink_port->rel_index,
+ devlink_port->devlink->index,
+ devlink_port->index,
+ devlink_port_rel_notify_cb,
+ devlink_port_rel_cleanup_cb,
+ fn_devlink);
+}
+EXPORT_SYMBOL_GPL(devl_port_fn_devlink_set);
+
+/**
+ * devlink_port_linecard_set - Link port with a linecard
+ *
+ * @devlink_port: devlink port
+ * @linecard: devlink linecard
+ *
+ * Stores @linecard in the port. ASSERT_DEVLINK_PORT_NOT_REGISTERED() is
+ * applied first, so this is expected to be called before the port is
+ * registered.
+ */
+void devlink_port_linecard_set(struct devlink_port *devlink_port,
+ struct devlink_linecard *linecard)
+{
+ ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+
+ devlink_port->linecard = linecard;
+}
+EXPORT_SYMBOL_GPL(devlink_port_linecard_set);
+
+/* Format the physical port name of @devlink_port into @name.
+ *
+ * The name depends on the port flavour:
+ *   PHYSICAL: ["l<linecard index>"]"p<port number>"["s<subport>"]
+ *   PCI_PF:   ["c<controller>"]"pf<pf>"
+ *   PCI_VF:   ["c<controller>"]"pf<pf>vf<vf>"
+ *   PCI_SF:   ["c<controller>"]"pf<pf>sf<sf>"
+ * The "c<controller>" prefix is emitted only for external ports, the
+ * linecard prefix only when a linecard is linked and the subport suffix
+ * only for split ports.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when attributes are not set, when
+ * no_phys_port_name is set or for the VIRTUAL flavour, and -EINVAL for
+ * CPU/DSA/UNUSED flavours or when the name does not fit in @len bytes.
+ */
+static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
+ char *name, size_t len)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+ int n = 0;
+
+ if (!devlink_port->attrs_set || devlink_port->attrs.no_phys_port_name)
+ return -EOPNOTSUPP;
+
+ switch (attrs->flavour) {
+ case DEVLINK_PORT_FLAVOUR_PHYSICAL:
+ /* n accumulates the would-be length; each following piece is
+ * appended only while the previous ones still fit.
+ */
+ if (devlink_port->linecard)
+ n = snprintf(name, len, "l%u",
+ devlink_linecard_index(devlink_port->linecard));
+ if (n < len)
+ n += snprintf(name + n, len - n, "p%u",
+ attrs->phys.port_number);
+ if (n < len && attrs->split)
+ n += snprintf(name + n, len - n, "s%u",
+ attrs->phys.split_subport_number);
+ break;
+ case DEVLINK_PORT_FLAVOUR_CPU:
+ case DEVLINK_PORT_FLAVOUR_DSA:
+ case DEVLINK_PORT_FLAVOUR_UNUSED:
+ /* As CPU and DSA ports do not have a netdevice associated,
+ * this case should not ever happen.
+ */
+ WARN_ON(1);
+ return -EINVAL;
+ case DEVLINK_PORT_FLAVOUR_PCI_PF:
+ if (attrs->pci_pf.external) {
+ n = snprintf(name, len, "c%u", attrs->pci_pf.controller);
+ if (n >= len)
+ return -EINVAL;
+ /* Advance past the prefix so the final check below
+ * compares against the remaining space.
+ */
+ len -= n;
+ name += n;
+ }
+ n = snprintf(name, len, "pf%u", attrs->pci_pf.pf);
+ break;
+ case DEVLINK_PORT_FLAVOUR_PCI_VF:
+ if (attrs->pci_vf.external) {
+ n = snprintf(name, len, "c%u", attrs->pci_vf.controller);
+ if (n >= len)
+ return -EINVAL;
+ len -= n;
+ name += n;
+ }
+ n = snprintf(name, len, "pf%uvf%u",
+ attrs->pci_vf.pf, attrs->pci_vf.vf);
+ break;
+ case DEVLINK_PORT_FLAVOUR_PCI_SF:
+ if (attrs->pci_sf.external) {
+ n = snprintf(name, len, "c%u", attrs->pci_sf.controller);
+ if (n >= len)
+ return -EINVAL;
+ len -= n;
+ name += n;
+ }
+ n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf,
+ attrs->pci_sf.sf);
+ break;
+ case DEVLINK_PORT_FLAVOUR_VIRTUAL:
+ return -EOPNOTSUPP;
+ }
+
+ /* snprintf() reported a length that does not fit: truncated name. */
+ if (n >= len)
+ return -EINVAL;
+
+ return 0;
+}
+
+/* Compat helper: produce the physical port name for @dev from its
+ * devlink port. Returns -EOPNOTSUPP when @dev has no devlink port,
+ * otherwise the result of __devlink_port_phys_port_name_get().
+ */
+int devlink_compat_phys_port_name_get(struct net_device *dev,
+				      char *name, size_t len)
+{
+	struct devlink_port *port;
+
+	/* RTNL mutex is held here which ensures that devlink_port
+	 * instance cannot disappear in the middle. No need to take
+	 * any devlink lock as only permanent values are accessed.
+	 */
+	ASSERT_RTNL();
+
+	port = dev->devlink_port;
+	if (port)
+		return __devlink_port_phys_port_name_get(port, name, len);
+
+	return -EOPNOTSUPP;
+}
+
+/* Compat helper: copy the switch id of @dev's devlink port into @ppid.
+ * Returns -EOPNOTSUPP when @dev has no devlink port or the port is not
+ * a switch port, 0 otherwise.
+ */
+int devlink_compat_switch_id_get(struct net_device *dev,
+				 struct netdev_phys_item_id *ppid)
+{
+	struct devlink_port *port;
+
+	/* Caller must hold RTNL mutex or reference to dev, which ensures that
+	 * devlink_port instance cannot disappear in the middle. No need to take
+	 * any devlink lock as only permanent values are accessed.
+	 */
+	port = dev->devlink_port;
+	if (!port)
+		return -EOPNOTSUPP;
+	if (!port->switch_port)
+		return -EOPNOTSUPP;
+
+	memcpy(ppid, &port->attrs.switch_id, sizeof(*ppid));
+	return 0;
+}
diff --git a/net/devlink/rate.c b/net/devlink/rate.c
new file mode 100644
index 000000000000..d157a8419bca
--- /dev/null
+++ b/net/devlink/rate.c
@@ -0,0 +1,850 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include "devl_internal.h"
+
+/* True when @devlink_rate has type DEVLINK_RATE_TYPE_LEAF. */
+static inline bool
+devlink_rate_is_leaf(struct devlink_rate *devlink_rate)
+{
+ return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF;
+}
+
+/* True when @devlink_rate has type DEVLINK_RATE_TYPE_NODE. */
+static inline bool
+devlink_rate_is_node(struct devlink_rate *devlink_rate)
+{
+ return devlink_rate->type == DEVLINK_RATE_TYPE_NODE;
+}
+
+/* Resolve the leaf rate object of the port addressed by the request.
+ * Returns the port lookup error as an ERR_PTR when the port cannot be
+ * resolved, ERR_PTR(-ENODEV) when the port has no rate object, and the
+ * rate object otherwise.
+ */
+static struct devlink_rate *
+devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info)
+{
+	struct devlink_port *port;
+	struct devlink_rate *leaf;
+
+	port = devlink_port_get_from_attrs(devlink, info->attrs);
+	if (IS_ERR(port))
+		return ERR_CAST(port);
+
+	leaf = port->devlink_rate;
+	if (!leaf)
+		return ERR_PTR(-ENODEV);
+
+	return leaf;
+}
+
+/* Find the rate node called @node_name on @devlink's rate list.
+ * Only objects of node type are considered. Returns the node or
+ * ERR_PTR(-ENODEV) when no node has that name.
+ */
+static struct devlink_rate *
+devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name)
+{
+	struct devlink_rate *candidate;
+
+	list_for_each_entry(candidate, &devlink->rate_list, list) {
+		if (!devlink_rate_is_node(candidate))
+			continue;
+		if (strcmp(node_name, candidate->name) == 0)
+			return candidate;
+	}
+
+	return ERR_PTR(-ENODEV);
+}
+
+/* Look up a rate node by the DEVLINK_ATTR_RATE_NODE_NAME attribute.
+ * Returns ERR_PTR(-EINVAL) when the attribute is missing or the name is
+ * empty or consists only of decimal digits; otherwise the result of
+ * devlink_rate_node_get_by_name().
+ */
+static struct devlink_rate *
+devlink_rate_node_get_from_attrs(struct devlink *devlink, struct nlattr **attrs)
+{
+	struct nlattr *name_attr = attrs[DEVLINK_ATTR_RATE_NODE_NAME];
+	const char *name;
+	size_t name_len;
+
+	if (!name_attr)
+		return ERR_PTR(-EINVAL);
+
+	name = nla_data(name_attr);
+	name_len = strlen(name);
+
+	/* Name cannot be empty ... */
+	if (name_len == 0)
+		return ERR_PTR(-EINVAL);
+	/* ... or a decimal number */
+	if (strspn(name, "0123456789") == name_len)
+		return ERR_PTR(-EINVAL);
+
+	return devlink_rate_node_get_by_name(devlink, name);
+}
+
+/* Convenience wrapper: look up a rate node using the attributes of the
+ * genl request @info.
+ */
+static struct devlink_rate *
+devlink_rate_node_get_from_info(struct devlink *devlink, struct genl_info *info)
+{
+ return devlink_rate_node_get_from_attrs(devlink, info->attrs);
+}
+
+/* Resolve the rate object addressed by a request: a leaf when
+ * DEVLINK_ATTR_PORT_INDEX is present, otherwise a node when
+ * DEVLINK_ATTR_RATE_NODE_NAME is present. The port index takes
+ * precedence when both are given. Returns ERR_PTR(-EINVAL) when neither
+ * attribute is present.
+ */
+static struct devlink_rate *
+devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info)
+{
+	if (info->attrs[DEVLINK_ATTR_PORT_INDEX])
+		return devlink_rate_leaf_get_from_info(devlink, info);
+
+	if (info->attrs[DEVLINK_ATTR_RATE_NODE_NAME])
+		return devlink_rate_node_get_from_info(devlink, info);
+
+	return ERR_PTR(-EINVAL);
+}
+
+/* Emit one DEVLINK_ATTR_RATE_TC_BWS nest per traffic class, each holding
+ * the class index and the bandwidth taken from @tc_bw.
+ * Returns 0 on success or -EMSGSIZE when @msg runs out of room; a nest
+ * that was started but could not be filled is cancelled.
+ */
+static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw)
+{
+	struct nlattr *nest;
+	int tc;
+
+	for (tc = 0; tc < DEVLINK_RATE_TCS_MAX; tc++) {
+		nest = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BWS);
+		if (!nest)
+			return -EMSGSIZE;
+
+		if (nla_put_u8(msg, DEVLINK_RATE_TC_ATTR_INDEX, tc) ||
+		    nla_put_u32(msg, DEVLINK_RATE_TC_ATTR_BW, tc_bw[tc])) {
+			nla_nest_cancel(msg, nest);
+			return -EMSGSIZE;
+		}
+
+		nla_nest_end(msg, nest);
+	}
+
+	return 0;
+}
+
+/* Fill one rate object message into @msg.
+ *
+ * Emits the devlink handle, the rate type, the object identity (port
+ * index for a leaf, node name for a node), tx_share, tx_max,
+ * tx_priority, tx_weight, the parent node name when a parent is set and
+ * the per-traffic-class bandwidths.
+ *
+ * Returns 0 on success or -EMSGSIZE when the header or any attribute
+ * cannot be added; in the attribute case the partial message is
+ * cancelled. @extack is not referenced by this function.
+ */
+static int devlink_nl_rate_fill(struct sk_buff *msg,
+ struct devlink_rate *devlink_rate,
+ enum devlink_command cmd, u32 portid, u32 seq,
+ int flags, struct netlink_ext_ack *extack)
+{
+ struct devlink *devlink = devlink_rate->devlink;
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (nla_put_u16(msg, DEVLINK_ATTR_RATE_TYPE, devlink_rate->type))
+ goto nla_put_failure;
+
+ /* Identity attribute: leaves are addressed by port index, nodes by
+ * name.
+ */
+ if (devlink_rate_is_leaf(devlink_rate)) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
+ devlink_rate->devlink_port->index))
+ goto nla_put_failure;
+ } else if (devlink_rate_is_node(devlink_rate)) {
+ if (nla_put_string(msg, DEVLINK_ATTR_RATE_NODE_NAME,
+ devlink_rate->name))
+ goto nla_put_failure;
+ }
+
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_RATE_TX_SHARE,
+ devlink_rate->tx_share))
+ goto nla_put_failure;
+
+ if (devlink_nl_put_u64(msg, DEVLINK_ATTR_RATE_TX_MAX,
+ devlink_rate->tx_max))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY,
+ devlink_rate->tx_priority))
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_WEIGHT,
+ devlink_rate->tx_weight))
+ goto nla_put_failure;
+
+ /* The parent name is only reported when a parent is attached. */
+ if (devlink_rate->parent)
+ if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME,
+ devlink_rate->parent->name))
+ goto nla_put_failure;
+
+ if (devlink_rate_put_tc_bws(msg, devlink_rate->tc_bw))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+/* Send a rate notification for @devlink_rate.
+ *
+ * @cmd is expected to be DEVLINK_CMD_RATE_NEW or DEVLINK_CMD_RATE_DEL;
+ * anything else triggers a warning. Nothing is sent when the devlink
+ * instance is not registered or when devlink_nl_notify_need() says no
+ * notification is needed. Allocation and fill failures are silently
+ * dropped.
+ */
+static void devlink_rate_notify(struct devlink_rate *devlink_rate,
+				enum devlink_command cmd)
+{
+	struct devlink *devlink = devlink_rate->devlink;
+	struct sk_buff *skb;
+
+	WARN_ON(cmd != DEVLINK_CMD_RATE_NEW && cmd != DEVLINK_CMD_RATE_DEL);
+
+	if (!devl_is_registered(devlink))
+		return;
+	if (!devlink_nl_notify_need(devlink))
+		return;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return;
+
+	if (devlink_nl_rate_fill(skb, devlink_rate, cmd, 0, 0, 0, NULL)) {
+		nlmsg_free(skb);
+		return;
+	}
+
+	devlink_nl_notify_send(devlink, skb);
+}
+
+/* Send a DEVLINK_CMD_RATE_NEW notification for every object on
+ * @devlink's rate list, in list order.
+ */
+void devlink_rates_notify_register(struct devlink *devlink)
+{
+ struct devlink_rate *rate_node;
+
+ list_for_each_entry(rate_node, &devlink->rate_list, list)
+ devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW);
+}
+
+/* Send a DEVLINK_CMD_RATE_DEL notification for every object on
+ * @devlink's rate list, walking the list in reverse order.
+ */
+void devlink_rates_notify_unregister(struct devlink *devlink)
+{
+ struct devlink_rate *rate_node;
+
+ list_for_each_entry_reverse(rate_node, &devlink->rate_list, list)
+ devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_DEL);
+}
+
+/* Dump the rate objects of one devlink instance into @msg.
+ *
+ * Entries whose position is below state->idx are skipped. When filling
+ * an entry fails, its position is stored in state->idx and the error is
+ * returned (presumably so that a later dump call resumes at that
+ * entry - confirm against devlink_nl_dumpit()). Returns 0 when every
+ * remaining entry was filled.
+ */
+static int
+devlink_nl_rate_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
+ struct netlink_callback *cb, int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_rate *devlink_rate;
+ int idx = 0;
+ int err = 0;
+
+ list_for_each_entry(devlink_rate, &devlink->rate_list, list) {
+ enum devlink_command cmd = DEVLINK_CMD_RATE_NEW;
+ u32 id = NETLINK_CB(cb->skb).portid;
+
+ /* Skip entries that lie before the saved position. */
+ if (idx < state->idx) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_rate_fill(msg, devlink_rate, cmd, id,
+ cb->nlh->nlmsg_seq, flags, NULL);
+ if (err) {
+ /* Remember the entry that did not fit. */
+ state->idx = idx;
+ break;
+ }
+ idx++;
+ }
+
+ return err;
+}
+
+/* Netlink dumpit handler for rate objects: delegates to
+ * devlink_nl_dumpit() with devlink_nl_rate_get_dump_one() as the
+ * per-instance callback.
+ */
+int devlink_nl_rate_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_rate_get_dump_one);
+}
+
+/* Netlink doit handler returning a single rate object.
+ *
+ * Resolves the rate object addressed by the request, fills a
+ * DEVLINK_CMD_RATE_NEW message for it and sends it as the reply.
+ * Returns the lookup error, -ENOMEM when the reply cannot be allocated,
+ * the fill error, or the result of genlmsg_reply().
+ */
+int devlink_nl_rate_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_rate *rate;
+	struct sk_buff *reply;
+	int ret;
+
+	rate = devlink_rate_get_from_info(devlink, info);
+	if (IS_ERR(rate))
+		return PTR_ERR(rate);
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	ret = devlink_nl_rate_fill(reply, rate, DEVLINK_CMD_RATE_NEW,
+				   info->snd_portid, info->snd_seq, 0,
+				   info->extack);
+	if (!ret)
+		return genlmsg_reply(reply, info);
+
+	nlmsg_free(reply);
+	return ret;
+}
+
+/* Walk the parent chain starting at @parent and report whether
+ * @devlink_rate appears anywhere on it (including @parent itself).
+ */
+static bool
+devlink_rate_is_parent_node(struct devlink_rate *devlink_rate,
+			    struct devlink_rate *parent)
+{
+	struct devlink_rate *ancestor;
+
+	for (ancestor = parent; ancestor; ancestor = ancestor->parent) {
+		if (ancestor == devlink_rate)
+			return true;
+	}
+
+	return false;
+}
+
+/* Apply a DEVLINK_ATTR_RATE_PARENT_NODE_NAME attribute to @devlink_rate.
+ *
+ * An empty name detaches the object from its current parent (a no-op
+ * when it has none); a non-empty name attaches it to the named node,
+ * replacing any current parent. In both cases the driver's
+ * rate_leaf_parent_set / rate_node_parent_set op is called first and the
+ * parent pointer and refcounts are only updated when it succeeds.
+ *
+ * Returns 0 on success, -ENODEV when the named parent does not exist,
+ * -EINVAL when the object would become its own parent, -EEXIST when the
+ * object is already on the new parent's ancestor chain, -EOPNOTSUPP when
+ * the object is neither leaf nor node, or the driver op's error.
+ */
+static int
+devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate,
+ struct genl_info *info,
+ struct nlattr *nla_parent)
+{
+ struct devlink *devlink = devlink_rate->devlink;
+ const char *parent_name = nla_data(nla_parent);
+ const struct devlink_ops *ops = devlink->ops;
+ size_t len = strlen(parent_name);
+ struct devlink_rate *parent;
+ int err = -EOPNOTSUPP;
+
+ parent = devlink_rate->parent;
+
+ if (parent && !len) {
+ /* Empty name with a parent set: detach from the parent. */
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_parent_set(devlink_rate, NULL,
+ devlink_rate->priv, NULL,
+ info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_parent_set(devlink_rate, NULL,
+ devlink_rate->priv, NULL,
+ info->extack);
+ if (err)
+ return err;
+
+ refcount_dec(&parent->refcnt);
+ devlink_rate->parent = NULL;
+ } else if (len) {
+ parent = devlink_rate_node_get_by_name(devlink, parent_name);
+ if (IS_ERR(parent))
+ return -ENODEV;
+
+ if (parent == devlink_rate) {
+ NL_SET_ERR_MSG(info->extack, "Parent to self is not allowed");
+ return -EINVAL;
+ }
+
+ /* Reject the request when this node already sits above the
+ * new parent, which would create a loop.
+ */
+ if (devlink_rate_is_node(devlink_rate) &&
+ devlink_rate_is_parent_node(devlink_rate, parent->parent)) {
+ NL_SET_ERR_MSG(info->extack, "Node is already a parent of parent node.");
+ return -EEXIST;
+ }
+
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_parent_set(devlink_rate, parent,
+ devlink_rate->priv, parent->priv,
+ info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_parent_set(devlink_rate, parent,
+ devlink_rate->priv, parent->priv,
+ info->extack);
+ if (err)
+ return err;
+
+ if (devlink_rate->parent)
+ /* we're reassigning to other parent in this case */
+ refcount_dec(&devlink_rate->parent->refcnt);
+
+ /* Each child holds one reference on its parent node. */
+ refcount_inc(&parent->refcnt);
+ devlink_rate->parent = parent;
+ }
+
+ return 0;
+}
+
+/* Parse one DEVLINK_ATTR_RATE_TC_BWS nest.
+ *
+ * Stores the bandwidth at @tc_bw[index] and marks the index in @bitmap.
+ * Returns 0 on success, the nla_parse_nested() error, or -EINVAL when
+ * the index or bandwidth attribute is missing or the index was already
+ * seen in @bitmap.
+ *
+ * NOTE(review): tc_index is used to index @bitmap and @tc_bw without a
+ * range check in this function; presumably
+ * devlink_dl_rate_tc_bws_nl_policy bounds it below DEVLINK_RATE_TCS_MAX
+ * - confirm against the policy definition.
+ */
+static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw,
+ unsigned long *bitmap,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[DEVLINK_RATE_TC_ATTR_MAX + 1];
+ u8 tc_index;
+ int err;
+
+ err = nla_parse_nested(tb, DEVLINK_RATE_TC_ATTR_MAX, parent_nest,
+ devlink_dl_rate_tc_bws_nl_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[DEVLINK_RATE_TC_ATTR_INDEX]) {
+ NL_SET_ERR_ATTR_MISS(extack, parent_nest,
+ DEVLINK_RATE_TC_ATTR_INDEX);
+ return -EINVAL;
+ }
+
+ tc_index = nla_get_u8(tb[DEVLINK_RATE_TC_ATTR_INDEX]);
+
+ if (!tb[DEVLINK_RATE_TC_ATTR_BW]) {
+ NL_SET_ERR_ATTR_MISS(extack, parent_nest,
+ DEVLINK_RATE_TC_ATTR_BW);
+ return -EINVAL;
+ }
+
+ /* A bit that is already set means this index appeared earlier. */
+ if (test_and_set_bit(tc_index, bitmap)) {
+ NL_SET_ERR_MSG_FMT(extack,
+ "Duplicate traffic class index specified (%u)",
+ tc_index);
+ return -EINVAL;
+ }
+
+ tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_RATE_TC_ATTR_BW]);
+
+ return 0;
+}
+
+/* Apply the DEVLINK_ATTR_RATE_TC_BWS attributes of a request.
+ *
+ * Every TC_BWS nest in the message is parsed into a local table; a
+ * bandwidth must be present for each of the DEVLINK_RATE_TCS_MAX traffic
+ * classes. The table is then handed to the driver's
+ * rate_leaf_tc_bw_set / rate_node_tc_bw_set op and copied into
+ * @devlink_rate only when the op succeeds.
+ *
+ * Returns 0 on success, a parse error, -EINVAL when a traffic class is
+ * missing, or the driver op's error.
+ */
+static int devlink_nl_rate_tc_bw_set(struct devlink_rate *devlink_rate,
+ struct genl_info *info)
+{
+ DECLARE_BITMAP(bitmap, DEVLINK_RATE_TCS_MAX) = {};
+ struct devlink *devlink = devlink_rate->devlink;
+ const struct devlink_ops *ops = devlink->ops;
+ u32 tc_bw[DEVLINK_RATE_TCS_MAX] = {};
+ int rem, err = -EOPNOTSUPP, i;
+ struct nlattr *attr;
+
+ /* The attribute may occur several times, so walk the raw message
+ * rather than using a single parsed attribute slot.
+ */
+ nlmsg_for_each_attr_type(attr, DEVLINK_ATTR_RATE_TC_BWS, info->nlhdr,
+ GENL_HDRLEN, rem) {
+ err = devlink_nl_rate_tc_bw_parse(attr, tc_bw, bitmap,
+ info->extack);
+ if (err)
+ return err;
+ }
+
+ /* bitmap records which traffic class indices were supplied. */
+ for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) {
+ if (!test_bit(i, bitmap)) {
+ NL_SET_ERR_MSG_FMT(info->extack,
+ "Bandwidth values must be specified for all %u traffic classes",
+ DEVLINK_RATE_TCS_MAX);
+ return -EINVAL;
+ }
+ }
+
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv,
+ tc_bw, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv,
+ tc_bw, info->extack);
+
+ if (err)
+ return err;
+
+ /* Cache the accepted values so they can be reported back. */
+ memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw));
+
+ return 0;
+}
+
+/* Apply every rate attribute present in the request to @devlink_rate.
+ *
+ * Attributes are handled in a fixed order: tx_share, tx_max,
+ * tx_priority, tx_weight, parent node name, TC bandwidths. For each
+ * scalar attribute the matching leaf or node driver op is called and the
+ * cached value in @devlink_rate is updated only when the op succeeds.
+ * Processing stops at the first error, so attributes handled before the
+ * failure stay applied.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int devlink_nl_rate_set(struct devlink_rate *devlink_rate,
+ const struct devlink_ops *ops,
+ struct genl_info *info)
+{
+ struct nlattr *nla_parent, **attrs = info->attrs;
+ /* err is only overwritten by a leaf or node driver op below. */
+ int err = -EOPNOTSUPP;
+ u32 priority;
+ u32 weight;
+ u64 rate;
+
+ if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) {
+ rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_SHARE]);
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tx_share_set(devlink_rate, devlink_rate->priv,
+ rate, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tx_share_set(devlink_rate, devlink_rate->priv,
+ rate, info->extack);
+ if (err)
+ return err;
+ devlink_rate->tx_share = rate;
+ }
+
+ if (attrs[DEVLINK_ATTR_RATE_TX_MAX]) {
+ rate = nla_get_u64(attrs[DEVLINK_ATTR_RATE_TX_MAX]);
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tx_max_set(devlink_rate, devlink_rate->priv,
+ rate, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tx_max_set(devlink_rate, devlink_rate->priv,
+ rate, info->extack);
+ if (err)
+ return err;
+ devlink_rate->tx_max = rate;
+ }
+
+ if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]) {
+ priority = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]);
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tx_priority_set(devlink_rate, devlink_rate->priv,
+ priority, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tx_priority_set(devlink_rate, devlink_rate->priv,
+ priority, info->extack);
+
+ if (err)
+ return err;
+ devlink_rate->tx_priority = priority;
+ }
+
+ if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]) {
+ weight = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]);
+ if (devlink_rate_is_leaf(devlink_rate))
+ err = ops->rate_leaf_tx_weight_set(devlink_rate, devlink_rate->priv,
+ weight, info->extack);
+ else if (devlink_rate_is_node(devlink_rate))
+ err = ops->rate_node_tx_weight_set(devlink_rate, devlink_rate->priv,
+ weight, info->extack);
+
+ if (err)
+ return err;
+ devlink_rate->tx_weight = weight;
+ }
+
+ /* Parent changes take a reference on the new parent node; see
+ * devlink_nl_rate_parent_node_set().
+ */
+ nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME];
+ if (nla_parent) {
+ err = devlink_nl_rate_parent_node_set(devlink_rate, info,
+ nla_parent);
+ if (err)
+ return err;
+ }
+
+ if (attrs[DEVLINK_ATTR_RATE_TC_BWS]) {
+ err = devlink_nl_rate_tc_bw_set(devlink_rate, info);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+/* Check that the driver implements an op for every rate attribute in
+ * the request.
+ *
+ * For each of tx_share, tx_max, parent, tx_priority, tx_weight and TC
+ * bandwidth that is present in @info, the matching leaf or node op
+ * (selected by @type) must be non-NULL. On the first missing op an
+ * extack message is set and false is returned. An unknown @type triggers
+ * a warning and also returns false. Returns true otherwise.
+ */
+static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops,
+ struct genl_info *info,
+ enum devlink_rate_type type)
+{
+ struct nlattr **attrs = info->attrs;
+
+ if (type == DEVLINK_RATE_TYPE_LEAF) {
+ if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_leaf_tx_share_set) {
+ NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the leafs");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_leaf_tx_max_set) {
+ NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the leafs");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] &&
+ !ops->rate_leaf_parent_set) {
+ NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the leafs");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TX_PRIORITY],
+ "TX priority set isn't supported for the leafs");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_leaf_tx_weight_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TX_WEIGHT],
+ "TX weight set isn't supported for the leafs");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TC_BWS] &&
+ !ops->rate_leaf_tc_bw_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TC_BWS],
+ "TC bandwidth set isn't supported for the leafs");
+ return false;
+ }
+ } else if (type == DEVLINK_RATE_TYPE_NODE) {
+ /* Same checks as above, against the node variants of the ops. */
+ if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) {
+ NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_MAX] && !ops->rate_node_tx_max_set) {
+ NL_SET_ERR_MSG(info->extack, "TX max set isn't supported for the nodes");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME] &&
+ !ops->rate_node_parent_set) {
+ NL_SET_ERR_MSG(info->extack, "Parent set isn't supported for the nodes");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TX_PRIORITY],
+ "TX priority set isn't supported for the nodes");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_node_tx_weight_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TX_WEIGHT],
+ "TX weight set isn't supported for the nodes");
+ return false;
+ }
+ if (attrs[DEVLINK_ATTR_RATE_TC_BWS] &&
+ !ops->rate_node_tc_bw_set) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[DEVLINK_ATTR_RATE_TC_BWS],
+ "TC bandwidth set isn't supported for the nodes");
+ return false;
+ }
+ } else {
+ WARN(1, "Unknown type of rate object");
+ return false;
+ }
+
+ return true;
+}
+
+/* Netlink doit handler updating an existing rate object.
+ *
+ * Resolves the addressed rate object, verifies that the driver supports
+ * every attribute in the request, applies them and, on success, sends a
+ * DEVLINK_CMD_RATE_NEW notification.
+ * Returns the lookup error, -EOPNOTSUPP when the driver has no ops or
+ * lacks a needed op, or the result of devlink_nl_rate_set().
+ */
+int devlink_nl_rate_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	const struct devlink_ops *ops;
+	struct devlink_rate *rate;
+	int ret;
+
+	rate = devlink_rate_get_from_info(devlink, info);
+	if (IS_ERR(rate))
+		return PTR_ERR(rate);
+
+	ops = devlink->ops;
+	if (!ops)
+		return -EOPNOTSUPP;
+	if (!devlink_rate_set_ops_supported(ops, info, rate->type))
+		return -EOPNOTSUPP;
+
+	ret = devlink_nl_rate_set(rate, ops, info);
+	if (ret)
+		return ret;
+
+	devlink_rate_notify(rate, DEVLINK_CMD_RATE_NEW);
+	return 0;
+}
+
+/* Netlink doit handler creating a new rate node.
+ *
+ * Requires the driver to implement both rate_node_new and rate_node_del
+ * as well as an op for every rate attribute in the request. The node
+ * name must be valid and not yet in use. The node is allocated, created
+ * in the driver, configured from the request attributes, added to the
+ * devlink rate list and announced with DEVLINK_CMD_RATE_NEW.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when the driver lacks a needed op,
+ * -EEXIST when a node with that name exists, -EINVAL for an invalid
+ * name, -ENOMEM on allocation failure, or the error from the driver op
+ * or from devlink_nl_rate_set().
+ */
+int devlink_nl_rate_new_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_rate *rate_node;
+	const struct devlink_ops *ops;
+	int err;
+
+	ops = devlink->ops;
+	if (!ops || !ops->rate_node_new || !ops->rate_node_del) {
+		NL_SET_ERR_MSG(info->extack, "Rate nodes aren't supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (!devlink_rate_set_ops_supported(ops, info, DEVLINK_RATE_TYPE_NODE))
+		return -EOPNOTSUPP;
+
+	/* Only a "not found" lookup result allows the creation to proceed. */
+	rate_node = devlink_rate_node_get_from_attrs(devlink, info->attrs);
+	if (!IS_ERR(rate_node))
+		return -EEXIST;
+	else if (rate_node == ERR_PTR(-EINVAL))
+		return -EINVAL;
+
+	rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL);
+	if (!rate_node)
+		return -ENOMEM;
+
+	rate_node->devlink = devlink;
+	rate_node->type = DEVLINK_RATE_TYPE_NODE;
+	rate_node->name = nla_strdup(info->attrs[DEVLINK_ATTR_RATE_NODE_NAME], GFP_KERNEL);
+	if (!rate_node->name) {
+		err = -ENOMEM;
+		goto err_strdup;
+	}
+
+	err = ops->rate_node_new(rate_node, &rate_node->priv, info->extack);
+	if (err)
+		goto err_node_new;
+
+	err = devlink_nl_rate_set(rate_node, ops, info);
+	if (err)
+		goto err_rate_set;
+
+	refcount_set(&rate_node->refcnt, 1);
+	list_add(&rate_node->list, &devlink->rate_list);
+	devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW);
+	return 0;
+
+err_rate_set:
+	ops->rate_node_del(rate_node, rate_node->priv, info->extack);
+	/* devlink_nl_rate_set() may have attached a parent (taking a
+	 * reference on it) before a later attribute failed. Drop that
+	 * reference, as the delete path does, so the parent does not keep
+	 * counting a child that never came into existence.
+	 */
+	if (rate_node->parent)
+		refcount_dec(&rate_node->parent->refcnt);
+err_node_new:
+	kfree(rate_node->name);
+err_strdup:
+	kfree(rate_node);
+	return err;
+}
+
+/* genl doit handler: delete a childless rate node; driver status is returned. */
+int devlink_nl_rate_del_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_rate *node;
+	int ret;
+
+	node = devlink_rate_node_get_from_info(devlink, info);
+	if (IS_ERR(node))
+		return PTR_ERR(node);
+
+	if (refcount_read(&node->refcnt) > 1) {
+		NL_SET_ERR_MSG(info->extack, "Node has children. Cannot delete node.");
+		return -EBUSY;
+	}
+
+	devlink_rate_notify(node, DEVLINK_CMD_RATE_DEL);
+	ret = devlink->ops->rate_node_del(node, node->priv, info->extack);
+	if (node->parent)
+		refcount_dec(&node->parent->refcnt);
+	list_del(&node->list);
+	kfree(node->name);
+	kfree(node);
+	return ret;
+}
+
+int devlink_rate_nodes_check(struct devlink *devlink, u16 mode,
+			     struct netlink_ext_ack *extack)
+{
+	struct devlink_rate *rate;
+
+	list_for_each_entry(rate, &devlink->rate_list, list)
+		if (devlink_rate_is_node(rate)) {	/* one node is enough */
+			NL_SET_ERR_MSG(extack, "Rate node(s) exists.");
+			return -EBUSY;
+		}
+	return 0;	/* no rate node on this instance; @mode is not consulted */
+}
+
+/**
+ * devl_rate_node_create - create devlink rate node
+ * @devlink: devlink instance
+ * @priv: driver private data
+ * @node_name: name of the resulting node
+ * @parent: parent devlink_rate struct, may be NULL
+ *
+ * Create devlink rate object of type node; returns it or an ERR_PTR().
+ */
+struct devlink_rate *
+devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name,
+		      struct devlink_rate *parent)
+{
+	struct devlink_rate *rate_node;
+
+	rate_node = devlink_rate_node_get_by_name(devlink, node_name);
+	if (!IS_ERR(rate_node))
+		return ERR_PTR(-EEXIST);
+
+	rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL);
+	if (!rate_node)
+		return ERR_PTR(-ENOMEM);
+
+	rate_node->name = kstrdup(node_name, GFP_KERNEL);
+	if (!rate_node->name) {
+		kfree(rate_node);
+		return ERR_PTR(-ENOMEM);
+	}
+	rate_node->type = DEVLINK_RATE_TYPE_NODE;
+	rate_node->devlink = devlink;
+	rate_node->priv = priv;
+
+	/* Pin the parent last so no failure path above can leak its reference. */
+	if (parent) {
+		rate_node->parent = parent;
+		refcount_inc(&parent->refcnt);
+	}
+
+	refcount_set(&rate_node->refcnt, 1);
+	list_add(&rate_node->list, &devlink->rate_list);
+	devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW);
+	return rate_node;
+}
+EXPORT_SYMBOL_GPL(devl_rate_node_create);
+
+/**
+ * devl_rate_leaf_create - create devlink rate leaf
+ * @devlink_port: devlink port object to create rate object on
+ * @priv: driver private data
+ * @parent: parent devlink_rate struct
+ *
+ * Allocate a leaf rate object, attach it to @devlink_port and announce it.
+ * Returns 0, -EBUSY if the port already has one, or -ENOMEM.
+ */
+int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv,
+			  struct devlink_rate *parent)
+{
+	struct devlink *devlink = devlink_port->devlink;
+	struct devlink_rate *leaf;
+
+	devl_assert_locked(devlink);
+
+	if (WARN_ON(devlink_port->devlink_rate))
+		return -EBUSY;
+
+	leaf = kzalloc(sizeof(*leaf), GFP_KERNEL);
+	if (!leaf)
+		return -ENOMEM;
+
+	leaf->type = DEVLINK_RATE_TYPE_LEAF;
+	leaf->devlink = devlink;
+	leaf->devlink_port = devlink_port;
+	leaf->priv = priv;
+	if (parent) {
+		refcount_inc(&parent->refcnt);
+		leaf->parent = parent;
+	}
+
+	list_add_tail(&leaf->list, &devlink->rate_list);
+	devlink_port->devlink_rate = leaf;
+	devlink_rate_notify(leaf, DEVLINK_CMD_RATE_NEW);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devl_rate_leaf_create);
+
+/**
+ * devl_rate_leaf_destroy - destroy devlink rate leaf
+ *
+ * @devlink_port: devlink port linked to the rate object
+ *
+ * Announce, unlink and free the leaf rate object of @devlink_port, if any.
+ */
+void devl_rate_leaf_destroy(struct devlink_port *devlink_port)
+{
+	struct devlink_rate *leaf = devlink_port->devlink_rate;
+
+	devl_assert_locked(devlink_port->devlink);
+
+	if (leaf) {
+		devlink_rate_notify(leaf, DEVLINK_CMD_RATE_DEL);
+		if (leaf->parent)
+			refcount_dec(&leaf->parent->refcnt);
+		list_del(&leaf->list);
+		devlink_port->devlink_rate = NULL;
+		kfree(leaf);
+	}
+}
+EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy);
+
+/**
+ * devl_rate_nodes_destroy - destroy all devlink rate nodes on device
+ * @devlink: devlink instance
+ *
+ * First detach every rate object from its parent, then delete and free
+ * all rate objects of type node. Leaf objects stay on the list.
+ */
+void devl_rate_nodes_destroy(struct devlink *devlink)
+{
+	const struct devlink_ops *ops = devlink->ops;
+	struct devlink_rate *rate, *next;
+
+	devl_assert_locked(devlink);
+
+	/* Pass 1: drop all parent links so no node is referenced by a child. */
+	list_for_each_entry(rate, &devlink->rate_list, list) {
+		if (!rate->parent)
+			continue;
+		if (devlink_rate_is_leaf(rate))
+			ops->rate_leaf_parent_set(rate, NULL, rate->priv, NULL, NULL);
+		else if (devlink_rate_is_node(rate))
+			ops->rate_node_parent_set(rate, NULL, rate->priv, NULL, NULL);
+
+		refcount_dec(&rate->parent->refcnt);
+		rate->parent = NULL;
+	}
+
+	/* Pass 2: delete the nodes; _safe iteration because entries are freed. */
+	list_for_each_entry_safe(rate, next, &devlink->rate_list, list) {
+		if (!devlink_rate_is_node(rate))
+			continue;
+		ops->rate_node_del(rate, rate->priv, NULL);
+		list_del(&rate->list);
+		kfree(rate->name);
+		kfree(rate);
+	}
+}
+EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy);
diff --git a/net/devlink/region.c b/net/devlink/region.c
new file mode 100644
index 000000000000..d6e5805cf3a0
--- /dev/null
+++ b/net/devlink/region.c
@@ -0,0 +1,1258 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include "devl_internal.h"
+
+struct devlink_region {
+	struct devlink *devlink;	/* owning devlink instance */
+	struct devlink_port *port;	/* port the region belongs to, if any */
+	struct list_head list;		/* entry in a devlink or port region_list */
+	union {
+		const struct devlink_region_ops *ops;	/* used for device regions */
+		const struct devlink_port_region_ops *port_ops;	/* port regions */
+	};
+	struct mutex snapshot_lock; /* protects snapshot_list,
+				     * max_snapshots and cur_snapshots
+				     * consistency.
+				     */
+	struct list_head snapshot_list;	/* devlink_snapshot entries */
+	u32 max_snapshots;		/* capacity checked before adding one */
+	u32 cur_snapshots;		/* number of snapshots on the list */
+	u64 size;			/* region size; upper bound for reads */
+};
+
+struct devlink_snapshot {
+	struct list_head list;		/* entry in region->snapshot_list */
+	struct devlink_region *region;	/* region this snapshot belongs to */
+	u8 *data;			/* contents; released via ops->destructor */
+	u32 id;				/* snapshot id, unique within the region */
+};
+
+/* Find a device-level region by name; NULL when there is no match. */
+static struct devlink_region *
+devlink_region_get_by_name(struct devlink *devlink, const char *region_name)
+{
+	struct devlink_region *cur;
+
+	list_for_each_entry(cur, &devlink->region_list, list)
+		if (strcmp(cur->ops->name, region_name) == 0)
+			return cur;
+	return NULL;
+}
+
+/* Find a region of @port by name; NULL when there is no match. */
+static struct devlink_region *
+devlink_port_region_get_by_name(struct devlink_port *port,
+				const char *region_name)
+{
+	struct devlink_region *cur;
+
+	list_for_each_entry(cur, &port->region_list, list)
+		if (strcmp(cur->port_ops->name, region_name) == 0)
+			return cur;
+	return NULL;
+}
+
+/* Look up a snapshot of @region by id; NULL when there is no match. */
+static struct devlink_snapshot *
+devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id)
+{
+	struct devlink_snapshot *cur;
+
+	list_for_each_entry(cur, &region->snapshot_list, list)
+		if (cur->id == id)
+			return cur;
+	return NULL;
+}
+
+/* Emit one DEVLINK_ATTR_REGION_SNAPSHOT nest carrying the snapshot's id. */
+static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg,
+					     struct devlink *devlink,
+					     struct devlink_snapshot *snapshot)
+{
+	struct nlattr *nest;
+	int ret;
+
+	nest = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_SNAPSHOT);
+	if (!nest)
+		return -EMSGSIZE;
+
+	ret = nla_put_u32(msg, DEVLINK_ATTR_REGION_SNAPSHOT_ID, snapshot->id);
+	if (ret) {
+		/* Roll the partially written nest back out of the message. */
+		nla_nest_cancel(msg, nest);
+		return ret;
+	}
+
+	nla_nest_end(msg, nest);
+	return 0;
+}
+
+/* Emit a DEVLINK_ATTR_REGION_SNAPSHOTS nest listing every snapshot id. */
+static int devlink_nl_region_snapshots_id_put(struct sk_buff *msg,
+					      struct devlink *devlink,
+					      struct devlink_region *region)
+{
+	struct devlink_snapshot *snap;
+	struct nlattr *nest;
+	int ret = 0;
+
+	nest = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_SNAPSHOTS);
+	if (!nest)
+		return -EMSGSIZE;
+
+	list_for_each_entry(snap, &region->snapshot_list, list) {
+		ret = devlink_nl_region_snapshot_id_put(msg, devlink, snap);
+		if (ret)
+			break;
+	}
+
+	/* Commit the nest only if every entry fitted, otherwise undo it. */
+	if (ret)
+		nla_nest_cancel(msg, nest);
+	else
+		nla_nest_end(msg, nest);
+	return ret;
+}
+
+/*
+ * Build a full region message: handle, optional port index, name, size,
+ * snapshot capacity and the ids of all existing snapshots.
+ *
+ * Returns 0 on success, -EMSGSIZE if the header does not fit, or the
+ * failing helper's error after cancelling the message.
+ */
+static int devlink_nl_region_fill(struct sk_buff *msg, struct devlink *devlink,
+				  enum devlink_command cmd, u32 portid,
+				  u32 seq, int flags,
+				  struct devlink_region *region)
+{
+	void *hdr;
+	int ret;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	/* Each step below runs only while all previous ones succeeded. */
+	ret = devlink_nl_put_handle(msg, devlink);
+	if (!ret && region->port)
+		ret = nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX,
+				  region->port->index);
+
+	if (!ret)
+		ret = nla_put_string(msg, DEVLINK_ATTR_REGION_NAME,
+				     region->ops->name);
+	if (!ret)
+		ret = devlink_nl_put_u64(msg, DEVLINK_ATTR_REGION_SIZE,
+					 region->size);
+	if (!ret)
+		ret = nla_put_u32(msg, DEVLINK_ATTR_REGION_MAX_SNAPSHOTS,
+				  region->max_snapshots);
+
+	if (!ret)
+		ret = devlink_nl_region_snapshots_id_put(msg, devlink, region);
+
+	if (ret) {
+		/* Any failed attribute voids the whole message. */
+		genlmsg_cancel(msg, hdr);
+		return ret;
+	}
+
+	genlmsg_end(msg, hdr);
+	return 0;
+}
+
+/*
+ * Allocate and build a region notification carrying @snapshot's id or,
+ * without a snapshot, the region size. Returns the message or an ERR_PTR().
+ */
+static struct sk_buff *
+devlink_nl_region_notify_build(struct devlink_region *region,
+			       struct devlink_snapshot *snapshot,
+			       enum devlink_command cmd, u32 portid, u32 seq)
+{
+	struct devlink *devlink = region->devlink;
+	struct sk_buff *skb;
+	void *hdr;
+	int ret;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	hdr = genlmsg_put(skb, portid, seq, &devlink_nl_family, 0, cmd);
+	if (!hdr) {
+		ret = -EMSGSIZE;
+		goto free_skb;
+	}
+
+	ret = devlink_nl_put_handle(skb, devlink);
+	if (ret)
+		goto cancel;
+
+	if (region->port) {
+		ret = nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX,
+				  region->port->index);
+		if (ret)
+			goto cancel;
+	}
+
+	ret = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region->ops->name);
+	if (ret)
+		goto cancel;
+
+	if (snapshot)
+		ret = nla_put_u32(skb, DEVLINK_ATTR_REGION_SNAPSHOT_ID,
+				  snapshot->id);
+	else
+		ret = devlink_nl_put_u64(skb, DEVLINK_ATTR_REGION_SIZE,
+					 region->size);
+	if (ret)
+		goto cancel;
+
+	genlmsg_end(skb, hdr);
+	return skb;
+
+cancel:
+	genlmsg_cancel(skb, hdr);
+free_skb:
+	nlmsg_free(skb);
+	return ERR_PTR(ret);
+}
+
+/* Build and send a REGION_NEW/REGION_DEL notification when one is wanted. */
+static void devlink_nl_region_notify(struct devlink_region *region,
+				     struct devlink_snapshot *snapshot,
+				     enum devlink_command cmd)
+{
+	struct devlink *devlink = region->devlink;
+	struct sk_buff *skb;
+
+	WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL);
+
+	if (!__devl_is_registered(devlink) || !devlink_nl_notify_need(devlink))
+		return;
+
+	/* Best effort: a message that cannot be built is simply not sent. */
+	skb = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0);
+	if (!IS_ERR(skb))
+		devlink_nl_notify_send(devlink, skb);
+}
+
+void devlink_regions_notify_register(struct devlink *devlink)
+{
+	struct devlink_region *cur;
+
+	list_for_each_entry(cur, &devlink->region_list, list)	/* list order */
+		devlink_nl_region_notify(cur, NULL, DEVLINK_CMD_REGION_NEW);
+}
+
+void devlink_regions_notify_unregister(struct devlink *devlink)
+{
+	struct devlink_region *cur;
+
+	list_for_each_entry_reverse(cur, &devlink->region_list, list)	/* reversed */
+		devlink_nl_region_notify(cur, NULL, DEVLINK_CMD_REGION_DEL);
+}
+
+/**
+ * __devlink_snapshot_id_increment - Increment number of snapshots using an id
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Track when a new snapshot begins using an id. Load the count for the
+ * given id from the snapshot xarray, increment it, and store it back.
+ * The load/store pair runs under the xarray lock, so the store uses
+ * GFP_ATOMIC.
+ *
+ * Called when a new snapshot is created with the given id.
+ *
+ * The id *must* have been previously allocated by
+ * devlink_region_snapshot_id_get(). An id that is missing from the
+ * xarray, or whose entry is not a value entry, triggers a WARN and
+ * -EINVAL.
+ *
+ * Returns 0 on success, or an error on failure.
+ */
+static int __devlink_snapshot_id_increment(struct devlink *devlink, u32 id)
+{
+	unsigned long users;
+	int ret = -EINVAL;
+	void *entry;
+
+	xa_lock(&devlink->snapshot_ids);
+	entry = xa_load(&devlink->snapshot_ids, id);
+
+	/* The id must already be tracked, and tracked as a value entry. */
+	if (WARN_ON(!entry))
+		goto unlock;
+	if (WARN_ON(!xa_is_value(entry)))
+		goto unlock;
+
+	users = xa_to_value(entry) + 1;
+	ret = xa_err(__xa_store(&devlink->snapshot_ids, id, xa_mk_value(users),
+				GFP_ATOMIC));
+
+unlock:
+	xa_unlock(&devlink->snapshot_ids);
+	return ret;
+}
+
+/**
+ * __devlink_snapshot_id_decrement - Decrease number of snapshots using an id
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Track when a snapshot is deleted and stops using an id. Load the count
+ * for the given id from the snapshot xarray, decrement it, and store it
+ * back.
+ *
+ * If the count is one or zero, erase this id from the xarray instead,
+ * freeing it up for future re-use by devlink_region_snapshot_id_get().
+ * A missing or non-value entry triggers a WARN and is left untouched.
+ *
+ * Called when a snapshot using the given id is deleted, and when the
+ * initial allocator of the id is finished using it.
+ */
+static void __devlink_snapshot_id_decrement(struct devlink *devlink, u32 id)
+{
+	unsigned long users;
+	void *entry;
+
+	xa_lock(&devlink->snapshot_ids);
+	entry = xa_load(&devlink->snapshot_ids, id);
+	if (WARN_ON(!entry))
+		goto unlock;
+	if (WARN_ON(!xa_is_value(entry)))
+		goto unlock;
+
+	users = xa_to_value(entry);
+	if (users <= 1) {
+		/* If this was the last user, we can erase this id */
+		__xa_erase(&devlink->snapshot_ids, id);
+		goto unlock;
+	}
+
+	__xa_store(&devlink->snapshot_ids, id, xa_mk_value(users - 1),
+		   GFP_ATOMIC);
+
+unlock:
+	xa_unlock(&devlink->snapshot_ids);
+}
+
+/**
+ * __devlink_snapshot_id_insert - Insert a specific snapshot ID
+ * @devlink: devlink instance
+ * @id: the snapshot id
+ *
+ * Mark the given snapshot id as used by inserting a zero value into the
+ * snapshot xarray.
+ *
+ * This must be called while holding the devlink instance lock. Unlike
+ * __devlink_region_snapshot_id_get(), the initial reference count is
+ * zero, not one: the caller is not counted as a user. It is expected
+ * that the id will immediately be used before releasing the devlink
+ * instance lock.
+ *
+ * Returns zero on success, -EEXIST if the id is already present in the
+ * xarray, or the error reported by the xarray store.
+ */
+static int __devlink_snapshot_id_insert(struct devlink *devlink, u32 id)
+{
+	int ret = -EEXIST;
+
+	xa_lock(&devlink->snapshot_ids);
+	/* Only claim the id if nothing is stored under it yet. */
+	if (!xa_load(&devlink->snapshot_ids, id))
+		ret = xa_err(__xa_store(&devlink->snapshot_ids, id,
+					xa_mk_value(0), GFP_ATOMIC));
+	xa_unlock(&devlink->snapshot_ids);
+
+	return ret;
+}
+
+/**
+ * __devlink_region_snapshot_id_get - get snapshot ID
+ * @devlink: devlink instance
+ * @id: storage to return snapshot id
+ *
+ * Allocates a new snapshot id. Returns zero on success, or a negative error
+ * on failure. Must be called while holding the devlink instance lock.
+ *
+ * Snapshot IDs are tracked using an xarray which stores the number of
+ * users of the snapshot id.
+ *
+ * Note that the caller of this function counts as a 'user', in order to
+ * avoid race conditions. The caller must release its hold on the
+ * snapshot by using devlink_region_snapshot_id_put.
+ */
+static int __devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
+{
+	/* The stored value starts at one: that is the caller's own hold. */
+	return xa_alloc(&devlink->snapshot_ids, id, xa_mk_value(1),
+			xa_limit_32b, GFP_KERNEL);
+}
+
+/**
+ * __devlink_region_snapshot_create - create a new snapshot
+ * @region: devlink region of the snapshot
+ * @data: snapshot data
+ * @snapshot_id: snapshot id to be created
+ *
+ * This will add a new snapshot of a region. The snapshot
+ * will be stored on the region struct and can be accessed
+ * from devlink. This is useful for future analyses of snapshots.
+ * Multiple snapshots can be created on a region.
+ * The @snapshot_id should be obtained using the getter function.
+ *
+ * Must be called only while holding the region snapshot lock.
+ *
+ * Return: 0 on success, -ENOSPC if the region already holds max_snapshots
+ * snapshots, -EEXIST if @snapshot_id is already used in this region,
+ * -ENOMEM, or the error from taking a reference on the id.
+ */
+static int
+__devlink_region_snapshot_create(struct devlink_region *region,
+				 u8 *data, u32 snapshot_id)
+{
+	struct devlink *devlink = region->devlink;
+	struct devlink_snapshot *snap;
+	int ret;
+
+	lockdep_assert_held(&region->snapshot_lock);
+
+	/* The region must have room and must not already use this id. */
+	if (region->cur_snapshots == region->max_snapshots)
+		return -ENOSPC;
+	if (devlink_region_snapshot_get_by_id(region, snapshot_id))
+		return -EEXIST;
+
+	snap = kzalloc(sizeof(*snap), GFP_KERNEL);
+	if (!snap)
+		return -ENOMEM;
+
+	/* Account the new user of the id before publishing the snapshot. */
+	ret = __devlink_snapshot_id_increment(devlink, snapshot_id);
+	if (ret) {
+		kfree(snap);
+		return ret;
+	}
+
+	snap->region = region;
+	snap->data = data;
+	snap->id = snapshot_id;
+	list_add_tail(&snap->list, &region->snapshot_list);
+	region->cur_snapshots++;
+
+	devlink_nl_region_notify(region, snap, DEVLINK_CMD_REGION_NEW);
+	return 0;
+}
+
+/* Announce, unlink and free @snapshot, dropping its hold on the id. */
+static void devlink_region_snapshot_del(struct devlink_region *region,
+					struct devlink_snapshot *snapshot)
+{
+	lockdep_assert_held(&region->snapshot_lock);
+
+	devlink_nl_region_notify(region, snapshot, DEVLINK_CMD_REGION_DEL);
+	list_del(&snapshot->list);
+	region->cur_snapshots--;
+
+	region->ops->destructor(snapshot->data);
+	__devlink_snapshot_id_decrement(region->devlink, snapshot->id);
+	kfree(snapshot);
+}
+
+/* genl doit handler: reply with the description of one named region. */
+int devlink_nl_region_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct nlattr **attrs = info->attrs;
+	struct devlink_region *region;
+	struct devlink_port *port;
+	struct sk_buff *reply;
+	const char *name;
+	int ret;
+
+	if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME))
+		return -EINVAL;
+	name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]);
+
+	/* A port index selects that port's regions over the device-wide ones. */
+	if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
+		unsigned int index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+
+		port = devlink_port_get_by_index(devlink, index);
+		if (!port)
+			return -ENODEV;
+		region = devlink_port_region_get_by_name(port, name);
+	} else {
+		region = devlink_region_get_by_name(devlink, name);
+	}
+
+	if (!region)
+		return -EINVAL;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	ret = devlink_nl_region_fill(reply, devlink, DEVLINK_CMD_REGION_GET,
+				     info->snd_portid, info->snd_seq, 0,
+				     region);
+	if (ret) {
+		nlmsg_free(reply);
+		return ret;
+	}
+
+	return genlmsg_reply(reply, info);
+}
+
+/*
+ * Fill one message per region of @port. *@idx is the running region index;
+ * regions whose index is below @start are counted but not emitted.
+ */
+static int devlink_nl_cmd_region_get_port_dumpit(struct sk_buff *msg,
+						 struct netlink_callback *cb,
+						 struct devlink_port *port,
+						 int *idx, int start, int flags)
+{
+	struct devlink_region *region;
+	int ret;
+
+	list_for_each_entry(region, &port->region_list, list) {
+		if (*idx >= start) {
+			ret = devlink_nl_region_fill(msg, port->devlink,
+						     DEVLINK_CMD_REGION_GET,
+						     NETLINK_CB(cb->skb).portid,
+						     cb->nlh->nlmsg_seq, flags, region);
+			if (ret)
+				return ret;
+		}
+		(*idx)++;
+	}
+
+	return 0;
+}
+
+/* Dump all device-level regions first, then the regions of every port. */
+static int devlink_nl_region_get_dump_one(struct sk_buff *msg,
+					  struct devlink *devlink,
+					  struct netlink_callback *cb,
+					  int flags)
+{
+	struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+	struct devlink_region *region;
+	struct devlink_port *port;
+	unsigned long port_index;
+	int idx = 0;
+	int ret;
+
+	list_for_each_entry(region, &devlink->region_list, list) {
+		if (idx >= state->idx) {
+			ret = devlink_nl_region_fill(msg, devlink,
+						     DEVLINK_CMD_REGION_GET,
+						     NETLINK_CB(cb->skb).portid,
+						     cb->nlh->nlmsg_seq, flags,
+						     region);
+			if (ret)
+				goto save_pos;
+		}
+		idx++;
+	}
+
+	xa_for_each(&devlink->ports, port_index, port) {
+		ret = devlink_nl_cmd_region_get_port_dumpit(msg, cb, port, &idx,
+							    state->idx, flags);
+		if (ret)
+			goto save_pos;
+	}
+
+	return 0;
+
+save_pos:
+	/* Record the index of the region that failed so it is not skipped. */
+	state->idx = idx;
+	return ret;
+}
+
+/* genl dumpit handler: delegates to devlink_nl_dumpit() with the region dumper. */
+int devlink_nl_region_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	return devlink_nl_dumpit(skb, cb, devlink_nl_region_get_dump_one);
+}
+
+/* genl doit handler: delete one snapshot, selected by region name and id. */
+int devlink_nl_region_del_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct nlattr **attrs = info->attrs;
+	struct devlink_snapshot *snap;
+	struct devlink_region *region;
+	struct devlink_port *port;
+	const char *name;
+	int ret = 0;
+	u32 id;
+
+	if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME) ||
+	    GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_SNAPSHOT_ID))
+		return -EINVAL;
+
+	name = nla_data(attrs[DEVLINK_ATTR_REGION_NAME]);
+	id = nla_get_u32(attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]);
+
+	if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
+		unsigned int index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+
+		port = devlink_port_get_by_index(devlink, index);
+		if (!port)
+			return -ENODEV;
+		region = devlink_port_region_get_by_name(port, name);
+	} else {
+		region = devlink_region_get_by_name(devlink, name);
+	}
+
+	if (!region)
+		return -EINVAL;
+
+	/* Lookup and removal happen under one hold of the snapshot lock. */
+	mutex_lock(&region->snapshot_lock);
+	snap = devlink_region_snapshot_get_by_id(region, id);
+	if (snap)
+		devlink_region_snapshot_del(region, snap);
+	else
+		ret = -EINVAL;
+	mutex_unlock(&region->snapshot_lock);
+
+	return ret;
+}
+
+/* genl doit handler: snapshot a region now; replies with the id if kernel-chosen. */
+int devlink_nl_region_new_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_snapshot *snapshot;
+	struct devlink_port *port = NULL;
+	struct nlattr *snapshot_id_attr;
+	struct devlink_region *region;
+	const char *region_name;
+	unsigned int index;
+	u32 snapshot_id;
+	u8 *data;
+	int err;
+
+	if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_REGION_NAME)) {
+		NL_SET_ERR_MSG(info->extack, "No region name provided");
+		return -EINVAL;
+	}
+	region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]);
+
+	if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) {
+		index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+		port = devlink_port_get_by_index(devlink, index);
+		if (!port)
+			return -ENODEV;
+	}
+
+	if (port)
+		region = devlink_port_region_get_by_name(port, region_name);
+	else
+		region = devlink_region_get_by_name(devlink, region_name);
+	if (!region) {
+		NL_SET_ERR_MSG(info->extack, "The requested region does not exist");
+		return -EINVAL;
+	}
+
+	if (!region->ops->snapshot) {
+		NL_SET_ERR_MSG(info->extack, "The requested region does not support taking an immediate snapshot");
+		return -EOPNOTSUPP;
+	}
+
+	mutex_lock(&region->snapshot_lock);
+
+	if (region->cur_snapshots == region->max_snapshots) {
+		NL_SET_ERR_MSG(info->extack, "The region has reached the maximum number of stored snapshots");
+		err = -ENOSPC;
+		goto unlock;
+	}
+
+	snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID];
+	if (snapshot_id_attr) {
+		snapshot_id = nla_get_u32(snapshot_id_attr);
+		if (devlink_region_snapshot_get_by_id(region, snapshot_id)) {
+			NL_SET_ERR_MSG(info->extack, "The requested snapshot id is already in use");
+			err = -EEXIST;
+			goto unlock;
+		}
+
+		err = __devlink_snapshot_id_insert(devlink, snapshot_id);
+		if (err)
+			goto unlock;
+	} else {
+		err = __devlink_region_snapshot_id_get(devlink, &snapshot_id);
+		if (err) {
+			NL_SET_ERR_MSG(info->extack, "Failed to allocate a new snapshot id");
+			goto unlock;
+		}
+	}
+
+	if (port)
+		err = region->port_ops->snapshot(port, region->port_ops,
+						 info->extack, &data);
+	else
+		err = region->ops->snapshot(devlink, region->ops,
+					    info->extack, &data);
+	if (err)
+		goto err_snapshot_capture;
+
+	err = __devlink_region_snapshot_create(region, data, snapshot_id);
+	if (err)
+		goto err_snapshot_create;
+
+	if (!snapshot_id_attr) {
+		struct sk_buff *msg;
+
+		/*
+		 * The snapshot now holds its own reference on the id: drop the
+		 * allocator's one, or the id outlives the snapshot's deletion.
+		 */
+		__devlink_snapshot_id_decrement(devlink, snapshot_id);
+		snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
+		if (WARN_ON(!snapshot)) {
+			err = -EINVAL;
+			goto unlock;
+		}
+
+		msg = devlink_nl_region_notify_build(region, snapshot,
+						     DEVLINK_CMD_REGION_NEW,
+						     info->snd_portid, info->snd_seq);
+		err = PTR_ERR_OR_ZERO(msg);
+		if (err)
+			goto err_notify;
+
+		err = genlmsg_reply(msg, info);
+		if (err)
+			goto err_notify;
+	}
+
+	mutex_unlock(&region->snapshot_lock);
+	return 0;
+
+err_snapshot_create:
+	region->ops->destructor(data);
+err_snapshot_capture:
+	__devlink_snapshot_id_decrement(devlink, snapshot_id);
+	mutex_unlock(&region->snapshot_lock);
+	return err;
+
+err_notify:
+	devlink_region_snapshot_del(region, snapshot);
+unlock:
+	mutex_unlock(&region->snapshot_lock);
+	return err;
+}
+
+/* Emit one CHUNK nest (data + address); a full @msg yields -EMSGSIZE. */
+static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg,
+						 u8 *chunk, u32 chunk_size,
+						 u64 addr)
+{
+	struct nlattr *chunk_attr;
+	int err;
+
+	chunk_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_REGION_CHUNK);
+	if (!chunk_attr)
+		return -EMSGSIZE;	/* no room: the dump caller tolerates this */
+
+	err = nla_put(msg, DEVLINK_ATTR_REGION_CHUNK_DATA, chunk_size, chunk);
+	if (err)
+		goto nla_put_failure;
+	err = devlink_nl_put_u64(msg, DEVLINK_ATTR_REGION_CHUNK_ADDR, addr);
+	if (err)
+		goto nla_put_failure;
+
+	nla_nest_end(msg, chunk_attr);
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(msg, chunk_attr);
+	return err;
+}
+
+#define DEVLINK_REGION_READ_CHUNK_SIZE 256
+/* Callback that fills @chunk with @chunk_size bytes found at @curr_offset. */
+typedef int devlink_chunk_fill_t(void *cb_priv, u8 *chunk, u32 chunk_size,
+				 u64 curr_offset,
+				 struct netlink_ext_ack *extack);
+
+/*
+ * Emit chunks covering [@start_offset, @end_offset) via @cb into @skb.
+ * *@new_offset receives the first offset not emitted (unset on -ENOMEM).
+ */
+static int
+devlink_nl_region_read_fill(struct sk_buff *skb, devlink_chunk_fill_t *cb,
+			    void *cb_priv, u64 start_offset, u64 end_offset,
+			    u64 *new_offset, struct netlink_ext_ack *extack)
+{
+	u64 curr_offset = start_offset;
+	int err = 0;
+	u8 *data;
+
+	/* Allocate and re-use a single buffer */
+	data = kmalloc(DEVLINK_REGION_READ_CHUNK_SIZE, GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	while (curr_offset < end_offset) {
+		/* Compare as u64: a u32 cast would truncate huge remainders. */
+		u32 data_size = min_t(u64, end_offset - curr_offset,
+				      DEVLINK_REGION_READ_CHUNK_SIZE);
+
+		err = cb(cb_priv, data, data_size, curr_offset, extack);
+		if (err)
+			break;
+
+		err = devlink_nl_cmd_region_read_chunk_fill(skb, data, data_size, curr_offset);
+		if (err)
+			break;
+
+		curr_offset += data_size;
+	}
+	*new_offset = curr_offset;
+
+	kfree(data);
+	return err;
+}
+
+/* devlink_chunk_fill_t: copy a chunk out of a stored snapshot's data. */
+static int
+devlink_region_snapshot_fill(void *cb_priv, u8 *chunk, u32 chunk_size,
+			     u64 curr_offset,
+			     struct netlink_ext_ack __always_unused *extack)
+{
+	const struct devlink_snapshot *snap = cb_priv;
+
+	memcpy(chunk, snap->data + curr_offset, chunk_size);
+	return 0;
+}
+
+static int
+devlink_region_port_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size,
+				u64 curr_offset, struct netlink_ext_ack *extack)
+{
+	struct devlink_region *region = cb_priv;	/* the region being read */
+	const struct devlink_port_region_ops *ops = region->port_ops;
+
+	return ops->read(region->port, ops, extack, curr_offset, chunk_size, chunk);
+}
+
+static int
+devlink_region_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size,
+			   u64 curr_offset, struct netlink_ext_ack *extack)
+{
+	struct devlink_region *region = cb_priv;	/* the region being read */
+	const struct devlink_region_ops *ops = region->ops;
+
+	return ops->read(region->devlink, ops, extack, curr_offset, chunk_size, chunk);
+}
+
+int devlink_nl_region_read_dumpit(struct sk_buff *skb,
+				  struct netlink_callback *cb)
+{
+	const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+	struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+	struct nlattr *chunks_attr, *region_attr, *snapshot_attr;
+	u64 ret_offset, start_offset, end_offset = U64_MAX;
+	struct nlattr **attrs = info->info.attrs;
+	struct devlink_port *port = NULL;
+	devlink_chunk_fill_t *region_cb;
+	struct devlink_region *region;
+	const char *region_name;
+	struct devlink *devlink;
+	unsigned int index;
+	void *region_cb_priv;
+	void *hdr;
+	int err;
+	/* One dump pass: emit region data as chunks, continuing at the saved offset. */
+	start_offset = state->start_offset;
+	/* Every exit below this lookup unlocks the instance and drops its reference. */
+	devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs,
+					      false);
+	if (IS_ERR(devlink))
+		return PTR_ERR(devlink);
+
+	if (!attrs[DEVLINK_ATTR_REGION_NAME]) {
+		NL_SET_ERR_MSG(cb->extack, "No region name provided");
+		err = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
+		index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+
+		port = devlink_port_get_by_index(devlink, index);
+		if (!port) {
+			err = -ENODEV;
+			goto out_unlock;
+		}
+	}
+
+	region_attr = attrs[DEVLINK_ATTR_REGION_NAME];
+	region_name = nla_data(region_attr);
+
+	if (port)
+		region = devlink_port_region_get_by_name(port, region_name);
+	else
+		region = devlink_region_get_by_name(devlink, region_name);
+
+	if (!region) {
+		NL_SET_ERR_MSG_ATTR(cb->extack, region_attr, "Requested region does not exist");
+		err = -EINVAL;
+		goto out_unlock;
+	}
+	/* Without a snapshot id only a direct read is accepted, and vice versa. */
+	snapshot_attr = attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID];
+	if (!snapshot_attr) {
+		if (!nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) {
+			NL_SET_ERR_MSG(cb->extack, "No snapshot id provided");
+			err = -EINVAL;
+			goto out_unlock;
+		}
+
+		if (!region->ops->read) {
+			NL_SET_ERR_MSG(cb->extack, "Requested region does not support direct read");
+			err = -EOPNOTSUPP;
+			goto out_unlock;
+		}
+
+		if (port)
+			region_cb = &devlink_region_port_direct_fill;
+		else
+			region_cb = &devlink_region_direct_fill;
+		region_cb_priv = region;
+	} else {
+		struct devlink_snapshot *snapshot;
+		u32 snapshot_id;
+
+		if (nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) {
+			NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Direct region read does not use snapshot");
+			err = -EINVAL;
+			goto out_unlock;
+		}
+
+		snapshot_id = nla_get_u32(snapshot_attr);
+		snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id);
+		if (!snapshot) {
+			NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Requested snapshot does not exist");
+			err = -EINVAL;
+			goto out_unlock;
+		}
+		region_cb = &devlink_region_snapshot_fill;
+		region_cb_priv = snapshot;
+	}
+	/* A caller-supplied window needs both the address and the length attribute. */
+	if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] &&
+	    attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) {
+		if (!start_offset)
+			start_offset =
+				nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
+
+		end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]);
+		end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]);
+	}
+	/* Never read beyond the end of the region. */
+	if (end_offset > region->size)
+		end_offset = region->size;
+
+	/* return 0 if there is no further data to read */
+	if (start_offset == end_offset) {
+		err = 0;
+		goto out_unlock;
+	}
+
+	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+			  &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI,
+			  DEVLINK_CMD_REGION_READ);
+	if (!hdr) {
+		err = -EMSGSIZE;
+		goto out_unlock;
+	}
+
+	err = devlink_nl_put_handle(skb, devlink);
+	if (err)
+		goto nla_put_failure;
+
+	if (region->port) {
+		err = nla_put_u32(skb, DEVLINK_ATTR_PORT_INDEX,
+				  region->port->index);
+		if (err)
+			goto nla_put_failure;
+	}
+
+	err = nla_put_string(skb, DEVLINK_ATTR_REGION_NAME, region_name);
+	if (err)
+		goto nla_put_failure;
+
+	chunks_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_REGION_CHUNKS);
+	if (!chunks_attr) {
+		err = -EMSGSIZE;
+		goto nla_put_failure;
+	}
+	/* -EMSGSIZE only means the skb is full; what was written is still sent. */
+	err = devlink_nl_region_read_fill(skb, region_cb, region_cb_priv,
+					  start_offset, end_offset, &ret_offset,
+					  cb->extack);
+
+	if (err && err != -EMSGSIZE)
+		goto nla_put_failure;
+
+	/* Check if there was any progress done to prevent infinite loop */
+	if (ret_offset == start_offset) {
+		err = -EINVAL;
+		goto nla_put_failure;
+	}
+	/* The next pass continues from the first offset that did not fit. */
+	state->start_offset = ret_offset;
+
+	nla_nest_end(skb, chunks_attr);
+	genlmsg_end(skb, hdr);
+	devl_unlock(devlink);
+	devlink_put(devlink);
+	return skb->len;
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+out_unlock:
+	devl_unlock(devlink);
+	devlink_put(devlink);
+	return err;
+}
+
+/**
+ * devl_region_create - create a new address region
+ * @devlink: devlink
+ * @ops: region operations and name
+ * @region_max_snapshots: Maximum supported number of snapshots for region
+ * @region_size: size of region
+ *
+ * Return: the new region, or an ERR_PTR() of -EINVAL, -EEXIST or -ENOMEM.
+ */
+struct devlink_region *devl_region_create(struct devlink *devlink,
+					  const struct devlink_region_ops *ops,
+					  u32 region_max_snapshots,
+					  u64 region_size)
+{
+	struct devlink_region *reg;
+
+	devl_assert_locked(devlink);
+
+	if (WARN_ON(!ops) || WARN_ON(!ops->destructor))
+		return ERR_PTR(-EINVAL);
+	if (devlink_region_get_by_name(devlink, ops->name))
+		return ERR_PTR(-EEXIST);
+
+	reg = kzalloc(sizeof(*reg), GFP_KERNEL);
+	if (!reg)
+		return ERR_PTR(-ENOMEM);
+
+	reg->devlink = devlink;
+	reg->ops = ops;
+	reg->size = region_size;
+	reg->max_snapshots = region_max_snapshots;
+	mutex_init(&reg->snapshot_lock);
+	INIT_LIST_HEAD(&reg->snapshot_list);
+
+	list_add_tail(&reg->list, &devlink->region_list);
+	devlink_nl_region_notify(reg, NULL, DEVLINK_CMD_REGION_NEW);
+	return reg;
+}
+EXPORT_SYMBOL_GPL(devl_region_create);
+
+/**
+ * devlink_region_create - create a new address region
+ *
+ * @devlink: devlink
+ * @ops: region operations and name
+ * @region_max_snapshots: Maximum supported number of snapshots for region
+ * @region_size: size of region
+ * Return: the value returned by devl_region_create() (region or ERR_PTR()).
+ * Context: Takes and releases devlink->lock <mutex>.
+ */
+struct devlink_region *
+devlink_region_create(struct devlink *devlink,
+ const struct devlink_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
+{
+ struct devlink_region *region;
+
+ devl_lock(devlink);
+ region = devl_region_create(devlink, ops, region_max_snapshots,
+ region_size);
+ devl_unlock(devlink);
+ return region;
+}
+EXPORT_SYMBOL_GPL(devlink_region_create);
+
+/**
+ * devlink_port_region_create - create a new address region for a port
+ *
+ * @port: devlink port
+ * @ops: region operations and name; @ops->destructor must be set
+ * @region_max_snapshots: Maximum supported number of snapshots for region
+ * @region_size: size of region
+ * Return: the new region, or ERR_PTR(-EINVAL, -EEXIST or -ENOMEM).
+ * Context: Takes and releases devlink->lock <mutex>.
+ */
+struct devlink_region *
+devlink_port_region_create(struct devlink_port *port,
+ const struct devlink_port_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
+{
+ struct devlink *devlink = port->devlink;
+ struct devlink_region *region;
+ int err = 0;
+
+ ASSERT_DEVLINK_PORT_INITIALIZED(port);
+
+ if (WARN_ON(!ops) || WARN_ON(!ops->destructor))
+ return ERR_PTR(-EINVAL);
+
+ devl_lock(devlink);
+
+ if (devlink_port_region_get_by_name(port, ops->name)) { /* name in use */
+ err = -EEXIST;
+ goto unlock;
+ }
+
+ region = kzalloc(sizeof(*region), GFP_KERNEL);
+ if (!region) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ region->devlink = devlink;
+ region->port = port;
+ region->max_snapshots = region_max_snapshots;
+ region->port_ops = ops;
+ region->size = region_size;
+ INIT_LIST_HEAD(&region->snapshot_list);
+ mutex_init(&region->snapshot_lock);
+ list_add_tail(&region->list, &port->region_list); /* the port's own list */
+ devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_NEW);
+
+ devl_unlock(devlink);
+ return region;
+
+unlock:
+ devl_unlock(devlink);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(devlink_port_region_create);
+
+/**
+ * devl_region_destroy - destroy address region
+ * @region: devlink region to destroy; it is freed here
+ * Context: The devlink instance lock must be held (devl_assert_locked()).
+ */
+void devl_region_destroy(struct devlink_region *region)
+{
+ struct devlink *devlink = region->devlink;
+ struct devlink_snapshot *snapshot, *ts;
+
+ devl_assert_locked(devlink);
+
+ /* Free all snapshots of region */
+ mutex_lock(&region->snapshot_lock);
+ list_for_each_entry_safe(snapshot, ts, &region->snapshot_list, list)
+ devlink_region_snapshot_del(region, snapshot);
+ mutex_unlock(&region->snapshot_lock);
+
+ list_del(&region->list);
+ mutex_destroy(&region->snapshot_lock);
+
+ devlink_nl_region_notify(region, NULL, DEVLINK_CMD_REGION_DEL);
+ kfree(region); /* the notification above still reads @region */
+}
+EXPORT_SYMBOL_GPL(devl_region_destroy);
+
+/**
+ * devlink_region_destroy - destroy address region
+ *
+ * @region: devlink region to destroy
+ *
+ * Context: Takes and releases devlink->lock <mutex>.
+ */
+void devlink_region_destroy(struct devlink_region *region)
+{
+ struct devlink *devlink = region->devlink; /* read before @region is freed */
+
+ devl_lock(devlink);
+ devl_region_destroy(region);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_region_destroy);
+
+/**
+ * devlink_region_snapshot_id_get - get snapshot ID
+ *
+ * This function should be called when adding a new snapshot.
+ * The driver should use the same id for multiple snapshots taken
+ * on multiple regions at the same time/by the same trigger.
+ *
+ * The caller of this function must use devlink_region_snapshot_id_put
+ * when finished creating snapshots using this id.
+ *
+ * Returns zero on success, or a negative error code on failure.
+ *
+ * @devlink: devlink
+ * @id: storage to return id
+ */
+int devlink_region_snapshot_id_get(struct devlink *devlink, u32 *id)
+{
+ return __devlink_region_snapshot_id_get(devlink, id);
+}
+EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_get);
+
+/**
+ * devlink_region_snapshot_id_put - put snapshot ID reference
+ *
+ * Pairs with devlink_region_snapshot_id_get(): call it once the driver has
+ * finished creating snapshots with @id, so that the ID can later be
+ * released in the event that all snapshots using it have been destroyed.
+ *
+ * @devlink: devlink
+ * @id: id to release reference on
+ */
+void devlink_region_snapshot_id_put(struct devlink *devlink, u32 id)
+{
+ __devlink_snapshot_id_decrement(devlink, id);
+}
+EXPORT_SYMBOL_GPL(devlink_region_snapshot_id_put);
+
+/**
+ * devlink_region_snapshot_create - create a new snapshot
+ * This will add a new snapshot of a region. The snapshot will be stored
+ * on the region struct and can be accessed from devlink. This is useful
+ * for future analyses of snapshots. Multiple snapshots can be created on
+ * a region. Obtain @snapshot_id from devlink_region_snapshot_id_get().
+ *
+ * @region: devlink region of the snapshot
+ * @data: snapshot data
+ * @snapshot_id: snapshot id to be created
+ * Return: the result of __devlink_region_snapshot_create().
+ */
+int devlink_region_snapshot_create(struct devlink_region *region,
+ u8 *data, u32 snapshot_id)
+{
+ int err;
+
+ mutex_lock(&region->snapshot_lock); /* held across the whole creation */
+ err = __devlink_region_snapshot_create(region, data, snapshot_id);
+ mutex_unlock(&region->snapshot_lock);
+ return err;
+}
+EXPORT_SYMBOL_GPL(devlink_region_snapshot_create);
diff --git a/net/devlink/resource.c b/net/devlink/resource.c
new file mode 100644
index 000000000000..2d6324f3d91f
--- /dev/null
+++ b/net/devlink/resource.c
@@ -0,0 +1,504 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include "devl_internal.h"
+
+/**
+ * struct devlink_resource - devlink resource
+ * @name: name of the resource
+ * @id: id, per devlink instance
+ * @size: size of the resource
+ * @size_new: updated size of the resource, reload is needed
+ * @size_valid: false when the children's summed size_new exceeds this
+ *	resource's size_new; true otherwise
+ * @parent: parent resource, NULL for a top-level resource
+ * @size_params: size parameters
+ * @list: node in the parent's @resource_list or in devlink->resource_list
+ * @resource_list: list of child resources
+ * @occ_get: occupancy getter callback
+ * @occ_get_priv: occupancy getter callback priv
+ */
+struct devlink_resource {
+ const char *name;
+ u64 id;
+ u64 size;
+ u64 size_new;
+ bool size_valid;
+ struct devlink_resource *parent;
+ struct devlink_resource_size_params size_params;
+ struct list_head list;
+ struct list_head resource_list;
+ devlink_resource_occ_get_t *occ_get;
+ void *occ_get_priv;
+};
+
+/* Depth-first lookup of @resource_id below @resource (NULL: whole devlink). */
+static struct devlink_resource *
+devlink_resource_find(struct devlink *devlink,
+		      struct devlink_resource *resource, u64 resource_id)
+{
+	struct list_head *head = resource ? &resource->resource_list :
+					    &devlink->resource_list;
+	struct devlink_resource *entry;
+
+	list_for_each_entry(entry, head, list) {
+		struct devlink_resource *match;
+
+		/* A node is tested before its own subtree is searched. */
+		if (entry->id == resource_id)
+			return entry;
+
+		match = devlink_resource_find(devlink, entry, resource_id);
+		if (match)
+			return match;
+	}
+
+	/* Nothing in this subtree carries the requested id. */
+	return NULL;
+}
+
+/*
+ * Recompute @resource->size_valid: the resource is valid unless the pending
+ * sizes (size_new) of its direct children add up to more than its own
+ * pending size.
+ */
+static void
+devlink_resource_validate_children(struct devlink_resource *resource)
+{
+	struct devlink_resource *child;
+	u64 children_total = 0;
+
+	/* With no children the sum stays 0, which never exceeds size_new. */
+	list_for_each_entry(child, &resource->resource_list, list)
+		children_total += child->size_new;
+
+	/* Equivalent to clearing the flag only when the sum is too large. */
+	resource->size_valid = children_total <= resource->size_new;
+}
+
+/* Check @size against the resource's min, max and granularity parameters. */
+static int
+devlink_resource_validate_size(struct devlink_resource *resource, u64 size,
+			       struct netlink_ext_ack *extack)
+{
+	struct devlink_resource_size_params *params = &resource->size_params;
+	u64 remainder;
+	int ret = 0;
+
+	/* All three checks always run; none of them returns early. */
+	if (size > params->size_max) {
+		NL_SET_ERR_MSG(extack, "Size larger than maximum");
+		ret = -EINVAL;
+	}
+	if (size < params->size_min) {
+		NL_SET_ERR_MSG(extack, "Size smaller than minimum");
+		ret = -EINVAL;
+	}
+	div64_u64_rem(size, params->size_granularity, &remainder);
+	if (remainder) {
+		NL_SET_ERR_MSG(extack, "Wrong granularity");
+		ret = -EINVAL;
+	}
+	return ret;
+}
+/* Netlink doit: validate and stage a new size (size_new) for one resource. */
+int devlink_nl_resource_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_resource *resource;
+ u64 resource_id;
+ u64 size;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_ID) ||
+ GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_RESOURCE_SIZE))
+ return -EINVAL;
+ resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]);
+
+ resource = devlink_resource_find(devlink, NULL, resource_id);
+ if (!resource)
+ return -EINVAL;
+
+ size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]);
+ err = devlink_resource_validate_size(resource, size, info->extack);
+ if (err)
+ return err;
+
+ resource->size_new = size; /* only size_new changes here, size does not */
+ devlink_resource_validate_children(resource); /* against its own children */
+ if (resource->parent) /* the parent's children sum changed as well */
+ devlink_resource_validate_children(resource->parent);
+ return 0;
+}
+/* Put granularity, max, min and unit of @resource into @skb, in that order. */
+static int
+devlink_resource_size_params_put(struct devlink_resource *resource,
+ struct sk_buff *skb)
+{
+ struct devlink_resource_size_params *size_params;
+
+ size_params = &resource->size_params;
+ if (devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN,
+ size_params->size_granularity) ||
+ devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX,
+ size_params->size_max) ||
+ devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN,
+ size_params->size_min) ||
+ nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit))
+ return -EMSGSIZE; /* any failing put is reported as -EMSGSIZE */
+ return 0;
+}
+/* Put DEVLINK_ATTR_RESOURCE_OCC; a resource without occ_get emits nothing. */
+static int devlink_resource_occ_put(struct devlink_resource *resource,
+				    struct sk_buff *skb)
+{
+	devlink_resource_occ_get_t *getter = resource->occ_get;
+
+	return !getter ? 0 : devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_OCC,
+						getter(resource->occ_get_priv));
+}
+/* Nest @resource, and recursively its children, as one DEVLINK_ATTR_RESOURCE. */
+static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb,
+ struct devlink_resource *resource)
+{
+ struct devlink_resource *child_resource;
+ struct nlattr *child_resource_attr;
+ struct nlattr *resource_attr;
+
+ resource_attr = nla_nest_start_noflag(skb, DEVLINK_ATTR_RESOURCE);
+ if (!resource_attr)
+ return -EMSGSIZE;
+
+ if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) ||
+ devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size) ||
+ devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id))
+ goto nla_put_failure;
+ if (resource->size != resource->size_new && /* only if a resize is pending */
+ devlink_nl_put_u64(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW,
+ resource->size_new))
+ goto nla_put_failure;
+ if (devlink_resource_occ_put(resource, skb))
+ goto nla_put_failure;
+ if (devlink_resource_size_params_put(resource, skb))
+ goto nla_put_failure;
+ if (list_empty(&resource->resource_list)) /* leaf: no SIZE_VALID, no list */
+ goto out;
+
+ if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID,
+ resource->size_valid))
+ goto nla_put_failure;
+
+ child_resource_attr = nla_nest_start_noflag(skb,
+ DEVLINK_ATTR_RESOURCE_LIST);
+ if (!child_resource_attr)
+ goto nla_put_failure;
+
+ list_for_each_entry(child_resource, &resource->resource_list, list) {
+ if (devlink_resource_put(devlink, skb, child_resource))
+ goto resource_put_failure;
+ }
+
+ nla_nest_end(skb, child_resource_attr);
+out:
+ nla_nest_end(skb, resource_attr);
+ return 0;
+
+resource_put_failure:
+ nla_nest_cancel(skb, child_resource_attr);
+nla_put_failure:
+ nla_nest_cancel(skb, resource_attr); /* every failure maps to -EMSGSIZE */
+ return -EMSGSIZE;
+}
+/* Send all top-level resources as NLM_F_MULTI messages, then NLMSG_DONE. */
+static int devlink_resource_fill(struct genl_info *info,
+ enum devlink_command cmd, int flags)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_resource *resource;
+ struct nlattr *resources_attr;
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ bool incomplete;
+ void *hdr;
+ int i;
+ int err;
+
+ resource = list_first_entry(&devlink->resource_list,
+ struct devlink_resource, list);
+start_again:
+ err = devlink_nl_msg_reply_and_new(&skb, info);
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+ &devlink_nl_family, NLM_F_MULTI, cmd);
+ if (!hdr) {
+ nlmsg_free(skb);
+ return -EMSGSIZE;
+ }
+
+ if (devlink_nl_put_handle(skb, devlink))
+ goto nla_put_failure;
+
+ resources_attr = nla_nest_start_noflag(skb,
+ DEVLINK_ATTR_RESOURCE_LIST);
+ if (!resources_attr)
+ goto nla_put_failure;
+
+ incomplete = false;
+ i = 0; /* resources placed in the current message */
+ list_for_each_entry_from(resource, &devlink->resource_list, list) {
+ err = devlink_resource_put(devlink, skb, resource);
+ if (err) {
+ if (!i) /* not even one resource fit into a fresh message */
+ goto err_resource_put;
+ incomplete = true;
+ break;
+ }
+ i++;
+ }
+ nla_nest_end(skb, resources_attr);
+ genlmsg_end(skb, hdr);
+ if (incomplete) /* continue from the resource that did not fit */
+ goto start_again;
+send_done:
+ nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq,
+ NLMSG_DONE, 0, flags | NLM_F_MULTI);
+ if (!nlh) { /* no room for NLMSG_DONE: retry it in a new message */
+ err = devlink_nl_msg_reply_and_new(&skb, info);
+ if (err)
+ return err;
+ goto send_done;
+ }
+ return genlmsg_reply(skb, info);
+
+nla_put_failure:
+ err = -EMSGSIZE;
+err_resource_put:
+ nlmsg_free(skb);
+ return err;
+}
+/* Netlink doit for RESOURCE_DUMP; -EOPNOTSUPP if the resource list is empty. */
+int devlink_nl_resource_dump_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+
+ if (list_empty(&devlink->resource_list))
+ return -EOPNOTSUPP;
+
+ return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0);
+}
+/* Return -EINVAL if any resource under @resource (NULL: all) is !size_valid. */
+int devlink_resources_validate(struct devlink *devlink,
+			       struct devlink_resource *resource,
+			       struct genl_info *info)
+{
+	struct list_head *head = &devlink->resource_list;
+	struct devlink_resource *entry;
+
+	if (resource)
+		head = &resource->resource_list;
+
+	list_for_each_entry(entry, head, list) {
+		int ret;
+
+		if (!entry->size_valid)
+			return -EINVAL;
+		ret = devlink_resources_validate(devlink, entry, info);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+/**
+ * devl_resource_register - devlink resource register
+ * @devlink: devlink
+ * @resource_name: resource's name; the pointer is stored, not copied
+ * @resource_size: resource's size
+ * @resource_id: resource's id
+ * @parent_resource_id: parent id, or DEVLINK_RESOURCE_ID_PARENT_TOP
+ * @size_params: size parameters; copied into the resource
+ *
+ * Generic resources should reuse the same names across drivers.
+ * Please see the generic resources list at:
+ * Documentation/networking/devlink/devlink-resource.rst
+ * Return: 0, -EEXIST (id in use), -ENOMEM, or -EINVAL (unknown parent).
+ */
+int devl_resource_register(struct devlink *devlink,
+ const char *resource_name,
+ u64 resource_size,
+ u64 resource_id,
+ u64 parent_resource_id,
+ const struct devlink_resource_size_params *size_params)
+{
+ struct devlink_resource *resource;
+ struct list_head *resource_list;
+ bool top_hierarchy;
+
+ lockdep_assert_held(&devlink->lock);
+
+ top_hierarchy = parent_resource_id == DEVLINK_RESOURCE_ID_PARENT_TOP;
+
+ resource = devlink_resource_find(devlink, NULL, resource_id);
+ if (resource)
+ return -EEXIST;
+
+ resource = kzalloc(sizeof(*resource), GFP_KERNEL);
+ if (!resource)
+ return -ENOMEM;
+
+ if (top_hierarchy) {
+ resource_list = &devlink->resource_list;
+ } else {
+ struct devlink_resource *parent_resource;
+
+ parent_resource = devlink_resource_find(devlink, NULL,
+ parent_resource_id);
+ if (parent_resource) {
+ resource_list = &parent_resource->resource_list;
+ resource->parent = parent_resource;
+ } else {
+ kfree(resource); /* not linked anywhere yet */
+ return -EINVAL;
+ }
+ }
+
+ resource->name = resource_name;
+ resource->size = resource_size;
+ resource->size_new = resource_size; /* no resize pending initially */
+ resource->id = resource_id;
+ resource->size_valid = true;
+ memcpy(&resource->size_params, size_params,
+ sizeof(resource->size_params));
+ INIT_LIST_HEAD(&resource->resource_list);
+ list_add_tail(&resource->list, resource_list);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_resource_register);
+/* Recursively unlink and free all descendants of @resource, not @resource. */
+static void devlink_resource_unregister(struct devlink *devlink,
+ struct devlink_resource *resource)
+{
+ struct devlink_resource *tmp, *child_resource;
+
+ list_for_each_entry_safe(child_resource, tmp, &resource->resource_list,
+ list) {
+ devlink_resource_unregister(devlink, child_resource);
+ list_del(&child_resource->list);
+ kfree(child_resource);
+ }
+}
+
+/**
+ * devl_resources_unregister - free all resources
+ * @devlink: devlink whose whole resource tree is unlinked and freed
+ * Context: devlink->lock must be held (lockdep_assert_held()).
+ */
+void devl_resources_unregister(struct devlink *devlink)
+{
+ struct devlink_resource *tmp, *child_resource;
+
+ lockdep_assert_held(&devlink->lock);
+
+ list_for_each_entry_safe(child_resource, tmp, &devlink->resource_list,
+ list) {
+ devlink_resource_unregister(devlink, child_resource); /* subtree first */
+ list_del(&child_resource->list);
+ kfree(child_resource);
+ }
+}
+EXPORT_SYMBOL_GPL(devl_resources_unregister);
+
+/**
+ * devlink_resources_unregister - free all resources
+ *
+ * @devlink: devlink
+ *
+ * Context: Takes and releases devlink->lock <mutex>.
+ */
+void devlink_resources_unregister(struct devlink *devlink)
+{
+ devl_lock(devlink);
+ devl_resources_unregister(devlink);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_resources_unregister);
+
+/**
+ * devl_resource_size_get - get and update size
+ * @devlink: devlink; devlink->lock must be held (lockdep_assert_held())
+ * @resource_id: the requested resource id
+ * @p_resource_size: set to size_new, which is also copied into size
+ * Return: 0 on success, -EINVAL if no resource has @resource_id.
+ */
+int devl_resource_size_get(struct devlink *devlink,
+ u64 resource_id,
+ u64 *p_resource_size)
+{
+ struct devlink_resource *resource;
+
+ lockdep_assert_held(&devlink->lock);
+
+ resource = devlink_resource_find(devlink, NULL, resource_id);
+ if (!resource)
+ return -EINVAL;
+ *p_resource_size = resource->size_new;
+ resource->size = resource->size_new; /* the pending size becomes current */
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devl_resource_size_get);
+
+/**
+ * devl_resource_occ_get_register - register occupancy getter
+ * @devlink: devlink; devlink->lock must be held (lockdep_assert_held())
+ * @resource_id: resource id; an unknown id triggers WARN_ON and is ignored
+ * @occ_get: occupancy getter callback
+ * @occ_get_priv: occupancy getter callback priv
+ * Replacing an already registered getter warns but still takes effect.
+ */
+void devl_resource_occ_get_register(struct devlink *devlink,
+ u64 resource_id,
+ devlink_resource_occ_get_t *occ_get,
+ void *occ_get_priv)
+{
+ struct devlink_resource *resource;
+
+ lockdep_assert_held(&devlink->lock);
+
+ resource = devlink_resource_find(devlink, NULL, resource_id);
+ if (WARN_ON(!resource))
+ return;
+ WARN_ON(resource->occ_get);
+
+ resource->occ_get = occ_get;
+ resource->occ_get_priv = occ_get_priv;
+}
+EXPORT_SYMBOL_GPL(devl_resource_occ_get_register);
+
+/**
+ * devl_resource_occ_get_unregister - unregister occupancy getter
+ * @devlink: devlink; devlink->lock must be held (lockdep_assert_held())
+ * @resource_id: resource id; WARNs if unknown or if no getter was set
+ * Clears both the getter and its private pointer.
+ */
+void devl_resource_occ_get_unregister(struct devlink *devlink,
+ u64 resource_id)
+{
+ struct devlink_resource *resource;
+
+ lockdep_assert_held(&devlink->lock);
+
+ resource = devlink_resource_find(devlink, NULL, resource_id);
+ if (WARN_ON(!resource))
+ return;
+ WARN_ON(!resource->occ_get);
+
+ resource->occ_get = NULL;
+ resource->occ_get_priv = NULL;
+}
+EXPORT_SYMBOL_GPL(devl_resource_occ_get_unregister);
diff --git a/net/devlink/sb.c b/net/devlink/sb.c
new file mode 100644
index 000000000000..0a76bb32502b
--- /dev/null
+++ b/net/devlink/sb.c
@@ -0,0 +1,995 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include "devl_internal.h"
+/* One shared buffer (sb) of a devlink instance, linked on devlink->sb_list. */
+struct devlink_sb {
+ struct list_head list;
+ unsigned int index; /* compared against DEVLINK_ATTR_SB_INDEX */
+ u32 size;
+ u16 ingress_pools_count;
+ u16 egress_pools_count;
+ u16 ingress_tc_count;
+ u16 egress_tc_count;
+};
+/* Total number of pools: ingress plus egress, returned as u16. */
+static u16 devlink_sb_pool_count(struct devlink_sb *devlink_sb)
+{
+ return devlink_sb->ingress_pools_count + devlink_sb->egress_pools_count;
+}
+/* Linear search of devlink->sb_list for the buffer whose index is @sb_index. */
+static struct devlink_sb *devlink_sb_get_by_index(struct devlink *devlink,
+						  unsigned int sb_index)
+{
+	struct devlink_sb *sb;
+
+	list_for_each_entry(sb, &devlink->sb_list, list)
+		if (sb->index == sb_index)
+			return sb;
+	/* No shared buffer with that index is registered. */
+	return NULL;
+}
+/* True if a shared buffer with @sb_index exists (lookup result as bool). */
+static bool devlink_sb_index_exists(struct devlink *devlink,
+ unsigned int sb_index)
+{
+ return devlink_sb_get_by_index(devlink, sb_index);
+}
+/* Resolve DEVLINK_ATTR_SB_INDEX to a shared buffer, or return an ERR_PTR(). */
+static struct devlink_sb *devlink_sb_get_from_attrs(struct devlink *devlink,
+						    struct nlattr **attrs)
+{
+	struct nlattr *index_attr = attrs[DEVLINK_ATTR_SB_INDEX];
+	struct devlink_sb *sb;
+
+	/* The index attribute is mandatory. */
+	if (!index_attr)
+		return ERR_PTR(-EINVAL);
+
+	sb = devlink_sb_get_by_index(devlink, nla_get_u32(index_attr));
+	/* The attribute was present but names no existing buffer. */
+	return sb ? sb : ERR_PTR(-ENODEV);
+}
+/* Same as devlink_sb_get_from_attrs(), using the attributes of @info. */
+static struct devlink_sb *devlink_sb_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ return devlink_sb_get_from_attrs(devlink, info->attrs);
+}
+/* Parse DEVLINK_ATTR_SB_POOL_INDEX; must be below the sb's total pool count. */
+static int devlink_sb_pool_index_get_from_attrs(struct devlink_sb *devlink_sb,
+ struct nlattr **attrs,
+ u16 *p_pool_index)
+{
+ u16 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_POOL_INDEX])
+ return -EINVAL;
+
+ val = nla_get_u16(attrs[DEVLINK_ATTR_SB_POOL_INDEX]);
+ if (val >= devlink_sb_pool_count(devlink_sb)) /* ingress + egress pools */
+ return -EINVAL;
+ *p_pool_index = val; /* written only on success */
+ return 0;
+}
+/* Same as devlink_sb_pool_index_get_from_attrs(), using @info->attrs. */
+static int devlink_sb_pool_index_get_from_info(struct devlink_sb *devlink_sb,
+ struct genl_info *info,
+ u16 *p_pool_index)
+{
+ return devlink_sb_pool_index_get_from_attrs(devlink_sb, info->attrs,
+ p_pool_index);
+}
+/* Parse DEVLINK_ATTR_SB_POOL_TYPE; only ingress and egress are accepted. */
+static int
+devlink_sb_pool_type_get_from_attrs(struct nlattr **attrs,
+				    enum devlink_sb_pool_type *p_pool_type)
+{
+	struct nlattr *type_attr = attrs[DEVLINK_ATTR_SB_POOL_TYPE];
+	u8 raw;
+
+	if (!type_attr)
+		return -EINVAL;
+	raw = nla_get_u8(type_attr);
+	if (!(raw == DEVLINK_SB_POOL_TYPE_INGRESS ||
+	      raw == DEVLINK_SB_POOL_TYPE_EGRESS))
+		return -EINVAL;
+	*p_pool_type = raw;
+	return 0;
+}
+/* Same as devlink_sb_pool_type_get_from_attrs(), using @info->attrs. */
+static int
+devlink_sb_pool_type_get_from_info(struct genl_info *info,
+ enum devlink_sb_pool_type *p_pool_type)
+{
+ return devlink_sb_pool_type_get_from_attrs(info->attrs, p_pool_type);
+}
+/* Parse DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE; static or dynamic only. */
+static int
+devlink_sb_th_type_get_from_attrs(struct nlattr **attrs,
+				  enum devlink_sb_threshold_type *p_th_type)
+{
+	struct nlattr *th_attr = attrs[DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE];
+	u8 raw;
+
+	if (!th_attr)
+		return -EINVAL;
+	raw = nla_get_u8(th_attr);
+	if (!(raw == DEVLINK_SB_THRESHOLD_TYPE_STATIC ||
+	      raw == DEVLINK_SB_THRESHOLD_TYPE_DYNAMIC))
+		return -EINVAL;
+	*p_th_type = raw;
+	return 0;
+}
+/* Same as devlink_sb_th_type_get_from_attrs(), using @info->attrs. */
+static int
+devlink_sb_th_type_get_from_info(struct genl_info *info,
+ enum devlink_sb_threshold_type *p_th_type)
+{
+ return devlink_sb_th_type_get_from_attrs(info->attrs, p_th_type);
+}
+/* Parse DEVLINK_ATTR_SB_TC_INDEX, bounded by the TC count of @pool_type. */
+static int
+devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb,
+ struct nlattr **attrs,
+ enum devlink_sb_pool_type pool_type,
+ u16 *p_tc_index)
+{
+ u16 val;
+
+ if (!attrs[DEVLINK_ATTR_SB_TC_INDEX])
+ return -EINVAL;
+
+ val = nla_get_u16(attrs[DEVLINK_ATTR_SB_TC_INDEX]);
+ if (pool_type == DEVLINK_SB_POOL_TYPE_INGRESS &&
+ val >= devlink_sb->ingress_tc_count)
+ return -EINVAL;
+ if (pool_type == DEVLINK_SB_POOL_TYPE_EGRESS &&
+ val >= devlink_sb->egress_tc_count)
+ return -EINVAL;
+ *p_tc_index = val; /* any other @pool_type value is not range-checked */
+ return 0;
+}
+/* Same as devlink_sb_tc_index_get_from_attrs(), using @info->attrs. */
+static int
+devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb,
+ struct genl_info *info,
+ enum devlink_sb_pool_type pool_type,
+ u16 *p_tc_index)
+{
+ return devlink_sb_tc_index_get_from_attrs(devlink_sb, info->attrs,
+ pool_type, p_tc_index);
+}
+/* Build one message for @devlink_sb: index, size, pool counts and TC counts. */
+static int devlink_nl_sb_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_SIZE, devlink_sb->size))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_POOL_COUNT,
+ devlink_sb->ingress_pools_count))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_POOL_COUNT,
+ devlink_sb->egress_pools_count))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_INGRESS_TC_COUNT,
+ devlink_sb->ingress_tc_count))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_EGRESS_TC_COUNT,
+ devlink_sb->egress_tc_count))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr); /* drop the partially built message */
+ return -EMSGSIZE;
+}
+/* Netlink doit: reply with one DEVLINK_CMD_SB_NEW message for the named sb. */
+int devlink_nl_sb_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb;
+ struct sk_buff *msg;
+ int err;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_fill(msg, devlink, devlink_sb,
+ DEVLINK_CMD_SB_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg); /* reply not sent; free it here */
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+/* Dump the shared buffers of @devlink into @msg, starting at state->idx. */
+static int
+devlink_nl_sb_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
+ struct netlink_callback *cb, int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_sb *devlink_sb;
+ int idx = 0;
+ int err = 0;
+
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ if (idx < state->idx) { /* entry below the starting index: skip */
+ idx++;
+ continue;
+ }
+ err = devlink_nl_sb_fill(msg, devlink, devlink_sb,
+ DEVLINK_CMD_SB_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags);
+ if (err) {
+ state->idx = idx; /* index of the entry that failed */
+ break;
+ }
+ idx++;
+ }
+
+ return err;
+}
+/* Netlink dumpit: devlink_nl_dumpit() with devlink_nl_sb_get_dump_one(). */
+int devlink_nl_sb_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_sb_get_dump_one);
+}
+/* Build one pool message from ops->sb_pool_get(); the op is not NULL-checked. */
+static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u16 pool_index, enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ struct devlink_sb_pool_info pool_info;
+ void *hdr;
+ int err;
+
+ err = devlink->ops->sb_pool_get(devlink, devlink_sb->index,
+ pool_index, &pool_info);
+ if (err) /* nothing has been written to @msg yet */
+ return err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_info.pool_type))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_SIZE, pool_info.size))
+ goto nla_put_failure;
+ if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE,
+ pool_info.threshold_type))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_CELL_SIZE,
+ pool_info.cell_size))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+/* Netlink doit: reply with one DEVLINK_CMD_SB_POOL_NEW message for a pool. */
+int devlink_nl_sb_pool_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_sb *devlink_sb;
+ struct sk_buff *msg;
+ u16 pool_index;
+ int err;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ if (!devlink->ops->sb_pool_get) /* the fill helper calls it unchecked */
+ return -EOPNOTSUPP;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_sb_pool_fill(msg, devlink, devlink_sb, pool_index,
+ DEVLINK_CMD_SB_POOL_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+/* Emit one message per pool of @devlink_sb; *p_idx counts entries seen. */
+static int __sb_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx,
+ struct devlink *devlink,
+ struct devlink_sb *devlink_sb,
+ u32 portid, u32 seq, int flags)
+{
+ u16 pool_count = devlink_sb_pool_count(devlink_sb);
+ u16 pool_index;
+ int err;
+
+ for (pool_index = 0; pool_index < pool_count; pool_index++) {
+ if (*p_idx < start) { /* entry below @start: count it, emit nothing */
+ (*p_idx)++;
+ continue;
+ }
+ err = devlink_nl_sb_pool_fill(msg, devlink,
+ devlink_sb,
+ pool_index,
+ DEVLINK_CMD_SB_POOL_NEW,
+ portid, seq, flags);
+ if (err) /* *p_idx is left at the entry that failed */
+ return err;
+ (*p_idx)++;
+ }
+ return 0;
+}
+/* Dump the pools of every shared buffer of @devlink, from state->idx on. */
+static int
+devlink_nl_sb_pool_get_dump_one(struct sk_buff *msg, struct devlink *devlink,
+ struct netlink_callback *cb, int flags)
+{
+ struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+ struct devlink_sb *devlink_sb;
+ int err = 0;
+ int idx = 0;
+
+ if (!devlink->ops->sb_pool_get) /* nothing to dump without the op */
+ return 0;
+
+ list_for_each_entry(devlink_sb, &devlink->sb_list, list) {
+ err = __sb_pool_get_dumpit(msg, state->idx, &idx,
+ devlink, devlink_sb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags);
+ if (err == -EOPNOTSUPP) { /* not treated as a dump error */
+ err = 0;
+ } else if (err) {
+ state->idx = idx; /* index of the entry that failed */
+ break;
+ }
+ }
+
+ return err;
+}
+/* Netlink dumpit: devlink_nl_dumpit() with devlink_nl_sb_pool_get_dump_one(). */
+int devlink_nl_sb_pool_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_sb_pool_get_dump_one);
+}
+/* Forward a pool size/threshold-type update to the ops->sb_pool_set op. */
+static int devlink_sb_pool_set(struct devlink *devlink, unsigned int sb_index,
+			       u16 pool_index, u32 size,
+			       enum devlink_sb_threshold_type threshold_type,
+			       struct netlink_ext_ack *extack)
+{
+	const struct devlink_ops *ops = devlink->ops;
+
+	/* The op may be left unset. */
+	if (!ops->sb_pool_set)
+		return -EOPNOTSUPP;
+	return ops->sb_pool_set(devlink, sb_index, pool_index, size,
+				threshold_type, extack);
+}
+/* Netlink doit: parse sb, pool index, threshold type and size, then set. */
+int devlink_nl_sb_pool_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ enum devlink_sb_threshold_type threshold_type;
+ struct devlink_sb *devlink_sb;
+ u16 pool_index;
+ u32 size;
+ int err;
+
+ devlink_sb = devlink_sb_get_from_info(devlink, info);
+ if (IS_ERR(devlink_sb))
+ return PTR_ERR(devlink_sb);
+
+ err = devlink_sb_pool_index_get_from_info(devlink_sb, info,
+ &pool_index);
+ if (err)
+ return err;
+
+ err = devlink_sb_th_type_get_from_info(info, &threshold_type);
+ if (err)
+ return err;
+
+ if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_POOL_SIZE))
+ return -EINVAL;
+
+ size = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_POOL_SIZE]);
+ return devlink_sb_pool_set(devlink, devlink_sb->index,
+ pool_index, size, threshold_type,
+ info->extack);
+}
+/* Build one port-pool message: threshold plus, when available, occupancy. */
+static int devlink_nl_sb_port_pool_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ struct devlink_sb *devlink_sb,
+ u16 pool_index,
+ enum devlink_command cmd,
+ u32 portid, u32 seq, int flags)
+{
+ const struct devlink_ops *ops = devlink->ops;
+ u32 threshold;
+ void *hdr;
+ int err;
+
+ err = ops->sb_port_pool_get(devlink_port, devlink_sb->index,
+ pool_index, &threshold); /* op is not NULL-checked here */
+ if (err)
+ return err;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold))
+ goto nla_put_failure;
+
+ if (ops->sb_occ_port_pool_get) {
+ u32 cur;
+ u32 max;
+
+ err = ops->sb_occ_port_pool_get(devlink_port, devlink_sb->index,
+ pool_index, &cur, &max);
+ if (err && err != -EOPNOTSUPP) /* -EOPNOTSUPP only omits occupancy */
+ goto sb_occ_get_failure;
+ if (!err) {
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max))
+ goto nla_put_failure;
+ }
+ }
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ err = -EMSGSIZE;
+sb_occ_get_failure: /* reached with err already set by the occupancy op */
+ genlmsg_cancel(msg, hdr);
+ return err;
+}
+
+/* Netlink doit handler: reply with the threshold of one pool on one port. */
+int devlink_nl_sb_port_pool_get_doit(struct sk_buff *skb,
+				     struct genl_info *info)
+{
+	struct devlink_port *devlink_port = info->user_ptr[1];
+	struct devlink *devlink = devlink_port->devlink;
+	struct devlink_sb *sb;
+	struct sk_buff *reply;
+	u16 pool;
+	int rc;
+
+	sb = devlink_sb_get_from_info(devlink, info);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
+
+	rc = devlink_sb_pool_index_get_from_info(sb, info, &pool);
+	if (rc)
+		return rc;
+
+	/* The fill helper calls ->sb_port_pool_get() unconditionally. */
+	if (!devlink->ops->sb_port_pool_get)
+		return -EOPNOTSUPP;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	rc = devlink_nl_sb_port_pool_fill(reply, devlink, devlink_port, sb,
+					  pool, DEVLINK_CMD_SB_PORT_POOL_NEW,
+					  info->snd_portid, info->snd_seq, 0);
+	if (rc) {
+		nlmsg_free(reply);
+		return rc;
+	}
+
+	return genlmsg_reply(reply, info);
+}
+
+/*
+ * Emit one message per (port, pool) pair of @devlink_sb. *@p_idx counts the
+ * pairs visited so far; pairs numbered below @start are skipped.
+ */
+static int __sb_port_pool_get_dumpit(struct sk_buff *msg, int start, int *p_idx,
+				     struct devlink *devlink,
+				     struct devlink_sb *devlink_sb,
+				     u32 portid, u32 seq, int flags)
+{
+	u16 n_pools = devlink_sb_pool_count(devlink_sb);
+	struct devlink_port *port;
+	unsigned long port_idx;
+	u16 pool;
+	int rc;
+
+	xa_for_each(&devlink->ports, port_idx, port) {
+		for (pool = 0; pool < n_pools; pool++, (*p_idx)++) {
+			if (*p_idx < start)
+				continue;
+			rc = devlink_nl_sb_port_pool_fill(msg, devlink, port,
+							  devlink_sb, pool,
+							  DEVLINK_CMD_SB_PORT_POOL_NEW,
+							  portid, seq, flags);
+			if (rc)
+				return rc;
+		}
+	}
+
+	return 0;
+}
+
+/* Per-devlink dump callback: walk every shared buffer of @devlink. */
+static int
+devlink_nl_sb_port_pool_get_dump_one(struct sk_buff *msg,
+				     struct devlink *devlink,
+				     struct netlink_callback *cb, int flags)
+{
+	struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+	struct devlink_sb *sb;
+	int pos = 0;
+	int rc = 0;
+
+	if (!devlink->ops->sb_port_pool_get)
+		return 0;
+
+	list_for_each_entry(sb, &devlink->sb_list, list) {
+		rc = __sb_port_pool_get_dumpit(msg, state->idx, &pos, devlink, sb,
+					       NETLINK_CB(cb->skb).portid,
+					       cb->nlh->nlmsg_seq, flags);
+		/* A real error stops the walk; -EOPNOTSUPP is ignored. */
+		if (rc && rc != -EOPNOTSUPP) {
+			state->idx = pos;
+			break;
+		}
+		rc = 0;
+	}
+
+	return rc;
+}
+
+/* Dump handler: hands the per-devlink port-pool callback to devlink_nl_dumpit(). */
+int devlink_nl_sb_port_pool_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	return devlink_nl_dumpit(skb, cb, devlink_nl_sb_port_pool_get_dump_one);
+}
+
+/* Forward a port-pool threshold change to the driver, if it implements it. */
+static int devlink_sb_port_pool_set(struct devlink_port *devlink_port,
+				    unsigned int sb_index, u16 pool_index,
+				    u32 threshold,
+				    struct netlink_ext_ack *extack)
+{
+	const struct devlink_ops *ops = devlink_port->devlink->ops;
+
+	if (!ops->sb_port_pool_set)
+		return -EOPNOTSUPP;
+	return ops->sb_port_pool_set(devlink_port, sb_index, pool_index,
+				     threshold, extack);
+}
+
+/* Netlink doit handler: set the threshold of one pool on one port. */
+int devlink_nl_sb_port_pool_set_doit(struct sk_buff *skb,
+				     struct genl_info *info)
+{
+	struct devlink_port *devlink_port = info->user_ptr[1];
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_sb *sb;
+	u32 thresh;
+	u16 pool;
+	int rc;
+
+	sb = devlink_sb_get_from_info(devlink, info);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
+
+	rc = devlink_sb_pool_index_get_from_info(sb, info, &pool);
+	if (rc)
+		return rc;
+
+	if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD))
+		return -EINVAL;
+
+	thresh = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
+	return devlink_sb_port_pool_set(devlink_port, sb->index, pool, thresh,
+					info->extack);
+}
+
+/*
+ * Fill one TC-to-pool binding message. Every failure after genlmsg_put()
+ * cancels the partial message, including a driver occupancy-read error.
+ */
+static int
+devlink_nl_sb_tc_pool_bind_fill(struct sk_buff *msg, struct devlink *devlink,
+				struct devlink_port *devlink_port,
+				struct devlink_sb *devlink_sb, u16 tc_index,
+				enum devlink_sb_pool_type pool_type,
+				enum devlink_command cmd,
+				u32 portid, u32 seq, int flags)
+{
+	const struct devlink_ops *ops = devlink->ops;
+	u16 pool_index;
+	u32 threshold;
+	void *hdr;
+	int err;
+
+	err = ops->sb_tc_pool_bind_get(devlink_port, devlink_sb->index,
+				       tc_index, pool_type,
+				       &pool_index, &threshold);
+	if (err)
+		return err;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink) ||
+	    nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index) ||
+	    nla_put_u32(msg, DEVLINK_ATTR_SB_INDEX, devlink_sb->index) ||
+	    nla_put_u16(msg, DEVLINK_ATTR_SB_TC_INDEX, tc_index) ||
+	    nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_TYPE, pool_type) ||
+	    nla_put_u16(msg, DEVLINK_ATTR_SB_POOL_INDEX, pool_index) ||
+	    nla_put_u32(msg, DEVLINK_ATTR_SB_THRESHOLD, threshold))
+		goto nla_put_failure;
+
+	if (ops->sb_occ_tc_port_bind_get) {
+		u32 cur;
+		u32 max;
+
+		err = ops->sb_occ_tc_port_bind_get(devlink_port,
+						   devlink_sb->index,
+						   tc_index, pool_type,
+						   &cur, &max);
+		if (err && err != -EOPNOTSUPP)
+			goto sb_occ_get_failure;
+		if (!err) {
+			if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_CUR, cur))
+				goto nla_put_failure;
+			if (nla_put_u32(msg, DEVLINK_ATTR_SB_OCC_MAX, max))
+				goto nla_put_failure;
+		}
+	}
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
+	err = -EMSGSIZE;
+sb_occ_get_failure:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+/* Netlink doit handler: reply with the pool binding of one port TC. */
+int devlink_nl_sb_tc_pool_bind_get_doit(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink_port *devlink_port = info->user_ptr[1];
+	struct devlink *devlink = devlink_port->devlink;
+	enum devlink_sb_pool_type type;
+	struct devlink_sb *sb;
+	struct sk_buff *reply;
+	u16 tc;
+	int rc;
+
+	sb = devlink_sb_get_from_info(devlink, info);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
+
+	rc = devlink_sb_pool_type_get_from_info(info, &type);
+	if (rc)
+		return rc;
+
+	rc = devlink_sb_tc_index_get_from_info(sb, info, type, &tc);
+	if (rc)
+		return rc;
+
+	if (!devlink->ops->sb_tc_pool_bind_get)
+		return -EOPNOTSUPP;
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	rc = devlink_nl_sb_tc_pool_bind_fill(reply, devlink, devlink_port, sb,
+					     tc, type, DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
+					     info->snd_portid, info->snd_seq, 0);
+	if (rc)
+		goto err_free_reply;
+
+	return genlmsg_reply(reply, info);
+
+err_free_reply:
+	nlmsg_free(reply);
+	return rc;
+}
+
+/*
+ * Emit the TC-to-pool bindings of every port of @devlink for @devlink_sb.
+ * For each port all ingress TCs are visited first, then all egress TCs.
+ * *@p_idx counts the entries visited so far; entries numbered below @start
+ * are skipped without being filled.
+ */
+static int __sb_tc_pool_bind_get_dumpit(struct sk_buff *msg,
+					int start, int *p_idx,
+					struct devlink *devlink,
+					struct devlink_sb *devlink_sb,
+					u32 portid, u32 seq, int flags)
+{
+	struct devlink_port *port;
+	unsigned long port_idx;
+	u16 tc;
+	int rc;
+
+	xa_for_each(&devlink->ports, port_idx, port) {
+		/* Ingress traffic classes of this port. */
+		for (tc = 0; tc < devlink_sb->ingress_tc_count;
+		     tc++, (*p_idx)++) {
+			if (*p_idx < start)
+				continue;
+			rc = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, port,
+							     devlink_sb, tc,
+							     DEVLINK_SB_POOL_TYPE_INGRESS,
+							     DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
+							     portid, seq,
+							     flags);
+			if (rc)
+				return rc;
+		}
+
+		/* Egress traffic classes of this port. */
+		for (tc = 0; tc < devlink_sb->egress_tc_count;
+		     tc++, (*p_idx)++) {
+			if (*p_idx < start)
+				continue;
+			rc = devlink_nl_sb_tc_pool_bind_fill(msg, devlink, port,
+							     devlink_sb, tc,
+							     DEVLINK_SB_POOL_TYPE_EGRESS,
+							     DEVLINK_CMD_SB_TC_POOL_BIND_NEW,
+							     portid, seq,
+							     flags);
+			if (rc)
+				return rc;
+		}
+	}
+
+	return 0;
+}
+
+/* Per-devlink dump callback: walk every shared buffer of @devlink. */
+static int devlink_nl_sb_tc_pool_bind_get_dump_one(struct sk_buff *msg,
+						   struct devlink *devlink,
+						   struct netlink_callback *cb,
+						   int flags)
+{
+	struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+	struct devlink_sb *sb;
+	int pos = 0;
+	int rc = 0;
+
+	if (!devlink->ops->sb_tc_pool_bind_get)
+		return 0;
+
+	list_for_each_entry(sb, &devlink->sb_list, list) {
+		rc = __sb_tc_pool_bind_get_dumpit(msg, state->idx, &pos, devlink, sb,
+						  NETLINK_CB(cb->skb).portid,
+						  cb->nlh->nlmsg_seq, flags);
+		/* A real error stops the walk; -EOPNOTSUPP is ignored. */
+		if (rc && rc != -EOPNOTSUPP) {
+			state->idx = pos;
+			break;
+		}
+		rc = 0;
+	}
+
+	return rc;
+}
+
+/* Dump handler: hands the per-devlink TC bind callback to devlink_nl_dumpit(). */
+int devlink_nl_sb_tc_pool_bind_get_dumpit(struct sk_buff *skb,
+					  struct netlink_callback *cb)
+{
+	return devlink_nl_dumpit(skb, cb, devlink_nl_sb_tc_pool_bind_get_dump_one);
+}
+
+/* Forward a TC-to-pool binding change to the driver, if it implements it. */
+static int devlink_sb_tc_pool_bind_set(struct devlink_port *devlink_port,
+				       unsigned int sb_index, u16 tc_index,
+				       enum devlink_sb_pool_type pool_type,
+				       u16 pool_index, u32 threshold,
+				       struct netlink_ext_ack *extack)
+{
+	const struct devlink_ops *ops = devlink_port->devlink->ops;
+
+	if (!ops->sb_tc_pool_bind_set)
+		return -EOPNOTSUPP;
+
+	return ops->sb_tc_pool_bind_set(devlink_port, sb_index, tc_index,
+					pool_type, pool_index, threshold, extack);
+}
+
+/* Netlink doit handler: bind one port TC to a pool with a threshold. */
+int devlink_nl_sb_tc_pool_bind_set_doit(struct sk_buff *skb,
+					struct genl_info *info)
+{
+	struct devlink_port *devlink_port = info->user_ptr[1];
+	struct devlink *devlink = info->user_ptr[0];
+	enum devlink_sb_pool_type type;
+	struct devlink_sb *sb;
+	u32 thresh;
+	u16 pool;
+	u16 tc;
+	int rc;
+
+	sb = devlink_sb_get_from_info(devlink, info);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
+
+	rc = devlink_sb_pool_type_get_from_info(info, &type);
+	if (rc)
+		return rc;
+
+	rc = devlink_sb_tc_index_get_from_info(sb, info, type, &tc);
+	if (rc)
+		return rc;
+
+	rc = devlink_sb_pool_index_get_from_info(sb, info, &pool);
+	if (rc)
+		return rc;
+
+	/* The threshold attribute is mandatory for this command. */
+	if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_SB_THRESHOLD))
+		return -EINVAL;
+
+	thresh = nla_get_u32(info->attrs[DEVLINK_ATTR_SB_THRESHOLD]);
+
+	return devlink_sb_tc_pool_bind_set(devlink_port, sb->index, tc, type,
+					   pool, thresh, info->extack);
+}
+
+/* Netlink doit handler: invoke the driver's sb_occ_snapshot op for one SB. */
+int devlink_nl_sb_occ_snapshot_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_sb *sb;
+
+	sb = devlink_sb_get_from_info(devlink, info);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
+
+	if (!devlink->ops->sb_occ_snapshot)
+		return -EOPNOTSUPP;
+	return devlink->ops->sb_occ_snapshot(devlink, sb->index);
+}
+
+/* Netlink doit handler: invoke the driver's sb_occ_max_clear op for one SB. */
+int devlink_nl_sb_occ_max_clear_doit(struct sk_buff *skb,
+				     struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_sb *sb;
+
+	sb = devlink_sb_get_from_info(devlink, info);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
+
+	if (!devlink->ops->sb_occ_max_clear)
+		return -EOPNOTSUPP;
+	return devlink->ops->sb_occ_max_clear(devlink, sb->index);
+}
+
+/* Register shared buffer @sb_index with @devlink; caller holds devlink->lock. */
+int devl_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size,
+		     u16 ingress_pools_count, u16 egress_pools_count,
+		     u16 ingress_tc_count, u16 egress_tc_count)
+{
+	struct devlink_sb *sb;
+
+	lockdep_assert_held(&devlink->lock);
+
+	if (devlink_sb_index_exists(devlink, sb_index))
+		return -EEXIST;
+
+	sb = kzalloc(sizeof(*sb), GFP_KERNEL);
+	if (!sb)
+		return -ENOMEM;
+	sb->index = sb_index;
+	sb->size = size;
+	sb->ingress_pools_count = ingress_pools_count;
+	sb->egress_pools_count = egress_pools_count;
+	sb->ingress_tc_count = ingress_tc_count;
+	sb->egress_tc_count = egress_tc_count;
+	list_add_tail(&sb->list, &devlink->sb_list);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devl_sb_register);
+
+/* Locked wrapper: calls devl_sb_register() between devl_lock()/devl_unlock(). */
+int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u32 size,
+			u16 ingress_pools_count, u16 egress_pools_count,
+			u16 ingress_tc_count, u16 egress_tc_count)
+{
+	int rc;
+
+	devl_lock(devlink);
+	rc = devl_sb_register(devlink, sb_index, size, ingress_pools_count,
+			      egress_pools_count, ingress_tc_count,
+			      egress_tc_count);
+	devl_unlock(devlink);
+	return rc;
+}
+EXPORT_SYMBOL_GPL(devlink_sb_register);
+
+void devl_sb_unregister(struct devlink *devlink, unsigned int sb_index)
+{
+	struct devlink_sb *devlink_sb;
+
+	lockdep_assert_held(&devlink->lock);	/* caller must hold devlink->lock */
+	devlink_sb = devlink_sb_get_by_index(devlink, sb_index);
+	if (WARN_ON(!devlink_sb))	/* unknown index: nothing to unlink or free */
+		return;
+	list_del(&devlink_sb->list);
+	kfree(devlink_sb);
+}
+EXPORT_SYMBOL_GPL(devl_sb_unregister);
+
+void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
+{
+ devl_lock(devlink); /* devl_sb_unregister() asserts devlink->lock is held */
+ devl_sb_unregister(devlink, sb_index);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_sb_unregister);
diff --git a/net/devlink/trap.c b/net/devlink/trap.c
new file mode 100644
index 000000000000..f36087f90db5
--- /dev/null
+++ b/net/devlink/trap.c
@@ -0,0 +1,1854 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ */
+
+#include <trace/events/devlink.h>
+
+#include "devl_internal.h"
+
+struct devlink_stats {
+ u64_stats_t rx_bytes; /* byte counter, reported as STATS_RX_BYTES */
+ u64_stats_t rx_packets; /* packet counter, reported as STATS_RX_PACKETS */
+ struct u64_stats_sync syncp; /* fetch_begin/retry guard for reading both */
+};
+
+/**
+ * struct devlink_trap_policer_item - Packet trap policer attributes.
+ * @policer: Immutable packet trap policer attributes.
+ * @rate: Rate in packets / sec.
+ * @burst: Burst size in packets.
+ * @list: Member of the devlink instance's trap_policer_list.
+ *
+ * Describes packet trap policer attributes. Created by devlink during trap
+ * policer registration; looked up by the policer's id.
+ */
+struct devlink_trap_policer_item {
+ const struct devlink_trap_policer *policer;
+ u64 rate;
+ u64 burst;
+ struct list_head list;
+};
+
+/**
+ * struct devlink_trap_group_item - Packet trap group attributes.
+ * @group: Immutable packet trap group attributes.
+ * @policer_item: Associated policer item. Can be NULL.
+ * @list: Member of the devlink instance's trap_group_list.
+ * @stats: Per-CPU trap group statistics.
+ *
+ * Describes packet trap group attributes. Created by devlink during trap
+ * group registration; looked up by group name or group id.
+ */
+struct devlink_trap_group_item {
+ const struct devlink_trap_group *group;
+ struct devlink_trap_policer_item *policer_item;
+ struct list_head list;
+ struct devlink_stats __percpu *stats;
+};
+
+/**
+ * struct devlink_trap_item - Packet trap attributes.
+ * @trap: Immutable packet trap attributes.
+ * @group_item: Associated group item.
+ * @list: Member of the devlink instance's trap_list.
+ * @action: Current trap action; updated after a successful action set.
+ * @stats: Per-CPU trap statistics.
+ * @priv: Driver private information.
+ *
+ * Describes both mutable and immutable packet trap attributes. Created by
+ * devlink during trap registration and used for all trap related operations.
+ */
+struct devlink_trap_item {
+ const struct devlink_trap *trap;
+ struct devlink_trap_group_item *group_item;
+ struct list_head list;
+ enum devlink_trap_action action;
+ struct devlink_stats __percpu *stats;
+ void *priv;
+};
+
+/* Find the registered policer item whose policer id is @id, or NULL. */
+static struct devlink_trap_policer_item *
+devlink_trap_policer_item_lookup(struct devlink *devlink, u32 id)
+{
+	struct devlink_trap_policer_item *item;
+
+	list_for_each_entry(item, &devlink->trap_policer_list, list)
+		if (item->policer->id == id)
+			return item;
+
+	return NULL;
+}
+
+/* Find the registered trap item named @name, or NULL if there is none. */
+static struct devlink_trap_item *
+devlink_trap_item_lookup(struct devlink *devlink, const char *name)
+{
+	struct devlink_trap_item *item;
+
+	list_for_each_entry(item, &devlink->trap_list, list)
+		if (strcmp(item->trap->name, name) == 0)
+			return item;
+
+	return NULL;
+}
+
+/* Resolve DEVLINK_ATTR_TRAP_NAME to a trap item; NULL if missing or unknown. */
+static struct devlink_trap_item *
+devlink_trap_item_get_from_info(struct devlink *devlink,
+				struct genl_info *info)
+{
+	struct nlattr *name_attr = info->attrs[DEVLINK_ATTR_TRAP_NAME];
+
+	if (!name_attr)
+		return NULL;
+
+	return devlink_trap_item_lookup(devlink, nla_data(name_attr));
+}
+
+/*
+ * Read DEVLINK_ATTR_TRAP_ACTION from @info into *@p_trap_action. Only the
+ * drop, trap and mirror actions are accepted; anything else is -EINVAL.
+ */
+static int devlink_trap_action_get_from_info(struct genl_info *info,
+					     enum devlink_trap_action *p_trap_action)
+{
+	u8 action = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]);
+
+	switch (action) {
+	case DEVLINK_TRAP_ACTION_DROP:
+	case DEVLINK_TRAP_ACTION_TRAP:
+	case DEVLINK_TRAP_ACTION_MIRROR:
+		*p_trap_action = action;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+/* Emit a DEVLINK_ATTR_TRAP_METADATA nest with one flag per capability bit. */
+static int devlink_trap_metadata_put(struct sk_buff *msg,
+				     const struct devlink_trap *trap)
+{
+	struct nlattr *nest;
+
+	nest = nla_nest_start(msg, DEVLINK_ATTR_TRAP_METADATA);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT) &&
+	    nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_IN_PORT))
+		goto err_cancel_nest;
+	if ((trap->metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE) &&
+	    nla_put_flag(msg, DEVLINK_ATTR_TRAP_METADATA_TYPE_FA_COOKIE))
+		goto err_cancel_nest;
+
+	nla_nest_end(msg, nest);
+	return 0;
+
+err_cancel_nest:
+	nla_nest_cancel(msg, nest);
+	return -EMSGSIZE;
+}
+
+/* Sum the per-CPU counters in @trap_stats into the caller's @stats. */
+static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats,
+				    struct devlink_stats *stats)
+{
+	int cpu;
+
+	memset(stats, 0, sizeof(*stats));
+	for_each_possible_cpu(cpu) {
+		struct devlink_stats *pcpu = per_cpu_ptr(trap_stats, cpu);
+		unsigned int seq;
+		u64 pkts, bytes;
+
+		do {
+			seq = u64_stats_fetch_begin(&pcpu->syncp);
+			pkts = u64_stats_read(&pcpu->rx_packets);
+			bytes = u64_stats_read(&pcpu->rx_bytes);
+		} while (u64_stats_fetch_retry(&pcpu->syncp, seq));
+
+		u64_stats_add(&stats->rx_packets, pkts);
+		u64_stats_add(&stats->rx_bytes, bytes);
+	}
+}
+
+/* Emit a DEVLINK_ATTR_STATS nest with the summed RX packet and byte counts. */
+static int
+devlink_trap_group_stats_put(struct sk_buff *msg,
+			     struct devlink_stats __percpu *trap_stats)
+{
+	struct devlink_stats sum;
+	struct nlattr *nest;
+
+	/* Totals are accumulated over every possible CPU. */
+	devlink_trap_stats_read(trap_stats, &sum);
+
+	nest = nla_nest_start(msg, DEVLINK_ATTR_STATS);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_PACKETS,
+			       u64_stats_read(&sum.rx_packets)) ||
+	    devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_BYTES,
+			       u64_stats_read(&sum.rx_bytes)))
+		goto err_cancel_nest;
+
+	nla_nest_end(msg, nest);
+
+	return 0;
+
+err_cancel_nest:
+	nla_nest_cancel(msg, nest);
+	return -EMSGSIZE;
+}
+
+/*
+ * Emit a DEVLINK_ATTR_STATS nest for @trap_item: the driver's drop counter
+ * when ->trap_drop_counter_get() exists, then summed RX packets and bytes.
+ */
+static int devlink_trap_stats_put(struct sk_buff *msg, struct devlink *devlink,
+				  const struct devlink_trap_item *trap_item)
+{
+	const struct devlink_ops *ops = devlink->ops;
+	struct devlink_stats sum;
+	struct nlattr *nest;
+	u64 dropped = 0;
+	int rc;
+
+	if (ops->trap_drop_counter_get) {
+		rc = ops->trap_drop_counter_get(devlink, trap_item->trap, &dropped);
+		if (rc)
+			return rc;
+	}
+
+	devlink_trap_stats_read(trap_item->stats, &sum);
+
+	nest = nla_nest_start(msg, DEVLINK_ATTR_STATS);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (ops->trap_drop_counter_get &&
+	    devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_DROPPED, dropped))
+		goto err_cancel_nest;
+
+	if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_PACKETS,
+			       u64_stats_read(&sum.rx_packets)) ||
+	    devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_BYTES,
+			       u64_stats_read(&sum.rx_bytes)))
+		goto err_cancel_nest;
+
+	nla_nest_end(msg, nest);
+	return 0;
+
+err_cancel_nest:
+	nla_nest_cancel(msg, nest);
+	return -EMSGSIZE;
+}
+
+/*
+ * Fill one trap message for @trap_item. nla_put*() failures return
+ * -EMSGSIZE; an error from the stats helper (e.g. the driver's drop-counter
+ * callback) is returned unchanged. Failures after genlmsg_put() cancel it.
+ */
+static int devlink_nl_trap_fill(struct sk_buff *msg, struct devlink *devlink,
+				const struct devlink_trap_item *trap_item,
+				enum devlink_command cmd, u32 portid, u32 seq,
+				int flags)
+{
+	struct devlink_trap_group_item *group_item = trap_item->group_item;
+	int err = -EMSGSIZE;	/* default for every nla_put*() failure below */
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto nla_put_failure;
+
+	if (nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME,
+			   group_item->group->name) ||
+	    nla_put_string(msg, DEVLINK_ATTR_TRAP_NAME, trap_item->trap->name) ||
+	    nla_put_u8(msg, DEVLINK_ATTR_TRAP_TYPE, trap_item->trap->type))
+		goto nla_put_failure;
+
+	if (trap_item->trap->generic &&
+	    nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC))
+		goto nla_put_failure;
+
+	if (nla_put_u8(msg, DEVLINK_ATTR_TRAP_ACTION, trap_item->action))
+		goto nla_put_failure;
+
+	err = devlink_trap_metadata_put(msg, trap_item->trap);
+	if (err)
+		goto nla_put_failure;
+
+	err = devlink_trap_stats_put(msg, devlink, trap_item);
+	if (err)
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+/* Netlink doit handler: reply with the attributes of one named trap. */
+int devlink_nl_trap_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct netlink_ext_ack *extack = info->extack;
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_trap_item *item;
+	struct sk_buff *reply;
+	int rc;
+
+	/* An empty trap list is reported as "not supported". */
+	if (list_empty(&devlink->trap_list))
+		return -EOPNOTSUPP;
+
+	item = devlink_trap_item_get_from_info(devlink, info);
+	if (!item) {
+		NL_SET_ERR_MSG(extack, "Device did not register this trap");
+		return -ENOENT;
+	}
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	rc = devlink_nl_trap_fill(reply, devlink, item,
+				  DEVLINK_CMD_TRAP_NEW, info->snd_portid,
+				  info->snd_seq, 0);
+	if (rc) {
+		nlmsg_free(reply);
+		return rc;
+	}
+
+	return genlmsg_reply(reply, info);
+}
+
+/* Per-devlink dump callback: one message per trap, starting at state->idx. */
+static int devlink_nl_trap_get_dump_one(struct sk_buff *msg,
+					struct devlink *devlink,
+					struct netlink_callback *cb, int flags)
+{
+	struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+	struct devlink_trap_item *item;
+	int pos = 0;
+	int rc = 0;
+
+	list_for_each_entry(item, &devlink->trap_list, list) {
+		if (pos >= state->idx) {
+			rc = devlink_nl_trap_fill(msg, devlink, item,
+						  DEVLINK_CMD_TRAP_NEW,
+						  NETLINK_CB(cb->skb).portid,
+						  cb->nlh->nlmsg_seq, flags);
+			if (rc) {
+				/* Record the entry that failed to fit. */
+				state->idx = pos;
+				break;
+			}
+		}
+		pos++;
+	}
+
+	return rc;
+}
+
+int devlink_nl_trap_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_trap_get_dump_one); /* callback fills one devlink's traps */
+}
+
+/* Set one trap's action via the driver; only drop traps may change action. */
+static int __devlink_trap_action_set(struct devlink *devlink,
+				     struct devlink_trap_item *trap_item,
+				     enum devlink_trap_action trap_action,
+				     struct netlink_ext_ack *extack)
+{
+	bool changes = trap_item->action != trap_action;
+	int rc;
+
+	if (changes && trap_item->trap->type != DEVLINK_TRAP_TYPE_DROP) {
+		NL_SET_ERR_MSG(extack, "Cannot change action of non-drop traps. Skipping");
+		return 0;
+	}
+
+	rc = devlink->ops->trap_action_set(devlink, trap_item->trap,
+					   trap_action, extack);
+	if (rc)
+		return rc;
+
+	trap_item->action = trap_action;
+	return 0;
+}
+
+/* Apply DEVLINK_ATTR_TRAP_ACTION from @info to @trap_item, if present. */
+static int devlink_trap_action_set(struct devlink *devlink,
+				   struct devlink_trap_item *trap_item,
+				   struct genl_info *info)
+{
+	enum devlink_trap_action action;
+
+	/* Nothing to do when no action attribute was supplied. */
+	if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION])
+		return 0;
+
+	if (devlink_trap_action_get_from_info(info, &action)) {
+		NL_SET_ERR_MSG(info->extack, "Invalid trap action");
+		return -EINVAL;
+	}
+
+	return __devlink_trap_action_set(devlink, trap_item, action,
+					 info->extack);
+}
+
+/* Netlink doit handler: set the action of one named trap. */
+int devlink_nl_trap_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct netlink_ext_ack *extack = info->extack;
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_trap_item *item;
+
+	if (list_empty(&devlink->trap_list))
+		return -EOPNOTSUPP;
+
+	item = devlink_trap_item_get_from_info(devlink, info);
+	if (item)
+		return devlink_trap_action_set(devlink, item, info);
+
+	NL_SET_ERR_MSG(extack, "Device did not register this trap");
+	return -ENOENT;
+}
+
+/* Find the registered trap group named @name, or NULL if there is none. */
+static struct devlink_trap_group_item *
+devlink_trap_group_item_lookup(struct devlink *devlink, const char *name)
+{
+	struct devlink_trap_group_item *item;
+
+	list_for_each_entry(item, &devlink->trap_group_list, list)
+		if (strcmp(item->group->name, name) == 0)
+			return item;
+
+	return NULL;
+}
+
+/* Find the registered trap group whose group id is @id, or NULL. */
+static struct devlink_trap_group_item *
+devlink_trap_group_item_lookup_by_id(struct devlink *devlink, u16 id)
+{
+	struct devlink_trap_group_item *item;
+
+	list_for_each_entry(item, &devlink->trap_group_list, list)
+		if (item->group->id == id)
+			return item;
+
+	return NULL;
+}
+
+/* Resolve DEVLINK_ATTR_TRAP_GROUP_NAME to a group item; NULL if missing/unknown. */
+static struct devlink_trap_group_item *
+devlink_trap_group_item_get_from_info(struct devlink *devlink,
+				      struct genl_info *info)
+{
+	struct nlattr *name_attr = info->attrs[DEVLINK_ATTR_TRAP_GROUP_NAME];
+
+	if (!name_attr)
+		return NULL;
+
+	return devlink_trap_group_item_lookup(devlink, nla_data(name_attr));
+}
+
+/*
+ * Fill one trap group message: handle, group name, generic flag, bound
+ * policer id (if any) and statistics. Failures after genlmsg_put() cancel it.
+ */
+static int
+devlink_nl_trap_group_fill(struct sk_buff *msg, struct devlink *devlink,
+			   const struct devlink_trap_group_item *group_item,
+			   enum devlink_command cmd, u32 portid, u32 seq,
+			   int flags)
+{
+	const struct devlink_trap_group *grp = group_item->group;
+	void *hdr;
+	int rc;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink) ||
+	    nla_put_string(msg, DEVLINK_ATTR_TRAP_GROUP_NAME, grp->name))
+		goto err_cancel_msg;
+
+	if (grp->generic && nla_put_flag(msg, DEVLINK_ATTR_TRAP_GENERIC))
+		goto err_cancel_msg;
+
+	if (group_item->policer_item &&
+	    nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID,
+			group_item->policer_item->policer->id))
+		goto err_cancel_msg;
+
+	rc = devlink_trap_group_stats_put(msg, group_item->stats);
+	if (rc)
+		goto err_cancel_msg;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+err_cancel_msg:
+	genlmsg_cancel(msg, hdr);
+	return -EMSGSIZE;
+}
+
+/* Netlink doit handler: reply with the attributes of one named trap group. */
+int devlink_nl_trap_group_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct netlink_ext_ack *extack = info->extack;
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_trap_group_item *item;
+	struct sk_buff *reply;
+	int rc;
+
+	/* An empty trap group list is reported as "not supported". */
+	if (list_empty(&devlink->trap_group_list))
+		return -EOPNOTSUPP;
+
+	item = devlink_trap_group_item_get_from_info(devlink, info);
+	if (!item) {
+		NL_SET_ERR_MSG(extack, "Device did not register this trap group");
+		return -ENOENT;
+	}
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	rc = devlink_nl_trap_group_fill(reply, devlink, item,
+					DEVLINK_CMD_TRAP_GROUP_NEW,
+					info->snd_portid, info->snd_seq, 0);
+	if (rc) {
+		nlmsg_free(reply);
+		return rc;
+	}
+
+	return genlmsg_reply(reply, info);
+}
+
+/* Per-devlink dump callback: one message per group, starting at state->idx. */
+static int devlink_nl_trap_group_get_dump_one(struct sk_buff *msg,
+					      struct devlink *devlink,
+					      struct netlink_callback *cb,
+					      int flags)
+{
+	struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+	struct devlink_trap_group_item *item;
+	int pos = 0;
+	int rc = 0;
+
+	list_for_each_entry(item, &devlink->trap_group_list, list) {
+		if (pos >= state->idx) {
+			rc = devlink_nl_trap_group_fill(msg, devlink, item,
+							DEVLINK_CMD_TRAP_GROUP_NEW,
+							NETLINK_CB(cb->skb).portid,
+							cb->nlh->nlmsg_seq, flags);
+			if (rc) {
+				/* Record the entry that failed to fit. */
+				state->idx = pos;
+				break;
+			}
+		}
+		pos++;
+	}
+
+	return rc;
+}
+
+/* Dump handler: hands the per-devlink trap group callback to devlink_nl_dumpit(). */
+int devlink_nl_trap_group_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	return devlink_nl_dumpit(skb, cb, devlink_nl_trap_group_get_dump_one);
+}
+
+/*
+ * Set @trap_action for the traps of @group_item: one driver group call plus
+ * a cached-action update when the group op exists, else trap by trap.
+ */
+static int __devlink_trap_group_action_set(struct devlink *devlink,
+					   struct devlink_trap_group_item *group_item,
+					   enum devlink_trap_action trap_action,
+					   struct netlink_ext_ack *extack)
+{
+	const char *group_name = group_item->group->name;
+	struct devlink_trap_item *item;
+	int rc;
+
+	if (!devlink->ops->trap_group_action_set) {
+		list_for_each_entry(item, &devlink->trap_list, list) {
+			if (strcmp(item->group_item->group->name, group_name))
+				continue;
+			rc = __devlink_trap_action_set(devlink, item,
+						       trap_action, extack);
+			if (rc)
+				return rc;
+		}
+		return 0;
+	}
+
+	rc = devlink->ops->trap_group_action_set(devlink, group_item->group,
+						 trap_action, extack);
+	if (rc)
+		return rc;
+
+	list_for_each_entry(item, &devlink->trap_list, list) {
+		if (strcmp(item->group_item->group->name, group_name))
+			continue;
+		if (item->action == trap_action ||
+		    item->trap->type == DEVLINK_TRAP_TYPE_DROP)
+			item->action = trap_action;
+	}
+	return 0;
+}
+
+/*
+ * Apply DEVLINK_ATTR_TRAP_ACTION from @info to @group_item, if present.
+ * *@p_modified is set to true only after the action was applied.
+ */
+static int
+devlink_trap_group_action_set(struct devlink *devlink,
+			      struct devlink_trap_group_item *group_item,
+			      struct genl_info *info, bool *p_modified)
+{
+	enum devlink_trap_action action;
+	int rc;
+
+	if (!info->attrs[DEVLINK_ATTR_TRAP_ACTION])
+		return 0;
+
+	if (devlink_trap_action_get_from_info(info, &action)) {
+		NL_SET_ERR_MSG(info->extack, "Invalid trap action");
+		return -EINVAL;
+	}
+
+	rc = __devlink_trap_group_action_set(devlink, group_item, action,
+					     info->extack);
+	if (!rc)
+		*p_modified = true;
+	return rc;
+}
+
+/* Bind @group_item to the policer given by DEVLINK_ATTR_TRAP_POLICER_ID. */
+static int devlink_trap_group_set(struct devlink *devlink,
+				  struct devlink_trap_group_item *group_item,
+				  struct genl_info *info)
+{
+	struct nlattr *id_attr = info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID];
+	const struct devlink_trap_policer *policer = NULL;
+	struct devlink_trap_policer_item *item;
+	u32 id;
+	int rc;
+
+	if (!id_attr)
+		return 0;
+
+	if (!devlink->ops->trap_group_set)
+		return -EOPNOTSUPP;
+
+	id = nla_get_u32(id_attr);
+	item = devlink_trap_policer_item_lookup(devlink, id);
+	if (item) {
+		policer = item->policer;
+	} else if (id) {
+		NL_SET_ERR_MSG(info->extack, "Device did not register this trap policer");
+		return -ENOENT;
+	}
+
+	rc = devlink->ops->trap_group_set(devlink, group_item->group, policer,
+					  info->extack);
+	if (rc)
+		return rc;
+
+	group_item->policer_item = item;
+	return 0;
+}
+
+/*
+ * Netlink doit handler: apply the optional action and policer settings to
+ * one named trap group. The action is applied first and is not rolled back
+ * if the policer step fails; extack then says so.
+ */
+int devlink_nl_trap_group_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct netlink_ext_ack *extack = info->extack;
+	struct devlink *devlink = info->user_ptr[0];
+	struct devlink_trap_group_item *item;
+	bool action_applied = false;
+	int rc;
+
+	if (list_empty(&devlink->trap_group_list))
+		return -EOPNOTSUPP;
+
+	item = devlink_trap_group_item_get_from_info(devlink, info);
+	if (!item) {
+		NL_SET_ERR_MSG(extack, "Device did not register this trap group");
+		return -ENOENT;
+	}
+
+	rc = devlink_trap_group_action_set(devlink, item, info,
+					   &action_applied);
+	if (rc)
+		return rc;
+
+	rc = devlink_trap_group_set(devlink, item, info);
+	if (rc && action_applied)
+		NL_SET_ERR_MSG(extack, "Trap group set failed, but some changes were committed already");
+
+	return rc;
+}
+
+/* Resolve DEVLINK_ATTR_TRAP_POLICER_ID to a policer item; NULL if missing/unknown. */
+static struct devlink_trap_policer_item *
+devlink_trap_policer_item_get_from_info(struct devlink *devlink,
+					struct genl_info *info)
+{
+	struct nlattr *id_attr = info->attrs[DEVLINK_ATTR_TRAP_POLICER_ID];
+
+	if (!id_attr)
+		return NULL;
+
+	return devlink_trap_policer_item_lookup(devlink, nla_get_u32(id_attr));
+}
+
+/*
+ * Emit a DEVLINK_ATTR_STATS nest holding the policer's drop count. Emits
+ * nothing and succeeds when the driver has no ->trap_policer_counter_get().
+ */
+static int
+devlink_trap_policer_stats_put(struct sk_buff *msg, struct devlink *devlink,
+			       const struct devlink_trap_policer *policer)
+{
+	struct nlattr *nest;
+	u64 dropped;
+	int rc;
+
+	if (!devlink->ops->trap_policer_counter_get)
+		return 0;
+
+	rc = devlink->ops->trap_policer_counter_get(devlink, policer, &dropped);
+	if (rc)
+		return rc;
+
+	nest = nla_nest_start(msg, DEVLINK_ATTR_STATS);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_u64(msg, DEVLINK_ATTR_STATS_RX_DROPPED, dropped)) {
+		nla_nest_cancel(msg, nest);
+		return -EMSGSIZE;
+	}
+	nla_nest_end(msg, nest);
+	return 0;
+}
+
+/*
+ * Fill one trap policer message. nla_put*() failures return -EMSGSIZE; an
+ * error from the stats helper (e.g. the driver's counter callback) is
+ * returned unchanged. Failures after genlmsg_put() cancel the message.
+ */
+static int
+devlink_nl_trap_policer_fill(struct sk_buff *msg, struct devlink *devlink,
+			     const struct devlink_trap_policer_item *policer_item,
+			     enum devlink_command cmd, u32 portid, u32 seq,
+			     int flags)
+{
+	int err = -EMSGSIZE;	/* default for every nla_put*() failure below */
+	void *hdr;
+
+	hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	if (devlink_nl_put_handle(msg, devlink))
+		goto nla_put_failure;
+
+	if (nla_put_u32(msg, DEVLINK_ATTR_TRAP_POLICER_ID,
+			policer_item->policer->id) ||
+	    devlink_nl_put_u64(msg, DEVLINK_ATTR_TRAP_POLICER_RATE,
+			       policer_item->rate) ||
+	    devlink_nl_put_u64(msg, DEVLINK_ATTR_TRAP_POLICER_BURST,
+			       policer_item->burst))
+		goto nla_put_failure;
+
+	err = devlink_trap_policer_stats_put(msg, devlink,
+					     policer_item->policer);
+	if (err)
+		goto nla_put_failure;
+
+	genlmsg_end(msg, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(msg, hdr);
+	return err;
+}
+
+/*
+ * Generic netlink doit handler that replies with a single trap policer.
+ *
+ * Returns -EOPNOTSUPP when the device has no policers, -ENOENT when the
+ * requested policer is unknown, -ENOMEM when the reply cannot be allocated,
+ * otherwise the result of filling and sending the reply.
+ */
+int devlink_nl_trap_policer_get_doit(struct sk_buff *skb,
+				     struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct netlink_ext_ack *extack = info->extack;
+	struct devlink_trap_policer_item *item;
+	struct sk_buff *reply;
+	int ret;
+
+	if (list_empty(&devlink->trap_policer_list))
+		return -EOPNOTSUPP;
+
+	item = devlink_trap_policer_item_get_from_info(devlink, info);
+	if (!item) {
+		NL_SET_ERR_MSG(extack, "Device did not register this trap policer");
+		return -ENOENT;
+	}
+
+	reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!reply)
+		return -ENOMEM;
+
+	ret = devlink_nl_trap_policer_fill(reply, devlink, item,
+					   DEVLINK_CMD_TRAP_POLICER_NEW,
+					   info->snd_portid, info->snd_seq, 0);
+	if (ret) {
+		/* The reply was never handed to netlink; release it here. */
+		nlmsg_free(reply);
+		return ret;
+	}
+
+	return genlmsg_reply(reply, info);
+}
+
+/*
+ * Dump the trap policers of one devlink instance into @msg.
+ *
+ * Entries below the saved state->idx are skipped. When an entry cannot be
+ * filled, its position is stored in state->idx and the error is returned so
+ * that a later call can resume from that entry.
+ */
+static int devlink_nl_trap_policer_get_dump_one(struct sk_buff *msg,
+						struct devlink *devlink,
+						struct netlink_callback *cb,
+						int flags)
+{
+	struct devlink_nl_dump_state *state = devlink_dump_state(cb);
+	struct devlink_trap_policer_item *item;
+	int pos = 0;
+	int ret = 0;
+
+	list_for_each_entry(item, &devlink->trap_policer_list, list) {
+		if (pos >= state->idx) {
+			ret = devlink_nl_trap_policer_fill(msg, devlink, item,
+							   DEVLINK_CMD_TRAP_POLICER_NEW,
+							   NETLINK_CB(cb->skb).portid,
+							   cb->nlh->nlmsg_seq,
+							   flags);
+			if (ret) {
+				state->idx = pos;
+				break;
+			}
+		}
+		pos++;
+	}
+
+	return ret;
+}
+
+/* Netlink dumpit callback for trap policers: hands the per-devlink walker
+ * devlink_nl_trap_policer_get_dump_one() to the common devlink_nl_dumpit().
+ */
+int devlink_nl_trap_policer_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ return devlink_nl_dumpit(skb, cb, devlink_nl_trap_policer_get_dump_one);
+}
+
+/*
+ * Validate and apply a new rate and/or burst size for a trap policer.
+ *
+ * A value not present in the request keeps its current setting. Both values
+ * are checked against the limits advertised by the policer before the driver
+ * is called; the cached values are updated only if the driver accepts them.
+ */
+static int
+devlink_trap_policer_set(struct devlink *devlink,
+			 struct devlink_trap_policer_item *policer_item,
+			 struct genl_info *info)
+{
+	const struct devlink_trap_policer *policer = policer_item->policer;
+	struct netlink_ext_ack *extack = info->extack;
+	struct nlattr **attrs = info->attrs;
+	u64 new_rate = policer_item->rate;
+	u64 new_burst = policer_item->burst;
+	int ret;
+
+	if (attrs[DEVLINK_ATTR_TRAP_POLICER_RATE])
+		new_rate = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_RATE]);
+
+	if (attrs[DEVLINK_ATTR_TRAP_POLICER_BURST])
+		new_burst = nla_get_u64(attrs[DEVLINK_ATTR_TRAP_POLICER_BURST]);
+
+	if (new_rate < policer->min_rate) {
+		NL_SET_ERR_MSG(extack, "Policer rate lower than limit");
+		return -EINVAL;
+	}
+
+	if (new_rate > policer->max_rate) {
+		NL_SET_ERR_MSG(extack, "Policer rate higher than limit");
+		return -EINVAL;
+	}
+
+	if (new_burst < policer->min_burst) {
+		NL_SET_ERR_MSG(extack, "Policer burst size lower than limit");
+		return -EINVAL;
+	}
+
+	if (new_burst > policer->max_burst) {
+		NL_SET_ERR_MSG(extack, "Policer burst size higher than limit");
+		return -EINVAL;
+	}
+
+	ret = devlink->ops->trap_policer_set(devlink, policer, new_rate,
+					     new_burst, extack);
+	if (ret)
+		return ret;
+
+	policer_item->rate = new_rate;
+	policer_item->burst = new_burst;
+
+	return 0;
+}
+
+/*
+ * Generic netlink doit handler that changes a trap policer's parameters.
+ *
+ * Returns -EOPNOTSUPP when the device has no policers or the driver cannot
+ * set them, and -ENOENT when the requested policer is unknown.
+ */
+int devlink_nl_trap_policer_set_doit(struct sk_buff *skb,
+				     struct genl_info *info)
+{
+	struct devlink *devlink = info->user_ptr[0];
+	struct netlink_ext_ack *extack = info->extack;
+	struct devlink_trap_policer_item *item;
+
+	if (list_empty(&devlink->trap_policer_list) ||
+	    !devlink->ops->trap_policer_set)
+		return -EOPNOTSUPP;
+
+	item = devlink_trap_policer_item_get_from_info(devlink, info);
+	if (!item) {
+		NL_SET_ERR_MSG(extack, "Device did not register this trap policer");
+		return -ENOENT;
+	}
+
+	return devlink_trap_policer_set(devlink, item, info);
+}
+
+/* Build one devlink_trap_generic[] entry from a generic trap ID suffix and a
+ * trap type suffix; the id and name come from the matching
+ * DEVLINK_TRAP_GENERIC_ID_* / DEVLINK_TRAP_GENERIC_NAME_* definitions.
+ */
+#define DEVLINK_TRAP(_id, _type) \
+ { \
+ .type = DEVLINK_TRAP_TYPE_##_type, \
+ .id = DEVLINK_TRAP_GENERIC_ID_##_id, \
+ .name = DEVLINK_TRAP_GENERIC_NAME_##_id, \
+ }
+
+/* Reference table of generic packet traps.
+ *
+ * devlink_trap_generic_verify() indexes this array directly with trap->id,
+ * so each entry is expected to sit at the index equal to its id: do not
+ * reorder entries, and append new ones in ID order.
+ */
+static const struct devlink_trap devlink_trap_generic[] = {
+ DEVLINK_TRAP(SMAC_MC, DROP),
+ DEVLINK_TRAP(VLAN_TAG_MISMATCH, DROP),
+ DEVLINK_TRAP(INGRESS_VLAN_FILTER, DROP),
+ DEVLINK_TRAP(INGRESS_STP_FILTER, DROP),
+ DEVLINK_TRAP(EMPTY_TX_LIST, DROP),
+ DEVLINK_TRAP(PORT_LOOPBACK_FILTER, DROP),
+ DEVLINK_TRAP(BLACKHOLE_ROUTE, DROP),
+ DEVLINK_TRAP(TTL_ERROR, EXCEPTION),
+ DEVLINK_TRAP(TAIL_DROP, DROP),
+ DEVLINK_TRAP(NON_IP_PACKET, DROP),
+ DEVLINK_TRAP(UC_DIP_MC_DMAC, DROP),
+ DEVLINK_TRAP(DIP_LB, DROP),
+ DEVLINK_TRAP(SIP_MC, DROP),
+ DEVLINK_TRAP(SIP_LB, DROP),
+ DEVLINK_TRAP(CORRUPTED_IP_HDR, DROP),
+ DEVLINK_TRAP(IPV4_SIP_BC, DROP),
+ DEVLINK_TRAP(IPV6_MC_DIP_RESERVED_SCOPE, DROP),
+ DEVLINK_TRAP(IPV6_MC_DIP_INTERFACE_LOCAL_SCOPE, DROP),
+ DEVLINK_TRAP(MTU_ERROR, EXCEPTION),
+ DEVLINK_TRAP(UNRESOLVED_NEIGH, EXCEPTION),
+ DEVLINK_TRAP(RPF, EXCEPTION),
+ DEVLINK_TRAP(REJECT_ROUTE, EXCEPTION),
+ DEVLINK_TRAP(IPV4_LPM_UNICAST_MISS, EXCEPTION),
+ DEVLINK_TRAP(IPV6_LPM_UNICAST_MISS, EXCEPTION),
+ DEVLINK_TRAP(NON_ROUTABLE, DROP),
+ DEVLINK_TRAP(DECAP_ERROR, EXCEPTION),
+ DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP),
+ DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP),
+ DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP),
+ DEVLINK_TRAP(STP, CONTROL),
+ DEVLINK_TRAP(LACP, CONTROL),
+ DEVLINK_TRAP(LLDP, CONTROL),
+ DEVLINK_TRAP(IGMP_QUERY, CONTROL),
+ DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL),
+ DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL),
+ DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL),
+ DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL),
+ DEVLINK_TRAP(MLD_QUERY, CONTROL),
+ DEVLINK_TRAP(MLD_V1_REPORT, CONTROL),
+ DEVLINK_TRAP(MLD_V2_REPORT, CONTROL),
+ DEVLINK_TRAP(MLD_V1_DONE, CONTROL),
+ DEVLINK_TRAP(IPV4_DHCP, CONTROL),
+ DEVLINK_TRAP(IPV6_DHCP, CONTROL),
+ DEVLINK_TRAP(ARP_REQUEST, CONTROL),
+ DEVLINK_TRAP(ARP_RESPONSE, CONTROL),
+ DEVLINK_TRAP(ARP_OVERLAY, CONTROL),
+ DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL),
+ DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL),
+ DEVLINK_TRAP(IPV4_BFD, CONTROL),
+ DEVLINK_TRAP(IPV6_BFD, CONTROL),
+ DEVLINK_TRAP(IPV4_OSPF, CONTROL),
+ DEVLINK_TRAP(IPV6_OSPF, CONTROL),
+ DEVLINK_TRAP(IPV4_BGP, CONTROL),
+ DEVLINK_TRAP(IPV6_BGP, CONTROL),
+ DEVLINK_TRAP(IPV4_VRRP, CONTROL),
+ DEVLINK_TRAP(IPV6_VRRP, CONTROL),
+ DEVLINK_TRAP(IPV4_PIM, CONTROL),
+ DEVLINK_TRAP(IPV6_PIM, CONTROL),
+ DEVLINK_TRAP(UC_LB, CONTROL),
+ DEVLINK_TRAP(LOCAL_ROUTE, CONTROL),
+ DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL),
+ DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL),
+ DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL),
+ DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL),
+ DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL),
+ DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL),
+ DEVLINK_TRAP(IPV6_REDIRECT, CONTROL),
+ DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL),
+ DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL),
+ DEVLINK_TRAP(PTP_EVENT, CONTROL),
+ DEVLINK_TRAP(PTP_GENERAL, CONTROL),
+ DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL),
+ DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL),
+ DEVLINK_TRAP(EARLY_DROP, DROP),
+ DEVLINK_TRAP(VXLAN_PARSING, DROP),
+ DEVLINK_TRAP(LLC_SNAP_PARSING, DROP),
+ DEVLINK_TRAP(VLAN_PARSING, DROP),
+ DEVLINK_TRAP(PPPOE_PPP_PARSING, DROP),
+ DEVLINK_TRAP(MPLS_PARSING, DROP),
+ DEVLINK_TRAP(ARP_PARSING, DROP),
+ DEVLINK_TRAP(IP_1_PARSING, DROP),
+ DEVLINK_TRAP(IP_N_PARSING, DROP),
+ DEVLINK_TRAP(GRE_PARSING, DROP),
+ DEVLINK_TRAP(UDP_PARSING, DROP),
+ DEVLINK_TRAP(TCP_PARSING, DROP),
+ DEVLINK_TRAP(IPSEC_PARSING, DROP),
+ DEVLINK_TRAP(SCTP_PARSING, DROP),
+ DEVLINK_TRAP(DCCP_PARSING, DROP),
+ DEVLINK_TRAP(GTP_PARSING, DROP),
+ DEVLINK_TRAP(ESP_PARSING, DROP),
+ DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP),
+ DEVLINK_TRAP(DMAC_FILTER, DROP),
+ DEVLINK_TRAP(EAPOL, CONTROL),
+ DEVLINK_TRAP(LOCKED_PORT, DROP),
+};
+
+/* Build one devlink_trap_group_generic[] entry from a generic group ID
+ * suffix; the id and name come from the matching
+ * DEVLINK_TRAP_GROUP_GENERIC_ID_* / DEVLINK_TRAP_GROUP_GENERIC_NAME_*
+ * definitions.
+ */
+#define DEVLINK_TRAP_GROUP(_id) \
+ { \
+ .id = DEVLINK_TRAP_GROUP_GENERIC_ID_##_id, \
+ .name = DEVLINK_TRAP_GROUP_GENERIC_NAME_##_id, \
+ }
+
+/* Reference table of generic packet trap groups.
+ *
+ * devlink_trap_group_generic_verify() indexes this array directly with
+ * group->id, so each entry is expected to sit at the index equal to its id:
+ * do not reorder entries, and append new ones in ID order.
+ */
+static const struct devlink_trap_group devlink_trap_group_generic[] = {
+ DEVLINK_TRAP_GROUP(L2_DROPS),
+ DEVLINK_TRAP_GROUP(L3_DROPS),
+ DEVLINK_TRAP_GROUP(L3_EXCEPTIONS),
+ DEVLINK_TRAP_GROUP(BUFFER_DROPS),
+ DEVLINK_TRAP_GROUP(TUNNEL_DROPS),
+ DEVLINK_TRAP_GROUP(ACL_DROPS),
+ DEVLINK_TRAP_GROUP(STP),
+ DEVLINK_TRAP_GROUP(LACP),
+ DEVLINK_TRAP_GROUP(LLDP),
+ DEVLINK_TRAP_GROUP(MC_SNOOPING),
+ DEVLINK_TRAP_GROUP(DHCP),
+ DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY),
+ DEVLINK_TRAP_GROUP(BFD),
+ DEVLINK_TRAP_GROUP(OSPF),
+ DEVLINK_TRAP_GROUP(BGP),
+ DEVLINK_TRAP_GROUP(VRRP),
+ DEVLINK_TRAP_GROUP(PIM),
+ DEVLINK_TRAP_GROUP(UC_LB),
+ DEVLINK_TRAP_GROUP(LOCAL_DELIVERY),
+ DEVLINK_TRAP_GROUP(EXTERNAL_DELIVERY),
+ DEVLINK_TRAP_GROUP(IPV6),
+ DEVLINK_TRAP_GROUP(PTP_EVENT),
+ DEVLINK_TRAP_GROUP(PTP_GENERAL),
+ DEVLINK_TRAP_GROUP(ACL_SAMPLE),
+ DEVLINK_TRAP_GROUP(ACL_TRAP),
+ DEVLINK_TRAP_GROUP(PARSER_ERROR_DROPS),
+ DEVLINK_TRAP_GROUP(EAPOL),
+};
+
+/*
+ * Check that a trap flagged as generic matches its reference entry: the ID
+ * must be within the generic range and both name and type must equal the
+ * values in devlink_trap_generic[]. Returns 0 or -EINVAL.
+ */
+static int devlink_trap_generic_verify(const struct devlink_trap *trap)
+{
+	const struct devlink_trap *ref;
+
+	if (trap->id > DEVLINK_TRAP_GENERIC_ID_MAX)
+		return -EINVAL;
+
+	ref = &devlink_trap_generic[trap->id];
+	if (strcmp(trap->name, ref->name) || trap->type != ref->type)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Check a driver-specific trap: its ID must lie above the generic range
+ * (-EINVAL otherwise) and its name must not collide with any generic trap
+ * name (-EEXIST otherwise).
+ */
+static int devlink_trap_driver_verify(const struct devlink_trap *trap)
+{
+	size_t n;
+
+	if (trap->id <= DEVLINK_TRAP_GENERIC_ID_MAX)
+		return -EINVAL;
+
+	for (n = 0; n < ARRAY_SIZE(devlink_trap_generic); n++) {
+		if (strcmp(trap->name, devlink_trap_generic[n].name) == 0)
+			return -EEXIST;
+	}
+
+	return 0;
+}
+
+/*
+ * Validate a trap definition before registration. A missing trap or trap
+ * name is rejected with -EINVAL; otherwise the generic or driver-specific
+ * check is applied depending on trap->generic.
+ */
+static int devlink_trap_verify(const struct devlink_trap *trap)
+{
+	if (!trap || !trap->name)
+		return -EINVAL;
+
+	return trap->generic ? devlink_trap_generic_verify(trap) :
+			       devlink_trap_driver_verify(trap);
+}
+
+/*
+ * Check that a group flagged as generic matches its reference entry: the ID
+ * must be within the generic range and the name must equal the one in
+ * devlink_trap_group_generic[]. Returns 0 or -EINVAL.
+ */
+static int
+devlink_trap_group_generic_verify(const struct devlink_trap_group *group)
+{
+	const struct devlink_trap_group *ref;
+
+	if (group->id > DEVLINK_TRAP_GROUP_GENERIC_ID_MAX)
+		return -EINVAL;
+
+	ref = &devlink_trap_group_generic[group->id];
+
+	return strcmp(group->name, ref->name) ? -EINVAL : 0;
+}
+
+/*
+ * Check a driver-specific trap group: its ID must lie above the generic
+ * range (-EINVAL otherwise) and its name must not collide with any generic
+ * group name (-EEXIST otherwise).
+ */
+static int
+devlink_trap_group_driver_verify(const struct devlink_trap_group *group)
+{
+	size_t n;
+
+	if (group->id <= DEVLINK_TRAP_GROUP_GENERIC_ID_MAX)
+		return -EINVAL;
+
+	for (n = 0; n < ARRAY_SIZE(devlink_trap_group_generic); n++) {
+		if (strcmp(group->name, devlink_trap_group_generic[n].name) == 0)
+			return -EEXIST;
+	}
+
+	return 0;
+}
+
+/*
+ * Validate a trap group definition before registration by applying the
+ * generic or driver-specific check, depending on group->generic.
+ */
+static int devlink_trap_group_verify(const struct devlink_trap_group *group)
+{
+	return group->generic ? devlink_trap_group_generic_verify(group) :
+				devlink_trap_group_driver_verify(group);
+}
+
+/*
+ * Send a trap group NEW/DEL notification.
+ *
+ * Nothing is sent when the devlink instance is not registered or no
+ * notification is needed. Allocation and fill failures are silently
+ * dropped: the notification is best effort.
+ */
+static void
+devlink_trap_group_notify(struct devlink *devlink,
+			  const struct devlink_trap_group_item *group_item,
+			  enum devlink_command cmd)
+{
+	struct sk_buff *skb;
+
+	WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_GROUP_NEW &&
+		     cmd != DEVLINK_CMD_TRAP_GROUP_DEL);
+
+	if (!devl_is_registered(devlink))
+		return;
+	if (!devlink_nl_notify_need(devlink))
+		return;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return;
+
+	if (devlink_nl_trap_group_fill(skb, devlink, group_item, cmd,
+				       0, 0, 0)) {
+		nlmsg_free(skb);
+		return;
+	}
+
+	devlink_nl_notify_send(devlink, skb);
+}
+
+/* Emit a TRAP_GROUP_NEW notification for every group, in list order. */
+void devlink_trap_groups_notify_register(struct devlink *devlink)
+{
+	struct devlink_trap_group_item *item;
+
+	list_for_each_entry(item, &devlink->trap_group_list, list) {
+		devlink_trap_group_notify(devlink, item,
+					  DEVLINK_CMD_TRAP_GROUP_NEW);
+	}
+}
+
+/* Emit a TRAP_GROUP_DEL notification for every group, in reverse list order. */
+void devlink_trap_groups_notify_unregister(struct devlink *devlink)
+{
+	struct devlink_trap_group_item *item;
+
+	list_for_each_entry_reverse(item, &devlink->trap_group_list, list) {
+		devlink_trap_group_notify(devlink, item,
+					  DEVLINK_CMD_TRAP_GROUP_DEL);
+	}
+}
+
+/*
+ * Attach a trap item to the group named by its trap's init_group_id.
+ * Warns once and returns -EINVAL if no such group has been registered.
+ */
+static int
+devlink_trap_item_group_link(struct devlink *devlink,
+			     struct devlink_trap_item *trap_item)
+{
+	struct devlink_trap_group_item *owner;
+	u16 id = trap_item->trap->init_group_id;
+
+	owner = devlink_trap_group_item_lookup_by_id(devlink, id);
+	if (WARN_ON_ONCE(!owner))
+		return -EINVAL;
+
+	trap_item->group_item = owner;
+
+	return 0;
+}
+
+/*
+ * Send a trap NEW/DEL notification.
+ *
+ * Nothing is sent when the devlink instance is not registered or no
+ * notification is needed. Allocation and fill failures are silently
+ * dropped: the notification is best effort.
+ */
+static void devlink_trap_notify(struct devlink *devlink,
+				const struct devlink_trap_item *trap_item,
+				enum devlink_command cmd)
+{
+	struct sk_buff *skb;
+
+	WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_NEW &&
+		     cmd != DEVLINK_CMD_TRAP_DEL);
+
+	if (!devl_is_registered(devlink))
+		return;
+	if (!devlink_nl_notify_need(devlink))
+		return;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return;
+
+	if (devlink_nl_trap_fill(skb, devlink, trap_item, cmd, 0, 0, 0)) {
+		nlmsg_free(skb);
+		return;
+	}
+
+	devlink_nl_notify_send(devlink, skb);
+}
+
+/* Emit a TRAP_NEW notification for every trap, in list order. */
+void devlink_traps_notify_register(struct devlink *devlink)
+{
+	struct devlink_trap_item *item;
+
+	list_for_each_entry(item, &devlink->trap_list, list) {
+		devlink_trap_notify(devlink, item, DEVLINK_CMD_TRAP_NEW);
+	}
+}
+
+/* Emit a TRAP_DEL notification for every trap, in reverse list order. */
+void devlink_traps_notify_unregister(struct devlink *devlink)
+{
+	struct devlink_trap_item *item;
+
+	list_for_each_entry_reverse(item, &devlink->trap_list, list) {
+		devlink_trap_notify(devlink, item, DEVLINK_CMD_TRAP_DEL);
+	}
+}
+
+/*
+ * Register a single packet trap.
+ *
+ * Allocates the trap item and its per-CPU statistics, links the item to its
+ * initial group, lets the driver initialize the trap and finally publishes
+ * the item on devlink->trap_list and notifies user space.
+ *
+ * Returns -EEXIST if a trap with the same name is already registered,
+ * -ENOMEM on allocation failure, or the error from group linking or the
+ * driver's trap_init().
+ */
+static int
+devlink_trap_register(struct devlink *devlink,
+		      const struct devlink_trap *trap, void *priv)
+{
+	struct devlink_trap_item *item;
+	int ret;
+
+	if (devlink_trap_item_lookup(devlink, trap->name))
+		return -EEXIST;
+
+	item = kzalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	item->stats = netdev_alloc_pcpu_stats(struct devlink_stats);
+	if (!item->stats) {
+		ret = -ENOMEM;
+		goto free_item;
+	}
+
+	item->trap = trap;
+	item->action = trap->init_action;
+	item->priv = priv;
+
+	ret = devlink_trap_item_group_link(devlink, item);
+	if (ret)
+		goto free_stats;
+
+	ret = devlink->ops->trap_init(devlink, trap, item);
+	if (ret)
+		goto free_stats;
+
+	list_add_tail(&item->list, &devlink->trap_list);
+	devlink_trap_notify(devlink, item, DEVLINK_CMD_TRAP_NEW);
+
+	return 0;
+
+free_stats:
+	free_percpu(item->stats);
+free_item:
+	kfree(item);
+	return ret;
+}
+
+/*
+ * Unregister a single packet trap: notify user space, unlink the item, give
+ * the driver a chance to clean up via the optional trap_fini() and release
+ * the item. Warns once and does nothing if the trap is not registered.
+ */
+static void devlink_trap_unregister(struct devlink *devlink,
+				    const struct devlink_trap *trap)
+{
+	struct devlink_trap_item *item;
+
+	item = devlink_trap_item_lookup(devlink, trap->name);
+	if (WARN_ON_ONCE(!item))
+		return;
+
+	devlink_trap_notify(devlink, item, DEVLINK_CMD_TRAP_DEL);
+	list_del(&item->list);
+
+	if (devlink->ops->trap_fini)
+		devlink->ops->trap_fini(devlink, trap, item);
+
+	free_percpu(item->stats);
+	kfree(item);
+}
+
+/*
+ * Switch a registered trap to the drop action, both in the driver and in the
+ * cached item state. The driver's return value is not checked. Warns once
+ * and does nothing if the trap is not registered.
+ */
+static void devlink_trap_disable(struct devlink *devlink,
+				 const struct devlink_trap *trap)
+{
+	struct devlink_trap_item *item;
+
+	item = devlink_trap_item_lookup(devlink, trap->name);
+	if (WARN_ON_ONCE(!item))
+		return;
+
+	devlink->ops->trap_action_set(devlink, trap, DEVLINK_TRAP_ACTION_DROP,
+				      NULL);
+	item->action = DEVLINK_TRAP_ACTION_DROP;
+}
+
+/**
+ * devl_traps_register - Register packet traps with devlink.
+ * @devlink: devlink.
+ * @traps: Packet traps.
+ * @traps_count: Count of provided packet traps.
+ * @priv: Driver private information.
+ *
+ * The driver must implement both trap_init() and trap_action_set(). Traps are
+ * verified and registered in array order; if one of them fails, the traps
+ * registered so far by this call are unregistered again in reverse order.
+ *
+ * Return: Non-zero value on failure.
+ */
+int devl_traps_register(struct devlink *devlink,
+			const struct devlink_trap *traps,
+			size_t traps_count, void *priv)
+{
+	int i, err;
+
+	if (!devlink->ops->trap_init || !devlink->ops->trap_action_set)
+		return -EINVAL;
+
+	devl_assert_locked(devlink);
+	for (i = 0; i < traps_count; i++) {
+		err = devlink_trap_verify(&traps[i]);
+		if (!err)
+			err = devlink_trap_register(devlink, &traps[i], priv);
+		if (err)
+			goto rollback;
+	}
+
+	return 0;
+
+rollback:
+	/* traps[i] itself was not registered; undo only the earlier ones. */
+	while (--i >= 0)
+		devlink_trap_unregister(devlink, &traps[i]);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devl_traps_register);
+
+/**
+ * devlink_traps_register - Register packet traps with devlink.
+ * @devlink: devlink.
+ * @traps: Packet traps.
+ * @traps_count: Count of provided packet traps.
+ * @priv: Driver private information.
+ *
+ * Locked wrapper around devl_traps_register().
+ *
+ * Context: Takes and releases devlink->lock <mutex>.
+ *
+ * Return: Non-zero value on failure.
+ */
+int devlink_traps_register(struct devlink *devlink,
+			   const struct devlink_trap *traps,
+			   size_t traps_count, void *priv)
+{
+	int ret;
+
+	devl_lock(devlink);
+	ret = devl_traps_register(devlink, traps, traps_count, priv);
+	devl_unlock(devlink);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(devlink_traps_register);
+
+/**
+ * devl_traps_unregister - Unregister packet traps from devlink.
+ * @devlink: devlink.
+ * @traps: Packet traps.
+ * @traps_count: Count of provided packet traps.
+ *
+ * Works in two passes over @traps, both in reverse order: first every trap
+ * is disabled, then, after an RCU grace period, every trap is unregistered.
+ */
+void devl_traps_unregister(struct devlink *devlink,
+			   const struct devlink_trap *traps,
+			   size_t traps_count)
+{
+	int idx;
+
+	devl_assert_locked(devlink);
+
+	/* No packet may still be in flight when a trap goes away, so disable
+	 * all traps first and wait for a grace period before tearing down.
+	 */
+	for (idx = traps_count - 1; idx >= 0; idx--)
+		devlink_trap_disable(devlink, &traps[idx]);
+
+	synchronize_rcu();
+
+	for (idx = traps_count - 1; idx >= 0; idx--)
+		devlink_trap_unregister(devlink, &traps[idx]);
+}
+EXPORT_SYMBOL_GPL(devl_traps_unregister);
+
+/**
+ * devlink_traps_unregister - Unregister packet traps from devlink.
+ * @devlink: devlink.
+ * @traps: Packet traps.
+ * @traps_count: Count of provided packet traps.
+ *
+ * Locked wrapper around devl_traps_unregister().
+ *
+ * Context: Takes and releases devlink->lock <mutex>.
+ */
+void devlink_traps_unregister(struct devlink *devlink,
+ const struct devlink_trap *traps,
+ size_t traps_count)
+{
+ devl_lock(devlink);
+ devl_traps_unregister(devlink, traps, traps_count);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_traps_unregister);
+
+/*
+ * Account one packet of @skb_len bytes in the current CPU's copy of
+ * @trap_stats, inside a u64_stats update section.
+ */
+static void
+devlink_trap_stats_update(struct devlink_stats __percpu *trap_stats,
+			  size_t skb_len)
+{
+	struct devlink_stats *cpu_stats = this_cpu_ptr(trap_stats);
+
+	u64_stats_update_begin(&cpu_stats->syncp);
+	u64_stats_add(&cpu_stats->rx_bytes, skb_len);
+	u64_stats_inc(&cpu_stats->rx_packets);
+	u64_stats_update_end(&cpu_stats->syncp);
+}
+
+/*
+ * Populate the metadata handed to the trap report tracepoint.
+ *
+ * input_dev is written only when the input port is currently of Ethernet
+ * type; the port type is sampled under the port's type_lock. Otherwise the
+ * field keeps whatever value the caller initialized it to.
+ */
+static void
+devlink_trap_report_metadata_set(struct devlink_trap_metadata *metadata,
+				 const struct devlink_trap_item *trap_item,
+				 struct devlink_port *in_devlink_port,
+				 const struct flow_action_cookie *fa_cookie)
+{
+	const struct devlink_trap *trap = trap_item->trap;
+
+	metadata->trap_name = trap->name;
+	metadata->trap_type = trap->type;
+	metadata->trap_group_name = trap_item->group_item->group->name;
+	metadata->fa_cookie = fa_cookie;
+
+	spin_lock(&in_devlink_port->type_lock);
+	if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH)
+		metadata->input_dev = in_devlink_port->type_eth.netdev;
+	spin_unlock(&in_devlink_port->type_lock);
+}
+
+/**
+ * devlink_trap_report - Report trapped packet to drop monitor.
+ * @devlink: devlink.
+ * @skb: Trapped packet.
+ * @trap_ctx: Trap context.
+ * @in_devlink_port: Input devlink port.
+ * @fa_cookie: Flow action cookie. Could be NULL.
+ *
+ * Updates the per-trap and per-group statistics and, only when the
+ * devlink_trap_report tracepoint is enabled, builds the metadata and fires
+ * the tracepoint.
+ */
+void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb,
+			 void *trap_ctx, struct devlink_port *in_devlink_port,
+			 const struct flow_action_cookie *fa_cookie)
+{
+	struct devlink_trap_item *item = trap_ctx;
+
+	devlink_trap_stats_update(item->stats, skb->len);
+	devlink_trap_stats_update(item->group_item->stats, skb->len);
+
+	if (tracepoint_enabled(devlink_trap_report)) {
+		struct devlink_trap_metadata md = {};
+
+		devlink_trap_report_metadata_set(&md, item, in_devlink_port,
+						 fa_cookie);
+		trace_devlink_trap_report(devlink, skb, &md);
+	}
+}
+EXPORT_SYMBOL_GPL(devlink_trap_report);
+
+/**
+ * devlink_trap_ctx_priv - Trap context to driver private information.
+ * @trap_ctx: Trap context.
+ *
+ * The trap context is the trap item; this returns its priv pointer.
+ *
+ * Return: Driver private information passed during registration.
+ */
+void *devlink_trap_ctx_priv(void *trap_ctx)
+{
+	const struct devlink_trap_item *item = trap_ctx;
+
+	return item->priv;
+}
+EXPORT_SYMBOL_GPL(devlink_trap_ctx_priv);
+
+/*
+ * Bind a trap group item to the policer named by its group's
+ * init_policer_id. An ID of zero leaves the group without a policer. Warns
+ * once and returns -EINVAL if the policer has not been registered.
+ */
+static int
+devlink_trap_group_item_policer_link(struct devlink *devlink,
+				     struct devlink_trap_group_item *group_item)
+{
+	struct devlink_trap_policer_item *policer;
+	u32 id = group_item->group->init_policer_id;
+
+	if (!id)
+		return 0;
+
+	policer = devlink_trap_policer_item_lookup(devlink, id);
+	if (WARN_ON_ONCE(!policer))
+		return -EINVAL;
+
+	group_item->policer_item = policer;
+
+	return 0;
+}
+
+/*
+ * Register a single packet trap group.
+ *
+ * Allocates the group item and its per-CPU statistics, binds the initial
+ * policer, calls the driver's optional trap_group_init() and finally
+ * publishes the item on devlink->trap_group_list and notifies user space.
+ *
+ * Returns -EEXIST if a group with the same name is already registered,
+ * -ENOMEM on allocation failure, or the error from policer linking or the
+ * driver callback.
+ */
+static int
+devlink_trap_group_register(struct devlink *devlink,
+			    const struct devlink_trap_group *group)
+{
+	struct devlink_trap_group_item *item;
+	int ret;
+
+	if (devlink_trap_group_item_lookup(devlink, group->name))
+		return -EEXIST;
+
+	item = kzalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	item->stats = netdev_alloc_pcpu_stats(struct devlink_stats);
+	if (!item->stats) {
+		ret = -ENOMEM;
+		goto free_item;
+	}
+
+	item->group = group;
+
+	ret = devlink_trap_group_item_policer_link(devlink, item);
+	if (ret)
+		goto free_stats;
+
+	if (devlink->ops->trap_group_init) {
+		ret = devlink->ops->trap_group_init(devlink, group);
+		if (ret)
+			goto free_stats;
+	}
+
+	list_add_tail(&item->list, &devlink->trap_group_list);
+	devlink_trap_group_notify(devlink, item, DEVLINK_CMD_TRAP_GROUP_NEW);
+
+	return 0;
+
+free_stats:
+	free_percpu(item->stats);
+free_item:
+	kfree(item);
+	return ret;
+}
+
+/*
+ * Unregister a single packet trap group: notify user space, unlink the item
+ * and release it together with its statistics. Warns once and does nothing
+ * if the group is not registered.
+ */
+static void
+devlink_trap_group_unregister(struct devlink *devlink,
+			      const struct devlink_trap_group *group)
+{
+	struct devlink_trap_group_item *item;
+
+	item = devlink_trap_group_item_lookup(devlink, group->name);
+	if (WARN_ON_ONCE(!item))
+		return;
+
+	devlink_trap_group_notify(devlink, item, DEVLINK_CMD_TRAP_GROUP_DEL);
+	list_del(&item->list);
+
+	free_percpu(item->stats);
+	kfree(item);
+}
+
+/**
+ * devl_trap_groups_register - Register packet trap groups with devlink.
+ * @devlink: devlink.
+ * @groups: Packet trap groups.
+ * @groups_count: Count of provided packet trap groups.
+ *
+ * Groups are verified and registered in array order; if one of them fails,
+ * the groups registered so far by this call are unregistered again in
+ * reverse order.
+ *
+ * Return: Non-zero value on failure.
+ */
+int devl_trap_groups_register(struct devlink *devlink,
+			      const struct devlink_trap_group *groups,
+			      size_t groups_count)
+{
+	int i, err;
+
+	devl_assert_locked(devlink);
+	for (i = 0; i < groups_count; i++) {
+		err = devlink_trap_group_verify(&groups[i]);
+		if (!err)
+			err = devlink_trap_group_register(devlink, &groups[i]);
+		if (err)
+			goto rollback;
+	}
+
+	return 0;
+
+rollback:
+	/* groups[i] itself was not registered; undo only the earlier ones. */
+	while (--i >= 0)
+		devlink_trap_group_unregister(devlink, &groups[i]);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devl_trap_groups_register);
+
+/**
+ * devlink_trap_groups_register - Register packet trap groups with devlink.
+ * @devlink: devlink.
+ * @groups: Packet trap groups.
+ * @groups_count: Count of provided packet trap groups.
+ *
+ * Locked wrapper around devl_trap_groups_register().
+ *
+ * Context: Takes and releases devlink->lock <mutex>.
+ *
+ * Return: Non-zero value on failure.
+ */
+int devlink_trap_groups_register(struct devlink *devlink,
+				 const struct devlink_trap_group *groups,
+				 size_t groups_count)
+{
+	int ret;
+
+	devl_lock(devlink);
+	ret = devl_trap_groups_register(devlink, groups, groups_count);
+	devl_unlock(devlink);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(devlink_trap_groups_register);
+
+/**
+ * devl_trap_groups_unregister - Unregister packet trap groups from devlink.
+ * @devlink: devlink.
+ * @groups: Packet trap groups.
+ * @groups_count: Count of provided packet trap groups.
+ *
+ * Groups are unregistered from the last array element to the first.
+ */
+void devl_trap_groups_unregister(struct devlink *devlink,
+				 const struct devlink_trap_group *groups,
+				 size_t groups_count)
+{
+	int idx;
+
+	devl_assert_locked(devlink);
+
+	for (idx = groups_count - 1; idx >= 0; idx--)
+		devlink_trap_group_unregister(devlink, &groups[idx]);
+}
+EXPORT_SYMBOL_GPL(devl_trap_groups_unregister);
+
+/**
+ * devlink_trap_groups_unregister - Unregister packet trap groups from devlink.
+ * @devlink: devlink.
+ * @groups: Packet trap groups.
+ * @groups_count: Count of provided packet trap groups.
+ *
+ * Locked wrapper around devl_trap_groups_unregister().
+ *
+ * Context: Takes and releases devlink->lock <mutex>.
+ */
+void devlink_trap_groups_unregister(struct devlink *devlink,
+ const struct devlink_trap_group *groups,
+ size_t groups_count)
+{
+ devl_lock(devlink);
+ devl_trap_groups_unregister(devlink, groups, groups_count);
+ devl_unlock(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_trap_groups_unregister);
+
+/*
+ * Send a trap policer NEW/DEL notification.
+ *
+ * Nothing is sent when the devlink instance is not registered or no
+ * notification is needed. Allocation and fill failures are silently
+ * dropped: the notification is best effort.
+ */
+static void
+devlink_trap_policer_notify(struct devlink *devlink,
+			    const struct devlink_trap_policer_item *policer_item,
+			    enum devlink_command cmd)
+{
+	struct sk_buff *skb;
+
+	WARN_ON_ONCE(cmd != DEVLINK_CMD_TRAP_POLICER_NEW &&
+		     cmd != DEVLINK_CMD_TRAP_POLICER_DEL);
+
+	if (!devl_is_registered(devlink))
+		return;
+	if (!devlink_nl_notify_need(devlink))
+		return;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	if (!skb)
+		return;
+
+	if (devlink_nl_trap_policer_fill(skb, devlink, policer_item, cmd,
+					 0, 0, 0)) {
+		nlmsg_free(skb);
+		return;
+	}
+
+	devlink_nl_notify_send(devlink, skb);
+}
+
+/* Emit a TRAP_POLICER_NEW notification for every policer, in list order. */
+void devlink_trap_policers_notify_register(struct devlink *devlink)
+{
+	struct devlink_trap_policer_item *item;
+
+	list_for_each_entry(item, &devlink->trap_policer_list, list) {
+		devlink_trap_policer_notify(devlink, item,
+					    DEVLINK_CMD_TRAP_POLICER_NEW);
+	}
+}
+
+/* Emit a TRAP_POLICER_DEL notification for every policer, in reverse list
+ * order.
+ */
+void devlink_trap_policers_notify_unregister(struct devlink *devlink)
+{
+	struct devlink_trap_policer_item *item;
+
+	list_for_each_entry_reverse(item, &devlink->trap_policer_list, list) {
+		devlink_trap_policer_notify(devlink, item,
+					    DEVLINK_CMD_TRAP_POLICER_DEL);
+	}
+}
+
+/*
+ * Register a single packet trap policer.
+ *
+ * Allocates the policer item seeded with the policer's initial rate and
+ * burst, calls the driver's optional trap_policer_init() and then publishes
+ * the item on devlink->trap_policer_list and notifies user space.
+ *
+ * Returns -EEXIST if the policer ID is already registered, -ENOMEM on
+ * allocation failure, or the error from the driver callback.
+ */
+static int
+devlink_trap_policer_register(struct devlink *devlink,
+			      const struct devlink_trap_policer *policer)
+{
+	struct devlink_trap_policer_item *item;
+	int ret;
+
+	if (devlink_trap_policer_item_lookup(devlink, policer->id))
+		return -EEXIST;
+
+	item = kzalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	item->policer = policer;
+	item->rate = policer->init_rate;
+	item->burst = policer->init_burst;
+
+	if (devlink->ops->trap_policer_init) {
+		ret = devlink->ops->trap_policer_init(devlink, policer);
+		if (ret) {
+			kfree(item);
+			return ret;
+		}
+	}
+
+	list_add_tail(&item->list, &devlink->trap_policer_list);
+	devlink_trap_policer_notify(devlink, item,
+				    DEVLINK_CMD_TRAP_POLICER_NEW);
+
+	return 0;
+}
+
+/*
+ * Unregister a single packet trap policer: notify user space, unlink the
+ * item, call the driver's optional trap_policer_fini() and free the item.
+ * Warns once and does nothing if the policer is not registered.
+ */
+static void
+devlink_trap_policer_unregister(struct devlink *devlink,
+				const struct devlink_trap_policer *policer)
+{
+	struct devlink_trap_policer_item *item;
+
+	item = devlink_trap_policer_item_lookup(devlink, policer->id);
+	if (WARN_ON_ONCE(!item))
+		return;
+
+	devlink_trap_policer_notify(devlink, item,
+				    DEVLINK_CMD_TRAP_POLICER_DEL);
+	list_del(&item->list);
+
+	if (devlink->ops->trap_policer_fini)
+		devlink->ops->trap_policer_fini(devlink, policer);
+
+	kfree(item);
+}
+
+/**
+ * devl_trap_policers_register - Register packet trap policers with devlink.
+ * @devlink: devlink.
+ * @policers: Packet trap policers.
+ * @policers_count: Count of provided packet trap policers.
+ *
+ * A policer with ID zero, or whose maximum rate or burst is below the
+ * corresponding minimum, triggers a warning and fails with -EINVAL. If any
+ * policer fails, the policers registered so far by this call are
+ * unregistered again in reverse order.
+ *
+ * Return: Non-zero value on failure.
+ */
+int
+devl_trap_policers_register(struct devlink *devlink,
+			    const struct devlink_trap_policer *policers,
+			    size_t policers_count)
+{
+	int i, err;
+
+	devl_assert_locked(devlink);
+	for (i = 0; i < policers_count; i++) {
+		const struct devlink_trap_policer *cur = &policers[i];
+
+		if (WARN_ON(!cur->id ||
+			    cur->min_rate > cur->max_rate ||
+			    cur->min_burst > cur->max_burst)) {
+			err = -EINVAL;
+			goto rollback;
+		}
+
+		err = devlink_trap_policer_register(devlink, cur);
+		if (err)
+			goto rollback;
+	}
+
+	return 0;
+
+rollback:
+	/* policers[i] itself was not registered; undo the earlier ones. */
+	while (--i >= 0)
+		devlink_trap_policer_unregister(devlink, &policers[i]);
+	return err;
+}
+EXPORT_SYMBOL_GPL(devl_trap_policers_register);
+
+/**
+ * devl_trap_policers_unregister - Unregister packet trap policers from devlink.
+ * @devlink: devlink.
+ * @policers: Packet trap policers.
+ * @policers_count: Count of provided packet trap policers.
+ *
+ * Policers are unregistered from the last array element to the first.
+ */
+void
+devl_trap_policers_unregister(struct devlink *devlink,
+			      const struct devlink_trap_policer *policers,
+			      size_t policers_count)
+{
+	int idx;
+
+	devl_assert_locked(devlink);
+
+	for (idx = policers_count - 1; idx >= 0; idx--)
+		devlink_trap_policer_unregister(devlink, &policers[idx]);
+}
+EXPORT_SYMBOL_GPL(devl_trap_policers_unregister);
diff --git a/net/devres.c b/net/devres.c
new file mode 100644
index 000000000000..5ccf6ca311dc
--- /dev/null
+++ b/net/devres.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * This file contains all networking devres helpers.
+ */
+
+#include <linux/device.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+
+/* devres payload used by the helpers in this file: the net_device that the
+ * release callbacks (free / unregister) act on.
+ */
+struct net_device_devres {
+ struct net_device *ndev;
+};
+
+/* devres release callback: free the net_device stored in the entry. */
+static void devm_free_netdev(struct device *dev, void *this)
+{
+	struct net_device_devres *entry = this;
+	struct net_device *ndev = entry->ndev;
+
+	free_netdev(ndev);
+}
+
+/*
+ * Resource managed alloc_etherdev_mqs(): the returned net_device is recorded
+ * in a devres entry of @dev whose release callback calls free_netdev().
+ *
+ * Returns the new net_device, or NULL if either the devres entry or the
+ * net_device cannot be allocated.
+ */
+struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
+					   unsigned int txqs, unsigned int rxqs)
+{
+	struct net_device_devres *entry;
+	struct net_device *ndev;
+
+	entry = devres_alloc(devm_free_netdev, sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return NULL;
+
+	ndev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs);
+	if (!ndev) {
+		devres_free(entry);
+		return NULL;
+	}
+
+	entry->ndev = ndev;
+	devres_add(dev, entry);
+
+	return ndev;
+}
+EXPORT_SYMBOL(devm_alloc_etherdev_mqs);
+
+/* devres release callback: unregister the net_device stored in the entry. */
+static void devm_unregister_netdev(struct device *dev, void *this)
+{
+	struct net_device_devres *entry = this;
+	struct net_device *ndev = entry->ndev;
+
+	unregister_netdev(ndev);
+}
+
+/* devres match callback: non-zero when the entry refers to the net_device
+ * passed as @match_data.
+ */
+static int netdev_devres_match(struct device *dev, void *this, void *match_data)
+{
+	const struct net_device_devres *entry = this;
+	const struct net_device *wanted = match_data;
+
+	return entry->ndev == wanted;
+}
+
+/**
+ * devm_register_netdev - resource managed variant of register_netdev()
+ * @dev: managing device for this netdev - usually the parent device
+ * @ndev: device to register
+ *
+ * This is a devres variant of register_netdev() for which the unregister
+ * function will be called automatically when the managing device is
+ * detached. Note: the net_device used must also be resource managed by
+ * the same struct device.
+ *
+ * Return: 0 on success, -EINVAL if @ndev is not managed by @dev, -ENOMEM if
+ * the devres entry cannot be allocated, or the register_netdev() error.
+ */
+int devm_register_netdev(struct device *dev, struct net_device *ndev)
+{
+	struct net_device_devres *dr;
+	int ret;
+
+	/* struct net_device must itself be managed. For now a managed netdev
+	 * can only be allocated by devm_alloc_etherdev_mqs() so the check is
+	 * straightforward.
+	 */
+	if (WARN_ON(!devres_find(dev, devm_free_netdev,
+				 netdev_devres_match, ndev)))
+		return -EINVAL;
+
+	dr = devres_alloc(devm_unregister_netdev, sizeof(*dr), GFP_KERNEL);
+	if (!dr)
+		return -ENOMEM;
+
+	ret = register_netdev(ndev);
+	if (ret) {
+		devres_free(dr);
+		return ret;
+	}
+
+	dr->ndev = ndev;
+	/* Attach the unregister action to @dev, the device that was verified
+	 * above to own the netdev allocation, rather than to ndev->dev.parent,
+	 * which may be a different device or not set at all.
+	 */
+	devres_add(dev, dr);
+
+	return 0;
+}
+EXPORT_SYMBOL(devm_register_netdev);
diff --git a/net/dns_resolver/Kconfig b/net/dns_resolver/Kconfig
index 50d49f7e0472..7c2dba273e35 100644
--- a/net/dns_resolver/Kconfig
+++ b/net/dns_resolver/Kconfig
@@ -1,9 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Configuration for DNS Resolver
#
config DNS_RESOLVER
tristate "DNS Resolver support"
- depends on NET && KEYS
+ depends on KEYS
help
Saying Y here will include support for the DNS Resolver key type
which can be used to make upcalls to perform DNS lookups in
@@ -18,10 +19,10 @@ config DNS_RESOLVER
SMB2 later. DNS Resolver is supported by the userspace upcall
helper "/sbin/dns.resolver" via /etc/request-key.conf.
- See <file:Documentation/networking/dns_resolver.txt> for further
+ See <file:Documentation/networking/dns_resolver.rst> for further
information.
To compile this as a module, choose M here: the module will be called
- dnsresolver.
+ dns_resolver.
If unsure, say N.
diff --git a/net/dns_resolver/Makefile b/net/dns_resolver/Makefile
index d5c13c2eb36d..877532d662d0 100644
--- a/net/dns_resolver/Makefile
+++ b/net/dns_resolver/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the Linux DNS Resolver.
#
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 7f4534828f6c..c42ddd85ff1f 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -1,6 +1,6 @@
/* Key type used to cache DNS lookups made by the kernel
*
- * See Documentation/networking/dns_resolver.txt
+ * See Documentation/networking/dns_resolver.rst
*
* Copyright (c) 2007 Igor Mammedov
* Author(s): Igor Mammedov (niallain@gmail.com)
@@ -29,6 +29,7 @@
#include <linux/keyctl.h>
#include <linux/err.h>
#include <linux/seq_file.h>
+#include <linux/dns_resolver.h>
#include <keys/dns_resolver-type.h>
#include <keys/user-type.h>
#include "internal.h"
@@ -48,14 +49,44 @@ const struct cred *dns_resolver_cache;
/*
* Preparse instantiation data for a dns_resolver key.
*
- * The data must be a NUL-terminated string, with the NUL char accounted in
- * datalen.
+ * For normal hostname lookups, the data must be a NUL-terminated string, with
+ * the NUL char accounted in datalen.
*
* If the data contains a '#' characters, then we take the clause after each
* one to be an option of the form 'key=value'. The actual data of interest is
* the string leading up to the first '#'. For instance:
*
* "ip1,ip2,...#foo=bar"
+ *
+ * For server list requests, the data must begin with a NUL char and be
+ * followed by a byte indicating the content type and then a byte indicating
+ * the version of the data format. Version 1 looks something like (note this
+ * is packed):
+ *
+ * u8 Non-string marker (ie. 0)
+ * u8 Content (DNS_PAYLOAD_IS_*)
+ * u8 Version (e.g. 1)
+ * u8 Source of server list
+ * u8 Lookup status of server list
+ * u8 Number of servers
+ * foreach-server {
+ * __le16 Name length
+ * __le16 Priority (as per SRV record, low first)
+ * __le16 Weight (as per SRV record, higher first)
+ * __le16 Port
+ * u8 Source of address list
+ * u8 Lookup status of address list
+ * u8 Protocol (DNS_SERVER_PROTOCOL_*)
+ * u8 Number of addresses
+ * char[] Name (not NUL-terminated)
+ * foreach-address {
+ * u8 Family (DNS_ADDRESS_IS_*)
+ * union {
+ * u8[4] ipv4_addr
+ * u8[16] ipv6_addr
+ * }
+ * }
+ * }
+ *
*/
static int
dns_resolver_preparse(struct key_preparsed_payload *prep)
@@ -66,9 +97,45 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
int datalen = prep->datalen, result_len = 0;
const char *data = prep->data, *end, *opt;
+ if (datalen <= 1 || !data)
+ return -EINVAL;
+
+ if (data[0] == 0) {
+ const struct dns_server_list_v1_header *v1;
+
+ /* It may be a server list. */
+ if (datalen < sizeof(*v1))
+ return -EINVAL;
+
+ v1 = (const struct dns_server_list_v1_header *)data;
+ kenter("[%u,%u],%u", v1->hdr.content, v1->hdr.version, datalen);
+ if (v1->hdr.content != DNS_PAYLOAD_IS_SERVER_LIST) {
+ pr_warn_ratelimited(
+ "dns_resolver: Unsupported content type (%u)\n",
+ v1->hdr.content);
+ return -EINVAL;
+ }
+
+ if (v1->hdr.version != 1) {
+ pr_warn_ratelimited(
+ "dns_resolver: Unsupported server list version (%u)\n",
+ v1->hdr.version);
+ return -EINVAL;
+ }
+
+ if ((v1->status != DNS_LOOKUP_GOOD &&
+ v1->status != DNS_LOOKUP_GOOD_WITH_BAD)) {
+ if (prep->expiry == TIME64_MAX)
+ prep->expiry = ktime_get_real_seconds() + 1;
+ }
+
+ result_len = datalen;
+ goto store_result;
+ }
+
kenter("'%*.*s',%u", datalen, datalen, data, datalen);
- if (datalen <= 1 || !data || data[datalen - 1] != '\0')
+ if (!data || data[datalen - 1] != '\0')
return -EINVAL;
datalen--;
@@ -144,6 +211,7 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
return 0;
}
+store_result:
kdebug("store result");
prep->quotalen = result_len;
@@ -241,7 +309,7 @@ static void dns_resolver_describe(const struct key *key, struct seq_file *m)
* - the key's semaphore is read-locked
*/
static long dns_resolver_read(const struct key *key,
- char __user *buffer, size_t buflen)
+ char *buffer, size_t buflen)
{
int err = PTR_ERR(key->payload.data[dns_key_error]);
@@ -253,6 +321,7 @@ static long dns_resolver_read(const struct key *key,
struct key_type key_type_dns_resolver = {
.name = "dns_resolver",
+ .flags = KEY_TYPE_NET_DOMAIN | KEY_TYPE_INSTANT_REAP,
.preparse = dns_resolver_preparse,
.free_preparse = dns_resolver_free_preparse,
.instantiate = generic_key_instantiate,
@@ -275,7 +344,7 @@ static int __init init_dns_resolver(void)
* this is used to prevent malicious redirections from being installed
* with add_key().
*/
- cred = prepare_kernel_cred(NULL);
+ cred = prepare_kernel_cred(&init_task);
if (!cred)
return -ENOMEM;
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index 49da67034f29..53da62984447 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -1,7 +1,7 @@
/* Upcall routine, designed to work as a key type and working through
* /sbin/request-key to contact userspace when handling DNS queries.
*
- * See Documentation/networking/dns_resolver.txt
+ * See Documentation/networking/dns_resolver.rst
*
* Copyright (c) 2007 Igor Mammedov
* Author(s): Igor Mammedov (niallain@gmail.com)
@@ -40,6 +40,7 @@
#include <linux/cred.h>
#include <linux/dns_resolver.h>
#include <linux/err.h>
+#include <net/net_namespace.h>
#include <keys/dns_resolver-type.h>
#include <keys/user-type.h>
@@ -48,12 +49,14 @@
/**
* dns_query - Query the DNS
+ * @net: The network namespace to operate in.
* @type: Query type (or NULL for straight host->IP lookup)
* @name: Name to look up
* @namelen: Length of name
* @options: Request options (or NULL if no options)
* @_result: Where to place the returned data (or NULL)
* @_expiry: Where to store the result expiry time (or NULL)
+ * @invalidate: Always invalidate the key after use
*
* The data will be returned in the pointer at *result, if provided, and the
* caller is responsible for freeing it.
@@ -68,12 +71,13 @@
*
* Returns the size of the result on success, -ve error code otherwise.
*/
-int dns_query(const char *type, const char *name, size_t namelen,
- const char *options, char **_result, time64_t *_expiry)
+int dns_query(struct net *net,
+ const char *type, const char *name, size_t namelen,
+ const char *options, char **_result, time64_t *_expiry,
+ bool invalidate)
{
struct key *rkey;
struct user_key_payload *upayload;
- const struct cred *saved_cred;
size_t typelen, desclen;
char *desc, *cp;
int ret, len;
@@ -94,8 +98,6 @@ int dns_query(const char *type, const char *name, size_t namelen,
desclen += typelen + 1;
}
- if (!namelen)
- namelen = strnlen(name, 256);
if (namelen < 3 || namelen > 255)
return -EINVAL;
desclen += namelen + 1;
@@ -121,9 +123,8 @@ int dns_query(const char *type, const char *name, size_t namelen,
/* make the upcall, using special credentials to prevent the use of
* add_key() to preinstall malicious redirections
*/
- saved_cred = override_creds(dns_resolver_cache);
- rkey = request_key(&key_type_dns_resolver, desc, options);
- revert_creds(saved_cred);
+ scoped_with_creds(dns_resolver_cache)
+ rkey = request_key_net(&key_type_dns_resolver, desc, net, options);
kfree(desc);
if (IS_ERR(rkey)) {
ret = PTR_ERR(rkey);
@@ -148,12 +149,9 @@ int dns_query(const char *type, const char *name, size_t namelen,
if (_result) {
ret = -ENOMEM;
- *_result = kmalloc(len + 1, GFP_KERNEL);
+ *_result = kmemdup_nul(upayload->data, len, GFP_KERNEL);
if (!*_result)
goto put;
-
- memcpy(*_result, upayload->data, len);
- (*_result)[len] = '\0';
}
if (_expiry)
@@ -162,6 +160,8 @@ int dns_query(const char *type, const char *name, size_t namelen,
ret = len;
put:
up_read(&rkey->sem);
+ if (invalidate)
+ key_invalidate(rkey);
key_put(rkey);
out:
kleave(" = %d", ret);
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 4183e4ba27a5..f86b30742122 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -1,56 +1,207 @@
-config HAVE_NET_DSA
- def_bool y
- depends on INET && NETDEVICES && !S390
+# SPDX-License-Identifier: GPL-2.0-only
-# Drivers must select NET_DSA and the appropriate tagging format
-
-config NET_DSA
+menuconfig NET_DSA
tristate "Distributed Switch Architecture"
- depends on HAVE_NET_DSA && MAY_USE_DEVLINK
depends on BRIDGE || BRIDGE=n
+ depends on HSR || HSR=n
+ depends on INET && NETDEVICES
+ select GRO_CELLS
select NET_SWITCHDEV
select PHYLINK
- ---help---
+ select NET_DEVLINK
+ imply NET_SELFTESTS
+ help
Say Y if you want to enable support for the hardware switches supported
by the Distributed Switch Architecture.
if NET_DSA
-config NET_DSA_LEGACY
- bool "Support for older platform device and Device Tree registration"
- default y
- ---help---
- Say Y if you want to enable support for the older platform device and
- deprecated Device Tree binding registration.
+# Drivers must select the appropriate tagging format(s)
+
+config NET_DSA_TAG_NONE
+ tristate "No-op tag driver"
+ help
+ Say Y or M if you want to enable support for switches which don't tag
+ frames over the CPU port.
+
+config NET_DSA_TAG_AR9331
+ tristate "Tag driver for Atheros AR9331 SoC with built-in switch"
+ help
+ Say Y or M if you want to enable support for tagging frames for
+ the Atheros AR9331 SoC with built-in switch.
- This feature is scheduled for removal in 4.17.
+config NET_DSA_TAG_BRCM_COMMON
+ tristate
+ default n
-# tagging formats
config NET_DSA_TAG_BRCM
- bool
+ tristate "Tag driver for Broadcom switches using in-frame headers"
+ select NET_DSA_TAG_BRCM_COMMON
+ help
+ Say Y if you want to enable support for tagging frames for the
+ Broadcom switches which place the tag after the MAC source address.
+
+config NET_DSA_TAG_BRCM_LEGACY
+ tristate "Tag driver for BCM63xx legacy switches using in-frame headers"
+ select NET_DSA_TAG_BRCM_COMMON
+ help
+ Say Y if you want to enable support for tagging frames for the
+ BCM63xx legacy switches which place the tag after the MAC source
+ address.
+ This tag is used in BCM63xx legacy switches which work without the
+ original FCS and length before the tag insertion.
+
+config NET_DSA_TAG_BRCM_LEGACY_FCS
+ tristate "Tag driver for BCM53xx legacy switches using in-frame headers"
+ select NET_DSA_TAG_BRCM_COMMON
+ help
+ Say Y if you want to enable support for tagging frames for the
+ BCM53xx legacy switches which place the tag after the MAC source
+ address.
+ This tag is used in BCM53xx legacy switches which expect original
+ FCS and length before the tag insertion to be present.
config NET_DSA_TAG_BRCM_PREPEND
- bool
+ tristate "Tag driver for Broadcom switches using prepended headers"
+ select NET_DSA_TAG_BRCM_COMMON
+ help
+ Say Y if you want to enable support for tagging frames for the
+ Broadcom switches which place the tag before the Ethernet header
+ (prepended).
+
+config NET_DSA_TAG_HELLCREEK
+ tristate "Tag driver for Hirschmann Hellcreek TSN switches"
+ help
+ Say Y or M if you want to enable support for tagging frames
+ for the Hirschmann Hellcreek TSN switches.
+
+config NET_DSA_TAG_GSWIP
+ tristate "Tag driver for Lantiq / Intel GSWIP switches"
+ help
+ Say Y or M if you want to enable support for tagging frames for the
+ Lantiq / Intel GSWIP switches.
+
+config NET_DSA_TAG_DSA_COMMON
+ tristate
config NET_DSA_TAG_DSA
- bool
+ tristate "Tag driver for Marvell switches using DSA headers"
+ select NET_DSA_TAG_DSA_COMMON
+ help
+ Say Y or M if you want to enable support for tagging frames for the
+ Marvell switches which use DSA headers.
config NET_DSA_TAG_EDSA
- bool
+ tristate "Tag driver for Marvell switches using EtherType DSA headers"
+ select NET_DSA_TAG_DSA_COMMON
+ help
+ Say Y or M if you want to enable support for tagging frames for the
+ Marvell switches which use EtherType DSA headers.
+
+config NET_DSA_TAG_MTK
+ tristate "Tag driver for Mediatek switches"
+ help
+ Say Y or M if you want to enable support for tagging frames for
+ Mediatek switches.
+
+config NET_DSA_TAG_MXL_GSW1XX
+ tristate "Tag driver for MaxLinear GSW1xx switches"
+ help
+ The GSW1xx family of switches supports an 8-byte special tag which
+ can be used on the CPU port of the switch.
+ Say Y or M if you want to enable support for tagging frames for
+ MaxLinear GSW1xx switches.
config NET_DSA_TAG_KSZ
- bool
+ tristate "Tag driver for Microchip 8795/937x/9477/9893 families of switches"
+ help
+ Say Y if you want to enable support for tagging frames for the
+ Microchip 8795/937x/9477/9893 families of switches.
+
+config NET_DSA_TAG_OCELOT
+ tristate "Tag driver for Ocelot family of switches, using NPI port"
+ select PACKING
+ help
+ Say Y or M if you want to enable NPI tagging for the Ocelot switches
+ (VSC7511, VSC7512, VSC7513, VSC7514, VSC9953, VSC9959). In this mode,
+ the frames over the Ethernet CPU port are prepended with a
+ hardware-defined injection/extraction frame header. Flow control
+ (PAUSE frames) over the CPU port is not supported when operating in
+ this mode.
+
+config NET_DSA_TAG_OCELOT_8021Q
+ tristate "Tag driver for Ocelot family of switches, using VLAN"
+ help
+ Say Y or M if you want to enable support for tagging frames with a
+ custom VLAN-based header. Frames that require timestamping, such as
+ PTP, are not delivered over Ethernet but over register-based MMIO.
+ Flow control over the CPU port is functional in this mode. When using
+ this mode, less TCAM resources (VCAP IS1, IS2, ES0) are available for
+ use with tc-flower.
+
+config NET_DSA_TAG_QCA
+ tristate "Tag driver for Qualcomm Atheros QCA8K switches"
+ help
+ Say Y or M if you want to enable support for tagging frames for
+ the Qualcomm Atheros QCA8K switches.
+
+config NET_DSA_TAG_RTL4_A
+ tristate "Tag driver for Realtek 4 byte protocol A tags"
+ help
+ Say Y or M if you want to enable support for tagging frames for the
+ Realtek switches with 4 byte protocol A tags, such as found in
+ the Realtek RTL8366RB.
+
+config NET_DSA_TAG_RTL8_4
+ tristate "Tag driver for Realtek 8 byte protocol 4 tags"
+ help
+ Say Y or M if you want to enable support for tagging frames for Realtek
+ switches with 8 byte protocol 4 tags, such as the Realtek RTL8365MB-VC.
+
+config NET_DSA_TAG_RZN1_A5PSW
+ tristate "Tag driver for Renesas RZ/N1 A5PSW switch"
+ help
+ Say Y or M if you want to enable support for tagging frames for
+ Renesas RZ/N1 embedded switch that uses an 8 byte tag located after
+ destination MAC address.
config NET_DSA_TAG_LAN9303
- bool
+ tristate "Tag driver for SMSC/Microchip LAN9303 family of switches"
+ help
+ Say Y or M if you want to enable support for tagging frames for the
+ SMSC/Microchip LAN9303 family of switches.
-config NET_DSA_TAG_MTK
- bool
+config NET_DSA_TAG_SJA1105
+ tristate "Tag driver for NXP SJA1105 switches"
+ select PACKING
+ help
+ Say Y or M if you want to enable support for tagging frames with the
+ NXP SJA1105 switch family. Both the native tagging protocol (which
+ is only for link-local traffic) as well as non-native tagging (based
+ on a custom 802.1Q VLAN header) are available.
config NET_DSA_TAG_TRAILER
- bool
+ tristate "Tag driver for switches using a trailer tag"
+ help
+ Say Y or M if you want to enable support for tagging frames
+ with a trailer, e.g. Marvell 88E6060.
-config NET_DSA_TAG_QCA
- bool
+config NET_DSA_TAG_VSC73XX_8021Q
+ tristate "Tag driver for Microchip/Vitesse VSC73xx family of switches, using VLAN"
+ help
+ Say Y or M if you want to enable support for tagging frames with a
+ custom VLAN-based header.
+
+config NET_DSA_TAG_XRS700X
+ tristate "Tag driver for XRS700x switches"
+ help
+ Say Y or M if you want to enable support for tagging frames for
+ Arrow SpeedChips XRS700x switches that use a single byte tag trailer.
+
+config NET_DSA_TAG_YT921X
+ tristate "Tag driver for Motorcomm YT921x switches"
+ help
+ Say Y or M if you want to enable support for tagging frames for
+ Motorcomm YT921x switches.
endif
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 9e4d3536f977..42d173f5a701 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -1,16 +1,46 @@
# SPDX-License-Identifier: GPL-2.0
+
+# the stubs are built-in whenever DSA is built-in or module
+ifdef CONFIG_NET_DSA
+obj-y := stubs.o
+endif
+
# the core
obj-$(CONFIG_NET_DSA) += dsa_core.o
-dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o
-dsa_core-$(CONFIG_NET_DSA_LEGACY) += legacy.o
+dsa_core-y += \
+ conduit.o \
+ devlink.o \
+ dsa.o \
+ netlink.o \
+ port.o \
+ switch.o \
+ tag.o \
+ tag_8021q.o \
+ trace.o \
+ user.o
# tagging formats
-dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o
-dsa_core-$(CONFIG_NET_DSA_TAG_BRCM_PREPEND) += tag_brcm.o
-dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o
-dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o
-dsa_core-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o
-dsa_core-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o
-dsa_core-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o
-dsa_core-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o
-dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
+obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o
+obj-$(CONFIG_NET_DSA_TAG_BRCM_COMMON) += tag_brcm.o
+obj-$(CONFIG_NET_DSA_TAG_DSA_COMMON) += tag_dsa.o
+obj-$(CONFIG_NET_DSA_TAG_GSWIP) += tag_gswip.o
+obj-$(CONFIG_NET_DSA_TAG_HELLCREEK) += tag_hellcreek.o
+obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o
+obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o
+obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o
+obj-$(CONFIG_NET_DSA_TAG_MXL_GSW1XX) += tag_mxl-gsw1xx.o
+obj-$(CONFIG_NET_DSA_TAG_NONE) += tag_none.o
+obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o
+obj-$(CONFIG_NET_DSA_TAG_OCELOT_8021Q) += tag_ocelot_8021q.o
+obj-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o
+obj-$(CONFIG_NET_DSA_TAG_RTL4_A) += tag_rtl4_a.o
+obj-$(CONFIG_NET_DSA_TAG_RTL8_4) += tag_rtl8_4.o
+obj-$(CONFIG_NET_DSA_TAG_RZN1_A5PSW) += tag_rzn1_a5psw.o
+obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o
+obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o
+obj-$(CONFIG_NET_DSA_TAG_VSC73XX_8021Q) += tag_vsc73xx_8021q.o
+obj-$(CONFIG_NET_DSA_TAG_XRS700X) += tag_xrs700x.o
+obj-$(CONFIG_NET_DSA_TAG_YT921X) += tag_yt921x.o
+
+# for tracing framework to find trace.h
+CFLAGS_trace.o := -I$(src)
diff --git a/net/dsa/conduit.c b/net/dsa/conduit.c
new file mode 100644
index 000000000000..a1b044467bd6
--- /dev/null
+++ b/net/dsa/conduit.c
@@ -0,0 +1,549 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Handling of a conduit device, switching frames via its switch fabric CPU port
+ *
+ * Copyright (c) 2017 Savoir-faire Linux Inc.
+ * Vivien Didelot <vivien.didelot@savoirfairelinux.com>
+ */
+
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+#include <linux/netlink.h>
+#include <net/dsa.h>
+#include <net/netdev_lock.h>
+
+#include "conduit.h"
+#include "dsa.h"
+#include "port.h"
+#include "tag.h"
+
+/* ethtool ->get_regs_len() for a DSA conduit.
+ *
+ * The reported size is the conduit's own register dump length (when its
+ * original ethtool ops provide ->get_regs_len()), plus room for one
+ * struct ethtool_drvinfo and one struct ethtool_regs, plus the CPU port's
+ * register dump length (when the switch driver provides ->get_regs_len()).
+ * A negative length from either source is returned unchanged.
+ */
+static int dsa_conduit_get_regs_len(struct net_device *dev)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *orig_ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch *ds = cpu_dp->ds;
+	int total = 0;
+	int len;
+
+	if (orig_ops && orig_ops->get_regs_len) {
+		/* The conduit's own callback is invoked under its ops lock. */
+		netdev_lock_ops(dev);
+		len = orig_ops->get_regs_len(dev);
+		netdev_unlock_ops(dev);
+		if (len < 0)
+			return len;
+		total = len;
+	}
+
+	total += sizeof(struct ethtool_drvinfo);
+	total += sizeof(struct ethtool_regs);
+
+	if (!ds->ops->get_regs_len)
+		return total;
+
+	len = ds->ops->get_regs_len(ds, cpu_dp->index);
+	if (len < 0)
+		return len;
+
+	return total + len;
+}
+
+/* ethtool ->get_regs() for a DSA conduit.
+ *
+ * Writes into @data, in this order: the conduit's own register dump (when
+ * its original ethtool ops provide both ->get_regs_len() and ->get_regs()),
+ * a struct ethtool_drvinfo whose driver field is "dsa", a struct
+ * ethtool_regs describing the CPU port, and finally the CPU port's register
+ * dump from the switch driver. Returns early, without writing the remaining
+ * sections, if either length callback reports a negative value.
+ */
+static void dsa_conduit_get_regs(struct net_device *dev,
+ struct ethtool_regs *regs, void *data)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+ struct dsa_switch *ds = cpu_dp->ds;
+ struct ethtool_drvinfo *cpu_info;
+ struct ethtool_regs *cpu_regs;
+ int port = cpu_dp->index;
+ int len;
+
+ if (ops && ops->get_regs_len && ops->get_regs) {
+ netdev_lock_ops(dev);
+ len = ops->get_regs_len(dev);
+ if (len < 0) {
+ netdev_unlock_ops(dev);
+ return;
+ }
+ /* Hand the conduit driver a header that only covers its own
+ * part of the buffer.
+ */
+ regs->len = len;
+ ops->get_regs(dev, regs, data);
+ netdev_unlock_ops(dev);
+ /* Advance past the conduit's section. */
+ data += regs->len;
+ }
+
+ /* Marker section: drvinfo naming "dsa", then the CPU port header. */
+ cpu_info = (struct ethtool_drvinfo *)data;
+ strscpy(cpu_info->driver, "dsa", sizeof(cpu_info->driver));
+ data += sizeof(*cpu_info);
+ cpu_regs = (struct ethtool_regs *)data;
+ data += sizeof(*cpu_regs);
+
+ if (ds->ops->get_regs_len && ds->ops->get_regs) {
+ len = ds->ops->get_regs_len(ds, port);
+ if (len < 0)
+ return;
+ cpu_regs->len = len;
+ ds->ops->get_regs(ds, port, cpu_regs, data);
+ }
+}
+
+/* Write the ETH_SS_STATS values of one switch port into @data, starting at
+ * index @start.
+ *
+ * Returns the number of statistics the port reports (0 when the switch
+ * driver has no ->get_sset_count()), or the negative value returned by
+ * ->get_sset_count(), in which case nothing is written.
+ */
+static ssize_t dsa_conduit_append_port_stats(struct dsa_switch *ds, int port,
+					     u64 *data, size_t start)
+{
+	int nstats = 0;
+
+	if (ds->ops->get_sset_count) {
+		nstats = ds->ops->get_sset_count(ds, port, ETH_SS_STATS);
+		if (nstats >= 0 && ds->ops->get_ethtool_stats)
+			ds->ops->get_ethtool_stats(ds, port, &data[start]);
+	}
+
+	return nstats;
+}
+
+/* ethtool ->get_ethtool_stats() for a DSA conduit.
+ *
+ * Fills @data with the conduit's own statistics (when its original ethtool
+ * ops provide both ->get_sset_count() and ->get_ethtool_stats()), followed
+ * by the statistics of every DSA and CPU port in the switch tree. Appending
+ * stops at the first port whose count is negative.
+ */
+static void dsa_conduit_get_ethtool_stats(struct net_device *dev,
+					  struct ethtool_stats *stats,
+					  u64 *data)
+{
+	struct dsa_port *dp, *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch_tree *dst = cpu_dp->dst;
+	int count, mcount = 0;
+
+	if (ops && ops->get_sset_count && ops->get_ethtool_stats) {
+		netdev_lock_ops(dev);
+		mcount = ops->get_sset_count(dev, ETH_SS_STATS);
+		ops->get_ethtool_stats(dev, stats, data);
+		netdev_unlock_ops(dev);
+	}
+
+	/* mcount is used as the offset of the first switch port below; a
+	 * negative value from the conduit driver must not move that offset
+	 * in front of @data. dsa_conduit_get_sset_count() treats a negative
+	 * conduit count as 0 as well, so the layouts stay in agreement.
+	 */
+	if (mcount < 0)
+		mcount = 0;
+
+	list_for_each_entry(dp, &dst->ports, list) {
+		if (!dsa_port_is_dsa(dp) && !dsa_port_is_cpu(dp))
+			continue;
+
+		count = dsa_conduit_append_port_stats(dp->ds, dp->index,
+						      data, mcount);
+		if (count < 0)
+			return;
+
+		mcount += count;
+	}
+}
+
+/* ethtool ->get_ethtool_phy_stats() for a DSA conduit.
+ *
+ * Fills @data with the conduit's PHY statistics, followed by the CPU port's
+ * PHY statistics from the switch driver (when it provides
+ * ->get_ethtool_phy_stats()).
+ */
+static void dsa_conduit_get_ethtool_phy_stats(struct net_device *dev,
+ struct ethtool_stats *stats,
+ u64 *data)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+ struct dsa_switch *ds = cpu_dp->ds;
+ int port = cpu_dp->index;
+ int count = 0;
+
+ /* Use the phylib helpers when the conduit has a phydev and its
+ * original ops do not provide PHY statistics themselves; otherwise
+ * use the original ops, under the conduit's ops lock.
+ */
+ if (dev->phydev && (!ops || !ops->get_ethtool_phy_stats)) {
+ count = phy_ethtool_get_sset_count(dev->phydev);
+ if (count >= 0)
+ phy_ethtool_get_stats(dev->phydev, stats, data);
+ } else if (ops && ops->get_sset_count && ops->get_ethtool_phy_stats) {
+ netdev_lock_ops(dev);
+ count = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
+ ops->get_ethtool_phy_stats(dev, stats, data);
+ netdev_unlock_ops(dev);
+ }
+
+ /* A negative count means the conduit contributed no entries, so the
+ * switch's values start at the beginning of @data.
+ */
+ if (count < 0)
+ count = 0;
+
+ if (ds->ops->get_ethtool_phy_stats)
+ ds->ops->get_ethtool_phy_stats(ds, port, data + count);
+}
+
+/* Add the number of @sset entries reported by one switch port to *@count.
+ *
+ * Does nothing when the switch driver has no ->get_sset_count(). A negative
+ * return value from the driver is ignored rather than added: folding it into
+ * the total would shrink the reported count below the number of entries
+ * that dsa_conduit_get_ethtool_stats() and dsa_conduit_get_strings() have
+ * already written for the preceding ports.
+ */
+static void dsa_conduit_append_port_sset_count(struct dsa_switch *ds, int port,
+					       int sset, int *count)
+{
+	int n;
+
+	if (!ds->ops->get_sset_count)
+		return;
+
+	n = ds->ops->get_sset_count(ds, port, sset);
+	if (n > 0)
+		*count += n;
+}
+
+/* ethtool ->get_sset_count() for a DSA conduit.
+ *
+ * Returns the conduit's own count for @sset (treated as 0 when negative)
+ * plus the contribution of every DSA and CPU port in the switch tree, as
+ * accumulated by dsa_conduit_append_port_sset_count().
+ */
+static int dsa_conduit_get_sset_count(struct net_device *dev, int sset)
+{
+ struct dsa_port *dp, *cpu_dp = dev->dsa_ptr;
+ const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+ struct dsa_switch_tree *dst = cpu_dp->dst;
+ int count = 0;
+
+ netdev_lock_ops(dev);
+ /* For PHY statistics, use the phylib count when the conduit has a
+ * phydev and its original ops do not provide PHY statistics.
+ */
+ if (sset == ETH_SS_PHY_STATS && dev->phydev &&
+ (!ops || !ops->get_ethtool_phy_stats))
+ count = phy_ethtool_get_sset_count(dev->phydev);
+ else if (ops && ops->get_sset_count)
+ count = ops->get_sset_count(dev, sset);
+ netdev_unlock_ops(dev);
+
+ /* A negative conduit count contributes nothing to the total. */
+ if (count < 0)
+ count = 0;
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (!dsa_port_is_dsa(dp) && !dsa_port_is_cpu(dp))
+ continue;
+
+ dsa_conduit_append_port_sset_count(dp->ds, dp->index, sset,
+ &count);
+ }
+
+ return count;
+}
+
+/* Write the @stringset names of one switch port into @data, starting at
+ * string slot @start, and prefix each name with "sNN_pNN_" (switch index
+ * and port number).
+ *
+ * Returns the number of strings written, 0 when the switch driver lacks
+ * ->get_strings() or ->get_sset_count(), or the negative value returned by
+ * ->get_sset_count().
+ */
+static ssize_t dsa_conduit_append_port_strings(struct dsa_switch *ds, int port,
+					       u32 stringset, u8 *data,
+					       size_t start)
+{
+	int len = ETH_GSTRING_LEN;
+	u8 pfx[8], *ndata;
+	int count, i;
+
+	/* Both callbacks are used below; without a count there is nothing
+	 * that can be prefixed, so treat the port as contributing no strings.
+	 */
+	if (!ds->ops->get_strings || !ds->ops->get_sset_count)
+		return 0;
+
+	snprintf(pfx, sizeof(pfx), "s%.2d_p%.2d", ds->index, port);
+	/* We do not want to be NUL-terminated, since this is a prefix */
+	pfx[sizeof(pfx) - 1] = '_';
+	ndata = data + start * len;
+	/* Let the driver fill its ETH_GSTRING_LEN-sized slots first; the
+	 * output is mangled afterwards to prepend the prefix constructed
+	 * above.
+	 */
+	ds->ops->get_strings(ds, port, stringset, ndata);
+	count = ds->ops->get_sset_count(ds, port, stringset);
+	if (count < 0)
+		return count;
+
+	for (i = 0; i < count; i++) {
+		/* Shift the name right by the prefix length inside its slot
+		 * (the last sizeof(pfx) bytes of the slot are dropped), then
+		 * copy the prefix into the space that was freed.
+		 */
+		memmove(ndata + (i * len + sizeof(pfx)),
+			ndata + i * len, len - sizeof(pfx));
+		memcpy(ndata + i * len, pfx, sizeof(pfx));
+	}
+
+	return count;
+}
+
+/* ethtool ->get_strings() for a DSA conduit.
+ *
+ * Fills @data with the conduit's own strings for @stringset, followed by the
+ * prefixed strings of every DSA and CPU port in the switch tree. Appending
+ * stops at the first port that reports a negative string count.
+ */
+static void dsa_conduit_get_strings(struct net_device *dev, u32 stringset,
+				    u8 *data)
+{
+	struct dsa_port *dp, *cpu_dp = dev->dsa_ptr;
+	const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
+	struct dsa_switch_tree *dst = cpu_dp->dst;
+	int count, mcount = 0;
+
+	netdev_lock_ops(dev);
+	/* The conduit may have had no ethtool ops of its own, in which case
+	 * orig_ethtool_ops is NULL; check it before dereferencing, exactly as
+	 * dsa_conduit_get_sset_count() does for the matching counts.
+	 */
+	if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
+	    (!ops || !ops->get_ethtool_phy_stats)) {
+		mcount = phy_ethtool_get_sset_count(dev->phydev);
+		if (mcount < 0)
+			mcount = 0;
+		else
+			phy_ethtool_get_strings(dev->phydev, data);
+	} else if (ops && ops->get_sset_count && ops->get_strings) {
+		mcount = ops->get_sset_count(dev, stringset);
+		if (mcount < 0)
+			mcount = 0;
+		ops->get_strings(dev, stringset, data);
+	}
+	netdev_unlock_ops(dev);
+
+	/* mcount is the running slot index at which the next port's strings
+	 * are placed.
+	 */
+	list_for_each_entry(dp, &dst->ports, list) {
+		if (!dsa_port_is_dsa(dp) && !dsa_port_is_cpu(dp))
+			continue;
+
+		count = dsa_conduit_append_port_strings(dp->ds, dp->index,
+							stringset, data,
+							mcount);
+		if (count < 0)
+			return;
+
+		mcount += count;
+	}
+}
+
+/* Refuse hardware timestamping on the conduit as soon as any port in the
+ * switch tree supports hardware timestamping itself.
+ *
+ * Returns 0 when no port supports it, -EBUSY (with an extack message)
+ * otherwise. @config is not examined.
+ */
+int __dsa_conduit_hwtstamp_validate(struct net_device *dev,
+				    const struct kernel_hwtstamp_config *config,
+				    struct netlink_ext_ack *extack)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	struct dsa_switch_tree *tree = cpu_dp->ds->dst;
+	struct dsa_port *port;
+
+	list_for_each_entry(port, &tree->ports, list) {
+		if (!dsa_port_supports_hwtstamp(port))
+			continue;
+
+		NL_SET_ERR_MSG(extack,
+			       "HW timestamping not allowed on DSA conduit when switch supports the operation");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+/* Install DSA-aware ethtool ops on the conduit.
+ *
+ * Allocates a new ethtool_ops (device-managed, owned by the switch device),
+ * seeds it with a copy of the conduit's current ops when there are any, and
+ * overrides the register-dump, string-set and statistics callbacks with the
+ * dsa_conduit_*() wrappers. The previous ops pointer is saved in
+ * cpu_dp->orig_ethtool_ops for the wrappers and for teardown.
+ *
+ * Nothing is done for a LAG master. Returns 0 on success or -ENOMEM.
+ */
+static int dsa_conduit_ethtool_setup(struct net_device *dev)
+{
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ struct dsa_switch *ds = cpu_dp->ds;
+ struct ethtool_ops *ops;
+
+ if (netif_is_lag_master(dev))
+ return 0;
+
+ ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
+ /* orig_ethtool_ops may legitimately be NULL; the copy is skipped and
+ * the new ops then only contain the overrides below.
+ */
+ cpu_dp->orig_ethtool_ops = dev->ethtool_ops;
+ if (cpu_dp->orig_ethtool_ops)
+ memcpy(ops, cpu_dp->orig_ethtool_ops, sizeof(*ops));
+
+ ops->get_regs_len = dsa_conduit_get_regs_len;
+ ops->get_regs = dsa_conduit_get_regs;
+ ops->get_sset_count = dsa_conduit_get_sset_count;
+ ops->get_ethtool_stats = dsa_conduit_get_ethtool_stats;
+ ops->get_strings = dsa_conduit_get_strings;
+ ops->get_ethtool_phy_stats = dsa_conduit_get_ethtool_phy_stats;
+
+ dev->ethtool_ops = ops;
+
+ return 0;
+}
+
+/* Undo dsa_conduit_ethtool_setup(): put the saved ethtool ops back on the
+ * conduit and forget them. Nothing is done for a LAG master, mirroring the
+ * setup path.
+ */
+static void dsa_conduit_ethtool_teardown(struct net_device *dev)
+{
+	struct dsa_port *cpu_dp;
+
+	if (!netif_is_lag_master(dev)) {
+		cpu_dp = dev->dsa_ptr;
+		dev->ethtool_ops = cpu_dp->orig_ethtool_ops;
+		cpu_dp->orig_ethtool_ops = NULL;
+	}
+}
+
+/* Adjust the conduit's promiscuity count by @inc, but only when the conduit
+ * has to stay promiscuous: either the tagging protocol asks for it
+ * (->promisc_on_conduit, e.g. because it garbles the MAC DA), or the conduit
+ * lacks IFF_UNICAST_FLT, in which case it would fall back to promiscuous
+ * mode on the first dev_uc_add() anyway. Must run under RTNL.
+ */
+static void dsa_conduit_set_promiscuity(struct net_device *dev, int inc)
+{
+	const struct dsa_device_ops *tagger = dev->dsa_ptr->tag_ops;
+
+	if (!(dev->priv_flags & IFF_UNICAST_FLT) || tagger->promisc_on_conduit) {
+		ASSERT_RTNL();
+		dev_set_promiscuity(dev, inc);
+	}
+}
+
+/* sysfs read handler for the "tagging" attribute: emit the name of the
+ * tagging protocol currently used by the conduit's CPU port, followed by a
+ * newline.
+ */
+static ssize_t tagging_show(struct device *d, struct device_attribute *attr,
+			    char *buf)
+{
+	struct dsa_port *cpu_dp = to_net_dev(d)->dsa_ptr;
+	const char *proto = dsa_tag_protocol_to_str(cpu_dp->tag_ops);
+
+	return sysfs_emit(buf, "%s\n", proto);
+}
+
+/* sysfs write handler for the "tagging" attribute: switch the tree of the
+ * conduit's CPU port to the tagging protocol named in @buf.
+ *
+ * The name is taken from @buf up to the first newline (or the end of the
+ * string). Returns @count on success, -ENOPROTOOPT for an empty name,
+ * -ENOMEM if the name cannot be duplicated, the error encoded by
+ * dsa_tag_driver_get_by_name() for an unknown tagger, or the error returned
+ * by dsa_tree_change_tag_proto().
+ */
+static ssize_t tagging_store(struct device *d, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ const struct dsa_device_ops *new_tag_ops, *old_tag_ops;
+ const char *end = strchrnul(buf, '\n'), *name;
+ struct net_device *dev = to_net_dev(d);
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ size_t len = end - buf;
+ int err;
+
+ /* Empty string passed */
+ if (!len)
+ return -ENOPROTOOPT;
+
+ /* NUL-terminated copy of the name without the trailing newline. */
+ name = kstrndup(buf, len, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ old_tag_ops = cpu_dp->tag_ops;
+ new_tag_ops = dsa_tag_driver_get_by_name(name);
+ kfree(name);
+ /* Bad tagger name? */
+ if (IS_ERR(new_tag_ops))
+ return PTR_ERR(new_tag_ops);
+
+ if (new_tag_ops == old_tag_ops)
+ /* Drop the temporarily held duplicate reference, since
+ * the DSA switch tree uses this tagger.
+ */
+ goto out;
+
+ err = dsa_tree_change_tag_proto(cpu_dp->ds->dst, new_tag_ops,
+ old_tag_ops);
+ if (err) {
+ /* On failure the old tagger is restored, so we don't need the
+ * driver for the new one.
+ */
+ dsa_tag_driver_put(new_tag_ops);
+ return err;
+ }
+
+ /* On success we no longer need the module for the old tagging protocol
+ */
+out:
+ /* Reached both after a successful change (releases the old tagger)
+ * and when the requested tagger is already in use (old_tag_ops and
+ * new_tag_ops are the same pointer, so this releases the extra
+ * reference taken by the lookup above).
+ */
+ dsa_tag_driver_put(old_tag_ops);
+ return count;
+}
+static DEVICE_ATTR_RW(tagging);
+
+/* Attributes placed in the "dsa" sysfs group of a conduit: currently only
+ * the read/write "tagging" attribute.
+ */
+static struct attribute *dsa_user_attrs[] = {
+ &dev_attr_tagging.attr,
+ NULL
+};
+
+/* sysfs group named "dsa"; created on the conduit's device kobject by
+ * dsa_conduit_setup() and removed by dsa_conduit_teardown().
+ */
+static const struct attribute_group dsa_group = {
+ .name = "dsa",
+ .attrs = dsa_user_attrs,
+};
+
+/* Put the conduit's MTU back to ETH_DATA_LEN. A failure is only reported
+ * through a debug message.
+ */
+static void dsa_conduit_reset_mtu(struct net_device *dev)
+{
+	if (dev_set_mtu(dev, ETH_DATA_LEN))
+		netdev_dbg(dev,
+			   "Unable to reset MTU to exclude DSA overheads\n");
+}
+
+/* Turn @dev into the DSA conduit for @cpu_dp.
+ *
+ * Creates a device link between the switch device and the conduit's parent
+ * device (skipped for a LAG master), raises the conduit MTU by the tagging
+ * protocol overhead, publishes @cpu_dp in dev->dsa_ptr, adjusts promiscuity,
+ * installs the DSA ethtool wrappers and creates the "dsa" sysfs group.
+ *
+ * Failing to create the device link or to set the MTU is only logged.
+ * Returns 0 on success, or the error from the ethtool or sysfs setup after
+ * undoing the promiscuity (and ethtool) changes.
+ *
+ * NOTE(review): the error paths leave dev->dsa_ptr set and do not restore
+ * the MTU - confirm that callers deal with this.
+ */
+int dsa_conduit_setup(struct net_device *dev, struct dsa_port *cpu_dp)
+{
+ const struct dsa_device_ops *tag_ops = cpu_dp->tag_ops;
+ struct dsa_switch *ds = cpu_dp->ds;
+ struct device_link *consumer_link;
+ int mtu, ret;
+
+ mtu = ETH_DATA_LEN + dsa_tag_protocol_overhead(tag_ops);
+
+ /* The DSA conduit must use SET_NETDEV_DEV for this to work. */
+ if (!netif_is_lag_master(dev)) {
+ consumer_link = device_link_add(ds->dev, dev->dev.parent,
+ DL_FLAG_AUTOREMOVE_CONSUMER);
+ if (!consumer_link)
+ netdev_err(dev,
+ "Failed to create a device link to DSA switch %s\n",
+ dev_name(ds->dev));
+ }
+
+ /* The switch driver may not implement ->port_change_mtu(), case in
+ * which dsa_user_change_mtu() will not update the conduit MTU either,
+ * so we need to do that here.
+ */
+ ret = dev_set_mtu(dev, mtu);
+ if (ret)
+ netdev_warn(dev, "error %d setting MTU to %d to include DSA overhead\n",
+ ret, mtu);
+
+ /* If we use a tagging format that doesn't have an ethertype
+ * field, make sure that all packets from this point on get
+ * sent to the tag format's receive function.
+ */
+ wmb();
+
+ dev->dsa_ptr = cpu_dp;
+
+ dsa_conduit_set_promiscuity(dev, 1);
+
+ ret = dsa_conduit_ethtool_setup(dev);
+ if (ret)
+ goto out_err_reset_promisc;
+
+ ret = sysfs_create_group(&dev->dev.kobj, &dsa_group);
+ if (ret)
+ goto out_err_ethtool_teardown;
+
+ return ret;
+
+ /* Unwind in the reverse order of the setup steps above. */
+out_err_ethtool_teardown:
+ dsa_conduit_ethtool_teardown(dev);
+out_err_reset_promisc:
+ dsa_conduit_set_promiscuity(dev, -1);
+ return ret;
+}
+
+/* Undo dsa_conduit_setup(): remove the "dsa" sysfs group, restore the
+ * original ethtool ops, reset the MTU to ETH_DATA_LEN, drop the promiscuity
+ * adjustment and finally clear dev->dsa_ptr.
+ *
+ * The helpers called here read dev->dsa_ptr, so it is cleared only after
+ * they have run.
+ */
+void dsa_conduit_teardown(struct net_device *dev)
+{
+ sysfs_remove_group(&dev->dev.kobj, &dsa_group);
+ dsa_conduit_ethtool_teardown(dev);
+ dsa_conduit_reset_mtu(dev);
+ dsa_conduit_set_promiscuity(dev, -1);
+
+ dev->dsa_ptr = NULL;
+
+ /* If we used a tagging format that doesn't have an ethertype
+ * field, make sure that all packets from this point get sent
+ * without the tag and go through the regular receive path.
+ */
+ wmb();
+}
+
+/* Make @lag_dev a DSA conduit for @cpu_dp and have the CPU port join it.
+ *
+ * The conduit setup is only performed when @lag_dev is not already used by
+ * DSA. If joining the LAG fails, a conduit setup done by this call is torn
+ * down again. Returns 0 on success or the error from dsa_conduit_setup() or
+ * dsa_port_lag_join().
+ */
+int dsa_conduit_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
+			  struct netdev_lag_upper_info *uinfo,
+			  struct netlink_ext_ack *extack)
+{
+	/* Remember whether this call is the one that sets the conduit up. */
+	bool did_setup = !netdev_uses_dsa(lag_dev);
+	int err;
+
+	if (did_setup) {
+		err = dsa_conduit_setup(lag_dev, cpu_dp);
+		if (err)
+			return err;
+	}
+
+	err = dsa_port_lag_join(cpu_dp, lag_dev, uinfo, extack);
+	if (!err)
+		return 0;
+
+	NL_SET_ERR_MSG_WEAK_MOD(extack, "CPU port failed to join LAG");
+	if (did_setup)
+		dsa_conduit_teardown(lag_dev);
+
+	return err;
+}
+
+/* Make the CPU port leave @lag_dev, then tear down the conduit if there
+ * isn't any other user port on it.
+ *
+ * dsa_port_lag_leave() is always called. The conduit teardown is skipped as
+ * soon as one upper device of @lag_dev satisfies dsa_user_dev_check().
+ */
+void dsa_conduit_lag_teardown(struct net_device *lag_dev,
+ struct dsa_port *cpu_dp)
+{
+ struct net_device *upper;
+ struct list_head *iter;
+
+ dsa_port_lag_leave(cpu_dp, lag_dev);
+
+ /* Still in use by at least one upper device: keep the conduit. */
+ netdev_for_each_upper_dev_rcu(lag_dev, upper, iter)
+ if (dsa_user_dev_check(upper))
+ return;
+
+ dsa_conduit_teardown(lag_dev);
+}
diff --git a/net/dsa/conduit.h b/net/dsa/conduit.h
new file mode 100644
index 000000000000..31f8834f54bb
--- /dev/null
+++ b/net/dsa/conduit.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_CONDUIT_H
+#define __DSA_CONDUIT_H
+
+struct dsa_port;
+struct net_device;
+struct netdev_lag_upper_info;
+struct netlink_ext_ack;
+
+int dsa_conduit_setup(struct net_device *dev, struct dsa_port *cpu_dp);
+void dsa_conduit_teardown(struct net_device *dev);
+int dsa_conduit_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp,
+ struct netdev_lag_upper_info *uinfo,
+ struct netlink_ext_ack *extack);
+void dsa_conduit_lag_teardown(struct net_device *lag_dev,
+ struct dsa_port *cpu_dp);
+int __dsa_conduit_hwtstamp_validate(struct net_device *dev,
+ const struct kernel_hwtstamp_config *config,
+ struct netlink_ext_ack *extack);
+
+#endif
diff --git a/net/dsa/devlink.c b/net/dsa/devlink.c
new file mode 100644
index 000000000000..ed342f345692
--- /dev/null
+++ b/net/dsa/devlink.c
@@ -0,0 +1,402 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * DSA devlink handling
+ */
+
+#include <net/dsa.h>
+#include <net/devlink.h>
+
+#include "devlink.h"
+
+static int dsa_devlink_info_get(struct devlink *dl,
+				struct devlink_info_req *req,
+				struct netlink_ext_ack *extack)
+{
+	struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+	if (!ds->ops->devlink_info_get)
+		return -EOPNOTSUPP;
+
+	return ds->ops->devlink_info_get(ds, req, extack);
+}
+
+static int dsa_devlink_sb_pool_get(struct devlink *dl,
+ unsigned int sb_index, u16 pool_index,
+ struct devlink_sb_pool_info *pool_info)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_sb_pool_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_pool_get(ds, sb_index, pool_index,
+ pool_info);
+}
+
+static int dsa_devlink_sb_pool_set(struct devlink *dl, unsigned int sb_index,
+ u16 pool_index, u32 size,
+ enum devlink_sb_threshold_type threshold_type,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_sb_pool_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_pool_set(ds, sb_index, pool_index, size,
+ threshold_type, extack);
+}
+
+static int dsa_devlink_sb_port_pool_get(struct devlink_port *dlp,
+ unsigned int sb_index, u16 pool_index,
+ u32 *p_threshold)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_port_pool_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_port_pool_get(ds, port, sb_index,
+ pool_index, p_threshold);
+}
+
+static int dsa_devlink_sb_port_pool_set(struct devlink_port *dlp,
+ unsigned int sb_index, u16 pool_index,
+ u32 threshold,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_port_pool_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_port_pool_set(ds, port, sb_index,
+ pool_index, threshold, extack);
+}
+
+static int
+dsa_devlink_sb_tc_pool_bind_get(struct devlink_port *dlp,
+ unsigned int sb_index, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ u16 *p_pool_index, u32 *p_threshold)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_tc_pool_bind_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_tc_pool_bind_get(ds, port, sb_index,
+ tc_index, pool_type,
+ p_pool_index, p_threshold);
+}
+
+static int
+dsa_devlink_sb_tc_pool_bind_set(struct devlink_port *dlp,
+ unsigned int sb_index, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ u16 pool_index, u32 threshold,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_tc_pool_bind_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_tc_pool_bind_set(ds, port, sb_index,
+ tc_index, pool_type,
+ pool_index, threshold,
+ extack);
+}
+
+static int dsa_devlink_sb_occ_snapshot(struct devlink *dl,
+ unsigned int sb_index)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_sb_occ_snapshot)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_occ_snapshot(ds, sb_index);
+}
+
+static int dsa_devlink_sb_occ_max_clear(struct devlink *dl,
+ unsigned int sb_index)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_sb_occ_max_clear)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_occ_max_clear(ds, sb_index);
+}
+
+static int dsa_devlink_sb_occ_port_pool_get(struct devlink_port *dlp,
+ unsigned int sb_index,
+ u16 pool_index, u32 *p_cur,
+ u32 *p_max)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_occ_port_pool_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_occ_port_pool_get(ds, port, sb_index,
+ pool_index, p_cur, p_max);
+}
+
+static int
+dsa_devlink_sb_occ_tc_port_bind_get(struct devlink_port *dlp,
+ unsigned int sb_index, u16 tc_index,
+ enum devlink_sb_pool_type pool_type,
+ u32 *p_cur, u32 *p_max)
+{
+ struct dsa_switch *ds = dsa_devlink_port_to_ds(dlp);
+ int port = dsa_devlink_port_to_port(dlp);
+
+ if (!ds->ops->devlink_sb_occ_tc_port_bind_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_sb_occ_tc_port_bind_get(ds, port,
+ sb_index, tc_index,
+ pool_type, p_cur,
+ p_max);
+}
+
+static const struct devlink_ops dsa_devlink_ops = {
+ .info_get = dsa_devlink_info_get,
+ .sb_pool_get = dsa_devlink_sb_pool_get,
+ .sb_pool_set = dsa_devlink_sb_pool_set,
+ .sb_port_pool_get = dsa_devlink_sb_port_pool_get,
+ .sb_port_pool_set = dsa_devlink_sb_port_pool_set,
+ .sb_tc_pool_bind_get = dsa_devlink_sb_tc_pool_bind_get,
+ .sb_tc_pool_bind_set = dsa_devlink_sb_tc_pool_bind_set,
+ .sb_occ_snapshot = dsa_devlink_sb_occ_snapshot,
+ .sb_occ_max_clear = dsa_devlink_sb_occ_max_clear,
+ .sb_occ_port_pool_get = dsa_devlink_sb_occ_port_pool_get,
+ .sb_occ_tc_port_bind_get = dsa_devlink_sb_occ_tc_port_bind_get,
+};
+
+int dsa_devlink_param_get(struct devlink *dl, u32 id,
+ struct devlink_param_gset_ctx *ctx,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_param_get)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_param_get(ds, id, ctx);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_param_get);
+
+int dsa_devlink_param_set(struct devlink *dl, u32 id,
+ struct devlink_param_gset_ctx *ctx,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dsa_devlink_to_ds(dl);
+
+ if (!ds->ops->devlink_param_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->devlink_param_set(ds, id, ctx);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_param_set);
+
+int dsa_devlink_params_register(struct dsa_switch *ds,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ return devlink_params_register(ds->devlink, params, params_count);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_params_register);
+
+void dsa_devlink_params_unregister(struct dsa_switch *ds,
+ const struct devlink_param *params,
+ size_t params_count)
+{
+ devlink_params_unregister(ds->devlink, params, params_count);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_params_unregister);
+
+int dsa_devlink_resource_register(struct dsa_switch *ds,
+ const char *resource_name,
+ u64 resource_size,
+ u64 resource_id,
+ u64 parent_resource_id,
+ const struct devlink_resource_size_params *size_params)
+{
+ int ret;
+
+ devl_lock(ds->devlink);
+ ret = devl_resource_register(ds->devlink, resource_name, resource_size,
+ resource_id, parent_resource_id,
+ size_params);
+ devl_unlock(ds->devlink);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resource_register);
+
+void dsa_devlink_resources_unregister(struct dsa_switch *ds)
+{
+ devlink_resources_unregister(ds->devlink);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resources_unregister);
+
+void dsa_devlink_resource_occ_get_register(struct dsa_switch *ds,
+ u64 resource_id,
+ devlink_resource_occ_get_t *occ_get,
+ void *occ_get_priv)
+{
+ devl_lock(ds->devlink);
+ devl_resource_occ_get_register(ds->devlink, resource_id, occ_get,
+ occ_get_priv);
+ devl_unlock(ds->devlink);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_register);
+
+void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds,
+ u64 resource_id)
+{
+ devl_lock(ds->devlink);
+ devl_resource_occ_get_unregister(ds->devlink, resource_id);
+ devl_unlock(ds->devlink);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister);
+
+struct devlink_region *
+dsa_devlink_region_create(struct dsa_switch *ds,
+ const struct devlink_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
+{
+ return devlink_region_create(ds->devlink, ops, region_max_snapshots,
+ region_size);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_region_create);
+
+struct devlink_region *
+dsa_devlink_port_region_create(struct dsa_switch *ds,
+ int port,
+ const struct devlink_port_region_ops *ops,
+ u32 region_max_snapshots, u64 region_size)
+{
+ struct dsa_port *dp = dsa_to_port(ds, port);
+
+ return devlink_port_region_create(&dp->devlink_port, ops,
+ region_max_snapshots,
+ region_size);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_port_region_create);
+
+void dsa_devlink_region_destroy(struct devlink_region *region)
+{
+ devlink_region_destroy(region);
+}
+EXPORT_SYMBOL_GPL(dsa_devlink_region_destroy);
+
+int dsa_port_devlink_setup(struct dsa_port *dp)
+{
+ struct devlink_port *dlp = &dp->devlink_port;
+ struct dsa_switch_tree *dst = dp->ds->dst;
+ struct devlink_port_attrs attrs = {};
+ struct devlink *dl = dp->ds->devlink;
+ struct dsa_switch *ds = dp->ds;
+ const unsigned char *id;
+ unsigned char len;
+ int err;
+
+ memset(dlp, 0, sizeof(*dlp));
+ devlink_port_init(dl, dlp);
+
+ if (ds->ops->port_setup) {
+ err = ds->ops->port_setup(ds, dp->index);
+ if (err)
+ return err;
+ }
+
+ id = (const unsigned char *)&dst->index;
+ len = sizeof(dst->index);
+
+ attrs.phys.port_number = dp->index;
+ memcpy(attrs.switch_id.id, id, len);
+ attrs.switch_id.id_len = len;
+
+ switch (dp->type) {
+ case DSA_PORT_TYPE_UNUSED:
+ attrs.flavour = DEVLINK_PORT_FLAVOUR_UNUSED;
+ break;
+ case DSA_PORT_TYPE_CPU:
+ attrs.flavour = DEVLINK_PORT_FLAVOUR_CPU;
+ break;
+ case DSA_PORT_TYPE_DSA:
+ attrs.flavour = DEVLINK_PORT_FLAVOUR_DSA;
+ break;
+ case DSA_PORT_TYPE_USER:
+ attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL;
+ break;
+ }
+
+ devlink_port_attrs_set(dlp, &attrs);
+ err = devlink_port_register(dl, dlp, dp->index);
+ if (err) {
+ if (ds->ops->port_teardown)
+ ds->ops->port_teardown(ds, dp->index);
+ return err;
+ }
+
+ return 0;
+}
+
+void dsa_port_devlink_teardown(struct dsa_port *dp)
+{
+ struct devlink_port *dlp = &dp->devlink_port;
+ struct dsa_switch *ds = dp->ds;
+
+ devlink_port_unregister(dlp);
+
+ if (ds->ops->port_teardown)
+ ds->ops->port_teardown(ds, dp->index);
+
+ devlink_port_fini(dlp);
+}
+
+void dsa_switch_devlink_register(struct dsa_switch *ds)
+{
+ devlink_register(ds->devlink);
+}
+
+void dsa_switch_devlink_unregister(struct dsa_switch *ds)
+{
+ devlink_unregister(ds->devlink);
+}
+
+int dsa_switch_devlink_alloc(struct dsa_switch *ds)
+{
+ struct dsa_devlink_priv *dl_priv;
+ struct devlink *dl;
+
+ /* Add the switch to devlink before calling setup, so that setup can
+ * add dpipe tables
+ */
+ dl = devlink_alloc(&dsa_devlink_ops, sizeof(*dl_priv), ds->dev);
+ if (!dl)
+ return -ENOMEM;
+
+ ds->devlink = dl;
+
+ dl_priv = devlink_priv(ds->devlink);
+ dl_priv->ds = ds;
+
+ return 0;
+}
+
+void dsa_switch_devlink_free(struct dsa_switch *ds)
+{
+ devlink_free(ds->devlink);
+ ds->devlink = NULL;
+}
diff --git a/net/dsa/devlink.h b/net/dsa/devlink.h
new file mode 100644
index 000000000000..4d9f4f23705b
--- /dev/null
+++ b/net/dsa/devlink.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_DEVLINK_H
+#define __DSA_DEVLINK_H
+
+struct dsa_port;
+struct dsa_switch;
+
+int dsa_port_devlink_setup(struct dsa_port *dp);
+void dsa_port_devlink_teardown(struct dsa_port *dp);
+void dsa_switch_devlink_register(struct dsa_switch *ds);
+void dsa_switch_devlink_unregister(struct dsa_switch *ds);
+int dsa_switch_devlink_alloc(struct dsa_switch *ds);
+void dsa_switch_devlink_free(struct dsa_switch *ds);
+
+#endif
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 9f3209ff7ffd..a20efabe778f 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -1,90 +1,1381 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * net/dsa/dsa.c - Hardware switch handling
+ * DSA topology and switch handling
+ *
* Copyright (c) 2008-2009 Marvell Semiconductor
* Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * Copyright (c) 2016 Andrew Lunn <andrew@lunn.ch>
*/
#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/if_hsr.h>
#include <linux/list.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
#include <linux/module.h>
-#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/rtnetlink.h>
#include <linux/of.h>
-#include <linux/of_mdio.h>
-#include <linux/of_platform.h>
#include <linux/of_net.h>
-#include <linux/netdevice.h>
-#include <linux/sysfs.h>
-#include <linux/phy_fixed.h>
-#include <linux/ptp_classify.h>
-#include <linux/etherdevice.h>
+#include <net/dsa_stubs.h>
+#include <net/sch_generic.h>
+
+#include "conduit.h"
+#include "devlink.h"
+#include "dsa.h"
+#include "netlink.h"
+#include "port.h"
+#include "switch.h"
+#include "tag.h"
+#include "user.h"
+
+#define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG
+
+static DEFINE_MUTEX(dsa2_mutex);
+LIST_HEAD(dsa_tree_list);
+
+static struct workqueue_struct *dsa_owq;
-#include "dsa_priv.h"
+/* Track the bridges with forwarding offload enabled */
+static unsigned long dsa_fwd_offloading_bridges;
-static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb,
- struct net_device *dev)
+bool dsa_schedule_work(struct work_struct *work)
{
- /* Just return the original SKB */
- return skb;
+ return queue_work(dsa_owq, work);
}
-static const struct dsa_device_ops none_ops = {
- .xmit = dsa_slave_notag_xmit,
- .rcv = NULL,
-};
+void dsa_flush_workqueue(void)
+{
+ flush_workqueue(dsa_owq);
+}
+EXPORT_SYMBOL_GPL(dsa_flush_workqueue);
-const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = {
-#ifdef CONFIG_NET_DSA_TAG_BRCM
- [DSA_TAG_PROTO_BRCM] = &brcm_netdev_ops,
-#endif
-#ifdef CONFIG_NET_DSA_TAG_BRCM_PREPEND
- [DSA_TAG_PROTO_BRCM_PREPEND] = &brcm_prepend_netdev_ops,
-#endif
-#ifdef CONFIG_NET_DSA_TAG_DSA
- [DSA_TAG_PROTO_DSA] = &dsa_netdev_ops,
-#endif
-#ifdef CONFIG_NET_DSA_TAG_EDSA
- [DSA_TAG_PROTO_EDSA] = &edsa_netdev_ops,
-#endif
-#ifdef CONFIG_NET_DSA_TAG_KSZ
- [DSA_TAG_PROTO_KSZ] = &ksz_netdev_ops,
-#endif
-#ifdef CONFIG_NET_DSA_TAG_LAN9303
- [DSA_TAG_PROTO_LAN9303] = &lan9303_netdev_ops,
-#endif
-#ifdef CONFIG_NET_DSA_TAG_MTK
- [DSA_TAG_PROTO_MTK] = &mtk_netdev_ops,
-#endif
-#ifdef CONFIG_NET_DSA_TAG_QCA
- [DSA_TAG_PROTO_QCA] = &qca_netdev_ops,
-#endif
-#ifdef CONFIG_NET_DSA_TAG_TRAILER
- [DSA_TAG_PROTO_TRAILER] = &trailer_netdev_ops,
-#endif
- [DSA_TAG_PROTO_NONE] = &none_ops,
-};
+/**
+ * dsa_lag_map() - Map LAG structure to a linear LAG array
+ * @dst: Tree in which to record the mapping.
+ * @lag: LAG structure that is to be mapped to the tree's array.
+ *
+ * dsa_lag_id/dsa_lag_by_id can then be used to translate between the
+ * two spaces. The size of the mapping space is determined by the
+ * driver by setting ds->num_lag_ids. It is perfectly legal to leave
+ * it unset if it is not needed, in which case these functions become
+ * no-ops.
+ */
+void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag)
+{
+ unsigned int id;
+
+ for (id = 1; id <= dst->lags_len; id++) {
+ if (!dsa_lag_by_id(dst, id)) {
+ dst->lags[id - 1] = lag;
+ lag->id = id;
+ return;
+ }
+ }
+
+ /* No IDs left, which is OK. Some drivers do not need it. The
+ * ones that do, e.g. mv88e6xxx, will discover that dsa_lag_id
+ * returns an error for this device when joining the LAG. The
+ * driver can then return -EOPNOTSUPP back to DSA, which will
+ * fall back to a software LAG.
+ */
+}
+
+/**
+ * dsa_lag_unmap() - Remove a LAG ID mapping
+ * @dst: Tree in which the mapping is recorded.
+ * @lag: LAG structure that was mapped.
+ *
+ * As there may be multiple users of the mapping, it is only removed
+ * if there are no other references to it.
+ */
+void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag)
+{
+ unsigned int id;
+
+ dsa_lags_foreach_id(id, dst) {
+ if (dsa_lag_by_id(dst, id) == lag) {
+ dst->lags[id - 1] = NULL;
+ lag->id = 0;
+ break;
+ }
+ }
+}
+
+struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst,
+ const struct net_device *lag_dev)
+{
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_lag_dev_get(dp) == lag_dev)
+ return dp->lag;
+
+ return NULL;
+}
+
+struct dsa_bridge *dsa_tree_bridge_find(struct dsa_switch_tree *dst,
+ const struct net_device *br)
+{
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_bridge_dev_get(dp) == br)
+ return dp->bridge;
+
+ return NULL;
+}
+
+static int dsa_bridge_num_find(const struct net_device *bridge_dev)
+{
+ struct dsa_switch_tree *dst;
+
+ list_for_each_entry(dst, &dsa_tree_list, list) {
+ struct dsa_bridge *bridge;
+
+ bridge = dsa_tree_bridge_find(dst, bridge_dev);
+ if (bridge)
+ return bridge->num;
+ }
+
+ return 0;
+}
+
+unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max)
+{
+	unsigned int bridge_num;
+
+	/* Switches without FDB isolation support don't get unique bridge
+	 * numbering, so skip the walk over all trees entirely.
+	 */
+	if (!max)
+		return 0;
+
+	/* Reuse the number already assigned to this bridge, if any */
+	bridge_num = dsa_bridge_num_find(bridge_dev);
+	if (bridge_num)
+		return bridge_num;
+
+	/* First port requesting FDB isolation or TX forwarding offload for
+	 * this bridge: claim the lowest free number, searching from bit 1.
+	 */
+	bridge_num = find_next_zero_bit(&dsa_fwd_offloading_bridges,
+					DSA_MAX_NUM_OFFLOADING_BRIDGES, 1);
+	if (bridge_num >= max)
+		return 0;
+	set_bit(bridge_num, &dsa_fwd_offloading_bridges);
+	return bridge_num;
+}
+
+void dsa_bridge_num_put(const struct net_device *bridge_dev,
+ unsigned int bridge_num)
+{
+ /* Since we refcount bridges, we know that when we call this function
+ * it is no longer in use, so we can just go ahead and remove it from
+ * the bit mask.
+ */
+ clear_bit(bridge_num, &dsa_fwd_offloading_bridges);
+}
+
+/* Return the switch with index @sw_index that has a port in tree @tree_index,
+ * or NULL when no tree on dsa_tree_list contains such a switch.
+ */
+struct dsa_switch *dsa_switch_find(int tree_index, int sw_index)
+{
+	struct dsa_switch_tree *dst;
+	struct dsa_port *dp;
+
+	list_for_each_entry(dst, &dsa_tree_list, list) {
+		if (dst->index != tree_index)
+			continue;
+
+		list_for_each_entry(dp, &dst->ports, list)
+			if (dp->ds->index == sw_index)
+				return dp->ds;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(dsa_switch_find);
+
+static struct dsa_switch_tree *dsa_tree_find(int index)
+{
+ struct dsa_switch_tree *dst;
+
+ list_for_each_entry(dst, &dsa_tree_list, list)
+ if (dst->index == index)
+ return dst;
+
+ return NULL;
+}
+
+static struct dsa_switch_tree *dsa_tree_alloc(int index)
+{
+ struct dsa_switch_tree *dst;
+
+ dst = kzalloc(sizeof(*dst), GFP_KERNEL);
+ if (!dst)
+ return NULL;
+
+ dst->index = index;
+
+ INIT_LIST_HEAD(&dst->rtable);
+
+ INIT_LIST_HEAD(&dst->ports);
+
+ INIT_LIST_HEAD(&dst->list);
+ list_add_tail(&dst->list, &dsa_tree_list);
+
+ kref_init(&dst->refcount);
+
+ return dst;
+}
+
+static void dsa_tree_free(struct dsa_switch_tree *dst)
+{
+ if (dst->tag_ops)
+ dsa_tag_driver_put(dst->tag_ops);
+ list_del(&dst->list);
+ kfree(dst);
+}
+
+static struct dsa_switch_tree *dsa_tree_get(struct dsa_switch_tree *dst)
+{
+ if (dst)
+ kref_get(&dst->refcount);
+
+ return dst;
+}
+
+/* Look up tree @index and take a reference, or allocate it if not found */
+static struct dsa_switch_tree *dsa_tree_touch(int index)
+{
+	struct dsa_switch_tree *dst = dsa_tree_find(index);
+
+	if (!dst)
+		return dsa_tree_alloc(index);
+
+	return dsa_tree_get(dst);
+}
+
+static void dsa_tree_release(struct kref *ref)
+{
+ struct dsa_switch_tree *dst;
+
+ dst = container_of(ref, struct dsa_switch_tree, refcount);
+
+ dsa_tree_free(dst);
+}
+
+static void dsa_tree_put(struct dsa_switch_tree *dst)
+{
+ if (dst)
+ kref_put(&dst->refcount, dsa_tree_release);
+}
+
+static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst,
+ struct device_node *dn)
+{
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dp->dn == dn)
+ return dp;
+
+ return NULL;
+}
+
+static struct dsa_link *dsa_link_touch(struct dsa_port *dp,
+ struct dsa_port *link_dp)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_switch_tree *dst;
+ struct dsa_link *dl;
+
+ dst = ds->dst;
+
+ list_for_each_entry(dl, &dst->rtable, list)
+ if (dl->dp == dp && dl->link_dp == link_dp)
+ return dl;
+
+ dl = kzalloc(sizeof(*dl), GFP_KERNEL);
+ if (!dl)
+ return NULL;
+
+ dl->dp = dp;
+ dl->link_dp = link_dp;
+
+ INIT_LIST_HEAD(&dl->list);
+ list_add_tail(&dl->list, &dst->rtable);
+
+ return dl;
+}
+
+/* Record a routing table entry for each "link" phandle found in the device
+ * tree node of @dp.
+ *
+ * Returns true when every referenced port was found in the tree and linked,
+ * false when a port is missing from the tree or an entry can't be allocated.
+ */
+static bool dsa_port_setup_routing_table(struct dsa_port *dp)
+{
+	struct dsa_switch_tree *dst = dp->ds->dst;
+	struct device_node *dn = dp->dn;
+	struct of_phandle_iterator it;
+	struct dsa_port *link_dp;
+	int err;
+
+	of_for_each_phandle(&it, err, dn, "link", NULL, 0) {
+		link_dp = dsa_tree_find_port_by_node(dst, it.node);
+		if (link_dp && dsa_link_touch(dp, link_dp))
+			continue;
+
+		/* Bailing out mid-iteration: release the current node */
+		of_node_put(it.node);
+		return false;
+	}
+
+	return true;
+}
+
+static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst)
+{
+ bool complete = true;
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dsa_port_is_dsa(dp)) {
+ complete = dsa_port_setup_routing_table(dp);
+ if (!complete)
+ break;
+ }
+ }
+
+ return complete;
+}
+
+static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_is_cpu(dp))
+ return dp;
+
+ return NULL;
+}
+
+struct net_device *dsa_tree_find_first_conduit(struct dsa_switch_tree *dst)
+{
+	struct dsa_port *cpu_dp = dsa_tree_find_first_cpu(dst);
+	struct device_node *ethernet;
+	struct net_device *conduit;
+
+	if (!cpu_dp) /* no CPU port in this tree, hence no conduit either */
+		return NULL;
+	ethernet = of_parse_phandle(cpu_dp->dn, "ethernet", 0);
+	conduit = of_find_net_device_by_node(ethernet);
+	of_node_put(ethernet);
+	return conduit;
+}
+
+/* Assign the default CPU port (the first one in the tree) to all ports of the
+ * fabric which don't already have one as part of their own switch.
+ */
+static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *cpu_dp, *dp;
+
+ cpu_dp = dsa_tree_find_first_cpu(dst);
+ if (!cpu_dp) {
+ pr_err("DSA: tree %d has no CPU port\n", dst->index);
+ return -EINVAL;
+ }
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dp->cpu_dp)
+ continue;
+
+ if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
+ dp->cpu_dp = cpu_dp;
+ }
+
+ return 0;
+}
+
+static struct dsa_port *
+dsa_switch_preferred_default_local_cpu_port(struct dsa_switch *ds)
+{
+ struct dsa_port *cpu_dp;
+
+ if (!ds->ops->preferred_default_local_cpu_port)
+ return NULL;
+
+ cpu_dp = ds->ops->preferred_default_local_cpu_port(ds);
+ if (!cpu_dp)
+ return NULL;
+
+ if (WARN_ON(!dsa_port_is_cpu(cpu_dp) || cpu_dp->ds != ds))
+ return NULL;
+
+ return cpu_dp;
+}
+
+/* Perform initial assignment of CPU ports to user ports and DSA links in the
+ * fabric, giving preference to CPU ports local to each switch. Default to
+ * using the first CPU port in the switch tree if the port does not have a CPU
+ * port local to this switch.
+ */
+static int dsa_tree_setup_cpu_ports(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *preferred_cpu_dp, *cpu_dp, *dp;
+
+ list_for_each_entry(cpu_dp, &dst->ports, list) {
+ if (!dsa_port_is_cpu(cpu_dp))
+ continue;
+
+ preferred_cpu_dp = dsa_switch_preferred_default_local_cpu_port(cpu_dp->ds);
+ if (preferred_cpu_dp && preferred_cpu_dp != cpu_dp)
+ continue;
+
+ /* Prefer a local CPU port */
+ dsa_switch_for_each_port(dp, cpu_dp->ds) {
+ /* Prefer the first local CPU port found */
+ if (dp->cpu_dp)
+ continue;
+
+ if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
+ dp->cpu_dp = cpu_dp;
+ }
+ }
+
+ return dsa_tree_setup_default_cpu(dst);
+}
+
+static void dsa_tree_teardown_cpu_ports(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
+ dp->cpu_dp = NULL;
+}
+
+static int dsa_port_setup(struct dsa_port *dp)
+{
+ bool dsa_port_link_registered = false;
+ struct dsa_switch *ds = dp->ds;
+ bool dsa_port_enabled = false;
+ int err = 0;
+
+ if (dp->setup)
+ return 0;
+
+ err = dsa_port_devlink_setup(dp);
+ if (err)
+ return err;
+
+ switch (dp->type) {
+ case DSA_PORT_TYPE_UNUSED:
+ dsa_port_disable(dp);
+ break;
+ case DSA_PORT_TYPE_CPU:
+ if (dp->dn) {
+ err = dsa_shared_port_link_register_of(dp);
+ if (err)
+ break;
+ dsa_port_link_registered = true;
+ } else {
+ dev_warn(ds->dev,
+ "skipping link registration for CPU port %d\n",
+ dp->index);
+ }
+
+ err = dsa_port_enable(dp, NULL);
+ if (err)
+ break;
+ dsa_port_enabled = true;
+
+ break;
+ case DSA_PORT_TYPE_DSA:
+ if (dp->dn) {
+ err = dsa_shared_port_link_register_of(dp);
+ if (err)
+ break;
+ dsa_port_link_registered = true;
+ } else {
+ dev_warn(ds->dev,
+ "skipping link registration for DSA port %d\n",
+ dp->index);
+ }
+
+ err = dsa_port_enable(dp, NULL);
+ if (err)
+ break;
+ dsa_port_enabled = true;
+
+ break;
+ case DSA_PORT_TYPE_USER:
+ of_get_mac_address(dp->dn, dp->mac);
+ err = dsa_user_create(dp);
+ break;
+ }
+
+ if (err && dsa_port_enabled)
+ dsa_port_disable(dp);
+ if (err && dsa_port_link_registered)
+ dsa_shared_port_link_unregister_of(dp);
+ if (err) {
+ dsa_port_devlink_teardown(dp);
+ return err;
+ }
+
+ dp->setup = true;
+
+ return 0;
+}
+
+/* Undo dsa_port_setup(); a port that was never set up is left alone. Shared
+ * (CPU and DSA) ports are disabled and, if they have a device tree node, get
+ * their link unregistered; a user port has its dp->user destroyed.
+ */
+static void dsa_port_teardown(struct dsa_port *dp)
+{
+	if (!dp->setup)
+		return;
+
+	switch (dp->type) {
+	case DSA_PORT_TYPE_UNUSED:
+		break;
+	case DSA_PORT_TYPE_CPU:
+	case DSA_PORT_TYPE_DSA:
+		dsa_port_disable(dp);
+		if (dp->dn)
+			dsa_shared_port_link_unregister_of(dp);
+		break;
+	case DSA_PORT_TYPE_USER:
+		if (dp->user) {
+			dsa_user_destroy(dp->user);
+			dp->user = NULL;
+		}
+		break;
+	}
+
+	dsa_port_devlink_teardown(dp);
+
+	dp->setup = false;
+}
+
+static int dsa_port_setup_as_unused(struct dsa_port *dp)
+{
+ dp->type = DSA_PORT_TYPE_UNUSED;
+ return dsa_port_setup(dp);
+}
+
+static int dsa_switch_setup_tag_protocol(struct dsa_switch *ds)
+{
+ const struct dsa_device_ops *tag_ops = ds->dst->tag_ops;
+ struct dsa_switch_tree *dst = ds->dst;
+ int err;
+
+ if (tag_ops->proto == dst->default_proto)
+ goto connect;
+
+ rtnl_lock();
+ err = ds->ops->change_tag_protocol(ds, tag_ops->proto);
+ rtnl_unlock();
+ if (err) {
+ dev_err(ds->dev, "Unable to use tag protocol \"%s\": %pe\n",
+ tag_ops->name, ERR_PTR(err));
+ return err;
+ }
+
+connect:
+ if (tag_ops->connect) {
+ err = tag_ops->connect(ds);
+ if (err)
+ return err;
+ }
+
+ if (ds->ops->connect_tag_protocol) {
+ err = ds->ops->connect_tag_protocol(ds, tag_ops->proto);
+ if (err) {
+ dev_err(ds->dev,
+ "Unable to connect to tag protocol \"%s\": %pe\n",
+ tag_ops->name, ERR_PTR(err));
+ goto disconnect;
+ }
+ }
+
+ return 0;
+
+disconnect:
+ if (tag_ops->disconnect)
+ tag_ops->disconnect(ds);
+
+ return err;
+}
+
+static void dsa_switch_teardown_tag_protocol(struct dsa_switch *ds)
+{
+ const struct dsa_device_ops *tag_ops = ds->dst->tag_ops;
+
+ if (tag_ops->disconnect)
+ tag_ops->disconnect(ds);
+}
+
+/* One-time setup of @ds: devlink instance, notifier, the driver's ->setup(),
+ * tagging protocol and, when the driver has ->phy_read but no bus of its own,
+ * the user MDIO bus. On failure, completed steps are undone in reverse order.
+ */
+static int dsa_switch_setup(struct dsa_switch *ds)
+{
+	int err;
+
+	if (ds->setup)
+		return 0;
+
+	/* Driver setup and the user MDIO bus both rely on phys_mii_mask */
+	ds->phys_mii_mask |= dsa_user_ports(ds);
+
+	err = dsa_switch_devlink_alloc(ds);
+	if (err)
+		return err;
+
+	err = dsa_switch_register_notifier(ds);
+	if (err)
+		goto devlink_free;
+
+	ds->configure_vlan_while_not_filtering = true;
+
+	err = ds->ops->setup(ds);
+	if (err < 0)
+		goto unregister_notifier;
+
+	err = dsa_switch_setup_tag_protocol(ds);
+	if (err)
+		goto teardown;
+
+	if (!ds->user_mii_bus && ds->ops->phy_read) {
+		ds->user_mii_bus = mdiobus_alloc();
+		if (!ds->user_mii_bus) {
+			err = -ENOMEM;
+			goto disconnect_tag_protocol;
+		}
+		dsa_user_mii_bus_init(ds);
+		err = mdiobus_register(ds->user_mii_bus);
+		if (err < 0)
+			goto free_user_mii_bus;
+	}
+
+	dsa_switch_devlink_register(ds);
+	ds->setup = true;
+	return 0;
+
+free_user_mii_bus:
+	/* Only reached for the bus allocated above; don't leave it dangling */
+	mdiobus_free(ds->user_mii_bus);
+	ds->user_mii_bus = NULL;
+disconnect_tag_protocol:
+	dsa_switch_teardown_tag_protocol(ds);
+teardown:
+	if (ds->ops->teardown)
+		ds->ops->teardown(ds);
+unregister_notifier:
+	dsa_switch_unregister_notifier(ds);
+devlink_free:
+	dsa_switch_devlink_free(ds);
+	return err;
+}
+
+static void dsa_switch_teardown(struct dsa_switch *ds)
+{
+ if (!ds->setup)
+ return;
+
+ dsa_switch_devlink_unregister(ds);
+
+ if (ds->user_mii_bus && ds->ops->phy_read) {
+ mdiobus_unregister(ds->user_mii_bus);
+ mdiobus_free(ds->user_mii_bus);
+ ds->user_mii_bus = NULL;
+ }
+
+ dsa_switch_teardown_tag_protocol(ds);
+
+ if (ds->ops->teardown)
+ ds->ops->teardown(ds);
+
+ dsa_switch_unregister_notifier(ds);
+
+ dsa_switch_devlink_free(ds);
+
+ ds->setup = false;
+}
+
+/* First tear down the non-shared, then the shared ports. This ensures that
+ * all work items scheduled by our switchdev handlers for user ports have
+ * completed before we destroy the refcounting kept on the shared ports.
+ */
+static void dsa_tree_teardown_ports(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_is_user(dp) || dsa_port_is_unused(dp))
+ dsa_port_teardown(dp);
+
+ dsa_flush_workqueue();
+
+ list_for_each_entry(dp, &dst->ports, list)
+ if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp))
+ dsa_port_teardown(dp);
+}
+
+static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *dp;
+
+ list_for_each_entry(dp, &dst->ports, list)
+ dsa_switch_teardown(dp->ds);
+}
+
+/* Bring shared ports up first, then non-shared ports */
+static int dsa_tree_setup_ports(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *dp;
+ int err = 0;
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp)) {
+ err = dsa_port_setup(dp);
+ if (err)
+ goto teardown;
+ }
+ }
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dsa_port_is_user(dp) || dsa_port_is_unused(dp)) {
+ err = dsa_port_setup(dp);
+ if (err) {
+ err = dsa_port_setup_as_unused(dp);
+ if (err)
+ goto teardown;
+ }
+ }
+ }
+
+ return 0;
+
+teardown:
+ dsa_tree_teardown_ports(dst);
+
+ return err;
+}
+
+static int dsa_tree_setup_switches(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *dp;
+ int err = 0;
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ err = dsa_switch_setup(dp->ds);
+ if (err) {
+ dsa_tree_teardown_switches(dst);
+ break;
+ }
+ }
+
+ return err;
+}
+
+static int dsa_tree_setup_conduit(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *cpu_dp;
+ int err = 0;
+
+ rtnl_lock();
+
+ dsa_tree_for_each_cpu_port(cpu_dp, dst) {
+ struct net_device *conduit = cpu_dp->conduit;
+ bool admin_up = (conduit->flags & IFF_UP) &&
+ !qdisc_tx_is_noop(conduit);
+
+ err = dsa_conduit_setup(conduit, cpu_dp);
+ if (err)
+ break;
+
+ /* Replay conduit state event */
+ dsa_tree_conduit_admin_state_change(dst, conduit, admin_up);
+ dsa_tree_conduit_oper_state_change(dst, conduit,
+ netif_oper_up(conduit));
+ }
+
+ rtnl_unlock();
+
+ return err;
+}
+
+static void dsa_tree_teardown_conduit(struct dsa_switch_tree *dst)
+{
+ struct dsa_port *cpu_dp;
+
+ rtnl_lock();
+
+ dsa_tree_for_each_cpu_port(cpu_dp, dst) {
+ struct net_device *conduit = cpu_dp->conduit;
+
+ /* Synthesizing an "admin down" state is sufficient for
+ * the switches to get a notification if the conduit is
+ * currently up and running.
+ */
+ dsa_tree_conduit_admin_state_change(dst, conduit, false);
+
+ dsa_conduit_teardown(conduit);
+ }
+
+ rtnl_unlock();
+}
+
+static int dsa_tree_setup_lags(struct dsa_switch_tree *dst)
+{
+ unsigned int len = 0;
+ struct dsa_port *dp;
-const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol)
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (dp->ds->num_lag_ids > len)
+ len = dp->ds->num_lag_ids;
+ }
+
+ if (!len)
+ return 0;
+
+ dst->lags = kcalloc(len, sizeof(*dst->lags), GFP_KERNEL);
+ if (!dst->lags)
+ return -ENOMEM;
+
+ dst->lags_len = len;
+ return 0;
+}
+
+static void dsa_tree_teardown_lags(struct dsa_switch_tree *dst)
{
- const struct dsa_device_ops *ops;
+ kfree(dst->lags);
+}
- if (tag_protocol >= DSA_TAG_LAST)
- return ERR_PTR(-EINVAL);
- ops = dsa_device_ops[tag_protocol];
+static void dsa_tree_teardown_routing_table(struct dsa_switch_tree *dst)
+{
+ struct dsa_link *dl, *next;
+
+ list_for_each_entry_safe(dl, next, &dst->rtable, list) {
+ list_del(&dl->list);
+ kfree(dl);
+ }
+}
+
+static int dsa_tree_setup(struct dsa_switch_tree *dst)
+{
+ bool complete;
+ int err;
+
+ if (dst->setup) {
+ pr_err("DSA: tree %d already setup! Disjoint trees?\n",
+ dst->index);
+ return -EEXIST;
+ }
+
+ complete = dsa_tree_setup_routing_table(dst);
+ if (!complete)
+ return 0;
+
+ err = dsa_tree_setup_cpu_ports(dst);
+ if (err)
+ goto teardown_rtable;
+
+ err = dsa_tree_setup_switches(dst);
+ if (err)
+ goto teardown_cpu_ports;
+
+ err = dsa_tree_setup_ports(dst);
+ if (err)
+ goto teardown_switches;
+
+ err = dsa_tree_setup_conduit(dst);
+ if (err)
+ goto teardown_ports;
+
+ err = dsa_tree_setup_lags(dst);
+ if (err)
+ goto teardown_conduit;
+
+ dst->setup = true;
+
+ pr_info("DSA: tree %d setup\n", dst->index);
+
+ return 0;
- if (!ops)
- return ERR_PTR(-ENOPROTOOPT);
+teardown_conduit:
+ dsa_tree_teardown_conduit(dst);
+teardown_ports:
+ dsa_tree_teardown_ports(dst);
+teardown_switches:
+ dsa_tree_teardown_switches(dst);
+teardown_cpu_ports:
+ dsa_tree_teardown_cpu_ports(dst);
+teardown_rtable:
+ dsa_tree_teardown_routing_table(dst);
- return ops;
+ return err;
}
-static int dev_is_class(struct device *dev, void *class)
+static void dsa_tree_teardown(struct dsa_switch_tree *dst)
+{
+ if (!dst->setup)
+ return;
+
+ dsa_tree_teardown_lags(dst);
+
+ dsa_tree_teardown_conduit(dst);
+
+ dsa_tree_teardown_ports(dst);
+
+ dsa_tree_teardown_switches(dst);
+
+ dsa_tree_teardown_cpu_ports(dst);
+
+ dsa_tree_teardown_routing_table(dst);
+
+ pr_info("DSA: tree %d torn down\n", dst->index);
+
+ dst->setup = false;
+}
+
+static int dsa_tree_bind_tag_proto(struct dsa_switch_tree *dst,
+ const struct dsa_device_ops *tag_ops)
+{
+ const struct dsa_device_ops *old_tag_ops = dst->tag_ops;
+ struct dsa_notifier_tag_proto_info info;
+ int err;
+
+ dst->tag_ops = tag_ops;
+
+ /* Notify the switches from this tree about the connection
+ * to the new tagger
+ */
+ info.tag_ops = tag_ops;
+ err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_CONNECT, &info);
+ if (err && err != -EOPNOTSUPP)
+ goto out_disconnect;
+
+ /* Notify the old tagger about the disconnection from this tree */
+ info.tag_ops = old_tag_ops;
+ dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info);
+
+ return 0;
+
+out_disconnect:
+ info.tag_ops = tag_ops;
+ dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO_DISCONNECT, &info);
+ dst->tag_ops = old_tag_ops;
+
+ return err;
+}
+
+/* Since the dsa/tagging sysfs device attribute is per conduit, the assumption
+ * is that all DSA switches within a tree share the same tagger, otherwise
+ * they would have formed disjoint trees (different "dsa,member" values).
+ */
+int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst,
+ const struct dsa_device_ops *tag_ops,
+ const struct dsa_device_ops *old_tag_ops)
+{
+ struct dsa_notifier_tag_proto_info info;
+ struct dsa_port *dp;
+ int err = -EBUSY;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ /* At the moment we don't allow changing the tag protocol under
+ * traffic. The rtnl_mutex also happens to serialize concurrent
+ * attempts to change the tagging protocol. If we ever lift the IFF_UP
+ * restriction, there needs to be another mutex which serializes this.
+ */
+ dsa_tree_for_each_user_port(dp, dst) {
+ if (dsa_port_to_conduit(dp)->flags & IFF_UP)
+ goto out_unlock;
+
+ if (dp->user->flags & IFF_UP)
+ goto out_unlock;
+ }
+
+ /* Notify the tag protocol change */
+ info.tag_ops = tag_ops;
+ err = dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info);
+ if (err)
+ goto out_unwind_tagger;
+
+ err = dsa_tree_bind_tag_proto(dst, tag_ops);
+ if (err)
+ goto out_unwind_tagger;
+
+ rtnl_unlock();
+
+ return 0;
+
+out_unwind_tagger:
+ info.tag_ops = old_tag_ops;
+ dsa_tree_notify(dst, DSA_NOTIFIER_TAG_PROTO, &info);
+out_unlock:
+ rtnl_unlock();
+ return err;
+}
+
+static void dsa_tree_conduit_state_change(struct dsa_switch_tree *dst,
+ struct net_device *conduit)
+{
+ struct dsa_notifier_conduit_state_info info;
+ struct dsa_port *cpu_dp = conduit->dsa_ptr;
+
+ info.conduit = conduit;
+ info.operational = dsa_port_conduit_is_operational(cpu_dp);
+
+ dsa_tree_notify(dst, DSA_NOTIFIER_CONDUIT_STATE_CHANGE, &info);
+}
+
+void dsa_tree_conduit_admin_state_change(struct dsa_switch_tree *dst,
+ struct net_device *conduit,
+ bool up)
+{
+ struct dsa_port *cpu_dp = conduit->dsa_ptr;
+ bool notify = false;
+
+ /* Don't keep track of admin state on LAG DSA conduits,
+ * but rather just of physical DSA conduits
+ */
+ if (netif_is_lag_master(conduit))
+ return;
+
+ if ((dsa_port_conduit_is_operational(cpu_dp)) !=
+ (up && cpu_dp->conduit_oper_up))
+ notify = true;
+
+ cpu_dp->conduit_admin_up = up;
+
+ if (notify)
+ dsa_tree_conduit_state_change(dst, conduit);
+}
+
+void dsa_tree_conduit_oper_state_change(struct dsa_switch_tree *dst,
+ struct net_device *conduit,
+ bool up)
+{
+ struct dsa_port *cpu_dp = conduit->dsa_ptr;
+ bool notify = false;
+
+ /* Don't keep track of oper state on LAG DSA conduits,
+ * but rather just of physical DSA conduits
+ */
+ if (netif_is_lag_master(conduit))
+ return;
+
+ if ((dsa_port_conduit_is_operational(cpu_dp)) !=
+ (cpu_dp->conduit_admin_up && up))
+ notify = true;
+
+ cpu_dp->conduit_oper_up = up;
+
+ if (notify)
+ dsa_tree_conduit_state_change(dst, conduit);
+}
+
+static struct dsa_port *dsa_port_touch(struct dsa_switch *ds, int index)
+{
+ struct dsa_switch_tree *dst = ds->dst;
+ struct dsa_port *dp;
+
+ dsa_switch_for_each_port(dp, ds)
+ if (dp->index == index)
+ return dp;
+
+ dp = kzalloc(sizeof(*dp), GFP_KERNEL);
+ if (!dp)
+ return NULL;
+
+ dp->ds = ds;
+ dp->index = index;
+
+ mutex_init(&dp->addr_lists_lock);
+ mutex_init(&dp->vlans_lock);
+ INIT_LIST_HEAD(&dp->fdbs);
+ INIT_LIST_HEAD(&dp->mdbs);
+ INIT_LIST_HEAD(&dp->vlans); /* also initializes &dp->user_vlans */
+ INIT_LIST_HEAD(&dp->list);
+ list_add_tail(&dp->list, &dst->ports);
+
+ return dp;
+}
+
+static int dsa_port_parse_user(struct dsa_port *dp, const char *name)
+{
+ dp->type = DSA_PORT_TYPE_USER;
+ dp->name = name;
+
+ return 0;
+}
+
+static int dsa_port_parse_dsa(struct dsa_port *dp)
+{
+ dp->type = DSA_PORT_TYPE_DSA;
+
+ return 0;
+}
+
+static enum dsa_tag_protocol dsa_get_tag_protocol(struct dsa_port *dp,
+ struct net_device *conduit)
+{
+ enum dsa_tag_protocol tag_protocol = DSA_TAG_PROTO_NONE;
+ struct dsa_switch *mds, *ds = dp->ds;
+ unsigned int mdp_upstream;
+ struct dsa_port *mdp;
+
+ /* It is possible to stack DSA switches onto one another when that
+ * happens the switch driver may want to know if its tagging protocol
+ * is going to work in such a configuration.
+ */
+ if (dsa_user_dev_check(conduit)) {
+ mdp = dsa_user_to_port(conduit);
+ mds = mdp->ds;
+ mdp_upstream = dsa_upstream_port(mds, mdp->index);
+ tag_protocol = mds->ops->get_tag_protocol(mds, mdp_upstream,
+ DSA_TAG_PROTO_NONE);
+ }
+
+ /* If the conduit device is not itself a DSA user in a disjoint DSA
+ * tree, then return immediately.
+ */
+ return ds->ops->get_tag_protocol(ds, dp->index, tag_protocol);
+}
+
+static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *conduit,
+ const char *user_protocol)
+{
+ const struct dsa_device_ops *tag_ops = NULL;
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_switch_tree *dst = ds->dst;
+ enum dsa_tag_protocol default_proto;
+
+ /* Find out which protocol the switch would prefer. */
+ default_proto = dsa_get_tag_protocol(dp, conduit);
+ if (dst->default_proto) {
+ if (dst->default_proto != default_proto) {
+ dev_err(ds->dev,
+ "A DSA switch tree can have only one tagging protocol\n");
+ return -EINVAL;
+ }
+ } else {
+ dst->default_proto = default_proto;
+ }
+
+ /* See if the user wants to override that preference. */
+ if (user_protocol) {
+ if (!ds->ops->change_tag_protocol) {
+ dev_err(ds->dev, "Tag protocol cannot be modified\n");
+ return -EINVAL;
+ }
+
+ tag_ops = dsa_tag_driver_get_by_name(user_protocol);
+ if (IS_ERR(tag_ops)) {
+ dev_warn(ds->dev,
+ "Failed to find a tagging driver for protocol %s, using default\n",
+ user_protocol);
+ tag_ops = NULL;
+ }
+ }
+
+ if (!tag_ops)
+ tag_ops = dsa_tag_driver_get_by_id(default_proto);
+
+ if (IS_ERR(tag_ops)) {
+ if (PTR_ERR(tag_ops) == -ENOPROTOOPT)
+ return -EPROBE_DEFER;
+
+ dev_warn(ds->dev, "No tagger for this switch\n");
+ return PTR_ERR(tag_ops);
+ }
+
+ if (dst->tag_ops) {
+ if (dst->tag_ops != tag_ops) {
+ dev_err(ds->dev,
+ "A DSA switch tree can have only one tagging protocol\n");
+
+ dsa_tag_driver_put(tag_ops);
+ return -EINVAL;
+ }
+
+ /* In the case of multiple CPU ports per switch, the tagging
+ * protocol is still reference-counted only per switch tree.
+ */
+ dsa_tag_driver_put(tag_ops);
+ } else {
+ dst->tag_ops = tag_ops;
+ }
+
+ dp->conduit = conduit;
+ dp->type = DSA_PORT_TYPE_CPU;
+ dsa_port_set_tag_protocol(dp, dst->tag_ops);
+ dp->dst = dst;
+
+ /* At this point, the tree may be configured to use a different
+ * tagger than the one chosen by the switch driver during
+ * .setup, in the case when a user selects a custom protocol
+ * through the DT.
+ *
+ * This is resolved by syncing the driver with the tree in
+ * dsa_switch_setup_tag_protocol once .setup has run and the
+ * driver is ready to accept calls to .change_tag_protocol. If
+ * the driver does not support the custom protocol at that
+ * point, the tree is wholly rejected, thereby ensuring that the
+ * tree and driver are always in agreement on the protocol to
+ * use.
+ */
+ return 0;
+}
+
+static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn)
+{
+ struct device_node *ethernet = of_parse_phandle(dn, "ethernet", 0);
+ const char *name = of_get_property(dn, "label", NULL);
+ bool link = of_property_read_bool(dn, "link");
+
+ dp->dn = dn;
+
+ if (ethernet) {
+ struct net_device *conduit;
+ const char *user_protocol;
+
+ conduit = of_find_net_device_by_node(ethernet);
+ of_node_put(ethernet);
+ if (!conduit)
+ return -EPROBE_DEFER;
+
+ user_protocol = of_get_property(dn, "dsa-tag-protocol", NULL);
+ return dsa_port_parse_cpu(dp, conduit, user_protocol);
+ }
+
+ if (link)
+ return dsa_port_parse_dsa(dp);
+
+ return dsa_port_parse_user(dp, name);
+}
+
+static int dsa_switch_parse_ports_of(struct dsa_switch *ds,
+ struct device_node *dn)
+{
+ struct device_node *ports, *port;
+ struct dsa_port *dp;
+ int err = 0;
+ u32 reg;
+
+ ports = of_get_child_by_name(dn, "ports");
+ if (!ports) {
+ /* The second possibility is "ethernet-ports" */
+ ports = of_get_child_by_name(dn, "ethernet-ports");
+ if (!ports) {
+ dev_err(ds->dev, "no ports child node found\n");
+ return -EINVAL;
+ }
+ }
+
+ for_each_available_child_of_node(ports, port) {
+ err = of_property_read_u32(port, "reg", &reg);
+ if (err) {
+ of_node_put(port);
+ goto out_put_node;
+ }
+
+ if (reg >= ds->num_ports) {
+ dev_err(ds->dev, "port %pOF index %u exceeds num_ports (%u)\n",
+ port, reg, ds->num_ports);
+ of_node_put(port);
+ err = -EINVAL;
+ goto out_put_node;
+ }
+
+ dp = dsa_to_port(ds, reg);
+
+ err = dsa_port_parse_of(dp, port);
+ if (err) {
+ of_node_put(port);
+ goto out_put_node;
+ }
+ }
+
+out_put_node:
+ of_node_put(ports);
+ return err;
+}
+
+static int dsa_switch_parse_member_of(struct dsa_switch *ds,
+ struct device_node *dn)
+{
+ u32 m[2] = { 0, 0 };
+ int sz;
+
+ /* Don't error out if this optional property isn't found */
+ sz = of_property_read_variable_u32_array(dn, "dsa,member", m, 2, 2);
+ if (sz < 0 && sz != -EINVAL)
+ return sz;
+
+ ds->index = m[1];
+
+ ds->dst = dsa_tree_touch(m[0]);
+ if (!ds->dst)
+ return -ENOMEM;
+
+ if (dsa_switch_find(ds->dst->index, ds->index)) {
+ dev_err(ds->dev,
+ "A DSA switch with index %d already exists in tree %d\n",
+ ds->index, ds->dst->index);
+ return -EEXIST;
+ }
+
+ if (ds->dst->last_switch < ds->index)
+ ds->dst->last_switch = ds->index;
+
+ return 0;
+}
+
+static int dsa_switch_touch_ports(struct dsa_switch *ds)
+{
+ struct dsa_port *dp;
+ int port;
+
+ for (port = 0; port < ds->num_ports; port++) {
+ dp = dsa_port_touch(ds, port);
+ if (!dp)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn)
+{
+ int err;
+
+ err = dsa_switch_parse_member_of(ds, dn);
+ if (err)
+ return err;
+
+ err = dsa_switch_touch_ports(ds);
+ if (err)
+ return err;
+
+ return dsa_switch_parse_ports_of(ds, dn);
+}
+
+static int dev_is_class(struct device *dev, const void *class)
{
if (dev->class != NULL && !strcmp(dev->class->name, class))
return 1;
@@ -102,7 +1393,7 @@ static struct device *dev_find_class(struct device *parent, char *class)
return device_find_child(parent, class, dev_is_class);
}
-struct net_device *dsa_dev_to_net_device(struct device *dev)
+static struct net_device *dsa_dev_to_net_device(struct device *dev)
{
struct device *d;
@@ -119,99 +1410,257 @@ struct net_device *dsa_dev_to_net_device(struct device *dev)
return NULL;
}
-EXPORT_SYMBOL_GPL(dsa_dev_to_net_device);
-/* Determine if we should defer delivery of skb until we have a rx timestamp.
- *
- * Called from dsa_switch_rcv. For now, this will only work if tagging is
- * enabled on the switch. Normally the MAC driver would retrieve the hardware
- * timestamp when it reads the packet out of the hardware. However in a DSA
- * switch, the DSA driver owning the interface to which the packet is
- * delivered is never notified unless we do so here.
- */
-static bool dsa_skb_defer_rx_timestamp(struct dsa_slave_priv *p,
- struct sk_buff *skb)
+static int dsa_port_parse(struct dsa_port *dp, const char *name,
+ struct device *dev)
{
- struct dsa_switch *ds = p->dp->ds;
- unsigned int type;
+ if (!strcmp(name, "cpu")) {
+ struct net_device *conduit;
- if (skb_headroom(skb) < ETH_HLEN)
- return false;
+ conduit = dsa_dev_to_net_device(dev);
+ if (!conduit)
+ return -EPROBE_DEFER;
- __skb_push(skb, ETH_HLEN);
+ dev_put(conduit);
- type = ptp_classify_raw(skb);
+ return dsa_port_parse_cpu(dp, conduit, NULL);
+ }
- __skb_pull(skb, ETH_HLEN);
+ if (!strcmp(name, "dsa"))
+ return dsa_port_parse_dsa(dp);
- if (type == PTP_CLASS_NONE)
- return false;
+ return dsa_port_parse_user(dp, name);
+}
+
+static int dsa_switch_parse_ports(struct dsa_switch *ds,
+ struct dsa_chip_data *cd)
+{
+ bool valid_name_found = false;
+ struct dsa_port *dp;
+ struct device *dev;
+ const char *name;
+ unsigned int i;
+ int err;
- if (likely(ds->ops->port_rxtstamp))
- return ds->ops->port_rxtstamp(ds, p->dp->index, skb, type);
+ for (i = 0; i < DSA_MAX_PORTS; i++) {
+ name = cd->port_names[i];
+ dev = cd->netdev[i];
+ dp = dsa_to_port(ds, i);
- return false;
+ if (!name)
+ continue;
+
+ err = dsa_port_parse(dp, name, dev);
+ if (err)
+ return err;
+
+ valid_name_found = true;
+ }
+
+ if (!valid_name_found && i == DSA_MAX_PORTS)
+ return -EINVAL;
+
+ return 0;
}
-static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt, struct net_device *unused)
+static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd)
{
- struct dsa_port *cpu_dp = dev->dsa_ptr;
- struct sk_buff *nskb = NULL;
- struct pcpu_sw_netstats *s;
- struct dsa_slave_priv *p;
+ int err;
- if (unlikely(!cpu_dp)) {
- kfree_skb(skb);
- return 0;
+ ds->cd = cd;
+
+ /* We don't support interconnected switches nor multiple trees via
+ * platform data, so this is the unique switch of the tree.
+ */
+ ds->index = 0;
+ ds->dst = dsa_tree_touch(0);
+ if (!ds->dst)
+ return -ENOMEM;
+
+ err = dsa_switch_touch_ports(ds);
+ if (err)
+ return err;
+
+ return dsa_switch_parse_ports(ds, cd);
+}
+
+static void dsa_switch_release_ports(struct dsa_switch *ds)
+{
+ struct dsa_mac_addr *a, *tmp;
+ struct dsa_port *dp, *next;
+ struct dsa_vlan *v, *n;
+
+ dsa_switch_for_each_port_safe(dp, next, ds) {
+ /* These are either entries that upper layers lost track of
+ * (probably due to bugs), or installed through interfaces
+ * where one does not necessarily have to remove them, like
+ * ndo_dflt_fdb_add().
+ */
+ list_for_each_entry_safe(a, tmp, &dp->fdbs, list) {
+ dev_info(ds->dev,
+ "Cleaning up unicast address %pM vid %u from port %d\n",
+ a->addr, a->vid, dp->index);
+ list_del(&a->list);
+ kfree(a);
+ }
+
+ list_for_each_entry_safe(a, tmp, &dp->mdbs, list) {
+ dev_info(ds->dev,
+ "Cleaning up multicast address %pM vid %u from port %d\n",
+ a->addr, a->vid, dp->index);
+ list_del(&a->list);
+ kfree(a);
+ }
+
+ /* These are entries that upper layers have lost track of,
+ * probably due to bugs, but also due to dsa_port_do_vlan_del()
+ * having failed and the VLAN entry still lingering on.
+ */
+ list_for_each_entry_safe(v, n, &dp->vlans, list) {
+ dev_info(ds->dev,
+ "Cleaning up vid %u from port %d\n",
+ v->vid, dp->index);
+ list_del(&v->list);
+ kfree(v);
+ }
+
+ list_del(&dp->list);
+ kfree(dp);
}
+}
- skb = skb_unshare(skb, GFP_ATOMIC);
- if (!skb)
- return 0;
+static int dsa_switch_probe(struct dsa_switch *ds)
+{
+ struct dsa_switch_tree *dst;
+ struct dsa_chip_data *pdata;
+ struct device_node *np;
+ int err;
- nskb = cpu_dp->rcv(skb, dev, pt);
- if (!nskb) {
- kfree_skb(skb);
- return 0;
+ if (!ds->dev)
+ return -ENODEV;
+
+ pdata = ds->dev->platform_data;
+ np = ds->dev->of_node;
+
+ if (!ds->num_ports)
+ return -EINVAL;
+
+ if (np) {
+ err = dsa_switch_parse_of(ds, np);
+ if (err)
+ dsa_switch_release_ports(ds);
+ } else if (pdata) {
+ err = dsa_switch_parse(ds, pdata);
+ if (err)
+ dsa_switch_release_ports(ds);
+ } else {
+ err = -ENODEV;
}
- skb = nskb;
- p = netdev_priv(skb->dev);
- skb_push(skb, ETH_HLEN);
- skb->pkt_type = PACKET_HOST;
- skb->protocol = eth_type_trans(skb, skb->dev);
+ if (err)
+ return err;
- s = this_cpu_ptr(p->stats64);
- u64_stats_update_begin(&s->syncp);
- s->rx_packets++;
- s->rx_bytes += skb->len;
- u64_stats_update_end(&s->syncp);
+ dst = ds->dst;
+ dsa_tree_get(dst);
+ err = dsa_tree_setup(dst);
+ if (err) {
+ dsa_switch_release_ports(ds);
+ dsa_tree_put(dst);
+ }
- if (dsa_skb_defer_rx_timestamp(p, skb))
- return 0;
+ return err;
+}
- netif_receive_skb(skb);
+int dsa_register_switch(struct dsa_switch *ds)
+{
+ int err;
- return 0;
+ mutex_lock(&dsa2_mutex);
+ err = dsa_switch_probe(ds);
+ dsa_tree_put(ds->dst);
+ mutex_unlock(&dsa2_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(dsa_register_switch);
+
+static void dsa_switch_remove(struct dsa_switch *ds)
+{
+ struct dsa_switch_tree *dst = ds->dst;
+
+ dsa_tree_teardown(dst);
+ dsa_switch_release_ports(ds);
+ dsa_tree_put(dst);
+}
+
+void dsa_unregister_switch(struct dsa_switch *ds)
+{
+ mutex_lock(&dsa2_mutex);
+ dsa_switch_remove(ds);
+ mutex_unlock(&dsa2_mutex);
}
+EXPORT_SYMBOL_GPL(dsa_unregister_switch);
+
+/* If the DSA conduit chooses to unregister its net_device on .shutdown, DSA is
+ * blocking that operation from completion, due to the dev_hold taken inside
+ * netdev_upper_dev_link. Unlink the DSA user interfaces from being uppers of
+ * the DSA conduit, so that the system can reboot successfully.
+ */
+void dsa_switch_shutdown(struct dsa_switch *ds)
+{
+ struct net_device *conduit, *user_dev;
+ LIST_HEAD(close_list);
+ struct dsa_port *dp;
+
+ mutex_lock(&dsa2_mutex);
+
+ if (!ds->setup)
+ goto out;
+
+ rtnl_lock();
+
+ dsa_switch_for_each_cpu_port(dp, ds)
+ list_add(&dp->conduit->close_list, &close_list);
+
+ netif_close_many(&close_list, true);
+
+ dsa_switch_for_each_user_port(dp, ds) {
+ conduit = dsa_port_to_conduit(dp);
+ user_dev = dp->user;
+
+ netif_device_detach(user_dev);
+ netdev_upper_dev_unlink(conduit, user_dev);
+ }
+
+ /* Disconnect from further netdevice notifiers on the conduit,
+ * since netdev_uses_dsa() will now return false.
+ */
+ dsa_switch_for_each_cpu_port(dp, ds)
+ dp->conduit->dsa_ptr = NULL;
+
+ rtnl_unlock();
+out:
+ mutex_unlock(&dsa2_mutex);
+}
+EXPORT_SYMBOL_GPL(dsa_switch_shutdown);
#ifdef CONFIG_PM_SLEEP
-static bool dsa_is_port_initialized(struct dsa_switch *ds, int p)
+static bool dsa_port_is_initialized(const struct dsa_port *dp)
{
- return dsa_is_user_port(ds, p) && ds->ports[p].slave;
+ return dp->type == DSA_PORT_TYPE_USER && dp->user;
}
int dsa_switch_suspend(struct dsa_switch *ds)
{
- int i, ret = 0;
+ struct dsa_port *dp;
+ int ret = 0;
- /* Suspend slave network devices */
- for (i = 0; i < ds->num_ports; i++) {
- if (!dsa_is_port_initialized(ds, i))
+ /* Suspend user network devices */
+ dsa_switch_for_each_port(dp, ds) {
+ if (!dsa_port_is_initialized(dp))
continue;
- ret = dsa_slave_suspend(ds->ports[i].slave);
+ ret = dsa_user_suspend(dp->user);
if (ret)
return ret;
}
@@ -225,7 +1674,8 @@ EXPORT_SYMBOL_GPL(dsa_switch_suspend);
int dsa_switch_resume(struct dsa_switch *ds)
{
- int i, ret = 0;
+ struct dsa_port *dp;
+ int ret = 0;
if (ds->ops->resume)
ret = ds->ops->resume(ds);
@@ -233,12 +1683,12 @@ int dsa_switch_resume(struct dsa_switch *ds)
if (ret)
return ret;
- /* Resume slave network devices */
- for (i = 0; i < ds->num_ports; i++) {
- if (!dsa_is_port_initialized(ds, i))
+ /* Resume user network devices */
+ dsa_switch_for_each_port(dp, ds) {
+ if (!dsa_port_is_initialized(dp))
continue;
- ret = dsa_slave_resume(ds->ports[i].slave);
+ ret = dsa_user_resume(dp->user);
if (ret)
return ret;
}
@@ -248,39 +1698,152 @@ int dsa_switch_resume(struct dsa_switch *ds)
EXPORT_SYMBOL_GPL(dsa_switch_resume);
#endif
-static struct packet_type dsa_pack_type __read_mostly = {
- .type = cpu_to_be16(ETH_P_XDSA),
- .func = dsa_switch_rcv,
-};
+struct dsa_port *dsa_port_from_netdev(struct net_device *netdev)
+{
+ if (!netdev || !dsa_user_dev_check(netdev))
+ return ERR_PTR(-ENODEV);
-static struct workqueue_struct *dsa_owq;
+ return dsa_user_to_port(netdev);
+}
+EXPORT_SYMBOL_GPL(dsa_port_from_netdev);
-bool dsa_schedule_work(struct work_struct *work)
+bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b)
{
- return queue_work(dsa_owq, work);
+ if (a->type != b->type)
+ return false;
+
+ switch (a->type) {
+ case DSA_DB_PORT:
+ return a->dp == b->dp;
+ case DSA_DB_LAG:
+ return a->lag.dev == b->lag.dev;
+ case DSA_DB_BRIDGE:
+ return a->bridge.num == b->bridge.num;
+ default:
+ WARN_ON(1);
+ return false;
+ }
+}
+
+bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port,
+ const unsigned char *addr, u16 vid,
+ struct dsa_db db)
+{
+ struct dsa_port *dp = dsa_to_port(ds, port);
+ struct dsa_mac_addr *a;
+
+ lockdep_assert_held(&dp->addr_lists_lock);
+
+ list_for_each_entry(a, &dp->fdbs, list) {
+ if (!ether_addr_equal(a->addr, addr) || a->vid != vid)
+ continue;
+
+ if (a->db.type == db.type && !dsa_db_equal(&a->db, &db))
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(dsa_fdb_present_in_other_db);
+
+bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port,
+ const struct switchdev_obj_port_mdb *mdb,
+ struct dsa_db db)
+{
+ struct dsa_port *dp = dsa_to_port(ds, port);
+ struct dsa_mac_addr *a;
+
+ lockdep_assert_held(&dp->addr_lists_lock);
+
+ list_for_each_entry(a, &dp->mdbs, list) {
+ if (!ether_addr_equal(a->addr, mdb->addr) || a->vid != mdb->vid)
+ continue;
+
+ if (a->db.type == db.type && !dsa_db_equal(&a->db, &db))
+ return true;
+ }
+
+ return false;
}
+EXPORT_SYMBOL_GPL(dsa_mdb_present_in_other_db);
+
+/* Helpers for switches without specific HSR offloads, but which can implement
+ * NETIF_F_HW_HSR_DUP because their tagger uses dsa_xmit_port_mask()
+ */
+int dsa_port_simple_hsr_validate(struct dsa_switch *ds, int port,
+ struct net_device *hsr,
+ struct netlink_ext_ack *extack)
+{
+ enum hsr_port_type type;
+ int err;
+
+ err = hsr_get_port_type(hsr, dsa_to_port(ds, port)->user, &type);
+ if (err)
+ return err;
+
+ if (type != HSR_PT_SLAVE_A && type != HSR_PT_SLAVE_B) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Only HSR slave ports can be offloaded");
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dsa_port_simple_hsr_validate);
+
+int dsa_port_simple_hsr_join(struct dsa_switch *ds, int port,
+ struct net_device *hsr,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_port *dp = dsa_to_port(ds, port), *other_dp;
+ int err;
+
+ err = dsa_port_simple_hsr_validate(ds, port, hsr, extack);
+ if (err)
+ return err;
-static ATOMIC_NOTIFIER_HEAD(dsa_notif_chain);
+ dsa_hsr_foreach_port(other_dp, ds, hsr) {
+ if (other_dp != dp) {
+ dp->user->features |= NETIF_F_HW_HSR_DUP;
+ other_dp->user->features |= NETIF_F_HW_HSR_DUP;
+ break;
+ }
+ }
-int register_dsa_notifier(struct notifier_block *nb)
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dsa_port_simple_hsr_join);
+
+int dsa_port_simple_hsr_leave(struct dsa_switch *ds, int port,
+ struct net_device *hsr)
{
- return atomic_notifier_chain_register(&dsa_notif_chain, nb);
+ struct dsa_port *dp = dsa_to_port(ds, port), *other_dp;
+
+ dsa_hsr_foreach_port(other_dp, ds, hsr) {
+ if (other_dp != dp) {
+ dp->user->features &= ~NETIF_F_HW_HSR_DUP;
+ other_dp->user->features &= ~NETIF_F_HW_HSR_DUP;
+ break;
+ }
+ }
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(register_dsa_notifier);
+EXPORT_SYMBOL_GPL(dsa_port_simple_hsr_leave);
+
+static const struct dsa_stubs __dsa_stubs = {
+ .conduit_hwtstamp_validate = __dsa_conduit_hwtstamp_validate,
+};
-int unregister_dsa_notifier(struct notifier_block *nb)
+static void dsa_register_stubs(void)
{
- return atomic_notifier_chain_unregister(&dsa_notif_chain, nb);
+ dsa_stubs = &__dsa_stubs;
}
-EXPORT_SYMBOL_GPL(unregister_dsa_notifier);
-int call_dsa_notifiers(unsigned long val, struct net_device *dev,
- struct dsa_notifier_info *info)
+static void dsa_unregister_stubs(void)
{
- info->dev = dev;
- return atomic_notifier_call_chain(&dsa_notif_chain, val, info);
+ dsa_stubs = NULL;
}
-EXPORT_SYMBOL_GPL(call_dsa_notifiers);
static int __init dsa_init_module(void)
{
@@ -291,25 +1854,38 @@ static int __init dsa_init_module(void)
if (!dsa_owq)
return -ENOMEM;
- rc = dsa_slave_register_notifier();
+ rc = dsa_user_register_notifier();
if (rc)
- return rc;
+ goto register_notifier_fail;
- rc = dsa_legacy_register();
+ dev_add_pack(&dsa_pack_type);
+
+ rc = rtnl_link_register(&dsa_link_ops);
if (rc)
- return rc;
+ goto netlink_register_fail;
- dev_add_pack(&dsa_pack_type);
+ dsa_register_stubs();
return 0;
+
+netlink_register_fail:
+ dsa_user_unregister_notifier();
+ dev_remove_pack(&dsa_pack_type);
+register_notifier_fail:
+ destroy_workqueue(dsa_owq);
+
+ return rc;
}
module_init(dsa_init_module);
static void __exit dsa_cleanup_module(void)
{
- dsa_slave_unregister_notifier();
+ dsa_unregister_stubs();
+
+ rtnl_link_unregister(&dsa_link_ops);
+
+ dsa_user_unregister_notifier();
dev_remove_pack(&dsa_pack_type);
- dsa_legacy_unregister();
destroy_workqueue(dsa_owq);
}
module_exit(dsa_cleanup_module);
@@ -318,3 +1894,4 @@ MODULE_AUTHOR("Lennert Buytenhek <buytenh@wantstofly.org>");
MODULE_DESCRIPTION("Driver for Distributed Switch Architecture switch chips");
MODULE_LICENSE("GPL");
MODULE_ALIAS("platform:dsa");
+MODULE_IMPORT_NS("NETDEV_INTERNAL");
diff --git a/net/dsa/dsa.h b/net/dsa/dsa.h
new file mode 100644
index 000000000000..3cc7823e9ef3
--- /dev/null
+++ b/net/dsa/dsa.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_H
+#define __DSA_H
+
+#include <linux/list.h>
+#include <linux/types.h>
+
+struct dsa_db;
+struct dsa_device_ops;
+struct dsa_lag;
+struct dsa_switch_tree;
+struct net_device;
+struct work_struct;
+
+extern struct list_head dsa_tree_list;
+
+bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b);
+bool dsa_schedule_work(struct work_struct *work);
+void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag);
+void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag);
+struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst,
+ const struct net_device *lag_dev);
+struct net_device *dsa_tree_find_first_conduit(struct dsa_switch_tree *dst);
+int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst,
+ const struct dsa_device_ops *tag_ops,
+ const struct dsa_device_ops *old_tag_ops);
+void dsa_tree_conduit_admin_state_change(struct dsa_switch_tree *dst,
+ struct net_device *conduit,
+ bool up);
+void dsa_tree_conduit_oper_state_change(struct dsa_switch_tree *dst,
+ struct net_device *conduit,
+ bool up);
+unsigned int dsa_bridge_num_get(const struct net_device *bridge_dev, int max);
+void dsa_bridge_num_put(const struct net_device *bridge_dev,
+ unsigned int bridge_num);
+struct dsa_bridge *dsa_tree_bridge_find(struct dsa_switch_tree *dst,
+ const struct net_device *br);
+
+#endif
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
deleted file mode 100644
index a1917025e155..000000000000
--- a/net/dsa/dsa2.c
+++ /dev/null
@@ -1,831 +0,0 @@
-/*
- * net/dsa/dsa2.c - Hardware switch handling, binding version 2
- * Copyright (c) 2008-2009 Marvell Semiconductor
- * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
- * Copyright (c) 2016 Andrew Lunn <andrew@lunn.ch>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/device.h>
-#include <linux/err.h>
-#include <linux/list.h>
-#include <linux/netdevice.h>
-#include <linux/slab.h>
-#include <linux/rtnetlink.h>
-#include <linux/of.h>
-#include <linux/of_net.h>
-
-#include "dsa_priv.h"
-
-static LIST_HEAD(dsa_tree_list);
-static DEFINE_MUTEX(dsa2_mutex);
-
-static const struct devlink_ops dsa_devlink_ops = {
-};
-
-static struct dsa_switch_tree *dsa_tree_find(int index)
-{
- struct dsa_switch_tree *dst;
-
- list_for_each_entry(dst, &dsa_tree_list, list)
- if (dst->index == index)
- return dst;
-
- return NULL;
-}
-
-static struct dsa_switch_tree *dsa_tree_alloc(int index)
-{
- struct dsa_switch_tree *dst;
-
- dst = kzalloc(sizeof(*dst), GFP_KERNEL);
- if (!dst)
- return NULL;
-
- dst->index = index;
-
- INIT_LIST_HEAD(&dst->list);
- list_add_tail(&dsa_tree_list, &dst->list);
-
- kref_init(&dst->refcount);
-
- return dst;
-}
-
-static void dsa_tree_free(struct dsa_switch_tree *dst)
-{
- list_del(&dst->list);
- kfree(dst);
-}
-
-static struct dsa_switch_tree *dsa_tree_get(struct dsa_switch_tree *dst)
-{
- if (dst)
- kref_get(&dst->refcount);
-
- return dst;
-}
-
-static struct dsa_switch_tree *dsa_tree_touch(int index)
-{
- struct dsa_switch_tree *dst;
-
- dst = dsa_tree_find(index);
- if (dst)
- return dsa_tree_get(dst);
- else
- return dsa_tree_alloc(index);
-}
-
-static void dsa_tree_release(struct kref *ref)
-{
- struct dsa_switch_tree *dst;
-
- dst = container_of(ref, struct dsa_switch_tree, refcount);
-
- dsa_tree_free(dst);
-}
-
-static void dsa_tree_put(struct dsa_switch_tree *dst)
-{
- if (dst)
- kref_put(&dst->refcount, dsa_tree_release);
-}
-
-static bool dsa_port_is_dsa(struct dsa_port *port)
-{
- return port->type == DSA_PORT_TYPE_DSA;
-}
-
-static bool dsa_port_is_cpu(struct dsa_port *port)
-{
- return port->type == DSA_PORT_TYPE_CPU;
-}
-
-static bool dsa_port_is_user(struct dsa_port *dp)
-{
- return dp->type == DSA_PORT_TYPE_USER;
-}
-
-static struct dsa_port *dsa_tree_find_port_by_node(struct dsa_switch_tree *dst,
- struct device_node *dn)
-{
- struct dsa_switch *ds;
- struct dsa_port *dp;
- int device, port;
-
- for (device = 0; device < DSA_MAX_SWITCHES; device++) {
- ds = dst->ds[device];
- if (!ds)
- continue;
-
- for (port = 0; port < ds->num_ports; port++) {
- dp = &ds->ports[port];
-
- if (dp->dn == dn)
- return dp;
- }
- }
-
- return NULL;
-}
-
-static bool dsa_port_setup_routing_table(struct dsa_port *dp)
-{
- struct dsa_switch *ds = dp->ds;
- struct dsa_switch_tree *dst = ds->dst;
- struct device_node *dn = dp->dn;
- struct of_phandle_iterator it;
- struct dsa_port *link_dp;
- int err;
-
- of_for_each_phandle(&it, err, dn, "link", NULL, 0) {
- link_dp = dsa_tree_find_port_by_node(dst, it.node);
- if (!link_dp) {
- of_node_put(it.node);
- return false;
- }
-
- ds->rtable[link_dp->ds->index] = dp->index;
- }
-
- return true;
-}
-
-static bool dsa_switch_setup_routing_table(struct dsa_switch *ds)
-{
- bool complete = true;
- struct dsa_port *dp;
- int i;
-
- for (i = 0; i < DSA_MAX_SWITCHES; i++)
- ds->rtable[i] = DSA_RTABLE_NONE;
-
- for (i = 0; i < ds->num_ports; i++) {
- dp = &ds->ports[i];
-
- if (dsa_port_is_dsa(dp)) {
- complete = dsa_port_setup_routing_table(dp);
- if (!complete)
- break;
- }
- }
-
- return complete;
-}
-
-static bool dsa_tree_setup_routing_table(struct dsa_switch_tree *dst)
-{
- struct dsa_switch *ds;
- bool complete = true;
- int device;
-
- for (device = 0; device < DSA_MAX_SWITCHES; device++) {
- ds = dst->ds[device];
- if (!ds)
- continue;
-
- complete = dsa_switch_setup_routing_table(ds);
- if (!complete)
- break;
- }
-
- return complete;
-}
-
-static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst)
-{
- struct dsa_switch *ds;
- struct dsa_port *dp;
- int device, port;
-
- for (device = 0; device < DSA_MAX_SWITCHES; device++) {
- ds = dst->ds[device];
- if (!ds)
- continue;
-
- for (port = 0; port < ds->num_ports; port++) {
- dp = &ds->ports[port];
-
- if (dsa_port_is_cpu(dp))
- return dp;
- }
- }
-
- return NULL;
-}
-
-static int dsa_tree_setup_default_cpu(struct dsa_switch_tree *dst)
-{
- struct dsa_switch *ds;
- struct dsa_port *dp;
- int device, port;
-
- /* DSA currently only supports a single CPU port */
- dst->cpu_dp = dsa_tree_find_first_cpu(dst);
- if (!dst->cpu_dp) {
- pr_warn("Tree has no master device\n");
- return -EINVAL;
- }
-
- /* Assign the default CPU port to all ports of the fabric */
- for (device = 0; device < DSA_MAX_SWITCHES; device++) {
- ds = dst->ds[device];
- if (!ds)
- continue;
-
- for (port = 0; port < ds->num_ports; port++) {
- dp = &ds->ports[port];
-
- if (dsa_port_is_user(dp) || dsa_port_is_dsa(dp))
- dp->cpu_dp = dst->cpu_dp;
- }
- }
-
- return 0;
-}
-
-static void dsa_tree_teardown_default_cpu(struct dsa_switch_tree *dst)
-{
- /* DSA currently only supports a single CPU port */
- dst->cpu_dp = NULL;
-}
-
-static int dsa_port_setup(struct dsa_port *dp)
-{
- struct dsa_switch *ds = dp->ds;
- int err = 0;
-
- memset(&dp->devlink_port, 0, sizeof(dp->devlink_port));
-
- if (dp->type != DSA_PORT_TYPE_UNUSED)
- err = devlink_port_register(ds->devlink, &dp->devlink_port,
- dp->index);
- if (err)
- return err;
-
- switch (dp->type) {
- case DSA_PORT_TYPE_UNUSED:
- break;
- case DSA_PORT_TYPE_CPU:
- /* dp->index is used now as port_number. However
- * CPU ports should have separate numbering
- * independent from front panel port numbers.
- */
- devlink_port_attrs_set(&dp->devlink_port,
- DEVLINK_PORT_FLAVOUR_CPU,
- dp->index, false, 0);
- err = dsa_port_link_register_of(dp);
- if (err) {
- dev_err(ds->dev, "failed to setup link for port %d.%d\n",
- ds->index, dp->index);
- return err;
- }
- break;
- case DSA_PORT_TYPE_DSA:
- /* dp->index is used now as port_number. However
- * DSA ports should have separate numbering
- * independent from front panel port numbers.
- */
- devlink_port_attrs_set(&dp->devlink_port,
- DEVLINK_PORT_FLAVOUR_DSA,
- dp->index, false, 0);
- err = dsa_port_link_register_of(dp);
- if (err) {
- dev_err(ds->dev, "failed to setup link for port %d.%d\n",
- ds->index, dp->index);
- return err;
- }
- break;
- case DSA_PORT_TYPE_USER:
- devlink_port_attrs_set(&dp->devlink_port,
- DEVLINK_PORT_FLAVOUR_PHYSICAL,
- dp->index, false, 0);
- err = dsa_slave_create(dp);
- if (err)
- dev_err(ds->dev, "failed to create slave for port %d.%d\n",
- ds->index, dp->index);
- else
- devlink_port_type_eth_set(&dp->devlink_port, dp->slave);
- break;
- }
-
- return 0;
-}
-
-static void dsa_port_teardown(struct dsa_port *dp)
-{
- if (dp->type != DSA_PORT_TYPE_UNUSED)
- devlink_port_unregister(&dp->devlink_port);
-
- switch (dp->type) {
- case DSA_PORT_TYPE_UNUSED:
- break;
- case DSA_PORT_TYPE_CPU:
- case DSA_PORT_TYPE_DSA:
- dsa_port_link_unregister_of(dp);
- break;
- case DSA_PORT_TYPE_USER:
- if (dp->slave) {
- dsa_slave_destroy(dp->slave);
- dp->slave = NULL;
- }
- break;
- }
-}
-
-static int dsa_switch_setup(struct dsa_switch *ds)
-{
- int err;
-
- /* Initialize ds->phys_mii_mask before registering the slave MDIO bus
- * driver and before ops->setup() has run, since the switch drivers and
- * the slave MDIO bus driver rely on these values for probing PHY
- * devices or not
- */
- ds->phys_mii_mask |= dsa_user_ports(ds);
-
- /* Add the switch to devlink before calling setup, so that setup can
- * add dpipe tables
- */
- ds->devlink = devlink_alloc(&dsa_devlink_ops, 0);
- if (!ds->devlink)
- return -ENOMEM;
-
- err = devlink_register(ds->devlink, ds->dev);
- if (err)
- return err;
-
- err = ds->ops->setup(ds);
- if (err < 0)
- return err;
-
- err = dsa_switch_register_notifier(ds);
- if (err)
- return err;
-
- if (!ds->slave_mii_bus && ds->ops->phy_read) {
- ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev);
- if (!ds->slave_mii_bus)
- return -ENOMEM;
-
- dsa_slave_mii_bus_init(ds);
-
- err = mdiobus_register(ds->slave_mii_bus);
- if (err < 0)
- return err;
- }
-
- return 0;
-}
-
-static void dsa_switch_teardown(struct dsa_switch *ds)
-{
- if (ds->slave_mii_bus && ds->ops->phy_read)
- mdiobus_unregister(ds->slave_mii_bus);
-
- dsa_switch_unregister_notifier(ds);
-
- if (ds->devlink) {
- devlink_unregister(ds->devlink);
- devlink_free(ds->devlink);
- ds->devlink = NULL;
- }
-
-}
-
-static int dsa_tree_setup_switches(struct dsa_switch_tree *dst)
-{
- struct dsa_switch *ds;
- struct dsa_port *dp;
- int device, port;
- int err;
-
- for (device = 0; device < DSA_MAX_SWITCHES; device++) {
- ds = dst->ds[device];
- if (!ds)
- continue;
-
- err = dsa_switch_setup(ds);
- if (err)
- return err;
-
- for (port = 0; port < ds->num_ports; port++) {
- dp = &ds->ports[port];
-
- err = dsa_port_setup(dp);
- if (err)
- return err;
- }
- }
-
- return 0;
-}
-
-static void dsa_tree_teardown_switches(struct dsa_switch_tree *dst)
-{
- struct dsa_switch *ds;
- struct dsa_port *dp;
- int device, port;
-
- for (device = 0; device < DSA_MAX_SWITCHES; device++) {
- ds = dst->ds[device];
- if (!ds)
- continue;
-
- for (port = 0; port < ds->num_ports; port++) {
- dp = &ds->ports[port];
-
- dsa_port_teardown(dp);
- }
-
- dsa_switch_teardown(ds);
- }
-}
-
-static int dsa_tree_setup_master(struct dsa_switch_tree *dst)
-{
- struct dsa_port *cpu_dp = dst->cpu_dp;
- struct net_device *master = cpu_dp->master;
-
- /* DSA currently supports a single pair of CPU port and master device */
- return dsa_master_setup(master, cpu_dp);
-}
-
-static void dsa_tree_teardown_master(struct dsa_switch_tree *dst)
-{
- struct dsa_port *cpu_dp = dst->cpu_dp;
- struct net_device *master = cpu_dp->master;
-
- return dsa_master_teardown(master);
-}
-
-static int dsa_tree_setup(struct dsa_switch_tree *dst)
-{
- bool complete;
- int err;
-
- if (dst->setup) {
- pr_err("DSA: tree %d already setup! Disjoint trees?\n",
- dst->index);
- return -EEXIST;
- }
-
- complete = dsa_tree_setup_routing_table(dst);
- if (!complete)
- return 0;
-
- err = dsa_tree_setup_default_cpu(dst);
- if (err)
- return err;
-
- err = dsa_tree_setup_switches(dst);
- if (err)
- return err;
-
- err = dsa_tree_setup_master(dst);
- if (err)
- return err;
-
- dst->setup = true;
-
- pr_info("DSA: tree %d setup\n", dst->index);
-
- return 0;
-}
-
-static void dsa_tree_teardown(struct dsa_switch_tree *dst)
-{
- if (!dst->setup)
- return;
-
- dsa_tree_teardown_master(dst);
-
- dsa_tree_teardown_switches(dst);
-
- dsa_tree_teardown_default_cpu(dst);
-
- pr_info("DSA: tree %d torn down\n", dst->index);
-
- dst->setup = false;
-}
-
-static void dsa_tree_remove_switch(struct dsa_switch_tree *dst,
- unsigned int index)
-{
- dsa_tree_teardown(dst);
-
- dst->ds[index] = NULL;
- dsa_tree_put(dst);
-}
-
-static int dsa_tree_add_switch(struct dsa_switch_tree *dst,
- struct dsa_switch *ds)
-{
- unsigned int index = ds->index;
- int err;
-
- if (dst->ds[index])
- return -EBUSY;
-
- dsa_tree_get(dst);
- dst->ds[index] = ds;
-
- err = dsa_tree_setup(dst);
- if (err)
- dsa_tree_remove_switch(dst, index);
-
- return err;
-}
-
-static int dsa_port_parse_user(struct dsa_port *dp, const char *name)
-{
- if (!name)
- name = "eth%d";
-
- dp->type = DSA_PORT_TYPE_USER;
- dp->name = name;
-
- return 0;
-}
-
-static int dsa_port_parse_dsa(struct dsa_port *dp)
-{
- dp->type = DSA_PORT_TYPE_DSA;
-
- return 0;
-}
-
-static int dsa_port_parse_cpu(struct dsa_port *dp, struct net_device *master)
-{
- struct dsa_switch *ds = dp->ds;
- struct dsa_switch_tree *dst = ds->dst;
- const struct dsa_device_ops *tag_ops;
- enum dsa_tag_protocol tag_protocol;
-
- tag_protocol = ds->ops->get_tag_protocol(ds, dp->index);
- tag_ops = dsa_resolve_tag_protocol(tag_protocol);
- if (IS_ERR(tag_ops)) {
- dev_warn(ds->dev, "No tagger for this switch\n");
- return PTR_ERR(tag_ops);
- }
-
- dp->type = DSA_PORT_TYPE_CPU;
- dp->rcv = tag_ops->rcv;
- dp->tag_ops = tag_ops;
- dp->master = master;
- dp->dst = dst;
-
- return 0;
-}
-
-static int dsa_port_parse_of(struct dsa_port *dp, struct device_node *dn)
-{
- struct device_node *ethernet = of_parse_phandle(dn, "ethernet", 0);
- const char *name = of_get_property(dn, "label", NULL);
- bool link = of_property_read_bool(dn, "link");
-
- dp->dn = dn;
-
- if (ethernet) {
- struct net_device *master;
-
- master = of_find_net_device_by_node(ethernet);
- if (!master)
- return -EPROBE_DEFER;
-
- return dsa_port_parse_cpu(dp, master);
- }
-
- if (link)
- return dsa_port_parse_dsa(dp);
-
- return dsa_port_parse_user(dp, name);
-}
-
-static int dsa_switch_parse_ports_of(struct dsa_switch *ds,
- struct device_node *dn)
-{
- struct device_node *ports, *port;
- struct dsa_port *dp;
- u32 reg;
- int err;
-
- ports = of_get_child_by_name(dn, "ports");
- if (!ports) {
- dev_err(ds->dev, "no ports child node found\n");
- return -EINVAL;
- }
-
- for_each_available_child_of_node(ports, port) {
- err = of_property_read_u32(port, "reg", &reg);
- if (err)
- return err;
-
- if (reg >= ds->num_ports)
- return -EINVAL;
-
- dp = &ds->ports[reg];
-
- err = dsa_port_parse_of(dp, port);
- if (err)
- return err;
- }
-
- return 0;
-}
-
-static int dsa_switch_parse_member_of(struct dsa_switch *ds,
- struct device_node *dn)
-{
- u32 m[2] = { 0, 0 };
- int sz;
-
- /* Don't error out if this optional property isn't found */
- sz = of_property_read_variable_u32_array(dn, "dsa,member", m, 2, 2);
- if (sz < 0 && sz != -EINVAL)
- return sz;
-
- ds->index = m[1];
- if (ds->index >= DSA_MAX_SWITCHES)
- return -EINVAL;
-
- ds->dst = dsa_tree_touch(m[0]);
- if (!ds->dst)
- return -ENOMEM;
-
- return 0;
-}
-
-static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn)
-{
- int err;
-
- err = dsa_switch_parse_member_of(ds, dn);
- if (err)
- return err;
-
- return dsa_switch_parse_ports_of(ds, dn);
-}
-
-static int dsa_port_parse(struct dsa_port *dp, const char *name,
- struct device *dev)
-{
- if (!strcmp(name, "cpu")) {
- struct net_device *master;
-
- master = dsa_dev_to_net_device(dev);
- if (!master)
- return -EPROBE_DEFER;
-
- dev_put(master);
-
- return dsa_port_parse_cpu(dp, master);
- }
-
- if (!strcmp(name, "dsa"))
- return dsa_port_parse_dsa(dp);
-
- return dsa_port_parse_user(dp, name);
-}
-
-static int dsa_switch_parse_ports(struct dsa_switch *ds,
- struct dsa_chip_data *cd)
-{
- bool valid_name_found = false;
- struct dsa_port *dp;
- struct device *dev;
- const char *name;
- unsigned int i;
- int err;
-
- for (i = 0; i < DSA_MAX_PORTS; i++) {
- name = cd->port_names[i];
- dev = cd->netdev[i];
- dp = &ds->ports[i];
-
- if (!name)
- continue;
-
- err = dsa_port_parse(dp, name, dev);
- if (err)
- return err;
-
- valid_name_found = true;
- }
-
- if (!valid_name_found && i == DSA_MAX_PORTS)
- return -EINVAL;
-
- return 0;
-}
-
-static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd)
-{
- ds->cd = cd;
-
- /* We don't support interconnected switches nor multiple trees via
- * platform data, so this is the unique switch of the tree.
- */
- ds->index = 0;
- ds->dst = dsa_tree_touch(0);
- if (!ds->dst)
- return -ENOMEM;
-
- return dsa_switch_parse_ports(ds, cd);
-}
-
-static int dsa_switch_add(struct dsa_switch *ds)
-{
- struct dsa_switch_tree *dst = ds->dst;
-
- return dsa_tree_add_switch(dst, ds);
-}
-
-static int dsa_switch_probe(struct dsa_switch *ds)
-{
- struct dsa_chip_data *pdata = ds->dev->platform_data;
- struct device_node *np = ds->dev->of_node;
- int err;
-
- if (np)
- err = dsa_switch_parse_of(ds, np);
- else if (pdata)
- err = dsa_switch_parse(ds, pdata);
- else
- err = -ENODEV;
-
- if (err)
- return err;
-
- return dsa_switch_add(ds);
-}
-
-struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
-{
- size_t size = sizeof(struct dsa_switch) + n * sizeof(struct dsa_port);
- struct dsa_switch *ds;
- int i;
-
- ds = devm_kzalloc(dev, size, GFP_KERNEL);
- if (!ds)
- return NULL;
-
- /* We avoid allocating memory outside dsa_switch
- * if it is not needed.
- */
- if (n <= sizeof(ds->_bitmap) * 8) {
- ds->bitmap = &ds->_bitmap;
- } else {
- ds->bitmap = devm_kcalloc(dev,
- BITS_TO_LONGS(n),
- sizeof(unsigned long),
- GFP_KERNEL);
- if (unlikely(!ds->bitmap))
- return NULL;
- }
-
- ds->dev = dev;
- ds->num_ports = n;
-
- for (i = 0; i < ds->num_ports; ++i) {
- ds->ports[i].index = i;
- ds->ports[i].ds = ds;
- }
-
- return ds;
-}
-EXPORT_SYMBOL_GPL(dsa_switch_alloc);
-
-int dsa_register_switch(struct dsa_switch *ds)
-{
- int err;
-
- mutex_lock(&dsa2_mutex);
- err = dsa_switch_probe(ds);
- dsa_tree_put(ds->dst);
- mutex_unlock(&dsa2_mutex);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(dsa_register_switch);
-
-static void dsa_switch_remove(struct dsa_switch *ds)
-{
- struct dsa_switch_tree *dst = ds->dst;
- unsigned int index = ds->index;
-
- dsa_tree_remove_switch(dst, index);
-}
-
-void dsa_unregister_switch(struct dsa_switch *ds)
-{
- mutex_lock(&dsa2_mutex);
- dsa_switch_remove(ds);
- mutex_unlock(&dsa2_mutex);
-}
-EXPORT_SYMBOL_GPL(dsa_unregister_switch);
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
deleted file mode 100644
index 3964c6f7a7c0..000000000000
--- a/net/dsa/dsa_priv.h
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * net/dsa/dsa_priv.h - Hardware switch handling
- * Copyright (c) 2008-2009 Marvell Semiconductor
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#ifndef __DSA_PRIV_H
-#define __DSA_PRIV_H
-
-#include <linux/phy.h>
-#include <linux/netdevice.h>
-#include <linux/netpoll.h>
-#include <net/dsa.h>
-
-enum {
- DSA_NOTIFIER_AGEING_TIME,
- DSA_NOTIFIER_BRIDGE_JOIN,
- DSA_NOTIFIER_BRIDGE_LEAVE,
- DSA_NOTIFIER_FDB_ADD,
- DSA_NOTIFIER_FDB_DEL,
- DSA_NOTIFIER_MDB_ADD,
- DSA_NOTIFIER_MDB_DEL,
- DSA_NOTIFIER_VLAN_ADD,
- DSA_NOTIFIER_VLAN_DEL,
-};
-
-/* DSA_NOTIFIER_AGEING_TIME */
-struct dsa_notifier_ageing_time_info {
- struct switchdev_trans *trans;
- unsigned int ageing_time;
-};
-
-/* DSA_NOTIFIER_BRIDGE_* */
-struct dsa_notifier_bridge_info {
- struct net_device *br;
- int sw_index;
- int port;
-};
-
-/* DSA_NOTIFIER_FDB_* */
-struct dsa_notifier_fdb_info {
- int sw_index;
- int port;
- const unsigned char *addr;
- u16 vid;
-};
-
-/* DSA_NOTIFIER_MDB_* */
-struct dsa_notifier_mdb_info {
- const struct switchdev_obj_port_mdb *mdb;
- struct switchdev_trans *trans;
- int sw_index;
- int port;
-};
-
-/* DSA_NOTIFIER_VLAN_* */
-struct dsa_notifier_vlan_info {
- const struct switchdev_obj_port_vlan *vlan;
- struct switchdev_trans *trans;
- int sw_index;
- int port;
-};
-
-struct dsa_slave_priv {
- /* Copy of CPU port xmit for faster access in slave transmit hot path */
- struct sk_buff * (*xmit)(struct sk_buff *skb,
- struct net_device *dev);
-
- struct pcpu_sw_netstats *stats64;
-
- /* DSA port data, such as switch, port index, etc. */
- struct dsa_port *dp;
-
-#ifdef CONFIG_NET_POLL_CONTROLLER
- struct netpoll *netpoll;
-#endif
-
- /* TC context */
- struct list_head mall_tc_list;
-};
-
-/* dsa.c */
-const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol);
-bool dsa_schedule_work(struct work_struct *work);
-
-/* legacy.c */
-#if IS_ENABLED(CONFIG_NET_DSA_LEGACY)
-int dsa_legacy_register(void);
-void dsa_legacy_unregister(void);
-#else
-static inline int dsa_legacy_register(void)
-{
- return 0;
-}
-
-static inline void dsa_legacy_unregister(void) { }
-#endif
-int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
- struct net_device *dev,
- const unsigned char *addr, u16 vid,
- u16 flags);
-int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
- struct net_device *dev,
- const unsigned char *addr, u16 vid);
-
-/* master.c */
-int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp);
-void dsa_master_teardown(struct net_device *dev);
-
-static inline struct net_device *dsa_master_find_slave(struct net_device *dev,
- int device, int port)
-{
- struct dsa_port *cpu_dp = dev->dsa_ptr;
- struct dsa_switch_tree *dst = cpu_dp->dst;
- struct dsa_switch *ds;
- struct dsa_port *slave_port;
-
- if (device < 0 || device >= DSA_MAX_SWITCHES)
- return NULL;
-
- ds = dst->ds[device];
- if (!ds)
- return NULL;
-
- if (port < 0 || port >= ds->num_ports)
- return NULL;
-
- slave_port = &ds->ports[port];
-
- if (unlikely(slave_port->type != DSA_PORT_TYPE_USER))
- return NULL;
-
- return slave_port->slave;
-}
-
-/* port.c */
-int dsa_port_set_state(struct dsa_port *dp, u8 state,
- struct switchdev_trans *trans);
-int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy);
-void dsa_port_disable(struct dsa_port *dp, struct phy_device *phy);
-int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br);
-void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br);
-int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
- struct switchdev_trans *trans);
-int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock,
- struct switchdev_trans *trans);
-int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
- u16 vid);
-int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
- u16 vid);
-int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data);
-int dsa_port_mdb_add(const struct dsa_port *dp,
- const struct switchdev_obj_port_mdb *mdb,
- struct switchdev_trans *trans);
-int dsa_port_mdb_del(const struct dsa_port *dp,
- const struct switchdev_obj_port_mdb *mdb);
-int dsa_port_vlan_add(struct dsa_port *dp,
- const struct switchdev_obj_port_vlan *vlan,
- struct switchdev_trans *trans);
-int dsa_port_vlan_del(struct dsa_port *dp,
- const struct switchdev_obj_port_vlan *vlan);
-int dsa_port_link_register_of(struct dsa_port *dp);
-void dsa_port_link_unregister_of(struct dsa_port *dp);
-
-/* slave.c */
-extern const struct dsa_device_ops notag_netdev_ops;
-void dsa_slave_mii_bus_init(struct dsa_switch *ds);
-int dsa_slave_create(struct dsa_port *dp);
-void dsa_slave_destroy(struct net_device *slave_dev);
-int dsa_slave_suspend(struct net_device *slave_dev);
-int dsa_slave_resume(struct net_device *slave_dev);
-int dsa_slave_register_notifier(void);
-void dsa_slave_unregister_notifier(void);
-
-static inline struct dsa_port *dsa_slave_to_port(const struct net_device *dev)
-{
- struct dsa_slave_priv *p = netdev_priv(dev);
-
- return p->dp;
-}
-
-static inline struct net_device *
-dsa_slave_to_master(const struct net_device *dev)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
-
- return dp->cpu_dp->master;
-}
-
-/* switch.c */
-int dsa_switch_register_notifier(struct dsa_switch *ds);
-void dsa_switch_unregister_notifier(struct dsa_switch *ds);
-
-/* tag_brcm.c */
-extern const struct dsa_device_ops brcm_netdev_ops;
-extern const struct dsa_device_ops brcm_prepend_netdev_ops;
-
-/* tag_dsa.c */
-extern const struct dsa_device_ops dsa_netdev_ops;
-
-/* tag_edsa.c */
-extern const struct dsa_device_ops edsa_netdev_ops;
-
-/* tag_ksz.c */
-extern const struct dsa_device_ops ksz_netdev_ops;
-
-/* tag_lan9303.c */
-extern const struct dsa_device_ops lan9303_netdev_ops;
-
-/* tag_mtk.c */
-extern const struct dsa_device_ops mtk_netdev_ops;
-
-/* tag_qca.c */
-extern const struct dsa_device_ops qca_netdev_ops;
-
-/* tag_trailer.c */
-extern const struct dsa_device_ops trailer_netdev_ops;
-
-#endif
diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c
deleted file mode 100644
index 42a7b85b84e1..000000000000
--- a/net/dsa/legacy.c
+++ /dev/null
@@ -1,748 +0,0 @@
-/*
- * net/dsa/legacy.c - Hardware switch handling
- * Copyright (c) 2008-2009 Marvell Semiconductor
- * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/device.h>
-#include <linux/list.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/of.h>
-#include <linux/of_mdio.h>
-#include <linux/of_platform.h>
-#include <linux/of_net.h>
-#include <linux/netdevice.h>
-#include <linux/sysfs.h>
-#include <linux/phy_fixed.h>
-#include <linux/etherdevice.h>
-
-#include "dsa_priv.h"
-
-/* switch driver registration ***********************************************/
-static DEFINE_MUTEX(dsa_switch_drivers_mutex);
-static LIST_HEAD(dsa_switch_drivers);
-
-void register_switch_driver(struct dsa_switch_driver *drv)
-{
- mutex_lock(&dsa_switch_drivers_mutex);
- list_add_tail(&drv->list, &dsa_switch_drivers);
- mutex_unlock(&dsa_switch_drivers_mutex);
-}
-EXPORT_SYMBOL_GPL(register_switch_driver);
-
-void unregister_switch_driver(struct dsa_switch_driver *drv)
-{
- mutex_lock(&dsa_switch_drivers_mutex);
- list_del_init(&drv->list);
- mutex_unlock(&dsa_switch_drivers_mutex);
-}
-EXPORT_SYMBOL_GPL(unregister_switch_driver);
-
-static const struct dsa_switch_ops *
-dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr,
- const char **_name, void **priv)
-{
- const struct dsa_switch_ops *ret;
- struct list_head *list;
- const char *name;
-
- ret = NULL;
- name = NULL;
-
- mutex_lock(&dsa_switch_drivers_mutex);
- list_for_each(list, &dsa_switch_drivers) {
- const struct dsa_switch_ops *ops;
- struct dsa_switch_driver *drv;
-
- drv = list_entry(list, struct dsa_switch_driver, list);
- ops = drv->ops;
-
- name = ops->probe(parent, host_dev, sw_addr, priv);
- if (name != NULL) {
- ret = ops;
- break;
- }
- }
- mutex_unlock(&dsa_switch_drivers_mutex);
-
- *_name = name;
-
- return ret;
-}
-
-/* basic switch operations **************************************************/
-static int dsa_cpu_dsa_setups(struct dsa_switch *ds)
-{
- int ret, port;
-
- for (port = 0; port < ds->num_ports; port++) {
- if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
- continue;
-
- ret = dsa_port_link_register_of(&ds->ports[port]);
- if (ret)
- return ret;
- }
- return 0;
-}
-
-static int dsa_switch_setup_one(struct dsa_switch *ds,
- struct net_device *master)
-{
- const struct dsa_switch_ops *ops = ds->ops;
- struct dsa_switch_tree *dst = ds->dst;
- struct dsa_chip_data *cd = ds->cd;
- bool valid_name_found = false;
- int index = ds->index;
- struct dsa_port *dp;
- int i, ret;
-
- /*
- * Validate supplied switch configuration.
- */
- for (i = 0; i < ds->num_ports; i++) {
- char *name;
-
- dp = &ds->ports[i];
-
- name = cd->port_names[i];
- if (name == NULL)
- continue;
- dp->name = name;
-
- if (!strcmp(name, "cpu")) {
- if (dst->cpu_dp) {
- netdev_err(master,
- "multiple cpu ports?!\n");
- return -EINVAL;
- }
- dst->cpu_dp = &ds->ports[i];
- dst->cpu_dp->master = master;
- dp->type = DSA_PORT_TYPE_CPU;
- } else if (!strcmp(name, "dsa")) {
- dp->type = DSA_PORT_TYPE_DSA;
- } else {
- dp->type = DSA_PORT_TYPE_USER;
- }
- valid_name_found = true;
- }
-
- if (!valid_name_found && i == ds->num_ports)
- return -EINVAL;
-
- /* Make the built-in MII bus mask match the number of ports,
- * switch drivers can override this later
- */
- ds->phys_mii_mask |= dsa_user_ports(ds);
-
- /*
- * If the CPU connects to this switch, set the switch tree
- * tagging protocol to the preferred tagging format of this
- * switch.
- */
- if (dst->cpu_dp->ds == ds) {
- const struct dsa_device_ops *tag_ops;
- enum dsa_tag_protocol tag_protocol;
-
- tag_protocol = ops->get_tag_protocol(ds, dst->cpu_dp->index);
- tag_ops = dsa_resolve_tag_protocol(tag_protocol);
- if (IS_ERR(tag_ops))
- return PTR_ERR(tag_ops);
-
- dst->cpu_dp->tag_ops = tag_ops;
-
- /* Few copies for faster access in master receive hot path */
- dst->cpu_dp->rcv = dst->cpu_dp->tag_ops->rcv;
- dst->cpu_dp->dst = dst;
- }
-
- memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable));
-
- /*
- * Do basic register setup.
- */
- ret = ops->setup(ds);
- if (ret < 0)
- return ret;
-
- ret = dsa_switch_register_notifier(ds);
- if (ret)
- return ret;
-
- if (!ds->slave_mii_bus && ops->phy_read) {
- ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev);
- if (!ds->slave_mii_bus)
- return -ENOMEM;
- dsa_slave_mii_bus_init(ds);
-
- ret = mdiobus_register(ds->slave_mii_bus);
- if (ret < 0)
- return ret;
- }
-
- /*
- * Create network devices for physical switch ports.
- */
- for (i = 0; i < ds->num_ports; i++) {
- ds->ports[i].dn = cd->port_dn[i];
- ds->ports[i].cpu_dp = dst->cpu_dp;
-
- if (!dsa_is_user_port(ds, i))
- continue;
-
- ret = dsa_slave_create(&ds->ports[i]);
- if (ret < 0)
- netdev_err(master, "[%d]: can't create dsa slave device for port %d(%s): %d\n",
- index, i, cd->port_names[i], ret);
- }
-
- /* Perform configuration of the CPU and DSA ports */
- ret = dsa_cpu_dsa_setups(ds);
- if (ret < 0)
- netdev_err(master, "[%d] : can't configure CPU and DSA ports\n",
- index);
-
- return 0;
-}
-
-static struct dsa_switch *
-dsa_switch_setup(struct dsa_switch_tree *dst, struct net_device *master,
- int index, struct device *parent, struct device *host_dev)
-{
- struct dsa_chip_data *cd = dst->pd->chip + index;
- const struct dsa_switch_ops *ops;
- struct dsa_switch *ds;
- int ret;
- const char *name;
- void *priv;
-
- /*
- * Probe for switch model.
- */
- ops = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv);
- if (!ops) {
- netdev_err(master, "[%d]: could not detect attached switch\n",
- index);
- return ERR_PTR(-EINVAL);
- }
- netdev_info(master, "[%d]: detected a %s switch\n",
- index, name);
-
-
- /*
- * Allocate and initialise switch state.
- */
- ds = dsa_switch_alloc(parent, DSA_MAX_PORTS);
- if (!ds)
- return ERR_PTR(-ENOMEM);
-
- ds->dst = dst;
- ds->index = index;
- ds->cd = cd;
- ds->ops = ops;
- ds->priv = priv;
-
- ret = dsa_switch_setup_one(ds, master);
- if (ret)
- return ERR_PTR(ret);
-
- return ds;
-}
-
-static void dsa_switch_destroy(struct dsa_switch *ds)
-{
- int port;
-
- /* Destroy network devices for physical switch ports. */
- for (port = 0; port < ds->num_ports; port++) {
- if (!dsa_is_user_port(ds, port))
- continue;
-
- if (!ds->ports[port].slave)
- continue;
-
- dsa_slave_destroy(ds->ports[port].slave);
- }
-
- /* Disable configuration of the CPU and DSA ports */
- for (port = 0; port < ds->num_ports; port++) {
- if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
- continue;
- dsa_port_link_unregister_of(&ds->ports[port]);
- }
-
- if (ds->slave_mii_bus && ds->ops->phy_read)
- mdiobus_unregister(ds->slave_mii_bus);
-
- dsa_switch_unregister_notifier(ds);
-}
-
-/* platform driver init and cleanup *****************************************/
-static int dev_is_class(struct device *dev, void *class)
-{
- if (dev->class != NULL && !strcmp(dev->class->name, class))
- return 1;
-
- return 0;
-}
-
-static struct device *dev_find_class(struct device *parent, char *class)
-{
- if (dev_is_class(parent, class)) {
- get_device(parent);
- return parent;
- }
-
- return device_find_child(parent, class, dev_is_class);
-}
-
-struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev)
-{
- struct device *d;
-
- d = dev_find_class(dev, "mdio_bus");
- if (d != NULL) {
- struct mii_bus *bus;
-
- bus = to_mii_bus(d);
- put_device(d);
-
- return bus;
- }
-
- return NULL;
-}
-EXPORT_SYMBOL_GPL(dsa_host_dev_to_mii_bus);
-
-#ifdef CONFIG_OF
-static int dsa_of_setup_routing_table(struct dsa_platform_data *pd,
- struct dsa_chip_data *cd,
- int chip_index, int port_index,
- struct device_node *link)
-{
- const __be32 *reg;
- int link_sw_addr;
- struct device_node *parent_sw;
- int len;
-
- parent_sw = of_get_parent(link);
- if (!parent_sw)
- return -EINVAL;
-
- reg = of_get_property(parent_sw, "reg", &len);
- if (!reg || (len != sizeof(*reg) * 2))
- return -EINVAL;
-
- /*
- * Get the destination switch number from the second field of its 'reg'
- * property, i.e. for "reg = <0x19 1>" sw_addr is '1'.
- */
- link_sw_addr = be32_to_cpup(reg + 1);
-
- if (link_sw_addr >= pd->nr_chips)
- return -EINVAL;
-
- cd->rtable[link_sw_addr] = port_index;
-
- return 0;
-}
-
-static int dsa_of_probe_links(struct dsa_platform_data *pd,
- struct dsa_chip_data *cd,
- int chip_index, int port_index,
- struct device_node *port,
- const char *port_name)
-{
- struct device_node *link;
- int link_index;
- int ret;
-
- for (link_index = 0;; link_index++) {
- link = of_parse_phandle(port, "link", link_index);
- if (!link)
- break;
-
- if (!strcmp(port_name, "dsa") && pd->nr_chips > 1) {
- ret = dsa_of_setup_routing_table(pd, cd, chip_index,
- port_index, link);
- if (ret)
- return ret;
- }
- }
- return 0;
-}
-
-static void dsa_of_free_platform_data(struct dsa_platform_data *pd)
-{
- int i;
- int port_index;
-
- for (i = 0; i < pd->nr_chips; i++) {
- port_index = 0;
- while (port_index < DSA_MAX_PORTS) {
- kfree(pd->chip[i].port_names[port_index]);
- port_index++;
- }
-
- /* Drop our reference to the MDIO bus device */
- if (pd->chip[i].host_dev)
- put_device(pd->chip[i].host_dev);
- }
- kfree(pd->chip);
-}
-
-static int dsa_of_probe(struct device *dev)
-{
- struct device_node *np = dev->of_node;
- struct device_node *child, *mdio, *ethernet, *port;
- struct mii_bus *mdio_bus, *mdio_bus_switch;
- struct net_device *ethernet_dev;
- struct dsa_platform_data *pd;
- struct dsa_chip_data *cd;
- const char *port_name;
- int chip_index, port_index;
- const unsigned int *sw_addr, *port_reg;
- u32 eeprom_len;
- int ret;
-
- mdio = of_parse_phandle(np, "dsa,mii-bus", 0);
- if (!mdio)
- return -EINVAL;
-
- mdio_bus = of_mdio_find_bus(mdio);
- if (!mdio_bus)
- return -EPROBE_DEFER;
-
- ethernet = of_parse_phandle(np, "dsa,ethernet", 0);
- if (!ethernet) {
- ret = -EINVAL;
- goto out_put_mdio;
- }
-
- ethernet_dev = of_find_net_device_by_node(ethernet);
- if (!ethernet_dev) {
- ret = -EPROBE_DEFER;
- goto out_put_mdio;
- }
-
- pd = kzalloc(sizeof(*pd), GFP_KERNEL);
- if (!pd) {
- ret = -ENOMEM;
- goto out_put_ethernet;
- }
-
- dev->platform_data = pd;
- pd->of_netdev = ethernet_dev;
- pd->nr_chips = of_get_available_child_count(np);
- if (pd->nr_chips > DSA_MAX_SWITCHES)
- pd->nr_chips = DSA_MAX_SWITCHES;
-
- pd->chip = kcalloc(pd->nr_chips, sizeof(struct dsa_chip_data),
- GFP_KERNEL);
- if (!pd->chip) {
- ret = -ENOMEM;
- goto out_free;
- }
-
- chip_index = -1;
- for_each_available_child_of_node(np, child) {
- int i;
-
- chip_index++;
- cd = &pd->chip[chip_index];
-
- cd->of_node = child;
-
- /* Initialize the routing table */
- for (i = 0; i < DSA_MAX_SWITCHES; ++i)
- cd->rtable[i] = DSA_RTABLE_NONE;
-
- /* When assigning the host device, increment its refcount */
- cd->host_dev = get_device(&mdio_bus->dev);
-
- sw_addr = of_get_property(child, "reg", NULL);
- if (!sw_addr)
- continue;
-
- cd->sw_addr = be32_to_cpup(sw_addr);
- if (cd->sw_addr >= PHY_MAX_ADDR)
- continue;
-
- if (!of_property_read_u32(child, "eeprom-length", &eeprom_len))
- cd->eeprom_len = eeprom_len;
-
- mdio = of_parse_phandle(child, "mii-bus", 0);
- if (mdio) {
- mdio_bus_switch = of_mdio_find_bus(mdio);
- if (!mdio_bus_switch) {
- ret = -EPROBE_DEFER;
- goto out_free_chip;
- }
-
- /* Drop the mdio_bus device ref, replacing the host
- * device with the mdio_bus_switch device, keeping
- * the refcount from of_mdio_find_bus() above.
- */
- put_device(cd->host_dev);
- cd->host_dev = &mdio_bus_switch->dev;
- }
-
- for_each_available_child_of_node(child, port) {
- port_reg = of_get_property(port, "reg", NULL);
- if (!port_reg)
- continue;
-
- port_index = be32_to_cpup(port_reg);
- if (port_index >= DSA_MAX_PORTS)
- break;
-
- port_name = of_get_property(port, "label", NULL);
- if (!port_name)
- continue;
-
- cd->port_dn[port_index] = port;
-
- cd->port_names[port_index] = kstrdup(port_name,
- GFP_KERNEL);
- if (!cd->port_names[port_index]) {
- ret = -ENOMEM;
- goto out_free_chip;
- }
-
- ret = dsa_of_probe_links(pd, cd, chip_index,
- port_index, port, port_name);
- if (ret)
- goto out_free_chip;
-
- }
- }
-
- /* The individual chips hold their own refcount on the mdio bus,
- * so drop ours */
- put_device(&mdio_bus->dev);
-
- return 0;
-
-out_free_chip:
- dsa_of_free_platform_data(pd);
-out_free:
- kfree(pd);
- dev->platform_data = NULL;
-out_put_ethernet:
- put_device(&ethernet_dev->dev);
-out_put_mdio:
- put_device(&mdio_bus->dev);
- return ret;
-}
-
-static void dsa_of_remove(struct device *dev)
-{
- struct dsa_platform_data *pd = dev->platform_data;
-
- if (!dev->of_node)
- return;
-
- dsa_of_free_platform_data(pd);
- put_device(&pd->of_netdev->dev);
- kfree(pd);
-}
-#else
-static inline int dsa_of_probe(struct device *dev)
-{
- return 0;
-}
-
-static inline void dsa_of_remove(struct device *dev)
-{
-}
-#endif
-
-static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,
- struct device *parent, struct dsa_platform_data *pd)
-{
- int i;
- unsigned configured = 0;
-
- dst->pd = pd;
-
- for (i = 0; i < pd->nr_chips; i++) {
- struct dsa_switch *ds;
-
- ds = dsa_switch_setup(dst, dev, i, parent, pd->chip[i].host_dev);
- if (IS_ERR(ds)) {
- netdev_err(dev, "[%d]: couldn't create dsa switch instance (error %ld)\n",
- i, PTR_ERR(ds));
- continue;
- }
-
- dst->ds[i] = ds;
-
- ++configured;
- }
-
- /*
- * If no switch was found, exit cleanly
- */
- if (!configured)
- return -EPROBE_DEFER;
-
- return dsa_master_setup(dst->cpu_dp->master, dst->cpu_dp);
-}
-
-static int dsa_probe(struct platform_device *pdev)
-{
- struct dsa_platform_data *pd = pdev->dev.platform_data;
- struct net_device *dev;
- struct dsa_switch_tree *dst;
- int ret;
-
- if (pdev->dev.of_node) {
- ret = dsa_of_probe(&pdev->dev);
- if (ret)
- return ret;
-
- pd = pdev->dev.platform_data;
- }
-
- if (pd == NULL || (pd->netdev == NULL && pd->of_netdev == NULL))
- return -EINVAL;
-
- if (pd->of_netdev) {
- dev = pd->of_netdev;
- dev_hold(dev);
- } else {
- dev = dsa_dev_to_net_device(pd->netdev);
- }
- if (dev == NULL) {
- ret = -EPROBE_DEFER;
- goto out;
- }
-
- if (dev->dsa_ptr != NULL) {
- dev_put(dev);
- ret = -EEXIST;
- goto out;
- }
-
- dst = devm_kzalloc(&pdev->dev, sizeof(*dst), GFP_KERNEL);
- if (dst == NULL) {
- dev_put(dev);
- ret = -ENOMEM;
- goto out;
- }
-
- platform_set_drvdata(pdev, dst);
-
- ret = dsa_setup_dst(dst, dev, &pdev->dev, pd);
- if (ret) {
- dev_put(dev);
- goto out;
- }
-
- return 0;
-
-out:
- dsa_of_remove(&pdev->dev);
-
- return ret;
-}
-
-static void dsa_remove_dst(struct dsa_switch_tree *dst)
-{
- int i;
-
- dsa_master_teardown(dst->cpu_dp->master);
-
- for (i = 0; i < dst->pd->nr_chips; i++) {
- struct dsa_switch *ds = dst->ds[i];
-
- if (ds)
- dsa_switch_destroy(ds);
- }
-
- dev_put(dst->cpu_dp->master);
-}
-
-static int dsa_remove(struct platform_device *pdev)
-{
- struct dsa_switch_tree *dst = platform_get_drvdata(pdev);
-
- dsa_remove_dst(dst);
- dsa_of_remove(&pdev->dev);
-
- return 0;
-}
-
-static void dsa_shutdown(struct platform_device *pdev)
-{
-}
-
-#ifdef CONFIG_PM_SLEEP
-static int dsa_suspend(struct device *d)
-{
- struct platform_device *pdev = to_platform_device(d);
- struct dsa_switch_tree *dst = platform_get_drvdata(pdev);
- int i, ret = 0;
-
- for (i = 0; i < dst->pd->nr_chips; i++) {
- struct dsa_switch *ds = dst->ds[i];
-
- if (ds != NULL)
- ret = dsa_switch_suspend(ds);
- }
-
- return ret;
-}
-
-static int dsa_resume(struct device *d)
-{
- struct platform_device *pdev = to_platform_device(d);
- struct dsa_switch_tree *dst = platform_get_drvdata(pdev);
- int i, ret = 0;
-
- for (i = 0; i < dst->pd->nr_chips; i++) {
- struct dsa_switch *ds = dst->ds[i];
-
- if (ds != NULL)
- ret = dsa_switch_resume(ds);
- }
-
- return ret;
-}
-#endif
-
-static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume);
-
-static const struct of_device_id dsa_of_match_table[] = {
- { .compatible = "marvell,dsa", },
- {}
-};
-MODULE_DEVICE_TABLE(of, dsa_of_match_table);
-
-static struct platform_driver dsa_driver = {
- .probe = dsa_probe,
- .remove = dsa_remove,
- .shutdown = dsa_shutdown,
- .driver = {
- .name = "dsa",
- .of_match_table = dsa_of_match_table,
- .pm = &dsa_pm_ops,
- },
-};
-
-int dsa_legacy_register(void)
-{
- return platform_driver_register(&dsa_driver);
-}
-
-void dsa_legacy_unregister(void)
-{
- platform_driver_unregister(&dsa_driver);
-}
diff --git a/net/dsa/master.c b/net/dsa/master.c
deleted file mode 100644
index c90ee3227dea..000000000000
--- a/net/dsa/master.c
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Handling of a master device, switching frames via its switch fabric CPU port
- *
- * Copyright (c) 2017 Savoir-faire Linux Inc.
- * Vivien Didelot <vivien.didelot@savoirfairelinux.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include "dsa_priv.h"
-
-static void dsa_master_get_ethtool_stats(struct net_device *dev,
- struct ethtool_stats *stats,
- uint64_t *data)
-{
- struct dsa_port *cpu_dp = dev->dsa_ptr;
- const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
- struct dsa_switch *ds = cpu_dp->ds;
- int port = cpu_dp->index;
- int count = 0;
-
- if (ops->get_sset_count && ops->get_ethtool_stats) {
- count = ops->get_sset_count(dev, ETH_SS_STATS);
- ops->get_ethtool_stats(dev, stats, data);
- }
-
- if (ds->ops->get_ethtool_stats)
- ds->ops->get_ethtool_stats(ds, port, data + count);
-}
-
-static void dsa_master_get_ethtool_phy_stats(struct net_device *dev,
- struct ethtool_stats *stats,
- uint64_t *data)
-{
- struct dsa_port *cpu_dp = dev->dsa_ptr;
- const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
- struct dsa_switch *ds = cpu_dp->ds;
- int port = cpu_dp->index;
- int count = 0;
-
- if (dev->phydev && !ops->get_ethtool_phy_stats) {
- count = phy_ethtool_get_sset_count(dev->phydev);
- if (count >= 0)
- phy_ethtool_get_stats(dev->phydev, stats, data);
- } else if (ops->get_sset_count && ops->get_ethtool_phy_stats) {
- count = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
- ops->get_ethtool_phy_stats(dev, stats, data);
- }
-
- if (count < 0)
- count = 0;
-
- if (ds->ops->get_ethtool_phy_stats)
- ds->ops->get_ethtool_phy_stats(ds, port, data + count);
-}
-
-static int dsa_master_get_sset_count(struct net_device *dev, int sset)
-{
- struct dsa_port *cpu_dp = dev->dsa_ptr;
- const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
- struct dsa_switch *ds = cpu_dp->ds;
- int count = 0;
-
- if (sset == ETH_SS_PHY_STATS && dev->phydev &&
- !ops->get_ethtool_phy_stats)
- count = phy_ethtool_get_sset_count(dev->phydev);
- else if (ops->get_sset_count)
- count = ops->get_sset_count(dev, sset);
-
- if (count < 0)
- count = 0;
-
- if (ds->ops->get_sset_count)
- count += ds->ops->get_sset_count(ds, cpu_dp->index, sset);
-
- return count;
-}
-
-static void dsa_master_get_strings(struct net_device *dev, uint32_t stringset,
- uint8_t *data)
-{
- struct dsa_port *cpu_dp = dev->dsa_ptr;
- const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops;
- struct dsa_switch *ds = cpu_dp->ds;
- int port = cpu_dp->index;
- int len = ETH_GSTRING_LEN;
- int mcount = 0, count;
- unsigned int i;
- uint8_t pfx[4];
- uint8_t *ndata;
-
- snprintf(pfx, sizeof(pfx), "p%.2d", port);
- /* We do not want to be NULL-terminated, since this is a prefix */
- pfx[sizeof(pfx) - 1] = '_';
-
- if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
- !ops->get_ethtool_phy_stats) {
- mcount = phy_ethtool_get_sset_count(dev->phydev);
- if (mcount < 0)
- mcount = 0;
- else
- phy_ethtool_get_strings(dev->phydev, data);
- } else if (ops->get_sset_count && ops->get_strings) {
- mcount = ops->get_sset_count(dev, stringset);
- if (mcount < 0)
- mcount = 0;
- ops->get_strings(dev, stringset, data);
- }
-
- if (ds->ops->get_strings) {
- ndata = data + mcount * len;
- /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle
- * the output after to prepend our CPU port prefix we
- * constructed earlier
- */
- ds->ops->get_strings(ds, port, stringset, ndata);
- count = ds->ops->get_sset_count(ds, port, stringset);
- for (i = 0; i < count; i++) {
- memmove(ndata + (i * len + sizeof(pfx)),
- ndata + i * len, len - sizeof(pfx));
- memcpy(ndata + i * len, pfx, sizeof(pfx));
- }
- }
-}
-
-static int dsa_master_ethtool_setup(struct net_device *dev)
-{
- struct dsa_port *cpu_dp = dev->dsa_ptr;
- struct dsa_switch *ds = cpu_dp->ds;
- struct ethtool_ops *ops;
-
- ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL);
- if (!ops)
- return -ENOMEM;
-
- cpu_dp->orig_ethtool_ops = dev->ethtool_ops;
- if (cpu_dp->orig_ethtool_ops)
- memcpy(ops, cpu_dp->orig_ethtool_ops, sizeof(*ops));
-
- ops->get_sset_count = dsa_master_get_sset_count;
- ops->get_ethtool_stats = dsa_master_get_ethtool_stats;
- ops->get_strings = dsa_master_get_strings;
- ops->get_ethtool_phy_stats = dsa_master_get_ethtool_phy_stats;
-
- dev->ethtool_ops = ops;
-
- return 0;
-}
-
-static void dsa_master_ethtool_teardown(struct net_device *dev)
-{
- struct dsa_port *cpu_dp = dev->dsa_ptr;
-
- dev->ethtool_ops = cpu_dp->orig_ethtool_ops;
- cpu_dp->orig_ethtool_ops = NULL;
-}
-
-int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp)
-{
- /* If we use a tagging format that doesn't have an ethertype
- * field, make sure that all packets from this point on get
- * sent to the tag format's receive function.
- */
- wmb();
-
- dev->dsa_ptr = cpu_dp;
-
- return dsa_master_ethtool_setup(dev);
-}
-
-void dsa_master_teardown(struct net_device *dev)
-{
- dsa_master_ethtool_teardown(dev);
-
- dev->dsa_ptr = NULL;
-
- /* If we used a tagging format that doesn't have an ethertype
- * field, make sure that all packets from this point get sent
- * without the tag and go through the regular receive path.
- */
- wmb();
-}
diff --git a/net/dsa/netlink.c b/net/dsa/netlink.c
new file mode 100644
index 000000000000..1332e56349e5
--- /dev/null
+++ b/net/dsa/netlink.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2022 NXP
+ */
+#include <linux/netdevice.h>
+#include <net/rtnetlink.h>
+
+#include "netlink.h"
+#include "user.h"
+
+static const struct nla_policy dsa_policy[IFLA_DSA_MAX + 1] = {
+ [IFLA_DSA_CONDUIT] = { .type = NLA_U32 },
+};
+
+static int dsa_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (!data)
+ return 0;
+
+ if (data[IFLA_DSA_CONDUIT]) {
+ u32 ifindex = nla_get_u32(data[IFLA_DSA_CONDUIT]);
+ struct net_device *conduit;
+
+ conduit = __dev_get_by_index(dev_net(dev), ifindex);
+ if (!conduit)
+ return -EINVAL;
+
+ err = dsa_user_change_conduit(dev, conduit, extack);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static size_t dsa_get_size(const struct net_device *dev)
+{
+ return nla_total_size(sizeof(u32)) + /* IFLA_DSA_CONDUIT */
+ 0;
+}
+
+static int dsa_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+
+ if (nla_put_u32(skb, IFLA_DSA_CONDUIT, conduit->ifindex))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+struct rtnl_link_ops dsa_link_ops __read_mostly = {
+ .kind = "dsa",
+ .priv_size = sizeof(struct dsa_port),
+ .maxtype = IFLA_DSA_MAX,
+ .policy = dsa_policy,
+ .changelink = dsa_changelink,
+ .get_size = dsa_get_size,
+ .fill_info = dsa_fill_info,
+ .netns_refund = true,
+};
diff --git a/net/dsa/netlink.h b/net/dsa/netlink.h
new file mode 100644
index 000000000000..7eda2fa15722
--- /dev/null
+++ b/net/dsa/netlink.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_NETLINK_H
+#define __DSA_NETLINK_H
+
+extern struct rtnl_link_ops dsa_link_ops __read_mostly;
+
+#endif
diff --git a/net/dsa/port.c b/net/dsa/port.c
index ed0595459df1..ca3a7f52229b 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -1,48 +1,151 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Handling of a single switch port
*
* Copyright (c) 2017 Savoir-faire Linux Inc.
* Vivien Didelot <vivien.didelot@savoirfairelinux.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/if_bridge.h>
+#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/of_mdio.h>
#include <linux/of_net.h>
-#include "dsa_priv.h"
-
+#include "dsa.h"
+#include "port.h"
+#include "switch.h"
+#include "tag_8021q.h"
+#include "user.h"
+
+/**
+ * dsa_port_notify - Notify the switching fabric of changes to a port
+ * @dp: port on which change occurred
+ * @e: event, must be of type DSA_NOTIFIER_*
+ * @v: event-specific value.
+ *
+ * Notify all switches in the DSA tree that this port's switch belongs to,
+ * including this switch itself, of an event. Allows the other switches to
+ * reconfigure themselves for cross-chip operations. Can also be used to
+ * reconfigure ports without net_devices (CPU ports, DSA links) whenever
+ * a user port's state changes.
+ */
static int dsa_port_notify(const struct dsa_port *dp, unsigned long e, void *v)
{
- struct raw_notifier_head *nh = &dp->ds->dst->nh;
+ return dsa_tree_notify(dp->ds->dst, e, v);
+}
+
+static void dsa_port_notify_bridge_fdb_flush(const struct dsa_port *dp, u16 vid)
+{
+ struct net_device *brport_dev = dsa_port_to_bridge_port(dp);
+ struct switchdev_notifier_fdb_info info = {
+ .vid = vid,
+ };
+
+ /* When the port becomes standalone it has already left the bridge.
+ * Don't notify the bridge in that case.
+ */
+ if (!brport_dev)
+ return;
+
+ call_switchdev_notifiers(SWITCHDEV_FDB_FLUSH_TO_BRIDGE,
+ brport_dev, &info.info, NULL);
+}
+
+static void dsa_port_fast_age(const struct dsa_port *dp)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->port_fast_age)
+ return;
+
+ ds->ops->port_fast_age(ds, dp->index);
+
+ /* flush all VLANs */
+ dsa_port_notify_bridge_fdb_flush(dp, 0);
+}
+
+static int dsa_port_vlan_fast_age(const struct dsa_port *dp, u16 vid)
+{
+ struct dsa_switch *ds = dp->ds;
+ int err;
+
+ if (!ds->ops->port_vlan_fast_age)
+ return -EOPNOTSUPP;
+
+ err = ds->ops->port_vlan_fast_age(ds, dp->index, vid);
+
+ if (!err)
+ dsa_port_notify_bridge_fdb_flush(dp, vid);
+
+ return err;
+}
+
+static int dsa_port_msti_fast_age(const struct dsa_port *dp, u16 msti)
+{
+ DECLARE_BITMAP(vids, VLAN_N_VID) = { 0 };
+ int err, vid;
+
+ err = br_mst_get_info(dsa_port_bridge_dev_get(dp), msti, vids);
+ if (err)
+ return err;
+
+ for_each_set_bit(vid, vids, VLAN_N_VID) {
+ err = dsa_port_vlan_fast_age(dp, vid);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static bool dsa_port_can_configure_learning(struct dsa_port *dp)
+{
+ struct switchdev_brport_flags flags = {
+ .mask = BR_LEARNING,
+ };
+ struct dsa_switch *ds = dp->ds;
int err;
- err = raw_notifier_call_chain(nh, e, v);
+ if (!ds->ops->port_bridge_flags || !ds->ops->port_pre_bridge_flags)
+ return false;
- return notifier_to_errno(err);
+ err = ds->ops->port_pre_bridge_flags(ds, dp->index, flags, NULL);
+ return !err;
}
-int dsa_port_set_state(struct dsa_port *dp, u8 state,
- struct switchdev_trans *trans)
+bool dsa_port_supports_hwtstamp(struct dsa_port *dp)
+{
+ struct kernel_hwtstamp_config config = {};
+ struct dsa_switch *ds = dp->ds;
+ int err;
+
+ if (!ds->ops->port_hwtstamp_get || !ds->ops->port_hwtstamp_set)
+ return false;
+
+ /* "See through" shim implementations of the "get" method. */
+ err = ds->ops->port_hwtstamp_get(ds, dp->index, &config);
+ return err != -EOPNOTSUPP;
+}
+
+int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age)
{
struct dsa_switch *ds = dp->ds;
int port = dp->index;
- if (switchdev_trans_ph_prepare(trans))
- return ds->ops->port_stp_state_set ? 0 : -EOPNOTSUPP;
+ if (!ds->ops->port_stp_state_set)
+ return -EOPNOTSUPP;
- if (ds->ops->port_stp_state_set)
- ds->ops->port_stp_state_set(ds, port, state);
+ ds->ops->port_stp_state_set(ds, port, state);
- if (ds->ops->port_fast_age) {
+ if (!dsa_port_can_configure_learning(dp) ||
+ (do_fast_age && dp->learning)) {
/* Fast age FDB entries or flush appropriate forwarding database
* for the given port, if we are moving it from Learning or
* Forwarding state, to Disabled or Blocking or Listening state.
+ * Ports that were standalone before the STP state change don't
+ * need to fast age the FDB, since address learning is off in
+ * standalone mode.
*/
if ((dp->stp_state == BR_STATE_LEARNING ||
@@ -50,7 +153,7 @@ int dsa_port_set_state(struct dsa_port *dp, u8 state,
(state == BR_STATE_DISABLED ||
state == BR_STATE_BLOCKING ||
state == BR_STATE_LISTENING))
- ds->ops->port_fast_age(ds, port);
+ dsa_port_fast_age(dp);
}
dp->stp_state = state;
@@ -58,18 +161,57 @@ int dsa_port_set_state(struct dsa_port *dp, u8 state,
return 0;
}
-static void dsa_port_set_state_now(struct dsa_port *dp, u8 state)
+static void dsa_port_set_state_now(struct dsa_port *dp, u8 state,
+ bool do_fast_age)
{
+ struct dsa_switch *ds = dp->ds;
int err;
- err = dsa_port_set_state(dp, state, NULL);
+ err = dsa_port_set_state(dp, state, do_fast_age);
+ if (err && err != -EOPNOTSUPP) {
+ dev_err(ds->dev, "port %d failed to set STP state %u: %pe\n",
+ dp->index, state, ERR_PTR(err));
+ }
+}
+
+int dsa_port_set_mst_state(struct dsa_port *dp,
+ const struct switchdev_mst_state *state,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dp->ds;
+ u8 prev_state;
+ int err;
+
+ if (!ds->ops->port_mst_state_set)
+ return -EOPNOTSUPP;
+
+ err = br_mst_get_state(dsa_port_to_bridge_port(dp), state->msti,
+ &prev_state);
+ if (err)
+ return err;
+
+ err = ds->ops->port_mst_state_set(ds, dp->index, state);
+ if (err)
+ return err;
+
+ if (!(dp->learning &&
+ (prev_state == BR_STATE_LEARNING ||
+ prev_state == BR_STATE_FORWARDING) &&
+ (state->state == BR_STATE_DISABLED ||
+ state->state == BR_STATE_BLOCKING ||
+ state->state == BR_STATE_LISTENING)))
+ return 0;
+
+ err = dsa_port_msti_fast_age(dp, state->msti);
if (err)
- pr_err("DSA: failed to set STP state %u (%d)\n", state, err);
+ NL_SET_ERR_MSG_MOD(extack,
+ "Unable to flush associated VLANs");
+
+ return 0;
}
-int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy)
+int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy)
{
- u8 stp_state = dp->bridge_dev ? BR_STATE_BLOCKING : BR_STATE_FORWARDING;
struct dsa_switch *ds = dp->ds;
int port = dp->index;
int err;
@@ -80,113 +222,777 @@ int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy)
return err;
}
- dsa_port_set_state_now(dp, stp_state);
+ if (!dp->bridge)
+ dsa_port_set_state_now(dp, BR_STATE_FORWARDING, false);
+
+ if (dp->pl)
+ phylink_start(dp->pl);
return 0;
}
-void dsa_port_disable(struct dsa_port *dp, struct phy_device *phy)
+int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy)
+{
+ int err;
+
+ rtnl_lock();
+ err = dsa_port_enable_rt(dp, phy);
+ rtnl_unlock();
+
+ return err;
+}
+
+void dsa_port_disable_rt(struct dsa_port *dp)
{
struct dsa_switch *ds = dp->ds;
int port = dp->index;
- dsa_port_set_state_now(dp, BR_STATE_DISABLED);
+ if (dp->pl)
+ phylink_stop(dp->pl);
+
+ if (!dp->bridge)
+ dsa_port_set_state_now(dp, BR_STATE_DISABLED, false);
if (ds->ops->port_disable)
- ds->ops->port_disable(ds, port, phy);
+ ds->ops->port_disable(ds, port);
+}
+
+void dsa_port_disable(struct dsa_port *dp)
+{
+ rtnl_lock();
+ dsa_port_disable_rt(dp);
+ rtnl_unlock();
+}
+
+static void dsa_port_reset_vlan_filtering(struct dsa_port *dp,
+ struct dsa_bridge bridge)
+{
+ struct netlink_ext_ack extack = {0};
+ bool change_vlan_filtering = false;
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_port *other_dp;
+ bool vlan_filtering;
+ int err;
+
+ if (ds->needs_standalone_vlan_filtering &&
+ !br_vlan_enabled(bridge.dev)) {
+ change_vlan_filtering = true;
+ vlan_filtering = true;
+ } else if (!ds->needs_standalone_vlan_filtering &&
+ br_vlan_enabled(bridge.dev)) {
+ change_vlan_filtering = true;
+ vlan_filtering = false;
+ }
+
+ /* If the bridge was vlan_filtering, the bridge core doesn't trigger an
+ * event for changing vlan_filtering setting upon user ports leaving
+ * it. That is a good thing, because that lets us handle it and also
+ * handle the case where the switch's vlan_filtering setting is global
+ * (not per port). When that happens, the correct moment to trigger the
+ * vlan_filtering callback is only when the last port leaves the last
+ * VLAN-aware bridge.
+ */
+ if (change_vlan_filtering && ds->vlan_filtering_is_global) {
+ dsa_switch_for_each_port(other_dp, ds) {
+ struct net_device *br = dsa_port_bridge_dev_get(other_dp);
+
+ if (br && br_vlan_enabled(br)) {
+ change_vlan_filtering = false;
+ break;
+ }
+ }
+ }
+
+ if (!change_vlan_filtering)
+ return;
+
+ err = dsa_port_vlan_filtering(dp, vlan_filtering, &extack);
+ if (extack._msg) {
+ dev_err(ds->dev, "port %d: %s\n", dp->index,
+ extack._msg);
+ }
+ if (err && err != -EOPNOTSUPP) {
+ dev_err(ds->dev,
+ "port %d failed to reset VLAN filtering to %d: %pe\n",
+ dp->index, vlan_filtering, ERR_PTR(err));
+ }
+}
+
+static int dsa_port_inherit_brport_flags(struct dsa_port *dp,
+ struct netlink_ext_ack *extack)
+{
+ const unsigned long mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD |
+ BR_BCAST_FLOOD | BR_PORT_LOCKED;
+ struct net_device *brport_dev = dsa_port_to_bridge_port(dp);
+ int flag, err;
+
+ for_each_set_bit(flag, &mask, 32) {
+ struct switchdev_brport_flags flags = {0};
+
+ flags.mask = BIT(flag);
+
+ if (br_port_flag_is_set(brport_dev, BIT(flag)))
+ flags.val = BIT(flag);
+
+ err = dsa_port_bridge_flags(dp, flags, extack);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+ }
+
+ return 0;
+}
+
+static void dsa_port_clear_brport_flags(struct dsa_port *dp)
+{
+ const unsigned long val = BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD;
+ const unsigned long mask = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD |
+ BR_BCAST_FLOOD | BR_PORT_LOCKED;
+ int flag, err;
+
+ for_each_set_bit(flag, &mask, 32) {
+ struct switchdev_brport_flags flags = {0};
+
+ flags.mask = BIT(flag);
+ flags.val = val & BIT(flag);
+
+ err = dsa_port_bridge_flags(dp, flags, NULL);
+ if (err && err != -EOPNOTSUPP)
+ dev_err(dp->ds->dev,
+ "failed to clear bridge port flag %lu: %pe\n",
+ flags.val, ERR_PTR(err));
+ }
+}
+
+static int dsa_port_switchdev_sync_attrs(struct dsa_port *dp,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *brport_dev = dsa_port_to_bridge_port(dp);
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+ int err;
+
+ err = dsa_port_inherit_brport_flags(dp, extack);
+ if (err)
+ return err;
+
+ err = dsa_port_set_state(dp, br_port_get_stp_state(brport_dev), false);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ err = dsa_port_vlan_filtering(dp, br_vlan_enabled(br), extack);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ err = dsa_port_ageing_time(dp, br_get_ageing_time(br));
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ return 0;
+}
+
+static void dsa_port_switchdev_unsync_attrs(struct dsa_port *dp,
+ struct dsa_bridge bridge)
+{
+ /* Configure the port for standalone mode (no address learning,
+ * flood everything).
+ * The bridge only emits SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS events
+ * when the user requests it through netlink or sysfs, but not
+ * automatically at port join or leave, so we need to handle resetting
+ * the brport flags ourselves. But we even prefer it that way, because
+ * otherwise, some setups might never get the notification they need,
+ * for example, when a port leaves a LAG that offloads the bridge,
+ * it becomes standalone, but as far as the bridge is concerned, no
+ * port ever left.
+ */
+ dsa_port_clear_brport_flags(dp);
+
+ /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
+ * so allow it to be in BR_STATE_FORWARDING to be kept functional
+ */
+ dsa_port_set_state_now(dp, BR_STATE_FORWARDING, true);
+
+ dsa_port_reset_vlan_filtering(dp, bridge);
+
+ /* Ageing time may be global to the switch chip, so don't change it
+ * here because we have no good reason (or value) to change it to.
+ */
+}
+
+static int dsa_port_bridge_create(struct dsa_port *dp,
+ struct net_device *br,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_bridge *bridge;
+
+ bridge = dsa_tree_bridge_find(ds->dst, br);
+ if (bridge) {
+ refcount_inc(&bridge->refcount);
+ dp->bridge = bridge;
+ return 0;
+ }
+
+ bridge = kzalloc(sizeof(*bridge), GFP_KERNEL);
+ if (!bridge)
+ return -ENOMEM;
+
+ refcount_set(&bridge->refcount, 1);
+
+ bridge->dev = br;
+
+ bridge->num = dsa_bridge_num_get(br, ds->max_num_bridges);
+ if (ds->max_num_bridges && !bridge->num) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Range of offloadable bridges exceeded");
+ kfree(bridge);
+ return -EOPNOTSUPP;
+ }
+
+ dp->bridge = bridge;
+
+ return 0;
+}
+
+static void dsa_port_bridge_destroy(struct dsa_port *dp,
+ const struct net_device *br)
+{
+ struct dsa_bridge *bridge = dp->bridge;
+
+ dp->bridge = NULL;
+
+ if (!refcount_dec_and_test(&bridge->refcount))
+ return;
+
+ if (bridge->num)
+ dsa_bridge_num_put(br, bridge->num);
+
+ kfree(bridge);
}
-int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br)
+static bool dsa_port_supports_mst(struct dsa_port *dp)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ return ds->ops->vlan_msti_set &&
+ ds->ops->port_mst_state_set &&
+ ds->ops->port_vlan_fast_age &&
+ dsa_port_can_configure_learning(dp);
+}
+
+int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
+ struct netlink_ext_ack *extack)
{
struct dsa_notifier_bridge_info info = {
- .sw_index = dp->ds->index,
- .port = dp->index,
- .br = br,
+ .dp = dp,
+ .extack = extack,
};
+ struct net_device *dev = dp->user;
+ struct net_device *brport_dev;
int err;
- /* Here the port is already bridged. Reflect the current configuration
- * so that drivers can program their chips accordingly.
+ if (br_mst_enabled(br) && !dsa_port_supports_mst(dp))
+ return -EOPNOTSUPP;
+
+ /* Here the interface is already bridged. Reflect the current
+ * configuration so that drivers can program their chips accordingly.
*/
- dp->bridge_dev = br;
+ err = dsa_port_bridge_create(dp, br, extack);
+ if (err)
+ return err;
+
+ brport_dev = dsa_port_to_bridge_port(dp);
+
+ info.bridge = *dp->bridge;
+ err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_JOIN, &info);
+ if (err)
+ goto out_rollback;
+
+ /* Drivers which support bridge TX forwarding should set this */
+ dp->bridge->tx_fwd_offload = info.tx_fwd_offload;
- err = dsa_port_notify(dp, DSA_NOTIFIER_BRIDGE_JOIN, &info);
+ err = switchdev_bridge_port_offload(brport_dev, dev, dp,
+ &dsa_user_switchdev_notifier,
+ &dsa_user_switchdev_blocking_notifier,
+ dp->bridge->tx_fwd_offload, extack);
+ if (err)
+ goto out_rollback_unbridge;
- /* The bridging is rolled back on error */
+ err = dsa_port_switchdev_sync_attrs(dp, extack);
if (err)
- dp->bridge_dev = NULL;
+ goto out_rollback_unoffload;
+
+ return 0;
+out_rollback_unoffload:
+ switchdev_bridge_port_unoffload(brport_dev, dp,
+ &dsa_user_switchdev_notifier,
+ &dsa_user_switchdev_blocking_notifier);
+ dsa_flush_workqueue();
+out_rollback_unbridge:
+ dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info);
+out_rollback:
+ dsa_port_bridge_destroy(dp, br);
return err;
}
+void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br)
+{
+ struct net_device *brport_dev = dsa_port_to_bridge_port(dp);
+
+ /* Don't try to unoffload something that is not offloaded */
+ if (!brport_dev)
+ return;
+
+ switchdev_bridge_port_unoffload(brport_dev, dp,
+ &dsa_user_switchdev_notifier,
+ &dsa_user_switchdev_blocking_notifier);
+
+ dsa_flush_workqueue();
+}
+
void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br)
{
struct dsa_notifier_bridge_info info = {
- .sw_index = dp->ds->index,
- .port = dp->index,
- .br = br,
+ .dp = dp,
};
int err;
+ /* If the port could not be offloaded to begin with, then
+ * there is nothing to do.
+ */
+ if (!dp->bridge)
+ return;
+
+ info.bridge = *dp->bridge;
+
/* Here the port is already unbridged. Reflect the current configuration
* so that drivers can program their chips accordingly.
*/
- dp->bridge_dev = NULL;
+ dsa_port_bridge_destroy(dp, br);
- err = dsa_port_notify(dp, DSA_NOTIFIER_BRIDGE_LEAVE, &info);
+ err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info);
if (err)
- pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n");
+ dev_err(dp->ds->dev,
+ "port %d failed to notify DSA_NOTIFIER_BRIDGE_LEAVE: %pe\n",
+ dp->index, ERR_PTR(err));
- /* Port left the bridge, put in BR_STATE_DISABLED by the bridge layer,
- * so allow it to be in BR_STATE_FORWARDING to be kept functional
+ dsa_port_switchdev_unsync_attrs(dp, info.bridge);
+}
+
+int dsa_port_lag_change(struct dsa_port *dp,
+ struct netdev_lag_lower_state_info *linfo)
+{
+ struct dsa_notifier_lag_info info = {
+ .dp = dp,
+ };
+ bool tx_enabled;
+
+ if (!dp->lag)
+ return 0;
+
+ /* On statically configured aggregates (e.g. loadbalance
+ * without LACP) ports will always be tx_enabled, even if the
+ * link is down. Thus we require both link_up and tx_enabled
+ * in order to include it in the tx set.
+ */
+ tx_enabled = linfo->link_up && linfo->tx_enabled;
+
+ if (tx_enabled == dp->lag_tx_enabled)
+ return 0;
+
+ dp->lag_tx_enabled = tx_enabled;
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_LAG_CHANGE, &info);
+}
+
+static int dsa_port_lag_create(struct dsa_port *dp,
+ struct net_device *lag_dev)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_lag *lag;
+
+ lag = dsa_tree_lag_find(ds->dst, lag_dev);
+ if (lag) {
+ refcount_inc(&lag->refcount);
+ dp->lag = lag;
+ return 0;
+ }
+
+ lag = kzalloc(sizeof(*lag), GFP_KERNEL);
+ if (!lag)
+ return -ENOMEM;
+
+ refcount_set(&lag->refcount, 1);
+ mutex_init(&lag->fdb_lock);
+ INIT_LIST_HEAD(&lag->fdbs);
+ lag->dev = lag_dev;
+ dsa_lag_map(ds->dst, lag);
+ dp->lag = lag;
+
+ return 0;
+}
+
+static void dsa_port_lag_destroy(struct dsa_port *dp)
+{
+ struct dsa_lag *lag = dp->lag;
+
+ dp->lag = NULL;
+ dp->lag_tx_enabled = false;
+
+ if (!refcount_dec_and_test(&lag->refcount))
+ return;
+
+ WARN_ON(!list_empty(&lag->fdbs));
+ dsa_lag_unmap(dp->ds->dst, lag);
+ kfree(lag);
+}
+
+int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev,
+ struct netdev_lag_upper_info *uinfo,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_notifier_lag_info info = {
+ .dp = dp,
+ .info = uinfo,
+ .extack = extack,
+ };
+ struct net_device *bridge_dev;
+ int err;
+
+ err = dsa_port_lag_create(dp, lag_dev);
+ if (err)
+ goto err_lag_create;
+
+ info.lag = *dp->lag;
+ err = dsa_port_notify(dp, DSA_NOTIFIER_LAG_JOIN, &info);
+ if (err)
+ goto err_lag_join;
+
+ bridge_dev = netdev_master_upper_dev_get(lag_dev);
+ if (!bridge_dev || !netif_is_bridge_master(bridge_dev))
+ return 0;
+
+ err = dsa_port_bridge_join(dp, bridge_dev, extack);
+ if (err)
+ goto err_bridge_join;
+
+ return 0;
+
+err_bridge_join:
+ dsa_port_notify(dp, DSA_NOTIFIER_LAG_LEAVE, &info);
+err_lag_join:
+ dsa_port_lag_destroy(dp);
+err_lag_create:
+ return err;
+}
+
+void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev)
+{
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+
+ if (br)
+ dsa_port_pre_bridge_leave(dp, br);
+}
+
+void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev)
+{
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+ struct dsa_notifier_lag_info info = {
+ .dp = dp,
+ };
+ int err;
+
+ if (!dp->lag)
+ return;
+
+ /* Port might have been part of a LAG that in turn was
+ * attached to a bridge.
*/
- dsa_port_set_state_now(dp, BR_STATE_FORWARDING);
+ if (br)
+ dsa_port_bridge_leave(dp, br);
+
+ info.lag = *dp->lag;
+
+ dsa_port_lag_destroy(dp);
+
+ err = dsa_port_notify(dp, DSA_NOTIFIER_LAG_LEAVE, &info);
+ if (err)
+ dev_err(dp->ds->dev,
+ "port %d failed to notify DSA_NOTIFIER_LAG_LEAVE: %pe\n",
+ dp->index, ERR_PTR(err));
+}
+
+/* Must be called under rcu_read_lock() */
+static bool dsa_port_can_apply_vlan_filtering(struct dsa_port *dp,
+ bool vlan_filtering,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_port *other_dp;
+ int err;
+
+ /* VLAN awareness was off, so the question is "can we turn it on".
+ * We may have had 8021q uppers, those need to go. Make sure we don't
+ * enter an inconsistent state: deny changing the VLAN awareness state
+ * as long as we have 8021q uppers.
+ */
+ if (vlan_filtering && dsa_port_is_user(dp)) {
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+ struct net_device *upper_dev, *user = dp->user;
+ struct list_head *iter;
+
+ netdev_for_each_upper_dev_rcu(user, upper_dev, iter) {
+ struct bridge_vlan_info br_info;
+ u16 vid;
+
+ if (!is_vlan_dev(upper_dev))
+ continue;
+
+ vid = vlan_dev_vlan_id(upper_dev);
+
+ /* br_vlan_get_info() returns -EINVAL or -ENOENT if the
+ * device, respectively the VID is not found, returning
+ * 0 means success, which is a failure for us here.
+ */
+ err = br_vlan_get_info(br, vid, &br_info);
+ if (err == 0) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Must first remove VLAN uppers having VIDs also present in bridge");
+ return false;
+ }
+ }
+ }
+
+ if (!ds->vlan_filtering_is_global)
+ return true;
+
+ /* For cases where enabling/disabling VLAN awareness is global to the
+ * switch, we need to handle the case where multiple bridges span
+ * different ports of the same switch device and one of them has a
+ * different setting than what is being requested.
+ */
+ dsa_switch_for_each_port(other_dp, ds) {
+ struct net_device *other_br = dsa_port_bridge_dev_get(other_dp);
+
+ /* If it's the same bridge, it also has same
+ * vlan_filtering setting => no need to check
+ */
+ if (!other_br || other_br == dsa_port_bridge_dev_get(dp))
+ continue;
+
+ if (br_vlan_enabled(other_br) != vlan_filtering) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "VLAN filtering is a global setting");
+ return false;
+ }
+ }
+ return true;
}
int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
- struct switchdev_trans *trans)
+ struct netlink_ext_ack *extack)
{
+ bool old_vlan_filtering = dsa_port_is_vlan_filtering(dp);
struct dsa_switch *ds = dp->ds;
+ bool apply;
+ int err;
- /* bridge skips -EOPNOTSUPP, so skip the prepare phase */
- if (switchdev_trans_ph_prepare(trans))
+ if (!ds->ops->port_vlan_filtering)
+ return -EOPNOTSUPP;
+
+ /* We are called from dsa_user_switchdev_blocking_event(),
+ * which is not under rcu_read_lock(), unlike
+ * dsa_user_switchdev_event().
+ */
+ rcu_read_lock();
+ apply = dsa_port_can_apply_vlan_filtering(dp, vlan_filtering, extack);
+ rcu_read_unlock();
+ if (!apply)
+ return -EINVAL;
+
+ if (dsa_port_is_vlan_filtering(dp) == vlan_filtering)
return 0;
- if (ds->ops->port_vlan_filtering)
- return ds->ops->port_vlan_filtering(ds, dp->index,
- vlan_filtering);
+ err = ds->ops->port_vlan_filtering(ds, dp->index, vlan_filtering,
+ extack);
+ if (err)
+ return err;
+
+ if (ds->vlan_filtering_is_global) {
+ struct dsa_port *other_dp;
+
+ ds->vlan_filtering = vlan_filtering;
+
+ dsa_switch_for_each_user_port(other_dp, ds) {
+ struct net_device *user = other_dp->user;
+
+ /* We might be called in the unbind path, so not
+ * all user devices might still be registered.
+ */
+ if (!user)
+ continue;
+
+ err = dsa_user_manage_vlan_filtering(user,
+ vlan_filtering);
+ if (err)
+ goto restore;
+ }
+ } else {
+ dp->vlan_filtering = vlan_filtering;
+
+ err = dsa_user_manage_vlan_filtering(dp->user,
+ vlan_filtering);
+ if (err)
+ goto restore;
+ }
return 0;
+
+restore:
+ ds->ops->port_vlan_filtering(ds, dp->index, old_vlan_filtering, NULL);
+
+ if (ds->vlan_filtering_is_global)
+ ds->vlan_filtering = old_vlan_filtering;
+ else
+ dp->vlan_filtering = old_vlan_filtering;
+
+ return err;
}
-int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock,
- struct switchdev_trans *trans)
+/* This enforces legacy behavior for switch drivers which assume they can't
+ * receive VLAN configuration when joining a bridge with vlan_filtering=0
+ */
+bool dsa_port_skip_vlan_configuration(struct dsa_port *dp)
+{
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+ struct dsa_switch *ds = dp->ds;
+
+ if (!br)
+ return false;
+
+ return !ds->configure_vlan_while_not_filtering && !br_vlan_enabled(br);
+}
+
+int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock)
{
unsigned long ageing_jiffies = clock_t_to_jiffies(ageing_clock);
unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies);
- struct dsa_notifier_ageing_time_info info = {
- .ageing_time = ageing_time,
- .trans = trans,
- };
+ struct dsa_notifier_ageing_time_info info;
+ int err;
- if (switchdev_trans_ph_prepare(trans))
- return dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, &info);
+ info.ageing_time = ageing_time;
+
+ err = dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, &info);
+ if (err)
+ return err;
dp->ageing_time = ageing_time;
- return dsa_port_notify(dp, DSA_NOTIFIER_AGEING_TIME, &info);
+ return 0;
+}
+
+int dsa_port_mst_enable(struct dsa_port *dp, bool on,
+ struct netlink_ext_ack *extack)
+{
+ if (on && !dsa_port_supports_mst(dp)) {
+ NL_SET_ERR_MSG_MOD(extack, "Hardware does not support MST");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int dsa_port_pre_bridge_flags(const struct dsa_port *dp,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->port_pre_bridge_flags)
+ return -EINVAL;
+
+ return ds->ops->port_pre_bridge_flags(ds, dp->index, flags, extack);
+}
+
+int dsa_port_bridge_flags(struct dsa_port *dp,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dp->ds;
+ int err;
+
+ if (!ds->ops->port_bridge_flags)
+ return -EOPNOTSUPP;
+
+ err = ds->ops->port_bridge_flags(ds, dp->index, flags, extack);
+ if (err)
+ return err;
+
+ if (flags.mask & BR_LEARNING) {
+ bool learning = flags.val & BR_LEARNING;
+
+ if (learning == dp->learning)
+ return 0;
+
+ if ((dp->learning && !learning) &&
+ (dp->stp_state == BR_STATE_LEARNING ||
+ dp->stp_state == BR_STATE_FORWARDING))
+ dsa_port_fast_age(dp);
+
+ dp->learning = learning;
+ }
+
+ return 0;
+}
+
+void dsa_port_set_host_flood(struct dsa_port *dp, bool uc, bool mc)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->port_set_host_flood)
+ ds->ops->port_set_host_flood(ds, dp->index, uc, mc);
+}
+
+int dsa_port_vlan_msti(struct dsa_port *dp,
+ const struct switchdev_vlan_msti *msti)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->vlan_msti_set)
+ return -EOPNOTSUPP;
+
+ return ds->ops->vlan_msti_set(ds, *dp->bridge, msti);
+}
+
+int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu)
+{
+ struct dsa_notifier_mtu_info info = {
+ .dp = dp,
+ .mtu = new_mtu,
+ };
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_MTU, &info);
}
int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
u16 vid)
{
struct dsa_notifier_fdb_info info = {
- .sw_index = dp->ds->index,
- .port = dp->index,
+ .dp = dp,
.addr = addr,
.vid = vid,
+ .db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ },
};
+ /* Refcounting takes bridge.num as a key, and should be global for all
+ * bridges in the absence of FDB isolation, and per bridge otherwise.
+ * Force the bridge.num to zero here in the absence of FDB isolation.
+ */
+ if (!dp->ds->fdb_isolation)
+ info.db.bridge.num = 0;
+
return dsa_port_notify(dp, DSA_NOTIFIER_FDB_ADD, &info);
}
@@ -194,16 +1000,157 @@ int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
u16 vid)
{
struct dsa_notifier_fdb_info info = {
- .sw_index = dp->ds->index,
- .port = dp->index,
+ .dp = dp,
.addr = addr,
.vid = vid,
-
+ .db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ },
};
+ if (!dp->ds->fdb_isolation)
+ info.db.bridge.num = 0;
+
return dsa_port_notify(dp, DSA_NOTIFIER_FDB_DEL, &info);
}
+static int dsa_port_host_fdb_add(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid,
+ struct dsa_db db)
+{
+ struct dsa_notifier_fdb_info info = {
+ .dp = dp,
+ .addr = addr,
+ .vid = vid,
+ .db = db,
+ };
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_HOST_FDB_ADD, &info);
+}
+
+int dsa_port_standalone_host_fdb_add(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid)
+{
+ struct dsa_db db = {
+ .type = DSA_DB_PORT,
+ .dp = dp,
+ };
+
+ return dsa_port_host_fdb_add(dp, addr, vid, db);
+}
+
+int dsa_port_bridge_host_fdb_add(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid)
+{
+ struct net_device *conduit = dsa_port_to_conduit(dp);
+ struct dsa_db db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ };
+ int err;
+
+ if (!dp->ds->fdb_isolation)
+ db.bridge.num = 0;
+
+ /* Avoid a call to __dev_set_promiscuity() on the conduit, which
+ * requires rtnl_lock(), since we can't guarantee that is held here,
+ * and we can't take it either.
+ */
+ if (conduit->priv_flags & IFF_UNICAST_FLT) {
+ err = dev_uc_add(conduit, addr);
+ if (err)
+ return err;
+ }
+
+ return dsa_port_host_fdb_add(dp, addr, vid, db);
+}
+
+static int dsa_port_host_fdb_del(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid,
+ struct dsa_db db)
+{
+ struct dsa_notifier_fdb_info info = {
+ .dp = dp,
+ .addr = addr,
+ .vid = vid,
+ .db = db,
+ };
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_HOST_FDB_DEL, &info);
+}
+
+int dsa_port_standalone_host_fdb_del(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid)
+{
+ struct dsa_db db = {
+ .type = DSA_DB_PORT,
+ .dp = dp,
+ };
+
+ return dsa_port_host_fdb_del(dp, addr, vid, db);
+}
+
+int dsa_port_bridge_host_fdb_del(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid)
+{
+ struct net_device *conduit = dsa_port_to_conduit(dp);
+ struct dsa_db db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ };
+ int err;
+
+ if (!dp->ds->fdb_isolation)
+ db.bridge.num = 0;
+
+ if (conduit->priv_flags & IFF_UNICAST_FLT) {
+ err = dev_uc_del(conduit, addr);
+ if (err)
+ return err;
+ }
+
+ return dsa_port_host_fdb_del(dp, addr, vid, db);
+}
+
+int dsa_port_lag_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid)
+{
+ struct dsa_notifier_lag_fdb_info info = {
+ .lag = dp->lag,
+ .addr = addr,
+ .vid = vid,
+ .db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ },
+ };
+
+ if (!dp->ds->fdb_isolation)
+ info.db.bridge.num = 0;
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_LAG_FDB_ADD, &info);
+}
+
+int dsa_port_lag_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid)
+{
+ struct dsa_notifier_lag_fdb_info info = {
+ .lag = dp->lag,
+ .addr = addr,
+ .vid = vid,
+ .db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ },
+ };
+
+ if (!dp->ds->fdb_isolation)
+ info.db.bridge.num = 0;
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_LAG_FDB_DEL, &info);
+}
+
int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data)
{
struct dsa_switch *ds = dp->ds;
@@ -216,16 +1163,20 @@ int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data)
}
int dsa_port_mdb_add(const struct dsa_port *dp,
- const struct switchdev_obj_port_mdb *mdb,
- struct switchdev_trans *trans)
+ const struct switchdev_obj_port_mdb *mdb)
{
struct dsa_notifier_mdb_info info = {
- .sw_index = dp->ds->index,
- .port = dp->index,
- .trans = trans,
+ .dp = dp,
.mdb = mdb,
+ .db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ },
};
+ if (!dp->ds->fdb_isolation)
+ info.db.bridge.num = 0;
+
return dsa_port_notify(dp, DSA_NOTIFIER_MDB_ADD, &info);
}
@@ -233,216 +1184,772 @@ int dsa_port_mdb_del(const struct dsa_port *dp,
const struct switchdev_obj_port_mdb *mdb)
{
struct dsa_notifier_mdb_info info = {
- .sw_index = dp->ds->index,
- .port = dp->index,
+ .dp = dp,
.mdb = mdb,
+ .db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ },
};
+ if (!dp->ds->fdb_isolation)
+ info.db.bridge.num = 0;
+
return dsa_port_notify(dp, DSA_NOTIFIER_MDB_DEL, &info);
}
+static int dsa_port_host_mdb_add(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb,
+ struct dsa_db db)
+{
+ struct dsa_notifier_mdb_info info = {
+ .dp = dp,
+ .mdb = mdb,
+ .db = db,
+ };
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_HOST_MDB_ADD, &info);
+}
+
+int dsa_port_standalone_host_mdb_add(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb)
+{
+ struct dsa_db db = {
+ .type = DSA_DB_PORT,
+ .dp = dp,
+ };
+
+ return dsa_port_host_mdb_add(dp, mdb, db);
+}
+
+int dsa_port_bridge_host_mdb_add(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb)
+{
+ struct net_device *conduit = dsa_port_to_conduit(dp);
+ struct dsa_db db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ };
+ int err;
+
+ if (!dp->ds->fdb_isolation)
+ db.bridge.num = 0;
+
+ err = dev_mc_add(conduit, mdb->addr);
+ if (err)
+ return err;
+
+ return dsa_port_host_mdb_add(dp, mdb, db);
+}
+
+static int dsa_port_host_mdb_del(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb,
+ struct dsa_db db)
+{
+ struct dsa_notifier_mdb_info info = {
+ .dp = dp,
+ .mdb = mdb,
+ .db = db,
+ };
+
+ return dsa_port_notify(dp, DSA_NOTIFIER_HOST_MDB_DEL, &info);
+}
+
+int dsa_port_standalone_host_mdb_del(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb)
+{
+ struct dsa_db db = {
+ .type = DSA_DB_PORT,
+ .dp = dp,
+ };
+
+ return dsa_port_host_mdb_del(dp, mdb, db);
+}
+
+int dsa_port_bridge_host_mdb_del(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb)
+{
+ struct net_device *conduit = dsa_port_to_conduit(dp);
+ struct dsa_db db = {
+ .type = DSA_DB_BRIDGE,
+ .bridge = *dp->bridge,
+ };
+ int err;
+
+ if (!dp->ds->fdb_isolation)
+ db.bridge.num = 0;
+
+ err = dev_mc_del(conduit, mdb->addr);
+ if (err)
+ return err;
+
+ return dsa_port_host_mdb_del(dp, mdb, db);
+}
+
int dsa_port_vlan_add(struct dsa_port *dp,
const struct switchdev_obj_port_vlan *vlan,
- struct switchdev_trans *trans)
+ struct netlink_ext_ack *extack)
{
struct dsa_notifier_vlan_info info = {
- .sw_index = dp->ds->index,
- .port = dp->index,
- .trans = trans,
+ .dp = dp,
.vlan = vlan,
+ .extack = extack,
};
- if (netif_is_bridge_master(vlan->obj.orig_dev))
- return -EOPNOTSUPP;
-
- if (br_vlan_enabled(dp->bridge_dev))
- return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info);
-
- return 0;
+ return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_ADD, &info);
}
int dsa_port_vlan_del(struct dsa_port *dp,
const struct switchdev_obj_port_vlan *vlan)
{
struct dsa_notifier_vlan_info info = {
- .sw_index = dp->ds->index,
- .port = dp->index,
+ .dp = dp,
.vlan = vlan,
};
- if (netif_is_bridge_master(vlan->obj.orig_dev))
+ return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
+}
+
+int dsa_port_host_vlan_add(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *conduit = dsa_port_to_conduit(dp);
+ struct dsa_notifier_vlan_info info = {
+ .dp = dp,
+ .vlan = vlan,
+ .extack = extack,
+ };
+ int err;
+
+ err = dsa_port_notify(dp, DSA_NOTIFIER_HOST_VLAN_ADD, &info);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ vlan_vid_add(conduit, htons(ETH_P_8021Q), vlan->vid);
+
+ return err;
+}
+
+int dsa_port_host_vlan_del(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan)
+{
+ struct net_device *conduit = dsa_port_to_conduit(dp);
+ struct dsa_notifier_vlan_info info = {
+ .dp = dp,
+ .vlan = vlan,
+ };
+ int err;
+
+ err = dsa_port_notify(dp, DSA_NOTIFIER_HOST_VLAN_DEL, &info);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
+ vlan_vid_del(conduit, htons(ETH_P_8021Q), vlan->vid);
+
+ return err;
+}
+
+int dsa_port_mrp_add(const struct dsa_port *dp,
+ const struct switchdev_obj_mrp *mrp)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->port_mrp_add)
return -EOPNOTSUPP;
- if (br_vlan_enabled(dp->bridge_dev))
- return dsa_port_notify(dp, DSA_NOTIFIER_VLAN_DEL, &info);
+ return ds->ops->port_mrp_add(ds, dp->index, mrp);
+}
- return 0;
+int dsa_port_mrp_del(const struct dsa_port *dp,
+ const struct switchdev_obj_mrp *mrp)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->port_mrp_del)
+ return -EOPNOTSUPP;
+
+ return ds->ops->port_mrp_del(ds, dp->index, mrp);
}
-static struct phy_device *dsa_port_get_phy_device(struct dsa_port *dp)
+int dsa_port_mrp_add_ring_role(const struct dsa_port *dp,
+ const struct switchdev_obj_ring_role_mrp *mrp)
{
- struct device_node *phy_dn;
- struct phy_device *phydev;
+ struct dsa_switch *ds = dp->ds;
- phy_dn = of_parse_phandle(dp->dn, "phy-handle", 0);
- if (!phy_dn)
- return NULL;
+ if (!ds->ops->port_mrp_add_ring_role)
+ return -EOPNOTSUPP;
- phydev = of_phy_find_device(phy_dn);
- if (!phydev) {
- of_node_put(phy_dn);
- return ERR_PTR(-EPROBE_DEFER);
- }
+ return ds->ops->port_mrp_add_ring_role(ds, dp->index, mrp);
+}
- return phydev;
+int dsa_port_mrp_del_ring_role(const struct dsa_port *dp,
+ const struct switchdev_obj_ring_role_mrp *mrp)
+{
+ struct dsa_switch *ds = dp->ds;
+
+ if (!ds->ops->port_mrp_del_ring_role)
+ return -EOPNOTSUPP;
+
+ return ds->ops->port_mrp_del_ring_role(ds, dp->index, mrp);
}
-static int dsa_port_setup_phy_of(struct dsa_port *dp, bool enable)
+static int dsa_port_assign_conduit(struct dsa_port *dp,
+ struct net_device *conduit,
+ struct netlink_ext_ack *extack,
+ bool fail_on_err)
{
struct dsa_switch *ds = dp->ds;
- struct phy_device *phydev;
- int port = dp->index;
- int err = 0;
+ int port = dp->index, err;
- phydev = dsa_port_get_phy_device(dp);
- if (!phydev)
- return 0;
+ err = ds->ops->port_change_conduit(ds, port, conduit, extack);
+ if (err && !fail_on_err)
+ dev_err(ds->dev, "port %d failed to assign conduit %s: %pe\n",
+ port, conduit->name, ERR_PTR(err));
- if (IS_ERR(phydev))
- return PTR_ERR(phydev);
+ if (err && fail_on_err)
+ return err;
- if (enable) {
- err = genphy_config_init(phydev);
- if (err < 0)
- goto err_put_dev;
+ dp->cpu_dp = conduit->dsa_ptr;
+ dp->cpu_port_in_lag = netif_is_lag_master(conduit);
- err = genphy_resume(phydev);
- if (err < 0)
- goto err_put_dev;
+ return 0;
+}
- err = genphy_read_status(phydev);
- if (err < 0)
- goto err_put_dev;
- } else {
- err = genphy_suspend(phydev);
- if (err < 0)
- goto err_put_dev;
+/* Change the dp->cpu_dp affinity for a user port. Note that both cross-chip
+ * notifiers and drivers have implicit assumptions about user-to-CPU-port
+ * mappings, so we unfortunately cannot delay the deletion of the objects
+ * (switchdev, standalone addresses, standalone VLANs) on the old CPU port
+ * until the new CPU port has been set up. So we need to completely tear down
+ * the old CPU port before changing it, and restore it on errors during the
+ * bringup of the new one.
+ */
+int dsa_port_change_conduit(struct dsa_port *dp, struct net_device *conduit,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *bridge_dev = dsa_port_bridge_dev_get(dp);
+ struct net_device *old_conduit = dsa_port_to_conduit(dp);
+ struct net_device *dev = dp->user;
+ struct dsa_switch *ds = dp->ds;
+ bool vlan_filtering;
+ int err, tmp;
+
+ /* Bridges may hold host FDB, MDB and VLAN objects. These need to be
+ * migrated, so dynamically unoffload and later reoffload the bridge
+ * port.
+ */
+ if (bridge_dev) {
+ dsa_port_pre_bridge_leave(dp, bridge_dev);
+ dsa_port_bridge_leave(dp, bridge_dev);
+ }
+
+ /* The port might still be VLAN filtering even if it's no longer
+ * under a bridge, either due to ds->vlan_filtering_is_global or
+ * ds->needs_standalone_vlan_filtering. In turn this means VLANs
+ * on the CPU port.
+ */
+ vlan_filtering = dsa_port_is_vlan_filtering(dp);
+ if (vlan_filtering) {
+ err = dsa_user_manage_vlan_filtering(dev, false);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Failed to remove standalone VLANs");
+ goto rewind_old_bridge;
+ }
+ }
+
+ /* Standalone addresses, and addresses of upper interfaces like
+ * VLAN, LAG, HSR need to be migrated.
+ */
+ dsa_user_unsync_ha(dev);
+
+ /* If live-changing, we also need to uninstall the user device address
+ * from the port FDB and the conduit interface.
+ */
+ if (dev->flags & IFF_UP)
+ dsa_user_host_uc_uninstall(dev);
+
+ err = dsa_port_assign_conduit(dp, conduit, extack, true);
+ if (err)
+ goto rewind_old_addrs;
+
+ /* If the port doesn't have its own MAC address and relies on the DSA
+ * conduit's one, inherit it again from the new DSA conduit.
+ */
+ if (is_zero_ether_addr(dp->mac))
+ eth_hw_addr_inherit(dev, conduit);
+
+ /* If live-changing, we need to install the user device address to the
+ * port FDB and the conduit interface.
+ */
+ if (dev->flags & IFF_UP) {
+ err = dsa_user_host_uc_install(dev, dev->dev_addr);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Failed to install host UC address");
+ goto rewind_addr_inherit;
+ }
+ }
+
+ dsa_user_sync_ha(dev);
+
+ if (vlan_filtering) {
+ err = dsa_user_manage_vlan_filtering(dev, true);
+ if (err) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Failed to restore standalone VLANs");
+ goto rewind_new_addrs;
+ }
}
- if (ds->ops->adjust_link)
- ds->ops->adjust_link(ds, port, phydev);
+ if (bridge_dev) {
+ err = dsa_port_bridge_join(dp, bridge_dev, extack);
+ if (err && err == -EOPNOTSUPP) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Failed to reoffload bridge");
+ goto rewind_new_vlan;
+ }
+ }
+
+ return 0;
+
+rewind_new_vlan:
+ if (vlan_filtering)
+ dsa_user_manage_vlan_filtering(dev, false);
+
+rewind_new_addrs:
+ dsa_user_unsync_ha(dev);
+
+ if (dev->flags & IFF_UP)
+ dsa_user_host_uc_uninstall(dev);
+
+rewind_addr_inherit:
+ if (is_zero_ether_addr(dp->mac))
+ eth_hw_addr_inherit(dev, old_conduit);
+
+ dsa_port_assign_conduit(dp, old_conduit, NULL, false);
+
+/* Restore the objects on the old CPU port */
+rewind_old_addrs:
+ if (dev->flags & IFF_UP) {
+ tmp = dsa_user_host_uc_install(dev, dev->dev_addr);
+ if (tmp) {
+ dev_err(ds->dev,
+ "port %d failed to restore host UC address: %pe\n",
+ dp->index, ERR_PTR(tmp));
+ }
+ }
- dev_dbg(ds->dev, "enabled port's phy: %s", phydev_name(phydev));
+ dsa_user_sync_ha(dev);
+
+ if (vlan_filtering) {
+ tmp = dsa_user_manage_vlan_filtering(dev, true);
+ if (tmp) {
+ dev_err(ds->dev,
+ "port %d failed to restore standalone VLANs: %pe\n",
+ dp->index, ERR_PTR(tmp));
+ }
+ }
+
+rewind_old_bridge:
+ if (bridge_dev) {
+ tmp = dsa_port_bridge_join(dp, bridge_dev, extack);
+ if (tmp) {
+ dev_err(ds->dev,
+ "port %d failed to rejoin bridge %s: %pe\n",
+ dp->index, bridge_dev->name, ERR_PTR(tmp));
+ }
+ }
-err_put_dev:
- put_device(&phydev->mdio.dev);
return err;
}
-static int dsa_port_fixed_link_register_of(struct dsa_port *dp)
+void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp,
+ const struct dsa_device_ops *tag_ops)
+{
+ cpu_dp->rcv = tag_ops->rcv;
+ cpu_dp->tag_ops = tag_ops;
+}
+
+/* dsa_supports_eee - indicate that EEE is supported
+ * @ds: pointer to &struct dsa_switch
+ * @port: port index
+ *
+ * A default implementation for the .support_eee() DSA operations member,
+ * which drivers can use to indicate that they support EEE on all of their
+ * user ports.
+ *
+ * Returns: true
+ */
+bool dsa_supports_eee(struct dsa_switch *ds, int port)
+{
+ return true;
+}
+EXPORT_SYMBOL_GPL(dsa_supports_eee);
+
+static void dsa_port_phylink_mac_config(struct phylink_config *config,
+ unsigned int mode,
+ const struct phylink_link_state *state)
+{
+}
+
+static void dsa_port_phylink_mac_link_down(struct phylink_config *config,
+ unsigned int mode,
+ phy_interface_t interface)
+{
+}
+
+static void dsa_port_phylink_mac_link_up(struct phylink_config *config,
+ struct phy_device *phydev,
+ unsigned int mode,
+ phy_interface_t interface,
+ int speed, int duplex,
+ bool tx_pause, bool rx_pause)
+{
+}
+
+static const struct phylink_mac_ops dsa_port_phylink_mac_ops = {
+ .mac_config = dsa_port_phylink_mac_config,
+ .mac_link_down = dsa_port_phylink_mac_link_down,
+ .mac_link_up = dsa_port_phylink_mac_link_up,
+};
+
+int dsa_port_phylink_create(struct dsa_port *dp)
{
- struct device_node *dn = dp->dn;
+ const struct phylink_mac_ops *mac_ops;
struct dsa_switch *ds = dp->ds;
- struct phy_device *phydev;
- int port = dp->index;
- int mode;
+ phy_interface_t mode;
+ struct phylink *pl;
int err;
- err = of_phy_register_fixed_link(dn);
- if (err) {
- dev_err(ds->dev,
- "failed to register the fixed PHY of port %d\n",
- port);
- return err;
+ err = of_get_phy_mode(dp->dn, &mode);
+ if (err)
+ mode = PHY_INTERFACE_MODE_NA;
+
+ if (ds->ops->phylink_get_caps) {
+ ds->ops->phylink_get_caps(ds, dp->index, &dp->pl_config);
+ } else {
+ /* For legacy drivers */
+ if (mode != PHY_INTERFACE_MODE_NA) {
+ __set_bit(mode, dp->pl_config.supported_interfaces);
+ } else {
+ __set_bit(PHY_INTERFACE_MODE_INTERNAL,
+ dp->pl_config.supported_interfaces);
+ __set_bit(PHY_INTERFACE_MODE_GMII,
+ dp->pl_config.supported_interfaces);
+ }
}
- phydev = of_phy_find_device(dn);
+ mac_ops = &dsa_port_phylink_mac_ops;
+ if (ds->phylink_mac_ops)
+ mac_ops = ds->phylink_mac_ops;
- mode = of_get_phy_mode(dn);
- if (mode < 0)
- mode = PHY_INTERFACE_MODE_NA;
- phydev->interface = mode;
+ pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn), mode,
+ mac_ops);
+ if (IS_ERR(pl)) {
+ pr_err("error creating PHYLINK: %ld\n", PTR_ERR(pl));
+ return PTR_ERR(pl);
+ }
+
+ dp->pl = pl;
+
+ return 0;
+}
- genphy_config_init(phydev);
- genphy_read_status(phydev);
+void dsa_port_phylink_destroy(struct dsa_port *dp)
+{
+ phylink_destroy(dp->pl);
+ dp->pl = NULL;
+}
- if (ds->ops->adjust_link)
- ds->ops->adjust_link(ds, port, phydev);
+static int dsa_shared_port_phylink_register(struct dsa_port *dp)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct device_node *port_dn = dp->dn;
+ int err;
- put_device(&phydev->mdio.dev);
+ dp->pl_config.dev = ds->dev;
+ dp->pl_config.type = PHYLINK_DEV;
+
+ err = dsa_port_phylink_create(dp);
+ if (err)
+ return err;
+
+ err = phylink_of_phy_connect(dp->pl, port_dn, 0);
+ if (err && err != -ENODEV) {
+ pr_err("could not attach to PHY: %d\n", err);
+ goto err_phy_connect;
+ }
return 0;
+
+err_phy_connect:
+ dsa_port_phylink_destroy(dp);
+ return err;
}
-int dsa_port_link_register_of(struct dsa_port *dp)
+/* During the initial DSA driver migration to OF, port nodes were sometimes
+ * added to device trees with no indication of how they should operate from a
+ * link management perspective (phy-handle, fixed-link, etc). Additionally, the
+ * phy-mode may be absent. The interpretation of these port OF nodes depends on
+ * their type.
+ *
+ * User ports with no phy-handle or fixed-link are expected to connect to an
+ * internal PHY located on the ds->user_mii_bus at an MDIO address equal to
+ * the port number. This description is still actively supported.
+ *
+ * Shared (CPU and DSA) ports with no phy-handle or fixed-link are expected to
+ * operate at the maximum speed that their phy-mode is capable of. If the
+ * phy-mode is absent, they are expected to operate using the phy-mode
+ * supported by the port that gives the highest link speed. It is unspecified
+ * if the port should use flow control or not, half duplex or full duplex, or
+ * if the phy-mode is a SERDES link, whether in-band autoneg is expected to be
+ * enabled or not.
+ *
+ * In the latter case of shared ports, omitting the link management description
+ * from the firmware node is deprecated and strongly discouraged. DSA uses
+ * phylink, which rejects the firmware nodes of these ports for lacking
+ * required properties.
+ *
+ * For switches in this table, DSA will skip enforcing validation and will
+ * later omit registering a phylink instance for the shared ports, if they lack
+ * a fixed-link, a phy-handle, or a managed = "in-band-status" property.
+ * It becomes the responsibility of the driver to ensure that these ports
+ * operate at the maximum speed (whatever this means) and will interoperate
+ * with the DSA conduit or other cascade port, since phylink methods will not be
+ * invoked for them.
+ *
+ * If you are considering expanding this table for newly introduced switches,
+ * think again. It is OK to remove switches from this table if there aren't DT
+ * blobs in circulation which rely on defaulting the shared ports.
+ */
+static const char * const dsa_switches_apply_workarounds[] = {
+#if IS_ENABLED(CONFIG_NET_DSA_XRS700X)
+ "arrow,xrs7003e",
+ "arrow,xrs7003f",
+ "arrow,xrs7004e",
+ "arrow,xrs7004f",
+#endif
+#if IS_ENABLED(CONFIG_B53)
+ "brcm,bcm5325",
+ "brcm,bcm53115",
+ "brcm,bcm53125",
+ "brcm,bcm53128",
+ "brcm,bcm5365",
+ "brcm,bcm5389",
+ "brcm,bcm5395",
+ "brcm,bcm5397",
+ "brcm,bcm5398",
+ "brcm,bcm53010-srab",
+ "brcm,bcm53011-srab",
+ "brcm,bcm53012-srab",
+ "brcm,bcm53018-srab",
+ "brcm,bcm53019-srab",
+ "brcm,bcm5301x-srab",
+ "brcm,bcm11360-srab",
+ "brcm,bcm58522-srab",
+ "brcm,bcm58525-srab",
+ "brcm,bcm58535-srab",
+ "brcm,bcm58622-srab",
+ "brcm,bcm58623-srab",
+ "brcm,bcm58625-srab",
+ "brcm,bcm88312-srab",
+ "brcm,cygnus-srab",
+ "brcm,nsp-srab",
+ "brcm,omega-srab",
+ "brcm,bcm3384-switch",
+ "brcm,bcm6328-switch",
+ "brcm,bcm6368-switch",
+ "brcm,bcm63xx-switch",
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_BCM_SF2)
+ "brcm,bcm7445-switch-v4.0",
+ "brcm,bcm7278-switch-v4.0",
+ "brcm,bcm7278-switch-v4.8",
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_LANTIQ_GSWIP)
+ "lantiq,xrx200-gswip",
+ "lantiq,xrx300-gswip",
+ "lantiq,xrx330-gswip",
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_MV88E6060)
+ "marvell,mv88e6060",
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_MV88E6XXX)
+ "marvell,mv88e6085",
+ "marvell,mv88e6190",
+ "marvell,mv88e6250",
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_MICROCHIP_KSZ_COMMON)
+ "microchip,ksz8765",
+ "microchip,ksz8794",
+ "microchip,ksz8795",
+ "microchip,ksz8863",
+ "microchip,ksz8873",
+ "microchip,ksz9477",
+ "microchip,ksz9897",
+ "microchip,ksz9893",
+ "microchip,ksz9563",
+ "microchip,ksz8563",
+ "microchip,ksz9567",
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_SMSC_LAN9303_MDIO)
+ "smsc,lan9303-mdio",
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_SMSC_LAN9303_I2C)
+ "smsc,lan9303-i2c",
+#endif
+ NULL,
+};
+
+static void dsa_shared_port_validate_of(struct dsa_port *dp,
+ bool *missing_phy_mode,
+ bool *missing_link_description)
{
- if (of_phy_is_fixed_link(dp->dn))
- return dsa_port_fixed_link_register_of(dp);
- else
- return dsa_port_setup_phy_of(dp, true);
+ struct device_node *dn = dp->dn, *phy_np;
+ struct dsa_switch *ds = dp->ds;
+ phy_interface_t mode;
+
+ *missing_phy_mode = false;
+ *missing_link_description = false;
+
+ if (of_get_phy_mode(dn, &mode)) {
+ *missing_phy_mode = true;
+ dev_err(ds->dev,
+ "OF node %pOF of %s port %d lacks the required \"phy-mode\" property\n",
+ dn, dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index);
+ }
+
+ /* Note: of_phy_is_fixed_link() also returns true for
+ * managed = "in-band-status"
+ */
+ if (of_phy_is_fixed_link(dn))
+ return;
+
+ phy_np = of_parse_phandle(dn, "phy-handle", 0);
+ if (phy_np) {
+ of_node_put(phy_np);
+ return;
+ }
+
+ *missing_link_description = true;
+
+ dev_err(ds->dev,
+ "OF node %pOF of %s port %d lacks the required \"phy-handle\", \"fixed-link\" or \"managed\" properties\n",
+ dn, dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index);
}
-void dsa_port_link_unregister_of(struct dsa_port *dp)
+static void dsa_shared_port_link_down(struct dsa_port *dp)
{
- if (of_phy_is_fixed_link(dp->dn))
- of_phy_deregister_fixed_link(dp->dn);
- else
- dsa_port_setup_phy_of(dp, false);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->phylink_mac_ops && ds->phylink_mac_ops->mac_link_down)
+ ds->phylink_mac_ops->mac_link_down(&dp->pl_config, MLO_AN_FIXED,
+ PHY_INTERFACE_MODE_NA);
}
-int dsa_port_get_phy_strings(struct dsa_port *dp, uint8_t *data)
+int dsa_shared_port_link_register_of(struct dsa_port *dp)
{
- struct phy_device *phydev;
- int ret = -EOPNOTSUPP;
+ struct dsa_switch *ds = dp->ds;
+ bool missing_link_description;
+ bool missing_phy_mode;
- if (of_phy_is_fixed_link(dp->dn))
- return ret;
+ dsa_shared_port_validate_of(dp, &missing_phy_mode,
+ &missing_link_description);
- phydev = dsa_port_get_phy_device(dp);
- if (IS_ERR_OR_NULL(phydev))
- return ret;
+ if ((missing_phy_mode || missing_link_description) &&
+ !of_device_compatible_match(ds->dev->of_node,
+ dsa_switches_apply_workarounds))
+ return -EINVAL;
- ret = phy_ethtool_get_strings(phydev, data);
- put_device(&phydev->mdio.dev);
+ if (missing_link_description) {
+ dev_warn(ds->dev,
+ "Skipping phylink registration for %s port %d\n",
+ dsa_port_is_cpu(dp) ? "CPU" : "DSA", dp->index);
+ } else {
+ dsa_shared_port_link_down(dp);
- return ret;
+ return dsa_shared_port_phylink_register(dp);
+ }
+
+ return 0;
+}
+
+void dsa_shared_port_link_unregister_of(struct dsa_port *dp)
+{
+ if (dp->pl) {
+ rtnl_lock();
+ phylink_disconnect_phy(dp->pl);
+ rtnl_unlock();
+ dsa_port_phylink_destroy(dp);
+ return;
+ }
}
-EXPORT_SYMBOL_GPL(dsa_port_get_phy_strings);
-int dsa_port_get_ethtool_phy_stats(struct dsa_port *dp, uint64_t *data)
+int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr,
+ struct netlink_ext_ack *extack)
{
- struct phy_device *phydev;
- int ret = -EOPNOTSUPP;
+ struct dsa_switch *ds = dp->ds;
+ int err;
- if (of_phy_is_fixed_link(dp->dn))
- return ret;
+ if (!ds->ops->port_hsr_join)
+ return -EOPNOTSUPP;
- phydev = dsa_port_get_phy_device(dp);
- if (IS_ERR_OR_NULL(phydev))
- return ret;
+ dp->hsr_dev = hsr;
- ret = phy_ethtool_get_stats(phydev, NULL, data);
- put_device(&phydev->mdio.dev);
+ err = ds->ops->port_hsr_join(ds, dp->index, hsr, extack);
+ if (err)
+ dp->hsr_dev = NULL;
- return ret;
+ return err;
+}
+
+void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr)
+{
+ struct dsa_switch *ds = dp->ds;
+ int err;
+
+ if (!dp->hsr_dev)
+ return;
+
+ dp->hsr_dev = NULL;
+
+ if (ds->ops->port_hsr_leave) {
+ err = ds->ops->port_hsr_leave(ds, dp->index, hsr);
+ if (err)
+ dev_err(dp->ds->dev,
+ "port %d failed to leave HSR %s: %pe\n",
+ dp->index, hsr->name, ERR_PTR(err));
+ }
}
-EXPORT_SYMBOL_GPL(dsa_port_get_ethtool_phy_stats);
-int dsa_port_get_phy_sset_count(struct dsa_port *dp)
+int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast)
{
- struct phy_device *phydev;
- int ret = -EOPNOTSUPP;
+ struct dsa_notifier_tag_8021q_vlan_info info = {
+ .dp = dp,
+ .vid = vid,
+ };
- if (of_phy_is_fixed_link(dp->dn))
- return ret;
+ if (broadcast)
+ return dsa_broadcast(DSA_NOTIFIER_TAG_8021Q_VLAN_ADD, &info);
- phydev = dsa_port_get_phy_device(dp);
- if (IS_ERR_OR_NULL(phydev))
- return ret;
+ return dsa_port_notify(dp, DSA_NOTIFIER_TAG_8021Q_VLAN_ADD, &info);
+}
- ret = phy_ethtool_get_sset_count(phydev);
- put_device(&phydev->mdio.dev);
+void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast)
+{
+ struct dsa_notifier_tag_8021q_vlan_info info = {
+ .dp = dp,
+ .vid = vid,
+ };
+ int err;
- return ret;
+ if (broadcast)
+ err = dsa_broadcast(DSA_NOTIFIER_TAG_8021Q_VLAN_DEL, &info);
+ else
+ err = dsa_port_notify(dp, DSA_NOTIFIER_TAG_8021Q_VLAN_DEL, &info);
+ if (err)
+ dev_err(dp->ds->dev,
+ "port %d failed to notify tag_8021q VLAN %d deletion: %pe\n",
+ dp->index, vid, ERR_PTR(err));
}
-EXPORT_SYMBOL_GPL(dsa_port_get_phy_sset_count);
diff --git a/net/dsa/port.h b/net/dsa/port.h
new file mode 100644
index 000000000000..6bc3291573c0
--- /dev/null
+++ b/net/dsa/port.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_PORT_H
+#define __DSA_PORT_H
+
+#include <linux/types.h>
+#include <net/dsa.h>
+
+struct ifreq;
+struct netdev_lag_lower_state_info;
+struct netdev_lag_upper_info;
+struct netlink_ext_ack;
+struct switchdev_mst_state;
+struct switchdev_obj_port_mdb;
+struct switchdev_vlan_msti;
+struct phy_device;
+
+bool dsa_port_supports_hwtstamp(struct dsa_port *dp);
+void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp,
+ const struct dsa_device_ops *tag_ops);
+int dsa_port_set_state(struct dsa_port *dp, u8 state, bool do_fast_age);
+int dsa_port_set_mst_state(struct dsa_port *dp,
+ const struct switchdev_mst_state *state,
+ struct netlink_ext_ack *extack);
+int dsa_port_enable_rt(struct dsa_port *dp, struct phy_device *phy);
+int dsa_port_enable(struct dsa_port *dp, struct phy_device *phy);
+void dsa_port_disable_rt(struct dsa_port *dp);
+void dsa_port_disable(struct dsa_port *dp);
+int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br,
+ struct netlink_ext_ack *extack);
+void dsa_port_pre_bridge_leave(struct dsa_port *dp, struct net_device *br);
+void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br);
+int dsa_port_lag_change(struct dsa_port *dp,
+ struct netdev_lag_lower_state_info *linfo);
+int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev,
+ struct netdev_lag_upper_info *uinfo,
+ struct netlink_ext_ack *extack);
+void dsa_port_pre_lag_leave(struct dsa_port *dp, struct net_device *lag_dev);
+void dsa_port_lag_leave(struct dsa_port *dp, struct net_device *lag_dev);
+int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering,
+ struct netlink_ext_ack *extack);
+bool dsa_port_skip_vlan_configuration(struct dsa_port *dp);
+int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock);
+int dsa_port_mst_enable(struct dsa_port *dp, bool on,
+ struct netlink_ext_ack *extack);
+int dsa_port_vlan_msti(struct dsa_port *dp,
+ const struct switchdev_vlan_msti *msti);
+int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu);
+int dsa_port_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_standalone_host_fdb_add(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid);
+int dsa_port_standalone_host_fdb_del(struct dsa_port *dp,
+ const unsigned char *addr, u16 vid);
+int dsa_port_bridge_host_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_bridge_host_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_lag_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_lag_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid);
+int dsa_port_fdb_dump(struct dsa_port *dp, dsa_fdb_dump_cb_t *cb, void *data);
+int dsa_port_mdb_add(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_mdb_del(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_standalone_host_mdb_add(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_standalone_host_mdb_del(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_bridge_host_mdb_add(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_bridge_host_mdb_del(const struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb);
+int dsa_port_pre_bridge_flags(const struct dsa_port *dp,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack);
+int dsa_port_bridge_flags(struct dsa_port *dp,
+ struct switchdev_brport_flags flags,
+ struct netlink_ext_ack *extack);
+int dsa_port_vlan_add(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ struct netlink_ext_ack *extack);
+int dsa_port_vlan_del(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan);
+int dsa_port_host_vlan_add(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ struct netlink_ext_ack *extack);
+int dsa_port_host_vlan_del(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan);
+int dsa_port_mrp_add(const struct dsa_port *dp,
+ const struct switchdev_obj_mrp *mrp);
+int dsa_port_mrp_del(const struct dsa_port *dp,
+ const struct switchdev_obj_mrp *mrp);
+int dsa_port_mrp_add_ring_role(const struct dsa_port *dp,
+ const struct switchdev_obj_ring_role_mrp *mrp);
+int dsa_port_mrp_del_ring_role(const struct dsa_port *dp,
+ const struct switchdev_obj_ring_role_mrp *mrp);
+int dsa_port_phylink_create(struct dsa_port *dp);
+void dsa_port_phylink_destroy(struct dsa_port *dp);
+int dsa_shared_port_link_register_of(struct dsa_port *dp);
+void dsa_shared_port_link_unregister_of(struct dsa_port *dp);
+int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr,
+ struct netlink_ext_ack *extack);
+void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr);
+int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast);
+void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast);
+void dsa_port_set_host_flood(struct dsa_port *dp, bool uc, bool mc);
+int dsa_port_change_conduit(struct dsa_port *dp, struct net_device *conduit,
+ struct netlink_ext_ack *extack);
+
+#endif
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
deleted file mode 100644
index 1c45c1d6d241..000000000000
--- a/net/dsa/slave.c
+++ /dev/null
@@ -1,1569 +0,0 @@
-/*
- * net/dsa/slave.c - Slave device handling
- * Copyright (c) 2008-2009 Marvell Semiconductor
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/list.h>
-#include <linux/etherdevice.h>
-#include <linux/netdevice.h>
-#include <linux/phy.h>
-#include <linux/phy_fixed.h>
-#include <linux/phylink.h>
-#include <linux/of_net.h>
-#include <linux/of_mdio.h>
-#include <linux/mdio.h>
-#include <net/rtnetlink.h>
-#include <net/pkt_cls.h>
-#include <net/tc_act/tc_mirred.h>
-#include <linux/if_bridge.h>
-#include <linux/netpoll.h>
-#include <linux/ptp_classify.h>
-
-#include "dsa_priv.h"
-
-static bool dsa_slave_dev_check(struct net_device *dev);
-
-/* slave mii_bus handling ***************************************************/
-static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
-{
- struct dsa_switch *ds = bus->priv;
-
- if (ds->phys_mii_mask & (1 << addr))
- return ds->ops->phy_read(ds, addr, reg);
-
- return 0xffff;
-}
-
-static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val)
-{
- struct dsa_switch *ds = bus->priv;
-
- if (ds->phys_mii_mask & (1 << addr))
- return ds->ops->phy_write(ds, addr, reg, val);
-
- return 0;
-}
-
-void dsa_slave_mii_bus_init(struct dsa_switch *ds)
-{
- ds->slave_mii_bus->priv = (void *)ds;
- ds->slave_mii_bus->name = "dsa slave smi";
- ds->slave_mii_bus->read = dsa_slave_phy_read;
- ds->slave_mii_bus->write = dsa_slave_phy_write;
- snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d",
- ds->dst->index, ds->index);
- ds->slave_mii_bus->parent = ds->dev;
- ds->slave_mii_bus->phy_mask = ~ds->phys_mii_mask;
-}
-
-
-/* slave device handling ****************************************************/
-static int dsa_slave_get_iflink(const struct net_device *dev)
-{
- return dsa_slave_to_master(dev)->ifindex;
-}
-
-static int dsa_slave_open(struct net_device *dev)
-{
- struct net_device *master = dsa_slave_to_master(dev);
- struct dsa_port *dp = dsa_slave_to_port(dev);
- int err;
-
- if (!(master->flags & IFF_UP))
- return -ENETDOWN;
-
- if (!ether_addr_equal(dev->dev_addr, master->dev_addr)) {
- err = dev_uc_add(master, dev->dev_addr);
- if (err < 0)
- goto out;
- }
-
- if (dev->flags & IFF_ALLMULTI) {
- err = dev_set_allmulti(master, 1);
- if (err < 0)
- goto del_unicast;
- }
- if (dev->flags & IFF_PROMISC) {
- err = dev_set_promiscuity(master, 1);
- if (err < 0)
- goto clear_allmulti;
- }
-
- err = dsa_port_enable(dp, dev->phydev);
- if (err)
- goto clear_promisc;
-
- phylink_start(dp->pl);
-
- return 0;
-
-clear_promisc:
- if (dev->flags & IFF_PROMISC)
- dev_set_promiscuity(master, -1);
-clear_allmulti:
- if (dev->flags & IFF_ALLMULTI)
- dev_set_allmulti(master, -1);
-del_unicast:
- if (!ether_addr_equal(dev->dev_addr, master->dev_addr))
- dev_uc_del(master, dev->dev_addr);
-out:
- return err;
-}
-
-static int dsa_slave_close(struct net_device *dev)
-{
- struct net_device *master = dsa_slave_to_master(dev);
- struct dsa_port *dp = dsa_slave_to_port(dev);
-
- phylink_stop(dp->pl);
-
- dsa_port_disable(dp, dev->phydev);
-
- dev_mc_unsync(master, dev);
- dev_uc_unsync(master, dev);
- if (dev->flags & IFF_ALLMULTI)
- dev_set_allmulti(master, -1);
- if (dev->flags & IFF_PROMISC)
- dev_set_promiscuity(master, -1);
-
- if (!ether_addr_equal(dev->dev_addr, master->dev_addr))
- dev_uc_del(master, dev->dev_addr);
-
- return 0;
-}
-
-static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
-{
- struct net_device *master = dsa_slave_to_master(dev);
-
- if (change & IFF_ALLMULTI)
- dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1);
- if (change & IFF_PROMISC)
- dev_set_promiscuity(master, dev->flags & IFF_PROMISC ? 1 : -1);
-}
-
-static void dsa_slave_set_rx_mode(struct net_device *dev)
-{
- struct net_device *master = dsa_slave_to_master(dev);
-
- dev_mc_sync(master, dev);
- dev_uc_sync(master, dev);
-}
-
-static int dsa_slave_set_mac_address(struct net_device *dev, void *a)
-{
- struct net_device *master = dsa_slave_to_master(dev);
- struct sockaddr *addr = a;
- int err;
-
- if (!is_valid_ether_addr(addr->sa_data))
- return -EADDRNOTAVAIL;
-
- if (!(dev->flags & IFF_UP))
- goto out;
-
- if (!ether_addr_equal(addr->sa_data, master->dev_addr)) {
- err = dev_uc_add(master, addr->sa_data);
- if (err < 0)
- return err;
- }
-
- if (!ether_addr_equal(dev->dev_addr, master->dev_addr))
- dev_uc_del(master, dev->dev_addr);
-
-out:
- ether_addr_copy(dev->dev_addr, addr->sa_data);
-
- return 0;
-}
-
-struct dsa_slave_dump_ctx {
- struct net_device *dev;
- struct sk_buff *skb;
- struct netlink_callback *cb;
- int idx;
-};
-
-static int
-dsa_slave_port_fdb_do_dump(const unsigned char *addr, u16 vid,
- bool is_static, void *data)
-{
- struct dsa_slave_dump_ctx *dump = data;
- u32 portid = NETLINK_CB(dump->cb->skb).portid;
- u32 seq = dump->cb->nlh->nlmsg_seq;
- struct nlmsghdr *nlh;
- struct ndmsg *ndm;
-
- if (dump->idx < dump->cb->args[2])
- goto skip;
-
- nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH,
- sizeof(*ndm), NLM_F_MULTI);
- if (!nlh)
- return -EMSGSIZE;
-
- ndm = nlmsg_data(nlh);
- ndm->ndm_family = AF_BRIDGE;
- ndm->ndm_pad1 = 0;
- ndm->ndm_pad2 = 0;
- ndm->ndm_flags = NTF_SELF;
- ndm->ndm_type = 0;
- ndm->ndm_ifindex = dump->dev->ifindex;
- ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE;
-
- if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, addr))
- goto nla_put_failure;
-
- if (vid && nla_put_u16(dump->skb, NDA_VLAN, vid))
- goto nla_put_failure;
-
- nlmsg_end(dump->skb, nlh);
-
-skip:
- dump->idx++;
- return 0;
-
-nla_put_failure:
- nlmsg_cancel(dump->skb, nlh);
- return -EMSGSIZE;
-}
-
-static int
-dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
- struct net_device *dev, struct net_device *filter_dev,
- int *idx)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_slave_dump_ctx dump = {
- .dev = dev,
- .skb = skb,
- .cb = cb,
- .idx = *idx,
- };
- int err;
-
- err = dsa_port_fdb_dump(dp, dsa_slave_port_fdb_do_dump, &dump);
- *idx = dump.idx;
-
- return err;
-}
-
-static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
-{
- struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->dp->ds;
- int port = p->dp->index;
-
- /* Pass through to switch driver if it supports timestamping */
- switch (cmd) {
- case SIOCGHWTSTAMP:
- if (ds->ops->port_hwtstamp_get)
- return ds->ops->port_hwtstamp_get(ds, port, ifr);
- break;
- case SIOCSHWTSTAMP:
- if (ds->ops->port_hwtstamp_set)
- return ds->ops->port_hwtstamp_set(ds, port, ifr);
- break;
- }
-
- return phylink_mii_ioctl(p->dp->pl, ifr, cmd);
-}
-
-static int dsa_slave_port_attr_set(struct net_device *dev,
- const struct switchdev_attr *attr,
- struct switchdev_trans *trans)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- int ret;
-
- switch (attr->id) {
- case SWITCHDEV_ATTR_ID_PORT_STP_STATE:
- ret = dsa_port_set_state(dp, attr->u.stp_state, trans);
- break;
- case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
- ret = dsa_port_vlan_filtering(dp, attr->u.vlan_filtering,
- trans);
- break;
- case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME:
- ret = dsa_port_ageing_time(dp, attr->u.ageing_time, trans);
- break;
- default:
- ret = -EOPNOTSUPP;
- break;
- }
-
- return ret;
-}
-
-static int dsa_slave_port_obj_add(struct net_device *dev,
- const struct switchdev_obj *obj,
- struct switchdev_trans *trans)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- int err;
-
- /* For the prepare phase, ensure the full set of changes is feasable in
- * one go in order to signal a failure properly. If an operation is not
- * supported, return -EOPNOTSUPP.
- */
-
- switch (obj->id) {
- case SWITCHDEV_OBJ_ID_PORT_MDB:
- err = dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj), trans);
- break;
- case SWITCHDEV_OBJ_ID_HOST_MDB:
- /* DSA can directly translate this to a normal MDB add,
- * but on the CPU port.
- */
- err = dsa_port_mdb_add(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj),
- trans);
- break;
- case SWITCHDEV_OBJ_ID_PORT_VLAN:
- err = dsa_port_vlan_add(dp, SWITCHDEV_OBJ_PORT_VLAN(obj),
- trans);
- break;
- default:
- err = -EOPNOTSUPP;
- break;
- }
-
- return err;
-}
-
-static int dsa_slave_port_obj_del(struct net_device *dev,
- const struct switchdev_obj *obj)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- int err;
-
- switch (obj->id) {
- case SWITCHDEV_OBJ_ID_PORT_MDB:
- err = dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
- break;
- case SWITCHDEV_OBJ_ID_HOST_MDB:
- /* DSA can directly translate this to a normal MDB add,
- * but on the CPU port.
- */
- err = dsa_port_mdb_del(dp->cpu_dp, SWITCHDEV_OBJ_PORT_MDB(obj));
- break;
- case SWITCHDEV_OBJ_ID_PORT_VLAN:
- err = dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj));
- break;
- default:
- err = -EOPNOTSUPP;
- break;
- }
-
- return err;
-}
-
-static int dsa_slave_port_attr_get(struct net_device *dev,
- struct switchdev_attr *attr)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
- struct dsa_switch_tree *dst = ds->dst;
-
- switch (attr->id) {
- case SWITCHDEV_ATTR_ID_PORT_PARENT_ID:
- attr->u.ppid.id_len = sizeof(dst->index);
- memcpy(&attr->u.ppid.id, &dst->index, attr->u.ppid.id_len);
- break;
- case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS_SUPPORT:
- attr->u.brport_flags_support = 0;
- break;
- default:
- return -EOPNOTSUPP;
- }
-
- return 0;
-}
-
-static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev,
- struct sk_buff *skb)
-{
-#ifdef CONFIG_NET_POLL_CONTROLLER
- struct dsa_slave_priv *p = netdev_priv(dev);
-
- if (p->netpoll)
- netpoll_send_skb(p->netpoll, skb);
-#else
- BUG();
-#endif
- return NETDEV_TX_OK;
-}
-
-static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p,
- struct sk_buff *skb)
-{
- struct dsa_switch *ds = p->dp->ds;
- struct sk_buff *clone;
- unsigned int type;
-
- type = ptp_classify_raw(skb);
- if (type == PTP_CLASS_NONE)
- return;
-
- if (!ds->ops->port_txtstamp)
- return;
-
- clone = skb_clone_sk(skb);
- if (!clone)
- return;
-
- if (ds->ops->port_txtstamp(ds, p->dp->index, clone, type))
- return;
-
- kfree_skb(clone);
-}
-
-static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev)
-{
- struct dsa_slave_priv *p = netdev_priv(dev);
- struct pcpu_sw_netstats *s;
- struct sk_buff *nskb;
-
- s = this_cpu_ptr(p->stats64);
- u64_stats_update_begin(&s->syncp);
- s->tx_packets++;
- s->tx_bytes += skb->len;
- u64_stats_update_end(&s->syncp);
-
- /* Identify PTP protocol packets, clone them, and pass them to the
- * switch driver
- */
- dsa_skb_tx_timestamp(p, skb);
-
- /* Transmit function may have to reallocate the original SKB,
- * in which case it must have freed it. Only free it here on error.
- */
- nskb = p->xmit(skb, dev);
- if (!nskb) {
- kfree_skb(skb);
- return NETDEV_TX_OK;
- }
-
- /* SKB for netpoll still need to be mangled with the protocol-specific
- * tag to be successfully transmitted
- */
- if (unlikely(netpoll_tx_running(dev)))
- return dsa_slave_netpoll_send_skb(dev, nskb);
-
- /* Queue the SKB for transmission on the parent interface, but
- * do not modify its EtherType
- */
- nskb->dev = dsa_slave_to_master(dev);
- dev_queue_xmit(nskb);
-
- return NETDEV_TX_OK;
-}
-
-/* ethtool operations *******************************************************/
-
-static void dsa_slave_get_drvinfo(struct net_device *dev,
- struct ethtool_drvinfo *drvinfo)
-{
- strlcpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver));
- strlcpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version));
- strlcpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info));
-}
-
-static int dsa_slave_get_regs_len(struct net_device *dev)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (ds->ops->get_regs_len)
- return ds->ops->get_regs_len(ds, dp->index);
-
- return -EOPNOTSUPP;
-}
-
-static void
-dsa_slave_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (ds->ops->get_regs)
- ds->ops->get_regs(ds, dp->index, regs, _p);
-}
-
-static int dsa_slave_nway_reset(struct net_device *dev)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
-
- return phylink_ethtool_nway_reset(dp->pl);
-}
-
-static int dsa_slave_get_eeprom_len(struct net_device *dev)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (ds->cd && ds->cd->eeprom_len)
- return ds->cd->eeprom_len;
-
- if (ds->ops->get_eeprom_len)
- return ds->ops->get_eeprom_len(ds);
-
- return 0;
-}
-
-static int dsa_slave_get_eeprom(struct net_device *dev,
- struct ethtool_eeprom *eeprom, u8 *data)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (ds->ops->get_eeprom)
- return ds->ops->get_eeprom(ds, eeprom, data);
-
- return -EOPNOTSUPP;
-}
-
-static int dsa_slave_set_eeprom(struct net_device *dev,
- struct ethtool_eeprom *eeprom, u8 *data)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (ds->ops->set_eeprom)
- return ds->ops->set_eeprom(ds, eeprom, data);
-
- return -EOPNOTSUPP;
-}
-
-static void dsa_slave_get_strings(struct net_device *dev,
- uint32_t stringset, uint8_t *data)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (stringset == ETH_SS_STATS) {
- int len = ETH_GSTRING_LEN;
-
- strncpy(data, "tx_packets", len);
- strncpy(data + len, "tx_bytes", len);
- strncpy(data + 2 * len, "rx_packets", len);
- strncpy(data + 3 * len, "rx_bytes", len);
- if (ds->ops->get_strings)
- ds->ops->get_strings(ds, dp->index, stringset,
- data + 4 * len);
- }
-}
-
-static void dsa_slave_get_ethtool_stats(struct net_device *dev,
- struct ethtool_stats *stats,
- uint64_t *data)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = dp->ds;
- struct pcpu_sw_netstats *s;
- unsigned int start;
- int i;
-
- for_each_possible_cpu(i) {
- u64 tx_packets, tx_bytes, rx_packets, rx_bytes;
-
- s = per_cpu_ptr(p->stats64, i);
- do {
- start = u64_stats_fetch_begin_irq(&s->syncp);
- tx_packets = s->tx_packets;
- tx_bytes = s->tx_bytes;
- rx_packets = s->rx_packets;
- rx_bytes = s->rx_bytes;
- } while (u64_stats_fetch_retry_irq(&s->syncp, start));
- data[0] += tx_packets;
- data[1] += tx_bytes;
- data[2] += rx_packets;
- data[3] += rx_bytes;
- }
- if (ds->ops->get_ethtool_stats)
- ds->ops->get_ethtool_stats(ds, dp->index, data + 4);
-}
-
-static int dsa_slave_get_sset_count(struct net_device *dev, int sset)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (sset == ETH_SS_STATS) {
- int count;
-
- count = 4;
- if (ds->ops->get_sset_count)
- count += ds->ops->get_sset_count(ds, dp->index, sset);
-
- return count;
- }
-
- return -EOPNOTSUPP;
-}
-
-static void dsa_slave_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- phylink_ethtool_get_wol(dp->pl, w);
-
- if (ds->ops->get_wol)
- ds->ops->get_wol(ds, dp->index, w);
-}
-
-static int dsa_slave_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
- int ret = -EOPNOTSUPP;
-
- phylink_ethtool_set_wol(dp->pl, w);
-
- if (ds->ops->set_wol)
- ret = ds->ops->set_wol(ds, dp->index, w);
-
- return ret;
-}
-
-static int dsa_slave_set_eee(struct net_device *dev, struct ethtool_eee *e)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
- int ret;
-
- /* Port's PHY and MAC both need to be EEE capable */
- if (!dev->phydev && !dp->pl)
- return -ENODEV;
-
- if (!ds->ops->set_mac_eee)
- return -EOPNOTSUPP;
-
- ret = ds->ops->set_mac_eee(ds, dp->index, e);
- if (ret)
- return ret;
-
- return phylink_ethtool_set_eee(dp->pl, e);
-}
-
-static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
- int ret;
-
- /* Port's PHY and MAC both need to be EEE capable */
- if (!dev->phydev && !dp->pl)
- return -ENODEV;
-
- if (!ds->ops->get_mac_eee)
- return -EOPNOTSUPP;
-
- ret = ds->ops->get_mac_eee(ds, dp->index, e);
- if (ret)
- return ret;
-
- return phylink_ethtool_get_eee(dp->pl, e);
-}
-
-static int dsa_slave_get_link_ksettings(struct net_device *dev,
- struct ethtool_link_ksettings *cmd)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
-
- return phylink_ethtool_ksettings_get(dp->pl, cmd);
-}
-
-static int dsa_slave_set_link_ksettings(struct net_device *dev,
- const struct ethtool_link_ksettings *cmd)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
-
- return phylink_ethtool_ksettings_set(dp->pl, cmd);
-}
-
-#ifdef CONFIG_NET_POLL_CONTROLLER
-static int dsa_slave_netpoll_setup(struct net_device *dev,
- struct netpoll_info *ni)
-{
- struct net_device *master = dsa_slave_to_master(dev);
- struct dsa_slave_priv *p = netdev_priv(dev);
- struct netpoll *netpoll;
- int err = 0;
-
- netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL);
- if (!netpoll)
- return -ENOMEM;
-
- err = __netpoll_setup(netpoll, master);
- if (err) {
- kfree(netpoll);
- goto out;
- }
-
- p->netpoll = netpoll;
-out:
- return err;
-}
-
-static void dsa_slave_netpoll_cleanup(struct net_device *dev)
-{
- struct dsa_slave_priv *p = netdev_priv(dev);
- struct netpoll *netpoll = p->netpoll;
-
- if (!netpoll)
- return;
-
- p->netpoll = NULL;
-
- __netpoll_free_async(netpoll);
-}
-
-static void dsa_slave_poll_controller(struct net_device *dev)
-{
-}
-#endif
-
-static int dsa_slave_get_phys_port_name(struct net_device *dev,
- char *name, size_t len)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
-
- if (snprintf(name, len, "p%d", dp->index) >= len)
- return -EINVAL;
-
- return 0;
-}
-
-static struct dsa_mall_tc_entry *
-dsa_slave_mall_tc_entry_find(struct net_device *dev, unsigned long cookie)
-{
- struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_mall_tc_entry *mall_tc_entry;
-
- list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list)
- if (mall_tc_entry->cookie == cookie)
- return mall_tc_entry;
-
- return NULL;
-}
-
-static int dsa_slave_add_cls_matchall(struct net_device *dev,
- struct tc_cls_matchall_offload *cls,
- bool ingress)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_mall_tc_entry *mall_tc_entry;
- __be16 protocol = cls->common.protocol;
- struct dsa_switch *ds = dp->ds;
- struct net_device *to_dev;
- const struct tc_action *a;
- struct dsa_port *to_dp;
- int err = -EOPNOTSUPP;
-
- if (!ds->ops->port_mirror_add)
- return err;
-
- if (!tcf_exts_has_one_action(cls->exts))
- return err;
-
- a = tcf_exts_first_action(cls->exts);
-
- if (is_tcf_mirred_egress_mirror(a) && protocol == htons(ETH_P_ALL)) {
- struct dsa_mall_mirror_tc_entry *mirror;
-
- to_dev = tcf_mirred_dev(a);
- if (!to_dev)
- return -EINVAL;
-
- if (!dsa_slave_dev_check(to_dev))
- return -EOPNOTSUPP;
-
- mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
- if (!mall_tc_entry)
- return -ENOMEM;
-
- mall_tc_entry->cookie = cls->cookie;
- mall_tc_entry->type = DSA_PORT_MALL_MIRROR;
- mirror = &mall_tc_entry->mirror;
-
- to_dp = dsa_slave_to_port(to_dev);
-
- mirror->to_local_port = to_dp->index;
- mirror->ingress = ingress;
-
- err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress);
- if (err) {
- kfree(mall_tc_entry);
- return err;
- }
-
- list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
- }
-
- return 0;
-}
-
-static void dsa_slave_del_cls_matchall(struct net_device *dev,
- struct tc_cls_matchall_offload *cls)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_mall_tc_entry *mall_tc_entry;
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->port_mirror_del)
- return;
-
- mall_tc_entry = dsa_slave_mall_tc_entry_find(dev, cls->cookie);
- if (!mall_tc_entry)
- return;
-
- list_del(&mall_tc_entry->list);
-
- switch (mall_tc_entry->type) {
- case DSA_PORT_MALL_MIRROR:
- ds->ops->port_mirror_del(ds, dp->index, &mall_tc_entry->mirror);
- break;
- default:
- WARN_ON(1);
- }
-
- kfree(mall_tc_entry);
-}
-
-static int dsa_slave_setup_tc_cls_matchall(struct net_device *dev,
- struct tc_cls_matchall_offload *cls,
- bool ingress)
-{
- if (cls->common.chain_index)
- return -EOPNOTSUPP;
-
- switch (cls->command) {
- case TC_CLSMATCHALL_REPLACE:
- return dsa_slave_add_cls_matchall(dev, cls, ingress);
- case TC_CLSMATCHALL_DESTROY:
- dsa_slave_del_cls_matchall(dev, cls);
- return 0;
- default:
- return -EOPNOTSUPP;
- }
-}
-
-static int dsa_slave_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
- void *cb_priv, bool ingress)
-{
- struct net_device *dev = cb_priv;
-
- if (!tc_can_offload(dev))
- return -EOPNOTSUPP;
-
- switch (type) {
- case TC_SETUP_CLSMATCHALL:
- return dsa_slave_setup_tc_cls_matchall(dev, type_data, ingress);
- default:
- return -EOPNOTSUPP;
- }
-}
-
-static int dsa_slave_setup_tc_block_cb_ig(enum tc_setup_type type,
- void *type_data, void *cb_priv)
-{
- return dsa_slave_setup_tc_block_cb(type, type_data, cb_priv, true);
-}
-
-static int dsa_slave_setup_tc_block_cb_eg(enum tc_setup_type type,
- void *type_data, void *cb_priv)
-{
- return dsa_slave_setup_tc_block_cb(type, type_data, cb_priv, false);
-}
-
-static int dsa_slave_setup_tc_block(struct net_device *dev,
- struct tc_block_offload *f)
-{
- tc_setup_cb_t *cb;
-
- if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
- cb = dsa_slave_setup_tc_block_cb_ig;
- else if (f->binder_type == TCF_BLOCK_BINDER_TYPE_CLSACT_EGRESS)
- cb = dsa_slave_setup_tc_block_cb_eg;
- else
- return -EOPNOTSUPP;
-
- switch (f->command) {
- case TC_BLOCK_BIND:
- return tcf_block_cb_register(f->block, cb, dev, dev, f->extack);
- case TC_BLOCK_UNBIND:
- tcf_block_cb_unregister(f->block, cb, dev);
- return 0;
- default:
- return -EOPNOTSUPP;
- }
-}
-
-static int dsa_slave_setup_tc(struct net_device *dev, enum tc_setup_type type,
- void *type_data)
-{
- switch (type) {
- case TC_SETUP_BLOCK:
- return dsa_slave_setup_tc_block(dev, type_data);
- default:
- return -EOPNOTSUPP;
- }
-}
-
-static void dsa_slave_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *stats)
-{
- struct dsa_slave_priv *p = netdev_priv(dev);
- struct pcpu_sw_netstats *s;
- unsigned int start;
- int i;
-
- netdev_stats_to_stats64(stats, &dev->stats);
- for_each_possible_cpu(i) {
- u64 tx_packets, tx_bytes, rx_packets, rx_bytes;
-
- s = per_cpu_ptr(p->stats64, i);
- do {
- start = u64_stats_fetch_begin_irq(&s->syncp);
- tx_packets = s->tx_packets;
- tx_bytes = s->tx_bytes;
- rx_packets = s->rx_packets;
- rx_bytes = s->rx_bytes;
- } while (u64_stats_fetch_retry_irq(&s->syncp, start));
-
- stats->tx_packets += tx_packets;
- stats->tx_bytes += tx_bytes;
- stats->rx_packets += rx_packets;
- stats->rx_bytes += rx_bytes;
- }
-}
-
-static int dsa_slave_get_rxnfc(struct net_device *dev,
- struct ethtool_rxnfc *nfc, u32 *rule_locs)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->get_rxnfc)
- return -EOPNOTSUPP;
-
- return ds->ops->get_rxnfc(ds, dp->index, nfc, rule_locs);
-}
-
-static int dsa_slave_set_rxnfc(struct net_device *dev,
- struct ethtool_rxnfc *nfc)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->set_rxnfc)
- return -EOPNOTSUPP;
-
- return ds->ops->set_rxnfc(ds, dp->index, nfc);
-}
-
-static int dsa_slave_get_ts_info(struct net_device *dev,
- struct ethtool_ts_info *ts)
-{
- struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->dp->ds;
-
- if (!ds->ops->get_ts_info)
- return -EOPNOTSUPP;
-
- return ds->ops->get_ts_info(ds, p->dp->index, ts);
-}
-
-static const struct ethtool_ops dsa_slave_ethtool_ops = {
- .get_drvinfo = dsa_slave_get_drvinfo,
- .get_regs_len = dsa_slave_get_regs_len,
- .get_regs = dsa_slave_get_regs,
- .nway_reset = dsa_slave_nway_reset,
- .get_link = ethtool_op_get_link,
- .get_eeprom_len = dsa_slave_get_eeprom_len,
- .get_eeprom = dsa_slave_get_eeprom,
- .set_eeprom = dsa_slave_set_eeprom,
- .get_strings = dsa_slave_get_strings,
- .get_ethtool_stats = dsa_slave_get_ethtool_stats,
- .get_sset_count = dsa_slave_get_sset_count,
- .set_wol = dsa_slave_set_wol,
- .get_wol = dsa_slave_get_wol,
- .set_eee = dsa_slave_set_eee,
- .get_eee = dsa_slave_get_eee,
- .get_link_ksettings = dsa_slave_get_link_ksettings,
- .set_link_ksettings = dsa_slave_set_link_ksettings,
- .get_rxnfc = dsa_slave_get_rxnfc,
- .set_rxnfc = dsa_slave_set_rxnfc,
- .get_ts_info = dsa_slave_get_ts_info,
-};
-
-/* legacy way, bypassing the bridge *****************************************/
-int dsa_legacy_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
- struct net_device *dev,
- const unsigned char *addr, u16 vid,
- u16 flags)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
-
- return dsa_port_fdb_add(dp, addr, vid);
-}
-
-int dsa_legacy_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
- struct net_device *dev,
- const unsigned char *addr, u16 vid)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
-
- return dsa_port_fdb_del(dp, addr, vid);
-}
-
-static const struct net_device_ops dsa_slave_netdev_ops = {
- .ndo_open = dsa_slave_open,
- .ndo_stop = dsa_slave_close,
- .ndo_start_xmit = dsa_slave_xmit,
- .ndo_change_rx_flags = dsa_slave_change_rx_flags,
- .ndo_set_rx_mode = dsa_slave_set_rx_mode,
- .ndo_set_mac_address = dsa_slave_set_mac_address,
- .ndo_fdb_add = dsa_legacy_fdb_add,
- .ndo_fdb_del = dsa_legacy_fdb_del,
- .ndo_fdb_dump = dsa_slave_fdb_dump,
- .ndo_do_ioctl = dsa_slave_ioctl,
- .ndo_get_iflink = dsa_slave_get_iflink,
-#ifdef CONFIG_NET_POLL_CONTROLLER
- .ndo_netpoll_setup = dsa_slave_netpoll_setup,
- .ndo_netpoll_cleanup = dsa_slave_netpoll_cleanup,
- .ndo_poll_controller = dsa_slave_poll_controller,
-#endif
- .ndo_get_phys_port_name = dsa_slave_get_phys_port_name,
- .ndo_setup_tc = dsa_slave_setup_tc,
- .ndo_get_stats64 = dsa_slave_get_stats64,
-};
-
-static const struct switchdev_ops dsa_slave_switchdev_ops = {
- .switchdev_port_attr_get = dsa_slave_port_attr_get,
- .switchdev_port_attr_set = dsa_slave_port_attr_set,
- .switchdev_port_obj_add = dsa_slave_port_obj_add,
- .switchdev_port_obj_del = dsa_slave_port_obj_del,
-};
-
-static struct device_type dsa_type = {
- .name = "dsa",
-};
-
-static void dsa_slave_phylink_validate(struct net_device *dev,
- unsigned long *supported,
- struct phylink_link_state *state)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->phylink_validate)
- return;
-
- ds->ops->phylink_validate(ds, dp->index, supported, state);
-}
-
-static int dsa_slave_phylink_mac_link_state(struct net_device *dev,
- struct phylink_link_state *state)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- /* Only called for SGMII and 802.3z */
- if (!ds->ops->phylink_mac_link_state)
- return -EOPNOTSUPP;
-
- return ds->ops->phylink_mac_link_state(ds, dp->index, state);
-}
-
-static void dsa_slave_phylink_mac_config(struct net_device *dev,
- unsigned int mode,
- const struct phylink_link_state *state)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->phylink_mac_config)
- return;
-
- ds->ops->phylink_mac_config(ds, dp->index, mode, state);
-}
-
-static void dsa_slave_phylink_mac_an_restart(struct net_device *dev)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->phylink_mac_an_restart)
- return;
-
- ds->ops->phylink_mac_an_restart(ds, dp->index);
-}
-
-static void dsa_slave_phylink_mac_link_down(struct net_device *dev,
- unsigned int mode,
- phy_interface_t interface)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->phylink_mac_link_down) {
- if (ds->ops->adjust_link && dev->phydev)
- ds->ops->adjust_link(ds, dp->index, dev->phydev);
- return;
- }
-
- ds->ops->phylink_mac_link_down(ds, dp->index, mode, interface);
-}
-
-static void dsa_slave_phylink_mac_link_up(struct net_device *dev,
- unsigned int mode,
- phy_interface_t interface,
- struct phy_device *phydev)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- if (!ds->ops->phylink_mac_link_up) {
- if (ds->ops->adjust_link && dev->phydev)
- ds->ops->adjust_link(ds, dp->index, dev->phydev);
- return;
- }
-
- ds->ops->phylink_mac_link_up(ds, dp->index, mode, interface, phydev);
-}
-
-static const struct phylink_mac_ops dsa_slave_phylink_mac_ops = {
- .validate = dsa_slave_phylink_validate,
- .mac_link_state = dsa_slave_phylink_mac_link_state,
- .mac_config = dsa_slave_phylink_mac_config,
- .mac_an_restart = dsa_slave_phylink_mac_an_restart,
- .mac_link_down = dsa_slave_phylink_mac_link_down,
- .mac_link_up = dsa_slave_phylink_mac_link_up,
-};
-
-void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
-{
- const struct dsa_port *dp = dsa_to_port(ds, port);
-
- phylink_mac_change(dp->pl, up);
-}
-EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change);
-
-static void dsa_slave_phylink_fixed_state(struct net_device *dev,
- struct phylink_link_state *state)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_switch *ds = dp->ds;
-
- /* No need to check that this operation is valid, the callback would
- * not be called if it was not.
- */
- ds->ops->phylink_fixed_state(ds, dp->index, state);
-}
-
-/* slave device setup *******************************************************/
-static int dsa_slave_phy_connect(struct net_device *slave_dev, int addr)
-{
- struct dsa_port *dp = dsa_slave_to_port(slave_dev);
- struct dsa_switch *ds = dp->ds;
-
- slave_dev->phydev = mdiobus_get_phy(ds->slave_mii_bus, addr);
- if (!slave_dev->phydev) {
- netdev_err(slave_dev, "no phy at %d\n", addr);
- return -ENODEV;
- }
-
- return phylink_connect_phy(dp->pl, slave_dev->phydev);
-}
-
-static int dsa_slave_phy_setup(struct net_device *slave_dev)
-{
- struct dsa_port *dp = dsa_slave_to_port(slave_dev);
- struct device_node *port_dn = dp->dn;
- struct dsa_switch *ds = dp->ds;
- u32 phy_flags = 0;
- int mode, ret;
-
- mode = of_get_phy_mode(port_dn);
- if (mode < 0)
- mode = PHY_INTERFACE_MODE_NA;
-
- dp->pl = phylink_create(slave_dev, of_fwnode_handle(port_dn), mode,
- &dsa_slave_phylink_mac_ops);
- if (IS_ERR(dp->pl)) {
- netdev_err(slave_dev,
- "error creating PHYLINK: %ld\n", PTR_ERR(dp->pl));
- return PTR_ERR(dp->pl);
- }
-
- /* Register only if the switch provides such a callback, since this
- * callback takes precedence over polling the link GPIO in PHYLINK
- * (see phylink_get_fixed_state).
- */
- if (ds->ops->phylink_fixed_state)
- phylink_fixed_state_cb(dp->pl, dsa_slave_phylink_fixed_state);
-
- if (ds->ops->get_phy_flags)
- phy_flags = ds->ops->get_phy_flags(ds, dp->index);
-
- ret = phylink_of_phy_connect(dp->pl, port_dn, phy_flags);
- if (ret == -ENODEV) {
- /* We could not connect to a designated PHY or SFP, so use the
- * switch internal MDIO bus instead
- */
- ret = dsa_slave_phy_connect(slave_dev, dp->index);
- if (ret) {
- netdev_err(slave_dev,
- "failed to connect to port %d: %d\n",
- dp->index, ret);
- phylink_destroy(dp->pl);
- return ret;
- }
- }
-
- return 0;
-}
-
-static struct lock_class_key dsa_slave_netdev_xmit_lock_key;
-static void dsa_slave_set_lockdep_class_one(struct net_device *dev,
- struct netdev_queue *txq,
- void *_unused)
-{
- lockdep_set_class(&txq->_xmit_lock,
- &dsa_slave_netdev_xmit_lock_key);
-}
-
-int dsa_slave_suspend(struct net_device *slave_dev)
-{
- struct dsa_port *dp = dsa_slave_to_port(slave_dev);
-
- if (!netif_running(slave_dev))
- return 0;
-
- netif_device_detach(slave_dev);
-
- rtnl_lock();
- phylink_stop(dp->pl);
- rtnl_unlock();
-
- return 0;
-}
-
-int dsa_slave_resume(struct net_device *slave_dev)
-{
- struct dsa_port *dp = dsa_slave_to_port(slave_dev);
-
- if (!netif_running(slave_dev))
- return 0;
-
- netif_device_attach(slave_dev);
-
- rtnl_lock();
- phylink_start(dp->pl);
- rtnl_unlock();
-
- return 0;
-}
-
-static void dsa_slave_notify(struct net_device *dev, unsigned long val)
-{
- struct net_device *master = dsa_slave_to_master(dev);
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct dsa_notifier_register_info rinfo = {
- .switch_number = dp->ds->index,
- .port_number = dp->index,
- .master = master,
- .info.dev = dev,
- };
-
- call_dsa_notifiers(val, dev, &rinfo.info);
-}
-
-int dsa_slave_create(struct dsa_port *port)
-{
- const struct dsa_port *cpu_dp = port->cpu_dp;
- struct net_device *master = cpu_dp->master;
- struct dsa_switch *ds = port->ds;
- const char *name = port->name;
- struct net_device *slave_dev;
- struct dsa_slave_priv *p;
- int ret;
-
- if (!ds->num_tx_queues)
- ds->num_tx_queues = 1;
-
- slave_dev = alloc_netdev_mqs(sizeof(struct dsa_slave_priv), name,
- NET_NAME_UNKNOWN, ether_setup,
- ds->num_tx_queues, 1);
- if (slave_dev == NULL)
- return -ENOMEM;
-
- slave_dev->features = master->vlan_features | NETIF_F_HW_TC;
- slave_dev->hw_features |= NETIF_F_HW_TC;
- slave_dev->ethtool_ops = &dsa_slave_ethtool_ops;
- eth_hw_addr_inherit(slave_dev, master);
- slave_dev->priv_flags |= IFF_NO_QUEUE;
- slave_dev->netdev_ops = &dsa_slave_netdev_ops;
- slave_dev->switchdev_ops = &dsa_slave_switchdev_ops;
- slave_dev->min_mtu = 0;
- slave_dev->max_mtu = ETH_MAX_MTU;
- SET_NETDEV_DEVTYPE(slave_dev, &dsa_type);
-
- netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one,
- NULL);
-
- SET_NETDEV_DEV(slave_dev, port->ds->dev);
- slave_dev->dev.of_node = port->dn;
- slave_dev->vlan_features = master->vlan_features;
-
- p = netdev_priv(slave_dev);
- p->stats64 = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
- if (!p->stats64) {
- free_netdev(slave_dev);
- return -ENOMEM;
- }
- p->dp = port;
- INIT_LIST_HEAD(&p->mall_tc_list);
- p->xmit = cpu_dp->tag_ops->xmit;
- port->slave = slave_dev;
-
- netif_carrier_off(slave_dev);
-
- ret = dsa_slave_phy_setup(slave_dev);
- if (ret) {
- netdev_err(master, "error %d setting up slave phy\n", ret);
- goto out_free;
- }
-
- dsa_slave_notify(slave_dev, DSA_PORT_REGISTER);
-
- ret = register_netdev(slave_dev);
- if (ret) {
- netdev_err(master, "error %d registering interface %s\n",
- ret, slave_dev->name);
- goto out_phy;
- }
-
- return 0;
-
-out_phy:
- rtnl_lock();
- phylink_disconnect_phy(p->dp->pl);
- rtnl_unlock();
- phylink_destroy(p->dp->pl);
-out_free:
- free_percpu(p->stats64);
- free_netdev(slave_dev);
- port->slave = NULL;
- return ret;
-}
-
-void dsa_slave_destroy(struct net_device *slave_dev)
-{
- struct dsa_port *dp = dsa_slave_to_port(slave_dev);
- struct dsa_slave_priv *p = netdev_priv(slave_dev);
-
- netif_carrier_off(slave_dev);
- rtnl_lock();
- phylink_disconnect_phy(dp->pl);
- rtnl_unlock();
-
- dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER);
- unregister_netdev(slave_dev);
- phylink_destroy(dp->pl);
- free_percpu(p->stats64);
- free_netdev(slave_dev);
-}
-
-static bool dsa_slave_dev_check(struct net_device *dev)
-{
- return dev->netdev_ops == &dsa_slave_netdev_ops;
-}
-
-static int dsa_slave_changeupper(struct net_device *dev,
- struct netdev_notifier_changeupper_info *info)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- int err = NOTIFY_DONE;
-
- if (netif_is_bridge_master(info->upper_dev)) {
- if (info->linking) {
- err = dsa_port_bridge_join(dp, info->upper_dev);
- err = notifier_from_errno(err);
- } else {
- dsa_port_bridge_leave(dp, info->upper_dev);
- err = NOTIFY_OK;
- }
- }
-
- return err;
-}
-
-static int dsa_slave_netdevice_event(struct notifier_block *nb,
- unsigned long event, void *ptr)
-{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-
- if (!dsa_slave_dev_check(dev))
- return NOTIFY_DONE;
-
- if (event == NETDEV_CHANGEUPPER)
- return dsa_slave_changeupper(dev, ptr);
-
- return NOTIFY_DONE;
-}
-
-struct dsa_switchdev_event_work {
- struct work_struct work;
- struct switchdev_notifier_fdb_info fdb_info;
- struct net_device *dev;
- unsigned long event;
-};
-
-static void dsa_slave_switchdev_event_work(struct work_struct *work)
-{
- struct dsa_switchdev_event_work *switchdev_work =
- container_of(work, struct dsa_switchdev_event_work, work);
- struct net_device *dev = switchdev_work->dev;
- struct switchdev_notifier_fdb_info *fdb_info;
- struct dsa_port *dp = dsa_slave_to_port(dev);
- int err;
-
- rtnl_lock();
- switch (switchdev_work->event) {
- case SWITCHDEV_FDB_ADD_TO_DEVICE:
- fdb_info = &switchdev_work->fdb_info;
- if (!fdb_info->added_by_user)
- break;
-
- err = dsa_port_fdb_add(dp, fdb_info->addr, fdb_info->vid);
- if (err) {
- netdev_dbg(dev, "fdb add failed err=%d\n", err);
- break;
- }
- call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED, dev,
- &fdb_info->info);
- break;
-
- case SWITCHDEV_FDB_DEL_TO_DEVICE:
- fdb_info = &switchdev_work->fdb_info;
- if (!fdb_info->added_by_user)
- break;
-
- err = dsa_port_fdb_del(dp, fdb_info->addr, fdb_info->vid);
- if (err) {
- netdev_dbg(dev, "fdb del failed err=%d\n", err);
- dev_close(dev);
- }
- break;
- }
- rtnl_unlock();
-
- kfree(switchdev_work->fdb_info.addr);
- kfree(switchdev_work);
- dev_put(dev);
-}
-
-static int
-dsa_slave_switchdev_fdb_work_init(struct dsa_switchdev_event_work *
- switchdev_work,
- const struct switchdev_notifier_fdb_info *
- fdb_info)
-{
- memcpy(&switchdev_work->fdb_info, fdb_info,
- sizeof(switchdev_work->fdb_info));
- switchdev_work->fdb_info.addr = kzalloc(ETH_ALEN, GFP_ATOMIC);
- if (!switchdev_work->fdb_info.addr)
- return -ENOMEM;
- ether_addr_copy((u8 *)switchdev_work->fdb_info.addr,
- fdb_info->addr);
- return 0;
-}
-
-/* Called under rcu_read_lock() */
-static int dsa_slave_switchdev_event(struct notifier_block *unused,
- unsigned long event, void *ptr)
-{
- struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
- struct dsa_switchdev_event_work *switchdev_work;
-
- if (!dsa_slave_dev_check(dev))
- return NOTIFY_DONE;
-
- switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
- if (!switchdev_work)
- return NOTIFY_BAD;
-
- INIT_WORK(&switchdev_work->work,
- dsa_slave_switchdev_event_work);
- switchdev_work->dev = dev;
- switchdev_work->event = event;
-
- switch (event) {
- case SWITCHDEV_FDB_ADD_TO_DEVICE: /* fall through */
- case SWITCHDEV_FDB_DEL_TO_DEVICE:
- if (dsa_slave_switchdev_fdb_work_init(switchdev_work, ptr))
- goto err_fdb_work_init;
- dev_hold(dev);
- break;
- default:
- kfree(switchdev_work);
- return NOTIFY_DONE;
- }
-
- dsa_schedule_work(&switchdev_work->work);
- return NOTIFY_OK;
-
-err_fdb_work_init:
- kfree(switchdev_work);
- return NOTIFY_BAD;
-}
-
-static struct notifier_block dsa_slave_nb __read_mostly = {
- .notifier_call = dsa_slave_netdevice_event,
-};
-
-static struct notifier_block dsa_slave_switchdev_notifier = {
- .notifier_call = dsa_slave_switchdev_event,
-};
-
-int dsa_slave_register_notifier(void)
-{
- int err;
-
- err = register_netdevice_notifier(&dsa_slave_nb);
- if (err)
- return err;
-
- err = register_switchdev_notifier(&dsa_slave_switchdev_notifier);
- if (err)
- goto err_switchdev_nb;
-
- return 0;
-
-err_switchdev_nb:
- unregister_netdevice_notifier(&dsa_slave_nb);
- return err;
-}
-
-void dsa_slave_unregister_notifier(void)
-{
- int err;
-
- err = unregister_switchdev_notifier(&dsa_slave_switchdev_notifier);
- if (err)
- pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err);
-
- err = unregister_netdevice_notifier(&dsa_slave_nb);
- if (err)
- pr_err("DSA: failed to unregister slave notifier (%d)\n", err);
-}
diff --git a/net/dsa/stubs.c b/net/dsa/stubs.c
new file mode 100644
index 000000000000..2ed8a6c85fbf
--- /dev/null
+++ b/net/dsa/stubs.c
@@ -0,0 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Stubs for DSA functionality called by the core network stack.
+ * These are necessary because CONFIG_NET_DSA can be a module, and built-in
+ * code cannot directly call symbols exported by modules.
+ */
+#include <net/dsa_stubs.h>
+
+const struct dsa_stubs *dsa_stubs;
+EXPORT_SYMBOL_GPL(dsa_stubs);
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index 142b294d3446..3d2feeea897b 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -1,32 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Handling of a single switch chip, part of a switch fabric
*
* Copyright (c) 2017 Savoir-faire Linux Inc.
* Vivien Didelot <vivien.didelot@savoirfairelinux.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
+#include <linux/if_bridge.h>
#include <linux/netdevice.h>
#include <linux/notifier.h>
+#include <linux/if_vlan.h>
#include <net/switchdev.h>
-#include "dsa_priv.h"
+#include "dsa.h"
+#include "netlink.h"
+#include "port.h"
+#include "switch.h"
+#include "tag_8021q.h"
+#include "trace.h"
+#include "user.h"
static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds,
unsigned int ageing_time)
{
- int i;
-
- for (i = 0; i < ds->num_ports; ++i) {
- struct dsa_port *dp = &ds->ports[i];
+ struct dsa_port *dp;
+ dsa_switch_for_each_port(dp, ds)
if (dp->ageing_time && dp->ageing_time < ageing_time)
ageing_time = dp->ageing_time;
- }
return ageing_time;
}
@@ -35,15 +36,12 @@ static int dsa_switch_ageing_time(struct dsa_switch *ds,
struct dsa_notifier_ageing_time_info *info)
{
unsigned int ageing_time = info->ageing_time;
- struct switchdev_trans *trans = info->trans;
- if (switchdev_trans_ph_prepare(trans)) {
- if (ds->ageing_time_min && ageing_time < ds->ageing_time_min)
- return -ERANGE;
- if (ds->ageing_time_max && ageing_time > ds->ageing_time_max)
- return -ERANGE;
- return 0;
- }
+ if (ds->ageing_time_min && ageing_time < ds->ageing_time_min)
+ return -ERANGE;
+
+ if (ds->ageing_time_max && ageing_time > ds->ageing_time_max)
+ return -ERANGE;
/* Program the fastest ageing time in case of multiple bridges */
ageing_time = dsa_switch_fastest_ageing_time(ds, ageing_time);
@@ -54,15 +52,60 @@ static int dsa_switch_ageing_time(struct dsa_switch *ds,
return 0;
}
+static bool dsa_port_mtu_match(struct dsa_port *dp,
+ struct dsa_notifier_mtu_info *info)
+{
+ return dp == info->dp || dsa_port_is_dsa(dp) || dsa_port_is_cpu(dp);
+}
+
+static int dsa_switch_mtu(struct dsa_switch *ds,
+ struct dsa_notifier_mtu_info *info)
+{
+ struct dsa_port *dp;
+ int ret;
+
+ if (!ds->ops->port_change_mtu)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_mtu_match(dp, info)) {
+ ret = ds->ops->port_change_mtu(ds, dp->index,
+ info->mtu);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
static int dsa_switch_bridge_join(struct dsa_switch *ds,
struct dsa_notifier_bridge_info *info)
{
- if (ds->index == info->sw_index && ds->ops->port_bridge_join)
- return ds->ops->port_bridge_join(ds, info->port, info->br);
+ int err;
+
+ if (info->dp->ds == ds) {
+ if (!ds->ops->port_bridge_join)
+ return -EOPNOTSUPP;
+
+ err = ds->ops->port_bridge_join(ds, info->dp->index,
+ info->bridge,
+ &info->tx_fwd_offload,
+ info->extack);
+ if (err)
+ return err;
+ }
- if (ds->index != info->sw_index && ds->ops->crosschip_bridge_join)
- return ds->ops->crosschip_bridge_join(ds, info->sw_index,
- info->port, info->br);
+ if (info->dp->ds != ds && ds->ops->crosschip_bridge_join) {
+ err = ds->ops->crosschip_bridge_join(ds,
+ info->dp->ds->dst->index,
+ info->dp->ds->index,
+ info->dp->index,
+ info->bridge,
+ info->extack);
+ if (err)
+ return err;
+ }
return 0;
}
@@ -70,153 +113,698 @@ static int dsa_switch_bridge_join(struct dsa_switch *ds,
static int dsa_switch_bridge_leave(struct dsa_switch *ds,
struct dsa_notifier_bridge_info *info)
{
- if (ds->index == info->sw_index && ds->ops->port_bridge_leave)
- ds->ops->port_bridge_leave(ds, info->port, info->br);
+ if (info->dp->ds == ds && ds->ops->port_bridge_leave)
+ ds->ops->port_bridge_leave(ds, info->dp->index, info->bridge);
- if (ds->index != info->sw_index && ds->ops->crosschip_bridge_leave)
- ds->ops->crosschip_bridge_leave(ds, info->sw_index, info->port,
- info->br);
+ if (info->dp->ds != ds && ds->ops->crosschip_bridge_leave)
+ ds->ops->crosschip_bridge_leave(ds, info->dp->ds->dst->index,
+ info->dp->ds->index,
+ info->dp->index,
+ info->bridge);
return 0;
}
+/* Matches for all upstream-facing ports (the CPU port and all upstream-facing
+ * DSA links) that sit between the targeted port on which the notifier was
+ * emitted and its dedicated CPU port.
+ */
+static bool dsa_port_host_address_match(struct dsa_port *dp,
+ const struct dsa_port *targeted_dp)
+{
+ struct dsa_port *cpu_dp = targeted_dp->cpu_dp;
+
+ if (dsa_switch_is_upstream_of(dp->ds, targeted_dp->ds))
+ return dp->index == dsa_towards_port(dp->ds, cpu_dp->ds->index,
+ cpu_dp->index);
+
+ return false;
+}
+
+static struct dsa_mac_addr *dsa_mac_addr_find(struct list_head *addr_list,
+ const unsigned char *addr, u16 vid,
+ struct dsa_db db)
+{
+ struct dsa_mac_addr *a;
+
+ list_for_each_entry(a, addr_list, list)
+ if (ether_addr_equal(a->addr, addr) && a->vid == vid &&
+ dsa_db_equal(&a->db, &db))
+ return a;
+
+ return NULL;
+}
+
+static int dsa_port_do_mdb_add(struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb,
+ struct dsa_db db)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_mac_addr *a;
+ int port = dp->index;
+ int err = 0;
+
+ /* No need to bother with refcounting for user ports */
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_mdb_add(ds, port, mdb, db);
+ trace_dsa_mdb_add_hw(dp, mdb->addr, mdb->vid, &db, err);
+
+ return err;
+ }
+
+ mutex_lock(&dp->addr_lists_lock);
+
+ a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid, db);
+ if (a) {
+ refcount_inc(&a->refcount);
+ trace_dsa_mdb_add_bump(dp, mdb->addr, mdb->vid, &db,
+ &a->refcount);
+ goto out;
+ }
+
+ a = kzalloc(sizeof(*a), GFP_KERNEL);
+ if (!a) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = ds->ops->port_mdb_add(ds, port, mdb, db);
+ trace_dsa_mdb_add_hw(dp, mdb->addr, mdb->vid, &db, err);
+ if (err) {
+ kfree(a);
+ goto out;
+ }
+
+ ether_addr_copy(a->addr, mdb->addr);
+ a->vid = mdb->vid;
+ a->db = db;
+ refcount_set(&a->refcount, 1);
+ list_add_tail(&a->list, &dp->mdbs);
+
+out:
+ mutex_unlock(&dp->addr_lists_lock);
+
+ return err;
+}
+
+static int dsa_port_do_mdb_del(struct dsa_port *dp,
+ const struct switchdev_obj_port_mdb *mdb,
+ struct dsa_db db)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_mac_addr *a;
+ int port = dp->index;
+ int err = 0;
+
+ /* No need to bother with refcounting for user ports */
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_mdb_del(ds, port, mdb, db);
+ trace_dsa_mdb_del_hw(dp, mdb->addr, mdb->vid, &db, err);
+
+ return err;
+ }
+
+ mutex_lock(&dp->addr_lists_lock);
+
+ a = dsa_mac_addr_find(&dp->mdbs, mdb->addr, mdb->vid, db);
+ if (!a) {
+ trace_dsa_mdb_del_not_found(dp, mdb->addr, mdb->vid, &db);
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (!refcount_dec_and_test(&a->refcount)) {
+ trace_dsa_mdb_del_drop(dp, mdb->addr, mdb->vid, &db,
+ &a->refcount);
+ goto out;
+ }
+
+ err = ds->ops->port_mdb_del(ds, port, mdb, db);
+ trace_dsa_mdb_del_hw(dp, mdb->addr, mdb->vid, &db, err);
+ if (err) {
+ refcount_set(&a->refcount, 1);
+ goto out;
+ }
+
+ list_del(&a->list);
+ kfree(a);
+
+out:
+ mutex_unlock(&dp->addr_lists_lock);
+
+ return err;
+}
+
+static int dsa_port_do_fdb_add(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, struct dsa_db db)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_mac_addr *a;
+ int port = dp->index;
+ int err = 0;
+
+ /* No need to bother with refcounting for user ports */
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_fdb_add(ds, port, addr, vid, db);
+ trace_dsa_fdb_add_hw(dp, addr, vid, &db, err);
+
+ return err;
+ }
+
+ mutex_lock(&dp->addr_lists_lock);
+
+ a = dsa_mac_addr_find(&dp->fdbs, addr, vid, db);
+ if (a) {
+ refcount_inc(&a->refcount);
+ trace_dsa_fdb_add_bump(dp, addr, vid, &db, &a->refcount);
+ goto out;
+ }
+
+ a = kzalloc(sizeof(*a), GFP_KERNEL);
+ if (!a) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = ds->ops->port_fdb_add(ds, port, addr, vid, db);
+ trace_dsa_fdb_add_hw(dp, addr, vid, &db, err);
+ if (err) {
+ kfree(a);
+ goto out;
+ }
+
+ ether_addr_copy(a->addr, addr);
+ a->vid = vid;
+ a->db = db;
+ refcount_set(&a->refcount, 1);
+ list_add_tail(&a->list, &dp->fdbs);
+
+out:
+ mutex_unlock(&dp->addr_lists_lock);
+
+ return err;
+}
+
+static int dsa_port_do_fdb_del(struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, struct dsa_db db)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_mac_addr *a;
+ int port = dp->index;
+ int err = 0;
+
+ /* No need to bother with refcounting for user ports */
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_fdb_del(ds, port, addr, vid, db);
+ trace_dsa_fdb_del_hw(dp, addr, vid, &db, err);
+
+ return err;
+ }
+
+ mutex_lock(&dp->addr_lists_lock);
+
+ a = dsa_mac_addr_find(&dp->fdbs, addr, vid, db);
+ if (!a) {
+ trace_dsa_fdb_del_not_found(dp, addr, vid, &db);
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (!refcount_dec_and_test(&a->refcount)) {
+ trace_dsa_fdb_del_drop(dp, addr, vid, &db, &a->refcount);
+ goto out;
+ }
+
+ err = ds->ops->port_fdb_del(ds, port, addr, vid, db);
+ trace_dsa_fdb_del_hw(dp, addr, vid, &db, err);
+ if (err) {
+ refcount_set(&a->refcount, 1);
+ goto out;
+ }
+
+ list_del(&a->list);
+ kfree(a);
+
+out:
+ mutex_unlock(&dp->addr_lists_lock);
+
+ return err;
+}
+
+static int dsa_switch_do_lag_fdb_add(struct dsa_switch *ds, struct dsa_lag *lag,
+ const unsigned char *addr, u16 vid,
+ struct dsa_db db)
+{
+ struct dsa_mac_addr *a;
+ int err = 0;
+
+ mutex_lock(&lag->fdb_lock);
+
+ a = dsa_mac_addr_find(&lag->fdbs, addr, vid, db);
+ if (a) {
+ refcount_inc(&a->refcount);
+ trace_dsa_lag_fdb_add_bump(lag->dev, addr, vid, &db,
+ &a->refcount);
+ goto out;
+ }
+
+ a = kzalloc(sizeof(*a), GFP_KERNEL);
+ if (!a) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = ds->ops->lag_fdb_add(ds, *lag, addr, vid, db);
+ trace_dsa_lag_fdb_add_hw(lag->dev, addr, vid, &db, err);
+ if (err) {
+ kfree(a);
+ goto out;
+ }
+
+ ether_addr_copy(a->addr, addr);
+ a->vid = vid;
+ a->db = db;
+ refcount_set(&a->refcount, 1);
+ list_add_tail(&a->list, &lag->fdbs);
+
+out:
+ mutex_unlock(&lag->fdb_lock);
+
+ return err;
+}
+
+static int dsa_switch_do_lag_fdb_del(struct dsa_switch *ds, struct dsa_lag *lag,
+ const unsigned char *addr, u16 vid,
+ struct dsa_db db)
+{
+ struct dsa_mac_addr *a;
+ int err = 0;
+
+ mutex_lock(&lag->fdb_lock);
+
+ a = dsa_mac_addr_find(&lag->fdbs, addr, vid, db);
+ if (!a) {
+ trace_dsa_lag_fdb_del_not_found(lag->dev, addr, vid, &db);
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (!refcount_dec_and_test(&a->refcount)) {
+ trace_dsa_lag_fdb_del_drop(lag->dev, addr, vid, &db,
+ &a->refcount);
+ goto out;
+ }
+
+ err = ds->ops->lag_fdb_del(ds, *lag, addr, vid, db);
+ trace_dsa_lag_fdb_del_hw(lag->dev, addr, vid, &db, err);
+ if (err) {
+ refcount_set(&a->refcount, 1);
+ goto out;
+ }
+
+ list_del(&a->list);
+ kfree(a);
+
+out:
+ mutex_unlock(&lag->fdb_lock);
+
+ return err;
+}
+
+static int dsa_switch_host_fdb_add(struct dsa_switch *ds,
+ struct dsa_notifier_fdb_info *info)
+{
+ struct dsa_port *dp;
+ int err = 0;
+
+ if (!ds->ops->port_fdb_add)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_host_address_match(dp, info->dp)) {
+ if (dsa_port_is_cpu(dp) && info->dp->cpu_port_in_lag) {
+ err = dsa_switch_do_lag_fdb_add(ds, dp->lag,
+ info->addr,
+ info->vid,
+ info->db);
+ } else {
+ err = dsa_port_do_fdb_add(dp, info->addr,
+ info->vid, info->db);
+ }
+ if (err)
+ break;
+ }
+ }
+
+ return err;
+}
+
+static int dsa_switch_host_fdb_del(struct dsa_switch *ds,
+ struct dsa_notifier_fdb_info *info)
+{
+ struct dsa_port *dp;
+ int err = 0;
+
+ if (!ds->ops->port_fdb_del)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_host_address_match(dp, info->dp)) {
+ if (dsa_port_is_cpu(dp) && info->dp->cpu_port_in_lag) {
+ err = dsa_switch_do_lag_fdb_del(ds, dp->lag,
+ info->addr,
+ info->vid,
+ info->db);
+ } else {
+ err = dsa_port_do_fdb_del(dp, info->addr,
+ info->vid, info->db);
+ }
+ if (err)
+ break;
+ }
+ }
+
+ return err;
+}
+
static int dsa_switch_fdb_add(struct dsa_switch *ds,
struct dsa_notifier_fdb_info *info)
{
- int port = dsa_towards_port(ds, info->sw_index, info->port);
+ int port = dsa_towards_port(ds, info->dp->ds->index, info->dp->index);
+ struct dsa_port *dp = dsa_to_port(ds, port);
if (!ds->ops->port_fdb_add)
return -EOPNOTSUPP;
- return ds->ops->port_fdb_add(ds, port, info->addr, info->vid);
+ return dsa_port_do_fdb_add(dp, info->addr, info->vid, info->db);
}
static int dsa_switch_fdb_del(struct dsa_switch *ds,
struct dsa_notifier_fdb_info *info)
{
- int port = dsa_towards_port(ds, info->sw_index, info->port);
+ int port = dsa_towards_port(ds, info->dp->ds->index, info->dp->index);
+ struct dsa_port *dp = dsa_to_port(ds, port);
if (!ds->ops->port_fdb_del)
return -EOPNOTSUPP;
- return ds->ops->port_fdb_del(ds, port, info->addr, info->vid);
+ return dsa_port_do_fdb_del(dp, info->addr, info->vid, info->db);
}
-static int
-dsa_switch_mdb_prepare_bitmap(struct dsa_switch *ds,
- const struct switchdev_obj_port_mdb *mdb,
- const unsigned long *bitmap)
+static int dsa_switch_lag_fdb_add(struct dsa_switch *ds,
+ struct dsa_notifier_lag_fdb_info *info)
{
- int port, err;
+ struct dsa_port *dp;
- if (!ds->ops->port_mdb_prepare || !ds->ops->port_mdb_add)
+ if (!ds->ops->lag_fdb_add)
return -EOPNOTSUPP;
- for_each_set_bit(port, bitmap, ds->num_ports) {
- err = ds->ops->port_mdb_prepare(ds, port, mdb);
- if (err)
- return err;
- }
+ /* Notify switch only if it has a port in this LAG */
+ dsa_switch_for_each_port(dp, ds)
+ if (dsa_port_offloads_lag(dp, info->lag))
+ return dsa_switch_do_lag_fdb_add(ds, info->lag,
+ info->addr, info->vid,
+ info->db);
return 0;
}
-static void dsa_switch_mdb_add_bitmap(struct dsa_switch *ds,
- const struct switchdev_obj_port_mdb *mdb,
- const unsigned long *bitmap)
+static int dsa_switch_lag_fdb_del(struct dsa_switch *ds,
+ struct dsa_notifier_lag_fdb_info *info)
{
- int port;
+ struct dsa_port *dp;
+
+ if (!ds->ops->lag_fdb_del)
+ return -EOPNOTSUPP;
- for_each_set_bit(port, bitmap, ds->num_ports)
- ds->ops->port_mdb_add(ds, port, mdb);
+ /* Notify switch only if it has a port in this LAG */
+ dsa_switch_for_each_port(dp, ds)
+ if (dsa_port_offloads_lag(dp, info->lag))
+ return dsa_switch_do_lag_fdb_del(ds, info->lag,
+ info->addr, info->vid,
+ info->db);
+
+ return 0;
}
-static int dsa_switch_mdb_add(struct dsa_switch *ds,
- struct dsa_notifier_mdb_info *info)
+static int dsa_switch_lag_change(struct dsa_switch *ds,
+ struct dsa_notifier_lag_info *info)
{
- const struct switchdev_obj_port_mdb *mdb = info->mdb;
- struct switchdev_trans *trans = info->trans;
- int port;
+ if (info->dp->ds == ds && ds->ops->port_lag_change)
+ return ds->ops->port_lag_change(ds, info->dp->index);
- /* Build a mask of Multicast group members */
- bitmap_zero(ds->bitmap, ds->num_ports);
- if (ds->index == info->sw_index)
- set_bit(info->port, ds->bitmap);
- for (port = 0; port < ds->num_ports; port++)
- if (dsa_is_dsa_port(ds, port))
- set_bit(port, ds->bitmap);
+ if (info->dp->ds != ds && ds->ops->crosschip_lag_change)
+ return ds->ops->crosschip_lag_change(ds, info->dp->ds->index,
+ info->dp->index);
- if (switchdev_trans_ph_prepare(trans))
- return dsa_switch_mdb_prepare_bitmap(ds, mdb, ds->bitmap);
+ return 0;
+}
- dsa_switch_mdb_add_bitmap(ds, mdb, ds->bitmap);
+static int dsa_switch_lag_join(struct dsa_switch *ds,
+ struct dsa_notifier_lag_info *info)
+{
+ if (info->dp->ds == ds && ds->ops->port_lag_join)
+ return ds->ops->port_lag_join(ds, info->dp->index, info->lag,
+ info->info, info->extack);
- return 0;
+ if (info->dp->ds != ds && ds->ops->crosschip_lag_join)
+ return ds->ops->crosschip_lag_join(ds, info->dp->ds->index,
+ info->dp->index, info->lag,
+ info->info, info->extack);
+
+ return -EOPNOTSUPP;
+}
+
+static int dsa_switch_lag_leave(struct dsa_switch *ds,
+ struct dsa_notifier_lag_info *info)
+{
+ if (info->dp->ds == ds && ds->ops->port_lag_leave)
+ return ds->ops->port_lag_leave(ds, info->dp->index, info->lag);
+
+ if (info->dp->ds != ds && ds->ops->crosschip_lag_leave)
+ return ds->ops->crosschip_lag_leave(ds, info->dp->ds->index,
+ info->dp->index, info->lag);
+
+ return -EOPNOTSUPP;
+}
+
+static int dsa_switch_mdb_add(struct dsa_switch *ds,
+ struct dsa_notifier_mdb_info *info)
+{
+ int port = dsa_towards_port(ds, info->dp->ds->index, info->dp->index);
+ struct dsa_port *dp = dsa_to_port(ds, port);
+
+ if (!ds->ops->port_mdb_add)
+ return -EOPNOTSUPP;
+
+ return dsa_port_do_mdb_add(dp, info->mdb, info->db);
}
static int dsa_switch_mdb_del(struct dsa_switch *ds,
struct dsa_notifier_mdb_info *info)
{
- const struct switchdev_obj_port_mdb *mdb = info->mdb;
+ int port = dsa_towards_port(ds, info->dp->ds->index, info->dp->index);
+ struct dsa_port *dp = dsa_to_port(ds, port);
if (!ds->ops->port_mdb_del)
return -EOPNOTSUPP;
- if (ds->index == info->sw_index)
- return ds->ops->port_mdb_del(ds, info->port, mdb);
+ return dsa_port_do_mdb_del(dp, info->mdb, info->db);
+}
- return 0;
+static int dsa_switch_host_mdb_add(struct dsa_switch *ds,
+ struct dsa_notifier_mdb_info *info)
+{
+ struct dsa_port *dp;
+ int err = 0;
+
+ if (!ds->ops->port_mdb_add)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_host_address_match(dp, info->dp)) {
+ err = dsa_port_do_mdb_add(dp, info->mdb, info->db);
+ if (err)
+ break;
+ }
+ }
+
+ return err;
}
-static int
-dsa_switch_vlan_prepare_bitmap(struct dsa_switch *ds,
- const struct switchdev_obj_port_vlan *vlan,
- const unsigned long *bitmap)
+static int dsa_switch_host_mdb_del(struct dsa_switch *ds,
+ struct dsa_notifier_mdb_info *info)
{
- int port, err;
+ struct dsa_port *dp;
+ int err = 0;
- if (!ds->ops->port_vlan_prepare || !ds->ops->port_vlan_add)
+ if (!ds->ops->port_mdb_del)
return -EOPNOTSUPP;
- for_each_set_bit(port, bitmap, ds->num_ports) {
- err = ds->ops->port_vlan_prepare(ds, port, vlan);
- if (err)
- return err;
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_host_address_match(dp, info->dp)) {
+ err = dsa_port_do_mdb_del(dp, info->mdb, info->db);
+ if (err)
+ break;
+ }
}
- return 0;
+ return err;
+}
+
+/* Port VLANs match on the targeted port and on all DSA ports */
+static bool dsa_port_vlan_match(struct dsa_port *dp,
+ struct dsa_notifier_vlan_info *info)
+{
+ return dsa_port_is_dsa(dp) || dp == info->dp;
+}
+
+/* Host VLANs match on the targeted port's CPU port, and on all DSA ports
+ * (upstream and downstream) of that switch and its upstream switches.
+ */
+static bool dsa_port_host_vlan_match(struct dsa_port *dp,
+ const struct dsa_port *targeted_dp)
+{
+ struct dsa_port *cpu_dp = targeted_dp->cpu_dp;
+
+ if (dsa_switch_is_upstream_of(dp->ds, targeted_dp->ds))
+ return dsa_port_is_dsa(dp) || dp == cpu_dp;
+
+ return false;
+}
+
+struct dsa_vlan *dsa_vlan_find(struct list_head *vlan_list,
+ const struct switchdev_obj_port_vlan *vlan)
+{
+ struct dsa_vlan *v;
+
+ list_for_each_entry(v, vlan_list, list)
+ if (v->vid == vlan->vid)
+ return v;
+
+ return NULL;
+}
+
+static int dsa_port_do_vlan_add(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+ struct dsa_vlan *v;
+ int err = 0;
+
+ /* No need to bother with refcounting for user ports. */
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_vlan_add(ds, port, vlan, extack);
+ trace_dsa_vlan_add_hw(dp, vlan, err);
+
+ return err;
+ }
+
+ /* No need to propagate on shared ports the existing VLANs that were
+ * re-notified after just the flags have changed. This would cause a
+ * refcount bump which we need to avoid, since it unbalances the
+ * additions with the deletions.
+ */
+ if (vlan->changed)
+ return 0;
+
+ mutex_lock(&dp->vlans_lock);
+
+ v = dsa_vlan_find(&dp->vlans, vlan);
+ if (v) {
+ refcount_inc(&v->refcount);
+ trace_dsa_vlan_add_bump(dp, vlan, &v->refcount);
+ goto out;
+ }
+
+ v = kzalloc(sizeof(*v), GFP_KERNEL);
+ if (!v) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = ds->ops->port_vlan_add(ds, port, vlan, extack);
+ trace_dsa_vlan_add_hw(dp, vlan, err);
+ if (err) {
+ kfree(v);
+ goto out;
+ }
+
+ v->vid = vlan->vid;
+ refcount_set(&v->refcount, 1);
+ list_add_tail(&v->list, &dp->vlans);
+
+out:
+ mutex_unlock(&dp->vlans_lock);
+
+ return err;
}
-static void
-dsa_switch_vlan_add_bitmap(struct dsa_switch *ds,
- const struct switchdev_obj_port_vlan *vlan,
- const unsigned long *bitmap)
+static int dsa_port_do_vlan_del(struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan)
{
- int port;
+ struct dsa_switch *ds = dp->ds;
+ int port = dp->index;
+ struct dsa_vlan *v;
+ int err = 0;
+
+ /* No need to bother with refcounting for user ports */
+ if (!(dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp))) {
+ err = ds->ops->port_vlan_del(ds, port, vlan);
+ trace_dsa_vlan_del_hw(dp, vlan, err);
+
+ return err;
+ }
+
+ mutex_lock(&dp->vlans_lock);
+
+ v = dsa_vlan_find(&dp->vlans, vlan);
+ if (!v) {
+ trace_dsa_vlan_del_not_found(dp, vlan);
+ err = -ENOENT;
+ goto out;
+ }
+
+ if (!refcount_dec_and_test(&v->refcount)) {
+ trace_dsa_vlan_del_drop(dp, vlan, &v->refcount);
+ goto out;
+ }
+
+ err = ds->ops->port_vlan_del(ds, port, vlan);
+ trace_dsa_vlan_del_hw(dp, vlan, err);
+ if (err) {
+ refcount_set(&v->refcount, 1);
+ goto out;
+ }
- for_each_set_bit(port, bitmap, ds->num_ports)
- ds->ops->port_vlan_add(ds, port, vlan);
+ list_del(&v->list);
+ kfree(v);
+
+out:
+ mutex_unlock(&dp->vlans_lock);
+
+ return err;
}
static int dsa_switch_vlan_add(struct dsa_switch *ds,
struct dsa_notifier_vlan_info *info)
{
- const struct switchdev_obj_port_vlan *vlan = info->vlan;
- struct switchdev_trans *trans = info->trans;
- int port;
-
- /* Build a mask of VLAN members */
- bitmap_zero(ds->bitmap, ds->num_ports);
- if (ds->index == info->sw_index)
- set_bit(info->port, ds->bitmap);
- for (port = 0; port < ds->num_ports; port++)
- if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
- set_bit(port, ds->bitmap);
+ struct dsa_port *dp;
+ int err;
- if (switchdev_trans_ph_prepare(trans))
- return dsa_switch_vlan_prepare_bitmap(ds, vlan, ds->bitmap);
+ if (!ds->ops->port_vlan_add)
+ return -EOPNOTSUPP;
- dsa_switch_vlan_add_bitmap(ds, vlan, ds->bitmap);
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_vlan_match(dp, info)) {
+ err = dsa_port_do_vlan_add(dp, info->vlan,
+ info->extack);
+ if (err)
+ return err;
+ }
+ }
return 0;
}
@@ -224,13 +812,161 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds,
static int dsa_switch_vlan_del(struct dsa_switch *ds,
struct dsa_notifier_vlan_info *info)
{
- const struct switchdev_obj_port_vlan *vlan = info->vlan;
+ struct dsa_port *dp;
+ int err;
if (!ds->ops->port_vlan_del)
return -EOPNOTSUPP;
- if (ds->index == info->sw_index)
- return ds->ops->port_vlan_del(ds, info->port, vlan);
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_vlan_match(dp, info)) {
+ err = dsa_port_do_vlan_del(dp, info->vlan);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int dsa_switch_host_vlan_add(struct dsa_switch *ds,
+ struct dsa_notifier_vlan_info *info)
+{
+ struct dsa_port *dp;
+ int err;
+
+ if (!ds->ops->port_vlan_add)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_host_vlan_match(dp, info->dp)) {
+ err = dsa_port_do_vlan_add(dp, info->vlan,
+ info->extack);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int dsa_switch_host_vlan_del(struct dsa_switch *ds,
+ struct dsa_notifier_vlan_info *info)
+{
+ struct dsa_port *dp;
+ int err;
+
+ if (!ds->ops->port_vlan_del)
+ return -EOPNOTSUPP;
+
+ dsa_switch_for_each_port(dp, ds) {
+ if (dsa_port_host_vlan_match(dp, info->dp)) {
+ err = dsa_port_do_vlan_del(dp, info->vlan);
+ if (err)
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+static int dsa_switch_change_tag_proto(struct dsa_switch *ds,
+ struct dsa_notifier_tag_proto_info *info)
+{
+ const struct dsa_device_ops *tag_ops = info->tag_ops;
+ struct dsa_port *dp, *cpu_dp;
+ int err;
+
+ if (!ds->ops->change_tag_protocol)
+ return -EOPNOTSUPP;
+
+ ASSERT_RTNL();
+
+ err = ds->ops->change_tag_protocol(ds, tag_ops->proto);
+ if (err)
+ return err;
+
+ dsa_switch_for_each_cpu_port(cpu_dp, ds)
+ dsa_port_set_tag_protocol(cpu_dp, tag_ops);
+
+ /* Now that changing the tag protocol can no longer fail, let's update
+ * the remaining bits which are "duplicated for faster access", and the
+ * bits that depend on the tagger, such as the MTU.
+ */
+ dsa_switch_for_each_user_port(dp, ds) {
+ struct net_device *user = dp->user;
+
+ dsa_user_setup_tagger(user);
+
+ /* rtnl_mutex is held in dsa_tree_change_tag_proto */
+ dsa_user_change_mtu(user, user->mtu);
+ }
+
+ return 0;
+}
+
+/* We use the same cross-chip notifiers to inform both the tagger side, as well
+ * as the switch side, of connection and disconnection events.
+ * Since ds->tagger_data is owned by the tagger, it isn't a hard error if the
+ * switch side doesn't support connecting to this tagger, and therefore, the
+ * fact that we don't disconnect the tagger side doesn't constitute a memory
+ * leak: the tagger will still operate with persistent per-switch memory, just
+ * with the switch side unconnected to it. What does constitute a hard error is
+ * when the switch side supports connecting but fails.
+ */
+static int
+dsa_switch_connect_tag_proto(struct dsa_switch *ds,
+ struct dsa_notifier_tag_proto_info *info)
+{
+ const struct dsa_device_ops *tag_ops = info->tag_ops;
+ int err;
+
+ /* Notify the new tagger about the connection to this switch */
+ if (tag_ops->connect) {
+ err = tag_ops->connect(ds);
+ if (err)
+ return err;
+ }
+
+ if (!ds->ops->connect_tag_protocol)
+ return -EOPNOTSUPP;
+
+ /* Notify the switch about the connection to the new tagger */
+ err = ds->ops->connect_tag_protocol(ds, tag_ops->proto);
+ if (err) {
+ /* Revert the new tagger's connection to this tree */
+ if (tag_ops->disconnect)
+ tag_ops->disconnect(ds);
+ return err;
+ }
+
+ return 0;
+}
+
+static int
+dsa_switch_disconnect_tag_proto(struct dsa_switch *ds,
+ struct dsa_notifier_tag_proto_info *info)
+{
+ const struct dsa_device_ops *tag_ops = info->tag_ops;
+
+ /* Notify the tagger about the disconnection from this switch */
+ if (tag_ops->disconnect && ds->tagger_data)
+ tag_ops->disconnect(ds);
+
+ /* No need to notify the switch, since it shouldn't have any
+ * resources to tear down
+ */
+ return 0;
+}
+
+static int
+dsa_switch_conduit_state_change(struct dsa_switch *ds,
+ struct dsa_notifier_conduit_state_info *info)
+{
+ if (!ds->ops->conduit_state_change)
+ return 0;
+
+ ds->ops->conduit_state_change(ds, info->conduit, info->operational);
return 0;
}
@@ -257,27 +993,77 @@ static int dsa_switch_event(struct notifier_block *nb,
case DSA_NOTIFIER_FDB_DEL:
err = dsa_switch_fdb_del(ds, info);
break;
+ case DSA_NOTIFIER_HOST_FDB_ADD:
+ err = dsa_switch_host_fdb_add(ds, info);
+ break;
+ case DSA_NOTIFIER_HOST_FDB_DEL:
+ err = dsa_switch_host_fdb_del(ds, info);
+ break;
+ case DSA_NOTIFIER_LAG_FDB_ADD:
+ err = dsa_switch_lag_fdb_add(ds, info);
+ break;
+ case DSA_NOTIFIER_LAG_FDB_DEL:
+ err = dsa_switch_lag_fdb_del(ds, info);
+ break;
+ case DSA_NOTIFIER_LAG_CHANGE:
+ err = dsa_switch_lag_change(ds, info);
+ break;
+ case DSA_NOTIFIER_LAG_JOIN:
+ err = dsa_switch_lag_join(ds, info);
+ break;
+ case DSA_NOTIFIER_LAG_LEAVE:
+ err = dsa_switch_lag_leave(ds, info);
+ break;
case DSA_NOTIFIER_MDB_ADD:
err = dsa_switch_mdb_add(ds, info);
break;
case DSA_NOTIFIER_MDB_DEL:
err = dsa_switch_mdb_del(ds, info);
break;
+ case DSA_NOTIFIER_HOST_MDB_ADD:
+ err = dsa_switch_host_mdb_add(ds, info);
+ break;
+ case DSA_NOTIFIER_HOST_MDB_DEL:
+ err = dsa_switch_host_mdb_del(ds, info);
+ break;
case DSA_NOTIFIER_VLAN_ADD:
err = dsa_switch_vlan_add(ds, info);
break;
case DSA_NOTIFIER_VLAN_DEL:
err = dsa_switch_vlan_del(ds, info);
break;
+ case DSA_NOTIFIER_HOST_VLAN_ADD:
+ err = dsa_switch_host_vlan_add(ds, info);
+ break;
+ case DSA_NOTIFIER_HOST_VLAN_DEL:
+ err = dsa_switch_host_vlan_del(ds, info);
+ break;
+ case DSA_NOTIFIER_MTU:
+ err = dsa_switch_mtu(ds, info);
+ break;
+ case DSA_NOTIFIER_TAG_PROTO:
+ err = dsa_switch_change_tag_proto(ds, info);
+ break;
+ case DSA_NOTIFIER_TAG_PROTO_CONNECT:
+ err = dsa_switch_connect_tag_proto(ds, info);
+ break;
+ case DSA_NOTIFIER_TAG_PROTO_DISCONNECT:
+ err = dsa_switch_disconnect_tag_proto(ds, info);
+ break;
+ case DSA_NOTIFIER_TAG_8021Q_VLAN_ADD:
+ err = dsa_switch_tag_8021q_vlan_add(ds, info);
+ break;
+ case DSA_NOTIFIER_TAG_8021Q_VLAN_DEL:
+ err = dsa_switch_tag_8021q_vlan_del(ds, info);
+ break;
+ case DSA_NOTIFIER_CONDUIT_STATE_CHANGE:
+ err = dsa_switch_conduit_state_change(ds, info);
+ break;
default:
err = -EOPNOTSUPP;
break;
}
- /* Non-switchdev operations cannot be rolled back. If a DSA driver
- * returns an error during the chained call, switch chips may be in an
- * inconsistent state.
- */
if (err)
dev_dbg(ds->dev, "breaking chain for DSA event %lu (%d)\n",
event, err);
@@ -285,6 +1071,52 @@ static int dsa_switch_event(struct notifier_block *nb,
return notifier_from_errno(err);
}
+/**
+ * dsa_tree_notify - Execute code for all switches in a DSA switch tree.
+ * @dst: collection of struct dsa_switch devices to notify.
+ * @e: event, must be of type DSA_NOTIFIER_*
+ * @v: event-specific value.
+ *
+ * Given a struct dsa_switch_tree, this can be used to run a function once for
+ * each member DSA switch. The other alternative of traversing the tree is only
+ * through its ports list, which does not uniquely list the switches.
+ */
+int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v)
+{
+ struct raw_notifier_head *nh = &dst->nh;
+ int err;
+
+ err = raw_notifier_call_chain(nh, e, v);
+
+ return notifier_to_errno(err);
+}
+
+/**
+ * dsa_broadcast - Notify all DSA trees in the system.
+ * @e: event, must be of type DSA_NOTIFIER_*
+ * @v: event-specific value.
+ *
+ * Can be used to notify the switching fabric of events such as cross-chip
+ * bridging between disjoint trees (such as islands of tagger-compatible
+ * switches bridged by an incompatible middle switch).
+ *
+ * WARNING: this function is not reliable during probe time, because probing
+ * between trees is asynchronous and not all DSA trees might have probed.
+ */
+int dsa_broadcast(unsigned long e, void *v)
+{
+ struct dsa_switch_tree *dst;
+ int err = 0;
+
+ list_for_each_entry(dst, &dsa_tree_list, list) {
+ err = dsa_tree_notify(dst, e, v);
+ if (err)
+ break;
+ }
+
+ return err;
+}
+
int dsa_switch_register_notifier(struct dsa_switch *ds)
{
ds->nb.notifier_call = dsa_switch_event;
diff --git a/net/dsa/switch.h b/net/dsa/switch.h
new file mode 100644
index 000000000000..be0a2749cd97
--- /dev/null
+++ b/net/dsa/switch.h
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_SWITCH_H
+#define __DSA_SWITCH_H
+
+#include <net/dsa.h>
+
+struct netlink_ext_ack;
+
+enum {
+ DSA_NOTIFIER_AGEING_TIME,
+ DSA_NOTIFIER_BRIDGE_JOIN,
+ DSA_NOTIFIER_BRIDGE_LEAVE,
+ DSA_NOTIFIER_FDB_ADD,
+ DSA_NOTIFIER_FDB_DEL,
+ DSA_NOTIFIER_HOST_FDB_ADD,
+ DSA_NOTIFIER_HOST_FDB_DEL,
+ DSA_NOTIFIER_LAG_FDB_ADD,
+ DSA_NOTIFIER_LAG_FDB_DEL,
+ DSA_NOTIFIER_LAG_CHANGE,
+ DSA_NOTIFIER_LAG_JOIN,
+ DSA_NOTIFIER_LAG_LEAVE,
+ DSA_NOTIFIER_MDB_ADD,
+ DSA_NOTIFIER_MDB_DEL,
+ DSA_NOTIFIER_HOST_MDB_ADD,
+ DSA_NOTIFIER_HOST_MDB_DEL,
+ DSA_NOTIFIER_VLAN_ADD,
+ DSA_NOTIFIER_VLAN_DEL,
+ DSA_NOTIFIER_HOST_VLAN_ADD,
+ DSA_NOTIFIER_HOST_VLAN_DEL,
+ DSA_NOTIFIER_MTU,
+ DSA_NOTIFIER_TAG_PROTO,
+ DSA_NOTIFIER_TAG_PROTO_CONNECT,
+ DSA_NOTIFIER_TAG_PROTO_DISCONNECT,
+ DSA_NOTIFIER_TAG_8021Q_VLAN_ADD,
+ DSA_NOTIFIER_TAG_8021Q_VLAN_DEL,
+ DSA_NOTIFIER_CONDUIT_STATE_CHANGE,
+};
+
+/* DSA_NOTIFIER_AGEING_TIME */
+struct dsa_notifier_ageing_time_info {
+ unsigned int ageing_time;
+};
+
+/* DSA_NOTIFIER_BRIDGE_* */
+struct dsa_notifier_bridge_info {
+ const struct dsa_port *dp;
+ struct dsa_bridge bridge;
+ bool tx_fwd_offload;
+ struct netlink_ext_ack *extack;
+};
+
+/* DSA_NOTIFIER_FDB_* */
+struct dsa_notifier_fdb_info {
+ const struct dsa_port *dp;
+ const unsigned char *addr;
+ u16 vid;
+ struct dsa_db db;
+};
+
+/* DSA_NOTIFIER_LAG_FDB_* */
+struct dsa_notifier_lag_fdb_info {
+ struct dsa_lag *lag;
+ const unsigned char *addr;
+ u16 vid;
+ struct dsa_db db;
+};
+
+/* DSA_NOTIFIER_MDB_* */
+struct dsa_notifier_mdb_info {
+ const struct dsa_port *dp;
+ const struct switchdev_obj_port_mdb *mdb;
+ struct dsa_db db;
+};
+
+/* DSA_NOTIFIER_LAG_* */
+struct dsa_notifier_lag_info {
+ const struct dsa_port *dp;
+ struct dsa_lag lag;
+ struct netdev_lag_upper_info *info;
+ struct netlink_ext_ack *extack;
+};
+
+/* DSA_NOTIFIER_VLAN_* */
+struct dsa_notifier_vlan_info {
+ const struct dsa_port *dp;
+ const struct switchdev_obj_port_vlan *vlan;
+ struct netlink_ext_ack *extack;
+};
+
+/* DSA_NOTIFIER_MTU */
+struct dsa_notifier_mtu_info {
+ const struct dsa_port *dp;
+ int mtu;
+};
+
+/* DSA_NOTIFIER_TAG_PROTO_* */
+struct dsa_notifier_tag_proto_info {
+ const struct dsa_device_ops *tag_ops;
+};
+
+/* DSA_NOTIFIER_TAG_8021Q_VLAN_* */
+struct dsa_notifier_tag_8021q_vlan_info {
+ const struct dsa_port *dp;
+ u16 vid;
+};
+
+/* DSA_NOTIFIER_CONDUIT_STATE_CHANGE */
+struct dsa_notifier_conduit_state_info {
+ const struct net_device *conduit;
+ bool operational;
+};
+
+struct dsa_vlan *dsa_vlan_find(struct list_head *vlan_list,
+ const struct switchdev_obj_port_vlan *vlan);
+
+int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v);
+int dsa_broadcast(unsigned long e, void *v);
+
+int dsa_switch_register_notifier(struct dsa_switch *ds);
+void dsa_switch_unregister_notifier(struct dsa_switch *ds);
+
+#endif
diff --git a/net/dsa/tag.c b/net/dsa/tag.c
new file mode 100644
index 000000000000..79ad105902d9
--- /dev/null
+++ b/net/dsa/tag.c
@@ -0,0 +1,244 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * DSA tagging protocol handling
+ *
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
+ * Copyright (c) 2016 Andrew Lunn <andrew@lunn.ch>
+ */
+
+#include <linux/netdevice.h>
+#include <linux/ptp_classify.h>
+#include <linux/skbuff.h>
+#include <net/dsa.h>
+#include <net/dst_metadata.h>
+
+#include "tag.h"
+#include "user.h"
+
+/* Registered tag drivers (struct dsa_tag_driver, linked through ->list).
+ * Every walk or modification of the list in this file is done with
+ * dsa_tag_drivers_lock held.
+ */
+static LIST_HEAD(dsa_tag_drivers_list);
+static DEFINE_MUTEX(dsa_tag_drivers_lock);
+
+/* Decide whether delivery of @skb has to wait for an RX timestamp.
+ *
+ * Called from dsa_switch_rcv(). A MAC driver would normally fetch the
+ * hardware timestamp itself when reading the packet, but the DSA switch
+ * driver owning the user port never sees the packet unless it is offered to
+ * it here, through ds->ops->port_rxtstamp().
+ *
+ * Returns false when the switch has no port_rxtstamp operation, when there is
+ * not enough headroom to expose the Ethernet header, or when the frame is not
+ * classified as PTP. Otherwise returns whatever port_rxtstamp() returns.
+ */
+static bool dsa_skb_defer_rx_timestamp(struct dsa_user_priv *p,
+				       struct sk_buff *skb)
+{
+	struct dsa_switch *ds = p->dp->ds;
+	unsigned int ptp_class;
+
+	if (!ds->ops->port_rxtstamp || skb_headroom(skb) < ETH_HLEN)
+		return false;
+
+	/* Classification is done with the Ethernet header temporarily pushed
+	 * back in front of skb->data.
+	 */
+	__skb_push(skb, ETH_HLEN);
+	ptp_class = ptp_classify_raw(skb);
+	__skb_pull(skb, ETH_HLEN);
+
+	if (ptp_class == PTP_CLASS_NONE)
+		return false;
+
+	return ds->ops->port_rxtstamp(ds, p->dp->index, skb, ptp_class);
+}
+
+/* Receive handler for frames arriving on a DSA conduit (see dsa_pack_type).
+ *
+ * Resolves the user interface a frame belongs to - either from hardware port
+ * mux metadata attached to the skb, or by calling the tagger's ->rcv() - and
+ * then delivers the frame on that interface.
+ *
+ * @skb: received frame
+ * @dev: conduit interface the frame arrived on
+ * @pt: unused
+ * @unused: unused
+ *
+ * Always returns 0; the skb is freed or handed off on every path.
+ */
+static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev,
+			  struct packet_type *pt, struct net_device *unused)
+{
+	struct metadata_dst *md_dst = skb_metadata_dst(skb);
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	struct sk_buff *nskb = NULL;
+	struct dsa_user_priv *p;
+
+	/* Not (or no longer) a DSA conduit: nothing can be decoded */
+	if (unlikely(!cpu_dp)) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	skb = skb_unshare(skb, GFP_ATOMIC);
+	if (!skb)
+		return 0;
+
+	if (md_dst && md_dst->type == METADATA_HW_PORT_MUX) {
+		/* The source port was supplied out of band as metadata */
+		unsigned int port = md_dst->u.port_info.port_id;
+
+		skb_dst_drop(skb);
+		if (!skb_has_extensions(skb))
+			skb->slow_gro = 0;
+
+		skb->dev = dsa_conduit_find_user(dev, 0, port);
+		if (likely(skb->dev)) {
+			dsa_default_offload_fwd_mark(skb);
+			nskb = skb;
+		}
+	} else {
+		nskb = cpu_dp->rcv(skb, dev);
+	}
+
+	if (!nskb) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	skb = nskb;
+	skb_push(skb, ETH_HLEN);
+	skb->pkt_type = PACKET_HOST;
+	skb->protocol = eth_type_trans(skb, skb->dev);
+
+	if (unlikely(!dsa_user_dev_check(skb->dev))) {
+		/* Packet is to be injected directly on an upper
+		 * device, e.g. a team/bond, so skip all DSA-port
+		 * specific actions.
+		 */
+		netif_rx(skb);
+		return 0;
+	}
+
+	p = netdev_priv(skb->dev);
+
+	if (unlikely(cpu_dp->ds->untag_bridge_pvid ||
+		     cpu_dp->ds->untag_vlan_aware_bridge_pvid)) {
+		/* dsa_software_vlan_untag() returns NULL only when
+		 * skb_vlan_untag() failed, and skb_vlan_untag() has already
+		 * freed the skb in that case. Freeing it again here would be
+		 * a double free, so just bail out.
+		 */
+		skb = dsa_software_vlan_untag(skb);
+		if (!skb)
+			return 0;
+	}
+
+	dev_sw_netstats_rx_add(skb->dev, skb->len + ETH_HLEN);
+
+	/* A true return means delivery of the skb is deferred to the switch
+	 * driver (see dsa_skb_defer_rx_timestamp()).
+	 */
+	if (dsa_skb_defer_rx_timestamp(p, skb))
+		return 0;
+
+	gro_cells_receive(&p->gcells, skb);
+
+	return 0;
+}
+
+/* Packet type that routes frames with protocol ETH_P_XDSA to
+ * dsa_switch_rcv(). NOTE(review): where this gets registered with the
+ * networking core is not visible in this file.
+ */
+struct packet_type dsa_pack_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_XDSA),
+ .func = dsa_switch_rcv,
+};
+
+/* Record @owner as the module providing @dsa_tag_driver and append the driver
+ * to the list of registered tag drivers.
+ */
+static void dsa_tag_driver_register(struct dsa_tag_driver *dsa_tag_driver,
+				    struct module *owner)
+{
+	struct list_head *entry = &dsa_tag_driver->list;
+
+	dsa_tag_driver->owner = owner;
+
+	/* The list is only ever touched under dsa_tag_drivers_lock */
+	mutex_lock(&dsa_tag_drivers_lock);
+	list_add_tail(entry, &dsa_tag_drivers_list);
+	mutex_unlock(&dsa_tag_drivers_lock);
+}
+
+/* Register an array of tag drivers on behalf of one module.
+ *
+ * @dsa_tag_driver_array: array of pointers to the drivers to register
+ * @count: number of entries in @dsa_tag_driver_array
+ * @owner: module that provides these drivers
+ */
+void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[],
+			      unsigned int count, struct module *owner)
+{
+	unsigned int i;
+
+	for (i = 0; i < count; i++)
+		dsa_tag_driver_register(dsa_tag_driver_array[i], owner);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_drivers_register);
+
+/* Remove @dsa_tag_driver from the list of registered tag drivers. */
+static void dsa_tag_driver_unregister(struct dsa_tag_driver *dsa_tag_driver)
+{
+	mutex_lock(&dsa_tag_drivers_lock);
+	list_del(&dsa_tag_driver->list);
+	mutex_unlock(&dsa_tag_drivers_lock);
+}
+
+/* Unregister every tag driver in an array, in array order.
+ *
+ * @dsa_tag_driver_array: array of pointers to the drivers to unregister
+ * @count: number of entries in @dsa_tag_driver_array
+ */
+void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[],
+				unsigned int count)
+{
+	struct dsa_tag_driver **drv = dsa_tag_driver_array;
+	struct dsa_tag_driver **end = dsa_tag_driver_array + count;
+
+	while (drv != end)
+		dsa_tag_driver_unregister(*drv++);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_drivers_unregister);
+
+/* Return the name string of tagging protocol @ops. */
+const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops)
+{
+	return ops->name;
+}
+
+/* Look up a registered tagger by its name.
+ *
+ * A module load is requested for the "dsa_tag:<name>" alias first. On success
+ * a reference is taken on the module owning the tagger, so the caller must
+ * release it with dsa_tag_driver_put(). Returns ERR_PTR(-ENOPROTOOPT) when no
+ * tagger has that name or its module reference cannot be taken.
+ */
+const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name)
+{
+	const struct dsa_device_ops *found = ERR_PTR(-ENOPROTOOPT);
+	struct dsa_tag_driver *drv;
+
+	request_module("%s%s", DSA_TAG_DRIVER_ALIAS, name);
+
+	mutex_lock(&dsa_tag_drivers_lock);
+	list_for_each_entry(drv, &dsa_tag_drivers_list, list) {
+		if (strcmp(name, drv->ops->name) != 0)
+			continue;
+
+		/* First name match decides the outcome, success or not */
+		if (try_module_get(drv->owner))
+			found = drv->ops;
+		break;
+	}
+	mutex_unlock(&dsa_tag_drivers_lock);
+
+	return found;
+}
+
+/* Look up a registered tagger by its protocol ID.
+ *
+ * A module load is requested for the "dsa_tag:id-<tag_protocol>" alias first.
+ * On success a reference is taken on the module owning the tagger, so the
+ * caller must release it with dsa_tag_driver_put(). Returns
+ * ERR_PTR(-ENOPROTOOPT) when no tagger has that ID or its module reference
+ * cannot be taken.
+ */
+const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol)
+{
+	const struct dsa_device_ops *result = ERR_PTR(-ENOPROTOOPT);
+	struct dsa_tag_driver *drv;
+
+	request_module("%sid-%d", DSA_TAG_DRIVER_ALIAS, tag_protocol);
+
+	mutex_lock(&dsa_tag_drivers_lock);
+	list_for_each_entry(drv, &dsa_tag_drivers_list, list) {
+		if (drv->ops->proto != tag_protocol)
+			continue;
+
+		/* First ID match decides the outcome, success or not */
+		if (try_module_get(drv->owner))
+			result = drv->ops;
+		break;
+	}
+	mutex_unlock(&dsa_tag_drivers_lock);
+
+	return result;
+}
+
+/* Drop the module reference taken by dsa_tag_driver_get_by_name() or
+ * dsa_tag_driver_get_by_id() for the tagger @ops. Does nothing if @ops does
+ * not belong to a registered tag driver.
+ */
+void dsa_tag_driver_put(const struct dsa_device_ops *ops)
+{
+	struct dsa_tag_driver *drv;
+
+	mutex_lock(&dsa_tag_drivers_lock);
+	list_for_each_entry(drv, &dsa_tag_drivers_list, list) {
+		if (drv->ops != ops)
+			continue;
+
+		module_put(drv->owner);
+		break;
+	}
+	mutex_unlock(&dsa_tag_drivers_lock);
+}
diff --git a/net/dsa/tag.h b/net/dsa/tag.h
new file mode 100644
index 000000000000..cf52283fe9df
--- /dev/null
+++ b/net/dsa/tag.h
@@ -0,0 +1,409 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_TAG_H
+#define __DSA_TAG_H
+
+#include <linux/if_vlan.h>
+#include <linux/list.h>
+#include <linux/types.h>
+#include <net/dsa.h>
+
+#include "port.h"
+#include "user.h"
+
+/* One registered tagging protocol driver. */
+struct dsa_tag_driver {
+ const struct dsa_device_ops *ops; /* tagger operations; matched by name or proto on lookup */
+ struct list_head list; /* link in the list of registered tag drivers */
+ struct module *owner; /* providing module; set by dsa_tag_drivers_register() */
+};
+
+extern struct packet_type dsa_pack_type;
+
+const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol);
+const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name);
+void dsa_tag_driver_put(const struct dsa_device_ops *ops);
+const char *dsa_tag_protocol_to_str(const struct dsa_device_ops *ops);
+
+/* Total space needed by tagger @ops: its needed tailroom plus its needed
+ * headroom.
+ */
+static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops)
+{
+	return ops->needed_tailroom + ops->needed_headroom;
+}
+
+/* Find the user interface for switch index @device, port index @port in the
+ * switch tree that conduit @dev belongs to. Returns NULL if the tree has no
+ * such user port.
+ */
+static inline struct net_device *dsa_conduit_find_user(struct net_device *dev,
+							int device, int port)
+{
+	struct dsa_port *cpu_dp = dev->dsa_ptr;
+	struct dsa_port *dp;
+
+	list_for_each_entry(dp, &cpu_dp->dst->ports, list) {
+		if (dp->type != DSA_PORT_TYPE_USER)
+			continue;
+
+		if (dp->ds->index != device || dp->index != port)
+			continue;
+
+		return dp->user;
+	}
+
+	return NULL;
+}
+
+/**
+ * dsa_software_untag_vlan_aware_bridge() - Software untagging for VLAN-aware
+ *	bridge
+ * @skb: Pointer to received socket buffer (packet)
+ * @br: Pointer to bridge upper interface of ingress port
+ * @vid: Parsed VID from packet
+ *
+ * The bridge handles tagged packets, but software such as STP/PTP may not.
+ * The bridge also handles untagged packets, treating them as if tagged with
+ * the PVID of the ingress port. Packets tagged with the PVID of the bridge
+ * port are therefore untagged in software, which serves both use cases.
+ *
+ * The hwaccel tag is cleared only when @vid equals the port PVID and the
+ * skb's VLAN protocol equals the bridge's VLAN protocol.
+ */
+static inline void dsa_software_untag_vlan_aware_bridge(struct sk_buff *skb,
+							 struct net_device *br,
+							 u16 vid)
+{
+	u16 pvid, proto;
+
+	if (br_vlan_get_proto(br, &proto))
+		return;
+
+	if (br_vlan_get_pvid_rcu(skb->dev, &pvid))
+		return;
+
+	if (vid != pvid)
+		return;
+
+	if (skb->vlan_proto != htons(proto))
+		return;
+
+	__vlan_hwaccel_clear_tag(skb);
+}
+
+/**
+ * dsa_software_untag_vlan_unaware_bridge: Software untagging for VLAN-unaware bridge
+ * @skb: Pointer to received socket buffer (packet)
+ * @br: Pointer to bridge upper interface of ingress port
+ * @vid: Parsed VID from packet
+ *
+ * The bridge ignores all VLAN tags. Software like STP/PTP may not (it may run
+ * on the plain port, or on a VLAN upper interface). Maybe packets are coming
+ * to software as tagged with a driver-defined VID which is NOT equal to the
+ * PVID of the bridge port (since the bridge is VLAN-unaware, its configuration
+ * should NOT be committed to hardware). DSA needs a method for this private
+ * VID to be communicated by software to it, and if packets are tagged with it,
+ * software-untag them. Note: the private VID may be different per bridge, to
+ * support the FDB isolation use case.
+ *
+ * FIXME: this is currently implemented based on the broken assumption that
+ * the "private VID" used by the driver in VLAN-unaware mode is equal to the
+ * bridge PVID. It should not be, except for a coincidence; the bridge PVID is
+ * irrelevant to the data path in the VLAN-unaware mode. Thus, the VID that
+ * this function removes is wrong.
+ *
+ * All users of ds->untag_bridge_pvid should fix their drivers, if necessary,
+ * to make the two independent. Only then, if there still remains a need to
+ * strip the private VID from packets, then a new ds->ops->get_private_vid()
+ * API shall be introduced to communicate to DSA what this VID is, which needs
+ * to be stripped here.
+ */
+static inline void dsa_software_untag_vlan_unaware_bridge(struct sk_buff *skb,
+							   struct net_device *br,
+							   u16 vid)
+{
+	u16 pvid, proto;
+
+	if (br_vlan_get_proto(br, &proto) ||
+	    br_vlan_get_pvid_rcu(skb->dev, &pvid))
+		return;
+
+	/* Only a tag carrying the port PVID in the bridge's VLAN protocol is
+	 * a candidate for removal.
+	 */
+	if (vid != pvid || skb->vlan_proto != htons(proto))
+		return;
+
+	/* From here we cannot know, without checking, whether the skb will
+	 * enter the bridge data path - br_allowed_ingress() - at all. An
+	 * 8021q upper for the bridge's default_pvid, for instance, steals
+	 * VLAN-tagged traffic from the bridge data path. DSA supports that
+	 * setup because vlan_filtering is 0, so the tag must be kept when
+	 * such an upper exists, or that upper would stop working.
+	 */
+	if (__vlan_find_dev_deep_rcu(br, htons(proto), vid))
+		return;
+
+	__vlan_hwaccel_clear_tag(skb);
+}
+
+/**
+ * dsa_software_vlan_untag: Software VLAN untagging in DSA receive path
+ * @skb: Pointer to socket buffer (packet)
+ *
+ * Receive path method for switches which send some packets as VLAN-tagged
+ * towards the CPU port (generally from VLAN-aware bridge ports) even when the
+ * packet was not tagged on the wire. Called when ds->untag_bridge_pvid
+ * (legacy) or ds->untag_vlan_aware_bridge_pvid is set to true.
+ *
+ * As a side effect of this method, any VLAN tag from the skb head is moved
+ * to hwaccel.
+ */
+static inline struct sk_buff *dsa_software_vlan_untag(struct sk_buff *skb)
+{
+	struct dsa_port *dp = dsa_user_to_port(skb->dev);
+	struct net_device *br = dsa_port_bridge_dev_get(dp);
+	bool vlan_aware;
+	u16 vid, proto;
+
+	/* Standalone ports do not need software untagging so far */
+	if (!br)
+		return skb;
+
+	if (br_vlan_get_proto(br, &proto))
+		return skb;
+
+	/* A tag still sitting in the packet data is moved to hwaccel first;
+	 * NULL is passed on to the caller when that fails.
+	 */
+	if (!skb_vlan_tag_present(skb) && skb->protocol == htons(proto)) {
+		skb = skb_vlan_untag(skb);
+		if (!skb)
+			return NULL;
+	}
+
+	/* Untagged packet: nothing to strip */
+	if (!skb_vlan_tag_present(skb))
+		return skb;
+
+	vid = skb_vlan_tag_get_id(skb);
+	vlan_aware = br_vlan_enabled(br);
+
+	/* Each bridge mode has its own opt-in flag on the switch */
+	if (vlan_aware && dp->ds->untag_vlan_aware_bridge_pvid)
+		dsa_software_untag_vlan_aware_bridge(skb, br, vid);
+	else if (!vlan_aware && dp->ds->untag_bridge_pvid)
+		dsa_software_untag_vlan_unaware_bridge(skb, br, vid);
+
+	return skb;
+}
+
+/* For switches without hardware support for DSA tagging to be able
+ * to support termination through the bridge.
+ *
+ * Returns the first user port of @conduit's tree that is bridged, is in STP
+ * learning or forwarding state, is served by this conduit's CPU port and for
+ * which the bridge has VLAN information about @vid. Returns NULL if there is
+ * no such port.
+ */
+static inline struct net_device *
+dsa_find_designated_bridge_port_by_vid(struct net_device *conduit, u16 vid)
+{
+	struct dsa_port *cpu_dp = conduit->dsa_ptr;
+	struct bridge_vlan_info vinfo;
+	struct dsa_port *dp;
+
+	list_for_each_entry(dp, &cpu_dp->dst->ports, list) {
+		bool stp_ok = dp->stp_state == BR_STATE_LEARNING ||
+			      dp->stp_state == BR_STATE_FORWARDING;
+
+		if (dp->type != DSA_PORT_TYPE_USER || !dp->bridge || !stp_ok)
+			continue;
+
+		/* The bridge might learn this packet, so stay on the CPU
+		 * port that will also be used for the reply on xmit.
+		 */
+		if (dp->cpu_dp != cpu_dp)
+			continue;
+
+		if (!br_vlan_get_info_rcu(dp->user, vid, &vinfo))
+			return dp->user;
+	}
+
+	return NULL;
+}
+
+/* If the ingress port offloads the bridge, we mark the frame as autonomously
+ * forwarded by hardware, so the software bridge doesn't forward it twice, back
+ * to us, because we already did. However, if we're in fallback mode and we do
+ * software bridging, we are not offloading it, therefore the dp->bridge
+ * pointer is not populated, and flooding needs to be done by software (we are
+ * effectively operating in standalone ports mode).
+ *
+ * skb->dev must already point to the DSA user interface, since the port is
+ * looked up from it.
+ */
+static inline void dsa_default_offload_fwd_mark(struct sk_buff *skb)
+{
+ struct dsa_port *dp = dsa_user_to_port(skb->dev);
+
+ /* 1 when the port has a bridge pointer, 0 otherwise */
+ skb->offload_fwd_mark = !!(dp->bridge);
+}
+
+/* Helper for removing DSA header tags from packets in the RX path.
+ * Must not be called before skb_pull(len).
+ * skb->data
+ * |
+ * v
+ * | | | | | | | | | | | | | | | | | | |
+ * +-----------------------+-----------------------+---------------+-------+
+ * | Destination MAC | Source MAC | DSA header | EType |
+ * +-----------------------+-----------------------+---------------+-------+
+ * | |
+ * <----- len -----> <----- len ----->
+ * |
+ * >>>>>>> v
+ * >>>>>>> | | | | | | | | | | | | | | |
+ * >>>>>>> +-----------------------+-----------------------+-------+
+ * >>>>>>> | Destination MAC | Source MAC | EType |
+ * +-----------------------+-----------------------+-------+
+ * ^
+ * |
+ * skb->data
+ */
+static inline void dsa_strip_etype_header(struct sk_buff *skb, int len)
+{
+	/* Where the MAC addresses must end up: ETH_HLEN before skb->data */
+	unsigned char *new_mac = skb->data - ETH_HLEN;
+
+	/* Both MAC addresses currently sit @len bytes further back; the two
+	 * regions may overlap, hence memmove().
+	 */
+	memmove(new_mac, new_mac - len, 2 * ETH_ALEN);
+}
+
+/* Helper for creating space for DSA header tags in TX path packets.
+ * Must not be called before skb_push(len).
+ *
+ * Before:
+ *
+ * <<<<<<< | | | | | | | | | | | | | | |
+ * ^ <<<<<<< +-----------------------+-----------------------+-------+
+ * | <<<<<<< | Destination MAC | Source MAC | EType |
+ * | +-----------------------+-----------------------+-------+
+ * <----- len ----->
+ * |
+ * |
+ * skb->data
+ *
+ * After:
+ *
+ * | | | | | | | | | | | | | | | | | | |
+ * +-----------------------+-----------------------+---------------+-------+
+ * | Destination MAC | Source MAC | DSA header | EType |
+ * +-----------------------+-----------------------+---------------+-------+
+ * ^ | |
+ * | <----- len ----->
+ * skb->data
+ */
+static inline void dsa_alloc_etype_header(struct sk_buff *skb, int len)
+{
+	unsigned char *new_mac = skb->data;
+
+	/* Slide both MAC addresses to the new start of the frame, opening a
+	 * @len byte gap right after them; the two regions may overlap, hence
+	 * memmove().
+	 */
+	memmove(new_mac, new_mac + len, 2 * ETH_ALEN);
+}
+
+/* On RX, eth_type_trans() on the DSA conduit pulls ETH_HLEN bytes starting from
+ * skb_mac_header(skb), which leaves skb->data pointing at the first byte after
+ * what the DSA conduit perceives as the EtherType (the beginning of the L3
+ * protocol). Since DSA EtherType header taggers treat the EtherType as part of
+ * the DSA tag itself, and the EtherType is 2 bytes in length, the DSA header
+ * is located 2 bytes behind skb->data. Note that EtherType in this context
+ * means the first 2 bytes of the DSA header, not the encapsulated EtherType
+ * that will become visible after the DSA header is stripped.
+ */
+static inline void *dsa_etype_header_pos_rx(struct sk_buff *skb)
+{
+	/* The DSA header begins 2 bytes (one EtherType) before skb->data */
+	unsigned char *tag = skb->data - 2;
+
+	return tag;
+}
+
+/* On TX, skb->data points to the MAC header, which means that EtherType
+ * header taggers start exactly where the EtherType is (the EtherType is
+ * treated as part of the DSA header).
+ */
+static inline void *dsa_etype_header_pos_tx(struct sk_buff *skb)
+{
+	/* The DSA header begins right after the two MAC addresses */
+	unsigned char *tag = skb->data + 2 * ETH_ALEN;
+
+	return tag;
+}
+
+/* Build the bitmask of switch port indices a frame sent on @dev should go
+ * to: always the port behind @dev itself, plus - when CONFIG_HSR is enabled
+ * and @dev has NETIF_F_HW_HSR_DUP set - every port that
+ * dsa_hsr_foreach_port() yields for this port's switch and HSR device.
+ */
+static inline unsigned long dsa_xmit_port_mask(const struct sk_buff *skb,
+					       const struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	unsigned long mask = BIT(dp->index);
+	struct net_device *hsr_dev;
+	struct dsa_port *other_dp;
+
+	if (!IS_ENABLED(CONFIG_HSR))
+		return mask;
+
+	if (likely(!(dev->features & NETIF_F_HW_HSR_DUP)))
+		return mask;
+
+	hsr_dev = dp->hsr_dev;
+	dsa_hsr_foreach_port(other_dp, dp->ds, hsr_dev)
+		mask |= BIT(other_dp->index);
+
+	return mask;
+}
+
+/* Create 2 modaliases per tagging protocol, one to auto-load the module
+ * given the ID reported by get_tag_protocol(), and the other by name.
+ *
+ * The aliases are "dsa_tag:<__name>" and "dsa_tag:id-<value>", where <value>
+ * is the stringified expansion of <__proto>_VALUE. @__name must be a string
+ * literal, since it is pasted next to DSA_TAG_DRIVER_ALIAS.
+ */
+#define DSA_TAG_DRIVER_ALIAS "dsa_tag:"
+#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto, __name) \
+ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __name); \
+ MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS "id-" \
+ __stringify(__proto##_VALUE))
+
+void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[],
+ unsigned int count,
+ struct module *owner);
+void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[],
+ unsigned int count);
+
+/* Generate the module_init()/module_exit() pair of a tag driver module: init
+ * registers the @__count drivers of @__dsa_tag_drivers_array with THIS_MODULE
+ * as owner and returns 0, exit unregisters the same array. The generated
+ * function names are fixed, so this can expand only once per module.
+ */
+#define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count) \
+static int __init dsa_tag_driver_module_init(void) \
+{ \
+ dsa_tag_drivers_register(__dsa_tag_drivers_array, __count, \
+ THIS_MODULE); \
+ return 0; \
+} \
+module_init(dsa_tag_driver_module_init); \
+ \
+static void __exit dsa_tag_driver_module_exit(void) \
+{ \
+ dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count); \
+} \
+module_exit(dsa_tag_driver_module_exit)
+
+/**
+ * module_dsa_tag_drivers() - Helper macro for registering DSA tag
+ * drivers
+ * @__ops_array: Array of pointers to tag driver structures. Must be a true
+ *	array, not a pointer, because its length is taken with ARRAY_SIZE().
+ *
+ * Helper macro for DSA tag drivers which do not do anything special
+ * in module init/exit. Each module may only use this macro once, and
+ * calling it replaces module_init() and module_exit().
+ */
+#define module_dsa_tag_drivers(__ops_array) \
+dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array))
+
+/* Name of the struct dsa_tag_driver variable that DSA_TAG_DRIVER() defines
+ * for @__ops: dsa_tag_driver_<__ops>.
+ */
+#define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops
+
+/* Create a static structure we can build a linked list of dsa_tag
+ * drivers
+ *
+ * Only .ops is initialized here; the remaining members stay zeroed.
+ */
+#define DSA_TAG_DRIVER(__ops) \
+static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = { \
+ .ops = &__ops, \
+}
+
+/**
+ * module_dsa_tag_driver() - Helper macro for registering a single DSA tag
+ * driver
+ * @__ops: The tagger's operations structure (an object, not a pointer; its
+ *	address is taken by the macro)
+ *
+ * Helper macro for DSA tag drivers which do not do anything special
+ * in module init/exit. Each module may only use this macro once, and
+ * calling it replaces module_init() and module_exit().
+ *
+ * Expands to the dsa_tag_driver definition for @__ops, a one-element
+ * dsa_tag_driver_array[] pointing at it, and module_dsa_tag_drivers() on
+ * that array.
+ */
+#define module_dsa_tag_driver(__ops) \
+DSA_TAG_DRIVER(__ops); \
+ \
+static struct dsa_tag_driver *dsa_tag_driver_array[] = { \
+ &DSA_TAG_DRIVER_NAME(__ops) \
+}; \
+module_dsa_tag_drivers(dsa_tag_driver_array)
+
+#endif
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
new file mode 100644
index 000000000000..53e03fd8071b
--- /dev/null
+++ b/net/dsa/tag_8021q.c
@@ -0,0 +1,588 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Vladimir Oltean <olteanv@gmail.com>
+ *
+ * This module is not a complete tagger implementation. It only provides
+ * primitives for taggers that rely on 802.1Q VLAN tags to use.
+ */
+#include <linux/if_vlan.h>
+#include <linux/dsa/8021q.h>
+
+#include "port.h"
+#include "switch.h"
+#include "tag.h"
+#include "tag_8021q.h"
+
+/* Binary structure of the fake 12-bit VID field (when the TPID is
+ * ETH_P_DSA_8021Q):
+ *
+ * | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
+ * +-----------+-----+-----------------+-----------+-----------------------+
+ * | RSV | VBID| SWITCH_ID | VBID | PORT |
+ * +-----------+-----+-----------------+-----------+-----------------------+
+ *
+ * RSV - VID[11:10]:
+ * Reserved. Must be set to 3 (0b11).
+ *
+ * SWITCH_ID - VID[8:6]:
+ * Index of switch within DSA tree. Must be between 0 and 7.
+ *
+ * VBID - { VID[9], VID[5:4] }:
+ * Virtual bridge ID. If between 1 and 7, packet targets the broadcast
+ * domain of a bridge. If transmitted as zero, packet targets a single
+ * port.
+ *
+ * PORT - VID[3:0]:
+ * Index of switch port. Must be between 0 and 15.
+ */
+
+/* RSV field, VID[11:10]: DSA_8021Q_RSV is the constant 3 already shifted into
+ * place.
+ */
+#define DSA_8021Q_RSV_VAL 3
+#define DSA_8021Q_RSV_SHIFT 10
+#define DSA_8021Q_RSV_MASK GENMASK(11, 10)
+#define DSA_8021Q_RSV ((DSA_8021Q_RSV_VAL << DSA_8021Q_RSV_SHIFT) & \
+ DSA_8021Q_RSV_MASK)
+
+/* SWITCH_ID field, VID[8:6]: DSA_8021Q_SWITCH_ID(x) shifts x into place and
+ * drops any bits that do not fit in the 3-bit field.
+ */
+#define DSA_8021Q_SWITCH_ID_SHIFT 6
+#define DSA_8021Q_SWITCH_ID_MASK GENMASK(8, 6)
+#define DSA_8021Q_SWITCH_ID(x) (((x) << DSA_8021Q_SWITCH_ID_SHIFT) & \
+ DSA_8021Q_SWITCH_ID_MASK)
+
+/* VBID field, split in two: bit 2 of the VBID goes to VID[9] (HI) and bits
+ * 1:0 go to VID[5:4] (LO). DSA_8021Q_VBID(x) encodes both parts.
+ */
+#define DSA_8021Q_VBID_HI_SHIFT 9
+#define DSA_8021Q_VBID_HI_MASK GENMASK(9, 9)
+#define DSA_8021Q_VBID_LO_SHIFT 4
+#define DSA_8021Q_VBID_LO_MASK GENMASK(5, 4)
+#define DSA_8021Q_VBID_HI(x) (((x) & GENMASK(2, 2)) >> 2)
+#define DSA_8021Q_VBID_LO(x) ((x) & GENMASK(1, 0))
+#define DSA_8021Q_VBID(x) \
+ (((DSA_8021Q_VBID_LO(x) << DSA_8021Q_VBID_LO_SHIFT) & \
+ DSA_8021Q_VBID_LO_MASK) | \
+ ((DSA_8021Q_VBID_HI(x) << DSA_8021Q_VBID_HI_SHIFT) & \
+ DSA_8021Q_VBID_HI_MASK))
+
+/* PORT field, VID[3:0]: DSA_8021Q_PORT(x) keeps the low 4 bits of x. */
+#define DSA_8021Q_PORT_SHIFT 0
+#define DSA_8021Q_PORT_MASK GENMASK(3, 0)
+#define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \
+ DSA_8021Q_PORT_MASK)
+
+/* Refcounted record of one tag_8021q VLAN installed on a CPU or DSA port.
+ * User ports are not tracked this way.
+ */
+struct dsa_tag_8021q_vlan {
+ struct list_head list; /* link in dsa_8021q_context::vlans */
+ int port; /* port index; lookup key together with @vid */
+ u16 vid; /* VLAN ID; lookup key together with @port */
+ refcount_t refcount; /* add requests still outstanding for (port, vid) */
+};
+
+/* Per-switch tag_8021q state, allocated by dsa_tag_8021q_register() and
+ * reachable through ds->tag_8021q_ctx.
+ */
+struct dsa_8021q_context {
+ struct dsa_switch *ds; /* switch this context belongs to */
+ struct list_head vlans; /* list of struct dsa_tag_8021q_vlan */
+ /* EtherType of RX VID, used for filtering on conduit interface */
+ __be16 proto;
+};
+
+/* Returns the tag_8021q VID shared by the ports of bridge @bridge_num: the
+ * reserved bits plus @bridge_num encoded as the VBID.
+ */
+u16 dsa_tag_8021q_bridge_vid(unsigned int bridge_num)
+{
+	u16 vid = DSA_8021Q_RSV;
+
+	/* A VBID of 0 is reserved for precise TX, and 0 is also not a valid
+	 * bridge_num, so the two never collide.
+	 */
+	vid |= DSA_8021Q_VBID(bridge_num);
+
+	return vid;
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_vid);
+
+/* Returns the VID to install as pvid on switch port @dp. It is sent tagged on
+ * egress towards the CPU port and decoded by the rcv function. The VID holds
+ * the reserved bits, the switch index and the port index.
+ */
+u16 dsa_tag_8021q_standalone_vid(const struct dsa_port *dp)
+{
+	u16 vid = DSA_8021Q_RSV;
+
+	vid |= DSA_8021Q_SWITCH_ID(dp->ds->index);
+	vid |= DSA_8021Q_PORT(dp->index);
+
+	return vid;
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_standalone_vid);
+
+/* Extracts the SWITCH_ID field, VID[8:6], from RX VID @vid. */
+int dsa_8021q_rx_switch_id(u16 vid)
+{
+	unsigned long field = vid & DSA_8021Q_SWITCH_ID_MASK;
+
+	return field >> DSA_8021Q_SWITCH_ID_SHIFT;
+}
+EXPORT_SYMBOL_GPL(dsa_8021q_rx_switch_id);
+
+/* Extracts the PORT field, VID[3:0], from RX VID @vid. */
+int dsa_8021q_rx_source_port(u16 vid)
+{
+	unsigned long field = vid & DSA_8021Q_PORT_MASK;
+
+	return field >> DSA_8021Q_PORT_SHIFT;
+}
+EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port);
+
+/* Reassembles the 3-bit VBID from RX VID @vid: VID[9] supplies bit 2 and
+ * VID[5:4] supply bits 1:0.
+ */
+static int dsa_tag_8021q_rx_vbid(u16 vid)
+{
+	u16 lo = (vid & DSA_8021Q_VBID_LO_MASK) >> DSA_8021Q_VBID_LO_SHIFT;
+	u16 hi = (vid & DSA_8021Q_VBID_HI_MASK) >> DSA_8021Q_VBID_HI_SHIFT;
+
+	return lo | (hi << 2);
+}
+
+/* Tells whether @vid is a tag_8021q VID, i.e. whether its reserved bits
+ * VID[11:10] both are set.
+ */
+bool vid_is_dsa_8021q(u16 vid)
+{
+	return (vid & DSA_8021Q_RSV_MASK) == DSA_8021Q_RSV;
+}
+EXPORT_SYMBOL_GPL(vid_is_dsa_8021q);
+
+/* Look up the tracked VLAN entry for (@port, @vid) in @ctx. Returns NULL when
+ * there is none.
+ */
+static struct dsa_tag_8021q_vlan *
+dsa_tag_8021q_vlan_find(struct dsa_8021q_context *ctx, int port, u16 vid)
+{
+	struct dsa_tag_8021q_vlan *entry;
+
+	list_for_each_entry(entry, &ctx->vlans, list) {
+		if (entry->port != port || entry->vid != vid)
+			continue;
+
+		return entry;
+	}
+
+	return NULL;
+}
+
+/* Install tag_8021q VLAN @vid with bridge VLAN @flags on port @dp.
+ *
+ * User ports go straight to the driver. CPU and DSA ports are refcounted per
+ * (port, vid): only the first request reaches the driver, later ones just
+ * bump the count. Returns 0 or a negative error code.
+ */
+static int dsa_port_do_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid,
+					  u16 flags)
+{
+	struct dsa_switch *ds = dp->ds;
+	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
+	struct dsa_tag_8021q_vlan *entry;
+	int port = dp->index;
+	int err;
+
+	/* User ports are not refcounted */
+	if (!dsa_port_is_cpu(dp) && !dsa_port_is_dsa(dp))
+		return ds->ops->tag_8021q_vlan_add(ds, port, vid, flags);
+
+	entry = dsa_tag_8021q_vlan_find(ctx, port, vid);
+	if (entry) {
+		refcount_inc(&entry->refcount);
+		return 0;
+	}
+
+	/* Allocate before touching hardware so -ENOMEM leaves it untouched */
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	err = ds->ops->tag_8021q_vlan_add(ds, port, vid, flags);
+	if (!err) {
+		entry->vid = vid;
+		entry->port = port;
+		refcount_set(&entry->refcount, 1);
+		list_add_tail(&entry->list, &ctx->vlans);
+		return 0;
+	}
+
+	kfree(entry);
+
+	return err;
+}
+
+/* Remove tag_8021q VLAN @vid from port @dp.
+ *
+ * User ports go straight to the driver. On CPU and DSA ports the refcount of
+ * the (port, vid) entry is dropped, and the driver is called only when it
+ * reaches zero. Returns 0, -ENOENT if the VLAN is not tracked, or the
+ * driver's error code.
+ */
+static int dsa_port_do_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid)
+{
+	struct dsa_switch *ds = dp->ds;
+	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
+	struct dsa_tag_8021q_vlan *entry;
+	int port = dp->index;
+	int err;
+
+	/* User ports are not refcounted */
+	if (!dsa_port_is_cpu(dp) && !dsa_port_is_dsa(dp))
+		return ds->ops->tag_8021q_vlan_del(ds, port, vid);
+
+	entry = dsa_tag_8021q_vlan_find(ctx, port, vid);
+	if (!entry)
+		return -ENOENT;
+
+	/* Still referenced by somebody else */
+	if (!refcount_dec_and_test(&entry->refcount))
+		return 0;
+
+	err = ds->ops->tag_8021q_vlan_del(ds, port, vid);
+	if (!err) {
+		list_del(&entry->list);
+		kfree(entry);
+		return 0;
+	}
+
+	/* The VLAN is still in hardware: take the reference back */
+	refcount_set(&entry->refcount, 1);
+
+	return err;
+}
+
+/* A tag_8021q VLAN event applies to the port named in @info and to every CPU
+ * and DSA port.
+ */
+static bool
+dsa_port_tag_8021q_vlan_match(struct dsa_port *dp,
+			      struct dsa_notifier_tag_8021q_vlan_info *info)
+{
+	if (dp == info->dp)
+		return true;
+
+	return dsa_port_is_cpu(dp) || dsa_port_is_dsa(dp);
+}
+
+/* Install the tag_8021q VLAN described by @info on every matching port of
+ * @ds. User ports get it as untagged PVID, other ports with no flags.
+ * Stops at, and returns, the first error; returns 0 otherwise.
+ */
+int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds,
+				  struct dsa_notifier_tag_8021q_vlan_info *info)
+{
+	struct dsa_port *dp;
+	int err;
+
+	/* This may be reached through dsa_broadcast(), so @ds can be a switch
+	 * in another tree that has no tag_8021q support, or one that has it
+	 * but has not registered for it (yet) because another tagger is in
+	 * use. Neither is an error.
+	 */
+	if (!ds->ops->tag_8021q_vlan_add || !ds->tag_8021q_ctx)
+		return 0;
+
+	dsa_switch_for_each_port(dp, ds) {
+		u16 flags = 0;
+
+		if (!dsa_port_tag_8021q_vlan_match(dp, info))
+			continue;
+
+		if (dsa_port_is_user(dp))
+			flags = BRIDGE_VLAN_INFO_UNTAGGED |
+				BRIDGE_VLAN_INFO_PVID;
+
+		err = dsa_port_do_tag_8021q_vlan_add(dp, info->vid, flags);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/* Remove the tag_8021q VLAN described by @info from every matching port of
+ * @ds. Switches without tag_8021q support or context are silently skipped.
+ * Stops at, and returns, the first error; returns 0 otherwise.
+ */
+int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds,
+				  struct dsa_notifier_tag_8021q_vlan_info *info)
+{
+	struct dsa_port *dp;
+	int err;
+
+	if (!ds->ops->tag_8021q_vlan_del || !ds->tag_8021q_ctx)
+		return 0;
+
+	dsa_switch_for_each_port(dp, ds) {
+		if (!dsa_port_tag_8021q_vlan_match(dp, info))
+			continue;
+
+		err = dsa_port_do_tag_8021q_vlan_del(dp, info->vid);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/* There are 2 ways of offloading tag_8021q VLANs.
+ *
+ * One is to use a hardware TCAM to push the port's standalone VLAN into the
+ * frame when forwarding it to the CPU, as an egress modification rule on the
+ * CPU port. This is preferable because it has no side effects for the
+ * autonomous forwarding path, and accomplishes tag_8021q's primary goal of
+ * identifying the source port of each packet based on VLAN ID.
+ *
+ * The other is to commit the tag_8021q VLAN as a PVID to the VLAN table, and
+ * to configure the port as VLAN-unaware. This is less preferable because
+ * unique source port identification can only be done for standalone ports;
+ * under a VLAN-unaware bridge, all ports share the same tag_8021q VLAN as
+ * PVID, and under a VLAN-aware bridge, packets received by software will not
+ * have tag_8021q VLANs appended, just bridge VLANs.
+ *
+ * For tag_8021q implementations of the second type, this method is used to
+ * replace the standalone tag_8021q VLAN of a port with the tag_8021q VLAN to
+ * be used for VLAN-unaware bridging.
+ */
+int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, int port,
+			      struct dsa_bridge bridge, bool *tx_fwd_offload,
+			      struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_to_port(ds, port);
+	u16 bridge_vid = dsa_tag_8021q_bridge_vid(bridge.num);
+	u16 standalone_vid = dsa_tag_8021q_standalone_vid(dp);
+	int err;
+
+	/* Swap the port's standalone VLAN for the bridging VLAN. The new
+	 * VLAN goes in first, so a failure leaves the old one in place.
+	 */
+	err = dsa_port_tag_8021q_vlan_add(dp, bridge_vid, true);
+	if (err)
+		return err;
+
+	dsa_port_tag_8021q_vlan_del(dp, standalone_vid, false);
+
+	*tx_fwd_offload = true;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_join);
+
+/* Counterpart of dsa_tag_8021q_bridge_join(): give port @port of @ds its
+ * standalone tag_8021q VLAN back and drop the VLAN of @bridge.
+ *
+ * The function cannot report failure. If the standalone VLAN cannot be
+ * restored, that is logged and the bridging VLAN is removed regardless.
+ */
+void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port,
+				struct dsa_bridge bridge)
+{
+	struct dsa_port *dp = dsa_to_port(ds, port);
+	u16 standalone_vid, bridge_vid;
+	int err;
+
+	/* Delete the bridging VLAN of the port and replace it with a
+	 * standalone VLAN
+	 */
+	standalone_vid = dsa_tag_8021q_standalone_vid(dp);
+	bridge_vid = dsa_tag_8021q_bridge_vid(bridge.num);
+
+	err = dsa_port_tag_8021q_vlan_add(dp, standalone_vid, false);
+	if (err) {
+		/* The failing operation is the ADD of the standalone VLAN */
+		dev_err(ds->dev,
+			"Failed to add tag_8021q standalone VLAN %d to port %d: %pe\n",
+			standalone_vid, port, ERR_PTR(err));
+	}
+
+	dsa_port_tag_8021q_vlan_del(dp, bridge_vid, true);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_bridge_leave);
+
+/* Install the standalone tag_8021q VLAN of port @port and add that VID to the
+ * RX filter of the port's conduit. Only user ports are handled; for any other
+ * port type this does nothing and returns 0.
+ */
+static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port)
+{
+	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
+	struct dsa_port *dp = dsa_to_port(ds, port);
+	u16 vid = dsa_tag_8021q_standalone_vid(dp);
+	struct net_device *conduit;
+	int err;
+
+	/* CPU ports get configured implicitly, as a side effect of
+	 * configuring the front-panel ports.
+	 */
+	if (!dsa_port_is_user(dp))
+		return 0;
+
+	conduit = dsa_port_to_conduit(dp);
+
+	err = dsa_port_tag_8021q_vlan_add(dp, vid, false);
+	if (err) {
+		dev_err(ds->dev,
+			"Failed to apply standalone VID %d to port %d: %pe\n",
+			vid, port, ERR_PTR(err));
+		return err;
+	}
+
+	/* Let the conduit's RX filter accept the VID as well */
+	vlan_vid_add(conduit, ctx->proto, vid);
+
+	return 0;
+}
+
+/* Undo dsa_tag_8021q_port_setup() for port @port: remove its standalone
+ * tag_8021q VLAN and drop that VID from the conduit's RX filter. Only user
+ * ports are handled.
+ */
+static void dsa_tag_8021q_port_teardown(struct dsa_switch *ds, int port)
+{
+	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
+	struct dsa_port *dp = dsa_to_port(ds, port);
+	u16 vid = dsa_tag_8021q_standalone_vid(dp);
+	struct net_device *conduit;
+
+	/* CPU ports get configured implicitly, as a side effect of
+	 * configuring the front-panel ports.
+	 */
+	if (!dsa_port_is_user(dp))
+		return;
+
+	conduit = dsa_port_to_conduit(dp);
+
+	dsa_port_tag_8021q_vlan_del(dp, vid, false);
+	vlan_vid_del(conduit, ctx->proto, vid);
+}
+
+/* Run dsa_tag_8021q_port_setup() on every port of @ds, in index order. Stops
+ * at the first negative return value and passes it on; ports already set up
+ * are left as they are. Must be called with the RTNL lock held.
+ */
+static int dsa_tag_8021q_setup(struct dsa_switch *ds)
+{
+	int err, port;
+
+	ASSERT_RTNL();
+
+	for (port = 0; port < ds->num_ports; port++) {
+		err = dsa_tag_8021q_port_setup(ds, port);
+		if (err >= 0)
+			continue;
+
+		dev_err(ds->dev,
+			"Failed to setup VLAN tagging for port %d: %pe\n",
+			port, ERR_PTR(err));
+		return err;
+	}
+
+	return 0;
+}
+
+/* Run dsa_tag_8021q_port_teardown() on every port of @ds, in index order.
+ * Must be called with the RTNL lock held.
+ */
+static void dsa_tag_8021q_teardown(struct dsa_switch *ds)
+{
+	int port = 0;
+
+	ASSERT_RTNL();
+
+	while (port < ds->num_ports) {
+		dsa_tag_8021q_port_teardown(ds, port);
+		port++;
+	}
+}
+
+/* Allocate the tag_8021q context of @ds and set up the standalone VLANs of
+ * its ports.
+ *
+ * @ds: switch to enable tag_8021q on
+ * @proto: EtherType of the RX VID, used for filtering on the conduit
+ *
+ * Returns 0 on success, -ENOMEM if the context cannot be allocated, or the
+ * error from dsa_tag_8021q_setup(). On failure ds->tag_8021q_ctx is NULL
+ * again and no context memory stays allocated.
+ */
+int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto)
+{
+	struct dsa_tag_8021q_vlan *v, *n;
+	struct dsa_8021q_context *ctx;
+	int err;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->proto = proto;
+	ctx->ds = ds;
+
+	INIT_LIST_HEAD(&ctx->vlans);
+
+	/* Must be published before setup, which reaches the context through
+	 * ds->tag_8021q_ctx.
+	 */
+	ds->tag_8021q_ctx = ctx;
+
+	err = dsa_tag_8021q_setup(ds);
+	if (err)
+		goto err_free;
+
+	return 0;
+
+err_free:
+	/* A partially completed setup may already have queued VLAN entries
+	 * on the context; release them so they do not leak with it.
+	 */
+	list_for_each_entry_safe(v, n, &ctx->vlans, list) {
+		list_del(&v->list);
+		kfree(v);
+	}
+
+	/* Do not leave a dangling pointer behind: the VLAN add/del handlers
+	 * check ds->tag_8021q_ctx to decide whether tag_8021q is in use.
+	 */
+	ds->tag_8021q_ctx = NULL;
+
+	kfree(ctx);
+	return err;
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_register);
+
+/* Tear down the tag_8021q state of @ds: remove the per-port standalone
+ * VLANs, free any VLAN entries still tracked, then clear and free the
+ * context.
+ */
+void dsa_tag_8021q_unregister(struct dsa_switch *ds)
+{
+	struct dsa_8021q_context *ctx = ds->tag_8021q_ctx;
+	struct dsa_tag_8021q_vlan *entry;
+
+	dsa_tag_8021q_teardown(ds);
+
+	/* Drain whatever the teardown left on the list */
+	while (!list_empty(&ctx->vlans)) {
+		entry = list_first_entry(&ctx->vlans,
+					 struct dsa_tag_8021q_vlan, list);
+		list_del(&entry->list);
+		kfree(entry);
+	}
+
+	ds->tag_8021q_ctx = NULL;
+
+	kfree(ctx);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_unregister);
+
+/* Insert a VLAN tag with TPID @tpid (host byte order) and TCI @tci into @skb
+ * and return what vlan_insert_tag() returns. @netdev is not used.
+ */
+struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
+			       u16 tpid, u16 tci)
+{
+	__be16 vlan_proto = htons(tpid);
+
+	/* vlan_insert_tag() expects skb->data at the MAC header, which is
+	 * where it points here.
+	 */
+	return vlan_insert_tag(skb, vlan_proto, tci);
+}
+EXPORT_SYMBOL_GPL(dsa_8021q_xmit);
+
+/* Pick a user interface to receive a packet that is only known to come from
+ * bridge number @vbid: the first user port in @conduit's tree that is
+ * bridged, is in STP learning or forwarding state, is served by this
+ * conduit's CPU port and whose bridge number equals @vbid. Returns NULL (with
+ * a warning) for @vbid 0, and NULL when no port qualifies.
+ */
+static struct net_device *
+dsa_tag_8021q_find_port_by_vbid(struct net_device *conduit, int vbid)
+{
+	struct dsa_port *cpu_dp = conduit->dsa_ptr;
+	struct dsa_port *dp;
+
+	if (WARN_ON(!vbid))
+		return NULL;
+
+	dsa_tree_for_each_user_port(dp, cpu_dp->dst) {
+		bool stp_ok = dp->stp_state == BR_STATE_LEARNING ||
+			      dp->stp_state == BR_STATE_FORWARDING;
+
+		if (!dp->bridge || !stp_ok || dp->cpu_dp != cpu_dp)
+			continue;
+
+		if (dsa_port_bridge_num_get(dp) == vbid)
+			return dp->user;
+	}
+
+	return NULL;
+}
+
+/* Resolve the user interface for a received packet from whatever source
+ * information is available, in order of preference:
+ *  1. precise @source_port and @switch_id (both different from -1);
+ *  2. bridge number @vbid, when >= 1;
+ *  3. a designated bridge port for VLAN @vid.
+ */
+struct net_device *dsa_tag_8021q_find_user(struct net_device *conduit,
+					   int source_port, int switch_id,
+					   int vid, int vbid)
+{
+	bool precise = source_port != -1 && switch_id != -1;
+
+	if (precise)
+		return dsa_conduit_find_user(conduit, switch_id, source_port);
+
+	if (vbid >= 1)
+		return dsa_tag_8021q_find_port_by_vbid(conduit, vbid);
+
+	return dsa_find_designated_bridge_port_by_vid(conduit, vid);
+}
+EXPORT_SYMBOL_GPL(dsa_tag_8021q_find_user);
+
+/**
+ * dsa_8021q_rcv - Decode source information from tag_8021q header
+ * @skb: RX socket buffer
+ * @source_port: pointer to storage for precise source port information.
+ * If this is known already from outside tag_8021q, the pre-initialized
+ * value is preserved. If not known, pass -1.
+ * @switch_id: similar to source_port.
+ * @vbid: pointer to storage for imprecise bridge ID. Must be pre-initialized
+ * with -1. If a positive value is returned, the source_port and switch_id
+ * are invalid.
+ * @vid: pointer to storage for original VID, in case tag_8021q decoding failed.
+ *
+ * If the packet has a tag_8021q header, decode it and set @source_port,
+ * @switch_id and @vbid, and strip the header. Otherwise set @vid and keep the
+ * header in the hwaccel area of the packet.
+ */
+void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id,
+		   int *vbid, int *vid)
+{
+	int rx_port, rx_switch, rx_vbid;
+	__be16 vlan_proto;
+	u16 rx_vid, tci;
+
+	/* Take the tag off the packet, from hwaccel or from the data */
+	if (skb_vlan_tag_present(skb)) {
+		vlan_proto = skb->vlan_proto;
+		tci = skb_vlan_tag_get(skb);
+		__vlan_hwaccel_clear_tag(skb);
+	} else {
+		struct vlan_ethhdr *hdr = vlan_eth_hdr(skb);
+
+		vlan_proto = hdr->h_vlan_proto;
+		skb_push_rcsum(skb, ETH_HLEN);
+		__skb_vlan_pop(skb, &tci);
+		skb_pull_rcsum(skb, ETH_HLEN);
+	}
+
+	rx_vid = tci & VLAN_VID_MASK;
+	if (!vid_is_dsa_8021q(rx_vid)) {
+		/* Ordinary VLAN: report the VID for further processing by
+		 * the caller and restore the tag, now in hwaccel.
+		 */
+		if (vid)
+			*vid = rx_vid;
+
+		__vlan_hwaccel_put_tag(skb, vlan_proto, tci);
+
+		return;
+	}
+
+	rx_port = dsa_8021q_rx_source_port(rx_vid);
+	rx_switch = dsa_8021q_rx_switch_id(rx_vid);
+	rx_vbid = dsa_tag_8021q_rx_vbid(rx_vid);
+
+	/* With a non-zero VBID the packet comes from a VLAN-unaware bridging
+	 * domain; the port and switch fields are zero then and carry no
+	 * information. With a zero VBID they are filled in, but only where
+	 * the caller did not already supply a value from a hardware-specific
+	 * mechanism, so a valid value never gets overwritten by a less
+	 * precise one.
+	 */
+	if (rx_vbid == 0) {
+		if (*source_port == -1)
+			*source_port = rx_port;
+		if (*switch_id == -1)
+			*switch_id = rx_switch;
+	}
+
+	if (vbid)
+		*vbid = rx_vbid;
+
+	skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+}
+EXPORT_SYMBOL_GPL(dsa_8021q_rcv);
diff --git a/net/dsa/tag_8021q.h b/net/dsa/tag_8021q.h
new file mode 100644
index 000000000000..27b8906f99ec
--- /dev/null
+++ b/net/dsa/tag_8021q.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_TAG_8021Q_H
+#define __DSA_TAG_8021Q_H
+
+#include <net/dsa.h>
+
+#include "switch.h"
+
+struct sk_buff;
+struct net_device;
+
+struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev,
+ u16 tpid, u16 tci);
+
+void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id,
+ int *vbid, int *vid);
+
+struct net_device *dsa_tag_8021q_find_user(struct net_device *conduit,
+ int source_port, int switch_id,
+ int vid, int vbid);
+
+int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds,
+ struct dsa_notifier_tag_8021q_vlan_info *info);
+int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds,
+ struct dsa_notifier_tag_8021q_vlan_info *info);
+
+#endif
diff --git a/net/dsa/tag_ar9331.c b/net/dsa/tag_ar9331.c
new file mode 100644
index 000000000000..cbb588ca73aa
--- /dev/null
+++ b/net/dsa/tag_ar9331.c
@@ -0,0 +1,95 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2019 Pengutronix, Oleksij Rempel <kernel@pengutronix.de>
+ */
+
+
+#include <linux/bitfield.h>
+#include <linux/etherdevice.h>
+
+#include "tag.h"
+
+#define AR9331_NAME "ar9331"
+
+#define AR9331_HDR_LEN 2
+#define AR9331_HDR_VERSION 1
+
+#define AR9331_HDR_VERSION_MASK GENMASK(15, 14)
+#define AR9331_HDR_PRIORITY_MASK GENMASK(13, 12)
+#define AR9331_HDR_TYPE_MASK GENMASK(10, 8)
+#define AR9331_HDR_BROADCAST BIT(7)
+#define AR9331_HDR_FROM_CPU BIT(6)
+/* AR9331_HDR_RESERVED - not used or may be version field.
+ * According to the AR8216 doc it should be 0b10. On AR9331 it is 0b11 on the
+ * RX path and should be set to 0b11 on TX to make it work.
+ */
+#define AR9331_HDR_RESERVED_MASK GENMASK(5, 4)
+#define AR9331_HDR_PORT_NUM_MASK GENMASK(3, 0)
+
+static struct sk_buff *ar9331_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ __le16 *phdr;
+ u16 hdr;
+
+ phdr = skb_push(skb, AR9331_HDR_LEN);
+
+ hdr = FIELD_PREP(AR9331_HDR_VERSION_MASK, AR9331_HDR_VERSION);
+ hdr |= AR9331_HDR_FROM_CPU | dp->index;
+ /* 0b10 for AR8216 and 0b11 for AR9331 */
+ hdr |= AR9331_HDR_RESERVED_MASK;
+
+ phdr[0] = cpu_to_le16(hdr);
+
+ return skb;
+}
+
+static struct sk_buff *ar9331_tag_rcv(struct sk_buff *skb,
+ struct net_device *ndev)
+{
+ u8 ver, port;
+ u16 hdr;
+
+ if (unlikely(!pskb_may_pull(skb, AR9331_HDR_LEN)))
+ return NULL;
+
+ hdr = le16_to_cpu(*(__le16 *)skb_mac_header(skb));
+
+ ver = FIELD_GET(AR9331_HDR_VERSION_MASK, hdr);
+ if (unlikely(ver != AR9331_HDR_VERSION)) {
+ netdev_warn_once(ndev, "%s:%i wrong header version 0x%04x\n",
+ __func__, __LINE__, hdr);
+ return NULL;
+ }
+
+ if (unlikely(hdr & AR9331_HDR_FROM_CPU)) {
+ netdev_warn_once(ndev, "%s:%i packet should not be from cpu 0x%04x\n",
+ __func__, __LINE__, hdr);
+ return NULL;
+ }
+
+ skb_pull_rcsum(skb, AR9331_HDR_LEN);
+
+ /* Source port number is carried in header bits 3:0; look up its user netdev */
+ port = FIELD_GET(AR9331_HDR_PORT_NUM_MASK, hdr);
+
+ skb->dev = dsa_conduit_find_user(ndev, 0, port);
+ if (!skb->dev)
+ return NULL;
+
+ return skb;
+}
+
+static const struct dsa_device_ops ar9331_netdev_ops = {
+ .name = AR9331_NAME,
+ .proto = DSA_TAG_PROTO_AR9331,
+ .xmit = ar9331_tag_xmit,
+ .rcv = ar9331_tag_rcv,
+ .needed_headroom = AR9331_HDR_LEN,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for Atheros AR9331 SoC with built-in switch");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_AR9331, AR9331_NAME);
+module_dsa_tag_driver(ar9331_netdev_ops);
diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c
index 2b06bb91318b..cf9420439054 100644
--- a/net/dsa/tag_brcm.c
+++ b/net/dsa/tag_brcm.c
@@ -1,26 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* Broadcom tag support
*
* Copyright (C) 2014 Broadcom Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
+#include <linux/dsa/brcm.h>
#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
#include <linux/list.h>
#include <linux/slab.h>
-#include "dsa_priv.h"
+#include "tag.h"
-/* This tag length is 4 bytes, older ones were 6 bytes, we do not
- * handle them
- */
+#define BRCM_NAME "brcm"
+#define BRCM_LEGACY_NAME "brcm-legacy"
+#define BRCM_LEGACY_FCS_NAME "brcm-legacy-fcs"
+#define BRCM_PREPEND_NAME "brcm-prepend"
+
+/* Legacy Broadcom tag (6 bytes) */
+#define BRCM_LEG_TAG_LEN 6
+
+/* Type fields */
+/* 1st byte in the tag */
+#define BRCM_LEG_TYPE_HI 0x88
+/* 2nd byte in the tag */
+#define BRCM_LEG_TYPE_LO 0x74
+
+/* Tag fields */
+/* 3rd byte in the tag */
+#define BRCM_LEG_UNICAST (0 << 5)
+#define BRCM_LEG_MULTICAST (1 << 5)
+#define BRCM_LEG_EGRESS (2 << 5)
+#define BRCM_LEG_INGRESS (3 << 5)
+#define BRCM_LEG_LEN_HI(x) (((x) >> 8) & 0x7)
+
+/* 4th byte in the tag */
+#define BRCM_LEG_LEN_LO(x) ((x) & 0xff)
+
+/* 6th byte in the tag */
+#define BRCM_LEG_PORT_ID (0xf)
+
+/* Newer Broadcom tag (4 bytes) */
#define BRCM_TAG_LEN 4
-/* Tag is constructed and desconstructed using byte by byte access
+/* Tag is constructed and deconstructed using byte by byte access
* because the tag is placed after the MAC Source Address, which does
* not make it 4-bytes aligned, so this might cause unaligned accesses
* on most systems where this is used.
@@ -59,17 +83,18 @@
#define BRCM_EG_TC_MASK 0x7
#define BRCM_EG_PID_MASK 0x1f
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM) || \
+ IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND)
+
static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
struct net_device *dev,
unsigned int offset)
{
- struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
u16 queue = skb_get_queue_mapping(skb);
+ u16 port_mask;
u8 *brcm_tag;
- if (skb_cow_head(skb, BRCM_TAG_LEN) < 0)
- return NULL;
-
/* The Ethernet switch we are interfaced with needs packets to be at
* least 64 bytes (including FCS) otherwise they will be discarded when
* they enter the switch port logic. When Broadcom tags are enabled, we
@@ -77,7 +102,7 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
* (including FCS and tag) because the length verification is done after
* the Broadcom tag is stripped off the ingress packet.
*
- * Let dsa_slave_xmit() free the SKB
+ * Let dsa_user_xmit() free the SKB
*/
if (__skb_put_padto(skb, ETH_ZLEN + BRCM_TAG_LEN, false))
return NULL;
@@ -85,22 +110,21 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
skb_push(skb, BRCM_TAG_LEN);
if (offset)
- memmove(skb->data, skb->data + BRCM_TAG_LEN, offset);
+ dsa_alloc_etype_header(skb, BRCM_TAG_LEN);
brcm_tag = skb->data + offset;
- /* Set the ingress opcode, traffic class, tag enforcment is
+ /* Set the ingress opcode, traffic class, tag enforcement is
* deprecated
*/
brcm_tag[0] = (1 << BRCM_OPCODE_SHIFT) |
((queue & BRCM_IG_TC_MASK) << BRCM_IG_TC_SHIFT);
brcm_tag[1] = 0;
- brcm_tag[2] = 0;
- if (dp->index == 8)
- brcm_tag[2] = BRCM_IG_DSTMAP2_MASK;
- brcm_tag[3] = (1 << dp->index) & BRCM_IG_DSTMAP1_MASK;
+ port_mask = dsa_xmit_port_mask(skb, dev);
+ brcm_tag[2] = (port_mask >> 8) & BRCM_IG_DSTMAP2_MASK;
+ brcm_tag[3] = port_mask & BRCM_IG_DSTMAP1_MASK;
- /* Now tell the master network device about the desired output queue
+ /* Now tell the conduit network device about the desired output queue
* as well
*/
skb_set_queue_mapping(skb, BRCM_TAG_SET_PORT_QUEUE(dp->index, queue));
@@ -108,9 +132,20 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb,
return skb;
}
+/* Frames with this tag have one of these two layouts:
+ * -----------------------------------
+ * | MAC DA | MAC SA | 4b tag | Type | DSA_TAG_PROTO_BRCM
+ * -----------------------------------
+ * -----------------------------------
+ * | 4b tag | MAC DA | MAC SA | Type | DSA_TAG_PROTO_BRCM_PREPEND
+ * -----------------------------------
+ * In both cases, at receive time, skb->data points 2 bytes before the actual
+ * Ethernet type field and we have an offset of 4 bytes between where skb->data
+ * points and where the payload starts. So the same low-level receive function
+ * can be used.
+ */
static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb,
struct net_device *dev,
- struct packet_type *pt,
unsigned int offset)
{
int source_port;
@@ -134,17 +169,21 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb,
/* Locate which port this is coming from */
source_port = brcm_tag[3] & BRCM_EG_PID_MASK;
- skb->dev = dsa_master_find_slave(dev, 0, source_port);
+ skb->dev = dsa_conduit_find_user(dev, 0, source_port);
if (!skb->dev)
return NULL;
/* Remove Broadcom tag and update checksum */
skb_pull_rcsum(skb, BRCM_TAG_LEN);
+ if (likely(!is_link_local_ether_addr(eth_hdr(skb)->h_dest)))
+ dsa_default_offload_fwd_mark(skb);
+
return skb;
}
+#endif
-#ifdef CONFIG_NET_DSA_TAG_BRCM
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM)
static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb,
struct net_device *dev)
{
@@ -153,31 +192,185 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb,
}
-static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev)
{
struct sk_buff *nskb;
/* skb->data points to the EtherType, the tag is right before it */
- nskb = brcm_tag_rcv_ll(skb, dev, pt, 2);
+ nskb = brcm_tag_rcv_ll(skb, dev, 2);
if (!nskb)
return nskb;
- /* Move the Ethernet DA and SA */
- memmove(nskb->data - ETH_HLEN,
- nskb->data - ETH_HLEN - BRCM_TAG_LEN,
- 2 * ETH_ALEN);
+ dsa_strip_etype_header(skb, BRCM_TAG_LEN);
return nskb;
}
-const struct dsa_device_ops brcm_netdev_ops = {
+static const struct dsa_device_ops brcm_netdev_ops = {
+ .name = BRCM_NAME,
+ .proto = DSA_TAG_PROTO_BRCM,
.xmit = brcm_tag_xmit,
.rcv = brcm_tag_rcv,
+ .needed_headroom = BRCM_TAG_LEN,
};
+
+DSA_TAG_DRIVER(brcm_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM, BRCM_NAME);
#endif
-#ifdef CONFIG_NET_DSA_TAG_BRCM_PREPEND
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY) || \
+ IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS)
+static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ int len = BRCM_LEG_TAG_LEN;
+ int source_port;
+ __be16 *proto;
+ u8 *brcm_tag;
+
+ if (unlikely(!pskb_may_pull(skb, BRCM_LEG_TAG_LEN + VLAN_HLEN)))
+ return NULL;
+
+ brcm_tag = dsa_etype_header_pos_rx(skb);
+ proto = (__be16 *)(brcm_tag + BRCM_LEG_TAG_LEN);
+
+ source_port = brcm_tag[5] & BRCM_LEG_PORT_ID;
+
+ skb->dev = dsa_conduit_find_user(dev, 0, source_port);
+ if (!skb->dev)
+ return NULL;
+
+ /* The internal switch in BCM63XX SoCs always tags on egress on the CPU
+ * port. We use VID 0 internally for untagged traffic, so strip the tag
+ * if the TCI field is all 0, and keep it otherwise to also retain
+ * e.g. 802.1p tagged packets.
+ */
+ if (proto[0] == htons(ETH_P_8021Q) && proto[1] == 0)
+ len += VLAN_HLEN;
+
+ /* Remove Broadcom tag and update checksum */
+ skb_pull_rcsum(skb, len);
+
+ if (likely(!is_link_local_ether_addr(eth_hdr(skb)->h_dest)))
+ dsa_default_offload_fwd_mark(skb);
+
+ dsa_strip_etype_header(skb, len);
+
+ return skb;
+}
+#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY || CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS */
+
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY)
+static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ u8 *brcm_tag;
+
+ /* The Ethernet switch we are interfaced with needs packets to be at
+ * least 64 bytes (including FCS) otherwise they will be discarded when
+ * they enter the switch port logic. When Broadcom tags are enabled, we
+ * need to make sure that packets are at least 70 bytes
+ * (including FCS and tag) because the length verification is done after
+ * the Broadcom tag is stripped off the ingress packet.
+ *
+ * Let dsa_user_xmit() free the SKB
+ */
+ if (__skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN, false))
+ return NULL;
+
+ skb_push(skb, BRCM_LEG_TAG_LEN);
+
+ dsa_alloc_etype_header(skb, BRCM_LEG_TAG_LEN);
+
+ brcm_tag = skb->data + 2 * ETH_ALEN;
+
+ /* Broadcom tag type */
+ brcm_tag[0] = BRCM_LEG_TYPE_HI;
+ brcm_tag[1] = BRCM_LEG_TYPE_LO;
+
+ /* Broadcom tag value */
+ brcm_tag[2] = BRCM_LEG_EGRESS;
+ brcm_tag[3] = 0;
+ brcm_tag[4] = 0;
+ brcm_tag[5] = dp->index & BRCM_LEG_PORT_ID;
+
+ return skb;
+}
+
+static const struct dsa_device_ops brcm_legacy_netdev_ops = {
+ .name = BRCM_LEGACY_NAME,
+ .proto = DSA_TAG_PROTO_BRCM_LEGACY,
+ .xmit = brcm_leg_tag_xmit,
+ .rcv = brcm_leg_tag_rcv,
+ .needed_headroom = BRCM_LEG_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(brcm_legacy_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY, BRCM_LEGACY_NAME);
+#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY */
+
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS)
+static struct sk_buff *brcm_leg_fcs_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ unsigned int fcs_len;
+ __le32 fcs_val;
+ u8 *brcm_tag;
+
+ /* The Ethernet switch we are interfaced with needs packets to be at
+ * least 64 bytes (including FCS) otherwise they will be discarded when
+ * they enter the switch port logic. When Broadcom tags are enabled, we
+ * need to make sure that packets are at least 70 bytes (including FCS
+ * and tag) because the length verification is done after the Broadcom
+ * tag is stripped off the ingress packet.
+ *
+ * Let dsa_user_xmit() free the SKB.
+ */
+ if (__skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN, false))
+ return NULL;
+
+ fcs_len = skb->len;
+ fcs_val = cpu_to_le32(crc32_le(~0, skb->data, fcs_len) ^ ~0);
+
+ skb_push(skb, BRCM_LEG_TAG_LEN);
+
+ dsa_alloc_etype_header(skb, BRCM_LEG_TAG_LEN);
+
+ brcm_tag = skb->data + 2 * ETH_ALEN;
+
+ /* Broadcom tag type */
+ brcm_tag[0] = BRCM_LEG_TYPE_HI;
+ brcm_tag[1] = BRCM_LEG_TYPE_LO;
+
+ /* Broadcom tag value */
+ brcm_tag[2] = BRCM_LEG_EGRESS | BRCM_LEG_LEN_HI(fcs_len);
+ brcm_tag[3] = BRCM_LEG_LEN_LO(fcs_len);
+ brcm_tag[4] = 0;
+ brcm_tag[5] = dp->index & BRCM_LEG_PORT_ID;
+
+ /* Original FCS value */
+ if (__skb_pad(skb, ETH_FCS_LEN, false))
+ return NULL;
+ skb_put_data(skb, &fcs_val, ETH_FCS_LEN);
+
+ return skb;
+}
+
+static const struct dsa_device_ops brcm_legacy_fcs_netdev_ops = {
+ .name = BRCM_LEGACY_FCS_NAME,
+ .proto = DSA_TAG_PROTO_BRCM_LEGACY_FCS,
+ .xmit = brcm_leg_fcs_tag_xmit,
+ .rcv = brcm_leg_tag_rcv,
+ .needed_headroom = BRCM_LEG_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(brcm_legacy_fcs_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY_FCS, BRCM_LEGACY_FCS_NAME);
+#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS */
+
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND)
static struct sk_buff *brcm_tag_xmit_prepend(struct sk_buff *skb,
struct net_device *dev)
{
@@ -186,15 +379,40 @@ static struct sk_buff *brcm_tag_xmit_prepend(struct sk_buff *skb,
}
static struct sk_buff *brcm_tag_rcv_prepend(struct sk_buff *skb,
- struct net_device *dev,
- struct packet_type *pt)
+ struct net_device *dev)
{
/* tag is prepended to the packet */
- return brcm_tag_rcv_ll(skb, dev, pt, ETH_HLEN);
+ return brcm_tag_rcv_ll(skb, dev, ETH_HLEN);
}
-const struct dsa_device_ops brcm_prepend_netdev_ops = {
+static const struct dsa_device_ops brcm_prepend_netdev_ops = {
+ .name = BRCM_PREPEND_NAME,
+ .proto = DSA_TAG_PROTO_BRCM_PREPEND,
.xmit = brcm_tag_xmit_prepend,
.rcv = brcm_tag_rcv_prepend,
+ .needed_headroom = BRCM_TAG_LEN,
};
+
+DSA_TAG_DRIVER(brcm_prepend_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_PREPEND, BRCM_PREPEND_NAME);
+#endif
+
+static struct dsa_tag_driver *dsa_tag_driver_array[] = {
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM)
+ &DSA_TAG_DRIVER_NAME(brcm_netdev_ops),
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY)
+ &DSA_TAG_DRIVER_NAME(brcm_legacy_netdev_ops),
#endif
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS)
+ &DSA_TAG_DRIVER_NAME(brcm_legacy_fcs_netdev_ops),
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND)
+ &DSA_TAG_DRIVER_NAME(brcm_prepend_netdev_ops),
+#endif
+};
+
+module_dsa_tag_drivers(dsa_tag_driver_array);
+
+MODULE_DESCRIPTION("DSA tag driver for Broadcom switches using in-frame headers");
+MODULE_LICENSE("GPL");
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index cd13cfc542ce..2a2c4fb61a65 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -1,110 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
- * net/dsa/tag_dsa.c - (Non-ethertype) DSA tagging
+ * Regular and Ethertype DSA tagging
* Copyright (c) 2008-2009 Marvell Semiconductor
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
+ * Regular DSA
+ * -----------
+ *
+ * For untagged (in 802.1Q terms) packets, the switch will splice in
+ * the tag between the SA and the ethertype of the original
+ * packet. Tagged frames will instead have their outermost .1Q tag
+ * converted to a DSA tag. It expects the same layout when receiving
+ * packets from the CPU.
+ *
+ * Example:
+ *
+ * .----.----.----.---------
+ * Pu: | DA | SA | ET | Payload ...
+ * '----'----'----'---------
+ * 6 6 2 N
+ * .----.----.--------.-----.----.---------
+ * Pt: | DA | SA | 0x8100 | TCI | ET | Payload ...
+ * '----'----'--------'-----'----'---------
+ * 6 6 2 2 2 N
+ * .----.----.-----.----.---------
+ * Pd: | DA | SA | DSA | ET | Payload ...
+ * '----'----'-----'----'---------
+ * 6 6 4 2 N
+ *
+ * No matter if a packet is received untagged (Pu) or tagged (Pt),
+ * they will both have the same layout (Pd) when they are sent to the
+ * CPU. This is done by ignoring 802.3, replacing the ethertype field
+ * with more metadata, among which is a bit to signal if the original
+ * packet was tagged or not.
+ *
+ * Ethertype DSA
+ * -------------
+ * Uses the exact same tag format as regular DSA, but also includes a
+ * proper ethertype field (which the mv88e6xxx driver sets to
+ * ETH_P_EDSA/0xdada) followed by two zero bytes:
+ *
+ * .----.----.--------.--------.-----.----.---------
+ * | DA | SA | 0xdada | 0x0000 | DSA | ET | Payload ...
+ * '----'----'--------'--------'-----'----'---------
+ * 6 6 2 2 4 2 N
*/
+#include <linux/dsa/mv88e6xxx.h>
#include <linux/etherdevice.h>
#include <linux/list.h>
#include <linux/slab.h>
-#include "dsa_priv.h"
+#include "tag.h"
+
+#define DSA_NAME "dsa"
+#define EDSA_NAME "edsa"
#define DSA_HLEN 4
-static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
+/**
+ * enum dsa_cmd - DSA Command
+ * @DSA_CMD_TO_CPU: Set on packets that were trapped or mirrored to
+ * the CPU port. This is needed to implement control protocols,
+ * e.g. STP and LLDP, that must not allow those control packets to
+ * be switched according to the normal rules.
+ * @DSA_CMD_FROM_CPU: Used by the CPU to send a packet to a specific
+ * port, ignoring all the barriers that the switch normally
+ * enforces (VLANs, STP port states etc.). No source address
+ * learning takes place. "sudo send packet"
+ * @DSA_CMD_TO_SNIFFER: Set on the copies of packets that matched some
+ * user configured ingress or egress monitor criteria. These are
+ * forwarded by the switch tree to the user configured ingress or
+ * egress monitor port, which can be set to the CPU port or a
+ * regular port. If the destination is a regular port, the tag
+ * will be removed before egressing the port. If the destination
+ * is the CPU port, the tag will not be removed.
+ * @DSA_CMD_FORWARD: This tag is used on all bulk traffic passing
+ * through the switch tree, including the flows that are directed
+ * towards the CPU. Its device/port tuple encodes the original
+ * source port on which the packet ingressed. It can also be used
+ * on transmit by the CPU to defer the forwarding decision to the
+ * hardware, based on the current config of PVT/VTU/ATU
+ * etc. Source address learning takes place if enabled on the
+ * receiving DSA/CPU port.
+ */
+enum dsa_cmd {
+ DSA_CMD_TO_CPU = 0,
+ DSA_CMD_FROM_CPU = 1,
+ DSA_CMD_TO_SNIFFER = 2,
+ DSA_CMD_FORWARD = 3
+};
+
+/**
+ * enum dsa_code - TO_CPU Code
+ *
+ * @DSA_CODE_MGMT_TRAP: DA was classified as a management
+ * address. Typical examples include STP BPDUs and LLDP.
+ * @DSA_CODE_FRAME2REG: Response to a "remote management" request.
+ * @DSA_CODE_IGMP_MLD_TRAP: IGMP/MLD signaling.
+ * @DSA_CODE_POLICY_TRAP: Frame matched some policy configuration on
+ * the device. Typical examples are matching on DA/SA/VID and DHCP
+ * snooping.
+ * @DSA_CODE_ARP_MIRROR: The name says it all really.
+ * @DSA_CODE_POLICY_MIRROR: Same as @DSA_CODE_POLICY_TRAP, but the
+ * particular policy was set to trigger a mirror instead of a
+ * trap.
+ * @DSA_CODE_RESERVED_6: Unused on all devices up to at least 6393X.
+ * @DSA_CODE_RESERVED_7: Unused on all devices up to at least 6393X.
+ *
+ * A 3-bit code is used to relay why a particular frame was sent to
+ * the CPU. We only use this to determine if the packet was mirrored
+ * or trapped, i.e. whether the packet has been forwarded by hardware
+ * or not.
+ *
+ * This is the superset of all possible codes. Any particular device
+ * may only implement a subset.
+ */
+enum dsa_code {
+ DSA_CODE_MGMT_TRAP = 0,
+ DSA_CODE_FRAME2REG = 1,
+ DSA_CODE_IGMP_MLD_TRAP = 2,
+ DSA_CODE_POLICY_TRAP = 3,
+ DSA_CODE_ARP_MIRROR = 4,
+ DSA_CODE_POLICY_MIRROR = 5,
+ DSA_CODE_RESERVED_6 = 6,
+ DSA_CODE_RESERVED_7 = 7
+};
+
+static struct sk_buff *dsa_xmit_ll(struct sk_buff *skb, struct net_device *dev,
+ u8 extra)
{
- struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct net_device *br_dev;
+ u8 tag_dev, tag_port;
+ enum dsa_cmd cmd;
u8 *dsa_header;
- /*
- * Convert the outermost 802.1q tag to a DSA tag for tagged
- * packets, or insert a DSA tag between the addresses and
- * the ethertype field for untagged packets.
- */
- if (skb->protocol == htons(ETH_P_8021Q)) {
- if (skb_cow_head(skb, 0) < 0)
- return NULL;
+ if (skb->offload_fwd_mark) {
+ unsigned int bridge_num = dsa_port_bridge_num_get(dp);
+ struct dsa_switch_tree *dst = dp->ds->dst;
- /*
- * Construct tagged FROM_CPU DSA tag from 802.1q tag.
- */
- dsa_header = skb->data + 2 * ETH_ALEN;
- dsa_header[0] = 0x60 | dp->ds->index;
- dsa_header[1] = dp->index << 3;
+ cmd = DSA_CMD_FORWARD;
- /*
- * Move CFI field from byte 2 to byte 1.
+ /* When offloading forwarding for a bridge, inject FORWARD
+ * packets on behalf of a virtual switch device with an index
+ * past the physical switches.
*/
+ tag_dev = dst->last_switch + bridge_num;
+ tag_port = 0;
+ } else {
+ cmd = DSA_CMD_FROM_CPU;
+ tag_dev = dp->ds->index;
+ tag_port = dp->index;
+ }
+
+ br_dev = dsa_port_bridge_dev_get(dp);
+
+ /* If frame is already 802.1Q tagged, we can convert it to a DSA
+ * tag (avoiding a memmove), but only if the port is standalone
+ * (in which case we always send FROM_CPU) or if the port's
+ * bridge has VLAN filtering enabled (in which case the CPU port
+ * will be a member of the VLAN).
+ */
+ if (skb->protocol == htons(ETH_P_8021Q) &&
+ (!br_dev || br_vlan_enabled(br_dev))) {
+ if (extra) {
+ skb_push(skb, extra);
+ dsa_alloc_etype_header(skb, extra);
+ }
+
+ /* Construct tagged DSA tag from 802.1Q tag. */
+ dsa_header = dsa_etype_header_pos_tx(skb) + extra;
+ dsa_header[0] = (cmd << 6) | 0x20 | tag_dev;
+ dsa_header[1] = tag_port << 3;
+
+ /* Move CFI field from byte 2 to byte 1. */
if (dsa_header[2] & 0x10) {
dsa_header[1] |= 0x01;
dsa_header[2] &= ~0x10;
}
} else {
- if (skb_cow_head(skb, DSA_HLEN) < 0)
- return NULL;
- skb_push(skb, DSA_HLEN);
+ u16 vid;
- memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN);
+ vid = br_dev ? MV88E6XXX_VID_BRIDGED : MV88E6XXX_VID_STANDALONE;
- /*
- * Construct untagged FROM_CPU DSA tag.
- */
- dsa_header = skb->data + 2 * ETH_ALEN;
- dsa_header[0] = 0x40 | dp->ds->index;
- dsa_header[1] = dp->index << 3;
- dsa_header[2] = 0x00;
- dsa_header[3] = 0x00;
+ skb_push(skb, DSA_HLEN + extra);
+ dsa_alloc_etype_header(skb, DSA_HLEN + extra);
+
+ /* Construct DSA header from untagged frame. */
+ dsa_header = dsa_etype_header_pos_tx(skb) + extra;
+
+ dsa_header[0] = (cmd << 6) | tag_dev;
+ dsa_header[1] = tag_port << 3;
+ dsa_header[2] = vid >> 8;
+ dsa_header[3] = vid & 0xff;
}
return skb;
}
-static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+static struct sk_buff *dsa_rcv_ll(struct sk_buff *skb, struct net_device *dev,
+ u8 extra)
{
+ bool trap = false, trunk = false;
+ int source_device, source_port;
+ enum dsa_code code;
+ enum dsa_cmd cmd;
u8 *dsa_header;
- int source_device;
- int source_port;
- if (unlikely(!pskb_may_pull(skb, DSA_HLEN)))
- return NULL;
+ /* The ethertype field is part of the DSA header. */
+ dsa_header = dsa_etype_header_pos_rx(skb);
- /*
- * The ethertype field is part of the DSA header.
- */
- dsa_header = skb->data - 2;
+ cmd = dsa_header[0] >> 6;
+ switch (cmd) {
+ case DSA_CMD_FORWARD:
+ trunk = !!(dsa_header[1] & 4);
+ break;
- /*
- * Check that frame type is either TO_CPU or FORWARD.
- */
- if ((dsa_header[0] & 0xc0) != 0x00 && (dsa_header[0] & 0xc0) != 0xc0)
+ case DSA_CMD_TO_CPU:
+ code = (dsa_header[1] & 0x6) | ((dsa_header[2] >> 4) & 1);
+
+ switch (code) {
+ case DSA_CODE_FRAME2REG:
+ /* Remote management is not implemented yet,
+ * drop.
+ */
+ return NULL;
+ case DSA_CODE_ARP_MIRROR:
+ case DSA_CODE_POLICY_MIRROR:
+ /* Mark mirrored packets to notify any upper
+ * device (like a bridge) that forwarding has
+ * already been done by hardware.
+ */
+ break;
+ case DSA_CODE_MGMT_TRAP:
+ case DSA_CODE_IGMP_MLD_TRAP:
+ case DSA_CODE_POLICY_TRAP:
+ /* Traps have, by definition, not been
+ * forwarded by hardware, so don't mark them.
+ */
+ trap = true;
+ break;
+ default:
+ /* Reserved code, this could be anything. Drop
+ * seems like the safest option.
+ */
+ return NULL;
+ }
+
+ break;
+
+ default:
return NULL;
+ }
- /*
- * Determine source device and port.
- */
source_device = dsa_header[0] & 0x1f;
source_port = (dsa_header[1] >> 3) & 0x1f;
- skb->dev = dsa_master_find_slave(dev, source_device, source_port);
+ if (trunk) {
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ struct dsa_lag *lag;
+
+ /* The exact source port is not available in the tag,
+ * so we inject the frame directly on the upper
+ * team/bond.
+ */
+ lag = dsa_lag_by_id(cpu_dp->dst, source_port + 1);
+ skb->dev = lag ? lag->dev : NULL;
+ } else {
+ skb->dev = dsa_conduit_find_user(dev, source_device,
+ source_port);
+ }
+
if (!skb->dev)
return NULL;
- /*
- * Convert the DSA header to an 802.1q header if the 'tagged'
- * bit in the DSA header is set. If the 'tagged' bit is clear,
- * delete the DSA header entirely.
+ /* When using LAG offload, skb->dev is not a DSA user interface,
+ * so we cannot call dsa_default_offload_fwd_mark and we need to
+ * special-case it.
+ */
+ if (trunk)
+ skb->offload_fwd_mark = true;
+ else if (!trap)
+ dsa_default_offload_fwd_mark(skb);
+
+ /* If the 'tagged' bit is set, convert the DSA tag to an 802.1Q
+ * tag, and delete the ethertype (extra) if applicable. If the
+ * 'tagged' bit is cleared, delete the DSA tag, and the ethertype
+ * (extra) if applicable.
*/
if (dsa_header[0] & 0x20) {
u8 new_header[4];
- /*
- * Insert 802.1q ethertype and copy the VLAN-related
+ /* Insert 802.1Q ethertype and copy the VLAN-related
* fields, but clear the bit that will hold CFI (since
* DSA uses that bit location for another purpose).
*/
@@ -113,16 +300,13 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev,
new_header[2] = dsa_header[2] & ~0x10;
new_header[3] = dsa_header[3];
- /*
- * Move CFI bit from its place in the DSA header to
- * its 802.1q-designated place.
+ /* Move CFI bit from its place in the DSA header to
+ * its 802.1Q-designated place.
*/
if (dsa_header[1] & 0x01)
new_header[2] |= 0x10;
- /*
- * Update packet checksum if skb is CHECKSUM_COMPLETE.
- */
+ /* Update packet checksum if skb is CHECKSUM_COMPLETE. */
if (skb->ip_summed == CHECKSUM_COMPLETE) {
__wsum c = skb->csum;
c = csum_add(c, csum_partial(new_header + 2, 2, 0));
@@ -131,22 +315,96 @@ static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev,
}
memcpy(dsa_header, new_header, DSA_HLEN);
+
+ if (extra)
+ dsa_strip_etype_header(skb, extra);
} else {
- /*
- * Remove DSA tag and update checksum.
- */
skb_pull_rcsum(skb, DSA_HLEN);
- memmove(skb->data - ETH_HLEN,
- skb->data - ETH_HLEN - DSA_HLEN,
- 2 * ETH_ALEN);
+ dsa_strip_etype_header(skb, DSA_HLEN + extra);
}
- skb->offload_fwd_mark = 1;
+ return skb;
+}
+
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_DSA)
+
+static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ return dsa_xmit_ll(skb, dev, 0);
+}
+static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev)
+{
+ if (unlikely(!pskb_may_pull(skb, DSA_HLEN)))
+ return NULL;
+
+ return dsa_rcv_ll(skb, dev, 0);
+}
+
+static const struct dsa_device_ops dsa_netdev_ops = {
+ .name = DSA_NAME,
+ .proto = DSA_TAG_PROTO_DSA,
+ .xmit = dsa_xmit,
+ .rcv = dsa_rcv,
+ .needed_headroom = DSA_HLEN,
+};
+
+DSA_TAG_DRIVER(dsa_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_DSA, DSA_NAME);
+#endif /* CONFIG_NET_DSA_TAG_DSA */
+
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_EDSA)
+
+#define EDSA_HLEN 8
+
+static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ u8 *edsa_header;
+
+ skb = dsa_xmit_ll(skb, dev, EDSA_HLEN - DSA_HLEN);
+ if (!skb)
+ return NULL;
+
+ edsa_header = dsa_etype_header_pos_tx(skb);
+ edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff;
+ edsa_header[1] = ETH_P_EDSA & 0xff;
+ edsa_header[2] = 0x00;
+ edsa_header[3] = 0x00;
return skb;
}
-const struct dsa_device_ops dsa_netdev_ops = {
- .xmit = dsa_xmit,
- .rcv = dsa_rcv,
+static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev)
+{
+ if (unlikely(!pskb_may_pull(skb, EDSA_HLEN)))
+ return NULL;
+
+ skb_pull_rcsum(skb, EDSA_HLEN - DSA_HLEN);
+
+ return dsa_rcv_ll(skb, dev, EDSA_HLEN - DSA_HLEN);
+}
+
+static const struct dsa_device_ops edsa_netdev_ops = {
+ .name = EDSA_NAME,
+ .proto = DSA_TAG_PROTO_EDSA,
+ .xmit = edsa_xmit,
+ .rcv = edsa_rcv,
+ .needed_headroom = EDSA_HLEN,
};
+
+DSA_TAG_DRIVER(edsa_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_EDSA, EDSA_NAME);
+#endif /* CONFIG_NET_DSA_TAG_EDSA */
+
+static struct dsa_tag_driver *dsa_tag_drivers[] = {
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_DSA)
+ &DSA_TAG_DRIVER_NAME(dsa_netdev_ops),
+#endif
+#if IS_ENABLED(CONFIG_NET_DSA_TAG_EDSA)
+ &DSA_TAG_DRIVER_NAME(edsa_netdev_ops),
+#endif
+};
+
+module_dsa_tag_drivers(dsa_tag_drivers);
+
+MODULE_DESCRIPTION("DSA tag driver for Marvell switches using DSA headers");
+MODULE_LICENSE("GPL");
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
deleted file mode 100644
index 4083326b806e..000000000000
--- a/net/dsa/tag_edsa.c
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * net/dsa/tag_edsa.c - Ethertype DSA tagging
- * Copyright (c) 2008-2009 Marvell Semiconductor
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/etherdevice.h>
-#include <linux/list.h>
-#include <linux/slab.h>
-
-#include "dsa_priv.h"
-
-#define DSA_HLEN 4
-#define EDSA_HLEN 8
-
-static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev)
-{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- u8 *edsa_header;
-
- /*
- * Convert the outermost 802.1q tag to a DSA tag and prepend
- * a DSA ethertype field is the packet is tagged, or insert
- * a DSA ethertype plus DSA tag between the addresses and the
- * current ethertype field if the packet is untagged.
- */
- if (skb->protocol == htons(ETH_P_8021Q)) {
- if (skb_cow_head(skb, DSA_HLEN) < 0)
- return NULL;
- skb_push(skb, DSA_HLEN);
-
- memmove(skb->data, skb->data + DSA_HLEN, 2 * ETH_ALEN);
-
- /*
- * Construct tagged FROM_CPU DSA tag from 802.1q tag.
- */
- edsa_header = skb->data + 2 * ETH_ALEN;
- edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff;
- edsa_header[1] = ETH_P_EDSA & 0xff;
- edsa_header[2] = 0x00;
- edsa_header[3] = 0x00;
- edsa_header[4] = 0x60 | dp->ds->index;
- edsa_header[5] = dp->index << 3;
-
- /*
- * Move CFI field from byte 6 to byte 5.
- */
- if (edsa_header[6] & 0x10) {
- edsa_header[5] |= 0x01;
- edsa_header[6] &= ~0x10;
- }
- } else {
- if (skb_cow_head(skb, EDSA_HLEN) < 0)
- return NULL;
- skb_push(skb, EDSA_HLEN);
-
- memmove(skb->data, skb->data + EDSA_HLEN, 2 * ETH_ALEN);
-
- /*
- * Construct untagged FROM_CPU DSA tag.
- */
- edsa_header = skb->data + 2 * ETH_ALEN;
- edsa_header[0] = (ETH_P_EDSA >> 8) & 0xff;
- edsa_header[1] = ETH_P_EDSA & 0xff;
- edsa_header[2] = 0x00;
- edsa_header[3] = 0x00;
- edsa_header[4] = 0x40 | dp->ds->index;
- edsa_header[5] = dp->index << 3;
- edsa_header[6] = 0x00;
- edsa_header[7] = 0x00;
- }
-
- return skb;
-}
-
-static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
-{
- u8 *edsa_header;
- int source_device;
- int source_port;
-
- if (unlikely(!pskb_may_pull(skb, EDSA_HLEN)))
- return NULL;
-
- /*
- * Skip the two null bytes after the ethertype.
- */
- edsa_header = skb->data + 2;
-
- /*
- * Check that frame type is either TO_CPU or FORWARD.
- */
- if ((edsa_header[0] & 0xc0) != 0x00 && (edsa_header[0] & 0xc0) != 0xc0)
- return NULL;
-
- /*
- * Determine source device and port.
- */
- source_device = edsa_header[0] & 0x1f;
- source_port = (edsa_header[1] >> 3) & 0x1f;
-
- skb->dev = dsa_master_find_slave(dev, source_device, source_port);
- if (!skb->dev)
- return NULL;
-
- /*
- * If the 'tagged' bit is set, convert the DSA tag to a 802.1q
- * tag and delete the ethertype part. If the 'tagged' bit is
- * clear, delete the ethertype and the DSA tag parts.
- */
- if (edsa_header[0] & 0x20) {
- u8 new_header[4];
-
- /*
- * Insert 802.1q ethertype and copy the VLAN-related
- * fields, but clear the bit that will hold CFI (since
- * DSA uses that bit location for another purpose).
- */
- new_header[0] = (ETH_P_8021Q >> 8) & 0xff;
- new_header[1] = ETH_P_8021Q & 0xff;
- new_header[2] = edsa_header[2] & ~0x10;
- new_header[3] = edsa_header[3];
-
- /*
- * Move CFI bit from its place in the DSA header to
- * its 802.1q-designated place.
- */
- if (edsa_header[1] & 0x01)
- new_header[2] |= 0x10;
-
- skb_pull_rcsum(skb, DSA_HLEN);
-
- /*
- * Update packet checksum if skb is CHECKSUM_COMPLETE.
- */
- if (skb->ip_summed == CHECKSUM_COMPLETE) {
- __wsum c = skb->csum;
- c = csum_add(c, csum_partial(new_header + 2, 2, 0));
- c = csum_sub(c, csum_partial(edsa_header + 2, 2, 0));
- skb->csum = c;
- }
-
- memcpy(edsa_header, new_header, DSA_HLEN);
-
- memmove(skb->data - ETH_HLEN,
- skb->data - ETH_HLEN - DSA_HLEN,
- 2 * ETH_ALEN);
- } else {
- /*
- * Remove DSA tag and update checksum.
- */
- skb_pull_rcsum(skb, EDSA_HLEN);
- memmove(skb->data - ETH_HLEN,
- skb->data - ETH_HLEN - EDSA_HLEN,
- 2 * ETH_ALEN);
- }
-
- skb->offload_fwd_mark = 1;
-
- return skb;
-}
-
-const struct dsa_device_ops edsa_netdev_ops = {
- .xmit = edsa_xmit,
- .rcv = edsa_rcv,
-};
diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c
new file mode 100644
index 000000000000..5fa436121087
--- /dev/null
+++ b/net/dsa/tag_gswip.c
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel / Lantiq GSWIP V2.0 PMAC tag support
+ *
+ * Copyright (C) 2017 - 2018 Hauke Mehrtens <hauke@hauke-m.de>
+ */
+
+#include <linux/bitops.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <net/dsa.h>
+
+#include "tag.h"
+
+#define GSWIP_NAME "gswip"
+
+#define GSWIP_TX_HEADER_LEN 4
+
+/* special tag in TX path header */
+/* Byte 0 */
+#define GSWIP_TX_SLPID_SHIFT 0 /* source port ID */
+#define GSWIP_TX_SLPID_CPU 2
+#define GSWIP_TX_SLPID_APP1 3
+#define GSWIP_TX_SLPID_APP2 4
+#define GSWIP_TX_SLPID_APP3 5
+#define GSWIP_TX_SLPID_APP4 6
+#define GSWIP_TX_SLPID_APP5 7
+
+/* Byte 1 */
+#define GSWIP_TX_CRCGEN_DIS BIT(7)
+#define GSWIP_TX_DPID_SHIFT 0 /* destination group ID */
+#define GSWIP_TX_DPID_ELAN 0
+#define GSWIP_TX_DPID_EWAN 1
+#define GSWIP_TX_DPID_CPU 2
+#define GSWIP_TX_DPID_APP1 3
+#define GSWIP_TX_DPID_APP2 4
+#define GSWIP_TX_DPID_APP3 5
+#define GSWIP_TX_DPID_APP4 6
+#define GSWIP_TX_DPID_APP5 7
+
+/* Byte 2 */
+#define GSWIP_TX_PORT_MAP_EN BIT(7)
+#define GSWIP_TX_PORT_MAP_SEL BIT(6)
+#define GSWIP_TX_LRN_DIS BIT(5)
+#define GSWIP_TX_CLASS_EN BIT(4)
+#define GSWIP_TX_CLASS_SHIFT 0
+#define GSWIP_TX_CLASS_MASK GENMASK(3, 0)
+
+/* Byte 3 */
+#define GSWIP_TX_DPID_EN BIT(0)
+#define GSWIP_TX_PORT_MAP GENMASK(6, 1)
+
+#define GSWIP_RX_HEADER_LEN 8
+
+/* special tag in RX path header */
+/* Byte 7 */
+#define GSWIP_RX_SPPID_SHIFT 4
+#define GSWIP_RX_SPPID_MASK GENMASK(6, 4)
+
+static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ u8 *gswip_tag;
+
+ skb_push(skb, GSWIP_TX_HEADER_LEN);
+
+ gswip_tag = skb->data;
+ gswip_tag[0] = GSWIP_TX_SLPID_CPU;
+ gswip_tag[1] = GSWIP_TX_DPID_ELAN;
+ gswip_tag[2] = GSWIP_TX_PORT_MAP_EN | GSWIP_TX_PORT_MAP_SEL;
+ gswip_tag[3] = FIELD_PREP(GSWIP_TX_PORT_MAP, dsa_xmit_port_mask(skb, dev));
+ gswip_tag[3] |= GSWIP_TX_DPID_EN;
+
+ return skb;
+}
+
+static struct sk_buff *gswip_tag_rcv(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ int port;
+ u8 *gswip_tag;
+
+ if (unlikely(!pskb_may_pull(skb, GSWIP_RX_HEADER_LEN)))
+ return NULL;
+
+ gswip_tag = skb->data - ETH_HLEN;
+
+ /* Get source port information */
+ port = (gswip_tag[7] & GSWIP_RX_SPPID_MASK) >> GSWIP_RX_SPPID_SHIFT;
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
+ if (!skb->dev)
+ return NULL;
+
+ /* remove GSWIP tag */
+ skb_pull_rcsum(skb, GSWIP_RX_HEADER_LEN);
+
+ return skb;
+}
+
+static const struct dsa_device_ops gswip_netdev_ops = {
+ .name = GSWIP_NAME,
+ .proto = DSA_TAG_PROTO_GSWIP,
+ .xmit = gswip_tag_xmit,
+ .rcv = gswip_tag_rcv,
+ .needed_headroom = GSWIP_RX_HEADER_LEN,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for Lantiq / Intel GSWIP switches");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_GSWIP, GSWIP_NAME);
+
+module_dsa_tag_driver(gswip_netdev_ops);
diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c
new file mode 100644
index 000000000000..544ab15685a2
--- /dev/null
+++ b/net/dsa/tag_hellcreek.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: (GPL-2.0 OR MIT)
+/*
+ * net/dsa/tag_hellcreek.c - Hirschmann Hellcreek switch tag format handling
+ *
+ * Copyright (C) 2019,2020 Linutronix GmbH
+ * Author Kurt Kanzenbach <kurt@linutronix.de>
+ *
+ * Based on tag_ksz.c.
+ */
+
+#include <linux/skbuff.h>
+#include <net/dsa.h>
+
+#include "tag.h"
+
+#define HELLCREEK_NAME "hellcreek"
+
+#define HELLCREEK_TAG_LEN 1
+
+static struct sk_buff *hellcreek_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ u8 *tag;
+
+ /* Calculate checksums (if required) before adding the trailer tag to
+ * avoid including it in calculations. That would lead to wrong
+ * checksums after the switch strips the tag.
+ */
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ skb_checksum_help(skb))
+ return NULL;
+
+ /* Tag encoding */
+ tag = skb_put(skb, HELLCREEK_TAG_LEN);
+ *tag = dsa_xmit_port_mask(skb, dev);
+
+ return skb;
+}
+
+static struct sk_buff *hellcreek_rcv(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ /* Tag decoding */
+ u8 *tag = skb_tail_pointer(skb) - HELLCREEK_TAG_LEN;
+ unsigned int port = tag[0] & 0x03;
+
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
+ if (!skb->dev) {
+ netdev_warn_once(dev, "Failed to get source port: %d\n", port);
+ return NULL;
+ }
+
+ if (pskb_trim_rcsum(skb, skb->len - HELLCREEK_TAG_LEN))
+ return NULL;
+
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+static const struct dsa_device_ops hellcreek_netdev_ops = {
+ .name = HELLCREEK_NAME,
+ .proto = DSA_TAG_PROTO_HELLCREEK,
+ .xmit = hellcreek_xmit,
+ .rcv = hellcreek_rcv,
+ .needed_tailroom = HELLCREEK_TAG_LEN,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for Hirschmann Hellcreek TSN switches");
+MODULE_LICENSE("Dual MIT/GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_HELLCREEK, HELLCREEK_NAME);
+
+module_dsa_tag_driver(hellcreek_netdev_ops);
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index 0f62effad88f..9170a0148cc4 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -1,102 +1,465 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* net/dsa/tag_ksz.c - Microchip KSZ Switch tag format handling
* Copyright (c) 2017 Microchip Technology
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
+#include <linux/dsa/ksz_common.h>
#include <linux/etherdevice.h>
#include <linux/list.h>
-#include <linux/slab.h>
+#include <linux/ptp_classify.h>
#include <net/dsa.h>
-#include "dsa_priv.h"
-/* For Ingress (Host -> KSZ), 2 bytes are added before FCS.
+#include "tag.h"
+
+#define KSZ8795_NAME "ksz8795"
+#define KSZ9477_NAME "ksz9477"
+#define KSZ9893_NAME "ksz9893"
+#define LAN937X_NAME "lan937x"
+
+/* Typically only one byte is used for tail tag. */
+#define KSZ_PTP_TAG_LEN 4
+#define KSZ_EGRESS_TAG_LEN 1
+#define KSZ_INGRESS_TAG_LEN 1
+
+#define KSZ_HWTS_EN 0
+
+struct ksz_tagger_private {
+ struct ksz_tagger_data data; /* Must be first */
+ unsigned long state;
+ struct kthread_worker *xmit_worker;
+};
+
+static struct ksz_tagger_private *
+ksz_tagger_private(struct dsa_switch *ds)
+{
+ return ds->tagger_data;
+}
+
+static void ksz_hwtstamp_set_state(struct dsa_switch *ds, bool on)
+{
+ struct ksz_tagger_private *priv = ksz_tagger_private(ds);
+
+ if (on)
+ set_bit(KSZ_HWTS_EN, &priv->state);
+ else
+ clear_bit(KSZ_HWTS_EN, &priv->state);
+}
+
+static void ksz_disconnect(struct dsa_switch *ds)
+{
+ struct ksz_tagger_private *priv = ds->tagger_data;
+
+ kthread_destroy_worker(priv->xmit_worker);
+ kfree(priv);
+ ds->tagger_data = NULL;
+}
+
+static int ksz_connect(struct dsa_switch *ds)
+{
+ struct ksz_tagger_data *tagger_data;
+ struct kthread_worker *xmit_worker;
+ struct ksz_tagger_private *priv;
+ int ret;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit",
+ ds->dst->index, ds->index);
+ if (IS_ERR(xmit_worker)) {
+ ret = PTR_ERR(xmit_worker);
+ kfree(priv);
+ return ret;
+ }
+
+ priv->xmit_worker = xmit_worker;
+ /* Export functions for switch driver use */
+ tagger_data = &priv->data;
+ tagger_data->hwtstamp_set_state = ksz_hwtstamp_set_state;
+ ds->tagger_data = priv;
+
+ return 0;
+}
+
+static struct sk_buff *ksz_common_rcv(struct sk_buff *skb,
+ struct net_device *dev,
+ unsigned int port, unsigned int len)
+{
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
+ if (!skb->dev)
+ return NULL;
+
+ if (pskb_trim_rcsum(skb, skb->len - len))
+ return NULL;
+
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+/*
+ * For Ingress (Host -> KSZ8795), 1 byte is added before FCS.
* ---------------------------------------------------------------------------
- * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|tag1(1byte)|FCS(4bytes)
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * tag : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5)
+ *
+ * For Egress (KSZ8795 -> Host), 1 byte is added before FCS.
* ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * tag0 : zero-based value represents port
+ * (eg, 0x0=port1, 0x2=port3, 0x3=port4)
+ */
+
+#define KSZ8795_TAIL_TAG_EG_PORT_M GENMASK(1, 0)
+#define KSZ8795_TAIL_TAG_OVERRIDE BIT(6)
+#define KSZ8795_TAIL_TAG_LOOKUP BIT(7)
+
+static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ethhdr *hdr;
+ u8 *tag;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+ return NULL;
+
+ /* Tag encoding */
+ tag = skb_put(skb, KSZ_INGRESS_TAG_LEN);
+ hdr = skb_eth_hdr(skb);
+
+ *tag = dsa_xmit_port_mask(skb, dev);
+ if (is_link_local_ether_addr(hdr->h_dest))
+ *tag |= KSZ8795_TAIL_TAG_OVERRIDE;
+
+ return skb;
+}
+
+static struct sk_buff *ksz8795_rcv(struct sk_buff *skb, struct net_device *dev)
+{
+ u8 *tag;
+
+ if (skb_linearize(skb))
+ return NULL;
+
+ tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
+
+ return ksz_common_rcv(skb, dev, tag[0] & KSZ8795_TAIL_TAG_EG_PORT_M,
+ KSZ_EGRESS_TAG_LEN);
+}
+
+static const struct dsa_device_ops ksz8795_netdev_ops = {
+ .name = KSZ8795_NAME,
+ .proto = DSA_TAG_PROTO_KSZ8795,
+ .xmit = ksz8795_xmit,
+ .rcv = ksz8795_rcv,
+ .needed_tailroom = KSZ_INGRESS_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(ksz8795_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ8795, KSZ8795_NAME);
+
+/*
+ * For Ingress (Host -> KSZ9477), 2/6 bytes are added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|tag1(1byte)|
+ * FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * ts : time stamp (Present only if PTP is enabled in the Hardware)
* tag0 : Prioritization (not used now)
* tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x10=port5)
*
- * For Egress (KSZ -> Host), 1 byte is added before FCS.
+ * For Egress (KSZ9477 -> Host), 1/5 bytes are added before FCS.
* ---------------------------------------------------------------------------
- * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|tag0(1byte)|FCS(4bytes)
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|FCS(4bytes)
* ---------------------------------------------------------------------------
+ * ts : time stamp (Present only if bit 7 of tag0 is set)
* tag0 : zero-based value represents port
* (eg, 0x00=port1, 0x02=port3, 0x06=port7)
*/
-#define KSZ_INGRESS_TAG_LEN 2
-#define KSZ_EGRESS_TAG_LEN 1
+#define KSZ9477_INGRESS_TAG_LEN 2
+#define KSZ9477_PTP_TAG_LEN 4
+#define KSZ9477_PTP_TAG_INDICATION BIT(7)
-static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct net_device *dev)
+#define KSZ9477_TAIL_TAG_EG_PORT_M GENMASK(2, 0)
+#define KSZ9477_TAIL_TAG_PRIO GENMASK(8, 7)
+#define KSZ9477_TAIL_TAG_OVERRIDE BIT(9)
+#define KSZ9477_TAIL_TAG_LOOKUP BIT(10)
+
+static void ksz_rcv_timestamp(struct sk_buff *skb, u8 *tag)
{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct sk_buff *nskb;
- int padlen;
- u8 *tag;
+ u8 *tstamp_raw = tag - KSZ_PTP_TAG_LEN;
+ ktime_t tstamp;
+
+ tstamp = ksz_decode_tstamp(get_unaligned_be32(tstamp_raw));
+ KSZ_SKB_CB(skb)->tstamp = tstamp;
+}
+
+/* Time stamp tag *needs* to be inserted if PTP is enabled in hardware,
+ * regardless of whether it is a PTP frame or not.
+ */
+static void ksz_xmit_timestamp(struct dsa_port *dp, struct sk_buff *skb)
+{
+ struct ksz_tagger_private *priv;
+ struct ptp_header *ptp_hdr;
+ unsigned int ptp_type;
+ u32 tstamp_raw = 0;
+ s64 correction;
+
+ priv = ksz_tagger_private(dp->ds);
+
+ if (!test_bit(KSZ_HWTS_EN, &priv->state))
+ return;
+
+ if (!KSZ_SKB_CB(skb)->update_correction)
+ goto output_tag;
+
+ ptp_type = KSZ_SKB_CB(skb)->ptp_type;
- padlen = (skb->len >= ETH_ZLEN) ? 0 : ETH_ZLEN - skb->len;
-
- if (skb_tailroom(skb) >= padlen + KSZ_INGRESS_TAG_LEN) {
- /* Let dsa_slave_xmit() free skb */
- if (__skb_put_padto(skb, skb->len + padlen, false))
- return NULL;
-
- nskb = skb;
- } else {
- nskb = alloc_skb(NET_IP_ALIGN + skb->len +
- padlen + KSZ_INGRESS_TAG_LEN, GFP_ATOMIC);
- if (!nskb)
- return NULL;
- skb_reserve(nskb, NET_IP_ALIGN);
-
- skb_reset_mac_header(nskb);
- skb_set_network_header(nskb,
- skb_network_header(skb) - skb->head);
- skb_set_transport_header(nskb,
- skb_transport_header(skb) - skb->head);
- skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len));
-
- /* Let skb_put_padto() free nskb, and let dsa_slave_xmit() free
- * skb
- */
- if (skb_put_padto(nskb, nskb->len + padlen))
- return NULL;
-
- consume_skb(skb);
+ ptp_hdr = ptp_parse_header(skb, ptp_type);
+ if (!ptp_hdr)
+ goto output_tag;
+
+ correction = (s64)get_unaligned_be64(&ptp_hdr->correction);
+
+ if (correction < 0) {
+ struct timespec64 ts;
+
+ ts = ns_to_timespec64(-correction >> 16);
+ tstamp_raw = ((ts.tv_sec & 3) << 30) | ts.tv_nsec;
+
+ /* Set correction field to 0 and update UDP checksum */
+ ptp_header_update_correction(skb, ptp_type, ptp_hdr, 0);
}
- tag = skb_put(nskb, KSZ_INGRESS_TAG_LEN);
- tag[0] = 0;
- tag[1] = 1 << dp->index; /* destination port */
+output_tag:
+ put_unaligned_be32(tstamp_raw, skb_put(skb, KSZ_PTP_TAG_LEN));
+}
+
+/* Defer transmit if waiting for egress time stamp is required. */
+static struct sk_buff *ksz_defer_xmit(struct dsa_port *dp, struct sk_buff *skb)
+{
+ struct ksz_tagger_data *tagger_data = ksz_tagger_data(dp->ds);
+ struct ksz_tagger_private *priv = ksz_tagger_private(dp->ds);
+ void (*xmit_work_fn)(struct kthread_work *work);
+ struct sk_buff *clone = KSZ_SKB_CB(skb)->clone;
+ struct ksz_deferred_xmit_work *xmit_work;
+ struct kthread_worker *xmit_worker;
+
+ if (!clone)
+ return skb; /* no deferred xmit for this packet */
+
+ xmit_work_fn = tagger_data->xmit_work_fn;
+ xmit_worker = priv->xmit_worker;
+
+ if (!xmit_work_fn || !xmit_worker)
+ return NULL;
+
+ xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC);
+ if (!xmit_work)
+ return NULL;
+
+ kthread_init_work(&xmit_work->work, xmit_work_fn);
+ /* Increase refcount so the kfree_skb in dsa_user_xmit
+ * won't really free the packet.
+ */
+ xmit_work->dp = dp;
+ xmit_work->skb = skb_get(skb);
+
+ kthread_queue_work(xmit_worker, &xmit_work->work);
- return nskb;
+ return NULL;
}
-static struct sk_buff *ksz_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+static struct sk_buff *ksz9477_xmit(struct sk_buff *skb,
+ struct net_device *dev)
{
+ u16 queue_mapping = skb_get_queue_mapping(skb);
+ u8 prio = netdev_txq_to_tc(dev, queue_mapping);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct ethhdr *hdr;
+ __be16 *tag;
+ u16 val;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+ return NULL;
+
+ /* Tag encoding */
+ ksz_xmit_timestamp(dp, skb);
+
+ tag = skb_put(skb, KSZ9477_INGRESS_TAG_LEN);
+ hdr = skb_eth_hdr(skb);
+
+ val = dsa_xmit_port_mask(skb, dev);
+ val |= FIELD_PREP(KSZ9477_TAIL_TAG_PRIO, prio);
+
+ if (is_link_local_ether_addr(hdr->h_dest))
+ val |= KSZ9477_TAIL_TAG_OVERRIDE;
+
+ *tag = cpu_to_be16(val);
+
+ return ksz_defer_xmit(dp, skb);
+}
+
+static struct sk_buff *ksz9477_rcv(struct sk_buff *skb, struct net_device *dev)
+{
+ unsigned int len = KSZ_EGRESS_TAG_LEN;
+ unsigned int port;
u8 *tag;
- int source_port;
+ if (skb_linearize(skb))
+ return NULL;
+
+ /* Tag decoding */
tag = skb_tail_pointer(skb) - KSZ_EGRESS_TAG_LEN;
+ port = tag[0] & KSZ9477_TAIL_TAG_EG_PORT_M;
- source_port = tag[0] & 7;
+ /* Extra 4-bytes PTP timestamp */
+ if (tag[0] & KSZ9477_PTP_TAG_INDICATION) {
+ ksz_rcv_timestamp(skb, tag);
+ len += KSZ_PTP_TAG_LEN;
+ }
- skb->dev = dsa_master_find_slave(dev, 0, source_port);
- if (!skb->dev)
+ return ksz_common_rcv(skb, dev, port, len);
+}
+
+static const struct dsa_device_ops ksz9477_netdev_ops = {
+ .name = KSZ9477_NAME,
+ .proto = DSA_TAG_PROTO_KSZ9477,
+ .xmit = ksz9477_xmit,
+ .rcv = ksz9477_rcv,
+ .connect = ksz_connect,
+ .disconnect = ksz_disconnect,
+ .needed_tailroom = KSZ9477_INGRESS_TAG_LEN + KSZ_PTP_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(ksz9477_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9477, KSZ9477_NAME);
+
+#define KSZ9893_TAIL_TAG_PRIO GENMASK(4, 3)
+#define KSZ9893_TAIL_TAG_OVERRIDE BIT(5)
+#define KSZ9893_TAIL_TAG_LOOKUP BIT(6)
+
+static struct sk_buff *ksz9893_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ u16 queue_mapping = skb_get_queue_mapping(skb);
+ u8 prio = netdev_txq_to_tc(dev, queue_mapping);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct ethhdr *hdr;
+ u8 *tag;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
return NULL;
- pskb_trim_rcsum(skb, skb->len - KSZ_EGRESS_TAG_LEN);
+ /* Tag encoding */
+ ksz_xmit_timestamp(dp, skb);
- return skb;
+ tag = skb_put(skb, KSZ_INGRESS_TAG_LEN);
+ hdr = skb_eth_hdr(skb);
+
+ *tag = dsa_xmit_port_mask(skb, dev);
+ *tag |= FIELD_PREP(KSZ9893_TAIL_TAG_PRIO, prio);
+
+ if (is_link_local_ether_addr(hdr->h_dest))
+ *tag |= KSZ9893_TAIL_TAG_OVERRIDE;
+
+ return ksz_defer_xmit(dp, skb);
+}
+
+static const struct dsa_device_ops ksz9893_netdev_ops = {
+ .name = KSZ9893_NAME,
+ .proto = DSA_TAG_PROTO_KSZ9893,
+ .xmit = ksz9893_xmit,
+ .rcv = ksz9477_rcv,
+ .connect = ksz_connect,
+ .disconnect = ksz_disconnect,
+ .needed_tailroom = KSZ_INGRESS_TAG_LEN + KSZ_PTP_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(ksz9893_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_KSZ9893, KSZ9893_NAME);
+
+/* For xmit, 2/6 bytes are added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|tag1(1byte)|
+ * FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * ts : time stamp (Present only if PTP is enabled in the Hardware)
+ * tag0 : represents tag override, lookup and valid
+ * tag1 : each bit represents port (eg, 0x01=port1, 0x02=port2, 0x80=port8)
+ *
+ * For rcv, 1/5 bytes are added before FCS.
+ * ---------------------------------------------------------------------------
+ * DA(6bytes)|SA(6bytes)|....|Data(nbytes)|ts(4bytes)|tag0(1byte)|FCS(4bytes)
+ * ---------------------------------------------------------------------------
+ * ts : time stamp (Present only if bit 7 of tag0 is set)
+ * tag0 : zero-based value represents port
+ * (eg, 0x00=port1, 0x02=port3, 0x07=port8)
+ */
+#define LAN937X_EGRESS_TAG_LEN 2
+
+#define LAN937X_TAIL_TAG_BLOCKING_OVERRIDE BIT(11)
+#define LAN937X_TAIL_TAG_LOOKUP BIT(12)
+#define LAN937X_TAIL_TAG_VALID BIT(13)
+#define LAN937X_TAIL_TAG_PRIO GENMASK(10, 8)
+#define LAN937X_TAIL_TAG_PORT_MASK 7
+
+static struct sk_buff *lan937x_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ u16 queue_mapping = skb_get_queue_mapping(skb);
+ u8 prio = netdev_txq_to_tc(dev, queue_mapping);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ const struct ethhdr *hdr = eth_hdr(skb);
+ __be16 *tag;
+ u16 val;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+ return NULL;
+
+ ksz_xmit_timestamp(dp, skb);
+
+ tag = skb_put(skb, LAN937X_EGRESS_TAG_LEN);
+
+ val = dsa_xmit_port_mask(skb, dev);
+ val |= FIELD_PREP(LAN937X_TAIL_TAG_PRIO, prio);
+
+ if (is_link_local_ether_addr(hdr->h_dest))
+ val |= LAN937X_TAIL_TAG_BLOCKING_OVERRIDE;
+
+ /* Tail tag valid bit - This bit should always be set by the CPU */
+ val |= LAN937X_TAIL_TAG_VALID;
+
+ put_unaligned_be16(val, tag);
+
+ return ksz_defer_xmit(dp, skb);
}
-const struct dsa_device_ops ksz_netdev_ops = {
- .xmit = ksz_xmit,
- .rcv = ksz_rcv,
+static const struct dsa_device_ops lan937x_netdev_ops = {
+ .name = LAN937X_NAME,
+ .proto = DSA_TAG_PROTO_LAN937X,
+ .xmit = lan937x_xmit,
+ .rcv = ksz9477_rcv,
+ .connect = ksz_connect,
+ .disconnect = ksz_disconnect,
+ .needed_tailroom = LAN937X_EGRESS_TAG_LEN + KSZ_PTP_TAG_LEN,
};
+
+DSA_TAG_DRIVER(lan937x_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN937X, LAN937X_NAME);
+
+static struct dsa_tag_driver *dsa_tag_driver_array[] = {
+ &DSA_TAG_DRIVER_NAME(ksz8795_netdev_ops),
+ &DSA_TAG_DRIVER_NAME(ksz9477_netdev_ops),
+ &DSA_TAG_DRIVER_NAME(ksz9893_netdev_ops),
+ &DSA_TAG_DRIVER_NAME(lan937x_netdev_ops),
+};
+
+module_dsa_tag_drivers(dsa_tag_driver_array);
+
+MODULE_DESCRIPTION("DSA tag driver for Microchip 8795/937x/9477/9893 families of switches");
+MODULE_LICENSE("GPL");
diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c
index 548c00254c07..258e5d7dc5ef 100644
--- a/net/dsa/tag_lan9303.c
+++ b/net/dsa/tag_lan9303.c
@@ -1,22 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2017 Pengutronix, Juergen Borleis <jbe@pengutronix.de>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
*/
#include <linux/dsa/lan9303.h>
#include <linux/etherdevice.h>
#include <linux/list.h>
#include <linux/slab.h>
-#include "dsa_priv.h"
+#include "tag.h"
/* To define the outgoing port and to discover the incoming port a regular
* VLAN tag is used by the LAN9303. But its VID meaning is 'special':
@@ -39,6 +30,8 @@
* Required when no forwarding between the external ports should happen.
*/
+#define LAN9303_NAME "lan9303"
+
#define LAN9303_TAG_LEN 4
# define LAN9303_TAG_TX_USE_ALR BIT(3)
# define LAN9303_TAG_TX_STP_OVERRIDE BIT(4)
@@ -63,38 +56,29 @@ static int lan9303_xmit_use_arl(struct dsa_port *dp, u8 *dest_addr)
static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev)
{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- u16 *lan9303_tag;
-
- /* insert a special VLAN tag between the MAC addresses
- * and the current ethertype field.
- */
- if (skb_cow_head(skb, LAN9303_TAG_LEN) < 0) {
- dev_dbg(&dev->dev,
- "Cannot make room for the special tag. Dropping packet\n");
- return NULL;
- }
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ __be16 *lan9303_tag;
+ u16 tag;
/* provide 'LAN9303_TAG_LEN' bytes additional space */
skb_push(skb, LAN9303_TAG_LEN);
/* make room between MACs and Ether-Type */
- memmove(skb->data, skb->data + LAN9303_TAG_LEN, 2 * ETH_ALEN);
+ dsa_alloc_etype_header(skb, LAN9303_TAG_LEN);
- lan9303_tag = (u16 *)(skb->data + 2 * ETH_ALEN);
+ lan9303_tag = dsa_etype_header_pos_tx(skb);
+
+ tag = lan9303_xmit_use_arl(dp, skb->data) ?
+ LAN9303_TAG_TX_USE_ALR :
+ dp->index | LAN9303_TAG_TX_STP_OVERRIDE;
lan9303_tag[0] = htons(ETH_P_8021Q);
- lan9303_tag[1] = lan9303_xmit_use_arl(dp, skb->data) ?
- LAN9303_TAG_TX_USE_ALR :
- dp->index | LAN9303_TAG_TX_STP_OVERRIDE;
- lan9303_tag[1] = htons(lan9303_tag[1]);
+ lan9303_tag[1] = htons(tag);
return skb;
}
-static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev)
{
- u16 *lan9303_tag;
u16 lan9303_tag1;
unsigned int source_port;
@@ -104,40 +88,39 @@ static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev,
return NULL;
}
- /* '->data' points into the middle of our special VLAN tag information:
- *
- * ~ MAC src | 0x81 | 0x00 | 0xyy | 0xzz | ether type
- * ^
- * ->data
- */
- lan9303_tag = (u16 *)(skb->data - 2);
-
- if (lan9303_tag[0] != htons(ETH_P_8021Q)) {
- dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid VLAN marker\n");
- return NULL;
+ if (skb_vlan_tag_present(skb)) {
+ lan9303_tag1 = skb_vlan_tag_get(skb);
+ __vlan_hwaccel_clear_tag(skb);
+ } else {
+ skb_push_rcsum(skb, ETH_HLEN);
+ __skb_vlan_pop(skb, &lan9303_tag1);
+ skb_pull_rcsum(skb, ETH_HLEN);
}
- lan9303_tag1 = ntohs(lan9303_tag[1]);
source_port = lan9303_tag1 & 0x3;
- skb->dev = dsa_master_find_slave(dev, 0, source_port);
+ skb->dev = dsa_conduit_find_user(dev, 0, source_port);
if (!skb->dev) {
dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n");
return NULL;
}
- /* remove the special VLAN tag between the MAC addresses
- * and the current ethertype field.
- */
- skb_pull_rcsum(skb, 2 + 2);
- memmove(skb->data - ETH_HLEN, skb->data - (ETH_HLEN + LAN9303_TAG_LEN),
- 2 * ETH_ALEN);
- skb->offload_fwd_mark = !(lan9303_tag1 & LAN9303_TAG_RX_TRAPPED_TO_CPU);
+ if (!(lan9303_tag1 & LAN9303_TAG_RX_TRAPPED_TO_CPU))
+ dsa_default_offload_fwd_mark(skb);
return skb;
}
-const struct dsa_device_ops lan9303_netdev_ops = {
+static const struct dsa_device_ops lan9303_netdev_ops = {
+ .name = LAN9303_NAME,
+ .proto = DSA_TAG_PROTO_LAN9303,
.xmit = lan9303_xmit,
.rcv = lan9303_rcv,
+ .needed_headroom = LAN9303_TAG_LEN,
};
+
+MODULE_DESCRIPTION("DSA tag driver for SMSC/Microchip LAN9303 family of switches");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_LAN9303, LAN9303_NAME);
+
+module_dsa_tag_driver(lan9303_netdev_ops);
diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c
index 11535bc70743..dea3eecaf093 100644
--- a/net/dsa/tag_mtk.c
+++ b/net/dsa/tag_mtk.c
@@ -1,34 +1,33 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Mediatek DSA Tag support
* Copyright (C) 2017 Landen Chao <landen.chao@mediatek.com>
* Sean Wang <sean.wang@mediatek.com>
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 and
- * only version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include <linux/etherdevice.h>
#include <linux/if_vlan.h>
-#include "dsa_priv.h"
+#include "tag.h"
+
+#define MTK_NAME "mtk"
#define MTK_HDR_LEN 4
#define MTK_HDR_XMIT_UNTAGGED 0
#define MTK_HDR_XMIT_TAGGED_TPID_8100 1
+#define MTK_HDR_XMIT_TAGGED_TPID_88A8 2
#define MTK_HDR_RECV_SOURCE_PORT_MASK GENMASK(2, 0)
#define MTK_HDR_XMIT_DP_BIT_MASK GENMASK(5, 0)
+#define MTK_HDR_XMIT_SA_DIS BIT(6)
static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
struct net_device *dev)
{
- struct dsa_port *dp = dsa_slave_to_port(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ u8 xmit_tpid;
u8 *mtk_tag;
- bool is_vlan_skb = true;
+
+ skb_set_queue_mapping(skb, dp->index);
/* Build the special tag after the MAC Source Address. If VLAN header
* is present, it's required that VLAN header and special tag is
@@ -36,26 +35,30 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
* the both special and VLAN tag at the same time and then look up VLAN
* table with VID.
*/
- if (!skb_vlan_tagged(skb)) {
- if (skb_cow_head(skb, MTK_HDR_LEN) < 0)
- return NULL;
-
+ switch (skb->protocol) {
+ case htons(ETH_P_8021Q):
+ xmit_tpid = MTK_HDR_XMIT_TAGGED_TPID_8100;
+ break;
+ case htons(ETH_P_8021AD):
+ xmit_tpid = MTK_HDR_XMIT_TAGGED_TPID_88A8;
+ break;
+ default:
+ xmit_tpid = MTK_HDR_XMIT_UNTAGGED;
skb_push(skb, MTK_HDR_LEN);
- memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN);
- is_vlan_skb = false;
+ dsa_alloc_etype_header(skb, MTK_HDR_LEN);
}
- mtk_tag = skb->data + 2 * ETH_ALEN;
+ mtk_tag = dsa_etype_header_pos_tx(skb);
/* Mark tag attribute on special tag insertion to notify hardware
* whether that's a combined special tag with 802.1Q header.
*/
- mtk_tag[0] = is_vlan_skb ? MTK_HDR_XMIT_TAGGED_TPID_8100 :
- MTK_HDR_XMIT_UNTAGGED;
- mtk_tag[1] = (1 << dp->index) & MTK_HDR_XMIT_DP_BIT_MASK;
+ mtk_tag[0] = xmit_tpid;
+ mtk_tag[1] = FIELD_PREP(MTK_HDR_XMIT_DP_BIT_MASK,
+ dsa_xmit_port_mask(skb, dev));
/* Tag control information is kept for 802.1Q */
- if (!is_vlan_skb) {
+ if (xmit_tpid == MTK_HDR_XMIT_UNTAGGED) {
mtk_tag[2] = 0;
mtk_tag[3] = 0;
}
@@ -63,50 +66,45 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb,
return skb;
}
-static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev)
{
+ u16 hdr;
int port;
- __be16 *phdr, hdr;
+ __be16 *phdr;
if (unlikely(!pskb_may_pull(skb, MTK_HDR_LEN)))
return NULL;
- /* The MTK header is added by the switch between src addr
- * and ethertype at this point, skb->data points to 2 bytes
- * after src addr so header should be 2 bytes right before.
- */
- phdr = (__be16 *)(skb->data - 2);
+ phdr = dsa_etype_header_pos_rx(skb);
hdr = ntohs(*phdr);
/* Remove MTK tag and recalculate checksum. */
skb_pull_rcsum(skb, MTK_HDR_LEN);
- memmove(skb->data - ETH_HLEN,
- skb->data - ETH_HLEN - MTK_HDR_LEN,
- 2 * ETH_ALEN);
+ dsa_strip_etype_header(skb, MTK_HDR_LEN);
/* Get source port information */
port = (hdr & MTK_HDR_RECV_SOURCE_PORT_MASK);
- skb->dev = dsa_master_find_slave(dev, 0, port);
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
if (!skb->dev)
return NULL;
- return skb;
-}
+ dsa_default_offload_fwd_mark(skb);
-static int mtk_tag_flow_dissect(const struct sk_buff *skb, __be16 *proto,
- int *offset)
-{
- *offset = 4;
- *proto = ((__be16 *)skb->data)[1];
-
- return 0;
+ return skb;
}
-const struct dsa_device_ops mtk_netdev_ops = {
+static const struct dsa_device_ops mtk_netdev_ops = {
+ .name = MTK_NAME,
+ .proto = DSA_TAG_PROTO_MTK,
.xmit = mtk_tag_xmit,
.rcv = mtk_tag_rcv,
- .flow_dissect = mtk_tag_flow_dissect,
+ .needed_headroom = MTK_HDR_LEN,
};
+
+MODULE_DESCRIPTION("DSA tag driver for Mediatek switches");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_MTK, MTK_NAME);
+
+module_dsa_tag_driver(mtk_netdev_ops);
diff --git a/net/dsa/tag_mxl-gsw1xx.c b/net/dsa/tag_mxl-gsw1xx.c
new file mode 100644
index 000000000000..60f7c445e656
--- /dev/null
+++ b/net/dsa/tag_mxl-gsw1xx.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * DSA driver Special Tag support for MaxLinear GSW1xx switch chips
+ *
+ * Copyright (C) 2025 Daniel Golle <daniel@makrotopia.org>
+ * Copyright (C) 2023 - 2024 MaxLinear Inc.
+ */
+
+#include <linux/bitops.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <net/dsa.h>
+
+#include "tag.h"
+
+/* To define the outgoing port and to discover the incoming port a special
+ * tag is used by the GSW1xx.
+ *
+ *       Dest MAC       Src MAC    special TAG        EtherType
+ * ...| 1 2 3 4 5 6 | 1 2 3 4 5 6 | 1 2 3 4 5 6 7 8 | 1 2 |...
+ *                                |<--------------->|
+ */
+
+#define GSW1XX_TAG_NAME "gsw1xx"
+
+/* special tag header length (RX and TX) */
+#define GSW1XX_HEADER_LEN 8
+
+/* Word 0 = Ethertype -> 0x88C3 */
+
+/* Word 1 */
+#define GSW1XX_TX_PORT_MAP GENMASK(7, 0)
+#define GSW1XX_TX_PORT_MAP_EN BIT(15)
+#define GSW1XX_TX_CLASS_EN BIT(14)
+#define GSW1XX_TX_TIME_STAMP_EN BIT(13)
+#define GSW1XX_TX_LRN_DIS BIT(12)
+#define GSW1XX_TX_CLASS GENMASK(11, 8)
+
+/* special tag in RX path header */
+/* Word 2 */
+#define GSW1XX_RX_PORT_MAP GENMASK(15, 8)
+
+static struct sk_buff *gsw1xx_tag_xmit(struct sk_buff *skb,
+				       struct net_device *dev)
+{
+	u16 word1;
+	__be16 *hdr;
+
+	/* Grow the frame by GSW1XX_HEADER_LEN bytes at the front ... */
+	skb_push(skb, GSW1XX_HEADER_LEN);
+
+	/* ... and open that gap right in front of the EtherType */
+	dsa_alloc_etype_header(skb, GSW1XX_HEADER_LEN);
+
+	/* Word 0 is the tag's own EtherType; word 1 carries the TX controls */
+	hdr = dsa_etype_header_pos_tx(skb);
+	hdr[0] = htons(ETH_P_MXLGSW);
+
+	word1 = GSW1XX_TX_PORT_MAP_EN | GSW1XX_TX_LRN_DIS;
+	word1 |= FIELD_PREP(GSW1XX_TX_PORT_MAP, dsa_xmit_port_mask(skb, dev));
+	hdr[1] = htons(word1);
+	hdr[2] = 0;
+	hdr[3] = 0;
+
+	return skb;
+}
+
+static struct sk_buff *gsw1xx_tag_rcv(struct sk_buff *skb,
+				      struct net_device *dev)
+{
+	__be16 *hdr;
+	int src_port;
+
+	if (unlikely(!pskb_may_pull(skb, GSW1XX_HEADER_LEN))) {
+		dev_warn_ratelimited(&dev->dev, "Dropping packet, cannot pull SKB\n");
+		return NULL;
+	}
+
+	hdr = dsa_etype_header_pos_rx(skb);
+
+	if (unlikely(hdr[0] != htons(ETH_P_MXLGSW))) {
+		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid special tag\n");
+		dev_warn_ratelimited(&dev->dev, "Tag: %8ph\n", hdr);
+		return NULL;
+	}
+
+	/* The ingress port is taken from bits 15:8 of the second tag word */
+	src_port = FIELD_GET(GSW1XX_RX_PORT_MAP, ntohs(hdr[1]));
+	skb->dev = dsa_conduit_find_user(dev, 0, src_port);
+	if (!skb->dev) {
+		dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n");
+		dev_warn_ratelimited(&dev->dev, "Tag: %8ph\n", hdr);
+		return NULL;
+	}
+
+	/* Tag fully parsed: pull it out of the checksummed data and close
+	 * the gap it occupied in front of the real EtherType.
+	 */
+	skb_pull_rcsum(skb, GSW1XX_HEADER_LEN);
+	dsa_strip_etype_header(skb, GSW1XX_HEADER_LEN);
+
+	return skb;
+}
+
+static const struct dsa_device_ops gsw1xx_netdev_ops = {
+ .name = GSW1XX_TAG_NAME,
+ .proto = DSA_TAG_PROTO_MXL_GSW1XX,
+ .xmit = gsw1xx_tag_xmit,
+ .rcv = gsw1xx_tag_rcv,
+ .needed_headroom = GSW1XX_HEADER_LEN, /* bytes gsw1xx_tag_xmit() pushes */
+};
+
+MODULE_DESCRIPTION("DSA tag driver for MaxLinear GSW1xx 8 byte protocol");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_MXL_GSW1XX, GSW1XX_TAG_NAME);
+
+module_dsa_tag_driver(gsw1xx_netdev_ops);
diff --git a/net/dsa/tag_none.c b/net/dsa/tag_none.c
new file mode 100644
index 000000000000..e9c9670a9c44
--- /dev/null
+++ b/net/dsa/tag_none.c
@@ -0,0 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * net/dsa/tag_none.c - Traffic handling for switches with no tag
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ * Copyright (c) 2013 Florian Fainelli <florian@openwrt.org>
+ *
+ * WARNING: do not use this for new switches. In case of no hardware
+ * tagging support, look at tag_8021q.c instead.
+ */
+
+#include "tag.h"
+
+#define NONE_NAME "none"
+
+static struct sk_buff *dsa_user_notag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ /* No tag is added on transmit: hand the SKB back untouched */
+ return skb;
+}
+
+static const struct dsa_device_ops none_ops = {
+ .name = NONE_NAME,
+ .proto = DSA_TAG_PROTO_NONE,
+ .xmit = dsa_user_notag_xmit, /* pass-through; no .rcv is provided */
+};
+
+module_dsa_tag_driver(none_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_NONE, NONE_NAME);
+MODULE_DESCRIPTION("DSA no-op tag driver");
+MODULE_LICENSE("GPL");
diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
new file mode 100644
index 000000000000..3405def79c2d
--- /dev/null
+++ b/net/dsa/tag_ocelot.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2019 NXP
+ */
+#include <linux/dsa/ocelot.h>
+
+#include "tag.h"
+
+#define OCELOT_NAME "ocelot"
+#define SEVILLE_NAME "seville"
+
+static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev,
+			       __be32 ifh_prefix, void **ifh)
+{
+	struct dsa_port *dp = dsa_user_to_port(netdev);
+	u64 vlan_tci, tag_type, qos_class;
+	__be32 *short_prefix;
+	void *hdr;
+	u32 rew_op;
+
+	ocelot_xmit_get_vlan_info(skb, dsa_port_bridge_dev_get(dp), &vlan_tci,
+				  &tag_type);
+
+	/* skb->priority goes through the prio->TC map only if TCs are set up */
+	qos_class = netdev_get_num_tc(netdev) ?
+		    netdev_get_prio_tc_map(netdev, skb->priority) : skb->priority;
+
+	/* Pushed back to front: the short prefix lands ahead of the IFH */
+	hdr = skb_push(skb, OCELOT_TAG_LEN);
+	short_prefix = skb_push(skb, OCELOT_SHORT_PREFIX_LEN);
+	*short_prefix = ifh_prefix;
+
+	memset(hdr, 0, OCELOT_TAG_LEN);
+	ocelot_ifh_set_bypass(hdr, 1);
+	ocelot_ifh_set_src(hdr, dp->ds->num_ports);
+	ocelot_ifh_set_qos_class(hdr, qos_class);
+	ocelot_ifh_set_vlan_tci(hdr, vlan_tci);
+	ocelot_ifh_set_tag_type(hdr, tag_type);
+
+	rew_op = ocelot_ptp_rew_op(skb);
+	if (rew_op)
+		ocelot_ifh_set_rew_op(hdr, rew_op);
+
+	*ifh = hdr;
+}
+
+static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
+				   struct net_device *netdev)
+{
+	void *ifh;
+
+	/* Common IFH setup, then the Ocelot-format destination port mask */
+	ocelot_xmit_common(skb, netdev, cpu_to_be32(0x8880000a), &ifh);
+	ocelot_ifh_set_dest(ifh, dsa_xmit_port_mask(skb, netdev));
+	return skb;
+}
+
+static struct sk_buff *seville_xmit(struct sk_buff *skb,
+				    struct net_device *netdev)
+{
+	void *ifh;
+
+	/* Common IFH setup, then the Seville-format destination port mask */
+	ocelot_xmit_common(skb, netdev, cpu_to_be32(0x88800005), &ifh);
+	seville_ifh_set_dest(ifh, dsa_xmit_port_mask(skb, netdev));
+	return skb;
+}
+
+static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ u64 src_port, qos_class;
+ u64 vlan_tci, tag_type;
+ u8 *start = skb->data; /* data pointer on entry; base of the csum fixup below */
+ struct dsa_port *dp;
+ u8 *extraction; /* start of the extraction frame header, once located */
+ u16 vlan_tpid;
+ u64 rew_val;
+
+ /* Revert skb->data by the amount consumed by the DSA conduit,
+ * so it points to the beginning of the frame.
+ */
+ skb_push(skb, ETH_HLEN);
+ /* We don't care about the short prefix, it is just for easy entrance
+ * into the DSA conduit's RX filter. Discard it now by moving it into
+ * the headroom.
+ */
+ skb_pull(skb, OCELOT_SHORT_PREFIX_LEN);
+ /* And skb->data now points to the extraction frame header.
+ * Keep a pointer to it.
+ */
+ extraction = skb->data;
+ /* Now the EFH is part of the headroom as well */
+ skb_pull(skb, OCELOT_TAG_LEN);
+ /* Reset the pointer to the real MAC header */
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+ /* And move skb->data to the correct location again */
+ skb_pull(skb, ETH_HLEN);
+
+ /* Take the OCELOT_TOTAL_TAG_LEN bytes found at 'start' out of the csum */
+ skb_postpull_rcsum(skb, start, OCELOT_TOTAL_TAG_LEN);
+
+ ocelot_xfh_get_src_port(extraction, &src_port);
+ ocelot_xfh_get_qos_class(extraction, &qos_class);
+ ocelot_xfh_get_tag_type(extraction, &tag_type);
+ ocelot_xfh_get_vlan_tci(extraction, &vlan_tci);
+ ocelot_xfh_get_rew_val(extraction, &rew_val);
+
+ skb->dev = dsa_conduit_find_user(netdev, 0, src_port);
+ if (!skb->dev)
+ /* The switch will reflect back some frames sent through
+ * sockets opened on the bare DSA conduit. These will come back
+ * with src_port equal to the index of the CPU port, for which
+ * there is no user registered. So don't print any error
+ * message here (ignore and drop those frames).
+ */
+ return NULL;
+
+ dsa_default_offload_fwd_mark(skb);
+ skb->priority = qos_class;
+ OCELOT_SKB_CB(skb)->tstamp_lo = rew_val;
+
+ /* Ocelot switches copy frames unmodified to the CPU. However, it is
+ * possible for the user to request a VLAN modification through
+ * VCAP_IS1_ACT_VID_REPLACE_ENA. In this case, what will happen is that
+ * the VLAN ID field from the Extraction Header gets updated, but the
+ * 802.1Q header does not (the classified VLAN only becomes visible on
+ * egress through the "port tag" of front-panel ports).
+ * So, for traffic extracted by the CPU, we want to pick up the
+ * classified VLAN and manually replace the existing 802.1Q header from
+ * the packet with it, so that the operating system is always up to
+ * date with the result of tc-vlan actions.
+ * NOTE: In VLAN-unaware mode, we don't want to do that, we want the
+ * frame to remain unmodified, because the classified VLAN is always
+ * equal to the pvid of the ingress port and should not be used for
+ * processing.
+ */
+ dp = dsa_user_to_port(skb->dev);
+ vlan_tpid = tag_type ? ETH_P_8021AD : ETH_P_8021Q;
+
+ if (dsa_port_is_vlan_filtering(dp) &&
+ eth_hdr(skb)->h_proto == htons(vlan_tpid)) {
+ u16 dummy_vlan_tci; /* popped TCI is discarded; vlan_tci from the header is used */
+
+ skb_push_rcsum(skb, ETH_HLEN);
+ __skb_vlan_pop(skb, &dummy_vlan_tci);
+ skb_pull_rcsum(skb, ETH_HLEN);
+ __vlan_hwaccel_put_tag(skb, htons(vlan_tpid), vlan_tci);
+ }
+
+ return skb;
+}
+
+static const struct dsa_device_ops ocelot_netdev_ops = {
+ .name = OCELOT_NAME,
+ .proto = DSA_TAG_PROTO_OCELOT,
+ .xmit = ocelot_xmit,
+ .rcv = ocelot_rcv, /* same receive handler as seville_netdev_ops */
+ .needed_headroom = OCELOT_TOTAL_TAG_LEN,
+ .promisc_on_conduit = true,
+};
+
+DSA_TAG_DRIVER(ocelot_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT, OCELOT_NAME);
+
+static const struct dsa_device_ops seville_netdev_ops = {
+ .name = SEVILLE_NAME,
+ .proto = DSA_TAG_PROTO_SEVILLE,
+ .xmit = seville_xmit, /* differs from ocelot only in how the dest field is set */
+ .rcv = ocelot_rcv, /* same receive handler as ocelot_netdev_ops */
+ .needed_headroom = OCELOT_TOTAL_TAG_LEN,
+ .promisc_on_conduit = true,
+};
+
+DSA_TAG_DRIVER(seville_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SEVILLE, SEVILLE_NAME);
+
+static struct dsa_tag_driver *ocelot_tag_driver_array[] = {
+ &DSA_TAG_DRIVER_NAME(ocelot_netdev_ops), /* OCELOT_NAME */
+ &DSA_TAG_DRIVER_NAME(seville_netdev_ops), /* SEVILLE_NAME */
+};
+
+module_dsa_tag_drivers(ocelot_tag_driver_array);
+
+MODULE_DESCRIPTION("DSA tag driver for Ocelot family of switches, using NPI port");
+MODULE_LICENSE("GPL v2");
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
new file mode 100644
index 000000000000..3929584791e4
--- /dev/null
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2020-2021 NXP
+ *
+ * An implementation of the software-defined tag_8021q.c tagger format, which
+ * also preserves full functionality under a vlan_filtering bridge. It does
+ * this by using the TCAM engines for:
+ * - pushing the RX VLAN as a second, outer tag, on egress towards the CPU port
+ * - redirecting towards the correct front port based on TX VLAN and popping
+ * that on egress
+ */
+#include <linux/dsa/8021q.h>
+#include <linux/dsa/ocelot.h>
+
+#include "tag.h"
+#include "tag_8021q.h"
+
+#define OCELOT_8021Q_NAME "ocelot-8021q"
+
+struct ocelot_8021q_tagger_private {
+ struct ocelot_8021q_tagger_data data; /* Must be first */
+ struct kthread_worker *xmit_worker; /* created in ocelot_connect(); ocelot_defer_xmit() queues onto it */
+};
+
+static struct sk_buff *ocelot_defer_xmit(struct dsa_port *dp,
+					 struct sk_buff *skb)
+{
+	struct ocelot_8021q_tagger_private *priv = dp->ds->tagger_data;
+	void (*work_fn)(struct kthread_work *work);
+	struct felix_deferred_xmit_work *deferred;
+	struct kthread_worker *worker;
+
+	work_fn = priv->data.xmit_work_fn;
+	worker = priv->xmit_worker;
+
+	if (!work_fn || !worker)
+		return NULL;
+
+	/* This frame is not sent through the DSA conduit, so a checksum
+	 * left for offload (NETIF_F_HW_CSUM may have been inherited from
+	 * the conduit) has to be computed here. PTP over IP packets need
+	 * their UDP checksum.
+	 */
+	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+		return NULL;
+
+	deferred = kzalloc(sizeof(*deferred), GFP_ATOMIC);
+	if (!deferred)
+		return NULL;
+
+	/* work_fn ends up calling felix_port_deferred_xmit in felix.c */
+	kthread_init_work(&deferred->work, work_fn);
+	deferred->dp = dp;
+	/* Take an extra reference so the kfree_skb in dsa_user_xmit
+	 * won't really free the packet.
+	 */
+	deferred->skb = skb_get(skb);
+
+	kthread_queue_work(worker, &deferred->work);
+
+	return NULL;
+}
+
+static struct sk_buff *ocelot_xmit(struct sk_buff *skb,
+				   struct net_device *netdev)
+{
+	u8 prio = netdev_txq_to_tc(netdev, skb_get_queue_mapping(skb));
+	struct dsa_port *dp = dsa_user_to_port(netdev);
+	u16 vid = dsa_tag_8021q_standalone_vid(dp);
+	struct ethhdr *eth = eth_hdr(skb);
+
+	/* PTP rewrite requests and link-local frames take the deferred path */
+	if (ocelot_ptp_rew_op(skb) || is_link_local_ether_addr(eth->h_dest))
+		return ocelot_defer_xmit(dp, skb);
+
+	return dsa_8021q_xmit(skb, netdev, ETH_P_8021Q,
+			      ((prio << VLAN_PRIO_SHIFT) | vid));
+}
+
+static struct sk_buff *ocelot_rcv(struct sk_buff *skb,
+				  struct net_device *netdev)
+{
+	int sw_id = -1;
+	int port = -1;
+
+	/* Source port/switch come back through the -1-initialized outputs */
+	dsa_8021q_rcv(skb, &port, &sw_id, NULL, NULL);
+	skb->dev = dsa_conduit_find_user(netdev, sw_id, port);
+	if (!skb->dev)
+		return NULL;
+
+	dsa_default_offload_fwd_mark(skb);
+	return skb;
+}
+
+static void ocelot_disconnect(struct dsa_switch *ds)
+{
+	struct ocelot_8021q_tagger_private *tagger = ds->tagger_data;
+	/* Undo ocelot_connect(): stop the worker, then release the state */
+	kthread_destroy_worker(tagger->xmit_worker);
+	kfree(tagger);
+	ds->tagger_data = NULL;
+}
+
+static int ocelot_connect(struct dsa_switch *ds)
+{
+	struct ocelot_8021q_tagger_private *tagger;
+	struct kthread_worker *worker;
+
+	tagger = kzalloc(sizeof(*tagger), GFP_KERNEL);
+	if (!tagger)
+		return -ENOMEM;
+
+	/* Worker that ocelot_defer_xmit() queues its work items onto */
+	worker = kthread_run_worker(0, "felix_xmit");
+	if (IS_ERR(worker)) {
+		kfree(tagger);
+		return PTR_ERR(worker);
+	}
+
+	tagger->xmit_worker = worker;
+	ds->tagger_data = tagger;
+	return 0;
+}
+
+static const struct dsa_device_ops ocelot_8021q_netdev_ops = {
+ .name = OCELOT_8021Q_NAME,
+ .proto = DSA_TAG_PROTO_OCELOT_8021Q,
+ .xmit = ocelot_xmit,
+ .rcv = ocelot_rcv,
+ .connect = ocelot_connect, /* allocates tagger_data and starts the xmit worker */
+ .disconnect = ocelot_disconnect, /* destroys the worker and frees tagger_data */
+ .needed_headroom = VLAN_HLEN,
+ .promisc_on_conduit = true,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for Ocelot family of switches, using VLAN");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_OCELOT_8021Q, OCELOT_8021Q_NAME);
+
+module_dsa_tag_driver(ocelot_8021q_netdev_ops);
diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c
index 613f4ee97771..6d56a28c914c 100644
--- a/net/dsa/tag_qca.c
+++ b/net/dsa/tag_qca.c
@@ -1,104 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2015, The Linux Foundation. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 and
- * only version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include <linux/etherdevice.h>
+#include <linux/bitfield.h>
+#include <net/dsa.h>
+#include <linux/dsa/tag_qca.h>
-#include "dsa_priv.h"
-
-#define QCA_HDR_LEN 2
-#define QCA_HDR_VERSION 0x2
-
-#define QCA_HDR_RECV_VERSION_MASK GENMASK(15, 14)
-#define QCA_HDR_RECV_VERSION_S 14
-#define QCA_HDR_RECV_PRIORITY_MASK GENMASK(13, 11)
-#define QCA_HDR_RECV_PRIORITY_S 11
-#define QCA_HDR_RECV_TYPE_MASK GENMASK(10, 6)
-#define QCA_HDR_RECV_TYPE_S 6
-#define QCA_HDR_RECV_FRAME_IS_TAGGED BIT(3)
-#define QCA_HDR_RECV_SOURCE_PORT_MASK GENMASK(2, 0)
-
-#define QCA_HDR_XMIT_VERSION_MASK GENMASK(15, 14)
-#define QCA_HDR_XMIT_VERSION_S 14
-#define QCA_HDR_XMIT_PRIORITY_MASK GENMASK(13, 11)
-#define QCA_HDR_XMIT_PRIORITY_S 11
-#define QCA_HDR_XMIT_CONTROL_MASK GENMASK(10, 8)
-#define QCA_HDR_XMIT_CONTROL_S 8
-#define QCA_HDR_XMIT_FROM_CPU BIT(7)
-#define QCA_HDR_XMIT_DP_BIT_MASK GENMASK(6, 0)
+#include "tag.h"
+
+#define QCA_NAME "qca"
static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev)
{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- u16 *phdr, hdr;
-
- dev->stats.tx_packets++;
- dev->stats.tx_bytes += skb->len;
-
- if (skb_cow_head(skb, 0) < 0)
- return NULL;
+ __be16 *phdr;
+ u16 hdr;
skb_push(skb, QCA_HDR_LEN);
- memmove(skb->data, skb->data + QCA_HDR_LEN, 2 * ETH_ALEN);
- phdr = (u16 *)(skb->data + 2 * ETH_ALEN);
+ dsa_alloc_etype_header(skb, QCA_HDR_LEN);
+ phdr = dsa_etype_header_pos_tx(skb);
/* Set the version field, and set destination port information */
- hdr = QCA_HDR_VERSION << QCA_HDR_XMIT_VERSION_S |
- QCA_HDR_XMIT_FROM_CPU | BIT(dp->index);
+ hdr = FIELD_PREP(QCA_HDR_XMIT_VERSION, QCA_HDR_VERSION);
+ hdr |= QCA_HDR_XMIT_FROM_CPU;
+ hdr |= FIELD_PREP(QCA_HDR_XMIT_DP_BIT, dsa_xmit_port_mask(skb, dev));
*phdr = htons(hdr);
return skb;
}
-static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev)
{
- u8 ver;
+ struct qca_tagger_data *tagger_data;
+ struct dsa_port *dp = dev->dsa_ptr;
+ struct dsa_switch *ds = dp->ds;
+ u8 ver, pk_type;
+ __be16 *phdr;
int port;
- __be16 *phdr, hdr;
+ u16 hdr;
+
+ BUILD_BUG_ON(sizeof(struct qca_mgmt_ethhdr) != QCA_HDR_MGMT_HEADER_LEN + QCA_HDR_LEN);
+
+ tagger_data = ds->tagger_data;
if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN)))
return NULL;
- /* The QCA header is added by the switch between src addr and Ethertype
- * At this point, skb->data points to ethertype so header should be
- * right before
- */
- phdr = (__be16 *)(skb->data - 2);
+ phdr = dsa_etype_header_pos_rx(skb);
hdr = ntohs(*phdr);
/* Make sure the version is correct */
- ver = (hdr & QCA_HDR_RECV_VERSION_MASK) >> QCA_HDR_RECV_VERSION_S;
+ ver = FIELD_GET(QCA_HDR_RECV_VERSION, hdr);
if (unlikely(ver != QCA_HDR_VERSION))
return NULL;
- /* Remove QCA tag and recalculate checksum */
- skb_pull_rcsum(skb, QCA_HDR_LEN);
- memmove(skb->data - ETH_HLEN, skb->data - ETH_HLEN - QCA_HDR_LEN,
- ETH_HLEN - QCA_HDR_LEN);
+ /* Get pk type */
+ pk_type = FIELD_GET(QCA_HDR_RECV_TYPE, hdr);
+
+ /* Ethernet mgmt read/write packet */
+ if (pk_type == QCA_HDR_RECV_TYPE_RW_REG_ACK) {
+ if (likely(tagger_data->rw_reg_ack_handler))
+ tagger_data->rw_reg_ack_handler(ds, skb);
+ return NULL;
+ }
+
+ /* Ethernet MIB counter packet */
+ if (pk_type == QCA_HDR_RECV_TYPE_MIB) {
+ if (likely(tagger_data->mib_autocast_handler))
+ tagger_data->mib_autocast_handler(ds, skb);
+ return NULL;
+ }
/* Get source port information */
- port = (hdr & QCA_HDR_RECV_SOURCE_PORT_MASK);
+ port = FIELD_GET(QCA_HDR_RECV_SOURCE_PORT, hdr);
- skb->dev = dsa_master_find_slave(dev, 0, port);
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
if (!skb->dev)
return NULL;
+ /* Remove QCA tag and recalculate checksum */
+ skb_pull_rcsum(skb, QCA_HDR_LEN);
+ dsa_strip_etype_header(skb, QCA_HDR_LEN);
+
return skb;
}
-const struct dsa_device_ops qca_netdev_ops = {
+static int qca_tag_connect(struct dsa_switch *ds)
+{
+	struct qca_tagger_data *data = kzalloc(sizeof(*data), GFP_KERNEL);
+
+	if (!data)
+		return -ENOMEM;
+
+	/* Zeroed: handler pointers read by qca_tag_rcv() start out NULL */
+	ds->tagger_data = data;
+
+	return 0;
+}
+
+static void qca_tag_disconnect(struct dsa_switch *ds)
+{
+ kfree(ds->tagger_data); /* allocated by qca_tag_connect() */
+ ds->tagger_data = NULL; /* leave no stale pointer behind */
+}
+
+static const struct dsa_device_ops qca_netdev_ops = {
+ .name = QCA_NAME,
+ .proto = DSA_TAG_PROTO_QCA,
+ .connect = qca_tag_connect,
+ .disconnect = qca_tag_disconnect,
.xmit = qca_tag_xmit,
.rcv = qca_tag_rcv,
+ .needed_headroom = QCA_HDR_LEN,
+ .promisc_on_conduit = true,
};
+
+MODULE_DESCRIPTION("DSA tag driver for Qualcomm Atheros QCA8K switches");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_QCA, QCA_NAME);
+
+module_dsa_tag_driver(qca_netdev_ops);
diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c
new file mode 100644
index 000000000000..3cc63eacfa03
--- /dev/null
+++ b/net/dsa/tag_rtl4_a.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Handler for Realtek 4 byte DSA switch tags
+ * Currently only supports protocol "A" found in RTL8366RB
+ * Copyright (c) 2020 Linus Walleij <linus.walleij@linaro.org>
+ *
+ * This "proprietary tag" header looks like so:
+ *
+ * -------------------------------------------------
+ * | MAC DA | MAC SA | 0x8899 | 2 bytes tag | Type |
+ * -------------------------------------------------
+ *
+ * The 2 bytes tag form a 16 bit big endian word. The exact
+ * meaning has been guessed from packet dumps from ingress
+ * frames.
+ */
+
+#include <linux/etherdevice.h>
+#include <linux/bits.h>
+
+#include "tag.h"
+
+#define RTL4_A_NAME "rtl4a"
+
+#define RTL4_A_HDR_LEN 4
+#define RTL4_A_PROTOCOL_SHIFT 12
+/*
+ * 0x1 = Realtek Remote Control protocol (RRCP)
+ * 0x2/0x3 seems to be used for loopback testing
+ * 0x9 = RTL8306 DSA protocol
+ * 0xa = RTL8366RB DSA protocol
+ */
+#define RTL4_A_PROTOCOL_RTL8366RB 0xa
+
+static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb,
+				      struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	__be16 *tag;
+	u16 word;
+
+	/* Pad short frames up to ETH_ZLEN (60 bytes) before tagging */
+	if (unlikely(__skb_put_padto(skb, ETH_ZLEN, false)))
+		return NULL;
+
+	netdev_dbg(dev, "add realtek tag to package to port %d\n",
+		   dp->index);
+
+	/* Make RTL4_A_HDR_LEN bytes of room just ahead of the EtherType */
+	skb_push(skb, RTL4_A_HDR_LEN);
+	dsa_alloc_etype_header(skb, RTL4_A_HDR_LEN);
+	tag = dsa_etype_header_pos_tx(skb);
+
+	/* First 16 bits: the Realtek EtherType */
+	tag[0] = htons(ETH_P_REALTEK);
+
+	/* Second 16 bits: protocol in the top nibble, destination port
+	 * mask in the bits below it.
+	 */
+	word = RTL4_A_PROTOCOL_RTL8366RB << RTL4_A_PROTOCOL_SHIFT;
+	word |= dsa_xmit_port_mask(skb, dev);
+
+	tag[1] = htons(word);
+
+	return skb;
+}
+
+static struct sk_buff *rtl4a_tag_rcv(struct sk_buff *skb,
+				     struct net_device *dev)
+{
+	u16 protport;
+	__be16 *p;
+	u16 etype;
+	u8 *tag;
+	u8 prot;
+	u8 port;
+
+	if (unlikely(!pskb_may_pull(skb, RTL4_A_HDR_LEN)))
+		return NULL;
+
+	tag = dsa_etype_header_pos_rx(skb);
+	p = (__be16 *)tag;
+	etype = ntohs(*p);
+	if (etype != ETH_P_REALTEK) {
+		/* Not custom, just pass through (skb->dev is left as is) */
+		netdev_dbg(dev, "non-realtek ethertype 0x%04x\n", etype);
+		return skb;
+	}
+	p = (__be16 *)(tag + 2);
+	protport = ntohs(*p);
+	/* Top 4 bits: protocol. Ratelimited: any received frame can hit it */
+	prot = (protport >> RTL4_A_PROTOCOL_SHIFT) & 0x0f;
+	if (prot != RTL4_A_PROTOCOL_RTL8366RB) {
+		dev_err_ratelimited(&dev->dev, "unknown realtek protocol 0x%01x\n", prot);
+		return NULL;
+	}
+	port = protport & 0xff;
+
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
+	if (!skb->dev) {
+		netdev_dbg(dev, "could not find user for port %d\n", port);
+		return NULL;
+	}
+
+	/* Remove RTL4 tag and recalculate checksum */
+	skb_pull_rcsum(skb, RTL4_A_HDR_LEN);
+
+	dsa_strip_etype_header(skb, RTL4_A_HDR_LEN);
+
+	dsa_default_offload_fwd_mark(skb);
+
+	return skb;
+}
+
+static const struct dsa_device_ops rtl4a_netdev_ops = {
+ .name = RTL4_A_NAME,
+ .proto = DSA_TAG_PROTO_RTL4_A,
+ .xmit = rtl4a_tag_xmit,
+ .rcv = rtl4a_tag_rcv,
+ .needed_headroom = RTL4_A_HDR_LEN, /* bytes rtl4a_tag_xmit() pushes */
+};
+module_dsa_tag_driver(rtl4a_netdev_ops);
+
+MODULE_DESCRIPTION("DSA tag driver for Realtek 4 byte protocol A tags");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL4_A, RTL4_A_NAME);
diff --git a/net/dsa/tag_rtl8_4.c b/net/dsa/tag_rtl8_4.c
new file mode 100644
index 000000000000..2464545da4d2
--- /dev/null
+++ b/net/dsa/tag_rtl8_4.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Handler for Realtek 8 byte switch tags
+ *
+ * Copyright (C) 2021 Alvin Šipraga <alsi@bang-olufsen.dk>
+ *
+ * NOTE: Currently only supports protocol "4" found in the RTL8365MB, hence
+ * named tag_rtl8_4.
+ *
+ * This tag has the following format:
+ *
+ *  0                                  7|8                                 15
+ *  |-----------------------------------+-----------------------------------|---
+ *  |                               (16-bit)                                | ^
+ *  |                      Realtek EtherType [0x8899]                       | |
+ *  |-----------------------------------+-----------------------------------| 8
+ *  |              (8-bit)              |              (8-bit)              |
+ *  |          Protocol [0x04]          |              REASON               | b
+ *  |-----------------------------------+-----------------------------------| y
+ *  |   (1)  | (1) | (2) |   (1)  | (3) | (1)  | (1) |    (1)    |   (5)    | t
+ *  | FID_EN |  X  | FID | PRI_EN | PRI | KEEP |  X  | LEARN_DIS |    X     | e
+ *  |-----------------------------------+-----------------------------------| s
+ *  |   (1)  |                           (15-bit)                           | |
+ *  |  ALLOW |                            TX/RX                             | v
+ *  |-----------------------------------+-----------------------------------|---
+ *
+ * With the following field descriptions:
+ *
+ *    field      | description
+ *   ------------+-------------
+ *    Realtek    | 0x8899: indicates that this is a proprietary Realtek tag;
+ *    EtherType  |         note that Realtek uses the same EtherType for
+ *               |         other incompatible tag formats (e.g. tag_rtl4_a.c)
+ *    Protocol   | 0x04: indicates that this tag conforms to this format
+ *    X          | reserved
+ *   ------------+-------------
+ *    REASON     | reason for forwarding packet to CPU
+ *               | 0: packet was forwarded or flooded to CPU
+ *               | 80: packet was trapped to CPU
+ *    FID_EN     | 1: packet has an FID
+ *               | 0: no FID
+ *    FID        | FID of packet (if FID_EN=1)
+ *    PRI_EN     | 1: force priority of packet
+ *               | 0: don't force priority
+ *    PRI        | priority of packet (if PRI_EN=1)
+ *    KEEP       | preserve packet VLAN tag format
+ *    LEARN_DIS  | don't learn the source MAC address of the packet
+ *    ALLOW      | 1: treat TX/RX field as an allowance port mask, meaning the
+ *               |    packet may only be forwarded to ports specified in the
+ *               |    mask
+ *               | 0: no allowance port mask, TX/RX field is the forwarding
+ *               |    port mask
+ *    TX/RX      | TX (switch->CPU): port number the packet was received on
+ *               | RX (CPU->switch): forwarding port mask (if ALLOW=0)
+ *               |                   allowance port mask (if ALLOW=1)
+ *
+ * The tag can be positioned before Ethertype, using tag "rtl8_4":
+ *
+ * +--------+--------+------------+------+-----
+ * | MAC DA | MAC SA | 8 byte tag | Type | ...
+ * +--------+--------+------------+------+-----
+ *
+ * The tag can also appear between the end of the payload and before the CRC,
+ * using tag "rtl8_4t":
+ *
+ * +--------+--------+------+-----+---------+------------+-----+
+ * | MAC DA | MAC SA | TYPE | ... | payload | 8-byte tag | CRC |
+ * +--------+--------+------+-----+---------+------------+-----+
+ *
+ * The added bytes after the payload will break most checksums, either in
+ * software or hardware. To avoid this issue, if the checksum is still pending,
+ * this tagger checksums the packet in software before adding the tag.
+ *
+ */
+
+#include <linux/bitfield.h>
+#include <linux/bits.h>
+#include <linux/etherdevice.h>
+
+#include "tag.h"
+
+/* Protocols supported:
+ *
+ * 0x04 = RTL8365MB DSA protocol
+ */
+
+#define RTL8_4_NAME "rtl8_4"
+#define RTL8_4T_NAME "rtl8_4t"
+
+#define RTL8_4_TAG_LEN 8
+
+#define RTL8_4_PROTOCOL GENMASK(15, 8)
+#define RTL8_4_PROTOCOL_RTL8365MB 0x04
+#define RTL8_4_REASON GENMASK(7, 0)
+#define RTL8_4_REASON_FORWARD 0
+#define RTL8_4_REASON_TRAP 80
+
+#define RTL8_4_LEARN_DIS BIT(5)
+
+#define RTL8_4_TX GENMASK(3, 0)
+#define RTL8_4_RX GENMASK(10, 0)
+
+static void rtl8_4_write_tag(struct sk_buff *skb, struct net_device *dev,
+			     void *tag)
+{
+	/* Assemble the four 16-bit tag words locally, then copy them out */
+	const __be16 words[RTL8_4_TAG_LEN / 2] = {
+		/* Realtek EtherType */
+		[0] = htons(ETH_P_REALTEK),
+		/* Protocol, with REASON left at zero */
+		[1] = htons(FIELD_PREP(RTL8_4_PROTOCOL, RTL8_4_PROTOCOL_RTL8365MB)),
+		/* LEARN_DIS set; FID_EN, FID, PRI_EN, PRI and KEEP all zero */
+		[2] = htons(FIELD_PREP(RTL8_4_LEARN_DIS, 1)),
+		/* ALLOW clear; RX (CPU->switch) forwarding port mask */
+		[3] = htons(FIELD_PREP(RTL8_4_RX,
+				       dsa_xmit_port_mask(skb, dev))),
+	};
+
+	/* Bytewise copy, presumably because @tag may be unaligned - confirm */
+	memcpy(tag, words, RTL8_4_TAG_LEN);
+}
+
+static struct sk_buff *rtl8_4_tag_xmit(struct sk_buff *skb,
+				       struct net_device *dev)
+{
+	/* Open an RTL8_4_TAG_LEN-byte gap in front of the EtherType ... */
+	skb_push(skb, RTL8_4_TAG_LEN);
+	dsa_alloc_etype_header(skb, RTL8_4_TAG_LEN);
+
+	/* ... and fill it with the tag */
+	rtl8_4_write_tag(skb, dev, dsa_etype_header_pos_tx(skb));
+	return skb;
+}
+
+static struct sk_buff *rtl8_4t_tag_xmit(struct sk_buff *skb,
+					struct net_device *dev)
+{
+	void *tail_tag;
+
+	/* Trailing tags break a later checksum: finish a pending one now */
+	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
+		return NULL;
+
+	tail_tag = skb_put(skb, RTL8_4_TAG_LEN);
+	rtl8_4_write_tag(skb, dev, tail_tag);
+	return skb;
+}
+
+static int rtl8_4_read_tag(struct sk_buff *skb, struct net_device *dev,
+			   void *tag)
+{
+	__be16 raw[RTL8_4_TAG_LEN / 2];
+	u16 etype, proto_reason;
+	u8 proto, reason, port;
+
+	memcpy(raw, tag, RTL8_4_TAG_LEN);
+	etype = ntohs(raw[0]);
+	proto_reason = ntohs(raw[1]);
+
+	/* Word 0 must carry the Realtek EtherType */
+	if (unlikely(etype != ETH_P_REALTEK)) {
+		dev_warn_ratelimited(&dev->dev,
+				     "non-realtek ethertype 0x%04x\n", etype);
+		return -EPROTO;
+	}
+
+	/* Word 1, upper byte: only the RTL8365MB protocol is accepted */
+	proto = FIELD_GET(RTL8_4_PROTOCOL, proto_reason);
+	if (unlikely(proto != RTL8_4_PROTOCOL_RTL8365MB)) {
+		dev_warn_ratelimited(&dev->dev,
+				     "unknown realtek protocol 0x%02x\n",
+				     proto);
+		return -EPROTO;
+	}
+
+	/* Word 1, lower byte: REASON */
+	reason = FIELD_GET(RTL8_4_REASON, proto_reason);
+
+	/* Word 3, low bits: TX (switch->CPU) source port */
+	port = FIELD_GET(RTL8_4_TX, ntohs(raw[3]));
+	skb->dev = dsa_conduit_find_user(dev, 0, port);
+	if (!skb->dev) {
+		dev_warn_ratelimited(&dev->dev,
+				     "could not find user for port %d\n",
+				     port);
+		return -ENOENT;
+	}
+
+	/* Every frame except a trapped one gets the default offload mark */
+	if (reason != RTL8_4_REASON_TRAP)
+		dsa_default_offload_fwd_mark(skb);
+
+	return 0;
+}
+
+static struct sk_buff *rtl8_4_tag_rcv(struct sk_buff *skb,
+				      struct net_device *dev)
+{
+	void *tag;
+
+	if (unlikely(!pskb_may_pull(skb, RTL8_4_TAG_LEN)))
+		return NULL;
+	tag = dsa_etype_header_pos_rx(skb);
+	if (unlikely(rtl8_4_read_tag(skb, dev, tag)))
+		return NULL;
+
+	/* Tag parsed: pull it, fixing up the checksum, and close the gap */
+	skb_pull_rcsum(skb, RTL8_4_TAG_LEN);
+	dsa_strip_etype_header(skb, RTL8_4_TAG_LEN);
+	return skb;
+}
+
+static struct sk_buff *rtl8_4t_tag_rcv(struct sk_buff *skb,
+				       struct net_device *dev)
+{
+	void *tail_tag;
+
+	/* The tag is read from the very end of the linearized frame */
+	if (skb_linearize(skb))
+		return NULL;
+	tail_tag = skb_tail_pointer(skb) - RTL8_4_TAG_LEN;
+	if (unlikely(rtl8_4_read_tag(skb, dev, tail_tag)))
+		return NULL;
+
+	return pskb_trim_rcsum(skb, skb->len - RTL8_4_TAG_LEN) ? NULL : skb;
+}
+
+/* Ethertype version: the tag sits between the MAC SA and the EtherType */
+static const struct dsa_device_ops rtl8_4_netdev_ops = {
+	.name = RTL8_4_NAME,	/* same macro the module alias uses */
+	.proto = DSA_TAG_PROTO_RTL8_4,
+	.xmit = rtl8_4_tag_xmit,
+	.rcv = rtl8_4_tag_rcv,
+	.needed_headroom = RTL8_4_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(rtl8_4_netdev_ops);
+
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL8_4, RTL8_4_NAME);
+
+/* Tail version: the tag is appended after the payload */
+static const struct dsa_device_ops rtl8_4t_netdev_ops = {
+	.name = RTL8_4T_NAME,	/* same macro the module alias uses */
+	.proto = DSA_TAG_PROTO_RTL8_4T,
+	.xmit = rtl8_4t_tag_xmit,
+	.rcv = rtl8_4t_tag_rcv,
+	.needed_tailroom = RTL8_4_TAG_LEN,
+};
+
+DSA_TAG_DRIVER(rtl8_4t_netdev_ops);
+
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RTL8_4T, RTL8_4T_NAME);
+
+static struct dsa_tag_driver *dsa_tag_drivers[] = {
+ &DSA_TAG_DRIVER_NAME(rtl8_4_netdev_ops), /* EtherType-position tag */
+ &DSA_TAG_DRIVER_NAME(rtl8_4t_netdev_ops), /* trailing tag */
+};
+module_dsa_tag_drivers(dsa_tag_drivers);
+
+MODULE_DESCRIPTION("DSA tag driver for Realtek 8 byte protocol 4 tags");
+MODULE_LICENSE("GPL");
diff --git a/net/dsa/tag_rzn1_a5psw.c b/net/dsa/tag_rzn1_a5psw.c
new file mode 100644
index 000000000000..10994b3470f6
--- /dev/null
+++ b/net/dsa/tag_rzn1_a5psw.c
@@ -0,0 +1,115 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2022 Schneider Electric
+ *
+ * Clément Léger <clement.leger@bootlin.com>
+ */
+
+#include <linux/bitfield.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <net/dsa.h>
+
+#include "tag.h"
+
+/* To define the outgoing port and to discover the incoming port a TAG is
+ * inserted after Src MAC :
+ *
+ * Dest MAC Src MAC TAG Type
+ * ...| 1 2 3 4 5 6 | 1 2 3 4 5 6 | 1 2 3 4 5 6 7 8 | 1 2 |...
+ * |<--------------->|
+ *
+ * See struct a5psw_tag for layout
+ */
+
+#define A5PSW_NAME "a5psw"
+
+#define ETH_P_DSA_A5PSW 0xE001
+#define A5PSW_TAG_LEN 8
+#define A5PSW_CTRL_DATA_FORCE_FORWARD BIT(0)
+/* This is used for both xmit and rcv tagging */
+#define A5PSW_CTRL_DATA_PORT GENMASK(3, 0)
+
+struct a5psw_tag {
+ __be16 ctrl_tag;
+ __be16 ctrl_data;
+ __be16 ctrl_data2_hi;
+ __be16 ctrl_data2_lo;
+};
+
+static struct sk_buff *a5psw_tag_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct a5psw_tag *ptag;
+ u32 data2_val;
+
+ BUILD_BUG_ON(sizeof(*ptag) != A5PSW_TAG_LEN);
+
+ /* The Ethernet switch we are interfaced with needs packets to be at
+ * least 60 bytes otherwise they will be discarded when they enter the
+ * switch port logic.
+ */
+ if (__skb_put_padto(skb, ETH_ZLEN, false))
+ return NULL;
+
+ /* provide 'A5PSW_TAG_LEN' bytes additional space */
+ skb_push(skb, A5PSW_TAG_LEN);
+
+ /* make room between MACs and Ether-Type to insert tag */
+ dsa_alloc_etype_header(skb, A5PSW_TAG_LEN);
+
+ ptag = dsa_etype_header_pos_tx(skb);
+
+ data2_val = FIELD_PREP(A5PSW_CTRL_DATA_PORT, dsa_xmit_port_mask(skb, dev));
+ ptag->ctrl_tag = htons(ETH_P_DSA_A5PSW);
+ ptag->ctrl_data = htons(A5PSW_CTRL_DATA_FORCE_FORWARD);
+ ptag->ctrl_data2_lo = htons(data2_val);
+ ptag->ctrl_data2_hi = 0;
+
+ return skb;
+}
+
+static struct sk_buff *a5psw_tag_rcv(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct a5psw_tag *tag;
+ int port;
+
+ if (unlikely(!pskb_may_pull(skb, A5PSW_TAG_LEN))) {
+ dev_warn_ratelimited(&dev->dev,
+ "Dropping packet, cannot pull\n");
+ return NULL;
+ }
+
+ tag = dsa_etype_header_pos_rx(skb);
+
+ if (tag->ctrl_tag != htons(ETH_P_DSA_A5PSW)) {
+ dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid TAG marker\n");
+ return NULL;
+ }
+
+ port = FIELD_GET(A5PSW_CTRL_DATA_PORT, ntohs(tag->ctrl_data));
+
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
+ if (!skb->dev)
+ return NULL;
+
+ skb_pull_rcsum(skb, A5PSW_TAG_LEN);
+ dsa_strip_etype_header(skb, A5PSW_TAG_LEN);
+
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+static const struct dsa_device_ops a5psw_netdev_ops = {
+ .name = A5PSW_NAME,
+ .proto = DSA_TAG_PROTO_RZN1_A5PSW,
+ .xmit = a5psw_tag_xmit,
+ .rcv = a5psw_tag_rcv,
+ .needed_headroom = A5PSW_TAG_LEN,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for Renesas RZ/N1 A5PSW switch");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_RZN1_A5PSW, A5PSW_NAME);
+module_dsa_tag_driver(a5psw_netdev_ops);
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
new file mode 100644
index 000000000000..02adec693811
--- /dev/null
+++ b/net/dsa/tag_sja1105.c
@@ -0,0 +1,762 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019, Vladimir Oltean <olteanv@gmail.com>
+ */
+#include <linux/if_vlan.h>
+#include <linux/dsa/sja1105.h>
+#include <linux/dsa/8021q.h>
+#include <linux/packing.h>
+
+#include "tag.h"
+#include "tag_8021q.h"
+
+#define SJA1105_NAME "sja1105"
+#define SJA1110_NAME "sja1110"
+
+/* Is this a TX or an RX header? */
+#define SJA1110_HEADER_HOST_TO_SWITCH BIT(15)
+
+/* RX header */
+#define SJA1110_RX_HEADER_IS_METADATA BIT(14)
+#define SJA1110_RX_HEADER_HOST_ONLY BIT(13)
+#define SJA1110_RX_HEADER_HAS_TRAILER BIT(12)
+
+/* Trap-to-host format (no trailer present) */
+#define SJA1110_RX_HEADER_SRC_PORT(x) (((x) & GENMASK(7, 4)) >> 4)
+#define SJA1110_RX_HEADER_SWITCH_ID(x) ((x) & GENMASK(3, 0))
+
+/* Timestamp format (trailer present) */
+#define SJA1110_RX_HEADER_TRAILER_POS(x) ((x) & GENMASK(11, 0))
+
+#define SJA1110_RX_TRAILER_SWITCH_ID(x) (((x) & GENMASK(7, 4)) >> 4)
+#define SJA1110_RX_TRAILER_SRC_PORT(x) ((x) & GENMASK(3, 0))
+
+/* Meta frame format (for 2-step TX timestamps) */
+#define SJA1110_RX_HEADER_N_TS(x) (((x) & GENMASK(8, 4)) >> 4)
+
+/* TX header */
+#define SJA1110_TX_HEADER_UPDATE_TC BIT(14)
+#define SJA1110_TX_HEADER_TAKE_TS BIT(13)
+#define SJA1110_TX_HEADER_TAKE_TS_CASC BIT(12)
+#define SJA1110_TX_HEADER_HAS_TRAILER BIT(11)
+
+/* Only valid if SJA1110_TX_HEADER_HAS_TRAILER is false */
+#define SJA1110_TX_HEADER_PRIO(x) (((x) << 7) & GENMASK(10, 7))
+#define SJA1110_TX_HEADER_TSTAMP_ID(x) ((x) & GENMASK(7, 0))
+
+/* Only valid if SJA1110_TX_HEADER_HAS_TRAILER is true */
+#define SJA1110_TX_HEADER_TRAILER_POS(x) ((x) & GENMASK(10, 0))
+
+#define SJA1110_TX_TRAILER_TSTAMP_ID(x) (((x) << 24) & GENMASK(31, 24))
+#define SJA1110_TX_TRAILER_PRIO(x) (((x) << 21) & GENMASK(23, 21))
+#define SJA1110_TX_TRAILER_SWITCHID(x) (((x) << 12) & GENMASK(15, 12))
+#define SJA1110_TX_TRAILER_DESTPORTS(x) (((x) << 1) & GENMASK(11, 1))
+
+#define SJA1110_META_TSTAMP_SIZE 10
+
+#define SJA1110_HEADER_LEN 4
+#define SJA1110_RX_TRAILER_LEN 13
+#define SJA1110_TX_TRAILER_LEN 4
+#define SJA1110_MAX_PADDING_LEN 15
+
+struct sja1105_tagger_private {
+ struct sja1105_tagger_data data; /* Must be first */
+ /* Protects concurrent access to the meta state machine
+ * from taggers running on multiple ports on SMP systems
+ */
+ spinlock_t meta_lock;
+ struct sk_buff *stampable_skb;
+ struct kthread_worker *xmit_worker;
+};
+
+static struct sja1105_tagger_private *
+sja1105_tagger_private(struct dsa_switch *ds)
+{
+ return ds->tagger_data;
+}
+
+/* Similar to is_link_local_ether_addr(hdr->h_dest) but also covers PTP */
+static bool sja1105_is_link_local(const struct sk_buff *skb)
+{
+ const struct ethhdr *hdr = eth_hdr(skb);
+ u64 dmac = ether_addr_to_u64(hdr->h_dest);
+
+ if (ntohs(hdr->h_proto) == ETH_P_SJA1105_META)
+ return false;
+ if ((dmac & SJA1105_LINKLOCAL_FILTER_A_MASK) ==
+ SJA1105_LINKLOCAL_FILTER_A)
+ return true;
+ if ((dmac & SJA1105_LINKLOCAL_FILTER_B_MASK) ==
+ SJA1105_LINKLOCAL_FILTER_B)
+ return true;
+ return false;
+}
+
+struct sja1105_meta {
+ u64 tstamp;
+ u64 dmac_byte_4;
+ u64 dmac_byte_3;
+ u64 source_port;
+ u64 switch_id;
+};
+
+static void sja1105_meta_unpack(const struct sk_buff *skb,
+ struct sja1105_meta *meta)
+{
+ u8 *buf = skb_mac_header(skb) + ETH_HLEN;
+
+ /* UM10944.pdf section 4.2.17 AVB Parameters:
+ * Structure of the meta-data follow-up frame.
+ * It is in network byte order, so there are no quirks
+ * while unpacking the meta frame.
+ *
+ * Also SJA1105 E/T only populates bits 23:0 of the timestamp
+ * whereas P/Q/R/S does 32 bits. Since the structure is the
+ * same and the E/T puts zeroes in the high-order byte, use
+ * a unified unpacking command for both device series.
+ */
+ packing(buf, &meta->tstamp, 31, 0, 4, UNPACK, 0);
+ packing(buf + 4, &meta->dmac_byte_3, 7, 0, 1, UNPACK, 0);
+ packing(buf + 5, &meta->dmac_byte_4, 7, 0, 1, UNPACK, 0);
+ packing(buf + 6, &meta->source_port, 7, 0, 1, UNPACK, 0);
+ packing(buf + 7, &meta->switch_id, 7, 0, 1, UNPACK, 0);
+}
+
+static bool sja1105_is_meta_frame(const struct sk_buff *skb)
+{
+ const struct ethhdr *hdr = eth_hdr(skb);
+ u64 smac = ether_addr_to_u64(hdr->h_source);
+ u64 dmac = ether_addr_to_u64(hdr->h_dest);
+
+ if (smac != SJA1105_META_SMAC)
+ return false;
+ if (dmac != SJA1105_META_DMAC)
+ return false;
+ if (ntohs(hdr->h_proto) != ETH_P_SJA1105_META)
+ return false;
+ return true;
+}
+
+/* Calls sja1105_port_deferred_xmit in sja1105_main.c */
+static struct sk_buff *sja1105_defer_xmit(struct dsa_port *dp,
+ struct sk_buff *skb)
+{
+ struct sja1105_tagger_data *tagger_data = sja1105_tagger_data(dp->ds);
+ struct sja1105_tagger_private *priv = sja1105_tagger_private(dp->ds);
+ void (*xmit_work_fn)(struct kthread_work *work);
+ struct sja1105_deferred_xmit_work *xmit_work;
+ struct kthread_worker *xmit_worker;
+
+ xmit_work_fn = tagger_data->xmit_work_fn;
+ xmit_worker = priv->xmit_worker;
+
+ if (!xmit_work_fn || !xmit_worker)
+ return NULL;
+
+ xmit_work = kzalloc(sizeof(*xmit_work), GFP_ATOMIC);
+ if (!xmit_work)
+ return NULL;
+
+ kthread_init_work(&xmit_work->work, xmit_work_fn);
+ /* Increase refcount so the kfree_skb in dsa_user_xmit
+ * won't really free the packet.
+ */
+ xmit_work->dp = dp;
+ xmit_work->skb = skb_get(skb);
+
+ kthread_queue_work(xmit_worker, &xmit_work->work);
+
+ return NULL;
+}
+
+/* Send VLAN tags with a TPID that blends in with whatever VLAN protocol a
+ * bridge spanning ports of this switch might have.
+ */
+static u16 sja1105_xmit_tpid(struct dsa_port *dp)
+{
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_port *other_dp;
+ u16 proto;
+
+ /* Since VLAN awareness is global, then if this port is VLAN-unaware,
+ * all ports are. Use the VLAN-unaware TPID used for tag_8021q.
+ */
+ if (!dsa_port_is_vlan_filtering(dp))
+ return ETH_P_SJA1105;
+
+ /* Port is VLAN-aware, so there is a bridge somewhere (a single one,
+ * we're sure about that). It may not be on this port though, so we
+ * need to find it.
+ */
+ dsa_switch_for_each_port(other_dp, ds) {
+ struct net_device *br = dsa_port_bridge_dev_get(other_dp);
+
+ if (!br)
+ continue;
+
+ /* Error is returned only if CONFIG_BRIDGE_VLAN_FILTERING is
+ * disabled, which seems pointless to handle, as our port cannot
+ * become VLAN-aware in that case.
+ */
+ br_vlan_get_proto(br, &proto);
+
+ return proto;
+ }
+
+ WARN_ONCE(1, "Port is VLAN-aware but cannot find associated bridge!\n");
+
+ return ETH_P_SJA1105;
+}
+
+static struct sk_buff *sja1105_imprecise_xmit(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ struct dsa_port *dp = dsa_user_to_port(netdev);
+ unsigned int bridge_num = dsa_port_bridge_num_get(dp);
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+ u16 tx_vid;
+
+ /* If the port is under a VLAN-aware bridge, just slide the
+ * VLAN-tagged packet into the FDB and hope for the best.
+ * This works because we support a single VLAN-aware bridge
+ * across the entire dst, and its VLANs cannot be shared with
+ * any standalone port.
+ */
+ if (br_vlan_enabled(br))
+ return skb;
+
+ /* If the port is under a VLAN-unaware bridge, use an imprecise
+ * TX VLAN that targets the bridge's entire broadcast domain,
+ * instead of just the specific port.
+ */
+ tx_vid = dsa_tag_8021q_bridge_vid(bridge_num);
+
+ return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp), tx_vid);
+}
+
+/* Transform untagged control packets into pvid-tagged control packets so that
+ * all packets sent by this tagger are VLAN-tagged and we can configure the
+ * switch to drop untagged packets coming from the DSA conduit.
+ */
+static struct sk_buff *sja1105_pvid_tag_control_pkt(struct dsa_port *dp,
+ struct sk_buff *skb, u8 pcp)
+{
+ __be16 xmit_tpid = htons(sja1105_xmit_tpid(dp));
+ struct vlan_ethhdr *hdr;
+
+ /* If VLAN tag is in hwaccel area, move it to the payload
+ * to deal with both cases uniformly and to ensure that
+ * the VLANs are added in the right order.
+ */
+ if (unlikely(skb_vlan_tag_present(skb))) {
+ skb = __vlan_hwaccel_push_inside(skb);
+ if (!skb)
+ return NULL;
+ }
+
+ hdr = skb_vlan_eth_hdr(skb);
+
+ /* If skb is already VLAN-tagged, leave that VLAN ID in place */
+ if (hdr->h_vlan_proto == xmit_tpid)
+ return skb;
+
+ return vlan_insert_tag(skb, xmit_tpid, (pcp << VLAN_PRIO_SHIFT) |
+ SJA1105_DEFAULT_VLAN);
+}
+
+static struct sk_buff *sja1105_xmit(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ struct dsa_port *dp = dsa_user_to_port(netdev);
+ u16 queue_mapping = skb_get_queue_mapping(skb);
+ u8 pcp = netdev_txq_to_tc(netdev, queue_mapping);
+ u16 tx_vid = dsa_tag_8021q_standalone_vid(dp);
+
+ if (skb->offload_fwd_mark)
+ return sja1105_imprecise_xmit(skb, netdev);
+
+ /* Transmitting management traffic does not rely upon switch tagging,
+ * but instead SPI-installed management routes. Part 2 of this
+ * is the .port_deferred_xmit driver callback.
+ */
+ if (unlikely(sja1105_is_link_local(skb))) {
+ skb = sja1105_pvid_tag_control_pkt(dp, skb, pcp);
+ if (!skb)
+ return NULL;
+
+ return sja1105_defer_xmit(dp, skb);
+ }
+
+ return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp),
+ ((pcp << VLAN_PRIO_SHIFT) | tx_vid));
+}
+
+static struct sk_buff *sja1110_xmit(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ struct sk_buff *clone = SJA1105_SKB_CB(skb)->clone;
+ struct dsa_port *dp = dsa_user_to_port(netdev);
+ u16 queue_mapping = skb_get_queue_mapping(skb);
+ u8 pcp = netdev_txq_to_tc(netdev, queue_mapping);
+ u16 tx_vid = dsa_tag_8021q_standalone_vid(dp);
+ __be32 *tx_trailer;
+ __be16 *tx_header;
+ int trailer_pos;
+
+ if (skb->offload_fwd_mark)
+ return sja1105_imprecise_xmit(skb, netdev);
+
+ /* Transmitting control packets is done using in-band control
+ * extensions, while data packets are transmitted using
+ * tag_8021q TX VLANs.
+ */
+ if (likely(!sja1105_is_link_local(skb)))
+ return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp),
+ ((pcp << VLAN_PRIO_SHIFT) | tx_vid));
+
+ skb = sja1105_pvid_tag_control_pkt(dp, skb, pcp);
+ if (!skb)
+ return NULL;
+
+ skb_push(skb, SJA1110_HEADER_LEN);
+
+ dsa_alloc_etype_header(skb, SJA1110_HEADER_LEN);
+
+ trailer_pos = skb->len;
+
+ tx_header = dsa_etype_header_pos_tx(skb);
+ tx_trailer = skb_put(skb, SJA1110_TX_TRAILER_LEN);
+
+ tx_header[0] = htons(ETH_P_SJA1110);
+ tx_header[1] = htons(SJA1110_HEADER_HOST_TO_SWITCH |
+ SJA1110_TX_HEADER_HAS_TRAILER |
+ SJA1110_TX_HEADER_TRAILER_POS(trailer_pos));
+ *tx_trailer = cpu_to_be32(SJA1110_TX_TRAILER_PRIO(pcp) |
+ SJA1110_TX_TRAILER_SWITCHID(dp->ds->index) |
+ SJA1110_TX_TRAILER_DESTPORTS(BIT(dp->index)));
+ if (clone) {
+ u8 ts_id = SJA1105_SKB_CB(clone)->ts_id;
+
+ tx_header[1] |= htons(SJA1110_TX_HEADER_TAKE_TS);
+ *tx_trailer |= cpu_to_be32(SJA1110_TX_TRAILER_TSTAMP_ID(ts_id));
+ }
+
+ return skb;
+}
+
+static void sja1105_transfer_meta(struct sk_buff *skb,
+ const struct sja1105_meta *meta)
+{
+ struct ethhdr *hdr = eth_hdr(skb);
+
+ hdr->h_dest[3] = meta->dmac_byte_3;
+ hdr->h_dest[4] = meta->dmac_byte_4;
+ SJA1105_SKB_CB(skb)->tstamp = meta->tstamp;
+}
+
+/* This is a simple state machine which follows the hardware mechanism of
+ * generating RX timestamps:
+ *
+ * After each timestampable skb (all traffic for which send_meta1 and
+ * send_meta0 is true, aka all MAC-filtered link-local traffic) a meta frame
+ * containing a partial timestamp is immediately generated by the switch and
+ * sent as a follow-up to the link-local frame on the CPU port.
+ *
+ * The meta frames have no unique identifier (such as sequence number) by which
+ * one may pair them to the correct timestampable frame.
+ * Instead, the switch has internal logic that ensures no frames are sent on
+ * the CPU port between a link-local timestampable frame and its corresponding
+ * meta follow-up. It also ensures strict ordering between ports (lower ports
+ * have higher priority towards the CPU port). For this reason, a per-port
+ * data structure is not needed/desirable.
+ *
+ * This function pairs the link-local frame with its partial timestamp from the
+ * meta follow-up frame. The full timestamp will be reconstructed later in a
+ * work queue.
+ */
+static struct sk_buff
+*sja1105_rcv_meta_state_machine(struct sk_buff *skb,
+ struct sja1105_meta *meta,
+ bool is_link_local,
+ bool is_meta)
+{
+ /* Step 1: A timestampable frame was received.
+ * Buffer it until we get its meta frame.
+ */
+ if (is_link_local) {
+ struct dsa_port *dp = dsa_user_to_port(skb->dev);
+ struct sja1105_tagger_private *priv;
+ struct dsa_switch *ds = dp->ds;
+
+ priv = sja1105_tagger_private(ds);
+
+ spin_lock(&priv->meta_lock);
+ /* Was this a link-local frame instead of the meta
+ * that we were expecting?
+ */
+ if (priv->stampable_skb) {
+ dev_err_ratelimited(ds->dev,
+ "Expected meta frame, is %12llx "
+ "in the DSA conduit multicast filter?\n",
+ SJA1105_META_DMAC);
+ kfree_skb(priv->stampable_skb);
+ }
+
+ /* Hold a reference to prevent dsa_switch_rcv
+ * from freeing the skb.
+ */
+ priv->stampable_skb = skb_get(skb);
+ spin_unlock(&priv->meta_lock);
+
+ /* Tell DSA we got nothing */
+ return NULL;
+
+ /* Step 2: The meta frame arrived.
+ * Time to take the stampable skb out of the closet, annotate it
+ * with the partial timestamp, and pretend that we received it
+ * just now (basically masquerade the buffered frame as the meta
+ * frame, which serves no further purpose).
+ */
+ } else if (is_meta) {
+ struct dsa_port *dp = dsa_user_to_port(skb->dev);
+ struct sja1105_tagger_private *priv;
+ struct dsa_switch *ds = dp->ds;
+ struct sk_buff *stampable_skb;
+
+ priv = sja1105_tagger_private(ds);
+
+ spin_lock(&priv->meta_lock);
+
+ stampable_skb = priv->stampable_skb;
+ priv->stampable_skb = NULL;
+
+ /* Was this a meta frame instead of the link-local
+ * that we were expecting?
+ */
+ if (!stampable_skb) {
+ dev_err_ratelimited(ds->dev, "Unexpected meta frame\n");
+ spin_unlock(&priv->meta_lock);
+ return NULL;
+ }
+
+ if (stampable_skb->dev != skb->dev) {
+ dev_err_ratelimited(ds->dev, "Meta frame on wrong port\n");
+ spin_unlock(&priv->meta_lock);
+ /* Drop the reference taken in step 1, nobody else will */
+ kfree_skb(stampable_skb);
+ return NULL;
+ }
+
+ /* Free the meta frame and give DSA the buffered stampable_skb
+ * for further processing up the network stack.
+ */
+ kfree_skb(skb);
+ skb = stampable_skb;
+ sja1105_transfer_meta(skb, meta);
+
+ spin_unlock(&priv->meta_lock);
+ }
+
+ return skb;
+}
+
+static bool sja1105_skb_has_tag_8021q(const struct sk_buff *skb)
+{
+ u16 tpid = ntohs(eth_hdr(skb)->h_proto);
+
+ return tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q ||
+ skb_vlan_tag_present(skb);
+}
+
+static bool sja1110_skb_has_inband_control_extension(const struct sk_buff *skb)
+{
+ return ntohs(eth_hdr(skb)->h_proto) == ETH_P_SJA1110;
+}
+
+static struct sk_buff *sja1105_rcv(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ int source_port = -1, switch_id = -1, vbid = -1, vid = -1;
+ struct sja1105_meta meta = {0};
+ struct ethhdr *hdr;
+ bool is_link_local;
+ bool is_meta;
+
+ hdr = eth_hdr(skb);
+ is_link_local = sja1105_is_link_local(skb);
+ is_meta = sja1105_is_meta_frame(skb);
+
+ if (is_link_local) {
+ /* Management traffic path. Switch embeds the switch ID and
+ * port ID into bytes of the destination MAC, courtesy of
+ * the incl_srcpt options.
+ */
+ source_port = hdr->h_dest[3];
+ switch_id = hdr->h_dest[4];
+ } else if (is_meta) {
+ sja1105_meta_unpack(skb, &meta);
+ source_port = meta.source_port;
+ switch_id = meta.switch_id;
+ }
+
+ /* Normal data plane traffic and link-local frames are tagged with
+ * a tag_8021q VLAN which we have to strip
+ */
+ if (sja1105_skb_has_tag_8021q(skb))
+ dsa_8021q_rcv(skb, &source_port, &switch_id, &vbid, &vid);
+ else if (source_port == -1 && switch_id == -1)
+ /* Packets with no source information have no chance of
+ * getting accepted, drop them straight away.
+ */
+ return NULL;
+
+ skb->dev = dsa_tag_8021q_find_user(netdev, source_port, switch_id,
+ vid, vbid);
+ if (!skb->dev) {
+ netdev_warn(netdev, "Couldn't decode source port\n");
+ return NULL;
+ }
+
+ if (!is_link_local)
+ dsa_default_offload_fwd_mark(skb);
+
+ return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local,
+ is_meta);
+}
+
+static struct sk_buff *sja1110_rcv_meta(struct sk_buff *skb, u16 rx_header)
+{
+ u8 *buf = dsa_etype_header_pos_rx(skb) + SJA1110_HEADER_LEN;
+ int switch_id = SJA1110_RX_HEADER_SWITCH_ID(rx_header);
+ int n_ts = SJA1110_RX_HEADER_N_TS(rx_header);
+ struct sja1105_tagger_data *tagger_data;
+ struct net_device *conduit = skb->dev;
+ struct dsa_port *cpu_dp;
+ struct dsa_switch *ds;
+ int i;
+
+ cpu_dp = conduit->dsa_ptr;
+ ds = dsa_switch_find(cpu_dp->dst->index, switch_id);
+ if (!ds) {
+ net_err_ratelimited("%s: cannot find switch id %d\n",
+ conduit->name, switch_id);
+ return NULL;
+ }
+
+ tagger_data = sja1105_tagger_data(ds);
+ if (!tagger_data->meta_tstamp_handler)
+ return NULL;
+
+ for (i = 0; i <= n_ts; i++) {
+ u8 ts_id, source_port, dir;
+ u64 tstamp;
+
+ ts_id = buf[0];
+ source_port = (buf[1] & GENMASK(7, 4)) >> 4;
+ dir = (buf[1] & BIT(3)) >> 3;
+ tstamp = be64_to_cpu(*(__be64 *)(buf + 2));
+
+ tagger_data->meta_tstamp_handler(ds, source_port, ts_id, dir,
+ tstamp);
+
+ buf += SJA1110_META_TSTAMP_SIZE;
+ }
+
+ /* Discard the meta frame, we've consumed the timestamps it contained */
+ return NULL;
+}
+
+static struct sk_buff *sja1110_rcv_inband_control_extension(struct sk_buff *skb,
+ int *source_port,
+ int *switch_id,
+ bool *host_only)
+{
+ u16 rx_header;
+
+ if (unlikely(!pskb_may_pull(skb, SJA1110_HEADER_LEN)))
+ return NULL;
+
+ /* skb->data points to skb_mac_header(skb) + ETH_HLEN, which is exactly
+ * what we need because the caller has checked the EtherType (which is
+ * located 2 bytes back) and we just need a pointer to the header that
+ * comes afterwards.
+ */
+ rx_header = ntohs(*(__be16 *)skb->data);
+
+ if (rx_header & SJA1110_RX_HEADER_HOST_ONLY)
+ *host_only = true;
+
+ if (rx_header & SJA1110_RX_HEADER_IS_METADATA)
+ return sja1110_rcv_meta(skb, rx_header);
+
+ /* Timestamp frame, we have a trailer */
+ if (rx_header & SJA1110_RX_HEADER_HAS_TRAILER) {
+ int start_of_padding = SJA1110_RX_HEADER_TRAILER_POS(rx_header);
+ u8 *rx_trailer = skb_tail_pointer(skb) - SJA1110_RX_TRAILER_LEN;
+ u64 *tstamp = &SJA1105_SKB_CB(skb)->tstamp;
+ u8 last_byte = rx_trailer[12];
+
+ /* The timestamp is unaligned, so we need to use packing()
+ * to get it
+ */
+ packing(rx_trailer, tstamp, 63, 0, 8, UNPACK, 0);
+
+ *source_port = SJA1110_RX_TRAILER_SRC_PORT(last_byte);
+ *switch_id = SJA1110_RX_TRAILER_SWITCH_ID(last_byte);
+
+ /* skb->len counts from skb->data, while start_of_padding
+ * counts from the destination MAC address. Right now skb->data
+ * is still as set by the DSA conduit, so to trim away the
+ * padding and trailer we need to account for the fact that
+ * skb->data points to skb_mac_header(skb) + ETH_HLEN.
+ */
+ if (pskb_trim_rcsum(skb, start_of_padding - ETH_HLEN))
+ return NULL;
+ /* Trap-to-host frame, no timestamp trailer */
+ } else {
+ *source_port = SJA1110_RX_HEADER_SRC_PORT(rx_header);
+ *switch_id = SJA1110_RX_HEADER_SWITCH_ID(rx_header);
+ }
+
+ /* Advance skb->data past the DSA header */
+ skb_pull_rcsum(skb, SJA1110_HEADER_LEN);
+
+ dsa_strip_etype_header(skb, SJA1110_HEADER_LEN);
+
+ /* With skb->data in its final place, update the MAC header
+ * so that eth_hdr() continues to work properly.
+ */
+ skb_set_mac_header(skb, -ETH_HLEN);
+
+ return skb;
+}
+
+static struct sk_buff *sja1110_rcv(struct sk_buff *skb,
+ struct net_device *netdev)
+{
+ int source_port = -1, switch_id = -1, vbid = -1, vid = -1;
+ bool host_only = false;
+
+ if (sja1110_skb_has_inband_control_extension(skb)) {
+ skb = sja1110_rcv_inband_control_extension(skb, &source_port,
+ &switch_id,
+ &host_only);
+ if (!skb)
+ return NULL;
+ }
+
+ /* Packets with in-band control extensions might still have RX VLANs */
+ if (likely(sja1105_skb_has_tag_8021q(skb)))
+ dsa_8021q_rcv(skb, &source_port, &switch_id, &vbid, &vid);
+
+ skb->dev = dsa_tag_8021q_find_user(netdev, source_port, switch_id,
+ vid, vbid);
+
+ if (!skb->dev) {
+ netdev_warn(netdev, "Couldn't decode source port\n");
+ return NULL;
+ }
+
+ if (!host_only)
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+static void sja1105_flow_dissect(const struct sk_buff *skb, __be16 *proto,
+ int *offset)
+{
+ /* No tag added for management frames, all ok */
+ if (unlikely(sja1105_is_link_local(skb)))
+ return;
+
+ dsa_tag_generic_flow_dissect(skb, proto, offset);
+}
+
+static void sja1110_flow_dissect(const struct sk_buff *skb, __be16 *proto,
+ int *offset)
+{
+ /* Management frames have 2 DSA tags on RX, so the needed_headroom we
+ * declared is fine for the generic dissector adjustment procedure.
+ */
+ if (unlikely(sja1105_is_link_local(skb)))
+ return dsa_tag_generic_flow_dissect(skb, proto, offset);
+
+ /* For the rest, there is a single DSA tag, the tag_8021q one */
+ *offset = VLAN_HLEN;
+ *proto = ((__be16 *)skb->data)[(VLAN_HLEN / 2) - 1];
+}
+
+static void sja1105_disconnect(struct dsa_switch *ds)
+{
+ struct sja1105_tagger_private *priv = ds->tagger_data;
+
+ kthread_destroy_worker(priv->xmit_worker);
+ kfree(priv);
+ ds->tagger_data = NULL;
+}
+
+static int sja1105_connect(struct dsa_switch *ds)
+{
+ struct sja1105_tagger_private *priv;
+ struct kthread_worker *xmit_worker;
+ int err;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ spin_lock_init(&priv->meta_lock);
+
+ xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit",
+ ds->dst->index, ds->index);
+ if (IS_ERR(xmit_worker)) {
+ err = PTR_ERR(xmit_worker);
+ kfree(priv);
+ return err;
+ }
+
+ priv->xmit_worker = xmit_worker;
+ ds->tagger_data = priv;
+
+ return 0;
+}
+
+static const struct dsa_device_ops sja1105_netdev_ops = {
+ .name = SJA1105_NAME,
+ .proto = DSA_TAG_PROTO_SJA1105,
+ .xmit = sja1105_xmit,
+ .rcv = sja1105_rcv,
+ .connect = sja1105_connect,
+ .disconnect = sja1105_disconnect,
+ .needed_headroom = VLAN_HLEN,
+ .flow_dissect = sja1105_flow_dissect,
+ .promisc_on_conduit = true,
+};
+
+DSA_TAG_DRIVER(sja1105_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1105, SJA1105_NAME);
+
+static const struct dsa_device_ops sja1110_netdev_ops = {
+ .name = SJA1110_NAME,
+ .proto = DSA_TAG_PROTO_SJA1110,
+ .xmit = sja1110_xmit,
+ .rcv = sja1110_rcv,
+ .connect = sja1105_connect,
+ .disconnect = sja1105_disconnect,
+ .flow_dissect = sja1110_flow_dissect,
+ .needed_headroom = SJA1110_HEADER_LEN + VLAN_HLEN,
+ .needed_tailroom = SJA1110_RX_TRAILER_LEN + SJA1110_MAX_PADDING_LEN,
+};
+
+DSA_TAG_DRIVER(sja1110_netdev_ops);
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_SJA1110, SJA1110_NAME);
+
+static struct dsa_tag_driver *sja1105_tag_driver_array[] = {
+ &DSA_TAG_DRIVER_NAME(sja1105_netdev_ops),
+ &DSA_TAG_DRIVER_NAME(sja1110_netdev_ops),
+};
+
+module_dsa_tag_drivers(sja1105_tag_driver_array);
+
+MODULE_DESCRIPTION("DSA tag driver for NXP SJA1105 switches");
+MODULE_LICENSE("GPL v2");
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index 56197f0d9608..4dce24cfe6a7 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -1,62 +1,31 @@
+// SPDX-License-Identifier: GPL-2.0+
/*
* net/dsa/tag_trailer.c - Trailer tag format handling
* Copyright (c) 2008-2009 Marvell Semiconductor
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/etherdevice.h>
#include <linux/list.h>
#include <linux/slab.h>
-#include "dsa_priv.h"
+#include "tag.h"
+
+#define TRAILER_NAME "trailer"
static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev)
{
- struct dsa_port *dp = dsa_slave_to_port(dev);
- struct sk_buff *nskb;
- int padlen;
u8 *trailer;
- /*
- * We have to make sure that the trailer ends up as the very
- * last 4 bytes of the packet. This means that we have to pad
- * the packet to the minimum ethernet frame size, if necessary,
- * before adding the trailer.
- */
- padlen = 0;
- if (skb->len < 60)
- padlen = 60 - skb->len;
-
- nskb = alloc_skb(NET_IP_ALIGN + skb->len + padlen + 4, GFP_ATOMIC);
- if (!nskb)
- return NULL;
- skb_reserve(nskb, NET_IP_ALIGN);
-
- skb_reset_mac_header(nskb);
- skb_set_network_header(nskb, skb_network_header(skb) - skb->head);
- skb_set_transport_header(nskb, skb_transport_header(skb) - skb->head);
- skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len));
- consume_skb(skb);
-
- if (padlen) {
- skb_put_zero(nskb, padlen);
- }
-
- trailer = skb_put(nskb, 4);
+ trailer = skb_put(skb, 4);
trailer[0] = 0x80;
- trailer[1] = 1 << dp->index;
+ trailer[1] = dsa_xmit_port_mask(skb, dev);
trailer[2] = 0x10;
trailer[3] = 0x00;
- return nskb;
+ return skb;
}
-static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt)
+static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev)
{
u8 *trailer;
int source_port;
@@ -71,7 +40,7 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev,
source_port = trailer[1] & 7;
- skb->dev = dsa_master_find_slave(dev, 0, source_port);
+ skb->dev = dsa_conduit_find_user(dev, 0, source_port);
if (!skb->dev)
return NULL;
@@ -81,7 +50,16 @@ static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev,
return skb;
}
-const struct dsa_device_ops trailer_netdev_ops = {
+static const struct dsa_device_ops trailer_netdev_ops = {
+ .name = TRAILER_NAME,
+ .proto = DSA_TAG_PROTO_TRAILER,
.xmit = trailer_xmit,
.rcv = trailer_rcv,
+ .needed_tailroom = 4,
};
+
+MODULE_DESCRIPTION("DSA tag driver for switches using a trailer tag");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_TRAILER, TRAILER_NAME);
+
+module_dsa_tag_driver(trailer_netdev_ops);
diff --git a/net/dsa/tag_vsc73xx_8021q.c b/net/dsa/tag_vsc73xx_8021q.c
new file mode 100644
index 000000000000..af121a9aff7f
--- /dev/null
+++ b/net/dsa/tag_vsc73xx_8021q.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
+/* Copyright (C) 2024 Pawel Dembicki <paweldembicki@gmail.com>
+ */
+#include <linux/dsa/8021q.h>
+
+#include "tag.h"
+#include "tag_8021q.h"
+
+#define VSC73XX_8021Q_NAME "vsc73xx-8021q"
+
+static struct sk_buff *
+vsc73xx_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+ struct dsa_port *dp = dsa_user_to_port(netdev);
+ u16 queue_mapping = skb_get_queue_mapping(skb);
+ u16 tx_vid = dsa_tag_8021q_standalone_vid(dp);
+ u8 pcp;
+
+ if (skb->offload_fwd_mark) {
+ unsigned int bridge_num = dsa_port_bridge_num_get(dp);
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+
+ if (br_vlan_enabled(br))
+ return skb;
+
+ tx_vid = dsa_tag_8021q_bridge_vid(bridge_num);
+ }
+
+ pcp = netdev_txq_to_tc(netdev, queue_mapping);
+
+ return dsa_8021q_xmit(skb, netdev, ETH_P_8021Q,
+ ((pcp << VLAN_PRIO_SHIFT) | tx_vid));
+}
+
+static struct sk_buff *
+vsc73xx_rcv(struct sk_buff *skb, struct net_device *netdev)
+{
+ int src_port = -1, switch_id = -1, vbid = -1, vid = -1;
+
+ dsa_8021q_rcv(skb, &src_port, &switch_id, &vbid, &vid);
+
+ skb->dev = dsa_tag_8021q_find_user(netdev, src_port, switch_id,
+ vid, vbid);
+ if (!skb->dev) {
+ dev_warn_ratelimited(&netdev->dev,
+ "Couldn't decode source port\n");
+ return NULL;
+ }
+
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+static const struct dsa_device_ops vsc73xx_8021q_netdev_ops = {
+ .name = VSC73XX_8021Q_NAME,
+ .proto = DSA_TAG_PROTO_VSC73XX_8021Q,
+ .xmit = vsc73xx_xmit,
+ .rcv = vsc73xx_rcv,
+ .needed_headroom = VLAN_HLEN,
+ .promisc_on_conduit = true,
+};
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("DSA tag driver for VSC73XX family of switches, using VLAN");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_VSC73XX_8021Q, VSC73XX_8021Q_NAME);
+
+module_dsa_tag_driver(vsc73xx_8021q_netdev_ops);
diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c
new file mode 100644
index 000000000000..a05219f702c6
--- /dev/null
+++ b/net/dsa/tag_xrs700x.c
@@ -0,0 +1,61 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * XRS700x tag format handling
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ * Copyright (c) 2020 NovaTech LLC
+ */
+
+#include <linux/bitops.h>
+
+#include "tag.h"
+
+#define XRS700X_NAME "xrs700x"
+
+static struct sk_buff *xrs700x_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ u8 *trailer;
+
+ trailer = skb_put(skb, 1);
+ trailer[0] = dsa_xmit_port_mask(skb, dev);
+
+ return skb;
+}
+
+static struct sk_buff *xrs700x_rcv(struct sk_buff *skb, struct net_device *dev)
+{
+ int source_port;
+ u8 *trailer;
+
+ trailer = skb_tail_pointer(skb) - 1;
+
+ source_port = ffs((int)trailer[0]) - 1;
+
+ if (source_port < 0)
+ return NULL;
+
+ skb->dev = dsa_conduit_find_user(dev, 0, source_port);
+ if (!skb->dev)
+ return NULL;
+
+ if (pskb_trim_rcsum(skb, skb->len - 1))
+ return NULL;
+
+ /* Frame is forwarded by hardware, don't forward in software. */
+ dsa_default_offload_fwd_mark(skb);
+
+ return skb;
+}
+
+static const struct dsa_device_ops xrs700x_netdev_ops = {
+ .name = XRS700X_NAME,
+ .proto = DSA_TAG_PROTO_XRS700X,
+ .xmit = xrs700x_xmit,
+ .rcv = xrs700x_rcv,
+ .needed_tailroom = 1,
+};
+
+MODULE_DESCRIPTION("DSA tag driver for XRS700x switches");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_XRS700X, XRS700X_NAME);
+
+module_dsa_tag_driver(xrs700x_netdev_ops);
diff --git a/net/dsa/tag_yt921x.c b/net/dsa/tag_yt921x.c
new file mode 100644
index 000000000000..6bbfd42dc5df
--- /dev/null
+++ b/net/dsa/tag_yt921x.c
@@ -0,0 +1,139 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Motorcomm YT921x Switch Extended CPU Port Tagging
+ *
+ * Copyright (c) 2025 David Yang <mmyangfl@gmail.com>
+ *
+ * +----+----+-------+-----+----+---------
+ * | DA | SA | TagET | Tag | ET | Payload ...
+ * +----+----+-------+-----+----+---------
+ * 6 6 2 6 2 N
+ *
+ * Tag Ethertype: CPU_TAG_TPID_TPID (default: ETH_P_YT921X = 0x9988)
+ * * Hardcoded for the moment, but still configurable. Discuss it if there
+ * are conflicts somewhere and/or you want to change it for some reason.
+ * Tag:
+ * 2: VLAN Tag
+ * 2: Rx Port
+ * 15b: Rx Port Valid
+ * 14b-11b: Rx Port
+ * 10b-0b: Cmd?
+ * 2: Tx Port(s)
+ * 15b: Tx Port(s) Valid
+ * 10b-0b: Tx Port(s) Mask
+ */
+
+#include <linux/etherdevice.h>
+
+#include "tag.h"
+
+#define YT921X_TAG_NAME "yt921x"
+
+#define YT921X_TAG_LEN 8 /* TagET (2 bytes) + Tag (6 bytes), see layout above */
+
+#define YT921X_TAG_PORT_EN BIT(15) /* "valid" bit, tested on Rx and set on Tx */
+#define YT921X_TAG_RX_PORT_M GENMASK(14, 11) /* Rx port word: source port */
+#define YT921X_TAG_RX_CMD_M GENMASK(10, 0) /* Rx port word: command ("Cmd?" above) */
+#define YT921X_TAG_RX_CMD(x) FIELD_PREP(YT921X_TAG_RX_CMD_M, (x))
+#define YT921X_TAG_RX_CMD_FORWARDED 0x80 /* rcv sets the offload forward mark */
+#define YT921X_TAG_RX_CMD_UNK_UCAST 0xb2 /* rcv accepts the frame without marking it */
+#define YT921X_TAG_RX_CMD_UNK_MCAST 0xb4 /* rcv accepts the frame without marking it */
+#define YT921X_TAG_TX_PORTS GENMASK(10, 0) /* Tx port word: destination port mask */
+
+static struct sk_buff *
+yt921x_tag_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+ __be16 *hdr;
+ u16 ports;
+
+ /* Make room for the tag; its layout is described in the file header */
+ skb_push(skb, YT921X_TAG_LEN);
+ dsa_alloc_etype_header(skb, YT921X_TAG_LEN);
+ hdr = dsa_etype_header_pos_tx(skb);
+
+ hdr[0] = htons(ETH_P_YT921X);
+ hdr[1] = 0; /* VLAN tag: unrelated when TX */
+ hdr[2] = 0; /* Rx port word: zeroed */
+
+ ports = FIELD_PREP(YT921X_TAG_TX_PORTS, dsa_xmit_port_mask(skb, netdev));
+ ports |= YT921X_TAG_PORT_EN;
+ hdr[3] = htons(ports);
+
+ return skb;
+}
+
+static struct sk_buff *
+yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev)
+{
+ struct device *log_dev = &netdev->dev;
+ unsigned int src_port;
+ __be16 *hdr;
+ u16 rx_word;
+ u16 cmd;
+
+ if (unlikely(!pskb_may_pull(skb, YT921X_TAG_LEN)))
+ return NULL;
+
+ hdr = dsa_etype_header_pos_rx(skb);
+
+ if (unlikely(hdr[0] != htons(ETH_P_YT921X))) {
+ dev_warn_ratelimited(log_dev, "Unexpected EtherType 0x%04x\n",
+ ntohs(hdr[0]));
+ return NULL;
+ }
+
+ /* hdr[2] is the Rx port word: valid bit, source port, command */
+ rx_word = ntohs(hdr[2]);
+ if (unlikely(!(rx_word & YT921X_TAG_PORT_EN))) {
+ dev_warn_ratelimited(log_dev, "Unexpected rx tag 0x%04x\n",
+ rx_word);
+ return NULL;
+ }
+
+ src_port = FIELD_GET(YT921X_TAG_RX_PORT_M, rx_word);
+ skb->dev = dsa_conduit_find_user(netdev, 0, src_port);
+ if (unlikely(!skb->dev)) {
+ dev_warn_ratelimited(log_dev, "Couldn't decode source port %u\n",
+ src_port);
+ return NULL;
+ }
+
+ cmd = FIELD_GET(YT921X_TAG_RX_CMD_M, rx_word);
+ switch (cmd) {
+ case YT921X_TAG_RX_CMD_FORWARDED:
+ /* The switch forwarded this frame itself */
+ dsa_default_offload_fwd_mark(skb);
+ break;
+ case YT921X_TAG_RX_CMD_UNK_UCAST:
+ case YT921X_TAG_RX_CMD_UNK_MCAST:
+ /* NOTE: the hardware reports TRAP (copy to CPU only) and COPY
+ * (forward and copy to CPU) identically. To keep soft switching
+ * possible, the switch driver must NEVER use the COPY action.
+ */
+ break;
+ default:
+ dev_warn_ratelimited(log_dev, "Unexpected rx cmd 0x%02x\n",
+ cmd);
+ break;
+ }
+
+ /* Drop the YT921x tag and fix up the checksum */
+ skb_pull_rcsum(skb, YT921X_TAG_LEN);
+ dsa_strip_etype_header(skb, YT921X_TAG_LEN);
+
+ return skb;
+}
+
+static const struct dsa_device_ops yt921x_netdev_ops = {
+ .name = YT921X_TAG_NAME,
+ .proto = DSA_TAG_PROTO_YT921X,
+ .xmit = yt921x_tag_xmit, /* pushes YT921X_TAG_LEN bytes of tag */
+ .rcv = yt921x_tag_rcv, /* validates the tag, then pulls it */
+ .needed_headroom = YT921X_TAG_LEN, /* matches the skb_push() done in xmit */
+};
+
+MODULE_DESCRIPTION("DSA tag driver for Motorcomm YT921x switches");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_YT921X, YT921X_TAG_NAME);
+
+module_dsa_tag_driver(yt921x_netdev_ops);
diff --git a/net/dsa/trace.c b/net/dsa/trace.c
new file mode 100644
index 000000000000..1b107165d331
--- /dev/null
+++ b/net/dsa/trace.c
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Copyright 2022-2023 NXP
+ */
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+void dsa_db_print(const struct dsa_db *db, char buf[DSA_DB_BUFSIZ])
+{
+ /* @buf is really a pointer here, so bound each write by DSA_DB_BUFSIZ */
+ switch (db->type) {
+ case DSA_DB_PORT:
+ snprintf(buf, DSA_DB_BUFSIZ, "port %s", db->dp->name);
+ return;
+ case DSA_DB_LAG:
+ snprintf(buf, DSA_DB_BUFSIZ, "lag %s id %d", db->lag.dev->name, db->lag.id);
+ return;
+ case DSA_DB_BRIDGE:
+ snprintf(buf, DSA_DB_BUFSIZ, "bridge %s num %d",
+ db->bridge.dev->name, db->bridge.num);
+ return;
+ default:
+ snprintf(buf, DSA_DB_BUFSIZ, "unknown");
+ }
+}
+
+const char *dsa_port_kind(const struct dsa_port *dp)
+{
+ /* Short label for the port type, recorded by the DSA trace events */
+ if (dp->type == DSA_PORT_TYPE_USER)
+ return "user";
+ if (dp->type == DSA_PORT_TYPE_CPU)
+ return "cpu";
+ if (dp->type == DSA_PORT_TYPE_DSA)
+ return "dsa";
+
+ /* Every remaining type is reported as unused */
+ return "unused";
+}
diff --git a/net/dsa/trace.h b/net/dsa/trace.h
new file mode 100644
index 000000000000..83f3e5f78491
--- /dev/null
+++ b/net/dsa/trace.h
@@ -0,0 +1,447 @@
+/* SPDX-License-Identifier: GPL-2.0
+ * Copyright 2022-2023 NXP
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM dsa
+
+#if !defined(_NET_DSA_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _NET_DSA_TRACE_H
+
+#include <net/dsa.h>
+#include <net/switchdev.h>
+#include <linux/etherdevice.h>
+#include <linux/if_bridge.h>
+#include <linux/refcount.h>
+#include <linux/tracepoint.h>
+
+/* Sized for the longest dsa_db_print() format, "bridge %s num %d", with a 3-digit num */
+#define DSA_DB_BUFSIZ (IFNAMSIZ + 16)
+
+void dsa_db_print(const struct dsa_db *db, char buf[DSA_DB_BUFSIZ]); /* writes "port ...", "lag ...", "bridge ..." or "unknown" */
+const char *dsa_port_kind(const struct dsa_port *dp); /* "user", "cpu", "dsa" or "unused" */
+
+DECLARE_EVENT_CLASS(dsa_port_addr_op_hw, /* layout shared by the four *_hw address events below */
+
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr, u16 vid,
+ const struct dsa_db *db, int err),
+
+ TP_ARGS(dp, addr, vid, db, err),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev)) /* device name of the switch */
+ __string(kind, dsa_port_kind(dp)) /* "user", "cpu", "dsa" or "unused" */
+ __field(int, port) /* dp->index */
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ) /* text written by dsa_db_print() */
+ __field(int, err) /* @err as passed in; presumably the hardware op's result */
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ __assign_str(kind);
+ __entry->port = dp->index;
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->err = err;
+ ),
+
+ TP_printk("%s %s port %d addr %pM vid %u db \"%s\" err %d",
+ __get_str(dev), __get_str(kind), __entry->port, __entry->addr,
+ __entry->vid, __entry->db_buf, __entry->err)
+);
+
+/* Add unicast/multicast address to hardware, either on user ports
+ * (where no refcounting is kept), or on shared ports when the entry
+ * is first seen and its refcount is 1.
+ */
+DEFINE_EVENT(dsa_port_addr_op_hw, dsa_fdb_add_hw,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+ TP_ARGS(dp, addr, vid, db, err));
+
+DEFINE_EVENT(dsa_port_addr_op_hw, dsa_mdb_add_hw,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+ TP_ARGS(dp, addr, vid, db, err));
+
+/* Delete unicast/multicast address from hardware, either on user ports or
+ * when the refcount on shared ports reaches 0
+ */
+DEFINE_EVENT(dsa_port_addr_op_hw, dsa_fdb_del_hw,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+ TP_ARGS(dp, addr, vid, db, err));
+
+DEFINE_EVENT(dsa_port_addr_op_hw, dsa_mdb_del_hw,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+ TP_ARGS(dp, addr, vid, db, err));
+
+DECLARE_EVENT_CLASS(dsa_port_addr_op_refcount, /* layout shared by the *_bump and *_drop events below */
+
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr, u16 vid,
+ const struct dsa_db *db, const refcount_t *refcount),
+
+ TP_ARGS(dp, addr, vid, db, refcount),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev)) /* device name of the switch */
+ __string(kind, dsa_port_kind(dp)) /* "user", "cpu", "dsa" or "unused" */
+ __field(int, port) /* dp->index */
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ) /* text written by dsa_db_print() */
+ __field(unsigned int, refcount) /* snapshot taken when the event fires */
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ __assign_str(kind);
+ __entry->port = dp->index;
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->refcount = refcount_read(refcount);
+ ),
+
+ TP_printk("%s %s port %d addr %pM vid %u db \"%s\" refcount %u",
+ __get_str(dev), __get_str(kind), __entry->port, __entry->addr,
+ __entry->vid, __entry->db_buf, __entry->refcount)
+);
+
+/* Bump the refcount of an existing unicast/multicast address on shared ports */
+DEFINE_EVENT(dsa_port_addr_op_refcount, dsa_fdb_add_bump,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db,
+ const refcount_t *refcount),
+ TP_ARGS(dp, addr, vid, db, refcount));
+
+DEFINE_EVENT(dsa_port_addr_op_refcount, dsa_mdb_add_bump,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db,
+ const refcount_t *refcount),
+ TP_ARGS(dp, addr, vid, db, refcount));
+
+/* Drop the refcount of a unicast/multicast address that we still keep on
+ * shared ports
+ */
+DEFINE_EVENT(dsa_port_addr_op_refcount, dsa_fdb_del_drop,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db,
+ const refcount_t *refcount),
+ TP_ARGS(dp, addr, vid, db, refcount));
+
+DEFINE_EVENT(dsa_port_addr_op_refcount, dsa_mdb_del_drop,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db,
+ const refcount_t *refcount),
+ TP_ARGS(dp, addr, vid, db, refcount));
+
+DECLARE_EVENT_CLASS(dsa_port_addr_del_not_found, /* same fields as the other address classes, minus err/refcount */
+
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr, u16 vid,
+ const struct dsa_db *db),
+
+ TP_ARGS(dp, addr, vid, db),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev)) /* device name of the switch */
+ __string(kind, dsa_port_kind(dp)) /* "user", "cpu", "dsa" or "unused" */
+ __field(int, port) /* dp->index */
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ) /* text written by dsa_db_print() */
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ __assign_str(kind);
+ __entry->port = dp->index;
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ ),
+
+ TP_printk("%s %s port %d addr %pM vid %u db \"%s\"",
+ __get_str(dev), __get_str(kind), __entry->port,
+ __entry->addr, __entry->vid, __entry->db_buf)
+);
+
+/* Attempt to delete a unicast/multicast address on shared ports for which
+ * the delete operation was called more times than the addition
+ */
+DEFINE_EVENT(dsa_port_addr_del_not_found, dsa_fdb_del_not_found,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db),
+ TP_ARGS(dp, addr, vid, db));
+
+DEFINE_EVENT(dsa_port_addr_del_not_found, dsa_mdb_del_not_found,
+ TP_PROTO(const struct dsa_port *dp, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db),
+ TP_ARGS(dp, addr, vid, db));
+
+TRACE_EVENT(dsa_lag_fdb_add_hw, /* like dsa_fdb_add_hw, but identified by a LAG netdev instead of a port */
+
+ TP_PROTO(const struct net_device *lag_dev, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+
+ TP_ARGS(lag_dev, addr, vid, db, err),
+
+ TP_STRUCT__entry(
+ __string(dev, lag_dev->name) /* interface name of the LAG */
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ) /* text written by dsa_db_print() */
+ __field(int, err) /* @err as passed in; presumably the hardware op's result */
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->err = err;
+ ),
+
+ TP_printk("%s addr %pM vid %u db \"%s\" err %d",
+ __get_str(dev), __entry->addr, __entry->vid,
+ __entry->db_buf, __entry->err)
+);
+
+TRACE_EVENT(dsa_lag_fdb_add_bump, /* like dsa_fdb_add_bump, but identified by a LAG netdev instead of a port */
+
+ TP_PROTO(const struct net_device *lag_dev, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, const refcount_t *refcount),
+
+ TP_ARGS(lag_dev, addr, vid, db, refcount),
+
+ TP_STRUCT__entry(
+ __string(dev, lag_dev->name) /* interface name of the LAG */
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ) /* text written by dsa_db_print() */
+ __field(unsigned int, refcount) /* snapshot taken when the event fires */
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->refcount = refcount_read(refcount);
+ ),
+
+ TP_printk("%s addr %pM vid %u db \"%s\" refcount %u",
+ __get_str(dev), __entry->addr, __entry->vid,
+ __entry->db_buf, __entry->refcount)
+);
+
+TRACE_EVENT(dsa_lag_fdb_del_hw, /* like dsa_fdb_del_hw, but identified by a LAG netdev instead of a port */
+
+ TP_PROTO(const struct net_device *lag_dev, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, int err),
+
+ TP_ARGS(lag_dev, addr, vid, db, err),
+
+ TP_STRUCT__entry(
+ __string(dev, lag_dev->name) /* interface name of the LAG */
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ) /* text written by dsa_db_print() */
+ __field(int, err) /* @err as passed in; presumably the hardware op's result */
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->err = err;
+ ),
+
+ TP_printk("%s addr %pM vid %u db \"%s\" err %d",
+ __get_str(dev), __entry->addr, __entry->vid,
+ __entry->db_buf, __entry->err)
+);
+
+TRACE_EVENT(dsa_lag_fdb_del_drop, /* like dsa_fdb_del_drop, but identified by a LAG netdev instead of a port */
+
+ TP_PROTO(const struct net_device *lag_dev, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db, const refcount_t *refcount),
+
+ TP_ARGS(lag_dev, addr, vid, db, refcount),
+
+ TP_STRUCT__entry(
+ __string(dev, lag_dev->name) /* interface name of the LAG */
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ) /* text written by dsa_db_print() */
+ __field(unsigned int, refcount) /* snapshot taken when the event fires */
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ __entry->refcount = refcount_read(refcount);
+ ),
+
+ TP_printk("%s addr %pM vid %u db \"%s\" refcount %u",
+ __get_str(dev), __entry->addr, __entry->vid,
+ __entry->db_buf, __entry->refcount)
+);
+
+TRACE_EVENT(dsa_lag_fdb_del_not_found, /* like dsa_fdb_del_not_found, but identified by a LAG netdev */
+
+ TP_PROTO(const struct net_device *lag_dev, const unsigned char *addr,
+ u16 vid, const struct dsa_db *db),
+
+ TP_ARGS(lag_dev, addr, vid, db),
+
+ TP_STRUCT__entry(
+ __string(dev, lag_dev->name) /* interface name of the LAG */
+ __array(unsigned char, addr, ETH_ALEN)
+ __field(u16, vid)
+ __array(char, db_buf, DSA_DB_BUFSIZ) /* text written by dsa_db_print() */
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ ether_addr_copy(__entry->addr, addr);
+ __entry->vid = vid;
+ dsa_db_print(db, __entry->db_buf);
+ ),
+
+ TP_printk("%s addr %pM vid %u db \"%s\"",
+ __get_str(dev), __entry->addr, __entry->vid, __entry->db_buf)
+);
+
+DECLARE_EVENT_CLASS(dsa_vlan_op_hw, /* layout shared by dsa_vlan_add_hw and dsa_vlan_del_hw */
+
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan, int err),
+
+ TP_ARGS(dp, vlan, err),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev)) /* device name of the switch */
+ __string(kind, dsa_port_kind(dp)) /* "user", "cpu", "dsa" or "unused" */
+ __field(int, port) /* dp->index */
+ __field(u16, vid)
+ __field(u16, flags) /* BRIDGE_VLAN_INFO_* bits of the VLAN object */
+ __field(bool, changed)
+ __field(int, err) /* @err as passed in; now also printed below */
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ __assign_str(kind);
+ __entry->port = dp->index;
+ __entry->vid = vlan->vid;
+ __entry->flags = vlan->flags;
+ __entry->changed = vlan->changed;
+ __entry->err = err;
+ ),
+
+ TP_printk("%s %s port %d vid %u%s%s%s err %d",
+ __get_str(dev), __get_str(kind), __entry->port, __entry->vid,
+ __entry->flags & BRIDGE_VLAN_INFO_PVID ? " pvid" : "",
+ __entry->flags & BRIDGE_VLAN_INFO_UNTAGGED ? " untagged" : "",
+ __entry->changed ? " (changed)" : "", __entry->err)
+);
+
+DEFINE_EVENT(dsa_vlan_op_hw, dsa_vlan_add_hw,
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan, int err),
+ TP_ARGS(dp, vlan, err));
+
+DEFINE_EVENT(dsa_vlan_op_hw, dsa_vlan_del_hw,
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan, int err),
+ TP_ARGS(dp, vlan, err));
+
+DECLARE_EVENT_CLASS(dsa_vlan_op_refcount, /* layout shared by dsa_vlan_add_bump and dsa_vlan_del_drop */
+
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ const refcount_t *refcount),
+
+ TP_ARGS(dp, vlan, refcount),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev)) /* device name of the switch */
+ __string(kind, dsa_port_kind(dp)) /* "user", "cpu", "dsa" or "unused" */
+ __field(int, port) /* dp->index */
+ __field(u16, vid)
+ __field(u16, flags) /* BRIDGE_VLAN_INFO_* bits of the VLAN object */
+ __field(bool, changed)
+ __field(unsigned int, refcount) /* snapshot taken when the event fires */
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ __assign_str(kind);
+ __entry->port = dp->index;
+ __entry->vid = vlan->vid;
+ __entry->flags = vlan->flags;
+ __entry->changed = vlan->changed;
+ __entry->refcount = refcount_read(refcount);
+ ),
+
+ TP_printk("%s %s port %d vid %u%s%s%s refcount %u",
+ __get_str(dev), __get_str(kind), __entry->port, __entry->vid,
+ __entry->flags & BRIDGE_VLAN_INFO_PVID ? " pvid" : "",
+ __entry->flags & BRIDGE_VLAN_INFO_UNTAGGED ? " untagged" : "",
+ __entry->changed ? " (changed)" : "", __entry->refcount)
+);
+
+DEFINE_EVENT(dsa_vlan_op_refcount, dsa_vlan_add_bump,
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ const refcount_t *refcount),
+ TP_ARGS(dp, vlan, refcount));
+
+DEFINE_EVENT(dsa_vlan_op_refcount, dsa_vlan_del_drop,
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan,
+ const refcount_t *refcount),
+ TP_ARGS(dp, vlan, refcount));
+
+TRACE_EVENT(dsa_vlan_del_not_found, /* records only the port identity and the VID */
+
+ TP_PROTO(const struct dsa_port *dp,
+ const struct switchdev_obj_port_vlan *vlan),
+
+ TP_ARGS(dp, vlan),
+
+ TP_STRUCT__entry(
+ __string(dev, dev_name(dp->ds->dev)) /* device name of the switch */
+ __string(kind, dsa_port_kind(dp)) /* "user", "cpu", "dsa" or "unused" */
+ __field(int, port) /* dp->index */
+ __field(u16, vid)
+ ),
+
+ TP_fast_assign(
+ __assign_str(dev);
+ __assign_str(kind);
+ __entry->port = dp->index;
+ __entry->vid = vlan->vid;
+ ),
+
+ TP_printk("%s %s port %d vid %u",
+ __get_str(dev), __get_str(kind), __entry->port, __entry->vid)
+);
+
+#endif /* _NET_DSA_TRACE_H */
+
+/* We don't want to use include/trace/events */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/dsa/user.c b/net/dsa/user.c
new file mode 100644
index 000000000000..f59d66f0975d
--- /dev/null
+++ b/net/dsa/user.c
@@ -0,0 +1,3877 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * net/dsa/user.c - user device handling
+ * Copyright (c) 2008-2009 Marvell Semiconductor
+ */
+
+#include <linux/list.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/phy.h>
+#include <linux/phy_fixed.h>
+#include <linux/phylink.h>
+#include <linux/of_net.h>
+#include <linux/of_mdio.h>
+#include <linux/mdio.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/selftests.h>
+#include <net/tc_act/tc_mirred.h>
+#include <linux/if_bridge.h>
+#include <linux/if_hsr.h>
+#include <net/dcbnl.h>
+#include <linux/netpoll.h>
+#include <linux/string.h>
+
+#include "conduit.h"
+#include "dsa.h"
+#include "netlink.h"
+#include "port.h"
+#include "switch.h"
+#include "tag.h"
+#include "user.h"
+
+struct dsa_switchdev_event_work { /* NOTE(review): all users are outside this chunk */
+ struct net_device *dev;
+ struct net_device *orig_dev;
+ struct work_struct work; /* embedded work item */
+ unsigned long event; /* presumably a SWITCHDEV_* event code - confirm */
+ /* Specific for SWITCHDEV_FDB_ADD_TO_DEVICE and
+ * SWITCHDEV_FDB_DEL_TO_DEVICE
+ */
+ unsigned char addr[ETH_ALEN];
+ u16 vid;
+ bool host_addr;
+};
+
+enum dsa_standalone_event { /* request kinds dispatched by dsa_user_standalone_event_work() */
+ DSA_UC_ADD, /* dsa_port_standalone_host_fdb_add() */
+ DSA_UC_DEL, /* dsa_port_standalone_host_fdb_del() */
+ DSA_MC_ADD, /* dsa_port_standalone_host_mdb_add() */
+ DSA_MC_DEL, /* dsa_port_standalone_host_mdb_del() */
+};
+
+struct dsa_standalone_event_work { /* one queued host address request */
+ struct work_struct work; /* runs dsa_user_standalone_event_work() */
+ struct net_device *dev; /* user port the request belongs to */
+ enum dsa_standalone_event event;
+ unsigned char addr[ETH_ALEN]; /* copied in when the work is scheduled */
+ u16 vid;
+};
+
+struct dsa_host_vlan_rx_filtering_ctx { /* @arg handed to dsa_user_host_vlan_rx_filtering() */
+ struct net_device *dev;
+ const unsigned char *addr; /* the caller's pointer; copied only when work is scheduled */
+ enum dsa_standalone_event event;
+};
+
+static bool dsa_switch_supports_uc_filtering(struct dsa_switch *ds)
+{
+ if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del || !ds->fdb_isolation)
+ return false; /* both FDB ops and FDB isolation are mandatory */
+ return !(ds->vlan_filtering_is_global || ds->needs_standalone_vlan_filtering);
+}
+
+static bool dsa_switch_supports_mc_filtering(struct dsa_switch *ds)
+{
+ if (!ds->ops->port_mdb_add || !ds->ops->port_mdb_del || !ds->fdb_isolation)
+ return false; /* both MDB ops and FDB isolation are mandatory */
+ return !(ds->vlan_filtering_is_global || ds->needs_standalone_vlan_filtering);
+}
+
+static void dsa_user_standalone_event_work(struct work_struct *work)
+{
+ struct dsa_standalone_event_work *standalone_work =
+ container_of(work, struct dsa_standalone_event_work, work);
+ const unsigned char *addr = standalone_work->addr;
+ struct net_device *dev = standalone_work->dev;
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct switchdev_obj_port_mdb mdb = {};
+ struct dsa_switch *ds = dp->ds;
+ u16 vid = standalone_work->vid;
+ int err;
+
+ /* Failures are only logged: a work function cannot return them */
+ switch (standalone_work->event) {
+ case DSA_UC_ADD:
+ err = dsa_port_standalone_host_fdb_add(dp, addr, vid);
+ if (err) {
+ dev_err(ds->dev,
+ "port %d failed to add %pM vid %d to fdb: %d\n",
+ dp->index, addr, vid, err);
+ }
+ break;
+
+ case DSA_UC_DEL:
+ err = dsa_port_standalone_host_fdb_del(dp, addr, vid);
+ if (err) {
+ dev_err(ds->dev,
+ "port %d failed to delete %pM vid %d from fdb: %d\n",
+ dp->index, addr, vid, err);
+ }
+
+ break;
+ case DSA_MC_ADD:
+ ether_addr_copy(mdb.addr, addr);
+ mdb.vid = vid;
+
+ err = dsa_port_standalone_host_mdb_add(dp, &mdb);
+ if (err) {
+ dev_err(ds->dev,
+ "port %d failed to add %pM vid %d to mdb: %d\n",
+ dp->index, addr, vid, err);
+ }
+ break;
+ case DSA_MC_DEL:
+ ether_addr_copy(mdb.addr, addr);
+ mdb.vid = vid;
+
+ err = dsa_port_standalone_host_mdb_del(dp, &mdb);
+ if (err) {
+ dev_err(ds->dev,
+ "port %d failed to delete %pM vid %d from mdb: %d\n",
+ dp->index, addr, vid, err);
+ }
+
+ break;
+ }
+
+ /* Allocated by dsa_user_schedule_standalone_work(); freed here */
+ kfree(standalone_work);
+}
+
+static int dsa_user_schedule_standalone_work(struct net_device *dev,
+ enum dsa_standalone_event event,
+ const unsigned char *addr,
+ u16 vid)
+{
+ struct dsa_standalone_event_work *w;
+
+ /* Atomic allocation: dsa_user_vlan_for_each() runs under addr_list_lock */
+ w = kzalloc(sizeof(*w), GFP_ATOMIC);
+ if (!w)
+ return -ENOMEM;
+
+ /* Snapshot the request; the worker frees @w once it has handled it */
+ w->dev = dev;
+ w->event = event;
+ w->vid = vid;
+ ether_addr_copy(w->addr, addr);
+
+ INIT_WORK(&w->work, dsa_user_standalone_event_work);
+ dsa_schedule_work(&w->work);
+ return 0;
+}
+
+static int dsa_user_host_vlan_rx_filtering(void *arg, int vid)
+{
+ /* @arg is the ctx built by the dsa_user_{sync,unsync}_{uc,mc}() callers */
+ const struct dsa_host_vlan_rx_filtering_ctx *c = arg;
+
+ return dsa_user_schedule_standalone_work(c->dev, c->event, c->addr, vid);
+}
+
+static int dsa_user_vlan_for_each(struct net_device *dev,
+ int (*cb)(void *arg, int vid), void *arg)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_vlan *vlan;
+ int ret;
+
+ lockdep_assert_held(&dev->addr_list_lock);
+
+ /* VID 0 first, then every VLAN on dp->user_vlans; stop at first error */
+ ret = cb(arg, 0);
+ if (ret)
+ return ret;
+
+ list_for_each_entry(vlan, &dp->user_vlans, list) {
+ ret = cb(arg, vlan->vid);
+ if (ret)
+ break;
+ }
+ return ret;
+}
+
+static int dsa_user_sync_uc(struct net_device *dev,
+ const unsigned char *addr)
+{
+ struct dsa_host_vlan_rx_filtering_ctx rx_ctx = {
+ .event = DSA_UC_ADD,
+ .addr = addr,
+ .dev = dev,
+ };
+ struct dsa_port *dp = dsa_user_to_port(dev);
+
+ /* Mirror the address onto the conduit's unicast list (result ignored) */
+ dev_uc_add(dsa_user_to_conduit(dev), addr);
+
+ /* Per-VLAN host FDB additions are queued only with unicast filtering */
+ if (dsa_switch_supports_uc_filtering(dp->ds))
+ return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
+ &rx_ctx);
+ return 0;
+}
+
+static int dsa_user_unsync_uc(struct net_device *dev,
+ const unsigned char *addr)
+{
+ struct dsa_host_vlan_rx_filtering_ctx rx_ctx = {
+ .event = DSA_UC_DEL,
+ .addr = addr,
+ .dev = dev,
+ };
+ struct dsa_port *dp = dsa_user_to_port(dev);
+
+ /* Drop the address from the conduit's unicast list (result ignored) */
+ dev_uc_del(dsa_user_to_conduit(dev), addr);
+
+ /* Per-VLAN host FDB removals are queued only with unicast filtering */
+ if (dsa_switch_supports_uc_filtering(dp->ds))
+ return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
+ &rx_ctx);
+ return 0;
+}
+
+static int dsa_user_sync_mc(struct net_device *dev,
+ const unsigned char *addr)
+{
+ struct dsa_host_vlan_rx_filtering_ctx rx_ctx = {
+ .event = DSA_MC_ADD,
+ .addr = addr,
+ .dev = dev,
+ };
+ struct dsa_port *dp = dsa_user_to_port(dev);
+
+ /* Mirror the address onto the conduit's multicast list (result ignored) */
+ dev_mc_add(dsa_user_to_conduit(dev), addr);
+
+ /* Per-VLAN host MDB additions are queued only with multicast filtering */
+ if (dsa_switch_supports_mc_filtering(dp->ds))
+ return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
+ &rx_ctx);
+ return 0;
+}
+
+static int dsa_user_unsync_mc(struct net_device *dev,
+ const unsigned char *addr)
+{
+ struct dsa_host_vlan_rx_filtering_ctx rx_ctx = {
+ .event = DSA_MC_DEL,
+ .addr = addr,
+ .dev = dev,
+ };
+ struct dsa_port *dp = dsa_user_to_port(dev);
+
+ /* Drop the address from the conduit's multicast list (result ignored) */
+ dev_mc_del(dsa_user_to_conduit(dev), addr);
+
+ /* Per-VLAN host MDB removals are queued only with multicast filtering */
+ if (dsa_switch_supports_mc_filtering(dp->ds))
+ return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering,
+ &rx_ctx);
+ return 0;
+}
+
+void dsa_user_sync_ha(struct net_device *dev)
+{
+ struct dsa_switch *ds = dsa_user_to_port(dev)->ds;
+ struct netdev_hw_addr *hw_addr;
+ bool queued_work;
+
+ /* Replay the synced lists under the address lock: multicast, then unicast */
+ netif_addr_lock_bh(dev);
+ netdev_for_each_synced_mc_addr(hw_addr, dev)
+ dsa_user_sync_mc(dev, hw_addr->addr);
+ netdev_for_each_synced_uc_addr(hw_addr, dev)
+ dsa_user_sync_uc(dev, hw_addr->addr);
+ netif_addr_unlock_bh(dev);
+
+ /* Work is only ever queued when some kind of filtering is supported */
+ queued_work = dsa_switch_supports_uc_filtering(ds) ||
+ dsa_switch_supports_mc_filtering(ds);
+ if (queued_work)
+ dsa_flush_workqueue();
+}
+
+void dsa_user_unsync_ha(struct net_device *dev)
+{
+ struct dsa_switch *ds = dsa_user_to_port(dev)->ds;
+ struct netdev_hw_addr *hw_addr;
+ bool queued_work;
+
+ /* Unicast first, then multicast: the reverse of dsa_user_sync_ha() */
+ netif_addr_lock_bh(dev);
+ netdev_for_each_synced_uc_addr(hw_addr, dev)
+ dsa_user_unsync_uc(dev, hw_addr->addr);
+ netdev_for_each_synced_mc_addr(hw_addr, dev)
+ dsa_user_unsync_mc(dev, hw_addr->addr);
+ netif_addr_unlock_bh(dev);
+
+ /* Work is only ever queued when some kind of filtering is supported */
+ queued_work = dsa_switch_supports_uc_filtering(ds) ||
+ dsa_switch_supports_mc_filtering(ds);
+ if (queued_work)
+ dsa_flush_workqueue();
+}
+
+/* user mii_bus handling ***************************************************/
+static int dsa_user_phy_read(struct mii_bus *bus, int addr, int reg)
+{
+ struct dsa_switch *ds = bus->priv;
+
+ /* BIT() shifts an unsigned value, so addr 31 cannot hit the sign bit */
+ if (!(ds->phys_mii_mask & BIT(addr)))
+ return 0xffff; /* address is not in phys_mii_mask */
+ return ds->ops->phy_read(ds, addr, reg);
+}
+
+static int dsa_user_phy_write(struct mii_bus *bus, int addr, int reg, u16 val)
+{
+ struct dsa_switch *ds = bus->priv;
+
+ /* BIT() shifts an unsigned value, so addr 31 cannot hit the sign bit */
+ if (!(ds->phys_mii_mask & BIT(addr)))
+ return 0; /* writes outside phys_mii_mask are silently dropped */
+ return ds->ops->phy_write(ds, addr, reg, val);
+}
+
+void dsa_user_mii_bus_init(struct dsa_switch *ds)
+{
+ snprintf(ds->user_mii_bus->id, MII_BUS_ID_SIZE, "dsa-%d.%d",
+ ds->dst->index, ds->index); /* "dsa-<dst index>.<ds index>" */
+ ds->user_mii_bus->name = "dsa user smi";
+ ds->user_mii_bus->parent = ds->dev;
+ ds->user_mii_bus->priv = ds; /* read back by the two accessors below */
+ ds->user_mii_bus->read = dsa_user_phy_read;
+ ds->user_mii_bus->write = dsa_user_phy_write;
+ ds->user_mii_bus->phy_mask = ~ds->phys_mii_mask;
+}
+
+
+/* user device handling ****************************************************/
+static int dsa_user_get_iflink(const struct net_device *dev)
+{
+ return READ_ONCE(dsa_user_to_conduit(dev)->ifindex); /* iflink is the conduit's ifindex */
+}
+
+int dsa_user_host_uc_install(struct net_device *dev, const u8 *addr)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ bool uc_filtering = dsa_switch_supports_uc_filtering(dp->ds);
+ int err;
+
+ /* Step 1: host FDB entry on VID 0, only if the switch filters unicast */
+ if (uc_filtering) {
+ err = dsa_port_standalone_host_fdb_add(dp, addr, 0);
+ if (err)
+ return err;
+ }
+
+ /* Step 2: skipped when @addr is the conduit's own address */
+ if (ether_addr_equal(addr, conduit->dev_addr))
+ return 0;
+
+ err = dev_uc_add(conduit, addr);
+ if (err >= 0)
+ return 0;
+
+ /* The conduit refused the address: undo step 1 */
+ if (uc_filtering)
+ dsa_port_standalone_host_fdb_del(dp, addr, 0);
+ return err;
+}
+
+void dsa_user_host_uc_uninstall(struct net_device *dev)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ const u8 *mac = dev->dev_addr;
+
+ /* Reverse of dsa_user_host_uc_install(): conduit first, then host FDB */
+ if (!ether_addr_equal(mac, conduit->dev_addr))
+ dev_uc_del(conduit, mac);
+ if (dsa_switch_supports_uc_filtering(dp->ds))
+ dsa_port_standalone_host_fdb_del(dp, mac, 0);
+}
+
+static int dsa_user_open(struct net_device *dev)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ int ret;
+
+ /* Open the conduit before touching the port itself */
+ ret = dev_open(conduit, NULL);
+ if (ret < 0) {
+ netdev_err(dev, "failed to open conduit %s\n", conduit->name);
+ return ret;
+ }
+
+ /* Install dev->dev_addr as a host address */
+ ret = dsa_user_host_uc_install(dev, dev->dev_addr);
+ if (ret)
+ return ret;
+
+ /* Enabling the port comes last, so only the address needs undoing */
+ ret = dsa_port_enable_rt(dp, dev->phydev);
+ if (!ret)
+ return 0;
+
+ /* NOTE(review): the conduit stays open on every error path */
+ dsa_user_host_uc_uninstall(dev);
+ return ret;
+}
+
+static int dsa_user_close(struct net_device *dev)
+{
+ /* Undo dsa_user_open() in reverse order. The conduit is not closed
+ * here.
+ */
+ dsa_port_disable_rt(dsa_user_to_port(dev));
+ dsa_user_host_uc_uninstall(dev);
+
+ return 0;
+}
+
+static void dsa_user_manage_host_flood(struct net_device *dev)
+{
+ /* Promiscuous floods both classes; allmulti floods multicast only */
+ bool promisc = !!(dev->flags & IFF_PROMISC);
+ bool allmulti = !!(dev->flags & IFF_ALLMULTI);
+
+ dsa_port_set_host_flood(dsa_user_to_port(dev), promisc, promisc || allmulti);
+}
+
+static void dsa_user_change_rx_flags(struct net_device *dev, int change)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+ struct dsa_switch *ds = dsa_user_to_port(dev)->ds;
+
+ /* Every toggled flag is passed on to the conduit as +1 (set) or -1 */
+ if (change & IFF_ALLMULTI)
+ dev_set_allmulti(conduit, (dev->flags & IFF_ALLMULTI) ? 1 : -1);
+ if (change & IFF_PROMISC)
+ dev_set_promiscuity(conduit, (dev->flags & IFF_PROMISC) ? 1 : -1);
+
+ /* Host flooding is only managed when both filters are supported */
+ if (!dsa_switch_supports_uc_filtering(ds) ||
+ !dsa_switch_supports_mc_filtering(ds))
+ return;
+ dsa_user_manage_host_flood(dev);
+}
+
+static void dsa_user_set_rx_mode(struct net_device *dev)
+{
+ __dev_mc_sync(dev, dsa_user_sync_mc, dsa_user_unsync_mc); /* multicast list first */
+ __dev_uc_sync(dev, dsa_user_sync_uc, dsa_user_unsync_uc); /* then the unicast list */
+}
+
+static int dsa_user_set_mac_address(struct net_device *dev, void *a)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct sockaddr *sa = a;
+ struct dsa_switch *ds = dp->ds;
+ int ret;
+
+ if (!is_valid_ether_addr(sa->sa_data))
+ return -EADDRNOTAVAIL;
+
+ /* Optional driver hook; an error aborts before software state changes */
+ if (ds->ops->port_set_mac_address) {
+ ret = ds->ops->port_set_mac_address(ds, dp->index, sa->sa_data);
+ if (ret)
+ return ret;
+ }
+
+ /* Only a running port has its address synced to hardware and to the
+ * DSA conduit. In that case install the new address before removing
+ * the old one, which is still in dev->dev_addr at this point.
+ */
+ if (dev->flags & IFF_UP) {
+ ret = dsa_user_host_uc_install(dev, sa->sa_data);
+ if (ret)
+ return ret;
+
+ dsa_user_host_uc_uninstall(dev);
+ }
+
+ /* Finally record the new address on the net_device */
+ eth_hw_addr_set(dev, sa->sa_data);
+
+ return 0;
+}
+
+struct dsa_user_dump_ctx { /* state shared by dsa_user_fdb_dump() and its per-entry callback */
+ struct net_device *dev; /* supplies ndm_ifindex for every entry */
+ struct sk_buff *skb; /* netlink message being filled */
+ struct netlink_callback *cb;
+ int idx; /* entries seen so far, skipped ones included */
+};
+
+static int
+dsa_user_port_fdb_do_dump(const unsigned char *addr, u16 vid,
+ bool is_static, void *data)
+{
+ struct dsa_user_dump_ctx *dump = data;
+ struct netlink_callback *cb = dump->cb;
+ struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
+ struct sk_buff *skb = dump->skb;
+ struct nlmsghdr *nlh;
+ struct ndmsg *ndm;
+
+ /* Skip, but still count, entries below ctx->fdb_idx */
+ if (dump->idx < ctx->fdb_idx) {
+ dump->idx++;
+ return 0;
+ }
+
+ /* One RTM_NEWNEIGH message per FDB entry */
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH, sizeof(*ndm), NLM_F_MULTI);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ndm = nlmsg_data(nlh);
+ ndm->ndm_family = AF_BRIDGE;
+ ndm->ndm_pad1 = 0;
+ ndm->ndm_pad2 = 0;
+ ndm->ndm_flags = NTF_SELF;
+ ndm->ndm_type = 0;
+ ndm->ndm_ifindex = dump->dev->ifindex;
+ ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE;
+
+ /* NDA_VLAN is left out for VID 0 */
+ if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr) ||
+ (vid && nla_put_u16(skb, NDA_VLAN, vid))) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ dump->idx++;
+
+ return 0;
+}
+
+static int
+dsa_user_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ struct net_device *dev, struct net_device *filter_dev,
+ int *idx)
+{
+ struct dsa_user_dump_ctx dump_ctx = {
+ .idx = *idx,
+ .cb = cb,
+ .skb = skb,
+ .dev = dev,
+ };
+ int ret;
+
+ /* @filter_dev is unused; *idx is written back even when the dump fails */
+ ret = dsa_port_fdb_dump(dsa_user_to_port(dev), dsa_user_port_fdb_do_dump,
+ &dump_ctx);
+ *idx = dump_ctx.idx;
+ return ret;
+}
+
+static int dsa_user_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+ struct dsa_user_priv *p = netdev_priv(dev);
+
+ return phylink_mii_ioctl(p->dp->pl, ifr, cmd); /* handled entirely by the port's phylink */
+}
+
+static int dsa_user_port_attr_set(struct net_device *dev, const void *ctx,
+ const struct switchdev_attr *attr,
+ struct netlink_ext_ack *extack)
+{
+ /* Apply one switchdev attribute to a DSA user port */
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ bool offloaded;
+
+ /* A non-NULL @ctx limits the call to the port it points to; every
+ * other port reports success without doing anything.
+ */
+ if (ctx && ctx != dp)
+ return 0;
+
+ /* Pass 1: every supported attribute is only accepted when this port
+ * offloads the object in attr->orig_dev.
+ */
+ switch (attr->id) {
+ /* Checked against the bridge port */
+ case SWITCHDEV_ATTR_ID_PORT_STP_STATE:
+ case SWITCHDEV_ATTR_ID_PORT_MST_STATE:
+ case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS:
+ case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS:
+ offloaded = dsa_port_offloads_bridge_port(dp, attr->orig_dev);
+ break;
+
+ /* Checked against the bridge device */
+ case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
+ case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME:
+ case SWITCHDEV_ATTR_ID_BRIDGE_MST:
+ case SWITCHDEV_ATTR_ID_VLAN_MSTI:
+ offloaded = dsa_port_offloads_bridge_dev(dp, attr->orig_dev);
+ break;
+
+ /* Any other attribute id is rejected */
+ default:
+ offloaded = false;
+ break;
+ }
+
+ if (!offloaded)
+ return -EOPNOTSUPP;
+
+ /* Pass 2: hand the attribute value to the matching port helper */
+ switch (attr->id) {
+ case SWITCHDEV_ATTR_ID_PORT_STP_STATE:
+ return dsa_port_set_state(dp, attr->u.stp_state, true);
+ case SWITCHDEV_ATTR_ID_PORT_MST_STATE:
+ return dsa_port_set_mst_state(dp, &attr->u.mst_state, extack);
+ case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
+ return dsa_port_vlan_filtering(dp, attr->u.vlan_filtering,
+ extack);
+ case SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME:
+ return dsa_port_ageing_time(dp, attr->u.ageing_time);
+ case SWITCHDEV_ATTR_ID_BRIDGE_MST:
+ return dsa_port_mst_enable(dp, attr->u.mst, extack);
+ case SWITCHDEV_ATTR_ID_PORT_PRE_BRIDGE_FLAGS:
+ return dsa_port_pre_bridge_flags(dp, attr->u.brport_flags,
+ extack);
+ case SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS:
+ return dsa_port_bridge_flags(dp, attr->u.brport_flags, extack);
+ case SWITCHDEV_ATTR_ID_VLAN_MSTI:
+ return dsa_port_vlan_msti(dp, &attr->u.vlan_msti);
+ default:
+ /* Not reachable: pass 1 rejected every other id */
+ return -EOPNOTSUPP;
+ }
+}
+
+/* Return -EBUSY if one of the VLAN upper devices of @user carries the same
+ * VID as @vlan, 0 otherwise.
+ *
+ * Must be called under rcu_read_lock().
+ */
+static int
+dsa_user_vlan_check_for_8021q_uppers(struct net_device *user,
+				     const struct switchdev_obj_port_vlan *vlan)
+{
+	struct net_device *upper;
+	struct list_head *pos;
+
+	netdev_for_each_upper_dev_rcu(user, upper, pos) {
+		u16 upper_vid;
+
+		if (is_vlan_dev(upper)) {
+			upper_vid = vlan_dev_vlan_id(upper);
+			if (upper_vid == vlan->vid)
+				return -EBUSY;
+		}
+	}
+
+	return 0;
+}
+
+/* Add the port VLAN described by @obj to the port behind @dev.
+ *
+ * Returns 0 without touching the port when VLAN configuration is skipped for
+ * it, the error from the upper-device check when a VLAN upper already uses
+ * the VID, and otherwise the result of dsa_port_vlan_add().
+ */
+static int dsa_user_vlan_add(struct net_device *dev,
+			     const struct switchdev_obj *obj,
+			     struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct switchdev_obj_port_vlan *vlan;
+	int clash;
+
+	if (dsa_port_skip_vlan_configuration(dp)) {
+		NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN");
+		return 0;
+	}
+
+	vlan = SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+	/* The upper-device check only applies when VLAN is enabled on the
+	 * bridge device of this port.
+	 */
+	if (!br_vlan_enabled(dsa_port_bridge_dev_get(dp)))
+		return dsa_port_vlan_add(dp, vlan, extack);
+
+	/* Refuse a bridge VLAN whose VID is already taken by an 802.1Q upper
+	 * of this port.
+	 */
+	rcu_read_lock();
+	clash = dsa_user_vlan_check_for_8021q_uppers(dev, vlan);
+	rcu_read_unlock();
+	if (clash) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Port already has a VLAN upper with this VID");
+		return clash;
+	}
+
+	return dsa_port_vlan_add(dp, vlan, extack);
+}
+
+/* Offload a VLAN that was installed on the bridge or on a foreign interface
+ * by adding it as a VLAN towards the CPU port.
+ *
+ * Returns -EOPNOTSUPP when the port has no bridge, 0 when VLAN configuration
+ * is skipped for the port, otherwise the result of dsa_port_host_vlan_add().
+ */
+static int dsa_user_host_vlan_add(struct net_device *dev,
+				  const struct switchdev_obj *obj,
+				  struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct switchdev_obj_port_vlan host_vlan;
+
+	/* Nothing to do for a software bridge */
+	if (!dp->bridge)
+		return -EOPNOTSUPP;
+
+	if (dsa_port_skip_vlan_configuration(dp)) {
+		NL_SET_ERR_MSG_MOD(extack, "skipping configuration of VLAN");
+		return 0;
+	}
+
+	/* Work on a private copy so the caller's object stays untouched */
+	host_vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj);
+
+	/* Drivers often treat CPU membership specially, but a PVID makes no
+	 * sense there, so strip the flag.
+	 */
+	host_vlan.flags = host_vlan.flags & ~BRIDGE_VLAN_INFO_PVID;
+
+	return dsa_port_host_vlan_add(dp, &host_vlan, extack);
+}
+
+/* Add a switchdev object to the DSA port behind @dev.
+ *
+ * A non-NULL @ctx that is not this port makes the call a no-op returning 0.
+ * Objects whose originating device is not offloaded by this port, and object
+ * ids not listed here, yield -EOPNOTSUPP. Port VLANs go to the user port when
+ * the port offloads the originating bridge port, and to the host path
+ * otherwise.
+ */
+static int dsa_user_port_obj_add(struct net_device *dev, const void *ctx,
+				 const struct switchdev_obj *obj,
+				 struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	if (ctx && ctx != dp)
+		return 0;
+
+	switch (obj->id) {
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+		if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		return dsa_port_mdb_add(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
+	case SWITCHDEV_OBJ_ID_HOST_MDB:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		return dsa_port_bridge_host_mdb_add(dp,
+						    SWITCHDEV_OBJ_PORT_MDB(obj));
+	case SWITCHDEV_OBJ_ID_PORT_VLAN:
+		if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+			return dsa_user_host_vlan_add(dev, obj, extack);
+
+		return dsa_user_vlan_add(dev, obj, extack);
+	case SWITCHDEV_OBJ_ID_MRP:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		return dsa_port_mrp_add(dp, SWITCHDEV_OBJ_MRP(obj));
+	case SWITCHDEV_OBJ_ID_RING_ROLE_MRP:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		return dsa_port_mrp_add_ring_role(dp,
+						  SWITCHDEV_OBJ_RING_ROLE_MRP(obj));
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+/* Remove the port VLAN described by @obj from the port behind @dev. Returns 0
+ * without touching the port when VLAN configuration is skipped for it.
+ */
+static int dsa_user_vlan_del(struct net_device *dev,
+			     const struct switchdev_obj *obj)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	if (dsa_port_skip_vlan_configuration(dp))
+		return 0;
+
+	return dsa_port_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj));
+}
+
+/* Remove the host VLAN described by @obj for the port behind @dev.
+ *
+ * Returns -EOPNOTSUPP when the port has no bridge, 0 when VLAN configuration
+ * is skipped for the port, otherwise the result of dsa_port_host_vlan_del().
+ */
+static int dsa_user_host_vlan_del(struct net_device *dev,
+				  const struct switchdev_obj *obj)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	/* Nothing to do for a software bridge */
+	if (!dp->bridge)
+		return -EOPNOTSUPP;
+
+	if (dsa_port_skip_vlan_configuration(dp))
+		return 0;
+
+	return dsa_port_host_vlan_del(dp, SWITCHDEV_OBJ_PORT_VLAN(obj));
+}
+
+/* Delete a switchdev object from the DSA port behind @dev.
+ *
+ * A non-NULL @ctx that is not this port makes the call a no-op returning 0.
+ * Objects whose originating device is not offloaded by this port, and object
+ * ids not listed here, yield -EOPNOTSUPP. Port VLANs go to the user port when
+ * the port offloads the originating bridge port, and to the host path
+ * otherwise.
+ */
+static int dsa_user_port_obj_del(struct net_device *dev, const void *ctx,
+				 const struct switchdev_obj *obj)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+
+	if (ctx && ctx != dp)
+		return 0;
+
+	switch (obj->id) {
+	case SWITCHDEV_OBJ_ID_PORT_MDB:
+		if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		return dsa_port_mdb_del(dp, SWITCHDEV_OBJ_PORT_MDB(obj));
+	case SWITCHDEV_OBJ_ID_HOST_MDB:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		return dsa_port_bridge_host_mdb_del(dp,
+						    SWITCHDEV_OBJ_PORT_MDB(obj));
+	case SWITCHDEV_OBJ_ID_PORT_VLAN:
+		if (!dsa_port_offloads_bridge_port(dp, obj->orig_dev))
+			return dsa_user_host_vlan_del(dev, obj);
+
+		return dsa_user_vlan_del(dev, obj);
+	case SWITCHDEV_OBJ_ID_MRP:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		return dsa_port_mrp_del(dp, SWITCHDEV_OBJ_MRP(obj));
+	case SWITCHDEV_OBJ_ID_RING_ROLE_MRP:
+		if (!dsa_port_offloads_bridge_dev(dp, obj->orig_dev))
+			return -EOPNOTSUPP;
+
+		return dsa_port_mrp_del_ring_role(dp,
+						  SWITCHDEV_OBJ_RING_ROLE_MRP(obj));
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+/* Send @skb through the netpoll instance stored in the private data of @dev.
+ * Built without CONFIG_NET_POLL_CONTROLLER, reaching this function is a BUG().
+ */
+static netdev_tx_t dsa_user_netpoll_send_skb(struct net_device *dev,
+					     struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	struct dsa_user_priv *priv = netdev_priv(dev);
+
+	return netpoll_send_skb(priv->netpoll, skb);
+#else
+	BUG();
+	return NETDEV_TX_OK;
+#endif
+}
+
+/* Pass @skb to the switch driver's port_txtstamp() hook, provided the skb has
+ * one of the SKBTX_HW_TSTAMP_NOBPF flags set and the driver implements the
+ * hook. Otherwise do nothing.
+ */
+static void dsa_skb_tx_timestamp(struct dsa_user_priv *p,
+				 struct sk_buff *skb)
+{
+	struct dsa_switch *ds = p->dp->ds;
+
+	if ((skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NOBPF) &&
+	    ds->ops->port_txtstamp)
+		ds->ops->port_txtstamp(ds, p->dp->index, skb);
+}
+
+/* Queue @skb for transmission on the conduit of user port @dev, or send it via
+ * netpoll when a netpoll transmit is running on @dev. The regular path always
+ * reports NETDEV_TX_OK.
+ */
+netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev)
+{
+	if (likely(!netpoll_tx_running(dev))) {
+		/* Retarget the skb at the parent interface and queue it there,
+		 * leaving its EtherType untouched.
+		 */
+		skb->dev = dsa_user_to_conduit(dev);
+		dev_queue_xmit(skb);
+
+		return NETDEV_TX_OK;
+	}
+
+	/* Netpoll skbs still have to carry the protocol-specific tag to be
+	 * transmitted successfully.
+	 */
+	return dsa_user_netpoll_send_skb(dev, skb);
+}
+EXPORT_SYMBOL_GPL(dsa_enqueue_skb);
+
+/* Transmit entry point of a DSA user port.
+ *
+ * Accounts the packet, clears the skb control block, offers the skb for TX
+ * timestamping, makes head and tail writable, runs the per-port xmit hook
+ * stored in the private data and finally enqueues the resulting skb with
+ * dsa_enqueue_skb(). The drop paths in this function free the skb and return
+ * NETDEV_TX_OK.
+ */
+static netdev_tx_t dsa_user_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct dsa_user_priv *p = netdev_priv(dev);
+ struct sk_buff *nskb;
+
+ /* Counted up front, so packets dropped below are included as well */
+ dev_sw_netstats_tx_add(dev, 1, skb->len);
+
+ memset(skb->cb, 0, sizeof(skb->cb));
+
+ /* Handle tx timestamp if any */
+ dsa_skb_tx_timestamp(p, skb);
+
+ if (skb_ensure_writable_head_tail(skb, dev)) {
+ dev_kfree_skb_any(skb);
+ return NETDEV_TX_OK;
+ }
+
+ /* needed_tailroom should still be 'warm' in the cache line from
+ * skb_ensure_writable_head_tail(), which has also ensured that
+ * padding is safe.
+ */
+ if (dev->needed_tailroom)
+ eth_skb_pad(skb);
+
+ /* Transmit function may have to reallocate the original SKB,
+ * in which case it must have freed it. Only free it here on error.
+ */
+ nskb = p->xmit(skb, dev);
+ if (!nskb) {
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+ }
+
+ return dsa_enqueue_skb(nskb, dev);
+}
+
+/* ethtool operations *******************************************************/
+
+/* Fill @drvinfo with the fixed driver name, bus and firmware version strings
+ * reported for DSA user ports.
+ */
+static void dsa_user_get_drvinfo(struct net_device *dev,
+				 struct ethtool_drvinfo *drvinfo)
+{
+	strscpy(drvinfo->bus_info, "platform", sizeof(drvinfo->bus_info));
+	strscpy(drvinfo->fw_version, "N/A", sizeof(drvinfo->fw_version));
+	strscpy(drvinfo->driver, "dsa", sizeof(drvinfo->driver));
+}
+
+/* Return the register dump length reported by the switch driver for this
+ * port, or -EOPNOTSUPP when the driver has no get_regs_len() hook.
+ */
+static int dsa_user_get_regs_len(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->get_regs_len)
+		return -EOPNOTSUPP;
+
+	return ds->ops->get_regs_len(ds, dp->index);
+}
+
+/* Let the switch driver fill the register dump for this port; does nothing
+ * when the driver has no get_regs() hook.
+ */
+static void
+dsa_user_get_regs(struct net_device *dev, struct ethtool_regs *regs, void *_p)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->get_regs)
+		return;
+
+	ds->ops->get_regs(ds, dp->index, regs, _p);
+}
+
+/* ethtool nway_reset: delegated to the phylink instance of the port */
+static int dsa_user_nway_reset(struct net_device *dev)
+{
+	struct dsa_port *port = dsa_user_to_port(dev);
+
+	return phylink_ethtool_nway_reset(port->pl);
+}
+
+/* Return the EEPROM length of the switch: a non-zero eeprom_len from ds->cd
+ * takes precedence, then the driver's get_eeprom_len() hook, and 0 when
+ * neither is available.
+ */
+static int dsa_user_get_eeprom_len(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->cd && ds->cd->eeprom_len)
+		return ds->cd->eeprom_len;
+
+	if (!ds->ops->get_eeprom_len)
+		return 0;
+
+	return ds->ops->get_eeprom_len(ds);
+}
+
+/* Read the switch EEPROM through the driver's get_eeprom() hook; returns
+ * -EOPNOTSUPP when the driver does not provide one.
+ */
+static int dsa_user_get_eeprom(struct net_device *dev,
+			       struct ethtool_eeprom *eeprom, u8 *data)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->get_eeprom)
+		return -EOPNOTSUPP;
+
+	return ds->ops->get_eeprom(ds, eeprom, data);
+}
+
+/* Write the switch EEPROM through the driver's set_eeprom() hook; returns
+ * -EOPNOTSUPP when the driver does not provide one.
+ */
+static int dsa_user_set_eeprom(struct net_device *dev,
+			       struct ethtool_eeprom *eeprom, u8 *data)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->set_eeprom)
+		return -EOPNOTSUPP;
+
+	return ds->ops->set_eeprom(ds, eeprom, data);
+}
+
+/* Fill @data with the names of string set @stringset.
+ *
+ * For ETH_SS_STATS the four software counter names come first, followed by
+ * whatever the driver's get_strings() hook adds. For ETH_SS_TEST the generic
+ * selftest names are used. Other string sets leave @data untouched.
+ */
+static void dsa_user_get_strings(struct net_device *dev,
+				 uint32_t stringset, uint8_t *data)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	switch (stringset) {
+	case ETH_SS_STATS:
+		ethtool_puts(&data, "tx_packets");
+		ethtool_puts(&data, "tx_bytes");
+		ethtool_puts(&data, "rx_packets");
+		ethtool_puts(&data, "rx_bytes");
+		if (ds->ops->get_strings)
+			ds->ops->get_strings(ds, dp->index, stringset, data);
+		break;
+	case ETH_SS_TEST:
+		net_selftest_get_strings(data);
+		break;
+	default:
+		break;
+	}
+}
+
+/* Report ethtool statistics for the port.
+ *
+ * The per-CPU software counters are summed into data[0..3] in the order
+ * tx_packets, tx_bytes, rx_packets, rx_bytes; the driver's
+ * get_ethtool_stats() hook, when present, fills the slots from data[4] on.
+ * The software counters are accumulated with "+=", so @data is presumably
+ * handed in zeroed by the caller - TODO confirm.
+ */
+static void dsa_user_get_ethtool_stats(struct net_device *dev,
+				       struct ethtool_stats *stats,
+				       uint64_t *data)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct pcpu_sw_netstats *pcpu = per_cpu_ptr(dev->tstats, cpu);
+		u64 txp, txb, rxp, rxb;
+		unsigned int seq;
+
+		/* Re-read until the fetch/retry pair reports a stable snapshot */
+		do {
+			seq = u64_stats_fetch_begin(&pcpu->syncp);
+			txp = u64_stats_read(&pcpu->tx_packets);
+			txb = u64_stats_read(&pcpu->tx_bytes);
+			rxp = u64_stats_read(&pcpu->rx_packets);
+			rxb = u64_stats_read(&pcpu->rx_bytes);
+		} while (u64_stats_fetch_retry(&pcpu->syncp, seq));
+
+		data[0] += txp;
+		data[1] += txb;
+		data[2] += rxp;
+		data[3] += rxb;
+	}
+
+	if (!ds->ops->get_ethtool_stats)
+		return;
+
+	ds->ops->get_ethtool_stats(ds, dp->index, data + 4);
+}
+
+/* Return the number of strings in set @sset.
+ *
+ * ETH_SS_STATS: the driver's count (0 without a get_sset_count() hook) plus
+ * the four software counters; a negative driver count is returned as is.
+ * ETH_SS_TEST: the generic selftest count. Anything else: -EOPNOTSUPP.
+ */
+static int dsa_user_get_sset_count(struct net_device *dev, int sset)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int hw_count = 0;
+
+	switch (sset) {
+	case ETH_SS_STATS:
+		if (ds->ops->get_sset_count) {
+			hw_count = ds->ops->get_sset_count(ds, dp->index, sset);
+			if (hw_count < 0)
+				return hw_count;
+		}
+
+		return hw_count + 4;
+	case ETH_SS_TEST:
+		return net_selftest_get_count();
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+/* Forward the PHY statistics request to the switch driver, if it implements
+ * get_eth_phy_stats(); otherwise @phy_stats is left untouched.
+ */
+static void dsa_user_get_eth_phy_stats(struct net_device *dev,
+				       struct ethtool_eth_phy_stats *phy_stats)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->get_eth_phy_stats)
+		return;
+
+	ds->ops->get_eth_phy_stats(ds, dp->index, phy_stats);
+}
+
+/* Forward the MAC statistics request to the switch driver, if it implements
+ * get_eth_mac_stats(); otherwise @mac_stats is left untouched.
+ */
+static void dsa_user_get_eth_mac_stats(struct net_device *dev,
+				       struct ethtool_eth_mac_stats *mac_stats)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->get_eth_mac_stats)
+		return;
+
+	ds->ops->get_eth_mac_stats(ds, dp->index, mac_stats);
+}
+
+/* Forward the MAC control statistics request to the switch driver, if it
+ * implements get_eth_ctrl_stats(); otherwise @ctrl_stats is left untouched.
+ */
+static void
+dsa_user_get_eth_ctrl_stats(struct net_device *dev,
+			    struct ethtool_eth_ctrl_stats *ctrl_stats)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->get_eth_ctrl_stats)
+		return;
+
+	ds->ops->get_eth_ctrl_stats(ds, dp->index, ctrl_stats);
+}
+
+/* Forward the RMON statistics request to the switch driver, if it implements
+ * get_rmon_stats(); otherwise @rmon_stats and @ranges are left untouched.
+ */
+static void
+dsa_user_get_rmon_stats(struct net_device *dev,
+			struct ethtool_rmon_stats *rmon_stats,
+			const struct ethtool_rmon_hist_range **ranges)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->get_rmon_stats)
+		return;
+
+	ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges);
+}
+
+/* Forward the timestamping statistics request to the switch driver, if it
+ * implements get_ts_stats(); otherwise @ts_stats is left untouched.
+ */
+static void dsa_user_get_ts_stats(struct net_device *dev,
+				  struct ethtool_ts_stats *ts_stats)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->get_ts_stats)
+		return;
+
+	ds->ops->get_ts_stats(ds, dp->index, ts_stats);
+}
+
+/* Run the ethtool selftest: the switch driver's self_test() hook when it has
+ * one, the generic net_selftest() otherwise.
+ */
+static void dsa_user_net_selftest(struct net_device *ndev,
+				  struct ethtool_test *etest, u64 *buf)
+{
+	struct dsa_port *dp = dsa_user_to_port(ndev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->self_test)
+		ds->ops->self_test(ds, dp->index, etest, buf);
+	else
+		net_selftest(ndev, etest, buf);
+}
+
+/* Query the MAC merge state through the driver's get_mm() hook; returns
+ * -EOPNOTSUPP when the driver does not provide one.
+ */
+static int dsa_user_get_mm(struct net_device *dev,
+			   struct ethtool_mm_state *state)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_mm)
+		return ds->ops->get_mm(ds, dp->index, state);
+
+	return -EOPNOTSUPP;
+}
+
+/* Apply a MAC merge configuration through the driver's set_mm() hook; returns
+ * -EOPNOTSUPP when the driver does not provide one.
+ */
+static int dsa_user_set_mm(struct net_device *dev, struct ethtool_mm_cfg *cfg,
+			   struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->set_mm)
+		return ds->ops->set_mm(ds, dp->index, cfg, extack);
+
+	return -EOPNOTSUPP;
+}
+
+/* Forward the MAC merge statistics request to the switch driver, if it
+ * implements get_mm_stats(); otherwise @stats is left untouched.
+ */
+static void dsa_user_get_mm_stats(struct net_device *dev,
+				  struct ethtool_mm_stats *stats)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->get_mm_stats)
+		return;
+
+	ds->ops->get_mm_stats(ds, dp->index, stats);
+}
+
+/* Report Wake-on-LAN settings: phylink fills @w first, then the switch
+ * driver's get_wol() hook, when present, is called on the same structure.
+ */
+static void dsa_user_get_wol(struct net_device *dev, struct ethtool_wolinfo *w)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	phylink_ethtool_get_wol(dp->pl, w);
+
+	if (!ds->ops->get_wol)
+		return;
+
+	ds->ops->get_wol(ds, dp->index, w);
+}
+
+/* Apply Wake-on-LAN settings: @w is handed to phylink first (its result is
+ * not checked here), then to the switch driver's set_wol() hook. The return
+ * value is that of the driver hook, or -EOPNOTSUPP when there is none.
+ */
+static int dsa_user_set_wol(struct net_device *dev, struct ethtool_wolinfo *w)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	phylink_ethtool_set_wol(dp->pl, w);
+
+	if (!ds->ops->set_wol)
+		return -EOPNOTSUPP;
+
+	return ds->ops->set_wol(ds, dp->index, w);
+}
+
+/* Apply EEE settings to the port.
+ *
+ * Returns -EOPNOTSUPP when the switch does not report EEE support for the
+ * port. Without phylink-managed LPI, a PHY (-ENODEV otherwise) and a
+ * set_mac_eee() hook (-EOPNOTSUPP otherwise) are both mandatory. Whenever the
+ * hook exists it is called and its error is returned; on success the request
+ * is passed on to phylink.
+ */
+static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int ret;
+
+	/* Does the switch support EEE on this port at all? */
+	if (!ds->ops->support_eee)
+		return -EOPNOTSUPP;
+	if (!ds->ops->support_eee(ds, dp->index))
+		return -EOPNOTSUPP;
+
+	/* Only with phylink managed EEE may set_mac_eee() be left
+	 * unimplemented; otherwise both PHY and MAC must be EEE capable.
+	 */
+	if (!phylink_mac_implements_lpi(ds->phylink_mac_ops)) {
+		if (!dev->phydev)
+			return -ENODEV;
+
+		if (!ds->ops->set_mac_eee)
+			return -EOPNOTSUPP;
+	}
+
+	if (ds->ops->set_mac_eee) {
+		ret = ds->ops->set_mac_eee(ds, dp->index, e);
+		if (ret)
+			return ret;
+	}
+
+	return phylink_ethtool_set_eee(dp->pl, e);
+}
+
+/* Read the EEE settings of the port from phylink.
+ *
+ * Returns -EOPNOTSUPP when the switch does not report EEE support for the
+ * port and -ENODEV when the netdev has no PHY attached.
+ */
+static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	/* Does the switch support EEE on this port at all? */
+	if (!ds->ops->support_eee)
+		return -EOPNOTSUPP;
+	if (!ds->ops->support_eee(ds, dp->index))
+		return -EOPNOTSUPP;
+
+	/* Both the PHY and the MAC of the port have to be EEE capable */
+	if (!dev->phydev)
+		return -ENODEV;
+
+	return phylink_ethtool_get_eee(dp->pl, e);
+}
+
+/* ethtool get_link_ksettings: delegated to the phylink instance of the port */
+static int dsa_user_get_link_ksettings(struct net_device *dev,
+				       struct ethtool_link_ksettings *cmd)
+{
+	struct dsa_port *port = dsa_user_to_port(dev);
+
+	return phylink_ethtool_ksettings_get(port->pl, cmd);
+}
+
+/* ethtool set_link_ksettings: delegated to the phylink instance of the port */
+static int dsa_user_set_link_ksettings(struct net_device *dev,
+				       const struct ethtool_link_ksettings *cmd)
+{
+	struct dsa_port *port = dsa_user_to_port(dev);
+
+	return phylink_ethtool_ksettings_set(port->pl, cmd);
+}
+
+/* Forward the pause statistics request to the switch driver, if it implements
+ * get_pause_stats(); otherwise @pause_stats is left untouched.
+ */
+static void dsa_user_get_pause_stats(struct net_device *dev,
+				     struct ethtool_pause_stats *pause_stats)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->get_pause_stats)
+		return;
+
+	ds->ops->get_pause_stats(ds, dp->index, pause_stats);
+}
+
+/* ethtool get_pauseparam: delegated to the phylink instance of the port */
+static void dsa_user_get_pauseparam(struct net_device *dev,
+				    struct ethtool_pauseparam *pause)
+{
+	struct dsa_port *port = dsa_user_to_port(dev);
+
+	phylink_ethtool_get_pauseparam(port->pl, pause);
+}
+
+/* ethtool set_pauseparam: delegated to the phylink instance of the port */
+static int dsa_user_set_pauseparam(struct net_device *dev,
+				   struct ethtool_pauseparam *pause)
+{
+	struct dsa_port *port = dsa_user_to_port(dev);
+
+	return phylink_ethtool_set_pauseparam(port->pl, pause);
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Allocate a netpoll instance, set it up on the conduit of @dev and remember
+ * it in the private data of the user port.
+ *
+ * Returns 0 on success, -ENOMEM when the allocation fails, or the error from
+ * __netpoll_setup(), in which case the allocation is released again.
+ */
+static int dsa_user_netpoll_setup(struct net_device *dev)
+{
+	struct net_device *conduit = dsa_user_to_conduit(dev);
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct netpoll *np;
+	int ret;
+
+	np = kzalloc(sizeof(*np), GFP_KERNEL);
+	if (!np)
+		return -ENOMEM;
+
+	ret = __netpoll_setup(np, conduit);
+	if (ret) {
+		kfree(np);
+		return ret;
+	}
+
+	p->netpoll = np;
+
+	return 0;
+}
+
+/* Detach the netpoll instance from the private data of @dev and free it; a
+ * port without a netpoll instance is left alone.
+ */
+static void dsa_user_netpoll_cleanup(struct net_device *dev)
+{
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct netpoll *np = p->netpoll;
+
+	if (np) {
+		/* Clear the stored pointer before the instance is freed */
+		p->netpoll = NULL;
+		__netpoll_free(np);
+	}
+}
+
+/* Poll controller callback for DSA user ports; the body is empty, so calling
+ * it has no effect.
+ */
+static void dsa_user_poll_controller(struct net_device *dev)
+{
+}
+#endif
+
+/* Look up the matchall entry with the given @cookie in the mall_tc_list of
+ * the user port. Returns the entry, or NULL when no entry matches.
+ */
+static struct dsa_mall_tc_entry *
+dsa_user_mall_tc_entry_find(struct net_device *dev, unsigned long cookie)
+{
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct dsa_mall_tc_entry *entry;
+
+	list_for_each_entry(entry, &p->mall_tc_list, list) {
+		if (entry->cookie != cookie)
+			continue;
+
+		return entry;
+	}
+
+	return NULL;
+}
+
+/* Offload a matchall "mirred" action as a port mirror.
+ *
+ * @ingress is passed on to the driver and stored in the mirror entry;
+ * @ingress_target selects mirroring to the ingress of the target device.
+ * On success the new entry is appended to the mall_tc_list of the port.
+ *
+ * Returns 0 on success, -EOPNOTSUPP for filters that cannot be offloaded,
+ * -EINVAL when the action has no target device, -ENOMEM on allocation
+ * failure, or the error returned by the driver's port_mirror_add().
+ */
+static int
+dsa_user_add_cls_matchall_mirred(struct net_device *dev,
+ struct tc_cls_matchall_offload *cls,
+ bool ingress, bool ingress_target)
+{
+ struct netlink_ext_ack *extack = cls->common.extack;
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_user_priv *p = netdev_priv(dev);
+ struct dsa_mall_mirror_tc_entry *mirror;
+ struct dsa_mall_tc_entry *mall_tc_entry;
+ struct dsa_switch *ds = dp->ds;
+ struct flow_action_entry *act;
+ struct dsa_port *to_dp;
+ int err;
+
+ if (cls->common.protocol != htons(ETH_P_ALL)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Can only offload \"protocol all\" matchall filter");
+ return -EOPNOTSUPP;
+ }
+
+ if (!ds->ops->port_mirror_add) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Switch does not support mirroring operation");
+ return -EOPNOTSUPP;
+ }
+
+ if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack))
+ return -EOPNOTSUPP;
+
+ /* Only the first action entry is looked at here */
+ act = &cls->rule->action.entries[0];
+
+ if (!act->dev)
+ return -EINVAL;
+
+ /* Pick the port the traffic gets mirrored to */
+ if (dsa_user_dev_check(act->dev)) {
+ if (ingress_target) {
+ /* We can only fulfill this using software assist */
+ if (cls->common.skip_sw) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Can only mirred to ingress of DSA user port if filter also runs in software");
+ return -EOPNOTSUPP;
+ }
+ to_dp = dp->cpu_dp;
+ } else {
+ to_dp = dsa_user_to_port(act->dev);
+ }
+ } else {
+ /* Handle mirroring to foreign target ports as a mirror towards
+ * the CPU. The software tc rule will take the packets from
+ * there.
+ */
+ if (cls->common.skip_sw) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Can only mirred to CPU if filter also runs in software");
+ return -EOPNOTSUPP;
+ }
+ to_dp = dp->cpu_dp;
+ }
+
+ /* Source and target port have to sit on the same switch */
+ if (dp->ds != to_dp->ds) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Cross-chip mirroring not implemented");
+ return -EOPNOTSUPP;
+ }
+
+ mall_tc_entry = kzalloc(sizeof(*mall_tc_entry), GFP_KERNEL);
+ if (!mall_tc_entry)
+ return -ENOMEM;
+
+ mall_tc_entry->cookie = cls->cookie;
+ mall_tc_entry->type = DSA_PORT_MALL_MIRROR;
+ mirror = &mall_tc_entry->mirror;
+ mirror->to_local_port = to_dp->index;
+ mirror->ingress = ingress;
+
+ err = ds->ops->port_mirror_add(ds, dp->index, mirror, ingress, extack);
+ if (err) {
+ /* The entry was never linked into the list, just free it */
+ kfree(mall_tc_entry);
+ return err;
+ }
+
+ list_add_tail(&mall_tc_entry->list, &p->mall_tc_list);
+
+ return err;
+}
+
+/* Offload a matchall "police" action as a port policer.
+ *
+ * Only ingress filters are accepted and only one policer entry may exist on
+ * the port at a time. On success the new entry is appended to the
+ * mall_tc_list of the port.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when the filter cannot be offloaded,
+ * -EEXIST when the port already has a policer, -ENOMEM on allocation failure,
+ * or the error returned by the driver's port_policer_add().
+ */
+static int
+dsa_user_add_cls_matchall_police(struct net_device *dev,
+				 struct tc_cls_matchall_offload *cls,
+				 bool ingress)
+{
+	struct netlink_ext_ack *extack = cls->common.extack;
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_user_priv *p = netdev_priv(dev);
+	struct dsa_mall_tc_entry *existing, *entry;
+	struct dsa_switch *ds = dp->ds;
+	struct flow_action_entry *act;
+	int err;
+
+	if (!ds->ops->port_policer_add) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Policing offload not implemented");
+		return -EOPNOTSUPP;
+	}
+
+	if (!ingress) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Only supported on ingress qdisc");
+		return -EOPNOTSUPP;
+	}
+
+	if (!flow_action_basic_hw_stats_check(&cls->rule->action, extack))
+		return -EOPNOTSUPP;
+
+	list_for_each_entry(existing, &p->mall_tc_list, list) {
+		if (existing->type != DSA_PORT_MALL_POLICER)
+			continue;
+
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Only one port policer allowed");
+		return -EEXIST;
+	}
+
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return -ENOMEM;
+
+	/* Only the first action entry is looked at here */
+	act = &cls->rule->action.entries[0];
+
+	entry->cookie = cls->cookie;
+	entry->type = DSA_PORT_MALL_POLICER;
+	entry->policer.rate_bytes_per_sec = act->police.rate_bytes_ps;
+	entry->policer.burst = act->police.burst;
+
+	err = ds->ops->port_policer_add(ds, dp->index, &entry->policer);
+	if (err) {
+		/* The entry was never linked into the list, just free it */
+		kfree(entry);
+		return err;
+	}
+
+	list_add_tail(&entry->list, &p->mall_tc_list);
+
+	return 0;
+}
+
+/* Dispatch a matchall filter to the offload helper that matches its action.
+ *
+ * Only filters with exactly one action are accepted; mirred, mirred-ingress
+ * and police actions are handled, everything else yields -EOPNOTSUPP.
+ */
+static int dsa_user_add_cls_matchall(struct net_device *dev,
+				     struct tc_cls_matchall_offload *cls,
+				     bool ingress)
+{
+	const struct flow_action *action = &cls->rule->action;
+	struct netlink_ext_ack *extack = cls->common.extack;
+
+	if (!flow_offload_has_one_action(action)) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Cannot offload matchall filter with more than one action");
+		return -EOPNOTSUPP;
+	}
+
+	switch (action->entries[0].id) {
+	case FLOW_ACTION_MIRRED:
+		return dsa_user_add_cls_matchall_mirred(dev, cls, ingress,
+							false);
+	case FLOW_ACTION_MIRRED_INGRESS:
+		return dsa_user_add_cls_matchall_mirred(dev, cls, ingress,
+							true);
+	case FLOW_ACTION_POLICE:
+		return dsa_user_add_cls_matchall_police(dev, cls, ingress);
+	default:
+		NL_SET_ERR_MSG_MOD(extack, "Unknown action");
+		return -EOPNOTSUPP;
+	}
+}
+
+/* Remove the matchall entry identified by the cookie of @cls: unlink it from
+ * the port's list, call the driver's delete hook for its type when the driver
+ * has one, then free it. An unknown cookie is silently ignored.
+ */
+static void dsa_user_del_cls_matchall(struct net_device *dev,
+				      struct tc_cls_matchall_offload *cls)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_mall_tc_entry *entry;
+	struct dsa_switch *ds = dp->ds;
+
+	entry = dsa_user_mall_tc_entry_find(dev, cls->cookie);
+	if (!entry)
+		return;
+
+	list_del(&entry->list);
+
+	if (entry->type == DSA_PORT_MALL_MIRROR) {
+		if (ds->ops->port_mirror_del)
+			ds->ops->port_mirror_del(ds, dp->index,
+						 &entry->mirror);
+	} else if (entry->type == DSA_PORT_MALL_POLICER) {
+		if (ds->ops->port_policer_del)
+			ds->ops->port_policer_del(ds, dp->index);
+	} else {
+		WARN_ON(1);
+	}
+
+	kfree(entry);
+}
+
+/* Handle a matchall offload command. Filters on a non-zero chain index and
+ * commands other than REPLACE and DESTROY yield -EOPNOTSUPP; DESTROY always
+ * reports 0.
+ */
+static int dsa_user_setup_tc_cls_matchall(struct net_device *dev,
+					  struct tc_cls_matchall_offload *cls,
+					  bool ingress)
+{
+	if (cls->common.chain_index)
+		return -EOPNOTSUPP;
+
+	if (cls->command == TC_CLSMATCHALL_REPLACE)
+		return dsa_user_add_cls_matchall(dev, cls, ingress);
+
+	if (cls->command == TC_CLSMATCHALL_DESTROY) {
+		dsa_user_del_cls_matchall(dev, cls);
+		return 0;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+/* Add a flower rule through the driver's cls_flower_add() hook; returns
+ * -EOPNOTSUPP when the driver does not provide one.
+ */
+static int dsa_user_add_cls_flower(struct net_device *dev,
+				   struct flow_cls_offload *cls,
+				   bool ingress)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->cls_flower_add)
+		return ds->ops->cls_flower_add(ds, dp->index, cls, ingress);
+
+	return -EOPNOTSUPP;
+}
+
+/* Delete a flower rule through the driver's cls_flower_del() hook; returns
+ * -EOPNOTSUPP when the driver does not provide one.
+ */
+static int dsa_user_del_cls_flower(struct net_device *dev,
+				   struct flow_cls_offload *cls,
+				   bool ingress)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->cls_flower_del)
+		return ds->ops->cls_flower_del(ds, dp->index, cls, ingress);
+
+	return -EOPNOTSUPP;
+}
+
+/* Fetch flower rule statistics through the driver's cls_flower_stats() hook;
+ * returns -EOPNOTSUPP when the driver does not provide one.
+ */
+static int dsa_user_stats_cls_flower(struct net_device *dev,
+				     struct flow_cls_offload *cls,
+				     bool ingress)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->cls_flower_stats)
+		return ds->ops->cls_flower_stats(ds, dp->index, cls, ingress);
+
+	return -EOPNOTSUPP;
+}
+
+/* Dispatch a flower offload command (REPLACE, DESTROY or STATS) to its
+ * helper; any other command yields -EOPNOTSUPP.
+ */
+static int dsa_user_setup_tc_cls_flower(struct net_device *dev,
+					struct flow_cls_offload *cls,
+					bool ingress)
+{
+	if (cls->command == FLOW_CLS_REPLACE)
+		return dsa_user_add_cls_flower(dev, cls, ingress);
+
+	if (cls->command == FLOW_CLS_DESTROY)
+		return dsa_user_del_cls_flower(dev, cls, ingress);
+
+	if (cls->command == FLOW_CLS_STATS)
+		return dsa_user_stats_cls_flower(dev, cls, ingress);
+
+	return -EOPNOTSUPP;
+}
+
+/* Common flow block callback: @cb_priv is the user net_device. Matchall and
+ * flower classifiers are dispatched to their setup helpers; -EOPNOTSUPP is
+ * returned when the device cannot offload tc or the type is not handled.
+ */
+static int dsa_user_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+				      void *cb_priv, bool ingress)
+{
+	struct net_device *dev = cb_priv;
+
+	if (!tc_can_offload(dev))
+		return -EOPNOTSUPP;
+
+	if (type == TC_SETUP_CLSMATCHALL)
+		return dsa_user_setup_tc_cls_matchall(dev, type_data, ingress);
+
+	if (type == TC_SETUP_CLSFLOWER)
+		return dsa_user_setup_tc_cls_flower(dev, type_data, ingress);
+
+	return -EOPNOTSUPP;
+}
+
+/* Flow block callback bound to ingress blocks */
+static int dsa_user_setup_tc_block_cb_ig(enum tc_setup_type type,
+					 void *type_data, void *cb_priv)
+{
+	const bool ingress = true;
+
+	return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, ingress);
+}
+
+/* Flow block callback bound to egress blocks */
+static int dsa_user_setup_tc_block_cb_eg(enum tc_setup_type type,
+					 void *type_data, void *cb_priv)
+{
+	const bool ingress = false;
+
+	return dsa_user_setup_tc_block_cb(type, type_data, cb_priv, ingress);
+}
+
+/* Flow block callbacks registered by dsa_user_setup_tc_block() */
+static LIST_HEAD(dsa_user_block_cb_list);
+
+/* Bind or unbind the ingress/egress flow block callback of @dev.
+ *
+ * Returns -EOPNOTSUPP for binder types other than clsact ingress/egress and
+ * for unknown commands, -EBUSY when the callback is already bound for @dev,
+ * -ENOENT when there is nothing to unbind, or the allocation error from
+ * flow_block_cb_alloc().
+ */
+static int dsa_user_setup_tc_block(struct net_device *dev,
+				   struct flow_block_offload *f)
+{
+	struct flow_block_cb *block_cb;
+	flow_setup_cb_t *cb;
+
+	switch (f->binder_type) {
+	case FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS:
+		cb = dsa_user_setup_tc_block_cb_ig;
+		break;
+	case FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS:
+		cb = dsa_user_setup_tc_block_cb_eg;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	f->driver_block_list = &dsa_user_block_cb_list;
+
+	if (f->command == FLOW_BLOCK_BIND) {
+		if (flow_block_cb_is_busy(cb, dev, &dsa_user_block_cb_list))
+			return -EBUSY;
+
+		block_cb = flow_block_cb_alloc(cb, dev, dev, NULL);
+		if (IS_ERR(block_cb))
+			return PTR_ERR(block_cb);
+
+		flow_block_cb_add(block_cb, f);
+		list_add_tail(&block_cb->driver_list, &dsa_user_block_cb_list);
+
+		return 0;
+	}
+
+	if (f->command == FLOW_BLOCK_UNBIND) {
+		block_cb = flow_block_cb_lookup(f->block, cb, dev);
+		if (!block_cb)
+			return -ENOENT;
+
+		flow_block_cb_remove(block_cb, f);
+		list_del(&block_cb->driver_list);
+
+		return 0;
+	}
+
+	return -EOPNOTSUPP;
+}
+
+/* Pass a TC_SETUP_FT request for @port on to the ndo_setup_tc() of the port's
+ * conduit; returns -EOPNOTSUPP when the conduit has no such operation.
+ */
+static int dsa_user_setup_ft_block(struct dsa_switch *ds, int port,
+				   void *type_data)
+{
+	struct net_device *conduit = dsa_port_to_conduit(dsa_to_port(ds, port));
+
+	if (conduit->netdev_ops->ndo_setup_tc)
+		return conduit->netdev_ops->ndo_setup_tc(conduit, TC_SETUP_FT,
+							 type_data);
+
+	return -EOPNOTSUPP;
+}
+
+/* tc setup entry point of a user port. Block and flowtable requests are
+ * handled here; every other type goes to the driver's port_setup_tc() hook,
+ * or yields -EOPNOTSUPP when the driver has none.
+ */
+static int dsa_user_setup_tc(struct net_device *dev, enum tc_setup_type type,
+			     void *type_data)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (type == TC_SETUP_BLOCK)
+		return dsa_user_setup_tc_block(dev, type_data);
+
+	if (type == TC_SETUP_FT)
+		return dsa_user_setup_ft_block(ds, dp->index, type_data);
+
+	if (ds->ops->port_setup_tc)
+		return ds->ops->port_setup_tc(ds, dp->index, type, type_data);
+
+	return -EOPNOTSUPP;
+}
+
+/* Query RX flow classification through the driver's get_rxnfc() hook; returns
+ * -EOPNOTSUPP when the driver does not provide one.
+ */
+static int dsa_user_get_rxnfc(struct net_device *dev,
+			      struct ethtool_rxnfc *nfc, u32 *rule_locs)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->get_rxnfc)
+		return ds->ops->get_rxnfc(ds, dp->index, nfc, rule_locs);
+
+	return -EOPNOTSUPP;
+}
+
+/* Configure RX flow classification through the driver's set_rxnfc() hook;
+ * returns -EOPNOTSUPP when the driver does not provide one.
+ */
+static int dsa_user_set_rxnfc(struct net_device *dev,
+			      struct ethtool_rxnfc *nfc)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->set_rxnfc)
+		return ds->ops->set_rxnfc(ds, dp->index, nfc);
+
+	return -EOPNOTSUPP;
+}
+
+/* Report timestamping capabilities through the driver's get_ts_info() hook;
+ * returns -EOPNOTSUPP when the driver does not provide one.
+ */
+static int dsa_user_get_ts_info(struct net_device *dev,
+				struct kernel_ethtool_ts_info *ts)
+{
+	struct dsa_user_priv *priv = netdev_priv(dev);
+	struct dsa_switch *ds = priv->dp->ds;
+
+	if (ds->ops->get_ts_info)
+		return ds->ops->get_ts_info(ds, priv->dp->index, ts);
+
+	return -EOPNOTSUPP;
+}
+
+/* Add @vid to the RX VLAN filter of the user port.
+ *
+ * The VLAN is installed on the user port and then on the host (CPU) side.
+ * When the switch supports unicast or multicast filtering, the VID is also
+ * recorded in dp->user_vlans and add work is scheduled for every synced
+ * address of @dev in that VID. @proto is not used by this function.
+ *
+ * Returns 0 on success, the error of the failing VLAN add, or -ENOMEM when
+ * the tracking entry cannot be allocated (both VLANs are removed again then).
+ */
+static int dsa_user_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
+ u16 vid)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct switchdev_obj_port_vlan vlan = {
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .vid = vid,
+ /* This API only allows programming tagged, non-PVID VIDs */
+ .flags = 0,
+ };
+ struct netlink_ext_ack extack = {0};
+ struct dsa_switch *ds = dp->ds;
+ struct netdev_hw_addr *ha;
+ struct dsa_vlan *v;
+ int ret;
+
+ /* User port... */
+ ret = dsa_port_vlan_add(dp, &vlan, &extack);
+ if (ret) {
+ /* The extack is local to this function, so log its message here */
+ if (extack._msg)
+ netdev_err(dev, "%s\n", extack._msg);
+ return ret;
+ }
+
+ /* And CPU port... */
+ ret = dsa_port_host_vlan_add(dp, &vlan, &extack);
+ if (ret) {
+ /* NOTE(review): the user port VLAN added above is not removed on
+ * this path - confirm this is intended.
+ */
+ if (extack._msg)
+ netdev_err(dev, "CPU port %d: %s\n", dp->cpu_dp->index,
+ extack._msg);
+ return ret;
+ }
+
+ /* Without address filtering support there is nothing more to track */
+ if (!dsa_switch_supports_uc_filtering(ds) &&
+ !dsa_switch_supports_mc_filtering(ds))
+ return 0;
+
+ v = kzalloc(sizeof(*v), GFP_KERNEL);
+ if (!v) {
+ ret = -ENOMEM;
+ goto rollback;
+ }
+
+ /* The VLAN list update and both address walks run under the address
+ * lock of the device.
+ */
+ netif_addr_lock_bh(dev);
+
+ v->vid = vid;
+ list_add_tail(&v->list, &dp->user_vlans);
+
+ if (dsa_switch_supports_mc_filtering(ds)) {
+ netdev_for_each_synced_mc_addr(ha, dev) {
+ dsa_user_schedule_standalone_work(dev, DSA_MC_ADD,
+ ha->addr, vid);
+ }
+ }
+
+ if (dsa_switch_supports_uc_filtering(ds)) {
+ netdev_for_each_synced_uc_addr(ha, dev) {
+ dsa_user_schedule_standalone_work(dev, DSA_UC_ADD,
+ ha->addr, vid);
+ }
+ }
+
+ netif_addr_unlock_bh(dev);
+
+ /* Flush the DSA workqueue only after the address lock was dropped */
+ dsa_flush_workqueue();
+
+ return 0;
+
+rollback:
+ /* Undo both VLAN additions, in reverse order of installation */
+ dsa_port_host_vlan_del(dp, &vlan);
+ dsa_port_vlan_del(dp, &vlan);
+
+ return ret;
+}
+
+/* Remove @vid from the RX VLAN filter of the user port.
+ *
+ * The VLAN is deleted from the user port and then from the host (CPU) side.
+ * When the switch supports unicast or multicast filtering, the VID is also
+ * dropped from dp->user_vlans and delete work is scheduled for every synced
+ * address of @dev in that VID. @proto is not used by this function.
+ *
+ * Returns 0 on success, the error of the failing VLAN delete, or -ENOENT when
+ * the VID is not found in dp->user_vlans.
+ */
+static int dsa_user_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
+ u16 vid)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct switchdev_obj_port_vlan vlan = {
+ .vid = vid,
+ /* This API only allows programming tagged, non-PVID VIDs */
+ .flags = 0,
+ };
+ struct dsa_switch *ds = dp->ds;
+ struct netdev_hw_addr *ha;
+ struct dsa_vlan *v;
+ int err;
+
+ /* User port first, then the host side */
+ err = dsa_port_vlan_del(dp, &vlan);
+ if (err)
+ return err;
+
+ err = dsa_port_host_vlan_del(dp, &vlan);
+ if (err)
+ return err;
+
+ /* Without address filtering support there is nothing more to track */
+ if (!dsa_switch_supports_uc_filtering(ds) &&
+ !dsa_switch_supports_mc_filtering(ds))
+ return 0;
+
+ /* The VLAN list update and both address walks run under the address
+ * lock of the device.
+ */
+ netif_addr_lock_bh(dev);
+
+ v = dsa_vlan_find(&dp->user_vlans, &vlan);
+ if (!v) {
+ netif_addr_unlock_bh(dev);
+ return -ENOENT;
+ }
+
+ list_del(&v->list);
+ kfree(v);
+
+ if (dsa_switch_supports_mc_filtering(ds)) {
+ netdev_for_each_synced_mc_addr(ha, dev) {
+ dsa_user_schedule_standalone_work(dev, DSA_MC_DEL,
+ ha->addr, vid);
+ }
+ }
+
+ if (dsa_switch_supports_uc_filtering(ds)) {
+ netdev_for_each_synced_uc_addr(ha, dev) {
+ dsa_user_schedule_standalone_work(dev, DSA_UC_DEL,
+ ha->addr, vid);
+ }
+ }
+
+ netif_addr_unlock_bh(dev);
+
+ /* Flush the DSA workqueue only after the address lock was dropped */
+ dsa_flush_workqueue();
+
+ return 0;
+}
+
+/* vlan_for_each() callback: add @vid on the user netdev passed as @arg.
+ * The protocol is taken from @vdev when there is one, else 802.1Q.
+ */
+static int dsa_user_restore_vlan(struct net_device *vdev, int vid, void *arg)
+{
+	__be16 proto;
+
+	if (vdev)
+		proto = vlan_dev_vlan_proto(vdev);
+	else
+		proto = htons(ETH_P_8021Q);
+
+	return dsa_user_vlan_rx_add_vid(arg, proto, vid);
+}
+
+/* vlan_for_each() callback: remove @vid from the user netdev passed as
+ * @arg. The protocol is taken from @vdev when there is one, else 802.1Q.
+ */
+static int dsa_user_clear_vlan(struct net_device *vdev, int vid, void *arg)
+{
+	__be16 proto;
+
+	if (vdev)
+		proto = vlan_dev_vlan_proto(vdev);
+	else
+		proto = htons(ETH_P_8021Q);
+
+	return dsa_user_vlan_rx_kill_vid(arg, proto, vid);
+}
+
+/* Keep the VLAN RX filtering list in sync with the hardware only if VLAN
+ * filtering is enabled. The baseline is that only ports that offload a
+ * VLAN-aware bridge are VLAN-aware, and standalone ports are VLAN-unaware,
+ * but there are exceptions for quirky hardware.
+ *
+ * If ds->vlan_filtering_is_global = true, then standalone ports which share
+ * the same switch with other ports that offload a VLAN-aware bridge are also
+ * inevitably VLAN-aware.
+ *
+ * To summarize, a DSA switch port offloads:
+ *
+ * - If standalone (this includes software bridge, software LAG):
+ * - if ds->needs_standalone_vlan_filtering = true, OR if
+ * (ds->vlan_filtering_is_global = true AND there are bridges spanning
+ * this switch chip which have vlan_filtering=1)
+ * - the 8021q upper VLANs
+ * - else (standalone VLAN filtering is not needed, VLAN filtering is not
+ * global, or it is, but no port is under a VLAN-aware bridge):
+ * - no VLAN (any 8021q upper is a software VLAN)
+ *
+ * - If under a vlan_filtering=0 bridge which it offloads:
+ * - if ds->configure_vlan_while_not_filtering = true (default):
+ * - the bridge VLANs. These VLANs are committed to hardware but inactive.
+ * - else (deprecated):
+ * - no VLAN. The bridge VLANs are not restored when VLAN awareness is
+ * enabled, so this behavior is broken and discouraged.
+ *
+ * - If under a vlan_filtering=1 bridge which it offloads:
+ * - the bridge VLANs
+ * - the 8021q upper VLANs
+ */
+int dsa_user_manage_vlan_filtering(struct net_device *user,
+				   bool vlan_filtering)
+{
+	int rc;
+
+	if (!vlan_filtering) {
+		/* Remove the VLANs first; the feature flag is only cleared
+		 * once that succeeded.
+		 */
+		rc = vlan_for_each(user, dsa_user_clear_vlan, user);
+		if (rc)
+			return rc;
+
+		user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;
+		return 0;
+	}
+
+	user->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+
+	rc = vlan_for_each(user, dsa_user_restore_vlan, user);
+	if (!rc)
+		return 0;
+
+	/* Restoring failed part-way: remove the VLANs again and drop the
+	 * feature flag that was set above.
+	 */
+	vlan_for_each(user, dsa_user_clear_vlan, user);
+	user->features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;
+
+	return rc;
+}
+
+/* Entry of the temporary port list built by dsa_bridge_mtu_normalization():
+ * one user port whose MTU takes part in the normalization.
+ */
+struct dsa_hw_port {
+ /* Linkage in the hw_port_list */
+ struct list_head list;
+ /* User netdev of the port */
+ struct net_device *dev;
+ /* MTU of @dev when the entry was created; restored on rollback */
+ int old_mtu;
+};
+
+/* Set @mtu on every device of @hw_port_list, skipping devices that already
+ * have it. If dev_set_mtu() fails for one device, the devices visited
+ * before it are put back to their recorded old_mtu.
+ *
+ * Returns 0 on success or the error from the failing dev_set_mtu().
+ */
+static int dsa_hw_port_list_set_mtu(struct list_head *hw_port_list, int mtu)
+{
+ const struct dsa_hw_port *p;
+ int err;
+
+ list_for_each_entry(p, hw_port_list, list) {
+ if (p->dev->mtu == mtu)
+ continue;
+
+ err = dev_set_mtu(p->dev, mtu);
+ if (err)
+ goto rollback;
+ }
+
+ return 0;
+
+rollback:
+ /* @p still points at the entry that failed; walk backwards from the
+ * entry before it. A failed restore is only logged.
+ */
+ list_for_each_entry_continue_reverse(p, hw_port_list, list) {
+ if (p->dev->mtu == p->old_mtu)
+ continue;
+
+ if (dev_set_mtu(p->dev, p->old_mtu))
+ netdev_err(p->dev, "Failed to restore MTU\n");
+ }
+
+ return err;
+}
+
+/* Free every struct dsa_hw_port linked on @hw_port_list. Entries are not
+ * unlinked and the list head is not reinitialized.
+ */
+static void dsa_hw_port_list_free(struct list_head *hw_port_list)
+{
+	struct dsa_hw_port *hw_port, *next;
+
+	list_for_each_entry_safe(hw_port, next, hw_port_list, list) {
+		kfree(hw_port);
+	}
+}
+
+/* Make the hardware datapath to/from the bridge that @dp belongs to limited
+ * to a common MTU.
+ *
+ * Does nothing unless @dp's switch has mtu_enforcement_ingress set and @dp
+ * is under a bridge. Otherwise collects, across all trees in dsa_tree_list,
+ * the user ports that are in the same bridge as @dp and whose switch also
+ * has mtu_enforcement_ingress set, then tries to give all of them @dp's
+ * MTU, falling back to the smallest MTU found among them. Errors are not
+ * reported to the caller.
+ */
+static void dsa_bridge_mtu_normalization(struct dsa_port *dp)
+{
+ struct list_head hw_port_list;
+ struct dsa_switch_tree *dst;
+ int min_mtu = ETH_MAX_MTU;
+ struct dsa_port *other_dp;
+ int err;
+
+ if (!dp->ds->mtu_enforcement_ingress)
+ return;
+
+ if (!dp->bridge)
+ return;
+
+ INIT_LIST_HEAD(&hw_port_list);
+
+ /* Populate the list of ports that are part of the same bridge
+ * as the newly added/modified port
+ */
+ list_for_each_entry(dst, &dsa_tree_list, list) {
+ list_for_each_entry(other_dp, &dst->ports, list) {
+ struct dsa_hw_port *hw_port;
+ struct net_device *user;
+
+ if (other_dp->type != DSA_PORT_TYPE_USER)
+ continue;
+
+ if (!dsa_port_bridge_same(dp, other_dp))
+ continue;
+
+ if (!other_dp->ds->mtu_enforcement_ingress)
+ continue;
+
+ user = other_dp->user;
+
+ if (min_mtu > user->mtu)
+ min_mtu = user->mtu;
+
+ /* On allocation failure give up without touching any MTU;
+ * the entries collected so far are freed at "out".
+ */
+ hw_port = kzalloc(sizeof(*hw_port), GFP_KERNEL);
+ if (!hw_port)
+ goto out;
+
+ hw_port->dev = user;
+ hw_port->old_mtu = user->mtu;
+
+ list_add(&hw_port->list, &hw_port_list);
+ }
+ }
+
+ /* Attempt to configure the entire hardware bridge to the newly added
+ * interface's MTU first, regardless of whether the intention of the
+ * user was to raise or lower it.
+ */
+ err = dsa_hw_port_list_set_mtu(&hw_port_list, dp->user->mtu);
+ if (!err)
+ goto out;
+
+ /* Clearly that didn't work out so well, so just set the minimum MTU on
+ * all hardware bridge ports now. If this fails too, then all ports will
+ * still have their old MTU rolled back anyway.
+ */
+ dsa_hw_port_list_set_mtu(&hw_port_list, min_mtu);
+
+out:
+ dsa_hw_port_list_free(&hw_port_list);
+}
+
+/* .ndo_change_mtu handler: set the MTU of user port @dev to @new_mtu.
+ *
+ * The conduit MTU is derived from the largest MTU among the tree's user
+ * ports (counting @new_mtu for this port) plus the tagging protocol
+ * overhead. If that differs from the conduit's current MTU, the conduit
+ * and the CPU port are updated before the port itself. On success the
+ * bridge MTU normalization is run for the port.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the switch driver has no
+ * port_change_mtu op, -ERANGE if the required conduit MTU exceeds the
+ * limit, or the error of the step that failed. Steps already applied are
+ * reverted on failure.
+ */
+int dsa_user_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct net_device *conduit = dsa_user_to_conduit(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_port *cpu_dp = dp->cpu_dp;
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_port *other_dp;
+ int largest_mtu = 0;
+ int new_conduit_mtu;
+ int old_conduit_mtu;
+ int mtu_limit;
+ int overhead;
+ int cpu_mtu;
+ int err;
+
+ if (!ds->ops->port_change_mtu)
+ return -EOPNOTSUPP;
+
+ dsa_tree_for_each_user_port(other_dp, ds->dst) {
+ int user_mtu;
+
+ /* During probe, this function will be called for each user
+ * device, while not all of them have been allocated. That's
+ * ok, it doesn't change what the maximum is, so ignore it.
+ */
+ if (!other_dp->user)
+ continue;
+
+ /* Pretend that we already applied the setting, which we
+ * actually haven't (still haven't done all integrity checks)
+ */
+ if (dp == other_dp)
+ user_mtu = new_mtu;
+ else
+ user_mtu = other_dp->user->mtu;
+
+ if (largest_mtu < user_mtu)
+ largest_mtu = user_mtu;
+ }
+
+ /* The conduit has to carry the largest user frame plus the tag */
+ overhead = dsa_tag_protocol_overhead(cpu_dp->tag_ops);
+ mtu_limit = min_t(int, conduit->max_mtu, dev->max_mtu + overhead);
+ old_conduit_mtu = conduit->mtu;
+ new_conduit_mtu = largest_mtu + overhead;
+ if (new_conduit_mtu > mtu_limit)
+ return -ERANGE;
+
+ /* If the conduit MTU isn't over limit, there's no need to check the CPU
+ * MTU, since that surely isn't either.
+ */
+ cpu_mtu = largest_mtu;
+
+ /* Start applying stuff */
+ if (new_conduit_mtu != old_conduit_mtu) {
+ err = dev_set_mtu(conduit, new_conduit_mtu);
+ if (err < 0)
+ goto out_conduit_failed;
+
+ /* We only need to propagate the MTU of the CPU port to
+ * upstream switches, so emit a notifier which updates them.
+ */
+ err = dsa_port_mtu_change(cpu_dp, cpu_mtu);
+ if (err)
+ goto out_cpu_failed;
+ }
+
+ err = ds->ops->port_change_mtu(ds, dp->index, new_mtu);
+ if (err)
+ goto out_port_failed;
+
+ WRITE_ONCE(dev->mtu, new_mtu);
+
+ dsa_bridge_mtu_normalization(dp);
+
+ return 0;
+
+ /* Unwind in reverse order; the labels fall through. The conduit and
+ * CPU port are only reverted when they were changed above.
+ */
+out_port_failed:
+ if (new_conduit_mtu != old_conduit_mtu)
+ dsa_port_mtu_change(cpu_dp, old_conduit_mtu - overhead);
+out_cpu_failed:
+ if (new_conduit_mtu != old_conduit_mtu)
+ dev_set_mtu(conduit, old_conduit_mtu);
+out_conduit_failed:
+ return err;
+}
+
+/* dcbnl .dcbnl_setapptrust handler: pass the @nsel selectors in @sel to the
+ * switch driver for this port, or return -EOPNOTSUPP when the driver has
+ * no port_set_apptrust op.
+ */
+static int __maybe_unused
+dsa_user_dcbnl_set_apptrust(struct net_device *dev, u8 *sel, int nsel)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->port_set_apptrust)
+		return ds->ops->port_set_apptrust(ds, dp->index, sel, nsel);
+
+	return -EOPNOTSUPP;
+}
+
+/* dcbnl .dcbnl_getapptrust handler: let the switch driver fill @sel and
+ * @nsel for this port, or return -EOPNOTSUPP when the driver has no
+ * port_get_apptrust op.
+ */
+static int __maybe_unused
+dsa_user_dcbnl_get_apptrust(struct net_device *dev, u8 *sel, int *nsel)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->port_get_apptrust)
+		return ds->ops->port_get_apptrust(ds, dp->index, sel, nsel);
+
+	return -EOPNOTSUPP;
+}
+
+/* Add @app to the DCB APP table of @dev and program the highest priority
+ * now present for that entry as the port's default priority. The APP entry
+ * is deleted again if the switch driver rejects the priority.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the driver has no
+ * port_set_default_prio op, or the error of the failing step.
+ */
+static int __maybe_unused
+dsa_user_dcbnl_set_default_prio(struct net_device *dev, struct dcb_app *app)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	unsigned long prio_mask;
+	int rc;
+
+	if (!ds->ops->port_set_default_prio)
+		return -EOPNOTSUPP;
+
+	rc = dcb_ieee_setapp(dev, app);
+	if (rc)
+		return rc;
+
+	prio_mask = dcb_ieee_getapp_mask(dev, app);
+
+	rc = ds->ops->port_set_default_prio(ds, dp->index, __fls(prio_mask));
+	if (rc)
+		dcb_ieee_delapp(dev, app);
+
+	return rc;
+}
+
+/* Update the DSCP prio entries on all user ports of the switch in case
+ * the switch supports global DSCP prio instead of per port DSCP prios.
+ *
+ * Applies dcb_ieee_delapp() (@del true) or dcb_ieee_setapp() (@del false)
+ * with @app to every user port of the switch that has a netdev, except
+ * @dev itself. If one port fails, the opposite operation is applied to the
+ * ports handled before it.
+ *
+ * Returns 0 on success or the error of the failing port.
+ */
+static int dsa_user_dcbnl_ieee_global_dscp_setdel(struct net_device *dev,
+ struct dcb_app *app, bool del)
+{
+ int (*setdel)(struct net_device *dev, struct dcb_app *app);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ struct dsa_port *other_dp;
+ int err, restore_err;
+
+ if (del)
+ setdel = dcb_ieee_delapp;
+ else
+ setdel = dcb_ieee_setapp;
+
+ dsa_switch_for_each_user_port(other_dp, ds) {
+ struct net_device *user = other_dp->user;
+
+ if (!user || user == dev)
+ continue;
+
+ err = setdel(user, app);
+ if (err)
+ goto err_try_to_restore;
+ }
+
+ return 0;
+
+err_try_to_restore:
+
+ /* Revert logic to restore previous state of app entries */
+ if (!del)
+ setdel = dcb_ieee_delapp;
+ else
+ setdel = dcb_ieee_setapp;
+
+ /* @other_dp is still the port that failed; continue backwards from
+ * the port before it. A failed restore is only logged.
+ */
+ dsa_switch_for_each_user_port_continue_reverse(other_dp, ds) {
+ struct net_device *user = other_dp->user;
+
+ if (!user || user == dev)
+ continue;
+
+ restore_err = setdel(user, app);
+ if (restore_err)
+ netdev_err(user, "Failed to restore DSCP prio entry configuration\n");
+ }
+
+ return err;
+}
+
+/* Add the DSCP APP entry @app to @dev and program the resulting priority
+ * for that DSCP value into the switch. When the switch's DSCP mapping is
+ * global, the entry is also added on the switch's other user ports.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the driver has no port_add_dscp_prio
+ * op, -EINVAL for a DSCP value of 64 or more, or the error of the failing
+ * step, in which case the steps already done are undone.
+ */
+static int __maybe_unused
+dsa_user_dcbnl_add_dscp_prio(struct net_device *dev, struct dcb_app *app)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	unsigned long prio_mask, prio;
+	u8 dscp = app->protocol;
+	int port = dp->index;
+	int rc;
+
+	if (!ds->ops->port_add_dscp_prio)
+		return -EOPNOTSUPP;
+
+	if (dscp >= 64) {
+		netdev_err(dev, "DSCP APP entry with protocol value %u is invalid\n",
+			   dscp);
+		return -EINVAL;
+	}
+
+	rc = dcb_ieee_setapp(dev, app);
+	if (rc)
+		return rc;
+
+	/* Use the highest priority present in the table for this entry */
+	prio_mask = dcb_ieee_getapp_mask(dev, app);
+	prio = __fls(prio_mask);
+
+	rc = ds->ops->port_add_dscp_prio(ds, port, dscp, prio);
+	if (rc)
+		goto err_delapp;
+
+	if (!ds->dscp_prio_mapping_is_global)
+		return 0;
+
+	rc = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, false);
+	if (rc)
+		goto err_del_hw;
+
+	return 0;
+
+err_del_hw:
+	if (ds->ops->port_del_dscp_prio)
+		ds->ops->port_del_dscp_prio(ds, port, dscp, prio);
+err_delapp:
+	dcb_ieee_delapp(dev, app);
+	return rc;
+}
+
+/* dcbnl .ieee_setapp handler: dispatch on the APP selector. DSCP entries
+ * and the Ethertype entry with protocol 0 are handled; everything else
+ * returns -EOPNOTSUPP.
+ */
+static int __maybe_unused dsa_user_dcbnl_ieee_setapp(struct net_device *dev,
+						     struct dcb_app *app)
+{
+	if (app->selector == IEEE_8021QAZ_APP_SEL_DSCP)
+		return dsa_user_dcbnl_add_dscp_prio(dev, app);
+
+	if (app->selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE &&
+	    app->protocol == 0)
+		return dsa_user_dcbnl_set_default_prio(dev, app);
+
+	return -EOPNOTSUPP;
+}
+
+/* Delete @app from the DCB APP table of @dev and program the highest
+ * priority that remains for that entry (0 when none remains) as the port's
+ * default priority. The APP entry is re-added if the switch driver rejects
+ * the priority.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the driver has no
+ * port_set_default_prio op, or the error of the failing step.
+ */
+static int __maybe_unused
+dsa_user_dcbnl_del_default_prio(struct net_device *dev, struct dcb_app *app)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	unsigned long prio_mask, prio;
+	int rc;
+
+	if (!ds->ops->port_set_default_prio)
+		return -EOPNOTSUPP;
+
+	rc = dcb_ieee_delapp(dev, app);
+	if (rc)
+		return rc;
+
+	prio_mask = dcb_ieee_getapp_mask(dev, app);
+	if (prio_mask)
+		prio = __fls(prio_mask);
+	else
+		prio = 0;
+
+	rc = ds->ops->port_set_default_prio(ds, dp->index, prio);
+	if (rc)
+		dcb_ieee_setapp(dev, app);
+
+	return rc;
+}
+
+/* Delete the DSCP APP entry @app from @dev and remove the corresponding
+ * DSCP priority from the switch. When the switch's DSCP mapping is global,
+ * the entry is also deleted on the switch's other user ports.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the driver has no port_del_dscp_prio
+ * op, or the error of the failing step, in which case the steps already
+ * done are undone.
+ */
+static int __maybe_unused
+dsa_user_dcbnl_del_dscp_prio(struct net_device *dev, struct dcb_app *app)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	u8 dscp = app->protocol;
+	int port = dp->index;
+	int rc;
+
+	if (!ds->ops->port_del_dscp_prio)
+		return -EOPNOTSUPP;
+
+	rc = dcb_ieee_delapp(dev, app);
+	if (rc)
+		return rc;
+
+	rc = ds->ops->port_del_dscp_prio(ds, port, dscp, app->priority);
+	if (rc)
+		goto err_restore_app;
+
+	if (!ds->dscp_prio_mapping_is_global)
+		return 0;
+
+	rc = dsa_user_dcbnl_ieee_global_dscp_setdel(dev, app, true);
+	if (rc)
+		goto err_restore_hw;
+
+	return 0;
+
+err_restore_hw:
+	if (ds->ops->port_add_dscp_prio)
+		ds->ops->port_add_dscp_prio(ds, port, dscp, app->priority);
+err_restore_app:
+	dcb_ieee_setapp(dev, app);
+	return rc;
+}
+
+/* dcbnl .ieee_delapp handler: dispatch on the APP selector. DSCP entries
+ * and the Ethertype entry with protocol 0 are handled; everything else
+ * returns -EOPNOTSUPP.
+ */
+static int __maybe_unused dsa_user_dcbnl_ieee_delapp(struct net_device *dev,
+						     struct dcb_app *app)
+{
+	if (app->selector == IEEE_8021QAZ_APP_SEL_DSCP)
+		return dsa_user_dcbnl_del_dscp_prio(dev, app);
+
+	if (app->selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE &&
+	    app->protocol == 0)
+		return dsa_user_dcbnl_del_default_prio(dev, app);
+
+	return -EOPNOTSUPP;
+}
+
+/* Seed the DCB APP table of @dev from the switch: the port's default
+ * priority (if the driver can report it) and the priority of each of the
+ * 64 DSCP values (if the driver can report those). DSCP values for which
+ * the driver answers -EOPNOTSUPP are skipped.
+ *
+ * Returns 0 on success, or the first negative value reported by the driver
+ * or by dcb_ieee_setapp().
+ */
+static int dsa_user_dcbnl_init(struct net_device *dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+	int port = dp->index;
+	struct dcb_app app;
+	int dscp, prio, rc;
+
+	if (ds->ops->port_get_default_prio) {
+		prio = ds->ops->port_get_default_prio(ds, port);
+		if (prio < 0)
+			return prio;
+
+		app = (struct dcb_app) {
+			.selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE,
+			.protocol = 0,
+			.priority = prio,
+		};
+
+		rc = dcb_ieee_setapp(dev, &app);
+		if (rc)
+			return rc;
+	}
+
+	if (!ds->ops->port_get_dscp_prio)
+		return 0;
+
+	for (dscp = 0; dscp < 64; dscp++) {
+		prio = ds->ops->port_get_dscp_prio(ds, port, dscp);
+		if (prio == -EOPNOTSUPP)
+			continue;
+		if (prio < 0)
+			return prio;
+
+		app = (struct dcb_app) {
+			.selector = IEEE_8021QAZ_APP_SEL_DSCP,
+			.protocol = dscp,
+			.priority = prio,
+		};
+
+		rc = dcb_ieee_setapp(dev, &app);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+/* ethtool operations of a DSA user netdev; installed as ->ethtool_ops by
+ * dsa_user_create(). All callbacks are dsa_user_*() handlers of this file
+ * except .get_link, which uses the generic ethtool_op_get_link().
+ */
+static const struct ethtool_ops dsa_user_ethtool_ops = {
+ .get_drvinfo = dsa_user_get_drvinfo,
+ .get_regs_len = dsa_user_get_regs_len,
+ .get_regs = dsa_user_get_regs,
+ .nway_reset = dsa_user_nway_reset,
+ .get_link = ethtool_op_get_link,
+ .get_eeprom_len = dsa_user_get_eeprom_len,
+ .get_eeprom = dsa_user_get_eeprom,
+ .set_eeprom = dsa_user_set_eeprom,
+ .get_strings = dsa_user_get_strings,
+ .get_ethtool_stats = dsa_user_get_ethtool_stats,
+ .get_sset_count = dsa_user_get_sset_count,
+ .get_eth_phy_stats = dsa_user_get_eth_phy_stats,
+ .get_eth_mac_stats = dsa_user_get_eth_mac_stats,
+ .get_eth_ctrl_stats = dsa_user_get_eth_ctrl_stats,
+ .get_rmon_stats = dsa_user_get_rmon_stats,
+ .get_ts_stats = dsa_user_get_ts_stats,
+ .set_wol = dsa_user_set_wol,
+ .get_wol = dsa_user_get_wol,
+ .set_eee = dsa_user_set_eee,
+ .get_eee = dsa_user_get_eee,
+ .get_link_ksettings = dsa_user_get_link_ksettings,
+ .set_link_ksettings = dsa_user_set_link_ksettings,
+ .get_pause_stats = dsa_user_get_pause_stats,
+ .get_pauseparam = dsa_user_get_pauseparam,
+ .set_pauseparam = dsa_user_set_pauseparam,
+ .get_rxnfc = dsa_user_get_rxnfc,
+ .set_rxnfc = dsa_user_set_rxnfc,
+ .get_ts_info = dsa_user_get_ts_info,
+ .self_test = dsa_user_net_selftest,
+ .get_mm = dsa_user_get_mm,
+ .set_mm = dsa_user_set_mm,
+ .get_mm_stats = dsa_user_get_mm_stats,
+};
+
+/* DCB netlink operations of a DSA user netdev. dsa_user_create() installs
+ * them only when CONFIG_DCB is enabled, hence __maybe_unused.
+ */
+static const struct dcbnl_rtnl_ops __maybe_unused dsa_user_dcbnl_ops = {
+ .ieee_setapp = dsa_user_dcbnl_ieee_setapp,
+ .ieee_delapp = dsa_user_dcbnl_ieee_delapp,
+ .dcbnl_setapptrust = dsa_user_dcbnl_set_apptrust,
+ .dcbnl_getapptrust = dsa_user_dcbnl_get_apptrust,
+};
+
+/* .ndo_get_stats64 handler: take the counters from the switch driver when
+ * it has a get_stats64 op, otherwise from dev_get_tstats64().
+ */
+static void dsa_user_get_stats64(struct net_device *dev,
+				 struct rtnl_link_stats64 *s)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (!ds->ops->get_stats64) {
+		dev_get_tstats64(dev, s);
+		return;
+	}
+
+	ds->ops->get_stats64(ds, dp->index, s);
+}
+
+/* .ndo_fill_forward_path handler: describe the hop through this user port
+ * as a DEV_PATH_DSA entry (tagging protocol and port index) and move
+ * @ctx->dev on to the port's conduit. Always returns 0.
+ */
+static int dsa_user_fill_forward_path(struct net_device_path_ctx *ctx,
+				      struct net_device_path *path)
+{
+	struct dsa_port *dp = dsa_user_to_port(ctx->dev);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
+
+	path->type = DEV_PATH_DSA;
+	path->dev = ctx->dev;
+	path->dsa.port = dp->index;
+	path->dsa.proto = dp->cpu_dp->tag_ops->proto;
+
+	ctx->dev = conduit;
+
+	return 0;
+}
+
+/* .ndo_hwtstamp_get handler: forward to the switch driver for this port,
+ * or return -EOPNOTSUPP when the driver has no port_hwtstamp_get op.
+ */
+static int dsa_user_hwtstamp_get(struct net_device *dev,
+				 struct kernel_hwtstamp_config *cfg)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->port_hwtstamp_get)
+		return ds->ops->port_hwtstamp_get(ds, dp->index, cfg);
+
+	return -EOPNOTSUPP;
+}
+
+/* .ndo_hwtstamp_set handler: forward to the switch driver for this port,
+ * or return -EOPNOTSUPP when the driver has no port_hwtstamp_set op.
+ */
+static int dsa_user_hwtstamp_set(struct net_device *dev,
+				 struct kernel_hwtstamp_config *cfg,
+				 struct netlink_ext_ack *extack)
+{
+	struct dsa_port *dp = dsa_user_to_port(dev);
+	struct dsa_switch *ds = dp->ds;
+
+	if (ds->ops->port_hwtstamp_set)
+		return ds->ops->port_hwtstamp_set(ds, dp->index, cfg, extack);
+
+	return -EOPNOTSUPP;
+}
+
+/* net_device operations of a DSA user netdev; installed as ->netdev_ops by
+ * dsa_user_create(). dsa_user_dev_check() identifies DSA user netdevs by
+ * comparing ->netdev_ops against the address of this table.
+ */
+static const struct net_device_ops dsa_user_netdev_ops = {
+ .ndo_open = dsa_user_open,
+ .ndo_stop = dsa_user_close,
+ .ndo_start_xmit = dsa_user_xmit,
+ .ndo_change_rx_flags = dsa_user_change_rx_flags,
+ .ndo_set_rx_mode = dsa_user_set_rx_mode,
+ .ndo_set_mac_address = dsa_user_set_mac_address,
+ .ndo_fdb_dump = dsa_user_fdb_dump,
+ .ndo_eth_ioctl = dsa_user_ioctl,
+ .ndo_get_iflink = dsa_user_get_iflink,
+ /* netpoll callbacks exist only with CONFIG_NET_POLL_CONTROLLER */
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ .ndo_netpoll_setup = dsa_user_netpoll_setup,
+ .ndo_netpoll_cleanup = dsa_user_netpoll_cleanup,
+ .ndo_poll_controller = dsa_user_poll_controller,
+#endif
+ .ndo_setup_tc = dsa_user_setup_tc,
+ .ndo_get_stats64 = dsa_user_get_stats64,
+ .ndo_vlan_rx_add_vid = dsa_user_vlan_rx_add_vid,
+ .ndo_vlan_rx_kill_vid = dsa_user_vlan_rx_kill_vid,
+ .ndo_change_mtu = dsa_user_change_mtu,
+ .ndo_fill_forward_path = dsa_user_fill_forward_path,
+ .ndo_hwtstamp_get = dsa_user_hwtstamp_get,
+ .ndo_hwtstamp_set = dsa_user_hwtstamp_set,
+};
+
+/* Device type attached to every user netdev through SET_NETDEV_DEVTYPE()
+ * in dsa_user_create().
+ */
+static const struct device_type dsa_type = {
+ .name = "dsa",
+};
+
+/* Report a MAC link change (@up) of @port of switch @ds to phylink. Ports
+ * without a phylink instance are ignored.
+ */
+void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up)
+{
+	const struct dsa_port *dp = dsa_to_port(ds, port);
+
+	if (!dp->pl)
+		return;
+
+	phylink_mac_change(dp->pl, up);
+}
+EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change);
+
+/* phylink get_fixed_state callback: let the switch driver fill @state for
+ * the port that owns @config.
+ */
+static void dsa_user_phylink_fixed_state(struct phylink_config *config,
+					 struct phylink_link_state *state)
+{
+	struct dsa_port *dp = dsa_phylink_to_port(config);
+
+	/* The op is known to exist: dsa_user_phy_setup() registers this
+	 * callback only when the driver provides phylink_fixed_state.
+	 */
+	dp->ds->ops->phylink_fixed_state(dp->ds, dp->index, state);
+}
+
+/* user device setup *******************************************************/
+/* Look up the PHY at @addr on the switch's user MII bus, OR @flags into its
+ * dev_flags and connect it to the port's phylink instance.
+ *
+ * Returns -ENODEV when there is no PHY at @addr, else the result of
+ * phylink_connect_phy().
+ */
+static int dsa_user_phy_connect(struct net_device *user_dev, int addr,
+				u32 flags)
+{
+	struct dsa_port *dp = dsa_user_to_port(user_dev);
+
+	user_dev->phydev = mdiobus_get_phy(dp->ds->user_mii_bus, addr);
+	if (user_dev->phydev) {
+		user_dev->phydev->dev_flags |= flags;
+
+		return phylink_connect_phy(dp->pl, user_dev->phydev);
+	}
+
+	netdev_err(user_dev, "no phy at %d\n", addr);
+
+	return -ENODEV;
+}
+
+/* Create the phylink instance for the port behind @user_dev and connect a
+ * PHY to it: first through the port's device tree node, then, if that
+ * yields -ENODEV and the switch has a user MII bus, through that bus at the
+ * address equal to the port index.
+ *
+ * Returns 0 on success or a negative errno; the phylink instance is
+ * destroyed again when no PHY could be connected.
+ */
+static int dsa_user_phy_setup(struct net_device *user_dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(user_dev);
+	struct device_node *node = dp->dn;
+	struct dsa_switch *ds = dp->ds;
+	u32 flags = 0;
+	int rc;
+
+	dp->pl_config.type = PHYLINK_NETDEV;
+	dp->pl_config.dev = &user_dev->dev;
+
+	/* The get_fixed_state callback takes precedence over polling the
+	 * link GPIO in PHYLINK (see phylink_get_fixed_state). Only set
+	 * this if the switch provides such a callback.
+	 */
+	if (ds->ops->phylink_fixed_state) {
+		dp->pl_config.poll_fixed_state = true;
+		dp->pl_config.get_fixed_state = dsa_user_phylink_fixed_state;
+	}
+
+	rc = dsa_port_phylink_create(dp);
+	if (rc)
+		return rc;
+
+	if (ds->ops->get_phy_flags)
+		flags = ds->ops->get_phy_flags(ds, dp->index);
+
+	rc = phylink_of_phy_connect(dp->pl, node, flags);
+	if (rc == -ENODEV && ds->user_mii_bus) {
+		/* We could not connect to a designated PHY or SFP, so try to
+		 * use the switch internal MDIO bus instead
+		 */
+		rc = dsa_user_phy_connect(user_dev, dp->index, flags);
+	}
+	if (!rc)
+		return 0;
+
+	netdev_err(user_dev, "failed to connect to PHY: %pe\n",
+		   ERR_PTR(rc));
+	dsa_port_phylink_destroy(dp);
+
+	return rc;
+}
+
+/* Derive the tagger dependent settings of @user from its CPU port's tag_ops
+ * and from the conduit: needed head/tailroom, the xmit hook, and the
+ * feature flags.
+ */
+void dsa_user_setup_tagger(struct net_device *user)
+{
+	struct dsa_port *dp = dsa_user_to_port(user);
+	struct net_device *conduit = dsa_port_to_conduit(dp);
+	struct dsa_user_priv *priv = netdev_priv(user);
+	const struct dsa_port *cpu_dp = dp->cpu_dp;
+	const struct dsa_switch *ds = dp->ds;
+
+	priv->xmit = cpu_dp->tag_ops->xmit;
+
+	/* Room for the tag itself... */
+	user->needed_headroom = cpu_dp->tag_ops->needed_headroom;
+	user->needed_tailroom = cpu_dp->tag_ops->needed_tailroom;
+	/* ...plus what the conduit needs. Try to save one extra realloc
+	 * later in the TX path (in the conduit) by also inheriting the
+	 * conduit's needed headroom and tailroom. The 8021q driver also
+	 * does this.
+	 */
+	user->needed_headroom += conduit->needed_headroom;
+	user->needed_tailroom += conduit->needed_tailroom;
+
+	user->hw_features |= NETIF_F_HW_TC;
+	user->features = conduit->vlan_features | NETIF_F_HW_TC;
+	if (user->needed_tailroom)
+		user->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST);
+	if (ds->needs_standalone_vlan_filtering)
+		user->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
+
+	user->lltx = true;
+}
+
+/* Suspend a user netdev: if it is running, detach it and stop its phylink
+ * instance under rtnl. Always returns 0.
+ */
+int dsa_user_suspend(struct net_device *user_dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(user_dev);
+
+	if (netif_running(user_dev)) {
+		netif_device_detach(user_dev);
+
+		rtnl_lock();
+		phylink_stop(dp->pl);
+		rtnl_unlock();
+	}
+
+	return 0;
+}
+
+/* Resume a user netdev: if it is running, re-attach it and start its
+ * phylink instance under rtnl. Always returns 0.
+ */
+int dsa_user_resume(struct net_device *user_dev)
+{
+	struct dsa_port *dp = dsa_user_to_port(user_dev);
+
+	if (netif_running(user_dev)) {
+		netif_device_attach(user_dev);
+
+		rtnl_lock();
+		phylink_start(dp->pl);
+		rtnl_unlock();
+	}
+
+	return 0;
+}
+
+/* Allocate, set up and register the user net_device for @port.
+ *
+ * The netdev is named after port->name when set, else "eth%d". After the
+ * static configuration, the PHY is set up, the MTU is set to ETH_DATA_LEN
+ * (a failure there is only warned about), the netdev is registered, the
+ * DCB table is seeded when CONFIG_DCB is enabled, and the netdev is linked
+ * as an upper of the conduit.
+ *
+ * Returns 0 on success or a negative errno, in which case everything set
+ * up so far is torn down and port->user is reset to NULL.
+ */
+int dsa_user_create(struct dsa_port *port)
+{
+ struct net_device *conduit = dsa_port_to_conduit(port);
+ struct dsa_switch *ds = port->ds;
+ struct net_device *user_dev;
+ struct dsa_user_priv *p;
+ const char *name;
+ int assign_type;
+ int ret;
+
+ /* Default to a single TX queue when the driver did not ask for more */
+ if (!ds->num_tx_queues)
+ ds->num_tx_queues = 1;
+
+ if (port->name) {
+ name = port->name;
+ assign_type = NET_NAME_PREDICTABLE;
+ } else {
+ name = "eth%d";
+ assign_type = NET_NAME_ENUM;
+ }
+
+ user_dev = alloc_netdev_mqs(sizeof(struct dsa_user_priv), name,
+ assign_type, ether_setup,
+ ds->num_tx_queues, 1);
+ if (user_dev == NULL)
+ return -ENOMEM;
+
+ user_dev->rtnl_link_ops = &dsa_link_ops;
+ user_dev->ethtool_ops = &dsa_user_ethtool_ops;
+#if IS_ENABLED(CONFIG_DCB)
+ user_dev->dcbnl_ops = &dsa_user_dcbnl_ops;
+#endif
+ /* Use the port's own MAC address if it has one, else the conduit's */
+ if (!is_zero_ether_addr(port->mac))
+ eth_hw_addr_set(user_dev, port->mac);
+ else
+ eth_hw_addr_inherit(user_dev, conduit);
+ user_dev->priv_flags |= IFF_NO_QUEUE;
+ if (dsa_switch_supports_uc_filtering(ds))
+ user_dev->priv_flags |= IFF_UNICAST_FLT;
+ user_dev->netdev_ops = &dsa_user_netdev_ops;
+ if (ds->ops->port_max_mtu)
+ user_dev->max_mtu = ds->ops->port_max_mtu(ds, port->index);
+ SET_NETDEV_DEVTYPE(user_dev, &dsa_type);
+
+ SET_NETDEV_DEV(user_dev, port->ds->dev);
+ SET_NETDEV_DEVLINK_PORT(user_dev, &port->devlink_port);
+ user_dev->dev.of_node = port->dn;
+ user_dev->vlan_features = conduit->vlan_features;
+
+ p = netdev_priv(user_dev);
+ user_dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
+
+ ret = gro_cells_init(&p->gcells, user_dev);
+ if (ret)
+ goto out_free;
+
+ /* Cross-link port and netdev before the helpers below, which go
+ * from the netdev to the port, are used.
+ */
+ p->dp = port;
+ INIT_LIST_HEAD(&p->mall_tc_list);
+ port->user = user_dev;
+ dsa_user_setup_tagger(user_dev);
+
+ netif_carrier_off(user_dev);
+
+ ret = dsa_user_phy_setup(user_dev);
+ if (ret) {
+ netdev_err(user_dev,
+ "error %d setting up PHY for tree %d, switch %d, port %d\n",
+ ret, ds->dst->index, ds->index, port->index);
+ goto out_gcells;
+ }
+
+ rtnl_lock();
+
+ /* Not fatal: the netdev is registered even if the MTU was refused */
+ ret = dsa_user_change_mtu(user_dev, ETH_DATA_LEN);
+ if (ret && ret != -EOPNOTSUPP)
+ dev_warn(ds->dev, "nonfatal error %d setting MTU to %d on port %d\n",
+ ret, ETH_DATA_LEN, port->index);
+
+ ret = register_netdevice(user_dev);
+ if (ret) {
+ netdev_err(conduit, "error %d registering interface %s\n",
+ ret, user_dev->name);
+ rtnl_unlock();
+ goto out_phy;
+ }
+
+ if (IS_ENABLED(CONFIG_DCB)) {
+ ret = dsa_user_dcbnl_init(user_dev);
+ if (ret) {
+ netdev_err(user_dev,
+ "failed to initialize DCB: %pe\n",
+ ERR_PTR(ret));
+ rtnl_unlock();
+ goto out_unregister;
+ }
+ }
+
+ ret = netdev_upper_dev_link(conduit, user_dev, NULL);
+
+ rtnl_unlock();
+
+ if (ret)
+ goto out_unregister;
+
+ return 0;
+
+ /* Error unwinding, in reverse order of setup; the labels fall through.
+ * Every jump here happens with rtnl released.
+ */
+out_unregister:
+ unregister_netdev(user_dev);
+out_phy:
+ rtnl_lock();
+ phylink_disconnect_phy(p->dp->pl);
+ rtnl_unlock();
+ dsa_port_phylink_destroy(p->dp);
+out_gcells:
+ gro_cells_destroy(&p->gcells);
+out_free:
+ free_netdev(user_dev);
+ port->user = NULL;
+ return ret;
+}
+
+/* Tear down and free the user netdev @user_dev: unlink it from its conduit,
+ * unregister it and disconnect its PHY under rtnl, then destroy the phylink
+ * instance and the GRO cells and free the netdev.
+ */
+void dsa_user_destroy(struct net_device *user_dev)
+{
+ struct net_device *conduit = dsa_user_to_conduit(user_dev);
+ struct dsa_port *dp = dsa_user_to_port(user_dev);
+ struct dsa_user_priv *p = netdev_priv(user_dev);
+
+ netif_carrier_off(user_dev);
+ rtnl_lock();
+ netdev_upper_dev_unlink(conduit, user_dev);
+ unregister_netdevice(user_dev);
+ phylink_disconnect_phy(dp->pl);
+ rtnl_unlock();
+
+ dsa_port_phylink_destroy(dp);
+ gro_cells_destroy(&p->gcells);
+ free_netdev(user_dev);
+}
+
+/* Move user port @dev from its current conduit to @conduit.
+ *
+ * Returns 0 when @conduit already is the current conduit or the change
+ * succeeded. Returns -EOPNOTSUPP (with a message in @extack) when the
+ * driver has no port_change_conduit op, when @conduit is not a DSA conduit,
+ * or when @conduit has an upper that is neither a DSA user port nor a
+ * bridge. Otherwise returns the error of the failing step, after the upper
+ * link to the old conduit has been put back. An MTU update failure after
+ * the change is only warned about.
+ */
+int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *old_conduit = dsa_user_to_conduit(dev);
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+ struct net_device *upper;
+ struct list_head *iter;
+ int err;
+
+ if (conduit == old_conduit)
+ return 0;
+
+ if (!ds->ops->port_change_conduit) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Driver does not support changing DSA conduit");
+ return -EOPNOTSUPP;
+ }
+
+ if (!netdev_uses_dsa(conduit)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Interface not eligible as DSA conduit");
+ return -EOPNOTSUPP;
+ }
+
+ /* Only DSA user ports and bridges are accepted as uppers of the
+ * new conduit.
+ */
+ netdev_for_each_upper_dev_rcu(conduit, upper, iter) {
+ if (dsa_user_dev_check(upper))
+ continue;
+ if (netif_is_bridge_master(upper))
+ continue;
+ NL_SET_ERR_MSG_MOD(extack, "Cannot join conduit with unknown uppers");
+ return -EOPNOTSUPP;
+ }
+
+ /* Since we allow live-changing the DSA conduit, plus we auto-open the
+ * DSA conduit when the user port opens => we need to ensure that the
+ * new DSA conduit is open too.
+ */
+ if (dev->flags & IFF_UP) {
+ err = dev_open(conduit, extack);
+ if (err)
+ return err;
+ }
+
+ /* Move the upper link from the old conduit to the new one */
+ netdev_upper_dev_unlink(old_conduit, dev);
+
+ err = netdev_upper_dev_link(conduit, dev, extack);
+ if (err)
+ goto out_revert_old_conduit_unlink;
+
+ err = dsa_port_change_conduit(dp, conduit, extack);
+ if (err)
+ goto out_revert_conduit_link;
+
+ /* Update the MTU of the new CPU port through cross-chip notifiers */
+ err = dsa_user_change_mtu(dev, dev->mtu);
+ if (err && err != -EOPNOTSUPP) {
+ netdev_warn(dev,
+ "nonfatal error updating MTU with new conduit: %pe\n",
+ ERR_PTR(err));
+ }
+
+ return 0;
+
+ /* The labels fall through; the result of re-linking to the old
+ * conduit is not checked.
+ */
+out_revert_conduit_link:
+ netdev_upper_dev_unlink(conduit, dev);
+out_revert_old_conduit_unlink:
+ netdev_upper_dev_link(old_conduit, dev, NULL);
+ return err;
+}
+
+/* Tell whether @dev is a DSA user netdev, i.e. whether its netdev_ops is
+ * dsa_user_netdev_ops.
+ */
+bool dsa_user_dev_check(const struct net_device *dev)
+{
+	if (dev->netdev_ops != &dsa_user_netdev_ops)
+		return false;
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(dsa_user_dev_check);
+
+/* Handle a change of upper for user port @dev: join or leave the bridge,
+ * LAG or HSR device named in @info.
+ *
+ * When linking, -EOPNOTSUPP from the join is not treated as a failure: a
+ * weak "Offloading not supported" extack message is set and the error is
+ * cleared. After a successful bridge join the bridge MTU is normalized.
+ *
+ * Returns NOTIFY_DONE when @dev is not a DSA user port or the upper is of
+ * no handled kind, NOTIFY_OK after a leave, and notifier_from_errno() of
+ * the join result otherwise.
+ */
+static int dsa_user_changeupper(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct netlink_ext_ack *extack;
+ int err = NOTIFY_DONE;
+ struct dsa_port *dp;
+
+ if (!dsa_user_dev_check(dev))
+ return err;
+
+ dp = dsa_user_to_port(dev);
+ extack = netdev_notifier_info_to_extack(&info->info);
+
+ if (netif_is_bridge_master(info->upper_dev)) {
+ if (info->linking) {
+ err = dsa_port_bridge_join(dp, info->upper_dev, extack);
+ if (!err)
+ dsa_bridge_mtu_normalization(dp);
+ if (err == -EOPNOTSUPP) {
+ NL_SET_ERR_MSG_WEAK_MOD(extack,
+ "Offloading not supported");
+ err = 0;
+ }
+ err = notifier_from_errno(err);
+ } else {
+ dsa_port_bridge_leave(dp, info->upper_dev);
+ err = NOTIFY_OK;
+ }
+ } else if (netif_is_lag_master(info->upper_dev)) {
+ if (info->linking) {
+ err = dsa_port_lag_join(dp, info->upper_dev,
+ info->upper_info, extack);
+ if (err == -EOPNOTSUPP) {
+ NL_SET_ERR_MSG_WEAK_MOD(extack,
+ "Offloading not supported");
+ err = 0;
+ }
+ err = notifier_from_errno(err);
+ } else {
+ dsa_port_lag_leave(dp, info->upper_dev);
+ err = NOTIFY_OK;
+ }
+ } else if (is_hsr_master(info->upper_dev)) {
+ if (info->linking) {
+ err = dsa_port_hsr_join(dp, info->upper_dev, extack);
+ if (err == -EOPNOTSUPP) {
+ NL_SET_ERR_MSG_WEAK_MOD(extack,
+ "Offloading not supported");
+ err = 0;
+ }
+ err = notifier_from_errno(err);
+ } else {
+ dsa_port_hsr_leave(dp, info->upper_dev);
+ err = NOTIFY_OK;
+ }
+ }
+
+ return err;
+}
+
+/* Handle an upcoming change of upper for user port @dev: when the port is
+ * about to leave a bridge or a LAG, run the matching pre-leave hook.
+ * Always returns NOTIFY_DONE.
+ */
+static int dsa_user_prechangeupper(struct net_device *dev,
+				   struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *upper = info->upper_dev;
+	struct dsa_port *dp;
+
+	/* Only unlinking from a DSA user port needs preparation */
+	if (!dsa_user_dev_check(dev) || info->linking)
+		return NOTIFY_DONE;
+
+	dp = dsa_user_to_port(dev);
+
+	if (netif_is_bridge_master(upper))
+		dsa_port_pre_bridge_leave(dp, upper);
+	else if (netif_is_lag_master(upper))
+		dsa_port_pre_lag_leave(dp, upper);
+	/* dsa_port_pre_hsr_leave is not yet necessary since hsr devices
+	 * cannot be meaningfully placed under a bridge yet
+	 */
+
+	return NOTIFY_DONE;
+}
+
+/* Handle a change of upper for LAG device @dev by replaying it, through
+ * dsa_user_changeupper(), on every lower that is a DSA user port with an
+ * offloaded LAG. Stops at the first lower reporting an error.
+ *
+ * Returns NOTIFY_DONE when @dev is not a LAG or no lower qualified, else
+ * the result of the last dsa_user_changeupper() call.
+ */
+static int
+dsa_user_lag_changeupper(struct net_device *dev,
+			 struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *lower;
+	struct list_head *iter;
+	int rc = NOTIFY_DONE;
+
+	if (!netif_is_lag_master(dev))
+		return rc;
+
+	netdev_for_each_lower_dev(dev, lower, iter) {
+		/* Skip foreign lowers and ports whose LAG is software-only */
+		if (!dsa_user_dev_check(lower) ||
+		    !dsa_user_to_port(lower)->lag)
+			continue;
+
+		rc = dsa_user_changeupper(lower, info);
+		if (notifier_to_errno(rc))
+			break;
+	}
+
+	return rc;
+}
+
+/* Handle an upcoming change of upper for LAG device @dev by replaying it,
+ * through dsa_user_prechangeupper(), on every lower that is a DSA user port
+ * with an offloaded LAG. Stops at the first lower reporting an error.
+ *
+ * Returns NOTIFY_DONE when @dev is not a LAG or no lower qualified, else
+ * the result of the last dsa_user_prechangeupper() call.
+ */
+static int
+dsa_user_lag_prechangeupper(struct net_device *dev,
+			    struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *lower;
+	struct list_head *iter;
+	int rc = NOTIFY_DONE;
+
+	if (!netif_is_lag_master(dev))
+		return rc;
+
+	netdev_for_each_lower_dev(dev, lower, iter) {
+		/* Skip foreign lowers and ports whose LAG is software-only */
+		if (!dsa_user_dev_check(lower) ||
+		    !dsa_user_to_port(lower)->lag)
+			continue;
+
+		rc = dsa_user_prechangeupper(lower, info);
+		if (notifier_to_errno(rc))
+			break;
+	}
+
+	return rc;
+}
+
+/* Veto putting a VLAN device into a bridge when its real device is a DSA
+ * user port that is under a VLAN-aware bridge.
+ *
+ * Returns notifier_from_errno(-EINVAL), with an extack message, in that
+ * case and NOTIFY_DONE in every other case.
+ */
+static int
+dsa_prevent_bridging_8021q_upper(struct net_device *dev,
+				 struct netdev_notifier_changeupper_info *info)
+{
+	struct netlink_ext_ack *ext_ack;
+	struct net_device *real_dev, *bridge;
+
+	ext_ack = netdev_notifier_info_to_extack(&info->info);
+
+	if (!is_vlan_dev(dev))
+		return NOTIFY_DONE;
+
+	real_dev = vlan_dev_real_dev(dev);
+	if (!dsa_user_dev_check(real_dev))
+		return NOTIFY_DONE;
+
+	bridge = dsa_port_bridge_dev_get(dsa_user_to_port(real_dev));
+	if (!bridge)
+		return NOTIFY_DONE;
+
+	/* Only joining a bridge is denied, and only while the port's own
+	 * bridge is VLAN-aware.
+	 */
+	if (!br_vlan_enabled(bridge) ||
+	    !netif_is_bridge_master(info->upper_dev) || !info->linking)
+		return NOTIFY_DONE;
+
+	NL_SET_ERR_MSG_MOD(ext_ack,
+			   "Cannot make VLAN device join VLAN-aware bridge");
+
+	return notifier_from_errno(-EINVAL);
+}
+
+static int
+dsa_user_check_8021q_upper(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct net_device *br = dsa_port_bridge_dev_get(dp);
+ struct bridge_vlan_info br_info;
+ struct netlink_ext_ack *extack;
+ int err = NOTIFY_DONE;
+ u16 vid;
+
+ if (!br || !br_vlan_enabled(br))
+ return NOTIFY_DONE;
+
+ extack = netdev_notifier_info_to_extack(&info->info);
+ vid = vlan_dev_vlan_id(info->upper_dev);
+
+ /* br_vlan_get_info() returns -EINVAL or -ENOENT if the
+ * device, respectively the VID is not found, returning
+ * 0 means success, which is a failure for us here.
+ */
+ err = br_vlan_get_info(br, vid, &br_info);
+ if (err == 0) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "This VLAN is already configured by the bridge");
+ return notifier_from_errno(-EBUSY);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int
+dsa_user_prechangeupper_sanity_check(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct dsa_switch *ds;
+ struct dsa_port *dp;
+ int err;
+
+ if (!dsa_user_dev_check(dev))
+ return dsa_prevent_bridging_8021q_upper(dev, info);
+
+ dp = dsa_user_to_port(dev);
+ ds = dp->ds;
+
+ if (ds->ops->port_prechangeupper) {
+ err = ds->ops->port_prechangeupper(ds, dp->index, info);
+ if (err)
+ return notifier_from_errno(err);
+ }
+
+ if (is_vlan_dev(info->upper_dev))
+ return dsa_user_check_8021q_upper(dev, info);
+
+ return NOTIFY_DONE;
+}
+
+/* To be eligible as a DSA conduit, a LAG must have all lower interfaces be
+ * eligible DSA conduits. Additionally, all LAG slaves must be DSA conduits of
+ * switches in the same switch tree.
+ */
+static int dsa_lag_conduit_validate(struct net_device *lag_dev,
+ struct netlink_ext_ack *extack)
+{
+ struct net_device *lower1, *lower2;
+ struct list_head *iter1, *iter2;
+
+ netdev_for_each_lower_dev(lag_dev, lower1, iter1) {
+ netdev_for_each_lower_dev(lag_dev, lower2, iter2) {
+ if (!netdev_uses_dsa(lower1) ||
+ !netdev_uses_dsa(lower2)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "All LAG ports must be eligible as DSA conduits");
+ return notifier_from_errno(-EINVAL);
+ }
+
+ if (lower1 == lower2)
+ continue;
+
+ if (!dsa_port_tree_same(lower1->dsa_ptr,
+ lower2->dsa_ptr)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "LAG contains DSA conduits of disjoint switch trees");
+ return notifier_from_errno(-EINVAL);
+ }
+ }
+ }
+
+ return NOTIFY_DONE;
+}
+
+static int
+dsa_conduit_prechangeupper_sanity_check(struct net_device *conduit,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info);
+
+ if (!netdev_uses_dsa(conduit))
+ return NOTIFY_DONE;
+
+ if (!info->linking)
+ return NOTIFY_DONE;
+
+ /* Allow DSA switch uppers */
+ if (dsa_user_dev_check(info->upper_dev))
+ return NOTIFY_DONE;
+
+ /* Allow bridge uppers of DSA conduits, subject to further
+ * restrictions in dsa_bridge_prechangelower_sanity_check()
+ */
+ if (netif_is_bridge_master(info->upper_dev))
+ return NOTIFY_DONE;
+
+ /* Allow LAG uppers, subject to further restrictions in
+ * dsa_lag_conduit_prechangelower_sanity_check()
+ */
+ if (netif_is_lag_master(info->upper_dev))
+ return dsa_lag_conduit_validate(info->upper_dev, extack);
+
+ NL_SET_ERR_MSG_MOD(extack,
+ "DSA conduit cannot join unknown upper interfaces");
+ return notifier_from_errno(-EBUSY);
+}
+
+static int
+dsa_lag_conduit_prechangelower_sanity_check(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info);
+ struct net_device *lag_dev = info->upper_dev;
+ struct net_device *lower;
+ struct list_head *iter;
+
+ if (!netdev_uses_dsa(lag_dev) || !netif_is_lag_master(lag_dev))
+ return NOTIFY_DONE;
+
+ if (!info->linking)
+ return NOTIFY_DONE;
+
+ if (!netdev_uses_dsa(dev)) {
+ NL_SET_ERR_MSG(extack,
+ "Only DSA conduits can join a LAG DSA conduit");
+ return notifier_from_errno(-EINVAL);
+ }
+
+ netdev_for_each_lower_dev(lag_dev, lower, iter) {
+ if (!dsa_port_tree_same(dev->dsa_ptr, lower->dsa_ptr)) {
+ NL_SET_ERR_MSG(extack,
+ "Interface is DSA conduit for a different switch tree than this LAG");
+ return notifier_from_errno(-EINVAL);
+ }
+
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+/* Don't allow bridging of DSA conduits, since the bridge layer rx_handler
+ * prevents the DSA fake ethertype handler from being invoked, so we don't get
+ * the chance to strip off and parse the DSA switch tag protocol header (the
+ * bridge layer just returns RX_HANDLER_CONSUMED, stopping RX processing for
+ * these frames).
+ * The only case where that would not be an issue is when bridging can already
+ * be offloaded, such as when the DSA conduit is itself a DSA or plain switchdev
+ * port, and is bridged only with other ports from the same hardware device.
+ */
+static int
+dsa_bridge_prechangelower_sanity_check(struct net_device *new_lower,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct net_device *br = info->upper_dev;
+ struct netlink_ext_ack *extack;
+ struct net_device *lower;
+ struct list_head *iter;
+
+ if (!netif_is_bridge_master(br))
+ return NOTIFY_DONE;
+
+ if (!info->linking)
+ return NOTIFY_DONE;
+
+ extack = netdev_notifier_info_to_extack(&info->info);
+
+ netdev_for_each_lower_dev(br, lower, iter) {
+ if (!netdev_uses_dsa(new_lower) && !netdev_uses_dsa(lower))
+ continue;
+
+ if (!netdev_port_same_parent_id(lower, new_lower)) {
+ NL_SET_ERR_MSG(extack,
+ "Cannot do software bridging with a DSA conduit");
+ return notifier_from_errno(-EINVAL);
+ }
+ }
+
+ return NOTIFY_DONE;
+}
+
+static void dsa_tree_migrate_ports_from_lag_conduit(struct dsa_switch_tree *dst,
+ struct net_device *lag_dev)
+{
+ struct net_device *new_conduit = dsa_tree_find_first_conduit(dst);
+ struct dsa_port *dp;
+ int err;
+
+ dsa_tree_for_each_user_port(dp, dst) {
+ if (dsa_port_to_conduit(dp) != lag_dev)
+ continue;
+
+ err = dsa_user_change_conduit(dp->user, new_conduit, NULL);
+ if (err) {
+ netdev_err(dp->user,
+ "failed to restore conduit to %s: %pe\n",
+ new_conduit->name, ERR_PTR(err));
+ }
+ }
+}
+
+static int dsa_conduit_lag_join(struct net_device *conduit,
+				struct net_device *lag_dev,
+				struct netdev_lag_upper_info *uinfo,
+				struct netlink_ext_ack *extack)
+{
+	struct dsa_port *cpu_dp = conduit->dsa_ptr;
+	struct dsa_switch_tree *dst = cpu_dp->dst;
+	struct dsa_port *dp;
+	int err, restore_err;
+
+	err = dsa_conduit_lag_setup(lag_dev, cpu_dp, uinfo, extack);
+	if (err)
+		return err;
+
+	dsa_tree_for_each_user_port(dp, dst) {
+		if (dsa_port_to_conduit(dp) != conduit)
+			continue;
+
+		err = dsa_user_change_conduit(dp->user, lag_dev, extack);
+		if (err)
+			goto restore;
+	}
+
+	return 0;
+
+restore:
+	dsa_tree_for_each_user_port_continue_reverse(dp, dst) {
+		if (dsa_port_to_conduit(dp) != lag_dev)
+			continue;
+		/* Keep 'err' intact: it holds the join failure */
+		restore_err = dsa_user_change_conduit(dp->user, conduit, NULL);
+		if (restore_err) {
+			netdev_err(dp->user,
+				   "failed to restore conduit to %s: %pe\n",
+				   conduit->name, ERR_PTR(restore_err));
+		}
+	}
+
+	dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr);
+
+	return err;
+}
+
+static void dsa_conduit_lag_leave(struct net_device *conduit,
+ struct net_device *lag_dev)
+{
+ struct dsa_port *dp, *cpu_dp = lag_dev->dsa_ptr;
+ struct dsa_switch_tree *dst = cpu_dp->dst;
+ struct dsa_port *new_cpu_dp = NULL;
+ struct net_device *lower;
+ struct list_head *iter;
+
+ netdev_for_each_lower_dev(lag_dev, lower, iter) {
+ if (netdev_uses_dsa(lower)) {
+ new_cpu_dp = lower->dsa_ptr;
+ break;
+ }
+ }
+
+ if (new_cpu_dp) {
+ /* Update the CPU port of the user ports still under the LAG
+ * so that dsa_port_to_conduit() continues to work properly
+ */
+ dsa_tree_for_each_user_port(dp, dst)
+ if (dsa_port_to_conduit(dp) == lag_dev)
+ dp->cpu_dp = new_cpu_dp;
+
+ /* Update the index of the virtual CPU port to match the lowest
+ * physical CPU port
+ */
+ lag_dev->dsa_ptr = new_cpu_dp;
+ wmb();
+ } else {
+ /* If the LAG DSA conduit has no ports left, migrate back all
+ * user ports to the first physical CPU port
+ */
+ dsa_tree_migrate_ports_from_lag_conduit(dst, lag_dev);
+ }
+
+ /* This DSA conduit has left its LAG in any case, so let
+ * the CPU port leave the hardware LAG as well
+ */
+ dsa_conduit_lag_teardown(lag_dev, conduit->dsa_ptr);
+}
+
+static int dsa_conduit_changeupper(struct net_device *dev,
+ struct netdev_notifier_changeupper_info *info)
+{
+ struct netlink_ext_ack *extack;
+ int err = NOTIFY_DONE;
+
+ if (!netdev_uses_dsa(dev))
+ return err;
+
+ extack = netdev_notifier_info_to_extack(&info->info);
+
+ if (netif_is_lag_master(info->upper_dev)) {
+ if (info->linking) {
+ err = dsa_conduit_lag_join(dev, info->upper_dev,
+ info->upper_info, extack);
+ err = notifier_from_errno(err);
+ } else {
+ dsa_conduit_lag_leave(dev, info->upper_dev);
+ err = NOTIFY_OK;
+ }
+ }
+
+ return err;
+}
+
+static int dsa_user_netdevice_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ switch (event) {
+ case NETDEV_PRECHANGEUPPER: {
+ struct netdev_notifier_changeupper_info *info = ptr;
+ int err;
+
+ err = dsa_user_prechangeupper_sanity_check(dev, info);
+ if (notifier_to_errno(err))
+ return err;
+
+ err = dsa_conduit_prechangeupper_sanity_check(dev, info);
+ if (notifier_to_errno(err))
+ return err;
+
+ err = dsa_lag_conduit_prechangelower_sanity_check(dev, info);
+ if (notifier_to_errno(err))
+ return err;
+
+ err = dsa_bridge_prechangelower_sanity_check(dev, info);
+ if (notifier_to_errno(err))
+ return err;
+
+ err = dsa_user_prechangeupper(dev, ptr);
+ if (notifier_to_errno(err))
+ return err;
+
+ err = dsa_user_lag_prechangeupper(dev, ptr);
+ if (notifier_to_errno(err))
+ return err;
+
+ break;
+ }
+ case NETDEV_CHANGEUPPER: {
+ int err;
+
+ err = dsa_user_changeupper(dev, ptr);
+ if (notifier_to_errno(err))
+ return err;
+
+ err = dsa_user_lag_changeupper(dev, ptr);
+ if (notifier_to_errno(err))
+ return err;
+
+ err = dsa_conduit_changeupper(dev, ptr);
+ if (notifier_to_errno(err))
+ return err;
+
+ break;
+ }
+ case NETDEV_CHANGELOWERSTATE: {
+ struct netdev_notifier_changelowerstate_info *info = ptr;
+ struct dsa_port *dp;
+ int err = 0;
+
+ if (dsa_user_dev_check(dev)) {
+ dp = dsa_user_to_port(dev);
+
+ err = dsa_port_lag_change(dp, info->lower_state_info);
+ }
+
+ /* Mirror LAG port events on DSA conduits that are in
+ * a LAG towards their respective switch CPU ports
+ */
+ if (netdev_uses_dsa(dev)) {
+ dp = dev->dsa_ptr;
+
+ err = dsa_port_lag_change(dp, info->lower_state_info);
+ }
+
+ return notifier_from_errno(err);
+ }
+ case NETDEV_CHANGE:
+ case NETDEV_UP: {
+ /* Track state of conduit port.
+ * DSA driver may require the conduit port (and indirectly
+ * the tagger) to be available for some special operation.
+ */
+ if (netdev_uses_dsa(dev)) {
+ struct dsa_port *cpu_dp = dev->dsa_ptr;
+ struct dsa_switch_tree *dst = cpu_dp->ds->dst;
+
+ /* Track when the conduit port is UP */
+ dsa_tree_conduit_oper_state_change(dst, dev,
+ netif_oper_up(dev));
+
+ /* Track when the conduit port is ready and can accept
+ * packets.
+ * NETDEV_UP event is not enough to flag a port as ready.
+ * We also have to wait for linkwatch_do_dev to dev_activate
+ * and emit a NETDEV_CHANGE event.
+ * We check if a conduit port is ready by checking if the dev
+ * has a qdisc assigned and it is not noop.
+ */
+ dsa_tree_conduit_admin_state_change(dst, dev,
+ !qdisc_tx_is_noop(dev));
+
+ return NOTIFY_OK;
+ }
+
+ return NOTIFY_DONE;
+ }
+ case NETDEV_GOING_DOWN: {
+ struct dsa_port *dp, *cpu_dp;
+ struct dsa_switch_tree *dst;
+ LIST_HEAD(close_list);
+
+ if (!netdev_uses_dsa(dev))
+ return NOTIFY_DONE;
+
+ cpu_dp = dev->dsa_ptr;
+ dst = cpu_dp->ds->dst;
+
+ dsa_tree_conduit_admin_state_change(dst, dev, false);
+
+ list_for_each_entry(dp, &dst->ports, list) {
+ if (!dsa_port_is_user(dp))
+ continue;
+
+ if (dp->cpu_dp != cpu_dp)
+ continue;
+
+ list_add(&dp->user->close_list, &close_list);
+ }
+
+ netif_close_many(&close_list, true);
+
+ return NOTIFY_OK;
+ }
+ default:
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static void
+dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work)
+{
+ struct switchdev_notifier_fdb_info info = {};
+
+ info.addr = switchdev_work->addr;
+ info.vid = switchdev_work->vid;
+ info.offloaded = true;
+ call_switchdev_notifiers(SWITCHDEV_FDB_OFFLOADED,
+ switchdev_work->orig_dev, &info.info, NULL);
+}
+
+static void dsa_user_switchdev_event_work(struct work_struct *work)
+{
+ struct dsa_switchdev_event_work *switchdev_work =
+ container_of(work, struct dsa_switchdev_event_work, work);
+ const unsigned char *addr = switchdev_work->addr;
+ struct net_device *dev = switchdev_work->dev;
+ u16 vid = switchdev_work->vid;
+ struct dsa_switch *ds;
+ struct dsa_port *dp;
+ int err;
+
+ dp = dsa_user_to_port(dev);
+ ds = dp->ds;
+
+ switch (switchdev_work->event) {
+ case SWITCHDEV_FDB_ADD_TO_DEVICE:
+ if (switchdev_work->host_addr)
+ err = dsa_port_bridge_host_fdb_add(dp, addr, vid);
+ else if (dp->lag)
+ err = dsa_port_lag_fdb_add(dp, addr, vid);
+ else
+ err = dsa_port_fdb_add(dp, addr, vid);
+ if (err) {
+ dev_err(ds->dev,
+ "port %d failed to add %pM vid %d to fdb: %d\n",
+ dp->index, addr, vid, err);
+ break;
+ }
+ dsa_fdb_offload_notify(switchdev_work);
+ break;
+
+ case SWITCHDEV_FDB_DEL_TO_DEVICE:
+ if (switchdev_work->host_addr)
+ err = dsa_port_bridge_host_fdb_del(dp, addr, vid);
+ else if (dp->lag)
+ err = dsa_port_lag_fdb_del(dp, addr, vid);
+ else
+ err = dsa_port_fdb_del(dp, addr, vid);
+ if (err) {
+ dev_err(ds->dev,
+ "port %d failed to delete %pM vid %d from fdb: %d\n",
+ dp->index, addr, vid, err);
+ }
+
+ break;
+ }
+
+ kfree(switchdev_work);
+}
+
+static bool dsa_foreign_dev_check(const struct net_device *dev,
+ const struct net_device *foreign_dev)
+{
+ const struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch_tree *dst = dp->ds->dst;
+
+ if (netif_is_bridge_master(foreign_dev))
+ return !dsa_tree_offloads_bridge_dev(dst, foreign_dev);
+
+ if (netif_is_bridge_port(foreign_dev))
+ return !dsa_tree_offloads_bridge_port(dst, foreign_dev);
+
+ /* Everything else is foreign */
+ return true;
+}
+
+static int dsa_user_fdb_event(struct net_device *dev,
+ struct net_device *orig_dev,
+ unsigned long event, const void *ctx,
+ const struct switchdev_notifier_fdb_info *fdb_info)
+{
+ struct dsa_switchdev_event_work *switchdev_work;
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ bool host_addr = fdb_info->is_local;
+ struct dsa_switch *ds = dp->ds;
+
+ if (ctx && ctx != dp)
+ return 0;
+
+ if (!dp->bridge)
+ return 0;
+
+ if (switchdev_fdb_is_dynamically_learned(fdb_info)) {
+ if (dsa_port_offloads_bridge_port(dp, orig_dev))
+ return 0;
+
+ /* FDB entries learned by the software bridge or by foreign
+ * bridge ports should be installed as host addresses only if
+ * the driver requests assisted learning.
+ */
+ if (!ds->assisted_learning_on_cpu_port)
+ return 0;
+ }
+
+ /* Also treat FDB entries on foreign interfaces bridged with us as host
+ * addresses.
+ */
+ if (dsa_foreign_dev_check(dev, orig_dev))
+ host_addr = true;
+
+ /* Check early that we're not doing work in vain.
+ * Host addresses on LAG ports still require regular FDB ops,
+ * since the CPU port isn't in a LAG.
+ */
+ if (dp->lag && !host_addr) {
+ if (!ds->ops->lag_fdb_add || !ds->ops->lag_fdb_del)
+ return -EOPNOTSUPP;
+ } else {
+ if (!ds->ops->port_fdb_add || !ds->ops->port_fdb_del)
+ return -EOPNOTSUPP;
+ }
+
+ switchdev_work = kzalloc(sizeof(*switchdev_work), GFP_ATOMIC);
+ if (!switchdev_work)
+ return -ENOMEM;
+
+ netdev_dbg(dev, "%s FDB entry towards %s, addr %pM vid %d%s\n",
+ event == SWITCHDEV_FDB_ADD_TO_DEVICE ? "Adding" : "Deleting",
+ orig_dev->name, fdb_info->addr, fdb_info->vid,
+ host_addr ? " as host address" : "");
+
+ INIT_WORK(&switchdev_work->work, dsa_user_switchdev_event_work);
+ switchdev_work->event = event;
+ switchdev_work->dev = dev;
+ switchdev_work->orig_dev = orig_dev;
+
+ ether_addr_copy(switchdev_work->addr, fdb_info->addr);
+ switchdev_work->vid = fdb_info->vid;
+ switchdev_work->host_addr = host_addr;
+
+ dsa_schedule_work(&switchdev_work->work);
+
+ return 0;
+}
+
+/* Called under rcu_read_lock() */
+static int dsa_user_switchdev_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+ int err;
+
+ switch (event) {
+ case SWITCHDEV_PORT_ATTR_SET:
+ err = switchdev_handle_port_attr_set(dev, ptr,
+ dsa_user_dev_check,
+ dsa_user_port_attr_set);
+ return notifier_from_errno(err);
+ case SWITCHDEV_FDB_ADD_TO_DEVICE:
+ case SWITCHDEV_FDB_DEL_TO_DEVICE:
+ err = switchdev_handle_fdb_event_to_device(dev, event, ptr,
+ dsa_user_dev_check,
+ dsa_foreign_dev_check,
+ dsa_user_fdb_event);
+ return notifier_from_errno(err);
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
+
+static int dsa_user_switchdev_blocking_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
+ int err;
+
+ switch (event) {
+ case SWITCHDEV_PORT_OBJ_ADD:
+ err = switchdev_handle_port_obj_add_foreign(dev, ptr,
+ dsa_user_dev_check,
+ dsa_foreign_dev_check,
+ dsa_user_port_obj_add);
+ return notifier_from_errno(err);
+ case SWITCHDEV_PORT_OBJ_DEL:
+ err = switchdev_handle_port_obj_del_foreign(dev, ptr,
+ dsa_user_dev_check,
+ dsa_foreign_dev_check,
+ dsa_user_port_obj_del);
+ return notifier_from_errno(err);
+ case SWITCHDEV_PORT_ATTR_SET:
+ err = switchdev_handle_port_attr_set(dev, ptr,
+ dsa_user_dev_check,
+ dsa_user_port_attr_set);
+ return notifier_from_errno(err);
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block dsa_user_nb __read_mostly = {
+ .notifier_call = dsa_user_netdevice_event,
+};
+
+struct notifier_block dsa_user_switchdev_notifier = {
+ .notifier_call = dsa_user_switchdev_event,
+};
+
+struct notifier_block dsa_user_switchdev_blocking_notifier = {
+ .notifier_call = dsa_user_switchdev_blocking_event,
+};
+
+int dsa_user_register_notifier(void)
+{
+ struct notifier_block *nb;
+ int err;
+
+ err = register_netdevice_notifier(&dsa_user_nb);
+ if (err)
+ return err;
+
+ err = register_switchdev_notifier(&dsa_user_switchdev_notifier);
+ if (err)
+ goto err_switchdev_nb;
+
+ nb = &dsa_user_switchdev_blocking_notifier;
+ err = register_switchdev_blocking_notifier(nb);
+ if (err)
+ goto err_switchdev_blocking_nb;
+
+ return 0;
+
+err_switchdev_blocking_nb:
+ unregister_switchdev_notifier(&dsa_user_switchdev_notifier);
+err_switchdev_nb:
+ unregister_netdevice_notifier(&dsa_user_nb);
+ return err;
+}
+
+void dsa_user_unregister_notifier(void)
+{
+ struct notifier_block *nb;
+ int err;
+
+ nb = &dsa_user_switchdev_blocking_notifier;
+ err = unregister_switchdev_blocking_notifier(nb);
+ if (err)
+ pr_err("DSA: failed to unregister switchdev blocking notifier (%d)\n", err);
+
+ err = unregister_switchdev_notifier(&dsa_user_switchdev_notifier);
+ if (err)
+ pr_err("DSA: failed to unregister switchdev notifier (%d)\n", err);
+
+ err = unregister_netdevice_notifier(&dsa_user_nb);
+ if (err)
+ pr_err("DSA: failed to unregister user notifier (%d)\n", err);
+}
diff --git a/net/dsa/user.h b/net/dsa/user.h
new file mode 100644
index 000000000000..016884bead3c
--- /dev/null
+++ b/net/dsa/user.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __DSA_USER_H
+#define __DSA_USER_H
+
+#include <linux/if_bridge.h>
+#include <linux/if_vlan.h>
+#include <linux/list.h>
+#include <linux/netpoll.h>
+#include <linux/types.h>
+#include <net/dsa.h>
+#include <net/gro_cells.h>
+
+struct net_device;
+struct netlink_ext_ack;
+
+extern struct notifier_block dsa_user_switchdev_notifier;
+extern struct notifier_block dsa_user_switchdev_blocking_notifier;
+
+struct dsa_user_priv {
+ /* Copy of CPU port xmit for faster access in user transmit hot path */
+ struct sk_buff * (*xmit)(struct sk_buff *skb,
+ struct net_device *dev);
+
+ struct gro_cells gcells;
+
+ /* DSA port data, such as switch, port index, etc. */
+ struct dsa_port *dp;
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ struct netpoll *netpoll;
+#endif
+
+ /* TC context */
+ struct list_head mall_tc_list;
+};
+
+void dsa_user_mii_bus_init(struct dsa_switch *ds);
+int dsa_user_create(struct dsa_port *dp);
+void dsa_user_destroy(struct net_device *user_dev);
+int dsa_user_suspend(struct net_device *user_dev);
+int dsa_user_resume(struct net_device *user_dev);
+int dsa_user_register_notifier(void);
+void dsa_user_unregister_notifier(void);
+int dsa_user_host_uc_install(struct net_device *dev, const u8 *addr);
+void dsa_user_host_uc_uninstall(struct net_device *dev);
+void dsa_user_sync_ha(struct net_device *dev);
+void dsa_user_unsync_ha(struct net_device *dev);
+void dsa_user_setup_tagger(struct net_device *user);
+int dsa_user_change_mtu(struct net_device *dev, int new_mtu);
+int dsa_user_change_conduit(struct net_device *dev, struct net_device *conduit,
+ struct netlink_ext_ack *extack);
+int dsa_user_manage_vlan_filtering(struct net_device *dev,
+ bool vlan_filtering);
+
+static inline struct dsa_port *dsa_user_to_port(const struct net_device *dev)
+{
+ struct dsa_user_priv *p = netdev_priv(dev);
+
+ return p->dp;
+}
+
+static inline struct net_device *
+dsa_user_to_conduit(const struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+
+ return dsa_port_to_conduit(dp);
+}
+
+#endif
diff --git a/net/ethernet/Makefile b/net/ethernet/Makefile
index 323177505404..e03eff94e0db 100644
--- a/net/ethernet/Makefile
+++ b/net/ethernet/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the Linux Ethernet layer.
#
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index fd8faa0dfa61..13a63b48b7ee 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -31,11 +32,6 @@
* older network drivers and IFF_ALLMULTI.
* Christer Weinigel : Better rebuild header message.
* Andrew Morton : 26Feb01: kill ether_setup() - use netdev_boot_setup().
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <linux/types.h>
@@ -47,6 +43,7 @@
#include <linux/inet.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
+#include <linux/nvmem-consumer.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
@@ -54,6 +51,7 @@
#include <linux/if_ether.h>
#include <linux/of_net.h>
#include <linux/pci.h>
+#include <linux/property.h>
#include <net/dst.h>
#include <net/arp.h>
#include <net/sock.h>
@@ -61,11 +59,10 @@
#include <net/ip.h>
#include <net/dsa.h>
#include <net/flow_dissector.h>
+#include <net/gro.h>
#include <linux/uaccess.h>
#include <net/pkt_sched.h>
-__setup("ether=", netdev_boot_setup);
-
/**
* eth_header - create the Ethernet header
* @skb: buffer to alter
@@ -118,13 +115,14 @@ EXPORT_SYMBOL(eth_header);
/**
* eth_get_headlen - determine the length of header for an ethernet frame
+ * @dev: pointer to network device
* @data: pointer to start of frame
* @len: total length of frame
*
* Make a best effort attempt to pull the length for all of the headers for
* a given frame in a linear buffer.
*/
-u32 eth_get_headlen(void *data, unsigned int len)
+u32 eth_get_headlen(const struct net_device *dev, const void *data, u32 len)
{
const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
const struct ethhdr *eth = (const struct ethhdr *)data;
@@ -135,8 +133,9 @@ u32 eth_get_headlen(void *data, unsigned int len)
return len;
/* parse any remaining L2/L3 headers, check for L4 */
- if (!skb_flow_dissect_flow_keys_basic(NULL, &keys, data, eth->h_proto,
- sizeof(*eth), len, flags))
+ if (!skb_flow_dissect_flow_keys_basic(dev_net(dev), NULL, &keys, data,
+ eth->h_proto, sizeof(*eth),
+ len, flags))
return max_t(u32, keys.control.thoff, sizeof(*eth));
/* parse for any L4 headers */
@@ -155,25 +154,15 @@ EXPORT_SYMBOL(eth_get_headlen);
*/
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
{
- unsigned short _service_access_point;
const unsigned short *sap;
const struct ethhdr *eth;
+ __be16 res;
skb->dev = dev;
skb_reset_mac_header(skb);
- eth = (struct ethhdr *)skb->data;
- skb_pull_inline(skb, ETH_HLEN);
-
- if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) {
- if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast))
- skb->pkt_type = PACKET_BROADCAST;
- else
- skb->pkt_type = PACKET_MULTICAST;
- }
- else if (unlikely(!ether_addr_equal_64bits(eth->h_dest,
- dev->dev_addr)))
- skb->pkt_type = PACKET_OTHERHOST;
+ eth = eth_skb_pull_mac(skb);
+ eth_skb_pkt_type(skb, dev);
/*
* Some variants of DSA tagging don't have an ethertype field
@@ -192,15 +181,15 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
* the protocol design and runs IPX over 802.3 without an 802.2 LLC
* layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
* won't work for fault tolerant netware but does for the rest.
+ * We use skb->dev as temporary storage to not hit
+ * CONFIG_STACKPROTECTOR_STRONG=y costs on some platforms.
*/
- sap = skb_header_pointer(skb, 0, sizeof(*sap), &_service_access_point);
- if (sap && *sap == 0xFFFF)
- return htons(ETH_P_802_3);
+ sap = skb_header_pointer(skb, 0, sizeof(*sap), &skb->dev);
+ res = (sap && *sap == 0xFFFF) ? htons(ETH_P_802_3) : htons(ETH_P_802_2);
- /*
- * Real 802.2 LLC
- */
- return htons(ETH_P_802_2);
+ /* restore skb->dev in case it was mangled by skb_header_pointer(). */
+ skb->dev = dev;
+ return res;
}
EXPORT_SYMBOL(eth_type_trans);
@@ -239,7 +228,12 @@ int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16
eth->h_proto = type;
memcpy(eth->h_source, dev->dev_addr, ETH_ALEN);
memcpy(eth->h_dest, neigh->ha, ETH_ALEN);
- hh->hh_len = ETH_HLEN;
+
+ /* Pairs with READ_ONCE() in neigh_resolve_output(),
+ * neigh_hh_output() and neigh_update_hhs().
+ */
+ smp_store_release(&hh->hh_len, ETH_HLEN);
+
return 0;
}
EXPORT_SYMBOL(eth_header_cache);
@@ -262,6 +256,18 @@ void eth_header_cache_update(struct hh_cache *hh,
EXPORT_SYMBOL(eth_header_cache_update);
/**
+ * eth_header_parse_protocol - extract protocol from L2 header
+ * @skb: packet to extract protocol from
+ */
+__be16 eth_header_parse_protocol(const struct sk_buff *skb)
+{
+ const struct ethhdr *eth = eth_hdr(skb);
+
+ return eth->h_proto;
+}
+EXPORT_SYMBOL(eth_header_parse_protocol);
+
+/**
* eth_prepare_mac_addr_change - prepare for mac change
* @dev: network device
* @p: socket address
@@ -287,7 +293,7 @@ void eth_commit_mac_addr_change(struct net_device *dev, void *p)
{
struct sockaddr *addr = p;
- memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN);
+ eth_hw_addr_set(dev, addr->sa_data);
}
EXPORT_SYMBOL(eth_commit_mac_addr_change);
@@ -313,22 +319,6 @@ int eth_mac_addr(struct net_device *dev, void *p)
}
EXPORT_SYMBOL(eth_mac_addr);
-/**
- * eth_change_mtu - set new MTU size
- * @dev: network device
- * @new_mtu: new Maximum Transfer Unit
- *
- * Allow changing MTU size. Needs to be overridden for devices
- * supporting jumbo frames.
- */
-int eth_change_mtu(struct net_device *dev, int new_mtu)
-{
- netdev_warn(dev, "%s is deprecated\n", __func__);
- dev->mtu = new_mtu;
- return 0;
-}
-EXPORT_SYMBOL(eth_change_mtu);
-
int eth_validate_addr(struct net_device *dev)
{
if (!is_valid_ether_addr(dev->dev_addr))
@@ -343,6 +333,7 @@ const struct header_ops eth_header_ops ____cacheline_aligned = {
.parse = eth_header_parse,
.cache = eth_header_cache,
.cache_update = eth_header_cache_update,
+ .parse_protocol = eth_header_parse_protocol,
};
/**
@@ -388,42 +379,14 @@ EXPORT_SYMBOL(ether_setup);
struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
unsigned int rxqs)
{
- return alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_UNKNOWN,
+ return alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_ENUM,
ether_setup, txqs, rxqs);
}
EXPORT_SYMBOL(alloc_etherdev_mqs);
-static void devm_free_netdev(struct device *dev, void *res)
-{
- free_netdev(*(struct net_device **)res);
-}
-
-struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv,
- unsigned int txqs, unsigned int rxqs)
-{
- struct net_device **dr;
- struct net_device *netdev;
-
- dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL);
- if (!dr)
- return NULL;
-
- netdev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs);
- if (!netdev) {
- devres_free(dr);
- return NULL;
- }
-
- *dr = netdev;
- devres_add(dev, dr);
-
- return netdev;
-}
-EXPORT_SYMBOL(devm_alloc_etherdev_mqs);
-
ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
{
- return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr);
+ return sysfs_emit(buf, "%*phC\n", len, addr);
}
EXPORT_SYMBOL(sysfs_format_mac);
@@ -439,12 +402,9 @@ struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb)
off_eth = skb_gro_offset(skb);
hlen = off_eth + sizeof(*eh);
- eh = skb_gro_header_fast(skb, off_eth);
- if (skb_gro_header_hard(skb, hlen)) {
- eh = skb_gro_header_slow(skb, hlen, off_eth);
- if (unlikely(!eh))
- goto out;
- }
+ eh = skb_gro_header(skb, hlen, off_eth);
+ if (unlikely(!eh))
+ goto out;
flush = 0;
@@ -461,19 +421,19 @@ struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb)
type = eh->h_proto;
- rcu_read_lock();
ptype = gro_find_receive_by_type(type);
if (ptype == NULL) {
flush = 1;
- goto out_unlock;
+ goto out;
}
skb_gro_pull(skb, sizeof(*eh));
skb_gro_postpull_rcsum(skb, eh, sizeof(*eh));
- pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
-out_unlock:
- rcu_read_unlock();
+ pp = indirect_call_gro_receive_inet(ptype->callbacks.gro_receive,
+ ipv6_gro_receive, inet_gro_receive,
+ head, skb);
+
out:
skb_gro_flush_final(skb, pp, flush);
@@ -491,13 +451,12 @@ int eth_gro_complete(struct sk_buff *skb, int nhoff)
if (skb->encapsulation)
skb_set_inner_mac_header(skb, nhoff);
- rcu_read_lock();
ptype = gro_find_complete_by_type(type);
if (ptype != NULL)
- err = ptype->callbacks.gro_complete(skb, nhoff +
- sizeof(struct ethhdr));
+ err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
+ ipv6_gro_complete, inet_gro_complete,
+ skb, nhoff + sizeof(*eh));
- rcu_read_unlock();
return err;
}
EXPORT_SYMBOL(eth_gro_complete);
@@ -527,24 +486,156 @@ unsigned char * __weak arch_get_platform_mac_address(void)
int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr)
{
- const unsigned char *addr;
- struct device_node *dp;
-
- if (dev_is_pci(dev))
- dp = pci_device_to_OF_node(to_pci_dev(dev));
- else
- dp = dev->of_node;
+ unsigned char *addr;
+ int ret;
- addr = NULL;
- if (dp)
- addr = of_get_mac_address(dp);
- if (!addr)
- addr = arch_get_platform_mac_address();
+ ret = of_get_mac_address(dev->of_node, mac_addr);
+ if (!ret)
+ return 0;
+ addr = arch_get_platform_mac_address();
if (!addr)
return -ENODEV;
ether_addr_copy(mac_addr, addr);
+
return 0;
}
EXPORT_SYMBOL(eth_platform_get_mac_address);
+
+/**
+ * platform_get_ethdev_address - Set netdev's MAC address from a given device
+ * @dev: Device whose platform MAC address is looked up
+ * @netdev: Network device that receives the address
+ *
+ * Fetches the address with eth_platform_get_mac_address() and, on success,
+ * installs it on @netdev via eth_hw_addr_set(). Returns the lookup result.
+ */
+int platform_get_ethdev_address(struct device *dev, struct net_device *netdev)
+{
+	u8 mac[ETH_ALEN] __aligned(2);
+	int err = eth_platform_get_mac_address(dev, mac);
+
+	if (err)
+		return err;
+	eth_hw_addr_set(netdev, mac);
+	return 0;
+}
+EXPORT_SYMBOL(platform_get_ethdev_address);
+
+/**
+ * nvmem_get_mac_address - Obtain the MAC address from an nvmem cell named
+ * 'mac-address' associated with given device.
+ *
+ * @dev: Device that owns the 'mac-address' nvmem cell.
+ * @addrbuf: Destination buffer; written only when a valid address is read.
+ *
+ * Returns 0 on success or a negative error number on failure.
+ */
+int nvmem_get_mac_address(struct device *dev, void *addrbuf)
+{
+	struct nvmem_cell *cell;
+	const void *data;
+	size_t size;
+	int err = 0;
+
+	cell = nvmem_cell_get(dev, "mac-address");
+	if (IS_ERR(cell))
+		return PTR_ERR(cell);
+
+	/* The cell reference is not needed once its content has been read. */
+	data = nvmem_cell_read(cell, &size);
+	nvmem_cell_put(cell);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	/* Only copy out a correctly sized, valid Ethernet address. */
+	if (size == ETH_ALEN && is_valid_ether_addr(data))
+		ether_addr_copy(addrbuf, data);
+	else
+		err = -EINVAL;
+
+	kfree(data);
+	return err;
+}
+
+/* Read the @name property as a MAC address and check that it is usable. */
+static int fwnode_get_mac_addr(struct fwnode_handle *fwnode,
+			       const char *name, char *addr)
+{
+	int err = fwnode_property_read_u8_array(fwnode, name, addr, ETH_ALEN);
+
+	/* Pass a failed property read straight back to the caller. */
+	if (err)
+		return err;
+
+	/* The bytes were read; reject them if they are not a valid address. */
+	return is_valid_ether_addr(addr) ? 0 : -EINVAL;
+}
+
+/**
+ * fwnode_get_mac_address - Get the MAC from the firmware node
+ * @fwnode: Pointer to the firmware node
+ * @addr: Address of buffer to store the MAC in
+ *
+ * Search the firmware node for the best MAC address to use. 'mac-address' is
+ * checked first, because that is supposed to contain the "most recent" MAC
+ * address. If that isn't set, then 'local-mac-address' is checked next,
+ * because that is the default address. If that isn't set, then the obsolete
+ * 'address' is checked, just in case we're using an old device tree.
+ *
+ * Note that the 'address' property is supposed to contain a virtual address of
+ * the register set, but some DTS files have redefined that property to be the
+ * MAC address.
+ *
+ * All-zero MAC addresses are rejected, because a property may exist in the
+ * firmware tables without having been updated by the firmware: e.g. an older
+ * U-Boot may only initialize 'local-mac-address' and leave a 'mac-address'
+ * defined in the DTS as all zeros.
+ *
+ * Return: 0 if a valid address was stored in @addr, -ENOENT otherwise.
+ */
+int fwnode_get_mac_address(struct fwnode_handle *fwnode, char *addr)
+{
+	if (!fwnode_get_mac_addr(fwnode, "mac-address", addr))
+		return 0;
+	if (!fwnode_get_mac_addr(fwnode, "local-mac-address", addr))
+		return 0;
+
+	return fwnode_get_mac_addr(fwnode, "address", addr) ? -ENOENT : 0;
+}
+EXPORT_SYMBOL(fwnode_get_mac_address);
+
+/**
+ * device_get_mac_address - Get the MAC for a given device
+ * @dev: Pointer to the device; its firmware node is tried before nvmem
+ * @addr: Address of buffer to store the MAC in
+ */
+int device_get_mac_address(struct device *dev, char *addr)
+{
+	if (fwnode_get_mac_address(dev_fwnode(dev), addr))
+		return nvmem_get_mac_address(dev, addr);
+
+	return 0;
+}
+EXPORT_SYMBOL(device_get_mac_address);
+
+/**
+ * device_get_ethdev_address - Set netdev's MAC address from a given device
+ * @dev: Pointer to the device
+ * @netdev: Pointer to netdev to write the address to
+ *
+ * Wrapper around device_get_mac_address() which writes the address directly
+ * to netdev->dev_addr. The on-stack buffer is u16 aligned for ether_addr_*().
+ */
+int device_get_ethdev_address(struct device *dev, struct net_device *netdev)
+{
+	u8 addr[ETH_ALEN] __aligned(2);
+	int ret;
+
+	ret = device_get_mac_address(dev, addr);
+	if (!ret)
+		eth_hw_addr_set(netdev, addr);
+	return ret;
+}
+EXPORT_SYMBOL(device_get_ethdev_address);
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
new file mode 100644
index 000000000000..629c10916670
--- /dev/null
+++ b/net/ethtool/Makefile
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-y += ioctl.o common.o
+
+obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o
+
+ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \
+ linkstate.o debug.o wol.o features.o privflags.o rings.o \
+ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
+ tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \
+ module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o \
+ phy.o tsconfig.o mse.o
diff --git a/net/ethtool/bitset.c b/net/ethtool/bitset.c
new file mode 100644
index 000000000000..f0883357d12e
--- /dev/null
+++ b/net/ethtool/bitset.c
@@ -0,0 +1,873 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool_netlink.h>
+#include <linux/bitmap.h>
+#include "netlink.h"
+#include "bitset.h"
+
+/* Some bitmaps are internally represented as an array of unsigned long, some
+ * as an array of u32 (some even as single u32 for now). To avoid the need of
+ * wrappers on caller side, we provide two sets of functions: those with "32"
+ * suffix in their names expect u32 based bitmaps, those without it expect
+ * unsigned long bitmaps.
+ */
+
+static u32 ethnl_lower_bits(unsigned int n)
+{
+	return (u32)~0U >> (32 - (n & 31));	/* n % 32 == 0 would shift by 32 */
+}
+
+static u32 ethnl_upper_bits(unsigned int n)
+{
+	return (u32)~0U << (n & 31);
+}
+
+/**
+ * ethnl_bitmap32_clear() - Clear u32 based bitmap
+ * @dst: bitmap to clear
+ * @start: beginning of the interval (first bit index cleared)
+ * @end: end of the interval (first bit index no longer cleared)
+ * @mod: set to true if bitmap was modified, left unchanged otherwise
+ *
+ * Clear bits of a bitmap with indices @start <= i < @end
+ */
+static void ethnl_bitmap32_clear(u32 *dst, unsigned int start, unsigned int end,
+ bool *mod)
+{
+ unsigned int start_word = start / 32;
+ unsigned int end_word = end / 32;
+ unsigned int i;
+ u32 mask;
+
+ if (end <= start) /* empty interval: nothing to clear */
+ return;
+
+ if (start % 32) { /* interval starts inside a word */
+ mask = ethnl_upper_bits(start);
+ if (end_word == start_word) { /* interval lies within one word */
+ mask &= ethnl_lower_bits(end);
+ if (dst[start_word] & mask) {
+ dst[start_word] &= ~mask;
+ *mod = true;
+ }
+ return;
+ }
+ if (dst[start_word] & mask) {
+ dst[start_word] &= ~mask;
+ *mod = true;
+ }
+ start_word++;
+ }
+
+ for (i = start_word; i < end_word; i++) { /* words covered completely */
+ if (dst[i]) {
+ dst[i] = 0;
+ *mod = true;
+ }
+ }
+ if (end % 32) { /* interval ends inside a word */
+ mask = ethnl_lower_bits(end);
+ if (dst[end_word] & mask) {
+ dst[end_word] &= ~mask;
+ *mod = true;
+ }
+ }
+}
+
+/**
+ * ethnl_bitmap32_not_zero() - Check if any bit is set in an interval
+ * @map: bitmap to test
+ * @start: beginning of the interval
+ * @end: end of the interval
+ *
+ * Return: true if there is non-zero bit with index @start <= i < @end,
+ * false if the whole interval is zero or empty
+ */
+static bool ethnl_bitmap32_not_zero(const u32 *map, unsigned int start,
+				    unsigned int end)
+{
+	unsigned int start_word = start / 32;
+	unsigned int end_word = end / 32;
+	u32 mask;
+
+	if (end <= start)
+		return false;	/* an empty interval has no bit set */
+
+	if (start % 32) {
+		mask = ethnl_upper_bits(start);
+		if (end_word == start_word) {
+			mask &= ethnl_lower_bits(end);
+			return map[start_word] & mask;
+		}
+		if (map[start_word] & mask)
+			return true;
+		start_word++;
+	}
+
+	if (memchr_inv(map + start_word, '\0',
+		       (end_word - start_word) * sizeof(u32)))
+		return true;	/* some fully covered word is not zero */
+	if (end % 32 == 0)
+		return false;	/* no partial last word left to test */
+	return map[end_word] & ethnl_lower_bits(end);
+}
+
+/**
+ * ethnl_bitmap32_update() - Modify u32 based bitmap according to value/mask
+ *			     pair
+ * @dst: bitmap to update
+ * @nbits: number of bits to process
+ * @value: source of the new bit values
+ * @mask: selects the bits to copy; NULL selects all @nbits bits
+ * @mod: set to true if @dst is modified, left alone otherwise
+ *
+ * Every bit of @dst selected by @mask takes its value from the same position
+ * in @value; unselected bits and bits at or above @nbits keep their state.
+ * @mod is only ever written with true, and only when @dst really changed.
+ */
+static void ethnl_bitmap32_update(u32 *dst, unsigned int nbits,
+				  const u32 *value, const u32 *mask, bool *mod)
+{
+	for (; nbits > 0; nbits -= 32, dst++, value++) {
+		u32 keep = mask ? *mask++ : ~(u32)0;
+		u32 merged;
+
+		/* Only the last word can be partial; ignore mask bits at
+		 * or above the remaining bit count there.
+		 */
+		if (nbits < 32)
+			keep &= ethnl_lower_bits(nbits);
+		/* Take selected bits from @value, all others from @dst. */
+		merged = (*dst & ~keep) | (*value & keep);
+		if (merged != *dst) {
+			*dst = merged;
+			*mod = true;
+		}
+
+		/* Leave before stepping past the last word in use. */
+		if (nbits <= 32)
+			break;
+	}
+}
+
+static bool ethnl_bitmap32_test_bit(const u32 *map, unsigned int index)
+{
+	return (map[index / 32] >> (index % 32)) & 1U;
+}
+
+/**
+ * ethnl_bitset32_size() - Calculate size of bitset nested attribute
+ * @val: value bitmap (u32 based)
+ * @mask: mask bitmap (u32 based, optional)
+ * @nbits: bit length of the bitset
+ * @names: array of bit names (optional)
+ * @compact: assume compact format for output
+ *
+ * Estimate length of netlink attribute composed by a later
+ * ethnl_put_bitset32() call with the same arguments.
+ *
+ * Return: negative error code or attribute length estimate
+ */
+int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits,
+ ethnl_string_array_t names, bool compact)
+{
+ unsigned int len = 0;
+
+ /* list flag, only counted when there is no mask */
+ if (!mask)
+ len += nla_total_size(sizeof(u32));
+ /* size attribute, always counted */
+ len += nla_total_size(sizeof(u32));
+
+ if (compact) {
+ unsigned int nwords = DIV_ROUND_UP(nbits, 32);
+
+ /* value, plus a mask of the same length if one is given */
+ len += (mask ? 2 : 1) * nla_total_size(nwords * sizeof(u32));
+ } else {
+ unsigned int bits_len = 0;
+ unsigned int bit_len, i;
+
+ for (i = 0; i < nbits; i++) {
+ const char *name = names ? names[i] : NULL;
+
+ if (!ethnl_bitmap32_test_bit(mask ?: val, i)) /* bit not listed */
+ continue;
+ /* index */
+ bit_len = nla_total_size(sizeof(u32));
+ /* name, only when a name table was passed */
+ if (name)
+ bit_len += ethnl_strz_size(name);
+ /* value flag, only for set bits of a value/mask pair */
+ if (mask && ethnl_bitmap32_test_bit(val, i))
+ bit_len += nla_total_size(0);
+
+ /* bit nest */
+ bits_len += nla_total_size(bit_len);
+ }
+ /* bits nest */
+ len += nla_total_size(bits_len);
+ }
+
+ /* outermost nest */
+ return nla_total_size(len);
+}
+
+/**
+ * ethnl_put_bitset32() - Put a bitset nest into a message
+ * @skb: skb with the message
+ * @attrtype: attribute type for the bitset nest
+ * @val: value bitmap (u32 based)
+ * @mask: mask bitmap (u32 based, optional)
+ * @nbits: bit length of the bitset
+ * @names: array of bit names (optional)
+ * @compact: use compact format for the output
+ *
+ * Compose a nested attribute representing a bitset. If @mask is null, simple
+ * bitmap (bit list) is created, if @mask is provided, represent a value/mask
+ * pair. Bit names are only used in verbose mode and when provided by caller.
+ *
+ * Return: 0 on success, negative error value on error
+ */
+int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val,
+ const u32 *mask, unsigned int nbits,
+ ethnl_string_array_t names, bool compact)
+{
+ struct nlattr *nest;
+ struct nlattr *attr;
+
+ nest = nla_nest_start(skb, attrtype);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (!mask && nla_put_flag(skb, ETHTOOL_A_BITSET_NOMASK)) /* bit list marker */
+ goto nla_put_failure;
+ if (nla_put_u32(skb, ETHTOOL_A_BITSET_SIZE, nbits))
+ goto nla_put_failure;
+ if (compact) {
+ unsigned int nwords = DIV_ROUND_UP(nbits, 32);
+ unsigned int nbytes = nwords * sizeof(u32);
+ u32 *dst;
+
+ attr = nla_reserve(skb, ETHTOOL_A_BITSET_VALUE, nbytes);
+ if (!attr)
+ goto nla_put_failure;
+ dst = nla_data(attr);
+ memcpy(dst, val, nbytes);
+ if (nbits % 32) /* zero the bits past @nbits in the last word */
+ dst[nwords - 1] &= ethnl_lower_bits(nbits);
+
+ if (mask) {
+ attr = nla_reserve(skb, ETHTOOL_A_BITSET_MASK, nbytes);
+ if (!attr)
+ goto nla_put_failure;
+ dst = nla_data(attr);
+ memcpy(dst, mask, nbytes);
+ if (nbits % 32) /* same trimming for the mask copy */
+ dst[nwords - 1] &= ethnl_lower_bits(nbits);
+ }
+ } else {
+ struct nlattr *bits;
+ unsigned int i;
+
+ bits = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS);
+ if (!bits)
+ goto nla_put_failure;
+ for (i = 0; i < nbits; i++) {
+ const char *name = names ? names[i] : NULL;
+
+ if (!ethnl_bitmap32_test_bit(mask ?: val, i)) /* emit listed bits only */
+ continue;
+ attr = nla_nest_start(skb, ETHTOOL_A_BITSET_BITS_BIT);
+ if (!attr)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, ETHTOOL_A_BITSET_BIT_INDEX, i))
+ goto nla_put_failure;
+ if (name &&
+ ethnl_put_strz(skb, ETHTOOL_A_BITSET_BIT_NAME, name))
+ goto nla_put_failure;
+ if (mask && ethnl_bitmap32_test_bit(val, i) &&
+ nla_put_flag(skb, ETHTOOL_A_BITSET_BIT_VALUE))
+ goto nla_put_failure;
+ nla_nest_end(skb, attr);
+ }
+ nla_nest_end(skb, bits);
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest); /* cancel the outermost nest opened above */
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy bitset_policy[] = { /* attributes of a bitset nest */
+ [ETHTOOL_A_BITSET_NOMASK] = { .type = NLA_FLAG }, /* bit list, no mask */
+ [ETHTOOL_A_BITSET_SIZE] = NLA_POLICY_MAX(NLA_U32,
+ ETHNL_MAX_BITSET_SIZE),
+ [ETHTOOL_A_BITSET_BITS] = { .type = NLA_NESTED }, /* verbose form */
+ [ETHTOOL_A_BITSET_VALUE] = { .type = NLA_BINARY }, /* compact form */
+ [ETHTOOL_A_BITSET_MASK] = { .type = NLA_BINARY }, /* compact form */
+};
+
+static const struct nla_policy bit_policy[] = { /* attributes of one bit nest */
+ [ETHTOOL_A_BITSET_BIT_INDEX] = { .type = NLA_U32 },
+ [ETHTOOL_A_BITSET_BIT_NAME] = { .type = NLA_NUL_STRING },
+ [ETHTOOL_A_BITSET_BIT_VALUE] = { .type = NLA_FLAG }, /* present: bit is set */
+};
+
+/**
+ * ethnl_bitset_is_compact() - check if bitset attribute represents a compact
+ *			       bitset
+ * @bitset: nested bitset attribute to inspect
+ * @compact: set to false if a bits nest is present, true otherwise
+ *
+ * Return: 0 with @compact set, negative error code for a malformed nest
+ */
+int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact)
+{
+	struct nlattr *tb[ARRAY_SIZE(bitset_policy)];
+	bool verbose;
+	int err;
+
+	err = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, bitset,
+			       bitset_policy, NULL);
+	if (err < 0)
+		return err;
+
+	verbose = tb[ETHTOOL_A_BITSET_BITS];
+	if (verbose) {
+		if (tb[ETHTOOL_A_BITSET_VALUE] || tb[ETHTOOL_A_BITSET_MASK])
+			return -EINVAL;
+	} else if (!tb[ETHTOOL_A_BITSET_SIZE] || !tb[ETHTOOL_A_BITSET_VALUE]) {
+		return -EINVAL;
+	}
+
+	*compact = !verbose;
+	return 0;
+}
+
+/**
+ * ethnl_name_to_idx() - look up string index for a name
+ * @names: array of ETH_GSTRING_LEN sized strings, may be NULL
+ * @n_names: number of entries in @names
+ * @name: null terminated name to search for
+ *
+ * Return: index of the first matching entry, -ENOENT if there is none
+ */
+static int ethnl_name_to_idx(ethnl_string_array_t names, unsigned int n_names,
+			     const char *name)
+{
+	unsigned int idx = 0;
+
+	if (!names)
+		return -ENOENT;
+
+	while (idx < n_names) {
+		/* names[idx] may lack a terminator, so bound the comparison */
+		if (strlen(name) <= ETH_GSTRING_LEN &&
+		    !strncmp(names[idx], name, ETH_GSTRING_LEN))
+			return idx;
+		idx++;
+	}
+	return -ENOENT;
+}
+
+static int ethnl_parse_bit(unsigned int *index, bool *val, unsigned int nbits,
+ const struct nlattr *bit_attr, bool no_mask,
+ ethnl_string_array_t names,
+ struct netlink_ext_ack *extack)
+{ /* parse one bit nest into *index and *val; 0 or negative error code */
+ struct nlattr *tb[ARRAY_SIZE(bit_policy)];
+ int ret, idx;
+
+ ret = nla_parse_nested(tb, ARRAY_SIZE(bit_policy) - 1, bit_attr,
+ bit_policy, extack);
+ if (ret < 0)
+ return ret;
+
+ if (tb[ETHTOOL_A_BITSET_BIT_INDEX]) { /* index given; a name must agree */
+ const char *name;
+
+ idx = nla_get_u32(tb[ETHTOOL_A_BITSET_BIT_INDEX]);
+ if (idx >= nbits) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[ETHTOOL_A_BITSET_BIT_INDEX],
+ "bit index too high");
+ return -EOPNOTSUPP;
+ }
+ name = names ? names[idx] : NULL;
+ if (tb[ETHTOOL_A_BITSET_BIT_NAME] && name &&
+ strncmp(nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME]), name,
+ nla_len(tb[ETHTOOL_A_BITSET_BIT_NAME]))) {
+ NL_SET_ERR_MSG_ATTR(extack, bit_attr,
+ "bit index and name mismatch");
+ return -EINVAL;
+ }
+ } else if (tb[ETHTOOL_A_BITSET_BIT_NAME]) { /* name only: look it up */
+ idx = ethnl_name_to_idx(names, nbits,
+ nla_data(tb[ETHTOOL_A_BITSET_BIT_NAME]));
+ if (idx < 0) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[ETHTOOL_A_BITSET_BIT_NAME],
+ "bit name not found");
+ return -EOPNOTSUPP;
+ }
+ } else {
+ NL_SET_ERR_MSG_ATTR(extack, bit_attr,
+ "neither bit index nor name specified");
+ return -EINVAL;
+ }
+
+ *index = idx;
+ *val = no_mask || tb[ETHTOOL_A_BITSET_BIT_VALUE]; /* in a list every bit is set */
+ return 0;
+}
+
+/**
+ * ethnl_bitmap32_equal() - Compare two bitmaps
+ * @map1: first bitmap
+ * @map2: second bitmap
+ * @nbits: number of leading bits taking part in the comparison
+ *
+ * Return: true if the first @nbits bits of both maps match, false if not
+ */
+static bool ethnl_bitmap32_equal(const u32 *map1, const u32 *map2,
+				 unsigned int nbits)
+{
+	unsigned int words = nbits / 32, tail = nbits % 32;
+
+	if (memcmp(map1, map2, words * sizeof(u32)))
+		return false;
+	/* Compare just the used low bits of a trailing partial word. */
+	return !tail || !((map1[words] ^ map2[words]) & ethnl_lower_bits(tail));
+}
+
+static int
+ethnl_update_bitset32_verbose(u32 *bitmap, unsigned int nbits,
+ const struct nlattr *attr, struct nlattr **tb,
+ ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod)
+{ /* apply a verbose (bit by bit) bitset nest to @bitmap */
+ u32 *saved_bitmap = NULL;
+ struct nlattr *bit_attr;
+ bool no_mask;
+ int rem;
+ int ret;
+
+ if (tb[ETHTOOL_A_BITSET_VALUE]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE],
+ "value only allowed in compact bitset");
+ return -EINVAL;
+ }
+ if (tb[ETHTOOL_A_BITSET_MASK]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK],
+ "mask only allowed in compact bitset");
+ return -EINVAL;
+ }
+
+ no_mask = tb[ETHTOOL_A_BITSET_NOMASK];
+ if (no_mask) {
+ unsigned int nwords = DIV_ROUND_UP(nbits, 32);
+ unsigned int nbytes = nwords * sizeof(u32);
+ bool dummy;
+
+ /* A bit list replaces the whole bitmap (no mask part): save the
+ * old contents for the final comparison, then start from zero.
+ */
+ saved_bitmap = kcalloc(nwords, sizeof(u32), GFP_KERNEL);
+ if (!saved_bitmap)
+ return -ENOMEM;
+ memcpy(saved_bitmap, bitmap, nbytes);
+ ethnl_bitmap32_clear(bitmap, 0, nbits, &dummy);
+ }
+
+ nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) {
+ bool old_val, new_val;
+ unsigned int idx;
+
+ if (nla_type(bit_attr) != ETHTOOL_A_BITSET_BITS_BIT) {
+ NL_SET_ERR_MSG_ATTR(extack, bit_attr,
+ "only ETHTOOL_A_BITSET_BITS_BIT allowed in ETHTOOL_A_BITSET_BITS");
+ kfree(saved_bitmap);
+ return -EINVAL;
+ }
+ ret = ethnl_parse_bit(&idx, &new_val, nbits, bit_attr, no_mask,
+ names, extack);
+ if (ret < 0) {
+ kfree(saved_bitmap);
+ return ret;
+ }
+ old_val = bitmap[idx / 32] & ((u32)1 << (idx % 32));
+ if (new_val != old_val) {
+ if (new_val)
+ bitmap[idx / 32] |= ((u32)1 << (idx % 32));
+ else
+ bitmap[idx / 32] &= ~((u32)1 << (idx % 32));
+ if (!no_mask) /* for a list, @mod is decided after the loop */
+ *mod = true;
+ }
+ }
+
+ if (no_mask && !ethnl_bitmap32_equal(saved_bitmap, bitmap, nbits))
+ *mod = true;
+
+ kfree(saved_bitmap); /* NULL unless a bit list was processed */
+ return 0;
+}
+
+static int ethnl_compact_sanity_checks(unsigned int nbits,
+ const struct nlattr *nest,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{ /* validate a compact bitset nest against a kernel bitmap of @nbits bits */
+ bool no_mask = tb[ETHTOOL_A_BITSET_NOMASK];
+ unsigned int attr_nbits, attr_nwords;
+ const struct nlattr *test_attr;
+
+ if (no_mask && tb[ETHTOOL_A_BITSET_MASK]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK],
+ "mask not allowed in list bitset");
+ return -EINVAL;
+ }
+ if (!tb[ETHTOOL_A_BITSET_SIZE]) {
+ NL_SET_ERR_MSG_ATTR(extack, nest,
+ "missing size in compact bitset");
+ return -EINVAL;
+ }
+ if (!tb[ETHTOOL_A_BITSET_VALUE]) {
+ NL_SET_ERR_MSG_ATTR(extack, nest,
+ "missing value in compact bitset");
+ return -EINVAL;
+ }
+ if (!no_mask && !tb[ETHTOOL_A_BITSET_MASK]) {
+ NL_SET_ERR_MSG_ATTR(extack, nest,
+ "missing mask in compact nonlist bitset");
+ return -EINVAL;
+ }
+
+ attr_nbits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]);
+ attr_nwords = DIV_ROUND_UP(attr_nbits, 32);
+ if (nla_len(tb[ETHTOOL_A_BITSET_VALUE]) != attr_nwords * sizeof(u32)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE],
+ "bitset value length does not match size");
+ return -EINVAL;
+ }
+ if (tb[ETHTOOL_A_BITSET_MASK] &&
+ nla_len(tb[ETHTOOL_A_BITSET_MASK]) != attr_nwords * sizeof(u32)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK],
+ "bitset mask length does not match size");
+ return -EINVAL;
+ }
+ if (attr_nbits <= nbits) /* no bits beyond the kernel bitmap to inspect */
+ return 0;
+
+ test_attr = no_mask ? tb[ETHTOOL_A_BITSET_VALUE] : /* list: value is tested */
+ tb[ETHTOOL_A_BITSET_MASK];
+ if (ethnl_bitmap32_not_zero(nla_data(test_attr), nbits, attr_nbits)) {
+ NL_SET_ERR_MSG_ATTR(extack, test_attr,
+ "cannot modify bits past kernel bitset size");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * ethnl_update_bitset32() - Apply a bitset nest to a u32 based bitmap
+ * @bitmap: bitmap to update
+ * @nbits: size of the updated bitmap in bits
+ * @attr: nest attribute to parse and apply
+ * @names: array of bit names; may be null for compact format
+ * @extack: extack for error reporting
+ * @mod: set this to true if bitmap is modified, leave as it is if not
+ *
+ * Apply bitset nested attribute to a bitmap. If the attribute represents
+ * a bit list, @bitmap is set to its contents; otherwise, bits in mask are
+ * set to values from value. Bitmaps in the attribute may be longer than
+ * @nbits but the message must not request modifying any bits past @nbits.
+ *
+ * Return: negative error code on failure, 0 on success
+ */
+int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits,
+ const struct nlattr *attr, ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod)
+{
+ struct nlattr *tb[ARRAY_SIZE(bitset_policy)];
+ unsigned int change_bits;
+ bool no_mask;
+ int ret;
+
+ if (!attr) /* no attribute: nothing to apply, not an error */
+ return 0;
+ ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr,
+ bitset_policy, extack);
+ if (ret < 0)
+ return ret;
+
+ if (tb[ETHTOOL_A_BITSET_BITS]) /* verbose form has its own handler */
+ return ethnl_update_bitset32_verbose(bitmap, nbits, attr, tb,
+ names, extack, mod);
+ ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack);
+ if (ret < 0)
+ return ret;
+
+ no_mask = tb[ETHTOOL_A_BITSET_NOMASK];
+ change_bits = min_t(unsigned int,
+ nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]), nbits);
+ ethnl_bitmap32_update(bitmap, change_bits,
+ nla_data(tb[ETHTOOL_A_BITSET_VALUE]),
+ no_mask ? NULL :
+ nla_data(tb[ETHTOOL_A_BITSET_MASK]),
+ mod);
+ if (no_mask && change_bits < nbits) /* a shorter list clears the remaining bits */
+ ethnl_bitmap32_clear(bitmap, change_bits, nbits, mod);
+
+ return 0;
+}
+
+/**
+ * ethnl_parse_bitset() - Compute effective value and mask from bitset nest
+ * @val: unsigned long based bitmap to put value into
+ * @mask: unsigned long based bitmap to put mask into
+ * @nbits: size of @val and @mask bitmaps
+ * @attr: nest attribute to parse and apply
+ * @names: array of bit names; may be null for compact format
+ * @extack: extack for error reporting
+ *
+ * Provide @nbits size long bitmaps for value and mask so that
+ * x = (val & mask) | (x & ~mask) would modify any @nbits sized bitmap x
+ * the same way ethnl_update_bitset() with the same bitset attribute would.
+ *
+ * Return: negative error code on failure, 0 on success
+ */
+int ethnl_parse_bitset(unsigned long *val, unsigned long *mask,
+ unsigned int nbits, const struct nlattr *attr,
+ ethnl_string_array_t names,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[ARRAY_SIZE(bitset_policy)];
+ const struct nlattr *bit_attr;
+ bool no_mask;
+ int rem;
+ int ret;
+
+ if (!attr) /* no attribute: @val and @mask are left untouched */
+ return 0;
+ ret = nla_parse_nested(tb, ARRAY_SIZE(bitset_policy) - 1, attr,
+ bitset_policy, extack);
+ if (ret < 0)
+ return ret;
+ no_mask = tb[ETHTOOL_A_BITSET_NOMASK];
+
+ if (!tb[ETHTOOL_A_BITSET_BITS]) { /* compact form */
+ unsigned int change_bits;
+
+ ret = ethnl_compact_sanity_checks(nbits, attr, tb, extack);
+ if (ret < 0)
+ return ret;
+
+ change_bits = nla_get_u32(tb[ETHTOOL_A_BITSET_SIZE]);
+ if (change_bits > nbits)
+ change_bits = nbits;
+ bitmap_from_arr32(val, nla_data(tb[ETHTOOL_A_BITSET_VALUE]),
+ change_bits);
+ if (change_bits < nbits) /* bits the attribute does not cover */
+ bitmap_clear(val, change_bits, nbits - change_bits);
+ if (no_mask) {
+ bitmap_fill(mask, nbits); /* a list selects every bit */
+ } else {
+ bitmap_from_arr32(mask,
+ nla_data(tb[ETHTOOL_A_BITSET_MASK]),
+ change_bits);
+ if (change_bits < nbits)
+ bitmap_clear(mask, change_bits,
+ nbits - change_bits);
+ }
+
+ return 0;
+ }
+
+ if (tb[ETHTOOL_A_BITSET_VALUE]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_VALUE],
+ "value only allowed in compact bitset");
+ return -EINVAL;
+ }
+ if (tb[ETHTOOL_A_BITSET_MASK]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_BITSET_MASK],
+ "mask only allowed in compact bitset");
+ return -EINVAL;
+ }
+
+ bitmap_zero(val, nbits);
+ if (no_mask)
+ bitmap_fill(mask, nbits);
+ else
+ bitmap_zero(mask, nbits);
+
+ nla_for_each_nested(bit_attr, tb[ETHTOOL_A_BITSET_BITS], rem) {
+ unsigned int idx;
+ bool bit_val;
+
+ ret = ethnl_parse_bit(&idx, &bit_val, nbits, bit_attr, no_mask,
+ names, extack);
+ if (ret < 0)
+ return ret;
+ if (bit_val)
+ __set_bit(idx, val);
+ if (!no_mask) /* for a list the mask is already all ones */
+ __set_bit(idx, mask);
+ }
+
+ return 0;
+}
+
+#if BITS_PER_LONG == 64 && defined(__BIG_ENDIAN)
+
+/* 64-bit big endian architectures are the only case when u32 based bitmaps
+ * and unsigned long based bitmaps have different memory layout so that we
+ * cannot simply cast the latter to the former and need actual wrappers
+ * converting the latter to the former.
+ *
+ * To reduce the number of slab allocations, the wrappers use fixed size local
+ * variables for bitmaps up to ETHNL_SMALL_BITMAP_BITS bits which is the
+ * majority of bitmaps used by ethtool.
+ */
+#define ETHNL_SMALL_BITMAP_BITS 128
+#define ETHNL_SMALL_BITMAP_WORDS DIV_ROUND_UP(ETHNL_SMALL_BITMAP_BITS, 32)
+
+int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask,
+		      unsigned int nbits, ethnl_string_array_t names,
+		      bool compact)
+{
+	u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS];
+	u32 small_val32[ETHNL_SMALL_BITMAP_WORDS];
+	bool on_heap = nbits > ETHNL_SMALL_BITMAP_BITS;
+	u32 *mask32 = small_mask32;
+	u32 *val32 = small_val32;
+	int ret;
+
+	/* Large bitmaps get one allocation holding value, then mask. */
+	if (on_heap) {
+		unsigned int nwords = DIV_ROUND_UP(nbits, 32);
+
+		val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL);
+		if (!val32)
+			return -ENOMEM;
+		mask32 = val32 + nwords;
+	}
+
+	bitmap_to_arr32(val32, val, nbits);
+	if (mask)
+		bitmap_to_arr32(mask32, mask, nbits);
+	else
+		mask32 = NULL;
+
+	/* A NULL mask is handed on as NULL, not as an unset buffer. */
+	ret = ethnl_bitset32_size(val32, mask32, nbits, names, compact);
+	if (on_heap)
+		kfree(val32);
+
+	return ret;
+}
+
+int ethnl_put_bitset(struct sk_buff *skb, int attrtype,
+		     const unsigned long *val, const unsigned long *mask,
+		     unsigned int nbits, ethnl_string_array_t names,
+		     bool compact)
+{
+	u32 small_mask32[ETHNL_SMALL_BITMAP_WORDS];
+	u32 small_val32[ETHNL_SMALL_BITMAP_WORDS];
+	bool on_heap = nbits > ETHNL_SMALL_BITMAP_BITS;
+	u32 *mask32 = small_mask32;
+	u32 *val32 = small_val32;
+	int ret;
+
+	/* Large bitmaps get one allocation holding value, then mask. */
+	if (on_heap) {
+		unsigned int nwords = DIV_ROUND_UP(nbits, 32);
+
+		val32 = kmalloc_array(2 * nwords, sizeof(u32), GFP_KERNEL);
+		if (!val32)
+			return -ENOMEM;
+		mask32 = val32 + nwords;
+	}
+
+	bitmap_to_arr32(val32, val, nbits);
+	if (mask)
+		bitmap_to_arr32(mask32, mask, nbits);
+	else
+		mask32 = NULL;
+
+	/* A NULL mask is handed on as NULL, not as an unset buffer. */
+	ret = ethnl_put_bitset32(skb, attrtype, val32, mask32, nbits, names,
+				 compact);
+	if (on_heap)
+		kfree(val32);
+
+	return ret;
+}
+
+int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits,
+			const struct nlattr *attr, ethnl_string_array_t names,
+			struct netlink_ext_ack *extack, bool *mod)
+{
+	u32 small_bitmap32[ETHNL_SMALL_BITMAP_WORDS];
+	bool on_heap = nbits > ETHNL_SMALL_BITMAP_BITS;
+	u32 *words = small_bitmap32;
+	bool changed = false;
+	int ret;
+
+	if (on_heap) {
+		words = kmalloc_array(DIV_ROUND_UP(nbits, 32), sizeof(u32),
+				      GFP_KERNEL);
+		if (!words)
+			return -ENOMEM;
+	}
+
+	bitmap_to_arr32(words, bitmap, nbits);
+	ret = ethnl_update_bitset32(words, nbits, attr, names, extack,
+				    &changed);
+	if (changed) {
+		bitmap_from_arr32(bitmap, words, nbits);
+		*mod = true;
+	}
+
+	if (on_heap)
+		kfree(words);
+
+	return ret;
+}
+
+#else
+
+/* On little endian 64-bit and all 32-bit architectures, an unsigned long
+ * based bitmap can be interpreted as u32 based one using a simple cast.
+ */
+
+int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask,
+ unsigned int nbits, ethnl_string_array_t names,
+ bool compact)
+{ /* no conversion here: the bitmaps are reinterpreted as u32 arrays */
+ return ethnl_bitset32_size((const u32 *)val, (const u32 *)mask, nbits,
+ names, compact);
+}
+
+int ethnl_put_bitset(struct sk_buff *skb, int attrtype,
+ const unsigned long *val, const unsigned long *mask,
+ unsigned int nbits, ethnl_string_array_t names,
+ bool compact)
+{ /* no conversion here: the bitmaps are reinterpreted as u32 arrays */
+ return ethnl_put_bitset32(skb, attrtype, (const u32 *)val,
+ (const u32 *)mask, nbits, names, compact);
+}
+
+int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits,
+ const struct nlattr *attr, ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod)
+{ /* the caller's bitmap is updated in place through a u32 view */
+ return ethnl_update_bitset32((u32 *)bitmap, nbits, attr, names, extack,
+ mod);
+}
+
+#endif /* BITS_PER_LONG == 64 && defined(__BIG_ENDIAN) */
diff --git a/net/ethtool/bitset.h b/net/ethtool/bitset.h
new file mode 100644
index 000000000000..c2c2e0051d00
--- /dev/null
+++ b/net/ethtool/bitset.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _NET_ETHTOOL_BITSET_H
+#define _NET_ETHTOOL_BITSET_H
+
+#define ETHNL_MAX_BITSET_SIZE S16_MAX
+
+typedef const char (*const ethnl_string_array_t)[ETH_GSTRING_LEN];
+
+int ethnl_bitset_is_compact(const struct nlattr *bitset, bool *compact);
+int ethnl_bitset_size(const unsigned long *val, const unsigned long *mask,
+ unsigned int nbits, ethnl_string_array_t names,
+ bool compact);
+int ethnl_bitset32_size(const u32 *val, const u32 *mask, unsigned int nbits,
+ ethnl_string_array_t names, bool compact);
+int ethnl_put_bitset(struct sk_buff *skb, int attrtype,
+ const unsigned long *val, const unsigned long *mask,
+ unsigned int nbits, ethnl_string_array_t names,
+ bool compact);
+int ethnl_put_bitset32(struct sk_buff *skb, int attrtype, const u32 *val,
+ const u32 *mask, unsigned int nbits,
+ ethnl_string_array_t names, bool compact);
+int ethnl_update_bitset(unsigned long *bitmap, unsigned int nbits,
+ const struct nlattr *attr, ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod);
+int ethnl_update_bitset32(u32 *bitmap, unsigned int nbits,
+ const struct nlattr *attr, ethnl_string_array_t names,
+ struct netlink_ext_ack *extack, bool *mod);
+int ethnl_parse_bitset(unsigned long *val, unsigned long *mask,
+ unsigned int nbits, const struct nlattr *attr,
+ ethnl_string_array_t names,
+ struct netlink_ext_ack *extack);
+
+#endif /* _NET_ETHTOOL_BITSET_H */
diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c
new file mode 100644
index 000000000000..0364b8fb577b
--- /dev/null
+++ b/net/ethtool/cabletest.c
@@ -0,0 +1,453 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/phy.h>
+#include <linux/ethtool_netlink.h>
+#include <net/netdev_lock.h>
+#include "netlink.h"
+#include "common.h"
+
+/* 802.3 standard allows 100 meters for BaseT cables. However longer
+ * cables might work, depending on the quality of the cables and the
+ * PHY. So allow testing for up to 150 meters.
+ */
+#define MAX_CABLE_LENGTH_CM (150 * 100)
+
+const struct nla_policy ethnl_cable_test_act_policy[] = {
+ [ETHTOOL_A_CABLE_TEST_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_phy),
+};
+
+static int ethnl_cable_test_started(struct phy_device *phydev, u8 cmd)
+{
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+ void *ehdr;
+
+ skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ goto out;
+
+ ehdr = ethnl_bcastmsg_put(skb, cmd);
+ if (!ehdr) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ err = ethnl_fill_reply_header(skb, phydev->attached_dev,
+ ETHTOOL_A_CABLE_TEST_NTF_HEADER);
+ if (err)
+ goto out;
+
+ err = nla_put_u8(skb, ETHTOOL_A_CABLE_TEST_NTF_STATUS,
+ ETHTOOL_A_CABLE_TEST_NTF_STATUS_STARTED);
+ if (err)
+ goto out;
+
+ genlmsg_end(skb, ehdr);
+
+ return ethnl_multicast(skb, phydev->attached_dev);
+
+out:
+ nlmsg_free(skb);
+ phydev_err(phydev, "%s: Error %pe\n", __func__, ERR_PTR(err));
+
+ return err;
+}
+
+int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ethnl_req_info req_info = {};
+ const struct ethtool_phy_ops *ops;
+ struct nlattr **tb = info->attrs;
+ struct phy_device *phydev;
+ struct net_device *dev;
+ int ret;
+
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_CABLE_TEST_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+
+ dev = req_info.dev;
+
+ rtnl_lock();
+ netdev_lock_ops(dev);
+ phydev = ethnl_req_get_phydev(&req_info, tb,
+ ETHTOOL_A_CABLE_TEST_HEADER,
+ info->extack);
+ if (IS_ERR_OR_NULL(phydev)) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
+ ops = ethtool_phy_ops;
+ if (!ops || !ops->start_cable_test) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = ops->start_cable_test(phydev, info->extack);
+
+ ethnl_ops_complete(dev);
+
+ if (!ret)
+ ethnl_cable_test_started(phydev, ETHTOOL_MSG_CABLE_TEST_NTF);
+
+out_unlock:
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+ ethnl_parse_header_dev_put(&req_info);
+ return ret;
+}
+
+int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd)
+{
+ int err = -ENOMEM;
+ /* Pre-build the COMPLETED notification of type @cmd in phydev->skb. */
+ /* One TDR sample occupies 20 bytes. For a 150 meter cable,
+ * with four pairs, around 12K is needed.
+ */
+ phydev->skb = genlmsg_new(SZ_16K, GFP_KERNEL);
+ if (!phydev->skb)
+ goto out;
+ /* Start the broadcast message; its header is kept in phydev->ehdr. */
+ phydev->ehdr = ethnl_bcastmsg_put(phydev->skb, cmd);
+ if (!phydev->ehdr) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+
+ err = ethnl_fill_reply_header(phydev->skb, phydev->attached_dev,
+ ETHTOOL_A_CABLE_TEST_NTF_HEADER);
+ if (err)
+ goto out;
+
+ err = nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_TEST_NTF_STATUS,
+ ETHTOOL_A_CABLE_TEST_NTF_STATUS_COMPLETED);
+ if (err)
+ goto out;
+ /* Open the result nest; ethnl_cable_test_finished() closes it later. */
+ phydev->nest = nla_nest_start(phydev->skb,
+ ETHTOOL_A_CABLE_TEST_NTF_NEST);
+ if (!phydev->nest) {
+ err = -EMSGSIZE;
+ goto out;
+ }
+ /* Success: skb, ehdr and nest stay stored in @phydev. */
+ return 0;
+
+out:
+ nlmsg_free(phydev->skb); /* phydev->skb is NULL here if genlmsg_new() failed */
+ phydev->skb = NULL;
+ return err;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_alloc);
+
+void ethnl_cable_test_free(struct phy_device *phydev)
+{
+ nlmsg_free(phydev->skb);
+ phydev->skb = NULL;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_free);
+
+void ethnl_cable_test_finished(struct phy_device *phydev)
+{
+ nla_nest_end(phydev->skb, phydev->nest); /* nest opened in ethnl_cable_test_alloc() */
+ /* Finalize the message whose header ethnl_cable_test_alloc() stored. */
+ genlmsg_end(phydev->skb, phydev->ehdr);
+ /* NOTE(review): phydev->skb stays set after the send - confirm ownership. */
+ ethnl_multicast(phydev->skb, phydev->attached_dev);
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_finished);
+
+int ethnl_cable_test_result_with_src(struct phy_device *phydev, u8 pair,
+ u8 result, u32 src)
+{
+ struct nlattr *nest;
+ int ret = -EMSGSIZE;
+
+ nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_NEST_RESULT);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_RESULT_PAIR, pair))
+ goto err;
+ if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_RESULT_CODE, result))
+ goto err;
+ if (src != ETHTOOL_A_CABLE_INF_SRC_UNSPEC) {
+ if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_RESULT_SRC, src))
+ goto err;
+ }
+
+ nla_nest_end(phydev->skb, nest);
+ return 0;
+
+err:
+ nla_nest_cancel(phydev->skb, nest);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_result_with_src);
+
+int ethnl_cable_test_fault_length_with_src(struct phy_device *phydev, u8 pair,
+ u32 cm, u32 src)
+{
+ struct nlattr *nest;
+ int ret = -EMSGSIZE;
+
+ nest = nla_nest_start(phydev->skb,
+ ETHTOOL_A_CABLE_NEST_FAULT_LENGTH);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, pair))
+ goto err;
+ if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_CM, cm))
+ goto err;
+ if (src != ETHTOOL_A_CABLE_INF_SRC_UNSPEC) {
+ if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_SRC,
+ src))
+ goto err;
+ }
+
+ nla_nest_end(phydev->skb, nest);
+ return 0;
+
+err:
+ nla_nest_cancel(phydev->skb, nest);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length_with_src);
+
+static const struct nla_policy cable_test_tdr_act_cfg_policy[] = {
+ [ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST] = { .type = NLA_U32 },
+ [ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST] = { .type = NLA_U32 },
+ [ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP] = { .type = NLA_U32 },
+ [ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR] = { .type = NLA_U8 },
+};
+
+const struct nla_policy ethnl_cable_test_tdr_act_policy[] = {
+ [ETHTOOL_A_CABLE_TEST_TDR_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_phy),
+ [ETHTOOL_A_CABLE_TEST_TDR_CFG] = { .type = NLA_NESTED },
+};
+
+/* CABLE_TEST_TDR_ACT: parse and validate the optional TDR CFG nest into @cfg */
+static int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest,
+ struct genl_info *info,
+ struct phy_tdr_config *cfg)
+{
+ struct nlattr *tb[ARRAY_SIZE(cable_test_tdr_act_cfg_policy)];
+ int ret;
+ /* Defaults: from 100 cm to MAX_CABLE_LENGTH_CM in 100 cm steps, all pairs. */
+ cfg->first = 100;
+ cfg->step = 100;
+ cfg->last = MAX_CABLE_LENGTH_CM;
+ cfg->pair = PHY_PAIR_ALL;
+ /* No CFG nest in the request: the defaults are used unchecked. */
+ if (!nest)
+ return 0;
+
+ ret = nla_parse_nested(tb,
+ ARRAY_SIZE(cable_test_tdr_act_cfg_policy) - 1,
+ nest, cable_test_tdr_act_cfg_policy,
+ info->extack);
+ if (ret < 0)
+ return ret;
+ /* Each attribute that is present overrides the matching default. */
+ if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST])
+ cfg->first = nla_get_u32(
+ tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]);
+
+ if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST])
+ cfg->last = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]);
+
+ if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP])
+ cfg->step = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]);
+
+ if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]) {
+ cfg->pair = nla_get_u8(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]);
+ if (cfg->pair > ETHTOOL_A_CABLE_PAIR_D) {
+ NL_SET_ERR_MSG_ATTR(
+ info->extack,
+ tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR],
+ "invalid pair parameter");
+ return -EINVAL;
+ }
+ }
+ /* Range checks run on the merged result of defaults and request. */
+ if (cfg->first > MAX_CABLE_LENGTH_CM) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST],
+ "invalid first parameter");
+ return -EINVAL;
+ }
+
+ if (cfg->last > MAX_CABLE_LENGTH_CM) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST],
+ "invalid last parameter");
+ return -EINVAL;
+ }
+
+ if (cfg->first > cfg->last) {
+ NL_SET_ERR_MSG(info->extack, "invalid first/last parameter");
+ return -EINVAL;
+ }
+
+ if (!cfg->step) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP],
+ "invalid step parameter");
+ return -EINVAL;
+ }
+ /* NOTE(review): step is non-zero here, so first == last is rejected too. */
+ if (cfg->step > (cfg->last - cfg->first)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP],
+ "step parameter too big");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ethnl_req_info req_info = {};
+ const struct ethtool_phy_ops *ops;
+ struct nlattr **tb = info->attrs;
+ struct phy_device *phydev;
+ struct phy_tdr_config cfg;
+ struct net_device *dev;
+ int ret;
+
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_CABLE_TEST_TDR_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+
+ dev = req_info.dev;
+
+ ret = ethnl_act_cable_test_tdr_cfg(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG],
+ info, &cfg);
+ if (ret)
+ goto out_dev_put;
+
+ rtnl_lock();
+ netdev_lock_ops(dev);
+ phydev = ethnl_req_get_phydev(&req_info, tb,
+ ETHTOOL_A_CABLE_TEST_TDR_HEADER,
+ info->extack);
+ if (IS_ERR_OR_NULL(phydev)) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
+ ops = ethtool_phy_ops;
+ if (!ops || !ops->start_cable_test_tdr) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_unlock;
+
+ ret = ops->start_cable_test_tdr(phydev, info->extack, &cfg);
+
+ ethnl_ops_complete(dev);
+
+ if (!ret)
+ ethnl_cable_test_started(phydev,
+ ETHTOOL_MSG_CABLE_TEST_TDR_NTF);
+
+out_unlock:
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+out_dev_put:
+ ethnl_parse_header_dev_put(&req_info);
+ return ret;
+}
+
+int ethnl_cable_test_amplitude(struct phy_device *phydev,
+ u8 pair, s16 mV)
+{
+ struct nlattr *nest;
+ int ret = -EMSGSIZE;
+ /* Append one AMPLITUDE nest (pair, mV) to the message in phydev->skb. */
+ nest = nla_nest_start(phydev->skb,
+ ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE);
+ if (!nest)
+ return -EMSGSIZE;
+ /* NOTE(review): @mV is s16 but is written below with nla_put_u16(). */
+ if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_AMPLITUDE_PAIR, pair))
+ goto err;
+ if (nla_put_u16(phydev->skb, ETHTOOL_A_CABLE_AMPLITUDE_mV, mV))
+ goto err;
+
+ nla_nest_end(phydev->skb, nest);
+ return 0;
+
+err:
+ nla_nest_cancel(phydev->skb, nest); /* either put failed: returns -EMSGSIZE */
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_amplitude);
+
+int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV)
+{
+ struct nlattr *nest;
+ int ret = -EMSGSIZE;
+
+ nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_PULSE);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u16(phydev->skb, ETHTOOL_A_CABLE_PULSE_mV, mV))
+ goto err;
+
+ nla_nest_end(phydev->skb, nest);
+ return 0;
+
+err:
+ nla_nest_cancel(phydev->skb, nest);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_pulse);
+
+int ethnl_cable_test_step(struct phy_device *phydev, u32 first, u32 last,
+ u32 step)
+{
+ struct nlattr *nest;
+ int ret = -EMSGSIZE;
+
+ nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_STEP);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE,
+ first))
+ goto err;
+
+ if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_LAST_DISTANCE, last))
+ goto err;
+
+ if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_STEP_DISTANCE, step))
+ goto err;
+
+ nla_nest_end(phydev->skb, nest);
+ return 0;
+
+err:
+ nla_nest_cancel(phydev->skb, nest);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ethnl_cable_test_step);
diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c
new file mode 100644
index 000000000000..ca4f80282448
--- /dev/null
+++ b/net/ethtool/channels.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <net/xdp_sock_drv.h>
+
+#include "netlink.h"
+#include "common.h"
+
+struct channels_req_info {
+ struct ethnl_req_info base;
+};
+
+struct channels_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_channels channels;
+};
+
+#define CHANNELS_REPDATA(__reply_base) \
+ container_of(__reply_base, struct channels_reply_data, base)
+
+const struct nla_policy ethnl_channels_get_policy[] = {
+ [ETHTOOL_A_CHANNELS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int channels_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct channels_reply_data *data = CHANNELS_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ if (!dev->ethtool_ops->get_channels)
+ return -EOPNOTSUPP;
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ dev->ethtool_ops->get_channels(dev, &data->channels);
+ ethnl_ops_complete(dev);
+
+ return 0;
+}
+
+static int channels_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ return nla_total_size(sizeof(u32)) + /* _CHANNELS_RX_MAX */
+ nla_total_size(sizeof(u32)) + /* _CHANNELS_TX_MAX */
+ nla_total_size(sizeof(u32)) + /* _CHANNELS_OTHER_MAX */
+ nla_total_size(sizeof(u32)) + /* _CHANNELS_COMBINED_MAX */
+ nla_total_size(sizeof(u32)) + /* _CHANNELS_RX_COUNT */
+ nla_total_size(sizeof(u32)) + /* _CHANNELS_TX_COUNT */
+ nla_total_size(sizeof(u32)) + /* _CHANNELS_OTHER_COUNT */
+ nla_total_size(sizeof(u32)); /* _CHANNELS_COMBINED_COUNT */
+}
+
+static int channels_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct channels_reply_data *data = CHANNELS_REPDATA(reply_base);
+ const struct ethtool_channels *channels = &data->channels;
+
+ if ((channels->max_rx &&
+ (nla_put_u32(skb, ETHTOOL_A_CHANNELS_RX_MAX,
+ channels->max_rx) ||
+ nla_put_u32(skb, ETHTOOL_A_CHANNELS_RX_COUNT,
+ channels->rx_count))) ||
+ (channels->max_tx &&
+ (nla_put_u32(skb, ETHTOOL_A_CHANNELS_TX_MAX,
+ channels->max_tx) ||
+ nla_put_u32(skb, ETHTOOL_A_CHANNELS_TX_COUNT,
+ channels->tx_count))) ||
+ (channels->max_other &&
+ (nla_put_u32(skb, ETHTOOL_A_CHANNELS_OTHER_MAX,
+ channels->max_other) ||
+ nla_put_u32(skb, ETHTOOL_A_CHANNELS_OTHER_COUNT,
+ channels->other_count))) ||
+ (channels->max_combined &&
+ (nla_put_u32(skb, ETHTOOL_A_CHANNELS_COMBINED_MAX,
+ channels->max_combined) ||
+ nla_put_u32(skb, ETHTOOL_A_CHANNELS_COMBINED_COUNT,
+ channels->combined_count))))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+/* CHANNELS_SET */
+
+const struct nla_policy ethnl_channels_set_policy[] = {
+ [ETHTOOL_A_CHANNELS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_CHANNELS_RX_COUNT] = { .type = NLA_U32 },
+ [ETHTOOL_A_CHANNELS_TX_COUNT] = { .type = NLA_U32 },
+ [ETHTOOL_A_CHANNELS_OTHER_COUNT] = { .type = NLA_U32 },
+ [ETHTOOL_A_CHANNELS_COMBINED_COUNT] = { .type = NLA_U32 },
+};
+
+static int
+ethnl_set_channels_validate(struct ethnl_req_info *req_info,
+ struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+
+ return ops->get_channels && ops->set_channels ? 1 : -EOPNOTSUPP;
+}
+
+static int
+ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ unsigned int from_channel, old_total, i;
+ bool mod = false, mod_combined = false;
+ struct net_device *dev = req_info->dev;
+ struct ethtool_channels channels = {};
+ struct nlattr **tb = info->attrs;
+ u32 err_attr;
+ int ret;
+ /* Start from the driver's current values; the request is overlaid on them. */
+ dev->ethtool_ops->get_channels(dev, &channels);
+ old_total = channels.combined_count +
+ max(channels.rx_count, channels.tx_count);
+ /* mod_combined is tracked separately; it picks the blamed attribute below. */
+ ethnl_update_u32(&channels.rx_count, tb[ETHTOOL_A_CHANNELS_RX_COUNT],
+ &mod);
+ ethnl_update_u32(&channels.tx_count, tb[ETHTOOL_A_CHANNELS_TX_COUNT],
+ &mod);
+ ethnl_update_u32(&channels.other_count,
+ tb[ETHTOOL_A_CHANNELS_OTHER_COUNT], &mod);
+ ethnl_update_u32(&channels.combined_count,
+ tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT], &mod_combined);
+ mod |= mod_combined;
+ if (!mod)
+ return 0;
+
+ /* ensure new channel counts are within limits */
+ if (channels.rx_count > channels.max_rx)
+ err_attr = ETHTOOL_A_CHANNELS_RX_COUNT;
+ else if (channels.tx_count > channels.max_tx)
+ err_attr = ETHTOOL_A_CHANNELS_TX_COUNT;
+ else if (channels.other_count > channels.max_other)
+ err_attr = ETHTOOL_A_CHANNELS_OTHER_COUNT;
+ else if (channels.combined_count > channels.max_combined)
+ err_attr = ETHTOOL_A_CHANNELS_COMBINED_COUNT;
+ else
+ err_attr = 0;
+ if (err_attr) {
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[err_attr],
+ "requested channel count exceeds maximum");
+ return -EINVAL;
+ }
+
+ /* ensure there is at least one RX and one TX channel */
+ if (!channels.combined_count && !channels.rx_count)
+ err_attr = ETHTOOL_A_CHANNELS_RX_COUNT;
+ else if (!channels.combined_count && !channels.tx_count)
+ err_attr = ETHTOOL_A_CHANNELS_TX_COUNT;
+ else
+ err_attr = 0;
+ if (err_attr) {
+ if (mod_combined) /* blame the combined count if the request changed it */
+ err_attr = ETHTOOL_A_CHANNELS_COMBINED_COUNT;
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[err_attr],
+ "requested channel counts would result in no RX or TX channel being configured");
+ return -EINVAL;
+ }
+
+ ret = ethtool_check_max_channel(dev, channels, info);
+ if (ret)
+ return ret;
+
+ /* Disabling channels, query zero-copy AF_XDP sockets */
+ from_channel = channels.combined_count +
+ min(channels.rx_count, channels.tx_count);
+ for (i = from_channel; i < old_total; i++)
+ if (xsk_get_pool_from_qid(dev, i)) {
+ GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets");
+ return -EINVAL;
+ }
+ /* Return 1 unless set_channels() fails; the no-change path above returns 0. */
+ ret = dev->ethtool_ops->set_channels(dev, &channels);
+ return ret < 0 ? ret : 1;
+}
+
+const struct ethnl_request_ops ethnl_channels_request_ops = {
+ .request_cmd = ETHTOOL_MSG_CHANNELS_GET,
+ .reply_cmd = ETHTOOL_MSG_CHANNELS_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_CHANNELS_HEADER,
+ .req_info_size = sizeof(struct channels_req_info),
+ .reply_data_size = sizeof(struct channels_reply_data),
+
+ .prepare_data = channels_prepare_data,
+ .reply_size = channels_reply_size,
+ .fill_reply = channels_fill_reply,
+
+ .set_validate = ethnl_set_channels_validate,
+ .set = ethnl_set_channels,
+ .set_ntf_cmd = ETHTOOL_MSG_CHANNELS_NTF,
+};
diff --git a/net/ethtool/cmis.h b/net/ethtool/cmis.h
new file mode 100644
index 000000000000..4a9a946cabf0
--- /dev/null
+++ b/net/ethtool/cmis.h
@@ -0,0 +1,128 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#define ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH 120
+#define ETHTOOL_CMIS_CDB_EPL_MAX_PL_LENGTH 2048
+#define ETHTOOL_CMIS_CDB_CMD_PAGE 0x9F
+#define ETHTOOL_CMIS_CDB_PAGE_I2C_ADDR 0x50
+
+/**
+ * struct ethtool_cmis_cdb - CDB commands parameters
+ * @cmis_rev: CMIS revision major.
+ * @read_write_len_ext: Allowable additional number of byte octets to the LPL
+ * in READ or WRITE CDB commands.
+ * @max_completion_time: Maximum CDB command completion time in msec.
+ */
+struct ethtool_cmis_cdb {
+ u8 cmis_rev;
+ u8 read_write_len_ext;
+ u16 max_completion_time;
+};
+
+enum ethtool_cmis_cdb_cmd_id {
+ ETHTOOL_CMIS_CDB_CMD_QUERY_STATUS = 0x0000,
+ ETHTOOL_CMIS_CDB_CMD_MODULE_FEATURES = 0x0040,
+ ETHTOOL_CMIS_CDB_CMD_FW_MANAGMENT_FEATURES = 0x0041,
+ ETHTOOL_CMIS_CDB_CMD_START_FW_DOWNLOAD = 0x0101,
+ ETHTOOL_CMIS_CDB_CMD_WRITE_FW_BLOCK_LPL = 0x0103,
+ ETHTOOL_CMIS_CDB_CMD_WRITE_FW_BLOCK_EPL = 0x0104,
+ ETHTOOL_CMIS_CDB_CMD_COMPLETE_FW_DOWNLOAD = 0x0107,
+ ETHTOOL_CMIS_CDB_CMD_RUN_FW_IMAGE = 0x0109,
+ ETHTOOL_CMIS_CDB_CMD_COMMIT_FW_IMAGE = 0x010A,
+};
+
+/**
+ * struct ethtool_cmis_cdb_request - CDB commands request fields as described in
+ * the CMIS standard
+ * @id: Command ID.
+ * @epl_len: EPL memory length.
+ * @lpl_len: LPL memory length.
+ * @chk_code: Check code for the previous field and the payload.
+ * @resv1: Added to match the CMIS standard request continuity.
+ * @resv2: Added to match the CMIS standard request continuity.
+ * @payload: Payload for the CDB commands.
+ * @epl: Extended payload for the CDB commands.
+ */
+struct ethtool_cmis_cdb_request {
+ __be16 id;
+ struct_group(body,
+ __be16 epl_len;
+ u8 lpl_len;
+ u8 chk_code;
+ u8 resv1;
+ u8 resv2;
+ u8 payload[ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH];
+ );
+ u8 *epl; /* Everything above this field checksummed. */
+};
+
+#define CDB_F_COMPLETION_VALID BIT(0)
+#define CDB_F_STATUS_VALID BIT(1)
+#define CDB_F_MODULE_STATE_VALID BIT(2)
+
+/**
+ * struct ethtool_cmis_cdb_cmd_args - CDB commands execution arguments
+ * @req: CDB command fields as described in the CMIS standard.
+ * @max_duration: Maximum duration time for command completion in msec.
+ * @read_write_len_ext: Allowable additional number of byte octets to the LPL
+ * in READ or WRITE commands.
+ * @msleep_pre_rpl: Waiting time before checking reply in msec.
+ * @rpl_exp_len: Expected reply length in bytes.
+ * @flags: Validation flags for CDB commands.
+ * @err_msg: Error message to be sent to user space.
+ */
+struct ethtool_cmis_cdb_cmd_args {
+ struct ethtool_cmis_cdb_request req;
+ u16 max_duration;
+ u8 read_write_len_ext;
+ u16 msleep_pre_rpl; /* u16 like the compose_args() parameter; a u8 truncates 1000 */
+ u8 rpl_exp_len;
+ u8 flags;
+ char *err_msg;
+};
+
+/**
+ * struct ethtool_cmis_cdb_rpl_hdr - CDB commands reply header arguments
+ * @rpl_len: Reply length.
+ * @rpl_chk_code: Reply check code.
+ */
+struct ethtool_cmis_cdb_rpl_hdr {
+ u8 rpl_len;
+ u8 rpl_chk_code;
+};
+
+/**
+ * struct ethtool_cmis_cdb_rpl - CDB commands reply arguments
+ * @hdr: CDB commands reply header arguments.
+ * @payload: Payload for the CDB commands reply.
+ */
+struct ethtool_cmis_cdb_rpl {
+ struct ethtool_cmis_cdb_rpl_hdr hdr;
+ u8 payload[ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH];
+};
+
+u32 ethtool_cmis_get_max_lpl_size(u8 num_of_byte_octs);
+
+void ethtool_cmis_cdb_compose_args(struct ethtool_cmis_cdb_cmd_args *args,
+ enum ethtool_cmis_cdb_cmd_id cmd, u8 *lpl,
+ u8 lpl_len, u8 *epl, u16 epl_len,
+ u16 max_duration, u8 read_write_len_ext,
+ u16 msleep_pre_rpl, u8 rpl_exp_len,
+ u8 flags);
+
+void ethtool_cmis_cdb_check_completion_flag(u8 cmis_rev, u8 *flags);
+
+void ethtool_cmis_page_init(struct ethtool_module_eeprom *page_data,
+ u8 page, u32 offset, u32 length);
+
+struct ethtool_cmis_cdb *
+ethtool_cmis_cdb_init(struct net_device *dev,
+ const struct ethtool_module_fw_flash_params *params,
+ struct ethnl_module_fw_flash_ntf_params *ntf_params);
+void ethtool_cmis_cdb_fini(struct ethtool_cmis_cdb *cdb);
+
+int ethtool_cmis_wait_for_cond(struct net_device *dev, u8 flags, u8 flag,
+ u16 max_duration, u32 offset,
+ bool (*cond_success)(u8), bool (*cond_fail)(u8), u8 *state);
+
+int ethtool_cmis_cdb_execute_cmd(struct net_device *dev,
+ struct ethtool_cmis_cdb_cmd_args *args);
diff --git a/net/ethtool/cmis_cdb.c b/net/ethtool/cmis_cdb.c
new file mode 100644
index 000000000000..3057576bc81e
--- /dev/null
+++ b/net/ethtool/cmis_cdb.c
@@ -0,0 +1,666 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool.h>
+#include <linux/jiffies.h>
+
+#include "common.h"
+#include "module_fw.h"
+#include "cmis.h"
+
+/* For accessing the LPL field on page 9Fh, the allowable length extension is
+ * min(i, 15) byte octets where i specifies the allowable additional number of
+ * byte octets in a READ or a WRITE.
+ */
+u32 ethtool_cmis_get_max_lpl_size(u8 num_of_byte_octs)
+{
+ return 8 * (1 + min_t(u8, num_of_byte_octs, 15));
+}
+
+void ethtool_cmis_cdb_compose_args(struct ethtool_cmis_cdb_cmd_args *args,
+ enum ethtool_cmis_cdb_cmd_id cmd, u8 *lpl,
+ u8 lpl_len, u8 *epl, u16 epl_len,
+ u16 max_duration, u8 read_write_len_ext,
+ u16 msleep_pre_rpl, u8 rpl_exp_len, u8 flags)
+{
+ args->req.id = cpu_to_be16(cmd); /* id and epl_len are stored big-endian */
+ args->req.lpl_len = lpl_len;
+ if (lpl) /* NOTE(review): lpl_len is not checked against sizeof(payload) */
+ memcpy(args->req.payload, lpl, args->req.lpl_len);
+ if (epl) { /* req.epl_len and req.epl are left untouched when @epl is NULL */
+ args->req.epl_len = cpu_to_be16(epl_len);
+ args->req.epl = epl;
+ }
+ /* read_write_len_ext is stored as a byte count, not as the raw octet count. */
+ args->max_duration = max_duration;
+ args->read_write_len_ext =
+ ethtool_cmis_get_max_lpl_size(read_write_len_ext);
+ args->msleep_pre_rpl = msleep_pre_rpl;
+ args->rpl_exp_len = rpl_exp_len;
+ args->flags = flags;
+ args->err_msg = NULL;
+}
+
+void ethtool_cmis_page_init(struct ethtool_module_eeprom *page_data,
+ u8 page, u32 offset, u32 length)
+{
+ page_data->page = page;
+ page_data->offset = offset;
+ page_data->length = length;
+ page_data->i2c_address = ETHTOOL_CMIS_CDB_PAGE_I2C_ADDR;
+}
+
+#define CMIS_REVISION_PAGE 0x00
+#define CMIS_REVISION_OFFSET 0x01
+
+struct cmis_rev_rpl {
+ u8 rev;
+};
+
+static u8 cmis_rev_rpl_major(struct cmis_rev_rpl *rpl)
+{
+ return rpl->rev >> 4;
+}
+
+static int cmis_rev_major_get(struct net_device *dev, u8 *rev_major)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_module_eeprom page_data = {0};
+ struct netlink_ext_ack extack = {};
+ struct cmis_rev_rpl rpl = {};
+ int err;
+
+ ethtool_cmis_page_init(&page_data, CMIS_REVISION_PAGE,
+ CMIS_REVISION_OFFSET, sizeof(rpl));
+ page_data.data = (u8 *)&rpl;
+
+ err = ops->get_module_eeprom_by_page(dev, &page_data, &extack);
+ if (err < 0) {
+ if (extack._msg)
+ netdev_err(dev, "%s\n", extack._msg);
+ return err;
+ }
+
+ *rev_major = cmis_rev_rpl_major(&rpl);
+
+ return 0;
+}
+
+#define CMIS_CDB_ADVERTISEMENT_PAGE 0x01
+#define CMIS_CDB_ADVERTISEMENT_OFFSET 0xA3
+
+/* Based on section 8.4.11 "CDB Messaging Support Advertisement" in CMIS
+ * standard revision 5.2.
+ */
+struct cmis_cdb_advert_rpl {
+ u8 inst_supported;
+ u8 read_write_len_ext;
+ u8 resv1;
+ u8 resv2;
+};
+
+static u8 cmis_cdb_advert_rpl_inst_supported(struct cmis_cdb_advert_rpl *rpl)
+{
+ return rpl->inst_supported >> 6;
+}
+
+static int cmis_cdb_advertisement_get(struct ethtool_cmis_cdb *cdb,
+ struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *ntf_params)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_module_eeprom page_data = {};
+ struct cmis_cdb_advert_rpl rpl = {};
+ struct netlink_ext_ack extack = {};
+ int err;
+
+ ethtool_cmis_page_init(&page_data, CMIS_CDB_ADVERTISEMENT_PAGE,
+ CMIS_CDB_ADVERTISEMENT_OFFSET, sizeof(rpl));
+ page_data.data = (u8 *)&rpl;
+
+ err = ops->get_module_eeprom_by_page(dev, &page_data, &extack);
+ if (err < 0) {
+ if (extack._msg)
+ netdev_err(dev, "%s\n", extack._msg);
+ return err;
+ }
+
+ if (!cmis_cdb_advert_rpl_inst_supported(&rpl)) {
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "CDB functionality is not supported",
+ NULL);
+ return -EOPNOTSUPP;
+ }
+
+ cdb->read_write_len_ext = rpl.read_write_len_ext;
+
+ return 0;
+}
+
+#define CMIS_PASSWORD_ENTRY_PAGE 0x00
+#define CMIS_PASSWORD_ENTRY_OFFSET 0x7A
+
+struct cmis_password_entry_pl {
+ __be32 password;
+};
+
+/* See section 9.3.1 "CMD 0000h: Query Status" in CMIS standard revision 5.2.
+ * struct cmis_cdb_query_status_pl and struct cmis_cdb_query_status_rpl are
+ * structured layouts of the flat arrays,
+ * struct ethtool_cmis_cdb_request::payload and
+ * struct ethtool_cmis_cdb_rpl::payload respectively.
+ */
+struct cmis_cdb_query_status_pl {
+ u16 response_delay;
+};
+
+struct cmis_cdb_query_status_rpl {
+ u8 length;
+ u8 status;
+};
+
+static int
+cmis_cdb_validate_password(struct ethtool_cmis_cdb *cdb,
+ struct net_device *dev,
+ const struct ethtool_module_fw_flash_params *params,
+ struct ethnl_module_fw_flash_ntf_params *ntf_params)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct cmis_cdb_query_status_pl qs_pl = {0};
+ struct ethtool_module_eeprom page_data = {};
+ struct ethtool_cmis_cdb_cmd_args args = {};
+ struct cmis_password_entry_pl pe_pl = {};
+ struct cmis_cdb_query_status_rpl *rpl;
+ struct netlink_ext_ack extack = {};
+ int err;
+ /* Write the password to the module, then confirm it with Query Status. */
+ ethtool_cmis_page_init(&page_data, CMIS_PASSWORD_ENTRY_PAGE,
+ CMIS_PASSWORD_ENTRY_OFFSET, sizeof(pe_pl));
+ page_data.data = (u8 *)&pe_pl;
+
+ /* page_data.data aliases pe_pl, so filling pe_pl fills the write buffer. */
+ pe_pl.password = params->password;
+ err = ops->set_module_eeprom_by_page(dev, &page_data, &extack);
+ if (err < 0) {
+ if (extack._msg)
+ netdev_err(dev, "%s\n", extack._msg);
+ return err;
+ }
+
+ ethtool_cmis_cdb_compose_args(&args, ETHTOOL_CMIS_CDB_CMD_QUERY_STATUS,
+ (u8 *)&qs_pl, sizeof(qs_pl), NULL, 0, 0,
+ cdb->read_write_len_ext, 1000,
+ sizeof(*rpl),
+ CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID);
+
+ err = ethtool_cmis_cdb_execute_cmd(dev, &args);
+ if (err < 0) {
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "Query Status command failed",
+ args.err_msg);
+ return err;
+ }
+ /* NOTE(review): assumes execute_cmd() left the reply in args.req.payload. */
+ rpl = (struct cmis_cdb_query_status_rpl *)args.req.payload;
+ if (!rpl->length || !rpl->status) {
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "Password was not accepted",
+ NULL);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Some CDB commands assert the CDB completion flag only from CMIS
+ * revision 5. Therefore, check the relevant validity flag only when
+ * the revision supports it.
+ */
+void ethtool_cmis_cdb_check_completion_flag(u8 cmis_rev, u8 *flags)
+{
+ *flags |= cmis_rev >= 5 ? CDB_F_COMPLETION_VALID : 0;
+}
+
+#define CMIS_CDB_MODULE_FEATURES_RESV_DATA 34
+
+/* See section 9.4.1 "CMD 0040h: Module Features" in CMIS standard revision 5.2.
+ * struct cmis_cdb_module_features_rpl is structured layout of the flat
+ * array, ethtool_cmis_cdb_rpl::payload.
+ */
+struct cmis_cdb_module_features_rpl {
+ u8 resv1[CMIS_CDB_MODULE_FEATURES_RESV_DATA];
+ __be16 max_completion_time;
+};
+
+static u16
+cmis_cdb_module_features_completion_time(struct cmis_cdb_module_features_rpl *rpl)
+{
+ return be16_to_cpu(rpl->max_completion_time);
+}
+
+static int cmis_cdb_module_features_get(struct ethtool_cmis_cdb *cdb,
+ struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *ntf_params)
+{
+ struct ethtool_cmis_cdb_cmd_args args = {};
+ struct cmis_cdb_module_features_rpl *rpl;
+ u8 flags = CDB_F_STATUS_VALID;
+ int err;
+
+ ethtool_cmis_cdb_check_completion_flag(cdb->cmis_rev, &flags);
+ ethtool_cmis_cdb_compose_args(&args,
+ ETHTOOL_CMIS_CDB_CMD_MODULE_FEATURES,
+ NULL, 0, NULL, 0, 0,
+ cdb->read_write_len_ext, 1000,
+ sizeof(*rpl), flags);
+
+ err = ethtool_cmis_cdb_execute_cmd(dev, &args);
+ if (err < 0) {
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "Module Features command failed",
+ args.err_msg);
+ return err;
+ }
+
+ rpl = (struct cmis_cdb_module_features_rpl *)args.req.payload;
+ cdb->max_completion_time =
+ cmis_cdb_module_features_completion_time(rpl);
+
+ return 0;
+}
+
+struct ethtool_cmis_cdb *
+ethtool_cmis_cdb_init(struct net_device *dev,
+ const struct ethtool_module_fw_flash_params *params,
+ struct ethnl_module_fw_flash_ntf_params *ntf_params)
+{
+ struct ethtool_cmis_cdb *cdb;
+ int err;
+
+ cdb = kzalloc(sizeof(*cdb), GFP_KERNEL);
+ if (!cdb)
+ return ERR_PTR(-ENOMEM);
+
+ err = cmis_rev_major_get(dev, &cdb->cmis_rev);
+ if (err < 0)
+ goto err;
+
+ if (cdb->cmis_rev < 4) {
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+ "CMIS revision doesn't support module firmware flashing",
+ NULL);
+ err = -EOPNOTSUPP;
+ goto err;
+ }
+
+ err = cmis_cdb_advertisement_get(cdb, dev, ntf_params);
+ if (err < 0)
+ goto err;
+
+ if (params->password_valid) {
+ err = cmis_cdb_validate_password(cdb, dev, params, ntf_params);
+ if (err < 0)
+ goto err;
+ }
+
+ err = cmis_cdb_module_features_get(cdb, dev, ntf_params);
+ if (err < 0)
+ goto err;
+
+ return cdb;
+
+err:
+ ethtool_cmis_cdb_fini(cdb);
+ return ERR_PTR(err);
+}
+
+void ethtool_cmis_cdb_fini(struct ethtool_cmis_cdb *cdb)
+{
+ kfree(cdb);
+}
+
+static bool is_completed(u8 data)
+{
+ return !!(data & 0x40);
+}
+
+#define CMIS_CDB_STATUS_SUCCESS 0x01
+
+static bool status_success(u8 data)
+{
+ return data == CMIS_CDB_STATUS_SUCCESS;
+}
+
+#define CMIS_CDB_STATUS_FAIL 0x40
+
+static bool status_fail(u8 data)
+{
+ return data & CMIS_CDB_STATUS_FAIL;
+}
+
+struct cmis_wait_for_cond_rpl {
+ u8 state;
+};
+
+static int
+ethtool_cmis_module_poll(struct net_device *dev,
+ struct cmis_wait_for_cond_rpl *rpl, u32 offset,
+ bool (*cond_success)(u8), bool (*cond_fail)(u8))
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_module_eeprom page_data = {0};
+ struct netlink_ext_ack extack = {};
+ int err;
+ /* One poll: read the state byte at @offset of page 0 into @rpl. */
+ ethtool_cmis_page_init(&page_data, 0, offset, sizeof(*rpl));
+ page_data.data = (u8 *)rpl;
+ /* A failed read is reported as -EBUSY, i.e. like "not ready yet". */
+ err = ops->get_module_eeprom_by_page(dev, &page_data, &extack);
+ if (err < 0) {
+ if (extack._msg)
+ netdev_err_once(dev, "%s\n", extack._msg);
+ return -EBUSY;
+ }
+
+ if (cond_success(rpl->state))
+ return 0;
+ /* @cond_fail is optional (NULL allowed): test the pointer, not *pointer. */
+ if (cond_fail && cond_fail(rpl->state))
+ return -EIO;
+
+ return -EBUSY;
+}
+
+int ethtool_cmis_wait_for_cond(struct net_device *dev, u8 flags, u8 flag,
+ u16 max_duration, u32 offset,
+ bool (*cond_success)(u8), bool (*cond_fail)(u8),
+ u8 *state)
+{
+ struct cmis_wait_for_cond_rpl rpl = {};
+ unsigned long end;
+ int err;
+ /* @flag not set in @flags: nothing to wait for; *@state is not written. */
+ if (!(flags & flag))
+ return 0;
+ /* A zero @max_duration (msec) selects the longest wait a u16 can express. */
+ if (max_duration == 0)
+ max_duration = U16_MAX;
+ /* Poll, sleeping 20 msec between tries, until a final result or deadline. */
+ end = jiffies + msecs_to_jiffies(max_duration);
+ do {
+ err = ethtool_cmis_module_poll(dev, &rpl, offset, cond_success,
+ cond_fail);
+ if (err != -EBUSY)
+ goto out;
+
+ msleep(20);
+ } while (time_before(jiffies, end));
+ /* One last poll after the deadline; only then report -ETIMEDOUT. */
+ err = ethtool_cmis_module_poll(dev, &rpl, offset, cond_success,
+ cond_fail);
+ if (err == -EBUSY)
+ err = -ETIMEDOUT;
+
+out:
+ *state = rpl.state; /* last state byte read, on success and on error */
+ return err;
+}
+
+#define CMIS_CDB_COMPLETION_FLAG_OFFSET 0x08
+
+/* Wait for the CDB completion flag, if args->flags asks for it.
+ *
+ * Sleeps args->msleep_pre_rpl milliseconds first, then polls the byte at
+ * CMIS_CDB_COMPLETION_FLAG_OFFSET with is_completed() as the only predicate.
+ * On failure args->err_msg is overwritten with a fixed message.
+ *
+ * Return: result of ethtool_cmis_wait_for_cond().
+ */
+static int cmis_cdb_wait_for_completion(struct net_device *dev,
+					struct ethtool_cmis_cdb_cmd_args *args)
+{
+	u8 polled_byte;	/* required by the callee, not used here */
+	int ret;
+
+	/* Some vendors need settle time before the completion flag of
+	 * certain CDB commands may be checked.
+	 */
+	msleep(args->msleep_pre_rpl);
+
+	ret = ethtool_cmis_wait_for_cond(dev, args->flags,
+					 CDB_F_COMPLETION_VALID,
+					 args->max_duration,
+					 CMIS_CDB_COMPLETION_FLAG_OFFSET,
+					 is_completed, NULL, &polled_byte);
+	if (ret >= 0)
+		return ret;
+
+	args->err_msg = "Completion Flag did not set on time";
+	return ret;
+}
+
+#define CMIS_CDB_STATUS_OFFSET 0x25
+
+/* Translate a CDB status byte into a human readable message.
+ *
+ * @status: status byte as last read from the module
+ * @err_msg: receives a pointer to a static string; always written
+ *
+ * Values of the form 0b10xxxxxx are reported as "in progress", values of the
+ * form 0b01xxxxxx as "failed". Any value without a dedicated case yields
+ * "Unknown failure reason".
+ */
+static void cmis_cdb_status_fail_msg_get(u8 status, char **err_msg)
+{
+	switch (status) {
+	case 0b10000001:
+		*err_msg = "CDB Status is in progress: Busy capturing command";
+		break;
+	case 0b10000010:
+		*err_msg =
+			"CDB Status is in progress: Busy checking/validating command";
+		break;
+	case 0b10000011:
+		*err_msg = "CDB Status is in progress: Busy executing";
+		break;
+	case 0b01000000:
+		*err_msg = "CDB status failed: no specific failure";
+		break;
+	case 0b01000010:
+		*err_msg =
+			"CDB status failed: Parameter range error or parameter not supported";
+		break;
+	case 0b01000101:
+		*err_msg = "CDB status failed: CdbChkCode error";
+		break;
+	case 0b01000110:
+		*err_msg = "CDB status failed: Password error";
+		break;
+	default:
+		*err_msg = "Unknown failure reason";
+		break;
+	}
+}
+
+/* Wait for the CDB status byte to report success, if args->flags asks for it.
+ *
+ * Sleeps args->msleep_pre_rpl milliseconds first, then polls the byte at
+ * CMIS_CDB_STATUS_OFFSET using status_success() / status_fail(). On failure,
+ * and only when no error message has been recorded yet, args->err_msg is
+ * derived from the last status byte.
+ *
+ * Return: result of ethtool_cmis_wait_for_cond().
+ */
+static int cmis_cdb_wait_for_status(struct net_device *dev,
+				    struct ethtool_cmis_cdb_cmd_args *args)
+{
+	u8 cdb_status;
+	int ret;
+
+	/* Some vendors need settle time before the status of certain CDB
+	 * commands may be checked.
+	 */
+	msleep(args->msleep_pre_rpl);
+
+	ret = ethtool_cmis_wait_for_cond(dev, args->flags, CDB_F_STATUS_VALID,
+					 args->max_duration,
+					 CMIS_CDB_STATUS_OFFSET,
+					 status_success, status_fail,
+					 &cdb_status);
+	if (ret >= 0 || args->err_msg)
+		return ret;
+
+	cmis_cdb_status_fail_msg_get(cdb_status, &args->err_msg);
+	return ret;
+}
+
+#define CMIS_CDB_REPLY_OFFSET 0x86
+
+/* Read the reply of a completed CDB command back into args->req.
+ *
+ * @dev: device whose ethtool_ops->get_module_eeprom_by_page() is used
+ * @page_data: scratch descriptor; its data buffer is allocated and freed here
+ * @args: command arguments; rpl_exp_len selects how much reply is expected
+ *
+ * Does nothing when args->rpl_exp_len is 0. Otherwise the reply header plus
+ * args->rpl_exp_len payload bytes are read from ETHTOOL_CMIS_CDB_CMD_PAGE at
+ * CMIS_CDB_REPLY_OFFSET, sanity checked, and the payload is copied into
+ * args->req.payload with args->req.lpl_len set to the number of bytes copied.
+ *
+ * Return: 0 when no reply is expected, -ENOMEM on allocation failure, -EIO
+ * when the header fails validation, otherwise the value returned by
+ * get_module_eeprom_by_page().
+ */
+static int cmis_cdb_process_reply(struct net_device *dev,
+				  struct ethtool_module_eeprom *page_data,
+				  struct ethtool_cmis_cdb_cmd_args *args)
+{
+	u8 rpl_hdr_len = sizeof(struct ethtool_cmis_cdb_rpl_hdr);
+	u8 rpl_exp_len = args->rpl_exp_len + rpl_hdr_len;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct netlink_ext_ack extack = {};
+	struct ethtool_cmis_cdb_rpl *rpl;
+	size_t copy_len;
+	int err;
+
+	if (!args->rpl_exp_len)
+		return 0;
+
+	ethtool_cmis_page_init(page_data, ETHTOOL_CMIS_CDB_CMD_PAGE,
+			       CMIS_CDB_REPLY_OFFSET, rpl_exp_len);
+	page_data->data = kmalloc(page_data->length, GFP_KERNEL);
+	if (!page_data->data)
+		return -ENOMEM;
+
+	err = ops->get_module_eeprom_by_page(dev, page_data, &extack);
+	if (err < 0) {
+		if (extack._msg)
+			netdev_err(dev, "%s\n", extack._msg);
+		goto out;
+	}
+
+	rpl = (struct ethtool_cmis_cdb_rpl *)page_data->data;
+	if ((args->rpl_exp_len > rpl->hdr.rpl_len + rpl_hdr_len) ||
+	    !rpl->hdr.rpl_chk_code) {
+		err = -EIO;
+		goto out;
+	}
+
+	/* rpl_len is reported by the module. Only args->rpl_exp_len payload
+	 * bytes were actually read into the buffer above, and the destination
+	 * is a fixed-size array, so never copy more than either of those.
+	 */
+	copy_len = min_t(size_t, rpl->hdr.rpl_len, args->rpl_exp_len);
+	copy_len = min_t(size_t, copy_len, sizeof(args->req.payload));
+
+	args->req.lpl_len = copy_len;
+	memcpy(args->req.payload, rpl->payload, copy_len);
+
+out:
+	kfree(page_data->data);
+	return err;
+}
+
+/* Write @length bytes of @data to the module at @page / @offset.
+ *
+ * The data is duplicated into a temporary buffer that is attached to
+ * @page_data for the duration of the set_module_eeprom_by_page() call and
+ * freed before returning. A driver supplied extack message is logged when the
+ * write fails.
+ *
+ * Return: -ENOMEM when the copy cannot be allocated, otherwise the value
+ * returned by set_module_eeprom_by_page().
+ */
+static int
+__ethtool_cmis_cdb_execute_cmd(struct net_device *dev,
+			       struct ethtool_module_eeprom *page_data,
+			       u8 page, u32 offset, u32 length, void *data)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct netlink_ext_ack extack = {};
+	int ret;
+
+	ethtool_cmis_page_init(page_data, page, offset, length);
+
+	page_data->data = kmemdup(data, page_data->length, GFP_KERNEL);
+	if (!page_data->data)
+		return -ENOMEM;
+
+	ret = ops->set_module_eeprom_by_page(dev, page_data, &extack);
+	if (ret < 0 && extack._msg)
+		netdev_err(dev, "%s\n", extack._msg);
+
+	kfree(page_data->data);
+	return ret;
+}
+
+#define CMIS_CDB_EPL_PAGE_START 0xA0
+#define CMIS_CDB_EPL_PAGE_END 0xAF
+#define CMIS_CDB_EPL_FW_BLOCK_OFFSET_START 128
+#define CMIS_CDB_EPL_FW_BLOCK_OFFSET_END 255
+
+/* Write the EPL (extended payload) part of a CDB request to the module.
+ *
+ * args->req.epl is written in chunks, walking pages CMIS_CDB_EPL_PAGE_START
+ * through CMIS_CDB_EPL_PAGE_END and, within each page, byte offsets
+ * CMIS_CDB_EPL_FW_BLOCK_OFFSET_START through CMIS_CDB_EPL_FW_BLOCK_OFFSET_END.
+ * The total length is args->req.epl_len converted from big-endian.
+ *
+ * Return: 0 once the loops finish, or the negative error of the first write
+ * that fails.
+ */
+static int
+ethtool_cmis_cdb_execute_epl_cmd(struct net_device *dev,
+ struct ethtool_cmis_cdb_cmd_args *args,
+ struct ethtool_module_eeprom *page_data)
+{
+ u16 epl_len = be16_to_cpu(args->req.epl_len);
+ u32 bytes_written = 0;
+ u8 page;
+ int err;
+
+ for (page = CMIS_CDB_EPL_PAGE_START;
+ page <= CMIS_CDB_EPL_PAGE_END && bytes_written < epl_len; page++) {
+ /* Every page is filled starting from the same first offset */
+ u16 offset = CMIS_CDB_EPL_FW_BLOCK_OFFSET_START;
+
+ while (offset <= CMIS_CDB_EPL_FW_BLOCK_OFFSET_END &&
+ bytes_written < epl_len) {
+ u32 bytes_left = epl_len - bytes_written;
+ u16 space_left, bytes_to_write;
+
+ /* Chunk size is the smallest of: data still to send, room
+ * left in this page, and the per-access length limit.
+ */
+ space_left = CMIS_CDB_EPL_FW_BLOCK_OFFSET_END - offset + 1;
+ bytes_to_write = min_t(u16, bytes_left,
+ min_t(u16, space_left,
+ args->read_write_len_ext));
+
+ err = __ethtool_cmis_cdb_execute_cmd(dev, page_data,
+ page, offset,
+ bytes_to_write,
+ args->req.epl + bytes_written);
+ if (err < 0)
+ return err;
+
+ offset += bytes_to_write;
+ bytes_written += bytes_to_write;
+ }
+ }
+ /* NOTE(review): 0 is also returned if the pages run out before epl_len
+ * bytes were written - confirm callers never exceed the page capacity.
+ */
+ return 0;
+}
+
+/* Return the one's complement of the 8-bit wrapping sum of the @size bytes
+ * starting at @data. An empty range yields 0xff.
+ */
+static u8 cmis_cdb_calc_checksum(const void *data, size_t size)
+{
+	const u8 *cur = data;
+	const u8 *end = cur + size;
+	u8 sum = 0;
+
+	while (cur < end)
+		sum += *cur++;
+
+	return ~sum;
+}
+
+#define CMIS_CDB_CMD_ID_OFFSET 0x80
+
+/* Send one CDB command to the module and collect its result.
+ *
+ * Steps, in order: compute the request check code, validate the LPL length,
+ * write the request body, write the EPL data if any, write the command ID
+ * (which triggers execution), wait for completion, wait for status, and
+ * finally read back the reply into @args.
+ *
+ * Return: 0 or a negative error. -EINVAL (with args->err_msg set) when
+ * args->req.lpl_len exceeds args->read_write_len_ext; otherwise the first
+ * error reported by one of the steps above.
+ */
+int ethtool_cmis_cdb_execute_cmd(struct net_device *dev,
+ struct ethtool_cmis_cdb_cmd_args *args)
+{
+ struct ethtool_module_eeprom page_data = {};
+ u32 offset;
+ int err;
+
+ /* The check code covers the request up to, not including, the epl field */
+ args->req.chk_code =
+ cmis_cdb_calc_checksum(&args->req,
+ offsetof(struct ethtool_cmis_cdb_request,
+ epl));
+
+ if (args->req.lpl_len > args->read_write_len_ext) {
+ args->err_msg = "LPL length is longer than CDB read write length extension allows";
+ return -EINVAL;
+ }
+
+ /* According to the CMIS standard, there are two options to trigger the
+ * CDB commands. The default option is triggering the command by writing
+ * the CMDID bytes. Therefore, the command will be split to 2 calls:
+ * First, with everything except the CMDID field and then the CMDID
+ * field.
+ */
+ offset = CMIS_CDB_CMD_ID_OFFSET +
+ offsetof(struct ethtool_cmis_cdb_request, body);
+ err = __ethtool_cmis_cdb_execute_cmd(dev, &page_data,
+ ETHTOOL_CMIS_CDB_CMD_PAGE, offset,
+ sizeof(args->req.body),
+ &args->req.body);
+ if (err < 0)
+ return err;
+
+ /* Extended payload, when present, must be in place before the trigger */
+ if (args->req.epl_len) {
+ err = ethtool_cmis_cdb_execute_epl_cmd(dev, args, &page_data);
+ if (err < 0)
+ return err;
+ }
+
+ /* Second call: writing the command ID starts execution */
+ offset = CMIS_CDB_CMD_ID_OFFSET +
+ offsetof(struct ethtool_cmis_cdb_request, id);
+ err = __ethtool_cmis_cdb_execute_cmd(dev, &page_data,
+ ETHTOOL_CMIS_CDB_CMD_PAGE, offset,
+ sizeof(args->req.id),
+ &args->req.id);
+ if (err < 0)
+ return err;
+
+ err = cmis_cdb_wait_for_completion(dev, args);
+ if (err < 0)
+ return err;
+
+ err = cmis_cdb_wait_for_status(dev, args);
+ if (err < 0)
+ return err;
+
+ return cmis_cdb_process_reply(dev, &page_data, args);
+}
diff --git a/net/ethtool/cmis_fw_update.c b/net/ethtool/cmis_fw_update.c
new file mode 100644
index 000000000000..df5f344209c4
--- /dev/null
+++ b/net/ethtool/cmis_fw_update.c
@@ -0,0 +1,485 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool.h>
+#include <linux/firmware.h>
+#include <net/netdev_lock.h>
+
+#include "common.h"
+#include "module_fw.h"
+#include "cmis.h"
+
+/* Firmware management capabilities of the module in host byte order, filled
+ * by cmis_fw_update_fw_mng_features_get() from the module's reply.
+ */
+struct cmis_fw_update_fw_mng_features {
+ u8 start_cmd_payload_size; /* leading image bytes sent with the start command */
+ u8 write_mechanism; /* normalised to ..._MECHANISM_LPL or ..._MECHANISM_EPL */
+ u16 max_duration_start; /* passed as max duration of the start command */
+ u16 max_duration_write; /* passed as max duration of each write block command */
+ u16 max_duration_complete; /* passed as max duration of the complete command */
+};
+
+/* See section 9.4.2 "CMD 0041h: Firmware Management Features" in CMIS standard
+ * revision 5.2.
+ * struct cmis_cdb_fw_mng_features_rpl is a structured layout of the flat
+ * array, ethtool_cmis_cdb_rpl::payload.
+ */
+/* Wire layout of the reply payload; multi-byte fields are big-endian. Only
+ * the fields without a "resv" prefix are read by this file.
+ */
+struct cmis_cdb_fw_mng_features_rpl {
+ u8 resv1;
+ u8 resv2;
+ u8 start_cmd_payload_size;
+ u8 resv3;
+ u8 read_write_len_ext; /* replaces cdb->read_write_len_ext once read */
+ u8 write_mechanism; /* compared against enum cmis_cdb_fw_write_mechanism */
+ u8 resv4;
+ u8 resv5;
+ __be16 max_duration_start;
+ __be16 resv6;
+ __be16 max_duration_write;
+ __be16 max_duration_complete;
+ __be16 resv7;
+};
+
+/* Values of cmis_cdb_fw_mng_features_rpl::write_mechanism.
+ * cmis_fw_update_fw_mng_features_get() rejects NONE with -EOPNOTSUPP, keeps
+ * LPL as is, and maps every other value (including BOTH) to EPL.
+ */
+enum cmis_cdb_fw_write_mechanism {
+ CMIS_CDB_FW_WRITE_MECHANISM_NONE = 0x00,
+ CMIS_CDB_FW_WRITE_MECHANISM_LPL = 0x01,
+ CMIS_CDB_FW_WRITE_MECHANISM_EPL = 0x10,
+ CMIS_CDB_FW_WRITE_MECHANISM_BOTH = 0x11,
+};
+
+/* Query the module's firmware management features and cache them.
+ *
+ * @cdb: CDB context; read_write_len_ext is updated from the reply
+ * @dev: device the command is sent through
+ * @fw_mng: receives the parsed features in host byte order
+ * @ntf_params: notification parameters used to report failures
+ *
+ * Return: 0 on success, -EOPNOTSUPP when the module advertises no write
+ * mechanism, or the error of ethtool_cmis_cdb_execute_cmd(). Both failure
+ * paths emit an error notification.
+ */
+static int
+cmis_fw_update_fw_mng_features_get(struct ethtool_cmis_cdb *cdb,
+				   struct net_device *dev,
+				   struct cmis_fw_update_fw_mng_features *fw_mng,
+				   struct ethnl_module_fw_flash_ntf_params *ntf_params)
+{
+	struct ethtool_cmis_cdb_cmd_args args = {};
+	struct cmis_cdb_fw_mng_features_rpl *features;
+	u8 flags = CDB_F_STATUS_VALID;
+	int ret;
+
+	ethtool_cmis_cdb_check_completion_flag(cdb->cmis_rev, &flags);
+	ethtool_cmis_cdb_compose_args(&args,
+				      ETHTOOL_CMIS_CDB_CMD_FW_MANAGMENT_FEATURES,
+				      NULL, 0, NULL, 0,
+				      cdb->max_completion_time,
+				      cdb->read_write_len_ext, 1000,
+				      sizeof(*features), flags);
+
+	ret = ethtool_cmis_cdb_execute_cmd(dev, &args);
+	if (ret < 0) {
+		ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+					      "FW Management Features command failed",
+					      args.err_msg);
+		return ret;
+	}
+
+	/* The reply payload was copied into the request payload buffer */
+	features = (struct cmis_cdb_fw_mng_features_rpl *)args.req.payload;
+	if (features->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_NONE) {
+		ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+					      "CDB write mechanism is not supported",
+					      NULL);
+		return -EOPNOTSUPP;
+	}
+
+	/* Up to here read_write_len_ext came from the CDB advertisement.
+	 * From now on use the value from the features query, which is
+	 * specific to Firmware Management Commands (IDs 0100h-01FFh).
+	 */
+	cdb->read_write_len_ext = features->read_write_len_ext;
+
+	/* Anything that is not plain LPL is driven through EPL */
+	if (features->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL)
+		fw_mng->write_mechanism = CMIS_CDB_FW_WRITE_MECHANISM_LPL;
+	else
+		fw_mng->write_mechanism = CMIS_CDB_FW_WRITE_MECHANISM_EPL;
+
+	fw_mng->start_cmd_payload_size = features->start_cmd_payload_size;
+	fw_mng->max_duration_start =
+		be16_to_cpu(features->max_duration_start);
+	fw_mng->max_duration_write =
+		be16_to_cpu(features->max_duration_write);
+	fw_mng->max_duration_complete =
+		be16_to_cpu(features->max_duration_complete);
+
+	return 0;
+}
+
+/* See section 9.7.2 "CMD 0101h: Start Firmware Download" in CMIS standard
+ * revision 5.2.
+ * struct cmis_cdb_start_fw_download_pl is a structured layout of the
+ * flat array, ethtool_cmis_cdb_request::payload.
+ */
+struct cmis_cdb_start_fw_download_pl {
+ /* Fixed header; image_size is sent as the big-endian firmware size */
+ __struct_group(cmis_cdb_start_fw_download_pl_h, head, /* no attrs */,
+ __be32 image_size;
+ __be32 resv1;
+ );
+ /* Room for the leading image bytes that accompany the start command;
+ * sized so that header + vendor_data fill the maximum LPL payload.
+ */
+ u8 vendor_data[ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH -
+ sizeof(struct cmis_cdb_start_fw_download_pl_h)];
+};
+
+/* Send the "Start Firmware Download" CDB command.
+ *
+ * @cdb: CDB context providing read_write_len_ext
+ * @fw_update: update parameters (device, firmware image, notification params)
+ * @fw_mng: module features; start_cmd_payload_size selects how many leading
+ *	    image bytes travel with this command as vendor data
+ *
+ * Return: 0 on success, -EINVAL when the module-reported payload size does
+ * not fit the command payload or exceeds the image, otherwise the error of
+ * ethtool_cmis_cdb_execute_cmd(). Failures emit an error notification.
+ */
+static int
+cmis_fw_update_start_download(struct ethtool_cmis_cdb *cdb,
+			      struct ethtool_cmis_fw_update_params *fw_update,
+			      struct cmis_fw_update_fw_mng_features *fw_mng)
+{
+	u8 vendor_data_size = fw_mng->start_cmd_payload_size;
+	struct cmis_cdb_start_fw_download_pl pl = {};
+	struct ethtool_cmis_cdb_cmd_args args = {};
+	u8 lpl_len;
+	int err;
+
+	/* vendor_data_size comes from the module. Refuse values that would
+	 * overrun pl.vendor_data or read past the end of the image.
+	 */
+	if (vendor_data_size > sizeof(pl.vendor_data) ||
+	    vendor_data_size > fw_update->fw->size) {
+		ethnl_module_fw_flash_ntf_err(fw_update->dev,
+					      &fw_update->ntf_params,
+					      "Start FW download command failed",
+					      "Invalid start command payload size");
+		return -EINVAL;
+	}
+
+	pl.image_size = cpu_to_be32(fw_update->fw->size);
+	memcpy(pl.vendor_data, fw_update->fw->data, vendor_data_size);
+
+	/* Header plus the vendor data actually used */
+	lpl_len = offsetof(struct cmis_cdb_start_fw_download_pl,
+			   vendor_data[vendor_data_size]);
+
+	ethtool_cmis_cdb_compose_args(&args,
+				      ETHTOOL_CMIS_CDB_CMD_START_FW_DOWNLOAD,
+				      (u8 *)&pl, lpl_len, NULL, 0,
+				      fw_mng->max_duration_start,
+				      cdb->read_write_len_ext, 1000, 0,
+				      CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID);
+
+	err = ethtool_cmis_cdb_execute_cmd(fw_update->dev, &args);
+	if (err < 0)
+		ethnl_module_fw_flash_ntf_err(fw_update->dev,
+					      &fw_update->ntf_params,
+					      "Start FW download command failed",
+					      args.err_msg);
+
+	return err;
+}
+
+/* See section 9.7.4 "CMD 0103h: Write Firmware Block LPL" in CMIS standard
+ * revision 5.2.
+ * struct cmis_cdb_write_fw_block_lpl_pl is a structured layout of the
+ * flat array, ethtool_cmis_cdb_request::payload.
+ */
+struct cmis_cdb_write_fw_block_lpl_pl {
+ /* Big-endian position of this block, counted from the first image byte
+ * that follows the start-command payload (callers pass offset - start).
+ */
+ __be32 block_address;
+ /* Block data; fills the rest of the maximum LPL payload */
+ u8 fw_block[ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH - sizeof(__be32)];
+};
+
+/* Transfer the firmware image using "Write Firmware Block LPL" commands.
+ *
+ * The image is sent from byte fw_mng->start_cmd_payload_size onwards, in
+ * blocks sized by the smaller of the module's LPL limit and the maximum LPL
+ * payload, minus the block address field. A progress notification is emitted
+ * before every block.
+ *
+ * Return: 0 when every block was written, otherwise the error of the first
+ * failing command (after emitting an error notification).
+ */
+static int
+cmis_fw_update_write_image_lpl(struct ethtool_cmis_cdb *cdb,
+			       struct ethtool_cmis_fw_update_params *fw_update,
+			       struct cmis_fw_update_fw_mng_features *fw_mng)
+{
+	const u32 addr_len =
+		sizeof_field(struct cmis_cdb_write_fw_block_lpl_pl,
+			     block_address);
+	u8 start = fw_mng->start_cmd_payload_size;
+	u32 image_size = fw_update->fw->size;
+	u32 max_block_size, offset;
+	int err;
+
+	max_block_size =
+		min_t(u32,
+		      ethtool_cmis_get_max_lpl_size(cdb->read_write_len_ext),
+		      ETHTOOL_CMIS_CDB_LPL_MAX_PL_LENGTH) - addr_len;
+
+	offset = start;
+	while (offset < image_size) {
+		struct cmis_cdb_write_fw_block_lpl_pl pl = {
+			.block_address = cpu_to_be32(offset - start),
+		};
+		struct ethtool_cmis_cdb_cmd_args args = {};
+		u32 block_size, lpl_len;
+
+		ethnl_module_fw_flash_ntf_in_progress(fw_update->dev,
+						      &fw_update->ntf_params,
+						      offset - start,
+						      image_size);
+
+		/* The last block may be shorter than max_block_size */
+		block_size = min_t(u32, max_block_size, image_size - offset);
+		memcpy(pl.fw_block, &fw_update->fw->data[offset], block_size);
+		lpl_len = block_size + addr_len;
+
+		ethtool_cmis_cdb_compose_args(&args,
+					      ETHTOOL_CMIS_CDB_CMD_WRITE_FW_BLOCK_LPL,
+					      (u8 *)&pl, lpl_len, NULL, 0,
+					      fw_mng->max_duration_write,
+					      cdb->read_write_len_ext, 1, 0,
+					      CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID);
+
+		err = ethtool_cmis_cdb_execute_cmd(fw_update->dev, &args);
+		if (err < 0) {
+			ethnl_module_fw_flash_ntf_err(fw_update->dev,
+						      &fw_update->ntf_params,
+						      "Write FW block LPL command failed",
+						      args.err_msg);
+			return err;
+		}
+
+		offset += max_block_size;
+	}
+
+	return 0;
+}
+
+/* Structured view of the EPL data sent by cmis_fw_update_write_image_epl().
+ * That function allocates only as many bytes as the current block needs, so
+ * an instance may be shorter than the declared array.
+ */
+struct cmis_cdb_write_fw_block_epl_pl {
+ u8 fw_block[ETHTOOL_CMIS_CDB_EPL_MAX_PL_LENGTH];
+};
+
+/* Transfer the firmware image using "Write Firmware Block EPL" commands.
+ *
+ * The image is sent from byte fw_mng->start_cmd_payload_size onwards, in
+ * blocks of at most ETHTOOL_CMIS_CDB_EPL_MAX_PL_LENGTH bytes. Each command
+ * carries the block address in the LPL and the block data in the EPL. A
+ * progress notification is emitted before every block.
+ *
+ * Return: 0 when every block was written, -ENOMEM when a block buffer cannot
+ * be allocated (no notification is emitted here for that case), otherwise
+ * the error of the first failing command after emitting an error
+ * notification.
+ */
+static int
+cmis_fw_update_write_image_epl(struct ethtool_cmis_cdb *cdb,
+ struct ethtool_cmis_fw_update_params *fw_update,
+ struct cmis_fw_update_fw_mng_features *fw_mng)
+{
+ u8 start = fw_mng->start_cmd_payload_size;
+ u32 image_size = fw_update->fw->size;
+ u32 offset, lpl_len;
+ int err;
+
+ /* The LPL part of every command holds just the block address */
+ lpl_len = sizeof_field(struct cmis_cdb_write_fw_block_lpl_pl,
+ block_address);
+
+ for (offset = start; offset < image_size;
+ offset += ETHTOOL_CMIS_CDB_EPL_MAX_PL_LENGTH) {
+ struct cmis_cdb_write_fw_block_lpl_pl lpl = {
+ .block_address = cpu_to_be32(offset - start),
+ };
+ struct cmis_cdb_write_fw_block_epl_pl *epl;
+ struct ethtool_cmis_cdb_cmd_args args = {};
+ u32 epl_len;
+
+ ethnl_module_fw_flash_ntf_in_progress(fw_update->dev,
+ &fw_update->ntf_params,
+ offset - start,
+ image_size);
+
+ /* The last block may be shorter than the EPL maximum */
+ epl_len = min_t(u32, ETHTOOL_CMIS_CDB_EPL_MAX_PL_LENGTH,
+ image_size - offset);
+ /* Only epl_len bytes are allocated, not the full struct */
+ epl = kmalloc_array(epl_len, sizeof(u8), GFP_KERNEL);
+ if (!epl)
+ return -ENOMEM;
+
+ memcpy(epl->fw_block, &fw_update->fw->data[offset], epl_len);
+
+ ethtool_cmis_cdb_compose_args(&args,
+ ETHTOOL_CMIS_CDB_CMD_WRITE_FW_BLOCK_EPL,
+ (u8 *)&lpl, lpl_len, (u8 *)epl,
+ epl_len,
+ fw_mng->max_duration_write,
+ cdb->read_write_len_ext, 1, 0,
+ CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID);
+
+ err = ethtool_cmis_cdb_execute_cmd(fw_update->dev, &args);
+ /* The buffer is released before the result is inspected */
+ kfree(epl);
+ if (err < 0) {
+ ethnl_module_fw_flash_ntf_err(fw_update->dev,
+ &fw_update->ntf_params,
+ "Write FW block EPL command failed",
+ args.err_msg);
+ return err;
+ }
+ }
+
+ return 0;
+}
+
+/* Send the "Complete Firmware Download" CDB command (no payload).
+ *
+ * Return: result of ethtool_cmis_cdb_execute_cmd(); a negative result is
+ * reported through an error notification first.
+ */
+static int
+cmis_fw_update_complete_download(struct ethtool_cmis_cdb *cdb,
+				 struct net_device *dev,
+				 struct cmis_fw_update_fw_mng_features *fw_mng,
+				 struct ethnl_module_fw_flash_ntf_params *ntf_params)
+{
+	struct ethtool_cmis_cdb_cmd_args args = {};
+	int ret;
+
+	ethtool_cmis_cdb_compose_args(&args,
+				      ETHTOOL_CMIS_CDB_CMD_COMPLETE_FW_DOWNLOAD,
+				      NULL, 0, NULL, 0,
+				      fw_mng->max_duration_complete,
+				      cdb->read_write_len_ext, 1000, 0,
+				      CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID);
+
+	ret = ethtool_cmis_cdb_execute_cmd(dev, &args);
+	if (ret >= 0)
+		return ret;
+
+	ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+				      "Complete FW download command failed",
+				      args.err_msg);
+	return ret;
+}
+
+/* Download the whole image: start command, block writes, complete command.
+ *
+ * Blocks go through the LPL writer when fw_mng->write_mechanism is
+ * CMIS_CDB_FW_WRITE_MECHANISM_LPL and through the EPL writer otherwise.
+ *
+ * Return: 0 on success, or the first negative error of the three stages.
+ */
+static int
+cmis_fw_update_download_image(struct ethtool_cmis_cdb *cdb,
+			      struct ethtool_cmis_fw_update_params *fw_update,
+			      struct cmis_fw_update_fw_mng_features *fw_mng)
+{
+	int ret;
+
+	ret = cmis_fw_update_start_download(cdb, fw_update, fw_mng);
+	if (ret < 0)
+		return ret;
+
+	ret = fw_mng->write_mechanism == CMIS_CDB_FW_WRITE_MECHANISM_LPL ?
+	      cmis_fw_update_write_image_lpl(cdb, fw_update, fw_mng) :
+	      cmis_fw_update_write_image_epl(cdb, fw_update, fw_mng);
+	if (ret < 0)
+		return ret;
+
+	ret = cmis_fw_update_complete_download(cdb, fw_update->dev, fw_mng,
+					       &fw_update->ntf_params);
+
+	/* Any non-negative result is reported as plain success */
+	return ret < 0 ? ret : 0;
+}
+
+/* Module state values that module_is_ready() accepts; they are compared with
+ * the 3-bit field in bits 3:1 of the polled byte.
+ */
+enum {
+ CMIS_MODULE_LOW_PWR = 1,
+ CMIS_MODULE_READY = 3,
+};
+
+/* Success predicate for cmis_fw_update_wait_for_module_state(): extracts the
+ * 3-bit state from bits 3:1 of @data and accepts the ready and low-power
+ * states.
+ */
+static bool module_is_ready(u8 data)
+{
+	switch ((data >> 1) & 7) {
+	case CMIS_MODULE_READY:
+	case CMIS_MODULE_LOW_PWR:
+		return true;
+	default:
+		return false;
+	}
+}
+
+#define CMIS_MODULE_READY_MAX_DURATION_MSEC 1000
+#define CMIS_MODULE_STATE_OFFSET 3
+
+/* Wait up to CMIS_MODULE_READY_MAX_DURATION_MSEC for the byte at
+ * CMIS_MODULE_STATE_OFFSET to satisfy module_is_ready(). Polling is performed
+ * only when CDB_F_MODULE_STATE_VALID is set in @flags.
+ *
+ * Return: result of ethtool_cmis_wait_for_cond().
+ */
+static int
+cmis_fw_update_wait_for_module_state(struct net_device *dev, u8 flags)
+{
+ u8 state; /* required by the callee, value not used here */
+
+ return ethtool_cmis_wait_for_cond(dev, flags, CDB_F_MODULE_STATE_VALID,
+ CMIS_MODULE_READY_MAX_DURATION_MSEC,
+ CMIS_MODULE_STATE_OFFSET,
+ module_is_ready, NULL, &state);
+}
+
+/* See section 9.7.10 "CMD 0109h: Run Firmware Image" in CMIS standard
+ * revision 5.2.
+ * struct cmis_cdb_run_fw_image_pl is a structured layout of the flat
+ * array, ethtool_cmis_cdb_request::payload.
+ */
+/* cmis_fw_update_run_image() sends this payload with every field zero. */
+struct cmis_cdb_run_fw_image_pl {
+ u8 resv1;
+ u8 image_to_run;
+ /* NOTE(review): declared u16 while the other wire structs in this file
+ * use __be16; harmless while only 0 is sent - confirm before using
+ * non-zero values.
+ */
+ u16 delay_to_reset;
+};
+
+/* Send the "Run Firmware Image" CDB command with an all-zero payload, then
+ * wait for the module to report a ready state again.
+ *
+ * Return: error of the command, or the result of the module state wait. Each
+ * failure is reported through an error notification.
+ */
+static int
+cmis_fw_update_run_image(struct ethtool_cmis_cdb *cdb, struct net_device *dev,
+			 struct ethnl_module_fw_flash_ntf_params *ntf_params)
+{
+	struct ethtool_cmis_cdb_cmd_args args = {};
+	struct cmis_cdb_run_fw_image_pl pl = {0};
+	int ret;
+
+	ethtool_cmis_cdb_compose_args(&args, ETHTOOL_CMIS_CDB_CMD_RUN_FW_IMAGE,
+				      (u8 *)&pl, sizeof(pl), NULL, 0,
+				      cdb->max_completion_time,
+				      cdb->read_write_len_ext, 1000, 0,
+				      CDB_F_MODULE_STATE_VALID);
+
+	ret = ethtool_cmis_cdb_execute_cmd(dev, &args);
+	if (ret < 0) {
+		ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+					      "Run image command failed",
+					      args.err_msg);
+		return ret;
+	}
+
+	ret = cmis_fw_update_wait_for_module_state(dev, args.flags);
+	if (ret >= 0)
+		return ret;
+
+	ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+				      "Module is not ready on time after reset",
+				      NULL);
+	return ret;
+}
+
+/* Send the "Commit Firmware Image" CDB command (no payload).
+ *
+ * Return: result of ethtool_cmis_cdb_execute_cmd(); a negative result is
+ * reported through an error notification first.
+ */
+static int
+cmis_fw_update_commit_image(struct ethtool_cmis_cdb *cdb,
+			    struct net_device *dev,
+			    struct ethnl_module_fw_flash_ntf_params *ntf_params)
+{
+	struct ethtool_cmis_cdb_cmd_args args = {};
+	int ret;
+
+	ethtool_cmis_cdb_compose_args(&args,
+				      ETHTOOL_CMIS_CDB_CMD_COMMIT_FW_IMAGE,
+				      NULL, 0, NULL, 0,
+				      cdb->max_completion_time,
+				      cdb->read_write_len_ext, 1000, 0,
+				      CDB_F_COMPLETION_VALID | CDB_F_STATUS_VALID);
+
+	ret = ethtool_cmis_cdb_execute_cmd(dev, &args);
+	if (ret >= 0)
+		return ret;
+
+	ethnl_module_fw_flash_ntf_err(dev, ntf_params,
+				      "Commit image command failed",
+				      args.err_msg);
+	return ret;
+}
+
+/* Ask the driver for an ETH_RESET_PHY reset through ethtool_ops->reset(),
+ * holding the netdev ops lock around the call.
+ *
+ * NOTE(review): ->reset is called without a NULL check here; presumably the
+ * flashing request is validated against that earlier - confirm.
+ *
+ * Return: value returned by the driver's reset callback.
+ */
+static int cmis_fw_update_reset(struct net_device *dev)
+{
+ __u32 reset_data = ETH_RESET_PHY;
+ int ret;
+
+ netdev_lock_ops(dev);
+ ret = dev->ethtool_ops->reset(dev, &reset_data);
+ netdev_unlock_ops(dev);
+
+ return ret;
+}
+
+/* Run the complete CMIS firmware update flow for one module.
+ *
+ * Sequence: init CDB, send the "start" notification, query firmware
+ * management features, download the image, run it, re-init CDB, commit the
+ * image, reset, send the "complete" notification. Any failure ends with a
+ * final error notification that carries no message; steps that failed after
+ * CDB init release the CDB context first.
+ */
+void
+ethtool_cmis_fw_update(struct ethtool_cmis_fw_update_params *fw_update)
+{
+ struct ethnl_module_fw_flash_ntf_params *ntf_params =
+ &fw_update->ntf_params;
+ struct cmis_fw_update_fw_mng_features fw_mng = {0};
+ struct net_device *dev = fw_update->dev;
+ struct ethtool_cmis_cdb *cdb;
+ int err;
+
+ cdb = ethtool_cmis_cdb_init(dev, &fw_update->params, ntf_params);
+ if (IS_ERR(cdb))
+ goto err_send_ntf;
+
+ ethnl_module_fw_flash_ntf_start(dev, ntf_params);
+
+ err = cmis_fw_update_fw_mng_features_get(cdb, dev, &fw_mng, ntf_params);
+ if (err < 0)
+ goto err_cdb_fini;
+
+ err = cmis_fw_update_download_image(cdb, fw_update, &fw_mng);
+ if (err < 0)
+ goto err_cdb_fini;
+
+ err = cmis_fw_update_run_image(cdb, dev, ntf_params);
+ if (err < 0)
+ goto err_cdb_fini;
+
+ /* The CDB command "Run Firmware Image" resets the firmware, so the new
+ * one might have different settings.
+ * Free the old CDB instance, and init a new one.
+ */
+ ethtool_cmis_cdb_fini(cdb);
+
+ cdb = ethtool_cmis_cdb_init(dev, &fw_update->params, ntf_params);
+ /* Nothing to free on this path: the old context is already gone */
+ if (IS_ERR(cdb))
+ goto err_send_ntf;
+
+ err = cmis_fw_update_commit_image(cdb, dev, ntf_params);
+ if (err < 0)
+ goto err_cdb_fini;
+
+ err = cmis_fw_update_reset(dev);
+ if (err < 0)
+ goto err_cdb_fini;
+
+ ethnl_module_fw_flash_ntf_complete(dev, ntf_params);
+ ethtool_cmis_cdb_fini(cdb);
+ return;
+
+err_cdb_fini:
+ ethtool_cmis_cdb_fini(cdb);
+err_send_ntf:
+ /* The error code itself is not forwarded, only a bare error event */
+ ethnl_module_fw_flash_ntf_err(dev, ntf_params, NULL, NULL);
+}
diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c
new file mode 100644
index 000000000000..3e18ca1ccc5e
--- /dev/null
+++ b/net/ethtool/coalesce.c
@@ -0,0 +1,649 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/dim.h>
+#include "netlink.h"
+#include "common.h"
+
+/* Request info for coalesce requests; carries only the common base part. */
+struct coalesce_req_info {
+ struct ethnl_req_info base;
+};
+
+/* Reply data gathered by coalesce_prepare_data() and consumed by
+ * coalesce_fill_reply().
+ */
+struct coalesce_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_coalesce coalesce; /* filled by ->get_coalesce() */
+ struct kernel_ethtool_coalesce kernel_coalesce; /* filled by ->get_coalesce() */
+ u32 supported_params; /* copy of ethtool_ops::supported_coalesce_params */
+};
+
+/* Map a pointer to the embedded base member back to its enclosing
+ * struct coalesce_reply_data.
+ */
+#define COALESCE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct coalesce_reply_data, base)
+
+/* First attribute that has a bit in the supported parameters mask */
+#define __SUPPORTED_OFFSET ETHTOOL_A_COALESCE_RX_USECS
+/* Convert a coalesce attribute type into its supported-parameters bit:
+ * ETHTOOL_A_COALESCE_RX_USECS maps to bit 0, later attributes to higher bits.
+ */
+static u32 attr_to_mask(unsigned int attr_type)
+{
+ return BIT(attr_type - __SUPPORTED_OFFSET);
+}
+
+/* Build time check that each ETHTOOL_COALESCE_* flag used in
+ * ethtool_ops::supported_coalesce_params equals
+ * BIT(ETHTOOL_A_COALESCE_* - __SUPPORTED_OFFSET), i.e. that attr_to_mask()
+ * yields the matching flag for the corresponding attribute type.
+ */
+#define __CHECK_SUPPORTED_OFFSET(x) \
+ static_assert((ETHTOOL_ ## x) == \
+ BIT((ETHTOOL_A_ ## x) - __SUPPORTED_OFFSET))
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_IRQ);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_IRQ);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_IRQ);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_IRQ);
+__CHECK_SUPPORTED_OFFSET(COALESCE_STATS_BLOCK_USECS);
+__CHECK_SUPPORTED_OFFSET(COALESCE_USE_ADAPTIVE_RX);
+__CHECK_SUPPORTED_OFFSET(COALESCE_USE_ADAPTIVE_TX);
+__CHECK_SUPPORTED_OFFSET(COALESCE_PKT_RATE_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_LOW);
+__CHECK_SUPPORTED_OFFSET(COALESCE_PKT_RATE_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_USECS_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RX_MAX_FRAMES_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_USECS_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_TX_MAX_FRAMES_HIGH);
+__CHECK_SUPPORTED_OFFSET(COALESCE_RATE_SAMPLE_INTERVAL);
+
+/* Attribute policy for the coalesce GET request: only the nested request
+ * header is listed.
+ */
+const struct nla_policy ethnl_coalesce_get_policy[] = {
+ [ETHTOOL_A_COALESCE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+/* Collect the coalescing settings of the device into the reply data.
+ *
+ * Records the driver's supported parameter mask, then calls ->get_coalesce()
+ * between ethnl_ops_begin() and ethnl_ops_complete().
+ *
+ * Return: -EOPNOTSUPP when the driver has no ->get_coalesce(), a negative
+ * error from ethnl_ops_begin(), otherwise the result of ->get_coalesce().
+ */
+static int coalesce_prepare_data(const struct ethnl_req_info *req_base,
+				 struct ethnl_reply_data *reply_base,
+				 const struct genl_info *info)
+{
+	struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base);
+	struct net_device *dev = reply_base->dev;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	int ret;
+
+	if (!ops->get_coalesce)
+		return -EOPNOTSUPP;
+
+	data->supported_params = ops->supported_coalesce_params;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		return ret;
+
+	ret = ops->get_coalesce(dev, &data->coalesce, &data->kernel_coalesce,
+				info->extack);
+	ethnl_ops_complete(dev);
+
+	return ret;
+}
+
+/* Upper bound of the reply payload size: one attribute for every scalar
+ * coalesce parameter plus two profile nests (RX and TX), each holding
+ * NET_DIM_PARAMS_NUM_PROFILES moderation nests of three u32 attributes.
+ * The value does not depend on the request or on the reply data.
+ */
+static int coalesce_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ /* Size of a single moderation entry nest */
+ int modersz = nla_total_size(0) + /* _PROFILE_IRQ_MODERATION, nest */
+ nla_total_size(sizeof(u32)) + /* _IRQ_MODERATION_USEC */
+ nla_total_size(sizeof(u32)) + /* _IRQ_MODERATION_PKTS */
+ nla_total_size(sizeof(u32)); /* _IRQ_MODERATION_COMPS */
+
+ /* Size of one complete profile nest */
+ int total_modersz = nla_total_size(0) + /* _{R,T}X_PROFILE, nest */
+ modersz * NET_DIM_PARAMS_NUM_PROFILES;
+
+ return nla_total_size(sizeof(u32)) + /* _RX_USECS */
+ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES */
+ nla_total_size(sizeof(u32)) + /* _RX_USECS_IRQ */
+ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_IRQ */
+ nla_total_size(sizeof(u32)) + /* _TX_USECS */
+ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES */
+ nla_total_size(sizeof(u32)) + /* _TX_USECS_IRQ */
+ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_IRQ */
+ nla_total_size(sizeof(u32)) + /* _STATS_BLOCK_USECS */
+ nla_total_size(sizeof(u8)) + /* _USE_ADAPTIVE_RX */
+ nla_total_size(sizeof(u8)) + /* _USE_ADAPTIVE_TX */
+ nla_total_size(sizeof(u32)) + /* _PKT_RATE_LOW */
+ nla_total_size(sizeof(u32)) + /* _RX_USECS_LOW */
+ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_LOW */
+ nla_total_size(sizeof(u32)) + /* _TX_USECS_LOW */
+ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_LOW */
+ nla_total_size(sizeof(u32)) + /* _PKT_RATE_HIGH */
+ nla_total_size(sizeof(u32)) + /* _RX_USECS_HIGH */
+ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_HIGH */
+ nla_total_size(sizeof(u32)) + /* _TX_USECS_HIGH */
+ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_HIGH */
+ nla_total_size(sizeof(u32)) + /* _RATE_SAMPLE_INTERVAL */
+ nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_TX */
+ nla_total_size(sizeof(u8)) + /* _USE_CQE_MODE_RX */
+ nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_BYTES */
+ nla_total_size(sizeof(u32)) + /* _TX_AGGR_MAX_FRAMES */
+ nla_total_size(sizeof(u32)) + /* _TX_AGGR_TIME_USECS */
+ total_modersz * 2; /* _{R,T}X_PROFILE */
+}
+
+/* Add a u32 coalesce attribute unless it is both zero and unsupported.
+ *
+ * Return: true when nla_put_u32() failed, false when the attribute was added
+ * or skipped.
+ */
+static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val,
+			     u32 supported_params)
+{
+	if (val || (supported_params & attr_to_mask(attr_type)))
+		return nla_put_u32(skb, attr_type, val);
+
+	return false;
+}
+
+/* Add a boolean coalesce attribute (a u8 holding 0 or 1) unless it is both
+ * zero and unsupported.
+ *
+ * Return: true when nla_put_u8() failed, false when the attribute was added
+ * or skipped.
+ */
+static bool coalesce_put_bool(struct sk_buff *skb, u16 attr_type, u32 val,
+			      u32 supported_params)
+{
+	if (val || (supported_params & attr_to_mask(attr_type)))
+		return nla_put_u8(skb, attr_type, !!val);
+
+	return false;
+}
+
+/**
+ * coalesce_put_profile - fill reply with a nla nest holding one child nest
+ * per DIM profile entry (NET_DIM_PARAMS_NUM_PROFILES in total).
+ * @skb: socket buffer the message is stored in
+ * @attr_type: nest attr type ETHTOOL_A_COALESCE_*X_PROFILE
+ * @profile: data passed to userspace
+ * @coal_flags: modifiable parameters supported by the driver
+ *
+ * Put a dim profile nest attribute. Refer to ETHTOOL_A_PROFILE_IRQ_MODERATION.
+ * Nothing is added when @profile is NULL or @coal_flags is zero. Within each
+ * child nest only the fields selected by @coal_flags are emitted.
+ *
+ * Return: 0 on success or when nothing was added, or a negative error code.
+ */
+static int coalesce_put_profile(struct sk_buff *skb, u16 attr_type,
+ const struct dim_cq_moder *profile,
+ u8 coal_flags)
+{
+ struct nlattr *profile_attr, *moder_attr;
+ int i, ret;
+
+ if (!profile || !coal_flags)
+ return 0;
+
+ profile_attr = nla_nest_start(skb, attr_type);
+ if (!profile_attr)
+ return -EMSGSIZE;
+
+ for (i = 0; i < NET_DIM_PARAMS_NUM_PROFILES; i++) {
+ moder_attr = nla_nest_start(skb,
+ ETHTOOL_A_PROFILE_IRQ_MODERATION);
+ if (!moder_attr) {
+ ret = -EMSGSIZE;
+ goto cancel_profile;
+ }
+
+ if (coal_flags & DIM_COALESCE_USEC) {
+ ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_USEC,
+ profile[i].usec);
+ if (ret)
+ goto cancel_moder;
+ }
+
+ if (coal_flags & DIM_COALESCE_PKTS) {
+ ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_PKTS,
+ profile[i].pkts);
+ if (ret)
+ goto cancel_moder;
+ }
+
+ if (coal_flags & DIM_COALESCE_COMPS) {
+ ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_COMPS,
+ profile[i].comps);
+ if (ret)
+ goto cancel_moder;
+ }
+
+ nla_nest_end(skb, moder_attr);
+ }
+
+ nla_nest_end(skb, profile_attr);
+
+ return 0;
+
+ /* Unwind in reverse order: the open child nest first, then the parent */
+cancel_moder:
+ nla_nest_cancel(skb, moder_attr);
+cancel_profile:
+ nla_nest_cancel(skb, profile_attr);
+ return ret;
+}
+
+/* Fill the coalesce reply message from the prepared reply data.
+ *
+ * All scalar parameters are emitted through coalesce_put_u32() /
+ * coalesce_put_bool(), which skip values that are zero and unsupported. If
+ * the request device has an irq_moder, the RX and/or TX DIM profiles are
+ * appended according to its profile_flags; the profile pointers are read
+ * under rcu_read_lock().
+ *
+ * Return: 0 on success, -EMSGSIZE when a scalar attribute does not fit, or
+ * the error returned by coalesce_put_profile().
+ */
+static int coalesce_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base);
+ const struct kernel_ethtool_coalesce *kcoal = &data->kernel_coalesce;
+ const struct ethtool_coalesce *coal = &data->coalesce;
+ u32 supported = data->supported_params;
+ struct dim_irq_moder *moder;
+ int ret = 0;
+
+ /* The helpers return true on failure, so the first failing put
+ * short-circuits the chain.
+ */
+ if (coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS,
+ coal->rx_coalesce_usecs, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES,
+ coal->rx_max_coalesced_frames, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_IRQ,
+ coal->rx_coalesce_usecs_irq, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ,
+ coal->rx_max_coalesced_frames_irq, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS,
+ coal->tx_coalesce_usecs, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES,
+ coal->tx_max_coalesced_frames, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_IRQ,
+ coal->tx_coalesce_usecs_irq, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ,
+ coal->tx_max_coalesced_frames_irq, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_STATS_BLOCK_USECS,
+ coal->stats_block_coalesce_usecs, supported) ||
+ coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX,
+ coal->use_adaptive_rx_coalesce, supported) ||
+ coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX,
+ coal->use_adaptive_tx_coalesce, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_PKT_RATE_LOW,
+ coal->pkt_rate_low, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_LOW,
+ coal->rx_coalesce_usecs_low, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW,
+ coal->rx_max_coalesced_frames_low, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_LOW,
+ coal->tx_coalesce_usecs_low, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW,
+ coal->tx_max_coalesced_frames_low, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_PKT_RATE_HIGH,
+ coal->pkt_rate_high, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS_HIGH,
+ coal->rx_coalesce_usecs_high, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH,
+ coal->rx_max_coalesced_frames_high, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_USECS_HIGH,
+ coal->tx_coalesce_usecs_high, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH,
+ coal->tx_max_coalesced_frames_high, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL,
+ coal->rate_sample_interval, supported) ||
+ coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_TX,
+ kcoal->use_cqe_mode_tx, supported) ||
+ coalesce_put_bool(skb, ETHTOOL_A_COALESCE_USE_CQE_MODE_RX,
+ kcoal->use_cqe_mode_rx, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES,
+ kcoal->tx_aggr_max_bytes, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES,
+ kcoal->tx_aggr_max_frames, supported) ||
+ coalesce_put_u32(skb, ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS,
+ kcoal->tx_aggr_time_usecs, supported))
+ return -EMSGSIZE;
+
+ /* Profiles are optional: nothing more to add without an irq_moder */
+ if (!req_base->dev || !req_base->dev->irq_moder)
+ return 0;
+
+ moder = req_base->dev->irq_moder;
+ rcu_read_lock();
+ if (moder->profile_flags & DIM_PROFILE_RX) {
+ ret = coalesce_put_profile(skb, ETHTOOL_A_COALESCE_RX_PROFILE,
+ rcu_dereference(moder->rx_profile),
+ moder->coal_flags);
+ if (ret)
+ goto out;
+ }
+
+ if (moder->profile_flags & DIM_PROFILE_TX)
+ ret = coalesce_put_profile(skb, ETHTOOL_A_COALESCE_TX_PROFILE,
+ rcu_dereference(moder->tx_profile),
+ moder->coal_flags);
+
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+/* COALESCE_SET */
+
+/* Policy for the attributes inside one ETHTOOL_A_PROFILE_IRQ_MODERATION
+ * nest: three u32 values.
+ */
+static const struct nla_policy coalesce_irq_moderation_policy[] = {
+ [ETHTOOL_A_IRQ_MODERATION_USEC] = { .type = NLA_U32 },
+ [ETHTOOL_A_IRQ_MODERATION_PKTS] = { .type = NLA_U32 },
+ [ETHTOOL_A_IRQ_MODERATION_COMPS] = { .type = NLA_U32 },
+};
+
+/* Policy for a profile nest: its moderation entries are themselves nests
+ * validated by coalesce_irq_moderation_policy.
+ */
+static const struct nla_policy coalesce_profile_policy[] = {
+ [ETHTOOL_A_PROFILE_IRQ_MODERATION] =
+ NLA_POLICY_NESTED(coalesce_irq_moderation_policy),
+};
+
+/* Attribute policy for the coalesce SET request. Most parameters are plain
+ * u32 values; the adaptive flags are u8, the CQE mode flags are u8 limited to
+ * a maximum of 1, and the RX/TX profiles are nested attributes.
+ */
+const struct nla_policy ethnl_coalesce_set_policy[] = {
+ [ETHTOOL_A_COALESCE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_COALESCE_RX_USECS] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_USECS_IRQ] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_USECS] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_USECS_IRQ] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_STATS_BLOCK_USECS] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX] = { .type = NLA_U8 },
+ [ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX] = { .type = NLA_U8 },
+ [ETHTOOL_A_COALESCE_PKT_RATE_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_USECS_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_USECS_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_PKT_RATE_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_USECS_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_USECS_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_USE_CQE_MODE_TX] = NLA_POLICY_MAX(NLA_U8, 1),
+ [ETHTOOL_A_COALESCE_USE_CQE_MODE_RX] = NLA_POLICY_MAX(NLA_U8, 1),
+ [ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS] = { .type = NLA_U32 },
+ [ETHTOOL_A_COALESCE_RX_PROFILE] =
+ NLA_POLICY_NESTED(coalesce_profile_policy),
+ [ETHTOOL_A_COALESCE_TX_PROFILE] =
+ NLA_POLICY_NESTED(coalesce_profile_policy),
+};
+
+/* Refuse a SET request carrying any parameter the device does not support. */
+static int
+ethnl_set_coalesce_validate(struct ethnl_req_info *req_info,
+			    struct genl_info *info)
+{
+	const struct ethtool_ops *drv_ops = req_info->dev->ethtool_ops;
+	const struct dim_irq_moder *moder = req_info->dev->irq_moder;
+	u32 allowed;
+	u16 id;
+
+	if (!(drv_ops->get_coalesce && drv_ops->set_coalesce))
+		return -EOPNOTSUPP;
+
+	/* driver-declared parameters plus whichever DIM profiles exist */
+	allowed = drv_ops->supported_coalesce_params;
+	if (moder) {
+		if (moder->profile_flags & DIM_PROFILE_RX)
+			allowed |= ETHTOOL_COALESCE_RX_PROFILE;
+		if (moder->profile_flags & DIM_PROFILE_TX)
+			allowed |= ETHTOOL_COALESCE_TX_PROFILE;
+	}
+	for (id = ETHTOOL_A_COALESCE_RX_USECS; id < __ETHTOOL_A_COALESCE_CNT; id++) {
+		if (!info->attrs[id] || (allowed & attr_to_mask(id)))
+			continue;
+		NL_SET_ERR_MSG_ATTR(info->extack, info->attrs[id],
+				    "cannot modify an unsupported parameter");
+		return -EINVAL;
+	}
+	return 1;
+}
+
+/**
+ * ethnl_update_irq_moder - update a specific field in the given profile
+ * @irq_moder: place that collects dim related information
+ * @irq_field: u16 field in profile to modify
+ * @attr_type: attr type ETHTOOL_A_IRQ_MODERATION_*
+ * @tb: parsed netlink attributes; nothing is done if tb[@attr_type] is NULL
+ * @coal_bit: DIM_COALESCE_* bit from coal_flags
+ * @mod: set to true when @irq_field is changed, left untouched otherwise
+ * @extack: netlink extended ack
+ *
+ * Return: 0 on success (also when there is nothing to do), -EOPNOTSUPP if
+ * @coal_bit is not set in coal_flags, -ERANGE if the value exceeds U16_MAX.
+ */
+static int ethnl_update_irq_moder(struct dim_irq_moder *irq_moder,
+				  u16 *irq_field, u16 attr_type,
+				  struct nlattr **tb,
+				  u8 coal_bit, bool *mod,
+				  struct netlink_ext_ack *extack)
+{
+	u32 val;
+
+	if (!tb[attr_type])
+		return 0;
+	if (!(irq_moder->coal_flags & coal_bit)) {
+		NL_SET_BAD_ATTR(extack, tb[attr_type]);
+		return -EOPNOTSUPP;
+	}
+	val = nla_get_u32(tb[attr_type]);
+	if (val > U16_MAX) {	/* the u16 store below would silently truncate */
+		NL_SET_ERR_MSG_ATTR(extack, tb[attr_type], "value out of range");
+		return -ERANGE;
+	}
+	if (*irq_field != val) {
+		*irq_field = val;
+		*mod = true;
+	}
+	return 0;
+}
+
+/**
+ * ethnl_update_profile - apply a profile nest from userspace to a DIM profile
+ * @dev: netdevice to update the profile
+ * @dst: profile array to replace; a modified copy is published through RCU
+ * @nests: nest attr ETHTOOL_A_COALESCE_*X_PROFILE to set profile, or NULL
+ * @mod: pointer to bool for modification tracking
+ * @extack: Netlink extended ack
+ *
+ * Layout of nests (the n-th IRQ_MODERATION nest updates profile entry n):
+ *	Nested ETHTOOL_A_COALESCE_*X_PROFILE attr
+ *		Nested ETHTOOL_A_PROFILE_IRQ_MODERATION attr
+ *			ETHTOOL_A_IRQ_MODERATION_USEC attr
+ *			ETHTOOL_A_IRQ_MODERATION_PKTS attr
+ *			ETHTOOL_A_IRQ_MODERATION_COMPS attr
+ *		...
+ *		Nested ETHTOOL_A_PROFILE_IRQ_MODERATION attr
+ *			ETHTOOL_A_IRQ_MODERATION_USEC attr
+ *			ETHTOOL_A_IRQ_MODERATION_PKTS attr
+ *			ETHTOOL_A_IRQ_MODERATION_COMPS attr
+ *
+ * Return: 0 on success or a negative error code.
+ */
+static int ethnl_update_profile(struct net_device *dev,
+				struct dim_cq_moder __rcu **dst,
+				const struct nlattr *nests,
+				bool *mod,
+				struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[ARRAY_SIZE(coalesce_irq_moderation_policy)];
+	struct dim_irq_moder *irq_moder = dev->irq_moder;
+	struct dim_cq_moder *new_profile, *old_profile;
+	int ret, rem, i = 0, len;
+	struct nlattr *nest;
+
+	if (!nests)
+		return 0;
+
+	old_profile = rtnl_dereference(*dst);
+	if (!old_profile)
+		return -EOPNOTSUPP;
+
+	len = NET_DIM_PARAMS_NUM_PROFILES * sizeof(*old_profile);
+	new_profile = kmemdup(old_profile, len, GFP_KERNEL);
+	if (!new_profile)
+		return -ENOMEM;
+
+	nla_for_each_nested_type(nest, ETHTOOL_A_PROFILE_IRQ_MODERATION,
+				 nests, rem) {
+		if (i >= NET_DIM_PARAMS_NUM_PROFILES) {	/* new_profile[] is full */
+			NL_SET_ERR_MSG_ATTR(extack, nest, "too many profile entries");
+			ret = -EINVAL;
+			goto err_out;
+		}
+
+		ret = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest,
+				       coalesce_irq_moderation_policy, extack);
+		if (ret)
+			goto err_out;
+
+		ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].usec,
+					     ETHTOOL_A_IRQ_MODERATION_USEC, tb,
+					     DIM_COALESCE_USEC, mod, extack);
+		if (ret)
+			goto err_out;
+
+		ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].pkts,
+					     ETHTOOL_A_IRQ_MODERATION_PKTS, tb,
+					     DIM_COALESCE_PKTS, mod, extack);
+		if (ret)
+			goto err_out;
+
+		ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].comps,
+					     ETHTOOL_A_IRQ_MODERATION_COMPS, tb,
+					     DIM_COALESCE_COMPS, mod, extack);
+		if (ret)
+			goto err_out;
+
+		i++;
+	}
+
+	/* dim is a dynamic mechanism: once the new profile is published it
+	 * will quickly fit the coalescing parameters to that profile.
+	 */
+	rcu_assign_pointer(*dst, new_profile);
+	kfree_rcu(old_profile, rcu);
+
+	return 0;
+
+err_out:
+	kfree(new_profile);
+	return ret;
+}
+/* One COALESCE_SET pass: read current settings, patch them from the request, write back. */
+static int
+__ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info,
+ bool *dual_change)
+{
+ struct kernel_ethtool_coalesce kernel_coalesce = {};
+ struct net_device *dev = req_info->dev;
+ struct ethtool_coalesce coalesce = {};
+ bool mod_mode = false, mod = false;
+ struct nlattr **tb = info->attrs;
+ int ret;
+
+ ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce,
+ info->extack);
+ if (ret < 0)
+ return ret;
+
+ /* Update values (changes are tracked in mod) */
+ ethnl_update_u32(&coalesce.rx_coalesce_usecs,
+ tb[ETHTOOL_A_COALESCE_RX_USECS], &mod);
+ ethnl_update_u32(&coalesce.rx_max_coalesced_frames,
+ tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES], &mod);
+ ethnl_update_u32(&coalesce.rx_coalesce_usecs_irq,
+ tb[ETHTOOL_A_COALESCE_RX_USECS_IRQ], &mod);
+ ethnl_update_u32(&coalesce.rx_max_coalesced_frames_irq,
+ tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_IRQ], &mod);
+ ethnl_update_u32(&coalesce.tx_coalesce_usecs,
+ tb[ETHTOOL_A_COALESCE_TX_USECS], &mod);
+ ethnl_update_u32(&coalesce.tx_max_coalesced_frames,
+ tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES], &mod);
+ ethnl_update_u32(&coalesce.tx_coalesce_usecs_irq,
+ tb[ETHTOOL_A_COALESCE_TX_USECS_IRQ], &mod);
+ ethnl_update_u32(&coalesce.tx_max_coalesced_frames_irq,
+ tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_IRQ], &mod);
+ ethnl_update_u32(&coalesce.stats_block_coalesce_usecs,
+ tb[ETHTOOL_A_COALESCE_STATS_BLOCK_USECS], &mod);
+ ethnl_update_u32(&coalesce.pkt_rate_low,
+ tb[ETHTOOL_A_COALESCE_PKT_RATE_LOW], &mod);
+ ethnl_update_u32(&coalesce.rx_coalesce_usecs_low,
+ tb[ETHTOOL_A_COALESCE_RX_USECS_LOW], &mod);
+ ethnl_update_u32(&coalesce.rx_max_coalesced_frames_low,
+ tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_LOW], &mod);
+ ethnl_update_u32(&coalesce.tx_coalesce_usecs_low,
+ tb[ETHTOOL_A_COALESCE_TX_USECS_LOW], &mod);
+ ethnl_update_u32(&coalesce.tx_max_coalesced_frames_low,
+ tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_LOW], &mod);
+ ethnl_update_u32(&coalesce.pkt_rate_high,
+ tb[ETHTOOL_A_COALESCE_PKT_RATE_HIGH], &mod);
+ ethnl_update_u32(&coalesce.rx_coalesce_usecs_high,
+ tb[ETHTOOL_A_COALESCE_RX_USECS_HIGH], &mod);
+ ethnl_update_u32(&coalesce.rx_max_coalesced_frames_high,
+ tb[ETHTOOL_A_COALESCE_RX_MAX_FRAMES_HIGH], &mod);
+ ethnl_update_u32(&coalesce.tx_coalesce_usecs_high,
+ tb[ETHTOOL_A_COALESCE_TX_USECS_HIGH], &mod);
+ ethnl_update_u32(&coalesce.tx_max_coalesced_frames_high,
+ tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH], &mod);
+ ethnl_update_u32(&coalesce.rate_sample_interval,
+ tb[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL], &mod);
+ ethnl_update_u32(&kernel_coalesce.tx_aggr_max_bytes,
+ tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES], &mod);
+ ethnl_update_u32(&kernel_coalesce.tx_aggr_max_frames,
+ tb[ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES], &mod);
+ ethnl_update_u32(&kernel_coalesce.tx_aggr_time_usecs,
+ tb[ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS], &mod);
+ /* DIM profiles: ethnl_update_profile() publishes the new profile before ->set_coalesce runs */
+ if (dev->irq_moder && dev->irq_moder->profile_flags & DIM_PROFILE_RX) {
+ ret = ethnl_update_profile(dev, &dev->irq_moder->rx_profile,
+ tb[ETHTOOL_A_COALESCE_RX_PROFILE],
+ &mod, info->extack);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (dev->irq_moder && dev->irq_moder->profile_flags & DIM_PROFILE_TX) {
+ ret = ethnl_update_profile(dev, &dev->irq_moder->tx_profile,
+ tb[ETHTOOL_A_COALESCE_TX_PROFILE],
+ &mod, info->extack);
+ if (ret < 0)
+ return ret;
+ }
+
+ /* Update operation modes (changes are tracked separately in mod_mode) */
+ ethnl_update_bool32(&coalesce.use_adaptive_rx_coalesce,
+ tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_RX], &mod_mode);
+ ethnl_update_bool32(&coalesce.use_adaptive_tx_coalesce,
+ tb[ETHTOOL_A_COALESCE_USE_ADAPTIVE_TX], &mod_mode);
+ ethnl_update_u8(&kernel_coalesce.use_cqe_mode_tx,
+ tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_TX], &mod_mode);
+ ethnl_update_u8(&kernel_coalesce.use_cqe_mode_rx,
+ tb[ETHTOOL_A_COALESCE_USE_CQE_MODE_RX], &mod_mode);
+ /* tell the caller when both values and operation modes changed */
+ *dual_change = mod && mod_mode;
+ if (!mod && !mod_mode)
+ return 0;
+
+ ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce,
+ info->extack);
+ return ret < 0 ? ret : 1; /* 1: ->set_coalesce was called and did not fail */
+}
+
+static int
+ethnl_set_coalesce(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+	int first_pass, second_pass;
+	bool both_changed;
+
+	/* A single SET_COALESCE request may switch the operation mode and
+	 * change parameters together. A mode switch may make the driver reset
+	 * its parameter values, so the user's values could get lost (the
+	 * driver cannot tell values supplied by the user from those merely
+	 * echoed back from ->get). Rather than complicating every driver,
+	 * run the update twice whenever both kinds of change were requested.
+	 */
+	first_pass = __ethnl_set_coalesce(req_info, info, &both_changed);
+	/* error, nothing to change, or only one kind of change: done */
+	if (first_pass <= 0 || !both_changed)
+		return first_pass;
+
+	second_pass = __ethnl_set_coalesce(req_info, info, &both_changed);
+	if (second_pass < 0)
+		return second_pass;
+
+	/* the result reported is the one from the first pass */
+	return first_pass;
+}
+/* Request ops table for COALESCE: commands, data sizes and the GET/SET handlers. */
+const struct ethnl_request_ops ethnl_coalesce_request_ops = {
+ .request_cmd = ETHTOOL_MSG_COALESCE_GET,
+ .reply_cmd = ETHTOOL_MSG_COALESCE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_COALESCE_HEADER,
+ .req_info_size = sizeof(struct coalesce_req_info),
+ .reply_data_size = sizeof(struct coalesce_reply_data),
+ /* GET side handlers */
+ .prepare_data = coalesce_prepare_data,
+ .reply_size = coalesce_reply_size,
+ .fill_reply = coalesce_fill_reply,
+ /* SET side: validation, handler and notification command */
+ .set_validate = ethnl_set_coalesce_validate,
+ .set = ethnl_set_coalesce,
+ .set_ntf_cmd = ETHTOOL_MSG_COALESCE_NTF,
+};
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
new file mode 100644
index 000000000000..369c05cf8163
--- /dev/null
+++ b/net/ethtool/common.c
@@ -0,0 +1,1169 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool_netlink.h>
+#include <linux/net_tstamp.h>
+#include <linux/phy.h>
+#include <linux/rtnetlink.h>
+#include <linux/ptp_clock_kernel.h>
+#include <linux/phy_link_topology.h>
+#include <net/netdev_queues.h>
+
+#include "netlink.h"
+#include "common.h"
+#include "../core/dev.h"
+
+/* String name of each netdev feature, indexed by its NETIF_F_*_BIT. */
+const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
+ [NETIF_F_SG_BIT] = "tx-scatter-gather",
+ [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4",
+ [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic",
+ [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6",
+ [NETIF_F_HIGHDMA_BIT] = "highdma",
+ [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist",
+ [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert",
+
+ [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse",
+ [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter",
+ [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert",
+ [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse",
+ [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter",
+ [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged",
+ [NETIF_F_GSO_BIT] = "tx-generic-segmentation",
+ [NETIF_F_GRO_BIT] = "rx-gro",
+ [NETIF_F_GRO_HW_BIT] = "rx-gro-hw",
+ [NETIF_F_LRO_BIT] = "rx-lro",
+
+ [NETIF_F_TSO_BIT] = "tx-tcp-segmentation",
+ [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust",
+ [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation",
+ [NETIF_F_GSO_ACCECN_BIT] = "tx-tcp-accecn-segmentation",
+ [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation",
+ [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
+ [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
+ [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation",
+ [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation",
+ [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation",
+ [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation",
+ [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
+ [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
+ [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
+ [NETIF_F_GSO_TUNNEL_REMCSUM_BIT] = "tx-tunnel-remcsum-segmentation",
+ [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation",
+ [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation",
+ [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation",
+ [NETIF_F_GSO_FRAGLIST_BIT] = "tx-gso-list",
+
+ [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
+ [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
+ [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter",
+ [NETIF_F_RXHASH_BIT] = "rx-hashing",
+ [NETIF_F_RXCSUM_BIT] = "rx-checksum",
+ [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy",
+ [NETIF_F_LOOPBACK_BIT] = "loopback",
+ [NETIF_F_RXFCS_BIT] = "rx-fcs",
+ [NETIF_F_RXALL_BIT] = "rx-all",
+ [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
+ [NETIF_F_HW_TC_BIT] = "hw-tc-offload",
+ [NETIF_F_HW_ESP_BIT] = "esp-hw-offload",
+ [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload",
+ [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload",
+ [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record",
+ [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
+ [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
+ [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
+ [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
+ [NETIF_F_GRO_UDP_FWD_BIT] = "rx-udp-gro-forwarding",
+ [NETIF_F_HW_HSR_TAG_INS_BIT] = "hsr-tag-ins-offload",
+ [NETIF_F_HW_HSR_TAG_RM_BIT] = "hsr-tag-rm-offload",
+ [NETIF_F_HW_HSR_FWD_BIT] = "hsr-fwd-offload",
+ [NETIF_F_HW_HSR_DUP_BIT] = "hsr-dup-offload",
+};
+/* String name of each RSS hash function, indexed by ETH_RSS_HASH_*_BIT. */
+const char
+rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = {
+ [ETH_RSS_HASH_TOP_BIT] = "toeplitz",
+ [ETH_RSS_HASH_XOR_BIT] = "xor",
+ [ETH_RSS_HASH_CRC32_BIT] = "crc32",
+};
+/* String name of each tunable, indexed by its ETHTOOL_* tunable id. */
+const char
+tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_ID_UNSPEC] = "Unspec",
+ [ETHTOOL_RX_COPYBREAK] = "rx-copybreak",
+ [ETHTOOL_TX_COPYBREAK] = "tx-copybreak",
+ [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout",
+ [ETHTOOL_TX_COPYBREAK_BUF_SIZE] = "tx-copybreak-buf-size",
+};
+/* String name of each PHY tunable, indexed by its ETHTOOL_PHY_* tunable id. */
+const char
+phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_ID_UNSPEC] = "Unspec",
+ [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift",
+ [ETHTOOL_PHY_FAST_LINK_DOWN] = "phy-fast-link-down",
+ [ETHTOOL_PHY_EDPD] = "phy-energy-detect-power-down",
+};
+/* Helpers building link_mode_names[] entries, e.g. (10, T, Half) -> "10baseT/Half". */
+#define __LINK_MODE_NAME(speed, type, duplex) \
+ #speed "base" #type "/" #duplex
+#define __DEFINE_LINK_MODE_NAME(speed, type, duplex) \
+ [ETHTOOL_LINK_MODE(speed, type, duplex)] = \
+ __LINK_MODE_NAME(speed, type, duplex)
+#define __DEFINE_SPECIAL_MODE_NAME(_mode, _name) \
+ [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = _name
+/* String name of each link mode bit; the table size is asserted against the bit count below. */
+const char link_mode_names[][ETH_GSTRING_LEN] = {
+ __DEFINE_LINK_MODE_NAME(10, T, Half),
+ __DEFINE_LINK_MODE_NAME(10, T, Full),
+ __DEFINE_LINK_MODE_NAME(100, T, Half),
+ __DEFINE_LINK_MODE_NAME(100, T, Full),
+ __DEFINE_LINK_MODE_NAME(1000, T, Half),
+ __DEFINE_LINK_MODE_NAME(1000, T, Full),
+ __DEFINE_SPECIAL_MODE_NAME(Autoneg, "Autoneg"),
+ __DEFINE_SPECIAL_MODE_NAME(TP, "TP"),
+ __DEFINE_SPECIAL_MODE_NAME(AUI, "AUI"),
+ __DEFINE_SPECIAL_MODE_NAME(MII, "MII"),
+ __DEFINE_SPECIAL_MODE_NAME(FIBRE, "FIBRE"),
+ __DEFINE_SPECIAL_MODE_NAME(BNC, "BNC"),
+ __DEFINE_LINK_MODE_NAME(10000, T, Full),
+ __DEFINE_SPECIAL_MODE_NAME(Pause, "Pause"),
+ __DEFINE_SPECIAL_MODE_NAME(Asym_Pause, "Asym_Pause"),
+ __DEFINE_LINK_MODE_NAME(2500, X, Full),
+ __DEFINE_SPECIAL_MODE_NAME(Backplane, "Backplane"),
+ __DEFINE_LINK_MODE_NAME(1000, KX, Full),
+ __DEFINE_LINK_MODE_NAME(10000, KX4, Full),
+ __DEFINE_LINK_MODE_NAME(10000, KR, Full),
+ __DEFINE_SPECIAL_MODE_NAME(10000baseR_FEC, "10000baseR_FEC"),
+ __DEFINE_LINK_MODE_NAME(20000, MLD2, Full),
+ __DEFINE_LINK_MODE_NAME(20000, KR2, Full),
+ __DEFINE_LINK_MODE_NAME(40000, KR4, Full),
+ __DEFINE_LINK_MODE_NAME(40000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(40000, SR4, Full),
+ __DEFINE_LINK_MODE_NAME(40000, LR4, Full),
+ __DEFINE_LINK_MODE_NAME(56000, KR4, Full),
+ __DEFINE_LINK_MODE_NAME(56000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(56000, SR4, Full),
+ __DEFINE_LINK_MODE_NAME(56000, LR4, Full),
+ __DEFINE_LINK_MODE_NAME(25000, CR, Full),
+ __DEFINE_LINK_MODE_NAME(25000, KR, Full),
+ __DEFINE_LINK_MODE_NAME(25000, SR, Full),
+ __DEFINE_LINK_MODE_NAME(50000, CR2, Full),
+ __DEFINE_LINK_MODE_NAME(50000, KR2, Full),
+ __DEFINE_LINK_MODE_NAME(100000, KR4, Full),
+ __DEFINE_LINK_MODE_NAME(100000, SR4, Full),
+ __DEFINE_LINK_MODE_NAME(100000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(100000, LR4_ER4, Full),
+ __DEFINE_LINK_MODE_NAME(50000, SR2, Full),
+ __DEFINE_LINK_MODE_NAME(1000, X, Full),
+ __DEFINE_LINK_MODE_NAME(10000, CR, Full),
+ __DEFINE_LINK_MODE_NAME(10000, SR, Full),
+ __DEFINE_LINK_MODE_NAME(10000, LR, Full),
+ __DEFINE_LINK_MODE_NAME(10000, LRM, Full),
+ __DEFINE_LINK_MODE_NAME(10000, ER, Full),
+ __DEFINE_LINK_MODE_NAME(2500, T, Full),
+ __DEFINE_LINK_MODE_NAME(5000, T, Full),
+ __DEFINE_SPECIAL_MODE_NAME(FEC_NONE, "None"),
+ __DEFINE_SPECIAL_MODE_NAME(FEC_RS, "RS"),
+ __DEFINE_SPECIAL_MODE_NAME(FEC_BASER, "BASER"),
+ __DEFINE_LINK_MODE_NAME(50000, KR, Full),
+ __DEFINE_LINK_MODE_NAME(50000, SR, Full),
+ __DEFINE_LINK_MODE_NAME(50000, CR, Full),
+ __DEFINE_LINK_MODE_NAME(50000, LR_ER_FR, Full),
+ __DEFINE_LINK_MODE_NAME(50000, DR, Full),
+ __DEFINE_LINK_MODE_NAME(100000, KR2, Full),
+ __DEFINE_LINK_MODE_NAME(100000, SR2, Full),
+ __DEFINE_LINK_MODE_NAME(100000, CR2, Full),
+ __DEFINE_LINK_MODE_NAME(100000, LR2_ER2_FR2, Full),
+ __DEFINE_LINK_MODE_NAME(100000, DR2, Full),
+ __DEFINE_LINK_MODE_NAME(200000, KR4, Full),
+ __DEFINE_LINK_MODE_NAME(200000, SR4, Full),
+ __DEFINE_LINK_MODE_NAME(200000, LR4_ER4_FR4, Full),
+ __DEFINE_LINK_MODE_NAME(200000, DR4, Full),
+ __DEFINE_LINK_MODE_NAME(200000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(100, T1, Full),
+ __DEFINE_LINK_MODE_NAME(1000, T1, Full),
+ __DEFINE_LINK_MODE_NAME(400000, KR8, Full),
+ __DEFINE_LINK_MODE_NAME(400000, SR8, Full),
+ __DEFINE_LINK_MODE_NAME(400000, LR8_ER8_FR8, Full),
+ __DEFINE_LINK_MODE_NAME(400000, DR8, Full),
+ __DEFINE_LINK_MODE_NAME(400000, CR8, Full),
+ __DEFINE_SPECIAL_MODE_NAME(FEC_LLRS, "LLRS"),
+ __DEFINE_LINK_MODE_NAME(100000, KR, Full),
+ __DEFINE_LINK_MODE_NAME(100000, SR, Full),
+ __DEFINE_LINK_MODE_NAME(100000, LR_ER_FR, Full),
+ __DEFINE_LINK_MODE_NAME(100000, DR, Full),
+ __DEFINE_LINK_MODE_NAME(100000, CR, Full),
+ __DEFINE_LINK_MODE_NAME(200000, KR2, Full),
+ __DEFINE_LINK_MODE_NAME(200000, SR2, Full),
+ __DEFINE_LINK_MODE_NAME(200000, LR2_ER2_FR2, Full),
+ __DEFINE_LINK_MODE_NAME(200000, DR2, Full),
+ __DEFINE_LINK_MODE_NAME(200000, CR2, Full),
+ __DEFINE_LINK_MODE_NAME(400000, KR4, Full),
+ __DEFINE_LINK_MODE_NAME(400000, SR4, Full),
+ __DEFINE_LINK_MODE_NAME(400000, LR4_ER4_FR4, Full),
+ __DEFINE_LINK_MODE_NAME(400000, DR4, Full),
+ __DEFINE_LINK_MODE_NAME(400000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(100, FX, Half),
+ __DEFINE_LINK_MODE_NAME(100, FX, Full),
+ __DEFINE_LINK_MODE_NAME(10, T1L, Full),
+ __DEFINE_LINK_MODE_NAME(800000, CR8, Full),
+ __DEFINE_LINK_MODE_NAME(800000, KR8, Full),
+ __DEFINE_LINK_MODE_NAME(800000, DR8, Full),
+ __DEFINE_LINK_MODE_NAME(800000, DR8_2, Full),
+ __DEFINE_LINK_MODE_NAME(800000, SR8, Full),
+ __DEFINE_LINK_MODE_NAME(800000, VR8, Full),
+ __DEFINE_LINK_MODE_NAME(10, T1S, Full),
+ __DEFINE_LINK_MODE_NAME(10, T1S, Half),
+ __DEFINE_LINK_MODE_NAME(10, T1S_P2MP, Half),
+ __DEFINE_LINK_MODE_NAME(10, T1BRR, Full),
+ __DEFINE_LINK_MODE_NAME(200000, CR, Full),
+ __DEFINE_LINK_MODE_NAME(200000, KR, Full),
+ __DEFINE_LINK_MODE_NAME(200000, DR, Full),
+ __DEFINE_LINK_MODE_NAME(200000, DR_2, Full),
+ __DEFINE_LINK_MODE_NAME(200000, SR, Full),
+ __DEFINE_LINK_MODE_NAME(200000, VR, Full),
+ __DEFINE_LINK_MODE_NAME(400000, CR2, Full),
+ __DEFINE_LINK_MODE_NAME(400000, KR2, Full),
+ __DEFINE_LINK_MODE_NAME(400000, DR2, Full),
+ __DEFINE_LINK_MODE_NAME(400000, DR2_2, Full),
+ __DEFINE_LINK_MODE_NAME(400000, SR2, Full),
+ __DEFINE_LINK_MODE_NAME(400000, VR2, Full),
+ __DEFINE_LINK_MODE_NAME(800000, CR4, Full),
+ __DEFINE_LINK_MODE_NAME(800000, KR4, Full),
+ __DEFINE_LINK_MODE_NAME(800000, DR4, Full),
+ __DEFINE_LINK_MODE_NAME(800000, DR4_2, Full),
+ __DEFINE_LINK_MODE_NAME(800000, SR4, Full),
+ __DEFINE_LINK_MODE_NAME(800000, VR4, Full),
+ __DEFINE_LINK_MODE_NAME(1600000, CR8, Full),
+ __DEFINE_LINK_MODE_NAME(1600000, KR8, Full),
+ __DEFINE_LINK_MODE_NAME(1600000, DR8, Full),
+ __DEFINE_LINK_MODE_NAME(1600000, DR8_2, Full),
+};
+static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS);
+/* Lane count per link mode type; pasted onto _type by __DEFINE_LINK_MODE_PARAMS(). */
+#define __LINK_MODE_LANES_CR 1
+#define __LINK_MODE_LANES_CR2 2
+#define __LINK_MODE_LANES_CR4 4
+#define __LINK_MODE_LANES_CR8 8
+#define __LINK_MODE_LANES_DR 1
+#define __LINK_MODE_LANES_DR_2 1
+#define __LINK_MODE_LANES_DR2 2
+#define __LINK_MODE_LANES_DR2_2 2
+#define __LINK_MODE_LANES_DR4 4
+#define __LINK_MODE_LANES_DR4_2 4
+#define __LINK_MODE_LANES_DR8 8
+#define __LINK_MODE_LANES_KR 1
+#define __LINK_MODE_LANES_KR2 2
+#define __LINK_MODE_LANES_KR4 4
+#define __LINK_MODE_LANES_KR8 8
+#define __LINK_MODE_LANES_SR 1
+#define __LINK_MODE_LANES_SR2 2
+#define __LINK_MODE_LANES_SR4 4
+#define __LINK_MODE_LANES_SR8 8
+#define __LINK_MODE_LANES_ER 1
+#define __LINK_MODE_LANES_KX 1
+#define __LINK_MODE_LANES_KX4 4
+#define __LINK_MODE_LANES_LR 1
+#define __LINK_MODE_LANES_LR4 4
+#define __LINK_MODE_LANES_LR4_ER4 4
+#define __LINK_MODE_LANES_LR_ER_FR 1
+#define __LINK_MODE_LANES_LR2_ER2_FR2 2
+#define __LINK_MODE_LANES_LR4_ER4_FR4 4
+#define __LINK_MODE_LANES_LR8_ER8_FR8 8
+#define __LINK_MODE_LANES_LRM 1
+#define __LINK_MODE_LANES_MLD2 2
+#define __LINK_MODE_LANES_T 1
+#define __LINK_MODE_LANES_T1 1
+#define __LINK_MODE_LANES_X 1
+#define __LINK_MODE_LANES_FX 1
+#define __LINK_MODE_LANES_T1L 1
+#define __LINK_MODE_LANES_T1S 1
+#define __LINK_MODE_LANES_T1S_P2MP 1
+#define __LINK_MODE_LANES_VR 1
+#define __LINK_MODE_LANES_VR2 2
+#define __LINK_MODE_LANES_VR4 4
+#define __LINK_MODE_LANES_VR8 8
+#define __LINK_MODE_LANES_DR8_2 8
+#define __LINK_MODE_LANES_T1BRR 1
+/* Helpers building link_mode_params[] entries; special modes get UNKNOWN speed/duplex and 0 lanes. */
+#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \
+ [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \
+ .speed = SPEED_ ## _speed, \
+ .lanes = __LINK_MODE_LANES_ ## _type, \
+ .duplex = __DUPLEX_ ## _duplex \
+ }
+#define __DUPLEX_Half DUPLEX_HALF
+#define __DUPLEX_Full DUPLEX_FULL
+#define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \
+ [ETHTOOL_LINK_MODE_ ## _mode ## _BIT] = { \
+ .speed = SPEED_UNKNOWN, \
+ .lanes = 0, \
+ .duplex = DUPLEX_UNKNOWN, \
+ }
+/* Speed, lane count and duplex of each link mode bit; size asserted against the bit count below. */
+const struct link_mode_info link_mode_params[] = {
+ __DEFINE_LINK_MODE_PARAMS(10, T, Half),
+ __DEFINE_LINK_MODE_PARAMS(10, T, Full),
+ __DEFINE_LINK_MODE_PARAMS(100, T, Half),
+ __DEFINE_LINK_MODE_PARAMS(100, T, Full),
+ __DEFINE_LINK_MODE_PARAMS(1000, T, Half),
+ __DEFINE_LINK_MODE_PARAMS(1000, T, Full),
+ __DEFINE_SPECIAL_MODE_PARAMS(Autoneg),
+ __DEFINE_SPECIAL_MODE_PARAMS(TP),
+ __DEFINE_SPECIAL_MODE_PARAMS(AUI),
+ __DEFINE_SPECIAL_MODE_PARAMS(MII),
+ __DEFINE_SPECIAL_MODE_PARAMS(FIBRE),
+ __DEFINE_SPECIAL_MODE_PARAMS(BNC),
+ __DEFINE_LINK_MODE_PARAMS(10000, T, Full),
+ __DEFINE_SPECIAL_MODE_PARAMS(Pause),
+ __DEFINE_SPECIAL_MODE_PARAMS(Asym_Pause),
+ __DEFINE_LINK_MODE_PARAMS(2500, X, Full),
+ __DEFINE_SPECIAL_MODE_PARAMS(Backplane),
+ __DEFINE_LINK_MODE_PARAMS(1000, KX, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, KX4, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, KR, Full),
+ [ETHTOOL_LINK_MODE_10000baseR_FEC_BIT] = {
+ .speed = SPEED_10000,
+ .lanes = 1,
+ .duplex = DUPLEX_FULL,
+ },
+ __DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full),
+ __DEFINE_LINK_MODE_PARAMS(20000, KR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(40000, KR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(40000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(40000, SR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(40000, LR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(56000, KR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(56000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(56000, SR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(56000, LR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(25000, CR, Full),
+ __DEFINE_LINK_MODE_PARAMS(25000, KR, Full),
+ __DEFINE_LINK_MODE_PARAMS(25000, SR, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, CR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, KR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, KR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, SR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, LR4_ER4, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, SR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(1000, X, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, CR, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, SR, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, LR, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, LRM, Full),
+ __DEFINE_LINK_MODE_PARAMS(10000, ER, Full),
+ __DEFINE_LINK_MODE_PARAMS(2500, T, Full),
+ __DEFINE_LINK_MODE_PARAMS(5000, T, Full),
+ __DEFINE_SPECIAL_MODE_PARAMS(FEC_NONE),
+ __DEFINE_SPECIAL_MODE_PARAMS(FEC_RS),
+ __DEFINE_SPECIAL_MODE_PARAMS(FEC_BASER),
+ __DEFINE_LINK_MODE_PARAMS(50000, KR, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, SR, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, CR, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, LR_ER_FR, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, DR, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, KR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, SR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, CR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, LR2_ER2_FR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, DR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, KR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, SR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, LR4_ER4_FR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, DR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(100, T1, Full),
+ __DEFINE_LINK_MODE_PARAMS(1000, T1, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, KR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, SR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full),
+ __DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS),
+ __DEFINE_LINK_MODE_PARAMS(100000, KR, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, SR, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, LR_ER_FR, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, DR, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, CR, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, LR2_ER2_FR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, KR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(100, FX, Half),
+ __DEFINE_LINK_MODE_PARAMS(100, FX, Full),
+ __DEFINE_LINK_MODE_PARAMS(10, T1L, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, CR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, KR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, DR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, DR8_2, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, SR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, VR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(10, T1S, Full),
+ __DEFINE_LINK_MODE_PARAMS(10, T1S, Half),
+ __DEFINE_LINK_MODE_PARAMS(10, T1S_P2MP, Half),
+ __DEFINE_LINK_MODE_PARAMS(10, T1BRR, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, CR, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, KR, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, DR, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, DR_2, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, SR, Full),
+ __DEFINE_LINK_MODE_PARAMS(200000, VR, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, CR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, KR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, DR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, DR2_2, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, SR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(400000, VR2, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, CR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, KR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, DR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, DR4_2, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, SR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(800000, VR4, Full),
+ __DEFINE_LINK_MODE_PARAMS(1600000, CR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(1600000, KR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(1600000, DR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(1600000, DR8_2, Full),
+};
+static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS);
+EXPORT_SYMBOL_GPL(link_mode_params);
+/* String name of each netif message class, indexed by NETIF_MSG_*_BIT. */
+const char netif_msg_class_names[][ETH_GSTRING_LEN] = {
+ [NETIF_MSG_DRV_BIT] = "drv",
+ [NETIF_MSG_PROBE_BIT] = "probe",
+ [NETIF_MSG_LINK_BIT] = "link",
+ [NETIF_MSG_TIMER_BIT] = "timer",
+ [NETIF_MSG_IFDOWN_BIT] = "ifdown",
+ [NETIF_MSG_IFUP_BIT] = "ifup",
+ [NETIF_MSG_RX_ERR_BIT] = "rx_err",
+ [NETIF_MSG_TX_ERR_BIT] = "tx_err",
+ [NETIF_MSG_TX_QUEUED_BIT] = "tx_queued",
+ [NETIF_MSG_INTR_BIT] = "intr",
+ [NETIF_MSG_TX_DONE_BIT] = "tx_done",
+ [NETIF_MSG_RX_STATUS_BIT] = "rx_status",
+ [NETIF_MSG_PKTDATA_BIT] = "pktdata",
+ [NETIF_MSG_HW_BIT] = "hw",
+ [NETIF_MSG_WOL_BIT] = "wol",
+};
+static_assert(ARRAY_SIZE(netif_msg_class_names) == NETIF_MSG_CLASS_COUNT);
+/* String name of each wake-on-LAN mode, indexed by const_ilog2() of its WAKE_* flag. */
+const char wol_mode_names[][ETH_GSTRING_LEN] = {
+ [const_ilog2(WAKE_PHY)] = "phy",
+ [const_ilog2(WAKE_UCAST)] = "ucast",
+ [const_ilog2(WAKE_MCAST)] = "mcast",
+ [const_ilog2(WAKE_BCAST)] = "bcast",
+ [const_ilog2(WAKE_ARP)] = "arp",
+ [const_ilog2(WAKE_MAGIC)] = "magic",
+ [const_ilog2(WAKE_MAGICSECURE)] = "magicsecure",
+ [const_ilog2(WAKE_FILTER)] = "filter",
+};
+static_assert(ARRAY_SIZE(wol_mode_names) == WOL_MODE_COUNT);
+/* String name of each SOF_TIMESTAMPING_* flag, indexed by const_ilog2() of the flag. */
+const char sof_timestamping_names[][ETH_GSTRING_LEN] = {
+ [const_ilog2(SOF_TIMESTAMPING_TX_HARDWARE)] = "hardware-transmit",
+ [const_ilog2(SOF_TIMESTAMPING_TX_SOFTWARE)] = "software-transmit",
+ [const_ilog2(SOF_TIMESTAMPING_RX_HARDWARE)] = "hardware-receive",
+ [const_ilog2(SOF_TIMESTAMPING_RX_SOFTWARE)] = "software-receive",
+ [const_ilog2(SOF_TIMESTAMPING_SOFTWARE)] = "software-system-clock",
+ [const_ilog2(SOF_TIMESTAMPING_SYS_HARDWARE)] = "hardware-legacy-clock",
+ [const_ilog2(SOF_TIMESTAMPING_RAW_HARDWARE)] = "hardware-raw-clock",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_ID)] = "option-id",
+ [const_ilog2(SOF_TIMESTAMPING_TX_SCHED)] = "sched-transmit",
+ [const_ilog2(SOF_TIMESTAMPING_TX_ACK)] = "ack-transmit",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_CMSG)] = "option-cmsg",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_TSONLY)] = "option-tsonly",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_STATS)] = "option-stats",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_PKTINFO)] = "option-pktinfo",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_TX_SWHW)] = "option-tx-swhw",
+ [const_ilog2(SOF_TIMESTAMPING_BIND_PHC)] = "bind-phc",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_ID_TCP)] = "option-id-tcp",
+ [const_ilog2(SOF_TIMESTAMPING_OPT_RX_FILTER)] = "option-rx-filter",
+ [const_ilog2(SOF_TIMESTAMPING_TX_COMPLETION)] = "tx-completion",
+};
+static_assert(ARRAY_SIZE(sof_timestamping_names) == __SOF_TIMESTAMPING_CNT);
+
+/* Names of the hardware Tx timestamping modes, indexed directly by the
+ * HWTSTAMP_TX_* value. The static_assert ties the table length to
+ * __HWTSTAMP_TX_CNT.
+ */
+const char ts_tx_type_names[][ETH_GSTRING_LEN] = {
+ [HWTSTAMP_TX_OFF] = "off",
+ [HWTSTAMP_TX_ON] = "on",
+ [HWTSTAMP_TX_ONESTEP_SYNC] = "onestep-sync",
+ [HWTSTAMP_TX_ONESTEP_P2P] = "onestep-p2p",
+};
+static_assert(ARRAY_SIZE(ts_tx_type_names) == __HWTSTAMP_TX_CNT);
+
+/* Names of the hardware Rx timestamping filters, indexed directly by the
+ * HWTSTAMP_FILTER_* value. The static_assert ties the table length to
+ * __HWTSTAMP_FILTER_CNT.
+ */
+const char ts_rx_filter_names[][ETH_GSTRING_LEN] = {
+ [HWTSTAMP_FILTER_NONE] = "none",
+ [HWTSTAMP_FILTER_ALL] = "all",
+ [HWTSTAMP_FILTER_SOME] = "some",
+ [HWTSTAMP_FILTER_PTP_V1_L4_EVENT] = "ptpv1-l4-event",
+ [HWTSTAMP_FILTER_PTP_V1_L4_SYNC] = "ptpv1-l4-sync",
+ [HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ] = "ptpv1-l4-delay-req",
+ [HWTSTAMP_FILTER_PTP_V2_L4_EVENT] = "ptpv2-l4-event",
+ [HWTSTAMP_FILTER_PTP_V2_L4_SYNC] = "ptpv2-l4-sync",
+ [HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ] = "ptpv2-l4-delay-req",
+ [HWTSTAMP_FILTER_PTP_V2_L2_EVENT] = "ptpv2-l2-event",
+ [HWTSTAMP_FILTER_PTP_V2_L2_SYNC] = "ptpv2-l2-sync",
+ [HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ] = "ptpv2-l2-delay-req",
+ [HWTSTAMP_FILTER_PTP_V2_EVENT] = "ptpv2-event",
+ [HWTSTAMP_FILTER_PTP_V2_SYNC] = "ptpv2-sync",
+ [HWTSTAMP_FILTER_PTP_V2_DELAY_REQ] = "ptpv2-delay-req",
+ [HWTSTAMP_FILTER_NTP_ALL] = "ntp-all",
+};
+static_assert(ARRAY_SIZE(ts_rx_filter_names) == __HWTSTAMP_FILTER_CNT);
+
+/* Names of the HWTSTAMP_FLAG_* flags, indexed by the bit position
+ * (const_ilog2) of each flag. The static_assert ties the table length to
+ * __HWTSTAMP_FLAG_CNT.
+ */
+const char ts_flags_names[][ETH_GSTRING_LEN] = {
+ [const_ilog2(HWTSTAMP_FLAG_BONDED_PHC_INDEX)] = "bonded-phc-index",
+};
+static_assert(ARRAY_SIZE(ts_flags_names) == __HWTSTAMP_FLAG_CNT);
+
+/* Names of the UDP tunnel types, indexed directly by the
+ * ETHTOOL_UDP_TUNNEL_TYPE_* value. The static_assert ties the table length
+ * to __ETHTOOL_UDP_TUNNEL_TYPE_CNT.
+ */
+const char udp_tunnel_type_names[][ETH_GSTRING_LEN] = {
+ [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN] = "vxlan",
+ [ETHTOOL_UDP_TUNNEL_TYPE_GENEVE] = "geneve",
+ [ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE] = "vxlan-gpe",
+};
+static_assert(ARRAY_SIZE(udp_tunnel_type_names) ==
+ __ETHTOOL_UDP_TUNNEL_TYPE_CNT);
+
+/* Translate a legacy struct ethtool_cmd into a struct
+ * ethtool_link_ksettings. The destination is zeroed and then completely
+ * filled in, whatever the return value.
+ *
+ * Returns false when the legacy structure carries a non-zero value in one
+ * of the deprecated maxtxpkt/maxrxpkt fields, true otherwise.
+ */
+bool
+convert_legacy_settings_to_link_ksettings(
+	struct ethtool_link_ksettings *link_ksettings,
+	const struct ethtool_cmd *legacy_settings)
+{
+	memset(link_ksettings, 0, sizeof(*link_ksettings));
+
+	/* Legacy u32 masks become link mode bitmaps */
+	ethtool_convert_legacy_u32_to_link_mode(
+		link_ksettings->link_modes.supported,
+		legacy_settings->supported);
+	ethtool_convert_legacy_u32_to_link_mode(
+		link_ksettings->link_modes.advertising,
+		legacy_settings->advertising);
+	ethtool_convert_legacy_u32_to_link_mode(
+		link_ksettings->link_modes.lp_advertising,
+		legacy_settings->lp_advertising);
+
+	/* Scalar fields are copied one to one */
+	link_ksettings->base.speed = ethtool_cmd_speed(legacy_settings);
+	link_ksettings->base.duplex = legacy_settings->duplex;
+	link_ksettings->base.port = legacy_settings->port;
+	link_ksettings->base.phy_address = legacy_settings->phy_address;
+	link_ksettings->base.autoneg = legacy_settings->autoneg;
+	link_ksettings->base.mdio_support = legacy_settings->mdio_support;
+	link_ksettings->base.eth_tp_mdix = legacy_settings->eth_tp_mdix;
+	link_ksettings->base.eth_tp_mdix_ctrl =
+		legacy_settings->eth_tp_mdix_ctrl;
+
+	/* A false result tells users that the driver is still using the
+	 * deprecated legacy fields, and that they should not use
+	 * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS
+	 */
+	return !(legacy_settings->maxtxpkt || legacy_settings->maxrxpkt);
+}
+
+/* Report link state through the driver's get_link callback.
+ *
+ * Returns -EOPNOTSUPP when the driver has no get_link callback, 0 when the
+ * device is not running or the callback reports no link, 1 otherwise.
+ */
+int __ethtool_get_link(struct net_device *dev)
+{
+	if (!dev->ethtool_ops->get_link)
+		return -EOPNOTSUPP;
+
+	/* The driver is only asked while the interface is running */
+	if (!netif_running(dev))
+		return 0;
+
+	return dev->ethtool_ops->get_link(dev) ? 1 : 0;
+}
+
+/* Query the number of Rx rings of @dev.
+ *
+ * Uses the get_rx_ring_count callback when the driver has one, otherwise
+ * falls back to an ETHTOOL_GRXRINGS request through get_rxnfc.
+ *
+ * Returns the ring count, -EOPNOTSUPP when neither callback exists, or the
+ * negative error reported by get_rxnfc.
+ */
+int ethtool_get_rx_ring_count(struct net_device *dev)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_rxnfc query = { .cmd = ETHTOOL_GRXRINGS };
+	int err;
+
+	if (ops->get_rx_ring_count)
+		return ops->get_rx_ring_count(dev);
+
+	if (!ops->get_rxnfc)
+		return -EOPNOTSUPP;
+
+	err = ops->get_rxnfc(dev, &query, NULL);
+	if (err < 0)
+		return err;
+
+	return query.data;
+}
+
+/* Ask the driver how many rxnfc rules exist (ETHTOOL_GRXCLSRLCNT).
+ * The caller must have checked that get_rxnfc is implemented.
+ *
+ * Returns the rule count, or the non-zero value returned by get_rxnfc.
+ */
+static int ethtool_get_rxnfc_rule_count(struct net_device *dev)
+{
+	struct ethtool_rxnfc query = {};
+	int rc;
+
+	query.cmd = ETHTOOL_GRXCLSRLCNT;
+	rc = dev->ethtool_ops->get_rxnfc(dev, &query, NULL);
+	if (rc != 0)
+		return rc;
+
+	return query.rule_cnt;
+}
+
+/* Largest value stored in the indirection table of one RSS context.
+ * Warns once and returns 0 when @ctx is NULL.
+ */
+static u32 ethtool_get_rss_ctx_max_channel(struct ethtool_rxfh_context *ctx)
+{
+	u32 highest = 0;
+	u32 *table;
+	u32 pos;
+
+	if (WARN_ON_ONCE(!ctx))
+		return 0;
+
+	table = ethtool_rxfh_context_indir(ctx);
+	for (pos = 0; pos < ctx->indir_size; pos++) {
+		if (table[pos] > highest)
+			highest = table[pos];
+	}
+
+	return highest;
+}
+
+/* Find the highest Rx ring targeted by the rxnfc rules of @dev.
+ *
+ * Rules that discard, wake, or address a VF ring are ignored. For rules
+ * with FLOW_RSS set, the largest indirection table entry of the referenced
+ * RSS context is added to the ring cookie.
+ *
+ * On success stores the result in @max and returns 0. Returns -EOPNOTSUPP
+ * without get_rxnfc, -EINVAL when the rule count is zero or negative,
+ * -ENOMEM on allocation failure, or the error returned by get_rxnfc;
+ * @max is left untouched in all those cases.
+ */
+static int ethtool_get_max_rxnfc_channel(struct net_device *dev, u64 *max)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_rxnfc *rules;
+	int nrules, idx, rc;
+	u64 highest = 0;
+
+	if (!ops->get_rxnfc)
+		return -EOPNOTSUPP;
+
+	nrules = ethtool_get_rxnfc_rule_count(dev);
+	if (nrules <= 0)
+		return -EINVAL;
+
+	rules = kvzalloc(struct_size(rules, rule_locs, nrules), GFP_KERNEL);
+	if (!rules)
+		return -ENOMEM;
+
+	/* Fetch the locations of all rules first */
+	rules->cmd = ETHTOOL_GRXCLSRLALL;
+	rules->rule_cnt = nrules;
+	rc = ops->get_rxnfc(dev, rules, rules->rule_locs);
+	if (rc)
+		goto out_free;
+
+	/* Then inspect every rule individually */
+	for (idx = 0; idx < nrules; idx++) {
+		struct ethtool_rxnfc rule = {
+			.cmd = ETHTOOL_GRXCLSRULE,
+			.fs.location = rules->rule_locs[idx],
+		};
+		u64 ring;
+
+		rc = ops->get_rxnfc(dev, &rule, NULL);
+		if (rc)
+			goto out_free;
+
+		ring = rule.fs.ring_cookie;
+		if (ring == RX_CLS_FLOW_DISC || ring == RX_CLS_FLOW_WAKE)
+			continue;
+		if (ethtool_get_flow_spec_ring_vf(ring))
+			continue;
+
+		if (rule.flow_type & FLOW_RSS) {
+			struct ethtool_rxfh_context *ctx;
+
+			ctx = xa_load(&dev->ethtool->rss_ctx,
+				      rule.rss_context);
+			ring += ethtool_get_rss_ctx_max_channel(ctx);
+		}
+
+		if (ring > highest)
+			highest = ring;
+	}
+
+	/* rc is 0 here: every get_rxnfc call above succeeded */
+	*max = highest;
+
+out_free:
+	kvfree(rules);
+	return rc;
+}
+
+/* Largest indirection table entry over all RSS contexts stored in
+ * dev->ethtool->rss_ctx. The walk is done with rss_lock held.
+ */
+static u32 ethtool_get_max_rss_ctx_channel(struct net_device *dev)
+{
+	struct ethtool_rxfh_context *ctx;
+	unsigned long id;
+	u32 highest = 0;
+
+	mutex_lock(&dev->ethtool->rss_lock);
+	xa_for_each(&dev->ethtool->rss_ctx, id, ctx) {
+		u32 ctx_max = ethtool_get_rss_ctx_max_channel(ctx);
+
+		if (ctx_max > highest)
+			highest = ctx_max;
+	}
+	mutex_unlock(&dev->ethtool->rss_lock);
+
+	return highest;
+}
+
+/* Highest value found in any RSS indirection table of @dev.
+ *
+ * Starts from the maximum over the RSS contexts tracked in
+ * dev->ethtool->rss_ctx. If netif_is_rxfh_configured() is true and the
+ * driver implements both get_rxfh_indir_size and get_rxfh, the entries of
+ * the table returned by get_rxfh for a zero-initialized request are folded
+ * in as well.
+ *
+ * Returns U32_MAX when the temporary table cannot be allocated or get_rxfh
+ * fails.
+ */
+static u32 ethtool_get_max_rxfh_channel(struct net_device *dev)
+{
+ struct ethtool_rxfh_param rxfh = {};
+ u32 dev_size, current_max = 0;
+ int ret;
+
+ /* While we do track whether RSS context has an indirection
+ * table explicitly set by the user, no driver looks at that bit.
+ * Assume drivers won't auto-regenerate the additional tables,
+ * to be safe.
+ */
+ current_max = ethtool_get_max_rss_ctx_channel(dev);
+
+ if (!netif_is_rxfh_configured(dev))
+ return current_max;
+
+ if (!dev->ethtool_ops->get_rxfh_indir_size ||
+ !dev->ethtool_ops->get_rxfh)
+ return current_max;
+ dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
+ return current_max;
+
+ rxfh.indir = kcalloc(dev_size, sizeof(rxfh.indir[0]), GFP_USER);
+ if (!rxfh.indir)
+ return U32_MAX;
+
+ /* The driver callback is invoked with rss_lock held */
+ mutex_lock(&dev->ethtool->rss_lock);
+ ret = dev->ethtool_ops->get_rxfh(dev, &rxfh);
+ mutex_unlock(&dev->ethtool->rss_lock);
+ if (ret) {
+ current_max = U32_MAX;
+ goto out_free;
+ }
+
+ /* Walk the table from the last entry down to index 0 */
+ while (dev_size--)
+ current_max = max(current_max, rxfh.indir[dev_size]);
+
+out_free:
+ kfree(rxfh.indir);
+ return current_max;
+}
+
+/* Check that a requested channel configuration still covers every Rx
+ * channel referenced by existing configuration.
+ *
+ * @dev: device whose channel counts are about to change
+ * @channels: requested counts; only combined_count + rx_count is examined
+ * @info: optional genetlink request info; when non-NULL, an extack message
+ *        describing the conflict is set on failure
+ *
+ * Returns 0 when the request is acceptable, -EINVAL when
+ * combined_count + rx_count is not greater than the highest channel
+ * reported in use by the RSS indirection tables, by the rxnfc (ntuple)
+ * rules, or by the memory provider setting.
+ */
+int ethtool_check_max_channel(struct net_device *dev,
+			      struct ethtool_channels channels,
+			      struct genl_info *info)
+{
+	u64 max_rxnfc_in_use;
+	u32 max_rxfh_in_use;
+	int max_mp_in_use;
+
+	/* ensure the new Rx count fits within the configured Rx flow
+	 * indirection table/rxnfc settings
+	 */
+	/* A failed rxnfc query is treated as "no rule in use" */
+	if (ethtool_get_max_rxnfc_channel(dev, &max_rxnfc_in_use))
+		max_rxnfc_in_use = 0;
+	max_rxfh_in_use = ethtool_get_max_rxfh_channel(dev);
+	if (channels.combined_count + channels.rx_count <= max_rxfh_in_use) {
+		/* max_rxfh_in_use is a u32 and can be U32_MAX (error
+		 * sentinel of the helper), so it is printed with %u; %d
+		 * would show it as -1.
+		 */
+		if (info)
+			GENL_SET_ERR_MSG_FMT(info, "requested channel counts are too low for existing indirection table (%u)", max_rxfh_in_use);
+		return -EINVAL;
+	}
+	if (channels.combined_count + channels.rx_count <= max_rxnfc_in_use) {
+		if (info)
+			GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing ntuple filter settings");
+		return -EINVAL;
+	}
+
+	max_mp_in_use = dev_get_min_mp_channel_count(dev);
+	if (channels.combined_count + channels.rx_count <= max_mp_in_use) {
+		if (info)
+			GENL_SET_ERR_MSG_FMT(info, "requested channel counts are too low for existing memory provider setting (%d)", max_mp_in_use);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Check whether any rxnfc rule of @dev references RSS context
+ * @rss_context.
+ *
+ * Returns 0 when the driver has no get_rxnfc callback, when it reports
+ * zero rules, or when no rule with FLOW_RSS set references @rss_context;
+ * -EBUSY when such a rule exists; -EINVAL when the rule count query
+ * returns a negative value; -ENOMEM on allocation failure; otherwise the
+ * non-zero value returned by get_rxnfc.
+ */
+int ethtool_check_rss_ctx_busy(struct net_device *dev, u32 rss_context)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc *info;
+ int rc, i, rule_cnt;
+
+ if (!ops->get_rxnfc)
+ return 0;
+
+ rule_cnt = ethtool_get_rxnfc_rule_count(dev);
+ if (!rule_cnt)
+ return 0;
+
+ if (rule_cnt < 0)
+ return -EINVAL;
+
+ info = kvzalloc(struct_size(info, rule_locs, rule_cnt), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
+ /* Fetch the locations of all rules */
+ info->cmd = ETHTOOL_GRXCLSRLALL;
+ info->rule_cnt = rule_cnt;
+ rc = ops->get_rxnfc(dev, info, info->rule_locs);
+ if (rc)
+ goto out_free;
+
+ for (i = 0; i < rule_cnt; i++) {
+ struct ethtool_rxnfc rule_info = {
+ .cmd = ETHTOOL_GRXCLSRULE,
+ .fs.location = info->rule_locs[i],
+ };
+
+ rc = ops->get_rxnfc(dev, &rule_info, NULL);
+ if (rc)
+ goto out_free;
+
+ if (rule_info.fs.flow_type & FLOW_RSS &&
+ rule_info.rss_context == rss_context) {
+ rc = -EBUSY;
+ goto out_free;
+ }
+ }
+
+ /* Falling out of the loop leaves rc at 0 from the last get_rxnfc call */
+out_free:
+ kvfree(info);
+ return rc;
+}
+
+/* Allocate and pre-initialize an RSS context.
+ *
+ * @ops: driver ops; rxfh_priv_size, rxfh_indir_space and rxfh_key_space
+ *       are read to size the allocation
+ * @indir_size: number of indirection table entries recorded in the context
+ * @key_size: hash key size recorded in the context
+ *
+ * The flexible data[] area holds, in order: the driver private area
+ * (rounded up to a multiple of sizeof(u32)), room for
+ * max(@indir_size, ops->rxfh_indir_space) u32 entries, and room for
+ * max(@key_size, ops->rxfh_key_space) key bytes starting at key_off.
+ *
+ * Returns the zero-initialized context with hfunc and input_xfrm set to
+ * their "no change" values, or NULL when the allocation fails.
+ */
+struct ethtool_rxfh_context *
+ethtool_rxfh_ctx_alloc(const struct ethtool_ops *ops,
+		       u32 indir_size, u32 key_size)
+{
+	size_t indir_bytes, flex_len, key_off, size;
+	struct ethtool_rxfh_context *ctx;
+	/* key_max is u32 like key_size: a narrower type would silently
+	 * truncate a large key size and under-size the allocation while
+	 * ctx->key_size still records the full value.
+	 */
+	u32 priv_bytes, indir_max, key_max;
+
+	key_max = max(key_size, ops->rxfh_key_space);
+	indir_max = max(indir_size, ops->rxfh_indir_space);
+
+	priv_bytes = ALIGN(ops->rxfh_priv_size, sizeof(u32));
+	indir_bytes = array_size(indir_max, sizeof(u32));
+
+	/* The key is placed right behind the private area and the table */
+	key_off = size_add(priv_bytes, indir_bytes);
+	flex_len = size_add(key_off, key_max);
+	size = struct_size_t(struct ethtool_rxfh_context, data, flex_len);
+
+	ctx = kzalloc(size, GFP_KERNEL_ACCOUNT);
+	if (!ctx)
+		return NULL;
+
+	ctx->indir_size = indir_size;
+	ctx->key_size = key_size;
+	ctx->key_off = key_off;
+	ctx->priv_size = ops->rxfh_priv_size;
+
+	ctx->hfunc = ETH_RSS_HASH_NO_CHANGE;
+	ctx->input_xfrm = RXH_XFRM_NO_CHANGE;
+
+	return ctx;
+}
+
+/* Tell whether a flow hash field configuration is symmetric.
+ *
+ * The configuration is symmetric when it uses only the IP src/dst and L4
+ * byte 0-1/2-3 fields, and each src field is selected exactly when its dst
+ * counterpart is. Returns 1 if symmetric, 0 otherwise.
+ */
+int ethtool_rxfh_config_is_sym(u64 rxfh)
+{
+	const u64 sym_fields = RXH_IP_SRC | RXH_IP_DST |
+			       RXH_L4_B_0_1 | RXH_L4_B_2_3;
+
+	/* Any field outside the paired set breaks symmetry */
+	if (rxfh & ~sym_fields)
+		return false;
+
+	if (!(rxfh & RXH_IP_SRC) != !(rxfh & RXH_IP_DST))
+		return false;
+
+	if (!(rxfh & RXH_L4_B_0_1) != !(rxfh & RXH_L4_B_2_3))
+		return false;
+
+	return true;
+}
+
+/* Sanity-check a driver's ethtool_ops.
+ *
+ * Each failed check triggers a WARN and makes the function return -EINVAL:
+ *  - set_coalesce is set but supported_coalesce_params is zero
+ *  - rxfh_max_num_contexts equals 1
+ *  - supported_input_xfrm is set but get_rxfh_fields is missing
+ *  - supported_input_xfrm is set and rxfh_per_ctx_fields differs from
+ *    rxfh_per_ctx_key
+ *
+ * Returns 0 when all checks pass.
+ */
+int ethtool_check_ops(const struct ethtool_ops *ops)
+{
+ if (WARN_ON(ops->set_coalesce && !ops->supported_coalesce_params))
+ return -EINVAL;
+ if (WARN_ON(ops->rxfh_max_num_contexts == 1))
+ return -EINVAL;
+ if (WARN_ON(ops->supported_input_xfrm && !ops->get_rxfh_fields))
+ return -EINVAL;
+ if (WARN_ON(ops->supported_input_xfrm &&
+ ops->rxfh_per_ctx_fields != ops->rxfh_per_ctx_key))
+ return -EINVAL;
+
+ /* NOTE: sufficiently insane drivers may swap ethtool_ops at runtime,
+ * the fact that ops are checked at registration time does not
+ * mean the ops attached to a netdev later on are sane.
+ */
+ return 0;
+}
+
+/* Read the ring parameters of @dev, reporting configuration rather than
+ * state for the header-data-split fields.
+ *
+ * Both output structures are zeroed, filled by the driver's get_ringparam
+ * callback (called unconditionally), and then tcp_data_split/hds_thresh
+ * are overwritten with the values stored in dev->cfg.
+ */
+void ethtool_ringparam_get_cfg(struct net_device *dev,
+			       struct ethtool_ringparam *param,
+			       struct kernel_ethtool_ringparam *kparam,
+			       struct netlink_ext_ack *extack)
+{
+	memset(kparam, 0, sizeof(*kparam));
+	memset(param, 0, sizeof(*param));
+	param->cmd = ETHTOOL_GRINGPARAM;
+
+	dev->ethtool_ops->get_ringparam(dev, param, kparam, extack);
+
+	/* Driver gives us current state, we want to return current config */
+	kparam->hds_thresh = dev->cfg->hds_thresh;
+	kparam->tcp_data_split = dev->cfg->hds_config;
+}
+
+/* Reset @info to its starting state for a timestamping query: everything
+ * zeroed, cmd set to ETHTOOL_GET_TS_INFO and phc_index set to -1.
+ */
+static void ethtool_init_tsinfo(struct kernel_ethtool_ts_info *info)
+{
+ memset(info, 0, sizeof(*info));
+ info->cmd = ETHTOOL_GET_TS_INFO;
+ info->phc_index = -1;
+}
+
+/* Fill @info from the netdev's own get_ts_info callback and check that it
+ * describes the provider in @hwprov_desc.
+ *
+ * Returns 0 when the reported phc_index equals hwprov_desc->index and the
+ * device supports hwprov_desc->qualifier, -EOPNOTSUPP when the driver has
+ * no get_ts_info callback, the callback's error if it fails, and -ENODEV
+ * when the provider does not match.
+ */
+int ethtool_net_get_ts_info_by_phc(struct net_device *dev,
+				   struct kernel_ethtool_ts_info *info,
+				   struct hwtstamp_provider_desc *hwprov_desc)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	int rc;
+
+	if (!ops->get_ts_info)
+		return -EOPNOTSUPP;
+
+	/* Ask the netdev driver, passing the requested qualifier along */
+	ethtool_init_tsinfo(info);
+	info->phc_qualifier = hwprov_desc->qualifier;
+	rc = ops->get_ts_info(dev, info);
+	if (rc)
+		return rc;
+
+	if (info->phc_index != hwprov_desc->index)
+		return -ENODEV;
+
+	if (!net_support_hwtstamp_qualifier(dev, hwprov_desc->qualifier))
+		return -ENODEV;
+
+	return 0;
+}
+
+/* Find the PHY whose PHC index matches @hwprov_desc and fill @info from it.
+ *
+ * Only HWTSTAMP_PROVIDER_QUALIFIER_PRECISE is accepted. When the device
+ * has a link topology, the PHYs of that topology are scanned and
+ * dev->phydev is not consulted; otherwise dev->phydev is checked.
+ *
+ * Returns the matching phy_device, ERR_PTR(-ENODEV) when no PHY matches,
+ * or ERR_PTR() of the error returned by phy_ts_info(). @info is
+ * re-initialized before each PHY is queried, so on -ENODEV it may hold the
+ * data of the last PHY examined.
+ */
+struct phy_device *
+ethtool_phy_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc)
+{
+ int err;
+
+ /* Only precise qualifier is supported in phydev */
+ if (hwprov_desc->qualifier != HWTSTAMP_PROVIDER_QUALIFIER_PRECISE)
+ return ERR_PTR(-ENODEV);
+
+ /* Look in the phy topology */
+ if (dev->link_topo) {
+ struct phy_device_node *pdn;
+ unsigned long phy_index;
+
+ xa_for_each(&dev->link_topo->phys, phy_index, pdn) {
+ if (!phy_has_tsinfo(pdn->phy))
+ continue;
+
+ ethtool_init_tsinfo(info);
+ err = phy_ts_info(pdn->phy, info);
+ if (err)
+ return ERR_PTR(err);
+
+ if (info->phc_index == hwprov_desc->index)
+ return pdn->phy;
+ }
+ return ERR_PTR(-ENODEV);
+ }
+
+ /* Look on the dev->phydev */
+ if (phy_has_tsinfo(dev->phydev)) {
+ ethtool_init_tsinfo(info);
+ err = phy_ts_info(dev->phydev, info);
+ if (err)
+ return ERR_PTR(err);
+
+ if (info->phc_index == hwprov_desc->index)
+ return dev->phydev;
+ }
+
+ return ERR_PTR(-ENODEV);
+}
+
+/* Fill @info for the timestamping provider described by @hwprov_desc.
+ *
+ * The netdev is tried first. If it answers -ENODEV or -EOPNOTSUPP, the
+ * PHYs are searched instead; a PHY lookup failure is returned immediately,
+ * before the software timestamping flags are added. phc_source (and
+ * phc_phyindex for a PHY) is only set when phc_index is non-negative.
+ *
+ * On every other path SOF_TIMESTAMPING_RX_SOFTWARE and
+ * SOF_TIMESTAMPING_SOFTWARE are OR-ed into info->so_timestamping, even
+ * when a netdev error other than the two above is returned.
+ */
+int ethtool_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc)
+{
+ int err;
+
+ err = ethtool_net_get_ts_info_by_phc(dev, info, hwprov_desc);
+ if (err == -ENODEV || err == -EOPNOTSUPP) {
+ struct phy_device *phy;
+
+ phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc);
+ if (IS_ERR(phy))
+ return PTR_ERR(phy);
+
+ /* Report the phc source only if we have a real
+ * phc source with an index.
+ */
+ if (info->phc_index >= 0) {
+ info->phc_source = HWTSTAMP_SOURCE_PHYLIB;
+ info->phc_phyindex = phy->phyindex;
+ }
+ err = 0;
+ } else if (!err && info->phc_index >= 0) {
+ info->phc_source = HWTSTAMP_SOURCE_NETDEV;
+ }
+
+ info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+
+ return err;
+}
+
+/* Fill @info with the timestamping capabilities of @dev.
+ *
+ * dev->hwprov is read under rcu_read_lock(). When a provider is set, the
+ * lookup is delegated to ethtool_get_ts_info_by_phc() with its
+ * descriptor. Otherwise the default behavior applies: the PHY is queried
+ * if it is the default hwtstamp source and has ts info, else the driver's
+ * get_ts_info callback is used when present; software Rx and system-clock
+ * timestamping flags are then always added.
+ *
+ * Returns 0 or the error from the callback that was used.
+ */
+int __ethtool_get_ts_info(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info)
+{
+ struct hwtstamp_provider *hwprov;
+ int err = 0;
+
+ rcu_read_lock();
+ hwprov = rcu_dereference(dev->hwprov);
+ /* No provider specified, use default behavior */
+ if (!hwprov) {
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct phy_device *phydev = dev->phydev;
+
+ ethtool_init_tsinfo(info);
+ if (phy_is_default_hwtstamp(phydev) &&
+ phy_has_tsinfo(phydev)) {
+ err = phy_ts_info(phydev, info);
+ /* Report the phc source only if we have a real
+ * phc source with an index.
+ */
+ if (!err && info->phc_index >= 0) {
+ info->phc_source = HWTSTAMP_SOURCE_PHYLIB;
+ info->phc_phyindex = phydev->phyindex;
+ }
+ } else if (ops->get_ts_info) {
+ err = ops->get_ts_info(dev, info);
+ if (!err && info->phc_index >= 0)
+ info->phc_source = HWTSTAMP_SOURCE_NETDEV;
+ }
+
+ info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+
+ rcu_read_unlock();
+ return err;
+ }
+
+ /* hwprov->desc is used while still inside the RCU read section */
+ err = ethtool_get_ts_info_by_phc(dev, info, &hwprov->desc);
+ rcu_read_unlock();
+ return err;
+}
+
+/* Tell whether @dev supports the hwtstamp provider @qualifier.
+ *
+ * Returns false when the device has no ethtool_ops. A driver that does not
+ * describe its qualifiers (supported_hwtstamp_qualifiers == 0) is treated
+ * as supporting only HWTSTAMP_PROVIDER_QUALIFIER_PRECISE; otherwise the
+ * matching bit of supported_hwtstamp_qualifiers decides.
+ */
+bool net_support_hwtstamp_qualifier(struct net_device *dev,
+				    enum hwtstamp_provider_qualifier qualifier)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+
+	if (!ops)
+		return false;
+
+	/* NICs without a qualifier description keep the old behavior:
+	 * only the precise qualifier is accepted.
+	 */
+	if (!ops->supported_hwtstamp_qualifiers)
+		return qualifier == HWTSTAMP_PROVIDER_QUALIFIER_PRECISE;
+
+	return !!(ops->supported_hwtstamp_qualifiers & BIT(qualifier));
+}
+
+/* Look up the virtual clocks of the PHC used by @dev.
+ *
+ * Returns 0 when the timestamping info of @dev cannot be obtained;
+ * otherwise returns whatever ptp_get_vclocks_index() returns for the
+ * device's phc_index, passing @vclock_index through to it.
+ */
+int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index)
+{
+	struct kernel_ethtool_ts_info info = { };
+
+	if (__ethtool_get_ts_info(dev, &info))
+		return 0;
+
+	return ptp_get_vclocks_index(info.phc_index, vclock_index);
+}
+EXPORT_SYMBOL(ethtool_get_phc_vclocks);
+
+/* Exported entry point for timestamping info; simply forwards to
+ * __ethtool_get_ts_info() and returns its result.
+ */
+int ethtool_get_ts_info_by_layer(struct net_device *dev, struct kernel_ethtool_ts_info *info)
+{
+ return __ethtool_get_ts_info(dev, info);
+}
+EXPORT_SYMBOL(ethtool_get_ts_info_by_layer);
+
+/* Currently registered PHY ops; written by ethtool_set_ethtool_phy_ops() */
+const struct ethtool_phy_ops *ethtool_phy_ops;
+
+/* Install @ops as the global ethtool_phy_ops pointer.
+ * ASSERT_RTNL() checks that the caller holds the RTNL lock.
+ */
+void ethtool_set_ethtool_phy_ops(const struct ethtool_phy_ops *ops)
+{
+ ASSERT_RTNL();
+ ethtool_phy_ops = ops;
+}
+EXPORT_SYMBOL_GPL(ethtool_set_ethtool_phy_ops);
+
+/* Copy speed, lanes and duplex of @link_mode from the link_mode_params
+ * table into @link_ksettings. Warns once and leaves @link_ksettings
+ * untouched when @link_mode is outside the table.
+ */
+void
+ethtool_params_from_link_mode(struct ethtool_link_ksettings *link_ksettings,
+			      enum ethtool_link_mode_bit_indices link_mode)
+{
+	const struct link_mode_info *params;
+
+	if (WARN_ON_ONCE(link_mode >= __ETHTOOL_LINK_MODE_MASK_NBITS))
+		return;
+
+	params = link_mode_params + link_mode;
+
+	link_ksettings->base.speed = params->speed;
+	link_ksettings->base.duplex = params->duplex;
+	link_ksettings->lanes = params->lanes;
+}
+EXPORT_SYMBOL_GPL(ethtool_params_from_link_mode);
+
+/**
+ * ethtool_forced_speed_maps_init
+ * @maps: Pointer to an array of Ethtool forced speed map
+ * @size: Number of entries in @maps
+ *
+ * For every entry, set the bits listed in its cap_arr (arr_size items) in
+ * its caps link mode bitmap, then clear cap_arr and arr_size. This should
+ * be called during driver module init.
+ */
+void
+ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size)
+{
+	struct ethtool_forced_speed_map *map = maps;
+	u32 remaining;
+
+	for (remaining = size; remaining; remaining--, map++) {
+		linkmode_set_bit_array(map->cap_arr, map->arr_size, map->caps);
+
+		/* The source array is not referenced again after this */
+		map->cap_arr = NULL;
+		map->arr_size = 0;
+	}
+}
+EXPORT_SYMBOL_GPL(ethtool_forced_speed_maps_init);
+
+/* Drop the core's record of RSS context @context_id after a device error.
+ *
+ * Warns once unless the RTNL lock is held or lockdep sees rss_lock held.
+ * Logs an error, removes the context from dev->ethtool->rss_ctx, frees it
+ * and emits an ETHTOOL_MSG_RSS_DELETE_NTF notification.
+ */
+void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id)
+{
+ struct ethtool_rxfh_context *ctx;
+
+ WARN_ONCE(!rtnl_is_locked() &&
+ !lockdep_is_held_type(&dev->ethtool->rss_lock, -1),
+ "RSS context lock assertion failed\n");
+
+ netdev_err(dev, "device error, RSS context %d lost\n", context_id);
+ /* xa_erase() hands back the stored pointer (NULL if none) for kfree() */
+ ctx = xa_erase(&dev->ethtool->rss_ctx, context_id);
+ kfree(ctx);
+ ethtool_rss_notify(dev, ETHTOOL_MSG_RSS_DELETE_NTF, context_id);
+}
+EXPORT_SYMBOL(ethtool_rxfh_context_lost);
diff --git a/net/ethtool/common.h b/net/ethtool/common.h
new file mode 100644
index 000000000000..1609cf4e53eb
--- /dev/null
+++ b/net/ethtool/common.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _ETHTOOL_COMMON_H
+#define _ETHTOOL_COMMON_H
+
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+
+/* Number of 32-bit words needed to hold NETDEV_FEATURE_COUNT bits */
+#define ETHTOOL_DEV_FEATURE_WORDS DIV_ROUND_UP(NETDEV_FEATURE_COUNT, 32)
+
+/* compose link mode index from speed, type and duplex, e.g.
+ * ETHTOOL_LINK_MODE(1600000, DR8, Full) expands to
+ * ETHTOOL_LINK_MODE_1600000baseDR8_Full_BIT
+ */
+#define ETHTOOL_LINK_MODE(speed, type, duplex) \
+ ETHTOOL_LINK_MODE_ ## speed ## base ## type ## _ ## duplex ## _BIT
+
+/* Table sizes derived from the highest defined flag: bit position of the
+ * *_LAST flag plus one.
+ */
+#define __SOF_TIMESTAMPING_CNT (const_ilog2(SOF_TIMESTAMPING_LAST) + 1)
+#define __HWTSTAMP_FLAG_CNT (const_ilog2(HWTSTAMP_FLAG_LAST) + 1)
+
+struct genl_info;
+struct hwtstamp_provider_desc;
+
+/* String tables shared by the ethtool code. Every entry is a fixed-width
+ * ETH_GSTRING_LEN string. The first four tables declare their length here;
+ * the others are declared with an incomplete outer dimension.
+ */
+extern const char
+netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN];
+extern const char
+rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN];
+extern const char
+tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN];
+extern const char
+phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN];
+extern const char link_mode_names[][ETH_GSTRING_LEN];
+extern const char netif_msg_class_names[][ETH_GSTRING_LEN];
+extern const char wol_mode_names[][ETH_GSTRING_LEN];
+extern const char sof_timestamping_names[][ETH_GSTRING_LEN];
+extern const char ts_tx_type_names[][ETH_GSTRING_LEN];
+extern const char ts_rx_filter_names[][ETH_GSTRING_LEN];
+extern const char ts_flags_names[][ETH_GSTRING_LEN];
+extern const char udp_tunnel_type_names[][ETH_GSTRING_LEN];
+
+/* Link state: -EOPNOTSUPP without a get_link callback, else 0 or 1 */
+int __ethtool_get_link(struct net_device *dev);
+
+/* Legacy ethtool_cmd to link_ksettings conversion; returns false when the
+ * deprecated maxtxpkt/maxrxpkt fields are non-zero.
+ */
+bool convert_legacy_settings_to_link_ksettings(
+ struct ethtool_link_ksettings *link_ksettings,
+ const struct ethtool_cmd *legacy_settings);
+/* RSS / rxnfc helpers */
+int ethtool_check_max_channel(struct net_device *dev,
+ struct ethtool_channels channels,
+ struct genl_info *info);
+struct ethtool_rxfh_context *
+ethtool_rxfh_ctx_alloc(const struct ethtool_ops *ops,
+ u32 indir_size, u32 key_size);
+int ethtool_check_rss_ctx_busy(struct net_device *dev, u32 rss_context);
+int ethtool_rxfh_config_is_sym(u64 rxfh);
+
+/* Ring parameters, with HDS fields taken from dev->cfg */
+void ethtool_ringparam_get_cfg(struct net_device *dev,
+ struct ethtool_ringparam *param,
+ struct kernel_ethtool_ringparam *kparam,
+ struct netlink_ext_ack *extack);
+
+int ethtool_get_rx_ring_count(struct net_device *dev);
+
+/* Timestamping info lookup, by default source or by provider descriptor */
+int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info);
+int ethtool_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc);
+int ethtool_net_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc);
+struct phy_device *
+ethtool_phy_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc);
+bool net_support_hwtstamp_qualifier(struct net_device *dev,
+ enum hwtstamp_provider_qualifier qualifier);
+
+/* Globally registered PHY and PSE ops pointers */
+extern const struct ethtool_phy_ops *ethtool_phy_ops;
+extern const struct ethtool_pse_ops *ethtool_pse_ops;
+
+/* Module info/EEPROM accessors; defined outside this header */
+int ethtool_get_module_info_call(struct net_device *dev,
+ struct ethtool_modinfo *modinfo);
+int ethtool_get_module_eeprom_call(struct net_device *dev,
+ struct ethtool_eeprom *ee, u8 *data);
+
+bool __ethtool_dev_mm_supported(struct net_device *dev);
+
+/* RSS notifications need the ethtool netlink interface; without
+ * CONFIG_ETHTOOL_NETLINK the call compiles to an empty inline stub.
+ */
+#if IS_ENABLED(CONFIG_ETHTOOL_NETLINK)
+void ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context);
+#else
+static inline void
+ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context)
+{
+}
+#endif
+
+#endif /* _ETHTOOL_COMMON_H */
diff --git a/net/ethtool/debug.c b/net/ethtool/debug.c
new file mode 100644
index 000000000000..0b2dea56d461
--- /dev/null
+++ b/net/ethtool/debug.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+/* DEBUG_GET request state: nothing beyond the common request info */
+struct debug_req_info {
+ struct ethnl_req_info base;
+};
+
+/* DEBUG_GET reply state: common reply data plus the message mask read from
+ * the driver.
+ */
+struct debug_reply_data {
+ struct ethnl_reply_data base;
+ u32 msg_mask;
+};
+
+/* Recover the debug_reply_data that embeds @__reply_base */
+#define DEBUG_REPDATA(__reply_base) \
+ container_of(__reply_base, struct debug_reply_data, base)
+
+/* Attribute policy for DEBUG_GET: only the nested request header */
+const struct nla_policy ethnl_debug_get_policy[] = {
+ [ETHTOOL_A_DEBUG_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+/* Read the driver's message level into the DEBUG_GET reply data.
+ *
+ * Returns -EOPNOTSUPP when the driver has no get_msglevel callback, the
+ * negative error from ethnl_ops_begin() if that fails, 0 otherwise.
+ */
+static int debug_prepare_data(const struct ethnl_req_info *req_base,
+			      struct ethnl_reply_data *reply_base,
+			      const struct genl_info *info)
+{
+	struct net_device *dev = reply_base->dev;
+	struct debug_reply_data *reply;
+	int err;
+
+	if (!dev->ethtool_ops->get_msglevel)
+		return -EOPNOTSUPP;
+
+	err = ethnl_ops_begin(dev);
+	if (err < 0)
+		return err;
+
+	/* The callback runs between ethnl_ops_begin() and _complete() */
+	reply = DEBUG_REPDATA(reply_base);
+	reply->msg_mask = dev->ethtool_ops->get_msglevel(dev);
+	ethnl_ops_complete(dev);
+
+	return 0;
+}
+
+/* Size of the DEBUG_GET reply payload: the bitset carrying msg_mask over
+ * NETIF_MSG_CLASS_COUNT bits, in compact form when the request asked for
+ * ETHTOOL_FLAG_COMPACT_BITSETS. Returns whatever ethnl_bitset32_size()
+ * returns.
+ */
+static int debug_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct debug_reply_data *data = DEBUG_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+
+ return ethnl_bitset32_size(&data->msg_mask, NULL, NETIF_MSG_CLASS_COUNT,
+ netif_msg_class_names, compact);
+}
+
+/* Write the DEBUG_GET reply: msg_mask as an ETHTOOL_A_DEBUG_MSGMASK bitset,
+ * using the same bit count, names and compact setting as
+ * debug_reply_size(). Returns whatever ethnl_put_bitset32() returns.
+ */
+static int debug_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct debug_reply_data *data = DEBUG_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+
+ return ethnl_put_bitset32(skb, ETHTOOL_A_DEBUG_MSGMASK, &data->msg_mask,
+ NULL, NETIF_MSG_CLASS_COUNT,
+ netif_msg_class_names, compact);
+}
+
+/* DEBUG_SET */
+
+/* Attribute policy for DEBUG_SET: the nested request header and the nested
+ * message mask bitset.
+ */
+const struct nla_policy ethnl_debug_set_policy[] = {
+ [ETHTOOL_A_DEBUG_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_DEBUG_MSGMASK] = { .type = NLA_NESTED },
+};
+
+/* DEBUG_SET needs both get_msglevel and set_msglevel from the driver.
+ * Returns 1 when both exist, -EOPNOTSUPP otherwise.
+ */
+static int
+ethnl_set_debug_validate(struct ethnl_req_info *req_info,
+			 struct genl_info *info)
+{
+	const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+
+	if (!ops->get_msglevel || !ops->set_msglevel)
+		return -EOPNOTSUPP;
+
+	return 1;
+}
+
+/* Apply a DEBUG_SET request.
+ *
+ * Reads the current message level, applies the ETHTOOL_A_DEBUG_MSGMASK
+ * bitset from the request on top of it, and calls set_msglevel only if the
+ * mask changed.
+ *
+ * Returns the result of ethnl_update_bitset32() when it fails or when
+ * nothing was modified, and 1 after set_msglevel has been called.
+ */
+static int
+ethnl_set_debug(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ struct net_device *dev = req_info->dev;
+ struct nlattr **tb = info->attrs;
+ bool mod = false;
+ u32 msg_mask;
+ int ret;
+
+ msg_mask = dev->ethtool_ops->get_msglevel(dev);
+ ret = ethnl_update_bitset32(&msg_mask, NETIF_MSG_CLASS_COUNT,
+ tb[ETHTOOL_A_DEBUG_MSGMASK],
+ netif_msg_class_names, info->extack, &mod);
+ if (ret < 0 || !mod)
+ return ret;
+
+ dev->ethtool_ops->set_msglevel(dev, msg_mask);
+ return 1;
+}
+
+/* Request ops wiring the DEBUG message commands to the handlers defined in
+ * this file: the GET path (prepare_data/reply_size/fill_reply) and the SET
+ * path (set_validate/set), plus the command numbers and state sizes.
+ */
+const struct ethnl_request_ops ethnl_debug_request_ops = {
+ .request_cmd = ETHTOOL_MSG_DEBUG_GET,
+ .reply_cmd = ETHTOOL_MSG_DEBUG_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_DEBUG_HEADER,
+ .req_info_size = sizeof(struct debug_req_info),
+ .reply_data_size = sizeof(struct debug_reply_data),
+
+ .prepare_data = debug_prepare_data,
+ .reply_size = debug_reply_size,
+ .fill_reply = debug_fill_reply,
+
+ .set_validate = ethnl_set_debug_validate,
+ .set = ethnl_set_debug,
+ .set_ntf_cmd = ETHTOOL_MSG_DEBUG_NTF,
+};
diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c
new file mode 100644
index 000000000000..bf398973eb8a
--- /dev/null
+++ b/net/ethtool/eee.c
@@ -0,0 +1,172 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+/* EEE_GET request state: nothing beyond the common request info */
+struct eee_req_info {
+ struct ethnl_req_info base;
+};
+
+/* EEE_GET reply state: common reply data plus the EEE settings read from
+ * the driver.
+ */
+struct eee_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_keee eee;
+};
+
+/* Recover the eee_reply_data that embeds @__reply_base */
+#define EEE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct eee_reply_data, base)
+
+/* Attribute policy for EEE_GET: only the nested request header */
+const struct nla_policy ethnl_eee_get_policy[] = {
+ [ETHTOOL_A_EEE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+/* Read the driver's EEE settings into the EEE_GET reply data.
+ *
+ * Returns -EOPNOTSUPP when the driver has no get_eee callback, the
+ * negative error from ethnl_ops_begin() if that fails, otherwise the
+ * return value of get_eee.
+ */
+static int eee_prepare_data(const struct ethnl_req_info *req_base,
+			    struct ethnl_reply_data *reply_base,
+			    const struct genl_info *info)
+{
+	struct net_device *dev = reply_base->dev;
+	struct eee_reply_data *reply = EEE_REPDATA(reply_base);
+	int rc;
+
+	if (!dev->ethtool_ops->get_eee)
+		return -EOPNOTSUPP;
+
+	rc = ethnl_ops_begin(dev);
+	if (rc < 0)
+		return rc;
+
+	/* The callback runs between ethnl_ops_begin() and _complete() */
+	rc = dev->ethtool_ops->get_eee(dev, &reply->eee);
+	ethnl_ops_complete(dev);
+
+	return rc;
+}
+
+/* Size of the EEE_GET reply payload: the two link mode bitsets plus the
+ * four scalar attributes. Returns the negative error of
+ * ethnl_bitset_size() if either bitset cannot be sized.
+ */
+static int eee_reply_size(const struct ethnl_req_info *req_base,
+			  const struct ethnl_reply_data *reply_base)
+{
+	const struct eee_reply_data *reply = EEE_REPDATA(reply_base);
+	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+	int ours_len, peer_len;
+
+	/* MODES_OURS */
+	ours_len = ethnl_bitset_size(reply->eee.advertised,
+				     reply->eee.supported,
+				     __ETHTOOL_LINK_MODE_MASK_NBITS,
+				     link_mode_names, compact);
+	if (ours_len < 0)
+		return ours_len;
+
+	/* MODES_PEERS */
+	peer_len = ethnl_bitset_size(reply->eee.lp_advertised, NULL,
+				     __ETHTOOL_LINK_MODE_MASK_NBITS,
+				     link_mode_names, compact);
+	if (peer_len < 0)
+		return peer_len;
+
+	return ours_len + peer_len +
+	       nla_total_size(sizeof(u8)) +	/* _EEE_ACTIVE */
+	       nla_total_size(sizeof(u8)) +	/* _EEE_ENABLED */
+	       nla_total_size(sizeof(u8)) +	/* _EEE_TX_LPI_ENABLED */
+	       nla_total_size(sizeof(u32));	/* _EEE_TX_LPI_TIMER */
+}
+
+/* Write the EEE_GET reply attributes into @skb.
+ *
+ * Emits MODES_OURS (advertised modes, with the supported modes as mask),
+ * MODES_PEER (link partner advertised modes, no mask), then the active,
+ * enabled, tx_lpi_enabled and tx_lpi_timer values.
+ *
+ * Returns 0 on success, the negative error of ethnl_put_bitset(), or
+ * -EMSGSIZE when one of the scalar attributes does not fit.
+ */
+static int eee_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct eee_reply_data *data = EEE_REPDATA(reply_base);
+ const struct ethtool_keee *eee = &data->eee;
+ int ret;
+
+ ret = ethnl_put_bitset(skb, ETHTOOL_A_EEE_MODES_OURS,
+ eee->advertised, eee->supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ ret = ethnl_put_bitset(skb, ETHTOOL_A_EEE_MODES_PEER,
+ eee->lp_advertised, NULL,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_mode_names, compact);
+ if (ret < 0)
+ return ret;
+
+ if (nla_put_u8(skb, ETHTOOL_A_EEE_ACTIVE, eee->eee_active) ||
+ nla_put_u8(skb, ETHTOOL_A_EEE_ENABLED, eee->eee_enabled) ||
+ nla_put_u8(skb, ETHTOOL_A_EEE_TX_LPI_ENABLED,
+ eee->tx_lpi_enabled) ||
+ nla_put_u32(skb, ETHTOOL_A_EEE_TX_LPI_TIMER, eee->tx_lpi_timer))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+/* EEE_SET */
+
+/* Attribute policy for EEE_SET: the nested request header, the nested
+ * MODES_OURS bitset, two u8 switches and the u32 Tx LPI timer.
+ */
+const struct nla_policy ethnl_eee_set_policy[] = {
+ [ETHTOOL_A_EEE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_EEE_MODES_OURS] = { .type = NLA_NESTED },
+ [ETHTOOL_A_EEE_ENABLED] = { .type = NLA_U8 },
+ [ETHTOOL_A_EEE_TX_LPI_ENABLED] = { .type = NLA_U8 },
+ [ETHTOOL_A_EEE_TX_LPI_TIMER] = { .type = NLA_U32 },
+};
+
+/* EEE_SET needs both get_eee and set_eee from the driver.
+ * Returns 1 when both exist, -EOPNOTSUPP otherwise.
+ */
+static int
+ethnl_set_eee_validate(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+	const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+
+	if (!ops->get_eee || !ops->set_eee)
+		return -EOPNOTSUPP;
+
+	return 1;
+}
+
+/* Apply an EEE_SET request.
+ *
+ * Reads the current settings with get_eee, applies the attributes present
+ * in the request on top of them, and calls set_eee only if something
+ * changed.
+ *
+ * Returns a negative error from get_eee, ethnl_update_bitset() or set_eee;
+ * 0 when the request modified nothing; 1 after a successful set_eee.
+ */
+static int
+ethnl_set_eee(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ struct net_device *dev = req_info->dev;
+ struct nlattr **tb = info->attrs;
+ struct ethtool_keee eee = {};
+ bool mod = false;
+ int ret;
+
+ ret = dev->ethtool_ops->get_eee(dev, &eee);
+ if (ret < 0)
+ return ret;
+
+ ret = ethnl_update_bitset(eee.advertised,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ tb[ETHTOOL_A_EEE_MODES_OURS],
+ link_mode_names, info->extack, &mod);
+ if (ret < 0)
+ return ret;
+ /* mod is shared by all update helpers below and above */
+ ethnl_update_bool(&eee.eee_enabled, tb[ETHTOOL_A_EEE_ENABLED], &mod);
+ ethnl_update_bool(&eee.tx_lpi_enabled, tb[ETHTOOL_A_EEE_TX_LPI_ENABLED],
+ &mod);
+ ethnl_update_u32(&eee.tx_lpi_timer, tb[ETHTOOL_A_EEE_TX_LPI_TIMER],
+ &mod);
+ if (!mod)
+ return 0;
+
+ ret = dev->ethtool_ops->set_eee(dev, &eee);
+ return ret < 0 ? ret : 1;
+}
+
+/* Request ops wiring the EEE message commands to the handlers defined in
+ * this file: the GET path (prepare_data/reply_size/fill_reply) and the SET
+ * path (set_validate/set), plus the command numbers and state sizes.
+ */
+const struct ethnl_request_ops ethnl_eee_request_ops = {
+ .request_cmd = ETHTOOL_MSG_EEE_GET,
+ .reply_cmd = ETHTOOL_MSG_EEE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_EEE_HEADER,
+ .req_info_size = sizeof(struct eee_req_info),
+ .reply_data_size = sizeof(struct eee_reply_data),
+
+ .prepare_data = eee_prepare_data,
+ .reply_size = eee_reply_size,
+ .fill_reply = eee_fill_reply,
+
+ .set_validate = ethnl_set_eee_validate,
+ .set = ethnl_set_eee,
+ .set_ntf_cmd = ETHTOOL_MSG_EEE_NTF,
+};
diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c
new file mode 100644
index 000000000000..3b8209e930fd
--- /dev/null
+++ b/net/ethtool/eeprom.c
@@ -0,0 +1,246 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool.h>
+#include <linux/sfp.h>
+#include "netlink.h"
+#include "common.h"
+
+struct eeprom_req_info { /* request fields filled by eeprom_parse_request() */
+ struct ethnl_req_info base;
+ u32 offset; /* from ETHTOOL_A_MODULE_EEPROM_OFFSET */
+ u32 length; /* from ETHTOOL_A_MODULE_EEPROM_LENGTH */
+ u8 page; /* from ETHTOOL_A_MODULE_EEPROM_PAGE */
+ u8 bank; /* from ETHTOOL_A_MODULE_EEPROM_BANK, set only if the attribute is present */
+ u8 i2c_address; /* from ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS */
+};
+
+struct eeprom_reply_data {
+ struct ethnl_reply_data base;
+ u32 length; /* number of bytes of @data put into the reply */
+ u8 *data; /* kmalloc()ed buffer, released by eeprom_cleanup_data() */
+};
+
+#define MODULE_EEPROM_REQINFO(__req_base) \
+ container_of(__req_base, struct eeprom_req_info, base) /* base -> enclosing request */
+
+#define MODULE_EEPROM_REPDATA(__reply_base) \
+ container_of(__reply_base, struct eeprom_reply_data, base) /* base -> enclosing reply */
+
+/* Translate a paged request into the flat offset passed along with
+ * ETHTOOL_GMODULEEEPROM; -EINVAL if that offset is not below eeprom_len.
+ */
+static int fallback_set_params(struct eeprom_req_info *request,
+			       struct ethtool_modinfo *modinfo,
+			       struct ethtool_eeprom *eeprom)
+{
+	u32 flat = request->offset;
+
+	/* page 0 contributes nothing, so it needs no special case */
+	flat += request->page * ETH_MODULE_EEPROM_PAGE_LEN;
+	/* SFF-8472 at I2C address 0x51 is placed two page lengths further */
+	if (request->i2c_address == 0x51 && modinfo->type == ETH_MODULE_SFF_8472)
+		flat += ETH_MODULE_EEPROM_PAGE_LEN * 2;
+	if (flat >= modinfo->eeprom_len)
+		return -EINVAL;
+
+	eeprom->offset = flat;
+	eeprom->len = request->length;
+	eeprom->cmd = ETHTOOL_GMODULEEEPROM;
+
+	return 0;
+}
+
+/* Serve the request through the module-info / module-eeprom calls instead.
+ * On success the reply takes over a freshly allocated data buffer.
+ */
+static int eeprom_fallback(struct eeprom_req_info *request,
+			   struct eeprom_reply_data *reply)
+{
+	struct ethtool_modinfo modinfo = { .cmd = ETHTOOL_GMODULEINFO };
+	struct net_device *dev = reply->base.dev;
+	struct ethtool_eeprom eeprom = {0};
+	u8 *buf;
+	int rc;
+
+	rc = ethtool_get_module_info_call(dev, &modinfo);
+	if (rc < 0)
+		return rc;
+
+	rc = fallback_set_params(request, &modinfo, &eeprom);
+	if (rc < 0)
+		return rc;
+
+	buf = kmalloc(eeprom.len, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	rc = ethtool_get_module_eeprom_call(dev, &eeprom, buf);
+	if (rc < 0) {
+		kfree(buf);
+		return rc;
+	}
+
+	reply->length = eeprom.len;
+	reply->data = buf;
+	return 0;
+}
+
+/* Read one EEPROM page: the SFP bus is tried before the driver callback.
+ * Refuses with -EBUSY while module firmware flashing is in progress.
+ */
+static int get_module_eeprom_by_page(struct net_device *dev,
+				     struct ethtool_module_eeprom *page_data,
+				     struct netlink_ext_ack *extack)
+{
+	const struct ethtool_ops *eops = dev->ethtool_ops;
+
+	if (dev->ethtool->module_fw_flash_in_progress) {
+		NL_SET_ERR_MSG(extack, "Module firmware flashing is in progress");
+		return -EBUSY;
+	}
+
+	if (dev->sfp_bus)
+		return sfp_get_module_eeprom_by_page(dev->sfp_bus, page_data, extack);
+	if (!eops->get_module_eeprom_by_page)
+		return -EOPNOTSUPP;
+	return eops->get_module_eeprom_by_page(dev, page_data, extack);
+}
+
+/* Fetch the requested EEPROM window via the paged interface; when that
+ * reports -EOPNOTSUPP, retry through eeprom_fallback().
+ */
+static int eeprom_prepare_data(const struct ethnl_req_info *req_base,
+			       struct ethnl_reply_data *reply_base,
+			       const struct genl_info *info)
+{
+	struct eeprom_reply_data *reply = MODULE_EEPROM_REPDATA(reply_base);
+	struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_base);
+	struct ethtool_module_eeprom pdata = {
+		.offset = request->offset,
+		.length = request->length,
+		.i2c_address = request->i2c_address,
+		.page = request->page,
+		.bank = request->bank,
+	};
+	struct net_device *dev = reply_base->dev;
+	int rc;
+
+	pdata.data = kmalloc(pdata.length, GFP_KERNEL);
+	if (!pdata.data)
+		return -ENOMEM;
+
+	rc = ethnl_ops_begin(dev);
+	if (rc)
+		goto free_buf;
+
+	rc = get_module_eeprom_by_page(dev, &pdata, info->extack);
+	ethnl_ops_complete(dev);
+	if (rc < 0)
+		goto free_buf;
+
+	reply->length = rc;
+	reply->data = pdata.data;
+	return 0;
+
+free_buf:
+	kfree(pdata.data);
+	if (rc == -EOPNOTSUPP)
+		return eeprom_fallback(request, reply);
+	return rc;
+}
+
+/* Copy the request attributes into the request info and reject windows
+ * that break the half-page boundary rules described below.
+ */
+static int eeprom_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb,
+				struct netlink_ext_ack *extack)
+{
+	struct eeprom_req_info *request = MODULE_EEPROM_REQINFO(req_info);
+	u32 end;
+
+	if (!tb[ETHTOOL_A_MODULE_EEPROM_OFFSET] ||
+	    !tb[ETHTOOL_A_MODULE_EEPROM_LENGTH] ||
+	    !tb[ETHTOOL_A_MODULE_EEPROM_PAGE] ||
+	    !tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS])
+		return -EINVAL;
+
+	request->offset = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_OFFSET]);
+	request->length = nla_get_u32(tb[ETHTOOL_A_MODULE_EEPROM_LENGTH]);
+	request->page = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_PAGE]);
+	request->i2c_address = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS]);
+	end = request->offset + request->length;
+
+	/* A dump covers at most half a page (128 bytes) and must stay within
+	 * either the low or the high half; the low half exists on page 0 only.
+	 */
+	if (request->page && request->offset < ETH_MODULE_EEPROM_PAGE_LEN) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_PAGE],
+				    "reading from lower half page is allowed for page 0 only");
+		return -EINVAL;
+	}
+	if (request->offset < ETH_MODULE_EEPROM_PAGE_LEN && end > ETH_MODULE_EEPROM_PAGE_LEN) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH],
+				    "reading cross half page boundary is illegal");
+		return -EINVAL;
+	}
+	if (end > ETH_MODULE_EEPROM_PAGE_LEN * 2) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_MODULE_EEPROM_LENGTH],
+				    "reading cross page boundary is illegal");
+		return -EINVAL;
+	}
+
+	if (tb[ETHTOOL_A_MODULE_EEPROM_BANK])
+		request->bank = nla_get_u8(tb[ETHTOOL_A_MODULE_EEPROM_BANK]);
+	return 0;
+}
+
+/* The reply holds a single _EEPROM_DATA attribute of the requested length. */
+static int eeprom_reply_size(const struct ethnl_req_info *req_base,
+			     const struct ethnl_reply_data *reply_base)
+{
+	/* sized from the request length, one u8 per byte */
+	return nla_total_size(MODULE_EEPROM_REQINFO(req_base)->length * sizeof(u8));
+}
+
+/* Put reply->length bytes of reply->data as ETHTOOL_A_MODULE_EEPROM_DATA. */
+static int eeprom_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base,
+			     const struct ethnl_reply_data *reply_base)
+{
+	const struct eeprom_reply_data *rd = MODULE_EEPROM_REPDATA(reply_base);
+
+	return nla_put(skb, ETHTOOL_A_MODULE_EEPROM_DATA, rd->length, rd->data);
+}
+
+/* Release the data buffer attached to the reply. */
+static void eeprom_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+	/* buffer was kmalloc()ed by eeprom_prepare_data() or eeprom_fallback() */
+	kfree(MODULE_EEPROM_REPDATA(reply_base)->data);
+}
+
+const struct ethnl_request_ops ethnl_module_eeprom_request_ops = { /* MODULE_EEPROM_GET handlers */
+ .request_cmd = ETHTOOL_MSG_MODULE_EEPROM_GET,
+ .reply_cmd = ETHTOOL_MSG_MODULE_EEPROM_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_MODULE_EEPROM_HEADER,
+ .req_info_size = sizeof(struct eeprom_req_info),
+ .reply_data_size = sizeof(struct eeprom_reply_data),
+
+ .parse_request = eeprom_parse_request, /* validates offset/length/page */
+ .prepare_data = eeprom_prepare_data, /* paged read, legacy fallback on -EOPNOTSUPP */
+ .reply_size = eeprom_reply_size,
+ .fill_reply = eeprom_fill_reply,
+ .cleanup_data = eeprom_cleanup_data, /* frees the reply data buffer */
+};
+
+const struct nla_policy ethnl_module_eeprom_get_policy[] = {
+ [ETHTOOL_A_MODULE_EEPROM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_MODULE_EEPROM_OFFSET] =
+ NLA_POLICY_MAX(NLA_U32, ETH_MODULE_EEPROM_PAGE_LEN * 2 - 1), /* below 2 * PAGE_LEN */
+ [ETHTOOL_A_MODULE_EEPROM_LENGTH] =
+ NLA_POLICY_RANGE(NLA_U32, 1, ETH_MODULE_EEPROM_PAGE_LEN), /* 1 .. PAGE_LEN bytes */
+ [ETHTOOL_A_MODULE_EEPROM_PAGE] = { .type = NLA_U8 },
+ [ETHTOOL_A_MODULE_EEPROM_BANK] = { .type = NLA_U8 }, /* not required by eeprom_parse_request() */
+ [ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS] =
+ NLA_POLICY_RANGE(NLA_U8, 0, ETH_MODULE_MAX_I2C_ADDRESS),
+};
+
diff --git a/net/ethtool/features.c b/net/ethtool/features.c
new file mode 100644
index 000000000000..f2217983be2b
--- /dev/null
+++ b/net/ethtool/features.c
@@ -0,0 +1,297 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <net/netdev_lock.h>
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct features_req_info { /* no request fields beyond the common base */
+ struct ethnl_req_info base;
+};
+
+struct features_reply_data { /* filled by features_prepare_data() */
+ struct ethnl_reply_data base;
+ u32 hw[ETHTOOL_DEV_FEATURE_WORDS]; /* dev->hw_features */
+ u32 wanted[ETHTOOL_DEV_FEATURE_WORDS]; /* dev->wanted_features */
+ u32 active[ETHTOOL_DEV_FEATURE_WORDS]; /* dev->features */
+ u32 nochange[ETHTOOL_DEV_FEATURE_WORDS]; /* NETIF_F_NEVER_CHANGE */
+ u32 all[ETHTOOL_DEV_FEATURE_WORDS]; /* bits 0 .. NETDEV_FEATURE_COUNT - 1; mask for @hw */
+};
+
+#define FEATURES_REPDATA(__reply_base) \
+ container_of(__reply_base, struct features_reply_data, base) /* base -> enclosing reply */
+
+const struct nla_policy ethnl_features_get_policy[] = { /* only the request header is listed */
+ [ETHTOOL_A_FEATURES_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static void ethnl_features_to_bitmap32(u32 *dest, netdev_features_t src)
+{
+	unsigned int w = ETHTOOL_DEV_FEATURE_WORDS;
+
+	while (w--)	/* word w receives @src shifted down by 32 * w bits */
+		dest[w] = src >> (32 * w);
+}
+
+/* Snapshot the device feature sets into 32-bit word arrays for the reply. */
+static int features_prepare_data(const struct ethnl_req_info *req_base,
+				 struct ethnl_reply_data *reply_base,
+				 const struct genl_info *info)
+{
+	struct features_reply_data *rd = FEATURES_REPDATA(reply_base);
+	const struct net_device *netdev = reply_base->dev;
+
+	ethnl_features_to_bitmap32(rd->hw, netdev->hw_features);
+	ethnl_features_to_bitmap32(rd->wanted, netdev->wanted_features);
+	ethnl_features_to_bitmap32(rd->active, netdev->features);
+	ethnl_features_to_bitmap32(rd->nochange, NETIF_F_NEVER_CHANGE);
+	/* one bit set for each of the NETDEV_FEATURE_COUNT features */
+	ethnl_features_to_bitmap32(rd->all, GENMASK_ULL(NETDEV_FEATURE_COUNT - 1, 0));
+
+	return 0;
+}
+
+/* Sum the sizes of the four bitsets sent in the reply: hw (with the
+ * all-features mask), wanted, active and nochange (each without a mask).
+ */
+static int features_reply_size(const struct ethnl_req_info *req_base,
+			       const struct ethnl_reply_data *reply_base)
+{
+	const struct features_reply_data *rd = FEATURES_REPDATA(reply_base);
+	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+	const struct {
+		const u32 *bits;
+		const u32 *mask;
+	} sets[] = {
+		{ rd->hw,	rd->all },
+		{ rd->wanted,	NULL },
+		{ rd->active,	NULL },
+		{ rd->nochange,	NULL },
+	};
+	unsigned int i, total = 0;
+	int sz;
+
+	for (i = 0; i < ARRAY_SIZE(sets); i++) {
+		sz = ethnl_bitset32_size(sets[i].bits, sets[i].mask,
+					 NETDEV_FEATURE_COUNT,
+					 netdev_features_strings, compact);
+		if (sz < 0)
+			return sz;
+		total += sz;
+	}
+
+	return total;
+}
+
+/* Put the hw (masked by all features), wanted, active and nochange bitsets. */
+static int features_fill_reply(struct sk_buff *skb,
+			       const struct ethnl_req_info *req_base,
+			       const struct ethnl_reply_data *reply_base)
+{
+	const struct features_reply_data *rd = FEATURES_REPDATA(reply_base);
+	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+	const struct {
+		int attr;
+		const u32 *bits;
+		const u32 *mask;
+	} sets[] = {
+		{ ETHTOOL_A_FEATURES_HW,	rd->hw,		rd->all },
+		{ ETHTOOL_A_FEATURES_WANTED,	rd->wanted,	NULL },
+		{ ETHTOOL_A_FEATURES_ACTIVE,	rd->active,	NULL },
+		{ ETHTOOL_A_FEATURES_NOCHANGE,	rd->nochange,	NULL },
+	};
+	unsigned int i;
+	int ret = 0;
+
+	for (i = 0; i < ARRAY_SIZE(sets) && ret >= 0; i++)
+		ret = ethnl_put_bitset32(skb, sets[i].attr, sets[i].bits,
+					 sets[i].mask, NETDEV_FEATURE_COUNT,
+					 netdev_features_strings, compact);
+
+	return ret;
+}
+
+const struct ethnl_request_ops ethnl_features_request_ops = { /* FEATURES_GET handlers */
+ .request_cmd = ETHTOOL_MSG_FEATURES_GET,
+ .reply_cmd = ETHTOOL_MSG_FEATURES_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_FEATURES_HEADER,
+ .req_info_size = sizeof(struct features_req_info),
+ .reply_data_size = sizeof(struct features_reply_data),
+
+ .prepare_data = features_prepare_data, /* copies the dev feature sets */
+ .reply_size = features_reply_size,
+ .fill_reply = features_fill_reply, /* hw, wanted, active, nochange bitsets */
+};
+
+/* FEATURES_SET */
+
+const struct nla_policy ethnl_features_set_policy[] = {
+ [ETHTOOL_A_FEATURES_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_FEATURES_WANTED] = { .type = NLA_NESTED }, /* bitset; ethnl_set_features() requires it */
+};
+
+/* Store @val into an unsigned long bitmap covering NETDEV_FEATURE_COUNT bits. */
+static void ethnl_features_to_bitmap(unsigned long *dest, netdev_features_t val)
+{
+	unsigned int idx = BITS_TO_LONGS(NETDEV_FEATURE_COUNT);
+
+	while (idx--)	/* each word is computed independently of the others */
+		dest[idx] = (unsigned long)(val >> (idx * BITS_PER_LONG));
+}
+
+/* Rebuild a feature mask from a bitmap; bits >= NETDEV_FEATURE_COUNT are dropped. */
+static netdev_features_t ethnl_bitmap_to_features(unsigned long *src)
+{
+	const unsigned int total_bits = sizeof(netdev_features_t) * BITS_PER_BYTE;
+	netdev_features_t acc = 0;
+	unsigned int w;
+
+	for (w = 0; w < BITS_TO_LONGS(NETDEV_FEATURE_COUNT); w++)
+		acc |= (netdev_features_t)src[w] << (w * BITS_PER_LONG);
+
+	return acc & (~(netdev_features_t)0 >> (total_bits - NETDEV_FEATURE_COUNT));
+}
+
+/* Build and send the FEATURES_SET reply holding the wanted and active
+ * bitsets, each with its mask. Returns the genlmsg_reply() result or a
+ * negative errno; every failure path also sets an extack message.
+ */
+static int features_send_reply(struct net_device *dev, struct genl_info *info,
+			       const unsigned long *wanted,
+			       const unsigned long *wanted_mask,
+			       const unsigned long *active,
+			       const unsigned long *active_mask, bool compact)
+{
+	struct sk_buff *msg;
+	void *hdr;
+	int len, ret;
+
+	len = ethnl_reply_header_size();
+	ret = ethnl_bitset_size(wanted, wanted_mask, NETDEV_FEATURE_COUNT,
+				netdev_features_strings, compact);
+	if (ret < 0)
+		goto err;
+	len += ret;
+	ret = ethnl_bitset_size(active, active_mask, NETDEV_FEATURE_COUNT,
+				netdev_features_strings, compact);
+	if (ret < 0)
+		goto err;
+	len += ret;
+
+	msg = ethnl_reply_init(len, dev, ETHTOOL_MSG_FEATURES_SET_REPLY,
+			       ETHTOOL_A_FEATURES_HEADER, info, &hdr);
+	if (!msg) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	ret = ethnl_put_bitset(msg, ETHTOOL_A_FEATURES_WANTED, wanted,
+			       wanted_mask, NETDEV_FEATURE_COUNT,
+			       netdev_features_strings, compact);
+	if (ret >= 0)
+		ret = ethnl_put_bitset(msg, ETHTOOL_A_FEATURES_ACTIVE, active,
+				       active_mask, NETDEV_FEATURE_COUNT,
+				       netdev_features_strings, compact);
+	if (ret < 0)
+		goto free_msg;
+
+	genlmsg_end(msg, hdr);
+	return genlmsg_reply(msg, info);
+
+free_msg:
+	nlmsg_free(msg);
+	WARN_ONCE(1, "calculated message payload length (%d) not sufficient\n", len);
+err:
+	GENL_SET_ERR_MSG(info, "failed to send reply message");
+	return ret;
+}
+
+int ethnl_set_features(struct sk_buff *skb, struct genl_info *info)
+{
+ DECLARE_BITMAP(wanted_diff_mask, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(active_diff_mask, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(old_active, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(old_wanted, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(new_active, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(new_wanted, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(req_wanted, NETDEV_FEATURE_COUNT);
+ DECLARE_BITMAP(req_mask, NETDEV_FEATURE_COUNT);
+ struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
+ struct net_device *dev;
+ bool mod;
+ int ret;
+ /* Handle a features SET request: update dev->wanted_features, then reply. */
+ if (!tb[ETHTOOL_A_FEATURES_WANTED])
+ return -EINVAL; /* the wanted bitset is mandatory */
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_FEATURES_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+ dev = req_info.dev;
+
+ rtnl_lock();
+ netdev_lock_ops(dev);
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_unlock;
+ ethnl_features_to_bitmap(old_active, dev->features); /* state before the change */
+ ethnl_features_to_bitmap(old_wanted, dev->wanted_features);
+ ret = ethnl_parse_bitset(req_wanted, req_mask, NETDEV_FEATURE_COUNT,
+ tb[ETHTOOL_A_FEATURES_WANTED],
+ netdev_features_strings, info->extack);
+ if (ret < 0)
+ goto out_ops;
+ if (ethnl_bitmap_to_features(req_mask) & ~NETIF_F_ETHTOOL_BITS) {
+ GENL_SET_ERR_MSG(info, "attempt to change non-ethtool features");
+ ret = -EINVAL;
+ goto out_ops;
+ }
+
+ /* set req_wanted bits not in req_mask from old_wanted */
+ bitmap_and(req_wanted, req_wanted, req_mask, NETDEV_FEATURE_COUNT);
+ bitmap_andnot(new_wanted, old_wanted, req_mask, NETDEV_FEATURE_COUNT);
+ bitmap_or(req_wanted, new_wanted, req_wanted, NETDEV_FEATURE_COUNT);
+ if (!bitmap_equal(req_wanted, old_wanted, NETDEV_FEATURE_COUNT)) {
+ dev->wanted_features &= ~dev->hw_features; /* only hw_features bits get replaced */
+ dev->wanted_features |= ethnl_bitmap_to_features(req_wanted) & dev->hw_features;
+ __netdev_update_features(dev);
+ }
+ ethnl_features_to_bitmap(new_active, dev->features);
+ mod = !bitmap_equal(old_active, new_active, NETDEV_FEATURE_COUNT); /* did dev->features change? */
+
+ ret = 0;
+ if (!(req_info.flags & ETHTOOL_FLAG_OMIT_REPLY)) {
+ bool compact = req_info.flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+
+ bitmap_xor(wanted_diff_mask, req_wanted, new_active,
+ NETDEV_FEATURE_COUNT); /* resulting wanted vs. new active */
+ bitmap_xor(active_diff_mask, old_active, new_active,
+ NETDEV_FEATURE_COUNT); /* bits whose active state changed */
+ bitmap_and(wanted_diff_mask, wanted_diff_mask, req_mask,
+ NETDEV_FEATURE_COUNT); /* keep only bits named in the request */
+ bitmap_and(req_wanted, req_wanted, wanted_diff_mask,
+ NETDEV_FEATURE_COUNT); /* values for the wanted diff */
+ bitmap_and(new_active, new_active, active_diff_mask,
+ NETDEV_FEATURE_COUNT); /* values for the active diff */
+
+ ret = features_send_reply(dev, info, req_wanted,
+ wanted_diff_mask, new_active,
+ active_diff_mask, compact);
+ }
+ if (mod) /* notify only when dev->features actually changed */
+ netdev_features_change(dev);
+
+out_ops:
+ ethnl_ops_complete(dev);
+out_unlock:
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+ ethnl_parse_header_dev_put(&req_info);
+ return ret;
+}
diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c
new file mode 100644
index 000000000000..4669e74cbcaa
--- /dev/null
+++ b/net/ethtool/fec.c
@@ -0,0 +1,364 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct fec_req_info { /* no request fields beyond the common base */
+ struct ethnl_req_info base;
+};
+
+struct fec_reply_data { /* filled by fec_prepare_data() */
+ struct ethnl_reply_data base;
+ __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes); /* link mode bits derived from fecparam.fec */
+ u32 active_fec; /* first link mode bit of fecparam.active_fec, 0 if none */
+ u8 fec_auto; /* 1 if ETHTOOL_FEC_AUTO is set in fecparam.fec */
+ struct fec_stat_grp {
+ u64 stats[1 + ETHTOOL_MAX_LANES]; /* [0] total, then per-lane values */
+ u8 cnt; /* number of valid entries in @stats */
+ } corr, uncorr, corr_bits;
+ struct ethtool_fec_hist fec_stat_hist;
+};
+
+#define FEC_REPDATA(__reply_base) \
+ container_of(__reply_base, struct fec_reply_data, base) /* base -> enclosing reply */
+
+#define ETHTOOL_FEC_MASK ((ETHTOOL_FEC_LLRS << 1) - 1) /* all bits up to and including ETHTOOL_FEC_LLRS */
+
+const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1] = {
+ [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), /* header policy variant for stats */
+};
+
+/* Translate ETHTOOL_FEC_* flags to link mode bits; AUTO goes to @fec_auto. */
+static void
+ethtool_fec_to_link_modes(u32 fec, unsigned long *link_modes, u8 *fec_auto)
+{
+	if (fec & ETHTOOL_FEC_LLRS)
+		__set_bit(ETHTOOL_LINK_MODE_FEC_LLRS_BIT, link_modes);
+	if (fec & ETHTOOL_FEC_BASER)
+		__set_bit(ETHTOOL_LINK_MODE_FEC_BASER_BIT, link_modes);
+	if (fec & ETHTOOL_FEC_RS)
+		__set_bit(ETHTOOL_LINK_MODE_FEC_RS_BIT, link_modes);
+	if (fec & ETHTOOL_FEC_OFF)
+		__set_bit(ETHTOOL_LINK_MODE_FEC_NONE_BIT, link_modes);
+	if (fec_auto)	/* NULL is allowed when AUTO is of no interest */
+		*fec_auto = (fec & ETHTOOL_FEC_AUTO) ? 1 : 0;
+}
+
+/* Convert link mode bits back to ETHTOOL_FEC_* flags. The FEC bits are
+ * cleared from @link_modes; any bit left over yields -EINVAL.
+ */
+static int
+ethtool_link_modes_to_fecparam(struct ethtool_fecparam *fec,
+			       unsigned long *link_modes, u8 fec_auto)
+{
+	u32 flags = fec_auto ? ETHTOOL_FEC_AUTO : 0;
+
+	if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_NONE_BIT, link_modes))
+		flags |= ETHTOOL_FEC_OFF;
+	if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_RS_BIT, link_modes))
+		flags |= ETHTOOL_FEC_RS;
+	if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_BASER_BIT, link_modes))
+		flags |= ETHTOOL_FEC_BASER;
+	if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_LLRS_BIT, link_modes))
+		flags |= ETHTOOL_FEC_LLRS;
+
+	memset(fec, 0, sizeof(*fec));
+	fec->fec = flags;
+
+	return bitmap_empty(link_modes, __ETHTOOL_LINK_MODE_MASK_NBITS) ? 0 : -EINVAL;
+}
+
+/* Fill @grp with the total followed by per-lane values (when lanes are set). */
+static void
+fec_stats_recalc(struct fec_stat_grp *grp, struct ethtool_fec_stat *stats)
+{
+	u64 sum = 0;
+	int lane = 0;
+
+	if (stats->lanes[0] == ETHTOOL_STAT_NOT_SET) {
+		grp->cnt = stats->total != ETHTOOL_STAT_NOT_SET;
+		grp->stats[0] = stats->total;
+		return;
+	}
+
+	while (lane < ETHTOOL_MAX_LANES && stats->lanes[lane] != ETHTOOL_STAT_NOT_SET) {
+		sum += stats->lanes[lane];
+		grp->stats[1 + lane] = stats->lanes[lane];
+		lane++;
+	}
+	grp->stats[0] = sum;
+	grp->cnt = lane + 1;
+}
+
+static int fec_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ __ETHTOOL_DECLARE_LINK_MODE_MASK(active_fec_modes) = {};
+ struct fec_reply_data *data = FEC_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ struct ethtool_fecparam fec = {};
+ int ret;
+ /* Query the driver's FEC parameters (and stats, if requested) for the reply. */
+ if (!dev->ethtool_ops->get_fecparam)
+ return -EOPNOTSUPP;
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ ret = dev->ethtool_ops->get_fecparam(dev, &fec);
+ if (ret)
+ goto out_complete;
+ if (req_base->flags & ETHTOOL_FLAG_STATS &&
+ dev->ethtool_ops->get_fec_stats) { /* stats are optional for drivers */
+ struct ethtool_fec_stats stats;
+
+ ethtool_stats_init((u64 *)&stats, sizeof(stats) / 8); /* count given in u64 units */
+ ethtool_stats_init((u64 *)data->fec_stat_hist.values,
+ sizeof(data->fec_stat_hist.values) / 8);
+ dev->ethtool_ops->get_fec_stats(dev, &stats,
+ &data->fec_stat_hist);
+
+ fec_stats_recalc(&data->corr, &stats.corrected_blocks);
+ fec_stats_recalc(&data->uncorr, &stats.uncorrectable_blocks);
+ fec_stats_recalc(&data->corr_bits, &stats.corrected_bits);
+ }
+
+ WARN_ON_ONCE(fec.reserved);
+
+ ethtool_fec_to_link_modes(fec.fec, data->fec_link_modes,
+ &data->fec_auto);
+
+ ethtool_fec_to_link_modes(fec.active_fec, active_fec_modes, NULL);
+ data->active_fec = find_first_bit(active_fec_modes,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+ /* Don't report attr if no FEC mode set. Note that
+ * ethtool_fec_to_link_modes() ignores NONE and AUTO.
+ */
+ if (data->active_fec == __ETHTOOL_LINK_MODE_MASK_NBITS)
+ data->active_fec = 0;
+
+out_complete:
+ ethnl_ops_complete(dev);
+ return ret;
+}
+
+/* Reply payload estimate: modes bitset, AUTO and ACTIVE, plus (with
+ * ETHTOOL_FLAG_STATS) three counter arrays and the histogram bins.
+ */
+static int fec_reply_size(const struct ethnl_req_info *req_base,
+			  const struct ethnl_reply_data *reply_base)
+{
+	const struct fec_reply_data *rd = FEC_REPDATA(reply_base);
+	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+	int per_bin, total;
+
+	total = ethnl_bitset_size(rd->fec_link_modes, NULL,
+				  __ETHTOOL_LINK_MODE_MASK_NBITS,
+				  link_mode_names, compact);
+	if (total < 0)
+		return total;
+
+	total += nla_total_size(sizeof(u8));	/* _FEC_AUTO */
+	total += nla_total_size(sizeof(u32));	/* _FEC_ACTIVE */
+
+	if (!(req_base->flags & ETHTOOL_FLAG_STATS))
+		return total;
+
+	/* corrected blocks, uncorrectable blocks, corrected bits */
+	total += 3 * nla_total_size_64bit(sizeof(u64) * (1 + ETHTOOL_MAX_LANES));
+
+	per_bin = nla_total_size(0) +			/* _A_FEC_HIST */
+		  nla_total_size(4) +			/* _A_FEC_HIST_BIN_LOW */
+		  nla_total_size(4) +			/* _A_FEC_HIST_BIN_HI */
+		  nla_total_size_64bit(sizeof(u64)) +	/* _A_FEC_HIST_BIN_VAL */
+		  nla_total_size_64bit(sizeof(u64) * ETHTOOL_MAX_LANES); /* per-lane values */
+
+	return total + per_bin * ETHTOOL_FEC_HIST_MAX;
+}
+
+static int fec_put_hist(struct sk_buff *skb,
+ const struct ethtool_fec_hist *hist)
+{
+ const struct ethtool_fec_hist_range *ranges = hist->ranges;
+ const struct ethtool_fec_hist_value *values = hist->values;
+ struct nlattr *nest;
+ int i, j;
+ u64 sum;
+ /* Put one ETHTOOL_A_FEC_STAT_HIST nest per histogram bin. */
+ if (!ranges)
+ return 0; /* no ranges pointer: nothing to put */
+
+ for (i = 0; i < ETHTOOL_FEC_HIST_MAX; i++) {
+ if (i && !ranges[i].low && !ranges[i].high) /* all-zero range after bin 0 ends the list */
+ break;
+
+ if (WARN_ON_ONCE(values[i].sum == ETHTOOL_STAT_NOT_SET &&
+ values[i].per_lane[0] == ETHTOOL_STAT_NOT_SET)) /* neither sum nor lane 0 set */
+ break;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_FEC_STAT_HIST);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_LOW,
+ ranges[i].low) ||
+ nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_HIGH,
+ ranges[i].high))
+ goto err_cancel_hist;
+ sum = 0;
+ for (j = 0; j < ETHTOOL_MAX_LANES; j++) { /* add up lanes until the first unset one */
+ if (values[i].per_lane[j] == ETHTOOL_STAT_NOT_SET)
+ break;
+ sum += values[i].per_lane[j];
+ }
+ if (nla_put_uint(skb, ETHTOOL_A_FEC_HIST_BIN_VAL,
+ values[i].sum == ETHTOOL_STAT_NOT_SET ?
+ sum : values[i].sum)) /* use the lane total when sum is unset */
+ goto err_cancel_hist;
+ if (j && nla_put_64bit(skb, ETHTOOL_A_FEC_HIST_BIN_VAL_PER_LANE,
+ sizeof(u64) * j,
+ values[i].per_lane,
+ ETHTOOL_A_FEC_HIST_PAD)) /* per-lane array only if j lanes were set */
+ goto err_cancel_hist;
+
+ nla_nest_end(skb, nest);
+ }
+
+ return 0;
+
+err_cancel_hist:
+ nla_nest_cancel(skb, nest); /* drops only the bin being written */
+ return -EMSGSIZE;
+}
+
+/* Emit the ETHTOOL_A_FEC_STATS nest: corrected blocks, uncorrectable blocks
+ * and corrected bits arrays followed by the histogram. Any failure cancels
+ * the whole nest and yields -EMSGSIZE.
+ */
+static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data)
+{
+	struct nlattr *stats_nest;
+
+	stats_nest = nla_nest_start(skb, ETHTOOL_A_FEC_STATS);
+	if (!stats_nest)
+		return -EMSGSIZE;
+
+	if (nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_CORRECTED,
+			  data->corr.cnt * sizeof(u64),
+			  data->corr.stats, ETHTOOL_A_FEC_STAT_PAD) ||
+	    nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_UNCORR,
+			  data->uncorr.cnt * sizeof(u64),
+			  data->uncorr.stats, ETHTOOL_A_FEC_STAT_PAD) ||
+	    nla_put_64bit(skb, ETHTOOL_A_FEC_STAT_CORR_BITS,
+			  data->corr_bits.cnt * sizeof(u64),
+			  data->corr_bits.stats, ETHTOOL_A_FEC_STAT_PAD) ||
+	    fec_put_hist(skb, &data->fec_stat_hist)) {
+		nla_nest_cancel(skb, stats_nest);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, stats_nest);
+	return 0;
+}
+
+/* Fill the FEC reply: modes bitset, the AUTO flag, the active mode (only
+ * when non-zero) and, if ETHTOOL_FLAG_STATS was requested, the statistics.
+ */
+static int fec_fill_reply(struct sk_buff *skb,
+			  const struct ethnl_req_info *req_base,
+			  const struct ethnl_reply_data *reply_base)
+{
+	const struct fec_reply_data *rd = FEC_REPDATA(reply_base);
+	int err;
+
+	err = ethnl_put_bitset(skb, ETHTOOL_A_FEC_MODES, rd->fec_link_modes, NULL,
+			       __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names,
+			       req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS);
+	if (err < 0)
+		return err;
+
+	if (nla_put_u8(skb, ETHTOOL_A_FEC_AUTO, rd->fec_auto))
+		return -EMSGSIZE;
+	if (rd->active_fec && nla_put_u32(skb, ETHTOOL_A_FEC_ACTIVE, rd->active_fec))
+		return -EMSGSIZE;
+	if ((req_base->flags & ETHTOOL_FLAG_STATS) && fec_put_stats(skb, rd))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+/* FEC_SET */
+
+const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1] = {
+ [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_FEC_MODES] = { .type = NLA_NESTED }, /* bitset applied by ethnl_set_fec() */
+ [ETHTOOL_A_FEC_AUTO] = NLA_POLICY_MAX(NLA_U8, 1), /* only 0 or 1 accepted */
+};
+
+/* A SET needs both driver callbacks: 1 if present, -EOPNOTSUPP otherwise. */
+static int ethnl_set_fec_validate(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+	const struct ethtool_ops *eops = req_info->dev->ethtool_ops;
+
+	return (!eops->get_fecparam || !eops->set_fecparam) ? -EOPNOTSUPP : 1;
+}
+
+/* Merge a FEC_SET request into the current driver settings and apply it:
+ * returns <0 on error, 0 if nothing changed, 1 after set_fecparam() succeeds.
+ */
+static int
+ethnl_set_fec(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+	__ETHTOOL_DECLARE_LINK_MODE_MASK(modes) = {};
+	struct net_device *dev = req_info->dev;
+	struct nlattr **attrs = info->attrs;
+	struct ethtool_fecparam param = {};
+	bool changed = false;
+	u8 fec_auto;
+	int err;
+
+	err = dev->ethtool_ops->get_fecparam(dev, &param);
+	if (err < 0)
+		return err;
+
+	ethtool_fec_to_link_modes(param.fec, modes, &fec_auto);
+	err = ethnl_update_bitset(modes, __ETHTOOL_LINK_MODE_MASK_NBITS,
+				  attrs[ETHTOOL_A_FEC_MODES], link_mode_names,
+				  info->extack, &changed);
+	if (err < 0)
+		return err;
+	ethnl_update_u8(&fec_auto, attrs[ETHTOOL_A_FEC_AUTO], &changed);
+	if (!changed)
+		return 0;
+
+	err = ethtool_link_modes_to_fecparam(&param, modes, fec_auto);
+	if (err) {
+		NL_SET_ERR_MSG_ATTR(info->extack, attrs[ETHTOOL_A_FEC_MODES],
+				    "invalid FEC modes requested");
+		return err;
+	}
+	if (!param.fec) {
+		NL_SET_ERR_MSG_ATTR(info->extack, attrs[ETHTOOL_A_FEC_MODES], "no FEC modes set");
+		return -EINVAL;
+	}
+
+	err = dev->ethtool_ops->set_fecparam(dev, &param);
+	return err < 0 ? err : 1;
+}
+
+const struct ethnl_request_ops ethnl_fec_request_ops = { /* FEC GET handlers plus SET hooks */
+ .request_cmd = ETHTOOL_MSG_FEC_GET,
+ .reply_cmd = ETHTOOL_MSG_FEC_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_FEC_HEADER,
+ .req_info_size = sizeof(struct fec_req_info),
+ .reply_data_size = sizeof(struct fec_reply_data),
+
+ .prepare_data = fec_prepare_data, /* get_fecparam(), optional get_fec_stats() */
+ .reply_size = fec_reply_size,
+ .fill_reply = fec_fill_reply,
+
+ .set_validate = ethnl_set_fec_validate, /* requires get_fecparam and set_fecparam */
+ .set = ethnl_set_fec,
+ .set_ntf_cmd = ETHTOOL_MSG_FEC_NTF,
+};
diff --git a/net/core/ethtool.c b/net/ethtool/ioctl.c
index 0762aaf8e964..fa83ddade4f8 100644
--- a/net/core/ethtool.c
+++ b/net/ethtool/ioctl.c
@@ -1,16 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/core/ethtool.c - Ethtool ioctl handler
* Copyright (c) 2003 Matthew Wilcox <matthew@wil.cx>
*
* This file is where we call all the ethtool_ops commands to get
* the information ethtool needs.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
+#include <linux/compat.h>
+#include <linux/etherdevice.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/capability.h>
@@ -27,6 +25,31 @@
#include <linux/rtnetlink.h>
#include <linux/sched/signal.h>
#include <linux/net.h>
+#include <linux/pm_runtime.h>
+#include <linux/utsname.h>
+#include <net/devlink.h>
+#include <net/ipv6.h>
+#include <net/xdp_sock_drv.h>
+#include <net/flow_offload.h>
+#include <net/netdev_lock.h>
+#include <linux/ethtool_netlink.h>
+#include "common.h"
+
+/* State held across locks and calls for commands which have devlink fallback */
+struct ethtool_devlink_compat {
+ struct devlink *devlink; /* NOTE(review): presumably obtained via netdev_to_devlink_get() - confirm */
+ union { /* members share storage, so only one is meaningful at a time */
+ struct ethtool_flash efl;
+ struct ethtool_drvinfo info;
+ };
+};
+
+/* devlink_try_get() on the netdev's devlink port; NULL when no port is set. */
+static struct devlink *netdev_to_devlink_get(struct net_device *dev)
+{
+	return dev->devlink_port ? devlink_try_get(dev->devlink_port->devlink) :
+				   NULL;
+}
/*
* Some useful ethtool_ops methods that're device independent.
@@ -36,11 +59,15 @@
u32 ethtool_op_get_link(struct net_device *dev)
{
+ /* Synchronize carrier state with link watch, see also rtnl_getlink() */
+ __linkwatch_sync_dev(dev);
+
return netif_carrier_ok(dev) ? 1 : 0;
}
EXPORT_SYMBOL(ethtool_op_get_link);
-int ethtool_op_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info)
+int ethtool_op_get_ts_info(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info)
{
info->so_timestamping =
SOF_TIMESTAMPING_TX_SOFTWARE |
@@ -53,88 +80,6 @@ EXPORT_SYMBOL(ethtool_op_get_ts_info);
/* Handlers for each ethtool command */
-#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32)
-
-static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
- [NETIF_F_SG_BIT] = "tx-scatter-gather",
- [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4",
- [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic",
- [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6",
- [NETIF_F_HIGHDMA_BIT] = "highdma",
- [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist",
- [NETIF_F_HW_VLAN_CTAG_TX_BIT] = "tx-vlan-hw-insert",
-
- [NETIF_F_HW_VLAN_CTAG_RX_BIT] = "rx-vlan-hw-parse",
- [NETIF_F_HW_VLAN_CTAG_FILTER_BIT] = "rx-vlan-filter",
- [NETIF_F_HW_VLAN_STAG_TX_BIT] = "tx-vlan-stag-hw-insert",
- [NETIF_F_HW_VLAN_STAG_RX_BIT] = "rx-vlan-stag-hw-parse",
- [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter",
- [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged",
- [NETIF_F_GSO_BIT] = "tx-generic-segmentation",
- [NETIF_F_LLTX_BIT] = "tx-lockless",
- [NETIF_F_NETNS_LOCAL_BIT] = "netns-local",
- [NETIF_F_GRO_BIT] = "rx-gro",
- [NETIF_F_GRO_HW_BIT] = "rx-gro-hw",
- [NETIF_F_LRO_BIT] = "rx-lro",
-
- [NETIF_F_TSO_BIT] = "tx-tcp-segmentation",
- [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust",
- [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation",
- [NETIF_F_TSO_MANGLEID_BIT] = "tx-tcp-mangleid-segmentation",
- [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
- [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
- [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation",
- [NETIF_F_GSO_GRE_CSUM_BIT] = "tx-gre-csum-segmentation",
- [NETIF_F_GSO_IPXIP4_BIT] = "tx-ipxip4-segmentation",
- [NETIF_F_GSO_IPXIP6_BIT] = "tx-ipxip6-segmentation",
- [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
- [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
- [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial",
- [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation",
- [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation",
- [NETIF_F_GSO_UDP_L4_BIT] = "tx-udp-segmentation",
-
- [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
- [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
- [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu",
- [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter",
- [NETIF_F_RXHASH_BIT] = "rx-hashing",
- [NETIF_F_RXCSUM_BIT] = "rx-checksum",
- [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy",
- [NETIF_F_LOOPBACK_BIT] = "loopback",
- [NETIF_F_RXFCS_BIT] = "rx-fcs",
- [NETIF_F_RXALL_BIT] = "rx-all",
- [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
- [NETIF_F_HW_TC_BIT] = "hw-tc-offload",
- [NETIF_F_HW_ESP_BIT] = "esp-hw-offload",
- [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload",
- [NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload",
- [NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record",
- [NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
- [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
-};
-
-static const char
-rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = {
- [ETH_RSS_HASH_TOP_BIT] = "toeplitz",
- [ETH_RSS_HASH_XOR_BIT] = "xor",
- [ETH_RSS_HASH_CRC32_BIT] = "crc32",
-};
-
-static const char
-tunable_strings[__ETHTOOL_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
- [ETHTOOL_ID_UNSPEC] = "Unspec",
- [ETHTOOL_RX_COPYBREAK] = "rx-copybreak",
- [ETHTOOL_TX_COPYBREAK] = "tx-copybreak",
- [ETHTOOL_PFC_PREVENTION_TOUT] = "pfc-prevention-tout",
-};
-
-static const char
-phy_tunable_strings[__ETHTOOL_PHY_TUNABLE_COUNT][ETH_GSTRING_LEN] = {
- [ETHTOOL_ID_UNSPEC] = "Unspec",
- [ETHTOOL_PHY_DOWNSHIFT] = "phy-downshift",
-};
-
static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
{
struct ethtool_gfeatures cmd = {
@@ -167,7 +112,8 @@ static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
return -EFAULT;
useraddr += sizeof(cmd);
- if (copy_to_user(useraddr, features, copy_size * sizeof(*features)))
+ if (copy_to_user(useraddr, features,
+ array_size(copy_size, sizeof(*features))))
return -EFAULT;
return 0;
@@ -215,6 +161,7 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
static int __ethtool_get_sset_count(struct net_device *dev, int sset)
{
+ const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops;
const struct ethtool_ops *ops = dev->ethtool_ops;
if (sset == ETH_SS_FEATURES)
@@ -230,8 +177,12 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
return ARRAY_SIZE(phy_tunable_strings);
if (sset == ETH_SS_PHY_STATS && dev->phydev &&
- !ops->get_ethtool_phy_stats)
- return phy_ethtool_get_sset_count(dev->phydev);
+ !ops->get_ethtool_phy_stats &&
+ phy_ops && phy_ops->get_sset_count)
+ return phy_ops->get_sset_count(dev->phydev);
+
+ if (sset == ETH_SS_LINK_MODES)
+ return __ETHTOOL_LINK_MODE_MASK_NBITS;
if (ops->get_sset_count && ops->get_strings)
return ops->get_sset_count(dev, sset);
@@ -242,6 +193,7 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
static void __ethtool_get_strings(struct net_device *dev,
u32 stringset, u8 *data)
{
+ const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops;
const struct ethtool_ops *ops = dev->ethtool_ops;
if (stringset == ETH_SS_FEATURES)
@@ -255,8 +207,12 @@ static void __ethtool_get_strings(struct net_device *dev,
else if (stringset == ETH_SS_PHY_TUNABLES)
memcpy(data, phy_tunable_strings, sizeof(phy_tunable_strings));
else if (stringset == ETH_SS_PHY_STATS && dev->phydev &&
- !ops->get_ethtool_phy_stats)
- phy_ethtool_get_strings(dev->phydev, data);
+ !ops->get_ethtool_phy_stats && phy_ops &&
+ phy_ops->get_strings)
+ phy_ops->get_strings(dev->phydev, data);
+ else if (stringset == ETH_SS_LINK_MODES)
+ memcpy(data, link_mode_names,
+ __ETHTOOL_LINK_MODE_MASK_NBITS * ETH_GSTRING_LEN);
else
/* ops->get_strings is valid because checked earlier */
ops->get_strings(dev, stringset, data);
@@ -269,13 +225,14 @@ static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd)
switch (eth_cmd) {
case ETHTOOL_GTXCSUM:
case ETHTOOL_STXCSUM:
- return NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC;
+ return NETIF_F_CSUM_MASK | NETIF_F_FCOE_CRC |
+ NETIF_F_SCTP_CRC;
case ETHTOOL_GRXCSUM:
case ETHTOOL_SRXCSUM:
return NETIF_F_RXCSUM;
case ETHTOOL_GSG:
case ETHTOOL_SSG:
- return NETIF_F_SG;
+ return NETIF_F_SG | NETIF_F_FRAGLIST;
case ETHTOOL_GTSO:
case ETHTOOL_STSO:
return NETIF_F_ALL_TSO;
@@ -402,7 +359,7 @@ EXPORT_SYMBOL(ethtool_intersect_link_masks);
void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst,
u32 legacy_u32)
{
- bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
+ linkmode_zero(dst);
dst[0] = legacy_u32;
}
EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode);
@@ -411,74 +368,12 @@ EXPORT_SYMBOL(ethtool_convert_legacy_u32_to_link_mode);
bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
const unsigned long *src)
{
- bool retval = true;
-
- /* TODO: following test will soon always be true */
- if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) {
- __ETHTOOL_DECLARE_LINK_MODE_MASK(ext);
-
- bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS);
- bitmap_fill(ext, 32);
- bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS);
- if (bitmap_intersects(ext, src,
- __ETHTOOL_LINK_MODE_MASK_NBITS)) {
- /* src mask goes beyond bit 31 */
- retval = false;
- }
- }
*legacy_u32 = src[0];
- return retval;
+ return find_next_bit(src, __ETHTOOL_LINK_MODE_MASK_NBITS, 32) ==
+ __ETHTOOL_LINK_MODE_MASK_NBITS;
}
EXPORT_SYMBOL(ethtool_convert_link_mode_to_legacy_u32);
-/* return false if legacy contained non-0 deprecated fields
- * maxtxpkt/maxrxpkt. rest of ksettings always updated
- */
-static bool
-convert_legacy_settings_to_link_ksettings(
- struct ethtool_link_ksettings *link_ksettings,
- const struct ethtool_cmd *legacy_settings)
-{
- bool retval = true;
-
- memset(link_ksettings, 0, sizeof(*link_ksettings));
-
- /* This is used to tell users that driver is still using these
- * deprecated legacy fields, and they should not use
- * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS
- */
- if (legacy_settings->maxtxpkt ||
- legacy_settings->maxrxpkt)
- retval = false;
-
- ethtool_convert_legacy_u32_to_link_mode(
- link_ksettings->link_modes.supported,
- legacy_settings->supported);
- ethtool_convert_legacy_u32_to_link_mode(
- link_ksettings->link_modes.advertising,
- legacy_settings->advertising);
- ethtool_convert_legacy_u32_to_link_mode(
- link_ksettings->link_modes.lp_advertising,
- legacy_settings->lp_advertising);
- link_ksettings->base.speed
- = ethtool_cmd_speed(legacy_settings);
- link_ksettings->base.duplex
- = legacy_settings->duplex;
- link_ksettings->base.port
- = legacy_settings->port;
- link_ksettings->base.phy_address
- = legacy_settings->phy_address;
- link_ksettings->base.autoneg
- = legacy_settings->autoneg;
- link_ksettings->base.mdio_support
- = legacy_settings->mdio_support;
- link_ksettings->base.eth_tp_mdix
- = legacy_settings->eth_tp_mdix;
- link_ksettings->base.eth_tp_mdix_ctrl
- = legacy_settings->eth_tp_mdix_ctrl;
- return retval;
-}
-
/* return false if ksettings link modes had higher bits
* set. legacy_settings always updated (best effort)
*/
@@ -539,47 +434,20 @@ struct ethtool_link_usettings {
} link_modes;
};
-/* Internal kernel helper to query a device ethtool_link_settings.
- *
- * Backward compatibility note: for compatibility with legacy drivers
- * that implement only the ethtool_cmd API, this has to work with both
- * drivers implementing get_link_ksettings API and drivers
- * implementing get_settings API. When drivers implement get_settings
- * and report ethtool_cmd deprecated fields
- * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored
- * because the resulting struct ethtool_link_settings does not report them.
- */
+/* Internal kernel helper to query a device ethtool_link_settings. */
int __ethtool_get_link_ksettings(struct net_device *dev,
struct ethtool_link_ksettings *link_ksettings)
{
- int err;
- struct ethtool_cmd cmd;
-
ASSERT_RTNL();
- if (dev->ethtool_ops->get_link_ksettings) {
- memset(link_ksettings, 0, sizeof(*link_ksettings));
- return dev->ethtool_ops->get_link_ksettings(dev,
- link_ksettings);
- }
-
- /* driver doesn't support %ethtool_link_ksettings API. revert to
- * legacy %ethtool_cmd API, unless it's not supported either.
- * TODO: remove when ethtool_ops::get_settings disappears internally
- */
- if (!dev->ethtool_ops->get_settings)
+ if (!dev->ethtool_ops->get_link_ksettings)
return -EOPNOTSUPP;
- memset(&cmd, 0, sizeof(cmd));
- cmd.cmd = ETHTOOL_GSET;
- err = dev->ethtool_ops->get_settings(dev, &cmd);
- if (err < 0)
- return err;
+ if (!netif_device_present(dev))
+ return -ENODEV;
- /* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt
- */
- convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd);
- return err;
+ memset(link_ksettings, 0, sizeof(*link_ksettings));
+ return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings);
}
EXPORT_SYMBOL(__ethtool_get_link_ksettings);
@@ -608,6 +476,24 @@ static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to,
return 0;
}
+/* Check if the user is trying to change anything besides speed/duplex */
+bool ethtool_virtdev_validate_cmd(const struct ethtool_link_ksettings *cmd)
+{
+ struct ethtool_link_settings base2 = {};
+
+ base2.speed = cmd->base.speed;
+ base2.port = PORT_OTHER;
+ base2.duplex = cmd->base.duplex;
+ base2.cmd = cmd->base.cmd;
+ base2.link_mode_masks_nwords = cmd->base.link_mode_masks_nwords;
+
+ return !memcmp(&base2, &cmd->base, sizeof(base2)) &&
+ bitmap_empty(cmd->link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS) &&
+ bitmap_empty(cmd->link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
/* convert a kernel internal ethtool_link_ksettings to
* ethtool_link_usettings in user space. return 0 on success, errno on
* error.
@@ -618,7 +504,7 @@ store_link_ksettings_for_user(void __user *to,
{
struct ethtool_link_usettings link_usettings;
- memcpy(&link_usettings.base, &from->base, sizeof(link_usettings));
+ memcpy(&link_usettings, from, sizeof(link_usettings));
bitmap_to_arr32(link_usettings.link_modes.supported,
from->link_modes.supported,
__ETHTOOL_LINK_MODE_MASK_NBITS);
@@ -635,16 +521,7 @@ store_link_ksettings_for_user(void __user *to,
return 0;
}
-/* Query device for its ethtool_link_settings.
- *
- * Backward compatibility note: this function must fail when driver
- * does not implement ethtool::get_link_ksettings, even if legacy
- * ethtool_ops::get_settings is implemented. This tells new versions
- * of ethtool that they should use the legacy API %ETHTOOL_GSET for
- * this driver, so that they can correctly access the ethtool_cmd
- * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
- * implements ethtool_ops::get_settings anymore.
- */
+/* Query device for its ethtool_link_settings. */
static int ethtool_get_link_ksettings(struct net_device *dev,
void __user *useraddr)
{
@@ -652,7 +529,6 @@ static int ethtool_get_link_ksettings(struct net_device *dev,
struct ethtool_link_ksettings link_ksettings;
ASSERT_RTNL();
-
if (!dev->ethtool_ops->get_link_ksettings)
return -EOPNOTSUPP;
@@ -695,25 +571,19 @@ static int ethtool_get_link_ksettings(struct net_device *dev,
link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS;
link_ksettings.base.link_mode_masks_nwords
= __ETHTOOL_LINK_MODE_MASK_NU32;
+ link_ksettings.base.master_slave_cfg = MASTER_SLAVE_CFG_UNSUPPORTED;
+ link_ksettings.base.master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED;
+ link_ksettings.base.rate_matching = RATE_MATCH_NONE;
return store_link_ksettings_for_user(useraddr, &link_ksettings);
}
-/* Update device ethtool_link_settings.
- *
- * Backward compatibility note: this function must fail when driver
- * does not implement ethtool::set_link_ksettings, even if legacy
- * ethtool_ops::set_settings is implemented. This tells new versions
- * of ethtool that they should use the legacy API %ETHTOOL_SSET for
- * this driver, so that they can correctly update the ethtool_cmd
- * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
- * implements ethtool_ops::get_settings anymore.
- */
+/* Update device ethtool_link_settings. */
static int ethtool_set_link_ksettings(struct net_device *dev,
void __user *useraddr)
{
+ struct ethtool_link_ksettings link_ksettings = {};
int err;
- struct ethtool_link_ksettings link_ksettings;
ASSERT_RTNL();
@@ -741,56 +611,69 @@ static int ethtool_set_link_ksettings(struct net_device *dev,
!= link_ksettings.base.link_mode_masks_nwords)
return -EINVAL;
- return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+ if (link_ksettings.base.master_slave_cfg ||
+ link_ksettings.base.master_slave_state)
+ return -EINVAL;
+
+ err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+ if (err >= 0) {
+ ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF);
+ ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF);
+ }
+ return err;
+}
+
+int ethtool_virtdev_set_link_ksettings(struct net_device *dev,
+ const struct ethtool_link_ksettings *cmd,
+ u32 *dev_speed, u8 *dev_duplex)
+{
+ u32 speed;
+ u8 duplex;
+
+ speed = cmd->base.speed;
+ duplex = cmd->base.duplex;
+ /* don't allow custom speed and duplex */
+ if (!ethtool_validate_speed(speed) ||
+ !ethtool_validate_duplex(duplex) ||
+ !ethtool_virtdev_validate_cmd(cmd))
+ return -EINVAL;
+ *dev_speed = speed;
+ *dev_duplex = duplex;
+
+ return 0;
}
+EXPORT_SYMBOL(ethtool_virtdev_set_link_ksettings);
/* Query device for its ethtool_cmd settings.
*
- * Backward compatibility note: for compatibility with legacy ethtool,
- * this has to work with both drivers implementing get_link_ksettings
- * API and drivers implementing get_settings API. When drivers
- * implement get_link_ksettings and report higher link mode bits, a
- * kernel warning is logged once (with name of 1st driver/device) to
- * recommend user to upgrade ethtool, but the command is successful
- * (only the lower link mode bits reported back to user).
+ * Backward compatibility note: for compatibility with legacy ethtool, this is
+ * now implemented via get_link_ksettings. When driver reports higher link mode
+ * bits, a kernel warning is logged once (with name of 1st driver/device) to
+ * recommend user to upgrade ethtool, but the command is successful (only the
+ * lower link mode bits reported back to user). Deprecated fields from
+ * ethtool_cmd (transceiver/maxrxpkt/maxtxpkt) are always set to zero.
*/
static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
{
+ struct ethtool_link_ksettings link_ksettings;
struct ethtool_cmd cmd;
+ int err;
ASSERT_RTNL();
+ if (!dev->ethtool_ops->get_link_ksettings)
+ return -EOPNOTSUPP;
- if (dev->ethtool_ops->get_link_ksettings) {
- /* First, use link_ksettings API if it is supported */
- int err;
- struct ethtool_link_ksettings link_ksettings;
-
- memset(&link_ksettings, 0, sizeof(link_ksettings));
- err = dev->ethtool_ops->get_link_ksettings(dev,
- &link_ksettings);
- if (err < 0)
- return err;
- convert_link_ksettings_to_legacy_settings(&cmd,
- &link_ksettings);
-
- /* send a sensible cmd tag back to user */
- cmd.cmd = ETHTOOL_GSET;
- } else {
- /* driver doesn't support %ethtool_link_ksettings
- * API. revert to legacy %ethtool_cmd API, unless it's
- * not supported either.
- */
- int err;
+ if (dev->ethtool->module_fw_flash_in_progress)
+ return -EBUSY;
- if (!dev->ethtool_ops->get_settings)
- return -EOPNOTSUPP;
+ memset(&link_ksettings, 0, sizeof(link_ksettings));
+ err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings);
+ if (err < 0)
+ return err;
+ convert_link_ksettings_to_legacy_settings(&cmd, &link_ksettings);
- memset(&cmd, 0, sizeof(cmd));
- cmd.cmd = ETHTOOL_GSET;
- err = dev->ethtool_ops->get_settings(dev, &cmd);
- if (err < 0)
- return err;
- }
+ /* send a sensible cmd tag back to user */
+ cmd.cmd = ETHTOOL_GSET;
if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
return -EFAULT;
@@ -800,65 +683,62 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
/* Update device link settings with given ethtool_cmd.
*
- * Backward compatibility note: for compatibility with legacy ethtool,
- * this has to work with both drivers implementing set_link_ksettings
- * API and drivers implementing set_settings API. When drivers
- * implement set_link_ksettings and user's request updates deprecated
- * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel
- * warning is logged once (with name of 1st driver/device) to
- * recommend user to upgrade ethtool, and the request is rejected.
+ * Backward compatibility note: for compatibility with legacy ethtool, this is
+ * now always implemented via set_link_ksettings. When user's request updates
+ * deprecated ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel
+ * warning is logged once (with name of 1st driver/device) to recommend user to
+ * upgrade ethtool, and the request is rejected.
*/
static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
{
+ struct ethtool_link_ksettings link_ksettings;
struct ethtool_cmd cmd;
+ int ret;
ASSERT_RTNL();
if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
return -EFAULT;
-
- /* first, try new %ethtool_link_ksettings API. */
- if (dev->ethtool_ops->set_link_ksettings) {
- struct ethtool_link_ksettings link_ksettings;
-
- if (!convert_legacy_settings_to_link_ksettings(&link_ksettings,
- &cmd))
- return -EINVAL;
-
- link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS;
- link_ksettings.base.link_mode_masks_nwords
- = __ETHTOOL_LINK_MODE_MASK_NU32;
- return dev->ethtool_ops->set_link_ksettings(dev,
- &link_ksettings);
- }
-
- /* legacy %ethtool_cmd API */
-
- /* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings
- * disappears internally
- */
-
- if (!dev->ethtool_ops->set_settings)
+ if (!dev->ethtool_ops->set_link_ksettings)
return -EOPNOTSUPP;
- return dev->ethtool_ops->set_settings(dev, &cmd);
+ if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, &cmd))
+ return -EINVAL;
+ link_ksettings.base.link_mode_masks_nwords =
+ __ETHTOOL_LINK_MODE_MASK_NU32;
+ ret = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+ if (ret >= 0) {
+ ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF);
+ ethtool_notify(dev, ETHTOOL_MSG_LINKMODES_NTF);
+ }
+ return ret;
}
-static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
- void __user *useraddr)
+static int
+ethtool_get_drvinfo(struct net_device *dev, struct ethtool_devlink_compat *rsp)
{
- struct ethtool_drvinfo info;
const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct device *parent = dev->dev.parent;
- memset(&info, 0, sizeof(info));
- info.cmd = ETHTOOL_GDRVINFO;
+ rsp->info.cmd = ETHTOOL_GDRVINFO;
+ strscpy(rsp->info.version, init_uts_ns.name.release,
+ sizeof(rsp->info.version));
if (ops->get_drvinfo) {
- ops->get_drvinfo(dev, &info);
- } else if (dev->dev.parent && dev->dev.parent->driver) {
- strlcpy(info.bus_info, dev_name(dev->dev.parent),
- sizeof(info.bus_info));
- strlcpy(info.driver, dev->dev.parent->driver->name,
- sizeof(info.driver));
+ ops->get_drvinfo(dev, &rsp->info);
+ if (!rsp->info.bus_info[0] && parent)
+ strscpy(rsp->info.bus_info, dev_name(parent),
+ sizeof(rsp->info.bus_info));
+ if (!rsp->info.driver[0] && parent && parent->driver)
+ strscpy(rsp->info.driver, parent->driver->name,
+ sizeof(rsp->info.driver));
+ } else if (parent && parent->driver) {
+ strscpy(rsp->info.bus_info, dev_name(parent),
+ sizeof(rsp->info.bus_info));
+ strscpy(rsp->info.driver, parent->driver->name,
+ sizeof(rsp->info.driver));
+ } else if (dev->rtnl_link_ops) {
+ strscpy(rsp->info.driver, dev->rtnl_link_ops->kind,
+ sizeof(rsp->info.driver));
} else {
return -EOPNOTSUPP;
}
@@ -872,21 +752,27 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
rc = ops->get_sset_count(dev, ETH_SS_TEST);
if (rc >= 0)
- info.testinfo_len = rc;
+ rsp->info.testinfo_len = rc;
rc = ops->get_sset_count(dev, ETH_SS_STATS);
if (rc >= 0)
- info.n_stats = rc;
+ rsp->info.n_stats = rc;
rc = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS);
if (rc >= 0)
- info.n_priv_flags = rc;
+ rsp->info.n_priv_flags = rc;
+ }
+ if (ops->get_regs_len) {
+ int ret = ops->get_regs_len(dev);
+
+ if (ret > 0)
+ rsp->info.regdump_len = ret;
}
- if (ops->get_regs_len)
- info.regdump_len = ops->get_regs_len(dev);
+
if (ops->get_eeprom_len)
- info.eedump_len = ops->get_eeprom_len(dev);
+ rsp->info.eedump_len = ops->get_eeprom_len(dev);
+
+ if (!rsp->info.fw_version[0])
+ rsp->devlink = netdev_to_devlink_get(dev);
- if (copy_to_user(useraddr, &info, sizeof(info)))
- return -EFAULT;
return 0;
}
@@ -936,7 +822,7 @@ static noinline_for_stack int ethtool_get_sset_info(struct net_device *dev,
goto out;
useraddr += offsetof(struct ethtool_sset_info, data);
- if (copy_to_user(useraddr, info_buf, idx * sizeof(u32)))
+ if (copy_to_user(useraddr, info_buf, array_size(idx, sizeof(u32))))
goto out;
ret = 0;
@@ -946,75 +832,421 @@ out:
return ret;
}
-static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
- u32 cmd, void __user *useraddr)
+static noinline_for_stack int
+ethtool_rxnfc_copy_from_compat(struct ethtool_rxnfc *rxnfc,
+ const struct compat_ethtool_rxnfc __user *useraddr,
+ size_t size)
{
- struct ethtool_rxnfc info;
- size_t info_size = sizeof(info);
- int rc;
+ struct compat_ethtool_rxnfc crxnfc = {};
- if (!dev->ethtool_ops->set_rxnfc)
- return -EOPNOTSUPP;
+ /* We expect there to be holes between fs.m_ext and
+ * fs.ring_cookie and at the end of fs, but nowhere else.
+ * On non-x86, no conversion should be needed.
+ */
+ BUILD_BUG_ON(!IS_ENABLED(CONFIG_X86_64) &&
+ sizeof(struct compat_ethtool_rxnfc) !=
+ sizeof(struct ethtool_rxnfc));
+ BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) +
+ sizeof(useraddr->fs.m_ext) !=
+ offsetof(struct ethtool_rxnfc, fs.m_ext) +
+ sizeof(rxnfc->fs.m_ext));
+ BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.location) -
+ offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) !=
+ offsetof(struct ethtool_rxnfc, fs.location) -
+ offsetof(struct ethtool_rxnfc, fs.ring_cookie));
+
+ if (copy_from_user(&crxnfc, useraddr, min(size, sizeof(crxnfc))))
+ return -EFAULT;
+
+ *rxnfc = (struct ethtool_rxnfc) {
+ .cmd = crxnfc.cmd,
+ .flow_type = crxnfc.flow_type,
+ .data = crxnfc.data,
+ .fs = {
+ .flow_type = crxnfc.fs.flow_type,
+ .h_u = crxnfc.fs.h_u,
+ .h_ext = crxnfc.fs.h_ext,
+ .m_u = crxnfc.fs.m_u,
+ .m_ext = crxnfc.fs.m_ext,
+ .ring_cookie = crxnfc.fs.ring_cookie,
+ .location = crxnfc.fs.location,
+ },
+ .rule_cnt = crxnfc.rule_cnt,
+ };
+
+ return 0;
+}
+
+static int ethtool_rxnfc_copy_from_user(struct ethtool_rxnfc *rxnfc,
+ const void __user *useraddr,
+ size_t size)
+{
+ if (compat_need_64bit_alignment_fixup())
+ return ethtool_rxnfc_copy_from_compat(rxnfc, useraddr, size);
+
+ if (copy_from_user(rxnfc, useraddr, size))
+ return -EFAULT;
+
+ return 0;
+}
+static int ethtool_rxnfc_copy_to_compat(void __user *useraddr,
+ const struct ethtool_rxnfc *rxnfc,
+ size_t size, const u32 *rule_buf)
+{
+ struct compat_ethtool_rxnfc crxnfc;
+
+ memset(&crxnfc, 0, sizeof(crxnfc));
+ crxnfc = (struct compat_ethtool_rxnfc) {
+ .cmd = rxnfc->cmd,
+ .flow_type = rxnfc->flow_type,
+ .data = rxnfc->data,
+ .fs = {
+ .flow_type = rxnfc->fs.flow_type,
+ .h_u = rxnfc->fs.h_u,
+ .h_ext = rxnfc->fs.h_ext,
+ .m_u = rxnfc->fs.m_u,
+ .m_ext = rxnfc->fs.m_ext,
+ .ring_cookie = rxnfc->fs.ring_cookie,
+ .location = rxnfc->fs.location,
+ },
+ .rule_cnt = rxnfc->rule_cnt,
+ };
+
+ if (copy_to_user(useraddr, &crxnfc, min(size, sizeof(crxnfc))))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ethtool_rxnfc_copy_struct(u32 cmd, struct ethtool_rxnfc *info,
+ size_t *info_size, void __user *useraddr)
+{
/* struct ethtool_rxnfc was originally defined for
* ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data
* members. User-space might still be using that
- * definition. */
- if (cmd == ETHTOOL_SRXFH)
- info_size = (offsetof(struct ethtool_rxnfc, data) +
- sizeof(info.data));
+ * definition.
+ */
+ if (cmd == ETHTOOL_GRXFH || cmd == ETHTOOL_SRXFH)
+ *info_size = (offsetof(struct ethtool_rxnfc, data) +
+ sizeof(info->data));
+
+ if (ethtool_rxnfc_copy_from_user(info, useraddr, *info_size))
+ return -EFAULT;
+
+ if ((cmd == ETHTOOL_GRXFH || cmd == ETHTOOL_SRXFH) && info->flow_type & FLOW_RSS) {
+ *info_size = sizeof(*info);
+ if (ethtool_rxnfc_copy_from_user(info, useraddr, *info_size))
+ return -EFAULT;
+ /* Since malicious users may modify the original data,
+ * we need to check whether FLOW_RSS is still requested.
+ */
+ if (!(info->flow_type & FLOW_RSS))
+ return -EINVAL;
+ }
+
+ if (info->cmd != cmd)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ethtool_rxnfc_copy_to_user(void __user *useraddr,
+ const struct ethtool_rxnfc *rxnfc,
+ size_t size, const u32 *rule_buf)
+{
+ int ret;
- if (copy_from_user(&info, useraddr, info_size))
+ if (compat_need_64bit_alignment_fixup()) {
+ ret = ethtool_rxnfc_copy_to_compat(useraddr, rxnfc, size,
+ rule_buf);
+ useraddr += offsetof(struct compat_ethtool_rxnfc, rule_locs);
+ } else {
+ ret = copy_to_user(useraddr, rxnfc, size);
+ useraddr += offsetof(struct ethtool_rxnfc, rule_locs);
+ }
+
+ if (ret)
return -EFAULT;
- rc = dev->ethtool_ops->set_rxnfc(dev, &info);
+ if (rule_buf) {
+ if (copy_to_user(useraddr, rule_buf,
+ rxnfc->rule_cnt * sizeof(u32)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static bool flow_type_hashable(u32 flow_type)
+{
+ switch (flow_type) {
+ case ETHER_FLOW:
+ case TCP_V4_FLOW:
+ case UDP_V4_FLOW:
+ case SCTP_V4_FLOW:
+ case AH_ESP_V4_FLOW:
+ case TCP_V6_FLOW:
+ case UDP_V6_FLOW:
+ case SCTP_V6_FLOW:
+ case AH_ESP_V6_FLOW:
+ case AH_V4_FLOW:
+ case ESP_V4_FLOW:
+ case AH_V6_FLOW:
+ case ESP_V6_FLOW:
+ case IPV4_FLOW:
+ case IPV6_FLOW:
+ case GTPU_V4_FLOW:
+ case GTPU_V6_FLOW:
+ case GTPC_V4_FLOW:
+ case GTPC_V6_FLOW:
+ case GTPC_TEID_V4_FLOW:
+ case GTPC_TEID_V6_FLOW:
+ case GTPU_EH_V4_FLOW:
+ case GTPU_EH_V6_FLOW:
+ case GTPU_UL_V4_FLOW:
+ case GTPU_UL_V6_FLOW:
+ case GTPU_DL_V4_FLOW:
+ case GTPU_DL_V6_FLOW:
+ return true;
+ }
+
+ return false;
+}
+
+static bool flow_type_v6(u32 flow_type)
+{
+ switch (flow_type) {
+ case TCP_V6_FLOW:
+ case UDP_V6_FLOW:
+ case SCTP_V6_FLOW:
+ case AH_ESP_V6_FLOW:
+ case AH_V6_FLOW:
+ case ESP_V6_FLOW:
+ case IPV6_FLOW:
+ case GTPU_V6_FLOW:
+ case GTPC_V6_FLOW:
+ case GTPC_TEID_V6_FLOW:
+ case GTPU_EH_V6_FLOW:
+ case GTPU_UL_V6_FLOW:
+ case GTPU_DL_V6_FLOW:
+ return true;
+ }
+
+ return false;
+}
+
+/* When adding a new type, update the assert and, if it's hashable, add it to
+ * the flow_type_hashable switch case.
+ */
+static_assert(GTPU_DL_V6_FLOW + 1 == __FLOW_TYPE_COUNT);
+
+static int ethtool_check_xfrm_rxfh(u32 input_xfrm, u64 rxfh)
+{
+ /* Sanity check: if symmetric-xor/symmetric-or-xor is set, then:
+ * 1 - no other fields besides IP src/dst and/or L4 src/dst are set
+ * 2 - If src is set, dst must also be set
+ */
+ if ((input_xfrm != RXH_XFRM_NO_CHANGE &&
+ input_xfrm & (RXH_XFRM_SYM_XOR | RXH_XFRM_SYM_OR_XOR)) &&
+ !ethtool_rxfh_config_is_sym(rxfh))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ethtool_check_flow_types(struct net_device *dev, u32 input_xfrm)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int err;
+ u32 i;
+
+ if (!input_xfrm || input_xfrm == RXH_XFRM_NO_CHANGE)
+ return 0;
+
+ for (i = 0; i < __FLOW_TYPE_COUNT; i++) {
+ struct ethtool_rxfh_fields fields = {
+ .flow_type = i,
+ };
+
+ if (!flow_type_hashable(i))
+ continue;
+
+ if (ops->get_rxfh_fields(dev, &fields))
+ continue;
+
+ err = ethtool_check_xfrm_rxfh(input_xfrm, fields.data);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static noinline_for_stack int
+ethtool_set_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxfh_fields fields = {};
+ struct ethtool_rxnfc info;
+ size_t info_size = sizeof(info);
+ int rc;
+
+ if (!ops->set_rxfh_fields)
+ return -EOPNOTSUPP;
+
+ rc = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr);
if (rc)
return rc;
- if (cmd == ETHTOOL_SRXCLSRLINS &&
- copy_to_user(useraddr, &info, info_size))
- return -EFAULT;
+ if (info.data & RXH_IP6_FL && !flow_type_v6(info.flow_type))
+ return -EINVAL;
+
+ if (info.flow_type & FLOW_RSS && info.rss_context &&
+ !ops->rxfh_per_ctx_fields)
+ return -EINVAL;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ if (ops->get_rxfh) {
+ struct ethtool_rxfh_param rxfh = {};
+
+ rc = ops->get_rxfh(dev, &rxfh);
+ if (rc)
+ goto exit_unlock;
+
+ rc = ethtool_check_xfrm_rxfh(rxfh.input_xfrm, info.data);
+ if (rc)
+ goto exit_unlock;
+ }
+ fields.data = info.data;
+ fields.flow_type = info.flow_type & ~FLOW_RSS;
+ if (info.flow_type & FLOW_RSS)
+ fields.rss_context = info.rss_context;
+
+ rc = ops->set_rxfh_fields(dev, &fields, NULL);
+exit_unlock:
+ mutex_unlock(&dev->ethtool->rss_lock);
+ if (rc)
+ return rc;
+
+ ethtool_rss_notify(dev, ETHTOOL_MSG_RSS_NTF, fields.rss_context);
return 0;
}
-static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
- u32 cmd, void __user *useraddr)
+static noinline_for_stack int
+ethtool_get_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr)
{
struct ethtool_rxnfc info;
size_t info_size = sizeof(info);
const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxfh_fields fields = {};
int ret;
- void *rule_buf = NULL;
- if (!ops->get_rxnfc)
+ if (!ops->get_rxfh_fields)
return -EOPNOTSUPP;
- /* struct ethtool_rxnfc was originally defined for
- * ETHTOOL_{G,S}RXFH with only the cmd, flow_type and data
- * members. User-space might still be using that
- * definition. */
- if (cmd == ETHTOOL_GRXFH)
- info_size = (offsetof(struct ethtool_rxnfc, data) +
- sizeof(info.data));
+ ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr);
+ if (ret)
+ return ret;
- if (copy_from_user(&info, useraddr, info_size))
- return -EFAULT;
+ if (info.flow_type & FLOW_RSS && info.rss_context &&
+ !ops->rxfh_per_ctx_fields)
+ return -EINVAL;
- /* If FLOW_RSS was requested then user-space must be using the
- * new definition, as FLOW_RSS is newer.
- */
- if (cmd == ETHTOOL_GRXFH && info.flow_type & FLOW_RSS) {
- info_size = sizeof(info);
- if (copy_from_user(&info, useraddr, info_size))
- return -EFAULT;
- /* Since malicious users may modify the original data,
- * we need to check whether FLOW_RSS is still requested.
+ fields.flow_type = info.flow_type & ~FLOW_RSS;
+ if (info.flow_type & FLOW_RSS)
+ fields.rss_context = info.rss_context;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ ret = ops->get_rxfh_fields(dev, &fields);
+ mutex_unlock(&dev->ethtool->rss_lock);
+ if (ret < 0)
+ return ret;
+
+ info.data = fields.data;
+
+ return ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL);
+}
+
+static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
+ u32 cmd, void __user *useraddr)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc info;
+ size_t info_size = sizeof(info);
+ int rc;
+
+ if (!ops->set_rxnfc)
+ return -EOPNOTSUPP;
+
+ rc = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr);
+ if (rc)
+ return rc;
+
+ if (cmd == ETHTOOL_SRXCLSRLINS && info.fs.flow_type & FLOW_RSS) {
+ /* Nonzero ring with RSS only makes sense
+ * if NIC adds them together
*/
- if (!(info.flow_type & FLOW_RSS))
+ if (!ops->cap_rss_rxnfc_adds &&
+ ethtool_get_flow_spec_ring(info.fs.ring_cookie))
+ return -EINVAL;
+
+ if (info.rss_context &&
+ !xa_load(&dev->ethtool->rss_ctx, info.rss_context))
return -EINVAL;
}
+ rc = ops->set_rxnfc(dev, &info);
+ if (rc)
+ return rc;
+
+ if (cmd == ETHTOOL_SRXCLSRLINS &&
+ ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL))
+ return -EFAULT;
+
+ return 0;
+}
+
+static noinline_for_stack int ethtool_get_rxrings(struct net_device *dev,
+ u32 cmd,
+ void __user *useraddr)
+{
+ struct ethtool_rxnfc info;
+ size_t info_size;
+ int ret;
+
+ info_size = sizeof(info);
+ ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr);
+ if (ret)
+ return ret;
+
+ ret = ethtool_get_rx_ring_count(dev);
+ if (ret < 0)
+ return ret;
+
+ info.data = ret;
+
+ return ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL);
+}
+
+static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
+ u32 cmd, void __user *useraddr)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc info;
+ void *rule_buf = NULL;
+ size_t info_size;
+ int ret;
+
+ if (!ops->get_rxnfc)
+ return -EOPNOTSUPP;
+
+ info_size = sizeof(info);
+ ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr);
+ if (ret)
+ return ret;
+
if (info.cmd == ETHTOOL_GRXCLSRLALL) {
if (info.rule_cnt > 0) {
if (info.rule_cnt <= KMALLOC_MAX_SIZE / sizeof(u32))
@@ -1029,18 +1261,7 @@ static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
if (ret < 0)
goto err_out;
- ret = -EFAULT;
- if (copy_to_user(useraddr, &info, info_size))
- goto err_out;
-
- if (rule_buf) {
- useraddr += offsetof(struct ethtool_rxnfc, rule_locs);
- if (copy_to_user(useraddr, rule_buf,
- info.rule_cnt * sizeof(u32)))
- goto err_out;
- }
- ret = 0;
-
+ ret = ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, rule_buf);
err_out:
kfree(rule_buf);
@@ -1048,17 +1269,17 @@ err_out:
}
static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
- struct ethtool_rxnfc *rx_rings,
- u32 size)
+ int num_rx_rings,
+ u32 size)
{
int i;
- if (copy_from_user(indir, useraddr, size * sizeof(indir[0])))
+ if (copy_from_user(indir, useraddr, array_size(size, sizeof(indir[0]))))
return -EFAULT;
/* Validate ring indices */
for (i = 0; i < size; i++)
- if (indir[i] >= rx_rings->data)
+ if (indir[i] >= num_rx_rings)
return -EINVAL;
return 0;
@@ -1074,49 +1295,18 @@ void netdev_rss_key_fill(void *buffer, size_t len)
}
EXPORT_SYMBOL(netdev_rss_key_fill);
-static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max)
-{
- u32 dev_size, current_max = 0;
- u32 *indir;
- int ret;
-
- if (!dev->ethtool_ops->get_rxfh_indir_size ||
- !dev->ethtool_ops->get_rxfh)
- return -EOPNOTSUPP;
- dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
- if (dev_size == 0)
- return -EOPNOTSUPP;
-
- indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
- if (!indir)
- return -ENOMEM;
-
- ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
- if (ret)
- goto out;
-
- while (dev_size--)
- current_max = max(current_max, indir[dev_size]);
-
- *max = current_max;
-
-out:
- kfree(indir);
- return ret;
-}
-
static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
void __user *useraddr)
{
- u32 user_size, dev_size;
- u32 *indir;
+ struct ethtool_rxfh_param rxfh = {};
+ u32 user_size;
int ret;
if (!dev->ethtool_ops->get_rxfh_indir_size ||
!dev->ethtool_ops->get_rxfh)
return -EOPNOTSUPP;
- dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
- if (dev_size == 0)
+ rxfh.indir_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+ if (rxfh.indir_size == 0)
return -EOPNOTSUPP;
if (copy_from_user(&user_size,
@@ -1125,50 +1315,51 @@ static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
return -EFAULT;
if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size),
- &dev_size, sizeof(dev_size)))
+ &rxfh.indir_size, sizeof(rxfh.indir_size)))
return -EFAULT;
/* If the user buffer size is 0, this is just a query for the
* device table size. Otherwise, if it's smaller than the
* device table size it's an error.
*/
- if (user_size < dev_size)
+ if (user_size < rxfh.indir_size)
return user_size == 0 ? 0 : -EINVAL;
- indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
- if (!indir)
+ rxfh.indir = kcalloc(rxfh.indir_size, sizeof(rxfh.indir[0]), GFP_USER);
+ if (!rxfh.indir)
return -ENOMEM;
- ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
+ mutex_lock(&dev->ethtool->rss_lock);
+ ret = dev->ethtool_ops->get_rxfh(dev, &rxfh);
+ mutex_unlock(&dev->ethtool->rss_lock);
if (ret)
goto out;
-
if (copy_to_user(useraddr +
offsetof(struct ethtool_rxfh_indir, ring_index[0]),
- indir, dev_size * sizeof(indir[0])))
+ rxfh.indir, rxfh.indir_size * sizeof(*rxfh.indir)))
ret = -EFAULT;
out:
- kfree(indir);
+ kfree(rxfh.indir);
return ret;
}
static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
void __user *useraddr)
{
- struct ethtool_rxnfc rx_rings;
- u32 user_size, dev_size, i;
- u32 *indir;
const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxfh_param rxfh_dev = {};
+ struct netlink_ext_ack *extack = NULL;
+ int num_rx_rings;
+ u32 user_size, i;
int ret;
u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]);
- if (!ops->get_rxfh_indir_size || !ops->set_rxfh ||
- !ops->get_rxnfc)
+ if (!ops->get_rxfh_indir_size || !ops->set_rxfh)
return -EOPNOTSUPP;
- dev_size = ops->get_rxfh_indir_size(dev);
- if (dev_size == 0)
+ rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev);
+ if (rxfh_dev.indir_size == 0)
return -EOPNOTSUPP;
if (copy_from_user(&user_size,
@@ -1176,33 +1367,40 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
sizeof(user_size)))
return -EFAULT;
- if (user_size != 0 && user_size != dev_size)
+ if (user_size != 0 && user_size != rxfh_dev.indir_size)
return -EINVAL;
- indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
- if (!indir)
+ rxfh_dev.indir = kcalloc(rxfh_dev.indir_size,
+ sizeof(rxfh_dev.indir[0]), GFP_USER);
+ if (!rxfh_dev.indir)
return -ENOMEM;
- rx_rings.cmd = ETHTOOL_GRXRINGS;
- ret = ops->get_rxnfc(dev, &rx_rings, NULL);
- if (ret)
+ num_rx_rings = ethtool_get_rx_ring_count(dev);
+ if (num_rx_rings < 0) {
+ ret = num_rx_rings;
goto out;
+ }
if (user_size == 0) {
- for (i = 0; i < dev_size; i++)
- indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ u32 *indir = rxfh_dev.indir;
+
+ for (i = 0; i < rxfh_dev.indir_size; i++)
+ indir[i] = ethtool_rxfh_indir_default(i, num_rx_rings);
} else {
- ret = ethtool_copy_validate_indir(indir,
+ ret = ethtool_copy_validate_indir(rxfh_dev.indir,
useraddr + ringidx_offset,
- &rx_rings,
- dev_size);
+ num_rx_rings,
+ rxfh_dev.indir_size);
if (ret)
goto out;
}
- ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE);
+ rxfh_dev.hfunc = ETH_RSS_HASH_NO_CHANGE;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ ret = ops->set_rxfh(dev, &rxfh_dev, extack);
if (ret)
- goto out;
+ goto out_unlock;
/* indicate whether rxfh was set to default */
if (user_size == 0)
@@ -1210,33 +1408,33 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
else
dev->priv_flags |= IFF_RXFH_CONFIGURED;
+out_unlock:
+ mutex_unlock(&dev->ethtool->rss_lock);
out:
- kfree(indir);
+ kfree(rxfh_dev.indir);
return ret;
}
static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
void __user *useraddr)
{
- int ret;
const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxfh_param rxfh_dev = {};
u32 user_indir_size, user_key_size;
- u32 dev_indir_size = 0, dev_key_size = 0;
+ struct ethtool_rxfh_context *ctx;
struct ethtool_rxfh rxfh;
- u32 total_size;
u32 indir_bytes;
- u32 *indir = NULL;
- u8 dev_hfunc = 0;
- u8 *hkey = NULL;
u8 *rss_config;
+ u32 total_size;
+ int ret;
if (!ops->get_rxfh)
return -EOPNOTSUPP;
if (ops->get_rxfh_indir_size)
- dev_indir_size = ops->get_rxfh_indir_size(dev);
+ rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev);
if (ops->get_rxfh_key_size)
- dev_key_size = ops->get_rxfh_key_size(dev);
+ rxfh_dev.key_size = ops->get_rxfh_key_size(dev);
if (copy_from_user(&rxfh, useraddr, sizeof(rxfh)))
return -EFAULT;
@@ -1244,44 +1442,72 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
user_key_size = rxfh.key_size;
/* Check that reserved fields are 0 for now */
- if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32)
+ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32)
return -EINVAL;
/* Most drivers don't handle rss_context, check it's 0 as well */
- if (rxfh.rss_context && !ops->get_rxfh_context)
+ if (rxfh.rss_context && !ops->create_rxfh_context)
return -EOPNOTSUPP;
- rxfh.indir_size = dev_indir_size;
- rxfh.key_size = dev_key_size;
+ rxfh.indir_size = rxfh_dev.indir_size;
+ rxfh.key_size = rxfh_dev.key_size;
if (copy_to_user(useraddr, &rxfh, sizeof(rxfh)))
return -EFAULT;
- if ((user_indir_size && (user_indir_size != dev_indir_size)) ||
- (user_key_size && (user_key_size != dev_key_size)))
+ if ((user_indir_size && user_indir_size != rxfh_dev.indir_size) ||
+ (user_key_size && user_key_size != rxfh_dev.key_size))
return -EINVAL;
- indir_bytes = user_indir_size * sizeof(indir[0]);
+ indir_bytes = user_indir_size * sizeof(rxfh_dev.indir[0]);
total_size = indir_bytes + user_key_size;
rss_config = kzalloc(total_size, GFP_USER);
if (!rss_config)
return -ENOMEM;
if (user_indir_size)
- indir = (u32 *)rss_config;
+ rxfh_dev.indir = (u32 *)rss_config;
if (user_key_size)
- hkey = rss_config + indir_bytes;
+ rxfh_dev.key = rss_config + indir_bytes;
- if (rxfh.rss_context)
- ret = dev->ethtool_ops->get_rxfh_context(dev, indir, hkey,
- &dev_hfunc,
- rxfh.rss_context);
- else
- ret = dev->ethtool_ops->get_rxfh(dev, indir, hkey, &dev_hfunc);
- if (ret)
- goto out;
+ mutex_lock(&dev->ethtool->rss_lock);
+ if (rxfh.rss_context) {
+ ctx = xa_load(&dev->ethtool->rss_ctx, rxfh.rss_context);
+ if (!ctx) {
+ ret = -ENOENT;
+ goto out;
+ }
+ if (rxfh_dev.indir)
+ memcpy(rxfh_dev.indir, ethtool_rxfh_context_indir(ctx),
+ indir_bytes);
+ if (!ops->rxfh_per_ctx_key) {
+ rxfh_dev.key_size = 0;
+ } else {
+ if (rxfh_dev.key)
+ memcpy(rxfh_dev.key,
+ ethtool_rxfh_context_key(ctx),
+ user_key_size);
+ rxfh_dev.hfunc = ctx->hfunc;
+ }
+ rxfh_dev.input_xfrm = ctx->input_xfrm;
+ ret = 0;
+ } else {
+ ret = dev->ethtool_ops->get_rxfh(dev, &rxfh_dev);
+ if (ret)
+ goto out;
+ }
if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, hfunc),
- &dev_hfunc, sizeof(rxfh.hfunc))) {
+ &rxfh_dev.hfunc, sizeof(rxfh.hfunc))) {
+ ret = -EFAULT;
+ } else if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh, input_xfrm),
+ &rxfh_dev.input_xfrm,
+ sizeof(rxfh.input_xfrm))) {
+ ret = -EFAULT;
+ } else if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh, key_size),
+ &rxfh_dev.key_size,
+ sizeof(rxfh.key_size))) {
ret = -EFAULT;
} else if (copy_to_user(useraddr +
offsetof(struct ethtool_rxfh, rss_config[0]),
@@ -1289,6 +1515,7 @@ static noinline_for_stack int ethtool_get_rxfh(struct net_device *dev,
ret = -EFAULT;
}
out:
+ mutex_unlock(&dev->ethtool->rss_lock);
kfree(rss_config);
return ret;
@@ -1297,18 +1524,21 @@ out:
static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
void __user *useraddr)
{
- int ret;
+ u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
const struct ethtool_ops *ops = dev->ethtool_ops;
- struct ethtool_rxnfc rx_rings;
- struct ethtool_rxfh rxfh;
u32 dev_indir_size = 0, dev_key_size = 0, i;
- u32 *indir = NULL, indir_bytes = 0;
- u8 *hkey = NULL;
+ u32 user_indir_len = 0, indir_bytes = 0;
+ struct ethtool_rxfh_param rxfh_dev = {};
+ struct ethtool_rxfh_context *ctx = NULL;
+ struct netlink_ext_ack *extack = NULL;
+ struct ethtool_rxfh rxfh;
+ bool create = false;
+ int num_rx_rings;
u8 *rss_config;
- u32 rss_cfg_offset = offsetof(struct ethtool_rxfh, rss_config[0]);
- bool delete = false;
+ int ntf = 0;
+ int ret;
- if (!ops->get_rxnfc || !ops->set_rxfh)
+ if (!ops->set_rxfh)
return -EOPNOTSUPP;
if (ops->get_rxfh_indir_size)
@@ -1320,34 +1550,55 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
return -EFAULT;
/* Check that reserved fields are 0 for now */
- if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd8[2] || rxfh.rsvd32)
+ if (rxfh.rsvd8[0] || rxfh.rsvd8[1] || rxfh.rsvd32)
return -EINVAL;
/* Most drivers don't handle rss_context, check it's 0 as well */
- if (rxfh.rss_context && !ops->set_rxfh_context)
+ if (rxfh.rss_context && !ops->create_rxfh_context)
+ return -EOPNOTSUPP;
+ /* Check input data transformation capabilities */
+ if (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_SYM_XOR &&
+ rxfh.input_xfrm != RXH_XFRM_SYM_OR_XOR &&
+ rxfh.input_xfrm != RXH_XFRM_NO_CHANGE)
+ return -EINVAL;
+ if (rxfh.input_xfrm != RXH_XFRM_NO_CHANGE &&
+ rxfh.input_xfrm & ~ops->supported_input_xfrm)
return -EOPNOTSUPP;
+ create = rxfh.rss_context == ETH_RXFH_CONTEXT_ALLOC;
- /* If either indir, hash key or function is valid, proceed further.
- * Must request at least one change: indir size, hash key or function.
- */
if ((rxfh.indir_size &&
rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE &&
rxfh.indir_size != dev_indir_size) ||
- (rxfh.key_size && (rxfh.key_size != dev_key_size)) ||
+ (rxfh.key_size && rxfh.key_size != dev_key_size))
+ return -EINVAL;
+
+ /* Must request at least one change: indir size, hash key, function
+ * or input transformation.
+ * There's no need for any of it in case of context creation.
+ */
+ if (!create &&
(rxfh.indir_size == ETH_RXFH_INDIR_NO_CHANGE &&
- rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE))
+ rxfh.key_size == 0 && rxfh.hfunc == ETH_RSS_HASH_NO_CHANGE &&
+ rxfh.input_xfrm == RXH_XFRM_NO_CHANGE))
return -EINVAL;
- if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
- indir_bytes = dev_indir_size * sizeof(indir[0]);
+ indir_bytes = dev_indir_size * sizeof(rxfh_dev.indir[0]);
+
+ /* Check settings which may be global rather than per RSS-context */
+ if (rxfh.rss_context && !ops->rxfh_per_ctx_key)
+ if (rxfh.key_size ||
+ (rxfh.hfunc && rxfh.hfunc != ETH_RSS_HASH_NO_CHANGE) ||
+ (rxfh.input_xfrm && rxfh.input_xfrm != RXH_XFRM_NO_CHANGE))
+ return -EOPNOTSUPP;
- rss_config = kzalloc(indir_bytes + rxfh.key_size, GFP_USER);
+ rss_config = kzalloc(indir_bytes + dev_key_size, GFP_USER);
if (!rss_config)
return -ENOMEM;
- rx_rings.cmd = ETHTOOL_GRXRINGS;
- ret = ops->get_rxnfc(dev, &rx_rings, NULL);
- if (ret)
- goto out;
+ num_rx_rings = ethtool_get_rx_ring_count(dev);
+ if (num_rx_rings < 0) {
+ ret = num_rx_rings;
+ goto out_free;
+ }
/* rxfh.indir_size == 0 means reset the indir table to default (master
* context) or delete the context (other RSS contexts).
@@ -1355,55 +1606,154 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
*/
if (rxfh.indir_size &&
rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE) {
- indir = (u32 *)rss_config;
- ret = ethtool_copy_validate_indir(indir,
+ user_indir_len = indir_bytes;
+ rxfh_dev.indir = (u32 *)rss_config;
+ rxfh_dev.indir_size = dev_indir_size;
+ ret = ethtool_copy_validate_indir(rxfh_dev.indir,
useraddr + rss_cfg_offset,
- &rx_rings,
+ num_rx_rings,
rxfh.indir_size);
if (ret)
- goto out;
+ goto out_free;
} else if (rxfh.indir_size == 0) {
if (rxfh.rss_context == 0) {
- indir = (u32 *)rss_config;
+ u32 *indir;
+
+ rxfh_dev.indir = (u32 *)rss_config;
+ rxfh_dev.indir_size = dev_indir_size;
+ indir = rxfh_dev.indir;
for (i = 0; i < dev_indir_size; i++)
- indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ indir[i] =
+ ethtool_rxfh_indir_default(i, num_rx_rings);
} else {
- delete = true;
+ rxfh_dev.rss_delete = true;
}
}
if (rxfh.key_size) {
- hkey = rss_config + indir_bytes;
- if (copy_from_user(hkey,
- useraddr + rss_cfg_offset + indir_bytes,
+ rxfh_dev.key_size = dev_key_size;
+ rxfh_dev.key = rss_config + indir_bytes;
+ if (copy_from_user(rxfh_dev.key,
+ useraddr + rss_cfg_offset + user_indir_len,
rxfh.key_size)) {
ret = -EFAULT;
- goto out;
+ goto out_free;
}
}
- if (rxfh.rss_context)
- ret = ops->set_rxfh_context(dev, indir, hkey, rxfh.hfunc,
- &rxfh.rss_context, delete);
- else
- ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
+ mutex_lock(&dev->ethtool->rss_lock);
+
+ ret = ethtool_check_flow_types(dev, rxfh.input_xfrm);
if (ret)
- goto out;
+ goto out_unlock;
+
+ if (rxfh.rss_context && rxfh_dev.rss_delete) {
+ ret = ethtool_check_rss_ctx_busy(dev, rxfh.rss_context);
+ if (ret)
+ goto out_unlock;
+ }
+
+ if (create) {
+ u32 limit, ctx_id;
+
+ if (rxfh_dev.rss_delete) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ ctx = ethtool_rxfh_ctx_alloc(ops, dev_indir_size, dev_key_size);
+ if (!ctx) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ limit = ops->rxfh_max_num_contexts ?: U32_MAX;
+ ret = xa_alloc(&dev->ethtool->rss_ctx, &ctx_id, ctx,
+ XA_LIMIT(1, limit - 1), GFP_KERNEL_ACCOUNT);
+ if (ret < 0) {
+ kfree(ctx);
+ goto out_unlock;
+ }
+ WARN_ON(!ctx_id); /* can't happen */
+ rxfh.rss_context = ctx_id;
+ } else if (rxfh.rss_context) {
+ ctx = xa_load(&dev->ethtool->rss_ctx, rxfh.rss_context);
+ if (!ctx) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+ }
+ rxfh_dev.hfunc = rxfh.hfunc;
+ rxfh_dev.rss_context = rxfh.rss_context;
+ rxfh_dev.input_xfrm = rxfh.input_xfrm;
+
+ if (!rxfh.rss_context) {
+ ntf = ETHTOOL_MSG_RSS_NTF;
+ ret = ops->set_rxfh(dev, &rxfh_dev, extack);
+ } else if (create) {
+ ntf = ETHTOOL_MSG_RSS_CREATE_NTF;
+ ret = ops->create_rxfh_context(dev, ctx, &rxfh_dev, extack);
+ /* Make sure driver populates defaults */
+ WARN_ON_ONCE(!ret && !rxfh_dev.key && ops->rxfh_per_ctx_key &&
+ !memchr_inv(ethtool_rxfh_context_key(ctx), 0,
+ ctx->key_size));
+ } else if (rxfh_dev.rss_delete) {
+ ntf = ETHTOOL_MSG_RSS_DELETE_NTF;
+ ret = ops->remove_rxfh_context(dev, ctx, rxfh.rss_context,
+ extack);
+ } else {
+ ntf = ETHTOOL_MSG_RSS_NTF;
+ ret = ops->modify_rxfh_context(dev, ctx, &rxfh_dev, extack);
+ }
+ if (ret) {
+ ntf = 0;
+ if (create) {
+ /* failed to create, free our new tracking entry */
+ xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context);
+ kfree(ctx);
+ }
+ goto out_unlock;
+ }
if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh, rss_context),
- &rxfh.rss_context, sizeof(rxfh.rss_context)))
+ &rxfh_dev.rss_context, sizeof(rxfh_dev.rss_context)))
ret = -EFAULT;
- if (!rxfh.rss_context) {
+ if (!rxfh_dev.rss_context) {
/* indicate whether rxfh was set to default */
if (rxfh.indir_size == 0)
dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
dev->priv_flags |= IFF_RXFH_CONFIGURED;
}
+ /* Update rss_ctx tracking */
+ if (rxfh_dev.rss_delete) {
+ WARN_ON(xa_erase(&dev->ethtool->rss_ctx, rxfh.rss_context) != ctx);
+ kfree(ctx);
+ } else if (ctx) {
+ if (rxfh_dev.indir) {
+ for (i = 0; i < dev_indir_size; i++)
+ ethtool_rxfh_context_indir(ctx)[i] = rxfh_dev.indir[i];
+ ctx->indir_configured =
+ rxfh.indir_size &&
+ rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE;
+ }
+ if (rxfh_dev.key) {
+ memcpy(ethtool_rxfh_context_key(ctx), rxfh_dev.key,
+ dev_key_size);
+ ctx->key_configured = !!rxfh.key_size;
+ }
+ if (rxfh_dev.hfunc != ETH_RSS_HASH_NO_CHANGE)
+ ctx->hfunc = rxfh_dev.hfunc;
+ if (rxfh_dev.input_xfrm != RXH_XFRM_NO_CHANGE)
+ ctx->input_xfrm = rxfh_dev.input_xfrm;
+ }
-out:
+out_unlock:
+ mutex_unlock(&dev->ethtool->rss_lock);
+out_free:
kfree(rss_config);
+ if (ntf)
+ ethtool_rss_notify(dev, ntf, rxfh.rss_context);
return ret;
}
@@ -1421,15 +1771,18 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
return -EFAULT;
reglen = ops->get_regs_len(dev);
+ if (reglen <= 0)
+ return reglen;
+
if (regs.len > reglen)
regs.len = reglen;
- regbuf = NULL;
- if (reglen) {
- regbuf = vzalloc(reglen);
- if (!regbuf)
- return -ENOMEM;
- }
+ regbuf = vzalloc(reglen);
+ if (!regbuf)
+ return -ENOMEM;
+
+ if (regs.len < reglen)
+ reglen = regs.len;
ops->get_regs(dev, &regs, regbuf);
@@ -1437,7 +1790,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
if (copy_to_user(useraddr, &regs, sizeof(regs)))
goto out;
useraddr += offsetof(struct ethtool_regs, data);
- if (regbuf && copy_to_user(useraddr, regbuf, regs.len))
+ if (copy_to_user(useraddr, regbuf, reglen))
goto out;
ret = 0;
@@ -1454,6 +1807,9 @@ static int ethtool_reset(struct net_device *dev, char __user *useraddr)
if (!dev->ethtool_ops->reset)
return -EOPNOTSUPP;
+ if (dev->ethtool->module_fw_flash_in_progress)
+ return -EBUSY;
+
if (copy_from_user(&reset, useraddr, sizeof(reset)))
return -EFAULT;
@@ -1468,11 +1824,13 @@ static int ethtool_reset(struct net_device *dev, char __user *useraddr)
static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
{
- struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
+ struct ethtool_wolinfo wol;
if (!dev->ethtool_ops->get_wol)
return -EOPNOTSUPP;
+ memset(&wol, 0, sizeof(struct ethtool_wolinfo));
+ wol.cmd = ETHTOOL_GWOL;
dev->ethtool_ops->get_wol(dev, &wol);
if (copy_to_user(useraddr, &wol, sizeof(wol)))
@@ -1482,40 +1840,87 @@ static int ethtool_get_wol(struct net_device *dev, char __user *useraddr)
static int ethtool_set_wol(struct net_device *dev, char __user *useraddr)
{
- struct ethtool_wolinfo wol;
+ struct ethtool_wolinfo wol, cur_wol; /* wol: request from user space; cur_wol: what get_wol reports */
int ret;
- if (!dev->ethtool_ops->set_wol)
+ if (!dev->ethtool_ops->get_wol || !dev->ethtool_ops->set_wol) /* get_wol is needed as well: it fills cur_wol below */
return -EOPNOTSUPP;
+ memset(&cur_wol, 0, sizeof(struct ethtool_wolinfo));
+ cur_wol.cmd = ETHTOOL_GWOL;
+ dev->ethtool_ops->get_wol(dev, &cur_wol); /* fetch the settings to compare the request against */
+
if (copy_from_user(&wol, useraddr, sizeof(wol)))
return -EFAULT;
+ if (wol.wolopts & ~cur_wol.supported) /* any requested bit outside cur_wol.supported is rejected */
+ return -EINVAL;
+
+ if (wol.wolopts == cur_wol.wolopts && /* same options and same sopass as reported by get_wol: */
+ !memcmp(wol.sopass, cur_wol.sopass, sizeof(wol.sopass)))
+ return 0; /* return early, without calling set_wol or sending a notification */
+
ret = dev->ethtool_ops->set_wol(dev, &wol);
if (ret)
return ret;
- dev->wol_enabled = !!wol.wolopts;
+ dev->ethtool->wol_enabled = !!wol.wolopts; /* true when at least one option bit was requested */
+ ethtool_notify(dev, ETHTOOL_MSG_WOL_NTF); /* only reached after set_wol returned 0 */
return 0;
}
+static void eee_to_keee(struct ethtool_keee *keee,
+ const struct ethtool_eee *eee)
+{
+ memset(keee, 0, sizeof(*keee)); /* every keee field not assigned below stays zero */
+ /* Fill *keee from *eee: the three scalar settings are copied as-is. */
+ keee->eee_enabled = eee->eee_enabled;
+ keee->tx_lpi_enabled = eee->tx_lpi_enabled;
+ keee->tx_lpi_timer = eee->tx_lpi_timer;
+ /* 'advertised' is the only mask converted here; eee_active, supported and lp_advertised are not copied. */
+ ethtool_convert_legacy_u32_to_link_mode(keee->advertised,
+ eee->advertised);
+}
+
+static void keee_to_eee(struct ethtool_eee *eee,
+ const struct ethtool_keee *keee)
+{
+ bool overflow;
+ /* Fill *eee from *keee, starting from an all-zero struct (eee->cmd is not set here). */
+ memset(eee, 0, sizeof(*eee));
+
+ eee->eee_active = keee->eee_active;
+ eee->eee_enabled = keee->eee_enabled;
+ eee->tx_lpi_enabled = keee->tx_lpi_enabled;
+ eee->tx_lpi_timer = keee->tx_lpi_timer;
+ /* The three link-mode masks go through the link-mode -> legacy u32 helper. */
+ overflow = !ethtool_convert_link_mode_to_legacy_u32(&eee->supported,
+ keee->supported); /* a false return is treated as "did not fit"; only this call feeds the warning */
+ ethtool_convert_link_mode_to_legacy_u32(&eee->advertised,
+ keee->advertised); /* return value not checked */
+ ethtool_convert_link_mode_to_legacy_u32(&eee->lp_advertised,
+ keee->lp_advertised); /* return value not checked */
+ if (overflow)
+ pr_warn("Ethtool ioctl interface doesn't support passing EEE linkmodes beyond bit 32\n");
+}
+
static int ethtool_get_eee(struct net_device *dev, char __user *useraddr)
{
- struct ethtool_eee edata;
+ struct ethtool_keee keee;
+ struct ethtool_eee eee;
int rc;
if (!dev->ethtool_ops->get_eee)
return -EOPNOTSUPP;
- memset(&edata, 0, sizeof(struct ethtool_eee));
- edata.cmd = ETHTOOL_GEEE;
- rc = dev->ethtool_ops->get_eee(dev, &edata);
-
+ memset(&keee, 0, sizeof(keee));
+ rc = dev->ethtool_ops->get_eee(dev, &keee);
if (rc)
return rc;
- if (copy_to_user(useraddr, &edata, sizeof(edata)))
+ keee_to_eee(&eee, &keee);
+ if (copy_to_user(useraddr, &eee, sizeof(eee)))
return -EFAULT;
return 0;
@@ -1523,15 +1928,21 @@ static int ethtool_get_eee(struct net_device *dev, char __user *useraddr)
static int ethtool_set_eee(struct net_device *dev, char __user *useraddr)
{
- struct ethtool_eee edata;
+ struct ethtool_keee keee; /* form handed to the driver's set_eee op */
+ struct ethtool_eee eee; /* form copied in from user space */
+ int ret;
if (!dev->ethtool_ops->set_eee)
return -EOPNOTSUPP;
- if (copy_from_user(&edata, useraddr, sizeof(edata)))
+ if (copy_from_user(&eee, useraddr, sizeof(eee)))
return -EFAULT;
- return dev->ethtool_ops->set_eee(dev, &edata);
+ eee_to_keee(&keee, &eee); /* convert the user-supplied struct before calling the driver */
+ ret = dev->ethtool_ops->set_eee(dev, &keee);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF); /* notification is sent only when set_eee returned 0 */
+ return ret;
}
static int ethtool_nway_reset(struct net_device *dev)
@@ -1545,12 +1956,12 @@ static int ethtool_nway_reset(struct net_device *dev)
static int ethtool_get_link(struct net_device *dev, char __user *useraddr)
{
struct ethtool_value edata = { .cmd = ETHTOOL_GLINK };
+ int link = __ethtool_get_link(dev);
- if (!dev->ethtool_ops->get_link)
- return -EOPNOTSUPP;
-
- edata.data = netif_running(dev) && dev->ethtool_ops->get_link(dev);
+ if (link < 0)
+ return link;
+ edata.data = link;
if (copy_to_user(useraddr, &edata, sizeof(edata)))
return -EFAULT;
return 0;
@@ -1578,7 +1989,7 @@ static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr,
if (eeprom.offset + eeprom.len > total_len)
return -EINVAL;
- data = kmalloc(PAGE_SIZE, GFP_USER);
+ data = kzalloc(PAGE_SIZE, GFP_USER);
if (!data)
return -ENOMEM;
@@ -1589,6 +2000,10 @@ static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr,
ret = getter(dev, &eeprom, data);
if (ret)
break;
+ if (!eeprom.len) {
+ ret = -EIO;
+ break;
+ }
if (copy_to_user(userbuf, data, eeprom.len)) {
ret = -EFAULT;
break;
@@ -1643,7 +2058,7 @@ static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr)
if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev))
return -EINVAL;
- data = kmalloc(PAGE_SIZE, GFP_USER);
+ data = kzalloc(PAGE_SIZE, GFP_USER);
if (!data)
return -ENOMEM;
@@ -1671,39 +2086,115 @@ static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev,
void __user *useraddr)
{
struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
+ struct kernel_ethtool_coalesce kernel_coalesce = {};
+ int ret;
if (!dev->ethtool_ops->get_coalesce)
return -EOPNOTSUPP;
- dev->ethtool_ops->get_coalesce(dev, &coalesce);
+ ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce,
+ NULL);
+ if (ret)
+ return ret;
if (copy_to_user(useraddr, &coalesce, sizeof(coalesce)))
return -EFAULT;
return 0;
}
+static bool
+ethtool_set_coalesce_supported(struct net_device *dev,
+ struct ethtool_coalesce *coalesce)
+{
+ u32 supported_params = dev->ethtool_ops->supported_coalesce_params; /* mask declared by the driver's ops */
+ u32 nonzero_params = 0;
+ /* Set one ETHTOOL_COALESCE_* flag for every field of *coalesce that is non-zero. */
+ if (coalesce->rx_coalesce_usecs)
+ nonzero_params |= ETHTOOL_COALESCE_RX_USECS;
+ if (coalesce->rx_max_coalesced_frames)
+ nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES;
+ if (coalesce->rx_coalesce_usecs_irq)
+ nonzero_params |= ETHTOOL_COALESCE_RX_USECS_IRQ;
+ if (coalesce->rx_max_coalesced_frames_irq)
+ nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_IRQ;
+ if (coalesce->tx_coalesce_usecs)
+ nonzero_params |= ETHTOOL_COALESCE_TX_USECS;
+ if (coalesce->tx_max_coalesced_frames)
+ nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES;
+ if (coalesce->tx_coalesce_usecs_irq)
+ nonzero_params |= ETHTOOL_COALESCE_TX_USECS_IRQ;
+ if (coalesce->tx_max_coalesced_frames_irq)
+ nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_IRQ;
+ if (coalesce->stats_block_coalesce_usecs)
+ nonzero_params |= ETHTOOL_COALESCE_STATS_BLOCK_USECS;
+ if (coalesce->use_adaptive_rx_coalesce)
+ nonzero_params |= ETHTOOL_COALESCE_USE_ADAPTIVE_RX;
+ if (coalesce->use_adaptive_tx_coalesce)
+ nonzero_params |= ETHTOOL_COALESCE_USE_ADAPTIVE_TX;
+ if (coalesce->pkt_rate_low)
+ nonzero_params |= ETHTOOL_COALESCE_PKT_RATE_LOW;
+ if (coalesce->rx_coalesce_usecs_low)
+ nonzero_params |= ETHTOOL_COALESCE_RX_USECS_LOW;
+ if (coalesce->rx_max_coalesced_frames_low)
+ nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW;
+ if (coalesce->tx_coalesce_usecs_low)
+ nonzero_params |= ETHTOOL_COALESCE_TX_USECS_LOW;
+ if (coalesce->tx_max_coalesced_frames_low)
+ nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_LOW;
+ if (coalesce->pkt_rate_high)
+ nonzero_params |= ETHTOOL_COALESCE_PKT_RATE_HIGH;
+ if (coalesce->rx_coalesce_usecs_high)
+ nonzero_params |= ETHTOOL_COALESCE_RX_USECS_HIGH;
+ if (coalesce->rx_max_coalesced_frames_high)
+ nonzero_params |= ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH;
+ if (coalesce->tx_coalesce_usecs_high)
+ nonzero_params |= ETHTOOL_COALESCE_TX_USECS_HIGH;
+ if (coalesce->tx_max_coalesced_frames_high)
+ nonzero_params |= ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH;
+ if (coalesce->rate_sample_interval)
+ nonzero_params |= ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL;
+ /* True only when every collected flag is also present in supported_params. */
+ return (supported_params & nonzero_params) == nonzero_params;
+}
+
static noinline_for_stack int ethtool_set_coalesce(struct net_device *dev,
void __user *useraddr)
{
+ struct kernel_ethtool_coalesce kernel_coalesce = {};
struct ethtool_coalesce coalesce;
+ int ret;
- if (!dev->ethtool_ops->set_coalesce)
+ if (!dev->ethtool_ops->set_coalesce || !dev->ethtool_ops->get_coalesce) /* both ops are called below */
return -EOPNOTSUPP;
+ ret = dev->ethtool_ops->get_coalesce(dev, &coalesce, &kernel_coalesce,
+ NULL);
+ if (ret)
+ return ret;
+ /* coalesce is overwritten from user space next; kernel_coalesce keeps whatever get_coalesce stored in it. */
if (copy_from_user(&coalesce, useraddr, sizeof(coalesce)))
return -EFAULT;
- return dev->ethtool_ops->set_coalesce(dev, &coalesce);
+ if (!ethtool_set_coalesce_supported(dev, &coalesce)) /* checked on the user-supplied values */
+ return -EOPNOTSUPP;
+
+ ret = dev->ethtool_ops->set_coalesce(dev, &coalesce, &kernel_coalesce,
+ NULL);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_COALESCE_NTF); /* notification is sent only when set_coalesce returned 0 */
+ return ret;
}
static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
{
struct ethtool_ringparam ringparam = { .cmd = ETHTOOL_GRINGPARAM };
+ struct kernel_ethtool_ringparam kernel_ringparam = {};
if (!dev->ethtool_ops->get_ringparam)
return -EOPNOTSUPP;
- dev->ethtool_ops->get_ringparam(dev, &ringparam);
+ dev->ethtool_ops->get_ringparam(dev, &ringparam,
+ &kernel_ringparam, NULL);
if (copy_to_user(useraddr, &ringparam, sizeof(ringparam)))
return -EFAULT;
@@ -1712,7 +2203,9 @@ static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM };
+ struct kernel_ethtool_ringparam kernel_ringparam;
+ struct ethtool_ringparam ringparam, max;
+ int ret;
if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam)
return -EOPNOTSUPP;
@@ -1720,7 +2213,7 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
if (copy_from_user(&ringparam, useraddr, sizeof(ringparam)))
return -EFAULT;
- dev->ethtool_ops->get_ringparam(dev, &max);
+ ethtool_ringparam_get_cfg(dev, &max, &kernel_ringparam, NULL);
/* ensure new ring parameters are within the maximums */
if (ringparam.rx_pending > max.rx_max_pending ||
@@ -1729,7 +2222,11 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
ringparam.tx_pending > max.tx_max_pending)
return -EINVAL;
- return dev->ethtool_ops->set_ringparam(dev, &ringparam);
+ ret = dev->ethtool_ops->set_ringparam(dev, &ringparam,
+ &kernel_ringparam, NULL);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_RINGS_NTF);
+ return ret;
}
static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
@@ -1750,8 +2247,10 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
void __user *useraddr)
{
- struct ethtool_channels channels, max = { .cmd = ETHTOOL_GCHANNELS };
- u32 max_rx_in_use = 0;
+ struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS };
+ u16 from_channel, to_channel;
+ unsigned int i;
+ int ret;
if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
return -EOPNOTSUPP;
@@ -1759,28 +2258,47 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
if (copy_from_user(&channels, useraddr, sizeof(channels)))
return -EFAULT;
- dev->ethtool_ops->get_channels(dev, &max);
+ dev->ethtool_ops->get_channels(dev, &curr);
+
+ if (channels.rx_count == curr.rx_count &&
+ channels.tx_count == curr.tx_count &&
+ channels.combined_count == curr.combined_count &&
+ channels.other_count == curr.other_count)
+ return 0;
/* ensure new counts are within the maximums */
- if ((channels.rx_count > max.max_rx) ||
- (channels.tx_count > max.max_tx) ||
- (channels.combined_count > max.max_combined) ||
- (channels.other_count > max.max_other))
+ if (channels.rx_count > curr.max_rx ||
+ channels.tx_count > curr.max_tx ||
+ channels.combined_count > curr.max_combined ||
+ channels.other_count > curr.max_other)
return -EINVAL;
- /* ensure the new Rx count fits within the configured Rx flow
- * indirection table settings */
- if (netif_is_rxfh_configured(dev) &&
- !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) &&
- (channels.combined_count + channels.rx_count) <= max_rx_in_use)
- return -EINVAL;
+ /* ensure there is at least one RX and one TX channel */
+ if (!channels.combined_count &&
+ (!channels.rx_count || !channels.tx_count))
+ return -EINVAL;
+
+ ret = ethtool_check_max_channel(dev, channels, NULL);
+ if (ret)
+ return ret;
+
+ /* Disabling channels, query zero-copy AF_XDP sockets */
+ from_channel = channels.combined_count +
+ min(channels.rx_count, channels.tx_count);
+ to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count);
+ for (i = from_channel; i < to_channel; i++)
+ if (xsk_get_pool_from_qid(dev, i))
+ return -EINVAL;
- return dev->ethtool_ops->set_channels(dev, &channels);
+ ret = dev->ethtool_ops->set_channels(dev, &channels);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_CHANNELS_NTF);
+ return ret;
}
static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM };
+ struct ethtool_pauseparam pauseparam = { .cmd = ETHTOOL_GPAUSEPARAM };
if (!dev->ethtool_ops->get_pauseparam)
return -EOPNOTSUPP;
@@ -1795,6 +2313,7 @@ static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr)
static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
{
struct ethtool_pauseparam pauseparam;
+ int ret;
if (!dev->ethtool_ops->set_pauseparam)
return -EOPNOTSUPP;
@@ -1802,7 +2321,10 @@ static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam)))
return -EFAULT;
- return dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
+ ret = dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
+ if (!ret)
+ ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF);
+ return ret;
}
static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
@@ -1824,17 +2346,19 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
return -EFAULT;
test.len = test_len;
- data = kmalloc_array(test_len, sizeof(u64), GFP_USER);
+ data = kcalloc(test_len, sizeof(u64), GFP_USER);
if (!data)
return -ENOMEM;
+ netif_testing_on(dev);
ops->self_test(dev, &test, data);
+ netif_testing_off(dev);
ret = -EFAULT;
if (copy_to_user(useraddr, &test, sizeof(test)))
goto out;
useraddr += sizeof(test);
- if (copy_to_user(useraddr, data, test.len * sizeof(u64)))
+ if (copy_to_user(useraddr, data, array_size(test.len, sizeof(u64))))
goto out;
ret = 0;
@@ -1860,18 +2384,24 @@ static int ethtool_get_strings(struct net_device *dev, void __user *useraddr)
WARN_ON_ONCE(!ret);
gstrings.len = ret;
- data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN));
- if (gstrings.len && !data)
- return -ENOMEM;
- __ethtool_get_strings(dev, gstrings.string_set, data);
+ if (gstrings.len) {
+ data = vzalloc(array_size(gstrings.len, ETH_GSTRING_LEN));
+ if (!data)
+ return -ENOMEM;
+
+ __ethtool_get_strings(dev, gstrings.string_set, data);
+ } else {
+ data = NULL;
+ }
ret = -EFAULT;
if (copy_to_user(useraddr, &gstrings, sizeof(gstrings)))
goto out;
useraddr += sizeof(gstrings);
if (gstrings.len &&
- copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN))
+ copy_to_user(useraddr, data,
+ array_size(gstrings.len, ETH_GSTRING_LEN)))
goto out;
ret = 0;
@@ -1880,11 +2410,31 @@ out:
return ret;
}
+/*
+ * Format one entry into the slot at *data, writing at most ETH_GSTRING_LEN
+ * bytes, then step *data forward by exactly ETH_GSTRING_LEN so the next call
+ * writes the following slot.
+ */
+__printf(2, 3) void ethtool_sprintf(u8 **data, const char *fmt, ...)
+{
+	u8 *slot = *data;
+	va_list ap;
+
+	va_start(ap, fmt);
+	vsnprintf(slot, ETH_GSTRING_LEN, fmt, ap);
+	va_end(ap);
+
+	*data = slot + ETH_GSTRING_LEN;
+}
+EXPORT_SYMBOL(ethtool_sprintf);
+
+/*
+ * Copy @str into the slot at *data with strscpy(), bounded by
+ * ETH_GSTRING_LEN, then step *data forward by exactly ETH_GSTRING_LEN.
+ */
+void ethtool_puts(u8 **data, const char *str)
+{
+	u8 *slot = *data;
+
+	strscpy(slot, str, ETH_GSTRING_LEN);
+	*data = slot + ETH_GSTRING_LEN;
+}
+EXPORT_SYMBOL(ethtool_puts);
+
static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
{
struct ethtool_value id;
static bool busy;
const struct ethtool_ops *ops = dev->ethtool_ops;
+ netdevice_tracker dev_tracker;
int rc;
if (!ops->set_phys_id)
@@ -1904,7 +2454,8 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
* removal of the device.
*/
busy = true;
- dev_hold(dev);
+ netdev_hold(dev, &dev_tracker, GFP_KERNEL);
+ netdev_unlock_ops(dev);
rtnl_unlock();
if (rc == 0) {
@@ -1913,27 +2464,26 @@ static int ethtool_phys_id(struct net_device *dev, void __user *useraddr)
id.data ? (id.data * HZ) : MAX_SCHEDULE_TIMEOUT);
} else {
/* Driver expects to be called at twice the frequency in rc */
- int n = rc * 2, i, interval = HZ / n;
+ int n = rc * 2, interval = HZ / n;
+ u64 count = mul_u32_u32(n, id.data);
+ u64 i = 0;
- /* Count down seconds */
do {
- /* Count down iterations per second */
- i = n;
- do {
- rtnl_lock();
- rc = ops->set_phys_id(dev,
- (i & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON);
- rtnl_unlock();
- if (rc)
- break;
- schedule_timeout_interruptible(interval);
- } while (!signal_pending(current) && --i != 0);
- } while (!signal_pending(current) &&
- (id.data == 0 || --id.data != 0));
+ rtnl_lock();
+ netdev_lock_ops(dev);
+ rc = ops->set_phys_id(dev,
+ (i++ & 1) ? ETHTOOL_ID_OFF : ETHTOOL_ID_ON);
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+ if (rc)
+ break;
+ schedule_timeout_interruptible(interval);
+ } while (!signal_pending(current) && (!id.data || i < count));
}
rtnl_lock();
- dev_put(dev);
+ netdev_lock_ops(dev);
+ netdev_put(dev, &dev_tracker);
busy = false;
(void) ops->set_phys_id(dev, ETHTOOL_ID_INACTIVE);
@@ -1960,17 +2510,21 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
return -EFAULT;
stats.n_stats = n_stats;
- data = vzalloc(array_size(n_stats, sizeof(u64)));
- if (n_stats && !data)
- return -ENOMEM;
- ops->get_ethtool_stats(dev, &stats, data);
+ if (n_stats) {
+ data = vzalloc(array_size(n_stats, sizeof(u64)));
+ if (!data)
+ return -ENOMEM;
+ ops->get_ethtool_stats(dev, &stats, data);
+ } else {
+ data = NULL;
+ }
ret = -EFAULT;
if (copy_to_user(useraddr, &stats, sizeof(stats)))
goto out;
useraddr += sizeof(stats);
- if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64)))
+ if (n_stats && copy_to_user(useraddr, data, array_size(n_stats, sizeof(u64))))
goto out;
ret = 0;
@@ -1979,50 +2533,91 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
return ret;
}
-static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
+/*
+ * Validate a statistics count and allocate a zeroed u64 array for it.
+ *
+ * A negative @n_stats is returned unchanged, a count whose byte size would
+ * exceed S32_MAX yields -ENOMEM, and a zero count warns once and yields
+ * -EOPNOTSUPP.  On success the vzalloc()ed array is stored in *data and 0 is
+ * returned; *data is only written once the count checks have passed.
+ */
+static int ethtool_vzalloc_stats_array(int n_stats, u64 **data)
{
- const struct ethtool_ops *ops = dev->ethtool_ops;
- struct phy_device *phydev = dev->phydev;
- struct ethtool_stats stats;
- u64 *data;
- int ret, n_stats;
-
- if (!phydev && (!ops->get_ethtool_phy_stats || !ops->get_sset_count))
- return -EOPNOTSUPP;
-
- if (dev->phydev && !ops->get_ethtool_phy_stats)
- n_stats = phy_ethtool_get_sset_count(dev->phydev);
- else
- n_stats = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
if (n_stats < 0)
return n_stats;
if (n_stats > S32_MAX / sizeof(u64))
return -ENOMEM;
- WARN_ON_ONCE(!n_stats);
+ /* an empty set is reported as "not supported" instead of allocating 0 bytes */
+ if (WARN_ON_ONCE(!n_stats))
+ return -EOPNOTSUPP;
+
+ *data = vzalloc(array_size(n_stats, sizeof(u64)));
+ if (!*data)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * Collect PHY statistics through the registered ethtool_phy_ops.
+ *
+ * Returns 0 with *data pointing at a vzalloc()ed array of stats->n_stats
+ * u64 values, which the caller must vfree().  Returns -EOPNOTSUPP when the
+ * phy ops (or the callbacks used here) are missing, or the error reported by
+ * ethtool_vzalloc_stats_array() / get_stats().  On failure no buffer
+ * allocated by this function is left behind: it is freed and *data is reset
+ * to NULL.
+ */
+static int ethtool_get_phy_stats_phydev(struct phy_device *phydev,
+					struct ethtool_stats *stats,
+					u64 **data)
+{
+	const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops;
+	int n_stats, ret;
+
+	if (!phy_ops || !phy_ops->get_sset_count || !phy_ops->get_stats)
+		return -EOPNOTSUPP;
+
+	n_stats = phy_ops->get_sset_count(phydev);
+
+	ret = ethtool_vzalloc_stats_array(n_stats, data);
+	if (ret)
+		return ret;
+
+	stats->n_stats = n_stats;
+	ret = phy_ops->get_stats(phydev, stats, *data);
+	if (ret) {
+		/*
+		 * ethtool_get_phy_stats() retries through the ethtool_ops
+		 * path when it sees -EOPNOTSUPP, and that path overwrites
+		 * *data.  Drop the buffer here so it cannot leak; vfree(NULL)
+		 * in the caller's exit path is a no-op.
+		 */
+		vfree(*data);
+		*data = NULL;
+	}
+
+	return ret;
+}
+
+/*
+ * Collect PHY statistics through the driver's ethtool_ops: the array is
+ * sized from get_sset_count(ETH_SS_PHY_STATS), allocated into *data and
+ * filled by get_ethtool_phy_stats().  Returns 0 on success, -EOPNOTSUPP when
+ * a needed callback is missing, or the error from
+ * ethtool_vzalloc_stats_array().
+ */
+static int ethtool_get_phy_stats_ethtool(struct net_device *dev,
+					 struct ethtool_stats *stats,
+					 u64 **data)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	int count, err;
+
+	if (!(ops && ops->get_sset_count && ops->get_ethtool_phy_stats))
+		return -EOPNOTSUPP;
+
+	count = ops->get_sset_count(dev, ETH_SS_PHY_STATS);
+	err = ethtool_vzalloc_stats_array(count, data);
+	if (err)
+		return err;
+
+	stats->n_stats = count;
+	ops->get_ethtool_phy_stats(dev, stats, *data);
+	return 0;
+}
+
+static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
+{
+ struct phy_device *phydev = dev->phydev;
+ struct ethtool_stats stats;
+ u64 *data = NULL;
+ int ret = -EOPNOTSUPP;
if (copy_from_user(&stats, useraddr, sizeof(stats)))
return -EFAULT;
- stats.n_stats = n_stats;
- data = vzalloc(array_size(n_stats, sizeof(u64)));
- if (n_stats && !data)
- return -ENOMEM;
+ if (phydev)
+ ret = ethtool_get_phy_stats_phydev(phydev, &stats, &data);
- if (dev->phydev && !ops->get_ethtool_phy_stats) {
- ret = phy_ethtool_get_stats(dev->phydev, &stats, data);
- if (ret < 0)
- return ret;
- } else {
- ops->get_ethtool_phy_stats(dev, &stats, data);
- }
+ if (ret == -EOPNOTSUPP)
+ ret = ethtool_get_phy_stats_ethtool(dev, &stats, &data);
- ret = -EFAULT;
- if (copy_to_user(useraddr, &stats, sizeof(stats)))
+ if (ret)
goto out;
- useraddr += sizeof(stats);
- if (n_stats && copy_to_user(useraddr, data, n_stats * sizeof(u64)))
+
+ if (copy_to_user(useraddr, &stats, sizeof(stats))) {
+ ret = -EFAULT;
goto out;
- ret = 0;
+ }
+
+ useraddr += sizeof(stats);
+ if (copy_to_user(useraddr, data, array_size(stats.n_stats, sizeof(u64))))
+ ret = -EFAULT;
out:
vfree(data);
@@ -2092,20 +2687,15 @@ static int ethtool_set_value(struct net_device *dev, char __user *useraddr,
return actor(dev, edata.data);
}
-static noinline_for_stack int ethtool_flash_device(struct net_device *dev,
- char __user *useraddr)
+/*
+ * ETHTOOL_FLASHDEV handler.  The request in req->efl has already been copied
+ * from user space by dev_ethtool().  If the driver has no flash_device op,
+ * only a devlink reference is recorded in req->devlink and 0 is returned;
+ * dev_ethtool() then calls devlink_compat_flash_update() on it.
+ */
+static int
+ethtool_flash_device(struct net_device *dev, struct ethtool_devlink_compat *req)
{
- struct ethtool_flash efl;
-
- if (copy_from_user(&efl, useraddr, sizeof(efl)))
- return -EFAULT;
-
- if (!dev->ethtool_ops->flash_device)
- return -EOPNOTSUPP;
-
- efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0;
+ /* no driver op: defer to the devlink fallback performed by the caller */
+ if (!dev->ethtool_ops->flash_device) {
+ req->devlink = netdev_to_devlink_get(dev);
+ return 0;
+ }
- return dev->ethtool_ops->flash_device(dev, &efl);
+ return dev->ethtool_ops->flash_device(dev, &req->efl);
}
static int ethtool_set_dump(struct net_device *dev,
@@ -2209,40 +2799,35 @@ out:
+/*
+ * ETHTOOL_GET_TS_INFO handler: obtain the timestamping description from
+ * __ethtool_get_ts_info() and copy the fields carried by
+ * struct ethtool_ts_info to user space.  Returns the helper's error,
+ * -EFAULT if the copy-out fails, or 0.
+ */
static int ethtool_get_ts_info(struct net_device *dev, void __user *useraddr)
{
- int err = 0;
- struct ethtool_ts_info info;
- const struct ethtool_ops *ops = dev->ethtool_ops;
- struct phy_device *phydev = dev->phydev;
-
- memset(&info, 0, sizeof(info));
- info.cmd = ETHTOOL_GET_TS_INFO;
-
- if (phydev && phydev->drv && phydev->drv->ts_info) {
- err = phydev->drv->ts_info(phydev, &info);
- } else if (ops->get_ts_info) {
- err = ops->get_ts_info(dev, &info);
- } else {
- info.so_timestamping =
- SOF_TIMESTAMPING_RX_SOFTWARE |
- SOF_TIMESTAMPING_SOFTWARE;
- info.phc_index = -1;
- }
+ struct kernel_ethtool_ts_info kernel_info;
+ struct ethtool_ts_info info = {};
+ int err;
+ err = __ethtool_get_ts_info(dev, &kernel_info);
if (err)
return err;
+ /* info starts zeroed; only the fields assigned below are exposed to user space */
+ info.cmd = kernel_info.cmd;
+ info.so_timestamping = kernel_info.so_timestamping;
+ info.phc_index = kernel_info.phc_index;
+ info.tx_types = kernel_info.tx_types;
+ info.rx_filters = kernel_info.rx_filters;
+
if (copy_to_user(useraddr, &info, sizeof(info)))
- err = -EFAULT;
+ return -EFAULT;
- return err;
+ return 0;
}
-static int __ethtool_get_module_info(struct net_device *dev,
- struct ethtool_modinfo *modinfo)
+int ethtool_get_module_info_call(struct net_device *dev,
+ struct ethtool_modinfo *modinfo)
{
const struct ethtool_ops *ops = dev->ethtool_ops;
struct phy_device *phydev = dev->phydev;
+ if (dev->ethtool->module_fw_flash_in_progress)
+ return -EBUSY;
+
if (dev->sfp_bus)
return sfp_get_module_info(dev->sfp_bus, modinfo);
@@ -2264,7 +2849,7 @@ static int ethtool_get_module_info(struct net_device *dev,
if (copy_from_user(&modinfo, useraddr, sizeof(modinfo)))
return -EFAULT;
- ret = __ethtool_get_module_info(dev, &modinfo);
+ ret = ethtool_get_module_info_call(dev, &modinfo);
if (ret)
return ret;
@@ -2274,12 +2859,15 @@ static int ethtool_get_module_info(struct net_device *dev,
return 0;
}
-static int __ethtool_get_module_eeprom(struct net_device *dev,
- struct ethtool_eeprom *ee, u8 *data)
+int ethtool_get_module_eeprom_call(struct net_device *dev,
+ struct ethtool_eeprom *ee, u8 *data)
{
const struct ethtool_ops *ops = dev->ethtool_ops;
struct phy_device *phydev = dev->phydev;
+ if (dev->ethtool->module_fw_flash_in_progress)
+ return -EBUSY;
+
if (dev->sfp_bus)
return sfp_get_module_eeprom(dev->sfp_bus, ee, data);
@@ -2298,12 +2886,12 @@ static int ethtool_get_module_eeprom(struct net_device *dev,
int ret;
struct ethtool_modinfo modinfo;
- ret = __ethtool_get_module_info(dev, &modinfo);
+ ret = ethtool_get_module_info_call(dev, &modinfo);
if (ret)
return ret;
return ethtool_get_any_eeprom(dev, useraddr,
- __ethtool_get_module_eeprom,
+ ethtool_get_module_eeprom_call,
modinfo.eeprom_len);
}
@@ -2312,6 +2900,7 @@ static int ethtool_tunable_valid(const struct ethtool_tunable *tuna)
switch (tuna->id) {
case ETHTOOL_RX_COPYBREAK:
case ETHTOOL_TX_COPYBREAK:
+ case ETHTOOL_TX_COPYBREAK_BUF_SIZE:
if (tuna->len != sizeof(u32) ||
tuna->type_id != ETHTOOL_TUNABLE_U32)
return -EINVAL;
@@ -2342,7 +2931,7 @@ static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr)
ret = ethtool_tunable_valid(&tuna);
if (ret)
return ret;
- data = kmalloc(tuna.len, GFP_USER);
+ data = kzalloc(tuna.len, GFP_USER);
if (!data)
return -ENOMEM;
ret = ops->get_tunable(dev, &tuna, data);
@@ -2383,9 +2972,10 @@ static int ethtool_set_tunable(struct net_device *dev, void __user *useraddr)
return ret;
}
-static int ethtool_get_per_queue_coalesce(struct net_device *dev,
- void __user *useraddr,
- struct ethtool_per_queue_op *per_queue_opt)
+static noinline_for_stack int
+ethtool_get_per_queue_coalesce(struct net_device *dev,
+ void __user *useraddr,
+ struct ethtool_per_queue_op *per_queue_opt)
{
u32 bit;
int ret;
@@ -2413,9 +3003,10 @@ static int ethtool_get_per_queue_coalesce(struct net_device *dev,
return 0;
}
-static int ethtool_set_per_queue_coalesce(struct net_device *dev,
- void __user *useraddr,
- struct ethtool_per_queue_op *per_queue_opt)
+static noinline_for_stack int
+ethtool_set_per_queue_coalesce(struct net_device *dev,
+ void __user *useraddr,
+ struct ethtool_per_queue_op *per_queue_opt)
{
u32 bit;
int i, ret = 0;
@@ -2449,6 +3040,11 @@ static int ethtool_set_per_queue_coalesce(struct net_device *dev,
goto roll_back;
}
+ if (!ethtool_set_coalesce_supported(dev, &coalesce)) {
+ ret = -EOPNOTSUPP;
+ goto roll_back;
+ }
+
ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce);
if (ret != 0)
goto roll_back;
@@ -2469,13 +3065,17 @@ roll_back:
return ret;
}
-static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
+static int noinline_for_stack ethtool_set_per_queue(struct net_device *dev,
+ void __user *useraddr, u32 sub_cmd)
{
struct ethtool_per_queue_op per_queue_opt;
if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt)))
return -EFAULT;
+ if (per_queue_opt.sub_command != sub_cmd)
+ return -EINVAL;
+
switch (per_queue_opt.sub_command) {
case ETHTOOL_GCOALESCE:
return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt);
@@ -2483,17 +3083,23 @@ static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt);
default:
return -EOPNOTSUPP;
- };
+ }
}
static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna)
{
switch (tuna->id) {
case ETHTOOL_PHY_DOWNSHIFT:
+ case ETHTOOL_PHY_FAST_LINK_DOWN:
if (tuna->len != sizeof(u8) ||
tuna->type_id != ETHTOOL_TUNABLE_U8)
return -EINVAL;
break;
+ case ETHTOOL_PHY_EDPD:
+ if (tuna->len != sizeof(u16) ||
+ tuna->type_id != ETHTOOL_TUNABLE_U16)
+ return -EINVAL;
+ break;
default:
return -EINVAL;
}
@@ -2503,25 +3109,30 @@ static int ethtool_phy_tunable_valid(const struct ethtool_tunable *tuna)
static int get_phy_tunable(struct net_device *dev, void __user *useraddr)
{
- int ret;
- struct ethtool_tunable tuna;
struct phy_device *phydev = dev->phydev;
+ struct ethtool_tunable tuna;
+ bool phy_drv_tunable;
void *data;
+ int ret;
- if (!(phydev && phydev->drv && phydev->drv->get_tunable))
+ phy_drv_tunable = phydev && phydev->drv && phydev->drv->get_tunable;
+ if (!phy_drv_tunable && !dev->ethtool_ops->get_phy_tunable)
return -EOPNOTSUPP;
-
if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
return -EFAULT;
ret = ethtool_phy_tunable_valid(&tuna);
if (ret)
return ret;
- data = kmalloc(tuna.len, GFP_USER);
+ data = kzalloc(tuna.len, GFP_USER);
if (!data)
return -ENOMEM;
- mutex_lock(&phydev->lock);
- ret = phydev->drv->get_tunable(phydev, &tuna, data);
- mutex_unlock(&phydev->lock);
+ if (phy_drv_tunable) {
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->get_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
+ } else {
+ ret = dev->ethtool_ops->get_phy_tunable(dev, &tuna, data);
+ }
if (ret)
goto out;
useraddr += sizeof(tuna);
@@ -2537,12 +3148,14 @@ out:
static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
{
- int ret;
- struct ethtool_tunable tuna;
struct phy_device *phydev = dev->phydev;
+ struct ethtool_tunable tuna;
+ bool phy_drv_tunable;
void *data;
+ int ret;
- if (!(phydev && phydev->drv && phydev->drv->set_tunable))
+ /*
+  * Test the setter, not the getter: this flag selects the
+  * phydev->drv->set_tunable() call made further down, so it must only
+  * be true when that callback exists.  Otherwise fall back to the
+  * driver's ethtool_ops->set_phy_tunable, or fail with -EOPNOTSUPP.
+  */
+ phy_drv_tunable = phydev && phydev->drv && phydev->drv->set_tunable;
+ if (!phy_drv_tunable && !dev->ethtool_ops->set_phy_tunable)
return -EOPNOTSUPP;
if (copy_from_user(&tuna, useraddr, sizeof(tuna)))
return -EFAULT;
@@ -2553,9 +3166,13 @@ static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
data = memdup_user(useraddr, tuna.len);
if (IS_ERR(data))
return PTR_ERR(data);
- mutex_lock(&phydev->lock);
- ret = phydev->drv->set_tunable(phydev, &tuna, data);
- mutex_unlock(&phydev->lock);
+ if (phy_drv_tunable) {
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->set_tunable(phydev, &tuna, data);
+ mutex_unlock(&phydev->lock);
+ } else {
+ ret = dev->ethtool_ops->set_phy_tunable(dev, &tuna, data);
+ }
kfree(data);
return ret;
@@ -2563,7 +3180,7 @@ static int set_phy_tunable(struct net_device *dev, void __user *useraddr)
static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_fecparam fecparam = { ETHTOOL_GFECPARAM };
+ struct ethtool_fecparam fecparam = { .cmd = ETHTOOL_GFECPARAM };
int rc;
if (!dev->ethtool_ops->get_fecparam)
@@ -2573,6 +3190,9 @@ static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr)
if (rc)
return rc;
+ if (WARN_ON_ONCE(fecparam.reserved))
+ fecparam.reserved = 0;
+
if (copy_to_user(useraddr, &fecparam, sizeof(fecparam)))
return -EFAULT;
return 0;
@@ -2588,25 +3208,30 @@ static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr)
if (copy_from_user(&fecparam, useraddr, sizeof(fecparam)))
return -EFAULT;
+ if (!fecparam.fec || fecparam.fec & ETHTOOL_FEC_NONE)
+ return -EINVAL;
+
+ fecparam.active_fec = 0;
+ fecparam.reserved = 0;
+
return dev->ethtool_ops->set_fecparam(dev, &fecparam);
}
/* The main entry point in this file. Called from net/core/dev_ioctl.c */
-int dev_ethtool(struct net *net, struct ifreq *ifr)
+static int
+__dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr,
+ u32 ethcmd, struct ethtool_devlink_compat *devlink_state)
{
- struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
- void __user *useraddr = ifr->ifr_data;
- u32 ethcmd, sub_cmd;
+ struct net_device *dev;
+ u32 sub_cmd;
int rc;
netdev_features_t old_features;
- if (!dev || !netif_device_present(dev))
+ dev = __dev_get_by_name(net, ifr->ifr_name);
+ if (!dev)
return -ENODEV;
- if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
- return -EFAULT;
-
if (ethcmd == ETHTOOL_PERQUEUE) {
if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)))
return -EFAULT;
@@ -2657,10 +3282,19 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
return -EPERM;
}
+ netdev_lock_ops(dev);
+ if (dev->dev.parent)
+ pm_runtime_get_sync(dev->dev.parent);
+
+ if (!netif_device_present(dev)) {
+ rc = -ENODEV;
+ goto out;
+ }
+
if (dev->ethtool_ops->begin) {
rc = dev->ethtool_ops->begin(dev);
- if (rc < 0)
- return rc;
+ if (rc < 0)
+ goto out;
}
old_features = dev->features;
@@ -2672,7 +3306,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
rc = ethtool_set_settings(dev, useraddr);
break;
case ETHTOOL_GDRVINFO:
- rc = ethtool_get_drvinfo(dev, useraddr);
+ rc = ethtool_get_drvinfo(dev, devlink_state);
break;
case ETHTOOL_GREGS:
rc = ethtool_get_regs(dev, useraddr);
@@ -2690,6 +3324,8 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_SMSGLVL:
rc = ethtool_set_value_void(dev, useraddr,
dev->ethtool_ops->set_msglevel);
+ if (!rc)
+ ethtool_notify(dev, ETHTOOL_MSG_DEBUG_NTF);
break;
case ETHTOOL_GEEE:
rc = ethtool_get_eee(dev, useraddr);
@@ -2752,25 +3388,33 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GPFLAGS:
rc = ethtool_get_value(dev, useraddr, ethcmd,
dev->ethtool_ops->get_priv_flags);
break;
case ETHTOOL_SPFLAGS:
rc = ethtool_set_value(dev, useraddr,
dev->ethtool_ops->set_priv_flags);
+ /* notify listeners only after the private flags were actually written */
+ if (!rc)
+ ethtool_notify(dev, ETHTOOL_MSG_PRIVFLAGS_NTF);
break;
case ETHTOOL_GRXFH:
+ rc = ethtool_get_rxfh_fields(dev, ethcmd, useraddr);
+ break;
+ case ETHTOOL_SRXFH:
+ rc = ethtool_set_rxfh_fields(dev, ethcmd, useraddr);
+ break;
case ETHTOOL_GRXRINGS:
+ rc = ethtool_get_rxrings(dev, ethcmd, useraddr);
+ break;
case ETHTOOL_GRXCLSRLCNT:
case ETHTOOL_GRXCLSRULE:
case ETHTOOL_GRXCLSRLALL:
rc = ethtool_get_rxnfc(dev, ethcmd, useraddr);
break;
- case ETHTOOL_SRXFH:
case ETHTOOL_SRXCLSRLDEL:
case ETHTOOL_SRXCLSRLINS:
rc = ethtool_set_rxnfc(dev, ethcmd, useraddr);
break;
case ETHTOOL_FLASHDEV:
- rc = ethtool_flash_device(dev, useraddr);
+ rc = ethtool_flash_device(dev, devlink_state);
break;
case ETHTOOL_RESET:
rc = ethtool_reset(dev, useraddr);
@@ -2846,7 +3490,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
rc = ethtool_get_phy_stats(dev, useraddr);
break;
case ETHTOOL_PERQUEUE:
- rc = ethtool_set_per_queue(dev, useraddr);
+ rc = ethtool_set_per_queue(dev, useraddr, sub_cmd);
break;
case ETHTOOL_GLINKSETTINGS:
rc = ethtool_get_link_ksettings(dev, useraddr);
@@ -2875,6 +3519,335 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
if (old_features != dev->features)
netdev_features_change(dev);
+out:
+ if (dev->dev.parent)
+ pm_runtime_put(dev->dev.parent);
+ netdev_unlock_ops(dev);
return rc;
}
+
+/*
+ * ioctl entry point.  Reads the command word from user space, runs
+ * __dev_ethtool() under rtnl, and performs the steps that use the
+ * ethtool_devlink_compat state outside the rtnl-locked section: copying in
+ * the ETHTOOL_FLASHDEV request beforehand, and the devlink fallbacks plus
+ * the ETHTOOL_GDRVINFO copy-out afterwards.
+ */
+int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr)
+{
+	struct ethtool_devlink_compat *compat;
+	u32 ethcmd;
+	int rc;
+
+	if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
+		return -EFAULT;
+
+	compat = kzalloc(sizeof(*compat), GFP_KERNEL);
+	if (!compat)
+		return -ENOMEM;
+
+	if (ethcmd == ETHTOOL_FLASHDEV) {
+		if (copy_from_user(&compat->efl, useraddr,
+				   sizeof(compat->efl))) {
+			rc = -EFAULT;
+			goto exit_free;
+		}
+		/* force NUL termination of the user-supplied file name */
+		compat->efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0;
+	}
+
+	rtnl_lock();
+	rc = __dev_ethtool(net, ifr, useraddr, ethcmd, compat);
+	rtnl_unlock();
+	if (rc)
+		goto exit_free;
+
+	if (ethcmd == ETHTOOL_FLASHDEV) {
+		if (compat->devlink)
+			rc = devlink_compat_flash_update(compat->devlink,
+							 compat->efl.data);
+	} else if (ethcmd == ETHTOOL_GDRVINFO) {
+		if (compat->devlink)
+			devlink_compat_running_version(compat->devlink,
+						       compat->info.fw_version,
+						       sizeof(compat->info.fw_version));
+		if (copy_to_user(useraddr, &compat->info,
+				 sizeof(compat->info)))
+			rc = -EFAULT;
+	}
+
+exit_free:
+	if (compat->devlink)
+		devlink_put(compat->devlink);
+	kfree(compat);
+	return rc;
+}
+
+/*
+ * Storage for the match fields filled in by ethtool_rx_flow_rule_create().
+ * The dissector offsets set up there are offsetof() values into this struct.
+ * One instance holds the key values, a second one the corresponding masks.
+ */
+struct ethtool_rx_flow_key {
+ struct flow_dissector_key_basic basic;
+ union {
+ struct flow_dissector_key_ipv4_addrs ipv4;
+ struct flow_dissector_key_ipv6_addrs ipv6;
+ };
+ struct flow_dissector_key_ports tp;
+ struct flow_dissector_key_ip ip;
+ struct flow_dissector_key_vlan vlan;
+ struct flow_dissector_key_eth_addrs eth_addrs;
+} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
+
+/*
+ * Dissector description plus key/mask pair; ethtool_rx_flow_rule_create()
+ * points the rule's match.dissector, match.key and match.mask at these
+ * members.
+ */
+struct ethtool_rx_flow_match {
+ struct flow_dissector dissector;
+ struct ethtool_rx_flow_key key;
+ struct ethtool_rx_flow_key mask;
+};
+
+/*
+ * Translate an ethtool RX flow specification (input->fs) into a flow rule
+ * carrying exactly one action.
+ *
+ * Supported base flow types are ETHER_FLOW, TCP_V4_FLOW, UDP_V4_FLOW,
+ * TCP_V6_FLOW and UDP_V6_FLOW, optionally combined with FLOW_EXT,
+ * FLOW_MAC_EXT and FLOW_RSS.  A field is only copied into the key/mask when
+ * its mask in the specification is non-zero.
+ *
+ * Returns the new rule, ERR_PTR(-ENOMEM) if an allocation fails, or
+ * ERR_PTR(-EINVAL) for any other base flow type.  A returned rule is
+ * released with ethtool_rx_flow_rule_destroy().
+ */
+struct ethtool_rx_flow_rule *
+ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input)
+{
+	const struct ethtool_rx_flow_spec *fs = input->fs;
+	struct ethtool_rx_flow_match *match;
+	struct ethtool_rx_flow_rule *flow;
+	struct flow_action_entry *act;
+
+	/* one zeroed allocation: the rule wrapper followed by the match data */
+	flow = kzalloc(sizeof(struct ethtool_rx_flow_rule) +
+		       sizeof(struct ethtool_rx_flow_match), GFP_KERNEL);
+	if (!flow)
+		return ERR_PTR(-ENOMEM);
+
+	/* ethtool_rx supports only one single action per rule. */
+	flow->rule = flow_rule_alloc(1);
+	if (!flow->rule) {
+		kfree(flow);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	match = (struct ethtool_rx_flow_match *)flow->priv;
+	flow->rule->match.dissector	= &match->dissector;
+	flow->rule->match.mask		= &match->mask;
+	flow->rule->match.key		= &match->key;
+
+	/* full n_proto mask by default; ETHER_FLOW may replace it below */
+	match->mask.basic.n_proto = htons(0xffff);
+
+	switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) {
+	case ETHER_FLOW: {
+		const struct ethhdr *ether_spec, *ether_m_spec;
+
+		ether_spec = &fs->h_u.ether_spec;
+		ether_m_spec = &fs->m_u.ether_spec;
+
+		/*
+		 * NOTE(review): this case fills eth_addrs but does not set
+		 * FLOW_DISSECTOR_KEY_ETH_ADDRS in used_keys (only the
+		 * FLOW_MAC_EXT branch below does) - confirm this is intended.
+		 */
+		if (!is_zero_ether_addr(ether_m_spec->h_source)) {
+			ether_addr_copy(match->key.eth_addrs.src,
+					ether_spec->h_source);
+			ether_addr_copy(match->mask.eth_addrs.src,
+					ether_m_spec->h_source);
+		}
+		if (!is_zero_ether_addr(ether_m_spec->h_dest)) {
+			ether_addr_copy(match->key.eth_addrs.dst,
+					ether_spec->h_dest);
+			ether_addr_copy(match->mask.eth_addrs.dst,
+					ether_m_spec->h_dest);
+		}
+		if (ether_m_spec->h_proto) {
+			match->key.basic.n_proto = ether_spec->h_proto;
+			match->mask.basic.n_proto = ether_m_spec->h_proto;
+		}
+		}
+		break;
+	case TCP_V4_FLOW:
+	case UDP_V4_FLOW: {
+		const struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec;
+
+		match->key.basic.n_proto = htons(ETH_P_IP);
+
+		v4_spec = &fs->h_u.tcp_ip4_spec;
+		v4_m_spec = &fs->m_u.tcp_ip4_spec;
+
+		if (v4_m_spec->ip4src) {
+			match->key.ipv4.src = v4_spec->ip4src;
+			match->mask.ipv4.src = v4_m_spec->ip4src;
+		}
+		if (v4_m_spec->ip4dst) {
+			match->key.ipv4.dst = v4_spec->ip4dst;
+			match->mask.ipv4.dst = v4_m_spec->ip4dst;
+		}
+		if (v4_m_spec->ip4src ||
+		    v4_m_spec->ip4dst) {
+			match->dissector.used_keys |=
+				BIT_ULL(FLOW_DISSECTOR_KEY_IPV4_ADDRS);
+			match->dissector.offset[FLOW_DISSECTOR_KEY_IPV4_ADDRS] =
+				offsetof(struct ethtool_rx_flow_key, ipv4);
+		}
+		if (v4_m_spec->psrc) {
+			match->key.tp.src = v4_spec->psrc;
+			match->mask.tp.src = v4_m_spec->psrc;
+		}
+		if (v4_m_spec->pdst) {
+			match->key.tp.dst = v4_spec->pdst;
+			match->mask.tp.dst = v4_m_spec->pdst;
+		}
+		if (v4_m_spec->psrc ||
+		    v4_m_spec->pdst) {
+			match->dissector.used_keys |=
+				BIT_ULL(FLOW_DISSECTOR_KEY_PORTS);
+			match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] =
+				offsetof(struct ethtool_rx_flow_key, tp);
+		}
+		if (v4_m_spec->tos) {
+			match->key.ip.tos = v4_spec->tos;
+			match->mask.ip.tos = v4_m_spec->tos;
+			/* BIT_ULL like every other used_keys update here */
+			match->dissector.used_keys |=
+				BIT_ULL(FLOW_DISSECTOR_KEY_IP);
+			match->dissector.offset[FLOW_DISSECTOR_KEY_IP] =
+				offsetof(struct ethtool_rx_flow_key, ip);
+		}
+		}
+		break;
+	case TCP_V6_FLOW:
+	case UDP_V6_FLOW: {
+		const struct ethtool_tcpip6_spec *v6_spec, *v6_m_spec;
+
+		match->key.basic.n_proto = htons(ETH_P_IPV6);
+
+		v6_spec = &fs->h_u.tcp_ip6_spec;
+		v6_m_spec = &fs->m_u.tcp_ip6_spec;
+		if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src)) {
+			memcpy(&match->key.ipv6.src, v6_spec->ip6src,
+			       sizeof(match->key.ipv6.src));
+			memcpy(&match->mask.ipv6.src, v6_m_spec->ip6src,
+			       sizeof(match->mask.ipv6.src));
+		}
+		if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) {
+			memcpy(&match->key.ipv6.dst, v6_spec->ip6dst,
+			       sizeof(match->key.ipv6.dst));
+			memcpy(&match->mask.ipv6.dst, v6_m_spec->ip6dst,
+			       sizeof(match->mask.ipv6.dst));
+		}
+		if (!ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6src) ||
+		    !ipv6_addr_any((struct in6_addr *)v6_m_spec->ip6dst)) {
+			match->dissector.used_keys |=
+				BIT_ULL(FLOW_DISSECTOR_KEY_IPV6_ADDRS);
+			match->dissector.offset[FLOW_DISSECTOR_KEY_IPV6_ADDRS] =
+				offsetof(struct ethtool_rx_flow_key, ipv6);
+		}
+		if (v6_m_spec->psrc) {
+			match->key.tp.src = v6_spec->psrc;
+			match->mask.tp.src = v6_m_spec->psrc;
+		}
+		if (v6_m_spec->pdst) {
+			match->key.tp.dst = v6_spec->pdst;
+			match->mask.tp.dst = v6_m_spec->pdst;
+		}
+		if (v6_m_spec->psrc ||
+		    v6_m_spec->pdst) {
+			match->dissector.used_keys |=
+				BIT_ULL(FLOW_DISSECTOR_KEY_PORTS);
+			match->dissector.offset[FLOW_DISSECTOR_KEY_PORTS] =
+				offsetof(struct ethtool_rx_flow_key, tp);
+		}
+		if (v6_m_spec->tclass) {
+			match->key.ip.tos = v6_spec->tclass;
+			match->mask.ip.tos = v6_m_spec->tclass;
+			match->dissector.used_keys |=
+				BIT_ULL(FLOW_DISSECTOR_KEY_IP);
+			match->dissector.offset[FLOW_DISSECTOR_KEY_IP] =
+				offsetof(struct ethtool_rx_flow_key, ip);
+		}
+		}
+		break;
+	default:
+		ethtool_rx_flow_rule_destroy(flow);
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* L4 protocol is implied by the flow type; ETHER_FLOW sets none */
+	switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) {
+	case TCP_V4_FLOW:
+	case TCP_V6_FLOW:
+		match->key.basic.ip_proto = IPPROTO_TCP;
+		match->mask.basic.ip_proto = 0xff;
+		break;
+	case UDP_V4_FLOW:
+	case UDP_V6_FLOW:
+		match->key.basic.ip_proto = IPPROTO_UDP;
+		match->mask.basic.ip_proto = 0xff;
+		break;
+	}
+
+	match->dissector.used_keys |= BIT_ULL(FLOW_DISSECTOR_KEY_BASIC);
+	match->dissector.offset[FLOW_DISSECTOR_KEY_BASIC] =
+		offsetof(struct ethtool_rx_flow_key, basic);
+
+	if (fs->flow_type & FLOW_EXT) {
+		const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext;
+		const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext;
+
+		if (ext_m_spec->vlan_etype) {
+			match->key.vlan.vlan_tpid = ext_h_spec->vlan_etype;
+			match->mask.vlan.vlan_tpid = ext_m_spec->vlan_etype;
+		}
+
+		/* split the TCI: bits 0-11 id, bit 12 DEI, bits 13-15 priority */
+		if (ext_m_spec->vlan_tci) {
+			match->key.vlan.vlan_id =
+				ntohs(ext_h_spec->vlan_tci) & 0x0fff;
+			match->mask.vlan.vlan_id =
+				ntohs(ext_m_spec->vlan_tci) & 0x0fff;
+
+			match->key.vlan.vlan_dei =
+				!!(ext_h_spec->vlan_tci & htons(0x1000));
+			match->mask.vlan.vlan_dei =
+				!!(ext_m_spec->vlan_tci & htons(0x1000));
+
+			match->key.vlan.vlan_priority =
+				(ntohs(ext_h_spec->vlan_tci) & 0xe000) >> 13;
+			match->mask.vlan.vlan_priority =
+				(ntohs(ext_m_spec->vlan_tci) & 0xe000) >> 13;
+		}
+
+		if (ext_m_spec->vlan_etype ||
+		    ext_m_spec->vlan_tci) {
+			match->dissector.used_keys |=
+				BIT_ULL(FLOW_DISSECTOR_KEY_VLAN);
+			match->dissector.offset[FLOW_DISSECTOR_KEY_VLAN] =
+				offsetof(struct ethtool_rx_flow_key, vlan);
+		}
+	}
+	if (fs->flow_type & FLOW_MAC_EXT) {
+		const struct ethtool_flow_ext *ext_h_spec = &fs->h_ext;
+		const struct ethtool_flow_ext *ext_m_spec = &fs->m_ext;
+
+		memcpy(match->key.eth_addrs.dst, ext_h_spec->h_dest,
+		       ETH_ALEN);
+		memcpy(match->mask.eth_addrs.dst, ext_m_spec->h_dest,
+		       ETH_ALEN);
+
+		match->dissector.used_keys |=
+			BIT_ULL(FLOW_DISSECTOR_KEY_ETH_ADDRS);
+		match->dissector.offset[FLOW_DISSECTOR_KEY_ETH_ADDRS] =
+			offsetof(struct ethtool_rx_flow_key, eth_addrs);
+	}
+
+	/* the two special ring cookies select drop/wake; anything else is a queue */
+	act = &flow->rule->action.entries[0];
+	switch (fs->ring_cookie) {
+	case RX_CLS_FLOW_DISC:
+		act->id = FLOW_ACTION_DROP;
+		break;
+	case RX_CLS_FLOW_WAKE:
+		act->id = FLOW_ACTION_WAKE;
+		break;
+	default:
+		act->id = FLOW_ACTION_QUEUE;
+		if (fs->flow_type & FLOW_RSS)
+			act->queue.ctx = input->rss_ctx;
+
+		act->queue.vf = ethtool_get_flow_spec_ring_vf(fs->ring_cookie);
+		act->queue.index = ethtool_get_flow_spec_ring(fs->ring_cookie);
+		break;
+	}
+
+	return flow;
+}
+EXPORT_SYMBOL(ethtool_rx_flow_rule_create);
+
+/*
+ * Free a rule wrapper together with the flow rule it points to.  @flow must
+ * be a valid pointer; it is dereferenced unconditionally.
+ */
+void ethtool_rx_flow_rule_destroy(struct ethtool_rx_flow_rule *flow)
+{
+	void *rule = flow->rule;
+
+	kfree(rule);
+	kfree(flow);
+}
+EXPORT_SYMBOL(ethtool_rx_flow_rule_destroy);
diff --git a/net/ethtool/linkinfo.c b/net/ethtool/linkinfo.c
new file mode 100644
index 000000000000..30b8ce275159
--- /dev/null
+++ b/net/ethtool/linkinfo.c
@@ -0,0 +1,145 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+
+struct linkinfo_req_info {
+	struct ethnl_req_info base;	/* only member: no LINKINFO-specific request data */
+};
+
+struct linkinfo_reply_data {
+	struct ethnl_reply_data base;	/* anchor used by LINKINFO_REPDATA() */
+	struct ethtool_link_ksettings ksettings; /* filled by __ethtool_get_link_ksettings() */
+	struct ethtool_link_settings *lsettings; /* set to &ksettings.base in prepare_data */
+};
+
+#define LINKINFO_REPDATA(__reply_base) \
+	container_of(__reply_base, struct linkinfo_reply_data, base) /* &base -> wrapper */
+
+const struct nla_policy ethnl_linkinfo_get_policy[] = {
+	[ETHTOOL_A_LINKINFO_HEADER] =	/* the only attribute listed for GET */
+		NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int linkinfo_prepare_data(const struct ethnl_req_info *req_base,
+				 struct ethnl_reply_data *reply_base,
+				 const struct genl_info *info)
+{
+	struct linkinfo_reply_data *reply = LINKINFO_REPDATA(reply_base);
+	struct net_device *netdev = reply_base->dev;
+	int err;
+
+	/* Fetch link ksettings between ethnl_ops_begin()/ethnl_ops_complete(). */
+	reply->lsettings = &reply->ksettings.base;
+	err = ethnl_ops_begin(netdev);
+	if (err < 0)
+		return err;
+
+	err = __ethtool_get_link_ksettings(netdev, &reply->ksettings);
+	if (err < 0)
+		GENL_SET_ERR_MSG(info, "failed to retrieve link settings");
+	ethnl_ops_complete(netdev);
+	return err;
+}
+
+static int linkinfo_reply_size(const struct ethnl_req_info *req_base,
+			       const struct ethnl_reply_data *reply_base)
+{
+	/* One u8 attribute each for PORT, PHYADDR, TP_MDIX, TP_MDIX_CTRL and
+	 * TRANSCEIVER; the size does not depend on the reply data.
+	 */
+	const int u8_attr_len = nla_total_size(sizeof(u8));
+
+	return 5 * u8_attr_len;
+}
+
+static int linkinfo_fill_reply(struct sk_buff *skb,
+			       const struct ethnl_req_info *req_base,
+			       const struct ethnl_reply_data *reply_base)
+{
+	const struct ethtool_link_settings *ls;
+	bool failed;
+
+	/* Five values from the cached link settings, each emitted as a u8
+	 * attribute; the first failing put stops the chain (-EMSGSIZE).
+	 */
+	ls = LINKINFO_REPDATA(reply_base)->lsettings;
+	failed = nla_put_u8(skb, ETHTOOL_A_LINKINFO_PORT, ls->port) ||
+		 nla_put_u8(skb, ETHTOOL_A_LINKINFO_PHYADDR, ls->phy_address) ||
+		 nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX, ls->eth_tp_mdix) ||
+		 nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX_CTRL,
+			    ls->eth_tp_mdix_ctrl) ||
+		 nla_put_u8(skb, ETHTOOL_A_LINKINFO_TRANSCEIVER, ls->transceiver);
+	return failed ? -EMSGSIZE : 0;
+}
+
+/* LINKINFO_SET */
+
+const struct nla_policy ethnl_linkinfo_set_policy[] = {
+	[ETHTOOL_A_LINKINFO_HEADER] =
+		NLA_POLICY_NESTED(ethnl_header_policy),
+	[ETHTOOL_A_LINKINFO_PORT] = { .type = NLA_U8 },	/* -> lsettings->port */
+	[ETHTOOL_A_LINKINFO_PHYADDR] = { .type = NLA_U8 },	/* -> lsettings->phy_address */
+	[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL] = { .type = NLA_U8 }, /* -> lsettings->eth_tp_mdix_ctrl */
+};
+
+/* Gate for LINKINFO_SET: the driver must be able to read and write ksettings. */
+static int ethnl_set_linkinfo_validate(struct ethnl_req_info *req_info,
+				       struct genl_info *info)
+{
+	const struct ethtool_ops *eops = req_info->dev->ethtool_ops;
+
+	if (eops->get_link_ksettings && eops->set_link_ksettings)
+		return 1;
+	return -EOPNOTSUPP;
+}
+
+/* Read-modify-write of port, PHY address and MDI-X control from the request.
+ * Returns 0 when the update helpers report no change, 1 after a successful
+ * driver update and a negative errno on failure.
+ */
+static int ethnl_set_linkinfo(struct ethnl_req_info *req_info,
+			      struct genl_info *info)
+{
+	struct ethtool_link_ksettings ks = {};
+	struct ethtool_link_settings *base = &ks.base;
+	struct net_device *netdev = req_info->dev;
+	struct nlattr **attrs = info->attrs;
+	bool changed = false;
+	int err;
+
+	err = __ethtool_get_link_ksettings(netdev, &ks);
+	if (err < 0) {
+		GENL_SET_ERR_MSG(info, "failed to retrieve link settings");
+		return err;
+	}
+
+	ethnl_update_u8(&base->port, attrs[ETHTOOL_A_LINKINFO_PORT], &changed);
+	ethnl_update_u8(&base->phy_address, attrs[ETHTOOL_A_LINKINFO_PHYADDR],
+			&changed);
+	ethnl_update_u8(&base->eth_tp_mdix_ctrl,
+			attrs[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL], &changed);
+	if (!changed)
+		return 0;
+
+	err = netdev->ethtool_ops->set_link_ksettings(netdev, &ks);
+	if (err >= 0)
+		return 1;
+	GENL_SET_ERR_MSG(info, "link settings update failed");
+	return err;
+}
+
+const struct ethnl_request_ops ethnl_linkinfo_request_ops = {
+	.request_cmd = ETHTOOL_MSG_LINKINFO_GET,
+	.reply_cmd = ETHTOOL_MSG_LINKINFO_GET_REPLY,
+	.hdr_attr = ETHTOOL_A_LINKINFO_HEADER,
+	.req_info_size = sizeof(struct linkinfo_req_info),	/* wrapper around ethnl_req_info */
+	.reply_data_size = sizeof(struct linkinfo_reply_data),	/* what LINKINFO_REPDATA() yields */
+
+	.prepare_data = linkinfo_prepare_data,	/* reads ksettings from the driver */
+	.reply_size = linkinfo_reply_size,	/* fixed: five u8 attributes */
+	.fill_reply = linkinfo_fill_reply,	/* emits those five attributes */
+
+	.set_validate = ethnl_set_linkinfo_validate,	/* needs get+set ksettings ops */
+	.set = ethnl_set_linkinfo,			/* read-modify-write of ksettings */
+	.set_ntf_cmd = ETHTOOL_MSG_LINKINFO_NTF,
+};
diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c
new file mode 100644
index 000000000000..259cd9ef1f2a
--- /dev/null
+++ b/net/ethtool/linkmodes.c
@@ -0,0 +1,362 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+/* LINKMODES_GET: request/reply containers and the GET attribute policy */
+
+struct linkmodes_req_info {
+	struct ethnl_req_info base;	/* only member: no LINKMODES-specific request data */
+};
+
+struct linkmodes_reply_data {
+	struct ethnl_reply_data base;	/* anchor used by LINKMODES_REPDATA() */
+	struct ethtool_link_ksettings ksettings; /* filled by __ethtool_get_link_ksettings() */
+	struct ethtool_link_settings *lsettings; /* set to &ksettings.base in prepare_data */
+	bool peer_empty;		/* bitmap_empty(lp_advertising), cached in prepare_data */
+};
+
+#define LINKMODES_REPDATA(__reply_base) \
+	container_of(__reply_base, struct linkmodes_reply_data, base) /* &base -> wrapper */
+
+const struct nla_policy ethnl_linkmodes_get_policy[] = {
+	[ETHTOOL_A_LINKMODES_HEADER] =	/* the only attribute listed for GET */
+		NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int linkmodes_prepare_data(const struct ethnl_req_info *req_base,
+				  struct ethnl_reply_data *reply_base,
+				  const struct genl_info *info)
+{
+	struct linkmodes_reply_data *reply = LINKMODES_REPDATA(reply_base);
+	struct ethtool_link_ksettings *ks = &reply->ksettings;
+	struct net_device *netdev = reply_base->dev;
+	int err;
+
+	reply->lsettings = &ks->base;
+	err = ethnl_ops_begin(netdev);
+	if (err < 0)
+		return err;
+
+	err = __ethtool_get_link_ksettings(netdev, ks);
+	if (err >= 0) {
+		/* Report no lane count when the driver lacks lanes capability. */
+		if (!netdev->ethtool_ops->cap_link_lanes_supported)
+			ks->lanes = 0;
+
+		/* Cached so that size and fill agree on emitting PEER. */
+		reply->peer_empty =
+			bitmap_empty(ks->link_modes.lp_advertising,
+				     __ETHTOOL_LINK_MODE_MASK_NBITS);
+	} else {
+		GENL_SET_ERR_MSG(info, "failed to retrieve link settings");
+	}
+
+	ethnl_ops_complete(netdev);
+	return err;
+}
+
+static int linkmodes_reply_size(const struct ethnl_req_info *req_base,
+				const struct ethnl_reply_data *reply_base)
+{
+	const struct linkmodes_reply_data *reply = LINKMODES_REPDATA(reply_base);
+	const struct ethtool_link_ksettings *ks = &reply->ksettings;
+	const bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+	const int u8_len = nla_total_size(sizeof(u8));
+	const int u32_len = nla_total_size(sizeof(u32));
+	int total, bits;
+
+	/* Fixed part: AUTONEG, DUPLEX, RATE_MATCHING (u8); SPEED, LANES (u32) */
+	total = 3 * u8_len + 2 * u32_len;
+
+	/* Bitset built from our advertising and supported modes */
+	bits = ethnl_bitset_size(ks->link_modes.advertising,
+				 ks->link_modes.supported,
+				 __ETHTOOL_LINK_MODE_MASK_NBITS,
+				 link_mode_names, compact);
+	if (bits < 0)
+		return bits;
+	total += bits;
+
+	/* Link partner bitset is only accounted for when it is not empty */
+	if (!reply->peer_empty) {
+		bits = ethnl_bitset_size(ks->link_modes.lp_advertising,
+					 NULL, __ETHTOOL_LINK_MODE_MASK_NBITS,
+					 link_mode_names, compact);
+		if (bits < 0)
+			return bits;
+		total += bits;
+	}
+
+	/* Master/slave config and state are optional, one u8 each */
+	if (ks->base.master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED)
+		total += u8_len;
+	if (ks->base.master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED)
+		total += u8_len;
+	return total;
+}
+
+static int linkmodes_fill_reply(struct sk_buff *skb,
+				const struct ethnl_req_info *req_base,
+				const struct ethnl_reply_data *reply_base)
+{
+	const struct linkmodes_reply_data *reply = LINKMODES_REPDATA(reply_base);
+	const struct ethtool_link_ksettings *ks = &reply->ksettings;
+	const struct ethtool_link_settings *base = &ks->base;
+	const bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+
+	/* Puts run in a fixed order; any failure is reported as -EMSGSIZE. */
+	if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_AUTONEG, base->autoneg))
+		goto nospace;
+
+	if (ethnl_put_bitset(skb, ETHTOOL_A_LINKMODES_OURS,
+			     ks->link_modes.advertising,
+			     ks->link_modes.supported,
+			     __ETHTOOL_LINK_MODE_MASK_NBITS, link_mode_names,
+			     compact) < 0)
+		goto nospace;
+
+	/* Link partner modes are left out when that bitmap is empty. */
+	if (!reply->peer_empty &&
+	    ethnl_put_bitset(skb, ETHTOOL_A_LINKMODES_PEER,
+			     ks->link_modes.lp_advertising,
+			     NULL, __ETHTOOL_LINK_MODE_MASK_NBITS,
+			     link_mode_names, compact) < 0)
+		goto nospace;
+
+	if (nla_put_u32(skb, ETHTOOL_A_LINKMODES_SPEED, base->speed))
+		goto nospace;
+	if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_DUPLEX, base->duplex))
+		goto nospace;
+
+	if (ks->lanes &&
+	    nla_put_u32(skb, ETHTOOL_A_LINKMODES_LANES, ks->lanes))
+		goto nospace;
+
+	if (base->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED &&
+	    nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG,
+		       base->master_slave_cfg))
+		goto nospace;
+	if (base->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED &&
+	    nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE,
+		       base->master_slave_state))
+		goto nospace;
+	if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_RATE_MATCHING,
+		       base->rate_matching))
+		goto nospace;
+	return 0;
+
+nospace:
+	return -EMSGSIZE;
+}
+
+/* LINKMODES_SET */
+
+const struct nla_policy ethnl_linkmodes_set_policy[] = {
+	[ETHTOOL_A_LINKMODES_HEADER] =
+		NLA_POLICY_NESTED(ethnl_header_policy),
+	[ETHTOOL_A_LINKMODES_AUTONEG] = { .type = NLA_U8 },
+	[ETHTOOL_A_LINKMODES_OURS] = { .type = NLA_NESTED },	/* parsed by ethnl_update_bitset() */
+	[ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_U32 },
+	[ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_U8 },
+	[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_U8 }, /* value checked in ethnl_check_linkmodes() */
+	[ETHTOOL_A_LINKMODES_LANES] = NLA_POLICY_RANGE(NLA_U32, 1, 8), /* must also be a power of 2, see ethnl_check_linkmodes() */
+};
+
+/* Rebuild the advertised modes as every supported mode matching the requested
+ * speed, lanes and duplex. Used when autoneg is on and speed, lanes or duplex
+ * is requested without a link mode change; the ioctl() interface leaves this
+ * to userspace, netlink does it in kernel. Returns true if the bitmap changed.
+ */
+static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings,
+				 bool req_speed, bool req_lanes, bool req_duplex)
+{
+	unsigned long *adv = ksettings->link_modes.advertising;
+	unsigned long *sup = ksettings->link_modes.supported;
+	DECLARE_BITMAP(saved_adv, __ETHTOOL_LINK_MODE_MASK_NBITS);
+	unsigned int bit;
+
+	bitmap_copy(saved_adv, adv, __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+	for (bit = 0; bit < __ETHTOOL_LINK_MODE_MASK_NBITS; bit++) {
+		const struct link_mode_info *lm = &link_mode_params[bit];
+		bool keep;
+
+		if (lm->speed == SPEED_UNKNOWN)
+			continue;	/* such bits keep their current value */
+		keep = test_bit(bit, sup) &&
+		       (!req_speed || lm->speed == ksettings->base.speed) &&
+		       (!req_lanes || lm->lanes == ksettings->lanes) &&
+		       (!req_duplex || lm->duplex == ksettings->base.duplex);
+		if (keep)
+			set_bit(bit, adv);
+		else
+			clear_bit(bit, adv);
+	}
+
+	return !bitmap_equal(saved_adv, adv, __ETHTOOL_LINK_MODE_MASK_NBITS);
+}
+
+/* True only for the four preferred/forced master/slave configuration values. */
+static bool ethnl_validate_master_slave_cfg(u8 cfg)
+{
+	if (cfg == MASTER_SLAVE_CFG_MASTER_PREFERRED)
+		return true;
+	if (cfg == MASTER_SLAVE_CFG_SLAVE_PREFERRED)
+		return true;
+	if (cfg == MASTER_SLAVE_CFG_MASTER_FORCE)
+		return true;
+
+	return cfg == MASTER_SLAVE_CFG_SLAVE_FORCE;
+}
+
+/* Reject an unknown master/slave value or a non-power-of-two lane count. */
+static int ethnl_check_linkmodes(struct genl_info *info, struct nlattr **tb)
+{
+	const struct nlattr *ms_attr = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG];
+	const struct nlattr *lanes_attr = tb[ETHTOOL_A_LINKMODES_LANES];
+
+	/* Both attributes are optional; an absent one is not checked. */
+	if (ms_attr && !ethnl_validate_master_slave_cfg(nla_get_u8(ms_attr))) {
+		NL_SET_ERR_MSG_ATTR(info->extack, ms_attr,
+				    "master/slave value is invalid");
+		return -EOPNOTSUPP;
+	}
+
+	if (lanes_attr && !is_power_of_2(nla_get_u32(lanes_attr))) {
+		NL_SET_ERR_MSG_ATTR(info->extack, lanes_attr,
+				    "lanes value is invalid");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb,
+				  struct ethtool_link_ksettings *ksettings,
+				  bool *mod, const struct net_device *dev)
+{
+	struct ethtool_link_settings *lsettings = &ksettings->base;
+	bool req_speed, req_lanes, req_duplex;
+	const struct nlattr *master_slave_cfg, *lanes_cfg;
+	int ret;
+	/* Merge the LINKMODES_SET attributes in tb into *ksettings; *mod is set on change. */
+	master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG];
+	if (master_slave_cfg) {
+		if (lsettings->master_slave_cfg == MASTER_SLAVE_CFG_UNSUPPORTED) {
+			NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg,
+					    "master/slave configuration not supported by device");
+			return -EOPNOTSUPP;
+		}
+	}
+
+	*mod = false;	/* reset only after the master/slave support check passed */
+	req_speed = tb[ETHTOOL_A_LINKMODES_SPEED];	/* attribute presence, not its value */
+	req_lanes = tb[ETHTOOL_A_LINKMODES_LANES];
+	req_duplex = tb[ETHTOOL_A_LINKMODES_DUPLEX];
+
+	ethnl_update_u8(&lsettings->autoneg, tb[ETHTOOL_A_LINKMODES_AUTONEG],
+			mod);	/* applied first: the lanes checks below read autoneg */
+
+	lanes_cfg = tb[ETHTOOL_A_LINKMODES_LANES];
+	if (lanes_cfg) {
+		/* If autoneg is off and lanes parameter is not supported by the
+		 * driver, return an error.
+		 */
+		if (!lsettings->autoneg &&
+		    !dev->ethtool_ops->cap_link_lanes_supported) {
+			NL_SET_ERR_MSG_ATTR(info->extack, lanes_cfg,
+					    "lanes configuration not supported by device");
+			return -EOPNOTSUPP;
+		}
+	} else if (!lsettings->autoneg && ksettings->lanes) {
+		/* If autoneg is off and lanes parameter is not passed from user but
+		 * it was defined previously then set the lanes parameter to 0.
+		 */
+		ksettings->lanes = 0;
+		*mod = true;
+	}
+
+	ret = ethnl_update_bitset(ksettings->link_modes.advertising,
+				  __ETHTOOL_LINK_MODE_MASK_NBITS,
+				  tb[ETHTOOL_A_LINKMODES_OURS], link_mode_names,
+				  info->extack, mod);
+	if (ret < 0)
+		return ret;
+	ethnl_update_u32(&lsettings->speed, tb[ETHTOOL_A_LINKMODES_SPEED],
+			 mod);
+	ethnl_update_u32(&ksettings->lanes, lanes_cfg, mod);
+	ethnl_update_u8(&lsettings->duplex, tb[ETHTOOL_A_LINKMODES_DUPLEX],
+			mod);
+	ethnl_update_u8(&lsettings->master_slave_cfg, master_slave_cfg, mod);
+	/* No explicit mode list, autoneg on, speed/lanes/duplex given: derive the modes. */
+	if (!tb[ETHTOOL_A_LINKMODES_OURS] && lsettings->autoneg &&
+	    (req_speed || req_lanes || req_duplex) &&
+	    ethnl_auto_linkmodes(ksettings, req_speed, req_lanes, req_duplex))
+		*mod = true;
+
+	return 0;
+}
+
+/* Validate attribute values first, then check the driver has ksettings ops. */
+static int ethnl_set_linkmodes_validate(struct ethnl_req_info *req_info,
+					struct genl_info *info)
+{
+	const struct ethtool_ops *eops = req_info->dev->ethtool_ops;
+	int err = ethnl_check_linkmodes(info, info->attrs);
+
+	if (err < 0)
+		return err;
+
+	/* The set handler reads before it writes, so both ops are needed. */
+	if (eops->get_link_ksettings && eops->set_link_ksettings)
+		return 1;
+	return -EOPNOTSUPP;
+}
+
+/* Read the current ksettings, apply the request to them and hand the result
+ * to the driver if anything changed (0: unchanged, 1: updated, <0: error).
+ */
+static int ethnl_set_linkmodes(struct ethnl_req_info *req_info,
+			       struct genl_info *info)
+{
+	struct net_device *netdev = req_info->dev;
+	struct ethtool_link_ksettings ks = {};
+	bool changed = false;
+	int err;
+
+	err = __ethtool_get_link_ksettings(netdev, &ks);
+	if (err < 0) {
+		GENL_SET_ERR_MSG(info, "failed to retrieve link settings");
+		return err;
+	}
+
+	err = ethnl_update_linkmodes(info, info->attrs, &ks, &changed, netdev);
+	if (err < 0)
+		return err;
+	if (!changed)
+		return 0;	/* nothing to write back */
+
+	err = netdev->ethtool_ops->set_link_ksettings(netdev, &ks);
+	if (err >= 0)
+		return 1;
+	GENL_SET_ERR_MSG(info, "link settings update failed");
+	return err;
+}
+
+const struct ethnl_request_ops ethnl_linkmodes_request_ops = {
+	.request_cmd = ETHTOOL_MSG_LINKMODES_GET,
+	.reply_cmd = ETHTOOL_MSG_LINKMODES_GET_REPLY,
+	.hdr_attr = ETHTOOL_A_LINKMODES_HEADER,
+	.req_info_size = sizeof(struct linkmodes_req_info),	/* wrapper around ethnl_req_info */
+	.reply_data_size = sizeof(struct linkmodes_reply_data),	/* what LINKMODES_REPDATA() yields */
+
+	.prepare_data = linkmodes_prepare_data,	/* reads ksettings, caches peer_empty */
+	.reply_size = linkmodes_reply_size,	/* depends on bitsets and optional attrs */
+	.fill_reply = linkmodes_fill_reply,
+
+	.set_validate = ethnl_set_linkmodes_validate,	/* attribute checks + ops presence */
+	.set = ethnl_set_linkmodes,			/* read-modify-write of ksettings */
+	.set_ntf_cmd = ETHTOOL_MSG_LINKMODES_NTF,
+};
diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c
new file mode 100644
index 000000000000..05a5f72c99fa
--- /dev/null
+++ b/net/ethtool/linkstate.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include <linux/phy.h>
+#include <linux/phylib_stubs.h>
+
+struct linkstate_req_info {
+	struct ethnl_req_info base;	/* only member: no LINKSTATE-specific request data */
+};
+
+struct linkstate_reply_data {
+	struct ethnl_reply_data base;	/* anchor used by LINKSTATE_REPDATA() */
+	int link;	/* __ethtool_get_link() result; only emitted when >= 0 */
+	int sqi;	/* SQI value, or -EOPNOTSUPP/-ENETDOWN when not available */
+	int sqi_max;	/* same convention as sqi, from get_sqi_max */
+	struct ethtool_link_ext_stats link_stats; /* reset by ethtool_stats_init() in prepare_data */
+	bool link_ext_state_provided;	/* true once get_link_ext_state() returned 0 */
+	struct ethtool_link_ext_state_info ethtool_link_ext_state_info; /* filled by the driver callback */
+};
+
+#define LINKSTATE_REPDATA(__reply_base) \
+	container_of(__reply_base, struct linkstate_reply_data, base) /* &base -> wrapper */
+
+const struct nla_policy ethnl_linkstate_get_policy[] = {
+	[ETHTOOL_A_LINKSTATE_HEADER] =	/* header policy variant that permits the stats flag */
+		NLA_POLICY_NESTED(ethnl_header_policy_stats),
+};
+
+/* Query the PHY driver's SQI under phydev->lock, or return a negative errno. */
+static int linkstate_get_sqi(struct phy_device *phydev)
+{
+	int sqi = -EOPNOTSUPP;
+
+	if (!phydev)
+		return sqi;
+
+	mutex_lock(&phydev->lock);
+	if (phydev->drv && phydev->drv->get_sqi) {
+		if (phydev->link)
+			sqi = phydev->drv->get_sqi(phydev);
+		else
+			sqi = -ENETDOWN;
+	}
+	mutex_unlock(&phydev->lock);
+	return sqi;
+}
+
+/* Read the PHY driver's maximum SQI under phydev->lock, or a negative errno. */
+static int linkstate_get_sqi_max(struct phy_device *phydev)
+{
+	int ret;
+
+	if (!phydev)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&phydev->lock);
+	if (!phydev->drv || !phydev->drv->get_sqi_max)
+		ret = -EOPNOTSUPP;	/* no driver bound or no callback */
+	else if (!phydev->link)
+		ret = -ENETDOWN;	/* callback is not invoked without link */
+	else
+		ret = phydev->drv->get_sqi_max(phydev);
+	mutex_unlock(&phydev->lock);
+	return ret;
+}
+
+static bool linkstate_sqi_critical_error(int sqi)
+{
+	return !(sqi >= 0 || sqi == -EOPNOTSUPP || sqi == -ENETDOWN); /* rest is fatal */
+}
+
+static bool linkstate_sqi_valid(struct linkstate_reply_data *data)
+{
+	/* both readings are non-negative and sqi does not exceed sqi_max */
+	return !(data->sqi < 0 || data->sqi_max < 0 || data->sqi > data->sqi_max);
+}
+
+/* Ask the driver for extended link state; flag it as provided only on success. */
+static int linkstate_get_link_ext_state(struct net_device *dev,
+					struct linkstate_reply_data *data)
+{
+	const struct ethtool_ops *eops = dev->ethtool_ops;
+	int rc;
+
+	if (!eops->get_link_ext_state)
+		return -EOPNOTSUPP;
+
+	rc = eops->get_link_ext_state(dev, &data->ethtool_link_ext_state_info);
+	if (!rc)
+		data->link_ext_state_provided = true;
+
+	return rc;
+}
+
+/* Collect link, SQI, extended state and (on request) stats for LINKSTATE_GET. */
+static int linkstate_prepare_data(const struct ethnl_req_info *req_base,
+				  struct ethnl_reply_data *reply_base,
+				  const struct genl_info *info)
+{
+	struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base);
+	struct net_device *dev = reply_base->dev;
+	struct phy_device *phydev;
+	int ret;
+
+	phydev = ethnl_req_get_phydev(req_base, info->attrs,
+				      ETHTOOL_A_LINKSTATE_HEADER, info->extack);
+	/* ops not begun yet: "out" would call ethnl_ops_complete() unpaired */
+	if (IS_ERR(phydev))
+		return PTR_ERR(phydev);
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		return ret;
+	data->link = __ethtool_get_link(dev);
+
+	/* -EOPNOTSUPP and -ENETDOWN are stored as-is; other errors abort */
+	ret = linkstate_get_sqi(phydev);
+	if (linkstate_sqi_critical_error(ret))
+		goto out;
+	data->sqi = ret;
+
+	ret = linkstate_get_sqi_max(phydev);
+	if (linkstate_sqi_critical_error(ret))
+		goto out;
+	data->sqi_max = ret;
+
+	if (dev->flags & IFF_UP) {
+		ret = linkstate_get_link_ext_state(dev, data);
+		if (ret < 0 && ret != -EOPNOTSUPP && ret != -ENODATA)
+			goto out;
+	}
+
+	ethtool_stats_init((u64 *)&data->link_stats,
+			   sizeof(data->link_stats) / 8);
+
+	if (req_base->flags & ETHTOOL_FLAG_STATS) {
+		if (phydev)
+			phy_ethtool_get_link_ext_stats(phydev,
+						       &data->link_stats);
+
+		if (dev->ethtool_ops->get_link_ext_stats)
+			dev->ethtool_ops->get_link_ext_stats(dev,
+							     &data->link_stats);
+	}
+
+	ret = 0;
+out:
+	ethnl_ops_complete(dev);
+	return ret;
+}
+
+static int linkstate_reply_size(const struct ethnl_req_info *req_base,
+				const struct ethnl_reply_data *reply_base)
+{
+	struct linkstate_reply_data *reply = LINKSTATE_REPDATA(reply_base);
+	const int u8_len = nla_total_size(sizeof(u8));
+	const int u32_len = nla_total_size(sizeof(u32));
+	int total = u8_len;	/* LINKSTATE_LINK */
+
+	/* Optional attributes follow. LINK is always reserved above, even
+	 * though the fill side only emits it for a non-negative link value.
+	 */
+	if (linkstate_sqi_valid(reply))
+		total += 2 * u32_len;	/* LINKSTATE_SQI + LINKSTATE_SQI_MAX */
+
+	if (reply->link_ext_state_provided)
+		total += u8_len;	/* LINKSTATE_EXT_STATE */
+
+	if (reply->ethtool_link_ext_state_info.__link_ext_substate)
+		total += u8_len;	/* LINKSTATE_EXT_SUBSTATE */
+
+	if (reply->link_stats.link_down_events != ETHTOOL_STAT_NOT_SET)
+		total += u32_len;	/* LINKSTATE_EXT_DOWN_CNT */
+
+	return total;
+}
+
+static int linkstate_fill_reply(struct sk_buff *skb,
+				const struct ethnl_req_info *req_base,
+				const struct ethnl_reply_data *reply_base)
+{
+	struct linkstate_reply_data *reply = LINKSTATE_REPDATA(reply_base);
+	const struct ethtool_link_ext_state_info *ext =
+		&reply->ethtool_link_ext_state_info;
+
+	/* LINK is only emitted for a non-negative link value. */
+	if (reply->link >= 0 &&
+	    nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!reply->link))
+		return -EMSGSIZE;
+
+	/* SQI and SQI_MAX are emitted as a pair or not at all. */
+	if (linkstate_sqi_valid(reply) &&
+	    (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, reply->sqi) ||
+	     nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, reply->sqi_max)))
+		return -EMSGSIZE;
+
+	if (reply->link_ext_state_provided) {
+		if (nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_STATE,
+			       ext->link_ext_state))
+			return -EMSGSIZE;
+
+		if (ext->__link_ext_substate &&
+		    nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_SUBSTATE,
+			       ext->__link_ext_substate))
+			return -EMSGSIZE;
+	}
+
+	if (reply->link_stats.link_down_events != ETHTOOL_STAT_NOT_SET &&
+	    nla_put_u32(skb, ETHTOOL_A_LINKSTATE_EXT_DOWN_CNT,
+			reply->link_stats.link_down_events))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+const struct ethnl_request_ops ethnl_linkstate_request_ops = {
+	.request_cmd = ETHTOOL_MSG_LINKSTATE_GET,
+	.reply_cmd = ETHTOOL_MSG_LINKSTATE_GET_REPLY,
+	.hdr_attr = ETHTOOL_A_LINKSTATE_HEADER,
+	.req_info_size = sizeof(struct linkstate_req_info),	/* wrapper around ethnl_req_info */
+	.reply_data_size = sizeof(struct linkstate_reply_data),	/* what LINKSTATE_REPDATA() yields */
+
+	.prepare_data = linkstate_prepare_data,	/* link, SQI, ext state, optional stats */
+	.reply_size = linkstate_reply_size,
+	.fill_reply = linkstate_fill_reply,	/* no .set* members: this table is GET-only */
+};
diff --git a/net/ethtool/mm.c b/net/ethtool/mm.c
new file mode 100644
index 000000000000..29bbbc149375
--- /dev/null
+++ b/net/ethtool/mm.c
@@ -0,0 +1,561 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2022-2025 NXP
+ * Copyright 2024 Furong Xu <0x1207@gmail.com>
+ */
+#include "common.h"
+#include "netlink.h"
+
+struct mm_req_info {
+	struct ethnl_req_info base;	/* only member: no MM-specific request data */
+};
+
+struct mm_reply_data {
+	struct ethnl_reply_data base;	/* anchor used by MM_REPDATA() */
+	struct ethtool_mm_state state;	/* filled by the driver's get_mm() */
+	struct ethtool_mm_stats stats;	/* filled by get_mm_stats() when stats are requested */
+};
+
+#define MM_REPDATA(__reply_base) \
+	container_of(__reply_base, struct mm_reply_data, base) /* &base -> wrapper */
+
+#define ETHTOOL_MM_STAT_CNT \
+	(__ETHTOOL_A_MM_STAT_CNT - (ETHTOOL_A_MM_STAT_PAD + 1)) /* attr ids after PAD; used to size the stats nest */
+
+const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1] = {
+	[ETHTOOL_A_MM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), /* stats-capable header policy */
+};
+
+static int mm_prepare_data(const struct ethnl_req_info *req_base,
+			   struct ethnl_reply_data *reply_base,
+			   const struct genl_info *info)
+{
+	struct mm_reply_data *reply = MM_REPDATA(reply_base);
+	struct net_device *netdev = reply_base->dev;
+	const struct ethtool_ops *eops = netdev->ethtool_ops;
+	const bool want_stats = req_base->flags & ETHTOOL_FLAG_STATS;
+	int err;
+
+	/* Without a get_mm() callback there is nothing to report. */
+	if (!eops->get_mm)
+		return -EOPNOTSUPP;
+
+	/* Reset every counter before the driver gets to fill any of them. */
+	ethtool_stats_init((u64 *)&reply->stats,
+			   sizeof(reply->stats) / sizeof(u64));
+
+	err = ethnl_ops_begin(netdev);
+	if (err < 0)
+		return err;
+
+	err = eops->get_mm(netdev, &reply->state);
+
+	/* Statistics are optional: only on request and only after a
+	 * successful get_mm().
+	 */
+	if (!err && want_stats && eops->get_mm_stats)
+		eops->get_mm_stats(netdev, &reply->stats);
+
+	ethnl_ops_complete(netdev);
+	return err;
+}
+
+static int mm_reply_size(const struct ethnl_req_info *req_base,
+			 const struct ethnl_reply_data *reply_base)
+{
+	const int u8_len = nla_total_size(sizeof(u8));
+	const int u32_len = nla_total_size(sizeof(u32));
+	int total;
+
+	/* u8:  _MM_PMAC_ENABLED, _MM_TX_ENABLED, _MM_TX_ACTIVE,
+	 *      _MM_VERIFY_ENABLED, _MM_VERIFY_STATUS
+	 * u32: _MM_VERIFY_TIME, _MM_MAX_VERIFY_TIME,
+	 *      _MM_TX_MIN_FRAG_SIZE, _MM_RX_MIN_FRAG_SIZE
+	 */
+	total = 5 * u8_len + 4 * u32_len;
+
+	/* _MM_STATS nest plus one 64-bit slot per counter, only on request */
+	if (req_base->flags & ETHTOOL_FLAG_STATS)
+		total += nla_total_size(0) +
+			 nla_total_size_64bit(sizeof(u64)) * ETHTOOL_MM_STAT_CNT;
+
+	return total;
+}
+
+/* Emit one 64-bit counter; a value of ETHTOOL_STAT_NOT_SET is skipped. */
+static int mm_put_stat(struct sk_buff *skb, u64 val, u16 attrtype)
+{
+	if (val != ETHTOOL_STAT_NOT_SET &&
+	    nla_put_u64_64bit(skb, attrtype, val, ETHTOOL_A_MM_STAT_PAD))
+		return -EMSGSIZE;
+	return 0;
+}
+
+static int mm_put_stats(struct sk_buff *skb,
+			const struct ethtool_mm_stats *stats)
+{
+	struct nlattr *stats_nest = nla_nest_start(skb, ETHTOOL_A_MM_STATS);
+	bool failed;
+
+	if (!stats_nest)
+		return -EMSGSIZE;
+
+	/* Short-circuit chain: the first failing put stops the rest. */
+	failed = mm_put_stat(skb, stats->MACMergeFrameAssErrorCount,
+			     ETHTOOL_A_MM_STAT_REASSEMBLY_ERRORS) ||
+		 mm_put_stat(skb, stats->MACMergeFrameSmdErrorCount,
+			     ETHTOOL_A_MM_STAT_SMD_ERRORS) ||
+		 mm_put_stat(skb, stats->MACMergeFrameAssOkCount,
+			     ETHTOOL_A_MM_STAT_REASSEMBLY_OK) ||
+		 mm_put_stat(skb, stats->MACMergeFragCountRx,
+			     ETHTOOL_A_MM_STAT_RX_FRAG_COUNT) ||
+		 mm_put_stat(skb, stats->MACMergeFragCountTx,
+			     ETHTOOL_A_MM_STAT_TX_FRAG_COUNT) ||
+		 mm_put_stat(skb, stats->MACMergeHoldCount,
+			     ETHTOOL_A_MM_STAT_HOLD_COUNT);
+	if (failed) {
+		nla_nest_cancel(skb, stats_nest);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, stats_nest);
+	return 0;
+}
+
+static int mm_fill_reply(struct sk_buff *skb,
+			 const struct ethnl_req_info *req_base,
+			 const struct ethnl_reply_data *reply_base)
+{
+	const struct mm_reply_data *reply = MM_REPDATA(reply_base);
+	const struct ethtool_mm_state *st = &reply->state;
+
+	/* State attributes, always emitted and always in this order. */
+	if (nla_put_u8(skb, ETHTOOL_A_MM_TX_ENABLED, st->tx_enabled) ||
+	    nla_put_u8(skb, ETHTOOL_A_MM_TX_ACTIVE, st->tx_active) ||
+	    nla_put_u8(skb, ETHTOOL_A_MM_PMAC_ENABLED, st->pmac_enabled) ||
+	    nla_put_u8(skb, ETHTOOL_A_MM_VERIFY_ENABLED, st->verify_enabled) ||
+	    nla_put_u8(skb, ETHTOOL_A_MM_VERIFY_STATUS, st->verify_status) ||
+	    nla_put_u32(skb, ETHTOOL_A_MM_VERIFY_TIME, st->verify_time) ||
+	    nla_put_u32(skb, ETHTOOL_A_MM_MAX_VERIFY_TIME, st->max_verify_time) ||
+	    nla_put_u32(skb, ETHTOOL_A_MM_TX_MIN_FRAG_SIZE, st->tx_min_frag_size) ||
+	    nla_put_u32(skb, ETHTOOL_A_MM_RX_MIN_FRAG_SIZE, st->rx_min_frag_size))
+		return -EMSGSIZE;
+
+	/* The stats nest comes last, and only when it was asked for. */
+	if (!(req_base->flags & ETHTOOL_FLAG_STATS))
+		return 0;
+	return mm_put_stats(skb, &reply->stats) ? -EMSGSIZE : 0;
+}
+
+const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1] = {
+	[ETHTOOL_A_MM_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+	[ETHTOOL_A_MM_VERIFY_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1),	/* 0 or 1 */
+	[ETHTOOL_A_MM_VERIFY_TIME] = NLA_POLICY_RANGE(NLA_U32, 1, 128), /* also capped by max_verify_time in ethnl_set_mm() */
+	[ETHTOOL_A_MM_TX_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1),		/* 0 or 1 */
+	[ETHTOOL_A_MM_PMAC_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1),	/* 0 or 1 */
+	[ETHTOOL_A_MM_TX_MIN_FRAG_SIZE] = NLA_POLICY_RANGE(NLA_U32, 60, 252), /* accepted range 60..252 */
+};
+
+static void mm_state_to_cfg(const struct ethtool_mm_state *state,
+			    struct ethtool_mm_cfg *cfg)
+{
+	cfg->pmac_enabled = state->pmac_enabled;
+	cfg->tx_enabled = state->tx_enabled;
+	cfg->tx_min_frag_size = state->tx_min_frag_size;
+	cfg->verify_time = state->verify_time;
+
+	/* verify_enabled is copied as-is rather than derived by comparing
+	 * state->verify_status against ETHTOOL_MM_VERIFY_STATUS_DISABLED:
+	 * it is more like an administrative state that ETHTOOL_MSG_MM_GET
+	 * replies should show. For example, a port with verification
+	 * disabled might be in ETHTOOL_MM_VERIFY_STATUS_INITIAL if it's down.
+	 */
+	cfg->verify_enabled = state->verify_enabled;
+}
+
+static int
+ethnl_set_mm_validate(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+	const struct ethtool_ops *eops = req_info->dev->ethtool_ops;
+
+	return (!eops->get_mm || !eops->set_mm) ? -EOPNOTSUPP : 1; /* need both */
+}
+
+static int ethnl_set_mm(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+	struct netlink_ext_ack *extack = info->extack;
+	struct net_device *netdev = req_info->dev;
+	struct nlattr **attrs = info->attrs;
+	struct ethtool_mm_state cur = {};
+	struct ethtool_mm_cfg want = {};
+	bool changed = false;
+	int err;
+
+	/* Start from the current device state, then overlay the request. */
+	err = netdev->ethtool_ops->get_mm(netdev, &cur);
+	if (err)
+		return err;
+	mm_state_to_cfg(&cur, &want);
+
+	ethnl_update_bool(&want.verify_enabled,
+			  attrs[ETHTOOL_A_MM_VERIFY_ENABLED], &changed);
+	ethnl_update_u32(&want.verify_time, attrs[ETHTOOL_A_MM_VERIFY_TIME],
+			 &changed);
+	ethnl_update_bool(&want.tx_enabled, attrs[ETHTOOL_A_MM_TX_ENABLED], &changed);
+	ethnl_update_bool(&want.pmac_enabled, attrs[ETHTOOL_A_MM_PMAC_ENABLED],
+			  &changed);
+	ethnl_update_u32(&want.tx_min_frag_size,
+			 attrs[ETHTOOL_A_MM_TX_MIN_FRAG_SIZE], &changed);
+	if (!changed)
+		return 0;
+
+	/* Consistency checks on the merged configuration. */
+	if (want.verify_time > cur.max_verify_time) {
+		NL_SET_ERR_MSG_ATTR(extack, attrs[ETHTOOL_A_MM_VERIFY_TIME],
+				    "verifyTime exceeds device maximum");
+		return -ERANGE;
+	}
+	if (want.verify_enabled && !want.tx_enabled) {
+		NL_SET_ERR_MSG(extack, "Verification requires TX enabled");
+		return -EINVAL;
+	}
+	if (want.tx_enabled && !want.pmac_enabled) {
+		NL_SET_ERR_MSG(extack, "TX enabled requires pMAC enabled");
+		return -EINVAL;
+	}
+
+	err = netdev->ethtool_ops->set_mm(netdev, &want, extack);
+	if (err < 0)
+		return err;
+	return 1;
+}
+
+const struct ethnl_request_ops ethnl_mm_request_ops = {
+	.request_cmd = ETHTOOL_MSG_MM_GET,
+	.reply_cmd = ETHTOOL_MSG_MM_GET_REPLY,
+	.hdr_attr = ETHTOOL_A_MM_HEADER,
+	.req_info_size = sizeof(struct mm_req_info),	/* wrapper around ethnl_req_info */
+	.reply_data_size = sizeof(struct mm_reply_data), /* what MM_REPDATA() yields */
+
+	.prepare_data = mm_prepare_data,	/* get_mm() plus optional get_mm_stats() */
+	.reply_size = mm_reply_size,
+	.fill_reply = mm_fill_reply,
+
+	.set_validate = ethnl_set_mm_validate,	/* needs get_mm and set_mm */
+	.set = ethnl_set_mm,			/* merge request into current state, then set_mm() */
+	.set_ntf_cmd = ETHTOOL_MSG_MM_NTF,
+};
+
+/* Tell whether the device supports the MAC merge layer (has an eMAC and a
+ * pMAC), judged by get_mm() existing and succeeding. Must be called under
+ * rtnl_lock() and ethnl_ops_begin().
+ */
+bool __ethtool_dev_mm_supported(struct net_device *dev)
+{
+	const struct ethtool_ops *eops = dev->ethtool_ops;
+	struct ethtool_mm_state scratch = {};
+
+	/* The state itself is discarded; only the return code matters. */
+	if (!eops || !eops->get_mm)
+		return false;
+
+	return !eops->get_mm(dev, &scratch);
+}
+
+/* Wrapper around __ethtool_dev_mm_supported(): asserts RTNL and brackets
+ * the query with ethnl_ops_begin()/ethnl_ops_complete(). A device without
+ * ethtool_ops, or one whose ops cannot be begun, counts as unsupported.
+ */
+bool ethtool_dev_mm_supported(struct net_device *dev)
+{
+	bool has_mm;
+
+	ASSERT_RTNL();
+
+	if (!dev->ethtool_ops)
+		return false;
+
+	if (ethnl_ops_begin(dev) < 0)
+		return false;
+
+	has_mm = __ethtool_dev_mm_supported(dev);
+	ethnl_ops_complete(dev);
+
+	return has_mm;
+}
+EXPORT_SYMBOL_GPL(ethtool_dev_mm_supported);
+
+static void ethtool_mmsv_configure_tx(struct ethtool_mmsv *mmsv, bool tx_active)
+{
+	if (!mmsv->ops->configure_tx)
+		return;	/* the hook is optional */
+	mmsv->ops->configure_tx(mmsv, tx_active);
+}
+
+static void ethtool_mmsv_configure_pmac(struct ethtool_mmsv *mmsv, bool pmac_enabled)
+{
+	if (!mmsv->ops->configure_pmac)
+		return;	/* the hook is optional */
+	mmsv->ops->configure_pmac(mmsv, pmac_enabled);
+}
+
+static void ethtool_mmsv_send_mpacket(struct ethtool_mmsv *mmsv, enum ethtool_mpacket mpacket)
+{
+	if (!mmsv->ops->send_mpacket)
+		return;	/* the hook is optional */
+	mmsv->ops->send_mpacket(mmsv, mpacket);
+}
+
+/**
+ * ethtool_mmsv_verify_timer - Timer for MAC Merge verification
+ * @t: timer_list struct containing private info
+ *
+ * Verify the MAC Merge capability in the local TX direction, by
+ * transmitting Verify mPackets up to 3 times. Wait until link
+ * partner responds with a Response mPacket, otherwise fail.
+ */
+static void ethtool_mmsv_verify_timer(struct timer_list *t)
+{
+	struct ethtool_mmsv *mmsv = timer_container_of(mmsv, t, verify_timer);
+	unsigned long flags;
+	bool rearm = false;	/* set only when another Verify mPacket was sent */
+
+	spin_lock_irqsave(&mmsv->lock, flags);	/* state is read and written under mmsv->lock */
+
+	switch (mmsv->status) {
+	case ETHTOOL_MM_VERIFY_STATUS_INITIAL:
+	case ETHTOOL_MM_VERIFY_STATUS_VERIFYING:
+		if (mmsv->verify_retries != 0) {
+			ethtool_mmsv_send_mpacket(mmsv, ETHTOOL_MPACKET_VERIFY);
+			rearm = true;
+		} else {
+			mmsv->status = ETHTOOL_MM_VERIFY_STATUS_FAILED;	/* retries exhausted */
+		}
+		/* NOTE(review): decremented on both paths, even when already 0 - confirm intended */
+		mmsv->verify_retries--;
+		break;
+
+	case ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED:	/* presumably set outside this function - confirm */
+		ethtool_mmsv_configure_tx(mmsv, true);
+		break;
+
+	default:
+		break;	/* any other status: nothing to do, timer is not re-armed */
+	}
+
+	if (rearm) {
+		mod_timer(&mmsv->verify_timer,
+			  jiffies + msecs_to_jiffies(mmsv->verify_time)); /* verify_time is in ms */
+	}
+
+	spin_unlock_irqrestore(&mmsv->lock, flags);
+}
+
+static void ethtool_mmsv_verify_timer_arm(struct ethtool_mmsv *mmsv)
+{
+	if (!mmsv->pmac_enabled || !mmsv->tx_enabled || !mmsv->verify_enabled ||
+	    mmsv->status == ETHTOOL_MM_VERIFY_STATUS_FAILED ||
+	    mmsv->status == ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED)
+		return;	/* verification not wanted, or already concluded */
+	timer_setup(&mmsv->verify_timer, ethtool_mmsv_verify_timer, 0);
+	mod_timer(&mmsv->verify_timer, jiffies);
+}
+
+static void ethtool_mmsv_apply(struct ethtool_mmsv *mmsv)
+{
+	/* With verification disabled, FPE is configured right away ... */
+	if (!mmsv->verify_enabled) {
+		ethtool_mmsv_configure_pmac(mmsv, mmsv->pmac_enabled);
+		ethtool_mmsv_configure_tx(mmsv, mmsv->tx_enabled);
+		return;
+	}
+
+	/* ... otherwise verification restarts and the timer code does it. */
+	mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL;
+	mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES;
+
+	if (netif_running(mmsv->dev))
+		ethtool_mmsv_verify_timer_arm(mmsv);
+}
+
+/**
+ * ethtool_mmsv_stop() - Stop MAC Merge Software Verification
+ * @mmsv: MAC Merge Software Verification state
+ *
+ * Synchronously shuts down the verification timer. Meant for points where
+ * the hardware is about to lose its state anyway, such as ndo_stop() or
+ * suspend(), so that explicitly turning MAC Merge features off would be
+ * superfluous. Otherwise, prefer ethtool_mmsv_link_state_handle(up=false).
+ */
+void ethtool_mmsv_stop(struct ethtool_mmsv *mmsv)
+{
+ timer_shutdown_sync(&mmsv->verify_timer);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_stop);
+
+/**
+ * ethtool_mmsv_link_state_handle() - Inform MAC Merge Software Verification
+ *	of link state changes
+ * @mmsv: MAC Merge Software Verification state
+ * @up: True if device carrier is up and able to pass verification packets
+ *
+ * Expected to be called from task context, with interrupts enabled.
+ */
+void ethtool_mmsv_link_state_handle(struct ethtool_mmsv *mmsv, bool up)
+{
+	unsigned long flags;
+
+	/* Quiesce the verify timer before touching the state it uses */
+	ethtool_mmsv_stop(mmsv);
+
+	spin_lock_irqsave(&mmsv->lock, flags);
+
+	if (!up || !mmsv->pmac_enabled) {
+		/* While the link is down, report verification as not started */
+		if (mmsv->verify_enabled)
+			mmsv->status = ETHTOOL_MM_VERIFY_STATUS_INITIAL;
+
+		/* No link, or pMAC not enabled: switch everything off */
+		ethtool_mmsv_configure_pmac(mmsv, false);
+		ethtool_mmsv_configure_tx(mmsv, false);
+	} else {
+		/* The VERIFY process needs the pMAC enabled when the NIC comes up */
+		ethtool_mmsv_configure_pmac(mmsv, true);
+		/* A new link may mean a new partner: verify all over again */
+		ethtool_mmsv_apply(mmsv);
+	}
+
+	spin_unlock_irqrestore(&mmsv->lock, flags);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_link_state_handle);
+
+/**
+ * ethtool_mmsv_event_handle() - Inform MAC Merge Software Verification
+ *	of interrupt-based events
+ * @mmsv: MAC Merge Software Verification state
+ * @event: Event which took place (packet transmission or reception)
+ *
+ * The caller is expected to run with interrupts disabled.
+ */
+void ethtool_mmsv_event_handle(struct ethtool_mmsv *mmsv,
+			       enum ethtool_mmsv_event event)
+{
+	/* Interrupt context, so a plain spin_lock() is used */
+	spin_lock(&mmsv->lock);
+
+	/* All events are ignored while the pMAC is disabled */
+	if (mmsv->pmac_enabled) {
+		switch (event) {
+		case ETHTOOL_MMSV_LP_SENT_VERIFY_MPACKET:
+			/* Verify mPacket from the link partner: answer it */
+			ethtool_mmsv_send_mpacket(mmsv,
+						  ETHTOOL_MPACKET_RESPONSE);
+			break;
+		case ETHTOOL_MMSV_LD_SENT_VERIFY_MPACKET:
+			/* The local device has sent its verify mPacket */
+			if (mmsv->status != ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED)
+				mmsv->status = ETHTOOL_MM_VERIFY_STATUS_VERIFYING;
+			break;
+		case ETHTOOL_MMSV_LP_SENT_RESPONSE_MPACKET:
+			/* Response mPacket from the link partner */
+			if (mmsv->status == ETHTOOL_MM_VERIFY_STATUS_VERIFYING)
+				mmsv->status = ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED;
+			break;
+		}
+	}
+
+	spin_unlock(&mmsv->lock);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_event_handle);
+
+static bool ethtool_mmsv_is_tx_active(struct ethtool_mmsv *mmsv)
+{
+	/* Administratively disabled TX is never active */
+	if (!mmsv->tx_enabled)
+		return false;
+	/* Otherwise active once verified, or if verification is disabled */
+	return mmsv->status == ETHTOOL_MM_VERIFY_STATUS_SUCCEEDED ||
+	       mmsv->status == ETHTOOL_MM_VERIFY_STATUS_DISABLED;
+}
+
+/**
+ * ethtool_mmsv_get_mm() - get_mm() hook for MAC Merge Software Verification
+ * @mmsv: MAC Merge Software Verification state
+ * @state: see struct ethtool_mm_state
+ *
+ * Copies the current settings and status into @state while holding the
+ * lock. Drivers should call this from their ethtool_ops :: get_mm() method.
+ */
+void ethtool_mmsv_get_mm(struct ethtool_mmsv *mmsv,
+			 struct ethtool_mm_state *state)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&mmsv->lock, flags);
+
+	state->pmac_enabled = mmsv->pmac_enabled;
+	state->tx_enabled = mmsv->tx_enabled;
+	state->verify_enabled = mmsv->verify_enabled;
+	state->verify_time = mmsv->verify_time;
+	state->max_verify_time = ETHTOOL_MM_MAX_VERIFY_TIME_MS;
+	state->verify_status = mmsv->status;
+	state->tx_active = ethtool_mmsv_is_tx_active(mmsv);
+
+	spin_unlock_irqrestore(&mmsv->lock, flags);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_get_mm);
+
+/**
+ * ethtool_mmsv_set_mm() - set_mm() hook for MAC Merge Software Verification
+ * @mmsv: MAC Merge Software Verification state
+ * @cfg: see struct ethtool_mm_cfg
+ *
+ * Stores the new configuration and applies it. Drivers should call this
+ * from their ethtool_ops :: set_mm() method.
+ */
+void ethtool_mmsv_set_mm(struct ethtool_mmsv *mmsv, struct ethtool_mm_cfg *cfg)
+{
+	unsigned long flags;
+
+	/* Let any verification that is currently running come to an end */
+	ethtool_mmsv_stop(mmsv);
+
+	spin_lock_irqsave(&mmsv->lock, flags);
+
+	mmsv->pmac_enabled = cfg->pmac_enabled;
+	mmsv->tx_enabled = cfg->tx_enabled;
+	mmsv->verify_time = cfg->verify_time;
+	mmsv->verify_enabled = cfg->verify_enabled;
+
+	/* With verification on, ethtool_mmsv_apply() sets the status itself */
+	if (!cfg->verify_enabled)
+		mmsv->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED;
+	ethtool_mmsv_apply(mmsv);
+
+	spin_unlock_irqrestore(&mmsv->lock, flags);
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_set_mm);
+
+/**
+ * ethtool_mmsv_init() - Initialize MAC Merge Software Verification state
+ * @mmsv: MAC Merge Software Verification state
+ * @dev: Pointer to network interface
+ * @ops: Methods for implementing the generic functionality
+ *
+ * The MAC Merge Software Verification is a timer- and event-based state
+ * machine for network interfaces lacking a hardware-based TX verification
+ * process (as per IEEE 802.3 clause 99.4.3). The core code manages the
+ * timer, while the driver supplies events by explicitly calling the other
+ * API functions. The verification status starts out as DISABLED.
+ */
+void ethtool_mmsv_init(struct ethtool_mmsv *mmsv, struct net_device *dev,
+		       const struct ethtool_mmsv_ops *ops)
+{
+	spin_lock_init(&mmsv->lock);
+	timer_setup(&mmsv->verify_timer, ethtool_mmsv_verify_timer, 0);
+	mmsv->dev = dev;
+	mmsv->ops = ops;
+	mmsv->status = ETHTOOL_MM_VERIFY_STATUS_DISABLED;
+	mmsv->verify_time = ETHTOOL_MM_MAX_VERIFY_TIME_MS;
+	mmsv->verify_retries = ETHTOOL_MM_MAX_VERIFY_RETRIES;
+}
+EXPORT_SYMBOL_GPL(ethtool_mmsv_init);
diff --git a/net/ethtool/module.c b/net/ethtool/module.c
new file mode 100644
index 000000000000..4d4e0a82579a
--- /dev/null
+++ b/net/ethtool/module.c
@@ -0,0 +1,557 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool.h>
+#include <linux/firmware.h>
+#include <linux/sfp.h>
+#include <net/devlink.h>
+#include <net/netdev_lock.h>
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+#include "module_fw.h"
+
+struct module_req_info { /* MODULE_GET request: nothing beyond the common part */
+ struct ethnl_req_info base;
+};
+
+struct module_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_module_power_mode_params power; /* filled in by module_get_power_mode() */
+};
+
+#define MODULE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct module_reply_data, base) /* base -> enclosing module_reply_data */
+
+/* MODULE_GET */
+
+const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1] = {
+ [ETHTOOL_A_MODULE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), /* the only attribute listed */
+};
+
+/* Query the module power mode via the driver; a missing hook is not an error */
+static int module_get_power_mode(struct net_device *dev,
+				 struct module_reply_data *data,
+				 struct netlink_ext_ack *extack)
+{
+	if (!dev->ethtool_ops->get_module_power_mode)
+		return 0;
+	/* The query is refused while the module firmware is being flashed */
+	if (dev->ethtool->module_fw_flash_in_progress) {
+		NL_SET_ERR_MSG(extack,
+			       "Module firmware flashing is in progress");
+		return -EBUSY;
+	}
+
+	return dev->ethtool_ops->get_module_power_mode(dev, &data->power,
+						       extack);
+}
+
+/* prepare_data callback: fetch the power mode between ethnl_ops_begin()
+ * and ethnl_ops_complete().
+ */
+static int module_prepare_data(const struct ethnl_req_info *req_base,
+			       struct ethnl_reply_data *reply_base,
+			       const struct genl_info *info)
+{
+	struct net_device *dev = reply_base->dev;
+	int err;
+
+	err = ethnl_ops_begin(dev);
+	if (err < 0)
+		return err;
+
+	/* Success or failure, the result is handed back as is */
+	err = module_get_power_mode(dev, MODULE_REPDATA(reply_base),
+				    info->extack);
+	ethnl_ops_complete(dev);
+	return err;
+}
+
+/* reply_size callback: one u8 attribute per non-zero power mode field */
+static int module_reply_size(const struct ethnl_req_info *req_base,
+			     const struct ethnl_reply_data *reply_base)
+{
+	struct module_reply_data *data = MODULE_REPDATA(reply_base);
+	int attrs = 0;
+
+	if (data->power.policy)	/* _MODULE_POWER_MODE_POLICY */
+		attrs++;
+	if (data->power.mode)	/* _MODULE_POWER_MODE */
+		attrs++;
+
+	return attrs * nla_total_size(sizeof(u8));
+}
+
+/* fill_reply callback: emit only the power mode attributes that are non-zero */
+static int module_fill_reply(struct sk_buff *skb,
+			     const struct ethnl_req_info *req_base,
+			     const struct ethnl_reply_data *reply_base)
+{
+	const struct ethtool_module_power_mode_params *power =
+		&MODULE_REPDATA(reply_base)->power;
+
+	if (power->policy && nla_put_u8(skb, ETHTOOL_A_MODULE_POWER_MODE_POLICY,
+					power->policy))
+		return -EMSGSIZE;
+
+	if (power->mode &&
+	    nla_put_u8(skb, ETHTOOL_A_MODULE_POWER_MODE, power->mode))
+		return -EMSGSIZE;
+	return 0;
+}
+
+/* MODULE_SET */
+
+const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1] = {
+ [ETHTOOL_A_MODULE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_MODULE_POWER_MODE_POLICY] = /* u8, restricted to the HIGH..AUTO range */
+ NLA_POLICY_RANGE(NLA_U8, ETHTOOL_MODULE_POWER_MODE_POLICY_HIGH,
+ ETHTOOL_MODULE_POWER_MODE_POLICY_AUTO),
+};
+
+/* Returns 0 if no policy was given, 1 if one can be set, else a -errno */
+static int
+ethnl_set_module_validate(struct ethnl_req_info *req_info,
+			  struct genl_info *info)
+{
+	struct nlattr *policy = info->attrs[ETHTOOL_A_MODULE_POWER_MODE_POLICY];
+	struct net_device *dev = req_info->dev;
+
+	if (!policy)
+		return 0;
+
+	if (dev->ethtool->module_fw_flash_in_progress) {
+		NL_SET_ERR_MSG(info->extack,
+			       "Module firmware flashing is in progress");
+		return -EBUSY;
+	}
+
+	if (!dev->ethtool_ops->get_module_power_mode ||
+	    !dev->ethtool_ops->set_module_power_mode) {
+		NL_SET_ERR_MSG_ATTR(info->extack, policy,
+				    "Setting power mode policy is not supported by this device");
+		return -EOPNOTSUPP;
+	}
+	return 1;
+}
+
+/* Set a new power mode policy unless it already matches the current one.
+ * power_new is zeroed so that ->mode is never handed on uninitialized.
+ */
+static int
+ethnl_set_module(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+	struct ethtool_module_power_mode_params power_new = {};
+	struct ethtool_module_power_mode_params power = {};
+	struct net_device *dev = req_info->dev;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct nlattr **tb = info->attrs;
+	int ret;
+
+	power_new.policy = nla_get_u8(tb[ETHTOOL_A_MODULE_POWER_MODE_POLICY]);
+	ret = ops->get_module_power_mode(dev, &power, info->extack);
+	if (ret < 0)
+		return ret;
+
+	if (power_new.policy == power.policy)
+		return 0;
+	ret = ops->set_module_power_mode(dev, &power_new, info->extack);
+	return ret < 0 ? ret : 1;
+}
+
+const struct ethnl_request_ops ethnl_module_request_ops = {
+ .request_cmd = ETHTOOL_MSG_MODULE_GET,
+ .reply_cmd = ETHTOOL_MSG_MODULE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_MODULE_HEADER,
+ .req_info_size = sizeof(struct module_req_info),
+ .reply_data_size = sizeof(struct module_reply_data),
+ /* MODULE_GET handlers */
+ .prepare_data = module_prepare_data,
+ .reply_size = module_reply_size,
+ .fill_reply = module_fill_reply,
+ /* MODULE_SET handlers and notification command */
+ .set_validate = ethnl_set_module_validate,
+ .set = ethnl_set_module,
+ .set_ntf_cmd = ETHTOOL_MSG_MODULE_NTF,
+};
+
+/* MODULE_FW_FLASH_ACT */
+
+const struct nla_policy
+ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD + 1] = {
+ [ETHTOOL_A_MODULE_FW_FLASH_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME] = { .type = NLA_NUL_STRING }, /* required by module_flash_fw() */
+ [ETHTOOL_A_MODULE_FW_FLASH_PASSWORD] = { .type = NLA_U32 }, /* optional module password */
+};
+
+static LIST_HEAD(module_fw_flash_work_list); /* flashing jobs that have been queued */
+static DEFINE_SPINLOCK(module_fw_flash_work_list_lock); /* taken around every access to the list */
+
+/* Register module_fw unless this port already has a job for the same device */
+static int
+module_flash_fw_work_list_add(struct ethtool_module_fw_flash *module_fw,
+			      struct genl_info *info)
+{
+	struct ethtool_module_fw_flash *cur;
+	int err = 0;
+
+	spin_lock(&module_fw_flash_work_list_lock);
+	list_for_each_entry(cur, &module_fw_flash_work_list, list) {
+		if (cur->fw_update.dev == module_fw->fw_update.dev &&
+		    cur->fw_update.ntf_params.portid == info->snd_portid) {
+			err = -EALREADY;
+			goto unlock;
+		}
+	}
+	list_add_tail(&module_fw->list, &module_fw_flash_work_list);
+unlock:
+	spin_unlock(&module_fw_flash_work_list_lock);
+	return err;
+}
+
+static void module_flash_fw_work_list_del(struct list_head *list)
+{
+ spin_lock(&module_fw_flash_work_list_lock);
+ list_del(list); /* unlink only; module_flash_fw_work() frees the job */
+ spin_unlock(&module_fw_flash_work_list_lock);
+}
+
+/* Work item: run the CMIS firmware update, then tear the job down */
+static void module_flash_fw_work(struct work_struct *work)
+{
+	struct ethtool_module_fw_flash *job =
+		container_of(work, struct ethtool_module_fw_flash, work);
+
+	ethtool_cmis_fw_update(&job->fw_update);
+	/* Done: unlist the job and release everything it was holding */
+	module_flash_fw_work_list_del(&job->list);
+	job->fw_update.dev->ethtool->module_fw_flash_in_progress = false;
+	netdev_put(job->fw_update.dev, &job->dev_tracker);
+	release_firmware(job->fw_update.fw);
+	kfree(job);
+}
+
+#define MODULE_EEPROM_PHYS_ID_PAGE 0 /* EEPROM page the identifier byte is read from */
+#define MODULE_EEPROM_PHYS_ID_I2C_ADDR 0x50 /* I2C address used for that read */
+
+/* Set up the flashing work item if the module type supports flashing */
+static int module_flash_fw_work_init(struct ethtool_module_fw_flash *module_fw,
+				     struct net_device *dev,
+				     struct netlink_ext_ack *extack)
+{
+	u8 phys_id;
+	/* Fetch the SFF-8024 Identifier Value. For all supported standards, it
+	 * is located at I2C address 0x50, byte 0. See section 4.1 in SFF-8024,
+	 * revision 4.9.
+	 */
+	struct ethtool_module_eeprom page_data = {
+		.page = MODULE_EEPROM_PHYS_ID_PAGE,
+		.offset = SFP_PHYS_ID,
+		.length = sizeof(phys_id),
+		.i2c_address = MODULE_EEPROM_PHYS_ID_I2C_ADDR,
+		.data = &phys_id,
+	};
+	int err;
+
+	err = dev->ethtool_ops->get_module_eeprom_by_page(dev, &page_data,
+							  extack);
+	if (err < 0)
+		return err;
+
+	switch (phys_id) {
+	case SFF8024_ID_QSFP_DD:
+	case SFF8024_ID_OSFP:
+	case SFF8024_ID_DSFP:
+	case SFF8024_ID_QSFP_PLUS_CMIS:
+	case SFF8024_ID_SFP_DD_CMIS:
+	case SFF8024_ID_SFP_PLUS_CMIS:
+		break;
+	default:
+		NL_SET_ERR_MSG(extack,
+			       "Module type does not support firmware flashing");
+		return -EOPNOTSUPP;
+	}
+	INIT_WORK(&module_fw->work, module_flash_fw_work);
+	return 0;
+}
+
+void ethnl_module_fw_flash_sock_destroy(struct ethnl_sock_priv *sk_priv)
+{
+	struct ethtool_module_fw_flash *job;
+
+	spin_lock(&module_fw_flash_work_list_lock);
+	list_for_each_entry(job, &module_fw_flash_work_list, list) {
+		if (job->fw_update.dev != sk_priv->dev ||
+		    job->fw_update.ntf_params.portid != sk_priv->portid)
+			continue; /* a job of some other socket or device */
+		job->fw_update.ntf_params.closed_sock = true; /* stops notifications */
+		break;
+	}
+	spin_unlock(&module_fw_flash_work_list_lock);
+}
+
+/* Load the firmware image and queue a flashing job for @dev */
+static int
+module_flash_fw_schedule(struct net_device *dev, const char *file_name,
+			 struct ethtool_module_fw_flash_params *params,
+			 struct sk_buff *skb, struct genl_info *info)
+{
+	struct ethtool_cmis_fw_update_params *fw_update;
+	struct ethtool_module_fw_flash *module_fw;
+	int err;
+
+	module_fw = kzalloc(sizeof(*module_fw), GFP_KERNEL);
+	if (!module_fw)
+		return -ENOMEM;
+
+	fw_update = &module_fw->fw_update;
+	fw_update->params = *params;
+	err = request_firmware_direct(&fw_update->fw, file_name, &dev->dev);
+	if (err) {
+		NL_SET_ERR_MSG(info->extack,
+			       "Failed to request module firmware image");
+		goto err_free;
+	}
+
+	err = module_flash_fw_work_init(module_fw, dev, info->extack);
+	if (err < 0)
+		goto err_release_firmware;
+
+	fw_update->dev = dev;
+	fw_update->ntf_params.portid = info->snd_portid;
+	fw_update->ntf_params.seq = info->snd_seq;
+	fw_update->ntf_params.closed_sock = false;
+
+	err = ethnl_sock_priv_set(skb, dev, fw_update->ntf_params.portid,
+				  ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH);
+	if (err < 0)
+		goto err_release_firmware;
+
+	err = module_flash_fw_work_list_add(module_fw, info);
+	if (err < 0)
+		goto err_release_firmware;
+
+	/* Nothing can fail past this point; module_flash_fw_work() undoes both */
+	dev->ethtool->module_fw_flash_in_progress = true;
+	netdev_hold(dev, &module_fw->dev_tracker, GFP_KERNEL);
+	schedule_work(&module_fw->work);
+	return 0;
+
+err_release_firmware:
+	release_firmware(fw_update->fw);
+err_free:
+	kfree(module_fw);
+	return err;
+}
+
+/* Parse the firmware file name and optional password, then schedule the job */
+static int module_flash_fw(struct net_device *dev, struct nlattr **tb,
+			   struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *password = tb[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD];
+	struct ethtool_module_fw_flash_params params = {};
+	const char *file_name;
+
+	if (GENL_REQ_ATTR_CHECK(info, ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME))
+		return -EINVAL;
+
+	file_name = nla_data(tb[ETHTOOL_A_MODULE_FW_FLASH_FILE_NAME]);
+	if (password) {
+		/* Stored in big-endian form */
+		params.password = cpu_to_be32(nla_get_u32(password));
+		params.password_valid = true;
+	}
+
+	return module_flash_fw_schedule(dev, file_name, &params, skb, info);
+}
+
+/* Refuse flashing unless the driver offers the needed ops and the device is
+ * idle: not being flashed already, administratively down and not split.
+ */
+static int ethnl_module_fw_flash_validate(struct net_device *dev,
+					  struct netlink_ext_ack *extack)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+
+	if (!(ops->set_module_eeprom_by_page &&
+	      ops->get_module_eeprom_by_page)) {
+		NL_SET_ERR_MSG(extack,
+			       "Flashing module firmware is not supported by this device");
+		return -EOPNOTSUPP;
+	}
+
+	if (!ops->reset) {
+		NL_SET_ERR_MSG(extack,
+			       "Reset module is not supported by this device, so flashing is not permitted");
+		return -EOPNOTSUPP;
+	}
+
+	if (dev->ethtool->module_fw_flash_in_progress) {
+		NL_SET_ERR_MSG(extack, "Module firmware flashing already in progress");
+		return -EBUSY;
+	}
+	if (dev->flags & IFF_UP) {
+		NL_SET_ERR_MSG(extack, "Netdevice is up, so flashing is not permitted");
+		return -EBUSY;
+	}
+	if (dev->devlink_port && dev->devlink_port->attrs.split) {
+		NL_SET_ERR_MSG(extack, "Can't perform firmware flashing on a split port");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+/* Handle a module firmware flash request (MODULE_FW_FLASH_ACT) */
+int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ethnl_req_info req_info = {};
+	struct nlattr **tb = info->attrs;
+	struct net_device *dev;
+	int ret;
+
+	ret = ethnl_parse_header_dev_get(&req_info,
+					 tb[ETHTOOL_A_MODULE_FW_FLASH_HEADER],
+					 genl_info_net(info), info->extack,
+					 true);
+	if (ret < 0)
+		return ret;
+	dev = req_info.dev;
+
+	rtnl_lock();
+	netdev_lock_ops(dev);
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		goto out_unlock;
+
+	/* A successful ethnl_ops_begin() is balanced even if validation fails */
+	ret = ethnl_module_fw_flash_validate(dev, info->extack);
+	if (ret >= 0)
+		ret = module_flash_fw(dev, tb, skb, info);
+
+	ethnl_ops_complete(dev);
+
+out_unlock:
+	netdev_unlock_ops(dev);
+	rtnl_unlock();
+	ethnl_parse_header_dev_put(&req_info);
+	return ret;
+}
+
+/* MODULE_FW_FLASH_NTF */
+
+/* Append a STATUS_MSG attribute holding "<err_msg>." or
+ * "<err_msg>, <sub_err_msg>."; does nothing when there is no err_msg.
+ */
+static int
+ethnl_module_fw_flash_ntf_put_err(struct sk_buff *skb, char *err_msg,
+				  char *sub_err_msg)
+{
+	struct nlattr *attr;
+	int total_len;
+
+	if (!err_msg)
+		return 0;
+
+	/* Message text plus the trailing period and NUL. */
+	total_len = strlen(err_msg) + 2;
+	if (sub_err_msg)
+		total_len += strlen(sub_err_msg) + 2; /* For ", ". */
+
+	attr = nla_reserve(skb, ETHTOOL_A_MODULE_FW_FLASH_STATUS_MSG,
+			   total_len);
+	if (!attr)
+		return -ENOMEM;
+
+	if (!sub_err_msg)
+		sprintf(nla_data(attr), "%s.", err_msg);
+	else
+		sprintf(nla_data(attr), "%s, %s.", err_msg, sub_err_msg);
+
+	return 0;
+}
+
+/* Build one MODULE_FW_FLASH_NTF message and unicast it to the port stored
+ * in ntf_params; silently gives up if the message cannot be built.
+ */
+static void
+ethnl_module_fw_flash_ntf(struct net_device *dev,
+			  enum ethtool_module_fw_flash_status status,
+			  struct ethnl_module_fw_flash_ntf_params *ntf_params,
+			  char *err_msg, char *sub_err_msg,
+			  u64 done, u64 total)
+{
+	struct sk_buff *msg;
+	void *hdr;
+
+	/* Nothing is sent once the socket has been flagged as closed */
+	if (ntf_params->closed_sock)
+		return;
+
+	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	hdr = ethnl_unicast_put(msg, ntf_params->portid, ++ntf_params->seq,
+				ETHTOOL_MSG_MODULE_FW_FLASH_NTF);
+	if (!hdr)
+		goto err_free;
+
+	/* Reply header and flashing status come first... */
+	if (ethnl_fill_reply_header(msg, dev,
+				    ETHTOOL_A_MODULE_FW_FLASH_HEADER) < 0 ||
+	    nla_put_u32(msg, ETHTOOL_A_MODULE_FW_FLASH_STATUS, status))
+		goto err_free;
+
+	/* ...then the optional error text... */
+	if (ethnl_module_fw_flash_ntf_put_err(msg, err_msg, sub_err_msg) < 0)
+		goto err_free;
+
+	/* ...and finally the progress counters. */
+	if (nla_put_uint(msg, ETHTOOL_A_MODULE_FW_FLASH_DONE, done) ||
+	    nla_put_uint(msg, ETHTOOL_A_MODULE_FW_FLASH_TOTAL, total))
+		goto err_free;
+
+	genlmsg_end(msg, hdr);
+	genlmsg_unicast(dev_net(dev), msg, ntf_params->portid);
+	return;
+
+err_free:
+	nlmsg_free(msg);
+}
+
+void ethnl_module_fw_flash_ntf_err(struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *params,
+ char *err_msg, char *sub_err_msg)
+{
+ ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_ERROR,
+ params, err_msg, sub_err_msg, 0, 0); /* ERROR status; done and total are 0 */
+}
+
+void
+ethnl_module_fw_flash_ntf_start(struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *params)
+{
+ ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_STARTED,
+ params, NULL, NULL, 0, 0); /* STARTED status; no message, done and total are 0 */
+}
+
+void
+ethnl_module_fw_flash_ntf_complete(struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *params)
+{
+ ethnl_module_fw_flash_ntf(dev, ETHTOOL_MODULE_FW_FLASH_STATUS_COMPLETED,
+ params, NULL, NULL, 0, 0); /* COMPLETED status; no message, done and total are 0 */
+}
+
+void
+ethnl_module_fw_flash_ntf_in_progress(struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *params,
+ u64 done, u64 total)
+{
+ ethnl_module_fw_flash_ntf(dev,
+ ETHTOOL_MODULE_FW_FLASH_STATUS_IN_PROGRESS,
+ params, NULL, NULL, done, total); /* IN_PROGRESS status carrying the caller's counters */
+}
diff --git a/net/ethtool/module_fw.h b/net/ethtool/module_fw.h
new file mode 100644
index 000000000000..634543a12d0c
--- /dev/null
+++ b/net/ethtool/module_fw.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#include <uapi/linux/ethtool.h>
+#include "netlink.h"
+
+/**
+ * struct ethnl_module_fw_flash_ntf_params - module firmware flashing
+ * notifications parameters
+ * @portid: Netlink portid of the sender; notifications are unicast to it.
+ * @seq: Sequence number of the sender; incremented for each notification.
+ * @closed_sock: Whether the socket was closed from user space; if so, nothing is sent.
+ */
+struct ethnl_module_fw_flash_ntf_params {
+ u32 portid;
+ u32 seq;
+ bool closed_sock;
+};
+
+/**
+ * struct ethtool_module_fw_flash_params - module firmware flashing parameters
+ * @password: Module password (big endian). Only valid when @password_valid is set.
+ * @password_valid: Whether the module password is valid or not.
+ */
+struct ethtool_module_fw_flash_params {
+ __be32 password;
+ u8 password_valid:1;
+};
+
+/**
+ * struct ethtool_cmis_fw_update_params - CMIS firmware update specific
+ * parameters
+ * @dev: Pointer to the net_device to be flashed.
+ * @params: Module firmware flashing parameters (the optional password).
+ * @ntf_params: Module firmware flashing notification parameters.
+ * @fw: Firmware image to flash.
+ */
+struct ethtool_cmis_fw_update_params {
+ struct net_device *dev;
+ struct ethtool_module_fw_flash_params params;
+ struct ethnl_module_fw_flash_ntf_params ntf_params;
+ const struct firmware *fw;
+};
+
+/**
+ * struct ethtool_module_fw_flash - module firmware flashing
+ * @list: List node for &module_fw_flash_work_list.
+ * @dev_tracker: Refcount tracker for the net_device in @fw_update.
+ * @work: The flashing firmware work.
+ * @fw_update: CMIS firmware update specific parameters.
+ */
+struct ethtool_module_fw_flash {
+ struct list_head list;
+ netdevice_tracker dev_tracker;
+ struct work_struct work;
+ struct ethtool_cmis_fw_update_params fw_update;
+};
+
+void ethnl_module_fw_flash_sock_destroy(struct ethnl_sock_priv *sk_priv);
+
+void
+ethnl_module_fw_flash_ntf_err(struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *params,
+ char *err_msg, char *sub_err_msg);
+void
+ethnl_module_fw_flash_ntf_start(struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *params);
+void
+ethnl_module_fw_flash_ntf_complete(struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *params);
+void
+ethnl_module_fw_flash_ntf_in_progress(struct net_device *dev,
+ struct ethnl_module_fw_flash_ntf_params *params,
+ u64 done, u64 total);
+
+void ethtool_cmis_fw_update(struct ethtool_cmis_fw_update_params *params);
diff --git a/net/ethtool/mse.c b/net/ethtool/mse.c
new file mode 100644
index 000000000000..6aac004c3ffc
--- /dev/null
+++ b/net/ethtool/mse.c
@@ -0,0 +1,329 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool.h>
+#include <linux/phy.h>
+#include <linux/slab.h>
+
+#include "netlink.h"
+#include "common.h"
+
+/* Snapshot slots to allocate: channels A-D only; WORST and LINK are exclusive alternatives */
+#define PHY_MSE_CHANNEL_COUNT 4
+
+struct mse_req_info { /* MSE_GET request: nothing beyond the common part */
+ struct ethnl_req_info base;
+};
+
+struct mse_snapshot_entry {
+ struct phy_mse_snapshot snapshot; /* filled in by the driver's get_mse_snapshot() */
+ int channel; /* enum phy_mse_channel value the snapshot was taken for */
+};
+
+struct mse_reply_data {
+ struct ethnl_reply_data base;
+ struct phy_mse_capability capability; /* filled in by the driver's get_mse_capability() */
+ struct mse_snapshot_entry *snapshots; /* kcalloc()ed array, freed by mse_cleanup_data() */
+ unsigned int num_snapshots; /* number of entries filled in */
+};
+
+static struct mse_reply_data *
+mse_repdata(const struct ethnl_reply_data *reply_base)
+{
+ return container_of(reply_base, struct mse_reply_data, base); /* base -> enclosing mse_reply_data */
+}
+
+const struct nla_policy ethnl_mse_get_policy[] = {
+ [ETHTOOL_A_MSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy), /* header may carry a PHY index */
+};
+
+/* Take a snapshot of @channel into slot *@idx, but only if @cap_bit is set */
+static int get_snapshot_if_supported(struct phy_device *phydev,
+				     struct mse_reply_data *data,
+				     unsigned int *idx, u32 cap_bit,
+				     enum phy_mse_channel channel)
+{
+	int err;
+
+	if (!(data->capability.supported_caps & cap_bit))
+		return 0;
+	err = phydev->drv->get_mse_snapshot(phydev, channel,
+					    &data->snapshots[*idx].snapshot);
+	if (err)
+		return err;
+	data->snapshots[*idx].channel = channel;
+	(*idx)++;
+	return 0;
+}
+
+/* Collect MSE snapshots in order of preference: the individual channels A-D
+ * first, then the worst channel, and link-wide as the last resort. A lower
+ * priority source is only queried if the ones before it yielded nothing.
+ */
+static int mse_get_channels(struct phy_device *phydev,
+			    struct mse_reply_data *data)
+{
+	/* Per-channel sources, queried in this order */
+	static const struct {
+		u32 cap_bit;
+		enum phy_mse_channel channel;
+	} per_channel[PHY_MSE_CHANNEL_COUNT] = {
+		{ PHY_MSE_CAP_CHANNEL_A, PHY_MSE_CHANNEL_A },
+		{ PHY_MSE_CAP_CHANNEL_B, PHY_MSE_CHANNEL_B },
+		{ PHY_MSE_CAP_CHANNEL_C, PHY_MSE_CHANNEL_C },
+		{ PHY_MSE_CAP_CHANNEL_D, PHY_MSE_CHANNEL_D },
+	};
+	unsigned int count = 0;
+	unsigned int k;
+	int err;
+
+	if (!data->capability.supported_caps)
+		return 0;
+
+	data->snapshots = kcalloc(PHY_MSE_CHANNEL_COUNT,
+				  sizeof(*data->snapshots), GFP_KERNEL);
+	if (!data->snapshots)
+		return -ENOMEM;
+
+	/* Priority 1: Individual channels */
+	for (k = 0; k < PHY_MSE_CHANNEL_COUNT; k++) {
+		err = get_snapshot_if_supported(phydev, data, &count,
+						per_channel[k].cap_bit,
+						per_channel[k].channel);
+		if (err)
+			return err;
+	}
+
+	/* Priority 2: Worst channel, if no individual channels supported. */
+	if (!count) {
+		err = get_snapshot_if_supported(phydev, data, &count,
+						PHY_MSE_CAP_WORST_CHANNEL,
+						PHY_MSE_CHANNEL_WORST);
+		if (err)
+			return err;
+	}
+
+	/* Priority 3: Link-wide, if nothing else is supported. */
+	if (!count) {
+		err = get_snapshot_if_supported(phydev, data, &count,
+						PHY_MSE_CAP_LINK,
+						PHY_MSE_CHANNEL_LINK);
+		if (err)
+			return err;
+	}
+
+	/* Whatever was collected, possibly nothing, is the result */
+	data->num_snapshots = count;
+	return 0;
+}
+
+/* prepare_data callback: read MSE capabilities and snapshots under phydev->lock */
+static int mse_prepare_data(const struct ethnl_req_info *req_base,
+			    struct ethnl_reply_data *reply_base,
+			    const struct genl_info *info)
+{
+	struct mse_reply_data *data = mse_repdata(reply_base);
+	struct net_device *dev = reply_base->dev;
+	struct phy_device *phydev;
+	int ret;
+
+	phydev = ethnl_req_get_phydev(req_base, info->attrs,
+				      ETHTOOL_A_MSE_HEADER, info->extack);
+	if (IS_ERR(phydev))
+		return PTR_ERR(phydev);
+	if (!phydev)
+		return -EOPNOTSUPP;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret)
+		return ret;
+
+	mutex_lock(&phydev->lock);
+	if (!phydev->drv || !phydev->drv->get_mse_capability ||
+	    !phydev->drv->get_mse_snapshot) {
+		ret = -EOPNOTSUPP;
+		goto out_unlock;
+	}
+	if (!phydev->link) {
+		ret = -ENETDOWN;
+		goto out_unlock;
+	}
+	ret = phydev->drv->get_mse_capability(phydev, &data->capability);
+	if (!ret)
+		ret = mse_get_channels(phydev, data);
+
+out_unlock:
+	mutex_unlock(&phydev->lock);
+	ethnl_ops_complete(dev);
+	if (ret) {
+		/* Clear it, so a later mse_cleanup_data() cannot free it again */
+		kfree(data->snapshots);
+		data->snapshots = NULL;
+	}
+	return ret;
+}
+
+/* cleanup_data callback: release the snapshot array */
+static void mse_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+	/* kfree() accepts NULL, so a reply without snapshots is fine too */
+	kfree(mse_repdata(reply_base)->snapshots);
+}
+
+/* reply_size callback: room for the capabilities nest plus one nest per
+ * snapshot. Every value is sized as a u64.
+ */
+static int mse_reply_size(const struct ethnl_req_info *req_base,
+			  const struct ethnl_reply_data *reply_base)
+{
+	const struct mse_reply_data *data = mse_repdata(reply_base);
+	const struct phy_mse_capability *cap = &data->capability;
+	const size_t uint_len = nla_total_size(sizeof(u64));
+	size_t caps_len, snapshot_len;
+
+	/* ETHTOOL_A_MSE_CAPABILITIES nest with the always present
+	 * _REFRESH_RATE_PS and _NUM_SYMBOLS attributes
+	 */
+	caps_len = nla_total_size(0) + 2 * uint_len;
+
+	/* ETHTOOL_A_MSE_CAPABILITIES_MAX_AVERAGE_MSE */
+	if (cap->supported_caps & PHY_MSE_CAP_AVG)
+		caps_len += uint_len;
+	/* ETHTOOL_A_MSE_CAPABILITIES_MAX_PEAK_MSE */
+	if (cap->supported_caps & (PHY_MSE_CAP_PEAK |
+				   PHY_MSE_CAP_WORST_PEAK))
+		caps_len += uint_len;
+
+	/* Per-channel nest (e.g., ETHTOOL_A_MSE_CHANNEL_A / _B / _C /
+	 * _D / _WORST_CHANNEL / _LINK). Its size depends on the
+	 * capabilities only, so it is the same for every snapshot.
+	 */
+	snapshot_len = nla_total_size(0);
+	/* ETHTOOL_A_MSE_SNAPSHOT_AVERAGE_MSE */
+	if (cap->supported_caps & PHY_MSE_CAP_AVG)
+		snapshot_len += uint_len;
+	/* ETHTOOL_A_MSE_SNAPSHOT_PEAK_MSE */
+	if (cap->supported_caps & PHY_MSE_CAP_PEAK)
+		snapshot_len += uint_len;
+	/* ETHTOOL_A_MSE_SNAPSHOT_WORST_PEAK_MSE */
+	if (cap->supported_caps & PHY_MSE_CAP_WORST_PEAK)
+		snapshot_len += uint_len;
+
+	return caps_len + data->num_snapshots * snapshot_len;
+}
+
+/* Map a PHY MSE channel to the netlink attribute type used for its nest */
+static int mse_channel_to_attr(int ch)
+{
+	if (ch == PHY_MSE_CHANNEL_A)
+		return ETHTOOL_A_MSE_CHANNEL_A;
+	if (ch == PHY_MSE_CHANNEL_B)
+		return ETHTOOL_A_MSE_CHANNEL_B;
+	if (ch == PHY_MSE_CHANNEL_C)
+		return ETHTOOL_A_MSE_CHANNEL_C;
+	if (ch == PHY_MSE_CHANNEL_D)
+		return ETHTOOL_A_MSE_CHANNEL_D;
+	if (ch == PHY_MSE_CHANNEL_WORST)
+		return ETHTOOL_A_MSE_WORST_CHANNEL;
+	if (ch == PHY_MSE_CHANNEL_LINK)
+		return ETHTOOL_A_MSE_LINK;
+
+	/* Not a channel that has an attribute */
+	return -EINVAL;
+}
+
+/* fill_reply callback: emit the capabilities nest followed by one nest per
+ * snapshot. Attributes of measurements the PHY does not support are left
+ * out. A nest that cannot be completed is cancelled before the error is
+ * returned.
+ */
+static int mse_fill_reply(struct sk_buff *skb,
+			  const struct ethnl_req_info *req_base,
+			  const struct ethnl_reply_data *reply_base)
+{
+	const struct mse_reply_data *data = mse_repdata(reply_base);
+	const struct phy_mse_capability *cap = &data->capability;
+	struct nlattr *nest;
+	unsigned int i;
+	int ret = 0;
+
+	nest = nla_nest_start(skb, ETHTOOL_A_MSE_CAPABILITIES);
+	if (!nest)
+		return -EMSGSIZE;
+
+	/* The maxima are reported for supported measurements only */
+	if (cap->supported_caps & PHY_MSE_CAP_AVG)
+		ret = nla_put_uint(skb,
+				   ETHTOOL_A_MSE_CAPABILITIES_MAX_AVERAGE_MSE,
+				   cap->max_average_mse);
+	if (ret >= 0 &&
+	    (cap->supported_caps & (PHY_MSE_CAP_PEAK | PHY_MSE_CAP_WORST_PEAK)))
+		ret = nla_put_uint(skb, ETHTOOL_A_MSE_CAPABILITIES_MAX_PEAK_MSE,
+				   cap->max_peak_mse);
+
+	/* Refresh rate and symbol count are always reported */
+	if (ret >= 0)
+		ret = nla_put_uint(skb,
+				   ETHTOOL_A_MSE_CAPABILITIES_REFRESH_RATE_PS,
+				   cap->refresh_rate_ps);
+	if (ret >= 0)
+		ret = nla_put_uint(skb, ETHTOOL_A_MSE_CAPABILITIES_NUM_SYMBOLS,
+				   cap->num_symbols);
+
+	if (ret < 0)
+		goto nla_put_nest_failure;
+
+	nla_nest_end(skb, nest);
+
+	/* Then the snapshots, in the order they were collected */
+	for (i = 0; i < data->num_snapshots; i++) {
+		const struct mse_snapshot_entry *entry = &data->snapshots[i];
+		int chan_attr = mse_channel_to_attr(entry->channel);
+
+		/* A channel without an attribute cannot be reported */
+		if (chan_attr < 0)
+			return chan_attr;
+
+		/* One nest per snapshot, typed after its channel */
+		nest = nla_nest_start(skb, chan_attr);
+		if (!nest)
+			return -EMSGSIZE;
+
+		/* Only the measurements the PHY supports are included */
+		ret = 0;
+		if (cap->supported_caps & PHY_MSE_CAP_AVG)
+			ret = nla_put_uint(skb,
+					   ETHTOOL_A_MSE_SNAPSHOT_AVERAGE_MSE,
+					   entry->snapshot.average_mse);
+		if (!ret && (cap->supported_caps & PHY_MSE_CAP_PEAK))
+			ret = nla_put_uint(skb, ETHTOOL_A_MSE_SNAPSHOT_PEAK_MSE,
+					   entry->snapshot.peak_mse);
+		if (!ret && (cap->supported_caps & PHY_MSE_CAP_WORST_PEAK))
+			ret = nla_put_uint(skb,
+					   ETHTOOL_A_MSE_SNAPSHOT_WORST_PEAK_MSE,
+					   entry->snapshot.worst_peak_mse);
+		if (ret)
+			goto nla_put_nest_failure;
+
+		nla_nest_end(skb, nest);
+	}
+
+	return 0;
+
+nla_put_nest_failure:
+	/* Only the nest that was being built is cancelled */
+	nla_nest_cancel(skb, nest);
+	return ret;
+}
+
+const struct ethnl_request_ops ethnl_mse_request_ops = {
+ .request_cmd = ETHTOOL_MSG_MSE_GET,
+ .reply_cmd = ETHTOOL_MSG_MSE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_MSE_HEADER,
+ .req_info_size = sizeof(struct mse_req_info),
+ .reply_data_size = sizeof(struct mse_reply_data),
+ /* GET handlers only; no set handlers are provided */
+ .prepare_data = mse_prepare_data,
+ .cleanup_data = mse_cleanup_data,
+ .reply_size = mse_reply_size,
+ .fill_reply = mse_fill_reply,
+};
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
new file mode 100644
index 000000000000..6e5f0f4f815a
--- /dev/null
+++ b/net/ethtool/netlink.c
@@ -0,0 +1,1583 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <net/netdev_lock.h>
+#include <net/netdev_queues.h>
+#include <net/sock.h>
+#include <linux/ethtool_netlink.h>
+#include <linux/phy_link_topology.h>
+#include <linux/pm_runtime.h>
+#include "netlink.h"
+#include "module_fw.h"
+
+static struct genl_family ethtool_genl_family;
+
+static bool ethnl_ok __read_mostly;
+static u32 ethnl_bcast_seq;
+
+/* Masks of request header flags accepted by the header policies below: the
+ * basic set, and the basic set extended with ETHTOOL_FLAG_STATS.
+ */
+#define ETHTOOL_FLAGS_BASIC (ETHTOOL_FLAG_COMPACT_BITSETS | \
+ ETHTOOL_FLAG_OMIT_REPLY)
+#define ETHTOOL_FLAGS_STATS (ETHTOOL_FLAGS_BASIC | ETHTOOL_FLAG_STATS)
+
+/* Request header policy: u32 device index, NUL-terminated device name of at
+ * most ALTIFNAMSIZ - 1 characters, and flags limited to ETHTOOL_FLAGS_BASIC.
+ */
+const struct nla_policy ethnl_header_policy[] = {
+ [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 },
+ [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING,
+ .len = ALTIFNAMSIZ - 1 },
+ [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32,
+ ETHTOOL_FLAGS_BASIC),
+};
+
+/* Request header policy whose flags mask is ETHTOOL_FLAGS_STATS, i.e. the
+ * basic flags plus ETHTOOL_FLAG_STATS.
+ */
+const struct nla_policy ethnl_header_policy_stats[] = {
+ [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 },
+ [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING,
+ .len = ALTIFNAMSIZ - 1 },
+ [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32,
+ ETHTOOL_FLAGS_STATS),
+};
+
+/* Request header policy with ETHTOOL_FLAGS_BASIC flags that additionally
+ * accepts a u32 PHY index, which must be at least 1.
+ */
+const struct nla_policy ethnl_header_policy_phy[] = {
+ [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 },
+ [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING,
+ .len = ALTIFNAMSIZ - 1 },
+ [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32,
+ ETHTOOL_FLAGS_BASIC),
+ [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+/* Request header policy with ETHTOOL_FLAGS_STATS flags that additionally
+ * accepts a u32 PHY index, which must be at least 1.
+ */
+const struct nla_policy ethnl_header_policy_phy_stats[] = {
+ [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 },
+ [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING,
+ .len = ALTIFNAMSIZ - 1 },
+ [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32,
+ ETHTOOL_FLAGS_STATS),
+ [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+/* Record @dev, @portid and @type in the ethtool per-socket private data of
+ * the socket that sent @skb.
+ *
+ * Return: 0 on success, or the error carried by the pointer returned from
+ * genl_sk_priv_get().
+ */
+int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid,
+			enum ethnl_sock_type type)
+{
+	struct ethnl_sock_priv *priv =
+		genl_sk_priv_get(&ethtool_genl_family, NETLINK_CB(skb).sk);
+
+	if (!IS_ERR(priv)) {
+		priv->dev = dev;
+		priv->portid = portid;
+		priv->type = type;
+		return 0;
+	}
+
+	return PTR_ERR(priv);
+}
+
+/* Tear down per-socket private data: entries of type
+ * ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH are passed to
+ * ethnl_module_fw_flash_sock_destroy(); nothing is done for other types.
+ */
+static void ethnl_sock_priv_destroy(void *priv)
+{
+	struct ethnl_sock_priv *data = priv;
+
+	if (data->type == ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH)
+		ethnl_module_fw_flash_sock_destroy(data);
+}
+
+/* Advance the broadcast sequence counter and return its new value.
+ * Asserts that RTNL is held.
+ */
+u32 ethnl_bcast_seq_next(void)
+{
+	ASSERT_RTNL();
+
+	ethnl_bcast_seq += 1;
+	return ethnl_bcast_seq;
+}
+
+/* Prepare @dev for an ethtool operation: take a runtime PM reference on the
+ * parent device (if any), check that the device is present and not being
+ * unregistered, and call the driver's ->begin() callback when it has one.
+ * On failure the runtime PM reference is dropped again.
+ *
+ * Return: 0 on success, -ENODEV if @dev is NULL, absent or unregistering,
+ * or the non-zero value returned by ->begin().
+ */
+int ethnl_ops_begin(struct net_device *dev)
+{
+	int ret = 0;
+
+	if (!dev)
+		return -ENODEV;
+
+	if (dev->dev.parent)
+		pm_runtime_get_sync(dev->dev.parent);
+
+	netdev_ops_assert_locked(dev);
+
+	if (!netif_device_present(dev) ||
+	    dev->reg_state >= NETREG_UNREGISTERING)
+		ret = -ENODEV;
+	else if (dev->ethtool_ops->begin)
+		ret = dev->ethtool_ops->begin(dev);
+
+	/* undo the runtime PM reference taken above when anything failed */
+	if (ret && dev->dev.parent)
+		pm_runtime_put(dev->dev.parent);
+
+	return ret;
+}
+
+/* Finish an ethtool operation on @dev: call the driver's ->complete()
+ * callback when it has one, then drop the runtime PM reference on the parent
+ * device (if any).
+ */
+void ethnl_ops_complete(struct net_device *dev)
+{
+	const struct ethtool_ops *eops = dev->ethtool_ops;
+
+	if (eops->complete)
+		eops->complete(dev);
+
+	if (dev->dev.parent)
+		pm_runtime_put(dev->dev.parent);
+}
+
+/**
+ * ethnl_parse_header_dev_get() - parse request header
+ * @req_info: structure to put results into
+ * @header: nest attribute with request header
+ * @net: request netns
+ * @extack: netlink extack for error reporting
+ * @require_dev: fail if no device identified in header
+ *
+ * Parse request header in nested attribute @header and put the results into
+ * the structure pointed to by @req_info. @extack is used for error
+ * reporting. If req_info->dev is not null on return, reference to it has
+ * been taken. If error is returned, no reference is held and the dev, flags
+ * and phy_index members of *req_info have not been written. If @header is
+ * NULL and @require_dev is false, 0 is returned and *req_info is left
+ * untouched.
+ *
+ * Return: 0 on success or negative error code
+ */
+int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
+ const struct nlattr *header, struct net *net,
+ struct netlink_ext_ack *extack, bool require_dev)
+{
+ /* sized by the PHY-aware policy so ETHTOOL_A_HEADER_PHY_INDEX fits */
+ struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy_phy)];
+ const struct nlattr *devname_attr;
+ struct net_device *dev = NULL;
+ u32 flags = 0;
+ int ret;
+
+ if (!header) {
+ if (!require_dev)
+ return 0;
+ NL_SET_ERR_MSG(extack, "request header missing");
+ return -EINVAL;
+ }
+ /* No validation here, command policy should have a nested policy set
+ * for the header, therefore validation should have already been done.
+ */
+ ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy_phy) - 1, header,
+ NULL, extack);
+ if (ret < 0)
+ return ret;
+ if (tb[ETHTOOL_A_HEADER_FLAGS])
+ flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]);
+
+ /* the ifindex takes precedence; the name is only a cross-check then */
+ devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME];
+ if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) {
+ u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]);
+
+ dev = netdev_get_by_index(net, ifindex, &req_info->dev_tracker,
+ GFP_KERNEL);
+ if (!dev) {
+ NL_SET_ERR_MSG_ATTR(extack,
+ tb[ETHTOOL_A_HEADER_DEV_INDEX],
+ "no device matches ifindex");
+ return -ENODEV;
+ }
+ /* if both ifindex and ifname are passed, they must match */
+ if (devname_attr &&
+ strncmp(dev->name, nla_data(devname_attr), IFNAMSIZ)) {
+ netdev_put(dev, &req_info->dev_tracker);
+ NL_SET_ERR_MSG_ATTR(extack, header,
+ "ifindex and name do not match");
+ return -ENODEV;
+ }
+ } else if (devname_attr) {
+ dev = netdev_get_by_name(net, nla_data(devname_attr),
+ &req_info->dev_tracker, GFP_KERNEL);
+ if (!dev) {
+ NL_SET_ERR_MSG_ATTR(extack, devname_attr,
+ "no device matches name");
+ return -ENODEV;
+ }
+ } else if (require_dev) {
+ NL_SET_ERR_MSG_ATTR(extack, header,
+ "neither ifindex nor name specified");
+ return -EINVAL;
+ }
+
+ /* a PHY index is only meaningful relative to a device; dev is NULL on
+ * the error path below, so there is no reference to drop
+ */
+ if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) {
+ if (dev) {
+ req_info->phy_index = nla_get_u32(tb[ETHTOOL_A_HEADER_PHY_INDEX]);
+ } else {
+ NL_SET_ERR_MSG_ATTR(extack, header,
+ "phy_index set without a netdev");
+ return -EINVAL;
+ }
+ }
+
+ req_info->dev = dev;
+ req_info->flags = flags;
+ return 0;
+}
+
+/* Resolve the PHY device a request refers to. Asserts that RTNL is held.
+ *
+ * Return: NULL when the request has no device; the device's own phydev when
+ * no PHY index was given; otherwise the result of looking the index up with
+ * phy_link_topo_get_phy(). When that lookup fails and @tb is not NULL, an
+ * extack message is attached to tb[@header] and ERR_PTR(-ENODEV) is
+ * returned; with a NULL @tb the failed lookup yields NULL.
+ */
+struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info,
+					struct nlattr **tb, unsigned int header,
+					struct netlink_ext_ack *extack)
+{
+	struct phy_device *phy;
+
+	ASSERT_RTNL();
+
+	if (!req_info->dev)
+		return NULL;
+
+	if (req_info->phy_index == 0)
+		return req_info->dev->phydev;
+
+	phy = phy_link_topo_get_phy(req_info->dev, req_info->phy_index);
+	if (phy || !tb)
+		return phy;
+
+	NL_SET_ERR_MSG_ATTR(extack, tb[header], "no phy matching phyindex");
+	return ERR_PTR(-ENODEV);
+}
+
+/**
+ * ethnl_fill_reply_header() - Put common header into a reply message
+ * @skb: skb with the message
+ * @dev: network device to describe in header (nothing is written if NULL)
+ * @attrtype: attribute type to use for the nest
+ *
+ * Open a nest of type @attrtype and put the ifindex and name of @dev into
+ * it. If any attribute does not fit, the nest is cancelled.
+ *
+ * Return: 0 on success, error value (-EMSGSIZE only) on error
+ */
+int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev,
+			    u16 attrtype)
+{
+	struct nlattr *hdr;
+
+	if (!dev)
+		return 0;
+
+	hdr = nla_nest_start(skb, attrtype);
+	if (!hdr)
+		return -EMSGSIZE;
+
+	/* If more attributes are put into reply header, ethnl_header_size()
+	 * must be updated to account for them.
+	 */
+	if (!nla_put_u32(skb, ETHTOOL_A_HEADER_DEV_INDEX, (u32)dev->ifindex) &&
+	    !nla_put_string(skb, ETHTOOL_A_HEADER_DEV_NAME, dev->name)) {
+		nla_nest_end(skb, hdr);
+		return 0;
+	}
+
+	nla_nest_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+/**
+ * ethnl_reply_init() - Create skb for a reply and fill device identification
+ * @payload: payload length (without netlink and genetlink header)
+ * @dev: device the reply is about (may be null)
+ * @cmd: ETHTOOL_MSG_* message type for reply
+ * @hdr_attrtype: attribute type for common header
+ * @info: genetlink info of the received packet we respond to
+ * @ehdrp: place to store payload pointer returned by genlmsg_put_reply()
+ *
+ * On failure the skb (if already allocated) is freed and, when @info is not
+ * NULL, an extack message is set on it.
+ *
+ * Return: pointer to allocated skb on success, NULL on error
+ */
+struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd,
+ u16 hdr_attrtype, struct genl_info *info,
+ void **ehdrp)
+{
+ struct sk_buff *skb;
+
+ skb = genlmsg_new(payload, GFP_KERNEL);
+ if (!skb)
+ goto err;
+ *ehdrp = genlmsg_put_reply(skb, info, &ethtool_genl_family, 0, cmd);
+ if (!*ehdrp)
+ goto err_free;
+
+ /* the common header nest is only emitted for device-bound replies */
+ if (dev) {
+ int ret;
+
+ ret = ethnl_fill_reply_header(skb, dev, hdr_attrtype);
+ if (ret < 0)
+ goto err_free;
+ }
+ return skb;
+
+err_free:
+ nlmsg_free(skb);
+err:
+ if (info)
+ GENL_SET_ERR_MSG(info, "failed to setup reply message");
+ return NULL;
+}
+
+/* Start an ethtool genetlink message of type @cmd in @skb, addressed to the
+ * port id and sequence number of the dump request held in @cb. Returns what
+ * genlmsg_put() returns.
+ */
+void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd)
+{
+	u32 portid = NETLINK_CB(cb->skb).portid;
+	u32 seq = cb->nlh->nlmsg_seq;
+
+	return genlmsg_put(skb, portid, seq, &ethtool_genl_family, 0, cmd);
+}
+
+/* Start an ethtool genetlink message of type @cmd in @skb with port id 0 and
+ * the next broadcast sequence number (the counter is incremented first).
+ * Returns what genlmsg_put() returns.
+ */
+void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd)
+{
+	u32 seq = ++ethnl_bcast_seq;
+
+	return genlmsg_put(skb, 0, seq, &ethtool_genl_family, 0, cmd);
+}
+
+/* Start an ethtool genetlink message of type @cmd in @skb for the given
+ * @portid and @seq. Returns what genlmsg_put() returns.
+ */
+void *ethnl_unicast_put(struct sk_buff *skb, u32 portid, u32 seq, u8 cmd)
+{
+ return genlmsg_put(skb, portid, seq, &ethtool_genl_family, 0, cmd);
+}
+
+/* Multicast @skb to the ETHNL_MCGRP_MONITOR group in the network namespace
+ * of @dev. Returns the result of genlmsg_multicast_netns().
+ */
+int ethnl_multicast(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net *net = dev_net(dev);
+
+	return genlmsg_multicast_netns(&ethtool_genl_family, net, skb, 0,
+				       ETHNL_MCGRP_MONITOR, GFP_KERNEL);
+}
+
+/* GET request helpers */
+
+/**
+ * struct ethnl_dump_ctx - context structure for generic dumpit() callback
+ * @ops: request ops of currently processed message type
+ * @req_info: parsed request header of processed request
+ * @reply_data: data needed to compose the reply
+ * @pos_ifindex: saved iteration position - ifindex, used as the cursor of
+ *               for_each_netdev_dump()
+ *
+ * These parameters are kept in struct netlink_callback as context preserved
+ * between iterations. They are initialized by ethnl_default_start() and used
+ * in ethnl_default_dumpit() and ethnl_default_done(). @req_info and
+ * @reply_data are allocated by ethnl_default_start() and freed by
+ * ethnl_default_done().
+ */
+struct ethnl_dump_ctx {
+ const struct ethnl_request_ops *ops;
+ struct ethnl_req_info *req_info;
+ struct ethnl_reply_data *reply_data;
+ unsigned long pos_ifindex;
+};
+
+/**
+ * struct ethnl_perphy_dump_ctx - context for dumpit() PHY-aware callbacks
+ * @ethnl_ctx: generic ethnl context
+ * @ifindex: For Filtered DUMP requests, the ifindex of the targeted netdev;
+ *           ethnl_perphy_dumpit() dumps all netdevs when this is 0
+ * @pos_phyindex: iterator position for multi-msg DUMP, used as the xarray
+ *                cursor over the PHYs of one netdev
+ */
+struct ethnl_perphy_dump_ctx {
+ struct ethnl_dump_ctx ethnl_ctx;
+ unsigned int ifindex;
+ unsigned long pos_phyindex;
+};
+
+/* Request ops indexed by ETHTOOL_MSG_* user command. The _GET and _SET
+ * commands of one feature share a single ops instance. Commands without an
+ * entry are NULL, which the default handlers report with WARN_ONCE() and
+ * -EOPNOTSUPP.
+ */
+static const struct ethnl_request_ops *
+ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
+ [ETHTOOL_MSG_STRSET_GET] = &ethnl_strset_request_ops,
+ [ETHTOOL_MSG_LINKINFO_GET] = &ethnl_linkinfo_request_ops,
+ [ETHTOOL_MSG_LINKINFO_SET] = &ethnl_linkinfo_request_ops,
+ [ETHTOOL_MSG_LINKMODES_GET] = &ethnl_linkmodes_request_ops,
+ [ETHTOOL_MSG_LINKMODES_SET] = &ethnl_linkmodes_request_ops,
+ [ETHTOOL_MSG_LINKSTATE_GET] = &ethnl_linkstate_request_ops,
+ [ETHTOOL_MSG_DEBUG_GET] = &ethnl_debug_request_ops,
+ [ETHTOOL_MSG_DEBUG_SET] = &ethnl_debug_request_ops,
+ [ETHTOOL_MSG_WOL_GET] = &ethnl_wol_request_ops,
+ [ETHTOOL_MSG_WOL_SET] = &ethnl_wol_request_ops,
+ [ETHTOOL_MSG_FEATURES_GET] = &ethnl_features_request_ops,
+ [ETHTOOL_MSG_PRIVFLAGS_GET] = &ethnl_privflags_request_ops,
+ [ETHTOOL_MSG_PRIVFLAGS_SET] = &ethnl_privflags_request_ops,
+ [ETHTOOL_MSG_RINGS_GET] = &ethnl_rings_request_ops,
+ [ETHTOOL_MSG_RINGS_SET] = &ethnl_rings_request_ops,
+ [ETHTOOL_MSG_CHANNELS_GET] = &ethnl_channels_request_ops,
+ [ETHTOOL_MSG_CHANNELS_SET] = &ethnl_channels_request_ops,
+ [ETHTOOL_MSG_COALESCE_GET] = &ethnl_coalesce_request_ops,
+ [ETHTOOL_MSG_COALESCE_SET] = &ethnl_coalesce_request_ops,
+ [ETHTOOL_MSG_PAUSE_GET] = &ethnl_pause_request_ops,
+ [ETHTOOL_MSG_PAUSE_SET] = &ethnl_pause_request_ops,
+ [ETHTOOL_MSG_EEE_GET] = &ethnl_eee_request_ops,
+ [ETHTOOL_MSG_EEE_SET] = &ethnl_eee_request_ops,
+ [ETHTOOL_MSG_FEC_GET] = &ethnl_fec_request_ops,
+ [ETHTOOL_MSG_FEC_SET] = &ethnl_fec_request_ops,
+ [ETHTOOL_MSG_TSINFO_GET] = &ethnl_tsinfo_request_ops,
+ [ETHTOOL_MSG_MODULE_EEPROM_GET] = &ethnl_module_eeprom_request_ops,
+ [ETHTOOL_MSG_STATS_GET] = &ethnl_stats_request_ops,
+ [ETHTOOL_MSG_PHC_VCLOCKS_GET] = &ethnl_phc_vclocks_request_ops,
+ [ETHTOOL_MSG_MODULE_GET] = &ethnl_module_request_ops,
+ [ETHTOOL_MSG_MODULE_SET] = &ethnl_module_request_ops,
+ [ETHTOOL_MSG_PSE_GET] = &ethnl_pse_request_ops,
+ [ETHTOOL_MSG_PSE_SET] = &ethnl_pse_request_ops,
+ [ETHTOOL_MSG_RSS_GET] = &ethnl_rss_request_ops,
+ [ETHTOOL_MSG_RSS_SET] = &ethnl_rss_request_ops,
+ [ETHTOOL_MSG_PLCA_GET_CFG] = &ethnl_plca_cfg_request_ops,
+ [ETHTOOL_MSG_PLCA_SET_CFG] = &ethnl_plca_cfg_request_ops,
+ [ETHTOOL_MSG_PLCA_GET_STATUS] = &ethnl_plca_status_request_ops,
+ [ETHTOOL_MSG_MM_GET] = &ethnl_mm_request_ops,
+ [ETHTOOL_MSG_MM_SET] = &ethnl_mm_request_ops,
+ [ETHTOOL_MSG_TSCONFIG_GET] = &ethnl_tsconfig_request_ops,
+ [ETHTOOL_MSG_TSCONFIG_SET] = &ethnl_tsconfig_request_ops,
+ [ETHTOOL_MSG_PHY_GET] = &ethnl_phy_request_ops,
+ [ETHTOOL_MSG_MSE_GET] = &ethnl_mse_request_ops,
+};
+
+/* View the scratch area cb->ctx as the generic ethtool dump context */
+static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
+{
+	void *raw = cb->ctx;
+
+	return raw;
+}
+
+/* View the scratch area cb->ctx as the per-PHY ethtool dump context */
+static struct ethnl_perphy_dump_ctx *
+ethnl_perphy_dump_context(struct netlink_callback *cb)
+{
+	void *raw = cb->ctx;
+
+	return raw;
+}
+
+/**
+ * ethnl_default_parse() - Parse request message
+ * @req_info: pointer to structure to put data into
+ * @info: genl_info from the request
+ * @request_ops: struct request_ops for request type
+ * @require_dev: fail if no device identified in header
+ *
+ * Parse the common request header first, then let the request specific
+ * ->parse_request() callback (if defined) handle the rest of the message.
+ * When that callback fails, the device reference taken while parsing the
+ * header is dropped and req_info->dev is cleared.
+ *
+ * Return: 0 on success or negative error code
+ */
+static int ethnl_default_parse(struct ethnl_req_info *req_info,
+			       const struct genl_info *info,
+			       const struct ethnl_request_ops *request_ops,
+			       bool require_dev)
+{
+	struct nlattr **attrs = info->attrs;
+	int err;
+
+	err = ethnl_parse_header_dev_get(req_info,
+					 attrs[request_ops->hdr_attr],
+					 genl_info_net(info), info->extack,
+					 require_dev);
+	if (err < 0)
+		return err;
+
+	if (!request_ops->parse_request)
+		return 0;
+
+	err = request_ops->parse_request(req_info, attrs, info->extack);
+	if (err >= 0)
+		return 0;
+
+	netdev_put(req_info->dev, &req_info->dev_tracker);
+	req_info->dev = NULL;
+	return err;
+}
+
+/**
+ * ethnl_init_reply_data() - Initialize reply data for GET request
+ * @reply_data: pointer to embedded struct ethnl_reply_data
+ * @ops: instance of struct ethnl_request_ops describing the layout
+ * @dev: network device to initialize the reply for
+ *
+ * Fills the reply data part with zeros and sets the dev member. Must be called
+ * before calling the ->fill_reply() callback (for each iteration when handling
+ * dump requests).
+ */
+static void ethnl_init_reply_data(struct ethnl_reply_data *reply_data,
+ const struct ethnl_request_ops *ops,
+ struct net_device *dev)
+{
+ /* clear the whole request specific structure (ops->reply_data_size
+ * bytes), not just the embedded base, before setting dev
+ */
+ memset(reply_data, 0, ops->reply_data_size);
+ reply_data->dev = dev;
+}
+
+/* default ->doit() handler for GET type requests
+ *
+ * Looks up the request ops for the command, parses the request, gathers the
+ * data with ->prepare_data() under RTNL (and the device ops lock when a
+ * device is given), sizes and fills the reply and sends it with
+ * genlmsg_reply(). Returns 0 or a negative error code.
+ */
+static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ethnl_reply_data *reply_data = NULL;
+ struct ethnl_req_info *req_info = NULL;
+ const u8 cmd = info->genlhdr->cmd;
+ const struct ethnl_request_ops *ops;
+ int hdr_len, reply_len;
+ struct sk_buff *rskb;
+ void *reply_payload;
+ int ret;
+
+ ops = ethnl_default_requests[cmd];
+ if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd))
+ return -EOPNOTSUPP;
+ if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr))
+ return -EINVAL;
+
+ req_info = kzalloc(ops->req_info_size, GFP_KERNEL);
+ if (!req_info)
+ return -ENOMEM;
+ /* not zeroed here; ethnl_init_reply_data() clears it after parsing */
+ reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL);
+ if (!reply_data) {
+ kfree(req_info);
+ return -ENOMEM;
+ }
+
+ ret = ethnl_default_parse(req_info, info, ops, !ops->allow_nodev_do);
+ if (ret < 0)
+ goto err_free;
+ ethnl_init_reply_data(reply_data, ops, req_info->dev);
+
+ rtnl_lock();
+ if (req_info->dev)
+ netdev_lock_ops(req_info->dev);
+ ret = ops->prepare_data(req_info, reply_data, info);
+ if (req_info->dev)
+ netdev_unlock_ops(req_info->dev);
+ rtnl_unlock();
+ /* ->cleanup_data() is skipped when ->prepare_data() itself failed */
+ if (ret < 0)
+ goto err_dev;
+ ret = ops->reply_size(req_info, reply_data);
+ if (ret < 0)
+ goto err_cleanup;
+ reply_len = ret;
+ /* error code reported if ethnl_reply_init() fails */
+ ret = -ENOMEM;
+ rskb = ethnl_reply_init(reply_len + ethnl_reply_header_size(),
+ req_info->dev, ops->reply_cmd,
+ ops->hdr_attr, info, &reply_payload);
+ if (!rskb)
+ goto err_cleanup;
+ /* bytes already used by headers, to measure what ->fill_reply() adds */
+ hdr_len = rskb->len;
+ ret = ops->fill_reply(rskb, req_info, reply_data);
+ if (ret < 0)
+ goto err_msg;
+ WARN_ONCE(rskb->len - hdr_len > reply_len,
+ "ethnl cmd %d: calculated reply length %d, but consumed %d\n",
+ cmd, reply_len, rskb->len - hdr_len);
+ if (ops->cleanup_data)
+ ops->cleanup_data(reply_data);
+
+ genlmsg_end(rskb, reply_payload);
+ netdev_put(req_info->dev, &req_info->dev_tracker);
+ kfree(reply_data);
+ kfree(req_info);
+ return genlmsg_reply(rskb, info);
+
+err_msg:
+ WARN_ONCE(ret == -EMSGSIZE, "calculated message payload length (%d) not sufficient\n", reply_len);
+ nlmsg_free(rskb);
+err_cleanup:
+ if (ops->cleanup_data)
+ ops->cleanup_data(reply_data);
+err_dev:
+ netdev_put(req_info->dev, &req_info->dev_tracker);
+err_free:
+ kfree(reply_data);
+ kfree(req_info);
+ return ret;
+}
+
+/* Append one NLM_F_MULTI reply message for @dev to the dump skb.
+ *
+ * The data is gathered with ->prepare_data() under RTNL and the device ops
+ * lock, then the reply header and payload are written. On any failure the
+ * partially written message is cancelled. reply_data->dev is cleared before
+ * returning in either case.
+ *
+ * Return: -EMSGSIZE if the message header does not fit, otherwise the result
+ * of the last callback that ran.
+ */
+static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev,
+ const struct ethnl_dump_ctx *ctx,
+ const struct genl_info *info)
+{
+ void *ehdr;
+ int ret;
+
+ ehdr = genlmsg_put(skb, info->snd_portid, info->snd_seq,
+ &ethtool_genl_family, NLM_F_MULTI,
+ ctx->ops->reply_cmd);
+ if (!ehdr)
+ return -EMSGSIZE;
+
+ ethnl_init_reply_data(ctx->reply_data, ctx->ops, dev);
+ rtnl_lock();
+ netdev_lock_ops(dev);
+ ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, info);
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+ /* nothing was prepared, so ->cleanup_data() is skipped */
+ if (ret < 0)
+ goto out_cancel;
+ ret = ethnl_fill_reply_header(skb, dev, ctx->ops->hdr_attr);
+ if (ret < 0)
+ goto out;
+ ret = ctx->ops->fill_reply(skb, ctx->req_info, ctx->reply_data);
+
+out:
+ if (ctx->ops->cleanup_data)
+ ctx->ops->cleanup_data(ctx->reply_data);
+out_cancel:
+ ctx->reply_data->dev = NULL;
+ if (ret < 0)
+ genlmsg_cancel(skb, ehdr);
+ else
+ genlmsg_end(skb, ehdr);
+ return ret;
+}
+
+/* Default ->dumpit() handler for GET requests.
+ *
+ * Walks the netdevs of the requesting socket's netns, resuming at
+ * ctx->pos_ifindex, and emits one message per device. -EOPNOTSUPP from a
+ * device is ignored and the walk continues; any other error ends this pass.
+ */
+static int ethnl_default_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb);
+ struct net *net = sock_net(skb->sk);
+ netdevice_tracker dev_tracker;
+ struct net_device *dev;
+ int ret = 0;
+
+ rcu_read_lock();
+ for_each_netdev_dump(net, dev, ctx->pos_ifindex) {
+ /* hold the device so the RCU read lock can be dropped while
+ * ethnl_default_dump_one() takes RTNL
+ */
+ netdev_hold(dev, &dev_tracker, GFP_ATOMIC);
+ rcu_read_unlock();
+
+ ret = ethnl_default_dump_one(skb, dev, ctx, genl_info_dump(cb));
+
+ rcu_read_lock();
+ netdev_put(dev, &dev_tracker);
+
+ if (ret < 0 && ret != -EOPNOTSUPP) {
+ /* report the bytes already queued rather than the
+ * error when the skb is not empty
+ */
+ if (likely(skb->len))
+ ret = skb->len;
+ break;
+ }
+ ret = 0;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* generic ->start() handler for GET requests
+ *
+ * Allocates the request info and reply data, parses the request and stores
+ * everything in the dump context kept in cb->ctx.
+ *
+ * Return: 0 on success, -EOPNOTSUPP if the command has no request ops,
+ * -ENOMEM on allocation failure, or the error from ethnl_default_parse().
+ */
+static int ethnl_default_start(struct netlink_callback *cb)
+{
+	const struct genl_dumpit_info *dump_info = genl_dumpit_info(cb);
+	struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb);
+	struct genlmsghdr *ghdr = nlmsg_data(cb->nlh);
+	const struct ethnl_request_ops *ops;
+	struct ethnl_reply_data *reply;
+	struct ethnl_req_info *req;
+	int err;
+
+	BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
+
+	ops = ethnl_default_requests[ghdr->cmd];
+	if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd))
+		return -EOPNOTSUPP;
+
+	req = kzalloc(ops->req_info_size, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	reply = kmalloc(ops->reply_data_size, GFP_KERNEL);
+	if (!reply) {
+		kfree(req);
+		return -ENOMEM;
+	}
+
+	err = ethnl_default_parse(req, &dump_info->info, ops, false);
+	if (err < 0) {
+		kfree(reply);
+		kfree(req);
+		return err;
+	}
+
+	/* Dump requests ignore the device specification, but the parser
+	 * shared with non-dump (doit) requests takes a reference to the
+	 * device if it finds one; drop it again.
+	 */
+	if (req->dev) {
+		netdev_put(req->dev, &req->dev_tracker);
+		req->dev = NULL;
+	}
+
+	ctx->ops = ops;
+	ctx->req_info = req;
+	ctx->reply_data = reply;
+	ctx->pos_ifindex = 0;
+
+	return 0;
+}
+
+/* per-PHY ->start() handler for GET requests
+ *
+ * Allocates the request info and reply data, parses the request and stores
+ * everything in the per-PHY dump context kept in cb->ctx. If the request
+ * names a device, its ifindex is recorded in the context and the reference
+ * taken by the parser is dropped.
+ *
+ * Return: 0 on success, -EOPNOTSUPP if the command has no request ops,
+ * -ENOMEM on allocation failure, or the error from ethnl_default_parse().
+ */
+static int ethnl_perphy_start(struct netlink_callback *cb)
+{
+	struct ethnl_perphy_dump_ctx *phy_ctx = ethnl_perphy_dump_context(cb);
+	const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+	struct ethnl_dump_ctx *ctx = &phy_ctx->ethnl_ctx;
+	struct ethnl_reply_data *reply_data;
+	const struct ethnl_request_ops *ops;
+	struct ethnl_req_info *req_info;
+	struct genlmsghdr *ghdr;
+	int ret;
+
+	/* cb->ctx stores the whole per-PHY context, not only the embedded
+	 * generic one, so that is the size which has to fit.
+	 */
+	BUILD_BUG_ON(sizeof(*phy_ctx) > sizeof(cb->ctx));
+
+	ghdr = nlmsg_data(cb->nlh);
+	ops = ethnl_default_requests[ghdr->cmd];
+	if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd))
+		return -EOPNOTSUPP;
+	req_info = kzalloc(ops->req_info_size, GFP_KERNEL);
+	if (!req_info)
+		return -ENOMEM;
+	reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL);
+	if (!reply_data) {
+		ret = -ENOMEM;
+		goto free_req_info;
+	}
+
+	/* Unlike per-dev dump, don't ignore dev. The dump handler
+	 * will notice it and dump PHYs from given dev. We only keep track of
+	 * the dev's ifindex, .dumpit() will grab and release the netdev itself.
+	 */
+	ret = ethnl_default_parse(req_info, &info->info, ops, false);
+	if (ret < 0)
+		goto free_reply_data;
+	if (req_info->dev) {
+		phy_ctx->ifindex = req_info->dev->ifindex;
+		netdev_put(req_info->dev, &req_info->dev_tracker);
+		req_info->dev = NULL;
+	}
+
+	ctx->ops = ops;
+	ctx->req_info = req_info;
+	ctx->reply_data = reply_data;
+	ctx->pos_ifindex = 0;
+
+	return 0;
+
+free_reply_data:
+	kfree(reply_data);
+free_req_info:
+	kfree(req_info);
+
+	return ret;
+}
+
+/* Dump every PHY in the link topology of req_info->dev, resuming at
+ * ctx->pos_phyindex.
+ *
+ * Return: 0 when the device has no link topology or all its PHYs were dumped
+ * (ctx->pos_phyindex is then reset to 0); otherwise the first non-zero result
+ * of ethnl_default_dump_one(), with ctx->pos_phyindex left at the PHY that
+ * was being dumped.
+ */
+static int ethnl_perphy_dump_one_dev(struct sk_buff *skb,
+ struct ethnl_perphy_dump_ctx *ctx,
+ const struct genl_info *info)
+{
+ struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx;
+ struct net_device *dev = ethnl_ctx->req_info->dev;
+ struct phy_device_node *pdn;
+ int ret;
+
+ if (!dev->link_topo)
+ return 0;
+
+ /* ctx->pos_phyindex is both the start index and the loop cursor */
+ xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn,
+ ctx->pos_phyindex) {
+ ethnl_ctx->req_info->phy_index = ctx->pos_phyindex;
+
+ /* We can re-use the original dump_one as ->prepare_data in
+ * commands use ethnl_req_get_phydev(), which gets the PHY from
+ * the req_info->phy_index
+ */
+ ret = ethnl_default_dump_one(skb, dev, ethnl_ctx, info);
+ if (ret)
+ return ret;
+ }
+
+ ctx->pos_phyindex = 0;
+
+ return 0;
+}
+
+/* Dump the PHYs of every netdev in the requesting socket's netns, resuming
+ * at ethnl_ctx->pos_ifindex. -EOPNOTSUPP from a device is ignored and the
+ * walk continues; any other error ends this pass.
+ */
+static int ethnl_perphy_dump_all_dev(struct sk_buff *skb,
+ struct ethnl_perphy_dump_ctx *ctx,
+ const struct genl_info *info)
+{
+ struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx;
+ struct net *net = sock_net(skb->sk);
+ netdevice_tracker dev_tracker;
+ struct net_device *dev;
+ int ret = 0;
+
+ rcu_read_lock();
+ for_each_netdev_dump(net, dev, ethnl_ctx->pos_ifindex) {
+ /* hold the device so the RCU read lock can be dropped while
+ * its PHYs are dumped
+ */
+ netdev_hold(dev, &dev_tracker, GFP_ATOMIC);
+ rcu_read_unlock();
+
+ /* per-PHY commands use ethnl_req_get_phydev(), which needs the
+ * net_device in the req_info
+ */
+ ethnl_ctx->req_info->dev = dev;
+ ret = ethnl_perphy_dump_one_dev(skb, ctx, info);
+
+ rcu_read_lock();
+ netdev_put(dev, &dev_tracker);
+ /* the reference is gone, do not keep the pointer around */
+ ethnl_ctx->req_info->dev = NULL;
+
+ if (ret < 0 && ret != -EOPNOTSUPP) {
+ /* report the bytes already queued rather than the
+ * error when the skb is not empty
+ */
+ if (likely(skb->len))
+ ret = skb->len;
+ break;
+ }
+ ret = 0;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* per-PHY ->dumpit() handler for GET requests.
+ *
+ * When ->start() recorded an ifindex, only the PHYs of that device are
+ * dumped; the device is looked up and referenced for the duration of this
+ * call only. Otherwise the PHYs of all devices are dumped.
+ *
+ * Return: -ENODEV if the recorded ifindex no longer resolves to a device,
+ * otherwise the result of the dump helper; for the filtered case an error
+ * other than -EOPNOTSUPP is replaced by skb->len when the skb is not empty.
+ */
+static int ethnl_perphy_dumpit(struct sk_buff *skb,
+			       struct netlink_callback *cb)
+{
+	struct ethnl_perphy_dump_ctx *ctx = ethnl_perphy_dump_context(cb);
+	const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+	struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx;
+	int ret = 0;
+
+	if (ctx->ifindex) {
+		netdevice_tracker dev_tracker;
+		struct net_device *dev;
+
+		dev = netdev_get_by_index(genl_info_net(&info->info),
+					  ctx->ifindex, &dev_tracker,
+					  GFP_KERNEL);
+		if (!dev)
+			return -ENODEV;
+
+		ethnl_ctx->req_info->dev = dev;
+		ret = ethnl_perphy_dump_one_dev(skb, ctx, genl_info_dump(cb));
+		/* The reference is dropped below; do not leave a pointer to
+		 * the device behind in the long-lived request info, same as
+		 * ethnl_perphy_dump_all_dev() does.
+		 */
+		ethnl_ctx->req_info->dev = NULL;
+
+		if (ret < 0 && ret != -EOPNOTSUPP && likely(skb->len))
+			ret = skb->len;
+
+		netdev_put(dev, &dev_tracker);
+	} else {
+		ret = ethnl_perphy_dump_all_dev(skb, ctx, genl_info_dump(cb));
+	}
+
+	return ret;
+}
+
+/* per-PHY ->done() handler for GET requests: frees the reply data and the
+ * request info stored in the dump context. Always returns 0.
+ */
+static int ethnl_perphy_done(struct netlink_callback *cb)
+{
+	struct ethnl_dump_ctx *gen_ctx = &ethnl_perphy_dump_context(cb)->ethnl_ctx;
+
+	kfree(gen_ctx->reply_data);
+	kfree(gen_ctx->req_info);
+
+	return 0;
+}
+
+/* default ->done() handler for GET requests: frees the reply data and the
+ * request info stored in the dump context. Always returns 0.
+ */
+static int ethnl_default_done(struct netlink_callback *cb)
+{
+	struct ethnl_dump_ctx *dump_ctx = ethnl_dump_context(cb);
+	struct ethnl_reply_data *reply = dump_ctx->reply_data;
+	struct ethnl_req_info *req = dump_ctx->req_info;
+
+	kfree(reply);
+	kfree(req);
+
+	return 0;
+}
+
+/* default ->doit() handler for SET type requests
+ *
+ * Parses the request (a device is required), optionally pre-validates it
+ * with ->set_validate(), then runs ->set() under RTNL and the device ops
+ * lock on a pending copy of dev->cfg. The copy is committed when ->set()
+ * does not fail, and a notification is sent when ->set() returns a positive
+ * value. Returns 0 or a negative error code.
+ */
+static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ const struct ethnl_request_ops *ops;
+ const u8 cmd = info->genlhdr->cmd;
+ struct ethnl_req_info *req_info;
+ struct net_device *dev;
+ int ret;
+
+ ops = ethnl_default_requests[cmd];
+ if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd))
+ return -EOPNOTSUPP;
+ if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr))
+ return -EINVAL;
+
+ req_info = kzalloc(ops->req_info_size, GFP_KERNEL);
+ if (!req_info)
+ return -ENOMEM;
+
+ ret = ethnl_default_parse(req_info, info, ops, true);
+ if (ret < 0)
+ goto out_free_req;
+
+ if (ops->set_validate) {
+ ret = ops->set_validate(req_info, info);
+ /* 0 means nothing to do */
+ if (ret <= 0)
+ goto out_dev;
+ }
+
+ dev = req_info->dev;
+
+ rtnl_lock();
+ netdev_lock_ops(dev);
+ /* ->set() works on a private copy of the current configuration */
+ dev->cfg_pending = kmemdup(dev->cfg, sizeof(*dev->cfg),
+ GFP_KERNEL_ACCOUNT);
+ if (!dev->cfg_pending) {
+ ret = -ENOMEM;
+ goto out_tie_cfg;
+ }
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto out_free_cfg;
+
+ ret = ops->set(req_info, info);
+ if (ret < 0)
+ goto out_ops;
+
+ /* commit: the pending copy becomes dev->cfg; the previous cfg is now
+ * in cfg_pending and is freed at out_free_cfg
+ */
+ swap(dev->cfg, dev->cfg_pending);
+ /* ->set() returned 0: skip the notification */
+ if (!ret)
+ goto out_ops;
+ ethnl_notify(dev, ops->set_ntf_cmd, req_info);
+
+ ret = 0;
+out_ops:
+ ethnl_ops_complete(dev);
+out_free_cfg:
+ kfree(dev->cfg_pending);
+out_tie_cfg:
+ /* outside of a SET operation cfg_pending aliases cfg */
+ dev->cfg_pending = dev->cfg;
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+out_dev:
+ ethnl_parse_header_dev_put(req_info);
+out_free_req:
+ kfree(req_info);
+ return ret;
+}
+
+/* Request ops used by ethnl_default_notify() to compose a notification,
+ * indexed by ETHTOOL_MSG_*_NTF kernel message type. Types without an entry
+ * are NULL and are rejected by ethnl_default_notify().
+ */
+static const struct ethnl_request_ops *
+ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = {
+ [ETHTOOL_MSG_LINKINFO_NTF] = &ethnl_linkinfo_request_ops,
+ [ETHTOOL_MSG_LINKMODES_NTF] = &ethnl_linkmodes_request_ops,
+ [ETHTOOL_MSG_DEBUG_NTF] = &ethnl_debug_request_ops,
+ [ETHTOOL_MSG_WOL_NTF] = &ethnl_wol_request_ops,
+ [ETHTOOL_MSG_FEATURES_NTF] = &ethnl_features_request_ops,
+ [ETHTOOL_MSG_PRIVFLAGS_NTF] = &ethnl_privflags_request_ops,
+ [ETHTOOL_MSG_RINGS_NTF] = &ethnl_rings_request_ops,
+ [ETHTOOL_MSG_CHANNELS_NTF] = &ethnl_channels_request_ops,
+ [ETHTOOL_MSG_COALESCE_NTF] = &ethnl_coalesce_request_ops,
+ [ETHTOOL_MSG_PAUSE_NTF] = &ethnl_pause_request_ops,
+ [ETHTOOL_MSG_EEE_NTF] = &ethnl_eee_request_ops,
+ [ETHTOOL_MSG_FEC_NTF] = &ethnl_fec_request_ops,
+ [ETHTOOL_MSG_MODULE_NTF] = &ethnl_module_request_ops,
+ [ETHTOOL_MSG_PLCA_NTF] = &ethnl_plca_cfg_request_ops,
+ [ETHTOOL_MSG_MM_NTF] = &ethnl_mm_request_ops,
+ [ETHTOOL_MSG_RSS_NTF] = &ethnl_rss_request_ops,
+ [ETHTOOL_MSG_RSS_CREATE_NTF] = &ethnl_rss_request_ops,
+};
+
+/* default notification handler
+ *
+ * Composes a notification of type @cmd for @dev using the same request ops
+ * callbacks as the corresponding GET reply and multicasts it. When
+ * @orig_req_info is given, its PHY index and the request specific part that
+ * follows the common struct ethnl_req_info are copied into the synthetic
+ * request. Failures are not reported to the caller.
+ */
+static void ethnl_default_notify(struct net_device *dev, unsigned int cmd,
+ const struct ethnl_req_info *orig_req_info)
+{
+ struct ethnl_reply_data *reply_data;
+ const struct ethnl_request_ops *ops;
+ struct ethnl_req_info *req_info;
+ struct genl_info info;
+ struct sk_buff *skb;
+ void *reply_payload;
+ int reply_len;
+ int ret;
+
+ genl_info_init_ntf(&info, &ethtool_genl_family, cmd);
+
+ if (WARN_ONCE(cmd > ETHTOOL_MSG_KERNEL_MAX ||
+ !ethnl_default_notify_ops[cmd],
+ "unexpected notification type %u\n", cmd))
+ return;
+ ops = ethnl_default_notify_ops[cmd];
+ req_info = kzalloc(ops->req_info_size, GFP_KERNEL);
+ if (!req_info)
+ return;
+ reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL);
+ if (!reply_data) {
+ kfree(req_info);
+ return;
+ }
+
+ /* no reference is taken on dev here; the pointer is used as passed */
+ req_info->dev = dev;
+ req_info->flags |= ETHTOOL_FLAG_COMPACT_BITSETS;
+ if (orig_req_info) {
+ req_info->phy_index = orig_req_info->phy_index;
+ /* copy everything past the common struct ethnl_req_info */
+ memcpy(&req_info[1], &orig_req_info[1],
+ ops->req_info_size - sizeof(*req_info));
+ }
+
+ netdev_ops_assert_locked(dev);
+
+ ethnl_init_reply_data(reply_data, ops, dev);
+ ret = ops->prepare_data(req_info, reply_data, &info);
+ if (ret < 0)
+ goto err_rep;
+ ret = ops->reply_size(req_info, reply_data);
+ if (ret < 0)
+ goto err_cleanup;
+ reply_len = ret + ethnl_reply_header_size();
+ skb = genlmsg_new(reply_len, GFP_KERNEL);
+ if (!skb)
+ goto err_cleanup;
+ reply_payload = ethnl_bcastmsg_put(skb, cmd);
+ if (!reply_payload)
+ goto err_skb;
+ ret = ethnl_fill_reply_header(skb, dev, ops->hdr_attr);
+ if (ret < 0)
+ goto err_msg;
+ ret = ops->fill_reply(skb, req_info, reply_data);
+ if (ret < 0)
+ goto err_msg;
+ if (ops->cleanup_data)
+ ops->cleanup_data(reply_data);
+
+ genlmsg_end(skb, reply_payload);
+ kfree(reply_data);
+ kfree(req_info);
+ /* the result of the multicast is not checked */
+ ethnl_multicast(skb, dev);
+ return;
+
+err_msg:
+ WARN_ONCE(ret == -EMSGSIZE,
+ "calculated message payload length (%d) not sufficient\n",
+ reply_len);
+err_skb:
+ nlmsg_free(skb);
+err_cleanup:
+ if (ops->cleanup_data)
+ ops->cleanup_data(reply_data);
+err_rep:
+ kfree(reply_data);
+ kfree(req_info);
+ return;
+}
+
+/* notifications */
+
+/* Signature of a notification handler: the device the notification is about,
+ * the ETHTOOL_MSG_*_NTF type, and the request info passed to ethnl_notify()
+ * (may be NULL).
+ */
+typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd,
+ const struct ethnl_req_info *req_info);
+
+/* Notification handlers indexed by ETHTOOL_MSG_*_NTF type; every type listed
+ * here is served by ethnl_default_notify(). ethnl_notify() warns about types
+ * that have no entry.
+ */
+static const ethnl_notify_handler_t ethnl_notify_handlers[] = {
+ [ETHTOOL_MSG_LINKINFO_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_LINKMODES_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_DEBUG_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_WOL_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_FEATURES_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_PRIVFLAGS_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_RINGS_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_CHANNELS_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_COALESCE_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_PAUSE_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_FEC_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_MODULE_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_PLCA_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_MM_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_RSS_NTF] = ethnl_default_notify,
+ [ETHTOOL_MSG_RSS_CREATE_NTF] = ethnl_default_notify,
+};
+
+/* Dispatch notification @cmd about @dev to its registered handler.
+ *
+ * Does nothing while ethnl_ok is false. Asserts that RTNL is held. A @cmd
+ * without a handler triggers a one-time warning.
+ */
+void ethnl_notify(struct net_device *dev, unsigned int cmd,
+		  const struct ethnl_req_info *req_info)
+{
+	ethnl_notify_handler_t handler = NULL;
+
+	if (unlikely(!ethnl_ok))
+		return;
+	ASSERT_RTNL();
+
+	if (cmd < ARRAY_SIZE(ethnl_notify_handlers))
+		handler = ethnl_notify_handlers[cmd];
+
+	if (unlikely(!handler)) {
+		WARN_ONCE(1, "notification %u not implemented (dev=%s)\n",
+			  cmd, netdev_name(dev));
+		return;
+	}
+
+	handler(dev, cmd, req_info);
+}
+
+/* Exported entry point for sending notification @cmd about @dev; calls
+ * ethnl_notify() without any request info.
+ */
+void ethtool_notify(struct net_device *dev, unsigned int cmd)
+{
+ ethnl_notify(dev, cmd, NULL);
+}
+EXPORT_SYMBOL(ethtool_notify);
+
+/* Send ETHTOOL_MSG_FEATURES_NTF for the device a netdev notifier event is
+ * about.
+ */
+static void ethnl_notify_features(struct netdev_notifier_info *info)
+{
+	ethtool_notify(netdev_notifier_info_to_dev(info),
+		       ETHTOOL_MSG_FEATURES_NTF);
+}
+
+/* Netdevice notifier callback.
+ *
+ * NETDEV_FEAT_CHANGE sends a features notification. NETDEV_PRE_UP is vetoed
+ * with NOTIFY_BAD while a module firmware flash is in progress on the
+ * device. Everything else returns NOTIFY_DONE.
+ */
+static int ethnl_netdev_event(struct notifier_block *this, unsigned long event,
+			      void *ptr)
+{
+	struct netdev_notifier_info *info = ptr;
+	struct net_device *dev = netdev_notifier_info_to_dev(info);
+	struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(info);
+
+	if (event == NETDEV_FEAT_CHANGE) {
+		ethnl_notify_features(ptr);
+		return NOTIFY_DONE;
+	}
+
+	if (event == NETDEV_PRE_UP &&
+	    dev->ethtool->module_fw_flash_in_progress) {
+		NL_SET_ERR_MSG(extack, "Can't set port up while flashing module firmware");
+		return NOTIFY_BAD;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ethnl_netdev_notifier = { /* registered by ethnl_init() */
+ .notifier_call = ethnl_netdev_event,
+};
+
+/* genetlink setup */
+
+static const struct genl_ops ethtool_genl_ops[] = { /* wired into ethtool_genl_family.ops */
+ {
+ .cmd = ETHTOOL_MSG_STRSET_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_strset_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_strset_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_LINKINFO_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_linkinfo_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_linkinfo_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_LINKINFO_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_linkinfo_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_linkinfo_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_LINKMODES_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_linkmodes_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_linkmodes_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_LINKMODES_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_linkmodes_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_linkmodes_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_LINKSTATE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_linkstate_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_linkstate_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_DEBUG_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_debug_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_debug_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_DEBUG_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_debug_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_debug_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_WOL_GET,
+ .flags = GENL_UNS_ADMIN_PERM, /* a GET entry that nevertheless carries the admin flag */
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_wol_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_wol_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_WOL_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_wol_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_wol_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_FEATURES_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_features_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_features_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_FEATURES_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_set_features,
+ .policy = ethnl_features_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_features_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PRIVFLAGS_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_privflags_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_privflags_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PRIVFLAGS_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_privflags_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_privflags_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_RINGS_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_rings_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_rings_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_RINGS_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_rings_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_rings_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_CHANNELS_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_channels_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_channels_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_CHANNELS_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_channels_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_channels_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_COALESCE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_coalesce_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_coalesce_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_COALESCE_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_coalesce_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_coalesce_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PAUSE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_pause_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_pause_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PAUSE_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_pause_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_pause_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_EEE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_eee_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_eee_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_EEE_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_eee_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_eee_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_TSINFO_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_tsinfo_start,
+ .dumpit = ethnl_tsinfo_dumpit,
+ .done = ethnl_tsinfo_done,
+ .policy = ethnl_tsinfo_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_tsinfo_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_CABLE_TEST_ACT,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_act_cable_test,
+ .policy = ethnl_cable_test_act_policy,
+ .maxattr = ARRAY_SIZE(ethnl_cable_test_act_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_CABLE_TEST_TDR_ACT,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_act_cable_test_tdr,
+ .policy = ethnl_cable_test_tdr_act_policy,
+ .maxattr = ARRAY_SIZE(ethnl_cable_test_tdr_act_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_TUNNEL_INFO_GET,
+ .doit = ethnl_tunnel_info_doit,
+ .start = ethnl_tunnel_info_start,
+ .dumpit = ethnl_tunnel_info_dumpit, /* no .done hook is set for this dump */
+ .policy = ethnl_tunnel_info_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_tunnel_info_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_FEC_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_fec_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_fec_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_FEC_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_fec_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_MODULE_EEPROM_GET,
+ .flags = GENL_UNS_ADMIN_PERM, /* a GET entry that nevertheless carries the admin flag */
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_module_eeprom_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_module_eeprom_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_STATS_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_stats_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_phc_vclocks_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_phc_vclocks_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_MODULE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_module_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_module_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_MODULE_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_module_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_module_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PSE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_perphy_start,
+ .dumpit = ethnl_perphy_dumpit,
+ .done = ethnl_perphy_done,
+ .policy = ethnl_pse_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_pse_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PSE_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_pse_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_pse_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_RSS_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_rss_dump_start,
+ .dumpit = ethnl_rss_dumpit, /* no .done hook is set for this dump */
+ .policy = ethnl_rss_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_rss_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PLCA_GET_CFG,
+ .doit = ethnl_default_doit,
+ .start = ethnl_perphy_start,
+ .dumpit = ethnl_perphy_dumpit,
+ .done = ethnl_perphy_done,
+ .policy = ethnl_plca_get_cfg_policy,
+ .maxattr = ARRAY_SIZE(ethnl_plca_get_cfg_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PLCA_SET_CFG,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_plca_set_cfg_policy,
+ .maxattr = ARRAY_SIZE(ethnl_plca_set_cfg_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PLCA_GET_STATUS,
+ .doit = ethnl_default_doit,
+ .start = ethnl_perphy_start,
+ .dumpit = ethnl_perphy_dumpit,
+ .done = ethnl_perphy_done,
+ .policy = ethnl_plca_get_status_policy,
+ .maxattr = ARRAY_SIZE(ethnl_plca_get_status_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_MM_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_mm_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_mm_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_MM_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_mm_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_mm_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_MODULE_FW_FLASH_ACT,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_act_module_fw_flash,
+ .policy = ethnl_module_fw_flash_act_policy,
+ .maxattr = ARRAY_SIZE(ethnl_module_fw_flash_act_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_PHY_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_perphy_start,
+ .dumpit = ethnl_perphy_dumpit,
+ .done = ethnl_perphy_done,
+ .policy = ethnl_phy_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_phy_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_TSCONFIG_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_tsconfig_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_tsconfig_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_TSCONFIG_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_tsconfig_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_tsconfig_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_RSS_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_rss_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_rss_set_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_RSS_CREATE_ACT,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_rss_create_doit,
+ .policy = ethnl_rss_create_policy,
+ .maxattr = ARRAY_SIZE(ethnl_rss_create_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_RSS_DELETE_ACT,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_rss_delete_doit,
+ .policy = ethnl_rss_delete_policy,
+ .maxattr = ARRAY_SIZE(ethnl_rss_delete_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_MSE_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_perphy_start,
+ .dumpit = ethnl_perphy_dumpit,
+ .done = ethnl_perphy_done,
+ .policy = ethnl_mse_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_mse_get_policy) - 1,
+ },
+};
+
+static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
+ [ETHNL_MCGRP_MONITOR] = { .name = ETHTOOL_MCGRP_MONITOR_NAME },
+}; /* referenced by ethtool_genl_family.mcgrps */
+
+static struct genl_family ethtool_genl_family __ro_after_init = { /* registered by ethnl_init() */
+ .name = ETHTOOL_GENL_NAME,
+ .version = ETHTOOL_GENL_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .ops = ethtool_genl_ops,
+ .n_ops = ARRAY_SIZE(ethtool_genl_ops),
+ .resv_start_op = ETHTOOL_MSG_MODULE_GET + 1, /* NOTE(review): meaning of this boundary is defined by genetlink, not visible here */
+ .mcgrps = ethtool_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(ethtool_nl_mcgrps),
+ .sock_priv_size = sizeof(struct ethnl_sock_priv), /* sized for one struct ethnl_sock_priv */
+ .sock_priv_destroy = ethnl_sock_priv_destroy,
+};
+
+/* module setup */
+
+/*
+ * Register the ethtool genetlink family and then the netdevice notifier.
+ * Either failure triggers a WARN and its negative error code is returned;
+ * a notifier failure leaves the already registered family in place.
+ */
+static int __init ethnl_init(void)
+{
+	int err = genl_register_family(&ethtool_genl_family);
+
+	if (WARN(err < 0, "ethtool: genetlink family registration failed"))
+		return err;
+	ethnl_ok = true;	/* ethnl_notify() stops bailing out early */
+
+	err = register_netdevice_notifier(&ethnl_netdev_notifier);
+	WARN(err < 0, "ethtool: net device notifier registration failed");
+	return err;
+}
+
+subsys_initcall(ethnl_init);
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
new file mode 100644
index 000000000000..89010eaa67df
--- /dev/null
+++ b/net/ethtool/netlink.h
@@ -0,0 +1,525 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _NET_ETHTOOL_NETLINK_H
+#define _NET_ETHTOOL_NETLINK_H
+
+#include <linux/ethtool_netlink.h>
+#include <linux/netdevice.h>
+#include <net/genetlink.h>
+#include <net/sock.h>
+
+struct ethnl_req_info; /* forward declaration; the full definition follows later in this header */
+
+u32 ethnl_bcast_seq_next(void);
+int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info,
+ const struct nlattr *nest, struct net *net,
+ struct netlink_ext_ack *extack,
+ bool require_dev);
+int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev,
+ u16 attrtype);
+struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd,
+ u16 hdr_attrtype, struct genl_info *info,
+ void **ehdrp);
+void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd);
+void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd);
+void *ethnl_unicast_put(struct sk_buff *skb, u32 portid, u32 seq, u8 cmd);
+int ethnl_multicast(struct sk_buff *skb, struct net_device *dev);
+void ethnl_notify(struct net_device *dev, unsigned int cmd,
+ const struct ethnl_req_info *req_info); /* @req_info may be NULL, as ethtool_notify() passes */
+
+/**
+ * ethnl_strz_size() - attribute size needed for a fixed size string
+ * @s: buffer of ETH_GSTRING_LEN bytes; the string in it need not be
+ *     null terminated
+ *
+ * Return: nla_total_size() for the string, cut off at ETH_GSTRING_LEN
+ *	   characters, plus one byte for its terminating null
+ */
+static inline int ethnl_strz_size(const char *s)
+{
+	size_t len = strnlen(s, ETH_GSTRING_LEN);
+
+	return nla_total_size(len + 1);
+}
+
+/**
+ * ethnl_put_strz() - append a fixed size string as a string attribute
+ * @skb:      skb holding the message being composed
+ * @attrtype: type of the attribute to add
+ * @s:        buffer of ETH_GSTRING_LEN bytes; the string in it need not be
+ *            null terminated
+ *
+ * Reserves an attribute one byte longer than the string (cut off at
+ * ETH_GSTRING_LEN), copies the string into it and terminates the copy with
+ * a null byte.
+ *
+ * Return: 0 on success, -EMSGSIZE if the attribute could not be reserved
+ */
+static inline int ethnl_put_strz(struct sk_buff *skb, u16 attrtype,
+				 const char *s)
+{
+	unsigned int len = strnlen(s, ETH_GSTRING_LEN);
+	struct nlattr *attr = nla_reserve(skb, attrtype, len + 1);
+	char *payload;
+
+	if (!attr)
+		return -EMSGSIZE;
+
+	payload = nla_data(attr);
+	memcpy(payload, s, len);
+	payload[len] = '\0';
+	return 0;
+}
+
+/**
+ * ethnl_update_u32() - overwrite a u32 with the value of an NLA_U32 attribute
+ * @dst:  variable to update
+ * @attr: attribute carrying the new value, may be null
+ * @mod:  modification flag
+ *
+ * Nothing happens when @attr is null or when it carries the value *@dst
+ * already has. Otherwise the value is stored in *@dst and *@mod is set to
+ * true; *@mod is never cleared here.
+ */
+static inline void ethnl_update_u32(u32 *dst, const struct nlattr *attr,
+				    bool *mod)
+{
+	u32 requested;
+
+	if (!attr)
+		return;
+
+	requested = nla_get_u32(attr);
+	if (requested != *dst) {
+		*dst = requested;
+		*mod = true;
+	}
+}
+
+/**
+ * ethnl_update_u8() - overwrite a u8 with the value of an NLA_U8 attribute
+ * @dst:  variable to update
+ * @attr: attribute carrying the new value, may be null
+ * @mod:  modification flag
+ *
+ * Nothing happens when @attr is null or when it carries the value *@dst
+ * already has. Otherwise the value is stored in *@dst and *@mod is set to
+ * true; *@mod is never cleared here.
+ */
+static inline void ethnl_update_u8(u8 *dst, const struct nlattr *attr,
+				   bool *mod)
+{
+	u8 requested;
+
+	if (!attr)
+		return;
+
+	requested = nla_get_u8(attr);
+	if (requested != *dst) {
+		*dst = requested;
+		*mod = true;
+	}
+}
+
+/**
+ * ethnl_update_bool32() - set a u32 used as bool from an NLA_U8 attribute
+ * @dst:  variable to update
+ * @attr: attribute carrying the new value, may be null
+ * @mod:  modification flag
+ *
+ * The attribute value is reduced to 0 (zero) or 1 (anything else) and
+ * compared with the logical value of *@dst. Nothing happens when @attr is
+ * null or when both agree. Otherwise *@dst is set to the reduced value and
+ * *@mod is set to true; *@mod is never cleared here.
+ */
+static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr,
+				       bool *mod)
+{
+	bool enable;
+
+	if (!attr)
+		return;
+
+	enable = nla_get_u8(attr) != 0;
+	if (enable == (*dst != 0))
+		return;
+
+	*dst = enable;
+	*mod = true;
+}
+
+/**
+ * ethnl_update_bool() - set a bool from an NLA_U8 attribute
+ * @dst:  variable to update
+ * @attr: attribute carrying the new value, may be null
+ * @mod:  modification flag
+ *
+ * The attribute value is reduced to false (zero) or true (anything else).
+ * Nothing happens when @attr is null or when *@dst already has that value.
+ * Otherwise *@dst is updated and *@mod is set to true; *@mod is never
+ * cleared here.
+ */
+static inline void ethnl_update_bool(bool *dst, const struct nlattr *attr,
+				     bool *mod)
+{
+	bool enable;
+
+	if (!attr)
+		return;
+
+	enable = nla_get_u8(attr) != 0;
+	if (enable != *dst) {
+		*dst = enable;
+		*mod = true;
+	}
+}
+
+/**
+ * ethnl_update_binary() - overwrite a data block with an attribute payload
+ * @dst:  buffer to update
+ * @len:  length of the buffer at @dst
+ * @attr: attribute carrying the new data, may be null
+ * @mod:  modification flag
+ *
+ * Only the first min(@len, payload length) bytes take part: they are
+ * compared with the payload and, if they differ, replaced by it, in which
+ * case *@mod is set to true. Bytes of @dst beyond that length are left
+ * untouched. Nothing happens when @attr is null; *@mod is never cleared
+ * here.
+ */
+static inline void ethnl_update_binary(void *dst, unsigned int len,
+				       const struct nlattr *attr, bool *mod)
+{
+	unsigned int attr_len;
+
+	if (!attr)
+		return;
+
+	attr_len = nla_len(attr);
+	if (attr_len < len)
+		len = attr_len;
+
+	if (memcmp(dst, nla_data(attr), len) != 0) {
+		memcpy(dst, nla_data(attr), len);
+		*mod = true;
+	}
+}
+
+/**
+ * ethnl_update_bitfield32() - apply an NLA_BITFIELD32 attribute to a u32
+ * @dst:  variable to update
+ * @attr: attribute carrying value and selector, may be null
+ * @mod:  modification flag
+ *
+ * Bits chosen by the attribute's selector are taken from the attribute's
+ * value, all other bits of *@dst are kept. Nothing happens when @attr is
+ * null or when the result equals the current *@dst. Otherwise the result is
+ * stored and *@mod is set to true; *@mod is never cleared here.
+ */
+static inline void ethnl_update_bitfield32(u32 *dst, const struct nlattr *attr,
+					   bool *mod)
+{
+	struct nla_bitfield32 change;
+	u32 current_val, updated;
+
+	if (!attr)
+		return;
+
+	change = nla_get_bitfield32(attr);
+	current_val = *dst;
+	updated = (current_val & ~change.selector) |
+		  (change.value & change.selector);
+	if (updated != current_val) {
+		*dst = updated;
+		*mod = true;
+	}
+}
+
+/**
+ * ethnl_reply_header_size() - total size of reply header
+ *
+ * Size of a nest holding one u32 attribute and one IFNAMSIZ sized attribute.
+ * It is an upper estimate so that RTNL lock does not have to be held longer
+ * than necessary (to prevent rename between size estimate and composing the
+ * message). Only device ifindex and name are accounted for as those are the
+ * only attributes ethnl_fill_reply_header() puts into the reply header.
+ */
+static inline unsigned int ethnl_reply_header_size(void)
+{
+	int ifindex_size = nla_total_size(sizeof(u32));
+	int ifname_size = nla_total_size(IFNAMSIZ);
+
+	return nla_total_size(ifindex_size + ifname_size);
+}
+
+/* GET request handling */
+
+/* Unified processing of GET requests uses two data structures: request info
+ * and reply data. Request info holds information parsed from client request
+ * and it stays constant through all request processing. Reply data holds data
+ * retrieved from ethtool_ops callbacks or other internal sources which is used
+ * to compose the reply. When processing a dump request, request info is filled
+ * only once (when the request message is parsed) but reply data is filled for
+ * each reply message.
+ *
+ * Both structures consist of a part common for all request types (struct
+ * ethnl_req_info and struct ethnl_reply_data defined below) and optional
+ * parts specific for each request type. Common part always starts at offset 0.
+ */
+
+/**
+ * struct ethnl_req_info - base type of request information for GET requests
+ * @dev: network device the request is for (may be null)
+ * @dev_tracker: refcount tracker for @dev reference; handed to netdev_put()
+ * together with @dev by ethnl_parse_header_dev_put()
+ * @flags: request flags common for all request types
+ * @phy_index: phy_device index connected to @dev this request is for. Can be
+ * 0 if the request doesn't target a phy, or if the @dev's attached
+ * phy is targeted.
+ *
+ * This is a common base for request specific structures holding data from
+ * parsed userspace request. These always embed struct ethnl_req_info at
+ * zero offset.
+ */
+struct ethnl_req_info {
+ struct net_device *dev;
+ netdevice_tracker dev_tracker;
+ u32 flags;
+ u32 phy_index;
+};
+
+static inline void ethnl_parse_header_dev_put(struct ethnl_req_info *req_info)
+{ /* Hands req_info->dev and the tracker stored next to it to netdev_put(). */
+ netdev_put(req_info->dev, &req_info->dev_tracker);
+}
+
+/**
+ * ethnl_req_get_phydev() - Gets the phy_device targeted by this request,
+ * if any. Must be called under rtnl_lock().
+ * @req_info: The ethnl request to get the phy from.
+ * @tb: The netlink attributes array, for error reporting.
+ * @header: The netlink header index, used for error reporting.
+ * @extack: The netlink extended ACK, for error reporting.
+ *
+ * The caller must hold RTNL, until it's done interacting with the returned
+ * phy_device.
+ *
+ * Return: The phy_device pointer corresponding to the passed phy_index
+ * if one is provided. If not, the phy_device attached to the
+ * net_device targeted by this request is returned. If there's no
+ * targeted net_device, or no phy_device is attached, NULL is
+ * returned. If the provided phy_index is invalid, an error pointer
+ * is returned.
+ */
+struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info,
+ struct nlattr **tb, unsigned int header,
+ struct netlink_ext_ack *extack);
+
+/**
+ * struct ethnl_reply_data - base type of reply data for GET requests
+ * @dev: device for current reply message; in single shot requests it is
+ * equal to &ethnl_req_info.dev; in dumps it's different for each
+ * reply message (see @prepare_data of struct ethnl_request_ops)
+ *
+ * This is a common base for request specific structures holding data for
+ * kernel reply message. These always embed struct ethnl_reply_data at zero
+ * offset.
+ */
+struct ethnl_reply_data {
+ struct net_device *dev;
+};
+
+int ethnl_ops_begin(struct net_device *dev);
+void ethnl_ops_complete(struct net_device *dev);
+
+enum ethnl_sock_type { /* stored in struct ethnl_sock_priv.type */
+ ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH, /* the only socket type defined here */
+};
+
+struct ethnl_sock_priv { /* its size is what ethtool_genl_family.sock_priv_size is set to */
+ struct net_device *dev;
+ u32 portid;
+ enum ethnl_sock_type type;
+};
+
+int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid,
+ enum ethnl_sock_type type);
+
+/**
+ * struct ethnl_request_ops - unified handling of GET and SET requests
+ * @request_cmd: command id for request (GET)
+ * @reply_cmd: command id for reply (GET_REPLY)
+ * @hdr_attr: attribute type for request header
+ * @req_info_size: size of request info
+ * @reply_data_size: size of reply data
+ * @allow_nodev_do: allow non-dump request with no device identification
+ * @set_ntf_cmd: notification to generate on changes (SET)
+ * @parse_request:
+ * Parse request except common header (struct ethnl_req_info). Common
+ * header is already filled on entry, the rest up to @repdata_offset
+ * is zero initialized. This callback should only modify type specific
+ * request info by parsed attributes from request message.
+ * Called for both GET and SET. Information parsed for SET will
+ * be conveyed to the req_info used during NTF generation.
+ * @prepare_data:
+ * Retrieve and prepare data needed to compose a reply message. Calls to
+ * ethtool_ops handlers are limited to this callback. Common reply data
+ * (struct ethnl_reply_data) is filled on entry, type specific part after
+ * it is zero initialized. This callback should only modify the type
+ * specific part of reply data. Device identification from struct
+ * ethnl_reply_data is to be used as for dump requests, it iterates
+ * through network devices while dev member of struct ethnl_req_info
+ * points to the device from client request.
+ * @reply_size:
+ * Estimate reply message size. Returned value must be sufficient for
+ * message payload without common reply header. The callback may return
+ * an estimate higher than actual message size if exact calculation would
+ * not be worth the saved memory space.
+ * @fill_reply:
+ * Fill reply message payload (except for common header) from reply data.
+ * The callback must not generate more payload than previously called
+ * ->reply_size() estimated.
+ * @cleanup_data:
+ * Optional cleanup called when reply data is no longer needed. Can be
+ * used e.g. to free any additional data structures outside the main
+ * structure which were allocated by ->prepare_data(). When processing
+ * dump requests, ->cleanup_data() is called for each message.
+ * @set_validate:
+ * Check if set operation is supported for a given device, and perform
+ * extra input checks. Expected return values:
+ * - 0 if the operation is a noop for the device (rare)
+ * - 1 if operation should proceed to calling @set
+ * - negative errno on errors
+ * Called without any locks, just a reference on the netdev.
+ * @set:
+ * Execute the set operation. The implementation should return
+ * - 0 if no configuration has changed
+ * - 1 if configuration changed and notification should be generated
+ * - negative errno on errors
+ *
+ * Description of variable parts of GET request handling when using the
+ * unified infrastructure. When used, a pointer to an instance of this
+ * structure is to be added to &ethnl_default_requests array and generic
+ * handlers ethnl_default_doit(), ethnl_default_dumpit(),
+ * ethnl_default_start() and ethnl_default_done() used in @ethtool_genl_ops;
+ * ethnl_default_notify() can be used in @ethnl_notify_handlers to send
+ * notifications of the corresponding type.
+ */
+struct ethnl_request_ops {
+ u8 request_cmd;
+ u8 reply_cmd;
+ u16 hdr_attr;
+ unsigned int req_info_size;
+ unsigned int reply_data_size;
+ bool allow_nodev_do;
+ u8 set_ntf_cmd;
+
+ int (*parse_request)(struct ethnl_req_info *req_info,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack);
+ int (*prepare_data)(const struct ethnl_req_info *req_info,
+ struct ethnl_reply_data *reply_data,
+ const struct genl_info *info);
+ int (*reply_size)(const struct ethnl_req_info *req_info,
+ const struct ethnl_reply_data *reply_data);
+ int (*fill_reply)(struct sk_buff *skb,
+ const struct ethnl_req_info *req_info,
+ const struct ethnl_reply_data *reply_data);
+ void (*cleanup_data)(struct ethnl_reply_data *reply_data);
+
+ int (*set_validate)(struct ethnl_req_info *req_info,
+ struct genl_info *info);
+ int (*set)(struct ethnl_req_info *req_info,
+ struct genl_info *info);
+};
+
+/* request handlers */
+
+extern const struct ethnl_request_ops ethnl_strset_request_ops;
+extern const struct ethnl_request_ops ethnl_linkinfo_request_ops;
+extern const struct ethnl_request_ops ethnl_linkmodes_request_ops;
+extern const struct ethnl_request_ops ethnl_linkstate_request_ops;
+extern const struct ethnl_request_ops ethnl_debug_request_ops;
+extern const struct ethnl_request_ops ethnl_wol_request_ops;
+extern const struct ethnl_request_ops ethnl_features_request_ops;
+extern const struct ethnl_request_ops ethnl_privflags_request_ops;
+extern const struct ethnl_request_ops ethnl_rings_request_ops;
+extern const struct ethnl_request_ops ethnl_channels_request_ops;
+extern const struct ethnl_request_ops ethnl_coalesce_request_ops;
+extern const struct ethnl_request_ops ethnl_pause_request_ops;
+extern const struct ethnl_request_ops ethnl_eee_request_ops;
+extern const struct ethnl_request_ops ethnl_tsinfo_request_ops;
+extern const struct ethnl_request_ops ethnl_fec_request_ops;
+extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops;
+extern const struct ethnl_request_ops ethnl_stats_request_ops;
+extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops;
+extern const struct ethnl_request_ops ethnl_module_request_ops;
+extern const struct ethnl_request_ops ethnl_pse_request_ops;
+extern const struct ethnl_request_ops ethnl_rss_request_ops;
+extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops;
+extern const struct ethnl_request_ops ethnl_plca_status_request_ops;
+extern const struct ethnl_request_ops ethnl_mm_request_ops;
+extern const struct ethnl_request_ops ethnl_phy_request_ops;
+extern const struct ethnl_request_ops ethnl_tsconfig_request_ops;
+extern const struct ethnl_request_ops ethnl_mse_request_ops;
+
+extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
+extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
+extern const struct nla_policy ethnl_header_policy_phy[ETHTOOL_A_HEADER_PHY_INDEX + 1];
+extern const struct nla_policy ethnl_header_policy_phy_stats[ETHTOOL_A_HEADER_PHY_INDEX + 1];
+extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_ONLY + 1];
+extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1];
+extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1];
+extern const struct nla_policy ethnl_linkmodes_get_policy[ETHTOOL_A_LINKMODES_HEADER + 1];
+extern const struct nla_policy ethnl_linkmodes_set_policy[ETHTOOL_A_LINKMODES_LANES + 1];
+extern const struct nla_policy ethnl_linkstate_get_policy[ETHTOOL_A_LINKSTATE_HEADER + 1];
+extern const struct nla_policy ethnl_debug_get_policy[ETHTOOL_A_DEBUG_HEADER + 1];
+extern const struct nla_policy ethnl_debug_set_policy[ETHTOOL_A_DEBUG_MSGMASK + 1];
+extern const struct nla_policy ethnl_wol_get_policy[ETHTOOL_A_WOL_HEADER + 1];
+extern const struct nla_policy ethnl_wol_set_policy[ETHTOOL_A_WOL_SOPASS + 1];
+extern const struct nla_policy ethnl_features_get_policy[ETHTOOL_A_FEATURES_HEADER + 1];
+extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANTED + 1];
+extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1];
+extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1];
+extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1];
+extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_HDS_THRESH_MAX + 1];
+extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1];
+extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1];
+extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1];
+extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1];
+extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_STATS_SRC + 1];
+extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_STATS_SRC + 1];
+extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1];
+extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1];
+extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1];
+extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1];
+extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1];
+extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1];
+extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1];
+extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1];
+extern const struct nla_policy ethnl_module_eeprom_get_policy[ETHTOOL_A_MODULE_EEPROM_I2C_ADDRESS + 1];
+extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1];
+extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1];
+extern const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1];
+extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1];
+extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1];
+extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1];
+extern const struct nla_policy ethnl_rss_get_policy[ETHTOOL_A_RSS_START_CONTEXT + 1];
+extern const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1];
+extern const struct nla_policy ethnl_rss_create_policy[ETHTOOL_A_RSS_INPUT_XFRM + 1];
+extern const struct nla_policy ethnl_rss_delete_policy[ETHTOOL_A_RSS_CONTEXT + 1];
+extern const struct nla_policy ethnl_plca_get_cfg_policy[ETHTOOL_A_PLCA_HEADER + 1];
+extern const struct nla_policy ethnl_plca_set_cfg_policy[ETHTOOL_A_PLCA_MAX + 1];
+extern const struct nla_policy ethnl_plca_get_status_policy[ETHTOOL_A_PLCA_HEADER + 1];
+extern const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1];
+extern const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1];
+extern const struct nla_policy ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD + 1];
+extern const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1];
+extern const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1];
+extern const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1];
+extern const struct nla_policy ethnl_mse_get_policy[ETHTOOL_A_MSE_HEADER + 1];
+
+/* Request handlers (taking the request skb and its genl_info) and netlink
+ * dump callbacks (start/dumpit/done, taking a netlink_callback). Only the
+ * prototypes live in this header.
+ */
+int ethnl_set_features(struct sk_buff *skb, struct genl_info *info);
+int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info);
+int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info);
+int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info);
+int ethnl_tunnel_info_start(struct netlink_callback *cb);
+int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int ethnl_act_module_fw_flash(struct sk_buff *skb, struct genl_info *info);
+int ethnl_rss_dump_start(struct netlink_callback *cb);
+int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int ethnl_tsinfo_start(struct netlink_callback *cb);
+int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int ethnl_tsinfo_done(struct netlink_callback *cb);
+int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info);
+int ethnl_rss_delete_doit(struct sk_buff *skb, struct genl_info *info);
+
+/* Tables of fixed-width (ETH_GSTRING_LEN) strings, one row per value of the
+ * corresponding *_CNT enumerator. Presumably the names of the statistics
+ * string sets; the definitions are not in this header.
+ */
+extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN];
+extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN];
+extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN];
+extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN];
+extern const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN];
+extern const char stats_phy_names[__ETHTOOL_A_STATS_PHY_CNT][ETH_GSTRING_LEN];
+
+#endif /* _NET_ETHTOOL_NETLINK_H */
diff --git a/net/ethtool/pause.c b/net/ethtool/pause.c
new file mode 100644
index 000000000000..0f9af1e66548
--- /dev/null
+++ b/net/ethtool/pause.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+
+/* Parsed PAUSE_GET request: the generic request info plus the statistics
+ * source stored by pause_parse_request().
+ */
+struct pause_req_info {
+ struct ethnl_req_info base;
+ enum ethtool_mac_stats_src src;
+};
+
+/* Convert a pointer to the embedded @base member into its pause_req_info. */
+#define PAUSE_REQINFO(__req_base) \
+ container_of(__req_base, struct pause_req_info, base)
+
+/* Data gathered by pause_prepare_data(): the pause parameters and, when
+ * statistics were requested, the pause frame counters.
+ */
+struct pause_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_pauseparam pauseparam;
+ struct ethtool_pause_stats pausestat;
+};
+
+/* Convert a pointer to the embedded @base member into its pause_reply_data. */
+#define PAUSE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct pause_reply_data, base)
+
+/* PAUSE_GET attribute policy: a nested request header (validated against
+ * ethnl_header_policy_stats) and an optional u32 statistics source whose
+ * value may not exceed ETHTOOL_MAC_STATS_SRC_PMAC.
+ */
+const struct nla_policy ethnl_pause_get_policy[] = {
+ [ETHTOOL_A_PAUSE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_stats),
+ [ETHTOOL_A_PAUSE_STATS_SRC] =
+ NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC),
+};
+
+/* Record which statistics source a PAUSE_GET request asks for.
+ *
+ * Without ETHTOOL_A_PAUSE_STATS_SRC the aggregate source is used. When the
+ * attribute is present the request must also carry ETHTOOL_FLAG_STATS,
+ * otherwise -EINVAL is returned and an extack message is set.
+ *
+ * Returns 0 on success or -EINVAL.
+ */
+static int pause_parse_request(struct ethnl_req_info *req_base,
+			       struct nlattr **tb,
+			       struct netlink_ext_ack *extack)
+{
+	struct pause_req_info *pause_req = PAUSE_REQINFO(req_base);
+	const struct nlattr *src_attr = tb[ETHTOOL_A_PAUSE_STATS_SRC];
+
+	if (!src_attr) {
+		pause_req->src = ETHTOOL_MAC_STATS_SRC_AGGREGATE;
+		return 0;
+	}
+
+	if (!(req_base->flags & ETHTOOL_FLAG_STATS)) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "ETHTOOL_FLAG_STATS must be set when requesting a source of stats");
+		return -EINVAL;
+	}
+
+	pause_req->src = nla_get_u32(src_attr);
+	return 0;
+}
+
+/* Collect pause parameters (and optionally pause statistics) for a reply.
+ *
+ * The statistics block is initialised and tagged with the requested source
+ * before the device is touched. EMAC/PMAC sources are only accepted when
+ * __ethtool_dev_mm_supported() says so.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when the driver has no get_pauseparam
+ * callback or the MAC merge layer is required but unsupported, or the
+ * negative error from ethnl_ops_begin().
+ */
+static int pause_prepare_data(const struct ethnl_req_info *req_base,
+			      struct ethnl_reply_data *reply_base,
+			      const struct genl_info *info)
+{
+	struct pause_reply_data *data = PAUSE_REPDATA(reply_base);
+	enum ethtool_mac_stats_src src = PAUSE_REQINFO(req_base)->src;
+	struct net_device *dev = reply_base->dev;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	bool needs_mm;
+	int ret;
+
+	if (!ops->get_pauseparam)
+		return -EOPNOTSUPP;
+
+	ethtool_stats_init((u64 *)&data->pausestat,
+			   sizeof(data->pausestat) / 8);
+	data->pausestat.src = src;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		return ret;
+
+	needs_mm = src == ETHTOOL_MAC_STATS_SRC_EMAC ||
+		   src == ETHTOOL_MAC_STATS_SRC_PMAC;
+	if (needs_mm && !__ethtool_dev_mm_supported(dev)) {
+		NL_SET_ERR_MSG_MOD(info->extack,
+				   "Device does not support MAC merge layer");
+		ret = -EOPNOTSUPP;
+		goto out_complete;
+	}
+
+	ops->get_pauseparam(dev, &data->pauseparam);
+	if ((req_base->flags & ETHTOOL_FLAG_STATS) && ops->get_pause_stats)
+		ops->get_pause_stats(dev, &data->pausestat);
+	ret = 0;
+
+out_complete:
+	ethnl_ops_complete(dev);
+	return ret;
+}
+
+/* Upper bound on the attribute payload of a PAUSE_GET reply.
+ *
+ * Three u8 attributes are always counted; the statistics nest, its source
+ * and ETHTOOL_PAUSE_STAT_CNT 64-bit counters are added only when the request
+ * has ETHTOOL_FLAG_STATS set.
+ */
+static int pause_reply_size(const struct ethnl_req_info *req_base,
+			    const struct ethnl_reply_data *reply_base)
+{
+	int len;
+
+	len  = nla_total_size(sizeof(u8));	/* _PAUSE_AUTONEG */
+	len += nla_total_size(sizeof(u8));	/* _PAUSE_RX */
+	len += nla_total_size(sizeof(u8));	/* _PAUSE_TX */
+
+	if (!(req_base->flags & ETHTOOL_FLAG_STATS))
+		return len;
+
+	len += nla_total_size(0);		/* _PAUSE_STATS */
+	len += nla_total_size(sizeof(u32));	/* _PAUSE_STATS_SRC */
+	len += nla_total_size_64bit(sizeof(u64)) * ETHTOOL_PAUSE_STAT_CNT;
+
+	return len;
+}
+
+/* Emit one 64-bit counter attribute.
+ *
+ * A value equal to ETHTOOL_STAT_NOT_SET is skipped and reported as success.
+ * Returns 0, or -EMSGSIZE when the attribute does not fit into @skb.
+ */
+static int ethtool_put_stat(struct sk_buff *skb, u64 val, u16 attrtype,
+			    u16 padtype)
+{
+	if (val != ETHTOOL_STAT_NOT_SET &&
+	    nla_put_u64_64bit(skb, attrtype, val, padtype))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+/* Write the statistics source followed by an ETHTOOL_A_PAUSE_STATS nest
+ * holding the TX and RX pause frame counters.
+ *
+ * If a counter does not fit, the nest is cancelled again.
+ * Returns 0 on success or -EMSGSIZE.
+ */
+static int pause_put_stats(struct sk_buff *skb,
+			   const struct ethtool_pause_stats *pause_stats)
+{
+	struct nlattr *stats_nest;
+	int err;
+
+	if (nla_put_u32(skb, ETHTOOL_A_PAUSE_STATS_SRC, pause_stats->src))
+		return -EMSGSIZE;
+
+	stats_nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS);
+	if (!stats_nest)
+		return -EMSGSIZE;
+
+	err = ethtool_put_stat(skb, pause_stats->tx_pause_frames,
+			       ETHTOOL_A_PAUSE_STAT_TX_FRAMES,
+			       ETHTOOL_A_PAUSE_STAT_PAD);
+	if (!err)
+		err = ethtool_put_stat(skb, pause_stats->rx_pause_frames,
+				       ETHTOOL_A_PAUSE_STAT_RX_FRAMES,
+				       ETHTOOL_A_PAUSE_STAT_PAD);
+	if (err) {
+		nla_nest_cancel(skb, stats_nest);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, stats_nest);
+	return 0;
+}
+
+/* Fill a PAUSE_GET reply: autoneg/rx/tx as 0-or-1 u8 attributes, followed
+ * by the statistics when the request has ETHTOOL_FLAG_STATS set.
+ *
+ * Returns 0 on success or -EMSGSIZE when @skb runs out of room.
+ */
+static int pause_fill_reply(struct sk_buff *skb,
+			    const struct ethnl_req_info *req_base,
+			    const struct ethnl_reply_data *reply_base)
+{
+	const struct pause_reply_data *data = PAUSE_REPDATA(reply_base);
+	const struct ethtool_pauseparam *pp = &data->pauseparam;
+
+	if (nla_put_u8(skb, ETHTOOL_A_PAUSE_AUTONEG, !!pp->autoneg))
+		return -EMSGSIZE;
+	if (nla_put_u8(skb, ETHTOOL_A_PAUSE_RX, !!pp->rx_pause))
+		return -EMSGSIZE;
+	if (nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pp->tx_pause))
+		return -EMSGSIZE;
+
+	if (!(req_base->flags & ETHTOOL_FLAG_STATS))
+		return 0;
+
+	return pause_put_stats(skb, &data->pausestat) ? -EMSGSIZE : 0;
+}
+
+/* PAUSE_SET */
+
+/* PAUSE_SET attribute policy: nested header plus the three u8 settings.
+ * ETHTOOL_A_PAUSE_STATS_SRC is explicitly rejected (NLA_REJECT) on set.
+ */
+const struct nla_policy ethnl_pause_set_policy[] = {
+ [ETHTOOL_A_PAUSE_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 },
+ [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 },
+ [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 },
+ [ETHTOOL_A_PAUSE_STATS_SRC] = { .type = NLA_REJECT },
+};
+
+/* A PAUSE_SET request needs both the get_pauseparam and the set_pauseparam
+ * driver callbacks. Returns 1 when both exist, -EOPNOTSUPP otherwise.
+ */
+static int
+ethnl_set_pause_validate(struct ethnl_req_info *req_info,
+			 struct genl_info *info)
+{
+	const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+
+	if (!ops->get_pauseparam || !ops->set_pauseparam)
+		return -EOPNOTSUPP;
+
+	return 1;
+}
+
+/* Apply a PAUSE_SET request.
+ *
+ * The current parameters are read from the driver, the attributes present
+ * in the request are merged in, and set_pauseparam is called only if
+ * something changed.
+ *
+ * Returns 0 when nothing changed, 1 after a successful update, or the
+ * negative error from the driver's set_pauseparam callback.
+ */
+static int
+ethnl_set_pause(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+	struct net_device *dev = req_info->dev;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct nlattr **tb = info->attrs;
+	struct ethtool_pauseparam pp = {};
+	bool changed = false;
+	int err;
+
+	ops->get_pauseparam(dev, &pp);
+
+	ethnl_update_bool32(&pp.autoneg, tb[ETHTOOL_A_PAUSE_AUTONEG], &changed);
+	ethnl_update_bool32(&pp.rx_pause, tb[ETHTOOL_A_PAUSE_RX], &changed);
+	ethnl_update_bool32(&pp.tx_pause, tb[ETHTOOL_A_PAUSE_TX], &changed);
+	if (!changed)
+		return 0;
+
+	err = ops->set_pauseparam(dev, &pp);
+	if (err < 0)
+		return err;
+
+	return 1;
+}
+
+/* Request-ops table for pause parameters: ties the PAUSE_GET command and
+ * its reply to the parse/prepare/size/fill callbacks above, and supplies
+ * the validate/set callbacks plus the notification command for the set path.
+ */
+const struct ethnl_request_ops ethnl_pause_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PAUSE_GET,
+ .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_PAUSE_HEADER,
+ .req_info_size = sizeof(struct pause_req_info),
+ .reply_data_size = sizeof(struct pause_reply_data),
+
+ .parse_request = pause_parse_request,
+ .prepare_data = pause_prepare_data,
+ .reply_size = pause_reply_size,
+ .fill_reply = pause_fill_reply,
+
+ .set_validate = ethnl_set_pause_validate,
+ .set = ethnl_set_pause,
+ .set_ntf_cmd = ETHTOOL_MSG_PAUSE_NTF,
+};
diff --git a/net/ethtool/phc_vclocks.c b/net/ethtool/phc_vclocks.c
new file mode 100644
index 000000000000..cadaabed60bd
--- /dev/null
+++ b/net/ethtool/phc_vclocks.c
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2021 NXP
+ */
+#include "netlink.h"
+#include "common.h"
+
+/* PHC_VCLOCKS_GET request state; nothing beyond the generic request info. */
+struct phc_vclocks_req_info {
+ struct ethnl_req_info base;
+};
+
+/* PHC_VCLOCKS_GET reply data.
+ * @num is the return value of ethtool_get_phc_vclocks(); @index is the
+ * buffer that call hands back and phc_vclocks_cleanup_data() kfree()s.
+ */
+struct phc_vclocks_reply_data {
+ struct ethnl_reply_data base;
+ int num;
+ int *index;
+};
+
+/* Convert a pointer to the embedded @base into its phc_vclocks_reply_data. */
+#define PHC_VCLOCKS_REPDATA(__reply_base) \
+ container_of(__reply_base, struct phc_vclocks_reply_data, base)
+
+/* PHC_VCLOCKS_GET accepts only the nested request header. */
+const struct nla_policy ethnl_phc_vclocks_get_policy[] = {
+ [ETHTOOL_A_PHC_VCLOCKS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+/* Query the device's PHC virtual clocks between ethnl_ops_begin() and
+ * ethnl_ops_complete().
+ *
+ * The count goes into data->num and is not interpreted here; the function
+ * returns the result of ethnl_ops_begin().
+ */
+static int phc_vclocks_prepare_data(const struct ethnl_req_info *req_base,
+				    struct ethnl_reply_data *reply_base,
+				    const struct genl_info *info)
+{
+	struct phc_vclocks_reply_data *data = PHC_VCLOCKS_REPDATA(reply_base);
+	struct net_device *dev = reply_base->dev;
+	int err = ethnl_ops_begin(dev);
+
+	if (err < 0)
+		return err;
+
+	data->num = ethtool_get_phc_vclocks(dev, &data->index);
+	ethnl_ops_complete(dev);
+
+	return err;
+}
+
+/* Reply payload size: nothing when no virtual clocks were reported,
+ * otherwise a u32 count plus an array of data->num s32 indices.
+ */
+static int phc_vclocks_reply_size(const struct ethnl_req_info *req_base,
+				  const struct ethnl_reply_data *reply_base)
+{
+	const struct phc_vclocks_reply_data *data =
+		PHC_VCLOCKS_REPDATA(reply_base);
+
+	if (data->num <= 0)
+		return 0;
+
+	return nla_total_size(sizeof(u32)) +
+	       nla_total_size(sizeof(s32) * data->num);
+}
+
+/* Emit the vclock count and the index array; an empty reply is produced
+ * when data->num is not positive.
+ *
+ * Returns 0 on success or -EMSGSIZE.
+ */
+static int phc_vclocks_fill_reply(struct sk_buff *skb,
+				  const struct ethnl_req_info *req_base,
+				  const struct ethnl_reply_data *reply_base)
+{
+	const struct phc_vclocks_reply_data *data =
+		PHC_VCLOCKS_REPDATA(reply_base);
+	const int count = data->num;
+
+	if (count <= 0)
+		return 0;
+
+	if (nla_put_u32(skb, ETHTOOL_A_PHC_VCLOCKS_NUM, count))
+		return -EMSGSIZE;
+
+	if (nla_put(skb, ETHTOOL_A_PHC_VCLOCKS_INDEX,
+		    sizeof(s32) * count, data->index))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+/* Free the index array stored in the reply data. */
+static void phc_vclocks_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+	kfree(PHC_VCLOCKS_REPDATA(reply_base)->index);
+}
+
+/* Request-ops table for PHC_VCLOCKS_GET; get-only, with a cleanup hook that
+ * frees the index array.
+ */
+const struct ethnl_request_ops ethnl_phc_vclocks_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET,
+ .reply_cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_PHC_VCLOCKS_HEADER,
+ .req_info_size = sizeof(struct phc_vclocks_req_info),
+ .reply_data_size = sizeof(struct phc_vclocks_reply_data),
+
+ .prepare_data = phc_vclocks_prepare_data,
+ .reply_size = phc_vclocks_reply_size,
+ .fill_reply = phc_vclocks_fill_reply,
+ .cleanup_data = phc_vclocks_cleanup_data,
+};
diff --git a/net/ethtool/phy.c b/net/ethtool/phy.c
new file mode 100644
index 000000000000..68372bef4b2f
--- /dev/null
+++ b/net/ethtool/phy.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Bootlin
+ *
+ */
+#include "common.h"
+#include "netlink.h"
+
+#include <linux/phy.h>
+#include <linux/phy_link_topology.h>
+#include <linux/sfp.h>
+#include <net/netdev_lock.h>
+
+/* PHY_GET request state; nothing beyond the generic request info. */
+struct phy_req_info {
+ struct ethnl_req_info base;
+};
+
+/* PHY_GET reply data filled by phy_prepare_data().
+ * The four string members are kstrdup() copies owned by this structure and
+ * released in phy_cleanup_data(). @upstream_index is only written when the
+ * upstream is another PHY.
+ */
+struct phy_reply_data {
+ struct ethnl_reply_data base;
+ u32 phyindex;
+ char *drvname;
+ char *name;
+ unsigned int upstream_type;
+ char *upstream_sfp_name;
+ unsigned int upstream_index;
+ char *downstream_sfp_name;
+};
+
+/* Convert a pointer to the embedded @base member into its phy_reply_data. */
+#define PHY_REPDATA(__reply_base) \
+ container_of(__reply_base, struct phy_reply_data, base)
+
+/* PHY_GET accepts only the nested request header. */
+const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1] = {
+ [ETHTOOL_A_PHY_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+/* Size of the PHY_GET reply payload.
+ *
+ * The PHY index, upstream type and PHY name are always counted; the driver
+ * name, upstream index and both SFP names only when they are set in the
+ * reply data.
+ */
+static int phy_reply_size(const struct ethnl_req_info *req_info,
+			  const struct ethnl_reply_data *reply_data)
+{
+	const struct phy_reply_data *rep = PHY_REPDATA(reply_data);
+	size_t len;
+
+	/* ETHTOOL_A_PHY_INDEX and ETHTOOL_A_PHY_UPSTREAM_TYPE */
+	len = nla_total_size(sizeof(u32));
+	len += nla_total_size(sizeof(u32));
+
+	/* ETHTOOL_A_PHY_NAME */
+	len += nla_total_size(strlen(rep->name) + 1);
+
+	/* ETHTOOL_A_PHY_DRVNAME */
+	if (rep->drvname)
+		len += nla_total_size(strlen(rep->drvname) + 1);
+
+	/* ETHTOOL_A_PHY_UPSTREAM_INDEX */
+	if (rep->upstream_index)
+		len += nla_total_size(sizeof(u32));
+
+	/* ETHTOOL_A_PHY_UPSTREAM_SFP_NAME */
+	if (rep->upstream_sfp_name)
+		len += nla_total_size(strlen(rep->upstream_sfp_name) + 1);
+
+	/* ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME */
+	if (rep->downstream_sfp_name)
+		len += nla_total_size(strlen(rep->downstream_sfp_name) + 1);
+
+	return len;
+}
+
+/* Gather the PHY_GET reply for the PHY selected by the request header.
+ *
+ * Looks the PHY up in the device's link topology and copies its index,
+ * device name, driver name, upstream type/index and the names of the SFP
+ * buses it sits on or drives.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when no PHY or no topology node is
+ * found, or -ENOMEM when the PHY name cannot be duplicated.
+ */
+static int phy_prepare_data(const struct ethnl_req_info *req_info,
+			    struct ethnl_reply_data *reply_data,
+			    const struct genl_info *info)
+{
+	struct phy_link_topology *topo = reply_data->dev->link_topo;
+	struct phy_reply_data *rep_data = PHY_REPDATA(reply_data);
+	struct nlattr **tb = info->attrs;
+	struct phy_device_node *pdn;
+	struct phy_device *phydev;
+
+	/* RTNL is held by the caller */
+	phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PHY_HEADER,
+				      info->extack);
+	if (IS_ERR_OR_NULL(phydev))
+		return -EOPNOTSUPP;
+
+	pdn = xa_load(&topo->phys, phydev->phyindex);
+	if (!pdn)
+		return -EOPNOTSUPP;
+
+	/* phy_reply_size() and phy_fill_reply() use the name without a NULL
+	 * check, so a failed allocation has to fail the request here. It is
+	 * the first allocation, so there is nothing to unwind.
+	 */
+	rep_data->name = kstrdup(dev_name(&phydev->mdio.dev), GFP_KERNEL);
+	if (!rep_data->name)
+		return -ENOMEM;
+
+	rep_data->phyindex = phydev->phyindex;
+	/* The remaining strings are optional: a NULL copy only means the
+	 * attribute is left out of the reply.
+	 */
+	rep_data->drvname = kstrdup(phydev->drv->name, GFP_KERNEL);
+	rep_data->upstream_type = pdn->upstream_type;
+
+	if (pdn->upstream_type == PHY_UPSTREAM_PHY) {
+		struct phy_device *upstream = pdn->upstream.phydev;
+
+		rep_data->upstream_index = upstream->phyindex;
+	}
+
+	if (pdn->parent_sfp_bus)
+		rep_data->upstream_sfp_name = kstrdup(sfp_get_name(pdn->parent_sfp_bus),
+						      GFP_KERNEL);
+
+	if (phydev->sfp_bus)
+		rep_data->downstream_sfp_name = kstrdup(sfp_get_name(phydev->sfp_bus),
+							GFP_KERNEL);
+
+	return 0;
+}
+
+/* Fill the PHY_GET reply: index, name and upstream type first, then each
+ * optional attribute that is set in the reply data.
+ *
+ * Returns 0 on success or -EMSGSIZE.
+ */
+static int phy_fill_reply(struct sk_buff *skb,
+			  const struct ethnl_req_info *req_info,
+			  const struct ethnl_reply_data *reply_data)
+{
+	const struct phy_reply_data *rep = PHY_REPDATA(reply_data);
+
+	if (nla_put_u32(skb, ETHTOOL_A_PHY_INDEX, rep->phyindex))
+		return -EMSGSIZE;
+	if (nla_put_string(skb, ETHTOOL_A_PHY_NAME, rep->name))
+		return -EMSGSIZE;
+	if (nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_TYPE, rep->upstream_type))
+		return -EMSGSIZE;
+
+	if (rep->drvname &&
+	    nla_put_string(skb, ETHTOOL_A_PHY_DRVNAME, rep->drvname))
+		return -EMSGSIZE;
+
+	if (rep->upstream_index &&
+	    nla_put_u32(skb, ETHTOOL_A_PHY_UPSTREAM_INDEX, rep->upstream_index))
+		return -EMSGSIZE;
+
+	if (rep->upstream_sfp_name &&
+	    nla_put_string(skb, ETHTOOL_A_PHY_UPSTREAM_SFP_NAME,
+			   rep->upstream_sfp_name))
+		return -EMSGSIZE;
+
+	if (rep->downstream_sfp_name &&
+	    nla_put_string(skb, ETHTOOL_A_PHY_DOWNSTREAM_SFP_NAME,
+			   rep->downstream_sfp_name))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+/* Release the string copies held by the reply data. */
+static void phy_cleanup_data(struct ethnl_reply_data *reply_data)
+{
+	struct phy_reply_data *rep = PHY_REPDATA(reply_data);
+
+	kfree(rep->name);
+	kfree(rep->drvname);
+	kfree(rep->upstream_sfp_name);
+	kfree(rep->downstream_sfp_name);
+}
+
+/* Request-ops table for PHY_GET; get-only, with a cleanup hook that frees
+ * the duplicated strings.
+ */
+const struct ethnl_request_ops ethnl_phy_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PHY_GET,
+ .reply_cmd = ETHTOOL_MSG_PHY_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_PHY_HEADER,
+ .req_info_size = sizeof(struct phy_req_info),
+ .reply_data_size = sizeof(struct phy_reply_data),
+
+ .prepare_data = phy_prepare_data,
+ .reply_size = phy_reply_size,
+ .fill_reply = phy_fill_reply,
+ .cleanup_data = phy_cleanup_data,
+};
diff --git a/net/ethtool/plca.c b/net/ethtool/plca.c
new file mode 100644
index 000000000000..e1f7820a6158
--- /dev/null
+++ b/net/ethtool/plca.c
@@ -0,0 +1,271 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/phy.h>
+#include <linux/ethtool_netlink.h>
+
+#include "netlink.h"
+#include "common.h"
+
+// PLCA request state; nothing beyond the generic request info.
+struct plca_req_info {
+ struct ethnl_req_info base;
+};
+
+// Reply data shared by the PLCA get-configuration and get-status requests;
+// each prepare callback fills only the member it reports.
+struct plca_reply_data {
+ struct ethnl_reply_data base;
+ struct phy_plca_cfg plca_cfg;
+ struct phy_plca_status plca_st;
+};
+
+// Helpers ------------------------------------------------------------------ //
+
+// Convert a pointer to the embedded base member into its plca_reply_data.
+#define PLCA_REPDATA(__reply_base) \
+ container_of(__reply_base, struct plca_reply_data, base)
+
+// PLCA get configuration message ------------------------------------------- //
+
+// PLCA_GET_CFG accepts only the nested request header (PHY-aware variant).
+const struct nla_policy ethnl_plca_get_cfg_policy[] = {
+ [ETHTOOL_A_PLCA_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_phy),
+};
+
+// Copy one PLCA attribute from a parsed request into a signed int field.
+//
+// @dst:    destination field; left untouched when the attribute is absent
+// @tb:     parsed attribute table of the request
+// @attrid: attribute index, must be covered by ethnl_plca_set_cfg_policy
+// @mod:    set to true once @dst has been written
+//
+// The width of the attribute is taken from the set policy, so only NLA_U8
+// and NLA_U32 entries are supported. Any other type triggers a one-time
+// warning and leaves both @dst and @mod unchanged.
+static void plca_update_sint(int *dst, struct nlattr **tb, u32 attrid,
+			     bool *mod)
+{
+	const struct nlattr *attr = tb[attrid];
+
+	if (!attr ||
+	    WARN_ON_ONCE(attrid >= ARRAY_SIZE(ethnl_plca_set_cfg_policy)))
+		return;
+
+	switch (ethnl_plca_set_cfg_policy[attrid].type) {
+	case NLA_U8:
+		*dst = nla_get_u8(attr);
+		break;
+	case NLA_U32:
+		*dst = nla_get_u32(attr);
+		break;
+	default:
+		// Nothing was stored, so do not report a modification.
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	*mod = true;
+}
+
+// Read the PLCA configuration of the PHY selected by the request.
+//
+// The configuration is pre-filled with 0xff bytes before the PHY callback
+// runs. Returns -EOPNOTSUPP when there is no PHY or no get_plca_cfg
+// callback, the negative error from ethnl_ops_begin(), or the result of
+// get_plca_cfg.
+static int plca_get_cfg_prepare_data(const struct ethnl_req_info *req_base,
+				     struct ethnl_reply_data *reply_base,
+				     const struct genl_info *info)
+{
+	struct plca_reply_data *data = PLCA_REPDATA(reply_base);
+	struct net_device *dev = reply_base->dev;
+	const struct ethtool_phy_ops *ops;
+	struct phy_device *phydev;
+	int ret;
+
+	// the PHY device must be available and connected
+	phydev = ethnl_req_get_phydev(req_base, info->attrs,
+				      ETHTOOL_A_PLCA_HEADER, info->extack);
+	if (IS_ERR_OR_NULL(phydev))
+		return -EOPNOTSUPP;
+
+	// note: rtnl_lock is held already by ethnl_default_doit
+	ops = ethtool_phy_ops;
+	if (!ops || !ops->get_plca_cfg)
+		return -EOPNOTSUPP;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		return ret;
+
+	memset(&data->plca_cfg, 0xff, sizeof(data->plca_cfg));
+	ret = ops->get_plca_cfg(phydev, &data->plca_cfg);
+
+	ethnl_ops_complete(dev);
+	return ret;
+}
+
+// Fixed upper bound for a PLCA_GET_CFG reply: one u16, one u8 and five u32
+// attributes.
+static int plca_get_cfg_reply_size(const struct ethnl_req_info *req_base,
+				   const struct ethnl_reply_data *reply_base)
+{
+	int len;
+
+	len  = nla_total_size(sizeof(u16));	/* _VERSION */
+	len += nla_total_size(sizeof(u8));	/* _ENABLED */
+	len += nla_total_size(sizeof(u32));	/* _NODE_CNT */
+	len += nla_total_size(sizeof(u32));	/* _NODE_ID */
+	len += nla_total_size(sizeof(u32));	/* _TO_TIMER */
+	len += nla_total_size(sizeof(u32));	/* _BURST_COUNT */
+	len += nla_total_size(sizeof(u32));	/* _BURST_TIMER */
+
+	return len;
+}
+
+// Fill a PLCA_GET_CFG reply.
+//
+// Only fields holding a non-negative value are emitted; fields still at the
+// 0xff fill applied in plca_get_cfg_prepare_data() are negative and are
+// therefore omitted. The enabled flag is normalised to 0 or 1.
+//
+// Returns 0 on success or -EMSGSIZE.
+static int plca_get_cfg_fill_reply(struct sk_buff *skb,
+				   const struct ethnl_req_info *req_base,
+				   const struct ethnl_reply_data *reply_base)
+{
+	const struct plca_reply_data *data = PLCA_REPDATA(reply_base);
+	const struct phy_plca_cfg *plca = &data->plca_cfg;
+
+	if ((plca->version >= 0 &&
+	     nla_put_u16(skb, ETHTOOL_A_PLCA_VERSION, plca->version)) ||
+	    (plca->enabled >= 0 &&
+	     nla_put_u8(skb, ETHTOOL_A_PLCA_ENABLED, !!plca->enabled)) ||
+	    (plca->node_id >= 0 &&
+	     nla_put_u32(skb, ETHTOOL_A_PLCA_NODE_ID, plca->node_id)) ||
+	    (plca->node_cnt >= 0 &&
+	     nla_put_u32(skb, ETHTOOL_A_PLCA_NODE_CNT, plca->node_cnt)) ||
+	    (plca->to_tmr >= 0 &&
+	     nla_put_u32(skb, ETHTOOL_A_PLCA_TO_TMR, plca->to_tmr)) ||
+	    (plca->burst_cnt >= 0 &&
+	     nla_put_u32(skb, ETHTOOL_A_PLCA_BURST_CNT, plca->burst_cnt)) ||
+	    (plca->burst_tmr >= 0 &&
+	     nla_put_u32(skb, ETHTOOL_A_PLCA_BURST_TMR, plca->burst_tmr)))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+// PLCA set configuration message ------------------------------------------- //
+
+// PLCA_SET_CFG attribute policy. The enabled flag is a u8 limited to 0..1;
+// every other setting is a u32 limited to 255, with the node count also
+// required to be at least 1. plca_update_sint() relies on the .type of these
+// entries to pick the matching nla_get_*() accessor.
+const struct nla_policy ethnl_plca_set_cfg_policy[] = {
+ [ETHTOOL_A_PLCA_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_phy),
+ [ETHTOOL_A_PLCA_ENABLED] = NLA_POLICY_MAX(NLA_U8, 1),
+ [ETHTOOL_A_PLCA_NODE_ID] = NLA_POLICY_MAX(NLA_U32, 255),
+ [ETHTOOL_A_PLCA_NODE_CNT] = NLA_POLICY_RANGE(NLA_U32, 1, 255),
+ [ETHTOOL_A_PLCA_TO_TMR] = NLA_POLICY_MAX(NLA_U32, 255),
+ [ETHTOOL_A_PLCA_BURST_CNT] = NLA_POLICY_MAX(NLA_U32, 255),
+ [ETHTOOL_A_PLCA_BURST_TMR] = NLA_POLICY_MAX(NLA_U32, 255),
+};
+
+// Apply a PLCA_SET_CFG request.
+//
+// The configuration starts out as all 0xff bytes and only the attributes
+// present in the request overwrite their field. The PHY callback is invoked
+// only when at least one field was set.
+//
+// Returns 0 when nothing was set, 1 after a successful update, -EOPNOTSUPP
+// when there is no PHY or no set_plca_cfg callback, or the negative error
+// from set_plca_cfg.
+static int
+ethnl_set_plca(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+	const struct ethtool_phy_ops *ops;
+	struct nlattr **tb = info->attrs;
+	struct phy_device *phydev;
+	struct phy_plca_cfg cfg;
+	bool changed = false;
+	int err;
+
+	// the PHY device must be available and connected
+	phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PLCA_HEADER,
+				      info->extack);
+	if (IS_ERR_OR_NULL(phydev))
+		return -EOPNOTSUPP;
+
+	ops = ethtool_phy_ops;
+	if (!ops || !ops->set_plca_cfg)
+		return -EOPNOTSUPP;
+
+	memset(&cfg, 0xff, sizeof(cfg));
+	plca_update_sint(&cfg.enabled, tb, ETHTOOL_A_PLCA_ENABLED, &changed);
+	plca_update_sint(&cfg.node_id, tb, ETHTOOL_A_PLCA_NODE_ID, &changed);
+	plca_update_sint(&cfg.node_cnt, tb, ETHTOOL_A_PLCA_NODE_CNT, &changed);
+	plca_update_sint(&cfg.to_tmr, tb, ETHTOOL_A_PLCA_TO_TMR, &changed);
+	plca_update_sint(&cfg.burst_cnt, tb, ETHTOOL_A_PLCA_BURST_CNT,
+			 &changed);
+	plca_update_sint(&cfg.burst_tmr, tb, ETHTOOL_A_PLCA_BURST_TMR,
+			 &changed);
+	if (!changed)
+		return 0;
+
+	err = ops->set_plca_cfg(phydev, &cfg, info->extack);
+	if (err < 0)
+		return err;
+
+	return 1;
+}
+
+// Request-ops table for the PLCA configuration: get callbacks plus the set
+// callback and its notification command. No set_validate hook is installed.
+const struct ethnl_request_ops ethnl_plca_cfg_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PLCA_GET_CFG,
+ .reply_cmd = ETHTOOL_MSG_PLCA_GET_CFG_REPLY,
+ .hdr_attr = ETHTOOL_A_PLCA_HEADER,
+ .req_info_size = sizeof(struct plca_req_info),
+ .reply_data_size = sizeof(struct plca_reply_data),
+
+ .prepare_data = plca_get_cfg_prepare_data,
+ .reply_size = plca_get_cfg_reply_size,
+ .fill_reply = plca_get_cfg_fill_reply,
+
+ .set = ethnl_set_plca,
+ .set_ntf_cmd = ETHTOOL_MSG_PLCA_NTF,
+};
+
+// PLCA get status message -------------------------------------------------- //
+
+// PLCA_GET_STATUS accepts only the nested request header (PHY-aware variant).
+const struct nla_policy ethnl_plca_get_status_policy[] = {
+ [ETHTOOL_A_PLCA_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_phy),
+};
+
+// Read the PLCA status of the PHY selected by the request.
+//
+// The status is pre-filled with 0xff bytes before the PHY callback runs.
+// Returns -EOPNOTSUPP when there is no PHY or no get_plca_status callback,
+// the negative error from ethnl_ops_begin(), or the result of
+// get_plca_status.
+static int plca_get_status_prepare_data(const struct ethnl_req_info *req_base,
+					struct ethnl_reply_data *reply_base,
+					const struct genl_info *info)
+{
+	struct plca_reply_data *data = PLCA_REPDATA(reply_base);
+	struct net_device *dev = reply_base->dev;
+	const struct ethtool_phy_ops *ops;
+	struct phy_device *phydev;
+	int ret;
+
+	// the PHY device must be available and connected
+	phydev = ethnl_req_get_phydev(req_base, info->attrs,
+				      ETHTOOL_A_PLCA_HEADER, info->extack);
+	if (IS_ERR_OR_NULL(phydev))
+		return -EOPNOTSUPP;
+
+	// note: rtnl_lock is held already by ethnl_default_doit
+	ops = ethtool_phy_ops;
+	if (!ops || !ops->get_plca_status)
+		return -EOPNOTSUPP;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		return ret;
+
+	memset(&data->plca_st, 0xff, sizeof(data->plca_st));
+	ret = ops->get_plca_status(phydev, &data->plca_st);
+
+	ethnl_ops_complete(dev);
+	return ret;
+}
+
+// A PLCA_GET_STATUS reply carries a single u8 attribute.
+static int plca_get_status_reply_size(const struct ethnl_req_info *req_base,
+				      const struct ethnl_reply_data *reply_base)
+{
+	const int status_len = nla_total_size(sizeof(u8));	/* _STATUS */
+
+	return status_len;
+}
+
+// Fill a PLCA_GET_STATUS reply with the status flag normalised to 0 or 1.
+//
+// Returns 0 on success or -EMSGSIZE.
+static int plca_get_status_fill_reply(struct sk_buff *skb,
+				      const struct ethnl_req_info *req_base,
+				      const struct ethnl_reply_data *reply_base)
+{
+	const struct plca_reply_data *data = PLCA_REPDATA(reply_base);
+	const u8 status = data->plca_st.pst;
+
+	if (nla_put_u8(skb, ETHTOOL_A_PLCA_STATUS, !!status))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+// Request-ops table for PLCA_GET_STATUS; get-only.
+const struct ethnl_request_ops ethnl_plca_status_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PLCA_GET_STATUS,
+ .reply_cmd = ETHTOOL_MSG_PLCA_GET_STATUS_REPLY,
+ .hdr_attr = ETHTOOL_A_PLCA_HEADER,
+ .req_info_size = sizeof(struct plca_req_info),
+ .reply_data_size = sizeof(struct plca_reply_data),
+
+ .prepare_data = plca_get_status_prepare_data,
+ .reply_size = plca_get_status_reply_size,
+ .fill_reply = plca_get_status_fill_reply,
+};
diff --git a/net/ethtool/privflags.c b/net/ethtool/privflags.c
new file mode 100644
index 000000000000..297be6a13ab9
--- /dev/null
+++ b/net/ethtool/privflags.c
@@ -0,0 +1,195 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+/* PRIVFLAGS_GET request state; nothing beyond the generic request info. */
+struct privflags_req_info {
+ struct ethnl_req_info base;
+};
+
+/* PRIVFLAGS_GET reply data.
+ * @priv_flag_names is the name table allocated by
+ * ethnl_get_priv_flags_info() and freed in privflags_cleanup_data();
+ * @n_priv_flags is the flag count after clamping to 32; @priv_flags is the
+ * value returned by the driver's get_priv_flags callback.
+ */
+struct privflags_reply_data {
+ struct ethnl_reply_data base;
+ const char (*priv_flag_names)[ETH_GSTRING_LEN];
+ unsigned int n_priv_flags;
+ u32 priv_flags;
+};
+
+/* Convert a pointer to the embedded @base into its privflags_reply_data. */
+#define PRIVFLAGS_REPDATA(__reply_base) \
+ container_of(__reply_base, struct privflags_reply_data, base)
+
+/* PRIVFLAGS_GET accepts only the nested request header. */
+const struct nla_policy ethnl_privflags_get_policy[] = {
+ [ETHTOOL_A_PRIVFLAGS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+/* Look up how many private flags a device has and, optionally, their names.
+ *
+ * @dev:   device to query; get_sset_count (and get_strings when @names is
+ *         given) must be implemented by its ethtool_ops
+ * @count: receives the flag count, clamped to 32
+ * @names: if not NULL, receives a kcalloc()ed name table that the caller
+ *         must kfree()
+ *
+ * Returns 0 on success, the negative value from get_sset_count, or -ENOMEM.
+ */
+static int ethnl_get_priv_flags_info(struct net_device *dev,
+				     unsigned int *count,
+				     const char (**names)[ETH_GSTRING_LEN])
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	int reported = ops->get_sset_count(dev, ETH_SS_PRIV_FLAGS);
+
+	if (reported < 0)
+		return reported;
+
+	if (names) {
+		*names = kcalloc(reported, ETH_GSTRING_LEN, GFP_KERNEL);
+		if (!*names)
+			return -ENOMEM;
+		ops->get_strings(dev, ETH_SS_PRIV_FLAGS, (u8 *)*names);
+	}
+
+	/* Netlink could carry more than 32 private flags, but
+	 * ethtool_ops::get_priv_flags() cannot return more. The clamp has to
+	 * come after the name buffer is allocated: the driver fills in names
+	 * for every flag it reported, so the buffer needs the full size.
+	 */
+	if (WARN_ONCE(reported > 32,
+		      "device %s reports more than 32 private flags (%d)\n",
+		      netdev_name(dev), reported))
+		reported = 32;
+	*count = reported;
+
+	return 0;
+}
+
+/* Collect the private flag names, count and current value for a reply.
+ *
+ * Returns -EOPNOTSUPP when the driver lacks get_priv_flags, get_sset_count
+ * or get_strings, the negative error from ethnl_ops_begin(), or the result
+ * of ethnl_get_priv_flags_info().
+ */
+static int privflags_prepare_data(const struct ethnl_req_info *req_base,
+				  struct ethnl_reply_data *reply_base,
+				  const struct genl_info *info)
+{
+	struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base);
+	struct net_device *dev = reply_base->dev;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	const char (*names)[ETH_GSTRING_LEN];
+	unsigned int nflags;
+	int ret;
+
+	if (!ops->get_priv_flags || !ops->get_sset_count || !ops->get_strings)
+		return -EOPNOTSUPP;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		return ret;
+
+	ret = ethnl_get_priv_flags_info(dev, &nflags, &names);
+	if (ret >= 0) {
+		data->priv_flags = ops->get_priv_flags(dev);
+		data->priv_flag_names = names;
+		data->n_priv_flags = nflags;
+	}
+
+	ethnl_ops_complete(dev);
+	return ret;
+}
+
+/* Size of the bitset attribute carrying the private flags.
+ *
+ * The mask passed to ethnl_bitset32_size() has the low n_priv_flags bits
+ * set. n_priv_flags is at most 32, but it can be 0 when the driver reports
+ * no private flags; shifting a 32-bit value by 32 is undefined, so that
+ * case gets an empty mask.
+ */
+static int privflags_reply_size(const struct ethnl_req_info *req_base,
+				const struct ethnl_reply_data *reply_base)
+{
+	const struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base);
+	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+	const u32 all_flags = data->n_priv_flags ?
+			      ~(u32)0 >> (32 - data->n_priv_flags) : 0;
+
+	return ethnl_bitset32_size(&data->priv_flags, &all_flags,
+				   data->n_priv_flags,
+				   data->priv_flag_names, compact);
+}
+
+/* Emit the private flags as an ETHTOOL_A_PRIVFLAGS_FLAGS bitset.
+ *
+ * The mask has the low n_priv_flags bits set. n_priv_flags is at most 32,
+ * but it can be 0 when the driver reports no private flags; shifting a
+ * 32-bit value by 32 is undefined, so that case gets an empty mask.
+ *
+ * Returns the result of ethnl_put_bitset32().
+ */
+static int privflags_fill_reply(struct sk_buff *skb,
+				const struct ethnl_req_info *req_base,
+				const struct ethnl_reply_data *reply_base)
+{
+	const struct privflags_reply_data *data = PRIVFLAGS_REPDATA(reply_base);
+	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+	const u32 all_flags = data->n_priv_flags ?
+			      ~(u32)0 >> (32 - data->n_priv_flags) : 0;
+
+	return ethnl_put_bitset32(skb, ETHTOOL_A_PRIVFLAGS_FLAGS,
+				  &data->priv_flags, &all_flags,
+				  data->n_priv_flags, data->priv_flag_names,
+				  compact);
+}
+
+/* Free the flag name table stored in the reply data. */
+static void privflags_cleanup_data(struct ethnl_reply_data *reply_data)
+{
+	kfree(PRIVFLAGS_REPDATA(reply_data)->priv_flag_names);
+}
+
+/* PRIVFLAGS_SET */
+
+/* PRIVFLAGS_SET attribute policy: nested header plus the nested bitset
+ * holding the flags to change.
+ */
+const struct nla_policy ethnl_privflags_set_policy[] = {
+ [ETHTOOL_A_PRIVFLAGS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_PRIVFLAGS_FLAGS] = { .type = NLA_NESTED },
+};
+
+/* Validate a PRIVFLAGS_SET request.
+ *
+ * Returns -EINVAL when the flags attribute is missing, -EOPNOTSUPP when the
+ * driver lacks any of get_priv_flags, set_priv_flags, get_sset_count or
+ * get_strings, and 1 otherwise.
+ */
+static int
+ethnl_set_privflags_validate(struct ethnl_req_info *req_info,
+			     struct genl_info *info)
+{
+	const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+
+	if (!info->attrs[ETHTOOL_A_PRIVFLAGS_FLAGS])
+		return -EINVAL;
+
+	if (ops->get_priv_flags && ops->set_priv_flags &&
+	    ops->get_sset_count && ops->get_strings)
+		return 1;
+
+	return -EOPNOTSUPP;
+}
+
+/* Apply a PRIVFLAGS_SET request.
+ *
+ * Flag names are fetched only for the non-compact bitset form. The current
+ * flags are read from the driver, updated from the request bitset, and
+ * written back only if they changed.
+ *
+ * Returns 1 after a successful update, a negative error from any step, or
+ * the non-negative result of ethnl_update_bitset32() when nothing changed.
+ */
+static int
+ethnl_set_privflags(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+	const char (*names)[ETH_GSTRING_LEN] = NULL;
+	struct net_device *dev = req_info->dev;
+	struct nlattr *flags_attr = info->attrs[ETHTOOL_A_PRIVFLAGS_FLAGS];
+	unsigned int nflags;
+	bool changed = false;
+	bool compact;
+	u32 flags;
+	int ret;
+
+	ret = ethnl_bitset_is_compact(flags_attr, &compact);
+	if (ret < 0)
+		return ret;
+
+	ret = ethnl_get_priv_flags_info(dev, &nflags, compact ? NULL : &names);
+	if (ret < 0)
+		return ret;
+	flags = dev->ethtool_ops->get_priv_flags(dev);
+
+	ret = ethnl_update_bitset32(&flags, nflags, flags_attr, names,
+				    info->extack, &changed);
+	if (ret >= 0 && changed) {
+		ret = dev->ethtool_ops->set_priv_flags(dev, flags);
+		if (ret >= 0)
+			ret = 1;
+	}
+
+	kfree(names);
+	return ret;
+}
+
+/* Request-ops table for private flags: get callbacks with a cleanup hook,
+ * plus the validate/set callbacks and notification command for the set path.
+ */
+const struct ethnl_request_ops ethnl_privflags_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PRIVFLAGS_GET,
+ .reply_cmd = ETHTOOL_MSG_PRIVFLAGS_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_PRIVFLAGS_HEADER,
+ .req_info_size = sizeof(struct privflags_req_info),
+ .reply_data_size = sizeof(struct privflags_reply_data),
+
+ .prepare_data = privflags_prepare_data,
+ .reply_size = privflags_reply_size,
+ .fill_reply = privflags_fill_reply,
+ .cleanup_data = privflags_cleanup_data,
+
+ .set_validate = ethnl_set_privflags_validate,
+ .set = ethnl_set_privflags,
+ .set_ntf_cmd = ETHTOOL_MSG_PRIVFLAGS_NTF,
+};
diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c
new file mode 100644
index 000000000000..24def9c9dd54
--- /dev/null
+++ b/net/ethtool/pse-pd.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//
+// ethtool interface for Ethernet PSE (Power Sourcing Equipment)
+// and PD (Powered Device)
+//
+// Copyright (c) 2022 Pengutronix, Oleksij Rempel <kernel@pengutronix.de>
+//
+
+#include "common.h"
+#include "linux/pse-pd/pse.h"
+#include "netlink.h"
+#include <linux/ethtool_netlink.h>
+#include <linux/ethtool.h>
+#include <linux/export.h>
+#include <linux/phy.h>
+
+/* Parsed PSE request; holds nothing beyond the common ethnl request info. */
+struct pse_req_info {
+ struct ethnl_req_info base;
+};
+
+/*
+ * Reply state for PSE_GET: the common ethnl reply data plus the PSE status
+ * that pse_get_pse_attributes() fills in.
+ */
+struct pse_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_pse_control_status status;
+};
+
+/* Get the enclosing struct pse_reply_data from a pointer to its 'base'. */
+#define PSE_REPDATA(__reply_base) \
+ container_of(__reply_base, struct pse_reply_data, base)
+
+/* PSE_GET */
+
+/*
+ * Attribute policy for PSE_GET: the table is sized to end at
+ * ETHTOOL_A_PSE_HEADER, whose nested content is checked against
+ * ethnl_header_policy_phy.
+ */
+const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1] = {
+ [ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy),
+};
+
+/*
+ * Query the PSE controller attached to @phydev and store the result in
+ * @data->status (which is zeroed first).
+ *
+ * Returns -EOPNOTSUPP with an extack message when @phydev is NULL or has no
+ * PSE controller; otherwise returns the result of pse_ethtool_get_status().
+ */
+static int pse_get_pse_attributes(struct phy_device *phydev,
+				  struct netlink_ext_ack *extack,
+				  struct pse_reply_data *data)
+{
+	struct ethtool_pse_control_status *status = &data->status;
+
+	if (!phydev) {
+		NL_SET_ERR_MSG(extack, "No PHY found");
+		return -EOPNOTSUPP;
+	}
+	if (!phydev->psec) {
+		NL_SET_ERR_MSG(extack, "No PSE is attached");
+		return -EOPNOTSUPP;
+	}
+
+	memset(status, 0, sizeof(*status));
+	return pse_ethtool_get_status(phydev->psec, extack, status);
+}
+
+/*
+ * Gather the data for a PSE_GET reply.
+ *
+ * Resolves the PHY targeted by the request and reads its PSE status into the
+ * reply data, bracketed by ethnl_ops_begin()/ethnl_ops_complete().
+ *
+ * Returns 0 on success, the ethnl_ops_begin() error if the device cannot be
+ * accessed, -ENODEV if PHY lookup fails, or the error reported by
+ * pse_get_pse_attributes().
+ */
+static int pse_prepare_data(const struct ethnl_req_info *req_base,
+			    struct ethnl_reply_data *reply_base,
+			    const struct genl_info *info)
+{
+	struct pse_reply_data *data = PSE_REPDATA(reply_base);
+	struct net_device *dev = reply_base->dev;
+	struct nlattr **tb = info->attrs;
+	struct phy_device *phydev;
+	int ret;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		return ret;
+
+	phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_PSE_HEADER,
+				      info->extack);
+	if (IS_ERR(phydev)) {
+		/* ethnl_ops_begin() succeeded, so it must still be paired
+		 * with ethnl_ops_complete() on this error path
+		 */
+		ret = -ENODEV;
+		goto out_complete;
+	}
+
+	ret = pse_get_pse_attributes(phydev, info->extack, data);
+
+out_complete:
+	ethnl_ops_complete(dev);
+
+	return ret;
+}
+
+/*
+ * Estimate the attribute payload size of a PSE_GET reply.
+ *
+ * Counts one u32 attribute per status field that pse_fill_reply() would
+ * emit, then adds the nested power-limit ranges (one nest holding two u32
+ * attributes per range).
+ */
+static int pse_reply_size(const struct ethnl_req_info *req_base,
+			  const struct ethnl_reply_data *reply_base)
+{
+	const struct ethtool_pse_control_status *st =
+		&PSE_REPDATA(reply_base)->status;
+	int nr_u32 = 0;
+	int len;
+
+	if (st->pw_d_id)
+		nr_u32++;	/* _PSE_PW_D_ID */
+	if (st->podl_admin_state > 0)
+		nr_u32++;	/* _PODL_PSE_ADMIN_STATE */
+	if (st->podl_pw_status > 0)
+		nr_u32++;	/* _PODL_PSE_PW_D_STATUS */
+	if (st->c33_admin_state > 0)
+		nr_u32++;	/* _C33_PSE_ADMIN_STATE */
+	if (st->c33_pw_status > 0)
+		nr_u32++;	/* _C33_PSE_PW_D_STATUS */
+	if (st->c33_pw_class > 0)
+		nr_u32++;	/* _C33_PSE_PW_CLASS */
+	if (st->c33_actual_pw > 0)
+		nr_u32++;	/* _C33_PSE_ACTUAL_PW */
+	if (st->c33_ext_state_info.c33_pse_ext_state > 0) {
+		nr_u32++;	/* _C33_PSE_EXT_STATE */
+		/* the substate is only reported together with a state */
+		if (st->c33_ext_state_info.__c33_pse_ext_substate > 0)
+			nr_u32++;	/* _C33_PSE_EXT_SUBSTATE */
+	}
+	if (st->c33_avail_pw_limit > 0)
+		nr_u32++;	/* _C33_AVAIL_PSE_PW_LIMIT */
+	if (st->prio_max)
+		nr_u32 += 2;	/* _PSE_PRIO_MAX + _PSE_PRIO */
+
+	len = nla_total_size(sizeof(u32)) * nr_u32;
+
+	/* _C33_PSE_PW_LIMIT_RANGES */
+	if (st->c33_pw_limit_nb_ranges > 0)
+		len += st->c33_pw_limit_nb_ranges *
+		       (nla_total_size(0) +
+			nla_total_size(sizeof(u32)) * 2);
+
+	return len;
+}
+
+/*
+ * Emit one ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES nest per entry of
+ * st->c33_pw_limit_ranges, each carrying the range's min and max.
+ *
+ * Returns 0 on success or -EMSGSIZE if the skb runs out of room; a nest
+ * that was only partially filled is cancelled before returning.
+ */
+static int pse_put_pw_limit_ranges(struct sk_buff *skb,
+				   const struct ethtool_pse_control_status *st)
+{
+	const struct ethtool_c33_pse_pw_limit_range *range;
+	struct nlattr *nest;
+	int i;
+
+	for (i = 0; i < st->c33_pw_limit_nb_ranges; i++) {
+		range = &st->c33_pw_limit_ranges[i];
+
+		nest = nla_nest_start(skb, ETHTOOL_A_C33_PSE_PW_LIMIT_RANGES);
+		if (!nest)
+			return -EMSGSIZE;
+
+		if (nla_put_u32(skb, ETHTOOL_A_C33_PSE_PW_LIMIT_MIN,
+				range->min))
+			goto err_cancel;
+		if (nla_put_u32(skb, ETHTOOL_A_C33_PSE_PW_LIMIT_MAX,
+				range->max))
+			goto err_cancel;
+
+		nla_nest_end(skb, nest);
+	}
+
+	return 0;
+
+err_cancel:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+/*
+ * Write the PSE status attributes into a PSE_GET reply.
+ *
+ * Each field is emitted only when it is set (non-zero / greater than zero),
+ * mirroring the accounting done in pse_reply_size().
+ *
+ * Returns 0 on success or -EMSGSIZE when an attribute does not fit.
+ */
+static int pse_fill_reply(struct sk_buff *skb,
+			  const struct ethnl_req_info *req_base,
+			  const struct ethnl_reply_data *reply_base)
+{
+	const struct ethtool_pse_control_status *st =
+		&PSE_REPDATA(reply_base)->status;
+
+	if (st->pw_d_id) {
+		if (nla_put_u32(skb, ETHTOOL_A_PSE_PW_D_ID, st->pw_d_id))
+			return -EMSGSIZE;
+	}
+
+	if (st->podl_admin_state > 0) {
+		if (nla_put_u32(skb, ETHTOOL_A_PODL_PSE_ADMIN_STATE,
+				st->podl_admin_state))
+			return -EMSGSIZE;
+	}
+
+	if (st->podl_pw_status > 0) {
+		if (nla_put_u32(skb, ETHTOOL_A_PODL_PSE_PW_D_STATUS,
+				st->podl_pw_status))
+			return -EMSGSIZE;
+	}
+
+	if (st->c33_admin_state > 0) {
+		if (nla_put_u32(skb, ETHTOOL_A_C33_PSE_ADMIN_STATE,
+				st->c33_admin_state))
+			return -EMSGSIZE;
+	}
+
+	if (st->c33_pw_status > 0) {
+		if (nla_put_u32(skb, ETHTOOL_A_C33_PSE_PW_D_STATUS,
+				st->c33_pw_status))
+			return -EMSGSIZE;
+	}
+
+	if (st->c33_pw_class > 0) {
+		if (nla_put_u32(skb, ETHTOOL_A_C33_PSE_PW_CLASS,
+				st->c33_pw_class))
+			return -EMSGSIZE;
+	}
+
+	if (st->c33_actual_pw > 0) {
+		if (nla_put_u32(skb, ETHTOOL_A_C33_PSE_ACTUAL_PW,
+				st->c33_actual_pw))
+			return -EMSGSIZE;
+	}
+
+	/* the extended substate is only reported alongside a state */
+	if (st->c33_ext_state_info.c33_pse_ext_state > 0) {
+		if (nla_put_u32(skb, ETHTOOL_A_C33_PSE_EXT_STATE,
+				st->c33_ext_state_info.c33_pse_ext_state))
+			return -EMSGSIZE;
+
+		if (st->c33_ext_state_info.__c33_pse_ext_substate > 0) {
+			if (nla_put_u32(skb, ETHTOOL_A_C33_PSE_EXT_SUBSTATE,
+					st->c33_ext_state_info.__c33_pse_ext_substate))
+				return -EMSGSIZE;
+		}
+	}
+
+	if (st->c33_avail_pw_limit > 0) {
+		if (nla_put_u32(skb, ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT,
+				st->c33_avail_pw_limit))
+			return -EMSGSIZE;
+	}
+
+	if (st->c33_pw_limit_nb_ranges > 0) {
+		if (pse_put_pw_limit_ranges(skb, st))
+			return -EMSGSIZE;
+	}
+
+	/* the current priority is only reported together with its maximum */
+	if (st->prio_max) {
+		if (nla_put_u32(skb, ETHTOOL_A_PSE_PRIO_MAX, st->prio_max))
+			return -EMSGSIZE;
+		if (nla_put_u32(skb, ETHTOOL_A_PSE_PRIO, st->prio))
+			return -EMSGSIZE;
+	}
+
+	return 0;
+}
+
+/* Free the power-limit range array held in the PSE reply status. */
+static void pse_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+	kfree(PSE_REPDATA(reply_base)->status.c33_pw_limit_ranges);
+}
+
+/* PSE_SET */
+
+/*
+ * Attribute policy for PSE_SET. Both admin-control attributes are u32 values
+ * range-limited to DISABLED..ENABLED; the power limit and priority are plain
+ * u32 with no range enforced here.
+ */
+const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = {
+ [ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy),
+ [ETHTOOL_A_PODL_PSE_ADMIN_CONTROL] =
+ NLA_POLICY_RANGE(NLA_U32, ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED,
+ ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED),
+ [ETHTOOL_A_C33_PSE_ADMIN_CONTROL] =
+ NLA_POLICY_RANGE(NLA_U32, ETHTOOL_C33_PSE_ADMIN_STATE_DISABLED,
+ ETHTOOL_C33_PSE_ADMIN_STATE_ENABLED),
+ [ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT] = { .type = NLA_U32 },
+ [ETHTOOL_A_PSE_PRIO] = { .type = NLA_U32 },
+};
+
+/*
+ * Check that a PSE_SET request can be served by @phydev.
+ *
+ * Returns 0 when the request may proceed. Returns -EOPNOTSUPP (with an
+ * extack message) when @phydev is NULL or an error pointer, when it has no
+ * PSE controller, or when an admin-control attribute is given for a PSE
+ * type (PoDL / C33) the controller does not have.
+ */
+static int
+ethnl_set_pse_validate(struct phy_device *phydev, struct genl_info *info)
+{
+	struct nlattr *podl_ctl = info->attrs[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL];
+	struct nlattr *c33_ctl = info->attrs[ETHTOOL_A_C33_PSE_ADMIN_CONTROL];
+
+	if (IS_ERR_OR_NULL(phydev)) {
+		NL_SET_ERR_MSG(info->extack, "No PHY is attached");
+		return -EOPNOTSUPP;
+	}
+
+	if (!phydev->psec) {
+		NL_SET_ERR_MSG(info->extack, "No PSE is attached");
+		return -EOPNOTSUPP;
+	}
+
+	if (podl_ctl && !pse_has_podl(phydev->psec)) {
+		NL_SET_ERR_MSG_ATTR(info->extack, podl_ctl,
+				    "setting PoDL PSE admin control not supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (c33_ctl && !pse_has_c33(phydev->psec)) {
+		NL_SET_ERR_MSG_ATTR(info->extack, c33_ctl,
+				    "setting C33 PSE admin control not supported");
+		return -EOPNOTSUPP;
+	}
+
+	return 0;
+}
+
+/*
+ * Apply a PSE_SET request.
+ *
+ * After validating the target PHY, applies in this order whichever of the
+ * priority, available power limit and admin control attributes are present.
+ * Processing stops at the first failure.
+ *
+ * Returns 0 on success or a negative errno; it never returns a positive
+ * value because PSE has no notification.
+ */
+static int
+ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+	struct pse_control_config config = {};
+	struct nlattr **tb = info->attrs;
+	struct nlattr *podl_ctl, *c33_ctl;
+	struct phy_device *phydev;
+	int err;
+
+	phydev = ethnl_req_get_phydev(req_info, tb, ETHTOOL_A_PSE_HEADER,
+				      info->extack);
+	/* the validator also rejects a NULL or error-pointer phydev */
+	err = ethnl_set_pse_validate(phydev, info);
+	if (err)
+		return err;
+
+	if (tb[ETHTOOL_A_PSE_PRIO]) {
+		err = pse_ethtool_set_prio(phydev->psec, info->extack,
+					   nla_get_u32(tb[ETHTOOL_A_PSE_PRIO]));
+		if (err)
+			return err;
+	}
+
+	if (tb[ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT]) {
+		err = pse_ethtool_set_pw_limit(phydev->psec, info->extack,
+					       nla_get_u32(tb[ETHTOOL_A_C33_PSE_AVAIL_PW_LIMIT]));
+		if (err)
+			return err;
+	}
+
+	podl_ctl = tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL];
+	c33_ctl = tb[ETHTOOL_A_C33_PSE_ADMIN_CONTROL];
+	if (!podl_ctl && !c33_ctl)
+		return 0;
+
+	/* These values are already validated by the ethnl_pse_set_policy */
+	if (podl_ctl)
+		config.podl_admin_control = nla_get_u32(podl_ctl);
+	if (c33_ctl)
+		config.c33_admin_control = nla_get_u32(c33_ctl);
+
+	/* pse_ethtool_set_config() will do nothing if the config
+	 * is zero
+	 */
+	return pse_ethtool_set_config(phydev->psec, info->extack, &config);
+}
+
+/*
+ * Request ops table for PSE: wires the PSE_GET handlers and the PSE_SET
+ * handler. No .set_validate is registered here; ethnl_set_pse() calls
+ * ethnl_set_pse_validate() itself. No .set_ntf_cmd is set.
+ */
+const struct ethnl_request_ops ethnl_pse_request_ops = {
+ .request_cmd = ETHTOOL_MSG_PSE_GET,
+ .reply_cmd = ETHTOOL_MSG_PSE_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_PSE_HEADER,
+ .req_info_size = sizeof(struct pse_req_info),
+ .reply_data_size = sizeof(struct pse_reply_data),
+
+ .prepare_data = pse_prepare_data,
+ .reply_size = pse_reply_size,
+ .fill_reply = pse_fill_reply,
+ .cleanup_data = pse_cleanup_data,
+
+ .set = ethnl_set_pse,
+ /* PSE has no notification */
+};
+
+/*
+ * Multicast an ETHTOOL_MSG_PSE_NTF message carrying the @notifs event mask
+ * for @netdev. Asserts that RTNL is held. Does nothing when @netdev is NULL
+ * or @notifs is zero; allocation or message-building failures are silent
+ * (the notification is dropped).
+ */
+void ethnl_pse_send_ntf(struct net_device *netdev, unsigned long notifs)
+{
+	struct sk_buff *msg;
+	void *hdr;
+	int size;
+
+	ASSERT_RTNL();
+
+	if (!netdev || !notifs)
+		return;
+
+	/* NOTE(review): the estimate reserves a u32 payload while the value
+	 * is written with nla_put_uint() from an unsigned long - confirm the
+	 * event mask always fits in 32 bits.
+	 */
+	size = ethnl_reply_header_size() +
+	       nla_total_size(sizeof(u32)); /* _PSE_NTF_EVENTS */
+
+	msg = genlmsg_new(size, GFP_KERNEL);
+	if (!msg)
+		return;
+
+	hdr = ethnl_bcastmsg_put(msg, ETHTOOL_MSG_PSE_NTF);
+	if (!hdr ||
+	    ethnl_fill_reply_header(msg, netdev, ETHTOOL_A_PSE_NTF_HEADER) < 0 ||
+	    nla_put_uint(msg, ETHTOOL_A_PSE_NTF_EVENTS, notifs)) {
+		nlmsg_free(msg);
+		return;
+	}
+
+	genlmsg_end(msg, hdr);
+	ethnl_multicast(msg, netdev);
+}
+EXPORT_SYMBOL_GPL(ethnl_pse_send_ntf);
diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c
new file mode 100644
index 000000000000..aeedd5ec6b8c
--- /dev/null
+++ b/net/ethtool/rings.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <net/netdev_queues.h>
+
+#include "netlink.h"
+#include "common.h"
+
+/* Parsed RINGS request; holds nothing beyond the common ethnl request info. */
+struct rings_req_info {
+ struct ethnl_req_info base;
+};
+
+/*
+ * Reply state for RINGS_GET: both ring parameter structures passed to the
+ * driver's get_ringparam() plus a copy of the driver's
+ * supported_ring_params mask.
+ */
+struct rings_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_ringparam ringparam;
+ struct kernel_ethtool_ringparam kernel_ringparam;
+ u32 supported_ring_params;
+};
+
+/* Get the enclosing struct rings_reply_data from a pointer to its 'base'. */
+#define RINGS_REPDATA(__reply_base) \
+ container_of(__reply_base, struct rings_reply_data, base)
+
+/* Attribute policy for RINGS_GET: only the nested request header is listed. */
+const struct nla_policy ethnl_rings_get_policy[] = {
+ [ETHTOOL_A_RINGS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+/*
+ * Gather the data for a RINGS_GET reply.
+ *
+ * Seeds tcp_data_split and hds_thresh from dev->cfg, then lets the driver's
+ * get_ringparam() fill both ring parameter structures.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when the driver has no get_ringparam(),
+ * or the error returned by ethnl_ops_begin().
+ */
+static int rings_prepare_data(const struct ethnl_req_info *req_base,
+			      struct ethnl_reply_data *reply_base,
+			      const struct genl_info *info)
+{
+	struct rings_reply_data *data = RINGS_REPDATA(reply_base);
+	struct kernel_ethtool_ringparam *kr = &data->kernel_ringparam;
+	struct net_device *dev = reply_base->dev;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	int err;
+
+	if (!ops->get_ringparam)
+		return -EOPNOTSUPP;
+
+	data->supported_ring_params = ops->supported_ring_params;
+
+	err = ethnl_ops_begin(dev);
+	if (err < 0)
+		return err;
+
+	kr->tcp_data_split = dev->cfg->hds_config;
+	kr->hds_thresh = dev->cfg->hds_thresh;
+	ops->get_ringparam(dev, &data->ringparam, kr, info->extack);
+
+	ethnl_ops_complete(dev);
+
+	return 0;
+}
+
+/*
+ * Upper bound on the attribute payload of a RINGS_GET reply: one term per
+ * attribute that rings_fill_reply() may emit, regardless of whether it will
+ * actually be present.
+ *
+ * Each attribute is sized by its own nla_total_size() call. The closing
+ * parenthesis of the _RINGS_CQE_SIZE term used to sit after _RINGS_RX_PUSH,
+ * nesting the two following sizes inside its argument.
+ */
+static int rings_reply_size(const struct ethnl_req_info *req_base,
+			    const struct ethnl_reply_data *reply_base)
+{
+	return nla_total_size(sizeof(u32)) +	/* _RINGS_RX_MAX */
+	       nla_total_size(sizeof(u32)) +	/* _RINGS_RX_MINI_MAX */
+	       nla_total_size(sizeof(u32)) +	/* _RINGS_RX_JUMBO_MAX */
+	       nla_total_size(sizeof(u32)) +	/* _RINGS_TX_MAX */
+	       nla_total_size(sizeof(u32)) +	/* _RINGS_RX */
+	       nla_total_size(sizeof(u32)) +	/* _RINGS_RX_MINI */
+	       nla_total_size(sizeof(u32)) +	/* _RINGS_RX_JUMBO */
+	       nla_total_size(sizeof(u32)) +	/* _RINGS_TX */
+	       nla_total_size(sizeof(u32)) +	/* _RINGS_RX_BUF_LEN */
+	       nla_total_size(sizeof(u8)) +	/* _RINGS_TCP_DATA_SPLIT */
+	       nla_total_size(sizeof(u32)) +	/* _RINGS_CQE_SIZE */
+	       nla_total_size(sizeof(u8)) +	/* _RINGS_TX_PUSH */
+	       nla_total_size(sizeof(u8)) +	/* _RINGS_RX_PUSH */
+	       nla_total_size(sizeof(u32)) +	/* _RINGS_TX_PUSH_BUF_LEN */
+	       nla_total_size(sizeof(u32)) +	/* _RINGS_TX_PUSH_BUF_LEN_MAX */
+	       nla_total_size(sizeof(u32)) +	/* _RINGS_HDS_THRESH */
+	       nla_total_size(sizeof(u32));	/* _RINGS_HDS_THRESH_MAX*/
+}
+
+/*
+ * Write the ring parameters into a RINGS_GET reply.
+ *
+ * The whole body is one short-circuit expression, so attributes are written
+ * in the order listed and the first failing put ends the function:
+ * - each (max, current) ring size pair only when its maximum is non-zero;
+ * - rx_buf_len, tcp_data_split and cqe_size only when non-zero;
+ * - tx_push and rx_push always, normalised to 0/1;
+ * - the tx push buffer length pair and the hds threshold pair only when the
+ * matching ETHTOOL_RING_USE_* bit is in supported_ring_params.
+ *
+ * Returns 0 on success or -EMSGSIZE when an attribute does not fit.
+ */
+static int rings_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct rings_reply_data *data = RINGS_REPDATA(reply_base);
+ const struct kernel_ethtool_ringparam *kr = &data->kernel_ringparam;
+ const struct ethtool_ringparam *ringparam = &data->ringparam;
+ u32 supported_ring_params = data->supported_ring_params;
+
+ /* warn (but carry on) if the value is above the highest one the SET policy accepts */
+ WARN_ON(kr->tcp_data_split > ETHTOOL_TCP_DATA_SPLIT_ENABLED);
+
+ if ((ringparam->rx_max_pending &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MAX,
+ ringparam->rx_max_pending) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_RX,
+ ringparam->rx_pending))) ||
+ (ringparam->rx_mini_max_pending &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI_MAX,
+ ringparam->rx_mini_max_pending) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_RX_MINI,
+ ringparam->rx_mini_pending))) ||
+ (ringparam->rx_jumbo_max_pending &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO_MAX,
+ ringparam->rx_jumbo_max_pending) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_RX_JUMBO,
+ ringparam->rx_jumbo_pending))) ||
+ (ringparam->tx_max_pending &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_MAX,
+ ringparam->tx_max_pending) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_TX,
+ ringparam->tx_pending))) ||
+ (kr->rx_buf_len &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_RX_BUF_LEN, kr->rx_buf_len))) ||
+ (kr->tcp_data_split &&
+ (nla_put_u8(skb, ETHTOOL_A_RINGS_TCP_DATA_SPLIT,
+ kr->tcp_data_split))) ||
+ (kr->cqe_size &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_CQE_SIZE, kr->cqe_size))) ||
+ nla_put_u8(skb, ETHTOOL_A_RINGS_TX_PUSH, !!kr->tx_push) ||
+ nla_put_u8(skb, ETHTOOL_A_RINGS_RX_PUSH, !!kr->rx_push) ||
+ ((supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN) &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX,
+ kr->tx_push_buf_max_len) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN,
+ kr->tx_push_buf_len))) ||
+ ((supported_ring_params & ETHTOOL_RING_USE_HDS_THRS) &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH,
+ kr->hds_thresh) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH_MAX,
+ kr->hds_thresh_max))))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+/* RINGS_SET */
+
+/*
+ * Attribute policy for RINGS_SET. Ring sizes, tx push buffer length and hds
+ * threshold are unconstrained u32 here (limits are checked in
+ * ethnl_set_rings()); rx buf len and cqe size must be at least 1;
+ * tcp data split is capped at ETHTOOL_TCP_DATA_SPLIT_ENABLED and the push
+ * flags at 1.
+ */
+const struct nla_policy ethnl_rings_set_policy[] = {
+ [ETHTOOL_A_RINGS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_RINGS_RX] = { .type = NLA_U32 },
+ [ETHTOOL_A_RINGS_RX_MINI] = { .type = NLA_U32 },
+ [ETHTOOL_A_RINGS_RX_JUMBO] = { .type = NLA_U32 },
+ [ETHTOOL_A_RINGS_TX] = { .type = NLA_U32 },
+ [ETHTOOL_A_RINGS_RX_BUF_LEN] = NLA_POLICY_MIN(NLA_U32, 1),
+ [ETHTOOL_A_RINGS_TCP_DATA_SPLIT] =
+ NLA_POLICY_MAX(NLA_U8, ETHTOOL_TCP_DATA_SPLIT_ENABLED),
+ [ETHTOOL_A_RINGS_CQE_SIZE] = NLA_POLICY_MIN(NLA_U32, 1),
+ [ETHTOOL_A_RINGS_TX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1),
+ [ETHTOOL_A_RINGS_RX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1),
+ [ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] = { .type = NLA_U32 },
+ [ETHTOOL_A_RINGS_HDS_THRESH] = { .type = NLA_U32 },
+};
+
+/*
+ * Pre-check a RINGS_SET request against the driver's capabilities.
+ *
+ * For every optional attribute present in the request, the matching
+ * ETHTOOL_RING_USE_* bit must be set in ops->supported_ring_params;
+ * otherwise the offending attribute is reported through extack and
+ * -EOPNOTSUPP is returned.
+ *
+ * Returns 1 when all checks pass and the driver implements both
+ * get_ringparam() and set_ringparam(), -EOPNOTSUPP otherwise.
+ */
+static int
+ethnl_set_rings_validate(struct ethnl_req_info *req_info,
+ struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+ struct nlattr **tb = info->attrs;
+
+ if (tb[ETHTOOL_A_RINGS_RX_BUF_LEN] &&
+ !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_BUF_LEN)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_RX_BUF_LEN],
+ "setting rx buf len not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT] &&
+ !(ops->supported_ring_params & ETHTOOL_RING_USE_TCP_DATA_SPLIT)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT],
+ "setting TCP data split is not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[ETHTOOL_A_RINGS_HDS_THRESH] &&
+ !(ops->supported_ring_params & ETHTOOL_RING_USE_HDS_THRS)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_HDS_THRESH],
+ "setting hds-thresh is not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[ETHTOOL_A_RINGS_CQE_SIZE] &&
+ !(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_CQE_SIZE],
+ "setting cqe size not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[ETHTOOL_A_RINGS_TX_PUSH] &&
+ !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_TX_PUSH],
+ "setting tx push not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[ETHTOOL_A_RINGS_RX_PUSH] &&
+ !(ops->supported_ring_params & ETHTOOL_RING_USE_RX_PUSH)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_RX_PUSH],
+ "setting rx push not supported");
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] &&
+ !(ops->supported_ring_params & ETHTOOL_RING_USE_TX_PUSH_BUF_LEN)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN],
+ "setting tx push buf len is not supported");
+ return -EOPNOTSUPP;
+ }
+
+ /* the SET path reads the current values first, so both callbacks are required */
+ return ops->get_ringparam && ops->set_ringparam ? 1 : -EOPNOTSUPP;
+}
+
+/*
+ * Apply a RINGS_SET request.
+ *
+ * Starts from the current configuration, overlays every attribute present
+ * in the request, validates the result and passes it to the driver's
+ * set_ringparam().
+ *
+ * Returns 0 when the request changes nothing, 1 after a successful driver
+ * update, -EINVAL when a consistency or limit check fails, or the negative
+ * error returned by set_ringparam().
+ */
+static int
+ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ struct kernel_ethtool_ringparam kernel_ringparam;
+ struct net_device *dev = req_info->dev;
+ struct ethtool_ringparam ringparam;
+ struct nlattr **tb = info->attrs;
+ const struct nlattr *err_attr;
+ bool mod = false;
+ int ret;
+
+ /* fetch the current settings as the baseline for the update */
+ ethtool_ringparam_get_cfg(dev, &ringparam, &kernel_ringparam,
+ info->extack);
+
+ /* overlay requested values; 'mod' is the shared change flag handed to every update helper */
+ ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod);
+ ethnl_update_u32(&ringparam.rx_mini_pending,
+ tb[ETHTOOL_A_RINGS_RX_MINI], &mod);
+ ethnl_update_u32(&ringparam.rx_jumbo_pending,
+ tb[ETHTOOL_A_RINGS_RX_JUMBO], &mod);
+ ethnl_update_u32(&ringparam.tx_pending, tb[ETHTOOL_A_RINGS_TX], &mod);
+ ethnl_update_u32(&kernel_ringparam.rx_buf_len,
+ tb[ETHTOOL_A_RINGS_RX_BUF_LEN], &mod);
+ ethnl_update_u8(&kernel_ringparam.tcp_data_split,
+ tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT], &mod);
+ ethnl_update_u32(&kernel_ringparam.cqe_size,
+ tb[ETHTOOL_A_RINGS_CQE_SIZE], &mod);
+ ethnl_update_u8(&kernel_ringparam.tx_push,
+ tb[ETHTOOL_A_RINGS_TX_PUSH], &mod);
+ ethnl_update_u8(&kernel_ringparam.rx_push,
+ tb[ETHTOOL_A_RINGS_RX_PUSH], &mod);
+ ethnl_update_u32(&kernel_ringparam.tx_push_buf_len,
+ tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], &mod);
+ ethnl_update_u32(&kernel_ringparam.hds_thresh,
+ tb[ETHTOOL_A_RINGS_HDS_THRESH], &mod);
+ if (!mod)
+ return 0;
+
+ if (kernel_ringparam.tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ dev_xdp_sb_prog_count(dev)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT],
+ "tcp-data-split can not be enabled with single buffer XDP");
+ return -EINVAL;
+ }
+
+ /* with a memory provider bound, split must stay enabled and hds_thresh must be zero */
+ if (dev_get_min_mp_channel_count(dev)) {
+ if (kernel_ringparam.tcp_data_split !=
+ ETHTOOL_TCP_DATA_SPLIT_ENABLED) {
+ NL_SET_ERR_MSG(info->extack,
+ "can't disable tcp-data-split while device has memory provider enabled");
+ return -EINVAL;
+ } else if (kernel_ringparam.hds_thresh) {
+ NL_SET_ERR_MSG(info->extack,
+ "can't set non-zero hds_thresh while device is memory provider enabled");
+ return -EINVAL;
+ }
+ }
+
+ /* ensure new ring parameters are within limits */
+ if (ringparam.rx_pending > ringparam.rx_max_pending)
+ err_attr = tb[ETHTOOL_A_RINGS_RX];
+ else if (ringparam.rx_mini_pending > ringparam.rx_mini_max_pending)
+ err_attr = tb[ETHTOOL_A_RINGS_RX_MINI];
+ else if (ringparam.rx_jumbo_pending > ringparam.rx_jumbo_max_pending)
+ err_attr = tb[ETHTOOL_A_RINGS_RX_JUMBO];
+ else if (ringparam.tx_pending > ringparam.tx_max_pending)
+ err_attr = tb[ETHTOOL_A_RINGS_TX];
+ else if (kernel_ringparam.hds_thresh > kernel_ringparam.hds_thresh_max)
+ err_attr = tb[ETHTOOL_A_RINGS_HDS_THRESH];
+ else
+ err_attr = NULL;
+ /* NOTE(review): err_attr is NULL (no error raised) when the limit is exceeded by a value whose attribute was not in the request - confirm this is intended */
+ if (err_attr) {
+ NL_SET_ERR_MSG_ATTR(info->extack, err_attr,
+ "requested ring size exceeds maximum");
+ return -EINVAL;
+ }
+
+ if (kernel_ringparam.tx_push_buf_len > kernel_ringparam.tx_push_buf_max_len) {
+ NL_SET_ERR_MSG_ATTR_FMT(info->extack, tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN],
+ "Requested TX push buffer exceeds the maximum of %u",
+ kernel_ringparam.tx_push_buf_max_len);
+
+ return -EINVAL;
+ }
+
+ /* record the header-data-split settings in the pending device config before calling the driver */
+ dev->cfg_pending->hds_config = kernel_ringparam.tcp_data_split;
+ dev->cfg_pending->hds_thresh = kernel_ringparam.hds_thresh;
+
+ ret = dev->ethtool_ops->set_ringparam(dev, &ringparam,
+ &kernel_ringparam, info->extack);
+ return ret < 0 ? ret : 1;
+}
+
+/*
+ * Request ops table for ring parameters: wires the RINGS_GET handlers and
+ * the RINGS_SET handlers, with ETHTOOL_MSG_RINGS_NTF as the notification
+ * command for SET. No .cleanup_data is registered.
+ */
+const struct ethnl_request_ops ethnl_rings_request_ops = {
+ .request_cmd = ETHTOOL_MSG_RINGS_GET,
+ .reply_cmd = ETHTOOL_MSG_RINGS_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_RINGS_HEADER,
+ .req_info_size = sizeof(struct rings_req_info),
+ .reply_data_size = sizeof(struct rings_reply_data),
+
+ .prepare_data = rings_prepare_data,
+ .reply_size = rings_reply_size,
+ .fill_reply = rings_fill_reply,
+
+ .set_validate = ethnl_set_rings_validate,
+ .set = ethnl_set_rings,
+ .set_ntf_cmd = ETHTOOL_MSG_RINGS_NTF,
+};
diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c
new file mode 100644
index 000000000000..4dced53be4b3
--- /dev/null
+++ b/net/ethtool/rss.c
@@ -0,0 +1,1205 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <net/netdev_lock.h>
+
+#include "netlink.h"
+#include "common.h"
+
+/*
+ * Parsed RSS request: the common ethnl request info plus the RSS context id
+ * taken from ETHTOOL_A_RSS_CONTEXT. The code in this file sends context 0 to
+ * the driver's get_rxfh() and looks non-zero ids up in the rss_ctx xarray.
+ */
+struct rss_req_info {
+ struct ethnl_req_info base;
+ u32 rss_context;
+};
+
+/*
+ * Reply state for RSS_GET.
+ *
+ * indir_table and hkey point into a single allocation (table first, key
+ * right after it); indir_size counts u32 entries, hkey_size counts bytes.
+ * flow_hash[] is indexed by ETHTOOL_A_FLOW_* and holds -1 for flow types
+ * the driver reported an error for.
+ */
+struct rss_reply_data {
+ struct ethnl_reply_data base;
+ bool has_flow_hash; /* at least one flow_hash[] entry is valid */
+ bool no_key_fields; /* skip hfunc/input_xfrm/hkey in the reply */
+ u32 indir_size;
+ u32 hkey_size;
+ u32 hfunc;
+ u32 input_xfrm;
+ u32 *indir_table;
+ u8 *hkey;
+ int flow_hash[__ETHTOOL_A_FLOW_CNT];
+};
+
+/*
+ * Translation table from netlink flow type (ETHTOOL_A_FLOW_*, the index) to
+ * the *_FLOW constant placed in ethtool_rxfh_fields.flow_type.
+ */
+static const u8 ethtool_rxfh_ft_nl2ioctl[] = {
+ [ETHTOOL_A_FLOW_ETHER] = ETHER_FLOW,
+ [ETHTOOL_A_FLOW_IP4] = IPV4_FLOW,
+ [ETHTOOL_A_FLOW_IP6] = IPV6_FLOW,
+ [ETHTOOL_A_FLOW_TCP4] = TCP_V4_FLOW,
+ [ETHTOOL_A_FLOW_UDP4] = UDP_V4_FLOW,
+ [ETHTOOL_A_FLOW_SCTP4] = SCTP_V4_FLOW,
+ [ETHTOOL_A_FLOW_AH_ESP4] = AH_ESP_V4_FLOW,
+ [ETHTOOL_A_FLOW_TCP6] = TCP_V6_FLOW,
+ [ETHTOOL_A_FLOW_UDP6] = UDP_V6_FLOW,
+ [ETHTOOL_A_FLOW_SCTP6] = SCTP_V6_FLOW,
+ [ETHTOOL_A_FLOW_AH_ESP6] = AH_ESP_V6_FLOW,
+ [ETHTOOL_A_FLOW_AH4] = AH_V4_FLOW,
+ [ETHTOOL_A_FLOW_ESP4] = ESP_V4_FLOW,
+ [ETHTOOL_A_FLOW_AH6] = AH_V6_FLOW,
+ [ETHTOOL_A_FLOW_ESP6] = ESP_V6_FLOW,
+ [ETHTOOL_A_FLOW_GTPU4] = GTPU_V4_FLOW,
+ [ETHTOOL_A_FLOW_GTPU6] = GTPU_V6_FLOW,
+ [ETHTOOL_A_FLOW_GTPC4] = GTPC_V4_FLOW,
+ [ETHTOOL_A_FLOW_GTPC6] = GTPC_V6_FLOW,
+ [ETHTOOL_A_FLOW_GTPC_TEID4] = GTPC_TEID_V4_FLOW,
+ [ETHTOOL_A_FLOW_GTPC_TEID6] = GTPC_TEID_V6_FLOW,
+ [ETHTOOL_A_FLOW_GTPU_EH4] = GTPU_EH_V4_FLOW,
+ [ETHTOOL_A_FLOW_GTPU_EH6] = GTPU_EH_V6_FLOW,
+ [ETHTOOL_A_FLOW_GTPU_UL4] = GTPU_UL_V4_FLOW,
+ [ETHTOOL_A_FLOW_GTPU_UL6] = GTPU_UL_V6_FLOW,
+ [ETHTOOL_A_FLOW_GTPU_DL4] = GTPU_DL_V4_FLOW,
+ [ETHTOOL_A_FLOW_GTPU_DL6] = GTPU_DL_V6_FLOW,
+};
+
+/* Get the enclosing struct rss_req_info from a pointer to its 'base'. */
+#define RSS_REQINFO(__req_base) \
+ container_of(__req_base, struct rss_req_info, base)
+
+/* Get the enclosing struct rss_reply_data from a pointer to its 'base'. */
+#define RSS_REPDATA(__reply_base) \
+ container_of(__reply_base, struct rss_reply_data, base)
+
+/*
+ * Attribute policy for RSS_GET. START_CONTEXT is accepted by the policy but
+ * rejected by rss_parse_request(); it is only honoured by
+ * ethnl_rss_dump_start(), which in turn rejects CONTEXT.
+ */
+const struct nla_policy ethnl_rss_get_policy[] = {
+ [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_RSS_CONTEXT] = { .type = NLA_U32 },
+ [ETHTOOL_A_RSS_START_CONTEXT] = { .type = NLA_U32 },
+};
+
+/*
+ * Parse the RSS-specific attributes of a request.
+ *
+ * Stores ETHTOOL_A_RSS_CONTEXT (if present) in the request. Returns -EINVAL,
+ * flagging the attribute in extack, when ETHTOOL_A_RSS_START_CONTEXT is
+ * present; returns 0 otherwise.
+ */
+static int
+rss_parse_request(struct ethnl_req_info *req_info, struct nlattr **tb,
+		  struct netlink_ext_ack *extack)
+{
+	struct rss_req_info *rss_req = RSS_REQINFO(req_info);
+	struct nlattr *start_ctx = tb[ETHTOOL_A_RSS_START_CONTEXT];
+
+	if (tb[ETHTOOL_A_RSS_CONTEXT])
+		rss_req->rss_context = nla_get_u32(tb[ETHTOOL_A_RSS_CONTEXT]);
+
+	if (!start_ctx)
+		return 0;
+
+	NL_SET_BAD_ATTR(extack, start_ctx);
+	return -EINVAL;
+}
+
+/*
+ * Collect the per-flow-type hash field configuration into data->flow_hash[].
+ *
+ * Leaves has_flow_hash false (and the array untouched) when the driver has
+ * no get_rxfh_fields(), or when a non-zero context is requested and the
+ * driver does not set rxfh_per_ctx_fields. Flow types for which the driver
+ * returns an error are recorded as -1.
+ */
+static void
+rss_prepare_flow_hash(const struct rss_req_info *req, struct net_device *dev,
+		      struct rss_reply_data *data, const struct genl_info *info)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	int ft;
+
+	data->has_flow_hash = false;
+
+	if (!ops->get_rxfh_fields ||
+	    (req->rss_context && !ops->rxfh_per_ctx_fields))
+		return;
+
+	mutex_lock(&dev->ethtool->rss_lock);
+	/* start at 1: index 0 is skipped */
+	for (ft = 1; ft < __ETHTOOL_A_FLOW_CNT; ft++) {
+		struct ethtool_rxfh_fields fields = {
+			.flow_type	= ethtool_rxfh_ft_nl2ioctl[ft],
+			.rss_context	= req->rss_context,
+		};
+
+		if (!ops->get_rxfh_fields(dev, &fields)) {
+			data->flow_hash[ft] = fields.data;
+			data->has_flow_hash = true;
+		} else {
+			data->flow_hash[ft] = -1; /* Unsupported */
+		}
+	}
+	mutex_unlock(&dev->ethtool->rss_lock);
+}
+
+/*
+ * Size and allocate the reply buffers for the device's RSS configuration.
+ *
+ * The indirection table size and key size are asked from the driver (0 when
+ * the respective callback is missing). A single zeroed buffer holds the
+ * table followed by the key; each pointer is only set when its size is
+ * non-zero.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int
+rss_get_data_alloc(struct net_device *dev, struct rss_reply_data *data)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	u32 indir_bytes;
+	u8 *buf;
+
+	data->indir_size = ops->get_rxfh_indir_size ?
+			   ops->get_rxfh_indir_size(dev) : 0;
+	data->hkey_size = ops->get_rxfh_key_size ?
+			  ops->get_rxfh_key_size(dev) : 0;
+
+	indir_bytes = data->indir_size * sizeof(u32);
+	buf = kzalloc(indir_bytes + data->hkey_size, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (data->indir_size)
+		data->indir_table = (u32 *)buf;
+	if (data->hkey_size)
+		data->hkey = buf + indir_bytes;
+
+	return 0;
+}
+
+/*
+ * Release the buffer backing indir_table and hkey.
+ *
+ * Both pointers refer to one allocation whose start is indir_table. When
+ * rss_get_data_alloc() ran with indir_size == 0 it left indir_table NULL and
+ * stored the start of the buffer in hkey instead, so fall back to hkey in
+ * that case; freeing only indir_table would leak the key buffer.
+ */
+static void rss_get_data_free(const struct rss_reply_data *data)
+{
+	const void *buf = data->indir_table;
+
+	/* with no indirection table the key sits at offset 0 of the buffer */
+	if (!buf)
+		buf = data->hkey;
+
+	kfree(buf);
+}
+
+/*
+ * Read the device's RSS configuration through the driver's get_rxfh().
+ *
+ * Runs between ethnl_ops_begin()/ethnl_ops_complete() and under rss_lock.
+ * On success the indirection table and key buffers are filled by the driver
+ * and hfunc/input_xfrm are copied into @data.
+ *
+ * Returns 0 on success or a negative errno. Buffers allocated before a
+ * get_rxfh() failure are not released here.
+ */
+static int
+rss_prepare_get(const struct rss_req_info *request, struct net_device *dev,
+		struct rss_reply_data *data, const struct genl_info *info)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_rxfh_param rxfh = {};
+	int err;
+
+	err = ethnl_ops_begin(dev);
+	if (err < 0)
+		return err;
+
+	mutex_lock(&dev->ethtool->rss_lock);
+
+	err = rss_get_data_alloc(dev, data);
+	if (!err) {
+		rxfh.indir_size = data->indir_size;
+		rxfh.indir = data->indir_table;
+		rxfh.key_size = data->hkey_size;
+		rxfh.key = data->hkey;
+
+		err = ops->get_rxfh(dev, &rxfh);
+		if (!err) {
+			data->hfunc = rxfh.hfunc;
+			data->input_xfrm = rxfh.input_xfrm;
+		}
+	}
+
+	mutex_unlock(&dev->ethtool->rss_lock);
+	ethnl_ops_complete(dev);
+	return err;
+}
+
+/*
+ * Copy the configuration stored in @ctx into already-sized reply buffers.
+ *
+ * Warns once and copies nothing if the sizes recorded in @data do not match
+ * those of @ctx. no_key_fields is set when the driver does not declare
+ * rxfh_per_ctx_key.
+ */
+static void
+__rss_prepare_ctx(struct net_device *dev, struct rss_reply_data *data,
+		  struct ethtool_rxfh_context *ctx)
+{
+	if (WARN_ON_ONCE(data->indir_size != ctx->indir_size ||
+			 data->hkey_size != ctx->key_size))
+		return;
+
+	data->hfunc = ctx->hfunc;
+	data->input_xfrm = ctx->input_xfrm;
+	data->no_key_fields = !dev->ethtool_ops->rxfh_per_ctx_key;
+
+	memcpy(data->indir_table, ethtool_rxfh_context_indir(ctx),
+	       data->indir_size * sizeof(u32));
+
+	if (!data->hkey_size)
+		return;
+
+	memcpy(data->hkey, ethtool_rxfh_context_key(ctx), data->hkey_size);
+}
+
+/*
+ * Prepare reply data for an additional RSS context taken from the
+ * dev->ethtool->rss_ctx xarray.
+ *
+ * Under rss_lock: looks the context up, allocates one buffer for the
+ * indirection table followed by the key, and copies the stored
+ * configuration into it.
+ *
+ * Returns 0 on success, -ENOENT if the context does not exist, -ENOMEM on
+ * allocation failure.
+ */
+static int
+rss_prepare_ctx(const struct rss_req_info *request, struct net_device *dev,
+		struct rss_reply_data *data, const struct genl_info *info)
+{
+	struct ethtool_rxfh_context *ctx;
+	u32 indir_bytes;
+	u8 *buf;
+	int err;
+
+	mutex_lock(&dev->ethtool->rss_lock);
+
+	err = -ENOENT;
+	ctx = xa_load(&dev->ethtool->rss_ctx, request->rss_context);
+	if (!ctx)
+		goto unlock;
+
+	data->indir_size = ctx->indir_size;
+	data->hkey_size = ctx->key_size;
+	indir_bytes = data->indir_size * sizeof(u32);
+
+	err = -ENOMEM;
+	buf = kzalloc(indir_bytes + data->hkey_size, GFP_KERNEL);
+	if (!buf)
+		goto unlock;
+
+	/* unlike rss_get_data_alloc(), indir_table is set unconditionally */
+	data->indir_table = (u32 *)buf;
+	if (data->hkey_size)
+		data->hkey = buf + indir_bytes;
+
+	__rss_prepare_ctx(dev, data, ctx);
+	err = 0;
+
+unlock:
+	mutex_unlock(&dev->ethtool->rss_lock);
+	return err;
+}
+
+/*
+ * Fill @data for one RSS context of @dev.
+ *
+ * Flow hash fields are always collected first. The table/key part is then
+ * read from the stored context for a non-zero context id, or from the
+ * driver for context 0; it is skipped (returning 0) when the driver has no
+ * get_rxfh().
+ */
+static int
+rss_prepare(const struct rss_req_info *request, struct net_device *dev,
+	    struct rss_reply_data *data, const struct genl_info *info)
+{
+	rss_prepare_flow_hash(request, dev, data, info);
+
+	/* Coming from RSS_SET, driver may only have flow_hash_fields ops */
+	if (!dev->ethtool_ops->get_rxfh)
+		return 0;
+
+	return request->rss_context ?
+	       rss_prepare_ctx(request, dev, data, info) :
+	       rss_prepare_get(request, dev, data, info);
+}
+
+/*
+ * prepare_data handler for RSS_GET.
+ *
+ * Returns -EOPNOTSUPP when the driver has no get_rxfh(), or when a non-zero
+ * context is requested and the driver has no create_rxfh_context();
+ * otherwise returns the result of rss_prepare().
+ */
+static int
+rss_prepare_data(const struct ethnl_req_info *req_base,
+		 struct ethnl_reply_data *reply_base,
+		 const struct genl_info *info)
+{
+	const struct ethtool_ops *ops = reply_base->dev->ethtool_ops;
+	struct rss_req_info *request = RSS_REQINFO(req_base);
+
+	/* Some drivers don't handle rss_context */
+	if (!ops->get_rxfh ||
+	    (request->rss_context && !ops->create_rxfh_context))
+		return -EOPNOTSUPP;
+
+	return rss_prepare(request, reply_base->dev, RSS_REPDATA(reply_base),
+			   info);
+}
+
+/*
+ * Upper bound on the attribute payload of an RSS_GET reply: three u32
+ * attributes, the indirection table, the hash key, and the flow hash nest
+ * with room for one u32-sized entry per flow type.
+ */
+static int
+rss_reply_size(const struct ethnl_req_info *req_base,
+	       const struct ethnl_reply_data *reply_base)
+{
+	const struct rss_reply_data *data = RSS_REPDATA(reply_base);
+	const int u32_attr = nla_total_size(sizeof(u32));
+	int len;
+
+	/* _RSS_CONTEXT, _RSS_HFUNC, _RSS_INPUT_XFRM */
+	len = 3 * u32_attr;
+	/* _RSS_INDIR */
+	len += nla_total_size(sizeof(u32) * data->indir_size);
+	/* _RSS_HKEY */
+	len += nla_total_size(data->hkey_size);
+	/* _RSS_FLOW_HASH nest and its entries */
+	len += nla_total_size(0);
+	len += u32_attr * ETHTOOL_A_FLOW_MAX;
+
+	return len;
+}
+
+/*
+ * Write the RSS configuration into a reply message.
+ *
+ * The context id is emitted only when non-zero, the indirection table only
+ * when it has entries. hfunc, input_xfrm and the key are each emitted only
+ * when set and only if no_key_fields is false. Flow hash entries go into an
+ * ETHTOOL_A_RSS_FLOW_HASH nest, skipping negative (unsupported) ones.
+ *
+ * Returns 0 on success or -EMSGSIZE when an attribute does not fit.
+ */
+static int
+rss_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base,
+	       const struct ethnl_reply_data *reply_base)
+{
+	const struct rss_reply_data *data = RSS_REPDATA(reply_base);
+	u32 rss_context = RSS_REQINFO(req_base)->rss_context;
+	struct nlattr *nest;
+	int ft;
+
+	if (rss_context &&
+	    nla_put_u32(skb, ETHTOOL_A_RSS_CONTEXT, rss_context))
+		return -EMSGSIZE;
+
+	if (data->indir_size) {
+		if (nla_put(skb, ETHTOOL_A_RSS_INDIR,
+			    sizeof(u32) * data->indir_size, data->indir_table))
+			return -EMSGSIZE;
+	}
+
+	if (!data->no_key_fields) {
+		if (data->hfunc &&
+		    nla_put_u32(skb, ETHTOOL_A_RSS_HFUNC, data->hfunc))
+			return -EMSGSIZE;
+		if (data->input_xfrm &&
+		    nla_put_u32(skb, ETHTOOL_A_RSS_INPUT_XFRM,
+				data->input_xfrm))
+			return -EMSGSIZE;
+		if (data->hkey_size &&
+		    nla_put(skb, ETHTOOL_A_RSS_HKEY, data->hkey_size,
+			    data->hkey))
+			return -EMSGSIZE;
+	}
+
+	if (!data->has_flow_hash)
+		return 0;
+
+	nest = nla_nest_start(skb, ETHTOOL_A_RSS_FLOW_HASH);
+	if (!nest)
+		return -EMSGSIZE;
+
+	for (ft = 1; ft < __ETHTOOL_A_FLOW_CNT; ft++) {
+		/* negative means the driver reported an error for this type */
+		if (data->flow_hash[ft] < 0)
+			continue;
+
+		if (nla_put_uint(skb, ft, data->flow_hash[ft])) {
+			nla_nest_cancel(skb, nest);
+			return -EMSGSIZE;
+		}
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+}
+
+/* cleanup_data handler: release the buffers held by the RSS reply data. */
+static void rss_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+	rss_get_data_free(RSS_REPDATA(reply_base));
+}
+
+/*
+ * Dump state kept in netlink_callback::ctx. ifindex and ctx_idx are the
+ * resume position; ctx_idx is reset to start_ctx once a device has been
+ * fully dumped.
+ */
+struct rss_nl_dump_ctx {
+ unsigned long ifindex;
+ unsigned long ctx_idx;
+
+ /* User wants to only dump contexts from given ifindex */
+ unsigned int match_ifindex;
+ unsigned int start_ctx;
+};
+
+/*
+ * View the callback's ctx scratch area as the RSS dump state; the size
+ * assertion guards that the structure fits.
+ */
+static struct rss_nl_dump_ctx *rss_dump_ctx(struct netlink_callback *cb)
+{
+	void *scratch = cb->ctx;
+
+	NL_ASSERT_CTX_FITS(struct rss_nl_dump_ctx);
+
+	return scratch;
+}
+
+/*
+ * Set up an RSS dump.
+ *
+ * Rejects ETHTOOL_A_RSS_CONTEXT with -EINVAL, records the optional start
+ * context, and - when the request header names a device - restricts the
+ * dump to that device's ifindex. The device reference taken while parsing
+ * the header is dropped again before returning.
+ *
+ * Returns the result of ethnl_parse_header_dev_get(), or -EINVAL.
+ */
+int ethnl_rss_dump_start(struct netlink_callback *cb)
+{
+	const struct genl_info *info = genl_info_dump(cb);
+	struct rss_nl_dump_ctx *dump = rss_dump_ctx(cb);
+	struct ethnl_req_info req_info = {};
+	struct nlattr **tb = info->attrs;
+	struct nlattr *start_attr;
+	int err;
+
+	/* Filtering by context not supported */
+	if (tb[ETHTOOL_A_RSS_CONTEXT]) {
+		NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_CONTEXT]);
+		return -EINVAL;
+	}
+
+	start_attr = tb[ETHTOOL_A_RSS_START_CONTEXT];
+	if (start_attr) {
+		dump->start_ctx = nla_get_u32(start_attr);
+		dump->ctx_idx = dump->start_ctx;
+	}
+
+	err = ethnl_parse_header_dev_get(&req_info,
+					 tb[ETHTOOL_A_RSS_HEADER],
+					 sock_net(cb->skb->sk), cb->extack,
+					 false);
+	if (!req_info.dev)
+		return err;
+
+	dump->match_ifindex = req_info.dev->ifindex;
+	dump->ifindex = dump->match_ifindex;
+	ethnl_parse_header_dev_put(&req_info);
+	req_info.dev = NULL;
+
+	return err;
+}
+
+static int
+rss_dump_one_ctx(struct sk_buff *skb, struct netlink_callback *cb,
+ struct net_device *dev, u32 rss_context)
+{ /* Append one ETHTOOL_MSG_RSS_GET_REPLY for @rss_context of @dev to @skb; 0 or negative errno */
+ const struct genl_info *info = genl_info_dump(cb);
+ struct rss_reply_data data = {};
+ struct rss_req_info req = {};
+ void *ehdr;
+ int ret;
+
+ req.rss_context = rss_context;
+
+ ehdr = ethnl_dump_put(skb, cb, ETHTOOL_MSG_RSS_GET_REPLY);
+ if (!ehdr)
+ return -EMSGSIZE;
+
+ ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_RSS_HEADER);
+ if (ret < 0)
+ goto err_cancel;
+
+ ret = rss_prepare(&req, dev, &data, info);
+ if (ret)
+ goto err_cancel; /* NOTE(review): assumes rss_prepare() leaves nothing to free on failure */
+
+ ret = rss_fill_reply(skb, &req.base, &data.base);
+ if (ret)
+ goto err_cleanup;
+ genlmsg_end(skb, ehdr);
+
+ rss_cleanup_data(&data.base);
+ return 0;
+
+err_cleanup:
+ rss_cleanup_data(&data.base);
+err_cancel:
+ genlmsg_cancel(skb, ehdr); /* take the partially built message back out of the skb */
+ return ret;
+}
+
+static int
+rss_dump_one_dev(struct sk_buff *skb, struct netlink_callback *cb,
+ struct net_device *dev)
+{ /* Dump context 0 (when the cursor is at 0), then each present entry of dev->ethtool->rss_ctx */
+ struct rss_nl_dump_ctx *ctx = rss_dump_ctx(cb);
+ int ret;
+
+ if (!dev->ethtool_ops->get_rxfh)
+ return 0; /* device without get_rxfh is skipped, not an error */
+
+ if (!ctx->ctx_idx) {
+ ret = rss_dump_one_ctx(skb, cb, dev, 0);
+ if (ret)
+ return ret;
+ ctx->ctx_idx++;
+ }
+
+ for (; xa_find(&dev->ethtool->rss_ctx, &ctx->ctx_idx,
+ ULONG_MAX, XA_PRESENT); ctx->ctx_idx++) {
+ ret = rss_dump_one_ctx(skb, cb, dev, ctx->ctx_idx);
+ if (ret)
+ return ret; /* ctx_idx is left untouched on error */
+ }
+ ctx->ctx_idx = ctx->start_ctx; /* rewind the context cursor for the next device */
+
+ return 0;
+}
+
+int ethnl_rss_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{ /* Dump callback: walk the devices of the socket's netns under rtnl_lock and dump each one */
+ struct rss_nl_dump_ctx *ctx = rss_dump_ctx(cb);
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ if (ctx->match_ifindex && ctx->match_ifindex != ctx->ifindex)
+ break; /* a single device was requested and the cursor is no longer on it */
+
+ netdev_lock_ops(dev);
+ ret = rss_dump_one_dev(skb, cb, dev);
+ netdev_unlock_ops(dev);
+ if (ret)
+ break;
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+
+/* RSS_NTF */
+
+static void ethnl_rss_delete_notify(struct net_device *dev, u32 rss_context)
+{ /* Multicast ETHTOOL_MSG_RSS_DELETE_NTF for @rss_context of @dev; any failure only logs once */
+ struct sk_buff *ntf;
+ size_t ntf_size;
+ void *hdr;
+
+ ntf_size = ethnl_reply_header_size() +
+ nla_total_size(sizeof(u32)); /* _RSS_CONTEXT */
+
+ ntf = genlmsg_new(ntf_size, GFP_KERNEL);
+ if (!ntf)
+ goto out_warn;
+
+ hdr = ethnl_bcastmsg_put(ntf, ETHTOOL_MSG_RSS_DELETE_NTF);
+ if (!hdr)
+ goto out_free_ntf;
+
+ if (ethnl_fill_reply_header(ntf, dev, ETHTOOL_A_RSS_HEADER) ||
+ nla_put_u32(ntf, ETHTOOL_A_RSS_CONTEXT, rss_context))
+ goto out_free_ntf;
+
+ genlmsg_end(ntf, hdr);
+ if (ethnl_multicast(ntf, dev)) /* NOTE(review): skb not freed here, presumably consumed by ethnl_multicast() */
+ goto out_warn;
+
+ return;
+
+out_free_ntf:
+ nlmsg_free(ntf);
+out_warn:
+ pr_warn_once("Failed to send a RSS delete notification\n"); /* newline terminates the log record */
+}
+
+void ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context)
+{
+ struct rss_req_info req = { .rss_context = rss_context };
+
+ /* Deletions use the dedicated notification built in this file */
+ if (type == ETHTOOL_MSG_RSS_DELETE_NTF) {
+ ethnl_rss_delete_notify(dev, rss_context);
+ return;
+ }
+ ethnl_notify(dev, type, &req.base);
+}
+
+/* RSS_SET */
+
+#define RFH_MASK (RXH_L2DA | RXH_VLAN | RXH_IP_SRC | RXH_IP_DST | \
+ RXH_L3_PROTO | RXH_L4_B_0_1 | RXH_L4_B_2_3 | \
+ RXH_GTP_TEID | RXH_DISCARD) /* RXH_* bits accepted by the policy mask of the non-IPv6 flow types */
+#define RFH_MASKv6 (RFH_MASK | RXH_IP6_FL) /* IPv6 flow types additionally accept RXH_IP6_FL */
+
+static const struct nla_policy ethnl_rss_flows_policy[] = { /* per-flow-type hash field masks nested in ETHTOOL_A_RSS_FLOW_HASH */
+ [ETHTOOL_A_FLOW_ETHER] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_IP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_TCP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_UDP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_SCTP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_AH_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_AH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_GTPU4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_GTPC4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_GTPC_TEID4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_GTPU_EH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_GTPU_UL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_GTPU_DL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+};
+
+const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1] = { /* attribute policy, named for the RSS set request */
+ [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_RSS_CONTEXT] = { .type = NLA_U32, },
+ [ETHTOOL_A_RSS_HFUNC] = NLA_POLICY_MIN(NLA_U32, 1),
+ [ETHTOOL_A_RSS_INDIR] = { .type = NLA_BINARY, }, /* no minimum: rss_set_prep_indir() treats an empty table as a reset */
+ [ETHTOOL_A_RSS_HKEY] = NLA_POLICY_MIN(NLA_BINARY, 1),
+ [ETHTOOL_A_RSS_INPUT_XFRM] =
+ NLA_POLICY_MAX(NLA_U32, RXH_XFRM_SYM_OR_XOR),
+ [ETHTOOL_A_RSS_FLOW_HASH] = NLA_POLICY_NESTED(ethnl_rss_flows_policy),
+};
+
+static int
+ethnl_rss_set_validate(struct ethnl_req_info *req_info, struct genl_info *info)
+{ /* .set_validate hook: -EOPNOTSUPP (with bad attr) if the driver lacks a needed capability, else 1 */
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+ struct rss_req_info *request = RSS_REQINFO(req_info);
+ struct nlattr **tb = info->attrs;
+ struct nlattr *bad_attr = NULL;
+ u32 input_xfrm;
+ /* bad_attr ends up as the first *present* attribute that fails a check; absent attrs are NULL and skipped */
+ if (request->rss_context && !ops->create_rxfh_context)
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_CONTEXT];
+
+ if (request->rss_context && !ops->rxfh_per_ctx_key) {
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_HFUNC];
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_HKEY];
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_INPUT_XFRM];
+ }
+
+ input_xfrm = nla_get_u32_default(tb[ETHTOOL_A_RSS_INPUT_XFRM], 0);
+ if (input_xfrm & ~ops->supported_input_xfrm)
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_INPUT_XFRM];
+
+ if (tb[ETHTOOL_A_RSS_FLOW_HASH] && !ops->set_rxfh_fields)
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_FLOW_HASH];
+ if (request->rss_context &&
+ tb[ETHTOOL_A_RSS_FLOW_HASH] && !ops->rxfh_per_ctx_fields)
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_FLOW_HASH];
+
+ if (bad_attr) {
+ NL_SET_BAD_ATTR(info->extack, bad_attr);
+ return -EOPNOTSUPP;
+ }
+
+ return 1; /* NOTE(review): presumably tells the ethnl core to go on and call .set - confirm */
+}
+
+static int
+rss_set_prep_indir(struct net_device *dev, struct genl_info *info,
+ struct rss_reply_data *data, struct ethtool_rxfh_param *rxfh,
+ bool *reset, bool *mod)
+{ /* Build rxfh->indir from ETHTOOL_A_RSS_INDIR; 0 on success or when the attr is absent, else -errno */
+ struct netlink_ext_ack *extack = info->extack;
+ struct nlattr **tb = info->attrs;
+ size_t alloc_size;
+ int num_rx_rings;
+ u32 user_size;
+ int i, err;
+
+ if (!tb[ETHTOOL_A_RSS_INDIR])
+ return 0;
+ if (!data->indir_size)
+ return -EOPNOTSUPP;
+
+ err = ethtool_get_rx_ring_count(dev);
+ if (err < 0)
+ return err;
+ num_rx_rings = err;
+
+ if (nla_len(tb[ETHTOOL_A_RSS_INDIR]) % 4) {
+ NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_INDIR]);
+ return -EINVAL;
+ }
+ user_size = nla_len(tb[ETHTOOL_A_RSS_INDIR]) / 4;
+ if (!user_size) { /* empty attribute asks for the default table; sets *reset */
+ if (rxfh->rss_context) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_RSS_INDIR],
+ "can't reset table for a context");
+ return -EINVAL;
+ }
+ *reset = true;
+ } else if (data->indir_size % user_size) { /* device table must be a whole multiple of the user table */
+ NL_SET_ERR_MSG_ATTR_FMT(extack, tb[ETHTOOL_A_RSS_INDIR],
+ "size (%u) mismatch with device indir table (%u)",
+ user_size, data->indir_size);
+ return -EINVAL;
+ }
+
+ rxfh->indir_size = data->indir_size;
+ alloc_size = array_size(data->indir_size, sizeof(rxfh->indir[0]));
+ rxfh->indir = kzalloc(alloc_size, GFP_KERNEL);
+ if (!rxfh->indir)
+ return -ENOMEM;
+
+ nla_memcpy(rxfh->indir, tb[ETHTOOL_A_RSS_INDIR], alloc_size);
+ for (i = 0; i < user_size; i++) {
+ if (rxfh->indir[i] < num_rx_rings)
+ continue;
+
+ NL_SET_ERR_MSG_ATTR_FMT(extack, tb[ETHTOOL_A_RSS_INDIR],
+ "entry %d: queue out of range (%u)",
+ i, rxfh->indir[i]);
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (user_size) {
+ /* Replicate the user-provided table to fill the device table */
+ for (i = user_size; i < data->indir_size; i++)
+ rxfh->indir[i] = rxfh->indir[i % user_size];
+ } else {
+ for (i = 0; i < data->indir_size; i++)
+ rxfh->indir[i] =
+ ethtool_rxfh_indir_default(i, num_rx_rings);
+ }
+ /* Compare the whole table: the length is in bytes (alloc_size), not in entries */
+ *mod |= memcmp(rxfh->indir, data->indir_table, alloc_size);
+
+ return 0;
+
+err_free:
+ kfree(rxfh->indir);
+ rxfh->indir = NULL; /* callers kfree() rxfh->indir on their own error paths */
+ return err;
+}
+
+static int
+rss_set_prep_hkey(struct net_device *dev, struct genl_info *info,
+ struct rss_reply_data *data, struct ethtool_rxfh_param *rxfh,
+ bool *mod)
+{ /* Build rxfh->key from ETHTOOL_A_RSS_HKEY; 0 on success or when the attr is absent, else -errno */
+ struct nlattr **tb = info->attrs;
+
+ if (!tb[ETHTOOL_A_RSS_HKEY])
+ return 0;
+ /* the key must have exactly the size recorded in @data */
+ if (nla_len(tb[ETHTOOL_A_RSS_HKEY]) != data->hkey_size) {
+ NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_HKEY]);
+ return -EINVAL;
+ }
+
+ rxfh->key_size = data->hkey_size;
+ rxfh->key = kmemdup(data->hkey, data->hkey_size, GFP_KERNEL); /* start from the key in @data; caller frees */
+ if (!rxfh->key)
+ return -ENOMEM;
+
+ ethnl_update_binary(rxfh->key, rxfh->key_size, tb[ETHTOOL_A_RSS_HKEY],
+ mod);
+ return 0;
+}
+
+static int
+rss_check_rxfh_fields_sym(struct net_device *dev, struct genl_info *info,
+ struct rss_reply_data *data, bool xfrm_sym)
+{ /* With @xfrm_sym set, require every reported flow hash config in @data to be symmetric; 0 or -EINVAL */
+ struct nlattr **tb = info->attrs;
+ int i;
+
+ if (!xfrm_sym)
+ return 0;
+ if (!data->has_flow_hash) {
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RSS_INPUT_XFRM],
+ "hash field config not reported");
+ return -EINVAL;
+ }
+ /* negative flow_hash[] entries are skipped, as in the reply fill loop */
+ for (i = 1; i < __ETHTOOL_A_FLOW_CNT; i++)
+ if (data->flow_hash[i] >= 0 &&
+ !ethtool_rxfh_config_is_sym(data->flow_hash[i])) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RSS_INPUT_XFRM],
+ "hash field config is not symmetric");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+ethnl_set_rss_fields(struct net_device *dev, struct genl_info *info,
+ u32 rss_context, struct rss_reply_data *data,
+ bool xfrm_sym, bool *mod)
+{ /* Apply ETHTOOL_A_RSS_FLOW_HASH entries via ops->set_rxfh_fields(); sets *mod on any change; 0 or -errno */
+ struct nlattr *flow_nest = info->attrs[ETHTOOL_A_RSS_FLOW_HASH];
+ struct nlattr *flows[ETHTOOL_A_FLOW_MAX + 1];
+ const struct ethtool_ops *ops;
+ int i, ret;
+
+ ops = dev->ethtool_ops;
+ /* the symmetry check of the current config runs even when no flow hash attr was sent */
+ ret = rss_check_rxfh_fields_sym(dev, info, data, xfrm_sym);
+ if (ret)
+ return ret;
+
+ if (!flow_nest)
+ return 0;
+
+ ret = nla_parse_nested(flows, ARRAY_SIZE(ethnl_rss_flows_policy) - 1,
+ flow_nest, ethnl_rss_flows_policy, info->extack);
+ if (ret < 0)
+ return ret;
+
+ for (i = 1; i < __ETHTOOL_A_FLOW_CNT; i++) {
+ struct ethtool_rxfh_fields fields = {
+ .flow_type = ethtool_rxfh_ft_nl2ioctl[i],
+ .rss_context = rss_context,
+ };
+
+ if (!flows[i])
+ continue;
+ /* NOTE(review): policy type is NLA_UINT but the value is read with nla_get_u32() - confirm */
+ fields.data = nla_get_u32(flows[i]);
+ if (data->has_flow_hash && data->flow_hash[i] == fields.data)
+ continue; /* unchanged, nothing to tell the driver */
+
+ if (xfrm_sym && !ethtool_rxfh_config_is_sym(fields.data)) {
+ NL_SET_ERR_MSG_ATTR(info->extack, flows[i],
+ "conflict with xfrm-input");
+ return -EINVAL;
+ }
+
+ ret = ops->set_rxfh_fields(dev, &fields, info->extack);
+ if (ret)
+ return ret; /* flow types already applied in earlier iterations are not rolled back here */
+
+ *mod = true;
+ }
+
+ return 0;
+}
+
+static void
+rss_set_ctx_update(struct ethtool_rxfh_context *ctx, struct nlattr **tb,
+ struct rss_reply_data *data, struct ethtool_rxfh_param *rxfh)
+{ /* Copy the settings carried in @rxfh into the stored context @ctx; unset/NO_CHANGE fields are kept */
+ int i;
+
+ if (rxfh->indir) {
+ for (i = 0; i < data->indir_size; i++)
+ ethtool_rxfh_context_indir(ctx)[i] = rxfh->indir[i];
+ ctx->indir_configured = !!nla_len(tb[ETHTOOL_A_RSS_INDIR]); /* false when the attr was empty */
+ }
+ if (rxfh->key) {
+ memcpy(ethtool_rxfh_context_key(ctx), rxfh->key,
+ data->hkey_size);
+ ctx->key_configured = !!rxfh->key_size;
+ }
+ if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE)
+ ctx->hfunc = rxfh->hfunc;
+ if (rxfh->input_xfrm != RXH_XFRM_NO_CHANGE)
+ ctx->input_xfrm = rxfh->input_xfrm;
+}
+
+static int
+ethnl_rss_set(struct ethnl_req_info *req_info, struct genl_info *info)
+{ /* .set hook: apply an RSS change; returns -errno, else 1 if anything was modified and 0 if not */
+ bool indir_reset = false, indir_mod, xfrm_sym = false;
+ struct rss_req_info *request = RSS_REQINFO(req_info);
+ struct ethtool_rxfh_context *ctx = NULL;
+ struct net_device *dev = req_info->dev;
+ bool mod = false, fields_mod = false;
+ struct ethtool_rxfh_param rxfh = {};
+ struct nlattr **tb = info->attrs;
+ struct rss_reply_data data = {};
+ const struct ethtool_ops *ops;
+ int ret;
+
+ ops = dev->ethtool_ops;
+ data.base.dev = dev;
+ /* @data holds the current configuration that the request is compared against */
+ ret = rss_prepare(request, dev, &data, info);
+ if (ret)
+ return ret;
+
+ rxfh.rss_context = request->rss_context;
+
+ ret = rss_set_prep_indir(dev, info, &data, &rxfh, &indir_reset, &mod);
+ if (ret)
+ goto exit_clean_data;
+ indir_mod = !!tb[ETHTOOL_A_RSS_INDIR];
+
+ rxfh.hfunc = data.hfunc;
+ ethnl_update_u8(&rxfh.hfunc, tb[ETHTOOL_A_RSS_HFUNC], &mod);
+ if (rxfh.hfunc == data.hfunc)
+ rxfh.hfunc = ETH_RSS_HASH_NO_CHANGE;
+
+ ret = rss_set_prep_hkey(dev, info, &data, &rxfh, &mod);
+ if (ret)
+ goto exit_free_indir;
+
+ rxfh.input_xfrm = data.input_xfrm;
+ ethnl_update_u8(&rxfh.input_xfrm, tb[ETHTOOL_A_RSS_INPUT_XFRM], &mod);
+ /* For drivers which don't support input_xfrm it will be set to 0xff
+ * in the RSS context info. In all other case input_xfrm != 0 means
+ * symmetric hashing is requested.
+ */
+ if (!request->rss_context || ops->rxfh_per_ctx_key)
+ xfrm_sym = rxfh.input_xfrm || data.input_xfrm;
+ if (rxfh.input_xfrm == data.input_xfrm)
+ rxfh.input_xfrm = RXH_XFRM_NO_CHANGE;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ if (request->rss_context) {
+ ctx = xa_load(&dev->ethtool->rss_ctx, request->rss_context);
+ if (!ctx) {
+ ret = -ENOENT;
+ goto exit_unlock;
+ }
+ }
+ /* hash fields go to the driver first; their changes are tracked in fields_mod, not mod */
+ ret = ethnl_set_rss_fields(dev, info, request->rss_context,
+ &data, xfrm_sym, &fields_mod);
+ if (ret)
+ goto exit_unlock;
+
+ if (!mod)
+ ret = 0; /* nothing to tell the driver */
+ else if (!ops->set_rxfh)
+ ret = -EOPNOTSUPP;
+ else if (!rxfh.rss_context)
+ ret = ops->set_rxfh(dev, &rxfh, info->extack);
+ else
+ ret = ops->modify_rxfh_context(dev, ctx, &rxfh, info->extack);
+ if (ret)
+ goto exit_unlock;
+ /* driver accepted: mirror the change into the stored context, or update IFF_RXFH_CONFIGURED for context 0 */
+ if (ctx)
+ rss_set_ctx_update(ctx, tb, &data, &rxfh);
+ else if (indir_reset)
+ dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+ else if (indir_mod)
+ dev->priv_flags |= IFF_RXFH_CONFIGURED;
+
+exit_unlock:
+ mutex_unlock(&dev->ethtool->rss_lock);
+ kfree(rxfh.key);
+exit_free_indir:
+ kfree(rxfh.indir);
+exit_clean_data:
+ rss_cleanup_data(&data.base);
+
+ return ret ?: mod || fields_mod;
+}
+
+const struct ethnl_request_ops ethnl_rss_request_ops = { /* RSS_GET / RSS set / RSS_NTF handlers for the ethnl core */
+ .request_cmd = ETHTOOL_MSG_RSS_GET,
+ .reply_cmd = ETHTOOL_MSG_RSS_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_RSS_HEADER,
+ .req_info_size = sizeof(struct rss_req_info),
+ .reply_data_size = sizeof(struct rss_reply_data),
+
+ .parse_request = rss_parse_request,
+ .prepare_data = rss_prepare_data,
+ .reply_size = rss_reply_size,
+ .fill_reply = rss_fill_reply,
+ .cleanup_data = rss_cleanup_data,
+
+ .set_validate = ethnl_rss_set_validate,
+ .set = ethnl_rss_set,
+ .set_ntf_cmd = ETHTOOL_MSG_RSS_NTF,
+};
+
+/* RSS_CREATE */
+
+const struct nla_policy ethnl_rss_create_policy[ETHTOOL_A_RSS_INPUT_XFRM + 1] = { /* attribute policy, named for the RSS create request */
+ [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_RSS_CONTEXT] = NLA_POLICY_MIN(NLA_U32, 1), /* optional: ethnl_rss_create_doit() allocates an ID when absent */
+ [ETHTOOL_A_RSS_HFUNC] = NLA_POLICY_MIN(NLA_U32, 1),
+ [ETHTOOL_A_RSS_INDIR] = NLA_POLICY_MIN(NLA_BINARY, 1),
+ [ETHTOOL_A_RSS_HKEY] = NLA_POLICY_MIN(NLA_BINARY, 1),
+ [ETHTOOL_A_RSS_INPUT_XFRM] =
+ NLA_POLICY_MAX(NLA_U32, RXH_XFRM_SYM_OR_XOR),
+};
+
+static int
+ethnl_rss_create_validate(struct net_device *dev, struct genl_info *info)
+{ /* Check driver capabilities for a create request; 0, -EOPNOTSUPP or -ERANGE */
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct nlattr **tb = info->attrs;
+ struct nlattr *bad_attr = NULL;
+ u32 rss_context, input_xfrm;
+
+ if (!ops->create_rxfh_context)
+ return -EOPNOTSUPP;
+
+ rss_context = nla_get_u32_default(tb[ETHTOOL_A_RSS_CONTEXT], 0);
+ if (ops->rxfh_max_num_contexts &&
+ ops->rxfh_max_num_contexts <= rss_context) { /* a zero rxfh_max_num_contexts disables this check */
+ NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_CONTEXT]);
+ return -ERANGE;
+ }
+
+ if (!ops->rxfh_per_ctx_key) { /* first present key-related attr is reported as unsupported */
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_HFUNC];
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_HKEY];
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_INPUT_XFRM];
+ }
+
+ input_xfrm = nla_get_u32_default(tb[ETHTOOL_A_RSS_INPUT_XFRM], 0);
+ if (input_xfrm & ~ops->supported_input_xfrm)
+ bad_attr = bad_attr ?: tb[ETHTOOL_A_RSS_INPUT_XFRM];
+
+ if (bad_attr) {
+ NL_SET_BAD_ATTR(info->extack, bad_attr);
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static void
+ethnl_rss_create_send_ntf(struct sk_buff *rsp, struct net_device *dev)
+{ /* Rewrite the headers of the create reply in @rsp and multicast it as ETHTOOL_MSG_RSS_CREATE_NTF */
+ struct nlmsghdr *nlh = (void *)rsp->data;
+ struct genlmsghdr *genl_hdr;
+
+ /* Convert the reply into a notification */
+ nlh->nlmsg_pid = 0;
+ nlh->nlmsg_seq = ethnl_bcast_seq_next();
+
+ genl_hdr = nlmsg_data(nlh);
+ genl_hdr->cmd = ETHTOOL_MSG_RSS_CREATE_NTF;
+
+ ethnl_multicast(rsp, dev); /* result ignored: the notification is best effort */
+}
+
+int ethnl_rss_create_doit(struct sk_buff *skb, struct genl_info *info)
+{ /* doit handler: create an RSS context, reply with its config and multicast a CREATE_NTF; 0 or -errno */
+ bool indir_dflt = false, mod = false, ntf_fail = false;
+ struct ethtool_rxfh_param rxfh = {};
+ struct ethtool_rxfh_context *ctx;
+ struct nlattr **tb = info->attrs;
+ struct rss_reply_data data = {};
+ const struct ethtool_ops *ops;
+ struct rss_req_info req = {};
+ struct net_device *dev;
+ struct sk_buff *rsp;
+ void *hdr;
+ u32 limit;
+ int ret;
+ /* the reply skb is allocated before anything else is done */
+ rsp = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ ret = ethnl_parse_header_dev_get(&req.base, tb[ETHTOOL_A_RSS_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ goto exit_free_rsp;
+
+ dev = req.base.dev;
+ ops = dev->ethtool_ops;
+
+ req.rss_context = nla_get_u32_default(tb[ETHTOOL_A_RSS_CONTEXT], 0); /* 0 = let the xarray pick an ID below */
+
+ ret = ethnl_rss_create_validate(dev, info);
+ if (ret)
+ goto exit_free_dev;
+
+ rtnl_lock();
+ netdev_lock_ops(dev);
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto exit_dev_unlock;
+
+ ret = rss_get_data_alloc(dev, &data);
+ if (ret)
+ goto exit_ops;
+
+ ret = rss_set_prep_indir(dev, info, &data, &rxfh, &indir_dflt, &mod);
+ if (ret)
+ goto exit_clean_data;
+
+ ethnl_update_u8(&rxfh.hfunc, tb[ETHTOOL_A_RSS_HFUNC], &mod);
+
+ ret = rss_set_prep_hkey(dev, info, &data, &rxfh, &mod);
+ if (ret)
+ goto exit_free_indir;
+
+ rxfh.input_xfrm = RXH_XFRM_NO_CHANGE;
+ ethnl_update_u8(&rxfh.input_xfrm, tb[ETHTOOL_A_RSS_INPUT_XFRM], &mod);
+
+ ctx = ethtool_rxfh_ctx_alloc(ops, data.indir_size, data.hkey_size);
+ if (!ctx) {
+ ret = -ENOMEM;
+ goto exit_free_hkey;
+ }
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ if (!req.rss_context) { /* no ID requested: allocate one in [1, limit - 1] */
+ limit = ops->rxfh_max_num_contexts ?: U32_MAX;
+ ret = xa_alloc(&dev->ethtool->rss_ctx, &req.rss_context, ctx,
+ XA_LIMIT(1, limit - 1), GFP_KERNEL_ACCOUNT);
+ } else { /* caller chose the ID: insert at exactly that index */
+ ret = xa_insert(&dev->ethtool->rss_ctx,
+ req.rss_context, ctx, GFP_KERNEL_ACCOUNT);
+ }
+ if (ret < 0) {
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_RSS_CONTEXT],
+ "error allocating context ID");
+ goto err_unlock_free_ctx;
+ }
+ rxfh.rss_context = req.rss_context;
+
+ ret = ops->create_rxfh_context(dev, ctx, &rxfh, info->extack);
+ if (ret)
+ goto err_ctx_id_free;
+
+ /* Make sure driver populates defaults */
+ WARN_ON_ONCE(!rxfh.key && ops->rxfh_per_ctx_key &&
+ !memchr_inv(ethtool_rxfh_context_key(ctx), 0,
+ ctx->key_size));
+
+ /* Store the config from rxfh to Xarray.. */
+ rss_set_ctx_update(ctx, tb, &data, &rxfh);
+ /* .. copy from Xarray to data. */
+ __rss_prepare_ctx(dev, &data, ctx);
+
+ hdr = ethnl_unicast_put(rsp, info->snd_portid, info->snd_seq,
+ ETHTOOL_MSG_RSS_CREATE_ACT_REPLY);
+ ntf_fail = ethnl_fill_reply_header(rsp, dev, ETHTOOL_A_RSS_HEADER);
+ ntf_fail |= rss_fill_reply(rsp, &req.base, &data.base);
+ if (WARN_ON(!hdr || ntf_fail)) { /* the context created above is not torn down on this path */
+ ret = -EMSGSIZE;
+ goto exit_unlock;
+ }
+
+ genlmsg_end(rsp, hdr);
+
+ /* Use the same skb for the response and the notification,
+ * genlmsg_reply() will copy the skb if it has elevated user count.
+ */
+ skb_get(rsp);
+ ret = genlmsg_reply(rsp, info);
+ ethnl_rss_create_send_ntf(rsp, dev);
+ rsp = NULL; /* both references are handed off; keeps nlmsg_free() below from touching the skb */
+
+exit_unlock:
+ mutex_unlock(&dev->ethtool->rss_lock);
+exit_free_hkey:
+ kfree(rxfh.key);
+exit_free_indir:
+ kfree(rxfh.indir);
+exit_clean_data:
+ rss_get_data_free(&data);
+exit_ops:
+ ethnl_ops_complete(dev);
+exit_dev_unlock:
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+exit_free_dev:
+ ethnl_parse_header_dev_put(&req.base);
+exit_free_rsp:
+ nlmsg_free(rsp);
+ return ret;
+
+err_ctx_id_free:
+ xa_erase(&dev->ethtool->rss_ctx, req.rss_context);
+err_unlock_free_ctx:
+ kfree(ctx);
+ goto exit_unlock; /* rss_lock is held on both error entries, so rejoin the common unwind */
+}
+
+/* RSS_DELETE */
+
+const struct nla_policy ethnl_rss_delete_policy[ETHTOOL_A_RSS_CONTEXT + 1] = { /* attribute policy, named for the RSS delete request */
+ [ETHTOOL_A_RSS_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_RSS_CONTEXT] = NLA_POLICY_MIN(NLA_U32, 1), /* minimum 1: context 0 is rejected by policy */
+};
+
+int ethnl_rss_delete_doit(struct sk_buff *skb, struct genl_info *info)
+{ /* doit handler: remove RSS context ETHTOOL_A_RSS_CONTEXT from the device and notify; 0 or -errno */
+ struct ethtool_rxfh_context *ctx;
+ struct nlattr **tb = info->attrs;
+ struct ethnl_req_info req = {};
+ const struct ethtool_ops *ops;
+ struct net_device *dev;
+ u32 rss_context;
+ int ret;
+
+ if (GENL_REQ_ATTR_CHECK(info, ETHTOOL_A_RSS_CONTEXT))
+ return -EINVAL;
+ rss_context = nla_get_u32(tb[ETHTOOL_A_RSS_CONTEXT]);
+
+ ret = ethnl_parse_header_dev_get(&req, tb[ETHTOOL_A_RSS_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+
+ dev = req.dev;
+ ops = dev->ethtool_ops;
+ ret = -EOPNOTSUPP; /* drivers without context support must fail, not report success */
+ if (!ops->create_rxfh_context)
+ goto exit_free_dev;
+
+ rtnl_lock();
+ netdev_lock_ops(dev);
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ goto exit_dev_unlock;
+
+ mutex_lock(&dev->ethtool->rss_lock);
+ ret = ethtool_check_rss_ctx_busy(dev, rss_context);
+ if (ret)
+ goto exit_unlock;
+
+ ctx = xa_load(&dev->ethtool->rss_ctx, rss_context);
+ if (!ctx) {
+ ret = -ENOENT;
+ goto exit_unlock;
+ }
+
+ ret = ops->remove_rxfh_context(dev, ctx, rss_context, info->extack);
+ if (ret)
+ goto exit_unlock;
+ /* driver released the context: drop it from the xarray and free the stored copy */
+ WARN_ON(xa_erase(&dev->ethtool->rss_ctx, rss_context) != ctx);
+ kfree(ctx);
+
+ ethnl_rss_delete_notify(dev, rss_context);
+
+exit_unlock:
+ mutex_unlock(&dev->ethtool->rss_lock);
+ ethnl_ops_complete(dev);
+exit_dev_unlock:
+ netdev_unlock_ops(dev);
+ rtnl_unlock();
+exit_free_dev:
+ ethnl_parse_header_dev_put(&req);
+ return ret;
+}
diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c
new file mode 100644
index 000000000000..3ca8eb2a3b31
--- /dev/null
+++ b/net/ethtool/stats.c
@@ -0,0 +1,623 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/phy.h>
+#include <linux/phylib_stubs.h>
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+
+struct stats_req_info { /* parsed stats request, filled by stats_parse_request() */
+ struct ethnl_req_info base;
+ DECLARE_BITMAP(stat_mask, __ETHTOOL_STATS_CNT); /* requested groups, one bit per ETHTOOL_STATS_* */
+ enum ethtool_mac_stats_src src; /* ETHTOOL_A_STATS_SRC, or ETHTOOL_MAC_STATS_SRC_AGGREGATE when absent */
+};
+
+#define STATS_REQINFO(__req_base) \
+ container_of(__req_base, struct stats_req_info, base) /* ethnl_req_info pointer -> enclosing stats_req_info */
+
+struct stats_reply_data { /* statistics gathered by stats_prepare_data() */
+ struct ethnl_reply_data base;
+ struct_group(stats, /* grouped so stats_prepare_data() can memset() all counters in one call */
+ struct ethtool_eth_phy_stats phy_stats;
+ struct ethtool_eth_mac_stats mac_stats;
+ struct ethtool_eth_ctrl_stats ctrl_stats;
+ struct ethtool_rmon_stats rmon_stats;
+ struct ethtool_phy_stats phydev_stats;
+ );
+ const struct ethtool_rmon_hist_range *rmon_ranges; /* passed to ->get_rmon_stats() for the driver to fill in */
+};
+
+#define STATS_REPDATA(__reply_base) \
+ container_of(__reply_base, struct stats_reply_data, base) /* ethnl_reply_data pointer -> enclosing stats_reply_data */
+
+const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = { /* group names, used as bit names by stats_parse_request() */
+ [ETHTOOL_STATS_ETH_PHY] = "eth-phy",
+ [ETHTOOL_STATS_ETH_MAC] = "eth-mac",
+ [ETHTOOL_STATS_ETH_CTRL] = "eth-ctrl",
+ [ETHTOOL_STATS_RMON] = "rmon",
+ [ETHTOOL_STATS_PHY] = "phydev",
+};
+
+const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = { /* names indexed by ETHTOOL_A_STATS_ETH_PHY_* */
+ [ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR] = "SymbolErrorDuringCarrier",
+};
+
+const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN] = { /* names indexed by ETHTOOL_A_STATS_ETH_MAC_* */
+ [ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT] = "FramesTransmittedOK",
+ [ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL] = "SingleCollisionFrames",
+ [ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL] = "MultipleCollisionFrames",
+ [ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT] = "FramesReceivedOK",
+ [ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR] = "FrameCheckSequenceErrors",
+ [ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR] = "AlignmentErrors",
+ [ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES] = "OctetsTransmittedOK",
+ [ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER] = "FramesWithDeferredXmissions",
+ [ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL] = "LateCollisions",
+ [ETHTOOL_A_STATS_ETH_MAC_11_XS_COL] = "FramesAbortedDueToXSColls",
+ [ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR] = "FramesLostDueToIntMACXmitError",
+ [ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR] = "CarrierSenseErrors",
+ [ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES] = "OctetsReceivedOK",
+ [ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR] = "FramesLostDueToIntMACRcvError",
+ [ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST] = "MulticastFramesXmittedOK",
+ [ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST] = "BroadcastFramesXmittedOK",
+ [ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER] = "FramesWithExcessiveDeferral",
+ [ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST] = "MulticastFramesReceivedOK",
+ [ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST] = "BroadcastFramesReceivedOK",
+ [ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR] = "InRangeLengthErrors",
+ [ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN] = "OutOfRangeLengthField",
+ [ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR] = "FrameTooLongErrors",
+};
+
+const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN] = { /* names indexed by ETHTOOL_A_STATS_ETH_CTRL_* */
+ [ETHTOOL_A_STATS_ETH_CTRL_3_TX] = "MACControlFramesTransmitted",
+ [ETHTOOL_A_STATS_ETH_CTRL_4_RX] = "MACControlFramesReceived",
+ [ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP] = "UnsupportedOpcodesReceived",
+};
+
+const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN] = { /* names indexed by ETHTOOL_A_STATS_RMON_* */
+ [ETHTOOL_A_STATS_RMON_UNDERSIZE] = "etherStatsUndersizePkts",
+ [ETHTOOL_A_STATS_RMON_OVERSIZE] = "etherStatsOversizePkts",
+ [ETHTOOL_A_STATS_RMON_FRAG] = "etherStatsFragments",
+ [ETHTOOL_A_STATS_RMON_JABBER] = "etherStatsJabbers",
+};
+
+const char stats_phy_names[__ETHTOOL_A_STATS_PHY_CNT][ETH_GSTRING_LEN] = { /* names indexed by ETHTOOL_A_STATS_PHY_* */
+ [ETHTOOL_A_STATS_PHY_RX_PKTS] = "RxFrames",
+ [ETHTOOL_A_STATS_PHY_RX_BYTES] = "RxOctets",
+ [ETHTOOL_A_STATS_PHY_RX_ERRORS] = "RxErrors",
+ [ETHTOOL_A_STATS_PHY_TX_PKTS] = "TxFrames",
+ [ETHTOOL_A_STATS_PHY_TX_BYTES] = "TxOctets",
+ [ETHTOOL_A_STATS_PHY_TX_ERRORS] = "TxErrors",
+};
+
+const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1] = { /* attribute policy, named for the stats get request */
+ [ETHTOOL_A_STATS_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_STATS_GROUPS] = { .type = NLA_NESTED }, /* parsed as a bitset by stats_parse_request() */
+ [ETHTOOL_A_STATS_SRC] =
+ NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC),
+};
+
+static int stats_parse_request(struct ethnl_req_info *req_base,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{ /* Fill stat_mask and src of the request from @tb; 0, bitset parse error, or -EINVAL if no group was set */
+ enum ethtool_mac_stats_src src = ETHTOOL_MAC_STATS_SRC_AGGREGATE;
+ struct stats_req_info *req_info = STATS_REQINFO(req_base);
+ bool mod = false;
+ int err;
+
+ err = ethnl_update_bitset(req_info->stat_mask, __ETHTOOL_STATS_CNT,
+ tb[ETHTOOL_A_STATS_GROUPS], stats_std_names,
+ extack, &mod);
+ if (err)
+ return err;
+
+ if (!mod) { /* the bitset update changed nothing, so no group was selected */
+ NL_SET_ERR_MSG(extack, "no stats requested");
+ return -EINVAL;
+ }
+
+ if (tb[ETHTOOL_A_STATS_SRC])
+ src = nla_get_u32(tb[ETHTOOL_A_STATS_SRC]);
+
+ req_info->src = src;
+
+ return 0;
+}
+
+static int stats_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{ /* Collect the requested statistics groups from the PHY and the driver ops into the reply data; 0 or -errno */
+ const struct stats_req_info *req_info = STATS_REQINFO(req_base);
+ struct stats_reply_data *data = STATS_REPDATA(reply_base);
+ enum ethtool_mac_stats_src src = req_info->src;
+ struct net_device *dev = reply_base->dev;
+ struct nlattr **tb = info->attrs;
+ struct phy_device *phydev;
+ int ret;
+
+ phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_STATS_HEADER,
+ info->extack);
+ if (IS_ERR(phydev))
+ return PTR_ERR(phydev);
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ /* eMAC/pMAC sources are only accepted when the device reports MAC merge support */
+ if ((src == ETHTOOL_MAC_STATS_SRC_EMAC ||
+ src == ETHTOOL_MAC_STATS_SRC_PMAC) &&
+ !__ethtool_dev_mm_supported(dev)) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Device does not support MAC merge layer");
+ ethnl_ops_complete(dev);
+ return -EOPNOTSUPP;
+ }
+
+ /* Mark all stats as unset (see ETHTOOL_STAT_NOT_SET) to prevent them
+ * from being reported to user space in case driver did not set them.
+ */
+ memset(&data->stats, 0xff, sizeof(data->stats));
+ /* the memset also overwrote the src fields, set them again */
+ data->phy_stats.src = src;
+ data->mac_stats.src = src;
+ data->ctrl_stats.src = src;
+ data->rmon_stats.src = src;
+
+ if ((test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask) ||
+ test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) &&
+ src == ETHTOOL_MAC_STATS_SRC_AGGREGATE) {
+ if (phydev) /* phydev may be NULL here, PHY stats are then skipped */
+ phy_ethtool_get_phy_stats(phydev, &data->phy_stats,
+ &data->phydev_stats);
+ }
+ /* each group below is filled only if requested and the driver implements the op */
+ if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) &&
+ dev->ethtool_ops->get_eth_phy_stats)
+ dev->ethtool_ops->get_eth_phy_stats(dev, &data->phy_stats);
+ if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask) &&
+ dev->ethtool_ops->get_eth_mac_stats)
+ dev->ethtool_ops->get_eth_mac_stats(dev, &data->mac_stats);
+ if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask) &&
+ dev->ethtool_ops->get_eth_ctrl_stats)
+ dev->ethtool_ops->get_eth_ctrl_stats(dev, &data->ctrl_stats);
+ if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask) &&
+ dev->ethtool_ops->get_rmon_stats)
+ dev->ethtool_ops->get_rmon_stats(dev, &data->rmon_stats,
+ &data->rmon_ranges);
+
+ ethnl_ops_complete(dev);
+ return 0;
+}
+
+static int stats_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{ /* Upper bound of the reply payload: assumes every u64 slot of each requested group is reported */
+ const struct stats_req_info *req_info = STATS_REQINFO(req_base);
+ unsigned int n_grps = 0, n_stats = 0;
+ int len = 0;
+
+ len += nla_total_size(sizeof(u32)); /* _STATS_SRC */
+ /* n_stats counts u64-sized slots of each requested struct, n_grps the requested groups */
+ if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) {
+ n_stats += sizeof(struct ethtool_eth_phy_stats) / sizeof(u64);
+ n_grps++;
+ }
+ if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask)) {
+ n_stats += sizeof(struct ethtool_eth_mac_stats) / sizeof(u64);
+ n_grps++;
+ }
+ if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask)) {
+ n_stats += sizeof(struct ethtool_eth_ctrl_stats) / sizeof(u64);
+ n_grps++;
+ }
+ if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) {
+ n_stats += sizeof(struct ethtool_rmon_stats) / sizeof(u64);
+ n_grps++;
+ /* Above includes the space for _A_STATS_GRP_HIST_VALs */
+
+ len += (nla_total_size(0) + /* _A_STATS_GRP_HIST */
+ nla_total_size(4) + /* _A_STATS_GRP_HIST_BKT_LOW */
+ nla_total_size(4)) * /* _A_STATS_GRP_HIST_BKT_HI */
+ ETHTOOL_RMON_HIST_MAX * 2;
+ }
+ if (test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask)) {
+ n_stats += sizeof(struct ethtool_phy_stats) / sizeof(u64);
+ n_grps++;
+ }
+
+ len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */
+ nla_total_size(4) + /* _A_STATS_GRP_ID */
+ nla_total_size(4)); /* _A_STATS_GRP_SS_ID */
+ len += n_stats * (nla_total_size(0) + /* _A_STATS_GRP_STAT */
+ nla_total_size_64bit(sizeof(u64)));
+
+ return len;
+}
+
+static int stat_put(struct sk_buff *skb, u16 attrtype, u64 val)
+{ /* Emit @val as attribute @attrtype inside an ETHTOOL_A_STATS_GRP_STAT nest; 0 or -errno */
+ struct nlattr *nest;
+ int ret;
+
+ if (val == ETHTOOL_STAT_NOT_SET)
+ return 0; /* unset counters are silently omitted from the message */
+
+ /* We want to start stats attr types from 0, so we don't have a type
+ * for pad inside ETHTOOL_A_STATS_GRP_STAT. Pad things on the outside
+ * of ETHTOOL_A_STATS_GRP_STAT. Since we're one nest away from the
+ * actual attr we're 4B off - nla_need_padding_for_64bit() & co.
+ * can't be used.
+ */
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+ if (!IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8))
+ if (!nla_reserve(skb, ETHTOOL_A_STATS_GRP_PAD, 0))
+ return -EMSGSIZE;
+#endif
+
+ nest = nla_nest_start(skb, ETHTOOL_A_STATS_GRP_STAT);
+ if (!nest)
+ return -EMSGSIZE;
+
+ ret = nla_put_u64_64bit(skb, attrtype, val, -1 /* not used */);
+ if (ret) {
+ nla_nest_cancel(skb, nest);
+ return ret;
+ }
+
+ nla_nest_end(skb, nest);
+ return 0;
+}
+
+/* Fill the ETH_PHY statistics group (a single counter).
+ * Returns 0 on success, -EMSGSIZE if the counter could not be added.
+ */
+static int stats_put_phy_stats(struct sk_buff *skb,
+			       const struct stats_reply_data *data)
+{
+	int err = stat_put(skb, ETHTOOL_A_STATS_ETH_PHY_5_SYM_ERR,
+			   data->phy_stats.SymbolErrorDuringCarrier);
+
+	return err ? -EMSGSIZE : 0;
+}
+
+/* Fill the PHY device statistics group: RX then TX packets, bytes, errors.
+ * Stops at the first counter that cannot be added and returns -EMSGSIZE;
+ * returns 0 when every counter was emitted (or skipped as not set).
+ */
+static int stats_put_phydev_stats(struct sk_buff *skb,
+				  const struct stats_reply_data *data)
+{
+	if (stat_put(skb, ETHTOOL_A_STATS_PHY_RX_PKTS,
+		     data->phydev_stats.rx_packets))
+		return -EMSGSIZE;
+	if (stat_put(skb, ETHTOOL_A_STATS_PHY_RX_BYTES,
+		     data->phydev_stats.rx_bytes))
+		return -EMSGSIZE;
+	if (stat_put(skb, ETHTOOL_A_STATS_PHY_RX_ERRORS,
+		     data->phydev_stats.rx_errors))
+		return -EMSGSIZE;
+	if (stat_put(skb, ETHTOOL_A_STATS_PHY_TX_PKTS,
+		     data->phydev_stats.tx_packets))
+		return -EMSGSIZE;
+	if (stat_put(skb, ETHTOOL_A_STATS_PHY_TX_BYTES,
+		     data->phydev_stats.tx_bytes))
+		return -EMSGSIZE;
+	if (stat_put(skb, ETHTOOL_A_STATS_PHY_TX_ERRORS,
+		     data->phydev_stats.tx_errors))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+/* Fill the ETH_MAC statistics group.
+ * Counters are emitted in attribute order; evaluation short-circuits at the
+ * first failure. Returns 0 on success, -EMSGSIZE otherwise.
+ */
+static int stats_put_mac_stats(struct sk_buff *skb,
+			       const struct stats_reply_data *data)
+{
+	bool failed;
+
+	failed = stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_2_TX_PKT,
+			  data->mac_stats.FramesTransmittedOK) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_3_SINGLE_COL,
+			  data->mac_stats.SingleCollisionFrames) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_4_MULTI_COL,
+			  data->mac_stats.MultipleCollisionFrames) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_5_RX_PKT,
+			  data->mac_stats.FramesReceivedOK) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_6_FCS_ERR,
+			  data->mac_stats.FrameCheckSequenceErrors) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_7_ALIGN_ERR,
+			  data->mac_stats.AlignmentErrors) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_8_TX_BYTES,
+			  data->mac_stats.OctetsTransmittedOK) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_9_TX_DEFER,
+			  data->mac_stats.FramesWithDeferredXmissions) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_10_LATE_COL,
+			  data->mac_stats.LateCollisions) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_11_XS_COL,
+			  data->mac_stats.FramesAbortedDueToXSColls) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_12_TX_INT_ERR,
+			  data->mac_stats.FramesLostDueToIntMACXmitError) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_13_CS_ERR,
+			  data->mac_stats.CarrierSenseErrors) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_14_RX_BYTES,
+			  data->mac_stats.OctetsReceivedOK) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_15_RX_INT_ERR,
+			  data->mac_stats.FramesLostDueToIntMACRcvError) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_18_TX_MCAST,
+			  data->mac_stats.MulticastFramesXmittedOK) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_19_TX_BCAST,
+			  data->mac_stats.BroadcastFramesXmittedOK) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_20_XS_DEFER,
+			  data->mac_stats.FramesWithExcessiveDeferral) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_21_RX_MCAST,
+			  data->mac_stats.MulticastFramesReceivedOK) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_22_RX_BCAST,
+			  data->mac_stats.BroadcastFramesReceivedOK) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_23_IR_LEN_ERR,
+			  data->mac_stats.InRangeLengthErrors) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_24_OOR_LEN,
+			  data->mac_stats.OutOfRangeLengthField) ||
+		 stat_put(skb, ETHTOOL_A_STATS_ETH_MAC_25_TOO_LONG_ERR,
+			  data->mac_stats.FrameTooLongErrors);
+
+	return failed ? -EMSGSIZE : 0;
+}
+
+/* Fill the ETH_CTRL (MAC control frame) statistics group.
+ * Returns 0 on success, -EMSGSIZE as soon as one counter cannot be added.
+ */
+static int stats_put_ctrl_stats(struct sk_buff *skb,
+				const struct stats_reply_data *data)
+{
+	if (stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_3_TX,
+		     data->ctrl_stats.MACControlFramesTransmitted))
+		return -EMSGSIZE;
+	if (stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_4_RX,
+		     data->ctrl_stats.MACControlFramesReceived))
+		return -EMSGSIZE;
+	if (stat_put(skb, ETHTOOL_A_STATS_ETH_CTRL_5_RX_UNSUP,
+		     data->ctrl_stats.UnsupportedOpcodesReceived))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+/* Emit one nest of type @attr per populated RMON histogram bucket.
+ *
+ * @skb: message being filled
+ * @attr: nest attribute type used for every bucket
+ * @hist: bucket counters, indexed like @ranges
+ * @ranges: bucket boundaries; may be NULL, in which case nothing is emitted
+ *
+ * Returns 0 on success, -EMSGSIZE if a bucket does not fit (the partially
+ * written bucket nest is cancelled).
+ */
+static int stats_put_rmon_hist(struct sk_buff *skb, u32 attr, const u64 *hist,
+			       const struct ethtool_rmon_hist_range *ranges)
+{
+	const struct ethtool_rmon_hist_range *range;
+	struct nlattr *bucket;
+	int idx;
+
+	if (!ranges)
+		return 0;
+
+	for (idx = 0; idx < ETHTOOL_RMON_HIST_MAX; idx++) {
+		range = &ranges[idx];
+
+		/* An all-zero range ends the table early */
+		if (!range->low && !range->high)
+			break;
+		/* Buckets without a value are skipped */
+		if (hist[idx] == ETHTOOL_STAT_NOT_SET)
+			continue;
+
+		bucket = nla_nest_start(skb, attr);
+		if (!bucket)
+			return -EMSGSIZE;
+
+		if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_LOW,
+				range->low) ||
+		    nla_put_u32(skb, ETHTOOL_A_STATS_GRP_HIST_BKT_HI,
+				range->high) ||
+		    nla_put_u64_64bit(skb, ETHTOOL_A_STATS_GRP_HIST_VAL,
+				      hist[idx], ETHTOOL_A_STATS_GRP_PAD)) {
+			nla_nest_cancel(skb, bucket);
+			return -EMSGSIZE;
+		}
+
+		nla_nest_end(skb, bucket);
+	}
+
+	return 0;
+}
+
+/* Fill the RMON statistics group: RX histogram, TX histogram, then the
+ * scalar counters. Returns 0 on success, -EMSGSIZE on the first failure.
+ */
+static int stats_put_rmon_stats(struct sk_buff *skb,
+				const struct stats_reply_data *data)
+{
+	/* Both histograms share the same bucket ranges */
+	if (stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_RX,
+				data->rmon_stats.hist, data->rmon_ranges))
+		return -EMSGSIZE;
+	if (stats_put_rmon_hist(skb, ETHTOOL_A_STATS_GRP_HIST_TX,
+				data->rmon_stats.hist_tx, data->rmon_ranges))
+		return -EMSGSIZE;
+
+	if (stat_put(skb, ETHTOOL_A_STATS_RMON_UNDERSIZE,
+		     data->rmon_stats.undersize_pkts))
+		return -EMSGSIZE;
+	if (stat_put(skb, ETHTOOL_A_STATS_RMON_OVERSIZE,
+		     data->rmon_stats.oversize_pkts))
+		return -EMSGSIZE;
+	if (stat_put(skb, ETHTOOL_A_STATS_RMON_FRAG,
+		     data->rmon_stats.fragments))
+		return -EMSGSIZE;
+	if (stat_put(skb, ETHTOOL_A_STATS_RMON_JABBER,
+		     data->rmon_stats.jabbers))
+		return -EMSGSIZE;
+
+	return 0;
+}
+
+/* Emit one ETHTOOL_A_STATS_GRP nest.
+ *
+ * @skb: message being filled
+ * @data: collected statistics handed through to @cb
+ * @id: value for ETHTOOL_A_STATS_GRP_ID
+ * @ss_id: value for ETHTOOL_A_STATS_GRP_SS_ID
+ * @cb: group-specific filler called after the two IDs were written
+ *
+ * Returns 0 on success; on any failure the whole group nest is cancelled
+ * and -EMSGSIZE is returned.
+ */
+static int stats_put_stats(struct sk_buff *skb,
+			   const struct stats_reply_data *data,
+			   u32 id, u32 ss_id,
+			   int (*cb)(struct sk_buff *skb,
+				     const struct stats_reply_data *data))
+{
+	struct nlattr *grp = nla_nest_start(skb, ETHTOOL_A_STATS_GRP);
+
+	if (!grp)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, ETHTOOL_A_STATS_GRP_ID, id) ||
+	    nla_put_u32(skb, ETHTOOL_A_STATS_GRP_SS_ID, ss_id) ||
+	    cb(skb, data)) {
+		nla_nest_cancel(skb, grp);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, grp);
+	return 0;
+}
+
+/* Fill the STATS_GET reply: the statistics source followed by one group
+ * nest for every group selected in the request's stat_mask.
+ *
+ * Returns 0 on success, -EMSGSIZE if the source attribute does not fit, or
+ * the error of the first group that failed (later groups are not tried).
+ */
+static int stats_fill_reply(struct sk_buff *skb,
+			    const struct ethnl_req_info *req_base,
+			    const struct ethnl_reply_data *reply_base)
+{
+	const struct stats_req_info *req_info = STATS_REQINFO(req_base);
+	const struct stats_reply_data *data = STATS_REPDATA(reply_base);
+	int err;
+
+	if (nla_put_u32(skb, ETHTOOL_A_STATS_SRC, req_info->src))
+		return -EMSGSIZE;
+
+	if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) {
+		err = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_PHY,
+				      ETH_SS_STATS_ETH_PHY,
+				      stats_put_phy_stats);
+		if (err)
+			return err;
+	}
+	if (test_bit(ETHTOOL_STATS_ETH_MAC, req_info->stat_mask)) {
+		err = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_MAC,
+				      ETH_SS_STATS_ETH_MAC,
+				      stats_put_mac_stats);
+		if (err)
+			return err;
+	}
+	if (test_bit(ETHTOOL_STATS_ETH_CTRL, req_info->stat_mask)) {
+		err = stats_put_stats(skb, data, ETHTOOL_STATS_ETH_CTRL,
+				      ETH_SS_STATS_ETH_CTRL,
+				      stats_put_ctrl_stats);
+		if (err)
+			return err;
+	}
+	if (test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask)) {
+		err = stats_put_stats(skb, data, ETHTOOL_STATS_RMON,
+				      ETH_SS_STATS_RMON, stats_put_rmon_stats);
+		if (err)
+			return err;
+	}
+	if (test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask)) {
+		err = stats_put_stats(skb, data, ETHTOOL_STATS_PHY,
+				      ETH_SS_STATS_PHY, stats_put_phydev_stats);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/* Request operations for ETHTOOL_MSG_STATS_GET: message IDs, per-request
+ * buffer sizes and the parse/prepare/size/fill callbacks of this file.
+ */
+const struct ethnl_request_ops ethnl_stats_request_ops = {
+	.request_cmd		= ETHTOOL_MSG_STATS_GET,
+	.reply_cmd		= ETHTOOL_MSG_STATS_GET_REPLY,
+	.hdr_attr		= ETHTOOL_A_STATS_HEADER,
+	.req_info_size		= sizeof(struct stats_req_info),
+	.reply_data_size	= sizeof(struct stats_reply_data),
+
+	.parse_request		= stats_parse_request,
+	.prepare_data		= stats_prepare_data,
+	.reply_size		= stats_reply_size,
+	.fill_reply		= stats_fill_reply,
+};
+
+/* Add two counters, treating ETHTOOL_STAT_NOT_SET as "no value".
+ * When only one side is set that side is returned unchanged; when neither
+ * is set the result is ETHTOOL_STAT_NOT_SET.
+ */
+static u64 ethtool_stats_sum(u64 a, u64 b)
+{
+	bool a_set = a != ETHTOOL_STAT_NOT_SET;
+	bool b_set = b != ETHTOOL_STAT_NOT_SET;
+
+	if (a_set && b_set)
+		return a + b;
+
+	return a_set ? a : b;
+}
+
+/* Avoid modifying the aggregation procedure every time a new counter is added
+ * by treating the structures as an array of u64 statistics.
+ *
+ * @aggr_stats: destination structure
+ * @emac_stats: first source structure
+ * @pmac_stats: second source structure
+ * @stats_size: size in bytes of the u64 counter area
+ * @stats_offset: byte offset of the counter area inside each structure
+ *
+ * Each destination counter becomes ethtool_stats_sum() of the two matching
+ * source counters. Only the counter area of @aggr_stats is written.
+ */
+static void ethtool_aggregate_stats(void *aggr_stats, const void *emac_stats,
+				    const void *pmac_stats, size_t stats_size,
+				    size_t stats_offset)
+{
+	size_t num_stats = stats_size / sizeof(u64);
+	const u64 *s1 = emac_stats + stats_offset;
+	const u64 *s2 = pmac_stats + stats_offset;
+	u64 *s = aggr_stats + stats_offset;
+	size_t i;	/* same type as num_stats, no signed/unsigned compare */
+
+	for (i = 0; i < num_stats; i++)
+		s[i] = ethtool_stats_sum(s1[i], s2[i]);
+}
+
+/* Query the driver's MAC statistics once with the eMAC source and once with
+ * the pMAC source, then store their per-counter sum in @mac_stats.
+ * Only the counter area of @mac_stats is written.
+ */
+void ethtool_aggregate_mac_stats(struct net_device *dev,
+				 struct ethtool_eth_mac_stats *mac_stats)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_eth_mac_stats pmac, emac;
+
+	/* All-ones fill, presumably so counters the driver leaves untouched
+	 * read as ETHTOOL_STAT_NOT_SET - confirm against its definition.
+	 */
+	memset(&emac, 0xff, sizeof(emac));
+	emac.src = ETHTOOL_MAC_STATS_SRC_EMAC;
+	ops->get_eth_mac_stats(dev, &emac);
+
+	memset(&pmac, 0xff, sizeof(pmac));
+	pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC;
+	ops->get_eth_mac_stats(dev, &pmac);
+
+	ethtool_aggregate_stats(mac_stats, &emac, &pmac,
+				sizeof(mac_stats->stats),
+				offsetof(struct ethtool_eth_mac_stats, stats));
+}
+EXPORT_SYMBOL(ethtool_aggregate_mac_stats);
+
+/* Query the driver's PHY statistics once with the eMAC source and once with
+ * the pMAC source, then store their per-counter sum in @phy_stats.
+ * Only the counter area of @phy_stats is written.
+ */
+void ethtool_aggregate_phy_stats(struct net_device *dev,
+				 struct ethtool_eth_phy_stats *phy_stats)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_eth_phy_stats pmac, emac;
+
+	/* All-ones fill, presumably so counters the driver leaves untouched
+	 * read as ETHTOOL_STAT_NOT_SET - confirm against its definition.
+	 */
+	memset(&emac, 0xff, sizeof(emac));
+	emac.src = ETHTOOL_MAC_STATS_SRC_EMAC;
+	ops->get_eth_phy_stats(dev, &emac);
+
+	memset(&pmac, 0xff, sizeof(pmac));
+	pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC;
+	ops->get_eth_phy_stats(dev, &pmac);
+
+	ethtool_aggregate_stats(phy_stats, &emac, &pmac,
+				sizeof(phy_stats->stats),
+				offsetof(struct ethtool_eth_phy_stats, stats));
+}
+EXPORT_SYMBOL(ethtool_aggregate_phy_stats);
+
+/* Query the driver's MAC control statistics once with the eMAC source and
+ * once with the pMAC source, then store their per-counter sum in
+ * @ctrl_stats. Only the counter area of @ctrl_stats is written.
+ */
+void ethtool_aggregate_ctrl_stats(struct net_device *dev,
+				  struct ethtool_eth_ctrl_stats *ctrl_stats)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_eth_ctrl_stats pmac, emac;
+
+	/* All-ones fill, presumably so counters the driver leaves untouched
+	 * read as ETHTOOL_STAT_NOT_SET - confirm against its definition.
+	 */
+	memset(&emac, 0xff, sizeof(emac));
+	emac.src = ETHTOOL_MAC_STATS_SRC_EMAC;
+	ops->get_eth_ctrl_stats(dev, &emac);
+
+	memset(&pmac, 0xff, sizeof(pmac));
+	pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC;
+	ops->get_eth_ctrl_stats(dev, &pmac);
+
+	ethtool_aggregate_stats(ctrl_stats, &emac, &pmac,
+				sizeof(ctrl_stats->stats),
+				offsetof(struct ethtool_eth_ctrl_stats, stats));
+}
+EXPORT_SYMBOL(ethtool_aggregate_ctrl_stats);
+
+/* Query the driver's pause statistics once with the eMAC source and once
+ * with the pMAC source, then store their per-counter sum in @pause_stats.
+ * Only the counter area of @pause_stats is written.
+ */
+void ethtool_aggregate_pause_stats(struct net_device *dev,
+				   struct ethtool_pause_stats *pause_stats)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	struct ethtool_pause_stats pmac, emac;
+
+	/* All-ones fill, presumably so counters the driver leaves untouched
+	 * read as ETHTOOL_STAT_NOT_SET - confirm against its definition.
+	 */
+	memset(&emac, 0xff, sizeof(emac));
+	emac.src = ETHTOOL_MAC_STATS_SRC_EMAC;
+	ops->get_pause_stats(dev, &emac);
+
+	memset(&pmac, 0xff, sizeof(pmac));
+	pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC;
+	ops->get_pause_stats(dev, &pmac);
+
+	ethtool_aggregate_stats(pause_stats, &emac, &pmac,
+				sizeof(pause_stats->stats),
+				offsetof(struct ethtool_pause_stats, stats));
+}
+EXPORT_SYMBOL(ethtool_aggregate_pause_stats);
+
+/* Query the driver's RMON statistics once with the eMAC source and once
+ * with the pMAC source, then store their per-counter sum in @rmon_stats.
+ * Only the counter area of @rmon_stats is written; the histogram ranges
+ * reported by the driver are discarded.
+ */
+void ethtool_aggregate_rmon_stats(struct net_device *dev,
+				  struct ethtool_rmon_stats *rmon_stats)
+{
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	const struct ethtool_rmon_hist_range *unused_ranges;
+	struct ethtool_rmon_stats pmac, emac;
+
+	/* All-ones fill, presumably so counters the driver leaves untouched
+	 * read as ETHTOOL_STAT_NOT_SET - confirm against its definition.
+	 */
+	memset(&emac, 0xff, sizeof(emac));
+	emac.src = ETHTOOL_MAC_STATS_SRC_EMAC;
+	ops->get_rmon_stats(dev, &emac, &unused_ranges);
+
+	memset(&pmac, 0xff, sizeof(pmac));
+	pmac.src = ETHTOOL_MAC_STATS_SRC_PMAC;
+	ops->get_rmon_stats(dev, &pmac, &unused_ranges);
+
+	ethtool_aggregate_stats(rmon_stats, &emac, &pmac,
+				sizeof(rmon_stats->stats),
+				offsetof(struct ethtool_rmon_stats, stats));
+}
+EXPORT_SYMBOL(ethtool_aggregate_rmon_stats);
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
new file mode 100644
index 000000000000..f6a67109beda
--- /dev/null
+++ b/net/ethtool/strset.c
@@ -0,0 +1,499 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool.h>
+#include <linux/phy.h>
+#include "netlink.h"
+#include "common.h"
+
+/* Description of one string set as used while building a reply */
+struct strset_info {
+	/* set is retrieved from the device rather than from a static table */
+	bool per_dev;
+	/* @strings was allocated for this reply and must be kfree()d */
+	bool free_strings;
+	/* number of strings in the set */
+	unsigned int count;
+	/* array of @count fixed-size (ETH_GSTRING_LEN) strings */
+	const char (*strings)[ETH_GSTRING_LEN];
+};
+
+/* Initial contents of strset_reply_data::sets, indexed by ETH_SS_* id.
+ * Per-device entries carry no strings here; they are filled in by
+ * strset_prepare_set(). Global entries point at static name tables.
+ */
+static const struct strset_info info_template[] = {
+	[ETH_SS_TEST] = {
+		.per_dev	= true,
+	},
+	[ETH_SS_STATS] = {
+		.per_dev	= true,
+	},
+	[ETH_SS_PRIV_FLAGS] = {
+		.per_dev	= true,
+	},
+	[ETH_SS_FEATURES] = {
+		.per_dev	= false,
+		.count		= ARRAY_SIZE(netdev_features_strings),
+		.strings	= netdev_features_strings,
+	},
+	[ETH_SS_RSS_HASH_FUNCS] = {
+		.per_dev	= false,
+		.count		= ARRAY_SIZE(rss_hash_func_strings),
+		.strings	= rss_hash_func_strings,
+	},
+	[ETH_SS_TUNABLES] = {
+		.per_dev	= false,
+		.count		= ARRAY_SIZE(tunable_strings),
+		.strings	= tunable_strings,
+	},
+	[ETH_SS_PHY_STATS] = {
+		.per_dev	= true,
+	},
+	[ETH_SS_PHY_TUNABLES] = {
+		.per_dev	= false,
+		.count		= ARRAY_SIZE(phy_tunable_strings),
+		.strings	= phy_tunable_strings,
+	},
+	[ETH_SS_LINK_MODES] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_LINK_MODE_MASK_NBITS,
+		.strings	= link_mode_names,
+	},
+	[ETH_SS_MSG_CLASSES] = {
+		.per_dev	= false,
+		.count		= NETIF_MSG_CLASS_COUNT,
+		.strings	= netif_msg_class_names,
+	},
+	[ETH_SS_WOL_MODES] = {
+		.per_dev	= false,
+		.count		= WOL_MODE_COUNT,
+		.strings	= wol_mode_names,
+	},
+	[ETH_SS_SOF_TIMESTAMPING] = {
+		.per_dev	= false,
+		.count		= __SOF_TIMESTAMPING_CNT,
+		.strings	= sof_timestamping_names,
+	},
+	[ETH_SS_TS_TX_TYPES] = {
+		.per_dev	= false,
+		.count		= __HWTSTAMP_TX_CNT,
+		.strings	= ts_tx_type_names,
+	},
+	[ETH_SS_TS_RX_FILTERS] = {
+		.per_dev	= false,
+		.count		= __HWTSTAMP_FILTER_CNT,
+		.strings	= ts_rx_filter_names,
+	},
+	[ETH_SS_TS_FLAGS] = {
+		.per_dev	= false,
+		.count		= __HWTSTAMP_FLAG_CNT,
+		.strings	= ts_flags_names,
+	},
+	[ETH_SS_UDP_TUNNEL_TYPES] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_UDP_TUNNEL_TYPE_CNT,
+		.strings	= udp_tunnel_type_names,
+	},
+	[ETH_SS_STATS_STD] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_STATS_CNT,
+		.strings	= stats_std_names,
+	},
+	[ETH_SS_STATS_ETH_PHY] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_A_STATS_ETH_PHY_CNT,
+		.strings	= stats_eth_phy_names,
+	},
+	[ETH_SS_STATS_ETH_MAC] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_A_STATS_ETH_MAC_CNT,
+		.strings	= stats_eth_mac_names,
+	},
+	[ETH_SS_STATS_ETH_CTRL] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_A_STATS_ETH_CTRL_CNT,
+		.strings	= stats_eth_ctrl_names,
+	},
+	[ETH_SS_STATS_RMON] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_A_STATS_RMON_CNT,
+		.strings	= stats_rmon_names,
+	},
+	[ETH_SS_STATS_PHY] = {
+		.per_dev	= false,
+		.count		= __ETHTOOL_A_STATS_PHY_CNT,
+		.strings	= stats_phy_names,
+	},
+};
+
+/* Parsed STRSET_GET request */
+struct strset_req_info {
+	struct ethnl_req_info		base;
+	/* bitmap of explicitly requested ETH_SS_* ids; 0 means no filter */
+	u32				req_ids;
+	/* reply carries only set ids and counts, no string values */
+	bool				counts_only;
+};
+
+/* Get the enclosing strset_req_info from a pointer to its base member */
+#define STRSET_REQINFO(__req_base) \
+	container_of(__req_base, struct strset_req_info, base)
+
+/* Data collected for a STRSET_GET reply */
+struct strset_reply_data {
+	struct ethnl_reply_data		base;
+	/* one entry per string set id, initialised from info_template */
+	struct strset_info		sets[ETH_SS_COUNT];
+};
+
+/* Get the enclosing strset_reply_data from a pointer to its base member */
+#define STRSET_REPDATA(__reply_base) \
+	container_of(__reply_base, struct strset_reply_data, base)
+
+/* Top-level attribute policy of the STRSET_GET request */
+const struct nla_policy ethnl_strset_get_policy[] = {
+	[ETHTOOL_A_STRSET_HEADER]	=
+		NLA_POLICY_NESTED(ethnl_header_policy_phy),
+	[ETHTOOL_A_STRSET_STRINGSETS]	= { .type = NLA_NESTED },
+	[ETHTOOL_A_STRSET_COUNTS_ONLY]	= { .type = NLA_FLAG },
+};
+
+/* Policy for the contents of one ETHTOOL_A_STRINGSETS_STRINGSET nest */
+static const struct nla_policy get_stringset_policy[] = {
+	[ETHTOOL_A_STRINGSET_ID]	= { .type = NLA_U32 },
+};
+
+/**
+ * strset_include() - test if a string set should be included in reply
+ * @info: parsed client request
+ * @data: pointer to request data structure
+ * @id:   id of string set to check (ETH_SS_* constants)
+ *
+ * An explicit request bitmap decides on its own. Without one, global sets
+ * lacking a string table are never included; otherwise per-device sets are
+ * included for requests with a device and global sets for requests without.
+ *
+ * Return: true if the set belongs in the reply
+ */
+static bool strset_include(const struct strset_req_info *info,
+			   const struct strset_reply_data *data, u32 id)
+{
+	const struct strset_info *set;
+
+	BUILD_BUG_ON(ETH_SS_COUNT >= BITS_PER_BYTE * sizeof(info->req_ids));
+
+	if (info->req_ids)
+		return info->req_ids & (1U << id);
+
+	set = &data->sets[id];
+	if (!set->per_dev && !set->strings)
+		return false;
+
+	/* per-device sets go with a device, global sets without one */
+	return set->per_dev == !!data->base.dev;
+}
+
+/* Parse one stringset nest and extract its mandatory ETHTOOL_A_STRINGSET_ID.
+ *
+ * @nest: the nested attribute to parse
+ * @val: set to the parsed id on success
+ * @extack: extended ack for error reporting
+ *
+ * Returns 0 on success, a negative parse error, or -EINVAL when the id
+ * attribute is missing.
+ */
+static int strset_get_id(const struct nlattr *nest, u32 *val,
+			 struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[ARRAY_SIZE(get_stringset_policy)];
+	int ret;
+
+	ret = nla_parse_nested(tb, ARRAY_SIZE(get_stringset_policy) - 1, nest,
+			       get_stringset_policy, extack);
+	if (ret < 0)
+		return ret;
+	if (NL_REQ_ATTR_CHECK(extack, nest, tb, ETHTOOL_A_STRINGSET_ID))
+		return -EINVAL;
+
+	*val = nla_get_u32(tb[ETHTOOL_A_STRINGSET_ID]);
+	return 0;
+}
+
+/* Policy for the contents of the ETHTOOL_A_STRSET_STRINGSETS nest */
+static const struct nla_policy strset_stringsets_policy[] = {
+	[ETHTOOL_A_STRINGSETS_STRINGSET]	= { .type = NLA_NESTED },
+};
+
+/* Parse the STRSET_GET request attributes into strset_req_info.
+ *
+ * Collects the requested set ids into req_ids and records the COUNTS_ONLY
+ * flag. Note that when ETHTOOL_A_STRSET_STRINGSETS is absent the function
+ * returns before looking at the COUNTS_ONLY flag, so counts_only keeps the
+ * value it had on entry.
+ *
+ * Returns 0 on success, a negative validation/parse error, -EINVAL for an
+ * unexpected nested attribute type, or -EOPNOTSUPP for an unknown set id.
+ */
+static int strset_parse_request(struct ethnl_req_info *req_base,
+				struct nlattr **tb,
+				struct netlink_ext_ack *extack)
+{
+	struct strset_req_info *req_info = STRSET_REQINFO(req_base);
+	struct nlattr *nest = tb[ETHTOOL_A_STRSET_STRINGSETS];
+	struct nlattr *attr;
+	int rem, ret;
+
+	if (!nest)
+		return 0;
+	ret = nla_validate_nested(nest,
+				  ARRAY_SIZE(strset_stringsets_policy) - 1,
+				  strset_stringsets_policy, extack);
+	if (ret < 0)
+		return ret;
+
+	req_info->counts_only = tb[ETHTOOL_A_STRSET_COUNTS_ONLY];
+	nla_for_each_nested(attr, nest, rem) {
+		u32 id;
+
+		/* Validation above should already have rejected other types */
+		if (WARN_ONCE(nla_type(attr) != ETHTOOL_A_STRINGSETS_STRINGSET,
+			      "unexpected attrtype %u in ETHTOOL_A_STRSET_STRINGSETS\n",
+			      nla_type(attr)))
+			return -EINVAL;
+
+		ret = strset_get_id(attr, &id, extack);
+		if (ret < 0)
+			return ret;
+		if (id >= ETH_SS_COUNT) {
+			NL_SET_ERR_MSG_ATTR(extack, attr,
+					    "unknown string set id");
+			return -EOPNOTSUPP;
+		}
+
+		req_info->req_ids |= (1U << id);
+	}
+
+	return 0;
+}
+
+/* Release every string table that was allocated for this reply and reset
+ * the entry so that a repeated call does not free it again.
+ */
+static void strset_cleanup_data(struct ethnl_reply_data *reply_base)
+{
+	struct strset_reply_data *data = STRSET_REPDATA(reply_base);
+	struct strset_info *set;
+	unsigned int id;
+
+	for (id = 0; id < ETH_SS_COUNT; id++) {
+		set = &data->sets[id];
+		if (!set->free_strings)
+			continue;
+
+		kfree(set->strings);
+		set->strings = NULL;
+		set->free_strings = false;
+	}
+}
+
+/* Retrieve count and (unless @counts_only) strings of one per-device set.
+ *
+ * @info: entry to fill
+ * @dev: device the request targets
+ * @phydev: PHY selected by the request, may be NULL
+ * @id: string set id
+ * @counts_only: skip allocating and fetching the strings
+ *
+ * For ETH_SS_PHY_STATS with a PHY present and no driver
+ * get_ethtool_phy_stats callback, the phylib ops are used; otherwise the
+ * driver's ethtool ops. A non-positive count (including a missing callback
+ * or a callback error) yields an empty set and is not reported as failure.
+ *
+ * Returns 0 on success, -ENOMEM if the string buffer cannot be allocated.
+ */
+static int strset_prepare_set(struct strset_info *info, struct net_device *dev,
+			      struct phy_device *phydev, unsigned int id,
+			      bool counts_only)
+{
+	const struct ethtool_phy_ops *phy_ops = ethtool_phy_ops;
+	const struct ethtool_ops *ops = dev->ethtool_ops;
+	void *strings;
+	int count, ret;
+
+	if (id == ETH_SS_PHY_STATS && phydev &&
+	    !ops->get_ethtool_phy_stats && phy_ops &&
+	    phy_ops->get_sset_count)
+		ret = phy_ops->get_sset_count(phydev);
+	else if (ops->get_sset_count && ops->get_strings)
+		ret = ops->get_sset_count(dev, id);
+	else
+		ret = -EOPNOTSUPP;
+	if (ret <= 0) {
+		info->count = 0;
+		return 0;
+	}
+
+	count = ret;
+	if (!counts_only) {
+		strings = kcalloc(count, ETH_GSTRING_LEN, GFP_KERNEL);
+		if (!strings)
+			return -ENOMEM;
+		/* NOTE(review): if the count came from phy_ops but
+		 * phy_ops->get_strings is NULL, this falls back to
+		 * ops->get_strings, which was not checked on that path.
+		 */
+		if (id == ETH_SS_PHY_STATS && phydev &&
+		    !ops->get_ethtool_phy_stats && phy_ops &&
+		    phy_ops->get_strings)
+			phy_ops->get_strings(phydev, strings);
+		else
+			ops->get_strings(dev, id, strings);
+		info->strings = strings;
+		/* strset_cleanup_data() frees the buffer */
+		info->free_strings = true;
+	}
+	info->count = count;
+
+	return 0;
+}
+
+/* Collect the data for a STRSET_GET reply.
+ *
+ * Starts from info_template. Without a device only global sets can be
+ * served, so explicitly requesting a per-device set fails with -EINVAL.
+ * With a device, every included per-device set is fetched between
+ * ethnl_ops_begin() and ethnl_ops_complete().
+ *
+ * Returns 0 on success or a negative error; on errors after the PHY lookup
+ * any already allocated string tables are released.
+ */
+static int strset_prepare_data(const struct ethnl_req_info *req_base,
+			       struct ethnl_reply_data *reply_base,
+			       const struct genl_info *info)
+{
+	const struct strset_req_info *req_info = STRSET_REQINFO(req_base);
+	struct strset_reply_data *data = STRSET_REPDATA(reply_base);
+	struct net_device *dev = reply_base->dev;
+	struct nlattr **tb = info->attrs;
+	struct phy_device *phydev;
+	unsigned int i;
+	int ret;
+
+	BUILD_BUG_ON(ARRAY_SIZE(info_template) != ETH_SS_COUNT);
+	memcpy(&data->sets, &info_template, sizeof(data->sets));
+
+	if (!dev) {
+		for (i = 0; i < ETH_SS_COUNT; i++) {
+			if ((req_info->req_ids & (1U << i)) &&
+			    data->sets[i].per_dev) {
+				GENL_SET_ERR_MSG(info, "requested per device strings without dev");
+				return -EINVAL;
+			}
+		}
+		return 0;
+	}
+
+	phydev = ethnl_req_get_phydev(req_base, tb, ETHTOOL_A_HEADER_FLAGS,
+				      info->extack);
+
+	/* phydev can be NULL, check for errors only */
+	if (IS_ERR(phydev))
+		return PTR_ERR(phydev);
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		goto err_strset;
+	for (i = 0; i < ETH_SS_COUNT; i++) {
+		/* global sets are already complete from the template */
+		if (!strset_include(req_info, data, i) ||
+		    !data->sets[i].per_dev)
+			continue;
+
+		ret = strset_prepare_set(&data->sets[i], dev, phydev, i,
+					 req_info->counts_only);
+		if (ret < 0)
+			goto err_ops;
+	}
+	ethnl_ops_complete(dev);
+
+	return 0;
+err_ops:
+	ethnl_ops_complete(dev);
+err_strset:
+	strset_cleanup_data(reply_base);
+	return ret;
+}
+
+/* Size of the ETHTOOL_A_STRINGSETS_STRINGSET nest for one string set.
+ * An empty set takes no space. With @counts_only the nest holds just the
+ * id and count attributes; otherwise it also holds a nest with one entry
+ * (index + value) per string.
+ */
+static int strset_set_size(const struct strset_info *info, bool counts_only)
+{
+	unsigned int strings_len = 0;
+	unsigned int idx;
+
+	if (!info->count)
+		return 0;
+	if (counts_only)
+		return nla_total_size(2 * nla_total_size(sizeof(u32)));
+
+	/* one nest per string: ETHTOOL_A_STRING_INDEX + ETHTOOL_A_STRING_VALUE */
+	for (idx = 0; idx < info->count; idx++)
+		strings_len += nla_total_size(nla_total_size(sizeof(u32)) +
+					      ethnl_strz_size(info->strings[idx]));
+
+	/* ETHTOOL_A_STRINGSET_ID, ETHTOOL_A_STRINGSET_COUNT, strings nest */
+	return nla_total_size(2 * nla_total_size(sizeof(u32)) +
+			      nla_total_size(strings_len));
+}
+
+/* Estimate the payload size of the STRSET_GET reply: the outer
+ * ETHTOOL_A_STRSET_STRINGSETS nest plus every included string set.
+ * Returns the size, or a negative value passed up from strset_set_size().
+ */
+static int strset_reply_size(const struct ethnl_req_info *req_base,
+			     const struct ethnl_reply_data *reply_base)
+{
+	const struct strset_req_info *req_info = STRSET_REQINFO(req_base);
+	const struct strset_reply_data *data = STRSET_REPDATA(reply_base);
+	int total = nla_total_size(0);	/* ETHTOOL_A_STRSET_STRINGSETS */
+	unsigned int id;
+	int set_len;
+
+	for (id = 0; id < ETH_SS_COUNT; id++) {
+		if (!strset_include(req_info, data, id))
+			continue;
+
+		set_len = strset_set_size(&data->sets[id],
+					  req_info->counts_only);
+		if (set_len < 0)
+			return set_len;
+		total += set_len;
+	}
+
+	return total;
+}
+
+/* Put string number @idx of @set_info into the reply as an
+ * ETHTOOL_A_STRINGS_STRING nest holding its index and value.
+ * Returns 0 on success, -EMSGSIZE if it does not fit (nest is cancelled).
+ */
+static int strset_fill_string(struct sk_buff *skb,
+			      const struct strset_info *set_info, u32 idx)
+{
+	const char *text = set_info->strings[idx];
+	struct nlattr *entry;
+
+	entry = nla_nest_start(skb, ETHTOOL_A_STRINGS_STRING);
+	if (!entry)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, ETHTOOL_A_STRING_INDEX, idx) ||
+	    ethnl_put_strz(skb, ETHTOOL_A_STRING_VALUE, text)) {
+		nla_nest_cancel(skb, entry);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, entry);
+	return 0;
+}
+
+/* Put one string set into the reply.
+ *
+ * A global set without a string table yields -EOPNOTSUPP; an empty set is
+ * silently omitted. Otherwise an ETHTOOL_A_STRINGSETS_STRINGSET nest with
+ * id, count and (unless @counts_only) all strings is written.
+ *
+ * Returns 0 on success, -EMSGSIZE if the set does not fit; the set's nest
+ * is cancelled in that case.
+ */
+static int strset_fill_set(struct sk_buff *skb,
+			   const struct strset_info *set_info, u32 id,
+			   bool counts_only)
+{
+	struct nlattr *set_nest;
+	struct nlattr *strs_nest;
+	unsigned int idx;
+
+	if (!set_info->per_dev && !set_info->strings)
+		return -EOPNOTSUPP;
+	if (!set_info->count)
+		return 0;
+
+	set_nest = nla_nest_start(skb, ETHTOOL_A_STRINGSETS_STRINGSET);
+	if (!set_nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, ETHTOOL_A_STRINGSET_ID, id) ||
+	    nla_put_u32(skb, ETHTOOL_A_STRINGSET_COUNT, set_info->count))
+		goto err_cancel_set;
+
+	if (!counts_only) {
+		strs_nest = nla_nest_start(skb, ETHTOOL_A_STRINGSET_STRINGS);
+		if (!strs_nest)
+			goto err_cancel_set;
+
+		for (idx = 0; idx < set_info->count; idx++)
+			if (strset_fill_string(skb, set_info, idx) < 0)
+				goto err_cancel_set;
+
+		nla_nest_end(skb, strs_nest);
+	}
+
+	nla_nest_end(skb, set_nest);
+	return 0;
+
+err_cancel_set:
+	/* cancelling the outer nest also drops anything written inside it */
+	nla_nest_cancel(skb, set_nest);
+	return -EMSGSIZE;
+}
+
+/* Fill the STRSET_GET reply: one ETHTOOL_A_STRSET_STRINGSETS nest holding
+ * every included string set. Returns 0 on success, -EMSGSIZE if the outer
+ * nest cannot be started, or the error of the first set that failed (the
+ * outer nest is cancelled).
+ */
+static int strset_fill_reply(struct sk_buff *skb,
+			     const struct ethnl_req_info *req_base,
+			     const struct ethnl_reply_data *reply_base)
+{
+	const struct strset_req_info *req_info = STRSET_REQINFO(req_base);
+	const struct strset_reply_data *data = STRSET_REPDATA(reply_base);
+	struct nlattr *sets_nest;
+	unsigned int id;
+	int err;
+
+	sets_nest = nla_nest_start(skb, ETHTOOL_A_STRSET_STRINGSETS);
+	if (!sets_nest)
+		return -EMSGSIZE;
+
+	for (id = 0; id < ETH_SS_COUNT; id++) {
+		if (!strset_include(req_info, data, id))
+			continue;
+
+		err = strset_fill_set(skb, &data->sets[id], id,
+				      req_info->counts_only);
+		if (err < 0) {
+			nla_nest_cancel(skb, sets_nest);
+			return err;
+		}
+	}
+
+	nla_nest_end(skb, sets_nest);
+	return 0;
+}
+
+/* Request operations for ETHTOOL_MSG_STRSET_GET. allow_nodev_do is set
+ * and the handlers in this file cope with a NULL device, serving only
+ * global string sets in that case.
+ */
+const struct ethnl_request_ops ethnl_strset_request_ops = {
+	.request_cmd		= ETHTOOL_MSG_STRSET_GET,
+	.reply_cmd		= ETHTOOL_MSG_STRSET_GET_REPLY,
+	.hdr_attr		= ETHTOOL_A_STRSET_HEADER,
+	.req_info_size		= sizeof(struct strset_req_info),
+	.reply_data_size	= sizeof(struct strset_reply_data),
+	.allow_nodev_do		= true,
+
+	.parse_request		= strset_parse_request,
+	.prepare_data		= strset_prepare_data,
+	.reply_size		= strset_reply_size,
+	.fill_reply		= strset_fill_reply,
+	.cleanup_data		= strset_cleanup_data,
+};
diff --git a/net/ethtool/ts.h b/net/ethtool/ts.h
new file mode 100644
index 000000000000..d901a879a671
--- /dev/null
+++ b/net/ethtool/ts.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _NET_ETHTOOL_TS_H
+#define _NET_ETHTOOL_TS_H
+
+#include "netlink.h"
+
+/* Policy for a nested hwtstamp provider description: a u32 index and a
+ * u32 qualifier bounded by HWTSTAMP_PROVIDER_QUALIFIER_CNT - 1.
+ */
+static const struct nla_policy
+ethnl_ts_hwtst_prov_policy[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_MAX + 1] = {
+	[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX] = { .type = NLA_U32 },
+	[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER] =
+		NLA_POLICY_MAX(NLA_U32, HWTSTAMP_PROVIDER_QUALIFIER_CNT - 1)
+};
+
+/* Parse a nested hwtstamp provider attribute into @hwprov_desc.
+ * NOTE(review): the definition is not in this header; presumably a negative
+ * return means failure and *@mod reports whether the description changed -
+ * confirm against the implementation.
+ */
+int ts_parse_hwtst_provider(const struct nlattr *nest,
+			    struct hwtstamp_provider_desc *hwprov_desc,
+			    struct netlink_ext_ack *extack,
+			    bool *mod);
+
+#endif /* _NET_ETHTOOL_TS_H */
diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c
new file mode 100644
index 000000000000..169b413b31fc
--- /dev/null
+++ b/net/ethtool/tsconfig.c
@@ -0,0 +1,455 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/net_tstamp.h>
+#include <linux/ptp_clock_kernel.h>
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+#include "../core/dev.h"
+#include "ts.h"
+
+/* TSCONFIG request; carries nothing beyond the common request info */
+struct tsconfig_req_info {
+	struct ethnl_req_info base;
+};
+
+/* Data collected for a TSCONFIG reply */
+struct tsconfig_reply_data {
+	struct ethnl_reply_data		base;
+	/* active timestamp provider; a negative index is not reported */
+	struct hwtstamp_provider_desc	hwprov_desc;
+	struct {
+		/* single-bit mask, BIT() of the configured tx type */
+		u32 tx_type;
+		/* single-bit mask, BIT() of the configured rx filter */
+		u32 rx_filter;
+		/* configuration flags, copied as-is */
+		u32 flags;
+	} hwtst_config;
+};
+
+/* Get the enclosing tsconfig_reply_data from a pointer to its base member */
+#define TSCONFIG_REPDATA(__reply_base) \
+	container_of(__reply_base, struct tsconfig_reply_data, base)
+
+/* TSCONFIG_GET accepts only the common request header */
+const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1] = {
+	[ETHTOOL_A_TSCONFIG_HEADER]		=
+		NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+/* Collect the current hardware timestamping configuration of the device.
+ *
+ * tx_type and rx_filter are stored as single-bit masks, flags verbatim.
+ * The provider description comes from dev->hwprov when one is set,
+ * otherwise from the device's timestamping info.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the device has no ndo_hwtstamp_get,
+ * -ENODEV if no PHC index could be determined, or the error of the failing
+ * helper. ethnl_ops_complete() is called on every path that passed
+ * ethnl_ops_begin().
+ */
+static int tsconfig_prepare_data(const struct ethnl_req_info *req_base,
+				 struct ethnl_reply_data *reply_base,
+				 const struct genl_info *info)
+{
+	struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base);
+	struct hwtstamp_provider *hwprov = NULL;
+	struct net_device *dev = reply_base->dev;
+	struct kernel_hwtstamp_config cfg = {};
+	int ret;
+
+	if (!dev->netdev_ops->ndo_hwtstamp_get)
+		return -EOPNOTSUPP;
+
+	ret = ethnl_ops_begin(dev);
+	if (ret < 0)
+		return ret;
+
+	ret = dev_get_hwtstamp_phylib(dev, &cfg);
+	if (ret)
+		goto out;
+
+	data->hwtst_config.tx_type = BIT(cfg.tx_type);
+	data->hwtst_config.rx_filter = BIT(cfg.rx_filter);
+	data->hwtst_config.flags = cfg.flags;
+
+	data->hwprov_desc.index = -1;
+	hwprov = rtnl_dereference(dev->hwprov);
+	if (hwprov) {
+		data->hwprov_desc.index = hwprov->desc.index;
+		data->hwprov_desc.qualifier = hwprov->desc.qualifier;
+	} else {
+		struct kernel_ethtool_ts_info ts_info = {};
+
+		ts_info.phc_index = -1;
+		ret = __ethtool_get_ts_info(dev, &ts_info);
+		if (ret)
+			goto out;
+
+		if (ts_info.phc_index == -1) {
+			/* Leave through "out" so ethnl_ops_begin() above is
+			 * balanced by ethnl_ops_complete().
+			 */
+			ret = -ENODEV;
+			goto out;
+		}
+
+		data->hwprov_desc.index = ts_info.phc_index;
+		data->hwprov_desc.qualifier = ts_info.phc_qualifier;
+	}
+
+out:
+	ethnl_ops_complete(dev);
+	return ret;
+}
+
+/* Estimate the payload size of a TSCONFIG reply. Only non-zero bitsets and
+ * a provider with a non-negative index contribute.
+ * Returns the size or a negative error from ethnl_bitset32_size().
+ */
+static int tsconfig_reply_size(const struct ethnl_req_info *req_base,
+			       const struct ethnl_reply_data *reply_base)
+{
+	const struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base);
+	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+	int total = 0;
+	int sz;
+
+	/* every mask below is carried in a single u32 */
+	BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32);
+	BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32);
+	BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32);
+
+	/* _TSCONFIG_HWTSTAMP_FLAGS */
+	if (data->hwtst_config.flags) {
+		sz = ethnl_bitset32_size(&data->hwtst_config.flags,
+					 NULL, __HWTSTAMP_FLAG_CNT,
+					 ts_flags_names, compact);
+		if (sz < 0)
+			return sz;
+		total += sz;
+	}
+
+	/* _TSCONFIG_TX_TYPES */
+	if (data->hwtst_config.tx_type) {
+		sz = ethnl_bitset32_size(&data->hwtst_config.tx_type,
+					 NULL, __HWTSTAMP_TX_CNT,
+					 ts_tx_type_names, compact);
+		if (sz < 0)
+			return sz;
+		total += sz;
+	}
+
+	/* _TSCONFIG_RX_FILTERS */
+	if (data->hwtst_config.rx_filter) {
+		sz = ethnl_bitset32_size(&data->hwtst_config.rx_filter,
+					 NULL, __HWTSTAMP_FILTER_CNT,
+					 ts_rx_filter_names, compact);
+		if (sz < 0)
+			return sz;
+		total += sz;
+	}
+
+	/* _TSCONFIG_HWTSTAMP_PROVIDER: nest + index + qualifier */
+	if (data->hwprov_desc.index >= 0)
+		total += nla_total_size(0) +
+			 2 * nla_total_size(sizeof(u32));
+
+	return total;
+}
+
+/* Fill a TSCONFIG reply: flags, tx types and rx filters as bitsets (each
+ * only when non-zero), then the provider nest when its index is
+ * non-negative. Returns 0 on success, a negative bitset error, or -EMSGSIZE
+ * if the provider nest does not fit.
+ */
+static int tsconfig_fill_reply(struct sk_buff *skb,
+			       const struct ethnl_req_info *req_base,
+			       const struct ethnl_reply_data *reply_base)
+{
+	const struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base);
+	bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+	struct nlattr *prov;
+	int err;
+
+	if (data->hwtst_config.flags) {
+		err = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS,
+					 &data->hwtst_config.flags, NULL,
+					 __HWTSTAMP_FLAG_CNT,
+					 ts_flags_names, compact);
+		if (err < 0)
+			return err;
+	}
+
+	if (data->hwtst_config.tx_type) {
+		err = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_TX_TYPES,
+					 &data->hwtst_config.tx_type, NULL,
+					 __HWTSTAMP_TX_CNT,
+					 ts_tx_type_names, compact);
+		if (err < 0)
+			return err;
+	}
+
+	if (data->hwtst_config.rx_filter) {
+		err = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_RX_FILTERS,
+					 &data->hwtst_config.rx_filter,
+					 NULL, __HWTSTAMP_FILTER_CNT,
+					 ts_rx_filter_names, compact);
+		if (err < 0)
+			return err;
+	}
+
+	/* no provider known, nothing more to report */
+	if (data->hwprov_desc.index < 0)
+		return 0;
+
+	prov = nla_nest_start(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER);
+	if (!prov)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX,
+			data->hwprov_desc.index) ||
+	    nla_put_u32(skb,
+			ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER,
+			data->hwprov_desc.qualifier)) {
+		nla_nest_cancel(skb, prov);
+		return -EMSGSIZE;
+	}
+
+	nla_nest_end(skb, prov);
+	return 0;
+}
+
+/* TSCONFIG_SET */
+/* Attribute policy of the TSCONFIG_SET request: header, provider
+ * description and three nested attributes (flags, rx filters, tx types).
+ */
+const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1] = {
+	[ETHTOOL_A_TSCONFIG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+	[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER] =
+		NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy),
+	[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS] = { .type = NLA_NESTED },
+	[ETHTOOL_A_TSCONFIG_RX_FILTERS] = { .type = NLA_NESTED },
+	[ETHTOOL_A_TSCONFIG_TX_TYPES] = { .type = NLA_NESTED },
+};
+
+/* Build and send an ETHTOOL_MSG_TSCONFIG_SET_REPLY describing the current
+ * timestamping configuration of @dev. Must be called with RTNL held.
+ *
+ * Returns the result of genlmsg_reply() on success, -ENOMEM if a buffer or
+ * the reply message cannot be allocated, or the negative error of the
+ * prepare/size/fill step that failed. A reply message that could not be
+ * filled is freed here.
+ */
+static int tsconfig_send_reply(struct net_device *dev, struct genl_info *info)
+{
+	struct tsconfig_reply_data *reply_data;
+	struct tsconfig_req_info *req_info;
+	struct sk_buff *rskb;
+	void *reply_payload;
+	int reply_len = 0;
+	int ret;
+
+	req_info = kzalloc(sizeof(*req_info), GFP_KERNEL);
+	if (!req_info)
+		return -ENOMEM;
+	reply_data = kmalloc(sizeof(*reply_data), GFP_KERNEL);
+	if (!reply_data) {
+		kfree(req_info);
+		return -ENOMEM;
+	}
+
+	ASSERT_RTNL();
+	reply_data->base.dev = dev;
+	ret = tsconfig_prepare_data(&req_info->base, &reply_data->base, info);
+	if (ret < 0)
+		goto err_cleanup;
+
+	ret = tsconfig_reply_size(&req_info->base, &reply_data->base);
+	if (ret < 0)
+		goto err_cleanup;
+
+	reply_len = ret + ethnl_reply_header_size();
+	rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_TSCONFIG_SET_REPLY,
+				ETHTOOL_A_TSCONFIG_HEADER, info, &reply_payload);
+	if (!rskb) {
+		/* ret still holds the (non-negative) reply size here */
+		ret = -ENOMEM;
+		goto err_cleanup;
+	}
+
+	ret = tsconfig_fill_reply(rskb, &req_info->base, &reply_data->base);
+	if (ret < 0) {
+		/* the message was never handed to genlmsg_reply() */
+		nlmsg_free(rskb);
+		goto err_cleanup;
+	}
+
+	genlmsg_end(rskb, reply_payload);
+	ret = genlmsg_reply(rskb, info);
+
+err_cleanup:
+	kfree(reply_data);
+	kfree(req_info);
+	return ret;
+}
+
+/* Check that the device implements both ndo_hwtstamp_set and
+ * ndo_hwtstamp_get. Returns 1 when it does, -EOPNOTSUPP otherwise.
+ * NOTE(review): 1 presumably tells the caller to go on with the set
+ * operation - confirm against the ethnl set-request framework.
+ */
+static int ethnl_set_tsconfig_validate(struct ethnl_req_info *req_base,
+				       struct genl_info *info)
+{
+	const struct net_device_ops *ndo = req_base->dev->netdev_ops;
+
+	if (ndo->ndo_hwtstamp_set && ndo->ndo_hwtstamp_get)
+		return 1;
+
+	return -EOPNOTSUPP;
+}
+
+/* Allocate a hwtstamp_provider matching @hwprov_desc.
+ *
+ * The description is first looked up on the net device itself; if that
+ * fails, on the PHYs via ethtool_phy_get_ts_info_by_phc(). The source and
+ * phydev of the new provider are set accordingly (phydev stays NULL for
+ * the net device case).
+ *
+ * Returns the newly allocated provider, an ERR_PTR carrying the PHY lookup
+ * error (with an extack message for -ENODEV), or ERR_PTR(-ENOMEM).
+ * NOTE(review): the returned object is kzalloc()ed here and not freed in
+ * this function; presumably the caller owns it - confirm.
+ */
+static struct hwtstamp_provider *
+tsconfig_set_hwprov_from_desc(struct net_device *dev,
+			      struct genl_info *info,
+			      struct hwtstamp_provider_desc *hwprov_desc)
+{
+	/* filled by the lookup helpers; not read again in this function */
+	struct kernel_ethtool_ts_info ts_info;
+	struct hwtstamp_provider *hwprov;
+	struct nlattr **tb = info->attrs;
+	struct phy_device *phy = NULL;
+	enum hwtstamp_source source;
+	int ret;
+
+	ret = ethtool_net_get_ts_info_by_phc(dev, &ts_info, hwprov_desc);
+	if (!ret) {
+		/* Found */
+		source = HWTSTAMP_SOURCE_NETDEV;
+	} else {
+		phy = ethtool_phy_get_ts_info_by_phc(dev, &ts_info, hwprov_desc);
+		if (IS_ERR(phy)) {
+			if (PTR_ERR(phy) == -ENODEV)
+				NL_SET_ERR_MSG_ATTR(info->extack,
+						    tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER],
+						    "phc not in this net device topology");
+			return ERR_CAST(phy);
+		}
+
+		source = HWTSTAMP_SOURCE_PHYLIB;
+	}
+
+	hwprov = kzalloc(sizeof(*hwprov), GFP_KERNEL);
+	if (!hwprov)
+		return ERR_PTR(-ENOMEM);
+
+	hwprov->desc.index = hwprov_desc->index;
+	hwprov->desc.qualifier = hwprov_desc->qualifier;
+	hwprov->source = source;
+	hwprov->phydev = phy;
+
+	return hwprov;
+}
+/* .set hook: apply the requested provider / tx type / rx filter / flags, then send the reply. */
+static int ethnl_set_tsconfig(struct ethnl_req_info *req_base,
+ struct genl_info *info)
+{
+ struct kernel_hwtstamp_config hwtst_config = {0};
+ bool hwprov_mod = false, config_mod = false;
+ struct hwtstamp_provider *hwprov = NULL;
+ struct net_device *dev = req_base->dev;
+ struct nlattr **tb = info->attrs;
+ int ret;
+
+ BUILD_BUG_ON(__HWTSTAMP_TX_CNT >= 32);
+ BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT >= 32);
+ BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32);
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER]) {
+ struct hwtstamp_provider_desc __hwprov_desc = {.index = -1};
+ struct hwtstamp_provider *__hwprov;
+
+ __hwprov = rtnl_dereference(dev->hwprov);
+ if (__hwprov) { /* seed with the current provider so only real changes set hwprov_mod */
+ __hwprov_desc.index = __hwprov->desc.index;
+ __hwprov_desc.qualifier = __hwprov->desc.qualifier;
+ }
+
+ ret = ts_parse_hwtst_provider(tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER],
+ &__hwprov_desc, info->extack,
+ &hwprov_mod);
+ if (ret < 0)
+ return ret;
+
+ if (hwprov_mod) {
+ hwprov = tsconfig_set_hwprov_from_desc(dev, info,
+ &__hwprov_desc);
+ if (IS_ERR(hwprov))
+ return PTR_ERR(hwprov);
+ }
+ }
+
+ /* Get current hwtstamp config if we are not changing the
+ * hwtstamp source. It will be zeroed in the other case.
+ */
+ if (!hwprov_mod) {
+ ret = dev_get_hwtstamp_phylib(dev, &hwtst_config);
+ if (ret < 0 && ret != -EOPNOTSUPP) /* -EOPNOTSUPP: continue with the zeroed config */
+ goto err_free_hwprov;
+ }
+
+ /* Get the hwtstamp config from netlink */
+ if (tb[ETHTOOL_A_TSCONFIG_TX_TYPES]) {
+ u32 req_tx_type;
+
+ req_tx_type = BIT(hwtst_config.tx_type); /* current type as a one-bit mask */
+ ret = ethnl_update_bitset32(&req_tx_type,
+ __HWTSTAMP_TX_CNT,
+ tb[ETHTOOL_A_TSCONFIG_TX_TYPES],
+ ts_tx_type_names, info->extack,
+ &config_mod);
+ if (ret < 0)
+ goto err_free_hwprov;
+
+ /* Select only one tx type at a time */
+ if (ffs(req_tx_type) != fls(req_tx_type)) {
+ ret = -EINVAL;
+ goto err_free_hwprov;
+ }
+
+ hwtst_config.tx_type = ffs(req_tx_type) - 1; /* bit position back to enum value */
+ }
+
+ if (tb[ETHTOOL_A_TSCONFIG_RX_FILTERS]) {
+ u32 req_rx_filter;
+
+ req_rx_filter = BIT(hwtst_config.rx_filter); /* current filter as a one-bit mask */
+ ret = ethnl_update_bitset32(&req_rx_filter,
+ __HWTSTAMP_FILTER_CNT,
+ tb[ETHTOOL_A_TSCONFIG_RX_FILTERS],
+ ts_rx_filter_names, info->extack,
+ &config_mod);
+ if (ret < 0)
+ goto err_free_hwprov;
+
+ /* Select only one rx filter at a time */
+ if (ffs(req_rx_filter) != fls(req_rx_filter)) {
+ ret = -EINVAL;
+ goto err_free_hwprov;
+ }
+
+ hwtst_config.rx_filter = ffs(req_rx_filter) - 1; /* bit position back to enum value */
+ }
+
+ if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS]) {
+ ret = ethnl_update_bitset32(&hwtst_config.flags,
+ __HWTSTAMP_FLAG_CNT,
+ tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS],
+ ts_flags_names, info->extack,
+ &config_mod);
+ if (ret < 0)
+ goto err_free_hwprov;
+ }
+
+ ret = net_hwtstamp_validate(&hwtst_config);
+ if (ret)
+ goto err_free_hwprov;
+
+ if (hwprov_mod) {
+ struct kernel_hwtstamp_config zero_config = {0};
+ struct hwtstamp_provider *__hwprov;
+
+ /* Disable current time stamping if we try to enable
+ * another one
+ */
+ ret = dev_set_hwtstamp_phylib(dev, &zero_config, info->extack);
+ if (ret < 0)
+ goto err_free_hwprov;
+
+ /* Change the selected hwtstamp source */
+ __hwprov = rcu_replace_pointer_rtnl(dev->hwprov, hwprov);
+ if (__hwprov)
+ kfree_rcu(__hwprov, rcu_head);
+ }
+
+ if (config_mod) {
+ ret = dev_set_hwtstamp_phylib(dev, &hwtst_config,
+ info->extack);
+ if (ret < 0)
+ return ret; /* hwprov is NULL or already installed in dev->hwprov: nothing to free */
+ }
+
+ ret = tsconfig_send_reply(dev, info);
+ if (ret && ret != -EOPNOTSUPP) { /* -EOPNOTSUPP from the read-back is not treated as failure */
+ NL_SET_ERR_MSG(info->extack,
+ "error while reading the new configuration set");
+ return ret;
+ }
+
+ /* tsconfig has no notification */
+ return 0;
+
+err_free_hwprov:
+ kfree(hwprov); /* only reached before the provider is published in dev->hwprov */
+
+ return ret;
+}
+/* Request ops for TSCONFIG: the GET data/size/fill handlers plus the SET validate and set hooks. */
+const struct ethnl_request_ops ethnl_tsconfig_request_ops = {
+ .request_cmd = ETHTOOL_MSG_TSCONFIG_GET,
+ .reply_cmd = ETHTOOL_MSG_TSCONFIG_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_TSCONFIG_HEADER,
+ .req_info_size = sizeof(struct tsconfig_req_info),
+ .reply_data_size = sizeof(struct tsconfig_reply_data),
+
+ .prepare_data = tsconfig_prepare_data,
+ .reply_size = tsconfig_reply_size,
+ .fill_reply = tsconfig_fill_reply,
+
+ .set_validate = ethnl_set_tsconfig_validate,
+ .set = ethnl_set_tsconfig, /* no .set_ntf_cmd: ethnl_set_tsconfig() sends its own reply */
+};
diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c
new file mode 100644
index 000000000000..8c654caa6805
--- /dev/null
+++ b/net/ethtool/tsinfo.c
@@ -0,0 +1,564 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/net_tstamp.h>
+#include <linux/phy.h>
+#include <linux/phy_link_topology.h>
+#include <linux/ptp_clock_kernel.h>
+#include <net/netdev_lock.h>
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+#include "ts.h"
+/* TSINFO request state: the common request info plus the provider named in the request. */
+struct tsinfo_req_info {
+ struct ethnl_req_info base;
+ struct hwtstamp_provider_desc hwprov_desc; /* index == -1 when no provider was requested */
+};
+/* TSINFO reply state, filled by tsinfo_prepare_data() or by the dump helpers. */
+struct tsinfo_reply_data {
+ struct ethnl_reply_data base;
+ struct kernel_ethtool_ts_info ts_info;
+ struct ethtool_ts_stats stats; /* only filled when ETHTOOL_FLAG_STATS is set */
+};
+/* Recover the enclosing tsinfo_req_info from a pointer to its embedded base. */
+#define TSINFO_REQINFO(__req_base) \
+ container_of(__req_base, struct tsinfo_req_info, base)
+/* Recover the enclosing tsinfo_reply_data from a pointer to its embedded base. */
+#define TSINFO_REPDATA(__reply_base) \
+ container_of(__reply_base, struct tsinfo_reply_data, base)
+/* Number of stat attribute ids lying strictly between _UNSPEC and the _CNT sentinel. */
+#define ETHTOOL_TS_STAT_CNT \
+ (__ETHTOOL_A_TS_STAT_CNT - (ETHTOOL_A_TS_STAT_UNSPEC + 1))
+/* TSINFO_GET policy: a header nest (stats variant of the header policy) and a provider nest. */
+const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1] = {
+ [ETHTOOL_A_TSINFO_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy_stats),
+ [ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER] =
+ NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy),
+};
+/* Parse a provider nest into @hwprov_desc; -EINVAL if the index or qualifier attribute is missing. */
+int ts_parse_hwtst_provider(const struct nlattr *nest,
+ struct hwtstamp_provider_desc *hwprov_desc,
+ struct netlink_ext_ack *extack,
+ bool *mod)
+{
+ struct nlattr *tb[ARRAY_SIZE(ethnl_ts_hwtst_prov_policy)];
+ int ret;
+
+ ret = nla_parse_nested(tb,
+ ARRAY_SIZE(ethnl_ts_hwtst_prov_policy) - 1, /* highest attribute type */
+ nest,
+ ethnl_ts_hwtst_prov_policy, extack);
+ if (ret < 0)
+ return ret;
+
+ if (NL_REQ_ATTR_CHECK(extack, nest, tb,
+ ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX) ||
+ NL_REQ_ATTR_CHECK(extack, nest, tb,
+ ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER))
+ return -EINVAL;
+
+ ethnl_update_u32(&hwprov_desc->index,
+ tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX],
+ mod); /* NOTE(review): presumably sets *mod when the value changes - confirm */
+ ethnl_update_u32(&hwprov_desc->qualifier,
+ tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER],
+ mod);
+
+ return 0;
+}
+/* .parse_request hook: record the optional provider selector from the request. */
+static int
+tsinfo_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct tsinfo_req_info *req = TSINFO_REQINFO(req_base);
+ bool mod = false; /* required by the parser, result unused here */
+
+ req->hwprov_desc.index = -1; /* -1 marks "no provider requested" for tsinfo_prepare_data() */
+
+ if (!tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER])
+ return 0;
+
+ return ts_parse_hwtst_provider(tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER],
+ &req->hwprov_desc, extack, &mod);
+}
+/* .prepare_data hook: query the requested provider, or else the device's ts info (and stats). */
+static int tsinfo_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base);
+ struct tsinfo_req_info *req = TSINFO_REQINFO(req_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+
+ if (req->hwprov_desc.index != -1) { /* a provider was named: stats are not collected here */
+ ret = ethtool_get_ts_info_by_phc(dev, &data->ts_info,
+ &req->hwprov_desc);
+ ethnl_ops_complete(dev);
+ return ret;
+ }
+
+ if (req_base->flags & ETHTOOL_FLAG_STATS) {
+ ethtool_stats_init((u64 *)&data->stats,
+ sizeof(data->stats) / sizeof(u64));
+ if (dev->ethtool_ops->get_ts_stats)
+ dev->ethtool_ops->get_ts_stats(dev, &data->stats);
+ }
+
+ ret = __ethtool_get_ts_info(dev, &data->ts_info);
+ ethnl_ops_complete(dev);
+
+ return ret;
+}
+/* .reply_size hook: payload size estimate, attribute by attribute as in tsinfo_fill_reply(). */
+static int tsinfo_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct kernel_ethtool_ts_info *ts_info = &data->ts_info;
+ int len = 0;
+ int ret;
+
+ BUILD_BUG_ON(__SOF_TIMESTAMPING_CNT > 32);
+ BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32);
+ BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32);
+
+ if (ts_info->so_timestamping) {
+ ret = ethnl_bitset32_size(&ts_info->so_timestamping, NULL,
+ __SOF_TIMESTAMPING_CNT,
+ sof_timestamping_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSINFO_TIMESTAMPING */
+ }
+ if (ts_info->tx_types) {
+ ret = ethnl_bitset32_size(&ts_info->tx_types, NULL,
+ __HWTSTAMP_TX_CNT,
+ ts_tx_type_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSINFO_TX_TYPES */
+ }
+ if (ts_info->rx_filters) {
+ ret = ethnl_bitset32_size(&ts_info->rx_filters, NULL,
+ __HWTSTAMP_FILTER_CNT,
+ ts_rx_filter_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSINFO_RX_FILTERS */
+ }
+ if (ts_info->phc_index >= 0) {
+ len += nla_total_size(sizeof(u32)); /* _TSINFO_PHC_INDEX */
+ /* _TSINFO_HWTSTAMP_PROVIDER */
+ len += nla_total_size(0) + 2 * nla_total_size(sizeof(u32)); /* nest + index + qualifier */
+ }
+ if (ts_info->phc_source) {
+ len += nla_total_size(sizeof(u32)); /* _TSINFO_HWTSTAMP_SOURCE */
+ if (ts_info->phc_phyindex)
+ /* _TSINFO_HWTSTAMP_PHYINDEX */
+ len += nla_total_size(sizeof(u32));
+ }
+ if (req_base->flags & ETHTOOL_FLAG_STATS)
+ len += nla_total_size(0) + /* _TSINFO_STATS */
+ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_TS_STAT_CNT; /* room for every stat */
+
+ return len;
+}
+/* Put one stat as a uint attribute; a value of ETHTOOL_STAT_NOT_SET is silently omitted. */
+static int tsinfo_put_stat(struct sk_buff *skb, u64 val, u16 attrtype)
+{
+ if (val == ETHTOOL_STAT_NOT_SET)
+ return 0;
+ if (nla_put_uint(skb, attrtype, val))
+ return -EMSGSIZE;
+ return 0;
+}
+/* Emit the ETHTOOL_A_TSINFO_STATS nest with the four TX stats; -EMSGSIZE if @skb is full. */
+static int tsinfo_put_stats(struct sk_buff *skb,
+ const struct ethtool_ts_stats *stats)
+{
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_STATS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (tsinfo_put_stat(skb, stats->tx_stats.pkts,
+ ETHTOOL_A_TS_STAT_TX_PKTS) ||
+ tsinfo_put_stat(skb, stats->tx_stats.onestep_pkts_unconfirmed,
+ ETHTOOL_A_TS_STAT_TX_ONESTEP_PKTS_UNCONFIRMED) ||
+ tsinfo_put_stat(skb, stats->tx_stats.lost,
+ ETHTOOL_A_TS_STAT_TX_LOST) ||
+ tsinfo_put_stat(skb, stats->tx_stats.err,
+ ETHTOOL_A_TS_STAT_TX_ERR))
+ goto err_cancel;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+err_cancel:
+ nla_nest_cancel(skb, nest); /* remove the partially written nest */
+ return -EMSGSIZE;
+}
+/* .fill_reply hook: write each attribute only when the corresponding ts_info field is set. */
+static int tsinfo_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct kernel_ethtool_ts_info *ts_info = &data->ts_info;
+ int ret;
+
+ if (ts_info->so_timestamping) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TIMESTAMPING,
+ &ts_info->so_timestamping, NULL,
+ __SOF_TIMESTAMPING_CNT,
+ sof_timestamping_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+ if (ts_info->tx_types) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TX_TYPES,
+ &ts_info->tx_types, NULL,
+ __HWTSTAMP_TX_CNT,
+ ts_tx_type_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+ if (ts_info->rx_filters) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_RX_FILTERS,
+ &ts_info->rx_filters, NULL,
+ __HWTSTAMP_FILTER_CNT,
+ ts_rx_filter_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+ if (ts_info->phc_index >= 0) { /* a negative index means no PHC is reported */
+ struct nlattr *nest;
+
+ ret = nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX,
+ ts_info->phc_index);
+ if (ret)
+ return -EMSGSIZE;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX,
+ ts_info->phc_index) || /* the same index is repeated inside the provider nest */
+ nla_put_u32(skb,
+ ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER,
+ ts_info->phc_qualifier)) {
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(skb, nest);
+ }
+ if (ts_info->phc_source) {
+ if (nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_SOURCE,
+ ts_info->phc_source))
+ return -EMSGSIZE;
+
+ if (ts_info->phc_phyindex &&
+ nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PHYINDEX,
+ ts_info->phc_phyindex))
+ return -EMSGSIZE;
+ }
+ if (req_base->flags & ETHTOOL_FLAG_STATS &&
+ tsinfo_put_stats(skb, &data->stats))
+ return -EMSGSIZE;
+
+ return 0;
+}
+/* Dump state stored in cb->ctx: buffers from ethnl_tsinfo_start() plus the resume cursor. */
+struct ethnl_tsinfo_dump_ctx {
+ struct tsinfo_req_info *req_info;
+ struct tsinfo_reply_data *reply_data;
+ unsigned long pos_ifindex; /* cursor of for_each_netdev_dump() */
+ bool netdev_dump_done; /* the device's own entries were already dumped */
+ unsigned long pos_phyindex; /* cursor into dev->link_topo->phys */
+ enum hwtstamp_provider_qualifier pos_phcqualifier; /* next qualifier to try on the netdev */
+};
+/* Start one dump message and reset the reply buffer for @dev; ERR_PTR(-EMSGSIZE) if @skb is full. */
+static void *ethnl_tsinfo_prepare_dump(struct sk_buff *skb,
+ struct net_device *dev,
+ struct tsinfo_reply_data *reply_data,
+ struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ void *ehdr = NULL;
+
+ ehdr = ethnl_dump_put(skb, cb,
+ ETHTOOL_MSG_TSINFO_GET_REPLY);
+ if (!ehdr)
+ return ERR_PTR(-EMSGSIZE);
+
+ reply_data = ctx->reply_data; /* NOTE: the reply_data argument is overridden here */
+ memset(reply_data, 0, sizeof(*reply_data));
+ reply_data->base.dev = dev;
+ reply_data->ts_info.cmd = ETHTOOL_GET_TS_INFO;
+ reply_data->ts_info.phc_index = -1; /* "no PHC" until the ts_info callback says otherwise */
+
+ return ehdr;
+}
+/* Complete one dump message: add the software flags, the reply header and the payload. */
+static int ethnl_tsinfo_end_dump(struct sk_buff *skb,
+ struct net_device *dev,
+ struct tsinfo_req_info *req_info,
+ struct tsinfo_reply_data *reply_data,
+ void *ehdr)
+{
+ int ret;
+
+ reply_data->ts_info.so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+
+ ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TSINFO_HEADER);
+ if (ret < 0)
+ return ret; /* the caller cancels the message */
+
+ ret = tsinfo_fill_reply(skb, &req_info->base, &reply_data->base);
+ if (ret < 0)
+ return ret;
+
+ reply_data->base.dev = NULL;
+ genlmsg_end(skb, ehdr);
+
+ return ret;
+}
+/* Dump one message describing @phydev's time stamper; -EOPNOTSUPP if the PHY has no tsinfo. */
+static int ethnl_tsinfo_dump_one_phydev(struct sk_buff *skb,
+ struct net_device *dev,
+ struct phy_device *phydev,
+ struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ struct tsinfo_reply_data *reply_data;
+ struct tsinfo_req_info *req_info;
+ void *ehdr = NULL;
+ int ret = 0;
+
+ if (!phy_has_tsinfo(phydev))
+ return -EOPNOTSUPP;
+
+ reply_data = ctx->reply_data;
+ req_info = ctx->req_info;
+ ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb);
+ if (IS_ERR(ehdr))
+ return PTR_ERR(ehdr); /* no message was started, so nothing to cancel */
+
+ ret = phy_ts_info(phydev, &reply_data->ts_info);
+ if (ret < 0)
+ goto err;
+
+ if (reply_data->ts_info.phc_index >= 0) {
+ reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_PHYLIB;
+ reply_data->ts_info.phc_phyindex = phydev->phyindex;
+ }
+
+ ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, ehdr);
+ if (ret < 0)
+ goto err;
+
+ return ret;
+err:
+ genlmsg_cancel(skb, ehdr); /* drop the partially built message */
+ return ret;
+}
+/* Dump one message per hwtstamp qualifier that @dev supports, resuming at ctx->pos_phcqualifier. */
+static int ethnl_tsinfo_dump_one_netdev(struct sk_buff *skb,
+ struct net_device *dev,
+ struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct tsinfo_reply_data *reply_data;
+ struct tsinfo_req_info *req_info;
+ void *ehdr = NULL;
+ int ret = 0;
+
+ if (!ops->get_ts_info)
+ return -EOPNOTSUPP;
+
+ reply_data = ctx->reply_data;
+ req_info = ctx->req_info;
+ for (; ctx->pos_phcqualifier < HWTSTAMP_PROVIDER_QUALIFIER_CNT;
+ ctx->pos_phcqualifier++) {
+ if (!net_support_hwtstamp_qualifier(dev,
+ ctx->pos_phcqualifier))
+ continue;
+
+ ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb);
+ /* No message was started: never pass the ERR_PTR to genlmsg_cancel() */
+ if (IS_ERR(ehdr))
+ return PTR_ERR(ehdr);
+
+ reply_data->ts_info.phc_qualifier = ctx->pos_phcqualifier;
+ ret = ops->get_ts_info(dev, &reply_data->ts_info);
+ if (ret < 0)
+ goto err;
+
+ if (reply_data->ts_info.phc_index >= 0)
+ reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_NETDEV;
+ ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data,
+ ehdr);
+ if (ret < 0)
+ goto err;
+ }
+
+ return ret;
+
+err:
+ /* Only reached with a valid @ehdr: drop the partially built message */
+ genlmsg_cancel(skb, ehdr);
+ return ret;
+}
+/* Dump @dev's own time stamper entries, then those of its PHY(s); -EOPNOTSUPP never aborts. */
+static int ethnl_tsinfo_dump_one_net_topo(struct sk_buff *skb,
+ struct net_device *dev,
+ struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ struct phy_device_node *pdn;
+ int ret = 0;
+
+ if (!ctx->netdev_dump_done) {
+ ret = ethnl_tsinfo_dump_one_netdev(skb, dev, cb);
+ if (ret < 0 && ret != -EOPNOTSUPP)
+ return ret;
+ ctx->netdev_dump_done = true; /* do not repeat the netdev part when the dump resumes */
+ }
+
+ if (!dev->link_topo) { /* no PHY topology: only dev->phydev can be reported */
+ if (phy_has_tsinfo(dev->phydev)) {
+ ret = ethnl_tsinfo_dump_one_phydev(skb, dev,
+ dev->phydev, cb);
+ if (ret < 0 && ret != -EOPNOTSUPP)
+ return ret;
+ }
+
+ return 0;
+ }
+
+ xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn,
+ ctx->pos_phyindex) { /* the cursor lives in ctx, so a later call resumes here */
+ if (phy_has_tsinfo(pdn->phy)) {
+ ret = ethnl_tsinfo_dump_one_phydev(skb, dev,
+ pdn->phy, cb);
+ if (ret < 0 && ret != -EOPNOTSUPP)
+ return ret;
+ }
+ }
+
+ return ret;
+}
+/* Dump callback: one device if the request named it, otherwise every netdev of the netns. */
+int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ if (ctx->req_info->base.dev) {
+ dev = ctx->req_info->base.dev;
+ netdev_lock_ops(dev);
+ ret = ethnl_tsinfo_dump_one_net_topo(skb, dev, cb);
+ netdev_unlock_ops(dev);
+ } else {
+ for_each_netdev_dump(net, dev, ctx->pos_ifindex) {
+ netdev_lock_ops(dev);
+ ret = ethnl_tsinfo_dump_one_net_topo(skb, dev, cb);
+ netdev_unlock_ops(dev);
+ if (ret < 0 && ret != -EOPNOTSUPP)
+ break; /* per-device cursors are kept so the dump can resume on this device */
+ ctx->pos_phyindex = 0; /* device finished: reset the per-device cursors */
+ ctx->netdev_dump_done = false;
+ ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE;
+ }
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+/* Dump start callback: allocate the request/reply buffers and initialise the cursor in cb->ctx. */
+int ethnl_tsinfo_start(struct netlink_callback *cb)
+{
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ struct nlattr **tb = info->info.attrs;
+ struct tsinfo_reply_data *reply_data;
+ struct tsinfo_req_info *req_info;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); /* the context must fit in the callback scratch area */
+
+ req_info = kzalloc(sizeof(*req_info), GFP_KERNEL);
+ if (!req_info)
+ return -ENOMEM;
+ reply_data = kzalloc(sizeof(*reply_data), GFP_KERNEL);
+ if (!reply_data) {
+ ret = -ENOMEM;
+ goto free_req_info;
+ }
+
+ ret = ethnl_parse_header_dev_get(&req_info->base,
+ tb[ETHTOOL_A_TSINFO_HEADER],
+ sock_net(cb->skb->sk), cb->extack,
+ false);
+ if (ret < 0)
+ goto free_reply_data;
+
+ ctx->req_info = req_info; /* both buffers are released by ethnl_tsinfo_done() */
+ ctx->reply_data = reply_data;
+ ctx->pos_ifindex = 0;
+ ctx->pos_phyindex = 0;
+ ctx->netdev_dump_done = false;
+ ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE;
+
+ return 0;
+
+free_reply_data:
+ kfree(reply_data);
+free_req_info:
+ kfree(req_info);
+
+ return ret;
+}
+/* Dump done callback: undo ethnl_parse_header_dev_get() and free the buffers from _start(). */
+int ethnl_tsinfo_done(struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ struct tsinfo_req_info *req_info = ctx->req_info;
+
+ ethnl_parse_header_dev_put(&req_info->base);
+ kfree(ctx->reply_data);
+ kfree(ctx->req_info);
+
+ return 0;
+}
+/* Request ops for TSINFO_GET; no .set hooks are provided for this command. */
+const struct ethnl_request_ops ethnl_tsinfo_request_ops = {
+ .request_cmd = ETHTOOL_MSG_TSINFO_GET,
+ .reply_cmd = ETHTOOL_MSG_TSINFO_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_TSINFO_HEADER,
+ .req_info_size = sizeof(struct tsinfo_req_info),
+ .reply_data_size = sizeof(struct tsinfo_reply_data),
+
+ .parse_request = tsinfo_parse_request,
+ .prepare_data = tsinfo_prepare_data,
+ .reply_size = tsinfo_reply_size,
+ .fill_reply = tsinfo_fill_reply,
+};
diff --git a/net/ethtool/tunnels.c b/net/ethtool/tunnels.c
new file mode 100644
index 000000000000..b4ce47dd2aa6
--- /dev/null
+++ b/net/ethtool/tunnels.c
@@ -0,0 +1,281 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/ethtool_netlink.h>
+#include <net/udp_tunnel.h>
+#include <net/vxlan.h>
+
+#include "bitset.h"
+#include "common.h"
+#include "netlink.h"
+/* TUNNEL_INFO_GET policy: only the nested request header is accepted. */
+const struct nla_policy ethnl_tunnel_info_get_policy[] = {
+ [ETHTOOL_A_TUNNEL_INFO_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+/* Each ETHTOOL_UDP_TUNNEL_TYPE_* value must equal the bit number of its UDP_TUNNEL_TYPE_* flag. */
+static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN == ilog2(UDP_TUNNEL_TYPE_VXLAN));
+static_assert(ETHTOOL_UDP_TUNNEL_TYPE_GENEVE == ilog2(UDP_TUNNEL_TYPE_GENEVE));
+static_assert(ETHTOOL_UDP_TUNNEL_TYPE_VXLAN_GPE ==
+ ilog2(UDP_TUNNEL_TYPE_VXLAN_GPE));
+/* Size of one UDP table nest without its entries: table nest, size attribute and types bitset. */
+static ssize_t ethnl_udp_table_reply_size(unsigned int types, bool compact)
+{
+ ssize_t size;
+
+ size = ethnl_bitset32_size(&types, NULL, __ETHTOOL_UDP_TUNNEL_TYPE_CNT,
+ udp_tunnel_type_names, compact);
+ if (size < 0)
+ return size;
+
+ return size +
+ nla_total_size(0) + /* _UDP_TABLE */
+ nla_total_size(sizeof(u32)); /* _UDP_TABLE_SIZE */
+}
+/* Payload size for one device's tunnel info; -EOPNOTSUPP if it has no udp_tunnel_nic_info. */
+static ssize_t
+ethnl_tunnel_info_reply_size(const struct ethnl_req_info *req_base,
+ struct netlink_ext_ack *extack)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct udp_tunnel_nic_info *info;
+ unsigned int i;
+ ssize_t ret;
+ size_t size;
+
+ info = req_base->dev->udp_tunnel_nic_info;
+ if (!info) {
+ NL_SET_ERR_MSG(extack,
+ "device does not report tunnel offload info");
+ return -EOPNOTSUPP;
+ }
+
+ size = nla_total_size(0); /* _INFO_UDP_PORTS */
+
+ for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) {
+ if (!info->tables[i].n_entries)
+ break; /* the first table without entries ends the list */
+
+ ret = ethnl_udp_table_reply_size(info->tables[i].tunnel_types,
+ compact);
+ if (ret < 0)
+ return ret;
+ size += ret;
+
+ size += udp_tunnel_nic_dump_size(req_base->dev, i);
+ }
+
+ if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN) { /* extra table, see fill_reply */
+ ret = ethnl_udp_table_reply_size(0, compact);
+ if (ret < 0)
+ return ret;
+ size += ret;
+
+ size += nla_total_size(0) + /* _TABLE_ENTRY */
+ nla_total_size(sizeof(__be16)) + /* _ENTRY_PORT */
+ nla_total_size(sizeof(u32)); /* _ENTRY_TYPE */
+ }
+
+ return size;
+}
+/* Write the ETHTOOL_A_TUNNEL_INFO_UDP_PORTS nest for req_base->dev; -EMSGSIZE if @skb is full. */
+static int
+ethnl_tunnel_info_fill_reply(const struct ethnl_req_info *req_base,
+ struct sk_buff *skb)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct udp_tunnel_nic_info *info;
+ struct nlattr *ports, *table, *entry;
+ unsigned int i;
+
+ info = req_base->dev->udp_tunnel_nic_info;
+ if (!info)
+ return -EOPNOTSUPP;
+
+ ports = nla_nest_start(skb, ETHTOOL_A_TUNNEL_INFO_UDP_PORTS);
+ if (!ports)
+ return -EMSGSIZE;
+
+ for (i = 0; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) {
+ if (!info->tables[i].n_entries)
+ break; /* the first table without entries ends the list */
+
+ table = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE);
+ if (!table)
+ goto err_cancel_ports;
+
+ if (nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE,
+ info->tables[i].n_entries))
+ goto err_cancel_table;
+
+ if (ethnl_put_bitset32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES,
+ &info->tables[i].tunnel_types, NULL,
+ __ETHTOOL_UDP_TUNNEL_TYPE_CNT,
+ udp_tunnel_type_names, compact))
+ goto err_cancel_table;
+
+ if (udp_tunnel_nic_dump_write(req_base->dev, i, skb))
+ goto err_cancel_table;
+
+ nla_nest_end(skb, table);
+ }
+
+ if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN) { /* one-entry table for the IANA VXLAN port */
+ u32 zero = 0;
+
+ table = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE);
+ if (!table)
+ goto err_cancel_ports;
+
+ if (nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_SIZE, 1))
+ goto err_cancel_table;
+
+ if (ethnl_put_bitset32(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES,
+ &zero, NULL,
+ __ETHTOOL_UDP_TUNNEL_TYPE_CNT,
+ udp_tunnel_type_names, compact))
+ goto err_cancel_table;
+
+ entry = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY);
+ if (!entry)
+ goto err_cancel_table; /* the entry nest was never opened: skip its cancel */
+
+ if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT,
+ htons(IANA_VXLAN_UDP_PORT)) ||
+ nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE,
+ ilog2(UDP_TUNNEL_TYPE_VXLAN)))
+ goto err_cancel_entry;
+
+ nla_nest_end(skb, entry);
+ nla_nest_end(skb, table);
+ }
+
+ nla_nest_end(skb, ports);
+
+ return 0;
+
+err_cancel_entry:
+ nla_nest_cancel(skb, entry);
+err_cancel_table:
+ nla_nest_cancel(skb, table);
+err_cancel_ports:
+ nla_nest_cancel(skb, ports);
+ return -EMSGSIZE;
+}
+/* TUNNEL_INFO_GET doit handler: size, build and send the reply for the device in the header. */
+int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct ethnl_req_info req_info = {};
+ struct nlattr **tb = info->attrs;
+ struct sk_buff *rskb;
+ void *reply_payload;
+ int reply_len;
+ int ret;
+
+ ret = ethnl_parse_header_dev_get(&req_info,
+ tb[ETHTOOL_A_TUNNEL_INFO_HEADER],
+ genl_info_net(info), info->extack,
+ true);
+ if (ret < 0)
+ return ret;
+
+ rtnl_lock(); /* held across sizing and filling */
+ ret = ethnl_tunnel_info_reply_size(&req_info, info->extack);
+ if (ret < 0)
+ goto err_unlock_rtnl;
+ reply_len = ret + ethnl_reply_header_size();
+
+ rskb = ethnl_reply_init(reply_len, req_info.dev,
+ ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY,
+ ETHTOOL_A_TUNNEL_INFO_HEADER,
+ info, &reply_payload);
+ if (!rskb) {
+ ret = -ENOMEM;
+ goto err_unlock_rtnl;
+ }
+
+ ret = ethnl_tunnel_info_fill_reply(&req_info, rskb);
+ if (ret)
+ goto err_free_msg;
+ rtnl_unlock();
+ ethnl_parse_header_dev_put(&req_info);
+ genlmsg_end(rskb, reply_payload);
+
+ return genlmsg_reply(rskb, info);
+
+err_free_msg:
+ nlmsg_free(rskb); /* the reply was never sent, so it is still ours to free */
+err_unlock_rtnl:
+ rtnl_unlock();
+ ethnl_parse_header_dev_put(&req_info);
+ return ret;
+}
+/* Dump state stored in cb->ctx for TUNNEL_INFO_GET dumps. */
+struct ethnl_tunnel_info_dump_ctx {
+ struct ethnl_req_info req_info;
+ unsigned long ifindex; /* cursor of for_each_netdev_dump() */
+};
+/* Dump start callback: parse the request header into cb->ctx; no device reference is kept. */
+int ethnl_tunnel_info_start(struct netlink_callback *cb)
+{
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx;
+ struct nlattr **tb = info->info.attrs;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); /* the context must fit in the callback scratch area */
+
+ memset(ctx, 0, sizeof(*ctx));
+
+ ret = ethnl_parse_header_dev_get(&ctx->req_info,
+ tb[ETHTOOL_A_TUNNEL_INFO_HEADER],
+ sock_net(cb->skb->sk), cb->extack,
+ false);
+ if (ctx->req_info.dev) { /* the dump walks every netdev, so release a device named in the header */
+ ethnl_parse_header_dev_put(&ctx->req_info);
+ ctx->req_info.dev = NULL;
+ }
+
+ return ret;
+}
+/* Dump callback: one message per netdev that reports tunnel info; other devices are skipped. */
+int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ethnl_tunnel_info_dump_ctx *ctx = (void *)cb->ctx;
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int ret = 0;
+ void *ehdr;
+
+ rtnl_lock();
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ ehdr = ethnl_dump_put(skb, cb,
+ ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY);
+ if (!ehdr) {
+ ret = -EMSGSIZE;
+ break;
+ }
+
+ ret = ethnl_fill_reply_header(skb, dev,
+ ETHTOOL_A_TUNNEL_INFO_HEADER);
+ if (ret < 0) {
+ genlmsg_cancel(skb, ehdr);
+ break;
+ }
+
+ ctx->req_info.dev = dev; /* borrowed for the fill only, cleared right after */
+ ret = ethnl_tunnel_info_fill_reply(&ctx->req_info, skb);
+ ctx->req_info.dev = NULL;
+ if (ret < 0) {
+ genlmsg_cancel(skb, ehdr);
+ if (ret == -EOPNOTSUPP)
+ continue; /* device without tunnel info: skip it */
+ break;
+ }
+ genlmsg_end(skb, ehdr);
+ }
+ rtnl_unlock();
+
+ if (ret == -EMSGSIZE && skb->len)
+ return skb->len; /* partial dump: ask to be called again */
+ return ret == -EOPNOTSUPP ? 0 : ret; /* a skipped last device must not fail the whole dump */
+}
diff --git a/net/ethtool/wol.c b/net/ethtool/wol.c
new file mode 100644
index 000000000000..a39d8000d808
--- /dev/null
+++ b/net/ethtool/wol.c
@@ -0,0 +1,158 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+/* WOL request state: nothing beyond the common request info. */
+struct wol_req_info {
+ struct ethnl_req_info base;
+};
+/* WOL reply state, filled by wol_prepare_data(). */
+struct wol_reply_data {
+ struct ethnl_reply_data base;
+ struct ethtool_wolinfo wol;
+ bool show_sopass; /* include ETHTOOL_A_WOL_SOPASS in the reply */
+};
+/* Recover the enclosing wol_reply_data from a pointer to its embedded base. */
+#define WOL_REPDATA(__reply_base) \
+ container_of(__reply_base, struct wol_reply_data, base)
+/* WOL_GET policy: only the nested request header is accepted. */
+const struct nla_policy ethnl_wol_get_policy[] = {
+ [ETHTOOL_A_WOL_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+/* .prepare_data hook: read the WoL settings and decide whether the password may be shown. */
+static int wol_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct wol_reply_data *data = WOL_REPDATA(reply_base);
+ struct net_device *dev = reply_base->dev;
+ int ret;
+
+ if (!dev->ethtool_ops->get_wol)
+ return -EOPNOTSUPP;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+ dev->ethtool_ops->get_wol(dev, &data->wol);
+ ethnl_ops_complete(dev);
+ /* do not include password in notifications */
+ data->show_sopass = !genl_info_is_ntf(info) &&
+ (data->wol.supported & WAKE_MAGICSECURE); /* and only if SecureOn is supported */
+
+ return 0;
+}
+/* .reply_size hook: the modes bitset plus, when shown, the password attribute. */
+static int wol_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct wol_reply_data *data = WOL_REPDATA(reply_base);
+ int len;
+
+ len = ethnl_bitset32_size(&data->wol.wolopts, &data->wol.supported,
+ WOL_MODE_COUNT, wol_mode_names, compact);
+ if (len < 0)
+ return len;
+ if (data->show_sopass)
+ len += nla_total_size(sizeof(data->wol.sopass));
+
+ return len;
+}
+/* .fill_reply hook: put the modes bitset (value wolopts, mask supported) and maybe the password. */
+static int wol_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ const struct wol_reply_data *data = WOL_REPDATA(reply_base);
+ int ret;
+
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_WOL_MODES, &data->wol.wolopts,
+ &data->wol.supported, WOL_MODE_COUNT,
+ wol_mode_names, compact);
+ if (ret < 0)
+ return ret;
+ if (data->show_sopass &&
+ nla_put(skb, ETHTOOL_A_WOL_SOPASS, sizeof(data->wol.sopass),
+ data->wol.sopass))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+/* WOL_SET */
+/* WOL_SET policy: header nest, modes nest and a binary password bounded by SOPASS_MAX. */
+const struct nla_policy ethnl_wol_set_policy[] = {
+ [ETHTOOL_A_WOL_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_WOL_MODES] = { .type = NLA_NESTED },
+ [ETHTOOL_A_WOL_SOPASS] = { .type = NLA_BINARY,
+ .len = SOPASS_MAX },
+};
+/* .set_validate hook: 1 if the driver has both get_wol and set_wol, otherwise -EOPNOTSUPP. */
+static int
+ethnl_set_wol_validate(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ const struct ethtool_ops *ops = req_info->dev->ethtool_ops;
+
+ return ops->get_wol && ops->set_wol ? 1 : -EOPNOTSUPP;
+}
+/* .set hook: read the current WoL state, merge the request into it and write it back if changed. */
+static int
+ethnl_set_wol(struct ethnl_req_info *req_info, struct genl_info *info)
+{
+ struct ethtool_wolinfo wol = { .cmd = ETHTOOL_GWOL };
+ struct net_device *dev = req_info->dev;
+ struct nlattr **tb = info->attrs;
+ bool mod = false;
+ int ret;
+
+ dev->ethtool_ops->get_wol(dev, &wol); /* get_wol presence was checked in ethnl_set_wol_validate() */
+ ret = ethnl_update_bitset32(&wol.wolopts, WOL_MODE_COUNT,
+ tb[ETHTOOL_A_WOL_MODES], wol_mode_names,
+ info->extack, &mod);
+ if (ret < 0)
+ return ret;
+ if (wol.wolopts & ~wol.supported) { /* a requested mode lies outside the supported mask */
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_WOL_MODES],
+ "cannot enable unsupported WoL mode");
+ return -EINVAL;
+ }
+ if (tb[ETHTOOL_A_WOL_SOPASS]) {
+ if (!(wol.supported & WAKE_MAGICSECURE)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_WOL_SOPASS],
+ "magicsecure not supported, cannot set password");
+ return -EINVAL;
+ }
+ ethnl_update_binary(wol.sopass, sizeof(wol.sopass),
+ tb[ETHTOOL_A_WOL_SOPASS], &mod);
+ }
+
+ if (!mod)
+ return 0; /* nothing changed: the driver is not called */
+ ret = dev->ethtool_ops->set_wol(dev, &wol);
+ if (ret)
+ return ret;
+ dev->ethtool->wol_enabled = !!wol.wolopts; /* true when at least one mode is enabled */
+ return 1; /* NOTE(review): presumably requests the ETHTOOL_MSG_WOL_NTF notification - confirm */
+}
+/* Request ops for WOL: GET handlers, SET hooks and the notification command for changes. */
+const struct ethnl_request_ops ethnl_wol_request_ops = {
+ .request_cmd = ETHTOOL_MSG_WOL_GET,
+ .reply_cmd = ETHTOOL_MSG_WOL_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_WOL_HEADER,
+ .req_info_size = sizeof(struct wol_req_info),
+ .reply_data_size = sizeof(struct wol_reply_data),
+
+ .prepare_data = wol_prepare_data,
+ .reply_size = wol_reply_size,
+ .fill_reply = wol_fill_reply,
+
+ .set_validate = ethnl_set_wol_validate,
+ .set = ethnl_set_wol,
+ .set_ntf_cmd = ETHTOOL_MSG_WOL_NTF,
+};
diff --git a/net/handshake/.kunitconfig b/net/handshake/.kunitconfig
new file mode 100644
index 000000000000..5c48cf4abca2
--- /dev/null
+++ b/net/handshake/.kunitconfig
@@ -0,0 +1,11 @@
+CONFIG_KUNIT=y
+CONFIG_UBSAN=y
+CONFIG_STACKTRACE=y
+CONFIG_NET=y
+CONFIG_NETWORK_FILESYSTEMS=y
+CONFIG_INET=y
+CONFIG_MULTIUSER=y
+CONFIG_NFS_FS=y
+CONFIG_SUNRPC=y
+CONFIG_NET_HANDSHAKE=y
+CONFIG_NET_HANDSHAKE_KUNIT_TEST=y
diff --git a/net/handshake/Makefile b/net/handshake/Makefile
new file mode 100644
index 000000000000..ef4d9a2112bd
--- /dev/null
+++ b/net/handshake/Makefile
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for the Generic HANDSHAKE service
+#
+# Author: Chuck Lever <chuck.lever@oracle.com>
+#
+# Copyright (c) 2023, Oracle and/or its affiliates.
+#
+
+obj-y += handshake.o
+handshake-y := alert.o genl.o netlink.o request.o tlshd.o trace.o
+
+obj-$(CONFIG_NET_HANDSHAKE_KUNIT_TEST) += handshake-test.o
diff --git a/net/handshake/alert.c b/net/handshake/alert.c
new file mode 100644
index 000000000000..329d91984683
--- /dev/null
+++ b/net/handshake/alert.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Handle the TLS Alert protocol
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
+
+#include <net/sock.h>
+#include <net/handshake.h>
+#include <net/tls.h>
+#include <net/tls_prot.h>
+
+#include "handshake.h"
+
+#include <trace/events/handshake.h>
+
+/**
+ * tls_alert_send - send a TLS Alert on a kTLS socket
+ * @sock: open kTLS socket to send on
+ * @level: TLS Alert level
+ * @description: TLS Alert description
+ *
+ * Returns zero on success or a negative errno.
+ */
+int tls_alert_send(struct socket *sock, u8 level, u8 description)
+{
+ u8 record_type = TLS_RECORD_TYPE_ALERT;
+ u8 buf[CMSG_SPACE(sizeof(record_type))]; /* holds the single control message built below */
+ struct msghdr msg = { 0 };
+ struct cmsghdr *cmsg;
+ struct kvec iov;
+ u8 alert[2]; /* payload: level byte followed by description byte */
+ int ret;
+
+ trace_tls_alert_send(sock->sk, level, description);
+
+ alert[0] = level;
+ alert[1] = description;
+ iov.iov_base = alert;
+ iov.iov_len = sizeof(alert);
+
+ memset(buf, 0, sizeof(buf));
+ msg.msg_control = buf;
+ msg.msg_controllen = sizeof(buf);
+ msg.msg_flags = MSG_DONTWAIT;
+
+ cmsg = CMSG_FIRSTHDR(&msg); /* points into buf, set as msg_control above */
+ cmsg->cmsg_level = SOL_TLS;
+ cmsg->cmsg_type = TLS_SET_RECORD_TYPE;
+ cmsg->cmsg_len = CMSG_LEN(sizeof(record_type));
+ memcpy(CMSG_DATA(cmsg), &record_type, sizeof(record_type));
+
+ iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, iov.iov_len);
+ ret = sock_sendmsg(sock, &msg);
+ return ret < 0 ? ret : 0; /* any non-negative send result is reported as 0 */
+}
+
+/**
+ * tls_get_record_type - Look for TLS RECORD_TYPE information
+ * @sk: socket (for IP address information)
+ * @cmsg: incoming message to be parsed
+ *
+ * Returns the record type byte of a SOL_TLS/TLS_GET_RECORD_TYPE cmsg, else zero.
+ */
+u8 tls_get_record_type(const struct sock *sk, const struct cmsghdr *cmsg)
+{
+ u8 record_type;
+
+ if (cmsg->cmsg_level != SOL_TLS)
+ return 0;
+ if (cmsg->cmsg_type != TLS_GET_RECORD_TYPE)
+ return 0;
+
+ record_type = *((u8 *)CMSG_DATA(cmsg)); /* first data byte of the control message */
+ trace_tls_contenttype(sk, record_type);
+ return record_type;
+}
+EXPORT_SYMBOL(tls_get_record_type);
+
+/**
+ * tls_alert_recv - Parse TLS Alert messages
+ * @sk: socket (for IP address information)
+ * @msg: incoming message to be parsed
+ * @level: OUT - TLS AlertLevel value
+ * @description: OUT - TLS AlertDescription value
+ * Copies the first two bytes of @msg's first kvec segment to @level/@description.
+ */
+void tls_alert_recv(const struct sock *sk, const struct msghdr *msg,
+ u8 *level, u8 *description)
+{
+ const struct kvec *iov;
+ u8 *data;
+
+ iov = msg->msg_iter.kvec; /* NOTE(review): assumes a kvec-backed iterator - confirm callers */
+ data = iov->iov_base; /* NOTE(review): no length check; assumes >= 2 bytes - confirm */
+ *level = data[0];
+ *description = data[1];
+
+ trace_tls_alert_recv(sk, *level, *description);
+}
+EXPORT_SYMBOL(tls_alert_recv);
diff --git a/net/handshake/genl.c b/net/handshake/genl.c
new file mode 100644
index 000000000000..870612609491
--- /dev/null
+++ b/net/handshake/genl.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/handshake.yaml */
+/* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "genl.h"
+
+#include <uapi/linux/handshake.h>
+
+/* HANDSHAKE_CMD_ACCEPT - do */
+static const struct nla_policy handshake_accept_nl_policy[HANDSHAKE_A_ACCEPT_HANDLER_CLASS + 1] = {
+ [HANDSHAKE_A_ACCEPT_HANDLER_CLASS] = NLA_POLICY_MAX(NLA_U32, 2),
+};
+
+/* HANDSHAKE_CMD_DONE - do */
+static const struct nla_policy handshake_done_nl_policy[HANDSHAKE_A_DONE_REMOTE_AUTH + 1] = {
+ [HANDSHAKE_A_DONE_STATUS] = { .type = NLA_U32, },
+ [HANDSHAKE_A_DONE_SOCKFD] = { .type = NLA_S32, },
+ [HANDSHAKE_A_DONE_REMOTE_AUTH] = { .type = NLA_U32, },
+};
+
+/* Ops table for handshake */
+static const struct genl_split_ops handshake_nl_ops[] = {
+ {
+ .cmd = HANDSHAKE_CMD_ACCEPT,
+ .doit = handshake_nl_accept_doit,
+ .policy = handshake_accept_nl_policy,
+ .maxattr = HANDSHAKE_A_ACCEPT_HANDLER_CLASS,
+ .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = HANDSHAKE_CMD_DONE,
+ .doit = handshake_nl_done_doit,
+ .policy = handshake_done_nl_policy,
+ .maxattr = HANDSHAKE_A_DONE_REMOTE_AUTH,
+ .flags = GENL_CMD_CAP_DO,
+ },
+};
+
+static const struct genl_multicast_group handshake_nl_mcgrps[] = {
+ [HANDSHAKE_NLGRP_NONE] = { "none", },
+ [HANDSHAKE_NLGRP_TLSHD] = { "tlshd", },
+};
+
+struct genl_family handshake_nl_family __ro_after_init = {
+ .name = HANDSHAKE_FAMILY_NAME,
+ .version = HANDSHAKE_FAMILY_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .split_ops = handshake_nl_ops,
+ .n_split_ops = ARRAY_SIZE(handshake_nl_ops),
+ .mcgrps = handshake_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(handshake_nl_mcgrps),
+};
diff --git a/net/handshake/genl.h b/net/handshake/genl.h
new file mode 100644
index 000000000000..8d3e18672daf
--- /dev/null
+++ b/net/handshake/genl.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/handshake.yaml */
+/* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#ifndef _LINUX_HANDSHAKE_GEN_H
+#define _LINUX_HANDSHAKE_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/handshake.h>
+
+int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info);
+int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info);
+
+enum {
+ HANDSHAKE_NLGRP_NONE,
+ HANDSHAKE_NLGRP_TLSHD,
+};
+
+extern struct genl_family handshake_nl_family;
+
+#endif /* _LINUX_HANDSHAKE_GEN_H */
diff --git a/net/handshake/handshake-test.c b/net/handshake/handshake-test.c
new file mode 100644
index 000000000000..55442b2f518a
--- /dev/null
+++ b/net/handshake/handshake-test.c
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023 Oracle and/or its affiliates.
+ *
+ * KUnit test of the handshake upcall mechanism.
+ */
+
+#include <kunit/test.h>
+#include <kunit/visibility.h>
+
+#include <linux/kernel.h>
+
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
+
+static int test_accept_func(struct handshake_req *req, struct genl_info *info,
+ int fd)
+{
+ return 0;
+}
+
+static void test_done_func(struct handshake_req *req, unsigned int status,
+ struct genl_info *info)
+{
+}
+
+struct handshake_req_alloc_test_param {
+ const char *desc;
+ struct handshake_proto *proto;
+ gfp_t gfp;
+ bool expect_success;
+};
+
+static struct handshake_proto handshake_req_alloc_proto_2 = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_NONE,
+};
+
+static struct handshake_proto handshake_req_alloc_proto_3 = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_MAX,
+};
+
+static struct handshake_proto handshake_req_alloc_proto_4 = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+};
+
+static struct handshake_proto handshake_req_alloc_proto_5 = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+ .hp_accept = test_accept_func,
+};
+
+static struct handshake_proto handshake_req_alloc_proto_6 = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+ .hp_privsize = UINT_MAX,
+ .hp_accept = test_accept_func,
+ .hp_done = test_done_func,
+};
+
+static struct handshake_proto handshake_req_alloc_proto_good = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+ .hp_accept = test_accept_func,
+ .hp_done = test_done_func,
+};
+
+static const
+struct handshake_req_alloc_test_param handshake_req_alloc_params[] = {
+ {
+ .desc = "handshake_req_alloc NULL proto",
+ .proto = NULL,
+ .gfp = GFP_KERNEL,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc CLASS_NONE",
+ .proto = &handshake_req_alloc_proto_2,
+ .gfp = GFP_KERNEL,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc CLASS_MAX",
+ .proto = &handshake_req_alloc_proto_3,
+ .gfp = GFP_KERNEL,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc no callbacks",
+ .proto = &handshake_req_alloc_proto_4,
+ .gfp = GFP_KERNEL,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc no done callback",
+ .proto = &handshake_req_alloc_proto_5,
+ .gfp = GFP_KERNEL,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc excessive privsize",
+ .proto = &handshake_req_alloc_proto_6,
+ .gfp = GFP_KERNEL | __GFP_NOWARN,
+ .expect_success = false,
+ },
+ {
+ .desc = "handshake_req_alloc all good",
+ .proto = &handshake_req_alloc_proto_good,
+ .gfp = GFP_KERNEL,
+ .expect_success = true,
+ },
+};
+
+static void
+handshake_req_alloc_get_desc(const struct handshake_req_alloc_test_param *param,
+ char *desc)
+{
+ strscpy(desc, param->desc, KUNIT_PARAM_DESC_SIZE);
+}
+
+/* Creates the function handshake_req_alloc_gen_params */
+KUNIT_ARRAY_PARAM(handshake_req_alloc, handshake_req_alloc_params,
+ handshake_req_alloc_get_desc);
+
+static void handshake_req_alloc_case(struct kunit *test)
+{
+ const struct handshake_req_alloc_test_param *param = test->param_value;
+ struct handshake_req *result;
+
+ /* Arrange */
+
+ /* Act */
+ result = handshake_req_alloc(param->proto, param->gfp);
+
+ /* Assert */
+ if (param->expect_success)
+ KUNIT_EXPECT_NOT_NULL(test, result);
+ else
+ KUNIT_EXPECT_NULL(test, result);
+
+ kfree(result);
+}
+
+static void handshake_req_submit_test1(struct kunit *test)
+{
+ struct socket *sock;
+ int err, result;
+
+ /* Arrange */
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ /* Act */
+ result = handshake_req_submit(sock, NULL, GFP_KERNEL);
+
+ /* Assert */
+ KUNIT_EXPECT_EQ(test, result, -EINVAL);
+
+ sock_release(sock);
+}
+
+static void handshake_req_submit_test2(struct kunit *test)
+{
+ struct handshake_req *req;
+ int result;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ /* Act */
+ result = handshake_req_submit(NULL, req, GFP_KERNEL);
+
+ /* Assert */
+ KUNIT_EXPECT_EQ(test, result, -EINVAL);
+
+ /* handshake_req_submit() destroys @req on error */
+}
+
+static void handshake_req_submit_test3(struct kunit *test)
+{
+ struct handshake_req *req;
+ struct socket *sock;
+ int err, result;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+ sock->file = NULL;
+
+ /* Act */
+ result = handshake_req_submit(sock, req, GFP_KERNEL);
+
+ /* Assert */
+ KUNIT_EXPECT_EQ(test, result, -EINVAL);
+
+ /* handshake_req_submit() destroys @req on error */
+ sock_release(sock);
+}
+
+static void handshake_req_submit_test4(struct kunit *test)
+{
+ struct handshake_req *req, *result;
+ struct socket *sock;
+ struct file *filp;
+ int err;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+ filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
+ KUNIT_ASSERT_NOT_NULL(test, sock->sk);
+ sock->file = filp;
+
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ /* Act */
+ result = handshake_req_hash_lookup(sock->sk);
+
+ /* Assert */
+ KUNIT_EXPECT_NOT_NULL(test, result);
+ KUNIT_EXPECT_PTR_EQ(test, req, result);
+
+ handshake_req_cancel(sock->sk);
+ fput(filp);
+}
+
+static void handshake_req_submit_test5(struct kunit *test)
+{
+ struct handshake_req *req;
+ struct handshake_net *hn;
+ struct socket *sock;
+ struct file *filp;
+ struct net *net;
+ int saved, err;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+ filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
+ KUNIT_ASSERT_NOT_NULL(test, sock->sk);
+ sock->file = filp;
+
+ net = sock_net(sock->sk);
+ hn = handshake_pernet(net);
+ KUNIT_ASSERT_NOT_NULL(test, hn);
+
+ saved = hn->hn_pending;
+ hn->hn_pending = hn->hn_pending_max + 1;
+
+ /* Act */
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+
+ /* Assert */
+ KUNIT_EXPECT_EQ(test, err, -EAGAIN);
+
+ fput(filp);
+ hn->hn_pending = saved;
+}
+
+static void handshake_req_submit_test6(struct kunit *test)
+{
+ struct handshake_req *req1, *req2;
+ struct socket *sock;
+ struct file *filp;
+ int err;
+
+ /* Arrange */
+ req1 = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req1);
+ req2 = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req2);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+ filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
+ KUNIT_ASSERT_NOT_NULL(test, sock->sk);
+ sock->file = filp;
+
+ /* Act */
+ err = handshake_req_submit(sock, req1, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+ err = handshake_req_submit(sock, req2, GFP_KERNEL);
+
+ /* Assert */
+ KUNIT_EXPECT_EQ(test, err, -EBUSY);
+
+ handshake_req_cancel(sock->sk);
+ fput(filp);
+}
+
+static void handshake_req_cancel_test1(struct kunit *test)
+{
+ struct handshake_req *req;
+ struct socket *sock;
+ struct file *filp;
+ bool result;
+ int err;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
+ sock->file = filp;
+
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ /* NB: handshake_req hasn't been accepted */
+
+ /* Act */
+ result = handshake_req_cancel(sock->sk);
+
+ /* Assert */
+ KUNIT_EXPECT_TRUE(test, result);
+
+ fput(filp);
+}
+
+static void handshake_req_cancel_test2(struct kunit *test)
+{
+ struct handshake_req *req, *next;
+ struct handshake_net *hn;
+ struct socket *sock;
+ struct file *filp;
+ struct net *net;
+ bool result;
+ int err;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
+ sock->file = filp;
+
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ net = sock_net(sock->sk);
+ hn = handshake_pernet(net);
+ KUNIT_ASSERT_NOT_NULL(test, hn);
+
+ /* Pretend to accept this request */
+ next = handshake_req_next(hn, HANDSHAKE_HANDLER_CLASS_TLSHD);
+ KUNIT_ASSERT_PTR_EQ(test, req, next);
+
+ /* Act */
+ result = handshake_req_cancel(sock->sk);
+
+ /* Assert */
+ KUNIT_EXPECT_TRUE(test, result);
+
+ fput(filp);
+}
+
+static void handshake_req_cancel_test3(struct kunit *test)
+{
+ struct handshake_req *req, *next;
+ struct handshake_net *hn;
+ struct socket *sock;
+ struct file *filp;
+ struct net *net;
+ bool result;
+ int err;
+
+ /* Arrange */
+ req = handshake_req_alloc(&handshake_req_alloc_proto_good, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
+ sock->file = filp;
+
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ net = sock_net(sock->sk);
+ hn = handshake_pernet(net);
+ KUNIT_ASSERT_NOT_NULL(test, hn);
+
+ /* Pretend to accept this request */
+ next = handshake_req_next(hn, HANDSHAKE_HANDLER_CLASS_TLSHD);
+ KUNIT_ASSERT_PTR_EQ(test, req, next);
+
+ /* Pretend to complete this request */
+ handshake_complete(next, -ETIMEDOUT, NULL);
+
+ /* Act */
+ result = handshake_req_cancel(sock->sk);
+
+ /* Assert */
+ KUNIT_EXPECT_FALSE(test, result);
+
+ fput(filp);
+}
+
+static struct handshake_req *handshake_req_destroy_test;
+
+static void test_destroy_func(struct handshake_req *req)
+{
+ handshake_req_destroy_test = req;
+}
+
+static struct handshake_proto handshake_req_alloc_proto_destroy = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+ .hp_accept = test_accept_func,
+ .hp_done = test_done_func,
+ .hp_destroy = test_destroy_func,
+};
+
+static void handshake_req_destroy_test1(struct kunit *test)
+{
+ struct handshake_req *req;
+ struct socket *sock;
+ struct file *filp;
+ int err;
+
+ /* Arrange */
+ handshake_req_destroy_test = NULL;
+
+ req = handshake_req_alloc(&handshake_req_alloc_proto_destroy, GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, req);
+
+ err = __sock_create(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP,
+ &sock, 1);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ filp = sock_alloc_file(sock, O_NONBLOCK, NULL);
+ KUNIT_ASSERT_NOT_ERR_OR_NULL(test, filp);
+ sock->file = filp;
+
+ err = handshake_req_submit(sock, req, GFP_KERNEL);
+ KUNIT_ASSERT_EQ(test, err, 0);
+
+ handshake_req_cancel(sock->sk);
+
+ /* Act */
+ /* Ensure the close/release/put process has run to
+ * completion before checking the result.
+ */
+ __fput_sync(filp);
+
+ /* Assert */
+ KUNIT_EXPECT_PTR_EQ(test, handshake_req_destroy_test, req);
+}
+
+static struct kunit_case handshake_api_test_cases[] = {
+ {
+ .name = "req_alloc API fuzzing",
+ .run_case = handshake_req_alloc_case,
+ .generate_params = handshake_req_alloc_gen_params,
+ },
+ {
+ .name = "req_submit NULL req arg",
+ .run_case = handshake_req_submit_test1,
+ },
+ {
+ .name = "req_submit NULL sock arg",
+ .run_case = handshake_req_submit_test2,
+ },
+ {
+ .name = "req_submit NULL sock->file",
+ .run_case = handshake_req_submit_test3,
+ },
+ {
+ .name = "req_lookup works",
+ .run_case = handshake_req_submit_test4,
+ },
+ {
+ .name = "req_submit max pending",
+ .run_case = handshake_req_submit_test5,
+ },
+ {
+ .name = "req_submit multiple",
+ .run_case = handshake_req_submit_test6,
+ },
+ {
+ .name = "req_cancel before accept",
+ .run_case = handshake_req_cancel_test1,
+ },
+ {
+ .name = "req_cancel after accept",
+ .run_case = handshake_req_cancel_test2,
+ },
+ {
+ .name = "req_cancel after done",
+ .run_case = handshake_req_cancel_test3,
+ },
+ {
+ .name = "req_destroy works",
+ .run_case = handshake_req_destroy_test1,
+ },
+ {}
+};
+
+static struct kunit_suite handshake_api_suite = {
+ .name = "Handshake API tests",
+ .test_cases = handshake_api_test_cases,
+};
+
+kunit_test_suites(&handshake_api_suite);
+
+MODULE_DESCRIPTION("Test handshake upcall API functions");
+MODULE_LICENSE("GPL");
diff --git a/net/handshake/handshake.h b/net/handshake/handshake.h
new file mode 100644
index 000000000000..a48163765a7a
--- /dev/null
+++ b/net/handshake/handshake.h
@@ -0,0 +1,93 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Generic netlink handshake service
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#ifndef _INTERNAL_HANDSHAKE_H
+#define _INTERNAL_HANDSHAKE_H
+
+/* Per-net namespace context */
+struct handshake_net {
+ spinlock_t hn_lock; /* protects next 3 fields */
+ int hn_pending;
+ int hn_pending_max;
+ struct list_head hn_requests;
+
+ unsigned long hn_flags;
+};
+
+enum hn_flags_bits {
+ HANDSHAKE_F_NET_DRAINING,
+};
+
+struct handshake_proto;
+
+/* One handshake request */
+struct handshake_req {
+ struct list_head hr_list;
+ struct rhash_head hr_rhash;
+ unsigned long hr_flags;
+ const struct handshake_proto *hr_proto;
+ struct sock *hr_sk;
+ void (*hr_odestruct)(struct sock *sk);
+
+ /* Always the last field */
+ char hr_priv[];
+};
+
+enum hr_flags_bits {
+ HANDSHAKE_F_REQ_COMPLETED,
+ HANDSHAKE_F_REQ_SESSION,
+};
+
+struct genl_info;
+
+/* Invariants for all handshake requests for one transport layer
+ * security protocol
+ */
+struct handshake_proto {
+ int hp_handler_class;
+ size_t hp_privsize;
+ unsigned long hp_flags;
+
+ int (*hp_accept)(struct handshake_req *req,
+ struct genl_info *info, int fd);
+ void (*hp_done)(struct handshake_req *req,
+ unsigned int status,
+ struct genl_info *info);
+ void (*hp_destroy)(struct handshake_req *req);
+};
+
+enum hp_flags_bits {
+ HANDSHAKE_F_PROTO_NOTIFY,
+};
+
+/* alert.c */
+int tls_alert_send(struct socket *sock, u8 level, u8 description);
+
+/* netlink.c */
+int handshake_genl_notify(struct net *net, const struct handshake_proto *proto,
+ gfp_t flags);
+struct nlmsghdr *handshake_genl_put(struct sk_buff *msg,
+ struct genl_info *info);
+struct handshake_net *handshake_pernet(struct net *net);
+
+/* request.c */
+struct handshake_req *handshake_req_alloc(const struct handshake_proto *proto,
+ gfp_t flags);
+int handshake_req_hash_init(void);
+void handshake_req_hash_destroy(void);
+void *handshake_req_private(struct handshake_req *req);
+struct handshake_req *handshake_req_hash_lookup(struct sock *sk);
+struct handshake_req *handshake_req_next(struct handshake_net *hn, int class);
+int handshake_req_submit(struct socket *sock, struct handshake_req *req,
+ gfp_t flags);
+void handshake_complete(struct handshake_req *req, unsigned int status,
+ struct genl_info *info);
+bool handshake_req_cancel(struct sock *sk);
+
+#endif /* _INTERNAL_HANDSHAKE_H */
diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c
new file mode 100644
index 000000000000..1d33a4675a48
--- /dev/null
+++ b/net/handshake/netlink.c
@@ -0,0 +1,289 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Generic netlink handshake service
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/mm.h>
+
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+
+#include <kunit/visibility.h>
+
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+#include "genl.h"
+
+#include <trace/events/handshake.h>
+
+/**
+ * handshake_genl_notify - Notify handlers that a request is waiting
+ * @net: target network namespace
+ * @proto: handshake protocol
+ * @flags: memory allocation control flags
+ *
+ * Returns zero on success or a negative errno if notification failed.
+ */
+int handshake_genl_notify(struct net *net, const struct handshake_proto *proto,
+ gfp_t flags)
+{
+ struct sk_buff *msg;
+ void *hdr;
+
+ /* Disable notifications during unit testing */
+ if (!test_bit(HANDSHAKE_F_PROTO_NOTIFY, &proto->hp_flags))
+ return 0;
+
+ if (!genl_has_listeners(&handshake_nl_family, net,
+ proto->hp_handler_class))
+ return -ESRCH; /* no listener on the group indexed by the handler class */
+
+ msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, flags);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &handshake_nl_family, 0,
+ HANDSHAKE_CMD_READY);
+ if (!hdr)
+ goto out_free;
+
+ if (nla_put_u32(msg, HANDSHAKE_A_ACCEPT_HANDLER_CLASS,
+ proto->hp_handler_class) < 0) {
+ genlmsg_cancel(msg, hdr);
+ goto out_free;
+ }
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_multicast_netns(&handshake_nl_family, net, msg,
+ 0, proto->hp_handler_class, flags);
+
+out_free:
+ nlmsg_free(msg);
+ return -EMSGSIZE; /* reached when the header or the class attribute could not be added */
+}
+
+/**
+ * handshake_genl_put - Create a generic netlink message header
+ * @msg: buffer in which to create the header
+ * @info: generic netlink message context
+ *
+ * Returns a ready-to-use header, or NULL.
+ */
+struct nlmsghdr *handshake_genl_put(struct sk_buff *msg,
+ struct genl_info *info)
+{
+ return genlmsg_put(msg, info->snd_portid, info->snd_seq,
+ &handshake_nl_family, 0, info->genlhdr->cmd);
+}
+EXPORT_SYMBOL(handshake_genl_put);
+
+int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info)
+{
+	struct net *net = sock_net(skb->sk);
+	struct handshake_net *hn = handshake_pernet(net);
+	struct handshake_req *req = NULL;
+	struct socket *sock;
+	int class, err;
+
+	err = -EOPNOTSUPP;
+	if (!hn)
+		goto out_status;
+	/* The handler class to serve is a mandatory request attribute. */
+	err = -EINVAL;
+	if (GENL_REQ_ATTR_CHECK(info, HANDSHAKE_A_ACCEPT_HANDLER_CLASS))
+		goto out_status;
+	class = nla_get_u32(info->attrs[HANDSHAKE_A_ACCEPT_HANDLER_CLASS]);
+	/* -EAGAIN is reported when handshake_req_next() hands back no request. */
+	err = -EAGAIN;
+	req = handshake_req_next(hn, class);
+	if (req) {
+		sock = req->hr_sk->sk_socket;
+
+		FD_PREPARE(fdf, O_CLOEXEC, sock->file);
+		if (fdf.err) {
+			err = fdf.err;
+			goto out_complete;
+		}
+
+		get_file(sock->file); /* FD_PREPARE() consumes a reference. */
+		err = req->hr_proto->hp_accept(req, info, fd_prepare_fd(fdf));
+		if (err)
+			goto out_complete; /* Automatic cleanup handles fput */
+
+		trace_handshake_cmd_accept(net, req, req->hr_sk, fd_prepare_fd(fdf));
+		fd_publish(fdf);
+		return 0;
+	}
+	goto out_status; /* nothing was dequeued: never hand a NULL req to handshake_complete() */
+out_complete:
+	handshake_complete(req, -EIO, NULL);
+out_status:
+	trace_handshake_cmd_accept_err(net, req, NULL, err);
+	return err;
+}
+
+int handshake_nl_done_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = sock_net(skb->sk);
+ struct handshake_req *req;
+ struct socket *sock;
+ int fd, status, err;
+
+ if (GENL_REQ_ATTR_CHECK(info, HANDSHAKE_A_DONE_SOCKFD))
+ return -EINVAL;
+ fd = nla_get_s32(info->attrs[HANDSHAKE_A_DONE_SOCKFD]);
+
+ sock = sockfd_lookup(fd, &err); /* released via sockfd_put() on every path below */
+ if (!sock)
+ return err;
+
+ req = handshake_req_hash_lookup(sock->sk);
+ if (!req) {
+ err = -EBUSY; /* no handshake request is hashed for this socket */
+ trace_handshake_cmd_done_err(net, req, sock->sk, err);
+ sockfd_put(sock);
+ return err;
+ }
+
+ trace_handshake_cmd_done(net, req, sock->sk, fd);
+
+ status = -EIO; /* used when the DONE_STATUS attribute is absent */
+ if (info->attrs[HANDSHAKE_A_DONE_STATUS])
+ status = nla_get_u32(info->attrs[HANDSHAKE_A_DONE_STATUS]);
+
+ handshake_complete(req, status, info);
+ sockfd_put(sock);
+ return 0;
+}
+
+static unsigned int handshake_net_id;
+
+static int __net_init handshake_net_init(struct net *net)
+{
+ struct handshake_net *hn = net_generic(net, handshake_net_id);
+ unsigned long tmp;
+ struct sysinfo si;
+
+ /*
+ * Arbitrary limit to prevent handshakes that do not make
+ * progress from clogging up the system. The cap is derived
+ * from si.totalram and clamped to the range [3, 50].
+ */
+ si_meminfo(&si);
+ tmp = si.totalram / (25 * si.mem_unit);
+ hn->hn_pending_max = clamp(tmp, 3UL, 50UL);
+
+ spin_lock_init(&hn->hn_lock);
+ hn->hn_pending = 0; /* per-net state starts with nothing pending and no flags set */
+ hn->hn_flags = 0;
+ INIT_LIST_HEAD(&hn->hn_requests);
+ return 0;
+}
+
+static void __net_exit handshake_net_exit(struct net *net)
+{
+	struct handshake_net *hn = net_generic(net, handshake_net_id);
+	struct handshake_req *req;
+	LIST_HEAD(requests);
+
+	/*
+	 * Drain the net's pending list: detach it onto the local list
+	 * under hn_lock, then complete each request without the lock.
+	 * Accepted, in-progress requests are destroyed at socket close.
+	 */
+	spin_lock(&hn->hn_lock);
+	set_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags);
+	list_splice_init(&hn->hn_requests, &requests); /* source list first, destination second */
+	spin_unlock(&hn->hn_lock);
+
+	while (!list_empty(&requests)) {
+		req = list_first_entry(&requests, struct handshake_req, hr_list);
+		list_del(&req->hr_list);
+
+		/*
+		 * Requests on this list have not yet been
+		 * accepted, so they do not have an fd to put.
+		 */
+
+		handshake_complete(req, -ETIMEDOUT, NULL);
+	}
+}
+
+static struct pernet_operations handshake_genl_net_ops = {
+ .init = handshake_net_init,
+ .exit = handshake_net_exit,
+ .id = &handshake_net_id,
+ .size = sizeof(struct handshake_net),
+};
+
+/**
+ * handshake_pernet - Get the handshake private per-net structure
+ * @net: network namespace
+ *
+ * Returns a pointer to the net's private per-net structure for the
+ * handshake module, or NULL if handshake_init() failed.
+ */
+struct handshake_net *handshake_pernet(struct net *net)
+{
+ return handshake_net_id ?
+ net_generic(net, handshake_net_id) : NULL;
+}
+EXPORT_SYMBOL_IF_KUNIT(handshake_pernet);
+
+static int __init handshake_init(void)
+{
+ int ret;
+
+ ret = handshake_req_hash_init();
+ if (ret) {
+ pr_warn("handshake: hash initialization failed (%d)\n", ret);
+ return ret;
+ }
+
+ ret = genl_register_family(&handshake_nl_family);
+ if (ret) {
+ pr_warn("handshake: netlink registration failed (%d)\n", ret);
+ handshake_req_hash_destroy();
+ return ret;
+ }
+
+ /*
+ * ORDER: register_pernet_subsys must be done last.
+ *
+ * If initialization does not make it past pernet_subsys
+ * registration, then handshake_net_id will remain 0. That
+ * shunts the handshake consumer API to return ENOTSUPP
+ * to prevent it from dereferencing something that hasn't
+ * been allocated.
+ */
+ ret = register_pernet_subsys(&handshake_genl_net_ops);
+ if (ret) {
+ pr_warn("handshake: pernet registration failed (%d)\n", ret);
+ genl_unregister_family(&handshake_nl_family);
+ handshake_req_hash_destroy();
+ }
+
+ return ret;
+}
+
+static void __exit handshake_exit(void)
+{
+ unregister_pernet_subsys(&handshake_genl_net_ops);
+ handshake_net_id = 0;
+
+ handshake_req_hash_destroy();
+ genl_unregister_family(&handshake_nl_family);
+}
+
+module_init(handshake_init);
+module_exit(handshake_exit);
diff --git a/net/handshake/request.c b/net/handshake/request.c
new file mode 100644
index 000000000000..274d2c89b6b2
--- /dev/null
+++ b/net/handshake/request.c
@@ -0,0 +1,343 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Handshake request lifetime events
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/inet.h>
+#include <linux/rhashtable.h>
+
+#include <net/sock.h>
+#include <net/genetlink.h>
+#include <net/netns/generic.h>
+
+#include <kunit/visibility.h>
+
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+
+#include <trace/events/handshake.h>
+
+/*
+ * We need both a handshake_req -> sock mapping, and a sock ->
+ * handshake_req mapping. Both are one-to-one.
+ *
+ * To avoid adding another pointer field to struct sock, net/handshake
+ * maintains a hash table, indexed by the memory address of @sock, to
+ * find the struct handshake_req outstanding for that socket. The
+ * reverse direction uses a simple pointer field in the handshake_req
+ * struct.
+ */
+
+/* Global table used to find the handshake_req outstanding for a socket */
+static struct rhashtable handshake_rhashtbl ____cacheline_aligned_in_smp;
+
+/*
+ * Entries are keyed by the value of the hr_sk pointer field and are
+ * linked into the table through hr_rhash.
+ */
+static const struct rhashtable_params handshake_rhash_params = {
+ .key_len = sizeof_field(struct handshake_req, hr_sk),
+ .key_offset = offsetof(struct handshake_req, hr_sk),
+ .head_offset = offsetof(struct handshake_req, hr_rhash),
+ .automatic_shrinking = true,
+};
+
+/*
+ * Initialize the global handshake request hash table. Returns the
+ * result of rhashtable_init().
+ */
+int handshake_req_hash_init(void)
+{
+ return rhashtable_init(&handshake_rhashtbl, &handshake_rhash_params);
+}
+
+/* Release the global handshake request hash table. */
+void handshake_req_hash_destroy(void)
+{
+ rhashtable_destroy(&handshake_rhashtbl);
+}
+
+/*
+ * Find the handshake_req hashed under @sk. The key is the pointer
+ * value itself, hence &sk. Callers in this file treat a NULL result
+ * as "no request outstanding for this socket".
+ */
+struct handshake_req *handshake_req_hash_lookup(struct sock *sk)
+{
+ return rhashtable_lookup_fast(&handshake_rhashtbl, &sk,
+ handshake_rhash_params);
+}
+EXPORT_SYMBOL_IF_KUNIT(handshake_req_hash_lookup);
+
+/*
+ * Insert @req into the socket-keyed hash table. Returns %true on
+ * success, or %false if rhashtable_lookup_insert_fast() reported a
+ * non-zero status.
+ */
+static bool handshake_req_hash_add(struct handshake_req *req)
+{
+	return !rhashtable_lookup_insert_fast(&handshake_rhashtbl,
+					      &req->hr_rhash,
+					      handshake_rhash_params);
+}
+
+/*
+ * Tear down @req: run the protocol's optional hp_destroy callback,
+ * remove the request from the hash table, then free it. @req must
+ * not be touched after this returns.
+ */
+static void handshake_req_destroy(struct handshake_req *req)
+{
+ if (req->hr_proto->hp_destroy)
+ req->hr_proto->hp_destroy(req);
+ rhashtable_remove_fast(&handshake_rhashtbl, &req->hr_rhash,
+ handshake_rhash_params);
+ kfree(req);
+}
+
+/*
+ * Socket destructor installed by handshake_req_submit(). Destroys the
+ * handshake request hashed under @sk, then chains to the destructor
+ * that was saved in the request. Does nothing when no request is
+ * found for @sk.
+ */
+static void handshake_sk_destruct(struct sock *sk)
+{
+	struct handshake_req *req = handshake_req_hash_lookup(sk);
+	void (*orig_destruct)(struct sock *sk);
+
+	if (!req)
+		return;
+
+	trace_handshake_destruct(sock_net(sk), req, sk);
+
+	/* Fetch the saved destructor before @req is freed */
+	orig_destruct = req->hr_odestruct;
+	handshake_req_destroy(req);
+	if (orig_destruct)
+		orig_destruct(sk);
+}
+
+/**
+ * handshake_req_alloc - Allocate a handshake request
+ * @proto: security protocol
+ * @flags: memory allocation flags
+ *
+ * @proto must name a handler class strictly between
+ * HANDSHAKE_HANDLER_CLASS_NONE and HANDSHAKE_HANDLER_CLASS_MAX and
+ * must provide both the hp_accept and hp_done callbacks.
+ *
+ * Returns a zeroed handshake_req with hp_privsize bytes of private
+ * space, or NULL if @proto is unusable or the allocation fails.
+ */
+struct handshake_req *handshake_req_alloc(const struct handshake_proto *proto,
+					  gfp_t flags)
+{
+	struct handshake_req *req;
+
+	if (!proto ||
+	    proto->hp_handler_class <= HANDSHAKE_HANDLER_CLASS_NONE ||
+	    proto->hp_handler_class >= HANDSHAKE_HANDLER_CLASS_MAX ||
+	    !proto->hp_accept || !proto->hp_done)
+		return NULL;
+
+	req = kzalloc(struct_size(req, hr_priv, proto->hp_privsize), flags);
+	if (req) {
+		INIT_LIST_HEAD(&req->hr_list);
+		req->hr_proto = proto;
+	}
+	return req;
+}
+EXPORT_SYMBOL(handshake_req_alloc);
+
+/**
+ * handshake_req_private - Get per-handshake private data
+ * @req: handshake arguments
+ *
+ * Returns a pointer to the hr_priv area at the tail of @req.
+ */
+void *handshake_req_private(struct handshake_req *req)
+{
+ return (void *)&req->hr_priv;
+}
+EXPORT_SYMBOL(handshake_req_private);
+
+/*
+ * Append @req to @hn's list of pending requests and bump the pending
+ * count. Warns once and returns %false if @req is already on a list.
+ *
+ * NOTE(review): the "_locked" suffix suggests the caller must hold
+ * hn->hn_lock; handshake_req_submit() calls this with it held.
+ */
+static bool __add_pending_locked(struct handshake_net *hn,
+ struct handshake_req *req)
+{
+ if (WARN_ON_ONCE(!list_empty(&req->hr_list)))
+ return false;
+ hn->hn_pending++;
+ list_add_tail(&req->hr_list, &hn->hn_requests);
+ return true;
+}
+
+/*
+ * Drop the pending count and unlink @req, leaving hr_list
+ * re-initialized so a later list_empty() check reports "not queued".
+ * Callers in this file invoke it with hn->hn_lock held.
+ */
+static void __remove_pending_locked(struct handshake_net *hn,
+ struct handshake_req *req)
+{
+ hn->hn_pending--;
+ list_del_init(&req->hr_list);
+}
+
+/*
+ * Take @req off @hn's pending list, if it is on one.
+ *
+ * Returns %true if @req was still queued (so it had not yet been
+ * accepted), otherwise %false.
+ */
+static bool remove_pending(struct handshake_net *hn, struct handshake_req *req)
+{
+	bool was_queued;
+
+	spin_lock(&hn->hn_lock);
+	was_queued = !list_empty(&req->hr_list);
+	if (was_queued)
+		__remove_pending_locked(hn, req);
+	spin_unlock(&hn->hn_lock);
+
+	return was_queued;
+}
+
+/*
+ * Dequeue the oldest pending request on @hn whose protocol belongs to
+ * handler class @class. Returns the request, or NULL if none matches.
+ */
+struct handshake_req *handshake_req_next(struct handshake_net *hn, int class)
+{
+	struct handshake_req *found = NULL;
+	struct handshake_req *cur;
+
+	spin_lock(&hn->hn_lock);
+	list_for_each_entry(cur, &hn->hn_requests, hr_list) {
+		if (cur->hr_proto->hp_handler_class == class) {
+			__remove_pending_locked(hn, cur);
+			found = cur;
+			break;
+		}
+	}
+	spin_unlock(&hn->hn_lock);
+
+	return found;
+}
+EXPORT_SYMBOL_IF_KUNIT(handshake_req_next);
+
+/**
+ * handshake_req_submit - Submit a handshake request
+ * @sock: open socket on which to perform the handshake
+ * @req: handshake arguments
+ * @flags: memory allocation flags
+ *
+ * Return values:
+ *   %0: Request queued
+ *   %-EINVAL: Invalid argument
+ *   %-EBUSY: A handshake is already under way for this socket
+ *   %-ESRCH: No handshake agent is available
+ *   %-EAGAIN: Too many pending handshake requests
+ *   %-ENOMEM: Failed to allocate memory
+ *   %-EMSGSIZE: Failed to construct notification message
+ *   %-EOPNOTSUPP: Handshake module not initialized
+ *
+ * A zero return value from handshake_req_submit() means that
+ * exactly one subsequent completion callback is guaranteed.
+ *
+ * A negative return value from handshake_req_submit() means that
+ * no completion callback will be done, that @req has been
+ * destroyed, and that the socket's original sk_destruct callback
+ * is in place.
+ */
+int handshake_req_submit(struct socket *sock, struct handshake_req *req,
+			 gfp_t flags)
+{
+	struct handshake_net *hn;
+	struct net *net;
+	int ret;
+
+	if (!sock || !req || !sock->file) {
+		kfree(req);
+		return -EINVAL;
+	}
+
+	req->hr_sk = sock->sk;
+	if (!req->hr_sk) {
+		kfree(req);
+		return -EINVAL;
+	}
+
+	/* Interpose on socket teardown so @req is released with the socket */
+	req->hr_odestruct = req->hr_sk->sk_destruct;
+	req->hr_sk->sk_destruct = handshake_sk_destruct;
+
+	ret = -EOPNOTSUPP;
+	net = sock_net(req->hr_sk);
+	hn = handshake_pernet(net);
+	if (!hn)
+		goto out_err;
+
+	/* Unlocked early check against the per-net pending limit */
+	ret = -EAGAIN;
+	if (READ_ONCE(hn->hn_pending) >= hn->hn_pending_max)
+		goto out_err;
+
+	spin_lock(&hn->hn_lock);
+	ret = -EOPNOTSUPP;
+	if (test_bit(HANDSHAKE_F_NET_DRAINING, &hn->hn_flags))
+		goto out_unlock;
+	ret = -EBUSY;
+	if (!handshake_req_hash_add(req))
+		goto out_unlock;
+	if (!__add_pending_locked(hn, req))
+		goto out_unlock;
+	spin_unlock(&hn->hn_lock);
+
+	ret = handshake_genl_notify(net, req->hr_proto, flags);
+	if (ret) {
+		trace_handshake_notify_err(net, req, req->hr_sk, ret);
+		/*
+		 * Fail the submission only if @req was still pending.
+		 * If it is no longer on the list, it has already been
+		 * picked up, so carry on as for a successful submit.
+		 */
+		if (remove_pending(hn, req))
+			goto out_err;
+	}
+
+	/* Prevent socket release while a handshake request is pending */
+	sock_hold(req->hr_sk);
+
+	trace_handshake_submit(net, req, req->hr_sk);
+	return 0;
+
+out_unlock:
+	spin_unlock(&hn->hn_lock);
+out_err:
+	/*
+	 * @req is about to be unhashed and freed, after which
+	 * handshake_sk_destruct() can no longer find it and would
+	 * return without chaining to the saved destructor. Put the
+	 * original destructor back so socket teardown still runs.
+	 */
+	req->hr_sk->sk_destruct = req->hr_odestruct;
+	trace_handshake_submit_err(net, req, req->hr_sk, ret);
+	handshake_req_destroy(req);
+	return ret;
+}
+EXPORT_SYMBOL(handshake_req_submit);
+
+/*
+ * Complete @req at most once: the first caller to set
+ * HANDSHAKE_F_REQ_COMPLETED invokes the protocol's hp_done callback
+ * with @status and @info and drops a reference on the socket. Later
+ * callers do nothing.
+ */
+void handshake_complete(struct handshake_req *req, unsigned int status,
+			struct genl_info *info)
+{
+	struct sock *sk = req->hr_sk;
+	struct net *net = sock_net(sk);
+
+	if (test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags))
+		return;
+
+	trace_handshake_complete(net, req, sk, status);
+	req->hr_proto->hp_done(req, status, info);
+
+	/* Handshake request is no longer pending */
+	sock_put(sk);
+}
+EXPORT_SYMBOL_IF_KUNIT(handshake_complete);
+
+/**
+ * handshake_req_cancel - Cancel an in-progress handshake
+ * @sk: socket on which there is an ongoing handshake
+ *
+ * Request cancellation races with request completion. To determine
+ * who won, callers examine the return value from this function.
+ *
+ * A cancelled request stays in the hash table until the socket is
+ * destroyed, so every successful cancellation marks the request
+ * completed. A repeated cancellation, or a late completion, then
+ * cannot drop the socket reference a second time.
+ *
+ * Return values:
+ *   %true - Uncompleted handshake request was canceled
+ *   %false - Handshake request already completed or not found
+ */
+bool handshake_req_cancel(struct sock *sk)
+{
+	struct handshake_req *req;
+	struct handshake_net *hn;
+	struct net *net;
+
+	net = sock_net(sk);
+	req = handshake_req_hash_lookup(sk);
+	if (!req) {
+		trace_handshake_cancel_none(net, req, sk);
+		return false;
+	}
+
+	hn = handshake_pernet(net);
+	if (hn && remove_pending(hn, req)) {
+		/* Request hadn't been accepted - mark it cancelled */
+		if (test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED,
+				     &req->hr_flags)) {
+			trace_handshake_cancel_busy(net, req, sk);
+			return false;
+		}
+		goto out_true;
+	}
+	if (test_and_set_bit(HANDSHAKE_F_REQ_COMPLETED, &req->hr_flags)) {
+		/* Request already completed */
+		trace_handshake_cancel_busy(net, req, sk);
+		return false;
+	}
+
+out_true:
+	trace_handshake_cancel(net, req, sk);
+
+	/* Handshake request is no longer pending */
+	sock_put(sk);
+	return true;
+}
+EXPORT_SYMBOL(handshake_req_cancel);
diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c
new file mode 100644
index 000000000000..8f9532a15f43
--- /dev/null
+++ b/net/handshake/tlshd.c
@@ -0,0 +1,455 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Establish a TLS session for a kernel socket consumer
+ * using the tlshd user space handler.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2021-2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/key.h>
+
+#include <net/sock.h>
+#include <net/handshake.h>
+#include <net/genetlink.h>
+#include <net/tls_prot.h>
+
+#include <uapi/linux/keyctl.h>
+#include <uapi/linux/handshake.h>
+#include "handshake.h"
+
+/* TLS-specific state kept in a handshake_req's private (hr_priv) area */
+struct tls_handshake_req {
+ /* Consumer's completion callback and its opaque argument */
+ void (*th_consumer_done)(void *data, int status,
+ key_serial_t peerid);
+ void *th_consumer_data;
+
+ /* Parameters passed to the handler in the ACCEPT reply */
+ int th_type;
+ unsigned int th_timeout_ms;
+ int th_auth_mode;
+ const char *th_peername;
+ key_serial_t th_keyring;
+ key_serial_t th_certificate;
+ key_serial_t th_privkey;
+
+ /* Local PSK identities on submit; remote identities after DONE */
+ unsigned int th_num_peerids;
+ key_serial_t th_peerid[5];
+};
+
+/*
+ * Fill the TLS private area of @req from the consumer's @args and
+ * reset the peer ID count, certificate, and private key to their
+ * "none" values. th_type and th_auth_mode are left for the caller.
+ * Returns the private area.
+ */
+static struct tls_handshake_req *
+tls_handshake_req_init(struct handshake_req *req,
+		       const struct tls_handshake_args *args)
+{
+	struct tls_handshake_req *treq = handshake_req_private(req);
+
+	/* Values supplied by the consumer */
+	treq->th_consumer_done = args->ta_done;
+	treq->th_consumer_data = args->ta_data;
+	treq->th_timeout_ms = args->ta_timeout_ms;
+	treq->th_peername = args->ta_peername;
+	treq->th_keyring = args->ta_keyring;
+
+	/* Defaults that individual request types may override */
+	treq->th_certificate = TLS_NO_CERT;
+	treq->th_privkey = TLS_NO_PRIVKEY;
+	treq->th_num_peerids = 0;
+
+	return treq;
+}
+
+/*
+ * Copy the HANDSHAKE_A_DONE_REMOTE_AUTH attributes found in @info
+ * into @treq->th_peerid, keeping at most ARRAY_SIZE(th_peerid) of
+ * them in message order. th_num_peerids is updated only when at
+ * least one such attribute is present.
+ */
+static void tls_handshake_remote_peerids(struct tls_handshake_req *treq,
+					 struct genl_info *info)
+{
+	struct nlattr *head = nlmsg_attrdata(info->nlhdr, GENL_HDRLEN);
+	int rem, len = nlmsg_attrlen(info->nlhdr, GENL_HDRLEN);
+	unsigned int stored = 0;
+	struct nlattr *nla;
+
+	nla_for_each_attr(nla, head, len, rem) {
+		if (nla_type(nla) != HANDSHAKE_A_DONE_REMOTE_AUTH)
+			continue;
+		treq->th_peerid[stored++] = nla_get_u32(nla);
+		if (stored >= ARRAY_SIZE(treq->th_peerid))
+			break;
+	}
+
+	if (stored)
+		treq->th_num_peerids = stored;
+}
+
+/**
+ * tls_handshake_done - callback to handle a CMD_DONE request
+ * @req: handshake request that has completed
+ * @status: session status code
+ * @info: full results of session establishment, or NULL
+ *
+ * Records any remote peer identities carried in @info, marks the
+ * request as having a session when @status is zero, and invokes the
+ * consumer's done callback with the negated @status and the first
+ * peer identity (TLS_NO_PEERID if none was reported).
+ */
+static void tls_handshake_done(struct handshake_req *req,
+ unsigned int status, struct genl_info *info)
+{
+ struct tls_handshake_req *treq = handshake_req_private(req);
+
+ /* Default, overwritten below if @info carries a remote identity */
+ treq->th_peerid[0] = TLS_NO_PEERID;
+ if (info)
+ tls_handshake_remote_peerids(treq, info);
+
+ if (!status)
+ set_bit(HANDSHAKE_F_REQ_SESSION, &req->hr_flags);
+
+ treq->th_consumer_done(treq->th_consumer_data, -status,
+ treq->th_peerid[0]);
+}
+
+#if IS_ENABLED(CONFIG_KEYS)
+/*
+ * If the request names a keyring, look up the calling process's
+ * keyring (creating it if needed) and the consumer's keyring, then
+ * join the two with key_link(). Both key references are dropped
+ * before returning.
+ *
+ * NOTE(review): presumably this makes the consumer's keyring
+ * reachable by the handler process that issued the ACCEPT - confirm.
+ *
+ * Returns zero on success or when no keyring was specified, otherwise
+ * the error from the failed lookup or from key_link().
+ */
+static int tls_handshake_private_keyring(struct tls_handshake_req *treq)
+{
+ key_ref_t process_keyring_ref, keyring_ref;
+ int ret;
+
+ if (treq->th_keyring == TLS_NO_KEYRING)
+ return 0;
+
+ process_keyring_ref = lookup_user_key(KEY_SPEC_PROCESS_KEYRING,
+ KEY_LOOKUP_CREATE,
+ KEY_NEED_WRITE);
+ if (IS_ERR(process_keyring_ref)) {
+ ret = PTR_ERR(process_keyring_ref);
+ goto out;
+ }
+
+ keyring_ref = lookup_user_key(treq->th_keyring, KEY_LOOKUP_CREATE,
+ KEY_NEED_LINK);
+ if (IS_ERR(keyring_ref)) {
+ ret = PTR_ERR(keyring_ref);
+ goto out_put_key;
+ }
+
+ ret = key_link(key_ref_to_ptr(process_keyring_ref),
+ key_ref_to_ptr(keyring_ref));
+
+ key_ref_put(keyring_ref);
+out_put_key:
+ key_ref_put(process_keyring_ref);
+out:
+ return ret;
+}
+#else
+/* Without CONFIG_KEYS there is nothing to link; always succeeds. */
+static int tls_handshake_private_keyring(struct tls_handshake_req *treq)
+{
+ return 0;
+}
+#endif
+
+/*
+ * Append one HANDSHAKE_A_ACCEPT_PEER_IDENTITY attribute to @msg for
+ * each peer ID stored in @treq. Returns zero, or -EMSGSIZE if an
+ * attribute could not be added.
+ */
+static int tls_handshake_put_peer_identity(struct sk_buff *msg,
+					   struct tls_handshake_req *treq)
+{
+	unsigned int idx = 0;
+
+	while (idx < treq->th_num_peerids) {
+		if (nla_put_u32(msg, HANDSHAKE_A_ACCEPT_PEER_IDENTITY,
+				treq->th_peerid[idx]) < 0)
+			return -EMSGSIZE;
+		idx++;
+	}
+	return 0;
+}
+
+/*
+ * Append a nested HANDSHAKE_A_ACCEPT_CERTIFICATE attribute holding
+ * the request's certificate and private key serial numbers. Nothing
+ * is added when neither is set. Returns zero, or -EMSGSIZE if @msg
+ * ran out of room (the partial nest is cancelled first).
+ */
+static int tls_handshake_put_certificate(struct sk_buff *msg,
+					 struct tls_handshake_req *treq)
+{
+	struct nlattr *nest;
+
+	if (treq->th_certificate == TLS_NO_CERT &&
+	    treq->th_privkey == TLS_NO_PRIVKEY)
+		return 0;
+
+	nest = nla_nest_start(msg, HANDSHAKE_A_ACCEPT_CERTIFICATE);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_s32(msg, HANDSHAKE_A_X509_CERT, treq->th_certificate))
+		goto out_cancel;
+	if (nla_put_s32(msg, HANDSHAKE_A_X509_PRIVKEY, treq->th_privkey))
+		goto out_cancel;
+
+	nla_nest_end(msg, nest);
+	return 0;
+
+out_cancel:
+	nla_nest_cancel(msg, nest);
+	return -EMSGSIZE;
+}
+
+/**
+ * tls_handshake_accept - callback to construct a CMD_ACCEPT response
+ * @req: handshake parameters to return
+ * @info: generic netlink message context
+ * @fd: file descriptor to be returned
+ *
+ * Links the consumer's keyring (if any), then builds and sends a
+ * reply carrying @fd, the message type, the optional peer name,
+ * timeout, and keyring, the authentication mode, and the
+ * mode-specific credentials.
+ *
+ * Returns zero on success, or a negative errno on failure.
+ */
+static int tls_handshake_accept(struct handshake_req *req,
+ struct genl_info *info, int fd)
+{
+ struct tls_handshake_req *treq = handshake_req_private(req);
+ struct nlmsghdr *hdr;
+ struct sk_buff *msg;
+ int ret;
+
+ ret = tls_handshake_private_keyring(treq);
+ if (ret < 0)
+ goto out;
+
+ /* -ENOMEM covers both the allocation and the header put below */
+ ret = -ENOMEM;
+ msg = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ goto out;
+ hdr = handshake_genl_put(msg, info);
+ if (!hdr)
+ goto out_cancel;
+
+ ret = nla_put_s32(msg, HANDSHAKE_A_ACCEPT_SOCKFD, fd);
+ if (ret < 0)
+ goto out_cancel;
+ ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_MESSAGE_TYPE, treq->th_type);
+ if (ret < 0)
+ goto out_cancel;
+ /* Optional attributes are emitted only when the consumer set them */
+ if (treq->th_peername) {
+ ret = nla_put_string(msg, HANDSHAKE_A_ACCEPT_PEERNAME,
+ treq->th_peername);
+ if (ret < 0)
+ goto out_cancel;
+ }
+ if (treq->th_timeout_ms) {
+ ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_TIMEOUT, treq->th_timeout_ms);
+ if (ret < 0)
+ goto out_cancel;
+ }
+ if (treq->th_keyring) {
+ ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_KEYRING,
+ treq->th_keyring);
+ if (ret < 0)
+ goto out_cancel;
+ }
+
+ ret = nla_put_u32(msg, HANDSHAKE_A_ACCEPT_AUTH_MODE,
+ treq->th_auth_mode);
+ if (ret < 0)
+ goto out_cancel;
+ /* Other auth modes carry no extra credentials */
+ switch (treq->th_auth_mode) {
+ case HANDSHAKE_AUTH_PSK:
+ ret = tls_handshake_put_peer_identity(msg, treq);
+ if (ret < 0)
+ goto out_cancel;
+ break;
+ case HANDSHAKE_AUTH_X509:
+ ret = tls_handshake_put_certificate(msg, treq);
+ if (ret < 0)
+ goto out_cancel;
+ break;
+ }
+
+ genlmsg_end(msg, hdr);
+ return genlmsg_reply(msg, info);
+
+out_cancel:
+ genlmsg_cancel(msg, hdr);
+ nlmsg_free(msg);
+out:
+ return ret;
+}
+
+/*
+ * Protocol descriptor shared by every TLS handshake request: handled
+ * by the tlshd handler class, with a struct tls_handshake_req as the
+ * per-request private area.
+ */
+static const struct handshake_proto tls_handshake_proto = {
+ .hp_handler_class = HANDSHAKE_HANDLER_CLASS_TLSHD,
+ .hp_privsize = sizeof(struct tls_handshake_req),
+ .hp_flags = BIT(HANDSHAKE_F_PROTO_NOTIFY),
+
+ .hp_accept = tls_handshake_accept,
+ .hp_done = tls_handshake_done,
+};
+
+/**
+ * tls_client_hello_anon - request an anonymous TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * Return values:
+ * %0: Handshake request enqueued; ->done will be called when complete
+ * %-ESRCH: No user agent is available
+ * %-ENOMEM: Memory allocation failed
+ *
+ * Any other negative errno returned by handshake_req_submit() is
+ * passed through unchanged.
+ */
+int tls_client_hello_anon(const struct tls_handshake_args *args, gfp_t flags)
+{
+ struct tls_handshake_req *treq;
+ struct handshake_req *req;
+
+ req = handshake_req_alloc(&tls_handshake_proto, flags);
+ if (!req)
+ return -ENOMEM;
+ treq = tls_handshake_req_init(req, args);
+ treq->th_type = HANDSHAKE_MSG_TYPE_CLIENTHELLO;
+ treq->th_auth_mode = HANDSHAKE_AUTH_UNAUTH;
+
+ return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_client_hello_anon);
+
+/**
+ * tls_client_hello_x509 - request an x.509-based TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * The certificate and private key are taken from @args->ta_my_cert
+ * and @args->ta_my_privkey.
+ *
+ * Return values:
+ * %0: Handshake request enqueued; ->done will be called when complete
+ * %-ESRCH: No user agent is available
+ * %-ENOMEM: Memory allocation failed
+ *
+ * Any other negative errno returned by handshake_req_submit() is
+ * passed through unchanged.
+ */
+int tls_client_hello_x509(const struct tls_handshake_args *args, gfp_t flags)
+{
+ struct tls_handshake_req *treq;
+ struct handshake_req *req;
+
+ req = handshake_req_alloc(&tls_handshake_proto, flags);
+ if (!req)
+ return -ENOMEM;
+ treq = tls_handshake_req_init(req, args);
+ treq->th_type = HANDSHAKE_MSG_TYPE_CLIENTHELLO;
+ treq->th_auth_mode = HANDSHAKE_AUTH_X509;
+ treq->th_certificate = args->ta_my_cert;
+ treq->th_privkey = args->ta_my_privkey;
+
+ return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_client_hello_x509);
+
+/**
+ * tls_client_hello_psk - request a PSK-based TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * @args must supply at least one local peer ID and no more than the
+ * request can hold.
+ *
+ * Return values:
+ *   %0: Handshake request enqueued; ->done will be called when complete
+ *   %-EINVAL: Wrong number of local peer IDs
+ *   %-ESRCH: No user agent is available
+ *   %-ENOMEM: Memory allocation failed
+ */
+int tls_client_hello_psk(const struct tls_handshake_args *args, gfp_t flags)
+{
+	struct tls_handshake_req *treq;
+	struct handshake_req *req;
+	unsigned int n;
+
+	if (args->ta_num_peerids == 0)
+		return -EINVAL;
+	if (args->ta_num_peerids > ARRAY_SIZE(treq->th_peerid))
+		return -EINVAL;
+
+	req = handshake_req_alloc(&tls_handshake_proto, flags);
+	if (!req)
+		return -ENOMEM;
+
+	treq = tls_handshake_req_init(req, args);
+	treq->th_type = HANDSHAKE_MSG_TYPE_CLIENTHELLO;
+	treq->th_auth_mode = HANDSHAKE_AUTH_PSK;
+	for (n = 0; n < args->ta_num_peerids; n++)
+		treq->th_peerid[n] = args->ta_my_peerids[n];
+	treq->th_num_peerids = args->ta_num_peerids;
+
+	return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_client_hello_psk);
+
+/**
+ * tls_server_hello_x509 - request a server TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * The handshake uses x.509 authentication with the certificate and
+ * private key taken from @args->ta_my_cert and @args->ta_my_privkey.
+ *
+ * Return values:
+ * %0: Handshake request enqueued; ->done will be called when complete
+ * %-ESRCH: No user agent is available
+ * %-ENOMEM: Memory allocation failed
+ *
+ * Any other negative errno returned by handshake_req_submit() is
+ * passed through unchanged.
+ */
+int tls_server_hello_x509(const struct tls_handshake_args *args, gfp_t flags)
+{
+ struct tls_handshake_req *treq;
+ struct handshake_req *req;
+
+ req = handshake_req_alloc(&tls_handshake_proto, flags);
+ if (!req)
+ return -ENOMEM;
+ treq = tls_handshake_req_init(req, args);
+ treq->th_type = HANDSHAKE_MSG_TYPE_SERVERHELLO;
+ treq->th_auth_mode = HANDSHAKE_AUTH_X509;
+ treq->th_certificate = args->ta_my_cert;
+ treq->th_privkey = args->ta_my_privkey;
+
+ return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_server_hello_x509);
+
+/**
+ * tls_server_hello_psk - request a server TLS handshake on a socket
+ * @args: socket and handshake parameters for this request
+ * @flags: memory allocation control flags
+ *
+ * The handshake uses PSK authentication with exactly one local peer
+ * ID, @args->ta_my_peerids[0]. @args->ta_num_peerids is not consulted.
+ *
+ * Return values:
+ * %0: Handshake request enqueued; ->done will be called when complete
+ * %-ESRCH: No user agent is available
+ * %-ENOMEM: Memory allocation failed
+ *
+ * Any other negative errno returned by handshake_req_submit() is
+ * passed through unchanged.
+ */
+int tls_server_hello_psk(const struct tls_handshake_args *args, gfp_t flags)
+{
+ struct tls_handshake_req *treq;
+ struct handshake_req *req;
+
+ req = handshake_req_alloc(&tls_handshake_proto, flags);
+ if (!req)
+ return -ENOMEM;
+ treq = tls_handshake_req_init(req, args);
+ treq->th_type = HANDSHAKE_MSG_TYPE_SERVERHELLO;
+ treq->th_auth_mode = HANDSHAKE_AUTH_PSK;
+ treq->th_num_peerids = 1;
+ treq->th_peerid[0] = args->ta_my_peerids[0];
+
+ return handshake_req_submit(args->ta_sock, req, flags);
+}
+EXPORT_SYMBOL(tls_server_hello_psk);
+
+/**
+ * tls_handshake_cancel - cancel a pending handshake
+ * @sk: socket on which there is an ongoing handshake
+ *
+ * Thin wrapper around handshake_req_cancel().
+ *
+ * Request cancellation races with request completion. To determine
+ * who won, callers examine the return value from this function.
+ *
+ * Return values:
+ * %true - Uncompleted handshake request was canceled
+ * %false - Handshake request already completed or not found
+ */
+bool tls_handshake_cancel(struct sock *sk)
+{
+ return handshake_req_cancel(sk);
+}
+EXPORT_SYMBOL(tls_handshake_cancel);
+
+/**
+ * tls_handshake_close - send a Closure alert
+ * @sock: an open socket
+ *
+ * Sends a close_notify warning alert on @sock, but only if a
+ * handshake request for the socket has its session flag set. The
+ * flag is cleared in the process, so the alert is sent at most once
+ * per established session.
+ */
+void tls_handshake_close(struct socket *sock)
+{
+	struct handshake_req *req = handshake_req_hash_lookup(sock->sk);
+
+	if (!req || !test_and_clear_bit(HANDSHAKE_F_REQ_SESSION, &req->hr_flags))
+		return;
+
+	tls_alert_send(sock, TLS_ALERT_LEVEL_WARNING,
+		       TLS_ALERT_DESC_CLOSE_NOTIFY);
+}
+EXPORT_SYMBOL(tls_handshake_close);
diff --git a/net/handshake/trace.c b/net/handshake/trace.c
new file mode 100644
index 000000000000..44432d0857b9
--- /dev/null
+++ b/net/handshake/trace.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Trace points for transport security layer handshakes.
+ *
+ * Author: Chuck Lever <chuck.lever@oracle.com>
+ *
+ * Copyright (c) 2023, Oracle and/or its affiliates.
+ */
+
+#include <linux/types.h>
+#include <linux/ipv6.h>
+
+#include <net/sock.h>
+#include <net/inet_sock.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "handshake.h"
+
+#define CREATE_TRACE_POINTS
+
+#include <trace/events/handshake.h>
diff --git a/net/hsr/Kconfig b/net/hsr/Kconfig
index 4b683fd0abf1..fcacdf4f0ffc 100644
--- a/net/hsr/Kconfig
+++ b/net/hsr/Kconfig
@@ -1,28 +1,58 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# IEC 62439-3 High-availability Seamless Redundancy
#
config HSR
- tristate "High-availability Seamless Redundancy (HSR)"
- ---help---
+ tristate "High-availability Seamless Redundancy (HSR & PRP)"
+ help
+ This enables IEC 62439 defined High-availability Seamless
+ Redundancy (HSR) and Parallel Redundancy Protocol (PRP).
+
If you say Y here, then your Linux box will be able to act as a
- DANH ("Doubly attached node implementing HSR"). For this to work,
- your Linux box needs (at least) two physical Ethernet interfaces,
- and it must be connected as a node in a ring network together with
- other HSR capable nodes.
+ DANH ("Doubly attached node implementing HSR") or DANP ("Doubly
+ attached node implementing PRP"). For this to work, your Linux box
+ needs (at least) two physical Ethernet interfaces.
+
+ For DANH, it must be connected as a node in a ring network together
+ with other HSR capable nodes. All Ethernet frames sent over the HSR
+ device will be sent in both directions on the ring (over both slave
+ ports), giving a redundant, instant fail-over network. Each HSR node
+ in the ring acts like a bridge for HSR frames, but filters frames
+ that have been forwarded earlier.
- All Ethernet frames sent over the hsr device will be sent in both
- directions on the ring (over both slave ports), giving a redundant,
- instant fail-over network. Each HSR node in the ring acts like a
- bridge for HSR frames, but filters frames that have been forwarded
- earlier.
+ For DANP, it must be connected as a node connecting to two
+ separate networks over the two slave interfaces. Like HSR, Ethernet
+ frames sent over the PRP device will be sent to both networks giving
+ a redundant, instant fail-over network. Unlike HSR, PRP networks
+ can have Singly Attached Nodes (SANs) such as PCs, printers, bridges,
+ etc., which are able to communicate with DANP nodes.
This code is a "best effort" to comply with the HSR standard as
described in IEC 62439-3:2010 (HSRv0) and IEC 62439-3:2012 (HSRv1),
- but no compliancy tests have been made. Use iproute2 to select
- the version you desire.
+ and PRP standard described in IEC 62439-4:2012 (PRP), but no
+ compliancy tests have been made. Use iproute2 to select the protocol
+ you would like to use.
You need to perform any and all necessary tests yourself before
relying on this code in a safety critical system!
If unsure, say N.
+
+if HSR
+
+config PRP_DUP_DISCARD_KUNIT_TEST
+ tristate "PRP duplicate discard KUnit tests" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ help
+ Covers the PRP duplicate discard algorithm.
+ Only useful for kernel developers running the KUnit test harness; not
+ intended for inclusion in a production build.
+
+ For more information on KUnit and unit tests in general please refer
+ to the KUnit documentation in Documentation/dev-tools/kunit/.
+
+ If unsure, say N.
+
+endif
diff --git a/net/hsr/Makefile b/net/hsr/Makefile
index 9ae972a820f4..34e581db5c41 100644
--- a/net/hsr/Makefile
+++ b/net/hsr/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for HSR
#
@@ -6,3 +7,6 @@ obj-$(CONFIG_HSR) += hsr.o
hsr-y := hsr_main.o hsr_framereg.o hsr_device.o \
hsr_netlink.o hsr_slave.o hsr_forward.o
+hsr-$(CONFIG_DEBUG_FS) += hsr_debugfs.o
+
+obj-$(CONFIG_PRP_DUP_DISCARD_KUNIT_TEST) += prp_dup_discard_test.o
diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c
new file mode 100644
index 000000000000..5b2cfac3b2ba
--- /dev/null
+++ b/net/hsr/hsr_debugfs.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * debugfs code for HSR & PRP
+ * Copyright (C) 2019 Texas Instruments Incorporated
+ *
+ * Author(s):
+ * Murali Karicheri <m-karicheri2@ti.com>
+ */
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/debugfs.h>
+#include "hsr_main.h"
+#include "hsr_framereg.h"
+
+static struct dentry *hsr_debugfs_root_dir;
+
+/* hsr_node_table_show - Dump the node table as text
+ *
+ * Prints a header that depends on the protocol version (PRP or HSR),
+ * then one line per node_db entry other than the device's own node.
+ * The list is walked under rcu_read_lock().
+ */
+static int
+hsr_node_table_show(struct seq_file *sfp, void *data)
+{
+	struct hsr_priv *priv = sfp->private;
+	bool is_prp = priv->prot_version == PRP_V1;
+	struct hsr_node *node;
+
+	seq_printf(sfp, "Node Table entries for (%s) device\n",
+		   is_prp ? "PRP" : "HSR");
+	seq_puts(sfp, "MAC-Address-A, MAC-Address-B, time_in[A], ");
+	seq_puts(sfp, "time_in[B], Address-B port, ");
+	if (is_prp)
+		seq_puts(sfp, "SAN-A, SAN-B, DAN-P\n");
+	else
+		seq_puts(sfp, "DAN-H\n");
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(node, &priv->node_db, mac_list) {
+		/* skip self node */
+		if (hsr_addr_is_self(priv, node->macaddress_A))
+			continue;
+
+		seq_printf(sfp, "%pM ", node->macaddress_A);
+		seq_printf(sfp, "%pM ", node->macaddress_B);
+		seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_A]);
+		seq_printf(sfp, "%10lx, ", node->time_in[HSR_PT_SLAVE_B]);
+		seq_printf(sfp, "%14x, ", node->addr_B_port);
+
+		if (is_prp) {
+			/* DAN-P column is set when neither SAN flag is */
+			seq_printf(sfp, "%5x, %5x, %5x\n",
+				   node->san_a, node->san_b,
+				   (node->san_a == 0 && node->san_b == 0));
+		} else {
+			seq_printf(sfp, "%5x\n", 1);
+		}
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(hsr_node_table);
+
+/* hsr_debugfs_rename - Rename the device's debugfs directory
+ *
+ * Description:
+ * Renames priv->node_tbl_root to the current dev->name and logs a
+ * warning if the rename fails.
+ */
+void hsr_debugfs_rename(struct net_device *dev)
+{
+ struct hsr_priv *priv = netdev_priv(dev);
+ int err;
+
+ err = debugfs_change_name(priv->node_tbl_root, "%s", dev->name);
+ if (err)
+ netdev_warn(dev, "failed to rename\n");
+}
+
+/* hsr_debugfs_init - create hsr node_table file for dumping
+ * the node table
+ *
+ * Description:
+ * When debugfs is configured this routine creates a directory named
+ * after the hsr device under the hsr debugfs root, with a read-only
+ * node_table file inside it. The directory is recorded in
+ * priv->node_tbl_root. If the file cannot be created, the directory
+ * is removed again and priv->node_tbl_root is reset to NULL.
+ */
+void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev)
+{
+	struct dentry *dir, *file;
+
+	dir = debugfs_create_dir(hsr_dev->name, hsr_debugfs_root_dir);
+	if (IS_ERR(dir)) {
+		pr_err("Cannot create hsr debugfs directory\n");
+		return;
+	}
+	priv->node_tbl_root = dir;
+
+	file = debugfs_create_file("node_table", S_IFREG | 0444,
+				   dir, priv,
+				   &hsr_node_table_fops);
+	if (!IS_ERR(file))
+		return;
+
+	pr_err("Cannot create hsr node_table file\n");
+	debugfs_remove(dir);
+	priv->node_tbl_root = NULL;
+}
+
+/* hsr_debugfs_term - Tear down debugfs infrastructure
+ *
+ * Description:
+ * When debugfs is configured this routine removes the debugfs
+ * directory that belongs to this hsr device, together with its
+ * contents, and clears priv->node_tbl_root.
+ */
+void
+hsr_debugfs_term(struct hsr_priv *priv)
+{
+ debugfs_remove_recursive(priv->node_tbl_root);
+ priv->node_tbl_root = NULL;
+}
+
+/* hsr_debugfs_create_root - Create the top-level "hsr" debugfs directory
+ *
+ * Description:
+ * On failure an error is logged and hsr_debugfs_root_dir is left
+ * NULL rather than holding an error pointer.
+ */
+void hsr_debugfs_create_root(void)
+{
+ hsr_debugfs_root_dir = debugfs_create_dir("hsr", NULL);
+ if (IS_ERR(hsr_debugfs_root_dir)) {
+ pr_err("Cannot create hsr debugfs root directory\n");
+ hsr_debugfs_root_dir = NULL;
+ }
+}
+
+/* hsr_debugfs_remove_root - Remove the top-level "hsr" debugfs directory */
+void hsr_debugfs_remove_root(void)
+{
+ /* debugfs_remove() internally checks NULL and ERROR */
+ debugfs_remove(hsr_debugfs_root_dir);
+}
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index b8cd43c9ed5b..d1bfc49b5f01 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -1,15 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/* Copyright 2011-2014 Autronica Fire and Security AS
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
* Author(s):
* 2011-2014 Arvid Brodin, arvid.brodin@alten.se
- *
* This file contains device methods for creating, using and destroying
- * virtual HSR devices.
+ * virtual HSR or PRP devices.
*/
#include <linux/netdevice.h>
@@ -23,7 +18,6 @@
#include "hsr_main.h"
#include "hsr_forward.h"
-
static bool is_admin_up(struct net_device *dev)
{
return dev && (dev->flags & IFF_UP);
@@ -34,90 +28,75 @@ static bool is_slave_up(struct net_device *dev)
return dev && is_admin_up(dev) && netif_oper_up(dev);
}
-static void __hsr_set_operstate(struct net_device *dev, int transition)
-{
- write_lock_bh(&dev_base_lock);
- if (dev->operstate != transition) {
- dev->operstate = transition;
- write_unlock_bh(&dev_base_lock);
- netdev_state_change(dev);
- } else {
- write_unlock_bh(&dev_base_lock);
- }
-}
-
static void hsr_set_operstate(struct hsr_port *master, bool has_carrier)
{
- if (!is_admin_up(master->dev)) {
- __hsr_set_operstate(master->dev, IF_OPER_DOWN);
+ struct net_device *dev = master->dev;
+
+ if (!is_admin_up(dev)) {
+ netif_set_operstate(dev, IF_OPER_DOWN);
return;
}
if (has_carrier)
- __hsr_set_operstate(master->dev, IF_OPER_UP);
+ netif_set_operstate(dev, IF_OPER_UP);
else
- __hsr_set_operstate(master->dev, IF_OPER_LOWERLAYERDOWN);
+ netif_set_operstate(dev, IF_OPER_LOWERLAYERDOWN);
}
static bool hsr_check_carrier(struct hsr_port *master)
{
struct hsr_port *port;
- bool has_carrier;
- has_carrier = false;
+ ASSERT_RTNL();
- rcu_read_lock();
- hsr_for_each_port(master->hsr, port)
- if ((port->type != HSR_PT_MASTER) && is_slave_up(port->dev)) {
- has_carrier = true;
- break;
+ hsr_for_each_port_rtnl(master->hsr, port) {
+ if (port->type != HSR_PT_MASTER && is_slave_up(port->dev)) {
+ netif_carrier_on(master->dev);
+ return true;
}
- rcu_read_unlock();
+ }
- if (has_carrier)
- netif_carrier_on(master->dev);
- else
- netif_carrier_off(master->dev);
+ netif_carrier_off(master->dev);
- return has_carrier;
+ return false;
}
-
-static void hsr_check_announce(struct net_device *hsr_dev,
- unsigned char old_operstate)
+static void hsr_check_announce(struct net_device *hsr_dev)
{
struct hsr_priv *hsr;
hsr = netdev_priv(hsr_dev);
+ if (netif_running(hsr_dev) && netif_oper_up(hsr_dev)) {
+ /* Enable announce timer and start sending supervisory frames */
+ if (!timer_pending(&hsr->announce_timer)) {
+ hsr->announce_count = 0;
+ mod_timer(&hsr->announce_timer, jiffies +
+ msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL));
+ }
- if ((hsr_dev->operstate == IF_OPER_UP)
- && (old_operstate != IF_OPER_UP)) {
- /* Went up */
- hsr->announce_count = 0;
- hsr->announce_timer.expires = jiffies +
- msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL);
- add_timer(&hsr->announce_timer);
+ if (hsr->redbox && !timer_pending(&hsr->announce_proxy_timer))
+ mod_timer(&hsr->announce_proxy_timer, jiffies +
+ msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL) / 2);
+ } else {
+ /* Deactivate the announce timer */
+ timer_delete(&hsr->announce_timer);
+ if (hsr->redbox)
+ timer_delete(&hsr->announce_proxy_timer);
}
-
- if ((hsr_dev->operstate != IF_OPER_UP) && (old_operstate == IF_OPER_UP))
- /* Went down */
- del_timer(&hsr->announce_timer);
}
void hsr_check_carrier_and_operstate(struct hsr_priv *hsr)
{
struct hsr_port *master;
- unsigned char old_operstate;
bool has_carrier;
master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
/* netif_stacked_transfer_operstate() cannot be used here since
* it doesn't set IF_OPER_LOWERLAYERDOWN (?)
*/
- old_operstate = master->dev->operstate;
has_carrier = hsr_check_carrier(master);
hsr_set_operstate(master, has_carrier);
- hsr_check_announce(master->dev, old_operstate);
+ hsr_check_announce(master->dev);
}
int hsr_get_max_mtu(struct hsr_priv *hsr)
@@ -126,33 +105,28 @@ int hsr_get_max_mtu(struct hsr_priv *hsr)
struct hsr_port *port;
mtu_max = ETH_DATA_LEN;
- rcu_read_lock();
- hsr_for_each_port(hsr, port)
+ hsr_for_each_port_rtnl(hsr, port)
if (port->type != HSR_PT_MASTER)
mtu_max = min(port->dev->mtu, mtu_max);
- rcu_read_unlock();
if (mtu_max < HSR_HLEN)
return 0;
return mtu_max - HSR_HLEN;
}
-
static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu)
{
struct hsr_priv *hsr;
- struct hsr_port *master;
hsr = netdev_priv(dev);
- master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
if (new_mtu > hsr_get_max_mtu(hsr)) {
- netdev_info(master->dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n",
+ netdev_info(dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n",
HSR_HLEN);
return -EINVAL;
}
- dev->mtu = new_mtu;
+ WRITE_ONCE(dev->mtu, new_mtu);
return 0;
}
@@ -161,45 +135,60 @@ static int hsr_dev_open(struct net_device *dev)
{
struct hsr_priv *hsr;
struct hsr_port *port;
- char designation;
+ const char *designation = NULL;
hsr = netdev_priv(dev);
- designation = '\0';
- rcu_read_lock();
- hsr_for_each_port(hsr, port) {
+ hsr_for_each_port_rtnl(hsr, port) {
if (port->type == HSR_PT_MASTER)
continue;
switch (port->type) {
case HSR_PT_SLAVE_A:
- designation = 'A';
+ designation = "Slave A";
break;
case HSR_PT_SLAVE_B:
- designation = 'B';
+ designation = "Slave B";
+ break;
+ case HSR_PT_INTERLINK:
+ designation = "Interlink";
break;
default:
- designation = '?';
+ designation = "Unknown";
}
if (!is_slave_up(port->dev))
- netdev_warn(dev, "Slave %c (%s) is not up; please bring it up to get a fully working HSR network\n",
+ netdev_warn(dev, "%s (%s) is not up; please bring it up to get a fully working HSR network\n",
designation, port->dev->name);
}
- rcu_read_unlock();
- if (designation == '\0')
+ if (!designation)
netdev_warn(dev, "No slave devices configured\n");
return 0;
}
-
static int hsr_dev_close(struct net_device *dev)
{
- /* Nothing to do here. */
+ struct hsr_port *port;
+ struct hsr_priv *hsr;
+
+ hsr = netdev_priv(dev);
+ hsr_for_each_port_rtnl(hsr, port) {
+ if (port->type == HSR_PT_MASTER)
+ continue;
+ switch (port->type) {
+ case HSR_PT_SLAVE_A:
+ case HSR_PT_SLAVE_B:
+ dev_uc_unsync(port->dev, dev);
+ dev_mc_unsync(port->dev, dev);
+ break;
+ default:
+ break;
+ }
+ }
+
return 0;
}
-
static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr,
netdev_features_t features)
{
@@ -216,7 +205,7 @@ static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr,
* may become enabled.
*/
features &= ~NETIF_F_ONE_FOR_ALL;
- hsr_for_each_port(hsr, port)
+ hsr_for_each_port_rtnl(hsr, port)
features = netdev_increment_features(features,
port->dev->features,
mask);
@@ -232,99 +221,190 @@ static netdev_features_t hsr_fix_features(struct net_device *dev,
return hsr_features_recompute(hsr, features);
}
-
-static int hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct hsr_priv *hsr = netdev_priv(dev);
struct hsr_port *master;
+ rcu_read_lock();
master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
- skb->dev = master->dev;
- hsr_forward_skb(skb, master);
+ if (master) {
+ skb->dev = master->dev;
+ skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+ spin_lock_bh(&hsr->seqnr_lock);
+ hsr_forward_skb(skb, master);
+ spin_unlock_bh(&hsr->seqnr_lock);
+ } else {
+ dev_core_stats_tx_dropped_inc(dev);
+ dev_kfree_skb_any(skb);
+ }
+ rcu_read_unlock();
return NETDEV_TX_OK;
}
-
static const struct header_ops hsr_header_ops = {
.create = eth_header,
.parse = eth_header_parse,
};
-static void send_hsr_supervision_frame(struct hsr_port *master,
- u8 type, u8 hsrVer)
+static struct sk_buff *hsr_init_skb(struct hsr_port *master, int extra)
{
+ struct hsr_priv *hsr = master->hsr;
struct sk_buff *skb;
int hlen, tlen;
- struct hsr_tag *hsr_tag;
- struct hsr_sup_tag *hsr_stag;
- struct hsr_sup_payload *hsr_sp;
- unsigned long irqflags;
+ int len;
hlen = LL_RESERVED_SPACE(master->dev);
tlen = master->dev->needed_tailroom;
- skb = dev_alloc_skb(
- sizeof(struct hsr_tag) +
- sizeof(struct hsr_sup_tag) +
- sizeof(struct hsr_sup_payload) + hlen + tlen);
+ len = sizeof(struct hsr_sup_tag) + sizeof(struct hsr_sup_payload);
+ /* skb size is same for PRP/HSR frames, only difference
+ * being, for PRP it is a trailer and for HSR it is a
+ * header.
+ * RedBox might use @extra more bytes.
+ */
+ skb = dev_alloc_skb(len + extra + hlen + tlen);
- if (skb == NULL)
- return;
+ if (!skb)
+ return skb;
skb_reserve(skb, hlen);
-
skb->dev = master->dev;
- skb->protocol = htons(hsrVer ? ETH_P_HSR : ETH_P_PRP);
skb->priority = TC_PRIO_CONTROL;
- if (dev_hard_header(skb, skb->dev, (hsrVer ? ETH_P_HSR : ETH_P_PRP),
- master->hsr->sup_multicast_addr,
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ if (dev_hard_header(skb, skb->dev, ETH_P_PRP,
+ hsr->sup_multicast_addr,
skb->dev->dev_addr, skb->len) <= 0)
goto out;
+
skb_reset_mac_header(skb);
+ skb_reset_mac_len(skb);
+
+ return skb;
+out:
+ kfree_skb(skb);
+
+ return NULL;
+}
+
+static void send_hsr_supervision_frame(struct hsr_port *port,
+ unsigned long *interval,
+ const unsigned char *addr)
+{
+ struct hsr_priv *hsr = port->hsr;
+ __u8 type = HSR_TLV_LIFE_CHECK;
+ struct hsr_sup_payload *hsr_sp;
+ struct hsr_sup_tlv *hsr_stlv;
+ struct hsr_sup_tag *hsr_stag;
+ struct sk_buff *skb;
+ int extra = 0;
- if (hsrVer > 0) {
- hsr_tag = skb_put(skb, sizeof(struct hsr_tag));
- hsr_tag->encap_proto = htons(ETH_P_PRP);
- set_hsr_tag_LSDU_size(hsr_tag, HSR_V1_SUP_LSDUSIZE);
+ *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL);
+ if (hsr->announce_count < 3 && hsr->prot_version == 0) {
+ type = HSR_TLV_ANNOUNCE;
+ *interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL);
+ hsr->announce_count++;
+ }
+
+ if (hsr->redbox)
+ extra = sizeof(struct hsr_sup_tlv) +
+ sizeof(struct hsr_sup_payload);
+
+ skb = hsr_init_skb(port, extra);
+ if (!skb) {
+ netdev_warn_once(port->dev, "HSR: Could not send supervision frame\n");
+ return;
}
hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag));
- set_hsr_stag_path(hsr_stag, (hsrVer ? 0x0 : 0xf));
- set_hsr_stag_HSR_Ver(hsr_stag, hsrVer);
+ skb_set_network_header(skb, ETH_HLEN + HSR_HLEN);
+ skb_reset_mac_len(skb);
+
+ set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf));
+ set_hsr_stag_HSR_ver(hsr_stag, hsr->prot_version);
/* From HSRv1 on we have separate supervision sequence numbers. */
- spin_lock_irqsave(&master->hsr->seqnr_lock, irqflags);
- if (hsrVer > 0) {
- hsr_stag->sequence_nr = htons(master->hsr->sup_sequence_nr);
- hsr_tag->sequence_nr = htons(master->hsr->sequence_nr);
- master->hsr->sup_sequence_nr++;
- master->hsr->sequence_nr++;
+ spin_lock_bh(&hsr->seqnr_lock);
+ if (hsr->prot_version > 0) {
+ hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr);
+ hsr->sup_sequence_nr++;
} else {
- hsr_stag->sequence_nr = htons(master->hsr->sequence_nr);
- master->hsr->sequence_nr++;
+ hsr_stag->sequence_nr = htons(hsr->sequence_nr);
+ hsr->sequence_nr++;
}
- spin_unlock_irqrestore(&master->hsr->seqnr_lock, irqflags);
- hsr_stag->HSR_TLV_Type = type;
- /* TODO: Why 12 in HSRv0? */
- hsr_stag->HSR_TLV_Length = hsrVer ? sizeof(struct hsr_sup_payload) : 12;
+ hsr_stag->tlv.HSR_TLV_type = type;
+ /* HSRv0 has 6 unused bytes after the MAC */
+ hsr_stag->tlv.HSR_TLV_length = hsr->prot_version ?
+ sizeof(struct hsr_sup_payload) : 12;
- /* Payload: MacAddressA */
+ /* Payload: MacAddressA / SAN MAC from ProxyNodeTable */
hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload));
- ether_addr_copy(hsr_sp->MacAddressA, master->dev->dev_addr);
+ ether_addr_copy(hsr_sp->macaddress_A, addr);
+
+ if (hsr->redbox &&
+ hsr_is_node_in_db(&hsr->proxy_node_db, addr)) {
+ hsr_stlv = skb_put(skb, sizeof(struct hsr_sup_tlv));
+ hsr_stlv->HSR_TLV_type = PRP_TLV_REDBOX_MAC;
+ hsr_stlv->HSR_TLV_length = sizeof(struct hsr_sup_payload);
- if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN))
+ /* Payload: MacAddressRedBox */
+ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload));
+ ether_addr_copy(hsr_sp->macaddress_A, hsr->macaddress_redbox);
+ }
+
+ if (skb_put_padto(skb, ETH_ZLEN)) {
+ spin_unlock_bh(&hsr->seqnr_lock);
return;
+ }
- hsr_forward_skb(skb, master);
+ hsr_forward_skb(skb, port);
+ spin_unlock_bh(&hsr->seqnr_lock);
return;
-
-out:
- WARN_ONCE(1, "HSR: Could not send supervision frame\n");
- kfree_skb(skb);
}
+static void send_prp_supervision_frame(struct hsr_port *master,
+ unsigned long *interval,
+ const unsigned char *addr)
+{
+ struct hsr_priv *hsr = master->hsr;
+ struct hsr_sup_payload *hsr_sp;
+ struct hsr_sup_tag *hsr_stag;
+ struct sk_buff *skb;
+
+ skb = hsr_init_skb(master, 0);
+ if (!skb) {
+ netdev_warn_once(master->dev, "PRP: Could not send supervision frame\n");
+ return;
+ }
+
+ *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL);
+ hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag));
+ set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf));
+ set_hsr_stag_HSR_ver(hsr_stag, (hsr->prot_version ? 1 : 0));
+
+ /* From HSRv1 on we have separate supervision sequence numbers. */
+ spin_lock_bh(&hsr->seqnr_lock);
+ hsr_stag->sequence_nr = htons(hsr->sup_sequence_nr);
+ hsr->sup_sequence_nr++;
+ hsr_stag->tlv.HSR_TLV_type = PRP_TLV_LIFE_CHECK_DD;
+ hsr_stag->tlv.HSR_TLV_length = sizeof(struct hsr_sup_payload);
+
+ /* Payload: MacAddressA */
+ hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload));
+ ether_addr_copy(hsr_sp->macaddress_A, master->dev->dev_addr);
+
+ if (skb_put_padto(skb, ETH_ZLEN)) {
+ spin_unlock_bh(&hsr->seqnr_lock);
+ return;
+ }
+
+ hsr_forward_skb(skb, master);
+ spin_unlock_bh(&hsr->seqnr_lock);
+}
/* Announce (supervision frame) timer function
*/
@@ -332,53 +412,192 @@ static void hsr_announce(struct timer_list *t)
{
struct hsr_priv *hsr;
struct hsr_port *master;
+ unsigned long interval;
- hsr = from_timer(hsr, t, announce_timer);
+ hsr = timer_container_of(hsr, t, announce_timer);
rcu_read_lock();
master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ hsr->proto_ops->send_sv_frame(master, &interval, master->dev->dev_addr);
- if (hsr->announce_count < 3 && hsr->protVersion == 0) {
- send_hsr_supervision_frame(master, HSR_TLV_ANNOUNCE,
- hsr->protVersion);
- hsr->announce_count++;
+ if (is_admin_up(master->dev))
+ mod_timer(&hsr->announce_timer, jiffies + interval);
- hsr->announce_timer.expires = jiffies +
- msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL);
- } else {
- send_hsr_supervision_frame(master, HSR_TLV_LIFE_CHECK,
- hsr->protVersion);
+ rcu_read_unlock();
+}
- hsr->announce_timer.expires = jiffies +
- msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL);
+/* Announce (supervision frame) timer function for RedBox
+ */
+static void hsr_proxy_announce(struct timer_list *t)
+{
+ struct hsr_priv *hsr = timer_container_of(hsr, t,
+ announce_proxy_timer);
+ struct hsr_port *interlink;
+ unsigned long interval = 0;
+ struct hsr_node *node;
+
+ rcu_read_lock();
+ /* RedBOX sends supervisory frames to HSR network with MAC addresses
+ * of SAN nodes stored in ProxyNodeTable.
+ */
+ interlink = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK);
+ if (!interlink)
+ goto done;
+
+ list_for_each_entry_rcu(node, &hsr->proxy_node_db, mac_list) {
+ if (hsr_addr_is_redbox(hsr, node->macaddress_A))
+ continue;
+ hsr->proto_ops->send_sv_frame(interlink, &interval,
+ node->macaddress_A);
}
- if (is_admin_up(master->dev))
- add_timer(&hsr->announce_timer);
+ if (is_admin_up(interlink->dev)) {
+ if (!interval)
+ interval = msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL);
+ mod_timer(&hsr->announce_proxy_timer, jiffies + interval);
+ }
+
+done:
rcu_read_unlock();
}
+void hsr_del_ports(struct hsr_priv *hsr)
+{
+ struct hsr_port *port;
+
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
+ if (port)
+ hsr_del_port(port);
-/* According to comments in the declaration of struct net_device, this function
- * is "Called from unregister, can be used to call free_netdev". Ok then...
- */
-static void hsr_dev_destroy(struct net_device *hsr_dev)
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
+ if (port)
+ hsr_del_port(port);
+
+ port = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK);
+ if (port)
+ hsr_del_port(port);
+
+ port = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ if (port)
+ hsr_del_port(port);
+}
+
+static void hsr_set_rx_mode(struct net_device *dev)
{
+ struct hsr_port *port;
struct hsr_priv *hsr;
+
+ hsr = netdev_priv(dev);
+
+ hsr_for_each_port_rtnl(hsr, port) {
+ if (port->type == HSR_PT_MASTER)
+ continue;
+ switch (port->type) {
+ case HSR_PT_SLAVE_A:
+ case HSR_PT_SLAVE_B:
+ dev_mc_sync_multiple(port->dev, dev);
+ dev_uc_sync_multiple(port->dev, dev);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+static void hsr_change_rx_flags(struct net_device *dev, int change)
+{
struct hsr_port *port;
+ struct hsr_priv *hsr;
- hsr = netdev_priv(hsr_dev);
+ hsr = netdev_priv(dev);
- rtnl_lock();
- hsr_for_each_port(hsr, port)
- hsr_del_port(port);
- rtnl_unlock();
+ hsr_for_each_port_rtnl(hsr, port) {
+ if (port->type == HSR_PT_MASTER)
+ continue;
+ switch (port->type) {
+ case HSR_PT_SLAVE_A:
+ case HSR_PT_SLAVE_B:
+ if (change & IFF_ALLMULTI)
+ dev_set_allmulti(port->dev,
+ dev->flags &
+ IFF_ALLMULTI ? 1 : -1);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev,
+ __be16 proto, u16 vid)
+{
+ bool is_slave_a_added = false;
+ bool is_slave_b_added = false;
+ struct hsr_port *port;
+ struct hsr_priv *hsr;
+ int ret = 0;
+
+ hsr = netdev_priv(dev);
+
+ hsr_for_each_port_rtnl(hsr, port) {
+ if (port->type == HSR_PT_MASTER ||
+ port->type == HSR_PT_INTERLINK)
+ continue;
+
+ ret = vlan_vid_add(port->dev, proto, vid);
+ switch (port->type) {
+ case HSR_PT_SLAVE_A:
+ if (ret) {
+ /* clean up Slave-B */
+ netdev_err(dev, "add vid failed for Slave-A\n");
+ if (is_slave_b_added)
+ vlan_vid_del(port->dev, proto, vid);
+ return ret;
+ }
+
+ is_slave_a_added = true;
+ break;
+
+ case HSR_PT_SLAVE_B:
+ if (ret) {
+ /* clean up Slave-A */
+ netdev_err(dev, "add vid failed for Slave-B\n");
+ if (is_slave_a_added)
+ vlan_vid_del(port->dev, proto, vid);
+ return ret;
+ }
+
+ is_slave_b_added = true;
+ break;
+ default:
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int hsr_ndo_vlan_rx_kill_vid(struct net_device *dev,
+ __be16 proto, u16 vid)
+{
+ struct hsr_port *port;
+ struct hsr_priv *hsr;
+
+ hsr = netdev_priv(dev);
- del_timer_sync(&hsr->prune_timer);
- del_timer_sync(&hsr->announce_timer);
+ hsr_for_each_port_rtnl(hsr, port) {
+ switch (port->type) {
+ case HSR_PT_SLAVE_A:
+ case HSR_PT_SLAVE_B:
+ vlan_vid_del(port->dev, proto, vid);
+ break;
+ default:
+ break;
+ }
+ }
- synchronize_rcu();
+ return 0;
}
static const struct net_device_ops hsr_device_ops = {
@@ -386,13 +605,38 @@ static const struct net_device_ops hsr_device_ops = {
.ndo_open = hsr_dev_open,
.ndo_stop = hsr_dev_close,
.ndo_start_xmit = hsr_dev_xmit,
+ .ndo_change_rx_flags = hsr_change_rx_flags,
.ndo_fix_features = hsr_fix_features,
+ .ndo_set_rx_mode = hsr_set_rx_mode,
+ .ndo_vlan_rx_add_vid = hsr_ndo_vlan_rx_add_vid,
+ .ndo_vlan_rx_kill_vid = hsr_ndo_vlan_rx_kill_vid,
};
-static struct device_type hsr_type = {
+static const struct device_type hsr_type = {
.name = "hsr",
};
+static struct hsr_proto_ops hsr_ops = {
+ .send_sv_frame = send_hsr_supervision_frame,
+ .create_tagged_frame = hsr_create_tagged_frame,
+ .get_untagged_frame = hsr_get_untagged_frame,
+ .drop_frame = hsr_drop_frame,
+ .fill_frame_info = hsr_fill_frame_info,
+ .invalid_dan_ingress_frame = hsr_invalid_dan_ingress_frame,
+ .register_frame_out = hsr_register_frame_out,
+};
+
+static struct hsr_proto_ops prp_ops = {
+ .send_sv_frame = send_prp_supervision_frame,
+ .create_tagged_frame = prp_create_tagged_frame,
+ .get_untagged_frame = prp_get_untagged_frame,
+ .drop_frame = prp_drop_frame,
+ .fill_frame_info = prp_fill_frame_info,
+ .handle_san_frame = prp_handle_san_frame,
+ .update_san_info = prp_update_san_info,
+ .register_frame_out = prp_register_frame_out,
+};
+
void hsr_dev_setup(struct net_device *dev)
{
eth_hw_addr_random(dev);
@@ -402,36 +646,69 @@ void hsr_dev_setup(struct net_device *dev)
dev->header_ops = &hsr_header_ops;
dev->netdev_ops = &hsr_device_ops;
SET_NETDEV_DEVTYPE(dev, &hsr_type);
- dev->priv_flags |= IFF_NO_QUEUE;
+ dev->priv_flags |= IFF_NO_QUEUE | IFF_DISABLE_NETPOLL;
+ /* Prevent recursive tx locking */
+ dev->lltx = true;
+ /* Not sure about this. Taken from bridge code. netdevice.h says
+ * it means "Does not change network namespaces".
+ */
+ dev->netns_immutable = true;
dev->needs_free_netdev = true;
- dev->priv_destructor = hsr_dev_destroy;
dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA |
NETIF_F_GSO_MASK | NETIF_F_HW_CSUM |
- NETIF_F_HW_VLAN_CTAG_TX;
+ NETIF_F_HW_VLAN_CTAG_TX |
+ NETIF_F_HW_VLAN_CTAG_FILTER;
dev->features = dev->hw_features;
-
- /* Prevent recursive tx locking */
- dev->features |= NETIF_F_LLTX;
- /* VLAN on top of HSR needs testing and probably some work on
- * hsr_header_create() etc.
- */
- dev->features |= NETIF_F_VLAN_CHALLENGED;
- /* Not sure about this. Taken from bridge code. netdev_features.h says
- * it means "Does not change network namespaces".
- */
- dev->features |= NETIF_F_NETNS_LOCAL;
}
-
/* Return true if dev is a HSR master; return false otherwise.
*/
-inline bool is_hsr_master(struct net_device *dev)
+bool is_hsr_master(struct net_device *dev)
{
return (dev->netdev_ops->ndo_start_xmit == hsr_dev_xmit);
}
+EXPORT_SYMBOL(is_hsr_master);
+
+struct net_device *hsr_get_port_ndev(struct net_device *ndev,
+ enum hsr_port_type pt)
+{
+ struct hsr_priv *hsr = netdev_priv(ndev);
+ struct hsr_port *port;
+
+ rcu_read_lock();
+ hsr_for_each_port(hsr, port)
+ if (port->type == pt) {
+ dev_hold(port->dev);
+ rcu_read_unlock();
+ return port->dev;
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+EXPORT_SYMBOL(hsr_get_port_ndev);
+
+int hsr_get_port_type(struct net_device *hsr_dev, struct net_device *dev,
+ enum hsr_port_type *type)
+{
+ struct hsr_priv *hsr = netdev_priv(hsr_dev);
+ struct hsr_port *port;
+
+ rcu_read_lock();
+ hsr_for_each_port(hsr, port) {
+ if (port->dev == dev) {
+ *type = port->type;
+ rcu_read_unlock();
+ return 0;
+ }
+ }
+ rcu_read_unlock();
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL(hsr_get_port_type);
/* Default multicast address for HSR Supervision frames */
static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = {
@@ -439,21 +716,34 @@ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = {
};
int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
- unsigned char multicast_spec, u8 protocol_version)
+ struct net_device *interlink, unsigned char multicast_spec,
+ u8 protocol_version, struct netlink_ext_ack *extack)
{
+ bool unregister = false;
struct hsr_priv *hsr;
- struct hsr_port *port;
int res;
hsr = netdev_priv(hsr_dev);
INIT_LIST_HEAD(&hsr->ports);
INIT_LIST_HEAD(&hsr->node_db);
- INIT_LIST_HEAD(&hsr->self_node_db);
-
- ether_addr_copy(hsr_dev->dev_addr, slave[0]->dev_addr);
+ INIT_LIST_HEAD(&hsr->proxy_node_db);
+ spin_lock_init(&hsr->list_lock);
+
+ eth_hw_addr_set(hsr_dev, slave[0]->dev_addr);
+
+ /* initialize protocol specific functions */
+ if (protocol_version == PRP_V1) {
+ /* For PRP, lan_id has most significant 3 bits holding
+ * the net_id of PRP_LAN_ID
+ */
+ hsr->net_id = PRP_LAN_ID << 1;
+ hsr->proto_ops = &prp_ops;
+ } else {
+ hsr->proto_ops = &hsr_ops;
+ }
/* Make sure we recognize frames from ourselves in hsr_rcv() */
- res = hsr_create_self_node(&hsr->self_node_db, hsr_dev->dev_addr,
+ res = hsr_create_self_node(hsr, hsr_dev->dev_addr,
slave[1]->dev_addr);
if (res < 0)
return res;
@@ -465,47 +755,71 @@ int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
timer_setup(&hsr->announce_timer, hsr_announce, 0);
timer_setup(&hsr->prune_timer, hsr_prune_nodes, 0);
+ timer_setup(&hsr->prune_proxy_timer, hsr_prune_proxy_nodes, 0);
+ timer_setup(&hsr->announce_proxy_timer, hsr_proxy_announce, 0);
ether_addr_copy(hsr->sup_multicast_addr, def_multicast_addr);
hsr->sup_multicast_addr[ETH_ALEN - 1] = multicast_spec;
- hsr->protVersion = protocol_version;
-
- /* FIXME: should I modify the value of these?
- *
- * - hsr_dev->flags - i.e.
- * IFF_MASTER/SLAVE?
- * - hsr_dev->priv_flags - i.e.
- * IFF_EBRIDGE?
- * IFF_TX_SKB_SHARING?
- * IFF_HSR_MASTER/SLAVE?
- */
+ hsr->prot_version = protocol_version;
/* Make sure the 1st call to netif_carrier_on() gets through */
netif_carrier_off(hsr_dev);
- res = hsr_add_port(hsr, hsr_dev, HSR_PT_MASTER);
+ res = hsr_add_port(hsr, hsr_dev, HSR_PT_MASTER, extack);
if (res)
- return res;
+ goto err_add_master;
+
+ /* HSR forwarding offload supported in lower device? */
+ if ((slave[0]->features & NETIF_F_HW_HSR_FWD) &&
+ (slave[1]->features & NETIF_F_HW_HSR_FWD))
+ hsr->fwd_offloaded = true;
+
+ if ((slave[0]->features & NETIF_F_HW_VLAN_CTAG_FILTER) &&
+ (slave[1]->features & NETIF_F_HW_VLAN_CTAG_FILTER))
+ hsr_dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER;
res = register_netdevice(hsr_dev);
if (res)
- goto fail;
+ goto err_unregister;
+
+ unregister = true;
- res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A);
+ res = hsr_add_port(hsr, slave[0], HSR_PT_SLAVE_A, extack);
if (res)
- goto fail;
- res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B);
+ goto err_unregister;
+
+ res = hsr_add_port(hsr, slave[1], HSR_PT_SLAVE_B, extack);
if (res)
- goto fail;
+ goto err_unregister;
+
+ if (protocol_version == PRP_V1) {
+ eth_hw_addr_set(slave[1], slave[0]->dev_addr);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR, slave[1]);
+ }
+
+ if (interlink) {
+ res = hsr_add_port(hsr, interlink, HSR_PT_INTERLINK, extack);
+ if (res)
+ goto err_unregister;
+ hsr->redbox = true;
+ ether_addr_copy(hsr->macaddress_redbox, interlink->dev_addr);
+ mod_timer(&hsr->prune_proxy_timer,
+ jiffies + msecs_to_jiffies(PRUNE_PROXY_PERIOD));
+ }
+
+ hsr_debugfs_init(hsr, hsr_dev);
mod_timer(&hsr->prune_timer, jiffies + msecs_to_jiffies(PRUNE_PERIOD));
return 0;
-fail:
- hsr_for_each_port(hsr, port)
- hsr_del_port(port);
+err_unregister:
+ hsr_del_ports(hsr);
+err_add_master:
+ hsr_del_self_node(hsr);
+ if (unregister)
+ unregister_netdevice(hsr_dev);
return res;
}
diff --git a/net/hsr/hsr_device.h b/net/hsr/hsr_device.h
index 9975e31bbb82..655284095b78 100644
--- a/net/hsr/hsr_device.h
+++ b/net/hsr/hsr_device.h
@@ -1,12 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2011-2014 Autronica Fire and Security AS
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
* Author(s):
* 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * include file for HSR and PRP.
*/
#ifndef __HSR_DEVICE_H
@@ -15,11 +13,11 @@
#include <linux/netdevice.h>
#include "hsr_main.h"
+void hsr_del_ports(struct hsr_priv *hsr);
void hsr_dev_setup(struct net_device *dev);
int hsr_dev_finalize(struct net_device *hsr_dev, struct net_device *slave[2],
- unsigned char multicast_spec, u8 protocol_version);
+ struct net_device *interlink, unsigned char multicast_spec,
+ u8 protocol_version, struct netlink_ext_ack *extack);
void hsr_check_carrier_and_operstate(struct hsr_priv *hsr);
-bool is_hsr_master(struct net_device *dev);
int hsr_get_max_mtu(struct hsr_priv *hsr);
-
#endif /* __HSR_DEVICE_H */
diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
index 04b5450c5a55..339f0d220212 100644
--- a/net/hsr/hsr_forward.c
+++ b/net/hsr/hsr_forward.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/* Copyright 2011-2014 Autronica Fire and Security AS
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
* Author(s):
* 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * Frame router for HSR and PRP.
*/
#include "hsr_forward.h"
@@ -17,22 +15,8 @@
#include "hsr_main.h"
#include "hsr_framereg.h"
-
struct hsr_node;
-struct hsr_frame_info {
- struct sk_buff *skb_std;
- struct sk_buff *skb_hsr;
- struct hsr_port *port_rcv;
- struct hsr_node *node_src;
- u16 sequence_nr;
- bool is_supervision;
- bool is_vlan;
- bool is_local_dest;
- bool is_local_exclusive;
-};
-
-
/* The uses I can see for these HSR supervision frames are:
* 1) Use the frames that are sent after node initialization ("HSR_TLV.Type =
* 22") to reset any sequence_nr counters belonging to that node. Useful if
@@ -50,48 +34,120 @@ struct hsr_frame_info {
*/
static bool is_supervision_frame(struct hsr_priv *hsr, struct sk_buff *skb)
{
- struct ethhdr *ethHdr;
- struct hsr_sup_tag *hsrSupTag;
- struct hsrv1_ethhdr_sp *hsrV1Hdr;
+ struct ethhdr *eth_hdr;
+ struct hsr_sup_tag *hsr_sup_tag;
+ struct hsrv1_ethhdr_sp *hsr_V1_hdr;
+ struct hsr_sup_tlv *hsr_sup_tlv;
+ u16 total_length = 0;
WARN_ON_ONCE(!skb_mac_header_was_set(skb));
- ethHdr = (struct ethhdr *) skb_mac_header(skb);
+ eth_hdr = (struct ethhdr *)skb_mac_header(skb);
/* Correct addr? */
- if (!ether_addr_equal(ethHdr->h_dest,
+ if (!ether_addr_equal(eth_hdr->h_dest,
hsr->sup_multicast_addr))
return false;
/* Correct ether type?. */
- if (!(ethHdr->h_proto == htons(ETH_P_PRP)
- || ethHdr->h_proto == htons(ETH_P_HSR)))
+ if (!(eth_hdr->h_proto == htons(ETH_P_PRP) ||
+ eth_hdr->h_proto == htons(ETH_P_HSR)))
return false;
/* Get the supervision header from correct location. */
- if (ethHdr->h_proto == htons(ETH_P_HSR)) { /* Okay HSRv1. */
- hsrV1Hdr = (struct hsrv1_ethhdr_sp *) skb_mac_header(skb);
- if (hsrV1Hdr->hsr.encap_proto != htons(ETH_P_PRP))
+ if (eth_hdr->h_proto == htons(ETH_P_HSR)) { /* Okay HSRv1. */
+ total_length = sizeof(struct hsrv1_ethhdr_sp);
+ if (!pskb_may_pull(skb, total_length))
return false;
- hsrSupTag = &hsrV1Hdr->hsr_sup;
+ hsr_V1_hdr = (struct hsrv1_ethhdr_sp *)skb_mac_header(skb);
+ if (hsr_V1_hdr->hsr.encap_proto != htons(ETH_P_PRP))
+ return false;
+
+ hsr_sup_tag = &hsr_V1_hdr->hsr_sup;
} else {
- hsrSupTag = &((struct hsrv0_ethhdr_sp *) skb_mac_header(skb))->hsr_sup;
+ total_length = sizeof(struct hsrv0_ethhdr_sp);
+ if (!pskb_may_pull(skb, total_length))
+ return false;
+
+ hsr_sup_tag =
+ &((struct hsrv0_ethhdr_sp *)skb_mac_header(skb))->hsr_sup;
}
- if ((hsrSupTag->HSR_TLV_Type != HSR_TLV_ANNOUNCE) &&
- (hsrSupTag->HSR_TLV_Type != HSR_TLV_LIFE_CHECK))
+ if (hsr_sup_tag->tlv.HSR_TLV_type != HSR_TLV_ANNOUNCE &&
+ hsr_sup_tag->tlv.HSR_TLV_type != HSR_TLV_LIFE_CHECK &&
+ hsr_sup_tag->tlv.HSR_TLV_type != PRP_TLV_LIFE_CHECK_DD &&
+ hsr_sup_tag->tlv.HSR_TLV_type != PRP_TLV_LIFE_CHECK_DA)
return false;
- if ((hsrSupTag->HSR_TLV_Length != 12) &&
- (hsrSupTag->HSR_TLV_Length !=
- sizeof(struct hsr_sup_payload)))
+ if (hsr_sup_tag->tlv.HSR_TLV_length != 12 &&
+ hsr_sup_tag->tlv.HSR_TLV_length != sizeof(struct hsr_sup_payload))
+ return false;
+
+ /* Get next tlv */
+ total_length += hsr_sup_tag->tlv.HSR_TLV_length;
+ if (!pskb_may_pull(skb, total_length))
+ return false;
+ skb_pull(skb, total_length);
+ hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data;
+ skb_push(skb, total_length);
+
+ /* if this is a redbox supervision frame we need to verify
+ * that more data is available
+ */
+ if (hsr_sup_tlv->HSR_TLV_type == PRP_TLV_REDBOX_MAC) {
+ /* tlv length must be a length of a mac address */
+ if (hsr_sup_tlv->HSR_TLV_length != sizeof(struct hsr_sup_payload))
+ return false;
+
+ /* make sure another tlv follows */
+ total_length += sizeof(struct hsr_sup_tlv) + hsr_sup_tlv->HSR_TLV_length;
+ if (!pskb_may_pull(skb, total_length))
+ return false;
+
+ /* get next tlv */
+ skb_pull(skb, total_length);
+ hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data;
+ skb_push(skb, total_length);
+ }
+
+ /* end of tlvs must follow at the end */
+ if (hsr_sup_tlv->HSR_TLV_type == HSR_TLV_EOT &&
+ hsr_sup_tlv->HSR_TLV_length != 0)
return false;
return true;
}
+static bool is_proxy_supervision_frame(struct hsr_priv *hsr,
+ struct sk_buff *skb)
+{
+ struct hsr_sup_payload *payload;
+ struct ethhdr *eth_hdr;
+ u16 total_length = 0;
+
+ eth_hdr = (struct ethhdr *)skb_mac_header(skb);
+
+ /* Get the HSR protocol revision. */
+ if (eth_hdr->h_proto == htons(ETH_P_HSR))
+ total_length = sizeof(struct hsrv1_ethhdr_sp);
+ else
+ total_length = sizeof(struct hsrv0_ethhdr_sp);
+
+ if (!pskb_may_pull(skb, total_length + sizeof(struct hsr_sup_payload)))
+ return false;
+
+ skb_pull(skb, total_length);
+ payload = (struct hsr_sup_payload *)skb->data;
+ skb_push(skb, total_length);
+
+ /* For RedBox (HSR-SAN) check if we have received the supervision
+ * frame with MAC addresses from own ProxyNodeTable.
+ */
+ return hsr_is_node_in_db(&hsr->proxy_node_db,
+ payload->macaddress_A);
+}
-static struct sk_buff *create_stripped_skb(struct sk_buff *skb_in,
- struct hsr_frame_info *frame)
+static struct sk_buff *create_stripped_skb_hsr(struct sk_buff *skb_in,
+ struct hsr_frame_info *frame)
{
struct sk_buff *skb;
int copylen;
@@ -100,7 +156,7 @@ static struct sk_buff *create_stripped_skb(struct sk_buff *skb_in,
skb_pull(skb_in, HSR_HLEN);
skb = __pskb_copy(skb_in, skb_headroom(skb_in) - HSR_HLEN, GFP_ATOMIC);
skb_push(skb_in, HSR_HLEN);
- if (skb == NULL)
+ if (!skb)
return NULL;
skb_reset_mac_header(skb);
@@ -108,7 +164,7 @@ static struct sk_buff *create_stripped_skb(struct sk_buff *skb_in,
if (skb->ip_summed == CHECKSUM_PARTIAL)
skb->csum_start -= HSR_HLEN;
- copylen = 2*ETH_ALEN;
+ copylen = 2 * ETH_ALEN;
if (frame->is_vlan)
copylen += VLAN_HLEN;
src = skb_mac_header(skb_in);
@@ -119,52 +175,179 @@ static struct sk_buff *create_stripped_skb(struct sk_buff *skb_in,
return skb;
}
-static struct sk_buff *frame_get_stripped_skb(struct hsr_frame_info *frame,
- struct hsr_port *port)
+struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port)
{
- if (!frame->skb_std)
- frame->skb_std = create_stripped_skb(frame->skb_hsr, frame);
+ if (!frame->skb_std) {
+ if (frame->skb_hsr)
+ frame->skb_std =
+ create_stripped_skb_hsr(frame->skb_hsr, frame);
+ else
+ netdev_warn_once(port->dev,
+ "Unexpected frame received in hsr_get_untagged_frame()\n");
+
+ if (!frame->skb_std)
+ return NULL;
+ }
+
return skb_clone(frame->skb_std, GFP_ATOMIC);
}
+struct sk_buff *prp_get_untagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port)
+{
+ if (!frame->skb_std) {
+ if (frame->skb_prp) {
+ /* trim the skb to len - HSR_HLEN to exclude the RCT */
+ skb_trim(frame->skb_prp,
+ frame->skb_prp->len - HSR_HLEN);
+ frame->skb_std =
+ __pskb_copy(frame->skb_prp,
+ skb_headroom(frame->skb_prp),
+ GFP_ATOMIC);
+ } else {
+ /* Unexpected */
+ WARN_ONCE(1, "%s:%d: Unexpected frame received (port_src %s)\n",
+ __FILE__, __LINE__, port->dev->name);
+ return NULL;
+ }
+ }
+
+ return skb_clone(frame->skb_std, GFP_ATOMIC);
+}
-static void hsr_fill_tag(struct sk_buff *skb, struct hsr_frame_info *frame,
- struct hsr_port *port, u8 protoVersion)
+static void prp_set_lan_id(struct prp_rct *trailer,
+ struct hsr_port *port)
{
- struct hsr_ethhdr *hsr_ethhdr;
int lane_id;
- int lsdu_size;
if (port->type == HSR_PT_SLAVE_A)
lane_id = 0;
else
lane_id = 1;
+ /* Add net_id in the upper 3 bits of lane_id */
+ lane_id |= port->hsr->net_id;
+ set_prp_lan_id(trailer, lane_id);
+}
+
+/* Tailroom for PRP rct should have been created before calling this */
+static struct sk_buff *prp_fill_rct(struct sk_buff *skb,
+ struct hsr_frame_info *frame,
+ struct hsr_port *port)
+{
+ struct prp_rct *trailer;
+ int min_size = ETH_ZLEN;
+ int lsdu_size;
+
+ if (!skb)
+ return skb;
+
+ if (frame->is_vlan)
+ min_size = VLAN_ETH_ZLEN;
+
+ if (skb_put_padto(skb, min_size))
+ return NULL;
+
+ trailer = (struct prp_rct *)skb_put(skb, HSR_HLEN);
lsdu_size = skb->len - 14;
if (frame->is_vlan)
lsdu_size -= 4;
+ prp_set_lan_id(trailer, port);
+ set_prp_LSDU_size(trailer, lsdu_size);
+ trailer->sequence_nr = htons(frame->sequence_nr);
+ trailer->PRP_suffix = htons(ETH_P_PRP);
+ skb->protocol = eth_hdr(skb)->h_proto;
- hsr_ethhdr = (struct hsr_ethhdr *) skb_mac_header(skb);
+ return skb;
+}
- set_hsr_tag_path(&hsr_ethhdr->hsr_tag, lane_id);
+static void hsr_set_path_id(struct hsr_frame_info *frame,
+ struct hsr_ethhdr *hsr_ethhdr,
+ struct hsr_port *port)
+{
+ int path_id;
+
+ if (port->hsr->prot_version) {
+ if (port->type == HSR_PT_SLAVE_A)
+ path_id = 0;
+ else
+ path_id = 1;
+ } else {
+ if (frame->is_supervision)
+ path_id = 0xf;
+ else
+ path_id = 1;
+ }
+
+ set_hsr_tag_path(&hsr_ethhdr->hsr_tag, path_id);
+}
+
+static struct sk_buff *hsr_fill_tag(struct sk_buff *skb,
+ struct hsr_frame_info *frame,
+ struct hsr_port *port, u8 proto_version)
+{
+ struct hsr_ethhdr *hsr_ethhdr;
+ unsigned char *pc;
+ int lsdu_size;
+
+ /* pad to minimum packet size which is 60 + 6 (HSR tag) */
+ if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN))
+ return NULL;
+
+ lsdu_size = skb->len - 14;
+ if (frame->is_vlan)
+ lsdu_size -= 4;
+
+ pc = skb_mac_header(skb);
+ if (frame->is_vlan)
+ /* This 4-byte shift (size of a vlan tag) does not
+ * mean that the ethhdr starts there. But rather it
+ * provides the proper environment for accessing
+ * the fields, such as hsr_tag etc., just like
+ * when the vlan tag is not there. This is because
+ * the hsr tag is after the vlan tag.
+ */
+ hsr_ethhdr = (struct hsr_ethhdr *)(pc + VLAN_HLEN);
+ else
+ hsr_ethhdr = (struct hsr_ethhdr *)pc;
+
+ hsr_set_path_id(frame, hsr_ethhdr, port);
set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size);
hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr);
hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto;
- hsr_ethhdr->ethhdr.h_proto = htons(protoVersion ?
+ hsr_ethhdr->ethhdr.h_proto = htons(proto_version ?
ETH_P_HSR : ETH_P_PRP);
+ skb->protocol = hsr_ethhdr->ethhdr.h_proto;
+
+ return skb;
}
-static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o,
- struct hsr_frame_info *frame,
- struct hsr_port *port)
+/* If the original frame was an HSR tagged frame, just clone it to be sent
+ * unchanged. Otherwise, create a private frame especially tagged for 'port'.
+ */
+struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port)
{
- int movelen;
unsigned char *dst, *src;
struct sk_buff *skb;
+ int movelen;
+
+ if (frame->skb_hsr) {
+ struct hsr_ethhdr *hsr_ethhdr =
+ (struct hsr_ethhdr *)skb_mac_header(frame->skb_hsr);
+
+ /* set the path id properly */
+ hsr_set_path_id(frame, hsr_ethhdr, port);
+ return skb_clone(frame->skb_hsr, GFP_ATOMIC);
+ } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) {
+ return skb_clone(frame->skb_std, GFP_ATOMIC);
+ }
/* Create the new skb with enough headroom to fit the HSR tag */
- skb = __pskb_copy(skb_o, skb_headroom(skb_o) + HSR_HLEN, GFP_ATOMIC);
- if (skb == NULL)
+ skb = __pskb_copy(frame->skb_std,
+ skb_headroom(frame->skb_std) + HSR_HLEN, GFP_ATOMIC);
+ if (!skb)
return NULL;
skb_reset_mac_header(skb);
@@ -180,44 +363,53 @@ static struct sk_buff *create_tagged_skb(struct sk_buff *skb_o,
memmove(dst, src, movelen);
skb_reset_mac_header(skb);
- hsr_fill_tag(skb, frame, port, port->hsr->protVersion);
-
- return skb;
+ /* skb_put_padto() frees the skb on error and hsr_fill_tag() returns
+ * NULL in that case
+ */
+ return hsr_fill_tag(skb, frame, port, port->hsr->prot_version);
}
-/* If the original frame was an HSR tagged frame, just clone it to be sent
- * unchanged. Otherwise, create a private frame especially tagged for 'port'.
- */
-static struct sk_buff *frame_get_tagged_skb(struct hsr_frame_info *frame,
- struct hsr_port *port)
+struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port)
{
- if (frame->skb_hsr)
- return skb_clone(frame->skb_hsr, GFP_ATOMIC);
+ struct sk_buff *skb;
- if ((port->type != HSR_PT_SLAVE_A) && (port->type != HSR_PT_SLAVE_B)) {
- WARN_ONCE(1, "HSR: Bug: trying to create a tagged frame for a non-ring port");
- return NULL;
+ if (frame->skb_prp) {
+ struct prp_rct *trailer = skb_get_PRP_rct(frame->skb_prp);
+
+ if (trailer) {
+ prp_set_lan_id(trailer, port);
+ } else {
+ WARN_ONCE(!trailer, "errored PRP skb");
+ return NULL;
+ }
+ return skb_clone(frame->skb_prp, GFP_ATOMIC);
+ } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) {
+ return skb_clone(frame->skb_std, GFP_ATOMIC);
}
- return create_tagged_skb(frame->skb_std, frame, port);
+ skb = skb_copy_expand(frame->skb_std, skb_headroom(frame->skb_std),
+ skb_tailroom(frame->skb_std) + HSR_HLEN,
+ GFP_ATOMIC);
+ return prp_fill_rct(skb, frame, port);
}
-
static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev,
struct hsr_node *node_src)
{
bool was_multicast_frame;
- int res;
+ int res, recv_len;
was_multicast_frame = (skb->pkt_type == PACKET_MULTICAST);
hsr_addr_subst_source(node_src, skb);
skb_pull(skb, ETH_HLEN);
+ recv_len = skb->len;
res = netif_rx(skb);
if (res == NET_RX_DROP) {
dev->stats.rx_dropped++;
} else {
dev->stats.rx_packets++;
- dev->stats.rx_bytes += skb->len;
+ dev->stats.rx_bytes += recv_len;
if (was_multicast_frame)
dev->stats.multicast++;
}
@@ -234,13 +426,86 @@ static int hsr_xmit(struct sk_buff *skb, struct hsr_port *port,
*/
ether_addr_copy(eth_hdr(skb)->h_source, port->dev->dev_addr);
}
+
+ /* When HSR node is used as RedBox - the frame received from HSR ring
+ * requires source MAC address (SA) replacement to one which can be
+ * recognized by SAN devices (otherwise, frames are dropped by switch)
+ */
+ if (port->type == HSR_PT_INTERLINK)
+ ether_addr_copy(eth_hdr(skb)->h_source,
+ port->hsr->macaddress_redbox);
+
return dev_queue_xmit(skb);
}
+bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port)
+{
+ return ((frame->port_rcv->type == HSR_PT_SLAVE_A &&
+ port->type == HSR_PT_SLAVE_B) ||
+ (frame->port_rcv->type == HSR_PT_SLAVE_B &&
+ port->type == HSR_PT_SLAVE_A));
+}
+
+bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port)
+{
+ struct sk_buff *skb;
+
+ if (port->dev->features & NETIF_F_HW_HSR_FWD)
+ return prp_drop_frame(frame, port);
+
+ /* RedBox-specific frame dropping policies
+ *
+ * Do not send HSR supervisory frames to SAN devices
+ */
+ if (frame->is_supervision && port->type == HSR_PT_INTERLINK)
+ return true;
+
+ /* Do not forward to other HSR port (A or B) unicast frames which
+ * are addressed to interlink port (and are in the ProxyNodeTable).
+ */
+ skb = frame->skb_hsr;
+ if (skb && prp_drop_frame(frame, port) &&
+ is_unicast_ether_addr(eth_hdr(skb)->h_dest) &&
+ hsr_is_node_in_db(&port->hsr->proxy_node_db,
+ eth_hdr(skb)->h_dest)) {
+ return true;
+ }
+
+ /* Do not forward to port C (Interlink) frames received on ports A
+ * and B if DA is in NodeTable.
+ */
+ if ((frame->port_rcv->type == HSR_PT_SLAVE_A ||
+ frame->port_rcv->type == HSR_PT_SLAVE_B) &&
+ port->type == HSR_PT_INTERLINK) {
+ skb = frame->skb_hsr;
+ if (skb && is_unicast_ether_addr(eth_hdr(skb)->h_dest) &&
+ hsr_is_node_in_db(&port->hsr->node_db,
+ eth_hdr(skb)->h_dest)) {
+ return true;
+ }
+ }
+
+ /* Do not forward to port A and B unicast frames received on the
+ * interlink port if they are addressed to one of the nodes registered
+ * in the ProxyNodeTable.
+ */
+ if ((port->type == HSR_PT_SLAVE_A || port->type == HSR_PT_SLAVE_B) &&
+ frame->port_rcv->type == HSR_PT_INTERLINK) {
+ skb = frame->skb_std;
+ if (skb && is_unicast_ether_addr(eth_hdr(skb)->h_dest) &&
+ hsr_is_node_in_db(&port->hsr->proxy_node_db,
+ eth_hdr(skb)->h_dest)) {
+ return true;
+ }
+ }
+
+ return false;
+}
/* Forward the frame through all devices except:
* - Back through the receiving device
* - If it's a HSR frame: through a device where it has passed before
+ * - If it's a PRP frame: through another PRP slave device (no bridge)
* - To the local HSR master only if the frame is directly addressed to it, or
* a non-supervision multicast or broadcast frame.
*
@@ -253,50 +518,72 @@ static void hsr_forward_do(struct hsr_frame_info *frame)
{
struct hsr_port *port;
struct sk_buff *skb;
+ bool sent = false;
hsr_for_each_port(frame->port_rcv->hsr, port) {
+ struct hsr_priv *hsr = port->hsr;
/* Don't send frame back the way it came */
if (port == frame->port_rcv)
continue;
/* Don't deliver locally unless we should */
- if ((port->type == HSR_PT_MASTER) && !frame->is_local_dest)
+ if (port->type == HSR_PT_MASTER && !frame->is_local_dest)
continue;
/* Deliver frames directly addressed to us to master only */
- if ((port->type != HSR_PT_MASTER) && frame->is_local_exclusive)
+ if (port->type != HSR_PT_MASTER && frame->is_local_exclusive)
continue;
- /* Don't send frame over port where it has been sent before */
- if (hsr_register_frame_out(port, frame->node_src,
- frame->sequence_nr))
+ /* If hardware duplicate generation is enabled, only send out
+ * one port.
+ */
+ if ((port->dev->features & NETIF_F_HW_HSR_DUP) && sent)
continue;
- if (frame->is_supervision && (port->type == HSR_PT_MASTER)) {
- hsr_handle_sup_frame(frame->skb_hsr,
- frame->node_src,
- frame->port_rcv);
+ /* Don't send frame over port where it has been sent before.
+ * This check is skipped for frames that come from a SAN.
+ */
+ if (!frame->is_from_san &&
+ hsr->proto_ops->register_frame_out &&
+ hsr->proto_ops->register_frame_out(port, frame))
+ continue;
+
+ if (frame->is_supervision && port->type == HSR_PT_MASTER &&
+ !frame->is_proxy_supervision) {
+ hsr_handle_sup_frame(frame);
continue;
}
- if (port->type != HSR_PT_MASTER)
- skb = frame_get_tagged_skb(frame, port);
+ /* Check if frame is to be dropped. Eg. for PRP no forward
+ * between ports, or sending HSR supervision to RedBox.
+ */
+ if (hsr->proto_ops->drop_frame &&
+ hsr->proto_ops->drop_frame(frame, port))
+ continue;
+
+ if (port->type == HSR_PT_SLAVE_A ||
+ port->type == HSR_PT_SLAVE_B)
+ skb = hsr->proto_ops->create_tagged_frame(frame, port);
else
- skb = frame_get_stripped_skb(frame, port);
- if (skb == NULL) {
- /* FIXME: Record the dropped frame? */
+ skb = hsr->proto_ops->get_untagged_frame(frame, port);
+
+ if (!skb) {
+ frame->port_rcv->dev->stats.rx_dropped++;
continue;
}
skb->dev = port->dev;
- if (port->type == HSR_PT_MASTER)
+ if (port->type == HSR_PT_MASTER) {
hsr_deliver_master(skb, port->dev, frame->node_src);
- else
- hsr_xmit(skb, port, frame);
+ } else {
+ if (!hsr_xmit(skb, port, frame))
+ if (port->type == HSR_PT_SLAVE_A ||
+ port->type == HSR_PT_SLAVE_B)
+ sent = true;
+ }
}
}
-
static void check_local_dest(struct hsr_priv *hsr, struct sk_buff *skb,
struct hsr_frame_info *frame)
{
@@ -307,50 +594,135 @@ static void check_local_dest(struct hsr_priv *hsr, struct sk_buff *skb,
frame->is_local_exclusive = false;
}
- if ((skb->pkt_type == PACKET_HOST) ||
- (skb->pkt_type == PACKET_MULTICAST) ||
- (skb->pkt_type == PACKET_BROADCAST)) {
+ if (skb->pkt_type == PACKET_HOST ||
+ skb->pkt_type == PACKET_MULTICAST ||
+ skb->pkt_type == PACKET_BROADCAST) {
frame->is_local_dest = true;
} else {
frame->is_local_dest = false;
}
}
+static void handle_std_frame(struct sk_buff *skb,
+ struct hsr_frame_info *frame)
+{
+ struct hsr_port *port = frame->port_rcv;
+ struct hsr_priv *hsr = port->hsr;
+
+ frame->skb_hsr = NULL;
+ frame->skb_prp = NULL;
+ frame->skb_std = skb;
+
+ if (port->type != HSR_PT_MASTER)
+ frame->is_from_san = true;
+
+ if (port->type == HSR_PT_MASTER ||
+ port->type == HSR_PT_INTERLINK) {
+ /* Sequence nr for the master/interlink node */
+ lockdep_assert_held(&hsr->seqnr_lock);
+ frame->sequence_nr = hsr->sequence_nr;
+ hsr->sequence_nr++;
+ }
+}
-static int hsr_fill_frame_info(struct hsr_frame_info *frame,
- struct sk_buff *skb, struct hsr_port *port)
+int hsr_fill_frame_info(__be16 proto, struct sk_buff *skb,
+ struct hsr_frame_info *frame)
{
+ struct hsr_port *port = frame->port_rcv;
+ struct hsr_priv *hsr = port->hsr;
+
+ /* HSRv0 supervisory frames double as a tag so treat them as tagged. */
+ if ((!hsr->prot_version && proto == htons(ETH_P_PRP)) ||
+ proto == htons(ETH_P_HSR)) {
+ /* Check if skb contains hsr_ethhdr */
+ if (skb->mac_len < sizeof(struct hsr_ethhdr))
+ return -EINVAL;
+
+ /* HSR tagged frame :- Data or Supervision */
+ frame->skb_std = NULL;
+ frame->skb_prp = NULL;
+ frame->skb_hsr = skb;
+ frame->sequence_nr = hsr_get_skb_sequence_nr(skb);
+ return 0;
+ }
+
+ /* Standard frame or PRP from master port */
+ handle_std_frame(skb, frame);
+
+ return 0;
+}
+
+int prp_fill_frame_info(__be16 proto, struct sk_buff *skb,
+ struct hsr_frame_info *frame)
+{
+ /* Supervision frame */
+ struct prp_rct *rct = skb_get_PRP_rct(skb);
+
+ if (rct &&
+ prp_check_lsdu_size(skb, rct, frame->is_supervision)) {
+ frame->skb_hsr = NULL;
+ frame->skb_std = NULL;
+ frame->skb_prp = skb;
+ frame->sequence_nr = prp_get_skb_sequence_nr(rct);
+ return 0;
+ }
+ handle_std_frame(skb, frame);
+
+ return 0;
+}
+
+static int fill_frame_info(struct hsr_frame_info *frame,
+ struct sk_buff *skb, struct hsr_port *port)
+{
+ struct hsr_priv *hsr = port->hsr;
+ struct hsr_vlan_ethhdr *vlan_hdr;
+ struct list_head *n_db;
struct ethhdr *ethhdr;
- unsigned long irqflags;
+ __be16 proto;
+ int ret;
+ /* Check if skb contains ethhdr */
+ if (skb->mac_len < sizeof(struct ethhdr))
+ return -EINVAL;
+
+ memset(frame, 0, sizeof(*frame));
frame->is_supervision = is_supervision_frame(port->hsr, skb);
- frame->node_src = hsr_get_node(port, skb, frame->is_supervision);
- if (frame->node_src == NULL)
+ if (frame->is_supervision && hsr->redbox)
+ frame->is_proxy_supervision =
+ is_proxy_supervision_frame(port->hsr, skb);
+
+ n_db = &hsr->node_db;
+ if (port->type == HSR_PT_INTERLINK)
+ n_db = &hsr->proxy_node_db;
+
+ frame->node_src = hsr_get_node(port, n_db, skb,
+ frame->is_supervision, port->type);
+ if (!frame->node_src)
return -1; /* Unknown node and !is_supervision, or no mem */
- ethhdr = (struct ethhdr *) skb_mac_header(skb);
+ ethhdr = (struct ethhdr *)skb_mac_header(skb);
frame->is_vlan = false;
- if (ethhdr->h_proto == htons(ETH_P_8021Q)) {
+ proto = ethhdr->h_proto;
+
+ if (proto == htons(ETH_P_8021Q))
frame->is_vlan = true;
- /* FIXME: */
- WARN_ONCE(1, "HSR: VLAN not yet supported");
- }
- if (ethhdr->h_proto == htons(ETH_P_PRP)
- || ethhdr->h_proto == htons(ETH_P_HSR)) {
- frame->skb_std = NULL;
- frame->skb_hsr = skb;
- frame->sequence_nr = hsr_get_skb_sequence_nr(skb);
- } else {
- frame->skb_std = skb;
- frame->skb_hsr = NULL;
- /* Sequence nr for the master node */
- spin_lock_irqsave(&port->hsr->seqnr_lock, irqflags);
- frame->sequence_nr = port->hsr->sequence_nr;
- port->hsr->sequence_nr++;
- spin_unlock_irqrestore(&port->hsr->seqnr_lock, irqflags);
+
+ if (frame->is_vlan) {
+ /* Note: skb->mac_len might be wrong here. */
+ if (!pskb_may_pull(skb,
+ skb_mac_offset(skb) +
+ offsetofend(struct hsr_vlan_ethhdr, vlanhdr)))
+ return -EINVAL;
+ vlan_hdr = (struct hsr_vlan_ethhdr *)skb_mac_header(skb);
+ proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto;
}
+ frame->is_from_san = false;
frame->port_rcv = port;
+ ret = hsr->proto_ops->fill_frame_info(proto, skb, frame);
+ if (ret)
+ return ret;
+
check_local_dest(port->hsr, skb, frame);
return 0;
@@ -361,24 +733,28 @@ void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port)
{
struct hsr_frame_info frame;
- if (skb_mac_header(skb) != skb->data) {
- WARN_ONCE(1, "%s:%d: Malformed frame (port_src %s)\n",
- __FILE__, __LINE__, port->dev->name);
+ rcu_read_lock();
+ if (fill_frame_info(&frame, skb, port) < 0)
goto out_drop;
- }
- if (hsr_fill_frame_info(&frame, skb, port) < 0)
- goto out_drop;
hsr_register_frame_in(frame.node_src, port, frame.sequence_nr);
hsr_forward_do(&frame);
+ rcu_read_unlock();
+ /* Gets called for ingress frames as well as egress from master port.
+ * So check and increment tx stats for master and interlink ports only.
+ */
+ if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) {
+ port->dev->stats.tx_packets++;
+ port->dev->stats.tx_bytes += skb->len;
+ }
- if (frame.skb_hsr != NULL)
- kfree_skb(frame.skb_hsr);
- if (frame.skb_std != NULL)
- kfree_skb(frame.skb_std);
+ kfree_skb(frame.skb_hsr);
+ kfree_skb(frame.skb_prp);
+ kfree_skb(frame.skb_std);
return;
out_drop:
+ rcu_read_unlock();
port->dev->stats.tx_dropped++;
kfree_skb(skb);
}
diff --git a/net/hsr/hsr_forward.h b/net/hsr/hsr_forward.h
index 5c5bc4b6b75f..206636750b30 100644
--- a/net/hsr/hsr_forward.h
+++ b/net/hsr/hsr_forward.h
@@ -1,12 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2011-2014 Autronica Fire and Security AS
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
* Author(s):
* 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * include file for HSR and PRP.
*/
#ifndef __HSR_FORWARD_H
@@ -16,5 +14,18 @@
#include "hsr_main.h"
void hsr_forward_skb(struct sk_buff *skb, struct hsr_port *port);
-
+struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port);
+struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port);
+struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port);
+struct sk_buff *prp_get_untagged_frame(struct hsr_frame_info *frame,
+ struct hsr_port *port);
+bool prp_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port);
+bool hsr_drop_frame(struct hsr_frame_info *frame, struct hsr_port *port);
+int prp_fill_frame_info(__be16 proto, struct sk_buff *skb,
+ struct hsr_frame_info *frame);
+int hsr_fill_frame_info(__be16 proto, struct sk_buff *skb,
+ struct hsr_frame_info *frame);
#endif /* __HSR_FORWARD_H */
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index 286ceb41ac0c..3a2a2fa7a0a3 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -1,10 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/* Copyright 2011-2014 Autronica Fire and Security AS
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
* Author(s):
* 2011-2014 Arvid Brodin, arvid.brodin@alten.se
*
@@ -12,6 +8,7 @@
* interface. A frame is identified by its source MAC address and its HSR
* sequence number. This code keeps track of senders and their sequence numbers
* to allow filtering of duplicate frames, and to detect HSR ring errors.
+ * Same code handles filtering of duplicates for PRP as well.
*/
#include <linux/if_ether.h>
@@ -22,23 +19,6 @@
#include "hsr_framereg.h"
#include "hsr_netlink.h"
-
-struct hsr_node {
- struct list_head mac_list;
- unsigned char MacAddressA[ETH_ALEN];
- unsigned char MacAddressB[ETH_ALEN];
- /* Local slave through which AddrB frames are received from this node */
- enum hsr_port_type AddrB_port;
- unsigned long time_in[HSR_PT_PORTS];
- bool time_in_stale[HSR_PT_PORTS];
- u16 seq_out[HSR_PT_PORTS];
- struct rcu_head rcu_head;
-};
-
-
-/* TODO: use hash lists for mac addresses (linux/jhash.h)? */
-
-
/* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b,
* false otherwise.
*/
@@ -47,221 +27,390 @@ static bool seq_nr_after(u16 a, u16 b)
/* Remove inconsistency where
* seq_nr_after(a, b) == seq_nr_before(a, b)
*/
- if ((int) b - a == 32768)
+ if ((int)b - a == 32768)
return false;
- return (((s16) (b - a)) < 0);
+ return (((s16)(b - a)) < 0);
}
+
#define seq_nr_before(a, b) seq_nr_after((b), (a))
-#define seq_nr_after_or_eq(a, b) (!seq_nr_before((a), (b)))
#define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b)))
+#define PRP_DROP_WINDOW_LEN 32768
+bool hsr_addr_is_redbox(struct hsr_priv *hsr, unsigned char *addr)
+{
+ if (!hsr->redbox || !is_valid_ether_addr(hsr->macaddress_redbox))
+ return false;
+
+ return ether_addr_equal(addr, hsr->macaddress_redbox);
+}
bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr)
{
- struct hsr_node *node;
+ struct hsr_self_node *sn;
+ bool ret = false;
- node = list_first_or_null_rcu(&hsr->self_node_db, struct hsr_node,
- mac_list);
- if (!node) {
+ rcu_read_lock();
+ sn = rcu_dereference(hsr->self_node);
+ if (!sn) {
WARN_ONCE(1, "HSR: No self node\n");
- return false;
+ goto out;
}
- if (ether_addr_equal(addr, node->MacAddressA))
- return true;
- if (ether_addr_equal(addr, node->MacAddressB))
- return true;
-
- return false;
+ if (ether_addr_equal(addr, sn->macaddress_A) ||
+ ether_addr_equal(addr, sn->macaddress_B))
+ ret = true;
+out:
+ rcu_read_unlock();
+ return ret;
}
/* Search for mac entry. Caller must hold rcu read lock.
*/
-static struct hsr_node *find_node_by_AddrA(struct list_head *node_db,
- const unsigned char addr[ETH_ALEN])
+static struct hsr_node *find_node_by_addr_A(struct list_head *node_db,
+ const unsigned char addr[ETH_ALEN])
{
struct hsr_node *node;
list_for_each_entry_rcu(node, node_db, mac_list) {
- if (ether_addr_equal(node->MacAddressA, addr))
+ if (ether_addr_equal(node->macaddress_A, addr))
return node;
}
return NULL;
}
+/* Check if node for a given MAC address is already present in data base
+ */
+bool hsr_is_node_in_db(struct list_head *node_db,
+ const unsigned char addr[ETH_ALEN])
+{
+ return !!find_node_by_addr_A(node_db, addr);
+}
-/* Helper for device init; the self_node_db is used in hsr_rcv() to recognize
+/* Helper for device init; the self_node is used in hsr_rcv() to recognize
* frames from self that's been looped over the HSR ring.
*/
-int hsr_create_self_node(struct list_head *self_node_db,
- unsigned char addr_a[ETH_ALEN],
- unsigned char addr_b[ETH_ALEN])
+int hsr_create_self_node(struct hsr_priv *hsr,
+ const unsigned char addr_a[ETH_ALEN],
+ const unsigned char addr_b[ETH_ALEN])
{
- struct hsr_node *node, *oldnode;
+ struct hsr_self_node *sn, *old;
- node = kmalloc(sizeof(*node), GFP_KERNEL);
- if (!node)
+ sn = kmalloc(sizeof(*sn), GFP_KERNEL);
+ if (!sn)
return -ENOMEM;
- ether_addr_copy(node->MacAddressA, addr_a);
- ether_addr_copy(node->MacAddressB, addr_b);
+ ether_addr_copy(sn->macaddress_A, addr_a);
+ ether_addr_copy(sn->macaddress_B, addr_b);
- rcu_read_lock();
- oldnode = list_first_or_null_rcu(self_node_db,
- struct hsr_node, mac_list);
- if (oldnode) {
- list_replace_rcu(&oldnode->mac_list, &node->mac_list);
- rcu_read_unlock();
- synchronize_rcu();
- kfree(oldnode);
- } else {
- rcu_read_unlock();
- list_add_tail_rcu(&node->mac_list, self_node_db);
- }
+ spin_lock_bh(&hsr->list_lock);
+ old = rcu_replace_pointer(hsr->self_node, sn,
+ lockdep_is_held(&hsr->list_lock));
+ spin_unlock_bh(&hsr->list_lock);
+ if (old)
+ kfree_rcu(old, rcu_head);
return 0;
}
+void hsr_del_self_node(struct hsr_priv *hsr)
+{
+ struct hsr_self_node *old;
+
+ spin_lock_bh(&hsr->list_lock);
+ old = rcu_replace_pointer(hsr->self_node, NULL,
+ lockdep_is_held(&hsr->list_lock));
+ spin_unlock_bh(&hsr->list_lock);
+ if (old)
+ kfree_rcu(old, rcu_head);
+}
+
+void hsr_del_nodes(struct list_head *node_db)
+{
+ struct hsr_node *node;
+ struct hsr_node *tmp;
+
+ list_for_each_entry_safe(node, tmp, node_db, mac_list)
+ kfree(node);
+}
-/* Allocate an hsr_node and add it to node_db. 'addr' is the node's AddressA;
+void prp_handle_san_frame(bool san, enum hsr_port_type port,
+ struct hsr_node *node)
+{
+ /* Mark if the SAN node is over LAN_A or LAN_B */
+ if (port == HSR_PT_SLAVE_A) {
+ node->san_a = true;
+ return;
+ }
+
+ if (port == HSR_PT_SLAVE_B)
+ node->san_b = true;
+}
+
+/* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A;
* seq_out is used to initialize filtering of outgoing duplicate frames
* originating from the newly added node.
*/
-struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[],
- u16 seq_out)
+static struct hsr_node *hsr_add_node(struct hsr_priv *hsr,
+ struct list_head *node_db,
+ unsigned char addr[],
+ u16 seq_out, bool san,
+ enum hsr_port_type rx_port)
{
- struct hsr_node *node;
+ struct hsr_node *new_node, *node;
unsigned long now;
int i;
- node = kzalloc(sizeof(*node), GFP_ATOMIC);
- if (!node)
+ new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC);
+ if (!new_node)
return NULL;
- ether_addr_copy(node->MacAddressA, addr);
+ ether_addr_copy(new_node->macaddress_A, addr);
+ spin_lock_init(&new_node->seq_out_lock);
/* We are only interested in time diffs here, so use current jiffies
* as initialization. (0 could trigger an spurious ring error warning).
*/
now = jiffies;
- for (i = 0; i < HSR_PT_PORTS; i++)
- node->time_in[i] = now;
- for (i = 0; i < HSR_PT_PORTS; i++)
- node->seq_out[i] = seq_out;
+ for (i = 0; i < HSR_PT_PORTS; i++) {
+ new_node->time_in[i] = now;
+ new_node->time_out[i] = now;
+ }
+ for (i = 0; i < HSR_PT_PORTS; i++) {
+ new_node->seq_out[i] = seq_out;
+ new_node->seq_expected[i] = seq_out + 1;
+ new_node->seq_start[i] = seq_out + 1;
+ }
- list_add_tail_rcu(&node->mac_list, node_db);
+ if (san && hsr->proto_ops->handle_san_frame)
+ hsr->proto_ops->handle_san_frame(san, rx_port, new_node);
+ spin_lock_bh(&hsr->list_lock);
+ list_for_each_entry_rcu(node, node_db, mac_list,
+ lockdep_is_held(&hsr->list_lock)) {
+ if (ether_addr_equal(node->macaddress_A, addr))
+ goto out;
+ if (ether_addr_equal(node->macaddress_B, addr))
+ goto out;
+ }
+ list_add_tail_rcu(&new_node->mac_list, node_db);
+ spin_unlock_bh(&hsr->list_lock);
+ return new_node;
+out:
+ spin_unlock_bh(&hsr->list_lock);
+ kfree(new_node);
return node;
}
+void prp_update_san_info(struct hsr_node *node, bool is_sup)
+{
+ if (!is_sup)
+ return;
+
+ node->san_a = false;
+ node->san_b = false;
+}
+
/* Get the hsr_node from which 'skb' was sent.
*/
-struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb,
- bool is_sup)
+struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db,
+ struct sk_buff *skb, bool is_sup,
+ enum hsr_port_type rx_port)
{
- struct list_head *node_db = &port->hsr->node_db;
+ struct hsr_priv *hsr = port->hsr;
struct hsr_node *node;
struct ethhdr *ethhdr;
+ struct prp_rct *rct;
+ bool san = false;
u16 seq_out;
if (!skb_mac_header_was_set(skb))
return NULL;
- ethhdr = (struct ethhdr *) skb_mac_header(skb);
+ ethhdr = (struct ethhdr *)skb_mac_header(skb);
list_for_each_entry_rcu(node, node_db, mac_list) {
- if (ether_addr_equal(node->MacAddressA, ethhdr->h_source))
+ if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) {
+ if (hsr->proto_ops->update_san_info)
+ hsr->proto_ops->update_san_info(node, is_sup);
+ return node;
+ }
+ if (ether_addr_equal(node->macaddress_B, ethhdr->h_source)) {
+ if (hsr->proto_ops->update_san_info)
+ hsr->proto_ops->update_san_info(node, is_sup);
return node;
- if (ether_addr_equal(node->MacAddressB, ethhdr->h_source))
+ }
+ }
+
+ /* Check whether the required node is in the proxy nodes table */
+ list_for_each_entry_rcu(node, &hsr->proxy_node_db, mac_list) {
+ if (ether_addr_equal(node->macaddress_A, ethhdr->h_source)) {
+ if (hsr->proto_ops->update_san_info)
+ hsr->proto_ops->update_san_info(node, is_sup);
return node;
+ }
}
- /* Everyone may create a node entry, connected node to a HSR device. */
+ /* Everyone may create a node entry, connected node to a HSR/PRP
+ * device.
+ */
+ if (ethhdr->h_proto == htons(ETH_P_PRP) ||
+ ethhdr->h_proto == htons(ETH_P_HSR)) {
+ /* Check if skb contains hsr_ethhdr */
+ if (skb->mac_len < sizeof(struct hsr_ethhdr))
+ return NULL;
- if (ethhdr->h_proto == htons(ETH_P_PRP)
- || ethhdr->h_proto == htons(ETH_P_HSR)) {
/* Use the existing sequence_nr from the tag as starting point
* for filtering duplicate frames.
*/
seq_out = hsr_get_skb_sequence_nr(skb) - 1;
} else {
- /* this is called also for frames from master port and
- * so warn only for non master ports
- */
- if (port->type != HSR_PT_MASTER)
- WARN_ONCE(1, "%s: Non-HSR frame\n", __func__);
- seq_out = HSR_SEQNR_START;
+ rct = skb_get_PRP_rct(skb);
+ if (rct && prp_check_lsdu_size(skb, rct, is_sup)) {
+ seq_out = prp_get_skb_sequence_nr(rct);
+ } else {
+ if (rx_port != HSR_PT_MASTER)
+ san = true;
+ seq_out = HSR_SEQNR_START;
+ }
}
- return hsr_add_node(node_db, ethhdr->h_source, seq_out);
+ return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out,
+ san, rx_port);
}
-/* Use the Supervision frame's info about an eventual MacAddressB for merging
- * nodes that has previously had their MacAddressB registered as a separate
+/* Use the Supervision frame's info about an eventual macaddress_B for merging
+ * nodes that have previously had their macaddress_B registered as a separate
* node.
*/
-void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr,
- struct hsr_port *port_rcv)
+void hsr_handle_sup_frame(struct hsr_frame_info *frame)
{
- struct ethhdr *ethhdr;
- struct hsr_node *node_real;
+ struct hsr_node *node_curr = frame->node_src;
+ struct hsr_port *port_rcv = frame->port_rcv;
+ struct hsr_priv *hsr = port_rcv->hsr;
struct hsr_sup_payload *hsr_sp;
+ struct hsr_sup_tlv *hsr_sup_tlv;
+ struct hsr_node *node_real;
+ struct sk_buff *skb = NULL;
struct list_head *node_db;
+ struct ethhdr *ethhdr;
int i;
+ unsigned int pull_size = 0;
+ unsigned int total_pull_size = 0;
- ethhdr = (struct ethhdr *) skb_mac_header(skb);
+ /* Either frame->skb_hsr or frame->skb_prp is expected to be
+ * valid here, since a supervision frame always carries protocol
+ * header info; frame->skb_std is only used as a fallback.
+ */
+ if (frame->skb_hsr)
+ skb = frame->skb_hsr;
+ else if (frame->skb_prp)
+ skb = frame->skb_prp;
+ else if (frame->skb_std)
+ skb = frame->skb_std;
+ if (!skb)
+ return;
/* Leave the ethernet header. */
- skb_pull(skb, sizeof(struct ethhdr));
+ pull_size = sizeof(struct ethhdr);
+ skb_pull(skb, pull_size);
+ total_pull_size += pull_size;
+
+ ethhdr = (struct ethhdr *)skb_mac_header(skb);
/* And leave the HSR tag. */
- if (ethhdr->h_proto == htons(ETH_P_HSR))
- skb_pull(skb, sizeof(struct hsr_tag));
+ if (ethhdr->h_proto == htons(ETH_P_HSR)) {
+ pull_size = sizeof(struct hsr_tag);
+ skb_pull(skb, pull_size);
+ total_pull_size += pull_size;
+ }
/* And leave the HSR sup tag. */
- skb_pull(skb, sizeof(struct hsr_sup_tag));
+ pull_size = sizeof(struct hsr_sup_tag);
+ skb_pull(skb, pull_size);
+ total_pull_size += pull_size;
- hsr_sp = (struct hsr_sup_payload *) skb->data;
+ /* get HSR sup payload */
+ hsr_sp = (struct hsr_sup_payload *)skb->data;
- /* Merge node_curr (registered on MacAddressB) into node_real */
+ /* Merge node_curr (registered on macaddress_B) into node_real */
node_db = &port_rcv->hsr->node_db;
- node_real = find_node_by_AddrA(node_db, hsr_sp->MacAddressA);
+ node_real = find_node_by_addr_A(node_db, hsr_sp->macaddress_A);
if (!node_real)
/* No frame received from AddrA of this node yet */
- node_real = hsr_add_node(node_db, hsr_sp->MacAddressA,
- HSR_SEQNR_START - 1);
+ node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A,
+ HSR_SEQNR_START - 1, true,
+ port_rcv->type);
if (!node_real)
goto done; /* No mem */
if (node_real == node_curr)
/* Node has already been merged */
goto done;
- ether_addr_copy(node_real->MacAddressB, ethhdr->h_source);
+ /* Leave the first HSR sup payload. */
+ pull_size = sizeof(struct hsr_sup_payload);
+ skb_pull(skb, pull_size);
+ total_pull_size += pull_size;
+
+ /* Get second supervision tlv */
+ hsr_sup_tlv = (struct hsr_sup_tlv *)skb->data;
+ /* And check if it is a redbox mac TLV */
+ if (hsr_sup_tlv->HSR_TLV_type == PRP_TLV_REDBOX_MAC) {
+ /* We could stop here after pushing hsr_sup_payload,
+ * or proceed and allow macaddress_B for redboxes too.
+ */
+ /* Sanity check length */
+ if (hsr_sup_tlv->HSR_TLV_length != 6)
+ goto done;
+
+ /* Leave the second HSR sup tlv. */
+ pull_size = sizeof(struct hsr_sup_tlv);
+ skb_pull(skb, pull_size);
+ total_pull_size += pull_size;
+
+ /* Get redbox mac address. */
+ hsr_sp = (struct hsr_sup_payload *)skb->data;
+
+ /* Check if redbox mac and node mac are equal. */
+ if (!ether_addr_equal(node_real->macaddress_A, hsr_sp->macaddress_A)) {
+ /* This is a redbox supervision frame for a VDAN! */
+ goto done;
+ }
+ }
+
+ ether_addr_copy(node_real->macaddress_B, ethhdr->h_source);
+ spin_lock_bh(&node_real->seq_out_lock);
for (i = 0; i < HSR_PT_PORTS; i++) {
if (!node_curr->time_in_stale[i] &&
time_after(node_curr->time_in[i], node_real->time_in[i])) {
node_real->time_in[i] = node_curr->time_in[i];
- node_real->time_in_stale[i] = node_curr->time_in_stale[i];
+ node_real->time_in_stale[i] =
+ node_curr->time_in_stale[i];
}
if (seq_nr_after(node_curr->seq_out[i], node_real->seq_out[i]))
node_real->seq_out[i] = node_curr->seq_out[i];
}
- node_real->AddrB_port = port_rcv->type;
-
- list_del_rcu(&node_curr->mac_list);
- kfree_rcu(node_curr, rcu_head);
+ spin_unlock_bh(&node_real->seq_out_lock);
+ node_real->addr_B_port = port_rcv->type;
+
+ spin_lock_bh(&hsr->list_lock);
+ if (!node_curr->removed) {
+ list_del_rcu(&node_curr->mac_list);
+ node_curr->removed = true;
+ kfree_rcu(node_curr, rcu_head);
+ }
+ spin_unlock_bh(&hsr->list_lock);
done:
- skb_push(skb, sizeof(struct hsrv1_ethhdr_sp));
+ /* Push back here */
+ skb_push(skb, total_pull_size);
}
-
/* 'skb' is a frame meant for this host, that is to be passed to upper layers.
*
* If the frame was sent by a node's B interface, replace the source
- * address with that node's "official" address (MacAddressA) so that upper
+ * address with that node's "official" address (macaddress_A) so that upper
* layers recognize where it came from.
*/
void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb)
@@ -271,7 +420,7 @@ void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb)
return;
}
- memcpy(&eth_hdr(skb)->h_source, node->MacAddressA, ETH_ALEN);
+ memcpy(&eth_hdr(skb)->h_source, node->macaddress_A, ETH_ALEN);
}
/* 'skb' is a frame meant for another host.
@@ -296,18 +445,24 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb,
if (!is_unicast_ether_addr(eth_hdr(skb)->h_dest))
return;
- node_dst = find_node_by_AddrA(&port->hsr->node_db, eth_hdr(skb)->h_dest);
+ node_dst = find_node_by_addr_A(&port->hsr->node_db,
+ eth_hdr(skb)->h_dest);
+ if (!node_dst && port->hsr->redbox)
+ node_dst = find_node_by_addr_A(&port->hsr->proxy_node_db,
+ eth_hdr(skb)->h_dest);
+
if (!node_dst) {
- WARN_ONCE(1, "%s: Unknown node\n", __func__);
+ if (port->hsr->prot_version != PRP_V1 && net_ratelimit())
+ netdev_err(skb->dev, "%s: Unknown node\n", __func__);
return;
}
- if (port->type != node_dst->AddrB_port)
+ if (port->type != node_dst->addr_B_port)
return;
- ether_addr_copy(eth_hdr(skb)->h_dest, node_dst->MacAddressB);
+ if (is_valid_ether_addr(node_dst->macaddress_B))
+ ether_addr_copy(eth_hdr(skb)->h_dest, node_dst->macaddress_B);
}
-
void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port,
u16 sequence_nr)
{
@@ -315,7 +470,8 @@ void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port,
* ensures entries of restarted nodes gets pruned so that they can
* re-register and resume communications.
*/
- if (seq_nr_before(sequence_nr, node->seq_out[port->type]))
+ if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) &&
+ seq_nr_before(sequence_nr, node->seq_out[port->type]))
return;
node->time_in[port->type] = jiffies;
@@ -330,16 +486,111 @@ void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port,
* 0 otherwise, or
* negative error code on error
*/
-int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node,
- u16 sequence_nr)
+int hsr_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame)
{
- if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]))
+ struct hsr_node *node = frame->node_src;
+ u16 sequence_nr = frame->sequence_nr;
+
+ spin_lock_bh(&node->seq_out_lock);
+ if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]) &&
+ time_is_after_jiffies(node->time_out[port->type] +
+ msecs_to_jiffies(HSR_ENTRY_FORGET_TIME))) {
+ spin_unlock_bh(&node->seq_out_lock);
return 1;
+ }
+ node->time_out[port->type] = jiffies;
node->seq_out[port->type] = sequence_nr;
+ spin_unlock_bh(&node->seq_out_lock);
return 0;
}
+/* Adaptation of the PRP duplicate discard algorithm described in wireshark
+ * wiki (https://wiki.wireshark.org/PRP)
+ *
+ * A drop window is maintained for both LANs with start sequence set to the
+ * first sequence accepted on the LAN that has not been seen on the other LAN,
+ * and expected sequence set to the latest received sequence number plus one.
+ *
+ * When a frame is received on either LAN it is compared against the received
+ * frames on the other LAN. If it is outside the drop window of the other LAN
+ * the frame is accepted and the drop window is updated.
+ * The drop window for the other LAN is reset.
+ *
+ * 'port' is the outgoing interface
+ * 'frame' is the frame to be sent
+ *
+ * Return:
+ * 1 if frame can be shown to have been sent recently on this interface,
+ * 0 otherwise
+ */
+int prp_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame)
+{
+ enum hsr_port_type other_port;
+ enum hsr_port_type rcv_port;
+ struct hsr_node *node;
+ u16 sequence_diff;
+ u16 sequence_exp;
+ u16 sequence_nr;
+
+ /* out-going frames are always in order
+ * and can be checked the same way as for HSR
+ */
+ if (frame->port_rcv->type == HSR_PT_MASTER)
+ return hsr_register_frame_out(port, frame);
+
+ /* for PRP we should only forward frames from the slave ports
+ * to the master port
+ */
+ if (port->type != HSR_PT_MASTER)
+ return 1;
+
+ node = frame->node_src;
+ sequence_nr = frame->sequence_nr;
+ sequence_exp = sequence_nr + 1;
+ rcv_port = frame->port_rcv->type;
+ other_port = rcv_port == HSR_PT_SLAVE_A ? HSR_PT_SLAVE_B :
+ HSR_PT_SLAVE_A;
+
+ spin_lock_bh(&node->seq_out_lock);
+ if (time_is_before_jiffies(node->time_out[port->type] +
+ msecs_to_jiffies(HSR_ENTRY_FORGET_TIME)) ||
+ (node->seq_start[rcv_port] == node->seq_expected[rcv_port] &&
+ node->seq_start[other_port] == node->seq_expected[other_port])) {
+ /* the node hasn't been sending for a while
+ * or both drop windows are empty, forward the frame
+ */
+ node->seq_start[rcv_port] = sequence_nr;
+ } else if (seq_nr_before(sequence_nr, node->seq_expected[other_port]) &&
+ seq_nr_before_or_eq(node->seq_start[other_port], sequence_nr)) {
+ /* drop the frame, update the drop window for the other port
+ * and reset our drop window
+ */
+ node->seq_start[other_port] = sequence_exp;
+ node->seq_expected[rcv_port] = sequence_exp;
+ node->seq_start[rcv_port] = node->seq_expected[rcv_port];
+ spin_unlock_bh(&node->seq_out_lock);
+ return 1;
+ }
+
+ /* update the drop window for the port where this frame was received
+ * and clear the drop window for the other port
+ */
+ node->seq_start[other_port] = node->seq_expected[other_port];
+ node->seq_expected[rcv_port] = sequence_exp;
+ sequence_diff = sequence_exp - node->seq_start[rcv_port];
+ if (sequence_diff > PRP_DROP_WINDOW_LEN)
+ node->seq_start[rcv_port] = sequence_exp - PRP_DROP_WINDOW_LEN;
+
+ node->time_out[port->type] = jiffies;
+ node->seq_out[port->type] = sequence_nr;
+ spin_unlock_bh(&node->seq_out_lock);
+ return 0;
+}
+
+#if IS_MODULE(CONFIG_PRP_DUP_DISCARD_KUNIT_TEST)
+EXPORT_SYMBOL(prp_register_frame_out);
+#endif
static struct hsr_port *get_late_port(struct hsr_priv *hsr,
struct hsr_node *node)
@@ -361,28 +612,36 @@ static struct hsr_port *get_late_port(struct hsr_priv *hsr,
return NULL;
}
-
/* Remove stale sequence_nr records. Called by timer every
* HSR_LIFE_CHECK_INTERVAL (two seconds or so).
*/
void hsr_prune_nodes(struct timer_list *t)
{
- struct hsr_priv *hsr = from_timer(hsr, t, prune_timer);
+ struct hsr_priv *hsr = timer_container_of(hsr, t, prune_timer);
struct hsr_node *node;
+ struct hsr_node *tmp;
struct hsr_port *port;
unsigned long timestamp;
unsigned long time_a, time_b;
- rcu_read_lock();
- list_for_each_entry_rcu(node, &hsr->node_db, mac_list) {
+ spin_lock_bh(&hsr->list_lock);
+ list_for_each_entry_safe(node, tmp, &hsr->node_db, mac_list) {
+ /* Don't prune own node. Neither time_in[HSR_PT_SLAVE_A]
+ * nor time_in[HSR_PT_SLAVE_B] will ever be updated for
+ * the master port. Without this check the master node would
+ * be repeatedly pruned, leading to packet loss.
+ */
+ if (hsr_addr_is_self(hsr, node->macaddress_A))
+ continue;
+
/* Shorthand */
time_a = node->time_in[HSR_PT_SLAVE_A];
time_b = node->time_in[HSR_PT_SLAVE_B];
/* Check for timestamps old enough to risk wrap-around */
- if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET/2))
+ if (time_after(jiffies, time_a + MAX_JIFFY_OFFSET / 2))
node->time_in_stale[HSR_PT_SLAVE_A] = true;
- if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET/2))
+ if (time_after(jiffies, time_b + MAX_JIFFY_OFFSET / 2))
node->time_in_stale[HSR_PT_SLAVE_B] = true;
/* Get age of newest frame from node.
@@ -397,26 +656,67 @@ void hsr_prune_nodes(struct timer_list *t)
/* Warn of ring error only as long as we get frames at all */
if (time_is_after_jiffies(timestamp +
- msecs_to_jiffies(1.5*MAX_SLAVE_DIFF))) {
+ msecs_to_jiffies(1.5 * MAX_SLAVE_DIFF))) {
rcu_read_lock();
port = get_late_port(hsr, node);
- if (port != NULL)
- hsr_nl_ringerror(hsr, node->MacAddressA, port);
+ if (port)
+ hsr_nl_ringerror(hsr, node->macaddress_A, port);
rcu_read_unlock();
}
/* Prune old entries */
if (time_is_before_jiffies(timestamp +
- msecs_to_jiffies(HSR_NODE_FORGET_TIME))) {
- hsr_nl_nodedown(hsr, node->MacAddressA);
- list_del_rcu(&node->mac_list);
- /* Note that we need to free this entry later: */
- kfree_rcu(node, rcu_head);
+ msecs_to_jiffies(HSR_NODE_FORGET_TIME))) {
+ hsr_nl_nodedown(hsr, node->macaddress_A);
+ if (!node->removed) {
+ list_del_rcu(&node->mac_list);
+ node->removed = true;
+ /* Note that we need to free this entry later: */
+ kfree_rcu(node, rcu_head);
+ }
}
}
- rcu_read_unlock();
+ spin_unlock_bh(&hsr->list_lock);
+
+ /* Restart timer */
+ mod_timer(&hsr->prune_timer,
+ jiffies + msecs_to_jiffies(PRUNE_PERIOD));
}
+void hsr_prune_proxy_nodes(struct timer_list *t)
+{
+ struct hsr_priv *hsr = timer_container_of(hsr, t, prune_proxy_timer);
+ unsigned long timestamp;
+ struct hsr_node *node;
+ struct hsr_node *tmp;
+
+ spin_lock_bh(&hsr->list_lock);
+ list_for_each_entry_safe(node, tmp, &hsr->proxy_node_db, mac_list) {
+ /* Don't prune RedBox node. */
+ if (hsr_addr_is_redbox(hsr, node->macaddress_A))
+ continue;
+
+ timestamp = node->time_in[HSR_PT_INTERLINK];
+
+ /* Prune old entries */
+ if (time_is_before_jiffies(timestamp +
+ msecs_to_jiffies(HSR_PROXY_NODE_FORGET_TIME))) {
+ hsr_nl_nodedown(hsr, node->macaddress_A);
+ if (!node->removed) {
+ list_del_rcu(&node->mac_list);
+ node->removed = true;
+ /* Note that we need to free this entry later: */
+ kfree_rcu(node, rcu_head);
+ }
+ }
+ }
+
+ spin_unlock_bh(&hsr->list_lock);
+
+ /* Restart timer */
+ mod_timer(&hsr->prune_proxy_timer,
+ jiffies + msecs_to_jiffies(PRUNE_PROXY_PERIOD));
+}
void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos,
unsigned char addr[ETH_ALEN])
@@ -427,20 +727,19 @@ void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos,
node = list_first_or_null_rcu(&hsr->node_db,
struct hsr_node, mac_list);
if (node)
- ether_addr_copy(addr, node->MacAddressA);
+ ether_addr_copy(addr, node->macaddress_A);
return node;
}
node = _pos;
list_for_each_entry_continue_rcu(node, &hsr->node_db, mac_list) {
- ether_addr_copy(addr, node->MacAddressA);
+ ether_addr_copy(addr, node->macaddress_A);
return node;
}
return NULL;
}
-
int hsr_get_node_data(struct hsr_priv *hsr,
const unsigned char *addr,
unsigned char addr_b[ETH_ALEN],
@@ -454,15 +753,11 @@ int hsr_get_node_data(struct hsr_priv *hsr,
struct hsr_port *port;
unsigned long tdiff;
+ node = find_node_by_addr_A(&hsr->node_db, addr);
+ if (!node)
+ return -ENOENT;
- rcu_read_lock();
- node = find_node_by_AddrA(&hsr->node_db, addr);
- if (!node) {
- rcu_read_unlock();
- return -ENOENT; /* No such entry */
- }
-
- ether_addr_copy(addr_b, node->MacAddressB);
+ ether_addr_copy(addr_b, node->macaddress_B);
tdiff = jiffies - node->time_in[HSR_PT_SLAVE_A];
if (node->time_in_stale[HSR_PT_SLAVE_A])
@@ -488,14 +783,12 @@ int hsr_get_node_data(struct hsr_priv *hsr,
*if1_seq = node->seq_out[HSR_PT_SLAVE_B];
*if2_seq = node->seq_out[HSR_PT_SLAVE_A];
- if (node->AddrB_port != HSR_PT_NONE) {
- port = hsr_port_get_hsr(hsr, node->AddrB_port);
+ if (node->addr_B_port != HSR_PT_NONE) {
+ port = hsr_port_get_hsr(hsr, node->addr_B_port);
*addr_b_ifindex = port->dev->ifindex;
} else {
*addr_b_ifindex = -1;
}
- rcu_read_unlock();
-
return 0;
}
diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h
index 370b45998121..b04948659d84 100644
--- a/net/hsr/hsr_framereg.h
+++ b/net/hsr/hsr_framereg.h
@@ -1,12 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2011-2014 Autronica Fire and Security AS
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
* Author(s):
* 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * include file for HSR and PRP.
*/
#ifndef __HSR_FRAMEREG_H
@@ -16,13 +14,29 @@
struct hsr_node;
-struct hsr_node *hsr_add_node(struct list_head *node_db, unsigned char addr[],
- u16 seq_out);
-struct hsr_node *hsr_get_node(struct hsr_port *port, struct sk_buff *skb,
- bool is_sup);
-void hsr_handle_sup_frame(struct sk_buff *skb, struct hsr_node *node_curr,
- struct hsr_port *port);
+struct hsr_frame_info {
+ struct sk_buff *skb_std;
+ struct sk_buff *skb_hsr;
+ struct sk_buff *skb_prp;
+ struct hsr_port *port_rcv;
+ struct hsr_node *node_src;
+ u16 sequence_nr;
+ bool is_supervision;
+ bool is_proxy_supervision;
+ bool is_vlan;
+ bool is_local_dest;
+ bool is_local_exclusive;
+ bool is_from_san;
+};
+
+void hsr_del_self_node(struct hsr_priv *hsr);
+void hsr_del_nodes(struct list_head *node_db);
+struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db,
+ struct sk_buff *skb, bool is_sup,
+ enum hsr_port_type rx_port);
+void hsr_handle_sup_frame(struct hsr_frame_info *frame);
bool hsr_addr_is_self(struct hsr_priv *hsr, unsigned char *addr);
+bool hsr_addr_is_redbox(struct hsr_priv *hsr, unsigned char *addr);
void hsr_addr_subst_source(struct hsr_node *node, struct sk_buff *skb);
void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb,
@@ -30,14 +44,14 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb,
void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port,
u16 sequence_nr);
-int hsr_register_frame_out(struct hsr_port *port, struct hsr_node *node,
- u16 sequence_nr);
+int hsr_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame);
void hsr_prune_nodes(struct timer_list *t);
+void hsr_prune_proxy_nodes(struct timer_list *t);
-int hsr_create_self_node(struct list_head *self_node_db,
- unsigned char addr_a[ETH_ALEN],
- unsigned char addr_b[ETH_ALEN]);
+int hsr_create_self_node(struct hsr_priv *hsr,
+ const unsigned char addr_a[ETH_ALEN],
+ const unsigned char addr_b[ETH_ALEN]);
void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos,
unsigned char addr[ETH_ALEN]);
@@ -51,4 +65,35 @@ int hsr_get_node_data(struct hsr_priv *hsr,
int *if2_age,
u16 *if2_seq);
+void prp_handle_san_frame(bool san, enum hsr_port_type port,
+ struct hsr_node *node);
+void prp_update_san_info(struct hsr_node *node, bool is_sup);
+
+bool hsr_is_node_in_db(struct list_head *node_db,
+ const unsigned char addr[ETH_ALEN]);
+
+int prp_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame);
+
+struct hsr_node {
+ struct list_head mac_list;
+ /* Protect R/W access to seq_out */
+ spinlock_t seq_out_lock;
+ unsigned char macaddress_A[ETH_ALEN];
+ unsigned char macaddress_B[ETH_ALEN];
+ /* Local slave through which AddrB frames are received from this node */
+ enum hsr_port_type addr_B_port;
+ unsigned long time_in[HSR_PT_PORTS];
+ bool time_in_stale[HSR_PT_PORTS];
+ unsigned long time_out[HSR_PT_PORTS];
+ /* if the node is a SAN */
+ bool san_a;
+ bool san_b;
+ u16 seq_out[HSR_PT_PORTS];
+ bool removed;
+ /* PRP specific duplicate handling */
+ u16 seq_expected[HSR_PT_PORTS];
+ u16 seq_start[HSR_PT_PORTS];
+ struct rcu_head rcu_head;
+};
+
#endif /* __HSR_FRAMEREG_H */
diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c
index cd37d0011b42..bc94b07101d8 100644
--- a/net/hsr/hsr_main.c
+++ b/net/hsr/hsr_main.c
@@ -1,15 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0
/* Copyright 2011-2014 Autronica Fire and Security AS
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
* Author(s):
* 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * Event handling for HSR and PRP devices.
*/
#include <linux/netdevice.h>
+#include <net/rtnetlink.h>
#include <linux/rculist.h>
#include <linux/timer.h>
#include <linux/etherdevice.h>
@@ -19,24 +18,34 @@
#include "hsr_framereg.h"
#include "hsr_slave.h"
+static bool hsr_slave_empty(struct hsr_priv *hsr)
+{
+ struct hsr_port *port;
+
+ hsr_for_each_port_rtnl(hsr, port)
+ if (port->type != HSR_PT_MASTER)
+ return false;
+ return true;
+}
static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event,
void *ptr)
{
- struct net_device *dev;
struct hsr_port *port, *master;
+ struct net_device *dev;
struct hsr_priv *hsr;
+ LIST_HEAD(list_kill);
int mtu_max;
int res;
dev = netdev_notifier_info_to_dev(ptr);
port = hsr_port_get_rtnl(dev);
- if (port == NULL) {
+ if (!port) {
if (!is_hsr_master(dev))
return NOTIFY_DONE; /* Not an HSR device */
hsr = netdev_priv(dev);
port = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
- if (port == NULL) {
+ if (!port) {
/* Resend of notification concerning removed device? */
return NOTIFY_DONE;
}
@@ -50,6 +59,10 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event,
case NETDEV_CHANGE: /* Link (carrier) state changes */
hsr_check_carrier_and_operstate(hsr);
break;
+ case NETDEV_CHANGENAME:
+ if (is_hsr_master(dev))
+ hsr_debugfs_rename(dev);
+ break;
case NETDEV_CHANGEADDR:
if (port->type == HSR_PT_MASTER) {
/* This should not happen since there's no
@@ -62,13 +75,23 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event,
master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
if (port->type == HSR_PT_SLAVE_A) {
- ether_addr_copy(master->dev->dev_addr, dev->dev_addr);
- call_netdevice_notifiers(NETDEV_CHANGEADDR, master->dev);
+ eth_hw_addr_set(master->dev, dev->dev_addr);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR,
+ master->dev);
+
+ if (hsr->prot_version == PRP_V1) {
+ port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
+ if (port) {
+ eth_hw_addr_set(port->dev, dev->dev_addr);
+ call_netdevice_notifiers(NETDEV_CHANGEADDR,
+ port->dev);
+ }
+ }
}
/* Make sure we recognize frames from ourselves in hsr_rcv() */
port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
- res = hsr_create_self_node(&hsr->self_node_db,
+ res = hsr_create_self_node(hsr,
master->dev->dev_addr,
port ?
port->dev->dev_addr :
@@ -82,10 +105,20 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event,
break; /* Handled in ndo_change_mtu() */
mtu_max = hsr_get_max_mtu(port->hsr);
master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER);
- master->dev->mtu = mtu_max;
+ WRITE_ONCE(master->dev->mtu, mtu_max);
break;
case NETDEV_UNREGISTER:
- hsr_del_port(port);
+ if (!is_hsr_master(dev)) {
+ master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER);
+ hsr_del_port(port);
+ if (hsr_slave_empty(master->hsr)) {
+ const struct rtnl_link_ops *ops;
+
+ ops = master->dev->rtnl_link_ops;
+ ops->dellink(master->dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
+ }
+ }
break;
case NETDEV_PRE_TYPE_CHANGE:
/* HSR works only on Ethernet devices. Refuse slave to change
@@ -97,40 +130,58 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event,
return NOTIFY_DONE;
}
-
struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt)
{
struct hsr_port *port;
- hsr_for_each_port(hsr, port)
+ hsr_for_each_port_rtnl(hsr, port)
if (port->type == pt)
return port;
return NULL;
}
+int hsr_get_version(struct net_device *dev, enum hsr_version *ver)
+{
+ struct hsr_priv *hsr;
+
+ hsr = netdev_priv(dev);
+ *ver = hsr->prot_version;
+
+ return 0;
+}
+EXPORT_SYMBOL(hsr_get_version);
+
static struct notifier_block hsr_nb = {
.notifier_call = hsr_netdev_notify, /* Slave event notifications */
};
-
static int __init hsr_init(void)
{
- int res;
+ int err;
BUILD_BUG_ON(sizeof(struct hsr_tag) != HSR_HLEN);
- register_netdevice_notifier(&hsr_nb);
- res = hsr_netlink_init();
+ err = register_netdevice_notifier(&hsr_nb);
+ if (err)
+ return err;
+
+ err = hsr_netlink_init();
+ if (err) {
+ unregister_netdevice_notifier(&hsr_nb);
+ return err;
+ }
- return res;
+ return 0;
}
static void __exit hsr_exit(void)
{
- unregister_netdevice_notifier(&hsr_nb);
hsr_netlink_exit();
+ hsr_debugfs_remove_root();
+ unregister_netdevice_notifier(&hsr_nb);
}
module_init(hsr_init);
module_exit(hsr_exit);
+MODULE_DESCRIPTION("High-availability Seamless Redundancy (HSR) driver");
MODULE_LICENSE("GPL");
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index 9b9909e89e9e..33b0d2460c9b 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -1,12 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2011-2014 Autronica Fire and Security AS
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
* Author(s):
* 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * include file for HSR and PRP.
*/
#ifndef __HSR_PRIVATE_H
@@ -14,7 +12,8 @@
#include <linux/netdevice.h>
#include <linux/list.h>
-
+#include <linux/if_vlan.h>
+#include <linux/if_hsr.h>
/* Time constants as specified in the HSR specification (IEC-62439-3 2010)
* Table 8.
@@ -22,8 +21,9 @@
*/
#define HSR_LIFE_CHECK_INTERVAL 2000 /* ms */
#define HSR_NODE_FORGET_TIME 60000 /* ms */
+#define HSR_PROXY_NODE_FORGET_TIME 60000 /* ms */
#define HSR_ANNOUNCE_INTERVAL 100 /* ms */
-
+#define HSR_ENTRY_FORGET_TIME 400 /* ms */
/* By how much may slave1 and slave2 timestamps of latest received frame from
* each node differ before we notify of communication problem?
@@ -32,32 +32,20 @@
#define HSR_SEQNR_START (USHRT_MAX - 1024)
#define HSR_SUP_SEQNR_START (HSR_SEQNR_START / 2)
-
/* How often shall we check for broken ring and remove node entries older than
* HSR_NODE_FORGET_TIME?
*/
#define PRUNE_PERIOD 3000 /* ms */
-
-
+#define PRUNE_PROXY_PERIOD 3000 /* ms */
+#define HSR_TLV_EOT 0 /* End of TLVs */
#define HSR_TLV_ANNOUNCE 22
#define HSR_TLV_LIFE_CHECK 23
-
-
-/* HSR Tag.
- * As defined in IEC-62439-3:2010, the HSR tag is really { ethertype = 0x88FB,
- * path, LSDU_size, sequence Nr }. But we let eth_header() create { h_dest,
- * h_source, h_proto = 0x88FB }, and add { path, LSDU_size, sequence Nr,
- * encapsulated protocol } instead.
- *
- * Field names as defined in the IEC:2010 standard for HSR.
- */
-struct hsr_tag {
- __be16 path_and_LSDU_size;
- __be16 sequence_nr;
- __be16 encap_proto;
-} __packed;
-
-#define HSR_HLEN 6
+/* PRP V1 life check for Duplicate discard */
+#define PRP_TLV_LIFE_CHECK_DD 20
+/* PRP V1 life check for Duplicate Accept */
+#define PRP_TLV_LIFE_CHECK_DA 21
+/* PRP V1 redundancy box MAC address */
+#define PRP_TLV_REDBOX_MAC 30
#define HSR_V1_SUP_LSDUSIZE 52
@@ -71,27 +59,17 @@ struct hsr_tag {
* with the path field in-between, which seems strange. I'm guessing the MAC
* address definition is in error.
*/
-static inline u16 get_hsr_tag_path(struct hsr_tag *ht)
-{
- return ntohs(ht->path_and_LSDU_size) >> 12;
-}
-
-static inline u16 get_hsr_tag_LSDU_size(struct hsr_tag *ht)
-{
- return ntohs(ht->path_and_LSDU_size) & 0x0FFF;
-}
static inline void set_hsr_tag_path(struct hsr_tag *ht, u16 path)
{
- ht->path_and_LSDU_size = htons(
- (ntohs(ht->path_and_LSDU_size) & 0x0FFF) | (path << 12));
+ ht->path_and_LSDU_size =
+ htons((ntohs(ht->path_and_LSDU_size) & 0x0FFF) | (path << 12));
}
static inline void set_hsr_tag_LSDU_size(struct hsr_tag *ht, u16 LSDU_size)
{
- ht->path_and_LSDU_size = htons(
- (ntohs(ht->path_and_LSDU_size) & 0xF000) |
- (LSDU_size & 0x0FFF));
+ ht->path_and_LSDU_size = htons((ntohs(ht->path_and_LSDU_size) &
+ 0xF000) | (LSDU_size & 0x0FFF));
}
struct hsr_ethhdr {
@@ -99,39 +77,37 @@ struct hsr_ethhdr {
struct hsr_tag hsr_tag;
} __packed;
+struct hsr_vlan_ethhdr {
+ struct vlan_ethhdr vlanhdr;
+ struct hsr_tag hsr_tag;
+} __packed;
+
+struct hsr_sup_tlv {
+ u8 HSR_TLV_type;
+ u8 HSR_TLV_length;
+} __packed;
-/* HSR Supervision Frame data types.
+/* HSR/PRP Supervision Frame data types.
* Field names as defined in the IEC:2010 standard for HSR.
*/
struct hsr_sup_tag {
- __be16 path_and_HSR_Ver;
- __be16 sequence_nr;
- __u8 HSR_TLV_Type;
- __u8 HSR_TLV_Length;
+ __be16 path_and_HSR_ver;
+ __be16 sequence_nr;
+ struct hsr_sup_tlv tlv;
} __packed;
struct hsr_sup_payload {
- unsigned char MacAddressA[ETH_ALEN];
+ unsigned char macaddress_A[ETH_ALEN];
} __packed;
-static inline u16 get_hsr_stag_path(struct hsr_sup_tag *hst)
-{
- return get_hsr_tag_path((struct hsr_tag *) hst);
-}
-
-static inline u16 get_hsr_stag_HSR_ver(struct hsr_sup_tag *hst)
-{
- return get_hsr_tag_LSDU_size((struct hsr_tag *) hst);
-}
-
static inline void set_hsr_stag_path(struct hsr_sup_tag *hst, u16 path)
{
- set_hsr_tag_path((struct hsr_tag *) hst, path);
+ set_hsr_tag_path((struct hsr_tag *)hst, path);
}
-static inline void set_hsr_stag_HSR_Ver(struct hsr_sup_tag *hst, u16 HSR_Ver)
+static inline void set_hsr_stag_HSR_ver(struct hsr_sup_tag *hst, u16 HSR_ver)
{
- set_hsr_tag_LSDU_size((struct hsr_tag *) hst, HSR_Ver);
+ set_hsr_tag_LSDU_size((struct hsr_tag *)hst, HSR_ver);
}
struct hsrv0_ethhdr_sp {
@@ -145,41 +121,112 @@ struct hsrv1_ethhdr_sp {
struct hsr_sup_tag hsr_sup;
} __packed;
+/* PRP Redundancy Control Trailer (RCT).
+ * As defined in IEC-62439-4:2012, the PRP RCT is really { sequence Nr,
+ * LAN identifier (LanId), LSDU_size and PRP_suffix = 0x88FB }.
+ *
+ * Field names as defined in the IEC:2012 standard for PRP.
+ */
+struct prp_rct {
+ __be16 sequence_nr;
+ __be16 lan_id_and_LSDU_size;
+ __be16 PRP_suffix;
+} __packed;
+
+static inline u16 get_prp_LSDU_size(struct prp_rct *rct)
+{
+ return ntohs(rct->lan_id_and_LSDU_size) & 0x0FFF;
+}
-enum hsr_port_type {
- HSR_PT_NONE = 0, /* Must be 0, used by framereg */
- HSR_PT_SLAVE_A,
- HSR_PT_SLAVE_B,
- HSR_PT_INTERLINK,
- HSR_PT_MASTER,
- HSR_PT_PORTS, /* This must be the last item in the enum */
-};
+static inline void set_prp_lan_id(struct prp_rct *rct, u16 lan_id)
+{
+ rct->lan_id_and_LSDU_size = htons((ntohs(rct->lan_id_and_LSDU_size) &
+ 0x0FFF) | (lan_id << 12));
+}
+static inline void set_prp_LSDU_size(struct prp_rct *rct, u16 LSDU_size)
+{
+ rct->lan_id_and_LSDU_size = htons((ntohs(rct->lan_id_and_LSDU_size) &
+ 0xF000) | (LSDU_size & 0x0FFF));
+}
struct hsr_port {
struct list_head port_list;
struct net_device *dev;
struct hsr_priv *hsr;
enum hsr_port_type type;
+ struct rcu_head rcu;
+ unsigned char original_macaddress[ETH_ALEN];
+};
+
+struct hsr_frame_info;
+struct hsr_node;
+
+struct hsr_proto_ops {
+ /* format and send supervision frame */
+ void (*send_sv_frame)(struct hsr_port *port, unsigned long *interval,
+ const unsigned char addr[ETH_ALEN]);
+ void (*handle_san_frame)(bool san, enum hsr_port_type port,
+ struct hsr_node *node);
+ bool (*drop_frame)(struct hsr_frame_info *frame, struct hsr_port *port);
+ struct sk_buff * (*get_untagged_frame)(struct hsr_frame_info *frame,
+ struct hsr_port *port);
+ struct sk_buff * (*create_tagged_frame)(struct hsr_frame_info *frame,
+ struct hsr_port *port);
+ int (*fill_frame_info)(__be16 proto, struct sk_buff *skb,
+ struct hsr_frame_info *frame);
+ bool (*invalid_dan_ingress_frame)(__be16 protocol);
+ void (*update_san_info)(struct hsr_node *node, bool is_sup);
+ int (*register_frame_out)(struct hsr_port *port,
+ struct hsr_frame_info *frame);
+};
+
+struct hsr_self_node {
+ unsigned char macaddress_A[ETH_ALEN];
+ unsigned char macaddress_B[ETH_ALEN];
+ struct rcu_head rcu_head;
};
struct hsr_priv {
struct rcu_head rcu_head;
struct list_head ports;
struct list_head node_db; /* Known HSR nodes */
- struct list_head self_node_db; /* MACs of slaves */
+ struct list_head proxy_node_db; /* RedBox HSR proxy nodes */
+ struct hsr_self_node __rcu *self_node; /* MACs of slaves */
struct timer_list announce_timer; /* Supervision frame dispatch */
+ struct timer_list announce_proxy_timer;
struct timer_list prune_timer;
+ struct timer_list prune_proxy_timer;
int announce_count;
u16 sequence_nr;
- u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */
- u8 protVersion; /* Indicate if HSRv0 or HSRv1. */
- spinlock_t seqnr_lock; /* locking for sequence_nr */
- unsigned char sup_multicast_addr[ETH_ALEN];
+ u16 sup_sequence_nr; /* For HSRv1 separate seq_nr for supervision */
+ enum hsr_version prot_version; /* Indicate if HSRv0, HSRv1 or PRPv1 */
+ spinlock_t seqnr_lock; /* locking for sequence_nr */
+ spinlock_t list_lock; /* locking for node list */
+ struct hsr_proto_ops *proto_ops;
+#define PRP_LAN_ID 0x5 /* 0b1010 for A and 0b1011 for B. Bit 0 is set
+ * based on SLAVE_A or SLAVE_B
+ */
+ u8 net_id; /* for PRP, it occupies most significant 3 bits
+ * of lan_id
+ */
+ bool fwd_offloaded; /* Forwarding offloaded to HW */
+ bool redbox; /* Device supports HSR RedBox */
+ unsigned char macaddress_redbox[ETH_ALEN];
+ unsigned char sup_multicast_addr[ETH_ALEN] __aligned(sizeof(u16));
+ /* Align to u16 boundary to avoid unaligned access
+ * in ether_addr_equal
+ */
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *node_tbl_root;
+#endif
};
#define hsr_for_each_port(hsr, port) \
list_for_each_entry_rcu((port), &(hsr)->ports, port_list)
+#define hsr_for_each_port_rtnl(hsr, port) \
+ list_for_each_entry_rcu((port), &(hsr)->ports, port_list, lockdep_rtnl_is_held())
+
struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt);
/* Caller must ensure skb is a valid HSR frame */
@@ -187,8 +234,67 @@ static inline u16 hsr_get_skb_sequence_nr(struct sk_buff *skb)
{
struct hsr_ethhdr *hsr_ethhdr;
- hsr_ethhdr = (struct hsr_ethhdr *) skb_mac_header(skb);
+ hsr_ethhdr = (struct hsr_ethhdr *)skb_mac_header(skb);
return ntohs(hsr_ethhdr->hsr_tag.sequence_nr);
}
+static inline struct prp_rct *skb_get_PRP_rct(struct sk_buff *skb)
+{
+ unsigned char *tail = skb_tail_pointer(skb) - HSR_HLEN;
+
+ struct prp_rct *rct = (struct prp_rct *)tail;
+
+ if (rct->PRP_suffix == htons(ETH_P_PRP))
+ return rct;
+
+ return NULL;
+}
+
+/* Assume caller has confirmed this skb is PRP suffixed */
+static inline u16 prp_get_skb_sequence_nr(struct prp_rct *rct)
+{
+ return ntohs(rct->sequence_nr);
+}
+
+/* assume there is a valid rct */
+static inline bool prp_check_lsdu_size(struct sk_buff *skb,
+ struct prp_rct *rct,
+ bool is_sup)
+{
+ struct ethhdr *ethhdr;
+ int expected_lsdu_size;
+
+ if (is_sup) {
+ expected_lsdu_size = HSR_V1_SUP_LSDUSIZE;
+ } else {
+ ethhdr = (struct ethhdr *)skb_mac_header(skb);
+ expected_lsdu_size = skb->len - 14;
+ if (ethhdr->h_proto == htons(ETH_P_8021Q))
+ expected_lsdu_size -= 4;
+ }
+
+ return (expected_lsdu_size == get_prp_LSDU_size(rct));
+}
+
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+void hsr_debugfs_rename(struct net_device *dev);
+void hsr_debugfs_init(struct hsr_priv *priv, struct net_device *hsr_dev);
+void hsr_debugfs_term(struct hsr_priv *priv);
+void hsr_debugfs_create_root(void);
+void hsr_debugfs_remove_root(void);
+#else
+static inline void hsr_debugfs_rename(struct net_device *dev)
+{
+}
+static inline void hsr_debugfs_init(struct hsr_priv *priv,
+ struct net_device *hsr_dev)
+{}
+static inline void hsr_debugfs_term(struct hsr_priv *priv)
+{}
+static inline void hsr_debugfs_create_root(void)
+{}
+static inline void hsr_debugfs_remove_root(void)
+{}
+#endif
+
#endif /* __HSR_PRIVATE_H */
diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c
index b9cce0fd5696..db0b0af7a692 100644
--- a/net/hsr/hsr_netlink.c
+++ b/net/hsr/hsr_netlink.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/* Copyright 2011-2014 Autronica Fire and Security AS
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
* Author(s):
* 2011-2014 Arvid Brodin, arvid.brodin@alten.se
*
- * Routines for handling Netlink messages for HSR.
+ * Routines for handling Netlink messages for HSR and PRP.
*/
#include "hsr_netlink.h"
@@ -26,82 +22,166 @@ static const struct nla_policy hsr_policy[IFLA_HSR_MAX + 1] = {
[IFLA_HSR_VERSION] = { .type = NLA_U8 },
[IFLA_HSR_SUPERVISION_ADDR] = { .len = ETH_ALEN },
[IFLA_HSR_SEQ_NR] = { .type = NLA_U16 },
+ [IFLA_HSR_PROTOCOL] = { .type = NLA_U8 },
+ [IFLA_HSR_INTERLINK] = { .type = NLA_U32 },
};
-
/* Here, it seems a netdevice has already been allocated for us, and the
* hsr_dev_setup routine has been executed. Nice!
*/
-static int hsr_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
+static int hsr_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
- struct net_device *link[2];
- unsigned char multicast_spec, hsr_version;
+ struct net *link_net = rtnl_newlink_link_net(params);
+ struct net_device *link[2], *interlink = NULL;
+ struct nlattr **data = params->data;
+ enum hsr_version proto_version;
+ unsigned char multicast_spec;
+ u8 proto = HSR_PROTOCOL_HSR;
+
+ if (!net_eq(link_net, dev_net(dev))) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "HSR slaves/interlink must be on the same net namespace than HSR link");
+ return -EINVAL;
+ }
if (!data) {
- netdev_info(dev, "HSR: No slave devices specified\n");
+ NL_SET_ERR_MSG_MOD(extack, "No slave devices specified");
return -EINVAL;
}
if (!data[IFLA_HSR_SLAVE1]) {
- netdev_info(dev, "HSR: Slave1 device not specified\n");
+ NL_SET_ERR_MSG_MOD(extack, "Slave1 device not specified");
+ return -EINVAL;
+ }
+ link[0] = __dev_get_by_index(link_net,
+ nla_get_u32(data[IFLA_HSR_SLAVE1]));
+ if (!link[0]) {
+ NL_SET_ERR_MSG_MOD(extack, "Slave1 does not exist");
return -EINVAL;
}
- link[0] = __dev_get_by_index(src_net, nla_get_u32(data[IFLA_HSR_SLAVE1]));
if (!data[IFLA_HSR_SLAVE2]) {
- netdev_info(dev, "HSR: Slave2 device not specified\n");
+ NL_SET_ERR_MSG_MOD(extack, "Slave2 device not specified");
return -EINVAL;
}
- link[1] = __dev_get_by_index(src_net, nla_get_u32(data[IFLA_HSR_SLAVE2]));
+ link[1] = __dev_get_by_index(link_net,
+ nla_get_u32(data[IFLA_HSR_SLAVE2]));
+ if (!link[1]) {
+ NL_SET_ERR_MSG_MOD(extack, "Slave2 does not exist");
+ return -EINVAL;
+ }
+
+ if (link[0] == link[1]) {
+ NL_SET_ERR_MSG_MOD(extack, "Slave1 and Slave2 are same");
+ return -EINVAL;
+ }
+
+ if (data[IFLA_HSR_INTERLINK])
+ interlink = __dev_get_by_index(link_net,
+ nla_get_u32(data[IFLA_HSR_INTERLINK]));
- if (!link[0] || !link[1])
- return -ENODEV;
- if (link[0] == link[1])
+ if (interlink && interlink == link[0]) {
+ NL_SET_ERR_MSG_MOD(extack, "Interlink and Slave1 are the same");
return -EINVAL;
+ }
+
+ if (interlink && interlink == link[1]) {
+ NL_SET_ERR_MSG_MOD(extack, "Interlink and Slave2 are the same");
+ return -EINVAL;
+ }
+
+ multicast_spec = nla_get_u8_default(data[IFLA_HSR_MULTICAST_SPEC], 0);
+
+ if (data[IFLA_HSR_PROTOCOL])
+ proto = nla_get_u8(data[IFLA_HSR_PROTOCOL]);
+
+ if (proto >= HSR_PROTOCOL_MAX) {
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported protocol");
+ return -EINVAL;
+ }
+
+ if (!data[IFLA_HSR_VERSION]) {
+ proto_version = HSR_V0;
+ } else {
+ if (proto == HSR_PROTOCOL_PRP) {
+ NL_SET_ERR_MSG_MOD(extack, "PRP version unsupported");
+ return -EINVAL;
+ }
+
+ proto_version = nla_get_u8(data[IFLA_HSR_VERSION]);
+ if (proto_version > HSR_V1) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Only HSR version 0/1 supported");
+ return -EINVAL;
+ }
+ }
+
+ if (proto == HSR_PROTOCOL_PRP) {
+ proto_version = PRP_V1;
+ if (interlink) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Interlink only works with HSR");
+ return -EINVAL;
+ }
+ }
- if (!data[IFLA_HSR_MULTICAST_SPEC])
- multicast_spec = 0;
- else
- multicast_spec = nla_get_u8(data[IFLA_HSR_MULTICAST_SPEC]);
+ return hsr_dev_finalize(dev, link, interlink, multicast_spec,
+ proto_version, extack);
+}
+
+static void hsr_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct hsr_priv *hsr = netdev_priv(dev);
- if (!data[IFLA_HSR_VERSION])
- hsr_version = 0;
- else
- hsr_version = nla_get_u8(data[IFLA_HSR_VERSION]);
+ timer_delete_sync(&hsr->prune_timer);
+ timer_delete_sync(&hsr->prune_proxy_timer);
+ timer_delete_sync(&hsr->announce_timer);
+ timer_delete_sync(&hsr->announce_proxy_timer);
- return hsr_dev_finalize(dev, link, multicast_spec, hsr_version);
+ hsr_debugfs_term(hsr);
+ hsr_del_ports(hsr);
+
+ hsr_del_self_node(hsr);
+ hsr_del_nodes(&hsr->node_db);
+ hsr_del_nodes(&hsr->proxy_node_db);
+
+ unregister_netdevice_queue(dev, head);
}
static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
- struct hsr_priv *hsr;
+ struct hsr_priv *hsr = netdev_priv(dev);
+ u8 proto = HSR_PROTOCOL_HSR;
struct hsr_port *port;
- int res;
-
- hsr = netdev_priv(dev);
-
- res = 0;
- rcu_read_lock();
port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
- if (port)
- res = nla_put_u32(skb, IFLA_HSR_SLAVE1, port->dev->ifindex);
- rcu_read_unlock();
- if (res)
- goto nla_put_failure;
+ if (port) {
+ if (nla_put_u32(skb, IFLA_HSR_SLAVE1, port->dev->ifindex))
+ goto nla_put_failure;
+ }
- rcu_read_lock();
port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
- if (port)
- res = nla_put_u32(skb, IFLA_HSR_SLAVE2, port->dev->ifindex);
- rcu_read_unlock();
- if (res)
- goto nla_put_failure;
+ if (port) {
+ if (nla_put_u32(skb, IFLA_HSR_SLAVE2, port->dev->ifindex))
+ goto nla_put_failure;
+ }
+
+ port = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK);
+ if (port) {
+ if (nla_put_u32(skb, IFLA_HSR_INTERLINK, port->dev->ifindex))
+ goto nla_put_failure;
+ }
if (nla_put(skb, IFLA_HSR_SUPERVISION_ADDR, ETH_ALEN,
hsr->sup_multicast_addr) ||
nla_put_u16(skb, IFLA_HSR_SEQ_NR, hsr->sequence_nr))
goto nla_put_failure;
+ if (hsr->prot_version == PRP_V1)
+ proto = HSR_PROTOCOL_PRP;
+ else if (nla_put_u8(skb, IFLA_HSR_VERSION, hsr->prot_version))
+ goto nla_put_failure;
+ if (nla_put_u8(skb, IFLA_HSR_PROTOCOL, proto))
+ goto nla_put_failure;
return 0;
@@ -116,11 +196,10 @@ static struct rtnl_link_ops hsr_link_ops __read_mostly = {
.priv_size = sizeof(struct hsr_priv),
.setup = hsr_dev_setup,
.newlink = hsr_newlink,
+ .dellink = hsr_dellink,
.fill_info = hsr_fill_info,
};
-
-
/* attribute policy */
static const struct nla_policy hsr_genl_policy[HSR_A_MAX + 1] = {
[HSR_A_NODE_ADDR] = { .len = ETH_ALEN },
@@ -138,8 +217,6 @@ static const struct genl_multicast_group hsr_mcgrps[] = {
{ .name = "hsr-network", },
};
-
-
/* This is called if for some node with MAC address addr, we only get frames
* over one of the slave interfaces. This would indicate an open network ring
* (i.e. a link has failed somewhere).
@@ -156,7 +233,8 @@ void hsr_nl_ringerror(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN],
if (!skb)
goto fail;
- msg_head = genlmsg_put(skb, 0, 0, &hsr_genl_family, 0, HSR_C_RING_ERROR);
+ msg_head = genlmsg_put(skb, 0, 0, &hsr_genl_family, 0,
+ HSR_C_RING_ERROR);
if (!msg_head)
goto nla_put_failure;
@@ -201,7 +279,6 @@ void hsr_nl_nodedown(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN])
if (!msg_head)
goto nla_put_failure;
-
res = nla_put(skb, HSR_A_NODE_ADDR, ETH_ALEN, addr);
if (res < 0)
goto nla_put_failure;
@@ -221,7 +298,6 @@ fail:
rcu_read_unlock();
}
-
/* HSR_C_GET_NODE_STATUS lets userspace query the internal HSR node table
* about the status of a specific node in the network, defined by its MAC
* address.
@@ -259,25 +335,24 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info)
if (!na)
goto invalid;
- hsr_dev = __dev_get_by_index(genl_info_net(info),
- nla_get_u32(info->attrs[HSR_A_IFINDEX]));
+ rcu_read_lock();
+ hsr_dev = dev_get_by_index_rcu(genl_info_net(info),
+ nla_get_u32(info->attrs[HSR_A_IFINDEX]));
if (!hsr_dev)
- goto invalid;
+ goto rcu_unlock;
if (!is_hsr_master(hsr_dev))
- goto invalid;
-
+ goto rcu_unlock;
/* Send reply */
-
- skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
if (!skb_out) {
res = -ENOMEM;
goto fail;
}
msg_head = genlmsg_put(skb_out, NETLINK_CB(skb_in).portid,
- info->snd_seq, &hsr_genl_family, 0,
- HSR_C_SET_NODE_STATUS);
+ info->snd_seq, &hsr_genl_family, 0,
+ HSR_C_SET_NODE_STATUS);
if (!msg_head) {
res = -ENOMEM;
goto nla_put_failure;
@@ -289,28 +364,30 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info)
hsr = netdev_priv(hsr_dev);
res = hsr_get_node_data(hsr,
- (unsigned char *) nla_data(info->attrs[HSR_A_NODE_ADDR]),
- hsr_node_addr_b,
- &addr_b_ifindex,
- &hsr_node_if1_age,
- &hsr_node_if1_seq,
- &hsr_node_if2_age,
- &hsr_node_if2_seq);
+ (unsigned char *)
+ nla_data(info->attrs[HSR_A_NODE_ADDR]),
+ hsr_node_addr_b,
+ &addr_b_ifindex,
+ &hsr_node_if1_age,
+ &hsr_node_if1_seq,
+ &hsr_node_if2_age,
+ &hsr_node_if2_seq);
if (res < 0)
goto nla_put_failure;
res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN,
- nla_data(info->attrs[HSR_A_NODE_ADDR]));
+ nla_data(info->attrs[HSR_A_NODE_ADDR]));
if (res < 0)
goto nla_put_failure;
if (addr_b_ifindex > -1) {
res = nla_put(skb_out, HSR_A_NODE_ADDR_B, ETH_ALEN,
- hsr_node_addr_b);
+ hsr_node_addr_b);
if (res < 0)
goto nla_put_failure;
- res = nla_put_u32(skb_out, HSR_A_ADDR_B_IFINDEX, addr_b_ifindex);
+ res = nla_put_u32(skb_out, HSR_A_ADDR_B_IFINDEX,
+ addr_b_ifindex);
if (res < 0)
goto nla_put_failure;
}
@@ -321,12 +398,10 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info)
res = nla_put_u16(skb_out, HSR_A_IF1_SEQ, hsr_node_if1_seq);
if (res < 0)
goto nla_put_failure;
- rcu_read_lock();
port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_A);
if (port)
res = nla_put_u32(skb_out, HSR_A_IF1_IFINDEX,
port->dev->ifindex);
- rcu_read_unlock();
if (res < 0)
goto nla_put_failure;
@@ -336,20 +411,22 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info)
res = nla_put_u16(skb_out, HSR_A_IF2_SEQ, hsr_node_if2_seq);
if (res < 0)
goto nla_put_failure;
- rcu_read_lock();
port = hsr_port_get_hsr(hsr, HSR_PT_SLAVE_B);
if (port)
res = nla_put_u32(skb_out, HSR_A_IF2_IFINDEX,
port->dev->ifindex);
- rcu_read_unlock();
if (res < 0)
goto nla_put_failure;
+ rcu_read_unlock();
+
genlmsg_end(skb_out, msg_head);
genlmsg_unicast(genl_info_net(info), skb_out, info->snd_portid);
return 0;
+rcu_unlock:
+ rcu_read_unlock();
invalid:
netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL);
return 0;
@@ -359,6 +436,7 @@ nla_put_failure:
/* Fall through */
fail:
+ rcu_read_unlock();
return res;
}
@@ -366,16 +444,14 @@ fail:
*/
static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info)
{
- /* For receiving */
- struct nlattr *na;
+ unsigned char addr[ETH_ALEN];
struct net_device *hsr_dev;
-
- /* For sending */
struct sk_buff *skb_out;
- void *msg_head;
struct hsr_priv *hsr;
- void *pos;
- unsigned char addr[ETH_ALEN];
+ bool restart = false;
+ struct nlattr *na;
+ void *pos = NULL;
+ void *msg_head;
int res;
if (!info)
@@ -385,42 +461,50 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info)
if (!na)
goto invalid;
- hsr_dev = __dev_get_by_index(genl_info_net(info),
- nla_get_u32(info->attrs[HSR_A_IFINDEX]));
+ rcu_read_lock();
+ hsr_dev = dev_get_by_index_rcu(genl_info_net(info),
+ nla_get_u32(info->attrs[HSR_A_IFINDEX]));
if (!hsr_dev)
- goto invalid;
+ goto rcu_unlock;
if (!is_hsr_master(hsr_dev))
- goto invalid;
-
+ goto rcu_unlock;
+restart:
/* Send reply */
-
- skb_out = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ skb_out = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_ATOMIC);
if (!skb_out) {
res = -ENOMEM;
goto fail;
}
msg_head = genlmsg_put(skb_out, NETLINK_CB(skb_in).portid,
- info->snd_seq, &hsr_genl_family, 0,
- HSR_C_SET_NODE_LIST);
+ info->snd_seq, &hsr_genl_family, 0,
+ HSR_C_SET_NODE_LIST);
if (!msg_head) {
res = -ENOMEM;
goto nla_put_failure;
}
- res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex);
- if (res < 0)
- goto nla_put_failure;
+ if (!restart) {
+ res = nla_put_u32(skb_out, HSR_A_IFINDEX, hsr_dev->ifindex);
+ if (res < 0)
+ goto nla_put_failure;
+ }
hsr = netdev_priv(hsr_dev);
- rcu_read_lock();
- pos = hsr_get_next_node(hsr, NULL, addr);
+ if (!pos)
+ pos = hsr_get_next_node(hsr, NULL, addr);
while (pos) {
res = nla_put(skb_out, HSR_A_NODE_ADDR, ETH_ALEN, addr);
if (res < 0) {
- rcu_read_unlock();
+ if (res == -EMSGSIZE) {
+ genlmsg_end(skb_out, msg_head);
+ genlmsg_unicast(genl_info_net(info), skb_out,
+ info->snd_portid);
+ restart = true;
+ goto restart;
+ }
goto nla_put_failure;
}
pos = hsr_get_next_node(hsr, pos, addr);
@@ -432,31 +516,33 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info)
return 0;
+rcu_unlock:
+ rcu_read_unlock();
invalid:
netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL);
return 0;
nla_put_failure:
- kfree_skb(skb_out);
+ nlmsg_free(skb_out);
/* Fall through */
fail:
+ rcu_read_unlock();
return res;
}
-
-static const struct genl_ops hsr_ops[] = {
+static const struct genl_small_ops hsr_ops[] = {
{
.cmd = HSR_C_GET_NODE_STATUS,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = 0,
- .policy = hsr_genl_policy,
.doit = hsr_get_node_status,
.dumpit = NULL,
},
{
.cmd = HSR_C_GET_NODE_LIST,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = 0,
- .policy = hsr_genl_policy,
.doit = hsr_get_node_list,
.dumpit = NULL,
},
@@ -467,9 +553,12 @@ static struct genl_family hsr_genl_family __ro_after_init = {
.name = "HSR",
.version = 1,
.maxattr = HSR_A_MAX,
+ .policy = hsr_genl_policy,
+ .netnsok = true,
.module = THIS_MODULE,
- .ops = hsr_ops,
- .n_ops = ARRAY_SIZE(hsr_ops),
+ .small_ops = hsr_ops,
+ .n_small_ops = ARRAY_SIZE(hsr_ops),
+ .resv_start_op = HSR_C_SET_NODE_LIST + 1,
.mcgrps = hsr_mcgrps,
.n_mcgrps = ARRAY_SIZE(hsr_mcgrps),
};
@@ -486,6 +575,7 @@ int __init hsr_netlink_init(void)
if (rc)
goto fail_genl_register_family;
+ hsr_debugfs_create_root();
return 0;
fail_genl_register_family:
diff --git a/net/hsr/hsr_netlink.h b/net/hsr/hsr_netlink.h
index 3f6b95b5b6b8..8c99e64e1cea 100644
--- a/net/hsr/hsr_netlink.h
+++ b/net/hsr/hsr_netlink.h
@@ -1,12 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2011-2014 Autronica Fire and Security AS
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
* Author(s):
* 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * include file for HSR and PRP.
*/
#ifndef __HSR_NETLINK_H
@@ -25,7 +23,5 @@ void __exit hsr_netlink_exit(void);
void hsr_nl_ringerror(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN],
struct hsr_port *port);
void hsr_nl_nodedown(struct hsr_priv *hsr, unsigned char addr[ETH_ALEN]);
-void hsr_nl_framedrop(int dropcount, int dev_idx);
-void hsr_nl_linkdown(int dev_idx);
#endif /* __HSR_NETLINK_H */
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index 56080da4aa77..afe06ba00ea4 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0
/* Copyright 2011-2014 Autronica Fire and Security AS
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
* Author(s):
* 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * Frame handler and other utility functions for HSR and PRP.
*/
#include "hsr_slave.h"
@@ -18,20 +16,31 @@
#include "hsr_forward.h"
#include "hsr_framereg.h"
+bool hsr_invalid_dan_ingress_frame(__be16 protocol)
+{
+ return (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR));
+}
static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
{
struct sk_buff *skb = *pskb;
struct hsr_port *port;
- u16 protocol;
+ struct hsr_priv *hsr;
+ __be16 protocol;
+
+ /* Packets from dev_loopback_xmit() do not have L2 header, bail out */
+ if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
+ return RX_HANDLER_PASS;
if (!skb_mac_header_was_set(skb)) {
WARN_ONCE(1, "%s: skb invalid", __func__);
return RX_HANDLER_PASS;
}
- rcu_read_lock(); /* hsr->node_db, hsr->ports */
port = hsr_port_get_rcu(skb->dev);
+ if (!port)
+ goto finish_pass;
+ hsr = port->hsr;
if (hsr_addr_is_self(port->hsr, eth_hdr(skb)->h_source)) {
/* Directly kill frames sent by ourselves */
@@ -39,20 +48,46 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb)
goto finish_consume;
}
+ /* For HSR, only tagged frames are expected (unless the device offloads
+ * HSR tag removal), but for PRP there could be non tagged frames as
+ * well from Single attached nodes (SANs).
+ */
protocol = eth_hdr(skb)->h_proto;
- if (protocol != htons(ETH_P_PRP) && protocol != htons(ETH_P_HSR))
+
+ if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) &&
+ port->type != HSR_PT_INTERLINK &&
+ hsr->proto_ops->invalid_dan_ingress_frame &&
+ hsr->proto_ops->invalid_dan_ingress_frame(protocol))
goto finish_pass;
skb_push(skb, ETH_HLEN);
+ skb_reset_mac_header(skb);
+ if ((!hsr->prot_version && protocol == htons(ETH_P_PRP)) ||
+ protocol == htons(ETH_P_HSR)) {
+ if (!pskb_may_pull(skb, ETH_HLEN + HSR_HLEN)) {
+ kfree_skb(skb);
+ goto finish_consume;
+ }
+
+ skb_set_network_header(skb, ETH_HLEN + HSR_HLEN);
+ }
+ skb_reset_mac_len(skb);
- hsr_forward_skb(skb, port);
+ /* Only frames received over the interlink port get a sequence number
+ * assigned and therefore require synchronisation with other senders.
+ */
+ if (port->type == HSR_PT_INTERLINK) {
+ spin_lock_bh(&hsr->seqnr_lock);
+ hsr_forward_skb(skb, port);
+ spin_unlock_bh(&hsr->seqnr_lock);
+ } else {
+ hsr_forward_skb(skb, port);
+ }
finish_consume:
- rcu_read_unlock(); /* hsr->node_db, hsr->ports */
return RX_HANDLER_CONSUMED;
finish_pass:
- rcu_read_unlock(); /* hsr->node_db, hsr->ports */
return RX_HANDLER_PASS;
}
@@ -61,34 +96,37 @@ bool hsr_port_exists(const struct net_device *dev)
return rcu_access_pointer(dev->rx_handler) == hsr_handle_frame;
}
-
-static int hsr_check_dev_ok(struct net_device *dev)
+static int hsr_check_dev_ok(struct net_device *dev,
+ struct netlink_ext_ack *extack)
{
/* Don't allow HSR on non-ethernet like devices */
- if ((dev->flags & IFF_LOOPBACK) || (dev->type != ARPHRD_ETHER) ||
- (dev->addr_len != ETH_ALEN)) {
- netdev_info(dev, "Cannot use loopback or non-ethernet device as HSR slave.\n");
+ if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER ||
+ dev->addr_len != ETH_ALEN) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot use loopback or non-ethernet device as HSR slave.");
return -EINVAL;
}
/* Don't allow enslaving hsr devices */
if (is_hsr_master(dev)) {
- netdev_info(dev, "Cannot create trees of HSR devices.\n");
+ NL_SET_ERR_MSG_MOD(extack,
+ "Cannot create trees of HSR devices.");
return -EINVAL;
}
if (hsr_port_exists(dev)) {
- netdev_info(dev, "This device is already a HSR slave.\n");
+ NL_SET_ERR_MSG_MOD(extack,
+ "This device is already a HSR slave.");
return -EINVAL;
}
if (is_vlan_dev(dev)) {
- netdev_info(dev, "HSR on top of VLAN is not yet supported in this driver.\n");
+ NL_SET_ERR_MSG_MOD(extack, "HSR on top of VLAN is not yet supported in this driver.");
return -EINVAL;
}
if (dev->priv_flags & IFF_DONT_BRIDGE) {
- netdev_info(dev, "This device does not support bridging.\n");
+ NL_SET_ERR_MSG_MOD(extack,
+ "This device does not support bridging.");
return -EOPNOTSUPP;
}
@@ -99,21 +137,34 @@ static int hsr_check_dev_ok(struct net_device *dev)
return 0;
}
-
/* Setup device to be added to the HSR bridge. */
-static int hsr_portdev_setup(struct net_device *dev, struct hsr_port *port)
+static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev,
+ struct hsr_port *port,
+ struct netlink_ext_ack *extack)
+
{
+ struct netdev_lag_upper_info lag_upper_info;
+ struct net_device *hsr_dev;
+ struct hsr_port *master;
int res;
- dev_hold(dev);
- res = dev_set_promiscuity(dev, 1);
- if (res)
- goto fail_promiscuity;
-
- /* FIXME:
- * What does net device "adjacency" mean? Should we do
- * res = netdev_master_upper_dev_link(port->dev, port->hsr->dev); ?
+ /* Don't use promiscuous mode when forwarding is offloaded, since L2
+ * frame forwarding then happens in the offloading hardware.
*/
+ if (!port->hsr->fwd_offloaded) {
+ res = dev_set_promiscuity(dev, 1);
+ if (res)
+ return res;
+ }
+
+ master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
+ hsr_dev = master->dev;
+
+ lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_BROADCAST;
+ lag_upper_info.hash_type = NETDEV_LAG_HASH_UNKNOWN;
+ res = netdev_master_upper_dev_link(dev, hsr_dev, NULL, &lag_upper_info, extack);
+ if (res)
+ goto fail_upper_dev_link;
res = netdev_rx_handler_register(dev, hsr_handle_frame, port);
if (res)
@@ -123,45 +174,46 @@ static int hsr_portdev_setup(struct net_device *dev, struct hsr_port *port)
return 0;
fail_rx_handler:
- dev_set_promiscuity(dev, -1);
-fail_promiscuity:
- dev_put(dev);
+ netdev_upper_dev_unlink(dev, hsr_dev);
+fail_upper_dev_link:
+ if (!port->hsr->fwd_offloaded)
+ dev_set_promiscuity(dev, -1);
return res;
}
int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
- enum hsr_port_type type)
+ enum hsr_port_type type, struct netlink_ext_ack *extack)
{
struct hsr_port *port, *master;
int res;
if (type != HSR_PT_MASTER) {
- res = hsr_check_dev_ok(dev);
+ res = hsr_check_dev_ok(dev, extack);
if (res)
return res;
}
port = hsr_port_get_hsr(hsr, type);
- if (port != NULL)
+ if (port)
return -EBUSY; /* This port already exists */
port = kzalloc(sizeof(*port), GFP_KERNEL);
- if (port == NULL)
+ if (!port)
return -ENOMEM;
- if (type != HSR_PT_MASTER) {
- res = hsr_portdev_setup(dev, port);
- if (res)
- goto fail_dev_setup;
- }
-
port->hsr = hsr;
port->dev = dev;
port->type = type;
+ ether_addr_copy(port->original_macaddress, dev->dev_addr);
list_add_tail_rcu(&port->port_list, &hsr->ports);
- synchronize_rcu();
+
+ if (type != HSR_PT_MASTER) {
+ res = hsr_portdev_setup(hsr, dev, port, extack);
+ if (res)
+ goto fail_dev_setup;
+ }
master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
netdev_update_features(master->dev);
@@ -170,7 +222,8 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
return 0;
fail_dev_setup:
- kfree(port);
+ list_del_rcu(&port->port_list);
+ kfree_rcu(port, rcu);
return res;
}
@@ -184,20 +237,14 @@ void hsr_del_port(struct hsr_port *port)
list_del_rcu(&port->port_list);
if (port != master) {
- if (master != NULL) {
- netdev_update_features(master->dev);
- dev_set_mtu(master->dev, hsr_get_max_mtu(hsr));
- }
+ netdev_update_features(master->dev);
+ dev_set_mtu(master->dev, hsr_get_max_mtu(hsr));
netdev_rx_handler_unregister(port->dev);
- dev_set_promiscuity(port->dev, -1);
+ if (!port->hsr->fwd_offloaded)
+ dev_set_promiscuity(port->dev, -1);
+ netdev_upper_dev_unlink(port->dev, master->dev);
+ eth_hw_addr_set(port->dev, port->original_macaddress);
}
- /* FIXME?
- * netdev_upper_dev_unlink(port->dev, port->hsr->dev);
- */
-
- synchronize_rcu();
-
- if (port != master)
- dev_put(port->dev);
+ kfree_rcu(port, rcu);
}
diff --git a/net/hsr/hsr_slave.h b/net/hsr/hsr_slave.h
index 3ccfbf71c92e..edc4612bb009 100644
--- a/net/hsr/hsr_slave.h
+++ b/net/hsr/hsr_slave.h
@@ -1,12 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2011-2014 Autronica Fire and Security AS
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
- * Author(s):
* 2011-2014 Arvid Brodin, arvid.brodin@alten.se
+ *
+ * include file for HSR and PRP.
*/
#ifndef __HSR_SLAVE_H
@@ -18,7 +15,7 @@
#include "hsr_main.h"
int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
- enum hsr_port_type pt);
+ enum hsr_port_type pt, struct netlink_ext_ack *extack);
void hsr_del_port(struct hsr_port *port);
bool hsr_port_exists(const struct net_device *dev);
@@ -35,4 +32,6 @@ static inline struct hsr_port *hsr_port_get_rcu(const struct net_device *dev)
rcu_dereference(dev->rx_handler_data) : NULL;
}
+bool hsr_invalid_dan_ingress_frame(__be16 protocol);
+
#endif /* __HSR_SLAVE_H */
diff --git a/net/hsr/prp_dup_discard_test.c b/net/hsr/prp_dup_discard_test.c
new file mode 100644
index 000000000000..e86b7b633ae8
--- /dev/null
+++ b/net/hsr/prp_dup_discard_test.c
@@ -0,0 +1,212 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <kunit/test.h>
+
+#include "hsr_main.h"
+#include "hsr_framereg.h"
+
+struct prp_test_data {
+ struct hsr_port port;
+ struct hsr_port port_rcv;
+ struct hsr_frame_info frame;
+ struct hsr_node node;
+};
+
+static struct prp_test_data *build_prp_test_data(struct kunit *test)
+{
+ struct prp_test_data *data = kunit_kzalloc(test,
+ sizeof(struct prp_test_data), GFP_USER);
+ KUNIT_EXPECT_NOT_ERR_OR_NULL(test, data);
+
+ data->frame.node_src = &data->node;
+ data->frame.port_rcv = &data->port_rcv;
+ data->port_rcv.type = HSR_PT_SLAVE_A;
+ data->node.seq_start[HSR_PT_SLAVE_A] = 1;
+ data->node.seq_expected[HSR_PT_SLAVE_A] = 1;
+ data->node.seq_start[HSR_PT_SLAVE_B] = 1;
+ data->node.seq_expected[HSR_PT_SLAVE_B] = 1;
+ data->node.seq_out[HSR_PT_MASTER] = 0;
+ data->node.time_out[HSR_PT_MASTER] = jiffies;
+ data->port.type = HSR_PT_MASTER;
+
+ return data;
+}
+
+static void check_prp_counters(struct kunit *test,
+ struct prp_test_data *data,
+ u16 seq_start_a, u16 seq_expected_a,
+ u16 seq_start_b, u16 seq_expected_b)
+{
+ KUNIT_EXPECT_EQ(test, data->node.seq_start[HSR_PT_SLAVE_A],
+ seq_start_a);
+ KUNIT_EXPECT_EQ(test, data->node.seq_start[HSR_PT_SLAVE_B],
+ seq_start_b);
+ KUNIT_EXPECT_EQ(test, data->node.seq_expected[HSR_PT_SLAVE_A],
+ seq_expected_a);
+ KUNIT_EXPECT_EQ(test, data->node.seq_expected[HSR_PT_SLAVE_B],
+ seq_expected_b);
+}
+
+static void prp_dup_discard_forward(struct kunit *test)
+{
+ /* Normal situation, both LANs in sync. Next frame is forwarded */
+ struct prp_test_data *data = build_prp_test_data(test);
+
+ data->frame.sequence_nr = 2;
+ KUNIT_EXPECT_EQ(test, 0,
+ prp_register_frame_out(&data->port, &data->frame));
+ KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
+ data->node.seq_out[HSR_PT_MASTER]);
+ KUNIT_EXPECT_EQ(test, jiffies, data->node.time_out[HSR_PT_MASTER]);
+ check_prp_counters(test, data, data->frame.sequence_nr,
+ data->frame.sequence_nr + 1, 1, 1);
+}
+
+static void prp_dup_discard_inside_dropwindow(struct kunit *test)
+{
+ /* Normal situation, other LAN ahead by one. Frame is dropped */
+ struct prp_test_data *data = build_prp_test_data(test);
+ unsigned long time = jiffies - 10;
+
+ data->frame.sequence_nr = 1;
+ data->node.seq_expected[HSR_PT_SLAVE_B] = 3;
+ data->node.seq_out[HSR_PT_MASTER] = 2;
+ data->node.time_out[HSR_PT_MASTER] = time;
+
+ KUNIT_EXPECT_EQ(test, 1,
+ prp_register_frame_out(&data->port, &data->frame));
+ KUNIT_EXPECT_EQ(test, 2, data->node.seq_out[HSR_PT_MASTER]);
+ KUNIT_EXPECT_EQ(test, time, data->node.time_out[HSR_PT_MASTER]);
+ check_prp_counters(test, data, 2, 2, 2, 3);
+}
+
+static void prp_dup_discard_node_timeout(struct kunit *test)
+{
+ /* Timeout situation, node hasn't sent anything for a while */
+ struct prp_test_data *data = build_prp_test_data(test);
+
+ data->frame.sequence_nr = 7;
+ data->node.seq_start[HSR_PT_SLAVE_A] = 1234;
+ data->node.seq_expected[HSR_PT_SLAVE_A] = 1235;
+ data->node.seq_start[HSR_PT_SLAVE_B] = 1234;
+ data->node.seq_expected[HSR_PT_SLAVE_B] = 1234;
+ data->node.seq_out[HSR_PT_MASTER] = 1234;
+ data->node.time_out[HSR_PT_MASTER] =
+ jiffies - msecs_to_jiffies(HSR_ENTRY_FORGET_TIME) - 1;
+
+ KUNIT_EXPECT_EQ(test, 0,
+ prp_register_frame_out(&data->port, &data->frame));
+ KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
+ data->node.seq_out[HSR_PT_MASTER]);
+ KUNIT_EXPECT_EQ(test, jiffies, data->node.time_out[HSR_PT_MASTER]);
+ check_prp_counters(test, data, data->frame.sequence_nr,
+ data->frame.sequence_nr + 1, 1234, 1234);
+}
+
+static void prp_dup_discard_out_of_sequence(struct kunit *test)
+{
+ /* One frame is received out of sequence on both LANs */
+ struct prp_test_data *data = build_prp_test_data(test);
+
+ data->node.seq_start[HSR_PT_SLAVE_A] = 10;
+ data->node.seq_expected[HSR_PT_SLAVE_A] = 10;
+ data->node.seq_start[HSR_PT_SLAVE_B] = 10;
+ data->node.seq_expected[HSR_PT_SLAVE_B] = 10;
+ data->node.seq_out[HSR_PT_MASTER] = 9;
+
+ /* 1st old frame, should be accepted */
+ data->frame.sequence_nr = 8;
+ KUNIT_EXPECT_EQ(test, 0,
+ prp_register_frame_out(&data->port, &data->frame));
+ KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
+ data->node.seq_out[HSR_PT_MASTER]);
+ check_prp_counters(test, data, data->frame.sequence_nr,
+ data->frame.sequence_nr + 1, 10, 10);
+
+ /* 2nd frame should be dropped */
+ data->frame.sequence_nr = 8;
+ data->port_rcv.type = HSR_PT_SLAVE_B;
+ KUNIT_EXPECT_EQ(test, 1,
+ prp_register_frame_out(&data->port, &data->frame));
+ check_prp_counters(test, data, data->frame.sequence_nr + 1,
+ data->frame.sequence_nr + 1,
+ data->frame.sequence_nr + 1,
+ data->frame.sequence_nr + 1);
+
+ /* Next frame, this is forwarded */
+ data->frame.sequence_nr = 10;
+ data->port_rcv.type = HSR_PT_SLAVE_A;
+ KUNIT_EXPECT_EQ(test, 0,
+ prp_register_frame_out(&data->port, &data->frame));
+ KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
+ data->node.seq_out[HSR_PT_MASTER]);
+ check_prp_counters(test, data, data->frame.sequence_nr,
+ data->frame.sequence_nr + 1, 9, 9);
+
+ /* and next one is dropped */
+ data->frame.sequence_nr = 10;
+ data->port_rcv.type = HSR_PT_SLAVE_B;
+ KUNIT_EXPECT_EQ(test, 1,
+ prp_register_frame_out(&data->port, &data->frame));
+ check_prp_counters(test, data, data->frame.sequence_nr + 1,
+ data->frame.sequence_nr + 1,
+ data->frame.sequence_nr + 1,
+ data->frame.sequence_nr + 1);
+}
+
+static void prp_dup_discard_lan_b_late(struct kunit *test)
+{
+ /* LAN B is behind */
+ struct prp_test_data *data = build_prp_test_data(test);
+
+ data->node.seq_start[HSR_PT_SLAVE_A] = 9;
+ data->node.seq_expected[HSR_PT_SLAVE_A] = 9;
+ data->node.seq_start[HSR_PT_SLAVE_B] = 9;
+ data->node.seq_expected[HSR_PT_SLAVE_B] = 9;
+ data->node.seq_out[HSR_PT_MASTER] = 8;
+
+ data->frame.sequence_nr = 9;
+ KUNIT_EXPECT_EQ(test, 0,
+ prp_register_frame_out(&data->port, &data->frame));
+ KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
+ data->node.seq_out[HSR_PT_MASTER]);
+ check_prp_counters(test, data, 9, 10, 9, 9);
+
+ data->frame.sequence_nr = 10;
+ KUNIT_EXPECT_EQ(test, 0,
+ prp_register_frame_out(&data->port, &data->frame));
+ KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
+ data->node.seq_out[HSR_PT_MASTER]);
+ check_prp_counters(test, data, 9, 11, 9, 9);
+
+ data->frame.sequence_nr = 9;
+ data->port_rcv.type = HSR_PT_SLAVE_B;
+ KUNIT_EXPECT_EQ(test, 1,
+ prp_register_frame_out(&data->port, &data->frame));
+ check_prp_counters(test, data, 10, 11, 10, 10);
+
+ data->frame.sequence_nr = 10;
+ data->port_rcv.type = HSR_PT_SLAVE_B;
+ KUNIT_EXPECT_EQ(test, 1,
+ prp_register_frame_out(&data->port, &data->frame));
+ check_prp_counters(test, data, 11, 11, 11, 11);
+}
+
+static struct kunit_case prp_dup_discard_test_cases[] = {
+ KUNIT_CASE(prp_dup_discard_forward),
+ KUNIT_CASE(prp_dup_discard_inside_dropwindow),
+ KUNIT_CASE(prp_dup_discard_node_timeout),
+ KUNIT_CASE(prp_dup_discard_out_of_sequence),
+ KUNIT_CASE(prp_dup_discard_lan_b_late),
+ {}
+};
+
+static struct kunit_suite prp_dup_discard_suite = {
+ .name = "prp_duplicate_discard",
+ .test_cases = prp_dup_discard_test_cases,
+};
+
+kunit_test_suite(prp_dup_discard_suite);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("KUnit tests for PRP duplicate discard");
+MODULE_AUTHOR("Jaakko Karrenpalo <jkarrenpalo@gmail.com>");
diff --git a/net/ieee802154/6lowpan/Kconfig b/net/ieee802154/6lowpan/Kconfig
index d24f985b0bfd..e808e4db2678 100644
--- a/net/ieee802154/6lowpan/Kconfig
+++ b/net/ieee802154/6lowpan/Kconfig
@@ -1,5 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
config IEEE802154_6LOWPAN
tristate "6lowpan support over IEEE 802.15.4"
depends on 6LOWPAN
- ---help---
+ help
IPv6 compression over IEEE 802.15.4.
diff --git a/net/ieee802154/6lowpan/Makefile b/net/ieee802154/6lowpan/Makefile
index 6bfb270a81a6..f11d6376a891 100644
--- a/net/ieee802154/6lowpan/Makefile
+++ b/net/ieee802154/6lowpan/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_IEEE802154_6LOWPAN) += ieee802154_6lowpan.o
ieee802154_6lowpan-y := core.o rx.o reassembly.o tx.o
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index 3297e7fa9945..018929563c6b 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -47,8 +47,10 @@
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/ieee802154.h>
+#include <linux/if_arp.h>
#include <net/ipv6.h>
+#include <net/netdev_lock.h>
#include "6lowpan_i.h"
@@ -92,7 +94,7 @@ static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n)
static int lowpan_get_iflink(const struct net_device *dev)
{
- return lowpan_802154_dev(dev)->wdev->ifindex;
+ return READ_ONCE(lowpan_802154_dev(dev)->wdev->ifindex);
}
static const struct net_device_ops lowpan_netdev_ops = {
@@ -115,7 +117,7 @@ static void lowpan_setup(struct net_device *ldev)
ldev->netdev_ops = &lowpan_netdev_ops;
ldev->header_ops = &lowpan_header_ops;
ldev->needs_free_netdev = true;
- ldev->features |= NETIF_F_NETNS_LOCAL;
+ ldev->netns_immutable = true;
}
static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -128,10 +130,11 @@ static int lowpan_validate(struct nlattr *tb[], struct nlattr *data[],
return 0;
}
-static int lowpan_newlink(struct net *src_net, struct net_device *ldev,
- struct nlattr *tb[], struct nlattr *data[],
+static int lowpan_newlink(struct net_device *ldev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
+ struct nlattr **tb = params->tb;
struct net_device *wdev;
int ret;
@@ -141,6 +144,8 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev,
if (!tb[IFLA_LINK])
return -EINVAL;
+ if (params->link_net && !net_eq(params->link_net, dev_net(ldev)))
+ return -EINVAL;
/* find and hold wpan device */
wdev = dev_get_by_index(dev_net(ldev), nla_get_u32(tb[IFLA_LINK]));
if (!wdev)
@@ -157,7 +162,7 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev,
lowpan_802154_dev(ldev)->wdev = wdev;
/* Set the lowpan hardware address to the wpan hardware address. */
- memcpy(ldev->dev_addr, wdev->dev_addr, IEEE802154_ADDR_LEN);
+ __dev_addr_set(ldev, wdev->dev_addr, IEEE802154_ADDR_LEN);
/* We need headroom for possible wpan_dev_hard_header call and tailroom
* for encryption/fcs handling. The lowpan interface will replace
* the IPv6 header with 6LoWPAN header. At worst case the 6LoWPAN
@@ -279,5 +284,6 @@ static void __exit lowpan_cleanup_module(void)
module_init(lowpan_init_module);
module_exit(lowpan_cleanup_module);
+MODULE_DESCRIPTION("IPv6 over Low power Wireless Personal Area Network IEEE 802.15.4 core");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("lowpan");
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index e7857a8ac86d..ddb6a5817d09 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -1,15 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* 6LoWPAN fragment reassembly
*
- *
* Authors:
* Alexander Aring <aar@pengutronix.de>
*
* Based on: net/ipv6/reassembly.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "6LoWPAN: " fmt
@@ -27,6 +22,7 @@
#include <net/6lowpan.h>
#include <net/ipv6_frag.h>
#include <net/inet_frag.h>
+#include <net/ip.h>
#include "6lowpan_i.h"
@@ -34,8 +30,9 @@ static const char lowpan_frags_cache_name[] = "lowpan-frags";
static struct inet_frags lowpan_frags;
-static int lowpan_frag_reasm(struct lowpan_frag_queue *fq,
- struct sk_buff *prev, struct net_device *ldev);
+static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb,
+ struct sk_buff *prev, struct net_device *ldev,
+ int *refs);
static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
{
@@ -47,8 +44,9 @@ static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
static void lowpan_frag_expire(struct timer_list *t)
{
- struct inet_frag_queue *frag = from_timer(frag, t, timer);
+ struct inet_frag_queue *frag = timer_container_of(frag, t, timer);
struct frag_queue *fq;
+ int refs = 1;
fq = container_of(frag, struct frag_queue, q);
@@ -57,10 +55,10 @@ static void lowpan_frag_expire(struct timer_list *t)
if (fq->q.flags & INET_FRAG_COMPLETE)
goto out;
- inet_frag_kill(&fq->q);
+ inet_frag_kill(&fq->q, &refs);
out:
spin_unlock(&fq->q.lock);
- inet_frag_put(&fq->q);
+ inet_frag_putn(&fq->q, refs);
}
static inline struct lowpan_frag_queue *
@@ -78,7 +76,7 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
key.src = *src;
key.dst = *dst;
- q = inet_frag_find(&ieee802154_lowpan->frags, &key);
+ q = inet_frag_find(ieee802154_lowpan->fqdir, &key);
if (!q)
return NULL;
@@ -86,11 +84,18 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
}
static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
- struct sk_buff *skb, u8 frag_type)
+ struct sk_buff *skb, u8 frag_type,
+ int *refs)
{
- struct sk_buff *prev, *next;
+ struct sk_buff *prev_tail;
struct net_device *ldev;
- int end, offset;
+ int end, offset, err;
+
+ /* inet_frag_queue_* functions use skb->cb; see struct ipfrag_skb_cb
+ * in inet_fragment.c
+ */
+ BUILD_BUG_ON(sizeof(struct lowpan_802154_cb) > sizeof(struct inet_skb_parm));
+ BUILD_BUG_ON(sizeof(struct lowpan_802154_cb) > sizeof(struct inet6_skb_parm));
if (fq->q.flags & INET_FRAG_COMPLETE)
goto err;
@@ -117,45 +122,23 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq,
}
}
- /* Find out which fragments are in front and at the back of us
- * in the chain of fragments so far. We must know where to put
- * this fragment, right?
- */
- prev = fq->q.fragments_tail;
- if (!prev ||
- lowpan_802154_cb(prev)->d_offset <
- lowpan_802154_cb(skb)->d_offset) {
- next = NULL;
- goto found;
- }
- prev = NULL;
- for (next = fq->q.fragments; next != NULL; next = next->next) {
- if (lowpan_802154_cb(next)->d_offset >=
- lowpan_802154_cb(skb)->d_offset)
- break; /* bingo! */
- prev = next;
- }
-
-found:
- /* Insert this fragment in the chain of fragments. */
- skb->next = next;
- if (!next)
- fq->q.fragments_tail = skb;
- if (prev)
- prev->next = skb;
- else
- fq->q.fragments = skb;
-
ldev = skb->dev;
if (ldev)
skb->dev = NULL;
+ barrier();
+
+ prev_tail = fq->q.fragments_tail;
+ err = inet_frag_queue_insert(&fq->q, skb, offset, end);
+ if (err)
+ goto err;
fq->q.stamp = skb->tstamp;
+ fq->q.tstamp_type = skb->tstamp_type;
if (frag_type == LOWPAN_DISPATCH_FRAG1)
fq->q.flags |= INET_FRAG_FIRST_IN;
fq->q.meat += skb->len;
- add_frag_mem_limit(fq->q.net, skb->truesize);
+ add_frag_mem_limit(fq->q.fqdir, skb->truesize);
if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
fq->q.meat == fq->q.len) {
@@ -163,10 +146,11 @@ found:
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- res = lowpan_frag_reasm(fq, prev, ldev);
+ res = lowpan_frag_reasm(fq, skb, prev_tail, ldev, refs);
skb->_skb_refdst = orefdst;
return res;
}
+ skb_dst_drop(skb);
return -1;
err:
@@ -175,97 +159,29 @@ err:
}
/* Check if this packet is complete.
- * Returns NULL on failure by any reason, and pointer
- * to current nexthdr field in reassembled frame.
*
* It is called with locked fq, and caller must check that
* queue is eligible for reassembly i.e. it is not COMPLETE,
* the last and the first frames arrived and all the bits are here.
*/
-static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
- struct net_device *ldev)
+static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb,
+ struct sk_buff *prev_tail, struct net_device *ldev,
+ int *refs)
{
- struct sk_buff *fp, *head = fq->q.fragments;
- int sum_truesize;
-
- inet_frag_kill(&fq->q);
-
- /* Make the one we just received the head. */
- if (prev) {
- head = prev->next;
- fp = skb_clone(head, GFP_ATOMIC);
-
- if (!fp)
- goto out_oom;
-
- fp->next = head->next;
- if (!fp->next)
- fq->q.fragments_tail = fp;
- prev->next = fp;
+ void *reasm_data;
- skb_morph(head, fq->q.fragments);
- head->next = fq->q.fragments->next;
+ inet_frag_kill(&fq->q, refs);
- consume_skb(fq->q.fragments);
- fq->q.fragments = head;
- }
-
- /* Head of list must not be cloned. */
- if (skb_unclone(head, GFP_ATOMIC))
+ reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail);
+ if (!reasm_data)
goto out_oom;
+ inet_frag_reasm_finish(&fq->q, skb, reasm_data, false);
- /* If the first fragment is fragmented itself, we split
- * it to two chunks: the first with data and paged part
- * and the second, holding only fragments.
- */
- if (skb_has_frag_list(head)) {
- struct sk_buff *clone;
- int i, plen = 0;
-
- clone = alloc_skb(0, GFP_ATOMIC);
- if (!clone)
- goto out_oom;
- clone->next = head->next;
- head->next = clone;
- skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
- skb_frag_list_init(head);
- for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
- plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
- clone->len = head->data_len - plen;
- clone->data_len = clone->len;
- head->data_len -= clone->len;
- head->len -= clone->len;
- add_frag_mem_limit(fq->q.net, clone->truesize);
- }
-
- WARN_ON(head == NULL);
-
- sum_truesize = head->truesize;
- for (fp = head->next; fp;) {
- bool headstolen;
- int delta;
- struct sk_buff *next = fp->next;
-
- sum_truesize += fp->truesize;
- if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
- kfree_skb_partial(fp, headstolen);
- } else {
- if (!skb_shinfo(head)->frag_list)
- skb_shinfo(head)->frag_list = fp;
- head->data_len += fp->len;
- head->len += fp->len;
- head->truesize += fp->truesize;
- }
- fp = next;
- }
- sub_frag_mem_limit(fq->q.net, sum_truesize);
-
- head->next = NULL;
- head->dev = ldev;
- head->tstamp = fq->q.stamp;
-
- fq->q.fragments = NULL;
+ skb->dev = ldev;
+ skb->tstamp = fq->q.stamp;
+ fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL;
+ fq->q.last_run_head = NULL;
return 1;
out_oom:
@@ -284,7 +200,7 @@ static int lowpan_frag_rx_handlers_result(struct sk_buff *skb,
net_warn_ratelimited("%s: received unknown dispatch\n",
__func__);
- /* fall-through */
+ fallthrough;
default:
/* all others failure */
return NET_RX_DROP;
@@ -388,17 +304,20 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
goto err;
}
+ rcu_read_lock();
fq = fq_find(net, cb, &hdr.source, &hdr.dest);
if (fq != NULL) {
- int ret;
+ int ret, refs = 0;
spin_lock(&fq->q.lock);
- ret = lowpan_frag_queue(fq, skb, frag_type);
+ ret = lowpan_frag_queue(fq, skb, frag_type, &refs);
spin_unlock(&fq->q.lock);
- inet_frag_put(&fq->q);
+ rcu_read_unlock();
+ inet_frag_putn(&fq->q, refs);
return ret;
}
+ rcu_read_unlock();
err:
kfree_skb(skb);
@@ -410,28 +329,22 @@ err:
static struct ctl_table lowpan_frags_ns_ctl_table[] = {
{
.procname = "6lowpanfrag_high_thresh",
- .data = &init_net.ieee802154_lowpan.frags.high_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &init_net.ieee802154_lowpan.frags.low_thresh
},
{
.procname = "6lowpanfrag_low_thresh",
- .data = &init_net.ieee802154_lowpan.frags.low_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra2 = &init_net.ieee802154_lowpan.frags.high_thresh
},
{
.procname = "6lowpanfrag_time",
- .data = &init_net.ieee802154_lowpan.frags.timeout,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- { }
};
/* secret interval has been deprecated */
@@ -444,7 +357,6 @@ static struct ctl_table lowpan_frags_ctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- { }
};
static int __net_init lowpan_frags_ns_sysctl_register(struct net *net)
@@ -453,6 +365,7 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net)
struct ctl_table_header *hdr;
struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net);
+ size_t table_size = ARRAY_SIZE(lowpan_frags_ns_ctl_table);
table = lowpan_frags_ns_ctl_table;
if (!net_eq(net, &init_net)) {
@@ -461,19 +374,19 @@ static int __net_init lowpan_frags_ns_sysctl_register(struct net *net)
if (table == NULL)
goto err_alloc;
- table[0].data = &ieee802154_lowpan->frags.high_thresh;
- table[0].extra1 = &ieee802154_lowpan->frags.low_thresh;
- table[0].extra2 = &init_net.ieee802154_lowpan.frags.high_thresh;
- table[1].data = &ieee802154_lowpan->frags.low_thresh;
- table[1].extra2 = &ieee802154_lowpan->frags.high_thresh;
- table[2].data = &ieee802154_lowpan->frags.timeout;
-
/* Don't export sysctls to unprivileged users */
if (net->user_ns != &init_user_ns)
- table[0].procname = NULL;
+ table_size = 0;
}
- hdr = register_net_sysctl(net, "net/ieee802154/6lowpan", table);
+ table[0].data = &ieee802154_lowpan->fqdir->high_thresh;
+ table[0].extra1 = &ieee802154_lowpan->fqdir->low_thresh;
+ table[1].data = &ieee802154_lowpan->fqdir->low_thresh;
+ table[1].extra2 = &ieee802154_lowpan->fqdir->high_thresh;
+ table[2].data = &ieee802154_lowpan->fqdir->timeout;
+
+ hdr = register_net_sysctl_sz(net, "net/ieee802154/6lowpan", table,
+ table_size);
if (hdr == NULL)
goto err_reg;
@@ -489,7 +402,7 @@ err_alloc:
static void __net_exit lowpan_frags_ns_sysctl_unregister(struct net *net)
{
- struct ctl_table *table;
+ const struct ctl_table *table;
struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net);
@@ -539,32 +452,42 @@ static int __net_init lowpan_frags_init_net(struct net *net)
net_ieee802154_lowpan(net);
int res;
- ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
- ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
- ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
- ieee802154_lowpan->frags.f = &lowpan_frags;
- res = inet_frags_init_net(&ieee802154_lowpan->frags);
+ res = fqdir_init(&ieee802154_lowpan->fqdir, &lowpan_frags, net);
if (res < 0)
return res;
+
+ ieee802154_lowpan->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+ ieee802154_lowpan->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+ ieee802154_lowpan->fqdir->timeout = IPV6_FRAG_TIMEOUT;
+
res = lowpan_frags_ns_sysctl_register(net);
if (res < 0)
- inet_frags_exit_net(&ieee802154_lowpan->frags);
+ fqdir_exit(ieee802154_lowpan->fqdir);
return res;
}
+static void __net_exit lowpan_frags_pre_exit_net(struct net *net)
+{
+ struct netns_ieee802154_lowpan *ieee802154_lowpan =
+ net_ieee802154_lowpan(net);
+
+ fqdir_pre_exit(ieee802154_lowpan->fqdir);
+}
+
static void __net_exit lowpan_frags_exit_net(struct net *net)
{
struct netns_ieee802154_lowpan *ieee802154_lowpan =
net_ieee802154_lowpan(net);
lowpan_frags_ns_sysctl_unregister(net);
- inet_frags_exit_net(&ieee802154_lowpan->frags);
+ fqdir_exit(ieee802154_lowpan->fqdir);
}
static struct pernet_operations lowpan_frags_ops = {
- .init = lowpan_frags_init_net,
- .exit = lowpan_frags_exit_net,
+ .init = lowpan_frags_init_net,
+ .pre_exit = lowpan_frags_pre_exit_net,
+ .exit = lowpan_frags_exit_net,
};
static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed)
@@ -629,7 +552,7 @@ err_sysctl:
void lowpan_net_frag_exit(void)
{
- inet_frags_fini(&lowpan_frags);
lowpan_frags_sysctl_unregister();
unregister_pernet_subsys(&lowpan_frags_ops);
+ inet_frags_fini(&lowpan_frags);
}
diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c
index 649e7d45e88f..517e6493f5d1 100644
--- a/net/ieee802154/6lowpan/rx.c
+++ b/net/ieee802154/6lowpan/rx.c
@@ -1,12 +1,4 @@
-/* This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/if_arp.h>
@@ -43,11 +35,11 @@ static int lowpan_rx_handlers_result(struct sk_buff *skb, lowpan_rx_result res)
net_warn_ratelimited("%s: received unknown dispatch\n",
__func__);
- /* fall-through */
+ fallthrough;
case RX_DROP_UNUSABLE:
kfree_skb(skb);
- /* fall-through */
+ fallthrough;
case RX_DROP:
return NET_RX_DROP;
case RX_QUEUED:
@@ -248,7 +240,7 @@ static inline bool lowpan_is_reserved(u8 dispatch)
return ((dispatch >= 0x44 && dispatch <= 0x4F) ||
(dispatch >= 0x51 && dispatch <= 0x5F) ||
(dispatch >= 0xc8 && dispatch <= 0xdf) ||
- (dispatch >= 0xe8 && dispatch <= 0xff));
+ dispatch >= 0xe8);
}
/* lowpan_rx_h_check checks on generic 6LoWPAN requirements
diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c
index ca53efa17be1..0c07662b44c0 100644
--- a/net/ieee802154/6lowpan/tx.c
+++ b/net/ieee802154/6lowpan/tx.c
@@ -1,12 +1,4 @@
-/* This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- */
+// SPDX-License-Identifier: GPL-2.0-only
#include <net/6lowpan.h>
#include <net/ndisc.h>
@@ -48,6 +40,9 @@ int lowpan_header_create(struct sk_buff *skb, struct net_device *ldev,
const struct ipv6hdr *hdr = ipv6_hdr(skb);
struct neighbour *n;
+ if (!daddr)
+ return -EINVAL;
+
/* TODO:
* if this package isn't ipv6 one, where should it be routed?
*/
diff --git a/net/ieee802154/Kconfig b/net/ieee802154/Kconfig
index 188135bcb803..bcb05ba97686 100644
--- a/net/ieee802154/Kconfig
+++ b/net/ieee802154/Kconfig
@@ -1,6 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
menuconfig IEEE802154
tristate "IEEE Std 802.15.4 Low-Rate Wireless Personal Area Networks support"
- ---help---
+ help
IEEE Std 802.15.4 defines a low data rate, low power and low
complexity short range wireless personal area networks. It was
designed to organise networks of sensors, switches, etc automation
@@ -14,13 +15,13 @@ if IEEE802154
config IEEE802154_NL802154_EXPERIMENTAL
bool "IEEE 802.15.4 experimental netlink support"
- ---help---
+ help
Adds experimental netlink support for nl802154.
config IEEE802154_SOCKET
tristate "IEEE 802.15.4 socket interface"
default y
- ---help---
+ help
Socket interface for IEEE 802.15.4. Contains DGRAM sockets interface
for 802.15.4 dataframes. Also RAW socket interface to build MAC
header from userspace.
diff --git a/net/ieee802154/Makefile b/net/ieee802154/Makefile
index f05b7bdae2aa..7bce67673e83 100644
--- a/net/ieee802154/Makefile
+++ b/net/ieee802154/Makefile
@@ -4,7 +4,7 @@ obj-$(CONFIG_IEEE802154_SOCKET) += ieee802154_socket.o
obj-y += 6lowpan/
ieee802154-y := netlink.o nl-mac.o nl-phy.o nl_policy.o core.o \
- header_ops.o sysfs.o nl802154.o trace.o
+ header_ops.o sysfs.o nl802154.o trace.o pan.o
ieee802154_socket-y := socket.o
CFLAGS_trace.o := -I$(src)
diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
index fe225d9a1877..89b671b12600 100644
--- a/net/ieee802154/core.c
+++ b/net/ieee802154/core.c
@@ -1,15 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2007, 2008, 2009 Siemens AG
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
*/
#include <linux/slab.h>
@@ -32,11 +23,6 @@
LIST_HEAD(cfg802154_rdev_list);
int cfg802154_rdev_list_generation;
-static int wpan_phy_match(struct device *dev, const void *data)
-{
- return !strcmp(dev_name(dev), (const char *)data);
-}
-
struct wpan_phy *wpan_phy_find(const char *str)
{
struct device *dev;
@@ -44,7 +30,7 @@ struct wpan_phy *wpan_phy_find(const char *str)
if (WARN_ON(!str))
return NULL;
- dev = class_find_device(&wpan_phy_class, NULL, str, wpan_phy_match);
+ dev = class_find_device_by_name(&wpan_phy_class, str);
if (!dev)
return NULL;
@@ -143,6 +129,9 @@ wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size)
wpan_phy_net_set(&rdev->wpan_phy, &init_net);
init_waitqueue_head(&rdev->dev_wait);
+ init_waitqueue_head(&rdev->wpan_phy.sync_txq);
+
+ spin_lock_init(&rdev->wpan_phy.queue_lock);
return &rdev->wpan_phy;
}
@@ -209,6 +198,25 @@ void wpan_phy_free(struct wpan_phy *phy)
}
EXPORT_SYMBOL(wpan_phy_free);
+static void cfg802154_free_peer_structures(struct wpan_dev *wpan_dev)
+{
+ struct ieee802154_pan_device *child, *tmp;
+
+ mutex_lock(&wpan_dev->association_lock);
+
+ kfree(wpan_dev->parent);
+ wpan_dev->parent = NULL;
+
+ list_for_each_entry_safe(child, tmp, &wpan_dev->children, node) {
+ list_del(&child->node);
+ kfree(child);
+ }
+
+ wpan_dev->nchildren = 0;
+
+ mutex_unlock(&wpan_dev->association_lock);
+}
+
int cfg802154_switch_netns(struct cfg802154_registered_device *rdev,
struct net *net)
{
@@ -218,11 +226,11 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev,
list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) {
if (!wpan_dev->netdev)
continue;
- wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
+ wpan_dev->netdev->netns_immutable = false;
err = dev_change_net_namespace(wpan_dev->netdev, net, "wpan%d");
if (err)
break;
- wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL;
+ wpan_dev->netdev->netns_immutable = true;
}
if (err) {
@@ -234,11 +242,11 @@ int cfg802154_switch_netns(struct cfg802154_registered_device *rdev,
list) {
if (!wpan_dev->netdev)
continue;
- wpan_dev->netdev->features &= ~NETIF_F_NETNS_LOCAL;
+ wpan_dev->netdev->netns_immutable = false;
err = dev_change_net_namespace(wpan_dev->netdev, net,
"wpan%d");
WARN_ON(err);
- wpan_dev->netdev->features |= NETIF_F_NETNS_LOCAL;
+ wpan_dev->netdev->netns_immutable = true;
}
return err;
@@ -283,10 +291,13 @@ static int cfg802154_netdev_notifier_call(struct notifier_block *nb,
switch (state) {
/* TODO NETDEV_DEVTYPE */
case NETDEV_REGISTER:
- dev->features |= NETIF_F_NETNS_LOCAL;
+ dev->netns_immutable = true;
wpan_dev->identifier = ++rdev->wpan_dev_id;
list_add_rcu(&wpan_dev->list, &rdev->wpan_dev_list);
rdev->devlist_generation++;
+ mutex_init(&wpan_dev->association_lock);
+ INIT_LIST_HEAD(&wpan_dev->children);
+ wpan_dev->max_associations = SZ_16K;
wpan_dev->netdev = dev;
break;
@@ -302,6 +313,8 @@ static int cfg802154_netdev_notifier_call(struct notifier_block *nb,
rdev->opencount++;
break;
case NETDEV_UNREGISTER:
+ cfg802154_free_peer_structures(wpan_dev);
+
/* It is possible to get NETDEV_UNREGISTER
* multiple times. To detect that, check
* that the interface is still on the list
diff --git a/net/ieee802154/header_ops.c b/net/ieee802154/header_ops.c
index c7439f0fbbdf..41a556be1017 100644
--- a/net/ieee802154/header_ops.c
+++ b/net/ieee802154/header_ops.c
@@ -1,15 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2014 Fraunhofer ITWM
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* Written by:
* Phoebe Buckheister <phoebe.buckheister@itwm.fraunhofer.de>
*/
@@ -128,6 +120,53 @@ ieee802154_hdr_push(struct sk_buff *skb, struct ieee802154_hdr *hdr)
}
EXPORT_SYMBOL_GPL(ieee802154_hdr_push);
+int ieee802154_mac_cmd_push(struct sk_buff *skb, void *f,
+ const void *pl, unsigned int pl_len)
+{
+ struct ieee802154_mac_cmd_frame *frame = f;
+ struct ieee802154_mac_cmd_pl *mac_pl = &frame->mac_pl;
+ struct ieee802154_hdr *mhr = &frame->mhr;
+ int ret;
+
+ skb_reserve(skb, sizeof(*mhr));
+ ret = ieee802154_hdr_push(skb, mhr);
+ if (ret < 0)
+ return ret;
+
+ skb_reset_mac_header(skb);
+ skb->mac_len = ret;
+
+ skb_put_data(skb, mac_pl, sizeof(*mac_pl));
+ skb_put_data(skb, pl, pl_len);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ieee802154_mac_cmd_push);
+
+int ieee802154_beacon_push(struct sk_buff *skb,
+ struct ieee802154_beacon_frame *beacon)
+{
+ struct ieee802154_beacon_hdr *mac_pl = &beacon->mac_pl;
+ struct ieee802154_hdr *mhr = &beacon->mhr;
+ int ret;
+
+ skb_reserve(skb, sizeof(*mhr));
+ ret = ieee802154_hdr_push(skb, mhr);
+ if (ret < 0)
+ return ret;
+
+ skb_reset_mac_header(skb);
+ skb->mac_len = ret;
+
+ skb_put_data(skb, mac_pl, sizeof(*mac_pl));
+
+ if (mac_pl->pend_short_addr_count || mac_pl->pend_ext_addr_count)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ieee802154_beacon_push);
+
static int
ieee802154_hdr_get_addr(const u8 *buf, int mode, bool omit_pan,
struct ieee802154_addr *addr)
@@ -268,6 +307,19 @@ ieee802154_hdr_pull(struct sk_buff *skb, struct ieee802154_hdr *hdr)
}
EXPORT_SYMBOL_GPL(ieee802154_hdr_pull);
+int ieee802154_mac_cmd_pl_pull(struct sk_buff *skb,
+ struct ieee802154_mac_cmd_pl *mac_pl)
+{
+ if (!pskb_may_pull(skb, sizeof(*mac_pl)))
+ return -EINVAL;
+
+ memcpy(mac_pl, skb->data, sizeof(*mac_pl));
+ skb_pull(skb, sizeof(*mac_pl));
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ieee802154_mac_cmd_pl_pull);
+
int
ieee802154_hdr_peek_addrs(const struct sk_buff *skb, struct ieee802154_hdr *hdr)
{
diff --git a/net/ieee802154/ieee802154.h b/net/ieee802154/ieee802154.h
index a5d7515b7f62..c5d91f78301a 100644
--- a/net/ieee802154/ieee802154.h
+++ b/net/ieee802154/ieee802154.h
@@ -1,15 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2007, 2008, 2009 Siemens AG
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
*/
#ifndef IEEE_802154_LOCAL_H
#define IEEE_802154_LOCAL_H
@@ -20,7 +11,6 @@ void ieee802154_nl_exit(void);
#define IEEE802154_OP(_cmd, _func) \
{ \
.cmd = _cmd, \
- .policy = ieee802154_policy, \
.doit = _func, \
.dumpit = NULL, \
.flags = GENL_ADMIN_PERM, \
@@ -29,7 +19,6 @@ void ieee802154_nl_exit(void);
#define IEEE802154_DUMP(_cmd, _func, _dump) \
{ \
.cmd = _cmd, \
- .policy = ieee802154_policy, \
.doit = _func, \
.dumpit = _dump, \
}
diff --git a/net/ieee802154/netlink.c b/net/ieee802154/netlink.c
index 96636e3b7aa9..7d2de4ee6992 100644
--- a/net/ieee802154/netlink.c
+++ b/net/ieee802154/netlink.c
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Netlink interface for IEEE 802.15.4 stack
*
* Copyright 2007, 2008 Siemens AG
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* Written by:
* Sergey Lapin <slapin@ossfans.org>
* Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
@@ -89,7 +81,7 @@ int ieee802154_nl_reply(struct sk_buff *msg, struct genl_info *info)
return genlmsg_reply(msg, info);
}
-static const struct genl_ops ieee802154_ops[] = {
+static const struct genl_small_ops ieee802154_ops[] = {
/* see nl-phy.c */
IEEE802154_DUMP(IEEE802154_LIST_PHY, ieee802154_list_phy,
ieee802154_dump_phy),
@@ -136,9 +128,11 @@ struct genl_family nl802154_family __ro_after_init = {
.name = IEEE802154_NL_NAME,
.version = 1,
.maxattr = IEEE802154_ATTR_MAX,
+ .policy = ieee802154_policy,
.module = THIS_MODULE,
- .ops = ieee802154_ops,
- .n_ops = ARRAY_SIZE(ieee802154_ops),
+ .small_ops = ieee802154_ops,
+ .n_small_ops = ARRAY_SIZE(ieee802154_ops),
+ .resv_start_op = IEEE802154_LLSEC_DEL_SECLEVEL + 1,
.mcgrps = ieee802154_mcgrps,
.n_mcgrps = ARRAY_SIZE(ieee802154_mcgrps),
};
diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c
index d3cbb3258718..74ef0a310afb 100644
--- a/net/ieee802154/nl-mac.c
+++ b/net/ieee802154/nl-mac.c
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Netlink interface for IEEE 802.15.4 stack
*
* Copyright 2007, 2008 Siemens AG
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* Written by:
* Sergey Lapin <slapin@ossfans.org>
* Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
@@ -157,7 +149,7 @@ static struct net_device *ieee802154_nl_get_dev(struct genl_info *info)
if (info->attrs[IEEE802154_ATTR_DEV_NAME]) {
char name[IFNAMSIZ + 1];
- nla_strlcpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME],
+ nla_strscpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME],
sizeof(name));
dev = dev_get_by_name(&init_net, name);
} else if (info->attrs[IEEE802154_ATTR_DEV_INDEX]) {
@@ -210,10 +202,7 @@ int ieee802154_associate_req(struct sk_buff *skb, struct genl_info *info)
addr.pan_id = nla_get_shortaddr(
info->attrs[IEEE802154_ATTR_COORD_PAN_ID]);
- if (info->attrs[IEEE802154_ATTR_PAGE])
- page = nla_get_u8(info->attrs[IEEE802154_ATTR_PAGE]);
- else
- page = 0;
+ page = nla_get_u8_default(info->attrs[IEEE802154_ATTR_PAGE], 0);
ret = ieee802154_mlme_ops(dev)->assoc_req(dev, &addr,
nla_get_u8(info->attrs[IEEE802154_ATTR_CHANNEL]),
@@ -346,10 +335,7 @@ int ieee802154_start_req(struct sk_buff *skb, struct genl_info *info)
blx = nla_get_u8(info->attrs[IEEE802154_ATTR_BAT_EXT]);
coord_realign = nla_get_u8(info->attrs[IEEE802154_ATTR_COORD_REALIGN]);
- if (info->attrs[IEEE802154_ATTR_PAGE])
- page = nla_get_u8(info->attrs[IEEE802154_ATTR_PAGE]);
- else
- page = 0;
+ page = nla_get_u8_default(info->attrs[IEEE802154_ATTR_PAGE], 0);
if (addr.short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) {
ieee802154_nl_start_confirm(dev, IEEE802154_NO_SHORT_ADDRESS);
@@ -396,10 +382,7 @@ int ieee802154_scan_req(struct sk_buff *skb, struct genl_info *info)
channels = nla_get_u32(info->attrs[IEEE802154_ATTR_CHANNELS]);
duration = nla_get_u8(info->attrs[IEEE802154_ATTR_DURATION]);
- if (info->attrs[IEEE802154_ATTR_PAGE])
- page = nla_get_u8(info->attrs[IEEE802154_ATTR_PAGE]);
- else
- page = 0;
+ page = nla_get_u8_default(info->attrs[IEEE802154_ATTR_PAGE], 0);
ret = ieee802154_mlme_ops(dev)->scan_req(dev, type, channels,
page, duration);
@@ -559,9 +542,7 @@ ieee802154_llsec_parse_key_id(struct genl_info *info,
desc->mode = nla_get_u8(info->attrs[IEEE802154_ATTR_LLSEC_KEY_MODE]);
if (desc->mode == IEEE802154_SCF_KEY_IMPLICIT) {
- if (!info->attrs[IEEE802154_ATTR_PAN_ID] &&
- !(info->attrs[IEEE802154_ATTR_SHORT_ADDR] ||
- info->attrs[IEEE802154_ATTR_HW_ADDR]))
+ if (!info->attrs[IEEE802154_ATTR_PAN_ID])
return -EINVAL;
desc->device_addr.pan_id = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_PAN_ID]);
@@ -570,6 +551,9 @@ ieee802154_llsec_parse_key_id(struct genl_info *info,
desc->device_addr.mode = IEEE802154_ADDR_SHORT;
desc->device_addr.short_addr = nla_get_shortaddr(info->attrs[IEEE802154_ATTR_SHORT_ADDR]);
} else {
+ if (!info->attrs[IEEE802154_ATTR_HW_ADDR])
+ return -EINVAL;
+
desc->device_addr.mode = IEEE802154_ADDR_LONG;
desc->device_addr.extended_addr = nla_get_hwaddr(info->attrs[IEEE802154_ATTR_HW_ADDR]);
}
@@ -687,8 +671,10 @@ int ieee802154_llsec_getparams(struct sk_buff *skb, struct genl_info *info)
nla_put_u8(msg, IEEE802154_ATTR_LLSEC_SECLEVEL, params.out_level) ||
nla_put_u32(msg, IEEE802154_ATTR_LLSEC_FRAME_COUNTER,
be32_to_cpu(params.frame_counter)) ||
- ieee802154_llsec_fill_key_id(msg, &params.out_key))
+ ieee802154_llsec_fill_key_id(msg, &params.out_key)) {
+ rc = -ENOBUFS;
goto out_free;
+ }
dev_put(dev);
@@ -1191,7 +1177,7 @@ static int llsec_iter_devkeys(struct llsec_dump_data *data)
{
struct ieee802154_llsec_device *dpos;
struct ieee802154_llsec_device_key *kpos;
- int rc = 0, idx = 0, idx2;
+ int idx = 0, idx2;
list_for_each_entry(dpos, &data->table->devices, list) {
if (idx++ < data->s_idx)
@@ -1207,7 +1193,7 @@ static int llsec_iter_devkeys(struct llsec_dump_data *data)
data->nlmsg_seq,
dpos->hwaddr, kpos,
data->dev)) {
- return rc = -EMSGSIZE;
+ return -EMSGSIZE;
}
data->s_idx2++;
@@ -1216,7 +1202,7 @@ static int llsec_iter_devkeys(struct llsec_dump_data *data)
data->s_idx++;
}
- return rc;
+ return 0;
}
int ieee802154_llsec_dump_devkeys(struct sk_buff *skb,
diff --git a/net/ieee802154/nl-phy.c b/net/ieee802154/nl-phy.c
index b231e40f006a..4c07a475c567 100644
--- a/net/ieee802154/nl-phy.c
+++ b/net/ieee802154/nl-phy.c
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Netlink interface for IEEE 802.15.4 stack
*
* Copyright 2007, 2008 Siemens AG
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* Written by:
* Sergey Lapin <slapin@ossfans.org>
* Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
@@ -38,7 +30,7 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid,
{
void *hdr;
int i, pages = 0;
- uint32_t *buf = kcalloc(32, sizeof(uint32_t), GFP_KERNEL);
+ u32 *buf = kcalloc(IEEE802154_MAX_PAGE + 1, sizeof(u32), GFP_KERNEL);
pr_debug("%s\n", __func__);
@@ -55,7 +47,7 @@ static int ieee802154_nl_fill_phy(struct sk_buff *msg, u32 portid,
nla_put_u8(msg, IEEE802154_ATTR_PAGE, phy->current_page) ||
nla_put_u8(msg, IEEE802154_ATTR_CHANNEL, phy->current_channel))
goto nla_put_failure;
- for (i = 0; i < 32; i++) {
+ for (i = 0; i <= IEEE802154_MAX_PAGE; i++) {
if (phy->supported.channels[i])
buf[pages++] = phy->supported.channels[i] | (i << 27);
}
@@ -232,25 +224,27 @@ int ieee802154_add_iface(struct sk_buff *skb, struct genl_info *info)
dev_hold(dev);
if (info->attrs[IEEE802154_ATTR_HW_ADDR]) {
- struct sockaddr addr;
+ struct sockaddr_storage addr;
- addr.sa_family = ARPHRD_IEEE802154;
- nla_memcpy(&addr.sa_data, info->attrs[IEEE802154_ATTR_HW_ADDR],
+ addr.ss_family = ARPHRD_IEEE802154;
+ nla_memcpy(&addr.__data, info->attrs[IEEE802154_ATTR_HW_ADDR],
IEEE802154_ADDR_LEN);
/* strangely enough, some callbacks (inetdev_event) from
* dev_set_mac_address require RTNL_LOCK
*/
rtnl_lock();
- rc = dev_set_mac_address(dev, &addr);
+ rc = dev_set_mac_address(dev, &addr, NULL);
rtnl_unlock();
if (rc)
goto dev_unregister;
}
if (nla_put_string(msg, IEEE802154_ATTR_PHY_NAME, wpan_phy_name(phy)) ||
- nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name))
+ nla_put_string(msg, IEEE802154_ATTR_DEV_NAME, dev->name)) {
+ rc = -EMSGSIZE;
goto nla_put_failure;
+ }
dev_put(dev);
wpan_phy_put(phy);
@@ -346,8 +340,7 @@ nla_put_failure:
out_dev:
wpan_phy_put(phy);
out:
- if (dev)
- dev_put(dev);
+ dev_put(dev);
return rc;
}
diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c
index 99f6c254ea77..5a024ca60d35 100644
--- a/net/ieee802154/nl802154.c
+++ b/net/ieee802154/nl802154.c
@@ -1,11 +1,5 @@
-/* This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+// SPDX-License-Identifier: GPL-2.0-only
+/*
*
* Authors:
* Alexander Aring <aar@pengutronix.de>
@@ -32,10 +26,12 @@ static struct genl_family nl802154_fam;
/* multicast groups */
enum nl802154_multicast_groups {
NL802154_MCGRP_CONFIG,
+ NL802154_MCGRP_SCAN,
};
static const struct genl_multicast_group nl802154_mcgrps[] = {
[NL802154_MCGRP_CONFIG] = { .name = "config", },
+ [NL802154_MCGRP_SCAN] = { .name = "scan", },
};
/* returns ERR_PTR values */
@@ -191,8 +187,8 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = {
[NL802154_ATTR_WPAN_DEV] = { .type = NLA_U64 },
- [NL802154_ATTR_PAGE] = { .type = NLA_U8, },
- [NL802154_ATTR_CHANNEL] = { .type = NLA_U8, },
+ [NL802154_ATTR_PAGE] = NLA_POLICY_MAX(NLA_U8, IEEE802154_MAX_PAGE),
+ [NL802154_ATTR_CHANNEL] = NLA_POLICY_MAX(NLA_U8, IEEE802154_MAX_CHANNEL),
[NL802154_ATTR_TX_POWER] = { .type = NLA_S32, },
@@ -222,6 +218,25 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = {
[NL802154_ATTR_PID] = { .type = NLA_U32 },
[NL802154_ATTR_NETNS_FD] = { .type = NLA_U32 },
+
+ [NL802154_ATTR_COORDINATOR] = { .type = NLA_NESTED },
+
+ [NL802154_ATTR_SCAN_TYPE] =
+ NLA_POLICY_RANGE(NLA_U8, NL802154_SCAN_ED, NL802154_SCAN_RIT_PASSIVE),
+ [NL802154_ATTR_SCAN_CHANNELS] =
+ NLA_POLICY_MASK(NLA_U32, GENMASK(IEEE802154_MAX_CHANNEL, 0)),
+ [NL802154_ATTR_SCAN_PREAMBLE_CODES] = { .type = NLA_REJECT },
+ [NL802154_ATTR_SCAN_MEAN_PRF] = { .type = NLA_REJECT },
+ [NL802154_ATTR_SCAN_DURATION] =
+ NLA_POLICY_MAX(NLA_U8, IEEE802154_MAX_SCAN_DURATION),
+ [NL802154_ATTR_SCAN_DONE_REASON] =
+ NLA_POLICY_RANGE(NLA_U8, NL802154_SCAN_DONE_REASON_FINISHED,
+ NL802154_SCAN_DONE_REASON_ABORTED),
+ [NL802154_ATTR_BEACON_INTERVAL] =
+ NLA_POLICY_MAX(NLA_U8, IEEE802154_ACTIVE_SCAN_DURATION),
+ [NL802154_ATTR_MAX_ASSOCIATIONS] = { .type = NLA_U32 },
+ [NL802154_ATTR_PEER] = { .type = NLA_NESTED },
+
#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
[NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, },
[NL802154_ATTR_SEC_OUT_LEVEL] = { .type = NLA_U32, },
@@ -235,26 +250,20 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = {
#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
};
-#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
static int
nl802154_prepare_wpan_dev_dump(struct sk_buff *skb,
struct netlink_callback *cb,
struct cfg802154_registered_device **rdev,
struct wpan_dev **wpan_dev)
{
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
int err;
rtnl_lock();
if (!cb->args[0]) {
- err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize,
- genl_family_attrbuf(&nl802154_fam),
- nl802154_fam.maxattr, nl802154_policy, NULL);
- if (err)
- goto out_unlock;
-
*wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk),
- genl_family_attrbuf(&nl802154_fam));
+ info->info.attrs);
if (IS_ERR(*wpan_dev)) {
err = PTR_ERR(*wpan_dev);
goto out_unlock;
@@ -299,7 +308,6 @@ nl802154_finish_wpan_dev_dump(struct cfg802154_registered_device *rdev)
{
rtnl_unlock();
}
-#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
/* message building helper */
static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
@@ -312,7 +320,7 @@ static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq,
static int
nl802154_put_flags(struct sk_buff *msg, int attr, u32 mask)
{
- struct nlattr *nl_flags = nla_nest_start(msg, attr);
+ struct nlattr *nl_flags = nla_nest_start_noflag(msg, attr);
int i;
if (!nl_flags)
@@ -338,7 +346,7 @@ nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev,
struct nlattr *nl_page;
unsigned long page;
- nl_page = nla_nest_start(msg, NL802154_ATTR_CHANNELS_SUPPORTED);
+ nl_page = nla_nest_start_noflag(msg, NL802154_ATTR_CHANNELS_SUPPORTED);
if (!nl_page)
return -ENOBUFS;
@@ -360,11 +368,11 @@ nl802154_put_capabilities(struct sk_buff *msg,
struct nlattr *nl_caps, *nl_channels;
int i;
- nl_caps = nla_nest_start(msg, NL802154_ATTR_WPAN_PHY_CAPS);
+ nl_caps = nla_nest_start_noflag(msg, NL802154_ATTR_WPAN_PHY_CAPS);
if (!nl_caps)
return -ENOBUFS;
- nl_channels = nla_nest_start(msg, NL802154_CAP_ATTR_CHANNELS);
+ nl_channels = nla_nest_start_noflag(msg, NL802154_CAP_ATTR_CHANNELS);
if (!nl_channels)
return -ENOBUFS;
@@ -380,8 +388,8 @@ nl802154_put_capabilities(struct sk_buff *msg,
if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) {
struct nlattr *nl_ed_lvls;
- nl_ed_lvls = nla_nest_start(msg,
- NL802154_CAP_ATTR_CCA_ED_LEVELS);
+ nl_ed_lvls = nla_nest_start_noflag(msg,
+ NL802154_CAP_ATTR_CCA_ED_LEVELS);
if (!nl_ed_lvls)
return -ENOBUFS;
@@ -396,7 +404,8 @@ nl802154_put_capabilities(struct sk_buff *msg,
if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) {
struct nlattr *nl_tx_pwrs;
- nl_tx_pwrs = nla_nest_start(msg, NL802154_CAP_ATTR_TX_POWERS);
+ nl_tx_pwrs = nla_nest_start_noflag(msg,
+ NL802154_CAP_ATTR_TX_POWERS);
if (!nl_tx_pwrs)
return -ENOBUFS;
@@ -504,7 +513,7 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev,
if (nl802154_put_capabilities(msg, rdev))
goto nla_put_failure;
- nl_cmds = nla_nest_start(msg, NL802154_ATTR_SUPPORTED_COMMANDS);
+ nl_cmds = nla_nest_start_noflag(msg, NL802154_ATTR_SUPPORTED_COMMANDS);
if (!nl_cmds)
goto nla_put_failure;
@@ -560,15 +569,8 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb,
struct netlink_callback *cb,
struct nl802154_dump_wpan_phy_state *state)
{
- struct nlattr **tb = genl_family_attrbuf(&nl802154_fam);
- int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize, tb,
- nl802154_fam.maxattr, nl802154_policy, NULL);
-
- /* TODO check if we can handle error here,
- * we have no backward compatibility
- */
- if (ret)
- return 0;
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ struct nlattr **tb = info->info.attrs;
if (tb[NL802154_ATTR_WPAN_PHY])
state->filter_wpan_phy = nla_get_u32(tb[NL802154_ATTR_WPAN_PHY]);
@@ -693,7 +695,8 @@ ieee802154_llsec_send_key_id(struct sk_buff *msg,
switch (desc->mode) {
case NL802154_KEY_ID_MODE_IMPLICIT:
- nl_dev_addr = nla_nest_start(msg, NL802154_KEY_ID_ATTR_IMPLICIT);
+ nl_dev_addr = nla_nest_start_noflag(msg,
+ NL802154_KEY_ID_ATTR_IMPLICIT);
if (!nl_dev_addr)
return -ENOBUFS;
@@ -768,7 +771,7 @@ static int nl802154_get_llsec_params(struct sk_buff *msg,
params.frame_counter))
return -ENOBUFS;
- nl_key_id = nla_nest_start(msg, NL802154_ATTR_SEC_OUT_KEY_ID);
+ nl_key_id = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_OUT_KEY_ID);
if (!nl_key_id)
return -ENOBUFS;
@@ -836,8 +839,13 @@ nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags,
goto nla_put_failure;
#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ goto out;
+
if (nl802154_get_llsec_params(msg, rdev, wpan_dev) < 0)
goto nla_put_failure;
+
+out:
#endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */
genlmsg_end(msg, hdr);
@@ -975,8 +983,7 @@ static int nl802154_set_channel(struct sk_buff *skb, struct genl_info *info)
channel = nla_get_u8(info->attrs[NL802154_ATTR_CHANNEL]);
/* check 802.15.4 constraints */
- if (page > IEEE802154_MAX_PAGE || channel > IEEE802154_MAX_CHANNEL ||
- !(rdev->wpan_phy.supported.channels[page] & BIT(channel)))
+ if (!ieee802154_chan_is_valid(&rdev->wpan_phy, page, channel))
return -EINVAL;
return rdev_set_channel(rdev, page, channel);
@@ -1080,15 +1087,14 @@ static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info)
pan_id = nla_get_le16(info->attrs[NL802154_ATTR_PAN_ID]);
- /* TODO
- * I am not sure about to check here on broadcast pan_id.
- * Broadcast is a valid setting, comment from 802.15.4:
- * If this value is 0xffff, the device is not associated.
- *
- * This could useful to simple deassociate an device.
+ /* Only allow changing the PAN ID when the device has no more
+ * associations ongoing to avoid confusing peers.
*/
- if (pan_id == cpu_to_le16(IEEE802154_PAN_ID_BROADCAST))
+ if (cfg802154_device_is_associated(wpan_dev)) {
+ NL_SET_ERR_MSG(info->extack,
+ "Existing associations, changing PAN ID forbidden");
return -EINVAL;
+ }
return rdev_set_pan_id(rdev, wpan_dev, pan_id);
}
@@ -1116,20 +1122,17 @@ static int nl802154_set_short_addr(struct sk_buff *skb, struct genl_info *info)
short_addr = nla_get_le16(info->attrs[NL802154_ATTR_SHORT_ADDR]);
- /* TODO
- * I am not sure about to check here on broadcast short_addr.
- * Broadcast is a valid setting, comment from 802.15.4:
- * A value of 0xfffe indicates that the device has
- * associated but has not been allocated an address. A
- * value of 0xffff indicates that the device does not
- * have a short address.
- *
- * I think we should allow to set these settings but
- * don't allow to allow socket communication with it.
+ /* The short address only has a meaning when part of a PAN, after a
+ * proper association procedure. However, we want to still offer the
+ * possibility to create static networks so changing the short address
+ * is only allowed when not already associated to other devices with
+ * the official handshake.
*/
- if (short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC) ||
- short_addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST))
+ if (cfg802154_device_is_associated(wpan_dev)) {
+ NL_SET_ERR_MSG(info->extack,
+ "Existing associations, changing short address forbidden");
return -EINVAL;
+ }
return rdev_set_short_addr(rdev, wpan_dev, short_addr);
}
@@ -1292,6 +1295,522 @@ static int nl802154_wpan_phy_netns(struct sk_buff *skb, struct genl_info *info)
return err;
}
+static int nl802154_prep_scan_event_msg(struct sk_buff *msg,
+ struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ u32 portid, u32 seq, int flags, u8 cmd,
+ struct ieee802154_coord_desc *desc)
+{
+ struct nlattr *nla;
+ void *hdr;
+
+ hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx))
+ goto nla_put_failure;
+
+ if (wpan_dev->netdev &&
+ nla_put_u32(msg, NL802154_ATTR_IFINDEX, wpan_dev->netdev->ifindex))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV,
+ wpan_dev_id(wpan_dev), NL802154_ATTR_PAD))
+ goto nla_put_failure;
+
+ nla = nla_nest_start_noflag(msg, NL802154_ATTR_COORDINATOR);
+ if (!nla)
+ goto nla_put_failure;
+
+ if (nla_put(msg, NL802154_COORD_PANID, IEEE802154_PAN_ID_LEN,
+ &desc->addr.pan_id))
+ goto nla_put_failure;
+
+ if (desc->addr.mode == IEEE802154_ADDR_SHORT) {
+ if (nla_put(msg, NL802154_COORD_ADDR,
+ IEEE802154_SHORT_ADDR_LEN,
+ &desc->addr.short_addr))
+ goto nla_put_failure;
+ } else {
+ if (nla_put(msg, NL802154_COORD_ADDR,
+ IEEE802154_EXTENDED_ADDR_LEN,
+ &desc->addr.extended_addr))
+ goto nla_put_failure;
+ }
+
+ if (nla_put_u8(msg, NL802154_COORD_CHANNEL, desc->channel))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, NL802154_COORD_PAGE, desc->page))
+ goto nla_put_failure;
+
+ if (nla_put_u16(msg, NL802154_COORD_SUPERFRAME_SPEC,
+ desc->superframe_spec))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, NL802154_COORD_LINK_QUALITY, desc->link_quality))
+ goto nla_put_failure;
+
+ if (desc->gts_permit && nla_put_flag(msg, NL802154_COORD_GTS_PERMIT))
+ goto nla_put_failure;
+
+ /* TODO: NL802154_COORD_PAYLOAD_DATA if any */
+
+ nla_nest_end(msg, nla);
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+
+ return -EMSGSIZE;
+}
+
+int nl802154_scan_event(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ struct ieee802154_coord_desc *desc)
+{
+ struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(wpan_phy);
+ struct sk_buff *msg;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = nl802154_prep_scan_event_msg(msg, rdev, wpan_dev, 0, 0, 0,
+ NL802154_CMD_SCAN_EVENT,
+ desc);
+ if (ret < 0) {
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ return genlmsg_multicast_netns(&nl802154_fam, wpan_phy_net(wpan_phy),
+ msg, 0, NL802154_MCGRP_SCAN, GFP_ATOMIC);
+}
+EXPORT_SYMBOL_GPL(nl802154_scan_event);
+
+static int nl802154_trigger_scan(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct wpan_phy *wpan_phy = &rdev->wpan_phy;
+ struct cfg802154_scan_request *request;
+ u8 type;
+ int err;
+
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) {
+ NL_SET_ERR_MSG(info->extack, "Monitors are not allowed to perform scans");
+ return -EOPNOTSUPP;
+ }
+
+ if (!info->attrs[NL802154_ATTR_SCAN_TYPE]) {
+ NL_SET_ERR_MSG(info->extack, "Malformed request, missing scan type");
+ return -EINVAL;
+ }
+
+ if (wpan_phy->flags & WPAN_PHY_FLAG_DATAGRAMS_ONLY) {
+ NL_SET_ERR_MSG(info->extack, "PHY only supports datagrams");
+ return -EOPNOTSUPP;
+ }
+
+ request = kzalloc(sizeof(*request), GFP_KERNEL);
+ if (!request)
+ return -ENOMEM;
+
+ request->wpan_dev = wpan_dev;
+ request->wpan_phy = wpan_phy;
+
+ type = nla_get_u8(info->attrs[NL802154_ATTR_SCAN_TYPE]);
+ switch (type) {
+ case NL802154_SCAN_ACTIVE:
+ case NL802154_SCAN_PASSIVE:
+ request->type = type;
+ break;
+ default:
+ NL_SET_ERR_MSG_FMT(info->extack, "Unsupported scan type: %d", type);
+ err = -EINVAL;
+ goto free_request;
+ }
+
+ /* Use current page by default */
+ request->page = nla_get_u8_default(info->attrs[NL802154_ATTR_PAGE],
+ wpan_phy->current_page);
+
+ /* Scan all supported channels by default */
+ request->channels =
+ nla_get_u32_default(info->attrs[NL802154_ATTR_SCAN_CHANNELS],
+ wpan_phy->supported.channels[request->page]);
+
+ /* Use maximum duration order by default */
+ request->duration =
+ nla_get_u8_default(info->attrs[NL802154_ATTR_SCAN_DURATION],
+ IEEE802154_MAX_SCAN_DURATION);
+
+ err = rdev_trigger_scan(rdev, request);
+ if (err) {
+ pr_err("Failure starting scanning (%d)\n", err);
+ goto free_request;
+ }
+
+ return 0;
+
+free_request:
+ kfree(request);
+
+ return err;
+}
+
+static int nl802154_prep_scan_msg(struct sk_buff *msg,
+ struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, u32 portid,
+ u32 seq, int flags, u8 cmd, u8 arg)
+{
+ void *hdr;
+
+ hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
+ if (!hdr)
+ return -ENOBUFS;
+
+ if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx))
+ goto nla_put_failure;
+
+ if (wpan_dev->netdev &&
+ nla_put_u32(msg, NL802154_ATTR_IFINDEX, wpan_dev->netdev->ifindex))
+ goto nla_put_failure;
+
+ if (nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV,
+ wpan_dev_id(wpan_dev), NL802154_ATTR_PAD))
+ goto nla_put_failure;
+
+ if (cmd == NL802154_CMD_SCAN_DONE &&
+ nla_put_u8(msg, NL802154_ATTR_SCAN_DONE_REASON, arg))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+
+ return -EMSGSIZE;
+}
+
+static int nl802154_send_scan_msg(struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev, u8 cmd, u8 arg)
+{
+ struct sk_buff *msg;
+ int ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ ret = nl802154_prep_scan_msg(msg, rdev, wpan_dev, 0, 0, 0, cmd, arg);
+ if (ret < 0) {
+ nlmsg_free(msg);
+ return ret;
+ }
+
+ return genlmsg_multicast_netns(&nl802154_fam,
+ wpan_phy_net(&rdev->wpan_phy), msg, 0,
+ NL802154_MCGRP_SCAN, GFP_KERNEL);
+}
+
+int nl802154_scan_started(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev)
+{
+ struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(wpan_phy);
+ int err;
+
+ /* Ignore errors when there are no listeners */
+ err = nl802154_send_scan_msg(rdev, wpan_dev, NL802154_CMD_TRIGGER_SCAN, 0);
+ if (err == -ESRCH)
+ err = 0;
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(nl802154_scan_started);
+
+int nl802154_scan_done(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ enum nl802154_scan_done_reasons reason)
+{
+ struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(wpan_phy);
+ int err;
+
+ /* Ignore errors when there are no listeners */
+ err = nl802154_send_scan_msg(rdev, wpan_dev, NL802154_CMD_SCAN_DONE, reason);
+ if (err == -ESRCH)
+ err = 0;
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(nl802154_scan_done);
+
+static int nl802154_abort_scan(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+
+ /* Resources are released in the notification helper above */
+ return rdev_abort_scan(rdev, wpan_dev);
+}
+
+static int
+nl802154_send_beacons(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct wpan_phy *wpan_phy = &rdev->wpan_phy;
+ struct cfg802154_beacon_request *request;
+ int err;
+
+ if (wpan_dev->iftype != NL802154_IFTYPE_COORD) {
+ NL_SET_ERR_MSG(info->extack, "Only coordinators can send beacons");
+ return -EOPNOTSUPP;
+ }
+
+ if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST)) {
+ NL_SET_ERR_MSG(info->extack, "Device is not part of any PAN");
+ return -EPERM;
+ }
+
+ if (wpan_phy->flags & WPAN_PHY_FLAG_DATAGRAMS_ONLY) {
+ NL_SET_ERR_MSG(info->extack, "PHY only supports datagrams");
+ return -EOPNOTSUPP;
+ }
+
+ request = kzalloc(sizeof(*request), GFP_KERNEL);
+ if (!request)
+ return -ENOMEM;
+
+ request->wpan_dev = wpan_dev;
+ request->wpan_phy = wpan_phy;
+
+ /* Use maximum duration order by default */
+ request->interval = nla_get_u8_default(info->attrs[NL802154_ATTR_BEACON_INTERVAL],
+ IEEE802154_MAX_SCAN_DURATION);
+
+ err = rdev_send_beacons(rdev, request);
+ if (err) {
+ pr_err("Failure starting sending beacons (%d)\n", err);
+ goto free_request;
+ }
+
+ return 0;
+
+free_request:
+ kfree(request);
+
+ return err;
+}
+
+void nl802154_beaconing_done(struct wpan_dev *wpan_dev)
+{
+ /* NOP */
+}
+EXPORT_SYMBOL_GPL(nl802154_beaconing_done);
+
+static int
+nl802154_stop_beacons(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+
+ /* Resources are released in the notification helper above */
+ return rdev_stop_beacons(rdev, wpan_dev);
+}
+
+static int nl802154_associate(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev;
+ struct wpan_phy *wpan_phy;
+ struct ieee802154_addr coord;
+ int err;
+
+ wpan_dev = dev->ieee802154_ptr;
+ wpan_phy = &rdev->wpan_phy;
+
+ if (wpan_phy->flags & WPAN_PHY_FLAG_DATAGRAMS_ONLY) {
+ NL_SET_ERR_MSG(info->extack, "PHY only supports datagrams");
+ return -EOPNOTSUPP;
+ }
+
+ if (!info->attrs[NL802154_ATTR_PAN_ID] ||
+ !info->attrs[NL802154_ATTR_EXTENDED_ADDR])
+ return -EINVAL;
+
+ coord.pan_id = nla_get_le16(info->attrs[NL802154_ATTR_PAN_ID]);
+ coord.mode = IEEE802154_ADDR_LONG;
+ coord.extended_addr = nla_get_le64(info->attrs[NL802154_ATTR_EXTENDED_ADDR]);
+
+ mutex_lock(&wpan_dev->association_lock);
+ err = rdev_associate(rdev, wpan_dev, &coord);
+ mutex_unlock(&wpan_dev->association_lock);
+ if (err)
+ pr_err("Association with PAN ID 0x%x failed (%d)\n",
+ le16_to_cpu(coord.pan_id), err);
+
+ return err;
+}
+
+static int nl802154_disassociate(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg802154_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ struct wpan_phy *wpan_phy = &rdev->wpan_phy;
+ struct ieee802154_addr target;
+
+ if (wpan_phy->flags & WPAN_PHY_FLAG_DATAGRAMS_ONLY) {
+ NL_SET_ERR_MSG(info->extack, "PHY only supports datagrams");
+ return -EOPNOTSUPP;
+ }
+
+ target.pan_id = wpan_dev->pan_id;
+
+ if (info->attrs[NL802154_ATTR_EXTENDED_ADDR]) {
+ target.mode = IEEE802154_ADDR_LONG;
+ target.extended_addr = nla_get_le64(info->attrs[NL802154_ATTR_EXTENDED_ADDR]);
+ } else if (info->attrs[NL802154_ATTR_SHORT_ADDR]) {
+ target.mode = IEEE802154_ADDR_SHORT;
+ target.short_addr = nla_get_le16(info->attrs[NL802154_ATTR_SHORT_ADDR]);
+ } else {
+ NL_SET_ERR_MSG(info->extack, "Device address is missing");
+ return -EINVAL;
+ }
+
+ mutex_lock(&wpan_dev->association_lock);
+ rdev_disassociate(rdev, wpan_dev, &target);
+ mutex_unlock(&wpan_dev->association_lock);
+
+ return 0;
+}
+
+static int nl802154_set_max_associations(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net_device *dev = info->user_ptr[1];
+ struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
+ unsigned int max_assoc;
+
+ if (!info->attrs[NL802154_ATTR_MAX_ASSOCIATIONS]) {
+ NL_SET_ERR_MSG(info->extack, "No maximum number of association given");
+ return -EINVAL;
+ }
+
+ max_assoc = nla_get_u32(info->attrs[NL802154_ATTR_MAX_ASSOCIATIONS]);
+
+ mutex_lock(&wpan_dev->association_lock);
+ cfg802154_set_max_associations(wpan_dev, max_assoc);
+ mutex_unlock(&wpan_dev->association_lock);
+
+ return 0;
+}
+
+static int nl802154_send_peer_info(struct sk_buff *msg,
+ struct netlink_callback *cb,
+ u32 seq, int flags,
+ struct cfg802154_registered_device *rdev,
+ struct wpan_dev *wpan_dev,
+ struct ieee802154_pan_device *peer,
+ enum nl802154_peer_type type)
+{
+ struct nlattr *nla;
+ void *hdr;
+
+ ASSERT_RTNL();
+
+ hdr = nl802154hdr_put(msg, NETLINK_CB(cb->skb).portid, seq, flags,
+ NL802154_CMD_LIST_ASSOCIATIONS);
+ if (!hdr)
+ return -ENOBUFS;
+
+ genl_dump_check_consistent(cb, hdr);
+
+ nla = nla_nest_start_noflag(msg, NL802154_ATTR_PEER);
+ if (!nla)
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, NL802154_DEV_ADDR_ATTR_PEER_TYPE, type))
+ goto nla_put_failure;
+
+ if (nla_put_u8(msg, NL802154_DEV_ADDR_ATTR_MODE, peer->mode))
+ goto nla_put_failure;
+
+ if (nla_put(msg, NL802154_DEV_ADDR_ATTR_SHORT,
+ IEEE802154_SHORT_ADDR_LEN, &peer->short_addr))
+ goto nla_put_failure;
+
+ if (nla_put(msg, NL802154_DEV_ADDR_ATTR_EXTENDED,
+ IEEE802154_EXTENDED_ADDR_LEN, &peer->extended_addr))
+ goto nla_put_failure;
+
+ nla_nest_end(msg, nla);
+
+ genlmsg_end(msg, hdr);
+
+ return 0;
+
+ nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static int nl802154_list_associations(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct cfg802154_registered_device *rdev;
+ struct ieee802154_pan_device *child;
+ struct wpan_dev *wpan_dev;
+ int err;
+
+ err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev);
+ if (err)
+ return err;
+
+ mutex_lock(&wpan_dev->association_lock);
+
+ if (cb->args[2])
+ goto out;
+
+ if (wpan_dev->parent) {
+ err = nl802154_send_peer_info(skb, cb, cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, rdev, wpan_dev,
+ wpan_dev->parent,
+ NL802154_PEER_TYPE_PARENT);
+ if (err < 0)
+ goto out_err;
+ }
+
+ list_for_each_entry(child, &wpan_dev->children, node) {
+ err = nl802154_send_peer_info(skb, cb, cb->nlh->nlmsg_seq,
+ NLM_F_MULTI, rdev, wpan_dev,
+ child,
+ NL802154_PEER_TYPE_CHILD);
+ if (err < 0)
+ goto out_err;
+ }
+
+ cb->args[2] = 1;
+out:
+ err = skb->len;
+out_err:
+ mutex_unlock(&wpan_dev->association_lock);
+
+ nl802154_finish_wpan_dev_dump(rdev);
+
+ return err;
+}
+
#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = {
[NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 },
@@ -1306,23 +1825,23 @@ ieee802154_llsec_parse_dev_addr(struct nlattr *nla,
{
struct nlattr *attrs[NL802154_DEV_ADDR_ATTR_MAX + 1];
- if (!nla || nla_parse_nested(attrs, NL802154_DEV_ADDR_ATTR_MAX, nla,
- nl802154_dev_addr_policy, NULL))
+ if (!nla || nla_parse_nested_deprecated(attrs, NL802154_DEV_ADDR_ATTR_MAX, nla, nl802154_dev_addr_policy, NULL))
return -EINVAL;
- if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] ||
- !attrs[NL802154_DEV_ADDR_ATTR_MODE] ||
- !(attrs[NL802154_DEV_ADDR_ATTR_SHORT] ||
- attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]))
+ if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] || !attrs[NL802154_DEV_ADDR_ATTR_MODE])
return -EINVAL;
addr->pan_id = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_PAN_ID]);
addr->mode = nla_get_u32(attrs[NL802154_DEV_ADDR_ATTR_MODE]);
switch (addr->mode) {
case NL802154_DEV_ADDR_SHORT:
+ if (!attrs[NL802154_DEV_ADDR_ATTR_SHORT])
+ return -EINVAL;
addr->short_addr = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_SHORT]);
break;
case NL802154_DEV_ADDR_EXTENDED:
+ if (!attrs[NL802154_DEV_ADDR_ATTR_EXTENDED])
+ return -EINVAL;
addr->extended_addr = nla_get_le64(attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]);
break;
default:
@@ -1346,8 +1865,7 @@ ieee802154_llsec_parse_key_id(struct nlattr *nla,
{
struct nlattr *attrs[NL802154_KEY_ID_ATTR_MAX + 1];
- if (!nla || nla_parse_nested(attrs, NL802154_KEY_ID_ATTR_MAX, nla,
- nl802154_key_id_policy, NULL))
+ if (!nla || nla_parse_nested_deprecated(attrs, NL802154_KEY_ID_ATTR_MAX, nla, nl802154_key_id_policy, NULL))
return -EINVAL;
if (!attrs[NL802154_KEY_ID_ATTR_MODE])
@@ -1402,6 +1920,9 @@ static int nl802154_set_llsec_params(struct sk_buff *skb,
u32 changed = 0;
int ret;
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
if (info->attrs[NL802154_ATTR_SEC_ENABLED]) {
u8 enabled;
@@ -1450,16 +1971,16 @@ static int nl802154_send_key(struct sk_buff *msg, u32 cmd, u32 portid,
hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
if (!hdr)
- return -1;
+ return -ENOBUFS;
if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex))
goto nla_put_failure;
- nl_key = nla_nest_start(msg, NL802154_ATTR_SEC_KEY);
+ nl_key = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_KEY);
if (!nl_key)
goto nla_put_failure;
- nl_key_id = nla_nest_start(msg, NL802154_KEY_ATTR_ID);
+ nl_key_id = nla_nest_start_noflag(msg, NL802154_KEY_ATTR_ID);
if (!nl_key_id)
goto nla_put_failure;
@@ -1508,6 +2029,11 @@ nl802154_dump_llsec_key(struct sk_buff *skb, struct netlink_callback *cb)
if (err)
return err;
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) {
+ err = skb->len;
+ goto out_err;
+ }
+
if (!wpan_dev->netdev) {
err = -EINVAL;
goto out_err;
@@ -1562,9 +2088,11 @@ static int nl802154_add_llsec_key(struct sk_buff *skb, struct genl_info *info)
struct ieee802154_llsec_key_id id = { };
u32 commands[NL802154_CMD_FRAME_NR_IDS / 32] = { };
- if (nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX,
- info->attrs[NL802154_ATTR_SEC_KEY],
- nl802154_key_policy, info->extack))
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL802154_ATTR_SEC_KEY] ||
+ nla_parse_nested_deprecated(attrs, NL802154_KEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_KEY], nl802154_key_policy, info->extack))
return -EINVAL;
if (!attrs[NL802154_KEY_ATTR_USAGE_FRAMES] ||
@@ -1612,9 +2140,11 @@ static int nl802154_del_llsec_key(struct sk_buff *skb, struct genl_info *info)
struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1];
struct ieee802154_llsec_key_id id;
- if (nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX,
- info->attrs[NL802154_ATTR_SEC_KEY],
- nl802154_key_policy, info->extack))
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL802154_ATTR_SEC_KEY] ||
+ nla_parse_nested_deprecated(attrs, NL802154_KEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_KEY], nl802154_key_policy, info->extack))
return -EINVAL;
if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0)
@@ -1634,12 +2164,12 @@ static int nl802154_send_device(struct sk_buff *msg, u32 cmd, u32 portid,
hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
if (!hdr)
- return -1;
+ return -ENOBUFS;
if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex))
goto nla_put_failure;
- nl_device = nla_nest_start(msg, NL802154_ATTR_SEC_DEVICE);
+ nl_device = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_DEVICE);
if (!nl_device)
goto nla_put_failure;
@@ -1678,6 +2208,11 @@ nl802154_dump_llsec_dev(struct sk_buff *skb, struct netlink_callback *cb)
if (err)
return err;
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) {
+ err = skb->len;
+ goto out_err;
+ }
+
if (!wpan_dev->netdev) {
err = -EINVAL;
goto out_err;
@@ -1728,8 +2263,7 @@ ieee802154_llsec_parse_device(struct nlattr *nla,
{
struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1];
- if (!nla || nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX,
- nla, nl802154_dev_policy, NULL))
+ if (!nla || nla_parse_nested_deprecated(attrs, NL802154_DEV_ATTR_MAX, nla, nl802154_dev_policy, NULL))
return -EINVAL;
memset(dev, 0, sizeof(*dev));
@@ -1765,6 +2299,9 @@ static int nl802154_add_llsec_dev(struct sk_buff *skb, struct genl_info *info)
struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
struct ieee802154_llsec_device dev_desc;
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
if (ieee802154_llsec_parse_device(info->attrs[NL802154_ATTR_SEC_DEVICE],
&dev_desc) < 0)
return -EINVAL;
@@ -1780,9 +2317,11 @@ static int nl802154_del_llsec_dev(struct sk_buff *skb, struct genl_info *info)
struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1];
__le64 extended_addr;
- if (nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX,
- info->attrs[NL802154_ATTR_SEC_DEVICE],
- nl802154_dev_policy, info->extack))
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL802154_ATTR_SEC_DEVICE] ||
+ nla_parse_nested_deprecated(attrs, NL802154_DEV_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVICE], nl802154_dev_policy, info->extack))
return -EINVAL;
if (!attrs[NL802154_DEV_ATTR_EXTENDED_ADDR])
@@ -1803,12 +2342,12 @@ static int nl802154_send_devkey(struct sk_buff *msg, u32 cmd, u32 portid,
hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
if (!hdr)
- return -1;
+ return -ENOBUFS;
if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex))
goto nla_put_failure;
- nl_devkey = nla_nest_start(msg, NL802154_ATTR_SEC_DEVKEY);
+ nl_devkey = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_DEVKEY);
if (!nl_devkey)
goto nla_put_failure;
@@ -1818,7 +2357,7 @@ static int nl802154_send_devkey(struct sk_buff *msg, u32 cmd, u32 portid,
devkey->frame_counter))
goto nla_put_failure;
- nl_key_id = nla_nest_start(msg, NL802154_DEVKEY_ATTR_ID);
+ nl_key_id = nla_nest_start_noflag(msg, NL802154_DEVKEY_ATTR_ID);
if (!nl_key_id)
goto nla_put_failure;
@@ -1850,6 +2389,11 @@ nl802154_dump_llsec_devkey(struct sk_buff *skb, struct netlink_callback *cb)
if (err)
return err;
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) {
+ err = skb->len;
+ goto out_err;
+ }
+
if (!wpan_dev->netdev) {
err = -EINVAL;
goto out_err;
@@ -1907,10 +2451,11 @@ static int nl802154_add_llsec_devkey(struct sk_buff *skb, struct genl_info *info
struct ieee802154_llsec_device_key key;
__le64 extended_addr;
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
if (!info->attrs[NL802154_ATTR_SEC_DEVKEY] ||
- nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX,
- info->attrs[NL802154_ATTR_SEC_DEVKEY],
- nl802154_devkey_policy, info->extack) < 0)
+ nla_parse_nested_deprecated(attrs, NL802154_DEVKEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVKEY], nl802154_devkey_policy, info->extack) < 0)
return -EINVAL;
if (!attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER] ||
@@ -1940,9 +2485,11 @@ static int nl802154_del_llsec_devkey(struct sk_buff *skb, struct genl_info *info
struct ieee802154_llsec_device_key key;
__le64 extended_addr;
- if (nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX,
- info->attrs[NL802154_ATTR_SEC_DEVKEY],
- nl802154_devkey_policy, info->extack))
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
+ if (!info->attrs[NL802154_ATTR_SEC_DEVKEY] ||
+ nla_parse_nested_deprecated(attrs, NL802154_DEVKEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVKEY], nl802154_devkey_policy, info->extack))
return -EINVAL;
if (!attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR])
@@ -1971,12 +2518,12 @@ static int nl802154_send_seclevel(struct sk_buff *msg, u32 cmd, u32 portid,
hdr = nl802154hdr_put(msg, portid, seq, flags, cmd);
if (!hdr)
- return -1;
+ return -ENOBUFS;
if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex))
goto nla_put_failure;
- nl_seclevel = nla_nest_start(msg, NL802154_ATTR_SEC_LEVEL);
+ nl_seclevel = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_LEVEL);
if (!nl_seclevel)
goto nla_put_failure;
@@ -2015,6 +2562,11 @@ nl802154_dump_llsec_seclevel(struct sk_buff *skb, struct netlink_callback *cb)
if (err)
return err;
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) {
+ err = skb->len;
+ goto out_err;
+ }
+
if (!wpan_dev->netdev) {
err = -EINVAL;
goto out_err;
@@ -2062,8 +2614,7 @@ llsec_parse_seclevel(struct nlattr *nla, struct ieee802154_llsec_seclevel *sl)
{
struct nlattr *attrs[NL802154_SECLEVEL_ATTR_MAX + 1];
- if (!nla || nla_parse_nested(attrs, NL802154_SECLEVEL_ATTR_MAX,
- nla, nl802154_seclevel_policy, NULL))
+ if (!nla || nla_parse_nested_deprecated(attrs, NL802154_SECLEVEL_ATTR_MAX, nla, nl802154_seclevel_policy, NULL))
return -EINVAL;
memset(sl, 0, sizeof(*sl));
@@ -2100,6 +2651,9 @@ static int nl802154_add_llsec_seclevel(struct sk_buff *skb,
struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
struct ieee802154_llsec_seclevel sl;
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
if (llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL],
&sl) < 0)
return -EINVAL;
@@ -2115,8 +2669,10 @@ static int nl802154_del_llsec_seclevel(struct sk_buff *skb,
struct wpan_dev *wpan_dev = dev->ieee802154_ptr;
struct ieee802154_llsec_seclevel sl;
- if (!info->attrs[NL802154_ATTR_SEC_LEVEL] ||
- llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL],
+ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR)
+ return -EOPNOTSUPP;
+
+ if (llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL],
&sl) < 0)
return -EINVAL;
@@ -2128,13 +2684,10 @@ static int nl802154_del_llsec_seclevel(struct sk_buff *skb,
#define NL802154_FLAG_NEED_NETDEV 0x02
#define NL802154_FLAG_NEED_RTNL 0x04
#define NL802154_FLAG_CHECK_NETDEV_UP 0x08
-#define NL802154_FLAG_NEED_NETDEV_UP (NL802154_FLAG_NEED_NETDEV |\
- NL802154_FLAG_CHECK_NETDEV_UP)
#define NL802154_FLAG_NEED_WPAN_DEV 0x10
-#define NL802154_FLAG_NEED_WPAN_DEV_UP (NL802154_FLAG_NEED_WPAN_DEV |\
- NL802154_FLAG_CHECK_NETDEV_UP)
-static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
+static int nl802154_pre_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
struct genl_info *info)
{
struct cfg802154_registered_device *rdev;
@@ -2196,15 +2749,15 @@ static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb,
return 0;
}
-static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
+static void nl802154_post_doit(const struct genl_split_ops *ops,
+ struct sk_buff *skb,
struct genl_info *info)
{
if (info->user_ptr[1]) {
if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) {
struct wpan_dev *wpan_dev = info->user_ptr[1];
- if (wpan_dev->netdev)
- dev_put(wpan_dev->netdev);
+ dev_put(wpan_dev->netdev);
} else {
dev_put(info->user_ptr[1]);
}
@@ -2217,165 +2770,227 @@ static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb,
static const struct genl_ops nl802154_ops[] = {
{
.cmd = NL802154_CMD_GET_WPAN_PHY,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
.doit = nl802154_get_wpan_phy,
.dumpit = nl802154_dump_wpan_phy,
.done = nl802154_dump_wpan_phy_done,
- .policy = nl802154_policy,
/* can be retrieved by unprivileged users */
.internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_GET_INTERFACE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_get_interface,
.dumpit = nl802154_dump_interface,
- .policy = nl802154_policy,
/* can be retrieved by unprivileged users */
.internal_flags = NL802154_FLAG_NEED_WPAN_DEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_NEW_INTERFACE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_new_interface,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_DEL_INTERFACE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_del_interface,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_WPAN_DEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_SET_CHANNEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_set_channel,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_SET_CCA_MODE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_set_cca_mode,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_SET_CCA_ED_LEVEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_set_cca_ed_level,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_SET_TX_POWER,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_set_tx_power,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_SET_WPAN_PHY_NETNS,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_wpan_phy_netns,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_WPAN_PHY |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_SET_PAN_ID,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_set_pan_id,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_SET_SHORT_ADDR,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_set_short_addr,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_SET_BACKOFF_EXPONENT,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_set_backoff_exponent,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_SET_MAX_CSMA_BACKOFFS,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_set_max_csma_backoffs,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_SET_MAX_FRAME_RETRIES,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_set_max_frame_retries,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_SET_LBT_MODE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_set_lbt_mode,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_SET_ACKREQ_DEFAULT,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_set_ackreq_default,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
+ {
+ .cmd = NL802154_CMD_TRIGGER_SCAN,
+ .doit = nl802154_trigger_scan,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_CHECK_NETDEV_UP |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_ABORT_SCAN,
+ .doit = nl802154_abort_scan,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_CHECK_NETDEV_UP |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SEND_BEACONS,
+ .doit = nl802154_send_beacons,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_CHECK_NETDEV_UP |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_STOP_BEACONS,
+ .doit = nl802154_stop_beacons,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_CHECK_NETDEV_UP |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_ASSOCIATE,
+ .doit = nl802154_associate,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_CHECK_NETDEV_UP |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_DISASSOCIATE,
+ .doit = nl802154_disassociate,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_CHECK_NETDEV_UP |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_SET_MAX_ASSOCIATIONS,
+ .doit = nl802154_set_max_associations,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL802154_FLAG_NEED_NETDEV |
+ NL802154_FLAG_NEED_RTNL,
+ },
+ {
+ .cmd = NL802154_CMD_LIST_ASSOCIATIONS,
+ .dumpit = nl802154_list_associations,
+ /* can be retrieved by unprivileged users */
+ },
#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
{
.cmd = NL802154_CMD_SET_SEC_PARAMS,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_set_llsec_params,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_GET_SEC_KEY,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
/* TODO .doit by matching key id? */
.dumpit = nl802154_dump_llsec_key,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_NEW_SEC_KEY,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_add_llsec_key,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_DEL_SEC_KEY,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_del_llsec_key,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
@@ -2383,25 +2998,26 @@ static const struct genl_ops nl802154_ops[] = {
/* TODO unique identifier must short+pan OR extended_addr */
{
.cmd = NL802154_CMD_GET_SEC_DEV,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
/* TODO .doit by matching extended_addr? */
.dumpit = nl802154_dump_llsec_dev,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_NEW_SEC_DEV,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_add_llsec_dev,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_DEL_SEC_DEV,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_del_llsec_dev,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
@@ -2409,51 +3025,53 @@ static const struct genl_ops nl802154_ops[] = {
/* TODO remove complete devkey, put it as nested? */
{
.cmd = NL802154_CMD_GET_SEC_DEVKEY,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
/* TODO doit by matching ??? */
.dumpit = nl802154_dump_llsec_devkey,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_NEW_SEC_DEVKEY,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_add_llsec_devkey,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_DEL_SEC_DEVKEY,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_del_llsec_devkey,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_GET_SEC_LEVEL,
+ .validate = GENL_DONT_VALIDATE_STRICT |
+ GENL_DONT_VALIDATE_DUMP_STRICT,
/* TODO .doit by matching frame_type? */
.dumpit = nl802154_dump_llsec_seclevel,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_NEW_SEC_LEVEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = nl802154_add_llsec_seclevel,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
},
{
.cmd = NL802154_CMD_DEL_SEC_LEVEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
/* TODO match frame_type only? */
.doit = nl802154_del_llsec_seclevel,
- .policy = nl802154_policy,
.flags = GENL_ADMIN_PERM,
.internal_flags = NL802154_FLAG_NEED_NETDEV |
NL802154_FLAG_NEED_RTNL,
@@ -2466,12 +3084,14 @@ static struct genl_family nl802154_fam __ro_after_init = {
.hdrsize = 0, /* no private header */
.version = 1, /* no particular meaning now */
.maxattr = NL802154_ATTR_MAX,
+ .policy = nl802154_policy,
.netnsok = true,
.pre_doit = nl802154_pre_doit,
.post_doit = nl802154_post_doit,
.module = THIS_MODULE,
.ops = nl802154_ops,
.n_ops = ARRAY_SIZE(nl802154_ops),
+ .resv_start_op = NL802154_CMD_DEL_SEC_LEVEL + 1,
.mcgrps = nl802154_mcgrps,
.n_mcgrps = ARRAY_SIZE(nl802154_mcgrps),
};
diff --git a/net/ieee802154/nl802154.h b/net/ieee802154/nl802154.h
index 8c4b6d08954c..d69d950f9a6a 100644
--- a/net/ieee802154/nl802154.h
+++ b/net/ieee802154/nl802154.h
@@ -4,5 +4,11 @@
int nl802154_init(void);
void nl802154_exit(void);
+int nl802154_scan_event(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ struct ieee802154_coord_desc *desc);
+int nl802154_scan_started(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev);
+int nl802154_scan_done(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev,
+ enum nl802154_scan_done_reasons reason);
+void nl802154_beaconing_done(struct wpan_dev *wpan_dev);
#endif /* __IEEE802154_NL802154_H */
diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c
index 78f6f1233194..0672b2f01586 100644
--- a/net/ieee802154/nl_policy.c
+++ b/net/ieee802154/nl_policy.c
@@ -1,17 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* nl802154.h
*
* Copyright (C) 2007, 2008 Siemens AG
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
*/
#include <linux/kernel.h>
@@ -30,7 +21,13 @@ const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = {
[IEEE802154_ATTR_HW_ADDR] = { .type = NLA_HW_ADDR, },
[IEEE802154_ATTR_PAN_ID] = { .type = NLA_U16, },
[IEEE802154_ATTR_CHANNEL] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_BCN_ORD] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_SF_ORD] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_PAN_COORD] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_BAT_EXT] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_COORD_REALIGN] = { .type = NLA_U8, },
[IEEE802154_ATTR_PAGE] = { .type = NLA_U8, },
+ [IEEE802154_ATTR_DEV_TYPE] = { .type = NLA_U8, },
[IEEE802154_ATTR_COORD_SHORT_ADDR] = { .type = NLA_U16, },
[IEEE802154_ATTR_COORD_HW_ADDR] = { .type = NLA_HW_ADDR, },
[IEEE802154_ATTR_COORD_PAN_ID] = { .type = NLA_U16, },
diff --git a/net/ieee802154/pan.c b/net/ieee802154/pan.c
new file mode 100644
index 000000000000..249df7364b3e
--- /dev/null
+++ b/net/ieee802154/pan.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * IEEE 802.15.4 PAN management
+ *
+ * Copyright (C) 2023 Qorvo US, Inc
+ * Authors:
+ * - David Girault <david.girault@qorvo.com>
+ * - Miquel Raynal <miquel.raynal@bootlin.com>
+ */
+
+#include <linux/kernel.h>
+#include <net/cfg802154.h>
+#include <net/af_ieee802154.h>
+
+/* Tell whether @ext_dev designates the PAN list entry @pan_dev.
+ * Only meant for PAN management, where extended addresses are expected:
+ * a missing argument or a short-mode address never matches.
+ */
+static bool cfg802154_pan_device_is_matching(struct ieee802154_pan_device *pan_dev,
+					     struct ieee802154_addr *ext_dev)
+{
+	bool comparable;
+
+	/* Both sides must exist and the target must not be in short mode */
+	comparable = pan_dev && ext_dev &&
+		     ext_dev->mode != IEEE802154_ADDR_SHORT;
+
+	return comparable && pan_dev->extended_addr == ext_dev->extended_addr;
+}
+
+bool cfg802154_device_is_associated(struct wpan_dev *wpan_dev)
+{
+	bool linked;
+
+	/* Associated means: we have a parent or at least one child */
+	mutex_lock(&wpan_dev->association_lock);
+	linked = wpan_dev->parent || !list_empty(&wpan_dev->children);
+	mutex_unlock(&wpan_dev->association_lock);
+	return linked;
+}
+
+bool cfg802154_device_is_parent(struct wpan_dev *wpan_dev,
+				struct ieee802154_addr *target)
+{
+	/* Caller owns association_lock; @target is checked against our parent */
+	lockdep_assert_held(&wpan_dev->association_lock);
+	return cfg802154_pan_device_is_matching(wpan_dev->parent, target);
+}
+EXPORT_SYMBOL_GPL(cfg802154_device_is_parent);
+
+struct ieee802154_pan_device *
+cfg802154_device_is_child(struct wpan_dev *wpan_dev,
+			  struct ieee802154_addr *target)
+{
+	struct ieee802154_pan_device *pos;
+
+	lockdep_assert_held(&wpan_dev->association_lock);
+	/* Hand back the first entry of the children list matching @target */
+	list_for_each_entry(pos, &wpan_dev->children, node) {
+		if (cfg802154_pan_device_is_matching(pos, target))
+			return pos;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(cfg802154_device_is_child);
+
+/* Draw random short addresses until one is found that is neither reserved
+ * (broadcast/unspecified) nor already used by this device, its parent or
+ * one of its children. The caller must hold wpan_dev->association_lock.
+ */
+__le16 cfg802154_get_free_short_addr(struct wpan_dev *wpan_dev)
+{
+	struct ieee802154_pan_device *child;
+	__le16 addr;
+	bool in_use;
+
+	lockdep_assert_held(&wpan_dev->association_lock);
+
+	do {
+		get_random_bytes(&addr, 2);
+		in_use = addr == cpu_to_le16(IEEE802154_ADDR_SHORT_BROADCAST) ||
+			 addr == cpu_to_le16(IEEE802154_ADDR_SHORT_UNSPEC) ||
+			 wpan_dev->short_addr == addr ||
+			 (wpan_dev->parent && wpan_dev->parent->short_addr == addr);
+		/* Flag the clash: a "continue" in this inner loop would only
+		 * advance the list walk, not redraw the address.
+		 */
+		list_for_each_entry(child, &wpan_dev->children, node)
+			if (child->short_addr == addr)
+				in_use = true;
+	} while (in_use);
+
+	return addr;
+}
+EXPORT_SYMBOL_GPL(cfg802154_get_free_short_addr);
+
+/* Store @max in wpan_dev->max_associations and return the value it replaced. */
+unsigned int cfg802154_set_max_associations(struct wpan_dev *wpan_dev,
+					    unsigned int max)
+{
+	unsigned int previous;
+
+	lockdep_assert_held(&wpan_dev->association_lock);
+	previous = wpan_dev->max_associations;
+	wpan_dev->max_associations = max;
+
+	return previous;
+}
+EXPORT_SYMBOL_GPL(cfg802154_set_max_associations);
diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h
index 598f5af49775..64071ef6f57b 100644
--- a/net/ieee802154/rdev-ops.h
+++ b/net/ieee802154/rdev-ops.h
@@ -209,6 +209,92 @@ rdev_set_ackreq_default(struct cfg802154_registered_device *rdev,
return ret;
}
+/* Call ops->trigger_scan, traced on entry and return; -EOPNOTSUPP if unset. */
+static inline int rdev_trigger_scan(struct cfg802154_registered_device *rdev,
+				    struct cfg802154_scan_request *request)
+{
+	int ret = -EOPNOTSUPP;
+
+	if (rdev->ops->trigger_scan) {
+		trace_802154_rdev_trigger_scan(&rdev->wpan_phy, request);
+		ret = rdev->ops->trigger_scan(&rdev->wpan_phy, request);
+		trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+	}
+	return ret;
+}
+
+/* Call ops->abort_scan, traced on entry and return; -EOPNOTSUPP if unset. */
+static inline int rdev_abort_scan(struct cfg802154_registered_device *rdev,
+				  struct wpan_dev *wpan_dev)
+{
+	int ret = -EOPNOTSUPP;
+
+	if (rdev->ops->abort_scan) {
+		trace_802154_rdev_abort_scan(&rdev->wpan_phy, wpan_dev);
+		ret = rdev->ops->abort_scan(&rdev->wpan_phy, wpan_dev);
+		trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+	}
+	return ret;
+}
+
+/* Call ops->send_beacons, traced on entry and return; -EOPNOTSUPP if unset. */
+static inline int rdev_send_beacons(struct cfg802154_registered_device *rdev,
+				    struct cfg802154_beacon_request *request)
+{
+	int ret = -EOPNOTSUPP;
+
+	if (rdev->ops->send_beacons) {
+		trace_802154_rdev_send_beacons(&rdev->wpan_phy, request);
+		ret = rdev->ops->send_beacons(&rdev->wpan_phy, request);
+		trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+	}
+	return ret;
+}
+
+/* Call ops->stop_beacons, traced on entry and return; -EOPNOTSUPP if unset. */
+static inline int rdev_stop_beacons(struct cfg802154_registered_device *rdev,
+				    struct wpan_dev *wpan_dev)
+{
+	int ret = -EOPNOTSUPP;
+
+	if (rdev->ops->stop_beacons) {
+		trace_802154_rdev_stop_beacons(&rdev->wpan_phy, wpan_dev);
+		ret = rdev->ops->stop_beacons(&rdev->wpan_phy, wpan_dev);
+		trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+	}
+	return ret;
+}
+
+/* Call ops->associate, traced on entry and return; -EOPNOTSUPP if unset. */
+static inline int rdev_associate(struct cfg802154_registered_device *rdev,
+				 struct wpan_dev *wpan_dev,
+				 struct ieee802154_addr *coord)
+{
+	int ret = -EOPNOTSUPP;
+
+	if (rdev->ops->associate) {
+		trace_802154_rdev_associate(&rdev->wpan_phy, wpan_dev, coord);
+		ret = rdev->ops->associate(&rdev->wpan_phy, wpan_dev, coord);
+		trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+	}
+	return ret;
+}
+
+/* Call ops->disassociate, traced on entry and return; -EOPNOTSUPP if unset. */
+static inline int rdev_disassociate(struct cfg802154_registered_device *rdev,
+				    struct wpan_dev *wpan_dev,
+				    struct ieee802154_addr *target)
+{
+	int ret = -EOPNOTSUPP;
+
+	if (rdev->ops->disassociate) {
+		trace_802154_rdev_disassociate(&rdev->wpan_phy, wpan_dev, target);
+		ret = rdev->ops->disassociate(&rdev->wpan_phy, wpan_dev, target);
+		trace_802154_rdev_return_int(&rdev->wpan_phy, ret);
+	}
+	return ret;
+}
+
#ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL
/* TODO this is already a nl802154, so move into ieee802154 */
static inline void
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index bc6b912603f1..e542fbe113e7 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -1,17 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IEEE802154.4 socket interface
*
* Copyright 2007, 2008 Siemens AG
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
* Written by:
* Sergey Lapin <slapin@ossfans.org>
* Maxim Gorbachyov <maxim.gorbachev@siemens.com>
@@ -49,8 +41,7 @@ ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr)
ieee802154_devaddr_to_raw(hwaddr, addr->extended_addr);
rcu_read_lock();
dev = dev_getbyhwaddr_rcu(net, ARPHRD_IEEE802154, hwaddr);
- if (dev)
- dev_hold(dev);
+ dev_hold(dev);
rcu_read_unlock();
break;
case IEEE802154_ADDR_SHORT:
@@ -105,7 +96,7 @@ static int ieee802154_sock_sendmsg(struct socket *sock, struct msghdr *msg,
return sk->sk_prot->sendmsg(sk, msg, len);
}
-static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr,
+static int ieee802154_sock_bind(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len)
{
struct sock *sk = sock->sk;
@@ -116,7 +107,7 @@ static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr,
return sock_no_bind(sock, uaddr, addr_len);
}
-static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr,
+static int ieee802154_sock_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
@@ -137,7 +128,7 @@ static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg,
int ret = -ENOIOCTLCMD;
struct net_device *dev;
- if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+ if (get_user_ifreq(&ifr, NULL, arg))
return -EFAULT;
ifr.ifr_name[IFNAMSIZ-1] = 0;
@@ -151,7 +142,7 @@ static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg,
if (dev->type == ARPHRD_IEEE802154 && dev->netdev_ops->ndo_do_ioctl)
ret = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, cmd);
- if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq)))
+ if (!ret && put_user_ifreq(&ifr, arg))
ret = -EFAULT;
dev_put(dev);
@@ -164,10 +155,6 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd,
struct sock *sk = sock->sk;
switch (cmd) {
- case SIOCGSTAMP:
- return sock_get_timestamp(sk, (struct timeval __user *)arg);
- case SIOCGSTAMPNS:
- return sock_get_timestampns(sk, (struct timespec __user *)arg);
case SIOCGIFADDR:
case SIOCSIFADDR:
return ieee802154_dev_ioctl(sk, (struct ifreq __user *)arg,
@@ -175,7 +162,7 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd,
default:
if (!sk->sk_prot->ioctl)
return -ENOIOCTLCMD;
- return sk->sk_prot->ioctl(sk, cmd, arg);
+ return sk_ioctl(sk, cmd, (void __user *)arg);
}
}
@@ -187,8 +174,8 @@ static int raw_hash(struct sock *sk)
{
write_lock_bh(&raw_lock);
sk_add_node(sk, &raw_head);
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
write_unlock_bh(&raw_lock);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
return 0;
}
@@ -206,15 +193,16 @@ static void raw_close(struct sock *sk, long timeout)
sk_common_release(sk);
}
-static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len)
+static int raw_bind(struct sock *sk, struct sockaddr_unsized *_uaddr, int len)
{
struct ieee802154_addr addr;
struct sockaddr_ieee802154 *uaddr = (struct sockaddr_ieee802154 *)_uaddr;
int err = 0;
struct net_device *dev = NULL;
- if (len < sizeof(*uaddr))
- return -EINVAL;
+ err = ieee802154_sockaddr_check_size(uaddr, len);
+ if (err < 0)
+ return err;
uaddr = (struct sockaddr_ieee802154 *)_uaddr;
if (uaddr->family != AF_IEEE802154)
@@ -239,7 +227,7 @@ out:
return err;
}
-static int raw_connect(struct sock *sk, struct sockaddr *uaddr,
+static int raw_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
int addr_len)
{
return -ENOTSUPP;
@@ -284,6 +272,10 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
err = -EMSGSIZE;
goto out_dev;
}
+ if (!size) {
+ err = 0;
+ goto out_dev;
+ }
hlen = LL_RESERVED_SPACE(dev);
tlen = dev->needed_tailroom;
@@ -321,13 +313,13 @@ out:
}
static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int noblock, int flags, int *addr_len)
+ int flags, int *addr_len)
{
size_t copied = 0;
int err = -EOPNOTSUPP;
struct sk_buff *skb;
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto out;
@@ -341,7 +333,7 @@ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (err)
goto done;
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
if (flags & MSG_TRUNC)
copied = skb->len;
@@ -394,7 +386,7 @@ static int raw_getsockopt(struct sock *sk, int level, int optname,
}
static int raw_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
return -EOPNOTSUPP;
}
@@ -426,6 +418,7 @@ static const struct proto_ops ieee802154_raw_ops = {
.getname = sock_no_getname,
.poll = datagram_poll,
.ioctl = ieee802154_sock_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = sock_common_setsockopt,
@@ -433,11 +426,6 @@ static const struct proto_ops ieee802154_raw_ops = {
.sendmsg = ieee802154_sock_sendmsg,
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
-#endif
};
/* DGRAM Sockets (802.15.4 dataframes) */
@@ -469,8 +457,8 @@ static int dgram_hash(struct sock *sk)
{
write_lock_bh(&dgram_lock);
sk_add_node(sk, &dgram_head);
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
write_unlock_bh(&dgram_lock);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
return 0;
}
@@ -497,7 +485,7 @@ static void dgram_close(struct sock *sk, long timeout)
sk_common_release(sk);
}
-static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len)
+static int dgram_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int len)
{
struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr;
struct ieee802154_addr haddr;
@@ -509,11 +497,14 @@ static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len)
ro->bound = 0;
- if (len < sizeof(*addr))
+ err = ieee802154_sockaddr_check_size(addr, len);
+ if (err < 0)
goto out;
- if (addr->family != AF_IEEE802154)
+ if (addr->family != AF_IEEE802154) {
+ err = -EINVAL;
goto out;
+ }
ieee802154_addr_from_sa(&haddr, &addr->addr);
dev = ieee802154_get_dev(sock_net(sk), &haddr);
@@ -539,22 +530,21 @@ out:
return err;
}
-static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg)
+static int dgram_ioctl(struct sock *sk, int cmd, int *karg)
{
switch (cmd) {
case SIOCOUTQ:
{
- int amount = sk_wmem_alloc_get(sk);
+ *karg = sk_wmem_alloc_get(sk);
- return put_user(amount, (int __user *)arg);
+ return 0;
}
case SIOCINQ:
{
struct sk_buff *skb;
- unsigned long amount;
- amount = 0;
+ *karg = 0;
spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
if (skb) {
@@ -562,10 +552,10 @@ static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg)
* of this packet since that is all
* that will be read.
*/
- amount = skb->len - ieee802154_hdr_length(skb);
+ *karg = skb->len - ieee802154_hdr_length(skb);
}
spin_unlock_bh(&sk->sk_receive_queue.lock);
- return put_user(amount, (int __user *)arg);
+ return 0;
}
}
@@ -573,15 +563,16 @@ static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg)
}
/* FIXME: autobind */
-static int dgram_connect(struct sock *sk, struct sockaddr *uaddr,
+static int dgram_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
int len)
{
struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr;
struct dgram_sock *ro = dgram_sk(sk);
int err = 0;
- if (len < sizeof(*addr))
- return -EINVAL;
+ err = ieee802154_sockaddr_check_size(addr, len);
+ if (err < 0)
+ return err;
if (addr->family != AF_IEEE802154)
return -EINVAL;
@@ -620,6 +611,7 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
struct ieee802154_mac_cb *cb;
struct dgram_sock *ro = dgram_sk(sk);
struct ieee802154_addr dst_addr;
+ DECLARE_SOCKADDR(struct sockaddr_ieee802154*, daddr, msg->msg_name);
int hlen, tlen;
int err;
@@ -628,10 +620,20 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
return -EOPNOTSUPP;
}
- if (!ro->connected && !msg->msg_name)
- return -EDESTADDRREQ;
- else if (ro->connected && msg->msg_name)
- return -EISCONN;
+ if (msg->msg_name) {
+ if (ro->connected)
+ return -EISCONN;
+ if (msg->msg_namelen < IEEE802154_MIN_NAMELEN)
+ return -EINVAL;
+ err = ieee802154_sockaddr_check_size(daddr, msg->msg_namelen);
+ if (err < 0)
+ return err;
+ ieee802154_addr_from_sa(&dst_addr, &daddr->addr);
+ } else {
+ if (!ro->connected)
+ return -EDESTADDRREQ;
+ dst_addr = ro->dst_addr;
+ }
if (!ro->bound)
dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154);
@@ -667,16 +669,6 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
cb = mac_cb_init(skb);
cb->type = IEEE802154_FC_TYPE_DATA;
cb->ackreq = ro->want_ack;
-
- if (msg->msg_name) {
- DECLARE_SOCKADDR(struct sockaddr_ieee802154*,
- daddr, msg->msg_name);
-
- ieee802154_addr_from_sa(&dst_addr, &daddr->addr);
- } else {
- dst_addr = ro->dst_addr;
- }
-
cb->secen = ro->secen;
cb->secen_override = ro->secen_override;
cb->seclevel = ro->seclevel;
@@ -711,7 +703,7 @@ out:
}
static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int noblock, int flags, int *addr_len)
+ int flags, int *addr_len)
{
size_t copied = 0;
int err = -EOPNOTSUPP;
@@ -719,7 +711,7 @@ static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
struct dgram_sock *ro = dgram_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_ieee802154 *, saddr, msg->msg_name);
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto out;
@@ -734,7 +726,7 @@ static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (err)
goto done;
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
if (saddr) {
/* Clear the implicit padding in struct sockaddr_ieee802154
@@ -887,7 +879,7 @@ static int dgram_getsockopt(struct sock *sk, int level, int optname,
}
static int dgram_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct dgram_sock *ro = dgram_sk(sk);
struct net *net = sock_net(sk);
@@ -897,7 +889,7 @@ static int dgram_setsockopt(struct sock *sk, int level, int optname,
if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(val, (int __user *)optval))
+ if (copy_from_sockptr(&val, optval, sizeof(int)))
return -EFAULT;
lock_sock(sk);
@@ -988,6 +980,7 @@ static const struct proto_ops ieee802154_dgram_ops = {
.getname = sock_no_getname,
.poll = datagram_poll,
.ioctl = ieee802154_sock_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = sock_common_setsockopt,
@@ -995,13 +988,13 @@ static const struct proto_ops ieee802154_dgram_ops = {
.sendmsg = ieee802154_sock_sendmsg,
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
-#endif
};
+static void ieee802154_sock_destruct(struct sock *sk)
+{
+ skb_queue_purge(&sk->sk_receive_queue);
+}
+
/* Create a socket. Initialise the socket, blank the addresses
* set the state.
*/
@@ -1018,6 +1011,9 @@ static int ieee802154_create(struct net *net, struct socket *sock,
switch (sock->type) {
case SOCK_RAW:
+ rc = -EPERM;
+ if (!capable(CAP_NET_RAW))
+ goto out;
proto = &ieee802154_raw_prot;
ops = &ieee802154_raw_ops;
break;
@@ -1039,7 +1035,7 @@ static int ieee802154_create(struct net *net, struct socket *sock,
sock->ops = ops;
sock_init_data(sock, sk);
- /* FIXME: sk->sk_destruct */
+ sk->sk_destruct = ieee802154_sock_destruct;
sk->sk_family = PF_IEEE802154;
/* Checksums on by default */
@@ -1047,19 +1043,21 @@ static int ieee802154_create(struct net *net, struct socket *sock,
if (sk->sk_prot->hash) {
rc = sk->sk_prot->hash(sk);
- if (rc) {
- sk_common_release(sk);
- goto out;
- }
+ if (rc)
+ goto out_sk_release;
}
if (sk->sk_prot->init) {
rc = sk->sk_prot->init(sk);
if (rc)
- sk_common_release(sk);
+ goto out_sk_release;
}
out:
return rc;
+out_sk_release:
+ sk_common_release(sk);
+ sock->sk = NULL;
+ goto out;
}
static const struct net_proto_family ieee802154_family_ops = {
@@ -1102,7 +1100,7 @@ static struct packet_type ieee802154_packet_type = {
static int __init af_ieee802154_init(void)
{
- int rc = -EINVAL;
+ int rc;
rc = proto_register(&ieee802154_raw_prot, 1);
if (rc)
@@ -1141,4 +1139,5 @@ module_init(af_ieee802154_init);
module_exit(af_ieee802154_remove);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IEEE 802.15.4 socket interface");
MODULE_ALIAS_NETPROTO(PF_IEEE802154);
diff --git a/net/ieee802154/sysfs.c b/net/ieee802154/sysfs.c
index bd88525b041e..6708160ebf9f 100644
--- a/net/ieee802154/sysfs.c
+++ b/net/ieee802154/sysfs.c
@@ -1,11 +1,5 @@
-/* This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+// SPDX-License-Identifier: GPL-2.0-only
+/*
*
* Authors:
* Alexander Aring <aar@pengutronix.de>
@@ -99,7 +93,7 @@ static SIMPLE_DEV_PM_OPS(wpan_phy_pm_ops, wpan_phy_suspend, wpan_phy_resume);
#define WPAN_PHY_PM_OPS NULL
#endif
-struct class wpan_phy_class = {
+const struct class wpan_phy_class = {
.name = "ieee802154",
.dev_release = wpan_phy_release,
.dev_groups = pmib_groups,
diff --git a/net/ieee802154/sysfs.h b/net/ieee802154/sysfs.h
index 337545b639e9..69961e166257 100644
--- a/net/ieee802154/sysfs.h
+++ b/net/ieee802154/sysfs.h
@@ -5,6 +5,6 @@
int wpan_phy_sysfs_init(void);
void wpan_phy_sysfs_exit(void);
-extern struct class wpan_phy_class;
+extern const struct class wpan_phy_class;
#endif /* __IEEE802154_SYSFS_H */
diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h
index 19c2e5d60e76..591ce0a16fc0 100644
--- a/net/ieee802154/trace.h
+++ b/net/ieee802154/trace.h
@@ -13,7 +13,7 @@
#define MAXNAME 32
#define WPAN_PHY_ENTRY __array(char, wpan_phy_name, MAXNAME)
-#define WPAN_PHY_ASSIGN strlcpy(__entry->wpan_phy_name, \
+#define WPAN_PHY_ASSIGN strscpy(__entry->wpan_phy_name, \
wpan_phy_name(wpan_phy), \
MAXNAME)
#define WPAN_PHY_PR_FMT "%s"
@@ -75,7 +75,7 @@ TRACE_EVENT(802154_rdev_add_virtual_intf,
),
TP_fast_assign(
WPAN_PHY_ASSIGN;
- __assign_str(vir_intf_name, name ? name : "<noname>");
+ __assign_str(vir_intf_name);
__entry->type = type;
__entry->extended_addr = extended_addr;
),
@@ -295,6 +295,105 @@ TRACE_EVENT(802154_rdev_set_ackreq_default,
WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->ackreq))
);
+TRACE_EVENT(802154_rdev_trigger_scan,
+ TP_PROTO(struct wpan_phy *wpan_phy,
+ struct cfg802154_scan_request *request),
+ TP_ARGS(wpan_phy, request),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __field(u8, page)
+ __field(u32, channels)
+ __field(u8, duration)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __entry->page = request->page;
+ __entry->channels = request->channels;
+ __entry->duration = request->duration;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", scan, page: %d, channels: %x, duration %d",
+ WPAN_PHY_PR_ARG, __entry->page, __entry->channels, __entry->duration)
+);
+
+TRACE_EVENT(802154_rdev_send_beacons,
+ TP_PROTO(struct wpan_phy *wpan_phy,
+ struct cfg802154_beacon_request *request),
+ TP_ARGS(wpan_phy, request),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ __field(u8, interval)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ __entry->interval = request->interval;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", sending beacons (interval order: %d)",
+ WPAN_PHY_PR_ARG, __entry->interval)
+);
+
+DECLARE_EVENT_CLASS(802154_wdev_template,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev),
+ TP_ARGS(wpan_phy, wpan_dev),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT,
+ WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG)
+);
+
+DEFINE_EVENT(802154_wdev_template, 802154_rdev_abort_scan,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev),
+ TP_ARGS(wpan_phy, wpan_dev)
+);
+
+DEFINE_EVENT(802154_wdev_template, 802154_rdev_stop_beacons,
+ TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev),
+ TP_ARGS(wpan_phy, wpan_dev)
+);
+
+TRACE_EVENT(802154_rdev_associate,
+ TP_PROTO(struct wpan_phy *wpan_phy,
+ struct wpan_dev *wpan_dev,
+ struct ieee802154_addr *coord),
+ TP_ARGS(wpan_phy, wpan_dev, coord),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(__le64, addr)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->addr = coord->extended_addr;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", associating with: 0x%llx",
+ WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __entry->addr)
+);
+
+TRACE_EVENT(802154_rdev_disassociate,
+ TP_PROTO(struct wpan_phy *wpan_phy,
+ struct wpan_dev *wpan_dev,
+ struct ieee802154_addr *target),
+ TP_ARGS(wpan_phy, wpan_dev, target),
+ TP_STRUCT__entry(
+ WPAN_PHY_ENTRY
+ WPAN_DEV_ENTRY
+ __field(__le64, addr)
+ ),
+ TP_fast_assign(
+ WPAN_PHY_ASSIGN;
+ WPAN_DEV_ASSIGN;
+ __entry->addr = target->extended_addr;
+ ),
+ TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT ", disassociating with: 0x%llx",
+ WPAN_PHY_PR_ARG, WPAN_DEV_PR_ARG, __entry->addr)
+);
+
TRACE_EVENT(802154_rdev_return_int,
TP_PROTO(struct wpan_phy *wpan_phy, int ret),
TP_ARGS(wpan_phy, ret),
diff --git a/net/ife/Kconfig b/net/ife/Kconfig
index 31e48b652c7c..de36a5b91e50 100644
--- a/net/ife/Kconfig
+++ b/net/ife/Kconfig
@@ -1,10 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# IFE subsystem configuration
#
menuconfig NET_IFE
- depends on NET
- tristate "Inter-FE based on IETF ForCES InterFE LFB"
+ tristate "Inter-FE based on IETF ForCES InterFE LFB"
default n
help
Say Y here to add support of IFE encapsulation protocol
diff --git a/net/ife/Makefile b/net/ife/Makefile
index 2a90d97746cc..1258fcb07f67 100644
--- a/net/ife/Makefile
+++ b/net/ife/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the IFE encapsulation protocol
#
diff --git a/net/ife/ife.c b/net/ife/ife.c
index 13bbf8cb6a39..be05b690b9ef 100644
--- a/net/ife/ife.c
+++ b/net/ife/ife.c
@@ -82,6 +82,7 @@ void *ife_decode(struct sk_buff *skb, u16 *metalen)
if (unlikely(!pskb_may_pull(skb, total_pull)))
return NULL;
+ ifehdr = (struct ifeheadr *)(skb->data + skb->dev->hard_header_len);
skb_set_mac_header(skb, total_pull);
__skb_pull(skb, total_pull);
*metalen = ifehdrln - IFE_METAHDRLEN;
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 32cae39cdff6..b71c22475c51 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# IP configuration
#
@@ -9,11 +10,11 @@ config IP_MULTICAST
intend to participate in the MBONE, a high bandwidth network on top
of the Internet which carries audio and video broadcasts. More
information about the MBONE is on the WWW at
- <http://www.savetz.com/mbone/>. For most people, it's safe to say N.
+ <https://www.savetz.com/mbone/>. For most people, it's safe to say N.
config IP_ADVANCED_ROUTER
bool "IP: advanced router"
- ---help---
+ help
If you intend to run your Linux box mostly as a router, i.e. as a
computer that forwards and redistributes network packets, say Y; you
will then be presented with several options that allow more precise
@@ -48,14 +49,14 @@ config IP_ADVANCED_ROUTER
Note that some distributions enable it in startup scripts.
For details about rp_filter strict and loose mode read
- <file:Documentation/networking/ip-sysctl.txt>.
+ <file:Documentation/networking/ip-sysctl.rst>.
If unsure, say N here.
config IP_FIB_TRIE_STATS
bool "FIB TRIE statistics"
depends on IP_ADVANCED_ROUTER
- ---help---
+ help
Keep track of statistics on structure of FIB TRIE table.
Useful for testing and measuring TRIE performance.
@@ -63,7 +64,7 @@ config IP_MULTIPLE_TABLES
bool "IP: policy routing"
depends on IP_ADVANCED_ROUTER
select FIB_RULES
- ---help---
+ help
Normally, a router decides what to do with a received packet based
solely on the packet's final destination address. If you say Y here,
the Linux router will also be able to take the packet's source
@@ -72,7 +73,7 @@ config IP_MULTIPLE_TABLES
If you need more information, see the Linux Advanced
Routing and Traffic Control documentation at
- <http://lartc.org/howto/lartc.rpdb.html>
+ <https://lartc.org/howto/lartc.rpdb.html>
If unsure, say N.
@@ -116,7 +117,7 @@ config IP_PNP
config IP_PNP_DHCP
bool "IP: DHCP support"
depends on IP_PNP
- ---help---
+ help
If you want your Linux box to mount its whole root file system (the
one containing the directory /) from some other computer over the
net via NFS and you want the IP address of your computer to be
@@ -128,12 +129,12 @@ config IP_PNP_DHCP
If unsure, say Y. Note that if you want to use DHCP, a DHCP server
must be operating on your network. Read
- <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+ <file:Documentation/admin-guide/nfs/nfsroot.rst> for details.
config IP_PNP_BOOTP
bool "IP: BOOTP support"
depends on IP_PNP
- ---help---
+ help
If you want your Linux box to mount its whole root file system (the
one containing the directory /) from some other computer over the
net via NFS and you want the IP address of your computer to be
@@ -143,7 +144,7 @@ config IP_PNP_BOOTP
does BOOTP itself, providing all necessary information on the kernel
command line, you can say N here. If unsure, say Y. Note that if you
want to use BOOTP, a BOOTP server must be operating on your network.
- Read <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+ Read <file:Documentation/admin-guide/nfs/nfsroot.rst> for details.
config IP_PNP_RARP
bool "IP: RARP support"
@@ -156,13 +157,13 @@ config IP_PNP_RARP
older protocol which is being obsoleted by BOOTP and DHCP), say Y
here. Note that if you want to use RARP, a RARP server must be
operating on your network. Read
- <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
+ <file:Documentation/admin-guide/nfs/nfsroot.rst> for details.
config NET_IPIP
tristate "IP: tunneling"
select INET_TUNNEL
select NET_IP_TUNNEL
- ---help---
+ help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
encapsulating protocol. This particular tunneling driver implements
@@ -179,8 +180,8 @@ config NET_IPIP
config NET_IPGRE_DEMUX
tristate "IP: GRE demultiplexer"
help
- This is helper module to demultiplex GRE packets on GRE version field criteria.
- Required by ip_gre and pptp modules.
+ This is a helper module to demultiplex GRE packets on GRE version field criteria.
+ Required by ip_gre and pptp modules.
config NET_IP_TUNNEL
tristate
@@ -266,7 +267,7 @@ config IP_PIMSM_V2
config SYN_COOKIES
bool "IP: TCP syncookie support"
- ---help---
+ help
Normal TCP/IP networking is open to an attack known as "SYN
flooding". This denial-of-service attack prevents legitimate remote
users from being able to connect to your computer during an ongoing
@@ -279,7 +280,7 @@ config SYN_COOKIES
continue to connect, even when your machine is under attack. There
is no need for the legitimate users to change their TCP/IP software;
SYN cookies work transparently to them. For technical information
- about SYN cookies, check out <http://cr.yp.to/syncookies.html>.
+ about SYN cookies, check out <https://cr.yp.to/syncookies.html>.
If you are SYN flooded, the source address reported by the kernel is
likely to have been forged by the attacker; it is only reported as
@@ -302,10 +303,11 @@ config SYN_COOKIES
config NET_IPVTI
tristate "Virtual (secure) IP: tunneling"
+ depends on IPV6 || IPV6=n
select INET_TUNNEL
select NET_IP_TUNNEL
- depends on INET_XFRM_MODE_TUNNEL
- ---help---
+ select XFRM
+ help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
encapsulating protocol. This can be used with xfrm mode tunnel to give
@@ -319,9 +321,8 @@ config NET_UDP_TUNNEL
config NET_FOU
tristate "IP: Foo (IP protocols) over UDP"
- select XFRM
select NET_UDP_TUNNEL
- ---help---
+ help
Foo over UDP allows any IP protocol to be directly encapsulated
over UDP include tunnels (IPIP, GRE, SIT). By encapsulating in UDP
network mechanisms and optimizations for UDP (such as ECMP
@@ -331,36 +332,38 @@ config NET_FOU_IP_TUNNELS
bool "IP: FOU encapsulation of IP tunnels"
depends on NET_IPIP || NET_IPGRE || IPV6_SIT
select NET_FOU
- ---help---
+ help
Allow configuration of FOU or GUE encapsulation for IP tunnels.
When this option is enabled IP tunnels can be configured to use
FOU or GUE encapsulation.
config INET_AH
tristate "IP: AH transformation"
- select XFRM_ALGO
- select CRYPTO
- select CRYPTO_HMAC
- select CRYPTO_MD5
- select CRYPTO_SHA1
- ---help---
- Support for IPsec AH.
+ select XFRM_AH
+ help
+ Support for IPsec AH (Authentication Header).
+
+ AH can be used with various authentication algorithms. Besides
+ enabling AH support itself, this option enables the generic
+ implementations of the algorithms that RFC 8221 lists as MUST be
+ implemented. If you need any other algorithms, you'll need to enable
+ them in the crypto API. You should also enable accelerated
+ implementations of any needed algorithms when available.
If unsure, say Y.
config INET_ESP
tristate "IP: ESP transformation"
- select XFRM_ALGO
- select CRYPTO
- select CRYPTO_AUTHENC
- select CRYPTO_HMAC
- select CRYPTO_MD5
- select CRYPTO_CBC
- select CRYPTO_SHA1
- select CRYPTO_DES
- select CRYPTO_ECHAINIV
- ---help---
- Support for IPsec ESP.
+ select XFRM_ESP
+ help
+ Support for IPsec ESP (Encapsulating Security Payload).
+
+ ESP can be used with various encryption and authentication algorithms.
+ Besides enabling ESP support itself, this option enables the generic
+ implementations of the algorithms that RFC 8221 lists as MUST be
+ implemented. If you need any other algorithms, you'll need to enable
+ them in the crypto API. You should also enable accelerated
+ implementations of any needed algorithms when available.
If unsure, say Y.
@@ -369,7 +372,7 @@ config INET_ESP_OFFLOAD
depends on INET_ESP
select XFRM_OFFLOAD
default n
- ---help---
+ help
Support for ESP transformation offload. This makes sense
only if this system really does IPsec and want to do it
with high throughput. A typical desktop system does not
@@ -377,16 +380,38 @@ config INET_ESP_OFFLOAD
If unsure, say N.
+config INET_ESPINTCP
+ bool "IP: ESP in TCP encapsulation (RFC 8229)"
+ depends on XFRM && INET_ESP
+ select STREAM_PARSER
+ select NET_SOCK_MSG
+ select XFRM_ESPINTCP
+ help
+ Support for RFC 8229 encapsulation of ESP and IKE over
+ TCP/IPv4 sockets.
+
+ If unsure, say N.
+
config INET_IPCOMP
tristate "IP: IPComp transformation"
select INET_XFRM_TUNNEL
select XFRM_IPCOMP
- ---help---
+ help
Support for IP Payload Compression Protocol (IPComp) (RFC3173),
typically needed for IPsec.
If unsure, say Y.
+config INET_TABLE_PERTURB_ORDER
+ int "INET: Source port perturbation table size (as power of 2)" if EXPERT
+ default 16
+ help
+ Source port perturbation table size (as power of 2) for
+ RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm.
+
+ The default is almost always what you want.
+ Only change this if you know what you are doing.
+
config INET_XFRM_TUNNEL
tristate
select INET_TUNNEL
@@ -396,38 +421,11 @@ config INET_TUNNEL
tristate
default n
-config INET_XFRM_MODE_TRANSPORT
- tristate "IP: IPsec transport mode"
- default y
- select XFRM
- ---help---
- Support for IPsec transport mode.
-
- If unsure, say Y.
-
-config INET_XFRM_MODE_TUNNEL
- tristate "IP: IPsec tunnel mode"
- default y
- select XFRM
- ---help---
- Support for IPsec tunnel mode.
-
- If unsure, say Y.
-
-config INET_XFRM_MODE_BEET
- tristate "IP: IPsec BEET mode"
- default y
- select XFRM
- ---help---
- Support for IPsec BEET mode.
-
- If unsure, say Y.
-
config INET_DIAG
tristate "INET: socket monitoring interface"
default y
- ---help---
- Support for INET (TCP, DCCP, etc) socket monitoring interface used by
+ help
+ Support for INET (TCP, UDP, etc) socket monitoring interface used by
native Linux tools such as ss. ss is included in iproute2, currently
downloadable at:
@@ -443,7 +441,7 @@ config INET_UDP_DIAG
tristate "UDP: socket monitoring interface"
depends on INET_DIAG && (IPV6 || IPV6=n)
default n
- ---help---
+ help
Support for UDP socket monitoring interface used by the ss tool.
If unsure, say Y.
@@ -451,7 +449,7 @@ config INET_RAW_DIAG
tristate "RAW: socket monitoring interface"
depends on INET_DIAG && (IPV6 || IPV6=n)
default n
- ---help---
+ help
Support for RAW socket monitoring interface used by the ss tool.
If unsure, say Y.
@@ -459,7 +457,7 @@ config INET_DIAG_DESTROY
bool "INET: allow privileged process to administratively close sockets"
depends on INET_DIAG
default n
- ---help---
+ help
Provides a SOCK_DESTROY operation that allows privileged processes
(e.g., a connection manager or a network administration tool such as
ss) to close sockets opened by other processes. Closing a socket in
@@ -470,7 +468,7 @@ config INET_DIAG_DESTROY
menuconfig TCP_CONG_ADVANCED
bool "TCP: advanced congestion control"
- ---help---
+ help
Support for selection of various TCP congestion control
modules.
@@ -484,201 +482,202 @@ if TCP_CONG_ADVANCED
config TCP_CONG_BIC
tristate "Binary Increase Congestion (BIC) control"
default m
- ---help---
- BIC-TCP is a sender-side only change that ensures a linear RTT
- fairness under large windows while offering both scalability and
- bounded TCP-friendliness. The protocol combines two schemes
- called additive increase and binary search increase. When the
- congestion window is large, additive increase with a large
- increment ensures linear RTT fairness as well as good
- scalability. Under small congestion windows, binary search
- increase provides TCP friendliness.
- See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
+ help
+ BIC-TCP is a sender-side only change that ensures a linear RTT
+ fairness under large windows while offering both scalability and
+ bounded TCP-friendliness. The protocol combines two schemes
+ called additive increase and binary search increase. When the
+ congestion window is large, additive increase with a large
+ increment ensures linear RTT fairness as well as good
+ scalability. Under small congestion windows, binary search
+ increase provides TCP friendliness.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/
config TCP_CONG_CUBIC
tristate "CUBIC TCP"
default y
- ---help---
- This is version 2.0 of BIC-TCP which uses a cubic growth function
- among other techniques.
- See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
+ help
+ This is version 2.0 of BIC-TCP which uses a cubic growth function
+ among other techniques.
+ See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf
config TCP_CONG_WESTWOOD
tristate "TCP Westwood+"
default m
- ---help---
- TCP Westwood+ is a sender-side only modification of the TCP Reno
- protocol stack that optimizes the performance of TCP congestion
- control. It is based on end-to-end bandwidth estimation to set
- congestion window and slow start threshold after a congestion
- episode. Using this estimation, TCP Westwood+ adaptively sets a
- slow start threshold and a congestion window which takes into
- account the bandwidth used at the time congestion is experienced.
- TCP Westwood+ significantly increases fairness wrt TCP Reno in
- wired networks and throughput over wireless links.
+ help
+ TCP Westwood+ is a sender-side only modification of the TCP Reno
+ protocol stack that optimizes the performance of TCP congestion
+ control. It is based on end-to-end bandwidth estimation to set
+ congestion window and slow start threshold after a congestion
+ episode. Using this estimation, TCP Westwood+ adaptively sets a
+ slow start threshold and a congestion window which takes into
+ account the bandwidth used at the time congestion is experienced.
+ TCP Westwood+ significantly increases fairness wrt TCP Reno in
+ wired networks and throughput over wireless links.
config TCP_CONG_HTCP
- tristate "H-TCP"
- default m
- ---help---
- H-TCP is a send-side only modifications of the TCP Reno
- protocol stack that optimizes the performance of TCP
- congestion control for high speed network links. It uses a
- modeswitch to change the alpha and beta parameters of TCP Reno
- based on network conditions and in a way so as to be fair with
- other Reno and H-TCP flows.
+ tristate "H-TCP"
+ default m
+ help
+ H-TCP is a sender-side only modification of the TCP Reno
+ protocol stack that optimizes the performance of TCP
+ congestion control for high speed network links. It uses a
+ modeswitch to change the alpha and beta parameters of TCP Reno
+ based on network conditions and in a way so as to be fair with
+ other Reno and H-TCP flows.
config TCP_CONG_HSTCP
tristate "High Speed TCP"
default n
- ---help---
- Sally Floyd's High Speed TCP (RFC 3649) congestion control.
- A modification to TCP's congestion control mechanism for use
- with large congestion windows. A table indicates how much to
- increase the congestion window by when an ACK is received.
- For more detail see http://www.icir.org/floyd/hstcp.html
+ help
+ Sally Floyd's High Speed TCP (RFC 3649) congestion control.
+ A modification to TCP's congestion control mechanism for use
+ with large congestion windows. A table indicates how much to
+ increase the congestion window by when an ACK is received.
+ For more detail see https://www.icir.org/floyd/hstcp.html
config TCP_CONG_HYBLA
tristate "TCP-Hybla congestion control algorithm"
default n
- ---help---
- TCP-Hybla is a sender-side only change that eliminates penalization of
- long-RTT, large-bandwidth connections, like when satellite legs are
- involved, especially when sharing a common bottleneck with normal
- terrestrial connections.
+ help
+ TCP-Hybla is a sender-side only change that eliminates penalization of
+ long-RTT, large-bandwidth connections, like when satellite legs are
+ involved, especially when sharing a common bottleneck with normal
+ terrestrial connections.
config TCP_CONG_VEGAS
tristate "TCP Vegas"
default n
- ---help---
- TCP Vegas is a sender-side only change to TCP that anticipates
- the onset of congestion by estimating the bandwidth. TCP Vegas
- adjusts the sending rate by modifying the congestion
- window. TCP Vegas should provide less packet loss, but it is
- not as aggressive as TCP Reno.
+ help
+ TCP Vegas is a sender-side only change to TCP that anticipates
+ the onset of congestion by estimating the bandwidth. TCP Vegas
+ adjusts the sending rate by modifying the congestion
+ window. TCP Vegas should provide less packet loss, but it is
+ not as aggressive as TCP Reno.
config TCP_CONG_NV
- tristate "TCP NV"
- default n
- ---help---
- TCP NV is a follow up to TCP Vegas. It has been modified to deal with
- 10G networks, measurement noise introduced by LRO, GRO and interrupt
- coalescence. In addition, it will decrease its cwnd multiplicatively
- instead of linearly.
+ tristate "TCP NV"
+ default n
+ help
+ TCP NV is a follow up to TCP Vegas. It has been modified to deal with
+ 10G networks, measurement noise introduced by LRO, GRO and interrupt
+ coalescence. In addition, it will decrease its cwnd multiplicatively
+ instead of linearly.
- Note that in general congestion avoidance (cwnd decreased when # packets
- queued grows) cannot coexist with congestion control (cwnd decreased only
- when there is packet loss) due to fairness issues. One scenario when they
- can coexist safely is when the CA flows have RTTs << CC flows RTTs.
+ Note that in general congestion avoidance (cwnd decreased when # packets
+ queued grows) cannot coexist with congestion control (cwnd decreased only
+ when there is packet loss) due to fairness issues. One scenario when they
+ can coexist safely is when the CA flows have RTTs << CC flows RTTs.
- For further details see http://www.brakmo.org/networking/tcp-nv/
+ For further details see http://www.brakmo.org/networking/tcp-nv/
config TCP_CONG_SCALABLE
tristate "Scalable TCP"
default n
- ---help---
- Scalable TCP is a sender-side only change to TCP which uses a
- MIMD congestion control algorithm which has some nice scaling
- properties, though is known to have fairness issues.
- See http://www.deneholme.net/tom/scalable/
+ help
+ Scalable TCP is a sender-side only change to TCP which uses a
+ MIMD congestion control algorithm which has some nice scaling
+ properties, though is known to have fairness issues.
+ See http://www.deneholme.net/tom/scalable/
config TCP_CONG_LP
tristate "TCP Low Priority"
default n
- ---help---
- TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
- to utilize only the excess network bandwidth as compared to the
- ``fair share`` of bandwidth as targeted by TCP.
- See http://www-ece.rice.edu/networks/TCP-LP/
+ help
+ TCP Low Priority (TCP-LP), a distributed algorithm whose goal is
+ to utilize only the excess network bandwidth as compared to the
+ ``fair share`` of bandwidth as targeted by TCP.
+ See http://www-ece.rice.edu/networks/TCP-LP/
config TCP_CONG_VENO
tristate "TCP Veno"
default n
- ---help---
- TCP Veno is a sender-side only enhancement of TCP to obtain better
- throughput over wireless networks. TCP Veno makes use of state
- distinguishing to circumvent the difficult judgment of the packet loss
- type. TCP Veno cuts down less congestion window in response to random
- loss packets.
- See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
+ help
+ TCP Veno is a sender-side only enhancement of TCP to obtain better
+ throughput over wireless networks. TCP Veno makes use of state
+ distinguishing to circumvent the difficult judgment of the packet loss
+ type. TCP Veno cuts down less congestion window in response to random
+ loss packets.
+ See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
config TCP_CONG_YEAH
tristate "YeAH TCP"
select TCP_CONG_VEGAS
default n
- ---help---
- YeAH-TCP is a sender-side high-speed enabled TCP congestion control
- algorithm, which uses a mixed loss/delay approach to compute the
- congestion window. It's design goals target high efficiency,
- internal, RTT and Reno fairness, resilience to link loss while
- keeping network elements load as low as possible.
+ help
+ YeAH-TCP is a sender-side high-speed enabled TCP congestion control
+ algorithm, which uses a mixed loss/delay approach to compute the
+ congestion window. Its design goals target high efficiency,
+ internal, RTT and Reno fairness, resilience to link loss while
+ keeping network elements load as low as possible.
- For further details look here:
- http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
+ For further details look here:
+ http://wil.cs.caltech.edu/pfldnet2007/paper/YeAH_TCP.pdf
config TCP_CONG_ILLINOIS
tristate "TCP Illinois"
default n
- ---help---
- TCP-Illinois is a sender-side modification of TCP Reno for
- high speed long delay links. It uses round-trip-time to
- adjust the alpha and beta parameters to achieve a higher average
- throughput and maintain fairness.
+ help
+ TCP-Illinois is a sender-side modification of TCP Reno for
+ high speed long delay links. It uses round-trip-time to
+ adjust the alpha and beta parameters to achieve a higher average
+ throughput and maintain fairness.
- For further details see:
- http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
+ For further details see:
+ http://www.ews.uiuc.edu/~shaoliu/tcpillinois/index.html
config TCP_CONG_DCTCP
tristate "DataCenter TCP (DCTCP)"
default n
- ---help---
- DCTCP leverages Explicit Congestion Notification (ECN) in the network to
- provide multi-bit feedback to the end hosts. It is designed to provide:
+ help
+ DCTCP leverages Explicit Congestion Notification (ECN) in the network to
+ provide multi-bit feedback to the end hosts. It is designed to provide:
- - High burst tolerance (incast due to partition/aggregate),
- - Low latency (short flows, queries),
- - High throughput (continuous data updates, large file transfers) with
- commodity, shallow-buffered switches.
+ - High burst tolerance (incast due to partition/aggregate),
+ - Low latency (short flows, queries),
+ - High throughput (continuous data updates, large file transfers) with
+ commodity, shallow-buffered switches.
- All switches in the data center network running DCTCP must support
- ECN marking and be configured for marking when reaching defined switch
- buffer thresholds. The default ECN marking threshold heuristic for
- DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
- (~100KB) at 10Gbps, but might need further careful tweaking.
+ All switches in the data center network running DCTCP must support
+ ECN marking and be configured for marking when reaching defined switch
+ buffer thresholds. The default ECN marking threshold heuristic for
+ DCTCP on switches is 20 packets (30KB) at 1Gbps, and 65 packets
+ (~100KB) at 10Gbps, but might need further careful tweaking.
- For further details see:
- http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
+ For further details see:
+ http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
config TCP_CONG_CDG
tristate "CAIA Delay-Gradient (CDG)"
default n
- ---help---
- CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
- the TCP sender in order to:
+ help
+ CAIA Delay-Gradient (CDG) is a TCP congestion control that modifies
+ the TCP sender in order to:
o Use the delay gradient as a congestion signal.
o Back off with an average probability that is independent of the RTT.
o Coexist with flows that use loss-based congestion control.
o Tolerate packet loss unrelated to congestion.
- For further details see:
- D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
- delay gradients." In Networking 2011. Preprint: http://goo.gl/No3vdg
+ For further details see:
+ D.A. Hayes and G. Armitage. "Revisiting TCP congestion control using
+ delay gradients." In Networking 2011. Preprint:
+ http://caia.swin.edu.au/cv/dahayes/content/networking2011-cdg-preprint.pdf
config TCP_CONG_BBR
tristate "BBR TCP"
default n
- ---help---
-
- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
- maximize network utilization and minimize queues. It builds an explicit
- model of the the bottleneck delivery rate and path round-trip
- propagation delay. It tolerates packet loss and delay unrelated to
- congestion. It can operate over LAN, WAN, cellular, wifi, or cable
- modem links. It can coexist with flows that use loss-based congestion
- control, and can operate with shallow buffers, deep buffers,
- bufferbloat, policers, or AQM schemes that do not provide a delay
- signal. It requires the fq ("Fair Queue") pacing packet scheduler.
+ help
+
+ BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to
+ maximize network utilization and minimize queues. It builds an explicit
+ model of the bottleneck delivery rate and path round-trip propagation
+ delay. It tolerates packet loss and delay unrelated to congestion. It
+ can operate over LAN, WAN, cellular, wifi, or cable modem links. It can
+ coexist with flows that use loss-based congestion control, and can
+ operate with shallow buffers, deep buffers, bufferbloat, policers, or
+ AQM schemes that do not provide a delay signal. It requires the fq
+ ("Fair Queue") pacing packet scheduler.
choice
prompt "Default TCP congestion control"
@@ -743,11 +742,26 @@ config DEFAULT_TCP_CONG
default "bbr" if DEFAULT_BBR
default "cubic"
+config TCP_SIGPOOL
+ tristate
+
+config TCP_AO
+ bool "TCP: Authentication Option (RFC5925)"
+ select CRYPTO
+ select TCP_SIGPOOL
+ depends on 64BIT && IPV6 != m # seq-number extension needs WRITE_ONCE(u64)
+ help
+ TCP-AO specifies the use of stronger Message Authentication Codes (MACs),
+ protects against replays for long-lived TCP connections, and
+ provides more details on the association of security with TCP
+ connections than TCP MD5 (See RFC5925)
+
+ If unsure, say N.
+
config TCP_MD5SIG
bool "TCP: MD5 Signature Option support (RFC2385)"
- select CRYPTO
- select CRYPTO_MD5
- ---help---
+ select CRYPTO_LIB_MD5
+ help
RFC2385 specifies a method of giving MD5 protection to TCP sessions.
Its main (only?) use is to protect BGP sessions between core routers
on the Internet.
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 7446b98661d8..ec36d2ec059e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -10,13 +10,11 @@ obj-y := route.o inetpeer.o protocol.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
tcp_rate.o tcp_recovery.o tcp_ulp.o \
- tcp_offload.o datagram.o raw.o udp.o udplite.o \
+ tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
- metrics.o netlink.o
-
-obj-$(CONFIG_BPFILTER) += bpfilter/
+ metrics.o netlink.o nexthop.o udp_tunnel_stub.o
obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
@@ -26,9 +24,11 @@ obj-$(CONFIG_IP_MROUTE) += ipmr.o
obj-$(CONFIG_IP_MROUTE_COMMON) += ipmr_base.o
obj-$(CONFIG_NET_IPIP) += ipip.o
gre-y := gre_demux.o
+fou-y := fou_core.o fou_nl.o fou_bpf.o
obj-$(CONFIG_NET_FOU) += fou.o
obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+udp_tunnel-y := udp_tunnel_core.o udp_tunnel_nic.o
obj-$(CONFIG_NET_UDP_TUNNEL) += udp_tunnel.o
obj-$(CONFIG_NET_IPVTI) += ip_vti.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
@@ -37,10 +37,7 @@ obj-$(CONFIG_INET_ESP) += esp4.o
obj-$(CONFIG_INET_ESP_OFFLOAD) += esp4_offload.o
obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
-obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
-obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
-obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
obj-$(CONFIG_IP_PNP) += ipconfig.o
obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
obj-$(CONFIG_INET_DIAG) += inet_diag.o
@@ -63,7 +60,15 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
+obj-$(CONFIG_TCP_SIGPOOL) += tcp_sigpool.o
+obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
+obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
xfrm4_output.o xfrm4_protocol.o
+obj-$(CONFIG_TCP_AO) += tcp_ao.o
+
+ifeq ($(CONFIG_BPF_JIT),y)
+obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o
+endif
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 1fbe2f815474..08d811f11896 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -58,11 +59,6 @@
* Some other random speedups.
* Cyrus Durgin : Cleaned up file for kmod hacks.
* Andi Kleen : Fix inet_stream_connect TCP race.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "IPv4: " fmt
@@ -103,7 +99,10 @@
#include <net/route.h>
#include <net/ip_fib.h>
#include <net/inet_connection_sock.h>
+#include <net/gro.h>
+#include <net/gso.h>
#include <net/tcp.h>
+#include <net/psp.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <net/ping.h>
@@ -120,6 +119,8 @@
#include <linux/mroute.h>
#endif
#include <net/l3mdev.h>
+#include <net/compat.h>
+#include <net/rps.h>
#include <trace/events/sock.h>
@@ -138,7 +139,7 @@ void inet_sock_destruct(struct sock *sk)
__skb_queue_purge(&sk->sk_receive_queue);
__skb_queue_purge(&sk->sk_error_queue);
- sk_mem_reclaim(sk);
+ sk_mem_reclaim_final(sk);
if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
pr_err("Attempt to release TCP socket in state %d %p\n",
@@ -150,15 +151,15 @@ void inet_sock_destruct(struct sock *sk)
return;
}
- WARN_ON(atomic_read(&sk->sk_rmem_alloc));
- WARN_ON(refcount_read(&sk->sk_wmem_alloc));
- WARN_ON(sk->sk_wmem_queued);
- WARN_ON(sk->sk_forward_alloc);
+ WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc));
+ WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
+ WARN_ON_ONCE(sk->sk_wmem_queued);
+ WARN_ON_ONCE(sk->sk_forward_alloc);
kfree(rcu_dereference_protected(inet->inet_opt, 1));
- dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
- dst_release(sk->sk_rx_dst);
- sk_refcnt_debug_dec(sk);
+ dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
+ dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1));
+ psp_sk_assoc_free(sk);
}
EXPORT_SYMBOL(inet_sock_destruct);
@@ -189,25 +190,15 @@ static int inet_autobind(struct sock *sk)
return 0;
}
-/*
- * Move a socket into listening state.
- */
-int inet_listen(struct socket *sock, int backlog)
+int __inet_listen_sk(struct sock *sk, int backlog)
{
- struct sock *sk = sock->sk;
- unsigned char old_state;
+ unsigned char old_state = sk->sk_state;
int err, tcp_fastopen;
- lock_sock(sk);
-
- err = -EINVAL;
- if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
- goto out;
-
- old_state = sk->sk_state;
if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
- goto out;
+ return -EINVAL;
+ WRITE_ONCE(sk->sk_max_ack_backlog, backlog);
/* Really, if the socket is already in listen state
* we can only allow the backlog to be adjusted.
*/
@@ -218,7 +209,7 @@ int inet_listen(struct socket *sock, int backlog)
* because the socket was in TCP_LISTEN state previously but
* was shutdown() rather than close().
*/
- tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen;
+ tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen);
if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) &&
(tcp_fastopen & TFO_SERVER_ENABLE) &&
!inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) {
@@ -226,13 +217,29 @@ int inet_listen(struct socket *sock, int backlog)
tcp_fastopen_init_key_once(sock_net(sk));
}
- err = inet_csk_listen_start(sk, backlog);
+ err = inet_csk_listen_start(sk);
if (err)
- goto out;
+ return err;
+
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
}
- sk->sk_max_ack_backlog = backlog;
- err = 0;
+ return 0;
+}
+
+/*
+ * Move a socket into listening state.
+ */
+int inet_listen(struct socket *sock, int backlog)
+{
+ struct sock *sk = sock->sk;
+ int err = -EINVAL;
+
+ lock_sock(sk);
+
+ if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
+ goto out;
+
+ err = __inet_listen_sk(sk, backlog);
out:
release_sock(sk);
@@ -317,7 +324,7 @@ lookup_protocol:
WARN_ON(!answer_prot->slab);
- err = -ENOBUFS;
+ err = -ENOMEM;
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
if (!sk)
goto out;
@@ -326,40 +333,42 @@ lookup_protocol:
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = SK_CAN_REUSE;
+ if (INET_PROTOSW_ICSK & answer_flags)
+ inet_init_csk_locks(sk);
+
inet = inet_sk(sk);
- inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
+ inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags);
- inet->nodefrag = 0;
+ inet_clear_bit(NODEFRAG, sk);
if (SOCK_RAW == sock->type) {
inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
- inet->hdrincl = 1;
+ inet_set_bit(HDRINCL, sk);
}
- if (net->ipv4.sysctl_ip_no_pmtu_disc)
+ if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc))
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
- inet->inet_id = 0;
+ atomic_set(&inet->inet_id, 0);
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+ sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash);
inet->uc_ttl = -1;
- inet->mc_loop = 1;
+ inet_set_bit(MC_LOOP, sk);
inet->mc_ttl = 1;
- inet->mc_all = 1;
+ inet_set_bit(MC_ALL, sk);
inet->mc_index = 0;
inet->mc_list = NULL;
inet->rcv_tos = 0;
- sk_refcnt_debug_inc(sk);
-
if (inet->inet_num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
@@ -369,32 +378,30 @@ lookup_protocol:
inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
err = sk->sk_prot->hash(sk);
- if (err) {
- sk_common_release(sk);
- goto out;
- }
+ if (err)
+ goto out_sk_release;
}
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
- if (err) {
- sk_common_release(sk);
- goto out;
- }
+ if (err)
+ goto out_sk_release;
}
if (!kern) {
err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
- if (err) {
- sk_common_release(sk);
- goto out;
- }
+ if (err)
+ goto out_sk_release;
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
+out_sk_release:
+ sk_common_release(sk);
+ sock->sk = NULL;
+ goto out;
}
@@ -410,6 +417,9 @@ int inet_release(struct socket *sock)
if (sk) {
long timeout;
+ if (!sk->sk_kern_sock)
+ BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk);
+
/* Applications forget to leave groups before exiting */
ip_mc_drop_socket(sk);
@@ -424,16 +434,16 @@ int inet_release(struct socket *sock)
if (sock_flag(sk, SOCK_LINGER) &&
!(current->flags & PF_EXITING))
timeout = sk->sk_lingertime;
- sock->sk = NULL;
sk->sk_prot->close(sk, timeout);
+ sock->sk = NULL;
}
return 0;
}
EXPORT_SYMBOL(inet_release);
-int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+int inet_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
- struct sock *sk = sock->sk;
+ u32 flags = BIND_WITH_LOCK;
int err;
/* If the socket has its own bind function then use it. (RAW) */
@@ -446,16 +456,22 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
/* BPF prog is run before any checks are done so that if the prog
* changes context in a wrong way it will be caught.
*/
- err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr);
+ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
+ CGROUP_INET4_BIND, &flags);
if (err)
return err;
- return __inet_bind(sk, uaddr, addr_len, false, true);
+ return __inet_bind(sk, uaddr, addr_len, flags);
+}
+
+int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
+{
+ return inet_bind_sk(sock->sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet_bind);
-int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
- bool force_bind_address_no_port, bool with_lock)
+int __inet_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len,
+ u32 flags)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct inet_sock *inet = inet_sk(sk);
@@ -486,16 +502,14 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
* is temporarily down)
*/
err = -EADDRNOTAVAIL;
- if (!inet_can_nonlocal_bind(net, inet) &&
- addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
- chk_addr_ret != RTN_LOCAL &&
- chk_addr_ret != RTN_MULTICAST &&
- chk_addr_ret != RTN_BROADCAST)
+ if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
+ chk_addr_ret))
goto out;
snum = ntohs(addr->sin_port);
err = -EACCES;
- if (snum && snum < inet_prot_sock(net) &&
+ if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
+ snum && inet_port_requires_bind_service(net, snum) &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
goto out;
@@ -506,7 +520,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
* would be illegal to use them (multicast/broadcast) in
* which case the sending device address is used.
*/
- if (with_lock)
+ if (flags & BIND_WITH_LOCK)
lock_sock(sk);
/* Check these errors (active socket, double bind). */
@@ -519,18 +533,22 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
inet->inet_saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */
- if (snum || !(inet->bind_address_no_port ||
- force_bind_address_no_port)) {
- if (sk->sk_prot->get_port(sk, snum)) {
- inet->inet_saddr = inet->inet_rcv_saddr = 0;
- err = -EADDRINUSE;
- goto out_release_sock;
- }
- err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
+ if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) ||
+ (flags & BIND_FORCE_ADDRESS_NO_PORT))) {
+ err = sk->sk_prot->get_port(sk, snum);
if (err) {
inet->inet_saddr = inet->inet_rcv_saddr = 0;
goto out_release_sock;
}
+ if (!(flags & BIND_FROM_BPF)) {
+ err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
+ if (err) {
+ inet->inet_saddr = inet->inet_rcv_saddr = 0;
+ if (sk->sk_prot->put_port)
+ sk->sk_prot->put_port(sk);
+ goto out_release_sock;
+ }
+ }
}
if (inet->inet_rcv_saddr)
@@ -543,32 +561,37 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
sk_dst_reset(sk);
err = 0;
out_release_sock:
- if (with_lock)
+ if (flags & BIND_WITH_LOCK)
release_sock(sk);
out:
return err;
}
-int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr,
+int inet_dgram_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
+ const struct proto *prot;
int err;
if (addr_len < sizeof(uaddr->sa_family))
return -EINVAL;
+
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ prot = READ_ONCE(sk->sk_prot);
+
if (uaddr->sa_family == AF_UNSPEC)
- return sk->sk_prot->disconnect(sk, flags);
+ return prot->disconnect(sk, flags);
if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) {
- err = sk->sk_prot->pre_connect(sk, uaddr, addr_len);
+ err = prot->pre_connect(sk, uaddr, addr_len);
if (err)
return err;
}
- if (!inet_sk(sk)->inet_num && inet_autobind(sk))
+ if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk))
return -EAGAIN;
- return sk->sk_prot->connect(sk, uaddr, addr_len);
+ return prot->connect(sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet_dgram_connect);
@@ -600,7 +623,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias)
* Connect to a remote host. There is regrettably still a little
* TCP 'magic' in here.
*/
-int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+int __inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags, int is_sendmsg)
{
struct sock *sk = sock->sk;
@@ -621,6 +644,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
return -EINVAL;
if (uaddr->sa_family == AF_UNSPEC) {
+ sk->sk_disconnects++;
err = sk->sk_prot->disconnect(sk, flags);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
goto out;
@@ -635,7 +659,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
err = -EISCONN;
goto out;
case SS_CONNECTING:
- if (inet_sk(sk)->defer_connect)
+ if (inet_test_bit(DEFER_CONNECT, sk))
err = is_sendmsg ? -EINPROGRESS : -EISCONN;
else
err = -EALREADY;
@@ -658,7 +682,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
sock->state = SS_CONNECTING;
- if (!err && inet_sk(sk)->defer_connect)
+ if (!err && inet_test_bit(DEFER_CONNECT, sk))
goto out;
/* Just entered SS_CONNECTING state; the only
@@ -675,6 +699,7 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int writebias = (sk->sk_protocol == IPPROTO_TCP) &&
tcp_sk(sk)->fastopen_req &&
tcp_sk(sk)->fastopen_req->data ? 1 : 0;
+ int dis = sk->sk_disconnects;
/* Error code is set above */
if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
@@ -683,6 +708,11 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
err = sock_intr_errno(timeo);
if (signal_pending(current))
goto out;
+
+ if (dis != sk->sk_disconnects) {
+ err = -EPIPE;
+ goto out;
+ }
}
/* Connection was closed by RST, timeout, ICMP error
@@ -704,13 +734,14 @@ out:
sock_error:
err = sock_error(sk) ? : -ECONNABORTED;
sock->state = SS_UNCONNECTED;
+ sk->sk_disconnects++;
if (sk->sk_prot->disconnect(sk, flags))
sock->state = SS_DISCONNECTING;
goto out;
}
EXPORT_SYMBOL(__inet_stream_connect);
-int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+int inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addr_len, int flags)
{
int err;
@@ -722,101 +753,131 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
}
EXPORT_SYMBOL(inet_stream_connect);
+void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk)
+{
+ if (mem_cgroup_sockets_enabled) {
+ mem_cgroup_sk_alloc(newsk);
+ __sk_charge(newsk, GFP_KERNEL);
+ }
+
+ sock_rps_record_flow(newsk);
+ WARN_ON(!((1 << newsk->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_SYN_RECV |
+ TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 |
+ TCPF_CLOSING | TCPF_CLOSE_WAIT |
+ TCPF_CLOSE)));
+
+ if (test_bit(SOCK_SUPPORT_ZC, &sock->flags))
+ set_bit(SOCK_SUPPORT_ZC, &newsock->flags);
+ sock_graft(newsk, newsock);
+
+ newsock->state = SS_CONNECTED;
+}
+EXPORT_SYMBOL_GPL(__inet_accept);
+
/*
* Accept a pending connection. The TCP layer now gives BSD semantics.
*/
-int inet_accept(struct socket *sock, struct socket *newsock, int flags,
- bool kern)
+int inet_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
- struct sock *sk1 = sock->sk;
- int err = -EINVAL;
- struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern);
+ struct sock *sk1 = sock->sk, *sk2;
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ arg->err = -EINVAL;
+ sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, arg);
if (!sk2)
- goto do_err;
+ return arg->err;
lock_sock(sk2);
-
- sock_rps_record_flow(sk2);
- WARN_ON(!((1 << sk2->sk_state) &
- (TCPF_ESTABLISHED | TCPF_SYN_RECV |
- TCPF_CLOSE_WAIT | TCPF_CLOSE)));
-
- sock_graft(sk2, newsock);
-
- newsock->state = SS_CONNECTED;
- err = 0;
+ __inet_accept(sock, newsock, sk2);
release_sock(sk2);
-do_err:
- return err;
+ return 0;
}
EXPORT_SYMBOL(inet_accept);
-
/*
* This does both peername and sockname.
*/
int inet_getname(struct socket *sock, struct sockaddr *uaddr,
- int peer)
+ int peer)
{
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
+ int sin_addr_len = sizeof(*sin);
sin->sin_family = AF_INET;
+ lock_sock(sk);
if (peer) {
if (!inet->inet_dport ||
(((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
- peer == 1))
+ peer == 1)) {
+ release_sock(sk);
return -ENOTCONN;
+ }
sin->sin_port = inet->inet_dport;
sin->sin_addr.s_addr = inet->inet_daddr;
+ BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len,
+ CGROUP_INET4_GETPEERNAME);
} else {
__be32 addr = inet->inet_rcv_saddr;
if (!addr)
addr = inet->inet_saddr;
sin->sin_port = inet->inet_sport;
sin->sin_addr.s_addr = addr;
+ BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len,
+ CGROUP_INET4_GETSOCKNAME);
}
+ release_sock(sk);
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
- return sizeof(*sin);
+ return sin_addr_len;
}
EXPORT_SYMBOL(inet_getname);
-int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+int inet_send_prepare(struct sock *sk)
{
- struct sock *sk = sock->sk;
-
sock_rps_record_flow(sk);
/* We may need to bind the socket. */
- if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
+ if (data_race(!inet_sk(sk)->inet_num) && !sk->sk_prot->no_autobind &&
inet_autobind(sk))
return -EAGAIN;
- return sk->sk_prot->sendmsg(sk, msg, size);
+ return 0;
}
-EXPORT_SYMBOL(inet_sendmsg);
+EXPORT_SYMBOL_GPL(inet_send_prepare);
-ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
- size_t size, int flags)
+int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
{
struct sock *sk = sock->sk;
- sock_rps_record_flow(sk);
-
- /* We may need to bind the socket. */
- if (!inet_sk(sk)->inet_num && !sk->sk_prot->no_autobind &&
- inet_autobind(sk))
+ if (unlikely(inet_send_prepare(sk)))
return -EAGAIN;
- if (sk->sk_prot->sendpage)
- return sk->sk_prot->sendpage(sk, page, offset, size, flags);
- return sock_no_sendpage(sock, page, offset, size, flags);
+ return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udp_sendmsg,
+ sk, msg, size);
+}
+EXPORT_SYMBOL(inet_sendmsg);
+
+void inet_splice_eof(struct socket *sock)
+{
+ const struct proto *prot;
+ struct sock *sk = sock->sk;
+
+ if (unlikely(inet_send_prepare(sk)))
+ return;
+
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ prot = READ_ONCE(sk->sk_prot);
+ if (prot->splice_eof)
+ prot->splice_eof(sock);
}
-EXPORT_SYMBOL(inet_sendpage);
+EXPORT_SYMBOL_GPL(inet_splice_eof);
+INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *,
+ size_t, int, int *));
int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int flags)
{
@@ -827,8 +888,8 @@ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
if (likely(!(flags & MSG_ERRQUEUE)))
sock_rps_record_flow(sk);
- err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
- flags & ~MSG_DONTWAIT, &addr_len);
+ err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg,
+ sk, msg, size, flags, &addr_len);
if (err >= 0)
msg->msg_namelen = addr_len;
return err;
@@ -863,9 +924,9 @@ int inet_shutdown(struct socket *sock, int how)
err = -ENOTCONN;
/* Hack to wake up other listeners, who can poll for
EPOLLHUP, even on eg. unconnected UDP sockets -- RR */
- /* fall through */
+ fallthrough;
default:
- sk->sk_shutdown |= how;
+ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | how);
if (sk->sk_prot->shutdown)
sk->sk_prot->shutdown(sk, how);
break;
@@ -877,7 +938,7 @@ int inet_shutdown(struct socket *sock, int how)
case TCP_LISTEN:
if (!(how & RCV_SHUTDOWN))
break;
- /* fall through */
+ fallthrough;
case TCP_SYN_SENT:
err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
@@ -911,12 +972,6 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
struct rtentry rt;
switch (cmd) {
- case SIOCGSTAMP:
- err = sock_get_timestamp(sk, (struct timeval __user *)arg);
- break;
- case SIOCGSTAMPNS:
- err = sock_get_timestampns(sk, (struct timespec __user *)arg);
- break;
case SIOCADDRT:
case SIOCDELRT:
if (copy_from_user(&rt, p, sizeof(struct rtentry)))
@@ -936,10 +991,10 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCGIFNETMASK:
case SIOCGIFDSTADDR:
case SIOCGIFPFLAGS:
- if (copy_from_user(&ifr, p, sizeof(struct ifreq)))
+ if (get_user_ifreq(&ifr, NULL, p))
return -EFAULT;
err = devinet_ioctl(net, cmd, &ifr);
- if (!err && copy_to_user(p, &ifr, sizeof(struct ifreq)))
+ if (!err && put_user_ifreq(&ifr, p))
err = -EFAULT;
break;
@@ -949,13 +1004,13 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
case SIOCSIFDSTADDR:
case SIOCSIFPFLAGS:
case SIOCSIFFLAGS:
- if (copy_from_user(&ifr, p, sizeof(struct ifreq)))
+ if (get_user_ifreq(&ifr, NULL, p))
return -EFAULT;
err = devinet_ioctl(net, cmd, &ifr);
break;
default:
if (sk->sk_prot->ioctl)
- err = sk->sk_prot->ioctl(sk, cmd, arg);
+ err = sk_ioctl(sk, cmd, (void __user *)arg);
else
err = -ENOIOCTLCMD;
break;
@@ -965,17 +1020,42 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
EXPORT_SYMBOL(inet_ioctl);
#ifdef CONFIG_COMPAT
+static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd,
+ struct compat_rtentry __user *ur)
+{
+ compat_uptr_t rtdev;
+ struct rtentry rt;
+
+ if (copy_from_user(&rt.rt_dst, &ur->rt_dst,
+ 3 * sizeof(struct sockaddr)) ||
+ get_user(rt.rt_flags, &ur->rt_flags) ||
+ get_user(rt.rt_metric, &ur->rt_metric) ||
+ get_user(rt.rt_mtu, &ur->rt_mtu) ||
+ get_user(rt.rt_window, &ur->rt_window) ||
+ get_user(rt.rt_irtt, &ur->rt_irtt) ||
+ get_user(rtdev, &ur->rt_dev))
+ return -EFAULT;
+
+ rt.rt_dev = compat_ptr(rtdev);
+ return ip_rt_ioctl(sock_net(sk), cmd, &rt);
+}
+
static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
+ void __user *argp = compat_ptr(arg);
struct sock *sk = sock->sk;
- int err = -ENOIOCTLCMD;
-
- if (sk->sk_prot->compat_ioctl)
- err = sk->sk_prot->compat_ioctl(sk, cmd, arg);
- return err;
+ switch (cmd) {
+ case SIOCADDRT:
+ case SIOCDELRT:
+ return inet_compat_routing_ioctl(sk, cmd, argp);
+ default:
+ if (!sk->sk_prot->compat_ioctl)
+ return -ENOIOCTLCMD;
+ return sk->sk_prot->compat_ioctl(sk, cmd, arg);
+ }
}
-#endif
+#endif /* CONFIG_COMPAT */
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
@@ -988,6 +1068,7 @@ const struct proto_ops inet_stream_ops = {
.getname = inet_getname,
.poll = tcp_poll,
.ioctl = inet_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = inet_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
@@ -997,15 +1078,14 @@ const struct proto_ops inet_stream_ops = {
#ifdef CONFIG_MMU
.mmap = tcp_mmap,
#endif
- .sendpage = inet_sendpage,
+ .splice_eof = inet_splice_eof,
.splice_read = tcp_splice_read,
+ .set_peek_off = sk_set_peek_off,
.read_sock = tcp_read_sock,
+ .read_skb = tcp_read_skb,
.sendmsg_locked = tcp_sendmsg_locked,
- .sendpage_locked = tcp_sendpage_locked,
.peek_len = tcp_peek_len,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
.compat_ioctl = inet_compat_ioctl,
#endif
.set_rcvlowat = tcp_set_rcvlowat,
@@ -1023,18 +1103,18 @@ const struct proto_ops inet_dgram_ops = {
.getname = inet_getname,
.poll = udp_poll,
.ioctl = inet_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
+ .read_skb = udp_read_skb,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = inet_sendpage,
- .set_peek_off = sk_set_peek_off,
+ .splice_eof = inet_splice_eof,
+ .set_peek_off = udp_set_peek_off,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
.compat_ioctl = inet_compat_ioctl,
#endif
};
@@ -1055,6 +1135,7 @@ static const struct proto_ops inet_sockraw_ops = {
.getname = inet_getname,
.poll = datagram_poll,
.ioctl = inet_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
@@ -1062,10 +1143,8 @@ static const struct proto_ops inet_sockraw_ops = {
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = inet_sendpage,
+ .splice_eof = inet_splice_eof,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
.compat_ioctl = inet_compat_ioctl,
#endif
};
@@ -1188,6 +1267,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
struct rtable *rt;
__be32 new_saddr;
struct ip_options_rcu *inet_opt;
+ int err;
inet_opt = rcu_dereference_protected(inet->inet_opt,
lockdep_sock_is_held(sk));
@@ -1196,26 +1276,32 @@ static int inet_sk_reselect_saddr(struct sock *sk)
/* Query new route. */
fl4 = &inet->cork.fl.u.ip4;
- rt = ip_route_connect(fl4, daddr, 0, RT_CONN_FLAGS(sk),
- sk->sk_bound_dev_if, sk->sk_protocol,
- inet->inet_sport, inet->inet_dport, sk);
+ rt = ip_route_connect(fl4, daddr, 0, sk->sk_bound_dev_if,
+ sk->sk_protocol, inet->inet_sport,
+ inet->inet_dport, sk);
if (IS_ERR(rt))
return PTR_ERR(rt);
- sk_setup_caps(sk, &rt->dst);
-
new_saddr = fl4->saddr;
- if (new_saddr == old_saddr)
+ if (new_saddr == old_saddr) {
+ sk_setup_caps(sk, &rt->dst);
return 0;
+ }
+
+ err = inet_bhash2_update_saddr(sk, &new_saddr, AF_INET);
+ if (err) {
+ ip_rt_put(rt);
+ return err;
+ }
+
+ sk_setup_caps(sk, &rt->dst);
- if (sock_net(sk)->ipv4.sysctl_ip_dynaddr > 1) {
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) {
pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
__func__, &old_saddr, &new_saddr);
}
- inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
-
/*
* XXX The only one ugly spot where we need to
* XXX really change the sockets identity after
@@ -1229,10 +1315,8 @@ static int inet_sk_reselect_saddr(struct sock *sk)
int inet_sk_rebuild_header(struct sock *sk)
{
+ struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0));
struct inet_sock *inet = inet_sk(sk);
- struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
- __be32 daddr;
- struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
int err;
@@ -1241,17 +1325,9 @@ int inet_sk_rebuild_header(struct sock *sk)
return 0;
/* Reroute. */
- rcu_read_lock();
- inet_opt = rcu_dereference(inet->inet_opt);
- daddr = inet->inet_daddr;
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
- rcu_read_unlock();
fl4 = &inet->cork.fl.u.ip4;
- rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
- inet->inet_dport, inet->inet_sport,
- sk->sk_protocol, RT_CONN_FLAGS(sk),
- sk->sk_bound_dev_if);
+ inet_sk_init_flowi4(inet, fl4);
+ rt = ip_route_output_flow(sock_net(sk), fl4, sk);
if (!IS_ERR(rt)) {
err = 0;
sk_setup_caps(sk, &rt->dst);
@@ -1260,15 +1336,12 @@ int inet_sk_rebuild_header(struct sock *sk)
/* Routing failed... */
sk->sk_route_caps = 0;
- /*
- * Other protocols have to map its equivalent state to TCP_SYN_SENT.
- * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
- */
- if (!sock_net(sk)->ipv4.sysctl_ip_dynaddr ||
+
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) ||
sk->sk_state != TCP_SYN_SENT ||
(sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
(err = inet_sk_reselect_saddr(sk)) != 0)
- sk->sk_err_soft = -err;
+ WRITE_ONCE(sk->sk_err_soft, -err);
}
return err;
@@ -1328,18 +1401,17 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
segs = ERR_PTR(-EPROTONOSUPPORT);
- if (!skb->encapsulation || encap) {
- udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
- fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
+ fixedid = !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCP_FIXEDID << encap));
- /* fixed ID is invalid if DF bit is not set */
- if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF)))
- goto out;
- }
+ if (!skb->encapsulation || encap)
+ udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
ops = rcu_dereference(inet_offloads[proto]);
- if (likely(ops && ops->callbacks.gso_segment))
+ if (likely(ops && ops->callbacks.gso_segment)) {
segs = ops->callbacks.gso_segment(skb, features);
+ if (!segs)
+ skb->network_header = skb_mac_header(skb) + nhoff - skb->head;
+ }
if (IS_ERR_OR_NULL(segs))
goto out;
@@ -1383,7 +1455,15 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
out:
return segs;
}
-EXPORT_SYMBOL(inet_gso_segment);
+
+static struct sk_buff *ipip_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4))
+ return ERR_PTR(-EINVAL);
+
+ return inet_gso_segment(skb, features);
+}
struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
{
@@ -1393,42 +1473,35 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
struct sk_buff *p;
unsigned int hlen;
unsigned int off;
- unsigned int id;
int flush = 1;
int proto;
off = skb_gro_offset(skb);
hlen = off + sizeof(*iph);
- iph = skb_gro_header_fast(skb, off);
- if (skb_gro_header_hard(skb, hlen)) {
- iph = skb_gro_header_slow(skb, hlen, off);
- if (unlikely(!iph))
- goto out;
- }
+ iph = skb_gro_header(skb, hlen, off);
+ if (unlikely(!iph))
+ goto out;
proto = iph->protocol;
- rcu_read_lock();
ops = rcu_dereference(inet_offloads[proto]);
if (!ops || !ops->callbacks.gro_receive)
- goto out_unlock;
+ goto out;
if (*(u8 *)iph != 0x45)
- goto out_unlock;
+ goto out;
if (ip_is_fragment(iph))
- goto out_unlock;
+ goto out;
if (unlikely(ip_fast_csum((u8 *)iph, 5)))
- goto out_unlock;
+ goto out;
- id = ntohl(*(__be32 *)&iph->id);
- flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
- id >>= 16;
+ NAPI_GRO_CB(skb)->proto = proto;
+ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (ntohl(*(__be32 *)&iph->id) & ~IP_DF));
list_for_each_entry(p, head, list) {
struct iphdr *iph2;
- u16 flush_id;
if (!NAPI_GRO_CB(p)->same_flow)
continue;
@@ -1445,48 +1518,10 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
-
- /* All fields must match except length and checksum. */
- NAPI_GRO_CB(p)->flush |=
- (iph->ttl ^ iph2->ttl) |
- (iph->tos ^ iph2->tos) |
- ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF));
-
- NAPI_GRO_CB(p)->flush |= flush;
-
- /* We need to store of the IP ID check to be included later
- * when we can verify that this packet does in fact belong
- * to a given flow.
- */
- flush_id = (u16)(id - ntohs(iph2->id));
-
- /* This bit of code makes it much easier for us to identify
- * the cases where we are doing atomic vs non-atomic IP ID
- * checks. Specifically an atomic check can return IP ID
- * values 0 - 0xFFFF, while a non-atomic check can only
- * return 0 or 0xFFFF.
- */
- if (!NAPI_GRO_CB(p)->is_atomic ||
- !(iph->frag_off & htons(IP_DF))) {
- flush_id ^= NAPI_GRO_CB(p)->count;
- flush_id = flush_id ? 0xFFFF : 0;
- }
-
- /* If the previous IP ID value was based on an atomic
- * datagram we can overwrite the value and ignore it.
- */
- if (NAPI_GRO_CB(skb)->is_atomic)
- NAPI_GRO_CB(p)->flush_id = flush_id;
- else
- NAPI_GRO_CB(p)->flush_id |= flush_id;
}
- NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF));
NAPI_GRO_CB(skb)->flush |= flush;
- skb_set_network_header(skb, off);
- /* The above will be needed by the transport layer if there is one
- * immediately following this IP hdr.
- */
+ NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off;
/* Note : No need to call skb_gro_postpull_rcsum() here,
* as we already checked checksum over ipv4 header was 0
@@ -1494,17 +1529,14 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
skb_gro_pull(skb, sizeof(*iph));
skb_set_transport_header(skb, skb_gro_offset(skb));
- pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
-
-out_unlock:
- rcu_read_unlock();
+ pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive,
+ ops->callbacks.gro_receive, head, skb);
out:
skb_gro_flush_final(skb, pp, flush);
return pp;
}
-EXPORT_SYMBOL(inet_gro_receive);
static struct sk_buff *ipip_gro_receive(struct list_head *head,
struct sk_buff *skb)
@@ -1547,20 +1579,23 @@ EXPORT_SYMBOL(inet_current_timestamp);
int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
{
- if (sk->sk_family == AF_INET)
+ unsigned int family = READ_ONCE(sk->sk_family);
+
+ if (family == AF_INET)
return ip_recv_error(sk, msg, len, addr_len);
#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6)
+ if (family == AF_INET6)
return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len);
#endif
return -EINVAL;
}
+EXPORT_SYMBOL(inet_recv_error);
int inet_gro_complete(struct sk_buff *skb, int nhoff)
{
- __be16 newlen = htons(skb->len - nhoff);
struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
const struct net_offload *ops;
+ __be16 totlen = iph->tot_len;
int proto = iph->protocol;
int err = -ENOSYS;
@@ -1569,26 +1604,24 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
skb_set_inner_network_header(skb, nhoff);
}
- csum_replace2(&iph->check, iph->tot_len, newlen);
- iph->tot_len = newlen;
+ iph_set_totlen(iph, skb->len - nhoff);
+ csum_replace2(&iph->check, totlen, iph->tot_len);
- rcu_read_lock();
ops = rcu_dereference(inet_offloads[proto]);
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
- goto out_unlock;
+ goto out;
/* Only need to add sizeof(*iph) to get to the next hdr below
* because any hdr with option will have been flushed in
* inet_gro_receive().
*/
- err = ops->callbacks.gro_complete(skb, nhoff + sizeof(*iph));
-
-out_unlock:
- rcu_read_unlock();
+ err = INDIRECT_CALL_2(ops->callbacks.gro_complete,
+ tcp4_gro_complete, udp4_gro_complete,
+ skb, nhoff + sizeof(*iph));
+out:
return err;
}
-EXPORT_SYMBOL(inet_gro_complete);
static int ipip_gro_complete(struct sk_buff *skb, int nhoff)
{
@@ -1607,6 +1640,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
if (rc == 0) {
*sk = sock->sk;
(*sk)->sk_allocation = GFP_ATOMIC;
+ (*sk)->sk_use_task_frag = false;
/*
* Unhash it so that IP input processing does not even see it,
* we do not wish this socket to see incoming packets.
@@ -1617,12 +1651,6 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
}
EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
-u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt)
-{
- return *(((unsigned long *)per_cpu_ptr(mib, cpu)) + offt);
-}
-EXPORT_SYMBOL_GPL(snmp_get_cpu_field);
-
unsigned long snmp_fold_field(void __percpu *mib, int offt)
{
unsigned long res = 0;
@@ -1647,9 +1675,9 @@ u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt,
bhptr = per_cpu_ptr(mib, cpu);
syncp = (struct u64_stats_sync *)(bhptr + syncp_offset);
do {
- start = u64_stats_fetch_begin_irq(syncp);
+ start = u64_stats_fetch_begin(syncp);
v = *(((u64 *)bhptr) + offt);
- } while (u64_stats_fetch_retry_irq(syncp, start));
+ } while (u64_stats_fetch_retry(syncp, start));
return v;
}
@@ -1671,40 +1699,13 @@ EXPORT_SYMBOL_GPL(snmp_fold_field64);
#ifdef CONFIG_IP_MULTICAST
static const struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
- .netns_ok = 1,
};
#endif
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct net_protocol tcp_protocol = {
- .early_demux = tcp_v4_early_demux,
- .early_demux_handler = tcp_v4_early_demux,
- .handler = tcp_v4_rcv,
- .err_handler = tcp_v4_err,
- .no_policy = 1,
- .netns_ok = 1,
- .icmp_strict_tag_validation = 1,
-};
-
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct net_protocol udp_protocol = {
- .early_demux = udp_v4_early_demux,
- .early_demux_handler = udp_v4_early_demux,
- .handler = udp_rcv,
- .err_handler = udp_err,
- .no_policy = 1,
- .netns_ok = 1,
-};
-
static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
.err_handler = icmp_err,
.no_policy = 1,
- .netns_ok = 1,
};
static __net_init int ipv4_mib_init_net(struct net *net)
@@ -1769,6 +1770,10 @@ static __net_exit void ipv4_mib_exit_net(struct net *net)
free_percpu(net->mib.net_statistics);
free_percpu(net->mib.ip_statistics);
free_percpu(net->mib.tcp_statistics);
+#ifdef CONFIG_MPTCP
+ /* allocated on demand, see mptcp_init_sock() */
+ free_percpu(net->mib.mptcp_statistics);
+#endif
}
static __net_initdata struct pernet_operations ipv4_mib_ops = {
@@ -1786,9 +1791,7 @@ static __net_init int inet_init_net(struct net *net)
/*
* Set defaults for local port range
*/
- seqlock_init(&net->ipv4.ip_local_ports.lock);
- net->ipv4.ip_local_ports.range[0] = 32768;
- net->ipv4.ip_local_ports.range[1] = 60999;
+ net->ipv4.ip_local_ports.range = 60999u << 16 | 32768u;
seqlock_init(&net->ipv4.ping_group_range.lock);
/*
@@ -1807,6 +1810,7 @@ static __net_init int inet_init_net(struct net *net)
net->ipv4.sysctl_ip_early_demux = 1;
net->ipv4.sysctl_udp_early_demux = 1;
net->ipv4.sysctl_tcp_early_demux = 1;
+ net->ipv4.sysctl_nexthop_compat_mode = 1;
#ifdef CONFIG_SYSCTL
net->ipv4.sysctl_ip_prot_sock = PROT_SOCK;
#endif
@@ -1818,16 +1822,13 @@ static __net_init int inet_init_net(struct net *net)
net->ipv4.sysctl_igmp_llm_reports = 1;
net->ipv4.sysctl_igmp_qrv = 2;
- return 0;
-}
+ net->ipv4.sysctl_fib_notify_on_flag_change = 0;
-static __net_exit void inet_exit_net(struct net *net)
-{
+ return 0;
}
static __net_initdata struct pernet_operations af_inet_ops = {
.init = inet_init_net,
- .exit = inet_exit_net,
};
static int __init init_inet_pernet_ops(void)
@@ -1841,18 +1842,10 @@ static int ipv4_proc_init(void);
* IP protocol layer initialiser
*/
-static struct packet_offload ip_packet_offload __read_mostly = {
- .type = cpu_to_be16(ETH_P_IP),
- .callbacks = {
- .gso_segment = inet_gso_segment,
- .gro_receive = inet_gro_receive,
- .gro_complete = inet_gro_complete,
- },
-};
static const struct net_offload ipip_offload = {
.callbacks = {
- .gso_segment = inet_gso_segment,
+ .gso_segment = ipip_gso_segment,
.gro_receive = ipip_gro_receive,
.gro_complete = ipip_gro_complete,
},
@@ -1875,7 +1868,15 @@ static int __init ipv4_offload_init(void)
if (ipip_offload_init() < 0)
pr_crit("%s: Cannot add IPIP protocol offload\n", __func__);
- dev_add_offload(&ip_packet_offload);
+ net_hotdata.ip_packet_offload = (struct packet_offload) {
+ .type = cpu_to_be16(ETH_P_IP),
+ .callbacks = {
+ .gso_segment = inet_gso_segment,
+ .gro_receive = inet_gro_receive,
+ .gro_complete = inet_gro_complete,
+ },
+ };
+ dev_add_offload(&net_hotdata.ip_packet_offload);
return 0;
}
@@ -1891,10 +1892,12 @@ static int __init inet_init(void)
{
struct inet_protosw *q;
struct list_head *r;
- int rc = -EINVAL;
+ int rc;
sock_skb_cb_check_size(sizeof(struct inet_skb_parm));
+ raw_hashinfo_init(&raw_v4_hashinfo);
+
rc = proto_register(&tcp_prot, 1);
if (rc)
goto out;
@@ -1927,9 +1930,22 @@ static int __init inet_init(void)
if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
pr_crit("%s: Cannot add ICMP protocol\n", __func__);
- if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
+
+ net_hotdata.udp_protocol = (struct net_protocol) {
+ .handler = udp_rcv,
+ .err_handler = udp_err,
+ .no_policy = 1,
+ };
+ if (inet_add_protocol(&net_hotdata.udp_protocol, IPPROTO_UDP) < 0)
pr_crit("%s: Cannot add UDP protocol\n", __func__);
- if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
+
+ net_hotdata.tcp_protocol = (struct net_protocol) {
+ .handler = tcp_v4_rcv,
+ .err_handler = tcp_v4_err,
+ .no_policy = 1,
+ .icmp_strict_tag_validation = 1,
+ };
+ if (inet_add_protocol(&net_hotdata.tcp_protocol, IPPROTO_TCP) < 0)
pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
@@ -1955,6 +1971,10 @@ static int __init inet_init(void)
ip_init();
+ /* Initialise per-cpu ipv4 mibs */
+ if (init_ipv4_mibs())
+ panic("%s: Cannot init ipv4 mibs\n", __func__);
+
/* Setup TCP slab cache for open requests. */
tcp_init();
@@ -1964,6 +1984,8 @@ static int __init inet_init(void)
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
+ raw_init();
+
ping_init();
/*
@@ -1983,12 +2005,6 @@ static int __init inet_init(void)
if (init_inet_pernet_ops())
pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);
- /*
- * Initialise per-cpu ipv4 mibs
- */
-
- if (init_ipv4_mibs())
- pr_crit("%s: Cannot init ipv4 mibs\n", __func__);
ipv4_proc_init();
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 4dd95cdd8070..64aec3dff8ec 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "IPsec: " fmt
-#include <crypto/algapi.h>
#include <crypto/hash.h>
+#include <crypto/utils.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -26,9 +27,7 @@ static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
{
unsigned int len;
- len = size + crypto_ahash_digestsize(ahash) +
- (crypto_ahash_alignmask(ahash) &
- ~(crypto_tfm_ctx_alignment() - 1));
+ len = size + crypto_ahash_digestsize(ahash);
len = ALIGN(len, crypto_tfm_ctx_alignment());
@@ -45,10 +44,9 @@ static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset)
return tmp + offset;
}
-static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp,
- unsigned int offset)
+static inline u8 *ah_tmp_icv(void *tmp, unsigned int offset)
{
- return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1);
+ return tmp + offset;
}
static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
@@ -106,7 +104,7 @@ static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
if (optlen < 6)
return -EINVAL;
memcpy(daddr, optptr+optlen-4, 4);
- /* Fall through */
+ fallthrough;
default:
memset(optptr, 0, optlen);
}
@@ -116,11 +114,11 @@ static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr)
return 0;
}
-static void ah_output_done(struct crypto_async_request *base, int err)
+static void ah_output_done(void *data, int err)
{
u8 *icv;
struct iphdr *iph;
- struct sk_buff *skb = base->data;
+ struct sk_buff *skb = data;
struct xfrm_state *x = skb_dst(skb)->xfrm;
struct ah_data *ahp = x->data;
struct iphdr *top_iph = ip_hdr(skb);
@@ -128,7 +126,7 @@ static void ah_output_done(struct crypto_async_request *base, int err)
int ihl = ip_hdrlen(skb);
iph = AH_SKB_CB(skb)->tmp;
- icv = ah_tmp_icv(ahp->ahash, iph, ihl);
+ icv = ah_tmp_icv(iph, ihl);
memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
top_iph->tos = iph->tos;
@@ -140,7 +138,7 @@ static void ah_output_done(struct crypto_async_request *base, int err)
}
kfree(AH_SKB_CB(skb)->tmp);
- xfrm_output_resume(skb, err);
+ xfrm_output_resume(skb->sk, skb, err);
}
static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
@@ -181,7 +179,7 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
if (!iph)
goto out;
seqhi = (__be32 *)((char *)iph + ihl);
- icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
+ icv = ah_tmp_icv(seqhi, seqhi_len);
req = ah_tmp_req(ahash, icv);
sg = ah_req_sg(ahash, req);
seqhisg = sg + nfrags;
@@ -261,12 +259,12 @@ out:
return err;
}
-static void ah_input_done(struct crypto_async_request *base, int err)
+static void ah_input_done(void *data, int err)
{
u8 *auth_data;
u8 *icv;
struct iphdr *work_iph;
- struct sk_buff *skb = base->data;
+ struct sk_buff *skb = data;
struct xfrm_state *x = xfrm_input_state(skb);
struct ah_data *ahp = x->data;
struct ip_auth_hdr *ah = ip_auth_hdr(skb);
@@ -278,7 +276,7 @@ static void ah_input_done(struct crypto_async_request *base, int err)
work_iph = AH_SKB_CB(skb)->tmp;
auth_data = ah_tmp_auth(work_iph, ihl);
- icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
+ icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len);
err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
if (err)
@@ -373,7 +371,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
seqhi = (__be32 *)((char *)work_iph + ihl);
auth_data = ah_tmp_auth(seqhi, seqhi_len);
- icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
+ icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len);
req = ah_tmp_req(ahash, icv);
sg = ah_req_sg(ahash, req);
seqhisg = sg + nfrags;
@@ -449,6 +447,7 @@ static int ah4_err(struct sk_buff *skb, u32 info)
case ICMP_DEST_UNREACH:
if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
return 0;
+ break;
case ICMP_REDIRECT:
break;
default:
@@ -461,38 +460,46 @@ static int ah4_err(struct sk_buff *skb, u32 info)
return 0;
if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
- ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_AH, 0);
+ ipv4_update_pmtu(skb, net, info, 0, IPPROTO_AH);
else
- ipv4_redirect(skb, net, 0, 0, IPPROTO_AH, 0);
+ ipv4_redirect(skb, net, 0, IPPROTO_AH);
xfrm_state_put(x);
return 0;
}
-static int ah_init_state(struct xfrm_state *x)
+static int ah_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
struct ah_data *ahp = NULL;
struct xfrm_algo_desc *aalg_desc;
struct crypto_ahash *ahash;
- if (!x->aalg)
+ if (!x->aalg) {
+ NL_SET_ERR_MSG(extack, "AH requires a state with an AUTH algorithm");
goto error;
+ }
- if (x->encap)
+ if (x->encap) {
+ NL_SET_ERR_MSG(extack, "AH is not compatible with encapsulation");
goto error;
+ }
ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
if (!ahp)
return -ENOMEM;
ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
- if (IS_ERR(ahash))
+ if (IS_ERR(ahash)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
+ }
ahp->ahash = ahash;
if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
- (x->aalg->alg_key_len + 7) / 8))
+ (x->aalg->alg_key_len + 7) / 8)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
+ }
/*
* Lookup the algorithm description maintained by xfrm_algo,
@@ -505,10 +512,7 @@ static int ah_init_state(struct xfrm_state *x)
if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
crypto_ahash_digestsize(ahash)) {
- pr_info("%s: %s digestsize %u != %hu\n",
- __func__, x->aalg->alg_name,
- crypto_ahash_digestsize(ahash),
- aalg_desc->uinfo.auth.icv_fullbits / 8);
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
}
@@ -553,7 +557,6 @@ static int ah4_rcv_cb(struct sk_buff *skb, int err)
static const struct xfrm_type ah_type =
{
- .description = "AH4",
.owner = THIS_MODULE,
.proto = IPPROTO_AH,
.flags = XFRM_TYPE_REPLAY_PROT,
@@ -589,11 +592,11 @@ static void __exit ah4_fini(void)
{
if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0)
pr_info("%s: can't remove protocol\n", __func__);
- if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
+ xfrm_unregister_type(&ah_type, AF_INET);
}
module_init(ah4_init);
module_exit(ah4_fini);
+MODULE_DESCRIPTION("IPv4 AH transformation library");
MODULE_LICENSE("GPL");
MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_AH);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index e90c89ef8c08..7f3863daaa40 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* linux/net/ipv4/arp.c
*
* Copyright (C) 1994 by Florian La Roche
@@ -7,11 +8,6 @@
* high-level addresses) into a low-level hardware address (like an Ethernet
* address).
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Fixes:
* Alan Cox : Removed the Ethernet assumptions in
* Florian's code
@@ -129,6 +125,7 @@ static int arp_constructor(struct neighbour *neigh);
static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
static void parp_redo(struct sk_buff *skb);
+static int arp_is_multicast(const void *pkey);
static const struct neigh_ops arp_generic_ops = {
.family = AF_INET,
@@ -160,6 +157,7 @@ struct neigh_table arp_tbl = {
.key_eq = arp_key_eq,
.constructor = arp_constructor,
.proxy_redo = parp_redo,
+ .is_multicast = arp_is_multicast,
.id = "arp_cache",
.parms = {
.tbl = &arp_tbl,
@@ -170,8 +168,9 @@ struct neigh_table arp_tbl = {
[NEIGH_VAR_RETRANS_TIME] = 1 * HZ,
[NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ,
[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+ [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
- [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT,
[NEIGH_VAR_PROXY_QLEN] = 64,
[NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
[NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
@@ -295,7 +294,7 @@ static int arp_constructor(struct neighbour *neigh)
static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
{
dst_link_failure(skb);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED);
}
/* Create and send an arp packet. */
@@ -376,7 +375,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
if (probes < 0) {
- if (!(neigh->nud_state & NUD_VALID))
+ if (!(READ_ONCE(neigh->nud_state) & NUD_VALID))
pr_debug("trying to ucast probe in NUD_INVALID\n");
neigh_ha_snapshot(dst_ha, neigh, dev);
dst_hw = dst_ha;
@@ -430,6 +429,26 @@ static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip)
return !inet_confirm_addr(net, in_dev, sip, tip, scope);
}
+static int arp_accept(struct in_device *in_dev, __be32 sip)
+{
+ struct net *net = dev_net(in_dev->dev);
+ int scope = RT_SCOPE_LINK;
+
+ switch (IN_DEV_ARP_ACCEPT(in_dev)) {
+ case 0: /* Don't create new entries from garp */
+ return 0;
+ case 1: /* Create new entries from garp */
+ return 1;
+ case 2: /* Create a neighbor in the arp table only if sip
+ * is in the same subnet as an address configured
+ * on the interface that received the garp message
+ */
+ return !!inet_confirm_addr(net, in_dev, sip, 0, scope);
+ default:
+ return 0;
+ }
+}
+
static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
{
struct rtable *rt;
@@ -437,7 +456,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
/*unsigned long now; */
struct net *net = dev_net(dev);
- rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev));
+ rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev),
+ RT_SCOPE_UNIVERSE);
if (IS_ERR(rt))
return 1;
if (rt->dst.dev != dev) {
@@ -639,10 +659,12 @@ static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb
*/
void arp_xmit(struct sk_buff *skb)
{
+ rcu_read_lock();
/* Send it off, maybe filter it using firewalling first. */
NF_HOOK(NFPROTO_ARP, NF_ARP_OUT,
- dev_net(skb->dev), NULL, skb, NULL, skb->dev,
+ dev_net_rcu(skb->dev), NULL, skb, NULL, skb->dev,
arp_xmit_finish);
+ rcu_read_unlock();
}
EXPORT_SYMBOL(arp_xmit);
@@ -842,7 +864,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
(arp_fwd_proxy(in_dev, dev, rt) ||
arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
(rt->dst.dev != dev &&
- pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) {
+ pneigh_lookup(&arp_tbl, net, &tip, dev)))) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n)
neigh_release(n);
@@ -869,12 +891,12 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
addr_type = -1;
- if (n || IN_DEV_ARP_ACCEPT(in_dev)) {
+ if (n || arp_accept(in_dev, sip)) {
is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op,
sip, tip, sha, tha);
}
- if (IN_DEV_ARP_ACCEPT(in_dev)) {
+ if (arp_accept(in_dev, sip)) {
/* Unsolicited ARP is not accepted by default.
It is possible, that this option should be enabled for some
devices (strip is candidate)
@@ -932,6 +954,10 @@ static void parp_redo(struct sk_buff *skb)
arp_process(dev_net(skb->dev), NULL, skb);
}
+static int arp_is_multicast(const void *pkey)
+{
+ return ipv4_is_multicast(*((__be32 *)pkey));
+}
/*
* Receive an arp request from the device layer.
@@ -940,6 +966,7 @@ static void parp_redo(struct sk_buff *skb)
static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
+ enum skb_drop_reason drop_reason;
const struct arphdr *arp;
/* do not tweak dropwatch on an ARP we will ignore */
@@ -953,12 +980,15 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
goto out_of_mem;
/* ARP header, plus 2 device addresses, plus 2 IP addresses. */
- if (!pskb_may_pull(skb, arp_hdr_len(dev)))
+ drop_reason = pskb_may_pull_reason(skb, arp_hdr_len(dev));
+ if (drop_reason != SKB_NOT_DROPPED_YET)
goto freeskb;
arp = arp_hdr(skb);
- if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4)
+ if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4) {
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
goto freeskb;
+ }
memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
@@ -970,7 +1000,7 @@ consumeskb:
consume_skb(skb);
return NET_RX_SUCCESS;
freeskb:
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
out_of_mem:
return NET_RX_DROP;
}
@@ -979,6 +1009,55 @@ out_of_mem:
* User level interface (ioctl)
*/
+static struct net_device *arp_req_dev_by_name(struct net *net, struct arpreq *r,
+ bool getarp)
+{
+ struct net_device *dev;
+
+ if (getarp)
+ dev = dev_get_by_name_rcu(net, r->arp_dev);
+ else
+ dev = __dev_get_by_name(net, r->arp_dev);
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+
+ /* Mmmm... It is wrong... ARPHRD_NETROM == 0 */
+ if (!r->arp_ha.sa_family)
+ r->arp_ha.sa_family = dev->type;
+
+ if ((r->arp_flags & ATF_COM) && r->arp_ha.sa_family != dev->type)
+ return ERR_PTR(-EINVAL);
+
+ return dev;
+}
+
+static struct net_device *arp_req_dev(struct net *net, struct arpreq *r)
+{
+ struct net_device *dev;
+ struct rtable *rt;
+ __be32 ip;
+
+ if (r->arp_dev[0])
+ return arp_req_dev_by_name(net, r, false);
+
+ if (r->arp_flags & ATF_PUBL)
+ return NULL;
+
+ ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+
+ rt = ip_route_output(net, ip, 0, 0, 0, RT_SCOPE_LINK);
+ if (IS_ERR(rt))
+ return ERR_CAST(rt);
+
+ dev = rt->dst.dev;
+ ip_rt_put(rt);
+
+ if (!dev)
+ return ERR_PTR(-EINVAL);
+
+ return dev;
+}
+
/*
* Set (create) an ARP cache entry.
*/
@@ -989,8 +1068,8 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
IPV4_DEVCONF_ALL(net, PROXY_ARP) = on;
return 0;
}
- if (__in_dev_get_rtnl(dev)) {
- IN_DEV_CONF_SET(__in_dev_get_rtnl(dev), PROXY_ARP, on);
+ if (__in_dev_get_rtnl_net(dev)) {
+ IN_DEV_CONF_SET(__in_dev_get_rtnl_net(dev), PROXY_ARP, on);
return 0;
}
return -ENXIO;
@@ -999,49 +1078,37 @@ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on)
static int arp_req_set_public(struct net *net, struct arpreq *r,
struct net_device *dev)
{
- __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
__be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
- if (mask && mask != htonl(0xFFFFFFFF))
- return -EINVAL;
if (!dev && (r->arp_flags & ATF_COM)) {
- dev = dev_getbyhwaddr_rcu(net, r->arp_ha.sa_family,
+ dev = dev_getbyhwaddr(net, r->arp_ha.sa_family,
r->arp_ha.sa_data);
if (!dev)
return -ENODEV;
}
if (mask) {
- if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1))
- return -ENOBUFS;
- return 0;
+ __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+
+ return pneigh_create(&arp_tbl, net, &ip, dev, 0, 0, false);
}
return arp_req_set_proxy(net, dev, 1);
}
-static int arp_req_set(struct net *net, struct arpreq *r,
- struct net_device *dev)
+static int arp_req_set(struct net *net, struct arpreq *r)
{
- __be32 ip;
struct neighbour *neigh;
+ struct net_device *dev;
+ __be32 ip;
int err;
+ dev = arp_req_dev(net, r);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
+
if (r->arp_flags & ATF_PUBL)
return arp_req_set_public(net, r, dev);
- ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
- if (r->arp_flags & ATF_PERM)
- r->arp_flags |= ATF_COM;
- if (!dev) {
- struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
-
- if (IS_ERR(rt))
- return PTR_ERR(rt);
- dev = rt->dst.dev;
- ip_rt_put(rt);
- if (!dev)
- return -EINVAL;
- }
switch (dev->type) {
#if IS_ENABLED(CONFIG_FDDI)
case ARPHRD_FDDI:
@@ -1063,12 +1130,18 @@ static int arp_req_set(struct net *net, struct arpreq *r,
break;
}
+ ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+
neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
err = PTR_ERR(neigh);
if (!IS_ERR(neigh)) {
unsigned int state = NUD_STALE;
- if (r->arp_flags & ATF_PERM)
+
+ if (r->arp_flags & ATF_PERM) {
+ r->arp_flags |= ATF_COM;
state = NUD_PERMANENT;
+ }
+
err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
r->arp_ha.sa_data : NULL, state,
NEIGH_UPDATE_F_OVERRIDE |
@@ -1092,43 +1165,62 @@ static unsigned int arp_state_to_flags(struct neighbour *neigh)
* Get an ARP cache entry.
*/
-static int arp_req_get(struct arpreq *r, struct net_device *dev)
+static int arp_req_get(struct net *net, struct arpreq *r)
{
__be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
struct neighbour *neigh;
- int err = -ENXIO;
+ struct net_device *dev;
+
+ if (!r->arp_dev[0])
+ return -ENODEV;
+
+ dev = arp_req_dev_by_name(net, r, true);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
neigh = neigh_lookup(&arp_tbl, &ip, dev);
- if (neigh) {
- if (!(neigh->nud_state & NUD_NOARP)) {
- read_lock_bh(&neigh->lock);
- memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
- r->arp_flags = arp_state_to_flags(neigh);
- read_unlock_bh(&neigh->lock);
- r->arp_ha.sa_family = dev->type;
- strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
- err = 0;
- }
+ if (!neigh)
+ return -ENXIO;
+
+ if (READ_ONCE(neigh->nud_state) & NUD_NOARP) {
neigh_release(neigh);
+ return -ENXIO;
}
- return err;
+
+ read_lock_bh(&neigh->lock);
+ memcpy(r->arp_ha.sa_data, neigh->ha,
+ min(dev->addr_len, sizeof(r->arp_ha.sa_data)));
+ r->arp_flags = arp_state_to_flags(neigh);
+ read_unlock_bh(&neigh->lock);
+
+ neigh_release(neigh);
+
+ r->arp_ha.sa_family = dev->type;
+ netdev_copy_name(dev, r->arp_dev);
+
+ return 0;
}
-static int arp_invalidate(struct net_device *dev, __be32 ip)
+int arp_invalidate(struct net_device *dev, __be32 ip, bool force)
{
struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev);
int err = -ENXIO;
struct neigh_table *tbl = &arp_tbl;
if (neigh) {
- if (neigh->nud_state & ~NUD_NOARP)
+ if ((READ_ONCE(neigh->nud_state) & NUD_VALID) && !force) {
+ neigh_release(neigh);
+ return 0;
+ }
+
+ if (READ_ONCE(neigh->nud_state) & ~NUD_NOARP)
err = neigh_update(neigh, NULL, NUD_FAILED,
NEIGH_UPDATE_F_OVERRIDE|
NEIGH_UPDATE_F_ADMIN, 0);
- write_lock_bh(&tbl->lock);
+ spin_lock_bh(&tbl->lock);
neigh_release(neigh);
- neigh_remove_one(neigh, tbl);
- write_unlock_bh(&tbl->lock);
+ neigh_remove_one(neigh);
+ spin_unlock_bh(&tbl->lock);
}
return err;
@@ -1137,37 +1229,32 @@ static int arp_invalidate(struct net_device *dev, __be32 ip)
static int arp_req_delete_public(struct net *net, struct arpreq *r,
struct net_device *dev)
{
- __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
__be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
- if (mask == htonl(0xFFFFFFFF))
- return pneigh_delete(&arp_tbl, net, &ip, dev);
+ if (mask) {
+ __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
- if (mask)
- return -EINVAL;
+ return pneigh_delete(&arp_tbl, net, &ip, dev);
+ }
return arp_req_set_proxy(net, dev, 0);
}
-static int arp_req_delete(struct net *net, struct arpreq *r,
- struct net_device *dev)
+static int arp_req_delete(struct net *net, struct arpreq *r)
{
+ struct net_device *dev;
__be32 ip;
+ dev = arp_req_dev(net, r);
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
+
if (r->arp_flags & ATF_PUBL)
return arp_req_delete_public(net, r, dev);
ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
- if (!dev) {
- struct rtable *rt = ip_route_output(net, ip, 0, RTO_ONLINK, 0);
- if (IS_ERR(rt))
- return PTR_ERR(rt);
- dev = rt->dst.dev;
- ip_rt_put(rt);
- if (!dev)
- return -EINVAL;
- }
- return arp_invalidate(dev, ip);
+
+ return arp_invalidate(dev, ip, true);
}
/*
@@ -1176,16 +1263,16 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
- int err;
struct arpreq r;
- struct net_device *dev = NULL;
+ __be32 *netmask;
+ int err;
switch (cmd) {
case SIOCDARP:
case SIOCSARP:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- /* fall through */
+ fallthrough;
case SIOCGARP:
err = copy_from_user(&r, arg, sizeof(struct arpreq));
if (err)
@@ -1201,42 +1288,34 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (!(r.arp_flags & ATF_PUBL) &&
(r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
return -EINVAL;
+
+ netmask = &((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr;
if (!(r.arp_flags & ATF_NETMASK))
- ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
- htonl(0xFFFFFFFFUL);
- rtnl_lock();
- if (r.arp_dev[0]) {
- err = -ENODEV;
- dev = __dev_get_by_name(net, r.arp_dev);
- if (!dev)
- goto out;
-
- /* Mmmm... It is wrong... ARPHRD_NETROM==0 */
- if (!r.arp_ha.sa_family)
- r.arp_ha.sa_family = dev->type;
- err = -EINVAL;
- if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type)
- goto out;
- } else if (cmd == SIOCGARP) {
- err = -ENODEV;
- goto out;
- }
+ *netmask = htonl(0xFFFFFFFFUL);
+ else if (*netmask && *netmask != htonl(0xFFFFFFFFUL))
+ return -EINVAL;
switch (cmd) {
case SIOCDARP:
- err = arp_req_delete(net, &r, dev);
+ rtnl_net_lock(net);
+ err = arp_req_delete(net, &r);
+ rtnl_net_unlock(net);
break;
case SIOCSARP:
- err = arp_req_set(net, &r, dev);
+ rtnl_net_lock(net);
+ err = arp_req_set(net, &r);
+ rtnl_net_unlock(net);
break;
case SIOCGARP:
- err = arp_req_get(&r, dev);
+ rcu_read_lock();
+ err = arp_req_get(net, &r);
+ rcu_read_unlock();
+
+ if (!err && copy_to_user(arg, &r, sizeof(r)))
+ err = -EFAULT;
break;
}
-out:
- rtnl_unlock();
- if (cmd == SIOCGARP && !err && copy_to_user(arg, &r, sizeof(r)))
- err = -EFAULT;
+
return err;
}
@@ -1245,6 +1324,8 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event,
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct netdev_notifier_change_info *change_info;
+ struct in_device *in_dev;
+ bool evict_nocarrier;
switch (event) {
case NETDEV_CHANGEADDR:
@@ -1255,6 +1336,15 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event,
change_info = ptr;
if (change_info->flags_changed & IFF_NOARP)
neigh_changeaddr(&arp_tbl, dev);
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev)
+ evict_nocarrier = true;
+ else
+ evict_nocarrier = IN_DEV_ARP_EVICT_NOCARRIER(in_dev);
+
+ if (evict_nocarrier && !netif_carrier_ok(dev))
+ neigh_carrier_down(&arp_tbl, dev);
break;
default:
break;
@@ -1286,24 +1376,9 @@ static struct packet_type arp_packet_type __read_mostly = {
.func = arp_rcv,
};
-static int arp_proc_init(void);
-
-void __init arp_init(void)
-{
- neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl);
-
- dev_add_pack(&arp_packet_type);
- arp_proc_init();
-#ifdef CONFIG_SYSCTL
- neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);
-#endif
- register_netdevice_notifier(&arp_netdev_notifier);
-}
-
#ifdef CONFIG_PROC_FS
#if IS_ENABLED(CONFIG_AX25)
-/* ------------------------------------------------------------------------ */
/*
* ax25 -> ASCII conversion
*/
@@ -1409,16 +1484,13 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP);
}
-/* ------------------------------------------------------------------------ */
-
static const struct seq_operations arp_seq_ops = {
.start = arp_seq_start,
.next = neigh_seq_next,
.stop = neigh_seq_stop,
.show = arp_seq_show,
};
-
-/* ------------------------------------------------------------------------ */
+#endif /* CONFIG_PROC_FS */
static int __net_init arp_net_init(struct net *net)
{
@@ -1438,16 +1510,14 @@ static struct pernet_operations arp_net_ops = {
.exit = arp_net_exit,
};
-static int __init arp_proc_init(void)
+void __init arp_init(void)
{
- return register_pernet_subsys(&arp_net_ops);
-}
-
-#else /* CONFIG_PROC_FS */
+ neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl);
-static int __init arp_proc_init(void)
-{
- return 0;
+ dev_add_pack(&arp_packet_type);
+ register_pernet_subsys(&arp_net_ops);
+#ifdef CONFIG_SYSCTL
+ neigh_sysctl_register(NULL, &arp_tbl.parms, NULL);
+#endif
+ register_netdevice_notifier(&arp_netdev_notifier);
}
-
-#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
new file mode 100644
index 000000000000..e01492234b0b
--- /dev/null
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -0,0 +1,349 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2019 Facebook */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+#include <net/tcp.h>
+#include <net/bpf_sk_storage.h>
+
+/* Forward declaration; the object is defined and registered further down in this file. */
+static struct bpf_struct_ops bpf_tcp_congestion_ops;
+
+static const struct btf_type *tcp_sock_type;
+static u32 tcp_sock_id, sock_id;
+static const struct btf_type *tcp_congestion_ops_type;
+
+static int bpf_tcp_ca_init(struct btf *btf)
+{
+ s32 type_id;
+
+ type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return -EINVAL;
+ sock_id = type_id;
+
+ type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return -EINVAL;
+ tcp_sock_id = type_id;
+ tcp_sock_type = btf_type_by_id(btf, tcp_sock_id);
+
+ type_id = btf_find_by_name_kind(btf, "tcp_congestion_ops", BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return -EINVAL;
+ tcp_congestion_ops_type = btf_type_by_id(btf, type_id);
+
+ return 0;
+}
+
+static bool bpf_tcp_ca_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (!bpf_tracing_btf_ctx_access(off, size, type, prog, info))
+ return false;
+
+ if (base_type(info->reg_type) == PTR_TO_BTF_ID &&
+ !bpf_type_has_unsafe_modifiers(info->reg_type) &&
+ info->btf_id == sock_id)
+ /* promote it to tcp_sock */
+ info->btf_id = tcp_sock_id;
+
+ return true;
+}
+
+static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ const struct btf_type *t;
+ size_t end;
+
+ t = btf_type_by_id(reg->btf, reg->btf_id);
+ if (t != tcp_sock_type) {
+ bpf_log(log, "only read is supported\n");
+ return -EACCES;
+ }
+
+ switch (off) {
+ case offsetof(struct sock, sk_pacing_rate):
+ end = offsetofend(struct sock, sk_pacing_rate);
+ break;
+ case offsetof(struct sock, sk_pacing_status):
+ end = offsetofend(struct sock, sk_pacing_status);
+ break;
+ case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv):
+ end = offsetofend(struct inet_connection_sock, icsk_ca_priv);
+ break;
+ case offsetof(struct inet_connection_sock, icsk_ack.pending):
+ end = offsetofend(struct inet_connection_sock,
+ icsk_ack.pending);
+ break;
+ case offsetof(struct tcp_sock, snd_cwnd):
+ end = offsetofend(struct tcp_sock, snd_cwnd);
+ break;
+ case offsetof(struct tcp_sock, snd_cwnd_cnt):
+ end = offsetofend(struct tcp_sock, snd_cwnd_cnt);
+ break;
+ case offsetof(struct tcp_sock, snd_cwnd_stamp):
+ end = offsetofend(struct tcp_sock, snd_cwnd_stamp);
+ break;
+ case offsetof(struct tcp_sock, snd_ssthresh):
+ end = offsetofend(struct tcp_sock, snd_ssthresh);
+ break;
+ case offsetof(struct tcp_sock, ecn_flags):
+ end = offsetofend(struct tcp_sock, ecn_flags);
+ break;
+ case offsetof(struct tcp_sock, app_limited):
+ end = offsetofend(struct tcp_sock, app_limited);
+ break;
+ default:
+ bpf_log(log, "no write support to tcp_sock at off %d\n", off);
+ return -EACCES;
+ }
+
+ if (off + size > end) {
+ bpf_log(log,
+ "write access at off %d with size %d beyond the member of tcp_sock ended at %zu\n",
+ off, size, end);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
+BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt)
+{
+ /* bpf_tcp_ca prog cannot have NULL tp */
+ __tcp_send_ack((struct sock *)tp, rcv_nxt, 0);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_tcp_send_ack_proto = {
+ .func = bpf_tcp_send_ack,
+ .gpl_only = false,
+ /* In case we want to report error later */
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_BTF_ID,
+ .arg1_btf_id = &tcp_sock_id,
+ .arg2_type = ARG_ANYTHING,
+};
+
+static u32 prog_ops_moff(const struct bpf_prog *prog)
+{
+ const struct btf_member *m;
+ const struct btf_type *t;
+ u32 midx;
+
+ midx = prog->expected_attach_type;
+ t = tcp_congestion_ops_type;
+ m = &btf_type_member(t)[midx];
+
+ return __btf_member_bit_offset(t, m) / 8;
+}
+
+static const struct bpf_func_proto *
+bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
+ const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_tcp_send_ack:
+ return &bpf_tcp_send_ack_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+ case BPF_FUNC_setsockopt:
+ /* Does not allow release() to call setsockopt.
+ * release() is called when the current bpf-tcp-cc
+ * is retiring. It is not allowed to call
+ * setsockopt() to make further changes which
+ * may potentially allocate new resources.
+ */
+ if (prog_ops_moff(prog) !=
+ offsetof(struct tcp_congestion_ops, release))
+ return &bpf_sk_setsockopt_proto;
+ return NULL;
+ case BPF_FUNC_getsockopt:
+ /* Since get/setsockopt is usually expected to
+ * be available together, disable getsockopt for
+ * release also to avoid usage surprise.
+ * The bpf-tcp-cc already has a more powerful way
+ * to read tcp_sock from the PTR_TO_BTF_ID.
+ */
+ if (prog_ops_moff(prog) !=
+ offsetof(struct tcp_congestion_ops, release))
+ return &bpf_sk_getsockopt_proto;
+ return NULL;
+ case BPF_FUNC_ktime_get_coarse_ns:
+ return &bpf_ktime_get_coarse_ns_proto;
+ default:
+ return bpf_base_func_proto(func_id, prog);
+ }
+}
+
+BTF_KFUNCS_START(bpf_tcp_ca_check_kfunc_ids)
+BTF_ID_FLAGS(func, tcp_reno_ssthresh)
+BTF_ID_FLAGS(func, tcp_reno_cong_avoid)
+BTF_ID_FLAGS(func, tcp_reno_undo_cwnd)
+BTF_ID_FLAGS(func, tcp_slow_start)
+BTF_ID_FLAGS(func, tcp_cong_avoid_ai)
+BTF_KFUNCS_END(bpf_tcp_ca_check_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_tcp_ca_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_tcp_ca_check_kfunc_ids,
+};
+
+static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
+ .get_func_proto = bpf_tcp_ca_get_func_proto,
+ .is_valid_access = bpf_tcp_ca_is_valid_access,
+ .btf_struct_access = bpf_tcp_ca_btf_struct_access,
+};
+
+static int bpf_tcp_ca_init_member(const struct btf_type *t,
+ const struct btf_member *member,
+ void *kdata, const void *udata)
+{
+ const struct tcp_congestion_ops *utcp_ca;
+ struct tcp_congestion_ops *tcp_ca;
+ u32 moff;
+
+ utcp_ca = (const struct tcp_congestion_ops *)udata;
+ tcp_ca = (struct tcp_congestion_ops *)kdata;
+
+ moff = __btf_member_bit_offset(t, member) / 8;
+ switch (moff) {
+ case offsetof(struct tcp_congestion_ops, flags):
+ if (utcp_ca->flags & ~TCP_CONG_MASK)
+ return -EINVAL;
+ tcp_ca->flags = utcp_ca->flags;
+ return 1;
+ case offsetof(struct tcp_congestion_ops, name):
+ if (bpf_obj_name_cpy(tcp_ca->name, utcp_ca->name,
+ sizeof(tcp_ca->name)) <= 0)
+ return -EINVAL;
+ return 1;
+ }
+
+ return 0;
+}
+
+static int bpf_tcp_ca_reg(void *kdata, struct bpf_link *link)
+{
+ return tcp_register_congestion_control(kdata);
+}
+
+static void bpf_tcp_ca_unreg(void *kdata, struct bpf_link *link)
+{
+ tcp_unregister_congestion_control(kdata);
+}
+
+static int bpf_tcp_ca_update(void *kdata, void *old_kdata, struct bpf_link *link)
+{
+ return tcp_update_congestion_control(kdata, old_kdata);
+}
+
+static int bpf_tcp_ca_validate(void *kdata)
+{
+ return tcp_validate_congestion_control(kdata);
+}
+
+static u32 bpf_tcp_ca_ssthresh(struct sock *sk)
+{
+ return 0;
+}
+
+static void bpf_tcp_ca_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+{
+}
+
+static void bpf_tcp_ca_set_state(struct sock *sk, u8 new_state)
+{
+}
+
+static void bpf_tcp_ca_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+{
+}
+
+static void bpf_tcp_ca_in_ack_event(struct sock *sk, u32 flags)
+{
+}
+
+static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *sample)
+{
+}
+
+static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk)
+{
+ return 0;
+}
+
+static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag,
+ const struct rate_sample *rs)
+{
+}
+
+static u32 bpf_tcp_ca_undo_cwnd(struct sock *sk)
+{
+ return 0;
+}
+
+static u32 bpf_tcp_ca_sndbuf_expand(struct sock *sk)
+{
+ return 0;
+}
+
+static void __bpf_tcp_ca_init(struct sock *sk)
+{
+}
+
+static void __bpf_tcp_ca_release(struct sock *sk)
+{
+}
+
+static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = {
+ .ssthresh = bpf_tcp_ca_ssthresh,
+ .cong_avoid = bpf_tcp_ca_cong_avoid,
+ .set_state = bpf_tcp_ca_set_state,
+ .cwnd_event = bpf_tcp_ca_cwnd_event,
+ .in_ack_event = bpf_tcp_ca_in_ack_event,
+ .pkts_acked = bpf_tcp_ca_pkts_acked,
+ .min_tso_segs = bpf_tcp_ca_min_tso_segs,
+ .cong_control = bpf_tcp_ca_cong_control,
+ .undo_cwnd = bpf_tcp_ca_undo_cwnd,
+ .sndbuf_expand = bpf_tcp_ca_sndbuf_expand,
+
+ .init = __bpf_tcp_ca_init,
+ .release = __bpf_tcp_ca_release,
+};
+
+static struct bpf_struct_ops bpf_tcp_congestion_ops = {
+ .verifier_ops = &bpf_tcp_ca_verifier_ops,
+ .reg = bpf_tcp_ca_reg,
+ .unreg = bpf_tcp_ca_unreg,
+ .update = bpf_tcp_ca_update,
+ .init_member = bpf_tcp_ca_init_member,
+ .init = bpf_tcp_ca_init,
+ .validate = bpf_tcp_ca_validate,
+ .name = "tcp_congestion_ops",
+ .cfi_stubs = &__bpf_ops_tcp_congestion_ops,
+ .owner = THIS_MODULE,
+};
+
+static int __init bpf_tcp_ca_kfunc_init(void)
+{
+ int ret;
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set);
+ ret = ret ?: register_bpf_struct_ops(&bpf_tcp_congestion_ops, tcp_congestion_ops);
+
+ return ret;
+}
+late_initcall(bpf_tcp_ca_kfunc_init);
diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
deleted file mode 100644
index e9e42f99725e..000000000000
--- a/net/ipv4/bpfilter/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-$(CONFIG_BPFILTER) += sockopt.o
diff --git a/net/ipv4/bpfilter/sockopt.c b/net/ipv4/bpfilter/sockopt.c
deleted file mode 100644
index 5e04ed25bc0e..000000000000
--- a/net/ipv4/bpfilter/sockopt.c
+++ /dev/null
@@ -1,43 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/uaccess.h>
-#include <linux/bpfilter.h>
-#include <uapi/linux/bpf.h>
-#include <linux/wait.h>
-#include <linux/kmod.h>
-
-int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
- char __user *optval,
- unsigned int optlen, bool is_set);
-EXPORT_SYMBOL_GPL(bpfilter_process_sockopt);
-
-static int bpfilter_mbox_request(struct sock *sk, int optname,
- char __user *optval,
- unsigned int optlen, bool is_set)
-{
- if (!bpfilter_process_sockopt) {
- int err = request_module("bpfilter");
-
- if (err)
- return err;
- if (!bpfilter_process_sockopt)
- return -ECHILD;
- }
- return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set);
-}
-
-int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
- unsigned int optlen)
-{
- return bpfilter_mbox_request(sk, optname, optval, optlen, true);
-}
-
-int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
- int __user *optlen)
-{
- int len;
-
- if (get_user(len, optlen))
- return -EFAULT;
-
- return bpfilter_mbox_request(sk, optname, optval, len, false);
-}
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 82178cc69c96..709021197e1c 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* CIPSO - Commercial IP Security Option
*
@@ -9,30 +10,15 @@
*
* The CIPSO draft specification can be found in the kernel's Documentation
* directory as well as the following URL:
- * http://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
+ * https://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt
* The FIPS-188 specification can be found at the following URL:
- * http://www.itl.nist.gov/fipspubs/fip188.htm
+ * https://www.itl.nist.gov/fipspubs/fip188.htm
*
* Author: Paul Moore <paul.moore@hp.com>
- *
*/
/*
* (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- *
*/
#include <linux/init.h>
@@ -51,7 +37,7 @@
#include <net/cipso_ipv4.h>
#include <linux/atomic.h>
#include <linux/bug.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
/* List of available DOI definitions */
/* XXX - This currently assumes a minimal number of different DOIs in use,
@@ -87,7 +73,7 @@ struct cipso_v4_map_cache_entry {
static struct cipso_v4_map_cache_bkt *cipso_v4_cache;
/* Restricted bitmap (tag #1) flags */
-int cipso_v4_rbm_optfmt = 0;
+int cipso_v4_rbm_optfmt;
int cipso_v4_rbm_strictvalid = 1;
/*
@@ -201,8 +187,7 @@ static int __init cipso_v4_cache_init(void)
* cipso_v4_cache_invalidate - Invalidates the current CIPSO cache
*
* Description:
- * Invalidates and frees any entries in the CIPSO cache. Returns zero on
- * success and negative values on failure.
+ * Invalidates and frees any entries in the CIPSO cache.
*
*/
void cipso_v4_cache_invalidate(void)
@@ -254,7 +239,7 @@ static int cipso_v4_cache_check(const unsigned char *key,
struct cipso_v4_map_cache_entry *prev_entry = NULL;
u32 hash;
- if (!cipso_v4_cache_enabled)
+ if (!READ_ONCE(cipso_v4_cache_enabled))
return -ENOENT;
hash = cipso_v4_map_cache_hash(key, key_len);
@@ -297,7 +282,7 @@ static int cipso_v4_cache_check(const unsigned char *key,
/**
* cipso_v4_cache_add - Add an entry to the CIPSO cache
- * @skb: the packet
+ * @cipso_ptr: pointer to CIPSO IP option
* @secattr: the packet's security attributes
*
* Description:
@@ -311,13 +296,14 @@ static int cipso_v4_cache_check(const unsigned char *key,
int cipso_v4_cache_add(const unsigned char *cipso_ptr,
const struct netlbl_lsm_secattr *secattr)
{
+ int bkt_size = READ_ONCE(cipso_v4_cache_bucketsize);
int ret_val = -EPERM;
u32 bkt;
struct cipso_v4_map_cache_entry *entry = NULL;
struct cipso_v4_map_cache_entry *old_entry = NULL;
u32 cipso_ptr_len;
- if (!cipso_v4_cache_enabled || cipso_v4_cache_bucketsize <= 0)
+ if (!READ_ONCE(cipso_v4_cache_enabled) || bkt_size <= 0)
return 0;
cipso_ptr_len = cipso_ptr[1];
@@ -337,7 +323,7 @@ int cipso_v4_cache_add(const unsigned char *cipso_ptr,
bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1);
spin_lock_bh(&cipso_v4_cache[bkt].lock);
- if (cipso_v4_cache[bkt].size < cipso_v4_cache_bucketsize) {
+ if (cipso_v4_cache[bkt].size < bkt_size) {
list_add(&entry->list, &cipso_v4_cache[bkt].list);
cipso_v4_cache[bkt].size += 1;
} else {
@@ -486,6 +472,7 @@ void cipso_v4_doi_free(struct cipso_v4_doi *doi_def)
kfree(doi_def->map.std->lvl.local);
kfree(doi_def->map.std->cat.cipso);
kfree(doi_def->map.std->cat.local);
+ kfree(doi_def->map.std);
break;
}
kfree(doi_def);
@@ -512,7 +499,7 @@ static void cipso_v4_doi_free_rcu(struct rcu_head *entry)
/**
* cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine
* @doi: the DOI value
- * @audit_secid: the LSM secid to use in the audit message
+ * @audit_info: NetLabel audit information
*
* Description:
* Removes a DOI definition from the CIPSO engine. The NetLabel routines will
@@ -533,16 +520,10 @@ int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info)
ret_val = -ENOENT;
goto doi_remove_return;
}
- if (!refcount_dec_and_test(&doi_def->refcount)) {
- spin_unlock(&cipso_v4_doi_list_lock);
- ret_val = -EBUSY;
- goto doi_remove_return;
- }
list_del_rcu(&doi_def->list);
spin_unlock(&cipso_v4_doi_list_lock);
- cipso_v4_cache_invalidate();
- call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
+ cipso_v4_doi_putdef(doi_def);
ret_val = 0;
doi_remove_return:
@@ -599,9 +580,6 @@ void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def)
if (!refcount_dec_and_test(&doi_def->refcount))
return;
- spin_lock(&cipso_v4_doi_list_lock);
- list_del_rcu(&doi_def->list);
- spin_unlock(&cipso_v4_doi_list_lock);
cipso_v4_cache_invalidate();
call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu);
@@ -667,7 +645,8 @@ static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level)
case CIPSO_V4_MAP_PASS:
return 0;
case CIPSO_V4_MAP_TRANS:
- if (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)
+ if ((level < doi_def->map.std->lvl.cipso_size) &&
+ (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL))
return 0;
break;
}
@@ -885,11 +864,8 @@ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def,
net_clen_bits,
net_spot + 1,
1);
- if (net_spot < 0) {
- if (net_spot == -2)
- return -EFAULT;
+ if (net_spot < 0)
return 0;
- }
switch (doi_def->type) {
case CIPSO_V4_MAP_PASS:
@@ -1175,7 +1151,7 @@ static void cipso_v4_gentag_hdr(const struct cipso_v4_doi *doi_def,
{
buf[0] = IPOPT_CIPSO;
buf[1] = CIPSO_V4_HDR_LEN + len;
- *(__be32 *)&buf[2] = htonl(doi_def->doi);
+ put_unaligned_be32(doi_def->doi, &buf[2]);
}
/**
@@ -1221,7 +1197,8 @@ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def,
/* This will send packets using the "optimized" format when
* possible as specified in section 3.4.2.6 of the
* CIPSO draft. */
- if (cipso_v4_rbm_optfmt && ret_val > 0 && ret_val <= 10)
+ if (READ_ONCE(cipso_v4_rbm_optfmt) && ret_val > 0 &&
+ ret_val <= 10)
tag_len = 14;
else
tag_len = 4 + ret_val;
@@ -1271,7 +1248,8 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def,
return ret_val;
}
- secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+ if (secattr->attr.mls.cat)
+ secattr->flags |= NETLBL_SECATTR_MLS_CAT;
}
return 0;
@@ -1452,7 +1430,8 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def,
return ret_val;
}
- secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+ if (secattr->attr.mls.cat)
+ secattr->flags |= NETLBL_SECATTR_MLS_CAT;
}
return 0;
@@ -1512,7 +1491,7 @@ static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def,
*
* Description:
* Parse the packet's IP header looking for a CIPSO option. Returns a pointer
- * to the start of the CIPSO option on success, NULL if one if not found.
+ * to the start of the CIPSO option on success, NULL if one is not found.
*
*/
unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
@@ -1522,10 +1501,8 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
int optlen;
int taglen;
- for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) {
+ for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) {
switch (optptr[0]) {
- case IPOPT_CIPSO:
- return optptr;
case IPOPT_END:
return NULL;
case IPOPT_NOOP:
@@ -1534,6 +1511,11 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
default:
taglen = optptr[1];
}
+ if (!taglen || taglen > optlen)
+ return NULL;
+ if (optptr[0] == IPOPT_CIPSO)
+ return optptr;
+
optlen -= taglen;
optptr += taglen;
}
@@ -1543,6 +1525,7 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb)
/**
* cipso_v4_validate - Validate a CIPSO option
+ * @skb: the packet
* @option: the start of the option, on error it is set to point to the error
*
* Description:
@@ -1619,7 +1602,7 @@ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option)
* all the CIPSO validations here but it doesn't
* really specify _exactly_ what we need to validate
* ... so, just make it a sysctl tunable. */
- if (cipso_v4_rbm_strictvalid) {
+ if (READ_ONCE(cipso_v4_rbm_strictvalid)) {
if (cipso_v4_map_lvl_valid(doi_def,
tag[3]) < 0) {
err_offset = opt_iter + 3;
@@ -1732,13 +1715,30 @@ validate_return:
*/
void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
{
+ struct inet_skb_parm parm;
+ int res;
+
if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES)
return;
+ /*
+ * We might be called above the IP layer,
+ * so we can not use icmp_send and IPCB here.
+ */
+
+ memset(&parm, 0, sizeof(parm));
+ parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
+ rcu_read_lock();
+ res = __ip_options_compile(dev_net(skb->dev), &parm.opt, skb, NULL);
+ rcu_read_unlock();
+
+ if (res)
+ return;
+
if (gateway)
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0);
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &parm);
else
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0);
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &parm);
}
/**
@@ -1809,11 +1809,35 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
return CIPSO_V4_HDR_LEN + ret_val;
}
+static int cipso_v4_get_actual_opt_len(const unsigned char *data, int len)
+{
+ int iter = 0, optlen = 0;
+
+ /* Walk the options to find where the last real option ends:
+ * IPOPT_END stops the walk, IPOPT_NOP is stepped over one byte at a
+ * time, any other option advances by its length byte and updates
+ * optlen, so trailing padding is not counted. NOTE(review): the
+ * length byte is trusted as-is (non-zero, in bounds) - confirm.
+ */
+ while (iter < len) {
+ if (data[iter] == IPOPT_END) {
+ break;
+ } else if (data[iter] == IPOPT_NOP) {
+ iter++;
+ } else {
+ iter += data[iter + 1];
+ optlen = iter;
+ }
+ }
+ return optlen;
+}
+
/**
* cipso_v4_sock_setattr - Add a CIPSO option to a socket
* @sk: the socket
* @doi_def: the CIPSO DOI to use
* @secattr: the specific security attributes of the socket
+ * @sk_locked: true if caller holds the socket lock
*
* Description:
* Set the CIPSO option on the given socket using the DOI definition and
@@ -1825,7 +1849,8 @@ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len,
*/
int cipso_v4_sock_setattr(struct sock *sk,
const struct cipso_v4_doi *doi_def,
- const struct netlbl_lsm_secattr *secattr)
+ const struct netlbl_lsm_secattr *secattr,
+ bool sk_locked)
{
int ret_val = -EPERM;
unsigned char *buf = NULL;
@@ -1875,9 +1900,8 @@ int cipso_v4_sock_setattr(struct sock *sk,
sk_inet = inet_sk(sk);
- old = rcu_dereference_protected(sk_inet->inet_opt,
- lockdep_sock_is_held(sk));
- if (sk_inet->is_icsk) {
+ old = rcu_dereference_protected(sk_inet->inet_opt, sk_locked);
+ if (inet_test_bit(IS_ICSK, sk)) {
sk_conn = inet_csk(sk);
if (old)
sk_conn->icsk_ext_hdr_len -= old->opt.optlen;
@@ -1951,7 +1975,7 @@ int cipso_v4_req_setattr(struct request_sock *req,
buf = NULL;
req_inet = inet_rsk(req);
- opt = xchg((__force struct ip_options_rcu **)&req_inet->ireq_opt, opt);
+ opt = unrcu_pointer(xchg(&req_inet->ireq_opt, RCU_INITIALIZER(opt)));
if (opt)
kfree_rcu(opt, rcu);
@@ -1984,7 +2008,6 @@ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr)
u8 cipso_len;
u8 cipso_off;
unsigned char *cipso_ptr;
- int iter;
int optlen_new;
cipso_off = opt->opt.cipso - sizeof(struct iphdr);
@@ -2004,19 +2027,8 @@ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr)
memmove(cipso_ptr, cipso_ptr + cipso_len,
opt->opt.optlen - cipso_off - cipso_len);
- /* determining the new total option length is tricky because of
- * the padding necessary, the only thing i can think to do at
- * this point is walk the options one-by-one, skipping the
- * padding at the end to determine the actual option size and
- * from there we can determine the new total option length */
- iter = 0;
- optlen_new = 0;
- while (iter < opt->opt.optlen)
- if (opt->opt.__data[iter] != IPOPT_NOP) {
- iter += opt->opt.__data[iter + 1];
- optlen_new = iter;
- } else
- iter++;
+ optlen_new = cipso_v4_get_actual_opt_len(opt->opt.__data,
+ opt->opt.optlen);
hdr_delta = opt->opt.optlen;
opt->opt.optlen = (optlen_new + 3) & ~3;
hdr_delta -= opt->opt.optlen;
@@ -2047,7 +2059,7 @@ void cipso_v4_sock_delattr(struct sock *sk)
sk_inet = inet_sk(sk);
hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
- if (sk_inet->is_icsk && hdr_delta > 0) {
+ if (inet_test_bit(IS_ICSK, sk) && hdr_delta > 0) {
struct inet_connection_sock *sk_conn = inet_csk(sk);
sk_conn->icsk_ext_hdr_len -= hdr_delta;
sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie);
@@ -2056,7 +2068,7 @@ void cipso_v4_sock_delattr(struct sock *sk)
/**
* cipso_v4_req_delattr - Delete the CIPSO option from a request socket
- * @reg: the request socket
+ * @req: the request socket
*
* Description:
* Removes the CIPSO option from a request socket, if present.
@@ -2148,6 +2160,7 @@ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr)
/**
* cipso_v4_skbuff_setattr - Set the CIPSO option on a packet
* @skb: the packet
+ * @doi_def: the DOI structure
* @secattr: the security attributes
*
* Description:
@@ -2217,7 +2230,7 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb,
memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len);
if (len_delta != 0) {
iph->ihl = 5 + (opt_len >> 2);
- iph->tot_len = htons(skb->len);
+ iph_set_totlen(iph, skb->len);
}
ip_send_check(iph);
@@ -2235,7 +2248,8 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb,
*/
int cipso_v4_skbuff_delattr(struct sk_buff *skb)
{
- int ret_val;
+ int ret_val, cipso_len, hdr_len_actual, new_hdr_len_actual, new_hdr_len,
+ hdr_len_delta;
struct iphdr *iph;
struct ip_options *opt = &IPCB(skb)->opt;
unsigned char *cipso_ptr;
@@ -2248,16 +2262,37 @@ int cipso_v4_skbuff_delattr(struct sk_buff *skb)
if (ret_val < 0)
return ret_val;
- /* the easiest thing to do is just replace the cipso option with noop
- * options since we don't change the size of the packet, although we
- * still need to recalculate the checksum */
-
iph = ip_hdr(skb);
cipso_ptr = (unsigned char *)iph + opt->cipso;
- memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]);
+ cipso_len = cipso_ptr[1];
+
+ hdr_len_actual = sizeof(struct iphdr) +
+ cipso_v4_get_actual_opt_len((unsigned char *)(iph + 1),
+ opt->optlen);
+ new_hdr_len_actual = hdr_len_actual - cipso_len;
+ new_hdr_len = (new_hdr_len_actual + 3) & ~3;
+ hdr_len_delta = (iph->ihl << 2) - new_hdr_len;
+
+ /* 1. shift any options after CIPSO to the left */
+ memmove(cipso_ptr, cipso_ptr + cipso_len,
+ new_hdr_len_actual - opt->cipso);
+ /* 2. move the whole IP header to its new place */
+ memmove((unsigned char *)iph + hdr_len_delta, iph, new_hdr_len_actual);
+ /* 3. adjust the skb layout */
+ skb_pull(skb, hdr_len_delta);
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ /* 4. re-fill new padding with IPOPT_END (may now be longer) */
+ memset((unsigned char *)iph + new_hdr_len_actual, IPOPT_END,
+ new_hdr_len - new_hdr_len_actual);
+
+ opt->optlen -= hdr_len_delta;
opt->cipso = 0;
opt->is_changed = 1;
-
+ if (hdr_len_delta != 0) {
+ iph->ihl = new_hdr_len >> 2;
+ iph_set_totlen(iph, skb->len);
+ }
ip_send_check(iph);
return 0;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index f915abff1350..1614593b6d72 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -1,26 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* common UDP/RAW code
* Linux INET implementation
*
* Authors:
* Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/types.h>
#include <linux/module.h>
-#include <linux/ip.h>
#include <linux/in.h>
#include <net/ip.h>
#include <net/sock.h>
#include <net/route.h>
#include <net/tcp_states.h>
+#include <net/sock_reuseport.h>
-int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int __ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
@@ -42,16 +38,17 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
oif = sk->sk_bound_dev_if;
saddr = inet->inet_saddr;
if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
- if (!oif)
- oif = inet->mc_index;
+ if (!oif || netif_index_is_l3_master(sock_net(sk), oif))
+ oif = READ_ONCE(inet->mc_index);
if (!saddr)
- saddr = inet->mc_addr;
+ saddr = READ_ONCE(inet->mc_addr);
+ } else if (!oif) {
+ oif = READ_ONCE(inet->uc_index);
}
fl4 = &inet->cork.fl.u.ip4;
- rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr,
- RT_CONN_FLAGS(sk), oif,
- sk->sk_protocol,
- inet->inet_sport, usin->sin_port, sk);
+ rt = ip_route_connect(fl4, usin->sin_addr.s_addr, saddr, oif,
+ sk->sk_protocol, inet->inet_sport,
+ usin->sin_port, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
if (err == -ENETUNREACH)
@@ -64,18 +61,21 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
err = -EACCES;
goto out;
}
+
+ /* Update addresses before rehashing */
+ inet->inet_daddr = fl4->daddr;
+ inet->inet_dport = usin->sin_port;
if (!inet->inet_saddr)
- inet->inet_saddr = fl4->saddr; /* Update source address */
+ inet->inet_saddr = fl4->saddr;
if (!inet->inet_rcv_saddr) {
inet->inet_rcv_saddr = fl4->saddr;
if (sk->sk_prot->rehash)
sk->sk_prot->rehash(sk);
}
- inet->inet_daddr = fl4->daddr;
- inet->inet_dport = usin->sin_port;
+ reuseport_has_conns_set(sk);
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
- inet->inet_id = jiffies;
+ atomic_set(&inet->inet_id, get_random_u16());
sk_dst_set(sk, &rt->dst);
err = 0;
@@ -84,7 +84,7 @@ out:
}
EXPORT_SYMBOL(__ip4_datagram_connect);
-int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
int res;
@@ -102,8 +102,6 @@ EXPORT_SYMBOL(ip4_datagram_connect);
void ip4_datagram_release_cb(struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
- const struct ip_options_rcu *inet_opt;
- __be32 daddr = inet->inet_daddr;
struct dst_entry *dst;
struct flowi4 fl4;
struct rtable *rt;
@@ -111,18 +109,13 @@ void ip4_datagram_release_cb(struct sock *sk)
rcu_read_lock();
dst = __sk_dst_get(sk);
- if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) {
+ if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, 0)) {
rcu_read_unlock();
return;
}
- inet_opt = rcu_dereference(inet->inet_opt);
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
- rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr,
- inet->inet_saddr, inet->inet_dport,
- inet->inet_sport, sk->sk_protocol,
- RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+ inet_sk_init_flowi4(inet, &fl4);
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
dst = !IS_ERR(rt) ? &rt->dst : NULL;
sk_dst_set(sk, dst);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index ea4bd8a52422..942a887bf089 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NET3 IP device support routines.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Derived from the IP parts of dev.c 1.0.19
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
@@ -50,6 +46,7 @@
#include <linux/notifier.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
+#include "igmp_internal.h"
#include <linux/slab.h>
#include <linux/hash.h>
#ifdef CONFIG_SYSCTL
@@ -66,6 +63,11 @@
#include <net/net_namespace.h>
#include <net/addrconf.h>
+#define IPV6ONLY_FLAGS \
+ (IFA_F_NODAD | IFA_F_OPTIMISTIC | IFA_F_DADFAILED | \
+ IFA_F_HOMEADDRESS | IFA_F_TENTATIVE | \
+ IFA_F_MANAGETEMPADDR | IFA_F_STABLE_PRIVACY)
+
static struct ipv4_devconf ipv4_devconf = {
.data = {
[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
@@ -74,6 +76,7 @@ static struct ipv4_devconf ipv4_devconf = {
[IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
[IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
[IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
+ [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1,
},
};
@@ -86,6 +89,7 @@ static struct ipv4_devconf ipv4_devconf_dflt = {
[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
[IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1] = 10000 /*ms*/,
[IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1] = 1000 /*ms*/,
+ [IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] = 1,
},
};
@@ -100,16 +104,16 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
[IFA_FLAGS] = { .type = NLA_U32 },
[IFA_RT_PRIORITY] = { .type = NLA_U32 },
+ [IFA_TARGET_NETNSID] = { .type = NLA_S32 },
+ [IFA_PROTO] = { .type = NLA_U8 },
};
#define IN4_ADDR_HSIZE_SHIFT 8
#define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT)
-static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE];
-
static u32 inet_addr_hash(const struct net *net, __be32 addr)
{
- u32 val = (__force u32) addr ^ net_hash_mix(net);
+ u32 val = __ipv4_addr_hash(addr, net_hash_mix(net));
return hash_32(val, IN4_ADDR_HSIZE_SHIFT);
}
@@ -119,13 +123,13 @@ static void inet_hash_insert(struct net *net, struct in_ifaddr *ifa)
u32 hash = inet_addr_hash(net, ifa->ifa_local);
ASSERT_RTNL();
- hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]);
+ hlist_add_head_rcu(&ifa->addr_lst, &net->ipv4.inet_addr_lst[hash]);
}
static void inet_hash_remove(struct in_ifaddr *ifa)
{
ASSERT_RTNL();
- hlist_del_init_rcu(&ifa->hash);
+ hlist_del_init_rcu(&ifa->addr_lst);
}
/**
@@ -172,9 +176,8 @@ struct in_ifaddr *inet_lookup_ifaddr_rcu(struct net *net, __be32 addr)
u32 hash = inet_addr_hash(net, addr);
struct in_ifaddr *ifa;
- hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash)
- if (ifa->ifa_local == addr &&
- net_eq(dev_net(ifa->ifa_dev->dev), net))
+ hlist_for_each_entry_rcu(ifa, &net->ipv4.inet_addr_lst[hash], addr_lst)
+ if (ifa->ifa_local == addr)
return ifa;
return NULL;
@@ -184,7 +187,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *, struct nlmsghdr *, u32);
static BLOCKING_NOTIFIER_HEAD(inetaddr_chain);
static BLOCKING_NOTIFIER_HEAD(inetaddr_validator_chain);
-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+static void inet_del_ifa(struct in_device *in_dev,
+ struct in_ifaddr __rcu **ifap,
int destroy);
#ifdef CONFIG_SYSCTL
static int devinet_sysctl_register(struct in_device *idev);
@@ -201,22 +205,45 @@ static void devinet_sysctl_unregister(struct in_device *idev)
/* Locks all the inet devices. */
-static struct in_ifaddr *inet_alloc_ifa(void)
+static struct in_ifaddr *inet_alloc_ifa(struct in_device *in_dev)
{
- return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL);
+ struct in_ifaddr *ifa;
+
+ ifa = kzalloc(sizeof(*ifa), GFP_KERNEL_ACCOUNT);
+ if (!ifa)
+ return NULL;
+
+ in_dev_hold(in_dev);
+ ifa->ifa_dev = in_dev;
+
+ INIT_HLIST_NODE(&ifa->addr_lst);
+
+ return ifa;
}
static void inet_rcu_free_ifa(struct rcu_head *head)
{
struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
- if (ifa->ifa_dev)
- in_dev_put(ifa->ifa_dev);
+
+ in_dev_put(ifa->ifa_dev);
kfree(ifa);
}
static void inet_free_ifa(struct in_ifaddr *ifa)
{
- call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+ /* Our reference to ifa->ifa_dev must be freed ASAP
+ * to release the reference to the netdev the same way.
+ * in_dev_put() -> in_dev_finish_destroy() -> netdev_put()
+ */
+ call_rcu_hurry(&ifa->rcu_head, inet_rcu_free_ifa);
+}
+
+static void in_dev_free_rcu(struct rcu_head *head)
+{
+ struct in_device *idev = container_of(head, struct in_device, rcu_head);
+
+ kfree(rcu_dereference_protected(idev->mc_hash, 1));
+ kfree(idev);
}
void in_dev_finish_destroy(struct in_device *idev)
@@ -225,15 +252,14 @@ void in_dev_finish_destroy(struct in_device *idev)
WARN_ON(idev->ifa_list);
WARN_ON(idev->mc_list);
- kfree(rcu_dereference_protected(idev->mc_hash, 1));
#ifdef NET_REFCNT_DEBUG
pr_debug("%s: %p=%s\n", __func__, idev, dev ? dev->name : "NIL");
#endif
- dev_put(dev);
+ netdev_put(dev, &idev->dev_tracker);
if (!idev->dead)
pr_err("Freeing alive in_device %p\n", idev);
else
- kfree(idev);
+ call_rcu(&idev->rcu_head, in_dev_free_rcu);
}
EXPORT_SYMBOL(in_dev_finish_destroy);
@@ -255,22 +281,25 @@ static struct in_device *inetdev_init(struct net_device *dev)
if (!in_dev->arp_parms)
goto out_kfree;
if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
- dev_disable_lro(dev);
+ netif_disable_lro(dev);
/* Reference in_dev->dev */
- dev_hold(dev);
+ netdev_hold(dev, &in_dev->dev_tracker, GFP_KERNEL);
/* Account for reference dev->ip_ptr (below) */
refcount_set(&in_dev->refcnt, 1);
- err = devinet_sysctl_register(in_dev);
- if (err) {
- in_dev->dead = 1;
- in_dev_put(in_dev);
- in_dev = NULL;
- goto out;
+ if (dev != blackhole_netdev) {
+ err = devinet_sysctl_register(in_dev);
+ if (err) {
+ in_dev->dead = 1;
+ neigh_parms_release(&arp_tbl, in_dev->arp_parms);
+ in_dev_put(in_dev);
+ in_dev = NULL;
+ goto out;
+ }
+ ip_mc_init_dev(in_dev);
+ if (dev->flags & IFF_UP)
+ ip_mc_up(in_dev);
}
- ip_mc_init_dev(in_dev);
- if (dev->flags & IFF_UP)
- ip_mc_up(in_dev);
/* we can receive as soon as ip_ptr is set -- do this last */
rcu_assign_pointer(dev->ip_ptr, in_dev);
@@ -282,16 +311,10 @@ out_kfree:
goto out;
}
-static void in_dev_rcu_put(struct rcu_head *head)
-{
- struct in_device *idev = container_of(head, struct in_device, rcu_head);
- in_dev_put(idev);
-}
-
static void inetdev_destroy(struct in_device *in_dev)
{
- struct in_ifaddr *ifa;
struct net_device *dev;
+ struct in_ifaddr *ifa;
ASSERT_RTNL();
@@ -301,7 +324,7 @@ static void inetdev_destroy(struct in_device *in_dev)
ip_mc_destroy_dev(in_dev);
- while ((ifa = in_dev->ifa_list) != NULL) {
+ while ((ifa = rtnl_dereference(in_dev->ifa_list)) != NULL) {
inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
inet_free_ifa(ifa);
}
@@ -312,35 +335,52 @@ static void inetdev_destroy(struct in_device *in_dev)
neigh_parms_release(&arp_tbl, in_dev->arp_parms);
arp_ifdown(dev);
- call_rcu(&in_dev->rcu_head, in_dev_rcu_put);
+ in_dev_put(in_dev);
+}
+
+static int __init inet_blackhole_dev_init(void)
+{
+ struct in_device *in_dev;
+
+ rtnl_lock();
+ in_dev = inetdev_init(blackhole_netdev);
+ rtnl_unlock();
+
+ return PTR_ERR_OR_ZERO(in_dev);
}
+late_initcall(inet_blackhole_dev_init);
int inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b)
{
+ const struct in_ifaddr *ifa;
+
rcu_read_lock();
- for_primary_ifa(in_dev) {
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (inet_ifa_match(a, ifa)) {
if (!b || inet_ifa_match(b, ifa)) {
rcu_read_unlock();
return 1;
}
}
- } endfor_ifa(in_dev);
+ }
rcu_read_unlock();
return 0;
}
-static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
- int destroy, struct nlmsghdr *nlh, u32 portid)
+static void __inet_del_ifa(struct in_device *in_dev,
+ struct in_ifaddr __rcu **ifap,
+ int destroy, struct nlmsghdr *nlh, u32 portid)
{
struct in_ifaddr *promote = NULL;
- struct in_ifaddr *ifa, *ifa1 = *ifap;
- struct in_ifaddr *last_prim = in_dev->ifa_list;
+ struct in_ifaddr *ifa, *ifa1;
+ struct in_ifaddr __rcu **last_prim;
struct in_ifaddr *prev_prom = NULL;
int do_promote = IN_DEV_PROMOTE_SECONDARIES(in_dev);
ASSERT_RTNL();
+ ifa1 = rtnl_dereference(*ifap);
+ last_prim = ifap;
if (in_dev->dead)
goto no_promotions;
@@ -349,12 +389,12 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
**/
if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
- struct in_ifaddr **ifap1 = &ifa1->ifa_next;
+ struct in_ifaddr __rcu **ifap1 = &ifa1->ifa_next;
- while ((ifa = *ifap1) != NULL) {
+ while ((ifa = rtnl_dereference(*ifap1)) != NULL) {
if (!(ifa->ifa_flags & IFA_F_SECONDARY) &&
ifa1->ifa_scope <= ifa->ifa_scope)
- last_prim = ifa;
+ last_prim = &ifa->ifa_next;
if (!(ifa->ifa_flags & IFA_F_SECONDARY) ||
ifa1->ifa_mask != ifa->ifa_mask ||
@@ -384,7 +424,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
* and later to add them back with new prefsrc. Do this
* while all addresses are on the device list.
*/
- for (ifa = promote; ifa; ifa = ifa->ifa_next) {
+ for (ifa = promote; ifa; ifa = rtnl_dereference(ifa->ifa_next)) {
if (ifa1->ifa_mask == ifa->ifa_mask &&
inet_ifa_match(ifa1->ifa_address, ifa))
fib_del_ifaddr(ifa, ifa1);
@@ -410,19 +450,25 @@ no_promotions:
blocking_notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
if (promote) {
- struct in_ifaddr *next_sec = promote->ifa_next;
+ struct in_ifaddr *next_sec;
+ next_sec = rtnl_dereference(promote->ifa_next);
if (prev_prom) {
- prev_prom->ifa_next = promote->ifa_next;
- promote->ifa_next = last_prim->ifa_next;
- last_prim->ifa_next = promote;
+ struct in_ifaddr *last_sec;
+
+ rcu_assign_pointer(prev_prom->ifa_next, next_sec);
+
+ last_sec = rtnl_dereference(*last_prim);
+ rcu_assign_pointer(promote->ifa_next, last_sec);
+ rcu_assign_pointer(*last_prim, promote);
}
promote->ifa_flags &= ~IFA_F_SECONDARY;
rtmsg_ifa(RTM_NEWADDR, promote, nlh, portid);
blocking_notifier_call_chain(&inetaddr_chain,
NETDEV_UP, promote);
- for (ifa = next_sec; ifa; ifa = ifa->ifa_next) {
+ for (ifa = next_sec; ifa;
+ ifa = rtnl_dereference(ifa->ifa_next)) {
if (ifa1->ifa_mask != ifa->ifa_mask ||
!inet_ifa_match(ifa1->ifa_address, ifa))
continue;
@@ -434,36 +480,35 @@ no_promotions:
inet_free_ifa(ifa1);
}
-static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+static void inet_del_ifa(struct in_device *in_dev,
+ struct in_ifaddr __rcu **ifap,
int destroy)
{
__inet_del_ifa(in_dev, ifap, destroy, NULL, 0);
}
-static void check_lifetime(struct work_struct *work);
-
-static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime);
-
static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
u32 portid, struct netlink_ext_ack *extack)
{
+ struct in_ifaddr __rcu **last_primary, **ifap;
struct in_device *in_dev = ifa->ifa_dev;
- struct in_ifaddr *ifa1, **ifap, **last_primary;
+ struct net *net = dev_net(in_dev->dev);
struct in_validator_info ivi;
+ struct in_ifaddr *ifa1;
int ret;
ASSERT_RTNL();
- if (!ifa->ifa_local) {
- inet_free_ifa(ifa);
- return 0;
- }
-
ifa->ifa_flags &= ~IFA_F_SECONDARY;
last_primary = &in_dev->ifa_list;
- for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
- ifap = &ifa1->ifa_next) {
+ /* Don't set IPv6 only flags to IPv4 addresses */
+ ifa->ifa_flags &= ~IPV6ONLY_FLAGS;
+
+ ifap = &in_dev->ifa_list;
+ ifa1 = rtnl_dereference(*ifap);
+
+ while (ifa1) {
if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
ifa->ifa_scope <= ifa1->ifa_scope)
last_primary = &ifa1->ifa_next;
@@ -474,11 +519,15 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
return -EEXIST;
}
if (ifa1->ifa_scope != ifa->ifa_scope) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid scope value");
inet_free_ifa(ifa);
return -EINVAL;
}
ifa->ifa_flags |= IFA_F_SECONDARY;
}
+
+ ifap = &ifa1->ifa_next;
+ ifa1 = rtnl_dereference(*ifap);
}
/* Allow any devices that wish to register ifaddr validtors to weigh
@@ -499,18 +548,16 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
return ret;
}
- if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
- prandom_seed((__force u32) ifa->ifa_local);
+ if (!(ifa->ifa_flags & IFA_F_SECONDARY))
ifap = last_primary;
- }
- ifa->ifa_next = *ifap;
- *ifap = ifa;
+ rcu_assign_pointer(ifa->ifa_next, *ifap);
+ rcu_assign_pointer(*ifap, ifa);
inet_hash_insert(dev_net(in_dev->dev), ifa);
- cancel_delayed_work(&check_lifetime_work);
- queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
+ cancel_delayed_work(&net->ipv4.addr_chk_work);
+ queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0);
/* Send message first, then call notifier.
Notifier will trigger FIB update, so that
@@ -523,26 +570,21 @@ static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh,
static int inet_insert_ifa(struct in_ifaddr *ifa)
{
+ if (!ifa->ifa_local) {
+ inet_free_ifa(ifa);
+ return 0;
+ }
+
return __inet_insert_ifa(ifa, NULL, 0, NULL);
}
static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
{
- struct in_device *in_dev = __in_dev_get_rtnl(dev);
-
- ASSERT_RTNL();
+ struct in_device *in_dev = __in_dev_get_rtnl_net(dev);
- if (!in_dev) {
- inet_free_ifa(ifa);
- return -ENOBUFS;
- }
ipv4_devconf_setall(in_dev);
neigh_parms_data_state_setall(in_dev->arp_parms);
- if (ifa->ifa_dev != in_dev) {
- WARN_ON(ifa->ifa_dev);
- in_dev_hold(in_dev);
- ifa->ifa_dev = in_dev;
- }
+
if (ipv4_is_loopback(ifa->ifa_local))
ifa->ifa_scope = RT_SCOPE_HOST;
return inet_insert_ifa(ifa);
@@ -570,24 +612,29 @@ EXPORT_SYMBOL(inetdev_by_index);
struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, __be32 prefix,
__be32 mask)
{
+ struct in_ifaddr *ifa;
+
ASSERT_RTNL();
- for_primary_ifa(in_dev) {
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
return ifa;
- } endfor_ifa(in_dev);
+ }
return NULL;
}
-static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa)
+static int ip_mc_autojoin_config(struct net *net, bool join,
+ const struct in_ifaddr *ifa)
{
+#if defined(CONFIG_IP_MULTICAST)
struct ip_mreqn mreq = {
.imr_multiaddr.s_addr = ifa->ifa_address,
.imr_ifindex = ifa->ifa_dev->dev->ifindex,
};
+ struct sock *sk = net->ipv4.mc_autojoin_sk;
int ret;
- ASSERT_RTNL();
+ ASSERT_RTNL_NET(net);
lock_sock(sk);
if (join)
@@ -597,33 +644,40 @@ static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa)
release_sock(sk);
return ret;
+#else
+ return -EOPNOTSUPP;
+#endif
}
static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
+ struct in_ifaddr __rcu **ifap;
struct nlattr *tb[IFA_MAX+1];
struct in_device *in_dev;
struct ifaddrmsg *ifm;
- struct in_ifaddr *ifa, **ifap;
- int err = -EINVAL;
-
- ASSERT_RTNL();
+ struct in_ifaddr *ifa;
+ int err;
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy,
- extack);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv4_policy, extack);
if (err < 0)
- goto errout;
+ goto out;
ifm = nlmsg_data(nlh);
+
+ rtnl_net_lock(net);
+
in_dev = inetdev_by_index(net, ifm->ifa_index);
if (!in_dev) {
+ NL_SET_ERR_MSG(extack, "ipv4: Device not found");
err = -ENODEV;
- goto errout;
+ goto unlock;
}
- for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ for (ifap = &in_dev->ifa_list;
+ (ifa = rtnl_net_dereference(net, *ifap)) != NULL;
ifap = &ifa->ifa_next) {
if (tb[IFA_LOCAL] &&
ifa->ifa_local != nla_get_in_addr(tb[IFA_LOCAL]))
@@ -638,68 +692,77 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
continue;
if (ipv4_is_multicast(ifa->ifa_address))
- ip_mc_config(net->ipv4.mc_autojoin_sk, false, ifa);
+ ip_mc_autojoin_config(net, false, ifa);
+
__inet_del_ifa(in_dev, ifap, 1, nlh, NETLINK_CB(skb).portid);
- return 0;
+ goto unlock;
}
+ NL_SET_ERR_MSG(extack, "ipv4: Address not found");
err = -EADDRNOTAVAIL;
-errout:
+unlock:
+ rtnl_net_unlock(net);
+out:
return err;
}
-#define INFINITY_LIFE_TIME 0xFFFFFFFF
-
static void check_lifetime(struct work_struct *work)
{
unsigned long now, next, next_sec, next_sched;
struct in_ifaddr *ifa;
struct hlist_node *n;
+ struct net *net;
int i;
+ net = container_of(to_delayed_work(work), struct net, ipv4.addr_chk_work);
now = jiffies;
next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
for (i = 0; i < IN4_ADDR_HSIZE; i++) {
+ struct hlist_head *head = &net->ipv4.inet_addr_lst[i];
bool change_needed = false;
rcu_read_lock();
- hlist_for_each_entry_rcu(ifa, &inet_addr_lst[i], hash) {
- unsigned long age;
-
- if (ifa->ifa_flags & IFA_F_PERMANENT)
+ hlist_for_each_entry_rcu(ifa, head, addr_lst) {
+ unsigned long age, tstamp;
+ u32 preferred_lft;
+ u32 valid_lft;
+ u32 flags;
+
+ flags = READ_ONCE(ifa->ifa_flags);
+ if (flags & IFA_F_PERMANENT)
continue;
+ preferred_lft = READ_ONCE(ifa->ifa_preferred_lft);
+ valid_lft = READ_ONCE(ifa->ifa_valid_lft);
+ tstamp = READ_ONCE(ifa->ifa_tstamp);
/* We try to batch several events at once. */
- age = (now - ifa->ifa_tstamp +
+ age = (now - tstamp +
ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
- if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
- age >= ifa->ifa_valid_lft) {
+ if (valid_lft != INFINITY_LIFE_TIME &&
+ age >= valid_lft) {
change_needed = true;
- } else if (ifa->ifa_preferred_lft ==
+ } else if (preferred_lft ==
INFINITY_LIFE_TIME) {
continue;
- } else if (age >= ifa->ifa_preferred_lft) {
- if (time_before(ifa->ifa_tstamp +
- ifa->ifa_valid_lft * HZ, next))
- next = ifa->ifa_tstamp +
- ifa->ifa_valid_lft * HZ;
+ } else if (age >= preferred_lft) {
+ if (time_before(tstamp + valid_lft * HZ, next))
+ next = tstamp + valid_lft * HZ;
- if (!(ifa->ifa_flags & IFA_F_DEPRECATED))
+ if (!(flags & IFA_F_DEPRECATED))
change_needed = true;
- } else if (time_before(ifa->ifa_tstamp +
- ifa->ifa_preferred_lft * HZ,
+ } else if (time_before(tstamp + preferred_lft * HZ,
next)) {
- next = ifa->ifa_tstamp +
- ifa->ifa_preferred_lft * HZ;
+ next = tstamp + preferred_lft * HZ;
}
}
rcu_read_unlock();
if (!change_needed)
continue;
- rtnl_lock();
- hlist_for_each_entry_safe(ifa, n, &inet_addr_lst[i], hash) {
+
+ rtnl_net_lock(net);
+ hlist_for_each_entry_safe(ifa, n, head, addr_lst) {
unsigned long age;
if (ifa->ifa_flags & IFA_F_PERMANENT)
@@ -711,15 +774,19 @@ static void check_lifetime(struct work_struct *work)
if (ifa->ifa_valid_lft != INFINITY_LIFE_TIME &&
age >= ifa->ifa_valid_lft) {
- struct in_ifaddr **ifap;
+ struct in_ifaddr __rcu **ifap;
+ struct in_ifaddr *tmp;
- for (ifap = &ifa->ifa_dev->ifa_list;
- *ifap != NULL; ifap = &(*ifap)->ifa_next) {
- if (*ifap == ifa) {
+ ifap = &ifa->ifa_dev->ifa_list;
+ tmp = rtnl_net_dereference(net, *ifap);
+ while (tmp) {
+ if (tmp == ifa) {
inet_del_ifa(ifa->ifa_dev,
ifap, 1);
break;
}
+ ifap = &tmp->ifa_next;
+ tmp = rtnl_net_dereference(net, *ifap);
}
} else if (ifa->ifa_preferred_lft !=
INFINITY_LIFE_TIME &&
@@ -729,7 +796,7 @@ static void check_lifetime(struct work_struct *work)
rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
}
}
- rtnl_unlock();
+ rtnl_net_unlock(net);
}
next_sec = round_jiffies_up(next);
@@ -744,65 +811,97 @@ static void check_lifetime(struct work_struct *work)
if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX))
next_sched = now + ADDRCONF_TIMER_FUZZ_MAX;
- queue_delayed_work(system_power_efficient_wq, &check_lifetime_work,
- next_sched - now);
+ queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work,
+ next_sched - now);
}
static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft,
__u32 prefered_lft)
{
unsigned long timeout;
+ u32 flags;
- ifa->ifa_flags &= ~(IFA_F_PERMANENT | IFA_F_DEPRECATED);
+ flags = ifa->ifa_flags & ~(IFA_F_PERMANENT | IFA_F_DEPRECATED);
timeout = addrconf_timeout_fixup(valid_lft, HZ);
if (addrconf_finite_timeout(timeout))
- ifa->ifa_valid_lft = timeout;
+ WRITE_ONCE(ifa->ifa_valid_lft, timeout);
else
- ifa->ifa_flags |= IFA_F_PERMANENT;
+ flags |= IFA_F_PERMANENT;
timeout = addrconf_timeout_fixup(prefered_lft, HZ);
if (addrconf_finite_timeout(timeout)) {
if (timeout == 0)
- ifa->ifa_flags |= IFA_F_DEPRECATED;
- ifa->ifa_preferred_lft = timeout;
+ flags |= IFA_F_DEPRECATED;
+ WRITE_ONCE(ifa->ifa_preferred_lft, timeout);
}
- ifa->ifa_tstamp = jiffies;
+ WRITE_ONCE(ifa->ifa_flags, flags);
+ WRITE_ONCE(ifa->ifa_tstamp, jiffies);
if (!ifa->ifa_cstamp)
- ifa->ifa_cstamp = ifa->ifa_tstamp;
+ WRITE_ONCE(ifa->ifa_cstamp, ifa->ifa_tstamp);
}
-static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
- __u32 *pvalid_lft, __u32 *pprefered_lft)
+static int inet_validate_rtm(struct nlmsghdr *nlh, struct nlattr **tb,
+ struct netlink_ext_ack *extack,
+ __u32 *valid_lft, __u32 *prefered_lft)
{
- struct nlattr *tb[IFA_MAX+1];
- struct in_ifaddr *ifa;
- struct ifaddrmsg *ifm;
- struct net_device *dev;
- struct in_device *in_dev;
+ struct ifaddrmsg *ifm = nlmsg_data(nlh);
int err;
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy,
- NULL);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv4_policy, extack);
if (err < 0)
- goto errout;
+ return err;
- ifm = nlmsg_data(nlh);
- err = -EINVAL;
- if (ifm->ifa_prefixlen > 32 || !tb[IFA_LOCAL])
- goto errout;
+ if (ifm->ifa_prefixlen > 32) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid prefix length");
+ return -EINVAL;
+ }
+
+ if (!tb[IFA_LOCAL]) {
+ NL_SET_ERR_MSG(extack, "ipv4: Local address is not supplied");
+ return -EINVAL;
+ }
+
+ if (tb[IFA_CACHEINFO]) {
+ struct ifa_cacheinfo *ci;
+
+ ci = nla_data(tb[IFA_CACHEINFO]);
+ if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) {
+ NL_SET_ERR_MSG(extack, "ipv4: address lifetime invalid");
+ return -EINVAL;
+ }
+
+ *valid_lft = ci->ifa_valid;
+ *prefered_lft = ci->ifa_prefered;
+ }
+
+ return 0;
+}
+
+static struct in_ifaddr *inet_rtm_to_ifa(struct net *net, struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct ifaddrmsg *ifm = nlmsg_data(nlh);
+ struct in_device *in_dev;
+ struct net_device *dev;
+ struct in_ifaddr *ifa;
+ int err;
dev = __dev_get_by_index(net, ifm->ifa_index);
err = -ENODEV;
- if (!dev)
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "ipv4: Device not found");
goto errout;
+ }
- in_dev = __in_dev_get_rtnl(dev);
+ in_dev = __in_dev_get_rtnl_net(dev);
err = -ENOBUFS;
if (!in_dev)
goto errout;
- ifa = inet_alloc_ifa();
+ ifa = inet_alloc_ifa(in_dev);
if (!ifa)
/*
* A potential indev allocation can be left alive, it stays
@@ -812,19 +911,14 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
ipv4_devconf_setall(in_dev);
neigh_parms_data_state_setall(in_dev->arp_parms);
- in_dev_hold(in_dev);
if (!tb[IFA_ADDRESS])
tb[IFA_ADDRESS] = tb[IFA_LOCAL];
- INIT_HLIST_NODE(&ifa->hash);
ifa->ifa_prefixlen = ifm->ifa_prefixlen;
ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
- ifa->ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) :
- ifm->ifa_flags;
+ ifa->ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags);
ifa->ifa_scope = ifm->ifa_scope;
- ifa->ifa_dev = in_dev;
-
ifa->ifa_local = nla_get_in_addr(tb[IFA_LOCAL]);
ifa->ifa_address = nla_get_in_addr(tb[IFA_ADDRESS]);
@@ -832,91 +926,91 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]);
if (tb[IFA_LABEL])
- nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
+ nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ);
else
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
if (tb[IFA_RT_PRIORITY])
ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
- if (tb[IFA_CACHEINFO]) {
- struct ifa_cacheinfo *ci;
-
- ci = nla_data(tb[IFA_CACHEINFO]);
- if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) {
- err = -EINVAL;
- goto errout_free;
- }
- *pvalid_lft = ci->ifa_valid;
- *pprefered_lft = ci->ifa_prefered;
- }
+ if (tb[IFA_PROTO])
+ ifa->ifa_proto = nla_get_u8(tb[IFA_PROTO]);
return ifa;
-errout_free:
- inet_free_ifa(ifa);
errout:
return ERR_PTR(err);
}
-static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa)
+static struct in_ifaddr *find_matching_ifa(struct net *net, struct in_ifaddr *ifa)
{
struct in_device *in_dev = ifa->ifa_dev;
- struct in_ifaddr *ifa1, **ifap;
-
- if (!ifa->ifa_local)
- return NULL;
+ struct in_ifaddr *ifa1;
- for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
- ifap = &ifa1->ifa_next) {
+ in_dev_for_each_ifa_rtnl_net(net, ifa1, in_dev) {
if (ifa1->ifa_mask == ifa->ifa_mask &&
inet_ifa_match(ifa1->ifa_address, ifa) &&
ifa1->ifa_local == ifa->ifa_local)
return ifa1;
}
+
return NULL;
}
static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
+ __u32 prefered_lft = INFINITY_LIFE_TIME;
+ __u32 valid_lft = INFINITY_LIFE_TIME;
struct net *net = sock_net(skb->sk);
- struct in_ifaddr *ifa;
struct in_ifaddr *ifa_existing;
- __u32 valid_lft = INFINITY_LIFE_TIME;
- __u32 prefered_lft = INFINITY_LIFE_TIME;
+ struct nlattr *tb[IFA_MAX + 1];
+ struct in_ifaddr *ifa;
+ int ret;
- ASSERT_RTNL();
+ ret = inet_validate_rtm(nlh, tb, extack, &valid_lft, &prefered_lft);
+ if (ret < 0)
+ return ret;
- ifa = rtm_to_ifaddr(net, nlh, &valid_lft, &prefered_lft);
- if (IS_ERR(ifa))
- return PTR_ERR(ifa);
+ if (!nla_get_in_addr(tb[IFA_LOCAL]))
+ return 0;
+
+ rtnl_net_lock(net);
+
+ ifa = inet_rtm_to_ifa(net, nlh, tb, extack);
+ if (IS_ERR(ifa)) {
+ ret = PTR_ERR(ifa);
+ goto unlock;
+ }
- ifa_existing = find_matching_ifa(ifa);
+ ifa_existing = find_matching_ifa(net, ifa);
if (!ifa_existing) {
/* It would be best to check for !NLM_F_CREATE here but
* userspace already relies on not having to provide this.
*/
set_ifa_lifetime(ifa, valid_lft, prefered_lft);
if (ifa->ifa_flags & IFA_F_MCAUTOJOIN) {
- int ret = ip_mc_config(net->ipv4.mc_autojoin_sk,
- true, ifa);
-
+ ret = ip_mc_autojoin_config(net, true, ifa);
if (ret < 0) {
+ NL_SET_ERR_MSG(extack, "ipv4: Multicast auto join failed");
inet_free_ifa(ifa);
- return ret;
+ goto unlock;
}
}
- return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid,
- extack);
+
+ ret = __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid, extack);
} else {
u32 new_metric = ifa->ifa_rt_priority;
+ u8 new_proto = ifa->ifa_proto;
inet_free_ifa(ifa);
if (nlh->nlmsg_flags & NLM_F_EXCL ||
- !(nlh->nlmsg_flags & NLM_F_REPLACE))
- return -EEXIST;
+ !(nlh->nlmsg_flags & NLM_F_REPLACE)) {
+ NL_SET_ERR_MSG(extack, "ipv4: Address already assigned");
+ ret = -EEXIST;
+ goto unlock;
+ }
ifa = ifa_existing;
if (ifa->ifa_rt_priority != new_metric) {
@@ -924,13 +1018,19 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
ifa->ifa_rt_priority = new_metric;
}
+ ifa->ifa_proto = new_proto;
+
set_ifa_lifetime(ifa, valid_lft, prefered_lft);
- cancel_delayed_work(&check_lifetime_work);
+ cancel_delayed_work(&net->ipv4.addr_chk_work);
queue_delayed_work(system_power_efficient_wq,
- &check_lifetime_work, 0);
+ &net->ipv4.addr_chk_work, 0);
rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid);
}
- return 0;
+
+unlock:
+ rtnl_net_unlock(net);
+
+ return ret;
}
/*
@@ -941,17 +1041,18 @@ static int inet_abc_len(__be32 addr)
{
int rc = -1; /* Something else, probably a multicast. */
- if (ipv4_is_zeronet(addr))
+ if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
rc = 0;
else {
__u32 haddr = ntohl(addr);
-
if (IN_CLASSA(haddr))
rc = 8;
else if (IN_CLASSB(haddr))
rc = 16;
else if (IN_CLASSC(haddr))
rc = 24;
+ else if (IN_CLASSE(haddr))
+ rc = 32;
}
return rc;
@@ -962,8 +1063,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
{
struct sockaddr_in sin_orig;
struct sockaddr_in *sin = (struct sockaddr_in *)&ifr->ifr_addr;
+ struct in_ifaddr __rcu **ifap = NULL;
struct in_device *in_dev;
- struct in_ifaddr **ifap = NULL;
struct in_ifaddr *ifa = NULL;
struct net_device *dev;
char *colon;
@@ -1016,7 +1117,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
goto out;
}
- rtnl_lock();
+ rtnl_net_lock(net);
ret = -ENODEV;
dev = __dev_get_by_name(net, ifr->ifr_name);
@@ -1026,7 +1127,7 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
if (colon)
*colon = ':';
- in_dev = __in_dev_get_rtnl(dev);
+ in_dev = __in_dev_get_rtnl_net(dev);
if (in_dev) {
if (tryaddrmatch) {
/* Matthias Andree */
@@ -1034,7 +1135,9 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
/* note: we only do this for a limited set of ioctls
and only if the original address family was AF_INET.
This is checked above. */
- for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+
+ for (ifap = &in_dev->ifa_list;
+ (ifa = rtnl_net_dereference(net, *ifap)) != NULL;
ifap = &ifa->ifa_next) {
if (!strcmp(ifr->ifr_name, ifa->ifa_label) &&
sin_orig.sin_addr.s_addr ==
@@ -1047,7 +1150,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
4.3BSD-style and passed in junk so we fall back to
comparing just the label */
if (!ifa) {
- for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+ for (ifap = &in_dev->ifa_list;
+ (ifa = rtnl_net_dereference(net, *ifap)) != NULL;
ifap = &ifa->ifa_next)
if (!strcmp(ifr->ifr_name, ifa->ifa_label))
break;
@@ -1089,7 +1193,10 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
inet_del_ifa(in_dev, ifap, 1);
break;
}
- ret = dev_change_flags(dev, ifr->ifr_flags);
+
+ /* NETDEV_UP/DOWN/CHANGE could touch a peer dev */
+ ASSERT_RTNL();
+ ret = dev_change_flags(dev, ifr->ifr_flags, NULL);
break;
case SIOCSIFADDR: /* Set interface address (and family) */
@@ -1099,10 +1206,12 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
if (!ifa) {
ret = -ENOBUFS;
- ifa = inet_alloc_ifa();
+ if (!in_dev)
+ break;
+ ifa = inet_alloc_ifa(in_dev);
if (!ifa)
break;
- INIT_HLIST_NODE(&ifa->hash);
+
if (colon)
memcpy(ifa->ifa_label, ifr->ifr_name, IFNAMSIZ);
else
@@ -1188,15 +1297,15 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
break;
}
done:
- rtnl_unlock();
+ rtnl_net_unlock(net);
out:
return ret;
}
-static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size)
+int inet_gifconf(struct net_device *dev, char __user *buf, int len, int size)
{
- struct in_device *in_dev = __in_dev_get_rtnl(dev);
- struct in_ifaddr *ifa;
+ struct in_device *in_dev = __in_dev_get_rtnl_net(dev);
+ const struct in_ifaddr *ifa;
struct ifreq ifr;
int done = 0;
@@ -1206,7 +1315,7 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len, int s
if (!in_dev)
goto out;
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ in_dev_for_each_ifa_rtnl_net(dev_net(dev), ifa, in_dev) {
if (!buf) {
done += size;
continue;
@@ -1234,29 +1343,41 @@ out:
static __be32 in_dev_select_addr(const struct in_device *in_dev,
int scope)
{
- for_primary_ifa(in_dev) {
+ const struct in_ifaddr *ifa;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY)
+ continue;
if (ifa->ifa_scope != RT_SCOPE_LINK &&
ifa->ifa_scope <= scope)
return ifa->ifa_local;
- } endfor_ifa(in_dev);
+ }
return 0;
}
__be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
{
+ const struct in_ifaddr *ifa;
__be32 addr = 0;
+ unsigned char localnet_scope = RT_SCOPE_HOST;
struct in_device *in_dev;
- struct net *net = dev_net(dev);
+ struct net *net;
int master_idx;
rcu_read_lock();
+ net = dev_net_rcu(dev);
in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
goto no_in_dev;
- for_primary_ifa(in_dev) {
- if (ifa->ifa_scope > scope)
+ if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev)))
+ localnet_scope = RT_SCOPE_LINK;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY)
+ continue;
+ if (min(ifa->ifa_scope, localnet_scope) > scope)
continue;
if (!dst || inet_ifa_match(dst, ifa)) {
addr = ifa->ifa_local;
@@ -1264,7 +1385,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
}
if (!addr)
addr = ifa->ifa_local;
- } endfor_ifa(in_dev);
+ }
if (addr)
goto out_unlock;
@@ -1309,13 +1430,20 @@ EXPORT_SYMBOL(inet_select_addr);
static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
__be32 local, int scope)
{
- int same = 0;
+ unsigned char localnet_scope = RT_SCOPE_HOST;
+ const struct in_ifaddr *ifa;
__be32 addr = 0;
+ int same = 0;
+
+ if (unlikely(IN_DEV_ROUTE_LOCALNET(in_dev)))
+ localnet_scope = RT_SCOPE_LINK;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ unsigned char min_scope = min(ifa->ifa_scope, localnet_scope);
- for_ifa(in_dev) {
if (!addr &&
(local == ifa->ifa_local || !local) &&
- ifa->ifa_scope <= scope) {
+ min_scope <= scope) {
addr = ifa->ifa_local;
if (same)
break;
@@ -1330,7 +1458,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
if (inet_ifa_match(addr, ifa))
break;
/* No, then can we use new local src? */
- if (ifa->ifa_scope <= scope) {
+ if (min_scope <= scope) {
addr = ifa->ifa_local;
break;
}
@@ -1338,7 +1466,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
same = 0;
}
}
- } endfor_ifa(in_dev);
+ }
return same ? addr : 0;
}
@@ -1412,7 +1540,7 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
struct in_ifaddr *ifa;
int named = 0;
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
char old[IFNAMSIZ], *dot;
memcpy(old, ifa->ifa_label, IFNAMSIZ);
@@ -1433,19 +1561,13 @@ skip:
}
}
-static bool inetdev_valid_mtu(unsigned int mtu)
-{
- return mtu >= IPV4_MIN_MTU;
-}
-
static void inetdev_send_gratuitous_arp(struct net_device *dev,
struct in_device *in_dev)
{
- struct in_ifaddr *ifa;
+ const struct in_ifaddr *ifa;
- for (ifa = in_dev->ifa_list; ifa;
- ifa = ifa->ifa_next) {
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
arp_send(ARPOP_REQUEST, ETH_P_ARP,
ifa->ifa_local, dev,
ifa->ifa_local, NULL,
@@ -1489,16 +1611,13 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
if (!inetdev_valid_mtu(dev->mtu))
break;
if (dev->flags & IFF_LOOPBACK) {
- struct in_ifaddr *ifa = inet_alloc_ifa();
+ struct in_ifaddr *ifa = inet_alloc_ifa(in_dev);
if (ifa) {
- INIT_HLIST_NODE(&ifa->hash);
ifa->ifa_local =
ifa->ifa_address = htonl(INADDR_LOOPBACK);
ifa->ifa_prefixlen = 8;
ifa->ifa_mask = inet_make_mask(8);
- in_dev_hold(in_dev);
- ifa->ifa_dev = in_dev;
ifa->ifa_scope = RT_SCOPE_HOST;
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
set_ifa_lifetime(ifa, INFINITY_LIFE_TIME,
@@ -1509,11 +1628,11 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
}
}
ip_mc_up(in_dev);
- /* fall through */
+ fallthrough;
case NETDEV_CHANGEADDR:
if (!IN_DEV_ARP_NOTIFY(in_dev))
break;
- /* fall through */
+ fallthrough;
case NETDEV_NOTIFY_PEERS:
/* Send gratuitous ARP to notify of link change */
inetdev_send_gratuitous_arp(dev, in_dev);
@@ -1531,7 +1650,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
if (inetdev_valid_mtu(dev->mtu))
break;
/* disable IP when MTU is not enough */
- /* fall through */
+ fallthrough;
case NETDEV_UNREGISTER:
inetdev_destroy(in_dev);
break;
@@ -1561,6 +1680,7 @@ static size_t inet_nlmsg_size(void)
+ nla_total_size(4) /* IFA_BROADCAST */
+ nla_total_size(IFNAMSIZ) /* IFA_LABEL */
+ nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(1) /* IFA_PROTO */
+ nla_total_size(4) /* IFA_RT_PRIORITY */
+ nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
}
@@ -1583,29 +1703,43 @@ static int put_cacheinfo(struct sk_buff *skb, unsigned long cstamp,
return nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci);
}
-static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
- u32 portid, u32 seq, int event, unsigned int flags)
+static int inet_fill_ifaddr(struct sk_buff *skb, const struct in_ifaddr *ifa,
+ struct inet_fill_args *args)
{
struct ifaddrmsg *ifm;
struct nlmsghdr *nlh;
+ unsigned long tstamp;
u32 preferred, valid;
+ u32 flags;
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(*ifm), flags);
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->event, sizeof(*ifm),
+ args->flags);
if (!nlh)
return -EMSGSIZE;
ifm = nlmsg_data(nlh);
ifm->ifa_family = AF_INET;
ifm->ifa_prefixlen = ifa->ifa_prefixlen;
- ifm->ifa_flags = ifa->ifa_flags;
+
+ flags = READ_ONCE(ifa->ifa_flags);
+ /* Warning : ifm->ifa_flags is an __u8, it holds only 8 bits.
+ * The 32bit value is given in IFA_FLAGS attribute.
+ */
+ ifm->ifa_flags = (__u8)flags;
+
ifm->ifa_scope = ifa->ifa_scope;
ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
- if (!(ifm->ifa_flags & IFA_F_PERMANENT)) {
- preferred = ifa->ifa_preferred_lft;
- valid = ifa->ifa_valid_lft;
+ if (args->netnsid >= 0 &&
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
+ goto nla_put_failure;
+
+ tstamp = READ_ONCE(ifa->ifa_tstamp);
+ if (!(flags & IFA_F_PERMANENT)) {
+ preferred = READ_ONCE(ifa->ifa_preferred_lft);
+ valid = READ_ONCE(ifa->ifa_valid_lft);
if (preferred != INFINITY_LIFE_TIME) {
- long tval = (jiffies - ifa->ifa_tstamp) / HZ;
+ long tval = (jiffies - tstamp) / HZ;
if (preferred > tval)
preferred -= tval;
@@ -1630,10 +1764,12 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
nla_put_in_addr(skb, IFA_BROADCAST, ifa->ifa_broadcast)) ||
(ifa->ifa_label[0] &&
nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
- nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
+ (ifa->ifa_proto &&
+ nla_put_u8(skb, IFA_PROTO, ifa->ifa_proto)) ||
+ nla_put_u32(skb, IFA_FLAGS, flags) ||
(ifa->ifa_rt_priority &&
nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) ||
- put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,
+ put_cacheinfo(skb, READ_ONCE(ifa->ifa_cstamp), tstamp,
preferred, valid))
goto nla_put_failure;
@@ -1645,68 +1781,232 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
+ struct inet_fill_args *fillargs,
+ struct net **tgt_net, struct sock *sk,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[IFA_MAX+1];
+ struct ifaddrmsg *ifm;
+ int err, i;
+
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for address dump request");
+ return -EINVAL;
+ }
+
+ if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for address dump request");
+ return -EINVAL;
+ }
+
+ fillargs->ifindex = ifm->ifa_index;
+ if (fillargs->ifindex) {
+ cb->answer_flags |= NLM_F_DUMP_FILTERED;
+ fillargs->flags |= NLM_F_DUMP_FILTERED;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv4_policy, extack);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= IFA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ if (i == IFA_TARGET_NETNSID) {
+ struct net *net;
+
+ fillargs->netnsid = nla_get_s32(tb[i]);
+
+ net = rtnl_get_net_ns_capable(sk, fillargs->netnsid);
+ if (IS_ERR(net)) {
+ fillargs->netnsid = -1;
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id");
+ return PTR_ERR(net);
+ }
+ *tgt_net = net;
+ } else {
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int in_dev_dump_ifmcaddr(struct in_device *in_dev, struct sk_buff *skb,
+ struct netlink_callback *cb, int *s_ip_idx,
+ struct inet_fill_args *fillargs)
+{
+ struct ip_mc_list *im;
+ int ip_idx = 0;
+ int err;
+
+ for (im = rcu_dereference(in_dev->mc_list);
+ im;
+ im = rcu_dereference(im->next_rcu)) {
+ if (ip_idx < *s_ip_idx) {
+ ip_idx++;
+ continue;
+ }
+ err = inet_fill_ifmcaddr(skb, in_dev->dev, im, fillargs);
+ if (err < 0)
+ goto done;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ ip_idx++;
+ }
+ err = 0;
+ ip_idx = 0;
+done:
+ *s_ip_idx = ip_idx;
+ return err;
+}
+
+static int in_dev_dump_ifaddr(struct in_device *in_dev, struct sk_buff *skb,
+ struct netlink_callback *cb, int *s_ip_idx,
+ struct inet_fill_args *fillargs)
+{
+ struct in_ifaddr *ifa;
+ int ip_idx = 0;
+ int err;
+
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (ip_idx < *s_ip_idx) {
+ ip_idx++;
+ continue;
+ }
+ err = inet_fill_ifaddr(skb, ifa, fillargs);
+ if (err < 0)
+ goto done;
+
+ nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+ ip_idx++;
+ }
+ err = 0;
+ ip_idx = 0;
+done:
+ *s_ip_idx = ip_idx;
+
+ return err;
+}
+
+static int in_dev_dump_addr(struct in_device *in_dev, struct sk_buff *skb,
+ struct netlink_callback *cb, int *s_ip_idx,
+ struct inet_fill_args *fillargs)
+{
+ switch (fillargs->event) {
+ case RTM_NEWADDR:
+ return in_dev_dump_ifaddr(in_dev, skb, cb, s_ip_idx, fillargs);
+ case RTM_GETMULTICAST:
+ return in_dev_dump_ifmcaddr(in_dev, skb, cb, s_ip_idx,
+ fillargs);
+ default:
+ return -EINVAL;
+ }
+}
+
+/* Combine dev_addr_genid and dev_base_seq to detect changes.
+ */
+static u32 inet_base_seq(const struct net *net)
{
+ u32 res = atomic_read(&net->ipv4.dev_addr_genid) +
+ READ_ONCE(net->dev_base_seq);
+
+ /* Must not return 0 (see nl_dump_check_consistent()).
+ * Chose a value far away from 0.
+ */
+ if (!res)
+ res = 0x80000000;
+ return res;
+}
+
+static int inet_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
+ int event)
+{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct inet_fill_args fillargs = {
+ .portid = NETLINK_CB(cb->skb).portid,
+ .seq = nlh->nlmsg_seq,
+ .event = event,
+ .flags = NLM_F_MULTI,
+ .netnsid = -1,
+ };
struct net *net = sock_net(skb->sk);
- int h, s_h;
- int idx, s_idx;
- int ip_idx, s_ip_idx;
- struct net_device *dev;
+ struct net *tgt_net = net;
+ struct {
+ unsigned long ifindex;
+ int ip_idx;
+ } *ctx = (void *)cb->ctx;
struct in_device *in_dev;
- struct in_ifaddr *ifa;
- struct hlist_head *head;
+ struct net_device *dev;
+ int err = 0;
- s_h = cb->args[0];
- s_idx = idx = cb->args[1];
- s_ip_idx = ip_idx = cb->args[2];
+ rcu_read_lock();
+ if (cb->strict_check) {
+ err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net,
+ skb->sk, cb);
+ if (err < 0)
+ goto done;
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &net->dev_index_head[h];
- rcu_read_lock();
- cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
- net->dev_base_seq;
- hlist_for_each_entry_rcu(dev, head, index_hlist) {
- if (idx < s_idx)
- goto cont;
- if (h > s_h || idx > s_idx)
- s_ip_idx = 0;
+ if (fillargs.ifindex) {
+ dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex);
+ if (!dev) {
+ err = -ENODEV;
+ goto done;
+ }
in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
- goto cont;
-
- for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
- ifa = ifa->ifa_next, ip_idx++) {
- if (ip_idx < s_ip_idx)
- continue;
- if (inet_fill_ifaddr(skb, ifa,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWADDR, NLM_F_MULTI) < 0) {
- rcu_read_unlock();
- goto done;
- }
- nl_dump_check_consistent(cb, nlmsg_hdr(skb));
- }
-cont:
- idx++;
+ goto done;
+ err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx,
+ &fillargs);
+ goto done;
}
- rcu_read_unlock();
}
+ cb->seq = inet_base_seq(tgt_net);
+
+ for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ continue;
+ err = in_dev_dump_addr(in_dev, skb, cb, &ctx->ip_idx,
+ &fillargs);
+ if (err < 0)
+ goto done;
+ }
done:
- cb->args[0] = h;
- cb->args[1] = idx;
- cb->args[2] = ip_idx;
+ if (fillargs.netnsid >= 0)
+ put_net(tgt_net);
+ rcu_read_unlock();
+ return err;
+}
- return skb->len;
+static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return inet_dump_addr(skb, cb, RTM_NEWADDR);
+}
+
+static int inet_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return inet_dump_addr(skb, cb, RTM_GETMULTICAST);
}
static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
u32 portid)
{
+ struct inet_fill_args fillargs = {
+ .portid = portid,
+ .seq = nlh ? nlh->nlmsg_seq : 0,
+ .event = event,
+ .flags = 0,
+ .netnsid = -1,
+ };
struct sk_buff *skb;
- u32 seq = nlh ? nlh->nlmsg_seq : 0;
int err = -ENOBUFS;
struct net *net;
@@ -1715,7 +2015,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
if (!skb)
goto errout;
- err = inet_fill_ifaddr(skb, ifa, portid, seq, event, 0);
+ err = inet_fill_ifaddr(skb, ifa, &fillargs);
if (err < 0) {
/* -EMSGSIZE implies BUG in inet_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -1725,8 +2025,7 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
rtnl_notify(skb, net, portid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
return;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
}
static size_t inet_get_link_af_size(const struct net_device *dev,
@@ -1755,7 +2054,7 @@ static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
return -EMSGSIZE;
for (i = 0; i < IPV4_DEVCONF_MAX; i++)
- ((u32 *) nla_data(nla))[i] = in_dev->cnf.data[i];
+ ((u32 *) nla_data(nla))[i] = READ_ONCE(in_dev->cnf.data[i]);
return 0;
}
@@ -1765,15 +2064,17 @@ static const struct nla_policy inet_af_policy[IFLA_INET_MAX+1] = {
};
static int inet_validate_link_af(const struct net_device *dev,
- const struct nlattr *nla)
+ const struct nlattr *nla,
+ struct netlink_ext_ack *extack)
{
struct nlattr *a, *tb[IFLA_INET_MAX+1];
int err, rem;
- if (dev && !__in_dev_get_rcu(dev))
+ if (dev && !__in_dev_get_rtnl(dev))
return -EAFNOSUPPORT;
- err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy, NULL);
+ err = nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla,
+ inet_af_policy, extack);
if (err < 0)
return err;
@@ -1792,17 +2093,18 @@ static int inet_validate_link_af(const struct net_device *dev,
return 0;
}
-static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla)
+static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla,
+ struct netlink_ext_ack *extack)
{
- struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct in_device *in_dev = __in_dev_get_rtnl(dev);
struct nlattr *a, *tb[IFLA_INET_MAX+1];
int rem;
if (!in_dev)
return -EAFNOSUPPORT;
- if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0)
- BUG();
+ if (nla_parse_nested_deprecated(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0)
+ return -EINVAL;
if (tb[IFLA_INET_CONF]) {
nla_for_each_nested(a, tb[IFLA_INET_CONF], rem)
@@ -1838,9 +2140,9 @@ static int inet_netconf_msgsize_devconf(int type)
}
static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
- struct ipv4_devconf *devconf, u32 portid,
- u32 seq, int event, unsigned int flags,
- int type)
+ const struct ipv4_devconf *devconf,
+ u32 portid, u32 seq, int event,
+ unsigned int flags, int type)
{
struct nlmsghdr *nlh;
struct netconfmsg *ncm;
@@ -1865,27 +2167,28 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
if ((all || type == NETCONFA_FORWARDING) &&
nla_put_s32(skb, NETCONFA_FORWARDING,
- IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
+ IPV4_DEVCONF_RO(*devconf, FORWARDING)) < 0)
goto nla_put_failure;
if ((all || type == NETCONFA_RP_FILTER) &&
nla_put_s32(skb, NETCONFA_RP_FILTER,
- IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)
+ IPV4_DEVCONF_RO(*devconf, RP_FILTER)) < 0)
goto nla_put_failure;
if ((all || type == NETCONFA_MC_FORWARDING) &&
nla_put_s32(skb, NETCONFA_MC_FORWARDING,
- IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
+ IPV4_DEVCONF_RO(*devconf, MC_FORWARDING)) < 0)
goto nla_put_failure;
if ((all || type == NETCONFA_BC_FORWARDING) &&
nla_put_s32(skb, NETCONFA_BC_FORWARDING,
- IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
+ IPV4_DEVCONF_RO(*devconf, BC_FORWARDING)) < 0)
goto nla_put_failure;
if ((all || type == NETCONFA_PROXY_NEIGH) &&
nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
- IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
+ IPV4_DEVCONF_RO(*devconf, PROXY_ARP)) < 0)
goto nla_put_failure;
if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
- IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
+ IPV4_DEVCONF_RO(*devconf,
+ IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
goto nla_put_failure;
out:
@@ -1918,8 +2221,7 @@ void inet_netconf_notify_devconf(struct net *net, int event, int type,
rtnl_notify(skb, net, 0, RTNLGRP_IPV4_NETCONF, NULL, GFP_KERNEL);
return;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err);
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_NETCONF, err);
}
static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
@@ -1930,28 +2232,64 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = {
[NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) },
};
+static int inet_netconf_valid_get_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ int i, err;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf get request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg),
+ tb, NETCONFA_MAX,
+ devconf_ipv4_policy, extack);
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg),
+ tb, NETCONFA_MAX,
+ devconf_ipv4_policy, extack);
+ if (err)
+ return err;
+
+ for (i = 0; i <= NETCONFA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NETCONFA_IFINDEX:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in netconf get request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int inet_netconf_get_devconf(struct sk_buff *in_skb,
struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(in_skb->sk);
- struct nlattr *tb[NETCONFA_MAX+1];
- struct netconfmsg *ncm;
+ struct nlattr *tb[NETCONFA_MAX + 1];
+ const struct ipv4_devconf *devconf;
+ struct in_device *in_dev = NULL;
+ struct net_device *dev = NULL;
struct sk_buff *skb;
- struct ipv4_devconf *devconf;
- struct in_device *in_dev;
- struct net_device *dev;
int ifindex;
int err;
- err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
- devconf_ipv4_policy, extack);
- if (err < 0)
- goto errout;
+ err = inet_netconf_valid_get_req(in_skb, nlh, tb, extack);
+ if (err)
+ return err;
- err = -EINVAL;
if (!tb[NETCONFA_IFINDEX])
- goto errout;
+ return -EINVAL;
ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]);
switch (ifindex) {
@@ -1962,10 +2300,10 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
devconf = net->ipv4.devconf_dflt;
break;
default:
- dev = __dev_get_by_index(net, ifindex);
- if (!dev)
- goto errout;
- in_dev = __in_dev_get_rtnl(dev);
+ err = -ENODEV;
+ dev = dev_get_by_index(net, ifindex);
+ if (dev)
+ in_dev = in_dev_get(dev);
if (!in_dev)
goto errout;
devconf = &in_dev->cnf;
@@ -1989,78 +2327,79 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
}
err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
+ if (in_dev)
+ in_dev_put(in_dev);
+ dev_put(dev);
return err;
}
static int inet_netconf_dump_devconf(struct sk_buff *skb,
struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
- int h, s_h;
- int idx, s_idx;
+ struct {
+ unsigned long ifindex;
+ unsigned int all_default;
+ } *ctx = (void *)cb->ctx;
+ const struct in_device *in_dev;
struct net_device *dev;
- struct in_device *in_dev;
- struct hlist_head *head;
+ int err = 0;
- s_h = cb->args[0];
- s_idx = idx = cb->args[1];
+ if (cb->strict_check) {
+ struct netlink_ext_ack *extack = cb->extack;
+ struct netconfmsg *ncm;
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &net->dev_index_head[h];
- rcu_read_lock();
- cb->seq = atomic_read(&net->ipv4.dev_addr_genid) ^
- net->dev_base_seq;
- hlist_for_each_entry_rcu(dev, head, index_hlist) {
- if (idx < s_idx)
- goto cont;
- in_dev = __in_dev_get_rcu(dev);
- if (!in_dev)
- goto cont;
-
- if (inet_netconf_fill_devconf(skb, dev->ifindex,
- &in_dev->cnf,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNETCONF,
- NLM_F_MULTI,
- NETCONFA_ALL) < 0) {
- rcu_read_unlock();
- goto done;
- }
- nl_dump_check_consistent(cb, nlmsg_hdr(skb));
-cont:
- idx++;
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for netconf dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ncm))) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid data after header in netconf dump request");
+ return -EINVAL;
}
- rcu_read_unlock();
}
- if (h == NETDEV_HASHENTRIES) {
- if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
- net->ipv4.devconf_all,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNETCONF, NLM_F_MULTI,
- NETCONFA_ALL) < 0)
+
+ rcu_read_lock();
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ continue;
+ err = inet_netconf_fill_devconf(skb, dev->ifindex,
+ &in_dev->cnf,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ NETCONFA_ALL);
+ if (err < 0)
goto done;
- else
- h++;
- }
- if (h == NETDEV_HASHENTRIES + 1) {
- if (inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
- net->ipv4.devconf_dflt,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNETCONF, NLM_F_MULTI,
- NETCONFA_ALL) < 0)
+ }
+ if (ctx->all_default == 0) {
+ err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
+ net->ipv4.devconf_all,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ NETCONFA_ALL);
+ if (err < 0)
goto done;
- else
- h++;
+ ctx->all_default++;
+ }
+ if (ctx->all_default == 1) {
+ err = inet_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
+ net->ipv4.devconf_dflt,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ NETCONFA_ALL);
+ if (err < 0)
+ goto done;
+ ctx->all_default++;
}
done:
- cb->args[0] = h;
- cb->args[1] = idx;
-
- return skb->len;
+ rcu_read_unlock();
+ return err;
}
#ifdef CONFIG_SYSCTL
@@ -2103,7 +2442,7 @@ static void inet_forward_change(struct net *net)
if (on)
dev_disable_lro(dev);
- in_dev = __in_dev_get_rtnl(dev);
+ in_dev = __in_dev_get_rtnl_net(dev);
if (in_dev) {
IN_DEV_CONF_SET(in_dev, FORWARDING, on);
inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
@@ -2126,9 +2465,8 @@ static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf)
}
}
-static int devinet_conf_proc(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int devinet_conf_proc(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int old_value = *(int *)ctl->data;
int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
@@ -2179,20 +2517,23 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,
return ret;
}
-static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int devinet_sysctl_forward(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
loff_t pos = *ppos;
- int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ struct net *net = ctl->extra2;
+ int ret;
- if (write && *valp != val) {
- struct net *net = ctl->extra2;
+ if (write && !ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ if (write && *valp != val) {
if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) {
- if (!rtnl_trylock()) {
+ if (!rtnl_net_trylock(net)) {
/* Restore the original values before restarting */
*valp = val;
*ppos = pos;
@@ -2211,7 +2552,7 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
idev->dev->ifindex,
cnf);
}
- rtnl_unlock();
+ rtnl_net_unlock(net);
rt_cache_flush(net);
} else
inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
@@ -2223,9 +2564,8 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write,
return ret;
}
-static int ipv4_doint_and_flush(struct ctl_table *ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int ipv4_doint_and_flush(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
@@ -2263,7 +2603,7 @@ static int ipv4_doint_and_flush(struct ctl_table *ctl, int write,
static struct devinet_sysctl_table {
struct ctl_table_header *sysctl_header;
- struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
+ struct ctl_table devinet_vars[IPV4_DEVCONF_MAX];
} devinet_sysctl = {
.devinet_vars = {
DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
@@ -2290,6 +2630,8 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
+ DEVINET_SYSCTL_RW_ENTRY(ARP_EVICT_NOCARRIER,
+ "arp_evict_nocarrier"),
DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
DEVINET_SYSCTL_RW_ENTRY(FORCE_IGMP_VERSION,
"force_igmp_version"),
@@ -2320,11 +2662,11 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
struct devinet_sysctl_table *t;
char path[sizeof("net/ipv4/conf/") + IFNAMSIZ];
- t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL);
+ t = kmemdup(&devinet_sysctl, sizeof(*t), GFP_KERNEL_ACCOUNT);
if (!t)
goto out;
- for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
+ for (i = 0; i < ARRAY_SIZE(t->devinet_vars); i++) {
t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
t->devinet_vars[i].extra1 = p;
t->devinet_vars[i].extra2 = net;
@@ -2345,7 +2687,7 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
free:
kfree(t);
out:
- return -ENOBUFS;
+ return -ENOMEM;
}
static void __devinet_sysctl_unregister(struct net *net,
@@ -2398,41 +2740,66 @@ static struct ctl_table ctl_forward_entry[] = {
.extra1 = &ipv4_devconf,
.extra2 = &init_net,
},
- { },
};
#endif
static __net_init int devinet_init_net(struct net *net)
{
- int err;
- struct ipv4_devconf *all, *dflt;
#ifdef CONFIG_SYSCTL
- struct ctl_table *tbl = ctl_forward_entry;
struct ctl_table_header *forw_hdr;
+ struct ctl_table *tbl;
#endif
+ struct ipv4_devconf *all, *dflt;
+ int err;
+ int i;
err = -ENOMEM;
- all = &ipv4_devconf;
- dflt = &ipv4_devconf_dflt;
+ net->ipv4.inet_addr_lst = kmalloc_array(IN4_ADDR_HSIZE,
+ sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!net->ipv4.inet_addr_lst)
+ goto err_alloc_hash;
- if (!net_eq(net, &init_net)) {
- all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);
- if (!all)
- goto err_alloc_all;
+ all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL);
+ if (!all)
+ goto err_alloc_all;
- dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);
- if (!dflt)
- goto err_alloc_dflt;
+ dflt = kmemdup(&ipv4_devconf_dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);
+ if (!dflt)
+ goto err_alloc_dflt;
#ifdef CONFIG_SYSCTL
- tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL);
- if (!tbl)
- goto err_alloc_ctl;
+ tbl = kmemdup(ctl_forward_entry, sizeof(ctl_forward_entry), GFP_KERNEL);
+ if (!tbl)
+ goto err_alloc_ctl;
- tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
- tbl[0].extra1 = all;
- tbl[0].extra2 = net;
+ tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
+ tbl[0].extra1 = all;
+ tbl[0].extra2 = net;
#endif
+
+ if (!net_eq(net, &init_net)) {
+ switch (net_inherit_devconf()) {
+ case 3:
+ /* copy from the current netns */
+ memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all,
+ sizeof(ipv4_devconf));
+ memcpy(dflt,
+ current->nsproxy->net_ns->ipv4.devconf_dflt,
+ sizeof(ipv4_devconf_dflt));
+ break;
+ case 0:
+ case 1:
+ /* copy from init_net */
+ memcpy(all, init_net.ipv4.devconf_all,
+ sizeof(ipv4_devconf));
+ memcpy(dflt, init_net.ipv4.devconf_dflt,
+ sizeof(ipv4_devconf_dflt));
+ break;
+ case 2:
+ /* use compiled values */
+ break;
+ }
}
#ifdef CONFIG_SYSCTL
@@ -2446,12 +2813,18 @@ static __net_init int devinet_init_net(struct net *net)
goto err_reg_dflt;
err = -ENOMEM;
- forw_hdr = register_net_sysctl(net, "net/ipv4", tbl);
+ forw_hdr = register_net_sysctl_sz(net, "net/ipv4", tbl,
+ ARRAY_SIZE(ctl_forward_entry));
if (!forw_hdr)
goto err_reg_ctl;
net->ipv4.forw_hdr = forw_hdr;
#endif
+ for (i = 0; i < IN4_ADDR_HSIZE; i++)
+ INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]);
+
+ INIT_DEFERRABLE_WORK(&net->ipv4.addr_chk_work, check_lifetime);
+
net->ipv4.devconf_all = all;
net->ipv4.devconf_dflt = dflt;
return 0;
@@ -2462,24 +2835,27 @@ err_reg_ctl:
err_reg_dflt:
__devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL);
err_reg_all:
- if (tbl != ctl_forward_entry)
- kfree(tbl);
+ kfree(tbl);
err_alloc_ctl:
#endif
- if (dflt != &ipv4_devconf_dflt)
- kfree(dflt);
+ kfree(dflt);
err_alloc_dflt:
- if (all != &ipv4_devconf)
- kfree(all);
+ kfree(all);
err_alloc_all:
+ kfree(net->ipv4.inet_addr_lst);
+err_alloc_hash:
return err;
}
static __net_exit void devinet_exit_net(struct net *net)
{
#ifdef CONFIG_SYSCTL
- struct ctl_table *tbl;
+ const struct ctl_table *tbl;
+#endif
+
+ cancel_delayed_work_sync(&net->ipv4.addr_chk_work);
+#ifdef CONFIG_SYSCTL
tbl = net->ipv4.forw_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->ipv4.forw_hdr);
__devinet_sysctl_unregister(net, net->ipv4.devconf_dflt,
@@ -2490,6 +2866,7 @@ static __net_exit void devinet_exit_net(struct net *net)
#endif
kfree(net->ipv4.devconf_dflt);
kfree(net->ipv4.devconf_all);
+ kfree(net->ipv4.inet_addr_lst);
}
static __net_initdata struct pernet_operations devinet_ops = {
@@ -2505,25 +2882,27 @@ static struct rtnl_af_ops inet_af_ops __read_mostly = {
.set_link_af = inet_set_link_af,
};
+static const struct rtnl_msg_handler devinet_rtnl_msg_handlers[] __initconst = {
+ {.protocol = PF_INET, .msgtype = RTM_NEWADDR, .doit = inet_rtm_newaddr,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.protocol = PF_INET, .msgtype = RTM_DELADDR, .doit = inet_rtm_deladdr,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.protocol = PF_INET, .msgtype = RTM_GETADDR, .dumpit = inet_dump_ifaddr,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
+ {.protocol = PF_INET, .msgtype = RTM_GETNETCONF,
+ .doit = inet_netconf_get_devconf, .dumpit = inet_netconf_dump_devconf,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET, .msgtype = RTM_GETMULTICAST,
+ .dumpit = inet_dump_ifmcaddr, .flags = RTNL_FLAG_DUMP_UNLOCKED},
+};
+
void __init devinet_init(void)
{
- int i;
-
- for (i = 0; i < IN4_ADDR_HSIZE; i++)
- INIT_HLIST_HEAD(&inet_addr_lst[i]);
-
register_pernet_subsys(&devinet_ops);
-
- register_gifconf(PF_INET, inet_gifconf);
register_netdevice_notifier(&ip_netdev_notifier);
- queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0);
-
- rtnl_af_register(&inet_af_ops);
+ if (rtnl_af_register(&inet_af_ops))
+ panic("Unable to register inet_af_ops\n");
- rtnl_register(PF_INET, RTM_NEWADDR, inet_rtm_newaddr, NULL, 0);
- rtnl_register(PF_INET, RTM_DELADDR, inet_rtm_deladdr, NULL, 0);
- rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr, 0);
- rtnl_register(PF_INET, RTM_GETNETCONF, inet_netconf_get_devconf,
- inet_netconf_dump_devconf, 0);
+ rtnl_register_many(devinet_rtnl_msg_handlers);
}
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 97689012b357..2c922afadb8f 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "IPsec: " fmt
#include <crypto/aead.h>
@@ -17,6 +18,9 @@
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/espintcp.h>
+#include <linux/skbuff_ref.h>
#include <linux/highmem.h>
@@ -32,8 +36,6 @@ struct esp_output_extra {
#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
-static u32 esp4_get_mtu(struct xfrm_state *x, int mtu);
-
/*
* Allocate an AEAD request structure with extra space for SG and IV.
*
@@ -94,9 +96,8 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
__alignof__(struct scatterlist));
}
-static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
{
- struct esp_output_extra *extra = esp_tmp_extra(tmp);
struct crypto_aead *aead = x->data;
int extralen = 0;
u8 *iv;
@@ -104,9 +105,8 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
struct scatterlist *sg;
if (x->props.flags & XFRM_STATE_ESN)
- extralen += sizeof(*extra);
+ extralen += sizeof(struct esp_output_extra);
- extra = esp_tmp_extra(tmp);
iv = esp_tmp_iv(aead, tmp, extralen);
req = esp_tmp_req(aead, iv);
@@ -115,23 +115,112 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
*/
if (req->src != req->dst)
for (sg = sg_next(req->src); sg; sg = sg_next(sg))
- put_page(sg_page(sg));
+ skb_page_unref(page_to_netmem(sg_page(sg)),
+ skb->pp_recycle);
+}
+
+#ifdef CONFIG_INET_ESPINTCP
+static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
+{
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct net *net = xs_net(x);
+ __be16 sport, dport;
+ struct sock *sk;
+
+ spin_lock_bh(&x->lock);
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ spin_unlock_bh(&x->lock);
+
+ sk = inet_lookup_established(net, x->id.daddr.a4, dport,
+ x->props.saddr.a4, sport, 0);
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ if (!tcp_is_ulp_esp(sk)) {
+ sock_put(sk);
+ return ERR_PTR(-EINVAL);
+ }
+
+ return sk;
+}
+
+static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct sock *sk;
+ int err;
+
+ rcu_read_lock();
+
+ sk = esp_find_tcp_sk(x);
+ err = PTR_ERR_OR_ZERO(sk);
+ if (err) {
+ kfree_skb(skb);
+ goto out;
+ }
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk))
+ err = espintcp_queue_out(sk, skb);
+ else
+ err = espintcp_push_skb(sk, skb);
+ bh_unlock_sock(sk);
+
+ sock_put(sk);
+
+out:
+ rcu_read_unlock();
+ return err;
+}
+
+static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct xfrm_state *x = dst->xfrm;
+
+ return esp_output_tcp_finish(x, skb);
}
-static void esp_output_done(struct crypto_async_request *base, int err)
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
{
- struct sk_buff *skb = base->data;
+ int err;
+
+ local_bh_disable();
+ err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
+ local_bh_enable();
+
+ /* EINPROGRESS just happens to do the right thing. It
+ * actually means that the skb has been consumed and
+ * isn't coming back.
+ */
+ return err ?: -EINPROGRESS;
+}
+#else
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{
+ WARN_ON(1);
+ return -EOPNOTSUPP;
+}
+#endif
+
+static void esp_output_done(void *data, int err)
+{
+ struct sk_buff *skb = data;
struct xfrm_offload *xo = xfrm_offload(skb);
void *tmp;
struct xfrm_state *x;
- if (xo && (xo->flags & XFRM_DEV_RESUME))
- x = skb->sp->xvec[skb->sp->len - 1];
- else
+ if (xo && (xo->flags & XFRM_DEV_RESUME)) {
+ struct sec_path *sp = skb_sec_path(skb);
+
+ x = sp->xvec[sp->len - 1];
+ } else {
x = skb_dst(skb)->xfrm;
+ }
tmp = ESP_SKB_CB(skb)->tmp;
- esp_ssg_unref(x, tmp);
+ esp_ssg_unref(x, tmp, skb);
kfree(tmp);
if (xo && (xo->flags & XFRM_DEV_RESUME)) {
@@ -145,7 +234,11 @@ static void esp_output_done(struct crypto_async_request *base, int err)
secpath_reset(skb);
xfrm_dev_resume(skb);
} else {
- xfrm_output_resume(skb, err);
+ if (!err &&
+ x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+ esp_output_tail_tcp(x, skb);
+ else
+ xfrm_output_resume(skb_to_full_sk(skb), skb, err);
}
}
@@ -175,7 +268,7 @@ static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb,
struct esp_output_extra *extra)
{
/* For ESN we move the header forward by 4 bytes to
- * accomodate the high bits. We will move it back after
+ * accommodate the high bits. We will move it back after
* encryption.
*/
if ((x->props.flags & XFRM_STATE_ESN)) {
@@ -199,38 +292,88 @@ static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb,
return esph;
}
-static void esp_output_done_esn(struct crypto_async_request *base, int err)
+static void esp_output_done_esn(void *data, int err)
{
- struct sk_buff *skb = base->data;
+ struct sk_buff *skb = data;
esp_output_restore_header(skb);
- esp_output_done(base, err);
+ esp_output_done(data, err);
}
-static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
+static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb,
+ int encap_type,
+ struct esp_info *esp,
+ __be16 sport,
+ __be16 dport)
{
- /* Fill padding... */
- if (tfclen) {
- memset(tail, 0, tfclen);
- tail += tfclen;
- }
- do {
- int i;
- for (i = 0; i < plen - 2; i++)
- tail[i] = i + 1;
- } while (0);
- tail[plen - 2] = plen - 2;
- tail[plen - 1] = proto;
+ struct udphdr *uh;
+ unsigned int len;
+ struct xfrm_offload *xo = xfrm_offload(skb);
+
+ len = skb->len + esp->tailen - skb_transport_offset(skb);
+ if (len + sizeof(struct iphdr) > IP_MAX_MTU)
+ return ERR_PTR(-EMSGSIZE);
+
+ uh = (struct udphdr *)esp->esph;
+ uh->source = sport;
+ uh->dest = dport;
+ uh->len = htons(len);
+ uh->check = 0;
+
+ /* For IPv4 ESP with UDP encapsulation, if xo is not null, the skb is in the crypto offload
+ * data path, which means that esp_output_udp_encap is called outside of the XFRM stack.
+ * In this case, the mac header doesn't point to the IPv4 protocol field, so don't set it.
+ */
+ if (!xo || encap_type != UDP_ENCAP_ESPINUDP)
+ *skb_mac_header(skb) = IPPROTO_UDP;
+
+ return (struct ip_esp_hdr *)(uh + 1);
}
-static void esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
+#ifdef CONFIG_INET_ESPINTCP
+static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
+ struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ __be16 *lenp = (void *)esp->esph;
+ struct ip_esp_hdr *esph;
+ unsigned int len;
+ struct sock *sk;
+
+ len = skb->len + esp->tailen - skb_transport_offset(skb);
+ if (len > IP_MAX_MTU)
+ return ERR_PTR(-EMSGSIZE);
+
+ rcu_read_lock();
+ sk = esp_find_tcp_sk(x);
+ rcu_read_unlock();
+
+ if (IS_ERR(sk))
+ return ERR_CAST(sk);
+
+ sock_put(sk);
+
+ *lenp = htons(len);
+ esph = (struct ip_esp_hdr *)(lenp + 1);
+
+ return esph;
+}
+#else
+static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x,
+ struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+#endif
+
+static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb,
+ struct esp_info *esp)
{
- int encap_type;
- struct udphdr *uh;
- __be32 *udpdata32;
- __be16 sport, dport;
struct xfrm_encap_tmpl *encap = x->encap;
- struct ip_esp_hdr *esph = esp->esph;
+ struct ip_esp_hdr *esph;
+ __be16 sport, dport;
+ int encap_type;
spin_lock_bh(&x->lock);
sport = encap->encap_sport;
@@ -238,42 +381,44 @@ static void esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, stru
encap_type = encap->encap_type;
spin_unlock_bh(&x->lock);
- uh = (struct udphdr *)esph;
- uh->source = sport;
- uh->dest = dport;
- uh->len = htons(skb->len + esp->tailen
- - skb_transport_offset(skb));
- uh->check = 0;
-
switch (encap_type) {
default:
case UDP_ENCAP_ESPINUDP:
- esph = (struct ip_esp_hdr *)(uh + 1);
+ esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport);
break;
- case UDP_ENCAP_ESPINUDP_NON_IKE:
- udpdata32 = (__be32 *)(uh + 1);
- udpdata32[0] = udpdata32[1] = 0;
- esph = (struct ip_esp_hdr *)(udpdata32 + 2);
+ case TCP_ENCAP_ESPINTCP:
+ esph = esp_output_tcp_encap(x, skb, esp);
break;
}
- *skb_mac_header(skb) = IPPROTO_UDP;
+ if (IS_ERR(esph))
+ return PTR_ERR(esph);
+
esp->esph = esph;
+
+ return 0;
}
int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
{
u8 *tail;
- u8 *vaddr;
int nfrags;
int esph_offset;
struct page *page;
struct sk_buff *trailer;
int tailen = esp->tailen;
- /* this is non-NULL only with UDP Encapsulation */
- if (x->encap)
- esp_output_udp_encap(x, skb, esp);
+ /* this is non-NULL only with TCP/UDP Encapsulation */
+ if (x->encap) {
+ int err = esp_output_encap(x, skb, esp);
+
+ if (err < 0)
+ return err;
+ }
+
+ if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE ||
+ ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE)
+ goto cow;
if (!skb_cloned(skb)) {
if (tailen <= skb_tailroom(skb)) {
@@ -302,14 +447,10 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
page = pfrag->page;
get_page(page);
- vaddr = kmap_atomic(page);
-
- tail = vaddr + pfrag->offset;
+ tail = page_address(page) + pfrag->offset;
esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
- kunmap_atomic(vaddr);
-
nfrags = skb_shinfo(skb)->nr_frags;
__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
@@ -322,10 +463,8 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
nfrags++;
- skb->len += tailen;
- skb->data_len += tailen;
- skb->truesize += tailen;
- if (sk)
+ skb_len_add(skb, tailen);
+ if (sk && sk_fullsock(sk))
refcount_add(tailen, &sk->sk_wmem_alloc);
goto out;
@@ -460,7 +599,10 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
}
if (sg != dsg)
- esp_ssg_unref(x, tmp);
+ esp_ssg_unref(x, tmp, skb);
+
+ if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+ err = esp_output_tail_tcp(x, skb);
error_free:
kfree(tmp);
@@ -492,7 +634,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
u32 padto;
- padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached));
+ padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached));
if (skb->len < padto)
esp.tfclen = padto - skb->len;
}
@@ -522,7 +664,6 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
static inline int esp_remove_trailer(struct sk_buff *skb)
{
struct xfrm_state *x = xfrm_input_state(skb);
- struct xfrm_offload *xo = xfrm_offload(skb);
struct crypto_aead *aead = x->data;
int alen, hlen, elen;
int padlen, trimlen;
@@ -534,11 +675,6 @@ static inline int esp_remove_trailer(struct sk_buff *skb)
hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
elen = skb->len - hlen;
- if (xo && (xo->flags & XFRM_ESP_NO_TRAILER)) {
- ret = xo->proto;
- goto out;
- }
-
if (skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2))
BUG();
@@ -556,7 +692,9 @@ static inline int esp_remove_trailer(struct sk_buff *skb)
skb->csum = csum_block_sub(skb->csum, csumdiff,
skb->len - trimlen);
}
- pskb_trim(skb, skb->len - trimlen);
+ ret = pskb_trim(skb, skb->len - trimlen);
+ if (unlikely(ret))
+ return ret;
ret = nexthdr[1];
@@ -573,7 +711,7 @@ int esp_input_done2(struct sk_buff *skb, int err)
int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
int ihl;
- if (!xo || (xo && !(xo->flags & CRYPTO_DONE)))
+ if (!xo || !(xo->flags & CRYPTO_DONE))
kfree(ESP_SKB_CB(skb)->tmp);
if (unlikely(err))
@@ -588,20 +726,35 @@ int esp_input_done2(struct sk_buff *skb, int err)
if (x->encap) {
struct xfrm_encap_tmpl *encap = x->encap;
+ struct tcphdr *th = (void *)(skb_network_header(skb) + ihl);
struct udphdr *uh = (void *)(skb_network_header(skb) + ihl);
+ __be16 source;
+
+ switch (x->encap->encap_type) {
+ case TCP_ENCAP_ESPINTCP:
+ source = th->source;
+ break;
+ case UDP_ENCAP_ESPINUDP:
+ source = uh->source;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ err = -EINVAL;
+ goto out;
+ }
/*
* 1) if the NAT-T peer's IP or port changed then
- * advertize the change to the keying daemon.
+ * advertise the change to the keying daemon.
* This is an inbound SA, so just compare
* SRC ports.
*/
if (iph->saddr != x->props.saddr.a4 ||
- uh->source != encap->encap_sport) {
+ source != encap->encap_sport) {
xfrm_address_t ipaddr;
ipaddr.a4 = iph->saddr;
- km_new_mapping(x, &ipaddr, uh->source);
+ km_new_mapping(x, &ipaddr, source);
/* XXX: perhaps add an extra
* policy check here, to see
@@ -624,7 +777,8 @@ int esp_input_done2(struct sk_buff *skb, int err)
}
skb_pull_rcsum(skb, hlen);
- if (x->props.mode == XFRM_MODE_TUNNEL)
+ if (x->props.mode == XFRM_MODE_TUNNEL ||
+ x->props.mode == XFRM_MODE_IPTFS)
skb_reset_transport_header(skb);
else
skb_set_transport_header(skb, -ihl);
@@ -638,9 +792,9 @@ out:
}
EXPORT_SYMBOL_GPL(esp_input_done2);
-static void esp_input_done(struct crypto_async_request *base, int err)
+static void esp_input_done(void *data, int err)
{
- struct sk_buff *skb = base->data;
+ struct sk_buff *skb = data;
xfrm_input_resume(skb, esp_input_done2(skb, err));
}
@@ -657,7 +811,7 @@ static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
struct ip_esp_hdr *esph;
/* For ESN we move the header forward by 4 bytes to
- * accomodate the high bits. We will move it back after
+ * accommodate the high bits. We will move it back after
* decryption.
*/
if ((x->props.flags & XFRM_STATE_ESN)) {
@@ -668,12 +822,12 @@ static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
}
}
-static void esp_input_done_esn(struct crypto_async_request *base, int err)
+static void esp_input_done_esn(void *data, int err)
{
- struct sk_buff *skb = base->data;
+ struct sk_buff *skb = data;
esp_input_restore_header(skb);
- esp_input_done(base, err);
+ esp_input_done(data, err);
}
/*
@@ -683,12 +837,11 @@ static void esp_input_done_esn(struct crypto_async_request *base, int err)
*/
static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
{
- struct ip_esp_hdr *esph;
struct crypto_aead *aead = x->data;
struct aead_request *req;
struct sk_buff *trailer;
int ivlen = crypto_aead_ivsize(aead);
- int elen = skb->len - sizeof(*esph) - ivlen;
+ int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen;
int nfrags;
int assoclen;
int seqhilen;
@@ -698,13 +851,13 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
struct scatterlist *sg;
int err = -EINVAL;
- if (!pskb_may_pull(skb, sizeof(*esph) + ivlen))
+ if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen))
goto out;
if (elen <= 0)
goto out;
- assoclen = sizeof(*esph);
+ assoclen = sizeof(struct ip_esp_hdr);
seqhilen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
@@ -775,28 +928,6 @@ out:
return err;
}
-static u32 esp4_get_mtu(struct xfrm_state *x, int mtu)
-{
- struct crypto_aead *aead = x->data;
- u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
- unsigned int net_adj;
-
- switch (x->props.mode) {
- case XFRM_MODE_TRANSPORT:
- case XFRM_MODE_BEET:
- net_adj = sizeof(struct iphdr);
- break;
- case XFRM_MODE_TUNNEL:
- net_adj = 0;
- break;
- default:
- BUG();
- }
-
- return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
- net_adj) & ~(blksize - 1)) + net_adj - 2;
-}
-
static int esp4_err(struct sk_buff *skb, u32 info)
{
struct net *net = dev_net(skb->dev);
@@ -808,6 +939,7 @@ static int esp4_err(struct sk_buff *skb, u32 info)
case ICMP_DEST_UNREACH:
if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
return 0;
+ break;
case ICMP_REDIRECT:
break;
default:
@@ -820,9 +952,9 @@ static int esp4_err(struct sk_buff *skb, u32 info)
return 0;
if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
- ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ESP, 0);
+ ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ESP);
else
- ipv4_redirect(skb, net, 0, 0, IPPROTO_ESP, 0);
+ ipv4_redirect(skb, net, 0, IPPROTO_ESP);
xfrm_state_put(x);
return 0;
@@ -838,16 +970,17 @@ static void esp_destroy(struct xfrm_state *x)
crypto_free_aead(aead);
}
-static int esp_init_aead(struct xfrm_state *x)
+static int esp_init_aead(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
char aead_name[CRYPTO_MAX_ALG_NAME];
struct crypto_aead *aead;
int err;
- err = -ENAMETOOLONG;
if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
- x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME)
- goto error;
+ x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) {
+ NL_SET_ERR_MSG(extack, "Algorithm name is too long");
+ return -ENAMETOOLONG;
+ }
aead = crypto_alloc_aead(aead_name, 0, 0);
err = PTR_ERR(aead);
@@ -865,11 +998,15 @@ static int esp_init_aead(struct xfrm_state *x)
if (err)
goto error;
+ return 0;
+
error:
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
return err;
}
-static int esp_init_authenc(struct xfrm_state *x)
+static int esp_init_authenc(struct xfrm_state *x,
+ struct netlink_ext_ack *extack)
{
struct crypto_aead *aead;
struct crypto_authenc_key_param *param;
@@ -880,10 +1017,6 @@ static int esp_init_authenc(struct xfrm_state *x)
unsigned int keylen;
int err;
- err = -EINVAL;
- if (!x->ealg)
- goto error;
-
err = -ENAMETOOLONG;
if ((x->props.flags & XFRM_STATE_ESN)) {
@@ -892,22 +1025,28 @@ static int esp_init_authenc(struct xfrm_state *x)
x->geniv ?: "", x->geniv ? "(" : "",
x->aalg ? x->aalg->alg_name : "digest_null",
x->ealg->alg_name,
- x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+ NL_SET_ERR_MSG(extack, "Algorithm name is too long");
goto error;
+ }
} else {
if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
"%s%sauthenc(%s,%s)%s",
x->geniv ?: "", x->geniv ? "(" : "",
x->aalg ? x->aalg->alg_name : "digest_null",
x->ealg->alg_name,
- x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+ NL_SET_ERR_MSG(extack, "Algorithm name is too long");
goto error;
+ }
}
aead = crypto_alloc_aead(authenc_name, 0, 0);
err = PTR_ERR(aead);
- if (IS_ERR(aead))
+ if (IS_ERR(aead)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
+ }
x->data = aead;
@@ -937,17 +1076,16 @@ static int esp_init_authenc(struct xfrm_state *x)
err = -EINVAL;
if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
crypto_aead_authsize(aead)) {
- pr_info("ESP: %s digestsize %u != %hu\n",
- x->aalg->alg_name,
- crypto_aead_authsize(aead),
- aalg_desc->uinfo.auth.icv_fullbits / 8);
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto free_key;
}
err = crypto_aead_setauthsize(
aead, x->aalg->alg_trunc_len / 8);
- if (err)
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto free_key;
+ }
}
param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
@@ -956,13 +1094,13 @@ static int esp_init_authenc(struct xfrm_state *x)
err = crypto_aead_setkey(aead, key, keylen);
free_key:
- kfree(key);
+ kfree_sensitive(key);
error:
return err;
}
-static int esp_init_state(struct xfrm_state *x)
+static int esp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
struct crypto_aead *aead;
u32 align;
@@ -970,10 +1108,14 @@ static int esp_init_state(struct xfrm_state *x)
x->data = NULL;
- if (x->aead)
- err = esp_init_aead(x);
- else
- err = esp_init_authenc(x);
+ if (x->aead) {
+ err = esp_init_aead(x, extack);
+ } else if (x->ealg) {
+ err = esp_init_authenc(x, extack);
+ } else {
+ NL_SET_ERR_MSG(extack, "ESP: AEAD or CRYPT must be provided");
+ err = -EINVAL;
+ }
if (err)
goto error;
@@ -991,14 +1133,20 @@ static int esp_init_state(struct xfrm_state *x)
switch (encap->encap_type) {
default:
+ NL_SET_ERR_MSG(extack, "Unsupported encapsulation type for ESP");
err = -EINVAL;
goto error;
case UDP_ENCAP_ESPINUDP:
x->props.header_len += sizeof(struct udphdr);
break;
- case UDP_ENCAP_ESPINUDP_NON_IKE:
- x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
+#ifdef CONFIG_INET_ESPINTCP
+ case TCP_ENCAP_ESPINTCP:
+ /* only the length field, TCP encap is done by
+ * the socket
+ */
+ x->props.header_len += 2;
break;
+#endif
}
}
@@ -1016,13 +1164,11 @@ static int esp4_rcv_cb(struct sk_buff *skb, int err)
static const struct xfrm_type esp_type =
{
- .description = "ESP4",
.owner = THIS_MODULE,
.proto = IPPROTO_ESP,
.flags = XFRM_TYPE_REPLAY_PROT,
.init_state = esp_init_state,
.destructor = esp_destroy,
- .get_mtu = esp4_get_mtu,
.input = esp_input,
.output = esp_output,
};
@@ -1053,11 +1199,11 @@ static void __exit esp4_fini(void)
{
if (xfrm4_protocol_deregister(&esp4_protocol, IPPROTO_ESP) < 0)
pr_info("%s: can't remove protocol\n", __func__);
- if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
+ xfrm_unregister_type(&esp_type, AF_INET);
}
module_init(esp4_init);
module_exit(esp4_fini);
+MODULE_DESCRIPTION("IPv4 ESP transformation library");
MODULE_LICENSE("GPL");
MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_ESP);
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 58834a10c0be..05828d4cb6cd 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IPV4 GSO/GRO offload support
* Linux INET implementation
@@ -5,10 +6,6 @@
* Copyright (C) 2016 secunet Security Networks AG
* Author: Steffen Klassert <steffen.klassert@secunet.com>
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
* ESP GRO support
*/
@@ -19,6 +16,8 @@
#include <crypto/authenc.h>
#include <linux/err.h>
#include <linux/module.h>
+#include <net/gro.h>
+#include <net/gso.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/esp.h>
@@ -34,43 +33,54 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head,
int offset = skb_gro_offset(skb);
struct xfrm_offload *xo;
struct xfrm_state *x;
+ int encap_type = 0;
__be32 seq;
__be32 spi;
- int err;
if (!pskb_pull(skb, offset))
return NULL;
- if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0)
+ if (xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq) != 0)
goto out;
xo = xfrm_offload(skb);
if (!xo || !(xo->flags & CRYPTO_DONE)) {
- err = secpath_set(skb);
- if (err)
- goto out;
+ struct sec_path *sp = secpath_set(skb);
- if (skb->sp->len == XFRM_MAX_DEPTH)
+ if (!sp)
goto out;
- x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
- (xfrm_address_t *)&ip_hdr(skb)->daddr,
- spi, IPPROTO_ESP, AF_INET);
- if (!x)
- goto out;
+ if (sp->len == XFRM_MAX_DEPTH)
+ goto out_reset;
- skb->sp->xvec[skb->sp->len++] = x;
- skb->sp->olen++;
+ x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark,
+ (xfrm_address_t *)&ip_hdr(skb)->daddr,
+ spi, IPPROTO_ESP, AF_INET);
- xo = xfrm_offload(skb);
- if (!xo) {
+ if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) {
+ /* non-offload path will record the error and audit log */
xfrm_state_put(x);
- goto out;
+ x = NULL;
}
+
+ if (!x)
+ goto out_reset;
+
+ skb->mark = xfrm_smark_get(skb->mark, x);
+
+ sp->xvec[sp->len++] = x;
+ sp->olen++;
+
+ xo = xfrm_offload(skb);
+ if (!xo)
+ goto out_reset;
}
xo->flags |= XFRM_GRO;
+ if (NAPI_GRO_CB(skb)->proto == IPPROTO_UDP)
+ encap_type = UDP_ENCAP_ESPINUDP;
+
XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL;
XFRM_SPI_SKB_CB(skb)->family = AF_INET;
XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
@@ -78,9 +88,11 @@ static struct sk_buff *esp4_gro_receive(struct list_head *head,
/* We don't need to handle errors from xfrm_input, it does all
* the error handling and frees the resources on error. */
- xfrm_input(skb, IPPROTO_ESP, spi, -2);
+ xfrm_input(skb, IPPROTO_ESP, spi, encap_type);
return ERR_PTR(-EINPROGRESS);
+out_reset:
+ secpath_reset(skb);
out:
skb_push(skb, offset);
NAPI_GRO_CB(skb)->same_flow = 0;
@@ -106,6 +118,91 @@ static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb)
xo->proto = proto;
}
+static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x,
+ XFRM_MODE_SKB_CB(skb)->protocol);
+ __be16 type = inner_mode->family == AF_INET6 ? htons(ETH_P_IPV6)
+ : htons(ETH_P_IP);
+
+ return skb_eth_gso_segment(skb, features, type);
+}
+
+static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ const struct net_offload *ops;
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct xfrm_offload *xo = xfrm_offload(skb);
+
+ skb->transport_header += x->props.header_len;
+ ops = rcu_dereference(inet_offloads[xo->proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ return segs;
+}
+
+static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ const struct net_offload *ops;
+ u8 proto = xo->proto;
+
+ skb->transport_header += x->props.header_len;
+
+ if (x->sel.family != AF_INET6) {
+ if (proto == IPPROTO_BEETPH) {
+ struct ip_beet_phdr *ph =
+ (struct ip_beet_phdr *)skb->data;
+
+ skb->transport_header += ph->hdrlen * 8;
+ proto = ph->nexthdr;
+ } else {
+ skb->transport_header -= IPV4_BEET_PHMAXLEN;
+ }
+ } else {
+ __be16 frag;
+
+ skb->transport_header +=
+ ipv6_skip_exthdr(skb, 0, &proto, &frag);
+ if (proto == IPPROTO_TCP)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
+ }
+
+ if (proto == IPPROTO_IPV6)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4;
+
+ __skb_pull(skb, skb_transport_offset(skb));
+ ops = rcu_dereference(inet_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ return segs;
+}
+
+static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ switch (x->outer_mode.encap) {
+ case XFRM_MODE_TUNNEL:
+ return xfrm4_tunnel_gso_segment(x, skb, features);
+ case XFRM_MODE_TRANSPORT:
+ return xfrm4_transport_gso_segment(x, skb, features);
+ case XFRM_MODE_BEET:
+ return xfrm4_beet_gso_segment(x, skb, features);
+ }
+
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
@@ -114,6 +211,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
struct crypto_aead *aead;
netdev_features_t esp_features = features;
struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sec_path *sp;
if (!xo)
return ERR_PTR(-EINVAL);
@@ -121,7 +219,8 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP))
return ERR_PTR(-EINVAL);
- x = skb->sp->xvec[skb->sp->len - 1];
+ sp = skb_sec_path(skb);
+ x = sp->xvec[sp->len - 1];
aead = x->data;
esph = ip_esp_hdr(skb);
@@ -135,14 +234,18 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
skb->encap_hdr_csum = 1;
- if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev)
- esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
- else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
- esp_features = features & ~NETIF_F_CSUM_MASK;
+ if ((!(skb->dev->gso_partial_features & NETIF_F_HW_ESP) &&
+ !(features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev)
+ esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK |
+ NETIF_F_SCTP_CRC);
+ else if (!(features & NETIF_F_HW_ESP_TX_CSUM) &&
+ !(skb->dev->gso_partial_features & NETIF_F_HW_ESP_TX_CSUM))
+ esp_features = features & ~(NETIF_F_CSUM_MASK |
+ NETIF_F_SCTP_CRC);
xo->flags |= XFRM_GSO_SEGMENT;
- return x->outer_mode->gso_segment(x, skb, esp_features);
+ return xfrm4_outer_mode_gso_segment(x, skb, esp_features);
}
static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb)
@@ -170,6 +273,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
struct esp_info esp;
bool hw_offload = true;
__u32 seq;
+ int encap_type = 0;
esp.inplace = true;
@@ -178,7 +282,9 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
if (!xo)
return -EINVAL;
- if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) {
+ if ((!(features & NETIF_F_HW_ESP) &&
+ !(skb->dev->gso_partial_features & NETIF_F_HW_ESP)) ||
+ x->xso.dev != skb->dev) {
xo->flags |= CRYPTO_FALLBACK;
hw_offload = false;
}
@@ -200,8 +306,10 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
esp.esph = ip_esp_hdr(skb);
+ if (x->encap)
+ encap_type = x->encap->encap_type;
- if (!hw_offload || (hw_offload && !skb_is_gso(skb))) {
+ if (!hw_offload || !skb_is_gso(skb) || (hw_offload && encap_type == UDP_ENCAP_ESPINUDP)) {
esp.nfrags = esp_output_head(x, skb, &esp);
if (esp.nfrags < 0)
return esp.nfrags;
@@ -223,13 +331,37 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
xo->seq.low += skb_shinfo(skb)->gso_segs;
}
+ if (xo->seq.low < seq)
+ xo->seq.hi++;
+
esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32));
+ if (hw_offload && encap_type == UDP_ENCAP_ESPINUDP) {
+ /* In the XFRM stack, the encapsulation protocol is set to iphdr->protocol by
+ * setting *skb_mac_header(skb) (see esp_output_udp_encap()) where skb->mac_header
+ * points to iphdr->protocol (see xfrm4_tunnel_encap_add()).
+ * However, in esp_xmit(), skb->mac_header doesn't point to iphdr->protocol.
+ * Therefore, the protocol field needs to be corrected.
+ */
+ ip_hdr(skb)->protocol = IPPROTO_UDP;
+
+ esph->seq_no = htonl(seq);
+ }
+
ip_hdr(skb)->tot_len = htons(skb->len);
ip_send_check(ip_hdr(skb));
- if (hw_offload)
+ if (hw_offload) {
+ if (!skb_ext_add(skb, SKB_EXT_SEC_PATH))
+ return -ENOMEM;
+
+ xo = xfrm_offload(skb);
+ if (!xo)
+ return -EINVAL;
+
+ xo->flags |= XFRM_XMIT;
return 0;
+ }
err = esp_output_tail(x, skb, &esp);
if (err)
@@ -237,6 +369,9 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
secpath_reset(skb);
+ if (skb_needs_linearize(skb, skb->dev->features) &&
+ __skb_linearize(skb))
+ return -ENOMEM;
return 0;
}
@@ -248,7 +383,6 @@ static const struct net_offload esp4_offload = {
};
static const struct xfrm_type_offload esp_type_offload = {
- .description = "ESP4 OFFLOAD",
.owner = THIS_MODULE,
.proto = IPPROTO_ESP,
.input_tail = esp_input_tail,
@@ -268,9 +402,7 @@ static int __init esp4_offload_init(void)
static void __exit esp4_offload_exit(void)
{
- if (xfrm_unregister_type_offload(&esp_type_offload, AF_INET) < 0)
- pr_info("%s: can't remove xfrm type offload\n", __func__);
-
+ xfrm_unregister_type_offload(&esp_type_offload, AF_INET);
inet_del_offload(&esp4_offload, IPPROTO_ESP);
}
@@ -279,3 +411,4 @@ module_exit(esp4_offload_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET, XFRM_PROTO_ESP);
+MODULE_DESCRIPTION("IPV4 GSO/GRO offload support");
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 2998b0e47d4b..1dab44e13d3b 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -6,11 +7,6 @@
* IPv4 Forwarding Information Base: FIB frontend.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
@@ -36,6 +32,8 @@
#include <linux/list.h>
#include <linux/slab.h>
+#include <net/flow.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
@@ -43,6 +41,7 @@
#include <net/sock.h>
#include <net/arp.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
#include <net/rtnetlink.h>
#include <net/xfrm.h>
#include <net/l3mdev.h>
@@ -73,11 +72,6 @@ fail:
fib_free_table(main_table);
return -ENOMEM;
}
-
-static bool fib4_has_custom_rules(struct net *net)
-{
- return false;
-}
#else
struct fib_table *fib_new_table(struct net *net, u32 id)
@@ -127,17 +121,13 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
h = id & (FIB_TABLE_HASHSZ - 1);
head = &net->ipv4.fib_table_hash[h];
- hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ hlist_for_each_entry_rcu(tb, head, tb_hlist,
+ lockdep_rtnl_is_held()) {
if (tb->tb_id == id)
return tb;
}
return NULL;
}
-
-static bool fib4_has_custom_rules(struct net *net)
-{
- return net->ipv4.fib_has_custom_rules;
-}
#endif /* CONFIG_IP_MULTIPLE_TABLES */
static void fib_replace_table(struct net *net, struct fib_table *old,
@@ -192,7 +182,7 @@ int fib_unmerge(struct net *net)
return 0;
}
-static void fib_flush(struct net *net)
+void fib_flush(struct net *net)
{
int flushed = 0;
unsigned int h;
@@ -203,7 +193,7 @@ static void fib_flush(struct net *net)
struct fib_table *tb;
hlist_for_each_entry_safe(tb, tmp, head, tb_hlist)
- flushed += fib_table_flush(net, tb);
+ flushed += fib_table_flush(net, tb, false);
}
if (flushed)
@@ -234,7 +224,9 @@ static inline unsigned int __inet_dev_addr_type(struct net *net,
if (table) {
ret = RTN_UNICAST;
if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) {
- if (!dev || dev == res.fi->fib_dev)
+ struct fib_nh_common *nhc = fib_info_nhc(res.fi, 0);
+
+ if (!dev || dev == nhc->nhc_dev)
ret = res.type;
}
}
@@ -300,14 +292,14 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
struct flowi4 fl4 = {
.flowi4_iif = LOOPBACK_IFINDEX,
- .flowi4_oif = l3mdev_master_ifindex_rcu(dev),
+ .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev),
.daddr = ip_hdr(skb)->saddr,
- .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
+ .flowi4_dscp = ip4h_dscp(ip_hdr(skb)),
.flowi4_scope = scope,
.flowi4_mark = vmark ? skb->mark : 0,
};
if (!fib_lookup(net, &fl4, &res, 0))
- return FIB_RES_PREFSRC(net, res);
+ return fib_result_prefsrc(net, &res);
} else {
scope = RT_SCOPE_LINK;
}
@@ -315,6 +307,33 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
return inet_select_addr(dev, ip_hdr(skb)->saddr, scope);
}
+bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev)
+{
+ bool dev_match = false;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (unlikely(fi->nh)) {
+ dev_match = nexthop_uses_dev(fi->nh, dev);
+ } else {
+ int ret;
+
+ for (ret = 0; ret < fib_info_num_path(fi); ret++) {
+ const struct fib_nh_common *nhc = fib_info_nhc(fi, ret);
+
+ if (nhc_l3mdev_matches_dev(nhc, dev)) {
+ dev_match = true;
+ break;
+ }
+ }
+ }
+#else
+ if (fib_info_nhc(fi, 0)->nhc_dev == dev)
+ dev_match = true;
+#endif
+
+ return dev_match;
+}
+EXPORT_SYMBOL_GPL(fib_info_nh_uses_dev);
+
/* Given (packet source, input interface) and optional (dst, oif, tos):
* - (main) check, that source is valid i.e. not broadcast or our local
* address.
@@ -324,10 +343,11 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
* called with rcu_read_lock()
*/
static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
- u8 tos, int oif, struct net_device *dev,
+ dscp_t dscp, int oif, struct net_device *dev,
int rpf, struct in_device *idev, u32 *itag)
{
struct net *net = dev_net(dev);
+ enum skb_drop_reason reason;
struct flow_keys flkeys;
int ret, no_addr;
struct fib_result res;
@@ -335,16 +355,16 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
bool dev_match;
fl4.flowi4_oif = 0;
- fl4.flowi4_iif = l3mdev_master_ifindex_rcu(dev);
- if (!fl4.flowi4_iif)
- fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
+ fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev);
+ fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
fl4.daddr = src;
fl4.saddr = dst;
- fl4.flowi4_tos = tos;
+ fl4.flowi4_dscp = dscp;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_tun_key.tun_id = 0;
fl4.flowi4_flags = 0;
fl4.flowi4_uid = sock_net_uid(net, NULL);
+ fl4.flowi4_multipath_hash = 0;
no_addr = idev->ifa_list == NULL;
@@ -353,34 +373,31 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.flowi4_proto = 0;
fl4.fl4_sport = 0;
fl4.fl4_dport = 0;
+ } else {
+ swap(fl4.fl4_sport, fl4.fl4_dport);
}
if (fib_lookup(net, &fl4, &res, 0))
goto last_resort;
- if (res.type != RTN_UNICAST &&
- (res.type != RTN_LOCAL || !IN_DEV_ACCEPT_LOCAL(idev)))
- goto e_inval;
- fib_combine_itag(itag, &res);
- dev_match = false;
-
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- for (ret = 0; ret < res.fi->fib_nhs; ret++) {
- struct fib_nh *nh = &res.fi->fib_nh[ret];
-
- if (nh->nh_dev == dev) {
- dev_match = true;
- break;
- } else if (l3mdev_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) {
- dev_match = true;
- break;
+ if (res.type != RTN_UNICAST) {
+ if (res.type != RTN_LOCAL) {
+ reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
+ goto e_inval;
+ } else if (!IN_DEV_ACCEPT_LOCAL(idev)) {
+ reason = SKB_DROP_REASON_IP_LOCAL_SOURCE;
+ goto e_inval;
}
}
-#else
- if (FIB_RES_DEV(res) == dev)
- dev_match = true;
-#endif
+ fib_combine_itag(itag, &res);
+
+ dev_match = fib_info_nh_uses_dev(res.fi, dev);
+ /* This is not common, loopback packets retain skb_dst so normally they
+ * would not even hit this slow path.
+ */
+ dev_match = dev_match || (res.type == RTN_LOCAL &&
+ dev == net->loopback_dev);
if (dev_match) {
- ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+ ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
return ret;
}
if (no_addr)
@@ -392,7 +409,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
ret = 0;
if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) {
if (res.type == RTN_UNICAST)
- ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+ ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST;
}
return ret;
@@ -403,14 +420,14 @@ last_resort:
return 0;
e_inval:
- return -EINVAL;
+ return -reason;
e_rpf:
- return -EXDEV;
+ return -SKB_DROP_REASON_IP_RPFILTER;
}
/* Ignore rp_filter for packets protected by IPsec. */
int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
- u8 tos, int oif, struct net_device *dev,
+ dscp_t dscp, int oif, struct net_device *dev,
struct in_device *idev, u32 *itag)
{
int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev);
@@ -427,8 +444,11 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
if (net->ipv4.fib_has_custom_local_routes ||
fib4_has_custom_rules(net))
goto full_check;
+ /* Within the same container, it is regarded as a martian source,
+ * and the same host but different containers are not.
+ */
if (inet_lookup_ifaddr_rcu(net, src))
- return -EINVAL;
+ return -SKB_DROP_REASON_IP_LOCAL_SOURCE;
ok:
*itag = 0;
@@ -436,7 +456,8 @@ ok:
}
full_check:
- return __fib_validate_source(skb, src, dst, tos, oif, dev, r, idev, itag);
+ return __fib_validate_source(skb, src, dst, dscp, oif, dev, r, idev,
+ itag);
}
static inline __be32 sk_extract_addr(struct sockaddr *addr)
@@ -530,14 +551,20 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
cfg->fc_oif = dev->ifindex;
cfg->fc_table = l3mdev_fib_table(dev);
if (colon) {
- struct in_ifaddr *ifa;
- struct in_device *in_dev = __in_dev_get_rtnl(dev);
+ const struct in_ifaddr *ifa;
+ struct in_device *in_dev;
+
+ in_dev = __in_dev_get_rtnl_net(dev);
if (!in_dev)
return -ENODEV;
+
*colon = ':';
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+
+ in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) {
if (strcmp(ifa->ifa_label, devname) == 0)
break;
+ }
+
if (!ifa)
return -ENODEV;
cfg->fc_prefsrc = ifa->ifa_local;
@@ -548,17 +575,21 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
if (rt->rt_gateway.sa_family == AF_INET && addr) {
unsigned int addr_type;
- cfg->fc_gw = addr;
+ cfg->fc_gw4 = addr;
+ cfg->fc_gw_family = AF_INET;
addr_type = inet_addr_type_table(net, addr, cfg->fc_table);
if (rt->rt_flags & RTF_GATEWAY &&
addr_type == RTN_UNICAST)
cfg->fc_scope = RT_SCOPE_UNIVERSE;
}
+ if (!cfg->fc_table)
+ cfg->fc_table = RT_TABLE_MAIN;
+
if (cmd == SIOCDELRT)
return 0;
- if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw)
+ if (rt->rt_flags & RTF_GATEWAY && !cfg->fc_gw_family)
return -EINVAL;
if (cfg->fc_scope == RT_SCOPE_NOWHERE)
@@ -603,7 +634,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- rtnl_lock();
+ rtnl_net_lock(net);
err = rtentry_to_fib_config(net, cmd, rt, &cfg);
if (err == 0) {
struct fib_table *tb;
@@ -627,13 +658,14 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, struct rtentry *rt)
/* allocated by rtentry_to_fib_config() */
kfree(cfg.fc_mx);
}
- rtnl_unlock();
+ rtnl_net_unlock(net);
return err;
}
return -EINVAL;
}
const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
+ [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 },
[RTA_DST] = { .type = NLA_U32 },
[RTA_SRC] = { .type = NLA_U32 },
[RTA_IIF] = { .type = NLA_U32 },
@@ -652,26 +684,80 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
[RTA_IP_PROTO] = { .type = NLA_U8 },
[RTA_SPORT] = { .type = NLA_U16 },
[RTA_DPORT] = { .type = NLA_U16 },
+ [RTA_NH_ID] = { .type = NLA_U32 },
};
+int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
+ struct netlink_ext_ack *extack)
+{
+ struct rtvia *via;
+ int alen;
+
+ if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr)) {
+ NL_SET_ERR_MSG(extack, "Invalid attribute length for RTA_VIA");
+ return -EINVAL;
+ }
+
+ via = nla_data(nla);
+ alen = nla_len(nla) - offsetof(struct rtvia, rtvia_addr);
+
+ switch (via->rtvia_family) {
+ case AF_INET:
+ if (alen != sizeof(__be32)) {
+ NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_VIA");
+ return -EINVAL;
+ }
+ cfg->fc_gw_family = AF_INET;
+ cfg->fc_gw4 = *((__be32 *)via->rtvia_addr);
+ break;
+ case AF_INET6:
+#if IS_ENABLED(CONFIG_IPV6)
+ if (alen != sizeof(struct in6_addr)) {
+ NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_VIA");
+ return -EINVAL;
+ }
+ cfg->fc_gw_family = AF_INET6;
+ cfg->fc_gw6 = *((struct in6_addr *)via->rtvia_addr);
+#else
+ NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
+ return -EINVAL;
+#endif
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported address family in RTA_VIA");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
struct nlmsghdr *nlh, struct fib_config *cfg,
struct netlink_ext_ack *extack)
{
+ bool has_gw = false, has_via = false;
struct nlattr *attr;
int err, remaining;
struct rtmsg *rtm;
- err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy,
- extack);
+ err = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
+ rtm_ipv4_policy, extack);
if (err < 0)
goto errout;
memset(cfg, 0, sizeof(*cfg));
rtm = nlmsg_data(nlh);
+
+ if (!inet_validate_dscp(rtm->rtm_tos)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): ECN bits must be 0");
+ err = -EINVAL;
+ goto errout;
+ }
+ cfg->fc_dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
+
cfg->fc_dst_len = rtm->rtm_dst_len;
- cfg->fc_tos = rtm->rtm_tos;
cfg->fc_table = rtm->rtm_table;
cfg->fc_protocol = rtm->rtm_protocol;
cfg->fc_scope = rtm->rtm_scope;
@@ -698,7 +784,16 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
cfg->fc_oif = nla_get_u32(attr);
break;
case RTA_GATEWAY:
- cfg->fc_gw = nla_get_be32(attr);
+ has_gw = true;
+ cfg->fc_gw4 = nla_get_be32(attr);
+ if (cfg->fc_gw4)
+ cfg->fc_gw_family = AF_INET;
+ break;
+ case RTA_VIA:
+ has_via = true;
+ err = fib_gw_from_via(cfg, attr, extack);
+ if (err)
+ goto errout;
break;
case RTA_PRIORITY:
cfg->fc_priority = nla_get_u32(attr);
@@ -735,9 +830,44 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
if (err < 0)
goto errout;
break;
+ case RTA_NH_ID:
+ cfg->fc_nh_id = nla_get_u32(attr);
+ break;
+ }
+ }
+
+ if (cfg->fc_dst_len > 32) {
+ NL_SET_ERR_MSG(extack, "Invalid prefix length");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) {
+ NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ if (cfg->fc_nh_id) {
+ if (cfg->fc_oif || cfg->fc_gw_family ||
+ cfg->fc_encap || cfg->fc_mp) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop specification and nexthop id are mutually exclusive");
+ err = -EINVAL;
+ goto errout;
}
}
+ if (has_gw && has_via) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop configuration can not contain both GATEWAY and VIA");
+ err = -EINVAL;
+ goto errout;
+ }
+
+ if (!cfg->fc_table)
+ cfg->fc_table = RT_TABLE_MAIN;
+
return 0;
errout:
return err;
@@ -755,14 +885,24 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
goto errout;
+ rtnl_net_lock(net);
+
+ if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ err = -EINVAL;
+ goto unlock;
+ }
+
tb = fib_get_table(net, cfg.fc_table);
if (!tb) {
NL_SET_ERR_MSG(extack, "FIB table does not exist");
err = -ESRCH;
- goto errout;
+ goto unlock;
}
err = fib_table_delete(net, tb, &cfg, extack);
+unlock:
+ rtnl_net_unlock(net);
errout:
return err;
}
@@ -779,37 +919,150 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
goto errout;
+ rtnl_net_lock(net);
+
tb = fib_new_table(net, cfg.fc_table);
if (!tb) {
err = -ENOBUFS;
- goto errout;
+ goto unlock;
}
err = fib_table_insert(net, tb, &cfg, extack);
if (!err && cfg.fc_type == RTN_LOCAL)
net->ipv4.fib_has_custom_local_routes = true;
+
+unlock:
+ rtnl_net_unlock(net);
errout:
return err;
}
+int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
+ struct fib_dump_filter *filter,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[RTA_MAX + 1];
+ struct rtmsg *rtm;
+ int err, i;
+
+ if (filter->rtnl_held)
+ ASSERT_RTNL();
+
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
+ NL_SET_ERR_MSG(extack, "Invalid header for FIB dump request");
+ return -EINVAL;
+ }
+
+ if (rtm->rtm_dst_len || rtm->rtm_src_len || rtm->rtm_tos ||
+ rtm->rtm_scope) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for FIB dump request");
+ return -EINVAL;
+ }
+
+ if (rtm->rtm_flags & ~(RTM_F_CLONED | RTM_F_PREFIX)) {
+ NL_SET_ERR_MSG(extack, "Invalid flags for FIB dump request");
+ return -EINVAL;
+ }
+ if (rtm->rtm_flags & RTM_F_CLONED)
+ filter->dump_routes = false;
+ else
+ filter->dump_exceptions = false;
+
+ filter->flags = rtm->rtm_flags;
+ filter->protocol = rtm->rtm_protocol;
+ filter->rt_type = rtm->rtm_type;
+ filter->table_id = rtm->rtm_table;
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= RTA_MAX; ++i) {
+ int ifindex;
+
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case RTA_TABLE:
+ filter->table_id = nla_get_u32(tb[i]);
+ break;
+ case RTA_OIF:
+ ifindex = nla_get_u32(tb[i]);
+ if (filter->rtnl_held)
+ filter->dev = __dev_get_by_index(net, ifindex);
+ else
+ filter->dev = dev_get_by_index_rcu(net, ifindex);
+ if (!filter->dev)
+ return -ENODEV;
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ if (filter->flags || filter->protocol || filter->rt_type ||
+ filter->table_id || filter->dev) {
+ filter->filter_set = 1;
+ cb->answer_flags = NLM_F_DUMP_FILTERED;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ip_valid_fib_dump_req);
+
static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct fib_dump_filter filter = {
+ .dump_routes = true,
+ .dump_exceptions = true,
+ .rtnl_held = false,
+ };
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
unsigned int h, s_h;
unsigned int e = 0, s_e;
struct fib_table *tb;
struct hlist_head *head;
- int dumped = 0, err;
+ int dumped = 0, err = 0;
- if (nlmsg_len(cb->nlh) >= sizeof(struct rtmsg) &&
- ((struct rtmsg *) nlmsg_data(cb->nlh))->rtm_flags & RTM_F_CLONED)
- return skb->len;
+ rcu_read_lock();
+ if (cb->strict_check) {
+ err = ip_valid_fib_dump_req(net, nlh, &filter, cb);
+ if (err < 0)
+ goto unlock;
+ } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
+ struct rtmsg *rtm = nlmsg_data(nlh);
+
+ filter.flags = rtm->rtm_flags & (RTM_F_PREFIX | RTM_F_CLONED);
+ }
+
+ /* ipv4 does not use prefix flag */
+ if (filter.flags & RTM_F_PREFIX)
+ goto unlock;
+
+ if (filter.table_id) {
+ tb = fib_get_table(net, filter.table_id);
+ if (!tb) {
+ if (rtnl_msg_family(cb->nlh) != PF_INET)
+ goto unlock;
+
+ NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist");
+ err = -ENOENT;
+ goto unlock;
+ }
+ err = fib_table_dump(tb, skb, cb, &filter);
+ goto unlock;
+ }
s_h = cb->args[0];
s_e = cb->args[1];
- rcu_read_lock();
-
+ err = 0;
for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
e = 0;
head = &net->ipv4.fib_table_hash[h];
@@ -819,26 +1072,21 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
if (dumped)
memset(&cb->args[2], 0, sizeof(cb->args) -
2 * sizeof(cb->args[0]));
- err = fib_table_dump(tb, skb, cb);
- if (err < 0) {
- if (likely(skb->len))
- goto out;
-
- goto out_err;
- }
+ err = fib_table_dump(tb, skb, cb, &filter);
+ if (err < 0)
+ goto out;
dumped = 1;
next:
e++;
}
}
out:
- err = skb->len;
-out_err:
- rcu_read_unlock();
cb->args[1] = e;
cb->args[0] = h;
+unlock:
+ rcu_read_unlock();
return err;
}
@@ -911,9 +1159,11 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
return;
/* Add broadcast address, if it is explicitly assigned. */
- if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
+ if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) {
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
prim, 0);
+ arp_invalidate(dev, ifa->ifa_broadcast, false);
+ }
if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
(prefix != addr || ifa->ifa_prefixlen < 32)) {
@@ -923,12 +1173,11 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
prefix, ifa->ifa_prefixlen, prim,
ifa->ifa_rt_priority);
- /* Add network specific broadcasts, when it takes a sense */
+ /* Add the network broadcast address, when it makes sense */
if (ifa->ifa_prefixlen < 31) {
- fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32,
- prim, 0);
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
32, prim, 0);
+ arp_invalidate(dev, prefix | ~mask, false);
}
}
}
@@ -942,7 +1191,7 @@ void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
if (!(dev->flags & IFF_UP) ||
ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
ipv4_is_zeronet(prefix) ||
- prefix == ifa->ifa_local || ifa->ifa_prefixlen == 32)
+ (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32))
return;
/* add the new */
@@ -1009,8 +1258,8 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
*
* Scan address list to be sure that addresses are really gone.
*/
-
- for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+ rcu_read_lock();
+ in_dev_for_each_ifa_rcu(ifa1, in_dev) {
if (ifa1 == ifa) {
/* promotion, keep the IP */
gone = 0;
@@ -1078,6 +1327,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
}
}
}
+ rcu_read_unlock();
no_promotions:
if (!(ok & BRD_OK))
@@ -1123,7 +1373,7 @@ static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
struct flowi4 fl4 = {
.flowi4_mark = frn->fl_mark,
.daddr = frn->fl_addr,
- .flowi4_tos = frn->fl_tos,
+ .flowi4_dscp = inet_dsfield_to_dscp(frn->fl_tos),
.flowi4_scope = frn->fl_scope,
};
struct fib_table *tb;
@@ -1170,13 +1420,13 @@ static void nl_fib_input(struct sk_buff *skb)
return;
nlh = nlmsg_hdr(skb);
- frn = (struct fib_result_nl *) nlmsg_data(nlh);
+ frn = nlmsg_data(nlh);
nl_fib_lookup(net, frn);
portid = NETLINK_CB(skb).portid; /* netlink portid */
NETLINK_CB(skb).portid = 0; /* from kernel */
NETLINK_CB(skb).dst_group = 0; /* unicast */
- netlink_unicast(net->ipv4.fibnl, skb, portid, MSG_DONTWAIT);
+ nlmsg_unicast(net->ipv4.fibnl, skb, portid);
}
static int __net_init nl_fib_lookup_init(struct net *net)
@@ -1211,7 +1461,7 @@ static void fib_disable_ip(struct net_device *dev, unsigned long event,
static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct in_ifaddr *ifa = ptr;
struct net_device *dev = ifa->ifa_dev->dev;
struct net *net = dev_net(dev);
@@ -1222,7 +1472,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
fib_sync_up(dev, RTNH_F_DEAD);
#endif
atomic_inc(&net->ipv4.dev_addr_genid);
- rt_cache_flush(dev_net(dev));
+ rt_cache_flush(net);
break;
case NETDEV_DOWN:
fib_del_ifaddr(ifa, NULL);
@@ -1233,7 +1483,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
*/
fib_disable_ip(dev, event, true);
} else {
- rt_cache_flush(dev_net(dev));
+ rt_cache_flush(net);
}
break;
}
@@ -1243,9 +1493,11 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct netdev_notifier_changeupper_info *info;
+ struct netdev_notifier_changeupper_info *upper_info = ptr;
+ struct netdev_notifier_info_ext *info_ext = ptr;
struct in_device *in_dev;
struct net *net = dev_net(dev);
+ struct in_ifaddr *ifa;
unsigned int flags;
if (event == NETDEV_UNREGISTER) {
@@ -1260,9 +1512,9 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
switch (event) {
case NETDEV_UP:
- for_ifa(in_dev) {
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
fib_add_ifaddr(ifa);
- } endfor_ifa(in_dev);
+ }
#ifdef CONFIG_IP_ROUTE_MULTIPATH
fib_sync_up(dev, RTNH_F_DEAD);
#endif
@@ -1273,21 +1525,24 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
fib_disable_ip(dev, event, false);
break;
case NETDEV_CHANGE:
- flags = dev_get_flags(dev);
+ flags = netif_get_flags(dev);
if (flags & (IFF_RUNNING | IFF_LOWER_UP))
fib_sync_up(dev, RTNH_F_LINKDOWN);
else
fib_sync_down_dev(dev, event, false);
- /* fall through */
+ rt_cache_flush(net);
+ break;
case NETDEV_CHANGEMTU:
+ fib_sync_mtu(dev, info_ext->ext.mtu);
rt_cache_flush(net);
break;
case NETDEV_CHANGEUPPER:
- info = ptr;
+ upper_info = ptr;
/* flush all routes if dev is linked to or unlinked from
* an L3 master device (e.g., VRF)
*/
- if (info->upper_dev && netif_is_l3_master(info->upper_dev))
+ if (upper_info->upper_dev &&
+ netif_is_l3_master(upper_info->upper_dev))
fib_disable_ip(dev, NETDEV_DOWN, true);
break;
}
@@ -1311,6 +1566,12 @@ static int __net_init ip_fib_net_init(struct net *net)
if (err)
return err;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ /* Default to 3-tuple */
+ net->ipv4.sysctl_fib_multipath_hash_fields =
+ FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;
+#endif
+
/* Avoid false sharing : Use at least a full cache line */
size = max_t(size_t, size, L1_CACHE_BYTES);
@@ -1336,7 +1597,7 @@ static void ip_fib_net_exit(struct net *net)
{
int i;
- rtnl_lock();
+ ASSERT_RTNL_NET(net);
#ifdef CONFIG_IP_MULTIPLE_TABLES
RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
@@ -1353,7 +1614,7 @@ static void ip_fib_net_exit(struct net *net)
hlist_for_each_entry_safe(tb, tmp, head, tb_hlist) {
hlist_del(&tb->tb_hlist);
- fib_table_flush(net, tb);
+ fib_table_flush(net, tb, true);
fib_free_table(tb);
}
}
@@ -1361,7 +1622,7 @@ static void ip_fib_net_exit(struct net *net)
#ifdef CONFIG_IP_MULTIPLE_TABLES
fib4_rules_exit(net);
#endif
- rtnl_unlock();
+
kfree(net->ipv4.fib_table_hash);
fib4_notifier_exit(net);
}
@@ -1371,14 +1632,20 @@ static int __net_init fib_net_init(struct net *net)
int error;
#ifdef CONFIG_IP_ROUTE_CLASSID
- net->ipv4.fib_num_tclassid_users = 0;
+ atomic_set(&net->ipv4.fib_num_tclassid_users, 0);
#endif
error = ip_fib_net_init(net);
if (error < 0)
goto out;
+
+ error = fib4_semantics_init(net);
+ if (error)
+ goto out_semantics;
+
error = nl_fib_lookup_init(net);
if (error < 0)
goto out_nlfl;
+
error = fib_proc_init(net);
if (error < 0)
goto out_proc;
@@ -1388,7 +1655,11 @@ out:
out_proc:
nl_fib_lookup_exit(net);
out_nlfl:
+ fib4_semantics_exit(net);
+out_semantics:
+ rtnl_net_lock(net);
ip_fib_net_exit(net);
+ rtnl_net_unlock(net);
goto out;
}
@@ -1396,12 +1667,37 @@ static void __net_exit fib_net_exit(struct net *net)
{
fib_proc_exit(net);
nl_fib_lookup_exit(net);
- ip_fib_net_exit(net);
+}
+
+static void __net_exit fib_net_exit_batch(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list) {
+ __rtnl_net_lock(net);
+ ip_fib_net_exit(net);
+ __rtnl_net_unlock(net);
+ }
+ rtnl_unlock();
+
+ list_for_each_entry(net, net_list, exit_list)
+ fib4_semantics_exit(net);
}
static struct pernet_operations fib_net_ops = {
.init = fib_net_init,
.exit = fib_net_exit,
+ .exit_batch = fib_net_exit_batch,
+};
+
+static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = {
+ {.protocol = PF_INET, .msgtype = RTM_NEWROUTE,
+ .doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET},
+ {.protocol = PF_INET, .msgtype = RTM_DELROUTE,
+ .doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET},
+ {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
};
void __init ip_fib_init(void)
@@ -1413,7 +1709,5 @@ void __init ip_fib_init(void)
register_netdevice_notifier(&fib_netdev_notifier);
register_inetaddr_notifier(&fib_inetaddr_notifier);
- rtnl_register(PF_INET, RTM_NEWROUTE, inet_rtm_newroute, NULL, 0);
- rtnl_register(PF_INET, RTM_DELROUTE, inet_rtm_delroute, NULL, 0);
- rtnl_register(PF_INET, RTM_GETROUTE, NULL, inet_dump_fib, 0);
+ rtnl_register_many(fib_rtnl_msg_handlers);
}
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index e6ff282bb7f4..f9b9e26c32c1 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -4,23 +4,28 @@
#include <linux/types.h>
#include <linux/list.h>
+#include <net/inet_dscp.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
struct fib_alias {
struct hlist_node fa_list;
struct fib_info *fa_info;
- u8 fa_tos;
+ dscp_t fa_dscp;
u8 fa_type;
u8 fa_state;
u8 fa_slen;
u32 tb_id;
s16 fa_default;
+ u8 offload;
+ u8 trap;
+ u8 offload_failed;
struct rcu_head rcu;
};
#define FA_S_ACCESSED 0x01
-/* Dont write on fa_state unless needed, to keep it shared on all cpus */
+/* Don't write on fa_state unless needed, to keep it shared on all cpus */
static inline void fib_alias_accessed(struct fib_alias *fa)
{
if (!(fa->fa_state & FA_S_ACCESSED))
@@ -31,20 +36,21 @@ static inline void fib_alias_accessed(struct fib_alias *fa)
void fib_release_info(struct fib_info *);
struct fib_info *fib_create_info(struct fib_config *cfg,
struct netlink_ext_ack *extack);
-int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
+int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
struct netlink_ext_ack *extack);
bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi);
-int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id,
- u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi,
- unsigned int);
+int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+ const struct fib_rt_info *fri, unsigned int flags);
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);
+size_t fib_nlmsg_size(struct fib_info *fi);
static inline void fib_result_assign(struct fib_result *res,
struct fib_info *fi)
{
/* we used to play games with refcounts, but we now use RCU */
res->fi = fi;
+ res->nhc = fib_info_nhc(fi, 0);
}
struct fib_prop {
diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c
index b804ccbdb241..b1551c26554b 100644
--- a/net/ipv4/fib_notifier.c
+++ b/net/ipv4/fib_notifier.c
@@ -6,15 +6,14 @@
#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/fib_notifier.h>
-#include <net/netns/ipv4.h>
#include <net/ip_fib.h>
-int call_fib4_notifier(struct notifier_block *nb, struct net *net,
+int call_fib4_notifier(struct notifier_block *nb,
enum fib_event_type event_type,
struct fib_notifier_info *info)
{
info->family = AF_INET;
- return call_fib_notifier(nb, net, event_type, info);
+ return call_fib_notifier(nb, event_type, info);
}
int call_fib4_notifiers(struct net *net, enum fib_event_type event_type,
@@ -23,28 +22,27 @@ int call_fib4_notifiers(struct net *net, enum fib_event_type event_type,
ASSERT_RTNL();
info->family = AF_INET;
- net->ipv4.fib_seq++;
+ /* Paired with READ_ONCE() in fib4_seq_read() */
+ WRITE_ONCE(net->ipv4.fib_seq, net->ipv4.fib_seq + 1);
return call_fib_notifiers(net, event_type, info);
}
-static unsigned int fib4_seq_read(struct net *net)
+static unsigned int fib4_seq_read(const struct net *net)
{
- ASSERT_RTNL();
-
- return net->ipv4.fib_seq + fib4_rules_seq_read(net);
+ /* Paired with WRITE_ONCE() in call_fib4_notifiers() */
+ return READ_ONCE(net->ipv4.fib_seq) + fib4_rules_seq_read(net);
}
-static int fib4_dump(struct net *net, struct notifier_block *nb)
+static int fib4_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
int err;
- err = fib4_rules_dump(net, nb);
+ err = fib4_rules_dump(net, nb, extack);
if (err)
return err;
- fib_notify(net, nb);
-
- return 0;
+ return fib_notify(net, nb, extack);
}
static const struct fib_notifier_ops fib4_notifier_ops_template = {
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index f8eb78d042a4..51f0193092f0 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -8,11 +9,6 @@
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
* Thomas Graf <tgraf@suug.ch>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Fixes:
* Rani Assaf : local_rule cannot be deleted
* Marc Boucher : routing by fwmark
@@ -27,17 +23,23 @@
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
+#include <net/flow.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
#include <net/fib_rules.h>
+#include <linux/indirect_call_wrapper.h>
struct fib4_rule {
struct fib_rule common;
u8 dst_len;
u8 src_len;
- u8 tos;
+ dscp_t dscp;
+ dscp_t dscp_mask;
+ u8 dscp_full:1; /* DSCP or TOS selector */
__be32 src;
__be32 srcmask;
__be32 dst;
@@ -51,7 +53,7 @@ static bool fib4_rule_matchall(const struct fib_rule *rule)
{
struct fib4_rule *r = container_of(rule, struct fib4_rule, common);
- if (r->dst_len || r->src_len || r->tos)
+ if (r->dst_len || r->src_len || r->dscp)
return false;
return fib_rule_matchall(rule);
}
@@ -68,12 +70,13 @@ bool fib4_rule_default(const struct fib_rule *rule)
}
EXPORT_SYMBOL_GPL(fib4_rule_default);
-int fib4_rules_dump(struct net *net, struct notifier_block *nb)
+int fib4_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
- return fib_rules_dump(net, nb, AF_INET);
+ return fib_rules_dump(net, nb, AF_INET, extack);
}
-unsigned int fib4_rules_seq_read(struct net *net)
+unsigned int fib4_rules_seq_read(const struct net *net)
{
return fib_rules_seq_read(net, AF_INET);
}
@@ -105,8 +108,9 @@ int __fib_lookup(struct net *net, struct flowi4 *flp,
}
EXPORT_SYMBOL_GPL(__fib_lookup);
-static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
- int flags, struct fib_lookup_arg *arg)
+INDIRECT_CALLABLE_SCOPE int fib4_rule_action(struct fib_rule *rule,
+ struct flowi *flp, int flags,
+ struct fib_lookup_arg *arg)
{
int err = -EAGAIN;
struct fib_table *tbl;
@@ -140,13 +144,18 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
return err;
}
-static bool fib4_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
+INDIRECT_CALLABLE_SCOPE bool fib4_rule_suppress(struct fib_rule *rule,
+ int flags,
+ struct fib_lookup_arg *arg)
{
- struct fib_result *result = (struct fib_result *) arg->result;
+ struct fib_result *result = arg->result;
struct net_device *dev = NULL;
- if (result->fi)
- dev = result->fi->fib_dev;
+ if (result->fi) {
+ struct fib_nh_common *nhc = fib_info_nhc(result->fi, 0);
+
+ dev = nhc->nhc_dev;
+ }
/* do not accept result if the route does
* not meet the required prefix length
@@ -168,7 +177,8 @@ suppress_route:
return true;
}
-static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule,
+ struct flowi *fl, int flags)
{
struct fib4_rule *r = (struct fib4_rule *) rule;
struct flowi4 *fl4 = &fl->u.ip4;
@@ -179,18 +189,26 @@ static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
((daddr ^ r->dst) & r->dstmask))
return 0;
- if (r->tos && (r->tos != fl4->flowi4_tos))
+ /* When DSCP selector is used we need to match on the entire DSCP field
+ * in the flow information structure. When TOS selector is used we need
+ * to mask the upper three DSCP bits prior to matching to maintain
+ * legacy behavior.
+ */
+ if (r->dscp_full && (r->dscp ^ fl4->flowi4_dscp) & r->dscp_mask)
+ return 0;
+ else if (!r->dscp_full && r->dscp &&
+ !fib_dscp_masked_match(r->dscp, fl4))
return 0;
if (rule->ip_proto && (rule->ip_proto != fl4->flowi4_proto))
return 0;
- if (fib_rule_port_range_set(&rule->sport_range) &&
- !fib_rule_port_inrange(&rule->sport_range, fl4->fl4_sport))
+ if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask,
+ fl4->fl4_sport))
return 0;
- if (fib_rule_port_range_set(&rule->dport_range) &&
- !fib_rule_port_inrange(&rule->dport_range, fl4->fl4_dport))
+ if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask,
+ fl4->fl4_dport))
return 0;
return 1;
@@ -198,32 +216,90 @@ static int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
static struct fib_table *fib_empty_table(struct net *net)
{
- u32 id;
+ u32 id = 1;
- for (id = 1; id <= RT_TABLE_MAX; id++)
+ while (1) {
if (!fib_get_table(net, id))
return fib_new_table(net, id);
+
+ if (id++ == RT_TABLE_MAX)
+ break;
+ }
return NULL;
}
-static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = {
- FRA_GENERIC_POLICY,
- [FRA_FLOW] = { .type = NLA_U32 },
-};
+static int fib4_nl2rule_dscp(const struct nlattr *nla, struct fib4_rule *rule4,
+ struct netlink_ext_ack *extack)
+{
+ if (rule4->dscp) {
+ NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP");
+ return -EINVAL;
+ }
+
+ rule4->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2);
+ rule4->dscp_mask = inet_dsfield_to_dscp(INET_DSCP_MASK);
+ rule4->dscp_full = true;
+
+ return 0;
+}
+
+static int fib4_nl2rule_dscp_mask(const struct nlattr *nla,
+ struct fib4_rule *rule4,
+ struct netlink_ext_ack *extack)
+{
+ dscp_t dscp_mask;
+
+ if (!rule4->dscp_full) {
+ NL_SET_ERR_MSG_ATTR(extack, nla,
+ "Cannot specify DSCP mask without DSCP value");
+ return -EINVAL;
+ }
+
+ dscp_mask = inet_dsfield_to_dscp(nla_get_u8(nla) << 2);
+ if (rule4->dscp & ~dscp_mask) {
+ NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid DSCP mask");
+ return -EINVAL;
+ }
+
+ rule4->dscp_mask = dscp_mask;
+
+ return 0;
+}
static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
struct nlattr **tb,
struct netlink_ext_ack *extack)
{
- struct net *net = sock_net(skb->sk);
+ struct fib4_rule *rule4 = (struct fib4_rule *)rule;
+ struct net *net = rule->fr_net;
int err = -EINVAL;
- struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+ if (tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) {
+ NL_SET_ERR_MSG(extack,
+ "Flow label cannot be specified for IPv4 FIB rules");
+ goto errout;
+ }
+
+ if (!inet_validate_dscp(frh->tos)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): ECN bits must be 0");
+ goto errout;
+ }
+ /* IPv4 currently doesn't handle high order DSCP bits correctly */
if (frh->tos & ~IPTOS_TOS_MASK) {
NL_SET_ERR_MSG(extack, "Invalid tos");
goto errout;
}
+ rule4->dscp = inet_dsfield_to_dscp(frh->tos);
+
+ if (tb[FRA_DSCP] &&
+ fib4_nl2rule_dscp(tb[FRA_DSCP], rule4, extack) < 0)
+ goto errout;
+
+ if (tb[FRA_DSCP_MASK] &&
+ fib4_nl2rule_dscp_mask(tb[FRA_DSCP_MASK], rule4, extack) < 0)
+ goto errout;
/* split local/main if they are not already split */
err = fib_unmerge(net);
@@ -254,7 +330,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
if (tb[FRA_FLOW]) {
rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
if (rule4->tclassid)
- net->ipv4.fib_num_tclassid_users++;
+ atomic_inc(&net->ipv4.fib_num_tclassid_users);
}
#endif
@@ -265,7 +341,6 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->srcmask = inet_make_mask(rule4->src_len);
rule4->dst_len = frh->dst_len;
rule4->dstmask = inet_make_mask(rule4->dst_len);
- rule4->tos = frh->tos;
net->ipv4.fib_has_custom_rules = true;
@@ -286,7 +361,7 @@ static int fib4_rule_delete(struct fib_rule *rule)
#ifdef CONFIG_IP_ROUTE_CLASSID
if (((struct fib4_rule *)rule)->tclassid)
- net->ipv4.fib_num_tclassid_users--;
+ atomic_dec(&net->ipv4.fib_num_tclassid_users);
#endif
net->ipv4.fib_has_custom_rules = true;
@@ -308,9 +383,27 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->dst_len && (rule4->dst_len != frh->dst_len))
return 0;
- if (frh->tos && (rule4->tos != frh->tos))
+ if (frh->tos &&
+ (rule4->dscp_full ||
+ inet_dscp_to_dsfield(rule4->dscp) != frh->tos))
return 0;
+ if (tb[FRA_DSCP]) {
+ dscp_t dscp;
+
+ dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2);
+ if (!rule4->dscp_full || rule4->dscp != dscp)
+ return 0;
+ }
+
+ if (tb[FRA_DSCP_MASK]) {
+ dscp_t dscp_mask;
+
+ dscp_mask = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP_MASK]) << 2);
+ if (!rule4->dscp_full || rule4->dscp_mask != dscp_mask)
+ return 0;
+ }
+
#ifdef CONFIG_IP_ROUTE_CLASSID
if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
return 0;
@@ -332,7 +425,17 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
frh->dst_len = rule4->dst_len;
frh->src_len = rule4->src_len;
- frh->tos = rule4->tos;
+
+ if (rule4->dscp_full) {
+ frh->tos = 0;
+ if (nla_put_u8(skb, FRA_DSCP,
+ inet_dscp_to_dsfield(rule4->dscp) >> 2) ||
+ nla_put_u8(skb, FRA_DSCP_MASK,
+ inet_dscp_to_dsfield(rule4->dscp_mask) >> 2))
+ goto nla_put_failure;
+ } else {
+ frh->tos = inet_dscp_to_dsfield(rule4->dscp);
+ }
if ((rule4->dst_len &&
nla_put_in_addr(skb, FRA_DST, rule4->dst)) ||
@@ -354,7 +457,9 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
{
return nla_total_size(4) /* dst */
+ nla_total_size(4) /* src */
- + nla_total_size(4); /* flow */
+ + nla_total_size(4) /* flow */
+ + nla_total_size(1) /* dscp */
+ + nla_total_size(1); /* dscp mask */
}
static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
@@ -376,7 +481,6 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
.nlmsg_payload = fib4_rule_nlmsg_payload,
.flush_cache = fib4_rule_flush_cache,
.nlgroup = RTNLGRP_IPV4_RULE,
- .policy = fib4_rule_policy,
.owner = THIS_MODULE,
};
@@ -384,13 +488,13 @@ static int fib_default_rules_init(struct fib_rules_ops *ops)
{
int err;
- err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
+ err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL);
if (err < 0)
return err;
- err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
+ err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN);
if (err < 0)
return err;
- err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT, 0);
+ err = fib_default_rule_add(ops, 0x7FFF, RT_TABLE_DEFAULT);
if (err < 0)
return err;
return 0;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f3c89ccf14c5..a5f3c8459758 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -6,11 +7,6 @@
* IPv4 Forwarding Information Base: semantics.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/uaccess.h>
@@ -33,43 +29,42 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/netlink.h>
+#include <linux/hash.h>
+#include <linux/nospec.h>
#include <net/arp.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/ip_fib.h>
-#include <net/netlink.h>
+#include <net/ip6_fib.h>
#include <net/nexthop.h>
+#include <net/netlink.h>
+#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/fib_notifier.h>
+#include <net/addrconf.h>
#include "fib_lookup.h"
-static DEFINE_SPINLOCK(fib_info_lock);
-static struct hlist_head *fib_info_hash;
-static struct hlist_head *fib_info_laddrhash;
-static unsigned int fib_info_hash_size;
-static unsigned int fib_info_cnt;
-
-#define DEVINDEX_HASHBITS 8
-#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
-static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
-
+/* for_nexthops and change_nexthops only used when nexthop object
+ * is not set in a fib_info. The logic within can reference fib_nh.
+ */
#ifdef CONFIG_IP_ROUTE_MULTIPATH
#define for_nexthops(fi) { \
int nhsel; const struct fib_nh *nh; \
for (nhsel = 0, nh = (fi)->fib_nh; \
- nhsel < (fi)->fib_nhs; \
+ nhsel < fib_info_num_path((fi)); \
nh++, nhsel++)
#define change_nexthops(fi) { \
int nhsel; struct fib_nh *nexthop_nh; \
for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
- nhsel < (fi)->fib_nhs; \
+ nhsel < fib_info_num_path((fi)); \
nexthop_nh++, nhsel++)
#else /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -157,12 +152,12 @@ static void rt_fibinfo_free(struct rtable __rcu **rtp)
dst_release_immediate(&rt->dst);
}
-static void free_nh_exceptions(struct fib_nh *nh)
+static void free_nh_exceptions(struct fib_nh_common *nhc)
{
struct fnhe_hash_bucket *hash;
int i;
- hash = rcu_dereference_protected(nh->nh_exceptions, 1);
+ hash = rcu_dereference_protected(nhc->nhc_exceptions, 1);
if (!hash)
return;
for (i = 0; i < FNHE_HASH_SIZE; i++) {
@@ -204,24 +199,40 @@ static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
free_percpu(rtp);
}
+void fib_nh_common_release(struct fib_nh_common *nhc)
+{
+ netdev_put(nhc->nhc_dev, &nhc->nhc_dev_tracker);
+ lwtstate_put(nhc->nhc_lwtstate);
+ rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
+ rt_fibinfo_free(&nhc->nhc_rth_input);
+ free_nh_exceptions(nhc);
+}
+EXPORT_SYMBOL_GPL(fib_nh_common_release);
+
+void fib_nh_release(struct net *net, struct fib_nh *fib_nh)
+{
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (fib_nh->nh_tclassid)
+ atomic_dec(&net->ipv4.fib_num_tclassid_users);
+#endif
+ fib_nh_common_release(&fib_nh->nh_common);
+}
+
/* Release a nexthop info record */
static void free_fib_info_rcu(struct rcu_head *head)
{
struct fib_info *fi = container_of(head, struct fib_info, rcu);
- struct dst_metrics *m;
- change_nexthops(fi) {
- if (nexthop_nh->nh_dev)
- dev_put(nexthop_nh->nh_dev);
- lwtstate_put(nexthop_nh->nh_lwtstate);
- free_nh_exceptions(nexthop_nh);
- rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
- rt_fibinfo_free(&nexthop_nh->nh_rth_input);
- } endfor_nexthops(fi);
+ if (fi->nh) {
+ nexthop_put(fi->nh);
+ } else {
+ change_nexthops(fi) {
+ fib_nh_release(fi->fib_net, nexthop_nh);
+ } endfor_nexthops(fi);
+ }
+
+ ip_fib_metrics_put(fi->fib_metrics);
- m = fi->fib_metrics;
- if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
- kfree(m);
kfree(fi);
}
@@ -231,104 +242,229 @@ void free_fib_info(struct fib_info *fi)
pr_warn("Freeing alive fib_info %p\n", fi);
return;
}
- fib_info_cnt--;
-#ifdef CONFIG_IP_ROUTE_CLASSID
- change_nexthops(fi) {
- if (nexthop_nh->nh_tclassid)
- fi->fib_net->ipv4.fib_num_tclassid_users--;
- } endfor_nexthops(fi);
-#endif
- call_rcu(&fi->rcu, free_fib_info_rcu);
+
+ call_rcu_hurry(&fi->rcu, free_fib_info_rcu);
}
EXPORT_SYMBOL_GPL(free_fib_info);
void fib_release_info(struct fib_info *fi)
{
- spin_lock_bh(&fib_info_lock);
- if (fi && --fi->fib_treeref == 0) {
+ ASSERT_RTNL();
+ if (fi && refcount_dec_and_test(&fi->fib_treeref)) {
hlist_del(&fi->fib_hash);
+ fi->fib_net->ipv4.fib_info_cnt--;
+
if (fi->fib_prefsrc)
hlist_del(&fi->fib_lhash);
- change_nexthops(fi) {
- if (!nexthop_nh->nh_dev)
- continue;
- hlist_del(&nexthop_nh->nh_hash);
- } endfor_nexthops(fi)
- fi->fib_dead = 1;
+ if (fi->nh) {
+ list_del(&fi->nh_list);
+ } else {
+ change_nexthops(fi) {
+ if (!nexthop_nh->fib_nh_dev)
+ continue;
+ hlist_del_rcu(&nexthop_nh->nh_hash);
+ } endfor_nexthops(fi)
+ }
+ /* Paired with READ_ONCE() from fib_table_lookup() */
+ WRITE_ONCE(fi->fib_dead, 1);
fib_info_put(fi);
}
- spin_unlock_bh(&fib_info_lock);
}
-static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+static inline int nh_comp(struct fib_info *fi, struct fib_info *ofi)
{
- const struct fib_nh *onh = ofi->fib_nh;
+ const struct fib_nh *onh;
+
+ if (fi->nh || ofi->nh)
+ return nexthop_cmp(fi->nh, ofi->nh) ? 0 : -1;
+
+ if (ofi->fib_nhs == 0)
+ return 0;
for_nexthops(fi) {
- if (nh->nh_oif != onh->nh_oif ||
- nh->nh_gw != onh->nh_gw ||
- nh->nh_scope != onh->nh_scope ||
+ onh = fib_info_nh(ofi, nhsel);
+
+ if (nh->fib_nh_oif != onh->fib_nh_oif ||
+ nh->fib_nh_gw_family != onh->fib_nh_gw_family ||
+ nh->fib_nh_scope != onh->fib_nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- nh->nh_weight != onh->nh_weight ||
+ nh->fib_nh_weight != onh->fib_nh_weight ||
#endif
#ifdef CONFIG_IP_ROUTE_CLASSID
nh->nh_tclassid != onh->nh_tclassid ||
#endif
- lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) ||
- ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK))
+ lwtunnel_cmp_encap(nh->fib_nh_lws, onh->fib_nh_lws) ||
+ ((nh->fib_nh_flags ^ onh->fib_nh_flags) & ~RTNH_COMPARE_MASK))
+ return -1;
+
+ if (nh->fib_nh_gw_family == AF_INET &&
+ nh->fib_nh_gw4 != onh->fib_nh_gw4)
+ return -1;
+
+ if (nh->fib_nh_gw_family == AF_INET6 &&
+ ipv6_addr_cmp(&nh->fib_nh_gw6, &onh->fib_nh_gw6))
return -1;
- onh++;
} endfor_nexthops(fi);
return 0;
}
-static inline unsigned int fib_devindex_hashfn(unsigned int val)
+static struct hlist_head *fib_nh_head(struct net_device *dev)
+{
+ return &dev->fib_nh_head;
+}
+
+static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope,
+ u32 prefsrc, u32 priority)
{
- unsigned int mask = DEVINDEX_HASHSIZE - 1;
+ unsigned int val = init_val;
- return (val ^
- (val >> DEVINDEX_HASHBITS) ^
- (val >> (DEVINDEX_HASHBITS * 2))) & mask;
+ val ^= (protocol << 8) | scope;
+ val ^= prefsrc;
+ val ^= priority;
+
+ return val;
}
-static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
+static unsigned int fib_info_hashfn_result(const struct net *net,
+ unsigned int val)
{
- unsigned int mask = (fib_info_hash_size - 1);
- unsigned int val = fi->fib_nhs;
+ return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits);
+}
- val ^= (fi->fib_protocol << 8) | fi->fib_scope;
- val ^= (__force u32)fi->fib_prefsrc;
- val ^= fi->fib_priority;
- for_nexthops(fi) {
- val ^= fib_devindex_hashfn(nh->nh_oif);
- } endfor_nexthops(fi)
+static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi)
+{
+ struct net *net = fi->fib_net;
+ unsigned int val;
- return (val ^ (val >> 7) ^ (val >> 12)) & mask;
+ val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol,
+ fi->fib_scope, (__force u32)fi->fib_prefsrc,
+ fi->fib_priority);
+
+ if (fi->nh) {
+ val ^= fi->nh->id;
+ } else {
+ for_nexthops(fi) {
+ val ^= nh->fib_nh_oif;
+ } endfor_nexthops(fi)
+ }
+
+ return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)];
}
-static struct fib_info *fib_find_info(const struct fib_info *nfi)
+static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net,
+ __be32 val)
+{
+ unsigned int hash_bits = net->ipv4.fib_info_hash_bits;
+ u32 slot;
+
+ slot = hash_32(net_hash_mix(net) ^ (__force u32)val, hash_bits);
+
+ return &net->ipv4.fib_info_hash[(1 << hash_bits) + slot];
+}
+
+static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits)
+{
+ /* The second half is used for prefsrc */
+ return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head),
+ GFP_KERNEL);
+}
+
+static void fib_info_hash_free(struct hlist_head *head)
+{
+ kvfree(head);
+}
+
+static void fib_info_hash_grow(struct net *net)
+{
+ unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits;
+ struct hlist_head *new_info_hash, *old_info_hash;
+ unsigned int i;
+
+ if (net->ipv4.fib_info_cnt < old_size)
+ return;
+
+ new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1);
+ if (!new_info_hash)
+ return;
+
+ old_info_hash = net->ipv4.fib_info_hash;
+ net->ipv4.fib_info_hash = new_info_hash;
+ net->ipv4.fib_info_hash_bits += 1;
+
+ for (i = 0; i < old_size; i++) {
+ struct hlist_head *head = &old_info_hash[i];
+ struct hlist_node *n;
+ struct fib_info *fi;
+
+ hlist_for_each_entry_safe(fi, n, head, fib_hash)
+ hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi));
+ }
+
+ for (i = 0; i < old_size; i++) {
+ struct hlist_head *lhead = &old_info_hash[old_size + i];
+ struct hlist_node *n;
+ struct fib_info *fi;
+
+ hlist_for_each_entry_safe(fi, n, lhead, fib_lhash)
+ hlist_add_head(&fi->fib_lhash,
+ fib_info_laddrhash_bucket(fi->fib_net,
+ fi->fib_prefsrc));
+ }
+
+ fib_info_hash_free(old_info_hash);
+}
+
+/* no metrics, only nexthop id */
+static struct fib_info *fib_find_info_nh(struct net *net,
+ const struct fib_config *cfg)
{
struct hlist_head *head;
struct fib_info *fi;
unsigned int hash;
- hash = fib_info_hashfn(nfi);
- head = &fib_info_hash[hash];
+ hash = fib_info_hashfn_1(cfg->fc_nh_id,
+ cfg->fc_protocol, cfg->fc_scope,
+ (__force u32)cfg->fc_prefsrc,
+ cfg->fc_priority);
+ hash = fib_info_hashfn_result(net, hash);
+ head = &net->ipv4.fib_info_hash[hash];
hlist_for_each_entry(fi, head, fib_hash) {
- if (!net_eq(fi->fib_net, nfi->fib_net))
+ if (!fi->nh || fi->nh->id != cfg->fc_nh_id)
continue;
+
+ if (cfg->fc_protocol == fi->fib_protocol &&
+ cfg->fc_scope == fi->fib_scope &&
+ cfg->fc_prefsrc == fi->fib_prefsrc &&
+ cfg->fc_priority == fi->fib_priority &&
+ cfg->fc_type == fi->fib_type &&
+ cfg->fc_table == fi->fib_tb_id &&
+ !((cfg->fc_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK))
+ return fi;
+ }
+
+ return NULL;
+}
+
+static struct fib_info *fib_find_info(struct fib_info *nfi)
+{
+ struct hlist_head *head = fib_info_hash_bucket(nfi);
+ struct fib_info *fi;
+
+ hlist_for_each_entry(fi, head, fib_hash) {
if (fi->fib_nhs != nfi->fib_nhs)
continue;
+
if (nfi->fib_protocol == fi->fib_protocol &&
nfi->fib_scope == fi->fib_scope &&
nfi->fib_prefsrc == fi->fib_prefsrc &&
nfi->fib_priority == fi->fib_priority &&
nfi->fib_type == fi->fib_type &&
+ nfi->fib_tb_id == fi->fib_tb_id &&
memcmp(nfi->fib_metrics, fi->fib_metrics,
sizeof(u32) * RTAX_MAX) == 0 &&
!((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
- (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
+ nh_comp(fi, nfi) == 0)
return fi;
}
@@ -336,33 +472,27 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
}
/* Check, that the gateway is already configured.
- * Used only by redirect accept routine.
+ * Used only by redirect accept routine, under rcu_read_lock();
*/
int ip_fib_check_default(__be32 gw, struct net_device *dev)
{
struct hlist_head *head;
struct fib_nh *nh;
- unsigned int hash;
- spin_lock(&fib_info_lock);
+ head = fib_nh_head(dev);
- hash = fib_devindex_hashfn(dev->ifindex);
- head = &fib_info_devhash[hash];
- hlist_for_each_entry(nh, head, nh_hash) {
- if (nh->nh_dev == dev &&
- nh->nh_gw == gw &&
- !(nh->nh_flags & RTNH_F_DEAD)) {
- spin_unlock(&fib_info_lock);
+ hlist_for_each_entry_rcu(nh, head, nh_hash) {
+ DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
+ if (nh->fib_nh_gw4 == gw &&
+ !(nh->fib_nh_flags & RTNH_F_DEAD)) {
return 0;
}
}
- spin_unlock(&fib_info_lock);
-
return -1;
}
-static inline size_t fib_nlmsg_size(struct fib_info *fi)
+size_t fib_nlmsg_size(struct fib_info *fi)
{
size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
+ nla_total_size(4) /* RTA_TABLE */
@@ -370,34 +500,40 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi)
+ nla_total_size(4) /* RTA_PRIORITY */
+ nla_total_size(4) /* RTA_PREFSRC */
+ nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */
+ unsigned int nhs = fib_info_num_path(fi);
/* space for nested metrics */
payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
- if (fi->fib_nhs) {
+ if (fi->nh)
+ payload += nla_total_size(4); /* RTA_NH_ID */
+
+ if (nhs) {
size_t nh_encapsize = 0;
- /* Also handles the special case fib_nhs == 1 */
+ /* Also handles the special case nhs == 1 */
/* each nexthop is packed in an attribute */
size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
+ unsigned int i;
/* may contain flow and gateway attribute */
nhsize += 2 * nla_total_size(4);
/* grab encap info */
- for_nexthops(fi) {
- if (nh->nh_lwtstate) {
+ for (i = 0; i < fib_info_num_path(fi); i++) {
+ struct fib_nh_common *nhc = fib_info_nhc(fi, i);
+
+ if (nhc->nhc_lwtstate) {
/* RTA_ENCAP_TYPE */
nh_encapsize += lwtunnel_get_encap_size(
- nh->nh_lwtstate);
+ nhc->nhc_lwtstate);
/* RTA_ENCAP */
nh_encapsize += nla_total_size(2);
}
- } endfor_nexthops(fi);
+ }
/* all nexthops are packed in a nested attribute */
- payload += nla_total_size((fi->fib_nhs * nhsize) +
- nh_encapsize);
+ payload += nla_total_size((nhs * nhsize) + nh_encapsize);
}
@@ -408,6 +544,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
int dst_len, u32 tb_id, const struct nl_info *info,
unsigned int nlm_flags)
{
+ struct fib_rt_info fri;
struct sk_buff *skb;
u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
int err = -ENOBUFS;
@@ -416,9 +553,16 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
if (!skb)
goto errout;
- err = fib_dump_info(skb, info->portid, seq, event, tb_id,
- fa->fa_type, key, dst_len,
- fa->fa_tos, fa->fa_info, nlm_flags);
+ fri.fi = fa->fa_info;
+ fri.tb_id = tb_id;
+ fri.dst = key;
+ fri.dst_len = dst_len;
+ fri.dscp = fa->fa_dscp;
+ fri.type = fa->fa_type;
+ fri.offload = READ_ONCE(fa->offload);
+ fri.trap = READ_ONCE(fa->trap);
+ fri.offload_failed = READ_ONCE(fa->offload_failed);
+ err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in fib_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -429,20 +573,27 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
info->nlh, GFP_KERNEL);
return;
errout:
- if (err < 0)
- rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
+ rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
static int fib_detect_death(struct fib_info *fi, int order,
struct fib_info **last_resort, int *last_idx,
int dflt)
{
+ const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
struct neighbour *n;
int state = NUD_NONE;
- n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
+ if (likely(nhc->nhc_gw_family == AF_INET))
+ n = neigh_lookup(&arp_tbl, &nhc->nhc_gw.ipv4, nhc->nhc_dev);
+ else if (nhc->nhc_gw_family == AF_INET6)
+ n = neigh_lookup(ipv6_stub->nd_tbl, &nhc->nhc_gw.ipv6,
+ nhc->nhc_dev);
+ else
+ n = NULL;
+
if (n) {
- state = n->nud_state;
+ state = READ_ONCE(n->nud_state);
neigh_release(n);
} else {
return 0;
@@ -459,6 +610,72 @@ static int fib_detect_death(struct fib_info *fi, int order,
return 1;
}
+int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc,
+ struct nlattr *encap, u16 encap_type,
+ void *cfg, gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *,
+ gfp_flags);
+ if (!nhc->nhc_pcpu_rth_output)
+ return -ENOMEM;
+
+ if (encap) {
+ struct lwtunnel_state *lwtstate;
+
+ err = lwtunnel_build_state(net, encap_type, encap,
+ nhc->nhc_family, cfg, &lwtstate,
+ extack);
+ if (err)
+ goto lwt_failure;
+
+ nhc->nhc_lwtstate = lwtstate_get(lwtstate);
+ }
+
+ return 0;
+
+lwt_failure:
+ rt_fibinfo_free_cpus(nhc->nhc_pcpu_rth_output);
+ nhc->nhc_pcpu_rth_output = NULL;
+ return err;
+}
+EXPORT_SYMBOL_GPL(fib_nh_common_init);
+
+int fib_nh_init(struct net *net, struct fib_nh *nh,
+ struct fib_config *cfg, int nh_weight,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ nh->fib_nh_family = AF_INET;
+
+ err = fib_nh_common_init(net, &nh->nh_common, cfg->fc_encap,
+ cfg->fc_encap_type, cfg, GFP_KERNEL, extack);
+ if (err)
+ return err;
+
+ nh->fib_nh_oif = cfg->fc_oif;
+ nh->fib_nh_gw_family = cfg->fc_gw_family;
+ if (cfg->fc_gw_family == AF_INET)
+ nh->fib_nh_gw4 = cfg->fc_gw4;
+ else if (cfg->fc_gw_family == AF_INET6)
+ nh->fib_nh_gw6 = cfg->fc_gw6;
+
+ nh->fib_nh_flags = cfg->fc_flags;
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ nh->nh_tclassid = cfg->fc_flow;
+ if (nh->nh_tclassid)
+ atomic_inc(&net->ipv4.fib_num_tclassid_users);
+#endif
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ nh->fib_nh_weight = nh_weight;
+#endif
+ return 0;
+}
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining,
@@ -481,15 +698,34 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining,
return nhs;
}
+static int fib_gw_from_attr(__be32 *gw, struct nlattr *nla,
+ struct netlink_ext_ack *extack)
+{
+ if (nla_len(nla) < sizeof(*gw)) {
+ NL_SET_ERR_MSG(extack, "Invalid IPv4 address in RTA_GATEWAY");
+ return -EINVAL;
+ }
+
+ *gw = nla_get_in_addr(nla);
+
+ return 0;
+}
+
+/* only called when fib_nh is integrated into fib_info */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
int remaining, struct fib_config *cfg,
struct netlink_ext_ack *extack)
{
+ struct net *net = fi->fib_net;
+ struct fib_config fib_cfg;
+ struct fib_nh *nh;
int ret;
change_nexthops(fi) {
int attrlen;
+ memset(&fib_cfg, 0, sizeof(fib_cfg));
+
if (!rtnh_ok(rtnh, remaining)) {
NL_SET_ERR_MSG(extack,
"Invalid nexthop configuration - extra data after nexthop");
@@ -502,112 +738,145 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
return -EINVAL;
}
- nexthop_nh->nh_flags =
- (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
- nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
- nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
+ fib_cfg.fc_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
+ fib_cfg.fc_oif = rtnh->rtnh_ifindex;
attrlen = rtnh_attrlen(rtnh);
if (attrlen > 0) {
- struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+ struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
- nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0;
-#ifdef CONFIG_IP_ROUTE_CLASSID
- nla = nla_find(attrs, attrlen, RTA_FLOW);
- nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
- if (nexthop_nh->nh_tclassid)
- fi->fib_net->ipv4.fib_num_tclassid_users++;
-#endif
- nla = nla_find(attrs, attrlen, RTA_ENCAP);
+ nlav = nla_find(attrs, attrlen, RTA_VIA);
+ if (nla && nlav) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop configuration can not contain both GATEWAY and VIA");
+ return -EINVAL;
+ }
if (nla) {
- struct lwtunnel_state *lwtstate;
- struct nlattr *nla_entype;
-
- nla_entype = nla_find(attrs, attrlen,
- RTA_ENCAP_TYPE);
- if (!nla_entype) {
- NL_SET_BAD_ATTR(extack, nla);
- NL_SET_ERR_MSG(extack,
- "Encap type is missing");
- goto err_inval;
- }
+ ret = fib_gw_from_attr(&fib_cfg.fc_gw4, nla,
+ extack);
+ if (ret)
+ goto errout;
- ret = lwtunnel_build_state(nla_get_u16(
- nla_entype),
- nla, AF_INET, cfg,
- &lwtstate, extack);
+ if (fib_cfg.fc_gw4)
+ fib_cfg.fc_gw_family = AF_INET;
+ } else if (nlav) {
+ ret = fib_gw_from_via(&fib_cfg, nlav, extack);
if (ret)
goto errout;
- nexthop_nh->nh_lwtstate =
- lwtstate_get(lwtstate);
}
+
+ nla = nla_find(attrs, attrlen, RTA_FLOW);
+ if (nla) {
+ if (nla_len(nla) < sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW");
+ return -EINVAL;
+ }
+ fib_cfg.fc_flow = nla_get_u32(nla);
+ }
+
+ fib_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
+ /* RTA_ENCAP_TYPE length checked in
+ * lwtunnel_valid_encap_type_attr
+ */
+ nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+ if (nla)
+ fib_cfg.fc_encap_type = nla_get_u16(nla);
}
+ ret = fib_nh_init(net, nexthop_nh, &fib_cfg,
+ rtnh->rtnh_hops + 1, extack);
+ if (ret)
+ goto errout;
+
rtnh = rtnh_next(rtnh, &remaining);
} endfor_nexthops(fi);
- return 0;
-
-err_inval:
ret = -EINVAL;
-
+ nh = fib_info_nh(fi, 0);
+ if (cfg->fc_oif && nh->fib_nh_oif != cfg->fc_oif) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop device index does not match RTA_OIF");
+ goto errout;
+ }
+ if (cfg->fc_gw_family) {
+ if (cfg->fc_gw_family != nh->fib_nh_gw_family ||
+ (cfg->fc_gw_family == AF_INET &&
+ nh->fib_nh_gw4 != cfg->fc_gw4) ||
+ (cfg->fc_gw_family == AF_INET6 &&
+ ipv6_addr_cmp(&nh->fib_nh_gw6, &cfg->fc_gw6))) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop gateway does not match RTA_GATEWAY or RTA_VIA");
+ goto errout;
+ }
+ }
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (cfg->fc_flow && nh->nh_tclassid != cfg->fc_flow) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop class id does not match RTA_FLOW");
+ goto errout;
+ }
+#endif
+ ret = 0;
errout:
return ret;
}
+/* only called when fib_nh is integrated into fib_info */
static void fib_rebalance(struct fib_info *fi)
{
int total;
int w;
- struct in_device *in_dev;
- if (fi->fib_nhs < 2)
+ if (fib_info_num_path(fi) < 2)
return;
total = 0;
for_nexthops(fi) {
- if (nh->nh_flags & RTNH_F_DEAD)
+ if (nh->fib_nh_flags & RTNH_F_DEAD)
continue;
- in_dev = __in_dev_get_rtnl(nh->nh_dev);
-
- if (in_dev &&
- IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
- nh->nh_flags & RTNH_F_LINKDOWN)
+ if (ip_ignore_linkdown(nh->fib_nh_dev) &&
+ nh->fib_nh_flags & RTNH_F_LINKDOWN)
continue;
- total += nh->nh_weight;
+ total += nh->fib_nh_weight;
} endfor_nexthops(fi);
w = 0;
change_nexthops(fi) {
int upper_bound;
- in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev);
-
- if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
+ if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD) {
upper_bound = -1;
- } else if (in_dev &&
- IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
- nexthop_nh->nh_flags & RTNH_F_LINKDOWN) {
+ } else if (ip_ignore_linkdown(nexthop_nh->fib_nh_dev) &&
+ nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN) {
upper_bound = -1;
} else {
- w += nexthop_nh->nh_weight;
+ w += nexthop_nh->fib_nh_weight;
upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
total) - 1;
}
- atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
+ atomic_set(&nexthop_nh->fib_nh_upper_bound, upper_bound);
} endfor_nexthops(fi);
}
#else /* CONFIG_IP_ROUTE_MULTIPATH */
+static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
+ int remaining, struct fib_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ NL_SET_ERR_MSG(extack, "Multipath support not enabled in kernel");
+
+ return -EINVAL;
+}
+
#define fib_rebalance(fi) do { } while (0)
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
-static int fib_encap_match(u16 encap_type,
+static int fib_encap_match(struct net *net, u16 encap_type,
struct nlattr *encap,
const struct fib_nh *nh,
const struct fib_config *cfg,
@@ -619,17 +888,17 @@ static int fib_encap_match(u16 encap_type,
if (encap_type == LWTUNNEL_ENCAP_NONE)
return 0;
- ret = lwtunnel_build_state(encap_type, encap, AF_INET,
+ ret = lwtunnel_build_state(net, encap_type, encap, AF_INET,
cfg, &lwtstate, extack);
if (!ret) {
- result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
+ result = lwtunnel_cmp_encap(lwtstate, nh->fib_nh_lws);
lwtstate_free(lwtstate);
}
return result;
}
-int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
+int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
struct netlink_ext_ack *extack)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -640,21 +909,46 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
return 1;
- if (cfg->fc_oif || cfg->fc_gw) {
+ if (cfg->fc_nh_id) {
+ if (fi->nh && cfg->fc_nh_id == fi->nh->id)
+ return 0;
+ return 1;
+ }
+
+ if (fi->nh) {
+ if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_mp)
+ return 1;
+ return 0;
+ }
+
+ if (cfg->fc_oif || cfg->fc_gw_family) {
+ struct fib_nh *nh;
+
+ nh = fib_info_nh(fi, 0);
if (cfg->fc_encap) {
- if (fib_encap_match(cfg->fc_encap_type, cfg->fc_encap,
- fi->fib_nh, cfg, extack))
+ if (fib_encap_match(net, cfg->fc_encap_type,
+ cfg->fc_encap, nh, cfg, extack))
return 1;
}
#ifdef CONFIG_IP_ROUTE_CLASSID
if (cfg->fc_flow &&
- cfg->fc_flow != fi->fib_nh->nh_tclassid)
+ cfg->fc_flow != nh->nh_tclassid)
return 1;
#endif
- if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
- (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw))
- return 0;
- return 1;
+ if ((cfg->fc_oif && cfg->fc_oif != nh->fib_nh_oif) ||
+ (cfg->fc_gw_family &&
+ cfg->fc_gw_family != nh->fib_nh_gw_family))
+ return 1;
+
+ if (cfg->fc_gw_family == AF_INET &&
+ cfg->fc_gw4 != nh->fib_nh_gw4)
+ return 1;
+
+ if (cfg->fc_gw_family == AF_INET6 &&
+ ipv6_addr_cmp(&cfg->fc_gw6, &nh->fib_nh_gw6))
+ return 1;
+
+ return 0;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -670,20 +964,64 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
if (!rtnh_ok(rtnh, remaining))
return -EINVAL;
- if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
+ if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->fib_nh_oif)
return 1;
attrlen = rtnh_attrlen(rtnh);
if (attrlen > 0) {
- struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+ struct nlattr *nla, *nlav, *attrs = rtnh_attrs(rtnh);
+ int err;
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
- if (nla && nla_get_in_addr(nla) != nh->nh_gw)
- return 1;
+ nlav = nla_find(attrs, attrlen, RTA_VIA);
+ if (nla && nlav) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop configuration can not contain both GATEWAY and VIA");
+ return -EINVAL;
+ }
+
+ if (nla) {
+ __be32 gw;
+
+ err = fib_gw_from_attr(&gw, nla, extack);
+ if (err)
+ return err;
+
+ if (nh->fib_nh_gw_family != AF_INET ||
+ gw != nh->fib_nh_gw4)
+ return 1;
+ } else if (nlav) {
+ struct fib_config cfg2;
+
+ err = fib_gw_from_via(&cfg2, nlav, extack);
+ if (err)
+ return err;
+
+ switch (nh->fib_nh_gw_family) {
+ case AF_INET:
+ if (cfg2.fc_gw_family != AF_INET ||
+ cfg2.fc_gw4 != nh->fib_nh_gw4)
+ return 1;
+ break;
+ case AF_INET6:
+ if (cfg2.fc_gw_family != AF_INET6 ||
+ ipv6_addr_cmp(&cfg2.fc_gw6,
+ &nh->fib_nh_gw6))
+ return 1;
+ break;
+ }
+ }
+
#ifdef CONFIG_IP_ROUTE_CLASSID
nla = nla_find(attrs, attrlen, RTA_FLOW);
- if (nla && nla_get_u32(nla) != nh->nh_tclassid)
- return 1;
+ if (nla) {
+ if (nla_len(nla) < sizeof(u32)) {
+ NL_SET_ERR_MSG(extack, "Invalid RTA_FLOW");
+ return -EINVAL;
+ }
+ if (nla_get_u32(nla) != nh->nh_tclassid)
+ return 1;
+ }
#endif
}
@@ -710,12 +1048,13 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
if (type > RTAX_MAX)
return false;
+ type = array_index_nospec(type, RTAX_MAX + 1);
if (type == RTAX_CC_ALGO) {
char tmp[TCP_CA_NAME_MAX];
bool ecn_ca = false;
- nla_strlcpy(tmp, nla, sizeof(tmp));
- val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
+ nla_strscpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
} else {
if (nla_len(nla) != sizeof(u32))
return false;
@@ -733,6 +1072,31 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
return true;
}
+static int fib_check_nh_v6_gw(struct net *net, struct fib_nh *nh,
+ u32 table, struct netlink_ext_ack *extack)
+{
+ struct fib6_config cfg = {
+ .fc_table = table,
+ .fc_flags = nh->fib_nh_flags | RTF_GATEWAY,
+ .fc_ifindex = nh->fib_nh_oif,
+ .fc_gateway = nh->fib_nh_gw6,
+ };
+ struct fib6_nh fib6_nh = {};
+ int err;
+
+ err = ipv6_stub->fib6_nh_init(net, &fib6_nh, &cfg, GFP_KERNEL, extack);
+ if (!err) {
+ nh->fib_nh_dev = fib6_nh.fib_nh_dev;
+ netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker,
+ GFP_KERNEL);
+ nh->fib_nh_oif = nh->fib_nh_dev->ifindex;
+ nh->fib_nh_scope = RT_SCOPE_LINK;
+
+ ipv6_stub->fib6_nh_release(&fib6_nh);
+ }
+
+ return err;
+}
/*
* Picture
@@ -777,221 +1141,185 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
* |
* |-> {local prefix} (terminal node)
*/
-static int fib_check_nh(struct fib_config *cfg, struct fib_nh *nh,
- struct netlink_ext_ack *extack)
+static int fib_check_nh_v4_gw(struct net *net, struct fib_nh *nh, u32 table,
+ u8 scope, struct netlink_ext_ack *extack)
{
- int err = 0;
- struct net *net;
struct net_device *dev;
+ struct fib_result res;
+ int err = 0;
- net = cfg->fc_nlinfo.nl_net;
- if (nh->nh_gw) {
- struct fib_result res;
-
- if (nh->nh_flags & RTNH_F_ONLINK) {
- unsigned int addr_type;
+ if (nh->fib_nh_flags & RTNH_F_ONLINK) {
+ unsigned int addr_type;
- if (cfg->fc_scope >= RT_SCOPE_LINK) {
- NL_SET_ERR_MSG(extack,
- "Nexthop has invalid scope");
- return -EINVAL;
- }
- dev = __dev_get_by_index(net, nh->nh_oif);
- if (!dev)
- return -ENODEV;
- if (!(dev->flags & IFF_UP)) {
- NL_SET_ERR_MSG(extack,
- "Nexthop device is not up");
- return -ENETDOWN;
- }
- addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw);
- if (addr_type != RTN_UNICAST) {
- NL_SET_ERR_MSG(extack,
- "Nexthop has invalid gateway");
- return -EINVAL;
- }
- if (!netif_carrier_ok(dev))
- nh->nh_flags |= RTNH_F_LINKDOWN;
- nh->nh_dev = dev;
- dev_hold(dev);
- nh->nh_scope = RT_SCOPE_LINK;
- return 0;
+ if (scope >= RT_SCOPE_LINK) {
+ NL_SET_ERR_MSG(extack, "Nexthop has invalid scope");
+ return -EINVAL;
}
- rcu_read_lock();
- {
- struct fib_table *tbl = NULL;
- struct flowi4 fl4 = {
- .daddr = nh->nh_gw,
- .flowi4_scope = cfg->fc_scope + 1,
- .flowi4_oif = nh->nh_oif,
- .flowi4_iif = LOOPBACK_IFINDEX,
- };
-
- /* It is not necessary, but requires a bit of thinking */
- if (fl4.flowi4_scope < RT_SCOPE_LINK)
- fl4.flowi4_scope = RT_SCOPE_LINK;
-
- if (cfg->fc_table)
- tbl = fib_get_table(net, cfg->fc_table);
-
- if (tbl)
- err = fib_table_lookup(tbl, &fl4, &res,
- FIB_LOOKUP_IGNORE_LINKSTATE |
- FIB_LOOKUP_NOREF);
-
- /* on error or if no table given do full lookup. This
- * is needed for example when nexthops are in the local
- * table rather than the given table
- */
- if (!tbl || err) {
- err = fib_lookup(net, &fl4, &res,
- FIB_LOOKUP_IGNORE_LINKSTATE);
- }
-
- if (err) {
- NL_SET_ERR_MSG(extack,
- "Nexthop has invalid gateway");
- rcu_read_unlock();
- return err;
- }
+ dev = __dev_get_by_index(net, nh->fib_nh_oif);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack, "Nexthop device required for onlink");
+ return -ENODEV;
}
- err = -EINVAL;
- if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) {
- NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
- goto out;
+ if (!(dev->flags & IFF_UP)) {
+ NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+ return -ENETDOWN;
}
- nh->nh_scope = res.scope;
- nh->nh_oif = FIB_RES_OIF(res);
- nh->nh_dev = dev = FIB_RES_DEV(res);
- if (!dev) {
- NL_SET_ERR_MSG(extack,
- "No egress device for nexthop gateway");
- goto out;
+ addr_type = inet_addr_type_dev_table(net, dev, nh->fib_nh_gw4);
+ if (addr_type != RTN_UNICAST) {
+ NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+ return -EINVAL;
}
- dev_hold(dev);
if (!netif_carrier_ok(dev))
- nh->nh_flags |= RTNH_F_LINKDOWN;
- err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
- } else {
- struct in_device *in_dev;
-
- if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) {
- NL_SET_ERR_MSG(extack,
- "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set");
- return -EINVAL;
+ nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+ nh->fib_nh_dev = dev;
+ netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
+ nh->fib_nh_scope = RT_SCOPE_LINK;
+ return 0;
+ }
+ rcu_read_lock();
+ {
+ struct fib_table *tbl = NULL;
+ struct flowi4 fl4 = {
+ .daddr = nh->fib_nh_gw4,
+ .flowi4_scope = scope + 1,
+ .flowi4_oif = nh->fib_nh_oif,
+ .flowi4_iif = LOOPBACK_IFINDEX,
+ };
+
+ /* It is not necessary, but requires a bit of thinking */
+ if (fl4.flowi4_scope < RT_SCOPE_LINK)
+ fl4.flowi4_scope = RT_SCOPE_LINK;
+
+ if (table && table != RT_TABLE_MAIN)
+ tbl = fib_get_table(net, table);
+
+ if (tbl)
+ err = fib_table_lookup(tbl, &fl4, &res,
+ FIB_LOOKUP_IGNORE_LINKSTATE |
+ FIB_LOOKUP_NOREF);
+
+ /* on error or if no table given do full lookup. This
+ * is needed for example when nexthops are in the local
+ * table rather than the given table
+ */
+ if (!tbl || err) {
+ err = fib_lookup(net, &fl4, &res,
+ FIB_LOOKUP_IGNORE_LINKSTATE);
}
- rcu_read_lock();
- err = -ENODEV;
- in_dev = inetdev_by_index(net, nh->nh_oif);
- if (!in_dev)
- goto out;
- err = -ENETDOWN;
- if (!(in_dev->dev->flags & IFF_UP)) {
- NL_SET_ERR_MSG(extack, "Device for nexthop is not up");
+
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
goto out;
}
- nh->nh_dev = in_dev->dev;
- dev_hold(nh->nh_dev);
- nh->nh_scope = RT_SCOPE_HOST;
- if (!netif_carrier_ok(nh->nh_dev))
- nh->nh_flags |= RTNH_F_LINKDOWN;
- err = 0;
}
+
+ err = -EINVAL;
+ if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) {
+ NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
+ goto out;
+ }
+ nh->fib_nh_scope = res.scope;
+ nh->fib_nh_oif = FIB_RES_OIF(res);
+ nh->fib_nh_dev = dev = FIB_RES_DEV(res);
+ if (!dev) {
+ NL_SET_ERR_MSG(extack,
+ "No egress device for nexthop gateway");
+ goto out;
+ }
+ netdev_hold(dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
+ if (!netif_carrier_ok(dev))
+ nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+ err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
out:
rcu_read_unlock();
return err;
}
-static inline unsigned int fib_laddr_hashfn(__be32 val)
+static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh,
+ struct netlink_ext_ack *extack)
{
- unsigned int mask = (fib_info_hash_size - 1);
+ struct in_device *in_dev;
+ int err;
- return ((__force u32)val ^
- ((__force u32)val >> 7) ^
- ((__force u32)val >> 14)) & mask;
-}
+ if (nh->fib_nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid flags for nexthop - PERVASIVE and ONLINK can not be set");
+ return -EINVAL;
+ }
-static struct hlist_head *fib_info_hash_alloc(int bytes)
-{
- if (bytes <= PAGE_SIZE)
- return kzalloc(bytes, GFP_KERNEL);
- else
- return (struct hlist_head *)
- __get_free_pages(GFP_KERNEL | __GFP_ZERO,
- get_order(bytes));
+ rcu_read_lock();
+
+ err = -ENODEV;
+ in_dev = inetdev_by_index(net, nh->fib_nh_oif);
+ if (!in_dev)
+ goto out;
+ err = -ENETDOWN;
+ if (!(in_dev->dev->flags & IFF_UP)) {
+ NL_SET_ERR_MSG(extack, "Device for nexthop is not up");
+ goto out;
+ }
+
+ nh->fib_nh_dev = in_dev->dev;
+ netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC);
+ nh->fib_nh_scope = RT_SCOPE_HOST;
+ if (!netif_carrier_ok(nh->fib_nh_dev))
+ nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+ err = 0;
+out:
+ rcu_read_unlock();
+ return err;
}
-static void fib_info_hash_free(struct hlist_head *hash, int bytes)
+int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope,
+ struct netlink_ext_ack *extack)
{
- if (!hash)
- return;
+ int err;
- if (bytes <= PAGE_SIZE)
- kfree(hash);
+ if (nh->fib_nh_gw_family == AF_INET)
+ err = fib_check_nh_v4_gw(net, nh, table, scope, extack);
+ else if (nh->fib_nh_gw_family == AF_INET6)
+ err = fib_check_nh_v6_gw(net, nh, table, extack);
else
- free_pages((unsigned long) hash, get_order(bytes));
+ err = fib_check_nh_nongw(net, nh, extack);
+
+ return err;
}
-static void fib_info_hash_move(struct hlist_head *new_info_hash,
- struct hlist_head *new_laddrhash,
- unsigned int new_size)
+__be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc,
+ unsigned char scope)
{
- struct hlist_head *old_info_hash, *old_laddrhash;
- unsigned int old_size = fib_info_hash_size;
- unsigned int i, bytes;
-
- spin_lock_bh(&fib_info_lock);
- old_info_hash = fib_info_hash;
- old_laddrhash = fib_info_laddrhash;
- fib_info_hash_size = new_size;
-
- for (i = 0; i < old_size; i++) {
- struct hlist_head *head = &fib_info_hash[i];
- struct hlist_node *n;
- struct fib_info *fi;
+ struct fib_nh *nh;
+ __be32 saddr;
- hlist_for_each_entry_safe(fi, n, head, fib_hash) {
- struct hlist_head *dest;
- unsigned int new_hash;
+ if (nhc->nhc_family != AF_INET)
+ return inet_select_addr(nhc->nhc_dev, 0, scope);
- new_hash = fib_info_hashfn(fi);
- dest = &new_info_hash[new_hash];
- hlist_add_head(&fi->fib_hash, dest);
- }
- }
- fib_info_hash = new_info_hash;
+ nh = container_of(nhc, struct fib_nh, nh_common);
+ saddr = inet_select_addr(nh->fib_nh_dev, nh->fib_nh_gw4, scope);
- for (i = 0; i < old_size; i++) {
- struct hlist_head *lhead = &fib_info_laddrhash[i];
- struct hlist_node *n;
- struct fib_info *fi;
+ WRITE_ONCE(nh->nh_saddr, saddr);
+ WRITE_ONCE(nh->nh_saddr_genid, atomic_read(&net->ipv4.dev_addr_genid));
- hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
- struct hlist_head *ldest;
- unsigned int new_hash;
+ return saddr;
+}
- new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
- ldest = &new_laddrhash[new_hash];
- hlist_add_head(&fi->fib_lhash, ldest);
- }
- }
- fib_info_laddrhash = new_laddrhash;
+__be32 fib_result_prefsrc(struct net *net, struct fib_result *res)
+{
+ struct fib_nh_common *nhc = res->nhc;
- spin_unlock_bh(&fib_info_lock);
+ if (res->fi->fib_prefsrc)
+ return res->fi->fib_prefsrc;
- bytes = old_size * sizeof(struct hlist_head *);
- fib_info_hash_free(old_info_hash, bytes);
- fib_info_hash_free(old_laddrhash, bytes);
-}
+ if (nhc->nhc_family == AF_INET) {
+ struct fib_nh *nh;
-__be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
-{
- nh->nh_saddr = inet_select_addr(nh->nh_dev,
- nh->nh_gw,
- nh->nh_parent->fib_scope);
- nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
+ nh = container_of(nhc, struct fib_nh, nh_common);
+ if (READ_ONCE(nh->nh_saddr_genid) ==
+ atomic_read(&net->ipv4.dev_addr_genid))
+ return READ_ONCE(nh->nh_saddr);
+ }
- return nh->nh_saddr;
+ return fib_info_update_nhc_saddr(net, nhc, res->fi->fib_scope);
}
static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
@@ -1018,22 +1346,17 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
return true;
}
-static int
-fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
-{
- return ip_metrics_convert(fi->fib_net, cfg->fc_mx, cfg->fc_mx_len,
- fi->fib_metrics->metrics);
-}
-
struct fib_info *fib_create_info(struct fib_config *cfg,
struct netlink_ext_ack *extack)
{
int err;
struct fib_info *fi = NULL;
+ struct nexthop *nh = NULL;
struct fib_info *ofi;
int nhs = 1;
struct net *net = cfg->fc_nlinfo.nl_net;
+ ASSERT_RTNL();
if (cfg->fc_type > RTN_MAX)
goto err_inval;
@@ -1049,6 +1372,23 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto err_inval;
}
+ if (cfg->fc_nh_id) {
+ if (!cfg->fc_mx) {
+ fi = fib_find_info_nh(net, cfg);
+ if (fi) {
+ refcount_inc(&fi->fib_treeref);
+ return fi;
+ }
+ }
+
+ nh = nexthop_find_by_id(net, cfg->fc_nh_id);
+ if (!nh) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ goto err_inval;
+ }
+ nhs = 0;
+ }
+
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (cfg->fc_mp) {
nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack);
@@ -1057,42 +1397,21 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
}
#endif
- err = -ENOBUFS;
- if (fib_info_cnt >= fib_info_hash_size) {
- unsigned int new_size = fib_info_hash_size << 1;
- struct hlist_head *new_info_hash;
- struct hlist_head *new_laddrhash;
- unsigned int bytes;
-
- if (!new_size)
- new_size = 16;
- bytes = new_size * sizeof(struct hlist_head *);
- new_info_hash = fib_info_hash_alloc(bytes);
- new_laddrhash = fib_info_hash_alloc(bytes);
- if (!new_info_hash || !new_laddrhash) {
- fib_info_hash_free(new_info_hash, bytes);
- fib_info_hash_free(new_laddrhash, bytes);
- } else
- fib_info_hash_move(new_info_hash, new_laddrhash, new_size);
-
- if (!fib_info_hash_size)
- goto failure;
- }
+ fib_info_hash_grow(net);
- fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
- if (!fi)
+ fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL);
+ if (!fi) {
+ err = -ENOBUFS;
goto failure;
- if (cfg->fc_mx) {
- fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL);
- if (unlikely(!fi->fib_metrics)) {
- kfree(fi);
- return ERR_PTR(err);
- }
- refcount_set(&fi->fib_metrics->refcnt, 1);
- } else {
- fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics;
}
- fib_info_cnt++;
+
+ fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack);
+ if (IS_ERR(fi->fib_metrics)) {
+ err = PTR_ERR(fi->fib_metrics);
+ kfree(fi);
+ return ERR_PTR(err);
+ }
+
fi->fib_net = net;
fi->fib_protocol = cfg->fc_protocol;
fi->fib_scope = cfg->fc_scope;
@@ -1103,78 +1422,31 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
fi->fib_tb_id = cfg->fc_table;
fi->fib_nhs = nhs;
- change_nexthops(fi) {
- nexthop_nh->nh_parent = fi;
- nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
- if (!nexthop_nh->nh_pcpu_rth_output)
- goto failure;
- } endfor_nexthops(fi)
-
- err = fib_convert_metrics(fi, cfg);
- if (err)
- goto failure;
-
- if (cfg->fc_mp) {
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg, extack);
- if (err != 0)
- goto failure;
- if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif) {
- NL_SET_ERR_MSG(extack,
- "Nexthop device index does not match RTA_OIF");
- goto err_inval;
- }
- if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) {
- NL_SET_ERR_MSG(extack,
- "Nexthop gateway does not match RTA_GATEWAY");
- goto err_inval;
- }
-#ifdef CONFIG_IP_ROUTE_CLASSID
- if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) {
- NL_SET_ERR_MSG(extack,
- "Nexthop class id does not match RTA_FLOW");
- goto err_inval;
+ if (nh) {
+ if (!nexthop_get(nh)) {
+ NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+ err = -EINVAL;
+ } else {
+ err = 0;
+ fi->nh = nh;
}
-#endif
-#else
- NL_SET_ERR_MSG(extack,
- "Multipath support not enabled in kernel");
- goto err_inval;
-#endif
} else {
- struct fib_nh *nh = fi->fib_nh;
-
- if (cfg->fc_encap) {
- struct lwtunnel_state *lwtstate;
-
- if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) {
- NL_SET_ERR_MSG(extack,
- "LWT encap type not specified");
- goto err_inval;
- }
- err = lwtunnel_build_state(cfg->fc_encap_type,
- cfg->fc_encap, AF_INET, cfg,
- &lwtstate, extack);
- if (err)
- goto failure;
+ change_nexthops(fi) {
+ nexthop_nh->nh_parent = fi;
+ } endfor_nexthops(fi)
- nh->nh_lwtstate = lwtstate_get(lwtstate);
- }
- nh->nh_oif = cfg->fc_oif;
- nh->nh_gw = cfg->fc_gw;
- nh->nh_flags = cfg->fc_flags;
-#ifdef CONFIG_IP_ROUTE_CLASSID
- nh->nh_tclassid = cfg->fc_flow;
- if (nh->nh_tclassid)
- fi->fib_net->ipv4.fib_num_tclassid_users++;
-#endif
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- nh->nh_weight = 1;
-#endif
+ if (cfg->fc_mp)
+ err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg,
+ extack);
+ else
+ err = fib_nh_init(net, fi->fib_nh, cfg, 1, extack);
}
+ if (err != 0)
+ goto failure;
+
if (fib_props[cfg->fc_type].error) {
- if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp) {
+ if (cfg->fc_gw_family || cfg->fc_oif || cfg->fc_mp) {
NL_SET_ERR_MSG(extack,
"Gateway, device and multipath can not be specified for this route type");
goto err_inval;
@@ -1199,7 +1471,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto err_inval;
}
- if (cfg->fc_scope == RT_SCOPE_HOST) {
+ if (fi->nh) {
+ err = fib_check_nexthop(fi->nh, cfg->fc_scope, extack);
+ if (err)
+ goto failure;
+ } else if (cfg->fc_scope == RT_SCOPE_HOST) {
struct fib_nh *nh = fi->fib_nh;
/* Local address is added. */
@@ -1208,24 +1484,28 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
"Route with host scope can not have multiple nexthops");
goto err_inval;
}
- if (nh->nh_gw) {
+ if (nh->fib_nh_gw_family) {
NL_SET_ERR_MSG(extack,
"Route with host scope can not have a gateway");
goto err_inval;
}
- nh->nh_scope = RT_SCOPE_NOWHERE;
- nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
+ nh->fib_nh_scope = RT_SCOPE_NOWHERE;
+ nh->fib_nh_dev = dev_get_by_index(net, nh->fib_nh_oif);
err = -ENODEV;
- if (!nh->nh_dev)
+ if (!nh->fib_nh_dev)
goto failure;
+ netdev_tracker_alloc(nh->fib_nh_dev, &nh->fib_nh_dev_tracker,
+ GFP_KERNEL);
} else {
int linkdown = 0;
change_nexthops(fi) {
- err = fib_check_nh(cfg, nexthop_nh, extack);
+ err = fib_check_nh(cfg->fc_nlinfo.nl_net, nexthop_nh,
+ cfg->fc_table, cfg->fc_scope,
+ extack);
if (err != 0)
goto failure;
- if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
+ if (nexthop_nh->fib_nh_flags & RTNH_F_LINKDOWN)
linkdown++;
} endfor_nexthops(fi)
if (linkdown == fi->fib_nhs)
@@ -1237,43 +1517,51 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
goto err_inval;
}
- change_nexthops(fi) {
- fib_info_update_nh_saddr(net, nexthop_nh);
- } endfor_nexthops(fi)
+ if (!fi->nh) {
+ change_nexthops(fi) {
+ fib_info_update_nhc_saddr(net, &nexthop_nh->nh_common,
+ fi->fib_scope);
+ if (nexthop_nh->fib_nh_gw_family == AF_INET6)
+ fi->fib_nh_is_v6 = true;
+ } endfor_nexthops(fi)
- fib_rebalance(fi);
+ fib_rebalance(fi);
+ }
link_it:
ofi = fib_find_info(fi);
if (ofi) {
+ /* fib_table_lookup() should not see @fi yet. */
fi->fib_dead = 1;
free_fib_info(fi);
- ofi->fib_treeref++;
+ refcount_inc(&ofi->fib_treeref);
return ofi;
}
- fi->fib_treeref++;
+ refcount_set(&fi->fib_treeref, 1);
refcount_set(&fi->fib_clntref, 1);
- spin_lock_bh(&fib_info_lock);
- hlist_add_head(&fi->fib_hash,
- &fib_info_hash[fib_info_hashfn(fi)]);
+
+ net->ipv4.fib_info_cnt++;
+ hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi));
+
if (fi->fib_prefsrc) {
struct hlist_head *head;
- head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
+ head = fib_info_laddrhash_bucket(net, fi->fib_prefsrc);
hlist_add_head(&fi->fib_lhash, head);
}
- change_nexthops(fi) {
- struct hlist_head *head;
- unsigned int hash;
+ if (fi->nh) {
+ list_add(&fi->nh_list, &nh->fi_list);
+ } else {
+ change_nexthops(fi) {
+ struct hlist_head *head;
- if (!nexthop_nh->nh_dev)
- continue;
- hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
- head = &fib_info_devhash[hash];
- hlist_add_head(&nexthop_nh->nh_hash, head);
- } endfor_nexthops(fi)
- spin_unlock_bh(&fib_info_lock);
+ if (!nexthop_nh->fib_nh_dev)
+ continue;
+ head = fib_nh_head(nexthop_nh->fib_nh_dev);
+ hlist_add_head_rcu(&nexthop_nh->nh_hash, head);
+ } endfor_nexthops(fi)
+ }
return fi;
err_inval:
@@ -1281,6 +1569,7 @@ err_inval:
failure:
if (fi) {
+ /* fib_table_lookup() should not see @fi yet. */
fi->fib_dead = 1;
free_fib_info(fi);
}
@@ -1288,10 +1577,155 @@ failure:
return ERR_PTR(err);
}
+int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
+ u8 rt_family, unsigned char *flags, bool skip_oif)
+{
+ if (nhc->nhc_flags & RTNH_F_DEAD)
+ *flags |= RTNH_F_DEAD;
+
+ if (nhc->nhc_flags & RTNH_F_LINKDOWN) {
+ *flags |= RTNH_F_LINKDOWN;
+
+ rcu_read_lock();
+ switch (nhc->nhc_family) {
+ case AF_INET:
+ if (ip_ignore_linkdown(nhc->nhc_dev))
+ *flags |= RTNH_F_DEAD;
+ break;
+ case AF_INET6:
+ if (ip6_ignore_linkdown(nhc->nhc_dev))
+ *flags |= RTNH_F_DEAD;
+ break;
+ }
+ rcu_read_unlock();
+ }
+
+ switch (nhc->nhc_gw_family) {
+ case AF_INET:
+ if (nla_put_in_addr(skb, RTA_GATEWAY, nhc->nhc_gw.ipv4))
+ goto nla_put_failure;
+ break;
+ case AF_INET6:
+ /* if gateway family does not match nexthop family
+ * gateway is encoded as RTA_VIA
+ */
+ if (rt_family != nhc->nhc_gw_family) {
+ int alen = sizeof(struct in6_addr);
+ struct nlattr *nla;
+ struct rtvia *via;
+
+ nla = nla_reserve(skb, RTA_VIA, alen + 2);
+ if (!nla)
+ goto nla_put_failure;
+
+ via = nla_data(nla);
+ via->rtvia_family = AF_INET6;
+ memcpy(via->rtvia_addr, &nhc->nhc_gw.ipv6, alen);
+ } else if (nla_put_in6_addr(skb, RTA_GATEWAY,
+ &nhc->nhc_gw.ipv6) < 0) {
+ goto nla_put_failure;
+ }
+ break;
+ }
+
+ *flags |= (nhc->nhc_flags &
+ (RTNH_F_ONLINK | RTNH_F_OFFLOAD | RTNH_F_TRAP));
+
+ if (!skip_oif && nhc->nhc_dev &&
+ nla_put_u32(skb, RTA_OIF, nhc->nhc_dev->ifindex))
+ goto nla_put_failure;
+
+ if (lwtunnel_fill_encap(skb, nhc->nhc_lwtstate,
+ RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(fib_nexthop_info);
+
+#if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6)
+int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
+ int nh_weight, u8 rt_family, u32 nh_tclassid)
+{
+ const struct net_device *dev = nhc->nhc_dev;
+ struct rtnexthop *rtnh;
+ unsigned char flags = 0;
+
+ rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
+ if (!rtnh)
+ goto nla_put_failure;
+
+ rtnh->rtnh_hops = nh_weight - 1;
+ rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
+
+ if (fib_nexthop_info(skb, nhc, rt_family, &flags, true) < 0)
+ goto nla_put_failure;
+
+ rtnh->rtnh_flags = flags;
+
+ if (nh_tclassid && nla_put_u32(skb, RTA_FLOW, nh_tclassid))
+ goto nla_put_failure;
+
+ /* length of rtnetlink header + attributes */
+ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(fib_add_nexthop);
+#endif
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
+{
+ struct nlattr *mp;
+
+ mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
+ if (!mp)
+ goto nla_put_failure;
+
+ if (unlikely(fi->nh)) {
+ if (nexthop_mpath_fill_node(skb, fi->nh, AF_INET) < 0)
+ goto nla_put_failure;
+ goto mp_end;
+ }
+
+ for_nexthops(fi) {
+ u32 nh_tclassid = 0;
+#ifdef CONFIG_IP_ROUTE_CLASSID
+ nh_tclassid = nh->nh_tclassid;
+#endif
+ if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight,
+ AF_INET, nh_tclassid) < 0)
+ goto nla_put_failure;
+ } endfor_nexthops(fi);
+
+mp_end:
+ nla_nest_end(skb, mp);
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+#else
+static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
+{
+ return 0;
+}
+#endif
+
int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
- u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
- struct fib_info *fi, unsigned int flags)
+ const struct fib_rt_info *fri, unsigned int flags)
{
+ unsigned int nhs = fib_info_num_path(fri->fi);
+ struct fib_info *fi = fri->fi;
+ u32 tb_id = fri->tb_id;
struct nlmsghdr *nlh;
struct rtmsg *rtm;
@@ -1301,22 +1735,22 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
rtm = nlmsg_data(nlh);
rtm->rtm_family = AF_INET;
- rtm->rtm_dst_len = dst_len;
+ rtm->rtm_dst_len = fri->dst_len;
rtm->rtm_src_len = 0;
- rtm->rtm_tos = tos;
+ rtm->rtm_tos = inet_dscp_to_dsfield(fri->dscp);
if (tb_id < 256)
rtm->rtm_table = tb_id;
else
rtm->rtm_table = RT_TABLE_COMPAT;
if (nla_put_u32(skb, RTA_TABLE, tb_id))
goto nla_put_failure;
- rtm->rtm_type = type;
+ rtm->rtm_type = fri->type;
rtm->rtm_flags = fi->fib_flags;
rtm->rtm_scope = fi->fib_scope;
rtm->rtm_protocol = fi->fib_protocol;
if (rtm->rtm_dst_len &&
- nla_put_in_addr(skb, RTA_DST, dst))
+ nla_put_in_addr(skb, RTA_DST, fri->dst))
goto nla_put_failure;
if (fi->fib_priority &&
nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
@@ -1327,81 +1761,47 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
if (fi->fib_prefsrc &&
nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
goto nla_put_failure;
- if (fi->fib_nhs == 1) {
- if (fi->fib_nh->nh_gw &&
- nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
- goto nla_put_failure;
- if (fi->fib_nh->nh_oif &&
- nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
- goto nla_put_failure;
- if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) {
- struct in_device *in_dev;
-
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(fi->fib_nh->nh_dev);
- if (in_dev &&
- IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
- rtm->rtm_flags |= RTNH_F_DEAD;
- rcu_read_unlock();
- }
- if (fi->fib_nh->nh_flags & RTNH_F_OFFLOAD)
- rtm->rtm_flags |= RTNH_F_OFFLOAD;
-#ifdef CONFIG_IP_ROUTE_CLASSID
- if (fi->fib_nh[0].nh_tclassid &&
- nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
- goto nla_put_failure;
-#endif
- if (fi->fib_nh->nh_lwtstate &&
- lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate) < 0)
- goto nla_put_failure;
- }
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (fi->fib_nhs > 1) {
- struct rtnexthop *rtnh;
- struct nlattr *mp;
- mp = nla_nest_start(skb, RTA_MULTIPATH);
- if (!mp)
+ if (fi->nh) {
+ if (nla_put_u32(skb, RTA_NH_ID, fi->nh->id))
goto nla_put_failure;
+ if (nexthop_is_blackhole(fi->nh))
+ rtm->rtm_type = RTN_BLACKHOLE;
+ if (!READ_ONCE(fi->fib_net->ipv4.sysctl_nexthop_compat_mode))
+ goto offload;
+ }
- for_nexthops(fi) {
- rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
- if (!rtnh)
- goto nla_put_failure;
-
- rtnh->rtnh_flags = nh->nh_flags & 0xFF;
- if (nh->nh_flags & RTNH_F_LINKDOWN) {
- struct in_device *in_dev;
+ if (nhs == 1) {
+ const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
+ unsigned char flags = 0;
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(nh->nh_dev);
- if (in_dev &&
- IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
- rtnh->rtnh_flags |= RTNH_F_DEAD;
- rcu_read_unlock();
- }
- rtnh->rtnh_hops = nh->nh_weight - 1;
- rtnh->rtnh_ifindex = nh->nh_oif;
+ if (fib_nexthop_info(skb, nhc, AF_INET, &flags, false) < 0)
+ goto nla_put_failure;
- if (nh->nh_gw &&
- nla_put_in_addr(skb, RTA_GATEWAY, nh->nh_gw))
- goto nla_put_failure;
+ rtm->rtm_flags = flags;
#ifdef CONFIG_IP_ROUTE_CLASSID
+ if (nhc->nhc_family == AF_INET) {
+ struct fib_nh *nh;
+
+ nh = container_of(nhc, struct fib_nh, nh_common);
if (nh->nh_tclassid &&
nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
goto nla_put_failure;
+ }
#endif
- if (nh->nh_lwtstate &&
- lwtunnel_fill_encap(skb, nh->nh_lwtstate) < 0)
- goto nla_put_failure;
+ } else {
+ if (fib_add_multipath(skb, fi) < 0)
+ goto nla_put_failure;
+ }
- /* length of rtnetlink header + attributes */
- rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
- } endfor_nexthops(fi);
+offload:
+ if (fri->offload)
+ rtm->rtm_flags |= RTM_F_OFFLOAD;
+ if (fri->trap)
+ rtm->rtm_flags |= RTM_F_TRAP;
+ if (fri->offload_failed)
+ rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
- nla_nest_end(skb, mp);
- }
-#endif
nlmsg_end(skb, nlh);
return 0;
@@ -1418,51 +1818,51 @@ nla_put_failure:
*/
int fib_sync_down_addr(struct net_device *dev, __be32 local)
{
- int ret = 0;
- unsigned int hash = fib_laddr_hashfn(local);
- struct hlist_head *head = &fib_info_laddrhash[hash];
+ int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
struct net *net = dev_net(dev);
- int tb_id = l3mdev_fib_table(dev);
+ struct hlist_head *head;
struct fib_info *fi;
+ int ret = 0;
- if (!fib_info_laddrhash || local == 0)
+ if (!local)
return 0;
+ head = fib_info_laddrhash_bucket(net, local);
hlist_for_each_entry(fi, head, fib_lhash) {
if (!net_eq(fi->fib_net, net) ||
fi->fib_tb_id != tb_id)
continue;
if (fi->fib_prefsrc == local) {
fi->fib_flags |= RTNH_F_DEAD;
+ fi->pfsrc_removed = true;
ret++;
}
}
return ret;
}
-static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
+static int call_fib_nh_notifiers(struct fib_nh *nh,
enum fib_event_type event_type)
{
- struct in_device *in_dev = __in_dev_get_rtnl(fib_nh->nh_dev);
+ bool ignore_link_down = ip_ignore_linkdown(nh->fib_nh_dev);
struct fib_nh_notifier_info info = {
- .fib_nh = fib_nh,
+ .fib_nh = nh,
};
switch (event_type) {
case FIB_EVENT_NH_ADD:
- if (fib_nh->nh_flags & RTNH_F_DEAD)
+ if (nh->fib_nh_flags & RTNH_F_DEAD)
break;
- if (IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
- fib_nh->nh_flags & RTNH_F_LINKDOWN)
+ if (ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN)
break;
- return call_fib4_notifiers(dev_net(fib_nh->nh_dev), event_type,
+ return call_fib4_notifiers(dev_net(nh->fib_nh_dev), event_type,
&info.info);
case FIB_EVENT_NH_DEL:
- if ((in_dev && IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
- fib_nh->nh_flags & RTNH_F_LINKDOWN) ||
- (fib_nh->nh_flags & RTNH_F_DEAD))
- return call_fib4_notifiers(dev_net(fib_nh->nh_dev),
+ if ((ignore_link_down && nh->fib_nh_flags & RTNH_F_LINKDOWN) ||
+ (nh->fib_nh_flags & RTNH_F_DEAD))
+ return call_fib4_notifiers(dev_net(nh->fib_nh_dev),
event_type, &info.info);
+ break;
default:
break;
}
@@ -1470,20 +1870,70 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh,
return NOTIFY_DONE;
}
+/* Update the PMTU of exceptions when:
+ * - the new MTU of the first hop becomes smaller than the PMTU
+ * - the old MTU was the same as the PMTU, and it limited discovery of
+ * larger MTUs on the path. With that limit raised, we can now
+ * discover larger MTUs
+ * A special case is locked exceptions, for which the PMTU is smaller
+ * than the minimal accepted PMTU:
+ * - if the new MTU is greater than the PMTU, don't make any change
+ * - otherwise, unlock and set PMTU
+ */
+void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig)
+{
+ struct fnhe_hash_bucket *bucket;
+ int i;
+
+ bucket = rcu_dereference_protected(nhc->nhc_exceptions, 1);
+ if (!bucket)
+ return;
+
+ for (i = 0; i < FNHE_HASH_SIZE; i++) {
+ struct fib_nh_exception *fnhe;
+
+ for (fnhe = rcu_dereference_protected(bucket[i].chain, 1);
+ fnhe;
+ fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) {
+ if (fnhe->fnhe_mtu_locked) {
+ if (new <= fnhe->fnhe_pmtu) {
+ fnhe->fnhe_pmtu = new;
+ fnhe->fnhe_mtu_locked = false;
+ }
+ } else if (new < fnhe->fnhe_pmtu ||
+ orig == fnhe->fnhe_pmtu) {
+ fnhe->fnhe_pmtu = new;
+ }
+ }
+ }
+}
+
+void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
+{
+ struct hlist_head *head = fib_nh_head(dev);
+ struct fib_nh *nh;
+
+ hlist_for_each_entry(nh, head, nh_hash) {
+ DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
+ fib_nhc_update_mtu(&nh->nh_common, dev->mtu, orig_mtu);
+ }
+}
+
/* Event force Flags Description
* NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host
* NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host
* NETDEV_DOWN 1 LINKDOWN|DEAD Last address removed
* NETDEV_UNREGISTER 1 LINKDOWN|DEAD Device removed
+ *
+ * only used when fib_nh is built into fib_info
*/
int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
{
- int ret = 0;
- int scope = RT_SCOPE_NOWHERE;
+ struct hlist_head *head = fib_nh_head(dev);
struct fib_info *prev_fi = NULL;
- unsigned int hash = fib_devindex_hashfn(dev->ifindex);
- struct hlist_head *head = &fib_info_devhash[hash];
+ int scope = RT_SCOPE_NOWHERE;
struct fib_nh *nh;
+ int ret = 0;
if (force)
scope = -1;
@@ -1493,22 +1943,23 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
int dead;
BUG_ON(!fi->fib_nhs);
- if (nh->nh_dev != dev || fi == prev_fi)
+ DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
+ if (fi == prev_fi)
continue;
prev_fi = fi;
dead = 0;
change_nexthops(fi) {
- if (nexthop_nh->nh_flags & RTNH_F_DEAD)
+ if (nexthop_nh->fib_nh_flags & RTNH_F_DEAD)
dead++;
- else if (nexthop_nh->nh_dev == dev &&
- nexthop_nh->nh_scope != scope) {
+ else if (nexthop_nh->fib_nh_dev == dev &&
+ nexthop_nh->fib_nh_scope != scope) {
switch (event) {
case NETDEV_DOWN:
case NETDEV_UNREGISTER:
- nexthop_nh->nh_flags |= RTNH_F_DEAD;
- /* fall through */
+ nexthop_nh->fib_nh_flags |= RTNH_F_DEAD;
+ fallthrough;
case NETDEV_CHANGE:
- nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
+ nexthop_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
break;
}
call_fib_nh_notifiers(nexthop_nh,
@@ -1517,7 +1968,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (event == NETDEV_UNREGISTER &&
- nexthop_nh->nh_dev == dev) {
+ nexthop_nh->fib_nh_dev == dev) {
dead = fi->fib_nhs;
break;
}
@@ -1528,7 +1979,7 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
case NETDEV_DOWN:
case NETDEV_UNREGISTER:
fi->fib_flags |= RTNH_F_DEAD;
- /* fall through */
+ fallthrough;
case NETDEV_CHANGE:
fi->fib_flags |= RTNH_F_LINKDOWN;
break;
@@ -1552,33 +2003,35 @@ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
int order = -1, last_idx = -1;
struct fib_alias *fa, *fa1 = NULL;
u32 last_prio = res->fi->fib_priority;
- u8 last_tos = 0;
+ dscp_t last_dscp = 0;
hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
struct fib_info *next_fi = fa->fa_info;
+ struct fib_nh_common *nhc;
if (fa->fa_slen != slen)
continue;
- if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp))
continue;
if (fa->tb_id != tb->tb_id)
continue;
if (next_fi->fib_priority > last_prio &&
- fa->fa_tos == last_tos) {
- if (last_tos)
+ fa->fa_dscp == last_dscp) {
+ if (last_dscp)
continue;
break;
}
if (next_fi->fib_flags & RTNH_F_DEAD)
continue;
- last_tos = fa->fa_tos;
+ last_dscp = fa->fa_dscp;
last_prio = next_fi->fib_priority;
if (next_fi->fib_scope != res->scope ||
fa->fa_type != RTN_UNICAST)
continue;
- if (!next_fi->fib_nh[0].nh_gw ||
- next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+
+ nhc = fib_info_nhc(next_fi, 0);
+ if (!nhc->nhc_gw_family || nhc->nhc_scope != RT_SCOPE_LINK)
continue;
fib_alias_accessed(fa);
@@ -1620,11 +2073,12 @@ out:
/*
* Dead device goes up. We wake up dead nexthops.
* It takes sense only on multipath routes.
+ *
+ * only used when fib_nh is built into fib_info
*/
-int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
+int fib_sync_up(struct net_device *dev, unsigned char nh_flags)
{
struct fib_info *prev_fi;
- unsigned int hash;
struct hlist_head *head;
struct fib_nh *nh;
int ret;
@@ -1633,15 +2087,14 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
return 0;
if (nh_flags & RTNH_F_DEAD) {
- unsigned int flags = dev_get_flags(dev);
+ unsigned int flags = netif_get_flags(dev);
if (flags & (IFF_RUNNING | IFF_LOWER_UP))
nh_flags |= RTNH_F_LINKDOWN;
}
prev_fi = NULL;
- hash = fib_devindex_hashfn(dev->ifindex);
- head = &fib_info_devhash[hash];
+ head = fib_nh_head(dev);
ret = 0;
hlist_for_each_entry(nh, head, nh_hash) {
@@ -1649,24 +2102,25 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
int alive;
BUG_ON(!fi->fib_nhs);
- if (nh->nh_dev != dev || fi == prev_fi)
+ DEBUG_NET_WARN_ON_ONCE(nh->fib_nh_dev != dev);
+ if (fi == prev_fi)
continue;
prev_fi = fi;
alive = 0;
change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags & nh_flags)) {
+ if (!(nexthop_nh->fib_nh_flags & nh_flags)) {
alive++;
continue;
}
- if (!nexthop_nh->nh_dev ||
- !(nexthop_nh->nh_dev->flags & IFF_UP))
+ if (!nexthop_nh->fib_nh_dev ||
+ !(nexthop_nh->fib_nh_dev->flags & IFF_UP))
continue;
- if (nexthop_nh->nh_dev != dev ||
+ if (nexthop_nh->fib_nh_dev != dev ||
!__in_dev_get_rtnl(dev))
continue;
alive++;
- nexthop_nh->nh_flags &= ~nh_flags;
+ nexthop_nh->fib_nh_flags &= ~nh_flags;
call_fib_nh_notifiers(nexthop_nh, FIB_EVENT_NH_ADD);
} endfor_nexthops(fi)
@@ -1686,43 +2140,74 @@ static bool fib_good_nh(const struct fib_nh *nh)
{
int state = NUD_REACHABLE;
- if (nh->nh_scope == RT_SCOPE_LINK) {
+ if (nh->fib_nh_scope == RT_SCOPE_LINK) {
struct neighbour *n;
- rcu_read_lock_bh();
+ rcu_read_lock();
- n = __ipv4_neigh_lookup_noref(nh->nh_dev,
- (__force u32)nh->nh_gw);
+ if (likely(nh->fib_nh_gw_family == AF_INET))
+ n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
+ (__force u32)nh->fib_nh_gw4);
+ else if (nh->fib_nh_gw_family == AF_INET6)
+ n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev,
+ &nh->fib_nh_gw6);
+ else
+ n = NULL;
if (n)
- state = n->nud_state;
+ state = READ_ONCE(n->nud_state);
- rcu_read_unlock_bh();
+ rcu_read_unlock();
}
return !!(state & NUD_VALID);
}
-void fib_select_multipath(struct fib_result *res, int hash)
+void fib_select_multipath(struct fib_result *res, int hash,
+ const struct flowi4 *fl4)
{
struct fib_info *fi = res->fi;
struct net *net = fi->fib_net;
- bool first = false;
+ bool found = false;
+ bool use_neigh;
+ __be32 saddr;
- for_nexthops(fi) {
- if (net->ipv4.sysctl_fib_multipath_use_neigh) {
- if (!fib_good_nh(nh))
- continue;
- if (!first) {
- res->nh_sel = nhsel;
- first = true;
- }
+ if (unlikely(res->fi->nh)) {
+ nexthop_path_fib_result(res, hash);
+ return;
+ }
+
+ use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh);
+ saddr = fl4 ? fl4->saddr : 0;
+
+ change_nexthops(fi) {
+ int nh_upper_bound;
+
+ /* Nexthops without a carrier are assigned an upper bound of
+ * minus one when "ignore_routes_with_linkdown" is set.
+ */
+ nh_upper_bound = atomic_read(&nexthop_nh->fib_nh_upper_bound);
+ if (nh_upper_bound == -1 ||
+ (use_neigh && !fib_good_nh(nexthop_nh)))
+ continue;
+
+ if (!found) {
+ res->nh_sel = nhsel;
+ res->nhc = &nexthop_nh->nh_common;
+ found = !saddr || nexthop_nh->nh_saddr == saddr;
}
- if (hash > atomic_read(&nh->nh_upper_bound))
+ if (hash > nh_upper_bound)
continue;
- res->nh_sel = nhsel;
- return;
+ if (!saddr || nexthop_nh->nh_saddr == saddr) {
+ res->nh_sel = nhsel;
+ res->nhc = &nexthop_nh->nh_common;
+ return;
+ }
+
+ if (found)
+ return;
+
} endfor_nexthops(fi);
}
#endif
@@ -1730,14 +2215,14 @@ void fib_select_multipath(struct fib_result *res, int hash)
void fib_select_path(struct net *net, struct fib_result *res,
struct flowi4 *fl4, const struct sk_buff *skb)
{
- if (fl4->flowi4_oif && !(fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF))
+ if (fl4->flowi4_oif)
goto check_saddr;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi->fib_nhs > 1) {
+ if (fib_info_num_path(res->fi) > 1) {
int h = fib_multipath_hash(net, fl4, skb, NULL);
- fib_select_multipath(res, h);
+ fib_select_multipath(res, h, fl4);
}
else
#endif
@@ -1747,6 +2232,34 @@ void fib_select_path(struct net *net, struct fib_result *res,
fib_select_default(fl4, res);
check_saddr:
- if (!fl4->saddr)
- fl4->saddr = FIB_RES_PREFSRC(net, *res);
+ if (!fl4->saddr) {
+ struct net_device *l3mdev;
+
+ l3mdev = dev_get_by_index_rcu(net, fl4->flowi4_l3mdev);
+
+ if (!l3mdev ||
+ l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) == l3mdev)
+ fl4->saddr = fib_result_prefsrc(net, res);
+ else
+ fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK);
+ }
+}
+
+int __net_init fib4_semantics_init(struct net *net)
+{
+ unsigned int hash_bits = 4;
+
+ net->ipv4.fib_info_hash = fib_info_hash_alloc(hash_bits);
+ if (!net->ipv4.fib_info_hash)
+ return -ENOMEM;
+
+ net->ipv4.fib_info_hash_bits = hash_bits;
+ net->ipv4.fib_info_cnt = 0;
+
+ return 0;
+}
+
+void __net_exit fib4_semantics_exit(struct net *net)
+{
+ fib_info_hash_free(net->ipv4.fib_info_hash);
}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 5bc0c89e81e4..59a6f0a9638f 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1,8 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*
* Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
* & Swedish University of Agricultural Sciences.
@@ -16,30 +13,21 @@
*
* An experimental study of compression methods for dynamic tries
* Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
- * http://www.csc.kth.se/~snilsson/software/dyntrie2/
- *
+ * https://www.csc.kth.se/~snilsson/software/dyntrie2/
*
* IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
* IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
*
- *
* Code from fib_hash has been reused which includes the following header:
*
- *
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* IPv4 FIB: lookup engine and maintenance routines.
*
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Substantial contributions to this work comes from:
*
* David S. Miller, <davem@davemloft.net>
@@ -47,9 +35,6 @@
* Paul E. McKenney <paulmck@us.ibm.com>
* Patrick McHardy <kaber@trash.net>
*/
-
-#define VERSION "0.409"
-
#include <linux/cache.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
@@ -67,6 +52,7 @@
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
+#include <linux/rcupdate_wait.h>
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
@@ -76,6 +62,7 @@
#include <linux/vmalloc.h>
#include <linux/notifier.h>
#include <net/net_namespace.h>
+#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
@@ -86,19 +73,21 @@
#include <trace/events/fib.h>
#include "fib_lookup.h"
-static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
+static int call_fib_entry_notifier(struct notifier_block *nb,
enum fib_event_type event_type, u32 dst,
- int dst_len, struct fib_alias *fa)
+ int dst_len, struct fib_alias *fa,
+ struct netlink_ext_ack *extack)
{
struct fib_entry_notifier_info info = {
+ .info.extack = extack,
.dst = dst,
.dst_len = dst_len,
.fi = fa->fa_info,
- .tos = fa->fa_tos,
+ .dscp = fa->fa_dscp,
.type = fa->fa_type,
.tb_id = fa->tb_id,
};
- return call_fib4_notifier(nb, net, event_type, &info.info);
+ return call_fib4_notifier(nb, event_type, &info.info);
}
static int call_fib_entry_notifiers(struct net *net,
@@ -111,7 +100,7 @@ static int call_fib_entry_notifiers(struct net *net,
.dst = dst,
.dst_len = dst_len,
.fi = fa->fa_info,
- .tos = fa->fa_tos,
+ .dscp = fa->fa_dscp,
.type = fa->fa_type,
.tb_id = fa->tb_id,
};
@@ -138,7 +127,7 @@ struct key_vector {
/* This list pointer if valid if (pos | bits) == 0 (LEAF) */
struct hlist_head leaf;
/* This array is valid if (pos | bits) > 0 (TNODE) */
- struct key_vector __rcu *tnode[0];
+ DECLARE_FLEX_ARRAY(struct key_vector __rcu *, tnode);
};
};
@@ -183,14 +172,16 @@ struct trie {
};
static struct key_vector *resize(struct trie *t, struct key_vector *tn);
-static size_t tnode_free_size;
+static unsigned int tnode_free_size;
/*
- * synchronize_rcu after call_rcu for that many pages; it should be especially
- * useful before resizing the root node with PREEMPT_NONE configs; the value was
- * obtained experimentally, aiming to avoid visible slowdown.
+ * synchronize_rcu after call_rcu for outstanding dirty memory; it should be
+ * especially useful before resizing the root node with PREEMPT_NONE configs;
+ * the value was obtained experimentally, aiming to avoid visible slowdown.
*/
-static const int sync_pages = 128;
+unsigned int sysctl_fib_sync_mem = 512 * 1024;
+unsigned int sysctl_fib_sync_mem_min = 64 * 1024;
+unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024;
static struct kmem_cache *fn_alias_kmem __ro_after_init;
static struct kmem_cache *trie_leaf_kmem __ro_after_init;
@@ -301,19 +292,11 @@ static const int inflate_threshold = 50;
static const int halve_threshold_root = 15;
static const int inflate_threshold_root = 30;
-static void __alias_free_mem(struct rcu_head *head)
-{
- struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
- kmem_cache_free(fn_alias_kmem, fa);
-}
-
static inline void alias_free_mem_rcu(struct fib_alias *fa)
{
- call_rcu(&fa->rcu, __alias_free_mem);
+ kfree_rcu(fa, rcu);
}
-#define TNODE_KMALLOC_MAX \
- ilog2((PAGE_SIZE - TNODE_SIZE(0)) / sizeof(struct key_vector *))
#define TNODE_VMALLOC_MAX \
ilog2((SIZE_MAX - TNODE_SIZE(0)) / sizeof(struct key_vector *))
@@ -348,12 +331,18 @@ static struct tnode *tnode_alloc(int bits)
static inline void empty_child_inc(struct key_vector *n)
{
- ++tn_info(n)->empty_children ? : ++tn_info(n)->full_children;
+ tn_info(n)->empty_children++;
+
+ if (!tn_info(n)->empty_children)
+ tn_info(n)->full_children++;
}
static inline void empty_child_dec(struct key_vector *n)
{
- tn_info(n)->empty_children-- ? : tn_info(n)->full_children--;
+ if (!tn_info(n)->empty_children)
+ tn_info(n)->full_children--;
+
+ tn_info(n)->empty_children--;
}
static struct key_vector *leaf_new(t_key key, struct fib_alias *fa)
@@ -504,9 +493,9 @@ static void tnode_free(struct key_vector *tn)
tn = container_of(head, struct tnode, rcu)->kv;
}
- if (tnode_free_size >= PAGE_SIZE * sync_pages) {
+ if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) {
tnode_free_size = 0;
- synchronize_rcu();
+ synchronize_net();
}
}
@@ -980,11 +969,14 @@ static struct key_vector *fib_find_node(struct trie *t,
return n;
}
-/* Return the first fib alias matching TOS with
+/* Return the first fib alias matching DSCP with
* priority less than or equal to PRIO.
+ * If 'find_first' is set, return the first matching
+ * fib alias, regardless of DSCP and priority.
*/
static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
- u8 tos, u32 prio, u32 tb_id)
+ dscp_t dscp, u32 prio, u32 tb_id,
+ bool find_first)
{
struct fib_alias *fa;
@@ -992,6 +984,10 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
return NULL;
hlist_for_each_entry(fa, fah, fa_list) {
+ /* Avoid Sparse warning when using dscp_t in inequalities */
+ u8 __fa_dscp = inet_dscp_to_dsfield(fa->fa_dscp);
+ u8 __dscp = inet_dscp_to_dsfield(dscp);
+
if (fa->fa_slen < slen)
continue;
if (fa->fa_slen != slen)
@@ -1000,15 +996,105 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen,
continue;
if (fa->tb_id != tb_id)
break;
- if (fa->fa_tos > tos)
+ if (find_first)
+ return fa;
+ if (__fa_dscp > __dscp)
continue;
- if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos)
+ if (fa->fa_info->fib_priority >= prio || __fa_dscp < __dscp)
return fa;
}
return NULL;
}
+static struct fib_alias *
+fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri)
+{
+ u8 slen = KEYLENGTH - fri->dst_len;
+ struct key_vector *l, *tp;
+ struct fib_table *tb;
+ struct fib_alias *fa;
+ struct trie *t;
+
+ tb = fib_get_table(net, fri->tb_id);
+ if (!tb)
+ return NULL;
+
+ t = (struct trie *)tb->tb_data;
+ l = fib_find_node(t, &tp, be32_to_cpu(fri->dst));
+ if (!l)
+ return NULL;
+
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ if (fa->fa_slen == slen && fa->tb_id == fri->tb_id &&
+ fa->fa_dscp == fri->dscp && fa->fa_info == fri->fi &&
+ fa->fa_type == fri->type)
+ return fa;
+ }
+
+ return NULL;
+}
+
+void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri)
+{
+ u8 fib_notify_on_flag_change;
+ struct fib_alias *fa_match;
+ struct sk_buff *skb;
+ int err;
+
+ rcu_read_lock();
+
+ fa_match = fib_find_matching_alias(net, fri);
+ if (!fa_match)
+ goto out;
+
+ /* These are paired with the WRITE_ONCE() happening in this function.
+ * The reason is that we are only protected by RCU at this point.
+ */
+ if (READ_ONCE(fa_match->offload) == fri->offload &&
+ READ_ONCE(fa_match->trap) == fri->trap &&
+ READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
+ goto out;
+
+ WRITE_ONCE(fa_match->offload, fri->offload);
+ WRITE_ONCE(fa_match->trap, fri->trap);
+
+ fib_notify_on_flag_change = READ_ONCE(net->ipv4.sysctl_fib_notify_on_flag_change);
+
+ /* 2 means send notifications only if offload_failed was changed. */
+ if (fib_notify_on_flag_change == 2 &&
+ READ_ONCE(fa_match->offload_failed) == fri->offload_failed)
+ goto out;
+
+ WRITE_ONCE(fa_match->offload_failed, fri->offload_failed);
+
+ if (!fib_notify_on_flag_change)
+ goto out;
+
+ skb = nlmsg_new(fib_nlmsg_size(fa_match->fa_info), GFP_ATOMIC);
+ if (!skb) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+
+ err = fib_dump_info(skb, 0, 0, RTM_NEWROUTE, fri, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_ROUTE, NULL, GFP_ATOMIC);
+ goto out;
+
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_ROUTE, err);
+out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set);
+
static void trie_rebalance(struct trie *t, struct key_vector *tn)
{
while (!IS_TRIE(tn))
@@ -1065,9 +1151,6 @@ noleaf:
return -ENOMEM;
}
-/* fib notifier for ADD is sent before calling fib_insert_alias with
- * the expectation that the only possible failure ENOMEM
- */
static int fib_insert_alias(struct trie *t, struct key_vector *tp,
struct key_vector *l, struct fib_alias *new,
struct fib_alias *fa, t_key key)
@@ -1104,27 +1187,13 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp,
return 0;
}
-static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack)
-{
- if (plen > KEYLENGTH) {
- NL_SET_ERR_MSG(extack, "Invalid prefix length");
- return false;
- }
-
- if ((plen < KEYLENGTH) && (key << plen)) {
- NL_SET_ERR_MSG(extack,
- "Invalid prefix for given prefix length");
- return false;
- }
-
- return true;
-}
+static void fib_remove_alias(struct trie *t, struct key_vector *tp,
+ struct key_vector *l, struct fib_alias *old);
/* Caller must hold RTNL. */
int fib_table_insert(struct net *net, struct fib_table *tb,
struct fib_config *cfg, struct netlink_ext_ack *extack)
{
- enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
struct trie *t = (struct trie *)tb->tb_data;
struct fib_alias *fa, *new_fa;
struct key_vector *l, *tp;
@@ -1132,15 +1201,12 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
struct fib_info *fi;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
- u8 tos = cfg->fc_tos;
+ dscp_t dscp;
u32 key;
int err;
key = ntohl(cfg->fc_dst);
- if (!fib_valid_key_len(key, plen, extack))
- return -EINVAL;
-
pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
fi = fib_create_info(cfg, extack);
@@ -1149,12 +1215,13 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
goto err;
}
+ dscp = cfg->fc_dscp;
l = fib_find_node(t, &tp, key);
- fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority,
- tb->tb_id) : NULL;
+ fa = l ? fib_find_alias(&l->leaf, slen, dscp, fi->fib_priority,
+ tb->tb_id, false) : NULL;
/* Now fa, if non-NULL, points to the first fib alias
- * with the same keys [prefix,tos,priority], if such key already
+ * with the same keys [prefix,dscp,priority], if such key already
* exists or to the node before which we will insert new one.
*
* If fa is NULL, we will need to allocate a new one and
@@ -1162,7 +1229,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
* of the new alias.
*/
- if (fa && fa->fa_tos == tos &&
+ if (fa && fa->fa_dscp == dscp &&
fa->fa_info->fib_priority == fi->fib_priority) {
struct fib_alias *fa_first, *fa_match;
@@ -1182,7 +1249,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
hlist_for_each_entry_from(fa, fa_list) {
if ((fa->fa_slen != slen) ||
(fa->tb_id != tb->tb_id) ||
- (fa->fa_tos != tos))
+ (fa->fa_dscp != dscp))
break;
if (fa->fa_info->fib_priority != fi->fib_priority)
break;
@@ -1210,7 +1277,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
goto out;
fi_drop = fa->fa_info;
- new_fa->fa_tos = fa->fa_tos;
+ new_fa->fa_dscp = fa->fa_dscp;
new_fa->fa_info = fi;
new_fa->fa_type = cfg->fc_type;
state = fa->fa_state;
@@ -1218,19 +1285,30 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
new_fa->fa_slen = fa->fa_slen;
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
+ new_fa->offload = 0;
+ new_fa->trap = 0;
+ new_fa->offload_failed = 0;
- err = call_fib_entry_notifiers(net,
- FIB_EVENT_ENTRY_REPLACE,
- key, plen, new_fa,
- extack);
- if (err)
- goto out_free_new_fa;
+ hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
+
+ if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0,
+ tb->tb_id, true) == new_fa) {
+ enum fib_event_type fib_event;
+
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+ err = call_fib_entry_notifiers(net, fib_event,
+ key, plen,
+ new_fa, extack);
+ if (err) {
+ hlist_replace_rcu(&new_fa->fa_list,
+ &fa->fa_list);
+ goto out_free_new_fa;
+ }
+ }
rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
tb->tb_id, &cfg->fc_nlinfo, nlflags);
- hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
-
alias_free_mem_rcu(fa);
fib_release_info(fi_drop);
@@ -1246,12 +1324,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
if (fa_match)
goto out;
- if (cfg->fc_nlflags & NLM_F_APPEND) {
- event = FIB_EVENT_ENTRY_APPEND;
+ if (cfg->fc_nlflags & NLM_F_APPEND)
nlflags |= NLM_F_APPEND;
- } else {
+ else
fa = fa_first;
- }
}
err = -ENOENT;
if (!(cfg->fc_nlflags & NLM_F_CREATE))
@@ -1264,21 +1340,38 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
goto out;
new_fa->fa_info = fi;
- new_fa->fa_tos = tos;
+ new_fa->fa_dscp = dscp;
new_fa->fa_type = cfg->fc_type;
new_fa->fa_state = 0;
new_fa->fa_slen = slen;
new_fa->tb_id = tb->tb_id;
new_fa->fa_default = -1;
-
- err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack);
- if (err)
- goto out_free_new_fa;
+ new_fa->offload = 0;
+ new_fa->trap = 0;
+ new_fa->offload_failed = 0;
/* Insert new entry to the list. */
err = fib_insert_alias(t, tp, l, new_fa, fa, key);
if (err)
- goto out_fib_notif;
+ goto out_free_new_fa;
+
+ /* The alias was already inserted, so the node must exist. */
+ l = l ? l : fib_find_node(t, &tp, key);
+ if (WARN_ON_ONCE(!l)) {
+ err = -ENOENT;
+ goto out_free_new_fa;
+ }
+
+ if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) ==
+ new_fa) {
+ enum fib_event_type fib_event;
+
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+ err = call_fib_entry_notifiers(net, fib_event, key, plen,
+ new_fa, extack);
+ if (err)
+ goto out_remove_new_fa;
+ }
if (!plen)
tb->tb_num_default++;
@@ -1289,14 +1382,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
succeeded:
return 0;
-out_fib_notif:
- /* notifier was sent that entry would be added to trie, but
- * the add failed and need to recover. Only failure for
- * fib_insert_alias is ENOMEM.
- */
- NL_SET_ERR_MSG(extack, "Failed to insert route into trie");
- call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key,
- plen, new_fa, NULL);
+out_remove_new_fa:
+ fib_remove_alias(t, tp, l, new_fa);
out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa);
out:
@@ -1312,6 +1399,23 @@ static inline t_key prefix_mismatch(t_key key, struct key_vector *n)
return (key ^ prefix) & (prefix | -prefix);
}
+/* Decide whether nexthop @nhc may be used for the lookup described by @flp.
+ *
+ * Returns false when:
+ *  - @nhc is flagged RTNH_F_DEAD;
+ *  - @nhc is flagged RTNH_F_LINKDOWN, ip_ignore_linkdown() is true for its
+ *    device and @fib_flags lacks FIB_LOOKUP_IGNORE_LINKSTATE;
+ *  - @flp names an output interface that differs from @nhc->nhc_oif.
+ * Returns true otherwise.
+ */
+bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags,
+ const struct flowi4 *flp)
+{
+ if (nhc->nhc_flags & RTNH_F_DEAD)
+ return false;
+
+ if (ip_ignore_linkdown(nhc->nhc_dev) &&
+ nhc->nhc_flags & RTNH_F_LINKDOWN &&
+ !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
+ return false;
+
+ /* A zero flowi4_oif places no constraint on the output interface. */
+ if (flp->flowi4_oif && flp->flowi4_oif != nhc->nhc_oif)
+ return false;
+
+ return true;
+}
+
/* should be called with rcu_read_lock */
int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp,
struct fib_result *res, int fib_flags)
@@ -1444,21 +1548,24 @@ found:
/* Step 3: Process the leaf, if that fails fall back to backtracing */
hlist_for_each_entry_rcu(fa, &n->leaf, fa_list) {
struct fib_info *fi = fa->fa_info;
+ struct fib_nh_common *nhc;
int nhsel, err;
if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
if (index >= (1ul << fa->fa_slen))
continue;
}
- if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
+ if (fa->fa_dscp && !fib_dscp_masked_match(fa->fa_dscp, flp))
continue;
- if (fi->fib_dead)
+ /* Paired with WRITE_ONCE() in fib_release_info() */
+ if (READ_ONCE(fi->fib_dead))
continue;
if (fa->fa_info->fib_scope < flp->flowi4_scope)
continue;
fib_alias_accessed(fa);
err = fib_props[fa->fa_type].error;
if (unlikely(err < 0)) {
+out_reject:
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_passed);
#endif
@@ -1467,42 +1574,48 @@ found:
}
if (fi->fib_flags & RTNH_F_DEAD)
continue;
- for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
- const struct fib_nh *nh = &fi->fib_nh[nhsel];
- struct in_device *in_dev = __in_dev_get_rcu(nh->nh_dev);
- if (nh->nh_flags & RTNH_F_DEAD)
- continue;
- if (in_dev &&
- IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
- nh->nh_flags & RTNH_F_LINKDOWN &&
- !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
- continue;
- if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
- if (flp->flowi4_oif &&
- flp->flowi4_oif != nh->nh_oif)
- continue;
+ if (unlikely(fi->nh)) {
+ if (nexthop_is_blackhole(fi->nh)) {
+ err = fib_props[RTN_BLACKHOLE].error;
+ goto out_reject;
}
+ nhc = nexthop_get_nhc_lookup(fi->nh, fib_flags, flp,
+ &nhsel);
+ if (nhc)
+ goto set_result;
+ goto miss;
+ }
+
+ for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
+ nhc = fib_info_nhc(fi, nhsel);
+
+ if (!fib_lookup_good_nhc(nhc, fib_flags, flp))
+ continue;
+set_result:
if (!(fib_flags & FIB_LOOKUP_NOREF))
refcount_inc(&fi->fib_clntref);
res->prefix = htonl(n->key);
res->prefixlen = KEYLENGTH - fa->fa_slen;
res->nh_sel = nhsel;
+ res->nhc = nhc;
res->type = fa->fa_type;
res->scope = fi->fib_scope;
+ res->dscp = fa->fa_dscp;
res->fi = fi;
res->table = tb;
res->fa_head = &n->leaf;
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_passed);
#endif
- trace_fib_table_lookup(tb->tb_id, flp, nh, err);
+ trace_fib_table_lookup(tb->tb_id, flp, nhc, err);
return err;
}
}
+miss:
#ifdef CONFIG_IP_FIB_TRIE_STATS
this_cpu_inc(stats->semantic_match_miss);
#endif
@@ -1541,6 +1654,36 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp,
node_pull_suffix(tp, fa->fa_slen);
}
+/* Emit the FIB notifier event that corresponds to removing @fa_to_delete
+ * from the alias list @fah of the leaf with key @key.
+ *
+ * Nothing is sent unless @fa_to_delete is the alias fib_find_alias() returns
+ * for its suffix length and table id (called with its last argument true).
+ * When the following list entry has the same suffix length and table id,
+ * that entry is announced with FIB_EVENT_ENTRY_REPLACE; otherwise
+ * @fa_to_delete is announced with FIB_EVENT_ENTRY_DEL.
+ *
+ * The return value of call_fib_entry_notifiers() is not checked here.
+ */
+static void fib_notify_alias_delete(struct net *net, u32 key,
+ struct hlist_head *fah,
+ struct fib_alias *fa_to_delete,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_alias *fa_next, *fa_to_notify;
+ u32 tb_id = fa_to_delete->tb_id;
+ u8 slen = fa_to_delete->fa_slen;
+ enum fib_event_type fib_event;
+
+ /* Do not notify if we do not care about the route. */
+ if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete)
+ return;
+
+ /* Determine if the route should be replaced by the next route in the
+ * list.
+ */
+ fa_next = hlist_entry_safe(fa_to_delete->fa_list.next,
+ struct fib_alias, fa_list);
+ if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) {
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+ fa_to_notify = fa_next;
+ } else {
+ fib_event = FIB_EVENT_ENTRY_DEL;
+ fa_to_notify = fa_to_delete;
+ }
+ /* The notifier takes a prefix length, hence KEYLENGTH - slen. */
+ call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen,
+ fa_to_notify, extack);
+}
+
/* Caller must hold RTNL. */
int fib_table_delete(struct net *net, struct fib_table *tb,
struct fib_config *cfg, struct netlink_ext_ack *extack)
@@ -1550,23 +1693,22 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
struct key_vector *l, *tp;
u8 plen = cfg->fc_dst_len;
u8 slen = KEYLENGTH - plen;
- u8 tos = cfg->fc_tos;
+ dscp_t dscp;
u32 key;
key = ntohl(cfg->fc_dst);
- if (!fib_valid_key_len(key, plen, extack))
- return -EINVAL;
-
l = fib_find_node(t, &tp, key);
if (!l)
return -ESRCH;
- fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id);
+ dscp = cfg->fc_dscp;
+ fa = fib_find_alias(&l->leaf, slen, dscp, 0, tb->tb_id, false);
if (!fa)
return -ESRCH;
- pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
+ pr_debug("Deleting %08x/%d dsfield=0x%02x t=%p\n", key, plen,
+ inet_dscp_to_dsfield(dscp), t);
fa_to_delete = NULL;
hlist_for_each_entry_from(fa, fa_list) {
@@ -1574,7 +1716,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
if ((fa->fa_slen != slen) ||
(fa->tb_id != tb->tb_id) ||
- (fa->fa_tos != tos))
+ (fa->fa_dscp != dscp))
break;
if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
@@ -1584,7 +1726,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
fi->fib_prefsrc == cfg->fc_prefsrc) &&
(!cfg->fc_protocol ||
fi->fib_protocol == cfg->fc_protocol) &&
- fib_nh_match(cfg, fi, extack) == 0 &&
+ fib_nh_match(net, cfg, fi, extack) == 0 &&
fib_metrics_match(cfg, fi)) {
fa_to_delete = fa;
break;
@@ -1594,8 +1736,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
if (!fa_to_delete)
return -ESRCH;
- call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen,
- fa_to_delete, extack);
+ fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0);
@@ -1749,7 +1890,7 @@ struct fib_table *fib_trie_unmerge(struct fib_table *oldtb)
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
struct key_vector *local_l = NULL, *local_tp;
- hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ hlist_for_each_entry(fa, &l->leaf, fa_list) {
struct fib_alias *new_fa;
if (local_tb->tb_id != fa->tb_id)
@@ -1856,9 +1997,10 @@ void fib_table_flush_external(struct fib_table *tb)
}
/* Caller must hold RTNL. */
-int fib_table_flush(struct net *net, struct fib_table *tb)
+int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all)
{
struct trie *t = (struct trie *)tb->tb_data;
+ struct nl_info info = { .nl_net = net };
struct key_vector *pn = t->kv;
unsigned long cindex = 1;
struct hlist_node *tmp;
@@ -1904,16 +2046,26 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
hlist_for_each_entry_safe(fa, tmp, &n->leaf, fa_list) {
struct fib_info *fi = fa->fa_info;
- if (!fi || !(fi->fib_flags & RTNH_F_DEAD) ||
- tb->tb_id != fa->tb_id) {
+ if (!fi || tb->tb_id != fa->tb_id ||
+ (!(fi->fib_flags & RTNH_F_DEAD) &&
+ !fib_props[fa->fa_type].error)) {
slen = fa->fa_slen;
continue;
}
- call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
- n->key,
- KEYLENGTH - fa->fa_slen, fa,
- NULL);
+ /* Do not flush error routes if network namespace is
+ * not being dismantled
+ */
+ if (!flush_all && fib_props[fa->fa_type].error) {
+ slen = fa->fa_slen;
+ continue;
+ }
+
+ fib_notify_alias_delete(net, n->key, &n->leaf, fa,
+ NULL);
+ if (fi->pfsrc_removed)
+ rtmsg_fib(RTM_DELROUTE, htonl(n->key), fa,
+ KEYLENGTH - fa->fa_slen, tb->tb_id, &info, 0);
hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa);
@@ -1933,10 +2085,76 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
return found;
}
-static void fib_leaf_notify(struct net *net, struct key_vector *l,
- struct fib_table *tb, struct notifier_block *nb)
+/* derived from fib_trie_free
+ *
+ * Walk the trie of @tb and, for each alias on a leaf that belongs to this
+ * table (fa->tb_id == tb->tb_id) and whose fib_info has nh_updated set, send
+ * an RTM_NEWROUTE message with NLM_F_REPLACE through rtmsg_fib(), using
+ * @info as the netlink info.
+ *
+ * @net is not referenced by the body.
+ */
+static void __fib_info_notify_update(struct net *net, struct fib_table *tb,
+ struct nl_info *info)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *pn = t->kv;
+ unsigned long cindex = 1;
+ struct fib_alias *fa;
+
+ for (;;) {
+ struct key_vector *n;
+
+ /* cindex hit zero: every child slot of pn has been visited, so
+ * climb to the parent, or stop once pn is the trie root.
+ */
+ if (!(cindex--)) {
+ t_key pkey = pn->key;
+
+ if (IS_TRIE(pn))
+ break;
+
+ pn = node_parent(pn);
+ cindex = get_index(pkey, pn);
+ continue;
+ }
+
+ /* grab the next available node */
+ n = get_child(pn, cindex);
+ if (!n)
+ continue;
+
+ if (IS_TNODE(n)) {
+ /* record pn and cindex for leaf walking */
+ pn = n;
+ cindex = 1ul << n->bits;
+
+ continue;
+ }
+
+ /* n is a leaf here: report every matching alias on it. */
+ hlist_for_each_entry(fa, &n->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (!fi || !fi->nh_updated || fa->tb_id != tb->tb_id)
+ continue;
+
+ rtmsg_fib(RTM_NEWROUTE, htonl(n->key), fa,
+ KEYLENGTH - fa->fa_slen, tb->tb_id,
+ info, NLM_F_REPLACE);
+ }
+ }
+}
+
+/* Run __fib_info_notify_update() on every table chained in the
+ * net->ipv4.fib_table_hash buckets of @net, passing @info through unchanged.
+ *
+ * The bucket walk uses hlist_for_each_entry_rcu() with
+ * lockdep_rtnl_is_held() as its lockdep condition.
+ */
+void fib_info_notify_update(struct net *net, struct nl_info *info)
+{
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb_hlist,
+ lockdep_rtnl_is_held())
+ __fib_info_notify_update(net, tb, info);
+ }
+}
+
+static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb,
+ struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
struct fib_alias *fa;
+ int last_slen = -1;
+ int err;
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
struct fib_info *fi = fa->fa_info;
@@ -1950,39 +2168,57 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l,
if (tb->tb_id != fa->tb_id)
continue;
- call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key,
- KEYLENGTH - fa->fa_slen, fa);
+ if (fa->fa_slen == last_slen)
+ continue;
+
+ last_slen = fa->fa_slen;
+ err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE,
+ l->key, KEYLENGTH - fa->fa_slen,
+ fa, extack);
+ if (err)
+ return err;
}
+ return 0;
}
-static void fib_table_notify(struct net *net, struct fib_table *tb,
- struct notifier_block *nb)
+static int fib_table_notify(struct fib_table *tb, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *l, *tp = t->kv;
t_key key = 0;
+ int err;
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
- fib_leaf_notify(net, l, tb, nb);
+ err = fib_leaf_notify(l, tb, nb, extack);
+ if (err)
+ return err;
key = l->key + 1;
/* stop in case of wrap around */
if (key < l->key)
break;
}
+ return 0;
}
-void fib_notify(struct net *net, struct notifier_block *nb)
+int fib_notify(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
unsigned int h;
+ int err;
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
struct fib_table *tb;
- hlist_for_each_entry_rcu(tb, head, tb_hlist)
- fib_table_notify(net, tb, nb);
+ hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+ err = fib_table_notify(tb, nb, extack);
+ if (err)
+ return err;
+ }
}
+ return 0;
}
static void __trie_free_rcu(struct rcu_head *head)
@@ -2003,48 +2239,94 @@ void fib_free_table(struct fib_table *tb)
}
static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
- struct sk_buff *skb, struct netlink_callback *cb)
+ struct sk_buff *skb, struct netlink_callback *cb,
+ struct fib_dump_filter *filter)
{
+ unsigned int flags = NLM_F_MULTI;
__be32 xkey = htonl(l->key);
+ int i, s_i, i_fa, s_fa, err;
struct fib_alias *fa;
- int i, s_i;
+
+ if (filter->filter_set ||
+ !filter->dump_exceptions || !filter->dump_routes)
+ flags |= NLM_F_DUMP_FILTERED;
s_i = cb->args[4];
+ s_fa = cb->args[5];
i = 0;
/* rcu_read_lock is hold by caller */
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
- int err;
+ struct fib_info *fi = fa->fa_info;
- if (i < s_i) {
- i++;
- continue;
+ if (i < s_i)
+ goto next;
+
+ i_fa = 0;
+
+ if (tb->tb_id != fa->tb_id)
+ goto next;
+
+ if (filter->filter_set) {
+ if (filter->rt_type && fa->fa_type != filter->rt_type)
+ goto next;
+
+ if ((filter->protocol &&
+ fi->fib_protocol != filter->protocol))
+ goto next;
+
+ if (filter->dev &&
+ !fib_info_nh_uses_dev(fi, filter->dev))
+ goto next;
}
- if (tb->tb_id != fa->tb_id) {
- i++;
- continue;
+ if (filter->dump_routes) {
+ if (!s_fa) {
+ struct fib_rt_info fri;
+
+ fri.fi = fi;
+ fri.tb_id = tb->tb_id;
+ fri.dst = xkey;
+ fri.dst_len = KEYLENGTH - fa->fa_slen;
+ fri.dscp = fa->fa_dscp;
+ fri.type = fa->fa_type;
+ fri.offload = READ_ONCE(fa->offload);
+ fri.trap = READ_ONCE(fa->trap);
+ fri.offload_failed = READ_ONCE(fa->offload_failed);
+ err = fib_dump_info(skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWROUTE, &fri, flags);
+ if (err < 0)
+ goto stop;
+ }
+
+ i_fa++;
}
- err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, RTM_NEWROUTE,
- tb->tb_id, fa->fa_type,
- xkey, KEYLENGTH - fa->fa_slen,
- fa->fa_tos, fa->fa_info, NLM_F_MULTI);
- if (err < 0) {
- cb->args[4] = i;
- return err;
+ if (filter->dump_exceptions) {
+ err = fib_dump_info_fnhe(skb, cb, tb->tb_id, fi,
+ &i_fa, s_fa, flags);
+ if (err < 0)
+ goto stop;
}
+
+next:
i++;
}
cb->args[4] = i;
return skb->len;
+
+stop:
+ cb->args[4] = i;
+ cb->args[5] = i_fa;
+ return err;
}
/* rcu_read_lock needs to be hold by caller from readside */
int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
- struct netlink_callback *cb)
+ struct netlink_callback *cb, struct fib_dump_filter *filter)
{
struct trie *t = (struct trie *)tb->tb_data;
struct key_vector *l, *tp = t->kv;
@@ -2054,10 +2336,16 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
int count = cb->args[2];
t_key key = cb->args[3];
+ /* First time here, count and key are both always 0. Count > 0
+ * and key == 0 means the dump has wrapped around and we are done.
+ */
+ if (count && !key)
+ return 0;
+
while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
int err;
- err = fn_trie_dump_leaf(l, tb, skb, cb);
+ err = fn_trie_dump_leaf(l, tb, skb, cb, filter);
if (err < 0) {
cb->args[3] = key;
cb->args[2] = count;
@@ -2078,18 +2366,18 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
cb->args[3] = key;
cb->args[2] = count;
- return skb->len;
+ return 0;
}
void __init fib_trie_init(void)
{
fn_alias_kmem = kmem_cache_create("ip_fib_alias",
sizeof(struct fib_alias),
- 0, SLAB_PANIC, NULL);
+ 0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
LEAF_SIZE,
- 0, SLAB_PANIC, NULL);
+ 0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
}
struct fib_table *fib_trie_table(u32 id, struct fib_table *alias)
@@ -2320,7 +2608,7 @@ static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
static int fib_triestat_seq_show(struct seq_file *seq, void *v)
{
- struct net *net = (struct net *)seq->private;
+ struct net *net = seq->private;
unsigned int h;
seq_printf(seq,
@@ -2328,6 +2616,7 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
" %zd bytes, size of tnode: %zd bytes.\n",
LEAF_SIZE, TNODE_SIZE(0));
+ rcu_read_lock();
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv4.fib_table_hash[h];
struct fib_table *tb;
@@ -2347,7 +2636,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v)
trie_show_usage(seq, t->stats);
#endif
}
+ cond_resched_rcu();
}
+ rcu_read_unlock();
return 0;
}
@@ -2509,8 +2800,9 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v)
fa->fa_info->fib_scope),
rtn_type(buf2, sizeof(buf2),
fa->fa_type));
- if (fa->fa_tos)
- seq_printf(seq, " tos=%d", fa->fa_tos);
+ if (fa->fa_dscp)
+ seq_printf(seq, " tos=%d",
+ inet_dscp_to_dsfield(fa->fa_dscp));
seq_putc(seq, '\n');
}
}
@@ -2621,14 +2913,18 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v)
rcu_read_unlock();
}
-static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
+static unsigned int fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
{
unsigned int flags = 0;
if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
flags = RTF_REJECT;
- if (fi && fi->fib_nh->nh_gw)
- flags |= RTF_GATEWAY;
+ if (fi) {
+ const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
+
+ if (nhc->nhc_gw.ipv4)
+ flags |= RTF_GATEWAY;
+ }
if (mask == htonl(0xFFFFFFFF))
flags |= RTF_HOST;
flags |= RTF_UP;
@@ -2659,7 +2955,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
prefix = htonl(l->key);
hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
- const struct fib_info *fi = fa->fa_info;
+ struct fib_info *fi = fa->fa_info;
__be32 mask = inet_make_mask(KEYLENGTH - fa->fa_slen);
unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
@@ -2672,26 +2968,31 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
seq_setwidth(seq, 127);
- if (fi)
+ if (fi) {
+ struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
+ __be32 gw = 0;
+
+ if (nhc->nhc_gw_family == AF_INET)
+ gw = nhc->nhc_gw.ipv4;
+
seq_printf(seq,
"%s\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u",
- fi->fib_dev ? fi->fib_dev->name : "*",
- prefix,
- fi->fib_nh->nh_gw, flags, 0, 0,
+ "%u\t%08X\t%d\t%u\t%u",
+ nhc->nhc_dev ? nhc->nhc_dev->name : "*",
+ prefix, gw, flags, 0, 0,
fi->fib_priority,
mask,
(fi->fib_advmss ?
fi->fib_advmss + 40 : 0),
fi->fib_window,
fi->fib_rtt >> 3);
- else
+ } else {
seq_printf(seq,
"*\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u",
+ "%u\t%08X\t%d\t%u\t%u",
prefix, 0, flags, 0, 0, 0,
mask, 0, 0, 0);
-
+ }
seq_pad(seq, '\n');
}
diff --git a/net/ipv4/fou_bpf.c b/net/ipv4/fou_bpf.c
new file mode 100644
index 000000000000..54984f3170a8
--- /dev/null
+++ b/net/ipv4/fou_bpf.c
@@ -0,0 +1,117 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable Fou Helpers for TC-BPF hook
+ *
+ * These are called from SCHED_CLS BPF programs. Note that it is
+ * allowed to break compatibility for these functions since the interface they
+ * are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+
+#include <net/dst_metadata.h>
+#include <net/fou.h>
+
+/* UDP port pair exchanged with the FOU kfuncs in this file. Both fields are
+ * big-endian (__be16) and are copied verbatim to/from the encap.sport and
+ * encap.dport members of the skb's struct ip_tunnel_info.
+ */
+struct bpf_fou_encap {
+ __be16 sport;
+ __be16 dport;
+};
+
+/* Values accepted as the @type argument of bpf_skb_set_fou_encap(), which
+ * maps them to TUNNEL_ENCAP_FOU and TUNNEL_ENCAP_GUE respectively.
+ */
+enum bpf_fou_encap_type {
+ FOU_BPF_ENCAP_FOU,
+ FOU_BPF_ENCAP_GUE,
+};
+
+__bpf_kfunc_start_defs();
+
+/* bpf_skb_set_fou_encap - Set FOU encap parameters
+ *
+ * This function allows for using GUE or FOU encapsulation together with an
+ * ipip device in collect-metadata mode.
+ *
+ * It is meant to be used in BPF tc-hooks and after a call to the
+ * bpf_skb_set_tunnel_key helper, responsible for setting IP addresses.
+ *
+ * Parameters:
+ * @skb_ctx Pointer to ctx (__sk_buff) in TC program. Cannot be NULL
+ * @encap Pointer to a `struct bpf_fou_encap` storing UDP src and
+ * dst ports. If sport is set to 0 the kernel will auto-assign a
+ * port. This is similar to using `encap-sport auto`.
+ * Cannot be NULL
+ * @type Encapsulation type for the packet. Their definitions are
+ * specified in `enum bpf_fou_encap_type`
+ *
+ * Return: 0 on success; -EINVAL if @encap is NULL, if the skb carries no
+ * tunnel info, or if that tunnel info does not have IP_TUNNEL_INFO_TX set
+ * in its mode.
+ */
+__bpf_kfunc int bpf_skb_set_fou_encap(struct __sk_buff *skb_ctx,
+ struct bpf_fou_encap *encap, int type)
+{
+ struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+ if (unlikely(!encap))
+ return -EINVAL;
+
+ if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX)))
+ return -EINVAL;
+
+ /* An unrecognised @type is not an error: it selects TUNNEL_ENCAP_NONE
+ * and the function still returns 0.
+ */
+ switch (type) {
+ case FOU_BPF_ENCAP_FOU:
+ info->encap.type = TUNNEL_ENCAP_FOU;
+ break;
+ case FOU_BPF_ENCAP_GUE:
+ info->encap.type = TUNNEL_ENCAP_GUE;
+ break;
+ default:
+ info->encap.type = TUNNEL_ENCAP_NONE;
+ }
+
+ /* The checksum flag is only ever or-ed in here, never cleared. */
+ if (test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags))
+ info->encap.flags |= TUNNEL_ENCAP_FLAG_CSUM;
+
+ info->encap.sport = encap->sport;
+ info->encap.dport = encap->dport;
+
+ return 0;
+}
+
+/* bpf_skb_get_fou_encap - Get FOU encap parameters
+ *
+ * This function allows for reading encap metadata from a packet received
+ * on an ipip device in collect-metadata mode.
+ *
+ * Parameters:
+ * @skb_ctx Pointer to ctx (__sk_buff) in TC program. Cannot be NULL
+ * @encap Pointer to a struct bpf_fou_encap storing UDP source and
+ * destination port. Cannot be NULL
+ *
+ * Return: 0 after copying the ports into @encap; -EINVAL if @encap is NULL
+ * or if the skb carries no tunnel info.
+ */
+__bpf_kfunc int bpf_skb_get_fou_encap(struct __sk_buff *skb_ctx,
+ struct bpf_fou_encap *encap)
+{
+ struct sk_buff *skb = (struct sk_buff *)skb_ctx;
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+ /* Same guard as bpf_skb_set_fou_encap(): @encap is written below, so
+ * reject NULL instead of dereferencing it.
+ */
+ if (unlikely(!encap))
+ return -EINVAL;
+
+ if (unlikely(!info))
+ return -EINVAL;
+
+ encap->sport = info->encap.sport;
+ encap->dport = info->encap.dport;
+
+ return 0;
+}
+
+__bpf_kfunc_end_defs();
+
+/* BTF id set listing the two kfuncs defined in this file. */
+BTF_KFUNCS_START(fou_kfunc_set)
+BTF_ID_FLAGS(func, bpf_skb_set_fou_encap)
+BTF_ID_FLAGS(func, bpf_skb_get_fou_encap)
+BTF_KFUNCS_END(fou_kfunc_set)
+
+/* Descriptor wrapping fou_kfunc_set; register_fou_bpf() hands it to
+ * register_btf_kfunc_id_set().
+ */
+static const struct btf_kfunc_id_set fou_bpf_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &fou_kfunc_set,
+};
+
+/* Register fou_bpf_kfunc_set for BPF_PROG_TYPE_SCHED_CLS programs.
+ *
+ * Return: whatever register_btf_kfunc_id_set() returns.
+ */
+int register_fou_bpf(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS,
+ &fou_bpf_kfunc_set);
+}
diff --git a/net/ipv4/fou.c b/net/ipv4/fou_core.c
index 500a59906b87..3970b6b7ace5 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou_core.c
@@ -1,22 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/socket.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
+#include <linux/icmp.h>
#include <linux/udp.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <net/genetlink.h>
+#include <net/gro.h>
#include <net/gue.h>
#include <net/fou.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/udp.h>
#include <net/udp_tunnel.h>
-#include <net/xfrm.h>
#include <uapi/linux/fou.h>
#include <uapi/linux/genetlink.h>
+#include "fou_nl.h"
+
struct fou {
struct socket *sock;
u8 protocol;
@@ -46,7 +50,7 @@ struct fou_net {
static inline struct fou *fou_from_sock(struct sock *sk)
{
- return sk->sk_user_data;
+ return rcu_dereference_sk_user_data(sk);
}
static int fou_recv_pull(struct sk_buff *skb, struct fou *fou, size_t len)
@@ -120,6 +124,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
struct guehdr *guehdr;
void *data;
u16 doffset = 0;
+ u8 proto_ctype;
if (!fou)
return 1;
@@ -135,7 +140,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
break;
case 1: {
- /* Direct encasulation of IPv4 or IPv6 */
+ /* Direct encapsulation of IPv4 or IPv6 */
int prot;
@@ -169,9 +174,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
/* guehdr may change after pull */
guehdr = (struct guehdr *)&udp_hdr(skb)[1];
- hdrlen = sizeof(struct guehdr) + optlen;
-
- if (guehdr->version != 0 || validate_gue_flags(guehdr, optlen))
+ if (validate_gue_flags(guehdr, optlen))
goto drop;
hdrlen = sizeof(struct guehdr) + optlen;
@@ -211,31 +214,44 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
if (unlikely(guehdr->control))
return gue_control_message(skb, guehdr);
+ proto_ctype = guehdr->proto_ctype;
__skb_pull(skb, sizeof(struct udphdr) + hdrlen);
skb_reset_transport_header(skb);
if (iptunnel_pull_offloads(skb))
goto drop;
- return -guehdr->proto_ctype;
+ return -proto_ctype;
drop:
kfree_skb(skb);
return 0;
}
+/* Look up the offload entry for protocol @proto, taking it from
+ * inet6_offloads when @sk is an AF_INET6 socket and from inet_offloads
+ * otherwise.
+ *
+ * @proto indexes the table directly; no range check is done here.
+ * The callers in this file treat a NULL result as "no offload available".
+ *
+ * NOTE(review): rcu_dereference() is used without rcu_read_lock() in this
+ * helper - presumably every caller already runs inside an RCU read-side
+ * section; confirm.
+ */
+static const struct net_offload *fou_gro_ops(const struct sock *sk,
+ int proto)
+{
+ const struct net_offload __rcu **offloads;
+
+ /* FOU doesn't allow IPv4 on IPv6 sockets. */
+ offloads = sk->sk_family == AF_INET6 ? inet6_offloads : inet_offloads;
+ return rcu_dereference(offloads[proto]);
+}
+
static struct sk_buff *fou_gro_receive(struct sock *sk,
struct list_head *head,
struct sk_buff *skb)
{
- u8 proto = fou_from_sock(sk)->protocol;
- const struct net_offload **offloads;
+ struct fou *fou = fou_from_sock(sk);
const struct net_offload *ops;
struct sk_buff *pp = NULL;
+ if (!fou)
+ goto out;
+
/* We can clear the encap_mark for FOU as we are essentially doing
* one of two possible things. We are either adding an L4 tunnel
- * header to the outer L3 tunnel header, or we are are simply
+ * header to the outer L3 tunnel header, or we are simply
* treating the GRE tunnel header as though it is a UDP protocol
* specific header such as VXLAN or GENEVE.
*/
@@ -244,41 +260,39 @@ static struct sk_buff *fou_gro_receive(struct sock *sk,
/* Flag this frame as already having an outer encap header */
NAPI_GRO_CB(skb)->is_fou = 1;
- rcu_read_lock();
- offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
- ops = rcu_dereference(offloads[proto]);
+ ops = fou_gro_ops(sk, fou->protocol);
if (!ops || !ops->callbacks.gro_receive)
- goto out_unlock;
+ goto out;
pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
-out_unlock:
- rcu_read_unlock();
-
+out:
return pp;
}
static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
int nhoff)
{
+ struct fou *fou = fou_from_sock(sk);
const struct net_offload *ops;
- u8 proto = fou_from_sock(sk)->protocol;
- int err = -ENOSYS;
- const struct net_offload **offloads;
+ int err;
- rcu_read_lock();
- offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
- ops = rcu_dereference(offloads[proto]);
- if (WARN_ON(!ops || !ops->callbacks.gro_complete))
- goto out_unlock;
+ if (!fou) {
+ err = -ENOENT;
+ goto out;
+ }
+
+ ops = fou_gro_ops(sk, fou->protocol);
+ if (WARN_ON(!ops || !ops->callbacks.gro_complete)) {
+ err = -ENOSYS;
+ goto out;
+ }
err = ops->callbacks.gro_complete(skb, nhoff);
skb_set_inner_mac_header(skb, nhoff);
-out_unlock:
- rcu_read_unlock();
-
+out:
return err;
}
@@ -309,7 +323,6 @@ static struct sk_buff *gue_gro_receive(struct sock *sk,
struct list_head *head,
struct sk_buff *skb)
{
- const struct net_offload **offloads;
const struct net_offload *ops;
struct sk_buff *pp = NULL;
struct sk_buff *p;
@@ -324,15 +337,15 @@ static struct sk_buff *gue_gro_receive(struct sock *sk,
skb_gro_remcsum_init(&grc);
+ if (!fou)
+ goto out;
+
off = skb_gro_offset(skb);
len = off + sizeof(*guehdr);
- guehdr = skb_gro_header_fast(skb, off);
- if (skb_gro_header_hard(skb, len)) {
- guehdr = skb_gro_header_slow(skb, len, off);
- if (unlikely(!guehdr))
- goto out;
- }
+ guehdr = skb_gro_header(skb, len, off);
+ if (unlikely(!guehdr))
+ goto out;
switch (guehdr->version) {
case 0:
@@ -356,7 +369,7 @@ static struct sk_buff *gue_gro_receive(struct sock *sk,
optlen = guehdr->hlen << 2;
len += optlen;
- if (skb_gro_header_hard(skb, len)) {
+ if (!skb_gro_may_pull(skb, len)) {
guehdr = skb_gro_header_slow(skb, len, off);
if (unlikely(!guehdr))
goto out;
@@ -427,7 +440,7 @@ next_proto:
/* We can clear the encap_mark for GUE as we are essentially doing
* one of two possible things. We are either adding an L4 tunnel
- * header to the outer L3 tunnel header, or we are are simply
+ * header to the outer L3 tunnel header, or we are simply
* treating the GRE tunnel header as though it is a UDP protocol
* specific header such as VXLAN or GENEVE.
*/
@@ -436,17 +449,13 @@ next_proto:
/* Flag this frame as already having an outer encap header */
NAPI_GRO_CB(skb)->is_fou = 1;
- rcu_read_lock();
- offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
- ops = rcu_dereference(offloads[proto]);
- if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive))
- goto out_unlock;
+ ops = fou_gro_ops(sk, proto);
+ if (!ops || !ops->callbacks.gro_receive)
+ goto out;
pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
flush = 0;
-out_unlock:
- rcu_read_unlock();
out:
skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
@@ -455,7 +464,6 @@ out:
static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
{
- const struct net_offload **offloads;
struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
const struct net_offload *ops;
unsigned int guehlen = 0;
@@ -483,30 +491,57 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
return err;
}
- rcu_read_lock();
- offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
- ops = rcu_dereference(offloads[proto]);
+ ops = fou_gro_ops(sk, proto);
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
- goto out_unlock;
+ goto out;
err = ops->callbacks.gro_complete(skb, nhoff + guehlen);
skb_set_inner_mac_header(skb, nhoff + guehlen);
-out_unlock:
- rcu_read_unlock();
+out:
return err;
}
-static int fou_add_to_port_list(struct net *net, struct fou *fou)
+/* Tell whether the existing tunnel @fou matches the configuration @cfg.
+ *
+ * Compared fields: address family, local UDP port, peer UDP port and bound
+ * interface index, then the local and peer addresses for that family.
+ *
+ * Return: true when all of them are equal, false otherwise. A family other
+ * than AF_INET is compared as IPv6 when CONFIG_IPV6 is enabled; without
+ * CONFIG_IPV6 such an entry never matches.
+ */
+static bool fou_cfg_cmp(struct fou *fou, struct fou_cfg *cfg)
+{
+ struct sock *sk = fou->sock->sk;
+ struct udp_port_cfg *udp_cfg = &cfg->udp_config;
+
+ if (fou->family != udp_cfg->family ||
+ fou->port != udp_cfg->local_udp_port ||
+ sk->sk_dport != udp_cfg->peer_udp_port ||
+ sk->sk_bound_dev_if != udp_cfg->bind_ifindex)
+ return false;
+
+ if (fou->family == AF_INET) {
+ if (sk->sk_rcv_saddr != udp_cfg->local_ip.s_addr ||
+ sk->sk_daddr != udp_cfg->peer_ip.s_addr)
+ return false;
+ else
+ return true;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ /* ipv6_addr_cmp() returns non-zero when the addresses differ. */
+ if (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, &udp_cfg->local_ip6) ||
+ ipv6_addr_cmp(&sk->sk_v6_daddr, &udp_cfg->peer_ip6))
+ return false;
+ else
+ return true;
+#endif
+ }
+
+ /* Only reached for a non-AF_INET family when CONFIG_IPV6 is off. */
+ return false;
+}
+
+static int fou_add_to_port_list(struct net *net, struct fou *fou,
+ struct fou_cfg *cfg)
{
struct fou_net *fn = net_generic(net, fou_net_id);
struct fou *fout;
mutex_lock(&fn->fou_lock);
list_for_each_entry(fout, &fn->fou_list, list) {
- if (fou->port == fout->port &&
- fou->family == fout->family) {
+ if (fou_cfg_cmp(fout, cfg)) {
mutex_unlock(&fn->fou_lock);
return -EALREADY;
}
@@ -584,7 +619,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
sk->sk_allocation = GFP_ATOMIC;
- err = fou_add_to_port_list(net, fou);
+ err = fou_add_to_port_list(net, fou, cfg);
if (err)
goto error;
@@ -604,14 +639,12 @@ error:
static int fou_destroy(struct net *net, struct fou_cfg *cfg)
{
struct fou_net *fn = net_generic(net, fou_net_id);
- __be16 port = cfg->udp_config.local_udp_port;
- u8 family = cfg->udp_config.family;
int err = -EINVAL;
struct fou *fou;
mutex_lock(&fn->fou_lock);
list_for_each_entry(fou, &fn->fou_list, list) {
- if (fou->port == port && fou->family == family) {
+ if (fou_cfg_cmp(fou, cfg)) {
fou_release(fou);
err = 0;
break;
@@ -624,17 +657,14 @@ static int fou_destroy(struct net *net, struct fou_cfg *cfg)
static struct genl_family fou_nl_family;
-static const struct nla_policy fou_nl_policy[FOU_ATTR_MAX + 1] = {
- [FOU_ATTR_PORT] = { .type = NLA_U16, },
- [FOU_ATTR_AF] = { .type = NLA_U8, },
- [FOU_ATTR_IPPROTO] = { .type = NLA_U8, },
- [FOU_ATTR_TYPE] = { .type = NLA_U8, },
- [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, },
-};
-
static int parse_nl_config(struct genl_info *info,
struct fou_cfg *cfg)
{
+ bool has_local = false, has_peer = false;
+ struct nlattr *attr;
+ int ifindex;
+ __be16 port;
+
memset(cfg, 0, sizeof(*cfg));
cfg->udp_config.family = AF_INET;
@@ -656,8 +686,7 @@ static int parse_nl_config(struct genl_info *info,
}
if (info->attrs[FOU_ATTR_PORT]) {
- __be16 port = nla_get_be16(info->attrs[FOU_ATTR_PORT]);
-
+ port = nla_get_be16(info->attrs[FOU_ATTR_PORT]);
cfg->udp_config.local_udp_port = port;
}
@@ -670,10 +699,56 @@ static int parse_nl_config(struct genl_info *info,
if (info->attrs[FOU_ATTR_REMCSUM_NOPARTIAL])
cfg->flags |= FOU_F_REMCSUM_NOPARTIAL;
+ if (cfg->udp_config.family == AF_INET) {
+ if (info->attrs[FOU_ATTR_LOCAL_V4]) {
+ attr = info->attrs[FOU_ATTR_LOCAL_V4];
+ cfg->udp_config.local_ip.s_addr = nla_get_in_addr(attr);
+ has_local = true;
+ }
+
+ if (info->attrs[FOU_ATTR_PEER_V4]) {
+ attr = info->attrs[FOU_ATTR_PEER_V4];
+ cfg->udp_config.peer_ip.s_addr = nla_get_in_addr(attr);
+ has_peer = true;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ if (info->attrs[FOU_ATTR_LOCAL_V6]) {
+ attr = info->attrs[FOU_ATTR_LOCAL_V6];
+ cfg->udp_config.local_ip6 = nla_get_in6_addr(attr);
+ has_local = true;
+ }
+
+ if (info->attrs[FOU_ATTR_PEER_V6]) {
+ attr = info->attrs[FOU_ATTR_PEER_V6];
+ cfg->udp_config.peer_ip6 = nla_get_in6_addr(attr);
+ has_peer = true;
+ }
+#endif
+ }
+
+ if (has_peer) {
+ if (info->attrs[FOU_ATTR_PEER_PORT]) {
+ port = nla_get_be16(info->attrs[FOU_ATTR_PEER_PORT]);
+ cfg->udp_config.peer_udp_port = port;
+ } else {
+ return -EINVAL;
+ }
+ }
+
+ if (info->attrs[FOU_ATTR_IFINDEX]) {
+ if (!has_local)
+ return -EINVAL;
+
+ ifindex = nla_get_s32(info->attrs[FOU_ATTR_IFINDEX]);
+
+ cfg->udp_config.bind_ifindex = ifindex;
+ }
+
return 0;
}
-static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info)
+int fou_nl_add_doit(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = genl_info_net(info);
struct fou_cfg cfg;
@@ -686,7 +761,7 @@ static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info)
return fou_create(net, &cfg, NULL);
}
-static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info)
+int fou_nl_del_doit(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = genl_info_net(info);
struct fou_cfg cfg;
@@ -701,15 +776,37 @@ static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info)
static int fou_fill_info(struct fou *fou, struct sk_buff *msg)
{
+ struct sock *sk = fou->sock->sk;
+
if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) ||
nla_put_be16(msg, FOU_ATTR_PORT, fou->port) ||
+ nla_put_be16(msg, FOU_ATTR_PEER_PORT, sk->sk_dport) ||
nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) ||
- nla_put_u8(msg, FOU_ATTR_TYPE, fou->type))
+ nla_put_u8(msg, FOU_ATTR_TYPE, fou->type) ||
+ nla_put_s32(msg, FOU_ATTR_IFINDEX, sk->sk_bound_dev_if))
return -1;
if (fou->flags & FOU_F_REMCSUM_NOPARTIAL)
if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL))
return -1;
+
+ if (fou->sock->sk->sk_family == AF_INET) {
+ if (nla_put_in_addr(msg, FOU_ATTR_LOCAL_V4, sk->sk_rcv_saddr))
+ return -1;
+
+ if (nla_put_in_addr(msg, FOU_ATTR_PEER_V4, sk->sk_daddr))
+ return -1;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ if (nla_put_in6_addr(msg, FOU_ATTR_LOCAL_V6,
+ &sk->sk_v6_rcv_saddr))
+ return -1;
+
+ if (nla_put_in6_addr(msg, FOU_ATTR_PEER_V6, &sk->sk_v6_daddr))
+ return -1;
+#endif
+ }
+
return 0;
}
@@ -733,7 +830,7 @@ nla_put_failure:
return -EMSGSIZE;
}
-static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
+int fou_nl_get_doit(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = genl_info_net(info);
struct fou_net *fn = net_generic(net, fou_net_id);
@@ -762,7 +859,7 @@ static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
ret = -ESRCH;
mutex_lock(&fn->fou_lock);
list_for_each_entry(fout, &fn->fou_list, list) {
- if (port == fout->port && family == fout->family) {
+ if (fou_cfg_cmp(fout, &cfg)) {
ret = fou_dump_info(fout, info->snd_portid,
info->snd_seq, 0, msg,
info->genlhdr->cmd);
@@ -780,7 +877,7 @@ out_free:
return ret;
}
-static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
+int fou_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
struct fou_net *fn = net_generic(net, fou_net_id);
@@ -803,36 +900,17 @@ static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-static const struct genl_ops fou_nl_ops[] = {
- {
- .cmd = FOU_CMD_ADD,
- .doit = fou_nl_cmd_add_port,
- .policy = fou_nl_policy,
- .flags = GENL_ADMIN_PERM,
- },
- {
- .cmd = FOU_CMD_DEL,
- .doit = fou_nl_cmd_rm_port,
- .policy = fou_nl_policy,
- .flags = GENL_ADMIN_PERM,
- },
- {
- .cmd = FOU_CMD_GET,
- .doit = fou_nl_cmd_get_port,
- .dumpit = fou_nl_dump,
- .policy = fou_nl_policy,
- },
-};
-
static struct genl_family fou_nl_family __ro_after_init = {
.hdrsize = 0,
.name = FOU_GENL_NAME,
.version = FOU_GENL_VERSION,
.maxattr = FOU_ATTR_MAX,
+ .policy = fou_nl_policy,
.netnsok = true,
.module = THIS_MODULE,
- .ops = fou_nl_ops,
- .n_ops = ARRAY_SIZE(fou_nl_ops),
+ .small_ops = fou_nl_ops,
+ .n_small_ops = ARRAY_SIZE(fou_nl_ops),
+ .resv_start_op = FOU_CMD_GET + 1,
};
size_t fou_encap_hlen(struct ip_tunnel_encap *e)
@@ -1003,15 +1081,95 @@ static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
return 0;
}
+static int gue_err_proto_handler(int proto, struct sk_buff *skb, u32 info)
+{
+ const struct net_protocol *ipprot = rcu_dereference(inet_protos[proto]);
+
+ if (ipprot && ipprot->err_handler) {
+ if (!ipprot->err_handler(skb, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static int gue_err(struct sk_buff *skb, u32 info)
+{
+ int transport_offset = skb_transport_offset(skb);
+ struct guehdr *guehdr;
+ size_t len, optlen;
+ int ret;
+
+ len = sizeof(struct udphdr) + sizeof(struct guehdr);
+ if (!pskb_may_pull(skb, transport_offset + len))
+ return -EINVAL;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ switch (guehdr->version) {
+ case 0: /* Full GUE header present */
+ break;
+ case 1: {
+ /* Direct encapsulation of IPv4 or IPv6 */
+ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
+
+ switch (((struct iphdr *)guehdr)->version) {
+ case 4:
+ ret = gue_err_proto_handler(IPPROTO_IPIP, skb, info);
+ goto out;
+#if IS_ENABLED(CONFIG_IPV6)
+ case 6:
+ ret = gue_err_proto_handler(IPPROTO_IPV6, skb, info);
+ goto out;
+#endif
+ default:
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+ default: /* Undefined version */
+ return -EOPNOTSUPP;
+ }
+
+ if (guehdr->control)
+ return -ENOENT;
+
+ optlen = guehdr->hlen << 2;
+
+ if (!pskb_may_pull(skb, transport_offset + len + optlen))
+ return -EINVAL;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+ if (validate_gue_flags(guehdr, optlen))
+ return -EINVAL;
+
+ /* Handling exceptions for direct UDP encapsulation in GUE would lead to
+ * recursion. Besides, this kind of encapsulation can't even be
+ * configured currently. Discard this.
+ */
+ if (guehdr->proto_ctype == IPPROTO_UDP ||
+ guehdr->proto_ctype == IPPROTO_UDPLITE)
+ return -EOPNOTSUPP;
+
+ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr));
+ ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info);
+
+out:
+ skb_set_transport_header(skb, transport_offset);
+ return ret;
+}
+
static const struct ip_tunnel_encap_ops fou_iptun_ops = {
.encap_hlen = fou_encap_hlen,
.build_header = fou_build_header,
+ .err_handler = gue_err,
};
static const struct ip_tunnel_encap_ops gue_iptun_ops = {
.encap_hlen = gue_encap_hlen,
.build_header = gue_build_header,
+ .err_handler = gue_err,
};
static int ip_tunnel_encap_add_fou_ops(void)
@@ -1093,10 +1251,15 @@ static int __init fou_init(void)
if (ret < 0)
goto unregister;
+ ret = register_fou_bpf();
+ if (ret < 0)
+ goto kfunc_failed;
+
ret = ip_tunnel_encap_add_fou_ops();
if (ret == 0)
return 0;
+kfunc_failed:
genl_unregister_family(&fou_nl_family);
unregister:
unregister_pernet_device(&fou_net_ops);
@@ -1115,3 +1278,4 @@ module_init(fou_init);
module_exit(fou_fini);
MODULE_AUTHOR("Tom Herbert <therbert@google.com>");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Foo over UDP");
diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c
new file mode 100644
index 000000000000..7a99639204b1
--- /dev/null
+++ b/net/ipv4/fou_nl.c
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/fou.yaml */
+/* YNL-GEN kernel source */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "fou_nl.h"
+
+#include <uapi/linux/fou.h>
+
+/* Global operation policy for fou */
+const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1] = {
+ [FOU_ATTR_PORT] = { .type = NLA_BE16, },
+ [FOU_ATTR_AF] = { .type = NLA_U8, },
+ [FOU_ATTR_IPPROTO] = { .type = NLA_U8, },
+ [FOU_ATTR_TYPE] = { .type = NLA_U8, },
+ [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, },
+ [FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, },
+ [FOU_ATTR_LOCAL_V6] = NLA_POLICY_EXACT_LEN(16),
+ [FOU_ATTR_PEER_V4] = { .type = NLA_U32, },
+ [FOU_ATTR_PEER_V6] = NLA_POLICY_EXACT_LEN(16),
+ [FOU_ATTR_PEER_PORT] = { .type = NLA_BE16, },
+ [FOU_ATTR_IFINDEX] = { .type = NLA_S32, },
+};
+
+/* Ops table for fou */
+const struct genl_small_ops fou_nl_ops[3] = {
+ {
+ .cmd = FOU_CMD_ADD,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = fou_nl_add_doit,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = FOU_CMD_DEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = fou_nl_del_doit,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = FOU_CMD_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = fou_nl_get_doit,
+ .dumpit = fou_nl_get_dumpit,
+ },
+};
diff --git a/net/ipv4/fou_nl.h b/net/ipv4/fou_nl.h
new file mode 100644
index 000000000000..438342dc8507
--- /dev/null
+++ b/net/ipv4/fou_nl.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/fou.yaml */
+/* YNL-GEN kernel header */
+/* To regenerate run: tools/net/ynl/ynl-regen.sh */
+
+#ifndef _LINUX_FOU_GEN_H
+#define _LINUX_FOU_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/fou.h>
+
+/* Global operation policy for fou */
+extern const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1];
+
+/* Ops table for fou */
+extern const struct genl_small_ops fou_nl_ops[3];
+
+int fou_nl_add_doit(struct sk_buff *skb, struct genl_info *info);
+int fou_nl_del_doit(struct sk_buff *skb, struct genl_info *info);
+int fou_nl_get_doit(struct sk_buff *skb, struct genl_info *info);
+int fou_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+
+#endif /* _LINUX_FOU_GEN_H */
diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c
index b798862b6be5..dafd68f3436a 100644
--- a/net/ipv4/gre_demux.c
+++ b/net/ipv4/gre_demux.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* GRE over IPv4 demultiplexer driver
*
* Authors: Dmitry Kozlov (xeb@mail.ru)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -25,6 +20,7 @@
#include <linux/spinlock.h>
#include <net/protocol.h>
#include <net/gre.h>
+#include <net/erspan.h>
#include <net/icmp.h>
#include <net/route.h>
@@ -60,7 +56,9 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version)
}
EXPORT_SYMBOL_GPL(gre_del_protocol);
-/* Fills in tpi and returns header length to be pulled. */
+/* Fills in tpi and returns header length to be pulled.
+ * Note that caller must use pskb_may_pull() before pulling GRE header.
+ */
int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
bool *csum_err, __be16 proto, int nhs)
{
@@ -75,7 +73,7 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
return -EINVAL;
- tpi->flags = gre_flags_to_tnl_flags(greh->flags);
+ gre_flags_to_tnl_flags(tpi->flags, greh->flags);
hdr_len = gre_calc_hlen(tpi->flags);
if (!pskb_may_pull(skb, nhs + hdr_len))
@@ -86,13 +84,14 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
options = (__be32 *)(greh + 1);
if (greh->flags & GRE_CSUM) {
- if (skb_checksum_simple_validate(skb)) {
+ if (!skb_checksum_simple_validate(skb)) {
+ skb_checksum_try_convert(skb, IPPROTO_GRE,
+ null_compute_pseudo);
+ } else if (csum_err) {
*csum_err = true;
return -EINVAL;
}
- skb_checksum_try_convert(skb, IPPROTO_GRE, 0,
- null_compute_pseudo);
options++;
}
@@ -113,11 +112,33 @@ int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
* - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
*/
if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
+ u8 _val, *val;
+
+ val = skb_header_pointer(skb, nhs + hdr_len,
+ sizeof(_val), &_val);
+ if (!val)
+ return -EINVAL;
tpi->proto = proto;
- if ((*(u8 *)options & 0xF0) != 0x40)
+ if ((*val & 0xF0) != 0x40)
hdr_len += 4;
}
tpi->hdr_len = hdr_len;
+
+ /* ERSPAN ver 1 and 2 protocol sets GRE key field
+ * to 0 and sets the configured key in the
+ * inner erspan header field
+ */
+ if ((greh->protocol == htons(ETH_P_ERSPAN) && hdr_len != 4) ||
+ greh->protocol == htons(ETH_P_ERSPAN2)) {
+ struct erspan_base_hdr *ershdr;
+
+ if (!pskb_may_pull(skb, nhs + hdr_len + sizeof(*ershdr)))
+ return -EINVAL;
+
+ ershdr = (struct erspan_base_hdr *)(skb->data + nhs + hdr_len);
+ tpi->key = cpu_to_be32(get_session_id(ershdr));
+ }
+
return hdr_len;
}
EXPORT_SYMBOL(gre_parse_header);
@@ -150,31 +171,35 @@ drop:
return NET_RX_DROP;
}
-static void gre_err(struct sk_buff *skb, u32 info)
+static int gre_err(struct sk_buff *skb, u32 info)
{
const struct gre_protocol *proto;
const struct iphdr *iph = (const struct iphdr *)skb->data;
u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f;
+ int err = 0;
if (ver >= GREPROTO_MAX)
- return;
+ return -EINVAL;
rcu_read_lock();
proto = rcu_dereference(gre_proto[ver]);
if (proto && proto->err_handler)
proto->err_handler(skb, info);
+ else
+ err = -EPROTONOSUPPORT;
rcu_read_unlock();
+
+ return err;
}
static const struct net_protocol net_gre_protocol = {
.handler = gre_rcv,
.err_handler = gre_err,
- .netns_ok = 1,
};
static int __init gre_init(void)
{
- pr_info("GRE over IPv4 demultiplexor driver\n");
+ pr_info("GRE over IPv4 demultiplexer driver\n");
if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
pr_err("can't add protocol\n");
@@ -192,5 +217,5 @@ module_init(gre_init);
module_exit(gre_exit);
MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
-MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
+MODULE_AUTHOR("D. Kozlov <xeb@mail.ru>");
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 6c63524f598a..5028c72d494a 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPV4 GSO/GRO offload support
* Linux INET implementation
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* GRE GSO support
*/
@@ -14,17 +10,19 @@
#include <linux/init.h>
#include <net/protocol.h>
#include <net/gre.h>
+#include <net/gro.h>
+#include <net/gso.h>
static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+ bool need_csum, offload_csum, gso_partial, need_ipsec;
struct sk_buff *segs = ERR_PTR(-EINVAL);
u16 mac_offset = skb->mac_header;
__be16 protocol = skb->protocol;
u16 mac_len = skb->mac_len;
int gre_offset, outer_hlen;
- bool need_csum, gso_partial;
if (!skb->encapsulation)
goto out;
@@ -48,6 +46,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
skb->encap_hdr_csum = need_csum;
features &= skb->dev->hw_enc_features;
+ if (need_csum)
+ features &= ~NETIF_F_SCTP_CRC;
+
+ need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
+ /* Try to offload checksum if possible */
+ offload_csum = !!(need_csum && !need_ipsec &&
+ (skb->dev->features & NETIF_F_HW_CSUM));
/* segment inner packet. */
segs = skb_mac_gso_segment(skb, features);
@@ -102,7 +107,13 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
}
*(pcsum + 1) = 0;
- *pcsum = gso_make_checksum(skb, 0);
+ if (skb->encapsulation || !offload_csum) {
+ *pcsum = gso_make_checksum(skb, 0);
+ } else {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = sizeof(*greh);
+ }
} while ((skb = skb->next));
out:
return segs;
@@ -127,12 +138,9 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
off = skb_gro_offset(skb);
hlen = off + sizeof(*greh);
- greh = skb_gro_header_fast(skb, off);
- if (skb_gro_header_hard(skb, hlen)) {
- greh = skb_gro_header_slow(skb, hlen, off);
- if (unlikely(!greh))
- goto out;
- }
+ greh = skb_gro_header(skb, hlen, off);
+ if (unlikely(!greh))
+ goto out;
/* Only support version 0 and K (key), C (csum) flags. Note that
* although the support for the S (seq#) flag can be added easily
@@ -153,10 +161,9 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
type = greh->protocol;
- rcu_read_lock();
ptype = gro_find_receive_by_type(type);
if (!ptype)
- goto out_unlock;
+ goto out;
grehlen = GRE_HEADER_SECTION;
@@ -167,18 +174,18 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
grehlen += GRE_HEADER_SECTION;
hlen = off + grehlen;
- if (skb_gro_header_hard(skb, hlen)) {
+ if (!skb_gro_may_pull(skb, hlen)) {
greh = skb_gro_header_slow(skb, hlen, off);
if (unlikely(!greh))
- goto out_unlock;
+ goto out;
}
/* Don't bother verifying checksum if we're going to flush anyway. */
if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush) {
if (skb_gro_checksum_simple_validate(skb))
- goto out_unlock;
+ goto out;
- skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0,
+ skb_gro_checksum_try_convert(skb, IPPROTO_GRE,
null_compute_pseudo);
}
@@ -220,8 +227,6 @@ static struct sk_buff *gre_gro_receive(struct list_head *head,
pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
flush = 0;
-out_unlock:
- rcu_read_unlock();
out:
skb_gro_flush_final(skb, pp, flush);
@@ -246,13 +251,10 @@ static int gre_gro_complete(struct sk_buff *skb, int nhoff)
if (greh->flags & GRE_CSUM)
grehlen += GRE_HEADER_SECTION;
- rcu_read_lock();
ptype = gro_find_complete_by_type(type);
if (ptype)
err = ptype->callbacks.gro_complete(skb, nhoff + grehlen);
- rcu_read_unlock();
-
skb_set_inner_mac_header(skb, nhoff + grehlen);
return err;
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 695979b7ef6d..4abbec2f47ef 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NET3: Implementation of the ICMP protocol layer.
*
* Alan Cox, <alan@lxorguk.ukuu.org.uk>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Some of the function names and the icmp unreach table for this
* module were derived from [icmp.c 1.0.11 06/02/93] by
* Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting.
@@ -59,7 +55,6 @@
*
* - Should use skb_pull() instead of all the manual checking.
* This would also greatly simply some upper layer error handlers. --AK
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -77,6 +72,7 @@
#include <linux/string.h>
#include <linux/netfilter_ipv4.h>
#include <linux/slab.h>
+#include <net/flow.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/route.h>
@@ -97,6 +93,10 @@
#include <net/inet_common.h>
#include <net/ip_fib.h>
#include <net/l3mdev.h>
+#include <net/addrconf.h>
+#include <net/inet_dscp.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/icmp.h>
/*
* Build xmit assembly blocks
@@ -191,30 +191,20 @@ EXPORT_SYMBOL(icmp_err_convert);
*/
struct icmp_control {
- bool (*handler)(struct sk_buff *skb);
+ enum skb_drop_reason (*handler)(struct sk_buff *skb);
short error; /* This ICMP is classed as an error message */
};
static const struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
-/*
- * The ICMP socket(s). This is the most convenient way to flow control
- * our ICMP output as well as maintain a clean interface throughout
- * all layers. All Socketless IP sends will soon be gone.
- *
- * On SMP we have one ICMP socket per-cpu.
- */
-static struct sock *icmp_sk(struct net *net)
-{
- return *this_cpu_ptr(net->ipv4.icmp_sk);
-}
+static DEFINE_PER_CPU(struct sock *, ipv4_icmp_sk);
/* Called with BH disabled */
static inline struct sock *icmp_xmit_lock(struct net *net)
{
struct sock *sk;
- sk = icmp_sk(net);
+ sk = this_cpu_read(ipv4_icmp_sk);
if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
/* This can happen if the output path signals a
@@ -222,64 +212,66 @@ static inline struct sock *icmp_xmit_lock(struct net *net)
*/
return NULL;
}
+ sock_net_set(sk, net);
return sk;
}
static inline void icmp_xmit_unlock(struct sock *sk)
{
+ sock_net_set(sk, &init_net);
spin_unlock(&sk->sk_lock.slock);
}
-int sysctl_icmp_msgs_per_sec __read_mostly = 1000;
-int sysctl_icmp_msgs_burst __read_mostly = 50;
-
-static struct {
- spinlock_t lock;
- u32 credit;
- u32 stamp;
-} icmp_global = {
- .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock),
-};
-
/**
* icmp_global_allow - Are we allowed to send one more ICMP message ?
+ * @net: network namespace
*
- * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec.
+ * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec.
* Returns false if we reached the limit and can not send another packet.
- * Note: called with BH disabled
+ * Works in tandem with icmp_global_consume().
*/
-bool icmp_global_allow(void)
+bool icmp_global_allow(struct net *net)
{
- u32 credit, delta, incr = 0, now = (u32)jiffies;
- bool rc = false;
+ u32 delta, now, oldstamp;
+ int incr, new, old;
- /* Check if token bucket is empty and cannot be refilled
- * without taking the spinlock.
+ /* Note: many cpus could find this condition true.
+ * Then later icmp_global_consume() could consume more credits,
+ * this is an acceptable race.
*/
- if (!icmp_global.credit) {
- delta = min_t(u32, now - icmp_global.stamp, HZ);
- if (delta < HZ / 50)
- return false;
- }
+ if (atomic_read(&net->ipv4.icmp_global_credit) > 0)
+ return true;
- spin_lock(&icmp_global.lock);
- delta = min_t(u32, now - icmp_global.stamp, HZ);
- if (delta >= HZ / 50) {
- incr = sysctl_icmp_msgs_per_sec * delta / HZ ;
- if (incr)
- icmp_global.stamp = now;
- }
- credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst);
- if (credit) {
- credit--;
- rc = true;
+ now = jiffies;
+ oldstamp = READ_ONCE(net->ipv4.icmp_global_stamp);
+ delta = min_t(u32, now - oldstamp, HZ);
+ if (delta < HZ / 50)
+ return false;
+
+ incr = READ_ONCE(net->ipv4.sysctl_icmp_msgs_per_sec) * delta / HZ;
+ if (!incr)
+ return false;
+
+ if (cmpxchg(&net->ipv4.icmp_global_stamp, oldstamp, now) == oldstamp) {
+ old = atomic_read(&net->ipv4.icmp_global_credit);
+ do {
+ new = min(old + incr, READ_ONCE(net->ipv4.sysctl_icmp_msgs_burst));
+ } while (!atomic_try_cmpxchg(&net->ipv4.icmp_global_credit, &old, new));
}
- icmp_global.credit = credit;
- spin_unlock(&icmp_global.lock);
- return rc;
+ return true;
}
EXPORT_SYMBOL(icmp_global_allow);
+void icmp_global_consume(struct net *net)
+{
+ int credits = get_random_u32_below(3);
+
+ /* Note: this might make net->ipv4.icmp_global_credit negative. */
+ if (credits)
+ atomic_sub(credits, &net->ipv4.icmp_global_credit);
+}
+EXPORT_SYMBOL(icmp_global_consume);
+
static bool icmpv4_mask_allow(struct net *net, int type, int code)
{
if (type > NR_ICMP_TYPES)
@@ -290,20 +282,23 @@ static bool icmpv4_mask_allow(struct net *net, int type, int code)
return true;
/* Limit if icmp type is enabled in ratemask. */
- if (!((1 << type) & net->ipv4.sysctl_icmp_ratemask))
+ if (!((1 << type) & READ_ONCE(net->ipv4.sysctl_icmp_ratemask)))
return true;
return false;
}
-static bool icmpv4_global_allow(struct net *net, int type, int code)
+static bool icmpv4_global_allow(struct net *net, int type, int code,
+ bool *apply_ratelimit)
{
if (icmpv4_mask_allow(net, type, code))
return true;
- if (icmp_global_allow())
+ if (icmp_global_allow(net)) {
+ *apply_ratelimit = true;
return true;
-
+ }
+ __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL);
return false;
}
@@ -312,26 +307,33 @@ static bool icmpv4_global_allow(struct net *net, int type, int code)
*/
static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
- struct flowi4 *fl4, int type, int code)
+ struct flowi4 *fl4, int type, int code,
+ bool apply_ratelimit)
{
struct dst_entry *dst = &rt->dst;
struct inet_peer *peer;
+ struct net_device *dev;
bool rc = true;
- int vif;
- if (icmpv4_mask_allow(net, type, code))
- goto out;
+ if (!apply_ratelimit)
+ return true;
/* No rate limit on loopback */
- if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
+ if (dev && (dev->flags & IFF_LOOPBACK))
goto out;
- vif = l3mdev_master_ifindex(dst->dev);
- peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1);
- rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit);
- if (peer)
- inet_putpeer(peer);
+ peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr,
+ l3mdev_master_ifindex_rcu(dev));
+ rc = inet_peer_xrlim_allow(peer,
+ READ_ONCE(net->ipv4.sysctl_icmp_ratelimit));
out:
+ rcu_read_unlock();
+ if (!rc)
+ __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST);
+ else
+ icmp_global_consume(net);
return rc;
}
@@ -351,12 +353,12 @@ void icmp_out_count(struct net *net, unsigned char type)
static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
struct sk_buff *skb)
{
- struct icmp_bxm *icmp_param = (struct icmp_bxm *)from;
+ struct icmp_bxm *icmp_param = from;
__wsum csum;
csum = skb_copy_and_csum_bits(icmp_param->skb,
icmp_param->offset + offset,
- to, len, 0);
+ to, len);
skb->csum = csum_block_add(skb->csum, csum, odd);
if (icmp_pointers[icmp_param->data.icmph.type].error)
@@ -364,14 +366,13 @@ static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
return 0;
}
-static void icmp_push_reply(struct icmp_bxm *icmp_param,
+static void icmp_push_reply(struct sock *sk,
+ struct icmp_bxm *icmp_param,
struct flowi4 *fl4,
struct ipcm_cookie *ipc, struct rtable **rt)
{
- struct sock *sk;
struct sk_buff *skb;
- sk = icmp_sk(dev_net((*rt)->dst.dev));
if (ip_append_data(sk, fl4, icmp_glue_bits, icmp_param,
icmp_param->data_len+icmp_param->head_len,
icmp_param->head_len,
@@ -380,15 +381,15 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
ip_flush_pending_frames(sk);
} else if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
struct icmphdr *icmph = icmp_hdr(skb);
- __wsum csum = 0;
+ __wsum csum;
struct sk_buff *skb1;
+ csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
+ (char *)icmph,
+ icmp_param->head_len);
skb_queue_walk(&sk->sk_write_queue, skb1) {
csum = csum_add(csum, skb1->csum);
}
- csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
- (char *)icmph,
- icmp_param->head_len, csum);
icmph->checksum = csum_fold(csum);
skb->ip_summed = CHECKSUM_NONE;
ip_push_pending_frames(sk, fl4);
@@ -401,12 +402,12 @@ static void icmp_push_reply(struct icmp_bxm *icmp_param,
static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
{
- struct ipcm_cookie ipc;
struct rtable *rt = skb_rtable(skb);
- struct net *net = dev_net(rt->dst.dev);
+ struct net *net = dev_net_rcu(rt->dst.dev);
+ bool apply_ratelimit = false;
+ struct ipcm_cookie ipc;
struct flowi4 fl4;
struct sock *sk;
- struct inet_sock *inet;
__be32 daddr, saddr;
u32 mark = IP4_REPLY_MARK(net, skb->mark);
int type = icmp_param->data.icmph.type;
@@ -415,23 +416,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb))
return;
- /* Needed by both icmp_global_allow and icmp_xmit_lock */
+ /* Needed by both icmpv4_global_allow and icmp_xmit_lock */
local_bh_disable();
- /* global icmp_msgs_per_sec */
- if (!icmpv4_global_allow(net, type, code))
+ /* is global icmp_msgs_per_sec exhausted ? */
+ if (!icmpv4_global_allow(net, type, code, &apply_ratelimit))
goto out_bh_enable;
sk = icmp_xmit_lock(net);
if (!sk)
goto out_bh_enable;
- inet = inet_sk(sk);
icmp_param->data.icmph.checksum = 0;
ipcm_init(&ipc);
- inet->tos = ip_hdr(skb)->tos;
- sk->sk_mark = mark;
+ ipc.tos = ip_hdr(skb)->tos;
+ ipc.sockc.mark = mark;
daddr = ipc.addr = ip_hdr(skb)->saddr;
saddr = fib_compute_spec_dst(skb);
@@ -445,15 +445,15 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.saddr = saddr;
fl4.flowi4_mark = mark;
fl4.flowi4_uid = sock_net_uid(net, NULL);
- fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
+ fl4.flowi4_dscp = ip4h_dscp(ip_hdr(skb));
fl4.flowi4_proto = IPPROTO_ICMP;
fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
- security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
+ security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
goto out_unlock;
- if (icmpv4_xrlim_allow(net, rt, &fl4, type, code))
- icmp_push_reply(icmp_param, &fl4, &ipc, &rt);
+ if (icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit))
+ icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt);
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
@@ -461,14 +461,31 @@ out_bh_enable:
local_bh_enable();
}
-static struct rtable *icmp_route_lookup(struct net *net,
- struct flowi4 *fl4,
+/*
+ * The device used for looking up which routing table to use for sending an ICMP
+ * error is preferably the source whenever it is set, which should ensure the
+ * icmp error can be sent to the source host, else lookup using the routing
+ * table of the destination device, else use the main routing table (index 0).
+ */
+static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ const struct dst_entry *dst;
+
+ if (dev)
+ return dev;
+ dst = skb_dst(skb);
+ return dst ? dst_dev(dst) : NULL;
+}
+
+static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
struct sk_buff *skb_in,
- const struct iphdr *iph,
- __be32 saddr, u8 tos, u32 mark,
- int type, int code,
- struct icmp_bxm *param)
+ const struct iphdr *iph, __be32 saddr,
+ dscp_t dscp, u32 mark, int type,
+ int code, struct icmp_bxm *param)
{
+ struct net_device *route_lookup_dev;
+ struct dst_entry *dst, *dst2;
struct rtable *rt, *rt2;
struct flowi4 fl4_dec;
int err;
@@ -479,13 +496,14 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->saddr = saddr;
fl4->flowi4_mark = mark;
fl4->flowi4_uid = sock_net_uid(net, NULL);
- fl4->flowi4_tos = RT_TOS(tos);
+ fl4->flowi4_dscp = dscp;
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
fl4->fl4_icmp_code = code;
- fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev);
+ route_lookup_dev = icmp_get_route_lookup_dev(skb_in);
+ fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev);
- security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
+ security_skb_classify_flow(skb_in, flowi4_to_flowi_common(fl4));
rt = ip_route_output_key_hash(net, fl4, skb_in);
if (IS_ERR(rt))
return rt;
@@ -493,21 +511,25 @@ static struct rtable *icmp_route_lookup(struct net *net,
/* No need to clone since we're just using its address. */
rt2 = rt;
- rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
- flowi4_to_flowi(fl4), NULL, 0);
- if (!IS_ERR(rt)) {
+ dst = xfrm_lookup(net, &rt->dst,
+ flowi4_to_flowi(fl4), NULL, 0);
+ rt = dst_rtable(dst);
+ if (!IS_ERR(dst)) {
if (rt != rt2)
return rt;
- } else if (PTR_ERR(rt) == -EPERM) {
+ if (inet_addr_type_dev_table(net, route_lookup_dev,
+ fl4->daddr) == RTN_LOCAL)
+ return rt;
+ } else if (PTR_ERR(dst) == -EPERM) {
rt = NULL;
- } else
+ } else {
return rt;
-
- err = xfrm_decode_session_reverse(skb_in, flowi4_to_flowi(&fl4_dec), AF_INET);
+ }
+ err = xfrm_decode_session_reverse(net, skb_in, flowi4_to_flowi(&fl4_dec), AF_INET);
if (err)
goto relookup_failed;
- if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev,
+ if (inet_addr_type_dev_table(net, route_lookup_dev,
fl4_dec.saddr) == RTN_LOCAL) {
rt2 = __ip_route_output_key(net, &fl4_dec);
if (IS_ERR(rt2))
@@ -523,32 +545,33 @@ static struct rtable *icmp_route_lookup(struct net *net,
goto relookup_failed;
}
/* Ugh! */
- orefdst = skb_in->_skb_refdst; /* save old refdst */
- skb_dst_set(skb_in, NULL);
+ orefdst = skb_dstref_steal(skb_in);
err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
- RT_TOS(tos), rt2->dst.dev);
+ dscp, rt2->dst.dev) ? -EINVAL : 0;
dst_release(&rt2->dst);
rt2 = skb_rtable(skb_in);
- skb_in->_skb_refdst = orefdst; /* restore old refdst */
+ /* steal dst entry from skb_in, don't drop refcnt */
+ skb_dstref_steal(skb_in);
+ skb_dstref_restore(skb_in, orefdst);
}
if (err)
goto relookup_failed;
- rt2 = (struct rtable *) xfrm_lookup(net, &rt2->dst,
- flowi4_to_flowi(&fl4_dec), NULL,
- XFRM_LOOKUP_ICMP);
- if (!IS_ERR(rt2)) {
+ dst2 = xfrm_lookup(net, &rt2->dst, flowi4_to_flowi(&fl4_dec), NULL,
+ XFRM_LOOKUP_ICMP);
+ rt2 = dst_rtable(dst2);
+ if (!IS_ERR(dst2)) {
dst_release(&rt->dst);
memcpy(fl4, &fl4_dec, sizeof(*fl4));
rt = rt2;
- } else if (PTR_ERR(rt2) == -EPERM) {
+ } else if (PTR_ERR(dst2) == -EPERM) {
if (rt)
dst_release(&rt->dst);
return rt2;
} else {
- err = PTR_ERR(rt2);
+ err = PTR_ERR(dst2);
goto relookup_failed;
}
return rt;
@@ -559,6 +582,185 @@ relookup_failed:
return ERR_PTR(err);
}
+struct icmp_ext_iio_addr4_subobj {
+ __be16 afi;
+ __be16 reserved;
+ __be32 addr4;
+};
+
+static unsigned int icmp_ext_iio_len(void)
+{
+ return sizeof(struct icmp_extobj_hdr) +
+ /* ifIndex */
+ sizeof(__be32) +
+ /* Interface Address Sub-Object */
+ sizeof(struct icmp_ext_iio_addr4_subobj) +
+ /* Interface Name Sub-Object. Length must be a multiple of 4
+ * bytes.
+ */
+ ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) +
+ /* MTU */
+ sizeof(__be32);
+}
+
+static unsigned int icmp_ext_max_len(u8 ext_objs)
+{
+ unsigned int ext_max_len;
+
+ ext_max_len = sizeof(struct icmp_ext_hdr);
+
+ if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+ ext_max_len += icmp_ext_iio_len();
+
+ return ext_max_len;
+}
+
+static __be32 icmp_ext_iio_addr4_find(const struct net_device *dev)
+{
+ struct in_device *in_dev;
+ struct in_ifaddr *ifa;
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ return 0;
+
+ /* It is unclear from RFC 5837 which IP address should be chosen, but
+ * it makes sense to choose a global unicast address.
+ */
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
+ if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY)
+ continue;
+ if (ifa->ifa_scope != RT_SCOPE_UNIVERSE ||
+ ipv4_is_multicast(ifa->ifa_address))
+ continue;
+ return ifa->ifa_address;
+ }
+
+ return 0;
+}
+
+static void icmp_ext_iio_iif_append(struct net *net, struct sk_buff *skb,
+ int iif)
+{
+ struct icmp_ext_iio_name_subobj *name_subobj;
+ struct icmp_extobj_hdr *objh;
+ struct net_device *dev;
+ __be32 data;
+
+ if (!iif)
+ return;
+
+ /* Add the fields in the order specified by RFC 5837. */
+ objh = skb_put(skb, sizeof(*objh));
+ objh->class_num = ICMP_EXT_OBJ_CLASS_IIO;
+ objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF);
+
+ data = htonl(iif);
+ skb_put_data(skb, &data, sizeof(__be32));
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX;
+
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, iif);
+ if (!dev)
+ goto out;
+
+ data = icmp_ext_iio_addr4_find(dev);
+ if (data) {
+ struct icmp_ext_iio_addr4_subobj *addr4_subobj;
+
+ addr4_subobj = skb_put_zero(skb, sizeof(*addr4_subobj));
+ addr4_subobj->afi = htons(ICMP_AFI_IP);
+ addr4_subobj->addr4 = data;
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR;
+ }
+
+ name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4));
+ name_subobj->len = ALIGN(sizeof(*name_subobj), 4);
+ netdev_copy_name(dev, name_subobj->name);
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME;
+
+ data = htonl(READ_ONCE(dev->mtu));
+ skb_put_data(skb, &data, sizeof(__be32));
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU;
+
+out:
+ rcu_read_unlock();
+ objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh);
+}
+
+static void icmp_ext_objs_append(struct net *net, struct sk_buff *skb,
+ u8 ext_objs, int iif)
+{
+ if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+ icmp_ext_iio_iif_append(net, skb, iif);
+}
+
+static struct sk_buff *
+icmp_ext_append(struct net *net, struct sk_buff *skb_in, struct icmphdr *icmph,
+ unsigned int room, int iif)
+{
+ unsigned int payload_len, ext_max_len, ext_len;
+ struct icmp_ext_hdr *ext_hdr;
+ struct sk_buff *skb;
+ u8 ext_objs;
+ int nhoff;
+
+ switch (icmph->type) {
+ case ICMP_DEST_UNREACH:
+ case ICMP_TIME_EXCEEDED:
+ case ICMP_PARAMETERPROB:
+ break;
+ default:
+ return NULL;
+ }
+
+ ext_objs = READ_ONCE(net->ipv4.sysctl_icmp_errors_extension_mask);
+ if (!ext_objs)
+ return NULL;
+
+ ext_max_len = icmp_ext_max_len(ext_objs);
+ if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room)
+ return NULL;
+
+ skb = skb_clone(skb_in, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ nhoff = skb_network_offset(skb);
+ payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN);
+
+ if (!pskb_network_may_pull(skb, payload_len))
+ goto free_skb;
+
+ if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) ||
+ __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false))
+ goto free_skb;
+
+ if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC))
+ goto free_skb;
+
+ ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr));
+ ext_hdr->version = ICMP_EXT_VERSION_2;
+
+ icmp_ext_objs_append(net, skb, ext_objs, iif);
+
+ /* Do not send an empty extension structure. */
+ ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr;
+ if (ext_len == sizeof(*ext_hdr))
+ goto free_skb;
+
+ ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len);
+ /* The length of the original datagram in 32-bit words (RFC 4884). */
+ icmph->un.reserved[1] = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u32);
+
+ return skb;
+
+free_skb:
+ consume_skb(skb);
+ return NULL;
+}
+
/*
* Send an ICMP message in response to a situation
*
@@ -570,12 +772,15 @@ relookup_failed:
* MUST reply to only the first fragment.
*/
-void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
+ const struct inet_skb_parm *parm)
{
struct iphdr *iph;
int room;
struct icmp_bxm icmp_param;
struct rtable *rt = skb_rtable(skb_in);
+ bool apply_ratelimit = false;
+ struct sk_buff *ext_skb;
struct ipcm_cookie ipc;
struct flowi4 fl4;
__be32 saddr;
@@ -585,8 +790,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
struct sock *sk;
if (!rt)
+ return;
+
+ rcu_read_lock();
+
+ if (rt->dst.dev)
+ net = dev_net_rcu(rt->dst.dev);
+ else if (skb_in->dev)
+ net = dev_net_rcu(skb_in->dev);
+ else
goto out;
- net = dev_net(rt->dst.dev);
/*
* Find the original header. It is expected to be valid, of course.
@@ -651,7 +864,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
}
}
- /* Needed by both icmp_global_allow and icmp_xmit_lock */
+ /* Needed by both icmpv4_global_allow and icmp_xmit_lock */
local_bh_disable();
/* Check global sysctl_icmp_msgs_per_sec ratelimit, unless
@@ -659,7 +872,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
* loopback, then peer ratelimit still work (in icmpv4_xrlim_allow)
*/
if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) &&
- !icmpv4_global_allow(net, type, code))
+ !icmpv4_global_allow(net, type, code, &apply_ratelimit))
goto out_bh_enable;
sk = icmp_xmit_lock(net);
@@ -676,22 +889,25 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
rcu_read_lock();
if (rt_is_input_route(rt) &&
- net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
- dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
+ READ_ONCE(net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr))
+ dev = dev_get_by_index_rcu(net, parm->iif ? parm->iif :
+ inet_iif(skb_in));
if (dev)
- saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
+ saddr = inet_select_addr(dev, iph->saddr,
+ RT_SCOPE_LINK);
else
saddr = 0;
rcu_read_unlock();
}
- tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
+ tos = icmp_pointers[type].error ? (RT_TOS(iph->tos) |
IPTOS_PREC_INTERNETCONTROL) :
- iph->tos;
+ iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark);
- if (ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in))
+ if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in,
+ &parm->opt))
goto out_unlock;
@@ -705,19 +921,20 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
icmp_param.data.icmph.checksum = 0;
icmp_param.skb = skb_in;
icmp_param.offset = skb_network_offset(skb_in);
- inet_sk(sk)->tos = tos;
- sk->sk_mark = mark;
ipcm_init(&ipc);
+ ipc.tos = tos;
ipc.addr = iph->saddr;
ipc.opt = &icmp_param.replyopts.opt;
+ ipc.sockc.mark = mark;
- rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, tos, mark,
- type, code, &icmp_param);
+ rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr,
+ inet_dsfield_to_dscp(tos), mark, type, code,
+ &icmp_param);
if (IS_ERR(rt))
goto out_unlock;
/* peer icmp_ratelimit */
- if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code))
+ if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit))
goto ende;
/* RFC says return as much as we can without exceeding 576 bytes. */
@@ -727,27 +944,87 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
room = 576;
room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
room -= sizeof(struct icmphdr);
+ /* Guard against tiny mtu. We need to include at least one
+ * IP network header for this message to make any sense.
+ */
+ if (room <= (int)sizeof(struct iphdr))
+ goto ende;
+
+ ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room,
+ parm->iif);
+ if (ext_skb)
+ icmp_param.skb = ext_skb;
- icmp_param.data_len = skb_in->len - icmp_param.offset;
+ icmp_param.data_len = icmp_param.skb->len - icmp_param.offset;
if (icmp_param.data_len > room)
icmp_param.data_len = room;
icmp_param.head_len = sizeof(struct icmphdr);
- icmp_push_reply(&icmp_param, &fl4, &ipc, &rt);
+ /* if we don't have a source address at this point, fall back to the
+ * dummy address instead of sending out a packet with a source address
+ * of 0.0.0.0
+ */
+ if (!fl4.saddr)
+ fl4.saddr = htonl(INADDR_DUMMY);
+
+ trace_icmp_send(skb_in, type, code);
+
+ icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
+
+ if (ext_skb)
+ consume_skb(ext_skb);
ende:
ip_rt_put(rt);
out_unlock:
icmp_xmit_unlock(sk);
out_bh_enable:
local_bh_enable();
-out:;
+out:
+ rcu_read_unlock();
}
-EXPORT_SYMBOL(icmp_send);
+EXPORT_SYMBOL(__icmp_send);
+#if IS_ENABLED(CONFIG_NF_NAT)
+#include <net/netfilter/nf_conntrack.h>
+void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
+{
+ struct sk_buff *cloned_skb = NULL;
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ struct inet_skb_parm parm;
+ struct nf_conn *ct;
+ __be32 orig_ip;
+
+ memset(&parm, 0, sizeof(parm));
+ ct = nf_ct_get(skb_in, &ctinfo);
+ if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) {
+ __icmp_send(skb_in, type, code, info, &parm);
+ return;
+ }
+
+ if (skb_shared(skb_in))
+ skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC);
+
+ if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head ||
+ (skb_network_header(skb_in) + sizeof(struct iphdr)) >
+ skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in,
+ skb_network_offset(skb_in) + sizeof(struct iphdr))))
+ goto out;
+
+ orig_ip = ip_hdr(skb_in)->saddr;
+ dir = CTINFO2DIR(ctinfo);
+ ip_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.ip;
+ __icmp_send(skb_in, type, code, info, &parm);
+ ip_hdr(skb_in)->saddr = orig_ip;
+out:
+ consume_skb(cloned_skb);
+}
+EXPORT_SYMBOL(icmp_ndo_send);
+#endif
static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
const struct net_protocol *ipprot;
int protocol = iph->protocol;
@@ -755,7 +1032,7 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
* avoid additional coding at protocol handlers.
*/
if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
- __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
+ __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS);
return;
}
@@ -781,14 +1058,15 @@ static bool icmp_tag_validation(int proto)
* ICMP_PARAMETERPROB.
*/
-static bool icmp_unreach(struct sk_buff *skb)
+static enum skb_drop_reason icmp_unreach(struct sk_buff *skb)
{
+ enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
const struct iphdr *iph;
struct icmphdr *icmph;
struct net *net;
u32 info = 0;
- net = dev_net(skb_dst(skb)->dev);
+ net = skb_dst_dev_net_rcu(skb);
/*
* Incomplete header ?
@@ -802,8 +1080,10 @@ static bool icmp_unreach(struct sk_buff *skb)
icmph = icmp_hdr(skb);
iph = (const struct iphdr *)skb->data;
- if (iph->ihl < 5) /* Mangled header, drop. */
+ if (iph->ihl < 5) { /* Mangled header, drop. */
+ reason = SKB_DROP_REASON_IP_INHDR;
goto out_err;
+ }
switch (icmph->type) {
case ICMP_DEST_UNREACH:
@@ -816,9 +1096,9 @@ static bool icmp_unreach(struct sk_buff *skb)
case ICMP_FRAG_NEEDED:
/* for documentation of the ip_no_pmtu_disc
* values please see
- * Documentation/networking/ip-sysctl.txt
+ * Documentation/networking/ip-sysctl.rst
*/
- switch (net->ipv4.sysctl_ip_no_pmtu_disc) {
+ switch (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) {
default:
net_dbg_ratelimited("%pI4: fragmentation needed and DF set\n",
&iph->daddr);
@@ -828,7 +1108,7 @@ static bool icmp_unreach(struct sk_buff *skb)
case 3:
if (!icmp_tag_validation(iph->protocol))
goto out;
- /* fall through */
+ fallthrough;
case 0:
info = ntohs(icmph->un.frag.mtu);
}
@@ -871,7 +1151,7 @@ static bool icmp_unreach(struct sk_buff *skb)
* get the other vendor to fix their kit.
*/
- if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses &&
+ if (!READ_ONCE(net->ipv4.sysctl_icmp_ignore_bogus_error_responses) &&
inet_addr_type_dev_table(net, skb->dev, iph->daddr) == RTN_BROADCAST) {
net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n",
&ip_hdr(skb)->saddr,
@@ -883,10 +1163,10 @@ static bool icmp_unreach(struct sk_buff *skb)
icmp_socket_deliver(skb, info);
out:
- return true;
+ return reason;
out_err:
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return false;
+ return reason ?: SKB_DROP_REASON_NOT_SPECIFIED;
}
@@ -894,24 +1174,24 @@ out_err:
* Handle ICMP_REDIRECT.
*/
-static bool icmp_redirect(struct sk_buff *skb)
+static enum skb_drop_reason icmp_redirect(struct sk_buff *skb)
{
if (skb->len < sizeof(struct iphdr)) {
- __ICMP_INC_STATS(dev_net(skb->dev), ICMP_MIB_INERRORS);
- return false;
+ __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS);
+ return SKB_DROP_REASON_PKT_TOO_SMALL;
}
if (!pskb_may_pull(skb, sizeof(struct iphdr))) {
/* there aught to be a stat */
- return false;
+ return SKB_DROP_REASON_NOMEM;
}
- icmp_socket_deliver(skb, icmp_hdr(skb)->un.gateway);
- return true;
+ icmp_socket_deliver(skb, ntohl(icmp_hdr(skb)->un.gateway));
+ return SKB_NOT_DROPPED_YET;
}
/*
- * Handle ICMP_ECHO ("ping") requests.
+ * Handle ICMP_ECHO ("ping") and ICMP_EXT_ECHO ("PROBE") requests.
*
* RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
* requests.
@@ -919,28 +1199,150 @@ static bool icmp_redirect(struct sk_buff *skb)
* included in the reply.
* RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring
* echo requests, MUST have default=NOT.
+ * RFC 8335: 8 MUST have a config option to enable/disable ICMP
+ * Extended Echo Functionality, MUST be disabled by default
* See also WRT handling of options once they are done and working.
*/
-static bool icmp_echo(struct sk_buff *skb)
+static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
{
+ struct icmp_bxm icmp_param;
struct net *net;
- net = dev_net(skb_dst(skb)->dev);
- if (!net->ipv4.sysctl_icmp_echo_ignore_all) {
- struct icmp_bxm icmp_param;
+ net = skb_dst_dev_net_rcu(skb);
+ /* should there be an ICMP stat for ignored echos? */
+ if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all))
+ return SKB_NOT_DROPPED_YET;
+
+ icmp_param.data.icmph = *icmp_hdr(skb);
+ icmp_param.skb = skb;
+ icmp_param.offset = 0;
+ icmp_param.data_len = skb->len;
+ icmp_param.head_len = sizeof(struct icmphdr);
- icmp_param.data.icmph = *icmp_hdr(skb);
+ if (icmp_param.data.icmph.type == ICMP_ECHO)
icmp_param.data.icmph.type = ICMP_ECHOREPLY;
- icmp_param.skb = skb;
- icmp_param.offset = 0;
- icmp_param.data_len = skb->len;
- icmp_param.head_len = sizeof(struct icmphdr);
- icmp_reply(&icmp_param, skb);
+ else if (!icmp_build_probe(skb, &icmp_param.data.icmph))
+ return SKB_NOT_DROPPED_YET;
+
+ icmp_reply(&icmp_param, skb);
+ return SKB_NOT_DROPPED_YET;
+}
+
+/* Helper for icmp_echo and icmpv6_echo_reply.
+ * Searches for net_device that matches PROBE interface identifier
+ * and builds PROBE reply message in icmphdr.
+ *
+ * Returns false if PROBE responses are disabled via sysctl
+ */
+
+bool icmp_build_probe(struct sk_buff *skb, struct icmphdr *icmphdr)
+{
+ struct net *net = dev_net_rcu(skb->dev);
+ struct icmp_ext_hdr *ext_hdr, _ext_hdr;
+ struct icmp_ext_echo_iio *iio, _iio;
+ struct inet6_dev *in6_dev;
+ struct in_device *in_dev;
+ struct net_device *dev;
+ char buff[IFNAMSIZ];
+ u16 ident_len;
+ u8 status;
+
+ if (!READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe))
+ return false;
+
+ /* We currently only support probing interfaces on the proxy node
+ * Check to ensure L-bit is set
+ */
+ if (!(ntohs(icmphdr->un.echo.sequence) & 1))
+ return false;
+ /* Clear status bits in reply message */
+ icmphdr->un.echo.sequence &= htons(0xFF00);
+ if (icmphdr->type == ICMP_EXT_ECHO)
+ icmphdr->type = ICMP_EXT_ECHOREPLY;
+ else
+ icmphdr->type = ICMPV6_EXT_ECHO_REPLY;
+ ext_hdr = skb_header_pointer(skb, 0, sizeof(_ext_hdr), &_ext_hdr);
+ /* Size of iio is class_type dependent.
+ * Only check header here and assign length based on ctype in the switch statement
+ */
+ iio = skb_header_pointer(skb, sizeof(_ext_hdr), sizeof(iio->extobj_hdr), &_iio);
+ if (!ext_hdr || !iio)
+ goto send_mal_query;
+ if (ntohs(iio->extobj_hdr.length) <= sizeof(iio->extobj_hdr) ||
+ ntohs(iio->extobj_hdr.length) > sizeof(_iio))
+ goto send_mal_query;
+ ident_len = ntohs(iio->extobj_hdr.length) - sizeof(iio->extobj_hdr);
+ iio = skb_header_pointer(skb, sizeof(_ext_hdr),
+ sizeof(iio->extobj_hdr) + ident_len, &_iio);
+ if (!iio)
+ goto send_mal_query;
+
+ status = 0;
+ dev = NULL;
+ switch (iio->extobj_hdr.class_type) {
+ case ICMP_EXT_ECHO_CTYPE_NAME:
+ if (ident_len >= IFNAMSIZ)
+ goto send_mal_query;
+ memset(buff, 0, sizeof(buff));
+ memcpy(buff, &iio->ident.name, ident_len);
+ dev = dev_get_by_name(net, buff);
+ break;
+ case ICMP_EXT_ECHO_CTYPE_INDEX:
+ if (ident_len != sizeof(iio->ident.ifindex))
+ goto send_mal_query;
+ dev = dev_get_by_index(net, ntohl(iio->ident.ifindex));
+ break;
+ case ICMP_EXT_ECHO_CTYPE_ADDR:
+ if (ident_len < sizeof(iio->ident.addr.ctype3_hdr) ||
+ ident_len != sizeof(iio->ident.addr.ctype3_hdr) +
+ iio->ident.addr.ctype3_hdr.addrlen)
+ goto send_mal_query;
+ switch (ntohs(iio->ident.addr.ctype3_hdr.afi)) {
+ case ICMP_AFI_IP:
+ if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in_addr))
+ goto send_mal_query;
+ dev = ip_dev_find(net, iio->ident.addr.ip_addr.ipv4_addr);
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case ICMP_AFI_IP6:
+ if (iio->ident.addr.ctype3_hdr.addrlen != sizeof(struct in6_addr))
+ goto send_mal_query;
+ dev = ipv6_stub->ipv6_dev_find(net, &iio->ident.addr.ip_addr.ipv6_addr, dev);
+ dev_hold(dev);
+ break;
+#endif
+ default:
+ goto send_mal_query;
+ }
+ break;
+ default:
+ goto send_mal_query;
}
- /* should there be an ICMP stat for ignored echos? */
+ if (!dev) {
+ icmphdr->code = ICMP_EXT_CODE_NO_IF;
+ return true;
+ }
+ /* Fill bits in reply message */
+ if (dev->flags & IFF_UP)
+ status |= ICMP_EXT_ECHOREPLY_ACTIVE;
+
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev && rcu_access_pointer(in_dev->ifa_list))
+ status |= ICMP_EXT_ECHOREPLY_IPV4;
+
+ in6_dev = __in6_dev_get(dev);
+ if (in6_dev && !list_empty(&in6_dev->addr_list))
+ status |= ICMP_EXT_ECHOREPLY_IPV6;
+
+ dev_put(dev);
+ icmphdr->un.echo.sequence |= htons(status);
+ return true;
+send_mal_query:
+ icmphdr->code = ICMP_EXT_CODE_MAL_QUERY;
return true;
}
+EXPORT_SYMBOL_GPL(icmp_build_probe);
/*
* Handle ICMP Timestamp requests.
@@ -949,7 +1351,7 @@ static bool icmp_echo(struct sk_buff *skb)
* MUST be accurate to a few minutes.
* MUST be updated at least at 15Hz.
*/
-static bool icmp_timestamp(struct sk_buff *skb)
+static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb)
{
struct icmp_bxm icmp_param;
/*
@@ -974,17 +1376,17 @@ static bool icmp_timestamp(struct sk_buff *skb)
icmp_param.data_len = 0;
icmp_param.head_len = sizeof(struct icmphdr) + 12;
icmp_reply(&icmp_param, skb);
- return true;
+ return SKB_NOT_DROPPED_YET;
out_err:
- __ICMP_INC_STATS(dev_net(skb_dst(skb)->dev), ICMP_MIB_INERRORS);
- return false;
+ __ICMP_INC_STATS(skb_dst_dev_net_rcu(skb), ICMP_MIB_INERRORS);
+ return SKB_DROP_REASON_PKT_TOO_SMALL;
}
-static bool icmp_discard(struct sk_buff *skb)
+static enum skb_drop_reason icmp_discard(struct sk_buff *skb)
{
/* pretend it was a success */
- return true;
+ return SKB_NOT_DROPPED_YET;
}
/*
@@ -992,18 +1394,20 @@ static bool icmp_discard(struct sk_buff *skb)
*/
int icmp_rcv(struct sk_buff *skb)
{
- struct icmphdr *icmph;
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct rtable *rt = skb_rtable(skb);
- struct net *net = dev_net(rt->dst.dev);
- bool success;
+ struct net *net = dev_net_rcu(rt->dst.dev);
+ struct icmphdr *icmph;
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
struct sec_path *sp = skb_sec_path(skb);
int nh;
if (!(sp && sp->xvec[sp->len - 1]->props.flags &
- XFRM_STATE_ICMP))
+ XFRM_STATE_ICMP)) {
+ reason = SKB_DROP_REASON_XFRM_POLICY;
goto drop;
+ }
if (!pskb_may_pull(skb, sizeof(*icmph) + sizeof(struct iphdr)))
goto drop;
@@ -1011,8 +1415,11 @@ int icmp_rcv(struct sk_buff *skb)
nh = skb_network_offset(skb);
skb_set_network_header(skb, sizeof(*icmph));
- if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN, skb))
+ if (!xfrm4_policy_check_reverse(NULL, XFRM_POLICY_IN,
+ skb)) {
+ reason = SKB_DROP_REASON_XFRM_POLICY;
goto drop;
+ }
skb_set_network_header(skb, nh);
}
@@ -1028,15 +1435,15 @@ int icmp_rcv(struct sk_buff *skb)
icmph = icmp_hdr(skb);
ICMPMSGIN_INC_STATS(net, icmph->type);
- /*
- * 18 is the highest 'known' ICMP type. Anything else is a mystery
- *
- * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
- * discarded.
- */
- if (icmph->type > NR_ICMP_TYPES)
- goto error;
+ /* Check for ICMP Extended Echo (PROBE) messages */
+ if (icmph->type == ICMP_EXT_ECHO) {
+ /* We can't use icmp_pointers[].handler() because it is an array of
+ * size NR_ICMP_TYPES + 1 (19 elements) and PROBE has code 42.
+ */
+ reason = icmp_echo(skb);
+ goto reason_check;
+ }
/*
* Parse the ICMP message
@@ -1051,42 +1458,121 @@ int icmp_rcv(struct sk_buff *skb)
*/
if ((icmph->type == ICMP_ECHO ||
icmph->type == ICMP_TIMESTAMP) &&
- net->ipv4.sysctl_icmp_echo_ignore_broadcasts) {
+ READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_broadcasts)) {
+ reason = SKB_DROP_REASON_INVALID_PROTO;
goto error;
}
if (icmph->type != ICMP_ECHO &&
icmph->type != ICMP_TIMESTAMP &&
icmph->type != ICMP_ADDRESS &&
icmph->type != ICMP_ADDRESSREPLY) {
+ reason = SKB_DROP_REASON_INVALID_PROTO;
goto error;
}
}
- success = icmp_pointers[icmph->type].handler(skb);
+ if (icmph->type == ICMP_EXT_ECHOREPLY ||
+ icmph->type == ICMP_ECHOREPLY) {
+ reason = ping_rcv(skb);
+ return reason ? NET_RX_DROP : NET_RX_SUCCESS;
+ }
- if (success) {
+ /*
+ * 18 is the highest 'known' ICMP type. Anything else is a mystery
+ *
+ * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently
+ * discarded.
+ */
+ if (icmph->type > NR_ICMP_TYPES) {
+ reason = SKB_DROP_REASON_UNHANDLED_PROTO;
+ goto error;
+ }
+
+ reason = icmp_pointers[icmph->type].handler(skb);
+reason_check:
+ if (!reason) {
consume_skb(skb);
return NET_RX_SUCCESS;
}
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return NET_RX_DROP;
csum_error:
+ reason = SKB_DROP_REASON_ICMP_CSUM;
__ICMP_INC_STATS(net, ICMP_MIB_CSUMERRORS);
error:
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
goto drop;
}
-void icmp_err(struct sk_buff *skb, u32 info)
+static bool ip_icmp_error_rfc4884_validate(const struct sk_buff *skb, int off)
+{
+ struct icmp_extobj_hdr *objh, _objh;
+ struct icmp_ext_hdr *exth, _exth;
+ u16 olen;
+
+ exth = skb_header_pointer(skb, off, sizeof(_exth), &_exth);
+ if (!exth)
+ return false;
+ if (exth->version != 2)
+ return true;
+
+ if (exth->checksum &&
+ csum_fold(skb_checksum(skb, off, skb->len - off, 0)))
+ return false;
+
+ off += sizeof(_exth);
+ while (off < skb->len) {
+ objh = skb_header_pointer(skb, off, sizeof(_objh), &_objh);
+ if (!objh)
+ return false;
+
+ olen = ntohs(objh->length);
+ if (olen < sizeof(_objh))
+ return false;
+
+ off += olen;
+ if (off > skb->len)
+ return false;
+ }
+
+ return true;
+}
+
+void ip_icmp_error_rfc4884(const struct sk_buff *skb,
+ struct sock_ee_data_rfc4884 *out,
+ int thlen, int off)
+{
+ int hlen;
+
+ /* original datagram headers: end of icmph to payload (skb->data) */
+ hlen = -skb_transport_offset(skb) - thlen;
+
+ /* per rfc 4884: minimal datagram length of 128 bytes */
+ if (off < 128 || off < hlen)
+ return;
+
+ /* kernel has stripped headers: return payload offset in bytes */
+ off -= hlen;
+ if (off + sizeof(struct icmp_ext_hdr) > skb->len)
+ return;
+
+ out->len = off;
+
+ if (!ip_icmp_error_rfc4884_validate(skb, off))
+ out->flags |= SO_EE_RFC4884_FLAG_INVALID;
+}
+EXPORT_SYMBOL_GPL(ip_icmp_error_rfc4884);
+
+int icmp_err(struct sk_buff *skb, u32 info)
{
struct iphdr *iph = (struct iphdr *)skb->data;
int offset = iph->ihl<<2;
struct icmphdr *icmph = (struct icmphdr *)(skb->data + offset);
+ struct net *net = dev_net_rcu(skb->dev);
int type = icmp_hdr(skb)->type;
int code = icmp_hdr(skb)->code;
- struct net *net = dev_net(skb->dev);
/*
* Use ping_err to handle all icmp errors except those
@@ -1094,13 +1580,15 @@ void icmp_err(struct sk_buff *skb, u32 info)
*/
if (icmph->type != ICMP_ECHOREPLY) {
ping_err(skb, offset, info);
- return;
+ return 0;
}
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
- ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_ICMP, 0);
+ ipv4_update_pmtu(skb, net, info, 0, IPPROTO_ICMP);
else if (type == ICMP_REDIRECT)
- ipv4_redirect(skb, net, 0, 0, IPPROTO_ICMP, 0);
+ ipv4_redirect(skb, net, 0, IPPROTO_ICMP);
+
+ return 0;
}
/*
@@ -1177,48 +1665,11 @@ static const struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
},
};
-static void __net_exit icmp_sk_exit(struct net *net)
-{
- int i;
-
- for_each_possible_cpu(i)
- inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
- free_percpu(net->ipv4.icmp_sk);
- net->ipv4.icmp_sk = NULL;
-}
-
static int __net_init icmp_sk_init(struct net *net)
{
- int i, err;
-
- net->ipv4.icmp_sk = alloc_percpu(struct sock *);
- if (!net->ipv4.icmp_sk)
- return -ENOMEM;
-
- for_each_possible_cpu(i) {
- struct sock *sk;
-
- err = inet_ctl_sock_create(&sk, PF_INET,
- SOCK_RAW, IPPROTO_ICMP, net);
- if (err < 0)
- goto fail;
-
- *per_cpu_ptr(net->ipv4.icmp_sk, i) = sk;
-
- /* Enough space for 2 64K ICMP packets, including
- * sk_buff/skb_shared_info struct overhead.
- */
- sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
-
- /*
- * Speedup sock_wfree()
- */
- sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
- inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
- }
-
/* Control parameters for ECHO replies. */
net->ipv4.sysctl_icmp_echo_ignore_all = 0;
+ net->ipv4.sysctl_icmp_echo_enable_probe = 0;
net->ipv4.sysctl_icmp_echo_ignore_broadcasts = 1;
/* Control parameter - ignore bogus broadcast responses? */
@@ -1239,22 +1690,41 @@ static int __net_init icmp_sk_init(struct net *net)
net->ipv4.sysctl_icmp_ratelimit = 1 * HZ;
net->ipv4.sysctl_icmp_ratemask = 0x1818;
net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0;
+ net->ipv4.sysctl_icmp_errors_extension_mask = 0;
+ net->ipv4.sysctl_icmp_msgs_per_sec = 1000;
+ net->ipv4.sysctl_icmp_msgs_burst = 50;
return 0;
-
-fail:
- for_each_possible_cpu(i)
- inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.icmp_sk, i));
- free_percpu(net->ipv4.icmp_sk);
- return err;
}
static struct pernet_operations __net_initdata icmp_sk_ops = {
.init = icmp_sk_init,
- .exit = icmp_sk_exit,
};
int __init icmp_init(void)
{
+ int err, i;
+
+ for_each_possible_cpu(i) {
+ struct sock *sk;
+
+ err = inet_ctl_sock_create(&sk, PF_INET,
+ SOCK_RAW, IPPROTO_ICMP, &init_net);
+ if (err < 0)
+ return err;
+
+ per_cpu(ipv4_icmp_sk, i) = sk;
+
+ /* Enough space for 2 64K ICMP packets, including
+ * sk_buff/skb_shared_info struct overhead.
+ */
+ sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
+
+ /*
+ * Speedup sock_wfree()
+ */
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
+ }
return register_pernet_subsys(&icmp_sk_ops);
}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 4da39446da2d..7182f1419c2a 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux NET3: Internet Group Management Protocol [IGMP]
*
@@ -11,11 +12,6 @@
* Authors:
* Alan Cox <alan@lxorguk.ukuu.org.uk>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Fixes:
*
* Alan Cox : Added lots of __inline__ to optimise
@@ -85,6 +81,7 @@
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
+#include "igmp_internal.h"
#include <linux/if_arp.h>
#include <linux/rtnetlink.h>
#include <linux/times.h>
@@ -92,6 +89,8 @@
#include <linux/byteorder/generic.h>
#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/addrconf.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -111,13 +110,8 @@
#ifdef CONFIG_IP_MULTICAST
/* Parameter names and values are taken from igmp-v2-06 draft */
-#define IGMP_V1_ROUTER_PRESENT_TIMEOUT (400*HZ)
-#define IGMP_V2_ROUTER_PRESENT_TIMEOUT (400*HZ)
-#define IGMP_V2_UNSOLICITED_REPORT_INTERVAL (10*HZ)
-#define IGMP_V3_UNSOLICITED_REPORT_INTERVAL (1*HZ)
+#define IGMP_QUERY_INTERVAL (125*HZ)
#define IGMP_QUERY_RESPONSE_INTERVAL (10*HZ)
-#define IGMP_QUERY_ROBUSTNESS_VARIABLE 2
-
#define IGMP_INITIAL_REPORT_DELAY (1)
@@ -129,12 +123,12 @@
*/
#define IGMP_V1_SEEN(in_dev) \
- (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \
+ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 1 || \
IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 1 || \
((in_dev)->mr_v1_seen && \
time_before(jiffies, (in_dev)->mr_v1_seen)))
#define IGMP_V2_SEEN(in_dev) \
- (IPV4_DEVCONF_ALL(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \
+ (IPV4_DEVCONF_ALL_RO(dev_net(in_dev->dev), FORCE_IGMP_VERSION) == 2 || \
IN_DEV_CONF_GET((in_dev), FORCE_IGMP_VERSION) == 2 || \
((in_dev)->mr_v2_seen && \
time_before(jiffies, (in_dev)->mr_v2_seen)))
@@ -162,7 +156,8 @@ static int unsolicited_report_interval(struct in_device *in_dev)
return interval_jiffies;
}
-static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im,
+ gfp_t gfp);
static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im);
static void igmpv3_clear_delrec(struct in_device *in_dev);
static int sf_setstate(struct ip_mc_list *pmc);
@@ -190,6 +185,17 @@ static void ip_ma_put(struct ip_mc_list *im)
pmc != NULL; \
pmc = rtnl_dereference(pmc->next_rcu))
+static void ip_sf_list_clear_all(struct ip_sf_list *psf)
+{ /* Free every node of the source-filter list that starts at psf,
+   * following the sf_next links. A NULL psf is a no-op because the
+   * loop body never runs.
+   */
+ struct ip_sf_list *next;
+
+ while (psf) {
+ next = psf->sf_next; /* save the successor before psf is freed */
+ kfree(psf);
+ psf = next;
+ }
+}
+
#ifdef CONFIG_IP_MULTICAST
/*
@@ -199,7 +205,7 @@ static void ip_ma_put(struct ip_mc_list *im)
static void igmp_stop_timer(struct ip_mc_list *im)
{
spin_lock_bh(&im->lock);
- if (del_timer(&im->timer))
+ if (timer_delete(&im->timer))
refcount_dec(&im->refcnt);
im->tm_running = 0;
im->reporter = 0;
@@ -210,16 +216,18 @@ static void igmp_stop_timer(struct ip_mc_list *im)
/* It must be called with locked im->lock */
static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
{
- int tv = prandom_u32() % max_delay;
+ int tv = get_random_u32_below(max_delay);
im->tm_running = 1;
- if (!mod_timer(&im->timer, jiffies+tv+2))
- refcount_inc(&im->refcnt);
+ if (refcount_inc_not_zero(&im->refcnt)) {
+ if (mod_timer(&im->timer, jiffies + tv + 2))
+ ip_ma_put(im);
+ }
}
static void igmp_gq_start_timer(struct in_device *in_dev)
{
- int tv = prandom_u32() % in_dev->mr_maxdelay;
+ int tv = get_random_u32_below(in_dev->mr_maxdelay);
unsigned long exp = jiffies + tv + 2;
if (in_dev->mr_gq_running &&
@@ -233,7 +241,7 @@ static void igmp_gq_start_timer(struct in_device *in_dev)
static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
{
- int tv = prandom_u32() % delay;
+ int tv = get_random_u32_below(delay);
if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))
in_dev_hold(in_dev);
@@ -243,7 +251,7 @@ static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
{
spin_lock_bh(&im->lock);
im->unsolicit_count = 0;
- if (del_timer(&im->timer)) {
+ if (timer_delete(&im->timer)) {
if ((long)(im->timer.expires-jiffies) < max_delay) {
add_timer(&im->timer);
im->tm_running = 1;
@@ -327,14 +335,15 @@ static __be32 igmpv3_get_srcaddr(struct net_device *dev,
const struct flowi4 *fl4)
{
struct in_device *in_dev = __in_dev_get_rcu(dev);
+ const struct in_ifaddr *ifa;
if (!in_dev)
return htonl(INADDR_ANY);
- for_ifa(in_dev) {
+ in_dev_for_each_ifa_rcu(ifa, in_dev) {
if (fl4->saddr == ifa->ifa_local)
return fl4->saddr;
- } endfor_ifa(in_dev);
+ }
return htonl(INADDR_ANY);
}
@@ -349,8 +358,9 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
struct flowi4 fl4;
int hlen = LL_RESERVED_SPACE(dev);
int tlen = dev->needed_tailroom;
- unsigned int size = mtu;
+ unsigned int size;
+ size = min(mtu, IP_MAX_MTU);
while (1) {
skb = alloc_skb(size + hlen + tlen,
GFP_ATOMIC | __GFP_NOWARN);
@@ -417,7 +427,7 @@ static int igmpv3_sendpack(struct sk_buff *skb)
pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
- return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
+ return ip_local_out(skb_dst_dev_net(skb), skb->sk, skb);
}
static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
@@ -463,7 +473,8 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
if (pmc->multiaddr == IGMP_ALL_HOSTS)
return skb;
- if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
+ if (ipv4_is_local_multicast(pmc->multiaddr) &&
+ !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
return skb;
mtu = READ_ONCE(dev->mtu);
@@ -589,7 +600,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
if (pmc->multiaddr == IGMP_ALL_HOSTS)
continue;
if (ipv4_is_local_multicast(pmc->multiaddr) &&
- !net->ipv4.sysctl_igmp_llm_reports)
+ !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
continue;
spin_lock_bh(&pmc->lock);
if (pmc->sfcount[MCAST_EXCLUDE])
@@ -635,6 +646,13 @@ static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
}
}
+static void kfree_pmc(struct ip_mc_list *pmc)
+{ /* Release a multicast record together with the two source-filter
+   * lists it points to (sources and tomb), then the record itself.
+   */
+ ip_sf_list_clear_all(pmc->sources);
+ ip_sf_list_clear_all(pmc->tomb);
+ kfree(pmc);
+}
+
static void igmpv3_send_cr(struct in_device *in_dev)
{
struct ip_mc_list *pmc, *pmc_prev, *pmc_next;
@@ -671,7 +689,7 @@ static void igmpv3_send_cr(struct in_device *in_dev)
else
in_dev->mc_tomb = pmc_next;
in_dev_put(pmc->interface);
- kfree(pmc);
+ kfree_pmc(pmc);
} else
pmc_prev = pmc;
}
@@ -725,7 +743,8 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
return igmpv3_send_report(in_dev, pmc);
- if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports)
+ if (ipv4_is_local_multicast(group) &&
+ !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
return 0;
if (type == IGMP_HOST_LEAVE_MESSAGE)
@@ -782,7 +801,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
static void igmp_gq_timer_expire(struct timer_list *t)
{
- struct in_device *in_dev = from_timer(in_dev, t, mr_gq_timer);
+ struct in_device *in_dev = timer_container_of(in_dev, t, mr_gq_timer);
in_dev->mr_gq_running = 0;
igmpv3_send_report(in_dev, NULL);
@@ -791,11 +810,18 @@ static void igmp_gq_timer_expire(struct timer_list *t)
static void igmp_ifc_timer_expire(struct timer_list *t)
{
- struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer);
+ struct in_device *in_dev = timer_container_of(in_dev, t, mr_ifc_timer);
+ u32 mr_ifc_count;
igmpv3_send_cr(in_dev);
- if (in_dev->mr_ifc_count) {
- in_dev->mr_ifc_count--;
+restart:
+ mr_ifc_count = READ_ONCE(in_dev->mr_ifc_count);
+
+ if (mr_ifc_count) {
+ if (cmpxchg(&in_dev->mr_ifc_count,
+ mr_ifc_count,
+ mr_ifc_count - 1) != mr_ifc_count)
+ goto restart;
igmp_ifc_start_timer(in_dev,
unsolicited_report_interval(in_dev));
}
@@ -807,14 +833,14 @@ static void igmp_ifc_event(struct in_device *in_dev)
struct net *net = dev_net(in_dev->dev);
if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
return;
- in_dev->mr_ifc_count = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+ WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv));
igmp_ifc_start_timer(in_dev, 1);
}
static void igmp_timer_expire(struct timer_list *t)
{
- struct ip_mc_list *im = from_timer(im, t, timer);
+ struct ip_mc_list *im = timer_container_of(im, t, timer);
struct in_device *in_dev = im->interface;
spin_lock(&im->lock);
@@ -902,7 +928,8 @@ static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
if (group == IGMP_ALL_HOSTS)
return false;
- if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports)
+ if (ipv4_is_local_multicast(group) &&
+ !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
return false;
rcu_read_lock();
@@ -935,17 +962,19 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
max_delay = IGMP_QUERY_RESPONSE_INTERVAL;
in_dev->mr_v1_seen = jiffies +
- IGMP_V1_ROUTER_PRESENT_TIMEOUT;
+ (in_dev->mr_qrv * in_dev->mr_qi) +
+ in_dev->mr_qri;
group = 0;
} else {
/* v2 router present */
max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
in_dev->mr_v2_seen = jiffies +
- IGMP_V2_ROUTER_PRESENT_TIMEOUT;
+ (in_dev->mr_qrv * in_dev->mr_qi) +
+ in_dev->mr_qri;
}
/* cancel the interface change timer */
- in_dev->mr_ifc_count = 0;
- if (del_timer(&in_dev->mr_ifc_timer))
+ WRITE_ONCE(in_dev->mr_ifc_count, 0);
+ if (timer_delete(&in_dev->mr_ifc_timer))
__in_dev_put(in_dev);
/* clear deleted report items */
igmpv3_clear_delrec(in_dev);
@@ -981,8 +1010,21 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
if (!max_delay)
max_delay = 1; /* can't mod w/ 0 */
in_dev->mr_maxdelay = max_delay;
- if (ih3->qrv)
- in_dev->mr_qrv = ih3->qrv;
+
+ /* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently
+ * received value was zero, use the default or statically
+ * configured value.
+ */
+ in_dev->mr_qrv = ih3->qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+ in_dev->mr_qi = IGMPV3_QQIC(ih3->qqic)*HZ ?: IGMP_QUERY_INTERVAL;
+
+ /* RFC3376, 8.3. Query Response Interval:
+ * The number of seconds represented by the [Query Response
+ * Interval] must be less than the [Query Interval].
+ */
+ if (in_dev->mr_qri >= in_dev->mr_qi)
+ in_dev->mr_qri = (in_dev->mr_qi/HZ - 1)*HZ;
+
if (!group) { /* general query */
if (ih3->nsrcs)
return true; /* no sources allowed */
@@ -1012,7 +1054,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
if (im->multiaddr == IGMP_ALL_HOSTS)
continue;
if (ipv4_is_local_multicast(im->multiaddr) &&
- !net->ipv4.sysctl_igmp_llm_reports)
+ !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
continue;
spin_lock_bh(&im->lock);
if (im->tm_running)
@@ -1133,7 +1175,8 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
/*
* deleted ip_mc_list manipulation
*/
-static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im,
+ gfp_t gfp)
{
struct ip_mc_list *pmc;
struct net *net = dev_net(in_dev->dev);
@@ -1144,7 +1187,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
* for deleted items allows change reports to use common code with
* non-deleted or query-response MCA's.
*/
- pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
+ pmc = kzalloc(sizeof(*pmc), gfp);
if (!pmc)
return;
spin_lock_init(&pmc->lock);
@@ -1152,7 +1195,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
pmc->interface = im->interface;
in_dev_hold(in_dev);
pmc->multiaddr = im->multiaddr;
- pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+ pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
pmc->sfmode = im->sfmode;
if (pmc->sfmode == MCAST_INCLUDE) {
struct ip_sf_list *psf;
@@ -1200,15 +1243,17 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
if (pmc) {
im->interface = pmc->interface;
if (im->sfmode == MCAST_INCLUDE) {
- im->tomb = pmc->tomb;
- im->sources = pmc->sources;
+ swap(im->tomb, pmc->tomb);
+ swap(im->sources, pmc->sources);
for (psf = im->sources; psf; psf = psf->sf_next)
- psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+ psf->sf_crcount = in_dev->mr_qrv ?:
+ READ_ONCE(net->ipv4.sysctl_igmp_qrv);
} else {
- im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+ im->crcount = in_dev->mr_qrv ?:
+ READ_ONCE(net->ipv4.sysctl_igmp_qrv);
}
in_dev_put(pmc->interface);
- kfree(pmc);
+ kfree_pmc(pmc);
}
spin_unlock_bh(&im->lock);
}
@@ -1229,27 +1274,24 @@ static void igmpv3_clear_delrec(struct in_device *in_dev)
nextpmc = pmc->next;
ip_mc_clear_src(pmc);
in_dev_put(pmc->interface);
- kfree(pmc);
+ kfree_pmc(pmc);
}
/* clear dead sources, too */
rcu_read_lock();
for_each_pmc_rcu(in_dev, pmc) {
- struct ip_sf_list *psf, *psf_next;
+ struct ip_sf_list *psf;
spin_lock_bh(&pmc->lock);
psf = pmc->tomb;
pmc->tomb = NULL;
spin_unlock_bh(&pmc->lock);
- for (; psf; psf = psf_next) {
- psf_next = psf->sf_next;
- kfree(psf);
- }
+ ip_sf_list_clear_all(psf);
}
rcu_read_unlock();
}
#endif
-static void igmp_group_dropped(struct ip_mc_list *im)
+static void __igmp_group_dropped(struct ip_mc_list *im, gfp_t gfp)
{
struct in_device *in_dev = im->interface;
#ifdef CONFIG_IP_MULTICAST
@@ -1265,7 +1307,8 @@ static void igmp_group_dropped(struct ip_mc_list *im)
#ifdef CONFIG_IP_MULTICAST
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
- if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
+ if (ipv4_is_local_multicast(im->multiaddr) &&
+ !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
return;
reporter = im->reporter;
@@ -1280,13 +1323,18 @@ static void igmp_group_dropped(struct ip_mc_list *im)
return;
}
/* IGMPv3 */
- igmpv3_add_delrec(in_dev, im);
+ igmpv3_add_delrec(in_dev, im, gfp);
igmp_ifc_event(in_dev);
}
#endif
}
+static void igmp_group_dropped(struct ip_mc_list *im)
+{ /* Wrapper around __igmp_group_dropped() that always passes
+   * GFP_KERNEL as the allocation mode.
+   */
+ __igmp_group_dropped(im, GFP_KERNEL);
+}
+
static void igmp_group_added(struct ip_mc_list *im)
{
struct in_device *in_dev = im->interface;
@@ -1302,13 +1350,14 @@ static void igmp_group_added(struct ip_mc_list *im)
#ifdef CONFIG_IP_MULTICAST
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
- if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
+ if (ipv4_is_local_multicast(im->multiaddr) &&
+ !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
return;
if (in_dev->dead)
return;
- im->unsolicit_count = net->ipv4.sysctl_igmp_qrv;
+ im->unsolicit_count = READ_ONCE(net->ipv4.sysctl_igmp_qrv);
if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
spin_lock_bh(&im->lock);
igmp_start_timer(im, IGMP_INITIAL_REPORT_DELAY);
@@ -1322,7 +1371,7 @@ static void igmp_group_added(struct ip_mc_list *im)
* IN() to IN(A).
*/
if (im->sfmode == MCAST_EXCLUDE)
- im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+ im->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
igmp_ifc_event(in_dev);
#endif
@@ -1384,26 +1433,106 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
*mc_hash = im->next_hash;
}
+int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev,
+ const struct ip_mc_list *im,
+ struct inet_fill_args *args)
+{ /* Append one netlink address message describing multicast group im
+   * on dev to skb. The message header fields (portid, seq, event,
+   * flags) are taken from args.
+   * Returns 0 on success, or -EMSGSIZE when nlmsg_put() or either
+   * attribute put fails; in the attribute case the partly built
+   * message is cancelled before returning.
+   */
+ struct ifa_cacheinfo ci;
+ struct ifaddrmsg *ifm;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
+ sizeof(struct ifaddrmsg), args->flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifa_family = AF_INET;
+ ifm->ifa_prefixlen = 32; /* the group is reported as a single /32 address */
+ ifm->ifa_flags = IFA_F_PERMANENT;
+ ifm->ifa_scope = RT_SCOPE_UNIVERSE;
+ ifm->ifa_index = dev->ifindex;
+
+ ci.cstamp = (READ_ONCE(im->mca_cstamp) - INITIAL_JIFFIES) * 100UL / HZ; /* jiffies since INITIAL_JIFFIES, rescaled to 1/100 s units */
+ ci.tstamp = ci.cstamp; /* the update time reported is the creation time; mca_tstamp is not read here */
+ ci.ifa_prefered = INFINITY_LIFE_TIME;
+ ci.ifa_valid = INFINITY_LIFE_TIME;
+
+ if (nla_put_in_addr(skb, IFA_MULTICAST, im->multiaddr) < 0 ||
+ nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci) < 0) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static void inet_ifmcaddr_notify(struct net_device *dev,
+ const struct ip_mc_list *im, int event)
+{ /* Build an address message for group im on dev with the given event
+   * type and pass it to rtnl_notify() for RTNLGRP_IPV4_MCADDR. When the
+   * skb cannot be allocated or filled, the error code is reported on
+   * that group through rtnl_set_sk_err() instead.
+   * NOTE(review): the allocation and the notify both use GFP_KERNEL,
+   * while the callers in this file (____ip_mc_inc_group() and
+   * __ip_mc_dec_group()) accept a caller-chosen gfp_t - confirm that
+   * no caller reaches this function from atomic context.
+   */
+ struct inet_fill_args fillargs = {
+ .event = event,
+ };
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOMEM; /* reported if nlmsg_new() fails below */
+
+ skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
+ nla_total_size(sizeof(__be32)) +
+ nla_total_size(sizeof(struct ifa_cacheinfo)),
+ GFP_KERNEL);
+ if (!skb)
+ goto error;
+
+ err = inet_fill_ifmcaddr(skb, dev, im, &fillargs);
+ if (err < 0) {
+ WARN_ON_ONCE(err == -EMSGSIZE); /* the skb was sized above for the header plus both attributes, so -EMSGSIZE presumably means that size estimate is wrong */
+ nlmsg_free(skb);
+ goto error;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MCADDR, NULL, GFP_KERNEL);
+ return;
+error:
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_MCADDR, err);
+}
/*
* A socket has joined a multicast group on device dev.
*/
-static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
- unsigned int mode)
+static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
+ unsigned int mode, gfp_t gfp)
{
+ struct ip_mc_list __rcu **mc_hash;
struct ip_mc_list *im;
ASSERT_RTNL();
- for_each_pmc_rtnl(in_dev, im) {
- if (im->multiaddr == addr) {
- im->users++;
- ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0);
- goto out;
+ mc_hash = rtnl_dereference(in_dev->mc_hash);
+ if (mc_hash) {
+ u32 hash = hash_32((__force u32)addr, MC_HASH_SZ_LOG);
+
+ for (im = rtnl_dereference(mc_hash[hash]);
+ im;
+ im = rtnl_dereference(im->next_hash)) {
+ if (im->multiaddr == addr)
+ break;
}
+ } else {
+ for_each_pmc_rtnl(in_dev, im) {
+ if (im->multiaddr == addr)
+ break;
+ }
+ }
+
+ if (im) {
+ im->users++;
+ ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0);
+ goto out;
}
- im = kzalloc(sizeof(*im), GFP_KERNEL);
+ im = kzalloc(sizeof(*im), gfp);
if (!im)
goto out;
@@ -1411,6 +1540,8 @@ static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
im->interface = in_dev;
in_dev_hold(in_dev);
im->multiaddr = addr;
+ im->mca_cstamp = jiffies;
+ im->mca_tstamp = im->mca_cstamp;
/* initial mode is (EX, empty) */
im->sfmode = mode;
im->sfcount[mode] = 1;
@@ -1430,15 +1561,22 @@ static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
igmpv3_del_delrec(in_dev, im);
#endif
igmp_group_added(im);
+ inet_ifmcaddr_notify(in_dev->dev, im, RTM_NEWMULTICAST);
if (!in_dev->dead)
ip_rt_multicast_event(in_dev);
out:
return;
}
+void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, gfp_t gfp)
+{ /* Exported entry point: add a reference to group addr on in_dev in
+   * MCAST_EXCLUDE mode, forwarding the caller's gfp allocation mode to
+   * ____ip_mc_inc_group().
+   */
+ ____ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE, gfp);
+}
+EXPORT_SYMBOL(__ip_mc_inc_group);
+
void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
{
- __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE);
+ __ip_mc_inc_group(in_dev, addr, GFP_KERNEL);
}
EXPORT_SYMBOL(ip_mc_inc_group);
@@ -1481,22 +1619,22 @@ static int ip_mc_check_igmp_reportv3(struct sk_buff *skb)
len += sizeof(struct igmpv3_report);
- return pskb_may_pull(skb, len) ? 0 : -EINVAL;
+ return ip_mc_may_pull(skb, len) ? 0 : -EINVAL;
}
static int ip_mc_check_igmp_query(struct sk_buff *skb)
{
- unsigned int len = skb_transport_offset(skb);
-
- len += sizeof(struct igmphdr);
- if (skb->len < len)
- return -EINVAL;
+ unsigned int transport_len = ip_transport_len(skb);
+ unsigned int len;
/* IGMPv{1,2}? */
- if (skb->len != len) {
+ if (transport_len != sizeof(struct igmphdr)) {
/* or IGMPv3? */
- len += sizeof(struct igmpv3_query) - sizeof(struct igmphdr);
- if (skb->len < len || !pskb_may_pull(skb, len))
+ if (transport_len < sizeof(struct igmpv3_query))
+ return -EINVAL;
+
+ len = skb_transport_offset(skb) + sizeof(struct igmpv3_query);
+ if (!ip_mc_may_pull(skb, len))
return -EINVAL;
}
@@ -1516,7 +1654,6 @@ static int ip_mc_check_igmp_msg(struct sk_buff *skb)
case IGMP_HOST_LEAVE_MESSAGE:
case IGMP_HOST_MEMBERSHIP_REPORT:
case IGMPV2_HOST_MEMBERSHIP_REPORT:
- /* fall through */
return 0;
case IGMPV3_HOST_MEMBERSHIP_REPORT:
return ip_mc_check_igmp_reportv3(skb);
@@ -1527,52 +1664,34 @@ static int ip_mc_check_igmp_msg(struct sk_buff *skb)
}
}
-static inline __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
+static __sum16 ip_mc_validate_checksum(struct sk_buff *skb)
{
return skb_checksum_simple_validate(skb);
}
-static int __ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
-
+static int ip_mc_check_igmp_csum(struct sk_buff *skb)
{
- struct sk_buff *skb_chk;
- unsigned int transport_len;
unsigned int len = skb_transport_offset(skb) + sizeof(struct igmphdr);
- int ret = -EINVAL;
+ unsigned int transport_len = ip_transport_len(skb);
+ struct sk_buff *skb_chk;
- transport_len = ntohs(ip_hdr(skb)->tot_len) - ip_hdrlen(skb);
+ if (!ip_mc_may_pull(skb, len))
+ return -EINVAL;
skb_chk = skb_checksum_trimmed(skb, transport_len,
ip_mc_validate_checksum);
if (!skb_chk)
- goto err;
-
- if (!pskb_may_pull(skb_chk, len))
- goto err;
-
- ret = ip_mc_check_igmp_msg(skb_chk);
- if (ret)
- goto err;
-
- if (skb_trimmed)
- *skb_trimmed = skb_chk;
- /* free now unneeded clone */
- else if (skb_chk != skb)
- kfree_skb(skb_chk);
-
- ret = 0;
+ return -EINVAL;
-err:
- if (ret && skb_chk && skb_chk != skb)
+ if (skb_chk != skb)
kfree_skb(skb_chk);
- return ret;
+ return 0;
}
/**
* ip_mc_check_igmp - checks whether this is a sane IGMP packet
* @skb: the skb to validate
- * @skb_trimmed: to store an skb pointer trimmed to IPv4 packet tail (optional)
*
* Checks whether an IPv4 packet is a valid IGMP packet. If so sets
* skb transport header accordingly and returns zero.
@@ -1582,18 +1701,10 @@ err:
* -ENOMSG: IP header validation succeeded but it is not an IGMP packet.
* -ENOMEM: A memory allocation failure happened.
*
- * Optionally, an skb pointer might be provided via skb_trimmed (or set it
- * to NULL): After parsing an IGMP packet successfully it will point to
- * an skb which has its tail aligned to the IP packet end. This might
- * either be the originally provided skb or a trimmed, cloned version if
- * the skb frame had data beyond the IP packet. A cloned skb allows us
- * to leave the original skb and its full frame unchanged (which might be
- * desirable for layer 2 frame jugglers).
- *
- * Caller needs to set the skb network header and free any returned skb if it
- * differs from the provided skb.
+ * Caller needs to set the skb network header.
*/
-int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
+int ip_mc_check_igmp(struct sk_buff *skb)
{
int ret = ip_mc_check_iphdr(skb);
@@ -1603,7 +1714,11 @@ int ip_mc_check_igmp(struct sk_buff *skb, struct sk_buff **skb_trimmed)
if (ip_hdr(skb)->protocol != IPPROTO_IGMP)
return -ENOMSG;
- return __ip_mc_check_igmp(skb, skb_trimmed);
+ ret = ip_mc_check_igmp_csum(skb);
+ if (ret < 0)
+ return ret;
+
+ return ip_mc_check_igmp_msg(skb);
}
EXPORT_SYMBOL(ip_mc_check_igmp);
@@ -1623,7 +1738,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)
if (im->multiaddr == IGMP_ALL_HOSTS)
continue;
if (ipv4_is_local_multicast(im->multiaddr) &&
- !net->ipv4.sysctl_igmp_llm_reports)
+ !READ_ONCE(net->ipv4.sysctl_igmp_llm_reports))
continue;
/* a failover is happening and switches
@@ -1644,7 +1759,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)
* A socket has left a multicast group on device dev
*/
-void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
+void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp)
{
struct ip_mc_list *i;
struct ip_mc_list __rcu **ip;
@@ -1659,7 +1774,9 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
ip_mc_hash_remove(in_dev, i);
*ip = i->next_rcu;
in_dev->mc_count--;
- igmp_group_dropped(i);
+ __igmp_group_dropped(i, gfp);
+ inet_ifmcaddr_notify(in_dev->dev, i,
+ RTM_DELMULTICAST);
ip_mc_clear_src(i);
if (!in_dev->dead)
@@ -1672,7 +1789,7 @@ void ip_mc_dec_group(struct in_device *in_dev, __be32 addr)
}
}
}
-EXPORT_SYMBOL(ip_mc_dec_group);
+EXPORT_SYMBOL(__ip_mc_dec_group);
/* Device changing type */
@@ -1712,29 +1829,41 @@ void ip_mc_down(struct in_device *in_dev)
igmp_group_dropped(pmc);
#ifdef CONFIG_IP_MULTICAST
- in_dev->mr_ifc_count = 0;
- if (del_timer(&in_dev->mr_ifc_timer))
+ WRITE_ONCE(in_dev->mr_ifc_count, 0);
+ if (timer_delete(&in_dev->mr_ifc_timer))
__in_dev_put(in_dev);
in_dev->mr_gq_running = 0;
- if (del_timer(&in_dev->mr_gq_timer))
+ if (timer_delete(&in_dev->mr_gq_timer))
__in_dev_put(in_dev);
#endif
ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
}
-void ip_mc_init_dev(struct in_device *in_dev)
-{
#ifdef CONFIG_IP_MULTICAST
+static void ip_mc_reset(struct in_device *in_dev)
+{ /* Put the per-device IGMP querier parameters back to their starting
+   * values: query interval and query response interval from the
+   * IGMP_QUERY_INTERVAL / IGMP_QUERY_RESPONSE_INTERVAL defaults, and
+   * the robustness value from this netns's igmp_qrv sysctl.
+   */
+ struct net *net = dev_net(in_dev->dev);
+
+ in_dev->mr_qi = IGMP_QUERY_INTERVAL;
+ in_dev->mr_qri = IGMP_QUERY_RESPONSE_INTERVAL;
+ in_dev->mr_qrv = READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+}
+#else
+static void ip_mc_reset(struct in_device *in_dev)
+{ /* built without CONFIG_IP_MULTICAST: nothing is reset */
+}
#endif
+
+void ip_mc_init_dev(struct in_device *in_dev)
+{
ASSERT_RTNL();
#ifdef CONFIG_IP_MULTICAST
timer_setup(&in_dev->mr_gq_timer, igmp_gq_timer_expire, 0);
timer_setup(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire, 0);
- in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
#endif
+ ip_mc_reset(in_dev);
spin_lock_init(&in_dev->mc_tomb_lock);
}
@@ -1744,15 +1873,10 @@ void ip_mc_init_dev(struct in_device *in_dev)
void ip_mc_up(struct in_device *in_dev)
{
struct ip_mc_list *pmc;
-#ifdef CONFIG_IP_MULTICAST
- struct net *net = dev_net(in_dev->dev);
-#endif
ASSERT_RTNL();
-#ifdef CONFIG_IP_MULTICAST
- in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
-#endif
+ ip_mc_reset(in_dev);
ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
for_each_pmc_rtnl(in_dev, pmc) {
@@ -1782,6 +1906,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) {
in_dev->mc_list = i->next_rcu;
in_dev->mc_count--;
+ ip_mc_clear_src(i);
ip_ma_put(i);
}
}
@@ -1805,7 +1930,8 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
if (!dev) {
struct rtable *rt = ip_route_output(net,
imr->imr_multiaddr.s_addr,
- 0, 0, 0);
+ 0, 0, 0,
+ RT_SCOPE_UNIVERSE);
if (!IS_ERR(rt)) {
dev = rt->dst.dev;
ip_rt_put(rt);
@@ -1856,7 +1982,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
#ifdef CONFIG_IP_MULTICAST
if (psf->sf_oldin &&
!IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
- psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+ psf->sf_crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
psf->sf_next = pmc->tomb;
pmc->tomb = psf;
rv = 1;
@@ -1920,8 +2046,8 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
/* filter mode change */
pmc->sfmode = MCAST_INCLUDE;
#ifdef CONFIG_IP_MULTICAST
- pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
- in_dev->mr_ifc_count = pmc->crcount;
+ pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+ WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount);
for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
igmp_ifc_event(pmc->interface);
@@ -2099,8 +2225,8 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
#ifdef CONFIG_IP_MULTICAST
/* else no filters; keep old mode for reports */
- pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
- in_dev->mr_ifc_count = pmc->crcount;
+ pmc->crcount = in_dev->mr_qrv ?: READ_ONCE(net->ipv4.sysctl_igmp_qrv);
+ WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount);
for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
igmp_ifc_event(in_dev);
@@ -2114,7 +2240,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
static void ip_mc_clear_src(struct ip_mc_list *pmc)
{
- struct ip_sf_list *psf, *nextpsf, *tomb, *sources;
+ struct ip_sf_list *tomb, *sources;
spin_lock_bh(&pmc->lock);
tomb = pmc->tomb;
@@ -2126,14 +2252,8 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
pmc->sfcount[MCAST_EXCLUDE] = 1;
spin_unlock_bh(&pmc->lock);
- for (psf = tomb; psf; psf = nextpsf) {
- nextpsf = psf->sf_next;
- kfree(psf);
- }
- for (psf = sources; psf; psf = nextpsf) {
- nextpsf = psf->sf_next;
- kfree(psf);
- }
+ ip_sf_list_clear_all(tomb);
+ ip_sf_list_clear_all(sources);
}
/* Join a multicast group
@@ -2171,7 +2291,7 @@ static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr,
count++;
}
err = -ENOBUFS;
- if (count >= net->ipv4.sysctl_igmp_max_memberships)
+ if (count >= READ_ONCE(net->ipv4.sysctl_igmp_max_memberships))
goto done;
iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
if (!iml)
@@ -2182,7 +2302,7 @@ static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr,
iml->sflist = NULL;
iml->sfmode = mode;
rcu_assign_pointer(inet->mc_list, iml);
- __ip_mc_inc_group(in_dev, addr, mode);
+ ____ip_mc_inc_group(in_dev, addr, mode, GFP_KERNEL);
err = 0;
done:
return err;
@@ -2219,7 +2339,7 @@ static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
iml->sfmode, psf->sl_count, psf->sl_addr, 0);
RCU_INIT_POINTER(iml->sflist, NULL);
/* decrease mem now to avoid the memleak warning */
- atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc);
+ atomic_sub(struct_size(psf, sl_addr, psf->sl_max), &sk->sk_omem_alloc);
kfree_rcu(psf, rcu);
return err;
}
@@ -2358,7 +2478,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
}
/* else, add a new source to the filter */
- if (psl && psl->sl_count >= net->ipv4.sysctl_igmp_max_msf) {
+ if (psl && psl->sl_count >= READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) {
err = -ENOBUFS;
goto done;
}
@@ -2368,7 +2488,8 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
if (psl)
count += psl->sl_max;
- newpsl = sock_kmalloc(sk, IP_SFLSIZE(count), GFP_KERNEL);
+ newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count),
+ GFP_KERNEL);
if (!newpsl) {
err = -ENOBUFS;
goto done;
@@ -2379,10 +2500,12 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
for (i = 0; i < psl->sl_count; i++)
newpsl->sl_addr[i] = psl->sl_addr[i];
/* decrease mem now to avoid the memleak warning */
- atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
- kfree_rcu(psl, rcu);
+ atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+ &sk->sk_omem_alloc);
}
rcu_assign_pointer(pmc->sflist, newpsl);
+ if (psl)
+ kfree_rcu(psl, rcu);
psl = newpsl;
}
rv = 1; /* > 0 for insert logic below if sl_count is 0 */
@@ -2454,19 +2577,22 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
goto done;
}
if (msf->imsf_numsrc) {
- newpsl = sock_kmalloc(sk, IP_SFLSIZE(msf->imsf_numsrc),
- GFP_KERNEL);
+ newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr,
+ msf->imsf_numsrc),
+ GFP_KERNEL);
if (!newpsl) {
err = -ENOBUFS;
goto done;
}
newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc;
- memcpy(newpsl->sl_addr, msf->imsf_slist,
- msf->imsf_numsrc * sizeof(msf->imsf_slist[0]));
+ memcpy(newpsl->sl_addr, msf->imsf_slist_flex,
+ flex_array_size(msf, imsf_slist_flex, msf->imsf_numsrc));
err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
if (err) {
- sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max));
+ sock_kfree_s(sk, newpsl,
+ struct_size(newpsl, sl_addr,
+ newpsl->sl_max));
goto done;
}
} else {
@@ -2479,12 +2605,15 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
psl->sl_count, psl->sl_addr, 0);
/* decrease mem now to avoid the memleak warning */
- atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
- kfree_rcu(psl, rcu);
- } else
+ atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+ &sk->sk_omem_alloc);
+ } else {
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
0, NULL, 0);
+ }
rcu_assign_pointer(pmc->sflist, newpsl);
+ if (psl)
+ kfree_rcu(psl, rcu);
pmc->sfmode = msf->imsf_fmode;
err = 0;
done:
@@ -2492,11 +2621,10 @@ done:
err = ip_mc_leave_group(sk, &imr);
return err;
}
-
int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
- struct ip_msfilter __user *optval, int __user *optlen)
+ sockptr_t optval, sockptr_t optlen)
{
- int err, len, count, copycount;
+ int err, len, count, copycount, msf_size;
struct ip_mreqn imr;
__be32 addr = msf->imsf_multiaddr;
struct ip_mc_socklist *pmc;
@@ -2531,20 +2659,22 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
msf->imsf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
if (!psl) {
- len = 0;
count = 0;
} else {
count = psl->sl_count;
}
copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;
- len = copycount * sizeof(psl->sl_addr[0]);
+ len = flex_array_size(psl, sl_addr, copycount);
msf->imsf_numsrc = count;
- if (put_user(IP_MSFILTER_SIZE(copycount), optlen) ||
- copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) {
+ msf_size = IP_MSFILTER_SIZE(copycount);
+ if (copy_to_sockptr(optlen, &msf_size, sizeof(int)) ||
+ copy_to_sockptr(optval, msf, IP_MSFILTER_SIZE(0))) {
return -EFAULT;
}
if (len &&
- copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len))
+ copy_to_sockptr_offset(optval,
+ offsetof(struct ip_msfilter, imsf_slist_flex),
+ psl->sl_addr, len))
return -EFAULT;
return 0;
done:
@@ -2552,9 +2682,9 @@ done:
}
int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
- struct group_filter __user *optval, int __user *optlen)
+ sockptr_t optval, size_t ss_offset)
{
- int err, i, count, copycount;
+ int i, count, copycount;
struct sockaddr_in *psin;
__be32 addr;
struct ip_mc_socklist *pmc;
@@ -2570,24 +2700,18 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
if (!ipv4_is_multicast(addr))
return -EINVAL;
- err = -EADDRNOTAVAIL;
-
for_each_pmc_rtnl(inet, pmc) {
if (pmc->multi.imr_multiaddr.s_addr == addr &&
pmc->multi.imr_ifindex == gsf->gf_interface)
break;
}
if (!pmc) /* must have a prior join */
- goto done;
+ return -EADDRNOTAVAIL;
gsf->gf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
count = psl ? psl->sl_count : 0;
copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
gsf->gf_numsrc = count;
- if (put_user(GROUP_FILTER_SIZE(copycount), optlen) ||
- copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
- return -EFAULT;
- }
for (i = 0; i < copycount; i++) {
struct sockaddr_storage ss;
@@ -2595,21 +2719,21 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
memset(&ss, 0, sizeof(ss));
psin->sin_family = AF_INET;
psin->sin_addr.s_addr = psl->sl_addr[i];
- if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss)))
+ if (copy_to_sockptr_offset(optval, ss_offset,
+ &ss, sizeof(ss)))
return -EFAULT;
+ ss_offset += sizeof(ss);
}
return 0;
-done:
- return err;
}
/*
* check if a multicast source filter allows delivery for a given <src,dst,intf>
*/
-int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr,
+int ip_mc_sf_allow(const struct sock *sk, __be32 loc_addr, __be32 rmt_addr,
int dif, int sdif)
{
- struct inet_sock *inet = inet_sk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
struct ip_mc_socklist *pmc;
struct ip_sf_socklist *psl;
int i;
@@ -2626,7 +2750,7 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr,
(sdif && pmc->multi.imr_ifindex == sdif)))
break;
}
- ret = inet->mc_all;
+ ret = inet_test_bit(MC_ALL, sk);
if (!pmc)
goto unlock;
psl = rcu_dereference(pmc->sflist);
@@ -2707,6 +2831,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u
rv = 1;
} else if (im) {
if (src_addr) {
+ spin_lock_bh(&im->lock);
for (psf = im->sources; psf; psf = psf->sf_next) {
if (psf->sf_inaddr == src_addr)
break;
@@ -2717,6 +2842,7 @@ int ip_check_mc_rcu(struct in_device *in_dev, __be32 mc_addr, __be32 src_addr, u
im->sfcount[MCAST_EXCLUDE];
else
rv = im->sfcount[MCAST_EXCLUDE] != 0;
+ spin_unlock_bh(&im->lock);
} else
rv = 1; /* unspecified source; tentatively allow */
}
@@ -2816,7 +2942,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v)
seq_puts(seq,
"Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n");
else {
- struct ip_mc_list *im = (struct ip_mc_list *)v;
+ struct ip_mc_list *im = v;
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
char *querier;
long delta;
@@ -2909,8 +3035,6 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
continue;
state->im = rcu_dereference(state->idev->mc_list);
}
- if (!state->im)
- break;
spin_lock_bh(&state->im->lock);
psf = state->im->sources;
}
@@ -2960,7 +3084,7 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
{
- struct ip_sf_list *psf = (struct ip_sf_list *)v;
+ struct ip_sf_list *psf = v;
struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
if (v == SEQ_START_TOKEN) {
diff --git a/net/ipv4/igmp_internal.h b/net/ipv4/igmp_internal.h
new file mode 100644
index 000000000000..0a1bcc8ec8e1
--- /dev/null
+++ b/net/ipv4/igmp_internal.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_IGMP_INTERNAL_H
+#define _LINUX_IGMP_INTERNAL_H
+
+struct inet_fill_args {
+ u32 portid;
+ u32 seq;
+ int event;
+ unsigned int flags;
+ int netnsid;
+ int ifindex;
+};
+
+int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev,
+ const struct ip_mc_list *im,
+ struct inet_fill_args *args);
+#endif
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 15e7f7915a21..97d57c52b9ad 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -6,11 +7,6 @@
* Support for INET connection oriented protocols.
*
* Authors: See the TCP sources
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or(at your option) any later version.
*/
#include <linux/module.h>
@@ -28,17 +24,19 @@
#include <net/addrconf.h>
#if IS_ENABLED(CONFIG_IPV6)
-/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
- * only, and any IPv4 addresses if not IPv6 only
- * match_wildcard == false: addresses must be exactly the same, i.e.
- * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
- * and 0.0.0.0 equals to 0.0.0.0 only
+/* match_sk*_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses
+ * if IPv6 only, and any IPv4 addresses
+ * if not IPv6 only
+ * match_sk*_wildcard == false: addresses must be exactly the same, i.e.
+ * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
+ * and 0.0.0.0 equals to 0.0.0.0 only
*/
static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
const struct in6_addr *sk2_rcv_saddr6,
__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
bool sk1_ipv6only, bool sk2_ipv6only,
- bool match_wildcard)
+ bool match_sk1_wildcard,
+ bool match_sk2_wildcard)
{
int addr_type = ipv6_addr_type(sk1_rcv_saddr6);
int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
@@ -48,8 +46,8 @@ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
if (!sk2_ipv6only) {
if (sk1_rcv_saddr == sk2_rcv_saddr)
return true;
- if (!sk1_rcv_saddr || !sk2_rcv_saddr)
- return match_wildcard;
+ return (match_sk1_wildcard && !sk1_rcv_saddr) ||
+ (match_sk2_wildcard && !sk2_rcv_saddr);
}
return false;
}
@@ -57,11 +55,11 @@ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
return true;
- if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
+ if (addr_type2 == IPV6_ADDR_ANY && match_sk2_wildcard &&
!(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
return true;
- if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
+ if (addr_type == IPV6_ADDR_ANY && match_sk1_wildcard &&
!(sk1_ipv6only && addr_type2 == IPV6_ADDR_MAPPED))
return true;
@@ -73,18 +71,19 @@ static bool ipv6_rcv_saddr_equal(const struct in6_addr *sk1_rcv_saddr6,
}
#endif
-/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses
- * match_wildcard == false: addresses must be exactly the same, i.e.
- * 0.0.0.0 only equals to 0.0.0.0
+/* match_sk*_wildcard == true: 0.0.0.0 equals to any IPv4 addresses
+ * match_sk*_wildcard == false: addresses must be exactly the same, i.e.
+ * 0.0.0.0 only equals to 0.0.0.0
*/
static bool ipv4_rcv_saddr_equal(__be32 sk1_rcv_saddr, __be32 sk2_rcv_saddr,
- bool sk2_ipv6only, bool match_wildcard)
+ bool sk2_ipv6only, bool match_sk1_wildcard,
+ bool match_sk2_wildcard)
{
if (!sk2_ipv6only) {
if (sk1_rcv_saddr == sk2_rcv_saddr)
return true;
- if (!sk1_rcv_saddr || !sk2_rcv_saddr)
- return match_wildcard;
+ return (match_sk1_wildcard && !sk1_rcv_saddr) ||
+ (match_sk2_wildcard && !sk2_rcv_saddr);
}
return false;
}
@@ -100,10 +99,12 @@ bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
sk2->sk_rcv_saddr,
ipv6_only_sock(sk),
ipv6_only_sock(sk2),
+ match_wildcard,
match_wildcard);
#endif
return ipv4_rcv_saddr_equal(sk->sk_rcv_saddr, sk2->sk_rcv_saddr,
- ipv6_only_sock(sk2), match_wildcard);
+ ipv6_only_sock(sk2), match_wildcard,
+ match_wildcard);
}
EXPORT_SYMBOL(inet_rcv_saddr_equal);
@@ -116,77 +117,233 @@ bool inet_rcv_saddr_any(const struct sock *sk)
return !sk->sk_rcv_saddr;
}
-void inet_get_local_port_range(struct net *net, int *low, int *high)
+/**
+ * inet_sk_get_local_port_range - fetch ephemeral ports range
+ * @sk: socket
+ * @low: pointer to low port
+ * @high: pointer to high port
+ *
+ * Fetch netns port range (/proc/sys/net/ipv4/ip_local_port_range)
+ * Range can be overridden if socket got IP_LOCAL_PORT_RANGE option.
+ * Returns true if IP_LOCAL_PORT_RANGE was set on this socket.
+ */
+bool inet_sk_get_local_port_range(const struct sock *sk, int *low, int *high)
+{
+ int lo, hi, sk_lo, sk_hi;
+ bool local_range = false;
+ u32 sk_range;
+
+ inet_get_local_port_range(sock_net(sk), &lo, &hi);
+
+ sk_range = READ_ONCE(inet_sk(sk)->local_port_range);
+ if (unlikely(sk_range)) {
+ sk_lo = sk_range & 0xffff;
+ sk_hi = sk_range >> 16;
+
+ if (lo <= sk_lo && sk_lo <= hi)
+ lo = sk_lo;
+ if (lo <= sk_hi && sk_hi <= hi)
+ hi = sk_hi;
+ local_range = true;
+ }
+
+ *low = lo;
+ *high = hi;
+ return local_range;
+}
+EXPORT_SYMBOL(inet_sk_get_local_port_range);
+
+static bool inet_use_bhash2_on_bind(const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6) {
+ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ return false;
+
+ if (!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+ return true;
+ }
+#endif
+ return sk->sk_rcv_saddr != htonl(INADDR_ANY);
+}
+
+static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2,
+ kuid_t uid, bool relax,
+ bool reuseport_cb_ok, bool reuseport_ok)
+{
+ int bound_dev_if2;
+
+ if (sk == sk2)
+ return false;
+
+ bound_dev_if2 = READ_ONCE(sk2->sk_bound_dev_if);
+
+ if (!sk->sk_bound_dev_if || !bound_dev_if2 ||
+ sk->sk_bound_dev_if == bound_dev_if2) {
+ if (sk->sk_reuse && sk2->sk_reuse &&
+ sk2->sk_state != TCP_LISTEN) {
+ if (!relax || (!reuseport_ok && sk->sk_reuseport &&
+ sk2->sk_reuseport && reuseport_cb_ok &&
+ (sk2->sk_state == TCP_TIME_WAIT ||
+ uid_eq(uid, sk_uid(sk2)))))
+ return true;
+ } else if (!reuseport_ok || !sk->sk_reuseport ||
+ !sk2->sk_reuseport || !reuseport_cb_ok ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
+ !uid_eq(uid, sk_uid(sk2)))) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2,
+ kuid_t uid, bool relax,
+ bool reuseport_cb_ok, bool reuseport_ok)
+{
+ if (ipv6_only_sock(sk2)) {
+ if (sk->sk_family == AF_INET)
+ return false;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+ return false;
+#endif
+ }
+
+ return inet_bind_conflict(sk, sk2, uid, relax,
+ reuseport_cb_ok, reuseport_ok);
+}
+
+static bool inet_bhash2_conflict(const struct sock *sk,
+ const struct inet_bind2_bucket *tb2,
+ kuid_t uid,
+ bool relax, bool reuseport_cb_ok,
+ bool reuseport_ok)
{
- unsigned int seq;
+ struct sock *sk2;
- do {
- seq = read_seqbegin(&net->ipv4.ip_local_ports.lock);
+ sk_for_each_bound(sk2, &tb2->owners) {
+ if (__inet_bhash2_conflict(sk, sk2, uid, relax,
+ reuseport_cb_ok, reuseport_ok))
+ return true;
+ }
- *low = net->ipv4.ip_local_ports.range[0];
- *high = net->ipv4.ip_local_ports.range[1];
- } while (read_seqretry(&net->ipv4.ip_local_ports.lock, seq));
+ return false;
}
-EXPORT_SYMBOL(inet_get_local_port_range);
+#define sk_for_each_bound_bhash(__sk, __tb2, __tb) \
+ hlist_for_each_entry(__tb2, &(__tb)->bhash2, bhash_node) \
+ sk_for_each_bound((__sk), &(__tb2)->owners)
+
+/* This should be called only when the tb and tb2 hashbuckets' locks are held */
static int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb,
+ const struct inet_bind2_bucket *tb2, /* may be null */
bool relax, bool reuseport_ok)
{
+ struct sock_reuseport *reuseport_cb;
+ kuid_t uid = sk_uid(sk);
+ bool reuseport_cb_ok;
struct sock *sk2;
- bool reuse = sk->sk_reuse;
- bool reuseport = !!sk->sk_reuseport && reuseport_ok;
- kuid_t uid = sock_i_uid((struct sock *)sk);
- /*
- * Unlike other sk lookup places we do not check
+ rcu_read_lock();
+ reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
+ /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */
+ reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks);
+ rcu_read_unlock();
+
+ /* Conflicts with an existing IPV6_ADDR_ANY (if ipv6) or INADDR_ANY (if
+ * ipv4) should have been checked already. We need to do these two
+ * checks separately because their spinlocks have to be acquired/released
+ * independently of each other, to prevent possible deadlocks
+ */
+ if (inet_use_bhash2_on_bind(sk))
+ return tb2 && inet_bhash2_conflict(sk, tb2, uid, relax,
+ reuseport_cb_ok, reuseport_ok);
+
+ /* Unlike other sk lookup places we do not check
* for sk_net here, since _all_ the socks listed
- * in tb->owners list belong to the same net - the
- * one this bucket belongs to.
+ * in tb->owners and tb2->owners list belong
+ * to the same net - the one this bucket belongs to.
*/
+ sk_for_each_bound_bhash(sk2, tb2, tb) {
+ if (!inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok))
+ continue;
- sk_for_each_bound(sk2, &tb->owners) {
- if (sk != sk2 &&
- (!sk->sk_bound_dev_if ||
- !sk2->sk_bound_dev_if ||
- sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
- if ((!reuse || !sk2->sk_reuse ||
- sk2->sk_state == TCP_LISTEN) &&
- (!reuseport || !sk2->sk_reuseport ||
- rcu_access_pointer(sk->sk_reuseport_cb) ||
- (sk2->sk_state != TCP_TIME_WAIT &&
- !uid_eq(uid, sock_i_uid(sk2))))) {
- if (inet_rcv_saddr_equal(sk, sk2, true))
- break;
- }
- if (!relax && reuse && sk2->sk_reuse &&
- sk2->sk_state != TCP_LISTEN) {
- if (inet_rcv_saddr_equal(sk, sk2, true))
- break;
- }
- }
+ if (inet_rcv_saddr_equal(sk, sk2, true))
+ return true;
}
- return sk2 != NULL;
+
+ return false;
+}
+
+/* Determine if there is a bind conflict with an existing IPV6_ADDR_ANY (if ipv6) or
+ * INADDR_ANY (if ipv4) socket.
+ *
+ * Caller must hold bhash hashbucket lock with local bh disabled, to protect
+ * against concurrent binds on the port for addr any
+ */
+static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l3mdev,
+ bool relax, bool reuseport_ok)
+{
+ const struct net *net = sock_net(sk);
+ struct sock_reuseport *reuseport_cb;
+ struct inet_bind_hashbucket *head2;
+ struct inet_bind2_bucket *tb2;
+ kuid_t uid = sk_uid(sk);
+ bool conflict = false;
+ bool reuseport_cb_ok;
+
+ rcu_read_lock();
+ reuseport_cb = rcu_dereference(sk->sk_reuseport_cb);
+ /* paired with WRITE_ONCE() in __reuseport_(add|detach)_closed_sock */
+ reuseport_cb_ok = !reuseport_cb || READ_ONCE(reuseport_cb->num_closed_socks);
+ rcu_read_unlock();
+
+ head2 = inet_bhash2_addr_any_hashbucket(sk, net, port);
+
+ spin_lock(&head2->lock);
+
+ inet_bind_bucket_for_each(tb2, &head2->chain) {
+ if (!inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk))
+ continue;
+
+ if (!inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok))
+ continue;
+
+ conflict = true;
+ break;
+ }
+
+ spin_unlock(&head2->lock);
+
+ return conflict;
}
/*
* Find an open port number for the socket. Returns with the
- * inet_bind_hashbucket lock held.
+ * inet_bind_hashbucket locks held if successful.
*/
static struct inet_bind_hashbucket *
-inet_csk_find_open_port(struct sock *sk, struct inet_bind_bucket **tb_ret, int *port_ret)
+inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret,
+ struct inet_bind2_bucket **tb2_ret,
+ struct inet_bind_hashbucket **head2_ret, int *port_ret)
{
- struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
- int port = 0;
- struct inet_bind_hashbucket *head;
+ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
+ int i, low, high, attempt_half, port, l3mdev;
+ struct inet_bind_hashbucket *head, *head2;
struct net *net = sock_net(sk);
- int i, low, high, attempt_half;
+ struct inet_bind2_bucket *tb2;
struct inet_bind_bucket *tb;
u32 remaining, offset;
+ bool relax = false;
+ l3mdev = inet_sk_bound_l3mdev(sk);
+ports_exhausted:
attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
- inet_get_local_port_range(net, &low, &high);
+ inet_sk_get_local_port_range(sk, &low, &high);
high++; /* [32768, 60999] -> [32768, 61000[ */
if (high - low < 4)
attempt_half = 0;
@@ -202,7 +359,7 @@ other_half_scan:
if (likely(remaining > 1))
remaining &= ~1U;
- offset = prandom_u32() % remaining;
+ offset = get_random_u32_below(remaining);
/* __inet_hash_connect() favors ports having @low parity
* We do the opposite to not pollute connect() users.
*/
@@ -218,10 +375,20 @@ other_parity_scan:
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
spin_lock_bh(&head->lock);
+ if (inet_use_bhash2_on_bind(sk)) {
+ if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, relax, false))
+ goto next_port;
+ }
+
+ head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
+ spin_lock(&head2->lock);
+ tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == port) {
- if (!inet_csk_bind_conflict(sk, tb, false, false))
+ if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
+ if (!inet_csk_bind_conflict(sk, tb, tb2,
+ relax, false))
goto success;
+ spin_unlock(&head2->lock);
goto next_port;
}
tb = NULL;
@@ -240,25 +407,31 @@ next_port:
attempt_half = 2;
goto other_half_scan;
}
+
+ if (READ_ONCE(net->ipv4.sysctl_ip_autobind_reuse) && !relax) {
+ /* We still have a chance to connect to different destinations */
+ relax = true;
+ goto ports_exhausted;
+ }
return NULL;
success:
*port_ret = port;
*tb_ret = tb;
+ *tb2_ret = tb2;
+ *head2_ret = head2;
return head;
}
static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
- struct sock *sk)
+ const struct sock *sk)
{
- kuid_t uid = sock_i_uid(sk);
-
if (tb->fastreuseport <= 0)
return 0;
if (!sk->sk_reuseport)
return 0;
if (rcu_access_pointer(sk->sk_reuseport_cb))
return 0;
- if (!uid_eq(tb->fastuid, uid))
+ if (!uid_eq(tb->fastuid, sk_uid(sk)))
return 0;
/* We only need to check the rcv_saddr if this tb was once marked
* without fastreuseport and then was reset, as we can only know that
@@ -274,62 +447,23 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
tb->fast_rcv_saddr,
sk->sk_rcv_saddr,
tb->fast_ipv6_only,
- ipv6_only_sock(sk), true);
+ ipv6_only_sock(sk), true, false);
#endif
return ipv4_rcv_saddr_equal(tb->fast_rcv_saddr, sk->sk_rcv_saddr,
- ipv6_only_sock(sk), true);
+ ipv6_only_sock(sk), true, false);
}
-/* Obtain a reference to a local port for the given sock,
- * if snum is zero it means select any available local port.
- * We try to allocate an odd port (and leave even ports for connect())
- */
-int inet_csk_get_port(struct sock *sk, unsigned short snum)
+void inet_csk_update_fastreuse(const struct sock *sk,
+ struct inet_bind_bucket *tb,
+ struct inet_bind2_bucket *tb2)
{
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
- struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
- int ret = 1, port = snum;
- struct inet_bind_hashbucket *head;
- struct net *net = sock_net(sk);
- struct inet_bind_bucket *tb = NULL;
- kuid_t uid = sock_i_uid(sk);
- if (!port) {
- head = inet_csk_find_open_port(sk, &tb, &port);
- if (!head)
- return ret;
- if (!tb)
- goto tb_not_found;
- goto success;
- }
- head = &hinfo->bhash[inet_bhashfn(net, port,
- hinfo->bhash_size)];
- spin_lock_bh(&head->lock);
- inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == port)
- goto tb_found;
-tb_not_found:
- tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
- net, head, port);
- if (!tb)
- goto fail_unlock;
-tb_found:
- if (!hlist_empty(&tb->owners)) {
- if (sk->sk_reuse == SK_FORCE_REUSE)
- goto success;
-
- if ((tb->fastreuse > 0 && reuse) ||
- sk_reuseport_match(tb, sk))
- goto success;
- if (inet_csk_bind_conflict(sk, tb, true, true))
- goto fail_unlock;
- }
-success:
- if (hlist_empty(&tb->owners)) {
+ if (hlist_empty(&tb->bhash2)) {
tb->fastreuse = reuse;
if (sk->sk_reuseport) {
tb->fastreuseport = FASTREUSEPORT_ANY;
- tb->fastuid = uid;
+ tb->fastuid = sk_uid(sk);
tb->fast_rcv_saddr = sk->sk_rcv_saddr;
tb->fast_ipv6_only = ipv6_only_sock(sk);
tb->fast_sk_family = sk->sk_family;
@@ -356,7 +490,7 @@ success:
*/
if (!sk_reuseport_match(tb, sk)) {
tb->fastreuseport = FASTREUSEPORT_STRICT;
- tb->fastuid = uid;
+ tb->fastuid = sk_uid(sk);
tb->fast_rcv_saddr = sk->sk_rcv_saddr;
tb->fast_ipv6_only = ipv6_only_sock(sk);
tb->fast_sk_family = sk->sk_family;
@@ -368,12 +502,107 @@ success:
tb->fastreuseport = 0;
}
}
+
+ tb2->fastreuse = tb->fastreuse;
+ tb2->fastreuseport = tb->fastreuseport;
+}
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
+ */
+int inet_csk_get_port(struct sock *sk, unsigned short snum)
+{
+ bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+ bool found_port = false, check_bind_conflict = true;
+ bool bhash_created = false, bhash2_created = false;
+ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
+ int ret = -EADDRINUSE, port = snum, l3mdev;
+ struct inet_bind_hashbucket *head, *head2;
+ struct inet_bind2_bucket *tb2 = NULL;
+ struct inet_bind_bucket *tb = NULL;
+ bool head2_lock_acquired = false;
+ struct net *net = sock_net(sk);
+
+ l3mdev = inet_sk_bound_l3mdev(sk);
+
+ if (!port) {
+ head = inet_csk_find_open_port(sk, &tb, &tb2, &head2, &port);
+ if (!head)
+ return ret;
+
+ head2_lock_acquired = true;
+
+ if (tb && tb2)
+ goto success;
+ found_port = true;
+ } else {
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock_bh(&head->lock);
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (inet_bind_bucket_match(tb, net, port, l3mdev))
+ break;
+ }
+
+ if (!tb) {
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, net,
+ head, port, l3mdev);
+ if (!tb)
+ goto fail_unlock;
+ bhash_created = true;
+ }
+
+ if (!found_port) {
+ if (!hlist_empty(&tb->bhash2)) {
+ if (sk->sk_reuse == SK_FORCE_REUSE ||
+ (tb->fastreuse > 0 && reuse) ||
+ sk_reuseport_match(tb, sk))
+ check_bind_conflict = false;
+ }
+
+ if (check_bind_conflict && inet_use_bhash2_on_bind(sk)) {
+ if (inet_bhash2_addr_any_conflict(sk, port, l3mdev, true, true))
+ goto fail_unlock;
+ }
+
+ head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
+ spin_lock(&head2->lock);
+ head2_lock_acquired = true;
+ tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
+ }
+
+ if (!tb2) {
+ tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep,
+ net, head2, tb, sk);
+ if (!tb2)
+ goto fail_unlock;
+ bhash2_created = true;
+ }
+
+ if (!found_port && check_bind_conflict) {
+ if (inet_csk_bind_conflict(sk, tb, tb2, true, true))
+ goto fail_unlock;
+ }
+
+success:
+ inet_csk_update_fastreuse(sk, tb, tb2);
+
if (!inet_csk(sk)->icsk_bind_hash)
- inet_bind_hash(sk, tb, port);
+ inet_bind_hash(sk, tb, tb2, port);
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
+ WARN_ON(inet_csk(sk)->icsk_bind2_hash != tb2);
ret = 0;
fail_unlock:
+ if (ret) {
+ if (bhash2_created)
+ inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2);
+ if (bhash_created)
+ inet_bind_bucket_destroy(tb);
+ }
+ if (head2_lock_acquired)
+ spin_unlock(&head2->lock);
spin_unlock_bh(&head->lock);
return ret;
}
@@ -431,7 +660,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
/*
* This will accept the next outstanding connection.
*/
-struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
+struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
@@ -450,7 +679,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
/* Find already established connection */
if (reqsk_queue_empty(queue)) {
- long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ long timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
/* If this is a non blocking socket don't sleep */
error = -EAGAIN;
@@ -462,6 +691,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
goto out_err;
}
req = reqsk_queue_remove(queue, sk);
+ arg->is_empty = reqsk_queue_empty(queue);
newsk = req->sk;
if (sk->sk_protocol == IPPROTO_TCP &&
@@ -479,16 +709,19 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
}
spin_unlock_bh(&queue->fastopenq.lock);
}
-out:
+
release_sock(sk);
+
if (req)
reqsk_put(req);
+
+ inet_init_csk_locks(newsk);
return newsk;
+
out_err:
- newsk = NULL;
- req = NULL;
- *err = error;
- goto out;
+ release_sock(sk);
+ arg->err = error;
+ return NULL;
}
EXPORT_SYMBOL(inet_csk_accept);
@@ -504,36 +737,38 @@ void inet_csk_init_xmit_timers(struct sock *sk,
{
struct inet_connection_sock *icsk = inet_csk(sk);
- timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0);
+ timer_setup(&sk->tcp_retransmit_timer, retransmit_handler, 0);
timer_setup(&icsk->icsk_delack_timer, delack_handler, 0);
- timer_setup(&sk->sk_timer, keepalive_handler, 0);
+ timer_setup(&icsk->icsk_keepalive_timer, keepalive_handler, 0);
icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
-EXPORT_SYMBOL(inet_csk_init_xmit_timers);
void inet_csk_clear_xmit_timers(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;
+ smp_store_release(&icsk->icsk_pending, 0);
+ smp_store_release(&icsk->icsk_ack.pending, 0);
- sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
+ sk_stop_timer(sk, &sk->tcp_retransmit_timer);
sk_stop_timer(sk, &icsk->icsk_delack_timer);
- sk_stop_timer(sk, &sk->sk_timer);
+ sk_stop_timer(sk, &icsk->icsk_keepalive_timer);
}
-EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
-void inet_csk_delete_keepalive_timer(struct sock *sk)
+void inet_csk_clear_xmit_timers_sync(struct sock *sk)
{
- sk_stop_timer(sk, &sk->sk_timer);
-}
-EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
+ struct inet_connection_sock *icsk = inet_csk(sk);
-void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
-{
- sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+ /* ongoing timer handlers need to acquire socket lock. */
+ sock_not_owned_by_me(sk);
+
+ smp_store_release(&icsk->icsk_pending, 0);
+ smp_store_release(&icsk->icsk_ack.pending, 0);
+
+ sk_stop_timer_sync(sk, &sk->tcp_retransmit_timer);
+ sk_stop_timer_sync(sk, &icsk->icsk_delack_timer);
+ sk_stop_timer_sync(sk, &icsk->icsk_keepalive_timer);
}
-EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);
struct dst_entry *inet_csk_route_req(const struct sock *sk,
struct flowi4 *fl4,
@@ -548,12 +783,12 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk,
opt = rcu_dereference(ireq->ireq_opt);
flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
- RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+ ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
- htons(ireq->ir_num), sk->sk_uid);
- security_req_classify_flow(req, flowi4_to_flowi(fl4));
+ htons(ireq->ir_num), sk_uid(sk));
+ security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
@@ -569,7 +804,6 @@ no_route:
__IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
return NULL;
}
-EXPORT_SYMBOL_GPL(inet_csk_route_req);
struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
struct sock *newsk,
@@ -586,12 +820,12 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
fl4 = &newinet->cork.fl.u.ip4;
flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
- RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+ ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
sk->sk_protocol, inet_sk_flowi_flags(sk),
(opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr,
ireq->ir_loc_addr, ireq->ir_rmt_port,
- htons(ireq->ir_num), sk->sk_uid);
- security_req_classify_flow(req, flowi4_to_flowi(fl4));
+ htons(ireq->ir_num), sk_uid(sk));
+ security_req_classify_flow(req, flowi4_to_flowi_common(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
@@ -607,27 +841,20 @@ no_route:
}
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
-#if IS_ENABLED(CONFIG_IPV6)
-#define AF_INET_FAMILY(fam) ((fam) == AF_INET)
-#else
-#define AF_INET_FAMILY(fam) true
-#endif
-
/* Decide when to expire the request and when to resend SYN-ACK */
-static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
- const int max_retries,
- const u8 rskq_defer_accept,
- int *expire, int *resend)
+static void syn_ack_recalc(struct request_sock *req,
+ const int max_syn_ack_retries,
+ const u8 rskq_defer_accept,
+ int *expire, int *resend)
{
if (!rskq_defer_accept) {
- *expire = req->num_timeout >= thresh;
+ *expire = req->num_timeout >= max_syn_ack_retries;
*resend = 1;
return;
}
- *expire = req->num_timeout >= thresh &&
- (!inet_rsk(req)->acked || req->num_timeout >= max_retries);
- /*
- * Do not resend while waiting for data after ACK,
+ *expire = req->num_timeout >= max_syn_ack_retries &&
+ (!inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept);
+ /* Do not resend while waiting for data after ACK,
* start to resend on end of deferring period to give
* last chance for data or ACK to create established socket.
*/
@@ -635,67 +862,209 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
req->num_timeout >= rskq_defer_accept - 1;
}
-int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
+static struct request_sock *
+reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener,
+ bool attach_listener)
{
- int err = req->rsk_ops->rtx_syn_ack(parent, req);
+ struct request_sock *req;
- if (!err)
- req->num_retrans++;
- return err;
+ req = kmem_cache_alloc_noprof(ops->slab, GFP_ATOMIC | __GFP_NOWARN);
+ if (!req)
+ return NULL;
+ req->rsk_listener = NULL;
+ if (attach_listener) {
+ if (unlikely(!refcount_inc_not_zero(&sk_listener->sk_refcnt))) {
+ kmem_cache_free(ops->slab, req);
+ return NULL;
+ }
+ req->rsk_listener = sk_listener;
+ }
+ req->rsk_ops = ops;
+ req_to_sk(req)->sk_prot = sk_listener->sk_prot;
+ sk_node_init(&req_to_sk(req)->sk_node);
+ sk_tx_queue_clear(req_to_sk(req));
+ req->saved_syn = NULL;
+ req->syncookie = 0;
+ req->num_timeout = 0;
+ req->num_retrans = 0;
+ req->sk = NULL;
+ refcount_set(&req->rsk_refcnt, 0);
+
+ return req;
+}
+#define reqsk_alloc(...) alloc_hooks(reqsk_alloc_noprof(__VA_ARGS__))
+
+struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
+ struct sock *sk_listener,
+ bool attach_listener)
+{
+ struct request_sock *req = reqsk_alloc(ops, sk_listener,
+ attach_listener);
+
+ if (req) {
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ ireq->ireq_opt = NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+ ireq->pktopts = NULL;
+#endif
+ atomic64_set(&ireq->ir_cookie, 0);
+ ireq->ireq_state = TCP_NEW_SYN_RECV;
+ write_pnet(&ireq->ireq_net, sock_net(sk_listener));
+ ireq->ireq_family = sk_listener->sk_family;
+ }
+
+ return req;
+}
+EXPORT_SYMBOL(inet_reqsk_alloc);
+
+static struct request_sock *inet_reqsk_clone(struct request_sock *req,
+ struct sock *sk)
+{
+ struct sock *req_sk, *nreq_sk;
+ struct request_sock *nreq;
+
+ nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
+ if (!nreq) {
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
+
+ /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */
+ sock_put(sk);
+ return NULL;
+ }
+
+ req_sk = req_to_sk(req);
+ nreq_sk = req_to_sk(nreq);
+
+ memcpy(nreq_sk, req_sk,
+ offsetof(struct sock, sk_dontcopy_begin));
+ unsafe_memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,
+ req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end),
+ /* alloc is larger than struct, see above */);
+
+ sk_node_init(&nreq_sk->sk_node);
+ nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
+ nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;
+#endif
+ nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;
+
+ nreq->rsk_listener = sk;
+
+ /* We need not acquire fastopenq->lock
+ * because the child socket is locked in inet_csk_listen_stop().
+ */
+ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener)
+ rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);
+
+ return nreq;
+}
+
+static void reqsk_queue_migrated(struct request_sock_queue *queue,
+ const struct request_sock *req)
+{
+ if (req->num_timeout == 0)
+ atomic_inc(&queue->young);
+ atomic_inc(&queue->qlen);
+}
+
+static void reqsk_migrate_reset(struct request_sock *req)
+{
+ req->saved_syn = NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+ inet_rsk(req)->ipv6_opt = NULL;
+ inet_rsk(req)->pktopts = NULL;
+#else
+ inet_rsk(req)->ireq_opt = NULL;
+#endif
}
-EXPORT_SYMBOL(inet_rtx_syn_ack);
/* return true if req was found in the ehash table */
-static bool reqsk_queue_unlink(struct request_sock_queue *queue,
- struct request_sock *req)
+static bool reqsk_queue_unlink(struct request_sock *req)
{
- struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
+ struct sock *sk = req_to_sk(req);
bool found = false;
- if (sk_hashed(req_to_sk(req))) {
- spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
+ if (sk_hashed(sk)) {
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
+ spinlock_t *lock;
+ lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
spin_lock(lock);
- found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
+ found = __sk_nulls_del_node_init_rcu(sk);
spin_unlock(lock);
}
- if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
- reqsk_put(req);
+
return found;
}
-void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
+static bool __inet_csk_reqsk_queue_drop(struct sock *sk,
+ struct request_sock *req,
+ bool from_timer)
{
- if (reqsk_queue_unlink(&inet_csk(sk)->icsk_accept_queue, req)) {
+ bool unlinked = reqsk_queue_unlink(req);
+
+ if (!from_timer && timer_delete_sync(&req->rsk_timer))
+ reqsk_put(req);
+
+ if (unlinked) {
reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
reqsk_put(req);
}
+
+ return unlinked;
+}
+
+bool inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
+{
+ return __inet_csk_reqsk_queue_drop(sk, req, false);
}
-EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
{
inet_csk_reqsk_queue_drop(sk, req);
reqsk_put(req);
}
-EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
+EXPORT_IPV6_MOD(inet_csk_reqsk_queue_drop_and_put);
static void reqsk_timer_handler(struct timer_list *t)
{
- struct request_sock *req = from_timer(req, t, rsk_timer);
+ struct request_sock *req = timer_container_of(req, t, rsk_timer);
+ struct request_sock *nreq = NULL, *oreq = req;
struct sock *sk_listener = req->rsk_listener;
- struct net *net = sock_net(sk_listener);
- struct inet_connection_sock *icsk = inet_csk(sk_listener);
- struct request_sock_queue *queue = &icsk->icsk_accept_queue;
- int qlen, expire = 0, resend = 0;
- int max_retries, thresh;
- u8 defer_accept;
+ struct inet_connection_sock *icsk;
+ struct request_sock_queue *queue;
+ struct net *net;
+ int max_syn_ack_retries, qlen, expire = 0, resend = 0;
+
+ if (inet_sk_state_load(sk_listener) != TCP_LISTEN) {
+ struct sock *nsk;
+
+ nsk = reuseport_migrate_sock(sk_listener, req_to_sk(req), NULL);
+ if (!nsk)
+ goto drop;
+
+ nreq = inet_reqsk_clone(req, nsk);
+ if (!nreq)
+ goto drop;
+
+ /* The new timer for the cloned req can drop the 2 base
+ * references by calling inet_csk_reqsk_queue_drop_and_put(),
+ * so hold one extra reference to prevent use-after-free and
+ * release it with reqsk_put() just before returning.
+ */
+ refcount_set(&nreq->rsk_refcnt, 2 + 1);
+ timer_setup(&nreq->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
+ reqsk_queue_migrated(&inet_csk(nsk)->icsk_accept_queue, req);
- if (inet_sk_state_load(sk_listener) != TCP_LISTEN)
- goto drop;
+ req = nreq;
+ sk_listener = nsk;
+ }
- max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
- thresh = max_retries;
+ icsk = inet_csk(sk_listener);
+ net = sock_net(sk_listener);
+ max_syn_ack_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
+ READ_ONCE(net->ipv4.sysctl_tcp_synack_retries);
/* Normally all the openreqs are young and become mature
* (i.e. converted to established socket) for first timeout.
* If synack was not acknowledged for 1 second, it means
@@ -713,64 +1082,105 @@ static void reqsk_timer_handler(struct timer_list *t)
* embrions; and abort old ones without pity, if old
* ones are about to clog our table.
*/
+ queue = &icsk->icsk_accept_queue;
qlen = reqsk_queue_len(queue);
- if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) {
+ if ((qlen << 1) > max(8U, READ_ONCE(sk_listener->sk_max_ack_backlog))) {
int young = reqsk_queue_len_young(queue) << 1;
- while (thresh > 2) {
+ while (max_syn_ack_retries > 2) {
if (qlen < young)
break;
- thresh--;
+ max_syn_ack_retries--;
young <<= 1;
}
}
- defer_accept = READ_ONCE(queue->rskq_defer_accept);
- if (defer_accept)
- max_retries = defer_accept;
- syn_ack_recalc(req, thresh, max_retries, defer_accept,
+
+ syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept),
&expire, &resend);
- req->rsk_ops->syn_ack_timeout(req);
+ tcp_syn_ack_timeout(req);
+
if (!expire &&
(!resend ||
- !inet_rtx_syn_ack(sk_listener, req) ||
+ !tcp_rtx_synack(sk_listener, req) ||
inet_rsk(req)->acked)) {
- unsigned long timeo;
-
if (req->num_timeout++ == 0)
atomic_dec(&queue->young);
- timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
- mod_timer(&req->rsk_timer, jiffies + timeo);
+ mod_timer(&req->rsk_timer, jiffies + tcp_reqsk_timeout(req));
+
+ if (!nreq)
+ return;
+
+ if (!inet_ehash_insert(req_to_sk(nreq), req_to_sk(oreq), NULL)) {
+ /* delete timer */
+ __inet_csk_reqsk_queue_drop(sk_listener, nreq, true);
+ goto no_ownership;
+ }
+
+ __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQSUCCESS);
+ reqsk_migrate_reset(oreq);
+ reqsk_queue_removed(&inet_csk(oreq->rsk_listener)->icsk_accept_queue, oreq);
+ reqsk_put(oreq);
+
+ reqsk_put(nreq);
return;
}
+
+ /* Even if we can clone the req, we may not need to retransmit any more
+ * SYN+ACKs (nreq->num_timeout > max_syn_ack_retries, etc.), or another
+ * CPU may win the "own_req" race so that inet_ehash_insert() fails.
+ */
+ if (nreq) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMIGRATEREQFAILURE);
+no_ownership:
+ reqsk_migrate_reset(nreq);
+ reqsk_queue_removed(queue, nreq);
+ __reqsk_free(nreq);
+ }
+
drop:
- inet_csk_reqsk_queue_drop_and_put(sk_listener, req);
+ __inet_csk_reqsk_queue_drop(sk_listener, oreq, true);
+ reqsk_put(oreq);
}
-static void reqsk_queue_hash_req(struct request_sock *req,
- unsigned long timeout)
+static bool reqsk_queue_hash_req(struct request_sock *req)
{
- req->num_retrans = 0;
- req->num_timeout = 0;
- req->sk = NULL;
+ bool found_dup_sk = false;
+ if (!inet_ehash_insert(req_to_sk(req), NULL, &found_dup_sk))
+ return false;
+
+ /* The timer needs to be set up after a successful insertion. */
+ req->timeout = tcp_timeout_init((struct sock *)req);
timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED);
- mod_timer(&req->rsk_timer, jiffies + timeout);
+ mod_timer(&req->rsk_timer, jiffies + req->timeout);
- inet_ehash_insert(req_to_sk(req), NULL);
/* before letting lookups find us, make sure all req fields
* are committed to memory and refcnt initialized.
*/
smp_wmb();
refcount_set(&req->rsk_refcnt, 2 + 1);
+ return true;
}
-void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
- unsigned long timeout)
+bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req)
{
- reqsk_queue_hash_req(req, timeout);
+ if (!reqsk_queue_hash_req(req))
+ return false;
+
inet_csk_reqsk_queue_added(sk);
+ return true;
+}
+
+static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk,
+ const gfp_t priority)
+{
+ struct inet_connection_sock *icsk = inet_csk(newsk);
+
+ if (!icsk->icsk_ulp_ops)
+ return;
+
+ icsk->icsk_ulp_ops->clone(req, newsk, priority);
}
-EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
/**
* inet_csk_clone_lock - clone an inet socket, and lock its clone
@@ -785,38 +1195,61 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
const gfp_t priority)
{
struct sock *newsk = sk_clone_lock(sk, priority);
+ struct inet_connection_sock *newicsk;
+ struct inet_request_sock *ireq;
+ struct inet_sock *newinet;
- if (newsk) {
- struct inet_connection_sock *newicsk = inet_csk(newsk);
+ if (!newsk)
+ return NULL;
- inet_sk_set_state(newsk, TCP_SYN_RECV);
- newicsk->icsk_bind_hash = NULL;
+ newicsk = inet_csk(newsk);
+ newinet = inet_sk(newsk);
+ ireq = inet_rsk(req);
- inet_sk(newsk)->inet_dport = inet_rsk(req)->ir_rmt_port;
- inet_sk(newsk)->inet_num = inet_rsk(req)->ir_num;
- inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
+ newicsk->icsk_bind_hash = NULL;
+ newicsk->icsk_bind2_hash = NULL;
- /* listeners have SOCK_RCU_FREE, not the children */
- sock_reset_flag(newsk, SOCK_RCU_FREE);
+ newinet->inet_dport = ireq->ir_rmt_port;
+ newinet->inet_num = ireq->ir_num;
+ newinet->inet_sport = htons(ireq->ir_num);
- inet_sk(newsk)->mc_list = NULL;
+ newsk->sk_bound_dev_if = ireq->ir_iif;
- newsk->sk_mark = inet_rsk(req)->ir_mark;
- atomic64_set(&newsk->sk_cookie,
- atomic64_read(&inet_rsk(req)->ir_cookie));
+ newsk->sk_daddr = ireq->ir_rmt_addr;
+ newsk->sk_rcv_saddr = ireq->ir_loc_addr;
+ newinet->inet_saddr = ireq->ir_loc_addr;
- newicsk->icsk_retransmits = 0;
- newicsk->icsk_backoff = 0;
- newicsk->icsk_probes_out = 0;
+#if IS_ENABLED(CONFIG_IPV6)
+ newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr;
+ newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
+#endif
- /* Deinitialize accept_queue to trap illegal accesses. */
- memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
+ /* listeners have SOCK_RCU_FREE, not the children */
+ sock_reset_flag(newsk, SOCK_RCU_FREE);
+
+ inet_sk(newsk)->mc_list = NULL;
+
+ newsk->sk_mark = inet_rsk(req)->ir_mark;
+ atomic64_set(&newsk->sk_cookie,
+ atomic64_read(&inet_rsk(req)->ir_cookie));
+
+ newicsk->icsk_retransmits = 0;
+ newicsk->icsk_backoff = 0;
+ newicsk->icsk_probes_out = 0;
+ newicsk->icsk_probes_tstamp = 0;
+
+ /* Deinitialize accept_queue to trap illegal accesses. */
+ memset(&newicsk->icsk_accept_queue, 0,
+ sizeof(newicsk->icsk_accept_queue));
+
+ inet_sk_set_state(newsk, TCP_SYN_RECV);
+
+ inet_clone_ulp(req, newsk, priority);
+
+ security_inet_csk_clone(newsk, req);
- security_inet_csk_clone(newsk, req);
- }
return newsk;
}
-EXPORT_SYMBOL_GPL(inet_csk_clone_lock);
/*
* At this point, there should be no process reference to this
@@ -841,16 +1274,21 @@ void inet_csk_destroy_sock(struct sock *sk)
xfrm_sk_free_policy(sk);
- sk_refcnt_debug_release(sk);
-
- percpu_counter_dec(sk->sk_prot->orphan_count);
+ tcp_orphan_count_dec();
sock_put(sk);
}
EXPORT_SYMBOL(inet_csk_destroy_sock);
+void inet_csk_prepare_for_destroy_sock(struct sock *sk)
+{
+ /* The below has to be done to allow calling inet_csk_destroy_sock */
+ sock_set_flag(sk, SOCK_DEAD);
+ tcp_orphan_count_inc();
+}
+
/* This function allows to force a closure of a socket after the call to
- * tcp/dccp_create_openreq_child().
+ * tcp_create_openreq_child().
*/
void inet_csk_prepare_forced_close(struct sock *sk)
__releases(&sk->sk_lock.slock)
@@ -858,23 +1296,33 @@ void inet_csk_prepare_forced_close(struct sock *sk)
/* sk_clone_lock locked the socket and set refcnt to 2 */
bh_unlock_sock(sk);
sock_put(sk);
-
- /* The below has to be done to allow calling inet_csk_destroy_sock */
- sock_set_flag(sk, SOCK_DEAD);
- percpu_counter_inc(sk->sk_prot->orphan_count);
+ inet_csk_prepare_for_destroy_sock(sk);
inet_sk(sk)->inet_num = 0;
}
EXPORT_SYMBOL(inet_csk_prepare_forced_close);
-int inet_csk_listen_start(struct sock *sk, int backlog)
+static int inet_ulp_can_listen(const struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ulp_ops && !icsk->icsk_ulp_ops->clone)
+ return -EINVAL;
+
+ return 0;
+}
+
+int inet_csk_listen_start(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet = inet_sk(sk);
- int err = -EADDRINUSE;
+ int err;
+
+ err = inet_ulp_can_listen(sk);
+ if (unlikely(err))
+ return err;
reqsk_queue_alloc(&icsk->icsk_accept_queue);
- sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
@@ -884,7 +1332,8 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
* after validation is complete.
*/
inet_sk_state_store(sk, TCP_LISTEN);
- if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
+ err = sk->sk_prot->get_port(sk, inet->inet_num);
+ if (!err) {
inet->inet_sport = htons(inet->inet_num);
sk_dst_reset(sk);
@@ -897,7 +1346,6 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
inet_sk_set_state(sk, TCP_CLOSE);
return err;
}
-EXPORT_SYMBOL_GPL(inet_csk_listen_start);
static void inet_child_forget(struct sock *sk, struct request_sock *req,
struct sock *child)
@@ -906,10 +1354,10 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
sock_orphan(child);
- percpu_counter_inc(sk->sk_prot->orphan_count);
+ tcp_orphan_count_inc();
if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
- BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+ BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
BUG_ON(sk != req->rsk_listener);
/* Paranoid, to prevent race condition if
@@ -918,7 +1366,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
* Also to satisfy an assertion in
* tcp_v4_destroy_sock().
*/
- tcp_sk(child)->fastopen_rsk = NULL;
+ RCU_INIT_POINTER(tcp_sk(child)->fastopen_rsk, NULL);
}
inet_csk_destroy_sock(child);
}
@@ -937,7 +1385,7 @@ struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
req->sk = child;
req->dl_next = NULL;
if (queue->rskq_accept_head == NULL)
- queue->rskq_accept_head = req;
+ WRITE_ONCE(queue->rskq_accept_head, req);
else
queue->rskq_accept_tail->dl_next = req;
queue->rskq_accept_tail = req;
@@ -952,17 +1400,46 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
struct request_sock *req, bool own_req)
{
if (own_req) {
- inet_csk_reqsk_queue_drop(sk, req);
- reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
- if (inet_csk_reqsk_queue_add(sk, req, child))
+ inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
+
+ if (sk != req->rsk_listener) {
+ /* another listening sk has been selected,
+ * migrate the req to it.
+ */
+ struct request_sock *nreq;
+
+ /* hold a refcnt for the nreq->rsk_listener
+ * which is assigned in inet_reqsk_clone()
+ */
+ sock_hold(sk);
+ nreq = inet_reqsk_clone(req, sk);
+ if (!nreq) {
+ inet_child_forget(sk, req, child);
+ goto child_put;
+ }
+
+ refcount_set(&nreq->rsk_refcnt, 1);
+ if (inet_csk_reqsk_queue_add(sk, nreq, child)) {
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQSUCCESS);
+ reqsk_migrate_reset(req);
+ reqsk_put(req);
+ return child;
+ }
+
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
+ reqsk_migrate_reset(nreq);
+ __reqsk_free(nreq);
+ } else if (inet_csk_reqsk_queue_add(sk, req, child)) {
return child;
+ }
}
/* Too bad, another child took ownership of the request, undo. */
+child_put:
bh_unlock_sock(child);
sock_put(child);
return NULL;
}
-EXPORT_SYMBOL(inet_csk_complete_hashdance);
/*
* This routine closes sockets which have been at least partially
@@ -983,14 +1460,40 @@ void inet_csk_listen_stop(struct sock *sk)
* of the variants now. --ANK
*/
while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
- struct sock *child = req->sk;
+ struct sock *child = req->sk, *nsk;
+ struct request_sock *nreq;
local_bh_disable();
bh_lock_sock(child);
WARN_ON(sock_owned_by_user(child));
sock_hold(child);
+ nsk = reuseport_migrate_sock(sk, child, NULL);
+ if (nsk) {
+ nreq = inet_reqsk_clone(req, nsk);
+ if (nreq) {
+ refcount_set(&nreq->rsk_refcnt, 1);
+
+ if (inet_csk_reqsk_queue_add(nsk, nreq, child)) {
+ __NET_INC_STATS(sock_net(nsk),
+ LINUX_MIB_TCPMIGRATEREQSUCCESS);
+ reqsk_migrate_reset(req);
+ } else {
+ __NET_INC_STATS(sock_net(nsk),
+ LINUX_MIB_TCPMIGRATEREQFAILURE);
+ reqsk_migrate_reset(nreq);
+ __reqsk_free(nreq);
+ }
+
+ /* inet_csk_reqsk_queue_add() has already
+ * called inet_child_forget() in the failure case.
+ */
+ goto skip_child_forget;
+ }
+ }
+
inet_child_forget(sk, req, child);
+skip_child_forget:
reqsk_put(req);
bh_unlock_sock(child);
local_bh_enable();
@@ -1014,62 +1517,16 @@ void inet_csk_listen_stop(struct sock *sk)
}
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
-void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
-{
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
- const struct inet_sock *inet = inet_sk(sk);
-
- sin->sin_family = AF_INET;
- sin->sin_addr.s_addr = inet->inet_daddr;
- sin->sin_port = inet->inet_dport;
-}
-EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
-
-#ifdef CONFIG_COMPAT
-int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
-
- if (icsk->icsk_af_ops->compat_getsockopt)
- return icsk->icsk_af_ops->compat_getsockopt(sk, level, optname,
- optval, optlen);
- return icsk->icsk_af_ops->getsockopt(sk, level, optname,
- optval, optlen);
-}
-EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt);
-
-int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
-
- if (icsk->icsk_af_ops->compat_setsockopt)
- return icsk->icsk_af_ops->compat_setsockopt(sk, level, optname,
- optval, optlen);
- return icsk->icsk_af_ops->setsockopt(sk, level, optname,
- optval, optlen);
-}
-EXPORT_SYMBOL_GPL(inet_csk_compat_setsockopt);
-#endif
-
static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
{
const struct inet_sock *inet = inet_sk(sk);
- const struct ip_options_rcu *inet_opt;
- __be32 daddr = inet->inet_daddr;
struct flowi4 *fl4;
struct rtable *rt;
rcu_read_lock();
- inet_opt = rcu_dereference(inet->inet_opt);
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
fl4 = &fl->u.ip4;
- rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
- inet->inet_saddr, inet->inet_dport,
- inet->inet_sport, sk->sk_protocol,
- RT_CONN_FLAGS(sk), sk->sk_bound_dev_if);
+ inet_sk_init_flowi4(inet, fl4);
+ rt = ip_route_output_flow(sock_net(sk), fl4, sk);
if (IS_ERR(rt))
rt = NULL;
if (rt)
@@ -1089,7 +1546,7 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
if (!dst)
goto out;
}
- dst->ops->update_pmtu(dst, sk, NULL, mtu);
+ dst->ops->update_pmtu(dst, sk, NULL, mtu, true);
dst = __sk_dst_check(sk, 0);
if (!dst)
@@ -1097,4 +1554,3 @@ struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu)
out:
return dst;
}
-EXPORT_SYMBOL_GPL(inet_csk_update_pmtu);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 4e5bc4b2f14e..3f5b1418a610 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* inet_diag.c Module for monitoring INET transport protocols sockets.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -24,9 +20,7 @@
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_connection_sock.h>
-#include <net/inet_hashtables.h>
-#include <net/inet_timewait_sock.h>
-#include <net/inet6_hashtables.h>
+#include <net/bpf_sk_storage.h>
#include <net/netlink.h>
#include <linux/inet.h>
@@ -35,7 +29,7 @@
#include <linux/inet_diag.h>
#include <linux/sock_diag.h>
-static const struct inet_diag_handler **inet_diag_table;
+static const struct inet_diag_handler __rcu **inet_diag_table;
struct inet_diag_entry {
const __be32 *saddr;
@@ -46,83 +40,67 @@ struct inet_diag_entry {
u16 userlocks;
u32 ifindex;
u32 mark;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ u64 cgroup_id;
+#endif
};
-static DEFINE_MUTEX(inet_diag_table_mutex);
-
static const struct inet_diag_handler *inet_diag_lock_handler(int proto)
{
- if (!inet_diag_table[proto])
+ const struct inet_diag_handler *handler;
+
+ if (proto < 0 || proto >= IPPROTO_MAX)
+ return NULL;
+
+ if (!READ_ONCE(inet_diag_table[proto]))
sock_load_diag_module(AF_INET, proto);
- mutex_lock(&inet_diag_table_mutex);
- if (!inet_diag_table[proto])
- return ERR_PTR(-ENOENT);
+ rcu_read_lock();
+ handler = rcu_dereference(inet_diag_table[proto]);
+ if (handler && !try_module_get(handler->owner))
+ handler = NULL;
+ rcu_read_unlock();
- return inet_diag_table[proto];
+ return handler;
}
static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
{
- mutex_unlock(&inet_diag_table_mutex);
+ module_put(handler->owner);
}
void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
{
- r->idiag_family = sk->sk_family;
+ r->idiag_family = READ_ONCE(sk->sk_family);
- r->id.idiag_sport = htons(sk->sk_num);
- r->id.idiag_dport = sk->sk_dport;
- r->id.idiag_if = sk->sk_bound_dev_if;
+ r->id.idiag_sport = htons(READ_ONCE(sk->sk_num));
+ r->id.idiag_dport = READ_ONCE(sk->sk_dport);
+ r->id.idiag_if = READ_ONCE(sk->sk_bound_dev_if);
sock_diag_save_cookie(sk, r->id.idiag_cookie);
#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6) {
- *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
- *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
+ if (r->idiag_family == AF_INET6) {
+ data_race(*(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr);
+ data_race(*(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr);
} else
#endif
{
memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
- r->id.idiag_src[0] = sk->sk_rcv_saddr;
- r->id.idiag_dst[0] = sk->sk_daddr;
+ r->id.idiag_src[0] = READ_ONCE(sk->sk_rcv_saddr);
+ r->id.idiag_dst[0] = READ_ONCE(sk->sk_daddr);
}
}
EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill);
-static size_t inet_sk_attr_size(struct sock *sk,
- const struct inet_diag_req_v2 *req,
- bool net_admin)
-{
- const struct inet_diag_handler *handler;
- size_t aux = 0;
-
- handler = inet_diag_table[req->sdiag_protocol];
- if (handler && handler->idiag_get_aux_size)
- aux = handler->idiag_get_aux_size(sk, net_admin);
-
- return nla_total_size(sizeof(struct tcp_info))
- + nla_total_size(1) /* INET_DIAG_SHUTDOWN */
- + nla_total_size(1) /* INET_DIAG_TOS */
- + nla_total_size(1) /* INET_DIAG_TCLASS */
- + nla_total_size(4) /* INET_DIAG_MARK */
- + nla_total_size(sizeof(struct inet_diag_meminfo))
- + nla_total_size(sizeof(struct inet_diag_msg))
- + nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
- + nla_total_size(TCP_CA_NAME_MAX)
- + nla_total_size(sizeof(struct tcpvegas_info))
- + aux
- + 64;
-}
-
int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
struct inet_diag_msg *r, int ext,
struct user_namespace *user_ns,
bool net_admin)
{
const struct inet_sock *inet = inet_sk(sk);
+ struct inet_diag_sockopt inet_sockopt;
if (nla_put_u8(skb, INET_DIAG_SHUTDOWN, sk->sk_shutdown))
goto errout;
@@ -131,7 +109,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
* hence this needs to be included regardless of socket family.
*/
if (ext & (1 << (INET_DIAG_TOS - 1)))
- if (nla_put_u8(skb, INET_DIAG_TOS, inet->tos) < 0)
+ if (nla_put_u8(skb, INET_DIAG_TOS, READ_ONCE(inet->tos)) < 0)
goto errout;
#if IS_ENABLED(CONFIG_IPV6)
@@ -147,38 +125,114 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
}
#endif
- if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, sk->sk_mark))
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, READ_ONCE(sk->sk_mark)))
goto errout;
- r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk));
+ if (ext & (1 << (INET_DIAG_CLASS_ID - 1)) ||
+ ext & (1 << (INET_DIAG_TCLASS - 1))) {
+ u32 classid = 0;
+
+#ifdef CONFIG_CGROUP_NET_CLASSID
+ classid = sock_cgroup_classid(&sk->sk_cgrp_data);
+#endif
+ /* Fall back to socket priority if class id isn't set.
+ * Classful qdiscs use it as direct reference to class.
+ * For cgroup2 classid is always zero.
+ */
+ if (!classid)
+ classid = READ_ONCE(sk->sk_priority);
+
+ if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid))
+ goto errout;
+ }
+
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ if (nla_put_u64_64bit(skb, INET_DIAG_CGROUP_ID,
+ cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)),
+ INET_DIAG_PAD))
+ goto errout;
+#endif
+
+ r->idiag_uid = from_kuid_munged(user_ns, sk_uid(sk));
r->idiag_inode = sock_i_ino(sk);
+ memset(&inet_sockopt, 0, sizeof(inet_sockopt));
+ inet_sockopt.recverr = inet_test_bit(RECVERR, sk);
+ inet_sockopt.is_icsk = inet_test_bit(IS_ICSK, sk);
+ inet_sockopt.freebind = inet_test_bit(FREEBIND, sk);
+ inet_sockopt.hdrincl = inet_test_bit(HDRINCL, sk);
+ inet_sockopt.mc_loop = inet_test_bit(MC_LOOP, sk);
+ inet_sockopt.transparent = inet_test_bit(TRANSPARENT, sk);
+ inet_sockopt.mc_all = inet_test_bit(MC_ALL, sk);
+ inet_sockopt.nodefrag = inet_test_bit(NODEFRAG, sk);
+ inet_sockopt.bind_address_no_port = inet_test_bit(BIND_ADDRESS_NO_PORT, sk);
+ inet_sockopt.recverr_rfc4884 = inet_test_bit(RECVERR_RFC4884, sk);
+ inet_sockopt.defer_connect = inet_test_bit(DEFER_CONNECT, sk);
+ if (nla_put(skb, INET_DIAG_SOCKOPT, sizeof(inet_sockopt),
+ &inet_sockopt))
+ goto errout;
+
return 0;
errout:
return 1;
}
EXPORT_SYMBOL_GPL(inet_diag_msg_attrs_fill);
+static int inet_diag_parse_attrs(const struct nlmsghdr *nlh, int hdrlen,
+ struct nlattr **req_nlas)
+{
+ struct nlattr *nla;
+ int remaining;
+
+ nlmsg_for_each_attr(nla, nlh, hdrlen, remaining) {
+ int type = nla_type(nla);
+
+ if (type == INET_DIAG_REQ_PROTOCOL && nla_len(nla) != sizeof(u32))
+ return -EINVAL;
+
+ if (type < __INET_DIAG_REQ_MAX)
+ req_nlas[type] = nla;
+ }
+ return 0;
+}
+
+static int inet_diag_get_protocol(const struct inet_diag_req_v2 *req,
+ const struct inet_diag_dump_data *data)
+{
+ if (data->req_nlas[INET_DIAG_REQ_PROTOCOL])
+ return nla_get_u32(data->req_nlas[INET_DIAG_REQ_PROTOCOL]);
+ return req->sdiag_protocol;
+}
+
+#define MAX_DUMP_ALLOC_SIZE (KMALLOC_MAX_SIZE - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+
int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
- struct sk_buff *skb, const struct inet_diag_req_v2 *req,
- struct user_namespace *user_ns,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh,
- bool net_admin)
+ struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *req,
+ u16 nlmsg_flags, bool net_admin)
{
const struct tcp_congestion_ops *ca_ops;
const struct inet_diag_handler *handler;
+ struct inet_diag_dump_data *cb_data;
int ext = req->idiag_ext;
struct inet_diag_msg *r;
struct nlmsghdr *nlh;
struct nlattr *attr;
void *info = NULL;
+ u8 icsk_pending;
+ int protocol;
- handler = inet_diag_table[req->sdiag_protocol];
- BUG_ON(!handler);
+ cb_data = cb->data;
+ protocol = inet_diag_get_protocol(req, cb_data);
- nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
- nlmsg_flags);
+ /* inet_diag_lock_handler() made sure inet_diag_table[] is stable. */
+ handler = rcu_dereference_protected(inet_diag_table[protocol], 1);
+ DEBUG_NET_WARN_ON_ONCE(!handler);
+ if (!handler)
+ return -ENXIO;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
if (!nlh)
return -EMSGSIZE;
@@ -189,15 +243,18 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
r->idiag_state = sk->sk_state;
r->idiag_timer = 0;
r->idiag_retrans = 0;
+ r->idiag_expires = 0;
- if (inet_diag_msg_attrs_fill(sk, skb, r, ext, user_ns, net_admin))
+ if (inet_diag_msg_attrs_fill(sk, skb, r, ext,
+ sk_user_ns(NETLINK_CB(cb->skb).sk),
+ net_admin))
goto errout;
if (ext & (1 << (INET_DIAG_MEMINFO - 1))) {
struct inet_diag_meminfo minfo = {
.idiag_rmem = sk_rmem_alloc_get(sk),
- .idiag_wmem = sk->sk_wmem_queued,
- .idiag_fmem = sk->sk_forward_alloc,
+ .idiag_wmem = READ_ONCE(sk->sk_wmem_queued),
+ .idiag_fmem = READ_ONCE(sk->sk_forward_alloc),
.idiag_tmem = sk_wmem_alloc_get(sk),
};
@@ -223,26 +280,24 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
goto out;
}
- if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
- icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ icsk_pending = smp_load_acquire(&icsk->icsk_pending);
+ if (icsk_pending == ICSK_TIME_RETRANS ||
+ icsk_pending == ICSK_TIME_REO_TIMEOUT ||
+ icsk_pending == ICSK_TIME_LOSS_PROBE) {
r->idiag_timer = 1;
- r->idiag_retrans = icsk->icsk_retransmits;
+ r->idiag_retrans = READ_ONCE(icsk->icsk_retransmits);
r->idiag_expires =
- jiffies_to_msecs(icsk->icsk_timeout - jiffies);
- } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+ jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies);
+ } else if (icsk_pending == ICSK_TIME_PROBE0) {
r->idiag_timer = 4;
- r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
r->idiag_expires =
- jiffies_to_msecs(icsk->icsk_timeout - jiffies);
- } else if (timer_pending(&sk->sk_timer)) {
+ jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies);
+ } else if (timer_pending(&icsk->icsk_keepalive_timer)) {
r->idiag_timer = 2;
- r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
r->idiag_expires =
- jiffies_to_msecs(sk->sk_timer.expires - jiffies);
- } else {
- r->idiag_timer = 0;
- r->idiag_expires = 0;
+ jiffies_delta_to_msecs(icsk->icsk_keepalive_timer.expires - jiffies);
}
if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) {
@@ -287,15 +342,46 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
goto errout;
}
- if (ext & (1 << (INET_DIAG_CLASS_ID - 1))) {
- u32 classid = 0;
-
-#ifdef CONFIG_SOCK_CGROUP_DATA
- classid = sock_cgroup_classid(&sk->sk_cgrp_data);
-#endif
+ /* Keep it at the end for potential retry with a larger skb,
+ * or else do best-effort fitting, which is only done for the
+ * first_nlmsg.
+ */
+ if (cb_data->bpf_stg_diag) {
+ bool first_nlmsg = ((unsigned char *)nlh == skb->data);
+ unsigned int prev_min_dump_alloc;
+ unsigned int total_nla_size = 0;
+ unsigned int msg_len;
+ int err;
+
+ msg_len = skb_tail_pointer(skb) - (unsigned char *)nlh;
+ err = bpf_sk_storage_diag_put(cb_data->bpf_stg_diag, sk, skb,
+ INET_DIAG_SK_BPF_STORAGES,
+ &total_nla_size);
+
+ if (!err)
+ goto out;
+
+ total_nla_size += msg_len;
+ prev_min_dump_alloc = cb->min_dump_alloc;
+ if (total_nla_size > prev_min_dump_alloc)
+ cb->min_dump_alloc = min_t(u32, total_nla_size,
+ MAX_DUMP_ALLOC_SIZE);
+
+ if (!first_nlmsg)
+ goto errout;
- if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid))
+ if (cb->min_dump_alloc > prev_min_dump_alloc)
+ /* Retry with pskb_expand_head() with
+ * __GFP_DIRECT_RECLAIM
+ */
goto errout;
+
+ WARN_ON_ONCE(total_nla_size <= prev_min_dump_alloc);
+
+ /* Send what we have for this sk
+ * and move on to the next sk in the following
+ * dump()
+ */
}
out:
@@ -308,214 +394,38 @@ errout:
}
EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
-static int inet_csk_diag_fill(struct sock *sk,
- struct sk_buff *skb,
- const struct inet_diag_req_v2 *req,
- struct user_namespace *user_ns,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh,
- bool net_admin)
-{
- return inet_sk_diag_fill(sk, inet_csk(sk), skb, req, user_ns,
- portid, seq, nlmsg_flags, unlh, net_admin);
-}
-
-static int inet_twsk_diag_fill(struct sock *sk,
- struct sk_buff *skb,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh)
-{
- struct inet_timewait_sock *tw = inet_twsk(sk);
- struct inet_diag_msg *r;
- struct nlmsghdr *nlh;
- long tmo;
-
- nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
- nlmsg_flags);
- if (!nlh)
- return -EMSGSIZE;
-
- r = nlmsg_data(nlh);
- BUG_ON(tw->tw_state != TCP_TIME_WAIT);
-
- tmo = tw->tw_timer.expires - jiffies;
- if (tmo < 0)
- tmo = 0;
-
- inet_diag_msg_common_fill(r, sk);
- r->idiag_retrans = 0;
-
- r->idiag_state = tw->tw_substate;
- r->idiag_timer = 3;
- r->idiag_expires = jiffies_to_msecs(tmo);
- r->idiag_rqueue = 0;
- r->idiag_wqueue = 0;
- r->idiag_uid = 0;
- r->idiag_inode = 0;
-
- nlmsg_end(skb, nlh);
- return 0;
-}
-
-static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh, bool net_admin)
-{
- struct request_sock *reqsk = inet_reqsk(sk);
- struct inet_diag_msg *r;
- struct nlmsghdr *nlh;
- long tmo;
-
- nlh = nlmsg_put(skb, portid, seq, unlh->nlmsg_type, sizeof(*r),
- nlmsg_flags);
- if (!nlh)
- return -EMSGSIZE;
-
- r = nlmsg_data(nlh);
- inet_diag_msg_common_fill(r, sk);
- r->idiag_state = TCP_SYN_RECV;
- r->idiag_timer = 1;
- r->idiag_retrans = reqsk->num_retrans;
-
- BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
- offsetof(struct sock, sk_cookie));
-
- tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies;
- r->idiag_expires = (tmo >= 0) ? jiffies_to_msecs(tmo) : 0;
- r->idiag_rqueue = 0;
- r->idiag_wqueue = 0;
- r->idiag_uid = 0;
- r->idiag_inode = 0;
-
- if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
- inet_rsk(reqsk)->ir_mark))
- return -EMSGSIZE;
-
- nlmsg_end(skb, nlh);
- return 0;
-}
-
-static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
- const struct inet_diag_req_v2 *r,
- struct user_namespace *user_ns,
- u32 portid, u32 seq, u16 nlmsg_flags,
- const struct nlmsghdr *unlh, bool net_admin)
-{
- if (sk->sk_state == TCP_TIME_WAIT)
- return inet_twsk_diag_fill(sk, skb, portid, seq,
- nlmsg_flags, unlh);
-
- if (sk->sk_state == TCP_NEW_SYN_RECV)
- return inet_req_diag_fill(sk, skb, portid, seq,
- nlmsg_flags, unlh, net_admin);
-
- return inet_csk_diag_fill(sk, skb, r, user_ns, portid, seq,
- nlmsg_flags, unlh, net_admin);
-}
-
-struct sock *inet_diag_find_one_icsk(struct net *net,
- struct inet_hashinfo *hashinfo,
- const struct inet_diag_req_v2 *req)
-{
- struct sock *sk;
-
- rcu_read_lock();
- if (req->sdiag_family == AF_INET)
- sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0],
- req->id.idiag_dport, req->id.idiag_src[0],
- req->id.idiag_sport, req->id.idiag_if);
-#if IS_ENABLED(CONFIG_IPV6)
- else if (req->sdiag_family == AF_INET6) {
- if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
- ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
- sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3],
- req->id.idiag_dport, req->id.idiag_src[3],
- req->id.idiag_sport, req->id.idiag_if);
- else
- sk = inet6_lookup(net, hashinfo, NULL, 0,
- (struct in6_addr *)req->id.idiag_dst,
- req->id.idiag_dport,
- (struct in6_addr *)req->id.idiag_src,
- req->id.idiag_sport,
- req->id.idiag_if);
- }
-#endif
- else {
- rcu_read_unlock();
- return ERR_PTR(-EINVAL);
- }
- rcu_read_unlock();
- if (!sk)
- return ERR_PTR(-ENOENT);
-
- if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
- sock_gen_put(sk);
- return ERR_PTR(-ENOENT);
- }
-
- return sk;
-}
-EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk);
-
-int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
- struct sk_buff *in_skb,
- const struct nlmsghdr *nlh,
- const struct inet_diag_req_v2 *req)
-{
- bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN);
- struct net *net = sock_net(in_skb->sk);
- struct sk_buff *rep;
- struct sock *sk;
- int err;
-
- sk = inet_diag_find_one_icsk(net, hashinfo, req);
- if (IS_ERR(sk))
- return PTR_ERR(sk);
-
- rep = nlmsg_new(inet_sk_attr_size(sk, req, net_admin), GFP_KERNEL);
- if (!rep) {
- err = -ENOMEM;
- goto out;
- }
-
- err = sk_diag_fill(sk, rep, req,
- sk_user_ns(NETLINK_CB(in_skb).sk),
- NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh, net_admin);
- if (err < 0) {
- WARN_ON(err == -EMSGSIZE);
- nlmsg_free(rep);
- goto out;
- }
- err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
- MSG_DONTWAIT);
- if (err > 0)
- err = 0;
-
-out:
- if (sk)
- sock_gen_put(sk);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
-
static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
const struct nlmsghdr *nlh,
+ int hdrlen,
const struct inet_diag_req_v2 *req)
{
const struct inet_diag_handler *handler;
- int err;
+ struct inet_diag_dump_data dump_data;
+ int err, protocol;
+
+ memset(&dump_data, 0, sizeof(dump_data));
+ err = inet_diag_parse_attrs(nlh, hdrlen, dump_data.req_nlas);
+ if (err)
+ return err;
+
+ protocol = inet_diag_get_protocol(req, &dump_data);
+
+ handler = inet_diag_lock_handler(protocol);
+ if (!handler)
+ return -ENOENT;
- handler = inet_diag_lock_handler(req->sdiag_protocol);
- if (IS_ERR(handler))
- err = PTR_ERR(handler);
- else if (cmd == SOCK_DIAG_BY_FAMILY)
- err = handler->dump_one(in_skb, nlh, req);
- else if (cmd == SOCK_DESTROY && handler->destroy)
+ if (cmd == SOCK_DIAG_BY_FAMILY) {
+ struct netlink_callback cb = {
+ .nlh = nlh,
+ .skb = in_skb,
+ .data = &dump_data,
+ };
+ err = handler->dump_one(&cb, req);
+ } else if (cmd == SOCK_DESTROY && handler->destroy) {
err = handler->destroy(in_skb, req);
- else
+ } else {
err = -EOPNOTSUPP;
+ }
inet_diag_unlock_handler(handler);
return err;
@@ -641,6 +551,16 @@ static int inet_diag_bc_run(const struct nlattr *_bc,
yes = 0;
break;
}
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case INET_DIAG_BC_CGROUP_COND: {
+ u64 cgroup_id;
+
+ cgroup_id = get_unaligned((const u64 *)(op + 1));
+ if (cgroup_id != entry->cgroup_id)
+ yes = 0;
+ break;
+ }
+#endif
}
if (yes) {
@@ -660,7 +580,7 @@ static void entry_fill_addrs(struct inet_diag_entry *entry,
const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6) {
+ if (entry->family == AF_INET6) {
entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32;
entry->daddr = sk->sk_v6_daddr.s6_addr32;
} else
@@ -671,26 +591,37 @@ static void entry_fill_addrs(struct inet_diag_entry *entry,
}
}
-int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
+int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk)
{
- struct inet_sock *inet = inet_sk(sk);
+ const struct nlattr *bc = cb_data->inet_diag_nla_bc;
+ const struct inet_sock *inet = inet_sk(sk);
struct inet_diag_entry entry;
if (!bc)
return 1;
- entry.family = sk->sk_family;
+ entry.family = READ_ONCE(sk->sk_family);
entry_fill_addrs(&entry, sk);
- entry.sport = inet->inet_num;
- entry.dport = ntohs(inet->inet_dport);
- entry.ifindex = sk->sk_bound_dev_if;
- entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
- if (sk_fullsock(sk))
- entry.mark = sk->sk_mark;
- else if (sk->sk_state == TCP_NEW_SYN_RECV)
- entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
- else
- entry.mark = 0;
+ entry.sport = READ_ONCE(inet->inet_num);
+ entry.dport = ntohs(READ_ONCE(inet->inet_dport));
+ entry.ifindex = READ_ONCE(sk->sk_bound_dev_if);
+ if (cb_data->userlocks_needed)
+ entry.userlocks = sk_fullsock(sk) ? READ_ONCE(sk->sk_userlocks) : 0;
+ if (cb_data->mark_needed) {
+ if (sk_fullsock(sk))
+ entry.mark = READ_ONCE(sk->sk_mark);
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
+ else if (sk->sk_state == TCP_TIME_WAIT)
+ entry.mark = inet_twsk(sk)->tw_mark;
+ else
+ entry.mark = 0;
+ }
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ if (cb_data->cgroup_needed)
+ entry.cgroup_id = sk_fullsock(sk) ?
+ cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0;
+#endif
return inet_diag_bc_run(bc, &entry);
}
@@ -780,16 +711,30 @@ static bool valid_markcond(const struct inet_diag_bc_op *op, int len,
return len >= *min_len;
}
-static int inet_diag_bc_audit(const struct nlattr *attr,
+#ifdef CONFIG_SOCK_CGROUP_DATA
+static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len,
+ int *min_len)
+{
+ *min_len += sizeof(u64);
+ return len >= *min_len;
+}
+#endif
+
+static int inet_diag_bc_audit(struct inet_diag_dump_data *cb_data,
const struct sk_buff *skb)
{
- bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
+ const struct nlattr *attr = cb_data->inet_diag_nla_bc;
const void *bytecode, *bc;
int bytecode_len, len;
+ bool net_admin;
- if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op))
+ if (!attr)
+ return 0;
+
+ if (nla_len(attr) < sizeof(struct inet_diag_bc_op))
return -EINVAL;
+ net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
bytecode = bc = nla_data(attr);
len = bytecode_len = nla_len(attr);
@@ -821,8 +766,18 @@ static int inet_diag_bc_audit(const struct nlattr *attr,
return -EPERM;
if (!valid_markcond(bc, len, &min_len))
return -EINVAL;
+ cb_data->mark_needed = true;
+ break;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case INET_DIAG_BC_CGROUP_COND:
+ if (!valid_cgroupcond(bc, len, &min_len))
+ return -EINVAL;
+ cb_data->cgroup_needed = true;
break;
+#endif
case INET_DIAG_BC_AUTO:
+ cb_data->userlocks_needed = true;
+ fallthrough;
case INET_DIAG_BC_JMP:
case INET_DIAG_BC_NOP:
break;
@@ -846,223 +801,99 @@ static int inet_diag_bc_audit(const struct nlattr *attr,
return len == 0 ? 0 : -EINVAL;
}
-static int inet_csk_diag_dump(struct sock *sk,
- struct sk_buff *skb,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r,
- const struct nlattr *bc,
- bool net_admin)
-{
- if (!inet_diag_bc_sk(bc, sk))
- return 0;
-
- return inet_csk_diag_fill(sk, skb, r,
- sk_user_ns(NETLINK_CB(cb->skb).sk),
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh,
- net_admin);
-}
-
-static void twsk_build_assert(void)
+static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r)
{
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
- offsetof(struct sock, sk_family));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) !=
- offsetof(struct inet_sock, inet_num));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) !=
- offsetof(struct inet_sock, inet_dport));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) !=
- offsetof(struct inet_sock, inet_rcv_saddr));
+ struct inet_diag_dump_data *cb_data = cb->data;
+ const struct inet_diag_handler *handler;
+ u32 prev_min_dump_alloc;
+ int protocol, err = 0;
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) !=
- offsetof(struct inet_sock, inet_daddr));
+ protocol = inet_diag_get_protocol(r, cb_data);
-#if IS_ENABLED(CONFIG_IPV6)
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) !=
- offsetof(struct sock, sk_v6_rcv_saddr));
+again:
+ prev_min_dump_alloc = cb->min_dump_alloc;
+ handler = inet_diag_lock_handler(protocol);
+ if (handler) {
+ handler->dump(skb, cb, r);
+ inet_diag_unlock_handler(handler);
+ } else {
+ err = -ENOENT;
+ }
+ /* The skb is not large enough to fit one sk info and
+ * inet_sk_diag_fill() has requested a larger skb.
+ */
+ if (!skb->len && cb->min_dump_alloc > prev_min_dump_alloc) {
+ err = pskb_expand_head(skb, 0, cb->min_dump_alloc, GFP_KERNEL);
+ if (!err)
+ goto again;
+ }
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) !=
- offsetof(struct sock, sk_v6_daddr));
-#endif
+ return err ? : skb->len;
}
-void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
- struct net *net = sock_net(skb->sk);
- u32 idiag_states = r->idiag_states;
- int i, num, s_i, s_num;
- struct sock *sk;
-
- if (idiag_states & TCPF_SYN_RECV)
- idiag_states |= TCPF_NEW_SYN_RECV;
- s_i = cb->args[1];
- s_num = num = cb->args[2];
-
- if (cb->args[0] == 0) {
- if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
- goto skip_listen_ht;
-
- for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
- struct inet_listen_hashbucket *ilb;
-
- num = 0;
- ilb = &hashinfo->listening_hash[i];
- spin_lock(&ilb->lock);
- sk_for_each(sk, &ilb->head) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (!net_eq(sock_net(sk), net))
- continue;
-
- if (num < s_num) {
- num++;
- continue;
- }
-
- if (r->sdiag_family != AF_UNSPEC &&
- sk->sk_family != r->sdiag_family)
- goto next_listen;
-
- if (r->id.idiag_sport != inet->inet_sport &&
- r->id.idiag_sport)
- goto next_listen;
+ return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh));
+}
- if (inet_csk_diag_dump(sk, skb, cb, r,
- bc, net_admin) < 0) {
- spin_unlock(&ilb->lock);
- goto done;
- }
+static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen)
+{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct inet_diag_dump_data *cb_data;
+ struct sk_buff *skb = cb->skb;
+ struct nlattr *nla;
+ int err;
-next_listen:
- ++num;
- }
- spin_unlock(&ilb->lock);
+ cb_data = kzalloc(sizeof(*cb_data), GFP_KERNEL);
+ if (!cb_data)
+ return -ENOMEM;
- s_num = 0;
- }
-skip_listen_ht:
- cb->args[0] = 1;
- s_i = num = s_num = 0;
+ err = inet_diag_parse_attrs(nlh, hdrlen, cb_data->req_nlas);
+ if (err) {
+ kfree(cb_data);
+ return err;
+ }
+ err = inet_diag_bc_audit(cb_data, skb);
+ if (err) {
+ kfree(cb_data);
+ return err;
}
- if (!(idiag_states & ~TCPF_LISTEN))
- goto out;
+ nla = cb_data->inet_diag_nla_bpf_stgs;
+ if (nla) {
+ struct bpf_sk_storage_diag *bpf_stg_diag;
-#define SKARR_SZ 16
- for (i = s_i; i <= hashinfo->ehash_mask; i++) {
- struct inet_ehash_bucket *head = &hashinfo->ehash[i];
- spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
- struct hlist_nulls_node *node;
- struct sock *sk_arr[SKARR_SZ];
- int num_arr[SKARR_SZ];
- int idx, accum, res;
-
- if (hlist_nulls_empty(&head->chain))
- continue;
-
- if (i > s_i)
- s_num = 0;
-
-next_chunk:
- num = 0;
- accum = 0;
- spin_lock_bh(lock);
- sk_nulls_for_each(sk, node, &head->chain) {
- int state;
-
- if (!net_eq(sock_net(sk), net))
- continue;
- if (num < s_num)
- goto next_normal;
- state = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_substate : sk->sk_state;
- if (!(idiag_states & (1 << state)))
- goto next_normal;
- if (r->sdiag_family != AF_UNSPEC &&
- sk->sk_family != r->sdiag_family)
- goto next_normal;
- if (r->id.idiag_sport != htons(sk->sk_num) &&
- r->id.idiag_sport)
- goto next_normal;
- if (r->id.idiag_dport != sk->sk_dport &&
- r->id.idiag_dport)
- goto next_normal;
- twsk_build_assert();
-
- if (!inet_diag_bc_sk(bc, sk))
- goto next_normal;
-
- sock_hold(sk);
- num_arr[accum] = num;
- sk_arr[accum] = sk;
- if (++accum == SKARR_SZ)
- break;
-next_normal:
- ++num;
- }
- spin_unlock_bh(lock);
- res = 0;
- for (idx = 0; idx < accum; idx++) {
- if (res >= 0) {
- res = sk_diag_fill(sk_arr[idx], skb, r,
- sk_user_ns(NETLINK_CB(cb->skb).sk),
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI,
- cb->nlh, net_admin);
- if (res < 0)
- num = num_arr[idx];
- }
- sock_gen_put(sk_arr[idx]);
- }
- if (res < 0)
- break;
- cond_resched();
- if (accum == SKARR_SZ) {
- s_num = num + 1;
- goto next_chunk;
+ bpf_stg_diag = bpf_sk_storage_diag_alloc(nla);
+ if (IS_ERR(bpf_stg_diag)) {
+ kfree(cb_data);
+ return PTR_ERR(bpf_stg_diag);
}
+ cb_data->bpf_stg_diag = bpf_stg_diag;
}
-done:
- cb->args[1] = i;
- cb->args[2] = num;
-out:
- ;
+ cb->data = cb_data;
+ return 0;
}
-EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
-static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r,
- struct nlattr *bc)
+static int inet_diag_dump_start(struct netlink_callback *cb)
{
- const struct inet_diag_handler *handler;
- int err = 0;
-
- handler = inet_diag_lock_handler(r->sdiag_protocol);
- if (!IS_ERR(handler))
- handler->dump(skb, cb, r, bc);
- else
- err = PTR_ERR(handler);
- inet_diag_unlock_handler(handler);
+ return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req_v2));
+}
- return err ? : skb->len;
+static int inet_diag_dump_start_compat(struct netlink_callback *cb)
+{
+ return __inet_diag_dump_start(cb, sizeof(struct inet_diag_req));
}
-static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+static int inet_diag_dump_done(struct netlink_callback *cb)
{
- int hdrlen = sizeof(struct inet_diag_req_v2);
- struct nlattr *bc = NULL;
+ struct inet_diag_dump_data *cb_data = cb->data;
- if (nlmsg_attrlen(cb->nlh, hdrlen))
- bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
+ bpf_sk_storage_diag_free(cb_data->bpf_stg_diag);
+ kfree(cb->data);
- return __inet_diag_dump(skb, cb, nlmsg_data(cb->nlh), bc);
+ return 0;
}
static int inet_diag_type2proto(int type)
@@ -1070,8 +901,6 @@ static int inet_diag_type2proto(int type)
switch (type) {
case TCPDIAG_GETSOCK:
return IPPROTO_TCP;
- case DCCPDIAG_GETSOCK:
- return IPPROTO_DCCP;
default:
return 0;
}
@@ -1081,20 +910,16 @@ static int inet_diag_dump_compat(struct sk_buff *skb,
struct netlink_callback *cb)
{
struct inet_diag_req *rc = nlmsg_data(cb->nlh);
- int hdrlen = sizeof(struct inet_diag_req);
struct inet_diag_req_v2 req;
- struct nlattr *bc = NULL;
req.sdiag_family = AF_UNSPEC; /* compatibility */
req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
req.idiag_ext = rc->idiag_ext;
+ req.pad = 0;
req.idiag_states = rc->idiag_states;
req.id = rc->id;
- if (nlmsg_attrlen(cb->nlh, hdrlen))
- bc = nlmsg_find_attr(cb->nlh, hdrlen, INET_DIAG_REQ_BYTECODE);
-
- return __inet_diag_dump(skb, cb, &req, bc);
+ return __inet_diag_dump(skb, cb, &req);
}
static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
@@ -1106,10 +931,12 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
req.sdiag_family = rc->idiag_family;
req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type);
req.idiag_ext = rc->idiag_ext;
+ req.pad = 0;
req.idiag_states = rc->idiag_states;
req.id = rc->id;
- return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, &req);
+ return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh,
+ sizeof(struct inet_diag_req), &req);
}
static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
@@ -1122,22 +949,12 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EINVAL;
if (nlh->nlmsg_flags & NLM_F_DUMP) {
- if (nlmsg_attrlen(nlh, hdrlen)) {
- struct nlattr *attr;
- int err;
-
- attr = nlmsg_find_attr(nlh, hdrlen,
- INET_DIAG_REQ_BYTECODE);
- err = inet_diag_bc_audit(attr, skb);
- if (err)
- return err;
- }
- {
- struct netlink_dump_control c = {
- .dump = inet_diag_dump_compat,
- };
- return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
- }
+ struct netlink_dump_control c = {
+ .start = inet_diag_dump_start_compat,
+ .done = inet_diag_dump_done,
+ .dump = inet_diag_dump_compat,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, nlh, &c);
}
return inet_diag_get_exact_compat(skb, nlh);
@@ -1153,25 +970,16 @@ static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h)
if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
h->nlmsg_flags & NLM_F_DUMP) {
- if (nlmsg_attrlen(h, hdrlen)) {
- struct nlattr *attr;
- int err;
-
- attr = nlmsg_find_attr(h, hdrlen,
- INET_DIAG_REQ_BYTECODE);
- err = inet_diag_bc_audit(attr, skb);
- if (err)
- return err;
- }
- {
- struct netlink_dump_control c = {
- .dump = inet_diag_dump,
- };
- return netlink_dump_start(net->diag_nlsk, skb, h, &c);
- }
+ struct netlink_dump_control c = {
+ .start = inet_diag_dump_start,
+ .done = inet_diag_dump_done,
+ .dump = inet_diag_dump,
+ };
+ return netlink_dump_start(net->diag_nlsk, skb, h, &c);
}
- return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h));
+ return inet_diag_cmd_exact(h->nlmsg_type, skb, h, hdrlen,
+ nlmsg_data(h));
}
static
@@ -1201,10 +1009,9 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
}
handler = inet_diag_lock_handler(sk->sk_protocol);
- if (IS_ERR(handler)) {
- inet_diag_unlock_handler(handler);
+ if (!handler) {
nlmsg_cancel(skb, nlh);
- return PTR_ERR(handler);
+ return -ENOENT;
}
attr = handler->idiag_info_size
@@ -1223,6 +1030,7 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
}
static const struct sock_diag_handler inet_diag_handler = {
+ .owner = THIS_MODULE,
.family = AF_INET,
.dump = inet_diag_handler_cmd,
.get_info = inet_diag_handler_get_info,
@@ -1230,6 +1038,7 @@ static const struct sock_diag_handler inet_diag_handler = {
};
static const struct sock_diag_handler inet6_diag_handler = {
+ .owner = THIS_MODULE,
.family = AF_INET6,
.dump = inet_diag_handler_cmd,
.get_info = inet_diag_handler_get_info,
@@ -1239,20 +1048,12 @@ static const struct sock_diag_handler inet6_diag_handler = {
int inet_diag_register(const struct inet_diag_handler *h)
{
const __u16 type = h->idiag_type;
- int err = -EINVAL;
if (type >= IPPROTO_MAX)
- goto out;
+ return -EINVAL;
- mutex_lock(&inet_diag_table_mutex);
- err = -EEXIST;
- if (!inet_diag_table[type]) {
- inet_diag_table[type] = h;
- err = 0;
- }
- mutex_unlock(&inet_diag_table_mutex);
-out:
- return err;
+ return !cmpxchg((const struct inet_diag_handler **)&inet_diag_table[type],
+ NULL, h) ? 0 : -EEXIST;
}
EXPORT_SYMBOL_GPL(inet_diag_register);
@@ -1263,12 +1064,16 @@ void inet_diag_unregister(const struct inet_diag_handler *h)
if (type >= IPPROTO_MAX)
return;
- mutex_lock(&inet_diag_table_mutex);
- inet_diag_table[type] = NULL;
- mutex_unlock(&inet_diag_table_mutex);
+ xchg((const struct inet_diag_handler **)&inet_diag_table[type],
+ NULL);
}
EXPORT_SYMBOL_GPL(inet_diag_unregister);
+static const struct sock_diag_inet_compat inet_diag_compat = {
+ .owner = THIS_MODULE,
+ .fn = inet_diag_rcv_msg_compat,
+};
+
static int __init inet_diag_init(void)
{
const int inet_diag_table_size = (IPPROTO_MAX *
@@ -1287,7 +1092,7 @@ static int __init inet_diag_init(void)
if (err)
goto out_free_inet;
- sock_diag_register_inet_compat(inet_diag_rcv_msg_compat);
+ sock_diag_register_inet_compat(&inet_diag_compat);
out:
return err;
@@ -1302,12 +1107,13 @@ static void __exit inet_diag_exit(void)
{
sock_diag_unregister(&inet6_diag_handler);
sock_diag_unregister(&inet_diag_handler);
- sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat);
+ sock_diag_unregister_inet_compat(&inet_diag_compat);
kfree(inet_diag_table);
}
module_init(inet_diag_init);
module_exit(inet_diag_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("INET/INET6: socket monitoring via SOCK_DIAG");
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2 /* AF_INET */);
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10 /* AF_INET6 */);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index bcb11f3a27c0..025895eb6ec5 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* inet fragments management
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Pavel Emelyanov <xemul@openvz.org>
* Started as consolidation of ipv4/ip_fragment.c,
* ipv6/reassembly. and ipv6 nf conntrack reassembly
@@ -25,6 +21,65 @@
#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+
+#include "../core/sock_destructor.h"
+
+/* Use skb->cb to track consecutive/adjacent fragments coming at
+ * the end of the queue. Nodes in the rb-tree queue will
+ * contain "runs" of one or more adjacent fragments.
+ *
+ * Invariants:
+ * - next_frag is NULL at the tail of a "run";
+ * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
+ */
+struct ipfrag_skb_cb {
+ union {
+ struct inet_skb_parm h4;
+ struct inet6_skb_parm h6;
+ };
+ struct sk_buff *next_frag;
+ int frag_run_len;
+ int ip_defrag_offset;
+};
+
+#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
+
+static void fragcb_clear(struct sk_buff *skb)
+{
+ RB_CLEAR_NODE(&skb->rbnode);
+ FRAG_CB(skb)->next_frag = NULL;
+ FRAG_CB(skb)->frag_run_len = skb->len;
+}
+
+/* Append skb to the last "run". */
+static void fragrun_append_to_last(struct inet_frag_queue *q,
+ struct sk_buff *skb)
+{
+ fragcb_clear(skb);
+
+ FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
+ FRAG_CB(q->fragments_tail)->next_frag = skb;
+ q->fragments_tail = skb;
+}
+
+/* Create a new "run" with the skb. */
+static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
+ fragcb_clear(skb);
+
+ if (q->last_run_head)
+ rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
+ &q->last_run_head->rbnode.rb_right);
+ else
+ rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
+ rb_insert_color(&skb->rbnode, &q->rb_fragments);
+
+ q->fragments_tail = skb;
+ q->last_run_head = skb;
+}
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
* Value : 0xff if frame should be dropped.
@@ -54,60 +109,145 @@ int inet_frags_init(struct inet_frags *f)
if (!f->frags_cachep)
return -ENOMEM;
+ refcount_set(&f->refcnt, 1);
+ init_completion(&f->completion);
return 0;
}
EXPORT_SYMBOL(inet_frags_init);
void inet_frags_fini(struct inet_frags *f)
{
- /* We must wait that all inet_frag_destroy_rcu() have completed. */
- rcu_barrier();
+ if (refcount_dec_and_test(&f->refcnt))
+ complete(&f->completion);
+
+ wait_for_completion(&f->completion);
kmem_cache_destroy(f->frags_cachep);
f->frags_cachep = NULL;
}
EXPORT_SYMBOL(inet_frags_fini);
+/* called from rhashtable_free_and_destroy() at netns_frags dismantle */
static void inet_frags_free_cb(void *ptr, void *arg)
{
struct inet_frag_queue *fq = ptr;
+ int count;
- /* If we can not cancel the timer, it means this frag_queue
- * is already disappearing, we have nothing to do.
- * Otherwise, we own a refcount until the end of this function.
- */
- if (!del_timer(&fq->timer))
- return;
+ count = timer_delete_sync(&fq->timer) ? 1 : 0;
spin_lock_bh(&fq->lock);
+ fq->flags |= INET_FRAG_DROP;
if (!(fq->flags & INET_FRAG_COMPLETE)) {
fq->flags |= INET_FRAG_COMPLETE;
- refcount_dec(&fq->refcnt);
+ count++;
+ } else if (fq->flags & INET_FRAG_HASH_DEAD) {
+ count++;
}
spin_unlock_bh(&fq->lock);
- inet_frag_put(fq);
+ inet_frag_putn(fq, count);
}
-void inet_frags_exit_net(struct netns_frags *nf)
+static LLIST_HEAD(fqdir_free_list);
+
+static void fqdir_free_fn(struct work_struct *work)
{
- nf->high_thresh = 0; /* prevent creation of new frags */
+ struct llist_node *kill_list;
+ struct fqdir *fqdir, *tmp;
+ struct inet_frags *f;
+
+ /* Atomically snapshot the list of fqdirs to free */
+ kill_list = llist_del_all(&fqdir_free_list);
- rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
+ /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu)
+ * callbacks have completed, since they need to dereference fqdir.
+ * Would it not be nice to have kfree_rcu_barrier() ? :)
+ */
+ rcu_barrier();
+
+ llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) {
+ f = fqdir->f;
+ if (refcount_dec_and_test(&f->refcnt))
+ complete(&f->completion);
+
+ kfree(fqdir);
+ }
}
-EXPORT_SYMBOL(inet_frags_exit_net);
-void inet_frag_kill(struct inet_frag_queue *fq)
+static DECLARE_DELAYED_WORK(fqdir_free_work, fqdir_free_fn);
+
+static void fqdir_work_fn(struct work_struct *work)
{
- if (del_timer(&fq->timer))
- refcount_dec(&fq->refcnt);
+ struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work);
+
+ rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
+
+ if (llist_add(&fqdir->free_list, &fqdir_free_list))
+ queue_delayed_work(system_percpu_wq, &fqdir_free_work, HZ);
+}
+
+int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
+{
+ struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL);
+ int res;
+
+ if (!fqdir)
+ return -ENOMEM;
+ fqdir->f = f;
+ fqdir->net = net;
+ res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params);
+ if (res < 0) {
+ kfree(fqdir);
+ return res;
+ }
+ refcount_inc(&f->refcnt);
+ *fqdirp = fqdir;
+ return 0;
+}
+EXPORT_SYMBOL(fqdir_init);
+
+static struct workqueue_struct *inet_frag_wq;
+
+static int __init inet_frag_wq_init(void)
+{
+ inet_frag_wq = create_workqueue("inet_frag_wq");
+ if (!inet_frag_wq)
+ panic("Could not create inet frag workq");
+ return 0;
+}
+
+pure_initcall(inet_frag_wq_init);
+
+void fqdir_exit(struct fqdir *fqdir)
+{
+ INIT_WORK(&fqdir->destroy_work, fqdir_work_fn);
+ queue_work(inet_frag_wq, &fqdir->destroy_work);
+}
+EXPORT_SYMBOL(fqdir_exit);
+
+void inet_frag_kill(struct inet_frag_queue *fq, int *refs)
+{
+ if (timer_delete(&fq->timer))
+ (*refs)++;
if (!(fq->flags & INET_FRAG_COMPLETE)) {
- struct netns_frags *nf = fq->net;
+ struct fqdir *fqdir = fq->fqdir;
fq->flags |= INET_FRAG_COMPLETE;
- rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
- refcount_dec(&fq->refcnt);
+ rcu_read_lock();
+ /* The RCU read lock provides a memory barrier
+ * guaranteeing that if fqdir->dead is false then
+ * the hash table destruction will not start until
+ * after we unlock. Paired with fqdir_pre_exit().
+ */
+ if (!READ_ONCE(fqdir->dead)) {
+ rhashtable_remove_fast(&fqdir->rhashtable, &fq->node,
+ fqdir->f->rhash_params);
+ (*refs)++;
+ } else {
+ fq->flags |= INET_FRAG_HASH_DEAD;
+ }
+ rcu_read_unlock();
}
}
EXPORT_SYMBOL(inet_frag_kill);
@@ -116,47 +256,62 @@ static void inet_frag_destroy_rcu(struct rcu_head *head)
{
struct inet_frag_queue *q = container_of(head, struct inet_frag_queue,
rcu);
- struct inet_frags *f = q->net->f;
+ struct inet_frags *f = q->fqdir->f;
if (f->destructor)
f->destructor(q);
kmem_cache_free(f->frags_cachep, q);
}
+unsigned int inet_frag_rbtree_purge(struct rb_root *root,
+ enum skb_drop_reason reason)
+{
+ struct rb_node *p = rb_first(root);
+ unsigned int sum = 0;
+
+ while (p) {
+ struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
+
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, root);
+ while (skb) {
+ struct sk_buff *next = FRAG_CB(skb)->next_frag;
+
+ sum += skb->truesize;
+ kfree_skb_reason(skb, reason);
+ skb = next;
+ }
+ }
+ return sum;
+}
+EXPORT_SYMBOL(inet_frag_rbtree_purge);
+
void inet_frag_destroy(struct inet_frag_queue *q)
{
- struct sk_buff *fp;
- struct netns_frags *nf;
unsigned int sum, sum_truesize = 0;
+ enum skb_drop_reason reason;
struct inet_frags *f;
+ struct fqdir *fqdir;
WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
- WARN_ON(del_timer(&q->timer) != 0);
+ reason = (q->flags & INET_FRAG_DROP) ?
+ SKB_DROP_REASON_FRAG_REASM_TIMEOUT :
+ SKB_CONSUMED;
+ WARN_ON(timer_delete(&q->timer) != 0);
/* Release all fragment data. */
- fp = q->fragments;
- nf = q->net;
- f = nf->f;
- if (fp) {
- do {
- struct sk_buff *xp = fp->next;
-
- sum_truesize += fp->truesize;
- kfree_skb(fp);
- fp = xp;
- } while (fp);
- } else {
- sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments);
- }
+ fqdir = q->fqdir;
+ f = fqdir->f;
+ sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason);
sum = sum_truesize + f->qsize;
call_rcu(&q->rcu, inet_frag_destroy_rcu);
- sub_frag_mem_limit(nf, sum);
+ sub_frag_mem_limit(fqdir, sum);
}
EXPORT_SYMBOL(inet_frag_destroy);
-static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
+static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
struct inet_frags *f,
void *arg)
{
@@ -166,60 +321,331 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
if (!q)
return NULL;
- q->net = nf;
+ q->fqdir = fqdir;
f->constructor(q, arg);
- add_frag_mem_limit(nf, f->qsize);
+ add_frag_mem_limit(fqdir, f->qsize);
timer_setup(&q->timer, f->frag_expire, 0);
spin_lock_init(&q->lock);
- refcount_set(&q->refcnt, 3);
+ /* One reference for the timer, one for the hash table. */
+ refcount_set(&q->refcnt, 2);
return q;
}
-static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
- void *arg)
+static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
+ void *arg,
+ struct inet_frag_queue **prev)
{
- struct inet_frags *f = nf->f;
+ struct inet_frags *f = fqdir->f;
struct inet_frag_queue *q;
- int err;
- q = inet_frag_alloc(nf, f, arg);
- if (!q)
+ q = inet_frag_alloc(fqdir, f, arg);
+ if (!q) {
+ *prev = ERR_PTR(-ENOMEM);
return NULL;
+ }
+ mod_timer(&q->timer, jiffies + fqdir->timeout);
- mod_timer(&q->timer, jiffies + nf->timeout);
+ *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
+ &q->node, f->rhash_params);
+ if (*prev) {
+ /* We could not insert in the hash table,
+ * we need to cancel what inet_frag_alloc()
+ * anticipated.
+ */
+ int refs = 1;
- err = rhashtable_insert_fast(&nf->rhashtable, &q->node,
- f->rhash_params);
- if (err < 0) {
q->flags |= INET_FRAG_COMPLETE;
- inet_frag_kill(q);
- inet_frag_destroy(q);
+ inet_frag_kill(q, &refs);
+ inet_frag_putn(q, refs);
return NULL;
}
return q;
}
-/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
+struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key)
{
- struct inet_frag_queue *fq;
+ /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */
+ long high_thresh = READ_ONCE(fqdir->high_thresh);
+ struct inet_frag_queue *fq = NULL, *prev;
- if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
+ if (!high_thresh || frag_mem_limit(fqdir) > high_thresh)
return NULL;
- rcu_read_lock();
+ prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params);
+ if (!prev)
+ fq = inet_frag_create(fqdir, key, &prev);
+ if (!IS_ERR_OR_NULL(prev))
+ fq = prev;
+ return fq;
+}
+EXPORT_SYMBOL(inet_frag_find);
- fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
- if (fq) {
- if (!refcount_inc_not_zero(&fq->refcnt))
- fq = NULL;
- rcu_read_unlock();
- return fq;
+int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
+ int offset, int end)
+{
+ struct sk_buff *last = q->fragments_tail;
+
+ /* RFC5722, Section 4, amended by Errata ID : 3089
+ * When reassembling an IPv6 datagram, if
+ * one or more its constituent fragments is determined to be an
+ * overlapping fragment, the entire datagram (and any constituent
+ * fragments) MUST be silently discarded.
+ *
+ * Duplicates, however, should be ignored (i.e. skb dropped, but the
+ * queue/fragments kept for later reassembly).
+ */
+ if (!last)
+ fragrun_create(q, skb); /* First fragment. */
+ else if (FRAG_CB(last)->ip_defrag_offset + last->len < end) {
+ /* This is the common case: skb goes to the end. */
+ /* Detect and discard overlaps. */
+ if (offset < FRAG_CB(last)->ip_defrag_offset + last->len)
+ return IPFRAG_OVERLAP;
+ if (offset == FRAG_CB(last)->ip_defrag_offset + last->len)
+ fragrun_append_to_last(q, skb);
+ else
+ fragrun_create(q, skb);
+ } else {
+ /* Binary search. Note that skb can become the first fragment,
+ * but not the last (covered above).
+ */
+ struct rb_node **rbn, *parent;
+
+ rbn = &q->rb_fragments.rb_node;
+ do {
+ struct sk_buff *curr;
+ int curr_run_end;
+
+ parent = *rbn;
+ curr = rb_to_skb(parent);
+ curr_run_end = FRAG_CB(curr)->ip_defrag_offset +
+ FRAG_CB(curr)->frag_run_len;
+ if (end <= FRAG_CB(curr)->ip_defrag_offset)
+ rbn = &parent->rb_left;
+ else if (offset >= curr_run_end)
+ rbn = &parent->rb_right;
+ else if (offset >= FRAG_CB(curr)->ip_defrag_offset &&
+ end <= curr_run_end)
+ return IPFRAG_DUP;
+ else
+ return IPFRAG_OVERLAP;
+ } while (*rbn);
+ /* Here we have parent properly set, and rbn pointing to
+ * one of its NULL left/right children. Insert skb.
+ */
+ fragcb_clear(skb);
+ rb_link_node(&skb->rbnode, parent, rbn);
+ rb_insert_color(&skb->rbnode, &q->rb_fragments);
}
- rcu_read_unlock();
- return inet_frag_create(nf, key);
+ FRAG_CB(skb)->ip_defrag_offset = offset;
+
+ return IPFRAG_OK;
}
-EXPORT_SYMBOL(inet_frag_find);
+EXPORT_SYMBOL(inet_frag_queue_insert);
+
+void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
+ struct sk_buff *parent)
+{
+ struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
+ void (*destructor)(struct sk_buff *);
+ unsigned int orig_truesize = 0;
+ struct sk_buff **nextp = NULL;
+ struct sock *sk = skb->sk;
+ int delta;
+
+ if (sk && is_skb_wmem(skb)) {
+ /* TX: skb->sk might have been passed as argument to
+ * dst->output and must remain valid until tx completes.
+ *
+ * Move sk to reassembled skb and fix up wmem accounting.
+ */
+ orig_truesize = skb->truesize;
+ destructor = skb->destructor;
+ }
+
+ if (head != skb) {
+ fp = skb_clone(skb, GFP_ATOMIC);
+ if (!fp) {
+ head = skb;
+ goto out_restore_sk;
+ }
+ FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
+ if (RB_EMPTY_NODE(&skb->rbnode))
+ FRAG_CB(parent)->next_frag = fp;
+ else
+ rb_replace_node(&skb->rbnode, &fp->rbnode,
+ &q->rb_fragments);
+ if (q->fragments_tail == skb)
+ q->fragments_tail = fp;
+
+ if (orig_truesize) {
+ /* prevent skb_morph from releasing sk */
+ skb->sk = NULL;
+ skb->destructor = NULL;
+ }
+ skb_morph(skb, head);
+ FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
+ rb_replace_node(&head->rbnode, &skb->rbnode,
+ &q->rb_fragments);
+ consume_skb(head);
+ head = skb;
+ }
+ WARN_ON(FRAG_CB(head)->ip_defrag_offset != 0);
+
+ delta = -head->truesize;
+
+ /* Head of list must not be cloned. */
+ if (skb_unclone(head, GFP_ATOMIC))
+ goto out_restore_sk;
+
+ delta += head->truesize;
+ if (delta)
+ add_frag_mem_limit(q->fqdir, delta);
+
+ /* If the first fragment is fragmented itself, we split
+ * it to two chunks: the first with data and paged part
+ * and the second, holding only fragments.
+ */
+ if (skb_has_frag_list(head)) {
+ struct sk_buff *clone;
+ int i, plen = 0;
+
+ clone = alloc_skb(0, GFP_ATOMIC);
+ if (!clone)
+ goto out_restore_sk;
+ skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+ skb_frag_list_init(head);
+ for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
+ plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
+ clone->data_len = head->data_len - plen;
+ clone->len = clone->data_len;
+ head->truesize += clone->truesize;
+ clone->csum = 0;
+ clone->ip_summed = head->ip_summed;
+ add_frag_mem_limit(q->fqdir, clone->truesize);
+ skb_shinfo(head)->frag_list = clone;
+ nextp = &clone->next;
+ } else {
+ nextp = &skb_shinfo(head)->frag_list;
+ }
+
+out_restore_sk:
+ if (orig_truesize) {
+ int ts_delta = head->truesize - orig_truesize;
+
+ /* if this reassembled skb is fragmented later,
+ * fraglist skbs will get skb->sk assigned from head->sk,
+ * and each frag skb will be released via sock_wfree.
+ *
+ * Update sk_wmem_alloc.
+ */
+ head->sk = sk;
+ head->destructor = destructor;
+ refcount_add(ts_delta, &sk->sk_wmem_alloc);
+ }
+
+ return nextp;
+}
+EXPORT_SYMBOL(inet_frag_reasm_prepare);
+
+void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
+ void *reasm_data, bool try_coalesce)
+{
+ struct sock *sk = is_skb_wmem(head) ? head->sk : NULL;
+ const unsigned int head_truesize = head->truesize;
+ struct sk_buff **nextp = reasm_data;
+ struct rb_node *rbn;
+ struct sk_buff *fp;
+ int sum_truesize;
+
+ skb_push(head, head->data - skb_network_header(head));
+
+ /* Traverse the tree in order, to build frag_list. */
+ fp = FRAG_CB(head)->next_frag;
+ rbn = rb_next(&head->rbnode);
+ rb_erase(&head->rbnode, &q->rb_fragments);
+
+ sum_truesize = head->truesize;
+ while (rbn || fp) {
+ /* fp points to the next sk_buff in the current run;
+ * rbn points to the next run.
+ */
+ /* Go through the current run. */
+ while (fp) {
+ struct sk_buff *next_frag = FRAG_CB(fp)->next_frag;
+ bool stolen;
+ int delta;
+
+ sum_truesize += fp->truesize;
+ if (head->ip_summed != fp->ip_summed)
+ head->ip_summed = CHECKSUM_NONE;
+ else if (head->ip_summed == CHECKSUM_COMPLETE)
+ head->csum = csum_add(head->csum, fp->csum);
+
+ if (try_coalesce && skb_try_coalesce(head, fp, &stolen,
+ &delta)) {
+ kfree_skb_partial(fp, stolen);
+ } else {
+ fp->prev = NULL;
+ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
+ fp->sk = NULL;
+
+ head->data_len += fp->len;
+ head->len += fp->len;
+ head->truesize += fp->truesize;
+
+ *nextp = fp;
+ nextp = &fp->next;
+ }
+
+ fp = next_frag;
+ }
+ /* Move to the next run. */
+ if (rbn) {
+ struct rb_node *rbnext = rb_next(rbn);
+
+ fp = rb_to_skb(rbn);
+ rb_erase(rbn, &q->rb_fragments);
+ rbn = rbnext;
+ }
+ }
+ sub_frag_mem_limit(q->fqdir, sum_truesize);
+
+ *nextp = NULL;
+ skb_mark_not_on_list(head);
+ head->prev = NULL;
+ head->tstamp = q->stamp;
+ head->tstamp_type = q->tstamp_type;
+
+ if (sk)
+ refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc);
+}
+EXPORT_SYMBOL(inet_frag_reasm_finish);
+
+struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
+{
+ struct sk_buff *head, *skb;
+
+ head = skb_rb_first(&q->rb_fragments);
+ if (!head)
+ return NULL;
+ skb = FRAG_CB(head)->next_frag;
+ if (skb)
+ rb_replace_node(&head->rbnode, &skb->rbnode,
+ &q->rb_fragments);
+ else
+ rb_erase(&head->rbnode, &q->rb_fragments);
+ memset(&head->rbnode, 0, sizeof(head->rbnode));
+ barrier();
+
+ if (head == q->fragments_tail)
+ q->fragments_tail = NULL;
+
+ sub_frag_mem_limit(q->fqdir, head->truesize);
+
+ return head;
+}
+EXPORT_SYMBOL(inet_frag_pull_head);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index f5c9ef2586de..f5826ec4bcaa 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -6,11 +7,6 @@
* Generic INET transport hashtables
*
* Authors: Lotsa people, from code originally in tcp
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
@@ -19,27 +15,31 @@
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/vmalloc.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
-#include <net/secure_seq.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/inet6_hashtables.h>
+#endif
+#include <net/hotdata.h>
#include <net/ip.h>
-#include <net/tcp.h>
+#include <net/rps.h>
+#include <net/secure_seq.h>
#include <net/sock_reuseport.h>
+#include <net/tcp.h>
-static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
- const __u16 lport, const __be32 faddr,
- const __be16 fport)
+u32 inet_ehashfn(const struct net *net, const __be32 laddr,
+ const __u16 lport, const __be32 faddr,
+ const __be16 fport)
{
- static u32 inet_ehash_secret __read_mostly;
-
net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));
- return __inet_ehashfn(laddr, lport, faddr, fport,
- inet_ehash_secret + net_hash_mix(net));
+ return lport + __inet_ehashfn(laddr, 0, faddr, fport,
+ inet_ehash_secret + net_hash_mix(net));
}
+EXPORT_SYMBOL_GPL(inet_ehashfn);
/* This function handles inet_sock, but also timewait and request sockets
* for IPv4/IPv6.
@@ -58,6 +58,14 @@ static u32 sk_ehashfn(const struct sock *sk)
sk->sk_daddr, sk->sk_dport);
}
+static bool sk_is_connect_bind(const struct sock *sk)
+{
+ if (sk->sk_state == TCP_TIME_WAIT)
+ return inet_twsk(sk)->tw_connect_bind;
+ else
+ return sk->sk_userlocks & SOCK_CONNECT_BIND;
+}
+
/*
* Allocate and initialize a new local port bind bucket.
* The bindhash mutex for snum's hash chain must be held here.
@@ -65,17 +73,19 @@ static u32 sk_ehashfn(const struct sock *sk)
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
struct net *net,
struct inet_bind_hashbucket *head,
- const unsigned short snum)
+ const unsigned short snum,
+ int l3mdev)
{
struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
if (tb) {
write_pnet(&tb->ib_net, net);
+ tb->l3mdev = l3mdev;
tb->port = snum;
tb->fastreuse = 0;
tb->fastreuseport = 0;
- INIT_HLIST_HEAD(&tb->owners);
- hlist_add_head(&tb->node, &head->chain);
+ INIT_HLIST_HEAD(&tb->bhash2);
+ hlist_add_head_rcu(&tb->node, &head->chain);
}
return tb;
}
@@ -83,20 +93,117 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
/*
* Caller must hold hashbucket lock for this tb with local BH disabled
*/
-void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
+void inet_bind_bucket_destroy(struct inet_bind_bucket *tb)
+{
+ const struct inet_bind2_bucket *tb2;
+
+ if (hlist_empty(&tb->bhash2)) {
+ hlist_del_rcu(&tb->node);
+ kfree_rcu(tb, rcu);
+ return;
+ }
+
+ if (tb->fastreuse == -1 && tb->fastreuseport == -1)
+ return;
+ hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) {
+ if (tb2->fastreuse != -1 || tb2->fastreuseport != -1)
+ return;
+ }
+ tb->fastreuse = -1;
+ tb->fastreuseport = -1;
+}
+
+bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net,
+ unsigned short port, int l3mdev)
{
+ return net_eq(ib_net(tb), net) && tb->port == port &&
+ tb->l3mdev == l3mdev;
+}
+
+static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2,
+ struct net *net,
+ struct inet_bind_hashbucket *head,
+ struct inet_bind_bucket *tb,
+ const struct sock *sk)
+{
+ write_pnet(&tb2->ib_net, net);
+ tb2->l3mdev = tb->l3mdev;
+ tb2->port = tb->port;
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(USHRT_MAX < (IPV6_ADDR_ANY | IPV6_ADDR_MAPPED));
+ if (sk->sk_family == AF_INET6) {
+ tb2->addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
+ tb2->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
+ } else {
+ tb2->addr_type = IPV6_ADDR_MAPPED;
+ ipv6_addr_set_v4mapped(sk->sk_rcv_saddr, &tb2->v6_rcv_saddr);
+ }
+#else
+ tb2->rcv_saddr = sk->sk_rcv_saddr;
+#endif
+ tb2->fastreuse = 0;
+ tb2->fastreuseport = 0;
+ INIT_HLIST_HEAD(&tb2->owners);
+ hlist_add_head(&tb2->node, &head->chain);
+ hlist_add_head(&tb2->bhash_node, &tb->bhash2);
+}
+
+struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
+ struct net *net,
+ struct inet_bind_hashbucket *head,
+ struct inet_bind_bucket *tb,
+ const struct sock *sk)
+{
+ struct inet_bind2_bucket *tb2 = kmem_cache_alloc(cachep, GFP_ATOMIC);
+
+ if (tb2)
+ inet_bind2_bucket_init(tb2, net, head, tb, sk);
+
+ return tb2;
+}
+
+/* Caller must hold hashbucket lock for this tb with local BH disabled */
+void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
+{
+ const struct sock *sk;
+
if (hlist_empty(&tb->owners)) {
__hlist_del(&tb->node);
+ __hlist_del(&tb->bhash_node);
kmem_cache_free(cachep, tb);
+ return;
}
+
+ if (tb->fastreuse == -1 && tb->fastreuseport == -1)
+ return;
+ sk_for_each_bound(sk, &tb->owners) {
+ if (!sk_is_connect_bind(sk))
+ return;
+ }
+ tb->fastreuse = -1;
+ tb->fastreuseport = -1;
+}
+
+static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
+ const struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr);
+
+ if (tb2->addr_type != IPV6_ADDR_MAPPED)
+ return false;
+#endif
+ return tb2->rcv_saddr == sk->sk_rcv_saddr;
}
void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
- const unsigned short snum)
+ struct inet_bind2_bucket *tb2, unsigned short port)
{
- inet_sk(sk)->inet_num = snum;
- sk_add_bind_node(sk, &tb->owners);
+ inet_sk(sk)->inet_num = port;
inet_csk(sk)->icsk_bind_hash = tb;
+ inet_csk(sk)->icsk_bind2_hash = tb2;
+ sk_add_bind_node(sk, &tb2->owners);
}
/*
@@ -104,18 +211,33 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
*/
static void __inet_put_port(struct sock *sk)
{
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
- const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
- hashinfo->bhash_size);
- struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
+ struct inet_bind_hashbucket *head, *head2;
+ struct net *net = sock_net(sk);
struct inet_bind_bucket *tb;
+ int bhash;
+
+ bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size);
+ head = &hashinfo->bhash[bhash];
+ head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num);
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
- __sk_del_bind_node(sk);
inet_csk(sk)->icsk_bind_hash = NULL;
inet_sk(sk)->inet_num = 0;
- inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ sk->sk_userlocks &= ~SOCK_CONNECT_BIND;
+
+ spin_lock(&head2->lock);
+ if (inet_csk(sk)->icsk_bind2_hash) {
+ struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash;
+
+ __sk_del_bind_node(sk);
+ inet_csk(sk)->icsk_bind2_hash = NULL;
+ inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
+ }
+ spin_unlock(&head2->lock);
+
+ inet_bind_bucket_destroy(tb);
spin_unlock(&head->lock);
}
@@ -129,43 +251,80 @@ EXPORT_SYMBOL(inet_put_port);
int __inet_inherit_port(const struct sock *sk, struct sock *child)
{
- struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
+ struct inet_hashinfo *table = tcp_get_hashinfo(sk);
unsigned short port = inet_sk(child)->inet_num;
- const int bhash = inet_bhashfn(sock_net(sk), port,
- table->bhash_size);
- struct inet_bind_hashbucket *head = &table->bhash[bhash];
+ struct inet_bind_hashbucket *head, *head2;
+ bool created_inet_bind_bucket = false;
+ struct net *net = sock_net(sk);
+ bool update_fastreuse = false;
+ struct inet_bind2_bucket *tb2;
struct inet_bind_bucket *tb;
+ int bhash, l3mdev;
+
+ bhash = inet_bhashfn(net, port, table->bhash_size);
+ head = &table->bhash[bhash];
+ head2 = inet_bhashfn_portaddr(table, child, net, port);
spin_lock(&head->lock);
+ spin_lock(&head2->lock);
tb = inet_csk(sk)->icsk_bind_hash;
- if (unlikely(!tb)) {
+ tb2 = inet_csk(sk)->icsk_bind2_hash;
+ if (unlikely(!tb || !tb2)) {
+ spin_unlock(&head2->lock);
spin_unlock(&head->lock);
return -ENOENT;
}
if (tb->port != port) {
+ l3mdev = inet_sk_bound_l3mdev(sk);
+
/* NOTE: using tproxy and redirecting skbs to a proxy
* on a different listener port breaks the assumption
* that the listener socket's icsk_bind_hash is the same
* as that of the child socket. We have to look up or
* create a new bind bucket for the child here. */
inet_bind_bucket_for_each(tb, &head->chain) {
- if (net_eq(ib_net(tb), sock_net(sk)) &&
- tb->port == port)
+ if (inet_bind_bucket_match(tb, net, port, l3mdev))
break;
}
if (!tb) {
tb = inet_bind_bucket_create(table->bind_bucket_cachep,
- sock_net(sk), head, port);
+ net, head, port, l3mdev);
if (!tb) {
+ spin_unlock(&head2->lock);
spin_unlock(&head->lock);
return -ENOMEM;
}
+ created_inet_bind_bucket = true;
+ }
+ update_fastreuse = true;
+
+ goto bhash2_find;
+ } else if (!inet_bind2_bucket_addr_match(tb2, child)) {
+ l3mdev = inet_sk_bound_l3mdev(sk);
+
+bhash2_find:
+ tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child);
+ if (!tb2) {
+ tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
+ net, head2, tb, child);
+ if (!tb2)
+ goto error;
}
}
- inet_bind_hash(child, tb, port);
+ if (update_fastreuse)
+ inet_csk_update_fastreuse(child, tb, tb2);
+ inet_bind_hash(child, tb, tb2, port);
+ spin_unlock(&head2->lock);
spin_unlock(&head->lock);
return 0;
+
+error:
+ if (created_inet_bind_bucket)
+ inet_bind_bucket_destroy(tb);
+ spin_unlock(&head2->lock);
+ spin_unlock(&head->lock);
+ return -ENOMEM;
}
EXPORT_SYMBOL_GPL(__inet_inherit_port);
@@ -187,72 +346,61 @@ inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
return inet_lhash2_bucket(h, hash);
}
-static void inet_hash2(struct inet_hashinfo *h, struct sock *sk)
-{
- struct inet_listen_hashbucket *ilb2;
-
- if (!h->lhash2)
- return;
-
- ilb2 = inet_lhash2_bucket_sk(h, sk);
-
- spin_lock(&ilb2->lock);
- if (sk->sk_reuseport && sk->sk_family == AF_INET6)
- hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
- &ilb2->head);
- else
- hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node,
- &ilb2->head);
- ilb2->count++;
- spin_unlock(&ilb2->lock);
-}
-
-static void inet_unhash2(struct inet_hashinfo *h, struct sock *sk)
+static inline int compute_score(struct sock *sk, const struct net *net,
+ const unsigned short hnum, const __be32 daddr,
+ const int dif, const int sdif)
{
- struct inet_listen_hashbucket *ilb2;
+ int score = -1;
- if (!h->lhash2 ||
- WARN_ON_ONCE(hlist_unhashed(&inet_csk(sk)->icsk_listen_portaddr_node)))
- return;
+ if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
+ !ipv6_only_sock(sk)) {
+ if (sk->sk_rcv_saddr != daddr)
+ return -1;
- ilb2 = inet_lhash2_bucket_sk(h, sk);
+ if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
+ return -1;
+ score = sk->sk_bound_dev_if ? 2 : 1;
- spin_lock(&ilb2->lock);
- hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node);
- ilb2->count--;
- spin_unlock(&ilb2->lock);
+ if (sk->sk_family == PF_INET)
+ score++;
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
+ score++;
+ }
+ return score;
}
-static inline int compute_score(struct sock *sk, struct net *net,
- const unsigned short hnum, const __be32 daddr,
- const int dif, const int sdif, bool exact_dif)
+/**
+ * inet_lookup_reuseport() - execute reuseport logic on AF_INET socket if necessary.
+ * @net: network namespace.
+ * @sk: AF_INET socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP.
+ * @skb: context for a potential SK_REUSEPORT program.
+ * @doff: header offset.
+ * @saddr: source address.
+ * @sport: source port.
+ * @daddr: destination address.
+ * @hnum: destination port in host byte order.
+ * @ehashfn: hash function used to generate the fallback hash.
+ *
+ * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
+ * the selected sock or an error.
+ */
+struct sock *inet_lookup_reuseport(const struct net *net, struct sock *sk,
+ struct sk_buff *skb, int doff,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned short hnum,
+ inet_ehashfn_t *ehashfn)
{
- int score = -1;
- struct inet_sock *inet = inet_sk(sk);
+ struct sock *reuse_sk = NULL;
+ u32 phash;
- if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
- !ipv6_only_sock(sk)) {
- __be32 rcv_saddr = inet->inet_rcv_saddr;
- score = sk->sk_family == PF_INET ? 2 : 1;
- if (rcv_saddr) {
- if (rcv_saddr != daddr)
- return -1;
- score += 4;
- }
- if (sk->sk_bound_dev_if || exact_dif) {
- bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
- if (!dev_match)
- return -1;
- if (sk->sk_bound_dev_if)
- score += 4;
- }
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
- score++;
+ if (sk->sk_reuseport) {
+ phash = INDIRECT_CALL_2(ehashfn, udp_ehashfn, inet_ehashfn,
+ net, daddr, hnum, saddr, sport);
+ reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
}
- return score;
+ return reuse_sk;
}
+EXPORT_SYMBOL_GPL(inet_lookup_reuseport);
/*
* Here are some nice properties to exploit here. The BSD API
@@ -262,32 +410,25 @@ static inline int compute_score(struct sock *sk, struct net *net,
*/
/* called with rcu_read_lock() : No refcount taken on the socket */
-static struct sock *inet_lhash2_lookup(struct net *net,
+static struct sock *inet_lhash2_lookup(const struct net *net,
struct inet_listen_hashbucket *ilb2,
struct sk_buff *skb, int doff,
const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
const int dif, const int sdif)
{
- bool exact_dif = inet_exact_dif_match(net, skb);
- struct inet_connection_sock *icsk;
struct sock *sk, *result = NULL;
+ struct hlist_nulls_node *node;
int score, hiscore = 0;
- u32 phash = 0;
- inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
- sk = (struct sock *)icsk;
- score = compute_score(sk, net, hnum, daddr,
- dif, sdif, exact_dif);
+ sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
+ score = compute_score(sk, net, hnum, daddr, dif, sdif);
if (score > hiscore) {
- if (sk->sk_reuseport) {
- phash = inet_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, phash,
- skb, doff);
- if (result)
- return result;
- }
+ result = inet_lookup_reuseport(net, sk, skb, doff,
+ saddr, sport, daddr, hnum, inet_ehashfn);
+ if (result)
+ return result;
+
result = sk;
hiscore = score;
}
@@ -296,33 +437,51 @@ static struct sock *inet_lhash2_lookup(struct net *net,
return result;
}
-struct sock *__inet_lookup_listener(struct net *net,
- struct inet_hashinfo *hashinfo,
+struct sock *inet_lookup_run_sk_lookup(const struct net *net,
+ int protocol,
+ struct sk_buff *skb, int doff,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, u16 hnum, const int dif,
+ inet_ehashfn_t *ehashfn)
+{
+ struct sock *sk, *reuse_sk;
+ bool no_reuseport;
+
+ no_reuseport = bpf_sk_lookup_run_v4(net, protocol, saddr, sport,
+ daddr, hnum, dif, &sk);
+ if (no_reuseport || IS_ERR_OR_NULL(sk))
+ return sk;
+
+ reuse_sk = inet_lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum,
+ ehashfn);
+ if (reuse_sk)
+ sk = reuse_sk;
+ return sk;
+}
+
+struct sock *__inet_lookup_listener(const struct net *net,
struct sk_buff *skb, int doff,
const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
const int dif, const int sdif)
{
- unsigned int hash = inet_lhashfn(net, hnum);
- struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- bool exact_dif = inet_exact_dif_match(net, skb);
struct inet_listen_hashbucket *ilb2;
- struct sock *sk, *result = NULL;
- int score, hiscore = 0;
+ struct inet_hashinfo *hashinfo;
+ struct sock *result = NULL;
unsigned int hash2;
- u32 phash = 0;
-
- if (ilb->count <= 10 || !hashinfo->lhash2)
- goto port_lookup;
- /* Too many sk in the ilb bucket (which is hashed by port alone).
- * Try lhash2 (which is hashed by port and addr) instead.
- */
+ /* Lookup redirect from BPF */
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
+ result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
+ saddr, sport, daddr, hnum, dif,
+ inet_ehashfn);
+ if (result)
+ goto done;
+ }
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
hash2 = ipv4_portaddr_hash(net, daddr, hnum);
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
- if (ilb2->count > ilb->count)
- goto port_lookup;
result = inet_lhash2_lookup(net, ilb2, skb, doff,
saddr, sport, daddr, hnum,
@@ -331,36 +490,14 @@ struct sock *__inet_lookup_listener(struct net *net,
goto done;
/* Lookup lhash2 with INADDR_ANY */
-
hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
- if (ilb2->count > ilb->count)
- goto port_lookup;
result = inet_lhash2_lookup(net, ilb2, skb, doff,
- saddr, sport, daddr, hnum,
+ saddr, sport, htonl(INADDR_ANY), hnum,
dif, sdif);
- goto done;
-
-port_lookup:
- sk_for_each_rcu(sk, &ilb->head) {
- score = compute_score(sk, net, hnum, daddr,
- dif, sdif, exact_dif);
- if (score > hiscore) {
- if (sk->sk_reuseport) {
- phash = inet_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, phash,
- skb, doff);
- if (result)
- goto done;
- }
- result = sk;
- hiscore = score;
- }
- }
done:
- if (unlikely(IS_ERR(result)))
+ if (IS_ERR(result))
return NULL;
return result;
}
@@ -387,34 +524,33 @@ void sock_edemux(struct sk_buff *skb)
}
EXPORT_SYMBOL(sock_edemux);
-struct sock *__inet_lookup_established(struct net *net,
- struct inet_hashinfo *hashinfo,
- const __be32 saddr, const __be16 sport,
- const __be32 daddr, const u16 hnum,
- const int dif, const int sdif)
+struct sock *__inet_lookup_established(const struct net *net,
+ const __be32 saddr, const __be16 sport,
+ const __be32 daddr, const u16 hnum,
+ const int dif, const int sdif)
{
- INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
- struct sock *sk;
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
const struct hlist_nulls_node *node;
- /* Optimize here for direct hit, only listening connections can
- * have wildcards anyways.
- */
- unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
- unsigned int slot = hash & hashinfo->ehash_mask;
- struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+ struct inet_ehash_bucket *head;
+ struct inet_hashinfo *hashinfo;
+ unsigned int hash, slot;
+ struct sock *sk;
+
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
+ hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
+ slot = hash & hashinfo->ehash_mask;
+ head = &hashinfo->ehash[slot];
begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
if (sk->sk_hash != hash)
continue;
- if (likely(INET_MATCH(sk, net, acookie,
- saddr, daddr, ports, dif, sdif))) {
+ if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) {
if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
goto out;
- if (unlikely(!INET_MATCH(sk, net, acookie,
- saddr, daddr, ports,
- dif, sdif))) {
+ if (unlikely(!inet_match(net, sk, acookie,
+ ports, dif, sdif))) {
sock_gen_put(sk);
goto begin;
}
@@ -438,7 +574,9 @@ EXPORT_SYMBOL_GPL(__inet_lookup_established);
/* called with local bh disabled */
static int __inet_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk, __u16 lport,
- struct inet_timewait_sock **twp)
+ struct inet_timewait_sock **twp,
+ bool rcu_lookup,
+ u32 hash)
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
struct inet_sock *inet = inet_sk(sk);
@@ -449,25 +587,35 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
int sdif = l3mdev_master_ifindex_by_index(net, dif);
INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
- unsigned int hash = inet_ehashfn(net, daddr, lport,
- saddr, inet->inet_dport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
- spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
- struct sock *sk2;
- const struct hlist_nulls_node *node;
struct inet_timewait_sock *tw = NULL;
+ const struct hlist_nulls_node *node;
+ struct sock *sk2;
+ spinlock_t *lock;
+ if (rcu_lookup) {
+ sk_nulls_for_each(sk2, node, &head->chain) {
+ if (sk2->sk_hash != hash ||
+ !inet_match(net, sk2, acookie, ports, dif, sdif))
+ continue;
+ if (sk2->sk_state == TCP_TIME_WAIT)
+ break;
+ return -EADDRNOTAVAIL;
+ }
+ return 0;
+ }
+
+ lock = inet_ehash_lockp(hinfo, hash);
spin_lock(lock);
sk_nulls_for_each(sk2, node, &head->chain) {
if (sk2->sk_hash != hash)
continue;
- if (likely(INET_MATCH(sk2, net, acookie,
- saddr, daddr, ports, dif, sdif))) {
+ if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
if (sk2->sk_state == TCP_TIME_WAIT) {
tw = inet_twsk(sk2);
- if (twsk_unique(sk, sk2, twp))
+ if (tcp_twsk_unique(sk, sk2, twp))
break;
}
goto not_unique;
@@ -502,7 +650,7 @@ not_unique:
return -EADDRNOTAVAIL;
}
-static u32 inet_sk_port_offset(const struct sock *sk)
+static u64 inet_sk_port_offset(const struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
@@ -511,14 +659,54 @@ static u32 inet_sk_port_offset(const struct sock *sk)
inet->inet_dport);
}
-/* insert a socket into ehash, and eventually remove another one
- * (The another one can be a SYN_RECV or TIMEWAIT
+/* Searches for an existing socket in the ehash bucket list.
+ * Returns true if found, false otherwise.
*/
-bool inet_ehash_insert(struct sock *sk, struct sock *osk)
+static bool inet_ehash_lookup_by_sk(struct sock *sk,
+ struct hlist_nulls_head *list)
{
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
- struct hlist_nulls_head *list;
+ const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
+ const int sdif = sk->sk_bound_dev_if;
+ const int dif = sk->sk_bound_dev_if;
+ const struct hlist_nulls_node *node;
+ struct net *net = sock_net(sk);
+ struct sock *esk;
+
+ INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);
+
+ sk_nulls_for_each_rcu(esk, node, list) {
+ if (esk->sk_hash != sk->sk_hash)
+ continue;
+ if (sk->sk_family == AF_INET) {
+ if (unlikely(inet_match(net, esk, acookie,
+ ports, dif, sdif))) {
+ return true;
+ }
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (sk->sk_family == AF_INET6) {
+ if (unlikely(inet6_match(net, esk,
+ &sk->sk_v6_daddr,
+ &sk->sk_v6_rcv_saddr,
+ ports, dif, sdif))) {
+ return true;
+ }
+ }
+#endif
+ }
+ return false;
+}
+
+/* Insert a socket into ehash, and possibly remove another one
+ * (the other one can be a SYN_RECV or TIMEWAIT).
+ * If a matching socket already exists, sk is not inserted
+ * and the found_dup_sk parameter is set to true.
+ */
+bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
+{
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
struct inet_ehash_bucket *head;
+ struct hlist_nulls_head *list;
spinlock_t *lock;
bool ret = true;
@@ -532,44 +720,56 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk)
spin_lock(lock);
if (osk) {
WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
- ret = sk_nulls_del_node_init_rcu(osk);
+ ret = sk_nulls_replace_node_init_rcu(osk, sk);
+ goto unlock;
}
+
+ if (found_dup_sk) {
+ *found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
+ if (*found_dup_sk)
+ ret = false;
+ }
+
if (ret)
__sk_nulls_add_node_rcu(sk, list);
+
+unlock:
spin_unlock(lock);
+
return ret;
}
-bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
+bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
{
- bool ok = inet_ehash_insert(sk, osk);
+ bool ok = inet_ehash_insert(sk, osk, found_dup_sk);
if (ok) {
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
} else {
- percpu_counter_inc(sk->sk_prot->orphan_count);
+ tcp_orphan_count_inc();
inet_sk_set_state(sk, TCP_CLOSE);
sock_set_flag(sk, SOCK_DEAD);
inet_csk_destroy_sock(sk);
}
return ok;
}
-EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
+EXPORT_IPV6_MOD(inet_ehash_nolisten);
static int inet_reuseport_add_sock(struct sock *sk,
struct inet_listen_hashbucket *ilb)
{
struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
+ const struct hlist_nulls_node *node;
+ kuid_t uid = sk_uid(sk);
struct sock *sk2;
- kuid_t uid = sock_i_uid(sk);
- sk_for_each_rcu(sk2, &ilb->head) {
+ sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
if (sk2 != sk &&
sk2->sk_family == sk->sk_family &&
ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
inet_csk(sk2)->icsk_bind_hash == tb &&
- sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+ sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) &&
inet_rcv_saddr_equal(sk, sk2, false))
return reuseport_add_sock(sk, sk2,
inet_rcv_saddr_any(sk));
@@ -578,195 +778,451 @@ static int inet_reuseport_add_sock(struct sock *sk,
return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}
-int __inet_hash(struct sock *sk, struct sock *osk)
+int inet_hash(struct sock *sk)
{
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
- struct inet_listen_hashbucket *ilb;
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
+ struct inet_listen_hashbucket *ilb2;
int err = 0;
+ if (sk->sk_state == TCP_CLOSE)
+ return 0;
+
if (sk->sk_state != TCP_LISTEN) {
- inet_ehash_nolisten(sk, osk);
+ local_bh_disable();
+ inet_ehash_nolisten(sk, NULL, NULL);
+ local_bh_enable();
return 0;
}
WARN_ON(!sk_unhashed(sk));
- ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
+ ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
- spin_lock(&ilb->lock);
+ spin_lock(&ilb2->lock);
if (sk->sk_reuseport) {
- err = inet_reuseport_add_sock(sk, ilb);
+ err = inet_reuseport_add_sock(sk, ilb2);
if (err)
goto unlock;
}
+ sock_set_flag(sk, SOCK_RCU_FREE);
if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
sk->sk_family == AF_INET6)
- hlist_add_tail_rcu(&sk->sk_node, &ilb->head);
+ __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
else
- hlist_add_head_rcu(&sk->sk_node, &ilb->head);
- inet_hash2(hashinfo, sk);
- ilb->count++;
- sock_set_flag(sk, SOCK_RCU_FREE);
+ __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
- spin_unlock(&ilb->lock);
+ spin_unlock(&ilb2->lock);
return err;
}
-EXPORT_SYMBOL(__inet_hash);
+EXPORT_IPV6_MOD(inet_hash);
-int inet_hash(struct sock *sk)
+void inet_unhash(struct sock *sk)
{
- int err = 0;
+ struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
- if (sk->sk_state != TCP_CLOSE) {
- local_bh_disable();
- err = __inet_hash(sk, NULL);
- local_bh_enable();
+ if (sk_unhashed(sk))
+ return;
+
+ sock_rps_delete_flow(sk);
+ if (sk->sk_state == TCP_LISTEN) {
+ struct inet_listen_hashbucket *ilb2;
+
+ ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
+ /* Don't disable bottom halves while acquiring the lock to
+ * avoid circular locking dependency on PREEMPT_RT.
+ */
+ spin_lock(&ilb2->lock);
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_stop_listen_sock(sk);
+
+ __sk_nulls_del_node_init_rcu(sk);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ spin_unlock(&ilb2->lock);
+ } else {
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+
+ spin_lock_bh(lock);
+ __sk_nulls_del_node_init_rcu(sk);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ spin_unlock_bh(lock);
}
+}
+EXPORT_IPV6_MOD(inet_unhash);
- return err;
+static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb,
+ const struct net *net, unsigned short port,
+ int l3mdev, const struct sock *sk)
+{
+ if (!net_eq(ib2_net(tb), net) || tb->port != port ||
+ tb->l3mdev != l3mdev)
+ return false;
+
+ return inet_bind2_bucket_addr_match(tb, sk);
}
-EXPORT_SYMBOL_GPL(inet_hash);
-void inet_unhash(struct sock *sk)
+bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net,
+ unsigned short port, int l3mdev, const struct sock *sk)
{
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
- struct inet_listen_hashbucket *ilb = NULL;
- spinlock_t *lock;
+ if (!net_eq(ib2_net(tb), net) || tb->port != port ||
+ tb->l3mdev != l3mdev)
+ return false;
- if (sk_unhashed(sk))
- return;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tb->addr_type == IPV6_ADDR_ANY)
+ return true;
- if (sk->sk_state == TCP_LISTEN) {
- ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
- lock = &ilb->lock;
- } else {
- lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
+ if (tb->addr_type != IPV6_ADDR_MAPPED)
+ return false;
+
+ if (sk->sk_family == AF_INET6 &&
+ !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
+ return false;
+#endif
+ return tb->rcv_saddr == 0;
+}
+
+/* The socket's bhash2 hashbucket spinlock must be held when this is called */
+struct inet_bind2_bucket *
+inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net,
+ unsigned short port, int l3mdev, const struct sock *sk)
+{
+ struct inet_bind2_bucket *bhash2 = NULL;
+
+ inet_bind_bucket_for_each(bhash2, &head->chain)
+ if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk))
+ break;
+
+ return bhash2;
+}
+
+struct inet_bind_hashbucket *
+inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port)
+{
+ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
+ u32 hash;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ hash = ipv6_portaddr_hash(net, &in6addr_any, port);
+ else
+#endif
+ hash = ipv4_portaddr_hash(net, 0, port);
+
+ return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
+}
+
+static void inet_update_saddr(struct sock *sk, void *saddr, int family)
+{
+ if (family == AF_INET) {
+ inet_sk(sk)->inet_saddr = *(__be32 *)saddr;
+ sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr);
}
- spin_lock_bh(lock);
- if (sk_unhashed(sk))
- goto unlock;
+#if IS_ENABLED(CONFIG_IPV6)
+ else {
+ sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr;
+ }
+#endif
+}
- if (rcu_access_pointer(sk->sk_reuseport_cb))
- reuseport_detach_sock(sk);
- if (ilb) {
- inet_unhash2(hashinfo, sk);
- __sk_del_node_init(sk);
- ilb->count--;
- } else {
- __sk_nulls_del_node_init_rcu(sk);
+static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
+{
+ struct inet_hashinfo *hinfo = tcp_get_hashinfo(sk);
+ struct inet_bind_hashbucket *head, *head2;
+ struct inet_bind2_bucket *tb2, *new_tb2;
+ int l3mdev = inet_sk_bound_l3mdev(sk);
+ int port = inet_sk(sk)->inet_num;
+ struct net *net = sock_net(sk);
+ int bhash;
+
+ if (!inet_csk(sk)->icsk_bind2_hash) {
+ /* Not bind()ed before. */
+ if (reset)
+ inet_reset_saddr(sk);
+ else
+ inet_update_saddr(sk, saddr, family);
+
+ return 0;
}
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
-unlock:
- spin_unlock_bh(lock);
+
+ /* Allocate a bind2 bucket ahead of time to avoid permanently putting
+ * the bhash2 table in an inconsistent state if a new tb2 bucket
+ * allocation fails.
+ */
+ new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC);
+ if (!new_tb2) {
+ if (reset) {
+ /* The (INADDR_ANY, port) bucket might have already
+ * been freed, then we cannot fixup icsk_bind2_hash,
+ * so we give up and unlink sk from bhash/bhash2 not
+ * to leave inconsistency in bhash2.
+ */
+ inet_put_port(sk);
+ inet_reset_saddr(sk);
+ }
+
+ return -ENOMEM;
+ }
+
+ bhash = inet_bhashfn(net, port, hinfo->bhash_size);
+ head = &hinfo->bhash[bhash];
+ head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
+
+ /* If we change saddr locklessly, another thread
+ * iterating over bhash might see corrupted address.
+ */
+ spin_lock_bh(&head->lock);
+
+ spin_lock(&head2->lock);
+ __sk_del_bind_node(sk);
+ inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash);
+ spin_unlock(&head2->lock);
+
+ if (reset)
+ inet_reset_saddr(sk);
+ else
+ inet_update_saddr(sk, saddr, family);
+
+ head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
+
+ spin_lock(&head2->lock);
+ tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
+ if (!tb2) {
+ tb2 = new_tb2;
+ inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk);
+ if (sk_is_connect_bind(sk)) {
+ tb2->fastreuse = -1;
+ tb2->fastreuseport = -1;
+ }
+ }
+ inet_csk(sk)->icsk_bind2_hash = tb2;
+ sk_add_bind_node(sk, &tb2->owners);
+ spin_unlock(&head2->lock);
+
+ spin_unlock_bh(&head->lock);
+
+ if (tb2 != new_tb2)
+ kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2);
+
+ return 0;
+}
+
+int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
+{
+ return __inet_bhash2_update_saddr(sk, saddr, family, false);
+}
+EXPORT_IPV6_MOD(inet_bhash2_update_saddr);
+
+void inet_bhash2_reset_saddr(struct sock *sk)
+{
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ __inet_bhash2_update_saddr(sk, NULL, 0, true);
}
-EXPORT_SYMBOL_GPL(inet_unhash);
+EXPORT_IPV6_MOD(inet_bhash2_reset_saddr);
+
+/* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm
+ * Note that we use 32bit integers (vs RFC 'short integers')
+ * because 2^16 is not a multiple of num_ephemeral and this
+ * property might be used by a clever attacker.
+ *
+ * RFC claims using TABLE_LENGTH=10 buckets gives an improvement, though
+ * attacks have since been demonstrated, thus we use 65536 by default instead
+ * to really give more isolation and privacy, at the expense of 256kB
+ * of kernel memory.
+ */
+#define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER)
+static u32 *table_perturb;
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
- struct sock *sk, u32 port_offset,
+ struct sock *sk, u64 port_offset,
+ u32 hash_port0,
int (*check_established)(struct inet_timewait_death_row *,
- struct sock *, __u16, struct inet_timewait_sock **))
+ struct sock *, __u16, struct inet_timewait_sock **,
+ bool rcu_lookup, u32 hash))
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
+ struct inet_bind_hashbucket *head, *head2;
struct inet_timewait_sock *tw = NULL;
- struct inet_bind_hashbucket *head;
int port = inet_sk(sk)->inet_num;
struct net *net = sock_net(sk);
+ struct inet_bind2_bucket *tb2;
struct inet_bind_bucket *tb;
+ bool tb_created = false;
u32 remaining, offset;
int ret, i, low, high;
- static u32 hint;
+ bool local_ports;
+ int step, l3mdev;
+ u32 index;
if (port) {
- head = &hinfo->bhash[inet_bhashfn(net, port,
- hinfo->bhash_size)];
- tb = inet_csk(sk)->icsk_bind_hash;
- spin_lock_bh(&head->lock);
- if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- inet_ehash_nolisten(sk, NULL);
- spin_unlock_bh(&head->lock);
- return 0;
- }
- spin_unlock(&head->lock);
- /* No definite answer... Walk to established hash table */
- ret = check_established(death_row, sk, port, NULL);
+ local_bh_disable();
+ ret = check_established(death_row, sk, port, NULL, false,
+ hash_port0 + port);
local_bh_enable();
return ret;
}
- inet_get_local_port_range(net, &low, &high);
+ l3mdev = inet_sk_bound_l3mdev(sk);
+
+ local_ports = inet_sk_get_local_port_range(sk, &low, &high);
+ step = local_ports ? 1 : 2;
+
high++; /* [32768, 60999] -> [32768, 61000[ */
remaining = high - low;
- if (likely(remaining > 1))
+ if (!local_ports && remaining > 1)
remaining &= ~1U;
- offset = (hint + port_offset) % remaining;
+ get_random_sleepable_once(table_perturb,
+ INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
+ index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);
+
+ offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
+ offset %= remaining;
+
/* In first pass we try ports of @low parity.
* inet_csk_get_port() does the opposite choice.
*/
- offset &= ~1U;
+ if (!local_ports)
+ offset &= ~1U;
other_parity_scan:
port = low + offset;
- for (i = 0; i < remaining; i += 2, port += 2) {
+ for (i = 0; i < remaining; i += step, port += step) {
if (unlikely(port >= high))
port -= remaining;
if (inet_is_local_reserved_port(net, port))
continue;
head = &hinfo->bhash[inet_bhashfn(net, port,
hinfo->bhash_size)];
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tb, &head->chain, node) {
+ if (!inet_bind_bucket_match(tb, net, port, l3mdev))
+ continue;
+ if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) {
+ rcu_read_unlock();
+ goto next_port;
+ }
+ if (!check_established(death_row, sk, port, &tw, true,
+ hash_port0 + port))
+ break;
+ rcu_read_unlock();
+ goto next_port;
+ }
+ rcu_read_unlock();
+
spin_lock_bh(&head->lock);
/* Does not bother with rcv_saddr checks, because
* the established check is already unique enough.
*/
inet_bind_bucket_for_each(tb, &head->chain) {
- if (net_eq(ib_net(tb), net) && tb->port == port) {
+ if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
if (tb->fastreuse >= 0 ||
tb->fastreuseport >= 0)
- goto next_port;
- WARN_ON(hlist_empty(&tb->owners));
+ goto next_port_unlock;
+ WARN_ON(hlist_empty(&tb->bhash2));
if (!check_established(death_row, sk,
- port, &tw))
+ port, &tw, false,
+ hash_port0 + port))
goto ok;
- goto next_port;
+ goto next_port_unlock;
}
}
tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
- net, head, port);
+ net, head, port, l3mdev);
if (!tb) {
spin_unlock_bh(&head->lock);
return -ENOMEM;
}
+ tb_created = true;
tb->fastreuse = -1;
tb->fastreuseport = -1;
goto ok;
-next_port:
+next_port_unlock:
spin_unlock_bh(&head->lock);
+next_port:
cond_resched();
}
- offset++;
- if ((offset & 1) && remaining > 1)
- goto other_parity_scan;
-
+ if (!local_ports) {
+ offset++;
+ if ((offset & 1) && remaining > 1)
+ goto other_parity_scan;
+ }
return -EADDRNOTAVAIL;
ok:
- hint += i + 2;
+ /* Find the corresponding tb2 bucket since we need to
+ * add the socket to the bhash2 table as well
+ */
+ head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
+ spin_lock(&head2->lock);
+
+ tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
+ if (!tb2) {
+ tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net,
+ head2, tb, sk);
+ if (!tb2)
+ goto error;
+ tb2->fastreuse = -1;
+ tb2->fastreuseport = -1;
+ }
+
+ /* Here we want to add a little bit of randomness to the next source
+ * port that will be chosen. We use a max() with a random here so that
+ * on low contention the randomness is maximal and on high contention
+ * it may be inexistent.
+ */
+ i = max_t(int, i, get_random_u32_below(8) * step);
+ WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + step);
/* Head lock still held and bh's disabled */
- inet_bind_hash(sk, tb, port);
+ inet_bind_hash(sk, tb, tb2, port);
+ sk->sk_userlocks |= SOCK_CONNECT_BIND;
+
if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port);
- inet_ehash_nolisten(sk, (struct sock *)tw);
+ inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
}
if (tw)
inet_twsk_bind_unhash(tw, hinfo);
+
+ spin_unlock(&head2->lock);
spin_unlock(&head->lock);
+
if (tw)
inet_twsk_deschedule_put(tw);
local_bh_enable();
return 0;
+
+error:
+ if (sk_hashed(sk)) {
+ spinlock_t *lock = inet_ehash_lockp(hinfo, sk->sk_hash);
+
+ sock_prot_inuse_add(net, sk->sk_prot, -1);
+
+ spin_lock(lock);
+ __sk_nulls_del_node_init_rcu(sk);
+ spin_unlock(lock);
+
+ sk->sk_hash = 0;
+ inet_sk(sk)->inet_sport = 0;
+ inet_sk(sk)->inet_num = 0;
+
+ if (tw)
+ inet_twsk_bind_unhash(tw, hinfo);
+ }
+
+ spin_unlock(&head2->lock);
+ if (tb_created)
+ inet_bind_bucket_destroy(tb);
+ spin_unlock(&head->lock);
+
+ if (tw)
+ inet_twsk_deschedule_put(tw);
+
+ local_bh_enable();
+
+ return -ENOMEM;
}
/*
@@ -775,36 +1231,37 @@ ok:
int inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk)
{
- u32 port_offset = 0;
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct net *net = sock_net(sk);
+ u64 port_offset = 0;
+ u32 hash_port0;
if (!inet_sk(sk)->inet_num)
port_offset = inet_sk_port_offset(sk);
- return __inet_hash_connect(death_row, sk, port_offset,
+
+ hash_port0 = inet_ehashfn(net, inet->inet_rcv_saddr, 0,
+ inet->inet_daddr, inet->inet_dport);
+
+ return __inet_hash_connect(death_row, sk, port_offset, hash_port0,
__inet_check_established);
}
-EXPORT_SYMBOL_GPL(inet_hash_connect);
-void inet_hashinfo_init(struct inet_hashinfo *h)
+static void init_hashinfo_lhash2(struct inet_hashinfo *h)
{
int i;
- for (i = 0; i < INET_LHTABLE_SIZE; i++) {
- spin_lock_init(&h->listening_hash[i].lock);
- INIT_HLIST_HEAD(&h->listening_hash[i].head);
- h->listening_hash[i].count = 0;
+ for (i = 0; i <= h->lhash2_mask; i++) {
+ spin_lock_init(&h->lhash2[i].lock);
+ INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head,
+ i + LISTENING_NULLS_BASE);
}
-
- h->lhash2 = NULL;
}
-EXPORT_SYMBOL_GPL(inet_hashinfo_init);
void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
unsigned long numentries, int scale,
unsigned long low_limit,
unsigned long high_limit)
{
- unsigned int i;
-
h->lhash2 = alloc_large_system_hash(name,
sizeof(*h->lhash2),
numentries,
@@ -814,35 +1271,111 @@ void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
&h->lhash2_mask,
low_limit,
high_limit);
+ init_hashinfo_lhash2(h);
+
+ /* this one is used for source ports of outgoing connections */
+ table_perturb = alloc_large_system_hash("Table-perturb",
+ sizeof(*table_perturb),
+ INET_TABLE_PERTURB_SIZE,
+ 0, 0, NULL, NULL,
+ INET_TABLE_PERTURB_SIZE,
+ INET_TABLE_PERTURB_SIZE);
+}
- for (i = 0; i <= h->lhash2_mask; i++) {
- spin_lock_init(&h->lhash2[i].lock);
- INIT_HLIST_HEAD(&h->lhash2[i].head);
- h->lhash2[i].count = 0;
- }
+int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
+{
+ h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL);
+ if (!h->lhash2)
+ return -ENOMEM;
+
+ h->lhash2_mask = INET_LHTABLE_SIZE - 1;
+ /* INET_LHTABLE_SIZE must be a power of 2 */
+ BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask);
+
+ init_hashinfo_lhash2(h);
+ return 0;
}
int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
{
unsigned int locksz = sizeof(spinlock_t);
unsigned int i, nblocks = 1;
+ spinlock_t *ptr = NULL;
- if (locksz != 0) {
- /* allocate 2 cache lines or at least one spinlock per cpu */
- nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
- nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());
+ if (locksz == 0)
+ goto set_mask;
- /* no more locks than number of hash buckets */
- nblocks = min(nblocks, hashinfo->ehash_mask + 1);
+ /* Allocate 2 cache lines or at least one spinlock per cpu. */
+ nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus();
- hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
- if (!hashinfo->ehash_locks)
- return -ENOMEM;
+ /* At least one page per NUMA node. */
+ nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz);
+
+ nblocks = roundup_pow_of_two(nblocks);
- for (i = 0; i < nblocks; i++)
- spin_lock_init(&hashinfo->ehash_locks[i]);
+ /* No more locks than number of hash buckets. */
+ nblocks = min(nblocks, hashinfo->ehash_mask + 1);
+
+ if (num_online_nodes() > 1) {
+ /* Use vmalloc() to allow NUMA policy to spread pages
+ * on all available nodes if desired.
+ */
+ ptr = vmalloc_array(nblocks, locksz);
+ }
+ if (!ptr) {
+ ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
}
+ for (i = 0; i < nblocks; i++)
+ spin_lock_init(&ptr[i]);
+ hashinfo->ehash_locks = ptr;
+set_mask:
hashinfo->ehash_locks_mask = nblocks - 1;
return 0;
}
-EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);
+
+struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
+ unsigned int ehash_entries)
+{
+ struct inet_hashinfo *new_hashinfo;
+ int i;
+
+ new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL);
+ if (!new_hashinfo)
+ goto err;
+
+ new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket),
+ GFP_KERNEL_ACCOUNT);
+ if (!new_hashinfo->ehash)
+ goto free_hashinfo;
+
+ new_hashinfo->ehash_mask = ehash_entries - 1;
+
+ if (inet_ehash_locks_alloc(new_hashinfo))
+ goto free_ehash;
+
+ for (i = 0; i < ehash_entries; i++)
+ INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i);
+
+ new_hashinfo->pernet = true;
+
+ return new_hashinfo;
+
+free_ehash:
+ vfree(new_hashinfo->ehash);
+free_hashinfo:
+ kfree(new_hashinfo);
+err:
+ return NULL;
+}
+
+void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo)
+{
+ if (!hashinfo->pernet)
+ return;
+
+ inet_ehash_locks_free(hashinfo);
+ vfree(hashinfo->ehash);
+ kfree(hashinfo);
+}
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 88c5069b5d20..d4c781a0667f 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -14,7 +15,8 @@
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
-
+#include <net/tcp.h>
+#include <net/psp.h>
/**
* inet_twsk_bind_unhash - unhash a timewait socket from bind hash
@@ -28,14 +30,18 @@
void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
struct inet_hashinfo *hashinfo)
{
+ struct inet_bind2_bucket *tb2 = tw->tw_tb2;
struct inet_bind_bucket *tb = tw->tw_tb;
if (!tb)
return;
- __hlist_del(&tw->tw_bind_node);
+ __sk_del_bind_node((struct sock *)tw);
tw->tw_tb = NULL;
- inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ tw->tw_tb2 = NULL;
+ inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
+ inet_bind_bucket_destroy(tb);
+
__sock_put((struct sock *)tw);
}
@@ -44,7 +50,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
{
struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
- struct inet_bind_hashbucket *bhead;
+ struct inet_bind_hashbucket *bhead, *bhead2;
spin_lock(lock);
sk_nulls_del_node_init_rcu((struct sock *)tw);
@@ -53,22 +59,24 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
/* Disassociate with bind bucket. */
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
hashinfo->bhash_size)];
+ bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw,
+ twsk_net(tw), tw->tw_num);
spin_lock(&bhead->lock);
+ spin_lock(&bhead2->lock);
inet_twsk_bind_unhash(tw, hashinfo);
+ spin_unlock(&bhead2->lock);
spin_unlock(&bhead->lock);
- atomic_dec(&tw->tw_dr->tw_count);
+ refcount_dec(&tw->tw_dr->tw_refcount);
inet_twsk_put(tw);
}
void inet_twsk_free(struct inet_timewait_sock *tw)
{
struct module *owner = tw->tw_prot->owner;
- twsk_destructor((struct sock *)tw);
-#ifdef SOCK_REFCNT_DEBUG
- pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw);
-#endif
+
+ tcp_twsk_destructor((struct sock *)tw);
kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
module_put(owner);
}
@@ -80,74 +88,80 @@ void inet_twsk_put(struct inet_timewait_sock *tw)
}
EXPORT_SYMBOL_GPL(inet_twsk_put);
-static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
- struct hlist_nulls_head *list)
+static void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo)
{
- hlist_nulls_add_head_rcu(&tw->tw_node, list);
-}
-
-static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
- struct hlist_head *list)
-{
- hlist_add_head(&tw->tw_bind_node, list);
+ __inet_twsk_schedule(tw, timeo, false);
}
/*
- * Enter the time wait state. This is called with locally disabled BH.
+ * Enter the time wait state.
* Essentially we whip up a timewait bucket, copy the relevant info into it
* from the SK, and mess with hash chains and list linkage.
+ *
+ * The caller must not access @tw anymore after this function returns.
*/
-void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
- struct inet_hashinfo *hashinfo)
+void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw,
+ struct sock *sk,
+ struct inet_hashinfo *hashinfo,
+ int timeo)
{
const struct inet_sock *inet = inet_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
- struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
- struct inet_bind_hashbucket *bhead;
- /* Step 1: Put TW into bind hash. Original socket stays there too.
- Note, that any socket with inet->num != 0 MUST be bound in
- binding cache, even if it is closed.
+ struct inet_bind_hashbucket *bhead, *bhead2;
+
+ /* Put TW into bind hash. Original socket stays there too.
+ * Note, that any socket with inet->num != 0 MUST be bound in
+ * binding cache, even if it is closed.
*/
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
hashinfo->bhash_size)];
+ bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num);
+
+ local_bh_disable();
spin_lock(&bhead->lock);
+ spin_lock(&bhead2->lock);
+
tw->tw_tb = icsk->icsk_bind_hash;
WARN_ON(!icsk->icsk_bind_hash);
- inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);
- spin_unlock(&bhead->lock);
-
- spin_lock(lock);
- inet_twsk_add_node_rcu(tw, &ehead->chain);
+ tw->tw_tb2 = icsk->icsk_bind2_hash;
+ WARN_ON(!icsk->icsk_bind2_hash);
+ sk_add_bind_node((struct sock *)tw, &tw->tw_tb2->owners);
- /* Step 3: Remove SK from hash chain */
- if (__sk_nulls_del_node_init_rcu(sk))
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ spin_unlock(&bhead2->lock);
+ spin_unlock(&bhead->lock);
- spin_unlock(lock);
+ spin_lock(lock);
/* tw_refcnt is set to 3 because we have :
* - one reference for bhash chain.
* - one reference for ehash chain.
* - one reference for timer.
- * We can use atomic_set() because prior spin_lock()/spin_unlock()
- * committed into memory all tw fields.
* Also note that after this point, we lost our implicit reference
* so we are not allowed to use tw anymore.
*/
refcount_set(&tw->tw_refcnt, 3);
+
+ /* Ensure tw_refcnt has been set before tw is published.
+ * smp_wmb() provides the necessary memory barrier to enforce this
+ * ordering.
+ */
+ smp_wmb();
+
+ hlist_nulls_replace_init_rcu(&sk->sk_nulls_node, &tw->tw_node);
+ sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+ inet_twsk_schedule(tw, timeo);
+
+ spin_unlock(lock);
+ local_bh_enable();
}
-EXPORT_SYMBOL_GPL(inet_twsk_hashdance);
static void tw_timer_handler(struct timer_list *t)
{
- struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer);
+ struct inet_timewait_sock *tw = timer_container_of(tw, t, tw_timer);
- if (tw->tw_kill)
- __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED);
- else
- __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITED);
inet_twsk_kill(tw);
}
@@ -157,7 +171,8 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
{
struct inet_timewait_sock *tw;
- if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets)
+ if (refcount_read(&dr->tw_refcount) - 1 >=
+ READ_ONCE(dr->sysctl_max_tw_buckets))
return NULL;
tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
@@ -181,11 +196,15 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
tw->tw_reuseport = sk->sk_reuseport;
tw->tw_hash = sk->sk_hash;
tw->tw_ipv6only = 0;
- tw->tw_transparent = inet->transparent;
+ tw->tw_transparent = inet_test_bit(TRANSPARENT, sk);
+ tw->tw_connect_bind = !!(sk->sk_userlocks & SOCK_CONNECT_BIND);
tw->tw_prot = sk->sk_prot_creator;
atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
twsk_net_set(tw, sock_net(sk));
- timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED);
+ timer_setup(&tw->tw_timer, tw_timer_handler, 0);
+#ifdef CONFIG_SOCK_VALIDATE_XMIT
+ tw->tw_validate_xmit_skb = NULL;
+#endif
/*
* Because we use RCU lookups, we should not set tw_refcnt
* to a non null value before everything is setup for this
@@ -194,11 +213,11 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
refcount_set(&tw->tw_refcnt, 0);
__module_get(tw->tw_prot->owner);
+ psp_twsk_init(tw, sk);
}
return tw;
}
-EXPORT_SYMBOL_GPL(inet_twsk_alloc);
/* These are always called from BH context. See callers in
* tcp_input.c to verify this.
@@ -210,7 +229,34 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc);
*/
void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)
{
- if (del_timer_sync(&tw->tw_timer))
+ struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
+
+ /* inet_twsk_purge() walks over all sockets, including tw ones,
+ * and removes them via inet_twsk_deschedule_put() after a
+ * refcount_inc_not_zero().
+ *
+ * inet_twsk_hashdance_schedule() must (re)init the refcount before
+ * arming the timer, i.e. inet_twsk_purge can obtain a reference to
+ * a twsk that did not yet schedule the timer.
+ *
+ * The ehash lock synchronizes these two:
+ * After acquiring the lock, the timer is always scheduled (else
+ * timer_shutdown returns false), because hashdance_schedule releases
+ * the ehash lock only after completing the timer initialization.
+ *
+ * Without grabbing the ehash lock, we get:
+ * 1) cpu x sets twsk refcount to 3
+ * 2) cpu y bumps refcount to 4
+ * 3) cpu y calls inet_twsk_deschedule_put() and shuts timer down
+ * 4) cpu x tries to start timer, but mod_timer is a noop post-shutdown
+ * -> timer refcount is never decremented.
+ */
+ spin_lock(lock);
+ /* Makes sure hashdance_schedule() has completed */
+ spin_unlock(lock);
+
+ if (timer_shutdown_sync(&tw->tw_timer))
inet_twsk_kill(tw);
inet_twsk_put(tw);
}
@@ -243,49 +289,63 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
* of PAWS.
*/
- tw->tw_kill = timeo <= 4*HZ;
if (!rearm) {
+ bool kill = timeo <= 4*HZ;
+
+ __NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED :
+ LINUX_MIB_TIMEWAITED);
BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo));
- atomic_inc(&tw->tw_dr->tw_count);
+ refcount_inc(&tw->tw_dr->tw_refcount);
} else {
mod_timer_pending(&tw->tw_timer, jiffies + timeo);
}
}
-EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
-void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
+/* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */
+void inet_twsk_purge(struct inet_hashinfo *hashinfo)
{
- struct inet_timewait_sock *tw;
- struct sock *sk;
+ struct inet_ehash_bucket *head = &hashinfo->ehash[0];
+ unsigned int ehash_mask = hashinfo->ehash_mask;
struct hlist_nulls_node *node;
unsigned int slot;
+ struct sock *sk;
+
+ for (slot = 0; slot <= ehash_mask; slot++, head++) {
+ if (hlist_nulls_empty(&head->chain))
+ continue;
- for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
- struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
restart_rcu:
cond_resched();
rcu_read_lock();
restart:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
- if (sk->sk_state != TCP_TIME_WAIT)
+ int state = inet_sk_state_load(sk);
+
+ if ((1 << state) & ~(TCPF_TIME_WAIT |
+ TCPF_NEW_SYN_RECV))
continue;
- tw = inet_twsk(sk);
- if ((tw->tw_family != family) ||
- refcount_read(&twsk_net(tw)->count))
+
+ if (check_net(sock_net(sk)))
continue;
- if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt)))
+ if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
continue;
- if (unlikely((tw->tw_family != family) ||
- refcount_read(&twsk_net(tw)->count))) {
- inet_twsk_put(tw);
+ if (check_net(sock_net(sk))) {
+ sock_gen_put(sk);
goto restart;
}
rcu_read_unlock();
local_bh_disable();
- inet_twsk_deschedule_put(tw);
+ if (state == TCP_TIME_WAIT) {
+ inet_twsk_deschedule_put(inet_twsk(sk));
+ } else {
+ struct request_sock *req = inet_reqsk(sk);
+
+ inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
+ req);
+ }
local_bh_enable();
goto restart_rcu;
}
@@ -298,4 +358,3 @@ restart:
rcu_read_unlock();
}
}
-EXPORT_SYMBOL_GPL(inet_twsk_purge);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index d757b9642d0d..7b1e0a2d6906 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -60,12 +60,12 @@ void inet_peer_base_init(struct inet_peer_base *bp)
seqlock_init(&bp->lock);
bp->total = 0;
}
-EXPORT_SYMBOL_GPL(inet_peer_base_init);
+EXPORT_IPV6_MOD_GPL(inet_peer_base_init);
#define PEER_MAX_GC 32
/* Exported for sysctl_net_ipv4. */
-int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries more
+int inet_peer_threshold __read_mostly; /* start to throw entries more
* aggressively at this stage */
int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */
int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */
@@ -73,25 +73,15 @@ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min
/* Called from ip_output.c:ip_init */
void __init inet_initpeers(void)
{
- struct sysinfo si;
+ u64 nr_entries;
- /* Use the straight interface to information about memory. */
- si_meminfo(&si);
- /* The values below were suggested by Alexey Kuznetsov
- * <kuznet@ms2.inr.ac.ru>. I don't have any opinion about the values
- * myself. --SAW
- */
- if (si.totalram <= (32768*1024)/PAGE_SIZE)
- inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
- if (si.totalram <= (16384*1024)/PAGE_SIZE)
- inet_peer_threshold >>= 1; /* about 512KB */
- if (si.totalram <= (8192*1024)/PAGE_SIZE)
- inet_peer_threshold >>= 2; /* about 128KB */
-
- peer_cachep = kmem_cache_create("inet_peer_cache",
- sizeof(struct inet_peer),
- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
- NULL);
+ /* 1% of physical memory */
+ nr_entries = div64_ul((u64)totalram_pages() << PAGE_SHIFT,
+ 100 * L1_CACHE_ALIGN(sizeof(struct inet_peer)));
+
+ inet_peer_threshold = clamp_val(nr_entries, 4096, 65536 + 128);
+
+ peer_cachep = KMEM_CACHE(inet_peer, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
}
/* Called with rcu_read_lock() or base->lock held */
@@ -105,6 +95,7 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
{
struct rb_node **pp, *parent, *next;
struct inet_peer *p;
+ u32 now;
pp = &base->rb_root.rb_node;
parent = NULL;
@@ -118,8 +109,9 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
p = rb_entry(parent, struct inet_peer, rb_node);
cmp = inetpeer_addr_cmp(daddr, &p->daddr);
if (cmp == 0) {
- if (!refcount_inc_not_zero(&p->refcnt))
- break;
+ now = jiffies;
+ if (READ_ONCE(p->dtime) != now)
+ WRITE_ONCE(p->dtime, now);
return p;
}
if (gc_stack) {
@@ -138,29 +130,30 @@ static struct inet_peer *lookup(const struct inetpeer_addr *daddr,
return NULL;
}
-static void inetpeer_free_rcu(struct rcu_head *head)
-{
- kmem_cache_free(peer_cachep, container_of(head, struct inet_peer, rcu));
-}
-
/* perform garbage collect on all items stacked during a lookup */
static void inet_peer_gc(struct inet_peer_base *base,
struct inet_peer *gc_stack[],
unsigned int gc_cnt)
{
+ int peer_threshold, peer_maxttl, peer_minttl;
struct inet_peer *p;
__u32 delta, ttl;
int i;
- if (base->total >= inet_peer_threshold)
+ peer_threshold = READ_ONCE(inet_peer_threshold);
+ peer_maxttl = READ_ONCE(inet_peer_maxttl);
+ peer_minttl = READ_ONCE(inet_peer_minttl);
+
+ if (base->total >= peer_threshold)
ttl = 0; /* be aggressive */
else
- ttl = inet_peer_maxttl
- - (inet_peer_maxttl - inet_peer_minttl) / HZ *
- base->total / inet_peer_threshold * HZ;
+ ttl = peer_maxttl - (peer_maxttl - peer_minttl) / HZ *
+ base->total / peer_threshold * HZ;
for (i = 0; i < gc_cnt; i++) {
p = gc_stack[i];
- delta = (__u32)jiffies - p->dtime;
+
+ delta = (__u32)jiffies - READ_ONCE(p->dtime);
+
if (delta < ttl || !refcount_dec_if_one(&p->refcnt))
gc_stack[i] = NULL;
}
@@ -169,36 +162,28 @@ static void inet_peer_gc(struct inet_peer_base *base,
if (p) {
rb_erase(&p->rb_node, &base->rb_root);
base->total--;
- call_rcu(&p->rcu, inetpeer_free_rcu);
+ kfree_rcu(p, rcu);
}
}
}
+/* Must be called under RCU : No refcount change is done here. */
struct inet_peer *inet_getpeer(struct inet_peer_base *base,
- const struct inetpeer_addr *daddr,
- int create)
+ const struct inetpeer_addr *daddr)
{
struct inet_peer *p, *gc_stack[PEER_MAX_GC];
struct rb_node **pp, *parent;
unsigned int gc_cnt, seq;
- int invalidated;
/* Attempt a lockless lookup first.
* Because of a concurrent writer, we might not find an existing entry.
*/
- rcu_read_lock();
seq = read_seqbegin(&base->lock);
p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp);
- invalidated = read_seqretry(&base->lock, seq);
- rcu_read_unlock();
if (p)
return p;
- /* If no writer did a change during our lookup, we can return early. */
- if (!create && !invalidated)
- return NULL;
-
/* retry an exact lookup, taking the lock before.
* At least, nodes should be hot in our cache.
*/
@@ -207,15 +192,16 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
gc_cnt = 0;
p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp);
- if (!p && create) {
+ if (!p) {
p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
if (p) {
p->daddr = *daddr;
p->dtime = (__u32)jiffies;
- refcount_set(&p->refcnt, 2);
+ refcount_set(&p->refcnt, 1);
atomic_set(&p->rid, 0);
p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW;
p->rate_tokens = 0;
+ p->n_redirects = 0;
/* 60*HZ is arbitrary, but chosen enough high so that the first
* calculation of tokens is at its maximum.
*/
@@ -232,16 +218,13 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base,
return p;
}
-EXPORT_SYMBOL_GPL(inet_getpeer);
+EXPORT_IPV6_MOD_GPL(inet_getpeer);
void inet_putpeer(struct inet_peer *p)
{
- p->dtime = (__u32)jiffies;
-
if (refcount_dec_and_test(&p->refcnt))
- call_rcu(&p->rcu, inetpeer_free_rcu);
+ kfree_rcu(p, rcu);
}
-EXPORT_SYMBOL_GPL(inet_putpeer);
/*
* Check transmit rate limitation for given message.
@@ -263,26 +246,30 @@ EXPORT_SYMBOL_GPL(inet_putpeer);
#define XRLIM_BURST_FACTOR 6
bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
{
- unsigned long now, token;
+ unsigned long now, token, otoken, delta;
bool rc = false;
if (!peer)
return true;
- token = peer->rate_tokens;
+ token = otoken = READ_ONCE(peer->rate_tokens);
now = jiffies;
- token += now - peer->rate_last;
- peer->rate_last = now;
- if (token > XRLIM_BURST_FACTOR * timeout)
- token = XRLIM_BURST_FACTOR * timeout;
+ delta = now - READ_ONCE(peer->rate_last);
+ if (delta) {
+ WRITE_ONCE(peer->rate_last, now);
+ token += delta;
+ if (token > XRLIM_BURST_FACTOR * timeout)
+ token = XRLIM_BURST_FACTOR * timeout;
+ }
if (token >= timeout) {
token -= timeout;
rc = true;
}
- peer->rate_tokens = token;
+ if (token != otoken)
+ WRITE_ONCE(peer->rate_tokens, token);
return rc;
}
-EXPORT_SYMBOL(inet_peer_xrlim_allow);
+EXPORT_IPV6_MOD(inet_peer_xrlim_allow);
void inetpeer_invalidate_tree(struct inet_peer_base *base)
{
@@ -299,4 +286,4 @@ void inetpeer_invalidate_tree(struct inet_peer_base *base)
base->total = 0;
}
-EXPORT_SYMBOL(inetpeer_invalidate_tree);
+EXPORT_IPV6_MOD(inetpeer_invalidate_tree);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index 32662e9e5d21..8b65f12583eb 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -66,12 +66,17 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s
{
struct ip_options *opt = &(IPCB(skb)->opt);
- __IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
- __IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
+#ifdef CONFIG_NET_SWITCHDEV
+ if (skb->offload_l3_fwd_mark) {
+ consume_skb(skb);
+ return 0;
+ }
+#endif
if (unlikely(opt->optlen))
ip_forward_options(skb);
+ skb_clear_tstamp(skb);
return dst_output(net, sk, skb);
}
@@ -82,6 +87,7 @@ int ip_forward(struct sk_buff *skb)
struct rtable *rt; /* Route we use */
struct ip_options *opt = &(IPCB(skb)->opt);
struct net *net;
+ SKB_DR(reason);
/* that should never happen */
if (skb->pkt_type != PACKET_HOST)
@@ -93,8 +99,10 @@ int ip_forward(struct sk_buff *skb)
if (skb_warn_if_lro(skb))
goto drop;
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
+ SKB_DR_SET(reason, XFRM_POLICY);
goto drop;
+ }
if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
return NET_RX_SUCCESS;
@@ -110,20 +118,25 @@ int ip_forward(struct sk_buff *skb)
if (ip_hdr(skb)->ttl <= 1)
goto too_many_hops;
- if (!xfrm4_route_forward(skb))
+ if (!xfrm4_route_forward(skb)) {
+ SKB_DR_SET(reason, XFRM_POLICY);
goto drop;
+ }
rt = skb_rtable(skb);
if (opt->is_strictroute && rt->rt_uses_gateway)
goto sr_failed;
+ __IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
+
IPCB(skb)->flags |= IPSKB_FORWARDED;
mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
if (ip_exceeds_mtu(skb, mtu)) {
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
+ SKB_DR_SET(reason, PKT_TOO_BIG);
goto drop;
}
@@ -143,7 +156,7 @@ int ip_forward(struct sk_buff *skb)
!skb_sec_path(skb))
ip_rt_send_redirect(skb);
- if (net->ipv4.sysctl_ip_fwd_update_priority)
+ if (READ_ONCE(net->ipv4.sysctl_ip_fwd_update_priority))
skb->priority = rt_tos2priority(iph->tos);
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
@@ -161,7 +174,8 @@ too_many_hops:
/* Tell the sender its packet died... */
__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
+ SKB_DR_SET(reason, IP_INHDR);
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return NET_RX_DROP;
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index e7227128df2c..f7012479713b 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -57,57 +57,6 @@
*/
static const char ip_frag_cache_name[] = "ip4-frags";
-/* Use skb->cb to track consecutive/adjacent fragments coming at
- * the end of the queue. Nodes in the rb-tree queue will
- * contain "runs" of one or more adjacent fragments.
- *
- * Invariants:
- * - next_frag is NULL at the tail of a "run";
- * - the head of a "run" has the sum of all fragment lengths in frag_run_len.
- */
-struct ipfrag_skb_cb {
- struct inet_skb_parm h;
- struct sk_buff *next_frag;
- int frag_run_len;
-};
-
-#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
-
-static void ip4_frag_init_run(struct sk_buff *skb)
-{
- BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
-
- FRAG_CB(skb)->next_frag = NULL;
- FRAG_CB(skb)->frag_run_len = skb->len;
-}
-
-/* Append skb to the last "run". */
-static void ip4_frag_append_to_last_run(struct inet_frag_queue *q,
- struct sk_buff *skb)
-{
- RB_CLEAR_NODE(&skb->rbnode);
- FRAG_CB(skb)->next_frag = NULL;
-
- FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
- FRAG_CB(q->fragments_tail)->next_frag = skb;
- q->fragments_tail = skb;
-}
-
-/* Create a new "run" with the skb. */
-static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb)
-{
- if (q->last_run_head)
- rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
- &q->last_run_head->rbnode.rb_right);
- else
- rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
- rb_insert_color(&skb->rbnode, &q->rb_fragments);
-
- ip4_frag_init_run(skb);
- q->fragments_tail = skb;
- q->last_run_head = skb;
-}
-
/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
struct inet_frag_queue q;
@@ -127,23 +76,27 @@ static u8 ip4_frag_ecn(u8 tos)
static struct inet_frags ip4_frags;
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *dev);
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs);
static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{
struct ipq *qp = container_of(q, struct ipq, q);
- struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
- frags);
- struct net *net = container_of(ipv4, struct net, ipv4);
-
const struct frag_v4_compare_key *key = a;
+ struct net *net = q->fqdir->net;
+ struct inet_peer *p = NULL;
q->key.v4 = *key;
qp->ecn = 0;
- qp->peer = q->net->max_dist ?
- inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
- NULL;
+ if (q->fqdir->max_dist) {
+ rcu_read_lock();
+ p = inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif);
+ if (p && !refcount_inc_not_zero(&p->refcnt))
+ p = NULL;
+ rcu_read_unlock();
+ }
+ qp->peer = p;
}
static void ip4_frag_free(struct inet_frag_queue *q)
@@ -155,22 +108,6 @@ static void ip4_frag_free(struct inet_frag_queue *q)
inet_putpeer(qp->peer);
}
-
-/* Destruction primitives. */
-
-static void ipq_put(struct ipq *ipq)
-{
- inet_frag_put(&ipq->q);
-}
-
-/* Kill ipq entry. It is not destroyed immediately,
- * because caller (and someone more) holds reference count.
- */
-static void ipq_kill(struct ipq *ipq)
-{
- inet_frag_kill(&ipq->q);
-}
-
static bool frag_expire_skip_icmp(u32 user)
{
return user == IP_DEFRAG_AF_PACKET ||
@@ -185,23 +122,30 @@ static bool frag_expire_skip_icmp(u32 user)
*/
static void ip_expire(struct timer_list *t)
{
- struct inet_frag_queue *frag = from_timer(frag, t, timer);
+ enum skb_drop_reason reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT;
+ struct inet_frag_queue *frag = timer_container_of(frag, t, timer);
const struct iphdr *iph;
struct sk_buff *head = NULL;
struct net *net;
struct ipq *qp;
- int err;
+ int refs = 1;
qp = container_of(frag, struct ipq, q);
- net = container_of(qp->q.net, struct net, ipv4.frags);
+ net = qp->q.fqdir->net;
rcu_read_lock();
+
+ /* Paired with WRITE_ONCE() in fqdir_pre_exit(). */
+ if (READ_ONCE(qp->q.fqdir->dead))
+ goto out_rcu_unlock;
+
spin_lock(&qp->q.lock);
if (qp->q.flags & INET_FRAG_COMPLETE)
goto out;
- ipq_kill(qp);
+ qp->q.flags |= INET_FRAG_DROP;
+ inet_frag_kill(&qp->q, &refs);
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
@@ -212,27 +156,9 @@ static void ip_expire(struct timer_list *t)
* pull the head out of the tree in order to be able to
* deal with head->dev.
*/
- if (qp->q.fragments) {
- head = qp->q.fragments;
- qp->q.fragments = head->next;
- } else {
- head = skb_rb_first(&qp->q.rb_fragments);
- if (!head)
- goto out;
- if (FRAG_CB(head)->next_frag)
- rb_replace_node(&head->rbnode,
- &FRAG_CB(head)->next_frag->rbnode,
- &qp->q.rb_fragments);
- else
- rb_erase(&head->rbnode, &qp->q.rb_fragments);
- memset(&head->rbnode, 0, sizeof(head->rbnode));
- barrier();
- }
- if (head == qp->q.fragments_tail)
- qp->q.fragments_tail = NULL;
-
- sub_frag_mem_limit(qp->q.net, head->truesize);
-
+ head = inet_frag_pull_head(&qp->q);
+ if (!head)
+ goto out;
head->dev = dev_get_by_index_rcu(net, qp->iif);
if (!head->dev)
goto out;
@@ -240,14 +166,15 @@ static void ip_expire(struct timer_list *t)
/* skb has no dst, perform route lookup again */
iph = ip_hdr(head);
- err = ip_route_input_noref(head, iph->daddr, iph->saddr,
- iph->tos, head->dev);
- if (err)
+ reason = ip_route_input_noref(head, iph->daddr, iph->saddr,
+ ip4h_dscp(iph), head->dev);
+ if (reason)
goto out;
/* Only an end host needs to send an ICMP
* "Fragment Reassembly Timeout" message, per RFC792.
*/
+ reason = SKB_DROP_REASON_FRAG_REASM_TIMEOUT;
if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
(skb_rtable(head)->rt_type != RTN_LOCAL))
goto out;
@@ -260,9 +187,8 @@ out:
spin_unlock(&qp->q.lock);
out_rcu_unlock:
rcu_read_unlock();
- if (head)
- kfree_skb(head);
- ipq_put(qp);
+ kfree_skb_reason(head, reason);
+ inet_frag_putn(&qp->q, refs);
}
/* Find the correct entry in the "incomplete datagrams" queue for
@@ -281,7 +207,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
};
struct inet_frag_queue *q;
- q = inet_frag_find(&net->ipv4.frags, &key);
+ q = inet_frag_find(net->ipv4.fqdir, &key);
if (!q)
return NULL;
@@ -292,7 +218,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
static int ip_frag_too_far(struct ipq *qp)
{
struct inet_peer *peer = qp->peer;
- unsigned int max = qp->q.net->max_dist;
+ unsigned int max = qp->q.fqdir->max_dist;
unsigned int start, end;
int rc;
@@ -306,12 +232,8 @@ static int ip_frag_too_far(struct ipq *qp)
rc = qp->q.fragments_tail && (end - start) > max;
- if (rc) {
- struct net *net;
-
- net = container_of(qp->q.net, struct net, ipv4.frags);
- __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
- }
+ if (rc)
+ __IP_INC_STATS(qp->q.fqdir->net, IPSTATS_MIB_REASMFAILS);
return rc;
}
@@ -320,18 +242,18 @@ static int ip_frag_reinit(struct ipq *qp)
{
unsigned int sum_truesize = 0;
- if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
+ if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
refcount_inc(&qp->q.refcnt);
return -ETIMEDOUT;
}
- sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
- sub_frag_mem_limit(qp->q.net, sum_truesize);
+ sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments,
+ SKB_DROP_REASON_FRAG_TOO_FAR);
+ sub_frag_mem_limit(qp->q.fqdir, sum_truesize);
qp->q.flags = 0;
qp->q.len = 0;
qp->q.meat = 0;
- qp->q.fragments = NULL;
qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
qp->q.last_run_head = NULL;
@@ -342,25 +264,27 @@ static int ip_frag_reinit(struct ipq *qp)
}
/* Add new segment to existing queue. */
-static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb, int *refs)
{
- struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
- struct rb_node **rbn, *parent;
- struct sk_buff *skb1, *prev_tail;
+ struct net *net = qp->q.fqdir->net;
+ int ihl, end, flags, offset;
+ struct sk_buff *prev_tail;
struct net_device *dev;
unsigned int fragsize;
- int flags, offset;
- int ihl, end;
int err = -ENOENT;
+ SKB_DR(reason);
u8 ecn;
- if (qp->q.flags & INET_FRAG_COMPLETE)
+ /* If reassembly is already done, @skb must be a duplicate frag. */
+ if (qp->q.flags & INET_FRAG_COMPLETE) {
+ SKB_DR_SET(reason, DUP_FRAG);
goto err;
+ }
if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
unlikely(ip_frag_too_far(qp)) &&
unlikely(err = ip_frag_reinit(qp))) {
- ipq_kill(qp);
+ inet_frag_kill(&qp->q, refs);
goto err;
}
@@ -382,7 +306,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
*/
if (end < qp->q.len ||
((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
- goto err;
+ goto discard_qp;
qp->q.flags |= INET_FRAG_LAST_IN;
qp->q.len = end;
} else {
@@ -394,80 +318,39 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (end > qp->q.len) {
/* Some bits beyond end -> corruption. */
if (qp->q.flags & INET_FRAG_LAST_IN)
- goto err;
+ goto discard_qp;
qp->q.len = end;
}
}
if (end == offset)
- goto err;
+ goto discard_qp;
err = -ENOMEM;
if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
- goto err;
+ goto discard_qp;
err = pskb_trim_rcsum(skb, end - offset);
if (err)
- goto err;
+ goto discard_qp;
/* Note : skb->rbnode and skb->dev share the same location. */
dev = skb->dev;
/* Makes sure compiler wont do silly aliasing games */
barrier();
- /* RFC5722, Section 4, amended by Errata ID : 3089
- * When reassembling an IPv6 datagram, if
- * one or more its constituent fragments is determined to be an
- * overlapping fragment, the entire datagram (and any constituent
- * fragments) MUST be silently discarded.
- *
- * We do the same here for IPv4 (and increment an snmp counter).
- */
-
- /* Find out where to put this fragment. */
prev_tail = qp->q.fragments_tail;
- if (!prev_tail)
- ip4_frag_create_run(&qp->q, skb); /* First fragment. */
- else if (prev_tail->ip_defrag_offset + prev_tail->len < end) {
- /* This is the common case: skb goes to the end. */
- /* Detect and discard overlaps. */
- if (offset < prev_tail->ip_defrag_offset + prev_tail->len)
- goto discard_qp;
- if (offset == prev_tail->ip_defrag_offset + prev_tail->len)
- ip4_frag_append_to_last_run(&qp->q, skb);
- else
- ip4_frag_create_run(&qp->q, skb);
- } else {
- /* Binary search. Note that skb can become the first fragment,
- * but not the last (covered above).
- */
- rbn = &qp->q.rb_fragments.rb_node;
- do {
- parent = *rbn;
- skb1 = rb_to_skb(parent);
- if (end <= skb1->ip_defrag_offset)
- rbn = &parent->rb_left;
- else if (offset >= skb1->ip_defrag_offset +
- FRAG_CB(skb1)->frag_run_len)
- rbn = &parent->rb_right;
- else /* Found an overlap with skb1. */
- goto discard_qp;
- } while (*rbn);
- /* Here we have parent properly set, and rbn pointing to
- * one of its NULL left/right children. Insert skb.
- */
- ip4_frag_init_run(skb);
- rb_link_node(&skb->rbnode, parent, rbn);
- rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
- }
+ err = inet_frag_queue_insert(&qp->q, skb, offset, end);
+ if (err)
+ goto insert_error;
if (dev)
qp->iif = dev->ifindex;
- skb->ip_defrag_offset = offset;
qp->q.stamp = skb->tstamp;
+ qp->q.tstamp_type = skb->tstamp_type;
qp->q.meat += skb->len;
qp->ecn |= ecn;
- add_frag_mem_limit(qp->q.net, skb->truesize);
+ add_frag_mem_limit(qp->q.fqdir, skb->truesize);
if (offset == 0)
qp->q.flags |= INET_FRAG_FIRST_IN;
@@ -485,149 +368,74 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- err = ip_frag_reasm(qp, skb, prev_tail, dev);
+ err = ip_frag_reasm(qp, skb, prev_tail, dev, refs);
skb->_skb_refdst = orefdst;
+ if (err)
+ inet_frag_kill(&qp->q, refs);
return err;
}
skb_dst_drop(skb);
+ skb_orphan(skb);
return -EINPROGRESS;
-discard_qp:
- inet_frag_kill(&qp->q);
+insert_error:
+ if (err == IPFRAG_DUP) {
+ SKB_DR_SET(reason, DUP_FRAG);
+ err = -EINVAL;
+ goto err;
+ }
err = -EINVAL;
__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
+discard_qp:
+ inet_frag_kill(&qp->q, refs);
+ __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
err:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return err;
}
+static bool ip_frag_coalesce_ok(const struct ipq *qp)
+{
+ return qp->q.key.v4.user == IP_DEFRAG_LOCAL_DELIVER;
+}
+
/* Build a new IP datagram from all its fragments. */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
- struct sk_buff *prev_tail, struct net_device *dev)
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs)
{
- struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct net *net = qp->q.fqdir->net;
struct iphdr *iph;
- struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
- struct sk_buff **nextp; /* To build frag_list. */
- struct rb_node *rbn;
- int len;
- int ihlen;
- int err;
+ void *reasm_data;
+ int len, err;
u8 ecn;
- ipq_kill(qp);
+ inet_frag_kill(&qp->q, refs);
ecn = ip_frag_ecn_table[qp->ecn];
if (unlikely(ecn == 0xff)) {
err = -EINVAL;
goto out_fail;
}
- /* Make the one we just received the head. */
- if (head != skb) {
- fp = skb_clone(skb, GFP_ATOMIC);
- if (!fp)
- goto out_nomem;
- FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
- if (RB_EMPTY_NODE(&skb->rbnode))
- FRAG_CB(prev_tail)->next_frag = fp;
- else
- rb_replace_node(&skb->rbnode, &fp->rbnode,
- &qp->q.rb_fragments);
- if (qp->q.fragments_tail == skb)
- qp->q.fragments_tail = fp;
- skb_morph(skb, head);
- FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
- rb_replace_node(&head->rbnode, &skb->rbnode,
- &qp->q.rb_fragments);
- consume_skb(head);
- head = skb;
- }
-
- WARN_ON(head->ip_defrag_offset != 0);
- /* Allocate a new buffer for the datagram. */
- ihlen = ip_hdrlen(head);
- len = ihlen + qp->q.len;
+ /* Make the one we just received the head. */
+ reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail);
+ if (!reasm_data)
+ goto out_nomem;
+ len = ip_hdrlen(skb) + qp->q.len;
err = -E2BIG;
if (len > 65535)
goto out_oversize;
- /* Head of list must not be cloned. */
- if (skb_unclone(head, GFP_ATOMIC))
- goto out_nomem;
-
- /* If the first fragment is fragmented itself, we split
- * it to two chunks: the first with data and paged part
- * and the second, holding only fragments. */
- if (skb_has_frag_list(head)) {
- struct sk_buff *clone;
- int i, plen = 0;
-
- clone = alloc_skb(0, GFP_ATOMIC);
- if (!clone)
- goto out_nomem;
- skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
- skb_frag_list_init(head);
- for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
- plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
- clone->len = clone->data_len = head->data_len - plen;
- head->truesize += clone->truesize;
- clone->csum = 0;
- clone->ip_summed = head->ip_summed;
- add_frag_mem_limit(qp->q.net, clone->truesize);
- skb_shinfo(head)->frag_list = clone;
- nextp = &clone->next;
- } else {
- nextp = &skb_shinfo(head)->frag_list;
- }
-
- skb_push(head, head->data - skb_network_header(head));
-
- /* Traverse the tree in order, to build frag_list. */
- fp = FRAG_CB(head)->next_frag;
- rbn = rb_next(&head->rbnode);
- rb_erase(&head->rbnode, &qp->q.rb_fragments);
- while (rbn || fp) {
- /* fp points to the next sk_buff in the current run;
- * rbn points to the next run.
- */
- /* Go through the current run. */
- while (fp) {
- *nextp = fp;
- nextp = &fp->next;
- fp->prev = NULL;
- memset(&fp->rbnode, 0, sizeof(fp->rbnode));
- fp->sk = NULL;
- head->data_len += fp->len;
- head->len += fp->len;
- if (head->ip_summed != fp->ip_summed)
- head->ip_summed = CHECKSUM_NONE;
- else if (head->ip_summed == CHECKSUM_COMPLETE)
- head->csum = csum_add(head->csum, fp->csum);
- head->truesize += fp->truesize;
- fp = FRAG_CB(fp)->next_frag;
- }
- /* Move to the next run. */
- if (rbn) {
- struct rb_node *rbnext = rb_next(rbn);
-
- fp = rb_to_skb(rbn);
- rb_erase(rbn, &qp->q.rb_fragments);
- rbn = rbnext;
- }
- }
- sub_frag_mem_limit(qp->q.net, head->truesize);
+ inet_frag_reasm_finish(&qp->q, skb, reasm_data,
+ ip_frag_coalesce_ok(qp));
- *nextp = NULL;
- head->next = NULL;
- head->prev = NULL;
- head->dev = dev;
- head->tstamp = qp->q.stamp;
- IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
+ skb->dev = dev;
+ IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
- iph = ip_hdr(head);
+ iph = ip_hdr(skb);
iph->tot_len = htons(len);
iph->tos |= ecn;
@@ -640,7 +448,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
* from one very small df-fragment and one large non-df frag.
*/
if (qp->max_df_size == qp->q.max_size) {
- IPCB(head)->flags |= IPSKB_FRAG_PMTU;
+ IPCB(skb)->flags |= IPSKB_FRAG_PMTU;
iph->frag_off = htons(IP_DF);
} else {
iph->frag_off = 0;
@@ -649,7 +457,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
ip_send_check(iph);
__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
- qp->q.fragments = NULL;
qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
qp->q.last_run_head = NULL;
@@ -669,26 +476,30 @@ out_fail:
/* Process an incoming IP datagram fragment. */
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
- struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
- int vif = l3mdev_master_ifindex_rcu(dev);
+ struct net_device *dev;
struct ipq *qp;
+ int vif;
__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
- skb_orphan(skb);
/* Lookup (or create) queue header */
+ rcu_read_lock();
+ dev = skb->dev ? : skb_dst_dev_rcu(skb);
+ vif = l3mdev_master_ifindex_rcu(dev);
qp = ip_find(net, ip_hdr(skb), user, vif);
if (qp) {
- int ret;
+ int ret, refs = 0;
spin_lock(&qp->q.lock);
- ret = ip_frag_queue(qp, skb);
+ ret = ip_frag_queue(qp, skb, &refs);
spin_unlock(&qp->q.lock);
- ipq_put(qp);
+ rcu_read_unlock();
+ inet_frag_putn(&qp->q, refs);
return ret;
}
+ rcu_read_unlock();
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
kfree_skb(skb);
@@ -720,10 +531,14 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
if (ip_is_fragment(&iph)) {
skb = skb_share_check(skb, GFP_ATOMIC);
if (skb) {
- if (!pskb_may_pull(skb, netoff + iph.ihl * 4))
- return skb;
- if (pskb_trim_rcsum(skb, netoff + len))
- return skb;
+ if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+ if (pskb_trim_rcsum(skb, netoff + len)) {
+ kfree_skb(skb);
+ return NULL;
+ }
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
if (ip_defrag(net, skb, user))
return NULL;
@@ -734,64 +549,35 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
}
EXPORT_SYMBOL(ip_check_defrag);
-unsigned int inet_frag_rbtree_purge(struct rb_root *root)
-{
- struct rb_node *p = rb_first(root);
- unsigned int sum = 0;
-
- while (p) {
- struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
-
- p = rb_next(p);
- rb_erase(&skb->rbnode, root);
- while (skb) {
- struct sk_buff *next = FRAG_CB(skb)->next_frag;
-
- sum += skb->truesize;
- kfree_skb(skb);
- skb = next;
- }
- }
- return sum;
-}
-EXPORT_SYMBOL(inet_frag_rbtree_purge);
-
#ifdef CONFIG_SYSCTL
static int dist_min;
static struct ctl_table ip4_frags_ns_ctl_table[] = {
{
.procname = "ipfrag_high_thresh",
- .data = &init_net.ipv4.frags.high_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &init_net.ipv4.frags.low_thresh
},
{
.procname = "ipfrag_low_thresh",
- .data = &init_net.ipv4.frags.low_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra2 = &init_net.ipv4.frags.high_thresh
},
{
.procname = "ipfrag_time",
- .data = &init_net.ipv4.frags.timeout,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "ipfrag_max_dist",
- .data = &init_net.ipv4.frags.max_dist,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &dist_min,
},
- { }
};
/* secret interval has been deprecated */
@@ -804,7 +590,6 @@ static struct ctl_table ip4_frags_ctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- { }
};
static int __net_init ip4_frags_ns_ctl_register(struct net *net)
@@ -818,16 +603,16 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
if (!table)
goto err_alloc;
- table[0].data = &net->ipv4.frags.high_thresh;
- table[0].extra1 = &net->ipv4.frags.low_thresh;
- table[0].extra2 = &init_net.ipv4.frags.high_thresh;
- table[1].data = &net->ipv4.frags.low_thresh;
- table[1].extra2 = &net->ipv4.frags.high_thresh;
- table[2].data = &net->ipv4.frags.timeout;
- table[3].data = &net->ipv4.frags.max_dist;
}
-
- hdr = register_net_sysctl(net, "net/ipv4", table);
+ table[0].data = &net->ipv4.fqdir->high_thresh;
+ table[0].extra1 = &net->ipv4.fqdir->low_thresh;
+ table[1].data = &net->ipv4.fqdir->low_thresh;
+ table[1].extra2 = &net->ipv4.fqdir->high_thresh;
+ table[2].data = &net->ipv4.fqdir->timeout;
+ table[3].data = &net->ipv4.fqdir->max_dist;
+
+ hdr = register_net_sysctl_sz(net, "net/ipv4", table,
+ ARRAY_SIZE(ip4_frags_ns_ctl_table));
if (!hdr)
goto err_reg;
@@ -843,7 +628,7 @@ err_alloc:
static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
{
- struct ctl_table *table;
+ const struct ctl_table *table;
table = net->ipv4.frags_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->ipv4.frags_hdr);
@@ -873,6 +658,9 @@ static int __net_init ipv4_frags_init_net(struct net *net)
{
int res;
+ res = fqdir_init(&net->ipv4.fqdir, &ip4_frags, net);
+ if (res < 0)
+ return res;
/* Fragment cache limits.
*
* The fragment memory accounting code, (tries to) account for
@@ -887,36 +675,38 @@ static int __net_init ipv4_frags_init_net(struct net *net)
* we will prune down to 3MB, making room for approx 8 big 64K
* fragments 8x128k.
*/
- net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
- net->ipv4.frags.low_thresh = 3 * 1024 * 1024;
+ net->ipv4.fqdir->high_thresh = 4 * 1024 * 1024;
+ net->ipv4.fqdir->low_thresh = 3 * 1024 * 1024;
/*
* Important NOTE! Fragment queue must be destroyed before MSL expires.
* RFC791 is wrong proposing to prolongate timer each fragment arrival
* by TTL.
*/
- net->ipv4.frags.timeout = IP_FRAG_TIME;
+ net->ipv4.fqdir->timeout = IP_FRAG_TIME;
- net->ipv4.frags.max_dist = 64;
- net->ipv4.frags.f = &ip4_frags;
+ net->ipv4.fqdir->max_dist = 64;
- res = inet_frags_init_net(&net->ipv4.frags);
- if (res < 0)
- return res;
res = ip4_frags_ns_ctl_register(net);
if (res < 0)
- inet_frags_exit_net(&net->ipv4.frags);
+ fqdir_exit(net->ipv4.fqdir);
return res;
}
+static void __net_exit ipv4_frags_pre_exit_net(struct net *net)
+{
+ fqdir_pre_exit(net->ipv4.fqdir);
+}
+
static void __net_exit ipv4_frags_exit_net(struct net *net)
{
ip4_frags_ns_ctl_unregister(net);
- inet_frags_exit_net(&net->ipv4.frags);
+ fqdir_exit(net->ipv4.fqdir);
}
static struct pernet_operations ip4_frags_ops = {
- .init = ipv4_frags_init_net,
- .exit = ipv4_frags_exit_net,
+ .init = ipv4_frags_init_net,
+ .pre_exit = ipv4_frags_pre_exit_net,
+ .exit = ipv4_frags_exit_net,
};
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 8cce0e9ea08c..761a53c6a89a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux NET3: GRE over IP protocol decoder.
*
* Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -33,6 +28,7 @@
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
+#include <net/flow.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
@@ -112,6 +108,8 @@ module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
+static const struct header_ops ipgre_header_ops;
+
static int ipgre_tunnel_init(struct net_device *dev);
static void erspan_build_header(struct sk_buff *skb,
u32 id, u32 index,
@@ -121,8 +119,8 @@ static unsigned int ipgre_net_id __read_mostly;
static unsigned int gre_tap_net_id __read_mostly;
static unsigned int erspan_net_id __read_mostly;
-static void ipgre_err(struct sk_buff *skb, u32 info,
- const struct tnl_ptk_info *tpi)
+static int ipgre_err(struct sk_buff *skb, u32 info,
+ const struct tnl_ptk_info *tpi)
{
/* All the routers (except for Linux) return only
@@ -143,20 +141,34 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
const struct iphdr *iph;
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
- unsigned int data_len = 0;
struct ip_tunnel *t;
+ if (tpi->proto == htons(ETH_P_TEB))
+ itn = net_generic(net, gre_tap_net_id);
+ else if (tpi->proto == htons(ETH_P_ERSPAN) ||
+ tpi->proto == htons(ETH_P_ERSPAN2))
+ itn = net_generic(net, erspan_net_id);
+ else
+ itn = net_generic(net, ipgre_net_id);
+
+ iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
+ iph->daddr, iph->saddr, tpi->key);
+
+ if (!t)
+ return -ENOENT;
+
switch (type) {
default:
case ICMP_PARAMETERPROB:
- return;
+ return 0;
case ICMP_DEST_UNREACH:
switch (code) {
case ICMP_SR_FAILED:
case ICMP_PORT_UNREACH:
/* Impossible event. */
- return;
+ return 0;
default:
/* All others are translated to HOST_UNREACH.
rfc2003 contains "deep thoughts" about NET_UNREACH,
@@ -168,48 +180,40 @@ static void ipgre_err(struct sk_buff *skb, u32 info,
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
- return;
- data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
+ return 0;
break;
case ICMP_REDIRECT:
break;
}
- if (tpi->proto == htons(ETH_P_TEB))
- itn = net_generic(net, gre_tap_net_id);
- else if (tpi->proto == htons(ETH_P_ERSPAN) ||
- tpi->proto == htons(ETH_P_ERSPAN2))
- itn = net_generic(net, erspan_net_id);
- else
- itn = net_generic(net, ipgre_net_id);
-
- iph = (const struct iphdr *)(icmp_hdr(skb) + 1);
- t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
- iph->daddr, iph->saddr, tpi->key);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (tpi->proto == htons(ETH_P_IPV6)) {
+ unsigned int data_len = 0;
- if (!t)
- return;
+ if (type == ICMP_TIME_EXCEEDED)
+ data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
-#if IS_ENABLED(CONFIG_IPV6)
- if (tpi->proto == htons(ETH_P_IPV6) &&
- !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
- type, data_len))
- return;
+ if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4 + tpi->hdr_len,
+ type, data_len))
+ return 0;
+ }
#endif
if (t->parms.iph.daddr == 0 ||
ipv4_is_multicast(t->parms.iph.daddr))
- return;
+ return 0;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
- return;
+ return 0;
if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
t->err_time = jiffies;
+
+ return 0;
}
static void gre_err(struct sk_buff *skb, u32 info)
@@ -232,35 +236,41 @@ static void gre_err(struct sk_buff *skb, u32 info)
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct tnl_ptk_info tpi;
- bool csum_err = false;
- if (gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP),
- iph->ihl * 4) < 0) {
- if (!csum_err) /* ignore csum errors. */
- return;
- }
+ if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IP),
+ iph->ihl * 4) < 0)
+ return;
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, dev_net(skb->dev), info,
- skb->dev->ifindex, 0, IPPROTO_GRE, 0);
+ skb->dev->ifindex, IPPROTO_GRE);
return;
}
if (type == ICMP_REDIRECT) {
- ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
- IPPROTO_GRE, 0);
+ ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex,
+ IPPROTO_GRE);
return;
}
ipgre_err(skb, info, &tpi);
}
+static bool is_erspan_type1(int gre_hdr_len)
+{
+ /* Both ERSPAN type I (version 0) and type II (version 1) use
+ * protocol 0x88BE, but the type I has only 4-byte GRE header,
+ * while type II has 8-byte.
+ */
+ return gre_hdr_len == 4;
+}
+
static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
int gre_hdr_len)
{
struct net *net = dev_net(skb->dev);
struct metadata_dst *tun_dst = NULL;
struct erspan_base_hdr *ershdr;
- struct erspan_metadata *pkt_md;
+ IP_TUNNEL_DECLARE_FLAGS(flags);
struct ip_tunnel_net *itn;
struct ip_tunnel *tunnel;
const struct iphdr *iph;
@@ -268,33 +278,37 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
int ver;
int len;
- itn = net_generic(net, erspan_net_id);
- len = gre_hdr_len + sizeof(*ershdr);
-
- /* Check based hdr len */
- if (unlikely(!pskb_may_pull(skb, len)))
- return PACKET_REJECT;
+ ip_tunnel_flags_copy(flags, tpi->flags);
+ itn = net_generic(net, erspan_net_id);
iph = ip_hdr(skb);
- ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
- ver = ershdr->ver;
+ if (is_erspan_type1(gre_hdr_len)) {
+ ver = 0;
+ __set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
+ iph->saddr, iph->daddr, 0);
+ } else {
+ if (unlikely(!pskb_may_pull(skb,
+ gre_hdr_len + sizeof(*ershdr))))
+ return PACKET_REJECT;
- /* The original GRE header does not have key field,
- * Use ERSPAN 10-bit session ID as key.
- */
- tpi->key = cpu_to_be32(get_session_id(ershdr));
- tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex,
- tpi->flags | TUNNEL_KEY,
- iph->saddr, iph->daddr, tpi->key);
+ ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
+ ver = ershdr->ver;
+ iph = ip_hdr(skb);
+ __set_bit(IP_TUNNEL_KEY_BIT, flags);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
+ iph->saddr, iph->daddr, tpi->key);
+ }
if (tunnel) {
- len = gre_hdr_len + erspan_hdr_len(ver);
+ if (is_erspan_type1(gre_hdr_len))
+ len = gre_hdr_len;
+ else
+ len = gre_hdr_len + erspan_hdr_len(ver);
+
if (unlikely(!pskb_may_pull(skb, len)))
return PACKET_REJECT;
- ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len);
- pkt_md = (struct erspan_metadata *)(ershdr + 1);
-
if (__iptunnel_pull_header(skb,
len,
htons(ETH_P_TEB),
@@ -302,13 +316,13 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
goto drop;
if (tunnel->collect_md) {
+ struct erspan_metadata *pkt_md, *md;
struct ip_tunnel_info *info;
- struct erspan_metadata *md;
+ unsigned char *gh;
__be64 tun_id;
- __be16 flags;
- tpi->flags |= TUNNEL_KEY;
- flags = tpi->flags;
+ __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags);
+ ip_tunnel_flags_copy(flags, tpi->flags);
tun_id = key32_to_tunnel_id(tpi->key);
tun_dst = ip_tun_rx_dst(skb, flags,
@@ -316,6 +330,14 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
if (!tun_dst)
return PACKET_REJECT;
+ /* skb can be uncloned in __iptunnel_pull_header, so
+ * old pkt_md is no longer valid and we need to reset
+ * it
+ */
+ gh = skb_network_header(skb) +
+ skb_network_header_len(skb);
+ pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len +
+ sizeof(*ershdr));
md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
md->version = ver;
md2 = &md->u.md2;
@@ -323,7 +345,8 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
ERSPAN_V2_MDSIZE);
info = &tun_dst->u.tun_info;
- info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+ __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
+ info->key.tun_flags);
info->options_len = sizeof(*md);
}
@@ -350,19 +373,29 @@ static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
iph->saddr, iph->daddr, tpi->key);
if (tunnel) {
+ const struct iphdr *tnl_params;
+
if (__iptunnel_pull_header(skb, hdr_len, tpi->proto,
raw_proto, false) < 0)
goto drop;
- if (tunnel->dev->type != ARPHRD_NONE)
+ /* Special case for ipgre_header_parse(), which expects the
+ * mac_header to point to the outer IP header.
+ */
+ if (tunnel->dev->header_ops == &ipgre_header_ops)
skb_pop_mac_header(skb);
else
skb_reset_mac_header(skb);
- if (tunnel->collect_md) {
- __be16 flags;
+
+ tnl_params = &tunnel->parms.iph;
+ if (tunnel->collect_md || tnl_params->daddr == 0) {
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
__be64 tun_id;
- flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
+ __set_bit(IP_TUNNEL_CSUM_BIT, flags);
+ __set_bit(IP_TUNNEL_KEY_BIT, flags);
+ ip_tunnel_flags_and(flags, tpi->flags, flags);
+
tun_id = key32_to_tunnel_id(tpi->key);
tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
if (!tun_dst)
@@ -442,14 +475,15 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
__be16 proto)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
+ IP_TUNNEL_DECLARE_FLAGS(flags);
- if (tunnel->parms.o_flags & TUNNEL_SEQ)
- tunnel->o_seqno++;
+ ip_tunnel_flags_copy(flags, tunnel->parms.o_flags);
/* Push GRE header. */
gre_build_header(skb, tunnel->tun_hlen,
- tunnel->parms.o_flags, proto, tunnel->parms.o_key,
- htonl(tunnel->o_seqno));
+ flags, proto, tunnel->parms.o_key,
+ test_bit(IP_TUNNEL_SEQ_BIT, flags) ?
+ htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
@@ -459,81 +493,14 @@ static int gre_handle_offloads(struct sk_buff *skb, bool csum)
return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}
-static struct rtable *gre_get_rt(struct sk_buff *skb,
- struct net_device *dev,
- struct flowi4 *fl,
- const struct ip_tunnel_key *key)
-{
- struct net *net = dev_net(dev);
-
- memset(fl, 0, sizeof(*fl));
- fl->daddr = key->u.ipv4.dst;
- fl->saddr = key->u.ipv4.src;
- fl->flowi4_tos = RT_TOS(key->tos);
- fl->flowi4_mark = skb->mark;
- fl->flowi4_proto = IPPROTO_GRE;
-
- return ip_route_output_key(net, fl);
-}
-
-static struct rtable *prepare_fb_xmit(struct sk_buff *skb,
- struct net_device *dev,
- struct flowi4 *fl,
- int tunnel_hlen)
-{
- struct ip_tunnel_info *tun_info;
- const struct ip_tunnel_key *key;
- struct rtable *rt = NULL;
- int min_headroom;
- bool use_cache;
- int err;
-
- tun_info = skb_tunnel_info(skb);
- key = &tun_info->key;
- use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
-
- if (use_cache)
- rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr);
- if (!rt) {
- rt = gre_get_rt(skb, dev, fl, key);
- if (IS_ERR(rt))
- goto err_free_skb;
- if (use_cache)
- dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
- fl->saddr);
- }
-
- min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
- + tunnel_hlen + sizeof(struct iphdr);
- if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
- int head_delta = SKB_DATA_ALIGN(min_headroom -
- skb_headroom(skb) +
- 16);
- err = pskb_expand_head(skb, max_t(int, head_delta, 0),
- 0, GFP_ATOMIC);
- if (unlikely(err))
- goto err_free_rt;
- }
- return rt;
-
-err_free_rt:
- ip_rt_put(rt);
-err_free_skb:
- kfree_skb(skb);
- dev->stats.tx_dropped++;
- return NULL;
-}
-
static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
__be16 proto)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
- struct rtable *rt = NULL;
- struct flowi4 fl;
int tunnel_hlen;
- __be16 df, flags;
tun_info = skb_tunnel_info(skb);
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
@@ -543,48 +510,45 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev,
key = &tun_info->key;
tunnel_hlen = gre_calc_hlen(key->tun_flags);
- rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
- if (!rt)
- return;
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto err_free_skb;
/* Push Tunnel header. */
- if (gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)))
- goto err_free_rt;
+ if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+ tunnel->parms.o_flags)))
+ goto err_free_skb;
+
+ __set_bit(IP_TUNNEL_CSUM_BIT, flags);
+ __set_bit(IP_TUNNEL_KEY_BIT, flags);
+ __set_bit(IP_TUNNEL_SEQ_BIT, flags);
+ ip_tunnel_flags_and(flags, tun_info->key.tun_flags, flags);
- flags = tun_info->key.tun_flags &
- (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
gre_build_header(skb, tunnel_hlen, flags, proto,
tunnel_id_to_key32(tun_info->key.tun_id),
- (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++) : 0);
+ test_bit(IP_TUNNEL_SEQ_BIT, flags) ?
+ htonl(atomic_fetch_inc(&tunnel->o_seqno)) : 0);
- df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+ ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
- iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
- key->tos, key->ttl, df, false);
return;
-err_free_rt:
- ip_rt_put(rt);
err_free_skb:
kfree_skb(skb);
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
}
-static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
- __be16 proto)
+static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
struct erspan_metadata *md;
- struct rtable *rt = NULL;
bool truncate = false;
- struct flowi4 fl;
+ __be16 proto;
int tunnel_hlen;
int version;
- __be16 df;
int nhoff;
- int thoff;
tun_info = skb_tunnel_info(skb);
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
@@ -592,77 +556,88 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev,
goto err_free_skb;
key = &tun_info->key;
- if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
- goto err_free_rt;
+ if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_info->key.tun_flags))
+ goto err_free_skb;
+ if (tun_info->options_len < sizeof(*md))
+ goto err_free_skb;
md = ip_tunnel_info_opts(tun_info);
- if (!md)
- goto err_free_rt;
/* ERSPAN has fixed 8 byte GRE header */
version = md->version;
tunnel_hlen = 8 + erspan_hdr_len(version);
- rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen);
- if (!rt)
- return;
+ if (skb_cow_head(skb, dev->needed_headroom))
+ goto err_free_skb;
if (gre_handle_offloads(skb, false))
- goto err_free_rt;
+ goto err_free_skb;
if (skb->len > dev->mtu + dev->hard_header_len) {
- pskb_trim(skb, dev->mtu + dev->hard_header_len);
+ if (pskb_trim(skb, dev->mtu + dev->hard_header_len))
+ goto err_free_skb;
truncate = true;
}
- nhoff = skb_network_header(skb) - skb_mac_header(skb);
+ nhoff = skb_network_offset(skb);
if (skb->protocol == htons(ETH_P_IP) &&
(ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
truncate = true;
- thoff = skb_transport_header(skb) - skb_mac_header(skb);
- if (skb->protocol == htons(ETH_P_IPV6) &&
- (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
- truncate = true;
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ int thoff;
+
+ if (skb_transport_header_was_set(skb))
+ thoff = skb_transport_offset(skb);
+ else
+ thoff = nhoff + sizeof(struct ipv6hdr);
+ if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)
+ truncate = true;
+ }
if (version == 1) {
erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)),
ntohl(md->u.index), truncate, true);
+ proto = htons(ETH_P_ERSPAN);
} else if (version == 2) {
erspan_build_header_v2(skb,
ntohl(tunnel_id_to_key32(key->tun_id)),
md->u.md2.dir,
get_hwid(&md->u.md2),
truncate, true);
+ proto = htons(ETH_P_ERSPAN2);
} else {
- goto err_free_rt;
+ goto err_free_skb;
}
- gre_build_header(skb, 8, TUNNEL_SEQ,
- htons(ETH_P_ERSPAN), 0, htonl(tunnel->o_seqno++));
+ __set_bit(IP_TUNNEL_SEQ_BIT, flags);
+ gre_build_header(skb, 8, flags, proto, 0,
+ htonl(atomic_fetch_inc(&tunnel->o_seqno)));
- df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+ ip_md_tunnel_xmit(skb, dev, IPPROTO_GRE, tunnel_hlen);
- iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
- key->tos, key->ttl, df, false);
return;
-err_free_rt:
- ip_rt_put(rt);
err_free_skb:
kfree_skb(skb);
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
}
static int gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ const struct ip_tunnel_key *key;
struct rtable *rt;
struct flowi4 fl4;
if (ip_tunnel_info_af(info) != AF_INET)
return -EINVAL;
- rt = gre_get_rt(skb, dev, &fl4, &info->key);
+ key = &info->key;
+ ip_tunnel_init_flow(&fl4, IPPROTO_GRE, key->u.ipv4.dst, key->u.ipv4.src,
+ tunnel_id_to_key32(key->tun_id),
+ key->tos & ~INET_ECN_MASK, dev_net(dev), 0,
+ skb->mark, skb_get_hash(skb), key->flow_flags);
+ rt = ip_route_output_key(dev_net(dev), &fl4);
if (IS_ERR(rt))
return PTR_ERR(rt);
@@ -677,24 +652,32 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *tnl_params;
+ if (!pskb_inet_may_pull(skb))
+ goto free_skb;
+
if (tunnel->collect_md) {
gre_fb_xmit(skb, dev, skb->protocol);
return NETDEV_TX_OK;
}
if (dev->header_ops) {
- /* Need space for new headers */
- if (skb_cow_head(skb, dev->needed_headroom -
- (tunnel->hlen + sizeof(struct iphdr))))
+ int pull_len = tunnel->hlen + sizeof(struct iphdr);
+
+ if (skb_cow_head(skb, 0))
+ goto free_skb;
+
+ if (!pskb_may_pull(skb, pull_len))
goto free_skb;
tnl_params = (const struct iphdr *)skb->data;
- /* Pull skb since ip_tunnel_xmit() needs skb->data pointing
- * to gre header.
- */
- skb_pull(skb, tunnel->hlen + sizeof(struct iphdr));
+ /* ip_tunnel_xmit() needs skb->data pointing to gre header. */
+ skb_pull(skb, pull_len);
skb_reset_mac_header(skb);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ skb_checksum_start(skb) < skb->data)
+ goto free_skb;
} else {
if (skb_cow_head(skb, dev->needed_headroom))
goto free_skb;
@@ -702,7 +685,8 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
tnl_params = &tunnel->parms.iph;
}
- if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
+ if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+ tunnel->parms.o_flags)))
goto free_skb;
__gre_xmit(skb, dev, tnl_params, skb->protocol);
@@ -710,7 +694,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
free_skb:
kfree_skb(skb);
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
return NETDEV_TX_OK;
}
@@ -719,9 +703,13 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
{
struct ip_tunnel *tunnel = netdev_priv(dev);
bool truncate = false;
+ __be16 proto;
+
+ if (!pskb_inet_may_pull(skb))
+ goto free_skb;
if (tunnel->collect_md) {
- erspan_fb_xmit(skb, dev, skb->protocol);
+ erspan_fb_xmit(skb, dev);
return NETDEV_TX_OK;
}
@@ -732,29 +720,36 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb,
goto free_skb;
if (skb->len > dev->mtu + dev->hard_header_len) {
- pskb_trim(skb, dev->mtu + dev->hard_header_len);
+ if (pskb_trim(skb, dev->mtu + dev->hard_header_len))
+ goto free_skb;
truncate = true;
}
/* Push ERSPAN header */
- if (tunnel->erspan_ver == 1)
+ if (tunnel->erspan_ver == 0) {
+ proto = htons(ETH_P_ERSPAN);
+ __clear_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags);
+ } else if (tunnel->erspan_ver == 1) {
erspan_build_header(skb, ntohl(tunnel->parms.o_key),
tunnel->index,
truncate, true);
- else if (tunnel->erspan_ver == 2)
+ proto = htons(ETH_P_ERSPAN);
+ } else if (tunnel->erspan_ver == 2) {
erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key),
tunnel->dir, tunnel->hwid,
truncate, true);
- else
+ proto = htons(ETH_P_ERSPAN2);
+ } else {
goto free_skb;
+ }
- tunnel->parms.o_flags &= ~TUNNEL_KEY;
- __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN));
+ __clear_bit(IP_TUNNEL_KEY_BIT, tunnel->parms.o_flags);
+ __gre_xmit(skb, dev, &tunnel->parms.iph, proto);
return NETDEV_TX_OK;
free_skb:
kfree_skb(skb);
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
return NETDEV_TX_OK;
}
@@ -763,12 +758,16 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
{
struct ip_tunnel *tunnel = netdev_priv(dev);
+ if (!pskb_inet_may_pull(skb))
+ goto free_skb;
+
if (tunnel->collect_md) {
gre_fb_xmit(skb, dev, htons(ETH_P_TEB));
return NETDEV_TX_OK;
}
- if (gre_handle_offloads(skb, !!(tunnel->parms.o_flags & TUNNEL_CSUM)))
+ if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+ tunnel->parms.o_flags)))
goto free_skb;
if (skb_cow_head(skb, dev->needed_headroom))
@@ -779,7 +778,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
free_skb:
kfree_skb(skb);
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
return NETDEV_TX_OK;
}
@@ -793,64 +792,67 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu)
len = tunnel->tun_hlen - len;
tunnel->hlen = tunnel->hlen + len;
- dev->needed_headroom = dev->needed_headroom + len;
+ if (dev->header_ops)
+ dev->hard_header_len += len;
+ else
+ dev->needed_headroom += len;
+
if (set_mtu)
- dev->mtu = max_t(int, dev->mtu - len, 68);
-
- if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
- if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
- tunnel->encap.type == TUNNEL_ENCAP_NONE) {
- dev->features |= NETIF_F_GSO_SOFTWARE;
- dev->hw_features |= NETIF_F_GSO_SOFTWARE;
- } else {
- dev->features &= ~NETIF_F_GSO_SOFTWARE;
- dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
- }
- dev->features |= NETIF_F_LLTX;
- } else {
+ WRITE_ONCE(dev->mtu, max_t(int, dev->mtu - len, 68));
+
+ if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags) ||
+ (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) &&
+ tunnel->encap.type != TUNNEL_ENCAP_NONE)) {
+ dev->features &= ~NETIF_F_GSO_SOFTWARE;
dev->hw_features &= ~NETIF_F_GSO_SOFTWARE;
- dev->features &= ~(NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE);
+ } else {
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
}
}
-static int ipgre_tunnel_ioctl(struct net_device *dev,
- struct ifreq *ifr, int cmd)
+static int ipgre_tunnel_ctl(struct net_device *dev,
+ struct ip_tunnel_parm_kern *p,
+ int cmd)
{
- struct ip_tunnel_parm p;
+ __be16 i_flags, o_flags;
int err;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- return -EFAULT;
+ if (!ip_tunnel_flags_is_be16_compat(p->i_flags) ||
+ !ip_tunnel_flags_is_be16_compat(p->o_flags))
+ return -EOVERFLOW;
+
+ i_flags = ip_tunnel_flags_to_be16(p->i_flags);
+ o_flags = ip_tunnel_flags_to_be16(p->o_flags);
if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
- if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
- p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) ||
- ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING)))
+ if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE ||
+ p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) ||
+ ((i_flags | o_flags) & (GRE_VERSION | GRE_ROUTING)))
return -EINVAL;
}
- p.i_flags = gre_flags_to_tnl_flags(p.i_flags);
- p.o_flags = gre_flags_to_tnl_flags(p.o_flags);
+ gre_flags_to_tnl_flags(p->i_flags, i_flags);
+ gre_flags_to_tnl_flags(p->o_flags, o_flags);
- err = ip_tunnel_ioctl(dev, &p, cmd);
+ err = ip_tunnel_ctl(dev, p, cmd);
if (err)
return err;
if (cmd == SIOCCHGTUNNEL) {
struct ip_tunnel *t = netdev_priv(dev);
- t->parms.i_flags = p.i_flags;
- t->parms.o_flags = p.o_flags;
+ ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags);
+ ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags);
if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
ipgre_link_update(dev, true);
}
- p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags);
- p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags);
-
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
- return -EFAULT;
+ i_flags = gre_tnl_flags_to_gre_flags(p->i_flags);
+ ip_tunnel_flags_from_be16(p->i_flags, i_flags);
+ o_flags = gre_tnl_flags_to_gre_flags(p->o_flags);
+ ip_tunnel_flags_from_be16(p->o_flags, o_flags);
return 0;
}
@@ -926,15 +928,18 @@ static int ipgre_open(struct net_device *dev)
struct ip_tunnel *t = netdev_priv(dev);
if (ipv4_is_multicast(t->parms.iph.daddr)) {
- struct flowi4 fl4;
+ struct flowi4 fl4 = {
+ .flowi4_oif = t->parms.link,
+ .flowi4_dscp = ip4h_dscp(&t->parms.iph),
+ .flowi4_scope = RT_SCOPE_UNIVERSE,
+ .flowi4_proto = IPPROTO_GRE,
+ .saddr = t->parms.iph.saddr,
+ .daddr = t->parms.iph.daddr,
+ .fl4_gre_key = t->parms.o_key,
+ };
struct rtable *rt;
- rt = ip_route_output_gre(t->net, &fl4,
- t->parms.iph.daddr,
- t->parms.iph.saddr,
- t->parms.o_key,
- RT_TOS(t->parms.iph.tos),
- t->parms.link);
+ rt = ip_route_output_key(t->net, &fl4);
if (IS_ERR(rt))
return -EADDRNOTAVAIL;
dev = rt->dst.dev;
@@ -969,10 +974,11 @@ static const struct net_device_ops ipgre_netdev_ops = {
.ndo_stop = ipgre_close,
#endif
.ndo_start_xmit = ipgre_xmit,
- .ndo_do_ioctl = ipgre_tunnel_ioctl,
+ .ndo_siocdevprivate = ip_tunnel_siocdevprivate,
.ndo_change_mtu = ip_tunnel_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
+ .ndo_tunnel_ctl = ipgre_tunnel_ctl,
};
#define GRE_FEATURES (NETIF_F_SG | \
@@ -996,26 +1002,24 @@ static void __gre_tunnel_init(struct net_device *dev)
tunnel->parms.iph.protocol = IPPROTO_GRE;
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+ dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph);
dev->features |= GRE_FEATURES;
dev->hw_features |= GRE_FEATURES;
- if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
- /* TCP offload with GRE SEQ is not supported, nor
- * can we support 2 levels of outer headers requiring
- * an update.
- */
- if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
- (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
- dev->features |= NETIF_F_GSO_SOFTWARE;
- dev->hw_features |= NETIF_F_GSO_SOFTWARE;
- }
+ /* TCP offload with GRE SEQ is not supported, nor can we support 2
+ * levels of outer headers requiring an update.
+ */
+ if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.o_flags))
+ return;
+ if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.o_flags) &&
+ tunnel->encap.type != TUNNEL_ENCAP_NONE)
+ return;
- /* Can use a lockless transmit, unless we generate
- * output sequences
- */
- dev->features |= NETIF_F_LLTX;
- }
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+
+ dev->lltx = true;
}
static int ipgre_tunnel_init(struct net_device *dev)
@@ -1025,7 +1029,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
__gre_tunnel_init(dev);
- memcpy(dev->dev_addr, &iph->saddr, 4);
+ __dev_addr_set(dev, &iph->saddr, 4);
memcpy(dev->broadcast, &iph->daddr, 4);
dev->flags = IFF_NOARP;
@@ -1039,10 +1043,14 @@ static int ipgre_tunnel_init(struct net_device *dev)
return -EINVAL;
dev->flags = IFF_BROADCAST;
dev->header_ops = &ipgre_header_ops;
+ dev->hard_header_len = tunnel->hlen + sizeof(*iph);
+ dev->needed_headroom = 0;
}
#endif
} else if (!tunnel->collect_md) {
dev->header_ops = &ipgre_header_ops;
+ dev->hard_header_len = tunnel->hlen + sizeof(*iph);
+ dev->needed_headroom = 0;
}
return ip_tunnel_init(dev);
@@ -1058,14 +1066,15 @@ static int __net_init ipgre_init_net(struct net *net)
return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}
-static void __net_exit ipgre_exit_batch_net(struct list_head *list_net)
+static void __net_exit ipgre_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops);
+ ip_tunnel_delete_net(net, ipgre_net_id, &ipgre_link_ops, dev_to_kill);
}
static struct pernet_operations ipgre_net_ops = {
.init = ipgre_init_net,
- .exit_batch = ipgre_exit_batch_net,
+ .exit_rtnl = ipgre_exit_rtnl,
.id = &ipgre_net_id,
.size = sizeof(struct ip_tunnel_net),
};
@@ -1132,7 +1141,11 @@ static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
if (ret)
return ret;
- /* ERSPAN should only have GRE sequence and key flag */
+ if (data[IFLA_GRE_ERSPAN_VER] &&
+ nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0)
+ return 0;
+
+ /* ERSPAN type II/III should only have GRE sequence and key flag */
if (data[IFLA_GRE_OFLAGS])
flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
if (data[IFLA_GRE_IFLAGS])
@@ -1158,7 +1171,7 @@ static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
static int ipgre_netlink_parms(struct net_device *dev,
struct nlattr *data[],
struct nlattr *tb[],
- struct ip_tunnel_parm *parms,
+ struct ip_tunnel_parm_kern *parms,
__u32 *fwmark)
{
struct ip_tunnel *t = netdev_priv(dev);
@@ -1174,10 +1187,12 @@ static int ipgre_netlink_parms(struct net_device *dev,
parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
if (data[IFLA_GRE_IFLAGS])
- parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS]));
+ gre_flags_to_tnl_flags(parms->i_flags,
+ nla_get_be16(data[IFLA_GRE_IFLAGS]));
if (data[IFLA_GRE_OFLAGS])
- parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS]));
+ gre_flags_to_tnl_flags(parms->o_flags,
+ nla_get_be16(data[IFLA_GRE_OFLAGS]));
if (data[IFLA_GRE_IKEY])
parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
@@ -1219,10 +1234,28 @@ static int ipgre_netlink_parms(struct net_device *dev,
if (data[IFLA_GRE_FWMARK])
*fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]);
+ return 0;
+}
+
+static int erspan_netlink_parms(struct net_device *dev,
+ struct nlattr *data[],
+ struct nlattr *tb[],
+ struct ip_tunnel_parm_kern *parms,
+ __u32 *fwmark)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err;
+
+ err = ipgre_netlink_parms(dev, data, tb, parms, fwmark);
+ if (err)
+ return err;
+ if (!data)
+ return 0;
+
if (data[IFLA_GRE_ERSPAN_VER]) {
t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
- if (t->erspan_ver != 1 && t->erspan_ver != 2)
+ if (t->erspan_ver > 2)
return -EINVAL;
}
@@ -1298,7 +1331,7 @@ static const struct net_device_ops gre_tap_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ip_tunnel_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
.ndo_fill_metadata_dst = gre_fill_metadata_dst,
};
@@ -1307,7 +1340,11 @@ static int erspan_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- tunnel->tun_hlen = 8;
+ if (tunnel->erspan_ver == 0)
+ tunnel->tun_hlen = 4; /* 4-byte GRE hdr. */
+ else
+ tunnel->tun_hlen = 8; /* 8-byte GRE hdr. */
+
tunnel->parms.iph.protocol = IPPROTO_GRE;
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
erspan_hdr_len(tunnel->erspan_ver);
@@ -1327,7 +1364,7 @@ static const struct net_device_ops erspan_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ip_tunnel_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
.ndo_fill_metadata_dst = gre_fill_metadata_dst,
};
@@ -1342,33 +1379,62 @@ static void ipgre_tap_setup(struct net_device *dev)
ip_tunnel_setup(dev, gre_tap_net_id);
}
-bool is_gretap_dev(const struct net_device *dev)
-{
- return dev->netdev_ops == &gre_tap_netdev_ops;
-}
-EXPORT_SYMBOL_GPL(is_gretap_dev);
-
-static int ipgre_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
- struct netlink_ext_ack *extack)
+static int
+ipgre_newlink_encap_setup(struct net_device *dev, struct nlattr *data[])
{
- struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
- __u32 fwmark = 0;
- int err;
if (ipgre_netlink_encap_parms(data, &ipencap)) {
struct ip_tunnel *t = netdev_priv(dev);
- err = ip_tunnel_encap_setup(t, &ipencap);
+ int err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
return err;
}
+ return 0;
+}
+
+static int ipgre_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
+ struct ip_tunnel_parm_kern p;
+ __u32 fwmark = 0;
+ int err;
+
+ err = ipgre_newlink_encap_setup(dev, data);
+ if (err)
+ return err;
+
err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
if (err < 0)
return err;
- return ip_tunnel_newlink(dev, tb, &p, fwmark);
+ return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p,
+ fwmark);
+}
+
+static int erspan_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
+ struct ip_tunnel_parm_kern p;
+ __u32 fwmark = 0;
+ int err;
+
+ err = ipgre_newlink_encap_setup(dev, data);
+ if (err)
+ return err;
+
+ err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
+ if (err)
+ return err;
+ return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p,
+ fwmark);
}
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
@@ -1376,17 +1442,13 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct ip_tunnel_encap ipencap;
+ struct ip_tunnel_parm_kern p;
__u32 fwmark = t->fwmark;
- struct ip_tunnel_parm p;
int err;
- if (ipgre_netlink_encap_parms(data, &ipencap)) {
- err = ip_tunnel_encap_setup(t, &ipencap);
-
- if (err < 0)
- return err;
- }
+ err = ipgre_newlink_encap_setup(dev, data);
+ if (err)
+ return err;
err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark);
if (err < 0)
@@ -1396,11 +1458,37 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
if (err < 0)
return err;
- t->parms.i_flags = p.i_flags;
- t->parms.o_flags = p.o_flags;
+ ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags);
+ ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags);
+
+ ipgre_link_update(dev, !tb[IFLA_MTU]);
+
+ return 0;
+}
+
+static int erspan_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm_kern p;
+ __u32 fwmark = t->fwmark;
+ int err;
+
+ err = ipgre_newlink_encap_setup(dev, data);
+ if (err)
+ return err;
+
+ err = erspan_netlink_parms(dev, data, tb, &p, &fwmark);
+ if (err < 0)
+ return err;
+
+ err = ip_tunnel_changelink(dev, tb, &p, fwmark);
+ if (err < 0)
+ return err;
- if (strcmp(dev->rtnl_link_ops->kind, "erspan"))
- ipgre_link_update(dev, !tb[IFLA_MTU]);
+ ip_tunnel_flags_copy(t->parms.i_flags, p.i_flags);
+ ip_tunnel_flags_copy(t->parms.o_flags, p.o_flags);
return 0;
}
@@ -1456,13 +1544,16 @@ static size_t ipgre_get_size(const struct net_device *dev)
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct ip_tunnel_parm *p = &t->parms;
+ struct ip_tunnel_parm_kern *p = &t->parms;
+ IP_TUNNEL_DECLARE_FLAGS(o_flags);
+
+ ip_tunnel_flags_copy(o_flags, p->o_flags);
if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
nla_put_be16(skb, IFLA_GRE_IFLAGS,
gre_tnl_flags_to_gre_flags(p->i_flags)) ||
nla_put_be16(skb, IFLA_GRE_OFLAGS,
- gre_tnl_flags_to_gre_flags(p->o_flags)) ||
+ gre_tnl_flags_to_gre_flags(o_flags)) ||
nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
@@ -1492,20 +1583,35 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
goto nla_put_failure;
}
- if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
- goto nla_put_failure;
+ return 0;
- if (t->erspan_ver == 1) {
- if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
- goto nla_put_failure;
- } else if (t->erspan_ver == 2) {
- if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
- goto nla_put_failure;
- if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int erspan_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ if (t->erspan_ver <= 2) {
+ if (t->erspan_ver != 0 && !t->collect_md)
+ __set_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags);
+
+ if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver))
goto nla_put_failure;
+
+ if (t->erspan_ver == 1) {
+ if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index))
+ goto nla_put_failure;
+ } else if (t->erspan_ver == 2) {
+ if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir))
+ goto nla_put_failure;
+ if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid))
+ goto nla_put_failure;
+ }
}
- return 0;
+ return ipgre_fill_info(skb, dev);
nla_put_failure:
return -EMSGSIZE;
@@ -1516,6 +1622,7 @@ static void erspan_setup(struct net_device *dev)
struct ip_tunnel *t = netdev_priv(dev);
ether_setup(dev);
+ dev->max_mtu = 0;
dev->netdev_ops = &erspan_netdev_ops;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
@@ -1529,8 +1636,8 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
[IFLA_GRE_IKEY] = { .type = NLA_U32 },
[IFLA_GRE_OKEY] = { .type = NLA_U32 },
- [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
- [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+ [IFLA_GRE_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) },
+ [IFLA_GRE_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) },
[IFLA_GRE_TTL] = { .type = NLA_U8 },
[IFLA_GRE_TOS] = { .type = NLA_U8 },
[IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
@@ -1584,17 +1691,18 @@ static struct rtnl_link_ops erspan_link_ops __read_mostly = {
.priv_size = sizeof(struct ip_tunnel),
.setup = erspan_setup,
.validate = erspan_validate,
- .newlink = ipgre_newlink,
- .changelink = ipgre_changelink,
+ .newlink = erspan_newlink,
+ .changelink = erspan_changelink,
.dellink = ip_tunnel_dellink,
.get_size = ipgre_get_size,
- .fill_info = ipgre_fill_info,
+ .fill_info = erspan_fill_info,
.get_link_net = ip_tunnel_get_link_net,
};
struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
u8 name_assign_type)
{
+ struct rtnl_newlink_params params = { .src_net = net };
struct nlattr *tb[IFLA_MAX + 1];
struct net_device *dev;
LIST_HEAD(list_kill);
@@ -1602,9 +1710,10 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
int err;
memset(&tb, 0, sizeof(tb));
+ params.tb = tb;
dev = rtnl_create_link(net, name, name_assign_type,
- &ipgre_tap_ops, tb);
+ &ipgre_tap_ops, tb, NULL);
if (IS_ERR(dev))
return dev;
@@ -1612,7 +1721,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
t = netdev_priv(dev);
t->collect_md = true;
- err = ipgre_newlink(net, dev, tb, NULL, NULL);
+ err = ipgre_newlink(dev, &params, NULL);
if (err < 0) {
free_netdev(dev);
return ERR_PTR(err);
@@ -1625,7 +1734,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
if (err)
goto out;
- err = rtnl_configure_link(dev, NULL);
+ err = rtnl_configure_link(dev, NULL, 0, NULL);
if (err < 0)
goto out;
@@ -1642,14 +1751,15 @@ static int __net_init ipgre_tap_init_net(struct net *net)
return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
}
-static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net)
+static void __net_exit ipgre_tap_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops);
+ ip_tunnel_delete_net(net, gre_tap_net_id, &ipgre_tap_ops, dev_to_kill);
}
static struct pernet_operations ipgre_tap_net_ops = {
.init = ipgre_tap_init_net,
- .exit_batch = ipgre_tap_exit_batch_net,
+ .exit_rtnl = ipgre_tap_exit_rtnl,
.id = &gre_tap_net_id,
.size = sizeof(struct ip_tunnel_net),
};
@@ -1660,14 +1770,15 @@ static int __net_init erspan_init_net(struct net *net)
&erspan_link_ops, "erspan0");
}
-static void __net_exit erspan_exit_batch_net(struct list_head *net_list)
+static void __net_exit erspan_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops);
+ ip_tunnel_delete_net(net, erspan_net_id, &erspan_link_ops, dev_to_kill);
}
static struct pernet_operations erspan_net_ops = {
.init = erspan_init_net,
- .exit_batch = erspan_exit_batch_net,
+ .exit_rtnl = erspan_exit_rtnl,
.id = &erspan_net_id,
.size = sizeof(struct ip_tunnel_net),
};
@@ -1738,6 +1849,7 @@ static void __exit ipgre_fini(void)
module_init(ipgre_init);
module_exit(ipgre_fini);
+MODULE_DESCRIPTION("IPv4 GRE tunnels over IP library");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3196cf58f418..19d3141dad1f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -14,7 +15,6 @@
* Jorge Cwik, <jorge@laser.satlink.net>
* Arnt Gulbrandsen, <agulbra@nvg.unit.no>
*
- *
* Fixes:
* Alan Cox : Commented a couple of minor bits of surplus code
* Alan Cox : Undefining IP_FORWARD doesn't include the code
@@ -96,8 +96,6 @@
* Jos Vos : Do accounting *before* call_in_firewall
* Willy Konynenberg : Transparent proxying support
*
- *
- *
* To Fix:
* IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
* and could be made very efficient with the addition of some virtual memory hacks to permit
@@ -106,11 +104,6 @@
* interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
* output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
* fragmentation anyway.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "IPv4: " fmt
@@ -130,6 +123,7 @@
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
+#include <linux/indirect_call_wrapper.h>
#include <net/snmp.h>
#include <net/ip.h>
@@ -147,6 +141,8 @@
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <net/dst_metadata.h>
+#include <net/udp.h>
+#include <net/tcp.h>
/*
* Process Router Attention IP option (RFC 2113)
@@ -188,51 +184,61 @@ bool ip_call_ra_chain(struct sk_buff *skb)
return false;
}
-static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *));
+INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *));
+void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
- __skb_pull(skb, skb_network_header_len(skb));
-
- rcu_read_lock();
- {
- int protocol = ip_hdr(skb)->protocol;
- const struct net_protocol *ipprot;
- int raw;
-
- resubmit:
- raw = raw_local_deliver(skb, protocol);
-
- ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot) {
- int ret;
-
- if (!ipprot->no_policy) {
- if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- kfree_skb(skb);
- goto out;
- }
- nf_reset(skb);
+ const struct net_protocol *ipprot;
+ int raw, ret;
+
+resubmit:
+ raw = raw_local_deliver(skb, protocol);
+
+ ipprot = rcu_dereference(inet_protos[protocol]);
+ if (ipprot) {
+ if (!ipprot->no_policy) {
+ if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ kfree_skb_reason(skb,
+ SKB_DROP_REASON_XFRM_POLICY);
+ return;
}
- ret = ipprot->handler(skb);
- if (ret < 0) {
- protocol = -ret;
- goto resubmit;
+ nf_reset_ct(skb);
+ }
+ ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
+ skb);
+ if (ret < 0) {
+ protocol = -ret;
+ goto resubmit;
+ }
+ __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ } else {
+ if (!raw) {
+ if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
+ icmp_send(skb, ICMP_DEST_UNREACH,
+ ICMP_PROT_UNREACH, 0);
}
- __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO);
} else {
- if (!raw) {
- if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- __IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
- icmp_send(skb, ICMP_DEST_UNREACH,
- ICMP_PROT_UNREACH, 0);
- }
- kfree_skb(skb);
- } else {
- __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
- consume_skb(skb);
- }
+ __IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
+ consume_skb(skb);
}
}
- out:
+}
+
+static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) {
+ __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM);
+ return 0;
+ }
+
+ skb_clear_delivery_time(skb);
+ __skb_pull(skb, skb_network_header_len(skb));
+
+ rcu_read_lock();
+ ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
rcu_read_unlock();
return 0;
@@ -257,12 +263,13 @@ int ip_local_deliver(struct sk_buff *skb)
net, NULL, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
+EXPORT_SYMBOL(ip_local_deliver);
-static inline bool ip_rcv_options(struct sk_buff *skb)
+static inline enum skb_drop_reason
+ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
{
- struct ip_options *opt;
const struct iphdr *iph;
- struct net_device *dev = skb->dev;
+ struct ip_options *opt;
/* It looks as overkill, because not all
IP options require packet mangling.
@@ -273,7 +280,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
*/
if (skb_cow(skb, skb_headroom(skb))) {
__IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS);
- goto drop;
+ return SKB_DROP_REASON_NOMEM;
}
iph = ip_hdr(skb);
@@ -282,7 +289,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
if (ip_options_compile(dev_net(dev), opt, skb)) {
__IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
- goto drop;
+ return SKB_DROP_REASON_IP_INHDR;
}
if (unlikely(opt->srr)) {
@@ -294,42 +301,62 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
net_info_ratelimited("source route option %pI4 -> %pI4\n",
&iph->saddr,
&iph->daddr);
- goto drop;
+ return SKB_DROP_REASON_NOT_SPECIFIED;
}
}
- if (ip_options_rcv_srr(skb))
- goto drop;
+ if (ip_options_rcv_srr(skb, dev))
+ return SKB_DROP_REASON_NOT_SPECIFIED;
}
- return false;
-drop:
- return true;
+ return SKB_NOT_DROPPED_YET;
}
-static int ip_rcv_finish_core(struct net *net, struct sock *sk,
- struct sk_buff *skb)
+static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
+ const struct sk_buff *hint)
+{
+ return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr &&
+ ip_hdr(hint)->tos == iph->tos;
+}
+
+static int ip_rcv_finish_core(struct net *net,
+ struct sk_buff *skb, struct net_device *dev,
+ const struct sk_buff *hint)
{
const struct iphdr *iph = ip_hdr(skb);
- int (*edemux)(struct sk_buff *skb);
- struct net_device *dev = skb->dev;
struct rtable *rt;
- int err;
+ int drop_reason;
+
+ if (ip_can_use_hint(skb, iph, hint)) {
+ drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr,
+ ip4h_dscp(iph), dev, hint);
+ if (unlikely(drop_reason))
+ goto drop_error;
+ }
- if (net->ipv4.sysctl_ip_early_demux &&
+ if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
!skb_dst(skb) &&
!skb->sk &&
!ip_is_fragment(iph)) {
- const struct net_protocol *ipprot;
- int protocol = iph->protocol;
-
- ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
- err = edemux(skb);
- if (unlikely(err))
- goto drop_error;
- /* must reload iph, skb->head might have changed */
- iph = ip_hdr(skb);
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) {
+ tcp_v4_early_demux(skb);
+
+ /* must reload iph, skb->head might have changed */
+ iph = ip_hdr(skb);
+ }
+ break;
+ case IPPROTO_UDP:
+ if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) {
+ drop_reason = udp_v4_early_demux(skb);
+ if (unlikely(drop_reason))
+ goto drop_error;
+
+ /* must reload iph, skb->head might have changed */
+ iph = ip_hdr(skb);
+ }
+ break;
}
}
@@ -338,10 +365,15 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
* how the packet travels inside Linux networking.
*/
if (!skb_valid_dst(skb)) {
- err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
- iph->tos, dev);
- if (unlikely(err))
+ drop_reason = ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ ip4h_dscp(iph), dev);
+ if (unlikely(drop_reason))
goto drop_error;
+ } else {
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+ if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY))
+ IPCB(skb)->flags |= IPSKB_NOPOLICY;
}
#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -355,8 +387,11 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
}
#endif
- if (iph->ihl > 5 && ip_rcv_options(skb))
- goto drop;
+ if (iph->ihl > 5) {
+ drop_reason = ip_rcv_options(skb, dev);
+ if (drop_reason)
+ goto drop;
+ }
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
@@ -383,24 +418,27 @@ static int ip_rcv_finish_core(struct net *net, struct sock *sk,
* so-called "hole-196" attack) so do it for both.
*/
if (in_dev &&
- IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
+ IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) {
+ drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST;
goto drop;
+ }
}
return NET_RX_SUCCESS;
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
return NET_RX_DROP;
drop_error:
- if (err == -EXDEV)
+ if (drop_reason == SKB_DROP_REASON_IP_RPFILTER)
__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
goto drop;
}
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ struct net_device *dev = skb->dev;
int ret;
/* if ingress device is enslaved to an L3 master device pass the
@@ -410,7 +448,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
if (!skb)
return NET_RX_SUCCESS;
- ret = ip_rcv_finish_core(net, sk, skb);
+ ret = ip_rcv_finish_core(net, skb, dev, NULL);
if (ret != NET_RX_DROP)
ret = dst_input(skb);
return ret;
@@ -422,14 +460,17 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
{
const struct iphdr *iph;
+ int drop_reason;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*/
- if (skb->pkt_type == PACKET_OTHERHOST)
+ if (skb->pkt_type == PACKET_OTHERHOST) {
+ dev_core_stats_rx_otherhost_dropped_inc(skb->dev);
+ drop_reason = SKB_DROP_REASON_OTHERHOST;
goto drop;
-
+ }
__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
@@ -439,6 +480,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
goto out;
}
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
@@ -473,8 +515,9 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto csum_error;
- len = ntohs(iph->tot_len);
+ len = iph_totlen(skb, iph);
if (skb->len < len) {
+ drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
@@ -489,6 +532,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
goto drop;
}
+ iph = ip_hdr(skb);
skb->transport_header = skb->network_header + iph->ihl*4;
/* Remove any debris in the socket control block */
@@ -496,16 +540,20 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
IPCB(skb)->iif = skb->skb_iif;
/* Must drop socket now because of tproxy. */
- skb_orphan(skb);
+ if (!skb_sk_is_prefetched(skb))
+ skb_orphan(skb);
return skb;
csum_error:
+ drop_reason = SKB_DROP_REASON_IP_CSUM;
__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
inhdr_error:
+ if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED)
+ drop_reason = SKB_DROP_REASON_IP_INHDR;
__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, drop_reason);
out:
return NULL;
}
@@ -521,6 +569,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
skb = ip_rcv_core(skb, net);
if (skb == NULL)
return NET_RX_DROP;
+
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
net, NULL, skb, dev, NULL,
ip_rcv_finish);
@@ -531,38 +580,49 @@ static void ip_sublist_rcv_finish(struct list_head *head)
struct sk_buff *skb, *next;
list_for_each_entry_safe(skb, next, head, list) {
- list_del(&skb->list);
- /* Handle ip{6}_forward case, as sch_direct_xmit have
- * another kind of SKB-list usage (see validate_xmit_skb_list)
- */
- skb->next = NULL;
+ skb_list_del_init(skb);
dst_input(skb);
}
}
-static void ip_list_rcv_finish(struct net *net, struct sock *sk,
- struct list_head *head)
+static struct sk_buff *ip_extract_route_hint(const struct net *net,
+ struct sk_buff *skb)
{
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (fib4_has_custom_rules(net) ||
+ ipv4_is_lbcast(iph->daddr) ||
+ ipv4_is_zeronet(iph->daddr) ||
+ IPCB(skb)->flags & IPSKB_MULTIPATH)
+ return NULL;
+
+ return skb;
+}
+
+static void ip_list_rcv_finish(struct net *net, struct list_head *head)
+{
+ struct sk_buff *skb, *next, *hint = NULL;
struct dst_entry *curr_dst = NULL;
- struct sk_buff *skb, *next;
- struct list_head sublist;
+ LIST_HEAD(sublist);
- INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
+ struct net_device *dev = skb->dev;
struct dst_entry *dst;
- list_del(&skb->list);
+ skb_list_del_init(skb);
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
*/
skb = l3mdev_ip_rcv(skb);
if (!skb)
continue;
- if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP)
+ if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP)
continue;
dst = skb_dst(skb);
if (curr_dst != dst) {
+ hint = ip_extract_route_hint(net, skb);
+
/* dispatch old sublist */
if (!list_empty(&sublist))
ip_sublist_rcv_finish(&sublist);
@@ -581,7 +641,7 @@ static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
{
NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
head, dev, NULL, ip_rcv_finish);
- ip_list_rcv_finish(net, NULL, head);
+ ip_list_rcv_finish(net, head);
}
/* Receive a list of IP packets */
@@ -591,14 +651,13 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
struct net_device *curr_dev = NULL;
struct net *curr_net = NULL;
struct sk_buff *skb, *next;
- struct list_head sublist;
+ LIST_HEAD(sublist);
- INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
struct net_device *dev = skb->dev;
struct net *net = dev_net(dev);
- list_del(&skb->list);
+ skb_list_del_init(skb);
skb = ip_rcv_core(skb, net);
if (skb == NULL)
continue;
@@ -615,5 +674,6 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
list_add_tail(&skb->list, &sublist);
}
/* dispatch final sublist */
- ip_sublist_rcv(&sublist, curr_dev, curr_net);
+ if (!list_empty(&sublist))
+ ip_sublist_rcv(&sublist, curr_dev, curr_net);
}
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ed194d46c00e..be8815ce3ac2 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -17,7 +17,7 @@
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/uaccess.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/icmp.h>
@@ -42,39 +42,26 @@
*/
void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
- __be32 daddr, struct rtable *rt, int is_frag)
+ __be32 daddr, struct rtable *rt)
{
unsigned char *iph = skb_network_header(skb);
memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
- memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
+ memcpy(iph + sizeof(struct iphdr), opt->__data, opt->optlen);
opt = &(IPCB(skb)->opt);
if (opt->srr)
- memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
+ memcpy(iph + opt->srr + iph[opt->srr + 1] - 4, &daddr, 4);
- if (!is_frag) {
- if (opt->rr_needaddr)
- ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, skb, rt);
- if (opt->ts_needaddr)
- ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
- if (opt->ts_needtime) {
- __be32 midtime;
+ if (opt->rr_needaddr)
+ ip_rt_get_source(iph + opt->rr + iph[opt->rr + 2] - 5, skb, rt);
+ if (opt->ts_needaddr)
+ ip_rt_get_source(iph + opt->ts + iph[opt->ts + 2] - 9, skb, rt);
+ if (opt->ts_needtime) {
+ __be32 midtime;
- midtime = inet_current_timestamp();
- memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
- }
- return;
- }
- if (opt->rr) {
- memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
- opt->rr = 0;
- opt->rr_needaddr = 0;
- }
- if (opt->ts) {
- memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
- opt->ts = 0;
- opt->ts_needaddr = opt->ts_needtime = 0;
+ midtime = inet_current_timestamp();
+ memcpy(iph + opt->ts + iph[opt->ts + 2] - 5, &midtime, 4);
}
}
@@ -251,8 +238,9 @@ static void spec_dst_fill(__be32 *spec_dst, struct sk_buff *skb)
* If opt == NULL, then skb->data should point to IP header.
*/
-int ip_options_compile(struct net *net,
- struct ip_options *opt, struct sk_buff *skb)
+int __ip_options_compile(struct net *net,
+ struct ip_options *opt, struct sk_buff *skb,
+ __be32 *info)
{
__be32 spec_dst = htonl(INADDR_ANY);
unsigned char *pp_ptr = NULL;
@@ -468,11 +456,23 @@ eol:
return 0;
error:
- if (skb) {
- icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
- }
+ if (info)
+ *info = htonl((pp_ptr-iph)<<24);
return -EINVAL;
}
+EXPORT_SYMBOL(__ip_options_compile);
+
+int ip_options_compile(struct net *net,
+ struct ip_options *opt, struct sk_buff *skb)
+{
+ int ret;
+ __be32 info;
+
+ ret = __ip_options_compile(net, opt, skb, &info);
+ if (ret != 0 && skb)
+ icmp_send(skb, ICMP_PARAMETERPROB, 0, info);
+ return ret;
+}
EXPORT_SYMBOL(ip_options_compile);
/*
@@ -482,39 +482,47 @@ EXPORT_SYMBOL(ip_options_compile);
void ip_options_undo(struct ip_options *opt)
{
if (opt->srr) {
- unsigned char *optptr = opt->__data+opt->srr-sizeof(struct iphdr);
- memmove(optptr+7, optptr+3, optptr[1]-7);
- memcpy(optptr+3, &opt->faddr, 4);
+ unsigned char *optptr = opt->__data + opt->srr - sizeof(struct iphdr);
+
+ memmove(optptr + 7, optptr + 3, optptr[1] - 7);
+ memcpy(optptr + 3, &opt->faddr, 4);
}
if (opt->rr_needaddr) {
- unsigned char *optptr = opt->__data+opt->rr-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data + opt->rr - sizeof(struct iphdr);
+
optptr[2] -= 4;
- memset(&optptr[optptr[2]-1], 0, 4);
+ memset(&optptr[optptr[2] - 1], 0, 4);
}
if (opt->ts) {
- unsigned char *optptr = opt->__data+opt->ts-sizeof(struct iphdr);
+ unsigned char *optptr = opt->__data + opt->ts - sizeof(struct iphdr);
+
if (opt->ts_needtime) {
optptr[2] -= 4;
- memset(&optptr[optptr[2]-1], 0, 4);
- if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
+ memset(&optptr[optptr[2] - 1], 0, 4);
+ if ((optptr[3] & 0xF) == IPOPT_TS_PRESPEC)
optptr[2] -= 4;
}
if (opt->ts_needaddr) {
optptr[2] -= 4;
- memset(&optptr[optptr[2]-1], 0, 4);
+ memset(&optptr[optptr[2] - 1], 0, 4);
}
}
}
-static struct ip_options_rcu *ip_options_get_alloc(const int optlen)
+int ip_options_get(struct net *net, struct ip_options_rcu **optp,
+ sockptr_t data, int optlen)
{
- return kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3),
+ struct ip_options_rcu *opt;
+
+ opt = kzalloc(sizeof(struct ip_options_rcu) + ((optlen + 3) & ~3),
GFP_KERNEL);
-}
+ if (!opt)
+ return -ENOMEM;
+ if (optlen && copy_from_sockptr(opt->opt.__data, data, optlen)) {
+ kfree(opt);
+ return -EFAULT;
+ }
-static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp,
- struct ip_options_rcu *opt, int optlen)
-{
while (optlen & 3)
opt->opt.__data[optlen++] = IPOPT_END;
opt->opt.optlen = optlen;
@@ -527,32 +535,6 @@ static int ip_options_get_finish(struct net *net, struct ip_options_rcu **optp,
return 0;
}
-int ip_options_get_from_user(struct net *net, struct ip_options_rcu **optp,
- unsigned char __user *data, int optlen)
-{
- struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
-
- if (!opt)
- return -ENOMEM;
- if (optlen && copy_from_user(opt->opt.__data, data, optlen)) {
- kfree(opt);
- return -EFAULT;
- }
- return ip_options_get_finish(net, optp, opt, optlen);
-}
-
-int ip_options_get(struct net *net, struct ip_options_rcu **optp,
- unsigned char *data, int optlen)
-{
- struct ip_options_rcu *opt = ip_options_get_alloc(optlen);
-
- if (!opt)
- return -ENOMEM;
- if (optlen)
- memcpy(opt->opt.__data, data, optlen);
- return ip_options_get_finish(net, optp, opt, optlen);
-}
-
void ip_forward_options(struct sk_buff *skb)
{
struct ip_options *opt = &(IPCB(skb)->opt);
@@ -600,7 +582,7 @@ void ip_forward_options(struct sk_buff *skb)
}
}
-int ip_options_rcv_srr(struct sk_buff *skb)
+int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev)
{
struct ip_options *opt = &(IPCB(skb)->opt);
int srrspace, srrptr;
@@ -633,13 +615,13 @@ int ip_options_rcv_srr(struct sk_buff *skb)
}
memcpy(&nexthop, &optptr[srrptr-1], 4);
- orefdst = skb->_skb_refdst;
- skb_dst_set(skb, NULL);
- err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
+ orefdst = skb_dstref_steal(skb);
+ err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph),
+ dev) ? -EINVAL : 0;
rt2 = skb_rtable(skb);
if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
skb_dst_drop(skb);
- skb->_skb_refdst = orefdst;
+ skb_dstref_restore(skb, orefdst);
return -EINVAL;
}
refdst_drop(orefdst);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 9c4e72e9c60a..ff11d3a85a36 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -33,7 +34,7 @@
* Andi Kleen : Replace ip_reply with ip_send_reply.
* Andi Kleen : Split fast and slow ip_build_xmit path
* for decreased register pressure on x86
- * and more readibility.
+ * and more readability.
* Marc Boucher : When call_out_firewall returns FW_QUEUE,
* silently drop skb instead of failing with -EPERM.
* Detlev Wengorz : Copy protocol for fragments.
@@ -62,6 +63,7 @@
#include <linux/stat.h>
#include <linux/init.h>
+#include <net/flow.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -72,14 +74,17 @@
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
+#include <net/gso.h>
#include <net/inetpeer.h>
#include <net/lwtunnel.h>
+#include <net/inet_dscp.h>
#include <linux/bpf-cgroup.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
+#include <net/psp.h>
static int
ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
@@ -98,7 +103,9 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
- iph->tot_len = htons(skb->len);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTREQUESTS);
+
+ iph_set_totlen(iph, skb->len);
ip_send_check(iph);
/* if egress device is enslaved to an L3 master device pass the
@@ -111,7 +118,7 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
skb->protocol = htons(ETH_P_IP);
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
- net, sk, skb, NULL, skb_dst(skb)->dev,
+ net, sk, skb, NULL, skb_dst_dev(skb),
dst_output);
}
@@ -127,9 +134,10 @@ int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL_GPL(ip_local_out);
-static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
+static inline int ip_select_ttl(const struct inet_sock *inet,
+ const struct dst_entry *dst)
{
- int ttl = inet->uc_ttl;
+ int ttl = READ_ONCE(inet->uc_ttl);
if (ttl < 0)
ttl = ip4_dst_hoplimit(dst);
@@ -141,9 +149,10 @@ static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
*
*/
int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
- __be32 saddr, __be32 daddr, struct ip_options_rcu *opt)
+ __be32 saddr, __be32 daddr, struct ip_options_rcu *opt,
+ u8 tos)
{
- struct inet_sock *inet = inet_sk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = skb_rtable(skb);
struct net *net = sock_net(sk);
struct iphdr *iph;
@@ -154,27 +163,34 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = 5;
- iph->tos = inet->tos;
+ iph->tos = tos;
iph->ttl = ip_select_ttl(inet, &rt->dst);
iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
iph->saddr = saddr;
iph->protocol = sk->sk_protocol;
- if (ip_dont_fragment(sk, &rt->dst)) {
+ /* Do not bother generating IPID for small packets (eg SYNACK) */
+ if (skb->len <= IPV4_MIN_MTU || ip_dont_fragment(sk, &rt->dst)) {
iph->frag_off = htons(IP_DF);
iph->id = 0;
} else {
iph->frag_off = 0;
- __ip_select_ident(net, iph, 1);
+ /* TCP packets here are SYNACK with fat IPv4/TCP options.
+ * Avoid using the hashed IP ident generator.
+ */
+ if (sk->sk_protocol == IPPROTO_TCP)
+ iph->id = (__force __be16)get_random_u16();
+ else
+ __ip_select_ident(net, iph, 1);
}
if (opt && opt->opt.optlen) {
iph->ihl += opt->opt.optlen>>2;
- ip_options_build(skb, &opt->opt, daddr, rt, 0);
+ ip_options_build(skb, &opt->opt, daddr, rt);
}
- skb->priority = sk->sk_priority;
+ skb->priority = READ_ONCE(sk->sk_priority);
if (!skb->mark)
- skb->mark = sk->sk_mark;
+ skb->mark = READ_ONCE(sk->sk_mark);
/* Send it out. */
return ip_local_out(net, skb->sk, skb);
@@ -184,66 +200,57 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- struct rtable *rt = (struct rtable *)dst;
- struct net_device *dev = dst->dev;
+ struct rtable *rt = dst_rtable(dst);
+ struct net_device *dev = dst_dev(dst);
unsigned int hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
- u32 nexthop;
+ bool is_v6gw = false;
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
- /* Be paranoid, rather than too clever. */
- if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
- struct sk_buff *skb2;
+ /* OUTOCTETS should be counted after fragment */
+ IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
- skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
- if (!skb2) {
- kfree_skb(skb);
+ if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb)
return -ENOMEM;
- }
- if (skb->sk)
- skb_set_owner_w(skb2, skb->sk);
- consume_skb(skb);
- skb = skb2;
}
if (lwtunnel_xmit_redirect(dst->lwtstate)) {
int res = lwtunnel_xmit(skb);
- if (res < 0 || res == LWTUNNEL_XMIT_DONE)
+ if (res != LWTUNNEL_XMIT_CONTINUE)
return res;
}
- rcu_read_lock_bh();
- nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
- neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
- if (unlikely(!neigh))
- neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
+ rcu_read_lock();
+ neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
if (!IS_ERR(neigh)) {
int res;
sock_confirm_neigh(skb, neigh);
- res = neigh_output(neigh, skb);
-
- rcu_read_unlock_bh();
+ /* if crossing protocols, can not use the cached header */
+ res = neigh_output(neigh, skb, is_v6gw);
+ rcu_read_unlock();
return res;
}
- rcu_read_unlock_bh();
+ rcu_read_unlock();
net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
__func__);
- kfree_skb(skb);
- return -EINVAL;
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
+ return PTR_ERR(neigh);
}
static int ip_finish_output_gso(struct net *net, struct sock *sk,
struct sk_buff *skb, unsigned int mtu)
{
+ struct sk_buff *segs, *nskb;
netdev_features_t features;
- struct sk_buff *segs;
int ret = 0;
/* common case: seglen is <= mtu
@@ -262,10 +269,10 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
* interface with a smaller MTU.
* - Arriving GRO skb (or GSO skb in a virtualized environment) that is
* bridged to a NETIF_F_TSO tunnel stacked over an interface with an
- * insufficent MTU.
+ * insufficient MTU.
*/
features = netif_skb_features(skb);
- BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
+ BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_GSO_CB_OFFSET);
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs)) {
kfree_skb(skb);
@@ -274,31 +281,22 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
consume_skb(skb);
- do {
- struct sk_buff *nskb = segs->next;
+ skb_list_walk_safe(segs, segs, nskb) {
int err;
- segs->next = NULL;
+ skb_mark_not_on_list(segs);
err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
if (err && ret == 0)
ret = err;
- segs = nskb;
- } while (segs);
+ }
return ret;
}
-static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int __ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
unsigned int mtu;
- int ret;
-
- ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
- if (ret) {
- kfree_skb(skb);
- return ret;
- }
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
@@ -311,24 +309,60 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
if (skb_is_gso(skb))
return ip_finish_output_gso(net, sk, skb, mtu);
- if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
+ if (skb->len > mtu || IPCB(skb)->frag_max_size)
return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
return ip_finish_output2(net, sk, skb);
}
+static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ switch (ret) {
+ case NET_XMIT_SUCCESS:
+ return __ip_finish_output(net, sk, skb);
+ case NET_XMIT_CN:
+ return __ip_finish_output(net, sk, skb) ? : ret;
+ default:
+ kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
+ return ret;
+ }
+}
+
static int ip_mc_finish_output(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
- int ret;
+ struct rtable *new_rt;
+ bool do_cn = false;
+ int ret, err;
ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
- if (ret) {
- kfree_skb(skb);
+ switch (ret) {
+ case NET_XMIT_CN:
+ do_cn = true;
+ fallthrough;
+ case NET_XMIT_SUCCESS:
+ break;
+ default:
+ kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
return ret;
}
- return dev_loopback_xmit(net, sk, skb);
+ /* Reset rt_iif so that inet_iif() will return skb->skb_iif. Setting
+ * this to non-zero causes ipi_ifindex in in_pktinfo to be overwritten,
+ * see ipv4_pktinfo_prepare().
+ */
+ new_rt = rt_dst_clone(net->loopback_dev, skb_rtable(skb));
+ if (new_rt) {
+ new_rt->rt_iif = 0;
+ skb_dst_drop(skb);
+ skb_dst_set(skb, &new_rt->dst);
+ }
+
+ err = dev_loopback_xmit(net, sk, skb);
+ return (do_cn && err) ? ret : err;
}
int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
@@ -339,8 +373,6 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
/*
* If the indicated interface is up and running, send the packet.
*/
- IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
-
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
@@ -395,18 +427,22 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net_device *dev = skb_dst(skb)->dev;
-
- IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
+ struct net_device *dev, *indev = skb->dev;
+ int ret_val;
+ rcu_read_lock();
+ dev = skb_dst_dev_rcu(skb);
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
- return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
- net, sk, skb, NULL, dev,
- ip_finish_output,
- !(IPCB(skb)->flags & IPSKB_REROUTED));
+ ret_val = NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
+ net, sk, skb, indev, dev,
+ ip_finish_output,
+ !(IPCB(skb)->flags & IPSKB_REROUTED));
+ rcu_read_unlock();
+ return ret_val;
}
+EXPORT_SYMBOL(ip_output);
/*
* copy saddr and daddr, possibly using 64bit load/stores
@@ -418,8 +454,9 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
{
BUILD_BUG_ON(offsetof(typeof(*fl4), daddr) !=
offsetof(typeof(*fl4), saddr) + sizeof(fl4->saddr));
- memcpy(&iph->saddr, &fl4->saddr,
- sizeof(fl4->saddr) + sizeof(fl4->daddr));
+
+ iph->saddr = fl4->saddr;
+ iph->daddr = fl4->daddr;
}
/* Note: skb->sk can be different from sk, in case of tunnels */
@@ -445,26 +482,18 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
goto packet_routed;
/* Make sure we can route this packet. */
- rt = (struct rtable *)__sk_dst_check(sk, 0);
+ rt = dst_rtable(__sk_dst_check(sk, 0));
if (!rt) {
- __be32 daddr;
+ inet_sk_init_flowi4(inet, fl4);
- /* Use correct destination address if we have options. */
- daddr = inet->inet_daddr;
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
+ /* sctp_v4_xmit() uses its own DSCP value */
+ fl4->flowi4_dscp = inet_dsfield_to_dscp(tos);
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
* itself out.
*/
- rt = ip_route_output_ports(net, fl4, sk,
- daddr, inet->inet_saddr,
- inet->inet_dport,
- inet->inet_sport,
- sk->sk_protocol,
- RT_CONN_FLAGS_TOS(sk, tos),
- sk->sk_bound_dev_if);
+ rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
sk_setup_caps(sk, &rt->dst);
@@ -492,15 +521,15 @@ packet_routed:
if (inet_opt && inet_opt->opt.optlen) {
iph->ihl += inet_opt->opt.optlen >> 2;
- ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
+ ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt);
}
ip_select_ident_segs(net, skb, sk,
skb_shinfo(skb)->gso_segs ?: 1);
/* TODO : should we use skb->sk here instead of sk ? */
- skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
+ skb->priority = READ_ONCE(sk->sk_priority);
+ skb->mark = READ_ONCE(sk->sk_mark);
res = ip_local_out(net, sk, skb);
rcu_read_unlock();
@@ -509,16 +538,23 @@ packet_routed:
no_route:
rcu_read_unlock();
IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_OUTNOROUTES);
return -EHOSTUNREACH;
}
EXPORT_SYMBOL(__ip_queue_xmit);
+int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
+{
+ return __ip_queue_xmit(sk, skb, fl, READ_ONCE(inet_sk(sk)->tos));
+}
+EXPORT_SYMBOL(ip_queue_xmit);
+
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
to->pkt_type = from->pkt_type;
to->priority = from->priority;
to->protocol = from->protocol;
+ to->skb_iif = from->skb_iif;
skb_dst_drop(to);
skb_dst_copy(to, from);
to->dev = from->dev;
@@ -526,13 +562,11 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
skb_copy_hash(to, from);
- /* Copy the flags to each fragment. */
- IPCB(to)->flags = IPCB(from)->flags;
-
#ifdef CONFIG_NET_SCHED
to->tc_index = from->tc_index;
#endif
nf_copy(to, from);
+ skb_ext_copy(to, from);
#if IS_ENABLED(CONFIG_IP_VS)
to->ipvs_property = from->ipvs_property;
#endif
@@ -561,6 +595,162 @@ static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
return ip_do_fragment(net, sk, skb, output);
}
+void ip_fraglist_init(struct sk_buff *skb, struct iphdr *iph,
+ unsigned int hlen, struct ip_fraglist_iter *iter)
+{
+ unsigned int first_len = skb_pagelen(skb);
+
+ iter->frag = skb_shinfo(skb)->frag_list;
+ skb_frag_list_init(skb);
+
+ iter->offset = 0;
+ iter->iph = iph;
+ iter->hlen = hlen;
+
+ skb->data_len = first_len - skb_headlen(skb);
+ skb->len = first_len;
+ iph->tot_len = htons(first_len);
+ iph->frag_off = htons(IP_MF);
+ ip_send_check(iph);
+}
+EXPORT_SYMBOL(ip_fraglist_init);
+
+void ip_fraglist_prepare(struct sk_buff *skb, struct ip_fraglist_iter *iter)
+{
+ unsigned int hlen = iter->hlen;
+ struct iphdr *iph = iter->iph;
+ struct sk_buff *frag;
+
+ frag = iter->frag;
+ frag->ip_summed = CHECKSUM_NONE;
+ skb_reset_transport_header(frag);
+ __skb_push(frag, hlen);
+ skb_reset_network_header(frag);
+ memcpy(skb_network_header(frag), iph, hlen);
+ iter->iph = ip_hdr(frag);
+ iph = iter->iph;
+ iph->tot_len = htons(frag->len);
+ ip_copy_metadata(frag, skb);
+ iter->offset += skb->len - hlen;
+ iph->frag_off = htons(iter->offset >> 3);
+ if (frag->next)
+ iph->frag_off |= htons(IP_MF);
+ /* Ready, complete checksum */
+ ip_send_check(iph);
+}
+EXPORT_SYMBOL(ip_fraglist_prepare);
+
+void ip_frag_init(struct sk_buff *skb, unsigned int hlen,
+ unsigned int ll_rs, unsigned int mtu, bool DF,
+ struct ip_frag_state *state)
+{
+ struct iphdr *iph = ip_hdr(skb);
+
+ state->DF = DF;
+ state->hlen = hlen;
+ state->ll_rs = ll_rs;
+ state->mtu = mtu;
+
+ state->left = skb->len - hlen; /* Space per frame */
+ state->ptr = hlen; /* Where to start from */
+
+ state->offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
+ state->not_last_frag = iph->frag_off & htons(IP_MF);
+}
+EXPORT_SYMBOL(ip_frag_init);
+
+static void ip_frag_ipcb(struct sk_buff *from, struct sk_buff *to,
+ bool first_frag)
+{
+ /* Copy the flags to each fragment. */
+ IPCB(to)->flags = IPCB(from)->flags;
+
+ /* ANK: dirty, but effective trick. Upgrade options only if
+ * the segment to be fragmented was THE FIRST (otherwise,
+ * options are already fixed) and make it ONCE
+ * on the initial skb, so that all the following fragments
+ * will inherit fixed options.
+ */
+ if (first_frag)
+ ip_options_fragment(from);
+}
+
+struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state)
+{
+ unsigned int len = state->left;
+ struct sk_buff *skb2;
+ struct iphdr *iph;
+
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > state->mtu)
+ len = state->mtu;
+ /* IF: we are not sending up to and including the packet end
+ then align the next start on an eight byte boundary */
+ if (len < state->left) {
+ len &= ~7;
+ }
+
+ /* Allocate buffer */
+ skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC);
+ if (!skb2)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Set up data on packet
+ */
+
+ ip_copy_metadata(skb2, skb);
+ skb_reserve(skb2, state->ll_rs);
+ skb_put(skb2, len + state->hlen);
+ skb_reset_network_header(skb2);
+ skb2->transport_header = skb2->network_header + state->hlen;
+
+ /*
+ * Charge the memory for the fragment to any owner
+ * it might possess
+ */
+
+ if (skb->sk)
+ skb_set_owner_w(skb2, skb->sk);
+
+ /*
+ * Copy the packet header into the new buffer.
+ */
+
+ skb_copy_from_linear_data(skb, skb_network_header(skb2), state->hlen);
+
+ /*
+ * Copy a block of the IP datagram.
+ */
+ if (skb_copy_bits(skb, state->ptr, skb_transport_header(skb2), len))
+ BUG();
+ state->left -= len;
+
+ /*
+ * Fill in the new header fields.
+ */
+ iph = ip_hdr(skb2);
+ iph->frag_off = htons((state->offset >> 3));
+ if (state->DF)
+ iph->frag_off |= htons(IP_DF);
+
+ /*
+ * Added AC : If we are fragmenting a fragment that's not the
+ * last fragment then keep MF on each bit
+ */
+ if (state->left > 0 || state->not_last_frag)
+ iph->frag_off |= htons(IP_MF);
+ state->ptr += len;
+ state->offset += len;
+
+ iph->tot_len = htons(len + state->hlen);
+
+ ip_send_check(iph);
+
+ return skb2;
+}
+EXPORT_SYMBOL(ip_frag_next);
+
/*
* This IP datagram is too large to be sent in one piece. Break it up into
* smaller pieces (each of size equal to IP header plus
@@ -572,12 +762,13 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
int (*output)(struct net *, struct sock *, struct sk_buff *))
{
struct iphdr *iph;
- int ptr;
struct sk_buff *skb2;
- unsigned int mtu, hlen, left, len, ll_rs;
- int offset;
- __be16 not_last_frag;
+ u8 tstamp_type = skb->tstamp_type;
struct rtable *rt = skb_rtable(skb);
+ unsigned int mtu, hlen, ll_rs;
+ struct ip_fraglist_iter iter;
+ ktime_t tstamp = skb->tstamp;
+ struct ip_frag_state state;
int err = 0;
/* for offloaded checksums cleanup checksum before fragmentation */
@@ -642,49 +833,37 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
}
/* Everything is OK. Generate! */
-
- err = 0;
- offset = 0;
- frag = skb_shinfo(skb)->frag_list;
- skb_frag_list_init(skb);
- skb->data_len = first_len - skb_headlen(skb);
- skb->len = first_len;
- iph->tot_len = htons(first_len);
- iph->frag_off = htons(IP_MF);
- ip_send_check(iph);
+ ip_fraglist_init(skb, iph, hlen, &iter);
for (;;) {
/* Prepare header of the next frame,
* before previous one went down. */
- if (frag) {
- frag->ip_summed = CHECKSUM_NONE;
- skb_reset_transport_header(frag);
- __skb_push(frag, hlen);
- skb_reset_network_header(frag);
- memcpy(skb_network_header(frag), iph, hlen);
- iph = ip_hdr(frag);
- iph->tot_len = htons(frag->len);
- ip_copy_metadata(frag, skb);
- if (offset == 0)
- ip_options_fragment(frag);
- offset += skb->len - hlen;
- iph->frag_off = htons(offset>>3);
- if (frag->next)
- iph->frag_off |= htons(IP_MF);
- /* Ready, complete checksum */
- ip_send_check(iph);
+ if (iter.frag) {
+ bool first_frag = (iter.offset == 0);
+
+ IPCB(iter.frag)->flags = IPCB(skb)->flags;
+ ip_fraglist_prepare(skb, &iter);
+ if (first_frag && IPCB(skb)->opt.optlen) {
+ /* ipcb->opt is not populated for frags
+ * coming from __ip_make_skb(),
+ * ip_options_fragment() needs optlen
+ */
+ IPCB(iter.frag)->opt.optlen =
+ IPCB(skb)->opt.optlen;
+ ip_options_fragment(iter.frag);
+ ip_send_check(iter.iph);
+ }
}
+ skb_set_delivery_time(skb, tstamp, tstamp_type);
err = output(net, sk, skb);
if (!err)
IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
- if (err || !frag)
+ if (err || !iter.frag)
break;
- skb = frag;
- frag = skb->next;
- skb->next = NULL;
+ skb = ip_fraglist_next(&iter);
}
if (err == 0) {
@@ -692,11 +871,8 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
return 0;
}
- while (frag) {
- skb = frag->next;
- kfree_skb(frag);
- frag = skb;
- }
+ kfree_skb_list(iter.frag);
+
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
return err;
@@ -711,105 +887,31 @@ slow_path_clean:
}
slow_path:
- iph = ip_hdr(skb);
-
- left = skb->len - hlen; /* Space per frame */
- ptr = hlen; /* Where to start from */
-
/*
* Fragment the datagram.
*/
- offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
- not_last_frag = iph->frag_off & htons(IP_MF);
+ ip_frag_init(skb, hlen, ll_rs, mtu, IPCB(skb)->flags & IPSKB_FRAG_PMTU,
+ &state);
/*
* Keep copying data until we run out.
*/
- while (left > 0) {
- len = left;
- /* IF: it doesn't fit, use 'mtu' - the data space left */
- if (len > mtu)
- len = mtu;
- /* IF: we are not sending up to and including the packet end
- then align the next start on an eight byte boundary */
- if (len < left) {
- len &= ~7;
- }
+ while (state.left > 0) {
+ bool first_frag = (state.offset == 0);
- /* Allocate buffer */
- skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC);
- if (!skb2) {
- err = -ENOMEM;
+ skb2 = ip_frag_next(skb, &state);
+ if (IS_ERR(skb2)) {
+ err = PTR_ERR(skb2);
goto fail;
}
-
- /*
- * Set up data on packet
- */
-
- ip_copy_metadata(skb2, skb);
- skb_reserve(skb2, ll_rs);
- skb_put(skb2, len + hlen);
- skb_reset_network_header(skb2);
- skb2->transport_header = skb2->network_header + hlen;
-
- /*
- * Charge the memory for the fragment to any owner
- * it might possess
- */
-
- if (skb->sk)
- skb_set_owner_w(skb2, skb->sk);
-
- /*
- * Copy the packet header into the new buffer.
- */
-
- skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);
-
- /*
- * Copy a block of the IP datagram.
- */
- if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
- BUG();
- left -= len;
-
- /*
- * Fill in the new header fields.
- */
- iph = ip_hdr(skb2);
- iph->frag_off = htons((offset >> 3));
-
- if (IPCB(skb)->flags & IPSKB_FRAG_PMTU)
- iph->frag_off |= htons(IP_DF);
-
- /* ANK: dirty, but effective trick. Upgrade options only if
- * the segment to be fragmented was THE FIRST (otherwise,
- * options are already fixed) and make it ONCE
- * on the initial skb, so that all the following fragments
- * will inherit fixed options.
- */
- if (offset == 0)
- ip_options_fragment(skb);
-
- /*
- * Added AC : If we are fragmenting a fragment that's not the
- * last fragment then keep MF on each bit
- */
- if (left > 0 || not_last_frag)
- iph->frag_off |= htons(IP_MF);
- ptr += len;
- offset += len;
+ ip_frag_ipcb(skb, skb2, first_frag);
/*
* Put this fragment into the sending queue.
*/
- iph->tot_len = htons(len + hlen);
-
- ip_send_check(iph);
-
+ skb_set_delivery_time(skb2, tstamp, tstamp_type);
err = output(net, sk, skb2);
if (err)
goto fail;
@@ -845,17 +947,6 @@ ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk
}
EXPORT_SYMBOL(ip_generic_getfrag);
-static inline __wsum
-csum_page(struct page *page, int offset, int copy)
-{
- char *kaddr;
- __wsum csum;
- kaddr = kmap(page);
- csum = csum_partial(kaddr + offset, copy, 0);
- kunmap(page);
- return csum;
-}
-
static int __ip_append_data(struct sock *sk,
struct flowi4 *fl4,
struct sk_buff_head *queue,
@@ -867,8 +958,8 @@ static int __ip_append_data(struct sock *sk,
unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
+ struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
-
struct ip_options *opt = cork->opt;
int hh_len;
int exthdrlen;
@@ -876,12 +967,13 @@ static int __ip_append_data(struct sock *sk,
int copy;
int err;
int offset = 0;
+ bool zc = false;
unsigned int maxfraglen, fragheaderlen, maxnonfragsize;
int csummode = CHECKSUM_NONE;
- struct rtable *rt = (struct rtable *)cork->dst;
+ struct rtable *rt = dst_rtable(cork->dst);
+ bool paged, hold_tskey = false, extra_uref = false;
unsigned int wmem_alloc_delta = 0;
u32 tskey = 0;
- bool paged;
skb = skb_peek_tail(queue);
@@ -889,15 +981,11 @@ static int __ip_append_data(struct sock *sk,
mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
paged = !!cork->gso_size;
- if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
- sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
- tskey = sk->sk_tskey++;
-
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
- maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
+ maxnonfragsize = ip_sk_ignore_df(sk) ? IP_MAX_MTU : mtu;
if (cork->length + length > maxnonfragsize - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
@@ -916,8 +1004,60 @@ static int __ip_append_data(struct sock *sk,
(!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
csummode = CHECKSUM_PARTIAL;
+ if ((flags & MSG_ZEROCOPY) && length) {
+ struct msghdr *msg = from;
+
+ if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
+ if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
+ return -EINVAL;
+
+ /* Leave uarg NULL if can't zerocopy, callers should
+ * be able to handle it.
+ */
+ if ((rt->dst.dev->features & NETIF_F_SG) &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ zc = true;
+ uarg = msg->msg_ubuf;
+ }
+ } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
+ uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
+ false);
+ if (!uarg)
+ return -ENOBUFS;
+ extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
+ if (rt->dst.dev->features & NETIF_F_SG &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ zc = true;
+ } else {
+ uarg_to_msgzc(uarg)->zerocopy = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
+ }
+ }
+ } else if ((flags & MSG_SPLICE_PAGES) && length) {
+ if (inet_test_bit(HDRINCL, sk))
+ return -EPERM;
+ if (rt->dst.dev->features & NETIF_F_SG &&
+ getfrag == ip_generic_getfrag)
+ /* We need an empty buffer to attach stuff to */
+ paged = true;
+ else
+ flags &= ~MSG_SPLICE_PAGES;
+ }
+
cork->length += length;
+ if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
+ READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
+ if (cork->flags & IPCORK_TS_OPT_ID) {
+ tskey = cork->ts_opt_id;
+ } else {
+ tskey = atomic_inc_return(&sk->sk_tskey) - 1;
+ hold_tskey = true;
+ }
+ }
+
/* So, what's going on in the loop below?
*
* We use calculated fragment length to generate chained skb,
@@ -938,8 +1078,8 @@ static int __ip_append_data(struct sock *sk,
unsigned int datalen;
unsigned int fraglen;
unsigned int fraggap;
- unsigned int alloclen;
- unsigned int pagedlen = 0;
+ unsigned int alloclen, alloc_extra;
+ unsigned int pagedlen;
struct sk_buff *skb_prev;
alloc_new_skb:
skb_prev = skb;
@@ -956,18 +1096,10 @@ alloc_new_skb:
if (datalen > mtu - fragheaderlen)
datalen = maxfraglen - fragheaderlen;
fraglen = datalen + fragheaderlen;
+ pagedlen = 0;
- if ((flags & MSG_MORE) &&
- !(rt->dst.dev->features&NETIF_F_SG))
- alloclen = mtu;
- else if (!paged)
- alloclen = fraglen;
- else {
- alloclen = min_t(int, fraglen, MAX_HEADER);
- pagedlen = fraglen - alloclen;
- }
-
- alloclen += exthdrlen;
+ alloc_extra = hh_len + 15;
+ alloc_extra += exthdrlen;
/* The last fragment gets additional space at tail.
* Note, with MSG_MORE we overallocate on fragments,
@@ -975,17 +1107,30 @@ alloc_new_skb:
* the last.
*/
if (datalen == length + fraggap)
- alloclen += rt->dst.trailer_len;
+ alloc_extra += rt->dst.trailer_len;
+
+ if ((flags & MSG_MORE) &&
+ !(rt->dst.dev->features&NETIF_F_SG))
+ alloclen = mtu;
+ else if (!paged &&
+ (fraglen + alloc_extra < SKB_MAX_ALLOC ||
+ !(rt->dst.dev->features & NETIF_F_SG)))
+ alloclen = fraglen;
+ else {
+ alloclen = fragheaderlen + transhdrlen;
+ pagedlen = datalen - transhdrlen;
+ }
+
+ alloclen += alloc_extra;
if (transhdrlen) {
- skb = sock_alloc_send_skb(sk,
- alloclen + hh_len + 15,
+ skb = sock_alloc_send_skb(sk, alloclen,
(flags & MSG_DONTWAIT), &err);
} else {
skb = NULL;
if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
2 * sk->sk_sndbuf)
- skb = alloc_skb(alloclen + hh_len + 15,
+ skb = alloc_skb(alloclen,
sk->sk_allocation);
if (unlikely(!skb))
err = -ENOBUFS;
@@ -1000,12 +1145,6 @@ alloc_new_skb:
skb->csum = 0;
skb_reserve(skb, hh_len);
- /* only the initial fragment is time stamped */
- skb_shinfo(skb)->tx_flags = cork->tx_flags;
- cork->tx_flags = 0;
- skb_shinfo(skb)->tskey = tskey;
- tskey = 0;
-
/*
* Find where to start putting bytes.
*/
@@ -1018,7 +1157,7 @@ alloc_new_skb:
if (fraggap) {
skb->csum = skb_copy_and_csum_bits(
skb_prev, maxfraglen,
- data + transhdrlen, fraggap, 0);
+ data + transhdrlen, fraggap);
skb_prev->csum = csum_sub(skb_prev->csum,
skb->csum);
data += fraggap;
@@ -1026,10 +1165,18 @@ alloc_new_skb:
}
copy = datalen - transhdrlen - fraggap - pagedlen;
- if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+ /* [!] NOTE: copy will be negative if pagedlen>0
+ * because then the equation reduces to -fraggap.
+ */
+ if (copy > 0 &&
+ INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from, data + transhdrlen, offset,
+ copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
+ } else if (flags & MSG_SPLICE_PAGES) {
+ copy = 0;
}
offset += copy;
@@ -1038,6 +1185,13 @@ alloc_new_skb:
exthdrlen = 0;
csummode = CHECKSUM_NONE;
+ /* only the initial fragment is time stamped */
+ skb_shinfo(skb)->tx_flags = cork->tx_flags;
+ cork->tx_flags = 0;
+ skb_shinfo(skb)->tskey = tskey;
+ tskey = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
+
if ((flags & MSG_CONFIRM) && !skb_prev)
skb_set_dst_pending_confirm(skb, 1);
@@ -1061,19 +1215,33 @@ alloc_new_skb:
unsigned int off;
off = skb->len;
- if (getfrag(from, skb_put(skb, copy),
- offset, copy, off, skb) < 0) {
+ if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from, skb_put(skb, copy),
+ offset, copy, off, skb) < 0) {
__skb_trim(skb, off);
err = -EFAULT;
goto error;
}
- } else {
+ } else if (flags & MSG_SPLICE_PAGES) {
+ struct msghdr *msg = from;
+
+ err = -EIO;
+ if (WARN_ON_ONCE(copy > msg->msg_iter.count))
+ goto error;
+
+ err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
+ if (err < 0)
+ goto error;
+ copy = err;
+ wmem_alloc_delta += copy;
+ } else if (!zc) {
int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM;
if (!sk_page_frag_refill(sk, pfrag))
goto error;
+ skb_zcopy_downgrade_managed(skb);
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
err = -EMSGSIZE;
@@ -1086,17 +1254,20 @@ alloc_new_skb:
get_page(pfrag->page);
}
copy = min_t(int, copy, pfrag->size - pfrag->offset);
- if (getfrag(from,
+ if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from,
page_address(pfrag->page) + pfrag->offset,
offset, copy, skb->len, skb) < 0)
goto error_efault;
pfrag->offset += copy;
skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
- skb->len += copy;
- skb->data_len += copy;
- skb->truesize += copy;
+ skb_len_add(skb, copy);
wmem_alloc_delta += copy;
+ } else {
+ err = skb_zerocopy_iter_dgram(skb, from, copy);
+ if (err < 0)
+ goto error;
}
offset += copy;
length -= copy;
@@ -1109,9 +1280,12 @@ alloc_new_skb:
error_efault:
err = -EFAULT;
error:
+ net_zcopy_put_abort(uarg, extra_uref);
cork->length -= length;
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
+ if (hold_tskey)
+ atomic_dec(&sk->sk_tskey);
return err;
}
@@ -1125,6 +1299,12 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
if (unlikely(!rt))
return -EFAULT;
+ cork->fragsize = ip_sk_use_pmtu(sk) ?
+ dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
+
+ if (!inetdev_valid_mtu(cork->fragsize))
+ return -ENETUNREACH;
+
/*
* setup for corking.
*/
@@ -1141,31 +1321,33 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
cork->addr = ipc->addr;
}
- /*
- * We steal reference to this route, caller should not release it
- */
- *rtp = NULL;
- cork->fragsize = ip_sk_use_pmtu(sk) ?
- dst_mtu(&rt->dst) : rt->dst.dev->mtu;
-
cork->gso_size = ipc->gso_size;
+
cork->dst = &rt->dst;
+ /* We stole this route, caller should not release it. */
+ *rtp = NULL;
+
cork->length = 0;
cork->ttl = ipc->ttl;
cork->tos = ipc->tos;
- cork->priority = ipc->priority;
+ cork->mark = ipc->sockc.mark;
+ cork->priority = ipc->sockc.priority;
cork->transmit_time = ipc->sockc.transmit_time;
cork->tx_flags = 0;
- sock_tx_timestamp(sk, ipc->sockc.tsflags, &cork->tx_flags);
+ sock_tx_timestamp(sk, &ipc->sockc, &cork->tx_flags);
+ if (ipc->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
+ cork->flags |= IPCORK_TS_OPT_ID;
+ cork->ts_opt_id = ipc->sockc.ts_opt_id;
+ }
return 0;
}
/*
- * ip_append_data() and ip_append_page() can make one large IP datagram
- * from many pieces of data. Each pieces will be holded on the socket
- * until ip_push_pending_frames() is called. Each piece can be a page
- * or non-page data.
+ * ip_append_data() can make one large IP datagram from many pieces of
+ * data. Each piece will be held on the socket until
+ * ip_push_pending_frames() is called. Each piece can be a page or
+ * non-page data.
*
* Not only UDP, other transport protocols - e.g. raw sockets - can use
* this interface potentially.
@@ -1198,136 +1380,6 @@ int ip_append_data(struct sock *sk, struct flowi4 *fl4,
from, length, transhdrlen, flags);
}
-ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
- int offset, size_t size, int flags)
-{
- struct inet_sock *inet = inet_sk(sk);
- struct sk_buff *skb;
- struct rtable *rt;
- struct ip_options *opt = NULL;
- struct inet_cork *cork;
- int hh_len;
- int mtu;
- int len;
- int err;
- unsigned int maxfraglen, fragheaderlen, fraggap, maxnonfragsize;
-
- if (inet->hdrincl)
- return -EPERM;
-
- if (flags&MSG_PROBE)
- return 0;
-
- if (skb_queue_empty(&sk->sk_write_queue))
- return -EINVAL;
-
- cork = &inet->cork.base;
- rt = (struct rtable *)cork->dst;
- if (cork->flags & IPCORK_OPT)
- opt = cork->opt;
-
- if (!(rt->dst.dev->features&NETIF_F_SG))
- return -EOPNOTSUPP;
-
- hh_len = LL_RESERVED_SPACE(rt->dst.dev);
- mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
-
- fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
- maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
- maxnonfragsize = ip_sk_ignore_df(sk) ? 0xFFFF : mtu;
-
- if (cork->length + size > maxnonfragsize - fragheaderlen) {
- ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport,
- mtu - (opt ? opt->optlen : 0));
- return -EMSGSIZE;
- }
-
- skb = skb_peek_tail(&sk->sk_write_queue);
- if (!skb)
- return -EINVAL;
-
- cork->length += size;
-
- while (size > 0) {
- /* Check if the remaining data fits into current packet. */
- len = mtu - skb->len;
- if (len < size)
- len = maxfraglen - skb->len;
-
- if (len <= 0) {
- struct sk_buff *skb_prev;
- int alloclen;
-
- skb_prev = skb;
- fraggap = skb_prev->len - maxfraglen;
-
- alloclen = fragheaderlen + hh_len + fraggap + 15;
- skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
- if (unlikely(!skb)) {
- err = -ENOBUFS;
- goto error;
- }
-
- /*
- * Fill in the control structures
- */
- skb->ip_summed = CHECKSUM_NONE;
- skb->csum = 0;
- skb_reserve(skb, hh_len);
-
- /*
- * Find where to start putting bytes.
- */
- skb_put(skb, fragheaderlen + fraggap);
- skb_reset_network_header(skb);
- skb->transport_header = (skb->network_header +
- fragheaderlen);
- if (fraggap) {
- skb->csum = skb_copy_and_csum_bits(skb_prev,
- maxfraglen,
- skb_transport_header(skb),
- fraggap, 0);
- skb_prev->csum = csum_sub(skb_prev->csum,
- skb->csum);
- pskb_trim_unique(skb_prev, maxfraglen);
- }
-
- /*
- * Put the packet on the pending queue.
- */
- __skb_queue_tail(&sk->sk_write_queue, skb);
- continue;
- }
-
- if (len > size)
- len = size;
-
- if (skb_append_pagefrags(skb, page, offset, len)) {
- err = -EMSGSIZE;
- goto error;
- }
-
- if (skb->ip_summed == CHECKSUM_NONE) {
- __wsum csum;
- csum = csum_page(page, offset, len);
- skb->csum = csum_block_add(skb->csum, csum, skb->len);
- }
-
- skb->len += len;
- skb->data_len += len;
- skb->truesize += len;
- refcount_add(len, &sk->sk_wmem_alloc);
- offset += len;
- size -= len;
- }
- return 0;
-
-error:
- cork->length -= size;
- IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
- return err;
-}
-
static void ip_cork_release(struct inet_cork *cork)
{
cork->flags &= ~IPCORK_OPT;
@@ -1351,10 +1403,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
struct ip_options *opt = NULL;
- struct rtable *rt = (struct rtable *)cork->dst;
+ struct rtable *rt = dst_rtable(cork->dst);
struct iphdr *iph;
+ u8 pmtudisc, ttl;
__be16 df = 0;
- __u8 ttl;
skb = __skb_dequeue(queue);
if (!skb)
@@ -1384,8 +1436,9 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
/* DF bit is set when we want to see DF on outgoing frames.
* If ignore_df is set too, we still allow to fragment this frame
* locally. */
- if (inet->pmtudisc == IP_PMTUDISC_DO ||
- inet->pmtudisc == IP_PMTUDISC_PROBE ||
+ pmtudisc = READ_ONCE(inet->pmtudisc);
+ if (pmtudisc == IP_PMTUDISC_DO ||
+ pmtudisc == IP_PMTUDISC_PROBE ||
(skb->len <= dst_mtu(&rt->dst) &&
ip_dont_fragment(sk, &rt->dst)))
df = htons(IP_DF);
@@ -1396,14 +1449,14 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
if (cork->ttl != 0)
ttl = cork->ttl;
else if (rt->rt_type == RTN_MULTICAST)
- ttl = inet->mc_ttl;
+ ttl = READ_ONCE(inet->mc_ttl);
else
ttl = ip_select_ttl(inet, &rt->dst);
iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = 5;
- iph->tos = (cork->tos != -1) ? cork->tos : inet->tos;
+ iph->tos = (cork->tos != -1) ? cork->tos : READ_ONCE(inet->tos);
iph->frag_off = df;
iph->ttl = ttl;
iph->protocol = sk->sk_protocol;
@@ -1411,13 +1464,16 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
ip_select_ident(net, skb, sk);
if (opt) {
- iph->ihl += opt->optlen>>2;
- ip_options_build(skb, opt, cork->addr, rt, 0);
+ iph->ihl += opt->optlen >> 2;
+ ip_options_build(skb, opt, cork->addr, rt);
}
- skb->priority = (cork->tos != -1) ? cork->priority: sk->sk_priority;
- skb->mark = sk->sk_mark;
- skb->tstamp = cork->transmit_time;
+ skb->priority = cork->priority;
+ skb->mark = cork->mark;
+ if (sk_is_tcp(sk))
+ skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC);
+ else
+ skb_set_delivery_type_by_clockid(skb, cork->transmit_time, sk->sk_clockid);
/*
* Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
* on dst refcount
@@ -1425,9 +1481,20 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
cork->dst = NULL;
skb_dst_set(skb, &rt->dst);
- if (iph->protocol == IPPROTO_ICMP)
- icmp_out_count(net, ((struct icmphdr *)
- skb_transport_header(skb))->type);
+ if (iph->protocol == IPPROTO_ICMP) {
+ u8 icmp_type;
+
+ /* For such sockets, transhdrlen is zero when ip_append_data() is
+ * called, so the icmphdr is not in the skb linear region and the
+ * ICMP type cannot be read through icmp_hdr(skb)->type.
+ */
+ if (sk->sk_type == SOCK_RAW &&
+ !(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH))
+ icmp_type = fl4->fl4_icmp_type;
+ else
+ icmp_type = icmp_hdr(skb)->type;
+ icmp_out_count(net, icmp_type);
+ }
ip_cork_release(cork);
out:
@@ -1523,7 +1590,7 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
{
__wsum csum;
- csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
+ csum = csum_partial_copy_nocheck(dptr+offset, to, len);
skb->csum = csum_block_add(skb->csum, csum, odd);
return 0;
}
@@ -1532,11 +1599,12 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
* Generic function to send a packet as reply to another packet.
* Used to send some TCP resets/acks so far.
*/
-void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
+void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
+ struct sk_buff *skb,
const struct ip_options *sopt,
__be32 daddr, __be32 saddr,
const struct ip_reply_arg *arg,
- unsigned int len)
+ unsigned int len, u64 transmit_time, u32 txhash)
{
struct ip_options_data replyopts;
struct ipcm_cookie ipc;
@@ -1552,6 +1620,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
ipcm_init(&ipc);
ipc.addr = daddr;
+ ipc.sockc.transmit_time = transmit_time;
if (replyopts.opt.opt.optlen) {
ipc.opt = &replyopts.opt;
@@ -1566,24 +1635,23 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
flowi4_init_output(&fl4, oif,
IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
- RT_TOS(arg->tos),
+ arg->tos & INET_DSCP_MASK,
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
ip_reply_arg_flowi_flags(arg),
daddr, saddr,
tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
arg->uid);
- security_skb_classify_flow(skb, flowi4_to_flowi(&fl4));
- rt = ip_route_output_key(net, &fl4);
+ security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
+ rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt))
return;
inet_sk(sk)->tos = arg->tos;
- sk->sk_priority = skb->priority;
sk->sk_protocol = ip_hdr(skb)->protocol;
sk->sk_bound_dev_if = arg->bound_dev_if;
- sk->sk_sndbuf = sysctl_wmem_default;
- sk->sk_mark = fl4.flowi4_mark;
+ sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
+ ipc.sockc.mark = fl4.flowi4_mark;
err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
len, 0, &ipc, &rt, MSG_DONTWAIT);
if (unlikely(err)) {
@@ -1598,6 +1666,14 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
nskb->ip_summed = CHECKSUM_NONE;
+ if (orig_sk) {
+ skb_set_owner_edemux(nskb, (struct sock *)orig_sk);
+ psp_reply_set_decrypted(orig_sk, nskb);
+ }
+ if (transmit_time)
+ nskb->tstamp_type = SKB_CLOCK_MONOTONIC;
+ if (txhash)
+ skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4);
ip_push_pending_frames(sk, &fl4);
}
out:
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 26c36cccabdc..6d9c5c20b1c4 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -47,8 +47,6 @@
#include <linux/errqueue.h>
#include <linux/uaccess.h>
-#include <linux/bpfilter.h>
-
/*
* SOL_IP control messages.
*/
@@ -130,37 +128,35 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
{
- char *secdata;
- u32 seclen, secid;
+ struct lsm_context ctx;
+ u32 secid;
int err;
err = security_socket_getpeersec_dgram(NULL, skb, &secid);
if (err)
return;
- err = security_secid_to_secctx(secid, &secdata, &seclen);
- if (err)
+ err = security_secid_to_secctx(secid, &ctx);
+ if (err < 0)
return;
- put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata);
- security_release_secctx(secdata, seclen);
+ put_cmsg(msg, SOL_IP, SCM_SECURITY, ctx.len, ctx.context);
+ security_release_secctx(&ctx);
}
static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
{
+ __be16 _ports[2], *ports;
struct sockaddr_in sin;
- __be16 *ports;
- int end;
-
- end = skb_transport_offset(skb) + 4;
- if (end > 0 && !pskb_may_pull(skb, end))
- return;
/* All current transport protocols have the port numbers in the
* first four bytes of the transport header and this function is
* written with this assumption in mind.
*/
- ports = (__be16 *)skb_transport_header(skb);
+ ports = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_ports), &_ports);
+ if (!ports)
+ return;
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = ip_hdr(skb)->daddr;
@@ -173,8 +169,10 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
void ip_cmsg_recv_offset(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb, int tlen, int offset)
{
- struct inet_sock *inet = inet_sk(sk);
- unsigned int flags = inet->cmsg_flags;
+ unsigned long flags = inet_cmsg_flags(inet_sk(sk));
+
+ if (!flags)
+ return;
/* Ordered by supposed usage frequency */
if (flags & IP_CMSG_PKTINFO) {
@@ -269,7 +267,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
}
#endif
if (cmsg->cmsg_level == SOL_SOCKET) {
- err = __sock_cmsg_send(sk, msg, cmsg, &ipc->sockc);
+ err = __sock_cmsg_send(sk, cmsg, &ipc->sockc);
if (err)
return err;
continue;
@@ -282,7 +280,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
err = cmsg->cmsg_len - sizeof(struct cmsghdr);
/* Our caller is responsible for freeing ipc->opt */
- err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
+ err = ip_options_get(net, &ipc->opt,
+ KERNEL_SOCKPTR(CMSG_DATA(cmsg)),
err < 40 ? err : 40);
if (err)
return err;
@@ -316,9 +315,16 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
if (val < 0 || val > 255)
return -EINVAL;
ipc->tos = val;
- ipc->priority = rt_tos2priority(ipc->tos);
+ ipc->sockc.priority = rt_tos2priority(ipc->tos);
+ break;
+ case IP_PROTOCOL:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
+ return -EINVAL;
+ val = *(int *)CMSG_DATA(cmsg);
+ if (val < 1 || val > 255)
+ return -EINVAL;
+ ipc->protocol = val;
break;
-
default:
return -EINVAL;
}
@@ -345,6 +351,8 @@ int ip_ra_control(struct sock *sk, unsigned char on,
return -EINVAL;
new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
+ if (on && !new_ra)
+ return -ENOMEM;
mutex_lock(&net->ipv4.ra_mutex);
for (rap = &net->ipv4.ra_chain;
@@ -389,6 +397,18 @@ int ip_ra_control(struct sock *sk, unsigned char on,
return 0;
}
+static void ipv4_icmp_error_rfc4884(const struct sk_buff *skb,
+ struct sock_ee_data_rfc4884 *out)
+{
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ case ICMP_TIME_EXCEEDED:
+ case ICMP_PARAMETERPROB:
+ ip_icmp_error_rfc4884(skb, out, sizeof(struct icmphdr),
+ icmp_hdr(skb)->un.reserved[1] * 4);
+ }
+}
+
void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
__be16 port, u32 info, u8 *payload)
{
@@ -411,21 +431,24 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
serr->port = port;
if (skb_pull(skb, payload - skb->data)) {
+ if (inet_test_bit(RECVERR_RFC4884, sk))
+ ipv4_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884);
+
skb_reset_transport_header(skb);
if (sock_queue_err_skb(sk, skb) == 0)
return;
}
kfree_skb(skb);
}
+EXPORT_SYMBOL_GPL(ip_icmp_error);
void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info)
{
- struct inet_sock *inet = inet_sk(sk);
struct sock_exterr_skb *serr;
struct iphdr *iph;
struct sk_buff *skb;
- if (!inet->recverr)
+ if (!inet_test_bit(RECVERR, sk))
return;
skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC);
@@ -486,7 +509,7 @@ static bool ipv4_datagram_support_cmsg(const struct sock *sk,
* or without payload (SOF_TIMESTAMPING_OPT_TSONLY).
*/
info = PKTINFO_SKB_CB(skb);
- if (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG) ||
+ if (!(READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_CMSG) ||
!info->ipi_ifindex)
return false;
@@ -544,7 +567,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) {
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
- if (inet_sk(sk)->cmsg_flags)
+ if (inet_cmsg_flags(inet_sk(sk)))
ip_cmsg_recv(msg, skb);
}
@@ -560,6 +583,55 @@ out:
return err;
}
+void __ip_sock_set_tos(struct sock *sk, int val)
+{
+ u8 old_tos = inet_sk(sk)->tos;
+
+ if (sk->sk_type == SOCK_STREAM) {
+ val &= ~INET_ECN_MASK;
+ val |= old_tos & INET_ECN_MASK;
+ }
+ if (old_tos != val) {
+ WRITE_ONCE(inet_sk(sk)->tos, val);
+ WRITE_ONCE(sk->sk_priority, rt_tos2priority(val));
+ sk_dst_reset(sk);
+ }
+}
+
+void ip_sock_set_tos(struct sock *sk, int val)
+{
+ sockopt_lock_sock(sk);
+ __ip_sock_set_tos(sk, val);
+ sockopt_release_sock(sk);
+}
+EXPORT_SYMBOL(ip_sock_set_tos);
+
+void ip_sock_set_freebind(struct sock *sk)
+{
+ inet_set_bit(FREEBIND, sk);
+}
+EXPORT_SYMBOL(ip_sock_set_freebind);
+
+void ip_sock_set_recverr(struct sock *sk)
+{
+ inet_set_bit(RECVERR, sk);
+}
+EXPORT_SYMBOL(ip_sock_set_recverr);
+
+int ip_sock_set_mtu_discover(struct sock *sk, int val)
+{
+ if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
+ return -EINVAL;
+ WRITE_ONCE(inet_sk(sk)->pmtudisc, val);
+ return 0;
+}
+EXPORT_SYMBOL(ip_sock_set_mtu_discover);
+
+void ip_sock_set_pktinfo(struct sock *sk)
+{
+ inet_set_bit(PKTINFO, sk);
+}
+EXPORT_SYMBOL(ip_sock_set_pktinfo);
/*
* Socket option code for IP. This is the end of the line after any
@@ -587,12 +659,242 @@ static bool setsockopt_needs_rtnl(int optname)
return false;
}
-static int do_ip_setsockopt(struct sock *sk, int level,
- int optname, char __user *optval, unsigned int optlen)
+static int set_mcast_msfilter(struct sock *sk, int ifindex,
+ int numsrc, int fmode,
+ struct sockaddr_storage *group,
+ struct sockaddr_storage *list)
+{
+ struct ip_msfilter *msf;
+ struct sockaddr_in *psin;
+ int err, i;
+
+ msf = kmalloc(IP_MSFILTER_SIZE(numsrc), GFP_KERNEL);
+ if (!msf)
+ return -ENOBUFS;
+
+ psin = (struct sockaddr_in *)group;
+ if (psin->sin_family != AF_INET)
+ goto Eaddrnotavail;
+ msf->imsf_multiaddr = psin->sin_addr.s_addr;
+ msf->imsf_interface = 0;
+ msf->imsf_fmode = fmode;
+ msf->imsf_numsrc = numsrc;
+ for (i = 0; i < numsrc; ++i) {
+ psin = (struct sockaddr_in *)&list[i];
+
+ if (psin->sin_family != AF_INET)
+ goto Eaddrnotavail;
+ msf->imsf_slist_flex[i] = psin->sin_addr.s_addr;
+ }
+ err = ip_mc_msfilter(sk, msf, ifindex);
+ kfree(msf);
+ return err;
+
+Eaddrnotavail:
+ kfree(msf);
+ return -EADDRNOTAVAIL;
+}
+
+static int copy_group_source_from_sockptr(struct group_source_req *greqs,
+ sockptr_t optval, int optlen)
+{
+ if (in_compat_syscall()) {
+ struct compat_group_source_req gr32;
+
+ if (optlen != sizeof(gr32))
+ return -EINVAL;
+ if (copy_from_sockptr(&gr32, optval, sizeof(gr32)))
+ return -EFAULT;
+ greqs->gsr_interface = gr32.gsr_interface;
+ greqs->gsr_group = gr32.gsr_group;
+ greqs->gsr_source = gr32.gsr_source;
+ } else {
+ if (optlen != sizeof(*greqs))
+ return -EINVAL;
+ if (copy_from_sockptr(greqs, optval, sizeof(*greqs)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int do_mcast_group_source(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
+{
+ struct group_source_req greqs;
+ struct ip_mreq_source mreqs;
+ struct sockaddr_in *psin;
+ int omode, add, err;
+
+ err = copy_group_source_from_sockptr(&greqs, optval, optlen);
+ if (err)
+ return err;
+
+ if (greqs.gsr_group.ss_family != AF_INET ||
+ greqs.gsr_source.ss_family != AF_INET)
+ return -EADDRNOTAVAIL;
+
+ psin = (struct sockaddr_in *)&greqs.gsr_group;
+ mreqs.imr_multiaddr = psin->sin_addr.s_addr;
+ psin = (struct sockaddr_in *)&greqs.gsr_source;
+ mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
+ mreqs.imr_interface = 0; /* use index for mc_source */
+
+ if (optname == MCAST_BLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 1;
+ } else if (optname == MCAST_UNBLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 0;
+ } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
+ struct ip_mreqn mreq;
+
+ psin = (struct sockaddr_in *)&greqs.gsr_group;
+ mreq.imr_multiaddr = psin->sin_addr;
+ mreq.imr_address.s_addr = 0;
+ mreq.imr_ifindex = greqs.gsr_interface;
+ err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
+ if (err && err != -EADDRINUSE)
+ return err;
+ greqs.gsr_interface = mreq.imr_ifindex;
+ omode = MCAST_INCLUDE;
+ add = 1;
+ } else /* MCAST_LEAVE_SOURCE_GROUP */ {
+ omode = MCAST_INCLUDE;
+ add = 0;
+ }
+ return ip_mc_source(add, omode, sk, &mreqs, greqs.gsr_interface);
+}
+
+static int ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval, int optlen)
+{
+ struct group_filter *gsf = NULL;
+ int err;
+
+ if (optlen < GROUP_FILTER_SIZE(0))
+ return -EINVAL;
+ if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
+ return -ENOBUFS;
+
+ gsf = memdup_sockptr(optval, optlen);
+ if (IS_ERR(gsf))
+ return PTR_ERR(gsf);
+
+ /* numsrc >= (4G-140)/128 overflow in 32 bits */
+ err = -ENOBUFS;
+ if (gsf->gf_numsrc >= 0x1ffffff ||
+ gsf->gf_numsrc > READ_ONCE(sock_net(sk)->ipv4.sysctl_igmp_max_msf))
+ goto out_free_gsf;
+
+ err = -EINVAL;
+ if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen)
+ goto out_free_gsf;
+
+ err = set_mcast_msfilter(sk, gsf->gf_interface, gsf->gf_numsrc,
+ gsf->gf_fmode, &gsf->gf_group,
+ gsf->gf_slist_flex);
+out_free_gsf:
+ kfree(gsf);
+ return err;
+}
+
+static int compat_ip_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
+ int optlen)
+{
+ const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
+ struct compat_group_filter *gf32;
+ unsigned int n;
+ void *p;
+ int err;
+
+ if (optlen < size0)
+ return -EINVAL;
+ if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4)
+ return -ENOBUFS;
+
+ p = kmalloc(optlen + 4, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */
+
+ err = -EFAULT;
+ if (copy_from_sockptr(gf32, optval, optlen))
+ goto out_free_gsf;
+
+ /* numsrc >= (4G-140)/128 overflow in 32 bits */
+ n = gf32->gf_numsrc;
+ err = -ENOBUFS;
+ if (n >= 0x1ffffff)
+ goto out_free_gsf;
+
+ err = -EINVAL;
+ if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen)
+ goto out_free_gsf;
+
+ /* numsrc must not exceed the igmp_max_msf sysctl limit */
+ err = -ENOBUFS;
+ if (n > READ_ONCE(sock_net(sk)->ipv4.sysctl_igmp_max_msf))
+ goto out_free_gsf;
+ err = set_mcast_msfilter(sk, gf32->gf_interface, n, gf32->gf_fmode,
+ &gf32->gf_group, gf32->gf_slist_flex);
+out_free_gsf:
+ kfree(p);
+ return err;
+}
+
+static int ip_mcast_join_leave(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
+{
+ struct ip_mreqn mreq = { };
+ struct sockaddr_in *psin;
+ struct group_req greq;
+
+ if (optlen < sizeof(struct group_req))
+ return -EINVAL;
+ if (copy_from_sockptr(&greq, optval, sizeof(greq)))
+ return -EFAULT;
+
+ psin = (struct sockaddr_in *)&greq.gr_group;
+ if (psin->sin_family != AF_INET)
+ return -EINVAL;
+ mreq.imr_multiaddr = psin->sin_addr;
+ mreq.imr_ifindex = greq.gr_interface;
+ if (optname == MCAST_JOIN_GROUP)
+ return ip_mc_join_group(sk, &mreq);
+ return ip_mc_leave_group(sk, &mreq);
+}
+
+static int compat_ip_mcast_join_leave(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
+{
+ struct compat_group_req greq;
+ struct ip_mreqn mreq = { };
+ struct sockaddr_in *psin;
+
+ if (optlen < sizeof(struct compat_group_req))
+ return -EINVAL;
+ if (copy_from_sockptr(&greq, optval, sizeof(greq)))
+ return -EFAULT;
+
+ psin = (struct sockaddr_in *)&greq.gr_group;
+ if (psin->sin_family != AF_INET)
+ return -EINVAL;
+ mreq.imr_multiaddr = psin->sin_addr;
+ mreq.imr_ifindex = greq.gr_interface;
+
+ if (optname == MCAST_JOIN_GROUP)
+ return ip_mc_join_group(sk, &mreq);
+ return ip_mc_leave_group(sk, &mreq);
+}
+
+DEFINE_STATIC_KEY_FALSE(ip4_min_ttl);
+
+int do_ip_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
- int val = 0, err;
+ int val = 0, err, retv;
bool needs_rtnl = setsockopt_needs_rtnl(optname);
switch (optname) {
@@ -620,13 +922,15 @@ static int do_ip_setsockopt(struct sock *sk, int level,
case IP_RECVORIGDSTADDR:
case IP_CHECKSUM:
case IP_RECVFRAGSIZE:
+ case IP_RECVERR_RFC4884:
+ case IP_LOCAL_PORT_RANGE:
if (optlen >= sizeof(int)) {
- if (get_user(val, (int __user *) optval))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
} else if (optlen >= sizeof(char)) {
unsigned char ucval;
- if (get_user(ucval, (unsigned char __user *) optval))
+ if (copy_from_sockptr(&ucval, optval, sizeof(ucval)))
return -EFAULT;
val = (int) ucval;
}
@@ -634,15 +938,144 @@ static int do_ip_setsockopt(struct sock *sk, int level,
/* If optlen==0, it is equivalent to val == 0 */
- if (optname == IP_ROUTER_ALERT)
- return ip_ra_control(sk, val ? 1 : 0, NULL);
+ if (optname == IP_ROUTER_ALERT) {
+ retv = ip_ra_control(sk, val ? 1 : 0, NULL);
+ if (retv == 0)
+ inet_assign_bit(RTALERT, sk, val);
+ return retv;
+ }
if (ip_mroute_opt(optname))
return ip_mroute_setsockopt(sk, optname, optval, optlen);
+ /* Handle options that can be set without locking the socket. */
+ switch (optname) {
+ case IP_PKTINFO:
+ inet_assign_bit(PKTINFO, sk, val);
+ return 0;
+ case IP_RECVTTL:
+ inet_assign_bit(TTL, sk, val);
+ return 0;
+ case IP_RECVTOS:
+ inet_assign_bit(TOS, sk, val);
+ return 0;
+ case IP_RECVOPTS:
+ inet_assign_bit(RECVOPTS, sk, val);
+ return 0;
+ case IP_RETOPTS:
+ inet_assign_bit(RETOPTS, sk, val);
+ return 0;
+ case IP_PASSSEC:
+ inet_assign_bit(PASSSEC, sk, val);
+ return 0;
+ case IP_RECVORIGDSTADDR:
+ inet_assign_bit(ORIGDSTADDR, sk, val);
+ return 0;
+ case IP_RECVFRAGSIZE:
+ if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM)
+ return -EINVAL;
+ inet_assign_bit(RECVFRAGSIZE, sk, val);
+ return 0;
+ case IP_RECVERR:
+ inet_assign_bit(RECVERR, sk, val);
+ if (!val)
+ skb_errqueue_purge(&sk->sk_error_queue);
+ return 0;
+ case IP_RECVERR_RFC4884:
+ if (val < 0 || val > 1)
+ return -EINVAL;
+ inet_assign_bit(RECVERR_RFC4884, sk, val);
+ return 0;
+ case IP_FREEBIND:
+ if (optlen < 1)
+ return -EINVAL;
+ inet_assign_bit(FREEBIND, sk, val);
+ return 0;
+ case IP_HDRINCL:
+ if (sk->sk_type != SOCK_RAW)
+ return -ENOPROTOOPT;
+ inet_assign_bit(HDRINCL, sk, val);
+ return 0;
+ case IP_MULTICAST_LOOP:
+ if (optlen < 1)
+ return -EINVAL;
+ inet_assign_bit(MC_LOOP, sk, val);
+ return 0;
+ case IP_MULTICAST_ALL:
+ if (optlen < 1)
+ return -EINVAL;
+ if (val != 0 && val != 1)
+ return -EINVAL;
+ inet_assign_bit(MC_ALL, sk, val);
+ return 0;
+ case IP_TRANSPARENT:
+ if (!!val && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (optlen < 1)
+ return -EINVAL;
+ inet_assign_bit(TRANSPARENT, sk, val);
+ return 0;
+ case IP_NODEFRAG:
+ if (sk->sk_type != SOCK_RAW)
+ return -ENOPROTOOPT;
+ inet_assign_bit(NODEFRAG, sk, val);
+ return 0;
+ case IP_BIND_ADDRESS_NO_PORT:
+ inet_assign_bit(BIND_ADDRESS_NO_PORT, sk, val);
+ return 0;
+ case IP_TTL:
+ if (optlen < 1)
+ return -EINVAL;
+ if (val != -1 && (val < 1 || val > 255))
+ return -EINVAL;
+ WRITE_ONCE(inet->uc_ttl, val);
+ return 0;
+ case IP_MINTTL:
+ if (optlen < 1)
+ return -EINVAL;
+ if (val < 0 || val > 255)
+ return -EINVAL;
+
+ if (val)
+ static_branch_enable(&ip4_min_ttl);
+
+ WRITE_ONCE(inet->min_ttl, val);
+ return 0;
+ case IP_MULTICAST_TTL:
+ if (sk->sk_type == SOCK_STREAM)
+ return -EINVAL;
+ if (optlen < 1)
+ return -EINVAL;
+ if (val == -1)
+ val = 1;
+ if (val < 0 || val > 255)
+ return -EINVAL;
+ WRITE_ONCE(inet->mc_ttl, val);
+ return 0;
+ case IP_MTU_DISCOVER:
+ return ip_sock_set_mtu_discover(sk, val);
+ case IP_TOS: /* This sets both TOS and Precedence */
+ ip_sock_set_tos(sk, val);
+ return 0;
+ case IP_LOCAL_PORT_RANGE:
+ {
+ u16 lo = val;
+ u16 hi = val >> 16;
+
+ if (optlen != sizeof(u32))
+ return -EINVAL;
+ if (lo != 0 && hi != 0 && lo > hi)
+ return -EINVAL;
+
+ WRITE_ONCE(inet->local_port_range, val);
+ return 0;
+ }
+ }
+
err = 0;
if (needs_rtnl)
rtnl_lock();
- lock_sock(sk);
+ sockopt_lock_sock(sk);
switch (optname) {
case IP_OPTIONS:
@@ -651,13 +1084,12 @@ static int do_ip_setsockopt(struct sock *sk, int level,
if (optlen > 40)
goto e_inval;
- err = ip_options_get_from_user(sock_net(sk), &opt,
- optval, optlen);
+ err = ip_options_get(sock_net(sk), &opt, optval, optlen);
if (err)
break;
old = rcu_dereference_protected(inet->inet_opt,
lockdep_sock_is_held(sk));
- if (inet->is_icsk) {
+ if (inet_test_bit(IS_ICSK, sk)) {
struct inet_connection_sock *icsk = inet_csk(sk);
#if IS_ENABLED(CONFIG_IPV6)
if (sk->sk_family == PF_INET ||
@@ -679,130 +1111,19 @@ static int do_ip_setsockopt(struct sock *sk, int level,
kfree_rcu(old, rcu);
break;
}
- case IP_PKTINFO:
- if (val)
- inet->cmsg_flags |= IP_CMSG_PKTINFO;
- else
- inet->cmsg_flags &= ~IP_CMSG_PKTINFO;
- break;
- case IP_RECVTTL:
- if (val)
- inet->cmsg_flags |= IP_CMSG_TTL;
- else
- inet->cmsg_flags &= ~IP_CMSG_TTL;
- break;
- case IP_RECVTOS:
- if (val)
- inet->cmsg_flags |= IP_CMSG_TOS;
- else
- inet->cmsg_flags &= ~IP_CMSG_TOS;
- break;
- case IP_RECVOPTS:
- if (val)
- inet->cmsg_flags |= IP_CMSG_RECVOPTS;
- else
- inet->cmsg_flags &= ~IP_CMSG_RECVOPTS;
- break;
- case IP_RETOPTS:
- if (val)
- inet->cmsg_flags |= IP_CMSG_RETOPTS;
- else
- inet->cmsg_flags &= ~IP_CMSG_RETOPTS;
- break;
- case IP_PASSSEC:
- if (val)
- inet->cmsg_flags |= IP_CMSG_PASSSEC;
- else
- inet->cmsg_flags &= ~IP_CMSG_PASSSEC;
- break;
- case IP_RECVORIGDSTADDR:
- if (val)
- inet->cmsg_flags |= IP_CMSG_ORIGDSTADDR;
- else
- inet->cmsg_flags &= ~IP_CMSG_ORIGDSTADDR;
- break;
case IP_CHECKSUM:
if (val) {
- if (!(inet->cmsg_flags & IP_CMSG_CHECKSUM)) {
+ if (!(inet_test_bit(CHECKSUM, sk))) {
inet_inc_convert_csum(sk);
- inet->cmsg_flags |= IP_CMSG_CHECKSUM;
+ inet_set_bit(CHECKSUM, sk);
}
} else {
- if (inet->cmsg_flags & IP_CMSG_CHECKSUM) {
+ if (inet_test_bit(CHECKSUM, sk)) {
inet_dec_convert_csum(sk);
- inet->cmsg_flags &= ~IP_CMSG_CHECKSUM;
+ inet_clear_bit(CHECKSUM, sk);
}
}
break;
- case IP_RECVFRAGSIZE:
- if (sk->sk_type != SOCK_RAW && sk->sk_type != SOCK_DGRAM)
- goto e_inval;
- if (val)
- inet->cmsg_flags |= IP_CMSG_RECVFRAGSIZE;
- else
- inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE;
- break;
- case IP_TOS: /* This sets both TOS and Precedence */
- if (sk->sk_type == SOCK_STREAM) {
- val &= ~INET_ECN_MASK;
- val |= inet->tos & INET_ECN_MASK;
- }
- if (inet->tos != val) {
- inet->tos = val;
- sk->sk_priority = rt_tos2priority(val);
- sk_dst_reset(sk);
- }
- break;
- case IP_TTL:
- if (optlen < 1)
- goto e_inval;
- if (val != -1 && (val < 1 || val > 255))
- goto e_inval;
- inet->uc_ttl = val;
- break;
- case IP_HDRINCL:
- if (sk->sk_type != SOCK_RAW) {
- err = -ENOPROTOOPT;
- break;
- }
- inet->hdrincl = val ? 1 : 0;
- break;
- case IP_NODEFRAG:
- if (sk->sk_type != SOCK_RAW) {
- err = -ENOPROTOOPT;
- break;
- }
- inet->nodefrag = val ? 1 : 0;
- break;
- case IP_BIND_ADDRESS_NO_PORT:
- inet->bind_address_no_port = val ? 1 : 0;
- break;
- case IP_MTU_DISCOVER:
- if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT)
- goto e_inval;
- inet->pmtudisc = val;
- break;
- case IP_RECVERR:
- inet->recverr = !!val;
- if (!val)
- skb_queue_purge(&sk->sk_error_queue);
- break;
- case IP_MULTICAST_TTL:
- if (sk->sk_type == SOCK_STREAM)
- goto e_inval;
- if (optlen < 1)
- goto e_inval;
- if (val == -1)
- val = 1;
- if (val < 0 || val > 255)
- goto e_inval;
- inet->mc_ttl = val;
- break;
- case IP_MULTICAST_LOOP:
- if (optlen < 1)
- goto e_inval;
- inet->mc_loop = !!val;
- break;
case IP_UNICAST_IF:
{
struct net_device *dev = NULL;
@@ -814,7 +1135,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
ifindex = (__force int)ntohl((__force __be32)val);
if (ifindex == 0) {
- inet->uc_index = 0;
+ WRITE_ONCE(inet->uc_index, 0);
err = 0;
break;
}
@@ -828,11 +1149,10 @@ static int do_ip_setsockopt(struct sock *sk, int level,
dev_put(dev);
err = -EINVAL;
- if (sk->sk_bound_dev_if &&
- (!midx || midx != sk->sk_bound_dev_if))
+ if (sk->sk_bound_dev_if && midx != sk->sk_bound_dev_if)
break;
- inet->uc_index = ifindex;
+ WRITE_ONCE(inet->uc_index, ifindex);
err = 0;
break;
}
@@ -853,25 +1173,25 @@ static int do_ip_setsockopt(struct sock *sk, int level,
err = -EFAULT;
if (optlen >= sizeof(struct ip_mreqn)) {
- if (copy_from_user(&mreq, optval, sizeof(mreq)))
+ if (copy_from_sockptr(&mreq, optval, sizeof(mreq)))
break;
} else {
memset(&mreq, 0, sizeof(mreq));
if (optlen >= sizeof(struct ip_mreq)) {
- if (copy_from_user(&mreq, optval,
- sizeof(struct ip_mreq)))
+ if (copy_from_sockptr(&mreq, optval,
+ sizeof(struct ip_mreq)))
break;
} else if (optlen >= sizeof(struct in_addr)) {
- if (copy_from_user(&mreq.imr_address, optval,
- sizeof(struct in_addr)))
+ if (copy_from_sockptr(&mreq.imr_address, optval,
+ sizeof(struct in_addr)))
break;
}
}
if (!mreq.imr_ifindex) {
if (mreq.imr_address.s_addr == htonl(INADDR_ANY)) {
- inet->mc_index = 0;
- inet->mc_addr = 0;
+ WRITE_ONCE(inet->mc_index, 0);
+ WRITE_ONCE(inet->mc_addr, 0);
err = 0;
break;
}
@@ -893,11 +1213,11 @@ static int do_ip_setsockopt(struct sock *sk, int level,
err = -EINVAL;
if (sk->sk_bound_dev_if &&
mreq.imr_ifindex != sk->sk_bound_dev_if &&
- (!midx || midx != sk->sk_bound_dev_if))
+ midx != sk->sk_bound_dev_if)
break;
- inet->mc_index = mreq.imr_ifindex;
- inet->mc_addr = mreq.imr_address.s_addr;
+ WRITE_ONCE(inet->mc_index, mreq.imr_ifindex);
+ WRITE_ONCE(inet->mc_addr, mreq.imr_address.s_addr);
err = 0;
break;
}
@@ -908,18 +1228,19 @@ static int do_ip_setsockopt(struct sock *sk, int level,
struct ip_mreqn mreq;
err = -EPROTO;
- if (inet_sk(sk)->is_icsk)
+ if (inet_test_bit(IS_ICSK, sk))
break;
if (optlen < sizeof(struct ip_mreq))
goto e_inval;
err = -EFAULT;
if (optlen >= sizeof(struct ip_mreqn)) {
- if (copy_from_user(&mreq, optval, sizeof(mreq)))
+ if (copy_from_sockptr(&mreq, optval, sizeof(mreq)))
break;
} else {
memset(&mreq, 0, sizeof(mreq));
- if (copy_from_user(&mreq, optval, sizeof(struct ip_mreq)))
+ if (copy_from_sockptr(&mreq, optval,
+ sizeof(struct ip_mreq)))
break;
}
@@ -935,18 +1256,18 @@ static int do_ip_setsockopt(struct sock *sk, int level,
if (optlen < IP_MSFILTER_SIZE(0))
goto e_inval;
- if (optlen > sysctl_optmem_max) {
+ if (optlen > READ_ONCE(net->core.sysctl_optmem_max)) {
err = -ENOBUFS;
break;
}
- msf = memdup_user(optval, optlen);
+ msf = memdup_sockptr(optval, optlen);
if (IS_ERR(msf)) {
err = PTR_ERR(msf);
break;
}
/* numsrc >= (1G-4) overflow in 32 bits */
if (msf->imsf_numsrc >= 0x3ffffffcU ||
- msf->imsf_numsrc > net->ipv4.sysctl_igmp_max_msf) {
+ msf->imsf_numsrc > READ_ONCE(net->ipv4.sysctl_igmp_max_msf)) {
kfree(msf);
err = -ENOBUFS;
break;
@@ -970,7 +1291,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
if (optlen != sizeof(struct ip_mreq_source))
goto e_inval;
- if (copy_from_user(&mreqs, optval, sizeof(mreqs))) {
+ if (copy_from_sockptr(&mreqs, optval, sizeof(mreqs))) {
err = -EFAULT;
break;
}
@@ -1000,197 +1321,43 @@ static int do_ip_setsockopt(struct sock *sk, int level,
}
case MCAST_JOIN_GROUP:
case MCAST_LEAVE_GROUP:
- {
- struct group_req greq;
- struct sockaddr_in *psin;
- struct ip_mreqn mreq;
-
- if (optlen < sizeof(struct group_req))
- goto e_inval;
- err = -EFAULT;
- if (copy_from_user(&greq, optval, sizeof(greq)))
- break;
- psin = (struct sockaddr_in *)&greq.gr_group;
- if (psin->sin_family != AF_INET)
- goto e_inval;
- memset(&mreq, 0, sizeof(mreq));
- mreq.imr_multiaddr = psin->sin_addr;
- mreq.imr_ifindex = greq.gr_interface;
-
- if (optname == MCAST_JOIN_GROUP)
- err = ip_mc_join_group(sk, &mreq);
+ if (in_compat_syscall())
+ err = compat_ip_mcast_join_leave(sk, optname, optval,
+ optlen);
else
- err = ip_mc_leave_group(sk, &mreq);
+ err = ip_mcast_join_leave(sk, optname, optval, optlen);
break;
- }
case MCAST_JOIN_SOURCE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
case MCAST_BLOCK_SOURCE:
case MCAST_UNBLOCK_SOURCE:
- {
- struct group_source_req greqs;
- struct ip_mreq_source mreqs;
- struct sockaddr_in *psin;
- int omode, add;
-
- if (optlen != sizeof(struct group_source_req))
- goto e_inval;
- if (copy_from_user(&greqs, optval, sizeof(greqs))) {
- err = -EFAULT;
- break;
- }
- if (greqs.gsr_group.ss_family != AF_INET ||
- greqs.gsr_source.ss_family != AF_INET) {
- err = -EADDRNOTAVAIL;
- break;
- }
- psin = (struct sockaddr_in *)&greqs.gsr_group;
- mreqs.imr_multiaddr = psin->sin_addr.s_addr;
- psin = (struct sockaddr_in *)&greqs.gsr_source;
- mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
- mreqs.imr_interface = 0; /* use index for mc_source */
-
- if (optname == MCAST_BLOCK_SOURCE) {
- omode = MCAST_EXCLUDE;
- add = 1;
- } else if (optname == MCAST_UNBLOCK_SOURCE) {
- omode = MCAST_EXCLUDE;
- add = 0;
- } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
- struct ip_mreqn mreq;
-
- psin = (struct sockaddr_in *)&greqs.gsr_group;
- mreq.imr_multiaddr = psin->sin_addr;
- mreq.imr_address.s_addr = 0;
- mreq.imr_ifindex = greqs.gsr_interface;
- err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
- if (err && err != -EADDRINUSE)
- break;
- greqs.gsr_interface = mreq.imr_ifindex;
- omode = MCAST_INCLUDE;
- add = 1;
- } else /* MCAST_LEAVE_SOURCE_GROUP */ {
- omode = MCAST_INCLUDE;
- add = 0;
- }
- err = ip_mc_source(add, omode, sk, &mreqs,
- greqs.gsr_interface);
+ err = do_mcast_group_source(sk, optname, optval, optlen);
break;
- }
case MCAST_MSFILTER:
- {
- struct sockaddr_in *psin;
- struct ip_msfilter *msf = NULL;
- struct group_filter *gsf = NULL;
- int msize, i, ifindex;
-
- if (optlen < GROUP_FILTER_SIZE(0))
- goto e_inval;
- if (optlen > sysctl_optmem_max) {
- err = -ENOBUFS;
- break;
- }
- gsf = memdup_user(optval, optlen);
- if (IS_ERR(gsf)) {
- err = PTR_ERR(gsf);
- break;
- }
-
- /* numsrc >= (4G-140)/128 overflow in 32 bits */
- if (gsf->gf_numsrc >= 0x1ffffff ||
- gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) {
- err = -ENOBUFS;
- goto mc_msf_out;
- }
- if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
- err = -EINVAL;
- goto mc_msf_out;
- }
- msize = IP_MSFILTER_SIZE(gsf->gf_numsrc);
- msf = kmalloc(msize, GFP_KERNEL);
- if (!msf) {
- err = -ENOBUFS;
- goto mc_msf_out;
- }
- ifindex = gsf->gf_interface;
- psin = (struct sockaddr_in *)&gsf->gf_group;
- if (psin->sin_family != AF_INET) {
- err = -EADDRNOTAVAIL;
- goto mc_msf_out;
- }
- msf->imsf_multiaddr = psin->sin_addr.s_addr;
- msf->imsf_interface = 0;
- msf->imsf_fmode = gsf->gf_fmode;
- msf->imsf_numsrc = gsf->gf_numsrc;
- err = -EADDRNOTAVAIL;
- for (i = 0; i < gsf->gf_numsrc; ++i) {
- psin = (struct sockaddr_in *)&gsf->gf_slist[i];
-
- if (psin->sin_family != AF_INET)
- goto mc_msf_out;
- msf->imsf_slist[i] = psin->sin_addr.s_addr;
- }
- kfree(gsf);
- gsf = NULL;
-
- err = ip_mc_msfilter(sk, msf, ifindex);
-mc_msf_out:
- kfree(msf);
- kfree(gsf);
- break;
- }
- case IP_MULTICAST_ALL:
- if (optlen < 1)
- goto e_inval;
- if (val != 0 && val != 1)
- goto e_inval;
- inet->mc_all = val;
- break;
-
- case IP_FREEBIND:
- if (optlen < 1)
- goto e_inval;
- inet->freebind = !!val;
+ if (in_compat_syscall())
+ err = compat_ip_set_mcast_msfilter(sk, optval, optlen);
+ else
+ err = ip_set_mcast_msfilter(sk, optval, optlen);
break;
-
case IP_IPSEC_POLICY:
case IP_XFRM_POLICY:
err = -EPERM;
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
break;
err = xfrm_user_policy(sk, optname, optval, optlen);
break;
- case IP_TRANSPARENT:
- if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
- !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
- err = -EPERM;
- break;
- }
- if (optlen < 1)
- goto e_inval;
- inet->transparent = !!val;
- break;
-
- case IP_MINTTL:
- if (optlen < 1)
- goto e_inval;
- if (val < 0 || val > 255)
- goto e_inval;
- inet->min_ttl = val;
- break;
-
default:
err = -ENOPROTOOPT;
break;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
if (needs_rtnl)
rtnl_unlock();
return err;
e_inval:
- release_sock(sk);
+ sockopt_release_sock(sk);
if (needs_rtnl)
rtnl_unlock();
return -EINVAL;
@@ -1200,15 +1367,16 @@ e_inval:
* ipv4_pktinfo_prepare - transfer some info from rtable to skb
* @sk: socket
* @skb: buffer
+ * @drop_dst: if true, drops skb dst
*
* To support IP_CMSG_PKTINFO option, we store rt_iif and specific
* destination in skb->cb[] before dst drop.
* This way, receiver doesn't make cache line misses to read rtable.
*/
-void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
+void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb, bool drop_dst)
{
struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb);
- bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) ||
+ bool prepare = inet_test_bit(PKTINFO, sk) ||
ipv6_sk_rxinfo(sk);
if (prepare && skb_rtable(skb)) {
@@ -1234,11 +1402,12 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
pktinfo->ipi_ifindex = 0;
pktinfo->ipi_spec_dst.s_addr = 0;
}
- skb_dst_drop(skb);
+ if (drop_dst)
+ skb_dst_drop(skb);
}
-int ip_setsockopt(struct sock *sk, int level,
- int optname, char __user *optval, unsigned int optlen)
+int ip_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen)
{
int err;
@@ -1246,11 +1415,6 @@ int ip_setsockopt(struct sock *sk, int level,
return -ENOPROTOOPT;
err = do_ip_setsockopt(sk, level, optname, optval, optlen);
-#ifdef CONFIG_BPFILTER
- if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
- optname < BPFILTER_IPT_SET_MAX)
- err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
-#endif
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
@@ -1263,34 +1427,6 @@ int ip_setsockopt(struct sock *sk, int level,
}
EXPORT_SYMBOL(ip_setsockopt);
-#ifdef CONFIG_COMPAT
-int compat_ip_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- int err;
-
- if (level != SOL_IP)
- return -ENOPROTOOPT;
-
- if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER)
- return compat_mc_setsockopt(sk, level, optname, optval, optlen,
- ip_setsockopt);
-
- err = do_ip_setsockopt(sk, level, optname, optval, optlen);
-#ifdef CONFIG_NETFILTER
- /* we need to exclude all possible ENOPROTOOPTs except default case */
- if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
- optname != IP_IPSEC_POLICY &&
- optname != IP_XFRM_POLICY &&
- !ip_mroute_opt(optname))
- err = compat_nf_setsockopt(sk, PF_INET, optname, optval,
- optlen);
-#endif
- return err;
-}
-EXPORT_SYMBOL(compat_ip_setsockopt);
-#endif
-
/*
* Get the options. Note for future reference. The GET of IP options gets
* the _received_ ones. The set sets the _sent_ ones.
@@ -1306,8 +1442,70 @@ static bool getsockopt_needs_rtnl(int optname)
return false;
}
-static int do_ip_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen, unsigned int flags)
+static int ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
+{
+ const int size0 = offsetof(struct group_filter, gf_slist_flex);
+ struct group_filter gsf;
+ int num, gsf_size;
+ int err;
+
+ if (len < size0)
+ return -EINVAL;
+ if (copy_from_sockptr(&gsf, optval, size0))
+ return -EFAULT;
+
+ num = gsf.gf_numsrc;
+ err = ip_mc_gsfget(sk, &gsf, optval,
+ offsetof(struct group_filter, gf_slist_flex));
+ if (err)
+ return err;
+ if (gsf.gf_numsrc < num)
+ num = gsf.gf_numsrc;
+ gsf_size = GROUP_FILTER_SIZE(num);
+ if (copy_to_sockptr(optlen, &gsf_size, sizeof(int)) ||
+ copy_to_sockptr(optval, &gsf, size0))
+ return -EFAULT;
+ return 0;
+}
+
+static int compat_ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
+{
+ const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
+ struct compat_group_filter gf32;
+ struct group_filter gf;
+ int num;
+ int err;
+
+ if (len < size0)
+ return -EINVAL;
+ if (copy_from_sockptr(&gf32, optval, size0))
+ return -EFAULT;
+
+ gf.gf_interface = gf32.gf_interface;
+ gf.gf_fmode = gf32.gf_fmode;
+ num = gf.gf_numsrc = gf32.gf_numsrc;
+ gf.gf_group = gf32.gf_group;
+
+ err = ip_mc_gsfget(sk, &gf, optval,
+ offsetof(struct compat_group_filter, gf_slist_flex));
+ if (err)
+ return err;
+ if (gf.gf_numsrc < num)
+ num = gf.gf_numsrc;
+ len = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32));
+ if (copy_to_sockptr(optlen, &len, sizeof(int)) ||
+ copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode),
+ &gf.gf_fmode, sizeof(gf.gf_fmode)) ||
+ copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc),
+ &gf.gf_numsrc, sizeof(gf.gf_numsrc)))
+ return -EFAULT;
+ return 0;
+}
+
+int do_ip_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen)
{
struct inet_sock *inet = inet_sk(sk);
bool needs_rtnl = getsockopt_needs_rtnl(optname);
@@ -1320,93 +1518,116 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
if (ip_mroute_opt(optname))
return ip_mroute_getsockopt(sk, optname, optval, optlen);
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
if (len < 0)
return -EINVAL;
- if (needs_rtnl)
- rtnl_lock();
- lock_sock(sk);
-
+ /* Handle options that can be read without locking the socket. */
switch (optname) {
+ case IP_PKTINFO:
+ val = inet_test_bit(PKTINFO, sk);
+ goto copyval;
+ case IP_RECVTTL:
+ val = inet_test_bit(TTL, sk);
+ goto copyval;
+ case IP_RECVTOS:
+ val = inet_test_bit(TOS, sk);
+ goto copyval;
+ case IP_RECVOPTS:
+ val = inet_test_bit(RECVOPTS, sk);
+ goto copyval;
+ case IP_RETOPTS:
+ val = inet_test_bit(RETOPTS, sk);
+ goto copyval;
+ case IP_PASSSEC:
+ val = inet_test_bit(PASSSEC, sk);
+ goto copyval;
+ case IP_RECVORIGDSTADDR:
+ val = inet_test_bit(ORIGDSTADDR, sk);
+ goto copyval;
+ case IP_CHECKSUM:
+ val = inet_test_bit(CHECKSUM, sk);
+ goto copyval;
+ case IP_RECVFRAGSIZE:
+ val = inet_test_bit(RECVFRAGSIZE, sk);
+ goto copyval;
+ case IP_RECVERR:
+ val = inet_test_bit(RECVERR, sk);
+ goto copyval;
+ case IP_RECVERR_RFC4884:
+ val = inet_test_bit(RECVERR_RFC4884, sk);
+ goto copyval;
+ case IP_FREEBIND:
+ val = inet_test_bit(FREEBIND, sk);
+ goto copyval;
+ case IP_HDRINCL:
+ val = inet_test_bit(HDRINCL, sk);
+ goto copyval;
+ case IP_MULTICAST_LOOP:
+ val = inet_test_bit(MC_LOOP, sk);
+ goto copyval;
+ case IP_MULTICAST_ALL:
+ val = inet_test_bit(MC_ALL, sk);
+ goto copyval;
+ case IP_TRANSPARENT:
+ val = inet_test_bit(TRANSPARENT, sk);
+ goto copyval;
+ case IP_NODEFRAG:
+ val = inet_test_bit(NODEFRAG, sk);
+ goto copyval;
+ case IP_BIND_ADDRESS_NO_PORT:
+ val = inet_test_bit(BIND_ADDRESS_NO_PORT, sk);
+ goto copyval;
+ case IP_ROUTER_ALERT:
+ val = inet_test_bit(RTALERT, sk);
+ goto copyval;
+ case IP_TTL:
+ val = READ_ONCE(inet->uc_ttl);
+ if (val < 0)
+ val = READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_default_ttl);
+ goto copyval;
+ case IP_MINTTL:
+ val = READ_ONCE(inet->min_ttl);
+ goto copyval;
+ case IP_MULTICAST_TTL:
+ val = READ_ONCE(inet->mc_ttl);
+ goto copyval;
+ case IP_MTU_DISCOVER:
+ val = READ_ONCE(inet->pmtudisc);
+ goto copyval;
+ case IP_TOS:
+ val = READ_ONCE(inet->tos);
+ goto copyval;
case IP_OPTIONS:
{
unsigned char optbuf[sizeof(struct ip_options)+40];
struct ip_options *opt = (struct ip_options *)optbuf;
struct ip_options_rcu *inet_opt;
- inet_opt = rcu_dereference_protected(inet->inet_opt,
- lockdep_sock_is_held(sk));
+ rcu_read_lock();
+ inet_opt = rcu_dereference(inet->inet_opt);
opt->optlen = 0;
if (inet_opt)
memcpy(optbuf, &inet_opt->opt,
sizeof(struct ip_options) +
inet_opt->opt.optlen);
- release_sock(sk);
+ rcu_read_unlock();
- if (opt->optlen == 0)
- return put_user(0, optlen);
+ if (opt->optlen == 0) {
+ len = 0;
+ return copy_to_sockptr(optlen, &len, sizeof(int));
+ }
ip_options_undo(opt);
len = min_t(unsigned int, len, opt->optlen);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, opt->__data, len))
+ if (copy_to_sockptr(optval, opt->__data, len))
return -EFAULT;
return 0;
}
- case IP_PKTINFO:
- val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0;
- break;
- case IP_RECVTTL:
- val = (inet->cmsg_flags & IP_CMSG_TTL) != 0;
- break;
- case IP_RECVTOS:
- val = (inet->cmsg_flags & IP_CMSG_TOS) != 0;
- break;
- case IP_RECVOPTS:
- val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0;
- break;
- case IP_RETOPTS:
- val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0;
- break;
- case IP_PASSSEC:
- val = (inet->cmsg_flags & IP_CMSG_PASSSEC) != 0;
- break;
- case IP_RECVORIGDSTADDR:
- val = (inet->cmsg_flags & IP_CMSG_ORIGDSTADDR) != 0;
- break;
- case IP_CHECKSUM:
- val = (inet->cmsg_flags & IP_CMSG_CHECKSUM) != 0;
- break;
- case IP_RECVFRAGSIZE:
- val = (inet->cmsg_flags & IP_CMSG_RECVFRAGSIZE) != 0;
- break;
- case IP_TOS:
- val = inet->tos;
- break;
- case IP_TTL:
- {
- struct net *net = sock_net(sk);
- val = (inet->uc_ttl == -1 ?
- net->ipv4.sysctl_ip_default_ttl :
- inet->uc_ttl);
- break;
- }
- case IP_HDRINCL:
- val = inet->hdrincl;
- break;
- case IP_NODEFRAG:
- val = inet->nodefrag;
- break;
- case IP_BIND_ADDRESS_NO_PORT:
- val = inet->bind_address_no_port;
- break;
- case IP_MTU_DISCOVER:
- val = inet->pmtudisc;
- break;
case IP_MTU:
{
struct dst_entry *dst;
@@ -1416,37 +1637,72 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
val = dst_mtu(dst);
dst_release(dst);
}
- if (!val) {
- release_sock(sk);
+ if (!val)
return -ENOTCONN;
+ goto copyval;
+ }
+ case IP_PKTOPTIONS:
+ {
+ struct msghdr msg;
+
+ if (sk->sk_type != SOCK_STREAM)
+ return -ENOPROTOOPT;
+
+ if (optval.is_kernel) {
+ msg.msg_control_is_user = false;
+ msg.msg_control = optval.kernel;
+ } else {
+ msg.msg_control_is_user = true;
+ msg.msg_control_user = optval.user;
}
- break;
+ msg.msg_controllen = len;
+ msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0;
+
+ if (inet_test_bit(PKTINFO, sk)) {
+ struct in_pktinfo info;
+
+ info.ipi_addr.s_addr = READ_ONCE(inet->inet_rcv_saddr);
+ info.ipi_spec_dst.s_addr = READ_ONCE(inet->inet_rcv_saddr);
+ info.ipi_ifindex = READ_ONCE(inet->mc_index);
+ put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+ }
+ if (inet_test_bit(TTL, sk)) {
+ int hlim = READ_ONCE(inet->mc_ttl);
+
+ put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
+ }
+ if (inet_test_bit(TOS, sk)) {
+ int tos = READ_ONCE(inet->rcv_tos);
+ put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
+ }
+ len -= msg.msg_controllen;
+ return copy_to_sockptr(optlen, &len, sizeof(int));
}
- case IP_RECVERR:
- val = inet->recverr;
- break;
- case IP_MULTICAST_TTL:
- val = inet->mc_ttl;
- break;
- case IP_MULTICAST_LOOP:
- val = inet->mc_loop;
- break;
case IP_UNICAST_IF:
- val = (__force int)htonl((__u32) inet->uc_index);
- break;
+ val = (__force int)htonl((__u32) READ_ONCE(inet->uc_index));
+ goto copyval;
case IP_MULTICAST_IF:
{
struct in_addr addr;
len = min_t(unsigned int, len, sizeof(struct in_addr));
- addr.s_addr = inet->mc_addr;
- release_sock(sk);
+ addr.s_addr = READ_ONCE(inet->mc_addr);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &addr, len))
+ if (copy_to_sockptr(optval, &addr, len))
return -EFAULT;
return 0;
}
+ case IP_LOCAL_PORT_RANGE:
+ val = READ_ONCE(inet->local_port_range);
+ goto copyval;
+ }
+
+ if (needs_rtnl)
+ rtnl_lock();
+ sockopt_lock_sock(sk);
+
+ switch (optname) {
case IP_MSFILTER:
{
struct ip_msfilter msf;
@@ -1455,99 +1711,47 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
err = -EINVAL;
goto out;
}
- if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
+ if (copy_from_sockptr(&msf, optval, IP_MSFILTER_SIZE(0))) {
err = -EFAULT;
goto out;
}
- err = ip_mc_msfget(sk, &msf,
- (struct ip_msfilter __user *)optval, optlen);
+ err = ip_mc_msfget(sk, &msf, optval, optlen);
goto out;
}
case MCAST_MSFILTER:
- {
- struct group_filter gsf;
-
- if (len < GROUP_FILTER_SIZE(0)) {
- err = -EINVAL;
- goto out;
- }
- if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
- err = -EFAULT;
- goto out;
- }
- err = ip_mc_gsfget(sk, &gsf,
- (struct group_filter __user *)optval,
- optlen);
+ if (in_compat_syscall())
+ err = compat_ip_get_mcast_msfilter(sk, optval, optlen,
+ len);
+ else
+ err = ip_get_mcast_msfilter(sk, optval, optlen, len);
goto out;
- }
- case IP_MULTICAST_ALL:
- val = inet->mc_all;
- break;
- case IP_PKTOPTIONS:
- {
- struct msghdr msg;
-
- release_sock(sk);
-
- if (sk->sk_type != SOCK_STREAM)
- return -ENOPROTOOPT;
-
- msg.msg_control = (__force void *) optval;
- msg.msg_controllen = len;
- msg.msg_flags = flags;
-
- if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
- struct in_pktinfo info;
-
- info.ipi_addr.s_addr = inet->inet_rcv_saddr;
- info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr;
- info.ipi_ifindex = inet->mc_index;
- put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
- }
- if (inet->cmsg_flags & IP_CMSG_TTL) {
- int hlim = inet->mc_ttl;
- put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
- }
- if (inet->cmsg_flags & IP_CMSG_TOS) {
- int tos = inet->rcv_tos;
- put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos);
- }
- len -= msg.msg_controllen;
- return put_user(len, optlen);
- }
- case IP_FREEBIND:
- val = inet->freebind;
- break;
- case IP_TRANSPARENT:
- val = inet->transparent;
- break;
- case IP_MINTTL:
- val = inet->min_ttl;
+ case IP_PROTOCOL:
+ val = inet_sk(sk)->inet_num;
break;
default:
- release_sock(sk);
+ sockopt_release_sock(sk);
return -ENOPROTOOPT;
}
- release_sock(sk);
-
+ sockopt_release_sock(sk);
+copyval:
if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) {
unsigned char ucval = (unsigned char)val;
len = 1;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &ucval, 1))
+ if (copy_to_sockptr(optval, &ucval, 1))
return -EFAULT;
} else {
len = min_t(unsigned int, sizeof(int), len);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &val, len))
+ if (copy_to_sockptr(optval, &val, len))
return -EFAULT;
}
return 0;
out:
- release_sock(sk);
+ sockopt_release_sock(sk);
if (needs_rtnl)
rtnl_unlock();
return err;
@@ -1558,12 +1762,9 @@ int ip_getsockopt(struct sock *sk, int level,
{
int err;
- err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
-#ifdef CONFIG_BPFILTER
- if (optname >= BPFILTER_IPT_SO_GET_INFO &&
- optname < BPFILTER_IPT_GET_MAX)
- err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
-#endif
+ err = do_ip_getsockopt(sk, level, optname,
+ USER_SOCKPTR(optval), USER_SOCKPTR(optlen));
+
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
@@ -1582,41 +1783,3 @@ int ip_getsockopt(struct sock *sk, int level,
return err;
}
EXPORT_SYMBOL(ip_getsockopt);
-
-#ifdef CONFIG_COMPAT
-int compat_ip_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- int err;
-
- if (optname == MCAST_MSFILTER)
- return compat_mc_getsockopt(sk, level, optname, optval, optlen,
- ip_getsockopt);
-
- err = do_ip_getsockopt(sk, level, optname, optval, optlen,
- MSG_CMSG_COMPAT);
-
-#ifdef CONFIG_BPFILTER
- if (optname >= BPFILTER_IPT_SO_GET_INFO &&
- optname < BPFILTER_IPT_GET_MAX)
- err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
-#endif
-#ifdef CONFIG_NETFILTER
- /* we need to exclude all possible ENOPROTOOPTs except default case */
- if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
- !ip_mroute_opt(optname)) {
- int len;
-
- if (get_user(len, optlen))
- return -EFAULT;
-
- err = compat_nf_getsockopt(sk, PF_INET, optname, optval, &len);
- if (err >= 0)
- err = put_user(len, optlen);
- return err;
- }
-#endif
- return err;
-}
-EXPORT_SYMBOL(compat_ip_getsockopt);
-#endif
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 284a22154b4e..158a30ae7c5f 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2013 Nicira, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -53,9 +40,11 @@
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/netdev_lock.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>
+#include <net/inet_dscp.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
@@ -69,17 +58,13 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
IP_TNL_HASH_BITS);
}
-static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
- __be16 flags, __be32 key)
+static bool ip_tunnel_key_match(const struct ip_tunnel_parm_kern *p,
+ const unsigned long *flags, __be32 key)
{
- if (p->i_flags & TUNNEL_KEY) {
- if (flags & TUNNEL_KEY)
- return key == p->i_key;
- else
- /* key expected, none present */
- return false;
- } else
- return !(flags & TUNNEL_KEY);
+ if (!test_bit(IP_TUNNEL_KEY_BIT, flags))
+ return !test_bit(IP_TUNNEL_KEY_BIT, p->i_flags);
+
+ return test_bit(IP_TUNNEL_KEY_BIT, p->i_flags) && p->i_key == key;
}
/* Fallback tunnel: no source, no destination, no key, no options
@@ -94,13 +79,14 @@ static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
Given src, dst and key, find appropriate for input tunnel.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
- int link, __be16 flags,
+ int link, const unsigned long *flags,
__be32 remote, __be32 local,
__be32 key)
{
- unsigned int hash;
struct ip_tunnel *t, *cand = NULL;
struct hlist_head *head;
+ struct net_device *ndev;
+ unsigned int hash;
hash = ip_tunnel_hash(key, remote);
head = &itn->tunnels[hash];
@@ -114,10 +100,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
if (!ip_tunnel_key_match(&t->parms, flags, key))
continue;
- if (t->parms.link == link)
+ if (READ_ONCE(t->parms.link) == link)
return t;
- else
- cand = t;
+ cand = t;
}
hlist_for_each_entry_rcu(t, head, hash_node) {
@@ -129,9 +114,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
if (!ip_tunnel_key_match(&t->parms, flags, key))
continue;
- if (t->parms.link == link)
+ if (READ_ONCE(t->parms.link) == link)
return t;
- else if (!cand)
+ if (!cand)
cand = t;
}
@@ -149,29 +134,26 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
if (!ip_tunnel_key_match(&t->parms, flags, key))
continue;
- if (t->parms.link == link)
+ if (READ_ONCE(t->parms.link) == link)
return t;
- else if (!cand)
+ if (!cand)
cand = t;
}
- if (flags & TUNNEL_NO_KEY)
- goto skip_key_lookup;
-
hlist_for_each_entry_rcu(t, head, hash_node) {
- if (t->parms.i_key != key ||
+ if ((!test_bit(IP_TUNNEL_NO_KEY_BIT, flags) &&
+ t->parms.i_key != key) ||
t->parms.iph.saddr != 0 ||
t->parms.iph.daddr != 0 ||
!(t->dev->flags & IFF_UP))
continue;
- if (t->parms.link == link)
+ if (READ_ONCE(t->parms.link) == link)
return t;
- else if (!cand)
+ if (!cand)
cand = t;
}
-skip_key_lookup:
if (cand)
return cand;
@@ -179,15 +161,16 @@ skip_key_lookup:
if (t && t->dev->flags & IFF_UP)
return t;
- if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
- return netdev_priv(itn->fb_tunnel_dev);
+ ndev = READ_ONCE(itn->fb_tunnel_dev);
+ if (ndev && ndev->flags & IFF_UP)
+ return netdev_priv(ndev);
return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
- struct ip_tunnel_parm *parms)
+ struct ip_tunnel_parm_kern *parms)
{
unsigned int h;
__be32 remote;
@@ -198,7 +181,8 @@ static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
else
remote = 0;
- if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
+ if (!test_bit(IP_TUNNEL_KEY_BIT, parms->i_flags) &&
+ test_bit(IP_TUNNEL_VTI_BIT, parms->i_flags))
i_key = 0;
h = ip_tunnel_hash(i_key, remote);
@@ -222,21 +206,23 @@ static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
}
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
- struct ip_tunnel_parm *parms,
+ struct ip_tunnel_parm_kern *parms,
int type)
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
+ IP_TUNNEL_DECLARE_FLAGS(flags);
__be32 key = parms->i_key;
- __be16 flags = parms->i_flags;
int link = parms->link;
struct ip_tunnel *t = NULL;
struct hlist_head *head = ip_bucket(itn, parms);
- hlist_for_each_entry_rcu(t, head, hash_node) {
+ ip_tunnel_flags_copy(flags, parms->i_flags);
+
+ hlist_for_each_entry_rcu(t, head, hash_node, lockdep_rtnl_is_held()) {
if (local == t->parms.iph.saddr &&
remote == t->parms.iph.daddr &&
- link == t->parms.link &&
+ link == READ_ONCE(t->parms.link) &&
type == t->dev->type &&
ip_tunnel_key_match(&t->parms, flags, key))
break;
@@ -246,7 +232,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
static struct net_device *__ip_tunnel_create(struct net *net,
const struct rtnl_link_ops *ops,
- struct ip_tunnel_parm *parms)
+ struct ip_tunnel_parm_kern *parms)
{
int err;
struct ip_tunnel *tunnel;
@@ -257,11 +243,11 @@ static struct net_device *__ip_tunnel_create(struct net *net,
if (parms->name[0]) {
if (!dev_valid_name(parms->name))
goto failed;
- strlcpy(name, parms->name, IFNAMSIZ);
+ strscpy(name, parms->name);
} else {
if (strlen(ops->kind) > (IFNAMSIZ - 3))
goto failed;
- strcpy(name, ops->kind);
+ strscpy(name, ops->kind);
strcat(name, "%d");
}
@@ -309,8 +295,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
iph->saddr, tunnel->parms.o_key,
- RT_TOS(iph->tos), tunnel->parms.link,
- tunnel->fwmark);
+ iph->tos & INET_DSCP_MASK, tunnel->net,
+ tunnel->parms.link, tunnel->fwmark, 0, 0);
rt = ip_route_output_key(tunnel->net, &fl4);
if (!IS_ERR(rt)) {
@@ -332,7 +318,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
}
dev->needed_headroom = t_hlen + hlen;
- mtu -= (dev->hard_header_len + t_hlen);
+ mtu -= t_hlen + (dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0);
if (mtu < IPV4_MIN_MTU)
mtu = IPV4_MIN_MTU;
@@ -342,7 +328,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
static struct ip_tunnel *ip_tunnel_create(struct net *net,
struct ip_tunnel_net *itn,
- struct ip_tunnel_parm *parms)
+ struct ip_tunnel_parm_kern *parms)
{
struct ip_tunnel *nt;
struct net_device *dev;
@@ -362,7 +348,10 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net,
nt = netdev_priv(dev);
t_hlen = nt->hlen + sizeof(struct iphdr);
dev->min_mtu = ETH_MIN_MTU;
- dev->max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
+ dev->max_mtu = IP_MAX_MTU - t_hlen;
+ if (dev->type == ARPHRD_ETHER)
+ dev->max_mtu -= dev->hard_header_len;
+
ip_tunnel_add(itn, nt);
return nt;
@@ -371,39 +360,65 @@ err_dev_set_mtu:
return ERR_PTR(err);
}
+void ip_tunnel_md_udp_encap(struct sk_buff *skb, struct ip_tunnel_info *info)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct udphdr *udph;
+
+ if (iph->protocol != IPPROTO_UDP)
+ return;
+
+ udph = (struct udphdr *)((__u8 *)iph + (iph->ihl << 2));
+ info->encap.sport = udph->source;
+ info->encap.dport = udph->dest;
+}
+EXPORT_SYMBOL(ip_tunnel_md_udp_encap);
+
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
bool log_ecn_error)
{
- struct pcpu_sw_netstats *tstats;
const struct iphdr *iph = ip_hdr(skb);
- int err;
+ int nh, err;
#ifdef CONFIG_NET_IPGRE_BROADCAST
if (ipv4_is_multicast(iph->daddr)) {
- tunnel->dev->stats.multicast++;
+ DEV_STATS_INC(tunnel->dev, multicast);
skb->pkt_type = PACKET_BROADCAST;
}
#endif
- if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
- ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
- tunnel->dev->stats.rx_crc_errors++;
- tunnel->dev->stats.rx_errors++;
+ if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) !=
+ test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) {
+ DEV_STATS_INC(tunnel->dev, rx_crc_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto drop;
}
- if (tunnel->parms.i_flags&TUNNEL_SEQ) {
- if (!(tpi->flags&TUNNEL_SEQ) ||
+ if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) {
+ if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) ||
(tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
- tunnel->dev->stats.rx_fifo_errors++;
- tunnel->dev->stats.rx_errors++;
+ DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto drop;
}
tunnel->i_seqno = ntohl(tpi->seq) + 1;
}
- skb_reset_network_header(skb);
+ /* Save offset of outer header relative to skb->head,
+ * because we are going to reset the network header to the inner header
+ * and might change skb->head.
+ */
+ nh = skb_network_header(skb) - skb->head;
+
+ skb_set_network_header(skb, (tunnel->dev->type == ARPHRD_ETHER) ? ETH_HLEN : 0);
+
+ if (!pskb_inet_may_pull(skb)) {
+ DEV_STATS_INC(tunnel->dev, rx_length_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
+ goto drop;
+ }
+ iph = (struct iphdr *)(skb->head + nh);
err = IP_ECN_decapsulate(iph, skb);
if (unlikely(err)) {
@@ -411,18 +426,13 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
&iph->saddr, iph->tos);
if (err > 1) {
- ++tunnel->dev->stats.rx_frame_errors;
- ++tunnel->dev->stats.rx_errors;
+ DEV_STATS_INC(tunnel->dev, rx_frame_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto drop;
}
}
- tstats = this_cpu_ptr(tunnel->dev->tstats);
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
-
+ dev_sw_netstats_rx_add(tunnel->dev, skb->len);
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
if (tunnel->dev->type == ARPHRD_ETHER) {
@@ -501,37 +511,47 @@ EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
struct rtable *rt, __be16 df,
- const struct iphdr *inner_iph)
+ const struct iphdr *inner_iph,
+ int tunnel_hlen, __be32 dst, bool md)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
+ int pkt_size;
int mtu;
- if (df)
- mtu = dst_mtu(&rt->dst) - dev->hard_header_len
- - sizeof(struct iphdr) - tunnel->hlen;
- else
- mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
+ tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
+ pkt_size = skb->len - tunnel_hlen;
+ pkt_size -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
- skb_dst_update_pmtu(skb, mtu);
+ if (df) {
+ mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
+ mtu -= dev->type == ARPHRD_ETHER ? dev->hard_header_len : 0;
+ } else {
+ mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
+ }
+
+ if (skb_valid_dst(skb))
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
if (skb->protocol == htons(ETH_P_IP)) {
if (!skb_is_gso(skb) &&
(inner_iph->frag_off & htons(IP_DF)) &&
mtu < pkt_size) {
- memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
return -E2BIG;
}
}
#if IS_ENABLED(CONFIG_IPV6)
else if (skb->protocol == htons(ETH_P_IPV6)) {
- struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
+ struct rt6_info *rt6;
+ __be32 daddr;
+
+ rt6 = skb_valid_dst(skb) ? dst_rt6_info(skb_dst(skb)) :
+ NULL;
+ daddr = md ? dst : tunnel->parms.iph.daddr;
if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
mtu >= IPV6_MIN_MTU) {
- if ((tunnel->parms.iph.daddr &&
- !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
+ if ((daddr && !ipv4_is_multicast(daddr)) ||
rt6->rt6i_dst.plen == 128) {
rt6->rt6i_flags |= RTF_MODIFIED;
dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
@@ -540,7 +560,7 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
mtu < pkt_size) {
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
return -E2BIG;
}
}
@@ -548,17 +568,19 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
return 0;
}
-void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
+void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+ u8 proto, int tunnel_hlen)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
u32 headroom = sizeof(struct iphdr);
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
const struct iphdr *inner_iph;
- struct rtable *rt;
+ struct rtable *rt = NULL;
struct flowi4 fl4;
__be16 df = 0;
u8 tos, ttl;
+ bool use_cache;
tun_info = skb_tunnel_info(skb);
if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
@@ -574,20 +596,44 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
else if (skb->protocol == htons(ETH_P_IPV6))
tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
}
- ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0,
- RT_TOS(tos), tunnel->parms.link, tunnel->fwmark);
- if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
- goto tx_error;
- rt = ip_route_output_key(tunnel->net, &fl4);
- if (IS_ERR(rt)) {
- dev->stats.tx_carrier_errors++;
+ ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
+ tunnel_id_to_key32(key->tun_id),
+ tos & INET_DSCP_MASK, tunnel->net, 0, skb->mark,
+ skb_get_hash(skb), key->flow_flags);
+
+ if (!tunnel_hlen)
+ tunnel_hlen = ip_encap_hlen(&tun_info->encap);
+
+ if (ip_tunnel_encap(skb, &tun_info->encap, &proto, &fl4) < 0)
goto tx_error;
+
+ use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
+ if (use_cache)
+ rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
+ if (!rt) {
+ rt = ip_route_output_key(tunnel->net, &fl4);
+ if (IS_ERR(rt)) {
+ DEV_STATS_INC(dev, tx_carrier_errors);
+ goto tx_error;
+ }
+ if (use_cache)
+ dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
+ fl4.saddr);
}
if (rt->dst.dev == dev) {
ip_rt_put(rt);
- dev->stats.collisions++;
+ DEV_STATS_INC(dev, collisions);
+ goto tx_error;
+ }
+
+ if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, key->tun_flags))
+ df = htons(IP_DF);
+ if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
+ key->u.ipv4.dst, true)) {
+ ip_rt_put(rt);
goto tx_error;
}
+
tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
ttl = key->ttl;
if (ttl == 0) {
@@ -598,26 +644,23 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
else
ttl = ip4_dst_hoplimit(&rt->dst);
}
- if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
- df = htons(IP_DF);
- else if (skb->protocol == htons(ETH_P_IP))
- df = inner_iph->frag_off & htons(IP_DF);
- headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
- if (headroom > dev->needed_headroom)
- dev->needed_headroom = headroom;
- if (skb_cow_head(skb, dev->needed_headroom)) {
+ headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+ if (skb_cow_head(skb, headroom)) {
ip_rt_put(rt);
goto tx_dropped;
}
+
+ ip_tunnel_adj_headroom(dev, headroom);
+
iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
- df, !net_eq(tunnel->net, dev_net(dev)));
+ df, !net_eq(tunnel->net, dev_net(dev)), 0);
return;
tx_error:
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
goto kfree;
tx_dropped:
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
kfree:
kfree_skb(skb);
}
@@ -627,26 +670,22 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
const struct iphdr *tnl_params, u8 protocol)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- unsigned int inner_nhdr_len = 0;
+ struct ip_tunnel_info *tun_info = NULL;
const struct iphdr *inner_iph;
- struct flowi4 fl4;
- u8 tos, ttl;
- __be16 df;
- struct rtable *rt; /* Route to the other host */
unsigned int max_headroom; /* The extra header space needed */
- __be32 dst;
+ struct rtable *rt = NULL; /* Route to the other host */
+ __be16 payload_protocol;
+ bool use_cache = false;
+ struct flowi4 fl4;
+ bool md = false;
bool connected;
-
- /* ensure we can access the inner net header, for several users below */
- if (skb->protocol == htons(ETH_P_IP))
- inner_nhdr_len = sizeof(struct iphdr);
- else if (skb->protocol == htons(ETH_P_IPV6))
- inner_nhdr_len = sizeof(struct ipv6hdr);
- if (unlikely(!pskb_may_pull(skb, inner_nhdr_len)))
- goto tx_error;
+ u8 tos, ttl;
+ __be32 dst;
+ __be16 df;
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
connected = (tunnel->parms.iph.daddr != 0);
+ payload_protocol = skb_protocol(skb, true);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -655,16 +694,23 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
/* NBMA tunnel */
if (!skb_dst(skb)) {
- dev->stats.tx_fifo_errors++;
+ DEV_STATS_INC(dev, tx_fifo_errors);
goto tx_error;
}
- if (skb->protocol == htons(ETH_P_IP)) {
+ tun_info = skb_tunnel_info(skb);
+ if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
+ ip_tunnel_info_af(tun_info) == AF_INET &&
+ tun_info->key.u.ipv4.dst) {
+ dst = tun_info->key.u.ipv4.dst;
+ md = true;
+ connected = true;
+ } else if (payload_protocol == htons(ETH_P_IP)) {
rt = skb_rtable(skb);
dst = rt_nexthop(rt, inner_iph->daddr);
}
#if IS_ENABLED(CONFIG_IPV6)
- else if (skb->protocol == htons(ETH_P_IPV6)) {
+ else if (payload_protocol == htons(ETH_P_IPV6)) {
const struct in6_addr *addr6;
struct neighbour *neigh;
bool do_tx_error_icmp;
@@ -697,50 +743,66 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
else
goto tx_error;
- connected = false;
+ if (!md)
+ connected = false;
}
tos = tnl_params->tos;
if (tos & 0x1) {
tos &= ~0x1;
- if (skb->protocol == htons(ETH_P_IP)) {
+ if (payload_protocol == htons(ETH_P_IP)) {
tos = inner_iph->tos;
connected = false;
- } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ } else if (payload_protocol == htons(ETH_P_IPV6)) {
tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
connected = false;
}
}
ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
- tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
- tunnel->fwmark);
+ tunnel->parms.o_key, tos & INET_DSCP_MASK,
+ tunnel->net, READ_ONCE(tunnel->parms.link),
+ tunnel->fwmark, skb_get_hash(skb), 0);
- if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
+ if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0)
goto tx_error;
- rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
- NULL;
+ if (connected && md) {
+ use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
+ if (use_cache)
+ rt = dst_cache_get_ip4(&tun_info->dst_cache,
+ &fl4.saddr);
+ } else {
+ rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
+ &fl4.saddr) : NULL;
+ }
if (!rt) {
rt = ip_route_output_key(tunnel->net, &fl4);
if (IS_ERR(rt)) {
- dev->stats.tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
goto tx_error;
}
- if (connected)
+ if (use_cache)
+ dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
+ fl4.saddr);
+ else if (!md && connected)
dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
fl4.saddr);
}
if (rt->dst.dev == dev) {
ip_rt_put(rt);
- dev->stats.collisions++;
+ DEV_STATS_INC(dev, collisions);
goto tx_error;
}
- if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off, inner_iph)) {
+ df = tnl_params->frag_off;
+ if (payload_protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
+ df |= (inner_iph->frag_off & htons(IP_DF));
+
+ if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
ip_rt_put(rt);
goto tx_error;
}
@@ -758,34 +820,30 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
ttl = tnl_params->ttl;
if (ttl == 0) {
- if (skb->protocol == htons(ETH_P_IP))
+ if (payload_protocol == htons(ETH_P_IP))
ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
- else if (skb->protocol == htons(ETH_P_IPV6))
+ else if (payload_protocol == htons(ETH_P_IPV6))
ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
else
ttl = ip4_dst_hoplimit(&rt->dst);
}
- df = tnl_params->frag_off;
- if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
- df |= (inner_iph->frag_off&htons(IP_DF));
-
max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
- if (max_headroom > dev->needed_headroom)
- dev->needed_headroom = max_headroom;
- if (skb_cow_head(skb, dev->needed_headroom)) {
+ if (skb_cow_head(skb, max_headroom)) {
ip_rt_put(rt);
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
kfree_skb(skb);
return;
}
+ ip_tunnel_adj_headroom(dev, max_headroom);
+
iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
- df, !net_eq(tunnel->net, dev_net(dev)));
+ df, !net_eq(tunnel->net, dev_net(dev)), 0);
return;
#if IS_ENABLED(CONFIG_IPV6)
@@ -793,7 +851,7 @@ tx_error_icmp:
dst_link_failure(skb);
#endif
tx_error:
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
@@ -801,7 +859,7 @@ EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
static void ip_tunnel_update(struct ip_tunnel_net *itn,
struct ip_tunnel *t,
struct net_device *dev,
- struct ip_tunnel_parm *p,
+ struct ip_tunnel_parm_kern *p,
bool set_mtu,
__u32 fwmark)
{
@@ -811,7 +869,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
t->parms.i_key = p->i_key;
t->parms.o_key = p->o_key;
if (dev->type != ARPHRD_ETHER) {
- memcpy(dev->dev_addr, &p->iph.saddr, 4);
+ __dev_addr_set(dev, &p->iph.saddr, 4);
memcpy(dev->broadcast, &p->iph.daddr, 4);
}
ip_tunnel_add(itn, t);
@@ -823,17 +881,18 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
if (t->parms.link != p->link || t->fwmark != fwmark) {
int mtu;
- t->parms.link = p->link;
+ WRITE_ONCE(t->parms.link, p->link);
t->fwmark = fwmark;
mtu = ip_tunnel_bind_dev(dev);
if (set_mtu)
- dev->mtu = mtu;
+ WRITE_ONCE(dev->mtu, mtu);
}
dst_cache_reset(&t->dst_cache);
netdev_state_change(dev);
}
-int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
+int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
+ int cmd)
{
int err = 0;
struct ip_tunnel *t = netdev_priv(dev);
@@ -857,10 +916,10 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
goto done;
if (p->iph.ttl)
p->iph.frag_off |= htons(IP_DF);
- if (!(p->i_flags & VTI_ISVTI)) {
- if (!(p->i_flags & TUNNEL_KEY))
+ if (!test_bit(IP_TUNNEL_VTI_BIT, p->i_flags)) {
+ if (!test_bit(IP_TUNNEL_KEY_BIT, p->i_flags))
p->i_key = 0;
- if (!(p->o_flags & TUNNEL_KEY))
+ if (!test_bit(IP_TUNNEL_KEY_BIT, p->o_flags))
p->o_key = 0;
}
@@ -933,13 +992,73 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
done:
return err;
}
-EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
+EXPORT_SYMBOL_GPL(ip_tunnel_ctl);
+
+bool ip_tunnel_parm_from_user(struct ip_tunnel_parm_kern *kp,
+ const void __user *data)
+{
+ struct ip_tunnel_parm p;
+
+ if (copy_from_user(&p, data, sizeof(p)))
+ return false;
+
+ strscpy(kp->name, p.name);
+ kp->link = p.link;
+ ip_tunnel_flags_from_be16(kp->i_flags, p.i_flags);
+ ip_tunnel_flags_from_be16(kp->o_flags, p.o_flags);
+ kp->i_key = p.i_key;
+ kp->o_key = p.o_key;
+ memcpy(&kp->iph, &p.iph, min(sizeof(kp->iph), sizeof(p.iph)));
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_parm_from_user);
+
+bool ip_tunnel_parm_to_user(void __user *data, struct ip_tunnel_parm_kern *kp)
+{
+ struct ip_tunnel_parm p;
+
+ if (!ip_tunnel_flags_is_be16_compat(kp->i_flags) ||
+ !ip_tunnel_flags_is_be16_compat(kp->o_flags))
+ return false;
+
+ memset(&p, 0, sizeof(p));
+
+ strscpy(p.name, kp->name);
+ p.link = kp->link;
+ p.i_flags = ip_tunnel_flags_to_be16(kp->i_flags);
+ p.o_flags = ip_tunnel_flags_to_be16(kp->o_flags);
+ p.i_key = kp->i_key;
+ p.o_key = kp->o_key;
+ memcpy(&p.iph, &kp->iph, min(sizeof(p.iph), sizeof(kp->iph)));
+
+ return !copy_to_user(data, &p, sizeof(p));
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_parm_to_user);
+
+int ip_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+ void __user *data, int cmd)
+{
+ struct ip_tunnel_parm_kern p;
+ int err;
+
+ if (!ip_tunnel_parm_from_user(&p, data))
+ return -EFAULT;
+ err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
+ if (!err && !ip_tunnel_parm_to_user(data, &p))
+ return -EFAULT;
+ return err;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_siocdevprivate);
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
- int max_mtu = IP_MAX_MTU - dev->hard_header_len - t_hlen;
+ int max_mtu = IP_MAX_MTU - t_hlen;
+
+ if (dev->type == ARPHRD_ETHER)
+ max_mtu -= dev->hard_header_len;
if (new_mtu < ETH_MIN_MTU)
return -EINVAL;
@@ -951,7 +1070,7 @@ int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
new_mtu = max_mtu;
}
- dev->mtu = new_mtu;
+ WRITE_ONCE(dev->mtu, new_mtu);
return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
@@ -968,7 +1087,6 @@ static void ip_tunnel_dev_free(struct net_device *dev)
gro_cells_destroy(&tunnel->gro_cells);
dst_cache_destroy(&tunnel->dst_cache);
- free_percpu(dev->tstats);
}
void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
@@ -989,15 +1107,15 @@ struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- return tunnel->net;
+ return READ_ONCE(tunnel->net);
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);
int ip_tunnel_get_iflink(const struct net_device *dev)
{
- struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct ip_tunnel *tunnel = netdev_priv(dev);
- return tunnel->parms.link;
+ return READ_ONCE(tunnel->parms.link);
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);
@@ -1005,7 +1123,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
struct rtnl_link_ops *ops, char *devname)
{
struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
- struct ip_tunnel_parm parms;
+ struct ip_tunnel_parm_kern parms;
unsigned int i;
itn->rtnl_link_ops = ops;
@@ -1023,7 +1141,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
memset(&parms, 0, sizeof(parms));
if (devname)
- strlcpy(parms.name, devname, IFNAMSIZ);
+ strscpy(parms.name, devname, IFNAMSIZ);
rtnl_lock();
itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
@@ -1031,7 +1149,7 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
* Allowing to move it to another netns is clearly unsafe.
*/
if (!IS_ERR(itn->fb_tunnel_dev)) {
- itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
+ itn->fb_tunnel_dev->netns_immutable = true;
itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
itn->type = itn->fb_tunnel_dev->type;
@@ -1042,13 +1160,16 @@ int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
-static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
- struct list_head *head,
- struct rtnl_link_ops *ops)
+void ip_tunnel_delete_net(struct net *net, unsigned int id,
+ struct rtnl_link_ops *ops,
+ struct list_head *head)
{
+ struct ip_tunnel_net *itn = net_generic(net, id);
struct net_device *dev, *aux;
int h;
+ ASSERT_RTNL_NET(net);
+
for_each_netdev_safe(net, dev, aux)
if (dev->rtnl_link_ops == ops)
unregister_netdevice_queue(dev, head);
@@ -1066,29 +1187,13 @@ static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
unregister_netdevice_queue(t->dev, head);
}
}
+EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
-void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
- struct rtnl_link_ops *ops)
-{
- struct ip_tunnel_net *itn;
- struct net *net;
- LIST_HEAD(list);
-
- rtnl_lock();
- list_for_each_entry(net, net_list, exit_list) {
- itn = net_generic(net, id);
- ip_tunnel_destroy(net, itn, &list, ops);
- }
- unregister_netdevice_many(&list);
- rtnl_unlock();
-}
-EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);
-
-int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
- struct ip_tunnel_parm *p, __u32 fwmark)
+int ip_tunnel_newlink(struct net *net, struct net_device *dev,
+ struct nlattr *tb[], struct ip_tunnel_parm_kern *p,
+ __u32 fwmark)
{
struct ip_tunnel *nt;
- struct net *net = dev_net(dev);
struct ip_tunnel_net *itn;
int mtu;
int err;
@@ -1116,10 +1221,12 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
mtu = ip_tunnel_bind_dev(dev);
if (tb[IFLA_MTU]) {
- unsigned int max = IP_MAX_MTU - dev->hard_header_len - nt->hlen;
+ unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));
- mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU,
- (unsigned int)(max - sizeof(struct iphdr)));
+ if (dev->type == ARPHRD_ETHER)
+ max -= dev->hard_header_len;
+
+ mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
}
err = dev_set_mtu(dev, mtu);
@@ -1137,7 +1244,7 @@ err_register_netdevice:
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
- struct ip_tunnel_parm *p, __u32 fwmark)
+ struct ip_tunnel_parm_kern *p, __u32 fwmark)
{
struct ip_tunnel *t;
struct ip_tunnel *tunnel = netdev_priv(dev);
@@ -1182,33 +1289,26 @@ int ip_tunnel_init(struct net_device *dev)
dev->needs_free_netdev = true;
dev->priv_destructor = ip_tunnel_dev_free;
- dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
- if (!dev->tstats)
- return -ENOMEM;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
- if (err) {
- free_percpu(dev->tstats);
+ if (err)
return err;
- }
err = gro_cells_init(&tunnel->gro_cells, dev);
if (err) {
dst_cache_destroy(&tunnel->dst_cache);
- free_percpu(dev->tstats);
return err;
}
tunnel->dev = dev;
- tunnel->net = dev_net(dev);
- strcpy(tunnel->parms.name, dev->name);
+ strscpy(tunnel->parms.name, dev->name);
iph->version = 4;
iph->ihl = 5;
- if (tunnel->collect_md) {
- dev->features |= NETIF_F_NETNS_LOCAL;
+ if (tunnel->collect_md)
netif_keep_dst(dev);
- }
+ netdev_lockdep_set_classes(dev);
return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);
@@ -1220,9 +1320,9 @@ void ip_tunnel_uninit(struct net_device *dev)
struct ip_tunnel_net *itn;
itn = net_generic(net, tunnel->ip_tnl_net_id);
- /* fb_tunnel_dev will be unregisted in net-exit call. */
- if (itn->fb_tunnel_dev != dev)
- ip_tunnel_del(itn, netdev_priv(dev));
+ ip_tunnel_del(itn, netdev_priv(dev));
+ if (itn->fb_tunnel_dev == dev)
+ WRITE_ONCE(itn->fb_tunnel_dev, NULL);
dst_cache_reset(&tunnel->dst_cache);
}
@@ -1236,4 +1336,5 @@ void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);
+MODULE_DESCRIPTION("IPv4 tunnel implementation library");
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index dde671e97829..2e61ac137128 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -1,19 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2013 Nicira, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -38,6 +25,7 @@
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/ip6_tunnel.h>
+#include <net/ip6_checksum.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
@@ -47,6 +35,9 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/dst_metadata.h>
+#include <net/geneve.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
const struct ip_tunnel_encap_ops __rcu *
iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly;
@@ -58,7 +49,8 @@ EXPORT_SYMBOL(ip6tun_encaps);
void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 proto,
- __u8 tos, __u8 ttl, __be16 df, bool xnet)
+ __u8 tos, __u8 ttl, __be16 df, bool xnet,
+ u16 ipcb_flags)
{
int pkt_len = skb->len - skb_inner_network_offset(skb);
struct net *net = dev_net(rt->dst.dev);
@@ -71,6 +63,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
skb_clear_hash_if_not_l4(skb);
skb_dst_set(skb, &rt->dst);
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ IPCB(skb)->flags = ipcb_flags;
/* Push down and install the IP header. */
skb_push(skb, sizeof(struct iphdr));
@@ -80,7 +73,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
iph->version = 4;
iph->ihl = sizeof(struct iphdr) >> 2;
- iph->frag_off = df;
+ iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : df;
iph->protocol = proto;
iph->tos = tos;
iph->daddr = dst;
@@ -89,9 +82,12 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
__ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
err = ip_local_out(net, sk, skb);
- if (unlikely(net_xmit_eval(err)))
- pkt_len = 0;
- iptunnel_xmit_stats(dev, pkt_len);
+
+ if (dev) {
+ if (unlikely(net_xmit_eval(err)))
+ pkt_len = 0;
+ iptunnel_xmit_stats(dev, pkt_len);
+ }
}
EXPORT_SYMBOL_GPL(iptunnel_xmit);
@@ -120,7 +116,7 @@ int __iptunnel_pull_header(struct sk_buff *skb, int hdr_len,
}
skb_clear_hash_if_not_l4(skb);
- skb->vlan_tci = 0;
+ __vlan_hwaccel_clear_tag(skb);
skb_set_queue_mapping(skb, 0);
skb_scrub_packet(skb, xnet);
@@ -131,27 +127,30 @@ EXPORT_SYMBOL_GPL(__iptunnel_pull_header);
struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
gfp_t flags)
{
+ IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { };
struct metadata_dst *res;
struct ip_tunnel_info *dst, *src;
if (!md || md->type != METADATA_IP_TUNNEL ||
md->u.tun_info.mode & IP_TUNNEL_INFO_TX)
-
return NULL;
- res = metadata_dst_alloc(0, METADATA_IP_TUNNEL, flags);
+ src = &md->u.tun_info;
+ res = metadata_dst_alloc(src->options_len, METADATA_IP_TUNNEL, flags);
if (!res)
return NULL;
dst = &res->u.tun_info;
- src = &md->u.tun_info;
dst->key.tun_id = src->key.tun_id;
if (src->mode & IP_TUNNEL_INFO_IPV6)
memcpy(&dst->key.u.ipv6.dst, &src->key.u.ipv6.src,
sizeof(struct in6_addr));
else
dst->key.u.ipv4.dst = src->key.u.ipv4.src;
+ ip_tunnel_flags_copy(dst->key.tun_flags, src->key.tun_flags);
dst->mode = src->mode | IP_TUNNEL_INFO_TX;
+ ip_tunnel_info_opts_set(dst, ip_tunnel_info_opts(src),
+ src->options_len, tun_flags);
return res;
}
@@ -189,61 +188,496 @@ int iptunnel_handle_offloads(struct sk_buff *skb,
}
EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
-/* Often modified stats are per cpu, other are shared (netdev->stats) */
-void ip_tunnel_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *tot)
+/**
+ * iptunnel_pmtud_build_icmp() - Build ICMP error message for PMTUD
+ * @skb: Original packet with L2 header
+ * @mtu: MTU value for ICMP error
+ *
+ * Return: length on success, negative error code if message couldn't be built.
+ */
+static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ struct icmphdr *icmph;
+ struct iphdr *niph;
+ struct ethhdr eh;
+ int len, err;
+
+ if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
+ return -EINVAL;
+
+ if (skb_is_gso(skb))
+ skb_gso_reset(skb);
+
+ skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
+ pskb_pull(skb, ETH_HLEN);
+ skb_reset_network_header(skb);
+
+ err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph));
+ if (err)
+ return err;
+
+ len = skb->len + sizeof(*icmph);
+ err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN);
+ if (err)
+ return err;
+
+ icmph = skb_push(skb, sizeof(*icmph));
+ *icmph = (struct icmphdr) {
+ .type = ICMP_DEST_UNREACH,
+ .code = ICMP_FRAG_NEEDED,
+ .checksum = 0,
+ .un.frag.__unused = 0,
+ .un.frag.mtu = htons(mtu),
+ };
+ icmph->checksum = csum_fold(skb_checksum(skb, 0, len, 0));
+ skb_reset_transport_header(skb);
+
+ niph = skb_push(skb, sizeof(*niph));
+ *niph = (struct iphdr) {
+ .ihl = sizeof(*niph) / 4u,
+ .version = 4,
+ .tos = 0,
+ .tot_len = htons(len + sizeof(*niph)),
+ .id = 0,
+ .frag_off = htons(IP_DF),
+ .ttl = iph->ttl,
+ .protocol = IPPROTO_ICMP,
+ .saddr = iph->daddr,
+ .daddr = iph->saddr,
+ };
+ ip_send_check(niph);
+ skb_reset_network_header(skb);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
+ skb_reset_mac_header(skb);
+
+ return skb->len;
+}
+
+/**
+ * iptunnel_pmtud_check_icmp() - Trigger ICMP reply if needed and allowed
+ * @skb: Buffer being sent by encapsulation, L2 headers expected
+ * @mtu: Network MTU for path
+ *
+ * Return: 0 for no ICMP reply, length if built, negative value on error.
+ */
+static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu)
{
- int i;
+ const struct icmphdr *icmph = icmp_hdr(skb);
+ const struct iphdr *iph = ip_hdr(skb);
- netdev_stats_to_stats64(tot, &dev->stats);
+ if (mtu < 576 || iph->frag_off != htons(IP_DF))
+ return 0;
+
+ if (ipv4_is_lbcast(iph->daddr) || ipv4_is_multicast(iph->daddr) ||
+ ipv4_is_zeronet(iph->saddr) || ipv4_is_loopback(iph->saddr) ||
+ ipv4_is_lbcast(iph->saddr) || ipv4_is_multicast(iph->saddr))
+ return 0;
- for_each_possible_cpu(i) {
- const struct pcpu_sw_netstats *tstats =
- per_cpu_ptr(dev->tstats, i);
- u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
- unsigned int start;
+ if (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type))
+ return 0;
+
+ return iptunnel_pmtud_build_icmp(skb, mtu);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * iptunnel_pmtud_build_icmpv6() - Build ICMPv6 error message for PMTUD
+ * @skb: Original packet with L2 header
+ * @mtu: MTU value for ICMPv6 error
+ *
+ * Return: length on success, negative error code if message couldn't be built.
+ */
+static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ struct icmp6hdr *icmp6h;
+ struct ipv6hdr *nip6h;
+ struct ethhdr eh;
+ int len, err;
+ __wsum csum;
+
+ if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
+ return -EINVAL;
+
+ if (skb_is_gso(skb))
+ skb_gso_reset(skb);
+
+ skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
+ pskb_pull(skb, ETH_HLEN);
+ skb_reset_network_header(skb);
+
+ err = pskb_trim(skb, IPV6_MIN_MTU - sizeof(*nip6h) - sizeof(*icmp6h));
+ if (err)
+ return err;
+
+ len = skb->len + sizeof(*icmp6h);
+ err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN);
+ if (err)
+ return err;
+
+ icmp6h = skb_push(skb, sizeof(*icmp6h));
+ *icmp6h = (struct icmp6hdr) {
+ .icmp6_type = ICMPV6_PKT_TOOBIG,
+ .icmp6_code = 0,
+ .icmp6_cksum = 0,
+ .icmp6_mtu = htonl(mtu),
+ };
+ skb_reset_transport_header(skb);
+
+ nip6h = skb_push(skb, sizeof(*nip6h));
+ *nip6h = (struct ipv6hdr) {
+ .priority = 0,
+ .version = 6,
+ .flow_lbl = { 0 },
+ .payload_len = htons(len),
+ .nexthdr = IPPROTO_ICMPV6,
+ .hop_limit = ip6h->hop_limit,
+ .saddr = ip6h->daddr,
+ .daddr = ip6h->saddr,
+ };
+ skb_reset_network_header(skb);
+
+ csum = skb_checksum(skb, skb_transport_offset(skb), len, 0);
+ icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, len,
+ IPPROTO_ICMPV6, csum);
+
+ skb->ip_summed = CHECKSUM_NONE;
+
+ eth_header(skb, skb->dev, ntohs(eh.h_proto), eh.h_source, eh.h_dest, 0);
+ skb_reset_mac_header(skb);
+
+ return skb->len;
+}
+
+/**
+ * iptunnel_pmtud_check_icmpv6() - Trigger ICMPv6 reply if needed and allowed
+ * @skb: Buffer being sent by encapsulation, L2 headers expected
+ * @mtu: Network MTU for path
+ *
+ * Return: 0 for no ICMPv6 reply, length if built, negative value on error.
+ */
+static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int stype = ipv6_addr_type(&ip6h->saddr);
+ u8 proto = ip6h->nexthdr;
+ __be16 frag_off;
+ int offset;
+
+ if (mtu < IPV6_MIN_MTU)
+ return 0;
+
+ if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST ||
+ stype == IPV6_ADDR_LOOPBACK)
+ return 0;
+
+ offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto,
+ &frag_off);
+ if (offset < 0 || (frag_off & htons(~0x7)))
+ return 0;
- do {
- start = u64_stats_fetch_begin_irq(&tstats->syncp);
- rx_packets = tstats->rx_packets;
- tx_packets = tstats->tx_packets;
- rx_bytes = tstats->rx_bytes;
- tx_bytes = tstats->tx_bytes;
- } while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
+ if (proto == IPPROTO_ICMPV6) {
+ struct icmp6hdr *icmp6h;
- tot->rx_packets += rx_packets;
- tot->tx_packets += tx_packets;
- tot->rx_bytes += rx_bytes;
- tot->tx_bytes += tx_bytes;
+ if (!pskb_may_pull(skb, skb_network_header(skb) +
+ offset + 1 - skb->data))
+ return 0;
+
+ icmp6h = (struct icmp6hdr *)(skb_network_header(skb) + offset);
+ if (icmpv6_is_err(icmp6h->icmp6_type) ||
+ icmp6h->icmp6_type == NDISC_REDIRECT)
+ return 0;
}
+
+ return iptunnel_pmtud_build_icmpv6(skb, mtu);
+}
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+
+/**
+ * skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as needed
+ * @skb: Buffer being sent by encapsulation, L2 headers expected
+ * @encap_dst: Destination for tunnel encapsulation (outer IP)
+ * @headroom: Encapsulation header size, bytes
+ * @reply: Build matching ICMP or ICMPv6 message as a result
+ *
+ * L2 tunnel implementations that can carry IP and can be directly bridged
+ * (currently UDP tunnels) can't always rely on IP forwarding paths to handle
+ * PMTU discovery. In the bridged case, ICMP or ICMPv6 messages need to be built
+ * based on payload and sent back by the encapsulation itself.
+ *
+ * For routable interfaces, we just need to update the PMTU for the destination.
+ *
+ * Return: 0 if ICMP error not needed, length if built, negative value on error
+ */
+int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
+ int headroom, bool reply)
+{
+ u32 mtu = dst_mtu(encap_dst) - headroom;
+
+ if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) ||
+ (!skb_is_gso(skb) && (skb->len - skb_network_offset(skb)) <= mtu))
+ return 0;
+
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
+
+ if (!reply)
+ return 0;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return iptunnel_pmtud_check_icmp(skb, mtu);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (skb->protocol == htons(ETH_P_IPV6))
+ return iptunnel_pmtud_check_icmpv6(skb, mtu);
+#endif
+ return 0;
}
-EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
+EXPORT_SYMBOL(skb_tunnel_check_pmtu);
static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = {
+ [LWTUNNEL_IP_UNSPEC] = { .strict_start_type = LWTUNNEL_IP_OPTS },
[LWTUNNEL_IP_ID] = { .type = NLA_U64 },
[LWTUNNEL_IP_DST] = { .type = NLA_U32 },
[LWTUNNEL_IP_SRC] = { .type = NLA_U32 },
[LWTUNNEL_IP_TTL] = { .type = NLA_U8 },
[LWTUNNEL_IP_TOS] = { .type = NLA_U8 },
[LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 },
+ [LWTUNNEL_IP_OPTS] = { .type = NLA_NESTED },
};
-static int ip_tun_build_state(struct nlattr *attr,
+/*
+ * Policy for the option kinds nested inside LWTUNNEL_IP_OPTS and
+ * LWTUNNEL_IP6_OPTS: each kind is itself a nested attribute.
+ */
+static const struct nla_policy ip_opts_policy[LWTUNNEL_IP_OPTS_MAX + 1] = {
+	[LWTUNNEL_IP_OPTS_GENEVE]	= { .type = NLA_NESTED },
+	[LWTUNNEL_IP_OPTS_VXLAN]	= { .type = NLA_NESTED },
+	[LWTUNNEL_IP_OPTS_ERSPAN]	= { .type = NLA_NESTED },
+};
+
+/*
+ * Policy for the attributes of one Geneve option.  DATA is bounded here;
+ * ip_tun_parse_opts_geneve() further requires a multiple of four bytes.
+ */
+static const struct nla_policy
+geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = {
+	[LWTUNNEL_IP_OPT_GENEVE_CLASS]	= { .type = NLA_U16 },
+	[LWTUNNEL_IP_OPT_GENEVE_TYPE]	= { .type = NLA_U8 },
+	[LWTUNNEL_IP_OPT_GENEVE_DATA]	= { .type = NLA_BINARY, .len = 127 },
+};
+
+/* Policy for the attributes of the VXLAN option: a single 32-bit GBP value. */
+static const struct nla_policy
+vxlan_opt_policy[LWTUNNEL_IP_OPT_VXLAN_MAX + 1] = {
+	[LWTUNNEL_IP_OPT_VXLAN_GBP]	= { .type = NLA_U32 },
+};
+
+/*
+ * Policy for the attributes of the ERSPAN option.  Which of INDEX, DIR and
+ * HWID are mandatory depends on VER and is checked by the parser.
+ */
+static const struct nla_policy
+erspan_opt_policy[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1] = {
+	[LWTUNNEL_IP_OPT_ERSPAN_VER]	= { .type = NLA_U8 },
+	[LWTUNNEL_IP_OPT_ERSPAN_INDEX]	= { .type = NLA_U32 },
+	[LWTUNNEL_IP_OPT_ERSPAN_DIR]	= { .type = NLA_U8 },
+	[LWTUNNEL_IP_OPT_ERSPAN_HWID]	= { .type = NLA_U8 },
+};
+
+/*
+ * Validate one nested LWTUNNEL_IP_OPTS_GENEVE attribute and, when @info is
+ * not NULL, store it as a struct geneve_opt at byte offset @opts_len in the
+ * tunnel options area and flag Geneve options as present.
+ *
+ * CLASS, TYPE and DATA are all required and DATA must be a multiple of four
+ * bytes long.  Returns the number of option bytes used by this attribute
+ * (header plus data), or a negative errno.
+ */
+static int ip_tun_parse_opts_geneve(struct nlattr *attr,
+				    struct ip_tunnel_info *info, int opts_len,
+				    struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[LWTUNNEL_IP_OPT_GENEVE_MAX + 1];
+	struct geneve_opt *opt;
+	struct nlattr *data;
+	int data_len, err;
+
+	err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_GENEVE_MAX, attr,
+			       geneve_opt_policy, extack);
+	if (err)
+		return err;
+
+	data = tb[LWTUNNEL_IP_OPT_GENEVE_DATA];
+	if (!tb[LWTUNNEL_IP_OPT_GENEVE_CLASS] ||
+	    !tb[LWTUNNEL_IP_OPT_GENEVE_TYPE] || !data)
+		return -EINVAL;
+
+	data_len = nla_len(data);
+	if (data_len % 4)
+		return -EINVAL;
+
+	/* Without @info this is a sizing pass only */
+	if (!info)
+		return sizeof(struct geneve_opt) + data_len;
+
+	opt = ip_tunnel_info_opts(info) + opts_len;
+	memcpy(opt->opt_data, nla_data(data), data_len);
+	opt->length = data_len / 4;	/* stored in 4-byte units */
+	opt->opt_class = nla_get_be16(tb[LWTUNNEL_IP_OPT_GENEVE_CLASS]);
+	opt->type = nla_get_u8(tb[LWTUNNEL_IP_OPT_GENEVE_TYPE]);
+	__set_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags);
+
+	return sizeof(struct geneve_opt) + data_len;
+}
+
+/*
+ * Validate one nested LWTUNNEL_IP_OPTS_VXLAN attribute and, when @info is not
+ * NULL, store the masked GBP value as a struct vxlan_metadata at byte offset
+ * @opts_len in the tunnel options area and flag VXLAN options as present.
+ *
+ * Returns sizeof(struct vxlan_metadata), or a negative errno.
+ */
+static int ip_tun_parse_opts_vxlan(struct nlattr *attr,
+				   struct ip_tunnel_info *info, int opts_len,
+				   struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[LWTUNNEL_IP_OPT_VXLAN_MAX + 1];
+	struct vxlan_metadata *md;
+	int err;
+
+	err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_VXLAN_MAX, attr,
+			       vxlan_opt_policy, extack);
+	if (err)
+		return err;
+
+	/* GBP is the only attribute and it is mandatory */
+	if (!tb[LWTUNNEL_IP_OPT_VXLAN_GBP])
+		return -EINVAL;
+
+	/* Without @info this is a sizing pass only */
+	if (!info)
+		return sizeof(struct vxlan_metadata);
+
+	md = ip_tunnel_info_opts(info) + opts_len;
+	md->gbp = nla_get_u32(tb[LWTUNNEL_IP_OPT_VXLAN_GBP]) & VXLAN_GBP_MASK;
+	__set_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags);
+
+	return sizeof(struct vxlan_metadata);
+}
+
+/*
+ * Validate one nested LWTUNNEL_IP_OPTS_ERSPAN attribute and, when @info is
+ * not NULL, store it as a struct erspan_metadata at byte offset @opts_len in
+ * the tunnel options area and flag ERSPAN options as present.
+ *
+ * VER is mandatory and must be 1 (INDEX then required) or 2 (DIR and HWID
+ * then required).  Returns sizeof(struct erspan_metadata), or a negative
+ * errno.
+ */
+static int ip_tun_parse_opts_erspan(struct nlattr *attr,
+				    struct ip_tunnel_info *info, int opts_len,
+				    struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[LWTUNNEL_IP_OPT_ERSPAN_MAX + 1];
+	struct erspan_metadata *md;
+	int err;
+	u8 ver;
+
+	err = nla_parse_nested(tb, LWTUNNEL_IP_OPT_ERSPAN_MAX, attr,
+			       erspan_opt_policy, extack);
+	if (err)
+		return err;
+
+	if (!tb[LWTUNNEL_IP_OPT_ERSPAN_VER])
+		return -EINVAL;
+
+	ver = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_VER]);
+	switch (ver) {
+	case 1:
+		if (!tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX])
+			return -EINVAL;
+		break;
+	case 2:
+		if (!tb[LWTUNNEL_IP_OPT_ERSPAN_DIR] ||
+		    !tb[LWTUNNEL_IP_OPT_ERSPAN_HWID])
+			return -EINVAL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* Without @info this is a sizing pass only */
+	if (!info)
+		return sizeof(struct erspan_metadata);
+
+	md = ip_tunnel_info_opts(info) + opts_len;
+	md->version = ver;
+	if (ver == 1) {
+		md->u.index = nla_get_be32(tb[LWTUNNEL_IP_OPT_ERSPAN_INDEX]);
+	} else {
+		md->u.md2.dir = nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_DIR]);
+		set_hwid(&md->u.md2,
+			 nla_get_u8(tb[LWTUNNEL_IP_OPT_ERSPAN_HWID]));
+	}
+	__set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags);
+
+	return sizeof(struct erspan_metadata);
+}
+
+/*
+ * Walk the option attributes nested in @attr and hand each one to the parser
+ * for its kind.  With @info == NULL nothing is stored and only the total size
+ * is computed; otherwise every option is written at the running offset.
+ *
+ * Geneve options may repeat (up to IP_TUNNEL_OPTS_MAX bytes in total) but may
+ * not be mixed with another kind; VXLAN and ERSPAN are accepted only as the
+ * first and only option.  A NULL @attr yields 0.
+ *
+ * Returns the total length of the options in bytes, or a negative errno.
+ */
+static int ip_tun_parse_opts(struct nlattr *attr, struct ip_tunnel_info *info,
+			     struct netlink_ext_ack *extack)
+{
+	int err, rem, len, total = 0;
+	struct nlattr *nla;
+	u32 seen = 0;
+
+	if (!attr)
+		return 0;
+
+	err = nla_validate(nla_data(attr), nla_len(attr), LWTUNNEL_IP_OPTS_MAX,
+			   ip_opts_policy, extack);
+	if (err)
+		return err;
+
+	nla_for_each_attr(nla, nla_data(attr), nla_len(attr), rem) {
+		u32 kind;
+
+		switch (nla_type(nla)) {
+		case LWTUNNEL_IP_OPTS_GENEVE:
+			if (seen && seen != IP_TUNNEL_GENEVE_OPT_BIT)
+				return -EINVAL;
+			kind = IP_TUNNEL_GENEVE_OPT_BIT;
+			len = ip_tun_parse_opts_geneve(nla, info, total,
+						       extack);
+			break;
+		case LWTUNNEL_IP_OPTS_VXLAN:
+			if (seen)
+				return -EINVAL;
+			kind = IP_TUNNEL_VXLAN_OPT_BIT;
+			len = ip_tun_parse_opts_vxlan(nla, info, total,
+						      extack);
+			break;
+		case LWTUNNEL_IP_OPTS_ERSPAN:
+			if (seen)
+				return -EINVAL;
+			kind = IP_TUNNEL_ERSPAN_OPT_BIT;
+			len = ip_tun_parse_opts_erspan(nla, info, total,
+						       extack);
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		if (len < 0)
+			return len;
+
+		total += len;
+		/* Only Geneve can accumulate, so only it is bounded here */
+		if (kind == IP_TUNNEL_GENEVE_OPT_BIT &&
+		    total > IP_TUNNEL_OPTS_MAX)
+			return -EINVAL;
+
+		seen = kind;
+	}
+
+	return total;
+}
+
+/*
+ * Sizing pass: validate the options in @attr without storing them and return
+ * their total length in bytes, or a negative errno.
+ */
+static int ip_tun_get_optlen(struct nlattr *attr,
+			     struct netlink_ext_ack *extack)
+{
+	struct ip_tunnel_info *no_info = NULL;
+
+	return ip_tun_parse_opts(attr, no_info, extack);
+}
+
+/*
+ * Storing pass: parse the options in @attr into the options area of @info.
+ * Returns the total length written in bytes, or a negative errno.
+ */
+static int ip_tun_set_opts(struct nlattr *attr, struct ip_tunnel_info *info,
+			   struct netlink_ext_ack *extack)
+{
+	int len = ip_tun_parse_opts(attr, info, extack);
+
+	return len;
+}
+
+static int ip_tun_build_state(struct net *net, struct nlattr *attr,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
{
- struct ip_tunnel_info *tun_info;
- struct lwtunnel_state *new_state;
struct nlattr *tb[LWTUNNEL_IP_MAX + 1];
- int err;
+ struct lwtunnel_state *new_state;
+ struct ip_tunnel_info *tun_info;
+ int err, opt_len;
- err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy,
- extack);
+ err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP_MAX, attr,
+ ip_tun_policy, extack);
if (err < 0)
return err;
- new_state = lwtunnel_state_alloc(sizeof(*tun_info));
+ opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP_OPTS], extack);
+ if (opt_len < 0)
+ return opt_len;
+
+ new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
if (!new_state)
return -ENOMEM;
@@ -251,6 +685,20 @@ static int ip_tun_build_state(struct nlattr *attr,
tun_info = lwt_tun_info(new_state);
+ err = ip_tun_set_opts(tb[LWTUNNEL_IP_OPTS], tun_info, extack);
+ if (err < 0) {
+ lwtstate_free(new_state);
+ return err;
+ }
+
+#ifdef CONFIG_DST_CACHE
+ err = dst_cache_init(&tun_info->dst_cache, GFP_KERNEL);
+ if (err) {
+ lwtstate_free(new_state);
+ return err;
+ }
+#endif
+
if (tb[LWTUNNEL_IP_ID])
tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]);
@@ -266,17 +714,142 @@ static int ip_tun_build_state(struct nlattr *attr,
if (tb[LWTUNNEL_IP_TOS])
tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
- if (tb[LWTUNNEL_IP_FLAGS])
- tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP_FLAGS]);
+ if (tb[LWTUNNEL_IP_FLAGS]) {
+ IP_TUNNEL_DECLARE_FLAGS(flags);
+
+ ip_tunnel_flags_from_be16(flags,
+ nla_get_be16(tb[LWTUNNEL_IP_FLAGS]));
+ ip_tunnel_clear_options_present(flags);
+
+ ip_tunnel_flags_or(tun_info->key.tun_flags,
+ tun_info->key.tun_flags, flags);
+ }
tun_info->mode = IP_TUNNEL_INFO_TX;
- tun_info->options_len = 0;
+ tun_info->options_len = opt_len;
*ts = new_state;
return 0;
}
+/*
+ * Tear down the dst cache embedded in the tunnel info of @lwtstate.  Does
+ * nothing when CONFIG_DST_CACHE is not set.
+ */
+static void ip_tun_destroy_state(struct lwtunnel_state *lwtstate)
+{
+#ifdef CONFIG_DST_CACHE
+	dst_cache_destroy(&lwt_tun_info(lwtstate)->dst_cache);
+#endif
+}
+
+/*
+ * Dump every Geneve option stored in @tun_info as CLASS/TYPE/DATA attributes
+ * inside one LWTUNNEL_IP_OPTS_GENEVE nest.  On any failure the nest is
+ * cancelled.  Returns 0 on success or -ENOMEM.
+ */
+static int ip_tun_fill_encap_opts_geneve(struct sk_buff *skb,
+					 struct ip_tunnel_info *tun_info)
+{
+	struct geneve_opt *opt;
+	struct nlattr *nest;
+	int offset;
+
+	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_GENEVE);
+	if (!nest)
+		return -ENOMEM;
+
+	/* Options are packed back to back; length counts 4-byte units */
+	for (offset = 0; tun_info->options_len > offset;
+	     offset += sizeof(*opt) + opt->length * 4) {
+		opt = ip_tunnel_info_opts(tun_info) + offset;
+
+		if (nla_put_be16(skb, LWTUNNEL_IP_OPT_GENEVE_CLASS,
+				 opt->opt_class))
+			goto cancel;
+		if (nla_put_u8(skb, LWTUNNEL_IP_OPT_GENEVE_TYPE, opt->type))
+			goto cancel;
+		if (nla_put(skb, LWTUNNEL_IP_OPT_GENEVE_DATA, opt->length * 4,
+			    opt->opt_data))
+			goto cancel;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+cancel:
+	nla_nest_cancel(skb, nest);
+	return -ENOMEM;
+}
+
+/*
+ * Dump the VXLAN metadata stored in @tun_info as a GBP attribute inside one
+ * LWTUNNEL_IP_OPTS_VXLAN nest.  Returns 0 on success or -ENOMEM, in which
+ * case the nest is cancelled.
+ */
+static int ip_tun_fill_encap_opts_vxlan(struct sk_buff *skb,
+					struct ip_tunnel_info *tun_info)
+{
+	struct vxlan_metadata *md = ip_tunnel_info_opts(tun_info);
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_VXLAN);
+	if (!nest)
+		return -ENOMEM;
+
+	if (!nla_put_u32(skb, LWTUNNEL_IP_OPT_VXLAN_GBP, md->gbp)) {
+		nla_nest_end(skb, nest);
+		return 0;
+	}
+
+	nla_nest_cancel(skb, nest);
+	return -ENOMEM;
+}
+
+/*
+ * Dump the ERSPAN metadata stored in @tun_info inside one
+ * LWTUNNEL_IP_OPTS_ERSPAN nest: always VER, then INDEX for version 1 or
+ * DIR and HWID for version 2.  Returns 0 on success or -ENOMEM, in which
+ * case the nest is cancelled.
+ */
+static int ip_tun_fill_encap_opts_erspan(struct sk_buff *skb,
+					 struct ip_tunnel_info *tun_info)
+{
+	struct erspan_metadata *md = ip_tunnel_info_opts(tun_info);
+	struct nlattr *nest;
+
+	nest = nla_nest_start_noflag(skb, LWTUNNEL_IP_OPTS_ERSPAN);
+	if (!nest)
+		return -ENOMEM;
+
+	if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_VER, md->version))
+		goto cancel;
+
+	switch (md->version) {
+	case 1:
+		if (nla_put_be32(skb, LWTUNNEL_IP_OPT_ERSPAN_INDEX,
+				 md->u.index))
+			goto cancel;
+		break;
+	case 2:
+		if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_DIR,
+			       md->u.md2.dir))
+			goto cancel;
+		if (nla_put_u8(skb, LWTUNNEL_IP_OPT_ERSPAN_HWID,
+			       get_hwid(&md->u.md2)))
+			goto cancel;
+		break;
+	default:
+		/* Any other version: only VER is reported */
+		break;
+	}
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+cancel:
+	nla_nest_cancel(skb, nest);
+	return -ENOMEM;
+}
+
+/*
+ * Dump the tunnel options of @tun_info, if any, inside a nest of attribute
+ * type @type.  The option kind is chosen from the tunnel flags, testing
+ * Geneve first, then VXLAN, then ERSPAN.
+ *
+ * Returns 0 when there is nothing to dump or on success, otherwise the error
+ * from the per-kind helper (the outer nest is cancelled), or -ENOMEM if the
+ * nest cannot be started.
+ */
+static int ip_tun_fill_encap_opts(struct sk_buff *skb, int type,
+				  struct ip_tunnel_info *tun_info)
+{
+	struct nlattr *opts;
+	int ret = 0;
+
+	if (!ip_tunnel_is_options_present(tun_info->key.tun_flags))
+		return 0;
+
+	opts = nla_nest_start_noflag(skb, type);
+	if (!opts)
+		return -ENOMEM;
+
+	if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_info->key.tun_flags)) {
+		ret = ip_tun_fill_encap_opts_geneve(skb, tun_info);
+	} else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT,
+			    tun_info->key.tun_flags)) {
+		ret = ip_tun_fill_encap_opts_vxlan(skb, tun_info);
+	} else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
+			    tun_info->key.tun_flags)) {
+		ret = ip_tun_fill_encap_opts_erspan(skb, tun_info);
+	}
+
+	if (!ret) {
+		nla_nest_end(skb, opts);
+		return 0;
+	}
+
+	nla_nest_cancel(skb, opts);
+	return ret;
+}
+
static int ip_tun_fill_encap_info(struct sk_buff *skb,
struct lwtunnel_state *lwtstate)
{
@@ -288,12 +861,53 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb,
nla_put_in_addr(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
- nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags))
+ nla_put_be16(skb, LWTUNNEL_IP_FLAGS,
+ ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) ||
+ ip_tun_fill_encap_opts(skb, LWTUNNEL_IP_OPTS, tun_info))
return -ENOMEM;
return 0;
}
+/*
+ * Netlink space needed to dump the tunnel options of @info: the outer options
+ * nest, one nest for the option kind, and the attributes inside it.
+ * Returns 0 when no options are present.
+ */
+static int ip_tun_opts_nlsize(struct ip_tunnel_info *info)
+{
+	int size;
+
+	if (!ip_tunnel_is_options_present(info->key.tun_flags))
+		return 0;
+
+	size = nla_total_size(0);		/* LWTUNNEL_IP_OPTS */
+
+	if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, info->key.tun_flags)) {
+		int offset = 0;
+
+		size += nla_total_size(0);	/* LWTUNNEL_IP_OPTS_GENEVE */
+		while (info->options_len > offset) {
+			struct geneve_opt *opt;
+
+			opt = ip_tunnel_info_opts(info) + offset;
+			size += nla_total_size(2);	/* OPT_GENEVE_CLASS */
+			size += nla_total_size(1);	/* OPT_GENEVE_TYPE */
+			size += nla_total_size(opt->length * 4);
+							/* OPT_GENEVE_DATA */
+			offset += sizeof(*opt) + opt->length * 4;
+		}
+		return size;
+	}
+
+	if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, info->key.tun_flags)) {
+		size += nla_total_size(0);	/* LWTUNNEL_IP_OPTS_VXLAN */
+		size += nla_total_size(4);	/* OPT_VXLAN_GBP */
+		return size;
+	}
+
+	if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, info->key.tun_flags)) {
+		struct erspan_metadata *md = ip_tunnel_info_opts(info);
+
+		size += nla_total_size(0);	/* LWTUNNEL_IP_OPTS_ERSPAN */
+		size += nla_total_size(1);	/* OPT_ERSPAN_VER */
+		if (md->version == 1)
+			size += nla_total_size(4);
+						/* OPT_ERSPAN_INDEX (v1) */
+		else
+			size += nla_total_size(1) + nla_total_size(1);
+						/* OPT_ERSPAN_DIR + HWID (v2) */
+	}
+
+	return size;
+}
+
static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
{
return nla_total_size_64bit(8) /* LWTUNNEL_IP_ID */
@@ -301,17 +915,26 @@ static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+ nla_total_size(4) /* LWTUNNEL_IP_SRC */
+ nla_total_size(1) /* LWTUNNEL_IP_TOS */
+ nla_total_size(1) /* LWTUNNEL_IP_TTL */
- + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */
+ + nla_total_size(2) /* LWTUNNEL_IP_FLAGS */
+ + ip_tun_opts_nlsize(lwt_tun_info(lwtstate));
+ /* LWTUNNEL_IP_OPTS */
}
static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
- return memcmp(lwt_tun_info(a), lwt_tun_info(b),
- sizeof(struct ip_tunnel_info));
+ struct ip_tunnel_info *info_a = lwt_tun_info(a);
+ struct ip_tunnel_info *info_b = lwt_tun_info(b);
+
+ return memcmp(info_a, info_b, sizeof(info_a->key)) ||
+ info_a->mode != info_b->mode ||
+ info_a->options_len != info_b->options_len ||
+ memcmp(ip_tunnel_info_opts(info_a),
+ ip_tunnel_info_opts(info_b), info_a->options_len);
}
static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
.build_state = ip_tun_build_state,
+ .destroy_state = ip_tun_destroy_state,
.fill_encap = ip_tun_fill_encap_info,
.get_encap_size = ip_tun_encap_nlsize,
.cmp_encap = ip_tun_cmp_encap,
@@ -319,30 +942,36 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
};
static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
+ [LWTUNNEL_IP6_UNSPEC] = { .strict_start_type = LWTUNNEL_IP6_OPTS },
[LWTUNNEL_IP6_ID] = { .type = NLA_U64 },
[LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) },
[LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) },
[LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 },
[LWTUNNEL_IP6_TC] = { .type = NLA_U8 },
[LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 },
+ [LWTUNNEL_IP6_OPTS] = { .type = NLA_NESTED },
};
-static int ip6_tun_build_state(struct nlattr *attr,
+static int ip6_tun_build_state(struct net *net, struct nlattr *attr,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
{
- struct ip_tunnel_info *tun_info;
- struct lwtunnel_state *new_state;
struct nlattr *tb[LWTUNNEL_IP6_MAX + 1];
- int err;
+ struct lwtunnel_state *new_state;
+ struct ip_tunnel_info *tun_info;
+ int err, opt_len;
- err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy,
- extack);
+ err = nla_parse_nested_deprecated(tb, LWTUNNEL_IP6_MAX, attr,
+ ip6_tun_policy, extack);
if (err < 0)
return err;
- new_state = lwtunnel_state_alloc(sizeof(*tun_info));
+ opt_len = ip_tun_get_optlen(tb[LWTUNNEL_IP6_OPTS], extack);
+ if (opt_len < 0)
+ return opt_len;
+
+ new_state = lwtunnel_state_alloc(sizeof(*tun_info) + opt_len);
if (!new_state)
return -ENOMEM;
@@ -350,6 +979,12 @@ static int ip6_tun_build_state(struct nlattr *attr,
tun_info = lwt_tun_info(new_state);
+ err = ip_tun_set_opts(tb[LWTUNNEL_IP6_OPTS], tun_info, extack);
+ if (err < 0) {
+ lwtstate_free(new_state);
+ return err;
+ }
+
if (tb[LWTUNNEL_IP6_ID])
tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]);
@@ -365,11 +1000,20 @@ static int ip6_tun_build_state(struct nlattr *attr,
if (tb[LWTUNNEL_IP6_TC])
tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);
- if (tb[LWTUNNEL_IP6_FLAGS])
- tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]);
+ if (tb[LWTUNNEL_IP6_FLAGS]) {
+ IP_TUNNEL_DECLARE_FLAGS(flags);
+ __be16 data;
+
+ data = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]);
+ ip_tunnel_flags_from_be16(flags, data);
+ ip_tunnel_clear_options_present(flags);
+
+ ip_tunnel_flags_or(tun_info->key.tun_flags,
+ tun_info->key.tun_flags, flags);
+ }
tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
- tun_info->options_len = 0;
+ tun_info->options_len = opt_len;
*ts = new_state;
@@ -387,7 +1031,9 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb,
nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) ||
nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) ||
- nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags))
+ nla_put_be16(skb, LWTUNNEL_IP6_FLAGS,
+ ip_tunnel_flags_to_be16(tun_info->key.tun_flags)) ||
+ ip_tun_fill_encap_opts(skb, LWTUNNEL_IP6_OPTS, tun_info))
return -ENOMEM;
return 0;
@@ -400,7 +1046,9 @@ static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate)
+ nla_total_size(16) /* LWTUNNEL_IP6_SRC */
+ nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */
+ nla_total_size(1) /* LWTUNNEL_IP6_TC */
- + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */
+ + nla_total_size(2) /* LWTUNNEL_IP6_FLAGS */
+ + ip_tun_opts_nlsize(lwt_tun_info(lwtstate));
+ /* LWTUNNEL_IP6_OPTS */
}
static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
@@ -437,3 +1085,92 @@ void ip_tunnel_unneed_metadata(void)
static_branch_dec(&ip_tunnel_metadata_cnt);
}
EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata);
+
+/*
+ * Guess skb->protocol from the IP version nibble at the network header.
+ * The header must lie inside the linear buffer (from skb->head up to the
+ * tail pointer) for the respective header size.  Returns htons(ETH_P_IP),
+ * htons(ETH_P_IPV6), or 0 if neither matches.
+ */
+__be16 ip_tunnel_parse_protocol(const struct sk_buff *skb)
+{
+	const unsigned char *nh = skb_network_header(skb);
+	const unsigned char *tail = skb_tail_pointer(skb);
+
+	if (nh < skb->head)
+		return 0;
+
+	if (nh + sizeof(struct iphdr) <= tail && ip_hdr(skb)->version == 4)
+		return htons(ETH_P_IP);
+
+	if (nh + sizeof(struct ipv6hdr) <= tail && ipv6_hdr(skb)->version == 6)
+		return htons(ETH_P_IPV6);
+
+	return 0;
+}
+EXPORT_SYMBOL(ip_tunnel_parse_protocol);
+
+/* Header ops for IP tunnel devices: only protocol parsing is provided. */
+const struct header_ops ip_tunnel_header_ops = {
+	.parse_protocol	= ip_tunnel_parse_protocol,
+};
+EXPORT_SYMBOL(ip_tunnel_header_ops);
+
+/*
+ * Fill @encap from the IFLA_IPTUN_ENCAP_* attributes in @data.  @encap is
+ * always zeroed first, so fields without an attribute stay 0.
+ * Returns true when at least one ENCAP attribute is present in the nl msg,
+ * false otherwise (including when @data is NULL).
+ */
+bool ip_tunnel_netlink_encap_parms(struct nlattr *data[],
+				   struct ip_tunnel_encap *encap)
+{
+	struct nlattr *type, *flags, *sport, *dport;
+
+	memset(encap, 0, sizeof(*encap));
+
+	if (!data)
+		return false;
+
+	type = data[IFLA_IPTUN_ENCAP_TYPE];
+	flags = data[IFLA_IPTUN_ENCAP_FLAGS];
+	sport = data[IFLA_IPTUN_ENCAP_SPORT];
+	dport = data[IFLA_IPTUN_ENCAP_DPORT];
+
+	if (type)
+		encap->type = nla_get_u16(type);
+	if (flags)
+		encap->flags = nla_get_u16(flags);
+	if (sport)
+		encap->sport = nla_get_be16(sport);
+	if (dport)
+		encap->dport = nla_get_be16(dport);
+
+	return type || flags || sport || dport;
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms);
+
+/*
+ * Copy the IFLA_IPTUN_* attributes present in @data into @parms; fields
+ * whose attribute is absent are left untouched.  The DF bit is set when a
+ * non-zero TTL is given, and also unless IFLA_IPTUN_PMTUDISC is present
+ * with value 0.  @data is dereferenced unconditionally, so it must not be
+ * NULL.
+ */
+void ip_tunnel_netlink_parms(struct nlattr *data[],
+			     struct ip_tunnel_parm_kern *parms)
+{
+	const struct nlattr *pmtudisc = data[IFLA_IPTUN_PMTUDISC];
+
+	if (data[IFLA_IPTUN_LINK])
+		parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
+
+	if (data[IFLA_IPTUN_LOCAL])
+		parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);
+
+	if (data[IFLA_IPTUN_REMOTE])
+		parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);
+
+	if (data[IFLA_IPTUN_TTL]) {
+		parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
+		if (parms->iph.ttl)
+			parms->iph.frag_off = htons(IP_DF);
+	}
+
+	if (data[IFLA_IPTUN_TOS])
+		parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
+
+	/* PMTU discovery is on unless explicitly switched off */
+	if (!pmtudisc || nla_get_u8(pmtudisc))
+		parms->iph.frag_off = htons(IP_DF);
+
+	if (data[IFLA_IPTUN_FLAGS])
+		ip_tunnel_flags_from_be16(parms->i_flags,
+					  nla_get_be16(data[IFLA_IPTUN_FLAGS]));
+
+	if (data[IFLA_IPTUN_PROTO])
+		parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+}
+EXPORT_SYMBOL_GPL(ip_tunnel_netlink_parms);
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index f38cb21d773d..95b6bb78fcd2 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -1,15 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux NET3: IP/IP protocol decoder modified to support
* virtual tunnel interface
*
* Authors:
* Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
/*
@@ -50,14 +45,17 @@ static unsigned int vti_net_id __read_mostly;
static int vti_tunnel_init(struct net_device *dev);
static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
- int encap_type)
+ int encap_type, bool update_skb_dev)
{
struct ip_tunnel *tunnel;
const struct iphdr *iph = ip_hdr(skb);
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+
+ __set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
- tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
iph->saddr, iph->daddr, 0);
if (tunnel) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
@@ -65,6 +63,9 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
+ if (update_skb_dev)
+ skb->dev = tunnel->dev;
+
return xfrm_input(skb, nexthdr, spi, encap_type);
}
@@ -74,21 +75,31 @@ drop:
return 0;
}
-static int vti_rcv(struct sk_buff *skb)
+/* vti_input() variant that never retargets skb->dev to the tunnel device. */
+static int vti_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi,
+			   int encap_type)
+{
+	const bool update_skb_dev = false;
+
+	return vti_input(skb, nexthdr, spi, encap_type, update_skb_dev);
+}
+
+static int vti_rcv(struct sk_buff *skb, __be32 spi, bool update_skb_dev)
{
XFRM_SPI_SKB_CB(skb)->family = AF_INET;
XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
- return vti_input(skb, ip_hdr(skb)->protocol, 0, 0);
+ return vti_input(skb, ip_hdr(skb)->protocol, spi, 0, update_skb_dev);
+}
+
+/* vti_rcv() with SPI 0 and without retargeting skb->dev. */
+static int vti_rcv_proto(struct sk_buff *skb)
+{
+	const bool update_skb_dev = false;
+
+	return vti_rcv(skb, 0, update_skb_dev);
}
static int vti_rcv_cb(struct sk_buff *skb, int err)
{
unsigned short family;
struct net_device *dev;
- struct pcpu_sw_netstats *tstats;
struct xfrm_state *x;
- struct xfrm_mode *inner_mode;
+ const struct xfrm_mode *inner_mode;
struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
u32 orig_mark = skb->mark;
int ret;
@@ -99,15 +110,15 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
dev = tunnel->dev;
if (err) {
- dev->stats.rx_errors++;
- dev->stats.rx_dropped++;
+ DEV_STATS_INC(dev, rx_errors);
+ DEV_STATS_INC(dev, rx_dropped);
return 0;
}
x = xfrm_input_state(skb);
- inner_mode = x->inner_mode;
+ inner_mode = &x->inner_mode;
if (x->sel.family == AF_UNSPEC) {
inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
@@ -118,7 +129,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
}
}
- family = inner_mode->afinfo->family;
+ family = inner_mode->family;
skb->mark = be32_to_cpu(tunnel->parms.i_key);
ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
@@ -129,13 +140,7 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev)));
skb->dev = dev;
-
- tstats = this_cpu_ptr(dev->tstats);
-
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
+ dev_sw_netstats_rx_add(dev, skb->len);
return 0;
}
@@ -165,7 +170,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
struct flowi *fl)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct ip_tunnel_parm *parms = &tunnel->parms;
+ struct ip_tunnel_parm_kern *parms = &tunnel->parms;
struct dst_entry *dst = skb_dst(skb);
struct net_device *tdev; /* Device to other host */
int pkt_len = skb->len;
@@ -173,51 +178,88 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
int mtu;
if (!dst) {
- dev->stats.tx_carrier_errors++;
- goto tx_error_icmp;
+ switch (skb->protocol) {
+ case htons(ETH_P_IP): {
+ struct rtable *rt;
+
+ fl->u.ip4.flowi4_oif = dev->ifindex;
+ fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
+ rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
+ if (IS_ERR(rt)) {
+ DEV_STATS_INC(dev, tx_carrier_errors);
+ goto tx_error_icmp;
+ }
+ dst = &rt->dst;
+ skb_dst_set(skb, dst);
+ break;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ case htons(ETH_P_IPV6):
+ fl->u.ip6.flowi6_oif = dev->ifindex;
+ fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
+ dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
+ if (dst->error) {
+ dst_release(dst);
+ dst = NULL;
+ DEV_STATS_INC(dev, tx_carrier_errors);
+ goto tx_error_icmp;
+ }
+ skb_dst_set(skb, dst);
+ break;
+#endif
+ default:
+ DEV_STATS_INC(dev, tx_carrier_errors);
+ goto tx_error_icmp;
+ }
}
dst_hold(dst);
- dst = xfrm_lookup(tunnel->net, dst, fl, NULL, 0);
+ dst = xfrm_lookup_route(tunnel->net, dst, fl, NULL, 0);
if (IS_ERR(dst)) {
- dev->stats.tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
goto tx_error_icmp;
}
+ if (dst->flags & DST_XFRM_QUEUE)
+ goto xmit;
+
if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) {
- dev->stats.tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
dst_release(dst);
goto tx_error_icmp;
}
- tdev = dst->dev;
+ tdev = dst_dev(dst);
if (tdev == dev) {
dst_release(dst);
- dev->stats.collisions++;
+ DEV_STATS_INC(dev, collisions);
goto tx_error;
}
mtu = dst_mtu(dst);
if (skb->len > mtu) {
- skb_dst_update_pmtu(skb, mtu);
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
if (skb->protocol == htons(ETH_P_IP)) {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
+ if (!(ip_hdr(skb)->frag_off & htons(IP_DF)))
+ goto xmit;
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
} else {
if (mtu < IPV6_MIN_MTU)
mtu = IPV6_MIN_MTU;
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
}
dst_release(dst);
goto tx_error;
}
+xmit:
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev)));
skb_dst_set(skb, dst);
- skb->dev = skb_dst(skb)->dev;
+ skb->dev = skb_dst_dev(skb);
err = dst_output(tunnel->net, skb->sk, skb);
if (net_xmit_eval(err) == 0)
@@ -228,7 +270,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
tx_error_icmp:
dst_link_failure(skb);
tx_error:
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -241,27 +283,33 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
struct ip_tunnel *tunnel = netdev_priv(dev);
struct flowi fl;
+ if (!pskb_inet_may_pull(skb))
+ goto tx_err;
+
memset(&fl, 0, sizeof(fl));
switch (skb->protocol) {
case htons(ETH_P_IP):
- xfrm_decode_session(skb, &fl, AF_INET);
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET);
break;
case htons(ETH_P_IPV6):
- xfrm_decode_session(skb, &fl, AF_INET6);
memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6);
break;
default:
- dev->stats.tx_errors++;
- dev_kfree_skb(skb);
- return NETDEV_TX_OK;
+ goto tx_err;
}
/* override mark with tunnel output key */
fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key);
return vti_xmit(skb, dev, &fl);
+
+tx_err:
+ DEV_STATS_INC(dev, tx_errors);
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
}
static int vti4_err(struct sk_buff *skb, u32 info)
@@ -277,8 +325,11 @@ static int vti4_err(struct sk_buff *skb, u32 info)
const struct iphdr *iph = (const struct iphdr *)skb->data;
int protocol = iph->protocol;
struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+
+ __set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
- tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags,
iph->daddr, iph->saddr, 0);
if (!tunnel)
return -1;
@@ -306,6 +357,7 @@ static int vti4_err(struct sk_buff *skb, u32 info)
case ICMP_DEST_UNREACH:
if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
return 0;
+ break;
case ICMP_REDIRECT:
break;
default:
@@ -318,47 +370,47 @@ static int vti4_err(struct sk_buff *skb, u32 info)
return 0;
if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
- ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
+ ipv4_update_pmtu(skb, net, info, 0, protocol);
else
- ipv4_redirect(skb, net, 0, 0, protocol, 0);
+ ipv4_redirect(skb, net, 0, protocol);
xfrm_state_put(x);
return 0;
}
static int
-vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
{
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
int err = 0;
- struct ip_tunnel_parm p;
-
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- return -EFAULT;
if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
- if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
- p.iph.ihl != 5)
+ if (p->iph.version != 4 || p->iph.protocol != IPPROTO_IPIP ||
+ p->iph.ihl != 5)
return -EINVAL;
}
- if (!(p.i_flags & GRE_KEY))
- p.i_key = 0;
- if (!(p.o_flags & GRE_KEY))
- p.o_key = 0;
+ if (!ip_tunnel_flags_is_be16_compat(p->i_flags) ||
+ !ip_tunnel_flags_is_be16_compat(p->o_flags))
+ return -EOVERFLOW;
+
+ if (!(ip_tunnel_flags_to_be16(p->i_flags) & GRE_KEY))
+ p->i_key = 0;
+ if (!(ip_tunnel_flags_to_be16(p->o_flags) & GRE_KEY))
+ p->o_key = 0;
- p.i_flags = VTI_ISVTI;
+ __set_bit(IP_TUNNEL_VTI_BIT, flags);
+ ip_tunnel_flags_copy(p->i_flags, flags);
- err = ip_tunnel_ioctl(dev, &p, cmd);
+ err = ip_tunnel_ctl(dev, p, cmd);
if (err)
return err;
if (cmd != SIOCDELTUNNEL) {
- p.i_flags |= GRE_KEY;
- p.o_flags |= GRE_KEY;
+ ip_tunnel_flags_from_be16(flags, GRE_KEY);
+ ip_tunnel_flags_or(p->i_flags, p->i_flags, flags);
+ ip_tunnel_flags_or(p->o_flags, p->o_flags, flags);
}
-
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
- return -EFAULT;
return 0;
}
@@ -366,15 +418,17 @@ static const struct net_device_ops vti_netdev_ops = {
.ndo_init = vti_tunnel_init,
.ndo_uninit = ip_tunnel_uninit,
.ndo_start_xmit = vti_tunnel_xmit,
- .ndo_do_ioctl = vti_tunnel_ioctl,
+ .ndo_siocdevprivate = ip_tunnel_siocdevprivate,
.ndo_change_mtu = ip_tunnel_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
+ .ndo_tunnel_ctl = vti_tunnel_ctl,
};
static void vti_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &vti_netdev_ops;
+ dev->header_ops = &ip_tunnel_header_ops;
dev->type = ARPHRD_TUNNEL;
ip_tunnel_setup(dev, vti_net_id);
}
@@ -384,12 +438,12 @@ static int vti_tunnel_init(struct net_device *dev)
struct ip_tunnel *tunnel = netdev_priv(dev);
struct iphdr *iph = &tunnel->parms.iph;
- memcpy(dev->dev_addr, &iph->saddr, 4);
+ __dev_addr_set(dev, &iph->saddr, 4);
memcpy(dev->broadcast, &iph->daddr, 4);
dev->flags = IFF_NOARP;
dev->addr_len = 4;
- dev->features |= NETIF_F_LLTX;
+ dev->lltx = true;
netif_keep_dst(dev);
return ip_tunnel_init(dev);
@@ -406,29 +460,55 @@ static void __net_init vti_fb_tunnel_init(struct net_device *dev)
}
static struct xfrm4_protocol vti_esp4_protocol __read_mostly = {
- .handler = vti_rcv,
- .input_handler = vti_input,
+ .handler = vti_rcv_proto,
+ .input_handler = vti_input_proto,
.cb_handler = vti_rcv_cb,
.err_handler = vti4_err,
.priority = 100,
};
static struct xfrm4_protocol vti_ah4_protocol __read_mostly = {
- .handler = vti_rcv,
- .input_handler = vti_input,
+ .handler = vti_rcv_proto,
+ .input_handler = vti_input_proto,
.cb_handler = vti_rcv_cb,
.err_handler = vti4_err,
.priority = 100,
};
static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = {
- .handler = vti_rcv,
- .input_handler = vti_input,
+ .handler = vti_rcv_proto,
+ .input_handler = vti_input_proto,
.cb_handler = vti_rcv_cb,
.err_handler = vti4_err,
.priority = 100,
};
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+static int vti_rcv_tunnel(struct sk_buff *skb)
+{
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr);
+
+ return vti_input(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr, 0, false);
+}
+
+static struct xfrm_tunnel vti_ipip_handler __read_mostly = {
+ .handler = vti_rcv_tunnel,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 0,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static struct xfrm_tunnel vti_ipip6_handler __read_mostly = {
+ .handler = vti_rcv_tunnel,
+ .cb_handler = vti_rcv_cb,
+ .err_handler = vti4_err,
+ .priority = 0,
+};
+#endif
+#endif
+
static int __net_init vti_init_net(struct net *net)
{
int err;
@@ -443,14 +523,15 @@ static int __net_init vti_init_net(struct net *net)
return 0;
}
-static void __net_exit vti_exit_batch_net(struct list_head *list_net)
+static void __net_exit vti_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops);
+ ip_tunnel_delete_net(net, vti_net_id, &vti_link_ops, dev_to_kill);
}
static struct pernet_operations vti_net_ops = {
.init = vti_init_net,
- .exit_batch = vti_exit_batch_net,
+ .exit_rtnl = vti_exit_rtnl,
.id = &vti_net_id,
.size = sizeof(struct ip_tunnel_net),
};
@@ -462,7 +543,7 @@ static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
}
static void vti_netlink_parms(struct nlattr *data[],
- struct ip_tunnel_parm *parms,
+ struct ip_tunnel_parm_kern *parms,
__u32 *fwmark)
{
memset(parms, 0, sizeof(*parms));
@@ -472,7 +553,7 @@ static void vti_netlink_parms(struct nlattr *data[],
if (!data)
return;
- parms->i_flags = VTI_ISVTI;
+ __set_bit(IP_TUNNEL_VTI_BIT, parms->i_flags);
if (data[IFLA_VTI_LINK])
parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
@@ -493,15 +574,18 @@ static void vti_netlink_parms(struct nlattr *data[],
*fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]);
}
-static int vti_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
+static int vti_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
- struct ip_tunnel_parm parms;
+ struct nlattr **data = params->data;
+ struct ip_tunnel_parm_kern parms;
+ struct nlattr **tb = params->tb;
__u32 fwmark = 0;
vti_netlink_parms(data, &parms, &fwmark);
- return ip_tunnel_newlink(dev, tb, &parms, fwmark);
+ return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb,
+ &parms, fwmark);
}
static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
@@ -509,8 +593,8 @@ static int vti_changelink(struct net_device *dev, struct nlattr *tb[],
struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm_kern p;
__u32 fwmark = t->fwmark;
- struct ip_tunnel_parm p;
vti_netlink_parms(data, &p, &fwmark);
return ip_tunnel_changelink(dev, tb, &p, fwmark);
@@ -537,7 +621,7 @@ static size_t vti_get_size(const struct net_device *dev)
static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct ip_tunnel_parm *p = &t->parms;
+ struct ip_tunnel_parm_kern *p = &t->parms;
if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) ||
nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) ||
@@ -554,8 +638,8 @@ static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = {
[IFLA_VTI_LINK] = { .type = NLA_U32 },
[IFLA_VTI_IKEY] = { .type = NLA_U32 },
[IFLA_VTI_OKEY] = { .type = NLA_U32 },
- [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
- [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+ [IFLA_VTI_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) },
+ [IFLA_VTI_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) },
[IFLA_VTI_FWMARK] = { .type = NLA_U32 },
};
@@ -597,6 +681,18 @@ static int __init vti_init(void)
if (err < 0)
goto xfrm_proto_comp_failed;
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+ msg = "ipip tunnel";
+ err = xfrm4_tunnel_register(&vti_ipip_handler, AF_INET);
+ if (err < 0)
+ goto xfrm_tunnel_ipip_failed;
+#if IS_ENABLED(CONFIG_IPV6)
+ err = xfrm4_tunnel_register(&vti_ipip6_handler, AF_INET6);
+ if (err < 0)
+ goto xfrm_tunnel_ipip6_failed;
+#endif
+#endif
+
msg = "netlink interface";
err = rtnl_link_register(&vti_link_ops);
if (err < 0)
@@ -605,6 +701,14 @@ static int __init vti_init(void)
return err;
rtnl_link_failed:
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+#if IS_ENABLED(CONFIG_IPV6)
+ xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6);
+xfrm_tunnel_ipip6_failed:
+#endif
+ xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET);
+xfrm_tunnel_ipip_failed:
+#endif
xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
xfrm_proto_comp_failed:
xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
@@ -620,6 +724,12 @@ pernet_dev_failed:
static void __exit vti_fini(void)
{
rtnl_link_unregister(&vti_link_ops);
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+#if IS_ENABLED(CONFIG_IPV6)
+ xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6);
+#endif
+ xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET);
+#endif
xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP);
xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH);
xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP);
@@ -628,6 +738,7 @@ static void __exit vti_fini(void)
module_init(vti_init);
module_exit(vti_fini);
+MODULE_DESCRIPTION("Virtual (secure) IP tunneling library");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("vti");
MODULE_ALIAS_NETDEV("ip_vti0");
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index d97f4f2787f5..9a45aed508d1 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IP Payload Compression Protocol (IPComp) - RFC3173.
*
* Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- *
* Todo:
* - Tunable compression parameters.
* - Compression stats.
@@ -35,6 +31,7 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info)
case ICMP_DEST_UNREACH:
if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
return 0;
+ break;
case ICMP_REDIRECT:
break;
default:
@@ -48,15 +45,16 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info)
return 0;
if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
- ipv4_update_pmtu(skb, net, info, 0, 0, IPPROTO_COMP, 0);
+ ipv4_update_pmtu(skb, net, info, 0, IPPROTO_COMP);
else
- ipv4_redirect(skb, net, 0, 0, IPPROTO_COMP, 0);
+ ipv4_redirect(skb, net, 0, IPPROTO_COMP);
xfrm_state_put(x);
return 0;
}
/* We always hold one tunnel user reference to indicate a tunnel */
+static struct lock_class_key xfrm_state_lock_key;
static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
{
struct net *net = xs_net(x);
@@ -65,6 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
t = xfrm_state_alloc(net);
if (!t)
goto out;
+ lockdep_set_class(&t->lock, &xfrm_state_lock_key);
t->id.proto = IPPROTO_IPIP;
t->id.spi = x->props.saddr.a4;
@@ -76,6 +75,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
t->props.flags = x->props.flags;
t->props.extra_flags = x->props.extra_flags;
memcpy(&t->mark, &x->mark, sizeof(t->mark));
+ t->if_id = x->if_id;
if (xfrm_init_state(t))
goto error;
@@ -119,7 +119,8 @@ out:
return err;
}
-static int ipcomp4_init_state(struct xfrm_state *x)
+static int ipcomp4_init_state(struct xfrm_state *x,
+ struct netlink_ext_ack *extack)
{
int err = -EINVAL;
@@ -131,17 +132,20 @@ static int ipcomp4_init_state(struct xfrm_state *x)
x->props.header_len += sizeof(struct iphdr);
break;
default:
+ NL_SET_ERR_MSG(extack, "Unsupported XFRM mode for IPcomp");
goto out;
}
- err = ipcomp_init_state(x);
+ err = ipcomp_init_state(x, extack);
if (err)
goto out;
if (x->props.mode == XFRM_MODE_TUNNEL) {
err = ipcomp_tunnel_attach(x);
- if (err)
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Kernel error: failed to initialize the associated state");
goto out;
+ }
}
err = 0;
@@ -155,7 +159,6 @@ static int ipcomp4_rcv_cb(struct sk_buff *skb, int err)
}
static const struct xfrm_type ipcomp_type = {
- .description = "IPCOMP4",
.owner = THIS_MODULE,
.proto = IPPROTO_COMP,
.init_state = ipcomp4_init_state,
@@ -190,8 +193,7 @@ static void __exit ipcomp4_fini(void)
{
if (xfrm4_protocol_deregister(&ipcomp4_protocol, IPPROTO_COMP) < 0)
pr_info("%s: can't remove protocol\n", __func__);
- if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
+ xfrm_unregister_type(&ipcomp_type, AF_INET);
}
module_init(ipcomp4_init);
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 88212615bf4c..019408d3ca2c 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -61,7 +61,6 @@
#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/arp.h>
-#include <net/dsa.h>
#include <net/ip.h>
#include <net/ipconfig.h>
#include <net/route.h>
@@ -85,7 +84,6 @@
/* Define the friendly delay before and after opening net devices */
#define CONF_POST_OPEN 10 /* After opening: 10 msecs */
-#define CONF_CARRIER_TIMEOUT 120000 /* Wait for carrier timeout */
/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */
@@ -101,6 +99,9 @@
#define NONE cpu_to_be32(INADDR_NONE)
#define ANY cpu_to_be32(INADDR_ANY)
+/* Wait for carrier timeout default in seconds */
+static unsigned int carrier_timeout = 120;
+
/*
* Public IP configuration
*/
@@ -216,11 +217,11 @@ static int __init ic_open_devs(void)
last = &ic_first_dev;
rtnl_lock();
- /* bring loopback and DSA master network devices up first */
+ /* bring loopback device up first */
for_each_netdev(&init_net, dev) {
- if (!(dev->flags & IFF_LOOPBACK) && !netdev_uses_dsa(dev))
+ if (!(dev->flags & IFF_LOOPBACK))
continue;
- if (dev_change_flags(dev, dev->flags | IFF_UP) < 0)
+ if (dev_change_flags(dev, dev->flags | IFF_UP, NULL) < 0)
pr_err("IP-Config: Failed to open %s\n", dev->name);
}
@@ -238,7 +239,7 @@ static int __init ic_open_devs(void)
if (ic_proto_enabled && !able)
continue;
oflags = dev->flags;
- if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
+ if (dev_change_flags(dev, oflags | IFF_UP, NULL) < 0) {
pr_err("IP-Config: Failed to open %s\n",
dev->name);
continue;
@@ -261,6 +262,11 @@ static int __init ic_open_devs(void)
dev->name, able, d->xid);
}
}
+ /* Devices with a complex topology like SFP ethernet interfaces needs
+ * the rtnl_lock at init. The carrier wait-loop must therefore run
+ * without holding it.
+ */
+ rtnl_unlock();
/* no point in waiting if we could not bring up at least one device */
if (!ic_first_dev)
@@ -268,14 +274,18 @@ static int __init ic_open_devs(void)
/* wait for a carrier on at least one device */
start = jiffies;
- next_msg = start + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
+ next_msg = start + secs_to_jiffies(20);
while (time_before(jiffies, start +
- msecs_to_jiffies(CONF_CARRIER_TIMEOUT))) {
+ secs_to_jiffies(carrier_timeout))) {
int wait, elapsed;
+ rtnl_lock();
for_each_netdev(&init_net, dev)
- if (ic_is_init_dev(dev) && netif_carrier_ok(dev))
+ if (ic_is_init_dev(dev) && netif_carrier_ok(dev)) {
+ rtnl_unlock();
goto have_carrier;
+ }
+ rtnl_unlock();
msleep(1);
@@ -283,12 +293,11 @@ static int __init ic_open_devs(void)
continue;
elapsed = jiffies_to_msecs(jiffies - start);
- wait = (CONF_CARRIER_TIMEOUT - elapsed + 500)/1000;
+ wait = (carrier_timeout * 1000 - elapsed + 500) / 1000;
pr_info("Waiting up to %d more seconds for network.\n", wait);
- next_msg = jiffies + msecs_to_jiffies(CONF_CARRIER_TIMEOUT/12);
+ next_msg = jiffies + secs_to_jiffies(20);
}
have_carrier:
- rtnl_unlock();
*last = NULL;
@@ -303,19 +312,36 @@ have_carrier:
return 0;
}
+/* Close all network interfaces except the one we've autoconfigured, and its
+ * lowers, in case it's a stacked virtual interface.
+ */
static void __init ic_close_devs(void)
{
+ struct net_device *selected_dev = ic_dev ? ic_dev->dev : NULL;
struct ic_device *d, *next;
struct net_device *dev;
rtnl_lock();
next = ic_first_dev;
while ((d = next)) {
+ bool bring_down = (d != ic_dev);
+ struct net_device *lower;
+ struct list_head *iter;
+
next = d->next;
dev = d->dev;
- if (d != ic_dev && !netdev_uses_dsa(dev)) {
+
+ if (selected_dev) {
+ netdev_for_each_lower_dev(selected_dev, lower, iter) {
+ if (dev == lower) {
+ bring_down = false;
+ break;
+ }
+ }
+ }
+ if (bring_down) {
pr_debug("IP-Config: Downing %s\n", dev->name);
- dev_change_flags(dev, d->flags);
+ dev_change_flags(dev, d->flags, NULL);
}
kfree(d);
}
@@ -429,6 +455,8 @@ static int __init ic_defaults(void)
ic_netmask = htonl(IN_CLASSB_NET);
else if (IN_CLASSC(ntohl(ic_myaddr)))
ic_netmask = htonl(IN_CLASSC_NET);
+ else if (IN_CLASSE(ntohl(ic_myaddr)))
+ ic_netmask = htonl(IN_CLASSE_NET);
else {
pr_err("IP-Config: Unable to guess netmask for address %pI4\n",
&ic_myaddr);
@@ -637,6 +665,9 @@ static struct packet_type bootp_packet_type __initdata = {
.func = ic_bootp_recv,
};
+/* DHCPACK can overwrite DNS if fallback was set upon first BOOTP reply */
+static int ic_nameservers_fallback __initdata;
+
/*
* Initialize DHCP/BOOTP extension fields in the request.
*/
@@ -866,7 +897,7 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
/*
- * Copy BOOTP-supplied string if not already set.
+ * Copy BOOTP-supplied string
*/
static int __init ic_bootp_string(char *dest, char *src, int len, int max)
{
@@ -910,17 +941,21 @@ static void __init ic_do_bootp_ext(u8 *ext)
if (servers > CONF_NAMESERVERS_MAX)
servers = CONF_NAMESERVERS_MAX;
for (i = 0; i < servers; i++) {
- if (ic_nameservers[i] == NONE)
+ if (ic_nameservers[i] == NONE ||
+ ic_nameservers_fallback)
memcpy(&ic_nameservers[i], ext+1+4*i, 4);
}
break;
case 12: /* Host name */
- ic_bootp_string(utsname()->nodename, ext+1, *ext,
- __NEW_UTS_LEN);
- ic_host_name_set = 1;
+ if (!ic_host_name_set) {
+ ic_bootp_string(utsname()->nodename, ext+1, *ext,
+ __NEW_UTS_LEN);
+ ic_host_name_set = 1;
+ }
break;
case 15: /* Domain name (DNS) */
- ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain));
+ if (!ic_domain[0])
+ ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain));
break;
case 17: /* Root path */
if (!root_server_path[0])
@@ -1127,8 +1162,10 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
ic_addrservaddr = b->iph.saddr;
if (ic_gateway == NONE && b->relay_ip)
ic_gateway = b->relay_ip;
- if (ic_nameservers[0] == NONE)
+ if (ic_nameservers[0] == NONE) {
ic_nameservers[0] = ic_servaddr;
+ ic_nameservers_fallback = 1;
+ }
ic_got_reply = IC_BOOTP;
drop_unlock:
@@ -1330,7 +1367,7 @@ static int __init ipconfig_proc_net_init(void)
/* Create a new file under /proc/net/ipconfig */
static int ipconfig_proc_net_create(const char *name,
- const struct file_operations *fops)
+ const struct proc_ops *proc_ops)
{
char *pname;
struct proc_dir_entry *p;
@@ -1342,7 +1379,7 @@ static int ipconfig_proc_net_create(const char *name,
if (!pname)
return -ENOMEM;
- p = proc_create(pname, 0444, init_net.proc_net, fops);
+ p = proc_create(pname, 0444, init_net.proc_net, proc_ops);
kfree(pname);
if (!p)
return -ENOMEM;
@@ -1351,7 +1388,7 @@ static int ipconfig_proc_net_create(const char *name,
}
/* Write NTP server IP addresses to /proc/net/ipconfig/ntp_servers */
-static int ntp_servers_seq_show(struct seq_file *seq, void *v)
+static int ntp_servers_show(struct seq_file *seq, void *v)
{
int i;
@@ -1361,18 +1398,7 @@ static int ntp_servers_seq_show(struct seq_file *seq, void *v)
}
return 0;
}
-
-static int ntp_servers_seq_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ntp_servers_seq_show, NULL);
-}
-
-static const struct file_operations ntp_servers_seq_fops = {
- .open = ntp_servers_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_PROC_SHOW_ATTRIBUTE(ntp_servers);
#endif /* CONFIG_PROC_FS */
/*
@@ -1414,11 +1440,15 @@ __be32 __init root_nfs_parse_addr(char *name)
static int __init wait_for_devices(void)
{
int i;
+ bool try_init_devs = true;
for (i = 0; i < DEVICE_WAIT_MAX; i++) {
struct net_device *dev;
int found = 0;
+ /* make sure deferred device probes are finished */
+ wait_for_device_probe();
+
rtnl_lock();
for_each_netdev(&init_net, dev) {
if (ic_is_init_dev(dev)) {
@@ -1429,6 +1459,11 @@ static int __init wait_for_devices(void)
rtnl_unlock();
if (found)
return 0;
+ if (try_init_devs &&
+ (ROOT_DEV == Root_NFS || ROOT_DEV == Root_CIFS)) {
+ try_init_devs = false;
+ wait_for_init_devices_probe();
+ }
ssleep(1);
}
return -ENODEV;
@@ -1445,7 +1480,7 @@ static int __init ip_auto_config(void)
int retries = CONF_OPEN_RETRIES;
#endif
int err;
- unsigned int i;
+ unsigned int i, count;
/* Initialise all name servers and NTP servers to NONE (but only if the
* "ip=" or "nfsaddrs=" kernel command line parameters weren't decoded,
@@ -1460,7 +1495,7 @@ static int __init ip_auto_config(void)
proc_create_single("pnp", 0444, init_net.proc_net, pnp_seq_show);
if (ipconfig_proc_net_init() == 0)
- ipconfig_proc_net_create("ntp_servers", &ntp_servers_seq_fops);
+ ipconfig_proc_net_create("ntp_servers", &ntp_servers_proc_ops);
#endif /* CONFIG_PROC_FS */
if (!ic_enable)
@@ -1490,10 +1525,10 @@ static int __init ip_auto_config(void)
* missing values.
*/
if (ic_myaddr == NONE ||
-#ifdef CONFIG_ROOT_NFS
+#if defined(CONFIG_ROOT_NFS) || defined(CONFIG_CIFS_ROOT)
(root_server_addr == NONE &&
ic_servaddr == NONE &&
- ROOT_DEV == Root_NFS) ||
+ (ROOT_DEV == Root_NFS || ROOT_DEV == Root_CIFS)) ||
#endif
ic_first_dev->next) {
#ifdef IPCONFIG_DYNAMIC
@@ -1520,6 +1555,12 @@ static int __init ip_auto_config(void)
goto try_try_again;
}
#endif
+#ifdef CONFIG_CIFS_ROOT
+ if (ROOT_DEV == Root_CIFS) {
+ pr_err("IP-Config: Retrying forever (CIFS root)...\n");
+ goto try_try_again;
+ }
+#endif
if (--retries) {
pr_err("IP-Config: Reopening network devices...\n");
@@ -1573,7 +1614,7 @@ static int __init ip_auto_config(void)
if (ic_dev_mtu)
pr_cont(", mtu=%d", ic_dev_mtu);
/* Name servers (if any): */
- for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
+ for (i = 0, count = 0; i < CONF_NAMESERVERS_MAX; i++) {
if (ic_nameservers[i] != NONE) {
if (i == 0)
pr_info(" nameserver%u=%pI4",
@@ -1581,12 +1622,14 @@ static int __init ip_auto_config(void)
else
pr_cont(", nameserver%u=%pI4",
i, &ic_nameservers[i]);
+
+ count++;
}
- if (i + 1 == CONF_NAMESERVERS_MAX)
+ if ((i + 1 == CONF_NAMESERVERS_MAX) && count > 0)
pr_cont("\n");
}
/* NTP servers (if any): */
- for (i = 0; i < CONF_NTP_SERVERS_MAX; i++) {
+ for (i = 0, count = 0; i < CONF_NTP_SERVERS_MAX; i++) {
if (ic_ntp_servers[i] != NONE) {
if (i == 0)
pr_info(" ntpserver%u=%pI4",
@@ -1594,8 +1637,10 @@ static int __init ip_auto_config(void)
else
pr_cont(", ntpserver%u=%pI4",
i, &ic_ntp_servers[i]);
+
+ count++;
}
- if (i + 1 == CONF_NTP_SERVERS_MAX)
+ if ((i + 1 == CONF_NTP_SERVERS_MAX) && count > 0)
pr_cont("\n");
}
#endif /* !SILENT */
@@ -1619,7 +1664,7 @@ late_initcall(ip_auto_config);
/*
* Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
- * command line parameter. See Documentation/filesystems/nfs/nfsroot.txt.
+ * command line parameter. See Documentation/admin-guide/nfs/nfsroot.rst.
*/
static int __init ic_proto_name(char *name)
{
@@ -1645,7 +1690,8 @@ static int __init ic_proto_name(char *name)
*v = 0;
if (kstrtou8(client_id, 0, dhcp_client_identifier))
pr_debug("DHCP: Invalid client identifier type\n");
- strncpy(dhcp_client_identifier + 1, v + 1, 251);
+ strscpy(dhcp_client_identifier + 1, v + 1,
+ sizeof(dhcp_client_identifier) - 1);
*v = ',';
}
return 1;
@@ -1726,15 +1772,15 @@ static int __init ip_auto_config_setup(char *addrs)
case 4:
if ((dp = strchr(ip, '.'))) {
*dp++ = '\0';
- strlcpy(utsname()->domainname, dp,
+ strscpy(utsname()->domainname, dp,
sizeof(utsname()->domainname));
}
- strlcpy(utsname()->nodename, ip,
+ strscpy(utsname()->nodename, ip,
sizeof(utsname()->nodename));
ic_host_name_set = 1;
break;
case 5:
- strlcpy(user_dev_name, ip, sizeof(user_dev_name));
+ strscpy(user_dev_name, ip, sizeof(user_dev_name));
break;
case 6:
if (ic_proto_name(ip) == 0 &&
@@ -1781,7 +1827,7 @@ __setup("nfsaddrs=", nfsaddrs_config_setup);
static int __init vendor_class_identifier_setup(char *addrs)
{
- if (strlcpy(vendor_class_identifier, addrs,
+ if (strscpy(vendor_class_identifier, addrs,
sizeof(vendor_class_identifier))
>= sizeof(vendor_class_identifier))
pr_warn("DHCP: vendorclass too long, truncated to \"%s\"\n",
@@ -1789,3 +1835,18 @@ static int __init vendor_class_identifier_setup(char *addrs)
return 1;
}
__setup("dhcpclass=", vendor_class_identifier_setup);
+
+static int __init set_carrier_timeout(char *str)
+{
+ ssize_t ret;
+
+ if (!str)
+ return 0;
+
+ ret = kstrtouint(str, 0, &carrier_timeout);
+ if (ret)
+ return 0;
+
+ return 1;
+}
+__setup("carrier_timeout=", set_carrier_timeout);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index c891235b4966..ff95b1b9908e 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux NET3: IP/IP protocol decoder.
*
@@ -16,12 +17,6 @@
* Carlos Picoto : GRE over IP support
* Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
* I do not want to merge them together.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
/* tunnel.c: an IP tunnel driver
@@ -135,11 +130,21 @@ static int ipip_err(struct sk_buff *skb, u32 info)
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
const struct iphdr *iph = (const struct iphdr *)skb->data;
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct ip_tunnel *t;
int err = 0;
+ __set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
+
+ t = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->daddr,
+ iph->saddr, 0);
+ if (!t) {
+ err = -ENOENT;
+ goto out;
+ }
+
switch (type) {
case ICMP_DEST_UNREACH:
switch (code) {
@@ -167,21 +172,13 @@ static int ipip_err(struct sk_buff *skb, u32 info)
goto out;
}
- t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
- iph->daddr, iph->saddr, 0);
- if (!t) {
- err = -ENOENT;
- goto out;
- }
-
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
- ipv4_update_pmtu(skb, net, info, t->parms.link, 0,
- iph->protocol, 0);
+ ipv4_update_pmtu(skb, net, info, t->parms.link, iph->protocol);
goto out;
}
if (type == ICMP_REDIRECT) {
- ipv4_redirect(skb, net, t->parms.link, 0, iph->protocol, 0);
+ ipv4_redirect(skb, net, t->parms.link, iph->protocol);
goto out;
}
@@ -219,13 +216,16 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
{
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
struct metadata_dst *tun_dst = NULL;
struct ip_tunnel *tunnel;
const struct iphdr *iph;
+ __set_bit(IP_TUNNEL_NO_KEY_BIT, flags);
+
iph = ip_hdr(skb);
- tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
- iph->saddr, iph->daddr, 0);
+ tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, flags, iph->saddr,
+ iph->daddr, 0);
if (tunnel) {
const struct tnl_ptk_info *tpi;
@@ -244,10 +244,15 @@ static int ipip_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
if (iptunnel_pull_header(skb, 0, tpi->proto, false))
goto drop;
if (tunnel->collect_md) {
- tun_dst = ip_tun_rx_dst(skb, 0, 0, 0);
+ ip_tunnel_flags_zero(flags);
+
+ tun_dst = ip_tun_rx_dst(skb, flags, 0, 0);
if (!tun_dst)
return 0;
+ ip_tunnel_md_udp_encap(skb, &tun_dst->u.tun_info);
}
+ skb_reset_mac_header(skb);
+
return ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
}
@@ -281,6 +286,9 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
const struct iphdr *tiph = &tunnel->parms.iph;
u8 ipproto;
+ if (!pskb_inet_may_pull(skb))
+ goto tx_error;
+
switch (skb->protocol) {
case htons(ETH_P_IP):
ipproto = IPPROTO_IPIP;
@@ -303,7 +311,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
skb_set_inner_ipproto(skb, ipproto);
if (tunnel->collect_md)
- ip_md_tunnel_xmit(skb, dev, ipproto);
+ ip_md_tunnel_xmit(skb, dev, ipproto, 0);
else
ip_tunnel_xmit(skb, dev, tiph, ipproto);
return NETDEV_TX_OK;
@@ -311,7 +319,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb,
tx_error:
kfree_skb(skb);
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
return NETDEV_TX_OK;
}
@@ -330,29 +338,41 @@ static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto)
}
static int
-ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd)
{
- int err = 0;
- struct ip_tunnel_parm p;
-
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- return -EFAULT;
-
if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) {
- if (p.iph.version != 4 ||
- !ipip_tunnel_ioctl_verify_protocol(p.iph.protocol) ||
- p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
+ if (p->iph.version != 4 ||
+ !ipip_tunnel_ioctl_verify_protocol(p->iph.protocol) ||
+ p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)))
return -EINVAL;
}
- p.i_key = p.o_key = 0;
- p.i_flags = p.o_flags = 0;
- err = ip_tunnel_ioctl(dev, &p, cmd);
- if (err)
- return err;
+ p->i_key = p->o_key = 0;
+ ip_tunnel_flags_zero(p->i_flags);
+ ip_tunnel_flags_zero(p->o_flags);
+ return ip_tunnel_ctl(dev, p, cmd);
+}
+
+static int ipip_fill_forward_path(struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct ip_tunnel *tunnel = netdev_priv(ctx->dev);
+ const struct iphdr *tiph = &tunnel->parms.iph;
+ struct rtable *rt;
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
- return -EFAULT;
+ rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0,
+ RT_SCOPE_UNIVERSE);
+ if (IS_ERR(rt))
+ return PTR_ERR(rt);
+
+ path->type = DEV_PATH_TUN;
+ path->tun.src_v4.s_addr = tiph->saddr;
+ path->tun.dst_v4.s_addr = tiph->daddr;
+ path->tun.l3_proto = IPPROTO_IPIP;
+ path->dev = ctx->dev;
+
+ ctx->dev = rt->dst.dev;
+ ip_rt_put(rt);
return 0;
}
@@ -361,10 +381,12 @@ static const struct net_device_ops ipip_netdev_ops = {
.ndo_init = ipip_tunnel_init,
.ndo_uninit = ip_tunnel_uninit,
.ndo_start_xmit = ipip_tunnel_xmit,
- .ndo_do_ioctl = ipip_tunnel_ioctl,
+ .ndo_siocdevprivate = ip_tunnel_siocdevprivate,
.ndo_change_mtu = ip_tunnel_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip_tunnel_get_iflink,
+ .ndo_tunnel_ctl = ipip_tunnel_ctl,
+ .ndo_fill_forward_path = ipip_fill_forward_path,
};
#define IPIP_FEATURES (NETIF_F_SG | \
@@ -376,11 +398,12 @@ static const struct net_device_ops ipip_netdev_ops = {
static void ipip_tunnel_setup(struct net_device *dev)
{
dev->netdev_ops = &ipip_netdev_ops;
+ dev->header_ops = &ip_tunnel_header_ops;
dev->type = ARPHRD_TUNNEL;
dev->flags = IFF_NOARP;
dev->addr_len = 4;
- dev->features |= NETIF_F_LLTX;
+ dev->lltx = true;
netif_keep_dst(dev);
dev->features |= IPIP_FEATURES;
@@ -392,7 +415,7 @@ static int ipip_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
+ __dev_addr_set(dev, &tunnel->parms.iph.saddr, 4);
memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
tunnel->tun_hlen = 0;
@@ -416,8 +439,8 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[],
}
static void ipip_netlink_parms(struct nlattr *data[],
- struct ip_tunnel_parm *parms, bool *collect_md,
- __u32 *fwmark)
+ struct ip_tunnel_parm_kern *parms,
+ bool *collect_md, __u32 *fwmark)
{
memset(parms, 0, sizeof(*parms));
@@ -429,29 +452,7 @@ static void ipip_netlink_parms(struct nlattr *data[],
if (!data)
return;
- if (data[IFLA_IPTUN_LINK])
- parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
-
- if (data[IFLA_IPTUN_LOCAL])
- parms->iph.saddr = nla_get_in_addr(data[IFLA_IPTUN_LOCAL]);
-
- if (data[IFLA_IPTUN_REMOTE])
- parms->iph.daddr = nla_get_in_addr(data[IFLA_IPTUN_REMOTE]);
-
- if (data[IFLA_IPTUN_TTL]) {
- parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
- if (parms->iph.ttl)
- parms->iph.frag_off = htons(IP_DF);
- }
-
- if (data[IFLA_IPTUN_TOS])
- parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
-
- if (data[IFLA_IPTUN_PROTO])
- parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
-
- if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
- parms->iph.frag_off = htons(IP_DF);
+ ip_tunnel_netlink_parms(data, parms);
if (data[IFLA_IPTUN_COLLECT_METADATA])
*collect_md = true;
@@ -460,50 +461,18 @@ static void ipip_netlink_parms(struct nlattr *data[],
*fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
}
-/* This function returns true when ENCAP attributes are present in the nl msg */
-static bool ipip_netlink_encap_parms(struct nlattr *data[],
- struct ip_tunnel_encap *ipencap)
-{
- bool ret = false;
-
- memset(ipencap, 0, sizeof(*ipencap));
-
- if (!data)
- return ret;
-
- if (data[IFLA_IPTUN_ENCAP_TYPE]) {
- ret = true;
- ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
- }
-
- if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
- ret = true;
- ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
- }
-
- if (data[IFLA_IPTUN_ENCAP_SPORT]) {
- ret = true;
- ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
- }
-
- if (data[IFLA_IPTUN_ENCAP_DPORT]) {
- ret = true;
- ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
- }
-
- return ret;
-}
-
-static int ipip_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
+static int ipip_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct ip_tunnel_parm p;
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
struct ip_tunnel_encap ipencap;
+ struct ip_tunnel_parm_kern p;
__u32 fwmark = 0;
- if (ipip_netlink_encap_parms(data, &ipencap)) {
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
int err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
@@ -511,7 +480,8 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev,
}
ipip_netlink_parms(data, &p, &t->collect_md, &fwmark);
- return ip_tunnel_newlink(dev, tb, &p, fwmark);
+ return ip_tunnel_newlink(params->link_net ? : dev_net(dev), dev, tb, &p,
+ fwmark);
}
static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
@@ -519,12 +489,12 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[],
struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
+ struct ip_tunnel_parm_kern p;
bool collect_md;
__u32 fwmark = t->fwmark;
- if (ipip_netlink_encap_parms(data, &ipencap)) {
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
int err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
@@ -577,7 +547,7 @@ static size_t ipip_get_size(const struct net_device *dev)
static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct ip_tunnel_parm *parm = &tunnel->parms;
+ struct ip_tunnel_parm_kern *parm = &tunnel->parms;
if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
@@ -659,14 +629,15 @@ static int __net_init ipip_init_net(struct net *net)
return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
}
-static void __net_exit ipip_exit_batch_net(struct list_head *list_net)
+static void __net_exit ipip_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
{
- ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops);
+ ip_tunnel_delete_net(net, ipip_net_id, &ipip_link_ops, dev_to_kill);
}
static struct pernet_operations ipip_net_ops = {
.init = ipip_init_net,
- .exit_batch = ipip_exit_batch_net,
+ .exit_rtnl = ipip_exit_rtnl,
.id = &ipip_net_id,
.size = sizeof(struct ip_tunnel_net),
};
@@ -701,7 +672,7 @@ out:
rtnl_link_failed:
#if IS_ENABLED(CONFIG_MPLS)
- xfrm4_tunnel_deregister(&mplsip_handler, AF_INET);
+ xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS);
xfrm_tunnel_mplsip_failed:
#endif
@@ -725,6 +696,7 @@ static void __exit ipip_fini(void)
module_init(ipip_init);
module_exit(ipip_fini);
+MODULE_DESCRIPTION("IP/IP protocol decoder library");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("ipip");
MODULE_ALIAS_NETDEV("tunl0");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 5660adcf7a04..ca9eaee4c2ef 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IP multicast routing support for mrouted 3.6/3.8
*
* (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
* Linux Consultancy and Custom Driver Development
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Fixes:
* Michael Chastain : Incorrect size of copying.
* Alan Cox : Added the cache manager code
@@ -23,7 +19,6 @@
* Carlos Picoto : PIMv1 Support
* Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header
* Relax this requirement to work with older peers.
- *
*/
#include <linux/uaccess.h>
@@ -47,6 +42,7 @@
#include <linux/init.h>
#include <linux/if_ether.h>
#include <linux/slab.h>
+#include <net/flow.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -66,8 +62,10 @@
#include <net/netlink.h>
#include <net/fib_rules.h>
#include <linux/netconf.h>
-#include <net/nexthop.h>
-#include <net/switchdev.h>
+#include <net/rtnh.h>
+#include <net/inet_dscp.h>
+
+#include <linux/nospec.h>
struct ipmr_rule {
struct fib_rule common;
@@ -81,7 +79,12 @@ struct ipmr_result {
* Note that the changes are semaphored via rtnl_lock.
*/
-static DEFINE_RWLOCK(mrt_lock);
+static DEFINE_SPINLOCK(mrt_lock);
+
+static struct net_device *vif_dev_read(const struct vif_device *vif)
+{
+ return rcu_dereference(vif->dev);
+}
/* Multicast router control variables */
@@ -104,17 +107,19 @@ static void ipmr_free_table(struct mr_table *mrt);
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
struct net_device *dev, struct sk_buff *skb,
struct mfc_cache *cache, int local);
-static int ipmr_cache_report(struct mr_table *mrt,
+static int ipmr_cache_report(const struct mr_table *mrt,
struct sk_buff *pkt, vifi_t vifi, int assert);
static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
int cmd);
-static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
-static void mroute_clean_tables(struct mr_table *mrt, bool all);
+static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt);
+static void mroute_clean_tables(struct mr_table *mrt, int flags);
static void ipmr_expire_process(struct timer_list *t);
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
-#define ipmr_for_each_table(mrt, net) \
- list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)
+#define ipmr_for_each_table(mrt, net) \
+ list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list, \
+ lockdep_rtnl_is_held() || \
+ list_empty(&net->ipv4.mr_tables))
static struct mr_table *ipmr_mr_table_iter(struct net *net,
struct mr_table *mrt)
@@ -133,7 +138,7 @@ static struct mr_table *ipmr_mr_table_iter(struct net *net,
return ret;
}
-static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+static struct mr_table *__ipmr_get_table(struct net *net, u32 id)
{
struct mr_table *mrt;
@@ -144,6 +149,16 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
return NULL;
}
+static struct mr_table *ipmr_get_table(struct net *net, u32 id)
+{
+ struct mr_table *mrt;
+
+ rcu_read_lock();
+ mrt = __ipmr_get_table(net, id);
+ rcu_read_unlock();
+ return mrt;
+}
+
static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
struct mr_table **mrt)
{
@@ -185,7 +200,7 @@ static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
arg->table = fib_rule_get_table(rule, arg);
- mrt = ipmr_get_table(rule->fr_net, arg->table);
+ mrt = __ipmr_get_table(rule->fr_net, arg->table);
if (!mrt)
return -EAGAIN;
res->mrt = mrt;
@@ -197,10 +212,6 @@ static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
return 1;
}
-static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
- FRA_GENERIC_POLICY,
-};
-
static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh, struct nlattr **tb,
struct netlink_ext_ack *extack)
@@ -233,7 +244,6 @@ static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
.compare = ipmr_rule_compare,
.fill = ipmr_rule_fill,
.nlgroup = RTNLGRP_IPV4_RULE,
- .policy = ipmr_rule_policy,
.owner = THIS_MODULE,
};
@@ -255,7 +265,7 @@ static int __net_init ipmr_rules_init(struct net *net)
goto err1;
}
- err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
+ err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT);
if (err < 0)
goto err2;
@@ -263,7 +273,9 @@ static int __net_init ipmr_rules_init(struct net *net)
return 0;
err2:
+ rtnl_lock();
ipmr_free_table(mrt);
+ rtnl_unlock();
err1:
fib_rules_unregister(ops);
return err;
@@ -273,21 +285,21 @@ static void __net_exit ipmr_rules_exit(struct net *net)
{
struct mr_table *mrt, *next;
- rtnl_lock();
+ ASSERT_RTNL();
list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
list_del(&mrt->list);
ipmr_free_table(mrt);
}
fib_rules_unregister(net->ipv4.mr_rules_ops);
- rtnl_unlock();
}
-static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
- return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR);
+ return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR, extack);
}
-static unsigned int ipmr_rules_seq_read(struct net *net)
+static unsigned int ipmr_rules_seq_read(const struct net *net)
{
return fib_rules_seq_read(net, RTNL_FAMILY_IPMR);
}
@@ -314,6 +326,8 @@ static struct mr_table *ipmr_get_table(struct net *net, u32 id)
return net->ipv4.mrt;
}
+#define __ipmr_get_table ipmr_get_table
+
static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
struct mr_table **mrt)
{
@@ -334,18 +348,18 @@ static int __net_init ipmr_rules_init(struct net *net)
static void __net_exit ipmr_rules_exit(struct net *net)
{
- rtnl_lock();
+ ASSERT_RTNL();
ipmr_free_table(net->ipv4.mrt);
net->ipv4.mrt = NULL;
- rtnl_unlock();
}
-static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
+static int ipmr_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
return 0;
}
-static unsigned int ipmr_rules_seq_read(struct net *net)
+static unsigned int ipmr_rules_seq_read(const struct net *net)
{
return 0;
}
@@ -361,7 +375,7 @@ static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
const void *ptr)
{
const struct mfc_cache_cmp_arg *cmparg = arg->key;
- struct mfc_cache *c = (struct mfc_cache *)ptr;
+ const struct mfc_cache *c = ptr;
return cmparg->mfc_mcastgrp != c->mfc_mcastgrp ||
cmparg->mfc_origin != c->mfc_origin;
@@ -372,7 +386,6 @@ static const struct rhashtable_params ipmr_rht_params = {
.key_offset = offsetof(struct mfc_cache, cmparg),
.key_len = sizeof(struct mfc_cache_cmp_arg),
.nelem_hint = 3,
- .locks_mul = 1,
.obj_cmpfn = ipmr_hash_cmp,
.automatic_shrinking = true,
};
@@ -403,7 +416,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
if (id != RT_TABLE_DEFAULT && id >= 1000000000)
return ERR_PTR(-EINVAL);
- mrt = ipmr_get_table(net, id);
+ mrt = __ipmr_get_table(net, id);
if (mrt)
return mrt;
@@ -413,45 +426,19 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
static void ipmr_free_table(struct mr_table *mrt)
{
- del_timer_sync(&mrt->ipmr_expire_timer);
- mroute_clean_tables(mrt, true);
+ struct net *net = read_pnet(&mrt->net);
+
+ WARN_ON_ONCE(!mr_can_free_table(net));
+
+ timer_shutdown_sync(&mrt->ipmr_expire_timer);
+ mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC |
+ MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC);
rhltable_destroy(&mrt->mfc_hash);
kfree(mrt);
}
/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
-static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
-{
- struct net *net = dev_net(dev);
-
- dev_close(dev);
-
- dev = __dev_get_by_name(net, "tunl0");
- if (dev) {
- const struct net_device_ops *ops = dev->netdev_ops;
- struct ifreq ifr;
- struct ip_tunnel_parm p;
-
- memset(&p, 0, sizeof(p));
- p.iph.daddr = v->vifc_rmt_addr.s_addr;
- p.iph.saddr = v->vifc_lcl_addr.s_addr;
- p.iph.version = 4;
- p.iph.ihl = 5;
- p.iph.protocol = IPPROTO_IPIP;
- sprintf(p.name, "dvmrp%d", v->vifc_vifi);
- ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
-
- if (ops->ndo_do_ioctl) {
- mm_segment_t oldfs = get_fs();
-
- set_fs(KERNEL_DS);
- ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
- set_fs(oldfs);
- }
- }
-}
-
/* Initialize ipmr pimreg/tunnel in_device */
static bool ipmr_init_vif_indev(const struct net_device *dev)
{
@@ -471,51 +458,52 @@ static bool ipmr_init_vif_indev(const struct net_device *dev)
static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
{
- struct net_device *dev;
-
- dev = __dev_get_by_name(net, "tunl0");
-
- if (dev) {
- const struct net_device_ops *ops = dev->netdev_ops;
- int err;
- struct ifreq ifr;
- struct ip_tunnel_parm p;
+ struct net_device *tunnel_dev, *new_dev;
+ struct ip_tunnel_parm_kern p = { };
+ int err;
- memset(&p, 0, sizeof(p));
- p.iph.daddr = v->vifc_rmt_addr.s_addr;
- p.iph.saddr = v->vifc_lcl_addr.s_addr;
- p.iph.version = 4;
- p.iph.ihl = 5;
- p.iph.protocol = IPPROTO_IPIP;
- sprintf(p.name, "dvmrp%d", v->vifc_vifi);
- ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
+ tunnel_dev = __dev_get_by_name(net, "tunl0");
+ if (!tunnel_dev)
+ goto out;
- if (ops->ndo_do_ioctl) {
- mm_segment_t oldfs = get_fs();
+ p.iph.daddr = v->vifc_rmt_addr.s_addr;
+ p.iph.saddr = v->vifc_lcl_addr.s_addr;
+ p.iph.version = 4;
+ p.iph.ihl = 5;
+ p.iph.protocol = IPPROTO_IPIP;
+ sprintf(p.name, "dvmrp%d", v->vifc_vifi);
- set_fs(KERNEL_DS);
- err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
- set_fs(oldfs);
- } else {
- err = -EOPNOTSUPP;
- }
- dev = NULL;
+ if (!tunnel_dev->netdev_ops->ndo_tunnel_ctl)
+ goto out;
+ err = tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p,
+ SIOCADDTUNNEL);
+ if (err)
+ goto out;
- if (err == 0 &&
- (dev = __dev_get_by_name(net, p.name)) != NULL) {
- dev->flags |= IFF_MULTICAST;
- if (!ipmr_init_vif_indev(dev))
- goto failure;
- if (dev_open(dev))
- goto failure;
- dev_hold(dev);
- }
- }
- return dev;
+ new_dev = __dev_get_by_name(net, p.name);
+ if (!new_dev)
+ goto out;
-failure:
- unregister_netdevice(dev);
- return NULL;
+ new_dev->flags |= IFF_MULTICAST;
+ if (!ipmr_init_vif_indev(new_dev))
+ goto out_unregister;
+ if (dev_open(new_dev, NULL))
+ goto out_unregister;
+ dev_hold(new_dev);
+ err = dev_set_allmulti(new_dev, 1);
+ if (err) {
+ dev_close(new_dev);
+ tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p,
+ SIOCDELTUNNEL);
+ dev_put(new_dev);
+ new_dev = ERR_PTR(err);
+ }
+ return new_dev;
+
+out_unregister:
+ unregister_netdevice(new_dev);
+out:
+ return ERR_PTR(-ENOBUFS);
}
#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
@@ -536,11 +524,15 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
return err;
}
- read_lock(&mrt_lock);
- dev->stats.tx_bytes += skb->len;
- dev->stats.tx_packets++;
- ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
- read_unlock(&mrt_lock);
+ DEV_STATS_ADD(dev, tx_bytes, skb->len);
+ DEV_STATS_INC(dev, tx_packets);
+ rcu_read_lock();
+
+ /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
+ ipmr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
+ IGMPMSG_WHOLEPKT);
+
+ rcu_read_unlock();
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -562,7 +554,7 @@ static void reg_vif_setup(struct net_device *dev)
dev->flags = IFF_NOARP;
dev->netdev_ops = &reg_vif_netdev_ops;
dev->needs_free_netdev = true;
- dev->features |= NETIF_F_NETNS_LOCAL;
+ dev->netns_immutable = true;
}
static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
@@ -589,7 +581,7 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
if (!ipmr_init_vif_indev(dev))
goto failure;
- if (dev_open(dev))
+ if (dev_open(dev, NULL))
goto failure;
dev_hold(dev);
@@ -607,6 +599,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
{
struct net_device *reg_dev = NULL;
struct iphdr *encap;
+ int vif_num;
encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
/* Check that:
@@ -619,11 +612,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
ntohs(encap->tot_len) + pimlen > skb->len)
return 1;
- read_lock(&mrt_lock);
- if (mrt->mroute_reg_vif_num >= 0)
- reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
- read_unlock(&mrt_lock);
-
+ /* Pairs with WRITE_ONCE() in vif_add()/vif_delete() */
+ vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
+ if (vif_num >= 0)
+ reg_dev = vif_dev_read(&mrt->vif_table[vif_num]);
if (!reg_dev)
return 1;
@@ -649,10 +641,11 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
static int call_ipmr_vif_entry_notifiers(struct net *net,
enum fib_event_type event_type,
struct vif_device *vif,
+ struct net_device *vif_dev,
vifi_t vif_index, u32 tb_id)
{
return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type,
- vif, vif_index, tb_id,
+ vif, vif_dev, vif_index, tb_id,
&net->ipv4.ipmr_seq);
}
@@ -666,7 +659,10 @@ static int call_ipmr_mfc_entry_notifiers(struct net *net,
/**
* vif_delete - Delete a VIF entry
+ * @mrt: Table to delete from
+ * @vifi: VIF identifier to delete
* @notify: Set to 1, if the caller is a notifier_call
+ * @head: if unregistering the VIF, place it on this queue
*/
static int vif_delete(struct mr_table *mrt, int vifi, int notify,
struct list_head *head)
@@ -681,22 +677,19 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
v = &mrt->vif_table[vifi];
- if (VIF_EXISTS(mrt, vifi))
- call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi,
- mrt->id);
-
- write_lock_bh(&mrt_lock);
- dev = v->dev;
- v->dev = NULL;
-
- if (!dev) {
- write_unlock_bh(&mrt_lock);
+ dev = rtnl_dereference(v->dev);
+ if (!dev)
return -EADDRNOTAVAIL;
- }
- if (vifi == mrt->mroute_reg_vif_num)
- mrt->mroute_reg_vif_num = -1;
+ spin_lock(&mrt_lock);
+ call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, dev,
+ vifi, mrt->id);
+ RCU_INIT_POINTER(v->dev, NULL);
+ if (vifi == mrt->mroute_reg_vif_num) {
+ /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */
+ WRITE_ONCE(mrt->mroute_reg_vif_num, -1);
+ }
if (vifi + 1 == mrt->maxvif) {
int tmp;
@@ -704,10 +697,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
if (VIF_EXISTS(mrt, tmp))
break;
}
- mrt->maxvif = tmp+1;
+ WRITE_ONCE(mrt->maxvif, tmp + 1);
}
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
dev_set_allmulti(dev, -1);
@@ -723,7 +716,7 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
unregister_netdevice_queue(dev, head);
- dev_put(dev);
+ netdev_put(dev, &v->dev_tracker);
return 0;
}
@@ -773,7 +766,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
/* Timer process for the unresolved queue. */
static void ipmr_expire_process(struct timer_list *t)
{
- struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer);
+ struct mr_table *mrt = timer_container_of(mrt, t, ipmr_expire_timer);
struct mr_mfc *c, *next;
unsigned long expires;
unsigned long now;
@@ -809,7 +802,7 @@ out:
spin_unlock(&mfc_unres_lock);
}
-/* Fill oifs list. It is called under write locked mrt_lock. */
+/* Fill oifs list. It is called under locked mrt_lock. */
static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache,
unsigned char *ttls)
{
@@ -829,16 +822,14 @@ static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache,
cache->mfc_un.res.maxvif = vifi + 1;
}
}
- cache->mfc_un.res.lastuse = jiffies;
+ WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies);
}
static int vif_add(struct net *net, struct mr_table *mrt,
struct vifctl *vifc, int mrtsock)
{
+ struct netdev_phys_item_id ppid = { };
int vifi = vifc->vifc_vifi;
- struct switchdev_attr attr = {
- .id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
- };
struct vif_device *v = &mrt->vif_table[vifi];
struct net_device *dev;
struct in_device *in_dev;
@@ -869,14 +860,8 @@ static int vif_add(struct net *net, struct mr_table *mrt,
break;
case VIFF_TUNNEL:
dev = ipmr_new_tunnel(net, vifc);
- if (!dev)
- return -ENOBUFS;
- err = dev_set_allmulti(dev, 1);
- if (err) {
- ipmr_del_tunnel(dev, vifc);
- dev_put(dev);
- return err;
- }
+ if (IS_ERR(dev))
+ return PTR_ERR(dev);
break;
case VIFF_USE_IFINDEX:
case 0:
@@ -917,10 +902,10 @@ static int vif_add(struct net *net, struct mr_table *mrt,
vifc->vifc_flags | (!mrtsock ? VIFF_STATIC : 0),
(VIFF_TUNNEL | VIFF_REGISTER));
- attr.orig_dev = dev;
- if (!switchdev_port_attr_get(dev, &attr)) {
- memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len);
- v->dev_parent_id.id_len = attr.u.ppid.id_len;
+ err = netif_get_port_parent_id(dev, &ppid, true);
+ if (err == 0) {
+ memcpy(v->dev_parent_id.id, ppid.id, ppid.id_len);
+ v->dev_parent_id.id_len = ppid.id_len;
} else {
v->dev_parent_id.id_len = 0;
}
@@ -929,14 +914,18 @@ static int vif_add(struct net *net, struct mr_table *mrt,
v->remote = vifc->vifc_rmt_addr.s_addr;
/* And finish update writing critical data */
- write_lock_bh(&mrt_lock);
- v->dev = dev;
- if (v->flags & VIFF_REGISTER)
- mrt->mroute_reg_vif_num = vifi;
+ spin_lock(&mrt_lock);
+ rcu_assign_pointer(v->dev, dev);
+ netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC);
+ if (v->flags & VIFF_REGISTER) {
+ /* Pairs with READ_ONCE() in ipmr_cache_report() and reg_vif_xmit() */
+ WRITE_ONCE(mrt->mroute_reg_vif_num, vifi);
+ }
if (vifi+1 > mrt->maxvif)
- mrt->maxvif = vifi+1;
- write_unlock_bh(&mrt_lock);
- call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id);
+ WRITE_ONCE(mrt->maxvif, vifi + 1);
+ spin_unlock(&mrt_lock);
+ call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev,
+ vifi, mrt->id);
return 0;
}
@@ -1033,16 +1022,18 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
} else {
+ rcu_read_lock();
ip_mr_forward(net, mrt, skb->dev, skb, c, 0);
+ rcu_read_unlock();
}
}
}
/* Bounce a cache query up to mrouted and netlink.
*
- * Called under mrt_lock.
+ * Called under rcu_read_lock().
*/
-static int ipmr_cache_report(struct mr_table *mrt,
+static int ipmr_cache_report(const struct mr_table *mrt,
struct sk_buff *pkt, vifi_t vifi, int assert)
{
const int ihl = ip_hdrlen(pkt);
@@ -1052,6 +1043,10 @@ static int ipmr_cache_report(struct mr_table *mrt,
struct sk_buff *skb;
int ret;
+ mroute_sk = rcu_dereference(mrt->mroute_sk);
+ if (!mroute_sk)
+ return -EINVAL;
+
if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE)
skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
else
@@ -1073,10 +1068,16 @@ static int ipmr_cache_report(struct mr_table *mrt,
memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
msg->im_msgtype = assert;
msg->im_mbz = 0;
- if (assert == IGMPMSG_WRVIFWHOLE)
+ if (assert == IGMPMSG_WRVIFWHOLE) {
msg->im_vif = vifi;
- else
- msg->im_vif = mrt->mroute_reg_vif_num;
+ msg->im_vif_hi = vifi >> 8;
+ } else {
+ /* Pairs with WRITE_ONCE() in vif_add() and vif_delete() */
+ int vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
+
+ msg->im_vif = vif_num;
+ msg->im_vif_hi = vif_num >> 8;
+ }
ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
sizeof(struct iphdr));
@@ -1089,7 +1090,9 @@ static int ipmr_cache_report(struct mr_table *mrt,
ip_hdr(skb)->protocol = 0;
msg = (struct igmpmsg *)skb_network_header(skb);
msg->im_vif = vifi;
- skb_dst_set(skb, dst_clone(skb_dst(pkt)));
+ msg->im_vif_hi = vifi >> 8;
+ ipv4_pktinfo_prepare(mroute_sk, pkt, false);
+ memcpy(skb->cb, pkt->cb, sizeof(skb->cb));
/* Add our header */
igmp = skb_put(skb, sizeof(struct igmphdr));
igmp->type = assert;
@@ -1099,19 +1102,11 @@ static int ipmr_cache_report(struct mr_table *mrt,
skb->transport_header = skb->network_header;
}
- rcu_read_lock();
- mroute_sk = rcu_dereference(mrt->mroute_sk);
- if (!mroute_sk) {
- rcu_read_unlock();
- kfree_skb(skb);
- return -EINVAL;
- }
-
igmpmsg_netlink_event(mrt, skb);
/* Deliver to mrouted */
ret = sock_queue_rcv_skb(mroute_sk, skb);
- rcu_read_unlock();
+
if (ret < 0) {
net_warn_ratelimited("mroute: pending queue full, dropping entries\n");
kfree_skb(skb);
@@ -1121,6 +1116,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
}
/* Queue a packet for resolution. It gets locked cache entry! */
+/* Called under rcu_read_lock() */
static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
struct sk_buff *skb, struct net_device *dev)
{
@@ -1140,8 +1136,8 @@ static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
if (!found) {
/* Create a new entry if allowable */
- if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
- (c = ipmr_cache_alloc_unres()) == NULL) {
+ c = ipmr_cache_alloc_unres();
+ if (!c) {
spin_unlock_bh(&mfc_unres_lock);
kfree_skb(skb);
@@ -1233,12 +1229,12 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
mfc->mfcc_mcastgrp.s_addr, parent);
rcu_read_unlock();
if (c) {
- write_lock_bh(&mrt_lock);
+ spin_lock(&mrt_lock);
c->_c.mfc_parent = mfc->mfcc_parent;
ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls);
if (!mrtsock)
c->_c.mfc_flags |= MFC_STATIC;
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c,
mrt->id);
mroute_netlink_event(mrt, c, RTM_NEWROUTE);
@@ -1284,7 +1280,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
}
}
if (list_empty(&mrt->mfc_unres_queue))
- del_timer(&mrt->ipmr_expire_timer);
+ timer_delete(&mrt->ipmr_expire_timer);
spin_unlock_bh(&mfc_unres_lock);
if (found) {
@@ -1297,7 +1293,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
}
/* Close the multicast socket, and clear the vif tables etc */
-static void mroute_clean_tables(struct mr_table *mrt, bool all)
+static void mroute_clean_tables(struct mr_table *mrt, int flags)
{
struct net *net = read_pnet(&mrt->net);
struct mr_mfc *c, *tmp;
@@ -1306,40 +1302,49 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all)
int i;
/* Shut down all active vif entries */
- for (i = 0; i < mrt->maxvif; i++) {
- if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
- continue;
- vif_delete(mrt, i, 0, &list);
+ if (flags & (MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC)) {
+ for (i = 0; i < mrt->maxvif; i++) {
+ if (((mrt->vif_table[i].flags & VIFF_STATIC) &&
+ !(flags & MRT_FLUSH_VIFS_STATIC)) ||
+ (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT_FLUSH_VIFS)))
+ continue;
+ vif_delete(mrt, i, 0, &list);
+ }
+ unregister_netdevice_many(&list);
}
- unregister_netdevice_many(&list);
/* Wipe the cache */
- list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
- if (!all && (c->mfc_flags & MFC_STATIC))
- continue;
- rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
- list_del_rcu(&c->list);
- cache = (struct mfc_cache *)c;
- call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache,
- mrt->id);
- mroute_netlink_event(mrt, cache, RTM_DELROUTE);
- mr_cache_put(c);
- }
-
- if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
- spin_lock_bh(&mfc_unres_lock);
- list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
- list_del(&c->list);
+ if (flags & (MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC)) {
+ list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
+ if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC_STATIC)) ||
+ (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC)))
+ continue;
+ rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
+ list_del_rcu(&c->list);
cache = (struct mfc_cache *)c;
+ call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache,
+ mrt->id);
mroute_netlink_event(mrt, cache, RTM_DELROUTE);
- ipmr_destroy_unres(mrt, cache);
+ mr_cache_put(c);
+ }
+ }
+
+ if (flags & MRT_FLUSH_MFC) {
+ if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
+ list_del(&c->list);
+ cache = (struct mfc_cache *)c;
+ mroute_netlink_event(mrt, cache, RTM_DELROUTE);
+ ipmr_destroy_unres(mrt, cache);
+ }
+ spin_unlock_bh(&mfc_unres_lock);
}
- spin_unlock_bh(&mfc_unres_lock);
}
}
/* called from ip_ra_control(), before an RCU grace period,
- * we dont need to call synchronize_rcu() here
+ * we don't need to call synchronize_rcu() here
*/
static void mrtsock_destruct(struct sock *sk)
{
@@ -1355,7 +1360,7 @@ static void mrtsock_destruct(struct sock *sk)
NETCONFA_IFINDEX_ALL,
net->ipv4.devconf_all);
RCU_INIT_POINTER(mrt->mroute_sk, NULL);
- mroute_clean_tables(mrt, false);
+ mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC);
}
}
rtnl_unlock();
@@ -1367,7 +1372,7 @@ static void mrtsock_destruct(struct sock *sk)
* MOSPF/PIM router set up we can clean this up.
*/
-int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
+int ip_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval,
unsigned int optlen)
{
struct net *net = sock_net(sk);
@@ -1386,7 +1391,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
goto out_unlock;
}
- mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
+ mrt = __ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
if (!mrt) {
ret = -ENOENT;
goto out_unlock;
@@ -1439,7 +1444,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
ret = -EINVAL;
break;
}
- if (copy_from_user(&vif, optval, sizeof(vif))) {
+ if (copy_from_sockptr(&vif, optval, sizeof(vif))) {
ret = -EFAULT;
break;
}
@@ -1460,14 +1465,14 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
case MRT_ADD_MFC:
case MRT_DEL_MFC:
parent = -1;
- /* fall through */
+ fallthrough;
case MRT_ADD_MFC_PROXY:
case MRT_DEL_MFC_PROXY:
if (optlen != sizeof(mfc)) {
ret = -EINVAL;
break;
}
- if (copy_from_user(&mfc, optval, sizeof(mfc))) {
+ if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) {
ret = -EFAULT;
break;
}
@@ -1480,13 +1485,24 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
sk == rtnl_dereference(mrt->mroute_sk),
parent);
break;
+ case MRT_FLUSH:
+ if (optlen != sizeof(val)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_sockptr(&val, optval, sizeof(val))) {
+ ret = -EFAULT;
+ break;
+ }
+ mroute_clean_tables(mrt, val);
+ break;
/* Control PIM assert. */
case MRT_ASSERT:
if (optlen != sizeof(val)) {
ret = -EINVAL;
break;
}
- if (get_user(val, (int __user *)optval)) {
+ if (copy_from_sockptr(&val, optval, sizeof(val))) {
ret = -EFAULT;
break;
}
@@ -1501,7 +1517,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
ret = -EINVAL;
break;
}
- if (get_user(val, (int __user *)optval)) {
+ if (copy_from_sockptr(&val, optval, sizeof(val))) {
ret = -EFAULT;
break;
}
@@ -1523,7 +1539,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
ret = -EINVAL;
break;
}
- if (get_user(uval, (u32 __user *)optval)) {
+ if (copy_from_sockptr(&uval, optval, sizeof(uval))) {
ret = -EFAULT;
break;
}
@@ -1548,8 +1564,31 @@ out:
return ret;
}
+/* Execute if this ioctl is a special mroute ioctl */
+int ipmr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
+{
+ switch (cmd) {
+ /* These userspace buffers will be consumed by ipmr_ioctl() */
+ case SIOCGETVIFCNT: {
+ struct sioc_vif_req buffer;
+
+ return sock_ioctl_inout(sk, cmd, arg, &buffer,
+ sizeof(buffer));
+ }
+ case SIOCGETSGCNT: {
+ struct sioc_sg_req buffer;
+
+ return sock_ioctl_inout(sk, cmd, arg, &buffer,
+ sizeof(buffer));
+ }
+ }
+ /* return code > 0 means that the ioctl was not executed */
+ return 1;
+}
+
/* Getsock opt support for the multicast routing system. */
-int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
+int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
+ sockptr_t optlen)
{
int olr;
int val;
@@ -1580,26 +1619,28 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
return -ENOPROTOOPT;
}
- if (get_user(olr, optlen))
+ if (copy_from_sockptr(&olr, optlen, sizeof(int)))
return -EFAULT;
- olr = min_t(unsigned int, olr, sizeof(int));
if (olr < 0)
return -EINVAL;
- if (put_user(olr, optlen))
+
+ olr = min_t(unsigned int, olr, sizeof(int));
+
+ if (copy_to_sockptr(optlen, &olr, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &val, olr))
+ if (copy_to_sockptr(optval, &val, olr))
return -EFAULT;
return 0;
}
/* The IP multicast ioctl support routines. */
-int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
+int ipmr_ioctl(struct sock *sk, int cmd, void *arg)
{
- struct sioc_sg_req sr;
- struct sioc_vif_req vr;
struct vif_device *vif;
struct mfc_cache *c;
struct net *net = sock_net(sk);
+ struct sioc_vif_req *vr;
+ struct sioc_sg_req *sr;
struct mr_table *mrt;
mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
@@ -1608,39 +1649,33 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
switch (cmd) {
case SIOCGETVIFCNT:
- if (copy_from_user(&vr, arg, sizeof(vr)))
- return -EFAULT;
- if (vr.vifi >= mrt->maxvif)
+ vr = (struct sioc_vif_req *)arg;
+ if (vr->vifi >= mrt->maxvif)
return -EINVAL;
- read_lock(&mrt_lock);
- vif = &mrt->vif_table[vr.vifi];
- if (VIF_EXISTS(mrt, vr.vifi)) {
- vr.icount = vif->pkt_in;
- vr.ocount = vif->pkt_out;
- vr.ibytes = vif->bytes_in;
- vr.obytes = vif->bytes_out;
- read_unlock(&mrt_lock);
+ vr->vifi = array_index_nospec(vr->vifi, mrt->maxvif);
+ rcu_read_lock();
+ vif = &mrt->vif_table[vr->vifi];
+ if (VIF_EXISTS(mrt, vr->vifi)) {
+ vr->icount = READ_ONCE(vif->pkt_in);
+ vr->ocount = READ_ONCE(vif->pkt_out);
+ vr->ibytes = READ_ONCE(vif->bytes_in);
+ vr->obytes = READ_ONCE(vif->bytes_out);
+ rcu_read_unlock();
- if (copy_to_user(arg, &vr, sizeof(vr)))
- return -EFAULT;
return 0;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EADDRNOTAVAIL;
case SIOCGETSGCNT:
- if (copy_from_user(&sr, arg, sizeof(sr)))
- return -EFAULT;
+ sr = (struct sioc_sg_req *)arg;
rcu_read_lock();
- c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
+ c = ipmr_cache_find(mrt, sr->src.s_addr, sr->grp.s_addr);
if (c) {
- sr.pktcnt = c->_c.mfc_un.res.pkt;
- sr.bytecnt = c->_c.mfc_un.res.bytes;
- sr.wrong_if = c->_c.mfc_un.res.wrong_if;
+ sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt);
+ sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes);
+ sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if);
rcu_read_unlock();
-
- if (copy_to_user(arg, &sr, sizeof(sr)))
- return -EFAULT;
return 0;
}
rcu_read_unlock();
@@ -1686,20 +1721,21 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
return -EFAULT;
if (vr.vifi >= mrt->maxvif)
return -EINVAL;
- read_lock(&mrt_lock);
+ vr.vifi = array_index_nospec(vr.vifi, mrt->maxvif);
+ rcu_read_lock();
vif = &mrt->vif_table[vr.vifi];
if (VIF_EXISTS(mrt, vr.vifi)) {
- vr.icount = vif->pkt_in;
- vr.ocount = vif->pkt_out;
- vr.ibytes = vif->bytes_in;
- vr.obytes = vif->bytes_out;
- read_unlock(&mrt_lock);
+ vr.icount = READ_ONCE(vif->pkt_in);
+ vr.ocount = READ_ONCE(vif->pkt_out);
+ vr.ibytes = READ_ONCE(vif->bytes_in);
+ vr.obytes = READ_ONCE(vif->bytes_out);
+ rcu_read_unlock();
if (copy_to_user(arg, &vr, sizeof(vr)))
return -EFAULT;
return 0;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EADDRNOTAVAIL;
case SIOCGETSGCNT:
if (copy_from_user(&sr, arg, sizeof(sr)))
@@ -1708,9 +1744,9 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
rcu_read_lock();
c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
if (c) {
- sr.pktcnt = c->_c.mfc_un.res.pkt;
- sr.bytecnt = c->_c.mfc_un.res.bytes;
- sr.wrong_if = c->_c.mfc_un.res.wrong_if;
+ sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt);
+ sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes);
+ sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if);
rcu_read_unlock();
if (copy_to_user(arg, &sr, sizeof(sr)))
@@ -1739,7 +1775,7 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
ipmr_for_each_table(mrt, net) {
v = &mrt->vif_table[0];
for (ct = 0; ct < mrt->maxvif; ct++, v++) {
- if (v->dev == dev)
+ if (rcu_access_pointer(v->dev) == dev)
vif_delete(mrt, ct, 1, NULL);
}
}
@@ -1778,7 +1814,7 @@ static void ip_encap(struct net *net, struct sk_buff *skb,
ip_send_check(iph);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- nf_reset(skb);
+ nf_reset_ct(skb);
}
static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
@@ -1787,7 +1823,6 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
struct ip_options *opt = &(IPCB(skb)->opt);
IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
- IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
if (unlikely(opt->optlen))
ip_forward_options(skb);
@@ -1802,7 +1837,7 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
struct vif_device *out_vif = &mrt->vif_table[out_vifi];
struct vif_device *in_vif = &mrt->vif_table[in_vifi];
- if (!skb->offload_mr_fwd_mark)
+ if (!skb->offload_l3_fwd_mark)
return false;
if (!out_vif->dev_parent_id.id_len || !in_vif->dev_parent_id.id_len)
return false;
@@ -1817,54 +1852,49 @@ static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
}
#endif
-/* Processing handlers for ipmr_forward */
+/* Processing handlers for ipmr_forward, under rcu_read_lock() */
-static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
- int in_vifi, struct sk_buff *skb,
- struct mfc_cache *c, int vifi)
+static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, int vifi)
{
const struct iphdr *iph = ip_hdr(skb);
struct vif_device *vif = &mrt->vif_table[vifi];
- struct net_device *dev;
+ struct net_device *vif_dev;
struct rtable *rt;
struct flowi4 fl4;
int encap = 0;
- if (!vif->dev)
- goto out_free;
+ vif_dev = vif_dev_read(vif);
+ if (!vif_dev)
+ return -1;
if (vif->flags & VIFF_REGISTER) {
- vif->pkt_out++;
- vif->bytes_out += skb->len;
- vif->dev->stats.tx_bytes += skb->len;
- vif->dev->stats.tx_packets++;
+ WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+ WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
+ DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
+ DEV_STATS_INC(vif_dev, tx_packets);
ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
- goto out_free;
+ return -1;
}
- if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi))
- goto out_free;
-
if (vif->flags & VIFF_TUNNEL) {
rt = ip_route_output_ports(net, &fl4, NULL,
vif->remote, vif->local,
0, 0,
IPPROTO_IPIP,
- RT_TOS(iph->tos), vif->link);
+ iph->tos & INET_DSCP_MASK, vif->link);
if (IS_ERR(rt))
- goto out_free;
+ return -1;
encap = sizeof(struct iphdr);
} else {
rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
0, 0,
IPPROTO_IPIP,
- RT_TOS(iph->tos), vif->link);
+ iph->tos & INET_DSCP_MASK, vif->link);
if (IS_ERR(rt))
- goto out_free;
+ return -1;
}
- dev = rt->dst.dev;
-
if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
/* Do not fragment multicasts. Alas, IPv4 does not
* allow to send ICMP, so that packets will disappear
@@ -1872,18 +1902,18 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
*/
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
ip_rt_put(rt);
- goto out_free;
+ return -1;
}
- encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
+ encap += LL_RESERVED_SPACE(dst_dev_rcu(&rt->dst)) + rt->dst.header_len;
if (skb_cow(skb, encap)) {
ip_rt_put(rt);
- goto out_free;
+ return -1;
}
- vif->pkt_out++;
- vif->bytes_out += skb->len;
+ WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+ WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
skb_dst_drop(skb);
skb_dst_set(skb, &rt->dst);
@@ -1895,10 +1925,26 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
if (vif->flags & VIFF_TUNNEL) {
ip_encap(net, skb, vif->local, vif->remote);
/* FIXME: extra output firewall step used to be here. --RR */
- vif->dev->stats.tx_packets++;
- vif->dev->stats.tx_bytes += skb->len;
+ DEV_STATS_INC(vif_dev, tx_packets);
+ DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
}
+ return 0;
+}
+
+static void ipmr_queue_fwd_xmit(struct net *net, struct mr_table *mrt,
+ int in_vifi, struct sk_buff *skb, int vifi)
+{
+ struct rtable *rt;
+
+ if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi))
+ goto out_free;
+
+ if (ipmr_prepare_xmit(net, mrt, skb, vifi))
+ goto out_free;
+
+ rt = skb_rtable(skb);
+
IPCB(skb)->flags |= IPSKB_FORWARDED;
/* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
@@ -1912,7 +1958,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
* result in receiving multiple packets.
*/
NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
- net, NULL, skb, skb->dev, dev,
+ net, NULL, skb, skb->dev, dst_dev_rcu(&rt->dst),
ipmr_forward_finish);
return;
@@ -1920,18 +1966,33 @@ out_free:
kfree_skb(skb);
}
-static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
+static void ipmr_queue_output_xmit(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, int vifi)
{
- int ct;
+ if (ipmr_prepare_xmit(net, mrt, skb, vifi))
+ goto out_free;
+
+ ip_mc_output(net, NULL, skb);
+ return;
+
+out_free:
+ kfree_skb(skb);
+}
- for (ct = mrt->maxvif-1; ct >= 0; ct--) {
- if (mrt->vif_table[ct].dev == dev)
+/* Called with mrt_lock or rcu_read_lock() */
+static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev)
+{
+ int ct;
+ /* Pairs with WRITE_ONCE() in vif_delete()/vif_add() */
+ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) {
+ if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev)
break;
}
return ct;
}
/* "local" means that we should preserve one skb (for local delivery) */
+/* Called under rcu_read_lock() */
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
struct net_device *dev, struct sk_buff *skb,
struct mfc_cache *c, int local)
@@ -1941,14 +2002,14 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
int vif, ct;
vif = c->_c.mfc_parent;
- c->_c.mfc_un.res.pkt++;
- c->_c.mfc_un.res.bytes += skb->len;
- c->_c.mfc_un.res.lastuse = jiffies;
+ atomic_long_inc(&c->_c.mfc_un.res.pkt);
+ atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes);
+ WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies);
if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
struct mfc_cache *cache_proxy;
- /* For an (*,G) entry, we only check that the incomming
+ /* For an (*,G) entry, we only check that the incoming
* interface is part of the static tree.
*/
cache_proxy = mr_mfc_find_any_parent(mrt, vif);
@@ -1958,7 +2019,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
}
/* Wrong interface: drop packet and (maybe) send PIM assert. */
- if (mrt->vif_table[vif].dev != dev) {
+ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) {
if (rt_is_output_route(skb_rtable(skb))) {
/* It is our own packet, looped back.
* Very complicated situation...
@@ -1974,7 +2035,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
goto dont_forward;
}
- c->_c.mfc_un.res.wrong_if++;
+ atomic_long_inc(&c->_c.mfc_un.res.wrong_if);
if (true_vifi >= 0 && mrt->mroute_do_assert &&
/* pimsm uses asserts, when switching from RPT to SPT,
@@ -1997,8 +2058,10 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
}
forward:
- mrt->vif_table[vif].pkt_in++;
- mrt->vif_table[vif].bytes_in += skb->len;
+ WRITE_ONCE(mrt->vif_table[vif].pkt_in,
+ mrt->vif_table[vif].pkt_in + 1);
+ WRITE_ONCE(mrt->vif_table[vif].bytes_in,
+ mrt->vif_table[vif].bytes_in + skb->len);
/* Forward the frame */
if (c->mfc_origin == htonl(INADDR_ANY) &&
@@ -2026,8 +2089,8 @@ forward:
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
- ipmr_queue_xmit(net, mrt, true_vifi,
- skb2, c, psend);
+ ipmr_queue_fwd_xmit(net, mrt, true_vifi,
+ skb2, psend);
}
psend = ct;
}
@@ -2038,10 +2101,10 @@ last_forward:
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
- ipmr_queue_xmit(net, mrt, true_vifi, skb2,
- c, psend);
+ ipmr_queue_fwd_xmit(net, mrt, true_vifi, skb2,
+ psend);
} else {
- ipmr_queue_xmit(net, mrt, true_vifi, skb, c, psend);
+ ipmr_queue_fwd_xmit(net, mrt, true_vifi, skb, psend);
return;
}
}
@@ -2058,7 +2121,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
struct flowi4 fl4 = {
.daddr = iph->daddr,
.saddr = iph->saddr,
- .flowi4_tos = RT_TOS(iph->tos),
+ .flowi4_dscp = ip4h_dscp(iph),
.flowi4_oif = (rt_is_output_route(rt) ?
skb->dev->ifindex : 0),
.flowi4_iif = (rt_is_output_route(rt) ?
@@ -2125,11 +2188,11 @@ int ip_mr_input(struct sk_buff *skb)
mroute_sk = rcu_dereference(mrt->mroute_sk);
if (mroute_sk) {
- nf_reset(skb);
+ nf_reset_ct(skb);
raw_rcv(mroute_sk, skb);
return 0;
}
- }
+ }
}
/* already under rcu_read_lock() */
@@ -2154,22 +2217,14 @@ int ip_mr_input(struct sk_buff *skb)
skb = skb2;
}
- read_lock(&mrt_lock);
vif = ipmr_find_vif(mrt, dev);
- if (vif >= 0) {
- int err2 = ipmr_cache_unresolved(mrt, vif, skb, dev);
- read_unlock(&mrt_lock);
-
- return err2;
- }
- read_unlock(&mrt_lock);
+ if (vif >= 0)
+ return ipmr_cache_unresolved(mrt, vif, skb, dev);
kfree_skb(skb);
return -ENODEV;
}
- read_lock(&mrt_lock);
ip_mr_forward(net, mrt, dev, skb, cache, local);
- read_unlock(&mrt_lock);
if (local)
return ip_local_deliver(skb);
@@ -2183,6 +2238,110 @@ dont_forward:
return 0;
}
+static void ip_mr_output_finish(struct net *net, struct mr_table *mrt,
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc_cache *c)
+{
+ int psend = -1;
+ int ct;
+
+ atomic_long_inc(&c->_c.mfc_un.res.pkt);
+ atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes);
+ WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies);
+
+ /* Forward the frame */
+ if (c->mfc_origin == htonl(INADDR_ANY) &&
+ c->mfc_mcastgrp == htonl(INADDR_ANY)) {
+ if (ip_hdr(skb)->ttl >
+ c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
+ /* It's an (*,*) entry and the packet is not coming from
+ * the upstream: forward the packet to the upstream
+ * only.
+ */
+ psend = c->_c.mfc_parent;
+ goto last_xmit;
+ }
+ goto dont_xmit;
+ }
+
+ for (ct = c->_c.mfc_un.res.maxvif - 1;
+ ct >= c->_c.mfc_un.res.minvif; ct--) {
+ if (ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) {
+ if (psend != -1) {
+ struct sk_buff *skb2;
+
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ ipmr_queue_output_xmit(net, mrt,
+ skb2, psend);
+ }
+ psend = ct;
+ }
+ }
+
+last_xmit:
+ if (psend != -1) {
+ ipmr_queue_output_xmit(net, mrt, skb, psend);
+ return;
+ }
+
+dont_xmit:
+ kfree_skb(skb);
+}
+
+/* Multicast packets for forwarding arrive here
+ * Called with rcu_read_lock();
+ */
+int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct rtable *rt = skb_rtable(skb);
+ struct mfc_cache *cache;
+ struct net_device *dev;
+ struct mr_table *mrt;
+ int vif;
+
+ guard(rcu)();
+
+ dev = dst_dev_rcu(&rt->dst);
+
+ if (IPCB(skb)->flags & IPSKB_FORWARDED)
+ goto mc_output;
+ if (!(IPCB(skb)->flags & IPSKB_MCROUTE))
+ goto mc_output;
+
+ skb->dev = dev;
+
+ mrt = ipmr_rt_fib_lookup(net, skb);
+ if (IS_ERR(mrt))
+ goto mc_output;
+
+ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
+ if (!cache) {
+ vif = ipmr_find_vif(mrt, dev);
+ if (vif >= 0)
+ cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
+ vif);
+ }
+
+ /* No usable cache entry */
+ if (!cache) {
+ vif = ipmr_find_vif(mrt, dev);
+ if (vif >= 0)
+ return ipmr_cache_unresolved(mrt, vif, skb, dev);
+ goto mc_output;
+ }
+
+ vif = cache->_c.mfc_parent;
+ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev)
+ goto mc_output;
+
+ ip_mr_output_finish(net, mrt, dev, skb, cache);
+ return 0;
+
+mc_output:
+ return ip_mc_output(net, sk, skb);
+}
+
#ifdef CONFIG_IP_PIMSM_V1
/* Handle IGMP messages of PIMv1 */
int pim_rcv_v1(struct sk_buff *skb)
@@ -2247,11 +2406,13 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
struct mr_table *mrt;
int err;
- mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
- if (!mrt)
+ rcu_read_lock();
+ mrt = __ipmr_get_table(net, RT_TABLE_DEFAULT);
+ if (!mrt) {
+ rcu_read_unlock();
return -ENOENT;
+ }
- rcu_read_lock();
cache = ipmr_cache_find(mrt, saddr, daddr);
if (!cache && skb->dev) {
int vif = ipmr_find_vif(mrt, skb->dev);
@@ -2266,17 +2427,15 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
int vif = -1;
dev = skb->dev;
- read_lock(&mrt_lock);
if (dev)
vif = ipmr_find_vif(mrt, dev);
if (vif < 0) {
- read_unlock(&mrt_lock);
rcu_read_unlock();
return -ENODEV;
}
- skb2 = skb_clone(skb, GFP_ATOMIC);
+
+ skb2 = skb_realloc_headroom(skb, sizeof(struct iphdr));
if (!skb2) {
- read_unlock(&mrt_lock);
rcu_read_unlock();
return -ENOMEM;
}
@@ -2290,14 +2449,11 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
iph->daddr = daddr;
iph->version = 0;
err = ipmr_cache_unresolved(mrt, vif, skb2, dev);
- read_unlock(&mrt_lock);
rcu_read_unlock();
return err;
}
- read_lock(&mrt_lock);
err = mr_fill_mroute(mrt, skb, &cache->_c, rtm);
- read_unlock(&mrt_lock);
rcu_read_unlock();
return err;
}
@@ -2397,8 +2553,7 @@ static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
errout:
kfree_skb(skb);
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
}
static size_t igmpmsg_netlink_msgsize(size_t payloadlen)
@@ -2409,6 +2564,7 @@ static size_t igmpmsg_netlink_msgsize(size_t payloadlen)
+ nla_total_size(4) /* IPMRA_CREPORT_VIF_ID */
+ nla_total_size(4) /* IPMRA_CREPORT_SRC_ADDR */
+ nla_total_size(4) /* IPMRA_CREPORT_DST_ADDR */
+ + nla_total_size(4) /* IPMRA_CREPORT_TABLE */
/* IPMRA_CREPORT_PKT */
+ nla_total_size(payloadlen)
;
@@ -2416,7 +2572,7 @@ static size_t igmpmsg_netlink_msgsize(size_t payloadlen)
return len;
}
-static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
+static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt)
{
struct net *net = read_pnet(&mrt->net);
struct nlmsghdr *nlh;
@@ -2440,11 +2596,12 @@ static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
rtgenm = nlmsg_data(nlh);
rtgenm->rtgen_family = RTNL_FAMILY_IPMR;
if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) ||
- nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif) ||
+ nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif | (msg->im_vif_hi << 8)) ||
nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR,
msg->im_src.s_addr) ||
nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR,
- msg->im_dst.s_addr))
+ msg->im_dst.s_addr) ||
+ nla_put_u32(skb, IPMRA_CREPORT_TABLE, mrt->id))
goto nla_put_failure;
nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen);
@@ -2464,6 +2621,61 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE_R, -ENOBUFS);
}
+static int ipmr_rtm_valid_getroute_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct rtmsg *rtm;
+ int i, err;
+
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for multicast route get request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+
+ if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
+ (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
+ rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol ||
+ rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for multicast route get request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+ if (err)
+ return err;
+
+ if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
+ (tb[RTA_DST] && !rtm->rtm_dst_len)) {
+ NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
+ return -EINVAL;
+ }
+
+ for (i = 0; i <= RTA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case RTA_SRC:
+ case RTA_DST:
+ case RTA_TABLE:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in multicast route get request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -2472,23 +2684,19 @@ static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct sk_buff *skb = NULL;
struct mfc_cache *cache;
struct mr_table *mrt;
- struct rtmsg *rtm;
__be32 src, grp;
u32 tableid;
int err;
- err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
- rtm_ipv4_policy, extack);
+ err = ipmr_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
if (err < 0)
goto errout;
- rtm = nlmsg_data(nlh);
-
- src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
- grp = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
- tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0;
+ src = nla_get_in_addr_default(tb[RTA_SRC], 0);
+ grp = nla_get_in_addr_default(tb[RTA_DST], 0);
+ tableid = nla_get_u32_default(tb[RTA_TABLE], 0);
- mrt = ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT);
+ mrt = __ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT);
if (!mrt) {
err = -ENOENT;
goto errout_free;
@@ -2527,8 +2735,36 @@ errout_free:
static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct fib_dump_filter filter = {
+ .rtnl_held = true,
+ };
+ int err;
+
+ if (cb->strict_check) {
+ err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh,
+ &filter, cb);
+ if (err < 0)
+ return err;
+ }
+
+ if (filter.table_id) {
+ struct mr_table *mrt;
+
+ mrt = __ipmr_get_table(sock_net(skb->sk), filter.table_id);
+ if (!mrt) {
+ if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IPMR)
+ return skb->len;
+
+ NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist");
+ return -ENOENT;
+ }
+ err = mr_table_dump(mrt, skb, cb, _ipmr_fill_mroute,
+ &mfc_unres_lock, &filter);
+ return skb->len ? : err;
+ }
+
return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter,
- _ipmr_fill_mroute, &mfc_unres_lock);
+ _ipmr_fill_mroute, &mfc_unres_lock, &filter);
}
static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
@@ -2577,8 +2813,8 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh,
struct rtmsg *rtm;
int ret, rem;
- ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy,
- extack);
+ ret = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX,
+ rtm_ipmr_policy, extack);
if (ret < 0)
goto out;
rtm = nlmsg_data(nlh);
@@ -2622,7 +2858,7 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh,
break;
}
}
- mrt = ipmr_get_table(net, tblid);
+ mrt = __ipmr_get_table(net, tblid);
if (!mrt) {
ret = -ENOENT;
goto out;
@@ -2678,18 +2914,21 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
{
+ struct net_device *vif_dev;
struct nlattr *vif_nest;
struct vif_device *vif;
+ vif = &mrt->vif_table[vifid];
+ vif_dev = rtnl_dereference(vif->dev);
/* if the VIF doesn't exist just continue */
- if (!VIF_EXISTS(mrt, vifid))
+ if (!vif_dev)
return true;
- vif = &mrt->vif_table[vifid];
- vif_nest = nla_nest_start(skb, IPMRA_VIF);
+ vif_nest = nla_nest_start_noflag(skb, IPMRA_VIF);
if (!vif_nest)
return false;
- if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif->dev->ifindex) ||
+
+ if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif_dev->ifindex) ||
nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) ||
nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) ||
nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in,
@@ -2710,6 +2949,31 @@ static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
return true;
}
+static int ipmr_valid_dumplink(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct ifinfomsg *ifm;
+
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid header for ipmr link dump");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ifm))) {
+ NL_SET_ERR_MSG(extack, "Invalid data after header in ipmr link dump");
+ return -EINVAL;
+ }
+
+ if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
+ ifm->ifi_change || ifm->ifi_index) {
+ NL_SET_ERR_MSG(extack, "Invalid values in header for ipmr link dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
@@ -2718,6 +2982,13 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
unsigned int e = 0, s_e;
struct mr_table *mrt;
+ if (cb->strict_check) {
+ int err = ipmr_valid_dumplink(cb->nlh, cb->extack);
+
+ if (err < 0)
+ return err;
+ }
+
s_t = cb->args[0];
s_e = cb->args[1];
@@ -2738,7 +3009,7 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
memset(hdr, 0, sizeof(*hdr));
hdr->ifi_family = RTNL_FAMILY_IPMR;
- af = nla_nest_start(skb, IFLA_AF_SPEC);
+ af = nla_nest_start_noflag(skb, IFLA_AF_SPEC);
if (!af) {
nlmsg_cancel(skb, nlh);
goto out;
@@ -2749,7 +3020,7 @@ static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
goto out;
}
- vifs = nla_nest_start(skb, IPMRA_TABLE_VIFS);
+ vifs = nla_nest_start_noflag(skb, IPMRA_TABLE_VIFS);
if (!vifs) {
nla_nest_end(skb, af);
nlmsg_end(skb, nlh);
@@ -2789,26 +3060,28 @@ out:
*/
static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(mrt_lock)
+ __acquires(RCU)
{
struct mr_vif_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
struct mr_table *mrt;
- mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
- if (!mrt)
+ rcu_read_lock();
+ mrt = __ipmr_get_table(net, RT_TABLE_DEFAULT);
+ if (!mrt) {
+ rcu_read_unlock();
return ERR_PTR(-ENOENT);
+ }
iter->mrt = mrt;
- read_lock(&mrt_lock);
return mr_vif_seq_start(seq, pos);
}
static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
- __releases(mrt_lock)
+ __releases(RCU)
{
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
}
static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
@@ -2821,9 +3094,11 @@ static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
"Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n");
} else {
const struct vif_device *vif = v;
- const char *name = vif->dev ?
- vif->dev->name : "none";
+ const struct net_device *vif_dev;
+ const char *name;
+ vif_dev = vif_dev_read(vif);
+ name = vif_dev ? vif_dev->name : "none";
seq_printf(seq,
"%2td %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n",
vif - mrt->vif_table,
@@ -2872,9 +3147,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
if (it->cache != &mrt->mfc_unres_queue) {
seq_printf(seq, " %8lu %8lu %8lu",
- mfc->_c.mfc_un.res.pkt,
- mfc->_c.mfc_un.res.bytes,
- mfc->_c.mfc_un.res.wrong_if);
+ atomic_long_read(&mfc->_c.mfc_un.res.pkt),
+ atomic_long_read(&mfc->_c.mfc_un.res.bytes),
+ atomic_long_read(&mfc->_c.mfc_un.res.wrong_if));
for (n = mfc->_c.mfc_un.res.minvif;
n < mfc->_c.mfc_un.res.maxvif; n++) {
if (VIF_EXISTS(mrt, n) &&
@@ -2905,21 +3180,19 @@ static const struct seq_operations ipmr_mfc_seq_ops = {
#ifdef CONFIG_IP_PIMSM_V2
static const struct net_protocol pim_protocol = {
.handler = pim_rcv,
- .netns_ok = 1,
};
#endif
-static unsigned int ipmr_seq_read(struct net *net)
+static unsigned int ipmr_seq_read(const struct net *net)
{
- ASSERT_RTNL();
-
- return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net);
+ return READ_ONCE(net->ipv4.ipmr_seq) + ipmr_rules_seq_read(net);
}
-static int ipmr_dump(struct net *net, struct notifier_block *nb)
+static int ipmr_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
return mr_dump(net, nb, RTNL_FAMILY_IPMR, ipmr_rules_dump,
- ipmr_mr_table_iter, &mrt_lock);
+ ipmr_mr_table_iter, extack);
}
static const struct fib_notifier_ops ipmr_notifier_ops_template = {
@@ -2977,7 +3250,9 @@ static int __net_init ipmr_net_init(struct net *net)
proc_cache_fail:
remove_proc_entry("ip_mr_vif", net->proc_net);
proc_vif_fail:
+ rtnl_lock();
ipmr_rules_exit(net);
+ rtnl_unlock();
#endif
ipmr_rules_fail:
ipmr_notifier_exit(net);
@@ -2992,22 +3267,40 @@ static void __net_exit ipmr_net_exit(struct net *net)
remove_proc_entry("ip_mr_vif", net->proc_net);
#endif
ipmr_notifier_exit(net);
- ipmr_rules_exit(net);
+}
+
+static void __net_exit ipmr_net_exit_batch(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list)
+ ipmr_rules_exit(net);
+ rtnl_unlock();
}
static struct pernet_operations ipmr_net_ops = {
.init = ipmr_net_init,
.exit = ipmr_net_exit,
+ .exit_batch = ipmr_net_exit_batch,
+};
+
+static const struct rtnl_msg_handler ipmr_rtnl_msg_handlers[] __initconst = {
+ {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETLINK,
+ .dumpit = ipmr_rtm_dumplink},
+ {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_NEWROUTE,
+ .doit = ipmr_rtm_route},
+ {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_DELROUTE,
+ .doit = ipmr_rtm_route},
+ {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETROUTE,
+ .doit = ipmr_rtm_getroute, .dumpit = ipmr_rtm_dumproute},
};
int __init ip_mr_init(void)
{
int err;
- mrt_cachep = kmem_cache_create("ip_mrt_cache",
- sizeof(struct mfc_cache),
- 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
- NULL);
+ mrt_cachep = KMEM_CACHE(mfc_cache, SLAB_HWCACHE_ALIGN | SLAB_PANIC);
err = register_pernet_subsys(&ipmr_net_ops);
if (err)
@@ -3023,15 +3316,8 @@ int __init ip_mr_init(void)
goto add_proto_fail;
}
#endif
- rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE,
- ipmr_rtm_getroute, ipmr_rtm_dumproute, 0);
- rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE,
- ipmr_rtm_route, NULL, 0);
- rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE,
- ipmr_rtm_route, NULL, 0);
-
- rtnl_register(RTNL_FAMILY_IPMR, RTM_GETLINK,
- NULL, ipmr_rtm_dumplink, 0);
+ rtnl_register_many(ipmr_rtnl_msg_handlers);
+
return 0;
#ifdef CONFIG_IP_PIMSM_V2
diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c
index 1ad9aa62a97b..28d77d454d44 100644
--- a/net/ipv4/ipmr_base.c
+++ b/net/ipv4/ipmr_base.c
@@ -13,7 +13,7 @@ void vif_device_init(struct vif_device *v,
unsigned short flags,
unsigned short get_iflink_mask)
{
- v->dev = NULL;
+ RCU_INIT_POINTER(v->dev, NULL);
v->bytes_in = 0;
v->bytes_out = 0;
v->pkt_in = 0;
@@ -208,6 +208,7 @@ EXPORT_SYMBOL(mr_mfc_seq_next);
int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
struct mr_mfc *c, struct rtmsg *rtm)
{
+ struct net_device *vif_dev;
struct rta_mfc_stats mfcs;
struct nlattr *mp_attr;
struct rtnexthop *nhp;
@@ -220,44 +221,51 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
return -ENOENT;
}
- if (VIF_EXISTS(mrt, c->mfc_parent) &&
- nla_put_u32(skb, RTA_IIF,
- mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
+ rcu_read_lock();
+ vif_dev = rcu_dereference(mrt->vif_table[c->mfc_parent].dev);
+ if (vif_dev && nla_put_u32(skb, RTA_IIF, vif_dev->ifindex) < 0) {
+ rcu_read_unlock();
return -EMSGSIZE;
+ }
+ rcu_read_unlock();
if (c->mfc_flags & MFC_OFFLOAD)
rtm->rtm_flags |= RTNH_F_OFFLOAD;
- mp_attr = nla_nest_start(skb, RTA_MULTIPATH);
+ mp_attr = nla_nest_start_noflag(skb, RTA_MULTIPATH);
if (!mp_attr)
return -EMSGSIZE;
+ rcu_read_lock();
for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
- if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
- struct vif_device *vif;
+ struct vif_device *vif = &mrt->vif_table[ct];
+
+ vif_dev = rcu_dereference(vif->dev);
+ if (vif_dev && c->mfc_un.res.ttls[ct] < 255) {
nhp = nla_reserve_nohdr(skb, sizeof(*nhp));
if (!nhp) {
+ rcu_read_unlock();
nla_nest_cancel(skb, mp_attr);
return -EMSGSIZE;
}
nhp->rtnh_flags = 0;
nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
- vif = &mrt->vif_table[ct];
- nhp->rtnh_ifindex = vif->dev->ifindex;
+ nhp->rtnh_ifindex = vif_dev->ifindex;
nhp->rtnh_len = sizeof(*nhp);
}
}
+ rcu_read_unlock();
nla_nest_end(skb, mp_attr);
lastuse = READ_ONCE(c->mfc_un.res.lastuse);
lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;
- mfcs.mfcs_packets = c->mfc_un.res.pkt;
- mfcs.mfcs_bytes = c->mfc_un.res.bytes;
- mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
+ mfcs.mfcs_packets = atomic_long_read(&c->mfc_un.res.pkt);
+ mfcs.mfcs_bytes = atomic_long_read(&c->mfc_un.res.bytes);
+ mfcs.mfcs_wrong_if = atomic_long_read(&c->mfc_un.res.wrong_if);
if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
RTA_PAD))
@@ -268,6 +276,78 @@ int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
}
EXPORT_SYMBOL(mr_fill_mroute);
+static bool mr_mfc_uses_dev(const struct mr_table *mrt,
+ const struct mr_mfc *c,
+ const struct net_device *dev)
+{
+ int ct;
+
+ for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+ const struct net_device *vif_dev;
+ const struct vif_device *vif;
+
+ vif = &mrt->vif_table[ct];
+ vif_dev = rcu_access_pointer(vif->dev);
+ if (vif_dev && c->mfc_un.res.ttls[ct] < 255 &&
+ vif_dev == dev)
+ return true;
+ }
+ return false;
+}
+
+int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ int (*fill)(struct mr_table *mrt, struct sk_buff *skb,
+ u32 portid, u32 seq, struct mr_mfc *c,
+ int cmd, int flags),
+ spinlock_t *lock, struct fib_dump_filter *filter)
+{
+ unsigned int e = 0, s_e = cb->args[1];
+ unsigned int flags = NLM_F_MULTI;
+ struct mr_mfc *mfc;
+ int err;
+
+ if (filter->filter_set)
+ flags |= NLM_F_DUMP_FILTERED;
+
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list,
+ lockdep_rtnl_is_held()) {
+ if (e < s_e)
+ goto next_entry;
+ if (filter->dev &&
+ !mr_mfc_uses_dev(mrt, mfc, filter->dev))
+ goto next_entry;
+
+ err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
+ if (err < 0)
+ goto out;
+next_entry:
+ e++;
+ }
+
+ spin_lock_bh(lock);
+ list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
+ if (e < s_e)
+ goto next_entry2;
+
+ err = fill(mrt, skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags);
+ if (err < 0) {
+ spin_unlock_bh(lock);
+ goto out;
+ }
+next_entry2:
+ e++;
+ }
+ spin_unlock_bh(lock);
+ err = 0;
+out:
+ cb->args[1] = e;
+ return err;
+}
+EXPORT_SYMBOL(mr_table_dump);
+
int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
struct mr_table *(*iter)(struct net *net,
struct mr_table *mrt),
@@ -275,53 +355,36 @@ int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb,
struct sk_buff *skb,
u32 portid, u32 seq, struct mr_mfc *c,
int cmd, int flags),
- spinlock_t *lock)
+ spinlock_t *lock, struct fib_dump_filter *filter)
{
- unsigned int t = 0, e = 0, s_t = cb->args[0], s_e = cb->args[1];
+ unsigned int t = 0, s_t = cb->args[0];
struct net *net = sock_net(skb->sk);
struct mr_table *mrt;
- struct mr_mfc *mfc;
+ int err;
+
+ /* multicast does not track protocol or have route type other
+ * than RTN_MULTICAST
+ */
+ if (filter->filter_set) {
+ if (filter->protocol || filter->flags ||
+ (filter->rt_type && filter->rt_type != RTN_MULTICAST))
+ return skb->len;
+ }
rcu_read_lock();
for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) {
if (t < s_t)
goto next_table;
- list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
- if (e < s_e)
- goto next_entry;
- if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, mfc,
- RTM_NEWROUTE, NLM_F_MULTI) < 0)
- goto done;
-next_entry:
- e++;
- }
- e = 0;
- s_e = 0;
-
- spin_lock_bh(lock);
- list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
- if (e < s_e)
- goto next_entry2;
- if (fill(mrt, skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, mfc,
- RTM_NEWROUTE, NLM_F_MULTI) < 0) {
- spin_unlock_bh(lock);
- goto done;
- }
-next_entry2:
- e++;
- }
- spin_unlock_bh(lock);
- e = 0;
- s_e = 0;
+
+ err = mr_table_dump(mrt, skb, cb, fill, lock, filter);
+ if (err < 0)
+ break;
+ cb->args[1] = 0;
next_table:
t++;
}
-done:
rcu_read_unlock();
- cb->args[1] = e;
cb->args[0] = t;
return skb->len;
@@ -330,40 +393,52 @@ EXPORT_SYMBOL(mr_rtm_dumproute);
int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
int (*rules_dump)(struct net *net,
- struct notifier_block *nb),
+ struct notifier_block *nb,
+ struct netlink_ext_ack *extack),
struct mr_table *(*mr_iter)(struct net *net,
struct mr_table *mrt),
- rwlock_t *mrt_lock)
+ struct netlink_ext_ack *extack)
{
struct mr_table *mrt;
int err;
- err = rules_dump(net, nb);
+ err = rules_dump(net, nb, extack);
if (err)
return err;
for (mrt = mr_iter(net, NULL); mrt; mrt = mr_iter(net, mrt)) {
struct vif_device *v = &mrt->vif_table[0];
+ struct net_device *vif_dev;
struct mr_mfc *mfc;
int vifi;
/* Notifiy on table VIF entries */
- read_lock(mrt_lock);
+ rcu_read_lock();
for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
- if (!v->dev)
+ vif_dev = rcu_dereference(v->dev);
+ if (!vif_dev)
continue;
- mr_call_vif_notifier(nb, net, family,
- FIB_EVENT_VIF_ADD,
- v, vifi, mrt->id);
+ err = mr_call_vif_notifier(nb, family,
+ FIB_EVENT_VIF_ADD, v,
+ vif_dev, vifi,
+ mrt->id, extack);
+ if (err)
+ break;
}
- read_unlock(mrt_lock);
+ rcu_read_unlock();
+
+ if (err)
+ return err;
/* Notify on table MFC entries */
- list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
- mr_call_mfc_notifier(nb, net, family,
- FIB_EVENT_ENTRY_ADD,
- mfc, mrt->id);
+ list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
+ err = mr_call_mfc_notifier(nb, family,
+ FIB_EVENT_ENTRY_ADD,
+ mfc, mrt->id, extack);
+ if (err)
+ return err;
+ }
}
return 0;
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 04311f7067e2..8ddac1f595ed 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -1,39 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/netlink.h>
+#include <linux/nospec.h>
#include <linux/rtnetlink.h>
#include <linux/types.h>
#include <net/ip.h>
#include <net/net_namespace.h>
#include <net/tcp.h>
-int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
- u32 *metrics)
+static int ip_metrics_convert(struct nlattr *fc_mx,
+ int fc_mx_len, u32 *metrics,
+ struct netlink_ext_ack *extack)
{
bool ecn_ca = false;
struct nlattr *nla;
int remaining;
- if (!fc_mx)
- return 0;
-
nla_for_each_attr(nla, fc_mx, fc_mx_len, remaining) {
int type = nla_type(nla);
u32 val;
if (!type)
continue;
- if (type > RTAX_MAX)
+ if (type > RTAX_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid metric type");
return -EINVAL;
+ }
+ type = array_index_nospec(type, RTAX_MAX + 1);
if (type == RTAX_CC_ALGO) {
char tmp[TCP_CA_NAME_MAX];
- nla_strlcpy(tmp, nla, sizeof(tmp));
- val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
- if (val == TCP_CA_UNSPEC)
+ nla_strscpy(tmp, nla, sizeof(tmp));
+ val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
+ if (val == TCP_CA_UNSPEC) {
+ NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm");
return -EINVAL;
+ }
} else {
- if (nla_len(nla) != sizeof(u32))
+ if (nla_len(nla) != sizeof(u32)) {
+ NL_SET_ERR_MSG_ATTR(extack, nla,
+ "Invalid attribute in metrics");
return -EINVAL;
+ }
val = nla_get_u32(nla);
}
if (type == RTAX_ADVMSS && val > 65535 - 40)
@@ -42,8 +50,10 @@ int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
val = 65535 - 15;
if (type == RTAX_HOPLIMIT && val > 255)
val = 255;
- if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
+ if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) {
+ NL_SET_ERR_MSG(extack, "Unknown flag set in feature mask in metrics attribute");
return -EINVAL;
+ }
metrics[type - 1] = val;
}
@@ -52,4 +62,30 @@ int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, int fc_mx_len,
return 0;
}
-EXPORT_SYMBOL_GPL(ip_metrics_convert);
+/* ip_fib_metrics_init - build a dst_metrics object from netlink metric
+ * attributes.
+ *
+ * With no attribute block (@fc_mx is NULL) nothing is allocated and the
+ * address of dst_default_metrics is returned. Otherwise a zeroed
+ * struct dst_metrics is allocated with GFP_KERNEL, ip_metrics_convert()
+ * fills its metrics array from @fc_mx / @fc_mx_len (reporting problems
+ * through @extack), and on success the reference count is set to 1.
+ *
+ * Returns a valid pointer on success, ERR_PTR(-ENOMEM) when the
+ * allocation fails, or ERR_PTR() of the ip_metrics_convert() error after
+ * freeing the allocation. NULL is never returned, so callers need
+ * IS_ERR() rather than a NULL test.
+ */
+struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx,
+ int fc_mx_len,
+ struct netlink_ext_ack *extack)
+{
+ struct dst_metrics *fib_metrics;
+ int err;
+
+ if (!fc_mx) /* no metrics supplied: hand back the shared default object */
+ return (struct dst_metrics *)&dst_default_metrics;
+
+ fib_metrics = kzalloc(sizeof(*fib_metrics), GFP_KERNEL);
+ if (unlikely(!fib_metrics))
+ return ERR_PTR(-ENOMEM);
+
+ err = ip_metrics_convert(fc_mx, fc_mx_len, fib_metrics->metrics,
+ extack);
+ if (!err) {
+ refcount_set(&fib_metrics->refcnt, 1);
+ } else {
+ kfree(fib_metrics); /* conversion failed: release and report the error */
+ fib_metrics = ERR_PTR(err);
+ }
+
+ return fib_metrics;
+}
+EXPORT_SYMBOL_GPL(ip_fib_metrics_init);
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 8d2e5dc9a827..ce310eb779e0 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -11,23 +11,27 @@
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <linux/export.h>
+#include <net/flow.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <net/ip.h>
#include <net/netfilter/nf_queue.h>
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
-int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_type)
+int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int addr_type)
{
+ struct net_device *dev = skb_dst_dev(skb);
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
struct flowi4 fl4 = {};
__be32 saddr = iph->saddr;
- const struct sock *sk = skb_to_full_sk(skb);
- __u8 flags = sk ? inet_sk_flowi_flags(sk) : 0;
- struct net_device *dev = skb_dst(skb)->dev;
+ __u8 flags;
+ struct flow_keys flkeys;
unsigned int hh_len;
+ sk = sk_to_full_sk(sk);
+ flags = sk ? inet_sk_flowi_flags(sk) : 0;
+
if (addr_type == RTN_UNSPEC)
addr_type = inet_addr_type_dev_table(net, dev, saddr);
if (addr_type == RTN_LOCAL || addr_type == RTN_UNICAST)
@@ -40,12 +44,12 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
*/
fl4.daddr = iph->daddr;
fl4.saddr = saddr;
- fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_dscp = ip4h_dscp(iph);
fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
- if (!fl4.flowi4_oif)
- fl4.flowi4_oif = l3mdev_master_ifindex(dev);
+ fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev);
fl4.flowi4_mark = skb->mark;
fl4.flowi4_flags = flags;
+ fib4_rules_early_flow_dissect(net, skb, &fl4, &flkeys);
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt))
return PTR_ERR(rt);
@@ -59,9 +63,12 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
#ifdef CONFIG_XFRM
if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
- xfrm_decode_session(skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
+ xfrm_decode_session(net, skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
struct dst_entry *dst = skb_dst(skb);
- skb_dst_set(skb, NULL);
+ /* ignore return value from skb_dstref_steal, xfrm_lookup takes
+ * care of dropping the refcnt if needed.
+ */
+ skb_dstref_steal(skb);
dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
@@ -70,7 +77,7 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
#endif
/* Change in oif may mean change in hh_len. */
- hh_len = skb_dst(skb)->dev->hard_header_len;
+ hh_len = skb_dst_dev(skb)->hard_header_len;
if (skb_headroom(skb) < hh_len &&
pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)),
0, GFP_ATOMIC))
@@ -80,24 +87,6 @@ int ip_route_me_harder(struct net *net, struct sk_buff *skb, unsigned int addr_t
}
EXPORT_SYMBOL(ip_route_me_harder);
-int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
-{
- const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry);
-
- if (entry->state.hook == NF_INET_LOCAL_OUT) {
- const struct iphdr *iph = ip_hdr(skb);
-
- if (!(iph->tos == rt_info->tos &&
- skb->mark == rt_info->mark &&
- iph->daddr == rt_info->daddr &&
- iph->saddr == rt_info->saddr))
- return ip_route_me_harder(entry->state.net, skb,
- RTN_UNSPEC);
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(nf_ip_reroute);
-
int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
bool strict __always_unused)
{
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 184bf2e0a1ed..7dc9772fe2d8 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# IP netfilter configuration
#
@@ -9,6 +10,17 @@ config NF_DEFRAG_IPV4
tristate
default n
+# old sockopt interface and eval loop
+config IP_NF_IPTABLES_LEGACY
+ tristate "Legacy IP tables support"
+ depends on NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
+ default m if NETFILTER_XTABLES_LEGACY
+ help
+ iptables is a legacy packet classifier.
+ This is not needed if you are using iptables over nftables
+ (iptables-nft).
+
config NF_SOCKET_IPV4
tristate "IPv4 socket lookup support"
help
@@ -27,14 +39,6 @@ config NF_TABLES_IPV4
if NF_TABLES_IPV4
-config NFT_CHAIN_ROUTE_IPV4
- tristate "IPv4 nf_tables route chain support"
- help
- This option enables the "route" chain for IPv4 in nf_tables. This
- chain type is used to force packet re-routing after mangling header
- fields such as the source, destination, type of service and
- the packet mark.
-
config NFT_REJECT_IPV4
select NF_REJECT_IPV4
default NFT_REJECT
@@ -65,14 +69,6 @@ config NF_TABLES_ARP
endif # NF_TABLES
-config NF_FLOW_TABLE_IPV4
- tristate "Netfilter flow table IPv4 module"
- depends on NF_FLOW_TABLE
- help
- This option adds the flow table IPv4 support.
-
- To compile it as a module, choose M here.
-
config NF_DUP_IPV4
tristate "Netfilter IPv4 packet duplication to alternate destination"
depends on !NF_CONNTRACK || NF_CONNTRACK
@@ -83,68 +79,31 @@ config NF_DUP_IPV4
config NF_LOG_ARP
tristate "ARP packet logging"
default m if NETFILTER_ADVANCED=n
- select NF_LOG_COMMON
+ select NF_LOG_SYSLOG
+ help
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects CONFIG_NF_LOG_SYSLOG.
config NF_LOG_IPV4
tristate "IPv4 packet logging"
default m if NETFILTER_ADVANCED=n
- select NF_LOG_COMMON
+ select NF_LOG_SYSLOG
+ help
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects CONFIG_NF_LOG_SYSLOG.
config NF_REJECT_IPV4
tristate "IPv4 packet rejection"
default m if NETFILTER_ADVANCED=n
-config NF_NAT_IPV4
- tristate "IPv4 NAT"
- depends on NF_CONNTRACK
- default m if NETFILTER_ADVANCED=n
- select NF_NAT
- help
- The IPv4 NAT option allows masquerading, port forwarding and other
- forms of full Network Address Port Translation. This can be
- controlled by iptables or nft.
-
-if NF_NAT_IPV4
-
-config NF_NAT_MASQUERADE_IPV4
- bool
-
-if NF_TABLES
-config NFT_CHAIN_NAT_IPV4
- depends on NF_TABLES_IPV4
- tristate "IPv4 nf_tables nat chain support"
- help
- This option enables the "nat" chain for IPv4 in nf_tables. This
- chain type is used to perform Network Address Translation (NAT)
- packet transformations such as the source, destination address and
- source and destination ports.
-
-config NFT_MASQ_IPV4
- tristate "IPv4 masquerading support for nf_tables"
- depends on NF_TABLES_IPV4
- depends on NFT_MASQ
- select NF_NAT_MASQUERADE_IPV4
- help
- This is the expression that provides IPv4 masquerading support for
- nf_tables.
-
-config NFT_REDIR_IPV4
- tristate "IPv4 redirect support for nf_tables"
- depends on NF_TABLES_IPV4
- depends on NFT_REDIR
- select NF_NAT_REDIRECT
- help
- This is the expression that provides IPv4 redirect support for
- nf_tables.
-endif # NF_TABLES
-
+if NF_NAT
config NF_NAT_SNMP_BASIC
tristate "Basic SNMP-ALG support"
depends on NF_CONNTRACK_SNMP
depends on NETFILTER_ADVANCED
default NF_NAT && NF_CONNTRACK_SNMP
select ASN1
- ---help---
+ help
This module implements an Application Layer Gateway (ALG) for
SNMP payloads. In conjunction with NAT, it allows a network
@@ -156,22 +115,17 @@ config NF_NAT_SNMP_BASIC
To compile it as a module, choose M here. If unsure, say N.
-config NF_NAT_PROTO_GRE
- tristate
- depends on NF_CT_PROTO_GRE
-
config NF_NAT_PPTP
tristate
depends on NF_CONNTRACK
default NF_CONNTRACK_PPTP
- select NF_NAT_PROTO_GRE
config NF_NAT_H323
tristate
depends on NF_CONNTRACK
default NF_CONNTRACK_H323
-endif # NF_NAT_IPV4
+endif # NF_NAT
config IP_NF_IPTABLES
tristate "IP tables support (required for filtering/masq/NAT)"
@@ -201,7 +155,7 @@ config IP_NF_MATCH_ECN
tristate '"ecn" match support'
depends on NETFILTER_ADVANCED
select NETFILTER_XT_MATCH_ECN
- ---help---
+ help
This is a backwards-compat option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_MATCH_ECN.
@@ -209,8 +163,8 @@ config IP_NF_MATCH_ECN
config IP_NF_MATCH_RPFILTER
tristate '"rpfilter" reverse path filter match support'
depends on NETFILTER_ADVANCED
- depends on IP_NF_MANGLE || IP_NF_RAW
- ---help---
+ depends on IP_NF_MANGLE || IP_NF_RAW || NFT_COMPAT
+ help
This option allows you to match packets whose replies would
go out via the interface the packet came in.
@@ -221,7 +175,7 @@ config IP_NF_MATCH_TTL
tristate '"ttl" match support'
depends on NETFILTER_ADVANCED
select NETFILTER_XT_MATCH_HL
- ---help---
+ help
This is a backwards-compat option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_MATCH_HL.
@@ -229,7 +183,8 @@ config IP_NF_MATCH_TTL
# `filter', generic and specific targets
config IP_NF_FILTER
tristate "Packet filtering"
- default m if NETFILTER_ADVANCED=n
+ default m if NETFILTER_ADVANCED=n || IP_NF_IPTABLES_LEGACY
+ depends on IP_NF_IPTABLES_LEGACY
help
Packet filtering defines a table `filter', which has a series of
rules for simple packet filtering at local input, forwarding and
@@ -239,7 +194,7 @@ config IP_NF_FILTER
config IP_NF_TARGET_REJECT
tristate "REJECT target support"
- depends on IP_NF_FILTER
+ depends on IP_NF_FILTER || NFT_COMPAT
select NF_REJECT_IPV4
default m if NETFILTER_ADVANCED=n
help
@@ -266,9 +221,9 @@ config IP_NF_TARGET_SYNPROXY
config IP_NF_NAT
tristate "iptables NAT support"
depends on NF_CONNTRACK
+ depends on IP_NF_IPTABLES_LEGACY
default m if NETFILTER_ADVANCED=n
select NF_NAT
- select NF_NAT_IPV4
select NETFILTER_XT_NAT
help
This enables the `nat' table in iptables. This allows masquerading,
@@ -281,22 +236,16 @@ if IP_NF_NAT
config IP_NF_TARGET_MASQUERADE
tristate "MASQUERADE target support"
- select NF_NAT_MASQUERADE_IPV4
- default m if NETFILTER_ADVANCED=n
+ select NETFILTER_XT_TARGET_MASQUERADE
help
- Masquerading is a special case of NAT: all outgoing connections are
- changed to seem to come from a particular interface's address, and
- if the interface goes down, those connections are lost. This is
- only useful for dialup accounts with dynamic IP address (ie. your IP
- address will be different on next dialup).
-
- To compile it as a module, choose M here. If unsure, say N.
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects NETFILTER_XT_TARGET_MASQUERADE.
config IP_NF_TARGET_NETMAP
tristate "NETMAP target support"
depends on NETFILTER_ADVANCED
select NETFILTER_XT_TARGET_NETMAP
- ---help---
+ help
This is a backwards-compat option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_TARGET_NETMAP.
@@ -305,7 +254,7 @@ config IP_NF_TARGET_REDIRECT
tristate "REDIRECT target support"
depends on NETFILTER_ADVANCED
select NETFILTER_XT_TARGET_REDIRECT
- ---help---
+ help
This is a backwards-compat option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_TARGET_REDIRECT.
@@ -315,7 +264,8 @@ endif # IP_NF_NAT
# mangle + specific targets
config IP_NF_MANGLE
tristate "Packet mangling"
- default m if NETFILTER_ADVANCED=n
+ default m if NETFILTER_ADVANCED=n || IP_NF_IPTABLES_LEGACY
+ depends on IP_NF_IPTABLES_LEGACY
help
This option adds a `mangle' table to iptables: see the man page for
iptables(8). This table is used for various packet alterations
@@ -323,27 +273,13 @@ config IP_NF_MANGLE
To compile it as a module, choose M here. If unsure, say N.
-config IP_NF_TARGET_CLUSTERIP
- tristate "CLUSTERIP target support"
- depends on IP_NF_MANGLE
- depends on NF_CONNTRACK
- depends on NETFILTER_ADVANCED
- select NF_CONNTRACK_MARK
- select NETFILTER_FAMILY_ARP
- help
- The CLUSTERIP target allows you to build load-balancing clusters of
- network servers without having a dedicated load-balancing
- router/server/switch.
-
- To compile it as a module, choose M here. If unsure, say N.
-
config IP_NF_TARGET_ECN
tristate "ECN target support"
- depends on IP_NF_MANGLE
+ depends on IP_NF_MANGLE || NFT_COMPAT
depends on NETFILTER_ADVANCED
- ---help---
+ help
This option adds a `ECN' target, which can be used in the iptables mangle
- table.
+ table.
You can use this target to remove the ECN bits from the IPv4 header of
an IP packet. This is particularly useful, if you need to work around
@@ -356,7 +292,7 @@ config IP_NF_TARGET_TTL
tristate '"TTL" target support'
depends on NETFILTER_ADVANCED && IP_NF_MANGLE
select NETFILTER_XT_TARGET_HL
- ---help---
+ help
This is a backwards-compatible option for the user's convenience
(e.g. when running oldconfig). It selects
CONFIG_NETFILTER_XT_TARGET_HL.
@@ -364,59 +300,69 @@ config IP_NF_TARGET_TTL
# raw + specific targets
config IP_NF_RAW
tristate 'raw table support (required for NOTRACK/TRACE)'
+ depends on IP_NF_IPTABLES_LEGACY
help
This option adds a `raw' table to iptables. This table is the very
first in the netfilter framework and hooks in at the PREROUTING
and OUTPUT chains.
-
+
If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
# security table for MAC policy
config IP_NF_SECURITY
tristate "Security table"
depends on SECURITY
depends on NETFILTER_ADVANCED
+ depends on IP_NF_IPTABLES_LEGACY
help
This option adds a `security' table to iptables, for use
with Mandatory Access Control (MAC) policy.
-
+
If unsure, say N.
endif # IP_NF_IPTABLES
# ARP tables
config IP_NF_ARPTABLES
- tristate "ARP tables support"
- select NETFILTER_XTABLES
- select NETFILTER_FAMILY_ARP
- depends on NETFILTER_ADVANCED
+ tristate "Legacy ARPTABLES support"
+ depends on NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
+ default n
help
- arptables is a general, extensible packet identification framework.
- The ARP packet filtering and mangling (manipulation)subsystems
- use this: say Y or M here if you want to use either of those.
-
- To compile it as a module, choose M here. If unsure, say N.
+ arptables is a legacy packet classifier.
+ This is not needed if you are using arptables over nftables
+ (iptables-nft).
-if IP_NF_ARPTABLES
+config NFT_COMPAT_ARP
+ tristate
+ depends on NF_TABLES_ARP && NFT_COMPAT
+ default m if NFT_COMPAT=m
+ default y if NFT_COMPAT=y
config IP_NF_ARPFILTER
- tristate "ARP packet filtering"
+ tristate "arptables-legacy packet filtering support"
+ select IP_NF_ARPTABLES
+ select NETFILTER_FAMILY_ARP
+ depends on NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
help
ARP packet filtering defines a table `filter', which has a series of
rules for simple ARP packet filtering at local input and
- local output. On a bridge, you can also specify filtering rules
- for forwarded ARP packets. See the man page for arptables(8).
+ local output. This is only needed for arptables-legacy(8).
+ Neither arptables-nft nor nftables need this to work.
To compile it as a module, choose M here. If unsure, say N.
config IP_NF_ARP_MANGLE
tristate "ARP payload mangling"
+ depends on IP_NF_ARPTABLES || NFT_COMPAT_ARP
help
Allows altering the ARP packet payload: source and destination
hardware and network addresses.
-endif # IP_NF_ARPTABLES
+ This option is needed by both arptables-legacy and arptables-nft.
+ It is not used by nftables.
endmenu
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 367993adf4d3..85502d4dfbb4 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -3,20 +3,12 @@
# Makefile for the netfilter modules on top of IPv4.
#
-nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
-nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
-obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
-
# defrag
obj-$(CONFIG_NF_DEFRAG_IPV4) += nf_defrag_ipv4.o
obj-$(CONFIG_NF_SOCKET_IPV4) += nf_socket_ipv4.o
obj-$(CONFIG_NF_TPROXY_IPV4) += nf_tproxy_ipv4.o
-# logging
-obj-$(CONFIG_NF_LOG_ARP) += nf_log_arp.o
-obj-$(CONFIG_NF_LOG_IPV4) += nf_log_ipv4.o
-
# reject
obj-$(CONFIG_NF_REJECT_IPV4) += nf_reject_ipv4.o
@@ -28,22 +20,12 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o
$(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h
obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o
-# NAT protocols (nf_nat)
-obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o
-
-obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o
-obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o
obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o
obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o
-obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o
-obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o
obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o
-# flow table support
-obj-$(CONFIG_NF_FLOW_TABLE_IPV4) += nf_flow_table_ipv4.o
-
-# generic IP tables
-obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
+# generic IP tables
+obj-$(CONFIG_IP_NF_IPTABLES_LEGACY) += ip_tables.o
# the three instances of ip_tables
obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
@@ -57,9 +39,7 @@ obj-$(CONFIG_IP_NF_MATCH_AH) += ipt_ah.o
obj-$(CONFIG_IP_NF_MATCH_RPFILTER) += ipt_rpfilter.o
# targets
-obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
-obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
obj-$(CONFIG_IP_NF_TARGET_SYNPROXY) += ipt_SYNPROXY.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 2dc83de53f94..1cdd9c28ab2d 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Packet matching code for ARP packets.
*
@@ -178,10 +179,11 @@ struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry)
return (void *)entry + entry->next_offset;
}
-unsigned int arpt_do_table(struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct xt_table *table)
+unsigned int arpt_do_table(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
+ const struct xt_table *table = priv;
unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
unsigned int verdict = NF_DROP;
@@ -383,10 +385,11 @@ next: ;
return 1;
}
-static inline int check_target(struct arpt_entry *e, const char *name)
+static int check_target(struct arpt_entry *e, struct net *net, const char *name)
{
struct xt_entry_target *t = arpt_get_target(e);
struct xt_tgchk_param par = {
+ .net = net,
.table = name,
.entryinfo = e,
.target = t->u.kernel.target,
@@ -398,8 +401,9 @@ static inline int check_target(struct arpt_entry *e, const char *name)
return xt_check_target(&par, t->u.target_size - sizeof(*t), 0, false);
}
-static inline int
-find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
+static int
+find_check_entry(struct arpt_entry *e, struct net *net, const char *name,
+ unsigned int size,
struct xt_percpu_counter_alloc_state *alloc_state)
{
struct xt_entry_target *t;
@@ -418,7 +422,7 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
}
t->u.kernel.target = target;
- ret = check_target(e, name);
+ ret = check_target(e, net, name);
if (ret)
goto err;
return 0;
@@ -493,12 +497,13 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
return 0;
}
-static inline void cleanup_entry(struct arpt_entry *e)
+static void cleanup_entry(struct arpt_entry *e, struct net *net)
{
struct xt_tgdtor_param par;
struct xt_entry_target *t;
t = arpt_get_target(e);
+ par.net = net;
par.target = t->u.kernel.target;
par.targinfo = t->data;
par.family = NFPROTO_ARP;
@@ -511,7 +516,9 @@ static inline void cleanup_entry(struct arpt_entry *e)
/* Checks and translates the user-supplied table segment (held in
* newinfo).
*/
-static int translate_table(struct xt_table_info *newinfo, void *entry0,
+static int translate_table(struct net *net,
+ struct xt_table_info *newinfo,
+ void *entry0,
const struct arpt_replace *repl)
{
struct xt_percpu_counter_alloc_state alloc_state = { 0 };
@@ -568,7 +575,7 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
/* Finally, each sanity check must pass */
i = 0;
xt_entry_foreach(iter, entry0, newinfo->size) {
- ret = find_check_entry(iter, repl->name, repl->size,
+ ret = find_check_entry(iter, net, repl->name, repl->size,
&alloc_state);
if (ret != 0)
break;
@@ -579,7 +586,7 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
xt_entry_foreach(iter, entry0, newinfo->size) {
if (i-- == 0)
break;
- cleanup_entry(iter);
+ cleanup_entry(iter, net);
}
return ret;
}
@@ -707,7 +714,7 @@ static int copy_entries_to_user(unsigned int total_size,
return ret;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
static void compat_standard_from_user(void *dst, const void *src)
{
int v = *(compat_int_t *)src;
@@ -781,8 +788,7 @@ static int compat_table_info(const struct xt_table_info *info,
}
#endif
-static int get_info(struct net *net, void __user *user,
- const int *len, int compat)
+static int get_info(struct net *net, void __user *user, const int *len)
{
char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
@@ -795,18 +801,18 @@ static int get_info(struct net *net, void __user *user,
return -EFAULT;
name[XT_TABLE_MAXNAMELEN-1] = '\0';
-#ifdef CONFIG_COMPAT
- if (compat)
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
xt_compat_lock(NFPROTO_ARP);
#endif
t = xt_request_find_table_lock(net, NFPROTO_ARP, name);
if (!IS_ERR(t)) {
struct arpt_getinfo info;
const struct xt_table_info *private = t->private;
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct xt_table_info tmp;
- if (compat) {
+ if (in_compat_syscall()) {
ret = compat_table_info(private, &tmp);
xt_compat_flush_offsets(NFPROTO_ARP);
private = &tmp;
@@ -820,7 +826,7 @@ static int get_info(struct net *net, void __user *user,
sizeof(info.underflow));
info.num_entries = private->number;
info.size = private->size;
- strcpy(info.name, name);
+ strscpy(info.name, name);
if (copy_to_user(user, &info, *len) != 0)
ret = -EFAULT;
@@ -830,8 +836,8 @@ static int get_info(struct net *net, void __user *user,
module_put(t->me);
} else
ret = PTR_ERR(t);
-#ifdef CONFIG_COMPAT
- if (compat)
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
xt_compat_unlock(NFPROTO_ARP);
#endif
return ret;
@@ -922,7 +928,7 @@ static int __do_replace(struct net *net, const char *name,
/* Decrease module usage counts and free resource */
loc_cpu_old_entry = oldinfo->entries;
xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
- cleanup_entry(iter);
+ cleanup_entry(iter, net);
xt_free_table_info(oldinfo);
if (copy_to_user(counters_ptr, counters,
@@ -942,8 +948,7 @@ static int __do_replace(struct net *net, const char *name,
return ret;
}
-static int do_replace(struct net *net, const void __user *user,
- unsigned int len)
+static int do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
int ret;
struct arpt_replace tmp;
@@ -951,7 +956,9 @@ static int do_replace(struct net *net, const void __user *user,
void *loc_cpu_entry;
struct arpt_entry *iter;
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ if (len < sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
return -EFAULT;
/* overflow check */
@@ -959,6 +966,8 @@ static int do_replace(struct net *net, const void __user *user,
return -ENOMEM;
if (tmp.num_counters == 0)
return -EINVAL;
+ if ((u64)len < (u64)tmp.size + sizeof(tmp))
+ return -EINVAL;
tmp.name[sizeof(tmp.name)-1] = 0;
@@ -967,13 +976,13 @@ static int do_replace(struct net *net, const void __user *user,
return -ENOMEM;
loc_cpu_entry = newinfo->entries;
- if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
- tmp.size) != 0) {
+ if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+ tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
}
- ret = translate_table(newinfo, loc_cpu_entry, &tmp);
+ ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
if (ret != 0)
goto free_newinfo;
@@ -985,14 +994,13 @@ static int do_replace(struct net *net, const void __user *user,
free_newinfo_untrans:
xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
- cleanup_entry(iter);
+ cleanup_entry(iter, net);
free_newinfo:
xt_free_table_info(newinfo);
return ret;
}
-static int do_add_counters(struct net *net, const void __user *user,
- unsigned int len, int compat)
+static int do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
{
unsigned int i;
struct xt_counters_info tmp;
@@ -1003,7 +1011,7 @@ static int do_add_counters(struct net *net, const void __user *user,
struct arpt_entry *iter;
unsigned int addend;
- paddc = xt_copy_counters_from_user(user, len, &tmp, compat);
+ paddc = xt_copy_counters(arg, len, &tmp);
if (IS_ERR(paddc))
return PTR_ERR(paddc);
@@ -1041,7 +1049,7 @@ static int do_add_counters(struct net *net, const void __user *user,
return ret;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct compat_arpt_replace {
char name[XT_TABLE_MAXNAMELEN];
u32 valid_hooks;
@@ -1051,7 +1059,7 @@ struct compat_arpt_replace {
u32 underflow[NF_ARP_NUMHOOKS];
u32 num_counters;
compat_uptr_t counters;
- struct compat_arpt_entry entries[0];
+ struct compat_arpt_entry entries[];
};
static inline void compat_release_entry(struct compat_arpt_entry *e)
@@ -1148,7 +1156,8 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
}
}
-static int translate_compat_table(struct xt_table_info **pinfo,
+static int translate_compat_table(struct net *net,
+ struct xt_table_info **pinfo,
void **pentry0,
const struct compat_arpt_replace *compatr)
{
@@ -1189,6 +1198,8 @@ static int translate_compat_table(struct xt_table_info **pinfo,
if (!newinfo)
goto out_unlock;
+ memset(newinfo->entries, 0, size);
+
newinfo->number = compatr->num_entries;
for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
newinfo->hook_entry[i] = compatr->hook_entry[i];
@@ -1216,7 +1227,7 @@ static int translate_compat_table(struct xt_table_info **pinfo,
repl.num_counters = 0;
repl.counters = NULL;
repl.size = newinfo->size;
- ret = translate_table(newinfo, entry1, &repl);
+ ret = translate_table(net, newinfo, entry1, &repl);
if (ret)
goto free_newinfo;
@@ -1239,8 +1250,7 @@ out_unlock:
return ret;
}
-static int compat_do_replace(struct net *net, void __user *user,
- unsigned int len)
+static int compat_do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
int ret;
struct compat_arpt_replace tmp;
@@ -1248,7 +1258,9 @@ static int compat_do_replace(struct net *net, void __user *user,
void *loc_cpu_entry;
struct arpt_entry *iter;
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ if (len < sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
return -EFAULT;
/* overflow check */
@@ -1256,6 +1268,8 @@ static int compat_do_replace(struct net *net, void __user *user,
return -ENOMEM;
if (tmp.num_counters == 0)
return -EINVAL;
+ if ((u64)len < (u64)tmp.size + sizeof(tmp))
+ return -EINVAL;
tmp.name[sizeof(tmp.name)-1] = 0;
@@ -1264,12 +1278,13 @@ static int compat_do_replace(struct net *net, void __user *user,
return -ENOMEM;
loc_cpu_entry = newinfo->entries;
- if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) {
+ if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+ tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
}
- ret = translate_compat_table(&newinfo, &loc_cpu_entry, &tmp);
+ ret = translate_compat_table(net, &newinfo, &loc_cpu_entry, &tmp);
if (ret != 0)
goto free_newinfo;
@@ -1281,36 +1296,12 @@ static int compat_do_replace(struct net *net, void __user *user,
free_newinfo_untrans:
xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
- cleanup_entry(iter);
+ cleanup_entry(iter, net);
free_newinfo:
xt_free_table_info(newinfo);
return ret;
}
-static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,
- unsigned int len)
-{
- int ret;
-
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case ARPT_SO_SET_REPLACE:
- ret = compat_do_replace(sock_net(sk), user, len);
- break;
-
- case ARPT_SO_SET_ADD_COUNTERS:
- ret = do_add_counters(sock_net(sk), user, len, 1);
- break;
-
- default:
- ret = -EINVAL;
- }
-
- return ret;
-}
-
static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
compat_uint_t *size,
struct xt_counters *counters,
@@ -1376,7 +1367,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
struct compat_arpt_get_entries {
char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
- struct compat_arpt_entry entrytable[0];
+ struct compat_arpt_entry entrytable[];
};
static int compat_get_entries(struct net *net,
@@ -1418,32 +1409,10 @@ static int compat_get_entries(struct net *net,
xt_compat_unlock(NFPROTO_ARP);
return ret;
}
-
-static int do_arpt_get_ctl(struct sock *, int, void __user *, int *);
-
-static int compat_do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user,
- int *len)
-{
- int ret;
-
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case ARPT_SO_GET_INFO:
- ret = get_info(sock_net(sk), user, len, 1);
- break;
- case ARPT_SO_GET_ENTRIES:
- ret = compat_get_entries(sock_net(sk), user, len);
- break;
- default:
- ret = do_arpt_get_ctl(sk, cmd, user, len);
- }
- return ret;
-}
#endif
-static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+static int do_arpt_set_ctl(struct sock *sk, int cmd, sockptr_t arg,
+ unsigned int len)
{
int ret;
@@ -1452,11 +1421,16 @@ static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned
switch (cmd) {
case ARPT_SO_SET_REPLACE:
- ret = do_replace(sock_net(sk), user, len);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_do_replace(sock_net(sk), arg, len);
+ else
+#endif
+ ret = do_replace(sock_net(sk), arg, len);
break;
case ARPT_SO_SET_ADD_COUNTERS:
- ret = do_add_counters(sock_net(sk), user, len, 0);
+ ret = do_add_counters(sock_net(sk), arg, len);
break;
default:
@@ -1475,11 +1449,16 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
switch (cmd) {
case ARPT_SO_GET_INFO:
- ret = get_info(sock_net(sk), user, len, 0);
+ ret = get_info(sock_net(sk), user, len);
break;
case ARPT_SO_GET_ENTRIES:
- ret = get_entries(sock_net(sk), user, len);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_get_entries(sock_net(sk), user, len);
+ else
+#endif
+ ret = get_entries(sock_net(sk), user, len);
break;
case ARPT_SO_GET_REVISION_TARGET: {
@@ -1508,7 +1487,7 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
return ret;
}
-static void __arpt_unregister_table(struct xt_table *table)
+static void __arpt_unregister_table(struct net *net, struct xt_table *table)
{
struct xt_table_info *private;
void *loc_cpu_entry;
@@ -1520,7 +1499,7 @@ static void __arpt_unregister_table(struct xt_table *table)
/* Decrease module usage counts and free resources */
loc_cpu_entry = private->entries;
xt_entry_foreach(iter, loc_cpu_entry, private->size)
- cleanup_entry(iter);
+ cleanup_entry(iter, net);
if (private->number > private->initial_entries)
module_put(table_owner);
xt_free_table_info(private);
@@ -1529,10 +1508,11 @@ static void __arpt_unregister_table(struct xt_table *table)
int arpt_register_table(struct net *net,
const struct xt_table *table,
const struct arpt_replace *repl,
- const struct nf_hook_ops *ops,
- struct xt_table **res)
+ const struct nf_hook_ops *template_ops)
{
- int ret;
+ struct nf_hook_ops *ops;
+ unsigned int num_ops;
+ int ret, i;
struct xt_table_info *newinfo;
struct xt_table_info bootstrap = {0};
void *loc_cpu_entry;
@@ -1545,37 +1525,65 @@ int arpt_register_table(struct net *net,
loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
- ret = translate_table(newinfo, loc_cpu_entry, repl);
- if (ret != 0)
- goto out_free;
+ ret = translate_table(net, newinfo, loc_cpu_entry, repl);
+ if (ret != 0) {
+ xt_free_table_info(newinfo);
+ return ret;
+ }
new_table = xt_register_table(net, table, &bootstrap, newinfo);
if (IS_ERR(new_table)) {
- ret = PTR_ERR(new_table);
- goto out_free;
+ struct arpt_entry *iter;
+
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
+ xt_free_table_info(newinfo);
+ return PTR_ERR(new_table);
}
- /* set res now, will see skbs right after nf_register_net_hooks */
- WRITE_ONCE(*res, new_table);
+ num_ops = hweight32(table->valid_hooks);
+ if (num_ops == 0) {
+ ret = -EINVAL;
+ goto out_free;
+ }
- ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
- if (ret != 0) {
- __arpt_unregister_table(new_table);
- *res = NULL;
+ ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL);
+ if (!ops) {
+ ret = -ENOMEM;
+ goto out_free;
}
+ for (i = 0; i < num_ops; i++)
+ ops[i].priv = new_table;
+
+ new_table->ops = ops;
+
+ ret = nf_register_net_hooks(net, ops, num_ops);
+ if (ret != 0)
+ goto out_free;
+
return ret;
out_free:
- xt_free_table_info(newinfo);
+ __arpt_unregister_table(net, new_table);
return ret;
}
-void arpt_unregister_table(struct net *net, struct xt_table *table,
- const struct nf_hook_ops *ops)
+void arpt_unregister_table_pre_exit(struct net *net, const char *name)
{
- nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
- __arpt_unregister_table(table);
+ struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name);
+
+ if (table)
+ nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
+}
+EXPORT_SYMBOL(arpt_unregister_table_pre_exit);
+
+void arpt_unregister_table(struct net *net, const char *name)
+{
+ struct xt_table *table = xt_find_table(net, NFPROTO_ARP, name);
+
+ if (table)
+ __arpt_unregister_table(net, table);
}
/* The built-in targets: standard (NULL) and error. */
@@ -1584,7 +1592,7 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = {
.name = XT_STANDARD_TARGET,
.targetsize = sizeof(int),
.family = NFPROTO_ARP,
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
.compatsize = sizeof(compat_int_t),
.compat_from_user = compat_standard_from_user,
.compat_to_user = compat_standard_to_user,
@@ -1603,15 +1611,9 @@ static struct nf_sockopt_ops arpt_sockopts = {
.set_optmin = ARPT_BASE_CTL,
.set_optmax = ARPT_SO_SET_MAX+1,
.set = do_arpt_set_ctl,
-#ifdef CONFIG_COMPAT
- .compat_set = compat_do_arpt_set_ctl,
-#endif
.get_optmin = ARPT_BASE_CTL,
.get_optmax = ARPT_SO_GET_MAX+1,
.get = do_arpt_get_ctl,
-#ifdef CONFIG_COMPAT
- .compat_get = compat_do_arpt_get_ctl,
-#endif
.owner = THIS_MODULE,
};
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index a5e52a9f0a12..a4e07e5e9c11 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* module that allows mangling of the arp payload */
#include <linux/module.h>
#include <linux/netfilter.h>
@@ -16,7 +17,7 @@ target(struct sk_buff *skb, const struct xt_action_param *par)
unsigned char *arpptr;
int pln, hln;
- if (!skb_make_writable(skb, skb->len))
+ if (skb_ensure_writable(skb, skb->len))
return NF_DROP;
arp = arp_hdr(skb);
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 8f8713b4388f..78cd5ee24448 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Filtering ARP tables module.
*
@@ -17,82 +18,72 @@ MODULE_DESCRIPTION("arptables filter table");
#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
(1 << NF_ARP_FORWARD))
-static int __net_init arptable_filter_table_init(struct net *net);
-
static const struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_ARP,
.priority = NF_IP_PRI_FILTER,
- .table_init = arptable_filter_table_init,
};
-/* The work comes in here from netfilter.c */
-static unsigned int
-arptable_filter_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return arpt_do_table(skb, state, state->net->ipv4.arptable_filter);
-}
-
static struct nf_hook_ops *arpfilter_ops __read_mostly;
-static int __net_init arptable_filter_table_init(struct net *net)
+static int arptable_filter_table_init(struct net *net)
{
struct arpt_replace *repl;
int err;
- if (net->ipv4.arptable_filter)
- return 0;
-
repl = arpt_alloc_initial_table(&packet_filter);
if (repl == NULL)
return -ENOMEM;
- err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops,
- &net->ipv4.arptable_filter);
+ err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops);
kfree(repl);
return err;
}
+static void __net_exit arptable_filter_net_pre_exit(struct net *net)
+{
+ arpt_unregister_table_pre_exit(net, "filter");
+}
+
static void __net_exit arptable_filter_net_exit(struct net *net)
{
- if (!net->ipv4.arptable_filter)
- return;
- arpt_unregister_table(net, net->ipv4.arptable_filter, arpfilter_ops);
- net->ipv4.arptable_filter = NULL;
+ arpt_unregister_table(net, "filter");
}
static struct pernet_operations arptable_filter_net_ops = {
.exit = arptable_filter_net_exit,
+ .pre_exit = arptable_filter_net_pre_exit,
};
static int __init arptable_filter_init(void)
{
- int ret;
+ int ret = xt_register_template(&packet_filter,
+ arptable_filter_table_init);
+
+ if (ret < 0)
+ return ret;
- arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook);
- if (IS_ERR(arpfilter_ops))
+ arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arpt_do_table);
+ if (IS_ERR(arpfilter_ops)) {
+ xt_unregister_template(&packet_filter);
return PTR_ERR(arpfilter_ops);
+ }
ret = register_pernet_subsys(&arptable_filter_net_ops);
if (ret < 0) {
+ xt_unregister_template(&packet_filter);
kfree(arpfilter_ops);
return ret;
}
- ret = arptable_filter_table_init(&init_net);
- if (ret) {
- unregister_pernet_subsys(&arptable_filter_net_ops);
- kfree(arpfilter_ops);
- }
-
return ret;
}
static void __exit arptable_filter_fini(void)
{
unregister_pernet_subsys(&arptable_filter_net_ops);
+ xt_unregister_template(&packet_filter);
kfree(arpfilter_ops);
}
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index e77872c93c20..23c8deff8095 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Packet matching code.
*
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
* Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
* Copyright (C) 2006-2010 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/cache.h>
@@ -17,7 +14,6 @@
#include <linux/vmalloc.h>
#include <linux/netdevice.h>
#include <linux/module.h>
-#include <linux/icmp.h>
#include <net/ip.h>
#include <net/compat.h>
#include <linux/uaccess.h>
@@ -34,7 +30,6 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("IPv4 packet filter");
-MODULE_ALIAS("ipt_icmp");
void *ipt_alloc_initial_table(const struct xt_table *info)
{
@@ -225,10 +220,11 @@ struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry)
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
unsigned int
-ipt_do_table(struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct xt_table *table)
+ipt_do_table(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
+ const struct xt_table *table = priv;
unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
const struct iphdr *ip;
@@ -274,7 +270,7 @@ ipt_do_table(struct sk_buff *skb,
* but it is no problem since absolute verdict is issued by these.
*/
if (static_key_false(&xt_tee_enabled))
- jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
+ jumpstack += private->stacksize * current->in_nf_duplicate;
e = get_entry(table_base, private->hook_entry[hook]);
@@ -871,7 +867,7 @@ copy_entries_to_user(unsigned int total_size,
return ret;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
static void compat_standard_from_user(void *dst, const void *src)
{
int v = *(compat_int_t *)src;
@@ -947,8 +943,7 @@ static int compat_table_info(const struct xt_table_info *info,
}
#endif
-static int get_info(struct net *net, void __user *user,
- const int *len, int compat)
+static int get_info(struct net *net, void __user *user, const int *len)
{
char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
@@ -961,18 +956,18 @@ static int get_info(struct net *net, void __user *user,
return -EFAULT;
name[XT_TABLE_MAXNAMELEN-1] = '\0';
-#ifdef CONFIG_COMPAT
- if (compat)
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
xt_compat_lock(AF_INET);
#endif
t = xt_request_find_table_lock(net, AF_INET, name);
if (!IS_ERR(t)) {
struct ipt_getinfo info;
const struct xt_table_info *private = t->private;
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct xt_table_info tmp;
- if (compat) {
+ if (in_compat_syscall()) {
ret = compat_table_info(private, &tmp);
xt_compat_flush_offsets(AF_INET);
private = &tmp;
@@ -986,7 +981,7 @@ static int get_info(struct net *net, void __user *user,
sizeof(info.underflow));
info.num_entries = private->number;
info.size = private->size;
- strcpy(info.name, name);
+ strscpy(info.name, name);
if (copy_to_user(user, &info, *len) != 0)
ret = -EFAULT;
@@ -997,8 +992,8 @@ static int get_info(struct net *net, void __user *user,
module_put(t->me);
} else
ret = PTR_ERR(t);
-#ifdef CONFIG_COMPAT
- if (compat)
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
xt_compat_unlock(AF_INET);
#endif
return ret;
@@ -1048,7 +1043,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
struct xt_counters *counters;
struct ipt_entry *iter;
- ret = 0;
counters = xt_counters_alloc(num_counters);
if (!counters) {
ret = -ENOMEM;
@@ -1094,7 +1088,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n");
}
vfree(counters);
- return ret;
+ return 0;
put_module:
module_put(t->me);
@@ -1106,7 +1100,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
}
static int
-do_replace(struct net *net, const void __user *user, unsigned int len)
+do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
int ret;
struct ipt_replace tmp;
@@ -1114,7 +1108,9 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
void *loc_cpu_entry;
struct ipt_entry *iter;
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ if (len < sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
return -EFAULT;
/* overflow check */
@@ -1122,6 +1118,8 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
return -ENOMEM;
if (tmp.num_counters == 0)
return -EINVAL;
+ if ((u64)len < (u64)tmp.size + sizeof(tmp))
+ return -EINVAL;
tmp.name[sizeof(tmp.name)-1] = 0;
@@ -1130,8 +1128,8 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
return -ENOMEM;
loc_cpu_entry = newinfo->entries;
- if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
- tmp.size) != 0) {
+ if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+ tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
}
@@ -1155,8 +1153,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
}
static int
-do_add_counters(struct net *net, const void __user *user,
- unsigned int len, int compat)
+do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
{
unsigned int i;
struct xt_counters_info tmp;
@@ -1167,7 +1164,7 @@ do_add_counters(struct net *net, const void __user *user,
struct ipt_entry *iter;
unsigned int addend;
- paddc = xt_copy_counters_from_user(user, len, &tmp, compat);
+ paddc = xt_copy_counters(arg, len, &tmp);
if (IS_ERR(paddc))
return PTR_ERR(paddc);
@@ -1204,7 +1201,7 @@ do_add_counters(struct net *net, const void __user *user,
return ret;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct compat_ipt_replace {
char name[XT_TABLE_MAXNAMELEN];
u32 valid_hooks;
@@ -1214,7 +1211,7 @@ struct compat_ipt_replace {
u32 underflow[NF_INET_NUMHOOKS];
u32 num_counters;
compat_uptr_t counters; /* struct xt_counters * */
- struct compat_ipt_entry entries[0];
+ struct compat_ipt_entry entries[];
};
static int
@@ -1433,6 +1430,8 @@ translate_compat_table(struct net *net,
if (!newinfo)
goto out_unlock;
+ memset(newinfo->entries, 0, size);
+
newinfo->number = compatr->num_entries;
for (i = 0; i < NF_INET_NUMHOOKS; i++) {
newinfo->hook_entry[i] = compatr->hook_entry[i];
@@ -1489,7 +1488,7 @@ out_unlock:
}
static int
-compat_do_replace(struct net *net, void __user *user, unsigned int len)
+compat_do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
int ret;
struct compat_ipt_replace tmp;
@@ -1497,7 +1496,9 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
void *loc_cpu_entry;
struct ipt_entry *iter;
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ if (len < sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
return -EFAULT;
/* overflow check */
@@ -1505,6 +1506,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return -ENOMEM;
if (tmp.num_counters == 0)
return -EINVAL;
+ if ((u64)len < (u64)tmp.size + sizeof(tmp))
+ return -EINVAL;
tmp.name[sizeof(tmp.name)-1] = 0;
@@ -1513,8 +1516,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return -ENOMEM;
loc_cpu_entry = newinfo->entries;
- if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
- tmp.size) != 0) {
+ if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+ tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
}
@@ -1537,35 +1540,10 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return ret;
}
-static int
-compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user,
- unsigned int len)
-{
- int ret;
-
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case IPT_SO_SET_REPLACE:
- ret = compat_do_replace(sock_net(sk), user, len);
- break;
-
- case IPT_SO_SET_ADD_COUNTERS:
- ret = do_add_counters(sock_net(sk), user, len, 1);
- break;
-
- default:
- ret = -EINVAL;
- }
-
- return ret;
-}
-
struct compat_ipt_get_entries {
char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
- struct compat_ipt_entry entrytable[0];
+ struct compat_ipt_entry entrytable[];
};
static int
@@ -1637,33 +1615,10 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
xt_compat_unlock(AF_INET);
return ret;
}
-
-static int do_ipt_get_ctl(struct sock *, int, void __user *, int *);
-
-static int
-compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
-{
- int ret;
-
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case IPT_SO_GET_INFO:
- ret = get_info(sock_net(sk), user, len, 1);
- break;
- case IPT_SO_GET_ENTRIES:
- ret = compat_get_entries(sock_net(sk), user, len);
- break;
- default:
- ret = do_ipt_get_ctl(sk, cmd, user, len);
- }
- return ret;
-}
#endif
static int
-do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+do_ipt_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len)
{
int ret;
@@ -1672,11 +1627,16 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
switch (cmd) {
case IPT_SO_SET_REPLACE:
- ret = do_replace(sock_net(sk), user, len);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_do_replace(sock_net(sk), arg, len);
+ else
+#endif
+ ret = do_replace(sock_net(sk), arg, len);
break;
case IPT_SO_SET_ADD_COUNTERS:
- ret = do_add_counters(sock_net(sk), user, len, 0);
+ ret = do_add_counters(sock_net(sk), arg, len);
break;
default:
@@ -1696,11 +1656,16 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
switch (cmd) {
case IPT_SO_GET_INFO:
- ret = get_info(sock_net(sk), user, len, 0);
+ ret = get_info(sock_net(sk), user, len);
break;
case IPT_SO_GET_ENTRIES:
- ret = get_entries(sock_net(sk), user, len);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_get_entries(sock_net(sk), user, len);
+ else
+#endif
+ ret = get_entries(sock_net(sk), user, len);
break;
case IPT_SO_GET_REVISION_MATCH:
@@ -1757,9 +1722,11 @@ static void __ipt_unregister_table(struct net *net, struct xt_table *table)
int ipt_register_table(struct net *net, const struct xt_table *table,
const struct ipt_replace *repl,
- const struct nf_hook_ops *ops, struct xt_table **res)
+ const struct nf_hook_ops *template_ops)
{
- int ret;
+ struct nf_hook_ops *ops;
+ unsigned int num_ops;
+ int ret, i;
struct xt_table_info *newinfo;
struct xt_table_info bootstrap = {0};
void *loc_cpu_entry;
@@ -1773,85 +1740,69 @@ int ipt_register_table(struct net *net, const struct xt_table *table,
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(net, newinfo, loc_cpu_entry, repl);
- if (ret != 0)
- goto out_free;
+ if (ret != 0) {
+ xt_free_table_info(newinfo);
+ return ret;
+ }
new_table = xt_register_table(net, table, &bootstrap, newinfo);
if (IS_ERR(new_table)) {
- ret = PTR_ERR(new_table);
- goto out_free;
+ struct ipt_entry *iter;
+
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
+ xt_free_table_info(newinfo);
+ return PTR_ERR(new_table);
}
- /* set res now, will see skbs right after nf_register_net_hooks */
- WRITE_ONCE(*res, new_table);
- if (!ops)
+ /* No template? No need to do anything. This is used by 'nat' table, it registers
+ * with the nat core instead of the netfilter core.
+ */
+ if (!template_ops)
return 0;
- ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
- if (ret != 0) {
- __ipt_unregister_table(net, new_table);
- *res = NULL;
+ num_ops = hweight32(table->valid_hooks);
+ if (num_ops == 0) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+
+ ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL);
+ if (!ops) {
+ ret = -ENOMEM;
+ goto out_free;
}
+ for (i = 0; i < num_ops; i++)
+ ops[i].priv = new_table;
+
+ new_table->ops = ops;
+
+ ret = nf_register_net_hooks(net, ops, num_ops);
+ if (ret != 0)
+ goto out_free;
+
return ret;
out_free:
- xt_free_table_info(newinfo);
+ __ipt_unregister_table(net, new_table);
return ret;
}
-void ipt_unregister_table(struct net *net, struct xt_table *table,
- const struct nf_hook_ops *ops)
-{
- if (ops)
- nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
- __ipt_unregister_table(net, table);
-}
-
-/* Returns 1 if the type and code is matched by the range, 0 otherwise */
-static inline bool
-icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
- u_int8_t type, u_int8_t code,
- bool invert)
-{
- return ((test_type == 0xFF) ||
- (type == test_type && code >= min_code && code <= max_code))
- ^ invert;
-}
-
-static bool
-icmp_match(const struct sk_buff *skb, struct xt_action_param *par)
+void ipt_unregister_table_pre_exit(struct net *net, const char *name)
{
- const struct icmphdr *ic;
- struct icmphdr _icmph;
- const struct ipt_icmp *icmpinfo = par->matchinfo;
-
- /* Must not be a fragment. */
- if (par->fragoff != 0)
- return false;
+ struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name);
- ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph);
- if (ic == NULL) {
- /* We've been asked to examine this packet, and we
- * can't. Hence, no choice but to drop.
- */
- par->hotdrop = true;
- return false;
- }
-
- return icmp_type_code_match(icmpinfo->type,
- icmpinfo->code[0],
- icmpinfo->code[1],
- ic->type, ic->code,
- !!(icmpinfo->invflags&IPT_ICMP_INV));
+ if (table)
+ nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
}
-static int icmp_checkentry(const struct xt_mtchk_param *par)
+void ipt_unregister_table_exit(struct net *net, const char *name)
{
- const struct ipt_icmp *icmpinfo = par->matchinfo;
+ struct xt_table *table = xt_find_table(net, NFPROTO_IPV4, name);
- /* Must specify no unknown invflags */
- return (icmpinfo->invflags & ~IPT_ICMP_INV) ? -EINVAL : 0;
+ if (table)
+ __ipt_unregister_table(net, table);
}
static struct xt_target ipt_builtin_tg[] __read_mostly = {
@@ -1859,7 +1810,7 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = {
.name = XT_STANDARD_TARGET,
.targetsize = sizeof(int),
.family = NFPROTO_IPV4,
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
.compatsize = sizeof(compat_int_t),
.compat_from_user = compat_standard_from_user,
.compat_to_user = compat_standard_to_user,
@@ -1878,30 +1829,12 @@ static struct nf_sockopt_ops ipt_sockopts = {
.set_optmin = IPT_BASE_CTL,
.set_optmax = IPT_SO_SET_MAX+1,
.set = do_ipt_set_ctl,
-#ifdef CONFIG_COMPAT
- .compat_set = compat_do_ipt_set_ctl,
-#endif
.get_optmin = IPT_BASE_CTL,
.get_optmax = IPT_SO_GET_MAX+1,
.get = do_ipt_get_ctl,
-#ifdef CONFIG_COMPAT
- .compat_get = compat_do_ipt_get_ctl,
-#endif
.owner = THIS_MODULE,
};
-static struct xt_match ipt_builtin_mt[] __read_mostly = {
- {
- .name = "icmp",
- .match = icmp_match,
- .matchsize = sizeof(struct ipt_icmp),
- .checkentry = icmp_checkentry,
- .proto = IPPROTO_ICMP,
- .family = NFPROTO_IPV4,
- .me = THIS_MODULE,
- },
-};
-
static int __net_init ip_tables_net_init(struct net *net)
{
return xt_proto_init(net, NFPROTO_IPV4);
@@ -1929,19 +1862,14 @@ static int __init ip_tables_init(void)
ret = xt_register_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
if (ret < 0)
goto err2;
- ret = xt_register_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
- if (ret < 0)
- goto err4;
/* Register setsockopt */
ret = nf_register_sockopt(&ipt_sockopts);
if (ret < 0)
- goto err5;
+ goto err4;
return 0;
-err5:
- xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
err4:
xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
err2:
@@ -1954,13 +1882,13 @@ static void __exit ip_tables_fini(void)
{
nf_unregister_sockopt(&ipt_sockopts);
- xt_unregister_matches(ipt_builtin_mt, ARRAY_SIZE(ipt_builtin_mt));
xt_unregister_targets(ipt_builtin_tg, ARRAY_SIZE(ipt_builtin_tg));
unregister_pernet_subsys(&ip_tables_net_ops);
}
EXPORT_SYMBOL(ipt_register_table);
-EXPORT_SYMBOL(ipt_unregister_table);
+EXPORT_SYMBOL(ipt_unregister_table_pre_exit);
+EXPORT_SYMBOL(ipt_unregister_table_exit);
EXPORT_SYMBOL(ipt_do_table);
module_init(ip_tables_init);
module_exit(ip_tables_fini);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
deleted file mode 100644
index 2c8d313ae216..000000000000
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ /dev/null
@@ -1,884 +0,0 @@
-/* Cluster IP hashmark target
- * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
- * based on ideas of Fabio Olive Leite <olive@unixforge.org>
- *
- * Development of this code funded by SuSE Linux AG, http://www.suse.com/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
-#include <linux/proc_fs.h>
-#include <linux/jhash.h>
-#include <linux/bitops.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <linux/ip.h>
-#include <linux/tcp.h>
-#include <linux/udp.h>
-#include <linux/icmp.h>
-#include <linux/if_arp.h>
-#include <linux/seq_file.h>
-#include <linux/refcount.h>
-#include <linux/netfilter_arp.h>
-#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter_ipv4/ip_tables.h>
-#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/net_namespace.h>
-#include <net/netns/generic.h>
-#include <net/checksum.h>
-#include <net/ip.h>
-
-#define CLUSTERIP_VERSION "0.8"
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
-MODULE_DESCRIPTION("Xtables: CLUSTERIP target");
-
-struct clusterip_config {
- struct list_head list; /* list of all configs */
- refcount_t refcount; /* reference count */
- refcount_t entries; /* number of entries/rules
- * referencing us */
-
- __be32 clusterip; /* the IP address */
- u_int8_t clustermac[ETH_ALEN]; /* the MAC address */
- int ifindex; /* device ifindex */
- u_int16_t num_total_nodes; /* total number of nodes */
- unsigned long local_nodes; /* node number array */
-
-#ifdef CONFIG_PROC_FS
- struct proc_dir_entry *pde; /* proc dir entry */
-#endif
- enum clusterip_hashmode hash_mode; /* which hashing mode */
- u_int32_t hash_initval; /* hash initialization */
- struct rcu_head rcu;
-
- char ifname[IFNAMSIZ]; /* device ifname */
- struct notifier_block notifier; /* refresh c->ifindex in it */
-};
-
-#ifdef CONFIG_PROC_FS
-static const struct file_operations clusterip_proc_fops;
-#endif
-
-static unsigned int clusterip_net_id __read_mostly;
-
-struct clusterip_net {
- struct list_head configs;
- /* lock protects the configs list */
- spinlock_t lock;
-
-#ifdef CONFIG_PROC_FS
- struct proc_dir_entry *procdir;
-#endif
-};
-
-static inline void
-clusterip_config_get(struct clusterip_config *c)
-{
- refcount_inc(&c->refcount);
-}
-
-
-static void clusterip_config_rcu_free(struct rcu_head *head)
-{
- kfree(container_of(head, struct clusterip_config, rcu));
-}
-
-static inline void
-clusterip_config_put(struct clusterip_config *c)
-{
- if (refcount_dec_and_test(&c->refcount))
- call_rcu_bh(&c->rcu, clusterip_config_rcu_free);
-}
-
-/* decrease the count of entries using/referencing this config. If last
- * entry(rule) is removed, remove the config from lists, but don't free it
- * yet, since proc-files could still be holding references */
-static inline void
-clusterip_config_entry_put(struct net *net, struct clusterip_config *c)
-{
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
-
- local_bh_disable();
- if (refcount_dec_and_lock(&c->entries, &cn->lock)) {
- /* In case anyone still accesses the file, the open/close
- * functions are also incrementing the refcount on their own,
- * so it's safe to remove the entry even if it's in use. */
-#ifdef CONFIG_PROC_FS
- if (cn->procdir)
- proc_remove(c->pde);
-#endif
- list_del_rcu(&c->list);
- spin_unlock(&cn->lock);
- local_bh_enable();
-
- unregister_netdevice_notifier(&c->notifier);
-
- return;
- }
- local_bh_enable();
-}
-
-static struct clusterip_config *
-__clusterip_config_find(struct net *net, __be32 clusterip)
-{
- struct clusterip_config *c;
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
-
- list_for_each_entry_rcu(c, &cn->configs, list) {
- if (c->clusterip == clusterip)
- return c;
- }
-
- return NULL;
-}
-
-static inline struct clusterip_config *
-clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
-{
- struct clusterip_config *c;
-
- rcu_read_lock_bh();
- c = __clusterip_config_find(net, clusterip);
- if (c) {
-#ifdef CONFIG_PROC_FS
- if (!c->pde)
- c = NULL;
- else
-#endif
- if (unlikely(!refcount_inc_not_zero(&c->refcount)))
- c = NULL;
- else if (entry) {
- if (unlikely(!refcount_inc_not_zero(&c->entries))) {
- clusterip_config_put(c);
- c = NULL;
- }
- }
- }
- rcu_read_unlock_bh();
-
- return c;
-}
-
-static void
-clusterip_config_init_nodelist(struct clusterip_config *c,
- const struct ipt_clusterip_tgt_info *i)
-{
- int n;
-
- for (n = 0; n < i->num_local_nodes; n++)
- set_bit(i->local_nodes[n] - 1, &c->local_nodes);
-}
-
-static int
-clusterip_netdev_event(struct notifier_block *this, unsigned long event,
- void *ptr)
-{
- struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct clusterip_config *c;
-
- c = container_of(this, struct clusterip_config, notifier);
- switch (event) {
- case NETDEV_REGISTER:
- if (!strcmp(dev->name, c->ifname)) {
- c->ifindex = dev->ifindex;
- dev_mc_add(dev, c->clustermac);
- }
- break;
- case NETDEV_UNREGISTER:
- if (dev->ifindex == c->ifindex) {
- dev_mc_del(dev, c->clustermac);
- c->ifindex = -1;
- }
- break;
- case NETDEV_CHANGENAME:
- if (!strcmp(dev->name, c->ifname)) {
- c->ifindex = dev->ifindex;
- dev_mc_add(dev, c->clustermac);
- } else if (dev->ifindex == c->ifindex) {
- dev_mc_del(dev, c->clustermac);
- c->ifindex = -1;
- }
- break;
- }
-
- return NOTIFY_DONE;
-}
-
-static struct clusterip_config *
-clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i,
- __be32 ip, const char *iniface)
-{
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
- struct clusterip_config *c;
- int err;
-
- c = kzalloc(sizeof(*c), GFP_ATOMIC);
- if (!c)
- return ERR_PTR(-ENOMEM);
-
- strcpy(c->ifname, iniface);
- c->ifindex = -1;
- c->clusterip = ip;
- memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
- c->num_total_nodes = i->num_total_nodes;
- clusterip_config_init_nodelist(c, i);
- c->hash_mode = i->hash_mode;
- c->hash_initval = i->hash_initval;
- refcount_set(&c->refcount, 1);
-
- spin_lock_bh(&cn->lock);
- if (__clusterip_config_find(net, ip)) {
- spin_unlock_bh(&cn->lock);
- kfree(c);
-
- return ERR_PTR(-EBUSY);
- }
-
- list_add_rcu(&c->list, &cn->configs);
- spin_unlock_bh(&cn->lock);
-
-#ifdef CONFIG_PROC_FS
- {
- char buffer[16];
-
- /* create proc dir entry */
- sprintf(buffer, "%pI4", &ip);
- c->pde = proc_create_data(buffer, 0600,
- cn->procdir,
- &clusterip_proc_fops, c);
- if (!c->pde) {
- err = -ENOMEM;
- goto err;
- }
- }
-#endif
-
- c->notifier.notifier_call = clusterip_netdev_event;
- err = register_netdevice_notifier(&c->notifier);
- if (!err) {
- refcount_set(&c->entries, 1);
- return c;
- }
-
-#ifdef CONFIG_PROC_FS
- proc_remove(c->pde);
-err:
-#endif
- spin_lock_bh(&cn->lock);
- list_del_rcu(&c->list);
- spin_unlock_bh(&cn->lock);
- clusterip_config_put(c);
-
- return ERR_PTR(err);
-}
-
-#ifdef CONFIG_PROC_FS
-static int
-clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
-{
-
- if (nodenum == 0 ||
- nodenum > c->num_total_nodes)
- return 1;
-
- /* check if we already have this number in our bitfield */
- if (test_and_set_bit(nodenum - 1, &c->local_nodes))
- return 1;
-
- return 0;
-}
-
-static bool
-clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
-{
- if (nodenum == 0 ||
- nodenum > c->num_total_nodes)
- return true;
-
- if (test_and_clear_bit(nodenum - 1, &c->local_nodes))
- return false;
-
- return true;
-}
-#endif
-
-static inline u_int32_t
-clusterip_hashfn(const struct sk_buff *skb,
- const struct clusterip_config *config)
-{
- const struct iphdr *iph = ip_hdr(skb);
- unsigned long hashval;
- u_int16_t sport = 0, dport = 0;
- int poff;
-
- poff = proto_ports_offset(iph->protocol);
- if (poff >= 0) {
- const u_int16_t *ports;
- u16 _ports[2];
-
- ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
- if (ports) {
- sport = ports[0];
- dport = ports[1];
- }
- } else {
- net_info_ratelimited("unknown protocol %u\n", iph->protocol);
- }
-
- switch (config->hash_mode) {
- case CLUSTERIP_HASHMODE_SIP:
- hashval = jhash_1word(ntohl(iph->saddr),
- config->hash_initval);
- break;
- case CLUSTERIP_HASHMODE_SIP_SPT:
- hashval = jhash_2words(ntohl(iph->saddr), sport,
- config->hash_initval);
- break;
- case CLUSTERIP_HASHMODE_SIP_SPT_DPT:
- hashval = jhash_3words(ntohl(iph->saddr), sport, dport,
- config->hash_initval);
- break;
- default:
- /* to make gcc happy */
- hashval = 0;
- /* This cannot happen, unless the check function wasn't called
- * at rule load time */
- pr_info("unknown mode %u\n", config->hash_mode);
- BUG();
- break;
- }
-
- /* node numbers are 1..n, not 0..n */
- return reciprocal_scale(hashval, config->num_total_nodes) + 1;
-}
-
-static inline int
-clusterip_responsible(const struct clusterip_config *config, u_int32_t hash)
-{
- return test_bit(hash - 1, &config->local_nodes);
-}
-
-/***********************************************************************
- * IPTABLES TARGET
- ***********************************************************************/
-
-static unsigned int
-clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
- const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- u_int32_t hash;
-
- /* don't need to clusterip_config_get() here, since refcount
- * is only decremented by destroy() - and ip_tables guarantees
- * that the ->target() function isn't called after ->destroy() */
-
- ct = nf_ct_get(skb, &ctinfo);
- if (ct == NULL)
- return NF_DROP;
-
- /* special case: ICMP error handling. conntrack distinguishes between
- * error messages (RELATED) and information requests (see below) */
- if (ip_hdr(skb)->protocol == IPPROTO_ICMP &&
- (ctinfo == IP_CT_RELATED ||
- ctinfo == IP_CT_RELATED_REPLY))
- return XT_CONTINUE;
-
- /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
- * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
- * on, which all have an ID field [relevant for hashing]. */
-
- hash = clusterip_hashfn(skb, cipinfo->config);
-
- switch (ctinfo) {
- case IP_CT_NEW:
- ct->mark = hash;
- break;
- case IP_CT_RELATED:
- case IP_CT_RELATED_REPLY:
- /* FIXME: we don't handle expectations at the moment.
- * They can arrive on a different node than
- * the master connection (e.g. FTP passive mode) */
- case IP_CT_ESTABLISHED:
- case IP_CT_ESTABLISHED_REPLY:
- break;
- default: /* Prevent gcc warnings */
- break;
- }
-
-#ifdef DEBUG
- nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-#endif
- pr_debug("hash=%u ct_hash=%u ", hash, ct->mark);
- if (!clusterip_responsible(cipinfo->config, hash)) {
- pr_debug("not responsible\n");
- return NF_DROP;
- }
- pr_debug("responsible\n");
-
- /* despite being received via linklayer multicast, this is
- * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */
- skb->pkt_type = PACKET_HOST;
-
- return XT_CONTINUE;
-}
-
-static int clusterip_tg_check(const struct xt_tgchk_param *par)
-{
- struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
- const struct ipt_entry *e = par->entryinfo;
- struct clusterip_config *config;
- int ret, i;
-
- if (par->nft_compat) {
- pr_err("cannot use CLUSTERIP target from nftables compat\n");
- return -EOPNOTSUPP;
- }
-
- if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
- cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
- cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
- pr_info("unknown mode %u\n", cipinfo->hash_mode);
- return -EINVAL;
-
- }
- if (e->ip.dmsk.s_addr != htonl(0xffffffff) ||
- e->ip.dst.s_addr == 0) {
- pr_info("Please specify destination IP\n");
- return -EINVAL;
- }
- if (cipinfo->num_local_nodes > ARRAY_SIZE(cipinfo->local_nodes)) {
- pr_info("bad num_local_nodes %u\n", cipinfo->num_local_nodes);
- return -EINVAL;
- }
- for (i = 0; i < cipinfo->num_local_nodes; i++) {
- if (cipinfo->local_nodes[i] - 1 >=
- sizeof(config->local_nodes) * 8) {
- pr_info("bad local_nodes[%d] %u\n",
- i, cipinfo->local_nodes[i]);
- return -EINVAL;
- }
- }
-
- config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1);
- if (!config) {
- if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
- pr_info("no config found for %pI4, need 'new'\n",
- &e->ip.dst.s_addr);
- return -EINVAL;
- } else {
- struct net_device *dev;
-
- if (e->ip.iniface[0] == '\0') {
- pr_info("Please specify an interface name\n");
- return -EINVAL;
- }
-
- dev = dev_get_by_name(par->net, e->ip.iniface);
- if (!dev) {
- pr_info("no such interface %s\n",
- e->ip.iniface);
- return -ENOENT;
- }
- dev_put(dev);
-
- config = clusterip_config_init(par->net, cipinfo,
- e->ip.dst.s_addr,
- e->ip.iniface);
- if (IS_ERR(config))
- return PTR_ERR(config);
- }
- }
-
- ret = nf_ct_netns_get(par->net, par->family);
- if (ret < 0) {
- pr_info("cannot load conntrack support for proto=%u\n",
- par->family);
- clusterip_config_entry_put(par->net, config);
- clusterip_config_put(config);
- return ret;
- }
-
- if (!par->net->xt.clusterip_deprecated_warning) {
- pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, "
- "use xt_cluster instead\n");
- par->net->xt.clusterip_deprecated_warning = true;
- }
-
- cipinfo->config = config;
- return ret;
-}
-
-/* drop reference count of cluster config when rule is deleted */
-static void clusterip_tg_destroy(const struct xt_tgdtor_param *par)
-{
- const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo;
-
- /* if no more entries are referencing the config, remove it
- * from the list and destroy the proc entry */
- clusterip_config_entry_put(par->net, cipinfo->config);
-
- clusterip_config_put(cipinfo->config);
-
- nf_ct_netns_put(par->net, par->family);
-}
-
-#ifdef CONFIG_COMPAT
-struct compat_ipt_clusterip_tgt_info
-{
- u_int32_t flags;
- u_int8_t clustermac[6];
- u_int16_t num_total_nodes;
- u_int16_t num_local_nodes;
- u_int16_t local_nodes[CLUSTERIP_MAX_NODES];
- u_int32_t hash_mode;
- u_int32_t hash_initval;
- compat_uptr_t config;
-};
-#endif /* CONFIG_COMPAT */
-
-static struct xt_target clusterip_tg_reg __read_mostly = {
- .name = "CLUSTERIP",
- .family = NFPROTO_IPV4,
- .target = clusterip_tg,
- .checkentry = clusterip_tg_check,
- .destroy = clusterip_tg_destroy,
- .targetsize = sizeof(struct ipt_clusterip_tgt_info),
- .usersize = offsetof(struct ipt_clusterip_tgt_info, config),
-#ifdef CONFIG_COMPAT
- .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info),
-#endif /* CONFIG_COMPAT */
- .me = THIS_MODULE
-};
-
-
-/***********************************************************************
- * ARP MANGLING CODE
- ***********************************************************************/
-
-/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */
-struct arp_payload {
- u_int8_t src_hw[ETH_ALEN];
- __be32 src_ip;
- u_int8_t dst_hw[ETH_ALEN];
- __be32 dst_ip;
-} __packed;
-
-#ifdef DEBUG
-static void arp_print(struct arp_payload *payload)
-{
-#define HBUFFERLEN 30
- char hbuffer[HBUFFERLEN];
- int j, k;
-
- for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < ETH_ALEN; j++) {
- hbuffer[k++] = hex_asc_hi(payload->src_hw[j]);
- hbuffer[k++] = hex_asc_lo(payload->src_hw[j]);
- hbuffer[k++] = ':';
- }
- hbuffer[--k] = '\0';
-
- pr_debug("src %pI4@%s, dst %pI4\n",
- &payload->src_ip, hbuffer, &payload->dst_ip);
-}
-#endif
-
-static unsigned int
-arp_mangle(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct arphdr *arp = arp_hdr(skb);
- struct arp_payload *payload;
- struct clusterip_config *c;
- struct net *net = state->net;
-
- /* we don't care about non-ethernet and non-ipv4 ARP */
- if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
- arp->ar_pro != htons(ETH_P_IP) ||
- arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
- return NF_ACCEPT;
-
- /* we only want to mangle arp requests and replies */
- if (arp->ar_op != htons(ARPOP_REPLY) &&
- arp->ar_op != htons(ARPOP_REQUEST))
- return NF_ACCEPT;
-
- payload = (void *)(arp+1);
-
- /* if there is no clusterip configuration for the arp reply's
- * source ip, we don't want to mangle it */
- c = clusterip_config_find_get(net, payload->src_ip, 0);
- if (!c)
- return NF_ACCEPT;
-
- /* normally the linux kernel always replies to arp queries of
- * addresses on different interfacs. However, in the CLUSTERIP case
- * this wouldn't work, since we didn't subscribe the mcast group on
- * other interfaces */
- if (c->ifindex != state->out->ifindex) {
- pr_debug("not mangling arp reply on different interface: cip'%d'-skb'%d'\n",
- c->ifindex, state->out->ifindex);
- clusterip_config_put(c);
- return NF_ACCEPT;
- }
-
- /* mangle reply hardware address */
- memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
-
-#ifdef DEBUG
- pr_debug("mangled arp reply: ");
- arp_print(payload);
-#endif
-
- clusterip_config_put(c);
-
- return NF_ACCEPT;
-}
-
-static const struct nf_hook_ops cip_arp_ops = {
- .hook = arp_mangle,
- .pf = NFPROTO_ARP,
- .hooknum = NF_ARP_OUT,
- .priority = -1
-};
-
-/***********************************************************************
- * PROC DIR HANDLING
- ***********************************************************************/
-
-#ifdef CONFIG_PROC_FS
-
-struct clusterip_seq_position {
- unsigned int pos; /* position */
- unsigned int weight; /* number of bits set == size */
- unsigned int bit; /* current bit */
- unsigned long val; /* current value */
-};
-
-static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
-{
- struct clusterip_config *c = s->private;
- unsigned int weight;
- u_int32_t local_nodes;
- struct clusterip_seq_position *idx;
-
- /* FIXME: possible race */
- local_nodes = c->local_nodes;
- weight = hweight32(local_nodes);
- if (*pos >= weight)
- return NULL;
-
- idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL);
- if (!idx)
- return ERR_PTR(-ENOMEM);
-
- idx->pos = *pos;
- idx->weight = weight;
- idx->bit = ffs(local_nodes);
- idx->val = local_nodes;
- clear_bit(idx->bit - 1, &idx->val);
-
- return idx;
-}
-
-static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
- struct clusterip_seq_position *idx = v;
-
- *pos = ++idx->pos;
- if (*pos >= idx->weight) {
- kfree(v);
- return NULL;
- }
- idx->bit = ffs(idx->val);
- clear_bit(idx->bit - 1, &idx->val);
- return idx;
-}
-
-static void clusterip_seq_stop(struct seq_file *s, void *v)
-{
- if (!IS_ERR(v))
- kfree(v);
-}
-
-static int clusterip_seq_show(struct seq_file *s, void *v)
-{
- struct clusterip_seq_position *idx = v;
-
- if (idx->pos != 0)
- seq_putc(s, ',');
-
- seq_printf(s, "%u", idx->bit);
-
- if (idx->pos == idx->weight - 1)
- seq_putc(s, '\n');
-
- return 0;
-}
-
-static const struct seq_operations clusterip_seq_ops = {
- .start = clusterip_seq_start,
- .next = clusterip_seq_next,
- .stop = clusterip_seq_stop,
- .show = clusterip_seq_show,
-};
-
-static int clusterip_proc_open(struct inode *inode, struct file *file)
-{
- int ret = seq_open(file, &clusterip_seq_ops);
-
- if (!ret) {
- struct seq_file *sf = file->private_data;
- struct clusterip_config *c = PDE_DATA(inode);
-
- sf->private = c;
-
- clusterip_config_get(c);
- }
-
- return ret;
-}
-
-static int clusterip_proc_release(struct inode *inode, struct file *file)
-{
- struct clusterip_config *c = PDE_DATA(inode);
- int ret;
-
- ret = seq_release(inode, file);
-
- if (!ret)
- clusterip_config_put(c);
-
- return ret;
-}
-
-static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
- size_t size, loff_t *ofs)
-{
- struct clusterip_config *c = PDE_DATA(file_inode(file));
-#define PROC_WRITELEN 10
- char buffer[PROC_WRITELEN+1];
- unsigned long nodenum;
- int rc;
-
- if (size > PROC_WRITELEN)
- return -EIO;
- if (copy_from_user(buffer, input, size))
- return -EFAULT;
- buffer[size] = 0;
-
- if (*buffer == '+') {
- rc = kstrtoul(buffer+1, 10, &nodenum);
- if (rc)
- return rc;
- if (clusterip_add_node(c, nodenum))
- return -ENOMEM;
- } else if (*buffer == '-') {
- rc = kstrtoul(buffer+1, 10, &nodenum);
- if (rc)
- return rc;
- if (clusterip_del_node(c, nodenum))
- return -ENOENT;
- } else
- return -EIO;
-
- return size;
-}
-
-static const struct file_operations clusterip_proc_fops = {
- .open = clusterip_proc_open,
- .read = seq_read,
- .write = clusterip_proc_write,
- .llseek = seq_lseek,
- .release = clusterip_proc_release,
-};
-
-#endif /* CONFIG_PROC_FS */
-
-static int clusterip_net_init(struct net *net)
-{
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
- int ret;
-
- INIT_LIST_HEAD(&cn->configs);
-
- spin_lock_init(&cn->lock);
-
- ret = nf_register_net_hook(net, &cip_arp_ops);
- if (ret < 0)
- return ret;
-
-#ifdef CONFIG_PROC_FS
- cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
- if (!cn->procdir) {
- nf_unregister_net_hook(net, &cip_arp_ops);
- pr_err("Unable to proc dir entry\n");
- return -ENOMEM;
- }
-#endif /* CONFIG_PROC_FS */
-
- return 0;
-}
-
-static void clusterip_net_exit(struct net *net)
-{
- struct clusterip_net *cn = net_generic(net, clusterip_net_id);
-#ifdef CONFIG_PROC_FS
- proc_remove(cn->procdir);
- cn->procdir = NULL;
-#endif
- nf_unregister_net_hook(net, &cip_arp_ops);
- WARN_ON_ONCE(!list_empty(&cn->configs));
-}
-
-static struct pernet_operations clusterip_net_ops = {
- .init = clusterip_net_init,
- .exit = clusterip_net_exit,
- .id = &clusterip_net_id,
- .size = sizeof(struct clusterip_net),
-};
-
-static int __init clusterip_tg_init(void)
-{
- int ret;
-
- ret = register_pernet_subsys(&clusterip_net_ops);
- if (ret < 0)
- return ret;
-
- ret = xt_register_target(&clusterip_tg_reg);
- if (ret < 0)
- goto cleanup_subsys;
-
- pr_info("ClusterIP Version %s loaded successfully\n",
- CLUSTERIP_VERSION);
-
- return 0;
-
-cleanup_subsys:
- unregister_pernet_subsys(&clusterip_net_ops);
- return ret;
-}
-
-static void __exit clusterip_tg_exit(void)
-{
- pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
-
- xt_unregister_target(&clusterip_tg_reg);
- unregister_pernet_subsys(&clusterip_net_ops);
-
- /* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
- rcu_barrier_bh();
-}
-
-module_init(clusterip_tg_init);
-module_exit(clusterip_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index aaaf9a81fbc9..5930d3b02555 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* iptables module for the IPv4 and TCP ECN bits, Version 1.5
*
* (C) 2002 by Harald Welte <laforge@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/in.h>
@@ -32,7 +29,7 @@ set_ect_ip(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
if ((iph->tos & IPT_ECN_IP_MASK) != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
__u8 oldtos;
- if (!skb_make_writable(skb, sizeof(struct iphdr)))
+ if (skb_ensure_writable(skb, sizeof(struct iphdr)))
return false;
iph = ip_hdr(skb);
oldtos = iph->tos;
@@ -61,7 +58,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
tcph->cwr == einfo->proto.tcp.cwr))
return true;
- if (!skb_make_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
+ if (skb_ensure_writable(skb, ip_hdrlen(skb) + sizeof(*tcph)))
return false;
tcph = (void *)ip_hdr(skb) + ip_hdrlen(skb);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
deleted file mode 100644
index ce1512b02cb2..000000000000
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Masquerade. Simple mapping which alters range to a local IP address
- (depending on route). */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/types.h>
-#include <linux/inetdevice.h>
-#include <linux/ip.h>
-#include <linux/timer.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <net/protocol.h>
-#include <net/ip.h>
-#include <net/checksum.h>
-#include <net/route.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/ipv4/nf_nat_masquerade.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("Xtables: automatic-address SNAT");
-
-/* FIXME: Multiple targets. --RR */
-static int masquerade_tg_check(const struct xt_tgchk_param *par)
-{
- const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
-
- if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) {
- pr_debug("bad MAP_IPS.\n");
- return -EINVAL;
- }
- if (mr->rangesize != 1) {
- pr_debug("bad rangesize %u\n", mr->rangesize);
- return -EINVAL;
- }
- return nf_ct_netns_get(par->net, par->family);
-}
-
-static unsigned int
-masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par)
-{
- struct nf_nat_range2 range;
- const struct nf_nat_ipv4_multi_range_compat *mr;
-
- mr = par->targinfo;
- range.flags = mr->range[0].flags;
- range.min_proto = mr->range[0].min;
- range.max_proto = mr->range[0].max;
-
- return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range,
- xt_out(par));
-}
-
-static void masquerade_tg_destroy(const struct xt_tgdtor_param *par)
-{
- nf_ct_netns_put(par->net, par->family);
-}
-
-static struct xt_target masquerade_tg_reg __read_mostly = {
- .name = "MASQUERADE",
- .family = NFPROTO_IPV4,
- .target = masquerade_tg,
- .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat),
- .table = "nat",
- .hooks = 1 << NF_INET_POST_ROUTING,
- .checkentry = masquerade_tg_check,
- .destroy = masquerade_tg_destroy,
- .me = THIS_MODULE,
-};
-
-static int __init masquerade_tg_init(void)
-{
- int ret;
-
- ret = xt_register_target(&masquerade_tg_reg);
-
- if (ret == 0)
- nf_nat_masquerade_ipv4_register_notifier();
-
- return ret;
-}
-
-static void __exit masquerade_tg_exit(void)
-{
- xt_unregister_target(&masquerade_tg_reg);
- nf_nat_masquerade_ipv4_unregister_notifier();
-}
-
-module_init(masquerade_tg_init);
-module_exit(masquerade_tg_exit);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index e8bed3390e58..4b8840734762 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This is a module which is used for rejecting packets.
*/
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -59,7 +56,8 @@ reject_tg(struct sk_buff *skb, const struct xt_action_param *par)
nf_send_unreach(skb, ICMP_PKT_FILTERED, hook);
break;
case IPT_TCP_RESET:
- nf_send_reset(xt_net(par), skb, hook);
+ nf_send_reset(xt_net(par), par->state->sk, skb, hook);
+ break;
case IPT_ICMP_ECHOREPLY:
/* Doesn't happen. */
break;
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 690b17ef6a44..f2984c7eef40 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -1,263 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <net/tcp.h>
-
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_SYNPROXY.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/nf_conntrack_synproxy.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-
-static struct iphdr *
-synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
- __be32 daddr)
-{
- struct iphdr *iph;
-
- skb_reset_network_header(skb);
- iph = skb_put(skb, sizeof(*iph));
- iph->version = 4;
- iph->ihl = sizeof(*iph) / 4;
- iph->tos = 0;
- iph->id = 0;
- iph->frag_off = htons(IP_DF);
- iph->ttl = net->ipv4.sysctl_ip_default_ttl;
- iph->protocol = IPPROTO_TCP;
- iph->check = 0;
- iph->saddr = saddr;
- iph->daddr = daddr;
-
- return iph;
-}
-
-static void
-synproxy_send_tcp(struct net *net,
- const struct sk_buff *skb, struct sk_buff *nskb,
- struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
- struct iphdr *niph, struct tcphdr *nth,
- unsigned int tcp_hdr_size)
-{
- nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
- nskb->ip_summed = CHECKSUM_PARTIAL;
- nskb->csum_start = (unsigned char *)nth - nskb->head;
- nskb->csum_offset = offsetof(struct tcphdr, check);
-
- skb_dst_set_noref(nskb, skb_dst(skb));
- nskb->protocol = htons(ETH_P_IP);
- if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
- goto free_nskb;
-
- if (nfct) {
- nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
- nf_conntrack_get(nfct);
- }
-
- ip_local_out(net, nskb->sk, nskb);
- return;
-
-free_nskb:
- kfree_skb(nskb);
-}
-
-static void
-synproxy_send_client_synack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct iphdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
- u16 mss = opts->mss;
-
- iph = ip_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->dest;
- nth->dest = th->source;
- nth->seq = htonl(__cookie_v4_init_sequence(iph, th, &mss));
- nth->ack_seq = htonl(ntohl(th->seq) + 1);
- tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
- tcp_flag_word(nth) |= TCP_FLAG_ECE;
- nth->doff = tcp_hdr_size / 4;
- nth->window = 0;
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
- IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_server_syn(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts, u32 recv_seq)
-{
- struct synproxy_net *snet = synproxy_pernet(net);
- struct sk_buff *nskb;
- struct iphdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ip_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->source;
- nth->dest = th->dest;
- nth->seq = htonl(recv_seq - 1);
- /* ack_seq is used to relay our ISN to the synproxy hook to initialize
- * sequence number translation once a connection tracking entry exists.
- */
- nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
- tcp_flag_word(nth) = TCP_FLAG_SYN;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
- tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
- nth->doff = tcp_hdr_size / 4;
- nth->window = th->window;
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
- niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_server_ack(struct net *net,
- const struct ip_ct_tcp *state,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct iphdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ip_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->dest;
- nth->dest = th->source;
- nth->seq = htonl(ntohl(th->ack_seq));
- nth->ack_seq = htonl(ntohl(th->seq) + 1);
- tcp_flag_word(nth) = TCP_FLAG_ACK;
- nth->doff = tcp_hdr_size / 4;
- nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_client_ack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct iphdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ip_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->source;
- nth->dest = th->dest;
- nth->seq = htonl(ntohl(th->seq) + 1);
- nth->ack_seq = th->ack_seq;
- tcp_flag_word(nth) = TCP_FLAG_ACK;
- nth->doff = tcp_hdr_size / 4;
- nth->window = htons(ntohs(th->window) >> opts->wscale);
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
- IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-
-static bool
-synproxy_recv_client_ack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- struct synproxy_options *opts, u32 recv_seq)
-{
- struct synproxy_net *snet = synproxy_pernet(net);
- int mss;
-
- mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
- if (mss == 0) {
- this_cpu_inc(snet->stats->cookie_invalid);
- return false;
- }
-
- this_cpu_inc(snet->stats->cookie_valid);
- opts->mss = mss;
- opts->options |= XT_SYNPROXY_OPT_MSS;
-
- if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
- synproxy_check_timestamp_cookie(opts);
-
- synproxy_send_server_syn(net, skb, th, opts, recv_seq);
- return true;
-}
+#include <net/netfilter/nf_synproxy.h>
static unsigned int
synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
@@ -286,6 +36,8 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
opts.options |= XT_SYNPROXY_OPT_ECN;
opts.options &= info->options;
+ opts.mss_encode = opts.mss_option;
+ opts.mss_option = info->mss;
if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
synproxy_init_timestamp_cookie(info, &opts);
else
@@ -309,135 +61,6 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static unsigned int ipv4_synproxy_hook(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *nhs)
-{
- struct net *net = nhs->net;
- struct synproxy_net *snet = synproxy_pernet(net);
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct;
- struct nf_conn_synproxy *synproxy;
- struct synproxy_options opts = {};
- const struct ip_ct_tcp *state;
- struct tcphdr *th, _th;
- unsigned int thoff;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (ct == NULL)
- return NF_ACCEPT;
-
- synproxy = nfct_synproxy(ct);
- if (synproxy == NULL)
- return NF_ACCEPT;
-
- if (nf_is_loopback_packet(skb) ||
- ip_hdr(skb)->protocol != IPPROTO_TCP)
- return NF_ACCEPT;
-
- thoff = ip_hdrlen(skb);
- th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
- if (th == NULL)
- return NF_DROP;
-
- state = &ct->proto.tcp;
- switch (state->state) {
- case TCP_CONNTRACK_CLOSE:
- if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
- ntohl(th->seq) + 1);
- break;
- }
-
- if (!th->syn || th->ack ||
- CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
- break;
-
- /* Reopened connection - reset the sequence number and timestamp
- * adjustments, they will get initialized once the connection is
- * reestablished.
- */
- nf_ct_seqadj_init(ct, ctinfo, 0);
- synproxy->tsoff = 0;
- this_cpu_inc(snet->stats->conn_reopened);
-
- /* fall through */
- case TCP_CONNTRACK_SYN_SENT:
- if (!synproxy_parse_options(skb, thoff, th, &opts))
- return NF_DROP;
-
- if (!th->syn && th->ack &&
- CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
- /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
- * therefore we need to add 1 to make the SYN sequence
- * number match the one of first SYN.
- */
- if (synproxy_recv_client_ack(net, skb, th, &opts,
- ntohl(th->seq) + 1)) {
- this_cpu_inc(snet->stats->cookie_retrans);
- consume_skb(skb);
- return NF_STOLEN;
- } else {
- return NF_DROP;
- }
- }
-
- synproxy->isn = ntohl(th->ack_seq);
- if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
- synproxy->its = opts.tsecr;
-
- nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
- break;
- case TCP_CONNTRACK_SYN_RECV:
- if (!th->syn || !th->ack)
- break;
-
- if (!synproxy_parse_options(skb, thoff, th, &opts))
- return NF_DROP;
-
- if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) {
- synproxy->tsoff = opts.tsval - synproxy->its;
- nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
- }
-
- opts.options &= ~(XT_SYNPROXY_OPT_MSS |
- XT_SYNPROXY_OPT_WSCALE |
- XT_SYNPROXY_OPT_SACK_PERM);
-
- swap(opts.tsval, opts.tsecr);
- synproxy_send_server_ack(net, state, skb, th, &opts);
-
- nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
- nf_conntrack_event_cache(IPCT_SEQADJ, ct);
-
- swap(opts.tsval, opts.tsecr);
- synproxy_send_client_ack(net, skb, th, &opts);
-
- consume_skb(skb);
- return NF_STOLEN;
- default:
- break;
- }
-
- synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
- return NF_ACCEPT;
-}
-
-static const struct nf_hook_ops ipv4_synproxy_ops[] = {
- {
- .hook = ipv4_synproxy_hook,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
- },
- {
- .hook = ipv4_synproxy_hook,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
- },
-};
-
static int synproxy_tg4_check(const struct xt_tgchk_param *par)
{
struct synproxy_net *snet = synproxy_pernet(par->net);
@@ -452,16 +75,12 @@ static int synproxy_tg4_check(const struct xt_tgchk_param *par)
if (err)
return err;
- if (snet->hook_ref4 == 0) {
- err = nf_register_net_hooks(par->net, ipv4_synproxy_ops,
- ARRAY_SIZE(ipv4_synproxy_ops));
- if (err) {
- nf_ct_netns_put(par->net, par->family);
- return err;
- }
+ err = nf_synproxy_ipv4_init(snet, par->net);
+ if (err) {
+ nf_ct_netns_put(par->net, par->family);
+ return err;
}
- snet->hook_ref4++;
return err;
}
@@ -469,10 +88,7 @@ static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
{
struct synproxy_net *snet = synproxy_pernet(par->net);
- snet->hook_ref4--;
- if (snet->hook_ref4 == 0)
- nf_unregister_net_hooks(par->net, ipv4_synproxy_ops,
- ARRAY_SIZE(ipv4_synproxy_ops));
+ nf_synproxy_ipv4_fini(snet, par->net);
nf_ct_netns_put(par->net, par->family);
}
@@ -502,3 +118,4 @@ module_exit(synproxy_tg4_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Intercept TCP connections and establish them using syncookies");
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
index 7c6c20eaf4db..161ba412cb08 100644
--- a/net/ipv4/netfilter/ipt_ah.c
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match AH parameters. */
/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/in.h>
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index 12843c9ef142..6d9bf5106868 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2011 Florian Westphal <fw@strlen.de>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* based on fib_frontend.c; Author: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -12,6 +9,7 @@
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/ip.h>
+#include <net/flow.h>
#include <net/ip.h>
#include <net/ip_fib.h>
#include <net/route.h>
@@ -36,8 +34,6 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
const struct net_device *dev, u8 flags)
{
struct fib_result res;
- bool dev_match;
- int ret __maybe_unused;
if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE))
return false;
@@ -46,21 +42,7 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
if (res.type != RTN_LOCAL || !(flags & XT_RPFILTER_ACCEPT_LOCAL))
return false;
}
- dev_match = false;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- for (ret = 0; ret < res.fi->fib_nhs; ret++) {
- struct fib_nh *nh = &res.fi->fib_nh[ret];
-
- if (nh->nh_dev == dev) {
- dev_match = true;
- break;
- }
- }
-#else
- if (FIB_RES_DEV(res) == dev)
- dev_match = true;
-#endif
- return dev_match || flags & XT_RPFILTER_LOOSE;
+ return fib_info_nh_uses_dev(res.fi, dev) || flags & XT_RPFILTER_LOOSE;
}
static bool
@@ -94,8 +76,10 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
flow.daddr = iph->saddr;
flow.saddr = rpfilter_get_saddr(iph->daddr);
flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
- flow.flowi4_tos = RT_TOS(iph->tos);
+ flow.flowi4_dscp = ip4h_dscp(iph);
flow.flowi4_scope = RT_SCOPE_UNIVERSE;
+ flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par));
+ flow.flowi4_uid = sock_net_uid(xt_net(par), NULL);
return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert;
}
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 9ac92ea7b93c..3ab908b74795 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
*
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
* Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#include <linux/module.h>
@@ -23,7 +19,6 @@ MODULE_DESCRIPTION("iptables filter table");
#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT))
-static int __net_init iptable_filter_table_init(struct net *net);
static const struct xt_table packet_filter = {
.name = "filter",
@@ -31,82 +26,83 @@ static const struct xt_table packet_filter = {
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_FILTER,
- .table_init = iptable_filter_table_init,
};
-static unsigned int
-iptable_filter_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ipt_do_table(skb, state, state->net->ipv4.iptable_filter);
-}
-
static struct nf_hook_ops *filter_ops __read_mostly;
/* Default to forward because I got too much mail already. */
static bool forward __read_mostly = true;
module_param(forward, bool, 0000);
-static int __net_init iptable_filter_table_init(struct net *net)
+static int iptable_filter_table_init(struct net *net)
{
struct ipt_replace *repl;
int err;
- if (net->ipv4.iptable_filter)
- return 0;
-
repl = ipt_alloc_initial_table(&packet_filter);
if (repl == NULL)
return -ENOMEM;
/* Entry 1 is the FORWARD hook */
((struct ipt_standard *)repl->entries)[1].target.verdict =
- forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
+ forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
- err = ipt_register_table(net, &packet_filter, repl, filter_ops,
- &net->ipv4.iptable_filter);
+ err = ipt_register_table(net, &packet_filter, repl, filter_ops);
kfree(repl);
return err;
}
static int __net_init iptable_filter_net_init(struct net *net)
{
- if (net == &init_net || !forward)
+ if (!forward)
return iptable_filter_table_init(net);
return 0;
}
+static void __net_exit iptable_filter_net_pre_exit(struct net *net)
+{
+ ipt_unregister_table_pre_exit(net, "filter");
+}
+
static void __net_exit iptable_filter_net_exit(struct net *net)
{
- if (!net->ipv4.iptable_filter)
- return;
- ipt_unregister_table(net, net->ipv4.iptable_filter, filter_ops);
- net->ipv4.iptable_filter = NULL;
+ ipt_unregister_table_exit(net, "filter");
}
static struct pernet_operations iptable_filter_net_ops = {
.init = iptable_filter_net_init,
+ .pre_exit = iptable_filter_net_pre_exit,
.exit = iptable_filter_net_exit,
};
static int __init iptable_filter_init(void)
{
- int ret;
+ int ret = xt_register_template(&packet_filter,
+ iptable_filter_table_init);
+
+ if (ret < 0)
+ return ret;
- filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook);
- if (IS_ERR(filter_ops))
+ filter_ops = xt_hook_ops_alloc(&packet_filter, ipt_do_table);
+ if (IS_ERR(filter_ops)) {
+ xt_unregister_template(&packet_filter);
return PTR_ERR(filter_ops);
+ }
ret = register_pernet_subsys(&iptable_filter_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ xt_unregister_template(&packet_filter);
kfree(filter_ops);
+ return ret;
+ }
- return ret;
+ return 0;
}
static void __exit iptable_filter_fini(void)
{
unregister_pernet_subsys(&iptable_filter_net_ops);
+ xt_unregister_template(&packet_filter);
kfree(filter_ops);
}
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index dea138ca8925..385d945d8ebe 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
*
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
* Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_tables.h>
@@ -28,26 +25,23 @@ MODULE_DESCRIPTION("iptables mangle table");
(1 << NF_INET_LOCAL_OUT) | \
(1 << NF_INET_POST_ROUTING))
-static int __net_init iptable_mangle_table_init(struct net *net);
-
static const struct xt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_MANGLE,
- .table_init = iptable_mangle_table_init,
};
static unsigned int
-ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
+ipt_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
- unsigned int ret;
+ unsigned int ret, verdict;
const struct iphdr *iph;
- u_int8_t tos;
__be32 saddr, daddr;
- u_int32_t mark;
+ u32 mark;
int err;
+ u8 tos;
/* Save things which could affect route */
mark = skb->mark;
@@ -56,16 +50,17 @@ ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
daddr = iph->daddr;
tos = iph->tos;
- ret = ipt_do_table(skb, state, state->net->ipv4.iptable_mangle);
+ ret = ipt_do_table(priv, skb, state);
+ verdict = ret & NF_VERDICT_MASK;
/* Reroute for ANY change. */
- if (ret != NF_DROP && ret != NF_STOLEN) {
+ if (verdict != NF_DROP && verdict != NF_STOLEN) {
iph = ip_hdr(skb);
if (iph->saddr != saddr ||
iph->daddr != daddr ||
skb->mark != mark ||
iph->tos != tos) {
- err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
+ err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -81,68 +76,67 @@ iptable_mangle_hook(void *priv,
const struct nf_hook_state *state)
{
if (state->hook == NF_INET_LOCAL_OUT)
- return ipt_mangle_out(skb, state);
- return ipt_do_table(skb, state, state->net->ipv4.iptable_mangle);
+ return ipt_mangle_out(priv, skb, state);
+ return ipt_do_table(priv, skb, state);
}
static struct nf_hook_ops *mangle_ops __read_mostly;
-static int __net_init iptable_mangle_table_init(struct net *net)
+static int iptable_mangle_table_init(struct net *net)
{
struct ipt_replace *repl;
int ret;
- if (net->ipv4.iptable_mangle)
- return 0;
-
repl = ipt_alloc_initial_table(&packet_mangler);
if (repl == NULL)
return -ENOMEM;
- ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops,
- &net->ipv4.iptable_mangle);
+ ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops);
kfree(repl);
return ret;
}
+static void __net_exit iptable_mangle_net_pre_exit(struct net *net)
+{
+ ipt_unregister_table_pre_exit(net, "mangle");
+}
+
static void __net_exit iptable_mangle_net_exit(struct net *net)
{
- if (!net->ipv4.iptable_mangle)
- return;
- ipt_unregister_table(net, net->ipv4.iptable_mangle, mangle_ops);
- net->ipv4.iptable_mangle = NULL;
+ ipt_unregister_table_exit(net, "mangle");
}
static struct pernet_operations iptable_mangle_net_ops = {
+ .pre_exit = iptable_mangle_net_pre_exit,
.exit = iptable_mangle_net_exit,
};
static int __init iptable_mangle_init(void)
{
- int ret;
+ int ret = xt_register_template(&packet_mangler,
+ iptable_mangle_table_init);
+ if (ret < 0)
+ return ret;
mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook);
if (IS_ERR(mangle_ops)) {
+ xt_unregister_template(&packet_mangler);
ret = PTR_ERR(mangle_ops);
return ret;
}
ret = register_pernet_subsys(&iptable_mangle_net_ops);
if (ret < 0) {
+ xt_unregister_template(&packet_mangler);
kfree(mangle_ops);
return ret;
}
- ret = iptable_mangle_table_init(&init_net);
- if (ret) {
- unregister_pernet_subsys(&iptable_mangle_net_ops);
- kfree(mangle_ops);
- }
-
return ret;
}
static void __exit iptable_mangle_fini(void)
{
unregister_pernet_subsys(&iptable_mangle_net_ops);
+ xt_unregister_template(&packet_mangler);
kfree(mangle_ops);
}
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index a317445448bf..a5db7c67d61b 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
* (C) 2011 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -15,10 +12,12 @@
#include <net/ip.h>
#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-static int __net_init iptable_nat_table_init(struct net *net);
+struct iptable_nat_pernet {
+ struct nf_hook_ops *nf_nat_ops;
+};
+
+static unsigned int iptable_nat_net_id __read_mostly;
static const struct xt_table nf_nat_ipv4_table = {
.name = "nat",
@@ -28,37 +27,29 @@ static const struct xt_table nf_nat_ipv4_table = {
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
- .table_init = iptable_nat_table_init,
};
-static unsigned int iptable_nat_do_chain(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ipt_do_table(skb, state, state->net->ipv4.nat_table);
-}
-
static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
{
- .hook = iptable_nat_do_chain,
+ .hook = ipt_do_table,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP_PRI_NAT_DST,
},
{
- .hook = iptable_nat_do_chain,
+ .hook = ipt_do_table,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP_PRI_NAT_SRC,
},
{
- .hook = iptable_nat_do_chain,
+ .hook = ipt_do_table,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP_PRI_NAT_DST,
},
{
- .hook = iptable_nat_do_chain,
+ .hook = ipt_do_table,
.pf = NFPROTO_IPV4,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP_PRI_NAT_SRC,
@@ -67,85 +58,113 @@ static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
static int ipt_nat_register_lookups(struct net *net)
{
+ struct iptable_nat_pernet *xt_nat_net;
+ struct nf_hook_ops *ops;
+ struct xt_table *table;
int i, ret;
+ xt_nat_net = net_generic(net, iptable_nat_net_id);
+ table = xt_find_table(net, NFPROTO_IPV4, "nat");
+ if (WARN_ON_ONCE(!table))
+ return -ENOENT;
+
+ ops = kmemdup(nf_nat_ipv4_ops, sizeof(nf_nat_ipv4_ops), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++) {
- ret = nf_nat_l3proto_ipv4_register_fn(net, &nf_nat_ipv4_ops[i]);
+ ops[i].priv = table;
+ ret = nf_nat_ipv4_register_fn(net, &ops[i]);
if (ret) {
while (i)
- nf_nat_l3proto_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[--i]);
+ nf_nat_ipv4_unregister_fn(net, &ops[--i]);
+ kfree(ops);
return ret;
}
}
+ xt_nat_net->nf_nat_ops = ops;
return 0;
}
static void ipt_nat_unregister_lookups(struct net *net)
{
+ struct iptable_nat_pernet *xt_nat_net = net_generic(net, iptable_nat_net_id);
+ struct nf_hook_ops *ops = xt_nat_net->nf_nat_ops;
int i;
+ if (!ops)
+ return;
+
for (i = 0; i < ARRAY_SIZE(nf_nat_ipv4_ops); i++)
- nf_nat_l3proto_ipv4_unregister_fn(net, &nf_nat_ipv4_ops[i]);
+ nf_nat_ipv4_unregister_fn(net, &ops[i]);
+
+ kfree(ops);
}
-static int __net_init iptable_nat_table_init(struct net *net)
+static int iptable_nat_table_init(struct net *net)
{
struct ipt_replace *repl;
int ret;
- if (net->ipv4.nat_table)
- return 0;
-
repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
if (repl == NULL)
return -ENOMEM;
- ret = ipt_register_table(net, &nf_nat_ipv4_table, repl,
- NULL, &net->ipv4.nat_table);
+
+ ret = ipt_register_table(net, &nf_nat_ipv4_table, repl, NULL);
if (ret < 0) {
kfree(repl);
return ret;
}
ret = ipt_nat_register_lookups(net);
- if (ret < 0) {
- ipt_unregister_table(net, net->ipv4.nat_table, NULL);
- net->ipv4.nat_table = NULL;
- }
+ if (ret < 0)
+ ipt_unregister_table_exit(net, "nat");
kfree(repl);
return ret;
}
-static void __net_exit iptable_nat_net_exit(struct net *net)
+static void __net_exit iptable_nat_net_pre_exit(struct net *net)
{
- if (!net->ipv4.nat_table)
- return;
ipt_nat_unregister_lookups(net);
- ipt_unregister_table(net, net->ipv4.nat_table, NULL);
- net->ipv4.nat_table = NULL;
+}
+
+static void __net_exit iptable_nat_net_exit(struct net *net)
+{
+ ipt_unregister_table_exit(net, "nat");
}
static struct pernet_operations iptable_nat_net_ops = {
+ .pre_exit = iptable_nat_net_pre_exit,
.exit = iptable_nat_net_exit,
+ .id = &iptable_nat_net_id,
+ .size = sizeof(struct iptable_nat_pernet),
};
static int __init iptable_nat_init(void)
{
- int ret = register_pernet_subsys(&iptable_nat_net_ops);
+ int ret;
- if (ret)
+ /* net->gen->ptr[iptable_nat_net_id] must be allocated
+ * before calling iptable_nat_table_init().
+ */
+ ret = register_pernet_subsys(&iptable_nat_net_ops);
+ if (ret < 0)
return ret;
- ret = iptable_nat_table_init(&init_net);
- if (ret)
+ ret = xt_register_template(&nf_nat_ipv4_table,
+ iptable_nat_table_init);
+ if (ret < 0)
unregister_pernet_subsys(&iptable_nat_net_ops);
+
return ret;
}
static void __exit iptable_nat_exit(void)
{
+ xt_unregister_template(&nf_nat_ipv4_table);
unregister_pernet_subsys(&iptable_nat_net_ops);
}
@@ -153,3 +172,4 @@ module_init(iptable_nat_init);
module_exit(iptable_nat_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("iptables legacy nat table");
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 960625aabf04..0e7f53964d0a 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT .
*
- * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -11,8 +12,6 @@
#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
-static int __net_init iptable_raw_table_init(struct net *net);
-
static bool raw_before_defrag __read_mostly;
MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag");
module_param(raw_before_defrag, bool, 0000);
@@ -23,7 +22,6 @@ static const struct xt_table packet_raw = {
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_RAW,
- .table_init = iptable_raw_table_init,
};
static const struct xt_table packet_raw_before_defrag = {
@@ -32,20 +30,11 @@ static const struct xt_table packet_raw_before_defrag = {
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_RAW_BEFORE_DEFRAG,
- .table_init = iptable_raw_table_init,
};
-/* The work comes in here from netfilter.c. */
-static unsigned int
-iptable_raw_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ipt_do_table(skb, state, state->net->ipv4.iptable_raw);
-}
-
static struct nf_hook_ops *rawtable_ops __read_mostly;
-static int __net_init iptable_raw_table_init(struct net *net)
+static int iptable_raw_table_init(struct net *net)
{
struct ipt_replace *repl;
const struct xt_table *table = &packet_raw;
@@ -54,27 +43,26 @@ static int __net_init iptable_raw_table_init(struct net *net)
if (raw_before_defrag)
table = &packet_raw_before_defrag;
- if (net->ipv4.iptable_raw)
- return 0;
-
repl = ipt_alloc_initial_table(table);
if (repl == NULL)
return -ENOMEM;
- ret = ipt_register_table(net, table, repl, rawtable_ops,
- &net->ipv4.iptable_raw);
+ ret = ipt_register_table(net, table, repl, rawtable_ops);
kfree(repl);
return ret;
}
+static void __net_exit iptable_raw_net_pre_exit(struct net *net)
+{
+ ipt_unregister_table_pre_exit(net, "raw");
+}
+
static void __net_exit iptable_raw_net_exit(struct net *net)
{
- if (!net->ipv4.iptable_raw)
- return;
- ipt_unregister_table(net, net->ipv4.iptable_raw, rawtable_ops);
- net->ipv4.iptable_raw = NULL;
+ ipt_unregister_table_exit(net, "raw");
}
static struct pernet_operations iptable_raw_net_ops = {
+ .pre_exit = iptable_raw_net_pre_exit,
.exit = iptable_raw_net_exit,
};
@@ -89,22 +77,24 @@ static int __init iptable_raw_init(void)
pr_info("Enabling raw table before defrag\n");
}
- rawtable_ops = xt_hook_ops_alloc(table, iptable_raw_hook);
- if (IS_ERR(rawtable_ops))
+ ret = xt_register_template(table,
+ iptable_raw_table_init);
+ if (ret < 0)
+ return ret;
+
+ rawtable_ops = xt_hook_ops_alloc(table, ipt_do_table);
+ if (IS_ERR(rawtable_ops)) {
+ xt_unregister_template(table);
return PTR_ERR(rawtable_ops);
+ }
ret = register_pernet_subsys(&iptable_raw_net_ops);
if (ret < 0) {
+ xt_unregister_template(table);
kfree(rawtable_ops);
return ret;
}
- ret = iptable_raw_table_init(&init_net);
- if (ret) {
- unregister_pernet_subsys(&iptable_raw_net_ops);
- kfree(rawtable_ops);
- }
-
return ret;
}
@@ -112,8 +102,10 @@ static void __exit iptable_raw_fini(void)
{
unregister_pernet_subsys(&iptable_raw_net_ops);
kfree(rawtable_ops);
+ xt_unregister_template(&packet_raw);
}
module_init(iptable_raw_init);
module_exit(iptable_raw_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("iptables legacy raw table");
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index e5379fe57b64..d885443cb267 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* "security" table
*
@@ -10,10 +11,6 @@
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
* Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org>
* Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_tables.h>
@@ -28,76 +25,65 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules");
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT)
-static int __net_init iptable_security_table_init(struct net *net);
-
static const struct xt_table security_table = {
.name = "security",
.valid_hooks = SECURITY_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_SECURITY,
- .table_init = iptable_security_table_init,
};
-static unsigned int
-iptable_security_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ipt_do_table(skb, state, state->net->ipv4.iptable_security);
-}
-
static struct nf_hook_ops *sectbl_ops __read_mostly;
-static int __net_init iptable_security_table_init(struct net *net)
+static int iptable_security_table_init(struct net *net)
{
struct ipt_replace *repl;
int ret;
- if (net->ipv4.iptable_security)
- return 0;
-
repl = ipt_alloc_initial_table(&security_table);
if (repl == NULL)
return -ENOMEM;
- ret = ipt_register_table(net, &security_table, repl, sectbl_ops,
- &net->ipv4.iptable_security);
+ ret = ipt_register_table(net, &security_table, repl, sectbl_ops);
kfree(repl);
return ret;
}
-static void __net_exit iptable_security_net_exit(struct net *net)
+static void __net_exit iptable_security_net_pre_exit(struct net *net)
{
- if (!net->ipv4.iptable_security)
- return;
+ ipt_unregister_table_pre_exit(net, "security");
+}
- ipt_unregister_table(net, net->ipv4.iptable_security, sectbl_ops);
- net->ipv4.iptable_security = NULL;
+static void __net_exit iptable_security_net_exit(struct net *net)
+{
+ ipt_unregister_table_exit(net, "security");
}
static struct pernet_operations iptable_security_net_ops = {
+ .pre_exit = iptable_security_net_pre_exit,
.exit = iptable_security_net_exit,
};
static int __init iptable_security_init(void)
{
- int ret;
+ int ret = xt_register_template(&security_table,
+ iptable_security_table_init);
+
+ if (ret < 0)
+ return ret;
- sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook);
- if (IS_ERR(sectbl_ops))
+ sectbl_ops = xt_hook_ops_alloc(&security_table, ipt_do_table);
+ if (IS_ERR(sectbl_ops)) {
+ xt_unregister_template(&security_table);
return PTR_ERR(sectbl_ops);
+ }
ret = register_pernet_subsys(&iptable_security_net_ops);
if (ret < 0) {
+ xt_unregister_template(&security_table);
kfree(sectbl_ops);
return ret;
}
- ret = iptable_security_table_init(&init_net);
- if (ret) {
- unregister_pernet_subsys(&iptable_security_net_ops);
- kfree(sectbl_ops);
- }
-
return ret;
}
@@ -105,6 +91,7 @@ static void __exit iptable_security_fini(void)
{
unregister_pernet_subsys(&iptable_security_net_ops);
kfree(sectbl_ops);
+ xt_unregister_template(&security_table);
}
module_init(iptable_security_init);
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index a0d3ad60a411..482e733c3375 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -1,15 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/types.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/module.h>
+#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <net/netns/generic.h>
#include <net/route.h>
@@ -68,7 +66,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
struct sock *sk = skb->sk;
if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) &&
- inet_sk(sk)->nodefrag)
+ inet_test_bit(NODEFRAG, sk))
return NF_ACCEPT;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
@@ -109,24 +107,38 @@ static const struct nf_hook_ops ipv4_defrag_ops[] = {
static void __net_exit defrag4_net_exit(struct net *net)
{
- if (net->nf.defrag_ipv4) {
+ if (net->nf.defrag_ipv4_users) {
nf_unregister_net_hooks(net, ipv4_defrag_ops,
ARRAY_SIZE(ipv4_defrag_ops));
- net->nf.defrag_ipv4 = false;
+ net->nf.defrag_ipv4_users = 0;
}
}
+static const struct nf_defrag_hook defrag_hook = {
+ .owner = THIS_MODULE,
+ .enable = nf_defrag_ipv4_enable,
+ .disable = nf_defrag_ipv4_disable,
+};
+
static struct pernet_operations defrag4_net_ops = {
.exit = defrag4_net_exit,
};
static int __init nf_defrag_init(void)
{
- return register_pernet_subsys(&defrag4_net_ops);
+ int err;
+
+ err = register_pernet_subsys(&defrag4_net_ops);
+ if (err)
+ return err;
+
+ rcu_assign_pointer(nf_defrag_v4_hook, &defrag_hook);
+ return err;
}
static void __exit nf_defrag_fini(void)
{
+ rcu_assign_pointer(nf_defrag_v4_hook, NULL);
unregister_pernet_subsys(&defrag4_net_ops);
}
@@ -134,19 +146,21 @@ int nf_defrag_ipv4_enable(struct net *net)
{
int err = 0;
- might_sleep();
-
- if (net->nf.defrag_ipv4)
- return 0;
-
mutex_lock(&defrag4_mutex);
- if (net->nf.defrag_ipv4)
+ if (net->nf.defrag_ipv4_users == UINT_MAX) {
+ err = -EOVERFLOW;
goto out_unlock;
+ }
+
+ if (net->nf.defrag_ipv4_users) {
+ net->nf.defrag_ipv4_users++;
+ goto out_unlock;
+ }
err = nf_register_net_hooks(net, ipv4_defrag_ops,
ARRAY_SIZE(ipv4_defrag_ops));
if (err == 0)
- net->nf.defrag_ipv4 = true;
+ net->nf.defrag_ipv4_users = 1;
out_unlock:
mutex_unlock(&defrag4_mutex);
@@ -154,7 +168,22 @@ int nf_defrag_ipv4_enable(struct net *net)
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);
+void nf_defrag_ipv4_disable(struct net *net)
+{
+ mutex_lock(&defrag4_mutex);
+ if (net->nf.defrag_ipv4_users) {
+ net->nf.defrag_ipv4_users--;
+ if (net->nf.defrag_ipv4_users == 0)
+ nf_unregister_net_hooks(net, ipv4_defrag_ops,
+ ARRAY_SIZE(ipv4_defrag_ops));
+ }
+
+ mutex_unlock(&defrag4_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv4_disable);
+
module_init(nf_defrag_init);
module_exit(nf_defrag_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IPv4 defragmentation support");
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index 39895b9ddeb9..9a773502f10a 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* (C) 2007 by Sebastian Claßen <sebastian.classen@freenet.ag>
* (C) 2007-2010 by Jan Engelhardt <jengelh@medozas.de>
*
* Extracted from xt_TEE.c
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 or later, as
- * published by the Free Software Foundation.
*/
#include <linux/ip.h>
#include <linux/module.h>
@@ -15,6 +12,7 @@
#include <linux/skbuff.h>
#include <linux/netfilter.h>
#include <net/checksum.h>
+#include <net/flow.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/route.h>
@@ -35,7 +33,7 @@ static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb,
fl4.flowi4_oif = oif;
fl4.daddr = gw->s_addr;
- fl4.flowi4_tos = RT_TOS(iph->tos);
+ fl4.flowi4_dscp = ip4h_dscp(iph);
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH;
rt = ip_route_output_key(net, &fl4);
@@ -55,8 +53,9 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
{
struct iphdr *iph;
- if (this_cpu_read(nf_skb_duplicated))
- return;
+ local_bh_disable();
+ if (current->in_nf_duplicate)
+ goto out;
/*
* Copy the skb, and route the copy. Will later return %XT_CONTINUE for
* the original skb, which should continue on its way as if nothing has
@@ -64,11 +63,11 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
*/
skb = pskb_copy(skb, GFP_ATOMIC);
if (skb == NULL)
- return;
+ goto out;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* Avoid counting cloned packets towards the original connection. */
- nf_reset(skb);
+ nf_reset_ct(skb);
nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
#endif
/*
@@ -87,12 +86,14 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
--iph->ttl;
if (nf_dup_ipv4_route(net, skb, gw, oif)) {
- __this_cpu_write(nf_skb_duplicated, true);
+ current->in_nf_duplicate = true;
ip_local_out(net, skb->sk, skb);
- __this_cpu_write(nf_skb_duplicated, false);
+ current->in_nf_duplicate = false;
} else {
kfree_skb(skb);
}
+out:
+ local_bh_enable();
}
EXPORT_SYMBOL_GPL(nf_dup_ipv4);
diff --git a/net/ipv4/netfilter/nf_flow_table_ipv4.c b/net/ipv4/netfilter/nf_flow_table_ipv4.c
deleted file mode 100644
index e1e56d7123d2..000000000000
--- a/net/ipv4/netfilter/nf_flow_table_ipv4.c
+++ /dev/null
@@ -1,33 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_flow_table.h>
-#include <net/netfilter/nf_tables.h>
-
-static struct nf_flowtable_type flowtable_ipv4 = {
- .family = NFPROTO_IPV4,
- .init = nf_flow_table_init,
- .free = nf_flow_table_free,
- .hook = nf_flow_offload_ip_hook,
- .owner = THIS_MODULE,
-};
-
-static int __init nf_flow_ipv4_module_init(void)
-{
- nft_register_flowtable_type(&flowtable_ipv4);
-
- return 0;
-}
-
-static void __exit nf_flow_ipv4_module_exit(void)
-{
- nft_unregister_flowtable_type(&flowtable_ipv4);
-}
-
-module_init(nf_flow_ipv4_module_init);
-module_exit(nf_flow_ipv4_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_ALIAS_NF_FLOWTABLE(AF_INET);
diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c
deleted file mode 100644
index df5c2a2061a4..000000000000
--- a/net/ipv4/netfilter/nf_log_arp.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * (C) 2014 by Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * Based on code from ebt_log from:
- *
- * Bart De Schuymer <bdschuym@pandora.be>
- * Harald Welte <laforge@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/if_arp.h>
-#include <linux/ip.h>
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter/xt_LOG.h>
-#include <net/netfilter/nf_log.h>
-
-static const struct nf_loginfo default_loginfo = {
- .type = NF_LOG_TYPE_LOG,
- .u = {
- .log = {
- .level = LOGLEVEL_NOTICE,
- .logflags = NF_LOG_DEFAULT_MASK,
- },
- },
-};
-
-struct arppayload {
- unsigned char mac_src[ETH_ALEN];
- unsigned char ip_src[4];
- unsigned char mac_dst[ETH_ALEN];
- unsigned char ip_dst[4];
-};
-
-static void dump_arp_packet(struct nf_log_buf *m,
- const struct nf_loginfo *info,
- const struct sk_buff *skb, unsigned int nhoff)
-{
- const struct arphdr *ah;
- struct arphdr _arph;
- const struct arppayload *ap;
- struct arppayload _arpp;
-
- ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
- if (ah == NULL) {
- nf_log_buf_add(m, "TRUNCATED");
- return;
- }
- nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d",
- ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op));
-
- /* If it's for Ethernet and the lengths are OK, then log the ARP
- * payload.
- */
- if (ah->ar_hrd != htons(ARPHRD_ETHER) ||
- ah->ar_hln != ETH_ALEN ||
- ah->ar_pln != sizeof(__be32))
- return;
-
- ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp);
- if (ap == NULL) {
- nf_log_buf_add(m, " INCOMPLETE [%zu bytes]",
- skb->len - sizeof(_arph));
- return;
- }
- nf_log_buf_add(m, " MACSRC=%pM IPSRC=%pI4 MACDST=%pM IPDST=%pI4",
- ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
-}
-
-static void nf_log_arp_packet(struct net *net, u_int8_t pf,
- unsigned int hooknum, const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo,
- const char *prefix)
-{
- struct nf_log_buf *m;
-
- /* FIXME: Disabled from containers until syslog ns is supported */
- if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
- return;
-
- m = nf_log_buf_open();
-
- if (!loginfo)
- loginfo = &default_loginfo;
-
- nf_log_dump_packet_common(m, pf, hooknum, skb, in, out, loginfo,
- prefix);
- dump_arp_packet(m, loginfo, skb, 0);
-
- nf_log_buf_close(m);
-}
-
-static struct nf_logger nf_arp_logger __read_mostly = {
- .name = "nf_log_arp",
- .type = NF_LOG_TYPE_LOG,
- .logfn = nf_log_arp_packet,
- .me = THIS_MODULE,
-};
-
-static int __net_init nf_log_arp_net_init(struct net *net)
-{
- return nf_log_set(net, NFPROTO_ARP, &nf_arp_logger);
-}
-
-static void __net_exit nf_log_arp_net_exit(struct net *net)
-{
- nf_log_unset(net, &nf_arp_logger);
-}
-
-static struct pernet_operations nf_log_arp_net_ops = {
- .init = nf_log_arp_net_init,
- .exit = nf_log_arp_net_exit,
-};
-
-static int __init nf_log_arp_init(void)
-{
- int ret;
-
- ret = register_pernet_subsys(&nf_log_arp_net_ops);
- if (ret < 0)
- return ret;
-
- ret = nf_log_register(NFPROTO_ARP, &nf_arp_logger);
- if (ret < 0) {
- pr_err("failed to register logger\n");
- goto err1;
- }
-
- return 0;
-
-err1:
- unregister_pernet_subsys(&nf_log_arp_net_ops);
- return ret;
-}
-
-static void __exit nf_log_arp_exit(void)
-{
- unregister_pernet_subsys(&nf_log_arp_net_ops);
- nf_log_unregister(&nf_arp_logger);
-}
-
-module_init(nf_log_arp_init);
-module_exit(nf_log_arp_exit);
-
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_DESCRIPTION("Netfilter ARP packet logging");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NF_LOGGER(3, 0);
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c
deleted file mode 100644
index 1e6f28c97d3a..000000000000
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ /dev/null
@@ -1,396 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/if_arp.h>
-#include <linux/ip.h>
-#include <net/ipv6.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/tcp.h>
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter/xt_LOG.h>
-#include <net/netfilter/nf_log.h>
-
-static const struct nf_loginfo default_loginfo = {
- .type = NF_LOG_TYPE_LOG,
- .u = {
- .log = {
- .level = LOGLEVEL_NOTICE,
- .logflags = NF_LOG_DEFAULT_MASK,
- },
- },
-};
-
-/* One level of recursion won't kill us */
-static void dump_ipv4_packet(struct net *net, struct nf_log_buf *m,
- const struct nf_loginfo *info,
- const struct sk_buff *skb, unsigned int iphoff)
-{
- struct iphdr _iph;
- const struct iphdr *ih;
- unsigned int logflags;
-
- if (info->type == NF_LOG_TYPE_LOG)
- logflags = info->u.log.logflags;
- else
- logflags = NF_LOG_DEFAULT_MASK;
-
- ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
- if (ih == NULL) {
- nf_log_buf_add(m, "TRUNCATED");
- return;
- }
-
- /* Important fields:
- * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
- /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
- nf_log_buf_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr);
-
- /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
- nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
- ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
- ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
-
- /* Max length: 6 "CE DF MF " */
- if (ntohs(ih->frag_off) & IP_CE)
- nf_log_buf_add(m, "CE ");
- if (ntohs(ih->frag_off) & IP_DF)
- nf_log_buf_add(m, "DF ");
- if (ntohs(ih->frag_off) & IP_MF)
- nf_log_buf_add(m, "MF ");
-
- /* Max length: 11 "FRAG:65535 " */
- if (ntohs(ih->frag_off) & IP_OFFSET)
- nf_log_buf_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
-
- if ((logflags & NF_LOG_IPOPT) &&
- ih->ihl * 4 > sizeof(struct iphdr)) {
- const unsigned char *op;
- unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
- unsigned int i, optsize;
-
- optsize = ih->ihl * 4 - sizeof(struct iphdr);
- op = skb_header_pointer(skb, iphoff+sizeof(_iph),
- optsize, _opt);
- if (op == NULL) {
- nf_log_buf_add(m, "TRUNCATED");
- return;
- }
-
- /* Max length: 127 "OPT (" 15*4*2chars ") " */
- nf_log_buf_add(m, "OPT (");
- for (i = 0; i < optsize; i++)
- nf_log_buf_add(m, "%02X", op[i]);
- nf_log_buf_add(m, ") ");
- }
-
- switch (ih->protocol) {
- case IPPROTO_TCP:
- if (nf_log_dump_tcp_header(m, skb, ih->protocol,
- ntohs(ih->frag_off) & IP_OFFSET,
- iphoff+ih->ihl*4, logflags))
- return;
- break;
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE:
- if (nf_log_dump_udp_header(m, skb, ih->protocol,
- ntohs(ih->frag_off) & IP_OFFSET,
- iphoff+ih->ihl*4))
- return;
- break;
- case IPPROTO_ICMP: {
- struct icmphdr _icmph;
- const struct icmphdr *ich;
- static const size_t required_len[NR_ICMP_TYPES+1]
- = { [ICMP_ECHOREPLY] = 4,
- [ICMP_DEST_UNREACH]
- = 8 + sizeof(struct iphdr),
- [ICMP_SOURCE_QUENCH]
- = 8 + sizeof(struct iphdr),
- [ICMP_REDIRECT]
- = 8 + sizeof(struct iphdr),
- [ICMP_ECHO] = 4,
- [ICMP_TIME_EXCEEDED]
- = 8 + sizeof(struct iphdr),
- [ICMP_PARAMETERPROB]
- = 8 + sizeof(struct iphdr),
- [ICMP_TIMESTAMP] = 20,
- [ICMP_TIMESTAMPREPLY] = 20,
- [ICMP_ADDRESS] = 12,
- [ICMP_ADDRESSREPLY] = 12 };
-
- /* Max length: 11 "PROTO=ICMP " */
- nf_log_buf_add(m, "PROTO=ICMP ");
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
- sizeof(_icmph), &_icmph);
- if (ich == NULL) {
- nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Max length: 18 "TYPE=255 CODE=255 " */
- nf_log_buf_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- if (ich->type <= NR_ICMP_TYPES &&
- required_len[ich->type] &&
- skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
- nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- switch (ich->type) {
- case ICMP_ECHOREPLY:
- case ICMP_ECHO:
- /* Max length: 19 "ID=65535 SEQ=65535 " */
- nf_log_buf_add(m, "ID=%u SEQ=%u ",
- ntohs(ich->un.echo.id),
- ntohs(ich->un.echo.sequence));
- break;
-
- case ICMP_PARAMETERPROB:
- /* Max length: 14 "PARAMETER=255 " */
- nf_log_buf_add(m, "PARAMETER=%u ",
- ntohl(ich->un.gateway) >> 24);
- break;
- case ICMP_REDIRECT:
- /* Max length: 24 "GATEWAY=255.255.255.255 " */
- nf_log_buf_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
- /* Fall through */
- case ICMP_DEST_UNREACH:
- case ICMP_SOURCE_QUENCH:
- case ICMP_TIME_EXCEEDED:
- /* Max length: 3+maxlen */
- if (!iphoff) { /* Only recurse once. */
- nf_log_buf_add(m, "[");
- dump_ipv4_packet(net, m, info, skb,
- iphoff + ih->ihl*4+sizeof(_icmph));
- nf_log_buf_add(m, "] ");
- }
-
- /* Max length: 10 "MTU=65535 " */
- if (ich->type == ICMP_DEST_UNREACH &&
- ich->code == ICMP_FRAG_NEEDED) {
- nf_log_buf_add(m, "MTU=%u ",
- ntohs(ich->un.frag.mtu));
- }
- }
- break;
- }
- /* Max Length */
- case IPPROTO_AH: {
- struct ip_auth_hdr _ahdr;
- const struct ip_auth_hdr *ah;
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 9 "PROTO=AH " */
- nf_log_buf_add(m, "PROTO=AH ");
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
- sizeof(_ahdr), &_ahdr);
- if (ah == NULL) {
- nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Length: 15 "SPI=0xF1234567 " */
- nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
- break;
- }
- case IPPROTO_ESP: {
- struct ip_esp_hdr _esph;
- const struct ip_esp_hdr *eh;
-
- /* Max length: 10 "PROTO=ESP " */
- nf_log_buf_add(m, "PROTO=ESP ");
-
- if (ntohs(ih->frag_off) & IP_OFFSET)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
- sizeof(_esph), &_esph);
- if (eh == NULL) {
- nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - iphoff - ih->ihl*4);
- break;
- }
-
- /* Length: 15 "SPI=0xF1234567 " */
- nf_log_buf_add(m, "SPI=0x%x ", ntohl(eh->spi));
- break;
- }
- /* Max length: 10 "PROTO 255 " */
- default:
- nf_log_buf_add(m, "PROTO=%u ", ih->protocol);
- }
-
- /* Max length: 15 "UID=4294967295 " */
- if ((logflags & NF_LOG_UID) && !iphoff)
- nf_log_dump_sk_uid_gid(net, m, skb->sk);
-
- /* Max length: 16 "MARK=0xFFFFFFFF " */
- if (!iphoff && skb->mark)
- nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
-
- /* Proto Max log string length */
- /* IP: 40+46+6+11+127 = 230 */
- /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */
- /* UDP: 10+max(25,20) = 35 */
- /* UDPLITE: 14+max(25,20) = 39 */
- /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
- /* ESP: 10+max(25)+15 = 50 */
- /* AH: 9+max(25)+15 = 49 */
- /* unknown: 10 */
-
- /* (ICMP allows recursion one level deep) */
- /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */
- /* maxlen = 230+ 91 + 230 + 252 = 803 */
-}
-
-static void dump_ipv4_mac_header(struct nf_log_buf *m,
- const struct nf_loginfo *info,
- const struct sk_buff *skb)
-{
- struct net_device *dev = skb->dev;
- unsigned int logflags = 0;
-
- if (info->type == NF_LOG_TYPE_LOG)
- logflags = info->u.log.logflags;
-
- if (!(logflags & NF_LOG_MACDECODE))
- goto fallback;
-
- switch (dev->type) {
- case ARPHRD_ETHER:
- nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
- eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
- ntohs(eth_hdr(skb)->h_proto));
- return;
- default:
- break;
- }
-
-fallback:
- nf_log_buf_add(m, "MAC=");
- if (dev->hard_header_len &&
- skb->mac_header != skb->network_header) {
- const unsigned char *p = skb_mac_header(skb);
- unsigned int i;
-
- nf_log_buf_add(m, "%02x", *p++);
- for (i = 1; i < dev->hard_header_len; i++, p++)
- nf_log_buf_add(m, ":%02x", *p);
- }
- nf_log_buf_add(m, " ");
-}
-
-static void nf_log_ip_packet(struct net *net, u_int8_t pf,
- unsigned int hooknum, const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo,
- const char *prefix)
-{
- struct nf_log_buf *m;
-
- /* FIXME: Disabled from containers until syslog ns is supported */
- if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
- return;
-
- m = nf_log_buf_open();
-
- if (!loginfo)
- loginfo = &default_loginfo;
-
- nf_log_dump_packet_common(m, pf, hooknum, skb, in,
- out, loginfo, prefix);
-
- if (in != NULL)
- dump_ipv4_mac_header(m, loginfo, skb);
-
- dump_ipv4_packet(net, m, loginfo, skb, 0);
-
- nf_log_buf_close(m);
-}
-
-static struct nf_logger nf_ip_logger __read_mostly = {
- .name = "nf_log_ipv4",
- .type = NF_LOG_TYPE_LOG,
- .logfn = nf_log_ip_packet,
- .me = THIS_MODULE,
-};
-
-static int __net_init nf_log_ipv4_net_init(struct net *net)
-{
- return nf_log_set(net, NFPROTO_IPV4, &nf_ip_logger);
-}
-
-static void __net_exit nf_log_ipv4_net_exit(struct net *net)
-{
- nf_log_unset(net, &nf_ip_logger);
-}
-
-static struct pernet_operations nf_log_ipv4_net_ops = {
- .init = nf_log_ipv4_net_init,
- .exit = nf_log_ipv4_net_exit,
-};
-
-static int __init nf_log_ipv4_init(void)
-{
- int ret;
-
- ret = register_pernet_subsys(&nf_log_ipv4_net_ops);
- if (ret < 0)
- return ret;
-
- ret = nf_log_register(NFPROTO_IPV4, &nf_ip_logger);
- if (ret < 0) {
- pr_err("failed to register logger\n");
- goto err1;
- }
-
- return 0;
-
-err1:
- unregister_pernet_subsys(&nf_log_ipv4_net_ops);
- return ret;
-}
-
-static void __exit nf_log_ipv4_exit(void)
-{
- unregister_pernet_subsys(&nf_log_ipv4_net_ops);
- nf_log_unregister(&nf_ip_logger);
-}
-
-module_init(nf_log_ipv4_init);
-module_exit(nf_log_ipv4_exit);
-
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("Netfilter IPv4 packet logging");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NF_LOGGER(AF_INET, 0);
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 4e6b53ab6c33..faee20af4856 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -1,13 +1,12 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* H.323 extension for NAT alteration.
*
* Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net>
* Copyright (c) 2006-2012 Patrick McHardy <kaber@trash.net>
*
- * This source code is licensed under General Public License version 2.
- *
* Based on the 'brute force' H.323 NAT module by
- * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Jozsef Kadlecsik <kadlec@netfilter.org>
*/
#include <linux/module.h>
@@ -59,7 +58,7 @@ static int set_addr(struct sk_buff *skb, unsigned int protoff,
net_notice_ratelimited("nf_nat_h323: nf_nat_mangle_udp_packet error\n");
return -1;
}
- /* nf_nat_mangle_udp_packet uses skb_make_writable() to copy
+ /* nf_nat_mangle_udp_packet uses skb_ensure_writable() to copy
* or pull everything in a linear buffer, so we can safely
* use the skb pointers now */
*data = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr);
@@ -222,11 +221,11 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
int ret;
rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
- ret = nf_ct_expect_related(rtp_exp);
+ ret = nf_ct_expect_related(rtp_exp, 0);
if (ret == 0) {
rtcp_exp->tuple.dst.u.udp.port =
htons(nated_port + 1);
- ret = nf_ct_expect_related(rtcp_exp);
+ ret = nf_ct_expect_related(rtcp_exp, 0);
if (ret == 0)
break;
else if (ret == -EBUSY) {
@@ -292,20 +291,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
exp->expectfn = nf_nat_follow_master;
exp->dir = !dir;
- /* Try to get same port: if not, try to change it. */
- for (; nated_port != 0; nated_port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(nated_port);
- ret = nf_ct_expect_related(exp);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- nated_port = 0;
- break;
- }
- }
-
+ nated_port = nf_nat_exp_find_port(exp, nated_port);
if (nated_port == 0) { /* No port available */
net_notice_ratelimited("nf_nat_h323: out of TCP ports\n");
return 0;
@@ -348,20 +334,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
if (info->sig_port[dir] == port)
nated_port = ntohs(info->sig_port[!dir]);
- /* Try to get same port: if not, try to change it. */
- for (; nated_port != 0; nated_port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(nated_port);
- ret = nf_ct_expect_related(exp);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- nated_port = 0;
- break;
- }
- }
-
+ nated_port = nf_nat_exp_find_port(exp, nated_port);
if (nated_port == 0) { /* No port available */
net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
return 0;
@@ -440,20 +413,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
if (info->sig_port[dir] == port)
nated_port = ntohs(info->sig_port[!dir]);
- /* Try to get same port: if not, try to change it. */
- for (; nated_port != 0; nated_port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(nated_port);
- ret = nf_ct_expect_related(exp);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- nated_port = 0;
- break;
- }
- }
-
+ nated_port = nf_nat_exp_find_port(exp, nated_port);
if (nated_port == 0) { /* No port available */
net_notice_ratelimited("nf_nat_ras: out of TCP ports\n");
return 0;
@@ -533,20 +493,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
exp->expectfn = ip_nat_callforwarding_expect;
exp->dir = !dir;
- /* Try to get same port: if not, try to change it. */
- for (nated_port = ntohs(port); nated_port != 0; nated_port++) {
- int ret;
-
- exp->tuple.dst.u.tcp.port = htons(nated_port);
- ret = nf_ct_expect_related(exp);
- if (ret == 0)
- break;
- else if (ret != -EBUSY) {
- nated_port = 0;
- break;
- }
- }
-
+ nated_port = nf_nat_exp_find_port(exp, ntohs(port));
if (nated_port == 0) { /* No port available */
net_notice_ratelimited("nf_nat_q931: out of TCP ports\n");
return 0;
@@ -580,55 +527,41 @@ static struct nf_ct_helper_expectfn callforwarding_nat = {
.expectfn = ip_nat_callforwarding_expect,
};
+static const struct nfct_h323_nat_hooks nathooks = {
+ .set_h245_addr = set_h245_addr,
+ .set_h225_addr = set_h225_addr,
+ .set_sig_addr = set_sig_addr,
+ .set_ras_addr = set_ras_addr,
+ .nat_rtp_rtcp = nat_rtp_rtcp,
+ .nat_t120 = nat_t120,
+ .nat_h245 = nat_h245,
+ .nat_callforwarding = nat_callforwarding,
+ .nat_q931 = nat_q931,
+};
+
/****************************************************************************/
-static int __init init(void)
+static int __init nf_nat_h323_init(void)
{
- BUG_ON(set_h245_addr_hook != NULL);
- BUG_ON(set_h225_addr_hook != NULL);
- BUG_ON(set_sig_addr_hook != NULL);
- BUG_ON(set_ras_addr_hook != NULL);
- BUG_ON(nat_rtp_rtcp_hook != NULL);
- BUG_ON(nat_t120_hook != NULL);
- BUG_ON(nat_h245_hook != NULL);
- BUG_ON(nat_callforwarding_hook != NULL);
- BUG_ON(nat_q931_hook != NULL);
-
- RCU_INIT_POINTER(set_h245_addr_hook, set_h245_addr);
- RCU_INIT_POINTER(set_h225_addr_hook, set_h225_addr);
- RCU_INIT_POINTER(set_sig_addr_hook, set_sig_addr);
- RCU_INIT_POINTER(set_ras_addr_hook, set_ras_addr);
- RCU_INIT_POINTER(nat_rtp_rtcp_hook, nat_rtp_rtcp);
- RCU_INIT_POINTER(nat_t120_hook, nat_t120);
- RCU_INIT_POINTER(nat_h245_hook, nat_h245);
- RCU_INIT_POINTER(nat_callforwarding_hook, nat_callforwarding);
- RCU_INIT_POINTER(nat_q931_hook, nat_q931);
+ RCU_INIT_POINTER(nfct_h323_nat_hook, &nathooks);
nf_ct_helper_expectfn_register(&q931_nat);
nf_ct_helper_expectfn_register(&callforwarding_nat);
return 0;
}
/****************************************************************************/
-static void __exit fini(void)
+static void __exit nf_nat_h323_fini(void)
{
- RCU_INIT_POINTER(set_h245_addr_hook, NULL);
- RCU_INIT_POINTER(set_h225_addr_hook, NULL);
- RCU_INIT_POINTER(set_sig_addr_hook, NULL);
- RCU_INIT_POINTER(set_ras_addr_hook, NULL);
- RCU_INIT_POINTER(nat_rtp_rtcp_hook, NULL);
- RCU_INIT_POINTER(nat_t120_hook, NULL);
- RCU_INIT_POINTER(nat_h245_hook, NULL);
- RCU_INIT_POINTER(nat_callforwarding_hook, NULL);
- RCU_INIT_POINTER(nat_q931_hook, NULL);
+ RCU_INIT_POINTER(nfct_h323_nat_hook, NULL);
nf_ct_helper_expectfn_unregister(&q931_nat);
nf_ct_helper_expectfn_unregister(&callforwarding_nat);
synchronize_rcu();
}
/****************************************************************************/
-module_init(init);
-module_exit(fini);
+module_init(nf_nat_h323_init);
+module_exit(nf_nat_h323_fini);
MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>");
MODULE_DESCRIPTION("H.323 NAT helper");
MODULE_LICENSE("GPL");
-MODULE_ALIAS("ip_nat_h323");
+MODULE_ALIAS_NF_NAT_HELPER("h323");
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
deleted file mode 100644
index 6115bf1ff6f0..000000000000
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ /dev/null
@@ -1,421 +0,0 @@
-/*
- * (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2011 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/icmp.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <net/secure_seq.h>
-#include <net/checksum.h>
-#include <net/route.h>
-#include <net/ip.h>
-
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-
-static const struct nf_nat_l3proto nf_nat_l3proto_ipv4;
-
-#ifdef CONFIG_XFRM
-static void nf_nat_ipv4_decode_session(struct sk_buff *skb,
- const struct nf_conn *ct,
- enum ip_conntrack_dir dir,
- unsigned long statusbit,
- struct flowi *fl)
-{
- const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
- struct flowi4 *fl4 = &fl->u.ip4;
-
- if (ct->status & statusbit) {
- fl4->daddr = t->dst.u3.ip;
- if (t->dst.protonum == IPPROTO_TCP ||
- t->dst.protonum == IPPROTO_UDP ||
- t->dst.protonum == IPPROTO_UDPLITE ||
- t->dst.protonum == IPPROTO_DCCP ||
- t->dst.protonum == IPPROTO_SCTP)
- fl4->fl4_dport = t->dst.u.all;
- }
-
- statusbit ^= IPS_NAT_MASK;
-
- if (ct->status & statusbit) {
- fl4->saddr = t->src.u3.ip;
- if (t->dst.protonum == IPPROTO_TCP ||
- t->dst.protonum == IPPROTO_UDP ||
- t->dst.protonum == IPPROTO_UDPLITE ||
- t->dst.protonum == IPPROTO_DCCP ||
- t->dst.protonum == IPPROTO_SCTP)
- fl4->fl4_sport = t->src.u.all;
- }
-}
-#endif /* CONFIG_XFRM */
-
-static bool nf_nat_ipv4_in_range(const struct nf_conntrack_tuple *t,
- const struct nf_nat_range2 *range)
-{
- return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) &&
- ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip);
-}
-
-static u32 nf_nat_ipv4_secure_port(const struct nf_conntrack_tuple *t,
- __be16 dport)
-{
- return secure_ipv4_port_ephemeral(t->src.u3.ip, t->dst.u3.ip, dport);
-}
-
-static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb,
- unsigned int iphdroff,
- const struct nf_nat_l4proto *l4proto,
- const struct nf_conntrack_tuple *target,
- enum nf_nat_manip_type maniptype)
-{
- struct iphdr *iph;
- unsigned int hdroff;
-
- if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
- return false;
-
- iph = (void *)skb->data + iphdroff;
- hdroff = iphdroff + iph->ihl * 4;
-
- if (!l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv4, iphdroff, hdroff,
- target, maniptype))
- return false;
- iph = (void *)skb->data + iphdroff;
-
- if (maniptype == NF_NAT_MANIP_SRC) {
- csum_replace4(&iph->check, iph->saddr, target->src.u3.ip);
- iph->saddr = target->src.u3.ip;
- } else {
- csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip);
- iph->daddr = target->dst.u3.ip;
- }
- return true;
-}
-
-static void nf_nat_ipv4_csum_update(struct sk_buff *skb,
- unsigned int iphdroff, __sum16 *check,
- const struct nf_conntrack_tuple *t,
- enum nf_nat_manip_type maniptype)
-{
- struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff);
- __be32 oldip, newip;
-
- if (maniptype == NF_NAT_MANIP_SRC) {
- oldip = iph->saddr;
- newip = t->src.u3.ip;
- } else {
- oldip = iph->daddr;
- newip = t->dst.u3.ip;
- }
- inet_proto_csum_replace4(check, skb, oldip, newip, true);
-}
-
-static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
- u8 proto, void *data, __sum16 *check,
- int datalen, int oldlen)
-{
- if (skb->ip_summed != CHECKSUM_PARTIAL) {
- const struct iphdr *iph = ip_hdr(skb);
-
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
- ip_hdrlen(skb);
- skb->csum_offset = (void *)check - data;
- *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen,
- proto, 0);
- } else
- inet_proto_csum_replace2(check, skb,
- htons(oldlen), htons(datalen), true);
-}
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[],
- struct nf_nat_range2 *range)
-{
- if (tb[CTA_NAT_V4_MINIP]) {
- range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]);
- range->flags |= NF_NAT_RANGE_MAP_IPS;
- }
-
- if (tb[CTA_NAT_V4_MAXIP])
- range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]);
- else
- range->max_addr.ip = range->min_addr.ip;
-
- return 0;
-}
-#endif
-
-static const struct nf_nat_l3proto nf_nat_l3proto_ipv4 = {
- .l3proto = NFPROTO_IPV4,
- .in_range = nf_nat_ipv4_in_range,
- .secure_port = nf_nat_ipv4_secure_port,
- .manip_pkt = nf_nat_ipv4_manip_pkt,
- .csum_update = nf_nat_ipv4_csum_update,
- .csum_recalc = nf_nat_ipv4_csum_recalc,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_ipv4_nlattr_to_range,
-#endif
-#ifdef CONFIG_XFRM
- .decode_session = nf_nat_ipv4_decode_session,
-#endif
-};
-
-int nf_nat_icmp_reply_translation(struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int hooknum)
-{
- struct {
- struct icmphdr icmp;
- struct iphdr ip;
- } *inside;
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
- unsigned int hdrlen = ip_hdrlen(skb);
- const struct nf_nat_l4proto *l4proto;
- struct nf_conntrack_tuple target;
- unsigned long statusbit;
-
- WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY);
-
- if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
- return 0;
- if (nf_ip_checksum(skb, hooknum, hdrlen, 0))
- return 0;
-
- inside = (void *)skb->data + hdrlen;
- if (inside->icmp.type == ICMP_REDIRECT) {
- if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
- return 0;
- if (ct->status & IPS_NAT_MASK)
- return 0;
- }
-
- if (manip == NF_NAT_MANIP_SRC)
- statusbit = IPS_SRC_NAT;
- else
- statusbit = IPS_DST_NAT;
-
- /* Invert if this is reply direction */
- if (dir == IP_CT_DIR_REPLY)
- statusbit ^= IPS_NAT_MASK;
-
- if (!(ct->status & statusbit))
- return 1;
-
- l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, inside->ip.protocol);
- if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp),
- l4proto, &ct->tuplehash[!dir].tuple, !manip))
- return 0;
-
- if (skb->ip_summed != CHECKSUM_PARTIAL) {
- /* Reloading "inside" here since manip_pkt may reallocate */
- inside = (void *)skb->data + hdrlen;
- inside->icmp.checksum = 0;
- inside->icmp.checksum =
- csum_fold(skb_checksum(skb, hdrlen,
- skb->len - hdrlen, 0));
- }
-
- /* Change outer to look like the reply to an incoming packet */
- nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
- l4proto = __nf_nat_l4proto_find(NFPROTO_IPV4, 0);
- if (!nf_nat_ipv4_manip_pkt(skb, 0, l4proto, &target, manip))
- return 0;
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation);
-
-static unsigned int
-nf_nat_ipv4_fn(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct)
- return NF_ACCEPT;
-
- if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) {
- if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
- if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
- state->hook))
- return NF_DROP;
- else
- return NF_ACCEPT;
- }
- }
-
- return nf_nat_inet_fn(priv, skb, state);
-}
-EXPORT_SYMBOL_GPL(nf_nat_ipv4_fn);
-
-static unsigned int
-nf_nat_ipv4_in(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- unsigned int ret;
- __be32 daddr = ip_hdr(skb)->daddr;
-
- ret = nf_nat_ipv4_fn(priv, skb, state);
- if (ret != NF_DROP && ret != NF_STOLEN &&
- daddr != ip_hdr(skb)->daddr)
- skb_dst_drop(skb);
-
- return ret;
-}
-
-static unsigned int
-nf_nat_ipv4_out(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
-#ifdef CONFIG_XFRM
- const struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- int err;
-#endif
- unsigned int ret;
-
- ret = nf_nat_ipv4_fn(priv, skb, state);
-#ifdef CONFIG_XFRM
- if (ret != NF_DROP && ret != NF_STOLEN &&
- !(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
- (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
- if ((ct->tuplehash[dir].tuple.src.u3.ip !=
- ct->tuplehash[!dir].tuple.dst.u3.ip) ||
- (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
- ct->tuplehash[dir].tuple.src.u.all !=
- ct->tuplehash[!dir].tuple.dst.u.all)) {
- err = nf_xfrm_me_harder(state->net, skb, AF_INET);
- if (err < 0)
- ret = NF_DROP_ERR(err);
- }
- }
-#endif
- return ret;
-}
-
-static unsigned int
-nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- const struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- unsigned int ret;
- int err;
-
- ret = nf_nat_ipv4_fn(priv, skb, state);
- if (ret != NF_DROP && ret != NF_STOLEN &&
- (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
- if (ct->tuplehash[dir].tuple.dst.u3.ip !=
- ct->tuplehash[!dir].tuple.src.u3.ip) {
- err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
- if (err < 0)
- ret = NF_DROP_ERR(err);
- }
-#ifdef CONFIG_XFRM
- else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
- ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP &&
- ct->tuplehash[dir].tuple.dst.u.all !=
- ct->tuplehash[!dir].tuple.src.u.all) {
- err = nf_xfrm_me_harder(state->net, skb, AF_INET);
- if (err < 0)
- ret = NF_DROP_ERR(err);
- }
-#endif
- }
- return ret;
-}
-
-static const struct nf_hook_ops nf_nat_ipv4_ops[] = {
- /* Before packet filtering, change destination */
- {
- .hook = nf_nat_ipv4_in,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_PRE_ROUTING,
- .priority = NF_IP_PRI_NAT_DST,
- },
- /* After packet filtering, change source */
- {
- .hook = nf_nat_ipv4_out,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_NAT_SRC,
- },
- /* Before packet filtering, change destination */
- {
- .hook = nf_nat_ipv4_local_fn,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_OUT,
- .priority = NF_IP_PRI_NAT_DST,
- },
- /* After packet filtering, change source */
- {
- .hook = nf_nat_ipv4_fn,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_NAT_SRC,
- },
-};
-
-int nf_nat_l3proto_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops)
-{
- return nf_nat_register_fn(net, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
-}
-EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_register_fn);
-
-void nf_nat_l3proto_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
-{
- nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv4_ops));
-}
-EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv4_unregister_fn);
-
-static int __init nf_nat_l3proto_ipv4_init(void)
-{
- int err;
-
- err = nf_nat_l4proto_register(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
- if (err < 0)
- goto err1;
- err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv4);
- if (err < 0)
- goto err2;
- return err;
-
-err2:
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
-err1:
- return err;
-}
-
-static void __exit nf_nat_l3proto_ipv4_exit(void)
-{
- nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv4);
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &nf_nat_l4proto_icmp);
-}
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("nf-nat-" __stringify(AF_INET));
-
-module_init(nf_nat_l3proto_ipv4_init);
-module_exit(nf_nat_l3proto_ipv4_exit);
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
deleted file mode 100644
index ad3aeff152ed..000000000000
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ /dev/null
@@ -1,158 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/atomic.h>
-#include <linux/inetdevice.h>
-#include <linux/ip.h>
-#include <linux/timer.h>
-#include <linux/netfilter.h>
-#include <net/protocol.h>
-#include <net/ip.h>
-#include <net/checksum.h>
-#include <net/route.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/ipv4/nf_nat_masquerade.h>
-
-unsigned int
-nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum,
- const struct nf_nat_range2 *range,
- const struct net_device *out)
-{
- struct nf_conn *ct;
- struct nf_conn_nat *nat;
- enum ip_conntrack_info ctinfo;
- struct nf_nat_range2 newrange;
- const struct rtable *rt;
- __be32 newsrc, nh;
-
- WARN_ON(hooknum != NF_INET_POST_ROUTING);
-
- ct = nf_ct_get(skb, &ctinfo);
-
- WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
- ctinfo == IP_CT_RELATED_REPLY)));
-
- /* Source address is 0.0.0.0 - locally generated packet that is
- * probably not supposed to be masqueraded.
- */
- if (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip == 0)
- return NF_ACCEPT;
-
- rt = skb_rtable(skb);
- nh = rt_nexthop(rt, ip_hdr(skb)->daddr);
- newsrc = inet_select_addr(out, nh, RT_SCOPE_UNIVERSE);
- if (!newsrc) {
- pr_info("%s ate my IP address\n", out->name);
- return NF_DROP;
- }
-
- nat = nf_ct_nat_ext_add(ct);
- if (nat)
- nat->masq_index = out->ifindex;
-
- /* Transfer from original range. */
- memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
- memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
- newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
- newrange.min_addr.ip = newsrc;
- newrange.max_addr.ip = newsrc;
- newrange.min_proto = range->min_proto;
- newrange.max_proto = range->max_proto;
-
- /* Hand modified range to generic setup. */
- return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
-}
-EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
-
-static int device_cmp(struct nf_conn *i, void *ifindex)
-{
- const struct nf_conn_nat *nat = nfct_nat(i);
-
- if (!nat)
- return 0;
- if (nf_ct_l3num(i) != NFPROTO_IPV4)
- return 0;
- return nat->masq_index == (int)(long)ifindex;
-}
-
-static int masq_device_event(struct notifier_block *this,
- unsigned long event,
- void *ptr)
-{
- const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct net *net = dev_net(dev);
-
- if (event == NETDEV_DOWN) {
- /* Device was downed. Search entire table for
- * conntracks which were associated with that device,
- * and forget them.
- */
- WARN_ON(dev->ifindex == 0);
-
- nf_ct_iterate_cleanup_net(net, device_cmp,
- (void *)(long)dev->ifindex, 0, 0);
- }
-
- return NOTIFY_DONE;
-}
-
-static int masq_inet_event(struct notifier_block *this,
- unsigned long event,
- void *ptr)
-{
- struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev;
- struct netdev_notifier_info info;
-
- /* The masq_dev_notifier will catch the case of the device going
- * down. So if the inetdev is dead and being destroyed we have
- * no work to do. Otherwise this is an individual address removal
- * and we have to perform the flush.
- */
- if (idev->dead)
- return NOTIFY_DONE;
-
- netdev_notifier_info_init(&info, idev->dev);
- return masq_device_event(this, event, &info);
-}
-
-static struct notifier_block masq_dev_notifier = {
- .notifier_call = masq_device_event,
-};
-
-static struct notifier_block masq_inet_notifier = {
- .notifier_call = masq_inet_event,
-};
-
-static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0);
-
-void nf_nat_masquerade_ipv4_register_notifier(void)
-{
- /* check if the notifier was already set */
- if (atomic_inc_return(&masquerade_notifier_refcount) > 1)
- return;
-
- /* Register for device down reports */
- register_netdevice_notifier(&masq_dev_notifier);
- /* Register IP address change reports */
- register_inetaddr_notifier(&masq_inet_notifier);
-}
-EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_register_notifier);
-
-void nf_nat_masquerade_ipv4_unregister_notifier(void)
-{
- /* check if the notifier still has clients */
- if (atomic_dec_return(&masquerade_notifier_refcount) > 0)
- return;
-
- unregister_netdevice_notifier(&masq_dev_notifier);
- unregister_inetaddr_notifier(&masq_inet_notifier);
-}
-EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier);
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 5d259a12e25f..fab357cc8559 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -1,8 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* nf_nat_pptp.c
*
* NAT support for PPTP (Point to Point Tunneling Protocol).
- * PPTP is a a protocol for creating virtual private networks.
+ * PPTP is a protocol for creating virtual private networks.
* It is a specification defined by Microsoft and some vendors
* working with Microsoft. PPTP is built on top of a modified
* version of the Internet Generic Routing Encapsulation Protocol.
@@ -37,7 +38,7 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
MODULE_DESCRIPTION("Netfilter NAT helper module for PPTP");
-MODULE_ALIAS("ip_nat_pptp");
+MODULE_ALIAS_NF_NAT_HELPER("pptp");
static void pptp_nat_expected(struct nf_conn *ct,
struct nf_conntrack_expect *exp)
@@ -165,9 +166,8 @@ pptp_outbound_pkt(struct sk_buff *skb,
break;
default:
pr_debug("unknown outbound packet 0x%04x:%s\n", msg,
- msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
- pptp_msg_name[0]);
- /* fall through */
+ pptp_msg_name(msg));
+ fallthrough;
case PPTP_SET_LINK_INFO:
/* only need to NAT in case PAC is behind NAT box */
case PPTP_START_SESSION_REQUEST:
@@ -267,10 +267,8 @@ pptp_inbound_pkt(struct sk_buff *skb,
pcid_off = offsetof(union pptp_ctrl_union, setlink.peersCallID);
break;
default:
- pr_debug("unknown inbound packet %s\n",
- msg <= PPTP_MSG_MAX ? pptp_msg_name[msg] :
- pptp_msg_name[0]);
- /* fall through */
+ pr_debug("unknown inbound packet %s\n", pptp_msg_name(msg));
+ fallthrough;
case PPTP_START_SESSION_REQUEST:
case PPTP_START_SESSION_REPLY:
case PPTP_STOP_SESSION_REQUEST:
@@ -297,30 +295,24 @@ pptp_inbound_pkt(struct sk_buff *skb,
return NF_ACCEPT;
}
+static const struct nf_nat_pptp_hook pptp_hooks = {
+ .outbound = pptp_outbound_pkt,
+ .inbound = pptp_inbound_pkt,
+ .exp_gre = pptp_exp_gre,
+ .expectfn = pptp_nat_expected,
+};
+
static int __init nf_nat_helper_pptp_init(void)
{
- nf_nat_need_gre();
-
- BUG_ON(nf_nat_pptp_hook_outbound != NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, pptp_outbound_pkt);
-
- BUG_ON(nf_nat_pptp_hook_inbound != NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, pptp_inbound_pkt);
-
- BUG_ON(nf_nat_pptp_hook_exp_gre != NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, pptp_exp_gre);
+ WARN_ON(nf_nat_pptp_hook != NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook, &pptp_hooks);
- BUG_ON(nf_nat_pptp_hook_expectfn != NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, pptp_nat_expected);
return 0;
}
static void __exit nf_nat_helper_pptp_fini(void)
{
- RCU_INIT_POINTER(nf_nat_pptp_hook_expectfn, NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_exp_gre, NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_inbound, NULL);
- RCU_INIT_POINTER(nf_nat_pptp_hook_outbound, NULL);
+ RCU_INIT_POINTER(nf_nat_pptp_hook, NULL);
synchronize_rcu();
}
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c
deleted file mode 100644
index 00fda6331ce5..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * nf_nat_proto_gre.c
- *
- * NAT protocol helper module for GRE.
- *
- * GRE is a generic encapsulation protocol, which is generally not very
- * suited for NAT, as it has no protocol-specific part as port numbers.
- *
- * It has an optional key field, which may help us distinguishing two
- * connections between the same two hosts.
- *
- * GRE is defined in RFC 1701 and RFC 1702, as well as RFC 2784
- *
- * PPTP is built on top of a modified version of GRE, and has a mandatory
- * field called "CallID", which serves us for the same purpose as the key
- * field in plain GRE.
- *
- * Documentation about PPTP can be found in RFC 2637
- *
- * (C) 2000-2005 by Harald Welte <laforge@gnumonks.org>
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- *
- * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- */
-
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-#include <linux/netfilter/nf_conntrack_proto_gre.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
-MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE");
-
-/* generate unique tuple ... */
-static void
-gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
- struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- static u_int16_t key;
- __be16 *keyptr;
- unsigned int min, i, range_size;
-
- /* If there is no master conntrack we are not PPTP,
- do not change tuples */
- if (!ct->master)
- return;
-
- if (maniptype == NF_NAT_MANIP_SRC)
- keyptr = &tuple->src.u.gre.key;
- else
- keyptr = &tuple->dst.u.gre.key;
-
- if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
- pr_debug("%p: NATing GRE PPTP\n", ct);
- min = 1;
- range_size = 0xffff;
- } else {
- min = ntohs(range->min_proto.gre.key);
- range_size = ntohs(range->max_proto.gre.key) - min + 1;
- }
-
- pr_debug("min = %u, range_size = %u\n", min, range_size);
-
- for (i = 0; ; ++key) {
- *keyptr = htons(min + key % range_size);
- if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
- return;
- }
-
- pr_debug("%p: no NAT mapping\n", ct);
- return;
-}
-
-/* manipulate a GRE packet according to maniptype */
-static bool
-gre_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- const struct gre_base_hdr *greh;
- struct pptp_gre_header *pgreh;
-
- /* pgreh includes two optional 32bit fields which are not required
- * to be there. That's where the magic '8' comes from */
- if (!skb_make_writable(skb, hdroff + sizeof(*pgreh) - 8))
- return false;
-
- greh = (void *)skb->data + hdroff;
- pgreh = (struct pptp_gre_header *)greh;
-
- /* we only have destination manip of a packet, since 'source key'
- * is not present in the packet itself */
- if (maniptype != NF_NAT_MANIP_DST)
- return true;
-
- switch (greh->flags & GRE_VERSION) {
- case GRE_VERSION_0:
- /* We do not currently NAT any GREv0 packets.
- * Try to behave like "nf_nat_proto_unknown" */
- break;
- case GRE_VERSION_1:
- pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key));
- pgreh->call_id = tuple->dst.u.gre.key;
- break;
- default:
- pr_debug("can't nat unknown GRE version\n");
- return false;
- }
- return true;
-}
-
-static const struct nf_nat_l4proto gre = {
- .l4proto = IPPROTO_GRE,
- .manip_pkt = gre_manip_pkt,
- .in_range = nf_nat_l4proto_in_range,
- .unique_tuple = gre_unique_tuple,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
-#endif
-};
-
-static int __init nf_nat_proto_gre_init(void)
-{
- return nf_nat_l4proto_register(NFPROTO_IPV4, &gre);
-}
-
-static void __exit nf_nat_proto_gre_fini(void)
-{
- nf_nat_l4proto_unregister(NFPROTO_IPV4, &gre);
-}
-
-module_init(nf_nat_proto_gre_init);
-module_exit(nf_nat_proto_gre_fini);
-
-void nf_nat_need_gre(void)
-{
- return;
-}
-EXPORT_SYMBOL_GPL(nf_nat_need_gre);
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c
deleted file mode 100644
index 6d7cf1d79baf..000000000000
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/export.h>
-#include <linux/ip.h>
-#include <linux/icmp.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-
-static bool
-icmp_in_range(const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype,
- const union nf_conntrack_man_proto *min,
- const union nf_conntrack_man_proto *max)
-{
- return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
- ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
-}
-
-static void
-icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
- struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- static u_int16_t id;
- unsigned int range_size;
- unsigned int i;
-
- range_size = ntohs(range->max_proto.icmp.id) -
- ntohs(range->min_proto.icmp.id) + 1;
- /* If no range specified... */
- if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
- range_size = 0xFFFF;
-
- for (i = 0; ; ++id) {
- tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
- (id % range_size));
- if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
- return;
- }
- return;
-}
-
-static bool
-icmp_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- struct icmphdr *hdr;
-
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
- return false;
-
- hdr = (struct icmphdr *)(skb->data + hdroff);
- inet_proto_csum_replace2(&hdr->checksum, skb,
- hdr->un.echo.id, tuple->src.u.icmp.id, false);
- hdr->un.echo.id = tuple->src.u.icmp.id;
- return true;
-}
-
-const struct nf_nat_l4proto nf_nat_l4proto_icmp = {
- .l4proto = IPPROTO_ICMP,
- .manip_pkt = icmp_manip_pkt,
- .in_range = icmp_in_range,
- .unique_tuple = icmp_unique_tuple,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
-#endif
-};
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.asn1 b/net/ipv4/netfilter/nf_nat_snmp_basic.asn1
index 24b73268f362..dc2cc5794160 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.asn1
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.asn1
@@ -1,3 +1,11 @@
+-- SPDX-License-Identifier: BSD-3-Clause
+--
+-- Copyright (C) 1990, 2002 IETF Trust and the persons identified as authors
+-- of the code
+--
+-- https://www.rfc-editor.org/rfc/rfc1157#section-4
+-- https://www.rfc-editor.org/rfc/rfc3416#section-3
+
Message ::=
SEQUENCE {
version
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
index ac110c1d55b5..717b726504fe 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic_main.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* nf_nat_snmp_basic.c
*
@@ -25,17 +26,6 @@
*
* Copyright (c) 2000 RP Internet (www.rpi.net.au).
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- *
* Author: James Morris <jmorris@intercode.com.au>
*
* Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net>
@@ -60,6 +50,7 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
MODULE_ALIAS("ip_nat_snmp_basic");
+MODULE_ALIAS_NFCT_HELPER("snmp_trap");
#define SNMP_PORT 161
#define SNMP_TRAP_PORT 162
@@ -104,6 +95,8 @@ static void fast_csum(struct snmp_ctx *ctx, unsigned char offset)
int snmp_version(void *context, size_t hdrlen, unsigned char tag,
const void *data, size_t datalen)
{
+ if (datalen != 1)
+ return -EINVAL;
if (*(unsigned char *)data > 1)
return -ENOTSUPP;
return 1;
@@ -113,8 +106,11 @@ int snmp_helper(void *context, size_t hdrlen, unsigned char tag,
const void *data, size_t datalen)
{
struct snmp_ctx *ctx = (struct snmp_ctx *)context;
- __be32 *pdata = (__be32 *)data;
+ __be32 *pdata;
+ if (datalen != 4)
+ return -EINVAL;
+ pdata = (__be32 *)data;
if (*pdata == ctx->from) {
pr_debug("%s: %pI4 to %pI4\n", __func__,
(void *)&ctx->from, (void *)&ctx->to);
@@ -190,7 +186,7 @@ static int help(struct sk_buff *skb, unsigned int protoff,
return NF_DROP;
}
- if (!skb_make_writable(skb, skb->len)) {
+ if (skb_ensure_writable(skb, skb->len)) {
nf_ct_helper_log(skb, ct, "cannot mangle packet");
return NF_DROP;
}
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 5cd06ba3535d..fae4aa4a5f09 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -15,8 +12,167 @@
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
-const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
- struct tcphdr *_oth, int hook)
+static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int ttl);
+static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+ const struct tcphdr *oth);
+static const struct tcphdr *
+nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *_oth, int hook);
+
+static int nf_reject_iphdr_validate(struct sk_buff *skb)
+{
+ struct iphdr *iph;
+ u32 len;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ return 0;
+
+ iph = ip_hdr(skb);
+ if (iph->ihl < 5 || iph->version != 4)
+ return 0;
+
+ len = ntohs(iph->tot_len);
+ if (skb->len < len)
+ return 0;
+ else if (len < (iph->ihl*4))
+ return 0;
+
+ if (!pskb_may_pull(skb, iph->ihl*4))
+ return 0;
+
+ return 1;
+}
+
+struct sk_buff *nf_reject_skb_v4_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ const struct tcphdr *oth;
+ struct sk_buff *nskb;
+ struct iphdr *niph;
+ struct tcphdr _oth;
+
+ if (!nf_reject_iphdr_validate(oldskb))
+ return NULL;
+
+ oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
+ if (!oth)
+ return NULL;
+
+ nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) +
+ LL_MAX_HEADER, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+
+ nskb->dev = (struct net_device *)dev;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
+ READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
+ nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
+ niph->tot_len = htons(nskb->len);
+ ip_send_check(niph);
+
+ return nskb;
+}
+EXPORT_SYMBOL_GPL(nf_reject_skb_v4_tcp_reset);
+
+static bool nf_skb_is_icmp_unreach(const struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ u8 *tp, _type;
+ int thoff;
+
+ if (iph->protocol != IPPROTO_ICMP)
+ return false;
+
+ thoff = skb_network_offset(skb) + sizeof(*iph);
+
+ tp = skb_header_pointer(skb,
+ thoff + offsetof(struct icmphdr, type),
+ sizeof(_type), &_type);
+
+ if (!tp)
+ return false;
+
+ return *tp == ICMP_DEST_UNREACH;
+}
+
+struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+ struct iphdr *niph;
+ struct icmphdr *icmph;
+ unsigned int len;
+ int dataoff;
+ __wsum csum;
+ u8 proto;
+
+ if (!nf_reject_iphdr_validate(oldskb))
+ return NULL;
+
+ /* IP header checks: fragment. */
+ if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
+ return NULL;
+
+ /* don't reply to ICMP_DEST_UNREACH with ICMP_DEST_UNREACH. */
+ if (nf_skb_is_icmp_unreach(oldskb))
+ return NULL;
+
+ /* RFC says return as much as we can without exceeding 576 bytes. */
+ len = min_t(unsigned int, 536, oldskb->len);
+
+ if (!pskb_may_pull(oldskb, len))
+ return NULL;
+
+ if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len)))
+ return NULL;
+
+ dataoff = ip_hdrlen(oldskb);
+ proto = ip_hdr(oldskb)->protocol;
+
+ if (!skb_csum_unnecessary(oldskb) &&
+ nf_reject_verify_csum(oldskb, dataoff, proto) &&
+ nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto))
+ return NULL;
+
+ nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct icmphdr) +
+ LL_MAX_HEADER + len, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+
+ nskb->dev = (struct net_device *)dev;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP,
+ READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
+
+ skb_reset_transport_header(nskb);
+ icmph = skb_put_zero(nskb, sizeof(struct icmphdr));
+ icmph->type = ICMP_DEST_UNREACH;
+ icmph->code = code;
+
+ skb_put_data(nskb, skb_network_header(oldskb), len);
+
+ csum = csum_partial((void *)icmph, len + sizeof(struct icmphdr), 0);
+ icmph->checksum = csum_fold(csum);
+
+ niph->tot_len = htons(nskb->len);
+ ip_send_check(niph);
+
+ return nskb;
+}
+EXPORT_SYMBOL_GPL(nf_reject_skb_v4_unreach);
+
+static const struct tcphdr *
+nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *_oth, int hook)
{
const struct tcphdr *oth;
@@ -42,11 +198,10 @@ const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
return oth;
}
-EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get);
-struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
- const struct sk_buff *oldskb,
- __u8 protocol, int ttl)
+static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int ttl)
{
struct iphdr *niph, *oiph = ip_hdr(oldskb);
@@ -67,10 +222,9 @@ struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
return niph;
}
-EXPORT_SYMBOL_GPL(nf_reject_iphdr_put);
-void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
- const struct tcphdr *oth)
+static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+ const struct tcphdr *oth)
{
struct iphdr *niph = ip_hdr(nskb);
struct tcphdr *tcph;
@@ -97,20 +251,37 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
nskb->csum_start = (unsigned char *)tcph - nskb->head;
nskb->csum_offset = offsetof(struct tcphdr, check);
}
-EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
+
+static int nf_reject_fill_skb_dst(struct sk_buff *skb_in)
+{
+ struct dst_entry *dst = NULL;
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(struct flowi));
+ fl.u.ip4.daddr = ip_hdr(skb_in)->saddr;
+ nf_ip_route(dev_net(skb_in->dev), &dst, &fl, false);
+ if (!dst)
+ return -1;
+
+ skb_dst_set(skb_in, dst);
+ return 0;
+}
/* Send RST reply */
-void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
+void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
+ int hook)
{
- struct sk_buff *nskb;
- struct iphdr *niph;
const struct tcphdr *oth;
+ struct sk_buff *nskb;
struct tcphdr _oth;
oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook);
if (!oth)
return;
+ if (!skb_dst(oldskb) && nf_reject_fill_skb_dst(oldskb) < 0)
+ return;
+
if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
return;
@@ -125,20 +296,18 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
nskb->mark = IP4_REPLY_MARK(net, oldskb->mark);
skb_reserve(nskb, LL_MAX_HEADER);
- niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
- ip4_dst_hoplimit(skb_dst(nskb)));
+ nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
+ ip4_dst_hoplimit(skb_dst(nskb)));
nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
-
- if (ip_route_me_harder(net, nskb, RTN_UNSPEC))
+ if (ip_route_me_harder(net, sk, nskb, RTN_UNSPEC))
goto free_nskb;
- niph = ip_hdr(nskb);
-
/* "Never happens" */
if (nskb->len > dst_mtu(skb_dst(nskb)))
goto free_nskb;
nf_ct_attach(nskb, oldskb);
+ nf_ct_set_closing(skb_nfct(oldskb));
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/* If we use ip_local_out for bridged traffic, the MAC source on
@@ -147,10 +316,16 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
* build the eth header using the original destination's MAC as the
* source, and send the RST packet directly.
*/
- if (oldskb->nf_bridge) {
+ if (nf_bridge_info_exists(oldskb)) {
struct ethhdr *oeth = eth_hdr(oldskb);
+ struct iphdr *niph = ip_hdr(nskb);
+ struct net_device *br_indev;
- nskb->dev = nf_bridge_get_physindev(oldskb);
+ br_indev = nf_bridge_get_physindev(oldskb, net);
+ if (!br_indev)
+ goto free_nskb;
+
+ nskb->dev = br_indev;
niph->tot_len = htons(nskb->len);
ip_send_check(niph);
if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
@@ -171,24 +346,25 @@ EXPORT_SYMBOL_GPL(nf_send_reset);
void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
{
struct iphdr *iph = ip_hdr(skb_in);
- u8 proto;
+ int dataoff = ip_hdrlen(skb_in);
+ u8 proto = iph->protocol;
if (iph->frag_off & htons(IP_OFFSET))
return;
- if (skb_csum_unnecessary(skb_in)) {
+ if (!skb_dst(skb_in) && nf_reject_fill_skb_dst(skb_in) < 0)
+ return;
+
+ if (skb_csum_unnecessary(skb_in) ||
+ !nf_reject_verify_csum(skb_in, dataoff, proto)) {
icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
return;
}
- if (iph->protocol == IPPROTO_TCP || iph->protocol == IPPROTO_UDP)
- proto = iph->protocol;
- else
- proto = 0;
-
- if (nf_ip_checksum(skb_in, hook, ip_hdrlen(skb_in), proto) == 0)
+ if (nf_ip_checksum(skb_in, hook, dataoff, proto) == 0)
icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
}
EXPORT_SYMBOL_GPL(nf_send_unreach);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IPv4 packet rejection core");
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
index 4824b1e183a1..5080fa5fbf6a 100644
--- a/net/ipv4/netfilter/nf_socket_ipv4.c
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2007-2008 BalaBit IT Ltd.
* Author: Krisztian Kovacs
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -35,16 +31,8 @@ extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol,
if (icmph == NULL)
return 1;
- switch (icmph->type) {
- case ICMP_DEST_UNREACH:
- case ICMP_SOURCE_QUENCH:
- case ICMP_REDIRECT:
- case ICMP_TIME_EXCEEDED:
- case ICMP_PARAMETERPROB:
- break;
- default:
+ if (!icmp_is_err(icmph->type))
return 1;
- }
inside_iph = skb_header_pointer(skb, outside_hdrlen +
sizeof(struct icmphdr),
@@ -83,8 +71,7 @@ nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
{
switch (protocol) {
case IPPROTO_TCP:
- return inet_lookup(net, &tcp_hashinfo, skb, doff,
- saddr, sport, daddr, dport,
+ return inet_lookup(net, skb, doff, saddr, sport, daddr, dport,
in->ifindex);
case IPPROTO_UDP:
return udp4_lib_lookup(net, saddr, sport, daddr, dport,
@@ -96,11 +83,11 @@ nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
const struct net_device *indev)
{
- __be32 uninitialized_var(daddr), uninitialized_var(saddr);
- __be16 uninitialized_var(dport), uninitialized_var(sport);
+ __be32 daddr, saddr;
+ __be16 dport, sport;
const struct iphdr *iph = ip_hdr(skb);
struct sk_buff *data_skb = NULL;
- u8 uninitialized_var(protocol);
+ u8 protocol;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
enum ip_conntrack_info ctinfo;
struct nf_conn const *ct;
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index 164714104965..041c3f37f237 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2007-2008 BalaBit IT Ltd.
* Author: Krisztian Kovacs
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#include <net/netfilter/nf_tproxy.h>
@@ -42,7 +38,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
hp->source, lport ? lport : hp->dest,
skb->dev, NF_TPROXY_LOOKUP_LISTENER);
if (sk2) {
- inet_twsk_deschedule_put(inet_twsk(sk));
+ nf_tproxy_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
}
}
@@ -53,6 +49,7 @@ EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait4);
__be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
{
+ const struct in_ifaddr *ifa;
struct in_device *indev;
__be32 laddr;
@@ -61,10 +58,16 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
laddr = 0;
indev = __in_dev_get_rcu(skb->dev);
- for_primary_ifa(indev) {
+ if (!indev)
+ return daddr;
+
+ in_dev_for_each_ifa_rcu(ifa, indev) {
+ if (ifa->ifa_flags & IFA_F_SECONDARY)
+ continue;
+
laddr = ifa->ifa_local;
break;
- } endfor_ifa(indev);
+ }
return laddr ? laddr : daddr;
}
@@ -91,12 +94,10 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
switch (lookup_type) {
case NF_TPROXY_LOOKUP_LISTENER:
- sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
- ip_hdrlen(skb) +
- __tcp_hdrlen(hp),
- saddr, sport,
- daddr, dport,
- in->ifindex, 0);
+ sk = inet_lookup_listener(net, skb,
+ ip_hdrlen(skb) + __tcp_hdrlen(hp),
+ saddr, sport, daddr, dport,
+ in->ifindex, 0);
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
@@ -107,9 +108,8 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
*/
break;
case NF_TPROXY_LOOKUP_ESTABLISHED:
- sk = inet_lookup_established(net, &tcp_hashinfo,
- saddr, sport, daddr, dport,
- in->ifindex);
+ sk = inet_lookup_established(net, saddr, sport,
+ daddr, dport, in->ifindex);
break;
default:
BUG();
diff --git a/net/ipv4/netfilter/nft_chain_nat_ipv4.c b/net/ipv4/netfilter/nft_chain_nat_ipv4.c
deleted file mode 100644
index a3c4ea303e3e..000000000000
--- a/net/ipv4/netfilter/nft_chain_nat_ipv4.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
- * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
- * Copyright (c) 2012 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables_ipv4.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/ip.h>
-
-static unsigned int nft_nat_do_chain(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nft_pktinfo pkt;
-
- nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv4(&pkt, skb);
-
- return nft_do_chain(&pkt, priv);
-}
-
-static int nft_nat_ipv4_reg(struct net *net, const struct nf_hook_ops *ops)
-{
- return nf_nat_l3proto_ipv4_register_fn(net, ops);
-}
-
-static void nft_nat_ipv4_unreg(struct net *net, const struct nf_hook_ops *ops)
-{
- nf_nat_l3proto_ipv4_unregister_fn(net, ops);
-}
-
-static const struct nft_chain_type nft_chain_nat_ipv4 = {
- .name = "nat",
- .type = NFT_CHAIN_T_NAT,
- .family = NFPROTO_IPV4,
- .owner = THIS_MODULE,
- .hook_mask = (1 << NF_INET_PRE_ROUTING) |
- (1 << NF_INET_POST_ROUTING) |
- (1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_LOCAL_IN),
- .hooks = {
- [NF_INET_PRE_ROUTING] = nft_nat_do_chain,
- [NF_INET_POST_ROUTING] = nft_nat_do_chain,
- [NF_INET_LOCAL_OUT] = nft_nat_do_chain,
- [NF_INET_LOCAL_IN] = nft_nat_do_chain,
- },
- .ops_register = nft_nat_ipv4_reg,
- .ops_unregister = nft_nat_ipv4_unreg,
-};
-
-static int __init nft_chain_nat_init(void)
-{
- nft_register_chain_type(&nft_chain_nat_ipv4);
-
- return 0;
-}
-
-static void __exit nft_chain_nat_exit(void)
-{
- nft_unregister_chain_type(&nft_chain_nat_ipv4);
-}
-
-module_init(nft_chain_nat_init);
-module_exit(nft_chain_nat_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_CHAIN(AF_INET, "nat");
diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c b/net/ipv4/netfilter/nft_chain_route_ipv4.c
deleted file mode 100644
index 7d82934c46f4..000000000000
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables_ipv4.h>
-#include <net/route.h>
-#include <net/ip.h>
-
-static unsigned int nf_route_table_hook(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- unsigned int ret;
- struct nft_pktinfo pkt;
- u32 mark;
- __be32 saddr, daddr;
- u_int8_t tos;
- const struct iphdr *iph;
- int err;
-
- nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv4(&pkt, skb);
-
- mark = skb->mark;
- iph = ip_hdr(skb);
- saddr = iph->saddr;
- daddr = iph->daddr;
- tos = iph->tos;
-
- ret = nft_do_chain(&pkt, priv);
- if (ret != NF_DROP && ret != NF_STOLEN) {
- iph = ip_hdr(skb);
-
- if (iph->saddr != saddr ||
- iph->daddr != daddr ||
- skb->mark != mark ||
- iph->tos != tos) {
- err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
- if (err < 0)
- ret = NF_DROP_ERR(err);
- }
- }
- return ret;
-}
-
-static const struct nft_chain_type nft_chain_route_ipv4 = {
- .name = "route",
- .type = NFT_CHAIN_T_ROUTE,
- .family = NFPROTO_IPV4,
- .owner = THIS_MODULE,
- .hook_mask = (1 << NF_INET_LOCAL_OUT),
- .hooks = {
- [NF_INET_LOCAL_OUT] = nf_route_table_hook,
- },
-};
-
-static int __init nft_chain_route_init(void)
-{
- nft_register_chain_type(&nft_chain_route_ipv4);
-
- return 0;
-}
-
-static void __exit nft_chain_route_exit(void)
-{
- nft_unregister_chain_type(&nft_chain_route_ipv4);
-}
-
-module_init(nft_chain_route_init);
-module_exit(nft_chain_route_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_CHAIN(AF_INET, "route");
diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c
index 0af3d8df70dd..ef5dd88107dd 100644
--- a/net/ipv4/netfilter/nft_dup_ipv4.c
+++ b/net/ipv4/netfilter/nft_dup_ipv4.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -16,8 +13,8 @@
#include <net/netfilter/ipv4/nf_dup_ipv4.h>
struct nft_dup_ipv4 {
- enum nft_registers sreg_addr:8;
- enum nft_registers sreg_dev:8;
+ u8 sreg_addr;
+ u8 sreg_dev;
};
static void nft_dup_ipv4_eval(const struct nft_expr *expr,
@@ -43,19 +40,20 @@ static int nft_dup_ipv4_init(const struct nft_ctx *ctx,
if (tb[NFTA_DUP_SREG_ADDR] == NULL)
return -EINVAL;
- priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]);
- err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in_addr));
+ err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr,
+ sizeof(struct in_addr));
if (err < 0)
return err;
- if (tb[NFTA_DUP_SREG_DEV] != NULL) {
- priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]);
- return nft_validate_register_load(priv->sreg_dev, sizeof(int));
- }
- return 0;
+ if (tb[NFTA_DUP_SREG_DEV])
+ err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV],
+ &priv->sreg_dev, sizeof(int));
+
+ return err;
}
-static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_dup_ipv4_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_dup_ipv4 *priv = nft_expr_priv(expr);
@@ -78,6 +76,7 @@ static const struct nft_expr_ops nft_dup_ipv4_ops = {
.eval = nft_dup_ipv4_eval,
.init = nft_dup_ipv4_init,
.dump = nft_dup_ipv4_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nla_policy nft_dup_ipv4_policy[NFTA_DUP_MAX + 1] = {
@@ -110,3 +109,4 @@ module_exit(nft_dup_ipv4_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "dup");
+MODULE_DESCRIPTION("IPv4 nftables packet duplication support");
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index e50976e3c213..82af6cd76d13 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -1,8 +1,4 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/init.h>
@@ -14,6 +10,8 @@
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nft_fib.h>
+#include <net/flow.h>
+#include <net/ip.h>
#include <net/ip_fib.h>
#include <net/route.h>
@@ -26,8 +24,6 @@ static __be32 get_saddr(__be32 addr)
return addr;
}
-#define DSCP_BITS 0xfc
-
void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
@@ -54,14 +50,14 @@ void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
else
addr = iph->saddr;
- *dst = inet_dev_addr_type(nft_net(pkt), dev, addr);
-}
-EXPORT_SYMBOL_GPL(nft_fib4_eval_type);
+ if (priv->flags & (NFTA_FIB_F_IIF | NFTA_FIB_F_OIF)) {
+ *dst = inet_dev_addr_type(nft_net(pkt), dev, addr);
+ return;
+ }
-static int get_ifindex(const struct net_device *dev)
-{
- return dev ? dev->ifindex : 0;
+ *dst = inet_addr_type_dev_table(nft_net(pkt), pkt->skb->dev, addr);
}
+EXPORT_SYMBOL_GPL(nft_fib4_eval_type);
void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -74,12 +70,16 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
struct flowi4 fl4 = {
.flowi4_scope = RT_SCOPE_UNIVERSE,
.flowi4_iif = LOOPBACK_IFINDEX,
+ .flowi4_proto = pkt->tprot,
+ .flowi4_uid = sock_net_uid(nft_net(pkt), NULL),
};
const struct net_device *oif;
- struct net_device *found;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- int i;
-#endif
+ const struct net_device *found;
+
+ if (nft_fib_can_skip(pkt)) {
+ nft_fib_store_result(dest, priv, nft_in(pkt));
+ return;
+ }
/*
* Do not set flowi4_oif, it restricts results (for example, asking
@@ -95,12 +95,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
else
oif = NULL;
- if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
- nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
- nft_fib_store_result(dest, priv, pkt,
- nft_in(pkt)->ifindex);
- return;
- }
+ fl4.flowi4_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, oif);
iph = skb_header_pointer(pkt->skb, noff, sizeof(_iph), &_iph);
if (!iph) {
@@ -111,8 +106,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
if (ipv4_is_zeronet(iph->saddr)) {
if (ipv4_is_lbcast(iph->daddr) ||
ipv4_is_local_multicast(iph->daddr)) {
- nft_fib_store_result(dest, priv, pkt,
- get_ifindex(pkt->skb->dev));
+ nft_fib_store_result(dest, priv, pkt->skb->dev);
return;
}
}
@@ -120,12 +114,16 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
if (priv->flags & NFTA_FIB_F_MARK)
fl4.flowi4_mark = pkt->skb->mark;
- fl4.flowi4_tos = iph->tos & DSCP_BITS;
+ fl4.flowi4_dscp = ip4h_dscp(iph);
if (priv->flags & NFTA_FIB_F_DADDR) {
fl4.daddr = iph->daddr;
fl4.saddr = get_saddr(iph->saddr);
} else {
+ if (nft_hook(pkt) == NF_INET_FORWARD &&
+ priv->flags & NFTA_FIB_F_IIF)
+ fl4.flowi4_iif = nft_out(pkt)->ifindex;
+
fl4.daddr = iph->saddr;
fl4.saddr = get_saddr(iph->daddr);
}
@@ -144,38 +142,15 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
break;
}
- if (!oif) {
- found = FIB_RES_DEV(res);
- goto ok;
- }
-
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- for (i = 0; i < res.fi->fib_nhs; i++) {
- struct fib_nh *nh = &res.fi->fib_nh[i];
-
- if (nh->nh_dev == oif) {
- found = nh->nh_dev;
- goto ok;
- }
- }
- return;
-#else
- found = FIB_RES_DEV(res);
- if (found != oif)
- return;
-#endif
-ok:
- switch (priv->result) {
- case NFT_FIB_RESULT_OIF:
- *dest = found->ifindex;
- break;
- case NFT_FIB_RESULT_OIFNAME:
- strncpy((char *)dest, found->name, IFNAMSIZ);
- break;
- default:
- WARN_ON_ONCE(1);
- break;
+ if (!oif) {
+ found = FIB_RES_DEV(res);
+ } else {
+ if (!fib_info_nh_uses_dev(res.fi, oif))
+ return;
+ found = oif;
}
+
+ nft_fib_store_result(dest, priv, found);
}
EXPORT_SYMBOL_GPL(nft_fib4_eval);
@@ -188,6 +163,7 @@ static const struct nft_expr_ops nft_fib4_type_ops = {
.init = nft_fib_init,
.dump = nft_fib_dump,
.validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
};
static const struct nft_expr_ops nft_fib4_ops = {
@@ -197,6 +173,7 @@ static const struct nft_expr_ops nft_fib4_ops = {
.init = nft_fib_init,
.dump = nft_fib_dump,
.validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
};
static const struct nft_expr_ops *
@@ -246,3 +223,4 @@ module_exit(nft_fib4_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
MODULE_ALIAS_NFT_AF_EXPR(2, "fib");
+MODULE_DESCRIPTION("nftables fib / ip route lookup support");
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
deleted file mode 100644
index f1193e1e928a..000000000000
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nft_masq.h>
-#include <net/netfilter/ipv4/nf_nat_masquerade.h>
-
-static void nft_masq_ipv4_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- struct nft_masq *priv = nft_expr_priv(expr);
- struct nf_nat_range2 range;
-
- memset(&range, 0, sizeof(range));
- range.flags = priv->flags;
- if (priv->sreg_proto_min) {
- range.min_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_min]);
- range.max_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_max]);
- }
- regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, nft_hook(pkt),
- &range, nft_out(pkt));
-}
-
-static void
-nft_masq_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
-{
- nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
-}
-
-static struct nft_expr_type nft_masq_ipv4_type;
-static const struct nft_expr_ops nft_masq_ipv4_ops = {
- .type = &nft_masq_ipv4_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
- .eval = nft_masq_ipv4_eval,
- .init = nft_masq_init,
- .destroy = nft_masq_ipv4_destroy,
- .dump = nft_masq_dump,
- .validate = nft_masq_validate,
-};
-
-static struct nft_expr_type nft_masq_ipv4_type __read_mostly = {
- .family = NFPROTO_IPV4,
- .name = "masq",
- .ops = &nft_masq_ipv4_ops,
- .policy = nft_masq_policy,
- .maxattr = NFTA_MASQ_MAX,
- .owner = THIS_MODULE,
-};
-
-static int __init nft_masq_ipv4_module_init(void)
-{
- int ret;
-
- ret = nft_register_expr(&nft_masq_ipv4_type);
- if (ret < 0)
- return ret;
-
- nf_nat_masquerade_ipv4_register_notifier();
-
- return ret;
-}
-
-static void __exit nft_masq_ipv4_module_exit(void)
-{
- nft_unregister_expr(&nft_masq_ipv4_type);
- nf_nat_masquerade_ipv4_unregister_notifier();
-}
-
-module_init(nft_masq_ipv4_module_init);
-module_exit(nft_masq_ipv4_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org");
-MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "masq");
diff --git a/net/ipv4/netfilter/nft_redir_ipv4.c b/net/ipv4/netfilter/nft_redir_ipv4.c
deleted file mode 100644
index 5120be1d3118..000000000000
--- a/net/ipv4/netfilter/nft_redir_ipv4.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_redirect.h>
-#include <net/netfilter/nft_redir.h>
-
-static void nft_redir_ipv4_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- struct nft_redir *priv = nft_expr_priv(expr);
- struct nf_nat_ipv4_multi_range_compat mr;
-
- memset(&mr, 0, sizeof(mr));
- if (priv->sreg_proto_min) {
- mr.range[0].min.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_min]);
- mr.range[0].max.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_max]);
- mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
- }
-
- mr.range[0].flags |= priv->flags;
-
- regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt));
-}
-
-static void
-nft_redir_ipv4_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
-{
- nf_ct_netns_put(ctx->net, NFPROTO_IPV4);
-}
-
-static struct nft_expr_type nft_redir_ipv4_type;
-static const struct nft_expr_ops nft_redir_ipv4_ops = {
- .type = &nft_redir_ipv4_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
- .eval = nft_redir_ipv4_eval,
- .init = nft_redir_init,
- .destroy = nft_redir_ipv4_destroy,
- .dump = nft_redir_dump,
- .validate = nft_redir_validate,
-};
-
-static struct nft_expr_type nft_redir_ipv4_type __read_mostly = {
- .family = NFPROTO_IPV4,
- .name = "redir",
- .ops = &nft_redir_ipv4_ops,
- .policy = nft_redir_policy,
- .maxattr = NFTA_REDIR_MAX,
- .owner = THIS_MODULE,
-};
-
-static int __init nft_redir_ipv4_module_init(void)
-{
- return nft_register_expr(&nft_redir_ipv4_type);
-}
-
-static void __exit nft_redir_ipv4_module_exit(void)
-{
- nft_unregister_expr(&nft_redir_ipv4_type);
-}
-
-module_init(nft_redir_ipv4_module_init);
-module_exit(nft_redir_ipv4_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
-MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "redir");
diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c
index 517ce93699de..6cb213bb7256 100644
--- a/net/ipv4/netfilter/nft_reject_ipv4.c
+++ b/net/ipv4/netfilter/nft_reject_ipv4.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
* Copyright (c) 2013 Eric Leblond <eric@regit.org>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
@@ -30,7 +27,8 @@ static void nft_reject_ipv4_eval(const struct nft_expr *expr,
nf_send_unreach(pkt->skb, priv->icmp_code, nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset(nft_net(pkt), pkt->skb, nft_hook(pkt));
+ nf_send_reset(nft_net(pkt), nft_sk(pkt), pkt->skb,
+ nft_hook(pkt));
break;
default:
break;
@@ -47,6 +45,7 @@ static const struct nft_expr_ops nft_reject_ipv4_ops = {
.init = nft_reject_init,
.dump = nft_reject_dump,
.validate = nft_reject_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_reject_ipv4_type __read_mostly = {
@@ -74,3 +73,4 @@ module_exit(nft_reject_ipv4_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject");
+MODULE_DESCRIPTION("IPv4 packet rejection for nftables");
diff --git a/net/ipv4/netlink.c b/net/ipv4/netlink.c
index f86bb4f06609..b920e1bdcf58 100644
--- a/net/ipv4/netlink.c
+++ b/net/ipv4/netlink.c
@@ -1,11 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/types.h>
#include <net/net_namespace.h>
#include <net/netlink.h>
+#include <linux/in6.h>
#include <net/ip.h>
-int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto,
+int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family,
struct netlink_ext_ack *extack)
{
*ip_proto = nla_get_u8(attr);
@@ -13,11 +15,19 @@ int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto,
switch (*ip_proto) {
case IPPROTO_TCP:
case IPPROTO_UDP:
+ return 0;
case IPPROTO_ICMP:
+ if (family != AF_INET)
+ break;
+ return 0;
+#if IS_ENABLED(CONFIG_IPV6)
+ case IPPROTO_ICMPV6:
+ if (family != AF_INET6)
+ break;
return 0;
- default:
- NL_SET_ERR_MSG(extack, "Unsupported ip proto");
- return -EOPNOTSUPP;
+#endif
}
+ NL_SET_ERR_MSG(extack, "Unsupported ip proto");
+ return -EOPNOTSUPP;
}
EXPORT_SYMBOL_GPL(rtm_getroute_parse_ip_proto);
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
new file mode 100644
index 000000000000..7b9d70f9b31c
--- /dev/null
+++ b/net/ipv4/nexthop.c
@@ -0,0 +1,4157 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Generic nexthop implementation
+ *
+ * Copyright (c) 2017-19 Cumulus Networks
+ * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
+ */
+
+#include <linux/nexthop.h>
+#include <linux/rtnetlink.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <net/arp.h>
+#include <net/ipv6_stubs.h>
+#include <net/lwtunnel.h>
+#include <net/ndisc.h>
+#include <net/nexthop.h>
+#include <net/route.h>
+#include <net/sock.h>
+
+#define NH_RES_DEFAULT_IDLE_TIMER (120 * HZ)
+#define NH_RES_DEFAULT_UNBALANCED_TIMER 0 /* No forced rebalancing. */
+
+static void remove_nexthop(struct net *net, struct nexthop *nh,
+ struct nl_info *nlinfo);
+
+#define NH_DEV_HASHBITS 8
+#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
+
+#define NHA_OP_FLAGS_DUMP_ALL (NHA_OP_FLAG_DUMP_STATS | \
+ NHA_OP_FLAG_DUMP_HW_STATS)
+
+/* Netlink attribute validation tables. Each table maps an NHA_* attribute to
+ * the netlink type it must carry. The code that consumes each table is not in
+ * this part of the file; going by the names, there is one table per request
+ * kind (new / get / del / dump, plus bucket variants) -- NOTE(review): confirm
+ * against the rtnetlink handlers.
+ */
+
+/* Attributes accepted for "new". NHA_HW_STATS_ENABLE is a u32 whose maximum
+ * allowed value is 'true' (1), i.e. it acts as a 0/1 switch.
+ */
+static const struct nla_policy rtm_nh_policy_new[] = {
+ [NHA_ID] = { .type = NLA_U32 },
+ [NHA_GROUP] = { .type = NLA_BINARY },
+ [NHA_GROUP_TYPE] = { .type = NLA_U16 },
+ [NHA_BLACKHOLE] = { .type = NLA_FLAG },
+ [NHA_OIF] = { .type = NLA_U32 },
+ [NHA_GATEWAY] = { .type = NLA_BINARY },
+ [NHA_ENCAP_TYPE] = { .type = NLA_U16 },
+ [NHA_ENCAP] = { .type = NLA_NESTED },
+ [NHA_FDB] = { .type = NLA_FLAG },
+ [NHA_RES_GROUP] = { .type = NLA_NESTED },
+ [NHA_HW_STATS_ENABLE] = NLA_POLICY_MAX(NLA_U32, true),
+};
+
+/* Attributes accepted for "get". NHA_OP_FLAGS may only contain bits that are
+ * part of NHA_OP_FLAGS_DUMP_ALL.
+ */
+static const struct nla_policy rtm_nh_policy_get[] = {
+ [NHA_ID] = { .type = NLA_U32 },
+ [NHA_OP_FLAGS] = NLA_POLICY_MASK(NLA_U32,
+ NHA_OP_FLAGS_DUMP_ALL),
+};
+
+/* Attributes accepted for "del": only the nexthop id. */
+static const struct nla_policy rtm_nh_policy_del[] = {
+ [NHA_ID] = { .type = NLA_U32 },
+};
+
+/* Attributes accepted for "dump"; NHA_OP_FLAGS is restricted to the bits in
+ * NHA_OP_FLAGS_DUMP_ALL, as in the "get" table.
+ */
+static const struct nla_policy rtm_nh_policy_dump[] = {
+ [NHA_OIF] = { .type = NLA_U32 },
+ [NHA_GROUPS] = { .type = NLA_FLAG },
+ [NHA_MASTER] = { .type = NLA_U32 },
+ [NHA_FDB] = { .type = NLA_FLAG },
+ [NHA_OP_FLAGS] = NLA_POLICY_MASK(NLA_U32,
+ NHA_OP_FLAGS_DUMP_ALL),
+};
+
+/* NHA_RES_GROUP_* attributes; presumably the contents of the nested
+ * NHA_RES_GROUP attribute listed in rtm_nh_policy_new -- TODO confirm.
+ */
+static const struct nla_policy rtm_nh_res_policy_new[] = {
+ [NHA_RES_GROUP_BUCKETS] = { .type = NLA_U16 },
+ [NHA_RES_GROUP_IDLE_TIMER] = { .type = NLA_U32 },
+ [NHA_RES_GROUP_UNBALANCED_TIMER] = { .type = NLA_U32 },
+};
+
+/* Attributes accepted for "dump bucket". */
+static const struct nla_policy rtm_nh_policy_dump_bucket[] = {
+ [NHA_ID] = { .type = NLA_U32 },
+ [NHA_OIF] = { .type = NLA_U32 },
+ [NHA_MASTER] = { .type = NLA_U32 },
+ [NHA_RES_BUCKET] = { .type = NLA_NESTED },
+};
+
+/* NHA_RES_BUCKET_* attributes for bucket dumps; presumably the contents of
+ * the nested NHA_RES_BUCKET attribute above -- TODO confirm.
+ */
+static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = {
+ [NHA_RES_BUCKET_NH_ID] = { .type = NLA_U32 },
+};
+
+/* Attributes accepted for "get bucket". */
+static const struct nla_policy rtm_nh_policy_get_bucket[] = {
+ [NHA_ID] = { .type = NLA_U32 },
+ [NHA_RES_BUCKET] = { .type = NLA_NESTED },
+};
+
+/* NHA_RES_BUCKET_* attributes for a single-bucket get: the bucket index. */
+static const struct nla_policy rtm_nh_res_bucket_policy_get[] = {
+ [NHA_RES_BUCKET_INDEX] = { .type = NLA_U16 },
+};
+
+/* Return true when the nexthop notifier chain of @net has no entries, i.e.
+ * its head pointer is NULL.
+ */
+static bool nexthop_notifiers_is_empty(struct net *net)
+{
+ return !net->nexthop.notifier_chain.head;
+}
+
+/* Fill the notifier description @nh_info of one nexthop from @nhi: device,
+ * gateway family and address, id of the parent nexthop, and the reject / FDB /
+ * encapsulation flags. Nothing is allocated here.
+ */
+static void
+__nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
+ const struct nh_info *nhi)
+{
+ nh_info->dev = nhi->fib_nhc.nhc_dev;
+ nh_info->gw_family = nhi->fib_nhc.nhc_gw_family;
+ /* Copy only the gateway address matching the family; for any other
+  * family value neither address field is written.
+  */
+ if (nh_info->gw_family == AF_INET)
+ nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4;
+ else if (nh_info->gw_family == AF_INET6)
+ nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6;
+
+ nh_info->id = nhi->nh_parent->id;
+ nh_info->is_reject = nhi->reject_nh;
+ nh_info->is_fdb = nhi->fdb_nh;
+ /* Encapsulation is reported whenever a lwtstate pointer is attached. */
+ nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate;
+}
+
+/* Prepare @info for notifying about the single (non-group) nexthop @nh.
+ *
+ * Marks @info as NH_NOTIFIER_INFO_TYPE_SINGLE, allocates a zeroed info->nh
+ * and fills it from the nexthop's nh_info. nh->nh_info is read with
+ * rtnl_dereference(), so this is expected to run with RTNL held.
+ *
+ * Returns 0 on success or -ENOMEM when the allocation fails (info->type has
+ * already been set at that point). info->nh is released by
+ * nh_notifier_single_info_fini().
+ */
+static int nh_notifier_single_info_init(struct nh_notifier_info *info,
+ const struct nexthop *nh)
+{
+ struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+
+ info->type = NH_NOTIFIER_INFO_TYPE_SINGLE;
+ info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL);
+ if (!info->nh)
+ return -ENOMEM;
+
+ __nh_notifier_single_info_init(info->nh, nhi);
+
+ return 0;
+}
+
+/* Free info->nh, the buffer allocated by nh_notifier_single_info_init(). */
+static void nh_notifier_single_info_fini(struct nh_notifier_info *info)
+{
+ kfree(info->nh);
+}
+
+/* Prepare @info for notifying about the nexthop group @nhg as a list of
+ * weighted members (NH_NOTIFIER_INFO_TYPE_GRP).
+ *
+ * Allocates a zeroed info->nh_grp sized for nhg->num_nh entries, copies the
+ * group-level num_nh / is_fdb / hw_stats values, then records each member's
+ * weight and single-nexthop description. Member nh_info pointers are read
+ * with rtnl_dereference(), so this is expected to run with RTNL held.
+ *
+ * Returns 0 on success or -ENOMEM when the allocation fails.
+ */
+static int nh_notifier_mpath_info_init(struct nh_notifier_info *info,
+ struct nh_group *nhg)
+{
+ u16 num_nh = nhg->num_nh;
+ int i;
+
+ info->type = NH_NOTIFIER_INFO_TYPE_GRP;
+ /* struct_size() covers the header plus num_nh trailing entries. */
+ info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh),
+ GFP_KERNEL);
+ if (!info->nh_grp)
+ return -ENOMEM;
+
+ info->nh_grp->num_nh = num_nh;
+ info->nh_grp->is_fdb = nhg->fdb_nh;
+ info->nh_grp->hw_stats = nhg->hw_stats;
+
+ for (i = 0; i < num_nh; i++) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+ struct nh_info *nhi;
+
+ nhi = rtnl_dereference(nhge->nh->nh_info);
+ info->nh_grp->nh_entries[i].weight = nhge->weight;
+ __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh,
+ nhi);
+ }
+
+ return 0;
+}
+
+/* Prepare @info for notifying about the resilient table of group @nhg
+ * (NH_NOTIFIER_INFO_TYPE_RES_TABLE).
+ *
+ * Allocates a zeroed info->nh_res_table with one slot per bucket of the
+ * group's resilient table, copies num_nh_buckets and hw_stats, and fills each
+ * slot with the description of the nexthop currently assigned to that bucket.
+ * The table, bucket entries and nh_info pointers are all read with
+ * rtnl_dereference(), so this is expected to run with RTNL held.
+ *
+ * Returns 0 on success or -ENOMEM when the allocation fails.
+ */
+static int nh_notifier_res_table_info_init(struct nh_notifier_info *info,
+ struct nh_group *nhg)
+{
+ struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
+ u16 num_nh_buckets = res_table->num_nh_buckets;
+ unsigned long size;
+ u16 i;
+
+ info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE;
+ size = struct_size(info->nh_res_table, nhs, num_nh_buckets);
+ /* vmalloc-backed, unlike the kzalloc used for the other info types;
+  * presumably because the bucket array can be large -- TODO confirm.
+  * __GFP_NOWARN is passed, so failure is only reported via -ENOMEM.
+  */
+ info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO |
+ __GFP_NOWARN);
+ if (!info->nh_res_table)
+ return -ENOMEM;
+
+ info->nh_res_table->num_nh_buckets = num_nh_buckets;
+ info->nh_res_table->hw_stats = nhg->hw_stats;
+
+ for (i = 0; i < num_nh_buckets; i++) {
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+ struct nh_grp_entry *nhge;
+ struct nh_info *nhi;
+
+ nhge = rtnl_dereference(bucket->nh_entry);
+ nhi = rtnl_dereference(nhge->nh->nh_info);
+ __nh_notifier_single_info_init(&info->nh_res_table->nhs[i],
+ nhi);
+ }
+
+ return 0;
+}
+
+/* Prepare @info for the nexthop group @nh, choosing the payload layout by
+ * group kind: member list for hash-threshold groups, bucket table for
+ * resilient groups. Returns the helper's result, or -EINVAL when the group
+ * has neither flag set.
+ */
+static int nh_notifier_grp_info_init(struct nh_notifier_info *info,
+ const struct nexthop *nh)
+{
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+
+ if (nhg->hash_threshold)
+ return nh_notifier_mpath_info_init(info, nhg);
+ else if (nhg->resilient)
+ return nh_notifier_res_table_info_init(info, nhg);
+ return -EINVAL;
+}
+
+/* Release the payload set up by nh_notifier_grp_info_init() for group @nh.
+ * The free routine mirrors the allocator used for each group kind: kfree()
+ * for the kzalloc'ed member list, vfree() for the __vmalloc'ed bucket table.
+ * Nothing is freed when the group has neither flag set.
+ */
+static void nh_notifier_grp_info_fini(struct nh_notifier_info *info,
+ const struct nexthop *nh)
+{
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+
+ if (nhg->hash_threshold)
+ kfree(info->nh_grp);
+ else if (nhg->resilient)
+ vfree(info->nh_res_table);
+}
+
+/* Record the id of @nh in @info and build the type-specific payload: group
+ * payload when nh->is_group is set, single-nexthop payload otherwise.
+ * Returns 0 on success or the negative errno from the chosen helper.
+ */
+static int nh_notifier_info_init(struct nh_notifier_info *info,
+ const struct nexthop *nh)
+{
+ info->id = nh->id;
+
+ if (nh->is_group)
+ return nh_notifier_grp_info_init(info, nh);
+ else
+ return nh_notifier_single_info_init(info, nh);
+}
+
+/* Release the payload built by nh_notifier_info_init() for @nh, using the
+ * same group / single split.
+ */
+static void nh_notifier_info_fini(struct nh_notifier_info *info,
+ const struct nexthop *nh)
+{
+ if (nh->is_group)
+ nh_notifier_grp_info_fini(info, nh);
+ else
+ nh_notifier_single_info_fini(info);
+}
+
+/* Run the nexthop notifier chain of @net for @event_type on nexthop @nh.
+ *
+ * Must be called with RTNL held (asserted). When the chain is empty this
+ * returns 0 without building any payload. Otherwise a temporary
+ * nh_notifier_info is built, passed to the chain, and released again before
+ * returning.
+ *
+ * Returns 0 on success, the negative errno from building the payload (with
+ * an extack message set), or the chain's result converted by
+ * notifier_to_errno().
+ */
+static int call_nexthop_notifiers(struct net *net,
+ enum nexthop_event_type event_type,
+ struct nexthop *nh,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_notifier_info info = {
+ .net = net,
+ .extack = extack,
+ };
+ int err;
+
+ ASSERT_RTNL();
+
+ if (nexthop_notifiers_is_empty(net))
+ return 0;
+
+ err = nh_notifier_info_init(&info, nh);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
+ return err;
+ }
+
+ err = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+ event_type, &info);
+ /* The payload is only needed for the duration of the chain call. */
+ nh_notifier_info_fini(&info, nh);
+
+ return notifier_to_errno(err);
+}
+
+/* Report through @p_idle_timer_ms the idle timer, in milliseconds, that
+ * listeners should be told about for a bucket replacement in the group
+ * identified by info->id.
+ *
+ * With @force set the value is 0 and no lookup is done. Otherwise the group
+ * is looked up by id in info->net and its resilient table's idle_timer is
+ * converted from jiffies.
+ *
+ * Returns 0 on success, or -EINVAL when no nexthop with that id is found (in
+ * which case *p_idle_timer_ms is left unwritten).
+ */
+static int
+nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info,
+ bool force, unsigned int *p_idle_timer_ms)
+{
+ struct nh_res_table *res_table;
+ struct nh_group *nhg;
+ struct nexthop *nh;
+ int err = 0;
+
+ /* When 'force' is false, nexthop bucket replacement is performed
+ * because the bucket was deemed to be idle. In this case, capable
+ * listeners can choose to perform an atomic replacement: The bucket is
+ * only replaced if it is inactive. However, if the idle timer interval
+ * is smaller than the interval in which a listener is querying
+ * buckets' activity from the device, then atomic replacement should
+ * not be tried. Pass the idle timer value to listeners, so that they
+ * could determine which type of replacement to perform.
+ */
+ if (force) {
+ *p_idle_timer_ms = 0;
+ return 0;
+ }
+
+ /* The lookup and both dereferences below happen inside one RCU
+  * read-side section.
+  */
+ rcu_read_lock();
+
+ nh = nexthop_find_by_id(info->net, info->id);
+ if (!nh) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ nhg = rcu_dereference(nh->nh_grp);
+ res_table = rcu_dereference(nhg->res_table);
+ *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer);
+
+out:
+ rcu_read_unlock();
+
+ return err;
+}
+
+/* Prepare @info for a NH_NOTIFIER_INFO_TYPE_RES_BUCKET notification about
+ * bucket @bucket_index changing from nexthop @oldi to nexthop @newi.
+ *
+ * The idle timer value is obtained first; if that fails its error is
+ * returned and nothing is allocated. Then a zeroed info->nh_res_bucket is
+ * allocated and filled with the bucket index, idle timer, @force flag and
+ * the descriptions of the old and new nexthops.
+ *
+ * Returns 0 on success, the idle-timer lookup error, or -ENOMEM.
+ * info->nh_res_bucket is released by nh_notifier_res_bucket_info_fini().
+ */
+static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info,
+ u16 bucket_index, bool force,
+ struct nh_info *oldi,
+ struct nh_info *newi)
+{
+ unsigned int idle_timer_ms;
+ int err;
+
+ err = nh_notifier_res_bucket_idle_timer_get(info, force,
+ &idle_timer_ms);
+ if (err)
+ return err;
+
+ info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET;
+ info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket),
+ GFP_KERNEL);
+ if (!info->nh_res_bucket)
+ return -ENOMEM;
+
+ info->nh_res_bucket->bucket_index = bucket_index;
+ info->nh_res_bucket->idle_timer_ms = idle_timer_ms;
+ info->nh_res_bucket->force = force;
+ __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi);
+ __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi);
+ return 0;
+}
+
+/* Free the bucket payload that nh_notifier_res_bucket_info_init() allocated
+ * in @info->nh_res_bucket.
+ */
+static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info)
+{
+ kfree(info->nh_res_bucket);
+}
+
+/* Emit NEXTHOP_EVENT_BUCKET_REPLACE on @net's nexthop notifier chain for
+ * bucket @bucket_index of group @nhg_id, with @oldi and @newi describing the
+ * nexthop before and after the replacement.
+ *
+ * Returns 0 when the chain has no listeners, the payload initialization
+ * error, or the errno derived from the chain's return value.
+ */
+static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
+					       u16 bucket_index, bool force,
+					       struct nh_info *oldi,
+					       struct nh_info *newi,
+					       struct netlink_ext_ack *extack)
+{
+	struct nh_notifier_info info = {
+		.net = net,
+		.extack = extack,
+		.id = nhg_id,
+	};
+	int rc;
+
+	if (nexthop_notifiers_is_empty(net))
+		return 0;
+
+	rc = nh_notifier_res_bucket_info_init(&info, bucket_index, force,
+					      oldi, newi);
+	if (rc)
+		return rc;
+
+	rc = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+					  NEXTHOP_EVENT_BUCKET_REPLACE, &info);
+	nh_notifier_res_bucket_info_fini(&info);
+
+	return notifier_to_errno(rc);
+}
+
+/* There are three users of RES_TABLE, and NHs etc. referenced from there:
+ *
+ * 1) a collection of callbacks for NH maintenance. This operates under
+ * RTNL,
+ * 2) the delayed work that gradually balances the resilient table,
+ * 3) and nexthop_select_path(), operating under RCU.
+ *
+ * Both the delayed work and the RTNL block are writers, and need to
+ * maintain mutual exclusion. Since there are only two and well-known
+ * writers for each table, the RTNL code can make sure it has exclusive
+ * access thus:
+ *
+ * - Have the DW operate without locking;
+ * - synchronously cancel the DW;
+ * - do the writing;
+ * - if the write was not actually a delete, call upkeep, which schedules
+ * DW again if necessary.
+ *
+ * The functions that are always called from the RTNL context use
+ * rtnl_dereference(). The functions that can also be called from the DW do
+ * a raw dereference and rely on the above mutual exclusion scheme.
+ * nh_res_dereference() is that raw dereference.
+ */
+#define nh_res_dereference(p) (rcu_dereference_raw(p))
+
+/* Convenience wrapper around __call_nexthop_res_bucket_notifiers() that takes
+ * the old and new nexthops themselves and resolves their nh_info via
+ * nh_res_dereference().
+ */
+static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id,
+					     u16 bucket_index, bool force,
+					     struct nexthop *old_nh,
+					     struct nexthop *new_nh,
+					     struct netlink_ext_ack *extack)
+{
+	struct nh_info *old_info, *new_info;
+
+	old_info = nh_res_dereference(old_nh->nh_info);
+	new_info = nh_res_dereference(new_nh->nh_info);
+
+	return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index,
+						   force, old_info, new_info,
+						   extack);
+}
+
+/* Emit NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE for group @nh on @net's nexthop
+ * notifier chain. Asserts that RTNL is held.
+ *
+ * Returns 0 when the chain has no listeners, the payload initialization
+ * error, or the errno derived from the chain's return value.
+ */
+static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh,
+					    struct netlink_ext_ack *extack)
+{
+	struct nh_notifier_info info = {
+		.net = net,
+		.extack = extack,
+		.id = nh->id,
+	};
+	int rc;
+
+	ASSERT_RTNL();
+
+	if (nexthop_notifiers_is_empty(net))
+		return 0;
+
+	/* At this point, the nexthop buckets are still not populated. Only
+	 * emit a notification with the logical nexthops, so that a listener
+	 * could potentially veto it in case of unsupported configuration.
+	 */
+	rc = nh_notifier_mpath_info_init(&info, rtnl_dereference(nh->nh_grp));
+	if (rc) {
+		NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info");
+		return rc;
+	}
+
+	rc = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+					  NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
+					  &info);
+	kfree(info.nh_grp);
+
+	return notifier_to_errno(rc);
+}
+
+/* Deliver @event_type for @nh to the single notifier block @nb, bypassing
+ * the notifier chain.
+ *
+ * Returns the payload initialization error, or the errno derived from the
+ * callback's return value.
+ */
+static int call_nexthop_notifier(struct notifier_block *nb, struct net *net,
+				 enum nexthop_event_type event_type,
+				 struct nexthop *nh,
+				 struct netlink_ext_ack *extack)
+{
+	struct nh_notifier_info info = {
+		.net = net,
+		.extack = extack,
+	};
+	int rc = nh_notifier_info_init(&info, nh);
+
+	if (rc)
+		return rc;
+
+	rc = nb->notifier_call(nb, event_type, &info);
+	nh_notifier_info_fini(&info, nh);
+
+	return notifier_to_errno(rc);
+}
+
+/* Fold @val into a device-hash bucket index: XOR @val with itself shifted
+ * right by one and by two hash widths, then mask with NH_DEV_HASHSIZE - 1.
+ */
+static unsigned int nh_dev_hashfn(unsigned int val)
+{
+	unsigned int folded = val;
+
+	folded ^= val >> NH_DEV_HASHBITS;
+	folded ^= val >> (NH_DEV_HASHBITS * 2);
+
+	return folded & (NH_DEV_HASHSIZE - 1);
+}
+
+/* Link @nhi into @net's per-device nexthop hash, in the bucket selected by
+ * the ifindex of the nexthop's device.
+ *
+ * A nexthop without a device is unexpected here: warn and leave it unhashed
+ * instead of going on to dereference the NULL device pointer.
+ */
+static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
+{
+	struct net_device *dev = nhi->fib_nhc.nhc_dev;
+	struct hlist_head *head;
+	unsigned int hash;
+
+	if (WARN_ON(!dev))
+		return;
+
+	hash = nh_dev_hashfn(dev->ifindex);
+	head = &net->nexthop.devhash[hash];
+	hlist_add_head(&nhi->dev_hash, head);
+}
+
+/* Release everything owned by the group behind @nh: each entry's per-CPU
+ * stats and its reference on the member nexthop, the resilient table (for
+ * resilient groups), the spare group and the group itself.
+ */
+static void nexthop_free_group(struct nexthop *nh)
+{
+	struct nh_group *nhg = rcu_dereference_raw(nh->nh_grp);
+	struct nh_grp_entry *nhge;
+	int i;
+
+	for (i = 0, nhge = nhg->nh_entries; i < nhg->num_nh; i++, nhge++) {
+		/* Entries are expected to be unlinked by now. */
+		WARN_ON(!list_empty(&nhge->nh_list));
+		free_percpu(nhge->stats);
+		nexthop_put(nhge->nh);
+	}
+
+	WARN_ON(nhg->spare == nhg);
+
+	if (nhg->resilient)
+		vfree(rcu_dereference_raw(nhg->res_table));
+
+	kfree(nhg->spare);
+	kfree(nhg);
+}
+
+/* Release the family-specific state of the single nexthop @nh and free its
+ * nh_info. Families other than AF_INET and AF_INET6 have nothing extra to
+ * release.
+ */
+static void nexthop_free_single(struct nexthop *nh)
+{
+	struct nh_info *nhi = rcu_dereference_raw(nh->nh_info);
+
+	if (nhi->family == AF_INET)
+		fib_nh_release(nh->net, &nhi->fib_nh);
+	else if (nhi->family == AF_INET6)
+		ipv6_stub->fib6_nh_release(&nhi->fib6_nh);
+
+	kfree(nhi);
+}
+
+/* RCU callback: free the nexthop that embeds @head, together with its group
+ * or single-nexthop payload.
+ */
+void nexthop_free_rcu(struct rcu_head *head)
+{
+	struct nexthop *nh = container_of(head, struct nexthop, rcu);
+
+	if (!nh->is_group)
+		nexthop_free_single(nh);
+	else
+		nexthop_free_group(nh);
+
+	kfree(nh);
+}
+EXPORT_SYMBOL_GPL(nexthop_free_rcu);
+
+/* Allocate a zeroed nexthop with its list heads and lock initialized.
+ * Returns NULL on allocation failure.
+ */
+static struct nexthop *nexthop_alloc(void)
+{
+	struct nexthop *nh = kzalloc(sizeof(*nh), GFP_KERNEL);
+
+	if (!nh)
+		return NULL;
+
+	INIT_LIST_HEAD(&nh->fi_list);
+	INIT_LIST_HEAD(&nh->f6i_list);
+	INIT_LIST_HEAD(&nh->grp_list);
+	INIT_LIST_HEAD(&nh->fdb_list);
+	spin_lock_init(&nh->lock);
+
+	return nh;
+}
+
+/* Allocate a zeroed nexthop group with room for @num_nh entries and record
+ * that count in it. Returns NULL on allocation failure.
+ */
+static struct nh_group *nexthop_grp_alloc(u16 num_nh)
+{
+	struct nh_group *nhg;
+
+	nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL);
+	if (!nhg)
+		return NULL;
+
+	nhg->num_nh = num_nh;
+	return nhg;
+}
+
+static void nh_res_table_upkeep_dw(struct work_struct *work);
+
+/* Allocate and initialize a zeroed resilient table for group @nhg_id in
+ * @net, sized and timed according to @cfg. The buckets themselves are left
+ * zeroed. Returns NULL on allocation failure.
+ */
+static struct nh_res_table *
+nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg)
+{
+	const u16 nbuckets = cfg->nh_grp_res_num_buckets;
+	struct nh_res_table *res_table;
+	unsigned long bytes;
+
+	bytes = struct_size(res_table, nh_buckets, nbuckets);
+	res_table = __vmalloc(bytes, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
+	if (!res_table)
+		return NULL;
+
+	res_table->net = net;
+	res_table->nhg_id = nhg_id;
+	res_table->num_nh_buckets = nbuckets;
+	res_table->idle_timer = cfg->nh_grp_res_idle_timer;
+	res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer;
+	INIT_LIST_HEAD(&res_table->uw_nh_entries);
+	INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw);
+
+	return res_table;
+}
+
+/* Advance @net's nexthop sequence counter, stepping over 0 on wrap-around so
+ * the counter is never left at 0.
+ */
+static void nh_base_seq_inc(struct net *net)
+{
+	do {
+		++net->nexthop.seq;
+	} while (net->nexthop.seq == 0);
+}
+
+/* Look up the nexthop with the given @id in @net's rbtree.
+ *
+ * No reference is taken; rcu lock or rtnl must be held. Returns NULL when
+ * no nexthop has that id.
+ */
+struct nexthop *nexthop_find_by_id(struct net *net, u32 id)
+{
+	struct rb_node *node;
+
+	node = rcu_dereference_raw(net->nexthop.rb_root.rb_node);
+	while (node) {
+		struct nexthop *nh = rb_entry(node, struct nexthop, rb_node);
+
+		if (id == nh->id)
+			return nh;
+
+		if (id < nh->id)
+			node = rcu_dereference_raw(node->rb_left);
+		else
+			node = rcu_dereference_raw(node->rb_right);
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nexthop_find_by_id);
+
+/* Used for auto id allocation; called with rtnl held.
+ *
+ * Advances net->nexthop.last_id_allocated until it lands on an id that no
+ * nexthop uses and returns that id. Returns 0 when the counter has gone all
+ * the way around without finding one.
+ *
+ * Because 0 is the "nothing available" return value, id 0 is never handed
+ * out: when the counter wraps to 0 it is skipped rather than probed, so a
+ * wrap-around does not get reported as exhaustion while ids are still free.
+ */
+static u32 nh_find_unused_id(struct net *net)
+{
+	u32 id_start = net->nexthop.last_id_allocated;
+
+	while (1) {
+		net->nexthop.last_id_allocated++;
+		if (net->nexthop.last_id_allocated == id_start)
+			break;
+
+		/* 0 doubles as the failure value; never return it as an id. */
+		if (!net->nexthop.last_id_allocated)
+			continue;
+
+		if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated))
+			return net->nexthop.last_id_allocated;
+	}
+	return 0;
+}
+
+/* Pull *@deadline in to @next_time when @next_time comes before it;
+ * otherwise leave *@deadline untouched.
+ */
+static void nh_res_time_set_deadline(unsigned long next_time,
+				     unsigned long *deadline)
+{
+	if (!time_before(next_time, *deadline))
+		return;
+
+	*deadline = next_time;
+}
+
+/* Return, in clock_t units, the time elapsed since
+ * @res_table->unbalanced_since, or 0 when the table has no underweight
+ * entries.
+ */
+static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table)
+{
+	unsigned long elapsed;
+
+	if (list_empty(&res_table->uw_nh_entries))
+		return 0;
+
+	elapsed = jiffies - res_table->unbalanced_since;
+	return jiffies_delta_to_clock_t(elapsed);
+}
+
+/* Append an NHA_RES_GROUP nest to @skb with the resilient table's bucket
+ * count, idle timer, unbalanced timer and current unbalanced time.
+ *
+ * Returns 0, or -EMSGSIZE (with the nest cancelled) when @skb is too small.
+ */
+static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg)
+{
+	struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
+	struct nlattr *nest;
+
+	nest = nla_nest_start(skb, NHA_RES_GROUP);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS,
+			res_table->num_nh_buckets))
+		goto cancel;
+
+	if (nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER,
+			jiffies_to_clock_t(res_table->idle_timer)))
+		goto cancel;
+
+	if (nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER,
+			jiffies_to_clock_t(res_table->unbalanced_timer)))
+		goto cancel;
+
+	if (nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME,
+			      nh_res_table_unbalanced_time(res_table),
+			      NHA_RES_GROUP_PAD))
+		goto cancel;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+cancel:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+/* Increment the packet counter in the current CPU's stats instance of @nhge.
+ * The increment is bracketed by u64_stats_update_begin()/_end() on that
+ * instance's syncp, and the per-CPU pointer is held via get_cpu_ptr() /
+ * put_cpu_ptr() for the duration.
+ */
+static void nh_grp_entry_stats_inc(struct nh_grp_entry *nhge)
+{
+ struct nh_grp_entry_stats *cpu_stats;
+
+ cpu_stats = get_cpu_ptr(nhge->stats);
+ u64_stats_update_begin(&cpu_stats->syncp);
+ u64_stats_inc(&cpu_stats->packets);
+ u64_stats_update_end(&cpu_stats->syncp);
+ put_cpu_ptr(cpu_stats);
+}
+
+/* Sum the per-CPU packet counters of @nhge over all possible CPUs and store
+ * the total in *@ret_packets.
+ */
+static void nh_grp_entry_stats_read(struct nh_grp_entry *nhge,
+ u64 *ret_packets)
+{
+ int i;
+
+ *ret_packets = 0;
+
+ for_each_possible_cpu(i) {
+ struct nh_grp_entry_stats *cpu_stats;
+ unsigned int start;
+ u64 packets;
+
+ cpu_stats = per_cpu_ptr(nhge->stats, i);
+ /* Re-read this CPU's counter for as long as
+ * u64_stats_fetch_retry() asks for a retry.
+ */
+ do {
+ start = u64_stats_fetch_begin(&cpu_stats->syncp);
+ packets = u64_stats_read(&cpu_stats->packets);
+ } while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
+
+ *ret_packets += packets;
+ }
+}
+
+/* Prepare @info as a NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS payload for group
+ * @nh: one zeroed stats slot per group member, each tagged with the
+ * member's nexthop id. Asserts that RTNL is held.
+ *
+ * Returns 0, or -ENOMEM when the payload cannot be allocated.
+ */
+static int nh_notifier_grp_hw_stats_init(struct nh_notifier_info *info,
+					 const struct nexthop *nh)
+{
+	struct nh_group *nhg;
+	int idx;
+
+	ASSERT_RTNL();
+	nhg = rtnl_dereference(nh->nh_grp);
+
+	info->id = nh->id;
+	info->type = NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS;
+	info->nh_grp_hw_stats = kzalloc(struct_size(info->nh_grp_hw_stats,
+						    stats, nhg->num_nh),
+					GFP_KERNEL);
+	if (!info->nh_grp_hw_stats)
+		return -ENOMEM;
+
+	info->nh_grp_hw_stats->num_nh = nhg->num_nh;
+	for (idx = 0; idx < nhg->num_nh; idx++)
+		info->nh_grp_hw_stats->stats[idx].id =
+			nhg->nh_entries[idx].nh->id;
+
+	return 0;
+}
+
+/* Free the HW stats payload that nh_notifier_grp_hw_stats_init() allocated
+ * in @info->nh_grp_hw_stats.
+ */
+static void nh_notifier_grp_hw_stats_fini(struct nh_notifier_info *info)
+{
+ kfree(info->nh_grp_hw_stats);
+}
+
+/* Add @delta_packets to the stats slot @nh_idx of @info and mark @info as
+ * having had HW stats reported.
+ *
+ * @nh_idx is used as-is to index info->stats; no bounds check is made here.
+ */
+void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info,
+				  unsigned int nh_idx,
+				  u64 delta_packets)
+{
+	info->stats[nh_idx].packets += delta_packets;
+	info->hw_stats_used = true;
+}
+EXPORT_SYMBOL(nh_grp_hw_stats_report_delta);
+
+/* Fold the per-member packet counts collected in @info->nh_grp_hw_stats
+ * into the packets_hw counters of group @nh's entries, matching slots to
+ * entries by index. Asserts that RTNL is held.
+ */
+static void nh_grp_hw_stats_apply_update(struct nexthop *nh,
+					 struct nh_notifier_info *info)
+{
+	struct nh_group *nhg;
+	int idx;
+
+	ASSERT_RTNL();
+	nhg = rtnl_dereference(nh->nh_grp);
+
+	for (idx = 0; idx < nhg->num_nh; idx++)
+		nhg->nh_entries[idx].packets_hw +=
+			info->nh_grp_hw_stats->stats[idx].packets;
+}
+
+/* Ask the listeners on the nexthop notifier chain for HW packet-count
+ * deltas of group @nh and accumulate them into the group's entries.
+ *
+ * *@hw_stats_used is set to false when nobody listens, otherwise to whether
+ * any listener reported stats. It is left untouched when the payload cannot
+ * be allocated. Returns 0, -ENOMEM, or the errno derived from the chain's
+ * return value.
+ */
+static int nh_grp_hw_stats_update(struct nexthop *nh, bool *hw_stats_used)
+{
+	struct net *net = nh->net;
+	struct nh_notifier_info info = {
+		.net = net,
+	};
+	int rc;
+
+	if (nexthop_notifiers_is_empty(net)) {
+		*hw_stats_used = false;
+		return 0;
+	}
+
+	rc = nh_notifier_grp_hw_stats_init(&info, nh);
+	if (rc)
+		return rc;
+
+	rc = blocking_notifier_call_chain(&net->nexthop.notifier_chain,
+					  NEXTHOP_EVENT_HW_STATS_REPORT_DELTA,
+					  &info);
+
+	/* Cache whatever we got, even if there was an error, otherwise the
+	 * successful stats retrievals would get lost.
+	 */
+	nh_grp_hw_stats_apply_update(nh, &info);
+	*hw_stats_used = info.nh_grp_hw_stats->hw_stats_used;
+	nh_notifier_grp_hw_stats_fini(&info);
+
+	return notifier_to_errno(rc);
+}
+
+/* Append one NHA_GROUP_STATS_ENTRY nest for @nhge to @skb: the member's
+ * nexthop id, its total packet count (per-CPU counters plus packets_hw)
+ * and, when NHA_OP_FLAG_DUMP_HW_STATS is set in @op_flags, packets_hw on
+ * its own.
+ *
+ * Returns 0, or -EMSGSIZE (with the nest cancelled) when @skb is too small.
+ */
+static int nla_put_nh_group_stats_entry(struct sk_buff *skb,
+					struct nh_grp_entry *nhge,
+					u32 op_flags)
+{
+	struct nlattr *nest;
+	u64 sw_packets;
+
+	nh_grp_entry_stats_read(nhge, &sw_packets);
+
+	nest = nla_nest_start(skb, NHA_GROUP_STATS_ENTRY);
+	if (!nest)
+		return -EMSGSIZE;
+
+	if (nla_put_u32(skb, NHA_GROUP_STATS_ENTRY_ID, nhge->nh->id))
+		goto cancel;
+
+	if (nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS,
+			 sw_packets + nhge->packets_hw))
+		goto cancel;
+
+	if ((op_flags & NHA_OP_FLAG_DUMP_HW_STATS) &&
+	    nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS_HW,
+			 nhge->packets_hw))
+		goto cancel;
+
+	nla_nest_end(skb, nest);
+	return 0;
+
+cancel:
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+/* Append the statistics attributes of group @nh to @skb:
+ * NHA_HW_STATS_ENABLE, then (only when NHA_OP_FLAG_DUMP_HW_STATS is set in
+ * @op_flags and HW stats are enabled on the group) NHA_HW_STATS_USED, then
+ * an NHA_GROUP_STATS nest with one entry per group member.
+ *
+ * Returns 0 on success, -EMSGSIZE when an attribute does not fit, or the
+ * error returned by nh_grp_hw_stats_update().
+ */
+static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh,
+ u32 op_flags)
+{
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+ struct nlattr *nest;
+ bool hw_stats_used;
+ int err;
+ int i;
+
+ if (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats))
+ goto err_out;
+
+ if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS &&
+ nhg->hw_stats) {
+ /* Refresh the cached HW counters before they are dumped below. */
+ err = nh_grp_hw_stats_update(nh, &hw_stats_used);
+ if (err)
+ goto out;
+
+ if (nla_put_u32(skb, NHA_HW_STATS_USED, hw_stats_used))
+ goto err_out;
+ }
+
+ nest = nla_nest_start(skb, NHA_GROUP_STATS);
+ if (!nest)
+ goto err_out;
+
+ for (i = 0; i < nhg->num_nh; i++)
+ if (nla_put_nh_group_stats_entry(skb, &nhg->nh_entries[i],
+ op_flags))
+ goto cancel_out;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+ /* cancel_out falls through to err_out, which sets -EMSGSIZE; the 'out'
+ * label is entered directly only with err already set by the caller
+ * path above.
+ */
+cancel_out:
+ nla_nest_cancel(skb, nest);
+err_out:
+ err = -EMSGSIZE;
+out:
+ return err;
+}
+
+/* Append the group description of @nh to @skb: NHA_GROUP_TYPE, an NHA_GROUP
+ * array with one struct nexthop_grp per member, the resilient-group nest for
+ * resilient groups, and the statistics attributes when
+ * NHA_OP_FLAG_DUMP_STATS is set in @op_flags.
+ *
+ * NHA_OP_FLAG_RESP_GRP_RESVD_0 is OR-ed into *@resp_op_flags before anything
+ * is written.
+ *
+ * Returns 0 on success. Every failure is reported as -EMSGSIZE, including a
+ * non-EMSGSIZE error coming back from nla_put_nh_group_stats().
+ */
+static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh,
+ u32 op_flags, u32 *resp_op_flags)
+{
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+ struct nexthop_grp *p;
+ size_t len = nhg->num_nh * sizeof(*p);
+ struct nlattr *nla;
+ u16 group_type = 0;
+ u16 weight;
+ int i;
+
+ *resp_op_flags |= NHA_OP_FLAG_RESP_GRP_RESVD_0;
+
+ if (nhg->hash_threshold)
+ group_type = NEXTHOP_GRP_TYPE_MPATH;
+ else if (nhg->resilient)
+ group_type = NEXTHOP_GRP_TYPE_RES;
+
+ if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
+ goto nla_put_failure;
+
+ /* Reserve the whole array up front and fill it in place. */
+ nla = nla_reserve(skb, NHA_GROUP, len);
+ if (!nla)
+ goto nla_put_failure;
+
+ p = nla_data(nla);
+ for (i = 0; i < nhg->num_nh; ++i) {
+ /* The entry's weight is reported minus one, split across
+ * .weight and .weight_high (bits 8 and up) -- presumably
+ * .weight holds only the low 8 bits; confirm against the
+ * struct nexthop_grp definition.
+ */
+ weight = nhg->nh_entries[i].weight - 1;
+
+ *p++ = (struct nexthop_grp) {
+ .id = nhg->nh_entries[i].nh->id,
+ .weight = weight,
+ .weight_high = weight >> 8,
+ };
+ }
+
+ if (nhg->resilient && nla_put_nh_group_res(skb, nhg))
+ goto nla_put_failure;
+
+ /* NOTE(review): NHA_HW_STATS_ENABLE is put here and again at the top
+ * of nla_put_nh_group_stats(), so a stats dump carries it twice.
+ */
+ if (op_flags & NHA_OP_FLAG_DUMP_STATS &&
+ (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats) ||
+ nla_put_nh_group_stats(skb, nh, op_flags)))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+/* Build one netlink message of type @event describing @nh into @skb.
+ *
+ * The message carries a struct nhmsg header and NHA_ID. For a group it then
+ * carries the optional NHA_FDB flag, the group attributes and NHA_OP_FLAGS.
+ * For a single nexthop it carries NHA_BLACKHOLE, NHA_FDB or NHA_OIF, followed
+ * (except for blackhole nexthops) by the gateway and encap attributes.
+ *
+ * Returns 0 on success, or -EMSGSIZE with the partial message cancelled.
+ */
+static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
+ int event, u32 portid, u32 seq, unsigned int nlflags,
+ u32 op_flags)
+{
+ struct fib6_nh *fib6_nh;
+ struct fib_nh *fib_nh;
+ struct nlmsghdr *nlh;
+ struct nh_info *nhi;
+ struct nhmsg *nhm;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ /* Family and scope start out as AF_UNSPEC / 0 and are only overwritten
+ * on the single-nexthop path below.
+ */
+ nhm = nlmsg_data(nlh);
+ nhm->nh_family = AF_UNSPEC;
+ nhm->nh_flags = nh->nh_flags;
+ nhm->nh_protocol = nh->protocol;
+ nhm->nh_scope = 0;
+ nhm->resvd = 0;
+
+ if (nla_put_u32(skb, NHA_ID, nh->id))
+ goto nla_put_failure;
+
+ if (nh->is_group) {
+ struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+ u32 resp_op_flags = 0;
+
+ if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB))
+ goto nla_put_failure;
+ if (nla_put_nh_group(skb, nh, op_flags, &resp_op_flags) ||
+ nla_put_u32(skb, NHA_OP_FLAGS, resp_op_flags))
+ goto nla_put_failure;
+ goto out;
+ }
+
+ nhi = rtnl_dereference(nh->nh_info);
+ nhm->nh_family = nhi->family;
+ if (nhi->reject_nh) {
+ /* A blackhole nexthop is complete once the flag is emitted. */
+ if (nla_put_flag(skb, NHA_BLACKHOLE))
+ goto nla_put_failure;
+ goto out;
+ } else if (nhi->fdb_nh) {
+ if (nla_put_flag(skb, NHA_FDB))
+ goto nla_put_failure;
+ } else {
+ const struct net_device *dev;
+
+ dev = nhi->fib_nhc.nhc_dev;
+ if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex))
+ goto nla_put_failure;
+ }
+
+ nhm->nh_scope = nhi->fib_nhc.nhc_scope;
+ /* The gateway is only emitted when a gateway family is set. */
+ switch (nhi->family) {
+ case AF_INET:
+ fib_nh = &nhi->fib_nh;
+ if (fib_nh->fib_nh_gw_family &&
+ nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4))
+ goto nla_put_failure;
+ break;
+
+ case AF_INET6:
+ fib6_nh = &nhi->fib6_nh;
+ if (fib6_nh->fib_nh_gw_family &&
+ nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6))
+ goto nla_put_failure;
+ break;
+ }
+
+ if (lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate,
+ NHA_ENCAP, NHA_ENCAP_TYPE) < 0)
+ goto nla_put_failure;
+
+out:
+ nlmsg_end(skb, nlh);
+ return 0;
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+/* Space needed for the NHA_RES_GROUP nest and its four attributes. The
+ * result does not depend on @nhg.
+ */
+static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg)
+{
+	size_t sz = nla_total_size(0);		/* NHA_RES_GROUP */
+
+	sz += nla_total_size(2);		/* NHA_RES_GROUP_BUCKETS */
+	sz += nla_total_size(4);		/* NHA_RES_GROUP_IDLE_TIMER */
+	sz += nla_total_size(4);		/* NHA_RES_GROUP_UNBALANCED_TIMER */
+	sz += nla_total_size_64bit(8);		/* NHA_RES_GROUP_UNBALANCED_TIME */
+
+	return sz;
+}
+
+/* Space needed for the group-specific attributes of @nh in a notification:
+ * the NHA_GROUP array, NHA_GROUP_TYPE, the NHA_FDB flag for fdb groups and
+ * the resilient-group nest for resilient groups.
+ *
+ * NHA_FDB is accounted for because nh_fill_node() emits it for fdb groups;
+ * leaving it out under-estimates the message size for those groups.
+ */
+static size_t nh_nlmsg_size_grp(struct nexthop *nh)
+{
+	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+	size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh;
+	size_t tot = nla_total_size(sz) +
+		nla_total_size(2); /* NHA_GROUP_TYPE */
+
+	if (nhg->fdb_nh)
+		tot += nla_total_size(0); /* NHA_FDB */
+
+	if (nhg->resilient)
+		tot += nh_nlmsg_size_grp_res(nhg);
+
+	return tot;
+}
+
+/* Space needed for the attributes of the single nexthop @nh: NHA_OIF, the
+ * gateway when one is configured, and the encap attributes when the nexthop
+ * has lwtunnel state.
+ */
+static size_t nh_nlmsg_size_single(struct nexthop *nh)
+{
+	struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+	size_t total;
+
+	/* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
+	 * are mutually exclusive
+	 */
+	total = nla_total_size(4); /* NHA_OIF */
+
+	/* NHA_GATEWAY */
+	if (nhi->family == AF_INET) {
+		if (nhi->fib_nh.fib_nh_gw_family)
+			total += nla_total_size(4);
+	} else if (nhi->family == AF_INET6) {
+		if (nhi->fib6_nh.fib_nh_gw_family)
+			total += nla_total_size(sizeof(const struct in6_addr));
+	}
+
+	if (nhi->fib_nhc.nhc_lwtstate) {
+		total += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate);
+		total += nla_total_size(2); /* NHA_ENCAP_TYPE */
+	}
+
+	return total;
+}
+
+/* Total space needed for a notification about @nh: the nhmsg header, NHA_ID,
+ * and either the group attributes plus NHA_OP_FLAGS or the single-nexthop
+ * attributes.
+ */
+static size_t nh_nlmsg_size(struct nexthop *nh)
+{
+	size_t base = NLMSG_ALIGN(sizeof(struct nhmsg)) +
+		      nla_total_size(4); /* NHA_ID */
+
+	if (!nh->is_group)
+		return base + nh_nlmsg_size_single(nh);
+
+	return base + nh_nlmsg_size_grp(nh) +
+	       nla_total_size(4); /* NHA_OP_FLAGS */
+}
+
+/* Send an @event notification about @nh to the RTNLGRP_NEXTHOP group.
+ *
+ * The request's netlink flags and sequence number are echoed when @info
+ * carries a netlink header. On allocation or fill failure the error is
+ * recorded on the group via rtnl_set_sk_err() instead.
+ */
+static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
+{
+	unsigned int nlflags = 0;
+	struct sk_buff *skb;
+	int err = -ENOBUFS;
+	u32 seq = 0;
+
+	if (info->nlh) {
+		nlflags = info->nlh->nlmsg_flags;
+		seq = info->nlh->nlmsg_seq;
+	}
+
+	skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any());
+	if (!skb)
+		goto errout;
+
+	err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags, 0);
+	if (err >= 0) {
+		rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP,
+			    info->nlh, gfp_any());
+		return;
+	}
+
+	/* -EMSGSIZE implies BUG in nh_nlmsg_size() */
+	WARN_ON(err == -EMSGSIZE);
+	kfree_skb(skb);
+errout:
+	rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err);
+}
+
+/* Read @bucket's used_time atomically and return it as an unsigned long. */
+static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket)
+{
+	long raw = atomic_long_read(&bucket->used_time);
+
+	return (unsigned long)raw;
+}
+
+/* Return the time at which @bucket counts as idle: @now when the bucket has
+ * not been used since it was migrated, otherwise its last-used time plus the
+ * table's idle timer.
+ */
+static unsigned long
+nh_res_bucket_idle_point(const struct nh_res_table *res_table,
+			 const struct nh_res_bucket *bucket,
+			 unsigned long now)
+{
+	unsigned long last_used = nh_res_bucket_used_time(bucket);
+
+	/* Bucket was not used since it was migrated. The idle time is now. */
+	return last_used == bucket->migrated_time ?
+	       now : last_used + res_table->idle_timer;
+}
+
+/* Return @res_table's unbalanced_since time advanced by its unbalanced
+ * timer.
+ */
+static unsigned long
+nh_res_table_unb_point(const struct nh_res_table *res_table)
+{
+	unsigned long since = res_table->unbalanced_since;
+
+	return since + res_table->unbalanced_timer;
+}
+
+/* Stamp @bucket's used_time and migrated_time with the same jiffies value,
+ * which is the state nh_res_bucket_idle_point() treats as "not used since
+ * migration". @res_table is not used.
+ */
+static void nh_res_bucket_set_idle(const struct nh_res_table *res_table,
+				   struct nh_res_bucket *bucket)
+{
+	const unsigned long stamp = jiffies;
+
+	atomic_long_set(&bucket->used_time, (long)stamp);
+	bucket->migrated_time = stamp;
+}
+
+/* Record the current jiffies as @bucket's last-used time. */
+static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket)
+{
+	long stamp = (long)jiffies;
+
+	atomic_long_set(&bucket->used_time, stamp);
+}
+
+/* Return, in clock_t units, the time elapsed since @bucket was last used. */
+static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket)
+{
+	unsigned long elapsed = jiffies - nh_res_bucket_used_time(bucket);
+
+	return jiffies_delta_to_clock_t(elapsed);
+}
+
+/* Build one netlink message of type @event describing resilient-group bucket
+ * @bucket_index of group @nh into @skb.
+ *
+ * The message carries a struct nhmsg header (flags taken from the bucket,
+ * protocol from the group), NHA_ID with the group's id, and an
+ * NHA_RES_BUCKET nest holding the bucket index, the id of the nexthop
+ * currently occupying the bucket, and the bucket's idle time.
+ * @extack is not used here.
+ *
+ * Returns 0 on success, or -EMSGSIZE with the partial message cancelled.
+ */
+static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh,
+ struct nh_res_bucket *bucket, u16 bucket_index,
+ int event, u32 portid, u32 seq,
+ unsigned int nlflags,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
+ struct nlmsghdr *nlh;
+ struct nlattr *nest;
+ struct nhmsg *nhm;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ nhm = nlmsg_data(nlh);
+ nhm->nh_family = AF_UNSPEC;
+ nhm->nh_flags = bucket->nh_flags;
+ nhm->nh_protocol = nh->protocol;
+ nhm->nh_scope = 0;
+ nhm->resvd = 0;
+
+ if (nla_put_u32(skb, NHA_ID, nh->id))
+ goto nla_put_failure;
+
+ nest = nla_nest_start(skb, NHA_RES_BUCKET);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) ||
+ nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) ||
+ nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME,
+ nh_res_bucket_idle_time(bucket),
+ NHA_RES_BUCKET_PAD))
+ goto nla_put_failure_nest;
+
+ nla_nest_end(skb, nest);
+ nlmsg_end(skb, nlh);
+ return 0;
+
+ /* The nest is cancelled first, then the whole message. */
+nla_put_failure_nest:
+ nla_nest_cancel(skb, nest);
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+/* Send an RTM_NEWNEXTHOPBUCKET notification (NLM_F_REPLACE) for bucket
+ * @bucket_index of @res_table to the RTNLGRP_NEXTHOP group.
+ *
+ * On allocation or fill failure the error is recorded on the group via
+ * rtnl_set_sk_err() instead.
+ */
+static void nexthop_bucket_notify(struct nh_res_table *res_table,
+				  u16 bucket_index)
+{
+	struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
+	struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry);
+	struct nexthop *grp = nhge->nh_parent;
+	struct sk_buff *skb;
+	int err;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb) {
+		err = -ENOBUFS;
+		goto errout;
+	}
+
+	err = nh_fill_res_bucket(skb, grp, bucket, bucket_index,
+				 RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE,
+				 NULL);
+	if (err < 0) {
+		kfree_skb(skb);
+		goto errout;
+	}
+
+	rtnl_notify(skb, grp->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL);
+	return;
+
+errout:
+	rtnl_set_sk_err(grp->net, RTNLGRP_NEXTHOP, err);
+}
+
+/* Check whether @nh may be a member of a group with @npaths members.
+ *
+ * Hash-threshold and resilient groups are rejected as members, and so is a
+ * blackhole nexthop when the group has more than one path. On success
+ * *@is_fdb reports whether @nh is an fdb nexthop (or fdb group). On failure
+ * an extack message is set and *@is_fdb is left untouched.
+ */
+static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
+			   bool *is_fdb, struct netlink_ext_ack *extack)
+{
+	struct nh_group *nhg;
+
+	if (!nh->is_group) {
+		struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+
+		if (nhi->reject_nh && npaths > 1) {
+			NL_SET_ERR_MSG(extack,
+				       "Blackhole nexthop can not be used in a group with more than 1 path");
+			return false;
+		}
+		*is_fdb = nhi->fdb_nh;
+		return true;
+	}
+
+	nhg = rtnl_dereference(nh->nh_grp);
+
+	/* Nesting groups within groups is not supported. */
+	if (nhg->hash_threshold) {
+		NL_SET_ERR_MSG(extack,
+			       "Hash-threshold group can not be a nexthop within a group");
+		return false;
+	}
+	if (nhg->resilient) {
+		NL_SET_ERR_MSG(extack,
+			       "Resilient group can not be a nexthop within a group");
+		return false;
+	}
+	*is_fdb = nhg->fdb_nh;
+
+	return true;
+}
+
+/* Check that @nh is acceptable as a member of an fdb group: it must be an
+ * fdb nexthop and its family must match the family already recorded in
+ * *@nh_family. The first member seen (while *@nh_family is AF_UNSPEC) sets
+ * the family.
+ *
+ * Returns 0, or -EINVAL with an extack message.
+ */
+static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
+				   struct netlink_ext_ack *extack)
+{
+	struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+
+	if (!nhi->fdb_nh) {
+		NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops");
+		return -EINVAL;
+	}
+
+	if (*nh_family == AF_UNSPEC) {
+		*nh_family = nhi->family;
+		return 0;
+	}
+
+	if (*nh_family != nhi->family) {
+		NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Validate the NHA_GROUP attribute in @tb and the other attributes that
+ * accompany it, without looking up any nexthop.
+ *
+ * Checks, in order: the attribute length, then per entry the reserved
+ * field, the weight and id uniqueness within the array, and finally that
+ * no attribute after NHA_GROUP_TYPE is set other than NHA_HW_STATS_ENABLE,
+ * NHA_FDB and (for resilient groups only) NHA_RES_GROUP.
+ * @net is not used here.
+ *
+ * Returns 0, or -EINVAL with an extack message describing the first
+ * violation found.
+ */
+static int nh_check_attr_group(struct net *net,
+ struct nlattr *tb[], size_t tb_size,
+ u16 nh_grp_type, struct netlink_ext_ack *extack)
+{
+ unsigned int len = nla_len(tb[NHA_GROUP]);
+ struct nexthop_grp *nhg;
+ unsigned int i, j;
+
+ /* NOTE(review): the mask test is a multiple-of-size check only if
+ * sizeof(struct nexthop_grp) is a power of two -- confirm.
+ */
+ if (!len || len & (sizeof(struct nexthop_grp) - 1)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid length for nexthop group attribute");
+ return -EINVAL;
+ }
+
+ /* convert len to number of nexthop ids */
+ len /= sizeof(*nhg);
+
+ nhg = nla_data(tb[NHA_GROUP]);
+ for (i = 0; i < len; ++i) {
+ if (nhg[i].resvd2) {
+ NL_SET_ERR_MSG(extack, "Reserved field in nexthop_grp must be 0");
+ return -EINVAL;
+ }
+ if (nexthop_grp_weight(&nhg[i]) == 0) {
+ /* 0xffff got passed in, representing weight of 0x10000,
+ * which is too heavy.
+ */
+ NL_SET_ERR_MSG(extack, "Invalid value for weight");
+ return -EINVAL;
+ }
+ /* Pairwise comparison against every later entry. */
+ for (j = i + 1; j < len; ++j) {
+ if (nhg[i].id == nhg[j].id) {
+ NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group");
+ return -EINVAL;
+ }
+ }
+ }
+
+ /* This re-assignment is redundant: nhg already points here and is not
+ * read again below.
+ */
+ nhg = nla_data(tb[NHA_GROUP]);
+ for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) {
+ if (!tb[i])
+ continue;
+ /* 'continue' accepts the attribute; 'break' (or no match) falls
+ * out of the switch to the rejection below.
+ */
+ switch (i) {
+ case NHA_HW_STATS_ENABLE:
+ case NHA_FDB:
+ continue;
+ case NHA_RES_GROUP:
+ if (nh_grp_type == NEXTHOP_GRP_TYPE_RES)
+ continue;
+ break;
+ }
+ NL_SET_ERR_MSG(extack,
+ "No other attributes can be set in nexthop groups");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Validate the members listed in NHA_GROUP against the nexthops that exist
+ * in @net: every id must resolve, every member must be allowed in a group,
+ * and fdb and non-fdb nexthops must not be mixed with the wrong kind of
+ * group (fdb-ness of the group is given by the presence of NHA_FDB).
+ *
+ * Returns 0, or -EINVAL with an extack message.
+ */
+static int nh_check_attr_group_rtnl(struct net *net, struct nlattr *tb[],
+				    struct netlink_ext_ack *extack)
+{
+	struct nexthop_grp *entries = nla_data(tb[NHA_GROUP]);
+	unsigned int count = nla_len(tb[NHA_GROUP]) / sizeof(*entries);
+	u8 grp_is_fdb = !!tb[NHA_FDB];
+	u8 family = AF_UNSPEC;
+	unsigned int idx;
+
+	for (idx = 0; idx < count; idx++) {
+		struct nexthop *member;
+		bool member_is_fdb;
+
+		member = nexthop_find_by_id(net, entries[idx].id);
+		if (!member) {
+			NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+			return -EINVAL;
+		}
+
+		if (!valid_group_nh(member, count, &member_is_fdb, extack))
+			return -EINVAL;
+
+		if (grp_is_fdb) {
+			if (nh_check_attr_fdb_group(member, &family, extack))
+				return -EINVAL;
+		} else if (member_is_fdb) {
+			NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/* Report whether the neighbour entry for @nh's IPv6 gateway on @nh's device
+ * is in a NUD_VALID state. When no neighbour entry exists the state is
+ * taken to be NUD_REACHABLE.
+ */
+static bool ipv6_good_nh(const struct fib6_nh *nh)
+{
+	struct neighbour *neigh;
+	int nud = NUD_REACHABLE;
+
+	rcu_read_lock();
+	neigh = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev,
+					       &nh->fib_nh_gw6);
+	if (neigh)
+		nud = READ_ONCE(neigh->nud_state);
+	rcu_read_unlock();
+
+	return (nud & NUD_VALID) != 0;
+}
+
+/* Report whether the neighbour entry for @nh's IPv4 gateway on @nh's device
+ * is in a NUD_VALID state. When no neighbour entry exists the state is
+ * taken to be NUD_REACHABLE.
+ */
+static bool ipv4_good_nh(const struct fib_nh *nh)
+{
+	struct neighbour *neigh;
+	int nud = NUD_REACHABLE;
+
+	rcu_read_lock();
+	neigh = __ipv4_neigh_lookup_noref(nh->fib_nh_dev,
+					  (__force u32)nh->fib_nh_gw4);
+	if (neigh)
+		nud = READ_ONCE(neigh->nud_state);
+	rcu_read_unlock();
+
+	return (nud & NUD_VALID) != 0;
+}
+
+/* Dispatch the neighbour-state check for @nh by address family. Families
+ * other than AF_INET and AF_INET6 are reported as not good.
+ */
+static bool nexthop_is_good_nh(const struct nexthop *nh)
+{
+	struct nh_info *nhi = rcu_dereference(nh->nh_info);
+
+	if (nhi->family == AF_INET)
+		return ipv4_good_nh(&nhi->fib_nh);
+
+	if (nhi->family == AF_INET6)
+		return ipv6_good_nh(&nhi->fib6_nh);
+
+	return false;
+}
+
+/* Pick the first entry of fdb group @nhg whose hash-threshold upper bound is
+ * not below @hash, count a packet against it and return its nexthop.
+ *
+ * Warns once and returns NULL when no entry matches.
+ */
+static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash)
+{
+	struct nh_grp_entry *nhge = nhg->nh_entries;
+	struct nh_grp_entry *end = nhge + nhg->num_nh;
+
+	for (; nhge < end; nhge++) {
+		if (hash <= atomic_read(&nhge->hthr.upper_bound)) {
+			nh_grp_entry_stats_inc(nhge);
+			return nhge->nh;
+		}
+	}
+
+	WARN_ON_ONCE(1);
+	return NULL;
+}
+
+/* Select a member of hash-threshold group @nhg for @hash.
+ *
+ * fdb groups are delegated to nexthop_select_path_fdb(). Otherwise the
+ * entries are scanned in order, skipping those whose neighbour is not good,
+ * and the first remaining entry whose upper bound is not below @hash wins.
+ * If none matches, the first good entry seen is used, and if there was no
+ * good entry at all, entry 0 is used. A packet is counted against whichever
+ * entry is returned.
+ */
+static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash)
+{
+ struct nh_grp_entry *nhge0 = NULL;
+ int i;
+
+ if (nhg->fdb_nh)
+ return nexthop_select_path_fdb(nhg, hash);
+
+ for (i = 0; i < nhg->num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+ /* nexthops always check if it is good and does
+ * not rely on a sysctl for this behavior
+ */
+ if (!nexthop_is_good_nh(nhge->nh))
+ continue;
+
+ /* Remember the first good entry as the fallback. */
+ if (!nhge0)
+ nhge0 = nhge;
+
+ if (hash > atomic_read(&nhge->hthr.upper_bound))
+ continue;
+
+ nh_grp_entry_stats_inc(nhge);
+ return nhge->nh;
+ }
+
+ if (!nhge0)
+ nhge0 = &nhg->nh_entries[0];
+ nh_grp_entry_stats_inc(nhge0);
+ return nhge0->nh;
+}
+
+/* Select a member of resilient group @nhg for @hash: map @hash onto a bucket
+ * with a modulo over the table's bucket count, mark that bucket as used now,
+ * count a packet against the entry occupying it and return that entry's
+ * nexthop.
+ *
+ * NOTE(review): @hash is a signed int and the modulo result is stored in a
+ * u16; this looks like it relies on callers passing a non-negative hash --
+ * confirm against the callers.
+ */
+static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash)
+{
+ struct nh_res_table *res_table = rcu_dereference(nhg->res_table);
+ u16 bucket_index = hash % res_table->num_nh_buckets;
+ struct nh_res_bucket *bucket;
+ struct nh_grp_entry *nhge;
+
+ /* nexthop_select_path() is expected to return a non-NULL value, so
+ * skip protocol validation and just hand out whatever there is.
+ */
+ bucket = &res_table->nh_buckets[bucket_index];
+ nh_res_bucket_set_busy(bucket);
+ nhge = rcu_dereference(bucket->nh_entry);
+ nh_grp_entry_stats_inc(nhge);
+ return nhge->nh;
+}
+
+/* Resolve @nh to the nexthop that should carry traffic hashed to @hash.
+ *
+ * A single nexthop is returned as-is. For a group, the member is chosen by
+ * the hash-threshold or the resilient algorithm according to the group's
+ * type.
+ */
+struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
+{
+	struct nh_group *nhg;
+
+	if (!nh->is_group)
+		return nh;
+
+	nhg = rcu_dereference(nh->nh_grp);
+
+	if (nhg->hash_threshold)
+		return nexthop_select_path_hthr(nhg, hash);
+
+	if (nhg->resilient)
+		return nexthop_select_path_res(nhg, hash);
+
+	/* Unreachable. */
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nexthop_select_path);
+
+/* Invoke @cb with @arg on the fib6_nh of @nh, or, when @nh is a group, on
+ * the fib6_nh of each member in order.
+ *
+ * Stops at and returns the first non-zero value @cb returns; returns 0 when
+ * every call returned 0.
+ */
+int nexthop_for_each_fib6_nh(struct nexthop *nh,
+			     int (*cb)(struct fib6_nh *nh, void *arg),
+			     void *arg)
+{
+	struct nh_group *nhg;
+	struct nh_info *nhi;
+	int idx;
+
+	if (!nh->is_group) {
+		nhi = rcu_dereference_rtnl(nh->nh_info);
+		return cb(&nhi->fib6_nh, arg);
+	}
+
+	nhg = rcu_dereference_rtnl(nh->nh_grp);
+	for (idx = 0; idx < nhg->num_nh; idx++) {
+		int rc;
+
+		nhi = rcu_dereference_rtnl(nhg->nh_entries[idx].nh->nh_info);
+		rc = cb(&nhi->fib6_nh, arg);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh);
+
+/* Reject any source address other than the unspecified one. Returns 0 when
+ * @saddr is the any-address, otherwise -EINVAL with an extack message.
+ */
+static int check_src_addr(const struct in6_addr *saddr,
+			  struct netlink_ext_ack *extack)
+{
+	if (ipv6_addr_any(saddr))
+		return 0;
+
+	NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
+	return -EINVAL;
+}
+
+/* Check that an IPv6 route may use @nh.
+ *
+ * Rejected with -EINVAL and an extack message: a config (@cfg may be NULL)
+ * that specifies a source address, a nexthop or group containing IPv4, and
+ * an fdb nexthop or group. Returns 0 otherwise.
+ */
+int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
+		       struct netlink_ext_ack *extack)
+{
+	bool uses_v4, is_fdb_nh;
+
+	/* fib6_src is unique to a fib6_info and limits the ability to cache
+	 * routes in fib6_nh within a nexthop that is potentially shared
+	 * across multiple fib entries. If the config wants to use source
+	 * routing it can not use nexthop objects. mlxsw also does not allow
+	 * fib6_src on routes.
+	 */
+	if (cfg && check_src_addr(&cfg->fc_src, extack) < 0)
+		return -EINVAL;
+
+	if (nh->is_group) {
+		struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
+
+		uses_v4 = nhg->has_v4;
+		is_fdb_nh = nhg->fdb_nh;
+	} else {
+		struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info);
+
+		uses_v4 = nhi->family == AF_INET;
+		is_fdb_nh = nhi->fdb_nh;
+	}
+
+	/* The IPv4 check takes precedence over the fdb check. */
+	if (uses_v4) {
+		NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
+		return -EINVAL;
+	}
+
+	if (is_fdb_nh) {
+		NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(fib6_check_nexthop);
+
+/* If the existing nexthop @old has IPv6 routes linked to it, verify that the
+ * replacement spec @new works with IPv6: none of those routes may use a
+ * source address, and @new itself must pass fib6_check_nexthop().
+ *
+ * Returns 0 when @old has no IPv6 routes or all checks pass, else -EINVAL.
+ */
+static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new,
+			      struct netlink_ext_ack *extack)
+{
+	struct fib6_info *rt;
+
+	if (list_empty(&old->f6i_list))
+		return 0;
+
+	list_for_each_entry(rt, &old->f6i_list, nh_list)
+		if (check_src_addr(&rt->fib6_src.addr, extack) < 0)
+			return -EINVAL;
+
+	return fib6_check_nexthop(new, NULL, extack);
+}
+
+/* Check route @scope against nexthop @nhi: a host-scope route may not use a
+ * nexthop with a gateway, and an on-link nexthop may not be used with a
+ * scope of RT_SCOPE_LINK or above.
+ *
+ * Returns 0, or -EINVAL with an extack message.
+ */
+static int nexthop_check_scope(struct nh_info *nhi, u8 scope,
+			       struct netlink_ext_ack *extack)
+{
+	const bool has_gw = nhi->fib_nhc.nhc_gw_family;
+	const bool onlink = nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK;
+
+	if (has_gw && scope == RT_SCOPE_HOST) {
+		NL_SET_ERR_MSG(extack,
+			       "Route with host scope can not have a gateway");
+		return -EINVAL;
+	}
+
+	if (onlink && scope >= RT_SCOPE_LINK) {
+		NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Invoked by fib add code to verify nexthop by id is ok with
+ * config for prefix; parts of fib_check_nh not done when nexthop
+ * object is used.
+ *
+ * fdb nexthops and groups are rejected, as is a group for a host-scope
+ * route; otherwise the result of nexthop_check_scope() is returned (for a
+ * group, checked against its first member).
+ */
+int fib_check_nexthop(struct nexthop *nh, u8 scope,
+		      struct netlink_ext_ack *extack)
+{
+	struct nh_group *nhg;
+	struct nh_info *nhi;
+
+	if (!nh->is_group) {
+		nhi = rtnl_dereference(nh->nh_info);
+		if (nhi->fdb_nh) {
+			NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+			return -EINVAL;
+		}
+		return nexthop_check_scope(nhi, scope, extack);
+	}
+
+	nhg = rtnl_dereference(nh->nh_grp);
+	if (nhg->fdb_nh) {
+		NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+		return -EINVAL;
+	}
+
+	if (scope == RT_SCOPE_HOST) {
+		NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
+		return -EINVAL;
+	}
+
+	/* all nexthops in a group have the same scope */
+	nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info);
+	return nexthop_check_scope(nhi, scope, extack);
+}
+
+/* Check @new against the scope of every fib_info linked on @old's fi_list. */
+static int fib_check_nh_list(struct nexthop *old, struct nexthop *new,
+ struct netlink_ext_ack *extack)
+{
+ struct fib_info *fi;
+ int err = 0;
+
+ list_for_each_entry(fi, &old->fi_list, nh_list) {
+ err = fib_check_nexthop(new, fi->fib_scope, extack);
+ if (err)
+ break;
+ }
+ return err;
+}
+
+static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge)
+{ /* True when the entry owns exactly as many buckets as it wants. */
+ return nhge->res.wants_buckets == nhge->res.count_buckets;
+}
+
+static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge)
+{ /* Overweight: the entry owns more buckets than it wants. */
+ return nhge->res.wants_buckets < nhge->res.count_buckets;
+}
+
+static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge)
+{ /* Underweight: the entry owns fewer buckets than it wants. */
+ return nhge->res.wants_buckets > nhge->res.count_buckets;
+}
+
+static bool nh_res_table_is_balanced(const struct nh_res_table *res_table)
+{ /* Balanced means the table's list of underweight entries is empty. */
+ return list_empty(&res_table->uw_nh_entries);
+}
+
+static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket)
+{ /* Give up @bucket's claim on its current entry; no-op when not occupied. */
+ struct nh_grp_entry *owner;
+
+ if (!bucket->occupied)
+ return;
+ owner = nh_res_dereference(bucket->nh_entry);
+ owner->res.count_buckets--;
+ bucket->occupied = false;
+}
+
+static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket,
+ struct nh_grp_entry *nhge)
+{ /* Make @nhge the owner of @bucket and account the bucket to it. */
+ nh_res_bucket_unset_nh(bucket); /* un-count the bucket from its previous owner, if occupied */
+
+ bucket->occupied = true;
+ rcu_assign_pointer(bucket->nh_entry, nhge);
+ nhge->res.count_buckets++;
+}
+
+static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table,
+ struct nh_res_bucket *bucket,
+ unsigned long *deadline, bool *force)
+{ /* Returns true if @bucket should be migrated now; *force is written only on true returns. */
+ unsigned long now = jiffies;
+ struct nh_grp_entry *nhge;
+ unsigned long idle_point;
+
+ if (!bucket->occupied) {
+ /* The bucket is not occupied, its NHGE pointer is either
+ * NULL or obsolete. We _have to_ migrate: set force.
+ */
+ *force = true;
+ return true;
+ }
+
+ nhge = nh_res_dereference(bucket->nh_entry);
+
+ /* If the bucket is populated by an underweight or balanced
+ * nexthop, do not migrate.
+ */
+ if (!nh_res_nhge_is_ow(nhge))
+ return false; /* neither *force nor *deadline is touched on this path */
+
+ /* At this point we know that the bucket is populated with an
+ * overweight nexthop. It needs to be migrated to a new nexthop if
+ * the idle timer or unbalanced timer expired.
+ */
+
+ idle_point = nh_res_bucket_idle_point(res_table, bucket, now);
+ if (time_after_eq(now, idle_point)) {
+ /* The bucket is idle. We _can_ migrate: unset force. */
+ *force = false;
+ return true;
+ }
+
+ /* Unbalanced timer of 0 means "never force". */
+ if (res_table->unbalanced_timer) {
+ unsigned long unb_point;
+
+ unb_point = nh_res_table_unb_point(res_table);
+ if (time_after(now, unb_point)) {
+ /* The bucket is not idle, but the unbalanced timer
+ * expired. We _can_ migrate, but set force anyway,
+ * so that drivers know to ignore activity reports
+ * from the HW.
+ */
+ *force = true;
+ return true;
+ }
+
+ nh_res_time_set_deadline(unb_point, deadline); /* presumably keeps the earlier time point - confirm in helper */
+ }
+
+ nh_res_time_set_deadline(idle_point, deadline); /* not due yet: report when this bucket could become idle */
+ return false;
+}
+
+static bool nh_res_bucket_migrate(struct nh_res_table *res_table,
+ u16 bucket_index, bool notify,
+ bool notify_nl, bool force)
+{ /* Hand bucket @bucket_index to the first underweight entry; false if nothing was migrated. */
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index];
+ struct nh_grp_entry *new_nhge;
+ struct netlink_ext_ack extack = {}; /* zeroed: _msg is printed below even if no listener set it */
+ int err;
+
+ new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries,
+ struct nh_grp_entry,
+ res.uw_nh_entry);
+ if (WARN_ON_ONCE(!new_nhge))
+ /* If this function is called, "bucket" is either not
+ * occupied, or it belongs to a next hop that is
+ * overweight. In either case, there ought to be a
+ * corresponding underweight next hop.
+ */
+ return false;
+
+ if (notify) {
+ struct nh_grp_entry *old_nhge;
+
+ old_nhge = nh_res_dereference(bucket->nh_entry);
+ err = call_nexthop_res_bucket_notifiers(res_table->net,
+ res_table->nhg_id,
+ bucket_index, force,
+ old_nhge->nh,
+ new_nhge->nh, &extack);
+ if (err) {
+ pr_err_ratelimited("%s\n", extack._msg);
+ if (!force)
+ return false; /* a non-forced migration can be vetoed by a listener */
+ /* It is not possible to veto a forced replacement, so
+ * just clear the hardware flags from the nexthop
+ * bucket to indicate to user space that this bucket is
+ * not correctly populated in hardware.
+ */
+ bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
+ }
+ }
+
+ nh_res_bucket_set_nh(bucket, new_nhge);
+ nh_res_bucket_set_idle(res_table, bucket);
+
+ if (notify_nl)
+ nexthop_bucket_notify(res_table, bucket_index);
+
+ if (nh_res_nhge_is_balanced(new_nhge))
+ list_del(&new_nhge->res.uw_nh_entry); /* entry now has all the buckets it wants */
+ return true;
+}
+
+#define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2) /* shortest delay used when re-queueing upkeep */
+
+static void nh_res_table_upkeep(struct nh_res_table *res_table,
+ bool notify, bool notify_nl)
+{ /* Migrate every bucket that is due, then re-queue the upkeep work while the table is unbalanced. */
+ unsigned long now = jiffies;
+ unsigned long deadline;
+ u16 i;
+
+ /* Deadline is the next time that upkeep should be run. It is the
+ * earliest time at which one of the buckets might be migrated.
+ * Start at the most pessimistic estimate: either unbalanced_timer
+ * from now, or if there is none, idle_timer from now. For each
+ * encountered time point, call nh_res_time_set_deadline() to
+ * refine the estimate.
+ */
+ if (res_table->unbalanced_timer)
+ deadline = now + res_table->unbalanced_timer;
+ else
+ deadline = now + res_table->idle_timer;
+
+ for (i = 0; i < res_table->num_nh_buckets; i++) {
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+ bool force; /* written by should_migrate() whenever it returns true */
+
+ if (nh_res_bucket_should_migrate(res_table, bucket,
+ &deadline, &force)) {
+ if (!nh_res_bucket_migrate(res_table, i, notify,
+ notify_nl, force)) {
+ unsigned long idle_point;
+
+ /* A driver can override the migration
+ * decision if the HW reports that the
+ * bucket is actually not idle. Therefore
+ * remark the bucket as busy again and
+ * update the deadline.
+ */
+ nh_res_bucket_set_busy(bucket);
+ idle_point = nh_res_bucket_idle_point(res_table,
+ bucket,
+ now);
+ nh_res_time_set_deadline(idle_point, &deadline);
+ }
+ }
+ }
+
+ /* If the group is still unbalanced, schedule the next upkeep to
+ * either the deadline computed above, or the minimum deadline,
+ * whichever comes later.
+ */
+ if (!nh_res_table_is_balanced(res_table)) {
+ unsigned long now = jiffies; /* re-sampled after the loop; shadows the outer 'now' */
+ unsigned long min_deadline;
+
+ min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL;
+ if (time_before(deadline, min_deadline))
+ deadline = min_deadline;
+
+ queue_delayed_work(system_power_efficient_wq,
+ &res_table->upkeep_dw, deadline - now); /* delay is relative to the inner 'now' */
+ }
+}
+
+static void nh_res_table_upkeep_dw(struct work_struct *work)
+{ /* Delayed-work callback: recover the owning table from @work and run upkeep. */
+ struct delayed_work *dw = to_delayed_work(work);
+ struct nh_res_table *res_table;
+
+ res_table = container_of(dw, struct nh_res_table, upkeep_dw);
+ nh_res_table_upkeep(res_table, true, true); /* both notify and notify_nl enabled */
+}
+
+static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table)
+{ /* Cancel the table's upkeep delayed work via cancel_delayed_work_sync(). */
+ cancel_delayed_work_sync(&res_table->upkeep_dw);
+}
+
+static void nh_res_group_rebalance(struct nh_group *nhg,
+ struct nh_res_table *res_table)
+{ /* Recompute each entry's wants_buckets from its weight and rebuild the underweight list. */
+ u16 prev_upper_bound = 0;
+ u32 total = 0; /* sum of all entry weights */
+ u32 w = 0; /* running sum of weights up to and including the current entry */
+ int i;
+
+ INIT_LIST_HEAD(&res_table->uw_nh_entries); /* the underweight list is rebuilt from scratch below */
+
+ for (i = 0; i < nhg->num_nh; ++i)
+ total += nhg->nh_entries[i].weight;
+
+ for (i = 0; i < nhg->num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+ u16 upper_bound;
+ u64 btw;
+
+ w += nhge->weight;
+ btw = ((u64)res_table->num_nh_buckets) * w; /* widened to u64 before multiplying */
+ upper_bound = DIV_ROUND_CLOSEST_ULL(btw, total); /* NOTE(review): divides by total; assumes it is non-zero */
+ nhge->res.wants_buckets = upper_bound - prev_upper_bound; /* share = difference of cumulative bounds */
+ prev_upper_bound = upper_bound;
+
+ if (nh_res_nhge_is_uw(nhge)) {
+ if (list_empty(&res_table->uw_nh_entries))
+ res_table->unbalanced_since = jiffies; /* stamped when the first underweight entry is found */
+ list_add(&nhge->res.uw_nh_entry,
+ &res_table->uw_nh_entries);
+ }
+ }
+}
+
+/* Migrate buckets in res_table so that they reference NHGE's from NHG with
+ * the right NH ID. Set those buckets that do not have a corresponding NHGE
+ * entry in NHG as not occupied.
+ */
+static void nh_res_table_migrate_buckets(struct nh_res_table *res_table,
+ struct nh_group *nhg)
+{
+ u16 i;
+
+ for (i = 0; i < res_table->num_nh_buckets; i++) {
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+ u32 id = rtnl_dereference(bucket->nh_entry)->nh->id; /* NOTE(review): assumes nh_entry is non-NULL for every bucket */
+ bool found = false;
+ int j;
+
+ for (j = 0; j < nhg->num_nh; j++) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[j];
+
+ if (nhge->nh->id == id) {
+ nh_res_bucket_set_nh(bucket, nhge); /* re-point at the entry in @nhg with the same NH ID */
+ found = true;
+ break;
+ }
+ }
+
+ if (!found)
+ nh_res_bucket_unset_nh(bucket); /* marks not occupied; the nh_entry pointer itself is left as is */
+ }
+}
+
+static void replace_nexthop_grp_res(struct nh_group *oldg,
+ struct nh_group *newg)
+{
+ /* For NH group replacement, the new NHG might only have a stub
+ * hash table with 0 buckets, because the number of buckets was not
+ * specified. For NH removal, oldg and newg both reference the same
+ * res_table. So in any case, in the following, we want to work
+ * with oldg->res_table.
+ */
+ struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table);
+ unsigned long prev_unbalanced_since = old_res_table->unbalanced_since;
+ bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries);
+
+ nh_res_table_cancel_upkeep(old_res_table); /* cancel queued upkeep before the buckets are rewritten */
+ nh_res_table_migrate_buckets(old_res_table, newg);
+ nh_res_group_rebalance(newg, old_res_table);
+ if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries))
+ old_res_table->unbalanced_since = prev_unbalanced_since; /* unbalanced before and after: keep the original timestamp */
+ nh_res_table_upkeep(old_res_table, true, false); /* notify=true, notify_nl=false */
+}
+
+static void nh_hthr_group_rebalance(struct nh_group *nhg)
+{ /* Recompute each entry's hash-threshold upper bound from the cumulative weights. */
+ u32 total = 0; /* sum of all entry weights */
+ u32 w = 0; /* running sum of weights up to and including the current entry */
+ int i;
+
+ for (i = 0; i < nhg->num_nh; ++i)
+ total += nhg->nh_entries[i].weight;
+
+ for (i = 0; i < nhg->num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+ u32 upper_bound;
+
+ w += nhge->weight;
+ upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; /* round(w * 2^31 / total) - 1; assumes total != 0 */
+ atomic_set(&nhge->hthr.upper_bound, upper_bound);
+ }
+}
+
+static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
+ struct nl_info *nlinfo)
+{ /* Remove @nhge from its parent group by rebuilding the group in its spare and publishing that. */
+ struct nh_grp_entry *nhges, *new_nhges;
+ struct nexthop *nhp = nhge->nh_parent;
+ struct netlink_ext_ack extack = {}; /* zeroed: _msg is printed below even if no listener set it */
+ struct nexthop *nh = nhge->nh;
+ struct nh_group *nhg, *newg;
+ int i, j, err;
+
+ WARN_ON(!nh);
+
+ nhg = rtnl_dereference(nhp->nh_grp);
+ newg = nhg->spare;
+
+ /* last entry, keep it visible and remove the parent */
+ if (nhg->num_nh == 1) {
+ remove_nexthop(net, nhp, nlinfo);
+ return;
+ }
+
+ newg->has_v4 = false; /* recomputed from the surviving entries in the loop below */
+ newg->is_multipath = nhg->is_multipath;
+ newg->hash_threshold = nhg->hash_threshold;
+ newg->resilient = nhg->resilient;
+ newg->fdb_nh = nhg->fdb_nh;
+ newg->num_nh = nhg->num_nh; /* decremented below for each entry that is dropped */
+
+ /* copy old entries to new except the one getting removed */
+ nhges = nhg->nh_entries;
+ new_nhges = newg->nh_entries;
+ for (i = 0, j = 0; i < nhg->num_nh; ++i) {
+ struct nh_info *nhi;
+
+ /* current nexthop getting removed */
+ if (nhg->nh_entries[i].nh == nh) {
+ newg->num_nh--;
+ continue;
+ }
+
+ nhi = rtnl_dereference(nhges[i].nh->nh_info);
+ if (nhi->family == AF_INET)
+ newg->has_v4 = true;
+
+ list_del(&nhges[i].nh_list); /* re-link the member's grp_list from the old slot to the new one */
+ new_nhges[j].stats = nhges[i].stats; /* the stats pointer is carried over, not reallocated */
+ new_nhges[j].nh_parent = nhges[i].nh_parent;
+ new_nhges[j].nh = nhges[i].nh;
+ new_nhges[j].weight = nhges[i].weight;
+ list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list);
+ j++;
+ }
+
+ if (newg->hash_threshold)
+ nh_hthr_group_rebalance(newg);
+ else if (newg->resilient)
+ replace_nexthop_grp_res(nhg, newg);
+
+ rcu_assign_pointer(nhp->nh_grp, newg); /* publish the rebuilt group */
+
+ list_del(&nhge->nh_list); /* release what the removed entry held: list link, stats, nexthop ref */
+ free_percpu(nhge->stats);
+ nexthop_put(nhge->nh);
+
+ /* Removal of a NH from a resilient group is notified through
+ * bucket notifications.
+ */
+ if (newg->hash_threshold) {
+ err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp,
+ &extack);
+ if (err)
+ pr_err("%s\n", extack._msg);
+ }
+
+ if (nlinfo)
+ nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
+}
+
+static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
+ struct nl_info *nlinfo)
+{ /* Remove @nh from every group that lists it as a member. */
+ struct nh_grp_entry *nhge, *tmp;
+
+ /* If there is nothing to do, let's avoid the costly call to
+ * synchronize_net()
+ */
+ if (list_empty(&nh->grp_list))
+ return;
+
+ list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) /* _safe: the callee unlinks nhge from this list */
+ remove_nh_grp_entry(net, nhge, nlinfo);
+
+ /* make sure all see the newly published array before releasing rtnl */
+ synchronize_net();
+}
+
+static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
+{ /* Unlink every member entry of group @nh; for resilient groups also cancel upkeep. @nlinfo is unused. */
+ struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp);
+ struct nh_res_table *res_table;
+ int i, num_nh = nhg->num_nh;
+
+ for (i = 0; i < num_nh; ++i) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+
+ if (WARN_ON(!nhge->nh))
+ continue;
+
+ list_del_init(&nhge->nh_list); /* take the entry off its member nexthop's grp_list */
+ }
+
+ if (nhg->resilient) {
+ res_table = rtnl_dereference(nhg->res_table);
+ nh_res_table_cancel_upkeep(res_table);
+ }
+}
+
+/* not called for nexthop replace */
+static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
+{ /* Mark IPv4 fib_infos using @nh dead and flush; mark @nh dead; delete its IPv6 routes. */
+ struct fib6_info *f6i;
+ bool do_flush = false;
+ struct fib_info *fi;
+
+ list_for_each_entry(fi, &nh->fi_list, nh_list) {
+ fi->fib_flags |= RTNH_F_DEAD;
+ do_flush = true;
+ }
+ if (do_flush)
+ fib_flush(net); /* only when at least one fib_info was marked dead */
+
+ spin_lock_bh(&nh->lock);
+
+ nh->dead = true;
+
+ while (!list_empty(&nh->f6i_list)) {
+ f6i = list_first_entry(&nh->f6i_list, typeof(*f6i), nh_list);
+
+ /* __ip6_del_rt does a release, so do a hold here */
+ fib6_info_hold(f6i);
+
+ spin_unlock_bh(&nh->lock); /* nh->lock is dropped around the route deletion call */
+ ipv6_stub->ip6_del_rt(net, f6i,
+ !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)); /* presumably unlinks f6i from f6i_list, else the loop would not end - confirm */
+
+ spin_lock_bh(&nh->lock);
+ }
+
+ spin_unlock_bh(&nh->lock);
+}
+
+static void __remove_nexthop(struct net *net, struct nexthop *nh,
+ struct nl_info *nlinfo)
+{ /* Detach @nh from its FIB users, then from its group members (group) or dev hash and parent groups (single). */
+ __remove_nexthop_fib(net, nh);
+
+ if (nh->is_group) {
+ remove_nexthop_group(nh, nlinfo);
+ } else {
+ struct nh_info *nhi;
+
+ nhi = rtnl_dereference(nh->nh_info);
+ if (nhi->fib_nhc.nhc_dev)
+ hlist_del(&nhi->dev_hash); /* unhash from the per-device list only when a device is set */
+
+ remove_nexthop_from_groups(net, nh, nlinfo);
+ }
+}
+
+static void remove_nexthop(struct net *net, struct nexthop *nh,
+ struct nl_info *nlinfo)
+{ /* Full removal of @nh: notify listeners, erase from the id tree, detach, and drop a reference. */
+ call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL); /* return value ignored; no extack passed */
+
+ /* remove from the tree */
+ rb_erase(&nh->rb_node, &net->nexthop.rb_root);
+
+ if (nlinfo)
+ nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); /* netlink notification is skipped when @nlinfo is NULL */
+
+ __remove_nexthop(net, nh, nlinfo);
+ nh_base_seq_inc(net);
+
+ nexthop_put(nh);
+}
+
+/* if any FIB entries reference this nexthop, any dst entries
+ * need to be regenerated
+ */
+static void nh_rt_cache_flush(struct net *net, struct nexthop *nh,
+ struct nexthop *replaced_nh)
+{
+ struct fib6_info *f6i;
+ struct nh_group *nhg;
+ int i;
+
+ if (!list_empty(&nh->fi_list))
+ rt_cache_flush(net); /* done only when IPv4 fib_infos reference @nh */
+
+ list_for_each_entry(f6i, &nh->f6i_list, nh_list)
+ ipv6_stub->fib6_update_sernum(net, f6i);
+
+ /* if an IPv6 group was replaced, we have to release all old
+ * dsts to make sure all refcounts are released
+ */
+ if (!replaced_nh->is_group)
+ return;
+
+ nhg = rtnl_dereference(replaced_nh->nh_grp);
+ for (i = 0; i < nhg->num_nh; i++) {
+ struct nh_grp_entry *nhge = &nhg->nh_entries[i];
+ struct nh_info *nhi = rtnl_dereference(nhge->nh->nh_info);
+
+ if (nhi->family == AF_INET6)
+ ipv6_stub->fib6_nh_release_dsts(&nhi->fib6_nh); /* only IPv6 members are released here */
+ }
+}
+
+static int replace_nexthop_grp(struct net *net, struct nexthop *old,
+ struct nexthop *new, const struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{ /* Replace group @old's contents with @new's by swapping their nh_group pointers; 0 or negative errno. */
+ struct nh_res_table *tmp_table = NULL;
+ struct nh_res_table *new_res_table;
+ struct nh_res_table *old_res_table;
+ struct nh_group *oldg, *newg;
+ int i, err;
+
+ if (!new->is_group) {
+ NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
+ return -EINVAL;
+ }
+
+ oldg = rtnl_dereference(old->nh_grp);
+ newg = rtnl_dereference(new->nh_grp);
+
+ if (newg->hash_threshold != oldg->hash_threshold) {
+ NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type.");
+ return -EINVAL;
+ }
+
+ if (newg->hash_threshold) {
+ err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new,
+ extack);
+ if (err)
+ return err; /* a listener rejected the replace; nothing has been changed yet */
+ } else if (newg->resilient) {
+ new_res_table = rtnl_dereference(newg->res_table);
+ old_res_table = rtnl_dereference(oldg->res_table);
+
+ /* Accept if num_nh_buckets was not given, but if it was
+ * given, demand that the value be correct.
+ */
+ if (cfg->nh_grp_res_has_num_buckets &&
+ cfg->nh_grp_res_num_buckets !=
+ old_res_table->num_nh_buckets) {
+ NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group.");
+ return -EINVAL;
+ }
+
+ /* Emit a pre-replace notification so that listeners could veto
+ * a potentially unsupported configuration. Otherwise,
+ * individual bucket replacement notifications would need to be
+ * vetoed, which is something that should only happen if the
+ * bucket is currently active.
+ */
+ err = call_nexthop_res_table_notifiers(net, new, extack);
+ if (err)
+ return err;
+
+ if (cfg->nh_grp_res_has_idle_timer)
+ old_res_table->idle_timer = cfg->nh_grp_res_idle_timer;
+ if (cfg->nh_grp_res_has_unbalanced_timer)
+ old_res_table->unbalanced_timer =
+ cfg->nh_grp_res_unbalanced_timer;
+
+ replace_nexthop_grp_res(oldg, newg); /* re-populates the old table from newg's entries */
+
+ tmp_table = new_res_table; /* kept so it can be handed to oldg after the swap */
+ rcu_assign_pointer(newg->res_table, old_res_table); /* the old table stays in service under newg */
+ rcu_assign_pointer(newg->spare->res_table, old_res_table);
+ }
+
+ /* update parents - used by nexthop code for cleanup */
+ for (i = 0; i < newg->num_nh; i++)
+ newg->nh_entries[i].nh_parent = old;
+
+ rcu_assign_pointer(old->nh_grp, newg); /* publish: 'old' now carries the new group */
+
+ /* Make sure concurrent readers are not using 'oldg' anymore. */
+ synchronize_net();
+
+ if (newg->resilient) {
+ rcu_assign_pointer(oldg->res_table, tmp_table);
+ rcu_assign_pointer(oldg->spare->res_table, tmp_table);
+ }
+
+ for (i = 0; i < oldg->num_nh; i++)
+ oldg->nh_entries[i].nh_parent = new;
+
+ rcu_assign_pointer(new->nh_grp, oldg); /* 'new' now carries the replaced group */
+
+ return 0;
+}
+
+static void nh_group_v4_update(struct nh_group *nhg)
+{ /* Recompute nhg->has_v4: true iff at least one member nexthop has family AF_INET. */
+ struct nh_grp_entry *nhges;
+ bool has_v4 = false;
+ int i;
+
+ nhges = nhg->nh_entries;
+ for (i = 0; i < nhg->num_nh; i++) {
+ struct nh_info *nhi;
+
+ nhi = rtnl_dereference(nhges[i].nh->nh_info);
+ if (nhi->family == AF_INET)
+ has_v4 = true;
+ }
+ nhg->has_v4 = has_v4; /* a single store once the scan is done */
+}
+
+static int replace_nexthop_single_notify_res(struct net *net,
+ struct nh_res_table *res_table,
+ struct nexthop *old,
+ struct nh_info *oldi,
+ struct nh_info *newi,
+ struct netlink_ext_ack *extack)
+{ /* Notify an oldi -> newi replacement for every bucket that uses @old; undo on failure. */
+ u32 nhg_id = res_table->nhg_id;
+ int err;
+ u16 i;
+
+ for (i = 0; i < res_table->num_nh_buckets; i++) {
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+ struct nh_grp_entry *nhge;
+
+ nhge = rtnl_dereference(bucket->nh_entry);
+ if (nhge->nh == old) {
+ err = __call_nexthop_res_bucket_notifiers(net, nhg_id,
+ i, true,
+ oldi, newi,
+ extack);
+ if (err)
+ goto err_notify;
+ }
+ }
+
+ return 0;
+
+err_notify:
+ while (i-- > 0) { /* walk back over buckets [0, i) that were notified before the failure */
+ struct nh_res_bucket *bucket = &res_table->nh_buckets[i];
+ struct nh_grp_entry *nhge;
+
+ nhge = rtnl_dereference(bucket->nh_entry);
+ if (nhge->nh == old)
+ __call_nexthop_res_bucket_notifiers(net, nhg_id, i,
+ true, newi, oldi,
+ extack); /* arguments swapped (newi -> oldi); result ignored */
+ }
+ return err;
+}
+
+static int replace_nexthop_single_notify(struct net *net,
+ struct nexthop *group_nh,
+ struct nexthop *old,
+ struct nh_info *oldi,
+ struct nh_info *newi,
+ struct netlink_ext_ack *extack)
+{ /* Tell listeners that member @old of group @group_nh changed from @oldi to @newi. */
+ struct nh_res_table *res_table;
+ struct nh_group *nhg;
+
+ nhg = rtnl_dereference(group_nh->nh_grp);
+ /* Hash-threshold groups get one whole-group REPLACE event. */
+ if (nhg->hash_threshold)
+ return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE,
+ group_nh, extack);
+ if (!nhg->resilient)
+ return -EINVAL;
+
+ /* Resilient groups are notified bucket by bucket. */
+ res_table = rtnl_dereference(nhg->res_table);
+ return replace_nexthop_single_notify_res(net, res_table, old, oldi, newi, extack);
+}
+
+static int replace_nexthop_single(struct net *net, struct nexthop *old,
+ struct nexthop *new,
+ struct netlink_ext_ack *extack)
+{ /* Replace single nexthop @old by swapping nh_info with @new; rolls back if a group notifier fails. */
+ u8 old_protocol, old_nh_flags;
+ struct nh_info *oldi, *newi;
+ struct nh_grp_entry *nhge;
+ int err;
+
+ if (new->is_group) {
+ NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
+ return -EINVAL;
+ }
+
+ if (!list_empty(&old->grp_list) &&
+ rtnl_dereference(new->nh_info)->fdb_nh !=
+ rtnl_dereference(old->nh_info)->fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Cannot change nexthop FDB status while in a group");
+ return -EINVAL;
+ }
+
+ err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack);
+ if (err)
+ return err; /* rejected by a listener before any state was changed */
+
+ /* Hardware flags were set on 'old' as 'new' is not in the red-black
+ * tree. Therefore, inherit the flags from 'old' to 'new'.
+ */
+ new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP);
+
+ oldi = rtnl_dereference(old->nh_info);
+ newi = rtnl_dereference(new->nh_info);
+
+ newi->nh_parent = old;
+ oldi->nh_parent = new;
+
+ old_protocol = old->protocol; /* saved so err_notify can restore them */
+ old_nh_flags = old->nh_flags;
+
+ old->protocol = new->protocol;
+ old->nh_flags = new->nh_flags;
+
+ rcu_assign_pointer(old->nh_info, newi); /* swap: 'old' keeps its identity but gets the new info */
+ rcu_assign_pointer(new->nh_info, oldi);
+
+ /* Send a replace notification for all the groups using the nexthop. */
+ list_for_each_entry(nhge, &old->grp_list, nh_list) {
+ struct nexthop *nhp = nhge->nh_parent;
+
+ err = replace_nexthop_single_notify(net, nhp, old, oldi, newi,
+ extack);
+ if (err)
+ goto err_notify;
+ }
+
+ /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially
+ * update IPv4 indication in all the groups using the nexthop.
+ */
+ if (oldi->family == AF_INET && newi->family == AF_INET6) {
+ list_for_each_entry(nhge, &old->grp_list, nh_list) {
+ struct nexthop *nhp = nhge->nh_parent;
+ struct nh_group *nhg;
+
+ nhg = rtnl_dereference(nhp->nh_grp);
+ nh_group_v4_update(nhg);
+ }
+ }
+
+ return 0;
+
+err_notify:
+ rcu_assign_pointer(new->nh_info, newi); /* undo the swap and restore the saved fields */
+ rcu_assign_pointer(old->nh_info, oldi);
+ old->nh_flags = old_nh_flags;
+ old->protocol = old_protocol;
+ oldi->nh_parent = old;
+ newi->nh_parent = new;
+ list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) { /* only groups notified before the failing one */
+ struct nexthop *nhp = nhge->nh_parent;
+
+ replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL); /* reverse direction, no extack, result ignored */
+ }
+ call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack);
+ return err;
+}
+
+static void __nexthop_replace_notify(struct net *net, struct nexthop *nh,
+ struct nl_info *info)
+{ /* Send route update notifications for the IPv4 and IPv6 FIB entries linked to @nh. */
+ struct fib6_info *f6i;
+
+ if (!list_empty(&nh->fi_list)) {
+ struct fib_info *fi;
+
+ /* expectation is a few fib_info per nexthop and then
+ * a lot of routes per fib_info. So mark the fib_info
+ * and then walk the fib tables once
+ */
+ list_for_each_entry(fi, &nh->fi_list, nh_list)
+ fi->nh_updated = true;
+
+ fib_info_notify_update(net, info);
+
+ list_for_each_entry(fi, &nh->fi_list, nh_list)
+ fi->nh_updated = false; /* clear the marks set above once the walk is done */
+ }
+
+ list_for_each_entry(f6i, &nh->f6i_list, nh_list)
+ ipv6_stub->fib6_rt_update(net, f6i, info);
+}
+
+/* send RTM_NEWROUTE with REPLACE flag set for all FIB entries
+ * linked to this nexthop and for all groups that the nexthop
+ * is a member of
+ */
+static void nexthop_replace_notify(struct net *net, struct nexthop *nh,
+ struct nl_info *info)
+{
+ struct nh_grp_entry *nhge;
+
+ __nexthop_replace_notify(net, nh, info); /* routes that use @nh directly */
+
+ list_for_each_entry(nhge, &nh->grp_list, nh_list)
+ __nexthop_replace_notify(net, nhge->nh_parent, info); /* routes that use a group containing @nh */
+}
+
+static int replace_nexthop(struct net *net, struct nexthop *old,
+ struct nexthop *new, const struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{ /* Validate @new against every user of @old, then do the group or single replace; 0 or negative errno. */
+ bool new_is_reject = false;
+ struct nh_grp_entry *nhge;
+ int err;
+
+ /* check that existing FIB entries are ok with the
+ * new nexthop definition
+ */
+ err = fib_check_nh_list(old, new, extack);
+ if (err)
+ return err;
+
+ err = fib6_check_nh_list(old, new, extack);
+ if (err)
+ return err;
+
+ if (!new->is_group) {
+ struct nh_info *nhi = rtnl_dereference(new->nh_info);
+
+ new_is_reject = nhi->reject_nh;
+ }
+
+ list_for_each_entry(nhge, &old->grp_list, nh_list) { /* every group that has @old as a member */
+ /* if new nexthop is a blackhole, any groups using this
+ * nexthop cannot have more than 1 path
+ */
+ if (new_is_reject &&
+ nexthop_num_path(nhge->nh_parent) > 1) {
+ NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path");
+ return -EINVAL;
+ }
+
+ err = fib_check_nh_list(nhge->nh_parent, new, extack);
+ if (err)
+ return err;
+
+ err = fib6_check_nh_list(nhge->nh_parent, new, extack);
+ if (err)
+ return err;
+ }
+
+ if (old->is_group)
+ err = replace_nexthop_grp(net, old, new, cfg, extack);
+ else
+ err = replace_nexthop_single(net, old, new, extack);
+
+ if (!err) {
+ nh_rt_cache_flush(net, old, new);
+
+ __remove_nexthop(net, new, NULL); /* after the swap 'new' carries the replaced state: tear it down */
+ nexthop_put(new);
+ }
+
+ return err;
+}
+
+/* called with rtnl_lock held */
+static int insert_nexthop(struct net *net, struct nexthop *new_nh,
+ struct nh_config *cfg, struct netlink_ext_ack *extack)
+{ /* Insert @new_nh into the per-netns id tree, or replace the same-id entry if NLM_F_REPLACE is set. */
+ struct rb_node **pp, *parent = NULL, *next;
+ struct rb_root *root = &net->nexthop.rb_root;
+ bool replace = !!(cfg->nlflags & NLM_F_REPLACE);
+ bool create = !!(cfg->nlflags & NLM_F_CREATE);
+ u32 new_id = new_nh->id;
+ int replace_notify = 0;
+ int rc = -EEXIST; /* result when the id exists and this is not a replace */
+
+ pp = &root->rb_node;
+ while (1) { /* descend the tree by id until a free slot or an equal id is found */
+ struct nexthop *nh;
+
+ next = *pp;
+ if (!next)
+ break;
+
+ parent = next;
+
+ nh = rb_entry(parent, struct nexthop, rb_node);
+ if (new_id < nh->id) {
+ pp = &next->rb_left;
+ } else if (new_id > nh->id) {
+ pp = &next->rb_right;
+ } else if (replace) {
+ rc = replace_nexthop(net, nh, new_nh, cfg, extack);
+ if (!rc) {
+ new_nh = nh; /* send notification with old nh */
+ replace_notify = 1;
+ }
+ goto out;
+ } else {
+ /* id already exists and not a replace */
+ goto out;
+ }
+ }
+
+ if (replace && !create) {
+ NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists");
+ rc = -ENOENT;
+ goto out;
+ }
+
+ if (new_nh->is_group) {
+ struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp);
+ struct nh_res_table *res_table;
+
+ if (nhg->resilient) {
+ res_table = rtnl_dereference(nhg->res_table);
+
+ /* Not passing the number of buckets is OK when
+ * replacing, but not when creating a new group.
+ */
+ if (!cfg->nh_grp_res_has_num_buckets) {
+ NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion");
+ rc = -EINVAL;
+ goto out;
+ }
+
+ nh_res_group_rebalance(nhg, res_table);
+
+ /* Do not send bucket notifications, we do full
+ * notification below.
+ */
+ nh_res_table_upkeep(res_table, false, false);
+ }
+ }
+
+ rb_link_node_rcu(&new_nh->rb_node, parent, pp);
+ rb_insert_color(&new_nh->rb_node, root);
+
+ /* The initial insertion is a full notification for hash-threshold as
+ * well as resilient groups.
+ */
+ rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack);
+ if (rc)
+ rb_erase(&new_nh->rb_node, &net->nexthop.rb_root); /* a listener rejected it: undo the tree insertion */
+
+out:
+ if (!rc) {
+ nh_base_seq_inc(net);
+ nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo);
+ if (replace_notify &&
+ READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode))
+ nexthop_replace_notify(net, new_nh, &cfg->nlinfo); /* route notifications only for a replace in compat mode */
+ }
+
+ return rc;
+}
+
+/* rtnl */
+/* remove all nexthops tied to a device being deleted */
+static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
+{
+ unsigned int hash = nh_dev_hashfn(dev->ifindex);
+ struct net *net = dev_net(dev);
+ struct hlist_head *head = &net->nexthop.devhash[hash];
+ struct hlist_node *n;
+ struct nh_info *nhi;
+
+ hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
+ if (nhi->fib_nhc.nhc_dev != dev)
+ continue; /* same hash chain, different device */
+
+ if (nhi->reject_nh &&
+ (event == NETDEV_DOWN || event == NETDEV_CHANGE))
+ continue; /* reject nexthops are kept across DOWN/CHANGE events */
+
+ remove_nexthop(net, nhi->nh_parent, NULL);
+ }
+}
+
+/* rtnl; called when net namespace is deleted */
+static void flush_all_nexthops(struct net *net)
+{
+ struct rb_root *root = &net->nexthop.rb_root;
+ struct rb_node *node;
+ struct nexthop *nh;
+
+ while ((node = rb_first(root))) { /* take the first node each time until the tree is empty */
+ nh = rb_entry(node, struct nexthop, rb_node);
+ remove_nexthop(net, nh, NULL); /* erases nh from the tree, so rb_first() advances */
+ cond_resched();
+ }
+}
+
+static struct nexthop *nexthop_create_group(struct net *net,
+ struct nh_config *cfg)
+{ /* Build a group nexthop from the cfg->nh_grp attribute; returns it or an ERR_PTR(). */
+ struct nlattr *grps_attr = cfg->nh_grp;
+ struct nexthop_grp *entry = nla_data(grps_attr);
+ u16 num_nh = nla_len(grps_attr) / sizeof(*entry); /* NOTE(review): zero is not rejected here; presumably validated by the caller */
+ struct nh_group *nhg;
+ struct nexthop *nh;
+ int err;
+ int i;
+
+ nh = nexthop_alloc();
+ if (!nh)
+ return ERR_PTR(-ENOMEM);
+
+ nh->is_group = 1;
+
+ nhg = nexthop_grp_alloc(num_nh);
+ if (!nhg) {
+ kfree(nh);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* spare group used for removals */
+ nhg->spare = nexthop_grp_alloc(num_nh);
+ if (!nhg->spare) {
+ kfree(nhg);
+ kfree(nh);
+ return ERR_PTR(-ENOMEM);
+ }
+ nhg->spare->spare = nhg; /* the two groups reference each other as spares */
+
+ for (i = 0; i < nhg->num_nh; ++i) { /* presumably nexthop_grp_alloc() set num_nh - confirm */
+ struct nexthop *nhe;
+ struct nh_info *nhi;
+
+ nhe = nexthop_find_by_id(net, entry[i].id);
+ if (!nexthop_get(nhe)) {
+ err = -ENOENT;
+ goto out_no_nh;
+ }
+
+ nhi = rtnl_dereference(nhe->nh_info);
+ if (nhi->family == AF_INET)
+ nhg->has_v4 = true;
+
+ nhg->nh_entries[i].stats =
+ netdev_alloc_pcpu_stats(struct nh_grp_entry_stats);
+ if (!nhg->nh_entries[i].stats) {
+ err = -ENOMEM;
+ nexthop_put(nhe); /* entry i is not filled in yet, so drop its reference here */
+ goto out_no_nh;
+ }
+ nhg->nh_entries[i].nh = nhe;
+ nhg->nh_entries[i].weight = nexthop_grp_weight(&entry[i]);
+
+ list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list);
+ nhg->nh_entries[i].nh_parent = nh;
+ }
+
+ if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
+ nhg->hash_threshold = 1;
+ nhg->is_multipath = true;
+ } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) {
+ struct nh_res_table *res_table;
+
+ res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg);
+ if (!res_table) {
+ err = -ENOMEM;
+ goto out_no_nh; /* i == nhg->num_nh here, so every entry is unwound */
+ }
+
+ rcu_assign_pointer(nhg->spare->res_table, res_table); /* group and spare share one table */
+ rcu_assign_pointer(nhg->res_table, res_table);
+ nhg->resilient = true;
+ nhg->is_multipath = true;
+ }
+
+ WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1); /* exactly one group type must be set */
+
+ if (nhg->hash_threshold)
+ nh_hthr_group_rebalance(nhg);
+
+ if (cfg->nh_fdb)
+ nhg->fdb_nh = 1;
+
+ if (cfg->nh_hw_stats)
+ nhg->hw_stats = true;
+
+ rcu_assign_pointer(nh->nh_grp, nhg);
+
+ return nh;
+
+out_no_nh:
+ for (i--; i >= 0; --i) { /* unwind entries [0, i) that were fully set up */
+ list_del(&nhg->nh_entries[i].nh_list);
+ free_percpu(nhg->nh_entries[i].stats);
+ nexthop_put(nhg->nh_entries[i].nh);
+ }
+
+ kfree(nhg->spare);
+ kfree(nhg);
+ kfree(nh);
+
+ return ERR_PTR(err);
+}
+
+static int nh_create_ipv4(struct net *net, struct nexthop *nh,
+ struct nh_info *nhi, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{ /* Initialise nhi->fib_nh from @cfg and, unless it is an fdb nexthop, validate it; 0 or negative errno. */
+ struct fib_nh *fib_nh = &nhi->fib_nh;
+ struct fib_config fib_cfg = {
+ .fc_oif = cfg->nh_ifindex,
+ .fc_gw4 = cfg->gw.ipv4,
+ .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, /* gateway family only when a gateway address is given */
+ .fc_flags = cfg->nh_flags,
+ .fc_nlinfo = cfg->nlinfo,
+ .fc_encap = cfg->nh_encap,
+ .fc_encap_type = cfg->nh_encap_type,
+ };
+ u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN);
+ int err;
+
+ err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
+ if (err) {
+ fib_nh_release(net, fib_nh);
+ goto out;
+ }
+
+ if (nhi->fdb_nh)
+ goto out; /* fdb nexthops skip fib_check_nh(); err is 0 here */
+
+ /* sets nh_dev if successful */
+ err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
+ if (!err) {
+ nh->nh_flags = fib_nh->fib_nh_flags;
+ fib_info_update_nhc_saddr(net, &fib_nh->nh_common,
+ !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1); /* scope - 1, clamped at 0 */
+ } else {
+ fib_nh_release(net, fib_nh);
+ }
+out:
+ return err;
+}
+
+static int nh_create_ipv6(struct net *net, struct nexthop *nh,
+ struct nh_info *nhi, struct nh_config *cfg,
+ struct netlink_ext_ack *extack)
+{ /* Initialise nhi->fib6_nh from @cfg through ipv6_stub; 0 or negative errno. */
+ struct fib6_nh *fib6_nh = &nhi->fib6_nh;
+ struct fib6_config fib6_cfg = {
+ .fc_table = l3mdev_fib_table(cfg->dev),
+ .fc_ifindex = cfg->nh_ifindex,
+ .fc_gateway = cfg->gw.ipv6,
+ .fc_flags = cfg->nh_flags,
+ .fc_nlinfo = cfg->nlinfo,
+ .fc_encap = cfg->nh_encap,
+ .fc_encap_type = cfg->nh_encap_type,
+ .fc_is_fdb = cfg->nh_fdb,
+ };
+ int err;
+
+ if (!ipv6_addr_any(&cfg->gw.ipv6))
+ fib6_cfg.fc_flags |= RTF_GATEWAY; /* a non-any gateway address marks this as a gateway nexthop */
+
+ /* sets nh_dev if successful */
+ err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL,
+ extack);
+ if (err) {
+ /* IPv6 is not enabled, don't call fib6_nh_release */
+ if (err == -EAFNOSUPPORT)
+ goto out;
+ ipv6_stub->fib6_nh_release(fib6_nh);
+ } else {
+ nh->nh_flags = fib6_nh->fib_nh_flags;
+ }
+out:
+ return err;
+}
+
+/* Allocate and initialise a single (non-group) nexthop described by @cfg.
+ *
+ * Returns the new nexthop with its nh_info published, or an ERR_PTR() if an
+ * allocation or the family specific setup fails.
+ */
+static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
+				      struct netlink_ext_ack *extack)
+{
+	struct nexthop *nh;
+	struct nh_info *nhi;
+	int err = 0;
+
+	nh = nexthop_alloc();
+	if (!nh)
+		return ERR_PTR(-ENOMEM);
+
+	nhi = kzalloc(sizeof(*nhi), GFP_KERNEL);
+	if (!nhi) {
+		err = -ENOMEM;
+		goto free_nh;
+	}
+
+	nh->net = net;
+	nh->nh_flags = cfg->nh_flags;
+
+	nhi->family = cfg->nh_family;
+	nhi->nh_parent = nh;
+	nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
+
+	if (cfg->nh_fdb)
+		nhi->fdb_nh = 1;
+
+	/* blackhole nexthops are bound to the netns loopback device */
+	if (cfg->nh_blackhole) {
+		nhi->reject_nh = 1;
+		cfg->nh_ifindex = net->loopback_dev->ifindex;
+	}
+
+	if (cfg->nh_family == AF_INET)
+		err = nh_create_ipv4(net, nh, nhi, cfg, extack);
+	else if (cfg->nh_family == AF_INET6)
+		err = nh_create_ipv6(net, nh, nhi, cfg, extack);
+
+	if (err)
+		goto free_nhi;
+
+	/* add the entry to the device based hash */
+	if (!nhi->fdb_nh)
+		nexthop_devhash_add(net, nhi);
+
+	rcu_assign_pointer(nh->nh_info, nhi);
+
+	return nh;
+
+free_nhi:
+	kfree(nhi);
+free_nh:
+	kfree(nh);
+	return ERR_PTR(err);
+}
+
+/* Create a nexthop (single or group) from @cfg and insert it.
+ *
+ * called with rtnl lock held
+ *
+ * Returns the inserted nexthop or an ERR_PTR(). If insertion fails, the
+ * freshly created nexthop is removed and its reference dropped.
+ */
+static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
+				   struct netlink_ext_ack *extack)
+{
+	struct nexthop *nh;
+	int err;
+
+	/* No id supplied by the caller: pick an unused one. */
+	if (!cfg->nh_id) {
+		cfg->nh_id = nh_find_unused_id(net);
+		if (!cfg->nh_id) {
+			NL_SET_ERR_MSG(extack, "No unused id");
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	nh = cfg->nh_grp ? nexthop_create_group(net, cfg) :
+			   nexthop_create(net, cfg, extack);
+	if (IS_ERR(nh))
+		return nh;
+
+	nh->net = net;
+	nh->id = cfg->nh_id;
+	nh->protocol = cfg->nh_protocol;
+	refcount_set(&nh->refcnt, 1);
+
+	err = insert_nexthop(net, nh, cfg, extack);
+	if (!err)
+		return nh;
+
+	__remove_nexthop(net, nh, NULL);
+	nexthop_put(nh);
+	return ERR_PTR(err);
+}
+
+/* Read an optional u32 timer attribute and convert it with
+ * clock_t_to_jiffies().
+ *
+ * When @attr is absent, *@timer_p is set to @fallback and *@has_p to false.
+ * Returns -EINVAL if the converted value is ~0UL, 0 otherwise.
+ */
+static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback,
+			    unsigned long *timer_p, bool *has_p,
+			    struct netlink_ext_ack *extack)
+{
+	unsigned long ticks = fallback;
+	bool present = false;
+
+	if (attr) {
+		ticks = clock_t_to_jiffies(nla_get_u32(attr));
+		if (ticks == ~0UL) {
+			NL_SET_ERR_MSG(extack, "Timer value too large");
+			return -EINVAL;
+		}
+		present = true;
+	}
+
+	*timer_p = ticks;
+	*has_p = present;
+	return 0;
+}
+
+/* Fill the resilient-group fields of @cfg from the nested @res attribute.
+ *
+ * @res may be NULL, in which case no bucket count is recorded and both
+ * timers take their NH_RES_DEFAULT_* fallback values. A bucket count of
+ * zero is rejected with -EINVAL.
+ */
+static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg,
+				    struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {};
+	struct nlattr *buckets;
+	int err;
+
+	if (res) {
+		err = nla_parse_nested(tb,
+				       ARRAY_SIZE(rtm_nh_res_policy_new) - 1,
+				       res, rtm_nh_res_policy_new, extack);
+		if (err < 0)
+			return err;
+	}
+
+	buckets = tb[NHA_RES_GROUP_BUCKETS];
+	if (buckets) {
+		cfg->nh_grp_res_num_buckets = nla_get_u16(buckets);
+		cfg->nh_grp_res_has_num_buckets = true;
+		if (!cfg->nh_grp_res_num_buckets) {
+			NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0");
+			return -EINVAL;
+		}
+	}
+
+	err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER],
+			       NH_RES_DEFAULT_IDLE_TIMER,
+			       &cfg->nh_grp_res_idle_timer,
+			       &cfg->nh_grp_res_has_idle_timer,
+			       extack);
+	if (!err)
+		err = rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER],
+				       NH_RES_DEFAULT_UNBALANCED_TIMER,
+				       &cfg->nh_grp_res_unbalanced_timer,
+				       &cfg->nh_grp_res_has_unbalanced_timer,
+				       extack);
+
+	return err;
+}
+
+/* Validate a new-nexthop request and translate it into @cfg.
+ *
+ * Only checks that need nothing beyond the message itself are done here;
+ * device lookup is left to rtm_to_nh_config_rtnl(). @cfg is zeroed and then
+ * filled from the nhmsg header and the parsed attributes in @tb.
+ *
+ * Three shapes of request are accepted:
+ *  - group (NHA_GROUP): family must be AF_UNSPEC, no other attributes are
+ *    processed beyond the group ones;
+ *  - blackhole (NHA_BLACKHOLE): must not carry gateway/oif/encap/fdb;
+ *  - regular or fdb nexthop: optional gateway and encap.
+ *
+ * Returns 0 on success or a negative errno with @extack set.
+ */
+static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
+			    struct nlmsghdr *nlh, struct nlattr **tb,
+			    struct nh_config *cfg,
+			    struct netlink_ext_ack *extack)
+{
+	struct nhmsg *nhm = nlmsg_data(nlh);
+	int err;
+
+	err = -EINVAL;
+	if (nhm->resvd || nhm->nh_scope) {
+		NL_SET_ERR_MSG(extack, "Invalid values in ancillary header");
+		goto out;
+	}
+	if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) {
+		NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header");
+		goto out;
+	}
+
+	switch (nhm->nh_family) {
+	case AF_INET:
+	case AF_INET6:
+		break;
+	case AF_UNSPEC:
+		/* AF_UNSPEC is only valid for groups */
+		if (tb[NHA_GROUP])
+			break;
+		fallthrough;
+	default:
+		NL_SET_ERR_MSG(extack, "Invalid address family");
+		goto out;
+	}
+
+	memset(cfg, 0, sizeof(*cfg));
+	cfg->nlflags = nlh->nlmsg_flags;
+	cfg->nlinfo.portid = NETLINK_CB(skb).portid;
+	cfg->nlinfo.nlh = nlh;
+	cfg->nlinfo.nl_net = net;
+
+	cfg->nh_family = nhm->nh_family;
+	cfg->nh_protocol = nhm->nh_protocol;
+	cfg->nh_flags = nhm->nh_flags;
+
+	if (tb[NHA_ID])
+		cfg->nh_id = nla_get_u32(tb[NHA_ID]);
+
+	if (tb[NHA_FDB]) {
+		if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] ||
+		    tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) {
+			NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole");
+			goto out;
+		}
+		if (nhm->nh_flags) {
+			NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header");
+			goto out;
+		}
+		cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
+	}
+
+	if (tb[NHA_GROUP]) {
+		if (nhm->nh_family != AF_UNSPEC) {
+			NL_SET_ERR_MSG(extack, "Invalid family for group");
+			goto out;
+		}
+		cfg->nh_grp = tb[NHA_GROUP];
+
+		/* multipath is the default group type */
+		cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
+		if (tb[NHA_GROUP_TYPE])
+			cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
+
+		if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
+			NL_SET_ERR_MSG(extack, "Invalid group type");
+			goto out;
+		}
+
+		err = nh_check_attr_group(net, tb, ARRAY_SIZE(rtm_nh_policy_new),
+					  cfg->nh_grp_type, extack);
+		if (err)
+			goto out;
+
+		if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES)
+			err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP],
+						       cfg, extack);
+
+		if (tb[NHA_HW_STATS_ENABLE])
+			cfg->nh_hw_stats = nla_get_u32(tb[NHA_HW_STATS_ENABLE]);
+
+		/* no other attributes should be set */
+		goto out;
+	}
+
+	if (tb[NHA_BLACKHOLE]) {
+		if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
+		    tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) {
+			NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb");
+			goto out;
+		}
+
+		cfg->nh_blackhole = 1;
+		err = 0;
+		goto out;
+	}
+
+	if (!cfg->nh_fdb && !tb[NHA_OIF]) {
+		NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
+		goto out;
+	}
+
+	err = -EINVAL;
+	if (tb[NHA_GATEWAY]) {
+		struct nlattr *gwa = tb[NHA_GATEWAY];
+
+		/* gateway length must match the address family */
+		switch (cfg->nh_family) {
+		case AF_INET:
+			if (nla_len(gwa) != sizeof(u32)) {
+				NL_SET_ERR_MSG(extack, "Invalid gateway");
+				goto out;
+			}
+			cfg->gw.ipv4 = nla_get_be32(gwa);
+			break;
+		case AF_INET6:
+			if (nla_len(gwa) != sizeof(struct in6_addr)) {
+				NL_SET_ERR_MSG(extack, "Invalid gateway");
+				goto out;
+			}
+			cfg->gw.ipv6 = nla_get_in6_addr(gwa);
+			break;
+		default:
+			NL_SET_ERR_MSG(extack,
+				       "Unknown address family for gateway");
+			goto out;
+		}
+	} else {
+		/* device only nexthop (no gateway) */
+		if (cfg->nh_flags & RTNH_F_ONLINK) {
+			NL_SET_ERR_MSG(extack,
+				       "ONLINK flag can not be set for nexthop without a gateway");
+			goto out;
+		}
+	}
+
+	if (tb[NHA_ENCAP]) {
+		cfg->nh_encap = tb[NHA_ENCAP];
+
+		if (!tb[NHA_ENCAP_TYPE]) {
+			NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing");
+			goto out;
+		}
+
+		cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]);
+		err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack);
+		if (err < 0)
+			goto out;
+
+	} else if (tb[NHA_ENCAP_TYPE]) {
+		NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing");
+		goto out;
+	}
+
+	if (tb[NHA_HW_STATS_ENABLE]) {
+		NL_SET_ERR_MSG(extack, "Cannot enable nexthop hardware statistics for non-group nexthops");
+		/* err may have been cleared by a successful encap type
+		 * validation above; make sure the request is rejected.
+		 */
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = 0;
+out:
+	return err;
+}
+
+/* Second stage of request validation.
+ *
+ * Groups are handed to nh_check_attr_group_rtnl(). For other nexthops an
+ * NHA_OIF attribute, when present, is resolved to a device that must exist,
+ * be IFF_UP and have carrier.
+ */
+static int rtm_to_nh_config_rtnl(struct net *net, struct nlattr **tb,
+				 struct nh_config *cfg,
+				 struct netlink_ext_ack *extack)
+{
+	if (tb[NHA_GROUP])
+		return nh_check_attr_group_rtnl(net, tb, extack);
+
+	/* nothing to resolve without an output interface */
+	if (!tb[NHA_OIF])
+		return 0;
+
+	cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
+	if (cfg->nh_ifindex)
+		cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
+
+	if (!cfg->dev) {
+		NL_SET_ERR_MSG(extack, "Invalid device index");
+		return -EINVAL;
+	}
+
+	if (!(cfg->dev->flags & IFF_UP)) {
+		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+		return -ENETDOWN;
+	}
+
+	if (!netif_carrier_ok(cfg->dev)) {
+		NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
+		return -ENETDOWN;
+	}
+
+	return 0;
+}
+
+/* RTM_NEWNEXTHOP handler.
+ *
+ * Parses and validates the request without the lock, then takes the per-netns
+ * RTNL lock for device resolution and the actual add.
+ */
+static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
+			   struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)];
+	struct net *net = sock_net(skb->sk);
+	struct nh_config cfg;
+	struct nexthop *nh;
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+			  ARRAY_SIZE(rtm_nh_policy_new) - 1,
+			  rtm_nh_policy_new, extack);
+	if (err < 0)
+		return err;
+
+	err = rtm_to_nh_config(net, skb, nlh, tb, &cfg, extack);
+	if (err)
+		return err;
+
+	if (cfg.nlflags & NLM_F_REPLACE && !cfg.nh_id) {
+		NL_SET_ERR_MSG(extack, "Replace requires nexthop id");
+		return -EINVAL;
+	}
+
+	rtnl_net_lock(net);
+
+	err = rtm_to_nh_config_rtnl(net, tb, &cfg, extack);
+	if (!err) {
+		nh = nexthop_add(net, &cfg, extack);
+		if (IS_ERR(nh))
+			err = PTR_ERR(nh);
+	}
+
+	rtnl_net_unlock(net);
+
+	return err;
+}
+
+/* Common validation for get/delete style requests.
+ *
+ * The nhmsg header must have protocol, resvd, scope and flags all zero, and
+ * NHA_ID must be present and non-zero. The id is stored in *@id; when
+ * @op_flags is not NULL it receives NHA_OP_FLAGS (0 if absent).
+ */
+static int nh_valid_get_del_req(const struct nlmsghdr *nlh,
+				struct nlattr **tb, u32 *id, u32 *op_flags,
+				struct netlink_ext_ack *extack)
+{
+	struct nhmsg *hdr = nlmsg_data(nlh);
+	bool hdr_dirty;
+
+	hdr_dirty = hdr->nh_protocol || hdr->resvd ||
+		    hdr->nh_scope || hdr->nh_flags;
+	if (hdr_dirty) {
+		NL_SET_ERR_MSG(extack, "Invalid values in header");
+		return -EINVAL;
+	}
+
+	if (!tb[NHA_ID]) {
+		NL_SET_ERR_MSG(extack, "Nexthop id is missing");
+		return -EINVAL;
+	}
+
+	*id = nla_get_u32(tb[NHA_ID]);
+	if (*id == 0) {
+		NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+		return -EINVAL;
+	}
+
+	if (op_flags)
+		*op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0);
+
+	return 0;
+}
+
+/* RTM_DELNEXTHOP handler: look the nexthop up by id under the per-netns RTNL
+ * lock and remove it. Returns -ENOENT if no nexthop has that id.
+ */
+static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
+			   struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_del)];
+	struct net *net = sock_net(skb->sk);
+	struct nl_info nlinfo = {
+		.nlh = nlh,
+		.nl_net = net,
+		.portid = NETLINK_CB(skb).portid,
+	};
+	struct nexthop *nh;
+	u32 id;
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+			  ARRAY_SIZE(rtm_nh_policy_del) - 1, rtm_nh_policy_del,
+			  extack);
+	if (err < 0)
+		return err;
+
+	err = nh_valid_get_del_req(nlh, tb, &id, NULL, extack);
+	if (err)
+		return err;
+
+	rtnl_net_lock(net);
+
+	nh = nexthop_find_by_id(net, id);
+	if (!nh)
+		err = -ENOENT;
+	else
+		remove_nexthop(net, nh, &nlinfo);
+
+	rtnl_net_unlock(net);
+
+	return err;
+}
+
+/* rtnl
+ *
+ * RTM_GETNEXTHOP (doit) handler: fill a reply skb for the requested nexthop
+ * id and unicast it back to the requester.
+ */
+static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+			   struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)];
+	struct net *net = sock_net(in_skb->sk);
+	struct sk_buff *skb = NULL;
+	struct nexthop *nh;
+	u32 op_flags;
+	u32 id;
+	int err;
+
+	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+			  ARRAY_SIZE(rtm_nh_policy_get) - 1, rtm_nh_policy_get,
+			  extack);
+	if (err < 0)
+		return err;
+
+	err = nh_valid_get_del_req(nlh, tb, &id, &op_flags, extack);
+	if (err)
+		return err;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+
+	nh = nexthop_find_by_id(net, id);
+	if (!nh) {
+		err = -ENOENT;
+		goto free_skb;
+	}
+
+	err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid,
+			   nlh->nlmsg_seq, 0, op_flags);
+	if (err < 0) {
+		WARN_ON(err == -EMSGSIZE);
+		goto free_skb;
+	}
+
+	return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+
+free_skb:
+	kfree_skb(skb);
+	return err;
+}
+
+/* Filter criteria parsed from a nexthop / nexthop-bucket dump request. */
+struct nh_dump_filter {
+ u32 nh_id; /* from NHA_ID (bucket dumps); 0 when absent */
+ int dev_idx; /* from NHA_OIF; 0 means no device filter */
+ int master_idx; /* from NHA_MASTER; 0 means no master filter */
+ bool group_filter; /* NHA_GROUPS flag */
+ bool fdb_filter; /* NHA_FDB flag */
+ u32 res_bucket_nh_id; /* from NHA_RES_BUCKET_NH_ID (bucket dumps) */
+ u32 op_flags; /* from NHA_OP_FLAGS; passed to nh_fill_node() */
+};
+
+/* Decide whether @nh must be left out of a dump.
+ *
+ * Returns true when the nexthop does not match @filter / @family and should
+ * be skipped, false when it should be dumped. Groups never match a device,
+ * master or family filter.
+ */
+static bool nh_dump_filtered(struct nexthop *nh,
+			     struct nh_dump_filter *filter, u8 family)
+{
+	const struct net_device *dev;
+	struct net_device *master;
+	const struct nh_info *nhi;
+
+	if (filter->group_filter && !nh->is_group)
+		return true;
+
+	/* no per-nexthop criteria requested */
+	if (!(filter->dev_idx || filter->master_idx || family))
+		return false;
+
+	if (nh->is_group)
+		return true;
+
+	nhi = rtnl_dereference(nh->nh_info);
+	if (family && nhi->family != family)
+		return true;
+
+	dev = nhi->fib_nhc.nhc_dev;
+	if (filter->dev_idx) {
+		if (!dev || dev->ifindex != filter->dev_idx)
+			return true;
+	}
+
+	if (!filter->master_idx)
+		return false;
+
+	if (!dev)
+		return true;
+
+	master = netdev_master_upper_dev_get((struct net_device *)dev);
+
+	return !master || master->ifindex != filter->master_idx;
+}
+
+/* Shared part of dump request validation.
+ *
+ * Copies the device / master indices (which must fit in an int) and the
+ * group and fdb flags into @filter, then checks that the nhmsg header has
+ * protocol, resvd, scope and flags all zero.
+ */
+static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb,
+			       struct nh_dump_filter *filter,
+			       struct netlink_ext_ack *extack)
+{
+	const struct nlattr *oif = tb[NHA_OIF];
+	const struct nlattr *master = tb[NHA_MASTER];
+	struct nhmsg *nhm;
+	u32 ifindex;
+
+	if (oif) {
+		ifindex = nla_get_u32(oif);
+		if (ifindex > INT_MAX) {
+			NL_SET_ERR_MSG(extack, "Invalid device index");
+			return -EINVAL;
+		}
+		filter->dev_idx = ifindex;
+	}
+
+	if (master) {
+		ifindex = nla_get_u32(master);
+		if (ifindex > INT_MAX) {
+			NL_SET_ERR_MSG(extack, "Invalid master device index");
+			return -EINVAL;
+		}
+		filter->master_idx = ifindex;
+	}
+
+	filter->group_filter = nla_get_flag(tb[NHA_GROUPS]);
+	filter->fdb_filter = nla_get_flag(tb[NHA_FDB]);
+
+	nhm = nlmsg_data(nlh);
+	if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) {
+		NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Parse a nexthop dump request against rtm_nh_policy_dump and fill @filter. */
+static int nh_valid_dump_req(const struct nlmsghdr *nlh,
+			     struct nh_dump_filter *filter,
+			     struct netlink_callback *cb)
+{
+	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)];
+	int ret;
+
+	ret = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+			  ARRAY_SIZE(rtm_nh_policy_dump) - 1,
+			  rtm_nh_policy_dump, cb->extack);
+	if (ret < 0)
+		return ret;
+
+	filter->op_flags = nla_get_u32_default(tb[NHA_OP_FLAGS], 0);
+
+	ret = __nh_valid_dump_req(nlh, tb, filter, cb->extack);
+	return ret;
+}
+
+/* Dump continuation state stored in netlink_callback::ctx. */
+struct rtm_dump_nh_ctx {
+ u32 idx; /* id of the nexthop last handed to the dump callback; 0 at start */
+};
+
+/* View the callback's ctx scratch area as a struct rtm_dump_nh_ctx. */
+static struct rtm_dump_nh_ctx *
+rtm_dump_nh_ctx(struct netlink_callback *cb)
+{
+	/* the context must fit in the space netlink gives us */
+	BUILD_BUG_ON(sizeof(struct rtm_dump_nh_ctx) > sizeof(cb->ctx));
+
+	return (void *)cb->ctx;
+}
+
+/* Walk the nexthop rb-tree in id order, calling @nh_cb for each nexthop.
+ *
+ * The walk starts at the left-most node whose id is >= ctx->idx (or at the
+ * first node when ctx->idx is 0). ctx->idx is updated to the id of each
+ * nexthop before the callback runs, so an interrupted dump resumes there.
+ * Stops and returns the callback's value as soon as it is non-zero.
+ */
+static int rtm_dump_walk_nexthops(struct sk_buff *skb,
+				  struct netlink_callback *cb,
+				  struct rb_root *root,
+				  struct rtm_dump_nh_ctx *ctx,
+				  int (*nh_cb)(struct sk_buff *skb,
+					       struct netlink_callback *cb,
+					       struct nexthop *nh, void *data),
+				  void *data)
+{
+	int resume_id = ctx->idx;
+	struct rb_node *node;
+	int err;
+
+	if (resume_id == 0) {
+		node = rb_first(root);
+	} else {
+		struct rb_node *cur;
+
+		/* Continuation: rather than skipping linearly over every
+		 * node with a smaller id, descend the tree (O(log n)) and
+		 * remember the last node seen whose id is >= resume_id.
+		 * Going left after each such candidate leaves us with the
+		 * left-most (smallest id) match.
+		 */
+		node = NULL;
+		for (cur = root->rb_node; cur; ) {
+			struct nexthop *nh;
+
+			nh = rb_entry(cur, struct nexthop, rb_node);
+			if (nh->id < resume_id) {
+				cur = cur->rb_right;
+				continue;
+			}
+
+			node = cur;
+			cur = cur->rb_left;
+		}
+	}
+
+	while (node) {
+		struct nexthop *nh = rb_entry(node, struct nexthop, rb_node);
+
+		ctx->idx = nh->id;
+		err = nh_cb(skb, cb, nh, data);
+		if (err)
+			return err;
+
+		node = rb_next(node);
+	}
+
+	return 0;
+}
+
+/* Per-nexthop dump callback: emit @nh unless the filter in @data rejects it. */
+static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb,
+			       struct nexthop *nh, void *data)
+{
+	struct nh_dump_filter *filter = data;
+	struct nhmsg *nhm = nlmsg_data(cb->nlh);
+	bool skip;
+
+	skip = nh_dump_filtered(nh, filter, nhm->nh_family);
+	if (skip)
+		return 0;
+
+	return nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
+			    NETLINK_CB(cb->skb).portid,
+			    cb->nlh->nlmsg_seq, NLM_F_MULTI, filter->op_flags);
+}
+
+/* rtnl
+ *
+ * RTM_GETNEXTHOP dump handler: validate the request, walk all nexthops of
+ * the netns and record the generation counter for consistency checking.
+ */
+static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct net *net = sock_net(skb->sk);
+	struct nh_dump_filter filter = {};
+	int err;
+
+	err = nh_valid_dump_req(cb->nlh, &filter, cb);
+	if (err < 0)
+		return err;
+
+	err = rtm_dump_walk_nexthops(skb, cb, &net->nexthop.rb_root,
+				     rtm_dump_nh_ctx(cb),
+				     &rtm_dump_nexthop_cb, &filter);
+
+	cb->seq = net->nexthop.seq;
+	nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+	return err;
+}
+
+/* Look up nexthop @id and require it to be a resilient group.
+ *
+ * Returns the nexthop, ERR_PTR(-ENOENT) if the id is unknown, or
+ * ERR_PTR(-EINVAL) if it is not a group or the group is not resilient.
+ */
+static struct nexthop *
+nexthop_find_group_resilient(struct net *net, u32 id,
+			     struct netlink_ext_ack *extack)
+{
+	struct nexthop *nh = nexthop_find_by_id(net, id);
+	struct nh_group *nhg;
+
+	if (!nh)
+		return ERR_PTR(-ENOENT);
+
+	if (!nh->is_group) {
+		NL_SET_ERR_MSG(extack, "Not a nexthop group");
+		return ERR_PTR(-EINVAL);
+	}
+
+	nhg = rtnl_dereference(nh->nh_grp);
+	if (nhg->resilient)
+		return nh;
+
+	NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient");
+	return ERR_PTR(-EINVAL);
+}
+
+/* Read an optional nexthop id attribute for a dump filter.
+ *
+ * *@nh_id_p is set to 0 when @attr is absent. An explicit id of 0 is
+ * rejected with -EINVAL and leaves *@nh_id_p untouched.
+ */
+static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p,
+			      struct netlink_ext_ack *extack)
+{
+	u32 nh_id;
+
+	if (!attr) {
+		*nh_id_p = 0;
+		return 0;
+	}
+
+	nh_id = nla_get_u32(attr);
+	if (nh_id == 0) {
+		NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+		return -EINVAL;
+	}
+
+	*nh_id_p = nh_id;
+	return 0;
+}
+
+/* Parse a nexthop-bucket dump request and fill @filter.
+ *
+ * Besides the common dump attributes handled by __nh_valid_dump_req(), this
+ * accepts NHA_ID (restrict the dump to one group) and a nested
+ * NHA_RES_BUCKET carrying NHA_RES_BUCKET_NH_ID (restrict to buckets that
+ * point at one nexthop). Parse errors are reported through cb->extack.
+ */
+static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh,
+				    struct nh_dump_filter *filter,
+				    struct netlink_callback *cb)
+{
+	struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)];
+	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)];
+	int err;
+
+	/* Pass the extack so that attribute errors reach userspace, as is
+	 * already done for the nested parse below and in nh_valid_dump_req().
+	 */
+	err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+			  ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1,
+			  rtm_nh_policy_dump_bucket, cb->extack);
+	if (err < 0)
+		return err;
+
+	err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack);
+	if (err)
+		return err;
+
+	if (tb[NHA_RES_BUCKET]) {
+		size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1;
+
+		err = nla_parse_nested(res_tb, max,
+				       tb[NHA_RES_BUCKET],
+				       rtm_nh_res_bucket_policy_dump,
+				       cb->extack);
+		if (err < 0)
+			return err;
+
+		err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID],
+					 &filter->res_bucket_nh_id,
+					 cb->extack);
+		if (err)
+			return err;
+	}
+
+	return __nh_valid_dump_req(nlh, tb, filter, cb->extack);
+}
+
+/* Bucket dump continuation state stored in netlink_callback::ctx. */
+struct rtm_dump_res_bucket_ctx {
+ struct rtm_dump_nh_ctx nh; /* position in the nexthop walk */
+ u16 bucket_index; /* bucket to resume from within the current group */
+};
+
+/* View the callback's ctx scratch area as a struct rtm_dump_res_bucket_ctx. */
+static struct rtm_dump_res_bucket_ctx *
+rtm_dump_res_bucket_ctx(struct netlink_callback *cb)
+{
+	/* the context must fit in the space netlink gives us */
+	BUILD_BUG_ON(sizeof(struct rtm_dump_res_bucket_ctx) > sizeof(cb->ctx));
+
+	return (void *)cb->ctx;
+}
+
+/* Argument bundle passed through rtm_dump_walk_nexthops() for bucket dumps. */
+struct rtm_dump_nexthop_bucket_data {
+ struct rtm_dump_res_bucket_ctx *ctx; /* continuation state */
+ struct nh_dump_filter filter; /* parsed request filter */
+};
+
+/* Dump the buckets of one resilient group, starting at the saved index.
+ *
+ * Buckets whose nexthop is rejected by the filter, or does not match the
+ * requested bucket nexthop id, are skipped. The index of each bucket is
+ * saved before filling it, so that a fill error leaves the context pointing
+ * at it; the index is reset to 0 once the whole table was dumped.
+ */
+static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb,
+				      struct netlink_callback *cb,
+				      struct nexthop *nh,
+				      struct rtm_dump_nexthop_bucket_data *dd)
+{
+	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
+	struct nh_res_table *res_table = rtnl_dereference(nhg->res_table);
+	struct nhmsg *nhm = nlmsg_data(cb->nlh);
+	u32 portid = NETLINK_CB(cb->skb).portid;
+	u16 idx;
+	int err;
+
+	for (idx = dd->ctx->bucket_index;
+	     idx < res_table->num_nh_buckets;
+	     idx++) {
+		struct nh_res_bucket *bucket = &res_table->nh_buckets[idx];
+		struct nh_grp_entry *nhge;
+
+		nhge = rtnl_dereference(bucket->nh_entry);
+		if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family) ||
+		    (dd->filter.res_bucket_nh_id &&
+		     dd->filter.res_bucket_nh_id != nhge->nh->id))
+			continue;
+
+		dd->ctx->bucket_index = idx;
+		err = nh_fill_res_bucket(skb, nh, bucket, idx,
+					 RTM_NEWNEXTHOPBUCKET, portid,
+					 cb->nlh->nlmsg_seq, NLM_F_MULTI,
+					 cb->extack);
+		if (err)
+			return err;
+	}
+
+	dd->ctx->bucket_index = 0;
+
+	return 0;
+}
+
+/* Walk callback for bucket dumps: only resilient groups have buckets, every
+ * other nexthop is silently skipped.
+ */
+static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb,
+				      struct netlink_callback *cb,
+				      struct nexthop *nh, void *data)
+{
+	struct nh_group *nhg;
+
+	if (nh->is_group) {
+		nhg = rtnl_dereference(nh->nh_grp);
+		if (nhg->resilient)
+			return rtm_dump_nexthop_bucket_nh(skb, cb, nh, data);
+	}
+
+	return 0;
+}
+
+/* rtnl
+ *
+ * RTM_GETNEXTHOPBUCKET dump handler. With NHA_ID only the buckets of that
+ * resilient group are dumped; otherwise all nexthops are walked.
+ */
+static int rtm_dump_nexthop_bucket(struct sk_buff *skb,
+				   struct netlink_callback *cb)
+{
+	struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb);
+	struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx };
+	struct net *net = sock_net(skb->sk);
+	int err;
+
+	err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb);
+	if (err)
+		return err;
+
+	if (!dd.filter.nh_id) {
+		err = rtm_dump_walk_nexthops(skb, cb, &net->nexthop.rb_root,
+					     &ctx->nh,
+					     &rtm_dump_nexthop_bucket_cb, &dd);
+	} else {
+		struct nexthop *nh;
+
+		nh = nexthop_find_group_resilient(net, dd.filter.nh_id,
+						  cb->extack);
+		if (IS_ERR(nh))
+			return PTR_ERR(nh);
+		err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd);
+	}
+
+	cb->seq = net->nexthop.seq;
+	nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+	return err;
+}
+
+/* Parse the nested NHA_RES_BUCKET of a bucket get request and extract the
+ * mandatory bucket index into *@bucket_index.
+ */
+static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res,
+					      u16 *bucket_index,
+					      struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)];
+	struct nlattr *index_attr;
+	int err;
+
+	err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1,
+			       res, rtm_nh_res_bucket_policy_get, extack);
+	if (err < 0)
+		return err;
+
+	index_attr = tb[NHA_RES_BUCKET_INDEX];
+	if (!index_attr) {
+		NL_SET_ERR_MSG(extack, "Bucket index is missing");
+		return -EINVAL;
+	}
+
+	*bucket_index = nla_get_u16(index_attr);
+	return 0;
+}
+
+/* Validate a bucket get request: common header/id checks plus a mandatory
+ * NHA_RES_BUCKET carrying the bucket index.
+ */
+static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh,
+				   u32 *id, u16 *bucket_index,
+				   struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)];
+	int ret;
+
+	ret = nlmsg_parse(nlh, sizeof(struct nhmsg), tb,
+			  ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1,
+			  rtm_nh_policy_get_bucket, extack);
+	if (ret < 0)
+		return ret;
+
+	ret = nh_valid_get_del_req(nlh, tb, id, NULL, extack);
+	if (ret)
+		return ret;
+
+	if (!tb[NHA_RES_BUCKET]) {
+		NL_SET_ERR_MSG(extack, "Bucket information is missing");
+		return -EINVAL;
+	}
+
+	ret = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET],
+						 bucket_index, extack);
+	return ret ? ret : 0;
+}
+
+/* rtnl
+ *
+ * RTM_GETNEXTHOPBUCKET (doit) handler: report a single bucket of a
+ * resilient group back to the requester.
+ */
+static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+				  struct netlink_ext_ack *extack)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct nh_res_table *res_table;
+	struct sk_buff *skb = NULL;
+	struct nh_group *nhg;
+	struct nexthop *nh;
+	u16 bucket_index;
+	u32 id;
+	int err;
+
+	err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack);
+	if (err)
+		return err;
+
+	nh = nexthop_find_group_resilient(net, id, extack);
+	if (IS_ERR(nh))
+		return PTR_ERR(nh);
+
+	nhg = rtnl_dereference(nh->nh_grp);
+	res_table = rtnl_dereference(nhg->res_table);
+	if (bucket_index >= res_table->num_nh_buckets) {
+		NL_SET_ERR_MSG(extack, "Bucket index out of bounds");
+		return -ENOENT;
+	}
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+
+	err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index],
+				 bucket_index, RTM_NEWNEXTHOPBUCKET,
+				 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
+				 0, extack);
+	if (err >= 0)
+		return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+
+	WARN_ON(err == -EMSGSIZE);
+	kfree_skb(skb);
+	return err;
+}
+
+/* Call fib_nhc_update_mtu() for every IPv4 nexthop in the device hash
+ * that uses @dev, passing the device's current MTU and @orig_mtu.
+ */
+static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu)
+{
+	unsigned int hash = nh_dev_hashfn(dev->ifindex);
+	struct net *net = dev_net(dev);
+	struct hlist_head *head = &net->nexthop.devhash[hash];
+	struct hlist_node *n;
+	struct nh_info *nhi;
+
+	hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
+		/* the hash bucket is shared, skip other devices */
+		if (nhi->fib_nhc.nhc_dev != dev)
+			continue;
+		if (nhi->family != AF_INET)
+			continue;
+
+		fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu, orig_mtu);
+	}
+}
+
+/* rtnl
+ *
+ * Netdevice notifier: flush the nexthops of a device that went down, was
+ * unregistered or lost both IFF_RUNNING and IFF_LOWER_UP, and resync the
+ * MTU on NETDEV_CHANGEMTU. Always returns NOTIFY_DONE.
+ */
+static int nh_netdev_event(struct notifier_block *this,
+			   unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct netdev_notifier_info_ext *info_ext;
+
+	switch (event) {
+	case NETDEV_CHANGE:
+		/* still usable: nothing to flush */
+		if (netif_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))
+			break;
+		fallthrough;
+	case NETDEV_DOWN:
+	case NETDEV_UNREGISTER:
+		nexthop_flush_dev(dev, event);
+		break;
+	case NETDEV_CHANGEMTU:
+		info_ext = ptr;
+		nexthop_sync_mtu(dev, info_ext->ext.mtu);
+		rt_cache_flush(dev_net(dev));
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+/* Netdevice notifier block; registered in nexthop_init(). */
+static struct notifier_block nh_netdev_notifier = {
+ .notifier_call = nh_netdev_event,
+};
+
+/* Replay every nexthop of @net to a single notifier block @nb as an
+ * @event_type event. Stops at, and returns, the first non-zero result.
+ */
+static int nexthops_dump(struct net *net, struct notifier_block *nb,
+			 enum nexthop_event_type event_type,
+			 struct netlink_ext_ack *extack)
+{
+	struct rb_node *node = rb_first(&net->nexthop.rb_root);
+	int err = 0;
+
+	while (node) {
+		struct nexthop *nh = rb_entry(node, struct nexthop, rb_node);
+
+		err = call_nexthop_notifier(nb, net, event_type, nh, extack);
+		if (err)
+			break;
+
+		node = rb_next(node);
+	}
+
+	return err;
+}
+
+/* Register @nb on the netns nexthop notifier chain.
+ *
+ * Under RTNL, all existing nexthops are first replayed to @nb as
+ * NEXTHOP_EVENT_REPLACE; the block is only registered if that succeeds.
+ */
+int register_nexthop_notifier(struct net *net, struct notifier_block *nb,
+			      struct netlink_ext_ack *extack)
+{
+	int err;
+
+	rtnl_lock();
+
+	err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack);
+	if (!err)
+		err = blocking_notifier_chain_register(&net->nexthop.notifier_chain,
+						       nb);
+
+	rtnl_unlock();
+
+	return err;
+}
+EXPORT_SYMBOL(register_nexthop_notifier);
+
+/* Remove @nb from the netns nexthop notifier chain and, if it was
+ * registered, replay all nexthops to it as NEXTHOP_EVENT_DEL. Takes no lock
+ * itself; unregister_nexthop_notifier() is the RTNL-taking wrapper.
+ */
+int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
+{
+	int err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain,
+						     nb);
+
+	if (err)
+		return err;
+
+	nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL);
+	return 0;
+}
+EXPORT_SYMBOL(__unregister_nexthop_notifier);
+
+/* RTNL-taking wrapper around __unregister_nexthop_notifier(). */
+int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb)
+{
+	int ret;
+
+	rtnl_lock();
+	ret = __unregister_nexthop_notifier(net, nb);
+	rtnl_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(unregister_nexthop_notifier);
+
+/* Set or clear RTNH_F_OFFLOAD / RTNH_F_TRAP on nexthop @id according to
+ * @offload and @trap. Does nothing if the id is unknown. Runs inside an RCU
+ * read-side critical section.
+ */
+void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap)
+{
+	struct nexthop *nexthop;
+
+	rcu_read_lock();
+
+	nexthop = nexthop_find_by_id(net, id);
+	if (nexthop) {
+		nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
+		if (offload)
+			nexthop->nh_flags |= RTNH_F_OFFLOAD;
+		if (trap)
+			nexthop->nh_flags |= RTNH_F_TRAP;
+	}
+
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(nexthop_set_hw_flags);
+
+/* Set or clear RTNH_F_OFFLOAD / RTNH_F_TRAP on one bucket of resilient
+ * group @id according to @offload and @trap.
+ *
+ * Silently does nothing if the id is unknown, is not a resilient group, or
+ * @bucket_index is outside the group's bucket table. Runs inside an RCU
+ * read-side critical section.
+ */
+void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
+				 bool offload, bool trap)
+{
+	struct nh_res_table *res_table;
+	struct nh_res_bucket *bucket;
+	struct nexthop *nexthop;
+	struct nh_group *nhg;
+
+	rcu_read_lock();
+
+	nexthop = nexthop_find_by_id(net, id);
+	if (!nexthop || !nexthop->is_group)
+		goto out;
+
+	nhg = rcu_dereference(nexthop->nh_grp);
+	if (!nhg->resilient)
+		goto out;
+
+	/* Fetch the RCU-protected table exactly once, through
+	 * rcu_dereference(), and use that same pointer for both the bounds
+	 * check and the bucket access.
+	 */
+	res_table = rcu_dereference(nhg->res_table);
+	if (bucket_index >= res_table->num_nh_buckets)
+		goto out;
+
+	bucket = &res_table->nh_buckets[bucket_index];
+	bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP);
+	if (offload)
+		bucket->nh_flags |= RTNH_F_OFFLOAD;
+	if (trap)
+		bucket->nh_flags |= RTNH_F_TRAP;
+
+out:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(nexthop_bucket_set_hw_flags);
+
+/* Mark buckets of resilient group @id busy according to the @activity
+ * bitmap: bit i set means bucket i is marked with nh_res_bucket_set_busy().
+ *
+ * Nothing is done if the id is unknown, is not a resilient group, or
+ * @num_buckets differs from the group's bucket count.
+ */
+void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
+				     unsigned long *activity)
+{
+	struct nh_res_table *res_table;
+	struct nexthop *nexthop;
+	struct nh_group *nhg;
+	u16 bucket;
+
+	rcu_read_lock();
+
+	nexthop = nexthop_find_by_id(net, id);
+	if (!nexthop || !nexthop->is_group)
+		goto unlock;
+
+	nhg = rcu_dereference(nexthop->nh_grp);
+	if (!nhg->resilient)
+		goto unlock;
+
+	/* Instead of silently ignoring some buckets, demand that the sizes
+	 * be the same.
+	 */
+	res_table = rcu_dereference(nhg->res_table);
+	if (res_table->num_nh_buckets != num_buckets)
+		goto unlock;
+
+	for (bucket = 0; bucket < num_buckets; bucket++) {
+		if (!test_bit(bucket, activity))
+			continue;
+		nh_res_bucket_set_busy(&res_table->nh_buckets[bucket]);
+	}
+
+unlock:
+	rcu_read_unlock();
+}
+EXPORT_SYMBOL(nexthop_res_grp_activity_update);
+
+/* pernet .exit_rtnl hook: asserts the per-netns RTNL is held and calls
+ * flush_all_nexthops(). @dev_to_kill is not used here.
+ */
+static void __net_exit nexthop_net_exit_rtnl(struct net *net,
+ struct list_head *dev_to_kill)
+{
+ ASSERT_RTNL_NET(net);
+ flush_all_nexthops(net);
+}
+
+/* pernet .exit hook: free the device hash allocated in nexthop_net_init(). */
+static void __net_exit nexthop_net_exit(struct net *net)
+{
+ kfree(net->nexthop.devhash);
+ /* clear the pointer so no stale reference to the freed table remains */
+ net->nexthop.devhash = NULL;
+}
+
+/* pernet .init hook: start with an empty rb-tree, allocate the zeroed device
+ * hash (NH_DEV_HASHSIZE heads) and initialise the notifier chain.
+ * Returns -ENOMEM if the hash cannot be allocated.
+ */
+static int __net_init nexthop_net_init(struct net *net)
+{
+	struct hlist_head *devhash;
+
+	net->nexthop.rb_root = RB_ROOT;
+
+	devhash = kzalloc(sizeof(struct hlist_head) * NH_DEV_HASHSIZE,
+			  GFP_KERNEL);
+	net->nexthop.devhash = devhash;
+	if (!devhash)
+		return -ENOMEM;
+
+	BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain);
+
+	return 0;
+}
+
+/* Per-netns lifecycle hooks; registered in nexthop_init(). */
+static struct pernet_operations nexthop_net_ops = {
+ .init = nexthop_net_init,
+ .exit = nexthop_net_exit,
+ .exit_rtnl = nexthop_net_exit_rtnl,
+};
+
+/* rtnetlink handlers registered in nexthop_init().
+ *
+ * The first four entries leave .protocol unset; the remaining ones repeat
+ * the new/dump handlers for PF_INET and PF_INET6. The new and delete
+ * handlers carry RTNL_FLAG_DOIT_PERNET and take rtnl_net_lock() themselves.
+ */
+static const struct rtnl_msg_handler nexthop_rtnl_msg_handlers[] __initconst = {
+ {.msgtype = RTM_NEWNEXTHOP, .doit = rtm_new_nexthop,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.msgtype = RTM_DELNEXTHOP, .doit = rtm_del_nexthop,
+ .flags = RTNL_FLAG_DOIT_PERNET},
+ {.msgtype = RTM_GETNEXTHOP, .doit = rtm_get_nexthop,
+ .dumpit = rtm_dump_nexthop},
+ {.msgtype = RTM_GETNEXTHOPBUCKET, .doit = rtm_get_nexthop_bucket,
+ .dumpit = rtm_dump_nexthop_bucket},
+ {.protocol = PF_INET, .msgtype = RTM_NEWNEXTHOP,
+ .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET},
+ {.protocol = PF_INET, .msgtype = RTM_GETNEXTHOP,
+ .dumpit = rtm_dump_nexthop},
+ {.protocol = PF_INET6, .msgtype = RTM_NEWNEXTHOP,
+ .doit = rtm_new_nexthop, .flags = RTNL_FLAG_DOIT_PERNET},
+ {.protocol = PF_INET6, .msgtype = RTM_GETNEXTHOP,
+ .dumpit = rtm_dump_nexthop},
+};
+
+/* Subsystem init: register the per-netns operations, the netdevice notifier
+ * and the rtnetlink message handlers.
+ *
+ * NOTE(review): the return values of the three registration calls are not
+ * checked here and 0 is returned unconditionally - confirm this is intended.
+ */
+static int __init nexthop_init(void)
+{
+ register_pernet_subsys(&nexthop_net_ops);
+
+ register_netdevice_notifier(&nh_netdev_notifier);
+
+ rtnl_register_many(nexthop_rtnl_msg_handlers);
+
+ return 0;
+}
+subsys_initcall(nexthop_init);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 8d7aaf118a30..ad56588107cc 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,11 +6,6 @@
*
* "Ping" sockets
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Based on ipv4/udp.c code.
*
* Authors: Vasiliy Kulikov / Openwall (for Linux 2.6),
@@ -17,7 +13,6 @@
*
* Pavel gave all rights to bugs to Vasiliy,
* none of the bugs are Pavel's now.
- *
*/
#include <linux/uaccess.h>
@@ -38,6 +33,7 @@
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/export.h>
+#include <linux/bpf-cgroup.h>
#include <net/sock.h>
#include <net/ping.h>
#include <net/udp.h>
@@ -54,15 +50,13 @@
#endif
struct ping_table {
- struct hlist_nulls_head hash[PING_HTABLE_SIZE];
- rwlock_t lock;
+ struct hlist_head hash[PING_HTABLE_SIZE];
+ spinlock_t lock;
};
static struct ping_table ping_table;
struct pingv6_ops pingv6_ops;
-EXPORT_SYMBOL_GPL(pingv6_ops);
-
-static u16 ping_port_rover;
+EXPORT_IPV6_MOD_GPL(pingv6_ops);
static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask)
{
@@ -71,33 +65,33 @@ static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask)
pr_debug("hash(%u) = %u\n", num, res);
return res;
}
-EXPORT_SYMBOL_GPL(ping_hash);
-static inline struct hlist_nulls_head *ping_hashslot(struct ping_table *table,
- struct net *net, unsigned int num)
+static inline struct hlist_head *ping_hashslot(struct ping_table *table,
+ struct net *net, unsigned int num)
{
return &table->hash[ping_hashfn(net, num, PING_HTABLE_MASK)];
}
int ping_get_port(struct sock *sk, unsigned short ident)
{
- struct hlist_nulls_node *node;
- struct hlist_nulls_head *hlist;
+ struct net *net = sock_net(sk);
struct inet_sock *isk, *isk2;
+ struct hlist_head *hlist;
struct sock *sk2 = NULL;
isk = inet_sk(sk);
- write_lock_bh(&ping_table.lock);
+ spin_lock(&ping_table.lock);
if (ident == 0) {
+ u16 result = net->ipv4.ping_port_rover + 1;
u32 i;
- u16 result = ping_port_rover + 1;
for (i = 0; i < (1L << 16); i++, result++) {
if (!result)
- result++; /* avoid zero */
- hlist = ping_hashslot(&ping_table, sock_net(sk),
- result);
- ping_portaddr_for_each_entry(sk2, node, hlist) {
+ continue; /* avoid zero */
+ hlist = ping_hashslot(&ping_table, net, result);
+ sk_for_each(sk2, hlist) {
+ if (!net_eq(sock_net(sk2), net))
+ continue;
isk2 = inet_sk(sk2);
if (isk2->inet_num == result)
@@ -105,7 +99,7 @@ int ping_get_port(struct sock *sk, unsigned short ident)
}
/* found */
- ping_port_rover = ident = result;
+ net->ipv4.ping_port_rover = ident = result;
break;
next_port:
;
@@ -113,8 +107,10 @@ next_port:
if (i >= (1L << 16))
goto fail;
} else {
- hlist = ping_hashslot(&ping_table, sock_net(sk), ident);
- ping_portaddr_for_each_entry(sk2, node, hlist) {
+ hlist = ping_hashslot(&ping_table, net, ident);
+ sk_for_each(sk2, hlist) {
+ if (!net_eq(sock_net(sk2), net))
+ continue;
isk2 = inet_sk(sk2);
/* BUG? Why is this reuse and not reuseaddr? ping.c
@@ -132,66 +128,61 @@ next_port:
isk->inet_num = ident;
if (sk_unhashed(sk)) {
pr_debug("was not hashed\n");
- sock_hold(sk);
- hlist_nulls_add_head(&sk->sk_nulls_node, hlist);
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ sk_add_node_rcu(sk, hlist);
+ sock_set_flag(sk, SOCK_RCU_FREE);
+ sock_prot_inuse_add(net, sk->sk_prot, 1);
}
- write_unlock_bh(&ping_table.lock);
+ spin_unlock(&ping_table.lock);
return 0;
fail:
- write_unlock_bh(&ping_table.lock);
- return 1;
-}
-EXPORT_SYMBOL_GPL(ping_get_port);
-
-int ping_hash(struct sock *sk)
-{
- pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
- BUG(); /* "Please do not press this button again." */
-
- return 0;
+ spin_unlock(&ping_table.lock);
+ return -EADDRINUSE;
}
+EXPORT_IPV6_MOD_GPL(ping_get_port);
void ping_unhash(struct sock *sk)
{
struct inet_sock *isk = inet_sk(sk);
pr_debug("ping_unhash(isk=%p,isk->num=%u)\n", isk, isk->inet_num);
- write_lock_bh(&ping_table.lock);
- if (sk_hashed(sk)) {
- hlist_nulls_del(&sk->sk_nulls_node);
- sk_nulls_node_init(&sk->sk_nulls_node);
- sock_put(sk);
+ spin_lock(&ping_table.lock);
+ if (sk_del_node_init_rcu(sk)) {
isk->inet_num = 0;
isk->inet_sport = 0;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}
- write_unlock_bh(&ping_table.lock);
+ spin_unlock(&ping_table.lock);
}
-EXPORT_SYMBOL_GPL(ping_unhash);
+EXPORT_IPV6_MOD_GPL(ping_unhash);
+/* Called under rcu_read_lock() */
static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
{
- struct hlist_nulls_head *hslot = ping_hashslot(&ping_table, net, ident);
+ struct hlist_head *hslot = ping_hashslot(&ping_table, net, ident);
struct sock *sk = NULL;
struct inet_sock *isk;
- struct hlist_nulls_node *hnode;
- int dif = skb->dev->ifindex;
+ int dif, sdif;
if (skb->protocol == htons(ETH_P_IP)) {
+ dif = inet_iif(skb);
+ sdif = inet_sdif(skb);
pr_debug("try to find: num = %d, daddr = %pI4, dif = %d\n",
(int)ident, &ip_hdr(skb)->daddr, dif);
#if IS_ENABLED(CONFIG_IPV6)
} else if (skb->protocol == htons(ETH_P_IPV6)) {
+ dif = inet6_iif(skb);
+ sdif = inet6_sdif(skb);
pr_debug("try to find: num = %d, daddr = %pI6c, dif = %d\n",
(int)ident, &ipv6_hdr(skb)->daddr, dif);
#endif
+ } else {
+ return NULL;
}
- read_lock_bh(&ping_table.lock);
-
- ping_portaddr_for_each_entry(sk, hnode, hslot) {
+ sk_for_each_rcu(sk, hslot) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
isk = inet_sk(sk);
pr_debug("iterate\n");
@@ -225,16 +216,15 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
continue;
}
- if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)
+ if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
+ sk->sk_bound_dev_if != sdif)
continue;
- sock_hold(sk);
goto exit;
}
sk = NULL;
exit:
- read_unlock_bh(&ping_table.lock);
return sk;
}
@@ -284,7 +274,7 @@ out_release_group:
put_group_info(group_info);
return ret;
}
-EXPORT_SYMBOL_GPL(ping_init_sock);
+EXPORT_IPV6_MOD_GPL(ping_init_sock);
void ping_close(struct sock *sk, long timeout)
{
@@ -294,14 +284,29 @@ void ping_close(struct sock *sk, long timeout)
sk_common_release(sk);
}
-EXPORT_SYMBOL_GPL(ping_close);
+EXPORT_IPV6_MOD_GPL(ping_close);
+
+static int ping_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
+{
+ /* This check is replicated from __ip4_datagram_connect() and
+ * intended to prevent BPF program called below from accessing bytes
+ * that are out of the bound specified by user in addr_len.
+ */
+ if (addr_len < sizeof(struct sockaddr_in))
+ return -EINVAL;
+
+ return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
+}
/* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */
static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
- struct sockaddr *uaddr, int addr_len) {
+ struct sockaddr_unsized *uaddr, int addr_len)
+{
struct net *net = sock_net(sk);
if (sk->sk_family == AF_INET) {
struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ u32 tb_id = RT_TABLE_LOCAL;
int chk_addr_ret;
if (addr_len < sizeof(*addr))
@@ -315,15 +320,16 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n",
sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port));
- chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr);
-
if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
- chk_addr_ret = RTN_LOCAL;
+ return 0;
- if ((!inet_can_nonlocal_bind(net, isk) &&
- chk_addr_ret != RTN_LOCAL) ||
- chk_addr_ret == RTN_MULTICAST ||
- chk_addr_ret == RTN_BROADCAST)
+ tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
+ chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
+
+ if (chk_addr_ret == RTN_MULTICAST ||
+ chk_addr_ret == RTN_BROADCAST ||
+ (chk_addr_ret != RTN_LOCAL &&
+ !inet_can_nonlocal_bind(net, isk)))
return -EADDRNOTAVAIL;
#if IS_ENABLED(CONFIG_IPV6)
@@ -356,6 +362,14 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
return -ENODEV;
}
}
+
+ if (!dev && sk->sk_bound_dev_if) {
+ dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+ }
has_addr = pingv6_ops.ipv6_chk_addr(net, &addr->sin6_addr, dev,
scoped);
rcu_read_unlock();
@@ -373,7 +387,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
return 0;
}
-static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)
+static void ping_set_saddr(struct sock *sk, struct sockaddr_unsized *saddr)
{
if (saddr->sa_family == AF_INET) {
struct inet_sock *isk = inet_sk(sk);
@@ -388,26 +402,12 @@ static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr)
}
}
-static void ping_clear_saddr(struct sock *sk, int dif)
-{
- sk->sk_bound_dev_if = dif;
- if (sk->sk_family == AF_INET) {
- struct inet_sock *isk = inet_sk(sk);
- isk->inet_rcv_saddr = isk->inet_saddr = 0;
-#if IS_ENABLED(CONFIG_IPV6)
- } else if (sk->sk_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
- memset(&sk->sk_v6_rcv_saddr, 0, sizeof(sk->sk_v6_rcv_saddr));
- memset(&np->saddr, 0, sizeof(np->saddr));
-#endif
- }
-}
/*
* We need our own bind because there are no privileged id's == local ports.
* Moreover, we don't allow binding to multi- and broadcast addresses.
*/
-int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int ping_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
struct inet_sock *isk = inet_sk(sk);
unsigned short snum;
@@ -425,12 +425,13 @@ int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
goto out;
err = -EADDRINUSE;
- ping_set_saddr(sk, uaddr);
snum = ntohs(((struct sockaddr_in *)uaddr)->sin_port);
if (ping_get_port(sk, snum) != 0) {
- ping_clear_saddr(sk, dif);
+ /* Restore possibly modified sk->sk_bound_dev_if by ping_check_bind_addr(). */
+ sk->sk_bound_dev_if = dif;
goto out;
}
+ ping_set_saddr(sk, uaddr);
pr_debug("after bind(): num = %hu, dif = %d\n",
isk->inet_num,
@@ -461,7 +462,7 @@ out:
pr_debug("ping_v4_bind -> %d\n", err);
return err;
}
-EXPORT_SYMBOL_GPL(ping_bind);
+EXPORT_IPV6_MOD_GPL(ping_bind);
/*
* Is this a supported type of ICMP message?
@@ -470,7 +471,9 @@ EXPORT_SYMBOL_GPL(ping_bind);
static inline int ping_supported(int family, int type, int code)
{
return (family == AF_INET && type == ICMP_ECHO && code == 0) ||
- (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0);
+ (family == AF_INET && type == ICMP_EXT_ECHO && code == 0) ||
+ (family == AF_INET6 && type == ICMPV6_ECHO_REQUEST && code == 0) ||
+ (family == AF_INET6 && type == ICMPV6_EXT_ECHO_REQUEST && code == 0);
}
/*
@@ -543,7 +546,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
case ICMP_DEST_UNREACH:
if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
ipv4_sk_update_pmtu(skb, sk, info);
- if (inet_sock->pmtudisc != IP_PMTUDISC_DONT) {
+ if (READ_ONCE(inet_sock->pmtudisc) != IP_PMTUDISC_DONT) {
err = EMSGSIZE;
harderr = 1;
break;
@@ -572,8 +575,8 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
* RFC1122: OK. Passes ICMP errors back to application, as per
* 4.1.3.3.
*/
- if ((family == AF_INET && !inet_sock->recverr) ||
- (family == AF_INET6 && !inet6_sk(sk)->recverr)) {
+ if ((family == AF_INET && !inet_test_bit(RECVERR, sk)) ||
+ (family == AF_INET6 && !inet6_test_bit(RECVERR6, sk))) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
goto out;
} else {
@@ -588,11 +591,11 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
}
}
sk->sk_err = err;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
out:
- sock_put(sk);
+ return;
}
-EXPORT_SYMBOL_GPL(ping_err);
+EXPORT_IPV6_MOD_GPL(ping_err);
/*
* Copy and checksum an ICMP Echo packet from user space into a buffer
@@ -602,23 +605,11 @@ EXPORT_SYMBOL_GPL(ping_err);
int ping_getfrag(void *from, char *to,
int offset, int fraglen, int odd, struct sk_buff *skb)
{
- struct pingfakehdr *pfh = (struct pingfakehdr *)from;
-
- if (offset == 0) {
- fraglen -= sizeof(struct icmphdr);
- if (fraglen < 0)
- BUG();
- if (!csum_and_copy_from_iter_full(to + sizeof(struct icmphdr),
- fraglen, &pfh->wcheck,
- &pfh->msg->msg_iter))
- return -EFAULT;
- } else if (offset < sizeof(struct icmphdr)) {
- BUG();
- } else {
- if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck,
- &pfh->msg->msg_iter))
- return -EFAULT;
- }
+ struct pingfakehdr *pfh = from;
+
+ if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck,
+ &pfh->msg->msg_iter))
+ return -EFAULT;
#if IS_ENABLED(CONFIG_IPV6)
/* For IPv6, checksum each skb as we go along, as expected by
@@ -626,7 +617,7 @@ int ping_getfrag(void *from, char *to,
* wcheck, it will be finalized in ping_v4_push_pending_frames.
*/
if (pfh->family == AF_INET6) {
- skb->csum = pfh->wcheck;
+ skb->csum = csum_block_add(skb->csum, pfh->wcheck, odd);
skb->ip_summed = CHECKSUM_NONE;
pfh->wcheck = 0;
}
@@ -634,7 +625,7 @@ int ping_getfrag(void *from, char *to,
return 0;
}
-EXPORT_SYMBOL_GPL(ping_getfrag);
+EXPORT_IPV6_MOD_GPL(ping_getfrag);
static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
struct flowi4 *fl4)
@@ -652,7 +643,8 @@ static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
}
int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
- void *user_icmph, size_t icmph_len) {
+ void *user_icmph, size_t icmph_len)
+{
u8 type, code;
if (len > 0xFFFF)
@@ -694,7 +686,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
return 0;
}
-EXPORT_SYMBOL_GPL(ping_common_sendmsg);
+EXPORT_IPV6_MOD_GPL(ping_common_sendmsg);
static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
@@ -708,7 +700,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct ip_options_data opt_copy;
int free = 0;
__be32 saddr, daddr, faddr;
- u8 tos;
+ u8 scope;
int err;
pr_debug("ping_v4_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
@@ -771,27 +763,25 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
faddr = ipc.opt->opt.faddr;
}
- tos = get_rttos(&ipc, inet);
- if (sock_flag(sk, SOCK_LOCALROUTE) ||
- (msg->msg_flags & MSG_DONTROUTE) ||
- (ipc.opt && ipc.opt->opt.is_strictroute)) {
- tos |= RTO_ONLINK;
- }
+ scope = ip_sendmsg_scope(inet, &ipc, msg);
if (ipv4_is_multicast(daddr)) {
- if (!ipc.oif)
- ipc.oif = inet->mc_index;
+ if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
+ ipc.oif = READ_ONCE(inet->mc_index);
if (!saddr)
- saddr = inet->mc_addr;
+ saddr = READ_ONCE(inet->mc_addr);
} else if (!ipc.oif)
- ipc.oif = inet->uc_index;
+ ipc.oif = READ_ONCE(inet->uc_index);
- flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
- RT_SCOPE_UNIVERSE, sk->sk_protocol,
- inet_sk_flowi_flags(sk), faddr, saddr, 0, 0,
- sk->sk_uid);
+ flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark,
+ ipc.tos & INET_DSCP_MASK, scope,
+ sk->sk_protocol, inet_sk_flowi_flags(sk), faddr,
+ saddr, 0, 0, sk_uid(sk));
- security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+ fl4.fl4_icmp_type = user_icmph.type;
+ fl4.fl4_icmp_code = user_icmph.code;
+
+ security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
@@ -825,7 +815,8 @@ back_from_confirm:
pfh.family = AF_INET;
err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len,
- 0, &ipc, &rt, msg->msg_flags);
+ sizeof(struct icmphdr), &ipc, &rt,
+ msg->msg_flags);
if (err)
ip_flush_pending_frames(sk);
else
@@ -852,8 +843,8 @@ do_confirm:
goto out;
}
-int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
- int flags, int *addr_len)
+int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ int *addr_len)
{
struct inet_sock *isk = inet_sk(sk);
int family = sk->sk_family;
@@ -869,7 +860,7 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
if (flags & MSG_ERRQUEUE)
return inet_recv_error(sk, msg, len, addr_len);
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto out;
@@ -898,12 +889,11 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
*addr_len = sizeof(*sin);
}
- if (isk->cmsg_flags)
+ if (inet_cmsg_flags(isk))
ip_cmsg_recv(msg, skb);
#if IS_ENABLED(CONFIG_IPV6)
} else if (family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
struct ipv6hdr *ip6 = ipv6_hdr(skb);
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
@@ -912,7 +902,7 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
sin6->sin6_port = 0;
sin6->sin6_addr = ip6->saddr;
sin6->sin6_flowinfo = 0;
- if (np->sndflow)
+ if (inet6_test_bit(SNDFLOW, sk))
sin6->sin6_flowinfo = ip6_flowinfo(ip6);
sin6->sin6_scope_id =
ipv6_iface_scope_id(&sin6->sin6_addr,
@@ -925,7 +915,8 @@ int ping_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
if (skb->protocol == htons(ETH_P_IPV6) &&
inet6_sk(sk)->rxopt.all)
pingv6_ops.ip6_datagram_recv_specific_ctl(sk, msg, skb);
- else if (skb->protocol == htons(ETH_P_IP) && isk->cmsg_flags)
+ else if (skb->protocol == htons(ETH_P_IP) &&
+ inet_cmsg_flags(isk))
ip_cmsg_recv(msg, skb);
#endif
} else {
@@ -940,31 +931,39 @@ out:
pr_debug("ping_recvmsg -> %d\n", err);
return err;
}
-EXPORT_SYMBOL_GPL(ping_recvmsg);
+EXPORT_IPV6_MOD_GPL(ping_recvmsg);
-int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk,
+ struct sk_buff *skb)
{
+ enum skb_drop_reason reason;
+
pr_debug("ping_queue_rcv_skb(sk=%p,sk->num=%d,skb=%p)\n",
inet_sk(sk), inet_sk(sk)->inet_num, skb);
- if (sock_queue_rcv_skb(sk, skb) < 0) {
- kfree_skb(skb);
+ if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
+ sk_skb_reason_drop(sk, skb, reason);
pr_debug("ping_queue_rcv_skb -> failed\n");
- return -1;
+ return reason;
}
- return 0;
+ return SKB_NOT_DROPPED_YET;
+}
+
+int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ return __ping_queue_rcv_skb(sk, skb) ? -1 : 0;
}
-EXPORT_SYMBOL_GPL(ping_queue_rcv_skb);
+EXPORT_IPV6_MOD_GPL(ping_queue_rcv_skb);
/*
* All we need to do is get the socket.
*/
-bool ping_rcv(struct sk_buff *skb)
+enum skb_drop_reason ping_rcv(struct sk_buff *skb)
{
- struct sock *sk;
struct net *net = dev_net(skb->dev);
struct icmphdr *icmph = icmp_hdr(skb);
+ struct sock *sk;
/* We assume the packet has already been checked by icmp_rcv */
@@ -975,26 +974,20 @@ bool ping_rcv(struct sk_buff *skb)
skb_push(skb, skb->data - (u8 *)icmph);
sk = ping_lookup(net, skb, ntohs(icmph->un.echo.id));
- if (sk) {
- struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
-
- pr_debug("rcv on socket %p\n", sk);
- if (skb2)
- ping_queue_rcv_skb(sk, skb2);
- sock_put(sk);
- return true;
- }
- pr_debug("no socket, dropping\n");
+ if (sk)
+ return __ping_queue_rcv_skb(sk, skb);
- return false;
+ kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET);
+ return SKB_DROP_REASON_NO_SOCKET;
}
-EXPORT_SYMBOL_GPL(ping_rcv);
+EXPORT_IPV6_MOD_GPL(ping_rcv);
struct proto ping_prot = {
.name = "PING",
.owner = THIS_MODULE,
.init = ping_init_sock,
.close = ping_close,
+ .pre_connect = ping_pre_connect,
.connect = ip4_datagram_connect,
.disconnect = __udp_disconnect,
.setsockopt = ip_setsockopt,
@@ -1004,12 +997,12 @@ struct proto ping_prot = {
.bind = ping_bind,
.backlog_rcv = ping_queue_rcv_skb,
.release_cb = ip4_datagram_release_cb,
- .hash = ping_hash,
.unhash = ping_unhash,
.get_port = ping_get_port,
+ .put_port = ping_unhash,
.obj_size = sizeof(struct inet_sock),
};
-EXPORT_SYMBOL(ping_prot);
+EXPORT_IPV6_MOD(ping_prot);
#ifdef CONFIG_PROC_FS
@@ -1021,15 +1014,14 @@ static struct sock *ping_get_first(struct seq_file *seq, int start)
for (state->bucket = start; state->bucket < PING_HTABLE_SIZE;
++state->bucket) {
- struct hlist_nulls_node *node;
- struct hlist_nulls_head *hslot;
+ struct hlist_head *hslot;
hslot = &ping_table.hash[state->bucket];
- if (hlist_nulls_empty(hslot))
+ if (hlist_empty(hslot))
continue;
- sk_nulls_for_each(sk, node, hslot) {
+ sk_for_each(sk, hslot) {
if (net_eq(sock_net(sk), net) &&
sk->sk_family == state->family)
goto found;
@@ -1046,7 +1038,7 @@ static struct sock *ping_get_next(struct seq_file *seq, struct sock *sk)
struct net *net = seq_file_net(seq);
do {
- sk = sk_nulls_next(sk);
+ sk = sk_next(sk);
} while (sk && (!net_eq(sock_net(sk), net)));
if (!sk)
@@ -1071,11 +1063,11 @@ void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family)
state->bucket = 0;
state->family = family;
- read_lock_bh(&ping_table.lock);
+ spin_lock(&ping_table.lock);
return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
}
-EXPORT_SYMBOL_GPL(ping_seq_start);
+EXPORT_IPV6_MOD_GPL(ping_seq_start);
static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos)
{
@@ -1094,14 +1086,14 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
return sk;
}
-EXPORT_SYMBOL_GPL(ping_seq_next);
+EXPORT_IPV6_MOD_GPL(ping_seq_next);
void ping_seq_stop(struct seq_file *seq, void *v)
__releases(ping_table.lock)
{
- read_unlock_bh(&ping_table.lock);
+ spin_unlock(&ping_table.lock);
}
-EXPORT_SYMBOL_GPL(ping_seq_stop);
+EXPORT_IPV6_MOD_GPL(ping_seq_stop);
static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
int bucket)
@@ -1113,15 +1105,15 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
__u16 srcp = ntohs(inet->inet_sport);
seq_printf(f, "%5d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u",
bucket, src, srcp, dest, destp, sp->sk_state,
sk_wmem_alloc_get(sp),
sk_rmem_alloc_get(sp),
0, 0L, 0,
- from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+ from_kuid_munged(seq_user_ns(f), sk_uid(sp)),
0, sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
- atomic_read(&sp->sk_drops));
+ sk_drops_read(sp));
}
static int ping_v4_seq_show(struct seq_file *seq, void *v)
@@ -1152,6 +1144,8 @@ static int __net_init ping_v4_proc_init_net(struct net *net)
if (!proc_create_net("icmp", 0444, net->proc_net, &ping_v4_seq_ops,
sizeof(struct ping_iter_state)))
return -ENOMEM;
+
+ net->ipv4.ping_port_rover = get_random_u16();
return 0;
}
@@ -1182,6 +1176,6 @@ void __init ping_init(void)
int i;
for (i = 0; i < PING_HTABLE_SIZE; i++)
- INIT_HLIST_NULLS_HEAD(&ping_table.hash[i], i);
- rwlock_init(&ping_table.lock);
+ INIT_HLIST_HEAD(&ping_table.hash[i]);
+ spin_lock_init(&ping_table.lock);
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 70289682a670..974afc4ecbe2 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -25,17 +26,14 @@
* split functions for more readibility.
* Andi Kleen : Add support for /proc/net/netstat
* Arnaldo C. Melo : Convert to seq_file
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/types.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/tcp.h>
+#include <net/mptcp.h>
+#include <net/proto_memory.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <linux/bottom_half.h>
@@ -46,7 +44,7 @@
#include <net/sock.h>
#include <net/raw.h>
-#define TCPUDP_MIB_MAX max_t(u32, UDP_MIB_MAX, TCP_MIB_MAX)
+#define TCPUDP_MIB_MAX MAX_T(u32, UDP_MIB_MAX, TCP_MIB_MAX)
/*
* Report socket allocation statistics [mea@utu.fi]
@@ -56,14 +54,14 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
struct net *net = seq->private;
int orphans, sockets;
- orphans = percpu_counter_sum_positive(&tcp_orphan_count);
+ orphans = tcp_orphan_count_sum();
sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
sock_prot_inuse_get(net, &tcp_prot), orphans,
- atomic_read(&net->ipv4.tcp_death_row.tw_count), sockets,
- proto_memory_allocated(&tcp_prot));
+ refcount_read(&net->ipv4.tcp_death_row.tw_refcount) - 1,
+ sockets, proto_memory_allocated(&tcp_prot));
seq_printf(seq, "UDP: inuse %d mem %ld\n",
sock_prot_inuse_get(net, &udp_prot),
proto_memory_allocated(&udp_prot));
@@ -72,8 +70,8 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "RAW: inuse %d\n",
sock_prot_inuse_get(net, &raw_prot));
seq_printf(seq, "FRAG: inuse %u memory %lu\n",
- atomic_read(&net->ipv4.frags.rhashtable.nelems),
- frag_mem_limit(&net->ipv4.frags));
+ atomic_read(&net->ipv4.fqdir->rhashtable.nelems),
+ frag_mem_limit(net->ipv4.fqdir));
return 0;
}
@@ -86,7 +84,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = {
SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS),
SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS),
SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS),
- SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTPKTS),
+ SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTREQUESTS),
SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS),
SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
@@ -96,7 +94,7 @@ static const struct snmp_mib snmp4_ipstats_list[] = {
SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS),
SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS),
SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES),
- SNMP_MIB_SENTINEL
+ SNMP_MIB_ITEM("OutTransmits", IPSTATS_MIB_OUTPKTS),
};
/* Following items are displayed in /proc/net/netstat */
@@ -120,7 +118,6 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
- SNMP_MIB_SENTINEL
};
static const struct {
@@ -158,7 +155,6 @@ static const struct snmp_mib snmp4_tcp_list[] = {
SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS),
- SNMP_MIB_SENTINEL
};
static const struct snmp_mib snmp4_udp_list[] = {
@@ -170,7 +166,7 @@ static const struct snmp_mib snmp4_udp_list[] = {
SNMP_MIB_ITEM("SndbufErrors", UDP_MIB_SNDBUFERRORS),
SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS),
SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI),
- SNMP_MIB_SENTINEL
+ SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS),
};
static const struct snmp_mib snmp4_net_list[] = {
@@ -189,6 +185,10 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
+ SNMP_MIB_ITEM("BeyondWindow", LINUX_MIB_BEYOND_WINDOW),
+ SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED),
+ SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK),
+ SNMP_MIB_ITEM("PAWSTimewait", LINUX_MIB_PAWS_TW_REJECTED),
SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED),
SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
@@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
+ SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),
SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
@@ -290,7 +291,20 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPAckCompressed", LINUX_MIB_TCPACKCOMPRESSED),
SNMP_MIB_ITEM("TCPZeroWindowDrop", LINUX_MIB_TCPZEROWINDOWDROP),
SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP),
- SNMP_MIB_SENTINEL
+ SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG),
+ SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY),
+ SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH),
+ SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH),
+ SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS),
+ SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS),
+ SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS),
+ SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE),
+ SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH),
+ SNMP_MIB_ITEM("TCPAORequired", LINUX_MIB_TCPAOREQUIRED),
+ SNMP_MIB_ITEM("TCPAOBad", LINUX_MIB_TCPAOBAD),
+ SNMP_MIB_ITEM("TCPAOKeyNotFound", LINUX_MIB_TCPAOKEYNOTFOUND),
+ SNMP_MIB_ITEM("TCPAOGood", LINUX_MIB_TCPAOGOOD),
+ SNMP_MIB_ITEM("TCPAODroppedIcmps", LINUX_MIB_TCPAODROPPEDICMPS),
};
static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
@@ -345,7 +359,7 @@ static void icmp_put(struct seq_file *seq)
seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors");
for (i = 0; icmpmibmap[i].name; i++)
seq_printf(seq, " In%s", icmpmibmap[i].name);
- seq_puts(seq, " OutMsgs OutErrors");
+ seq_puts(seq, " OutMsgs OutErrors OutRateLimitGlobal OutRateLimitHost");
for (i = 0; icmpmibmap[i].name; i++)
seq_printf(seq, " Out%s", icmpmibmap[i].name);
seq_printf(seq, "\nIcmp: %lu %lu %lu",
@@ -355,9 +369,11 @@ static void icmp_put(struct seq_file *seq)
for (i = 0; icmpmibmap[i].name; i++)
seq_printf(seq, " %lu",
atomic_long_read(ptr + icmpmibmap[i].index));
- seq_printf(seq, " %lu %lu",
+ seq_printf(seq, " %lu %lu %lu %lu",
snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
- snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITGLOBAL),
+ snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITHOST));
for (i = 0; icmpmibmap[i].name; i++)
seq_printf(seq, " %lu",
atomic_long_read(ptr + (icmpmibmap[i].index | 0x100)));
@@ -368,25 +384,26 @@ static void icmp_put(struct seq_file *seq)
*/
static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
{
+ const int cnt = ARRAY_SIZE(snmp4_ipstats_list);
+ u64 buff64[ARRAY_SIZE(snmp4_ipstats_list)];
struct net *net = seq->private;
- u64 buff64[IPSTATS_MIB_MAX];
int i;
- memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64));
+ memset(buff64, 0, sizeof(buff64));
seq_puts(seq, "Ip: Forwarding DefaultTTL");
- for (i = 0; snmp4_ipstats_list[i].name; i++)
+ for (i = 0; i < cnt; i++)
seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
seq_printf(seq, "\nIp: %d %d",
- IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
- net->ipv4.sysctl_ip_default_ttl);
+ IPV4_DEVCONF_ALL_RO(net, FORWARDING) ? 1 : 2,
+ READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
- snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list,
- net->mib.ip_statistics,
- offsetof(struct ipstats_mib, syncp));
- for (i = 0; snmp4_ipstats_list[i].name; i++)
+ snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipstats_list, cnt,
+ net->mib.ip_statistics,
+ offsetof(struct ipstats_mib, syncp));
+ for (i = 0; i < cnt; i++)
seq_printf(seq, " %llu", buff64[i]);
return 0;
@@ -394,20 +411,23 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
{
+ const int udp_cnt = ARRAY_SIZE(snmp4_udp_list);
+ const int tcp_cnt = ARRAY_SIZE(snmp4_tcp_list);
unsigned long buff[TCPUDP_MIB_MAX];
struct net *net = seq->private;
int i;
- memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
+ memset(buff, 0, tcp_cnt * sizeof(unsigned long));
seq_puts(seq, "\nTcp:");
- for (i = 0; snmp4_tcp_list[i].name; i++)
+ for (i = 0; i < tcp_cnt; i++)
seq_printf(seq, " %s", snmp4_tcp_list[i].name);
seq_puts(seq, "\nTcp:");
- snmp_get_cpu_field_batch(buff, snmp4_tcp_list,
- net->mib.tcp_statistics);
- for (i = 0; snmp4_tcp_list[i].name; i++) {
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_tcp_list,
+ tcp_cnt,
+ net->mib.tcp_statistics);
+ for (i = 0; i < tcp_cnt; i++) {
/* MaxConn field is signed, RFC 2012 */
if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
seq_printf(seq, " %ld", buff[i]);
@@ -415,27 +435,29 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
seq_printf(seq, " %lu", buff[i]);
}
- memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
+ memset(buff, 0, udp_cnt * sizeof(unsigned long));
- snmp_get_cpu_field_batch(buff, snmp4_udp_list,
- net->mib.udp_statistics);
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list,
+ udp_cnt,
+ net->mib.udp_statistics);
seq_puts(seq, "\nUdp:");
- for (i = 0; snmp4_udp_list[i].name; i++)
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %s", snmp4_udp_list[i].name);
seq_puts(seq, "\nUdp:");
- for (i = 0; snmp4_udp_list[i].name; i++)
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %lu", buff[i]);
- memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
+ memset(buff, 0, udp_cnt * sizeof(unsigned long));
/* the UDP and UDP-Lite MIBs are the same */
seq_puts(seq, "\nUdpLite:");
- snmp_get_cpu_field_batch(buff, snmp4_udp_list,
- net->mib.udplite_statistics);
- for (i = 0; snmp4_udp_list[i].name; i++)
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list,
+ udp_cnt,
+ net->mib.udplite_statistics);
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %s", snmp4_udp_list[i].name);
seq_puts(seq, "\nUdpLite:");
- for (i = 0; snmp4_udp_list[i].name; i++)
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %lu", buff[i]);
seq_putc(seq, '\n');
@@ -459,31 +481,54 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
*/
static int netstat_seq_show(struct seq_file *seq, void *v)
{
- int i;
+ const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list);
+ const int tcp_cnt = ARRAY_SIZE(snmp4_net_list);
struct net *net = seq->private;
+ unsigned long *buff;
+ int i;
seq_puts(seq, "TcpExt:");
- for (i = 0; snmp4_net_list[i].name; i++)
+ for (i = 0; i < tcp_cnt; i++)
seq_printf(seq, " %s", snmp4_net_list[i].name);
seq_puts(seq, "\nTcpExt:");
- for (i = 0; snmp4_net_list[i].name; i++)
- seq_printf(seq, " %lu",
- snmp_fold_field(net->mib.net_statistics,
- snmp4_net_list[i].entry));
-
+ buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)),
+ GFP_KERNEL);
+ if (buff) {
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_net_list, tcp_cnt,
+ net->mib.net_statistics);
+ for (i = 0; i < tcp_cnt; i++)
+ seq_printf(seq, " %lu", buff[i]);
+ } else {
+ for (i = 0; i < tcp_cnt; i++)
+ seq_printf(seq, " %lu",
+ snmp_fold_field(net->mib.net_statistics,
+ snmp4_net_list[i].entry));
+ }
seq_puts(seq, "\nIpExt:");
- for (i = 0; snmp4_ipextstats_list[i].name; i++)
+ for (i = 0; i < ip_cnt; i++)
seq_printf(seq, " %s", snmp4_ipextstats_list[i].name);
seq_puts(seq, "\nIpExt:");
- for (i = 0; snmp4_ipextstats_list[i].name; i++)
- seq_printf(seq, " %llu",
- snmp_fold_field64(net->mib.ip_statistics,
- snmp4_ipextstats_list[i].entry,
- offsetof(struct ipstats_mib, syncp)));
-
+ if (buff) {
+ u64 *buff64 = (u64 *)buff;
+
+ memset(buff64, 0, ip_cnt * sizeof(u64));
+ snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipextstats_list, ip_cnt,
+ net->mib.ip_statistics,
+ offsetof(struct ipstats_mib, syncp));
+ for (i = 0; i < ip_cnt; i++)
+ seq_printf(seq, " %llu", buff64[i]);
+ } else {
+ for (i = 0; i < ip_cnt; i++)
+ seq_printf(seq, " %llu",
+ snmp_fold_field64(net->mib.ip_statistics,
+ snmp4_ipextstats_list[i].entry,
+ offsetof(struct ipstats_mib, syncp)));
+ }
+ kfree(buff);
seq_putc(seq, '\n');
+ mptcp_seq_show(seq);
return 0;
}
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index 32a691b7ce2c..6913979948d7 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -16,11 +17,6 @@
* Richard Colella : Hang on hash collision
* Vince Laviano : Modified inet_del_protocol() to correctly
* maintain copy bit.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/cache.h>
#include <linux/module.h>
@@ -29,17 +25,12 @@
#include <net/protocol.h>
struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly;
+EXPORT_SYMBOL(inet_protos);
const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly;
EXPORT_SYMBOL(inet_offloads);
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
- if (!prot->netns_ok) {
- pr_err("Protocol %u is not namespace aware, cannot register.\n",
- protocol);
- return -EINVAL;
- }
-
return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
NULL, prot) ? 0 : -1;
}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 33df4d76db2d..5998c4cc6f47 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -30,11 +31,6 @@
* Alan Cox : Added IP_HDRINCL option.
* Alan Cox : Skip broadcast check if BSDism set.
* David S. Miller : New socket lookup architecture.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/types.h>
@@ -89,22 +85,21 @@ struct raw_frag_vec {
int hlen;
};
-struct raw_hashinfo raw_v4_hashinfo = {
- .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
-};
+struct raw_hashinfo raw_v4_hashinfo;
EXPORT_SYMBOL_GPL(raw_v4_hashinfo);
int raw_hash_sk(struct sock *sk)
{
struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
- struct hlist_head *head;
+ struct hlist_head *hlist;
- head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
+ hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)];
- write_lock_bh(&h->lock);
- sk_add_node(sk, head);
+ spin_lock(&h->lock);
+ sk_add_node_rcu(sk, hlist);
+ sock_set_flag(sk, SOCK_RCU_FREE);
+ spin_unlock(&h->lock);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
- write_unlock_bh(&h->lock);
return 0;
}
@@ -114,32 +109,26 @@ void raw_unhash_sk(struct sock *sk)
{
struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
- write_lock_bh(&h->lock);
- if (sk_del_node_init(sk))
+ spin_lock(&h->lock);
+ if (sk_del_node_init_rcu(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
- write_unlock_bh(&h->lock);
+ spin_unlock(&h->lock);
}
EXPORT_SYMBOL_GPL(raw_unhash_sk);
-struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
- unsigned short num, __be32 raddr, __be32 laddr,
- int dif, int sdif)
+bool raw_v4_match(struct net *net, const struct sock *sk, unsigned short num,
+ __be32 raddr, __be32 laddr, int dif, int sdif)
{
- sk_for_each_from(sk) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
- !(inet->inet_daddr && inet->inet_daddr != raddr) &&
- !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
- !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
- sk->sk_bound_dev_if != sdif))
- goto found; /* gotcha */
- }
- sk = NULL;
-found:
- return sk;
+ const struct inet_sock *inet = inet_sk(sk);
+
+ if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
+ !(inet->inet_daddr && inet->inet_daddr != raddr) &&
+ !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
+ raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
+ return true;
+ return false;
}
-EXPORT_SYMBOL_GPL(__raw_v4_lookup);
+EXPORT_SYMBOL_GPL(raw_v4_match);
/*
* 0 - deliver
@@ -171,25 +160,28 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb)
* RFC 1122: SHOULD pass TOS value up to the transport layer.
* -> It does. And not only TOS, but all IP header.
*/
-static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
+static int raw_v4_input(struct net *net, struct sk_buff *skb,
+ const struct iphdr *iph, int hash)
{
int sdif = inet_sdif(skb);
- struct sock *sk;
- struct hlist_head *head;
+ struct hlist_head *hlist;
+ int dif = inet_iif(skb);
int delivered = 0;
- struct net *net;
-
- read_lock(&raw_v4_hashinfo.lock);
- head = &raw_v4_hashinfo.ht[hash];
- if (hlist_empty(head))
- goto out;
+ struct sock *sk;
- net = dev_net(skb->dev);
- sk = __raw_v4_lookup(net, __sk_head(head), iph->protocol,
- iph->saddr, iph->daddr,
- skb->dev->ifindex, sdif);
+ hlist = &raw_v4_hashinfo.ht[hash];
+ rcu_read_lock();
+ sk_for_each_rcu(sk, hlist) {
+ if (!raw_v4_match(net, sk, iph->protocol,
+ iph->saddr, iph->daddr, dif, sdif))
+ continue;
+
+ if (atomic_read(&sk->sk_rmem_alloc) >=
+ READ_ONCE(sk->sk_rcvbuf)) {
+ sk_drops_inc(sk);
+ continue;
+ }
- while (sk) {
delivered = 1;
if ((iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) &&
ip_mc_sf_allow(sk, iph->daddr, iph->saddr,
@@ -200,31 +192,17 @@ static int raw_v4_input(struct sk_buff *skb, const struct iphdr *iph, int hash)
if (clone)
raw_rcv(sk, clone);
}
- sk = __raw_v4_lookup(net, sk_next(sk), iph->protocol,
- iph->saddr, iph->daddr,
- skb->dev->ifindex, sdif);
}
-out:
- read_unlock(&raw_v4_hashinfo.lock);
+ rcu_read_unlock();
return delivered;
}
int raw_local_deliver(struct sk_buff *skb, int protocol)
{
- int hash;
- struct sock *raw_sk;
-
- hash = protocol & (RAW_HTABLE_SIZE - 1);
- raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
-
- /* If there maybe a raw socket we must check - if not we
- * don't care less
- */
- if (raw_sk && !raw_v4_input(skb, ip_hdr(skb), hash))
- raw_sk = NULL;
-
- return raw_sk != NULL;
+ struct net *net = dev_net(skb->dev);
+ return raw_v4_input(net, skb, ip_hdr(skb),
+ raw_hashfunc(net, protocol));
}
static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
@@ -232,8 +210,9 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
struct inet_sock *inet = inet_sk(sk);
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
- int err = 0;
int harderr = 0;
+ bool recverr;
+ int err = 0;
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
ipv4_sk_update_pmtu(skb, sk, info);
@@ -247,7 +226,8 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
2. Socket is connected (otherwise the error indication
is useless without ip_recverr and error is hard.
*/
- if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED)
+ recverr = inet_test_bit(RECVERR, sk);
+ if (!recverr && sk->sk_state != TCP_ESTABLISHED)
return;
switch (type) {
@@ -265,65 +245,63 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info)
err = EHOSTUNREACH;
if (code > NR_ICMP_UNREACH)
break;
- err = icmp_err_convert[code].errno;
- harderr = icmp_err_convert[code].fatal;
if (code == ICMP_FRAG_NEEDED) {
- harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
+ harderr = READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT;
err = EMSGSIZE;
+ } else {
+ err = icmp_err_convert[code].errno;
+ harderr = icmp_err_convert[code].fatal;
}
}
- if (inet->recverr) {
+ if (recverr) {
const struct iphdr *iph = (const struct iphdr *)skb->data;
u8 *payload = skb->data + (iph->ihl << 2);
- if (inet->hdrincl)
+ if (inet_test_bit(HDRINCL, sk))
payload = skb->data;
ip_icmp_error(sk, skb, err, 0, info, payload);
}
- if (inet->recverr || harderr) {
+ if (recverr || harderr) {
sk->sk_err = err;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
}
}
void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info)
{
- int hash;
- struct sock *raw_sk;
+ struct net *net = dev_net(skb->dev);
+ int dif = skb->dev->ifindex;
+ int sdif = inet_sdif(skb);
+ struct hlist_head *hlist;
const struct iphdr *iph;
- struct net *net;
-
- hash = protocol & (RAW_HTABLE_SIZE - 1);
+ struct sock *sk;
+ int hash;
- read_lock(&raw_v4_hashinfo.lock);
- raw_sk = sk_head(&raw_v4_hashinfo.ht[hash]);
- if (raw_sk) {
- int dif = skb->dev->ifindex;
- int sdif = inet_sdif(skb);
+ hash = raw_hashfunc(net, protocol);
+ hlist = &raw_v4_hashinfo.ht[hash];
+ rcu_read_lock();
+ sk_for_each_rcu(sk, hlist) {
iph = (const struct iphdr *)skb->data;
- net = dev_net(skb->dev);
-
- while ((raw_sk = __raw_v4_lookup(net, raw_sk, protocol,
- iph->daddr, iph->saddr,
- dif, sdif)) != NULL) {
- raw_err(raw_sk, skb, info);
- raw_sk = sk_next(raw_sk);
- iph = (const struct iphdr *)skb->data;
- }
+ if (!raw_v4_match(net, sk, iph->protocol,
+ iph->daddr, iph->saddr, dif, sdif))
+ continue;
+ raw_err(sk, skb, info);
}
- read_unlock(&raw_v4_hashinfo.lock);
+ rcu_read_unlock();
}
static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason reason;
+
/* Charge it to the socket. */
- ipv4_pktinfo_prepare(sk, skb);
- if (sock_queue_rcv_skb(sk, skb) < 0) {
- kfree_skb(skb);
+ ipv4_pktinfo_prepare(sk, skb, true);
+ if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
+ sk_skb_reason_drop(sk, skb, reason);
return NET_RX_DROP;
}
@@ -333,13 +311,13 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
int raw_rcv(struct sock *sk, struct sk_buff *skb)
{
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
- atomic_inc(&sk->sk_drops);
- kfree_skb(skb);
+ sk_drops_inc(sk);
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY);
return NET_RX_DROP;
}
- nf_reset(skb);
+ nf_reset_ct(skb);
- skb_push(skb, skb->data - skb_network_header(skb));
+ skb_push(skb, -skb_network_offset(skb));
raw_rcv_skb(sk, skb);
return 0;
@@ -379,9 +357,10 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
goto error;
skb_reserve(skb, hlen);
- skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
- skb->tstamp = sockc->transmit_time;
+ skb->protocol = htons(ETH_P_IP);
+ skb->priority = sockc->priority;
+ skb->mark = sockc->mark;
+ skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid);
skb_dst_set(skb, &rt->dst);
*rtp = NULL;
@@ -391,7 +370,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb->ip_summed = CHECKSUM_NONE;
- sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags);
+ skb_setup_tx_timestamp(skb, sockc);
if (flags & MSG_CONFIRM)
skb_set_dst_pending_confirm(skb, 1);
@@ -444,7 +423,7 @@ error_free:
kfree_skb(skb);
error:
IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
- if (err == -ENOBUFS && !inet->recverr)
+ if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk))
err = 0;
return err;
}
@@ -483,7 +462,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
skb->csum = csum_block_add(
skb->csum,
csum_partial_copy_nocheck(rfv->hdr.c + offset,
- to, copy, 0),
+ to, copy),
odd);
odd = 0;
@@ -507,11 +486,11 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct ipcm_cookie ipc;
struct rtable *rt = NULL;
struct flowi4 fl4;
+ u8 scope;
int free = 0;
__be32 daddr;
__be32 saddr;
- u8 tos;
- int err;
+ int uc_index, err;
struct ip_options_data opt_copy;
struct raw_frag_vec rfv;
int hdrincl;
@@ -520,12 +499,8 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (len > 0xFFFF)
goto out;
- /* hdrincl should be READ_ONCE(inet->hdrincl)
- * but READ_ONCE() doesn't work with bit fields.
- * Doing this indirectly yields the same result.
- */
- hdrincl = inet->hdrincl;
- hdrincl = READ_ONCE(hdrincl);
+ hdrincl = inet_test_bit(HDRINCL, sk);
+
/*
* Check the flags.
*/
@@ -563,6 +538,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
ipcm_init_sk(&ipc, inet);
+ /* Keep backward compat */
+ if (hdrincl)
+ ipc.protocol = IPPROTO_RAW;
if (msg->msg_controllen) {
err = ip_cmsg_send(sk, msg, &ipc, false);
@@ -603,37 +581,39 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
daddr = ipc.opt->opt.faddr;
}
}
- tos = get_rtconn_flags(&ipc, sk);
- if (msg->msg_flags & MSG_DONTROUTE)
- tos |= RTO_ONLINK;
+ scope = ip_sendmsg_scope(inet, &ipc, msg);
+ uc_index = READ_ONCE(inet->uc_index);
if (ipv4_is_multicast(daddr)) {
- if (!ipc.oif)
- ipc.oif = inet->mc_index;
+ if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
+ ipc.oif = READ_ONCE(inet->mc_index);
if (!saddr)
- saddr = inet->mc_addr;
+ saddr = READ_ONCE(inet->mc_addr);
} else if (!ipc.oif) {
- ipc.oif = inet->uc_index;
- } else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
- /* oif is set, packet is to local broadcast and
+ ipc.oif = uc_index;
+ } else if (ipv4_is_lbcast(daddr) && uc_index) {
+ /* oif is set, packet is to local broadcast
* and uc_index is set. oif is most likely set
* by sk_bound_dev_if. If uc_index != oif check if the
* oif is an L3 master and uc_index is an L3 slave.
* If so, we want to allow the send using the uc_index.
*/
- if (ipc.oif != inet->uc_index &&
+ if (ipc.oif != uc_index &&
ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
- inet->uc_index)) {
- ipc.oif = inet->uc_index;
+ uc_index)) {
+ ipc.oif = uc_index;
}
}
- flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
- RT_SCOPE_UNIVERSE,
- hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+ flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark,
+ ipc.tos & INET_DSCP_MASK, scope,
+ hdrincl ? ipc.protocol : sk->sk_protocol,
inet_sk_flowi_flags(sk) |
(hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
- daddr, saddr, 0, 0, sk->sk_uid);
+ daddr, saddr, 0, 0, sk_uid(sk));
+
+ fl4.fl4_icmp_type = 0;
+ fl4.fl4_icmp_code = 0;
if (!hdrincl) {
rfv.msg = msg;
@@ -644,7 +624,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
goto done;
}
- security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
+ security_sk_classify_flow(sk, flowi4_to_flowi_common(&fl4));
rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
@@ -675,7 +655,7 @@ back_from_confirm:
ip_flush_pending_frames(sk);
else if (!(msg->msg_flags & MSG_MORE)) {
err = ip_push_pending_frames(sk, &fl4);
- if (err == -ENOBUFS && !inet->recverr)
+ if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk))
err = 0;
}
release_sock(sk);
@@ -717,34 +697,39 @@ static void raw_destroy(struct sock *sk)
}
/* This gets rid of all the nasties in af_inet. -DaveM */
-static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int raw_bind(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+ struct net *net = sock_net(sk);
u32 tb_id = RT_TABLE_LOCAL;
int ret = -EINVAL;
int chk_addr_ret;
+ lock_sock(sk);
if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
goto out;
if (sk->sk_bound_dev_if)
- tb_id = l3mdev_fib_table_by_index(sock_net(sk),
- sk->sk_bound_dev_if) ? : tb_id;
+ tb_id = l3mdev_fib_table_by_index(net,
+ sk->sk_bound_dev_if) ? : tb_id;
- chk_addr_ret = inet_addr_type_table(sock_net(sk), addr->sin_addr.s_addr,
- tb_id);
+ chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);
ret = -EADDRNOTAVAIL;
- if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
- chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
+ if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
+ chk_addr_ret))
goto out;
+
inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
inet->inet_saddr = 0; /* Use device */
sk_dst_reset(sk);
ret = 0;
-out: return ret;
+out:
+ release_sock(sk);
+ return ret;
}
/*
@@ -753,7 +738,7 @@ out: return ret;
*/
static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int noblock, int flags, int *addr_len)
+ int flags, int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
size_t copied = 0;
@@ -769,7 +754,7 @@ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
goto out;
}
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto out;
@@ -783,7 +768,7 @@ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (err)
goto done;
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
/* Copy the address. */
if (sin) {
@@ -793,7 +778,7 @@ static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
*addr_len = sizeof(*sin);
}
- if (inet->cmsg_flags)
+ if (inet_cmsg_flags(inet))
ip_cmsg_recv(msg, skb);
if (flags & MSG_TRUNC)
copied = skb->len;
@@ -805,20 +790,21 @@ out:
return copied;
}
-static int raw_init(struct sock *sk)
+static int raw_sk_init(struct sock *sk)
{
struct raw_sock *rp = raw_sk(sk);
+ sk->sk_drop_counters = &rp->drop_counters;
if (inet_sk(sk)->inet_num == IPPROTO_ICMP)
memset(&rp->filter, 0, sizeof(rp->filter));
return 0;
}
-static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen)
+static int raw_seticmpfilter(struct sock *sk, sockptr_t optval, int optlen)
{
if (optlen > sizeof(struct icmp_filter))
optlen = sizeof(struct icmp_filter);
- if (copy_from_user(&raw_sk(sk)->filter, optval, optlen))
+ if (copy_from_sockptr(&raw_sk(sk)->filter, optval, optlen))
return -EFAULT;
return 0;
}
@@ -842,8 +828,8 @@ static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *o
out: return ret;
}
-static int do_raw_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
+static int do_raw_setsockopt(struct sock *sk, int optname,
+ sockptr_t optval, unsigned int optlen)
{
if (optname == ICMP_FILTER) {
if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
@@ -855,25 +841,15 @@ static int do_raw_setsockopt(struct sock *sk, int level, int optname,
}
static int raw_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
if (level != SOL_RAW)
return ip_setsockopt(sk, level, optname, optval, optlen);
- return do_raw_setsockopt(sk, level, optname, optval, optlen);
-}
-
-#ifdef CONFIG_COMPAT
-static int compat_raw_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- if (level != SOL_RAW)
- return compat_ip_setsockopt(sk, level, optname, optval, optlen);
- return do_raw_setsockopt(sk, level, optname, optval, optlen);
+ return do_raw_setsockopt(sk, optname, optval, optlen);
}
-#endif
-static int do_raw_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
+static int do_raw_getsockopt(struct sock *sk, int optname,
+ char __user *optval, int __user *optlen)
{
if (optname == ICMP_FILTER) {
if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
@@ -889,42 +865,32 @@ static int raw_getsockopt(struct sock *sk, int level, int optname,
{
if (level != SOL_RAW)
return ip_getsockopt(sk, level, optname, optval, optlen);
- return do_raw_getsockopt(sk, level, optname, optval, optlen);
-}
-
-#ifdef CONFIG_COMPAT
-static int compat_raw_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- if (level != SOL_RAW)
- return compat_ip_getsockopt(sk, level, optname, optval, optlen);
- return do_raw_getsockopt(sk, level, optname, optval, optlen);
+ return do_raw_getsockopt(sk, optname, optval, optlen);
}
-#endif
-static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
+static int raw_ioctl(struct sock *sk, int cmd, int *karg)
{
switch (cmd) {
case SIOCOUTQ: {
- int amount = sk_wmem_alloc_get(sk);
-
- return put_user(amount, (int __user *)arg);
+ *karg = sk_wmem_alloc_get(sk);
+ return 0;
}
case SIOCINQ: {
struct sk_buff *skb;
- int amount = 0;
spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
if (skb)
- amount = skb->len;
+ *karg = skb->len;
+ else
+ *karg = 0;
spin_unlock_bh(&sk->sk_receive_queue.lock);
- return put_user(amount, (int __user *)arg);
+ return 0;
}
default:
#ifdef CONFIG_IP_MROUTE
- return ipmr_ioctl(sk, cmd, (void __user *)arg);
+ return ipmr_ioctl(sk, cmd, karg);
#else
return -ENOIOCTLCMD;
#endif
@@ -953,7 +919,7 @@ int raw_abort(struct sock *sk, int err)
lock_sock(sk);
sk->sk_err = err;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
__udp_disconnect(sk, 0);
release_sock(sk);
@@ -970,7 +936,7 @@ struct proto raw_prot = {
.connect = ip4_datagram_connect,
.disconnect = __udp_disconnect,
.ioctl = raw_ioctl,
- .init = raw_init,
+ .init = raw_sk_init,
.setsockopt = raw_setsockopt,
.getsockopt = raw_getsockopt,
.sendmsg = raw_sendmsg,
@@ -985,52 +951,46 @@ struct proto raw_prot = {
.usersize = sizeof_field(struct raw_sock, filter),
.h.raw_hash = &raw_v4_hashinfo,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_raw_setsockopt,
- .compat_getsockopt = compat_raw_getsockopt,
.compat_ioctl = compat_raw_ioctl,
#endif
.diag_destroy = raw_abort,
};
#ifdef CONFIG_PROC_FS
-static struct sock *raw_get_first(struct seq_file *seq)
+static struct sock *raw_get_first(struct seq_file *seq, int bucket)
{
- struct sock *sk;
- struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
+ struct raw_hashinfo *h = pde_data(file_inode(seq->file));
struct raw_iter_state *state = raw_seq_private(seq);
+ struct hlist_head *hlist;
+ struct sock *sk;
- for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
+ for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE;
++state->bucket) {
- sk_for_each(sk, &h->ht[state->bucket])
+ hlist = &h->ht[state->bucket];
+ sk_for_each(sk, hlist) {
if (sock_net(sk) == seq_file_net(seq))
- goto found;
+ return sk;
+ }
}
- sk = NULL;
-found:
- return sk;
+ return NULL;
}
static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
{
- struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
struct raw_iter_state *state = raw_seq_private(seq);
do {
sk = sk_next(sk);
-try_again:
- ;
} while (sk && sock_net(sk) != seq_file_net(seq));
- if (!sk && ++state->bucket < RAW_HTABLE_SIZE) {
- sk = sk_head(&h->ht[state->bucket]);
- goto try_again;
- }
+ if (!sk)
+ return raw_get_first(seq, state->bucket + 1);
return sk;
}
static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
{
- struct sock *sk = raw_get_first(seq);
+ struct sock *sk = raw_get_first(seq, 0);
if (sk)
while (pos && (sk = raw_get_next(seq, sk)) != NULL)
@@ -1039,10 +999,12 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
}
void *raw_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(&h->lock)
{
- struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
+ struct raw_hashinfo *h = pde_data(file_inode(seq->file));
+
+ spin_lock(&h->lock);
- read_lock(&h->lock);
return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
EXPORT_SYMBOL_GPL(raw_seq_start);
@@ -1052,7 +1014,7 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct sock *sk;
if (v == SEQ_START_TOKEN)
- sk = raw_get_first(seq);
+ sk = raw_get_first(seq, 0);
else
sk = raw_get_next(seq, v);
++*pos;
@@ -1061,10 +1023,11 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
EXPORT_SYMBOL_GPL(raw_seq_next);
void raw_seq_stop(struct seq_file *seq, void *v)
+ __releases(&h->lock)
{
- struct raw_hashinfo *h = PDE_DATA(file_inode(seq->file));
+ struct raw_hashinfo *h = pde_data(file_inode(seq->file));
- read_unlock(&h->lock);
+ spin_unlock(&h->lock);
}
EXPORT_SYMBOL_GPL(raw_seq_stop);
@@ -1077,14 +1040,14 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
srcp = inet->inet_num;
seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n",
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n",
i, src, srcp, dest, destp, sp->sk_state,
sk_wmem_alloc_get(sp),
sk_rmem_alloc_get(sp),
0, 0L, 0,
- from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+ from_kuid_munged(seq_user_ns(seq), sk_uid(sp)),
0, sock_i_ino(sp),
- refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
+ refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp));
}
static int raw_seq_show(struct seq_file *seq, void *v)
@@ -1126,6 +1089,7 @@ static __net_initdata struct pernet_operations raw_net_ops = {
int __init raw_proc_init(void)
{
+
return register_pernet_subsys(&raw_net_ops);
}
@@ -1134,3 +1098,27 @@ void __init raw_proc_exit(void)
unregister_pernet_subsys(&raw_net_ops);
}
#endif /* CONFIG_PROC_FS */
+
+static void raw_sysctl_init_net(struct net *net)
+{
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ net->ipv4.sysctl_raw_l3mdev_accept = 1;
+#endif
+}
+
+static int __net_init raw_sysctl_init(struct net *net)
+{
+ raw_sysctl_init_net(net);
+ return 0;
+}
+
+static struct pernet_operations __net_initdata raw_sysctl_ops = {
+ .init = raw_sysctl_init,
+};
+
+void __init raw_init(void)
+{
+ raw_sysctl_init_net(&init_net);
+ if (register_pernet_subsys(&raw_sysctl_ops))
+ panic("RAW: failed to init sysctl parameters.\n");
+}
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index c200065ef9a5..943e5998e0ad 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/module.h>
#include <linux/inet_diag.h>
@@ -23,9 +24,6 @@ raw_get_hashinfo(const struct inet_diag_req_v2 *r)
return &raw_v6_hashinfo;
#endif
} else {
- pr_warn_once("Unexpected inet family %d\n",
- r->sdiag_family);
- WARN_ON_ONCE(1);
return ERR_PTR(-EINVAL);
}
}
@@ -36,84 +34,82 @@ raw_get_hashinfo(const struct inet_diag_req_v2 *r)
* use helper to figure it out.
*/
-static struct sock *raw_lookup(struct net *net, struct sock *from,
- const struct inet_diag_req_v2 *req)
+static bool raw_lookup(struct net *net, const struct sock *sk,
+ const struct inet_diag_req_v2 *req)
{
struct inet_diag_req_raw *r = (void *)req;
- struct sock *sk = NULL;
if (r->sdiag_family == AF_INET)
- sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol,
- r->id.idiag_dst[0],
- r->id.idiag_src[0],
- r->id.idiag_if, 0);
+ return raw_v4_match(net, sk, r->sdiag_raw_protocol,
+ r->id.idiag_dst[0],
+ r->id.idiag_src[0],
+ r->id.idiag_if, 0);
#if IS_ENABLED(CONFIG_IPV6)
else
- sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol,
- (const struct in6_addr *)r->id.idiag_src,
- (const struct in6_addr *)r->id.idiag_dst,
- r->id.idiag_if, 0);
+ return raw_v6_match(net, sk, r->sdiag_raw_protocol,
+ (const struct in6_addr *)r->id.idiag_src,
+ (const struct in6_addr *)r->id.idiag_dst,
+ r->id.idiag_if, 0);
#endif
- return sk;
+ return false;
}
static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r)
{
struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
- struct sock *sk = NULL, *s;
+ struct hlist_head *hlist;
+ struct sock *sk;
int slot;
if (IS_ERR(hashinfo))
return ERR_CAST(hashinfo);
- read_lock(&hashinfo->lock);
+ rcu_read_lock();
for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) {
- sk_for_each(s, &hashinfo->ht[slot]) {
- sk = raw_lookup(net, s, r);
- if (sk) {
+ hlist = &hashinfo->ht[slot];
+ sk_for_each_rcu(sk, hlist) {
+ if (raw_lookup(net, sk, r)) {
/*
* Grab it and keep until we fill
- * diag meaage to be reported, so
+ * diag message to be reported, so
* caller should call sock_put then.
- * We can do that because we're keeping
- * hashinfo->lock here.
*/
- sock_hold(sk);
- goto out_unlock;
+ if (refcount_inc_not_zero(&sk->sk_refcnt))
+ goto out_unlock;
}
}
}
+ sk = ERR_PTR(-ENOENT);
out_unlock:
- read_unlock(&hashinfo->lock);
+ rcu_read_unlock();
- return sk ? sk : ERR_PTR(-ENOENT);
+ return sk;
}
-static int raw_diag_dump_one(struct sk_buff *in_skb,
- const struct nlmsghdr *nlh,
+static int raw_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *r)
{
- struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *in_skb = cb->skb;
struct sk_buff *rep;
struct sock *sk;
+ struct net *net;
int err;
+ net = sock_net(in_skb->sk);
sk = raw_sock_get(net, r);
if (IS_ERR(sk))
return PTR_ERR(sk);
- rep = nlmsg_new(sizeof(struct inet_diag_msg) +
- sizeof(struct inet_diag_meminfo) + 64,
+ rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
+ inet_diag_msg_attrs_size() +
+ nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
GFP_KERNEL);
if (!rep) {
sock_put(sk);
return -ENOMEM;
}
- err = inet_sk_diag_fill(sk, NULL, rep, r,
- sk_user_ns(NETLINK_CB(in_skb).sk),
- NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh,
+ err = inet_sk_diag_fill(sk, NULL, rep, cb, r, 0,
netlink_net_capable(in_skb, CAP_NET_ADMIN));
sock_put(sk);
@@ -122,36 +118,30 @@ static int raw_diag_dump_one(struct sk_buff *in_skb,
return err;
}
- err = netlink_unicast(net->diag_nlsk, rep,
- NETLINK_CB(in_skb).portid,
- MSG_DONTWAIT);
- if (err > 0)
- err = 0;
+ err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
return err;
}
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r,
- struct nlattr *bc, bool net_admin)
+ bool net_admin)
{
- if (!inet_diag_bc_sk(bc, sk))
+ if (!inet_diag_bc_sk(cb->data, sk))
return 0;
- return inet_sk_diag_fill(sk, NULL, skb, r,
- sk_user_ns(NETLINK_CB(cb->skb).sk),
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI,
- cb->nlh, net_admin);
+ return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin);
}
static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
struct net *net = sock_net(skb->sk);
int num, s_num, slot, s_slot;
+ struct hlist_head *hlist;
struct sock *sk = NULL;
if (IS_ERR(hashinfo))
@@ -160,11 +150,12 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
s_slot = cb->args[0];
num = s_num = cb->args[1];
- read_lock(&hashinfo->lock);
+ rcu_read_lock();
for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) {
num = 0;
- sk_for_each(sk, &hashinfo->ht[slot]) {
+ hlist = &hashinfo->ht[slot];
+ sk_for_each_rcu(sk, hlist) {
struct inet_sock *inet = inet_sk(sk);
if (!net_eq(sock_net(sk), net))
@@ -179,7 +170,7 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
if (r->id.idiag_dport != inet->inet_dport &&
r->id.idiag_dport)
goto next;
- if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0)
+ if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0)
goto out_unlock;
next:
num++;
@@ -187,7 +178,7 @@ next:
}
out_unlock:
- read_unlock(&hashinfo->lock);
+ rcu_read_unlock();
cb->args[0] = slot;
cb->args[1] = num;
@@ -218,6 +209,7 @@ static int raw_diag_destroy(struct sk_buff *in_skb,
#endif
static const struct inet_diag_handler raw_diag_handler = {
+ .owner = THIS_MODULE,
.dump = raw_diag_dump,
.dump_one = raw_diag_dump_one,
.idiag_get_info = raw_diag_get_info,
@@ -262,5 +254,6 @@ static void __exit raw_diag_exit(void)
module_init(raw_diag_init);
module_exit(raw_diag_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("RAW socket monitoring via SOCK_DIAG");
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */);
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b678466da451..b549d6a57307 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -20,7 +21,7 @@
* Alan Cox : Added BSD route gw semantics
* Alan Cox : Super /proc >4K
* Alan Cox : MTU in route table
- * Alan Cox : MSS actually. Also added the window
+ * Alan Cox : MSS actually. Also added the window
* clamper.
* Sam Lantinga : Fixed route matching in rt_del()
* Alan Cox : Routing cache support.
@@ -40,7 +41,7 @@
* Olaf Erb : irtt wasn't being copied right.
* Bjorn Ekwall : Kerneld route support.
* Alan Cox : Multicast fixed (I hope)
- * Pavel Krauz : Limited broadcast fixed
+ * Pavel Krauz : Limited broadcast fixed
* Mike McLagan : Routing by source
* Alexey Kuznetsov : End of old history. Split to fib.c and
* route.c and rewritten from scratch.
@@ -53,26 +54,18 @@
* Robert Olsson : Added rt_cache statistics
* Arnaldo C. Melo : Convert proc stuff to seq_file
* Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
- * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
- * Ilia Sotnikov : Removed TOS from hash calculations
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
+ * Ilia Sotnikov : Removed TOS from hash calculations
*/
#define pr_fmt(fmt) "IPv4: " fmt
#include <linux/module.h>
-#include <linux/uaccess.h>
#include <linux/bitops.h>
-#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
-#include <linux/string.h>
+#include <linux/memblock.h>
#include <linux/socket.h>
-#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
@@ -87,19 +80,19 @@
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
-#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
+#include <net/flow.h>
+#include <net/inet_dscp.h>
#include <net/net_namespace.h>
-#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
-#include <net/arp.h>
+#include <net/nexthop.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
@@ -111,24 +104,20 @@
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
-#include <net/l3mdev.h>
#include "fib_lookup.h"
-#define RT_FL_TOS(oldflp4) \
- ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
-
#define RT_GC_TIMEOUT (300*HZ)
+#define DEFAULT_MIN_PMTU (512 + 20 + 20)
+#define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
+#define DEFAULT_MIN_ADVMSS 256
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly = 9;
static int ip_rt_redirect_load __read_mostly = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly = HZ;
static int ip_rt_error_burst __read_mostly = 5 * HZ;
-static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
-static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
-static int ip_rt_min_advmss __read_mostly = 256;
static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
@@ -136,13 +125,17 @@ static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
* Interface to generic destination cache.
*/
-static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
+INDIRECT_CALLABLE_SCOPE
+struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
-static unsigned int ipv4_mtu(const struct dst_entry *dst);
-static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
+INDIRECT_CALLABLE_SCOPE
+unsigned int ipv4_mtu(const struct dst_entry *dst);
+static void ipv4_negative_advice(struct sock *sk,
+ struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu);
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh);
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
static void ipv4_dst_destroy(struct dst_entry *dst);
@@ -197,7 +190,11 @@ const __u8 ip_tos2prio[16] = {
EXPORT_SYMBOL(ip_tos2prio);
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
+#ifndef CONFIG_PREEMPT_RT
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
+#else
+#define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field)
+#endif
#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
@@ -234,19 +231,6 @@ static const struct seq_operations rt_cache_seq_ops = {
.show = rt_cache_seq_show,
};
-static int rt_cache_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &rt_cache_seq_ops);
-}
-
-static const struct file_operations rt_cache_seq_fops = {
- .open = rt_cache_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
int cpu;
@@ -273,6 +257,7 @@ static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
*pos = cpu+1;
return &per_cpu(rt_cache_stat, cpu);
}
+ (*pos)++;
return NULL;
}
@@ -287,12 +272,13 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
struct rt_cache_stat *st = v;
if (v == SEQ_START_TOKEN) {
- seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
+ seq_puts(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
return 0;
}
- seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
- " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
+ seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x "
+ "%08x %08x %08x %08x %08x %08x "
+ "%08x %08x %08x %08x\n",
dst_entries_get_slow(&ipv4_dst_ops),
0, /* st->in_hit */
st->in_slow_tot,
@@ -323,19 +309,6 @@ static const struct seq_operations rt_cpu_seq_ops = {
.show = rt_cpu_seq_show,
};
-
-static int rt_cpu_seq_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &rt_cpu_seq_ops);
-}
-
-static const struct file_operations rt_cpu_seq_fops = {
- .open = rt_cpu_seq_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
@@ -366,13 +339,13 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
{
struct proc_dir_entry *pde;
- pde = proc_create("rt_cache", 0444, net->proc_net,
- &rt_cache_seq_fops);
+ pde = proc_create_seq("rt_cache", 0444, net->proc_net,
+ &rt_cache_seq_ops);
if (!pde)
goto err1;
- pde = proc_create("rt_cache", 0444,
- net->proc_net_stat, &rt_cpu_seq_fops);
+ pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
+ &rt_cpu_seq_ops);
if (!pde)
goto err2;
@@ -422,7 +395,13 @@ static inline int ip_rt_proc_init(void)
static inline bool rt_is_expired(const struct rtable *rth)
{
- return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
+ bool res;
+
+ rcu_read_lock();
+ res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev));
+ rcu_read_unlock();
+
+ return res;
}
void rt_cache_flush(struct net *net)
@@ -434,42 +413,53 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
struct sk_buff *skb,
const void *daddr)
{
- struct net_device *dev = dst->dev;
- const __be32 *pkey = daddr;
- const struct rtable *rt;
+ const struct rtable *rt = container_of(dst, struct rtable, dst);
+ struct net_device *dev;
struct neighbour *n;
- rt = (const struct rtable *) dst;
- if (rt->rt_gateway)
- pkey = (const __be32 *) &rt->rt_gateway;
- else if (skb)
- pkey = &ip_hdr(skb)->daddr;
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
+ if (likely(rt->rt_gw_family == AF_INET)) {
+ n = ip_neigh_gw4(dev, rt->rt_gw4);
+ } else if (rt->rt_gw_family == AF_INET6) {
+ n = ip_neigh_gw6(dev, &rt->rt_gw6);
+ } else {
+ __be32 pkey;
+
+ pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
+ n = ip_neigh_gw4(dev, pkey);
+ }
+
+ if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
+ n = NULL;
- n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
- if (n)
- return n;
- return neigh_create(&arp_tbl, pkey, dev);
+ rcu_read_unlock();
+
+ return n;
}
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
- struct net_device *dev = dst->dev;
+ const struct rtable *rt = container_of(dst, struct rtable, dst);
+ struct net_device *dev = dst_dev(dst);
const __be32 *pkey = daddr;
- const struct rtable *rt;
- rt = (const struct rtable *)dst;
- if (rt->rt_gateway)
- pkey = (const __be32 *)&rt->rt_gateway;
- else if (!daddr ||
+ if (rt->rt_gw_family == AF_INET) {
+ pkey = (const __be32 *)&rt->rt_gw4;
+ } else if (rt->rt_gw_family == AF_INET6) {
+ return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
+ } else if (!daddr ||
(rt->rt_flags &
- (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
+ (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
return;
-
+ }
__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
-#define IP_IDENTS_SZ 2048u
-
+/* Hash tables of size 2048..262144 depending on RAM size.
+ * Each bucket uses 8 bytes.
+ */
+static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;
@@ -477,61 +467,64 @@ static u32 *ip_tstamps __read_mostly;
* if one generator is seldom used. This makes hard for an attacker
* to infer how many packets were sent between two points in time.
*/
-u32 ip_idents_reserve(u32 hash, int segs)
+static u32 ip_idents_reserve(u32 hash, int segs)
{
- u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
- atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
- u32 old = READ_ONCE(*p_tstamp);
- u32 now = (u32)jiffies;
- u32 new, delta = 0;
+ u32 bucket, old, now = (u32)jiffies;
+ atomic_t *p_id;
+ u32 *p_tstamp;
+ u32 delta = 0;
- if (old != now && cmpxchg(p_tstamp, old, now) == old)
- delta = prandom_u32_max(now - old);
+ bucket = hash & ip_idents_mask;
+ p_tstamp = ip_tstamps + bucket;
+ p_id = ip_idents + bucket;
+ old = READ_ONCE(*p_tstamp);
- /* Do not use atomic_add_return() as it makes UBSAN unhappy */
- do {
- old = (u32)atomic_read(p_id);
- new = old + delta + segs;
- } while (atomic_cmpxchg(p_id, old, new) != old);
+ if (old != now && cmpxchg(p_tstamp, old, now) == old)
+ delta = get_random_u32_below(now - old);
- return new - segs;
+ /* If UBSAN reports an error there, please make sure your compiler
+ * supports -fno-strict-overflow before reporting it that was a bug
+ * in UBSAN, and it has been fixed in GCC-8.
+ */
+ return atomic_add_return(segs + delta, p_id) - segs;
}
-EXPORT_SYMBOL(ip_idents_reserve);
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
- static u32 ip_idents_hashrnd __read_mostly;
u32 hash, id;
- net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
+ /* Note the following code is not safe, but this is okay. */
+ if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
+ get_random_bytes(&net->ipv4.ip_id_key,
+ sizeof(net->ipv4.ip_id_key));
- hash = jhash_3words((__force u32)iph->daddr,
+ hash = siphash_3u32((__force u32)iph->daddr,
(__force u32)iph->saddr,
- iph->protocol ^ net_hash_mix(net),
- ip_idents_hashrnd);
+ iph->protocol,
+ &net->ipv4.ip_id_key);
id = ip_idents_reserve(hash, segs);
iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
- const struct sock *sk,
- const struct iphdr *iph,
- int oif, u8 tos,
- u8 prot, u32 mark, int flow_flags)
+ const struct sock *sk, const struct iphdr *iph,
+ int oif, __u8 tos, u8 prot, u32 mark,
+ int flow_flags)
{
- if (sk) {
- const struct inet_sock *inet = inet_sk(sk);
+ __u8 scope = RT_SCOPE_UNIVERSE;
+ if (sk) {
oif = sk->sk_bound_dev_if;
- mark = sk->sk_mark;
- tos = RT_CONN_FLAGS(sk);
- prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
- }
- flowi4_init_output(fl4, oif, mark, tos,
- RT_SCOPE_UNIVERSE, prot,
- flow_flags,
- iph->daddr, iph->saddr, 0, 0,
+ mark = READ_ONCE(sk->sk_mark);
+ tos = ip_sock_rt_tos(sk);
+ scope = ip_sock_rt_scope(sk);
+ prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW :
+ sk->sk_protocol;
+ }
+
+ flowi4_init_output(fl4, oif, mark, tos & INET_DSCP_MASK, scope,
+ prot, flow_flags, iph->daddr, iph->saddr, 0, 0,
sock_net_uid(net, sk));
}
@@ -541,9 +534,9 @@ static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
const struct net *net = dev_net(skb->dev);
const struct iphdr *iph = ip_hdr(skb);
int oif = skb->dev->ifindex;
- u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
u32 mark = skb->mark;
+ __u8 tos = iph->tos;
__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}
@@ -558,11 +551,14 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
inet_opt = rcu_dereference(inet->inet_opt);
if (inet_opt && inet_opt->opt.srr)
daddr = inet_opt->opt.faddr;
- flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
- RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
- inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+ flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark),
+ ip_sock_rt_tos(sk),
+ ip_sock_rt_scope(sk),
+ inet_test_bit(HDRINCL, sk) ?
+ IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk),
- daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
+ daddr, inet->inet_saddr, 0, 0,
+ sk_uid(sk));
rcu_read_unlock();
}
@@ -595,28 +591,40 @@ static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
}
}
-static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
+static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
{
- struct fib_nh_exception *fnhe, *oldest;
+ struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
+ struct fib_nh_exception *fnhe, *oldest = NULL;
- oldest = rcu_dereference(hash->chain);
- for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
- fnhe = rcu_dereference(fnhe->fnhe_next)) {
- if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
+ for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
+ fnhe = rcu_dereference_protected(*fnhe_p,
+ lockdep_is_held(&fnhe_lock));
+ if (!fnhe)
+ break;
+ if (!oldest ||
+ time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
oldest = fnhe;
+ oldest_p = fnhe_p;
+ }
}
+
+ /* Clear oldest->fnhe_daddr to prevent this fnhe from being
+ * rebound with new dsts in rt_bind_exception().
+ */
+ oldest->fnhe_daddr = 0;
fnhe_flush_routes(oldest);
- return oldest;
+ *oldest_p = oldest->fnhe_next;
+ kfree_rcu(oldest, rcu);
}
-static inline u32 fnhe_hashfun(__be32 daddr)
+static u32 fnhe_hashfun(__be32 daddr)
{
- static u32 fnhe_hashrnd __read_mostly;
- u32 hval;
+ static siphash_aligned_key_t fnhe_hash_key;
+ u64 hval;
- net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
- hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
- return hash_32(hval, FNHE_HASH_SHIFT);
+ net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
+ hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
+ return hash_64(hval, FNHE_HASH_SHIFT);
}
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
@@ -627,13 +635,15 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh
if (fnhe->fnhe_gw) {
rt->rt_flags |= RTCF_REDIRECTED;
- rt->rt_gateway = fnhe->fnhe_gw;
rt->rt_uses_gateway = 1;
+ rt->rt_gw_family = AF_INET;
+ rt->rt_gw4 = fnhe->fnhe_gw;
}
}
-static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
- u32 pmtu, bool lock, unsigned long expires)
+static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
+ __be32 gw, u32 pmtu, bool lock,
+ unsigned long expires)
{
struct fnhe_hash_bucket *hash;
struct fib_nh_exception *fnhe;
@@ -642,17 +652,17 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
unsigned int i;
int depth;
- genid = fnhe_genid(dev_net(nh->nh_dev));
+ genid = fnhe_genid(dev_net(nhc->nhc_dev));
hval = fnhe_hashfun(daddr);
spin_lock_bh(&fnhe_lock);
- hash = rcu_dereference(nh->nh_exceptions);
+ hash = rcu_dereference(nhc->nhc_exceptions);
if (!hash) {
hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
if (!hash)
goto out_unlock;
- rcu_assign_pointer(nh->nh_exceptions, hash);
+ rcu_assign_pointer(nhc->nhc_exceptions, hash);
}
hash += hval;
@@ -683,16 +693,21 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
if (rt)
fill_route_from_fnhe(rt, fnhe);
} else {
- if (depth > FNHE_RECLAIM_DEPTH)
- fnhe = fnhe_oldest(hash);
- else {
- fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
- if (!fnhe)
- goto out_unlock;
-
- fnhe->fnhe_next = hash->chain;
- rcu_assign_pointer(hash->chain, fnhe);
+ /* Randomize max depth to avoid some side channels attacks. */
+ int max_depth = FNHE_RECLAIM_DEPTH +
+ get_random_u32_below(FNHE_RECLAIM_DEPTH);
+
+ while (depth > max_depth) {
+ fnhe_remove_oldest(hash);
+ depth--;
}
+
+ fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
+ if (!fnhe)
+ goto out_unlock;
+
+ fnhe->fnhe_next = hash->chain;
+
fnhe->fnhe_genid = genid;
fnhe->fnhe_daddr = daddr;
fnhe->fnhe_gw = gw;
@@ -700,20 +715,23 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
fnhe->fnhe_mtu_locked = lock;
fnhe->fnhe_expires = max(1UL, expires);
+ rcu_assign_pointer(hash->chain, fnhe);
+
/* Exception created; mark the cached routes for the nexthop
* stale, so anyone caching it rechecks if this exception
* applies to them.
*/
- rt = rcu_dereference(nh->nh_rth_input);
+ rt = rcu_dereference(nhc->nhc_rth_input);
if (rt)
- rt->dst.obsolete = DST_OBSOLETE_KILL;
+ WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL);
for_each_possible_cpu(i) {
struct rtable __rcu **prt;
- prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
+
+ prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
rt = rcu_dereference(*prt);
if (rt)
- rt->dst.obsolete = DST_OBSOLETE_KILL;
+ WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL);
}
}
@@ -745,7 +763,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
return;
}
- if (rt->rt_gateway != old_gw)
+ if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
return;
in_dev = __in_dev_get_rcu(dev);
@@ -768,22 +786,24 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
goto reject_redirect;
}
- n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
+ n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
if (!n)
n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
if (!IS_ERR(n)) {
- if (!(n->nud_state & NUD_VALID)) {
+ if (!(READ_ONCE(n->nud_state) & NUD_VALID)) {
neigh_event_send(n, NULL);
} else {
if (fib_lookup(net, fl4, &res, 0) == 0) {
- struct fib_nh *nh = &FIB_RES_NH(res);
+ struct fib_nh_common *nhc;
- update_or_create_fnhe(nh, fl4->daddr, new_gw,
+ fib_select_path(net, &res, fl4, skb);
+ nhc = FIB_RES_NHC(res);
+ update_or_create_fnhe(nhc, fl4->daddr, new_gw,
0, false,
jiffies + ip_rt_gc_timeout);
}
if (kill_route)
- rt->dst.obsolete = DST_OBSOLETE_KILL;
+ WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL);
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
}
neigh_release(n);
@@ -813,32 +833,25 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
const struct iphdr *iph = (const struct iphdr *) skb->data;
struct net *net = dev_net(skb->dev);
int oif = skb->dev->ifindex;
- u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
u32 mark = skb->mark;
+ __u8 tos = iph->tos;
- rt = (struct rtable *) dst;
+ rt = dst_rtable(dst);
__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
__ip_do_redirect(rt, skb, &fl4, true);
}
-static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
+static void ipv4_negative_advice(struct sock *sk,
+ struct dst_entry *dst)
{
- struct rtable *rt = (struct rtable *)dst;
- struct dst_entry *ret = dst;
+ struct rtable *rt = dst_rtable(dst);
- if (rt) {
- if (dst->obsolete > 0) {
- ip_rt_put(rt);
- ret = NULL;
- } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
- rt->dst.expires) {
- ip_rt_put(rt);
- ret = NULL;
- }
- }
- return ret;
+ if ((READ_ONCE(dst->obsolete) > 0) ||
+ (rt->rt_flags & RTCF_REDIRECTED) ||
+ READ_ONCE(rt->dst.expires))
+ sk_dst_reset(sk);
}
/*
@@ -874,11 +887,11 @@ void ip_rt_send_redirect(struct sk_buff *skb)
}
log_martians = IN_DEV_LOG_MARTIANS(in_dev);
vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
- rcu_read_unlock();
net = dev_net(rt->dst.dev);
- peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
+ peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif);
if (!peer) {
+ rcu_read_unlock();
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
rt_nexthop(rt, ip_hdr(skb)->daddr));
return;
@@ -887,39 +900,39 @@ void ip_rt_send_redirect(struct sk_buff *skb)
/* No redirected packets during ip_rt_redirect_silence;
* reset the algorithm.
*/
- if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
+ if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
peer->rate_tokens = 0;
+ peer->n_redirects = 0;
+ }
/* Too many ignored redirects; do not send anything
* set dst.rate_last to the last seen redirected packet.
*/
- if (peer->rate_tokens >= ip_rt_redirect_number) {
+ if (peer->n_redirects >= ip_rt_redirect_number) {
peer->rate_last = jiffies;
- goto out_put_peer;
+ goto out_unlock;
}
/* Check for load limit; set rate_last to the latest sent
* redirect.
*/
- if (peer->rate_tokens == 0 ||
+ if (peer->n_redirects == 0 ||
time_after(jiffies,
(peer->rate_last +
- (ip_rt_redirect_load << peer->rate_tokens)))) {
+ (ip_rt_redirect_load << peer->n_redirects)))) {
__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
peer->rate_last = jiffies;
- ++peer->rate_tokens;
-#ifdef CONFIG_IP_ROUTE_VERBOSE
- if (log_martians &&
- peer->rate_tokens == ip_rt_redirect_number)
+ ++peer->n_redirects;
+ if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians &&
+ peer->n_redirects == ip_rt_redirect_number)
net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
&ip_hdr(skb)->saddr, inet_iif(skb),
&ip_hdr(skb)->daddr, &gw);
-#endif
}
-out_put_peer:
- inet_putpeer(peer);
+out_unlock:
+ rcu_read_unlock();
}
static int ip_error(struct sk_buff *skb)
@@ -930,6 +943,7 @@ static int ip_error(struct sk_buff *skb)
struct inet_peer *peer;
unsigned long now;
struct net *net;
+ SKB_DR(reason);
bool send;
int code;
@@ -949,10 +963,12 @@ static int ip_error(struct sk_buff *skb)
if (!IN_DEV_FORWARD(in_dev)) {
switch (rt->dst.error) {
case EHOSTUNREACH:
+ SKB_DR_SET(reason, IP_INADDRERRORS);
__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
break;
case ENETUNREACH:
+ SKB_DR_SET(reason, IP_INNOROUTES);
__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
break;
}
@@ -968,6 +984,7 @@ static int ip_error(struct sk_buff *skb)
break;
case ENETUNREACH:
code = ICMP_NET_UNREACH;
+ SKB_DR_SET(reason, IP_INNOROUTES);
__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
break;
case EACCES:
@@ -975,9 +992,9 @@ static int ip_error(struct sk_buff *skb)
break;
}
+ rcu_read_lock();
peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
- l3mdev_master_ifindex(skb->dev), 1);
-
+ l3mdev_master_ifindex_rcu(skb->dev));
send = true;
if (peer) {
now = jiffies;
@@ -989,12 +1006,13 @@ static int ip_error(struct sk_buff *skb)
peer->rate_tokens -= ip_rt_error_cost;
else
send = false;
- inet_putpeer(peer);
}
+ rcu_read_unlock();
+
if (send)
icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
-out: kfree_skb(skb);
+out: kfree_skb_reason(skb, reason);
return 0;
}
@@ -1003,54 +1021,78 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
struct dst_entry *dst = &rt->dst;
struct fib_result res;
bool lock = false;
+ struct net *net;
+ u32 old_mtu;
if (ip_mtu_locked(dst))
return;
- if (ipv4_mtu(dst) < mtu)
+ old_mtu = ipv4_mtu(dst);
+ if (old_mtu < mtu)
return;
- if (mtu < ip_rt_min_pmtu) {
+ rcu_read_lock();
+ net = dst_dev_net_rcu(dst);
+ if (mtu < net->ipv4.ip_rt_min_pmtu) {
lock = true;
- mtu = ip_rt_min_pmtu;
+ mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
}
- if (rt->rt_pmtu == mtu &&
- time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
- return;
+ if (rt->rt_pmtu == mtu && !lock &&
+ time_before(jiffies, READ_ONCE(dst->expires) -
+ net->ipv4.ip_rt_mtu_expires / 2))
+ goto out;
- rcu_read_lock();
- if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
- struct fib_nh *nh = &FIB_RES_NH(res);
+ if (fib_lookup(net, fl4, &res, 0) == 0) {
+ struct fib_nh_common *nhc;
+
+ fib_select_path(net, &res, fl4, NULL);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (fib_info_num_path(res.fi) > 1) {
+ int nhsel;
- update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
- jiffies + ip_rt_mtu_expires);
+ for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) {
+ nhc = fib_info_nhc(res.fi, nhsel);
+ update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
+ jiffies + net->ipv4.ip_rt_mtu_expires);
+ }
+ goto out;
+ }
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+ nhc = FIB_RES_NHC(res);
+ update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
+ jiffies + net->ipv4.ip_rt_mtu_expires);
}
+out:
rcu_read_unlock();
}
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
- struct rtable *rt = (struct rtable *) dst;
+ struct rtable *rt = dst_rtable(dst);
struct flowi4 fl4;
ip_rt_build_flow_key(&fl4, sk, skb);
+
+ /* Don't make lookup fail for bridged encapsulations */
+ if (skb && netif_is_any_bridge_port(skb->dev))
+ fl4.flowi4_oif = 0;
+
__ip_rt_update_pmtu(rt, &fl4, mtu);
}
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
- int oif, u32 mark, u8 protocol, int flow_flags)
+ int oif, u8 protocol)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
+ u32 mark = IP4_REPLY_MARK(net, skb->mark);
- if (!mark)
- mark = IP4_REPLY_MARK(net, skb->mark);
-
- __build_flow_key(net, &fl4, NULL, iph, oif,
- RT_TOS(iph->tos), protocol, mark, flow_flags);
+ __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark,
+ 0);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
__ip_rt_update_pmtu(rt, &fl4, mtu);
@@ -1061,7 +1103,7 @@ EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
@@ -1079,7 +1121,7 @@ static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
struct dst_entry *odst = NULL;
@@ -1100,8 +1142,8 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
- rt = (struct rtable *)odst;
- if (odst->obsolete && !odst->ops->check(odst, 0)) {
+ rt = dst_rtable(odst);
+ if (READ_ONCE(odst->obsolete) && !odst->ops->check(odst, 0)) {
rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
if (IS_ERR(rt))
goto out;
@@ -1109,7 +1151,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
new = true;
}
- __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
+ __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu);
if (!dst_check(&rt->dst, 0)) {
if (new)
@@ -1132,14 +1174,13 @@ out:
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
void ipv4_redirect(struct sk_buff *skb, struct net *net,
- int oif, u32 mark, u8 protocol, int flow_flags)
+ int oif, u8 protocol)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
- __build_flow_key(net, &fl4, NULL, iph, oif,
- RT_TOS(iph->tos), protocol, mark, flow_flags);
+ __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0);
rt = __ip_route_output_key(net, &fl4);
if (!IS_ERR(rt)) {
__ip_do_redirect(rt, skb, &fl4, false);
@@ -1150,7 +1191,7 @@ EXPORT_SYMBOL_GPL(ipv4_redirect);
void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
- const struct iphdr *iph = (const struct iphdr *) skb->data;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
struct flowi4 fl4;
struct rtable *rt;
struct net *net = sock_net(sk);
@@ -1164,9 +1205,10 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
-static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
+INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst,
+ u32 cookie)
{
- struct rtable *rt = (struct rtable *) dst;
+ struct rtable *rt = dst_rtable(dst);
/* All IPV4 dsts are created with ->obsolete set to the value
* DST_OBSOLETE_FORCE_CHK which forces validation calls down
@@ -1174,18 +1216,50 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
*
* When a PMTU/redirect information update invalidates a route,
* this is indicated by setting obsolete to DST_OBSOLETE_KILL or
- * DST_OBSOLETE_DEAD by dst_free().
+ * DST_OBSOLETE_DEAD.
*/
- if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
+ if (READ_ONCE(dst->obsolete) != DST_OBSOLETE_FORCE_CHK ||
+ rt_is_expired(rt))
return NULL;
return dst;
}
+EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
+
+static void ipv4_send_dest_unreach(struct sk_buff *skb)
+{
+ struct inet_skb_parm parm;
+ struct net_device *dev;
+ int res;
+
+ /* Recompile ip options since IPCB may not be valid anymore.
+ * Also check we have a reasonable ipv4 header.
+ */
+ if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
+ ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
+ return;
+
+ memset(&parm, 0, sizeof(parm));
+ if (ip_hdr(skb)->ihl > 5) {
+ if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
+ return;
+ parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
+
+ rcu_read_lock();
+ dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
+ res = __ip_options_compile(dev_net(dev), &parm.opt, skb, NULL);
+ rcu_read_unlock();
+
+ if (res)
+ return;
+ }
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &parm);
+}
static void ipv4_link_failure(struct sk_buff *skb)
{
struct rtable *rt;
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+ ipv4_send_dest_unreach(skb);
rt = skb_rtable(skb);
if (rt)
@@ -1203,12 +1277,12 @@ static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
}
/*
- We do not cache source address of outgoing interface,
- because it is used only by IP RR, TS and SRR options,
- so that it out of fast path.
-
- BTW remember: "addr" is allowed to be not aligned
- in IP options!
+ * We do not cache source address of outgoing interface,
+ * because it is used only by IP RR, TS and SRR options,
+ * so that it is out of the fast path.
+ *
+ * BTW remember: "addr" is allowed to be not aligned
+ * in IP options!
*/
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
@@ -1219,22 +1293,19 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
src = ip_hdr(skb)->saddr;
else {
struct fib_result res;
- struct flowi4 fl4;
- struct iphdr *iph;
-
- iph = ip_hdr(skb);
-
- memset(&fl4, 0, sizeof(fl4));
- fl4.daddr = iph->daddr;
- fl4.saddr = iph->saddr;
- fl4.flowi4_tos = RT_TOS(iph->tos);
- fl4.flowi4_oif = rt->dst.dev->ifindex;
- fl4.flowi4_iif = skb->dev->ifindex;
- fl4.flowi4_mark = skb->mark;
+ struct iphdr *iph = ip_hdr(skb);
+ struct flowi4 fl4 = {
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowi4_dscp = ip4h_dscp(iph),
+ .flowi4_oif = rt->dst.dev->ifindex,
+ .flowi4_iif = skb->dev->ifindex,
+ .flowi4_mark = skb->mark,
+ };
rcu_read_lock();
if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
- src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
+ src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
else
src = inet_select_addr(rt->dst.dev,
rt_nexthop(rt, iph->daddr),
@@ -1257,36 +1328,25 @@ static void set_class_tag(struct rtable *rt, u32 tag)
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
- unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
- ip_rt_min_advmss);
+ unsigned int advmss;
+ struct net *net;
+
+ rcu_read_lock();
+ net = dst_dev_net_rcu(dst);
+ advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
+ net->ipv4.ip_rt_min_advmss);
+ rcu_read_unlock();
return min(advmss, IPV4_MAX_PMTU - header_size);
}
-static unsigned int ipv4_mtu(const struct dst_entry *dst)
+INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst)
{
- const struct rtable *rt = (const struct rtable *) dst;
- unsigned int mtu = rt->rt_pmtu;
-
- if (!mtu || time_after_eq(jiffies, rt->dst.expires))
- mtu = dst_metric_raw(dst, RTAX_MTU);
-
- if (mtu)
- return mtu;
-
- mtu = READ_ONCE(dst->dev->mtu);
-
- if (unlikely(ip_mtu_locked(dst))) {
- if (rt->rt_uses_gateway && mtu > 576)
- mtu = 576;
- }
-
- mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
-
- return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
+ return ip_dst_mtu_maybe_forward(dst, false);
}
+EXPORT_INDIRECT_CALLABLE(ipv4_mtu);
-static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
+static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
struct fnhe_hash_bucket *hash;
struct fib_nh_exception *fnhe, __rcu **fnhe_p;
@@ -1294,7 +1354,7 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
spin_lock_bh(&fnhe_lock);
- hash = rcu_dereference_protected(nh->nh_exceptions,
+ hash = rcu_dereference_protected(nhc->nhc_exceptions,
lockdep_is_held(&fnhe_lock));
hash += hval;
@@ -1304,6 +1364,10 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
if (fnhe->fnhe_daddr == daddr) {
rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
+ /* set fnhe_daddr to 0 to ensure it won't bind with
+ * new dsts in rt_bind_exception().
+ */
+ fnhe->fnhe_daddr = 0;
fnhe_flush_routes(fnhe);
kfree_rcu(fnhe, rcu);
break;
@@ -1316,9 +1380,10 @@ static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
spin_unlock_bh(&fnhe_lock);
}
-static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
+static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
+ __be32 daddr)
{
- struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
+ struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
struct fib_nh_exception *fnhe;
u32 hval;
@@ -1332,7 +1397,7 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
if (fnhe->fnhe_daddr == daddr) {
if (fnhe->fnhe_expires &&
time_after(jiffies, fnhe->fnhe_expires)) {
- ip_del_fnhe(nh, daddr);
+ ip_del_fnhe(nhc, daddr);
break;
}
return fnhe;
@@ -1349,19 +1414,19 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
+ struct fib_nh_common *nhc = res->nhc;
+ struct net_device *dev = nhc->nhc_dev;
struct fib_info *fi = res->fi;
- struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
- struct net_device *dev = nh->nh_dev;
u32 mtu = 0;
- if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
+ if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
mtu = fi->fib_mtu;
if (likely(!mtu)) {
struct fib_nh_exception *fnhe;
- fnhe = find_exception(nh, daddr);
+ fnhe = find_exception(nhc, daddr);
if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
mtu = fnhe->fnhe_pmtu;
}
@@ -1369,7 +1434,7 @@ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
if (likely(!mtu))
mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
- return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
+ return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
@@ -1400,8 +1465,10 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
orig = NULL;
}
fill_route_from_fnhe(rt, fnhe);
- if (!rt->rt_gateway)
- rt->rt_gateway = daddr;
+ if (!rt->rt_gw4) {
+ rt->rt_gw4 = daddr;
+ rt->rt_gw_family = AF_INET;
+ }
if (do_cache) {
dst_hold(&rt->dst);
@@ -1420,15 +1487,15 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
return ret;
}
-static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
+static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
struct rtable *orig, *prev, **p;
bool ret = true;
if (rt_is_input_route(rt)) {
- p = (struct rtable **)&nh->nh_rth_input;
+ p = (struct rtable **)&nhc->nhc_rth_input;
} else {
- p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
+ p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
}
orig = *p;
@@ -1439,7 +1506,7 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
prev = cmpxchg(p, orig, rt);
if (prev == orig) {
if (orig) {
- dst_dev_put(&orig->dst);
+ rt_add_uncached_list(orig);
dst_release(&orig->dst);
}
} else {
@@ -1461,51 +1528,49 @@ void rt_add_uncached_list(struct rtable *rt)
{
struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
- rt->rt_uncached_list = ul;
+ rt->dst.rt_uncached_list = ul;
spin_lock_bh(&ul->lock);
- list_add_tail(&rt->rt_uncached, &ul->head);
+ list_add_tail(&rt->dst.rt_uncached, &ul->head);
spin_unlock_bh(&ul->lock);
}
void rt_del_uncached_list(struct rtable *rt)
{
- if (!list_empty(&rt->rt_uncached)) {
- struct uncached_list *ul = rt->rt_uncached_list;
+ if (!list_empty(&rt->dst.rt_uncached)) {
+ struct uncached_list *ul = rt->dst.rt_uncached_list;
spin_lock_bh(&ul->lock);
- list_del(&rt->rt_uncached);
+ list_del_init(&rt->dst.rt_uncached);
spin_unlock_bh(&ul->lock);
}
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
- struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
- struct rtable *rt = (struct rtable *)dst;
-
- if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
- kfree(p);
-
- rt_del_uncached_list(rt);
+ ip_dst_metrics_put(dst);
+ rt_del_uncached_list(dst_rtable(dst));
}
void rt_flush_dev(struct net_device *dev)
{
- struct net *net = dev_net(dev);
- struct rtable *rt;
+ struct rtable *rt, *safe;
int cpu;
for_each_possible_cpu(cpu) {
struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
+ if (list_empty(&ul->head))
+ continue;
+
spin_lock_bh(&ul->lock);
- list_for_each_entry(rt, &ul->head, rt_uncached) {
+ list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
if (rt->dst.dev != dev)
continue;
- rt->dst.dev = net->loopback_dev;
- dev_hold(rt->dst.dev);
- dev_put(dev);
+ rt->dst.dev = blackhole_netdev;
+ netdev_ref_replace(dev, blackhole_netdev,
+ &rt->dst.dev_tracker, GFP_ATOMIC);
+ list_del_init(&rt->dst.rt_uncached);
}
spin_unlock_bh(&ul->lock);
}
@@ -1514,7 +1579,7 @@ void rt_flush_dev(struct net_device *dev)
static bool rt_cache_valid(const struct rtable *rt)
{
return rt &&
- rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
+ READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK &&
!rt_is_expired(rt);
}
@@ -1527,33 +1592,43 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
bool cached = false;
if (fi) {
- struct fib_nh *nh = &FIB_RES_NH(*res);
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
- if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
- rt->rt_gateway = nh->nh_gw;
+ if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
rt->rt_uses_gateway = 1;
+ rt->rt_gw_family = nhc->nhc_gw_family;
+ /* only INET and INET6 are supported */
+ if (likely(nhc->nhc_gw_family == AF_INET))
+ rt->rt_gw4 = nhc->nhc_gw.ipv4;
+ else
+ rt->rt_gw6 = nhc->nhc_gw.ipv6;
}
- dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
- if (fi->fib_metrics != &dst_default_metrics) {
- rt->dst._metrics |= DST_METRICS_REFCOUNTED;
- refcount_inc(&fi->fib_metrics->refcnt);
- }
+
+ ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
+
#ifdef CONFIG_IP_ROUTE_CLASSID
- rt->dst.tclassid = nh->nh_tclassid;
+ if (nhc->nhc_family == AF_INET) {
+ struct fib_nh *nh;
+
+ nh = container_of(nhc, struct fib_nh, nh_common);
+ rt->dst.tclassid = nh->nh_tclassid;
+ }
#endif
- rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
+ rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
if (unlikely(fnhe))
cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
else if (do_cache)
- cached = rt_cache_route(nh, rt);
+ cached = rt_cache_route(nhc, rt);
if (unlikely(!cached)) {
/* Routes we intend to cache in nexthop exception or
* FIB nexthop have the DST_NOCACHE bit clear.
* However, if we are unsuccessful at storing this
* route into the cache we really need to set it.
*/
- if (!rt->rt_gateway)
- rt->rt_gateway = daddr;
+ if (!rt->rt_gw4) {
+ rt->rt_gw_family = AF_INET;
+ rt->rt_gw4 = daddr;
+ }
rt_add_uncached_list(rt);
}
} else
@@ -1569,13 +1644,11 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
struct rtable *rt_dst_alloc(struct net_device *dev,
unsigned int flags, u16 type,
- bool nopolicy, bool noxfrm, bool will_cache)
+ bool noxfrm)
{
struct rtable *rt;
- rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
- (will_cache ? 0 : DST_HOST) |
- (nopolicy ? DST_NOPOLICY : 0) |
+ rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
(noxfrm ? DST_NOXFRM : 0));
if (rt) {
@@ -1586,9 +1659,9 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
rt->rt_iif = 0;
rt->rt_pmtu = 0;
rt->rt_mtu_locked = 0;
- rt->rt_gateway = 0;
rt->rt_uses_gateway = 0;
- INIT_LIST_HEAD(&rt->rt_uncached);
+ rt->rt_gw_family = 0;
+ rt->rt_gw4 = 0;
rt->dst.output = ip_output;
if (flags & RTCF_LOCAL)
@@ -1599,57 +1672,97 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
}
EXPORT_SYMBOL(rt_dst_alloc);
+struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
+{
+ struct rtable *new_rt;
+
+ new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK,
+ rt->dst.flags);
+
+ if (new_rt) {
+ new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
+ new_rt->rt_flags = rt->rt_flags;
+ new_rt->rt_type = rt->rt_type;
+ new_rt->rt_is_input = rt->rt_is_input;
+ new_rt->rt_iif = rt->rt_iif;
+ new_rt->rt_pmtu = rt->rt_pmtu;
+ new_rt->rt_mtu_locked = rt->rt_mtu_locked;
+ new_rt->rt_gw_family = rt->rt_gw_family;
+ if (rt->rt_gw_family == AF_INET)
+ new_rt->rt_gw4 = rt->rt_gw4;
+ else if (rt->rt_gw_family == AF_INET6)
+ new_rt->rt_gw6 = rt->rt_gw6;
+
+ new_rt->dst.input = READ_ONCE(rt->dst.input);
+ new_rt->dst.output = READ_ONCE(rt->dst.output);
+ new_rt->dst.error = rt->dst.error;
+ new_rt->dst.lastuse = jiffies;
+ new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
+ }
+ return new_rt;
+}
+EXPORT_SYMBOL(rt_dst_clone);
+
/* called in rcu_read_lock() section */
-int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev,
- struct in_device *in_dev, u32 *itag)
+enum skb_drop_reason
+ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ dscp_t dscp, struct net_device *dev,
+ struct in_device *in_dev, u32 *itag)
{
- int err;
+ enum skb_drop_reason reason;
/* Primary sanity checks. */
if (!in_dev)
- return -EINVAL;
+ return SKB_DROP_REASON_NOT_SPECIFIED;
- if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
- skb->protocol != htons(ETH_P_IP))
- return -EINVAL;
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
+ return SKB_DROP_REASON_IP_INVALID_SOURCE;
+
+ if (skb->protocol != htons(ETH_P_IP))
+ return SKB_DROP_REASON_INVALID_PROTO;
if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
- return -EINVAL;
+ return SKB_DROP_REASON_IP_LOCALNET;
if (ipv4_is_zeronet(saddr)) {
- if (!ipv4_is_local_multicast(daddr))
- return -EINVAL;
+ if (!ipv4_is_local_multicast(daddr) &&
+ ip_hdr(skb)->protocol != IPPROTO_IGMP)
+ return SKB_DROP_REASON_IP_INVALID_SOURCE;
} else {
- err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
- in_dev, itag);
- if (err < 0)
- return err;
+ reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0,
+ dev, in_dev, itag);
+ if (reason)
+ return reason;
}
- return 0;
+ return SKB_NOT_DROPPED_YET;
}
/* called in rcu_read_lock() section */
-static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev, int our)
+static enum skb_drop_reason
+ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ dscp_t dscp, struct net_device *dev, int our)
{
struct in_device *in_dev = __in_dev_get_rcu(dev);
unsigned int flags = RTCF_MULTICAST;
+ enum skb_drop_reason reason;
struct rtable *rth;
u32 itag = 0;
- int err;
- err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
- if (err)
- return err;
+ reason = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev,
+ &itag);
+ if (reason)
+ return reason;
if (our)
flags |= RTCF_LOCAL;
+ if (IN_DEV_ORCONF(in_dev, NOPOLICY))
+ IPCB(skb)->flags |= IPSKB_NOPOLICY;
+
rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
- IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
+ false);
if (!rth)
- return -ENOBUFS;
+ return SKB_DROP_REASON_NOMEM;
#ifdef CONFIG_IP_ROUTE_CLASSID
rth->dst.tclassid = itag;
@@ -1663,8 +1776,9 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
#endif
RT_CACHE_STAT_INC(in_slow_mc);
+ skb_dst_drop(skb);
skb_dst_set(skb, &rth->dst);
- return 0;
+ return SKB_NOT_DROPPED_YET;
}
@@ -1687,18 +1801,21 @@ static void ip_handle_martian_source(struct net_device *dev,
print_hex_dump(KERN_WARNING, "ll header: ",
DUMP_PREFIX_OFFSET, 16, 1,
skb_mac_header(skb),
- dev->hard_header_len, true);
+ dev->hard_header_len, false);
}
}
#endif
}
/* called in rcu_read_lock() section */
-static int __mkroute_input(struct sk_buff *skb,
- const struct fib_result *res,
- struct in_device *in_dev,
- __be32 daddr, __be32 saddr, u32 tos)
-{
+static enum skb_drop_reason
+__mkroute_input(struct sk_buff *skb, const struct fib_result *res,
+ struct in_device *in_dev, __be32 daddr,
+ __be32 saddr, dscp_t dscp)
+{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+ struct net_device *dev = nhc->nhc_dev;
struct fib_nh_exception *fnhe;
struct rtable *rth;
int err;
@@ -1707,15 +1824,16 @@ static int __mkroute_input(struct sk_buff *skb,
u32 itag = 0;
/* get a working reference to the output device */
- out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
+ out_dev = __in_dev_get_rcu(dev);
if (!out_dev) {
net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
- return -EINVAL;
+ return reason;
}
- err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
+ err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res),
in_dev->dev, in_dev, &itag);
if (err < 0) {
+ reason = -err;
ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
saddr);
@@ -1724,10 +1842,14 @@ static int __mkroute_input(struct sk_buff *skb,
do_cache = res->fi && !itag;
if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
- skb->protocol == htons(ETH_P_IP) &&
- (IN_DEV_SHARED_MEDIA(out_dev) ||
- inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
- IPCB(skb)->flags |= IPSKB_DOREDIRECT;
+ skb->protocol == htons(ETH_P_IP)) {
+ __be32 gw;
+
+ gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
+ if (IN_DEV_SHARED_MEDIA(out_dev) ||
+ inet_addr_onlink(out_dev, saddr, gw))
+ IPCB(skb)->flags |= IPSKB_DOREDIRECT;
+ }
if (skb->protocol != htons(ETH_P_IP)) {
/* Not IP (i.e. ARP). Do not create route, if it is
@@ -1739,17 +1861,20 @@ static int __mkroute_input(struct sk_buff *skb,
*/
if (out_dev == in_dev &&
IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
- err = -EINVAL;
+ reason = SKB_DROP_REASON_ARP_PVLAN_DISABLE;
goto cleanup;
}
}
- fnhe = find_exception(&FIB_RES_NH(*res), daddr);
+ if (IN_DEV_ORCONF(in_dev, NOPOLICY))
+ IPCB(skb)->flags |= IPSKB_NOPOLICY;
+
+ fnhe = find_exception(nhc, daddr);
if (do_cache) {
if (fnhe)
rth = rcu_dereference(fnhe->fnhe_rth_input);
else
- rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+ rth = rcu_dereference(nhc->nhc_rth_input);
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
goto out;
@@ -1757,10 +1882,9 @@ static int __mkroute_input(struct sk_buff *skb,
}
rth = rt_dst_alloc(out_dev->dev, 0, res->type,
- IN_DEV_CONF_GET(in_dev, NOPOLICY),
- IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
+ IN_DEV_ORCONF(out_dev, NOXFRM));
if (!rth) {
- err = -ENOBUFS;
+ reason = SKB_DROP_REASON_NOMEM;
goto cleanup;
}
@@ -1774,9 +1898,9 @@ static int __mkroute_input(struct sk_buff *skb,
lwtunnel_set_redirect(&rth->dst);
skb_dst_set(skb, &rth->dst);
out:
- err = 0;
- cleanup:
- return err;
+ reason = SKB_NOT_DROPPED_YET;
+cleanup:
+ return reason;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1804,10 +1928,7 @@ static void ip_multipath_l3_keys(const struct sk_buff *skb,
if (!icmph)
goto out;
- if (icmph->type != ICMP_DEST_UNREACH &&
- icmph->type != ICMP_REDIRECT &&
- icmph->type != ICMP_TIME_EXCEEDED &&
- icmph->type != ICMP_PARAMETERPROB)
+ if (!icmp_is_err(icmph->type))
goto out;
inner_iph = skb_header_pointer(skb,
@@ -1822,14 +1943,134 @@ out:
hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}
+static u32 fib_multipath_custom_hash_outer(const struct net *net,
+ const struct sk_buff *skb,
+ bool *p_has_inner)
+{
+ u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
+ struct flow_keys keys, hash_keys;
+
+ if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
+ return 0;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
+
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
+ hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
+ hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
+ hash_keys.basic.ip_proto = keys.basic.ip_proto;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
+ hash_keys.ports.src = keys.ports.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
+ hash_keys.ports.dst = keys.ports.dst;
+
+ *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
+ return fib_multipath_hash_from_keys(net, &hash_keys);
+}
+
+static u32 fib_multipath_custom_hash_inner(const struct net *net,
+ const struct sk_buff *skb,
+ bool has_inner)
+{
+ u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
+ struct flow_keys keys, hash_keys;
+
+ /* We assume the packet carries an encapsulation, but if none was
+ * encountered during dissection of the outer flow, then there is no
+ * point in calling the flow dissector again.
+ */
+ if (!has_inner)
+ return 0;
+
+ if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
+ return 0;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ skb_flow_dissect_flow_keys(skb, &keys, 0);
+
+ if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
+ return 0;
+
+ if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
+ hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
+ hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+ } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
+ hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
+ hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
+ hash_keys.tags.flow_label = keys.tags.flow_label;
+ }
+
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
+ hash_keys.basic.ip_proto = keys.basic.ip_proto;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
+ hash_keys.ports.src = keys.ports.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
+ hash_keys.ports.dst = keys.ports.dst;
+
+ return fib_multipath_hash_from_keys(net, &hash_keys);
+}
+
+static u32 fib_multipath_custom_hash_skb(const struct net *net,
+ const struct sk_buff *skb)
+{
+ u32 mhash, mhash_inner;
+ bool has_inner = true;
+
+ mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner);
+ mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner);
+
+ return jhash_2words(mhash, mhash_inner, 0);
+}
+
+static u32 fib_multipath_custom_hash_fl4(const struct net *net,
+ const struct flowi4 *fl4)
+{
+ u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields);
+ struct flow_keys hash_keys;
+
+ if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
+ return 0;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
+ hash_keys.addrs.v4addrs.src = fl4->saddr;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
+ hash_keys.addrs.v4addrs.dst = fl4->daddr;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
+ hash_keys.basic.ip_proto = fl4->flowi4_proto;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) {
+ if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
+ hash_keys.ports.src = (__force __be16)get_random_u16();
+ else
+ hash_keys.ports.src = fl4->fl4_sport;
+ }
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
+ hash_keys.ports.dst = fl4->fl4_dport;
+
+ return fib_multipath_hash_from_keys(net, &hash_keys);
+}
+
/* if skb is set it will be used and fl4 can be NULL */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
const struct sk_buff *skb, struct flow_keys *flkeys)
{
+ u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
struct flow_keys hash_keys;
- u32 mhash;
+ u32 mhash = 0;
- switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
+ switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
case 0:
memset(&hash_keys, 0, sizeof(hash_keys));
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
@@ -1839,6 +2080,7 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
hash_keys.addrs.v4addrs.src = fl4->saddr;
hash_keys.addrs.v4addrs.dst = fl4->daddr;
}
+ mhash = fib_multipath_hash_from_keys(net, &hash_keys);
break;
case 1:
/* skb is currently provided only when forwarding */
@@ -1868,40 +2110,148 @@ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
hash_keys.addrs.v4addrs.src = fl4->saddr;
hash_keys.addrs.v4addrs.dst = fl4->daddr;
- hash_keys.ports.src = fl4->fl4_sport;
+ if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT)
+ hash_keys.ports.src = (__force __be16)get_random_u16();
+ else
+ hash_keys.ports.src = fl4->fl4_sport;
hash_keys.ports.dst = fl4->fl4_dport;
hash_keys.basic.ip_proto = fl4->flowi4_proto;
}
+ mhash = fib_multipath_hash_from_keys(net, &hash_keys);
+ break;
+ case 2:
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ /* skb is currently provided only when forwarding */
+ if (skb) {
+ struct flow_keys keys;
+
+ skb_flow_dissect_flow_keys(skb, &keys, 0);
+ /* Inner can be v4 or v6 */
+ if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+ } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+ hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
+ hash_keys.tags.flow_label = keys.tags.flow_label;
+ hash_keys.basic.ip_proto = keys.basic.ip_proto;
+ } else {
+ /* Same as case 0 */
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ ip_multipath_l3_keys(skb, &hash_keys);
+ }
+ } else {
+ /* Same as case 0 */
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ hash_keys.addrs.v4addrs.src = fl4->saddr;
+ hash_keys.addrs.v4addrs.dst = fl4->daddr;
+ }
+ mhash = fib_multipath_hash_from_keys(net, &hash_keys);
+ break;
+ case 3:
+ if (skb)
+ mhash = fib_multipath_custom_hash_skb(net, skb);
+ else
+ mhash = fib_multipath_custom_hash_fl4(net, fl4);
break;
}
- mhash = flow_hash_from_keys(&hash_keys);
+
+ if (multipath_hash)
+ mhash = jhash_2words(mhash, multipath_hash, 0);
return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
-static int ip_mkroute_input(struct sk_buff *skb,
- struct fib_result *res,
- struct in_device *in_dev,
- __be32 daddr, __be32 saddr, u32 tos,
- struct flow_keys *hkeys)
+static enum skb_drop_reason
+ip_mkroute_input(struct sk_buff *skb, struct fib_result *res,
+ struct in_device *in_dev, __be32 daddr,
+ __be32 saddr, dscp_t dscp, struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi && res->fi->fib_nhs > 1) {
+ if (res->fi && fib_info_num_path(res->fi) > 1) {
int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
- fib_select_multipath(res, h);
+ fib_select_multipath(res, h, NULL);
+ IPCB(skb)->flags |= IPSKB_MULTIPATH;
}
#endif
/* create a routing cache entry */
- return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
+ return __mkroute_input(skb, res, in_dev, daddr, saddr, dscp);
+}
+
+/* Implements the same saddr-related checks as ip_route_input_slow(),
+ * assuming daddr is valid and the destination is not a local broadcast one.
+ * Uses the provided hint instead of performing a route lookup.
+ */
+enum skb_drop_reason
+ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ dscp_t dscp, struct net_device *dev,
+ const struct sk_buff *hint)
+{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+ struct rtable *rt = skb_rtable(hint);
+ struct net *net = dev_net(dev);
+ u32 tag = 0;
+
+ if (!in_dev)
+ return reason;
+
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) {
+ reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
+ goto martian_source;
+ }
+
+ if (ipv4_is_zeronet(saddr)) {
+ reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
+ goto martian_source;
+ }
+
+ if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
+ reason = SKB_DROP_REASON_IP_LOCALNET;
+ goto martian_source;
+ }
+
+ if (!(rt->rt_flags & RTCF_LOCAL))
+ goto skip_validate_source;
+
+ reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev,
+ in_dev, &tag);
+ if (reason)
+ goto martian_source;
+
+skip_validate_source:
+ skb_dst_copy(skb, hint);
+ return SKB_NOT_DROPPED_YET;
+
+martian_source:
+ ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+ return reason;
+}
+
+/* get device for dst_alloc with local routes */
+static struct net_device *ip_rt_get_dev(struct net *net,
+ const struct fib_result *res)
+{
+ struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
+ struct net_device *dev = NULL;
+
+ if (nhc)
+ dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
+
+ return dev ? : net->loopback_dev;
}
/*
* NOTE. We drop all the packets that has local source
* addresses, because every properly looped back packet
* must have correct destination already attached by output routine.
+ * Changes in the enforced policies must be applied also to
+ * ip_route_use_hint().
*
* Such approach solves two big problems:
* 1. Not simplex devices are handled properly.
@@ -1909,10 +2259,12 @@ static int ip_mkroute_input(struct sk_buff *skb,
* called with rcu_read_lock()
*/
-static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev,
- struct fib_result *res)
+static enum skb_drop_reason
+ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ dscp_t dscp, struct net_device *dev,
+ struct fib_result *res)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct in_device *in_dev = __in_dev_get_rcu(dev);
struct flow_keys *flkeys = NULL, _flkeys;
struct net *net = dev_net(dev);
@@ -1922,7 +2274,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u32 itag = 0;
struct rtable *rth;
struct flowi4 fl4;
- bool do_cache;
+ bool do_cache = true;
/* IP on this device is disabled. */
@@ -1930,7 +2282,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto out;
/* Check for the most weird martians, which can be not detected
- by fib_lookup.
+ * by fib_lookup.
*/
tun_info = skb_tunnel_info(skb);
@@ -1940,8 +2292,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
fl4.flowi4_tun_key.tun_id = 0;
skb_dst_drop(skb);
- if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
+ if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) {
+ reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
goto martian_source;
+ }
res->fi = NULL;
res->table = NULL;
@@ -1951,35 +2305,45 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
/* Accept zero addresses only to limited broadcast;
* I even do not know to fix it or not. Waiting for complains :-)
*/
- if (ipv4_is_zeronet(saddr))
+ if (ipv4_is_zeronet(saddr)) {
+ reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
goto martian_source;
+ }
- if (ipv4_is_zeronet(daddr))
+ if (ipv4_is_zeronet(daddr)) {
+ reason = SKB_DROP_REASON_IP_INVALID_DEST;
goto martian_destination;
+ }
/* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
* and call it once if daddr or/and saddr are loopback addresses
*/
if (ipv4_is_loopback(daddr)) {
- if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+ if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
+ reason = SKB_DROP_REASON_IP_LOCALNET;
goto martian_destination;
+ }
} else if (ipv4_is_loopback(saddr)) {
- if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
+ if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
+ reason = SKB_DROP_REASON_IP_LOCALNET;
goto martian_source;
+ }
}
/*
* Now we are ready to route packet.
*/
+ fl4.flowi4_l3mdev = 0;
fl4.flowi4_oif = 0;
fl4.flowi4_iif = dev->ifindex;
fl4.flowi4_mark = skb->mark;
- fl4.flowi4_tos = tos;
+ fl4.flowi4_dscp = dscp;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_flags = 0;
fl4.daddr = daddr;
fl4.saddr = saddr;
fl4.flowi4_uid = sock_net_uid(net, NULL);
+ fl4.flowi4_multipath_hash = 0;
if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
flkeys = &_flkeys;
@@ -1999,13 +2363,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (res->type == RTN_BROADCAST) {
if (IN_DEV_BFORWARD(in_dev))
goto make_route;
+ /* do not cache if bc_forwarding is enabled */
+ if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING))
+ do_cache = false;
goto brd_input;
}
+ err = -EINVAL;
if (res->type == RTN_LOCAL) {
- err = fib_validate_source(skb, saddr, daddr, tos,
- 0, dev, in_dev, &itag);
- if (err < 0)
+ reason = fib_validate_source_reason(skb, saddr, daddr, dscp,
+ 0, dev, in_dev, &itag);
+ if (reason)
goto martian_source;
goto local_input;
}
@@ -2014,21 +2382,28 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
err = -EHOSTUNREACH;
goto no_route;
}
- if (res->type != RTN_UNICAST)
+ if (res->type != RTN_UNICAST) {
+ reason = SKB_DROP_REASON_IP_INVALID_DEST;
goto martian_destination;
+ }
make_route:
- err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
-out: return err;
+ reason = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp,
+ flkeys);
+
+out:
+ return reason;
brd_input:
- if (skb->protocol != htons(ETH_P_IP))
- goto e_inval;
+ if (skb->protocol != htons(ETH_P_IP)) {
+ reason = SKB_DROP_REASON_INVALID_PROTO;
+ goto out;
+ }
if (!ipv4_is_zeronet(saddr)) {
- err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
- in_dev, &itag);
- if (err < 0)
+ reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0,
+ dev, in_dev, &itag);
+ if (reason)
goto martian_source;
}
flags |= RTCF_BROADCAST;
@@ -2036,22 +2411,23 @@ brd_input:
RT_CACHE_STAT_INC(in_brd);
local_input:
- do_cache = false;
- if (res->fi) {
- if (!itag) {
- rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
- if (rt_cache_valid(rth)) {
- skb_dst_set_noref(skb, &rth->dst);
- err = 0;
- goto out;
- }
- do_cache = true;
+ if (IN_DEV_ORCONF(in_dev, NOPOLICY))
+ IPCB(skb)->flags |= IPSKB_NOPOLICY;
+
+ do_cache &= res->fi && !itag;
+ if (do_cache) {
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+
+ rth = rcu_dereference(nhc->nhc_rth_input);
+ if (rt_cache_valid(rth)) {
+ skb_dst_set_noref(skb, &rth->dst);
+ reason = SKB_NOT_DROPPED_YET;
+ goto out;
}
}
- rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
- flags | RTCF_LOCAL, res->type,
- IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
+ rth = rt_dst_alloc(ip_rt_get_dev(net, res),
+ flags | RTCF_LOCAL, res->type, false);
if (!rth)
goto e_nobufs;
@@ -2065,24 +2441,24 @@ local_input:
if (res->type == RTN_UNREACHABLE) {
rth->dst.input= ip_error;
rth->dst.error= -err;
- rth->rt_flags &= ~RTCF_LOCAL;
+ rth->rt_flags &= ~RTCF_LOCAL;
}
if (do_cache) {
- struct fib_nh *nh = &FIB_RES_NH(*res);
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
- rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
+ rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
WARN_ON(rth->dst.input == lwtunnel_input);
rth->dst.lwtstate->orig_input = rth->dst.input;
rth->dst.input = lwtunnel_input;
}
- if (unlikely(!rt_cache_route(nh, rth)))
+ if (unlikely(!rt_cache_route(nhc, rth)))
rt_add_uncached_list(rth);
}
skb_dst_set(skb, &rth->dst);
- err = 0;
+ reason = SKB_NOT_DROPPED_YET;
goto out;
no_route:
@@ -2102,13 +2478,10 @@ martian_destination:
net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
&daddr, &saddr, dev->name);
#endif
-
-e_inval:
- err = -EINVAL;
goto out;
e_nobufs:
- err = -ENOBUFS;
+ reason = SKB_DROP_REASON_NOMEM;
goto out;
martian_source:
@@ -2116,47 +2489,36 @@ martian_source:
goto out;
}
-int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev)
-{
- struct fib_result res;
- int err;
-
- tos &= IPTOS_RT_MASK;
- rcu_read_lock();
- err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
- rcu_read_unlock();
-
- return err;
-}
-EXPORT_SYMBOL(ip_route_input_noref);
-
/* called with rcu_read_lock held */
-int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
- u8 tos, struct net_device *dev, struct fib_result *res)
+static enum skb_drop_reason
+ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
+ dscp_t dscp, struct net_device *dev,
+ struct fib_result *res)
{
/* Multicast recognition logic is moved from route cache to here.
- The problem was that too many Ethernet cards have broken/missing
- hardware multicast filters :-( As result the host on multicasting
- network acquires a lot of useless route cache entries, sort of
- SDR messages from all the world. Now we try to get rid of them.
- Really, provided software IP multicast filter is organized
- reasonably (at least, hashed), it does not result in a slowdown
- comparing with route cache reject entries.
- Note, that multicast routers are not affected, because
- route cache entry is created eventually.
+ * The problem was that too many Ethernet cards have broken/missing
+ * hardware multicast filters :-( As result the host on multicasting
+ * network acquires a lot of useless route cache entries, sort of
+ * SDR messages from all the world. Now we try to get rid of them.
+ * Really, provided software IP multicast filter is organized
+ * reasonably (at least, hashed), it does not result in a slowdown
+ * comparing with route cache reject entries.
+ * Note, that multicast routers are not affected, because
+ * route cache entry is created eventually.
*/
if (ipv4_is_multicast(daddr)) {
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct in_device *in_dev = __in_dev_get_rcu(dev);
int our = 0;
- int err = -EINVAL;
- if (in_dev)
- our = ip_check_mc_rcu(in_dev, daddr, saddr,
- ip_hdr(skb)->protocol);
+ if (!in_dev)
+ return reason;
+
+ our = ip_check_mc_rcu(in_dev, daddr, saddr,
+ ip_hdr(skb)->protocol);
/* check l3 master if no match yet */
- if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
+ if (!our && netif_is_l3_slave(dev)) {
struct in_device *l3_in_dev;
l3_in_dev = __in_dev_get_rcu(skb->dev);
@@ -2172,14 +2534,29 @@ int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
IN_DEV_MFORWARD(in_dev))
#endif
) {
- err = ip_route_input_mc(skb, daddr, saddr,
- tos, dev, our);
+ reason = ip_route_input_mc(skb, daddr, saddr, dscp,
+ dev, our);
}
- return err;
+ return reason;
}
- return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
+ return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res);
+}
+
+enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr,
+ __be32 saddr, dscp_t dscp,
+ struct net_device *dev)
+{
+ enum skb_drop_reason reason;
+ struct fib_result res;
+
+ rcu_read_lock();
+ reason = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res);
+ rcu_read_unlock();
+
+ return reason;
}
+EXPORT_SYMBOL(ip_route_input_noref);
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
@@ -2204,12 +2581,16 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
!netif_is_l3_master(dev_out))
return ERR_PTR(-EINVAL);
- if (ipv4_is_lbcast(fl4->daddr))
+ if (ipv4_is_lbcast(fl4->daddr)) {
type = RTN_BROADCAST;
- else if (ipv4_is_multicast(fl4->daddr))
+
+ /* reset fi to prevent gateway resolution */
+ fi = NULL;
+ } else if (ipv4_is_multicast(fl4->daddr)) {
type = RTN_MULTICAST;
- else if (ipv4_is_zeronet(fl4->daddr))
+ } else if (ipv4_is_zeronet(fl4->daddr)) {
return ERR_PTR(-EINVAL);
+ }
if (dev_out->flags & IFF_LOOPBACK)
flags |= RTCF_LOCAL;
@@ -2217,7 +2598,6 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
do_cache = true;
if (type == RTN_BROADCAST) {
flags |= RTCF_BROADCAST | RTCF_LOCAL;
- fi = NULL;
} else if (type == RTN_MULTICAST) {
flags |= RTCF_MULTICAST | RTCF_LOCAL;
if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
@@ -2248,10 +2628,10 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
fnhe = NULL;
do_cache &= fi != NULL;
if (fi) {
+ struct fib_nh_common *nhc = FIB_RES_NHC(*res);
struct rtable __rcu **prth;
- struct fib_nh *nh = &FIB_RES_NH(*res);
- fnhe = find_exception(nh, fl4->daddr);
+ fnhe = find_exception(nhc, fl4->daddr);
if (!do_cache)
goto add;
if (fnhe) {
@@ -2259,12 +2639,12 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
} else {
if (unlikely(fl4->flowi4_flags &
FLOWI_FLAG_KNOWN_NH &&
- !(nh->nh_gw &&
- nh->nh_scope == RT_SCOPE_LINK))) {
+ !(nhc->nhc_gw_family &&
+ nhc->nhc_scope == RT_SCOPE_LINK))) {
do_cache = false;
goto add;
}
- prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
+ prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
}
rth = rcu_dereference(*prth);
if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
@@ -2273,9 +2653,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
add:
rth = rt_dst_alloc(dev_out, flags, type,
- IN_DEV_CONF_GET(in_dev, NOPOLICY),
- IN_DEV_CONF_GET(in_dev, NOXFRM),
- do_cache);
+ IN_DEV_ORCONF(in_dev, NOXFRM));
if (!rth)
return ERR_PTR(-ENOBUFS);
@@ -2294,7 +2672,7 @@ add:
if (IN_DEV_MFORWARD(in_dev) &&
!ipv4_is_local_multicast(fl4->daddr)) {
rth->dst.input = ip_mr_input;
- rth->dst.output = ip_mc_output;
+ rth->dst.output = ip_mr_output;
}
}
#endif
@@ -2313,7 +2691,6 @@ add:
struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
const struct sk_buff *skb)
{
- __u8 tos = RT_FL_TOS(fl4);
struct fib_result res = {
.type = RTN_UNSPEC,
.fi = NULL,
@@ -2323,9 +2700,6 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
struct rtable *rth;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
- fl4->flowi4_tos = tos & IPTOS_RT_MASK;
- fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
- RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
rcu_read_lock();
rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
@@ -2343,21 +2717,23 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
int orig_oif = fl4->flowi4_oif;
unsigned int flags = 0;
struct rtable *rth;
- int err = -ENETUNREACH;
+ int err;
if (fl4->saddr) {
- rth = ERR_PTR(-EINVAL);
if (ipv4_is_multicast(fl4->saddr) ||
- ipv4_is_lbcast(fl4->saddr) ||
- ipv4_is_zeronet(fl4->saddr))
+ ipv4_is_lbcast(fl4->saddr)) {
+ rth = ERR_PTR(-EINVAL);
goto out;
+ }
+
+ rth = ERR_PTR(-ENETUNREACH);
/* I removed check for oif == dev_out->oif here.
- It was wrong for two reasons:
- 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
- is assigned to multiple interfaces.
- 2. Moreover, we are allowed to send packets with saddr
- of another iface. --ANK
+ * It was wrong for two reasons:
+ * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
+ * is assigned to multiple interfaces.
+ * 2. Moreover, we are allowed to send packets with saddr
+ * of another iface. --ANK
*/
if (fl4->flowi4_oif == 0 &&
@@ -2369,18 +2745,18 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
goto out;
/* Special hack: user can direct multicasts
- and limited broadcast via necessary interface
- without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
- This hack is not just for fun, it allows
- vic,vat and friends to work.
- They bind socket to loopback, set ttl to zero
- and expect that it will work.
- From the viewpoint of routing cache they are broken,
- because we are not allowed to build multicast path
- with loopback source addr (look, routing cache
- cannot know, that ttl is zero, so that packet
- will not leave this host and route is valid).
- Luckily, this hack is good workaround.
+ * and limited broadcast via necessary interface
+ * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
+ * This hack is not just for fun, it allows
+ * vic,vat and friends to work.
+ * They bind socket to loopback, set ttl to zero
+ * and expect that it will work.
+ * From the viewpoint of routing cache they are broken,
+ * because we are not allowed to build multicast path
+ * with loopback source addr (look, routing cache
+ * cannot know, that ttl is zero, so that packet
+ * will not leave this host and route is valid).
+ * Luckily, this hack is good workaround.
*/
fl4->flowi4_oif = dev_out->ifindex;
@@ -2440,24 +2816,23 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
res->fi = NULL;
res->table = NULL;
if (fl4->flowi4_oif &&
- (ipv4_is_multicast(fl4->daddr) ||
- !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
+ (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
/* Apparently, routing tables are wrong. Assume,
- that the destination is on link.
-
- WHY? DW.
- Because we are allowed to send to iface
- even if it has NO routes and NO assigned
- addresses. When oif is specified, routing
- tables are looked up with only one purpose:
- to catch if destination is gatewayed, rather than
- direct. Moreover, if MSG_DONTROUTE is set,
- we send packet, ignoring both routing tables
- and ifaddr state. --ANK
-
-
- We could make it even if oif is unknown,
- likely IPv6, but we do not.
+ * that the destination is on link.
+ *
+ * WHY? DW.
+ * Because we are allowed to send to iface
+ * even if it has NO routes and NO assigned
+ * addresses. When oif is specified, routing
+ * tables are looked up with only one purpose:
+ * to catch if destination is gatewayed, rather than
+ * direct. Moreover, if MSG_DONTROUTE is set,
+ * we send packet, ignoring both routing tables
+ * and ifaddr state. --ANK
+ *
+ *
+ * We could make it even if oif is unknown,
+ * likely IPv6, but we do not.
*/
if (fl4->saddr == 0)
@@ -2495,8 +2870,6 @@ struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
fib_select_path(net, res, fl4, skb);
dev_out = FIB_RES_DEV(*res);
- fl4->flowi4_oif = dev_out->ifindex;
-
make_route:
rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
@@ -2505,51 +2878,23 @@ out:
return rth;
}
-static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
-{
- return NULL;
-}
-
-static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
-{
- unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
-
- return mtu ? : dst->dev->mtu;
-}
-
-static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
-{
-}
-
-static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb)
-{
-}
-
-static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
- unsigned long old)
-{
- return NULL;
-}
-
static struct dst_ops ipv4_dst_blackhole_ops = {
- .family = AF_INET,
- .check = ipv4_blackhole_dst_check,
- .mtu = ipv4_blackhole_mtu,
- .default_advmss = ipv4_default_advmss,
- .update_pmtu = ipv4_rt_blackhole_update_pmtu,
- .redirect = ipv4_rt_blackhole_redirect,
- .cow_metrics = ipv4_rt_blackhole_cow_metrics,
- .neigh_lookup = ipv4_neigh_lookup,
+ .family = AF_INET,
+ .default_advmss = ipv4_default_advmss,
+ .neigh_lookup = ipv4_neigh_lookup,
+ .check = dst_blackhole_check,
+ .cow_metrics = dst_blackhole_cow_metrics,
+ .update_pmtu = dst_blackhole_update_pmtu,
+ .redirect = dst_blackhole_redirect,
+ .mtu = dst_blackhole_mtu,
};
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
- struct rtable *ort = (struct rtable *) dst_orig;
+ struct rtable *ort = dst_rtable(dst_orig);
struct rtable *rt;
- rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
+ rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0);
if (rt) {
struct dst_entry *new = &rt->dst;
@@ -2558,8 +2903,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
new->output = dst_discard_out;
new->dev = net->loopback_dev;
- if (new->dev)
- dev_hold(new->dev);
+ netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);
rt->rt_is_input = ort->rt_is_input;
rt->rt_iif = ort->rt_iif;
@@ -2569,10 +2913,12 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
rt->rt_genid = rt_genid_ipv4(net);
rt->rt_flags = ort->rt_flags;
rt->rt_type = ort->rt_type;
- rt->rt_gateway = ort->rt_gateway;
rt->rt_uses_gateway = ort->rt_uses_gateway;
-
- INIT_LIST_HEAD(&rt->rt_uncached);
+ rt->rt_gw_family = ort->rt_gw_family;
+ if (rt->rt_gw_family == AF_INET)
+ rt->rt_gw4 = ort->rt_gw4;
+ else if (rt->rt_gw_family == AF_INET6)
+ rt->rt_gw6 = ort->rt_gw6;
}
dst_release(dst_orig);
@@ -2588,10 +2934,12 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
if (IS_ERR(rt))
return rt;
- if (flp4->flowi4_proto)
- rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
- flowi4_to_flowi(flp4),
- sk, 0);
+ if (flp4->flowi4_proto) {
+ flp4->flowi4_oif = rt->dst.dev->ifindex;
+ rt = dst_rtable(xfrm_lookup_route(net, &rt->dst,
+ flowi4_to_flowi(flp4),
+ sk, 0));
+ }
return rt;
}
@@ -2599,8 +2947,9 @@ EXPORT_SYMBOL_GPL(ip_route_output_flow);
/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
- struct rtable *rt, u32 table_id, struct flowi4 *fl4,
- struct sk_buff *skb, u32 portid, u32 seq)
+ struct rtable *rt, u32 table_id, dscp_t dscp,
+ struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
+ u32 seq, unsigned int flags)
{
struct rtmsg *r;
struct nlmsghdr *nlh;
@@ -2608,7 +2957,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
u32 error;
u32 metrics[RTAX_MAX];
- nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
+ nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
if (!nlh)
return -EMSGSIZE;
@@ -2616,7 +2965,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
r->rtm_family = AF_INET;
r->rtm_dst_len = 32;
r->rtm_src_len = 0;
- r->rtm_tos = fl4->flowi4_tos;
+ r->rtm_tos = inet_dscp_to_dsfield(dscp);
r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
if (nla_put_u32(skb, RTA_TABLE, table_id))
goto nla_put_failure;
@@ -2639,21 +2988,38 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
if (rt->dst.dev &&
nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
goto nla_put_failure;
+ if (lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
+ goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
if (rt->dst.tclassid &&
nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
goto nla_put_failure;
#endif
- if (!rt_is_input_route(rt) &&
+ if (fl4 && !rt_is_input_route(rt) &&
fl4->saddr != src) {
if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
goto nla_put_failure;
}
- if (rt->rt_uses_gateway &&
- nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
- goto nla_put_failure;
+ if (rt->rt_uses_gateway) {
+ if (rt->rt_gw_family == AF_INET &&
+ nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
+ goto nla_put_failure;
+ } else if (rt->rt_gw_family == AF_INET6) {
+ int alen = sizeof(struct in6_addr);
+ struct nlattr *nla;
+ struct rtvia *via;
- expires = rt->dst.expires;
+ nla = nla_reserve(skb, RTA_VIA, alen + 2);
+ if (!nla)
+ goto nla_put_failure;
+
+ via = nla_data(nla);
+ via->rtvia_family = AF_INET6;
+ memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
+ }
+ }
+
+ expires = READ_ONCE(rt->dst.expires);
if (expires) {
unsigned long now = jiffies;
@@ -2671,36 +3037,40 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
if (rtnetlink_put_metrics(skb, metrics) < 0)
goto nla_put_failure;
- if (fl4->flowi4_mark &&
- nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
- goto nla_put_failure;
-
- if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
- nla_put_u32(skb, RTA_UID,
- from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
- goto nla_put_failure;
+ if (fl4) {
+ if (fl4->flowi4_mark &&
+ nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
+ goto nla_put_failure;
- error = rt->dst.error;
+ if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
+ nla_put_u32(skb, RTA_UID,
+ from_kuid_munged(current_user_ns(),
+ fl4->flowi4_uid)))
+ goto nla_put_failure;
- if (rt_is_input_route(rt)) {
+ if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
- if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
- IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
- int err = ipmr_get_route(net, skb,
- fl4->saddr, fl4->daddr,
- r, portid);
-
- if (err <= 0) {
- if (err == 0)
- return 0;
- goto nla_put_failure;
- }
- } else
+ if (ipv4_is_multicast(dst) &&
+ !ipv4_is_local_multicast(dst) &&
+ IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) {
+ int err = ipmr_get_route(net, skb,
+ fl4->saddr, fl4->daddr,
+ r, portid);
+
+ if (err <= 0) {
+ if (err == 0)
+ return 0;
+ goto nla_put_failure;
+ }
+ } else
#endif
- if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
- goto nla_put_failure;
+ if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
+ goto nla_put_failure;
+ }
}
+ error = rt->dst.error;
+
if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
goto nla_put_failure;
@@ -2712,6 +3082,81 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, u32 table_id,
+ struct fnhe_hash_bucket *bucket, int genid,
+ int *fa_index, int fa_start, unsigned int flags)
+{
+ int i;
+
+ for (i = 0; i < FNHE_HASH_SIZE; i++) {
+ struct fib_nh_exception *fnhe;
+
+ for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
+ fnhe = rcu_dereference(fnhe->fnhe_next)) {
+ struct rtable *rt;
+ int err;
+
+ if (*fa_index < fa_start)
+ goto next;
+
+ if (fnhe->fnhe_genid != genid)
+ goto next;
+
+ if (fnhe->fnhe_expires &&
+ time_after(jiffies, fnhe->fnhe_expires))
+ goto next;
+
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (!rt)
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
+ if (!rt)
+ goto next;
+
+ err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
+ table_id, 0, NULL, skb,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, flags);
+ if (err)
+ return err;
+next:
+ (*fa_index)++;
+ }
+ }
+
+ return 0;
+}
+
+int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
+ u32 table_id, struct fib_info *fi,
+ int *fa_index, int fa_start, unsigned int flags)
+{
+ struct net *net = sock_net(cb->skb->sk);
+ int nhsel, genid = fnhe_genid(net);
+
+ for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
+ struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
+ struct fnhe_hash_bucket *bucket;
+ int err;
+
+ if (nhc->nhc_flags & RTNH_F_DEAD)
+ continue;
+
+ rcu_read_lock();
+ bucket = rcu_dereference(nhc->nhc_exceptions);
+ err = 0;
+ if (bucket)
+ err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
+ genid, fa_index, fa_start,
+ flags);
+ rcu_read_unlock();
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
u8 ip_proto, __be16 sport,
__be16 dport)
@@ -2745,7 +3190,7 @@ static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
udph = skb_put_zero(skb, sizeof(struct udphdr));
udph->source = sport;
udph->dest = dport;
- udph->len = sizeof(struct udphdr);
+ udph->len = htons(sizeof(struct udphdr));
udph->check = 0;
break;
}
@@ -2773,6 +3218,75 @@ static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
return skb;
}
+static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct rtmsg *rtm;
+ int i, err;
+
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
+ NL_SET_ERR_MSG(extack,
+ "ipv4: Invalid header for route get request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+
+ if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
+ (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
+ rtm->rtm_table || rtm->rtm_protocol ||
+ rtm->rtm_scope || rtm->rtm_type) {
+ NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
+ return -EINVAL;
+ }
+
+ if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
+ RTM_F_LOOKUP_TABLE |
+ RTM_F_FIB_MATCH)) {
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv4_policy, extack);
+ if (err)
+ return err;
+
+ if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
+ (tb[RTA_DST] && !rtm->rtm_dst_len)) {
+ NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
+ return -EINVAL;
+ }
+
+ for (i = 0; i <= RTA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case RTA_IIF:
+ case RTA_OIF:
+ case RTA_SRC:
+ case RTA_DST:
+ case RTA_IP_PROTO:
+ case RTA_SPORT:
+ case RTA_DPORT:
+ case RTA_MARK:
+ case RTA_UID:
+ break;
+ default:
+ NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -2785,24 +3299,25 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct rtable *rt = NULL;
struct sk_buff *skb;
struct rtmsg *rtm;
- struct flowi4 fl4;
+ struct flowi4 fl4 = {};
__be32 dst = 0;
__be32 src = 0;
+ dscp_t dscp;
kuid_t uid;
u32 iif;
int err;
int mark;
- err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
- extack);
+ err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
if (err < 0)
return err;
rtm = nlmsg_data(nlh);
- src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
- dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
- iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
- mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
+ src = nla_get_in_addr_default(tb[RTA_SRC], 0);
+ dst = nla_get_in_addr_default(tb[RTA_DST], 0);
+ iif = nla_get_u32_default(tb[RTA_IIF], 0);
+ mark = nla_get_u32_default(tb[RTA_MARK], 0);
+ dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
if (tb[RTA_UID])
uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
else
@@ -2810,7 +3325,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (tb[RTA_IP_PROTO]) {
err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
- &ip_proto, extack);
+ &ip_proto, AF_INET, extack);
if (err)
return err;
}
@@ -2825,11 +3340,10 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (!skb)
return -ENOBUFS;
- memset(&fl4, 0, sizeof(fl4));
fl4.daddr = dst;
fl4.saddr = src;
- fl4.flowi4_tos = rtm->rtm_tos;
- fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
+ fl4.flowi4_dscp = dscp;
+ fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0);
fl4.flowi4_mark = mark;
fl4.flowi4_uid = uid;
if (sport)
@@ -2852,14 +3366,15 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.flowi4_iif = iif; /* for rt_fill_info */
skb->dev = dev;
skb->mark = mark;
- err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
- dev, &res);
+ err = ip_route_input_rcu(skb, dst, src, dscp, dev,
+ &res) ? -EINVAL : 0;
rt = skb_rtable(skb);
if (err == 0 && rt->dst.error)
err = -rt->dst.error;
} else {
fl4.flowi4_iif = LOOPBACK_IFINDEX;
+ skb->dev = net->loopback_dev;
rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
err = 0;
if (IS_ERR(rt))
@@ -2884,19 +3399,48 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
skb_reset_mac_header(skb);
if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
+ struct fib_rt_info fri;
+
if (!res.fi) {
err = fib_props[res.type].error;
if (!err)
err = -EHOSTUNREACH;
goto errout_rcu;
}
+ fri.fi = res.fi;
+ fri.tb_id = table_id;
+ fri.dst = res.prefix;
+ fri.dst_len = res.prefixlen;
+ fri.dscp = res.dscp;
+ fri.type = rt->rt_type;
+ fri.offload = 0;
+ fri.trap = 0;
+ fri.offload_failed = 0;
+ if (res.fa_head) {
+ struct fib_alias *fa;
+
+ hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
+ u8 slen = 32 - fri.dst_len;
+
+ if (fa->fa_slen == slen &&
+ fa->tb_id == fri.tb_id &&
+ fa->fa_dscp == fri.dscp &&
+ fa->fa_info == res.fi &&
+ fa->fa_type == fri.type) {
+ fri.offload = READ_ONCE(fa->offload);
+ fri.trap = READ_ONCE(fa->trap);
+ fri.offload_failed =
+ READ_ONCE(fa->offload_failed);
+ break;
+ }
+ }
+ }
err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
- rt->rt_type, res.prefix, res.prefixlen,
- fl4.flowi4_tos, res.fi, 0);
+ nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
} else {
- err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
- NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
+ err = rt_fill_info(net, dst, src, rt, table_id, res.dscp, &fl4,
+ skb, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0);
}
if (err < 0)
goto errout_rcu;
@@ -2924,9 +3468,8 @@ static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly = 8;
static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
-static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int ipv4_sysctl_rtcache_flush(const struct ctl_table *__ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = (struct net *)__ctl->extra1;
@@ -3026,64 +3569,76 @@ static struct ctl_table ipv4_route_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+};
+
+static const char ipv4_route_flush_procname[] = "flush";
+
+static struct ctl_table ipv4_route_netns_table[] = {
{
- .procname = "mtu_expires",
- .data = &ip_rt_mtu_expires,
+ .procname = ipv4_route_flush_procname,
.maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
+ .mode = 0200,
+ .proc_handler = ipv4_sysctl_rtcache_flush,
},
{
- .procname = "min_pmtu",
- .data = &ip_rt_min_pmtu,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &ip_min_valid_pmtu,
+ .procname = "min_pmtu",
+ .data = &init_net.ipv4.ip_rt_min_pmtu,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &ip_min_valid_pmtu,
},
{
- .procname = "min_adv_mss",
- .data = &ip_rt_min_advmss,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
+ .procname = "mtu_expires",
+ .data = &init_net.ipv4.ip_rt_mtu_expires,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
},
- { }
-};
-
-static struct ctl_table ipv4_route_flush_table[] = {
{
- .procname = "flush",
- .maxlen = sizeof(int),
- .mode = 0200,
- .proc_handler = ipv4_sysctl_rtcache_flush,
+ .procname = "min_adv_mss",
+ .data = &init_net.ipv4.ip_rt_min_advmss,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
},
- { },
};
static __net_init int sysctl_route_net_init(struct net *net)
{
struct ctl_table *tbl;
+ size_t table_size = ARRAY_SIZE(ipv4_route_netns_table);
- tbl = ipv4_route_flush_table;
+ tbl = ipv4_route_netns_table;
if (!net_eq(net, &init_net)) {
- tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
+ int i;
+
+ tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
if (!tbl)
goto err_dup;
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns)
- tbl[0].procname = NULL;
+ /* Don't export non-whitelisted sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns) {
+ if (tbl[0].procname != ipv4_route_flush_procname)
+ table_size = 0;
+ }
+
+ /* Update the variables to point into the current struct net
+ * except for the first element flush
+ */
+ for (i = 1; i < table_size; i++)
+ tbl[i].data += (void *)net - (void *)&init_net;
}
tbl[0].extra1 = net;
- net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
+ net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route",
+ tbl, table_size);
if (!net->ipv4.route_hdr)
goto err_reg;
return 0;
err_reg:
- if (tbl != ipv4_route_flush_table)
+ if (tbl != ipv4_route_netns_table)
kfree(tbl);
err_dup:
return -ENOMEM;
@@ -3091,11 +3646,11 @@ err_dup:
static __net_exit void sysctl_route_net_exit(struct net *net)
{
- struct ctl_table *tbl;
+ const struct ctl_table *tbl;
tbl = net->ipv4.route_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->ipv4.route_hdr);
- BUG_ON(tbl == ipv4_route_flush_table);
+ BUG_ON(tbl == ipv4_route_netns_table);
kfree(tbl);
}
@@ -3105,11 +3660,24 @@ static __net_initdata struct pernet_operations sysctl_route_ops = {
};
#endif
+static __net_init int netns_ip_rt_init(struct net *net)
+{
+ /* Per-netns init: give this namespace's route sysctls their default values. */
+ net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
+ net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
+ net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
+ return 0; /* only plain assignments above, nothing can fail */
+}
+
+static struct pernet_operations __net_initdata ip_rt_ops = {
+ .init = netns_ip_rt_init, /* no .exit: init only assigns fields of struct net */
+};
+
static __net_init int rt_genid_init(struct net *net)
{
atomic_set(&net->ipv4.rt_genid, 0);
atomic_set(&net->fnhe_genid, 0);
- atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
+ atomic_set(&net->ipv4.dev_addr_genid, get_random_u32());
return 0;
}
@@ -3146,20 +3714,32 @@ static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
+static const struct rtnl_msg_handler ip_rt_rtnl_msg_handlers[] __initconst = {
+ {.protocol = PF_INET, .msgtype = RTM_GETROUTE,
+ .doit = inet_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
+}; /* handed to rtnl_register_many() by ip_rt_init() */
+
int __init ip_rt_init(void)
{
+ void *idents_hash;
int cpu;
- ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
- GFP_KERNEL);
- if (!ip_idents)
- panic("IP: failed to allocate ip_idents\n");
+ /* For modern hosts, this will use 2 MB of memory */
+ idents_hash = alloc_large_system_hash("IP idents",
+ sizeof(*ip_idents) + sizeof(*ip_tstamps),
+ 0,
+ 16, /* one bucket per 64 KB */
+ HASH_ZERO,
+ NULL,
+ &ip_idents_mask,
+ 2048,
+ 256*1024);
+
+ ip_idents = idents_hash;
- prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
+ get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
- ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
- if (!ip_tstamps)
- panic("IP: failed to allocate ip_tstamps\n");
+ ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
for_each_possible_cpu(cpu) {
struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
@@ -3173,9 +3753,8 @@ int __init ip_rt_init(void)
panic("IP: failed to allocate ip_rt_acct\n");
#endif
- ipv4_dst_ops.kmem_cachep =
- kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ ipv4_dst_ops.kmem_cachep = KMEM_CACHE(rtable,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC);
ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
@@ -3197,12 +3776,12 @@ int __init ip_rt_init(void)
xfrm_init();
xfrm4_init();
#endif
- rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
- RTNL_FLAG_DOIT_UNLOCKED);
+ rtnl_register_many(ip_rt_rtnl_msg_handlers);
#ifdef CONFIG_SYSCTL
register_pernet_subsys(&sysctl_route_ops);
#endif
+ register_pernet_subsys(&ip_rt_ops);
register_pernet_subsys(&rt_genid_ops);
register_pernet_subsys(&ipv4_inetpeer_ops);
return 0;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index c3387dfd725b..569befcf021b 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -1,26 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Syncookies implementation for the Linux kernel
*
* Copyright (C) 1997 Andi Kleen
* Based on ideas by D.J.Bernstein and Eric Schenk.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/tcp.h>
-#include <linux/slab.h>
-#include <linux/random.h>
#include <linux/siphash.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <net/secure_seq.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/route.h>
-static siphash_key_t syncookie_secret[2] __read_mostly;
+static siphash_aligned_key_t syncookie_secret[2];
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -47,7 +42,6 @@ static siphash_key_t syncookie_secret[2] __read_mostly;
* requested/supported by the syn/synack exchange.
*/
#define TSBITS 6
-#define TSMASK (((__u32)1 << TSBITS) - 1)
static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 count, int c)
@@ -58,7 +52,6 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
count, &syncookie_secret[c]);
}
-
/*
* when syncookies are in effect and tcp timestamps are enabled we encode
* tcp options in the lower bits of the timestamp value that will be
@@ -66,29 +59,26 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
* Since subsequent timestamps use the normal tcp_time_stamp value, we
* must make sure that the resulting initial timestamp is <= tcp_time_stamp.
*/
-u64 cookie_init_timestamp(struct request_sock *req)
+u64 cookie_init_timestamp(struct request_sock *req, u64 now)
{
- struct inet_request_sock *ireq;
- u32 ts, ts_now = tcp_time_stamp_raw();
+ const struct inet_request_sock *ireq = inet_rsk(req);
+ u64 ts, ts_now = tcp_ns_to_ts(false, now);
u32 options = 0;
- ireq = inet_rsk(req);
-
options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK;
if (ireq->sack_ok)
options |= TS_OPT_SACK;
if (ireq->ecn_ok)
options |= TS_OPT_ECN;
- ts = ts_now & ~TSMASK;
+ ts = (ts_now >> TSBITS) << TSBITS;
ts |= options;
- if (ts > ts_now) {
- ts >>= TSBITS;
- ts--;
- ts <<= TSBITS;
- ts |= options;
- }
- return (u64)ts * (USEC_PER_SEC / TCP_TS_HZ);
+ if (ts > ts_now)
+ ts -= (1UL << TSBITS);
+
+ if (tcp_rsk(req)->req_usec_ts)
+ return ts * NSEC_PER_USEC;
+ return ts * NSEC_PER_MSEC;
}
@@ -191,12 +181,14 @@ __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mssp)
* Check if a ack sequence number is a valid syncookie.
* Return the decoded mss if it is, or 0 if not.
*/
-int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
- u32 cookie)
+int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th)
{
+ __u32 cookie = ntohl(th->ack_seq) - 1;
__u32 seq = ntohl(th->seq) - 1;
- __u32 mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
- th->source, th->dest, seq);
+ __u32 mssind;
+
+ mssind = check_tcp_syn_cookie(cookie, iph->saddr, iph->daddr,
+ th->source, th->dest, seq);
return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
}
@@ -204,7 +196,7 @@ EXPORT_SYMBOL_GPL(__cookie_v4_check);
struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- struct dst_entry *dst, u32 tsoff)
+ struct dst_entry *dst)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct sock *child;
@@ -214,15 +206,24 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
NULL, &own_req);
if (child) {
refcount_set(&req->rsk_refcnt, 1);
- tcp_sk(child)->tsoffset = tsoff;
sock_rps_save_rxhash(child, skb);
- inet_csk_reqsk_queue_add(sk, req, child);
- } else {
- reqsk_free(req);
+
+ if (rsk_drop_req(req)) {
+ reqsk_put(req);
+ return child;
+ }
+
+ if (inet_csk_reqsk_queue_add(sk, req, child))
+ return child;
+
+ bh_unlock_sock(child);
+ sock_put(child);
}
- return child;
+ __reqsk_free(req);
+
+ return NULL;
}
-EXPORT_SYMBOL(tcp_get_cookie_sock);
+EXPORT_IPV6_MOD(tcp_get_cookie_sock);
/*
* when syncookies are in effect and tcp timestamps are enabled we stored
@@ -243,12 +244,12 @@ bool cookie_timestamp_decode(const struct net *net,
return true;
}
- if (!net->ipv4.sysctl_tcp_timestamps)
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps))
return false;
tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0;
- if (tcp_opt->sack_ok && !net->ipv4.sysctl_tcp_sack)
+ if (tcp_opt->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack))
return false;
if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK)
@@ -257,114 +258,194 @@ bool cookie_timestamp_decode(const struct net *net,
tcp_opt->wscale_ok = 1;
tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK;
- return net->ipv4.sysctl_tcp_window_scaling != 0;
+ return READ_ONCE(net->ipv4.sysctl_tcp_window_scaling) != 0;
}
-EXPORT_SYMBOL(cookie_timestamp_decode);
+EXPORT_IPV6_MOD(cookie_timestamp_decode);
-bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
- const struct net *net, const struct dst_entry *dst)
+static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req)
{
- bool ecn_ok = tcp_opt->rcv_tsecr & TS_OPT_ECN;
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct tcp_request_sock *treq = tcp_rsk(req);
+ const struct tcphdr *th = tcp_hdr(skb);
- if (!ecn_ok)
- return false;
+ req->num_retrans = 0;
- if (net->ipv4.sysctl_tcp_ecn)
- return true;
+ ireq->ir_num = ntohs(th->dest);
+ ireq->ir_rmt_port = th->source;
+ ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
+ ireq->ir_mark = inet_request_mark(sk, skb);
- return dst_feature(dst, RTAX_FEATURE_ECN);
+ if (IS_ENABLED(CONFIG_SMC))
+ ireq->smc_ok = 0;
+
+ treq->snt_synack = 0;
+ treq->snt_tsval_first = 0;
+ treq->tfo_listener = false;
+ treq->txhash = net_tx_rndhash();
+ treq->rcv_isn = ntohl(th->seq) - 1;
+ treq->snt_isn = ntohl(th->ack_seq) - 1;
+ treq->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
+ treq->req_usec_ts = false;
+
+#if IS_ENABLED(CONFIG_MPTCP)
+ treq->is_mptcp = sk_is_mptcp(sk);
+ if (treq->is_mptcp)
+ return mptcp_subflow_init_cookie_req(req, sk, skb);
+#endif
+
+ return 0;
}
-EXPORT_SYMBOL(cookie_ecn_ok);
-/* On input, sk is a listener.
- * Output is listener if incoming packet would not create a child
- * NULL if memory could not be allocated.
- */
-struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
+#if IS_ENABLED(CONFIG_BPF)
+struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb)
+{
+ struct request_sock *req = inet_reqsk(skb->sk); /* reqsk carried by the skb */
+
+ skb->sk = NULL; /* detach req from the skb before initialising it */
+ skb->destructor = NULL;
+
+ if (cookie_tcp_reqsk_init(sk, skb, req)) { /* non-zero means failure */
+ reqsk_free(req);
+ req = NULL;
+ }
+
+ return req; /* initialised req, or NULL once it has been freed */
+}
+EXPORT_IPV6_MOD_GPL(cookie_bpf_check);
+#endif /* CONFIG_BPF */
+
+struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
+ struct sock *sk, struct sk_buff *skb,
+ struct tcp_options_received *tcp_opt,
+ int mss, u32 tsoff)
{
- struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
- struct tcp_options_received tcp_opt;
struct inet_request_sock *ireq;
struct tcp_request_sock *treq;
- struct tcp_sock *tp = tcp_sk(sk);
- const struct tcphdr *th = tcp_hdr(skb);
- __u32 cookie = ntohl(th->ack_seq) - 1;
- struct sock *ret = sk;
struct request_sock *req;
- int mss;
- struct rtable *rt;
- __u8 rcv_wscale;
- struct flowi4 fl4;
- u32 tsoff = 0;
- if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst)
- goto out;
+ if (sk_is_mptcp(sk))
+ req = mptcp_subflow_reqsk_alloc(ops, sk, false);
+ else
+ req = inet_reqsk_alloc(ops, sk, false);
+
+ if (!req)
+ return NULL;
+
+ if (cookie_tcp_reqsk_init(sk, skb, req)) {
+ reqsk_free(req);
+ return NULL;
+ }
+
+ ireq = inet_rsk(req);
+ treq = tcp_rsk(req);
+
+ req->mss = mss;
+ req->ts_recent = tcp_opt->saw_tstamp ? tcp_opt->rcv_tsval : 0;
+
+ ireq->snd_wscale = tcp_opt->snd_wscale;
+ ireq->tstamp_ok = tcp_opt->saw_tstamp;
+ ireq->sack_ok = tcp_opt->sack_ok;
+ ireq->wscale_ok = tcp_opt->wscale_ok;
+ ireq->ecn_ok = !!(tcp_opt->rcv_tsecr & TS_OPT_ECN);
+
+ treq->ts_off = tsoff;
+
+ return req;
+}
+EXPORT_IPV6_MOD_GPL(cookie_tcp_reqsk_alloc);
+
+static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct tcp_options_received tcp_opt;
+ u32 tsoff = 0;
+ int mss;
if (tcp_synq_no_recent_overflow(sk))
goto out;
- mss = __cookie_v4_check(ip_hdr(skb), th, cookie);
- if (mss == 0) {
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
+ mss = __cookie_v4_check(ip_hdr(skb), tcp_hdr(skb));
+ if (!mss) {
+ __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED);
goto out;
}
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
+ __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV);
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
- tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
+ tcp_parse_options(net, skb, &tcp_opt, 0, NULL);
if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
- tsoff = secure_tcp_ts_off(sock_net(sk),
+ tsoff = secure_tcp_ts_off(net,
ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr);
tcp_opt.rcv_tsecr -= tsoff;
}
- if (!cookie_timestamp_decode(sock_net(sk), &tcp_opt))
+ if (!cookie_timestamp_decode(net, &tcp_opt))
goto out;
- ret = NULL;
- req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */
- if (!req)
+ return cookie_tcp_reqsk_alloc(&tcp_request_sock_ops, sk, skb,
+ &tcp_opt, mss, tsoff);
+out:
+ return ERR_PTR(-EINVAL);
+}
+
+/* On input, sk is a listener.
+ * Output is the listener if the incoming packet would not create a child,
+ * the child on success, or NULL if the skb was dropped (e.g. out of memory).
+ */
+struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
+{
+ struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_request_sock *ireq;
+ struct net *net = sock_net(sk);
+ struct tcp_request_sock *treq;
+ struct request_sock *req;
+ struct sock *ret = sk;
+ struct flowi4 fl4;
+ struct rtable *rt;
+ __u8 rcv_wscale;
+ int full_space;
+ SKB_DR(reason);
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) ||
+ !th->ack || th->rst)
goto out;
+ if (cookie_bpf_ok(skb)) {
+ req = cookie_bpf_check(sk, skb);
+ } else {
+ req = cookie_tcp_check(net, sk, skb);
+ if (IS_ERR(req))
+ goto out;
+ }
+ if (!req) {
+ SKB_DR_SET(reason, NO_SOCKET);
+ goto out_drop;
+ }
+
ireq = inet_rsk(req);
treq = tcp_rsk(req);
- treq->rcv_isn = ntohl(th->seq) - 1;
- treq->snt_isn = cookie;
- treq->ts_off = 0;
- treq->txhash = net_tx_rndhash();
- req->mss = mss;
- ireq->ir_num = ntohs(th->dest);
- ireq->ir_rmt_port = th->source;
+
sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
- ireq->ir_mark = inet_request_mark(sk, skb);
- ireq->snd_wscale = tcp_opt.snd_wscale;
- ireq->sack_ok = tcp_opt.sack_ok;
- ireq->wscale_ok = tcp_opt.wscale_ok;
- ireq->tstamp_ok = tcp_opt.saw_tstamp;
- req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
- treq->snt_synack = 0;
- treq->tfo_listener = false;
- if (IS_ENABLED(CONFIG_SMC))
- ireq->smc_ok = 0;
-
- ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
/* We throwed the options of the initial SYN away, so we hope
* the ACK carries the same options again (see RFC1122 4.2.3.8)
*/
- RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(sock_net(sk), skb));
+ RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
if (security_inet_conn_request(sk, skb, req)) {
- reqsk_free(req);
- goto out;
+ SKB_DR_SET(reason, SECURITY_HOOK);
+ goto out_free;
}
- req->num_retrans = 0;
+ tcp_ao_syncookie(sk, skb, req, AF_INET);
/*
* We need to lookup the route here to get at the correct
@@ -373,33 +454,54 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
* no easy way to do this.
*/
flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark,
- RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
- inet_sk_flowi_flags(sk),
+ ip_sock_rt_tos(sk), ip_sock_rt_scope(sk),
+ IPPROTO_TCP, inet_sk_flowi_flags(sk),
opt->srr ? opt->faddr : ireq->ir_rmt_addr,
- ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid);
- security_req_classify_flow(req, flowi4_to_flowi(&fl4));
- rt = ip_route_output_key(sock_net(sk), &fl4);
+ ireq->ir_loc_addr, th->source, th->dest,
+ sk_uid(sk));
+ security_req_classify_flow(req, flowi4_to_flowi_common(&fl4));
+ rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt)) {
- reqsk_free(req);
- goto out;
+ SKB_DR_SET(reason, IP_OUTNOROUTES);
+ goto out_free;
}
/* Try to redo what tcp_v4_send_synack did. */
- req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
-
- tcp_select_initial_window(sk, tcp_full_space(sk), req->mss,
+ req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? :
+ dst_metric(&rt->dst, RTAX_WINDOW);
+ /* limit the window selection if the user enforces a smaller rx buffer */
+ full_space = tcp_full_space(sk);
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
+ req->rsk_window_clamp = full_space;
+
+ tcp_select_initial_window(sk, full_space, req->mss,
&req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(&rt->dst, RTAX_INITRWND));
- ireq->rcv_wscale = rcv_wscale;
- ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
+ /* req->syncookie is true only if the ACK was validated by a
+ * BPF kfunc, in which case rcv_wscale is already configured.
+ */
+ if (!req->syncookie)
+ ireq->rcv_wscale = rcv_wscale;
+ ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst);
+ treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th);
- ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst, tsoff);
+ ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst);
/* ip_queue_xmit() depends on our flow being setup
* Normal sockets get it right from inet_csk_route_child_sock()
*/
- if (ret)
- inet_sk(ret)->cork.fl.u.ip4 = fl4;
-out: return ret;
+ if (!ret) {
+ SKB_DR_SET(reason, NO_SOCKET);
+ goto out_drop;
+ }
+ inet_sk(ret)->cork.fl.u.ip4 = fl4;
+out:
+ return ret;
+out_free:
+ reqsk_free(req);
+out_drop:
+ sk_skb_reason_drop(sk, skb, reason);
+ return NULL;
}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 891ed2f91467..a1a50a5c80dc 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -6,75 +6,71 @@
* Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
*/
-#include <linux/mm.h>
-#include <linux/module.h>
#include <linux/sysctl.h>
-#include <linux/igmp.h>
-#include <linux/inetdevice.h>
#include <linux/seqlock.h>
#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/nsproxy.h>
-#include <linux/swap.h>
-#include <net/snmp.h>
#include <net/icmp.h>
#include <net/ip.h>
-#include <net/route.h>
+#include <net/ip_fib.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/cipso_ipv4.h>
-#include <net/inet_frag.h>
#include <net/ping.h>
#include <net/protocol.h>
#include <net/netevent.h>
-static int zero;
-static int one = 1;
-static int two = 2;
-static int four = 4;
-static int thousand = 1000;
-static int gso_max_segs = GSO_MAX_SEGS;
static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 };
static int tcp_adv_win_scale_min = -31;
static int tcp_adv_win_scale_max = 31;
+static int tcp_app_win_max = 31;
+static int tcp_min_snd_mss_min = TCP_MIN_SND_MSS;
+static int tcp_min_snd_mss_max = 65535;
+static int tcp_rto_max_max = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
static int ip_privileged_port_min;
static int ip_privileged_port_max = 65535;
static int ip_ttl_min = 1;
static int ip_ttl_max = 255;
static int tcp_syn_retries_min = 1;
static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
-static int ip_ping_group_range_min[] = { 0, 0 };
-static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
-static int comp_sack_nr_max = 255;
+static int tcp_syn_linear_timeouts_max = MAX_TCP_SYNCNT;
+static unsigned long ip_ping_group_range_min[] = { 0, 0 };
+static unsigned long ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
static u32 u32_max_div_HZ = UINT_MAX / HZ;
+static int one_day_secs = 24 * 3600;
+static u32 fib_multipath_hash_fields_all_mask __maybe_unused =
+ FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
+static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
+static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
+static int tcp_plb_max_rounds = 31;
+static int tcp_plb_max_cong_thresh = 256;
+static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
+static int tcp_ecn_mode_max = 2;
+static u32 icmp_errors_extension_mask_all =
+ GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0);
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
/* Update system visible IP port range */
-static void set_local_port_range(struct net *net, int range[2])
+static void set_local_port_range(struct net *net, unsigned int low, unsigned int high)
{
- bool same_parity = !((range[0] ^ range[1]) & 1);
+ bool same_parity = !((low ^ high) & 1);
- write_seqlock_bh(&net->ipv4.ip_local_ports.lock);
if (same_parity && !net->ipv4.ip_local_ports.warned) {
net->ipv4.ip_local_ports.warned = true;
pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n");
}
- net->ipv4.ip_local_ports.range[0] = range[0];
- net->ipv4.ip_local_ports.range[1] = range[1];
- write_sequnlock_bh(&net->ipv4.ip_local_ports.lock);
+ WRITE_ONCE(net->ipv4.ip_local_ports.range, high << 16 | low);
}
/* Validate changes from /proc interface. */
-static int ipv4_local_port_range(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int ipv4_local_port_range(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
- struct net *net =
- container_of(table->data, struct net, ipv4.ip_local_ports.range);
+ struct net *net = table->data;
int ret;
int range[2];
struct ctl_table tmp = {
@@ -95,18 +91,18 @@ static int ipv4_local_port_range(struct ctl_table *table, int write,
* port limit.
*/
if ((range[1] < range[0]) ||
- (range[0] < net->ipv4.sysctl_ip_prot_sock))
+ (range[0] < READ_ONCE(net->ipv4.sysctl_ip_prot_sock)))
ret = -EINVAL;
else
- set_local_port_range(net, range);
+ set_local_port_range(net, range[0], range[1]);
}
return ret;
}
/* Validate changes from /proc interface. */
-static int ipv4_privileged_ports(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int ipv4_privileged_ports(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = container_of(table->data, struct net,
ipv4.sysctl_ip_prot_sock);
@@ -121,7 +117,7 @@ static int ipv4_privileged_ports(struct ctl_table *table, int write,
.extra2 = &ip_privileged_port_max,
};
- pports = net->ipv4.sysctl_ip_prot_sock;
+ pports = READ_ONCE(net->ipv4.sysctl_ip_prot_sock);
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
@@ -133,13 +129,14 @@ static int ipv4_privileged_ports(struct ctl_table *table, int write,
if (range[0] < pports)
ret = -EINVAL;
else
- net->ipv4.sysctl_ip_prot_sock = pports;
+ WRITE_ONCE(net->ipv4.sysctl_ip_prot_sock, pports);
}
return ret;
}
-static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low, kgid_t *high)
+static void inet_get_ping_group_range_table(const struct ctl_table *table,
+ kgid_t *low, kgid_t *high)
{
kgid_t *data = table->data;
struct net *net =
@@ -154,7 +151,8 @@ static void inet_get_ping_group_range_table(struct ctl_table *table, kgid_t *low
}
/* Update system visible IP port range */
-static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t high)
+static void set_ping_group_range(const struct ctl_table *table,
+ kgid_t low, kgid_t high)
{
kgid_t *data = table->data;
struct net *net =
@@ -166,13 +164,12 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig
}
/* Validate changes from /proc interface. */
-static int ipv4_ping_group_range(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int ipv4_ping_group_range(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct user_namespace *user_ns = current_user_ns();
int ret;
- gid_t urange[2];
+ unsigned long urange[2];
kgid_t low, high;
struct ctl_table tmp = {
.data = &urange,
@@ -185,7 +182,7 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
inet_get_ping_group_range_table(table, &low, &high);
urange[0] = from_kgid_munged(user_ns, low);
urange[1] = from_kgid_munged(user_ns, high);
- ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ ret = proc_doulongvec_minmax(&tmp, write, buffer, lenp, ppos);
if (write && ret == 0) {
low = make_kgid(user_ns, urange[0]);
@@ -202,16 +199,15 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
return ret;
}
-static int ipv4_fwd_update_priority(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+static int ipv4_fwd_update_priority(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net;
int ret;
net = container_of(table->data, struct net,
ipv4.sysctl_ip_fwd_update_priority);
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
if (write && ret == 0)
call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE,
net);
@@ -219,8 +215,8 @@ static int ipv4_fwd_update_priority(struct ctl_table *table, int write,
return ret;
}
-static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int proc_tcp_congestion_control(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = container_of(ctl->data, struct net,
ipv4.tcp_congestion_control);
@@ -239,10 +235,9 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
return ret;
}
-static int proc_tcp_available_congestion_control(struct ctl_table *ctl,
- int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static int proc_tcp_available_congestion_control(const struct ctl_table *ctl,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, };
int ret;
@@ -256,10 +251,9 @@ static int proc_tcp_available_congestion_control(struct ctl_table *ctl,
return ret;
}
-static int proc_allowed_congestion_control(struct ctl_table *ctl,
- int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static int proc_allowed_congestion_control(const struct ctl_table *ctl,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
int ret;
@@ -276,163 +270,260 @@ static int proc_allowed_congestion_control(struct ctl_table *ctl,
return ret;
}
-static int proc_tcp_fastopen_key(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+/* Parse a "%x-%x-%x-%x" TFO key from buf into key[0..3]; 0 or -EINVAL. */
+static int sscanf_key(char *buf, __le32 *key)
+{
+ u32 user_key[4] = {}; /* zeroed: logged below even if parsing fails */
+ int i, ret = 0;
+
+ if (sscanf(buf, "%x-%x-%x-%x", user_key, user_key + 1,
+ user_key + 2, user_key + 3) != 4) {
+ ret = -EINVAL; /* key[] is left untouched */
+ } else {
+ for (i = 0; i < ARRAY_SIZE(user_key); i++)
+ key[i] = cpu_to_le32(user_key[i]);
+ }
+ pr_debug("proc TFO key set 0x%x-%x-%x-%x <- 0x%s: %d\n",
+ user_key[0], user_key[1], user_key[2], user_key[3], buf, ret);
+ return ret;
+}
+
+static int proc_tcp_fastopen_key(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = container_of(table->data, struct net,
ipv4.sysctl_tcp_fastopen);
- struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) };
- struct tcp_fastopen_context *ctxt;
- u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */
- __le32 key[4];
- int ret, i;
+ /* maxlen to print the list of keys in hex (*2), with dashes
+ * separating doublewords and a comma in between keys.
+ */
+ struct ctl_table tbl = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH *
+ 2 * TCP_FASTOPEN_KEY_MAX) +
+ (TCP_FASTOPEN_KEY_MAX * 5)) };
+ u32 user_key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u32)];
+ __le32 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(__le32)];
+ char *backup_data;
+ int ret, i = 0, off = 0, n_keys;
tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL);
if (!tbl.data)
return -ENOMEM;
- rcu_read_lock();
- ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
- if (ctxt)
- memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH);
- else
- memset(key, 0, sizeof(key));
- rcu_read_unlock();
+ n_keys = tcp_fastopen_get_cipher(net, NULL, (u64 *)key);
+ if (!n_keys) {
+ memset(&key[0], 0, TCP_FASTOPEN_KEY_LENGTH);
+ n_keys = 1;
+ }
- for (i = 0; i < ARRAY_SIZE(key); i++)
+ for (i = 0; i < n_keys * 4; i++)
user_key[i] = le32_to_cpu(key[i]);
- snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x",
- user_key[0], user_key[1], user_key[2], user_key[3]);
+ for (i = 0; i < n_keys; i++) {
+ off += snprintf(tbl.data + off, tbl.maxlen - off,
+ "%08x-%08x-%08x-%08x",
+ user_key[i * 4],
+ user_key[i * 4 + 1],
+ user_key[i * 4 + 2],
+ user_key[i * 4 + 3]);
+
+ if (WARN_ON_ONCE(off >= tbl.maxlen - 1))
+ break;
+
+ if (i + 1 < n_keys)
+ off += snprintf(tbl.data + off, tbl.maxlen - off, ",");
+ }
+
ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
if (write && ret == 0) {
- if (sscanf(tbl.data, "%x-%x-%x-%x", user_key, user_key + 1,
- user_key + 2, user_key + 3) != 4) {
+ backup_data = strchr(tbl.data, ',');
+ if (backup_data) {
+ *backup_data = '\0';
+ backup_data++;
+ }
+ if (sscanf_key(tbl.data, key)) {
ret = -EINVAL;
goto bad_key;
}
-
- for (i = 0; i < ARRAY_SIZE(user_key); i++)
- key[i] = cpu_to_le32(user_key[i]);
-
+ if (backup_data) {
+ if (sscanf_key(backup_data, key + 4)) {
+ ret = -EINVAL;
+ goto bad_key;
+ }
+ }
tcp_fastopen_reset_cipher(net, NULL, key,
- TCP_FASTOPEN_KEY_LENGTH);
+ backup_data ? key + 4 : NULL);
}
bad_key:
- pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n",
- user_key[0], user_key[1], user_key[2], user_key[3],
- (char *)tbl.data, ret);
kfree(tbl.data);
return ret;
}
-static void proc_configure_early_demux(int enabled, int protocol)
+static int proc_tfo_blackhole_detect_timeout(const struct ctl_table *table,
+ int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
- struct net_protocol *ipprot;
-#if IS_ENABLED(CONFIG_IPV6)
- struct inet6_protocol *ip6prot;
-#endif
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_tcp_fastopen_blackhole_timeout);
+ int ret;
- rcu_read_lock();
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ atomic_set(&net->ipv4.tfo_active_disable_times, 0);
- ipprot = rcu_dereference(inet_protos[protocol]);
- if (ipprot)
- ipprot->early_demux = enabled ? ipprot->early_demux_handler :
- NULL;
+ return ret;
+}
-#if IS_ENABLED(CONFIG_IPV6)
- ip6prot = rcu_dereference(inet6_protos[protocol]);
- if (ip6prot)
- ip6prot->early_demux = enabled ? ip6prot->early_demux_handler :
- NULL;
-#endif
- rcu_read_unlock();
+static int proc_tcp_available_ulp(const struct ctl_table *ctl,
+ int write, void *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, };
+ int ret;
+
+ tbl.data = kmalloc(tbl.maxlen, GFP_USER);
+ if (!tbl.data)
+ return -ENOMEM;
+ tcp_get_available_ulp(tbl.data, TCP_ULP_BUF_MAX);
+ ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+ kfree(tbl.data);
+
+ return ret;
}
-static int proc_tcp_early_demux(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int proc_tcp_ehash_entries(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
- int ret = 0;
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_tcp_child_ehash_entries);
+ struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
+ int tcp_ehash_entries;
+ struct ctl_table tbl;
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ tcp_ehash_entries = hinfo->ehash_mask + 1;
- if (write && !ret) {
- int enabled = init_net.ipv4.sysctl_tcp_early_demux;
+ /* A negative number indicates that the child netns
+ * shares the global ehash.
+ */
+ if (!net_eq(net, &init_net) && !hinfo->pernet)
+ tcp_ehash_entries *= -1;
- proc_configure_early_demux(enabled, IPPROTO_TCP);
- }
+ memset(&tbl, 0, sizeof(tbl));
+ tbl.data = &tcp_ehash_entries;
+ tbl.maxlen = sizeof(int);
- return ret;
+ return proc_dointvec(&tbl, write, buffer, lenp, ppos);
}
-static int proc_udp_early_demux(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int proc_udp_hash_entries(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
- int ret = 0;
+ struct net *net = container_of(table->data, struct net,
+ ipv4.sysctl_udp_child_hash_entries);
+ int udp_hash_entries;
+ struct ctl_table tbl;
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ udp_hash_entries = net->ipv4.udp_table->mask + 1;
- if (write && !ret) {
- int enabled = init_net.ipv4.sysctl_udp_early_demux;
+ /* A negative number indicates that the child netns
+ * shares the global udp_table.
+ */
+ if (!net_eq(net, &init_net) && net->ipv4.udp_table == &udp_table)
+ udp_hash_entries *= -1;
- proc_configure_early_demux(enabled, IPPROTO_UDP);
- }
+ memset(&tbl, 0, sizeof(tbl));
+ tbl.data = &udp_hash_entries;
+ tbl.maxlen = sizeof(int);
- return ret;
+ return proc_dointvec(&tbl, write, buffer, lenp, ppos);
}
-static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
- int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+static int proc_fib_multipath_hash_policy(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
struct net *net = container_of(table->data, struct net,
- ipv4.sysctl_tcp_fastopen_blackhole_timeout);
+ ipv4.sysctl_fib_multipath_hash_policy);
int ret;
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
if (write && ret == 0)
- atomic_set(&net->ipv4.tfo_active_disable_times, 0);
+ call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net);
return ret;
}
-static int proc_tcp_available_ulp(struct ctl_table *ctl,
- int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static int proc_fib_multipath_hash_fields(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
{
- struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, };
+ struct net *net;
int ret;
- tbl.data = kmalloc(tbl.maxlen, GFP_USER);
- if (!tbl.data)
- return -ENOMEM;
- tcp_get_available_ulp(tbl.data, TCP_ULP_BUF_MAX);
- ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
- kfree(tbl.data);
+ net = container_of(table->data, struct net,
+ ipv4.sysctl_fib_multipath_hash_fields);
+ ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net);
return ret;
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
-static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static u32 proc_fib_multipath_hash_rand_seed __ro_after_init;
+
+static void proc_fib_multipath_hash_init_rand_seed(void)
{
- struct net *net = container_of(table->data, struct net,
- ipv4.sysctl_fib_multipath_hash_policy);
+ get_random_bytes(&proc_fib_multipath_hash_rand_seed,
+ sizeof(proc_fib_multipath_hash_rand_seed));
+}
+
+static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed)
+{
+ struct sysctl_fib_multipath_hash_seed new = {
+ .user_seed = user_seed,
+ .mp_seed = (user_seed ? user_seed :
+ proc_fib_multipath_hash_rand_seed),
+ };
+
+ WRITE_ONCE(net->ipv4.sysctl_fib_multipath_hash_seed, new);
+}
+
+static int proc_fib_multipath_hash_seed(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ struct sysctl_fib_multipath_hash_seed *mphs;
+ struct net *net = table->data;
+ struct ctl_table tmp;
+ u32 user_seed;
int ret;
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
- if (write && ret == 0)
+ mphs = &net->ipv4.sysctl_fib_multipath_hash_seed;
+ user_seed = mphs->user_seed;
+
+ tmp = *table;
+ tmp.data = &user_seed;
+
+ ret = proc_douintvec_minmax(&tmp, write, buffer, lenp, ppos);
+
+ if (write && ret == 0) {
+ proc_fib_multipath_hash_set_seed(net, user_seed);
call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net);
+ }
return ret;
}
+#else
+
+static void proc_fib_multipath_hash_init_rand_seed(void)
+{
+}
+
+static void proc_fib_multipath_hash_set_seed(struct net *net, u32 user_seed)
+{
+}
+
#endif
static struct ctl_table ipv4_table[] = {
@@ -509,77 +600,90 @@ static struct ctl_table ipv4_table[] = {
},
#endif /* CONFIG_NETLABEL */
{
- .procname = "tcp_available_congestion_control",
- .maxlen = TCP_CA_BUF_MAX,
- .mode = 0444,
- .proc_handler = proc_tcp_available_congestion_control,
- },
- {
- .procname = "tcp_allowed_congestion_control",
- .maxlen = TCP_CA_BUF_MAX,
- .mode = 0644,
- .proc_handler = proc_allowed_congestion_control,
- },
- {
.procname = "tcp_available_ulp",
.maxlen = TCP_ULP_BUF_MAX,
.mode = 0444,
.proc_handler = proc_tcp_available_ulp,
},
{
- .procname = "icmp_msgs_per_sec",
- .data = &sysctl_icmp_msgs_per_sec,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- },
- {
- .procname = "icmp_msgs_burst",
- .data = &sysctl_icmp_msgs_burst,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- },
- {
.procname = "udp_mem",
.data = &sysctl_udp_mem,
.maxlen = sizeof(sysctl_udp_mem),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
- { }
+ {
+ .procname = "fib_sync_mem",
+ .data = &sysctl_fib_sync_mem,
+ .maxlen = sizeof(sysctl_fib_sync_mem),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = &sysctl_fib_sync_mem_min,
+ .extra2 = &sysctl_fib_sync_mem_max,
+ },
};
static struct ctl_table ipv4_net_table[] = {
{
- .procname = "icmp_echo_ignore_all",
- .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all,
+ .procname = "tcp_max_tw_buckets",
+ .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{
+ .procname = "icmp_echo_ignore_all",
+ .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
+ },
+ {
+ .procname = "icmp_echo_enable_probe",
+ .data = &init_net.ipv4.sysctl_icmp_echo_enable_probe,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
+ },
+ {
.procname = "icmp_echo_ignore_broadcasts",
.data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
},
{
.procname = "icmp_ignore_bogus_error_responses",
.data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
},
{
.procname = "icmp_errors_use_inbound_ifaddr",
.data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
+ },
+ {
+ .procname = "icmp_errors_extension_mask",
+ .data = &init_net.ipv4.sysctl_icmp_errors_extension_mask,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &icmp_errors_extension_mask_all,
},
{
.procname = "icmp_ratelimit",
@@ -596,67 +700,125 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "icmp_msgs_per_sec",
+ .data = &init_net.ipv4.sysctl_icmp_msgs_per_sec,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "icmp_msgs_burst",
+ .data = &init_net.ipv4.sysctl_icmp_msgs_burst,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
.procname = "ping_group_range",
.data = &init_net.ipv4.ping_group_range.range,
.maxlen = sizeof(gid_t)*2,
.mode = 0644,
.proc_handler = ipv4_ping_group_range,
},
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ {
+ .procname = "raw_l3mdev_accept",
+ .data = &init_net.ipv4.sysctl_raw_l3mdev_accept,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+#endif
{
.procname = "tcp_ecn",
.data = &init_net.ipv4.sysctl_tcp_ecn,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &tcp_ecn_mode_max,
+ },
+ {
+ .procname = "tcp_ecn_option",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_option,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "tcp_ecn_option_beacon",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_THREE,
},
{
.procname = "tcp_ecn_fallback",
.data = &init_net.ipv4.sysctl_tcp_ecn_fallback,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "ip_dynaddr",
.data = &init_net.ipv4.sysctl_ip_dynaddr,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "ip_early_demux",
.data = &init_net.ipv4.sysctl_ip_early_demux,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "udp_early_demux",
.data = &init_net.ipv4.sysctl_udp_early_demux,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_udp_early_demux
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_early_demux",
.data = &init_net.ipv4.sysctl_tcp_early_demux,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "nexthop_compat_mode",
+ .data = &init_net.ipv4.sysctl_nexthop_compat_mode,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_tcp_early_demux
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "ip_default_ttl",
.data = &init_net.ipv4.sysctl_ip_default_ttl,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dou8vec_minmax,
.extra1 = &ip_ttl_min,
.extra2 = &ip_ttl_max,
},
{
.procname = "ip_local_port_range",
- .maxlen = sizeof(init_net.ipv4.ip_local_ports.range),
- .data = &init_net.ipv4.ip_local_ports.range,
+ .maxlen = 0,
+ .data = &init_net,
.mode = 0644,
.proc_handler = ipv4_local_port_range,
},
@@ -670,64 +832,73 @@ static struct ctl_table ipv4_net_table[] = {
{
.procname = "ip_no_pmtu_disc",
.data = &init_net.ipv4.sysctl_ip_no_pmtu_disc,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "ip_forward_use_pmtu",
.data = &init_net.ipv4.sysctl_ip_fwd_use_pmtu,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "ip_forward_update_priority",
.data = &init_net.ipv4.sysctl_ip_fwd_update_priority,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = ipv4_fwd_update_priority,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "ip_nonlocal_bind",
.data = &init_net.ipv4.sysctl_ip_nonlocal_bind,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "ip_autobind_reuse",
+ .data = &init_net.ipv4.sysctl_ip_autobind_reuse,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "fwmark_reflect",
.data = &init_net.ipv4.sysctl_fwmark_reflect,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_fwmark_accept",
.data = &init_net.ipv4.sysctl_tcp_fwmark_accept,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
},
#ifdef CONFIG_NET_L3_MASTER_DEV
{
.procname = "tcp_l3mdev_accept",
.data = &init_net.ipv4.sysctl_tcp_l3mdev_accept,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
{
.procname = "tcp_mtu_probing",
.data = &init_net.ipv4.sysctl_tcp_mtu_probing,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_base_mss",
@@ -737,6 +908,24 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "tcp_min_snd_mss",
+ .data = &init_net.ipv4.sysctl_tcp_min_snd_mss,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_min_snd_mss_min,
+ .extra2 = &tcp_min_snd_mss_max,
+ },
+ {
+ .procname = "tcp_mtu_probe_floor",
+ .data = &init_net.ipv4.sysctl_tcp_mtu_probe_floor,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_min_snd_mss_min,
+ .extra2 = &tcp_min_snd_mss_max,
+ },
+ {
.procname = "tcp_probe_threshold",
.data = &init_net.ipv4.sysctl_tcp_probe_threshold,
.maxlen = sizeof(int),
@@ -754,9 +943,9 @@ static struct ctl_table ipv4_net_table[] = {
{
.procname = "igmp_link_local_mcast_reports",
.data = &init_net.ipv4.sysctl_igmp_llm_reports,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "igmp_max_memberships",
@@ -779,7 +968,7 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one
+ .extra1 = SYSCTL_ONE
},
#endif
{
@@ -790,6 +979,18 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_tcp_congestion_control,
},
{
+ .procname = "tcp_available_congestion_control",
+ .maxlen = TCP_CA_BUF_MAX,
+ .mode = 0444,
+ .proc_handler = proc_tcp_available_congestion_control,
+ },
+ {
+ .procname = "tcp_allowed_congestion_control",
+ .maxlen = TCP_CA_BUF_MAX,
+ .mode = 0644,
+ .proc_handler = proc_allowed_congestion_control,
+ },
+ {
.procname = "tcp_keepalive_time",
.data = &init_net.ipv4.sysctl_tcp_keepalive_time,
.maxlen = sizeof(int),
@@ -799,9 +1000,9 @@ static struct ctl_table ipv4_net_table[] = {
{
.procname = "tcp_keepalive_probes",
.data = &init_net.ipv4.sysctl_tcp_keepalive_probes,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_keepalive_intvl",
@@ -813,29 +1014,38 @@ static struct ctl_table ipv4_net_table[] = {
{
.procname = "tcp_syn_retries",
.data = &init_net.ipv4.sysctl_tcp_syn_retries,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dou8vec_minmax,
.extra1 = &tcp_syn_retries_min,
.extra2 = &tcp_syn_retries_max
},
{
.procname = "tcp_synack_retries",
.data = &init_net.ipv4.sysctl_tcp_synack_retries,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
#ifdef CONFIG_SYN_COOKIES
{
.procname = "tcp_syncookies",
.data = &init_net.ipv4.sysctl_tcp_syncookies,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
#endif
{
+ .procname = "tcp_migrate_req",
+ .data = &init_net.ipv4.sysctl_tcp_migrate_req,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE
+ },
+ {
.procname = "tcp_reordering",
.data = &init_net.ipv4.sysctl_tcp_reordering,
.maxlen = sizeof(int),
@@ -845,24 +1055,24 @@ static struct ctl_table ipv4_net_table[] = {
{
.procname = "tcp_retries1",
.data = &init_net.ipv4.sysctl_tcp_retries1,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dou8vec_minmax,
.extra2 = &tcp_retr1_max
},
{
.procname = "tcp_retries2",
.data = &init_net.ipv4.sysctl_tcp_retries2,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_orphan_retries",
.data = &init_net.ipv4.sysctl_tcp_orphan_retries,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_fin_timeout",
@@ -881,18 +1091,20 @@ static struct ctl_table ipv4_net_table[] = {
{
.procname = "tcp_tw_reuse",
.data = &init_net.ipv4.sysctl_tcp_tw_reuse,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &two,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
},
{
- .procname = "tcp_max_tw_buckets",
- .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets,
- .maxlen = sizeof(int),
+ .procname = "tcp_tw_reuse_delay",
+ .data = &init_net.ipv4.sysctl_tcp_tw_reuse_delay,
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = &tcp_tw_reuse_delay_max,
},
{
.procname = "tcp_max_syn_backlog",
@@ -912,7 +1124,12 @@ static struct ctl_table ipv4_net_table[] = {
.procname = "tcp_fastopen_key",
.mode = 0600,
.data = &init_net.ipv4.sysctl_tcp_fastopen,
- .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10),
+ /* maxlen to print the list of keys in hex (*2), with dashes
+ * separating doublewords and a comma in between keys.
+ */
+ .maxlen = ((TCP_FASTOPEN_KEY_LENGTH *
+ 2 * TCP_FASTOPEN_KEY_MAX) +
+ (TCP_FASTOPEN_KEY_MAX * 5)),
.proc_handler = proc_tcp_fastopen_key,
},
{
@@ -921,26 +1138,42 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_tfo_blackhole_detect_timeout,
- .extra1 = &zero,
+ .extra1 = SYSCTL_ZERO,
},
#ifdef CONFIG_IP_ROUTE_MULTIPATH
{
.procname = "fib_multipath_use_neigh",
.data = &init_net.ipv4.sysctl_fib_multipath_use_neigh,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "fib_multipath_hash_policy",
.data = &init_net.ipv4.sysctl_fib_multipath_hash_policy,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_fib_multipath_hash_policy,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_THREE,
+ },
+ {
+ .procname = "fib_multipath_hash_fields",
+ .data = &init_net.ipv4.sysctl_fib_multipath_hash_fields,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_fib_multipath_hash_fields,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = &fib_multipath_hash_fields_all_mask,
+ },
+ {
+ .procname = "fib_multipath_hash_seed",
+ .data = &init_net,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_fib_multipath_hash_seed,
},
#endif
{
@@ -954,98 +1187,98 @@ static struct ctl_table ipv4_net_table[] = {
{
.procname = "udp_l3mdev_accept",
.data = &init_net.ipv4.sysctl_udp_l3mdev_accept,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
#endif
{
.procname = "tcp_sack",
.data = &init_net.ipv4.sysctl_tcp_sack,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_window_scaling",
.data = &init_net.ipv4.sysctl_tcp_window_scaling,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_timestamps",
.data = &init_net.ipv4.sysctl_tcp_timestamps,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_early_retrans",
.data = &init_net.ipv4.sysctl_tcp_early_retrans,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &four,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_FOUR,
},
{
.procname = "tcp_recovery",
.data = &init_net.ipv4.sysctl_tcp_recovery,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_thin_linear_timeouts",
.data = &init_net.ipv4.sysctl_tcp_thin_linear_timeouts,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_slow_start_after_idle",
.data = &init_net.ipv4.sysctl_tcp_slow_start_after_idle,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_retrans_collapse",
.data = &init_net.ipv4.sysctl_tcp_retrans_collapse,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_stdurg",
.data = &init_net.ipv4.sysctl_tcp_stdurg,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_rfc1337",
.data = &init_net.ipv4.sysctl_tcp_rfc1337,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_abort_on_overflow",
.data = &init_net.ipv4.sysctl_tcp_abort_on_overflow,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_fack",
.data = &init_net.ipv4.sysctl_tcp_fack,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_max_reordering",
@@ -1057,16 +1290,18 @@ static struct ctl_table ipv4_net_table[] = {
{
.procname = "tcp_dsack",
.data = &init_net.ipv4.sysctl_tcp_dsack,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_app_win",
.data = &init_net.ipv4.sysctl_tcp_app_win,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &tcp_app_win_max,
},
{
.procname = "tcp_adv_win_scale",
@@ -1080,37 +1315,55 @@ static struct ctl_table ipv4_net_table[] = {
{
.procname = "tcp_frto",
.data = &init_net.ipv4.sysctl_tcp_frto,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_no_metrics_save",
.data = &init_net.ipv4.sysctl_tcp_nometrics_save,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_no_ssthresh_metrics_save",
+ .data = &init_net.ipv4.sysctl_tcp_no_ssthresh_metrics_save,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "tcp_moderate_rcvbuf",
.data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_rcvbuf_low_rtt",
+ .data = &init_net.ipv4.sysctl_tcp_rcvbuf_low_rtt,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_INT_MAX,
},
{
.procname = "tcp_tso_win_divisor",
.data = &init_net.ipv4.sysctl_tcp_tso_win_divisor,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_workaround_signed_windows",
.data = &init_net.ipv4.sysctl_tcp_workaround_signed_windows,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_limit_output_bytes",
@@ -1129,27 +1382,35 @@ static struct ctl_table ipv4_net_table[] = {
{
.procname = "tcp_min_tso_segs",
.data = &init_net.ipv4.sysctl_tcp_min_tso_segs,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
- .extra2 = &gso_max_segs,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
+ {
+ .procname = "tcp_tso_rtt_log",
+ .data = &init_net.ipv4.sysctl_tcp_tso_rtt_log,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "tcp_min_rtt_wlen",
.data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &one_day_secs
},
{
.procname = "tcp_autocorking",
.data = &init_net.ipv4.sysctl_tcp_autocorking,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &one,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "tcp_invalid_ratelimit",
@@ -1164,8 +1425,8 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &thousand,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_THOUSAND,
},
{
.procname = "tcp_pacing_ca_ratio",
@@ -1173,8 +1434,8 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &thousand,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE_THOUSAND,
},
{
.procname = "tcp_wmem",
@@ -1182,7 +1443,7 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "tcp_rmem",
@@ -1190,7 +1451,7 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one,
+ .extra1 = SYSCTL_ONE,
},
{
.procname = "tcp_comp_sack_delay_ns",
@@ -1200,13 +1461,76 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_doulongvec_minmax,
},
{
- .procname = "tcp_comp_sack_nr",
- .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr,
+ .procname = "tcp_comp_sack_rtt_percent",
+ .data = &init_net.ipv4.sysctl_tcp_comp_sack_rtt_percent,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &zero,
- .extra2 = &comp_sack_nr_max,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_ONE_THOUSAND,
+ },
+ {
+ .procname = "tcp_comp_sack_slack_ns",
+ .data = &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns,
+ .maxlen = sizeof(unsigned long),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "tcp_comp_sack_nr",
+ .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
+ {
+ .procname = "tcp_backlog_ack_defer",
+ .data = &init_net.ipv4.sysctl_tcp_backlog_ack_defer,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "tcp_reflect_tos",
+ .data = &init_net.ipv4.sysctl_tcp_reflect_tos,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "tcp_ehash_entries",
+ .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries,
+ .mode = 0444,
+ .proc_handler = proc_tcp_ehash_entries,
+ },
+ {
+ .procname = "tcp_child_ehash_entries",
+ .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &tcp_child_ehash_entries_max,
+ },
+ {
+ .procname = "udp_hash_entries",
+ .data = &init_net.ipv4.sysctl_udp_child_hash_entries,
+ .mode = 0444,
+ .proc_handler = proc_udp_hash_entries,
+ },
+ {
+ .procname = "udp_child_hash_entries",
+ .data = &init_net.ipv4.sysctl_udp_child_hash_entries,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &udp_child_hash_entries_max,
},
{
.procname = "udp_rmem_min",
@@ -1214,7 +1538,7 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one
+ .extra1 = SYSCTL_ONE
},
{
.procname = "udp_wmem_min",
@@ -1222,13 +1546,106 @@ static struct ctl_table ipv4_net_table[] = {
.maxlen = sizeof(init_net.ipv4.sysctl_udp_wmem_min),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one
+ .extra1 = SYSCTL_ONE
+ },
+ {
+ .procname = "fib_notify_on_flag_change",
+ .data = &init_net.ipv4.sysctl_fib_notify_on_flag_change,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "tcp_plb_enabled",
+ .data = &init_net.ipv4.sysctl_tcp_plb_enabled,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "tcp_plb_idle_rehash_rounds",
+ .data = &init_net.ipv4.sysctl_tcp_plb_idle_rehash_rounds,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra2 = &tcp_plb_max_rounds,
+ },
+ {
+ .procname = "tcp_plb_rehash_rounds",
+ .data = &init_net.ipv4.sysctl_tcp_plb_rehash_rounds,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra2 = &tcp_plb_max_rounds,
+ },
+ {
+ .procname = "tcp_plb_suspend_rto_sec",
+ .data = &init_net.ipv4.sysctl_tcp_plb_suspend_rto_sec,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "tcp_plb_cong_thresh",
+ .data = &init_net.ipv4.sysctl_tcp_plb_cong_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &tcp_plb_max_cong_thresh,
+ },
+ {
+ .procname = "tcp_syn_linear_timeouts",
+ .data = &init_net.ipv4.sysctl_tcp_syn_linear_timeouts,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &tcp_syn_linear_timeouts_max,
+ },
+ {
+ .procname = "tcp_shrink_window",
+ .data = &init_net.ipv4.sysctl_tcp_shrink_window,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "tcp_pingpong_thresh",
+ .data = &init_net.ipv4.sysctl_tcp_pingpong_thresh,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
+ {
+ .procname = "tcp_rto_min_us",
+ .data = &init_net.ipv4.sysctl_tcp_rto_min_us,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ },
+ {
+ .procname = "tcp_rto_max_ms",
+ .data = &init_net.ipv4.sysctl_tcp_rto_max_ms,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE_THOUSAND,
+ .extra2 = &tcp_rto_max_max,
},
- { }
};
static __net_init int ipv4_sysctl_init_net(struct net *net)
{
+ size_t table_size = ARRAY_SIZE(ipv4_net_table);
struct ctl_table *table;
table = ipv4_net_table;
@@ -1239,12 +1656,23 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
if (!table)
goto err_alloc;
- /* Update the variables to point into the current struct net */
- for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++)
- table[i].data += (void *)net - (void *)&init_net;
+ for (i = 0; i < table_size; i++) {
+ if (table[i].data) {
+ /* Update the variables to point into
+ * the current struct net
+ */
+ table[i].data += (void *)net - (void *)&init_net;
+ } else {
+ /* Entries without data pointer are global;
+ * Make them read-only in non-init_net ns
+ */
+ table[i].mode &= ~0222;
+ }
+ }
}
- net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
+ net->ipv4.ipv4_hdr = register_net_sysctl_sz(net, "net/ipv4", table,
+ table_size);
if (!net->ipv4.ipv4_hdr)
goto err_reg;
@@ -1252,6 +1680,8 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
if (!net->ipv4.sysctl_local_reserved_ports)
goto err_ports;
+ proc_fib_multipath_hash_set_seed(net, 0);
+
return 0;
err_ports:
@@ -1265,7 +1695,7 @@ err_alloc:
static __net_exit void ipv4_sysctl_exit_net(struct net *net)
{
- struct ctl_table *table;
+ const struct ctl_table *table;
kfree(net->ipv4.sysctl_local_reserved_ports);
table = net->ipv4.ipv4_hdr->ctl_table_arg;
@@ -1286,6 +1716,8 @@ static __init int sysctl_ipv4_init(void)
if (!hdr)
return -ENOMEM;
+ proc_fib_multipath_hash_init_rand_seed();
+
if (register_pernet_subsys(&ipv4_sysctl_ops)) {
unregister_net_sysctl_table(hdr);
return -ENOMEM;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 10c6246396cc..f035440c475a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -205,11 +206,6 @@
* Hirokazu Takahashi : Use copy_from_user() instead of
* csum_and_copy_from_user() if possible.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or(at your option) any later version.
- *
* Description of States:
*
* TCP_SYN_SENT sent a connection request, waiting for ack
@@ -247,7 +243,7 @@
#define pr_fmt(fmt) "TCP: " fmt
-#include <crypto/hash.h>
+#include <crypto/md5.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -257,40 +253,59 @@
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/skbuff.h>
-#include <linux/scatterlist.h>
#include <linux/splice.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/random.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/highmem.h>
-#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/errqueue.h>
#include <linux/static_key.h>
+#include <linux/btf.h>
#include <net/icmp.h>
#include <net/inet_common.h>
+#include <net/inet_ecn.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
+#include <net/mptcp.h>
+#include <net/proto_memory.h>
#include <net/xfrm.h>
#include <net/ip.h>
+#include <net/psp.h>
#include <net/sock.h>
+#include <net/rstreason.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>
+#include <net/hotdata.h>
+#include <trace/events/tcp.h>
+#include <net/rps.h>
+
+#include "../core/devmem.h"
+
+/* Track pending CMSGs; each value is a distinct bit (presumably OR-ed into a mask). */
+enum {
+ TCP_CMSG_INQ = 1,
+ TCP_CMSG_TS = 2
+};
-struct percpu_counter tcp_orphan_count;
-EXPORT_SYMBOL_GPL(tcp_orphan_count);
+DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
+
+DEFINE_PER_CPU(u32, tcp_tw_isn);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn);
long sysctl_tcp_mem[3] __read_mostly;
-EXPORT_SYMBOL(sysctl_tcp_mem);
+EXPORT_IPV6_MOD(sysctl_tcp_mem);
-atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
-EXPORT_SYMBOL(tcp_memory_allocated);
+DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc);
#if IS_ENABLED(CONFIG_SMC)
DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
@@ -300,8 +315,8 @@ EXPORT_SYMBOL(tcp_have_smc);
/*
* Current number of TCP sockets.
*/
-struct percpu_counter tcp_sockets_allocated;
-EXPORT_SYMBOL(tcp_sockets_allocated);
+struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
+EXPORT_IPV6_MOD(tcp_sockets_allocated);
/*
* TCP splice context
@@ -325,7 +340,7 @@ void tcp_enter_memory_pressure(struct sock *sk)
{
unsigned long val;
- if (tcp_memory_pressure)
+ if (READ_ONCE(tcp_memory_pressure))
return;
val = jiffies;
@@ -334,20 +349,20 @@ void tcp_enter_memory_pressure(struct sock *sk)
if (!cmpxchg(&tcp_memory_pressure, 0, val))
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES);
}
-EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure);
+EXPORT_IPV6_MOD_GPL(tcp_enter_memory_pressure);
void tcp_leave_memory_pressure(struct sock *sk)
{
unsigned long val;
- if (!tcp_memory_pressure)
+ if (!READ_ONCE(tcp_memory_pressure))
return;
val = xchg(&tcp_memory_pressure, 0);
if (val)
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO,
jiffies_to_msecs(jiffies - val));
}
-EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure);
+EXPORT_IPV6_MOD_GPL(tcp_leave_memory_pressure);
/* Convert seconds to retransmits based on initial and max timeout */
static u8 secs_to_retrans(int seconds, int timeout, int rto_max)
@@ -399,6 +414,21 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
return rate64;
}
+#ifdef CONFIG_TCP_MD5SIG
+void tcp_md5_destruct_sock(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->md5sig_info) {
+ /* Clear the list before detaching and freeing md5sig_info, then drop tcp_md5_needed. */
+ tcp_clear_md5_list(sk);
+ kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1));
+ static_branch_slow_dec_deferred(&tcp_md5_needed);
+ }
+}
+EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock);
+#endif
+
/* Address-family independent initialization for a tcp_sock.
*
* NOTE: A lot of things set to zero explicitly by call to
@@ -408,6 +438,7 @@ void tcp_init_sock(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ int rto_min_us, rto_max_ms;
tp->out_of_order_queue = RB_ROOT;
sk->tcp_rtx_queue = RB_ROOT;
@@ -416,6 +447,13 @@ void tcp_init_sock(struct sock *sk)
INIT_LIST_HEAD(&tp->tsorted_sent_queue);
icsk->icsk_rto = TCP_TIMEOUT_INIT;
+
+ rto_max_ms = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_max_ms);
+ icsk->icsk_rto_max = msecs_to_jiffies(rto_max_ms);
+
+ rto_min_us = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rto_min_us);
+ icsk->icsk_rto_min = usecs_to_jiffies(rto_min_us);
+ icsk->icsk_delack_max = TCP_DELACK_MAX;
tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
@@ -424,10 +462,11 @@ void tcp_init_sock(struct sock *sk)
* algorithms that we must have the following bandaid to talk
* efficiently to them. -DaveM
*/
- tp->snd_cwnd = TCP_INIT_CWND;
+ tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
/* There's a bubble in the pipe until at least the first ACK. */
tp->app_limited = ~0U;
+ tp->rate_app_limited = 1;
/* See draft-stevens-tcpca-spec-01 for discussion of the
* initialization of these values.
@@ -436,61 +475,53 @@ void tcp_init_sock(struct sock *sk)
tp->snd_cwnd_clamp = ~0;
tp->mss_cache = TCP_MSS_DEFAULT;
- tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
+ tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering);
tcp_assign_congestion_control(sk);
tp->tsoffset = 0;
tp->rack.reo_wnd_steps = 1;
- sk->sk_state = TCP_CLOSE;
-
sk->sk_write_space = sk_stream_write_space;
sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
icsk->icsk_sync_mss = tcp_sync_mss;
- sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
- sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
+ WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1]));
+ WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1]));
+ tcp_scaling_ratio_init(sk);
+ set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
sk_sockets_allocated_inc(sk);
- sk->sk_route_forced_caps = NETIF_F_GSO;
-}
-EXPORT_SYMBOL(tcp_init_sock);
-
-void tcp_init_transfer(struct sock *sk, int bpf_op)
-{
- struct inet_connection_sock *icsk = inet_csk(sk);
-
- tcp_mtup_init(sk);
- icsk->icsk_af_ops->rebuild_header(sk);
- tcp_init_metrics(sk);
- tcp_call_bpf(sk, bpf_op, 0, NULL);
- tcp_init_congestion_control(sk);
- tcp_init_buffer_space(sk);
+ xa_init_flags(&sk->sk_user_frags, XA_FLAGS_ALLOC1);
}
+EXPORT_IPV6_MOD(tcp_init_sock);
-static void tcp_tx_timestamp(struct sock *sk, u16 tsflags)
+static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
{
struct sk_buff *skb = tcp_write_queue_tail(sk);
+ u32 tsflags = sockc->tsflags;
if (tsflags && skb) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
- sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags);
+ sock_tx_timestamp(sk, sockc, &shinfo->tx_flags);
if (tsflags & SOF_TIMESTAMPING_TX_ACK)
- tcb->txstamp_ack = 1;
+ tcb->txstamp_ack |= TSTAMP_ACK_SK;
if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK)
shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1;
}
+
+ if (cgroup_bpf_enabled(CGROUP_SOCK_OPS) &&
+ SK_BPF_CB_FLAG_TEST(sk, SK_BPF_CB_TX_TIMESTAMPING) && skb)
+ bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB);
}
-static inline bool tcp_stream_is_readable(const struct tcp_sock *tp,
- int target, struct sock *sk)
+static bool tcp_stream_is_readable(struct sock *sk, int target)
{
- return (tp->rcv_nxt - tp->copied_seq >= target) ||
- (sk->sk_prot->stream_memory_read ?
- sk->sk_prot->stream_memory_read(sk) : false);
+ if (tcp_epollin_ready(sk, target))
+ return true;
+ return sk_is_readable(sk);
}
/*
@@ -505,9 +536,10 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
__poll_t mask;
struct sock *sk = sock->sk;
const struct tcp_sock *tp = tcp_sk(sk);
+ u8 shutdown;
int state;
- sock_poll_wait(file, wait);
+ sock_poll_wait(file, sock, wait);
state = inet_sk_state_load(sk);
if (state == TCP_LISTEN)
@@ -547,26 +579,28 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
* blocking on fresh not-connected or disconnected socket. --ANK
*/
- if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
+ shutdown = READ_ONCE(sk->sk_shutdown);
+ if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
mask |= EPOLLHUP;
- if (sk->sk_shutdown & RCV_SHUTDOWN)
+ if (shutdown & RCV_SHUTDOWN)
mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
/* Connected or passive Fast Open socket? */
if (state != TCP_SYN_SENT &&
- (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
+ (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) {
int target = sock_rcvlowat(sk, 0, INT_MAX);
+ u16 urg_data = READ_ONCE(tp->urg_data);
- if (tp->urg_seq == tp->copied_seq &&
- !sock_flag(sk, SOCK_URGINLINE) &&
- tp->urg_data)
+ if (unlikely(urg_data) &&
+ READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) &&
+ !sock_flag(sk, SOCK_URGINLINE))
target++;
- if (tcp_stream_is_readable(tp, target, sk))
+ if (tcp_stream_is_readable(sk, target))
mask |= EPOLLIN | EPOLLRDNORM;
- if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
- if (sk_stream_is_writeable(sk)) {
+ if (!(shutdown & SEND_SHUTDOWN)) {
+ if (__sk_stream_is_writeable(sk, 1)) {
mask |= EPOLLOUT | EPOLLWRNORM;
} else { /* send SIGIO later */
sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
@@ -578,31 +612,33 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
* pairs with the input side.
*/
smp_mb__after_atomic();
- if (sk_stream_is_writeable(sk))
+ if (__sk_stream_is_writeable(sk, 1))
mask |= EPOLLOUT | EPOLLWRNORM;
}
} else
mask |= EPOLLOUT | EPOLLWRNORM;
- if (tp->urg_data & TCP_URG_VALID)
+ if (urg_data & TCP_URG_VALID)
mask |= EPOLLPRI;
- } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+ } else if (state == TCP_SYN_SENT &&
+ inet_test_bit(DEFER_CONNECT, sk)) {
/* Active TCP fastopen socket with defer_connect
* Return EPOLLOUT so application can call write()
* in order for kernel to generate SYN+data
*/
mask |= EPOLLOUT | EPOLLWRNORM;
}
- /* This barrier is coupled with smp_wmb() in tcp_reset() */
+ /* This barrier is coupled with smp_wmb() in tcp_done_with_error() */
smp_rmb();
- if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
+ if (READ_ONCE(sk->sk_err) ||
+ !skb_queue_empty_lockless(&sk->sk_error_queue))
mask |= EPOLLERR;
return mask;
}
EXPORT_SYMBOL(tcp_poll);
-int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+int tcp_ioctl(struct sock *sk, int cmd, int *karg)
{
struct tcp_sock *tp = tcp_sk(sk);
int answ;
@@ -618,7 +654,8 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
unlock_sock_fast(sk, slow);
break;
case SIOCATMARK:
- answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
+ answ = READ_ONCE(tp->urg_data) &&
+ READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq);
break;
case SIOCOUTQ:
if (sk->sk_state == TCP_LISTEN)
@@ -627,7 +664,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
answ = 0;
else
- answ = tp->write_seq - tp->snd_una;
+ answ = READ_ONCE(tp->write_seq) - tp->snd_una;
break;
case SIOCOUTQNSD:
if (sk->sk_state == TCP_LISTEN)
@@ -636,17 +673,19 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
answ = 0;
else
- answ = tp->write_seq - tp->snd_nxt;
+ answ = READ_ONCE(tp->write_seq) -
+ READ_ONCE(tp->snd_nxt);
break;
default:
return -ENOIOCTLCMD;
}
- return put_user(answ, (int __user *)arg);
+ *karg = answ;
+ return 0;
}
-EXPORT_SYMBOL(tcp_ioctl);
+EXPORT_IPV6_MOD(tcp_ioctl);
-static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
+void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
{
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
tp->pushed_seq = tp->write_seq;
@@ -657,18 +696,17 @@ static inline bool forced_push(const struct tcp_sock *tp)
return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
}
-static void skb_entail(struct sock *sk, struct sk_buff *skb)
+void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
- skb->csum = 0;
tcb->seq = tcb->end_seq = tp->write_seq;
tcb->tcp_flags = TCPHDR_ACK;
- tcb->sacked = 0;
__skb_header_release(skb);
+ psp_enqueue_set_decrypted(sk, skb);
tcp_add_write_queue_tail(sk, skb);
- sk->sk_wmem_queued += skb->truesize;
+ sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
if (tp->nonagle & TCP_NAGLE_PUSH)
tp->nonagle &= ~TCP_NAGLE_PUSH;
@@ -696,13 +734,14 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb,
int size_goal)
{
return skb->len < size_goal &&
- sock_net(sk)->ipv4.sysctl_tcp_autocorking &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) &&
!tcp_rtx_queue_empty(sk) &&
- refcount_read(&sk->sk_wmem_alloc) > skb->truesize;
+ refcount_read(&sk->sk_wmem_alloc) > skb->truesize &&
+ tcp_skb_can_collapse_to(skb);
}
-static void tcp_push(struct sock *sk, int flags, int mss_now,
- int nonagle, int size_goal)
+void tcp_push(struct sock *sk, int flags, int mss_now,
+ int nonagle, int size_goal)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
@@ -721,6 +760,7 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
+ smp_mb__after_atomic();
}
/* It is possible TX completion already happened
* before we set TSQ_THROTTLED.
@@ -830,7 +870,9 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
*/
if (!skb_queue_empty(&sk->sk_receive_queue))
break;
- sk_wait_data(sk, &timeo, NULL);
+ ret = sk_wait_data(sk, &timeo, NULL);
+ if (ret < 0)
+ break;
if (signal_pending(current)) {
ret = sock_intr_errno(timeo);
break;
@@ -840,7 +882,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
tss.len -= ret;
spliced += ret;
- if (!timeo)
+ if (!tss.len || !timeo)
break;
release_sock(sk);
lock_sock(sk);
@@ -858,23 +900,18 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
return ret;
}
-EXPORT_SYMBOL(tcp_splice_read);
+EXPORT_IPV6_MOD(tcp_splice_read);
-struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
- bool force_schedule)
+struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
+ bool force_schedule)
{
struct sk_buff *skb;
- /* The TCP header must be at least 32-bit aligned. */
- size = ALIGN(size, 4);
-
- if (unlikely(tcp_under_memory_pressure(sk)))
- sk_mem_reclaim_partial(sk);
-
- skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
+ skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp);
if (likely(skb)) {
bool mem_scheduled;
+ skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
if (force_schedule) {
mem_scheduled = true;
sk_forced_mem_schedule(sk, skb->truesize);
@@ -882,18 +919,15 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
mem_scheduled = sk_wmem_schedule(sk, skb->truesize);
}
if (likely(mem_scheduled)) {
- skb_reserve(skb, sk->sk_prot->max_header);
- /*
- * Make sure that we have exactly size bytes
- * available to the caller, no more, no less.
- */
- skb->reserved_tailroom = skb->end - skb->tail - size;
+ skb_reserve(skb, MAX_TCP_HEADER);
+ skb->ip_summed = CHECKSUM_PARTIAL;
INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
return skb;
}
__kfree_skb(skb);
} else {
- sk->sk_prot->enter_memory_pressure(sk);
+ if (!sk->sk_bypass_prot_mem)
+ tcp_enter_memory_pressure(sk);
sk_stream_moderate_sndbuf(sk);
}
return NULL;
@@ -909,8 +943,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
return mss_now;
/* Note : tcp_tso_autosize() will eventually split this later */
- new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER;
- new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal);
+ new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size);
/* We try hard to avoid divides here */
size_goal = tp->gso_segs * mss_now;
@@ -924,7 +957,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
return max(size_goal, mss_now);
}
-static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
+int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
{
int mss_now;
@@ -934,188 +967,56 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
return mss_now;
}
-ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
+/* In some cases, sendmsg() could have added an skb to the write queue,
+ * but failed adding payload on it. We need to remove it to consume less
+ * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger
+ * epoll() users. Another reason is that tcp_write_xmit() does not like
+ * finding an empty skb in the write queue.
+ */
+void tcp_remove_empty_skb(struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
- int mss_now, size_goal;
- int err;
- ssize_t copied;
- long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
-
- /* Wait for a connection to finish. One exception is TCP Fast Open
- * (passive side) where data is allowed to be sent before a connection
- * is fully established.
- */
- if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
- !tcp_passive_fastopen(sk)) {
- err = sk_stream_wait_connect(sk, &timeo);
- if (err != 0)
- goto out_err;
- }
-
- sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
-
- mss_now = tcp_send_mss(sk, &size_goal, flags);
- copied = 0;
-
- err = -EPIPE;
- if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
- goto out_err;
-
- while (size > 0) {
- struct sk_buff *skb = tcp_write_queue_tail(sk);
- int copy, i;
- bool can_coalesce;
-
- if (!skb || (copy = size_goal - skb->len) <= 0 ||
- !tcp_skb_can_collapse_to(skb)) {
-new_segment:
- if (!sk_stream_memory_free(sk))
- goto wait_for_sndbuf;
-
- skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
- tcp_rtx_and_write_queues_empty(sk));
- if (!skb)
- goto wait_for_memory;
-
- skb_entail(sk, skb);
- copy = size_goal;
- }
-
- if (copy > size)
- copy = size;
-
- i = skb_shinfo(skb)->nr_frags;
- can_coalesce = skb_can_coalesce(skb, i, page, offset);
- if (!can_coalesce && i >= sysctl_max_skb_frags) {
- tcp_mark_push(tp, skb);
- goto new_segment;
- }
- if (!sk_wmem_schedule(sk, copy))
- goto wait_for_memory;
-
- if (can_coalesce) {
- skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
- } else {
- get_page(page);
- skb_fill_page_desc(skb, i, page, offset, copy);
- }
-
- if (!(flags & MSG_NO_SHARED_FRAGS))
- skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
-
- skb->len += copy;
- skb->data_len += copy;
- skb->truesize += copy;
- sk->sk_wmem_queued += copy;
- sk_mem_charge(sk, copy);
- skb->ip_summed = CHECKSUM_PARTIAL;
- tp->write_seq += copy;
- TCP_SKB_CB(skb)->end_seq += copy;
- tcp_skb_pcount_set(skb, 0);
-
- if (!copied)
- TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
-
- copied += copy;
- offset += copy;
- size -= copy;
- if (!size)
- goto out;
-
- if (skb->len < size_goal || (flags & MSG_OOB))
- continue;
-
- if (forced_push(tp)) {
- tcp_mark_push(tp, skb);
- __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
- } else if (skb == tcp_send_head(sk))
- tcp_push_one(sk, mss_now);
- continue;
-
-wait_for_sndbuf:
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
- tcp_push(sk, flags & ~MSG_MORE, mss_now,
- TCP_NAGLE_PUSH, size_goal);
-
- err = sk_stream_wait_memory(sk, &timeo);
- if (err != 0)
- goto do_error;
-
- mss_now = tcp_send_mss(sk, &size_goal, flags);
- }
-
-out:
- if (copied) {
- tcp_tx_timestamp(sk, sk->sk_tsflags);
- if (!(flags & MSG_SENDPAGE_NOTLAST))
- tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
- }
- return copied;
+ struct sk_buff *skb = tcp_write_queue_tail(sk);
-do_error:
- if (copied)
- goto out;
-out_err:
- /* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
- err == -EAGAIN)) {
- sk->sk_write_space(sk);
- tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+ if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
+ tcp_unlink_write_queue(skb, sk);
+ if (tcp_write_queue_empty(sk))
+ tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
+ tcp_wmem_free_skb(sk, skb);
}
- return sk_stream_error(sk, flags, err);
}
-EXPORT_SYMBOL_GPL(do_tcp_sendpages);
-int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
+/* skb changing from pure zerocopy to mixed: charge truesize beyond the head, else -ENOMEM */
+static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb)
{
- if (!(sk->sk_route_caps & NETIF_F_SG))
- return sock_no_sendpage_locked(sk, page, offset, size, flags);
+ if (unlikely(skb_zcopy_pure(skb))) {
+ u32 extra = skb->truesize -
+ SKB_TRUESIZE(skb_end_offset(skb));
- tcp_rate_check_app_limited(sk); /* is sending application-limited? */
+ if (!sk_wmem_schedule(sk, extra))
+ return -ENOMEM;
- return do_tcp_sendpages(sk, page, offset, size, flags);
+ sk_mem_charge(sk, extra);
+ skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY;
+ }
+ return 0;
}
-EXPORT_SYMBOL_GPL(tcp_sendpage_locked);
-int tcp_sendpage(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
-{
- int ret;
- lock_sock(sk);
- ret = tcp_sendpage_locked(sk, page, offset, size, flags);
- release_sock(sk);
-
- return ret;
-}
-EXPORT_SYMBOL(tcp_sendpage);
-
-/* Do not bother using a page frag for very small frames.
- * But use this heuristic only for the first skb in write queue.
- *
- * Having no payload in skb->head allows better SACK shifting
- * in tcp_shift_skb_data(), reducing sack/rack overhead, because
- * write queue has less skbs.
- * Each skb can hold up to MAX_SKB_FRAGS * 32Kbytes, or ~0.5 MB.
- * This also speeds up tso_fragment(), since it wont fallback
- * to tcp_fragment().
- */
-static int linear_payload_sz(bool first_skb)
+int tcp_wmem_schedule(struct sock *sk, int copy)
{
- if (first_skb)
- return SKB_WITH_OVERHEAD(2048 - MAX_TCP_HEADER);
- return 0;
-}
+ int left;
-static int select_size(bool first_skb, bool zc)
-{
- if (zc)
- return 0;
- return linear_payload_sz(first_skb);
+ if (likely(sk_wmem_schedule(sk, copy)))
+ return copy;
+
+ /* We could be in trouble if we have nothing queued.
+ * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0]
+ * to guarantee some progress.
+ */
+ left = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]) - sk->sk_wmem_queued;
+ if (left > 0)
+ sk_forced_mem_schedule(sk, min(left, copy));
+ return min(copy, sk->sk_forward_alloc);
}
void tcp_free_fastopen_req(struct tcp_sock *tp)
@@ -1126,15 +1027,16 @@ void tcp_free_fastopen_req(struct tcp_sock *tp)
}
}
-static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
- int *copied, size_t size)
+int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
+ size_t size, struct ubuf_info *uarg)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_sock *inet = inet_sk(sk);
struct sockaddr *uaddr = msg->msg_name;
int err, flags;
- if (!(sock_net(sk)->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) ||
+ if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) &
+ TFO_CLIENT_ENABLE) ||
(uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) &&
uaddr->sa_family == AF_UNSPEC))
return -EOPNOTSUPP;
@@ -1147,8 +1049,9 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
return -ENOBUFS;
tp->fastopen_req->data = msg;
tp->fastopen_req->size = size;
+ tp->fastopen_req->uarg = uarg;
- if (inet->defer_connect) {
+ if (inet_test_bit(DEFER_CONNECT, sk)) {
err = tcp_connect(sk);
/* Same failure procedure as in tcp_v4/6_connect */
if (err) {
@@ -1158,7 +1061,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
}
}
flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
- err = __inet_stream_connect(sk->sk_socket, uaddr,
+ err = __inet_stream_connect(sk->sk_socket, (struct sockaddr_unsized *)uaddr,
msg->msg_namelen, flags, 1);
/* fastopen_req could already be freed in __inet_stream_connect
* if the connection times out or gets rst
@@ -1166,46 +1069,77 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
if (tp->fastopen_req) {
*copied = tp->fastopen_req->copied;
tcp_free_fastopen_req(tp);
- inet->defer_connect = 0;
+ inet_clear_bit(DEFER_CONNECT, sk);
}
return err;
}
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
+ struct net_devmem_dmabuf_binding *binding = NULL;
struct tcp_sock *tp = tcp_sk(sk);
struct ubuf_info *uarg = NULL;
struct sk_buff *skb;
struct sockcm_cookie sockc;
int flags, err, copied = 0;
int mss_now = 0, size_goal, copied_syn = 0;
- bool process_backlog = false;
- bool zc = false;
+ int process_backlog = 0;
+ int sockc_err = 0;
+ int zc = 0;
long timeo;
flags = msg->msg_flags;
- if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
- if (sk->sk_state != TCP_ESTABLISHED) {
- err = -EINVAL;
- goto out_err;
- }
+ sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) };
+ if (msg->msg_controllen) {
+ sockc_err = sock_cmsg_send(sk, msg, &sockc);
+ /* Don't return error until MSG_FASTOPEN has been processed;
+ * that may succeed even if the cmsg is invalid.
+ */
+ }
- skb = tcp_write_queue_tail(sk);
- uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
- if (!uarg) {
- err = -ENOBUFS;
- goto out_err;
+ if ((flags & MSG_ZEROCOPY) && size) {
+ if (msg->msg_ubuf) {
+ uarg = msg->msg_ubuf;
+ if (sk->sk_route_caps & NETIF_F_SG)
+ zc = MSG_ZEROCOPY;
+ } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
+ skb = tcp_write_queue_tail(sk);
+ uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb),
+ !sockc_err && sockc.dmabuf_id);
+ if (!uarg) {
+ err = -ENOBUFS;
+ goto out_err;
+ }
+ if (sk->sk_route_caps & NETIF_F_SG)
+ zc = MSG_ZEROCOPY;
+ else
+ uarg_to_msgzc(uarg)->zerocopy = 0;
+
+ if (!sockc_err && sockc.dmabuf_id) {
+ binding = net_devmem_get_binding(sk, sockc.dmabuf_id);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ binding = NULL;
+ goto out_err;
+ }
+ }
}
+ } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
+ if (sk->sk_route_caps & NETIF_F_SG)
+ zc = MSG_SPLICE_PAGES;
+ }
- zc = sk->sk_route_caps & NETIF_F_SG;
- if (!zc)
- uarg->zerocopy = 0;
+ if (!sockc_err && sockc.dmabuf_id &&
+ (!(flags & MSG_ZEROCOPY) || !sock_flag(sk, SOCK_ZEROCOPY))) {
+ err = -EINVAL;
+ goto out_err;
}
- if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
+ if (unlikely(flags & MSG_FASTOPEN ||
+ inet_test_bit(DEFER_CONNECT, sk)) &&
!tp->repair) {
- err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
+ err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg);
if (err == -EINPROGRESS && copied_syn > 0)
goto out;
else if (err)
@@ -1240,13 +1174,9 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
/* 'common' sending to sendq */
}
- sockcm_init(&sockc, sk);
- if (msg->msg_controllen) {
- err = sock_cmsg_send(sk, msg, &sockc);
- if (unlikely(err)) {
- err = -EINVAL;
- goto out_err;
- }
+ if (sockc_err) {
+ err = sockc_err;
+ goto out_err;
}
/* This should be in poll */
@@ -1269,33 +1199,36 @@ restart:
if (skb)
copy = size_goal - skb->len;
+ trace_tcp_sendmsg_locked(sk, msg, skb, size_goal);
+
if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
bool first_skb;
- int linear;
new_segment:
if (!sk_stream_memory_free(sk))
- goto wait_for_sndbuf;
+ goto wait_for_space;
- if (process_backlog && sk_flush_backlog(sk)) {
- process_backlog = false;
- goto restart;
+ if (unlikely(process_backlog >= 16)) {
+ process_backlog = 0;
+ if (sk_flush_backlog(sk))
+ goto restart;
}
first_skb = tcp_rtx_and_write_queues_empty(sk);
- linear = select_size(first_skb, zc);
- skb = sk_stream_alloc_skb(sk, linear, sk->sk_allocation,
- first_skb);
+ skb = tcp_stream_alloc_skb(sk, sk->sk_allocation,
+ first_skb);
if (!skb)
- goto wait_for_memory;
+ goto wait_for_space;
- process_backlog = true;
- skb->ip_summed = CHECKSUM_PARTIAL;
+ process_backlog++;
- skb_entail(sk, skb);
+#ifdef CONFIG_SKB_DECRYPTED
+ skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
+#endif
+ tcp_skb_entail(sk, skb);
copy = size_goal;
/* All packets are restored as if they have
- * already been sent. skb_mstamp isn't set to
+ * already been sent. skb_mstamp_ns isn't set to
* avoid wrong rtt estimation.
*/
if (tp->repair)
@@ -1306,24 +1239,17 @@ new_segment:
if (copy > msg_data_left(msg))
copy = msg_data_left(msg);
- /* Where to copy to? */
- if (skb_availroom(skb) > 0 && !zc) {
- /* We have some space in skb head. Superb! */
- copy = min_t(int, copy, skb_availroom(skb));
- err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
- if (err)
- goto do_fault;
- } else if (!zc) {
+ if (zc == 0) {
bool merge = true;
int i = skb_shinfo(skb)->nr_frags;
struct page_frag *pfrag = sk_page_frag(sk);
if (!sk_page_frag_refill(sk, pfrag))
- goto wait_for_memory;
+ goto wait_for_space;
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
- if (i >= sysctl_max_skb_frags) {
+ if (i >= READ_ONCE(net_hotdata.sysctl_max_skb_frags)) {
tcp_mark_push(tp, skb);
goto new_segment;
}
@@ -1332,8 +1258,15 @@ new_segment:
copy = min_t(int, copy, pfrag->size - pfrag->offset);
- if (!sk_wmem_schedule(sk, copy))
- goto wait_for_memory;
+ if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) {
+ if (tcp_downgrade_zcopy_pure(sk, skb))
+ goto wait_for_space;
+ skb_zcopy_downgrade_managed(skb);
+ }
+
+ copy = tcp_wmem_schedule(sk, copy);
+ if (!copy)
+ goto wait_for_space;
err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
pfrag->page,
@@ -1351,8 +1284,21 @@ new_segment:
page_ref_inc(pfrag->page);
}
pfrag->offset += copy;
- } else {
- err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg);
+ } else if (zc == MSG_ZEROCOPY) {
+ /* First append to a fragless skb builds initial
+ * pure zerocopy skb
+ */
+ if (!skb->len)
+ skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY;
+
+ if (!skb_zcopy_pure(skb)) {
+ copy = tcp_wmem_schedule(sk, copy);
+ if (!copy)
+ goto wait_for_space;
+ }
+
+ err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg,
+ binding);
if (err == -EMSGSIZE || err == -EEXIST) {
tcp_mark_push(tp, skb);
goto new_segment;
@@ -1360,12 +1306,35 @@ new_segment:
if (err < 0)
goto do_error;
copy = err;
+ } else if (zc == MSG_SPLICE_PAGES) {
+ /* Splice in data if we can; copy if we can't. */
+ if (tcp_downgrade_zcopy_pure(sk, skb))
+ goto wait_for_space;
+ copy = tcp_wmem_schedule(sk, copy);
+ if (!copy)
+ goto wait_for_space;
+
+ err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
+ if (err < 0) {
+ if (err == -EMSGSIZE) {
+ tcp_mark_push(tp, skb);
+ goto new_segment;
+ }
+ goto do_error;
+ }
+ copy = err;
+
+ if (!(flags & MSG_NO_SHARED_FRAGS))
+ skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
+
+ sk_wmem_queued_add(sk, copy);
+ sk_mem_charge(sk, copy);
}
if (!copied)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
- tp->write_seq += copy;
+ WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
TCP_SKB_CB(skb)->end_seq += copy;
tcp_skb_pcount_set(skb, 0);
@@ -1386,9 +1355,9 @@ new_segment:
tcp_push_one(sk, mss_now);
continue;
-wait_for_sndbuf:
+wait_for_space:
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-wait_for_memory:
+ tcp_remove_empty_skb(sk);
if (copied)
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
@@ -1402,35 +1371,35 @@ wait_for_memory:
out:
if (copied) {
- tcp_tx_timestamp(sk, sockc.tsflags);
+ tcp_tx_timestamp(sk, &sockc);
tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
}
out_nopush:
- sock_zerocopy_put(uarg);
+ /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
+ if (uarg && !msg->msg_ubuf)
+ net_zcopy_put(uarg);
+ if (binding)
+ net_devmem_dmabuf_binding_put(binding);
return copied + copied_syn;
-do_fault:
- if (!skb->len) {
- tcp_unlink_write_queue(skb, sk);
- /* It is the one place in all of TCP, except connection
- * reset, where we can be unlinking the send_head.
- */
- tcp_check_send_head(sk, skb);
- sk_wmem_free_skb(sk, skb);
- }
-
do_error:
+ tcp_remove_empty_skb(sk);
+
if (copied + copied_syn)
goto out;
out_err:
- sock_zerocopy_put_abort(uarg);
+ /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */
+ if (uarg && !msg->msg_ubuf)
+ net_zcopy_put_abort(uarg, true);
err = sk_stream_error(sk, flags, err);
/* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
- err == -EAGAIN)) {
+ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
sk->sk_write_space(sk);
tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
+ if (binding)
+ net_devmem_dmabuf_binding_put(binding);
+
return err;
}
EXPORT_SYMBOL_GPL(tcp_sendmsg_locked);
@@ -1447,6 +1416,22 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
}
EXPORT_SYMBOL(tcp_sendmsg);
+void tcp_splice_eof(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct tcp_sock *tp = tcp_sk(sk);
+ int mss_now, size_goal;
+
+ if (!tcp_write_queue_tail(sk))
+ return;
+
+ lock_sock(sk);
+ mss_now = tcp_send_mss(sk, &size_goal, 0);
+ tcp_push(sk, 0, mss_now, tp->nonagle, size_goal);
+ release_sock(sk);
+}
+EXPORT_IPV6_MOD_GPL(tcp_splice_eof);
+
/*
* Handle reading urgent data. BSD has very simple semantics for
* this, no blocking and very strange errors 8)
@@ -1469,7 +1454,7 @@ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags)
char c = tp->urg_data;
if (!(flags & MSG_PEEK))
- tp->urg_data = TCP_URG_READ;
+ WRITE_ONCE(tp->urg_data, TCP_URG_READ);
/* Read urgent data. */
msg->msg_flags |= MSG_OOB;
@@ -1501,8 +1486,6 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
struct sk_buff *skb;
int copied = 0, err = 0;
- /* XXX -- need to support SO_PEEK_OFF */
-
skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
if (err)
@@ -1527,23 +1510,15 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
* calculation of whether or not we must ACK for the sake of
* a window update.
*/
-static void tcp_cleanup_rbuf(struct sock *sk, int copied)
+void __tcp_cleanup_rbuf(struct sock *sk, int copied)
{
struct tcp_sock *tp = tcp_sk(sk);
bool time_to_ack = false;
- struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
-
- WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
- "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
- tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
-
if (inet_csk_ack_scheduled(sk)) {
const struct inet_connection_sock *icsk = inet_csk(sk);
- /* Delayed ACKs frequently hit locked sockets during bulk
- * receive. */
- if (icsk->icsk_ack.blocked ||
- /* Once-per-two-segments ACK was not sent by tcp_input.c */
+
+ if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
/*
* If this read emptied read buffer, we send ACK, if
@@ -1554,7 +1529,7 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
(copied > 0 &&
((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
- !icsk->icsk_ack.pingpong)) &&
+ !inet_csk_in_pingpong_mode(sk))) &&
!atomic_read(&sk->sk_rmem_alloc)))
time_to_ack = true;
}
@@ -1581,11 +1556,36 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
time_to_ack = true;
}
}
- if (time_to_ack)
+ if (time_to_ack) {
+ tcp_mstamp_refresh(tp);
tcp_send_ack(sk);
+ }
}
-static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
+void tcp_cleanup_rbuf(struct sock *sk, int copied)
+{
+ struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
+ "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
+ tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
+ __tcp_cleanup_rbuf(sk, copied);
+}
+
+static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ if (likely(skb->destructor == sock_rfree)) {
+ sock_rfree(skb);
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ return skb_attempt_defer_free(skb);
+ }
+ __kfree_skb(skb);
+}
+
+struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
u32 offset;
@@ -1604,10 +1604,11 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
* splitted a fat GRO packet, while we released socket lock
* in skb_splice_bits()
*/
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
}
return NULL;
}
+EXPORT_SYMBOL(tcp_recv_skb);
/*
* This routine provides an alternative to tcp_recvmsg() for routines
@@ -1620,12 +1621,13 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
* or for 'peeking' the socket using this routine
* (although both would be easy to implement).
*/
-int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
- sk_read_actor_t recv_actor)
+static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor, bool noack,
+ u32 *copied_seq)
{
struct sk_buff *skb;
struct tcp_sock *tp = tcp_sk(sk);
- u32 seq = tp->copied_seq;
+ u32 seq = *copied_seq;
u32 offset;
int copied = 0;
@@ -1638,7 +1640,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
len = skb->len - offset;
/* Stop reading if we hit a patch of urgent data */
- if (tp->urg_data) {
+ if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - seq;
if (urg_offset < len)
len = urg_offset;
@@ -1650,11 +1652,13 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
if (!copied)
copied = used;
break;
- } else if (used <= len) {
- seq += used;
- copied += used;
- offset += used;
}
+ if (WARN_ON_ONCE(used > len))
+ used = len;
+ seq += used;
+ copied += used;
+ offset += used;
+
/* If recv_actor drops the lock (e.g. TCP splice
* receive) the skb pointer might be invalid when
* getting here: tcp_collapse might have deleted it
@@ -1670,16 +1674,19 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
continue;
}
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
++seq;
break;
}
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
if (!desc->count)
break;
- tp->copied_seq = seq;
+ WRITE_ONCE(*copied_seq, seq);
}
- tp->copied_seq = seq;
+ WRITE_ONCE(*copied_seq, seq);
+
+ if (noack)
+ goto out;
tcp_rcv_space_adjust(sk);
@@ -1688,27 +1695,112 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
tcp_recv_skb(sk, seq, &offset);
tcp_cleanup_rbuf(sk, copied);
}
+out:
return copied;
}
+
+int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ return __tcp_read_sock(sk, desc, recv_actor, false,
+ &tcp_sk(sk)->copied_seq);
+}
EXPORT_SYMBOL(tcp_read_sock);
+int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor, bool noack,
+ u32 *copied_seq)
+{
+ return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq);
+}
+
+int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
+{
+ struct sk_buff *skb;
+ int copied = 0;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return -ENOTCONN;
+
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ u8 tcp_flags;
+ int used;
+
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
+ tcp_flags = TCP_SKB_CB(skb)->tcp_flags;
+ used = recv_actor(sk, skb);
+ if (used < 0) {
+ if (!copied)
+ copied = used;
+ break;
+ }
+ copied += used;
+
+ if (tcp_flags & TCPHDR_FIN)
+ break;
+ }
+ return copied;
+}
+EXPORT_IPV6_MOD(tcp_read_skb);
+
+void tcp_read_done(struct sock *sk, size_t len)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 seq = tp->copied_seq;
+ struct sk_buff *skb;
+ size_t left;
+ u32 offset;
+
+ if (sk->sk_state == TCP_LISTEN)
+ return;
+
+ left = len;
+ while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+ int used;
+
+ used = min_t(size_t, skb->len - offset, left);
+ seq += used;
+ left -= used;
+
+ if (skb->len > offset + used)
+ break;
+
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) {
+ tcp_eat_recv_skb(sk, skb);
+ ++seq;
+ break;
+ }
+ tcp_eat_recv_skb(sk, skb);
+ }
+ WRITE_ONCE(tp->copied_seq, seq);
+
+ tcp_rcv_space_adjust(sk);
+
+ /* Clean up data we have read: This will do ACK frames. */
+ if (left != len)
+ tcp_cleanup_rbuf(sk, len - left);
+}
+EXPORT_SYMBOL(tcp_read_done);
+
int tcp_peek_len(struct socket *sock)
{
return tcp_inq(sock->sk);
}
-EXPORT_SYMBOL(tcp_peek_len);
+EXPORT_IPV6_MOD(tcp_peek_len);
/* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */
int tcp_set_rcvlowat(struct sock *sk, int val)
{
- int cap;
+ struct tcp_sock *tp = tcp_sk(sk);
+ int space, cap;
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
cap = sk->sk_rcvbuf >> 1;
else
- cap = sock_net(sk)->ipv4.sysctl_tcp_rmem[2] >> 1;
+ cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
val = min(val, cap);
- sk->sk_rcvlowat = val ? : 1;
+ WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
/* Check if we need to signal EPOLLIN right now */
tcp_data_ready(sk);
@@ -1716,14 +1808,30 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
return 0;
- val <<= 1;
- if (val > sk->sk_rcvbuf) {
- sk->sk_rcvbuf = val;
- tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val);
+ space = tcp_space_from_win(sk, val);
+ if (space > sk->sk_rcvbuf) {
+ WRITE_ONCE(sk->sk_rcvbuf, space);
+
+ if (tp->window_clamp && tp->window_clamp < val)
+ WRITE_ONCE(tp->window_clamp, val);
}
return 0;
}
-EXPORT_SYMBOL(tcp_set_rcvlowat);
+EXPORT_IPV6_MOD(tcp_set_rcvlowat);
+
+void tcp_update_recv_tstamps(struct sk_buff *skb,
+ struct scm_timestamping_internal *tss)
+{
+ if (skb->tstamp)
+ tss->ts[0] = ktime_to_timespec64(skb->tstamp);
+ else
+ tss->ts[0] = (struct timespec64) {0};
+
+ if (skb_hwtstamps(skb)->hwtstamp)
+ tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp);
+ else
+ tss->ts[2] = (struct timespec64) {0};
+}
#ifdef CONFIG_MMU
static const struct vm_operations_struct tcp_vm_ops = {
@@ -1734,27 +1842,353 @@ int tcp_mmap(struct file *file, struct socket *sock,
{
if (vma->vm_flags & (VM_WRITE | VM_EXEC))
return -EPERM;
- vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);
+ vm_flags_clear(vma, VM_MAYWRITE | VM_MAYEXEC);
- /* Instruct vm_insert_page() to not down_read(mmap_sem) */
- vma->vm_flags |= VM_MIXEDMAP;
+ /* Instruct vm_insert_page() to not mmap_read_lock(mm) */
+ vm_flags_set(vma, VM_MIXEDMAP);
vma->vm_ops = &tcp_vm_ops;
return 0;
}
-EXPORT_SYMBOL(tcp_mmap);
+EXPORT_IPV6_MOD(tcp_mmap);
+
+static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
+ u32 *offset_frag)
+{
+ skb_frag_t *frag;
+
+ if (unlikely(offset_skb >= skb->len))
+ return NULL;
+
+ offset_skb -= skb_headlen(skb);
+ if ((int)offset_skb < 0 || skb_has_frag_list(skb))
+ return NULL;
+
+ frag = skb_shinfo(skb)->frags;
+ while (offset_skb) {
+ if (skb_frag_size(frag) > offset_skb) {
+ *offset_frag = offset_skb;
+ return frag;
+ }
+ offset_skb -= skb_frag_size(frag);
+ ++frag;
+ }
+ *offset_frag = 0;
+ return frag;
+}
+
+static bool can_map_frag(const skb_frag_t *frag)
+{
+ struct page *page;
+
+ if (skb_frag_size(frag) != PAGE_SIZE || skb_frag_off(frag))
+ return false;
+
+ page = skb_frag_page(frag);
+
+ if (PageCompound(page) || page->mapping)
+ return false;
+
+ return true;
+}
+
+static int find_next_mappable_frag(const skb_frag_t *frag,
+ int remaining_in_skb)
+{
+ int offset = 0;
+
+ if (likely(can_map_frag(frag)))
+ return 0;
+
+ while (offset < remaining_in_skb && !can_map_frag(frag)) {
+ offset += skb_frag_size(frag);
+ ++frag;
+ }
+ return offset;
+}
+
+static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
+ struct tcp_zerocopy_receive *zc,
+ struct sk_buff *skb, u32 offset)
+{
+ u32 frag_offset, partial_frag_remainder = 0;
+ int mappable_offset;
+ skb_frag_t *frag;
+
+ /* worst case: skip to next skb. try to improve on this case below */
+ zc->recv_skip_hint = skb->len - offset;
+
+ /* Find the frag containing this offset (and how far into that frag) */
+ frag = skb_advance_to_frag(skb, offset, &frag_offset);
+ if (!frag)
+ return;
+
+ if (frag_offset) {
+ struct skb_shared_info *info = skb_shinfo(skb);
+
+ /* We read part of the last frag, must recvmsg() rest of skb. */
+ if (frag == &info->frags[info->nr_frags - 1])
+ return;
+
+ /* Else, we must at least read the remainder in this frag. */
+ partial_frag_remainder = skb_frag_size(frag) - frag_offset;
+ zc->recv_skip_hint -= partial_frag_remainder;
+ ++frag;
+ }
+
+ /* partial_frag_remainder: If part way through a frag, must read rest.
+ * mappable_offset: Bytes till next mappable frag, *not* counting bytes
+ * in partial_frag_remainder.
+ */
+ mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint);
+ zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
+}
+
+static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, struct scm_timestamping_internal *tss,
+ int *cmsg_flags);
+static int receive_fallback_to_copy(struct sock *sk,
+ struct tcp_zerocopy_receive *zc, int inq,
+ struct scm_timestamping_internal *tss)
+{
+ unsigned long copy_address = (unsigned long)zc->copybuf_address;
+ struct msghdr msg = {};
+ int err;
+
+ zc->length = 0;
+ zc->recv_skip_hint = 0;
+
+ if (copy_address != zc->copybuf_address)
+ return -EINVAL;
+
+ err = import_ubuf(ITER_DEST, (void __user *)copy_address, inq,
+ &msg.msg_iter);
+ if (err)
+ return err;
+
+ err = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT,
+ tss, &zc->msg_flags);
+ if (err < 0)
+ return err;
+
+ zc->copybuf_len = err;
+ if (likely(zc->copybuf_len)) {
+ struct sk_buff *skb;
+ u32 offset;
+
+ skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
+ if (skb)
+ tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
+ }
+ return 0;
+}
+
+static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
+ struct sk_buff *skb, u32 copylen,
+ u32 *offset, u32 *seq)
+{
+ unsigned long copy_address = (unsigned long)zc->copybuf_address;
+ struct msghdr msg = {};
+ int err;
+
+ if (copy_address != zc->copybuf_address)
+ return -EINVAL;
+
+ err = import_ubuf(ITER_DEST, (void __user *)copy_address, copylen,
+ &msg.msg_iter);
+ if (err)
+ return err;
+ err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
+ if (err)
+ return err;
+ zc->recv_skip_hint -= copylen;
+ *offset += copylen;
+ *seq += copylen;
+ return (__s32)copylen;
+}
+
+static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc,
+ struct sock *sk,
+ struct sk_buff *skb,
+ u32 *seq,
+ s32 copybuf_len,
+ struct scm_timestamping_internal *tss)
+{
+ u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
+
+ if (!copylen)
+ return 0;
+ /* skb is null if inq < PAGE_SIZE. */
+ if (skb) {
+ offset = *seq - TCP_SKB_CB(skb)->seq;
+ } else {
+ skb = tcp_recv_skb(sk, *seq, &offset);
+ if (TCP_SKB_CB(skb)->has_rxtstamp) {
+ tcp_update_recv_tstamps(skb, tss);
+ zc->msg_flags |= TCP_CMSG_TS;
+ }
+ }
+
+ zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
+ seq);
+ return zc->copybuf_len < 0 ? 0 : copylen;
+}
+
+static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
+ struct page **pending_pages,
+ unsigned long pages_remaining,
+ unsigned long *address,
+ u32 *length,
+ u32 *seq,
+ struct tcp_zerocopy_receive *zc,
+ u32 total_bytes_to_map,
+ int err)
+{
+ /* At least one page did not map. Try zapping if we skipped earlier. */
+ if (err == -EBUSY &&
+ zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
+ u32 maybe_zap_len;
+
+ maybe_zap_len = total_bytes_to_map - /* All bytes to map */
+ *length + /* Mapped or pending */
+ (pages_remaining * PAGE_SIZE); /* Failed map. */
+ zap_page_range_single(vma, *address, maybe_zap_len, NULL);
+ err = 0;
+ }
+
+ if (!err) {
+ unsigned long leftover_pages = pages_remaining;
+ int bytes_mapped;
+
+ /* We called zap_page_range_single, try to reinsert. */
+ err = vm_insert_pages(vma, *address,
+ pending_pages,
+ &pages_remaining);
+ bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
+ *seq += bytes_mapped;
+ *address += bytes_mapped;
+ }
+ if (err) {
+ /* Either we were unable to zap, OR we zapped, retried an
+ * insert, and still had an issue. Either ways, pages_remaining
+ * is the number of pages we were unable to map, and we unroll
+ * some state we speculatively touched before.
+ */
+ const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
+
+ *length -= bytes_not_mapped;
+ zc->recv_skip_hint += bytes_not_mapped;
+ }
+ return err;
+}
+
+static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
+ struct page **pages,
+ unsigned int pages_to_map,
+ unsigned long *address,
+ u32 *length,
+ u32 *seq,
+ struct tcp_zerocopy_receive *zc,
+ u32 total_bytes_to_map)
+{
+ unsigned long pages_remaining = pages_to_map;
+ unsigned int pages_mapped;
+ unsigned int bytes_mapped;
+ int err;
+
+ err = vm_insert_pages(vma, *address, pages, &pages_remaining);
+ pages_mapped = pages_to_map - (unsigned int)pages_remaining;
+ bytes_mapped = PAGE_SIZE * pages_mapped;
+ /* Even if vm_insert_pages fails, it may have partially succeeded in
+ * mapping (some but not all of the pages).
+ */
+ *seq += bytes_mapped;
+ *address += bytes_mapped;
+
+ if (likely(!err))
+ return 0;
+
+ /* Error: maybe zap and retry + rollback state for failed inserts. */
+ return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
+ pages_remaining, address, length, seq, zc, total_bytes_to_map,
+ err);
+}
+
+#define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS)
+static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
+ struct tcp_zerocopy_receive *zc,
+ struct scm_timestamping_internal *tss)
+{
+ unsigned long msg_control_addr;
+ struct msghdr cmsg_dummy;
+
+ msg_control_addr = (unsigned long)zc->msg_control;
+ cmsg_dummy.msg_control_user = (void __user *)msg_control_addr;
+ cmsg_dummy.msg_controllen =
+ (__kernel_size_t)zc->msg_controllen;
+ cmsg_dummy.msg_flags = in_compat_syscall()
+ ? MSG_CMSG_COMPAT : 0;
+ cmsg_dummy.msg_control_is_user = true;
+ zc->msg_flags = 0;
+ if (zc->msg_control == msg_control_addr &&
+ zc->msg_controllen == cmsg_dummy.msg_controllen) {
+ tcp_recv_timestamp(&cmsg_dummy, sk, tss);
+ zc->msg_control = (__u64)
+ ((uintptr_t)cmsg_dummy.msg_control_user);
+ zc->msg_controllen =
+ (__u64)cmsg_dummy.msg_controllen;
+ zc->msg_flags = (__u32)cmsg_dummy.msg_flags;
+ }
+}
+static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm,
+ unsigned long address,
+ bool *mmap_locked)
+{
+ struct vm_area_struct *vma = lock_vma_under_rcu(mm, address);
+
+ if (vma) {
+ if (vma->vm_ops != &tcp_vm_ops) {
+ vma_end_read(vma);
+ return NULL;
+ }
+ *mmap_locked = false;
+ return vma;
+ }
+
+ mmap_read_lock(mm);
+ vma = vma_lookup(mm, address);
+ if (!vma || vma->vm_ops != &tcp_vm_ops) {
+ mmap_read_unlock(mm);
+ return NULL;
+ }
+ *mmap_locked = true;
+ return vma;
+}
+
+#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
static int tcp_zerocopy_receive(struct sock *sk,
- struct tcp_zerocopy_receive *zc)
+ struct tcp_zerocopy_receive *zc,
+ struct scm_timestamping_internal *tss)
{
+ u32 length = 0, offset, vma_len, avail_len, copylen = 0;
unsigned long address = (unsigned long)zc->address;
+ struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
+ s32 copybuf_len = zc->copybuf_len;
+ struct tcp_sock *tp = tcp_sk(sk);
const skb_frag_t *frags = NULL;
- u32 length = 0, seq, offset;
+ unsigned int pages_to_map = 0;
struct vm_area_struct *vma;
struct sk_buff *skb = NULL;
- struct tcp_sock *tp;
+ u32 seq = tp->copied_seq;
+ u32 total_bytes_to_map;
+ int inq = tcp_inq(sk);
+ bool mmap_locked;
int ret;
+ zc->copybuf_len = 0;
+ zc->msg_flags = 0;
+
if (address & (PAGE_SIZE - 1) || address != zc->address)
return -EINVAL;
@@ -1763,64 +2197,115 @@ static int tcp_zerocopy_receive(struct sock *sk,
sock_rps_record_flow(sk);
- down_read(&current->mm->mmap_sem);
+ if (inq && inq <= copybuf_len)
+ return receive_fallback_to_copy(sk, zc, inq, tss);
- ret = -EINVAL;
- vma = find_vma(current->mm, address);
- if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
- goto out;
- zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
-
- tp = tcp_sk(sk);
- seq = tp->copied_seq;
- zc->length = min_t(u32, zc->length, tcp_inq(sk));
- zc->length &= ~(PAGE_SIZE - 1);
+ if (inq < PAGE_SIZE) {
+ zc->length = 0;
+ zc->recv_skip_hint = inq;
+ if (!inq && sock_flag(sk, SOCK_DONE))
+ return -EIO;
+ return 0;
+ }
- zap_page_range(vma, address, zc->length);
+ vma = find_tcp_vma(current->mm, address, &mmap_locked);
+ if (!vma)
+ return -EINVAL;
- zc->recv_skip_hint = 0;
+ vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
+ avail_len = min_t(u32, vma_len, inq);
+ total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
+ if (total_bytes_to_map) {
+ if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
+ zap_page_range_single(vma, address, total_bytes_to_map,
+ NULL);
+ zc->length = total_bytes_to_map;
+ zc->recv_skip_hint = 0;
+ } else {
+ zc->length = avail_len;
+ zc->recv_skip_hint = avail_len;
+ }
ret = 0;
while (length + PAGE_SIZE <= zc->length) {
+ int mappable_offset;
+ struct page *page;
+
if (zc->recv_skip_hint < PAGE_SIZE) {
+ u32 offset_frag;
+
if (skb) {
+ if (zc->recv_skip_hint > 0)
+ break;
skb = skb->next;
offset = seq - TCP_SKB_CB(skb)->seq;
} else {
skb = tcp_recv_skb(sk, seq, &offset);
}
- zc->recv_skip_hint = skb->len - offset;
- offset -= skb_headlen(skb);
- if ((int)offset < 0 || skb_has_frag_list(skb))
+ if (!skb_frags_readable(skb))
break;
- frags = skb_shinfo(skb)->frags;
- while (offset) {
- if (frags->size > offset)
- goto out;
- offset -= frags->size;
- frags++;
+
+ if (TCP_SKB_CB(skb)->has_rxtstamp) {
+ tcp_update_recv_tstamps(skb, tss);
+ zc->msg_flags |= TCP_CMSG_TS;
}
+ zc->recv_skip_hint = skb->len - offset;
+ frags = skb_advance_to_frag(skb, offset, &offset_frag);
+ if (!frags || offset_frag)
+ break;
}
- if (frags->size != PAGE_SIZE || frags->page_offset)
+
+ mappable_offset = find_next_mappable_frag(frags,
+ zc->recv_skip_hint);
+ if (mappable_offset) {
+ zc->recv_skip_hint = mappable_offset;
break;
- ret = vm_insert_page(vma, address + length,
- skb_frag_page(frags));
- if (ret)
+ }
+ page = skb_frag_page(frags);
+ if (WARN_ON_ONCE(!page))
break;
+
+ prefetchw(page);
+ pages[pages_to_map++] = page;
length += PAGE_SIZE;
- seq += PAGE_SIZE;
zc->recv_skip_hint -= PAGE_SIZE;
frags++;
+ if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
+ zc->recv_skip_hint < PAGE_SIZE) {
+ /* Either full batch, or we're about to go to next skb
+ * (and we cannot unroll failed ops across skbs).
+ */
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages,
+ pages_to_map,
+ &address, &length,
+ &seq, zc,
+ total_bytes_to_map);
+ if (ret)
+ goto out;
+ pages_to_map = 0;
+ }
+ }
+ if (pages_to_map) {
+ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
+ &address, &length, &seq,
+ zc, total_bytes_to_map);
}
out:
- up_read(&current->mm->mmap_sem);
- if (length) {
- tp->copied_seq = seq;
+ if (mmap_locked)
+ mmap_read_unlock(current->mm);
+ else
+ vma_end_read(vma);
+ /* Try to copy straggler data. */
+ if (!ret)
+ copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);
+
+ if (length + copylen) {
+ WRITE_ONCE(tp->copied_seq, seq);
tcp_rcv_space_adjust(sk);
/* Clean up data we have read: This will do ACK frames. */
tcp_recv_skb(sk, seq, &offset);
- tcp_cleanup_rbuf(sk, length);
+ tcp_cleanup_rbuf(sk, length + copylen);
ret = 0;
if (length == zc->length)
zc->recv_skip_hint = 0;
@@ -1833,58 +2318,74 @@ out:
}
#endif
-static void tcp_update_recv_tstamps(struct sk_buff *skb,
- struct scm_timestamping *tss)
-{
- if (skb->tstamp)
- tss->ts[0] = ktime_to_timespec(skb->tstamp);
- else
- tss->ts[0] = (struct timespec) {0};
-
- if (skb_hwtstamps(skb)->hwtstamp)
- tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp);
- else
- tss->ts[2] = (struct timespec) {0};
-}
-
/* Similar to __sock_recv_timestamp, but does not require an skb */
-static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
- struct scm_timestamping *tss)
+void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+ struct scm_timestamping_internal *tss)
{
- struct timeval tv;
+ int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
+ u32 tsflags = READ_ONCE(sk->sk_tsflags);
bool has_timestamping = false;
if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) {
if (sock_flag(sk, SOCK_RCVTSTAMP)) {
if (sock_flag(sk, SOCK_RCVTSTAMPNS)) {
- put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPNS,
- sizeof(tss->ts[0]), &tss->ts[0]);
+ if (new_tstamp) {
+ struct __kernel_timespec kts = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_nsec = tss->ts[0].tv_nsec,
+ };
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW,
+ sizeof(kts), &kts);
+ } else {
+ struct __kernel_old_timespec ts_old = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_nsec = tss->ts[0].tv_nsec,
+ };
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD,
+ sizeof(ts_old), &ts_old);
+ }
} else {
- tv.tv_sec = tss->ts[0].tv_sec;
- tv.tv_usec = tss->ts[0].tv_nsec / 1000;
-
- put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
- sizeof(tv), &tv);
+ if (new_tstamp) {
+ struct __kernel_sock_timeval stv = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_usec = tss->ts[0].tv_nsec / 1000,
+ };
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW,
+ sizeof(stv), &stv);
+ } else {
+ struct __kernel_old_timeval tv = {
+ .tv_sec = tss->ts[0].tv_sec,
+ .tv_usec = tss->ts[0].tv_nsec / 1000,
+ };
+ put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD,
+ sizeof(tv), &tv);
+ }
}
}
- if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
+ if (tsflags & SOF_TIMESTAMPING_SOFTWARE &&
+ (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE ||
+ !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))
has_timestamping = true;
else
- tss->ts[0] = (struct timespec) {0};
+ tss->ts[0] = (struct timespec64) {0};
}
if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
- if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
+ if (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE &&
+ (tsflags & SOF_TIMESTAMPING_RX_HARDWARE ||
+ !(tsflags & SOF_TIMESTAMPING_OPT_RX_FILTER)))
has_timestamping = true;
else
- tss->ts[2] = (struct timespec) {0};
+ tss->ts[2] = (struct timespec64) {0};
}
if (has_timestamping) {
- tss->ts[1] = (struct timespec) {0};
- put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMPING,
- sizeof(*tss), tss);
+ tss->ts[1] = (struct timespec64) {0};
+ if (sock_flag(sk, SOCK_TSTAMP_NEW))
+ put_cmsg_scm_timestamping64(msg, tss);
+ else
+ put_cmsg_scm_timestamping(msg, tss);
}
}
@@ -1901,9 +2402,227 @@ static int tcp_inq_hint(struct sock *sk)
inq = tp->rcv_nxt - tp->copied_seq;
release_sock(sk);
}
+ /* After receiving a FIN, tell the user-space to continue reading
+ * by returning a non-zero inq.
+ */
+ if (inq == 0 && sock_flag(sk, SOCK_DONE))
+ inq = 1;
return inq;
}
+/* batch __xa_alloc() calls and reduce xa_lock()/xa_unlock() overhead. */
+struct tcp_xa_pool {
+ u8 max; /* max <= MAX_SKB_FRAGS */
+ u8 idx; /* idx <= max */
+ __u32 tokens[MAX_SKB_FRAGS];
+ netmem_ref netmems[MAX_SKB_FRAGS];
+};
+
+static void tcp_xa_pool_commit_locked(struct sock *sk, struct tcp_xa_pool *p)
+{
+ int i;
+
+ /* Commit part that has been copied to user space. */
+ for (i = 0; i < p->idx; i++)
+ __xa_cmpxchg(&sk->sk_user_frags, p->tokens[i], XA_ZERO_ENTRY,
+ (__force void *)p->netmems[i], GFP_KERNEL);
+ /* Rollback what has been pre-allocated and is no longer needed. */
+ for (; i < p->max; i++)
+ __xa_erase(&sk->sk_user_frags, p->tokens[i]);
+
+ p->max = 0;
+ p->idx = 0;
+}
+
+static void tcp_xa_pool_commit(struct sock *sk, struct tcp_xa_pool *p)
+{
+ if (!p->max)
+ return;
+
+ xa_lock_bh(&sk->sk_user_frags);
+
+ tcp_xa_pool_commit_locked(sk, p);
+
+ xa_unlock_bh(&sk->sk_user_frags);
+}
+
+static int tcp_xa_pool_refill(struct sock *sk, struct tcp_xa_pool *p,
+ unsigned int max_frags)
+{
+ int err, k;
+
+ if (p->idx < p->max)
+ return 0;
+
+ xa_lock_bh(&sk->sk_user_frags);
+
+ tcp_xa_pool_commit_locked(sk, p);
+
+ for (k = 0; k < max_frags; k++) {
+ err = __xa_alloc(&sk->sk_user_frags, &p->tokens[k],
+ XA_ZERO_ENTRY, xa_limit_31b, GFP_KERNEL);
+ if (err)
+ break;
+ }
+
+ xa_unlock_bh(&sk->sk_user_frags);
+
+ p->max = k;
+ p->idx = 0;
+ return k ? 0 : err;
+}
+
+/* On error, returns the -errno. On success, returns number of bytes sent to the
+ * user. May not consume all of @remaining_len.
+ */
+static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb,
+ unsigned int offset, struct msghdr *msg,
+ int remaining_len)
+{
+ struct dmabuf_cmsg dmabuf_cmsg = { 0 };
+ struct tcp_xa_pool tcp_xa_pool;
+ unsigned int start;
+ int i, copy, n;
+ int sent = 0;
+ int err = 0;
+
+ tcp_xa_pool.max = 0;
+ tcp_xa_pool.idx = 0;
+ do {
+ start = skb_headlen(skb);
+
+ if (skb_frags_readable(skb)) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ /* Copy header. */
+ copy = start - offset;
+ if (copy > 0) {
+ copy = min(copy, remaining_len);
+
+ n = copy_to_iter(skb->data + offset, copy,
+ &msg->msg_iter);
+ if (n != copy) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ offset += copy;
+ remaining_len -= copy;
+
+ /* First a dmabuf_cmsg for # bytes copied to user
+ * buffer.
+ */
+ memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg));
+ dmabuf_cmsg.frag_size = copy;
+ err = put_cmsg_notrunc(msg, SOL_SOCKET,
+ SO_DEVMEM_LINEAR,
+ sizeof(dmabuf_cmsg),
+ &dmabuf_cmsg);
+ if (err)
+ goto out;
+
+ sent += copy;
+
+ if (remaining_len == 0)
+ goto out;
+ }
+
+ /* after that, send information of dmabuf pages through a
+ * sequence of cmsg
+ */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ struct net_iov *niov;
+ u64 frag_offset;
+ int end;
+
+ /* !skb_frags_readable() should indicate that ALL the
+ * frags in this skb are dmabuf net_iovs. We're checking
+ * for that flag above, but also check individual frags
+ * here. If the tcp stack is not setting
+ * skb_frags_readable() correctly, we still don't want
+ * to crash here.
+ */
+ if (!skb_frag_net_iov(frag)) {
+ net_err_ratelimited("Found non-dmabuf skb with net_iov");
+ err = -ENODEV;
+ goto out;
+ }
+
+ niov = skb_frag_net_iov(frag);
+ if (!net_is_devmem_iov(niov)) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ end = start + skb_frag_size(frag);
+ copy = end - offset;
+
+ if (copy > 0) {
+ copy = min(copy, remaining_len);
+
+ frag_offset = net_iov_virtual_addr(niov) +
+ skb_frag_off(frag) + offset -
+ start;
+ dmabuf_cmsg.frag_offset = frag_offset;
+ dmabuf_cmsg.frag_size = copy;
+ err = tcp_xa_pool_refill(sk, &tcp_xa_pool,
+ skb_shinfo(skb)->nr_frags - i);
+ if (err)
+ goto out;
+
+ /* Will perform the exchange later */
+ dmabuf_cmsg.frag_token = tcp_xa_pool.tokens[tcp_xa_pool.idx];
+ dmabuf_cmsg.dmabuf_id = net_devmem_iov_binding_id(niov);
+
+ offset += copy;
+ remaining_len -= copy;
+
+ err = put_cmsg_notrunc(msg, SOL_SOCKET,
+ SO_DEVMEM_DMABUF,
+ sizeof(dmabuf_cmsg),
+ &dmabuf_cmsg);
+ if (err)
+ goto out;
+
+ atomic_long_inc(&niov->desc.pp_ref_count);
+ tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag);
+
+ sent += copy;
+
+ if (remaining_len == 0)
+ goto out;
+ }
+ start = end;
+ }
+
+ tcp_xa_pool_commit(sk, &tcp_xa_pool);
+ if (!remaining_len)
+ goto out;
+
+ /* if remaining_len is not satisfied yet, we need to go to the
+ * next frag in the frag_list to satisfy remaining_len.
+ */
+ skb = skb_shinfo(skb)->frag_list ?: skb->next;
+
+ offset = offset - start;
+ } while (skb);
+
+ if (remaining_len) {
+ err = -EFAULT;
+ goto out;
+ }
+
+out:
+ tcp_xa_pool_commit(sk, &tcp_xa_pool);
+ if (!sent)
+ sent = err;
+
+ return sent;
+}
+
/*
* This routine copies from a sock struct into the user buffer.
*
@@ -1912,38 +2631,32 @@ static int tcp_inq_hint(struct sock *sk)
* Probably, code can be easily improved even more.
*/
-int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
- int flags, int *addr_len)
+static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, struct scm_timestamping_internal *tss,
+ int *cmsg_flags)
{
struct tcp_sock *tp = tcp_sk(sk);
+ int last_copied_dmabuf = -1; /* uninitialized */
int copied = 0;
u32 peek_seq;
u32 *seq;
unsigned long used;
- int err, inq;
+ int err;
int target; /* Read at least this many bytes */
long timeo;
struct sk_buff *skb, *last;
+ u32 peek_offset = 0;
u32 urg_hole = 0;
- struct scm_timestamping tss;
- bool has_tss = false;
- bool has_cmsg;
-
- if (unlikely(flags & MSG_ERRQUEUE))
- return inet_recv_error(sk, msg, len, addr_len);
-
- if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue) &&
- (sk->sk_state == TCP_ESTABLISHED))
- sk_busy_loop(sk, nonblock);
-
- lock_sock(sk);
err = -ENOTCONN;
if (sk->sk_state == TCP_LISTEN)
goto out;
- has_cmsg = tp->recvmsg_inq;
- timeo = sock_rcvtimeo(sk, nonblock);
+ if (tp->recvmsg_inq) {
+ *cmsg_flags = TCP_CMSG_INQ;
+ msg->msg_get_inq = 1;
+ }
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
/* Urgent data needs to be handled specially. */
if (flags & MSG_OOB)
@@ -1966,7 +2679,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
seq = &tp->copied_seq;
if (flags & MSG_PEEK) {
- peek_seq = tp->copied_seq;
+ peek_offset = max(sk_peek_offset(sk, flags), 0);
+ peek_seq = tp->copied_seq + peek_offset;
seq = &peek_seq;
}
@@ -1976,7 +2690,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
u32 offset;
/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
- if (tp->urg_data && tp->urg_seq == *seq) {
+ if (unlikely(tp->urg_data) && tp->urg_seq == *seq) {
if (copied)
break;
if (signal_pending(current)) {
@@ -2015,14 +2729,14 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
/* Well, if we have backlog, try to process it now yet. */
- if (copied >= target && !sk->sk_backlog.tail)
+ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
break;
if (copied) {
- if (sk->sk_err ||
+ if (!timeo ||
+ sk->sk_err ||
sk->sk_state == TCP_CLOSE ||
(sk->sk_shutdown & RCV_SHUTDOWN) ||
- !timeo ||
signal_pending(current))
break;
} else {
@@ -2056,38 +2770,40 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
}
}
- tcp_cleanup_rbuf(sk, copied);
-
if (copied >= target) {
/* Do not sleep, just process backlog. */
- release_sock(sk);
- lock_sock(sk);
+ __sk_flush_backlog(sk);
} else {
- sk_wait_data(sk, &timeo, last);
+ tcp_cleanup_rbuf(sk, copied);
+ err = sk_wait_data(sk, &timeo, last);
+ if (err < 0) {
+ err = copied ? : err;
+ goto out;
+ }
}
if ((flags & MSG_PEEK) &&
- (peek_seq - copied - urg_hole != tp->copied_seq)) {
+ (peek_seq - peek_offset - copied - urg_hole != tp->copied_seq)) {
net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n",
current->comm,
task_pid_nr(current));
- peek_seq = tp->copied_seq;
+ peek_seq = tp->copied_seq + peek_offset;
}
continue;
- found_ok_skb:
+found_ok_skb:
/* Ok so how much can we use? */
used = skb->len - offset;
if (len < used)
used = len;
/* Do we have urgent data here? */
- if (tp->urg_data) {
+ if (unlikely(tp->urg_data)) {
u32 urg_offset = tp->urg_seq - *seq;
if (urg_offset < used) {
if (!urg_offset) {
if (!sock_flag(sk, SOCK_URGINLINE)) {
- ++*seq;
+ WRITE_ONCE(*seq, *seq + 1);
urg_hole++;
offset++;
used--;
@@ -2100,45 +2816,78 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
}
if (!(flags & MSG_TRUNC)) {
- err = skb_copy_datagram_msg(skb, offset, msg, used);
- if (err) {
- /* Exception. Bailout! */
- if (!copied)
- copied = -EFAULT;
+ if (last_copied_dmabuf != -1 &&
+ last_copied_dmabuf != !skb_frags_readable(skb))
break;
+
+ if (skb_frags_readable(skb)) {
+ err = skb_copy_datagram_msg(skb, offset, msg,
+ used);
+ if (err) {
+ /* Exception. Bailout! */
+ if (!copied)
+ copied = -EFAULT;
+ break;
+ }
+ } else {
+ if (!(flags & MSG_SOCK_DEVMEM)) {
+ /* dmabuf skbs can only be received
+ * with the MSG_SOCK_DEVMEM flag.
+ */
+ if (!copied)
+ copied = -EFAULT;
+
+ break;
+ }
+
+ err = tcp_recvmsg_dmabuf(sk, skb, offset, msg,
+ used);
+ if (err < 0) {
+ if (!copied)
+ copied = err;
+
+ break;
+ }
+ used = err;
}
}
- *seq += used;
+ last_copied_dmabuf = !skb_frags_readable(skb);
+
+ WRITE_ONCE(*seq, *seq + used);
copied += used;
len -= used;
-
+ if (flags & MSG_PEEK)
+ sk_peek_offset_fwd(sk, used);
+ else
+ sk_peek_offset_bwd(sk, used);
tcp_rcv_space_adjust(sk);
skip_copy:
- if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
- tp->urg_data = 0;
+ if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) {
+ WRITE_ONCE(tp->urg_data, 0);
tcp_fast_path_check(sk);
}
- if (used + offset < skb->len)
- continue;
if (TCP_SKB_CB(skb)->has_rxtstamp) {
- tcp_update_recv_tstamps(skb, &tss);
- has_tss = true;
- has_cmsg = true;
+ tcp_update_recv_tstamps(skb, tss);
+ *cmsg_flags |= TCP_CMSG_TS;
}
+
+ if (used + offset < skb->len)
+ continue;
+
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
continue;
- found_fin_ok:
+found_fin_ok:
/* Process the FIN. */
- ++*seq;
+ WRITE_ONCE(*seq, *seq + 1);
if (!(flags & MSG_PEEK))
- sk_eat_skb(sk, skb);
+ tcp_eat_recv_skb(sk, skb);
break;
} while (len > 0);
@@ -2148,22 +2897,9 @@ skip_copy:
/* Clean up data we have read: This will do ACK frames. */
tcp_cleanup_rbuf(sk, copied);
-
- release_sock(sk);
-
- if (has_cmsg) {
- if (has_tss)
- tcp_recv_timestamp(msg, sk, &tss);
- if (tp->recvmsg_inq) {
- inq = tcp_inq_hint(sk);
- put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
- }
- }
-
return copied;
out:
- release_sock(sk);
return err;
recv_urg:
@@ -2174,7 +2910,38 @@ recv_sndq:
err = tcp_peek_sndq(sk, msg, len);
goto out;
}
-EXPORT_SYMBOL(tcp_recvmsg);
+
+int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ int *addr_len)
+{
+ int cmsg_flags = 0, ret;
+ struct scm_timestamping_internal tss;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ if (sk_can_busy_loop(sk) &&
+ skb_queue_empty_lockless(&sk->sk_receive_queue) &&
+ sk->sk_state == TCP_ESTABLISHED)
+ sk_busy_loop(sk, flags & MSG_DONTWAIT);
+
+ lock_sock(sk);
+ ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags);
+ release_sock(sk);
+
+ if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) {
+ if (cmsg_flags & TCP_CMSG_TS)
+ tcp_recv_timestamp(msg, sk, &tss);
+ if (msg->msg_get_inq) {
+ msg->msg_inq = tcp_inq_hint(sk);
+ if (cmsg_flags & TCP_CMSG_INQ)
+ put_cmsg(msg, SOL_TCP, TCP_CM_INQ,
+ sizeof(msg->msg_inq), &msg->msg_inq);
+ }
+ }
+ return ret;
+}
+EXPORT_IPV6_MOD(tcp_recvmsg);
void tcp_set_state(struct sock *sk, int state)
{
@@ -2199,8 +2966,20 @@ void tcp_set_state(struct sock *sk, int state)
BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN);
BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING);
BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV);
+ BUILD_BUG_ON((int)BPF_TCP_BOUND_INACTIVE != (int)TCP_BOUND_INACTIVE);
BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES);
+ /* bpf uapi header bpf.h defines an anonymous enum with values
+ * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux
+ * is able to emit this enum in DWARF due to the above BUILD_BUG_ON.
+ * But clang built vmlinux does not have this enum in DWARF
+ * since clang removes the above code before generating IR/debuginfo.
+ * Let us explicitly emit the type debuginfo to ensure the
+ * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF
+ * regardless of which compiler is used.
+ */
+ BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED);
+
if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG))
tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state);
@@ -2209,6 +2988,10 @@ void tcp_set_state(struct sock *sk, int state)
if (oldstate != TCP_ESTABLISHED)
TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
break;
+ case TCP_CLOSE_WAIT:
+ if (oldstate == TCP_SYN_RECV)
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
+ break;
case TCP_CLOSE:
if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED)
@@ -2218,9 +3001,9 @@ void tcp_set_state(struct sock *sk, int state)
if (inet_csk(sk)->icsk_bind_hash &&
!(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
inet_put_port(sk);
- /* fall through */
+ fallthrough;
default:
- if (oldstate == TCP_ESTABLISHED)
+ if (oldstate == TCP_ESTABLISHED || oldstate == TCP_CLOSE_WAIT)
TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB);
}
@@ -2228,10 +3011,6 @@ void tcp_set_state(struct sock *sk, int state)
* socket sitting in hash tables.
*/
inet_sk_state_store(sk, state);
-
-#ifdef STATE_TRACE
- SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
-#endif
}
EXPORT_SYMBOL_GPL(tcp_set_state);
@@ -2286,19 +3065,53 @@ void tcp_shutdown(struct sock *sk, int how)
/* If we've already sent a FIN, or it's a closed state, skip this. */
if ((1 << sk->sk_state) &
(TCPF_ESTABLISHED | TCPF_SYN_SENT |
- TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
+ TCPF_CLOSE_WAIT)) {
/* Clear out any half completed packets. FIN if needed. */
if (tcp_close_state(sk))
tcp_send_fin(sk);
}
}
-EXPORT_SYMBOL(tcp_shutdown);
+EXPORT_IPV6_MOD(tcp_shutdown);
+
+int tcp_orphan_count_sum(void)
+{
+ int i, total = 0;
+
+ for_each_possible_cpu(i)
+ total += per_cpu(tcp_orphan_count, i);
+
+ return max(total, 0);
+}
-bool tcp_check_oom(struct sock *sk, int shift)
+static int tcp_orphan_cache;
+static struct timer_list tcp_orphan_timer;
+#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)
+
+static void tcp_orphan_update(struct timer_list *unused)
+{
+ WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
+ mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
+}
+
+static bool tcp_too_many_orphans(int shift)
+{
+ return READ_ONCE(tcp_orphan_cache) << shift >
+ READ_ONCE(sysctl_tcp_max_orphans);
+}
+
+static bool tcp_out_of_memory(const struct sock *sk)
+{
+ if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
+ sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2))
+ return true;
+ return false;
+}
+
+bool tcp_check_oom(const struct sock *sk, int shift)
{
bool too_many_orphans, out_of_socket_memory;
- too_many_orphans = tcp_too_many_orphans(sk, shift);
+ too_many_orphans = tcp_too_many_orphans(shift);
out_of_socket_memory = tcp_out_of_memory(sk);
if (too_many_orphans)
@@ -2308,14 +3121,13 @@ bool tcp_check_oom(struct sock *sk, int shift)
return too_many_orphans || out_of_socket_memory;
}
-void tcp_close(struct sock *sk, long timeout)
+void __tcp_close(struct sock *sk, long timeout)
{
+ bool data_was_unread = false;
struct sk_buff *skb;
- int data_was_unread = 0;
int state;
- lock_sock(sk);
- sk->sk_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
if (sk->sk_state == TCP_LISTEN) {
tcp_set_state(sk, TCP_CLOSE);
@@ -2330,17 +3142,16 @@ void tcp_close(struct sock *sk, long timeout)
* descriptor close, not protocol-sourced closes, because the
* reader process may not have drained the data yet!
*/
- while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
- u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
- len--;
- data_was_unread += len;
- __kfree_skb(skb);
+ end_seq--;
+ if (after(end_seq, tcp_sk(sk)->copied_seq))
+ data_was_unread = true;
+ tcp_eat_recv_skb(sk, skb);
}
- sk_mem_reclaim(sk);
-
/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
if (sk->sk_state == TCP_CLOSE)
goto adjudge_to_death;
@@ -2358,7 +3169,8 @@ void tcp_close(struct sock *sk, long timeout)
/* Unread data was tossed, zap the connection. */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE);
tcp_set_state(sk, TCP_CLOSE);
- tcp_send_active_reset(sk, sk->sk_allocation);
+ tcp_send_active_reset(sk, sk->sk_allocation,
+ SK_RST_REASON_TCP_ABORT_ON_CLOSE);
} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
/* Check zero linger _after_ checking for unread data. */
sk->sk_prot->disconnect(sk, 0);
@@ -2372,7 +3184,7 @@ void tcp_close(struct sock *sk, long timeout)
* machine. State transitions:
*
* TCP_ESTABLISHED -> TCP_FIN_WAIT1
- * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
+ * TCP_SYN_RECV -> TCP_FIN_WAIT1 (it is difficult)
* TCP_CLOSE_WAIT -> TCP_LAST_ACK
*
* are legal only when FIN has been sent (i.e. in window),
@@ -2403,18 +3215,12 @@ adjudge_to_death:
sock_hold(sk);
sock_orphan(sk);
- /* It is the last release_sock in its life. It will remove backlog. */
- release_sock(sk);
-
-
- /* Now socket is owned by kernel and we acquire BH lock
- * to finish close. No need to check for user refs.
- */
local_bh_disable();
bh_lock_sock(sk);
- WARN_ON(sock_owned_by_user(sk));
+ /* remove backlog if any, without releasing ownership. */
+ __release_sock(sk);
- percpu_counter_inc(sk->sk_prot->orphan_count);
+ tcp_orphan_count_inc();
/* Have we already been destroyed by a softirq or backlog? */
if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
@@ -2436,16 +3242,17 @@ adjudge_to_death:
if (sk->sk_state == TCP_FIN_WAIT2) {
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->linger2 < 0) {
+ if (READ_ONCE(tp->linger2) < 0) {
tcp_set_state(sk, TCP_CLOSE);
- tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_TCP_ABORT_ON_LINGER);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONLINGER);
} else {
const int tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
- inet_csk_reset_keepalive_timer(sk,
+ tcp_reset_keepalive_timer(sk,
tmo - TCP_TIMEWAIT_LEN);
} else {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
@@ -2454,10 +3261,10 @@ adjudge_to_death:
}
}
if (sk->sk_state != TCP_CLOSE) {
- sk_mem_reclaim(sk);
if (tcp_check_oom(sk, 0)) {
tcp_set_state(sk, TCP_CLOSE);
- tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_TCP_ABORT_ON_MEMORY);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
} else if (!check_net(sock_net(sk))) {
@@ -2467,7 +3274,10 @@ adjudge_to_death:
}
if (sk->sk_state == TCP_CLOSE) {
- struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+ struct request_sock *req;
+
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk,
+ lockdep_sock_is_held(sk));
/* We could get here with a non-NULL req if the socket is
* aborted (e.g., closed with unread data) before 3WHS
* finishes.
@@ -2481,6 +3291,15 @@ adjudge_to_death:
out:
bh_unlock_sock(sk);
local_bh_enable();
+}
+
+void tcp_close(struct sock *sk, long timeout)
+{
+ lock_sock(sk);
+ __tcp_close(sk, timeout);
+ release_sock(sk);
+ if (!sk->sk_net_refcnt)
+ inet_csk_clear_xmit_timers_sync(sk);
sock_put(sk);
}
EXPORT_SYMBOL(tcp_close);
@@ -2498,6 +3317,7 @@ static void tcp_rtx_queue_purge(struct sock *sk)
{
struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
+ tcp_sk(sk)->highest_sack = NULL;
while (p) {
struct sk_buff *skb = rb_to_skb(p);
@@ -2506,7 +3326,7 @@ static void tcp_rtx_queue_purge(struct sock *sk)
* list_del(&skb->tcp_tsorted_anchor)
*/
tcp_rtx_queue_unlink(skb, sk);
- sk_wmem_free_skb(sk, skb);
+ tcp_wmem_free_skb(sk, skb);
}
}
@@ -2517,13 +3337,13 @@ void tcp_write_queue_purge(struct sock *sk)
tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
tcp_skb_tsorted_anchor_cleanup(skb);
- sk_wmem_free_skb(sk, skb);
+ tcp_wmem_free_skb(sk, skb);
}
tcp_rtx_queue_purge(sk);
INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
- sk_mem_reclaim(sk);
tcp_clear_all_retrans_hints(tcp_sk(sk));
tcp_sk(sk)->packets_out = 0;
+ inet_csk(sk)->icsk_backoff = 0;
}
int tcp_disconnect(struct sock *sk, int flags)
@@ -2532,6 +3352,8 @@ int tcp_disconnect(struct sock *sk, int flags)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
int old_state = sk->sk_state;
+ struct request_sock *req;
+ u32 seq;
if (old_state != TCP_CLOSE)
tcp_set_state(sk, TCP_CLOSE);
@@ -2540,48 +3362,72 @@ int tcp_disconnect(struct sock *sk, int flags)
if (old_state == TCP_LISTEN) {
inet_csk_listen_stop(sk);
} else if (unlikely(tp->repair)) {
- sk->sk_err = ECONNABORTED;
- } else if (tcp_need_reset(old_state) ||
- (tp->snd_nxt != tp->write_seq &&
- (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
+ WRITE_ONCE(sk->sk_err, ECONNABORTED);
+ } else if (tcp_need_reset(old_state)) {
+ tcp_send_active_reset(sk, gfp_any(), SK_RST_REASON_TCP_STATE);
+ WRITE_ONCE(sk->sk_err, ECONNRESET);
+ } else if (tp->snd_nxt != tp->write_seq &&
+ (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
/* The last check adjusts for discrepancy of Linux wrt. RFC
* states
*/
- tcp_send_active_reset(sk, gfp_any());
- sk->sk_err = ECONNRESET;
+ tcp_send_active_reset(sk, gfp_any(),
+ SK_RST_REASON_TCP_DISCONNECT_WITH_DATA);
+ WRITE_ONCE(sk->sk_err, ECONNRESET);
} else if (old_state == TCP_SYN_SENT)
- sk->sk_err = ECONNRESET;
+ WRITE_ONCE(sk->sk_err, ECONNRESET);
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
- tp->copied_seq = tp->rcv_nxt;
- tp->urg_data = 0;
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
+ WRITE_ONCE(tp->urg_data, 0);
+ sk_set_peek_off(sk, -1);
tcp_write_queue_purge(sk);
tcp_fastopen_active_disable_ofo_check(sk);
skb_rbtree_purge(&tp->out_of_order_queue);
inet->inet_dport = 0;
- if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
- inet_reset_saddr(sk);
+ inet_bhash2_reset_saddr(sk);
- sk->sk_shutdown = 0;
+ WRITE_ONCE(sk->sk_shutdown, 0);
sock_reset_flag(sk, SOCK_DONE);
tp->srtt_us = 0;
+ tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
tp->rcv_rtt_last_tsecr = 0;
- tp->write_seq += tp->max_window + 2;
- if (tp->write_seq == 0)
- tp->write_seq = 1;
+
+ seq = tp->write_seq + tp->max_window + 2;
+ if (!seq)
+ seq = 1;
+ WRITE_ONCE(tp->write_seq, seq);
+
icsk->icsk_backoff = 0;
- tp->snd_cwnd = 2;
- icsk->icsk_probes_out = 0;
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
+ icsk->icsk_probes_tstamp = 0;
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN);
+ WRITE_ONCE(icsk->icsk_delack_max, TCP_DELACK_MAX);
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
+ tcp_snd_cwnd_set(tp, TCP_INIT_CWND);
tp->snd_cwnd_cnt = 0;
+ tp->is_cwnd_limited = 0;
+ tp->max_packets_out = 0;
tp->window_clamp = 0;
+ tp->delivered = 0;
tp->delivered_ce = 0;
+ tp->accecn_fail_mode = 0;
+ tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
+ tcp_accecn_init_counters(tp);
+ tp->prev_ecnfield = 0;
+ tp->accecn_opt_tstamp = 0;
+ if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
+ icsk->icsk_ca_ops->release(sk);
+ memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
+ icsk->icsk_ca_initialized = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tp->is_sack_reneg = 0;
tcp_clear_retrans(tp);
+ tp->total_retrans = 0;
inet_csk_delack_init(sk);
/* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
* issue in __tcp_select_window()
@@ -2589,18 +3435,51 @@ int tcp_disconnect(struct sock *sk, int flags)
icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
- dst_release(sk->sk_rx_dst);
- sk->sk_rx_dst = NULL;
+ dst_release(unrcu_pointer(xchg(&sk->sk_rx_dst, NULL)));
tcp_saved_syn_free(tp);
tp->compressed_ack = 0;
+ tp->segs_in = 0;
+ tp->segs_out = 0;
tp->bytes_sent = 0;
+ tp->bytes_acked = 0;
+ tp->bytes_received = 0;
tp->bytes_retrans = 0;
+ tp->data_segs_in = 0;
+ tp->data_segs_out = 0;
+ tp->duplicate_sack[0].start_seq = 0;
+ tp->duplicate_sack[0].end_seq = 0;
tp->dsack_dups = 0;
tp->reord_seen = 0;
+ tp->retrans_out = 0;
+ tp->sacked_out = 0;
+ tp->tlp_high_seq = 0;
+ tp->last_oow_ack_time = 0;
+ tp->plb_rehash = 0;
+ /* There's a bubble in the pipe until at least the first ACK. */
+ tp->app_limited = ~0U;
+ tp->rate_app_limited = 1;
+ tp->rack.mstamp = 0;
+ tp->rack.advanced = 0;
+ tp->rack.reo_wnd_steps = 1;
+ tp->rack.last_delivered = 0;
+ tp->rack.reo_wnd_persist = 0;
+ tp->rack.dsack_seen = 0;
+ tp->syn_data_acked = 0;
+ tp->syn_fastopen_child = 0;
+ tp->rx_opt.saw_tstamp = 0;
+ tp->rx_opt.dsack = 0;
+ tp->rx_opt.num_sacks = 0;
+ tp->rcv_ooopack = 0;
+
/* Clean up fastopen related fields */
+ req = rcu_dereference_protected(tp->fastopen_rsk,
+ lockdep_sock_is_held(sk));
+ if (req)
+ reqsk_fastopen_remove(sk, req, false);
tcp_free_fastopen_req(tp);
- inet->defer_connect = 0;
+ inet_clear_bit(DEFER_CONNECT, sk);
+ tp->fastopen_client_fail = 0;
WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
@@ -2609,19 +3488,18 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_frag.page = NULL;
sk->sk_frag.offset = 0;
}
-
- sk->sk_error_report(sk);
+ sk_error_report(sk);
return 0;
}
EXPORT_SYMBOL(tcp_disconnect);
static inline bool tcp_can_repair_sock(const struct sock *sk)
{
- return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
+ return sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
(sk->sk_state != TCP_LISTEN);
}
-static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
+static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len)
{
struct tcp_repair_window opt;
@@ -2631,7 +3509,7 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l
if (len != sizeof(opt))
return -EINVAL;
- if (copy_from_user(&opt, optbuf, sizeof(opt)))
+ if (copy_from_sockptr(&opt, optbuf, sizeof(opt)))
return -EFAULT;
if (opt.max_window < opt.snd_wnd)
@@ -2653,17 +3531,18 @@ static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int l
return 0;
}
-static int tcp_repair_options_est(struct sock *sk,
- struct tcp_repair_opt __user *optbuf, unsigned int len)
+static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf,
+ unsigned int len)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_repair_opt opt;
+ size_t offset = 0;
while (len >= sizeof(opt)) {
- if (copy_from_user(&opt, optbuf, sizeof(opt)))
+ if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt)))
return -EFAULT;
- optbuf++;
+ offset += sizeof(opt);
len -= sizeof(opt);
switch (opt.opt_code) {
@@ -2702,11 +3581,248 @@ static int tcp_repair_options_est(struct sock *sk,
return 0;
}
+DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled);
+EXPORT_IPV6_MOD(tcp_tx_delay_enabled);
+
+static void tcp_enable_tx_delay(struct sock *sk, int val)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ s32 delta = (val - tp->tcp_tx_delay) << 3;
+
+ if (val && !static_branch_unlikely(&tcp_tx_delay_enabled)) {
+ static int __tcp_tx_delay_enabled = 0;
+
+ if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) {
+ static_branch_enable(&tcp_tx_delay_enabled);
+ pr_info("TCP_TX_DELAY enabled\n");
+ }
+ }
+ /* If we change tcp_tx_delay on a live flow, adjust tp->srtt_us,
+ * tp->rtt_min, icsk_rto and sk->sk_pacing_rate.
+ * This is best effort.
+ */
+ if (delta && sk->sk_state == TCP_ESTABLISHED) {
+ s64 srtt = (s64)tp->srtt_us + delta;
+
+ tp->srtt_us = clamp_t(s64, srtt, 1, ~0U);
+
+ /* Note: does not deal with non zero icsk_backoff */
+ tcp_set_rto(sk);
+
+ minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U);
+
+ tcp_update_pacing_rate(sk);
+ }
+}
+
+/* When set indicates to always queue non-full frames. Later the user clears
+ * this option and we transmit any pending partial frames in the queue. This is
+ * meant to be used alongside sendfile() to get properly filled frames when the
+ * user (for example) must write out headers with a write() call first and then
+ * use sendfile to send out the data parts.
+ *
+ * TCP_CORK can be set together with TCP_NODELAY and it is stronger than
+ * TCP_NODELAY.
+ */
+void __tcp_sock_set_cork(struct sock *sk, bool on)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (on) {
+ tp->nonagle |= TCP_NAGLE_CORK;
+ } else {
+ tp->nonagle &= ~TCP_NAGLE_CORK;
+ if (tp->nonagle & TCP_NAGLE_OFF)
+ tp->nonagle |= TCP_NAGLE_PUSH;
+ tcp_push_pending_frames(sk);
+ }
+}
+
+void tcp_sock_set_cork(struct sock *sk, bool on)
+{
+ lock_sock(sk);
+ __tcp_sock_set_cork(sk, on);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_cork);
+
+/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is
+ * remembered, but it is not activated until cork is cleared.
+ *
+ * However, when TCP_NODELAY is set we make an explicit push, which overrides
+ * even TCP_CORK for currently queued segments.
+ */
+void __tcp_sock_set_nodelay(struct sock *sk, bool on)
+{
+ if (on) {
+ tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
+ tcp_push_pending_frames(sk);
+ } else {
+ tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
+ }
+}
+
+void tcp_sock_set_nodelay(struct sock *sk)
+{
+ lock_sock(sk);
+ __tcp_sock_set_nodelay(sk, true);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_nodelay);
+
+static void __tcp_sock_set_quickack(struct sock *sk, int val)
+{
+ if (!val) {
+ inet_csk_enter_pingpong_mode(sk);
+ return;
+ }
+
+ inet_csk_exit_pingpong_mode(sk);
+ if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
+ inet_csk_ack_scheduled(sk)) {
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
+ tcp_cleanup_rbuf(sk, 1);
+ if (!(val & 1))
+ inet_csk_enter_pingpong_mode(sk);
+ }
+}
+
+void tcp_sock_set_quickack(struct sock *sk, int val)
+{
+ lock_sock(sk);
+ __tcp_sock_set_quickack(sk, val);
+ release_sock(sk);
+}
+EXPORT_SYMBOL(tcp_sock_set_quickack);
+
+int tcp_sock_set_syncnt(struct sock *sk, int val)
+{
+ if (val < 1 || val > MAX_TCP_SYNCNT)
+ return -EINVAL;
+
+ WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_syncnt);
+
+int tcp_sock_set_user_timeout(struct sock *sk, int val)
+{
+ /* Cap the max time in ms TCP will retry or probe the window
+ * before giving up and aborting (ETIMEDOUT) a connection.
+ */
+ if (val < 0)
+ return -EINVAL;
+
+ WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_user_timeout);
+
+int tcp_sock_set_keepidle_locked(struct sock *sk, int val)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (val < 1 || val > MAX_TCP_KEEPIDLE)
+ return -EINVAL;
+
+ /* Paired with READ_ONCE() in keepalive_time_when() */
+ WRITE_ONCE(tp->keepalive_time, val * HZ);
+ if (sock_flag(sk, SOCK_KEEPOPEN) &&
+ !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
+ u32 elapsed = keepalive_time_elapsed(tp);
+
+ if (tp->keepalive_time > elapsed)
+ elapsed = tp->keepalive_time - elapsed;
+ else
+ elapsed = 0;
+ tcp_reset_keepalive_timer(sk, elapsed);
+ }
+
+ return 0;
+}
+
+int tcp_sock_set_keepidle(struct sock *sk, int val)
+{
+ int err;
+
+ lock_sock(sk);
+ err = tcp_sock_set_keepidle_locked(sk, val);
+ release_sock(sk);
+ return err;
+}
+EXPORT_SYMBOL(tcp_sock_set_keepidle);
+
+int tcp_sock_set_keepintvl(struct sock *sk, int val)
+{
+ if (val < 1 || val > MAX_TCP_KEEPINTVL)
+ return -EINVAL;
+
+ WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_keepintvl);
+
+int tcp_sock_set_keepcnt(struct sock *sk, int val)
+{
+ if (val < 1 || val > MAX_TCP_KEEPCNT)
+ return -EINVAL;
+
+ /* Paired with READ_ONCE() in keepalive_probes() */
+ WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val);
+ return 0;
+}
+EXPORT_SYMBOL(tcp_sock_set_keepcnt);
+
+int tcp_set_window_clamp(struct sock *sk, int val)
+{
+ u32 old_window_clamp, new_window_clamp, new_rcv_ssthresh;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!val) {
+ if (sk->sk_state != TCP_CLOSE)
+ return -EINVAL;
+ WRITE_ONCE(tp->window_clamp, 0);
+ return 0;
+ }
+
+ old_window_clamp = tp->window_clamp;
+ new_window_clamp = max_t(int, SOCK_MIN_RCVBUF / 2, val);
+
+ if (new_window_clamp == old_window_clamp)
+ return 0;
+
+ WRITE_ONCE(tp->window_clamp, new_window_clamp);
+
+ /* Need to apply the reserved mem provisioning only
+ * when shrinking the window clamp.
+ */
+ if (new_window_clamp < old_window_clamp) {
+ __tcp_adjust_rcv_ssthresh(sk, new_window_clamp);
+ } else {
+ new_rcv_ssthresh = min(tp->rcv_wnd, new_window_clamp);
+ tp->rcv_ssthresh = max(new_rcv_ssthresh, tp->rcv_ssthresh);
+ }
+ return 0;
+}
+
+int tcp_sock_set_maxseg(struct sock *sk, int val)
+{
+ /* Values greater than interface MTU won't take effect. However
+ * at the point when this call is done we typically don't yet
+ * know which interface is going to be used
+ */
+ if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW))
+ return -EINVAL;
+
+ WRITE_ONCE(tcp_sk(sk)->rx_opt.user_mss, val);
+ return 0;
+}
+
/*
* Socket option code for TCP.
*/
-static int do_tcp_setsockopt(struct sock *sk, int level,
- int optname, char __user *optval, unsigned int optlen)
+int do_tcp_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2722,15 +3838,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
if (optlen < 1)
return -EINVAL;
- val = strncpy_from_user(name, optval,
+ val = strncpy_from_sockptr(name, optval,
min_t(long, TCP_CA_NAME_MAX-1, optlen));
if (val < 0)
return -EFAULT;
name[val] = 0;
- lock_sock(sk);
- err = tcp_set_congestion_control(sk, name, true, true);
- release_sock(sk);
+ sockopt_lock_sock(sk);
+ err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(),
+ sockopt_ns_capable(sock_net(sk)->user_ns,
+ CAP_NET_ADMIN));
+ sockopt_release_sock(sk);
return err;
}
case TCP_ULP: {
@@ -2739,28 +3857,36 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
if (optlen < 1)
return -EINVAL;
- val = strncpy_from_user(name, optval,
+ val = strncpy_from_sockptr(name, optval,
min_t(long, TCP_ULP_NAME_MAX - 1,
optlen));
if (val < 0)
return -EFAULT;
name[val] = 0;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
err = tcp_set_ulp(sk, name);
- release_sock(sk);
+ sockopt_release_sock(sk);
return err;
}
case TCP_FASTOPEN_KEY: {
- __u8 key[TCP_FASTOPEN_KEY_LENGTH];
+ __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH];
+ __u8 *backup_key = NULL;
- if (optlen != sizeof(key))
+ /* Allow a backup key as well to facilitate key rotation
+ * First key is the active one.
+ */
+ if (optlen != TCP_FASTOPEN_KEY_LENGTH &&
+ optlen != TCP_FASTOPEN_KEY_BUF_LENGTH)
return -EINVAL;
- if (copy_from_user(key, optval, optlen))
+ if (copy_from_sockptr(key, optval, optlen))
return -EFAULT;
- return tcp_fastopen_reset_cipher(net, sk, key, sizeof(key));
+ if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH)
+ backup_key = key + TCP_FASTOPEN_KEY_LENGTH;
+
+ return tcp_fastopen_reset_cipher(net, sk, key, backup_key);
}
default:
/* fallthru */
@@ -2770,39 +3896,63 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(val, (int __user *)optval))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
- lock_sock(sk);
-
+ /* Handle options that can be set without locking the socket. */
switch (optname) {
+ case TCP_SYNCNT:
+ return tcp_sock_set_syncnt(sk, val);
+ case TCP_USER_TIMEOUT:
+ return tcp_sock_set_user_timeout(sk, val);
+ case TCP_KEEPINTVL:
+ return tcp_sock_set_keepintvl(sk, val);
+ case TCP_KEEPCNT:
+ return tcp_sock_set_keepcnt(sk, val);
+ case TCP_LINGER2:
+ if (val < 0)
+ WRITE_ONCE(tp->linger2, -1);
+ else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
+ WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
+ else
+ WRITE_ONCE(tp->linger2, val * HZ);
+ return 0;
+ case TCP_DEFER_ACCEPT:
+ /* Translate value in seconds to number of retransmits */
+ WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
+ secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
+ TCP_RTO_MAX / HZ));
+ return 0;
+ case TCP_RTO_MAX_MS:
+ if (val < MSEC_PER_SEC || val > TCP_RTO_MAX_SEC * MSEC_PER_SEC)
+ return -EINVAL;
+ WRITE_ONCE(inet_csk(sk)->icsk_rto_max, msecs_to_jiffies(val));
+ return 0;
+ case TCP_RTO_MIN_US: {
+ int rto_min = usecs_to_jiffies(val);
+
+ if (rto_min > TCP_RTO_MIN || rto_min < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ WRITE_ONCE(inet_csk(sk)->icsk_rto_min, rto_min);
+ return 0;
+ }
+ case TCP_DELACK_MAX_US: {
+ int delack_max = usecs_to_jiffies(val);
+
+ if (delack_max > TCP_DELACK_MAX || delack_max < TCP_TIMEOUT_MIN)
+ return -EINVAL;
+ WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max);
+ return 0;
+ }
case TCP_MAXSEG:
- /* Values greater than interface MTU won't take effect. However
- * at the point when this call is done we typically don't yet
- * know which interface is going to be used
- */
- if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) {
- err = -EINVAL;
- break;
- }
- tp->rx_opt.user_mss = val;
- break;
+ return tcp_sock_set_maxseg(sk, val);
+ }
+ sockopt_lock_sock(sk);
+
+ switch (optname) {
case TCP_NODELAY:
- if (val) {
- /* TCP_NODELAY is weaker than TCP_CORK, so that
- * this option on corked socket is remembered, but
- * it is not activated until cork is cleared.
- *
- * However, when TCP_NODELAY is set we make
- * an explicit push, which overrides even TCP_CORK
- * for currently queued segments.
- */
- tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
- tcp_push_pending_frames(sk);
- } else {
- tp->nonagle &= ~TCP_NAGLE_OFF;
- }
+ __tcp_sock_set_nodelay(sk, val);
break;
case TCP_THIN_LINEAR_TIMEOUTS:
@@ -2846,155 +3996,92 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
break;
case TCP_QUEUE_SEQ:
- if (sk->sk_state != TCP_CLOSE)
+ if (sk->sk_state != TCP_CLOSE) {
err = -EPERM;
- else if (tp->repair_queue == TCP_SEND_QUEUE)
- tp->write_seq = val;
- else if (tp->repair_queue == TCP_RECV_QUEUE)
- tp->rcv_nxt = val;
- else
+ } else if (tp->repair_queue == TCP_SEND_QUEUE) {
+ if (!tcp_rtx_queue_empty(sk))
+ err = -EPERM;
+ else
+ WRITE_ONCE(tp->write_seq, val);
+ } else if (tp->repair_queue == TCP_RECV_QUEUE) {
+ if (tp->rcv_nxt != tp->copied_seq) {
+ err = -EPERM;
+ } else {
+ WRITE_ONCE(tp->rcv_nxt, val);
+ WRITE_ONCE(tp->copied_seq, val);
+ }
+ } else {
err = -EINVAL;
+ }
break;
case TCP_REPAIR_OPTIONS:
if (!tp->repair)
err = -EINVAL;
- else if (sk->sk_state == TCP_ESTABLISHED)
- err = tcp_repair_options_est(sk,
- (struct tcp_repair_opt __user *)optval,
- optlen);
+ else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent)
+ err = tcp_repair_options_est(sk, optval, optlen);
else
err = -EPERM;
break;
case TCP_CORK:
- /* When set indicates to always queue non-full frames.
- * Later the user clears this option and we transmit
- * any pending partial frames in the queue. This is
- * meant to be used alongside sendfile() to get properly
- * filled frames when the user (for example) must write
- * out headers with a write() call first and then use
- * sendfile to send out the data parts.
- *
- * TCP_CORK can be set together with TCP_NODELAY and it is
- * stronger than TCP_NODELAY.
- */
- if (val) {
- tp->nonagle |= TCP_NAGLE_CORK;
- } else {
- tp->nonagle &= ~TCP_NAGLE_CORK;
- if (tp->nonagle&TCP_NAGLE_OFF)
- tp->nonagle |= TCP_NAGLE_PUSH;
- tcp_push_pending_frames(sk);
- }
+ __tcp_sock_set_cork(sk, val);
break;
case TCP_KEEPIDLE:
- if (val < 1 || val > MAX_TCP_KEEPIDLE)
- err = -EINVAL;
- else {
- tp->keepalive_time = val * HZ;
- if (sock_flag(sk, SOCK_KEEPOPEN) &&
- !((1 << sk->sk_state) &
- (TCPF_CLOSE | TCPF_LISTEN))) {
- u32 elapsed = keepalive_time_elapsed(tp);
- if (tp->keepalive_time > elapsed)
- elapsed = tp->keepalive_time - elapsed;
- else
- elapsed = 0;
- inet_csk_reset_keepalive_timer(sk, elapsed);
- }
- }
+ err = tcp_sock_set_keepidle_locked(sk, val);
break;
- case TCP_KEEPINTVL:
- if (val < 1 || val > MAX_TCP_KEEPINTVL)
- err = -EINVAL;
- else
- tp->keepalive_intvl = val * HZ;
- break;
- case TCP_KEEPCNT:
- if (val < 1 || val > MAX_TCP_KEEPCNT)
- err = -EINVAL;
- else
- tp->keepalive_probes = val;
- break;
- case TCP_SYNCNT:
- if (val < 1 || val > MAX_TCP_SYNCNT)
- err = -EINVAL;
- else
- icsk->icsk_syn_retries = val;
- break;
-
case TCP_SAVE_SYN:
- if (val < 0 || val > 1)
+ /* 0: disable, 1: enable, 2: start from ether_header */
+ if (val < 0 || val > 2)
err = -EINVAL;
else
tp->save_syn = val;
break;
- case TCP_LINGER2:
- if (val < 0)
- tp->linger2 = -1;
- else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
- tp->linger2 = 0;
- else
- tp->linger2 = val * HZ;
- break;
-
- case TCP_DEFER_ACCEPT:
- /* Translate value in seconds to number of retransmits */
- icsk->icsk_accept_queue.rskq_defer_accept =
- secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
- TCP_RTO_MAX / HZ);
- break;
-
case TCP_WINDOW_CLAMP:
- if (!val) {
- if (sk->sk_state != TCP_CLOSE) {
- err = -EINVAL;
- break;
- }
- tp->window_clamp = 0;
- } else
- tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
- SOCK_MIN_RCVBUF / 2 : val;
+ err = tcp_set_window_clamp(sk, val);
break;
case TCP_QUICKACK:
- if (!val) {
- icsk->icsk_ack.pingpong = 1;
- } else {
- icsk->icsk_ack.pingpong = 0;
- if ((1 << sk->sk_state) &
- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
- inet_csk_ack_scheduled(sk)) {
- icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
- tcp_cleanup_rbuf(sk, 1);
- if (!(val & 1))
- icsk->icsk_ack.pingpong = 1;
- }
- }
+ __tcp_sock_set_quickack(sk, val);
break;
+ case TCP_AO_REPAIR:
+ if (!tcp_can_repair_sock(sk)) {
+ err = -EPERM;
+ break;
+ }
+ err = tcp_ao_set_repair(sk, optval, optlen);
+ break;
+#ifdef CONFIG_TCP_AO
+ case TCP_AO_ADD_KEY:
+ case TCP_AO_DEL_KEY:
+ case TCP_AO_INFO: {
+ /* If this is the first TCP-AO setsockopt() on the socket,
+ * sk_state has to be LISTEN or CLOSE. Allow TCP_REPAIR
+ * in any state.
+ */
+ if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
+ goto ao_parse;
+ if (rcu_dereference_protected(tcp_sk(sk)->ao_info,
+ lockdep_sock_is_held(sk)))
+ goto ao_parse;
+ if (tp->repair)
+ goto ao_parse;
+ err = -EISCONN;
+ break;
+ao_parse:
+ err = tp->af_specific->ao_parse(sk, optname, optval, optlen);
+ break;
+ }
+#endif
#ifdef CONFIG_TCP_MD5SIG
case TCP_MD5SIG:
case TCP_MD5SIG_EXT:
- if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
- err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
- else
- err = -EINVAL;
+ err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
break;
#endif
- case TCP_USER_TIMEOUT:
- /* Cap the max time in ms TCP will retry or probe the window
- * before giving up and aborting (ETIMEDOUT) a connection.
- */
- if (val < 0)
- err = -EINVAL;
- else
- icsk->icsk_user_timeout = val;
- break;
-
case TCP_FASTOPEN:
if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
TCPF_LISTEN))) {
@@ -3008,7 +4095,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_FASTOPEN_CONNECT:
if (val > 1 || val < 0) {
err = -EINVAL;
- } else if (net->ipv4.sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+ } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) &
+ TFO_CLIENT_ENABLE) {
if (sk->sk_state == TCP_CLOSE)
tp->fastopen_connect = val;
else
@@ -3026,16 +4114,22 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
tp->fastopen_no_cookie = val;
break;
case TCP_TIMESTAMP:
- if (!tp->repair)
+ if (!tp->repair) {
err = -EPERM;
- else
- tp->tsoffset = val - tcp_time_stamp_raw();
+ break;
+ }
+ /* val is an opaque field,
+ * and low order bit contains usec_ts enable bit.
+ * Its a best effort, and we do not care if user makes an error.
+ */
+ tp->tcp_usec_ts = val & 1;
+ WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts));
break;
case TCP_REPAIR_WINDOW:
err = tcp_repair_set_window(tp, optval, optlen);
break;
case TCP_NOTSENT_LOWAT:
- tp->notsent_lowat = val;
+ WRITE_ONCE(tp->notsent_lowat, val);
sk->sk_write_space(sk);
break;
case TCP_INQ:
@@ -3044,38 +4138,36 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
else
tp->recvmsg_inq = val;
break;
+ case TCP_TX_DELAY:
+ /* tp->srtt_us is u32, and is shifted by 3 */
+ if (val < 0 || val >= (1U << (31 - 3))) {
+ err = -EINVAL;
+ break;
+ }
+ tcp_enable_tx_delay(sk, val);
+ WRITE_ONCE(tp->tcp_tx_delay, val);
+ break;
default:
err = -ENOPROTOOPT;
break;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
return err;
}
-int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
+int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
if (level != SOL_TCP)
- return icsk->icsk_af_ops->setsockopt(sk, level, optname,
- optval, optlen);
- return do_tcp_setsockopt(sk, level, optname, optval, optlen);
-}
-EXPORT_SYMBOL(tcp_setsockopt);
-
-#ifdef CONFIG_COMPAT
-int compat_tcp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- if (level != SOL_TCP)
- return inet_csk_compat_setsockopt(sk, level, optname,
- optval, optlen);
+ /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */
+ return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname,
+ optval, optlen);
return do_tcp_setsockopt(sk, level, optname, optval, optlen);
}
-EXPORT_SYMBOL(compat_tcp_setsockopt);
-#endif
+EXPORT_IPV6_MOD(tcp_setsockopt);
static void tcp_get_info_chrono_stats(const struct tcp_sock *tp,
struct tcp_info *info)
@@ -3101,10 +4193,13 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
+ const u8 ect1_idx = INET_ECN_ECT_1 - 1;
+ const u8 ect0_idx = INET_ECN_ECT_0 - 1;
+ const u8 ce_idx = INET_ECN_CE - 1;
+ unsigned long rate;
u32 now;
u64 rate64;
bool slow;
- u32 rate;
memset(info, 0, sizeof(*info));
if (sk->sk_type != SOCK_STREAM)
@@ -3114,23 +4209,23 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
/* Report meaningful fields for all TCP states, including listeners */
rate = READ_ONCE(sk->sk_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
info->tcpi_pacing_rate = rate64;
rate = READ_ONCE(sk->sk_max_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
info->tcpi_max_pacing_rate = rate64;
info->tcpi_reordering = tp->reordering;
- info->tcpi_snd_cwnd = tp->snd_cwnd;
+ info->tcpi_snd_cwnd = tcp_snd_cwnd(tp);
if (info->tcpi_state == TCP_LISTEN) {
/* listeners aliased fields :
* tcpi_unacked -> Number of children ready for accept()
* tcpi_sacked -> max backlog
*/
- info->tcpi_unacked = sk->sk_ack_backlog;
- info->tcpi_sacked = sk->sk_max_ack_backlog;
+ info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog);
+ info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog);
return;
}
@@ -3151,15 +4246,20 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
}
- if (tp->ecn_flags & TCP_ECN_OK)
+ if (tcp_ecn_mode_any(tp))
info->tcpi_options |= TCPI_OPT_ECN;
if (tp->ecn_flags & TCP_ECN_SEEN)
info->tcpi_options |= TCPI_OPT_ECN_SEEN;
if (tp->syn_data_acked)
info->tcpi_options |= TCPI_OPT_SYN_DATA;
+ if (tp->tcp_usec_ts)
+ info->tcpi_options |= TCPI_OPT_USEC_TS;
+ if (tp->syn_fastopen_child)
+ info->tcpi_options |= TCPI_OPT_TFO_CHILD;
info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto);
- info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato);
+ info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato,
+ tcp_delack_max(sk)));
info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
@@ -3192,10 +4292,12 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
tcp_get_info_chrono_stats(tp, info);
info->tcpi_segs_out = tp->segs_out;
- info->tcpi_segs_in = tp->segs_in;
+
+ /* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */
+ info->tcpi_segs_in = READ_ONCE(tp->segs_in);
+ info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in);
info->tcpi_min_rtt = tcp_min_rtt(tp);
- info->tcpi_data_segs_in = tp->data_segs_in;
info->tcpi_data_segs_out = tp->data_segs_out;
info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 1 : 0;
@@ -3208,6 +4310,28 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_bytes_retrans = tp->bytes_retrans;
info->tcpi_dsack_dups = tp->dsack_dups;
info->tcpi_reord_seen = tp->reord_seen;
+ info->tcpi_rcv_ooopack = tp->rcv_ooopack;
+ info->tcpi_snd_wnd = tp->snd_wnd;
+ info->tcpi_rcv_wnd = tp->rcv_wnd;
+ info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash;
+ info->tcpi_fastopen_client_fail = tp->fastopen_client_fail;
+
+ info->tcpi_total_rto = tp->total_rto;
+ info->tcpi_total_rto_recoveries = tp->total_rto_recoveries;
+ info->tcpi_total_rto_time = tp->total_rto_time;
+ if (tp->rto_stamp)
+ info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;
+
+ info->tcpi_accecn_fail_mode = tp->accecn_fail_mode;
+ info->tcpi_accecn_opt_seen = tp->saw_accecn_opt;
+ info->tcpi_received_ce = tp->received_ce;
+ info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx];
+ info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx];
+ info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx];
+ info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx];
+ info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx];
+ info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx];
+
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -3236,16 +4360,35 @@ static size_t tcp_opt_stats_get_size(void)
nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */
+ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */
+ nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */
0;
}
-struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
+/* Returns TTL or hop limit of an incoming packet from skb. */
+static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb)
+{
+ if (skb->protocol == htons(ETH_P_IP))
+ return ip_hdr(skb)->ttl;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ return ipv6_hdr(skb)->hop_limit;
+ else
+ return 0;
+}
+
+struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
+ const struct sk_buff *orig_skb,
+ const struct sk_buff *ack_skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *stats;
struct tcp_info info;
+ unsigned long rate;
u64 rate64;
- u32 rate;
stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
if (!stats)
@@ -3264,17 +4407,18 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
tp->total_retrans, TCP_NLA_PAD);
rate = READ_ONCE(sk->sk_pacing_rate);
- rate64 = rate != ~0U ? rate : ~0ULL;
+ rate64 = (rate != ~0UL) ? rate : ~0ULL;
nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD);
rate64 = tcp_compute_delivery_rate(tp);
nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD);
- nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd);
+ nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp));
nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
- nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
+ nla_put_u8(stats, TCP_NLA_RECUR_RETRANS,
+ READ_ONCE(inet_csk(sk)->icsk_retransmits));
nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
@@ -3289,31 +4433,44 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
TCP_NLA_PAD);
nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
+ nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3);
+ nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash);
+ nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT,
+ max_t(int, 0, tp->write_seq - tp->snd_nxt));
+ nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns,
+ TCP_NLA_PAD);
+ if (ack_skb)
+ nla_put_u8(stats, TCP_NLA_TTL,
+ tcp_skb_ttl_or_hop_limit(ack_skb));
+ nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash);
return stats;
}
-static int do_tcp_getsockopt(struct sock *sk, int level,
- int optname, char __user *optval, int __user *optlen)
+int do_tcp_getsockopt(struct sock *sk, int level,
+ int optname, sockptr_t optval, sockptr_t optlen)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
+ int user_mss;
int val, len;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
- len = min_t(unsigned int, len, sizeof(int));
-
if (len < 0)
return -EINVAL;
+ len = min_t(unsigned int, len, sizeof(int));
+
switch (optname) {
case TCP_MAXSEG:
val = tp->mss_cache;
- if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
- val = tp->rx_opt.user_mss;
+ user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ if (user_mss &&
+ ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+ val = user_mss;
if (tp->repair)
val = tp->rx_opt.mss_clamp;
break;
@@ -3333,32 +4490,34 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = keepalive_probes(tp);
break;
case TCP_SYNCNT:
- val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
+ val = READ_ONCE(icsk->icsk_syn_retries) ? :
+ READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
break;
case TCP_LINGER2:
- val = tp->linger2;
+ val = READ_ONCE(tp->linger2);
if (val >= 0)
- val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
+ val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ;
break;
case TCP_DEFER_ACCEPT:
- val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
- TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ);
+ val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept);
+ val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ,
+ TCP_RTO_MAX / HZ);
break;
case TCP_WINDOW_CLAMP:
- val = tp->window_clamp;
+ val = READ_ONCE(tp->window_clamp);
break;
case TCP_INFO: {
struct tcp_info info;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
tcp_get_info(sk, &info);
len = min_t(unsigned int, len, sizeof(info));
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &info, len))
+ if (copy_to_sockptr(optval, &info, len))
return -EFAULT;
return 0;
}
@@ -3368,7 +4527,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
size_t sz = 0;
int attr;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
ca_ops = icsk->icsk_ca_ops;
@@ -3376,60 +4535,55 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
sz = ca_ops->get_info(sk, ~0U, &attr, &info);
len = min_t(unsigned int, len, sz);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &info, len))
+ if (copy_to_sockptr(optval, &info, len))
return -EFAULT;
return 0;
}
case TCP_QUICKACK:
- val = !icsk->icsk_ack.pingpong;
+ val = !inet_csk_in_pingpong_mode(sk);
break;
case TCP_CONGESTION:
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
len = min_t(unsigned int, len, TCP_CA_NAME_MAX);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
+ if (copy_to_sockptr(optval, icsk->icsk_ca_ops->name, len))
return -EFAULT;
return 0;
case TCP_ULP:
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
len = min_t(unsigned int, len, TCP_ULP_NAME_MAX);
if (!icsk->icsk_ulp_ops) {
- if (put_user(0, optlen))
+ len = 0;
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
return 0;
}
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len))
+ if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len))
return -EFAULT;
return 0;
case TCP_FASTOPEN_KEY: {
- __u8 key[TCP_FASTOPEN_KEY_LENGTH];
- struct tcp_fastopen_context *ctx;
+ u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)];
+ unsigned int key_len;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
- rcu_read_lock();
- ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
- if (ctx)
- memcpy(key, ctx->key, sizeof(key));
- else
- len = 0;
- rcu_read_unlock();
-
- len = min_t(unsigned int, len, sizeof(key));
- if (put_user(len, optlen))
+ key_len = tcp_fastopen_get_cipher(net, icsk, key) *
+ TCP_FASTOPEN_KEY_LENGTH;
+ len = min_t(unsigned int, len, key_len);
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, key, len))
+ if (copy_to_sockptr(optval, key, len))
return -EFAULT;
return 0;
}
@@ -3455,7 +4609,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_REPAIR_WINDOW: {
struct tcp_repair_window opt;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
if (len != sizeof(opt))
@@ -3470,7 +4624,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
opt.rcv_wnd = tp->rcv_wnd;
opt.rcv_wup = tp->rcv_wup;
- if (copy_to_user(optval, &opt, len))
+ if (copy_to_sockptr(optval, &opt, len))
return -EFAULT;
return 0;
}
@@ -3484,11 +4638,11 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_USER_TIMEOUT:
- val = icsk->icsk_user_timeout;
+ val = READ_ONCE(icsk->icsk_user_timeout);
break;
case TCP_FASTOPEN:
- val = icsk->icsk_accept_queue.fastopenq.max_qlen;
+ val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen);
break;
case TCP_FASTOPEN_CONNECT:
@@ -3499,11 +4653,19 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->fastopen_no_cookie;
break;
+ case TCP_TX_DELAY:
+ val = READ_ONCE(tp->tcp_tx_delay);
+ break;
+
case TCP_TIMESTAMP:
- val = tcp_time_stamp_raw() + tp->tsoffset;
+ val = tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset);
+ if (tp->tcp_usec_ts)
+ val |= 1;
+ else
+ val &= ~1;
break;
case TCP_NOTSENT_LOWAT:
- val = tp->notsent_lowat;
+ val = READ_ONCE(tp->notsent_lowat);
break;
case TCP_INQ:
val = tp->recvmsg_inq;
@@ -3512,227 +4674,338 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = tp->save_syn;
break;
case TCP_SAVED_SYN: {
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
if (tp->saved_syn) {
- if (len < tp->saved_syn[0]) {
- if (put_user(tp->saved_syn[0], optlen)) {
- release_sock(sk);
+ if (len < tcp_saved_syn_len(tp->saved_syn)) {
+ len = tcp_saved_syn_len(tp->saved_syn);
+ if (copy_to_sockptr(optlen, &len, sizeof(int))) {
+ sockopt_release_sock(sk);
return -EFAULT;
}
- release_sock(sk);
+ sockopt_release_sock(sk);
return -EINVAL;
}
- len = tp->saved_syn[0];
- if (put_user(len, optlen)) {
- release_sock(sk);
+ len = tcp_saved_syn_len(tp->saved_syn);
+ if (copy_to_sockptr(optlen, &len, sizeof(int))) {
+ sockopt_release_sock(sk);
return -EFAULT;
}
- if (copy_to_user(optval, tp->saved_syn + 1, len)) {
- release_sock(sk);
+ if (copy_to_sockptr(optval, tp->saved_syn->data, len)) {
+ sockopt_release_sock(sk);
return -EFAULT;
}
tcp_saved_syn_free(tp);
- release_sock(sk);
+ sockopt_release_sock(sk);
} else {
- release_sock(sk);
+ sockopt_release_sock(sk);
len = 0;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
}
return 0;
}
#ifdef CONFIG_MMU
case TCP_ZEROCOPY_RECEIVE: {
- struct tcp_zerocopy_receive zc;
+ struct scm_timestamping_internal tss;
+ struct tcp_zerocopy_receive zc = {};
int err;
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
- if (len != sizeof(zc))
+ if (len < 0 ||
+ len < offsetofend(struct tcp_zerocopy_receive, length))
return -EINVAL;
- if (copy_from_user(&zc, optval, len))
+ if (unlikely(len > sizeof(zc))) {
+ err = check_zeroed_sockptr(optval, sizeof(zc),
+ len - sizeof(zc));
+ if (err < 1)
+ return err == 0 ? -EINVAL : err;
+ len = sizeof(zc);
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
+ return -EFAULT;
+ }
+ if (copy_from_sockptr(&zc, optval, len))
return -EFAULT;
- lock_sock(sk);
- err = tcp_zerocopy_receive(sk, &zc);
- release_sock(sk);
- if (!err && copy_to_user(optval, &zc, len))
+ if (zc.reserved)
+ return -EINVAL;
+ if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS))
+ return -EINVAL;
+ sockopt_lock_sock(sk);
+ err = tcp_zerocopy_receive(sk, &zc, &tss);
+ err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname,
+ &zc, &len, err);
+ sockopt_release_sock(sk);
+ if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags))
+ goto zerocopy_rcv_cmsg;
+ switch (len) {
+ case offsetofend(struct tcp_zerocopy_receive, msg_flags):
+ goto zerocopy_rcv_cmsg;
+ case offsetofend(struct tcp_zerocopy_receive, msg_controllen):
+ case offsetofend(struct tcp_zerocopy_receive, msg_control):
+ case offsetofend(struct tcp_zerocopy_receive, flags):
+ case offsetofend(struct tcp_zerocopy_receive, copybuf_len):
+ case offsetofend(struct tcp_zerocopy_receive, copybuf_address):
+ case offsetofend(struct tcp_zerocopy_receive, err):
+ goto zerocopy_rcv_sk_err;
+ case offsetofend(struct tcp_zerocopy_receive, inq):
+ goto zerocopy_rcv_inq;
+ case offsetofend(struct tcp_zerocopy_receive, length):
+ default:
+ goto zerocopy_rcv_out;
+ }
+zerocopy_rcv_cmsg:
+ if (zc.msg_flags & TCP_CMSG_TS)
+ tcp_zc_finalize_rx_tstamp(sk, &zc, &tss);
+ else
+ zc.msg_flags = 0;
+zerocopy_rcv_sk_err:
+ if (!err)
+ zc.err = sock_error(sk);
+zerocopy_rcv_inq:
+ zc.inq = tcp_inq_hint(sk);
+zerocopy_rcv_out:
+ if (!err && copy_to_sockptr(optval, &zc, len))
err = -EFAULT;
return err;
}
#endif
+ case TCP_AO_REPAIR:
+ if (!tcp_can_repair_sock(sk))
+ return -EPERM;
+ return tcp_ao_get_repair(sk, optval, optlen);
+ case TCP_AO_GET_KEYS:
+ case TCP_AO_INFO: {
+ int err;
+
+ sockopt_lock_sock(sk);
+ if (optname == TCP_AO_GET_KEYS)
+ err = tcp_ao_get_mkts(sk, optval, optlen);
+ else
+ err = tcp_ao_get_sock_info(sk, optval, optlen);
+ sockopt_release_sock(sk);
+
+ return err;
+ }
+ case TCP_IS_MPTCP:
+ val = 0;
+ break;
+ case TCP_RTO_MAX_MS:
+ val = jiffies_to_msecs(tcp_rto_max(sk));
+ break;
+ case TCP_RTO_MIN_US:
+ val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_rto_min));
+ break;
+ case TCP_DELACK_MAX_US:
+ val = jiffies_to_usecs(READ_ONCE(inet_csk(sk)->icsk_delack_max));
+ break;
default:
return -ENOPROTOOPT;
}
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &val, len))
+ if (copy_to_sockptr(optval, &val, len))
return -EFAULT;
return 0;
}
-int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
- int __user *optlen)
+bool tcp_bpf_bypass_getsockopt(int level, int optname)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
+ /* TCP do_tcp_getsockopt has optimized getsockopt implementation
+ * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE.
+ */
+ if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE)
+ return true;
- if (level != SOL_TCP)
- return icsk->icsk_af_ops->getsockopt(sk, level, optname,
- optval, optlen);
- return do_tcp_getsockopt(sk, level, optname, optval, optlen);
+ return false;
}
-EXPORT_SYMBOL(tcp_getsockopt);
+EXPORT_IPV6_MOD(tcp_bpf_bypass_getsockopt);
-#ifdef CONFIG_COMPAT
-int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
+int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
+ int __user *optlen)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
if (level != SOL_TCP)
- return inet_csk_compat_getsockopt(sk, level, optname,
- optval, optlen);
- return do_tcp_getsockopt(sk, level, optname, optval, optlen);
+ /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */
+ return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname,
+ optval, optlen);
+ return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval),
+ USER_SOCKPTR(optlen));
}
-EXPORT_SYMBOL(compat_tcp_getsockopt);
-#endif
+EXPORT_IPV6_MOD(tcp_getsockopt);
#ifdef CONFIG_TCP_MD5SIG
-static DEFINE_PER_CPU(struct tcp_md5sig_pool, tcp_md5sig_pool);
-static DEFINE_MUTEX(tcp_md5sig_mutex);
-static bool tcp_md5sig_pool_populated = false;
-
-static void __tcp_alloc_md5sig_pool(void)
+void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb,
+ unsigned int header_len)
{
- struct crypto_ahash *hash;
- int cpu;
+ const unsigned int head_data_len = skb_headlen(skb) > header_len ?
+ skb_headlen(skb) - header_len : 0;
+ const struct skb_shared_info *shi = skb_shinfo(skb);
+ struct sk_buff *frag_iter;
+ unsigned int i;
- hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(hash))
- return;
+ md5_update(ctx, (const u8 *)tcp_hdr(skb) + header_len, head_data_len);
- for_each_possible_cpu(cpu) {
- void *scratch = per_cpu(tcp_md5sig_pool, cpu).scratch;
- struct ahash_request *req;
-
- if (!scratch) {
- scratch = kmalloc_node(sizeof(union tcp_md5sum_block) +
- sizeof(struct tcphdr),
- GFP_KERNEL,
- cpu_to_node(cpu));
- if (!scratch)
- return;
- per_cpu(tcp_md5sig_pool, cpu).scratch = scratch;
+ for (i = 0; i < shi->nr_frags; ++i) {
+ const skb_frag_t *f = &shi->frags[i];
+ u32 p_off, p_len, copied;
+ const void *vaddr;
+ struct page *p;
+
+ skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f),
+ p, p_off, p_len, copied) {
+ vaddr = kmap_local_page(p);
+ md5_update(ctx, vaddr + p_off, p_len);
+ kunmap_local(vaddr);
}
- if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
- continue;
+ }
- req = ahash_request_alloc(hash, GFP_KERNEL);
- if (!req)
- return;
+ skb_walk_frags(skb, frag_iter)
+ tcp_md5_hash_skb_data(ctx, frag_iter, 0);
+}
+EXPORT_IPV6_MOD(tcp_md5_hash_skb_data);
- ahash_request_set_callback(req, 0, NULL, NULL);
+void tcp_md5_hash_key(struct md5_ctx *ctx,
+ const struct tcp_md5sig_key *key)
+{
+ u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */
- per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
- }
- /* before setting tcp_md5sig_pool_populated, we must commit all writes
- * to memory. See smp_rmb() in tcp_get_md5sig_pool()
+ /* We use data_race() because tcp_md5_do_add() might change
+ * key->key under us
*/
- smp_wmb();
- tcp_md5sig_pool_populated = true;
+ data_race(({ md5_update(ctx, key->key, keylen), 0; }));
}
+EXPORT_IPV6_MOD(tcp_md5_hash_key);
-bool tcp_alloc_md5sig_pool(void)
+/* Called with rcu_read_lock() */
+static enum skb_drop_reason
+tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
+ const void *saddr, const void *daddr,
+ int family, int l3index, const __u8 *hash_location)
{
- if (unlikely(!tcp_md5sig_pool_populated)) {
- mutex_lock(&tcp_md5sig_mutex);
-
- if (!tcp_md5sig_pool_populated)
- __tcp_alloc_md5sig_pool();
+ /* This gets called for each TCP segment that has TCP-MD5 option.
+ * We have 2 drop cases:
+ * o An MD5 signature is present, but we're not expecting one.
+ * o The MD5 signature is wrong.
+ */
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_key *key;
+ u8 newhash[16];
+
+ key = tcp_md5_do_lookup(sk, l3index, saddr, family);
+ if (!key) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
+ trace_tcp_hash_md5_unexpected(sk, skb);
+ return SKB_DROP_REASON_TCP_MD5UNEXPECTED;
+ }
- mutex_unlock(&tcp_md5sig_mutex);
+ /* Check the signature.
+ * To support dual stack listeners, we need to handle
+ * IPv4-mapped case.
+ */
+ if (family == AF_INET)
+ tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
+ else
+ tp->af_specific->calc_md5_hash(newhash, key, NULL, skb);
+ if (memcmp(hash_location, newhash, 16) != 0) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
+ trace_tcp_hash_md5_mismatch(sk, skb);
+ return SKB_DROP_REASON_TCP_MD5FAILURE;
}
- return tcp_md5sig_pool_populated;
+ return SKB_NOT_DROPPED_YET;
}
-EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
-
-
-/**
- * tcp_get_md5sig_pool - get md5sig_pool for this user
- *
- * We use percpu structure, so if we succeed, we exit with preemption
- * and BH disabled, to make sure another thread or softirq handling
- * wont try to get same context.
- */
-struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
+#else
+static inline enum skb_drop_reason
+tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb,
+ const void *saddr, const void *daddr,
+ int family, int l3index, const __u8 *hash_location)
{
- local_bh_disable();
-
- if (tcp_md5sig_pool_populated) {
- /* coupled with smp_wmb() in __tcp_alloc_md5sig_pool() */
- smp_rmb();
- return this_cpu_ptr(&tcp_md5sig_pool);
- }
- local_bh_enable();
- return NULL;
+ return SKB_NOT_DROPPED_YET;
}
-EXPORT_SYMBOL(tcp_get_md5sig_pool);
-int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
- const struct sk_buff *skb, unsigned int header_len)
-{
- struct scatterlist sg;
- const struct tcphdr *tp = tcp_hdr(skb);
- struct ahash_request *req = hp->md5_req;
- unsigned int i;
- const unsigned int head_data_len = skb_headlen(skb) > header_len ?
- skb_headlen(skb) - header_len : 0;
- const struct skb_shared_info *shi = skb_shinfo(skb);
- struct sk_buff *frag_iter;
+#endif
- sg_init_table(&sg, 1);
+/* Called with rcu_read_lock() */
+enum skb_drop_reason
+tcp_inbound_hash(struct sock *sk, const struct request_sock *req,
+ const struct sk_buff *skb,
+ const void *saddr, const void *daddr,
+ int family, int dif, int sdif)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ const struct tcp_ao_hdr *aoh;
+ const __u8 *md5_location;
+ int l3index;
+
+ /* Invalid option or two times meet any of auth options */
+ if (tcp_parse_auth_options(th, &md5_location, &aoh)) {
+ trace_tcp_hash_bad_header(sk, skb);
+ return SKB_DROP_REASON_TCP_AUTH_HDR;
+ }
- sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
- ahash_request_set_crypt(req, &sg, NULL, head_data_len);
- if (crypto_ahash_update(req))
- return 1;
+ if (req) {
+ if (tcp_rsk_used_ao(req) != !!aoh) {
+ u8 keyid, rnext, maclen;
- for (i = 0; i < shi->nr_frags; ++i) {
- const struct skb_frag_struct *f = &shi->frags[i];
- unsigned int offset = f->page_offset;
- struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
+ if (aoh) {
+ keyid = aoh->keyid;
+ rnext = aoh->rnext_keyid;
+ maclen = tcp_ao_hdr_maclen(aoh);
+ } else {
+ keyid = rnext = maclen = 0;
+ }
- sg_set_page(&sg, page, skb_frag_size(f),
- offset_in_page(offset));
- ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
- if (crypto_ahash_update(req))
- return 1;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
+ trace_tcp_ao_handshake_failure(sk, skb, keyid, rnext, maclen);
+ return SKB_DROP_REASON_TCP_AOFAILURE;
+ }
}
- skb_walk_frags(skb, frag_iter)
- if (tcp_md5_hash_skb_data(hp, frag_iter, 0))
- return 1;
-
- return 0;
-}
-EXPORT_SYMBOL(tcp_md5_hash_skb_data);
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and dif is set to the l3mdev
+ */
+ l3index = sdif ? dif : 0;
+
+ /* Fast path: unsigned segments */
+ if (likely(!md5_location && !aoh)) {
+ /* Drop if there's TCP-MD5 or TCP-AO key with any rcvid/sndid
+ * for the remote peer. On TCP-AO established connection
+ * the last key is impossible to remove, so there's
+ * always at least one current_key.
+ */
+ if (tcp_ao_required(sk, saddr, family, l3index, true)) {
+ trace_tcp_hash_ao_required(sk, skb);
+ return SKB_DROP_REASON_TCP_AONOTFOUND;
+ }
+ if (unlikely(tcp_md5_do_lookup(sk, l3index, saddr, family))) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
+ trace_tcp_hash_md5_required(sk, skb);
+ return SKB_DROP_REASON_TCP_MD5NOTFOUND;
+ }
+ return SKB_NOT_DROPPED_YET;
+ }
-int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key)
-{
- struct scatterlist sg;
+ if (aoh)
+ return tcp_inbound_ao_hash(sk, skb, family, req, l3index, aoh);
- sg_init_one(&sg, key->key, key->keylen);
- ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen);
- return crypto_ahash_update(hp->md5_req);
+ return tcp_inbound_md5_hash(sk, skb, saddr, daddr, family,
+ l3index, md5_location);
}
-EXPORT_SYMBOL(tcp_md5_hash_key);
-
-#endif
+EXPORT_IPV6_MOD_GPL(tcp_inbound_hash);
void tcp_done(struct sock *sk)
{
- struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
+ struct request_sock *req;
+
+ /* We might be called with a new socket, after
+ * inet_csk_prepare_forced_close() has been called
+ * so we can not use lockdep_sock_is_held(sk)
+ */
+ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1);
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
@@ -3742,7 +5015,7 @@ void tcp_done(struct sock *sk)
if (req)
reqsk_fastopen_remove(sk, req, false);
- sk->sk_shutdown = SHUTDOWN_MASK;
+ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_state_change(sk);
@@ -3753,20 +5026,37 @@ EXPORT_SYMBOL_GPL(tcp_done);
int tcp_abort(struct sock *sk, int err)
{
- if (!sk_fullsock(sk)) {
- if (sk->sk_state == TCP_NEW_SYN_RECV) {
- struct request_sock *req = inet_reqsk(sk);
+ int state = inet_sk_state_load(sk);
- local_bh_disable();
- inet_csk_reqsk_queue_drop(req->rsk_listener, req);
- local_bh_enable();
- return 0;
- }
- return -EOPNOTSUPP;
+ if (state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+
+ local_bh_disable();
+ inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ local_bh_enable();
+ return 0;
}
+ if (state == TCP_TIME_WAIT) {
+ struct inet_timewait_sock *tw = inet_twsk(sk);
- /* Don't race with userspace socket closes such as tcp_close. */
- lock_sock(sk);
+ refcount_inc(&tw->tw_refcnt);
+ local_bh_disable();
+ inet_twsk_deschedule_put(tw);
+ local_bh_enable();
+ return 0;
+ }
+
+ /* BPF context ensures sock locking. */
+ if (!has_current_bpf_ctx())
+ /* Don't race with userspace socket closes such as tcp_close. */
+ lock_sock(sk);
+
+ /* Avoid closing the same socket twice. */
+ if (sk->sk_state == TCP_CLOSE) {
+ if (!has_current_bpf_ctx())
+ release_sock(sk);
+ return -ENOENT;
+ }
if (sk->sk_state == TCP_LISTEN) {
tcp_set_state(sk, TCP_CLOSE);
@@ -3777,20 +5067,15 @@ int tcp_abort(struct sock *sk, int err)
local_bh_disable();
bh_lock_sock(sk);
- if (!sock_flag(sk, SOCK_DEAD)) {
- sk->sk_err = err;
- /* This barrier is coupled with smp_rmb() in tcp_poll() */
- smp_wmb();
- sk->sk_error_report(sk);
- if (tcp_need_reset(sk->sk_state))
- tcp_send_active_reset(sk, GFP_ATOMIC);
- tcp_done(sk);
- }
+ if (tcp_need_reset(sk->sk_state))
+ tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_TCP_STATE);
+ tcp_done_with_error(sk, err);
bh_unlock_sock(sk);
local_bh_enable();
- tcp_write_queue_purge(sk);
- release_sock(sk);
+ if (!has_current_bpf_ctx())
+ release_sock(sk);
return 0;
}
EXPORT_SYMBOL_GPL(tcp_abort);
@@ -3823,25 +5108,130 @@ static void __init tcp_init_mem(void)
sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */
}
+static void __init tcp_struct_check(void)
+{
+ /* TX read-mostly hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, rcv_ssthresh);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint);
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, tcp_clean_acked);
+#endif
+
+ /* TXRX read-mostly hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_wnd);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, mss_cache);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_cwnd);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, prr_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio);
+
+ /* RX read-mostly hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, retrans_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, advmss);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, urg_data);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, lost);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh);
+
+ /* TX read-write hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, data_segs_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, bytes_sent);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, snd_sml);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_start);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_stat);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, write_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, pushed_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
+
+ /* TXRX read-write hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_clock_cache);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, tcp_mstamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, window_clamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, srtt_us);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, packets_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);
+
+ /* RX read-write hotpath cache lines */
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, data_segs_in);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_wup);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, max_packets_out);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, cwnd_usage_seq);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space);
+}
+
void __init tcp_init(void)
{
int max_rshare, max_wshare, cnt;
unsigned long limit;
unsigned int i;
+ BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE);
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
- FIELD_SIZEOF(struct sk_buff, cb));
+ sizeof_field(struct sk_buff, cb));
+
+ tcp_struct_check();
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
- percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
- inet_hashinfo_init(&tcp_hashinfo);
+
+ timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
+ mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
+
inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
thash_entries, 21, /* one slot per 2 MB*/
0, 64 * 1024);
tcp_hashinfo.bind_bucket_cachep =
kmem_cache_create("tcp_bind_bucket",
sizeof(struct inet_bind_bucket), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+ SLAB_ACCOUNT,
+ NULL);
+ tcp_hashinfo.bind2_bucket_cachep =
+ kmem_cache_create("tcp_bind2_bucket",
+ sizeof(struct inet_bind2_bucket), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+ SLAB_ACCOUNT,
+ NULL);
/* Size and allocate the main established and bind bucket
* hash tables.
@@ -3865,7 +5255,7 @@ void __init tcp_init(void)
panic("TCP: failed to alloc ehash_locks");
tcp_hashinfo.bhash =
alloc_large_system_hash("TCP bind",
- sizeof(struct inet_bind_hashbucket),
+ 2 * sizeof(struct inet_bind_hashbucket),
tcp_hashinfo.ehash_mask + 1,
17, /* one slot per 128 KB of memory */
0,
@@ -3874,11 +5264,15 @@ void __init tcp_init(void)
0,
64 * 1024);
tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
+ tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size;
for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
spin_lock_init(&tcp_hashinfo.bhash[i].lock);
INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
+ spin_lock_init(&tcp_hashinfo.bhash2[i].lock);
+ INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
}
+ tcp_hashinfo.pernet = false;
cnt = tcp_hashinfo.ehash_mask + 1;
sysctl_tcp_max_orphans = cnt / 2;
@@ -3887,15 +5281,15 @@ void __init tcp_init(void)
/* Set per-socket limits to no more than 1/128 the pressure threshold */
limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
max_wshare = min(4UL*1024*1024, limit);
- max_rshare = min(6UL*1024*1024, limit);
+ max_rshare = min(32UL*1024*1024, limit);
- init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
+ init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
- init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
- init_net.ipv4.sysctl_tcp_rmem[1] = 87380;
- init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare);
+ init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE;
+ init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
+ init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);
pr_info("Hash tables configured (established %u bind %u)\n",
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
@@ -3903,5 +5297,6 @@ void __init tcp_init(void)
tcp_v4_init();
tcp_metrics_init();
BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
- tcp_tasklet_init();
+ tcp_tsq_work_init();
+ mptcp_init();
}
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
new file mode 100644
index 000000000000..34b8450829d0
--- /dev/null
+++ b/net/ipv4/tcp_ao.c
@@ -0,0 +1,2442 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET An implementation of the TCP Authentication Option (TCP-AO).
+ * See RFC5925.
+ *
+ * Authors: Dmitry Safonov <dima@arista.com>
+ * Francesco Ruggeri <fruggeri@arista.com>
+ * Salam Noureddine <noureddine@arista.com>
+ */
+#define pr_fmt(fmt) "TCP: " fmt
+
+#include <crypto/hash.h>
+#include <linux/inetdevice.h>
+#include <linux/tcp.h>
+
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/icmp.h>
+#include <trace/events/tcp.h>
+
+DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_ao_needed, HZ);
+
+int tcp_ao_calc_traffic_key(struct tcp_ao_key *mkt, u8 *key, void *ctx,
+ unsigned int len, struct tcp_sigpool *hp)
+{
+ struct scatterlist sg;
+ int ret;
+
+ if (crypto_ahash_setkey(crypto_ahash_reqtfm(hp->req),
+ mkt->key, mkt->keylen))
+ goto clear_hash;
+
+ ret = crypto_ahash_init(hp->req);
+ if (ret)
+ goto clear_hash;
+
+ sg_init_one(&sg, ctx, len);
+ ahash_request_set_crypt(hp->req, &sg, key, len);
+ crypto_ahash_update(hp->req);
+
+ ret = crypto_ahash_final(hp->req);
+ if (ret)
+ goto clear_hash;
+
+ return 0;
+clear_hash:
+ memset(key, 0, tcp_ao_digest_size(mkt));
+ return 1;
+}
+
+bool tcp_ao_ignore_icmp(const struct sock *sk, int family, int type, int code)
+{
+ bool ignore_icmp = false;
+ struct tcp_ao_info *ao;
+
+ if (!static_branch_unlikely(&tcp_ao_needed.key))
+ return false;
+
+ /* RFC5925, 7.8:
+ * >> A TCP-AO implementation MUST default to ignore incoming ICMPv4
+ * messages of Type 3 (destination unreachable), Codes 2-4 (protocol
+ * unreachable, port unreachable, and fragmentation needed -- 'hard
+ * errors'), and ICMPv6 Type 1 (destination unreachable), Code 1
+ * (administratively prohibited) and Code 4 (port unreachable) intended
+ * for connections in synchronized states (ESTABLISHED, FIN-WAIT-1, FIN-
+ * WAIT-2, CLOSE-WAIT, CLOSING, LAST-ACK, TIME-WAIT) that match MKTs.
+ */
+ if (family == AF_INET) {
+ if (type != ICMP_DEST_UNREACH)
+ return false;
+ if (code < ICMP_PROT_UNREACH || code > ICMP_FRAG_NEEDED)
+ return false;
+ } else {
+ if (type != ICMPV6_DEST_UNREACH)
+ return false;
+ if (code != ICMPV6_ADM_PROHIBITED && code != ICMPV6_PORT_UNREACH)
+ return false;
+ }
+
+ rcu_read_lock();
+ switch (sk->sk_state) {
+ case TCP_TIME_WAIT:
+ ao = rcu_dereference(tcp_twsk(sk)->ao_info);
+ break;
+ case TCP_SYN_SENT:
+ case TCP_SYN_RECV:
+ case TCP_LISTEN:
+ case TCP_NEW_SYN_RECV:
+ /* RFC5925 specifies to ignore ICMPs *only* on connections
+ * in synchronized states.
+ */
+ rcu_read_unlock();
+ return false;
+ default:
+ ao = rcu_dereference(tcp_sk(sk)->ao_info);
+ }
+
+ if (ao && !ao->accept_icmps) {
+ ignore_icmp = true;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAODROPPEDICMPS);
+ atomic64_inc(&ao->counters.dropped_icmp);
+ }
+ rcu_read_unlock();
+
+ return ignore_icmp;
+}
+
+/* Optimized version of tcp_ao_do_lookup(): only for sockets for which
+ * it's known that the keys in ao_info are matching peer's
+ * family/address/VRF/etc.
+ */
+struct tcp_ao_key *tcp_ao_established_key(const struct sock *sk,
+ struct tcp_ao_info *ao,
+ int sndid, int rcvid)
+{
+ struct tcp_ao_key *key;
+
+ hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) {
+ if ((sndid >= 0 && key->sndid != sndid) ||
+ (rcvid >= 0 && key->rcvid != rcvid))
+ continue;
+ return key;
+ }
+
+ return NULL;
+}
+
+static int ipv4_prefix_cmp(const struct in_addr *addr1,
+ const struct in_addr *addr2,
+ unsigned int prefixlen)
+{
+ __be32 mask = inet_make_mask(prefixlen);
+ __be32 a1 = addr1->s_addr & mask;
+ __be32 a2 = addr2->s_addr & mask;
+
+ if (a1 == a2)
+ return 0;
+ return memcmp(&a1, &a2, sizeof(a1));
+}
+
+static int __tcp_ao_key_cmp(const struct tcp_ao_key *key, int l3index,
+ const union tcp_ao_addr *addr, u8 prefixlen,
+ int family, int sndid, int rcvid)
+{
+ if (sndid >= 0 && key->sndid != sndid)
+ return (key->sndid > sndid) ? 1 : -1;
+ if (rcvid >= 0 && key->rcvid != rcvid)
+ return (key->rcvid > rcvid) ? 1 : -1;
+ if (l3index >= 0 && (key->keyflags & TCP_AO_KEYF_IFINDEX)) {
+ if (key->l3index != l3index)
+ return (key->l3index > l3index) ? 1 : -1;
+ }
+
+ if (family == AF_UNSPEC)
+ return 0;
+ if (key->family != family)
+ return (key->family > family) ? 1 : -1;
+
+ if (family == AF_INET) {
+ if (ntohl(key->addr.a4.s_addr) == INADDR_ANY)
+ return 0;
+ if (ntohl(addr->a4.s_addr) == INADDR_ANY)
+ return 0;
+ return ipv4_prefix_cmp(&key->addr.a4, &addr->a4, prefixlen);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ if (ipv6_addr_any(&key->addr.a6) || ipv6_addr_any(&addr->a6))
+ return 0;
+ if (ipv6_prefix_equal(&key->addr.a6, &addr->a6, prefixlen))
+ return 0;
+ return memcmp(&key->addr.a6, &addr->a6, sizeof(addr->a6));
+#endif
+ }
+ return -1;
+}
+
+static int tcp_ao_key_cmp(const struct tcp_ao_key *key, int l3index,
+ const union tcp_ao_addr *addr, u8 prefixlen,
+ int family, int sndid, int rcvid)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (family == AF_INET6 && ipv6_addr_v4mapped(&addr->a6)) {
+ __be32 addr4 = addr->a6.s6_addr32[3];
+
+ return __tcp_ao_key_cmp(key, l3index,
+ (union tcp_ao_addr *)&addr4,
+ prefixlen, AF_INET, sndid, rcvid);
+ }
+#endif
+ return __tcp_ao_key_cmp(key, l3index, addr,
+ prefixlen, family, sndid, rcvid);
+}
+
+static struct tcp_ao_key *__tcp_ao_do_lookup(const struct sock *sk, int l3index,
+ const union tcp_ao_addr *addr, int family, u8 prefix,
+ int sndid, int rcvid)
+{
+ struct tcp_ao_key *key;
+ struct tcp_ao_info *ao;
+
+ if (!static_branch_unlikely(&tcp_ao_needed.key))
+ return NULL;
+
+ ao = rcu_dereference_check(tcp_sk(sk)->ao_info,
+ lockdep_sock_is_held(sk));
+ if (!ao)
+ return NULL;
+
+ hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) {
+ u8 prefixlen = min(prefix, key->prefixlen);
+
+ if (!tcp_ao_key_cmp(key, l3index, addr, prefixlen,
+ family, sndid, rcvid))
+ return key;
+ }
+ return NULL;
+}
+
+struct tcp_ao_key *tcp_ao_do_lookup(const struct sock *sk, int l3index,
+ const union tcp_ao_addr *addr,
+ int family, int sndid, int rcvid)
+{
+ return __tcp_ao_do_lookup(sk, l3index, addr, family, U8_MAX, sndid, rcvid);
+}
+
+static struct tcp_ao_info *tcp_ao_alloc_info(gfp_t flags)
+{
+ struct tcp_ao_info *ao;
+
+ ao = kzalloc(sizeof(*ao), flags);
+ if (!ao)
+ return NULL;
+ INIT_HLIST_HEAD(&ao->head);
+ refcount_set(&ao->refcnt, 1);
+
+ return ao;
+}
+
+static void tcp_ao_link_mkt(struct tcp_ao_info *ao, struct tcp_ao_key *mkt)
+{
+ hlist_add_head_rcu(&mkt->node, &ao->head);
+}
+
+static struct tcp_ao_key *tcp_ao_copy_key(struct sock *sk,
+ struct tcp_ao_key *key)
+{
+ struct tcp_ao_key *new_key;
+
+ new_key = sock_kmalloc(sk, tcp_ao_sizeof_key(key),
+ GFP_ATOMIC);
+ if (!new_key)
+ return NULL;
+
+ *new_key = *key;
+ INIT_HLIST_NODE(&new_key->node);
+ tcp_sigpool_get(new_key->tcp_sigpool_id);
+ atomic64_set(&new_key->pkt_good, 0);
+ atomic64_set(&new_key->pkt_bad, 0);
+
+ return new_key;
+}
+
+static void tcp_ao_key_free_rcu(struct rcu_head *head)
+{
+ struct tcp_ao_key *key = container_of(head, struct tcp_ao_key, rcu);
+
+ tcp_sigpool_release(key->tcp_sigpool_id);
+ kfree_sensitive(key);
+}
+
+static void tcp_ao_info_free(struct tcp_ao_info *ao)
+{
+ struct tcp_ao_key *key;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(key, n, &ao->head, node) {
+ hlist_del(&key->node);
+ tcp_sigpool_release(key->tcp_sigpool_id);
+ kfree_sensitive(key);
+ }
+ kfree(ao);
+ static_branch_slow_dec_deferred(&tcp_ao_needed);
+}
+
+static void tcp_ao_sk_omem_free(struct sock *sk, struct tcp_ao_info *ao)
+{
+ size_t total_ao_sk_mem = 0;
+ struct tcp_ao_key *key;
+
+ hlist_for_each_entry(key, &ao->head, node)
+ total_ao_sk_mem += tcp_ao_sizeof_key(key);
+ atomic_sub(total_ao_sk_mem, &sk->sk_omem_alloc);
+}
+
+void tcp_ao_destroy_sock(struct sock *sk, bool twsk)
+{
+ struct tcp_ao_info *ao;
+
+ if (twsk) {
+ ao = rcu_dereference_protected(tcp_twsk(sk)->ao_info, 1);
+ rcu_assign_pointer(tcp_twsk(sk)->ao_info, NULL);
+ } else {
+ ao = rcu_dereference_protected(tcp_sk(sk)->ao_info, 1);
+ rcu_assign_pointer(tcp_sk(sk)->ao_info, NULL);
+ }
+
+ if (!ao || !refcount_dec_and_test(&ao->refcnt))
+ return;
+
+ if (!twsk)
+ tcp_ao_sk_omem_free(sk, ao);
+ tcp_ao_info_free(ao);
+}
+
+void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp)
+{
+ struct tcp_ao_info *ao_info = rcu_dereference_protected(tp->ao_info, 1);
+
+ if (ao_info) {
+ struct tcp_ao_key *key;
+ struct hlist_node *n;
+ int omem = 0;
+
+ hlist_for_each_entry_safe(key, n, &ao_info->head, node) {
+ omem += tcp_ao_sizeof_key(key);
+ }
+
+ refcount_inc(&ao_info->refcnt);
+ atomic_sub(omem, &(((struct sock *)tp)->sk_omem_alloc));
+ rcu_assign_pointer(tcptw->ao_info, ao_info);
+ } else {
+ tcptw->ao_info = NULL;
+ }
+}
+
+/* The 4-tuple and ISNs are expected in network byte order (NBO) */
+static int tcp_v4_ao_calc_key(struct tcp_ao_key *mkt, u8 *key,
+ __be32 saddr, __be32 daddr,
+ __be16 sport, __be16 dport,
+ __be32 sisn, __be32 disn)
+{
+ /* See RFC5926 3.1.1 */
+ struct kdf_input_block {
+ u8 counter;
+ u8 label[6];
+ struct tcp4_ao_context ctx;
+ __be16 outlen;
+ } __packed * tmp;
+ struct tcp_sigpool hp;
+ int err;
+
+ err = tcp_sigpool_start(mkt->tcp_sigpool_id, &hp);
+ if (err)
+ return err;
+
+ tmp = hp.scratch;
+ tmp->counter = 1;
+ memcpy(tmp->label, "TCP-AO", 6);
+ tmp->ctx.saddr = saddr;
+ tmp->ctx.daddr = daddr;
+ tmp->ctx.sport = sport;
+ tmp->ctx.dport = dport;
+ tmp->ctx.sisn = sisn;
+ tmp->ctx.disn = disn;
+ tmp->outlen = htons(tcp_ao_digest_size(mkt) * 8); /* in bits */
+
+ err = tcp_ao_calc_traffic_key(mkt, key, tmp, sizeof(*tmp), &hp);
+ tcp_sigpool_end(&hp);
+
+ return err;
+}
+
+int tcp_v4_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
+ const struct sock *sk,
+ __be32 sisn, __be32 disn, bool send)
+{
+ if (send)
+ return tcp_v4_ao_calc_key(mkt, key, sk->sk_rcv_saddr,
+ sk->sk_daddr, htons(sk->sk_num),
+ sk->sk_dport, sisn, disn);
+ else
+ return tcp_v4_ao_calc_key(mkt, key, sk->sk_daddr,
+ sk->sk_rcv_saddr, sk->sk_dport,
+ htons(sk->sk_num), disn, sisn);
+}
+
+static int tcp_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
+ const struct sock *sk,
+ __be32 sisn, __be32 disn, bool send)
+{
+ if (mkt->family == AF_INET)
+ return tcp_v4_ao_calc_key_sk(mkt, key, sk, sisn, disn, send);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (mkt->family == AF_INET6)
+ return tcp_v6_ao_calc_key_sk(mkt, key, sk, sisn, disn, send);
+#endif
+ else
+ return -EOPNOTSUPP;
+}
+
+int tcp_v4_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key,
+ struct request_sock *req)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ return tcp_v4_ao_calc_key(mkt, key,
+ ireq->ir_loc_addr, ireq->ir_rmt_addr,
+ htons(ireq->ir_num), ireq->ir_rmt_port,
+ htonl(tcp_rsk(req)->snt_isn),
+ htonl(tcp_rsk(req)->rcv_isn));
+}
+
+static int tcp_v4_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key,
+ const struct sk_buff *skb,
+ __be32 sisn, __be32 disn)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
+
+ return tcp_v4_ao_calc_key(mkt, key, iph->saddr, iph->daddr,
+ th->source, th->dest, sisn, disn);
+}
+
+static int tcp_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key,
+ const struct sk_buff *skb,
+ __be32 sisn, __be32 disn, int family)
+{
+ if (family == AF_INET)
+ return tcp_v4_ao_calc_key_skb(mkt, key, skb, sisn, disn);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (family == AF_INET6)
+ return tcp_v6_ao_calc_key_skb(mkt, key, skb, sisn, disn);
+#endif
+ return -EAFNOSUPPORT;
+}
+
+static int tcp_v4_ao_hash_pseudoheader(struct tcp_sigpool *hp,
+ __be32 daddr, __be32 saddr,
+ int nbytes)
+{
+ struct tcp4_pseudohdr *bp;
+ struct scatterlist sg;
+
+ bp = hp->scratch;
+ bp->saddr = saddr;
+ bp->daddr = daddr;
+ bp->pad = 0;
+ bp->protocol = IPPROTO_TCP;
+ bp->len = cpu_to_be16(nbytes);
+
+ sg_init_one(&sg, bp, sizeof(*bp));
+ ahash_request_set_crypt(hp->req, &sg, NULL, sizeof(*bp));
+ return crypto_ahash_update(hp->req);
+}
+
+static int tcp_ao_hash_pseudoheader(unsigned short int family,
+ const struct sock *sk,
+ const struct sk_buff *skb,
+ struct tcp_sigpool *hp, int nbytes)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+
+ /* TODO: Can we rely on checksum being zero to mean outbound pkt? */
+ if (!th->check) {
+ if (family == AF_INET)
+ return tcp_v4_ao_hash_pseudoheader(hp, sk->sk_daddr,
+ sk->sk_rcv_saddr, skb->len);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (family == AF_INET6)
+ return tcp_v6_ao_hash_pseudoheader(hp, &sk->sk_v6_daddr,
+ &sk->sk_v6_rcv_saddr, skb->len);
+#endif
+ else
+ return -EAFNOSUPPORT;
+ }
+
+ if (family == AF_INET) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ return tcp_v4_ao_hash_pseudoheader(hp, iph->daddr,
+ iph->saddr, skb->len);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+
+ return tcp_v6_ao_hash_pseudoheader(hp, &iph->daddr,
+ &iph->saddr, skb->len);
+#endif
+ }
+ return -EAFNOSUPPORT;
+}
+
+u32 tcp_ao_compute_sne(u32 next_sne, u32 next_seq, u32 seq)
+{
+ u32 sne = next_sne;
+
+ if (before(seq, next_seq)) {
+ if (seq > next_seq)
+ sne--;
+ } else {
+ if (seq < next_seq)
+ sne++;
+ }
+
+ return sne;
+}
+
+/* tcp_ao_hash_sne(struct tcp_sigpool *hp, u32 sne)
+ * @hp - sigpool providing the scratch buffer and hash request
+ * @sne - Sequence Number Extension, hashed as a 32-bit big-endian value
+ */
+static int tcp_ao_hash_sne(struct tcp_sigpool *hp, u32 sne)
+{
+ struct scatterlist sg;
+ __be32 *bp;
+
+ bp = (__be32 *)hp->scratch;
+ *bp = htonl(sne);
+
+ sg_init_one(&sg, bp, sizeof(*bp));
+ ahash_request_set_crypt(hp->req, &sg, NULL, sizeof(*bp));
+ return crypto_ahash_update(hp->req);
+}
+
+static int tcp_ao_hash_header(struct tcp_sigpool *hp,
+ const struct tcphdr *th,
+ bool exclude_options, u8 *hash,
+ int hash_offset, int hash_len)
+{
+ struct scatterlist sg;
+ u8 *hdr = hp->scratch;
+ int err, len;
+
+ /* We are not allowed to change tcphdr, make a local copy */
+ if (exclude_options) {
+ len = sizeof(*th) + sizeof(struct tcp_ao_hdr) + hash_len;
+ memcpy(hdr, th, sizeof(*th));
+ memcpy(hdr + sizeof(*th),
+ (u8 *)th + hash_offset - sizeof(struct tcp_ao_hdr),
+ sizeof(struct tcp_ao_hdr));
+ memset(hdr + sizeof(*th) + sizeof(struct tcp_ao_hdr),
+ 0, hash_len);
+ ((struct tcphdr *)hdr)->check = 0;
+ } else {
+ len = th->doff << 2;
+ memcpy(hdr, th, len);
+ /* zero out tcp-ao hash */
+ ((struct tcphdr *)hdr)->check = 0;
+ memset(hdr + hash_offset, 0, hash_len);
+ }
+
+ sg_init_one(&sg, hdr, len);
+ ahash_request_set_crypt(hp->req, &sg, NULL, len);
+ err = crypto_ahash_update(hp->req);
+ WARN_ON_ONCE(err != 0);
+ return err;
+}
+
+int tcp_ao_hash_hdr(unsigned short int family, char *ao_hash,
+ struct tcp_ao_key *key, const u8 *tkey,
+ const union tcp_ao_addr *daddr,
+ const union tcp_ao_addr *saddr,
+ const struct tcphdr *th, u32 sne)
+{
+ int tkey_len = tcp_ao_digest_size(key);
+ int hash_offset = ao_hash - (char *)th;
+ struct tcp_sigpool hp;
+ void *hash_buf = NULL;
+
+ hash_buf = kmalloc(tkey_len, GFP_ATOMIC);
+ if (!hash_buf)
+ goto clear_hash_noput;
+
+ if (tcp_sigpool_start(key->tcp_sigpool_id, &hp))
+ goto clear_hash_noput;
+
+ if (crypto_ahash_setkey(crypto_ahash_reqtfm(hp.req), tkey, tkey_len))
+ goto clear_hash;
+
+ if (crypto_ahash_init(hp.req))
+ goto clear_hash;
+
+ if (tcp_ao_hash_sne(&hp, sne))
+ goto clear_hash;
+ if (family == AF_INET) {
+ if (tcp_v4_ao_hash_pseudoheader(&hp, daddr->a4.s_addr,
+ saddr->a4.s_addr, th->doff * 4))
+ goto clear_hash;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ if (tcp_v6_ao_hash_pseudoheader(&hp, &daddr->a6,
+ &saddr->a6, th->doff * 4))
+ goto clear_hash;
+#endif
+ } else {
+ WARN_ON_ONCE(1);
+ goto clear_hash;
+ }
+ if (tcp_ao_hash_header(&hp, th,
+ !!(key->keyflags & TCP_AO_KEYF_EXCLUDE_OPT),
+ ao_hash, hash_offset, tcp_ao_maclen(key)))
+ goto clear_hash;
+ ahash_request_set_crypt(hp.req, NULL, hash_buf, 0);
+ if (crypto_ahash_final(hp.req))
+ goto clear_hash;
+
+ memcpy(ao_hash, hash_buf, tcp_ao_maclen(key));
+ tcp_sigpool_end(&hp);
+ kfree(hash_buf);
+ return 0;
+
+clear_hash:
+ tcp_sigpool_end(&hp);
+clear_hash_noput:
+ memset(ao_hash, 0, tcp_ao_maclen(key));
+ kfree(hash_buf);
+ return 1;
+}
+
+int tcp_ao_hash_skb(unsigned short int family,
+ char *ao_hash, struct tcp_ao_key *key,
+ const struct sock *sk, const struct sk_buff *skb,
+ const u8 *tkey, int hash_offset, u32 sne)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ int tkey_len = tcp_ao_digest_size(key);
+ struct tcp_sigpool hp;
+ void *hash_buf = NULL;
+
+ hash_buf = kmalloc(tkey_len, GFP_ATOMIC);
+ if (!hash_buf)
+ goto clear_hash_noput;
+
+ if (tcp_sigpool_start(key->tcp_sigpool_id, &hp))
+ goto clear_hash_noput;
+
+ if (crypto_ahash_setkey(crypto_ahash_reqtfm(hp.req), tkey, tkey_len))
+ goto clear_hash;
+
+ /* Hash algorithm is the one behind key->tcp_sigpool_id, not a fixed default */
+ if (crypto_ahash_init(hp.req))
+ goto clear_hash;
+
+ if (tcp_ao_hash_sne(&hp, sne))
+ goto clear_hash;
+ if (tcp_ao_hash_pseudoheader(family, sk, skb, &hp, skb->len))
+ goto clear_hash;
+ if (tcp_ao_hash_header(&hp, th,
+ !!(key->keyflags & TCP_AO_KEYF_EXCLUDE_OPT),
+ ao_hash, hash_offset, tcp_ao_maclen(key)))
+ goto clear_hash;
+ if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2))
+ goto clear_hash;
+ ahash_request_set_crypt(hp.req, NULL, hash_buf, 0);
+ if (crypto_ahash_final(hp.req))
+ goto clear_hash;
+
+ memcpy(ao_hash, hash_buf, tcp_ao_maclen(key));
+ tcp_sigpool_end(&hp);
+ kfree(hash_buf);
+ return 0;
+
+clear_hash:
+ tcp_sigpool_end(&hp);
+clear_hash_noput:
+ memset(ao_hash, 0, tcp_ao_maclen(key));
+ kfree(hash_buf);
+ return 1;
+}
+
+int tcp_v4_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
+ const struct sock *sk, const struct sk_buff *skb,
+ const u8 *tkey, int hash_offset, u32 sne)
+{
+ return tcp_ao_hash_skb(AF_INET, ao_hash, key, sk, skb,
+ tkey, hash_offset, sne);
+}
+
+int tcp_v4_ao_synack_hash(char *ao_hash, struct tcp_ao_key *ao_key,
+ struct request_sock *req, const struct sk_buff *skb,
+ int hash_offset, u32 sne)
+{
+ void *hash_buf = NULL;
+ int err;
+
+ hash_buf = kmalloc(tcp_ao_digest_size(ao_key), GFP_ATOMIC);
+ if (!hash_buf)
+ return -ENOMEM;
+
+ err = tcp_v4_ao_calc_key_rsk(ao_key, hash_buf, req);
+ if (err)
+ goto out;
+
+ err = tcp_ao_hash_skb(AF_INET, ao_hash, ao_key, req_to_sk(req), skb,
+ hash_buf, hash_offset, sne);
+out:
+ kfree(hash_buf);
+ return err;
+}
+
+struct tcp_ao_key *tcp_v4_ao_lookup_rsk(const struct sock *sk,
+ struct request_sock *req,
+ int sndid, int rcvid)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ union tcp_ao_addr *addr = (union tcp_ao_addr *)&ireq->ir_rmt_addr;
+ int l3index;
+
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
+ return tcp_ao_do_lookup(sk, l3index, addr, AF_INET, sndid, rcvid);
+}
+
+struct tcp_ao_key *tcp_v4_ao_lookup(const struct sock *sk, struct sock *addr_sk,
+ int sndid, int rcvid)
+{
+ int l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+ addr_sk->sk_bound_dev_if);
+ union tcp_ao_addr *addr = (union tcp_ao_addr *)&addr_sk->sk_daddr;
+
+ return tcp_ao_do_lookup(sk, l3index, addr, AF_INET, sndid, rcvid);
+}
+
+int tcp_ao_prepare_reset(const struct sock *sk, struct sk_buff *skb,
+ const struct tcp_ao_hdr *aoh, int l3index, u32 seq,
+ struct tcp_ao_key **key, char **traffic_key,
+ bool *allocated_traffic_key, u8 *keyid, u32 *sne)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct tcp_ao_info *ao_info;
+
+ *allocated_traffic_key = false;
+ /* If there's no socket - then initial sisn/disn are unknown.
+ * Drop the segment. RFC5925 (7.7) advises to require graceful
+ * restart [RFC4724]. Alternatively, the RFC5925 advises to
+ * save/restore traffic keys before/after reboot.
+ * Linux TCP-AO support provides TCP_AO_ADD_KEY and TCP_AO_REPAIR
+ * options to restore a socket post-reboot.
+ */
+ if (!sk)
+ return -ENOTCONN;
+
+ if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV)) {
+ unsigned int family = READ_ONCE(sk->sk_family);
+ union tcp_ao_addr *addr;
+ __be32 disn, sisn;
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+
+ sisn = htonl(tcp_rsk(req)->rcv_isn);
+ disn = htonl(tcp_rsk(req)->snt_isn);
+ *sne = tcp_ao_compute_sne(0, tcp_rsk(req)->snt_isn, seq);
+ } else {
+ sisn = th->seq;
+ disn = 0;
+ }
+ if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
+ addr = (union tcp_md5_addr *)&ipv6_hdr(skb)->saddr;
+ else
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+#if IS_ENABLED(CONFIG_IPV6)
+ if (family == AF_INET6 && ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+ family = AF_INET;
+#endif
+
+ sk = sk_const_to_full_sk(sk);
+ ao_info = rcu_dereference(tcp_sk(sk)->ao_info);
+ if (!ao_info)
+ return -ENOENT;
+ *key = tcp_ao_do_lookup(sk, l3index, addr, family,
+ -1, aoh->rnext_keyid);
+ if (!*key)
+ return -ENOENT;
+ *traffic_key = kmalloc(tcp_ao_digest_size(*key), GFP_ATOMIC);
+ if (!*traffic_key)
+ return -ENOMEM;
+ *allocated_traffic_key = true;
+ if (tcp_ao_calc_key_skb(*key, *traffic_key, skb,
+ sisn, disn, family))
+ return -1;
+ *keyid = (*key)->rcvid;
+ } else {
+ struct tcp_ao_key *rnext_key;
+ u32 snd_basis;
+
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ ao_info = rcu_dereference(tcp_twsk(sk)->ao_info);
+ snd_basis = tcp_twsk(sk)->tw_snd_nxt;
+ } else {
+ ao_info = rcu_dereference(tcp_sk(sk)->ao_info);
+ snd_basis = tcp_sk(sk)->snd_una;
+ }
+ if (!ao_info)
+ return -ENOENT;
+
+ *key = tcp_ao_established_key(sk, ao_info, aoh->rnext_keyid, -1);
+ if (!*key)
+ return -ENOENT;
+ *traffic_key = snd_other_key(*key);
+ rnext_key = READ_ONCE(ao_info->rnext_key);
+ *keyid = rnext_key->rcvid;
+ *sne = tcp_ao_compute_sne(READ_ONCE(ao_info->snd_sne),
+ snd_basis, seq);
+ }
+ return 0;
+}
+
+/* Write the TCP-AO MAC of an outgoing segment at @hash_location.
+ *
+ * Segments without SYN are hashed with the key returned by snd_other_key().
+ * For SYN segments the traffic key is (re)computed through the per-family
+ * ao_calc_key_sk() callback: a pure SYN uses a temporary buffer and a zero
+ * destination ISN, a SYN-ACK writes into the snd_other_key() buffer using
+ * the stored remote ISN.
+ *
+ * Returns 0, or -ENOMEM when the temporary key buffer cannot be allocated.
+ * The results of the key and hash callbacks are not inspected.
+ */
+int tcp_ao_transmit_skb(struct sock *sk, struct sk_buff *skb,
+			struct tcp_ao_key *key, struct tcphdr *th,
+			__u8 *hash_location)
+{
+	struct tcp_skb_cb *cb = TCP_SKB_CB(skb);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_ao_info *info;
+	void *scratch = NULL;
+	u8 *tkey;
+	u32 sne;
+
+	info = rcu_dereference_protected(tp->ao_info,
+					 lockdep_sock_is_held(sk));
+	tkey = snd_other_key(key);
+
+	if (unlikely(cb->tcp_flags & TCPHDR_SYN)) {
+		__be32 disn = 0;
+
+		if (cb->tcp_flags & TCPHDR_ACK) {
+			disn = info->risn;
+		} else {
+			/* pure SYN: derive into a throw-away buffer */
+			scratch = kmalloc(tcp_ao_digest_size(key), GFP_ATOMIC);
+			if (!scratch)
+				return -ENOMEM;
+			tkey = scratch;
+		}
+		tp->af_specific->ao_calc_key_sk(key, tkey, sk,
+						info->lisn, disn, true);
+	}
+
+	sne = tcp_ao_compute_sne(READ_ONCE(info->snd_sne),
+				 READ_ONCE(tp->snd_una), ntohl(th->seq));
+	tp->af_specific->calc_ao_hash(hash_location, key, sk, skb, tkey,
+				      hash_location - (u8 *)th, sne);
+
+	/* kfree(NULL) is a no-op when no scratch buffer was needed */
+	kfree(scratch);
+	return 0;
+}
+
+/* Look up a key on @sk for the source address of the received @skb.
+ *
+ * AF_INET reads the IPv4 header; every other @family value is handled as
+ * AF_INET6 and reads the IPv6 header.
+ */
+static struct tcp_ao_key *tcp_ao_inbound_lookup(unsigned short int family,
+		const struct sock *sk, const struct sk_buff *skb,
+		int sndid, int rcvid, int l3index)
+{
+	union tcp_ao_addr *src;
+	int lookup_family;
+
+	if (family == AF_INET) {
+		src = (union tcp_ao_addr *)&ip_hdr(skb)->saddr;
+		lookup_family = AF_INET;
+	} else {
+		src = (union tcp_ao_addr *)&ipv6_hdr(skb)->saddr;
+		lookup_family = AF_INET6;
+	}
+
+	return tcp_ao_do_lookup(sk, l3index, src, lookup_family, sndid, rcvid);
+}
+
+/* Initialise the TCP-AO fields of a request socket built from a syncookie.
+ *
+ * treq->af_specific is always set. used_tcp_ao ends up true only when the
+ * segment carries an AO option and a key matching its keyid is found on
+ * @sk; in that case the option's keyid/rnext_keyid are stored in the
+ * request as ao_rcv_next/ao_keyid.
+ */
+void tcp_ao_syncookie(struct sock *sk, const struct sk_buff *skb,
+		      struct request_sock *req, unsigned short int family)
+{
+	struct tcp_request_sock *treq = tcp_rsk(req);
+	const struct tcp_ao_hdr *aoh;
+	int l3index;
+
+	/* treq->af_specific is used to perform TCP_AO lookup
+	 * in tcp_create_openreq_child().
+	 */
+#if IS_ENABLED(CONFIG_IPV6)
+	if (family == AF_INET6)
+		treq->af_specific = &tcp_request_sock_ipv6_ops;
+	else
+#endif
+		treq->af_specific = &tcp_request_sock_ipv4_ops;
+
+	treq->used_tcp_ao = false;
+
+	if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+		return;
+	if (!aoh)
+		return;
+
+	l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+						 inet_rsk(req)->ir_iif);
+	/* Key not found, continue without TCP-AO */
+	if (!tcp_ao_inbound_lookup(family, sk, skb, -1, aoh->keyid, l3index))
+		return;
+
+	treq->ao_rcv_next = aoh->keyid;
+	treq->ao_keyid = aoh->rnext_keyid;
+	treq->used_tcp_ao = true;
+}
+
+/* Recompute the MAC of @skb with @traffic_key and compare it with the MAC
+ * received at @phash.
+ *
+ * The MAC length announced in the option must equal the key's MAC length.
+ * A failure to compute the local MAC is handled exactly like a mismatch:
+ * the result of tcp_ao_hash_skb() must not be ignored, otherwise the
+ * received MAC would be compared against a buffer that does not hold a
+ * valid digest.
+ *
+ * Good/bad packet counters of @info and @key and the matching MIB counters
+ * are updated. @l3index is accepted for interface compatibility and is not
+ * used here.
+ *
+ * Returns SKB_NOT_DROPPED_YET when the MAC matches,
+ * SKB_DROP_REASON_TCP_AOFAILURE on wrong MAC length, hashing failure or
+ * mismatch, SKB_DROP_REASON_NOT_SPECIFIED when the digest buffer cannot be
+ * allocated.
+ */
+static enum skb_drop_reason
+tcp_ao_verify_hash(const struct sock *sk, const struct sk_buff *skb,
+		   unsigned short int family, struct tcp_ao_info *info,
+		   const struct tcp_ao_hdr *aoh, struct tcp_ao_key *key,
+		   u8 *traffic_key, u8 *phash, u32 sne, int l3index)
+{
+	const struct tcphdr *th = tcp_hdr(skb);
+	u8 maclen = tcp_ao_hdr_maclen(aoh);
+	enum skb_drop_reason reason;
+	void *hash_buf;
+	int err;
+
+	if (maclen != tcp_ao_maclen(key)) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
+		atomic64_inc(&info->counters.pkt_bad);
+		atomic64_inc(&key->pkt_bad);
+		trace_tcp_ao_wrong_maclen(sk, skb, aoh->keyid,
+					  aoh->rnext_keyid, maclen);
+		return SKB_DROP_REASON_TCP_AOFAILURE;
+	}
+
+	hash_buf = kmalloc(tcp_ao_digest_size(key), GFP_ATOMIC);
+	if (!hash_buf)
+		return SKB_DROP_REASON_NOT_SPECIFIED;
+
+	/* XXX: make it per-AF callback? */
+	err = tcp_ao_hash_skb(family, hash_buf, key, sk, skb, traffic_key,
+			      (phash - (u8 *)th), sne);
+	if (err || memcmp(phash, hash_buf, maclen)) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOBAD);
+		atomic64_inc(&info->counters.pkt_bad);
+		atomic64_inc(&key->pkt_bad);
+		trace_tcp_ao_mismatch(sk, skb, aoh->keyid,
+				      aoh->rnext_keyid, maclen);
+		reason = SKB_DROP_REASON_TCP_AOFAILURE;
+	} else {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOGOOD);
+		atomic64_inc(&info->counters.pkt_good);
+		atomic64_inc(&key->pkt_good);
+		reason = SKB_NOT_DROPPED_YET;
+	}
+
+	kfree(hash_buf);
+	return reason;
+}
+
+/* Verify the TCP-AO signature carried by an inbound segment.
+ *
+ * Sockets whose state is in the TCP_AO_ESTABLISHED mask are verified against
+ * the cached receive traffic key, after which a key-rotation request from
+ * the peer (RNextKeyID differing from the current key's sndid) is honoured.
+ * In the remaining states the key is looked up by peer address and keyid and
+ * a temporary traffic key is derived from the initial sequence numbers
+ * before verifying.
+ *
+ * Returns SKB_NOT_DROPPED_YET when the segment is accepted, otherwise the
+ * drop reason.
+ */
+enum skb_drop_reason
+tcp_inbound_ao_hash(struct sock *sk, const struct sk_buff *skb,
+ unsigned short int family, const struct request_sock *req,
+ int l3index, const struct tcp_ao_hdr *aoh)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ u8 maclen = tcp_ao_hdr_maclen(aoh);
+ u8 *phash = (u8 *)(aoh + 1); /* hash goes just after the header */
+ struct tcp_ao_info *info;
+ enum skb_drop_reason ret;
+ struct tcp_ao_key *key;
+ __be32 sisn, disn;
+ u8 *traffic_key;
+ int state;
+ u32 sne = 0;
+
+ info = rcu_dereference(tcp_sk(sk)->ao_info);
+ if (!info) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND);
+ trace_tcp_ao_key_not_found(sk, skb, aoh->keyid,
+ aoh->rnext_keyid, maclen);
+ return SKB_DROP_REASON_TCP_AOUNEXPECTED;
+ }
+
+ /* ISNs for any SYN; both "goto verify_hash" paths below rely on this */
+ if (unlikely(th->syn)) {
+ sisn = th->seq;
+ disn = 0;
+ }
+
+ state = READ_ONCE(sk->sk_state);
+ /* Fast-path */
+ if (likely((1 << state) & TCP_AO_ESTABLISHED)) {
+ enum skb_drop_reason err;
+ struct tcp_ao_key *current_key;
+
+ /* Check if this socket's rnext_key matches the keyid in the
+ * packet. If not we lookup the key based on the keyid
+ * matching the rcvid in the mkt.
+ */
+ key = READ_ONCE(info->rnext_key);
+ if (key->rcvid != aoh->keyid) {
+ key = tcp_ao_established_key(sk, info, -1, aoh->keyid);
+ if (!key)
+ goto key_not_found;
+ }
+
+ /* Delayed retransmitted SYN */
+ if (unlikely(th->syn && !th->ack))
+ goto verify_hash;
+
+ sne = tcp_ao_compute_sne(info->rcv_sne, tcp_sk(sk)->rcv_nxt,
+ ntohl(th->seq));
+ /* Established socket, traffic key are cached */
+ traffic_key = rcv_other_key(key);
+ err = tcp_ao_verify_hash(sk, skb, family, info, aoh, key,
+ traffic_key, phash, sne, l3index);
+ if (err)
+ return err;
+ current_key = READ_ONCE(info->current_key);
+ /* Key rotation: the peer asks us to use new key (RNext) */
+ if (unlikely(aoh->rnext_keyid != current_key->sndid)) {
+ trace_tcp_ao_rnext_request(sk, skb, current_key->sndid,
+ aoh->rnext_keyid,
+ tcp_ao_hdr_maclen(aoh));
+ /* If the key is not found we do nothing. */
+ key = tcp_ao_established_key(sk, info, aoh->rnext_keyid, -1);
+ if (key)
+ /* pairs with tcp_ao_del_cmd */
+ WRITE_ONCE(info->current_key, key);
+ }
+ return SKB_NOT_DROPPED_YET;
+ }
+
+ if (unlikely(state == TCP_CLOSE))
+ return SKB_DROP_REASON_TCP_CLOSE;
+
+ /* Lookup key based on peer address and keyid.
+ * current_key and rnext_key must not be used on tcp listen
+ * sockets as otherwise:
+ * - request sockets would race on those key pointers
+ * - tcp_ao_del_cmd() allows async key removal
+ */
+ key = tcp_ao_inbound_lookup(family, sk, skb, -1, aoh->keyid, l3index);
+ if (!key)
+ goto key_not_found;
+
+ /* Initial SYN: sisn/disn were set above, sne stays 0 */
+ if (th->syn && !th->ack)
+ goto verify_hash;
+
+ if ((1 << state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV)) {
+ /* Make the initial syn the likely case here */
+ if (unlikely(req)) {
+ sne = tcp_ao_compute_sne(0, tcp_rsk(req)->rcv_isn,
+ ntohl(th->seq));
+ sisn = htonl(tcp_rsk(req)->rcv_isn);
+ disn = htonl(tcp_rsk(req)->snt_isn);
+ } else if (unlikely(th->ack && !th->syn)) {
+ /* Possible syncookie packet */
+ sisn = htonl(ntohl(th->seq) - 1);
+ disn = htonl(ntohl(th->ack_seq) - 1);
+ sne = tcp_ao_compute_sne(0, ntohl(sisn),
+ ntohl(th->seq));
+ } else if (unlikely(!th->syn)) {
+ /* no way to figure out initial sisn/disn - drop */
+ return SKB_DROP_REASON_TCP_FLAGS;
+ }
+ } else if ((1 << state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+ disn = info->lisn;
+ if (th->syn || th->rst)
+ sisn = th->seq;
+ else
+ sisn = info->risn;
+ } else {
+ WARN_ONCE(1, "TCP-AO: Unexpected sk_state %d", state);
+ return SKB_DROP_REASON_TCP_AOFAILURE;
+ }
+verify_hash:
+ /* No cached traffic key on this path: derive one into a temporary buffer */
+ traffic_key = kmalloc(tcp_ao_digest_size(key), GFP_ATOMIC);
+ if (!traffic_key)
+ return SKB_DROP_REASON_NOT_SPECIFIED;
+ /* NOTE(review): the result of tcp_ao_calc_key_skb() is not checked here,
+ * while tcp_ao_prepare_reset() does check it - confirm this is intended.
+ */
+ tcp_ao_calc_key_skb(key, traffic_key, skb, sisn, disn, family);
+ ret = tcp_ao_verify_hash(sk, skb, family, info, aoh, key,
+ traffic_key, phash, sne, l3index);
+ kfree(traffic_key);
+ return ret;
+
+key_not_found:
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAOKEYNOTFOUND);
+ atomic64_inc(&info->counters.key_not_found);
+ trace_tcp_ao_key_not_found(sk, skb, aoh->keyid,
+ aoh->rnext_keyid, maclen);
+ return SKB_DROP_REASON_TCP_AOKEYNOTFOUND;
+}
+
+/* Fill both cached traffic keys of @ao_key from the ISNs stored in @ao.
+ *
+ * The send-side buffer (snd_other_key()) is computed first; the receive-side
+ * buffer (rcv_other_key()) is only computed when that succeeded.
+ *
+ * Returns the first non-zero result of tcp_ao_calc_key_sk(), or 0.
+ */
+static int tcp_ao_cache_traffic_keys(const struct sock *sk,
+				     struct tcp_ao_info *ao,
+				     struct tcp_ao_key *ao_key)
+{
+	int err;
+
+	err = tcp_ao_calc_key_sk(ao_key, snd_other_key(ao_key), sk,
+				 ao->lisn, ao->risn, true);
+	if (err)
+		return err;
+
+	return tcp_ao_calc_key_sk(ao_key, rcv_other_key(ao_key), sk,
+				  ao->lisn, ao->risn, false);
+}
+
+/* Prepare the TCP-AO state of a socket for an outgoing connection.
+ *
+ * Does nothing when the socket has no TCP-AO info or an unsupported family.
+ * Otherwise every key that does not match the peer address / L3 index is
+ * unlinked and freed after an RCU grace period, current_key/rnext_key are
+ * filled in with the first key matching the peer if they are unset, the
+ * option length is added to the TCP header length and the local ISN and
+ * send SNE are initialised.
+ */
+void tcp_ao_connect_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_ao_info *ao_info;
+ struct hlist_node *next;
+ union tcp_ao_addr *addr;
+ struct tcp_ao_key *key;
+ int family, l3index;
+
+ ao_info = rcu_dereference_protected(tp->ao_info,
+ lockdep_sock_is_held(sk));
+ if (!ao_info)
+ return;
+
+ /* Remove all keys that don't match the peer */
+ family = sk->sk_family;
+ if (family == AF_INET)
+ addr = (union tcp_ao_addr *)&sk->sk_daddr;
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (family == AF_INET6)
+ addr = (union tcp_ao_addr *)&sk->sk_v6_daddr;
+#endif
+ else
+ return;
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+ sk->sk_bound_dev_if);
+
+ /* _safe iteration: entries are unlinked while walking the list */
+ hlist_for_each_entry_safe(key, next, &ao_info->head, node) {
+ /* a zero result from tcp_ao_key_cmp() keeps the key */
+ if (!tcp_ao_key_cmp(key, l3index, addr, key->prefixlen, family, -1, -1))
+ continue;
+
+ if (key == ao_info->current_key)
+ ao_info->current_key = NULL;
+ if (key == ao_info->rnext_key)
+ ao_info->rnext_key = NULL;
+ hlist_del_rcu(&key->node);
+ atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+ call_rcu(&key->rcu, tcp_ao_key_free_rcu);
+ }
+
+ key = tp->af_specific->ao_lookup(sk, sk, -1, -1);
+ if (key) {
+ /* if current_key or rnext_key were not provided,
+ * use the first key matching the peer
+ */
+ if (!ao_info->current_key)
+ ao_info->current_key = key;
+ if (!ao_info->rnext_key)
+ ao_info->rnext_key = key;
+ tp->tcp_header_len += tcp_ao_len_aligned(key);
+
+ ao_info->lisn = htonl(tp->write_seq);
+ ao_info->snd_sne = 0;
+ } else {
+ /* Can't happen: tcp_connect() verifies that there's
+ * at least one tcp-ao key that matches the remote peer.
+ */
+ WARN_ON_ONCE(1);
+ rcu_assign_pointer(tp->ao_info, NULL);
+ kfree(ao_info);
+ }
+}
+
+/* Compute and cache the traffic keys of every key on @sk.
+ *
+ * No-op for sockets without TCP-AO info. The per-key result of
+ * tcp_ao_cache_traffic_keys() is not inspected.
+ */
+void tcp_ao_established(struct sock *sk)
+{
+	struct tcp_ao_info *info;
+	struct tcp_ao_key *mkt;
+
+	info = rcu_dereference_protected(tcp_sk(sk)->ao_info,
+					 lockdep_sock_is_held(sk));
+	if (info) {
+		hlist_for_each_entry_rcu(mkt, &info->head, node,
+					 lockdep_sock_is_held(sk)) {
+			tcp_ao_cache_traffic_keys(sk, info, mkt);
+		}
+	}
+}
+
+/* Finalise TCP-AO state when a connection attempt completes.
+ *
+ * Records the remote ISN from @skb when one is supplied, resets the receive
+ * SNE and caches the traffic keys of every key on the socket. No-op for
+ * sockets without TCP-AO info.
+ */
+void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_ao_info *info;
+	struct tcp_ao_key *mkt;
+
+	info = rcu_dereference_protected(tcp_sk(sk)->ao_info,
+					 lockdep_sock_is_held(sk));
+	if (!info)
+		return;
+
+	/* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
+	if (skb) {
+		WRITE_ONCE(info->risn, tcp_hdr(skb)->seq);
+	}
+	info->rcv_sne = 0;
+
+	hlist_for_each_entry_rcu(mkt, &info->head, node,
+				 lockdep_sock_is_held(sk)) {
+		tcp_ao_cache_traffic_keys(sk, info, mkt);
+	}
+}
+
+/* Give a child socket its own copy of the parent's keys that match the peer.
+ *
+ * Returns 0 without touching @newsk when the parent @sk has no TCP-AO info
+ * or the request did not use TCP-AO. Otherwise a new tcp_ao_info is built
+ * from the request's ISNs, every parent key matching the peer address of
+ * @newsk is copied with its traffic keys cached, current_key/rnext_key are
+ * chosen from the request's key IDs (falling back to the first copied key)
+ * and the info is published on @newsk.
+ *
+ * Errors: -ENOMEM on allocation failure, -EAFNOSUPPORT for an unknown
+ * @family, -EKEYREJECTED when no key matches, -EUSERS when the static key
+ * cannot be incremented. All copied keys and the new info are freed on
+ * error.
+ */
+int tcp_ao_copy_all_matching(const struct sock *sk, struct sock *newsk,
+ struct request_sock *req, struct sk_buff *skb,
+ int family)
+{
+ struct tcp_ao_key *key, *new_key, *first_key;
+ struct tcp_ao_info *new_ao, *ao;
+ struct hlist_node *key_head;
+ int l3index, ret = -ENOMEM;
+ union tcp_ao_addr *addr;
+ bool match = false;
+
+ ao = rcu_dereference(tcp_sk(sk)->ao_info);
+ if (!ao)
+ return 0;
+
+ /* New socket without TCP-AO on it */
+ if (!tcp_rsk_used_ao(req))
+ return 0;
+
+ new_ao = tcp_ao_alloc_info(GFP_ATOMIC);
+ if (!new_ao)
+ return -ENOMEM;
+ new_ao->lisn = htonl(tcp_rsk(req)->snt_isn);
+ new_ao->risn = htonl(tcp_rsk(req)->rcv_isn);
+ new_ao->ao_required = ao->ao_required;
+ new_ao->accept_icmps = ao->accept_icmps;
+
+ if (family == AF_INET) {
+ addr = (union tcp_ao_addr *)&newsk->sk_daddr;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (family == AF_INET6) {
+ addr = (union tcp_ao_addr *)&newsk->sk_v6_daddr;
+#endif
+ } else {
+ ret = -EAFNOSUPPORT;
+ goto free_ao;
+ }
+ l3index = l3mdev_master_ifindex_by_index(sock_net(newsk),
+ newsk->sk_bound_dev_if);
+
+ hlist_for_each_entry_rcu(key, &ao->head, node) {
+ /* a non-zero result from tcp_ao_key_cmp() skips the key */
+ if (tcp_ao_key_cmp(key, l3index, addr, key->prefixlen, family, -1, -1))
+ continue;
+
+ new_key = tcp_ao_copy_key(newsk, key);
+ if (!new_key)
+ goto free_and_exit;
+
+ tcp_ao_cache_traffic_keys(newsk, new_ao, new_key);
+ tcp_ao_link_mkt(new_ao, new_key);
+ match = true;
+ }
+
+ if (!match) {
+ /* RFC5925 (7.4.1) specifies that the TCP-AO status
+ * of a connection is determined on the initial SYN.
+ * At this point the connection was TCP-AO enabled, so
+ * it can't switch to being unsigned if peer's key
+ * disappears on the listening socket.
+ */
+ ret = -EKEYREJECTED;
+ goto free_and_exit;
+ }
+
+ if (!static_key_fast_inc_not_disabled(&tcp_ao_needed.key.key)) {
+ ret = -EUSERS;
+ goto free_and_exit;
+ }
+
+ /* fallback for current/rnext when the requested IDs are not found */
+ key_head = rcu_dereference(hlist_first_rcu(&new_ao->head));
+ first_key = hlist_entry_safe(key_head, struct tcp_ao_key, node);
+
+ key = tcp_ao_established_key(req_to_sk(req), new_ao, tcp_rsk(req)->ao_keyid, -1);
+ if (key)
+ new_ao->current_key = key;
+ else
+ new_ao->current_key = first_key;
+
+ /* set rnext_key */
+ key = tcp_ao_established_key(req_to_sk(req), new_ao, -1, tcp_rsk(req)->ao_rcv_next);
+ if (key)
+ new_ao->rnext_key = key;
+ else
+ new_ao->rnext_key = first_key;
+
+ sk_gso_disable(newsk);
+ rcu_assign_pointer(tcp_sk(newsk)->ao_info, new_ao);
+
+ return 0;
+
+free_and_exit:
+ /* new_ao was never published, so its keys are freed directly */
+ hlist_for_each_entry_safe(key, key_head, &new_ao->head, node) {
+ hlist_del(&key->node);
+ tcp_sigpool_release(key->tcp_sigpool_id);
+ atomic_sub(tcp_ao_sizeof_key(key), &newsk->sk_omem_alloc);
+ kfree_sensitive(key);
+ }
+free_ao:
+ kfree(new_ao);
+ return ret;
+}
+
+/* Tell whether current/rnext keys may be selected on @sk.
+ *
+ * There aren't current/rnext keys on TCP_LISTEN sockets, so only those
+ * are refused.
+ */
+static bool tcp_ao_can_set_current_rnext(struct sock *sk)
+{
+	return sk->sk_state != TCP_LISTEN;
+}
+
+/* Validate the IPv4 peer address of a key-add request.
+ *
+ * The address must be AF_INET with a zero port. A zero prefix is accepted
+ * only together with the wildcard address. A non-zero prefix must be at
+ * most 32, the address must be non-wildcard with no bits set outside the
+ * prefix and, if the socket already has a destination, that destination
+ * must fall inside the prefix.
+ *
+ * On success *@addr points at the address inside @cmd and 0 is returned;
+ * every rejection returns -EINVAL.
+ */
+static int tcp_ao_verify_ipv4(struct sock *sk, struct tcp_ao_add *cmd,
+			      union tcp_ao_addr **addr)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd->addr;
+	struct inet_sock *inet = inet_sk(sk);
+	__be32 peer = sin->sin_addr.s_addr;
+
+	if (sin->sin_family != AF_INET)
+		return -EINVAL;
+
+	/* Currently matching is not performed on port (or port ranges) */
+	if (sin->sin_port != 0)
+		return -EINVAL;
+
+	if (cmd->prefix == 0) {
+		if (ntohl(peer) != INADDR_ANY)
+			return -EINVAL;
+	} else {
+		__be32 mask;
+
+		if (ntohl(peer) == INADDR_ANY || cmd->prefix > 32)
+			return -EINVAL;
+
+		/* Check trailing 0's in addr */
+		mask = inet_make_mask(cmd->prefix);
+		if (peer & ~mask)
+			return -EINVAL;
+
+		/* Check that MKT address is consistent with socket */
+		if (ntohl(inet->inet_daddr) != INADDR_ANY &&
+		    (inet->inet_daddr & mask) != peer)
+			return -EINVAL;
+	}
+
+	*addr = (union tcp_ao_addr *)&sin->sin_addr;
+	return 0;
+}
+
+/* Fill the crypto-related fields of @key from the user request @cmd.
+ *
+ * Sets key->maclen (default 12), rejects MACs that would not fit into the
+ * SYN option space, copies the user key and programs it into the hash
+ * transform of the key's sigpool entry. For "cmac(aes128)" the algorithm
+ * name in @cmd is rewritten to "cmac(aes)" and, when the supplied key is not
+ * 16 bytes long, it is first reduced to 16 bytes by hashing it under an
+ * all-zero 16-byte key.
+ *
+ * Returns 0 on success, -ENOMEM, -EMSGSIZE, -EINVAL (MAC longer than the
+ * digest) or the error of the failing sigpool/crypto call.
+ */
+static int tcp_ao_parse_crypto(struct tcp_ao_add *cmd, struct tcp_ao_key *key)
+{
+ unsigned int syn_tcp_option_space;
+ bool is_kdf_aes_128_cmac = false;
+ struct crypto_ahash *tfm;
+ struct tcp_sigpool hp;
+ void *tmp_key = NULL;
+ int err;
+
+ /* RFC5926, 3.1.1.2. KDF_AES_128_CMAC */
+ if (!strcmp("cmac(aes128)", cmd->alg_name)) {
+ strscpy(cmd->alg_name, "cmac(aes)", sizeof(cmd->alg_name));
+ is_kdf_aes_128_cmac = (cmd->keylen != 16);
+ /* allocated even for 16-byte keys; only used by the reduction below */
+ tmp_key = kmalloc(cmd->keylen, GFP_KERNEL);
+ if (!tmp_key)
+ return -ENOMEM;
+ }
+
+ key->maclen = cmd->maclen ?: 12; /* 12 is the default in RFC5925 */
+
+ /* Check: maclen + tcp-ao header <= (MAX_TCP_OPTION_SPACE - mss
+ * - tstamp (including sackperm)
+ * - wscale),
+ * see tcp_syn_options(), tcp_synack_options(), commit 33ad798c924b.
+ *
+ * In order to allow D-SACK with TCP-AO, the header size should be:
+ * (MAX_TCP_OPTION_SPACE - TCPOLEN_TSTAMP_ALIGNED
+ * - TCPOLEN_SACK_BASE_ALIGNED
+ * - 2 * TCPOLEN_SACK_PERBLOCK) = 8 (maclen = 4),
+ * see tcp_established_options().
+ *
+ * RFC5925, 2.2:
+ * Typical MACs are 96-128 bits (12-16 bytes), but any length
+ * that fits in the header of the segment being authenticated
+ * is allowed.
+ *
+ * RFC5925, 7.6:
+ * TCP-AO continues to consume 16 bytes in non-SYN segments,
+ * leaving a total of 24 bytes for other options, of which
+ * the timestamp consumes 10. This leaves 14 bytes, of which 10
+ * are used for a single SACK block. When two SACK blocks are used,
+ * such as to handle D-SACK, a smaller TCP-AO MAC would be required
+ * to make room for the additional SACK block (i.e., to leave 18
+ * bytes for the D-SACK variant of the SACK option) [RFC2883].
+ * Note that D-SACK is not supportable in TCP MD5 in the presence
+ * of timestamps, because TCP MD5’s MAC length is fixed and too
+ * large to leave sufficient option space.
+ */
+ syn_tcp_option_space = MAX_TCP_OPTION_SPACE;
+ syn_tcp_option_space -= TCPOLEN_MSS_ALIGNED;
+ syn_tcp_option_space -= TCPOLEN_TSTAMP_ALIGNED;
+ syn_tcp_option_space -= TCPOLEN_WSCALE_ALIGNED;
+ if (tcp_ao_len_aligned(key) > syn_tcp_option_space) {
+ err = -EMSGSIZE;
+ goto err_kfree;
+ }
+
+ key->keylen = cmd->keylen;
+ memcpy(key->key, cmd->key, cmd->keylen);
+
+ err = tcp_sigpool_start(key->tcp_sigpool_id, &hp);
+ if (err)
+ goto err_kfree;
+
+ tfm = crypto_ahash_reqtfm(hp.req);
+ if (is_kdf_aes_128_cmac) {
+ void *scratch = hp.scratch;
+ struct scatterlist sg;
+
+ /* hash a private copy of the user key; the result lands in key->key */
+ memcpy(tmp_key, cmd->key, cmd->keylen);
+ sg_init_one(&sg, tmp_key, cmd->keylen);
+
+ /* Using zero-key of 16 bytes as described in RFC5926 */
+ memset(scratch, 0, 16);
+ err = crypto_ahash_setkey(tfm, scratch, 16);
+ if (err)
+ goto err_pool_end;
+
+ err = crypto_ahash_init(hp.req);
+ if (err)
+ goto err_pool_end;
+
+ ahash_request_set_crypt(hp.req, &sg, key->key, cmd->keylen);
+ err = crypto_ahash_update(hp.req);
+ if (err)
+ goto err_pool_end;
+
+ /* err is 0 at this point, so "|=" acts as a plain assignment */
+ err |= crypto_ahash_final(hp.req);
+ if (err)
+ goto err_pool_end;
+ key->keylen = 16;
+ }
+
+ err = crypto_ahash_setkey(tfm, key->key, key->keylen);
+ if (err)
+ goto err_pool_end;
+
+ tcp_sigpool_end(&hp);
+ kfree_sensitive(tmp_key);
+
+ /* the MAC is a prefix of the digest and cannot be longer than it */
+ if (tcp_ao_maclen(key) > key->digest_size)
+ return -EINVAL;
+
+ return 0;
+
+err_pool_end:
+ tcp_sigpool_end(&hp);
+err_kfree:
+ kfree_sensitive(tmp_key);
+ return err;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* Validate the IPv6 peer address of a key-add request.
+ *
+ * The address must be AF_INET6 with a zero port. Three cases follow:
+ *  - zero prefix: only the unspecified address is accepted;
+ *  - non-zero prefix on a v4-mapped address: validated as an IPv4 prefix
+ *    (at most 32 bits) and reported back as AF_INET through @family, with
+ *    *@paddr pointing at the embedded IPv4 address;
+ *  - non-zero prefix otherwise: at most 128 bits, non-wildcard address with
+ *    no bits set outside the prefix.
+ * In the prefixed cases a socket that already has a destination must have
+ * it inside the prefix.
+ *
+ * Returns 0 on success and -EINVAL on any rejection.
+ */
+static int tcp_ao_verify_ipv6(struct sock *sk, struct tcp_ao_add *cmd,
+			      union tcp_ao_addr **paddr,
+			      unsigned short int *family)
+{
+	struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd->addr;
+	struct in6_addr *peer = &sin6->sin6_addr;
+	u8 plen = cmd->prefix;
+	struct in6_addr masked;
+
+	if (sin6->sin6_family != AF_INET6)
+		return -EINVAL;
+
+	/* Currently matching is not performed on port (or port ranges) */
+	if (sin6->sin6_port != 0)
+		return -EINVAL;
+
+	if (plen == 0) {
+		if (!ipv6_addr_any(peer))
+			return -EINVAL;
+		*paddr = (union tcp_ao_addr *)peer;
+		return 0;
+	}
+
+	if (ipv6_addr_v4mapped(peer)) {
+		__be32 peer4 = peer->s6_addr32[3];
+		__be32 mask;
+
+		if (plen > 32 || ntohl(peer4) == INADDR_ANY)
+			return -EINVAL;
+
+		/* Check trailing 0's in addr */
+		mask = inet_make_mask(plen);
+		if (peer4 & ~mask)
+			return -EINVAL;
+
+		/* Check that MKT address is consistent with socket */
+		if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
+			__be32 daddr4 = sk->sk_v6_daddr.s6_addr32[3];
+
+			if (!ipv6_addr_v4mapped(&sk->sk_v6_daddr))
+				return -EINVAL;
+			if ((daddr4 & mask) != peer4)
+				return -EINVAL;
+		}
+
+		*paddr = (union tcp_ao_addr *)&peer->s6_addr32[3];
+		*family = AF_INET;
+		return 0;
+	}
+
+	if (ipv6_addr_any(peer) || plen > 128)
+		return -EINVAL;
+
+	/* Check trailing 0's in addr */
+	ipv6_addr_prefix(&masked, peer, plen);
+	if (ipv6_addr_cmp(&masked, peer))
+		return -EINVAL;
+
+	/* Check that MKT address is consistent with socket */
+	if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+	    !ipv6_prefix_equal(&sk->sk_v6_daddr, peer, plen))
+		return -EINVAL;
+
+	*paddr = (union tcp_ao_addr *)peer;
+	return 0;
+}
+#else
+/* Without IPv6 support every IPv6 key-add request is refused. */
+static int tcp_ao_verify_ipv6(struct sock *sk, struct tcp_ao_add *cmd,
+			      union tcp_ao_addr **paddr,
+			      unsigned short int *family)
+{
+	return -EOPNOTSUPP;
+}
+#endif
+
+/* Fetch the TCP-AO info of @sk for a setsockopt() handler.
+ *
+ * Full sockets and time-wait sockets keep the pointer in different
+ * structures; both are read under the socket lock. Any other kind of socket
+ * yields ERR_PTR(-ESOCKTNOSUPPORT). The result may be NULL when no info is
+ * attached.
+ */
+static struct tcp_ao_info *setsockopt_ao_info(struct sock *sk)
+{
+	if (sk_fullsock(sk))
+		return rcu_dereference_protected(tcp_sk(sk)->ao_info,
+						 lockdep_sock_is_held(sk));
+
+	if (sk->sk_state == TCP_TIME_WAIT)
+		return rcu_dereference_protected(tcp_twsk(sk)->ao_info,
+						 lockdep_sock_is_held(sk));
+
+	return ERR_PTR(-ESOCKTNOSUPPORT);
+}
+
+/* Fetch the TCP-AO info of @sk for a getsockopt() handler.
+ *
+ * Same selection as the setsockopt variant but using a plain
+ * rcu_dereference(). Returns ERR_PTR(-ESOCKTNOSUPPORT) for sockets that are
+ * neither full nor time-wait; the result may be NULL when no info is
+ * attached.
+ */
+static struct tcp_ao_info *getsockopt_ao_info(struct sock *sk)
+{
+	struct tcp_ao_info *info = ERR_PTR(-ESOCKTNOSUPPORT);
+
+	if (sk_fullsock(sk))
+		info = rcu_dereference(tcp_sk(sk)->ao_info);
+	else if (sk->sk_state == TCP_TIME_WAIT)
+		info = rcu_dereference(tcp_twsk(sk)->ao_info);
+
+	return info;
+}
+
+/* Key flags accepted when adding a key; tcp_ao_add_cmd() rejects any other bit */
+#define TCP_AO_KEYF_ALL (TCP_AO_KEYF_IFINDEX | TCP_AO_KEYF_EXCLUDE_OPT)
+/* NOTE(review): presumably the key flags accepted when querying keys - its user is not in this part of the file */
+#define TCP_AO_GET_KEYF_VALID (TCP_AO_KEYF_IFINDEX)
+
+/* Allocate a key for @sk using the hash algorithm named in @cmd.
+ *
+ * @cmd->alg_name is forcibly NUL-terminated; "cmac(aes128)" is requested
+ * from the sigpool as "cmac(aes)". The key is allocated with room for
+ * twice the digest size after the structure (presumably the two traffic
+ * keys) and is accounted to the socket by sock_kmalloc().
+ * Only tcp_sigpool_id and digest_size are initialised here.
+ *
+ * Returns the key, or an ERR_PTR() carrying the sigpool error or -ENOMEM.
+ */
+static struct tcp_ao_key *tcp_ao_key_alloc(struct sock *sk,
+					   struct tcp_ao_add *cmd)
+{
+	const char *alg = cmd->alg_name;
+	struct tcp_ao_key *mkt;
+	struct tcp_sigpool hp;
+	unsigned int dlen;
+	int id, ret;
+
+	/* Force null-termination of alg_name */
+	cmd->alg_name[ARRAY_SIZE(cmd->alg_name) - 1] = '\0';
+
+	/* RFC5926, 3.1.1.2. KDF_AES_128_CMAC */
+	if (strcmp("cmac(aes128)", alg) == 0)
+		alg = "cmac(aes)";
+
+	/* Full TCP header (th->doff << 2) should fit into scratch area,
+	 * see tcp_ao_hash_header().
+	 */
+	id = tcp_sigpool_alloc_ahash(alg, 60);
+	if (id < 0)
+		return ERR_PTR(id);
+
+	/* the pool is only borrowed long enough to learn the digest size */
+	ret = tcp_sigpool_start(id, &hp);
+	if (ret)
+		goto release_pool;
+	dlen = crypto_ahash_digestsize(crypto_ahash_reqtfm(hp.req));
+	tcp_sigpool_end(&hp);
+
+	mkt = sock_kmalloc(sk, sizeof(*mkt) + 2 * dlen, GFP_KERNEL);
+	if (!mkt) {
+		ret = -ENOMEM;
+		goto release_pool;
+	}
+
+	mkt->tcp_sigpool_id = id;
+	mkt->digest_size = dlen;
+	return mkt;
+
+release_pool:
+	tcp_sigpool_release(id);
+	return ERR_PTR(ret);
+}
+
+/* Handle a request to add a TCP-AO key to @sk.
+ *
+ * The request is copied from user space and validated (key length, reserved
+ * fields, peer address, key flags, ifindex, no conflicting TCP-MD5 key, no
+ * overlapping send/receive IDs). A key is then allocated, initialised and
+ * linked. When this is the first key of the socket a tcp_ao_info is
+ * allocated and published as well.
+ *
+ * Returns 0 on success or a negative errno; on failure after allocation the
+ * key (and a freshly allocated info) is freed.
+ */
+static int tcp_ao_add_cmd(struct sock *sk, unsigned short int family,
+ sockptr_t optval, int optlen)
+{
+ struct tcp_ao_info *ao_info;
+ union tcp_ao_addr *addr;
+ struct tcp_ao_key *key;
+ struct tcp_ao_add cmd;
+ int ret, l3index = 0;
+ bool first = false;
+
+ if (optlen < sizeof(cmd))
+ return -EINVAL;
+
+ ret = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen);
+ if (ret)
+ return ret;
+
+ if (cmd.keylen > TCP_AO_MAXKEYLEN)
+ return -EINVAL;
+
+ if (cmd.reserved != 0 || cmd.reserved2 != 0)
+ return -EINVAL;
+
+ /* the IPv6 variant may switch family to AF_INET for v4-mapped peers */
+ if (family == AF_INET)
+ ret = tcp_ao_verify_ipv4(sk, &cmd, &addr);
+ else
+ ret = tcp_ao_verify_ipv6(sk, &cmd, &addr, &family);
+ if (ret)
+ return ret;
+
+ if (cmd.keyflags & ~TCP_AO_KEYF_ALL)
+ return -EINVAL;
+
+ if (cmd.set_current || cmd.set_rnext) {
+ if (!tcp_ao_can_set_current_rnext(sk))
+ return -EINVAL;
+ }
+
+ if (cmd.ifindex && !(cmd.keyflags & TCP_AO_KEYF_IFINDEX))
+ return -EINVAL;
+
+ /* For cmd.tcp_ifindex = 0 the key will apply to the default VRF */
+ if (cmd.keyflags & TCP_AO_KEYF_IFINDEX && cmd.ifindex) {
+ int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), cmd.ifindex);
+ if (dev && netif_is_l3_master(dev))
+ l3index = dev->ifindex;
+ rcu_read_unlock();
+
+ /* dev is only tested for NULL here, never dereferenced */
+ if (!dev || !l3index)
+ return -EINVAL;
+
+ if (!bound_dev_if || bound_dev_if != cmd.ifindex) {
+ /* tcp_ao_established_key() doesn't expect having
+ * non peer-matching key on an established TCP-AO
+ * connection.
+ */
+ if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)))
+ return -EINVAL;
+ }
+
+ /* It's still possible to bind after adding keys or even
+ * re-bind to a different dev (with CAP_NET_RAW).
+ * So, no reason to return error here, rather try to be
+ * nice and warn the user.
+ */
+ if (bound_dev_if && bound_dev_if != cmd.ifindex)
+ net_warn_ratelimited("AO key ifindex %d != sk bound ifindex %d\n",
+ cmd.ifindex, bound_dev_if);
+ }
+
+ /* Don't allow keys for peers that have a matching TCP-MD5 key */
+ if (cmd.keyflags & TCP_AO_KEYF_IFINDEX) {
+ /* Non-_exact version of tcp_md5_do_lookup() will
+ * as well match keys that aren't bound to a specific VRF
+ * (that will make them match AO key with
+ * sysctl_tcp_l3dev_accept = 1
+ */
+ if (tcp_md5_do_lookup(sk, l3index, addr, family))
+ return -EKEYREJECTED;
+ } else {
+ if (tcp_md5_do_lookup_any_l3index(sk, addr, family))
+ return -EKEYREJECTED;
+ }
+
+ ao_info = setsockopt_ao_info(sk);
+ if (IS_ERR(ao_info))
+ return PTR_ERR(ao_info);
+
+ if (!ao_info) {
+ /* first key on this socket: the info is published further down */
+ ao_info = tcp_ao_alloc_info(GFP_KERNEL);
+ if (!ao_info)
+ return -ENOMEM;
+ first = true;
+ } else {
+ /* Check that neither RecvID nor SendID match any
+ * existing key for the peer, RFC5925 3.1:
+ * > The IDs of MKTs MUST NOT overlap where their
+ * > TCP connection identifiers overlap.
+ */
+ if (__tcp_ao_do_lookup(sk, l3index, addr, family, cmd.prefix, -1, cmd.rcvid))
+ return -EEXIST;
+ if (__tcp_ao_do_lookup(sk, l3index, addr, family,
+ cmd.prefix, cmd.sndid, -1))
+ return -EEXIST;
+ }
+
+ key = tcp_ao_key_alloc(sk, &cmd);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto err_free_ao;
+ }
+
+ INIT_HLIST_NODE(&key->node);
+ memcpy(&key->addr, addr, (family == AF_INET) ? sizeof(struct in_addr) :
+ sizeof(struct in6_addr));
+ key->prefixlen = cmd.prefix;
+ key->family = family;
+ key->keyflags = cmd.keyflags;
+ key->sndid = cmd.sndid;
+ key->rcvid = cmd.rcvid;
+ key->l3index = l3index;
+ atomic64_set(&key->pkt_good, 0);
+ atomic64_set(&key->pkt_bad, 0);
+
+ ret = tcp_ao_parse_crypto(&cmd, key);
+ if (ret < 0)
+ goto err_free_sock;
+
+ /* outside LISTEN/CLOSE the traffic keys are cached right away */
+ if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))) {
+ tcp_ao_cache_traffic_keys(sk, ao_info, key);
+ if (first) {
+ ao_info->current_key = key;
+ ao_info->rnext_key = key;
+ }
+ }
+
+ tcp_ao_link_mkt(ao_info, key);
+ if (first) {
+ /* on failure both the linked key and the unpublished info are freed below */
+ if (!static_branch_inc(&tcp_ao_needed.key)) {
+ ret = -EUSERS;
+ goto err_free_sock;
+ }
+ sk_gso_disable(sk);
+ rcu_assign_pointer(tcp_sk(sk)->ao_info, ao_info);
+ }
+
+ if (cmd.set_current)
+ WRITE_ONCE(ao_info->current_key, key);
+ if (cmd.set_rnext)
+ WRITE_ONCE(ao_info->rnext_key, key);
+ return 0;
+
+err_free_sock:
+ atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+ tcp_sigpool_release(key->tcp_sigpool_id);
+ kfree_sensitive(key);
+err_free_ao:
+ if (first)
+ kfree(ao_info);
+ return ret;
+}
+
+/* Unlink @key from @ao_info and free it.
+ *
+ * With @del_async the key is freed via call_rcu() without further checks.
+ * Otherwise the function waits for an RCU grace period, applies the
+ * optional @new_current / @new_rnext replacements and refuses to free a key
+ * that is still the current or rnext key: in that case the key is linked
+ * back into the list and -EBUSY is returned.
+ *
+ * Returns 0 when the key was queued for freeing.
+ */
+static int tcp_ao_delete_key(struct sock *sk, struct tcp_ao_info *ao_info,
+ bool del_async, struct tcp_ao_key *key,
+ struct tcp_ao_key *new_current,
+ struct tcp_ao_key *new_rnext)
+{
+ int err;
+
+ hlist_del_rcu(&key->node);
+
+ /* Support for async delete on listening sockets: as they don't
+ * need current_key/rnext_key maintaining, we don't need to check
+ * them and we can just free all resources in RCU fashion.
+ */
+ if (del_async) {
+ atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+ call_rcu(&key->rcu, tcp_ao_key_free_rcu);
+ return 0;
+ }
+
+ /* At this moment another CPU could have looked this key up
+ * while it was unlinked from the list. Wait for RCU grace period,
+ * after which the key is off-list and can't be looked up again;
+ * the rx path [just before RCU came] might have used it and set it
+ * as current_key (very unlikely).
+ * Free the key with next RCU grace period (in case it was
+ * current_key before tcp_ao_current_rnext() might have
+ * changed it in forced-delete).
+ */
+ synchronize_rcu();
+ if (new_current)
+ WRITE_ONCE(ao_info->current_key, new_current);
+ if (new_rnext)
+ WRITE_ONCE(ao_info->rnext_key, new_rnext);
+
+ /* still referenced after the optional replacement: undo the unlink */
+ if (unlikely(READ_ONCE(ao_info->current_key) == key ||
+ READ_ONCE(ao_info->rnext_key) == key)) {
+ err = -EBUSY;
+ goto add_key;
+ }
+
+ atomic_sub(tcp_ao_sizeof_key(key), &sk->sk_omem_alloc);
+ call_rcu(&key->rcu, tcp_ao_key_free_rcu);
+
+ return 0;
+add_key:
+ hlist_add_head_rcu(&key->node, &ao_info->head);
+ return err;
+}
+
+/* Key flags accepted by a key-delete request */
+#define TCP_AO_DEL_KEYF_ALL (TCP_AO_KEYF_IFINDEX)
+
+/* Handle a request to delete a TCP-AO key from @sk.
+ *
+ * The request is copied from user space and validated, the optional
+ * replacement current/rnext keys are resolved, and the key list is searched
+ * for an entry matching send/receive IDs, address family, prefix length,
+ * address, the IFINDEX key flag and the L3 index. The first match that is
+ * not one of the replacement keys is handed to tcp_ao_delete_key().
+ *
+ * The L3 index to match is taken from cmd.ifindex: a key stores the ifindex
+ * it was added with, so comparing against a constant 0 would make every key
+ * bound to a non-zero ifindex impossible to delete.
+ *
+ * Returns 0 on success, -EINVAL for a malformed request, -ENOENT when the
+ * socket has no TCP-AO info or no key matches, or the error of
+ * copy_struct_from_sockptr() / setsockopt_ao_info() / tcp_ao_delete_key().
+ */
+static int tcp_ao_del_cmd(struct sock *sk, unsigned short int family,
+			  sockptr_t optval, int optlen)
+{
+	struct tcp_ao_key *key, *new_current = NULL, *new_rnext = NULL;
+	int err, addr_len, l3index = 0;
+	struct tcp_ao_info *ao_info;
+	union tcp_ao_addr *addr;
+	struct tcp_ao_del cmd;
+	__u8 prefix;
+	u16 port;
+
+	if (optlen < sizeof(cmd))
+		return -EINVAL;
+
+	err = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen);
+	if (err)
+		return err;
+
+	if (cmd.reserved != 0 || cmd.reserved2 != 0)
+		return -EINVAL;
+
+	if (cmd.set_current || cmd.set_rnext) {
+		if (!tcp_ao_can_set_current_rnext(sk))
+			return -EINVAL;
+	}
+
+	if (cmd.keyflags & ~TCP_AO_DEL_KEYF_ALL)
+		return -EINVAL;
+
+	/* No sanity check for TCP_AO_KEYF_IFINDEX as if a VRF
+	 * was destroyed, there still should be a way to delete keys,
+	 * that were bound to that l3intf. So, fail late at lookup stage
+	 * if there is no key for that ifindex.
+	 */
+	if (cmd.ifindex && !(cmd.keyflags & TCP_AO_KEYF_IFINDEX))
+		return -EINVAL;
+
+	/* A non-zero ifindex implies TCP_AO_KEYF_IFINDEX (checked above);
+	 * without the flag this stays 0, i.e. the default VRF.
+	 */
+	l3index = cmd.ifindex;
+
+	ao_info = setsockopt_ao_info(sk);
+	if (IS_ERR(ao_info))
+		return PTR_ERR(ao_info);
+	if (!ao_info)
+		return -ENOENT;
+
+	/* For sockets in TCP_CLOSED it's possible set keys that aren't
+	 * matching the future peer (address/VRF/etc),
+	 * tcp_ao_connect_init() will choose a correct matching MKT
+	 * if there's any.
+	 */
+	if (cmd.set_current) {
+		new_current = tcp_ao_established_key(sk, ao_info, cmd.current_key, -1);
+		if (!new_current)
+			return -ENOENT;
+	}
+	if (cmd.set_rnext) {
+		new_rnext = tcp_ao_established_key(sk, ao_info, -1, cmd.rnext);
+		if (!new_rnext)
+			return -ENOENT;
+	}
+	if (cmd.del_async && sk->sk_state != TCP_LISTEN)
+		return -EINVAL;
+
+	if (family == AF_INET) {
+		struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.addr;
+
+		addr = (union tcp_ao_addr *)&sin->sin_addr;
+		addr_len = sizeof(struct in_addr);
+		port = ntohs(sin->sin_port);
+	} else {
+		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.addr;
+		struct in6_addr *addr6 = &sin6->sin6_addr;
+
+		/* v4-mapped peers are matched as AF_INET */
+		if (ipv6_addr_v4mapped(addr6)) {
+			addr = (union tcp_ao_addr *)&addr6->s6_addr32[3];
+			addr_len = sizeof(struct in_addr);
+			family = AF_INET;
+		} else {
+			addr = (union tcp_ao_addr *)addr6;
+			addr_len = sizeof(struct in6_addr);
+		}
+		port = ntohs(sin6->sin6_port);
+	}
+	prefix = cmd.prefix;
+
+	/* Currently matching is not performed on port (or port ranges) */
+	if (port != 0)
+		return -EINVAL;
+
+	/* We could choose random present key here for current/rnext
+	 * but that's less predictable. Let's be strict and don't
+	 * allow removing a key that's in use. RFC5925 doesn't
+	 * specify how-to coordinate key removal, but says:
+	 * "It is presumed that an MKT affecting a particular
+	 * connection cannot be destroyed during an active connection"
+	 */
+	hlist_for_each_entry_rcu(key, &ao_info->head, node,
+				 lockdep_sock_is_held(sk)) {
+		if (cmd.sndid != key->sndid ||
+		    cmd.rcvid != key->rcvid)
+			continue;
+
+		if (family != key->family ||
+		    prefix != key->prefixlen ||
+		    memcmp(addr, &key->addr, addr_len))
+			continue;
+
+		if ((cmd.keyflags & TCP_AO_KEYF_IFINDEX) !=
+		    (key->keyflags & TCP_AO_KEYF_IFINDEX))
+			continue;
+
+		if (key->l3index != l3index)
+			continue;
+
+		/* a key chosen as the new current/rnext cannot be the victim */
+		if (key == new_current || key == new_rnext)
+			continue;
+
+		return tcp_ao_delete_key(sk, ao_info, cmd.del_async, key,
+					 new_current, new_rnext);
+	}
+	return -ENOENT;
+}
+
+/* cmd.ao_required makes a socket TCP-AO only: MD5 keys (for any l3intf) must
+ * not coexist with it. Returns 1 if the socket already has an MD5 key, else 0
+ * (always 0 without CONFIG_TCP_MD5SIG). Checking this early in setsockopt()
+ * removes a check for ao_info->ao_required on inbound tcp segment fast-path.
+ */
+static int tcp_ao_required_verify(struct sock *sk)
+{
+#ifdef CONFIG_TCP_MD5SIG
+ const struct tcp_md5sig_info *md5sig;
+
+ if (!static_branch_unlikely(&tcp_md5_needed.key))
+ return 0; /* tcp_md5_needed static key is off */
+
+ md5sig = rcu_dereference_check(tcp_sk(sk)->md5sig_info,
+ lockdep_sock_is_held(sk));
+ if (!md5sig)
+ return 0; /* no MD5 info attached to this socket */
+
+ if (rcu_dereference_check(hlist_first_rcu(&md5sig->head),
+ lockdep_sock_is_held(sk)))
+ return 1; /* MD5 key list is not empty */
+#endif
+ return 0;
+}
+/* Handle the TCP_AO_INFO command: update per-socket TCP-AO settings. */
+static int tcp_ao_info_cmd(struct sock *sk, unsigned short int family,
+ sockptr_t optval, int optlen)
+{
+ struct tcp_ao_key *new_current = NULL, *new_rnext = NULL;
+ struct tcp_ao_info *ao_info;
+ struct tcp_ao_info_opt cmd;
+ bool first = false; /* ao_info was allocated by this call */
+ int err;
+
+ if (optlen < sizeof(cmd))
+ return -EINVAL;
+
+ err = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen);
+ if (err)
+ return err;
+
+ if (cmd.set_current || cmd.set_rnext) {
+ if (!tcp_ao_can_set_current_rnext(sk))
+ return -EINVAL;
+ }
+
+ if (cmd.reserved != 0 || cmd.reserved2 != 0)
+ return -EINVAL;
+
+ ao_info = setsockopt_ao_info(sk);
+ if (IS_ERR(ao_info))
+ return PTR_ERR(ao_info);
+ if (!ao_info) {
+ if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)))
+ return -EINVAL; /* first setup only in LISTEN or CLOSE */
+ ao_info = tcp_ao_alloc_info(GFP_KERNEL);
+ if (!ao_info)
+ return -ENOMEM;
+ first = true;
+ }
+
+ if (cmd.ao_required && tcp_ao_required_verify(sk)) {
+ err = -EKEYREJECTED; /* socket already has MD5 keys */
+ goto out;
+ }
+
+ /* For sockets in TCP_CLOSED it's possible to set keys that aren't
+ * matching the future peer (address/port/VRF/etc),
+ * tcp_ao_connect_init() will choose a correct matching MKT
+ * if there's any.
+ */
+ if (cmd.set_current) {
+ new_current = tcp_ao_established_key(sk, ao_info, cmd.current_key, -1);
+ if (!new_current) {
+ err = -ENOENT;
+ goto out;
+ }
+ }
+ if (cmd.set_rnext) {
+ new_rnext = tcp_ao_established_key(sk, ao_info, -1, cmd.rnext);
+ if (!new_rnext) {
+ err = -ENOENT;
+ goto out;
+ }
+ }
+ if (cmd.set_counters) {
+ atomic64_set(&ao_info->counters.pkt_good, cmd.pkt_good);
+ atomic64_set(&ao_info->counters.pkt_bad, cmd.pkt_bad);
+ atomic64_set(&ao_info->counters.key_not_found, cmd.pkt_key_not_found);
+ atomic64_set(&ao_info->counters.ao_required, cmd.pkt_ao_required);
+ atomic64_set(&ao_info->counters.dropped_icmp, cmd.pkt_dropped_icmp);
+ }
+
+ ao_info->ao_required = cmd.ao_required;
+ ao_info->accept_icmps = cmd.accept_icmps;
+ if (new_current)
+ WRITE_ONCE(ao_info->current_key, new_current);
+ if (new_rnext)
+ WRITE_ONCE(ao_info->rnext_key, new_rnext);
+ if (first) {
+ if (!static_branch_inc(&tcp_ao_needed.key)) {
+ err = -EUSERS;
+ goto out;
+ }
+ sk_gso_disable(sk);
+ rcu_assign_pointer(tcp_sk(sk)->ao_info, ao_info); /* publish */
+ }
+ return 0;
+out:
+ if (first)
+ kfree(ao_info); /* never published on the socket */
+ return err;
+}
+/* Dispatch a TCP-AO socket option command (@cmd) to its handler. */
+int tcp_parse_ao(struct sock *sk, int cmd, unsigned short int family,
+ sockptr_t optval, int optlen)
+{
+ if (WARN_ON_ONCE(family != AF_INET && family != AF_INET6))
+ return -EAFNOSUPPORT;
+
+ switch (cmd) {
+ case TCP_AO_ADD_KEY:
+ return tcp_ao_add_cmd(sk, family, optval, optlen);
+ case TCP_AO_DEL_KEY:
+ return tcp_ao_del_cmd(sk, family, optval, optlen);
+ case TCP_AO_INFO:
+ return tcp_ao_info_cmd(sk, family, optval, optlen);
+ default:
+ WARN_ON_ONCE(1); /* unknown command reached the dispatcher */
+ return -EINVAL;
+ }
+}
+/* IPv4 wrapper: calls tcp_parse_ao() with the family fixed to AF_INET. */
+int tcp_v4_parse_ao(struct sock *sk, int cmd, sockptr_t optval, int optlen)
+{
+ return tcp_parse_ao(sk, cmd, AF_INET, optval, optlen);
+}
+
+/* tcp_ao_copy_mkts_to_user(sk, ao_info, optval, optlen)
+ *
+ * @sk: socket that getsockopt(TCP_AO_GET_KEYS) is executed on
+ * @ao_info: struct tcp_ao_info of that socket
+ * @optval: pointer to array of tcp_ao_getsockopt structures in user space.
+ * Must be != NULL.
+ * @optlen: pointer to size of tcp_ao_getsockopt structure.
+ * Must be != NULL.
+ *
+ * Return value: 0 on success, a negative error number otherwise.
+ *
+ * optval points to an array of tcp_ao_getsockopt structures in user space.
+ * optval[0] is used as both input and output to getsockopt. It determines
+ * which keys are returned by the kernel.
+ * optval[0].nkeys is the size of the array in user space. On return it contains
+ * the number of matching keys, which may exceed the number of keys copied.
+ * If tcp_ao_getsockopt::get_all is set, then all keys in the socket are
+ * returned; with ::is_current/::is_rnext only the current/rnext key; otherwise
+ * only keys matching <addr, prefix, sndid, rcvid> in optval[0] are returned.
+ * optlen is also used as both input and output. The user provides the size
+ * of struct tcp_ao_getsockopt in user space, and the kernel returns the size
+ * of the structure in kernel space.
+ * The size of struct tcp_ao_getsockopt may differ between user and kernel.
+ * There are three cases to consider:
+ * * If usize == ksize, then keys are copied verbatim.
+ * * If usize < ksize, then the userspace has passed an old struct to a
+ * newer kernel. The rest of the trailing bytes in optval[0]
+ * (ksize - usize) are interpreted as 0 by the kernel.
+ * * If usize > ksize, then the userspace has passed a new struct to an
+ * older kernel. The trailing bytes unknown to the kernel (usize - ksize)
+ * are checked to ensure they are zeroed, otherwise -E2BIG is returned.
+ * On return the kernel fills in min(usize, ksize) bytes in each array entry.
+ * The layout of the fields in the user and kernel structures is expected to
+ * be the same (including in the 32bit vs 64bit case).
+ */
+static int tcp_ao_copy_mkts_to_user(const struct sock *sk,
+ struct tcp_ao_info *ao_info,
+ sockptr_t optval, sockptr_t optlen)
+{
+ struct tcp_ao_getsockopt opt_in, opt_out;
+ struct tcp_ao_key *key, *current_key;
+ bool do_address_matching = true;
+ union tcp_ao_addr *addr = NULL;
+ int err, l3index, user_len;
+ unsigned int max_keys; /* maximum number of keys to copy to user */
+ size_t out_offset = 0;
+ size_t bytes_to_write; /* number of bytes to write to user level */
+ u32 matched_keys; /* keys from ao_info matched so far */
+ int optlen_out;
+ __be16 port = 0;
+
+ if (copy_from_sockptr(&user_len, optlen, sizeof(int)))
+ return -EFAULT;
+
+ if (user_len <= 0)
+ return -EINVAL;
+
+ memset(&opt_in, 0, sizeof(struct tcp_ao_getsockopt));
+ err = copy_struct_from_sockptr(&opt_in, sizeof(opt_in),
+ optval, user_len);
+ if (err < 0)
+ return err;
+
+ if (opt_in.pkt_good || opt_in.pkt_bad) /* output-only fields */
+ return -EINVAL;
+ if (opt_in.keyflags & ~TCP_AO_GET_KEYF_VALID)
+ return -EINVAL;
+ if (opt_in.ifindex && !(opt_in.keyflags & TCP_AO_KEYF_IFINDEX))
+ return -EINVAL;
+
+ if (opt_in.reserved != 0)
+ return -EINVAL;
+
+ max_keys = opt_in.nkeys;
+ l3index = (opt_in.keyflags & TCP_AO_KEYF_IFINDEX) ? opt_in.ifindex : -1;
+
+ if (opt_in.get_all || opt_in.is_current || opt_in.is_rnext) {
+ if (opt_in.get_all && (opt_in.is_current || opt_in.is_rnext))
+ return -EINVAL; /* get_all excludes current/rnext */
+ do_address_matching = false;
+ }
+
+ switch (opt_in.addr.ss_family) {
+ case AF_INET: {
+ struct sockaddr_in *sin;
+ __be32 mask;
+
+ sin = (struct sockaddr_in *)&opt_in.addr;
+ port = sin->sin_port;
+ addr = (union tcp_ao_addr *)&sin->sin_addr;
+
+ if (opt_in.prefix > 32)
+ return -EINVAL;
+
+ if (ntohl(sin->sin_addr.s_addr) == INADDR_ANY &&
+ opt_in.prefix != 0)
+ return -EINVAL;
+
+ mask = inet_make_mask(opt_in.prefix);
+ if (sin->sin_addr.s_addr & ~mask) /* bits set beyond the prefix */
+ return -EINVAL;
+
+ break;
+ }
+ case AF_INET6: {
+ struct sockaddr_in6 *sin6;
+ struct in6_addr *addr6;
+
+ sin6 = (struct sockaddr_in6 *)&opt_in.addr;
+ addr = (union tcp_ao_addr *)&sin6->sin6_addr;
+ addr6 = &sin6->sin6_addr;
+ port = sin6->sin6_port;
+
+ /* We don't have to change family and @addr here if
+ * ipv6_addr_v4mapped() like in key adding:
+ * tcp_ao_key_cmp() does it. Do the sanity checks though.
+ */
+ if (opt_in.prefix != 0) {
+ if (ipv6_addr_v4mapped(addr6)) {
+ __be32 mask, addr4 = addr6->s6_addr32[3];
+
+ if (opt_in.prefix > 32 ||
+ ntohl(addr4) == INADDR_ANY)
+ return -EINVAL;
+ mask = inet_make_mask(opt_in.prefix);
+ if (addr4 & ~mask)
+ return -EINVAL;
+ } else {
+ struct in6_addr pfx;
+
+ if (ipv6_addr_any(addr6) ||
+ opt_in.prefix > 128)
+ return -EINVAL;
+
+ ipv6_addr_prefix(&pfx, addr6, opt_in.prefix);
+ if (ipv6_addr_cmp(&pfx, addr6)) /* bits set beyond the prefix */
+ return -EINVAL;
+ }
+ } else if (!ipv6_addr_any(addr6)) {
+ return -EINVAL; /* zero prefix requires the any-address */
+ }
+ break;
+ }
+ case 0:
+ if (!do_address_matching)
+ break; /* no address needed for get_all/current/rnext */
+ fallthrough;
+ default:
+ return -EAFNOSUPPORT;
+ }
+
+ if (!do_address_matching) {
+ /* We could just ignore those, but let's do stricter checks */
+ if (addr || port)
+ return -EINVAL;
+ if (opt_in.prefix || opt_in.sndid || opt_in.rcvid)
+ return -EINVAL;
+ }
+
+ bytes_to_write = min_t(int, user_len, sizeof(struct tcp_ao_getsockopt));
+ matched_keys = 0;
+ /* May change in RX, while we're dumping, pre-fetch it */
+ current_key = READ_ONCE(ao_info->current_key);
+
+ hlist_for_each_entry_rcu(key, &ao_info->head, node,
+ lockdep_sock_is_held(sk)) {
+ if (opt_in.get_all)
+ goto match;
+
+ if (opt_in.is_current || opt_in.is_rnext) {
+ if (opt_in.is_current && key == current_key)
+ goto match;
+ if (opt_in.is_rnext && key == ao_info->rnext_key)
+ goto match;
+ continue;
+ }
+
+ if (tcp_ao_key_cmp(key, l3index, addr, opt_in.prefix,
+ opt_in.addr.ss_family,
+ opt_in.sndid, opt_in.rcvid) != 0)
+ continue;
+match:
+ matched_keys++;
+ if (matched_keys > max_keys)
+ continue; /* user array is full: keep counting, copy no more */
+
+ memset(&opt_out, 0, sizeof(struct tcp_ao_getsockopt));
+
+ if (key->family == AF_INET) {
+ struct sockaddr_in *sin_out = (struct sockaddr_in *)&opt_out.addr;
+
+ sin_out->sin_family = key->family;
+ sin_out->sin_port = 0;
+ memcpy(&sin_out->sin_addr, &key->addr, sizeof(struct in_addr));
+ } else {
+ struct sockaddr_in6 *sin6_out = (struct sockaddr_in6 *)&opt_out.addr;
+
+ sin6_out->sin6_family = key->family;
+ sin6_out->sin6_port = 0;
+ memcpy(&sin6_out->sin6_addr, &key->addr, sizeof(struct in6_addr));
+ }
+ opt_out.sndid = key->sndid;
+ opt_out.rcvid = key->rcvid;
+ opt_out.prefix = key->prefixlen;
+ opt_out.keyflags = key->keyflags;
+ opt_out.is_current = (key == current_key);
+ opt_out.is_rnext = (key == ao_info->rnext_key);
+ opt_out.nkeys = 0;
+ opt_out.maclen = key->maclen;
+ opt_out.keylen = key->keylen;
+ opt_out.ifindex = key->l3index;
+ opt_out.pkt_good = atomic64_read(&key->pkt_good);
+ opt_out.pkt_bad = atomic64_read(&key->pkt_bad);
+ memcpy(&opt_out.key, key->key, key->keylen);
+ tcp_sigpool_algo(key->tcp_sigpool_id, opt_out.alg_name, 64);
+
+ /* Copy key to user */
+ if (copy_to_sockptr_offset(optval, out_offset,
+ &opt_out, bytes_to_write))
+ return -EFAULT;
+ out_offset += user_len; /* entries are spaced by the user's struct size */
+ }
+
+ optlen_out = (int)sizeof(struct tcp_ao_getsockopt);
+ if (copy_to_sockptr(optlen, &optlen_out, sizeof(int)))
+ return -EFAULT;
+
+ out_offset = offsetof(struct tcp_ao_getsockopt, nkeys); /* optval[0].nkeys */
+ if (copy_to_sockptr_offset(optval, out_offset,
+ &matched_keys, sizeof(u32)))
+ return -EFAULT;
+
+ return 0;
+}
+/* Dump the socket's MKTs to user space; -ENOENT if it has no TCP-AO info. */
+int tcp_ao_get_mkts(struct sock *sk, sockptr_t optval, sockptr_t optlen)
+{
+ struct tcp_ao_info *ao_info;
+
+ ao_info = setsockopt_ao_info(sk);
+ if (IS_ERR(ao_info))
+ return PTR_ERR(ao_info);
+ if (!ao_info)
+ return -ENOENT;
+
+ return tcp_ao_copy_mkts_to_user(sk, ao_info, optval, optlen);
+}
+/* Copy the socket's TCP-AO flags, counters and current/rnext ids to user. */
+int tcp_ao_get_sock_info(struct sock *sk, sockptr_t optval, sockptr_t optlen)
+{
+ struct tcp_ao_info_opt out, in = {};
+ struct tcp_ao_key *current_key;
+ struct tcp_ao_info *ao;
+ int err, len;
+
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
+ return -EFAULT;
+
+ if (len <= 0)
+ return -EINVAL;
+
+ /* Copying this "in" only to check ::reserved, ::reserved2,
+ * that may be needed to extend (struct tcp_ao_info_opt) and
+ * what getsockopt() provides in future.
+ */
+ err = copy_struct_from_sockptr(&in, sizeof(in), optval, len);
+ if (err)
+ return err;
+
+ if (in.reserved != 0 || in.reserved2 != 0)
+ return -EINVAL;
+
+ ao = setsockopt_ao_info(sk);
+ if (IS_ERR(ao))
+ return PTR_ERR(ao);
+ if (!ao)
+ return -ENOENT;
+
+ memset(&out, 0, sizeof(out)); /* fields not set below are reported as 0 */
+ out.ao_required = ao->ao_required;
+ out.accept_icmps = ao->accept_icmps;
+ out.pkt_good = atomic64_read(&ao->counters.pkt_good);
+ out.pkt_bad = atomic64_read(&ao->counters.pkt_bad);
+ out.pkt_key_not_found = atomic64_read(&ao->counters.key_not_found);
+ out.pkt_ao_required = atomic64_read(&ao->counters.ao_required);
+ out.pkt_dropped_icmp = atomic64_read(&ao->counters.dropped_icmp);
+
+ current_key = READ_ONCE(ao->current_key);
+ if (current_key) {
+ out.set_current = 1;
+ out.current_key = current_key->sndid; /* reported by its sndid */
+ }
+ if (ao->rnext_key) {
+ out.set_rnext = 1;
+ out.rnext = ao->rnext_key->rcvid; /* reported by its rcvid */
+ }
+
+ if (copy_to_sockptr(optval, &out, min_t(int, len, sizeof(out))))
+ return -EFAULT;
+
+ return 0;
+}
+/* Set TCP-AO ISNs and SNEs from user space; allowed only in repair mode. */
+int tcp_ao_set_repair(struct sock *sk, sockptr_t optval, unsigned int optlen)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_ao_repair cmd;
+ struct tcp_ao_key *key;
+ struct tcp_ao_info *ao;
+ int err;
+
+ if (optlen < sizeof(cmd))
+ return -EINVAL;
+
+ err = copy_struct_from_sockptr(&cmd, sizeof(cmd), optval, optlen);
+ if (err)
+ return err;
+
+ if (!tp->repair)
+ return -EPERM;
+
+ ao = setsockopt_ao_info(sk);
+ if (IS_ERR(ao))
+ return PTR_ERR(ao);
+ if (!ao)
+ return -ENOENT;
+
+ WRITE_ONCE(ao->lisn, cmd.snt_isn);
+ WRITE_ONCE(ao->risn, cmd.rcv_isn);
+ WRITE_ONCE(ao->snd_sne, cmd.snd_sne);
+ WRITE_ONCE(ao->rcv_sne, cmd.rcv_sne);
+
+ hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk))
+ tcp_ao_cache_traffic_keys(sk, ao, key); /* run for every key, after the ISN update */
+
+ return 0;
+}
+/* Report TCP-AO ISNs and SNEs to user space; allowed only in repair mode. */
+int tcp_ao_get_repair(struct sock *sk, sockptr_t optval, sockptr_t optlen)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_ao_repair opt;
+ struct tcp_ao_info *ao;
+ int len;
+
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
+ return -EFAULT;
+
+ if (len <= 0)
+ return -EINVAL;
+
+ if (!tp->repair)
+ return -EPERM;
+
+ rcu_read_lock();
+ ao = getsockopt_ao_info(sk);
+ if (IS_ERR_OR_NULL(ao)) {
+ rcu_read_unlock();
+ return ao ? PTR_ERR(ao) : -ENOENT; /* NULL: no TCP-AO info */
+ }
+
+ opt.snt_isn = ao->lisn;
+ opt.rcv_isn = ao->risn;
+ opt.snd_sne = READ_ONCE(ao->snd_sne);
+ opt.rcv_sne = READ_ONCE(ao->rcv_sne);
+ rcu_read_unlock(); /* values are snapshotted; copy out without RCU held */
+
+ if (copy_to_sockptr(optval, &opt, min_t(int, len, sizeof(opt))))
+ return -EFAULT;
+ return 0;
+}
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 02ff2dde9609..760941e55153 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -56,6 +56,8 @@
* otherwise TCP stack falls back to an internal pacing using one high
* resolution timer per TCP socket and may use more resources.
*/
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
#include <linux/module.h>
#include <net/tcp.h>
#include <linux/inet_diag.h>
@@ -115,6 +117,14 @@ struct bbr {
unused_b:5;
u32 prior_cwnd; /* prior cwnd upon entering loss recovery */
u32 full_bw; /* recent bw, to estimate if pipe is full */
+
+ /* For tracking ACK aggregation: */
+ u64 ack_epoch_mstamp; /* start of ACK sampling epoch */
+ u16 extra_acked[2]; /* max excess data ACKed in epoch */
+ u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */
+ extra_acked_win_rtts:5, /* age of extra_acked, in round trips */
+ extra_acked_win_idx:1, /* current index in extra_acked array */
+ unused_c:6;
};
#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */
@@ -128,6 +138,14 @@ static const u32 bbr_probe_rtt_mode_ms = 200;
/* Skip TSO below the following bandwidth (bits/sec): */
static const int bbr_min_tso_rate = 1200000;
+/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck.
+ * In order to help drive the network toward lower queues and low latency while
+ * maintaining high utilization, the average pacing rate aims to be slightly
+ * lower than the estimated bandwidth. This is an important aspect of the
+ * design.
+ */
+static const int bbr_pacing_margin_percent = 1;
+
/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain
* that will allow a smoothly increasing pacing rate that will double each RTT
* and send the same number of packets per RTT that an un-paced, slow-starting
@@ -174,6 +192,15 @@ static const u32 bbr_lt_bw_diff = 4000 / 8;
/* If we estimate we're policed, use lt_bw for this many round trips: */
static const u32 bbr_lt_bw_max_rtts = 48;
+/* Gain factor for adding extra_acked to target cwnd: */
+static const int bbr_extra_acked_gain = BBR_UNIT;
+/* Window length of extra_acked window. */
+static const u32 bbr_extra_acked_win_rtts = 5;
+/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */
+static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
+/* Time period for clamping cwnd increment due to ack aggregation */
+static const u32 bbr_extra_acked_max_us = 100 * 1000;
+
static void bbr_check_probe_rtt_done(struct sock *sk);
/* Do we estimate that STARTUP filled the pipe? */
@@ -200,6 +227,16 @@ static u32 bbr_bw(const struct sock *sk)
return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
}
+/* Return the larger of the two extra_acked[] slots: the maximum extra acked
+ * in the past k-2k round trips, where k = bbr_extra_acked_win_rtts.
+ */
+static u16 bbr_extra_acked(const struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ return max(bbr->extra_acked[0], bbr->extra_acked[1]);
+}
+
/* Return rate in bytes per second, optionally with a gain.
* The order here is chosen carefully to avoid overflow of u64. This should
* work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
@@ -208,22 +245,20 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain)
{
unsigned int mss = tcp_sk(sk)->mss_cache;
- if (!tcp_needs_internal_pacing(sk))
- mss = tcp_mss_to_mtu(sk, mss);
rate *= mss;
rate *= gain;
rate >>= BBR_SCALE;
- rate *= USEC_PER_SEC;
+ rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent);
return rate >> BW_SCALE;
}
/* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */
-static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
+static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain)
{
u64 rate = bw;
rate = bbr_rate_bytes_per_sec(sk, rate, gain);
- rate = min_t(u64, rate, sk->sk_max_pacing_rate);
+ rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate));
return rate;
}
@@ -241,34 +276,29 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk)
} else { /* no RTT sample yet */
rtt_us = USEC_PER_MSEC; /* use nominal default RTT */
}
- bw = (u64)tp->snd_cwnd * BW_UNIT;
+ bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT;
do_div(bw, rtt_us);
- sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain);
+ WRITE_ONCE(sk->sk_pacing_rate,
+ bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain));
}
-/* Pace using current bw estimate and a gain factor. In order to help drive the
- * network toward lower queues while maintaining high utilization and low
- * latency, the average pacing rate aims to be slightly (~1%) lower than the
- * estimated bandwidth. This is an important aspect of the design. In this
- * implementation this slightly lower pacing rate is achieved implicitly by not
- * including link-layer headers in the packet size used for the pacing rate.
- */
+/* Pace using current bw estimate and a gain factor. */
static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
- u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain);
+ unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain);
if (unlikely(!bbr->has_seen_rtt && tp->srtt_us))
bbr_init_pacing_rate_from_rtt(sk);
- if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate)
- sk->sk_pacing_rate = rate;
+ if (bbr_full_bw_reached(sk) || rate > READ_ONCE(sk->sk_pacing_rate))
+ WRITE_ONCE(sk->sk_pacing_rate, rate);
}
/* override sysctl_tcp_min_tso_segs */
-static u32 bbr_min_tso_segs(struct sock *sk)
+__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk)
{
- return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2;
+ return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2;
}
static u32 bbr_tso_segs_goal(struct sock *sk)
@@ -279,8 +309,9 @@ static u32 bbr_tso_segs_goal(struct sock *sk)
/* Sort of tcp_tso_autosize() but ignoring
* driver provided sk_gso_max_size.
*/
- bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift,
- GSO_MAX_SIZE - 1 - MAX_TCP_HEADER);
+ bytes = min_t(unsigned long,
+ READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift),
+ GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER);
segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk));
return min(segs, 0x7FU);
@@ -293,18 +324,20 @@ static void bbr_save_cwnd(struct sock *sk)
struct bbr *bbr = inet_csk_ca(sk);
if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT)
- bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */
+ bbr->prior_cwnd = tcp_snd_cwnd(tp); /* this cwnd is good enough */
else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */
- bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd);
+ bbr->prior_cwnd = max(bbr->prior_cwnd, tcp_snd_cwnd(tp));
}
-static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+__bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
if (event == CA_EVENT_TX_START && tp->app_limited) {
bbr->idle_restart = 1;
+ bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+ bbr->ack_epoch_acked = 0;
/* Avoid pointless buffer overflows: pace at est. bw if we don't
* need more speed (we're restarting from idle and app-limited).
*/
@@ -315,30 +348,19 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
}
}
-/* Find target cwnd. Right-size the cwnd based on min RTT and the
- * estimated bottleneck bandwidth:
+/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
*
- * cwnd = bw * min_rtt * gain = BDP * gain
+ * bdp = ceil(bw * min_rtt * gain)
*
* The key factor, gain, controls the amount of queue. While a small gain
* builds a smaller queue, it becomes more vulnerable to noise in RTT
* measurements (e.g., delayed ACKs or other ACK compression effects). This
* noise may cause BBR to under-estimate the rate.
- *
- * To achieve full performance in high-speed paths, we budget enough cwnd to
- * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
- * - one skb in sending host Qdisc,
- * - one skb in sending host TSO/GSO engine
- * - one skb being received by receiver host LRO/GRO/delayed-ACK engine
- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
- * which allows 2 outstanding 2-packet sequences, to try to keep pipe
- * full even with ACK-every-other-packet delayed ACKs.
*/
-static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
+static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
{
struct bbr *bbr = inet_csk_ca(sk);
- u32 cwnd;
+ u32 bdp;
u64 w;
/* If we've never had a valid RTT sample, cap cwnd at the initial
@@ -352,8 +374,27 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
w = (u64)bw * bbr->min_rtt_us;
- /* Apply a gain to the given value, then remove the BW_SCALE shift. */
- cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+ /* Apply a gain to the given value, remove the BW_SCALE shift, and
+ * round the value up to avoid a negative feedback loop.
+ */
+ bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+
+ return bdp;
+}
+
+/* To achieve full performance in high-speed paths, we budget enough cwnd to
+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
+ * - one skb in sending host Qdisc,
+ * - one skb in sending host TSO/GSO engine
+ * - one skb being received by receiver host LRO/GRO/delayed-ACK engine
+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe
+ * full even with ACK-every-other-packet delayed ACKs.
+ */
+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
/* Allow enough full-sized skbs in flight to utilize end systems. */
cwnd += 3 * bbr_tso_segs_goal(sk);
@@ -362,12 +403,72 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
cwnd = (cwnd + 1) & ~1U;
/* Ensure gain cycling gets inflight above BDP even for small BDPs. */
- if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT)
+ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0)
cwnd += 2;
return cwnd;
}
+/* Find inflight: bbr_bdp() for @bw and @gain plus the quantization budget. */
+static u32 bbr_inflight(struct sock *sk, u32 bw, int gain)
+{
+ u32 inflight;
+
+ inflight = bbr_bdp(sk, bw, gain);
+ inflight = bbr_quantization_budget(sk, inflight);
+
+ return inflight;
+}
+
+/* With pacing at lower layers, there's often less data "in the network" than
+ * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
+ * we often have several skbs queued in the pacing layer with a pre-scheduled
+ * earliest departure time (EDT). BBR adapts its pacing rate based on the
+ * inflight level that it estimates has already been "baked in" by previous
+ * departure time decisions. We calculate a rough estimate of the number of our
+ * packets that might be in the network at the earliest departure time for the
+ * next skb scheduled:
+ * in_network_at_edt = inflight_at_edt - (EDT - now) * bw
+ * If we're increasing inflight, then we want to know if the transmit of the
+ * EDT skb will push inflight above the target, so inflight_at_edt includes
+ * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight,
+ * then estimate if inflight will sink too low just before the EDT transmit.
+ */
+static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct bbr *bbr = inet_csk_ca(sk);
+ u64 now_ns, edt_ns, interval_us;
+ u32 interval_delivered, inflight_at_edt;
+
+ now_ns = tp->tcp_clock_cache;
+ edt_ns = max(tp->tcp_wstamp_ns, now_ns); /* EDT is never before now */
+ interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC);
+ interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE;
+ inflight_at_edt = inflight_now;
+ if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */
+ inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */
+ if (interval_delivered >= inflight_at_edt)
+ return 0; /* clamp at 0 instead of wrapping the u32 subtraction */
+ return inflight_at_edt - interval_delivered;
+}
+
+/* Find the cwnd increment for ACK aggregation; 0 until full bw is reached. */
+static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
+{
+ u32 max_aggr_cwnd, aggr_cwnd = 0;
+
+ if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) {
+ max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
+ / BW_UNIT;
+ aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk))
+ >> BBR_SCALE;
+ aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); /* cap the increment */
+ }
+
+ return aggr_cwnd;
+}
+
/* An optimization in BBR to reduce losses: On the first round of recovery, we
* follow the packet conservation principle: send P packets per P packets acked.
* After that, we slow-start and send at most 2*P packets per P packets acked.
@@ -382,7 +483,7 @@ static bool bbr_set_cwnd_to_recover_or_restore(
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state;
- u32 cwnd = tp->snd_cwnd;
+ u32 cwnd = tcp_snd_cwnd(tp);
/* An ACK for P pkts should release at most 2*P packets. We do this
* in two steps. First, here we deduct the number of lost packets.
@@ -420,7 +521,7 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
- u32 cwnd = tp->snd_cwnd, target_cwnd = 0;
+ u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0;
if (!acked)
goto done; /* no packet fully ACKed; just apply caps */
@@ -428,8 +529,15 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
goto done;
+ target_cwnd = bbr_bdp(sk, bw, gain);
+
+ /* Increment the cwnd to account for excess ACKed data that seems
+ * due to aggregation (of data and/or ACKs) visible in the ACK stream.
+ */
+ target_cwnd += bbr_ack_aggregation_cwnd(sk);
+ target_cwnd = bbr_quantization_budget(sk, target_cwnd);
+
/* If we're below target cwnd, slow start cwnd toward target cwnd. */
- target_cwnd = bbr_target_cwnd(sk, bw, gain);
if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */
cwnd = min(cwnd + acked, target_cwnd);
else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
@@ -437,9 +545,9 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
cwnd = max(cwnd, bbr_cwnd_min_target);
done:
- tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */
+ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */
if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */
- tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target);
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target));
}
/* End cycle phase if it's time and/or we hit the phase's in-flight target. */
@@ -459,7 +567,7 @@ static bool bbr_is_next_cycle_phase(struct sock *sk,
if (bbr->pacing_gain == BBR_UNIT)
return is_full_length; /* just use wall clock time */
- inflight = rs->prior_in_flight; /* what was in-flight before ACK? */
+ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight);
bw = bbr_max_bw(sk);
/* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at
@@ -470,14 +578,14 @@ static bool bbr_is_next_cycle_phase(struct sock *sk,
if (bbr->pacing_gain > BBR_UNIT)
return is_full_length &&
(rs->losses || /* perhaps pacing_gain*BDP won't fit */
- inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain));
+ inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));
/* A pacing_gain < 1.0 tries to drain extra queue we added if bw
* probing didn't find more bw. If inflight falls to match BDP then we
* estimate queue is drained; persisting would underutilize the pipe.
*/
return is_full_length ||
- inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT);
+ inflight <= bbr_inflight(sk, bw, BBR_UNIT);
}
static void bbr_advance_cycle_phase(struct sock *sk)
@@ -487,8 +595,6 @@ static void bbr_advance_cycle_phase(struct sock *sk)
bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1);
bbr->cycle_mstamp = tp->delivered_mstamp;
- bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT :
- bbr_pacing_gain[bbr->cycle_idx];
}
/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */
@@ -506,8 +612,6 @@ static void bbr_reset_startup_mode(struct sock *sk)
struct bbr *bbr = inet_csk_ca(sk);
bbr->mode = BBR_STARTUP;
- bbr->pacing_gain = bbr_high_gain;
- bbr->cwnd_gain = bbr_high_gain;
}
static void bbr_reset_probe_bw_mode(struct sock *sk)
@@ -515,9 +619,7 @@ static void bbr_reset_probe_bw_mode(struct sock *sk)
struct bbr *bbr = inet_csk_ca(sk);
bbr->mode = BBR_PROBE_BW;
- bbr->pacing_gain = BBR_UNIT;
- bbr->cwnd_gain = bbr_cwnd_gain;
- bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand);
+ bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand);
bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */
}
@@ -680,8 +782,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
* bandwidth sample. Delivered is in packets and interval_us in uS and
* ratio will be <<1 for most connections. So delivered is first scaled.
*/
- bw = (u64)rs->delivered * BW_UNIT;
- do_div(bw, rs->interval_us);
+ bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us);
/* If this sample is application-limited, it is likely to have a very
* low delivered count that represents application behavior rather than
@@ -700,6 +801,67 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
}
}
+/* Estimates the windowed max degree of ack aggregation.
+ * This is used to provision extra in-flight data to keep sending during
+ * inter-ACK silences.
+ *
+ * Degree of ack aggregation is estimated as extra data acked beyond expected.
+ *
+ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval"
+ * cwnd += max_extra_acked
+ *
+ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms).
+ * Max filter is an approximate sliding window of 5-10 (packet timed) round
+ * trips.
+ */
+static void bbr_update_ack_aggregation(struct sock *sk,
+ const struct rate_sample *rs)
+{
+ u32 epoch_us, expected_acked, extra_acked;
+ struct bbr *bbr = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 ||
+ rs->delivered < 0 || rs->interval_us <= 0)
+ return;
+
+ if (bbr->round_start) {
+ bbr->extra_acked_win_rtts = min(0x1F,
+ bbr->extra_acked_win_rtts + 1);
+ if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) {
+ bbr->extra_acked_win_rtts = 0;
+ bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ?
+ 0 : 1;
+ bbr->extra_acked[bbr->extra_acked_win_idx] = 0;
+ }
+ }
+
+ /* Compute how many packets we expected to be delivered over epoch. */
+ epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp,
+ bbr->ack_epoch_mstamp);
+ expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT;
+
+ /* Reset the aggregation epoch if the ACK rate is below the expected
+ * rate, or if a very large number of packets has been ACKed since the
+ * epoch began (a potentially quite old epoch).
+ */
+ if (bbr->ack_epoch_acked <= expected_acked ||
+ (bbr->ack_epoch_acked + rs->acked_sacked >=
+ bbr_ack_epoch_acked_reset_thresh)) {
+ bbr->ack_epoch_acked = 0;
+ bbr->ack_epoch_mstamp = tp->delivered_mstamp;
+ expected_acked = 0;
+ }
+
+ /* Compute excess data delivered, beyond what was expected. */
+ bbr->ack_epoch_acked = min_t(u32, 0xFFFFF,
+ bbr->ack_epoch_acked + rs->acked_sacked);
+ extra_acked = bbr->ack_epoch_acked - expected_acked;
+ extra_acked = min(extra_acked, tcp_snd_cwnd(tp));
+ if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx])
+ bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
+}
+
/* Estimate when the pipe is full, using the change in delivery rate: BBR
* estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
* at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
@@ -734,14 +896,12 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
bbr->mode = BBR_DRAIN; /* drain queue we created */
- bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */
- bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */
tcp_sk(sk)->snd_ssthresh =
- bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT);
+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
} /* fall through to check if in-flight is already small: */
if (bbr->mode == BBR_DRAIN &&
- tcp_packets_in_flight(tcp_sk(sk)) <=
- bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
+ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */
}
@@ -755,7 +915,7 @@ static void bbr_check_probe_rtt_done(struct sock *sk)
return;
bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */
- tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd);
+ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd));
bbr_reset_mode(sk);
}
@@ -788,7 +948,7 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
filter_expired = after(tcp_jiffies32,
bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ);
if (rs->rtt_us >= 0 &&
- (rs->rtt_us <= bbr->min_rtt_us ||
+ (rs->rtt_us < bbr->min_rtt_us ||
(filter_expired && !rs->is_ack_delayed))) {
bbr->min_rtt_us = rs->rtt_us;
bbr->min_rtt_stamp = tcp_jiffies32;
@@ -797,8 +957,6 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
if (bbr_probe_rtt_mode_ms > 0 && filter_expired &&
!bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) {
bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */
- bbr->pacing_gain = BBR_UNIT;
- bbr->cwnd_gain = BBR_UNIT;
bbr_save_cwnd(sk); /* note cwnd so we can restore it */
bbr->probe_rtt_done_stamp = 0;
}
@@ -826,16 +984,47 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
bbr->idle_restart = 0;
}
+static void bbr_update_gains(struct sock *sk)
+{
+ struct bbr *bbr = inet_csk_ca(sk);
+
+ switch (bbr->mode) {
+ case BBR_STARTUP:
+ bbr->pacing_gain = bbr_high_gain;
+ bbr->cwnd_gain = bbr_high_gain;
+ break;
+ case BBR_DRAIN:
+ bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */
+ bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */
+ break;
+ case BBR_PROBE_BW:
+ bbr->pacing_gain = (bbr->lt_use_bw ?
+ BBR_UNIT :
+ bbr_pacing_gain[bbr->cycle_idx]);
+ bbr->cwnd_gain = bbr_cwnd_gain;
+ break;
+ case BBR_PROBE_RTT:
+ bbr->pacing_gain = BBR_UNIT;
+ bbr->cwnd_gain = BBR_UNIT;
+ break;
+ default:
+ WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode);
+ break;
+ }
+}
+
static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
{
bbr_update_bw(sk, rs);
+ bbr_update_ack_aggregation(sk, rs);
bbr_update_cycle_phase(sk, rs);
bbr_check_full_bw_reached(sk, rs);
bbr_check_drain(sk, rs);
bbr_update_min_rtt(sk, rs);
+ bbr_update_gains(sk);
}
-static void bbr_main(struct sock *sk, const struct rate_sample *rs)
+__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs)
{
struct bbr *bbr = inet_csk_ca(sk);
u32 bw;
@@ -847,7 +1036,7 @@ static void bbr_main(struct sock *sk, const struct rate_sample *rs)
bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain);
}
-static void bbr_init(struct sock *sk)
+__bpf_kfunc static void bbr_init(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bbr *bbr = inet_csk_ca(sk);
@@ -855,7 +1044,7 @@ static void bbr_init(struct sock *sk)
bbr->prior_cwnd = 0;
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
bbr->rtt_cnt = 0;
- bbr->next_rtt_delivered = 0;
+ bbr->next_rtt_delivered = tp->delivered;
bbr->prev_ca_state = TCP_CA_Open;
bbr->packet_conservation = 0;
@@ -879,10 +1068,17 @@ static void bbr_init(struct sock *sk)
bbr_reset_lt_bw_sampling(sk);
bbr_reset_startup_mode(sk);
+ bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+ bbr->ack_epoch_acked = 0;
+ bbr->extra_acked_win_rtts = 0;
+ bbr->extra_acked_win_idx = 0;
+ bbr->extra_acked[0] = 0;
+ bbr->extra_acked[1] = 0;
+
cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
}
-static u32 bbr_sndbuf_expand(struct sock *sk)
+__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk)
{
/* Provision 3 * cwnd since BBR may slow-start even during recovery. */
return 3;
@@ -891,18 +1087,18 @@ static u32 bbr_sndbuf_expand(struct sock *sk)
/* In theory BBR does not need to undo the cwnd since it does not
* always reduce cwnd on losses (see bbr_main()). Keep it for now.
*/
-static u32 bbr_undo_cwnd(struct sock *sk)
+__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk)
{
struct bbr *bbr = inet_csk_ca(sk);
bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */
bbr->full_bw_cnt = 0;
bbr_reset_lt_bw_sampling(sk);
- return tcp_sk(sk)->snd_cwnd;
+ return tcp_snd_cwnd(tcp_sk(sk));
}
/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */
-static u32 bbr_ssthresh(struct sock *sk)
+__bpf_kfunc static u32 bbr_ssthresh(struct sock *sk)
{
bbr_save_cwnd(sk);
return tcp_sk(sk)->snd_ssthresh;
@@ -930,7 +1126,7 @@ static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr,
return 0;
}
-static void bbr_set_state(struct sock *sk, u8 new_state)
+__bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state)
{
struct bbr *bbr = inet_csk_ca(sk);
@@ -959,9 +1155,31 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = {
.set_state = bbr_set_state,
};
+BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids)
+BTF_ID_FLAGS(func, bbr_init)
+BTF_ID_FLAGS(func, bbr_main)
+BTF_ID_FLAGS(func, bbr_sndbuf_expand)
+BTF_ID_FLAGS(func, bbr_undo_cwnd)
+BTF_ID_FLAGS(func, bbr_cwnd_event)
+BTF_ID_FLAGS(func, bbr_ssthresh)
+BTF_ID_FLAGS(func, bbr_min_tso_segs)
+BTF_ID_FLAGS(func, bbr_set_state)
+BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids)
+
+static const struct btf_kfunc_id_set tcp_bbr_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &tcp_bbr_check_kfunc_ids,
+};
+
static int __init bbr_register(void)
{
+ int ret;
+
BUILD_BUG_ON(sizeof(struct bbr) > ICSK_CA_PRIV_SIZE);
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_bbr_kfunc_set);
+ if (ret < 0)
+ return ret;
return tcp_register_congestion_control(&tcp_bbr_cong_ops);
}
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index fc3614377413..58358bf92e1b 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Binary Increase Congestion control for TCP
* Home page:
@@ -144,12 +145,13 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tcp_in_slow_start(tp))
- tcp_slow_start(tp, acked);
- else {
- bictcp_update(ca, tp->snd_cwnd);
- tcp_cong_avoid_ai(tp, ca->cnt, 1);
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
}
+ bictcp_update(ca, tcp_snd_cwnd(tp));
+ tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
/*
@@ -164,16 +166,16 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
ca->epoch_start = 0; /* end of epoch */
/* Wmax and fast convergence */
- if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
- ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+ if (tcp_snd_cwnd(tp) < ca->last_max_cwnd && fast_convergence)
+ ca->last_max_cwnd = (tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta))
/ (2 * BICTCP_BETA_SCALE);
else
- ca->last_max_cwnd = tp->snd_cwnd;
+ ca->last_max_cwnd = tcp_snd_cwnd(tp);
- if (tp->snd_cwnd <= low_window)
- return max(tp->snd_cwnd >> 1U, 2U);
+ if (tcp_snd_cwnd(tp) <= low_window)
+ return max(tcp_snd_cwnd(tp) >> 1U, 2U);
else
- return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+ return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U);
}
static void bictcp_state(struct sock *sk, u8 new_state)
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
new file mode 100644
index 000000000000..a268e1595b22
--- /dev/null
+++ b/net/ipv4/tcp_bpf.c
@@ -0,0 +1,739 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2017 - 2018 Covalent IO, Inc. http://covalent.io */
+
+#include <linux/skmsg.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <linux/init.h>
+#include <linux/wait.h>
+#include <linux/util_macros.h>
+
+#include <net/inet_common.h>
+#include <net/tls.h>
+
+void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tcp;
+ int copied;
+
+ if (!skb || !skb->len || !sk_is_tcp(sk))
+ return;
+
+ if (skb_bpf_strparser(skb))
+ return;
+
+ tcp = tcp_sk(sk);
+ copied = tcp->copied_seq + skb->len;
+ WRITE_ONCE(tcp->copied_seq, copied);
+ tcp_rcv_space_adjust(sk);
+ __tcp_cleanup_rbuf(sk, skb->len);
+}
+
+static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
+ struct sk_msg *msg, u32 apply_bytes)
+{
+ bool apply = apply_bytes;
+ struct scatterlist *sge;
+ u32 size, copied = 0;
+ struct sk_msg *tmp;
+ int i, ret = 0;
+
+ tmp = kzalloc(sizeof(*tmp), __GFP_NOWARN | GFP_KERNEL);
+ if (unlikely(!tmp))
+ return -ENOMEM;
+
+ lock_sock(sk);
+ tmp->sg.start = msg->sg.start;
+ i = msg->sg.start;
+ do {
+ sge = sk_msg_elem(msg, i);
+ size = (apply && apply_bytes < sge->length) ?
+ apply_bytes : sge->length;
+ if (!__sk_rmem_schedule(sk, size, false)) {
+ if (!copied)
+ ret = -ENOMEM;
+ break;
+ }
+
+ sk_mem_charge(sk, size);
+ atomic_add(size, &sk->sk_rmem_alloc);
+ sk_msg_xfer(tmp, msg, i, size);
+ copied += size;
+ if (sge->length)
+ get_page(sk_msg_page(tmp, i));
+ sk_msg_iter_var_next(i);
+ tmp->sg.end = i;
+ if (apply) {
+ apply_bytes -= size;
+ if (!apply_bytes) {
+ if (sge->length)
+ sk_msg_iter_var_prev(i);
+ break;
+ }
+ }
+ } while (i != msg->sg.end);
+
+ if (!ret) {
+ msg->sg.start = i;
+ if (!sk_psock_queue_msg(psock, tmp))
+ atomic_sub(copied, &sk->sk_rmem_alloc);
+ sk_psock_data_ready(sk, psock);
+ } else {
+ sk_msg_free(sk, tmp);
+ kfree(tmp);
+ }
+
+ release_sock(sk);
+ return ret;
+}
+
+static int tcp_bpf_push(struct sock *sk, struct sk_msg *msg, u32 apply_bytes,
+ int flags, bool uncharge)
+{
+ struct msghdr msghdr = {};
+ bool apply = apply_bytes;
+ struct scatterlist *sge;
+ struct page *page;
+ int size, ret = 0;
+ u32 off;
+
+ while (1) {
+ struct bio_vec bvec;
+ bool has_tx_ulp;
+
+ sge = sk_msg_elem(msg, msg->sg.start);
+ size = (apply && apply_bytes < sge->length) ?
+ apply_bytes : sge->length;
+ off = sge->offset;
+ page = sg_page(sge);
+
+ tcp_rate_check_app_limited(sk);
+retry:
+ msghdr.msg_flags = flags | MSG_SPLICE_PAGES;
+ has_tx_ulp = tls_sw_has_ctx_tx(sk);
+ if (has_tx_ulp)
+ msghdr.msg_flags |= MSG_SENDPAGE_NOPOLICY;
+
+ if (size < sge->length && msg->sg.start != msg->sg.end)
+ msghdr.msg_flags |= MSG_MORE;
+
+ bvec_set_page(&bvec, page, size, off);
+ iov_iter_bvec(&msghdr.msg_iter, ITER_SOURCE, &bvec, 1, size);
+ ret = tcp_sendmsg_locked(sk, &msghdr, size);
+ if (ret <= 0)
+ return ret;
+
+ if (apply)
+ apply_bytes -= ret;
+ msg->sg.size -= ret;
+ sge->offset += ret;
+ sge->length -= ret;
+ if (uncharge)
+ sk_mem_uncharge(sk, ret);
+ if (ret != size) {
+ size -= ret;
+ off += ret;
+ goto retry;
+ }
+ if (!sge->length) {
+ put_page(page);
+ sk_msg_iter_next(msg, start);
+ sg_init_table(sge, 1);
+ if (msg->sg.start == msg->sg.end)
+ break;
+ }
+ if (apply && !apply_bytes)
+ break;
+ }
+
+ return 0;
+}
+
+static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg,
+ u32 apply_bytes, int flags, bool uncharge)
+{
+ int ret;
+
+ lock_sock(sk);
+ ret = tcp_bpf_push(sk, msg, apply_bytes, flags, uncharge);
+ release_sock(sk);
+ return ret;
+}
+
+int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress,
+ struct sk_msg *msg, u32 bytes, int flags)
+{
+ struct sk_psock *psock = sk_psock_get(sk);
+ int ret;
+
+ if (unlikely(!psock))
+ return -EPIPE;
+
+ ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes) :
+ tcp_bpf_push_locked(sk, msg, bytes, flags, false);
+ sk_psock_put(sk, psock);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_bpf_sendmsg_redir);
+
+#ifdef CONFIG_BPF_SYSCALL
+static int tcp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
+ long timeo)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ int ret = 0;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ return 1;
+
+ if (!timeo)
+ return ret;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ ret = sk_wait_event(sk, &timeo,
+ !list_empty(&psock->ingress_msg) ||
+ !skb_queue_empty_lockless(&sk->sk_receive_queue), &wait);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return ret;
+}
+
+static bool is_next_msg_fin(struct sk_psock *psock)
+{
+ struct scatterlist *sge;
+ struct sk_msg *msg_rx;
+ int i;
+
+ msg_rx = sk_psock_peek_msg(psock);
+ i = msg_rx->sg.start;
+ sge = sk_msg_elem(msg_rx, i);
+ if (!sge->length) {
+ struct sk_buff *skb = msg_rx->skb;
+
+ if (skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ return true;
+ }
+ return false;
+}
+
+static int tcp_bpf_recvmsg_parser(struct sock *sk,
+ struct msghdr *msg,
+ size_t len,
+ int flags,
+ int *addr_len)
+{
+ int peek = flags & MSG_PEEK;
+ struct sk_psock *psock;
+ struct tcp_sock *tcp;
+ int copied = 0;
+ u32 seq;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ if (!len)
+ return 0;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_recvmsg(sk, msg, len, flags, addr_len);
+
+ lock_sock(sk);
+ tcp = tcp_sk(sk);
+ seq = tcp->copied_seq;
+ /* We may have received data on the sk_receive_queue pre-accept and
+ * then we can not use read_skb in this context because we haven't
+ * assigned a sk_socket yet so have no link to the ops. The work-around
+ * is to check the sk_receive_queue and in these cases read skbs off
+ * queue again. The read_skb hook is not running at this point because
+ * of lock_sock so we avoid having multiple runners in read_skb.
+ */
+ if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) {
+ tcp_data_ready(sk);
+ /* This handles the ENOMEM errors if we both receive data
+ * pre accept and are already under memory pressure. At least
+ * let user know to retry.
+ */
+ if (unlikely(!skb_queue_empty(&sk->sk_receive_queue))) {
+ copied = -EAGAIN;
+ goto out;
+ }
+ }
+
+msg_bytes_ready:
+ copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+ /* The typical cause of -EFAULT here is that the socket was
+ * gracefully shut down with a FIN packet, so check for that. The
+ * other cause, an error in copy_page_to_iter, would be unexpected.
+ * On FIN, correct the return code to zero.
+ */
+ if (copied == -EFAULT) {
+ bool is_fin = is_next_msg_fin(psock);
+
+ if (is_fin) {
+ copied = 0;
+ seq++;
+ goto out;
+ }
+ }
+ seq += copied;
+ if (!copied) {
+ long timeo;
+ int data;
+
+ if (sock_flag(sk, SOCK_DONE))
+ goto out;
+
+ if (sk->sk_err) {
+ copied = sock_error(sk);
+ goto out;
+ }
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ goto out;
+
+ if (sk->sk_state == TCP_CLOSE) {
+ copied = -ENOTCONN;
+ goto out;
+ }
+
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ if (!timeo) {
+ copied = -EAGAIN;
+ goto out;
+ }
+
+ if (signal_pending(current)) {
+ copied = sock_intr_errno(timeo);
+ goto out;
+ }
+
+ data = tcp_msg_wait_data(sk, psock, timeo);
+ if (data < 0) {
+ copied = data;
+ goto unlock;
+ }
+ if (data && !sk_psock_queue_empty(psock))
+ goto msg_bytes_ready;
+ copied = -EAGAIN;
+ }
+out:
+ if (!peek)
+ WRITE_ONCE(tcp->copied_seq, seq);
+ tcp_rcv_space_adjust(sk);
+ if (copied > 0)
+ __tcp_cleanup_rbuf(sk, copied);
+
+unlock:
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return copied;
+}
+
+static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, int *addr_len)
+{
+ struct sk_psock *psock;
+ int copied, ret;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ if (!len)
+ return 0;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_recvmsg(sk, msg, len, flags, addr_len);
+ if (!skb_queue_empty(&sk->sk_receive_queue) &&
+ sk_psock_queue_empty(psock)) {
+ sk_psock_put(sk, psock);
+ return tcp_recvmsg(sk, msg, len, flags, addr_len);
+ }
+ lock_sock(sk);
+msg_bytes_ready:
+ copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+ if (!copied) {
+ long timeo;
+ int data;
+
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ data = tcp_msg_wait_data(sk, psock, timeo);
+ if (data < 0) {
+ ret = data;
+ goto unlock;
+ }
+ if (data) {
+ if (!sk_psock_queue_empty(psock))
+ goto msg_bytes_ready;
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return tcp_recvmsg(sk, msg, len, flags, addr_len);
+ }
+ copied = -EAGAIN;
+ }
+ ret = copied;
+
+unlock:
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return ret;
+}
+
+static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
+ struct sk_msg *msg, int *copied, int flags)
+{
+ bool cork = false, enospc = sk_msg_full(msg), redir_ingress;
+ struct sock *sk_redir;
+ u32 tosend, origsize, sent, delta = 0;
+ u32 eval;
+ int ret;
+
+more_data:
+ if (psock->eval == __SK_NONE) {
+ /* Track the delta in msg size so that, on SK_DROP, it can be
+ * added to or subtracted from the copied size returned to the
+ * user. This ensures the user doesn't get a positive return
+ * code with msg_cut_data and an SK_DROP verdict.
+ */
+ delta = msg->sg.size;
+ psock->eval = sk_psock_msg_verdict(sk, psock, msg);
+ delta -= msg->sg.size;
+ }
+
+ if (msg->cork_bytes &&
+ msg->cork_bytes > msg->sg.size && !enospc) {
+ psock->cork_bytes = msg->cork_bytes - msg->sg.size;
+ if (!psock->cork) {
+ psock->cork = kzalloc(sizeof(*psock->cork),
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!psock->cork) {
+ sk_msg_free(sk, msg);
+ *copied = 0;
+ return -ENOMEM;
+ }
+ }
+ memcpy(psock->cork, msg, sizeof(*msg));
+ return 0;
+ }
+
+ tosend = msg->sg.size;
+ if (psock->apply_bytes && psock->apply_bytes < tosend)
+ tosend = psock->apply_bytes;
+ eval = __SK_NONE;
+
+ switch (psock->eval) {
+ case __SK_PASS:
+ ret = tcp_bpf_push(sk, msg, tosend, flags, true);
+ if (unlikely(ret)) {
+ *copied -= sk_msg_free(sk, msg);
+ break;
+ }
+ sk_msg_apply_bytes(psock, tosend);
+ break;
+ case __SK_REDIRECT:
+ redir_ingress = psock->redir_ingress;
+ sk_redir = psock->sk_redir;
+ sk_msg_apply_bytes(psock, tosend);
+ if (!psock->apply_bytes) {
+ /* Clean up before releasing the sock lock. */
+ eval = psock->eval;
+ psock->eval = __SK_NONE;
+ psock->sk_redir = NULL;
+ }
+ if (psock->cork) {
+ cork = true;
+ psock->cork = NULL;
+ }
+ release_sock(sk);
+
+ origsize = msg->sg.size;
+ ret = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress,
+ msg, tosend, flags);
+ sent = origsize - msg->sg.size;
+
+ if (eval == __SK_REDIRECT)
+ sock_put(sk_redir);
+
+ lock_sock(sk);
+ sk_mem_uncharge(sk, sent);
+ if (unlikely(ret < 0)) {
+ int free = sk_msg_free(sk, msg);
+
+ if (!cork)
+ *copied -= free;
+ }
+ if (cork) {
+ sk_msg_free(sk, msg);
+ kfree(msg);
+ msg = NULL;
+ ret = 0;
+ }
+ break;
+ case __SK_DROP:
+ default:
+ sk_msg_free(sk, msg);
+ sk_msg_apply_bytes(psock, tosend);
+ *copied -= (tosend + delta);
+ return -EACCES;
+ }
+
+ if (likely(!ret)) {
+ if (!psock->apply_bytes) {
+ psock->eval = __SK_NONE;
+ if (psock->sk_redir) {
+ sock_put(psock->sk_redir);
+ psock->sk_redir = NULL;
+ }
+ }
+ if (msg &&
+ msg->sg.data[msg->sg.start].page_link &&
+ msg->sg.data[msg->sg.start].length)
+ goto more_data;
+ }
+ return ret;
+}
+
+static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+ struct sk_msg tmp, *msg_tx = NULL;
+ int copied = 0, err = 0, ret = 0;
+ struct sk_psock *psock;
+ long timeo;
+ int flags;
+
+ /* Don't let internal flags through */
+ flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED);
+ flags |= MSG_NO_SHARED_FRAGS;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return tcp_sendmsg(sk, msg, size);
+
+ lock_sock(sk);
+ timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+ while (msg_data_left(msg)) {
+ bool enospc = false;
+ u32 copy, osize;
+
+ if (sk->sk_err) {
+ err = -sk->sk_err;
+ goto out_err;
+ }
+
+ copy = msg_data_left(msg);
+ if (!sk_stream_memory_free(sk))
+ goto wait_for_sndbuf;
+ if (psock->cork) {
+ msg_tx = psock->cork;
+ } else {
+ msg_tx = &tmp;
+ sk_msg_init(msg_tx);
+ }
+
+ osize = msg_tx->sg.size;
+ err = sk_msg_alloc(sk, msg_tx, msg_tx->sg.size + copy, msg_tx->sg.end - 1);
+ if (err) {
+ if (err != -ENOSPC)
+ goto wait_for_memory;
+ enospc = true;
+ copy = msg_tx->sg.size - osize;
+ }
+
+ ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx,
+ copy);
+ if (ret < 0) {
+ sk_msg_trim(sk, msg_tx, osize);
+ goto out_err;
+ }
+
+ copied += ret;
+ if (psock->cork_bytes) {
+ if (size > psock->cork_bytes)
+ psock->cork_bytes = 0;
+ else
+ psock->cork_bytes -= size;
+ if (psock->cork_bytes && !enospc)
+ goto out_err;
+ /* All cork bytes are accounted, rerun the prog. */
+ psock->eval = __SK_NONE;
+ psock->cork_bytes = 0;
+ }
+
+ err = tcp_bpf_send_verdict(sk, psock, msg_tx, &copied, flags);
+ if (unlikely(err < 0))
+ goto out_err;
+ continue;
+wait_for_sndbuf:
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err) {
+ if (msg_tx && msg_tx != psock->cork)
+ sk_msg_free(sk, msg_tx);
+ goto out_err;
+ }
+ }
+out_err:
+ if (err < 0)
+ err = sk_stream_error(sk, msg->msg_flags, err);
+ release_sock(sk);
+ sk_psock_put(sk, psock);
+ return copied > 0 ? copied : err;
+}
+
+enum {
+ TCP_BPF_IPV4,
+ TCP_BPF_IPV6,
+ TCP_BPF_NUM_PROTS,
+};
+
+enum {
+ TCP_BPF_BASE,
+ TCP_BPF_TX,
+ TCP_BPF_RX,
+ TCP_BPF_TXRX,
+ TCP_BPF_NUM_CFGS,
+};
+
+static struct proto *tcpv6_prot_saved __read_mostly;
+static DEFINE_SPINLOCK(tcpv6_prot_lock);
+static struct proto tcp_bpf_prots[TCP_BPF_NUM_PROTS][TCP_BPF_NUM_CFGS];
+
+static void tcp_bpf_rebuild_protos(struct proto prot[TCP_BPF_NUM_CFGS],
+ struct proto *base)
+{
+ prot[TCP_BPF_BASE] = *base;
+ prot[TCP_BPF_BASE].destroy = sock_map_destroy;
+ prot[TCP_BPF_BASE].close = sock_map_close;
+ prot[TCP_BPF_BASE].recvmsg = tcp_bpf_recvmsg;
+ prot[TCP_BPF_BASE].sock_is_readable = sk_msg_is_readable;
+
+ prot[TCP_BPF_TX] = prot[TCP_BPF_BASE];
+ prot[TCP_BPF_TX].sendmsg = tcp_bpf_sendmsg;
+
+ prot[TCP_BPF_RX] = prot[TCP_BPF_BASE];
+ prot[TCP_BPF_RX].recvmsg = tcp_bpf_recvmsg_parser;
+
+ prot[TCP_BPF_TXRX] = prot[TCP_BPF_TX];
+ prot[TCP_BPF_TXRX].recvmsg = tcp_bpf_recvmsg_parser;
+}
+
+static void tcp_bpf_check_v6_needs_rebuild(struct proto *ops)
+{
+ if (unlikely(ops != smp_load_acquire(&tcpv6_prot_saved))) {
+ spin_lock_bh(&tcpv6_prot_lock);
+ if (likely(ops != tcpv6_prot_saved)) {
+ tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV6], ops);
+ smp_store_release(&tcpv6_prot_saved, ops);
+ }
+ spin_unlock_bh(&tcpv6_prot_lock);
+ }
+}
+
+static int __init tcp_bpf_v4_build_proto(void)
+{
+ tcp_bpf_rebuild_protos(tcp_bpf_prots[TCP_BPF_IPV4], &tcp_prot);
+ return 0;
+}
+late_initcall(tcp_bpf_v4_build_proto);
+
+static int tcp_bpf_assert_proto_ops(struct proto *ops)
+{
+ /* In order to avoid retpoline, we make assumptions when we call
+ * into ops if e.g. a psock is not present. Make sure they are
+ * indeed valid assumptions.
+ */
+ return ops->recvmsg == tcp_recvmsg &&
+ ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP;
+}
+
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ struct sock *sk = strp->sk;
+ struct sk_psock *psock;
+ struct tcp_sock *tp;
+ int copied = 0;
+
+ tp = tcp_sk(sk);
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (WARN_ON_ONCE(!psock)) {
+ desc->error = -EINVAL;
+ goto out;
+ }
+
+ psock->ingress_bytes = 0;
+ copied = tcp_read_sock_noack(sk, desc, recv_actor, true,
+ &psock->copied_seq);
+ if (copied < 0)
+ goto out;
+ /* recv_actor may redirect skb to another socket (SK_REDIRECT) or
+ * just put skb into ingress queue of current socket (SK_PASS).
+ * For SK_REDIRECT, we need to ack the frame immediately but for
+ * SK_PASS, we want to delay the ack until tcp_bpf_recvmsg_parser().
+ */
+ tp->copied_seq = psock->copied_seq - psock->ingress_bytes;
+ tcp_rcv_space_adjust(sk);
+ __tcp_cleanup_rbuf(sk, copied - psock->ingress_bytes);
+out:
+ rcu_read_unlock();
+ return copied;
+}
+#endif /* CONFIG_BPF_STREAM_PARSER */
+
+int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+{
+ int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
+ int config = psock->progs.msg_parser ? TCP_BPF_TX : TCP_BPF_BASE;
+
+ if (psock->progs.stream_verdict || psock->progs.skb_verdict) {
+ config = (config == TCP_BPF_TX) ? TCP_BPF_TXRX : TCP_BPF_RX;
+ }
+
+ if (restore) {
+ if (inet_csk_has_ulp(sk)) {
+ /* TLS does not have an unhash proto in SW cases,
+ * but we need to ensure we stop using the sock_map
+ * unhash routine because the associated psock is being
+ * removed. So use the original unhash handler.
+ */
+ WRITE_ONCE(sk->sk_prot->unhash, psock->saved_unhash);
+ tcp_update_ulp(sk, psock->sk_proto, psock->saved_write_space);
+ } else {
+ sk->sk_write_space = psock->saved_write_space;
+ /* Pairs with lockless read in sk_clone_lock() */
+ sock_replace_proto(sk, psock->sk_proto);
+ }
+ return 0;
+ }
+
+ if (sk->sk_family == AF_INET6) {
+ if (tcp_bpf_assert_proto_ops(psock->sk_proto))
+ return -EINVAL;
+
+ tcp_bpf_check_v6_needs_rebuild(psock->sk_proto);
+ }
+
+ /* Pairs with lockless read in sk_clone_lock() */
+ sock_replace_proto(sk, &tcp_bpf_prots[family][config]);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tcp_bpf_update_proto);
+
+/* If a child got cloned from a listening socket that had tcp_bpf
+ * protocol callbacks installed, we need to restore the callbacks to
+ * the default ones because the child does not inherit the psock state
+ * that tcp_bpf callbacks expect.
+ */
+void tcp_bpf_clone(const struct sock *sk, struct sock *newsk)
+{
+ struct proto *prot = newsk->sk_prot;
+
+ if (is_insidevar(prot, tcp_bpf_prots))
+ newsk->sk_prot = sk->sk_prot_creator;
+}
+#endif /* CONFIG_BPF_SYSCALL */
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index 06fbe102a425..fbad6c35dee9 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* CAIA Delay-Gradient (CDG) congestion control
*
@@ -146,7 +147,7 @@ static void tcp_cdg_hystart_update(struct sock *sk)
return;
if (hystart_detect & HYSTART_ACK_TRAIN) {
- u32 now_us = div_u64(local_clock(), NSEC_PER_USEC);
+ u32 now_us = tp->tcp_mstamp;
if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) {
ca->last_ack = now_us;
@@ -160,8 +161,8 @@ static void tcp_cdg_hystart_update(struct sock *sk)
LINUX_MIB_TCPHYSTARTTRAINDETECT);
NET_ADD_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTTRAINCWND,
- tp->snd_cwnd);
- tp->snd_ssthresh = tp->snd_cwnd;
+ tcp_snd_cwnd(tp));
+ tp->snd_ssthresh = tcp_snd_cwnd(tp);
return;
}
}
@@ -179,8 +180,8 @@ static void tcp_cdg_hystart_update(struct sock *sk)
LINUX_MIB_TCPHYSTARTDELAYDETECT);
NET_ADD_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTDELAYCWND,
- tp->snd_cwnd);
- tp->snd_ssthresh = tp->snd_cwnd;
+ tcp_snd_cwnd(tp));
+ tp->snd_ssthresh = tcp_snd_cwnd(tp);
}
}
}
@@ -242,7 +243,7 @@ static bool tcp_cdg_backoff(struct sock *sk, u32 grad)
struct cdg *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- if (prandom_u32() <= nexp_u32(grad * backoff_factor))
+ if (get_random_u32() <= nexp_u32(grad * backoff_factor))
return false;
if (use_ineff) {
@@ -251,7 +252,7 @@ static bool tcp_cdg_backoff(struct sock *sk, u32 grad)
return false;
}
- ca->shadow_wnd = max(ca->shadow_wnd, tp->snd_cwnd);
+ ca->shadow_wnd = max(ca->shadow_wnd, tcp_snd_cwnd(tp));
ca->state = CDG_BACKOFF;
tcp_enter_cwr(sk);
return true;
@@ -284,14 +285,14 @@ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked)
}
if (!tcp_is_cwnd_limited(sk)) {
- ca->shadow_wnd = min(ca->shadow_wnd, tp->snd_cwnd);
+ ca->shadow_wnd = min(ca->shadow_wnd, tcp_snd_cwnd(tp));
return;
}
- prior_snd_cwnd = tp->snd_cwnd;
+ prior_snd_cwnd = tcp_snd_cwnd(tp);
tcp_reno_cong_avoid(sk, ack, acked);
- incr = tp->snd_cwnd - prior_snd_cwnd;
+ incr = tcp_snd_cwnd(tp) - prior_snd_cwnd;
ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr);
}
@@ -330,15 +331,15 @@ static u32 tcp_cdg_ssthresh(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
if (ca->state == CDG_BACKOFF)
- return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10);
+ return max(2U, (tcp_snd_cwnd(tp) * min(1024U, backoff_beta)) >> 10);
if (ca->state == CDG_NONFULL && use_tolerance)
- return tp->snd_cwnd;
+ return tcp_snd_cwnd(tp);
- ca->shadow_wnd = min(ca->shadow_wnd >> 1, tp->snd_cwnd);
+ ca->shadow_wnd = min(ca->shadow_wnd >> 1, tcp_snd_cwnd(tp));
if (use_shadow)
- return max3(2U, ca->shadow_wnd, tp->snd_cwnd >> 1);
- return max(2U, tp->snd_cwnd >> 1);
+ return max3(2U, ca->shadow_wnd, tcp_snd_cwnd(tp) >> 1);
+ return max(2U, tcp_snd_cwnd(tp) >> 1);
}
static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
@@ -356,7 +357,7 @@ static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev)
ca->gradients = gradients;
ca->rtt_seq = tp->snd_nxt;
- ca->shadow_wnd = tp->snd_cwnd;
+ ca->shadow_wnd = tcp_snd_cwnd(tp);
break;
case CA_EVENT_COMPLETE_CWR:
ca->state = CDG_UNKNOWN;
@@ -374,12 +375,13 @@ static void tcp_cdg_init(struct sock *sk)
struct cdg *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ ca->gradients = NULL;
/* We silently fall back to window = 1 if allocation fails. */
if (window > 1)
ca->gradients = kcalloc(window, sizeof(ca->gradients[0]),
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
ca->rtt_seq = tp->snd_nxt;
- ca->shadow_wnd = tp->snd_cwnd;
+ ca->shadow_wnd = tcp_snd_cwnd(tp);
}
static void tcp_cdg_release(struct sock *sk)
@@ -387,6 +389,7 @@ static void tcp_cdg_release(struct sock *sk)
struct cdg *ca = inet_csk_ca(sk);
kfree(ca->gradients);
+ ca->gradients = NULL;
}
static struct tcp_congestion_ops tcp_cdg __read_mostly = {
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index bc6c02f16243..df758adbb445 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Pluggable TCP congestion control support and newReno
* congestion control.
@@ -15,12 +16,13 @@
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <net/tcp.h>
+#include <trace/events/tcp.h>
static DEFINE_SPINLOCK(tcp_cong_list_lock);
static LIST_HEAD(tcp_cong_list);
/* Simple linear search, don't expect many entries! */
-static struct tcp_congestion_ops *tcp_ca_find(const char *name)
+struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
struct tcp_congestion_ops *e;
@@ -32,9 +34,19 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
return NULL;
}
+void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ trace_tcp_cong_state_set(sk, ca_state);
+
+ if (icsk->icsk_ca_ops->set_state)
+ icsk->icsk_ca_ops->set_state(sk, ca_state);
+ icsk->icsk_ca_state = ca_state;
+}
+
/* Must be called with rcu lock held */
-static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net,
- const char *name)
+static struct tcp_congestion_ops *tcp_ca_find_autoload(const char *name)
{
struct tcp_congestion_ops *ca = tcp_ca_find(name);
@@ -62,14 +74,8 @@ struct tcp_congestion_ops *tcp_ca_find_key(u32 key)
return NULL;
}
-/*
- * Attach new congestion control algorithm to the list
- * of available options.
- */
-int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+int tcp_validate_congestion_control(struct tcp_congestion_ops *ca)
{
- int ret = 0;
-
/* all algorithms must implement these */
if (!ca->ssthresh || !ca->undo_cwnd ||
!(ca->cong_avoid || ca->cong_control)) {
@@ -77,6 +83,20 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
return -EINVAL;
}
+ return 0;
+}
+
+/* Attach new congestion control algorithm to the list
+ * of available options.
+ */
+int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
+{
+ int ret;
+
+ ret = tcp_validate_congestion_control(ca);
+ if (ret)
+ return ret;
+
ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
spin_lock(&tcp_cong_list_lock);
@@ -117,7 +137,47 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
}
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
-u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
+/* Replace a registered old ca with a new one.
+ *
+ * The new ca must have the same name as the old one, that has been
+ * registered.
+ */
+int tcp_update_congestion_control(struct tcp_congestion_ops *ca, struct tcp_congestion_ops *old_ca)
+{
+ struct tcp_congestion_ops *existing;
+ int ret = 0;
+
+ ca->key = jhash(ca->name, sizeof(ca->name), strlen(ca->name));
+
+ spin_lock(&tcp_cong_list_lock);
+ existing = tcp_ca_find_key(old_ca->key);
+ if (ca->key == TCP_CA_UNSPEC || !existing || strcmp(existing->name, ca->name)) {
+ pr_notice("%s not registered or non-unique key\n",
+ ca->name);
+ ret = -EINVAL;
+ } else if (existing != old_ca) {
+ pr_notice("invalid old congestion control algorithm to replace\n");
+ ret = -EINVAL;
+ } else {
+ /* Add the new one before removing the old one to keep
+ * one implementation available all the time.
+ */
+ list_add_tail_rcu(&ca->list, &tcp_cong_list);
+ list_del_rcu(&existing->list);
+ pr_debug("%s updated\n", ca->name);
+ }
+ spin_unlock(&tcp_cong_list_lock);
+
+ /* Wait for outstanding readers to complete before the
+ * module or struct_ops gets removed entirely.
+ */
+ if (!ret)
+ synchronize_rcu();
+
+ return ret;
+}
+
+u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
{
const struct tcp_congestion_ops *ca;
u32 key = TCP_CA_UNSPEC;
@@ -125,7 +185,7 @@ u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
might_sleep();
rcu_read_lock();
- ca = tcp_ca_find_autoload(net, name);
+ ca = tcp_ca_find_autoload(name);
if (ca) {
key = ca->key;
*ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
@@ -134,7 +194,6 @@ u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
return key;
}
-EXPORT_SYMBOL_GPL(tcp_ca_get_key_by_name);
char *tcp_ca_get_name_by_key(u32 key, char *buffer)
{
@@ -143,14 +202,14 @@ char *tcp_ca_get_name_by_key(u32 key, char *buffer)
rcu_read_lock();
ca = tcp_ca_find_key(key);
- if (ca)
- ret = strncpy(buffer, ca->name,
- TCP_CA_NAME_MAX);
+ if (ca) {
+ strscpy(buffer, ca->name, TCP_CA_NAME_MAX);
+ ret = buffer;
+ }
rcu_read_unlock();
return ret;
}
-EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
/* Assign choice of congestion control. */
void tcp_assign_congestion_control(struct sock *sk)
@@ -161,7 +220,7 @@ void tcp_assign_congestion_control(struct sock *sk)
rcu_read_lock();
ca = rcu_dereference(net->ipv4.tcp_congestion_control);
- if (unlikely(!try_module_get(ca->owner)))
+ if (unlikely(!bpf_try_module_get(ca, ca->owner)))
ca = &tcp_reno;
icsk->icsk_ca_ops = ca;
rcu_read_unlock();
@@ -175,7 +234,7 @@ void tcp_assign_congestion_control(struct sock *sk)
void tcp_init_congestion_control(struct sock *sk)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
tcp_sk(sk)->prior_ssthresh = 0;
if (icsk->icsk_ca_ops->init)
@@ -184,6 +243,7 @@ void tcp_init_congestion_control(struct sock *sk)
INET_ECN_xmit(sk);
else
INET_ECN_dontxmit(sk);
+ icsk->icsk_ca_initialized = 1;
}
static void tcp_reinit_congestion_control(struct sock *sk,
@@ -196,7 +256,12 @@ static void tcp_reinit_congestion_control(struct sock *sk,
icsk->icsk_ca_setsockopt = 1;
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
- if (sk->sk_state != TCP_CLOSE)
+ if (ca->flags & TCP_CONG_NEEDS_ECN)
+ INET_ECN_xmit(sk);
+ else
+ INET_ECN_dontxmit(sk);
+
+ if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
tcp_init_congestion_control(sk);
}
@@ -205,9 +270,10 @@ void tcp_cleanup_congestion_control(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- if (icsk->icsk_ca_ops->release)
+ if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
- module_put(icsk->icsk_ca_ops->owner);
+ icsk->icsk_ca_initialized = 0;
+ bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
}
/* Used by sysctl to change default congestion control */
@@ -218,15 +284,19 @@ int tcp_set_default_congestion_control(struct net *net, const char *name)
int ret;
rcu_read_lock();
- ca = tcp_ca_find_autoload(net, name);
+ ca = tcp_ca_find_autoload(name);
if (!ca) {
ret = -ENOENT;
- } else if (!try_module_get(ca->owner)) {
+ } else if (!bpf_try_module_get(ca, ca->owner)) {
ret = -EBUSY;
+ } else if (!net_eq(net, &init_net) &&
+ !(ca->flags & TCP_CONG_NON_RESTRICTED)) {
+ /* Only init netns can set default to a restricted algorithm */
+ ret = -EPERM;
} else {
prev = xchg(&net->ipv4.tcp_congestion_control, ca);
if (prev)
- module_put(prev->owner);
+ bpf_module_put(prev, prev->owner);
ca->flags |= TCP_CONG_NON_RESTRICTED;
ret = 0;
@@ -255,6 +325,9 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen)
offs += snprintf(buf + offs, maxlen - offs,
"%s%s",
offs == 0 ? "" : " ", ca->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
}
rcu_read_unlock();
}
@@ -266,7 +339,7 @@ void tcp_get_default_congestion_control(struct net *net, char *name)
rcu_read_lock();
ca = rcu_dereference(net->ipv4.tcp_congestion_control);
- strncpy(name, ca->name, TCP_CA_NAME_MAX);
+ strscpy(name, ca->name, TCP_CA_NAME_MAX);
rcu_read_unlock();
}
@@ -284,6 +357,9 @@ void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
offs += snprintf(buf + offs, maxlen - offs,
"%s%s",
offs == 0 ? "" : " ", ca->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
}
rcu_read_unlock();
}
@@ -332,7 +408,8 @@ out:
* tcp_reinit_congestion_control (if the current congestion control was
* already initialized.
*/
-int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit)
+int tcp_set_congestion_control(struct sock *sk, const char *name, bool load,
+ bool cap_net_admin)
{
struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_congestion_ops *ca;
@@ -345,7 +422,7 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo
if (!load)
ca = tcp_ca_find(name);
else
- ca = tcp_ca_find_autoload(sock_net(sk), name);
+ ca = tcp_ca_find_autoload(name);
/* No change asking for existing value */
if (ca == icsk->icsk_ca_ops) {
@@ -353,29 +430,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo
goto out;
}
- if (!ca) {
+ if (!ca)
err = -ENOENT;
- } else if (!load) {
- const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops;
-
- if (try_module_get(ca->owner)) {
- if (reinit) {
- tcp_reinit_congestion_control(sk, ca);
- } else {
- icsk->icsk_ca_ops = ca;
- module_put(old_ca->owner);
- }
- } else {
- err = -EBUSY;
- }
- } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
- ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) {
+ else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin))
err = -EPERM;
- } else if (!try_module_get(ca->owner)) {
+ else if (!bpf_try_module_get(ca, ca->owner))
err = -EBUSY;
- } else {
+ else
tcp_reinit_congestion_control(sk, ca);
- }
out:
rcu_read_unlock();
return err;
@@ -390,12 +452,12 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo
* ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and
* returns the leftover acks to adjust cwnd in congestion avoidance mode.
*/
-u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
+__bpf_kfunc u32 tcp_slow_start(struct tcp_sock *tp, u32 acked)
{
- u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh);
+ u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh);
- acked -= cwnd - tp->snd_cwnd;
- tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp);
+ acked -= cwnd - tcp_snd_cwnd(tp);
+ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp));
return acked;
}
@@ -404,12 +466,12 @@ EXPORT_SYMBOL_GPL(tcp_slow_start);
/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w),
* for every packet that was ACKed.
*/
-void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
+__bpf_kfunc void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
{
/* If credits accumulated at a higher w, apply them gently now. */
if (tp->snd_cwnd_cnt >= w) {
tp->snd_cwnd_cnt = 0;
- tp->snd_cwnd++;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
}
tp->snd_cwnd_cnt += acked;
@@ -417,9 +479,9 @@ void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked)
u32 delta = tp->snd_cwnd_cnt / w;
tp->snd_cwnd_cnt -= delta * w;
- tp->snd_cwnd += delta;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta);
}
- tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp);
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp));
}
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
@@ -430,7 +492,7 @@ EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
/* This is Jacobson's slow start and congestion avoidance.
* SIGCOMM '88, p. 328.
*/
-void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+__bpf_kfunc void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -444,24 +506,24 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
return;
}
/* In dangerous area, increase slowly. */
- tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
+ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
}
EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
/* Slow start threshold is half the congestion window (min 2) */
-u32 tcp_reno_ssthresh(struct sock *sk)
+__bpf_kfunc u32 tcp_reno_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- return max(tp->snd_cwnd >> 1U, 2U);
+ return max(tcp_snd_cwnd(tp) >> 1U, 2U);
}
EXPORT_SYMBOL_GPL(tcp_reno_ssthresh);
-u32 tcp_reno_undo_cwnd(struct sock *sk)
+__bpf_kfunc u32 tcp_reno_undo_cwnd(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- return max(tp->snd_cwnd, tp->prior_cwnd);
+ return max(tcp_snd_cwnd(tp), tp->prior_cwnd);
}
EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd);
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index 78bfadfcf342..76c23675ae50 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* TCP CUBIC: Binary Increase Congestion control for TCP v2.3
* Home page:
@@ -24,6 +25,8 @@
*/
#include <linux/mm.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
#include <linux/module.h>
#include <linux/math64.h>
#include <net/tcp.h>
@@ -39,8 +42,8 @@
/* Number of delay samples for detecting the increase of delay */
#define HYSTART_MIN_SAMPLES 8
-#define HYSTART_DELAY_MIN (4U<<3)
-#define HYSTART_DELAY_MAX (16U<<3)
+#define HYSTART_DELAY_MIN (4000U) /* 4 ms */
+#define HYSTART_DELAY_MAX (16000U) /* 16 ms */
#define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX)
static int fast_convergence __read_mostly = 1;
@@ -52,7 +55,7 @@ static int tcp_friendliness __read_mostly = 1;
static int hystart __read_mostly = 1;
static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY;
static int hystart_low_window __read_mostly = 16;
-static int hystart_ack_delta __read_mostly = 2;
+static int hystart_ack_delta_us __read_mostly = 2000;
static u32 cube_rtt_scale __read_mostly;
static u32 beta_scale __read_mostly;
@@ -76,8 +79,8 @@ MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms"
" 1: packet-train 2: delay 3: both packet-train and delay");
module_param(hystart_low_window, int, 0644);
MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start");
-module_param(hystart_ack_delta, int, 0644);
-MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)");
+module_param(hystart_ack_delta_us, int, 0644);
+MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)");
/* BIC TCP Parameters */
struct bictcp {
@@ -88,7 +91,7 @@ struct bictcp {
u32 bic_origin_point;/* origin point of bic function */
u32 bic_K; /* time to origin point
from the beginning of the current epoch */
- u32 delay_min; /* min delay (msec << 3) */
+ u32 delay_min; /* min delay (usec) */
u32 epoch_start; /* beginning of an epoch */
u32 ack_cnt; /* number of acks */
u32 tcp_cwnd; /* estimated tcp cwnd */
@@ -103,26 +106,13 @@ struct bictcp {
static inline void bictcp_reset(struct bictcp *ca)
{
- ca->cnt = 0;
- ca->last_max_cwnd = 0;
- ca->last_cwnd = 0;
- ca->last_time = 0;
- ca->bic_origin_point = 0;
- ca->bic_K = 0;
- ca->delay_min = 0;
- ca->epoch_start = 0;
- ca->ack_cnt = 0;
- ca->tcp_cwnd = 0;
+ memset(ca, 0, offsetof(struct bictcp, unused));
ca->found = 0;
}
-static inline u32 bictcp_clock(void)
+static inline u32 bictcp_clock_us(const struct sock *sk)
{
-#if HZ < 1000
- return ktime_to_ms(ktime_get_real());
-#else
- return jiffies_to_msecs(jiffies);
-#endif
+ return tcp_sk(sk)->tcp_mstamp;
}
static inline void bictcp_hystart_reset(struct sock *sk)
@@ -130,13 +120,13 @@ static inline void bictcp_hystart_reset(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
- ca->round_start = ca->last_ack = bictcp_clock();
+ ca->round_start = ca->last_ack = bictcp_clock_us(sk);
ca->end_seq = tp->snd_nxt;
- ca->curr_rtt = 0;
+ ca->curr_rtt = ~0U;
ca->sample_cnt = 0;
}
-static void bictcp_init(struct sock *sk)
+__bpf_kfunc static void cubictcp_init(struct sock *sk)
{
struct bictcp *ca = inet_csk_ca(sk);
@@ -149,7 +139,7 @@ static void bictcp_init(struct sock *sk)
tcp_sk(sk)->snd_ssthresh = initial_ssthresh;
}
-static void bictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
+__bpf_kfunc static void cubictcp_cwnd_event(struct sock *sk, enum tcp_ca_event event)
{
if (event == CA_EVENT_TX_START) {
struct bictcp *ca = inet_csk_ca(sk);
@@ -275,7 +265,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked)
*/
t = (s32)(tcp_jiffies32 - ca->epoch_start);
- t += msecs_to_jiffies(ca->delay_min >> 3);
+ t += usecs_to_jiffies(ca->delay_min);
/* change the unit from HZ to bictcp_HZ */
t <<= BICTCP_HZ;
do_div(t, HZ);
@@ -331,7 +321,7 @@ tcp_friendliness:
ca->cnt = max(ca->cnt, 2U);
}
-static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
+__bpf_kfunc static void cubictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -340,17 +330,15 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
return;
if (tcp_in_slow_start(tp)) {
- if (hystart && after(ack, ca->end_seq))
- bictcp_hystart_reset(sk);
acked = tcp_slow_start(tp, acked);
if (!acked)
return;
}
- bictcp_update(ca, tp->snd_cwnd, acked);
+ bictcp_update(ca, tcp_snd_cwnd(tp), acked);
tcp_cong_avoid_ai(tp, ca->cnt, acked);
}
-static u32 bictcp_recalc_ssthresh(struct sock *sk)
+__bpf_kfunc static u32 cubictcp_recalc_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -358,16 +346,16 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk)
ca->epoch_start = 0; /* end of epoch */
/* Wmax and fast convergence */
- if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence)
- ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta))
+ if (tcp_snd_cwnd(tp) < ca->last_max_cwnd && fast_convergence)
+ ca->last_max_cwnd = (tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta))
/ (2 * BICTCP_BETA_SCALE);
else
- ca->last_max_cwnd = tp->snd_cwnd;
+ ca->last_max_cwnd = tcp_snd_cwnd(tp);
- return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U);
+ return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U);
}
-static void bictcp_state(struct sock *sk, u8 new_state)
+__bpf_kfunc static void cubictcp_state(struct sock *sk, u8 new_state)
{
if (new_state == TCP_CA_Loss) {
bictcp_reset(inet_csk_ca(sk));
@@ -375,58 +363,93 @@ static void bictcp_state(struct sock *sk, u8 new_state)
}
}
+/* Account for TSO/GRO delays.
+ * Otherwise short RTT flows could get too small ssthresh, since during
+ * slow start we begin with small TSO packets and ca->delay_min would
+ * not account for long aggregation delay when TSO packets get bigger.
+ * Ideally even with a very small RTT we would like to have at least one
+ * TSO packet being sent and received by GRO, and another one in qdisc layer.
+ * We apply another 100% factor because @rate is doubled at this point.
+ * We cap the cushion to 1ms.
+ */
+static u32 hystart_ack_delay(const struct sock *sk)
+{
+ unsigned long rate;
+
+ rate = READ_ONCE(sk->sk_pacing_rate);
+ if (!rate)
+ return 0;
+ return min_t(u64, USEC_PER_MSEC,
+ div64_ul((u64)sk->sk_gso_max_size * 4 * USEC_PER_SEC, rate));
+}
+
static void hystart_update(struct sock *sk, u32 delay)
{
struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
+ u32 threshold;
- if (ca->found & hystart_detect)
+ if (after(tp->snd_una, ca->end_seq))
+ bictcp_hystart_reset(sk);
+
+ /* hystart triggers when cwnd is larger than some threshold */
+ if (tcp_snd_cwnd(tp) < hystart_low_window)
return;
if (hystart_detect & HYSTART_ACK_TRAIN) {
- u32 now = bictcp_clock();
+ u32 now = bictcp_clock_us(sk);
/* first detection parameter - ack-train detection */
- if ((s32)(now - ca->last_ack) <= hystart_ack_delta) {
+ if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
ca->last_ack = now;
- if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
- ca->found |= HYSTART_ACK_TRAIN;
+
+ threshold = ca->delay_min + hystart_ack_delay(sk);
+
+ /* Hystart ack train triggers if we get ack past
+ * ca->delay_min/2.
+ * Pacing might have delayed packets up to RTT/2
+ * during slow start.
+ */
+ if (sk->sk_pacing_status == SK_PACING_NONE)
+ threshold >>= 1;
+
+ if ((s32)(now - ca->round_start) > threshold) {
+ ca->found = 1;
+ pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n",
+ now - ca->round_start, threshold,
+ ca->delay_min, hystart_ack_delay(sk), tcp_snd_cwnd(tp));
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTTRAINDETECT);
NET_ADD_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTTRAINCWND,
- tp->snd_cwnd);
- tp->snd_ssthresh = tp->snd_cwnd;
+ tcp_snd_cwnd(tp));
+ tp->snd_ssthresh = tcp_snd_cwnd(tp);
}
}
}
if (hystart_detect & HYSTART_DELAY) {
/* obtain the minimum delay of more than sampling packets */
+ if (ca->curr_rtt > delay)
+ ca->curr_rtt = delay;
if (ca->sample_cnt < HYSTART_MIN_SAMPLES) {
- if (ca->curr_rtt == 0 || ca->curr_rtt > delay)
- ca->curr_rtt = delay;
-
ca->sample_cnt++;
} else {
if (ca->curr_rtt > ca->delay_min +
HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
- ca->found |= HYSTART_DELAY;
+ ca->found = 1;
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTDELAYDETECT);
NET_ADD_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTDELAYCWND,
- tp->snd_cwnd);
- tp->snd_ssthresh = tp->snd_cwnd;
+ tcp_snd_cwnd(tp));
+ tp->snd_ssthresh = tcp_snd_cwnd(tp);
}
}
}
}
-/* Track delayed acknowledgment ratio using sliding window
- * ratio = (15*ratio + sample) / 16
- */
-static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
+__bpf_kfunc static void cubictcp_acked(struct sock *sk, const struct ack_sample *sample)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct bictcp *ca = inet_csk_ca(sk);
@@ -440,7 +463,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ)
return;
- delay = (sample->rtt_us << 3) / USEC_PER_MSEC;
+ delay = sample->rtt_us;
if (delay == 0)
delay = 1;
@@ -448,26 +471,40 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
if (ca->delay_min == 0 || ca->delay_min > delay)
ca->delay_min = delay;
- /* hystart triggers when cwnd is larger than some threshold */
- if (hystart && tcp_in_slow_start(tp) &&
- tp->snd_cwnd >= hystart_low_window)
+ if (!ca->found && tcp_in_slow_start(tp) && hystart)
hystart_update(sk, delay);
}
static struct tcp_congestion_ops cubictcp __read_mostly = {
- .init = bictcp_init,
- .ssthresh = bictcp_recalc_ssthresh,
- .cong_avoid = bictcp_cong_avoid,
- .set_state = bictcp_state,
+ .init = cubictcp_init,
+ .ssthresh = cubictcp_recalc_ssthresh,
+ .cong_avoid = cubictcp_cong_avoid,
+ .set_state = cubictcp_state,
.undo_cwnd = tcp_reno_undo_cwnd,
- .cwnd_event = bictcp_cwnd_event,
- .pkts_acked = bictcp_acked,
+ .cwnd_event = cubictcp_cwnd_event,
+ .pkts_acked = cubictcp_acked,
.owner = THIS_MODULE,
.name = "cubic",
};
+BTF_KFUNCS_START(tcp_cubic_check_kfunc_ids)
+BTF_ID_FLAGS(func, cubictcp_init)
+BTF_ID_FLAGS(func, cubictcp_recalc_ssthresh)
+BTF_ID_FLAGS(func, cubictcp_cong_avoid)
+BTF_ID_FLAGS(func, cubictcp_state)
+BTF_ID_FLAGS(func, cubictcp_cwnd_event)
+BTF_ID_FLAGS(func, cubictcp_acked)
+BTF_KFUNCS_END(tcp_cubic_check_kfunc_ids)
+
+static const struct btf_kfunc_id_set tcp_cubic_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &tcp_cubic_check_kfunc_ids,
+};
+
static int __init cubictcp_register(void)
{
+ int ret;
+
BUILD_BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE);
/* Precompute a bunch of the scaling factors that are used per-packet
@@ -498,6 +535,9 @@ static int __init cubictcp_register(void)
/* divide by bic_scale and by constant Srtt (100ms) */
do_div(cube_factor, bic_scale * 10);
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_cubic_kfunc_set);
+ if (ret < 0)
+ return ret;
return tcp_register_congestion_control(&cubictcp);
}
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index ca61e2a659e7..03abe0848420 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* DataCenter TCP (DCTCP) congestion control.
*
* http://simula.stanford.edu/~alizade/Site/DCTCP.html
@@ -33,64 +34,67 @@
* Daniel Borkmann <dborkman@redhat.com>
* Florian Westphal <fw@strlen.de>
* Glenn Judd <glenn.judd@morganstanley.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or (at
- * your option) any later version.
*/
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <net/tcp.h>
#include <linux/inet_diag.h>
+#include "tcp_dctcp.h"
#define DCTCP_MAX_ALPHA 1024U
struct dctcp {
- u32 acked_bytes_ecn;
- u32 acked_bytes_total;
- u32 prior_snd_una;
+ u32 old_delivered;
+ u32 old_delivered_ce;
u32 prior_rcv_nxt;
u32 dctcp_alpha;
u32 next_seq;
u32 ce_state;
u32 loss_cwnd;
+ struct tcp_plb_state plb;
};
static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
-module_param(dctcp_shift_g, uint, 0644);
+
+static int dctcp_shift_g_set(const char *val, const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(val, kp, 0, 10);
+}
+
+static const struct kernel_param_ops dctcp_shift_g_ops = {
+ .set = dctcp_shift_g_set,
+ .get = param_get_uint,
+};
+
+module_param_cb(dctcp_shift_g, &dctcp_shift_g_ops, &dctcp_shift_g, 0644);
MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");
static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
module_param(dctcp_alpha_on_init, uint, 0644);
MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");
-static unsigned int dctcp_clamp_alpha_on_loss __read_mostly;
-module_param(dctcp_clamp_alpha_on_loss, uint, 0644);
-MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss,
- "parameter for clamping alpha on loss");
-
static struct tcp_congestion_ops dctcp_reno;
static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
{
ca->next_seq = tp->snd_nxt;
- ca->acked_bytes_ecn = 0;
- ca->acked_bytes_total = 0;
+ ca->old_delivered = tp->delivered;
+ ca->old_delivered_ce = tp->delivered_ce;
}
-static void dctcp_init(struct sock *sk)
+__bpf_kfunc static void dctcp_init(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- if ((tp->ecn_flags & TCP_ECN_OK) ||
+ if (tcp_ecn_mode_any(tp) ||
(sk->sk_state == TCP_LISTEN ||
sk->sk_state == TCP_CLOSE)) {
struct dctcp *ca = inet_csk_ca(sk);
- ca->prior_snd_una = tp->snd_una;
ca->prior_rcv_nxt = tp->rcv_nxt;
ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
@@ -99,6 +103,8 @@ static void dctcp_init(struct sock *sk)
ca->ce_state = 0;
dctcp_reset(tp, ca);
+ tcp_plb_init(sk, &ca->plb);
+
return;
}
@@ -109,98 +115,52 @@ static void dctcp_init(struct sock *sk)
INET_ECN_dontxmit(sk);
}
-static u32 dctcp_ssthresh(struct sock *sk)
+__bpf_kfunc static u32 dctcp_ssthresh(struct sock *sk)
{
struct dctcp *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- ca->loss_cwnd = tp->snd_cwnd;
- return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
-}
-
-/* Minimal DCTP CE state machine:
- *
- * S: 0 <- last pkt was non-CE
- * 1 <- last pkt was CE
- */
-
-static void dctcp_ce_state_0_to_1(struct sock *sk)
-{
- struct dctcp *ca = inet_csk_ca(sk);
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (!ca->ce_state) {
- /* State has changed from CE=0 to CE=1, force an immediate
- * ACK to reflect the new CE state. If an ACK was delayed,
- * send that first to reflect the prior CE state.
- */
- if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
- __tcp_send_ack(sk, ca->prior_rcv_nxt);
- inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
- }
-
- ca->prior_rcv_nxt = tp->rcv_nxt;
- ca->ce_state = 1;
-
- tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
-}
-
-static void dctcp_ce_state_1_to_0(struct sock *sk)
-{
- struct dctcp *ca = inet_csk_ca(sk);
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (ca->ce_state) {
- /* State has changed from CE=1 to CE=0, force an immediate
- * ACK to reflect the new CE state. If an ACK was delayed,
- * send that first to reflect the prior CE state.
- */
- if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
- __tcp_send_ack(sk, ca->prior_rcv_nxt);
- inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
- }
-
- ca->prior_rcv_nxt = tp->rcv_nxt;
- ca->ce_state = 0;
-
- tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+ ca->loss_cwnd = tcp_snd_cwnd(tp);
+ return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * ca->dctcp_alpha) >> 11U), 2U);
}
-static void dctcp_update_alpha(struct sock *sk, u32 flags)
+__bpf_kfunc static void dctcp_update_alpha(struct sock *sk, u32 flags)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct dctcp *ca = inet_csk_ca(sk);
- u32 acked_bytes = tp->snd_una - ca->prior_snd_una;
-
- /* If ack did not advance snd_una, count dupack as MSS size.
- * If ack did update window, do not count it at all.
- */
- if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE))
- acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
- if (acked_bytes) {
- ca->acked_bytes_total += acked_bytes;
- ca->prior_snd_una = tp->snd_una;
-
- if (flags & CA_ACK_ECE)
- ca->acked_bytes_ecn += acked_bytes;
- }
/* Expired RTT */
if (!before(tp->snd_una, ca->next_seq)) {
- u64 bytes_ecn = ca->acked_bytes_ecn;
+ u32 delivered = tp->delivered - ca->old_delivered;
+ u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce;
u32 alpha = ca->dctcp_alpha;
+ u32 ce_ratio = 0;
+
+ if (delivered > 0) {
+ /* dctcp_alpha keeps EWMA of fraction of ECN marked
+ * packets. Because of EWMA smoothing, PLB reaction can
+ * be slow so we use ce_ratio which is an instantaneous
+ * measure of congestion. ce_ratio is the fraction of
+ * ECN marked packets in the previous RTT.
+ */
+ if (delivered_ce > 0)
+ ce_ratio = (delivered_ce << TCP_PLB_SCALE) / delivered;
+ tcp_plb_update_state(sk, &ca->plb, (int)ce_ratio);
+ tcp_plb_check_rehash(sk, &ca->plb);
+ }
/* alpha = (1 - g) * alpha + g * F */
alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
- if (bytes_ecn) {
+ if (delivered_ce) {
+
/* If dctcp_shift_g == 1, a 32bit value would overflow
- * after 8 Mbytes.
+ * after 8 M packets.
*/
- bytes_ecn <<= (10 - dctcp_shift_g);
- do_div(bytes_ecn, max(1U, ca->acked_bytes_total));
+ delivered_ce <<= (10 - dctcp_shift_g);
+ delivered_ce /= max(1U, delivered);
- alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA);
+ alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA);
}
/* dctcp_alpha can be read from dctcp_get_info() without
* synchro, so we ask compiler to not use dctcp_alpha
@@ -211,31 +171,40 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags)
}
}
-static void dctcp_state(struct sock *sk, u8 new_state)
+static void dctcp_react_to_loss(struct sock *sk)
{
- if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) {
- struct dctcp *ca = inet_csk_ca(sk);
+ struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
- /* If this extension is enabled, we clamp dctcp_alpha to
- * max on packet loss; the motivation is that dctcp_alpha
- * is an indicator to the extend of congestion and packet
- * loss is an indicator of extreme congestion; setting
- * this in practice turned out to be beneficial, and
- * effectively assumes total congestion which reduces the
- * window by half.
- */
- ca->dctcp_alpha = DCTCP_MAX_ALPHA;
- }
+ ca->loss_cwnd = tcp_snd_cwnd(tp);
+ tp->snd_ssthresh = max(tcp_snd_cwnd(tp) >> 1U, 2U);
}
-static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+__bpf_kfunc static void dctcp_state(struct sock *sk, u8 new_state)
{
+ if (new_state == TCP_CA_Recovery &&
+ new_state != inet_csk(sk)->icsk_ca_state)
+ dctcp_react_to_loss(sk);
+ /* We handle RTO in dctcp_cwnd_event to ensure that we perform only
+ * one loss-adjustment per RTT.
+ */
+}
+
+__bpf_kfunc static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
+{
+ struct dctcp *ca = inet_csk_ca(sk);
+
switch (ev) {
case CA_EVENT_ECN_IS_CE:
- dctcp_ce_state_0_to_1(sk);
- break;
case CA_EVENT_ECN_NO_CE:
- dctcp_ce_state_1_to_0(sk);
+ dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state);
+ break;
+ case CA_EVENT_LOSS:
+ tcp_plb_update_state_upon_rto(sk, &ca->plb);
+ dctcp_react_to_loss(sk);
+ break;
+ case CA_EVENT_TX_START:
+ tcp_plb_check_rehash(sk, &ca->plb); /* Maybe rehash when inflight is 0 */
break;
default:
/* Don't care for the rest. */
@@ -247,6 +216,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
union tcp_cc_info *info)
{
const struct dctcp *ca = inet_csk_ca(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
/* Fill it also in case of VEGASINFO due to req struct limits.
* We can still correctly retrieve it later.
@@ -258,8 +228,10 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
info->dctcp.dctcp_enabled = 1;
info->dctcp.dctcp_ce_state = (u16) ca->ce_state;
info->dctcp.dctcp_alpha = ca->dctcp_alpha;
- info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn;
- info->dctcp.dctcp_ab_tot = ca->acked_bytes_total;
+ info->dctcp.dctcp_ab_ecn = tp->mss_cache *
+ (tp->delivered_ce - ca->old_delivered_ce);
+ info->dctcp.dctcp_ab_tot = tp->mss_cache *
+ (tp->delivered - ca->old_delivered);
}
*attr = INET_DIAG_DCTCPINFO;
@@ -268,11 +240,12 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
return 0;
}
-static u32 dctcp_cwnd_undo(struct sock *sk)
+__bpf_kfunc static u32 dctcp_cwnd_undo(struct sock *sk)
{
const struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
- return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
+ return max(tcp_snd_cwnd(tp), ca->loss_cwnd);
}
static struct tcp_congestion_ops dctcp __read_mostly = {
@@ -298,9 +271,29 @@ static struct tcp_congestion_ops dctcp_reno __read_mostly = {
.name = "dctcp-reno",
};
+BTF_KFUNCS_START(tcp_dctcp_check_kfunc_ids)
+BTF_ID_FLAGS(func, dctcp_init)
+BTF_ID_FLAGS(func, dctcp_update_alpha)
+BTF_ID_FLAGS(func, dctcp_cwnd_event)
+BTF_ID_FLAGS(func, dctcp_ssthresh)
+BTF_ID_FLAGS(func, dctcp_cwnd_undo)
+BTF_ID_FLAGS(func, dctcp_state)
+BTF_KFUNCS_END(tcp_dctcp_check_kfunc_ids)
+
+static const struct btf_kfunc_id_set tcp_dctcp_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &tcp_dctcp_check_kfunc_ids,
+};
+
static int __init dctcp_register(void)
{
+ int ret;
+
BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
+
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &tcp_dctcp_kfunc_set);
+ if (ret < 0)
+ return ret;
return tcp_register_congestion_control(&dctcp);
}
diff --git a/net/ipv4/tcp_dctcp.h b/net/ipv4/tcp_dctcp.h
new file mode 100644
index 000000000000..4b0259111d81
--- /dev/null
+++ b/net/ipv4/tcp_dctcp.h
@@ -0,0 +1,40 @@
+#ifndef _TCP_DCTCP_H
+#define _TCP_DCTCP_H
+
+static inline void dctcp_ece_ack_cwr(struct sock *sk, u32 ce_state)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (ce_state == 1)
+ tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+ else
+ tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+}
+
+/* Minimal DCTCP CE state machine:
+ *
+ * S: 0 <- last pkt was non-CE
+ * 1 <- last pkt was CE
+ */
+static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt,
+ u32 *prior_rcv_nxt, u32 *ce_state)
+{
+ u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0;
+
+ if (*ce_state != new_ce_state) {
+ /* CE state has changed, force an immediate ACK to
+ * reflect the new CE state. If an ACK was delayed,
+ * send that first to reflect the prior CE state.
+ */
+ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
+ dctcp_ece_ack_cwr(sk, *ce_state);
+ __tcp_send_ack(sk, *prior_rcv_nxt, 0);
+ }
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+ }
+ *prior_rcv_nxt = tcp_sk(sk)->rcv_nxt;
+ *ce_state = new_ce_state;
+ dctcp_ece_ack_cwr(sk, new_ce_state);
+}
+
+#endif
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 81148f7a2323..d83efd91f461 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* tcp_diag.c Module for monitoring TCP transport protocols sockets.
*
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
@@ -16,6 +12,9 @@
#include <linux/tcp.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
+#include <net/inet_timewait_sock.h>
#include <net/netlink.h>
#include <net/tcp.h>
@@ -25,13 +24,14 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
struct tcp_info *info = _info;
if (inet_sk_state_load(sk) == TCP_LISTEN) {
- r->idiag_rqueue = sk->sk_ack_backlog;
- r->idiag_wqueue = sk->sk_max_ack_backlog;
+ r->idiag_rqueue = READ_ONCE(sk->sk_ack_backlog);
+ r->idiag_wqueue = READ_ONCE(sk->sk_max_ack_backlog);
} else if (sk->sk_type == SOCK_STREAM) {
const struct tcp_sock *tp = tcp_sk(sk);
- r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
- r->idiag_wqueue = tp->write_seq - tp->snd_una;
+ r->idiag_rqueue = max_t(int, READ_ONCE(tp->rcv_nxt) -
+ READ_ONCE(tp->copied_seq), 0);
+ r->idiag_wqueue = READ_ONCE(tp->write_seq) - tp->snd_una;
}
if (info)
tcp_get_info(sk, info);
@@ -85,13 +85,43 @@ static int tcp_diag_put_md5sig(struct sk_buff *skb,
}
#endif
+static int tcp_diag_put_ulp(struct sk_buff *skb, struct sock *sk,
+ const struct tcp_ulp_ops *ulp_ops, bool net_admin)
+{
+ struct nlattr *nest;
+ int err;
+
+ nest = nla_nest_start_noflag(skb, INET_DIAG_ULP_INFO);
+ if (!nest)
+ return -EMSGSIZE;
+
+ err = nla_put_string(skb, INET_ULP_INFO_NAME, ulp_ops->name);
+ if (err)
+ goto nla_failure;
+
+ if (ulp_ops->get_info)
+ err = ulp_ops->get_info(sk, skb, net_admin);
+ if (err)
+ goto nla_failure;
+
+ nla_nest_end(skb, nest);
+ return 0;
+
+nla_failure:
+ nla_nest_cancel(skb, nest);
+ return err;
+}
+
static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
struct sk_buff *skb)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_ulp_ops *ulp_ops;
+ int err = 0;
+
#ifdef CONFIG_TCP_MD5SIG
if (net_admin) {
struct tcp_md5sig_info *md5sig;
- int err = 0;
rcu_read_lock();
md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info);
@@ -103,11 +133,19 @@ static int tcp_diag_get_aux(struct sock *sk, bool net_admin,
}
#endif
+ ulp_ops = icsk->icsk_ulp_ops;
+ if (ulp_ops) {
+ err = tcp_diag_put_ulp(skb, sk, ulp_ops, net_admin);
+ if (err < 0)
+ return err;
+ }
+
return 0;
}
static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
size_t size = 0;
#ifdef CONFIG_TCP_MD5SIG
@@ -128,19 +166,476 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
}
#endif
- return size;
+ if (sk_fullsock(sk)) {
+ const struct tcp_ulp_ops *ulp_ops;
+
+ ulp_ops = icsk->icsk_ulp_ops;
+ if (ulp_ops) {
+ size += nla_total_size(0) +
+ nla_total_size(TCP_ULP_NAME_MAX);
+ if (ulp_ops->get_info_size)
+ size += ulp_ops->get_info_size(sk, net_admin);
+ }
+ }
+
+ return size
+ + nla_total_size(sizeof(struct tcp_info))
+ + nla_total_size(sizeof(struct inet_diag_msg))
+ + inet_diag_msg_attrs_size()
+ + nla_total_size(sizeof(struct inet_diag_meminfo))
+ + nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
+ + nla_total_size(TCP_CA_NAME_MAX)
+ + nla_total_size(sizeof(struct tcpvegas_info))
+ + 64;
+}
+
+static int tcp_twsk_diag_fill(struct sock *sk,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ u16 nlmsg_flags, bool net_admin)
+{
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type,
+ sizeof(*r), nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ DEBUG_NET_WARN_ON_ONCE(tw->tw_state != TCP_TIME_WAIT);
+
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_retrans = 0;
+
+ r->idiag_state = READ_ONCE(tw->tw_substate);
+ r->idiag_timer = 3;
+ tmo = tw->tw_timer.expires - jiffies;
+ r->idiag_expires = jiffies_delta_to_msecs(tmo);
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+ tw->tw_mark)) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int tcp_req_diag_fill(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ u16 nlmsg_flags, bool net_admin)
+{
+ struct request_sock *reqsk = inet_reqsk(sk);
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_state = TCP_SYN_RECV;
+ r->idiag_timer = 1;
+ r->idiag_retrans = READ_ONCE(reqsk->num_retrans);
+
+ BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
+ offsetof(struct sock, sk_cookie));
+
+ tmo = READ_ONCE(inet_reqsk(sk)->rsk_timer.expires) - jiffies;
+ r->idiag_expires = jiffies_delta_to_msecs(tmo);
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+ inet_rsk(reqsk)->ir_mark)) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ u16 nlmsg_flags, bool net_admin)
+{
+ if (sk->sk_state == TCP_TIME_WAIT)
+ return tcp_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ return tcp_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
+
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags,
+ net_admin);
+}
+
+static void twsk_build_assert(void)
+{
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
+ offsetof(struct sock, sk_family));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) !=
+ offsetof(struct inet_sock, inet_num));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) !=
+ offsetof(struct inet_sock, inet_dport));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) !=
+ offsetof(struct inet_sock, inet_rcv_saddr));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) !=
+ offsetof(struct inet_sock, inet_daddr));
+
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) !=
+ offsetof(struct sock, sk_v6_rcv_saddr));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) !=
+ offsetof(struct sock, sk_v6_daddr));
+#endif
}
static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
- inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r, bc);
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ struct inet_diag_dump_data *cb_data = cb->data;
+ struct net *net = sock_net(skb->sk);
+ u32 idiag_states = r->idiag_states;
+ struct inet_hashinfo *hashinfo;
+ int i, num, s_i, s_num;
+ struct sock *sk;
+
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
+ if (idiag_states & TCPF_SYN_RECV)
+ idiag_states |= TCPF_NEW_SYN_RECV;
+ s_i = cb->args[1];
+ s_num = num = cb->args[2];
+
+ if (cb->args[0] == 0) {
+ if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
+ goto skip_listen_ht;
+
+ for (i = s_i; i <= hashinfo->lhash2_mask; i++) {
+ struct inet_listen_hashbucket *ilb;
+ struct hlist_nulls_node *node;
+
+ num = 0;
+ ilb = &hashinfo->lhash2[i];
+
+ if (hlist_nulls_empty(&ilb->nulls_head)) {
+ s_num = 0;
+ continue;
+ }
+ spin_lock(&ilb->lock);
+ sk_nulls_for_each(sk, node, &ilb->nulls_head) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+
+ if (num < s_num) {
+ num++;
+ continue;
+ }
+
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_listen;
+
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next_listen;
+
+ if (!inet_diag_bc_sk(cb_data, sk))
+ goto next_listen;
+
+ if (inet_sk_diag_fill(sk, inet_csk(sk), skb,
+ cb, r, NLM_F_MULTI,
+ net_admin) < 0) {
+ spin_unlock(&ilb->lock);
+ goto done;
+ }
+
+next_listen:
+ ++num;
+ }
+ spin_unlock(&ilb->lock);
+
+ s_num = 0;
+ }
+skip_listen_ht:
+ cb->args[0] = 1;
+ s_i = num = s_num = 0;
+ }
+
+/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets
+ * with bh disabled.
+ */
+#define SKARR_SZ 16
+
+ /* Dump bound but inactive (not listening, connecting, etc.) sockets */
+ if (cb->args[0] == 1) {
+ if (!(idiag_states & TCPF_BOUND_INACTIVE))
+ goto skip_bind_ht;
+
+ for (i = s_i; i < hashinfo->bhash_size; i++) {
+ struct inet_bind_hashbucket *ibb;
+ struct inet_bind2_bucket *tb2;
+ struct sock *sk_arr[SKARR_SZ];
+ int num_arr[SKARR_SZ];
+ int idx, accum, res;
+
+resume_bind_walk:
+ num = 0;
+ accum = 0;
+ ibb = &hashinfo->bhash2[i];
+
+ if (hlist_empty(&ibb->chain)) {
+ s_num = 0;
+ continue;
+ }
+ spin_lock_bh(&ibb->lock);
+ inet_bind_bucket_for_each(tb2, &ibb->chain) {
+ if (!net_eq(ib2_net(tb2), net))
+ continue;
+
+ sk_for_each_bound(sk, &tb2->owners) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num)
+ goto next_bind;
+
+ if (sk->sk_state != TCP_CLOSE ||
+ !inet->inet_num)
+ goto next_bind;
+
+ if (r->sdiag_family != AF_UNSPEC &&
+ r->sdiag_family != sk->sk_family)
+ goto next_bind;
+
+ if (!inet_diag_bc_sk(cb_data, sk))
+ goto next_bind;
+
+ sock_hold(sk);
+ num_arr[accum] = num;
+ sk_arr[accum] = sk;
+ if (++accum == SKARR_SZ)
+ goto pause_bind_walk;
+next_bind:
+ num++;
+ }
+ }
+pause_bind_walk:
+ spin_unlock_bh(&ibb->lock);
+
+ res = 0;
+ for (idx = 0; idx < accum; idx++) {
+ if (res >= 0) {
+ res = inet_sk_diag_fill(sk_arr[idx],
+ NULL, skb, cb,
+ r, NLM_F_MULTI,
+ net_admin);
+ if (res < 0)
+ num = num_arr[idx];
+ }
+ sock_put(sk_arr[idx]);
+ }
+ if (res < 0)
+ goto done;
+
+ cond_resched();
+
+ if (accum == SKARR_SZ) {
+ s_num = num + 1;
+ goto resume_bind_walk;
+ }
+
+ s_num = 0;
+ }
+skip_bind_ht:
+ cb->args[0] = 2;
+ s_i = num = s_num = 0;
+ }
+
+ if (!(idiag_states & ~TCPF_LISTEN))
+ goto out;
+
+ for (i = s_i; i <= hashinfo->ehash_mask; i++) {
+ struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
+ struct hlist_nulls_node *node;
+ struct sock *sk_arr[SKARR_SZ];
+ int num_arr[SKARR_SZ];
+ int idx, accum, res;
+
+ if (hlist_nulls_empty(&head->chain))
+ continue;
+
+ if (i > s_i)
+ s_num = 0;
+
+next_chunk:
+ num = 0;
+ accum = 0;
+ spin_lock_bh(lock);
+ sk_nulls_for_each(sk, node, &head->chain) {
+ int state;
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next_normal;
+ state = (sk->sk_state == TCP_TIME_WAIT) ?
+ READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state;
+ if (!(idiag_states & (1 << state)))
+ goto next_normal;
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_normal;
+ if (r->id.idiag_sport != htons(sk->sk_num) &&
+ r->id.idiag_sport)
+ goto next_normal;
+ if (r->id.idiag_dport != sk->sk_dport &&
+ r->id.idiag_dport)
+ goto next_normal;
+ twsk_build_assert();
+
+ if (!inet_diag_bc_sk(cb_data, sk))
+ goto next_normal;
+
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ goto next_normal;
+
+ num_arr[accum] = num;
+ sk_arr[accum] = sk;
+ if (++accum == SKARR_SZ)
+ break;
+next_normal:
+ ++num;
+ }
+ spin_unlock_bh(lock);
+
+ res = 0;
+ for (idx = 0; idx < accum; idx++) {
+ if (res >= 0) {
+ res = sk_diag_fill(sk_arr[idx], skb, cb, r,
+ NLM_F_MULTI, net_admin);
+ if (res < 0)
+ num = num_arr[idx];
+ }
+ sock_gen_put(sk_arr[idx]);
+ }
+ if (res < 0)
+ break;
+
+ cond_resched();
+
+ if (accum == SKARR_SZ) {
+ s_num = num + 1;
+ goto next_chunk;
+ }
+ }
+
+done:
+ cb->args[1] = i;
+ cb->args[2] = num;
+out:
+ ;
}
-static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+static struct sock *tcp_diag_find_one_icsk(struct net *net,
+ const struct inet_diag_req_v2 *req)
+{
+ struct sock *sk;
+
+ rcu_read_lock();
+ if (req->sdiag_family == AF_INET) {
+ sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[0],
+ req->id.idiag_dport, req->id.idiag_src[0],
+ req->id.idiag_sport, req->id.idiag_if);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (req->sdiag_family == AF_INET6) {
+ if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
+ ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
+ sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[3],
+ req->id.idiag_dport, req->id.idiag_src[3],
+ req->id.idiag_sport, req->id.idiag_if);
+ else
+ sk = inet6_lookup(net, NULL, 0,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ req->id.idiag_if);
+#endif
+ } else {
+ rcu_read_unlock();
+ return ERR_PTR(-EINVAL);
+ }
+ rcu_read_unlock();
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
+ sock_gen_put(sk);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return sk;
+}
+
+static int tcp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
- return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
+ struct sk_buff *in_skb = cb->skb;
+ struct sk_buff *rep;
+ struct sock *sk;
+ struct net *net;
+ bool net_admin;
+ int err;
+
+ net = sock_net(in_skb->sk);
+ sk = tcp_diag_find_one_icsk(net, req);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
+
+ net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN);
+ rep = nlmsg_new(tcp_diag_get_aux_size(sk, net_admin), GFP_KERNEL);
+ if (!rep) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = sk_diag_fill(sk, rep, cb, req, 0, net_admin);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ nlmsg_free(rep);
+ goto out;
+ }
+ err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
+out:
+ if (sk)
+ sock_gen_put(sk);
+
+ return err;
}
#ifdef CONFIG_INET_DIAG_DESTROY
@@ -148,9 +643,10 @@ static int tcp_diag_destroy(struct sk_buff *in_skb,
const struct inet_diag_req_v2 *req)
{
struct net *net = sock_net(in_skb->sk);
- struct sock *sk = inet_diag_find_one_icsk(net, &tcp_hashinfo, req);
+ struct sock *sk;
int err;
+ sk = tcp_diag_find_one_icsk(net, req);
if (IS_ERR(sk))
return PTR_ERR(sk);
@@ -163,11 +659,11 @@ static int tcp_diag_destroy(struct sk_buff *in_skb,
#endif
static const struct inet_diag_handler tcp_diag_handler = {
+ .owner = THIS_MODULE,
.dump = tcp_diag_dump,
.dump_one = tcp_diag_dump_one,
.idiag_get_info = tcp_diag_get_info,
.idiag_get_aux = tcp_diag_get_aux,
- .idiag_get_aux_size = tcp_diag_get_aux_size,
.idiag_type = IPPROTO_TCP,
.idiag_info_size = sizeof(struct tcp_info),
#ifdef CONFIG_INET_DIAG_DESTROY
@@ -188,4 +684,5 @@ static void __exit tcp_diag_exit(void)
module_init(tcp_diag_init);
module_exit(tcp_diag_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("TCP socket monitoring via SOCK_DIAG");
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-6 /* AF_INET - IPPROTO_TCP */);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 018a48477355..7d945a527daf 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -1,14 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/crypto.h>
-#include <linux/err.h>
-#include <linux/init.h>
#include <linux/kernel.h>
-#include <linux/list.h>
#include <linux/tcp.h>
#include <linux/rcupdate.h>
-#include <linux/rculist.h>
-#include <net/inetpeer.h>
#include <net/tcp.h>
+#include <net/busy_poll.h>
void tcp_fastopen_init_key_once(struct net *net)
{
@@ -30,15 +25,15 @@ void tcp_fastopen_init_key_once(struct net *net)
* for a valid cookie, so this is an acceptable risk.
*/
get_random_bytes(key, sizeof(key));
- tcp_fastopen_reset_cipher(net, NULL, key, sizeof(key));
+ tcp_fastopen_reset_cipher(net, NULL, key, NULL);
}
static void tcp_fastopen_ctx_free(struct rcu_head *head)
{
struct tcp_fastopen_context *ctx =
container_of(head, struct tcp_fastopen_context, rcu);
- crypto_free_cipher(ctx->tfm);
- kfree(ctx);
+
+ kfree_sensitive(ctx);
}
void tcp_fastopen_destroy_cipher(struct sock *sk)
@@ -55,119 +50,120 @@ void tcp_fastopen_ctx_destroy(struct net *net)
{
struct tcp_fastopen_context *ctxt;
- spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
-
- ctxt = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
- lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
- rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, NULL);
- spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);
+ ctxt = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx, NULL));
if (ctxt)
call_rcu(&ctxt->rcu, tcp_fastopen_ctx_free);
}
int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk,
- void *key, unsigned int len)
+ void *primary_key, void *backup_key)
{
struct tcp_fastopen_context *ctx, *octx;
struct fastopen_queue *q;
- int err;
+ int err = 0;
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx)
- return -ENOMEM;
- ctx->tfm = crypto_alloc_cipher("aes", 0, 0);
-
- if (IS_ERR(ctx->tfm)) {
- err = PTR_ERR(ctx->tfm);
-error: kfree(ctx);
- pr_err("TCP: TFO aes cipher alloc error: %d\n", err);
- return err;
- }
- err = crypto_cipher_setkey(ctx->tfm, key, len);
- if (err) {
- pr_err("TCP: TFO cipher key error: %d\n", err);
- crypto_free_cipher(ctx->tfm);
- goto error;
+ if (!ctx) {
+ err = -ENOMEM;
+ goto out;
}
- memcpy(ctx->key, key, len);
+ ctx->key[0].key[0] = get_unaligned_le64(primary_key);
+ ctx->key[0].key[1] = get_unaligned_le64(primary_key + 8);
+ if (backup_key) {
+ ctx->key[1].key[0] = get_unaligned_le64(backup_key);
+ ctx->key[1].key[1] = get_unaligned_le64(backup_key + 8);
+ ctx->num = 2;
+ } else {
+ ctx->num = 1;
+ }
- spin_lock(&net->ipv4.tcp_fastopen_ctx_lock);
if (sk) {
q = &inet_csk(sk)->icsk_accept_queue.fastopenq;
- octx = rcu_dereference_protected(q->ctx,
- lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
- rcu_assign_pointer(q->ctx, ctx);
+ octx = unrcu_pointer(xchg(&q->ctx, RCU_INITIALIZER(ctx)));
} else {
- octx = rcu_dereference_protected(net->ipv4.tcp_fastopen_ctx,
- lockdep_is_held(&net->ipv4.tcp_fastopen_ctx_lock));
- rcu_assign_pointer(net->ipv4.tcp_fastopen_ctx, ctx);
+ octx = unrcu_pointer(xchg(&net->ipv4.tcp_fastopen_ctx,
+ RCU_INITIALIZER(ctx)));
}
- spin_unlock(&net->ipv4.tcp_fastopen_ctx_lock);
if (octx)
call_rcu(&octx->rcu, tcp_fastopen_ctx_free);
+out:
return err;
}
-static bool __tcp_fastopen_cookie_gen(struct sock *sk, const void *path,
- struct tcp_fastopen_cookie *foc)
+int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk,
+ u64 *key)
{
struct tcp_fastopen_context *ctx;
- bool ok = false;
+ int n_keys = 0, i;
rcu_read_lock();
-
- ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx);
- if (!ctx)
- ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx);
-
+ if (icsk)
+ ctx = rcu_dereference(icsk->icsk_accept_queue.fastopenq.ctx);
+ else
+ ctx = rcu_dereference(net->ipv4.tcp_fastopen_ctx);
if (ctx) {
- crypto_cipher_encrypt_one(ctx->tfm, foc->val, path);
- foc->len = TCP_FASTOPEN_COOKIE_SIZE;
- ok = true;
+ n_keys = tcp_fastopen_context_len(ctx);
+ for (i = 0; i < n_keys; i++) {
+ put_unaligned_le64(ctx->key[i].key[0], key + (i * 2));
+ put_unaligned_le64(ctx->key[i].key[1], key + (i * 2) + 1);
+ }
}
rcu_read_unlock();
- return ok;
+
+ return n_keys;
}
-/* Generate the fastopen cookie by doing aes128 encryption on both
- * the source and destination addresses. Pad 0s for IPv4 or IPv4-mapped-IPv6
- * addresses. For the longer IPv6 addresses use CBC-MAC.
- *
- * XXX (TFO) - refactor when TCP_FASTOPEN_COOKIE_SIZE != AES_BLOCK_SIZE.
- */
-static bool tcp_fastopen_cookie_gen(struct sock *sk,
- struct request_sock *req,
- struct sk_buff *syn,
- struct tcp_fastopen_cookie *foc)
+static bool __tcp_fastopen_cookie_gen_cipher(struct request_sock *req,
+ struct sk_buff *syn,
+ const siphash_key_t *key,
+ struct tcp_fastopen_cookie *foc)
{
+ BUILD_BUG_ON(TCP_FASTOPEN_COOKIE_SIZE != sizeof(u64));
+
if (req->rsk_ops->family == AF_INET) {
const struct iphdr *iph = ip_hdr(syn);
- __be32 path[4] = { iph->saddr, iph->daddr, 0, 0 };
- return __tcp_fastopen_cookie_gen(sk, path, foc);
+ foc->val[0] = cpu_to_le64(siphash(&iph->saddr,
+ sizeof(iph->saddr) +
+ sizeof(iph->daddr),
+ key));
+ foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+ return true;
}
-
#if IS_ENABLED(CONFIG_IPV6)
if (req->rsk_ops->family == AF_INET6) {
const struct ipv6hdr *ip6h = ipv6_hdr(syn);
- struct tcp_fastopen_cookie tmp;
- if (__tcp_fastopen_cookie_gen(sk, &ip6h->saddr, &tmp)) {
- struct in6_addr *buf = &tmp.addr;
- int i;
-
- for (i = 0; i < 4; i++)
- buf->s6_addr32[i] ^= ip6h->daddr.s6_addr32[i];
- return __tcp_fastopen_cookie_gen(sk, buf, foc);
- }
+ foc->val[0] = cpu_to_le64(siphash(&ip6h->saddr,
+ sizeof(ip6h->saddr) +
+ sizeof(ip6h->daddr),
+ key));
+ foc->len = TCP_FASTOPEN_COOKIE_SIZE;
+ return true;
}
#endif
return false;
}
+/* Generate the fastopen cookie by applying SipHash to both the source and
+ * destination addresses.
+ */
+static void tcp_fastopen_cookie_gen(struct sock *sk,
+ struct request_sock *req,
+ struct sk_buff *syn,
+ struct tcp_fastopen_cookie *foc)
+{
+ struct tcp_fastopen_context *ctx;
+
+ rcu_read_lock();
+ ctx = tcp_fastopen_get_ctx(sk);
+ if (ctx)
+ __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[0], foc);
+ rcu_read_unlock();
+}
/* If an incoming SYN or SYNACK frame contains a payload and/or FIN,
* queue this additional data / FIN.
@@ -183,7 +179,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
if (!skb)
return;
- skb_dst_drop(skb);
+ tcp_cleanup_skb(skb);
/* segs_in has been initialized to 1 in tcp_create_openreq_child().
* Hence, reset segs_in to 0 before calling tcp_segs_in()
* to avoid double counting. Also, tcp_segs_in() expects
@@ -200,7 +196,7 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+ tcp_add_receive_queue(sk, skb);
tp->syn_data_acked = 1;
/* u64_stats_update_begin(&tp->syncp) not needed here,
@@ -212,6 +208,35 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
tcp_fin(sk);
}
+/* Returns 0 if no key matches, 1 for the primary key, 2 for the backup key */
+static int tcp_fastopen_cookie_gen_check(struct sock *sk,
+ struct request_sock *req,
+ struct sk_buff *syn,
+ struct tcp_fastopen_cookie *orig,
+ struct tcp_fastopen_cookie *valid_foc)
+{
+ struct tcp_fastopen_cookie search_foc = { .len = -1 };
+ struct tcp_fastopen_cookie *foc = valid_foc;
+ struct tcp_fastopen_context *ctx;
+ int i, ret = 0;
+
+ rcu_read_lock();
+ ctx = tcp_fastopen_get_ctx(sk);
+ if (!ctx)
+ goto out;
+ for (i = 0; i < tcp_fastopen_context_len(ctx); i++) {
+ __tcp_fastopen_cookie_gen_cipher(req, syn, &ctx->key[i], foc);
+ if (tcp_fastopen_cookie_match(foc, orig)) {
+ ret = i + 1;
+ goto out;
+ }
+ foc = &search_foc;
+ }
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
static struct sock *tcp_fastopen_create_child(struct sock *sk,
struct sk_buff *skb,
struct request_sock *req)
@@ -221,10 +246,6 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
struct sock *child;
bool own_req;
- req->num_retrans = 0;
- req->num_timeout = 0;
- req->sk = NULL;
-
child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
NULL, &own_req);
if (!child)
@@ -240,7 +261,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
*/
tp = tcp_sk(child);
- tp->fastopen_rsk = req;
+ rcu_assign_pointer(tp->fastopen_rsk, req);
tcp_rsk(req)->tfo_listener = true;
/* RFC1323: The window in SYN & SYN/ACK segments is never
@@ -253,13 +274,16 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
* The request socket is not added to the ehash
* because it's been added to the accept queue directly.
*/
- inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
- TCP_TIMEOUT_INIT, TCP_RTO_MAX);
+ req->timeout = tcp_timeout_init(child);
+ tcp_reset_xmit_timer(child, ICSK_TIME_RETRANS,
+ req->timeout, false);
refcount_set(&req->rsk_refcnt, 2);
+ sk_mark_napi_id_set(child, skb);
+
/* Now finish processing the fastopen child socket. */
- tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
+ tcp_init_transfer(child, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, skb);
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
@@ -276,6 +300,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
static bool tcp_fastopen_queue_check(struct sock *sk)
{
struct fastopen_queue *fastopenq;
+ int max_qlen;
/* Make sure the listener has enabled fastopen, and we don't
* exceed the max # of pending TFO requests allowed before trying
@@ -288,10 +313,11 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
* temporarily vs a server not supporting Fast Open at all.
*/
fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq;
- if (fastopenq->max_qlen == 0)
+ max_qlen = READ_ONCE(fastopenq->max_qlen);
+ if (max_qlen == 0)
return false;
- if (fastopenq->qlen >= fastopenq->max_qlen) {
+ if (fastopenq->qlen >= max_qlen) {
struct request_sock *req1;
spin_lock(&fastopenq->lock);
req1 = fastopenq->rskq_rst_head;
@@ -313,7 +339,7 @@ static bool tcp_fastopen_no_cookie(const struct sock *sk,
const struct dst_entry *dst,
int flag)
{
- return (sock_net(sk)->ipv4.sysctl_tcp_fastopen & flag) ||
+ return (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & flag) ||
tcp_sk(sk)->fastopen_no_cookie ||
(dst && dst_metric(dst, RTAX_FASTOPEN_NO_COOKIE));
}
@@ -328,9 +354,10 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
const struct dst_entry *dst)
{
bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
- int tcp_fastopen = sock_net(sk)->ipv4.sysctl_tcp_fastopen;
+ int tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen);
struct tcp_fastopen_cookie valid_foc = { .len = -1 };
struct sock *child;
+ int ret = 0;
if (foc->len == 0) /* Client requests a cookie */
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
@@ -342,35 +369,48 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
return NULL;
}
- if (syn_data &&
- tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD))
+ if (tcp_fastopen_no_cookie(sk, dst, TFO_SERVER_COOKIE_NOT_REQD))
goto fastopen;
- if (foc->len >= 0 && /* Client presents or requests a cookie */
- tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc) &&
- foc->len == TCP_FASTOPEN_COOKIE_SIZE &&
- foc->len == valid_foc.len &&
- !memcmp(foc->val, valid_foc.val, foc->len)) {
- /* Cookie is valid. Create a (full) child socket to accept
- * the data in SYN before returning a SYN-ACK to ack the
- * data. If we fail to create the socket, fall back and
- * ack the ISN only but includes the same cookie.
- *
- * Note: Data-less SYN with valid cookie is allowed to send
- * data in SYN_RECV state.
- */
+ if (foc->len == 0) {
+ /* Client requests a cookie. */
+ tcp_fastopen_cookie_gen(sk, req, skb, &valid_foc);
+ } else if (foc->len > 0) {
+ ret = tcp_fastopen_cookie_gen_check(sk, req, skb, foc,
+ &valid_foc);
+ if (!ret) {
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
+ } else {
+ /* Cookie is valid. Create a (full) child socket to
+ * accept the data in SYN before returning a SYN-ACK to
+ * ack the data. If we fail to create the socket, fall
+ * back and ack the ISN only but include the same
+ * cookie.
+ *
+ * Note: Data-less SYN with valid cookie is allowed to
+ * send data in SYN_RECV state.
+ */
fastopen:
- child = tcp_fastopen_create_child(sk, skb, req);
- if (child) {
- foc->len = -1;
+ child = tcp_fastopen_create_child(sk, skb, req);
+ if (child) {
+ if (ret == 2) {
+ valid_foc.exp = foc->exp;
+ *foc = valid_foc;
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVEALTKEY);
+ } else {
+ foc->len = -1;
+ }
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENPASSIVE);
+ tcp_sk(child)->syn_fastopen_child = 1;
+ return child;
+ }
NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPFASTOPENPASSIVE);
- return child;
+ LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
}
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
- } else if (foc->len > 0) /* Client presents an invalid cookie */
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
-
+ }
valid_foc.exp = foc->exp;
*foc = valid_foc;
return NULL;
@@ -395,7 +435,10 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
cookie->len = -1;
return true;
}
- return cookie->len > 0;
+ if (cookie->len > 0)
+ return true;
+ tcp_sk(sk)->fastopen_client_fail = TFO_COOKIE_UNAVAILABLE;
+ return false;
}
/* This function checks if we want to defer sending SYN until the first
@@ -413,7 +456,7 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
if (tp->fastopen_connect && !tp->fastopen_req) {
if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
- inet_sk(sk)->defer_connect = 1;
+ inet_set_bit(DEFER_CONNECT, sk);
return true;
}
@@ -429,7 +472,7 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
}
return false;
}
-EXPORT_SYMBOL(tcp_fastopen_defer_connect);
+EXPORT_IPV6_MOD(tcp_fastopen_defer_connect);
/*
* The following code block is to deal with middle box issues with TFO:
@@ -454,8 +497,18 @@ void tcp_fastopen_active_disable(struct sock *sk)
{
struct net *net = sock_net(sk);
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout))
+ return;
+
+ /* Paired with READ_ONCE() in tcp_fastopen_active_should_disable() */
+ WRITE_ONCE(net->ipv4.tfo_active_disable_stamp, jiffies);
+
+ /* Paired with smp_rmb() in tcp_fastopen_active_should_disable().
+ * We want net->ipv4.tfo_active_disable_stamp to be updated first.
+ */
+ smp_mb__before_atomic();
atomic_inc(&net->ipv4.tfo_active_disable_times);
- net->ipv4.tfo_active_disable_stamp = jiffies;
+
NET_INC_STATS(net, LINUX_MIB_TCPFASTOPENBLACKHOLE);
}
@@ -465,18 +518,29 @@ void tcp_fastopen_active_disable(struct sock *sk)
*/
bool tcp_fastopen_active_should_disable(struct sock *sk)
{
- unsigned int tfo_bh_timeout = sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout;
- int tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times);
+ unsigned int tfo_bh_timeout =
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen_blackhole_timeout);
unsigned long timeout;
+ int tfo_da_times;
int multiplier;
+ if (!tfo_bh_timeout)
+ return false;
+
+ tfo_da_times = atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times);
if (!tfo_da_times)
return false;
- /* Limit timout to max: 2^6 * initial timeout */
+ /* Paired with smp_mb__before_atomic() in tcp_fastopen_active_disable() */
+ smp_rmb();
+
+ /* Limit timeout to max: 2^6 * initial timeout */
multiplier = 1 << min(tfo_da_times - 1, 6);
- timeout = multiplier * tfo_bh_timeout * HZ;
- if (time_before(jiffies, sock_net(sk)->ipv4.tfo_active_disable_stamp + timeout))
+
+ /* Paired with the WRITE_ONCE() in tcp_fastopen_active_disable(). */
+ timeout = READ_ONCE(sock_net(sk)->ipv4.tfo_active_disable_stamp) +
+ multiplier * tfo_bh_timeout * HZ;
+ if (time_before(jiffies, timeout))
return true;
/* Mark check bit so we can check for successful active TFO
@@ -495,6 +559,7 @@ bool tcp_fastopen_active_should_disable(struct sock *sk)
void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct net_device *dev;
struct dst_entry *dst;
struct sk_buff *skb;
@@ -511,10 +576,12 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
}
} else if (tp->syn_fastopen_ch &&
atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) {
- dst = sk_dst_get(sk);
- if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)))
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ dev = dst ? dst_dev_rcu(dst) : NULL;
+ if (!(dev && (dev->flags & IFF_LOOPBACK)))
atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0);
- dst_release(dst);
+ rcu_read_unlock();
}
}
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c
index d1c33c91eadc..c6de5ce79ad3 100644
--- a/net/ipv4/tcp_highspeed.c
+++ b/net/ipv4/tcp_highspeed.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Sally Floyd's High Speed TCP (RFC 3649) congestion control
*
- * See http://www.icir.org/floyd/hstcp.html
+ * See https://www.icir.org/floyd/hstcp.html
*
* John Heffner <jheffner@psc.edu>
*/
@@ -126,22 +127,22 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* snd_cwnd <=
* hstcp_aimd_vals[ca->ai].cwnd
*/
- if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) {
- while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd &&
+ if (tcp_snd_cwnd(tp) > hstcp_aimd_vals[ca->ai].cwnd) {
+ while (tcp_snd_cwnd(tp) > hstcp_aimd_vals[ca->ai].cwnd &&
ca->ai < HSTCP_AIMD_MAX - 1)
ca->ai++;
- } else if (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) {
- while (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd)
+ } else if (ca->ai && tcp_snd_cwnd(tp) <= hstcp_aimd_vals[ca->ai-1].cwnd) {
+ while (ca->ai && tcp_snd_cwnd(tp) <= hstcp_aimd_vals[ca->ai-1].cwnd)
ca->ai--;
}
/* Do additive increase */
- if (tp->snd_cwnd < tp->snd_cwnd_clamp) {
+ if (tcp_snd_cwnd(tp) < tp->snd_cwnd_clamp) {
/* cwnd = cwnd + a(w) / cwnd */
tp->snd_cwnd_cnt += ca->ai + 1;
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- tp->snd_cwnd_cnt -= tp->snd_cwnd;
- tp->snd_cwnd++;
+ if (tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) {
+ tp->snd_cwnd_cnt -= tcp_snd_cwnd(tp);
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
}
}
}
@@ -153,7 +154,7 @@ static u32 hstcp_ssthresh(struct sock *sk)
struct hstcp *ca = inet_csk_ca(sk);
/* Do multiplicative decrease */
- return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
+ return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * hstcp_aimd_vals[ca->ai].md) >> 8), 2U);
}
static struct tcp_congestion_ops tcp_highspeed __read_mostly = {
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 082d479462fa..81b96331b2bb 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -1,9 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* H-TCP congestion control. The algorithm is detailed in:
* R.N.Shorten, D.J.Leith:
* "H-TCP: TCP for high-speed and long-distance networks"
* Proc. PFLDnet, Argonne, 2004.
- * http://www.hamilton.ie/net/htcp3.pdf
+ * https://www.hamilton.ie/net/htcp3.pdf
*/
#include <linux/mm.h>
@@ -123,7 +124,7 @@ static void measure_achieved_throughput(struct sock *sk,
ca->packetcount += sample->pkts_acked;
- if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) &&
+ if (ca->packetcount >= tcp_snd_cwnd(tp) - (ca->alpha >> 7 ? : 1) &&
now - ca->lasttime >= ca->minRTT &&
ca->minRTT > 0) {
__u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime);
@@ -184,7 +185,7 @@ static inline void htcp_alpha_update(struct htcp *ca)
u32 scale = (HZ << 3) / (10 * minRTT);
/* clamping ratio to interval [0.5,10]<<3 */
- scale = min(max(scale, 1U << 2), 10U << 3);
+ scale = clamp(scale, 1U << 2, 10U << 3);
factor = (factor << 3) / scale;
if (!factor)
factor = 1;
@@ -224,7 +225,7 @@ static u32 htcp_recalc_ssthresh(struct sock *sk)
const struct htcp *ca = inet_csk_ca(sk);
htcp_param_update(sk);
- return max((tp->snd_cwnd * ca->beta) >> 7, 2U);
+ return max((tcp_snd_cwnd(tp) * ca->beta) >> 7, 2U);
}
static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
@@ -241,9 +242,9 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
/* In dangerous area, increase slowly.
* In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd
*/
- if ((tp->snd_cwnd_cnt * ca->alpha)>>7 >= tp->snd_cwnd) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
+ if ((tp->snd_cwnd_cnt * ca->alpha)>>7 >= tcp_snd_cwnd(tp)) {
+ if (tcp_snd_cwnd(tp) < tp->snd_cwnd_clamp)
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
tp->snd_cwnd_cnt = 0;
htcp_alpha_update(ca);
} else
diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c
index 0f7175c3338e..abd7d91807e5 100644
--- a/net/ipv4/tcp_hybla.c
+++ b/net/ipv4/tcp_hybla.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* TCP HYBLA
*
@@ -53,7 +54,7 @@ static void hybla_init(struct sock *sk)
ca->rho2_7ls = 0;
ca->snd_cwnd_cents = 0;
ca->hybla_en = true;
- tp->snd_cwnd = 2;
+ tcp_snd_cwnd_set(tp, 2);
tp->snd_cwnd_clamp = 65535;
/* 1st Rho measurement based on initial srtt */
@@ -61,7 +62,7 @@ static void hybla_init(struct sock *sk)
/* set minimum rtt as this is the 1st ever seen */
ca->minrtt_us = tp->srtt_us;
- tp->snd_cwnd = ca->rho;
+ tcp_snd_cwnd_set(tp, ca->rho);
}
static void hybla_state(struct sock *sk, u8 ca_state)
@@ -136,31 +137,31 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* as long as increment is estimated as (rho<<7)/window
* it already is <<7 and we can easily count its fractions.
*/
- increment = ca->rho2_7ls / tp->snd_cwnd;
+ increment = ca->rho2_7ls / tcp_snd_cwnd(tp);
if (increment < 128)
tp->snd_cwnd_cnt++;
}
odd = increment % 128;
- tp->snd_cwnd += increment >> 7;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + (increment >> 7));
ca->snd_cwnd_cents += odd;
/* check when fractions goes >=128 and increase cwnd by 1. */
while (ca->snd_cwnd_cents >= 128) {
- tp->snd_cwnd++;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
ca->snd_cwnd_cents -= 128;
tp->snd_cwnd_cnt = 0;
}
/* check when cwnd has not been incremented for a while */
- if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- tp->snd_cwnd++;
+ if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) {
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
tp->snd_cwnd_cnt = 0;
}
/* clamp down slowstart cwnd to ssthresh value. */
if (is_slowstart)
- tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_ssthresh));
- tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp));
}
static struct tcp_congestion_ops tcp_hybla __read_mostly = {
diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c
index faddf4f9a707..c0c81a2c77fa 100644
--- a/net/ipv4/tcp_illinois.c
+++ b/net/ipv4/tcp_illinois.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* TCP Illinois congestion control.
* Home page:
@@ -223,7 +224,7 @@ static void update_params(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
- if (tp->snd_cwnd < win_thresh) {
+ if (tcp_snd_cwnd(tp) < win_thresh) {
ca->alpha = ALPHA_BASE;
ca->beta = BETA_BASE;
} else if (ca->cnt_rtt > 0) {
@@ -283,9 +284,9 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* tp->snd_cwnd += alpha/tp->snd_cwnd
*/
delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT;
- if (delta >= tp->snd_cwnd) {
- tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd,
- (u32)tp->snd_cwnd_clamp);
+ if (delta >= tcp_snd_cwnd(tp)) {
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp) + delta / tcp_snd_cwnd(tp),
+ (u32)tp->snd_cwnd_clamp));
tp->snd_cwnd_cnt = 0;
}
}
@@ -295,9 +296,11 @@ static u32 tcp_illinois_ssthresh(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct illinois *ca = inet_csk_ca(sk);
+ u32 decr;
/* Multiplicative decrease */
- return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U);
+ decr = (tcp_snd_cwnd(tp) * ca->beta) >> BETA_SHIFT;
+ return max(tcp_snd_cwnd(tp) - decr, 2U);
}
/* Extract info for Tcp socket info provided via netlink. */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 47e08c1b5bc3..198f8a0d37be 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -70,15 +70,19 @@
#include <linux/sysctl.h>
#include <linux/kernel.h>
#include <linux/prefetch.h>
+#include <linux/bitops.h>
#include <net/dst.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
+#include <net/proto_memory.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/errqueue.h>
#include <trace/events/tcp.h>
-#include <linux/static_key.h>
+#include <linux/jump_label_ratelimit.h>
#include <net/busy_poll.h>
+#include <net/mptcp.h>
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
@@ -99,6 +103,8 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
#define FLAG_ACK_MAYBE_DELAYED 0x10000 /* Likely a delayed ACK */
+#define FLAG_DSACK_TLP 0x20000 /* DSACK for tail loss probe */
+#define FLAG_TS_PROGRESS 0x40000 /* Positive timestamp delta */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
@@ -113,41 +119,106 @@ int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
#if IS_ENABLED(CONFIG_TLS_DEVICE)
-static DEFINE_STATIC_KEY_FALSE(clean_acked_data_enabled);
+static DEFINE_STATIC_KEY_DEFERRED_FALSE(clean_acked_data_enabled, HZ);
-void clean_acked_data_enable(struct inet_connection_sock *icsk,
+void clean_acked_data_enable(struct tcp_sock *tp,
void (*cad)(struct sock *sk, u32 ack_seq))
{
- icsk->icsk_clean_acked = cad;
- static_branch_inc(&clean_acked_data_enabled);
+ tp->tcp_clean_acked = cad;
+ static_branch_deferred_inc(&clean_acked_data_enabled);
}
EXPORT_SYMBOL_GPL(clean_acked_data_enable);
-void clean_acked_data_disable(struct inet_connection_sock *icsk)
+void clean_acked_data_disable(struct tcp_sock *tp)
{
- static_branch_dec(&clean_acked_data_enabled);
- icsk->icsk_clean_acked = NULL;
+ static_branch_slow_dec_deferred(&clean_acked_data_enabled);
+ tp->tcp_clean_acked = NULL;
}
EXPORT_SYMBOL_GPL(clean_acked_data_disable);
-#endif
-static void tcp_gro_dev_warn(struct sock *sk, const struct sk_buff *skb,
- unsigned int len)
+void clean_acked_data_flush(void)
{
- static bool __once __read_mostly;
+ static_key_deferred_flush(&clean_acked_data_enabled);
+}
+EXPORT_SYMBOL_GPL(clean_acked_data_flush);
+#endif
- if (!__once) {
- struct net_device *dev;
+#ifdef CONFIG_CGROUP_BPF
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
+{
+ bool unknown_opt = tcp_sk(sk)->rx_opt.saw_unknown &&
+ BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG);
+ bool parse_all_opt = BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG);
+ struct bpf_sock_ops_kern sock_ops;
- __once = true;
+ if (likely(!unknown_opt && !parse_all_opt))
+ return;
- rcu_read_lock();
- dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
- if (!dev || len >= dev->mtu)
- pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
- dev ? dev->name : "Unknown driver");
- rcu_read_unlock();
+ /* The skb will be handled in
+ * bpf_skops_established() or
+ * bpf_skops_write_hdr_opt().
+ */
+ switch (sk->sk_state) {
+ case TCP_SYN_RECV:
+ case TCP_SYN_SENT:
+ case TCP_LISTEN:
+ return;
}
+
+ sock_owned_by_me(sk);
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = BPF_SOCK_OPS_PARSE_HDR_OPT_CB;
+ sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
+ sock_ops.sk = sk;
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
+
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+}
+
+static void bpf_skops_established(struct sock *sk, int bpf_op,
+ struct sk_buff *skb)
+{
+ struct bpf_sock_ops_kern sock_ops;
+
+ sock_owned_by_me(sk);
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+ sock_ops.op = bpf_op;
+ sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
+ sock_ops.sk = sk;
+ /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */
+ if (skb)
+ bpf_skops_init_skb(&sock_ops, skb, tcp_hdrlen(skb));
+
+ BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops);
+}
+#else
+static void bpf_skops_parse_hdr(struct sock *sk, struct sk_buff *skb)
+{
+}
+
+static void bpf_skops_established(struct sock *sk, int bpf_op,
+ struct sk_buff *skb)
+{
+}
+#endif
+
+static __cold void tcp_gro_dev_warn(const struct sock *sk, const struct sk_buff *skb,
+ unsigned int len)
+{
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif);
+ if (!dev || len >= READ_ONCE(dev->mtu))
+ pr_warn("%s: Driver has suspect GRO implementation, TCP performance may be compromised.\n",
+ dev ? dev->name : "Unknown driver");
+ rcu_read_unlock();
}
/* Adapt the MSS value used to make delayed ack decision to the
@@ -166,12 +237,45 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
*/
len = skb_shinfo(skb)->gso_size ? : skb->len;
if (len >= icsk->icsk_ack.rcv_mss) {
+ /* Note: divides are still a bit expensive.
+ * For the moment, only adjust scaling_ratio
+ * when we update icsk_ack.rcv_mss.
+ */
+ if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
+ u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;
+ u8 old_ratio = tcp_sk(sk)->scaling_ratio;
+
+ do_div(val, skb->truesize);
+ tcp_sk(sk)->scaling_ratio = val ? val : 1;
+
+ if (old_ratio != tcp_sk(sk)->scaling_ratio) {
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ val = tcp_win_from_space(sk, sk->sk_rcvbuf);
+ tcp_set_window_clamp(sk, val);
+
+ if (tp->window_clamp < tp->rcvq_space.space)
+ tp->rcvq_space.space = tp->window_clamp;
+ }
+ }
icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
tcp_sk(sk)->advmss);
/* Account for possibly-removed options */
- if (unlikely(len > icsk->icsk_ack.rcv_mss +
- MAX_TCP_OPTION_SPACE))
- tcp_gro_dev_warn(sk, skb, len);
+ DO_ONCE_LITE_IF(len > icsk->icsk_ack.rcv_mss + MAX_TCP_OPTION_SPACE,
+ tcp_gro_dev_warn, sk, skb, len);
+ /* If the skb has a len of exactly 1*MSS and has the PSH bit
+ * set then it is likely the end of an application write. So
+ * more data may not be arriving soon, and yet the data sender
+ * may be waiting for an ACK if cwnd-bound or using TX zero
+ * copy. So we set ICSK_ACK_PUSHED here so that
+ * tcp_cleanup_rbuf() will send an ACK immediately if the app
+ * reads all of the data and is not ping-pong. If len > MSS
+ * then this logic does not matter (and does not hurt) because
+ * tcp_cleanup_rbuf() will always ACK immediately if the app
+ * reads data and there is more than an MSS of unACKed data.
+ */
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_PSH)
+ icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
} else {
/* Otherwise, we make more careful check taking into account,
* that SACKs block is variable.
@@ -216,15 +320,14 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
icsk->icsk_ack.quick = quickacks;
}
-void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
+static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_incr_quickack(sk, max_quickacks);
- icsk->icsk_ack.pingpong = 0;
+ inet_csk_exit_pingpong_mode(sk);
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
-EXPORT_SYMBOL(tcp_enter_quickack_mode);
/* Send ACKs quickly, if "quick" count is not exhausted
* and the session is not interactive.
@@ -233,40 +336,18 @@ EXPORT_SYMBOL(tcp_enter_quickack_mode);
static bool tcp_in_quickack_mode(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- const struct dst_entry *dst = __sk_dst_get(sk);
- return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
- (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
+ return icsk->icsk_ack.dst_quick_ack ||
+ (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
}
-static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
-{
- if (tp->ecn_flags & TCP_ECN_OK)
- tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
-}
-
-static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
-{
- if (tcp_hdr(skb)->cwr) {
- tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
-
- /* If the sender is telling us it has entered CWR, then its
- * cwnd may be very low (even just 1 packet), so we should ACK
- * immediately.
- */
- inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
- }
-}
-
-static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
-{
- tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
-}
-
-static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
+static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ if (tcp_ecn_disabled(tp))
+ return;
+
switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
case INET_ECN_NOT_ECT:
/* Funny extension: if ECT is not set on a segment,
@@ -280,44 +361,222 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
if (tcp_ca_needs_ecn(sk))
tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
- if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
+ if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) &&
+ tcp_ecn_mode_rfc3168(tp)) {
/* Better not delay acks, sender can have a very low cwnd */
tcp_enter_quickack_mode(sk, 2);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
+ /* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by
+ * tcp_data_ecn_check() when the ECN codepoint of
+ * received TCP data contains ECT(0), ECT(1), or CE.
+ */
+ if (!tcp_ecn_mode_rfc3168(tp))
+ break;
tp->ecn_flags |= TCP_ECN_SEEN;
break;
default:
if (tcp_ca_needs_ecn(sk))
tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
+ if (!tcp_ecn_mode_rfc3168(tp))
+ break;
tp->ecn_flags |= TCP_ECN_SEEN;
break;
}
}
-static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
+/* Returns true if the byte counters can be used */
+static bool tcp_accecn_process_option(struct tcp_sock *tp,
+ const struct sk_buff *skb,
+ u32 delivered_bytes, int flag)
{
- if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
- __tcp_ecn_check_ce(sk, skb);
+ u8 estimate_ecnfield = tp->est_ecnfield;
+ bool ambiguous_ecn_bytes_incr = false;
+ bool first_changed = false;
+ unsigned int optlen;
+ bool order1, res;
+ unsigned int i;
+ u8 *ptr;
+
+ if (tcp_accecn_opt_fail_recv(tp))
+ return false;
+
+ if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) {
+ if (!tp->saw_accecn_opt) {
+ /* Too late to enable after this point due to
+ * potential counter wraps
+ */
+ if (tp->bytes_sent >= (1 << 23) - 1) {
+ u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN;
+
+ tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
+ }
+ return false;
+ }
+
+ if (estimate_ecnfield) {
+ u8 ecnfield = estimate_ecnfield - 1;
+
+ tp->delivered_ecn_bytes[ecnfield] += delivered_bytes;
+ return true;
+ }
+ return false;
+ }
+
+ ptr = skb_transport_header(skb) + tp->rx_opt.accecn;
+ optlen = ptr[1] - 2;
+ if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1))
+ return false;
+ order1 = (ptr[0] == TCPOPT_ACCECN1);
+ ptr += 2;
+
+ if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ tp->saw_accecn_opt = tcp_accecn_option_init(skb,
+ tp->rx_opt.accecn);
+ if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV);
+ }
+
+ res = !!estimate_ecnfield;
+ for (i = 0; i < 3; i++) {
+ u32 init_offset;
+ u8 ecnfield;
+ s32 delta;
+ u32 *cnt;
+
+ if (optlen < TCPOLEN_ACCECN_PERFIELD)
+ break;
+
+ ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1);
+ init_offset = tcp_accecn_field_init_offset(ecnfield);
+ cnt = &tp->delivered_ecn_bytes[ecnfield - 1];
+ delta = tcp_update_ecn_bytes(cnt, ptr, init_offset);
+ if (delta && delta < 0) {
+ res = false;
+ ambiguous_ecn_bytes_incr = true;
+ }
+ if (delta && ecnfield != estimate_ecnfield) {
+ if (!first_changed) {
+ tp->est_ecnfield = ecnfield;
+ first_changed = true;
+ } else {
+ res = false;
+ ambiguous_ecn_bytes_incr = true;
+ }
+ }
+
+ optlen -= TCPOLEN_ACCECN_PERFIELD;
+ ptr += TCPOLEN_ACCECN_PERFIELD;
+ }
+ if (ambiguous_ecn_bytes_incr)
+ tp->est_ecnfield = 0;
+
+ return res;
}
-static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
+static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count)
{
- if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
- tp->ecn_flags &= ~TCP_ECN_OK;
+ tp->delivered_ce += ecn_count;
}
-static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
+/* Updates the delivered and delivered_ce counts */
+static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
+ bool ece_ack)
{
- if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
- tp->ecn_flags &= ~TCP_ECN_OK;
+ tp->delivered += delivered;
+ if (tcp_ecn_mode_rfc3168(tp) && ece_ack)
+ tcp_count_delivered_ce(tp, delivered);
}
-static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
+/* Returns the ECN CE delta */
+static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
+ u32 delivered_pkts, u32 delivered_bytes,
+ int flag)
{
- if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
- return true;
- return false;
+ u32 old_ceb = tcp_sk(sk)->delivered_ecn_bytes[INET_ECN_CE - 1];
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 delta, safe_delta, d_ceb;
+ bool opt_deltas_valid;
+ u32 corrected_ace;
+
+ /* Reordered ACK, or uncertain due to lack of data to send and timestamps */
+ if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS)))
+ return 0;
+
+ opt_deltas_valid = tcp_accecn_process_option(tp, skb,
+ delivered_bytes, flag);
+
+ if (!(flag & FLAG_SLOWPATH)) {
+ /* AccECN counter might overflow on large ACKs */
+ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
+ return 0;
+ }
+
+ /* ACE field is not available during handshake */
+ if (flag & FLAG_SYN_ACKED)
+ return 0;
+
+ if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA)
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+
+ corrected_ace = tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET;
+ delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK;
+ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
+ return delta;
+
+ safe_delta = delivered_pkts -
+ ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK);
+
+ if (opt_deltas_valid) {
+ d_ceb = tp->delivered_ecn_bytes[INET_ECN_CE - 1] - old_ceb;
+ if (!d_ceb)
+ return delta;
+
+ if ((delivered_pkts >= (TCP_ACCECN_CEP_ACE_MASK + 1) * 2) &&
+ (tcp_is_sack(tp) ||
+ ((1 << inet_csk(sk)->icsk_ca_state) &
+ (TCPF_CA_Open | TCPF_CA_CWR)))) {
+ u32 est_d_cep;
+
+ if (delivered_bytes <= d_ceb)
+ return safe_delta;
+
+ est_d_cep = DIV_ROUND_UP_ULL((u64)d_ceb *
+ delivered_pkts,
+ delivered_bytes);
+ return min(safe_delta,
+ delta +
+ (est_d_cep & ~TCP_ACCECN_CEP_ACE_MASK));
+ }
+
+ if (d_ceb > delta * tp->mss_cache)
+ return safe_delta;
+ if (d_ceb <
+ safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT)
+ return delta;
+ }
+
+ return safe_delta;
+}
+
+static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
+ u32 delivered_pkts, u32 delivered_bytes,
+ int *flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 delta;
+
+ delta = __tcp_accecn_process(sk, skb, delivered_pkts,
+ delivered_bytes, *flag);
+ if (delta > 0) {
+ tcp_count_delivered_ce(tp, delta);
+ *flag |= FLAG_ECE;
+ /* Recalculate header predictor */
+ if (tp->pred_flags)
+ tcp_fast_path_on(tp);
+ }
+ return delta;
}
/* Buffer size and advertised window tuning.
@@ -342,7 +601,7 @@ static void tcp_sndbuf_expand(struct sock *sk)
per_mss = roundup_pow_of_two(per_mss) +
SKB_DATA_ALIGN(sizeof(struct sk_buff));
- nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd);
+ nr_segs = max_t(u32, TCP_INIT_CWND, tcp_snd_cwnd(tp));
nr_segs = max_t(u32, nr_segs, tp->reordering + 1);
/* Fast Recovery (RFC 5681 3.2) :
@@ -353,7 +612,8 @@ static void tcp_sndbuf_expand(struct sock *sk)
sndmem *= nr_segs * per_mss;
if (sk->sk_sndbuf < sndmem)
- sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
+ WRITE_ONCE(sk->sk_sndbuf,
+ min(sndmem, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[2])));
}
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
@@ -382,12 +642,13 @@ static void tcp_sndbuf_expand(struct sock *sk)
*/
/* Slow part of check#2. */
-static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
+static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb,
+ unsigned int skbtruesize)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
/* Optimize this! */
- int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
- int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
+ int truesize = tcp_win_from_space(sk, skbtruesize) >> 1;
+ int window = tcp_win_from_space(sk, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])) >> 1;
while (tp->rcv_ssthresh <= window) {
if (truesize <= skb->len)
@@ -399,67 +660,74 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
return 0;
}
-static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
+/* Even if skb appears to have a bad len/truesize ratio, TCP coalescing
+ * can play nice with us, as sk_buff and skb->head might be either
+ * freed or shared with up to MAX_SKB_FRAGS segments.
+ * Only give a boost to drivers using page frag(s) to hold the frame(s),
+ * and if no payload was pulled in skb->head before reaching us.
+ */
+static u32 truesize_adjust(bool adjust, const struct sk_buff *skb)
+{
+ u32 truesize = skb->truesize;
+
+ if (adjust && !skb_headlen(skb)) {
+ truesize -= SKB_TRUESIZE(skb_end_offset(skb));
+ /* paranoid check, some drivers might be buggy */
+ if (unlikely((int)truesize < (int)skb->len))
+ truesize = skb->truesize;
+ }
+ return truesize;
+}
+
+static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb,
+ bool adjust)
{
struct tcp_sock *tp = tcp_sk(sk);
+ int room;
+
+ room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh;
+
+ if (room <= 0)
+ return;
/* Check #1 */
- if (tp->rcv_ssthresh < tp->window_clamp &&
- (int)tp->rcv_ssthresh < tcp_space(sk) &&
- !tcp_under_memory_pressure(sk)) {
+ if (!tcp_under_memory_pressure(sk)) {
+ unsigned int truesize = truesize_adjust(adjust, skb);
int incr;
/* Check #2. Increase window, if skb with such overhead
* will fit to rcvbuf in future.
*/
- if (tcp_win_from_space(sk, skb->truesize) <= skb->len)
+ if (tcp_win_from_space(sk, truesize) <= skb->len)
incr = 2 * tp->advmss;
else
- incr = __tcp_grow_window(sk, skb);
+ incr = __tcp_grow_window(sk, skb, truesize);
if (incr) {
incr = max_t(int, incr, 2 * skb->len);
- tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
- tp->window_clamp);
+ tp->rcv_ssthresh += min(room, incr);
inet_csk(sk)->icsk_ack.quick |= 1;
}
+ } else {
+ /* Under pressure:
+ * Adjust rcv_ssthresh according to reserved mem
+ */
+ tcp_adjust_rcv_ssthresh(sk);
}
}
-/* 3. Tuning rcvbuf, when connection enters established state. */
-static void tcp_fixup_rcvbuf(struct sock *sk)
-{
- u32 mss = tcp_sk(sk)->advmss;
- int rcvmem;
-
- rcvmem = 2 * SKB_TRUESIZE(mss + MAX_TCP_HEADER) *
- tcp_default_init_rwnd(mss);
-
- /* Dynamic Right Sizing (DRS) has 2 to 3 RTT latency
- * Allow enough cushion so that sender is not limited by our window
- */
- if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf)
- rcvmem <<= 2;
-
- if (sk->sk_rcvbuf < rcvmem)
- sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
-}
-
-/* 4. Try to fixup all. It is made immediately after connection enters
+/* 3. Try to fixup all. It is made immediately after connection enters
* established state.
*/
-void tcp_init_buffer_space(struct sock *sk)
+static void tcp_init_buffer_space(struct sock *sk)
{
- int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
+ int tcp_app_win = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_app_win);
struct tcp_sock *tp = tcp_sk(sk);
int maxwin;
- if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
- tcp_fixup_rcvbuf(sk);
if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
tcp_sndbuf_expand(sk);
- tp->rcvq_space.space = tp->rcv_wnd;
tcp_mstamp_refresh(tp);
tp->rcvq_space.time = tp->tcp_mstamp;
tp->rcvq_space.seq = tp->copied_seq;
@@ -467,39 +735,44 @@ void tcp_init_buffer_space(struct sock *sk)
maxwin = tcp_full_space(sk);
if (tp->window_clamp >= maxwin) {
- tp->window_clamp = maxwin;
+ WRITE_ONCE(tp->window_clamp, maxwin);
if (tcp_app_win && maxwin > 4 * tp->advmss)
- tp->window_clamp = max(maxwin -
- (maxwin >> tcp_app_win),
- 4 * tp->advmss);
+ WRITE_ONCE(tp->window_clamp,
+ max(maxwin - (maxwin >> tcp_app_win),
+ 4 * tp->advmss));
}
/* Force reservation of one segment. */
if (tcp_app_win &&
tp->window_clamp > 2 * tp->advmss &&
tp->window_clamp + tp->advmss > maxwin)
- tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
+ WRITE_ONCE(tp->window_clamp,
+ max(2 * tp->advmss, maxwin - tp->advmss));
tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
tp->snd_cwnd_stamp = tcp_jiffies32;
+ tp->rcvq_space.space = min3(tp->rcv_ssthresh, tp->rcv_wnd,
+ (u32)TCP_INIT_CWND * tp->advmss);
}
-/* 5. Recalculate window clamp after socket hit its memory bounds. */
+/* 4. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct net *net = sock_net(sk);
+ int rmem2;
icsk->icsk_ack.quick = 0;
+ rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
- if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
+ if (sk->sk_rcvbuf < rmem2 &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
- sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
- net->ipv4.sysctl_tcp_rmem[2]);
+ WRITE_ONCE(sk->sk_rcvbuf,
+ min(atomic_read(&sk->sk_rmem_alloc), rmem2));
}
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
@@ -523,13 +796,13 @@ void tcp_initialize_rcv_mss(struct sock *sk)
inet_csk(sk)->icsk_ack.rcv_mss = hint;
}
-EXPORT_SYMBOL(tcp_initialize_rcv_mss);
+EXPORT_IPV6_MOD(tcp_initialize_rcv_mss);
/* Receiver "autotuning" code.
*
* The algorithm for RTT estimation w/o timestamps is based on
* Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
- * <http://public.lanl.gov/radiant/pubs.html#DRS>
+ * <https://public.lanl.gov/radiant/pubs.html#DRS>
*
* More detail on this code can be found at
* <http://staff.psc.edu/jheffner/>,
@@ -538,10 +811,12 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss);
*/
static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
{
- u32 new_sample = tp->rcv_rtt_est.rtt_us;
- long m = sample;
+ u32 new_sample, old_sample = tp->rcv_rtt_est.rtt_us;
+ long m = sample << 3;
- if (new_sample != 0) {
+ if (old_sample == 0 || m < old_sample) {
+ new_sample = m;
+ } else {
/* If we sample in larger samples in the non-timestamp
* case, we could grossly overestimate the RTT especially
* with chatty applications or bulk transfer apps which
@@ -552,17 +827,12 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
* else with timestamps disabled convergence takes too
* long.
*/
- if (!win_dep) {
- m -= (new_sample >> 3);
- new_sample += m;
- } else {
- m <<= 3;
- if (m < new_sample)
- new_sample = m;
- }
- } else {
- /* No previous measure. */
- new_sample = m << 3;
+ if (win_dep)
+ return;
+ /* Do not use this sample if receive queue is not empty. */
+ if (tp->rcv_nxt != tp->copied_seq)
+ return;
+ new_sample = old_sample - (old_sample >> 3) + sample;
}
tp->rcv_rtt_est.rtt_us = new_sample;
@@ -586,6 +856,23 @@ new_measure:
tp->rcv_rtt_est.time = tp->tcp_mstamp;
}
+static s32 tcp_rtt_tsopt_us(const struct tcp_sock *tp, u32 min_delta)
+{
+ u32 delta, delta_us;
+
+ delta = tcp_time_stamp_ts(tp) - tp->rx_opt.rcv_tsecr;
+ if (tp->tcp_usec_ts)
+ return delta;
+
+ if (likely(delta < INT_MAX / (USEC_PER_SEC / TCP_TS_HZ))) {
+ if (!delta)
+ delta = min_delta;
+ delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
+ return delta_us;
+ }
+ return -1;
+}
+
static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
const struct sk_buff *skb)
{
@@ -597,16 +884,58 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
if (TCP_SKB_CB(skb)->end_seq -
TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss) {
- u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
- u32 delta_us;
+ s32 delta = tcp_rtt_tsopt_us(tp, 0);
- if (!delta)
- delta = 1;
- delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
- tcp_rcv_rtt_update(tp, delta_us, 0);
+ if (delta > 0)
+ tcp_rcv_rtt_update(tp, delta, 0);
}
}
+void tcp_rcvbuf_grow(struct sock *sk, u32 newval)
+{
+ const struct net *net = sock_net(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 rcvwin, rcvbuf, cap, oldval;
+ u32 rtt_threshold, rtt_us;
+ u64 grow;
+
+ oldval = tp->rcvq_space.space;
+ tp->rcvq_space.space = newval;
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+ (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+ return;
+
+ /* DRS is always one RTT late. */
+ rcvwin = newval << 1;
+
+ rtt_us = tp->rcv_rtt_est.rtt_us >> 3;
+ rtt_threshold = READ_ONCE(net->ipv4.sysctl_tcp_rcvbuf_low_rtt);
+ if (rtt_us < rtt_threshold) {
+ /* For small RTT, we set @grow to rcvwin * rtt_us/rtt_threshold.
+ * It might take a few additional ms to reach 'line rate',
+ * but will avoid sk_rcvbuf inflation and poor cache use.
+ */
+ grow = div_u64((u64)rcvwin * rtt_us, rtt_threshold);
+ } else {
+ /* slow start: allow the sender to double its rate. */
+ grow = div_u64(((u64)rcvwin << 1) * (newval - oldval), oldval);
+ }
+ rcvwin += grow;
+
+ if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
+ rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;
+
+ cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+ rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap);
+ if (rcvbuf > sk->sk_rcvbuf) {
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+ /* Make the window clamp follow along. */
+ WRITE_ONCE(tp->window_clamp,
+ tcp_win_from_space(sk, rcvbuf));
+ }
+}
/*
* This function should be called every time data is copied to user space.
* It calculates the appropriate TCP receive buffer space.
@@ -614,66 +943,48 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
void tcp_rcv_space_adjust(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- u32 copied;
- int time;
+ int time, inq, copied;
trace_tcp_rcv_space_adjust(sk);
- tcp_mstamp_refresh(tp);
+ if (unlikely(!tp->rcv_rtt_est.rtt_us))
+ return;
+
+ /* We do not refresh tp->tcp_mstamp here.
+ * Some platforms have expensive ktime_get() implementations.
+ * Using the last cached value is enough for DRS.
+ */
time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time);
- if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0)
+ if (time < (tp->rcv_rtt_est.rtt_us >> 3))
return;
/* Number of bytes copied to user in last RTT */
copied = tp->copied_seq - tp->rcvq_space.seq;
+ /* Number of bytes in receive queue. */
+ inq = tp->rcv_nxt - tp->copied_seq;
+ copied -= inq;
if (copied <= tp->rcvq_space.space)
goto new_measure;
- /* A bit of theory :
- * copied = bytes received in previous RTT, our base window
- * To cope with packet losses, we need a 2x factor
- * To cope with slow start, and sender growing its cwin by 100 %
- * every RTT, we need a 4x factor, because the ACK we are sending
- * now is for the next RTT, not the current one :
- * <prev RTT . ><current RTT .. ><next RTT .... >
- */
-
- if (sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf &&
- !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int rcvmem, rcvbuf;
- u64 rcvwin, grow;
-
- /* minimal window to cope with packet losses, assuming
- * steady state. Add some cushion because of small variations.
- */
- rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
+ trace_tcp_rcvbuf_grow(sk, time);
- /* Accommodate for sender rate increase (eg. slow start) */
- grow = rcvwin * (copied - tp->rcvq_space.space);
- do_div(grow, tp->rcvq_space.space);
- rcvwin += (grow << 1);
-
- rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER);
- while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
- rcvmem += 128;
-
- do_div(rcvwin, tp->advmss);
- rcvbuf = min_t(u64, rcvwin * rcvmem,
- sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
- if (rcvbuf > sk->sk_rcvbuf) {
- sk->sk_rcvbuf = rcvbuf;
-
- /* Make the window clamp follow along. */
- tp->window_clamp = tcp_win_from_space(sk, rcvbuf);
- }
- }
- tp->rcvq_space.space = copied;
+ tcp_rcvbuf_grow(sk, copied);
new_measure:
tp->rcvq_space.seq = tp->copied_seq;
tp->rcvq_space.time = tp->tcp_mstamp;
}
+static void tcp_save_lrcv_flowlabel(struct sock *sk, const struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (skb->protocol == htons(ETH_P_IPV6))
+ icsk->icsk_ack.lrcv_flowlabel = ntohl(ip6_flowlabel(ipv6_hdr(skb)));
+#endif
+}
+
/* There is something which you must keep in mind when you analyze the
* behavior of the tp->ato delayed ack timeout interval. When a
* connection starts up, we want to ack as quickly as possible. The
@@ -719,15 +1030,15 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
* restart window, so that we send ACKs quickly.
*/
tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
- sk_mem_reclaim(sk);
}
}
icsk->icsk_ack.lrcvtime = now;
+ tcp_save_lrcv_flowlabel(sk, skb);
- tcp_ecn_check_ce(sk, skb);
+ tcp_data_ecn_check(sk, skb);
if (skb->len >= 128)
- tcp_grow_window(sk, skb);
+ tcp_grow_window(sk, skb, true);
}
/* Called to compute a smoothed rtt estimate. The data fed to this
@@ -791,6 +1102,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
tp->rtt_seq = tp->snd_nxt;
tp->mdev_max_us = tcp_rto_min_us(sk);
+
+ tcp_bpf_rtt(sk, mrtt_us, srtt);
}
} else {
/* no previous measure. */
@@ -799,11 +1112,13 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
tp->mdev_max_us = tp->rttvar_us;
tp->rtt_seq = tp->snd_nxt;
+
+ tcp_bpf_rtt(sk, mrtt_us, srtt);
}
tp->srtt_us = max(1U, srtt);
}
-static void tcp_update_pacing_rate(struct sock *sk)
+void tcp_update_pacing_rate(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
u64 rate;
@@ -819,12 +1134,12 @@ static void tcp_update_pacing_rate(struct sock *sk)
* If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching
* end of slow start and should slow down.
*/
- if (tp->snd_cwnd < tp->snd_ssthresh / 2)
- rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio;
+ if (tcp_snd_cwnd(tp) < tp->snd_ssthresh / 2)
+ rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio);
else
- rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio;
+ rate *= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio);
- rate *= max(tp->snd_cwnd, tp->packets_out);
+ rate *= max(tcp_snd_cwnd(tp), tp->packets_out);
if (likely(tp->srtt_us))
do_div(rate, tp->srtt_us);
@@ -833,14 +1148,14 @@ static void tcp_update_pacing_rate(struct sock *sk)
* without any lock. We want to make sure compiler wont store
* intermediate values in this location.
*/
- WRITE_ONCE(sk->sk_pacing_rate, min_t(u64, rate,
- sk->sk_max_pacing_rate));
+ WRITE_ONCE(sk->sk_pacing_rate,
+ min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)));
}
/* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
-static void tcp_set_rto(struct sock *sk)
+void tcp_set_rto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
/* Old crap is replaced with new one. 8)
@@ -876,12 +1191,64 @@ __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst)
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
-/* Take a notice that peer is sending D-SACKs */
-static void tcp_dsack_seen(struct tcp_sock *tp)
+struct tcp_sacktag_state {
+ /* Timestamps for earliest and latest never-retransmitted segment
+ * that was SACKed. RTO needs the earliest RTT to stay conservative,
+ * but congestion control should still get an accurate delay signal.
+ */
+ u64 first_sackt;
+ u64 last_sackt;
+ u32 reord;
+ u32 sack_delivered;
+ u32 delivered_bytes;
+ int flag;
+ unsigned int mss_now;
+ struct rate_sample *rate;
+};
+
+/* Take notice that the peer is sending D-SACKs. Skip the update of data
+ * delivery and spurious retransmission information if this DSACK is unlikely
+ * to have been caused by the sender's action:
+ * - DSACKed sequence range is larger than maximum receiver's window.
+ * - Total no. of DSACKed segments exceeds the total no. of retransmitted segs.
+ */
+static u32 tcp_dsack_seen(struct tcp_sock *tp, u32 start_seq,
+ u32 end_seq, struct tcp_sacktag_state *state)
{
+ u32 seq_len, dup_segs = 1;
+
+ if (!before(start_seq, end_seq))
+ return 0;
+
+ seq_len = end_seq - start_seq;
+ /* Dubious DSACK: DSACKed range greater than maximum advertised rwnd */
+ if (seq_len > tp->max_window)
+ return 0;
+ if (seq_len > tp->mss_cache)
+ dup_segs = DIV_ROUND_UP(seq_len, tp->mss_cache);
+ else if (tp->tlp_high_seq && tp->tlp_high_seq == end_seq)
+ state->flag |= FLAG_DSACK_TLP;
+
+ tp->dsack_dups += dup_segs;
+ /* Skip the DSACK if dup segs weren't retransmitted by sender */
+ if (tp->dsack_dups > tp->total_retrans)
+ return 0;
+
tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
- tp->rack.dsack_seen = 1;
- tp->dsack_dups++;
+ /* We increase the RACK ordering window in rounds where we receive
+ * DSACKs that may have been due to reordering causing RACK to trigger
+ * a spurious fast recovery. Thus RACK ignores DSACKs that happen
+ * without having seen reordering, or that match TLP probes (TLP
+ * is timer-driven, not triggered by RACK).
+ */
+ if (tp->reord_seen && !(state->flag & FLAG_DSACK_TLP))
+ tp->rack.dsack_seen = 1;
+
+ state->flag |= FLAG_DSACKING_ACK;
+ /* A spurious retransmission is delivered */
+ state->sack_delivered += dup_segs;
+
+ return dup_segs;
}
/* It's reordering when higher sequence was delivered (i.e. sacked) before
@@ -910,7 +1277,7 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
tp->undo_marker ? tp->undo_retrans : 0);
#endif
tp->reordering = min_t(u32, (metric + mss - 1) / mss,
- sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
}
/* This exciting event is worth to be remembered. 8) */
@@ -919,50 +1286,50 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
}
-/* This must be called before lost_out is incremented */
+ /* This must be called before lost_out or retrans_out are updated
+ * on a new loss, because we want to know if all skbs previously
+ * known to be lost have already been retransmitted, indicating
+ * that this newly lost skb is our next skb to retransmit.
+ */
static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
{
- if (!tp->retransmit_skb_hint ||
- before(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+ if ((!tp->retransmit_skb_hint && tp->retrans_out >= tp->lost_out) ||
+ (tp->retransmit_skb_hint &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq)))
tp->retransmit_skb_hint = skb;
}
-/* Sum the number of packets on the wire we have marked as lost.
- * There are two cases we care about here:
- * a) Packet hasn't been marked lost (nor retransmitted),
- * and this is the first loss.
- * b) Packet has been marked both lost and retransmitted,
- * and this means we think it was lost again.
+/* Sum the number of packets on the wire we have marked as lost, and
+ * notify the congestion control module that the given skb was marked lost.
*/
-static void tcp_sum_lost(struct tcp_sock *tp, struct sk_buff *skb)
+static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb)
{
- __u8 sacked = TCP_SKB_CB(skb)->sacked;
-
- if (!(sacked & TCPCB_LOST) ||
- ((sacked & TCPCB_LOST) && (sacked & TCPCB_SACKED_RETRANS)))
- tp->lost += tcp_skb_pcount(skb);
+ tp->lost += tcp_skb_pcount(skb);
}
-static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
- if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
- tcp_verify_retransmit_hint(tp, skb);
+ __u8 sacked = TCP_SKB_CB(skb)->sacked;
+ struct tcp_sock *tp = tcp_sk(sk);
- tp->lost_out += tcp_skb_pcount(skb);
- tcp_sum_lost(tp, skb);
- TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
- }
-}
+ if (sacked & TCPCB_SACKED_ACKED)
+ return;
-void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
-{
tcp_verify_retransmit_hint(tp, skb);
-
- tcp_sum_lost(tp, skb);
- if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
+ if (sacked & TCPCB_LOST) {
+ if (sacked & TCPCB_SACKED_RETRANS) {
+ /* Account for retransmits that are lost again */
+ TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
+ tcp_skb_pcount(skb));
+ tcp_notify_skb_loss_event(tp, skb);
+ }
+ } else {
tp->lost_out += tcp_skb_pcount(skb);
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ tcp_notify_skb_loss_event(tp, skb);
}
}
@@ -981,7 +1348,7 @@ void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
* L|R 1 - orig is lost, retransmit is in flight.
* S|R 1 - orig reached receiver, retrans is still in flight.
* (L|S|R is logically valid, it could occur when L|R is sacked,
- * but it is equivalent to plain S and code short-curcuits it to S.
+ * but it is equivalent to plain S and code short-circuits it to S.
* L|S is logically invalid, it would mean -1 packet in flight 8))
*
* These 6 states form finite state machine, controlled by the following events:
@@ -1098,52 +1465,43 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack,
static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
struct tcp_sack_block_wire *sp, int num_sacks,
- u32 prior_snd_una)
+ u32 prior_snd_una, struct tcp_sacktag_state *state)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
- bool dup_sack = false;
+ u32 dup_segs;
if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
- dup_sack = true;
- tcp_dsack_seen(tp);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECV);
} else if (num_sacks > 1) {
u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
- if (!after(end_seq_0, end_seq_1) &&
- !before(start_seq_0, start_seq_1)) {
- dup_sack = true;
- tcp_dsack_seen(tp);
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPDSACKOFORECV);
- }
+ if (after(end_seq_0, end_seq_1) || before(start_seq_0, start_seq_1))
+ return false;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKOFORECV);
+ } else {
+ return false;
}
+ dup_segs = tcp_dsack_seen(tp, start_seq_0, end_seq_0, state);
+ if (!dup_segs) { /* Skip dubious DSACK */
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDSACKIGNOREDDUBIOUS);
+ return false;
+ }
+
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPDSACKRECVSEGS, dup_segs);
+
/* D-SACK for already forgotten data... Do dumb counting. */
- if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
+ if (tp->undo_marker && tp->undo_retrans > 0 &&
!after(end_seq_0, prior_snd_una) &&
after(end_seq_0, tp->undo_marker))
- tp->undo_retrans--;
+ tp->undo_retrans = max_t(int, 0, tp->undo_retrans - dup_segs);
- return dup_sack;
+ return true;
}
-struct tcp_sacktag_state {
- u32 reord;
- /* Timestamps for earliest and latest never-retransmitted segment
- * that was SACKed. RTO needs the earliest RTT to stay conservative,
- * but congestion control should still get an accurate delay signal.
- */
- u64 first_sackt;
- u64 last_sackt;
- struct rate_sample *rate;
- int flag;
- unsigned int mss_now;
-};
-
/* Check if skb is fully within the SACK block. In presence of GSO skbs,
* the incoming SACK may not exactly match but we can find smaller MSS
* aligned portion of it that matches. Therefore we might need to fragment
@@ -1204,7 +1562,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
static u8 tcp_sacktag_one(struct sock *sk,
struct tcp_sacktag_state *state, u8 sacked,
u32 start_seq, u32 end_seq,
- int dup_sack, int pcount,
+ int dup_sack, int pcount, u32 plen,
u64 xmit_time)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -1213,7 +1571,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
if (dup_sack && (sacked & TCPCB_RETRANS)) {
if (tp->undo_marker && tp->undo_retrans > 0 &&
after(end_seq, tp->undo_marker))
- tp->undo_retrans--;
+ tp->undo_retrans = max_t(int, 0, tp->undo_retrans - pcount);
if ((sacked & TCPCB_SACKED_ACKED) &&
before(start_seq, state->reord))
state->reord = start_seq;
@@ -1262,12 +1620,9 @@ static u8 tcp_sacktag_one(struct sock *sk,
sacked |= TCPCB_SACKED_ACKED;
state->flag |= FLAG_DATA_SACKED;
tp->sacked_out += pcount;
- tp->delivered += pcount; /* Out-of-order packets delivered */
-
- /* Lost marker hint past SACKed? Tweak RFC3517 cnt */
- if (tp->lost_skb_hint &&
- before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq))
- tp->lost_cnt_hint += pcount;
+ /* Out-of-order packets delivered */
+ state->sack_delivered += pcount;
+ state->delivered_bytes += plen;
}
/* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1304,18 +1659,15 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
* tcp_highest_sack_seq() when skb is highest_sack.
*/
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
- start_seq, end_seq, dup_sack, pcount,
- skb->skb_mstamp);
+ start_seq, end_seq, dup_sack, pcount, skb->len,
+ tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
- if (skb == tp->lost_skb_hint)
- tp->lost_cnt_hint += pcount;
-
TCP_SKB_CB(prev)->end_seq += shifted;
TCP_SKB_CB(skb)->seq += shifted;
tcp_skb_pcount_add(prev, pcount);
- BUG_ON(tcp_skb_pcount(skb) < pcount);
+ WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount);
tcp_skb_pcount_add(skb, -pcount);
/* When we're adding to gso_segs == 1, gso_size will be zero,
@@ -1343,10 +1695,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
if (skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = prev;
- if (skb == tp->lost_skb_hint) {
- tp->lost_skb_hint = prev;
- tp->lost_cnt_hint -= tcp_skb_pcount(prev);
- }
TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor;
@@ -1381,6 +1729,21 @@ static int skb_can_shift(const struct sk_buff *skb)
return !skb_headlen(skb) && skb_is_nonlinear(skb);
}
+int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from,
+ int pcount, int shiftlen)
+{
+ /* TCP min gso_size is 8 bytes (TCP_MIN_GSO_SIZE)
+ * Since TCP_SKB_CB(skb)->tcp_gso_segs is 16 bits, we need
+ * to make sure we do not store more than 65535 * 8 bytes per skb,
+ * even if current MSS is bigger.
+ */
+ if (unlikely(to->len + shiftlen >= 65535 * TCP_MIN_GSO_SIZE))
+ return 0;
+ if (unlikely(tcp_skb_pcount(to) + pcount > 65535))
+ return 0;
+ return skb_shift(to, from, shiftlen);
+}
+
/* Try collapsing SACK blocks spanning across multiple skbs to a single
* skb.
*/
@@ -1414,7 +1777,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED)
goto fallback;
- if (!tcp_skb_can_collapse_to(prev))
+ if (!tcp_skb_can_collapse(prev, skb))
goto fallback;
in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
@@ -1486,7 +1849,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una))
goto fallback;
- if (!skb_shift(prev, skb, len))
+ if (!tcp_skb_shift(prev, skb, pcount, len))
goto fallback;
if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack))
goto out;
@@ -1503,12 +1866,13 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
(mss != tcp_skb_seglen(skb)))
goto out;
+ if (!tcp_skb_can_collapse(prev, skb))
+ goto out;
len = skb->len;
- if (skb_shift(prev, skb, len)) {
- pcount += tcp_skb_pcount(skb);
- tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb),
+ pcount = tcp_skb_pcount(skb);
+ if (tcp_skb_shift(prev, skb, pcount, len))
+ tcp_shifted_skb(sk, prev, skb, state, pcount,
len, mss, 0);
- }
out:
return prev;
@@ -1580,7 +1944,8 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
TCP_SKB_CB(skb)->end_seq,
dup_sack,
tcp_skb_pcount(skb),
- skb->skb_mstamp);
+ skb->len,
+ tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
list_del_init(&skb->tcp_tsorted_anchor);
@@ -1593,9 +1958,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
return skb;
}
-static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
- struct tcp_sacktag_state *state,
- u32 seq)
+static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, u32 seq)
{
struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node;
struct sk_buff *skb;
@@ -1617,13 +1980,12 @@ static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk,
}
static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk,
- struct tcp_sacktag_state *state,
u32 skip_to_seq)
{
if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq))
return skb;
- return tcp_sacktag_bsearch(sk, state, skip_to_seq);
+ return tcp_sacktag_bsearch(sk, skip_to_seq);
}
static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
@@ -1636,7 +1998,7 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb,
return skb;
if (before(next_dup->start_seq, skip_to_seq)) {
- skb = tcp_sacktag_skip(skb, sk, state, next_dup->start_seq);
+ skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq);
skb = tcp_sacktag_walk(skb, sk, NULL, state,
next_dup->start_seq, next_dup->end_seq,
1);
@@ -1674,11 +2036,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
tcp_highest_sack_reset(sk);
found_dup_sack = tcp_check_dsack(sk, ack_skb, sp_wire,
- num_sacks, prior_snd_una);
- if (found_dup_sack) {
- state->flag |= FLAG_DSACKING_ACK;
- tp->delivered++; /* A spurious retransmission is delivered */
- }
+ num_sacks, prior_snd_una, state);
/* Eliminate too old ACKs, but take into
* account more or less fresh ones, they can
@@ -1723,8 +2081,11 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
}
/* Ignore very old stuff early */
- if (!after(sp[used_sacks].end_seq, prior_snd_una))
+ if (!after(sp[used_sacks].end_seq, prior_snd_una)) {
+ if (i == 0)
+ first_sack_index = -1;
continue;
+ }
used_sacks++;
}
@@ -1777,8 +2138,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
/* Head todo? */
if (before(start_seq, cache->start_seq)) {
- skb = tcp_sacktag_skip(skb, sk, state,
- start_seq);
+ skb = tcp_sacktag_skip(skb, sk, start_seq);
skb = tcp_sacktag_walk(skb, sk, next_dup,
state,
start_seq,
@@ -1804,7 +2164,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
goto walk;
}
- skb = tcp_sacktag_skip(skb, sk, state, cache->end_seq);
+ skb = tcp_sacktag_skip(skb, sk, cache->end_seq);
/* Check overlap against next cached too (past this one already) */
cache++;
continue;
@@ -1815,7 +2175,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (!skb)
break;
}
- skb = tcp_sacktag_skip(skb, sk, state, start_seq);
+ skb = tcp_sacktag_skip(skb, sk, start_seq);
walk:
skb = tcp_sacktag_walk(skb, sk, next_dup, state,
@@ -1877,34 +2237,39 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
return;
tp->reordering = min_t(u32, tp->packets_out + addend,
- sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_max_reordering));
tp->reord_seen++;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
}
/* Emulate SACKs for SACKless connection: account for a new dupack. */
-static void tcp_add_reno_sack(struct sock *sk)
+static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
{
- struct tcp_sock *tp = tcp_sk(sk);
- u32 prior_sacked = tp->sacked_out;
+ if (num_dupack) {
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_sacked = tp->sacked_out;
+ s32 delivered;
- tp->sacked_out++;
- tcp_check_reno_reordering(sk, 0);
- if (tp->sacked_out > prior_sacked)
- tp->delivered++; /* Some out-of-order packet is delivered */
- tcp_verify_left_out(tp);
+ tp->sacked_out += num_dupack;
+ tcp_check_reno_reordering(sk, 0);
+ delivered = tp->sacked_out - prior_sacked;
+ if (delivered > 0)
+ tcp_count_delivered(tp, delivered, ece_ack);
+ tcp_verify_left_out(tp);
+ }
}
/* Account for ACK, ACKing some data in Reno Recovery phase. */
-static void tcp_remove_reno_sacks(struct sock *sk, int acked)
+static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
{
struct tcp_sock *tp = tcp_sk(sk);
if (acked > 0) {
/* One ACK acked hole. The rest eat duplicate ACKs. */
- tp->delivered += max_t(int, acked - tp->sacked_out, 1);
+ tcp_count_delivered(tp, max_t(int, acked - tp->sacked_out, 1),
+ ece_ack);
if (acked - 1 >= tp->sacked_out)
tp->sacked_out = 0;
else
@@ -1926,18 +2291,25 @@ void tcp_clear_retrans(struct tcp_sock *tp)
tp->undo_marker = 0;
tp->undo_retrans = -1;
tp->sacked_out = 0;
+ tp->rto_stamp = 0;
+ tp->total_rto = 0;
+ tp->total_rto_recoveries = 0;
+ tp->total_rto_time = 0;
}
static inline void tcp_init_undo(struct tcp_sock *tp)
{
tp->undo_marker = tp->snd_una;
- /* Retransmission still in flight may cause DSACKs later. */
- tp->undo_retrans = tp->retrans_out ? : -1;
-}
-static bool tcp_is_rack(const struct sock *sk)
-{
- return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
+ /* Retransmission still in flight may cause DSACKs later. */
+ /* First, account for regular retransmits in flight: */
+ tp->undo_retrans = tp->retrans_out;
+ /* Next, account for TLP retransmits in flight: */
+ if (tp->tlp_high_seq && tp->tlp_retrans)
+ tp->undo_retrans++;
+ /* Finally, avoid 0, because undo_retrans==0 means "can undo now": */
+ if (!tp->undo_retrans)
+ tp->undo_retrans = -1;
}
/* If we detect SACK reneging, forget all SACK information
@@ -1965,8 +2337,7 @@ static void tcp_timeout_mark_lost(struct sock *sk)
skb_rbtree_walk_from(skb) {
if (is_reneg)
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
- else if (tcp_is_rack(sk) && skb != head &&
- tcp_rack_skb_timeout(tp, skb, 0) > 0)
+ else if (skb != head && tcp_rack_skb_timeout(tp, skb, 0) > 0)
continue; /* Don't mark recently sent ones lost yet */
tcp_mark_skb_lost(sk, skb);
}
@@ -1981,6 +2352,7 @@ void tcp_enter_loss(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
+ u8 reordering;
tcp_timeout_mark_lost(sk);
@@ -1989,31 +2361,34 @@ void tcp_enter_loss(struct sock *sk)
!after(tp->high_seq, tp->snd_una) ||
(icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
- tp->prior_cwnd = tp->snd_cwnd;
+ tp->prior_cwnd = tcp_snd_cwnd(tp);
tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tcp_ca_event(sk, CA_EVENT_LOSS);
tcp_init_undo(tp);
}
- tp->snd_cwnd = tcp_packets_in_flight(tp) + 1;
+ tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + 1);
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_jiffies32;
/* Timeout in disordered state after receiving substantial DUPACKs
* suggests that the degree of reordering is over-estimated.
*/
+ reordering = READ_ONCE(net->ipv4.sysctl_tcp_reordering);
if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
- tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
+ tp->sacked_out >= reordering)
tp->reordering = min_t(unsigned int, tp->reordering,
- net->ipv4.sysctl_tcp_reordering);
+ reordering);
+
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
+ tp->tlp_high_seq = 0;
tcp_ecn_queue_cwr(tp);
/* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
* loss recovery is underway except recurring timeout(s) on
* the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
*/
- tp->frto = net->ipv4.sysctl_tcp_frto &&
+ tp->frto = READ_ONCE(net->ipv4.sysctl_tcp_frto) &&
(new_recovery || icsk->icsk_retransmits) &&
!inet_csk(sk)->icsk_mtup.probe_size;
}
@@ -2028,36 +2403,21 @@ void tcp_enter_loss(struct sock *sk)
* restore sanity to the SACK scoreboard. If the apparent reneging
* persists until this RTO then we'll clear the SACK scoreboard.
*/
-static bool tcp_check_sack_reneging(struct sock *sk, int flag)
+static bool tcp_check_sack_reneging(struct sock *sk, int *ack_flag)
{
- if (flag & FLAG_SACK_RENEGING) {
+ if (*ack_flag & FLAG_SACK_RENEGING &&
+ *ack_flag & FLAG_SND_UNA_ADVANCED) {
struct tcp_sock *tp = tcp_sk(sk);
unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4),
msecs_to_jiffies(10));
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- delay, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, false);
+ *ack_flag &= ~FLAG_SET_XMIT_TIMER;
return true;
}
return false;
}
-/* Heurestics to calculate number of duplicate ACKs. There's no dupACKs
- * counter when SACK is enabled (without SACK, sacked_out is used for
- * that purpose).
- *
- * With reordering, holes may still be in flight, so RFC3517 recovery
- * uses pure sacked_out (total number of SACKed segments) even though
- * it violates the RFC that uses duplicate ACKs, often these are equal
- * but when e.g. out-of-window ACKs or packet duplication occurs,
- * they differ. Since neither occurs due to loss, TCP should really
- * ignore them.
- */
-static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
-{
- return tp->sacked_out + 1;
-}
-
/* Linux NewReno/SACK/ECN state machine.
* --------------------------------------
*
@@ -2110,13 +2470,7 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
*
* If the receiver supports SACK:
*
- * RFC6675/3517: It is the conventional algorithm. A packet is
- * considered lost if the number of higher sequence packets
- * SACKed is greater than or equal the DUPACK thoreshold
- * (reordering). This is implemented in tcp_mark_head_lost and
- * tcp_update_scoreboard.
- *
- * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
+ * RACK (RFC8985): RACK is a newer loss detection algorithm
* (2017-) that checks timing instead of counting DUPACKs.
* Essentially a packet is considered lost if it's not S/ACKed
* after RTT + reordering_window, where both metrics are
@@ -2131,8 +2485,8 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
* is lost (NewReno). This heuristics are the same in NewReno
* and SACK.
*
- * Really tricky (and requiring careful tuning) part of algorithm
- * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
+ * The really tricky (and requiring careful tuning) part of the algorithm
+ * is hidden in the RACK code in tcp_recovery.c and tcp_xmit_retransmit_queue().
* The first determines the moment _when_ we should reduce CWND and,
* hence, slow down forward transmission. In fact, it determines the moment
* when we decide that hole is caused by loss, rather than by a reorder.
@@ -2155,99 +2509,10 @@ static inline int tcp_dupack_heuristics(const struct tcp_sock *tp)
* Main question: may we further continue forward transmission
* with the same cwnd?
*/
-static bool tcp_time_to_recover(struct sock *sk, int flag)
+static bool tcp_time_to_recover(const struct tcp_sock *tp)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
- /* Trick#1: The loss is proven. */
- if (tp->lost_out)
- return true;
-
- /* Not-A-Trick#2 : Classic rule... */
- if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
- return true;
-
- return false;
-}
-
-/* Detect loss in event "A" above by marking head of queue up as lost.
- * For non-SACK(Reno) senders, the first "packets" number of segments
- * are considered lost. For RFC3517 SACK, a segment is considered lost if it
- * has at least tp->reordering SACKed seqments above it; "packets" refers to
- * the maximum SACKed segments to pass before reaching this limit.
- */
-static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct sk_buff *skb;
- int cnt, oldcnt, lost;
- unsigned int mss;
- /* Use SACK to deduce losses of new sequences sent during recovery */
- const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
-
- WARN_ON(packets > tp->packets_out);
- skb = tp->lost_skb_hint;
- if (skb) {
- /* Head already handled? */
- if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una))
- return;
- cnt = tp->lost_cnt_hint;
- } else {
- skb = tcp_rtx_queue_head(sk);
- cnt = 0;
- }
-
- skb_rbtree_walk_from(skb) {
- /* TODO: do this better */
- /* this is not the most efficient way to do this... */
- tp->lost_skb_hint = skb;
- tp->lost_cnt_hint = cnt;
-
- if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
- break;
-
- oldcnt = cnt;
- if (tcp_is_reno(tp) ||
- (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
- cnt += tcp_skb_pcount(skb);
-
- if (cnt > packets) {
- if (tcp_is_sack(tp) ||
- (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
- (oldcnt >= packets))
- break;
-
- mss = tcp_skb_mss(skb);
- /* If needed, chop off the prefix to mark as lost. */
- lost = (packets - oldcnt) * mss;
- if (lost < skb->len &&
- tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
- lost, mss, GFP_ATOMIC) < 0)
- break;
- cnt = packets;
- }
-
- tcp_skb_mark_lost(tp, skb);
-
- if (mark_head)
- break;
- }
- tcp_verify_left_out(tp);
-}
-
-/* Account newly detected lost packet(s) */
-
-static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (tcp_is_sack(tp)) {
- int sacked_upto = tp->sacked_out - tp->reordering;
- if (sacked_upto >= 0)
- tcp_mark_head_lost(sk, sacked_upto, 0);
- else if (fast_rexmit)
- tcp_mark_head_lost(sk, 1, 1);
- }
+ /* Has loss detection marked at least one packet lost? */
+ return tp->lost_out != 0;
}
static bool tcp_tsopt_ecr_before(const struct tcp_sock *tp, u32 when)
@@ -2263,7 +2528,7 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
const struct sk_buff *skb)
{
return (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) &&
- tcp_tsopt_ecr_before(tp, tcp_skb_timestamp(skb));
+ tcp_tsopt_ecr_before(tp, tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
}
/* Nothing was retransmitted or returned timestamp is less
@@ -2271,8 +2536,35 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp,
*/
static inline bool tcp_packet_delayed(const struct tcp_sock *tp)
{
- return !tp->retrans_stamp ||
- tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
+ const struct sock *sk = (const struct sock *)tp;
+
+ /* Received an echoed timestamp before the first retransmission? */
+ if (tp->retrans_stamp)
+ return tcp_tsopt_ecr_before(tp, tp->retrans_stamp);
+
+ /* We set tp->retrans_stamp upon the first retransmission of a loss
+ * recovery episode, so normally if tp->retrans_stamp is 0 then no
+ * retransmission has happened yet (likely due to TSQ, which can cause
+ * fast retransmits to be delayed). So if snd_una advanced while
+ * tp->retrans_stamp is 0 then apparently a packet was merely delayed,
+ * not lost. But there are exceptions where we retransmit but then
+ * clear tp->retrans_stamp, so we check for those exceptions.
+ */
+
+ /* (1) For non-SACK connections, tcp_is_non_sack_preventing_reopen()
+ * clears tp->retrans_stamp when snd_una == high_seq.
+ */
+ if (!tcp_is_sack(tp) && !before(tp->snd_una, tp->high_seq))
+ return false;
+
+ /* (2) In TCP_SYN_SENT tcp_clean_rtx_queue() clears tp->retrans_stamp
+ * when FLAG_SYN_ACKED is set, even if the SYN was
+ * retransmitted.
+ */
+ if (sk->sk_state == TCP_SYN_SENT)
+ return false;
+
+ return true; /* tp->retrans_stamp is zero; no retransmit yet */
}
/* Undo procedures. */
@@ -2306,6 +2598,16 @@ static bool tcp_any_retrans_done(const struct sock *sk)
return false;
}
+/* If loss recovery is finished and there are no retransmits out in the
+ * network, then we clear retrans_stamp so that upon the next loss recovery
+ * retransmits_timed_out() and timestamp-undo are using the correct value.
+ */
+static void tcp_retrans_stamp_cleanup(struct sock *sk)
+{
+ if (!tcp_any_retrans_done(sk))
+ tcp_sk(sk)->retrans_stamp = 0;
+}
+
static void DBGUNDO(struct sock *sk, const char *msg)
{
#if FASTRETRANS_DEBUG > 1
@@ -2316,7 +2618,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n",
msg,
&inet->inet_daddr, ntohs(inet->inet_dport),
- tp->snd_cwnd, tcp_left_out(tp),
+ tcp_snd_cwnd(tp), tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
}
@@ -2325,7 +2627,7 @@ static void DBGUNDO(struct sock *sk, const char *msg)
pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
msg,
&sk->sk_v6_daddr, ntohs(inet->inet_dport),
- tp->snd_cwnd, tcp_left_out(tp),
+ tcp_snd_cwnd(tp), tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
}
@@ -2350,7 +2652,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
if (tp->prior_ssthresh) {
const struct inet_connection_sock *icsk = inet_csk(sk);
- tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk);
+ tcp_snd_cwnd_set(tp, icsk->icsk_ca_ops->undo_cwnd(sk));
if (tp->prior_ssthresh > tp->snd_ssthresh) {
tp->snd_ssthresh = tp->prior_ssthresh;
@@ -2367,6 +2669,21 @@ static inline bool tcp_may_undo(const struct tcp_sock *tp)
return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp));
}
+static bool tcp_is_non_sack_preventing_reopen(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
+ /* Hold old state until something *above* high_seq
+ * is ACKed. For Reno it is MUST to prevent false
+ * fast retransmits (RFC2582). SACK TCP is safe. */
+ if (!tcp_any_retrans_done(sk))
+ tp->retrans_stamp = 0;
+ return true;
+ }
+ return false;
+}
+
/* People celebrate: "We love our President!" */
static bool tcp_try_undo_recovery(struct sock *sk)
{
@@ -2389,14 +2706,8 @@ static bool tcp_try_undo_recovery(struct sock *sk)
} else if (tp->rack.reo_wnd_persist) {
tp->rack.reo_wnd_persist--;
}
- if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) {
- /* Hold old state until something *above* high_seq
- * is ACKed. For Reno it is MUST to prevent false
- * fast retransmits (RFC2582). SACK TCP is safe. */
- if (!tcp_any_retrans_done(sk))
- tp->retrans_stamp = 0;
+ if (tcp_is_non_sack_preventing_reopen(sk))
return true;
- }
tcp_set_ca_state(sk, TCP_CA_Open);
tp->is_sack_reneg = 0;
return false;
@@ -2431,7 +2742,9 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
if (frto_undo)
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPSPURIOUSRTOS);
- inet_csk(sk)->icsk_retransmits = 0;
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
+ if (tcp_is_non_sack_preventing_reopen(sk))
+ return true;
if (frto_undo || tcp_is_sack(tp)) {
tcp_set_ca_state(sk, TCP_CA_Open);
tp->is_sack_reneg = 0;
@@ -2447,7 +2760,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
* 1) If the packets in flight is larger than ssthresh, PRR spreads the
* cwnd reductions across a full RTT.
* 2) Otherwise PRR uses packet conservation to send as much as delivered.
- * But when the retransmits are acked without further losses, PRR
+ * But when SND_UNA is acked without further losses,
* slow starts cwnd up to ssthresh to speed up the recovery.
*/
static void tcp_init_cwnd_reduction(struct sock *sk)
@@ -2457,14 +2770,14 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
tp->high_seq = tp->snd_nxt;
tp->tlp_high_seq = 0;
tp->snd_cwnd_cnt = 0;
- tp->prior_cwnd = tp->snd_cwnd;
+ tp->prior_cwnd = tcp_snd_cwnd(tp);
tp->prr_delivered = 0;
tp->prr_out = 0;
tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
tcp_ecn_queue_cwr(tp);
}
-void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
+void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
@@ -2473,22 +2786,23 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag)
if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
return;
+ trace_tcp_cwnd_reduction_tp(sk, newly_acked_sacked, newly_lost, flag);
+
tp->prr_delivered += newly_acked_sacked;
if (delta < 0) {
u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered +
tp->prior_cwnd - 1;
sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out;
- } else if ((flag & FLAG_RETRANS_DATA_ACKED) &&
- !(flag & FLAG_LOST_RETRANS)) {
- sndcnt = min_t(int, delta,
- max_t(int, tp->prr_delivered - tp->prr_out,
- newly_acked_sacked) + 1);
} else {
- sndcnt = min(delta, newly_acked_sacked);
+ sndcnt = max_t(int, tp->prr_delivered - tp->prr_out,
+ newly_acked_sacked);
+ if (flag & FLAG_SND_UNA_ADVANCED && !newly_lost)
+ sndcnt++;
+ sndcnt = min(delta, sndcnt);
}
/* Force a fast retransmit upon entering fast recovery */
sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
- tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
+ tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + sndcnt);
}
static inline void tcp_end_cwnd_reduction(struct sock *sk)
@@ -2501,7 +2815,7 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
(inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) {
- tp->snd_cwnd = tp->snd_ssthresh;
+ tcp_snd_cwnd_set(tp, tp->snd_ssthresh);
tp->snd_cwnd_stamp = tcp_jiffies32;
}
tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR);
@@ -2565,12 +2879,15 @@ static void tcp_mtup_probe_success(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ u64 val;
- /* FIXME: breaks with very large cwnd */
tp->prior_ssthresh = tcp_current_ssthresh(sk);
- tp->snd_cwnd = tp->snd_cwnd *
- tcp_mss_to_mtu(sk, tp->mss_cache) /
- icsk->icsk_mtup.probe_size;
+
+ val = (u64)tcp_snd_cwnd(tp) * tcp_mss_to_mtu(sk, tp->mss_cache);
+ do_div(val, icsk->icsk_mtup.probe_size);
+ DEBUG_NET_WARN_ON_ONCE((u32)val != val);
+ tcp_snd_cwnd_set(tp, max_t(u32, 1U, val));
+
tp->snd_cwnd_cnt = 0;
tp->snd_cwnd_stamp = tcp_jiffies32;
tp->snd_ssthresh = tcp_current_ssthresh(sk);
@@ -2581,30 +2898,61 @@ static void tcp_mtup_probe_success(struct sock *sk)
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS);
}
+/* Sometimes we deduce that packets have been dropped due to reasons other than
+ * congestion, like path MTU reductions or failed client TFO attempts. In these
+ * cases we call this function to retransmit as many packets as cwnd allows,
+ * without reducing cwnd. Given that retransmits will set retrans_stamp to a
+ * non-zero value (and may do so in a later calling context due to TSQ), we
+ * also enter CA_Loss so that we track when all retransmitted packets are ACKed
+ * and clear retrans_stamp when that happens (to ensure later recurring RTOs
+ * are using the correct retrans_stamp and don't declare ETIMEDOUT
+ * prematurely).
+ */
+static void tcp_non_congestion_loss_retransmit(struct sock *sk)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (icsk->icsk_ca_state != TCP_CA_Loss) {
+ tp->high_seq = tp->snd_nxt;
+ tp->snd_ssthresh = tcp_current_ssthresh(sk);
+ tp->prior_ssthresh = 0;
+ tp->undo_marker = 0;
+ tcp_set_ca_state(sk, TCP_CA_Loss);
+ }
+ tcp_xmit_retransmit_queue(sk);
+}
+
/* Do a simple retransmit without using the backoff mechanisms in
* tcp_timer. This is used for path mtu discovery.
* The socket is already locked here.
*/
void tcp_simple_retransmit(struct sock *sk)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- unsigned int mss = tcp_current_mss(sk);
+ int mss;
+
+ /* A fastopen SYN request is stored as two separate packets within
+ * the retransmit queue, this is done by tcp_send_syn_data().
+ * As a result simply checking the MSS of the frames in the queue
+ * will not work for the SYN packet.
+ *
+ * Us being here is an indication of a path MTU issue so we can
+ * assume that the fastopen SYN was lost and just mark all the
+ * frames in the retransmit queue as lost. We will use an MSS of
+ * -1 to mark all frames as lost, otherwise compute the current MSS.
+ */
+ if (tp->syn_data && sk->sk_state == TCP_SYN_SENT)
+ mss = -1;
+ else
+ mss = tcp_current_mss(sk);
skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
- if (tcp_skb_seglen(skb) > mss &&
- !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- }
- tcp_skb_mark_lost_uncond_verify(tp, skb);
- }
+ if (tcp_skb_seglen(skb) > mss)
+ tcp_mark_skb_lost(sk, skb);
}
- tcp_clear_retrans_hints_partial(tp);
-
if (!tp->lost_out)
return;
@@ -2618,22 +2966,18 @@ void tcp_simple_retransmit(struct sock *sk)
* in network, but units changed and effective
* cwnd/ssthresh really reduced now.
*/
- if (icsk->icsk_ca_state != TCP_CA_Loss) {
- tp->high_seq = tp->snd_nxt;
- tp->snd_ssthresh = tcp_current_ssthresh(sk);
- tp->prior_ssthresh = 0;
- tp->undo_marker = 0;
- tcp_set_ca_state(sk, TCP_CA_Loss);
- }
- tcp_xmit_retransmit_queue(sk);
+ tcp_non_congestion_loss_retransmit(sk);
}
-EXPORT_SYMBOL(tcp_simple_retransmit);
+EXPORT_IPV6_MOD(tcp_simple_retransmit);
void tcp_enter_recovery(struct sock *sk, bool ece_ack)
{
struct tcp_sock *tp = tcp_sk(sk);
int mib_idx;
+ /* Start the clock with our fast retransmit, for undo and ETIMEDOUT. */
+ tcp_retrans_stamp_cleanup(sk);
+
if (tcp_is_reno(tp))
mib_idx = LINUX_MIB_TCPRENORECOVERY;
else
@@ -2652,16 +2996,24 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack)
tcp_set_ca_state(sk, TCP_CA_Recovery);
}
+static void tcp_update_rto_time(struct tcp_sock *tp)
+{
+ if (tp->rto_stamp) {
+ tp->total_rto_time += tcp_time_stamp_ms(tp) - tp->rto_stamp;
+ tp->rto_stamp = 0;
+ }
+}
+
/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
* recovered or spurious. Otherwise retransmits more on partial ACKs.
*/
-static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
+static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
int *rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
bool recovered = !before(tp->snd_una, tp->high_seq);
- if ((flag & FLAG_SND_UNA_ADVANCED) &&
+ if ((flag & FLAG_SND_UNA_ADVANCED || rcu_access_pointer(tp->fastopen_rsk)) &&
tcp_try_undo_loss(sk, false))
return;
@@ -2674,7 +3026,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
return;
if (after(tp->snd_nxt, tp->high_seq)) {
- if (flag & FLAG_DATA_SACKED || is_dupack)
+ if (flag & FLAG_DATA_SACKED || num_dupack)
tp->frto = 0; /* Step 3.a. loss was real */
} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
tp->high_seq = tp->snd_nxt;
@@ -2698,10 +3050,10 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
}
if (tcp_is_reno(tp)) {
/* A Reno DUPACK means new data in F-RTO step 2.b above are
- * delivered. Lower inflight to clock out (re)tranmissions.
+ * delivered. Lower inflight to clock out (re)transmissions.
*/
- if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
- tcp_add_reno_sack(sk);
+ if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
+ tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
else if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
}
@@ -2734,7 +3086,6 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
tcp_undo_cwnd_reduction(sk, true);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
tcp_try_keep_open(sk);
- return true;
}
return false;
}
@@ -2748,23 +3099,16 @@ static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
if (unlikely(tcp_is_reno(tp))) {
tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
- } else if (tcp_is_rack(sk)) {
+ } else {
u32 prior_retrans = tp->retrans_out;
- tcp_rack_mark_lost(sk);
+ if (tcp_rack_mark_lost(sk))
+ *ack_flag &= ~FLAG_SET_XMIT_TIMER;
if (prior_retrans > tp->retrans_out)
*ack_flag |= FLAG_LOST_RETRANS;
}
}
-static bool tcp_force_fast_retransmit(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- return after(tcp_highest_sack_seq(tp),
- tp->snd_una + tp->reordering * tp->mss_cache);
-}
-
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
@@ -2778,24 +3122,23 @@ static bool tcp_force_fast_retransmit(struct sock *sk)
* tcp_xmit_retransmit_queue().
*/
static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
- bool is_dupack, int *ack_flag, int *rexmit)
+ int num_dupack, int *ack_flag, int *rexmit)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- int fast_rexmit = 0, flag = *ack_flag;
- bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
- tcp_force_fast_retransmit(sk));
+ int flag = *ack_flag;
+ bool ece_ack = flag & FLAG_ECE;
if (!tp->packets_out && tp->sacked_out)
tp->sacked_out = 0;
/* Now state machine starts.
* A. ECE, hence prohibit cwnd undoing, the reduction is required. */
- if (flag & FLAG_ECE)
+ if (ece_ack)
tp->prior_ssthresh = 0;
/* B. In all the states check for reneging SACKs. */
- if (tcp_check_sack_reneging(sk, flag))
+ if (tcp_check_sack_reneging(sk, ack_flag))
return;
/* C. Check consistency of the current state. */
@@ -2804,7 +3147,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
/* D. Check state exit conditions. State can be terminated
* when high_seq is ACKed. */
if (icsk->icsk_ca_state == TCP_CA_Open) {
- WARN_ON(tp->retrans_out != 0);
+ WARN_ON(tp->retrans_out != 0 && !tp->syn_data);
tp->retrans_stamp = 0;
} else if (!before(tp->snd_una, tp->high_seq)) {
switch (icsk->icsk_ca_state) {
@@ -2831,42 +3174,46 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
switch (icsk->icsk_ca_state) {
case TCP_CA_Recovery:
if (!(flag & FLAG_SND_UNA_ADVANCED)) {
- if (tcp_is_reno(tp) && is_dupack)
- tcp_add_reno_sack(sk);
- } else {
- if (tcp_try_undo_partial(sk, prior_snd_una))
- return;
- /* Partial ACK arrived. Force fast retransmit. */
- do_lost = tcp_is_reno(tp) ||
- tcp_force_fast_retransmit(sk);
- }
- if (tcp_try_undo_dsack(sk)) {
- tcp_try_keep_open(sk);
+ if (tcp_is_reno(tp))
+ tcp_add_reno_sack(sk, num_dupack, ece_ack);
+ } else if (tcp_try_undo_partial(sk, prior_snd_una))
return;
- }
+
+ if (tcp_try_undo_dsack(sk))
+ tcp_try_to_open(sk, flag);
+
tcp_identify_packet_loss(sk, ack_flag);
+ if (icsk->icsk_ca_state != TCP_CA_Recovery) {
+ if (!tcp_time_to_recover(tp))
+ return;
+ /* Undo reverts the recovery state. If loss is evident,
+ * starts a new recovery (e.g. reordering then loss).
+ */
+ tcp_enter_recovery(sk, ece_ack);
+ }
break;
case TCP_CA_Loss:
- tcp_process_loss(sk, flag, is_dupack, rexmit);
+ tcp_process_loss(sk, flag, num_dupack, rexmit);
+ if (icsk->icsk_ca_state != TCP_CA_Loss)
+ tcp_update_rto_time(tp);
tcp_identify_packet_loss(sk, ack_flag);
if (!(icsk->icsk_ca_state == TCP_CA_Open ||
(*ack_flag & FLAG_LOST_RETRANS)))
return;
/* Change state if cwnd is undone or retransmits are lost */
- /* fall through */
+ fallthrough;
default:
if (tcp_is_reno(tp)) {
if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
- if (is_dupack)
- tcp_add_reno_sack(sk);
+ tcp_add_reno_sack(sk, num_dupack, ece_ack);
}
if (icsk->icsk_ca_state <= TCP_CA_Disorder)
tcp_try_undo_dsack(sk);
tcp_identify_packet_loss(sk, ack_flag);
- if (!tcp_time_to_recover(sk, flag)) {
+ if (!tcp_time_to_recover(tp)) {
tcp_try_to_open(sk, flag);
return;
}
@@ -2877,24 +3224,21 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
tp->snd_una == tp->mtu_probe.probe_seq_start) {
tcp_mtup_probe_failed(sk);
/* Restores the reduction we did in tcp_mtup_probe() */
- tp->snd_cwnd++;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
tcp_simple_retransmit(sk);
return;
}
/* Otherwise enter Recovery state */
- tcp_enter_recovery(sk, (flag & FLAG_ECE));
- fast_rexmit = 1;
+ tcp_enter_recovery(sk, ece_ack);
}
- if (!tcp_is_rack(sk) && do_lost)
- tcp_update_scoreboard(sk, fast_rexmit);
*rexmit = REXMIT_LOST;
}
static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us, const int flag)
{
- u32 wlen = sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen * HZ;
+ u32 wlen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_rtt_wlen) * HZ;
struct tcp_sock *tp = tcp_sk(sk);
if ((flag & FLAG_ACK_MAYBE_DELAYED) && rtt_us > tcp_min_rtt(tp)) {
@@ -2928,13 +3272,10 @@ static bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* left edge of the send window.
* See draft-ietf-tcplw-high-performance-00, section 3.3.
*/
- if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
- flag & FLAG_ACKED) {
- u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
- u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
+ if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp &&
+ tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED)
+ seq_rtt_us = ca_rtt_us = tcp_rtt_tsopt_us(tp, 1);
- seq_rtt_us = ca_rtt_us = delta_us;
- }
rs->rtt_us = ca_rtt_us; /* RTT of last (S)ACKed packet (or -1) */
if (seq_rtt_us < 0)
return false;
@@ -2984,7 +3325,7 @@ void tcp_rearm_rto(struct sock *sk)
/* If the retrans timer is currently being used by Fast Open
* for SYN-ACK retrans purpose, stay put.
*/
- if (tp->fastopen_rsk)
+ if (rcu_access_pointer(tp->fastopen_rsk))
return;
if (!tp->packets_out) {
@@ -3000,8 +3341,7 @@ void tcp_rearm_rto(struct sock *sk)
*/
rto = usecs_to_jiffies(max_t(int, delta_us, 1));
}
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
- TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, true);
}
}
@@ -3034,7 +3374,7 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
}
static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
- u32 prior_snd_una)
+ const struct sk_buff *ack_skb, u32 prior_snd_una)
{
const struct skb_shared_info *shinfo;
@@ -3046,7 +3386,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
if (!before(shinfo->tskey, prior_snd_una) &&
before(shinfo->tskey, tcp_sk(sk)->snd_una)) {
tcp_skb_tsorted_save(skb) {
- __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK);
+ __skb_tstamp_tx(skb, ack_skb, NULL, sk, SCM_TSTAMP_ACK);
} tcp_skb_tsorted_restore(skb);
}
}
@@ -3055,9 +3395,9 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
* is before the ack sequence we can discard it as it's confirmed to have
* arrived at the other end.
*/
-static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
- u32 prior_snd_una,
- struct tcp_sacktag_state *sack)
+static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
+ u32 prior_fack, u32 prior_snd_una,
+ struct tcp_sacktag_state *sack, bool ece_ack)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
u64 first_ackt, last_ackt;
@@ -3070,7 +3410,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
long seq_rtt_us = -1L;
long ca_rtt_us = -1L;
u32 pkts_acked = 0;
- u32 last_in_flight = 0;
bool rtt_update;
int flag = 0;
@@ -3082,8 +3421,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
u8 sacked = scb->sacked;
u32 acked_pcount;
- tcp_ack_tstamp(sk, skb, prior_snd_una);
-
/* Determine how many packets and what bytes were acked, tso and else */
if (after(scb->end_seq, tp->snd_una)) {
if (tcp_skb_pcount(skb) == 1 ||
@@ -3103,12 +3440,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
} else if (!(sacked & TCPCB_SACKED_ACKED)) {
- last_ackt = skb->skb_mstamp;
+ last_ackt = tcp_skb_timestamp_us(skb);
WARN_ON_ONCE(last_ackt == 0);
if (!first_ackt)
first_ackt = last_ackt;
- last_in_flight = TCP_SKB_CB(skb)->tx.in_flight;
if (before(start_seq, reord))
reord = start_seq;
if (!after(scb->end_seq, tp->high_seq))
@@ -3117,11 +3453,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
if (sacked & TCPCB_SACKED_ACKED) {
tp->sacked_out -= acked_pcount;
+ /* snd_una delta covers these skbs */
+ sack->delivered_bytes -= skb->len;
} else if (tcp_is_sack(tp)) {
- tp->delivered += acked_pcount;
+ tcp_count_delivered(tp, acked_pcount, ece_ack);
if (!tcp_skb_spurious_retrans(tp, skb))
tcp_rack_advance(tp, sacked, scb->end_seq,
- skb->skb_mstamp);
+ tcp_skb_timestamp_us(skb));
}
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
@@ -3147,11 +3485,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
if (!fully_acked)
break;
+ tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
+
next = skb_rb_next(skb);
if (unlikely(skb == tp->retransmit_skb_hint))
tp->retransmit_skb_hint = NULL;
- if (unlikely(skb == tp->lost_skb_hint))
- tp->lost_skb_hint = NULL;
+ tcp_highest_sack_replace(sk, skb, next);
tcp_rtx_queue_unlink_and_free(skb, sk);
}
@@ -3161,15 +3500,18 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
tp->snd_up = tp->snd_una;
- if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
- flag |= FLAG_SACK_RENEGING;
+ if (skb) {
+ tcp_ack_tstamp(sk, skb, ack_skb, prior_snd_una);
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+ flag |= FLAG_SACK_RENEGING;
+ }
if (likely(first_ackt) && !(flag & FLAG_RETRANS_DATA_ACKED)) {
seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt);
ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt);
- if (pkts_acked == 1 && last_in_flight < tp->mss_cache &&
- last_in_flight && !prior_sacked && fully_acked &&
+ if (pkts_acked == 1 && fully_acked && !prior_sacked &&
+ (tp->snd_una - prior_snd_una) < tp->mss_cache &&
sack->rate->prior_delivered + 1 == tp->delivered &&
!(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) {
/* Conservatively mark a delayed ACK. It's typically
@@ -3194,7 +3536,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
}
if (tcp_is_reno(tp)) {
- tcp_remove_reno_sacks(sk, pkts_acked);
+ tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);
/* If any of the cumulatively ACKed segments was
* retransmitted, non-SACK case cannot confirm that
@@ -3205,17 +3547,17 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
if (flag & FLAG_RETRANS_DATA_ACKED)
flag &= ~FLAG_ORIG_SACK_ACKED;
} else {
- int delta;
-
/* Non-retransmitted hole got filled? That's reordering */
if (before(reord, prior_fack))
tcp_check_sack_reordering(sk, reord, 0);
-
- delta = prior_sacked - tp->sacked_out;
- tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
}
+
+ sack->delivered_bytes = (skb ?
+ TCP_SKB_CB(skb)->seq : tp->snd_una) -
+ prior_snd_una;
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
- sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp)) {
+ sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
+ tcp_skb_timestamp_us(skb))) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
* after when the head was last (re)transmitted. Otherwise the
* timeout may continue to extend in loss recovery.
@@ -3225,9 +3567,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
if (icsk->icsk_ca_ops->pkts_acked) {
struct ack_sample sample = { .pkts_acked = pkts_acked,
- .rtt_us = sack->rate->rtt_us,
- .in_flight = last_in_flight };
+ .rtt_us = sack->rate->rtt_us };
+ sample.in_flight = tp->mss_cache *
+ (tp->delivered - sack->rate->prior_delivered);
icsk->icsk_ca_ops->pkts_acked(sk, &sample);
}
@@ -3268,15 +3611,16 @@ static void tcp_ack_probe(struct sock *sk)
return;
if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
icsk->icsk_backoff = 0;
+ icsk->icsk_probes_tstamp = 0;
inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
/* Socket must be waked up by subsequent tcp_data_snd_check().
* This function is not for random using!
*/
} else {
- unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
+ unsigned long when = tcp_probe0_when(sk, tcp_rto_max(sk));
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- when, TCP_RTO_MAX);
+ when = tcp_clamp_probe0_to_user_timeout(sk, when);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, when, true);
}
}
@@ -3295,7 +3639,8 @@ static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
* new SACK or ECE mark may first advance cwnd here and later reduce
* cwnd in tcp_fastretrans_alert() based on more states.
*/
- if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
+ if (tcp_sk(sk)->reordering >
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering))
return flag & FLAG_FORWARD_PROGRESS;
return flag & FLAG_DATA_ACKED;
@@ -3312,13 +3657,13 @@ static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
const struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ca_ops->cong_control) {
- icsk->icsk_ca_ops->cong_control(sk, rs);
+ icsk->icsk_ca_ops->cong_control(sk, ack, flag, rs);
return;
}
if (tcp_in_cwnd_reduction(sk)) {
/* Reduce cwnd if state mandates */
- tcp_cwnd_reduction(sk, acked_sacked, flag);
+ tcp_cwnd_reduction(sk, acked_sacked, rs->losses, flag);
} else if (tcp_may_raise_cwnd(sk, flag)) {
/* Advance cwnd if state allows */
tcp_cong_avoid(sk, ack, acked_sacked);
@@ -3335,7 +3680,24 @@ static inline bool tcp_may_update_window(const struct tcp_sock *tp,
{
return after(ack, tp->snd_una) ||
after(ack_seq, tp->snd_wl1) ||
- (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
+ (ack_seq == tp->snd_wl1 && (nwin > tp->snd_wnd || !nwin));
+}
+
+static void tcp_snd_sne_update(struct tcp_sock *tp, u32 ack)
+{
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_info *ao;
+
+ if (!static_branch_unlikely(&tcp_ao_needed.key))
+ return;
+
+ ao = rcu_dereference_protected(tp->ao_info,
+ lockdep_sock_is_held((struct sock *)tp));
+ if (ao && ack < tp->snd_una) {
+ ao->snd_sne++;
+ trace_tcp_ao_snd_sne_update((struct sock *)tp, ao->snd_sne);
+ }
+#endif
}
/* If we update tp->snd_una, also update tp->bytes_acked */
@@ -3345,9 +3707,27 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
sock_owned_by_me((struct sock *)tp);
tp->bytes_acked += delta;
+ tcp_snd_sne_update(tp, ack);
tp->snd_una = ack;
}
+static void tcp_rcv_sne_update(struct tcp_sock *tp, u32 seq)
+{
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_info *ao;
+
+ if (!static_branch_unlikely(&tcp_ao_needed.key))
+ return;
+
+ ao = rcu_dereference_protected(tp->ao_info,
+ lockdep_sock_is_held((struct sock *)tp));
+ if (ao && seq < tp->rcv_nxt) {
+ ao->rcv_sne++;
+ trace_tcp_ao_rcv_sne_update((struct sock *)tp, ao->rcv_sne);
+ }
+#endif
+}
+
/* If we update tp->rcv_nxt, also update tp->bytes_received */
static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
{
@@ -3355,7 +3735,8 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
sock_owned_by_me((struct sock *)tp);
tp->bytes_received += delta;
- tp->rcv_nxt = seq;
+ tcp_rcv_sne_update(tp, seq);
+ WRITE_ONCE(tp->rcv_nxt, seq);
}
/* Update our send window.
@@ -3404,16 +3785,23 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32
static bool __tcp_oow_rate_limited(struct net *net, int mib_idx,
u32 *last_oow_ack_time)
{
- if (*last_oow_ack_time) {
- s32 elapsed = (s32)(tcp_jiffies32 - *last_oow_ack_time);
+ /* Paired with the WRITE_ONCE() in this function. */
+ u32 val = READ_ONCE(*last_oow_ack_time);
+
+ if (val) {
+ s32 elapsed = (s32)(tcp_jiffies32 - val);
- if (0 <= elapsed && elapsed < net->ipv4.sysctl_tcp_invalid_ratelimit) {
+ if (0 <= elapsed &&
+ elapsed < READ_ONCE(net->ipv4.sysctl_tcp_invalid_ratelimit)) {
NET_INC_STATS(net, mib_idx);
return true; /* rate-limited: don't send yet! */
}
}
- *last_oow_ack_time = tcp_jiffies32;
+ /* Paired with the prior READ_ONCE() and with itself,
+ * as we might be lockless.
+ */
+ WRITE_ONCE(*last_oow_ack_time, tcp_jiffies32);
return false; /* not rate-limited: go ahead, send dupack now! */
}
@@ -3436,15 +3824,22 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
}
+static void tcp_send_ack_reflect_ect(struct sock *sk, bool accecn_reflector)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 flags = 0;
+
+ if (accecn_reflector)
+ flags = tcp_accecn_reflector_flags(tp->syn_ect_rcv);
+ __tcp_send_ack(sk, tp->rcv_nxt, flags);
+}
+
/* RFC 5961 7 [ACK Throttling] */
-static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
+static void tcp_send_challenge_ack(struct sock *sk, bool accecn_reflector)
{
- /* unprotected vars, we dont care of overwrites */
- static u32 challenge_timestamp;
- static unsigned int challenge_count;
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
- u32 count, now;
+ u32 count, now, ack_limit;
/* First check our per-socket dupack rate limit. */
if (__tcp_oow_rate_limited(net,
@@ -3452,20 +3847,25 @@ static void tcp_send_challenge_ack(struct sock *sk, const struct sk_buff *skb)
&tp->last_oow_ack_time))
return;
+ ack_limit = READ_ONCE(net->ipv4.sysctl_tcp_challenge_ack_limit);
+ if (ack_limit == INT_MAX)
+ goto send_ack;
+
/* Then check host-wide RFC 5961 rate limit. */
now = jiffies / HZ;
- if (now != challenge_timestamp) {
- u32 ack_limit = net->ipv4.sysctl_tcp_challenge_ack_limit;
+ if (now != READ_ONCE(net->ipv4.tcp_challenge_timestamp)) {
u32 half = (ack_limit + 1) >> 1;
- challenge_timestamp = now;
- WRITE_ONCE(challenge_count, half + prandom_u32_max(ack_limit));
+ WRITE_ONCE(net->ipv4.tcp_challenge_timestamp, now);
+ WRITE_ONCE(net->ipv4.tcp_challenge_count,
+ get_random_u32_inclusive(half, ack_limit + half - 1));
}
- count = READ_ONCE(challenge_count);
+ count = READ_ONCE(net->ipv4.tcp_challenge_count);
if (count > 0) {
- WRITE_ONCE(challenge_count, count - 1);
+ WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1);
+send_ack:
NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
- tcp_send_ack(sk);
+ tcp_send_ack_reflect_ect(sk, accecn_reflector);
}
}
@@ -3475,8 +3875,16 @@ static void tcp_store_ts_recent(struct tcp_sock *tp)
tp->rx_opt.ts_recent_stamp = ktime_get_seconds();
}
-static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
+static int __tcp_replace_ts_recent(struct tcp_sock *tp, s32 tstamp_delta)
{
+ tcp_store_ts_recent(tp);
+ return tstamp_delta > 0 ? FLAG_TS_PROGRESS : 0;
+}
+
+static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
+{
+ s32 delta;
+
if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
/* PAWS bug workaround wrt. ACK frames, the PAWS discard
* extra check below makes sure this can only happen
@@ -3485,15 +3893,17 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
* Not only, also it occurs for expired timestamps.
*/
- if (tcp_paws_check(&tp->rx_opt, 0))
- tcp_store_ts_recent(tp);
+ if (tcp_paws_check(&tp->rx_opt, 0)) {
+ delta = tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent;
+ return __tcp_replace_ts_recent(tp, delta);
+ }
}
+
+ return 0;
}
-/* This routine deals with acks during a TLP episode.
- * We mark the end of a TLP episode on receiving TLP dupack or when
- * ack is after tlp_high_seq.
- * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe.
+/* This routine deals with acks during a TLP episode and ends an episode by
+ * resetting tlp_high_seq. Ref: TLP algorithm in RFC8985
*/
static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
{
@@ -3502,7 +3912,10 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
if (before(ack, tp->tlp_high_seq))
return;
- if (flag & FLAG_DSACKING_ACK) {
+ if (!tp->tlp_retrans) {
+ /* TLP of new data has been acknowledged */
+ tp->tlp_high_seq = 0;
+ } else if (flag & FLAG_DSACK_TLP) {
/* This DSACK means original and TLP probe arrived; no loss */
tp->tlp_high_seq = 0;
} else if (after(ack, tp->tlp_high_seq)) {
@@ -3522,12 +3935,23 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag)
}
}
-static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
+static void tcp_in_ack_event(struct sock *sk, int flag)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- if (icsk->icsk_ca_ops->in_ack_event)
- icsk->icsk_ca_ops->in_ack_event(sk, flags);
+ if (icsk->icsk_ca_ops->in_ack_event) {
+ u32 ack_ev_flags = 0;
+
+ if (flag & FLAG_WIN_UPDATE)
+ ack_ev_flags |= CA_ACK_WIN_UPDATE;
+ if (flag & FLAG_SLOWPATH) {
+ ack_ev_flags |= CA_ACK_SLOWPATH;
+ if (flag & FLAG_ECE)
+ ack_ev_flags |= CA_ACK_ECE;
+ }
+
+ icsk->icsk_ca_ops->in_ack_event(sk, ack_ev_flags);
+ }
}
/* Congestion control has updated the cwnd already. So if we're in
@@ -3538,10 +3962,10 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (rexmit == REXMIT_NONE)
+ if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT)
return;
- if (unlikely(rexmit == 2)) {
+ if (unlikely(rexmit == REXMIT_NEW)) {
__tcp_push_pending_frames(sk, tcp_current_mss(sk),
TCP_NAGLE_OFF);
if (after(tp->snd_nxt, tp->high_seq))
@@ -3552,7 +3976,8 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
}
/* Returns the number of packets newly acked or sacked by the current ACK */
-static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
+static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered,
+ u32 ecn_count, int flag)
{
const struct net *net = sock_net(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -3560,10 +3985,13 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
delivered = tp->delivered - prior_delivered;
NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
+
if (flag & FLAG_ECE) {
- tp->delivered_ce += delivered;
- NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
+ if (tcp_ecn_mode_rfc3168(tp))
+ ecn_count = delivered;
+ NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count);
}
+
return delivered;
}
@@ -3578,15 +4006,18 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
bool is_sack_reneg = tp->is_sack_reneg;
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
- bool is_dupack = false;
+ int num_dupack = 0;
int prior_packets = tp->packets_out;
u32 delivered = tp->delivered;
u32 lost = tp->lost;
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+ u32 ecn_count = 0; /* Did we receive ECE/an AccECN ACE update? */
u32 prior_fack;
sack_state.first_sackt = 0;
sack_state.rate = &rs;
+ sack_state.sack_delivered = 0;
+ sack_state.delivered_bytes = 0;
/* We very likely will need to access rtx queue. */
prefetch(sk->tcp_rtx_queue.rb_node);
@@ -3595,11 +4026,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* then we can probably ignore it.
*/
if (before(ack, prior_snd_una)) {
+ u32 max_window;
+
+ /* do not accept ACK for bytes we never sent. */
+ max_window = min_t(u64, tp->max_window, tp->bytes_acked);
/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
- if (before(ack, prior_snd_una - tp->max_window)) {
+ if (before(ack, prior_snd_una - max_window)) {
if (!(flag & FLAG_NO_CHALLENGE_ACK))
- tcp_send_challenge_ack(sk, skb);
- return -1;
+ tcp_send_challenge_ack(sk, false);
+ return -SKB_DROP_REASON_TCP_TOO_OLD_ACK;
}
goto old_ack;
}
@@ -3608,16 +4043,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* this segment (RFC793 Section 3.9).
*/
if (after(ack, tp->snd_nxt))
- goto invalid_ack;
+ return -SKB_DROP_REASON_TCP_ACK_UNSENT_DATA;
if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
- icsk->icsk_retransmits = 0;
+ WRITE_ONCE(icsk->icsk_retransmits, 0);
#if IS_ENABLED(CONFIG_TLS_DEVICE)
- if (static_branch_unlikely(&clean_acked_data_enabled))
- if (icsk->icsk_clean_acked)
- icsk->icsk_clean_acked(sk, ack);
+ if (static_branch_unlikely(&clean_acked_data_enabled.key))
+ if (tp->tcp_clean_acked)
+ tp->tcp_clean_acked(sk, ack);
#endif
}
@@ -3628,9 +4063,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
* is in window.
*/
if (flag & FLAG_UPDATE_TS_RECENT)
- tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+ flag |= tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
- if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+ if ((flag & (FLAG_SLOWPATH | FLAG_SND_UNA_ADVANCED)) ==
+ FLAG_SND_UNA_ADVANCED) {
/* Window is constant, pure forward advance.
* No more checks are required.
* Note, we use the fact that SND.UNA>=SND.WL2.
@@ -3639,12 +4075,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_snd_una_update(tp, ack);
flag |= FLAG_WIN_UPDATE;
- tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE);
-
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS);
} else {
- u32 ack_ev_flags = CA_ACK_SLOWPATH;
-
if (ack_seq != TCP_SKB_CB(skb)->end_seq)
flag |= FLAG_DATA;
else
@@ -3656,47 +4088,71 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
- if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) {
+ if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb)))
flag |= FLAG_ECE;
- ack_ev_flags |= CA_ACK_ECE;
- }
- if (flag & FLAG_WIN_UPDATE)
- ack_ev_flags |= CA_ACK_WIN_UPDATE;
-
- tcp_in_ack_event(sk, ack_ev_flags);
+ if (sack_state.sack_delivered)
+ tcp_count_delivered(tp, sack_state.sack_delivered,
+ flag & FLAG_ECE);
}
+ /* This is a deviation from RFC3168 since it states that:
+ * "When the TCP data sender is ready to set the CWR bit after reducing
+ * the congestion window, it SHOULD set the CWR bit only on the first
+ * new data packet that it transmits."
+ * We accept CWR on pure ACKs to be more robust
+ * with widely-deployed TCP implementations that do this.
+ */
+ tcp_ecn_accept_cwr(sk, skb);
+
/* We passed data and got it acked, remove any soft error
* log. Something worked...
*/
- sk->sk_err_soft = 0;
- icsk->icsk_probes_out = 0;
+ if (READ_ONCE(sk->sk_err_soft))
+ WRITE_ONCE(sk->sk_err_soft, 0);
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
tp->rcv_tstamp = tcp_jiffies32;
if (!prior_packets)
goto no_queue;
/* See if we can take anything off of the retransmit queue. */
- flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);
+ flag |= tcp_clean_rtx_queue(sk, skb, prior_fack, prior_snd_una,
+ &sack_state, flag & FLAG_ECE);
tcp_rack_update_reo_wnd(sk, &rs);
+ if (tcp_ecn_mode_accecn(tp))
+ ecn_count = tcp_accecn_process(sk, skb,
+ tp->delivered - delivered,
+ sack_state.delivered_bytes,
+ &flag);
+
+ tcp_in_ack_event(sk, flag);
+
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
- /* If needed, reset TLP/RTO timer; RACK may later override this. */
- if (flag & FLAG_SET_XMIT_TIMER)
- tcp_set_xmit_timer(sk);
if (tcp_ack_is_dubious(sk, flag)) {
- is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ if (!(flag & (FLAG_SND_UNA_ADVANCED |
+ FLAG_NOT_DUP | FLAG_DSACKING_ACK))) {
+ num_dupack = 1;
+ /* Consider if pure acks were aggregated in tcp_add_backlog() */
+ if (!(flag & FLAG_DATA))
+ num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+ }
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
}
+ /* If needed, reset TLP/RTO timer when RACK doesn't set it. */
+ if (flag & FLAG_SET_XMIT_TIMER)
+ tcp_set_xmit_timer(sk);
+
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
sk_dst_confirm(sk);
- delivered = tcp_newly_delivered(sk, delivered, flag);
+ delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag);
+
lost = tp->lost - lost; /* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
@@ -3705,11 +4161,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
return 1;
no_queue:
+ if (tcp_ecn_mode_accecn(tp))
+ ecn_count = tcp_accecn_process(sk, skb,
+ tp->delivered - delivered,
+ sack_state.delivered_bytes,
+ &flag);
+ tcp_in_ack_event(sk, flag);
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK) {
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
- tcp_newly_delivered(sk, delivered, flag);
+ tcp_newly_delivered(sk, delivered, ecn_count, flag);
}
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
@@ -3721,10 +4183,6 @@ no_queue:
tcp_process_tlp_ack(sk, ack, flag);
return 1;
-invalid_ack:
- SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
- return -1;
-
old_ack:
/* If data was SACKed, tag it and see if we should send more data.
* If data was DSACKed, see if we can undo a cwnd reduction.
@@ -3732,13 +4190,12 @@ old_ack:
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
- tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
+ tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
- tcp_newly_delivered(sk, delivered, flag);
+ tcp_newly_delivered(sk, delivered, ecn_count, flag);
tcp_xmit_recovery(sk, rexmit);
}
- SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
return 0;
}
@@ -3759,7 +4216,7 @@ static void tcp_parse_fastopen_option(int len, const unsigned char *cookie,
foc->exp = exp_opt;
}
-static void smc_parse_options(const struct tcphdr *th,
+static bool smc_parse_options(const struct tcphdr *th,
struct tcp_options_received *opt_rx,
const unsigned char *ptr,
int opsize)
@@ -3768,10 +4225,56 @@ static void smc_parse_options(const struct tcphdr *th,
if (static_branch_unlikely(&tcp_have_smc)) {
if (th->syn && !(opsize & 1) &&
opsize >= TCPOLEN_EXP_SMC_BASE &&
- get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC)
+ get_unaligned_be32(ptr) == TCPOPT_SMC_MAGIC) {
opt_rx->smc_ok = 1;
+ return true;
+ }
}
#endif
+ return false;
+}
+
+/* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
+ * value on success.
+ */
+u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss)
+{
+ const unsigned char *ptr = (const unsigned char *)(th + 1);
+ int length = (th->doff * 4) - sizeof(struct tcphdr);
+ u16 mss = 0;
+
+ while (length > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return mss;
+ case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */
+ length--;
+ continue;
+ default:
+ if (length < 2)
+ return mss;
+ opsize = *ptr++;
+ if (opsize < 2) /* "silly options" */
+ return mss;
+ if (opsize > length)
+ return mss; /* fail on partial options */
+ if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS) {
+ u16 in_mss = get_unaligned_be16(ptr);
+
+ if (in_mss) {
+ if (user_mss && user_mss < in_mss)
+ in_mss = user_mss;
+ mss = in_mss;
+ }
+ }
+ ptr += opsize - 2;
+ length -= opsize;
+ }
+ }
+ return mss;
}
/* Look for tcp options. Normally only called on SYN and SYNACK packets.
@@ -3789,6 +4292,8 @@ void tcp_parse_options(const struct net *net,
ptr = (const unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
+ opt_rx->accecn = 0;
+ opt_rx->saw_unknown = 0;
while (length > 0) {
int opcode = *ptr++;
@@ -3801,6 +4306,8 @@ void tcp_parse_options(const struct net *net,
length--;
continue;
default:
+ if (length < 2)
+ return;
opsize = *ptr++;
if (opsize < 2) /* "silly options" */
return;
@@ -3820,7 +4327,7 @@ void tcp_parse_options(const struct net *net,
break;
case TCPOPT_WINDOW:
if (opsize == TCPOLEN_WINDOW && th->syn &&
- !estab && net->ipv4.sysctl_tcp_window_scaling) {
+ !estab && READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) {
__u8 snd_wscale = *(__u8 *)ptr;
opt_rx->wscale_ok = 1;
if (snd_wscale > TCP_MAX_WSCALE) {
@@ -3836,7 +4343,7 @@ void tcp_parse_options(const struct net *net,
case TCPOPT_TIMESTAMP:
if ((opsize == TCPOLEN_TIMESTAMP) &&
((estab && opt_rx->tstamp_ok) ||
- (!estab && net->ipv4.sysctl_tcp_timestamps))) {
+ (!estab && READ_ONCE(net->ipv4.sysctl_tcp_timestamps)))) {
opt_rx->saw_tstamp = 1;
opt_rx->rcv_tsval = get_unaligned_be32(ptr);
opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
@@ -3844,7 +4351,7 @@ void tcp_parse_options(const struct net *net,
break;
case TCPOPT_SACK_PERM:
if (opsize == TCPOLEN_SACK_PERM && th->syn &&
- !estab && net->ipv4.sysctl_tcp_sack) {
+ !estab && READ_ONCE(net->ipv4.sysctl_tcp_sack)) {
opt_rx->sack_ok = TCP_SACK_SEEN;
tcp_sack_reset(opt_rx);
}
@@ -3859,9 +4366,15 @@ void tcp_parse_options(const struct net *net,
break;
#ifdef CONFIG_TCP_MD5SIG
case TCPOPT_MD5SIG:
- /*
- * The MD5 Hash has already been
- * checked (see tcp_v{4,6}_do_rcv()).
+ /* The MD5 Hash has already been
+ * checked (see tcp_v{4,6}_rcv()).
+ */
+ break;
+#endif
+#ifdef CONFIG_TCP_AO
+ case TCPOPT_AO:
+ /* TCP AO has already been checked
+ * (see tcp_inbound_ao_hash()).
*/
break;
#endif
@@ -3871,21 +4384,33 @@ void tcp_parse_options(const struct net *net,
ptr, th->syn, foc, false);
break;
+ case TCPOPT_ACCECN0:
+ case TCPOPT_ACCECN1:
+ /* Save offset of AccECN option in TCP header */
+ opt_rx->accecn = (ptr - 2) - (__u8 *)th;
+ break;
+
case TCPOPT_EXP:
/* Fast Open option shares code 254 using a
* 16 bits magic number.
*/
if (opsize >= TCPOLEN_EXP_FASTOPEN_BASE &&
get_unaligned_be16(ptr) ==
- TCPOPT_FASTOPEN_MAGIC)
+ TCPOPT_FASTOPEN_MAGIC) {
tcp_parse_fastopen_option(opsize -
TCPOLEN_EXP_FASTOPEN_BASE,
ptr + 2, th->syn, foc, true);
- else
- smc_parse_options(th, opt_rx, ptr,
- opsize);
+ break;
+ }
+
+ if (smc_parse_options(th, opt_rx, ptr, opsize))
+ break;
+
+ opt_rx->saw_unknown = 1;
break;
+ default:
+ opt_rx->saw_unknown = 1;
}
ptr += opsize-2;
length -= opsize;
@@ -3925,11 +4450,14 @@ static bool tcp_fast_parse_options(const struct net *net,
*/
if (th->doff == (sizeof(*th) / 4)) {
tp->rx_opt.saw_tstamp = 0;
+ tp->rx_opt.accecn = 0;
return false;
} else if (tp->rx_opt.tstamp_ok &&
th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
- if (tcp_parse_aligned_timestamp(tp, th))
+ if (tcp_parse_aligned_timestamp(tp, th)) {
+ tp->rx_opt.accecn = 0;
return true;
+ }
}
tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
@@ -3939,39 +4467,58 @@ static bool tcp_fast_parse_options(const struct net *net,
return true;
}
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
/*
- * Parse MD5 Signature option
+ * Parse Signature options
*/
-const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
+int tcp_do_parse_auth_options(const struct tcphdr *th,
+ const u8 **md5_hash, const u8 **ao_hash)
{
int length = (th->doff << 2) - sizeof(*th);
const u8 *ptr = (const u8 *)(th + 1);
+ unsigned int minlen = TCPOLEN_MD5SIG;
+
+ if (IS_ENABLED(CONFIG_TCP_AO))
+ minlen = sizeof(struct tcp_ao_hdr) + 1;
+
+ *md5_hash = NULL;
+ *ao_hash = NULL;
/* If not enough data remaining, we can short cut */
- while (length >= TCPOLEN_MD5SIG) {
+ while (length >= minlen) {
int opcode = *ptr++;
int opsize;
switch (opcode) {
case TCPOPT_EOL:
- return NULL;
+ return 0;
case TCPOPT_NOP:
length--;
continue;
default:
opsize = *ptr++;
if (opsize < 2 || opsize > length)
- return NULL;
- if (opcode == TCPOPT_MD5SIG)
- return opsize == TCPOLEN_MD5SIG ? ptr : NULL;
+ return -EINVAL;
+ if (opcode == TCPOPT_MD5SIG) {
+ if (opsize != TCPOLEN_MD5SIG)
+ return -EINVAL;
+ if (unlikely(*md5_hash || *ao_hash))
+ return -EEXIST;
+ *md5_hash = ptr;
+ } else if (opcode == TCPOPT_AO) {
+ if (opsize <= sizeof(struct tcp_ao_hdr))
+ return -EINVAL;
+ if (unlikely(*md5_hash || *ao_hash))
+ return -EEXIST;
+ *ao_hash = ptr;
+ }
}
ptr += opsize - 2;
length -= opsize;
}
- return NULL;
+ return 0;
}
-EXPORT_SYMBOL(tcp_parse_md5sig_option);
+EXPORT_SYMBOL(tcp_do_parse_auth_options);
#endif
/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
@@ -3997,33 +4544,57 @@ EXPORT_SYMBOL(tcp_parse_md5sig_option);
* up to bandwidth of 18Gigabit/sec. 8) ]
*/
-static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
+/* Estimates max number of increments of remote peer TSval in
+ * a replay window (based on our current RTO estimation).
+ */
+static u32 tcp_tsval_replay(const struct sock *sk)
+{
+ /* If we use usec TS resolution,
+ * then expect the remote peer to use the same resolution.
+ */
+ if (tcp_sk(sk)->tcp_usec_ts)
+ return inet_csk(sk)->icsk_rto * (USEC_PER_SEC / HZ);
+
+ /* RFC 7323 recommends a TSval clock between 1ms and 1sec.
+ * We know that some OS (including old linux) can use 1200 Hz.
+ */
+ return inet_csk(sk)->icsk_rto * 1200 / HZ;
+}
+
+static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk,
+ const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct tcphdr *th = tcp_hdr(skb);
- u32 seq = TCP_SKB_CB(skb)->seq;
+ SKB_DR_INIT(reason, TCP_RFC7323_PAWS);
u32 ack = TCP_SKB_CB(skb)->ack_seq;
+ u32 seq = TCP_SKB_CB(skb)->seq;
- return (/* 1. Pure ACK with correct sequence number. */
- (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
+ /* 1. Is this not a pure ACK ? */
+ if (!th->ack || seq != TCP_SKB_CB(skb)->end_seq)
+ return reason;
- /* 2. ... and duplicate ACK. */
- ack == tp->snd_una &&
+ /* 2. Is its sequence not the expected one ? */
+ if (seq != tp->rcv_nxt)
+ return before(seq, tp->rcv_nxt) ?
+ SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK :
+ reason;
- /* 3. ... and does not update window. */
- !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
+ /* 3. Is this not a duplicate ACK ? */
+ if (ack != tp->snd_una)
+ return reason;
- /* 4. ... and sits in replay window. */
- (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ);
-}
+ /* 4. Is this updating the window ? */
+ if (tcp_may_update_window(tp, ack, seq, ntohs(th->window) <<
+ tp->rx_opt.snd_wscale))
+ return reason;
-static inline bool tcp_paws_discard(const struct sock *sk,
- const struct sk_buff *skb)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
+ /* 5. Is this not in the replay window ? */
+ if ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) >
+ tcp_tsval_replay(sk))
+ return reason;
- return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
- !tcp_disordered_ack(sk, skb);
+ return 0;
}
/* Check segment sequence number for validity.
@@ -4039,38 +4610,68 @@ static inline bool tcp_paws_discard(const struct sock *sk,
* (borrowed from freebsd)
*/
-static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq)
+static enum skb_drop_reason tcp_sequence(const struct sock *sk,
+ u32 seq, u32 end_seq)
{
- return !before(end_seq, tp->rcv_wup) &&
- !after(seq, tp->rcv_nxt + tcp_receive_window(tp));
+ const struct tcp_sock *tp = tcp_sk(sk);
+
+ if (before(end_seq, tp->rcv_wup))
+ return SKB_DROP_REASON_TCP_OLD_SEQUENCE;
+
+ if (after(end_seq, tp->rcv_nxt + tcp_receive_window(tp))) {
+ if (after(seq, tp->rcv_nxt + tcp_receive_window(tp)))
+ return SKB_DROP_REASON_TCP_INVALID_SEQUENCE;
+
+ /* Only accept this packet if receive queue is empty. */
+ if (skb_queue_len(&sk->sk_receive_queue))
+ return SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE;
+ }
+
+ return SKB_NOT_DROPPED_YET;
+}
+
+
+void tcp_done_with_error(struct sock *sk, int err)
+{
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ WRITE_ONCE(sk->sk_err, err);
+ smp_wmb();
+
+ tcp_write_queue_purge(sk);
+ tcp_done(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk_error_report(sk);
}
+EXPORT_IPV6_MOD(tcp_done_with_error);
/* When we get a reset we do this. */
-void tcp_reset(struct sock *sk)
+void tcp_reset(struct sock *sk, struct sk_buff *skb)
{
+ int err;
+
trace_tcp_receive_reset(sk);
+ /* mptcp can't tell us to ignore reset pkts,
+ * so just ignore the return value of mptcp_incoming_options().
+ */
+ if (sk_is_mptcp(sk))
+ mptcp_incoming_options(sk, skb);
+
/* We want the right error as BSD sees it (and indeed as we do). */
switch (sk->sk_state) {
case TCP_SYN_SENT:
- sk->sk_err = ECONNREFUSED;
+ err = ECONNREFUSED;
break;
case TCP_CLOSE_WAIT:
- sk->sk_err = EPIPE;
+ err = EPIPE;
break;
case TCP_CLOSE:
return;
default:
- sk->sk_err = ECONNRESET;
+ err = ECONNRESET;
}
- /* This barrier is coupled with smp_rmb() in tcp_poll() */
- smp_wmb();
-
- tcp_write_queue_purge(sk);
- tcp_done(sk);
-
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_error_report(sk);
+ tcp_done_with_error(sk, err);
}
/*
@@ -4093,7 +4694,7 @@ void tcp_fin(struct sock *sk)
inet_csk_schedule_ack(sk);
- sk->sk_shutdown |= RCV_SHUTDOWN;
+ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN);
sock_set_flag(sk, SOCK_DONE);
switch (sk->sk_state) {
@@ -4101,7 +4702,7 @@ void tcp_fin(struct sock *sk)
case TCP_ESTABLISHED:
/* Move to CLOSE_WAIT */
tcp_set_state(sk, TCP_CLOSE_WAIT);
- inet_csk(sk)->icsk_ack.pingpong = 1;
+ inet_csk_enter_pingpong_mode(sk);
break;
case TCP_CLOSE_WAIT:
@@ -4142,7 +4743,6 @@ void tcp_fin(struct sock *sk)
skb_rbtree_purge(&tp->out_of_order_queue);
if (tcp_is_sack(tp))
tcp_sack_reset(&tp->rx_opt);
- sk_mem_reclaim(sk);
if (!sock_flag(sk, SOCK_DEAD)) {
sk->sk_state_change(sk);
@@ -4173,7 +4773,7 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
+ if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
int mib_idx;
if (before(seq, tp->rcv_nxt))
@@ -4199,6 +4799,29 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
}
+static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
+{
+ /* When the ACK path fails or drops most ACKs, the sender would
+ * timeout and spuriously retransmit the same segment repeatedly.
+ * If it seems our ACKs are not reaching the other side,
+ * based on receiving a duplicate data segment with new flowlabel
+ * (suggesting the sender suffered an RTO), and we are not already
+ * repathing due to our own RTO, then rehash the socket to repath our
+ * packets.
+ */
+#if IS_ENABLED(CONFIG_IPV6)
+ if (inet_csk(sk)->icsk_ca_state != TCP_CA_Loss &&
+ skb->protocol == htons(ETH_P_IPV6) &&
+ (tcp_sk(sk)->inet_conn.icsk_ack.lrcv_flowlabel !=
+ ntohl(ip6_flowlabel(ipv6_hdr(skb)))) &&
+ sk_rethink_txhash(sk))
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
+
+ /* Save last flowlabel after a spurious retrans. */
+ tcp_save_lrcv_flowlabel(sk, skb);
+#endif
+}
+
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4208,9 +4831,10 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
- if (tcp_is_sack(tp) && sock_net(sk)->ipv4.sysctl_tcp_dsack) {
+ if (tcp_is_sack(tp) && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_dsack)) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+ tcp_rcv_spurious_retrans(sk, skb);
if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
end_seq = tp->rcv_nxt;
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, end_seq);
@@ -4244,10 +4868,38 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
sp[i] = sp[i + 1];
continue;
}
- this_sack++, swalk++;
+ this_sack++;
+ swalk++;
}
}
+void tcp_sack_compress_send_ack(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tp->compressed_ack)
+ return;
+
+ if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
+ __sock_put(sk);
+
+ /* Since we have to send one ack finally,
+ * subtract one from tp->compressed_ack to keep
+ * LINUX_MIB_TCPACKCOMPRESSED accurate.
+ */
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED,
+ tp->compressed_ack - 1);
+
+ tp->compressed_ack = 0;
+ tcp_send_ack(sk);
+}
+
+/* Reasonable amount of sack blocks included in TCP SACK option
+ * The max is 4, but this becomes 3 if TCP timestamps are there.
+ * Given that SACK packets might be lost, be conservative and use 2.
+ */
+#define TCP_SACK_BLOCKS_EXPECTED 2
+
static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4260,6 +4912,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) {
if (tcp_sack_extend(sp, seq, end_seq)) {
+ if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
+ tcp_sack_compress_send_ack(sk);
/* Rotate this_sack to the first one. */
for (; this_sack > 0; this_sack--, sp--)
swap(*sp, *(sp - 1));
@@ -4269,6 +4923,9 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
}
}
+ if (this_sack >= TCP_SACK_BLOCKS_EXPECTED)
+ tcp_sack_compress_send_ack(sk);
+
/* Could not find an adjacent existing SACK, build a new one,
* put it at the front, and shift everyone else down. We
* always know there is at least one SACK present already here.
@@ -4276,8 +4933,6 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
* If the sack array is full, forget about the last one.
*/
if (this_sack >= TCP_NUM_SACKS) {
- if (tp->compressed_ack)
- tcp_send_ack(sk);
this_sack--;
tp->rx_opt.num_sacks--;
sp--;
@@ -4329,7 +4984,6 @@ static void tcp_sack_remove(struct tcp_sock *tp)
/**
* tcp_try_coalesce - try to merge skb to prior one
* @sk: socket
- * @dest: destination queue
* @to: prior buffer
* @from: buffer to add in queue
* @fragstolen: pointer to boolean
@@ -4353,10 +5007,8 @@ static bool tcp_try_coalesce(struct sock *sk,
if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
return false;
-#ifdef CONFIG_TLS_DEVICE
- if (from->decrypted != to->decrypted)
+ if (!tcp_skb_can_collapse_rx(to, from))
return false;
-#endif
if (!skb_try_coalesce(to, from, fragstolen, &delta))
return false;
@@ -4371,6 +5023,7 @@ static bool tcp_try_coalesce(struct sock *sk,
if (TCP_SKB_CB(from)->has_rxtstamp) {
TCP_SKB_CB(to)->has_rxtstamp = true;
to->tstamp = from->tstamp;
+ skb_hwtstamps(to)->hwtstamp = skb_hwtstamps(from)->hwtstamp;
}
return true;
@@ -4383,7 +5036,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk,
{
bool res = tcp_try_coalesce(sk, to, from, fragstolen);
- /* In case tcp_drop() is called later, update to->gso_segs */
+ /* In case tcp_drop_reason() is called later, update to->gso_segs */
if (res) {
u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
max_t(u16, 1, skb_shinfo(from)->gso_segs);
@@ -4393,10 +5046,11 @@ static bool tcp_ooo_try_coalesce(struct sock *sk,
return res;
}
-static void tcp_drop(struct sock *sk, struct sk_buff *skb)
+noinline_for_tracing static void
+tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
{
- sk_drops_add(sk, skb);
- __kfree_skb(skb);
+ sk_drops_skbadd(sk, skb);
+ sk_skb_reason_drop(sk, skb, reason);
}
/* This one checks to see if we can put data from the
@@ -4418,28 +5072,25 @@ static void tcp_ofo_queue(struct sock *sk)
if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
__u32 dsack = dsack_high;
+
if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
- dsack_high = TCP_SKB_CB(skb)->end_seq;
+ dsack = TCP_SKB_CB(skb)->end_seq;
tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack);
}
p = rb_next(p);
rb_erase(&skb->rbnode, &tp->out_of_order_queue);
if (unlikely(!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))) {
- SOCK_DEBUG(sk, "ofo packet was already received\n");
- tcp_drop(sk, skb);
+ tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_DROP);
continue;
}
- SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->end_seq);
tail = skb_peek_tail(&sk->sk_receive_queue);
eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
if (!eaten)
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+ tcp_add_receive_queue(sk, skb);
else
kfree_skb_partial(skb, fragstolen);
@@ -4453,20 +5104,41 @@ static void tcp_ofo_queue(struct sock *sk)
}
}
-static bool tcp_prune_ofo_queue(struct sock *sk);
-static int tcp_prune_queue(struct sock *sk);
+static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb);
+static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
-static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
+/* Check if this incoming skb can be added to socket receive queues
+ * while satisfying sk->sk_rcvbuf limit.
+ *
+ * In theory we should use skb->truesize, but this can cause problems
+ * when applications use too small SO_RCVBUF values.
+ * When LRO / hw gro is used, the socket might have a high tp->scaling_ratio,
+ * allowing RWIN to be close to available space.
+ * Whenever the receive queue gets full, we can receive a small packet
+ * filling RWIN, but with a high skb->truesize, because most NICs use a 4K page
+ * plus sk_buff metadata even when receiving less than 1500 bytes of payload.
+ *
+ * Note that we use skb->len to decide to accept or drop this packet,
+ * but sk->sk_rmem_alloc is the sum of all skb->truesize.
+ */
+static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb)
+{
+ unsigned int rmem = atomic_read(&sk->sk_rmem_alloc);
+
+ return rmem + skb->len <= sk->sk_rcvbuf;
+}
+
+static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb,
unsigned int size)
{
- if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+ if (!tcp_can_ingest(sk, skb) ||
!sk_rmem_schedule(sk, skb, size)) {
- if (tcp_prune_queue(sk) < 0)
+ if (tcp_prune_queue(sk, skb) < 0)
return -1;
while (!sk_rmem_schedule(sk, skb, size)) {
- if (!tcp_prune_ofo_queue(sk))
+ if (!tcp_prune_ofo_queue(sk, skb))
return -1;
}
}
@@ -4481,23 +5153,25 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
u32 seq, end_seq;
bool fragstolen;
- tcp_ecn_check_ce(sk, skb);
+ tcp_save_lrcv_flowlabel(sk, skb);
+ tcp_data_ecn_check(sk, skb);
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFODROP);
- tcp_drop(sk, skb);
+ sk->sk_data_ready(sk);
+ tcp_drop_reason(sk, skb, SKB_DROP_REASON_PROTO_MEM);
return;
}
+ tcp_measure_rcv_mss(sk, skb);
/* Disable header prediction. */
tp->pred_flags = 0;
inet_csk_schedule_ack(sk);
+ tp->rcv_ooopack += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE);
seq = TCP_SKB_CB(skb)->seq;
end_seq = TCP_SKB_CB(skb)->end_seq;
- SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, seq, end_seq);
p = &tp->out_of_order_queue.rb_node;
if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) {
@@ -4519,7 +5193,11 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
skb, &fragstolen)) {
coalesce_done:
- tcp_grow_window(sk, skb);
+ /* For non sack flows, do not grow window to force DUPACK
+ * and trigger fast retransmit.
+ */
+ if (tcp_is_sack(tp))
+ tcp_grow_window(sk, skb, true);
kfree_skb_partial(skb, fragstolen);
skb = NULL;
goto add_sack;
@@ -4545,7 +5223,8 @@ coalesce_done:
/* All the bits are present. Drop. */
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPOFOMERGE);
- tcp_drop(sk, skb);
+ tcp_drop_reason(sk, skb,
+ SKB_DROP_REASON_TCP_OFOMERGE);
skb = NULL;
tcp_dsack_set(sk, seq, end_seq);
goto add_sack;
@@ -4564,7 +5243,8 @@ coalesce_done:
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPOFOMERGE);
- tcp_drop(sk, skb1);
+ tcp_drop_reason(sk, skb1,
+ SKB_DROP_REASON_TCP_OFOMERGE);
goto merge_right;
}
} else if (tcp_ooo_try_coalesce(sk, skb1,
@@ -4592,7 +5272,7 @@ merge_right:
tcp_dsack_extend(sk, TCP_SKB_CB(skb1)->seq,
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOMERGE);
- tcp_drop(sk, skb1);
+ tcp_drop_reason(sk, skb1, SKB_DROP_REASON_TCP_OFOMERGE);
}
/* If there is no skb after us, we are the last_skb ! */
if (!skb1)
@@ -4603,25 +5283,31 @@ add_sack:
tcp_sack_new_ofo_skb(sk, seq, end_seq);
end:
if (skb) {
- tcp_grow_window(sk, skb);
+ /* For non sack flows, do not grow window to force DUPACK
+ * and trigger fast retransmit.
+ */
+ if (tcp_is_sack(tp))
+ tcp_grow_window(sk, skb, false);
skb_condense(skb);
skb_set_owner_r(skb, sk);
}
+ /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
+ if (sk->sk_socket)
+ tcp_rcvbuf_grow(sk, tp->rcvq_space.space);
}
-static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
- bool *fragstolen)
+static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,
+ bool *fragstolen)
{
int eaten;
struct sk_buff *tail = skb_peek_tail(&sk->sk_receive_queue);
- __skb_pull(skb, hdrlen);
eaten = (tail &&
tcp_try_coalesce(sk, tail,
skb, fragstolen)) ? 1 : 0;
tcp_rcv_nxt_update(tcp_sk(sk), TCP_SKB_CB(skb)->end_seq);
if (!eaten) {
- __skb_queue_tail(&sk->sk_receive_queue, skb);
+ tcp_add_receive_queue(sk, skb);
skb_set_owner_r(skb, sk);
}
return eaten;
@@ -4666,7 +5352,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size;
TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1;
- if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) {
+ if (tcp_queue_rcv(sk, skb, &fragstolen)) {
WARN_ON_ONCE(fragstolen); /* should not happen */
__kfree_skb(skb);
}
@@ -4681,30 +5367,33 @@ err:
void tcp_data_ready(struct sock *sk)
{
- const struct tcp_sock *tp = tcp_sk(sk);
- int avail = tp->rcv_nxt - tp->copied_seq;
-
- if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE))
- return;
-
- sk->sk_data_ready(sk);
+ if (tcp_epollin_ready(sk, sk->sk_rcvlowat) || sock_flag(sk, SOCK_DONE))
+ sk->sk_data_ready(sk);
}
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ enum skb_drop_reason reason;
bool fragstolen;
int eaten;
+ /* If a subflow has been reset, the packet should not continue
+ * to be processed, drop the packet.
+ */
+ if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb)) {
+ __kfree_skb(skb);
+ return;
+ }
+
if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
__kfree_skb(skb);
return;
}
- skb_dst_drop(skb);
+ tcp_cleanup_skb(skb);
__skb_pull(skb, tcp_hdr(skb)->doff * 4);
- tcp_ecn_accept_cwr(sk, skb);
-
+ reason = SKB_DROP_REASON_NOT_SPECIFIED;
tp->rx_opt.dsack = 0;
/* Queue data for delivery to the user.
@@ -4713,20 +5402,39 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
*/
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
if (tcp_receive_window(tp) == 0) {
+ /* Some stacks are known to send bare FIN packets
+ * in a loop even if we send RWIN 0 in our ACK.
+ * Accepting this FIN does not hurt memory pressure
+ * because the FIN flag will simply be merged to the
+ * receive queue tail skb in most cases.
+ */
+ if (!skb->len &&
+ (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
+ goto queue_and_out;
+
+ reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
}
/* Ok. In sequence. In window. */
queue_and_out:
- if (skb_queue_len(&sk->sk_receive_queue) == 0)
+ if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
+ /* TODO: maybe ratelimit these WIN 0 ACK ? */
+ inet_csk(sk)->icsk_ack.pending |=
+ (ICSK_ACK_NOMEM | ICSK_ACK_NOW);
+ inet_csk_schedule_ack(sk);
+ sk->sk_data_ready(sk);
+
+ if (skb_queue_len(&sk->sk_receive_queue) && skb->len) {
+ reason = SKB_DROP_REASON_PROTO_MEM;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
+ goto drop;
+ }
sk_forced_mem_schedule(sk, skb->truesize);
- else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRCVQDROP);
- goto drop;
}
- eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
if (skb->len)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -4755,7 +5463,9 @@ queue_and_out:
}
if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+ tcp_rcv_spurious_retrans(sk, skb);
/* A retransmit, 2nd most common case. Force an immediate ack. */
+ reason = SKB_DROP_REASON_TCP_OLD_DATA;
NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
@@ -4763,26 +5473,26 @@ out_of_window:
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_schedule_ack(sk);
drop:
- tcp_drop(sk, skb);
+ tcp_drop_reason(sk, skb, reason);
return;
}
/* Out of window. F.e. zero window probe. */
- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
+ if (!before(TCP_SKB_CB(skb)->seq,
+ tp->rcv_nxt + tcp_receive_window(tp))) {
+ reason = SKB_DROP_REASON_TCP_OVERWINDOW;
goto out_of_window;
+ }
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
- SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
- tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(skb)->end_seq);
-
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
/* If window is closed, drop tail of packet. But after
* remembering D-SACK for its head made in previous line.
*/
if (!tcp_receive_window(tp)) {
+ reason = SKB_DROP_REASON_TCP_ZEROWINDOW;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPZEROWINDOWDROP);
goto out_of_window;
}
@@ -4859,6 +5569,9 @@ restart:
for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
n = tcp_skb_next(skb, list);
+ if (!skb_frags_readable(skb))
+ goto skip_this;
+
/* No new bits? It is possible on ofo queue. */
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
skb = tcp_collapse_one(sk, skb, list, root);
@@ -4870,7 +5583,7 @@ restart:
/* The first skb to collapse is:
* - not SYN/FIN and
* - bloated or contains data before "start" or
- * overlaps to the next one.
+ * overlaps to the next one and mptcp allows collapsing.
*/
if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) &&
(tcp_win_from_space(sk, skb->truesize) > skb->len ||
@@ -4879,17 +5592,20 @@ restart:
break;
}
- if (n && n != tail &&
+ if (n && n != tail && skb_frags_readable(n) &&
+ tcp_skb_can_collapse_rx(skb, n) &&
TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
end_of_skbs = false;
break;
}
+skip_this:
/* Decided to skip this, advance start seq. */
start = TCP_SKB_CB(skb)->end_seq;
}
if (end_of_skbs ||
- (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
+ (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) ||
+ !skb_frags_readable(skb))
return;
__skb_queue_head_init(&tmp);
@@ -4903,15 +5619,14 @@ restart:
break;
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
-#ifdef CONFIG_TLS_DEVICE
- nskb->decrypted = skb->decrypted;
-#endif
+ skb_copy_decrypted(nskb, skb);
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
if (list)
__skb_queue_before(list, skb, nskb);
else
__skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */
skb_set_owner_r(nskb, sk);
+ mptcp_skb_ext_move(nskb, skb);
/* Copy data, releasing collapsed skbs. */
while (copy > 0) {
@@ -4931,12 +5646,10 @@ restart:
skb = tcp_collapse_one(sk, skb, list, root);
if (!skb ||
skb == tail ||
- (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
- goto end;
-#ifdef CONFIG_TLS_DEVICE
- if (skb->decrypted != nskb->decrypted)
+ !tcp_skb_can_collapse_rx(nskb, skb) ||
+ (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) ||
+ !skb_frags_readable(skb))
goto end;
-#endif
}
}
}
@@ -4976,7 +5689,7 @@ new_range:
before(TCP_SKB_CB(skb)->end_seq, start)) {
/* Do not attempt collapsing tiny skbs */
if (range_truesize != head->truesize ||
- end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
+ end - start >= SKB_WITH_OVERHEAD(PAGE_SIZE)) {
tcp_collapse(sk, NULL, &tp->out_of_order_queue,
head, skb, start, end);
} else {
@@ -4999,6 +5712,8 @@ new_range:
* Clean the out-of-order queue to make room.
* We drop high sequences packets to :
* 1) Let a chance for holes to be filled.
+ * This means we do not drop packets from ooo queue if their sequence
+ * is before incoming packet sequence.
* 2) not add too big latencies if thousands of packets sit there.
* (But if application shrinks SO_RCVBUF, we could still end up
* freeing whole queue here)
@@ -5006,42 +5721,51 @@ new_range:
*
* Return true if queue has shrunk.
*/
-static bool tcp_prune_ofo_queue(struct sock *sk)
+static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct rb_node *node, *prev;
+ bool pruned = false;
int goal;
if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
return false;
- NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
goal = sk->sk_rcvbuf >> 3;
node = &tp->ooo_last_skb->rbnode;
+
do {
+ struct sk_buff *skb = rb_to_skb(node);
+
+ /* If incoming skb would land last in ofo queue, stop pruning. */
+ if (after(TCP_SKB_CB(in_skb)->seq, TCP_SKB_CB(skb)->seq))
+ break;
+ pruned = true;
prev = rb_prev(node);
rb_erase(node, &tp->out_of_order_queue);
- goal -= rb_to_skb(node)->truesize;
- tcp_drop(sk, rb_to_skb(node));
+ goal -= skb->truesize;
+ tcp_drop_reason(sk, skb, SKB_DROP_REASON_TCP_OFO_QUEUE_PRUNE);
+ tp->ooo_last_skb = rb_to_skb(prev);
if (!prev || goal <= 0) {
- sk_mem_reclaim(sk);
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+ if (tcp_can_ingest(sk, in_skb) &&
!tcp_under_memory_pressure(sk))
break;
goal = sk->sk_rcvbuf >> 3;
}
node = prev;
} while (node);
- tp->ooo_last_skb = rb_to_skb(prev);
- /* Reset SACK state. A conforming SACK implementation will
- * do the same at a timeout based retransmit. When a connection
- * is in a sad state like this, we care only about integrity
- * of the connection not performance.
- */
- if (tp->rx_opt.sack_ok)
- tcp_sack_reset(&tp->rx_opt);
- return true;
+ if (pruned) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+ /* Reset SACK state. A conforming SACK implementation will
+ * do the same at a timeout based retransmit. When a connection
+ * is in a sad state like this, we care only about integrity
+ * of the connection not performance.
+ */
+ if (tp->rx_opt.sack_ok)
+ tcp_sack_reset(&tp->rx_opt);
+ }
+ return pruned;
}
/* Reduce allocated memory if we can, trying to get
@@ -5051,20 +5775,22 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
* until the socket owning process reads some of the data
* to stabilize the situation.
*/
-static int tcp_prune_queue(struct sock *sk)
+static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb)
{
struct tcp_sock *tp = tcp_sk(sk);
- SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
+ /* Do nothing if our queues are empty. */
+ if (!atomic_read(&sk->sk_rmem_alloc))
+ return -1;
NET_INC_STATS(sock_net(sk), LINUX_MIB_PRUNECALLED);
- if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ if (!tcp_can_ingest(sk, in_skb))
tcp_clamp_window(sk);
else if (tcp_under_memory_pressure(sk))
- tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+ tcp_adjust_rcv_ssthresh(sk);
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ if (tcp_can_ingest(sk, in_skb))
return 0;
tcp_collapse_ofo_queue(sk);
@@ -5073,17 +5799,16 @@ static int tcp_prune_queue(struct sock *sk)
skb_peek(&sk->sk_receive_queue),
NULL,
tp->copied_seq, tp->rcv_nxt);
- sk_mem_reclaim(sk);
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ if (tcp_can_ingest(sk, in_skb))
return 0;
/* Collapsing did not help, destructive actions follow.
* This must not ever occur. */
- tcp_prune_ofo_queue(sk);
+ tcp_prune_ofo_queue(sk, in_skb);
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ if (tcp_can_ingest(sk, in_skb))
return 0;
/* If we are really being abused, tell the caller to silently
@@ -5097,7 +5822,7 @@ static int tcp_prune_queue(struct sock *sk)
return -1;
}
-static bool tcp_should_expand_sndbuf(const struct sock *sk)
+static bool tcp_should_expand_sndbuf(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -5108,26 +5833,30 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk)
return false;
/* If we are under global TCP memory pressure, do not expand. */
- if (tcp_under_memory_pressure(sk))
+ if (tcp_under_memory_pressure(sk)) {
+ int unused_mem = sk_unused_reserved_mem(sk);
+
+ /* Adjust sndbuf according to reserved mem. But make sure
+ * it never goes below SOCK_MIN_SNDBUF.
+ * See sk_stream_moderate_sndbuf() for more details.
+ */
+ if (unused_mem > SOCK_MIN_SNDBUF)
+ WRITE_ONCE(sk->sk_sndbuf, unused_mem);
+
return false;
+ }
/* If we are under soft global TCP memory pressure, do not expand. */
if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
return false;
/* If we filled the congestion window, do not expand. */
- if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+ if (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp))
return false;
return true;
}
-/* When incoming ACK allowed to free some skb from write_queue,
- * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
- * on the exit from tcp input handler.
- *
- * PROBLEM: sndbuf expansion does not work well with largesend.
- */
static void tcp_new_space(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -5137,21 +5866,28 @@ static void tcp_new_space(struct sock *sk)
tp->snd_cwnd_stamp = tcp_jiffies32;
}
- sk->sk_write_space(sk);
+ INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk);
}
-static void tcp_check_space(struct sock *sk)
+/* Caller made space either from:
+ * 1) Freeing skbs in rtx queues (after tp->snd_una has advanced)
+ * 2) Sent skbs from output queue (and thus advancing tp->snd_nxt)
+ *
+ * We might be able to generate EPOLLOUT to the application if:
+ * 1) Space consumed in output/rtx queues is below sk->sk_sndbuf/2
+ * 2) notsent amount (tp->write_seq - tp->snd_nxt) became
+ * small enough that tcp_stream_memory_free() decides it
+ * is time to generate EPOLLOUT.
+ */
+void tcp_check_space(struct sock *sk)
{
- if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
- sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
- /* pairs with tcp_poll() */
- smp_mb();
- if (sk->sk_socket &&
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
- tcp_new_space(sk);
- if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
- tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
- }
+ /* pairs with tcp_poll() */
+ smp_mb();
+ if (sk->sk_socket &&
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+ tcp_new_space(sk);
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
}
@@ -5167,7 +5903,9 @@ static inline void tcp_data_snd_check(struct sock *sk)
static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
{
struct tcp_sock *tp = tcp_sk(sk);
- unsigned long rtt, delay;
+ struct net *net = sock_net(sk);
+ unsigned long rtt;
+ u64 delay;
/* More than one full frame received... */
if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
@@ -5182,6 +5920,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
tcp_in_quickack_mode(sk) ||
/* Protocol state mandates a one-time immediate ACK */
inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOW) {
+ /* If we are running from __release_sock() in user context,
+ * Defer the ack until tcp_release_cb().
+ */
+ if (sock_owned_by_user_nocheck(sk) &&
+ READ_ONCE(net->ipv4.sysctl_tcp_backlog_ack_defer)) {
+ set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
+ return;
+ }
send_now:
tcp_send_ack(sk);
return;
@@ -5193,24 +5939,42 @@ send_now:
}
if (!tcp_is_sack(tp) ||
- tp->compressed_ack >= sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)
+ tp->compressed_ack >= READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_nr))
goto send_now;
- tp->compressed_ack++;
+ if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
+ tp->compressed_ack_rcv_nxt = tp->rcv_nxt;
+ tp->dup_ack_counter = 0;
+ }
+ if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) {
+ tp->dup_ack_counter++;
+ goto send_now;
+ }
+ tp->compressed_ack++;
if (hrtimer_is_queued(&tp->compressed_ack_timer))
return;
- /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
+ /* compress ack timer : comp_sack_rtt_percent of rtt,
+ * but no more than tcp_comp_sack_delay_ns.
+ */
rtt = tp->rcv_rtt_est.rtt_us;
if (tp->srtt_us && tp->srtt_us < rtt)
rtt = tp->srtt_us;
- delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns,
- rtt * (NSEC_PER_USEC >> 3)/20);
+ /* delay = (rtt >> 3) * NSEC_PER_USEC * comp_sack_rtt_percent / 100
+ * ->
+ * delay = rtt * 1.25 * comp_sack_rtt_percent
+ */
+ delay = (u64)(rtt + (rtt >> 2)) *
+ READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_rtt_percent);
+
+ delay = min(delay, READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_delay_ns));
+
sock_hold(sk);
- hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay),
- HRTIMER_MODE_REL_PINNED_SOFT);
+ hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
+ READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_slack_ns),
+ HRTIMER_MODE_REL_PINNED_SOFT);
}
static inline void tcp_ack_snd_check(struct sock *sk)
@@ -5237,7 +6001,7 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
struct tcp_sock *tp = tcp_sk(sk);
u32 ptr = ntohs(th->urg_ptr);
- if (ptr && !sock_net(sk)->ipv4.sysctl_tcp_stdurg)
+ if (ptr && !READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_stdurg))
ptr--;
ptr += ntohl(th->seq);
@@ -5290,8 +6054,8 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th)
}
}
- tp->urg_data = TCP_URG_NOTYET;
- tp->urg_seq = ptr;
+ WRITE_ONCE(tp->urg_data, TCP_URG_NOTYET);
+ WRITE_ONCE(tp->urg_seq, ptr);
/* Disable header prediction. */
tp->pred_flags = 0;
@@ -5303,11 +6067,11 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
struct tcp_sock *tp = tcp_sk(sk);
/* Check if we get a new urgent pointer - normally not. */
- if (th->urg)
+ if (unlikely(th->urg))
tcp_check_urg(sk, th);
/* Do we wait for any urgent data? - normally not... */
- if (tp->urg_data == TCP_URG_NOTYET) {
+ if (unlikely(tp->urg_data == TCP_URG_NOTYET)) {
u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
th->syn;
@@ -5316,7 +6080,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
u8 tmp;
if (skb_copy_bits(skb, ptr, &tmp, 1))
BUG();
- tp->urg_data = TCP_URG_VALID | tmp;
+ WRITE_ONCE(tp->urg_data, TCP_URG_VALID | tmp);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk);
}
@@ -5333,7 +6097,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t
*/
static bool tcp_reset_check(const struct sock *sk, const struct sk_buff *skb)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
return unlikely(TCP_SKB_CB(skb)->seq == (tp->rcv_nxt - 1) &&
(1 << sk->sk_state) & (TCPF_CLOSE_WAIT | TCPF_LAST_ACK |
@@ -5347,25 +6111,42 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, int syn_inerr)
{
struct tcp_sock *tp = tcp_sk(sk);
- bool rst_seq_match = false;
+ bool accecn_reflector = false;
+ SKB_DR(reason);
/* RFC1323: H1. Apply PAWS check first. */
- if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
- tp->rx_opt.saw_tstamp &&
- tcp_paws_discard(sk, skb)) {
- if (!th->rst) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
- if (!tcp_oow_rate_limited(sock_net(sk), skb,
- LINUX_MIB_TCPACKSKIPPEDPAWS,
- &tp->last_oow_ack_time))
- tcp_send_dupack(sk, skb);
- goto discard;
- }
- /* Reset is accepted even if it did not pass PAWS. */
+ if (!tcp_fast_parse_options(sock_net(sk), skb, th, tp) ||
+ !tp->rx_opt.saw_tstamp ||
+ tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW))
+ goto step1;
+
+ reason = tcp_disordered_ack_check(sk, skb);
+ if (!reason)
+ goto step1;
+ /* Reset is accepted even if it did not pass PAWS. */
+ if (th->rst)
+ goto step1;
+ if (unlikely(th->syn))
+ goto syn_challenge;
+
+ /* Old ACKs are common, increment PAWS_OLD_ACK
+ * and do not send a dupack.
+ */
+ if (reason == SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWS_OLD_ACK);
+ goto discard;
}
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ if (!tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDPAWS,
+ &tp->last_oow_ack_time))
+ tcp_send_dupack(sk, skb);
+ goto discard;
+step1:
/* Step 1: check sequence number */
- if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
+ reason = tcp_sequence(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+ if (reason) {
/* RFC793, page 37: "In all states except SYN-SENT, all reset
* (RST) segments are validated by checking their SEQ-fields."
* And page 69: "If an incoming segment is not acceptable,
@@ -5375,12 +6156,17 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
if (!th->rst) {
if (th->syn)
goto syn_challenge;
+
+ if (reason == SKB_DROP_REASON_TCP_INVALID_SEQUENCE ||
+ reason == SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE)
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_BEYOND_WINDOW);
if (!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDSEQ,
&tp->last_oow_ack_time))
tcp_send_dupack(sk, skb);
} else if (tcp_reset_check(sk, skb)) {
- tcp_reset(sk);
+ goto reset;
}
goto discard;
}
@@ -5397,9 +6183,10 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
* Send a challenge ACK
*/
if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt ||
- tcp_reset_check(sk, skb)) {
- rst_seq_match = true;
- } else if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
+ tcp_reset_check(sk, skb))
+ goto reset;
+
+ if (tcp_is_sack(tp) && tp->rx_opt.num_sacks > 0) {
struct tcp_sack_block *sp = &tp->selective_acks[0];
int max_sack = sp[0].end_seq;
int this_sack;
@@ -5412,21 +6199,18 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
}
if (TCP_SKB_CB(skb)->seq == max_sack)
- rst_seq_match = true;
+ goto reset;
}
- if (rst_seq_match)
- tcp_reset(sk);
- else {
- /* Disable TFO if RST is out-of-order
- * and no data has been received
- * for current active TFO socket
- */
- if (tp->syn_fastopen && !tp->data_segs_in &&
- sk->sk_state == TCP_ESTABLISHED)
- tcp_fastopen_active_disable(sk);
- tcp_send_challenge_ack(sk, skb);
- }
+ /* Disable TFO if RST is out-of-order
+ * and no data has been received
+ * for current active TFO socket
+ */
+ if (tp->syn_fastopen && !tp->data_segs_in &&
+ sk->sk_state == TCP_ESTABLISHED)
+ tcp_fastopen_active_disable(sk);
+ tcp_send_challenge_ack(sk, false);
+ SKB_DR_SET(reason, TCP_RESET);
goto discard;
}
@@ -5436,18 +6220,42 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
* RFC 5961 4.2 : Send a challenge ack
*/
if (th->syn) {
+ if (tcp_ecn_mode_accecn(tp)) {
+ accecn_reflector = true;
+ if (tp->rx_opt.accecn &&
+ tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn);
+
+ tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
+ tcp_accecn_opt_demand_min(sk, 1);
+ }
+ }
+ if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack &&
+ TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq &&
+ TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt &&
+ TCP_SKB_CB(skb)->ack_seq == tp->snd_nxt)
+ goto pass;
syn_challenge:
if (syn_inerr)
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
- tcp_send_challenge_ack(sk, skb);
+ tcp_send_challenge_ack(sk, accecn_reflector);
+ SKB_DR_SET(reason, TCP_INVALID_SYN);
goto discard;
}
+pass:
+ bpf_skops_parse_hdr(sk, skb);
+
return true;
discard:
- tcp_drop(sk, skb);
+ tcp_drop_reason(sk, skb, reason);
+ return false;
+
+reset:
+ tcp_reset(sk, skb);
+ __kfree_skb(skb);
return false;
}
@@ -5476,6 +6284,7 @@ discard:
*/
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
const struct tcphdr *th = (const struct tcphdr *)skb->data;
struct tcp_sock *tp = tcp_sk(sk);
unsigned int len = skb->len;
@@ -5484,7 +6293,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
trace_tcp_probe(sk, skb);
tcp_mstamp_refresh(tp);
- if (unlikely(!sk->sk_rx_dst))
+ if (unlikely(!rcu_access_pointer(sk->sk_rx_dst)))
inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
/*
* Header prediction.
@@ -5502,6 +6311,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
*/
tp->rx_opt.saw_tstamp = 0;
+ tp->rx_opt.accecn = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_prediction is to be made
@@ -5516,6 +6326,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
int tcp_header_len = tp->tcp_header_len;
+ s32 delta = 0;
+ int flag = 0;
/* Timestamp header prediction: tcp_header_len
* is automatically equal to th->doff*4 due to pred_flags
@@ -5528,8 +6340,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
if (!tcp_parse_aligned_timestamp(tp, th))
goto slow_path;
+ delta = tp->rx_opt.rcv_tsval -
+ tp->rx_opt.ts_recent;
/* If PAWS failed, check it more carefully in slow path */
- if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
+ if (delta < 0)
goto slow_path;
/* DO NOT update ts_recent here, if checksum fails
@@ -5549,12 +6363,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
- tcp_store_ts_recent(tp);
+ flag |= __tcp_replace_ts_recent(tp,
+ delta);
+
+ tcp_ecn_received_counters(sk, skb, 0);
/* We know that such packets are checksummed
* on entry.
*/
- tcp_ack(sk, skb, 0);
+ tcp_ack(sk, skb, flag);
__kfree_skb(skb);
tcp_data_snd_check(sk);
/* When receiving pure ack in fast path, update
@@ -5564,6 +6381,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
tp->rcv_rtt_last_tsecr = tp->rx_opt.rcv_tsecr;
return;
} else { /* Header too small */
+ reason = SKB_DROP_REASON_PKT_TOO_SMALL;
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
@@ -5574,6 +6392,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
if (tcp_checksum_complete(skb))
goto csum_error;
+ if (after(TCP_SKB_CB(skb)->end_seq,
+ tp->rcv_nxt + tcp_receive_window(tp)))
+ goto validate;
+
if ((int)skb->truesize > sk->sk_forward_alloc)
goto step5;
@@ -5584,24 +6406,30 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
if (tcp_header_len ==
(sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
tp->rcv_nxt == tp->rcv_wup)
- tcp_store_ts_recent(tp);
+ flag |= __tcp_replace_ts_recent(tp,
+ delta);
tcp_rcv_rtt_measure_ts(sk, skb);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS);
/* Bulk data transfer: receiver */
- eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
- &fragstolen);
+ tcp_cleanup_skb(skb);
+ __skb_pull(skb, tcp_header_len);
+ tcp_ecn_received_counters(sk, skb,
+ len - tcp_header_len);
+ eaten = tcp_queue_rcv(sk, skb, &fragstolen);
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
/* Well, only one small jumplet in fast path... */
- tcp_ack(sk, skb, FLAG_DATA);
+ tcp_ack(sk, skb, flag | FLAG_DATA);
tcp_data_snd_check(sk);
if (!inet_csk_ack_scheduled(sk))
goto no_ack;
+ } else {
+ tcp_update_wl(tp, TCP_SKB_CB(skb)->seq);
}
__tcp_ack_snd_check(sk, 0);
@@ -5617,20 +6445,26 @@ slow_path:
if (len < (th->doff << 2) || tcp_checksum_complete(skb))
goto csum_error;
- if (!th->ack && !th->rst && !th->syn)
+ if (!th->ack && !th->rst && !th->syn) {
+ reason = SKB_DROP_REASON_TCP_FLAGS;
goto discard;
+ }
/*
* Standard slow path.
*/
-
+validate:
if (!tcp_validate_incoming(sk, skb, th, 1))
return;
step5:
- if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0)
- goto discard;
+ tcp_ecn_received_counters_payload(sk, skb);
+ reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT);
+ if ((int)reason < 0) {
+ reason = -reason;
+ goto discard;
+ }
tcp_rcv_rtt_measure_ts(sk, skb);
/* Process urgent data. */
@@ -5644,19 +6478,50 @@ step5:
return;
csum_error:
+ reason = SKB_DROP_REASON_TCP_CSUM;
+ trace_tcp_bad_csum(skb);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
discard:
- tcp_drop(sk, skb);
+ tcp_drop_reason(sk, skb, reason);
+}
+EXPORT_IPV6_MOD(tcp_rcv_established);
+
+void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tcp_mtup_init(sk);
+ icsk->icsk_af_ops->rebuild_header(sk);
+ tcp_init_metrics(sk);
+
+ /* Initialize the congestion window to start the transfer.
+ * Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
+ * retransmitted. In light of RFC6298 more aggressive 1sec
+ * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
+ * retransmission has occurred.
+ */
+ if (tp->total_retrans > 1 && tp->undo_marker)
+ tcp_snd_cwnd_set(tp, 1);
+ else
+ tcp_snd_cwnd_set(tp, tcp_init_cwnd(tp, __sk_dst_get(sk)));
+ tp->snd_cwnd_stamp = tcp_jiffies32;
+
+ bpf_skops_established(sk, bpf_op, skb);
+ /* Initialize congestion control unless BPF initialized it already: */
+ if (!icsk->icsk_ca_initialized)
+ tcp_init_congestion_control(sk);
+ tcp_init_buffer_space(sk);
}
-EXPORT_SYMBOL(tcp_rcv_established);
void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ tcp_ao_finish_connect(sk, skb);
tcp_set_state(sk, TCP_ESTABLISHED);
icsk->icsk_ack.lrcvtime = tcp_jiffies32;
@@ -5666,7 +6531,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
sk_mark_napi_id(sk, skb);
}
- tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
+ tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, skb);
/* Prevent spurious tcp_cwnd_restart() on first data
* packet.
@@ -5674,7 +6539,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
tp->lsndtime = tcp_jiffies32;
if (sock_flag(sk, SOCK_KEEPOPEN))
- inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp));
+ tcp_reset_keepalive_timer(sk, keepalive_time_when(tp));
if (!tp->rx_opt.snd_wscale)
__tcp_fast_path_on(tp, tp->snd_wnd);
@@ -5690,7 +6555,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
bool syn_drop = false;
- if (mss == tp->rx_opt.user_mss) {
+ if (mss == READ_ONCE(tp->rx_opt.user_mss)) {
struct tcp_options_received opt;
/* Get original SYNACK MSS value if user MSS sets mss_clamp */
@@ -5721,11 +6586,13 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp);
if (data) { /* Retransmit unacked data in SYN */
- skb_rbtree_walk_from(data) {
- if (__tcp_retransmit_skb(sk, data, 1))
- break;
- }
- tcp_rearm_rto(sk);
+ if (tp->total_retrans)
+ tp->fastopen_client_fail = TFO_SYN_RETRANSMITTED;
+ else
+ tp->fastopen_client_fail = TFO_DATA_NOT_ACKED;
+ skb_rbtree_walk_from(data)
+ tcp_mark_skb_lost(sk, data);
+ tcp_non_congestion_loss_retransmit(sk);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
return true;
@@ -5753,6 +6620,21 @@ static void smc_check_reset_syn(struct tcp_sock *tp)
#endif
}
+static void tcp_try_undo_spurious_syn(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 syn_stamp;
+
+ /* undo_marker is set when SYN or SYNACK times out. The timeout is
+ * spurious if the ACK's timestamp option echo value matches the
+ * original SYN timestamp.
+ */
+ syn_stamp = tp->retrans_stamp;
+ if (tp->undo_marker && syn_stamp && tp->rx_opt.saw_tstamp &&
+ syn_stamp == tp->rx_opt.rcv_tsecr)
+ tp->undo_marker = 0;
+}
+
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th)
{
@@ -5761,6 +6643,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcp_fastopen_cookie foc = { .len = -1 };
int saved_clamp = tp->rx_opt.mss_clamp;
bool fastopen_fail;
+ SKB_DR(reason);
tcp_parse_options(sock_net(sk), skb, &tp->rx_opt, 0, &foc);
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
@@ -5776,14 +6659,21 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* the segment and return)"
*/
if (!after(TCP_SKB_CB(skb)->ack_seq, tp->snd_una) ||
- after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt))
+ after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
+ /* Previous FIN/ACK or RST/ACK might be ignored. */
+ if (icsk->icsk_retransmits == 0)
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ TCP_TIMEOUT_MIN, false);
+ SKB_DR_SET(reason, TCP_INVALID_ACK_SEQUENCE);
goto reset_and_undo;
+ }
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
!between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
- tcp_time_stamp(tp))) {
+ tcp_time_stamp_ts(tp))) {
NET_INC_STATS(sock_net(sk),
LINUX_MIB_PAWSACTIVEREJECTED);
+ SKB_DR_SET(reason, TCP_RFC7323_PAWS);
goto reset_and_undo;
}
@@ -5796,8 +6686,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
*/
if (th->rst) {
- tcp_reset(sk);
- goto discard;
+ tcp_reset(sk, skb);
+consume:
+ __kfree_skb(skb);
+ return 0;
}
/* rfc793:
@@ -5807,9 +6699,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* See note below!
* --ANK(990513)
*/
- if (!th->syn)
+ if (!th->syn) {
+ SKB_DR_SET(reason, TCP_FLAGS);
goto discard_and_undo;
-
+ }
/* rfc793:
* "If the SYN bit is on ...
* are acceptable then ...
@@ -5817,15 +6710,18 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* state to ESTABLISHED..."
*/
- tcp_ecn_rcv_synack(tp, th);
+ if (tcp_ecn_mode_any(tp))
+ tcp_ecn_rcv_synack(sk, skb, th,
+ TCP_SKB_CB(skb)->ip_dsfield);
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
+ tcp_try_undo_spurious_syn(sk);
tcp_ack(sk, skb, FLAG_SLOWPATH);
/* Ok.. it's good. Set up sequence numbers and
* move to established.
*/
- tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
@@ -5835,7 +6731,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (!tp->rx_opt.wscale_ok) {
tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
- tp->window_clamp = min(tp->window_clamp, 65535U);
+ WRITE_ONCE(tp->window_clamp,
+ min(tp->window_clamp, 65535U));
}
if (tp->rx_opt.saw_tstamp) {
@@ -5854,7 +6751,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
/* Remember, tcp_poll() does not lock socket!
* Change state from SYN-SENT only after copied_seq
* is initialized. */
- tp->copied_seq = tp->rcv_nxt;
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
smc_check_reset_syn(tp);
@@ -5872,8 +6769,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (fastopen_fail)
return -1;
if (sk->sk_write_pending ||
- icsk->icsk_accept_queue.rskq_defer_accept ||
- icsk->icsk_ack.pingpong) {
+ READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) ||
+ inet_csk_in_pingpong_mode(sk)) {
/* Save one ACK. Data will be ready after
* several ticks, if write_pending is set.
*
@@ -5883,15 +6780,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
*/
inet_csk_schedule_ack(sk);
tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- TCP_DELACK_MAX, TCP_RTO_MAX);
-
-discard:
- tcp_drop(sk, skb);
- return 0;
- } else {
- tcp_send_ack(sk);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_DACK,
+ TCP_DELACK_MAX, false);
+ goto consume;
}
+ tcp_send_ack_reflect_ect(sk, tcp_ecn_mode_accecn(tp));
return -1;
}
@@ -5903,20 +6796,31 @@ discard:
*
* Otherwise (no ACK) drop the segment and return."
*/
-
+ SKB_DR_SET(reason, TCP_RESET);
goto discard_and_undo;
}
/* PAWS check. */
if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
- tcp_paws_reject(&tp->rx_opt, 0))
+ tcp_paws_reject(&tp->rx_opt, 0)) {
+ SKB_DR_SET(reason, TCP_RFC7323_PAWS);
goto discard_and_undo;
-
+ }
if (th->syn) {
/* We see SYN without ACK. It is attempt of
* simultaneous connect with crossed SYNs.
* Particularly, it can be connect to self.
*/
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_info *ao;
+
+ ao = rcu_dereference_protected(tp->ao_info,
+ lockdep_sock_is_held(sk));
+ if (ao) {
+ WRITE_ONCE(ao->risn, th->seq);
+ ao->rcv_sne = 0;
+ }
+#endif
tcp_set_state(sk, TCP_SYN_RECV);
if (tp->rx_opt.saw_tstamp) {
@@ -5928,8 +6832,8 @@ discard:
tp->tcp_header_len = sizeof(struct tcphdr);
}
- tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
- tp->copied_seq = tp->rcv_nxt;
+ WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1);
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
@@ -5939,7 +6843,7 @@ discard:
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tp->max_window = tp->snd_wnd;
- tcp_ecn_rcv_syn(tp, th);
+ tcp_ecn_rcv_syn(tp, th, skb);
tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -5960,7 +6864,7 @@ discard:
*/
return -1;
#else
- goto discard;
+ goto consume;
#endif
}
/* "fifth, if neither of the SYN or RST bits is set then
@@ -5970,12 +6874,55 @@ discard:
discard_and_undo:
tcp_clear_options(&tp->rx_opt);
tp->rx_opt.mss_clamp = saved_clamp;
- goto discard;
+ tcp_drop_reason(sk, skb, reason);
+ return 0;
reset_and_undo:
tcp_clear_options(&tp->rx_opt);
tp->rx_opt.mss_clamp = saved_clamp;
- return 1;
+ /* Reuse @reason as the return value so the caller can handle the exception. */
+ return reason;
+}
+
+static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct request_sock *req;
+
+ /* If we are still handling the SYNACK RTO, see if timestamp ECR allows
+ * undo. If peer SACKs triggered fast recovery, we can't undo here.
+ */
+ if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss && !tp->packets_out)
+ tcp_try_undo_recovery(sk);
+
+ tcp_update_rto_time(tp);
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
+ /* In tcp_fastopen_synack_timer() on the first SYNACK RTO we set
+ * retrans_stamp but don't enter CA_Loss, so in case that happened we
+ * need to zero retrans_stamp here to prevent spurious
+ * retransmits_timed_out(). However, if the ACK of our SYNACK caused us
+ * to enter CA_Recovery then we need to leave retrans_stamp as it was
+ * set entering CA_Recovery, for correct retransmits_timed_out() and
+ * undo behavior.
+ */
+ tcp_retrans_stamp_cleanup(sk);
+
+ /* Once we leave TCP_SYN_RECV or TCP_FIN_WAIT_1,
+ * we no longer need req so release it.
+ */
+ req = rcu_dereference_protected(tp->fastopen_rsk,
+ lockdep_sock_is_held(sk));
+ reqsk_fastopen_remove(sk, req, false);
+
+ /* Re-arm the timer because data may have been sent out.
+ * This is similar to the regular data transmission case
+ * when new data has just been ack'ed.
+ *
+ * (TFO) - we could try to be more aggressive and
+ * retransmit any data sooner based on when it was
+ * sent out.
+ */
+ tcp_rearm_rto(sk);
}
/*
@@ -5985,43 +6932,47 @@ reset_and_undo:
* address independent.
*/
-int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
+enum skb_drop_reason
+tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcphdr *th = tcp_hdr(skb);
struct request_sock *req;
int queued = 0;
- bool acceptable;
+ SKB_DR(reason);
switch (sk->sk_state) {
case TCP_CLOSE:
+ SKB_DR_SET(reason, TCP_CLOSE);
goto discard;
case TCP_LISTEN:
if (th->ack)
- return 1;
+ return SKB_DROP_REASON_TCP_FLAGS;
- if (th->rst)
+ if (th->rst) {
+ SKB_DR_SET(reason, TCP_RESET);
goto discard;
-
+ }
if (th->syn) {
- if (th->fin)
+ if (th->fin) {
+ SKB_DR_SET(reason, TCP_FLAGS);
goto discard;
+ }
/* It is possible that we process SYN packets from backlog,
* so we need to make sure to disable BH and RCU right there.
*/
rcu_read_lock();
local_bh_disable();
- acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
+ icsk->icsk_af_ops->conn_request(sk, skb);
local_bh_enable();
rcu_read_unlock();
- if (!acceptable)
- return 1;
consume_skb(skb);
return 0;
}
+ SKB_DR_SET(reason, TCP_FLAGS);
goto discard;
case TCP_SYN_SENT:
@@ -6040,59 +6991,65 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tcp_mstamp_refresh(tp);
tp->rx_opt.saw_tstamp = 0;
- req = tp->fastopen_rsk;
+ req = rcu_dereference_protected(tp->fastopen_rsk,
+ lockdep_sock_is_held(sk));
if (req) {
bool req_stolen;
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
sk->sk_state != TCP_FIN_WAIT1);
- if (!tcp_check_req(sk, skb, req, true, &req_stolen))
+ SKB_DR_SET(reason, TCP_FASTOPEN);
+ if (!tcp_check_req(sk, skb, req, true, &req_stolen, &reason))
goto discard;
}
- if (!th->ack && !th->rst && !th->syn)
+ if (!th->ack && !th->rst && !th->syn) {
+ SKB_DR_SET(reason, TCP_FLAGS);
goto discard;
-
+ }
if (!tcp_validate_incoming(sk, skb, th, 0))
return 0;
/* step 5: check the ACK field */
- acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
- FLAG_UPDATE_TS_RECENT |
- FLAG_NO_CHALLENGE_ACK) > 0;
-
- if (!acceptable) {
- if (sk->sk_state == TCP_SYN_RECV)
- return 1; /* send one RST */
- tcp_send_challenge_ack(sk, skb);
- goto discard;
+ reason = tcp_ack(sk, skb, FLAG_SLOWPATH |
+ FLAG_UPDATE_TS_RECENT |
+ FLAG_NO_CHALLENGE_ACK);
+
+ if ((int)reason <= 0) {
+ if (sk->sk_state == TCP_SYN_RECV) {
+ /* send one RST */
+ if (!reason)
+ return SKB_DROP_REASON_TCP_OLD_ACK;
+ return -reason;
+ }
+ /* accept old ack during closing */
+ if ((int)reason < 0) {
+ tcp_send_challenge_ack(sk, false);
+ reason = -reason;
+ goto discard;
+ }
}
+ SKB_DR_SET(reason, NOT_SPECIFIED);
switch (sk->sk_state) {
case TCP_SYN_RECV:
tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */
if (!tp->srtt_us)
tcp_synack_rtt_meas(sk, req);
- /* Once we leave TCP_SYN_RECV, we no longer need req
- * so release it.
- */
+ if (tp->rx_opt.tstamp_ok)
+ tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+
if (req) {
- inet_csk(sk)->icsk_retransmits = 0;
- reqsk_fastopen_remove(sk, req, false);
- /* Re-arm the timer because data may have been sent out.
- * This is similar to the regular data transmission case
- * when new data has just been ack'ed.
- *
- * (TFO) - we could try to be more aggressive and
- * retransmitting any data sooner based on when they
- * are sent out.
- */
- tcp_rearm_rto(sk);
+ tcp_rcv_synrecv_state_fastopen(sk);
} else {
- tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB);
- tp->copied_seq = tp->rcv_nxt;
+ tcp_try_undo_spurious_syn(sk);
+ tp->retrans_stamp = 0;
+ tcp_init_transfer(sk, BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB,
+ skb);
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
}
+ tcp_ao_established(sk);
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
sk->sk_state_change(sk);
@@ -6108,9 +7065,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
- if (tp->rx_opt.tstamp_ok)
- tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
-
if (!inet_csk(sk)->icsk_ca_ops->cong_control)
tcp_update_pacing_rate(sk);
@@ -6118,27 +7072,25 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tp->lsndtime = tcp_jiffies32;
tcp_initialize_rcv_mss(sk);
+ if (tcp_ecn_mode_accecn(tp))
+ tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt);
tcp_fast_path_on(tp);
+ if (sk->sk_shutdown & SEND_SHUTDOWN)
+ tcp_shutdown(sk, SEND_SHUTDOWN);
+
break;
case TCP_FIN_WAIT1: {
int tmo;
- /* If we enter the TCP_FIN_WAIT1 state and we are a
- * Fast Open socket and this is the first acceptable
- * ACK we have received, this would have acknowledged
- * our SYNACK so stop the SYNACK timer.
- */
- if (req) {
- /* We no longer need the request sock. */
- reqsk_fastopen_remove(sk, req, false);
- tcp_rearm_rto(sk);
- }
+ if (req)
+ tcp_rcv_synrecv_state_fastopen(sk);
+
if (tp->snd_una != tp->write_seq)
break;
tcp_set_state(sk, TCP_FIN_WAIT2);
- sk->sk_shutdown |= SEND_SHUTDOWN;
+ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | SEND_SHUTDOWN);
sk_dst_confirm(sk);
@@ -6148,10 +7100,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
break;
}
- if (tp->linger2 < 0) {
+ if (READ_ONCE(tp->linger2) < 0) {
tcp_done(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
- return 1;
+ return SKB_DROP_REASON_TCP_ABORT_ON_DATA;
}
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
@@ -6160,12 +7112,12 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tcp_fastopen_active_disable(sk);
tcp_done(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
- return 1;
+ return SKB_DROP_REASON_TCP_ABORT_ON_DATA;
}
tmo = tcp_fin_time(sk);
if (tmo > TCP_TIMEWAIT_LEN) {
- inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+ tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
} else if (th->fin || sock_owned_by_user(sk)) {
/* Bad case. We could lose such FIN otherwise.
* It is not a big problem, but it looks confusing
@@ -6173,10 +7125,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
* if it spins in bh_lock_sock(), but it is really
* marginal case.
*/
- inet_csk_reset_keepalive_timer(sk, tmo);
+ tcp_reset_keepalive_timer(sk, tmo);
} else {
tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
- goto discard;
+ goto consume;
}
break;
}
@@ -6184,7 +7136,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
case TCP_CLOSING:
if (tp->snd_una == tp->write_seq) {
tcp_time_wait(sk, TCP_TIME_WAIT, 0);
- goto discard;
+ goto consume;
}
break;
@@ -6192,7 +7144,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (tp->snd_una == tp->write_seq) {
tcp_update_metrics(sk);
tcp_done(sk);
- goto discard;
+ goto consume;
}
break;
}
@@ -6205,9 +7157,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
case TCP_CLOSE_WAIT:
case TCP_CLOSING:
case TCP_LAST_ACK:
- if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+ if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+ /* If a subflow has been reset, the packet should not
+ * be processed any further; drop it.
+ */
+ if (sk_is_mptcp(sk) && !mptcp_incoming_options(sk, skb))
+ goto discard;
break;
- /* fall through */
+ }
+ fallthrough;
case TCP_FIN_WAIT1:
case TCP_FIN_WAIT2:
/* RFC 793 says to queue data in these states,
@@ -6218,11 +7176,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
- tcp_reset(sk);
- return 1;
+ tcp_reset(sk, skb);
+ return SKB_DROP_REASON_TCP_ABORT_ON_DATA;
}
}
- /* Fall through */
+ fallthrough;
case TCP_ESTABLISHED:
tcp_data_queue(sk, skb);
queued = 1;
@@ -6237,11 +7195,15 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
if (!queued) {
discard:
- tcp_drop(sk, skb);
+ tcp_drop_reason(sk, skb, reason);
}
return 0;
+
+consume:
+ __kfree_skb(skb);
+ return 0;
}
-EXPORT_SYMBOL(tcp_rcv_state_process);
+EXPORT_IPV6_MOD(tcp_rcv_state_process);
static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
{
@@ -6268,6 +7230,11 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
* congestion control: Linux DCTCP asserts ECT on all packets,
* including SYN, which is most optimal solution; however,
* others, such as FreeBSD do not.
+ *
+ * Exception: At least one of the reserved bits of the TCP header (th->res1) is
+ * set, indicating the use of a future TCP extension (such as AccECN). See
+ * RFC8311 §4.3 which updates RFC3168 to allow the development of such
+ * extensions.
*/
static void tcp_ecn_create_request(struct request_sock *req,
const struct sk_buff *skb,
@@ -6280,14 +7247,24 @@ static void tcp_ecn_create_request(struct request_sock *req,
bool ect, ecn_ok;
u32 ecn_ok_dst;
+ if (tcp_accecn_syn_requested(th) &&
+ READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) {
+ inet_rsk(req)->ecn_ok = 1;
+ tcp_rsk(req)->accecn_ok = 1;
+ tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
+ INET_ECN_MASK;
+ return;
+ }
+
if (!th_ecn)
return;
ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
- ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
+ ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst;
- if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
+ if (((!ect || th->res1 || th->ae) && ecn_ok) ||
+ tcp_ca_needs_ecn(listen_sk) ||
(ecn_ok_dst & DST_FEATURE_ECN_CA) ||
tcp_bpf_ca_needs_ecn((struct sock *)req))
inet_rsk(req)->ecn_ok = 1;
@@ -6300,11 +7277,16 @@ static void tcp_openreq_init(struct request_sock *req,
struct inet_request_sock *ireq = inet_rsk(req);
req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
- req->cookie_ts = 0;
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
- tcp_rsk(req)->snt_synack = tcp_clock_us();
+ tcp_rsk(req)->snt_synack = 0;
+ tcp_rsk(req)->snt_tsval_first = 0;
tcp_rsk(req)->last_oow_ack_time = 0;
+ tcp_rsk(req)->accecn_ok = 0;
+ tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
+ tcp_rsk(req)->accecn_fail_mode = 0;
+ tcp_rsk(req)->syn_ect_rcv = 0;
+ tcp_rsk(req)->syn_ect_snt = 0;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
ireq->tstamp_ok = rx_opt->tstamp_ok;
@@ -6317,48 +7299,26 @@ static void tcp_openreq_init(struct request_sock *req,
ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
ireq->ir_mark = inet_request_mark(sk, skb);
#if IS_ENABLED(CONFIG_SMC)
- ireq->smc_ok = rx_opt->smc_ok;
-#endif
-}
-
-struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
- struct sock *sk_listener,
- bool attach_listener)
-{
- struct request_sock *req = reqsk_alloc(ops, sk_listener,
- attach_listener);
-
- if (req) {
- struct inet_request_sock *ireq = inet_rsk(req);
-
- ireq->ireq_opt = NULL;
-#if IS_ENABLED(CONFIG_IPV6)
- ireq->pktopts = NULL;
+ ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested &&
+ tcp_sk(sk)->smc_hs_congested(sk));
#endif
- atomic64_set(&ireq->ir_cookie, 0);
- ireq->ireq_state = TCP_NEW_SYN_RECV;
- write_pnet(&ireq->ireq_net, sock_net(sk_listener));
- ireq->ireq_family = sk_listener->sk_family;
- }
-
- return req;
}
-EXPORT_SYMBOL(inet_reqsk_alloc);
/*
* Return true if a syncookie should be sent
*/
-static bool tcp_syn_flood_action(const struct sock *sk,
- const struct sk_buff *skb,
- const char *proto)
+static bool tcp_syn_flood_action(struct sock *sk, const char *proto)
{
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = "Dropping request";
- bool want_cookie = false;
struct net *net = sock_net(sk);
+ bool want_cookie = false;
+ u8 syncookies;
+
+ syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
#ifdef CONFIG_SYN_COOKIES
- if (net->ipv4.sysctl_tcp_syncookies) {
+ if (syncookies) {
msg = "Sending cookies";
want_cookie = true;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
@@ -6366,11 +7326,18 @@ static bool tcp_syn_flood_action(const struct sock *sk,
#endif
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
- if (!queue->synflood_warned &&
- net->ipv4.sysctl_tcp_syncookies != 2 &&
- xchg(&queue->synflood_warned, 1) == 0)
- net_info_ratelimited("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
- proto, ntohs(tcp_hdr(skb)->dest), msg);
+ if (syncookies != 2 && !READ_ONCE(queue->synflood_warned)) {
+ WRITE_ONCE(queue->synflood_warned, 1);
+ if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) {
+ net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n",
+ proto, inet6_rcv_saddr(sk),
+ sk->sk_num, msg);
+ } else {
+ net_info_ratelimited("%s: Possible SYN flooding on port %pI4:%u. %s.\n",
+ proto, &sk->sk_rcv_saddr,
+ sk->sk_num, msg);
+ }
+ }
return want_cookie;
}
@@ -6381,41 +7348,97 @@ static void tcp_reqsk_record_syn(const struct sock *sk,
{
if (tcp_sk(sk)->save_syn) {
u32 len = skb_network_header_len(skb) + tcp_hdrlen(skb);
- u32 *copy;
+ struct saved_syn *saved_syn;
+ u32 mac_hdrlen;
+ void *base;
+
+ if (tcp_sk(sk)->save_syn == 2) { /* Save full header. */
+ base = skb_mac_header(skb);
+ mac_hdrlen = skb_mac_header_len(skb);
+ len += mac_hdrlen;
+ } else {
+ base = skb_network_header(skb);
+ mac_hdrlen = 0;
+ }
- copy = kmalloc(len + sizeof(u32), GFP_ATOMIC);
- if (copy) {
- copy[0] = len;
- memcpy(&copy[1], skb_network_header(skb), len);
- req->saved_syn = copy;
+ saved_syn = kmalloc(struct_size(saved_syn, data, len),
+ GFP_ATOMIC);
+ if (saved_syn) {
+ saved_syn->mac_hdrlen = mac_hdrlen;
+ saved_syn->network_hdrlen = skb_network_header_len(skb);
+ saved_syn->tcp_hdrlen = tcp_hdrlen(skb);
+ memcpy(saved_syn->data, base, len);
+ req->saved_syn = saved_syn;
}
}
}
+/* If a SYN cookie is required and supported, returns a clamped MSS value to be
+ * used for SYN cookie generation.
+ */
+u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
+ const struct tcp_request_sock_ops *af_ops,
+ struct sock *sk, struct tcphdr *th)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 mss;
+
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_syncookies) != 2 &&
+ !inet_csk_reqsk_queue_is_full(sk))
+ return 0;
+
+ if (!tcp_syn_flood_action(sk, rsk_ops->slab_name))
+ return 0;
+
+ if (sk_acceptq_is_full(sk)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+ return 0;
+ }
+
+ mss = tcp_parse_mss_option(th, READ_ONCE(tp->rx_opt.user_mss));
+ if (!mss)
+ mss = af_ops->mss_clamp;
+
+ return mss;
+}
+EXPORT_IPV6_MOD_GPL(tcp_get_syncookie_mss);
+
int tcp_conn_request(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb)
{
struct tcp_fastopen_cookie foc = { .len = -1 };
- __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
struct tcp_options_received tmp_opt;
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sock *fastopen_sk = NULL;
struct request_sock *req;
bool want_cookie = false;
struct dst_entry *dst;
struct flowi fl;
+ u8 syncookies;
+ u32 isn;
- /* TW buckets are converted to open requests without
- * limitations, they conserve resources and peer is
- * evidently real one.
- */
- if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
- inet_csk_reqsk_queue_is_full(sk)) && !isn) {
- want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
- if (!want_cookie)
- goto drop;
+#ifdef CONFIG_TCP_AO
+ const struct tcp_ao_hdr *aoh;
+#endif
+
+ isn = __this_cpu_read(tcp_tw_isn);
+ if (isn) {
+ /* TW buckets are converted to open requests without
+ * limitations, they conserve resources and peer is
+ * evidently real one.
+ */
+ __this_cpu_write(tcp_tw_isn, 0);
+ } else {
+ syncookies = READ_ONCE(net->ipv4.sysctl_tcp_syncookies);
+
+ if (syncookies == 2 || inet_csk_reqsk_queue_is_full(sk)) {
+ want_cookie = tcp_syn_flood_action(sk,
+ rsk_ops->slab_name);
+ if (!want_cookie)
+ goto drop;
+ }
}
if (sk_acceptq_is_full(sk)) {
@@ -6427,12 +7450,17 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (!req)
goto drop;
+ req->syncookie = want_cookie;
tcp_rsk(req)->af_specific = af_ops;
tcp_rsk(req)->ts_off = 0;
+ tcp_rsk(req)->req_usec_ts = false;
+#if IS_ENABLED(CONFIG_MPTCP)
+ tcp_rsk(req)->is_mptcp = 0;
+#endif
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
- tmp_opt.user_mss = tp->rx_opt.user_mss;
+ tmp_opt.user_mss = READ_ONCE(tp->rx_opt.user_mss);
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
want_cookie ? NULL : &foc);
@@ -6444,28 +7472,26 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
tcp_openreq_init(req, &tmp_opt, skb, sk);
- inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;
+ inet_rsk(req)->no_srccheck = inet_test_bit(TRANSPARENT, sk);
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
- af_ops->init_req(req, sk, skb);
-
- if (security_inet_conn_request(sk, skb, req))
- goto drop_and_free;
-
- if (tmp_opt.tstamp_ok)
- tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
-
- dst = af_ops->route_req(sk, &fl, req);
+ dst = af_ops->route_req(sk, skb, &fl, req, isn);
if (!dst)
goto drop_and_free;
+ if (tmp_opt.tstamp_ok) {
+ tcp_rsk(req)->req_usec_ts = dst_tcp_usec_ts(dst);
+ tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
+ }
if (!want_cookie && !isn) {
+ int max_syn_backlog = READ_ONCE(net->ipv4.sysctl_max_syn_backlog);
+
/* Kill the following clause, if you dislike this way. */
- if (!net->ipv4.sysctl_tcp_syncookies &&
- (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
- (net->ipv4.sysctl_max_syn_backlog >> 2)) &&
+ if (!syncookies &&
+ (max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
+ (max_syn_backlog >> 2)) &&
!tcp_peer_is_proven(req, dst)) {
/* Without syncookies last quarter of
* backlog is filled with destinations,
@@ -6486,13 +7512,25 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (want_cookie) {
isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
- req->cookie_ts = tmp_opt.tstamp_ok;
if (!tmp_opt.tstamp_ok)
inet_rsk(req)->ecn_ok = 0;
}
+#ifdef CONFIG_TCP_AO
+ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+ goto drop_and_release; /* Invalid TCP options */
+ if (aoh) {
+ tcp_rsk(req)->used_tcp_ao = true;
+ tcp_rsk(req)->ao_rcv_next = aoh->keyid;
+ tcp_rsk(req)->ao_keyid = aoh->rnext_keyid;
+
+ } else {
+ tcp_rsk(req)->used_tcp_ao = false;
+ }
+#endif
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
+ tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
tcp_openreq_init_rwin(req, sk, dst);
sk_rx_queue_set(req_to_sk(req), skb);
if (!want_cookie) {
@@ -6501,20 +7539,28 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
}
if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
- &foc, TCP_SYNACK_FASTOPEN);
+ &foc, TCP_SYNACK_FASTOPEN, skb);
/* Add the child socket directly into the accept queue */
- inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
+ if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
+ bh_unlock_sock(fastopen_sk);
+ sock_put(fastopen_sk);
+ goto drop_and_free;
+ }
sk->sk_data_ready(sk);
bh_unlock_sock(fastopen_sk);
sock_put(fastopen_sk);
} else {
tcp_rsk(req)->tfo_listener = false;
- if (!want_cookie)
- inet_csk_reqsk_queue_hash_add(sk, req,
- tcp_timeout_init((struct sock *)req));
+ if (!want_cookie &&
+ unlikely(!inet_csk_reqsk_queue_hash_add(sk, req))) {
+ reqsk_free(req);
+ dst_release(dst);
+ return 0;
+ }
af_ops->send_synack(sk, dst, &fl, req, &foc,
!want_cookie ? TCP_SYNACK_NORMAL :
- TCP_SYNACK_COOKIE);
+ TCP_SYNACK_COOKIE,
+ skb);
if (want_cookie) {
reqsk_free(req);
return 0;
@@ -6526,9 +7572,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
drop_and_release:
dst_release(dst);
drop_and_free:
- reqsk_free(req);
+ __reqsk_free(req);
drop:
tcp_listendrop(sk);
return 0;
}
-EXPORT_SYMBOL(tcp_conn_request);
+EXPORT_IPV6_MOD(tcp_conn_request);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index cd426313a298..f8a9596e8f4d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -7,18 +8,12 @@
*
* IPv4 specific functions
*
- *
* code split from:
* linux/ipv4/tcp.c
* linux/ipv4/tcp_input.c
* linux/ipv4/tcp_output.c
*
* See tcp.c for author information
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/*
@@ -58,22 +53,30 @@
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
+#include <linux/fips.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/sock_diag.h>
+#include <net/aligned_data.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
+#include <net/inet_ecn.h>
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
#include <net/busy_poll.h>
+#include <net/rstreason.h>
+#include <net/psp.h>
#include <linux/inet.h>
#include <linux/ipv6.h>
@@ -81,19 +84,25 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/inetdevice.h>
+#include <linux/btf_ids.h>
+#include <linux/skbuff_ref.h>
-#include <crypto/hash.h>
-#include <linux/scatterlist.h>
+#include <crypto/md5.h>
#include <trace/events/tcp.h>
#ifdef CONFIG_TCP_MD5SIG
-static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
- __be32 daddr, __be32 saddr, const struct tcphdr *th);
+static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ __be32 daddr, __be32 saddr, const struct tcphdr *th);
#endif
struct inet_hashinfo tcp_hashinfo;
-EXPORT_SYMBOL(tcp_hashinfo);
+
+static DEFINE_PER_CPU(struct sock_bh_locked, ipv4_tcp_sk) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
+
+static DEFINE_MUTEX(tcp_exit_batch_mutex);
static u32 tcp_v4_init_seq(const struct sk_buff *skb)
{
@@ -110,10 +119,15 @@ static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
{
+ int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
const struct inet_timewait_sock *tw = inet_twsk(sktw);
const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
struct tcp_sock *tp = tcp_sk(sk);
- int reuse = sock_net(sk)->ipv4.sysctl_tcp_tw_reuse;
+ int ts_recent_stamp;
+ u32 reuse_thresh;
+
+ if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
+ reuse = 0;
if (reuse == 2) {
/* Still does not detect *everything* that goes through
@@ -126,11 +140,9 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == AF_INET6) {
if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
- (ipv6_addr_v4mapped(&tw->tw_v6_daddr) &&
- (tw->tw_v6_daddr.s6_addr[12] == 127)) ||
+ ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
- (ipv6_addr_v4mapped(&tw->tw_v6_rcv_saddr) &&
- (tw->tw_v6_rcv_saddr.s6_addr[12] == 127)))
+ ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
loopback = true;
} else
#endif
@@ -154,9 +166,17 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
If TW bucket has been already destroyed we fall back to VJ's scheme
and use initial timestamp retrieved from peer table.
*/
- if (tcptw->tw_ts_recent_stamp &&
- (!twp || (reuse && time_after32(ktime_get_seconds(),
- tcptw->tw_ts_recent_stamp)))) {
+ ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
+ reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
+ if (ts_recent_stamp &&
+ (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
+ /* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
+ * and releasing the bucket lock.
+ */
+ if (unlikely(!refcount_inc_not_zero(&sktw->sk_refcnt)))
+ return 0;
+
/* In case of repair and re-using TIME-WAIT sockets we still
* want to be sure that it is safe as above but honor the
* sequence numbers and time stamps set as part of the repair
@@ -169,21 +189,23 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
* without appearing to create any others.
*/
if (likely(!tp->repair)) {
- tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
- if (tp->write_seq == 0)
- tp->write_seq = 1;
- tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
- tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+ u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
+
+ if (!seq)
+ seq = 1;
+ WRITE_ONCE(tp->write_seq, seq);
+ tp->rx_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
+ tp->rx_opt.ts_recent_stamp = ts_recent_stamp;
}
- sock_hold(sktw);
+
return 1;
}
return 0;
}
-EXPORT_SYMBOL_GPL(tcp_twsk_unique);
+EXPORT_IPV6_MOD_GPL(tcp_twsk_unique);
-static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
int addr_len)
{
/* This check is replicated from tcp_v4_connect() and intended to
@@ -195,22 +217,23 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
sock_owned_by_me(sk);
- return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
+ return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
}
/* This will initiate an outgoing connection. */
-int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+ struct inet_timewait_death_row *tcp_death_row;
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct ip_options_rcu *inet_opt;
+ struct net *net = sock_net(sk);
__be16 orig_sport, orig_dport;
__be32 daddr, nexthop;
struct flowi4 *fl4;
struct rtable *rt;
int err;
- struct ip_options_rcu *inet_opt;
- struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
@@ -231,13 +254,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
orig_dport = usin->sin_port;
fl4 = &inet->cork.fl.u.ip4;
rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
- RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
- IPPROTO_TCP,
- orig_sport, orig_dport, sk);
+ sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
+ orig_dport, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
if (err == -ENETUNREACH)
- IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
return err;
}
@@ -249,24 +271,32 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (!inet_opt || !inet_opt->opt.srr)
daddr = fl4->daddr;
- if (!inet->inet_saddr)
- inet->inet_saddr = fl4->saddr;
- sk_rcv_saddr_set(sk, inet->inet_saddr);
+ tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
+
+ if (!inet->inet_saddr) {
+ err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET);
+ if (err) {
+ ip_rt_put(rt);
+ return err;
+ }
+ } else {
+ sk_rcv_saddr_set(sk, inet->inet_saddr);
+ }
if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
/* Reset inherited state */
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
if (likely(!tp->repair))
- tp->write_seq = 0;
+ WRITE_ONCE(tp->write_seq, 0);
}
inet->inet_dport = usin->sin_port;
sk_daddr_set(sk, daddr);
- inet_csk(sk)->icsk_ext_hdr_len = 0;
+ inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
if (inet_opt)
- inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
+ inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;
tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
@@ -289,6 +319,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
rt = NULL;
goto failure;
}
+ tp->tcp_usec_ts = dst_tcp_usec_ts(&rt->dst);
/* OK, now commit destination to socket. */
sk->sk_gso_type = SKB_GSO_TCPV4;
sk_setup_caps(sk, &rt->dst);
@@ -296,16 +327,17 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (likely(!tp->repair)) {
if (!tp->write_seq)
- tp->write_seq = secure_tcp_seq(inet->inet_saddr,
- inet->inet_daddr,
- inet->inet_sport,
- usin->sin_port);
- tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
- inet->inet_saddr,
- inet->inet_daddr);
+ WRITE_ONCE(tp->write_seq,
+ secure_tcp_seq(inet->inet_saddr,
+ inet->inet_daddr,
+ inet->inet_sport,
+ usin->sin_port));
+ WRITE_ONCE(tp->tsoffset,
+ secure_tcp_ts_off(net, inet->inet_saddr,
+ inet->inet_daddr));
}
- inet->inet_id = tp->write_seq ^ jiffies;
+ atomic_set(&inet->inet_id, get_random_u16());
if (tcp_fastopen_defer_connect(sk, &err))
return err;
@@ -325,12 +357,13 @@ failure:
* if necessary.
*/
tcp_set_state(sk, TCP_CLOSE);
+ inet_bhash2_reset_saddr(sk);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->inet_dport = 0;
return err;
}
-EXPORT_SYMBOL(tcp_v4_connect);
+EXPORT_IPV6_MOD(tcp_v4_connect);
/*
* This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
@@ -345,7 +378,7 @@ void tcp_v4_mtu_reduced(struct sock *sk)
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
return;
- mtu = tcp_sk(sk)->mtu_info;
+ mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
dst = inet_csk_update_pmtu(sk, mtu);
if (!dst)
return;
@@ -354,7 +387,7 @@ void tcp_v4_mtu_reduced(struct sock *sk)
* for the case, if this connection will not able to recover.
*/
if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
- sk->sk_err_soft = EMSGSIZE;
+ WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
mtu = dst_mtu(dst);
@@ -371,7 +404,7 @@ void tcp_v4_mtu_reduced(struct sock *sk)
tcp_simple_retransmit(sk);
} /* else let the usual retransmit timer handle it */
}
-EXPORT_SYMBOL(tcp_v4_mtu_reduced);
+EXPORT_IPV6_MOD(tcp_v4_mtu_reduced);
static void do_redirect(struct sk_buff *skb, struct sock *sk)
{
@@ -405,7 +438,46 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort)
}
reqsk_put(req);
}
-EXPORT_SYMBOL(tcp_req_err);
+EXPORT_IPV6_MOD(tcp_req_err);
+
+/* TCP-LD (RFC 6069) logic: revert one RTO backoff step, then re-arm or run the RTO timer */
+void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+ s32 remaining; /* reverted RTO minus elapsed time; signed so it can go <= 0 */
+ u32 delta_us; /* time since the head skb's timestamp (presumably usec, per the _us helpers) */
+
+ if (sock_owned_by_user(sk)) /* leave the socket alone while it is reported as owned */
+ return;
+
+ if (seq != tp->snd_una || !icsk->icsk_retransmits || /* act only when seq == snd_una and */
+ !icsk->icsk_backoff) /* both retransmits and backoff are non-zero */
+ return;
+
+ skb = tcp_rtx_queue_head(sk);
+ if (WARN_ON_ONCE(!skb)) /* no skb at the rtx queue head: warn once and give up */
+ return;
+
+ icsk->icsk_backoff--; /* undo one backoff step */
+ icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; /* initial timeout if srtt_us is 0 */
+ icsk->icsk_rto = inet_csk_rto_backoff(icsk, tcp_rto_max(sk)); /* recompute with the decremented backoff */
+
+ tcp_mstamp_refresh(tp); /* refresh tp->tcp_mstamp before measuring elapsed time */
+ delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
+ remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); /* elapsed time converted to jiffies first */
+
+ if (remaining > 0) { /* reverted RTO not yet expired: re-arm for the time left */
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, false);
+ } else {
+ /* RTO revert clocked out retransmission.
+ * Will retransmit now.
+ */
+ tcp_retransmit_timer(sk);
+ }
+}
+EXPORT_IPV6_MOD(tcp_ld_RTO_revert);
/*
* This routine is called by the ICMP module when it gets some
@@ -423,43 +495,45 @@ EXPORT_SYMBOL(tcp_req_err);
*
*/
-void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
+int tcp_v4_err(struct sk_buff *skb, u32 info)
{
- const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
- struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
- struct inet_connection_sock *icsk;
- struct tcp_sock *tp;
- struct inet_sock *inet;
- const int type = icmp_hdr(icmp_skb)->type;
- const int code = icmp_hdr(icmp_skb)->code;
- struct sock *sk;
- struct sk_buff *skb;
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
+ struct net *net = dev_net_rcu(skb->dev);
+ const int type = icmp_hdr(skb)->type;
+ const int code = icmp_hdr(skb)->code;
struct request_sock *fastopen;
+ struct tcp_sock *tp;
u32 seq, snd_una;
- s32 remaining;
- u32 delta_us;
+ struct sock *sk;
int err;
- struct net *net = dev_net(icmp_skb->dev);
- sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
- th->dest, iph->saddr, ntohs(th->source),
- inet_iif(icmp_skb), 0);
+ sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
+ ntohs(th->source), inet_iif(skb), 0);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return;
+ return -ENOENT;
}
if (sk->sk_state == TCP_TIME_WAIT) {
+ /* To increase the counter of ignored icmps for TCP-AO */
+ tcp_ao_ignore_icmp(sk, AF_INET, type, code);
inet_twsk_put(inet_twsk(sk));
- return;
+ return 0;
}
seq = ntohl(th->seq);
- if (sk->sk_state == TCP_NEW_SYN_RECV)
- return tcp_req_err(sk, seq,
- type == ICMP_PARAMETERPROB ||
- type == ICMP_TIME_EXCEEDED ||
- (type == ICMP_DEST_UNREACH &&
- (code == ICMP_NET_UNREACH ||
- code == ICMP_HOST_UNREACH)));
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
+ type == ICMP_TIME_EXCEEDED ||
+ (type == ICMP_DEST_UNREACH &&
+ (code == ICMP_NET_UNREACH ||
+ code == ICMP_HOST_UNREACH)));
+ return 0;
+ }
+
+ if (tcp_ao_ignore_icmp(sk, AF_INET, type, code)) {
+ sock_put(sk);
+ return 0;
+ }
bh_lock_sock(sk);
/* If too many ICMPs get dropped on busy
@@ -474,15 +548,17 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (sk->sk_state == TCP_CLOSE)
goto out;
- if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
- __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
- goto out;
+ if (static_branch_unlikely(&ip4_min_ttl)) {
+ /* min_ttl can be changed concurrently from do_ip_setsockopt() */
+ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+ goto out;
+ }
}
- icsk = inet_csk(sk);
tp = tcp_sk(sk);
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
- fastopen = tp->fastopen_rsk;
+ fastopen = rcu_dereference(tp->fastopen_rsk);
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
if (sk->sk_state != TCP_LISTEN &&
!between(seq, snd_una, tp->snd_nxt)) {
@@ -493,7 +569,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
switch (type) {
case ICMP_REDIRECT:
if (!sock_owned_by_user(sk))
- do_redirect(icmp_skb, sk);
+ do_redirect(skb, sk);
goto out;
case ICMP_SOURCE_QUENCH:
/* Just silently ignore these. */
@@ -513,7 +589,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (sk->sk_state == TCP_LISTEN)
goto out;
- tp->mtu_info = info;
+ WRITE_ONCE(tp->mtu_info, info);
if (!sock_owned_by_user(sk)) {
tcp_v4_mtu_reduced(sk);
} else {
@@ -524,39 +600,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
}
err = icmp_err_convert[code].errno;
- /* check if icmp_skb allows revert of backoff
- * (see draft-zimmermann-tcp-lcd) */
- if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
- break;
- if (seq != tp->snd_una || !icsk->icsk_retransmits ||
- !icsk->icsk_backoff || fastopen)
- break;
-
- if (sock_owned_by_user(sk))
- break;
-
- icsk->icsk_backoff--;
- icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) :
- TCP_TIMEOUT_INIT;
- icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
-
- skb = tcp_rtx_queue_head(sk);
- BUG_ON(!skb);
-
- tcp_mstamp_refresh(tp);
- delta_us = (u32)(tp->tcp_mstamp - skb->skb_mstamp);
- remaining = icsk->icsk_rto -
- usecs_to_jiffies(delta_us);
-
- if (remaining > 0) {
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- remaining, TCP_RTO_MAX);
- } else {
- /* RTO revert clocked out retransmission.
- * Will retransmit now */
- tcp_retransmit_timer(sk);
- }
-
+ /* check if this ICMP message allows revert of backoff.
+ * (see RFC 6069)
+ */
+ if (!fastopen &&
+ (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
+ tcp_ld_RTO_revert(sk, seq);
break;
case ICMP_TIME_EXCEEDED:
err = EHOSTUNREACH;
@@ -569,20 +618,17 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
case TCP_SYN_SENT:
case TCP_SYN_RECV:
/* Only in fast or simultaneous open. If a fast open socket is
- * is already accepted it is treated as a connected one below.
+ * already accepted it is treated as a connected one below.
*/
if (fastopen && !fastopen->sk)
break;
- if (!sock_owned_by_user(sk)) {
- sk->sk_err = err;
-
- sk->sk_error_report(sk);
+ ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
- tcp_done(sk);
- } else {
- sk->sk_err_soft = err;
- }
+ if (!sock_owned_by_user(sk))
+ tcp_done_with_error(sk, err);
+ else
+ WRITE_ONCE(sk->sk_err_soft, err);
goto out;
}
@@ -602,17 +648,18 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
* --ANK (980905)
*/
- inet = inet_sk(sk);
- if (!sock_owned_by_user(sk) && inet->recverr) {
- sk->sk_err = err;
- sk->sk_error_report(sk);
+ if (!sock_owned_by_user(sk) &&
+ inet_test_bit(RECVERR, sk)) {
+ WRITE_ONCE(sk->sk_err, err);
+ sk_error_report(sk);
} else { /* Only an error on timeout */
- sk->sk_err_soft = err;
+ WRITE_ONCE(sk->sk_err_soft, err);
}
out:
bh_unlock_sock(sk);
sock_put(sk);
+ return 0;
}
void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
@@ -631,7 +678,53 @@ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
}
-EXPORT_SYMBOL(tcp_v4_send_check);
+EXPORT_IPV6_MOD(tcp_v4_send_check);
+
+#define REPLY_OPTIONS_LEN (MAX_TCP_OPTION_SPACE / sizeof(__be32))
+
+static bool tcp_v4_ao_sign_reset(const struct sock *sk, struct sk_buff *skb,
+ const struct tcp_ao_hdr *aoh,
+ struct ip_reply_arg *arg, struct tcphdr *reply,
+ __be32 reply_options[REPLY_OPTIONS_LEN])
+{ /* Add a TCP-AO option to the reset being built in @reply/@arg; returns true when the reply should be dropped */
+#ifdef CONFIG_TCP_AO
+ int sdif = tcp_v4_sdif(skb);
+ int dif = inet_iif(skb);
+ int l3index = sdif ? dif : 0; /* dif is used as the L3 index only when sdif is set */
+ bool allocated_traffic_key; /* NOTE(review): no initializer, yet read at out: on the failure path - confirm tcp_ao_prepare_reset() always writes it */
+ struct tcp_ao_key *key;
+ char *traffic_key;
+ bool drop = true; /* stays true unless every step below succeeds */
+ u32 ao_sne = 0;
+ u8 keyid;
+
+ rcu_read_lock();
+ if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, ntohl(reply->seq),
+ &key, &traffic_key, &allocated_traffic_key,
+ &keyid, &ao_sne))
+ goto out; /* non-zero return: leave drop == true */
+
+ reply_options[0] = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key) << 16) | /* first option word: kind, length, */
+ (aoh->rnext_keyid << 8) | keyid); /* the peer's rnext_keyid, then keyid */
+ arg->iov[0].iov_len += tcp_ao_len_aligned(key); /* grow the reply by the aligned option length */
+ reply->doff = arg->iov[0].iov_len / 4; /* header length in 32-bit words */
+
+ if (tcp_ao_hash_hdr(AF_INET, (char *)&reply_options[1], /* output starts right after the first option word */
+ key, traffic_key,
+ (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
+ (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
+ reply, ao_sne))
+ goto out;
+ drop = false;
+out:
+ rcu_read_unlock();
+ if (allocated_traffic_key) /* free traffic_key only when flagged as allocated */
+ kfree(traffic_key);
+ return drop;
+#else /* !CONFIG_TCP_AO */
+ return true;
+#endif /* CONFIG_TCP_AO */
+}
/*
* This routine will send an RST to the other tcp.
@@ -646,25 +739,26 @@ EXPORT_SYMBOL(tcp_v4_send_check);
* Exception: precedence violation. We do not implement it in any case.
*/
-static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
+static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
+ enum sk_rst_reason reason)
{
const struct tcphdr *th = tcp_hdr(skb);
struct {
struct tcphdr th;
-#ifdef CONFIG_TCP_MD5SIG
- __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
-#endif
+ __be32 opt[REPLY_OPTIONS_LEN];
} rep;
+ const __u8 *md5_hash_location = NULL;
+ const struct tcp_ao_hdr *aoh;
struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *key = NULL;
- const __u8 *hash_location = NULL;
unsigned char newhash[16];
- int genhash;
struct sock *sk1 = NULL;
#endif
- struct net *net;
+ u64 transmit_time = 0;
struct sock *ctl_sk;
+ struct net *net;
+ u32 txhash = 0;
/* Never send a reset in response to a reset. */
if (th->rst)
@@ -695,14 +789,33 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
arg.iov[0].iov_base = (unsigned char *)&rep;
arg.iov[0].iov_len = sizeof(rep.th);
- net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
+ net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
+
+ /* Invalid TCP option size or twice included auth */
+ if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh))
+ return;
+
+ if (aoh && tcp_v4_ao_sign_reset(sk, skb, aoh, &arg, &rep.th, rep.opt))
+ return;
+
#ifdef CONFIG_TCP_MD5SIG
rcu_read_lock();
- hash_location = tcp_parse_md5sig_option(th);
if (sk && sk_fullsock(sk)) {
- key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
- &ip_hdr(skb)->saddr, AF_INET);
- } else if (hash_location) {
+ const union tcp_md5_addr *addr;
+ int l3index;
+
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and inet_iif is set to it.
+ */
+ l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
+ } else if (md5_hash_location) {
+ const union tcp_md5_addr *addr;
+ int sdif = tcp_v4_sdif(skb);
+ int dif = inet_iif(skb);
+ int l3index;
+
/*
* active side is lost. Try to find listening socket through
* source port, and then find md5 key through listening socket.
@@ -710,25 +823,25 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
- ip_hdr(skb)->saddr,
+ sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
th->source, ip_hdr(skb)->daddr,
- ntohs(th->source), inet_iif(skb),
- tcp_v4_sdif(skb));
+ ntohs(th->source), dif, sdif);
/* don't send rst if it can't find key */
if (!sk1)
goto out;
- key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
- &ip_hdr(skb)->saddr, AF_INET);
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and dif is set to it.
+ */
+ l3index = sdif ? dif : 0;
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
if (!key)
goto out;
-
- genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
- if (genhash || memcmp(hash_location, newhash, 16) != 0)
+ tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
+ if (memcmp(md5_hash_location, newhash, 16) != 0)
goto out;
-
}
if (key) {
@@ -745,6 +858,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
ip_hdr(skb)->daddr, &rep.th);
}
#endif
+ /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
+ if (rep.opt[0] == 0) {
+ __be32 mrst = mptcp_reset_option(skb);
+
+ if (mrst) {
+ rep.opt[0] = mrst;
+ arg.iov[0].iov_len += sizeof(mrst);
+ rep.th.doff = arg.iov[0].iov_len / 4;
+ }
+ }
+
arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr, /* XXX */
arg.iov[0].iov_len, IPPROTO_TCP, 0);
@@ -755,30 +879,46 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
* routing might fail in this case. No choice here, if we choose to force
* input interface, we will misroute in case of asymmetric route.
*/
- if (sk) {
+ if (sk)
arg.bound_dev_if = sk->sk_bound_dev_if;
- if (sk_fullsock(sk))
- trace_tcp_send_reset(sk, skb);
- }
+
+ trace_tcp_send_reset(sk, skb, reason);
BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
offsetof(struct inet_timewait_sock, tw_bound_dev_if));
- arg.tos = ip_hdr(skb)->tos;
+ /* ECN bits of TW reset are cleared */
+ arg.tos = ip_hdr(skb)->tos & ~INET_ECN_MASK;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
- if (sk)
+ local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
+ ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
+
+ sock_net_set(ctl_sk, net);
+ if (sk) {
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_mark : sk->sk_mark;
- ip_send_unicast_reply(ctl_sk,
+ inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
+ ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
+ transmit_time = tcp_transmit_time(sk);
+ xfrm_sk_clone_policy(ctl_sk, sk);
+ txhash = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_txhash : sk->sk_txhash;
+ } else {
+ ctl_sk->sk_mark = 0;
+ ctl_sk->sk_priority = 0;
+ }
+ ip_send_unicast_reply(ctl_sk, sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
- &arg, arg.iov[0].iov_len);
+ &arg, arg.iov[0].iov_len,
+ transmit_time, txhash);
- ctl_sk->sk_mark = 0;
+ xfrm_sk_free_policy(ctl_sk);
+ sock_net_set(ctl_sk, &init_net);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
+ local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
local_bh_enable();
#ifdef CONFIG_TCP_MD5SIG
@@ -794,21 +934,18 @@ out:
static void tcp_v4_send_ack(const struct sock *sk,
struct sk_buff *skb, u32 seq, u32 ack,
u32 win, u32 tsval, u32 tsecr, int oif,
- struct tcp_md5sig_key *key,
- int reply_flags, u8 tos)
+ struct tcp_key *key,
+ int reply_flags, u8 tos, u32 txhash)
{
const struct tcphdr *th = tcp_hdr(skb);
struct {
struct tcphdr th;
- __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
-#ifdef CONFIG_TCP_MD5SIG
- + (TCPOLEN_MD5SIG_ALIGNED >> 2)
-#endif
- ];
+ __be32 opt[(MAX_TCP_OPTION_SPACE >> 2)];
} rep;
struct net *net = sock_net(sk);
struct ip_reply_arg arg;
struct sock *ctl_sk;
+ u64 transmit_time;
memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg));
@@ -834,7 +971,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
rep.th.window = htons(win);
#ifdef CONFIG_TCP_MD5SIG
- if (key) {
+ if (tcp_key_is_md5(key)) {
int offset = (tsecr) ? 3 : 0;
rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
@@ -845,10 +982,28 @@ static void tcp_v4_send_ack(const struct sock *sk,
rep.th.doff = arg.iov[0].iov_len/4;
tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
- key, ip_hdr(skb)->saddr,
+ key->md5_key, ip_hdr(skb)->saddr,
ip_hdr(skb)->daddr, &rep.th);
}
#endif
+#ifdef CONFIG_TCP_AO
+ if (tcp_key_is_ao(key)) {
+ int offset = (tsecr) ? 3 : 0;
+
+ rep.opt[offset++] = htonl((TCPOPT_AO << 24) |
+ (tcp_ao_len(key->ao_key) << 16) |
+ (key->ao_key->sndid << 8) |
+ key->rcv_next);
+ arg.iov[0].iov_len += tcp_ao_len_aligned(key->ao_key);
+ rep.th.doff = arg.iov[0].iov_len / 4;
+
+ tcp_ao_hash_hdr(AF_INET, (char *)&rep.opt[offset],
+ key->ao_key, key->traffic_key,
+ (union tcp_ao_addr *)&ip_hdr(skb)->saddr,
+ (union tcp_ao_addr *)&ip_hdr(skb)->daddr,
+ &rep.th, key->sne);
+ }
+#endif
arg.flags = reply_flags;
arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
ip_hdr(skb)->saddr, /* XXX */
@@ -859,35 +1014,86 @@ static void tcp_v4_send_ack(const struct sock *sk,
arg.tos = tos;
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable();
- ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
- if (sk)
- ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_mark : sk->sk_mark;
- ip_send_unicast_reply(ctl_sk,
+ local_lock_nested_bh(&ipv4_tcp_sk.bh_lock);
+ ctl_sk = this_cpu_read(ipv4_tcp_sk.sock);
+ sock_net_set(ctl_sk, net);
+ ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
+ ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
+ inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
+ transmit_time = tcp_transmit_time(sk);
+ ip_send_unicast_reply(ctl_sk, sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
- &arg, arg.iov[0].iov_len);
+ &arg, arg.iov[0].iov_len,
+ transmit_time, txhash);
- ctl_sk->sk_mark = 0;
+ sock_net_set(ctl_sk, &init_net);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
+ local_unlock_nested_bh(&ipv4_tcp_sk.bh_lock);
local_bh_enable();
}
-static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
+static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb,
+ enum tcp_tw_status tw_status)
{
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+ struct tcp_key key = {};
+ u8 tos = tw->tw_tos;
+
+ /* Clear the ECN bits only for TW ACKs of out-of-window (oow) data or
+ * PAWS rejects; keep the ECN bits of all other TW ACKs so that those
+ * ACKs are not placed in a different service queue (Classic rather than L4S).
+ */
+ if (tw_status == TCP_TW_ACK_OOW)
+ tos &= ~INET_ECN_MASK;
+
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_info *ao_info;
+
+ if (static_branch_unlikely(&tcp_ao_needed.key)) {
+ /* FIXME: the segment to-be-acked is not verified yet */
+ ao_info = rcu_dereference(tcptw->ao_info);
+ if (ao_info) {
+ const struct tcp_ao_hdr *aoh;
+
+ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) {
+ inet_twsk_put(tw);
+ return;
+ }
+
+ if (aoh)
+ key.ao_key = tcp_ao_established_key(sk, ao_info,
+ aoh->rnext_keyid, -1);
+ }
+ }
+ if (key.ao_key) {
+ struct tcp_ao_key *rnext_key;
+
+ key.traffic_key = snd_other_key(key.ao_key);
+ key.sne = READ_ONCE(ao_info->snd_sne);
+ rnext_key = READ_ONCE(ao_info->rnext_key);
+ key.rcv_next = rnext_key->rcvid;
+ key.type = TCP_KEY_AO;
+#else
+ if (0) {
+#endif
+ } else if (static_branch_tcp_md5()) {
+ key.md5_key = tcp_twsk_md5_key(tcptw);
+ if (key.md5_key)
+ key.type = TCP_KEY_MD5;
+ }
tcp_v4_send_ack(sk, skb,
- tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ tcptw->tw_snd_nxt, READ_ONCE(tcptw->tw_rcv_nxt),
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
- tcp_time_stamp_raw() + tcptw->tw_ts_offset,
- tcptw->tw_ts_recent,
- tw->tw_bound_dev_if,
- tcp_twsk_md5_key(tcptw),
+ tcp_tw_tsval(tcptw),
+ READ_ONCE(tcptw->tw_ts_recent),
+ tw->tw_bound_dev_if, &key,
tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
- tw->tw_tos
- );
+ tos,
+ tw->tw_txhash);
inet_twsk_put(tw);
}
@@ -895,27 +1101,79 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
+ struct tcp_key key = {};
+
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
*/
u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
tcp_sk(sk)->snd_nxt;
- /* RFC 7323 2.3
- * The window field (SEG.WND) of every outgoing segment, with the
- * exception of <SYN> segments, MUST be right-shifted by
- * Rcv.Wind.Shift bits:
- */
+#ifdef CONFIG_TCP_AO
+ if (static_branch_unlikely(&tcp_ao_needed.key) &&
+ tcp_rsk_used_ao(req)) {
+ const union tcp_md5_addr *addr;
+ const struct tcp_ao_hdr *aoh;
+ int l3index;
+
+ /* Invalid TCP option size or twice included auth */
+ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+ return;
+ if (!aoh)
+ return;
+
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+ key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET,
+ aoh->rnext_keyid, -1);
+ if (unlikely(!key.ao_key)) {
+ /* Send ACK with any matching MKT for the peer */
+ key.ao_key = tcp_ao_do_lookup(sk, l3index, addr, AF_INET, -1, -1);
+ /* Matching key disappeared (user removed the key?);
+ * let the handshake time out.
+ */
+ if (!key.ao_key) {
+ net_info_ratelimited("TCP-AO key for (%pI4, %d)->(%pI4, %d) suddenly disappeared, won't ACK new connection\n",
+ addr,
+ ntohs(tcp_hdr(skb)->source),
+ &ip_hdr(skb)->daddr,
+ ntohs(tcp_hdr(skb)->dest));
+ return;
+ }
+ }
+ key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
+ if (!key.traffic_key)
+ return;
+
+ key.type = TCP_KEY_AO;
+ key.rcv_next = aoh->keyid;
+ tcp_v4_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
+#else
+ if (0) {
+#endif
+ } else if (static_branch_tcp_md5()) {
+ const union tcp_md5_addr *addr;
+ int l3index;
+
+ addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
+ l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
+ key.md5_key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
+ if (key.md5_key)
+ key.type = TCP_KEY_MD5;
+ }
+
+ /* Clear the ECN bits of this ACK, as is done for TW ACKs of oow data or PAWS rejects */
tcp_v4_send_ack(sk, skb, seq,
tcp_rsk(req)->rcv_nxt,
- req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
- tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
+ tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
+ tcp_rsk_tsval(tcp_rsk(req)),
req->ts_recent,
- 0,
- tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
- AF_INET),
+ 0, &key,
inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
- ip_hdr(skb)->tos);
+ ip_hdr(skb)->tos & ~INET_ECN_MASK,
+ READ_ONCE(tcp_rsk(req)->txhash));
+ if (tcp_key_is_ao(&key))
+ kfree(key.traffic_key);
}
/*
@@ -927,26 +1185,40 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
- const struct inet_request_sock *ireq = inet_rsk(req);
+ struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
int err = -1;
struct sk_buff *skb;
+ u8 tos;
/* First, grab a route. */
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req, foc, synack_type);
+ skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
+ tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
+ tos = READ_ONCE(inet_sk(sk)->tos);
+
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
+ tos = (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
+ (tos & INET_ECN_MASK);
+
+ if (!INET_ECN_is_capable(tos) &&
+ tcp_bpf_ca_needs_ecn((struct sock *)req))
+ tos |= INET_ECN_ECT_0;
+
rcu_read_lock();
err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
ireq->ir_rmt_addr,
- rcu_dereference(ireq->ireq_opt));
+ rcu_dereference(ireq->ireq_opt),
+ tos);
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -969,10 +1241,27 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
* We need to maintain these in the sk structure.
*/
+DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
+EXPORT_IPV6_MOD(tcp_md5_needed);
+
+static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
+{
+ if (!old)
+ return true;
+
+ /* l3index always overrides non-l3index */
+ if (old->l3index && new->l3index == 0)
+ return false;
+ if (old->l3index == 0 && new->l3index)
+ return true;
+
+ return old->prefixlen < new->prefixlen;
+}
+
/* Find the Key structure for an address. */
-struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
- const union tcp_md5_addr *addr,
- int family)
+struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
+ const union tcp_md5_addr *addr,
+ int family, bool any_l3index)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
@@ -987,10 +1276,13 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
if (!md5sig)
return NULL;
- hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+ hlist_for_each_entry_rcu(key, &md5sig->head, node,
+ lockdep_sock_is_held(sk)) {
if (key->family != family)
continue;
-
+ if (!any_l3index && key->flags & TCP_MD5SIG_FLAG_IFINDEX &&
+ key->l3index != l3index)
+ continue;
if (family == AF_INET) {
mask = inet_make_mask(key->prefixlen);
match = (key->addr.a4.s_addr & mask) ==
@@ -1004,17 +1296,17 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(const struct sock *sk,
match = false;
}
- if (match && (!best_match ||
- key->prefixlen > best_match->prefixlen))
+ if (match && better_md5_match(best_match, key))
best_match = key;
}
return best_match;
}
-EXPORT_SYMBOL(tcp_md5_do_lookup);
+EXPORT_IPV6_MOD(__tcp_md5_do_lookup);
static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
const union tcp_md5_addr *addr,
- int family, u8 prefixlen)
+ int family, u8 prefixlen,
+ int l3index, u8 flags)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
@@ -1030,9 +1322,14 @@ static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
if (family == AF_INET6)
size = sizeof(struct in6_addr);
#endif
- hlist_for_each_entry_rcu(key, &md5sig->head, node) {
+ hlist_for_each_entry_rcu(key, &md5sig->head, node,
+ lockdep_sock_is_held(sk)) {
if (key->family != family)
continue;
+ if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
+ continue;
+ if (key->l3index != l3index)
+ continue;
if (!memcmp(&key->addr, addr, size) &&
key->prefixlen == prefixlen)
return key;
@@ -1044,68 +1341,144 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
const struct sock *addr_sk)
{
const union tcp_md5_addr *addr;
+ int l3index;
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+ addr_sk->sk_bound_dev_if);
addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
- return tcp_md5_do_lookup(sk, addr, AF_INET);
+ return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
+}
+EXPORT_IPV6_MOD(tcp_v4_md5_lookup);
+
+static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_info *md5sig;
+
+ md5sig = kmalloc(sizeof(*md5sig), gfp);
+ if (!md5sig)
+ return -ENOMEM;
+
+ sk_gso_disable(sk);
+ INIT_HLIST_HEAD(&md5sig->head);
+ rcu_assign_pointer(tp->md5sig_info, md5sig);
+ return 0;
}
-EXPORT_SYMBOL(tcp_v4_md5_lookup);
/* This can be called on a newly created socket, from other files */
-int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
- int family, u8 prefixlen, const u8 *newkey, u8 newkeylen,
- gfp_t gfp)
+static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
+ int family, u8 prefixlen, int l3index, u8 flags,
+ const u8 *newkey, u8 newkeylen, gfp_t gfp)
{
/* Add Key to the list */
struct tcp_md5sig_key *key;
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_info *md5sig;
- key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
if (key) {
- /* Pre-existing entry - just update that one. */
- memcpy(key->key, newkey, newkeylen);
- key->keylen = newkeylen;
+ /* Pre-existing entry - just update that one.
+ * Note that the key might be used concurrently.
+ * data_race() tells KCSAN that we do not care about
+ * key mismatches, since changing MD5 key on live flows
+ * can lead to packet drops.
+ */
+ data_race(memcpy(key->key, newkey, newkeylen));
+
+ /* Pairs with READ_ONCE() in tcp_md5_hash_key().
+ * Also note that a reader could catch new key->keylen value
+ * but old key->key[], this is the reason we use __GFP_ZERO
+ * at sock_kmalloc() time below these lines.
+ */
+ WRITE_ONCE(key->keylen, newkeylen);
+
return 0;
}
md5sig = rcu_dereference_protected(tp->md5sig_info,
lockdep_sock_is_held(sk));
- if (!md5sig) {
- md5sig = kmalloc(sizeof(*md5sig), gfp);
- if (!md5sig)
- return -ENOMEM;
-
- sk_nocaps_add(sk, NETIF_F_GSO_MASK);
- INIT_HLIST_HEAD(&md5sig->head);
- rcu_assign_pointer(tp->md5sig_info, md5sig);
- }
- key = sock_kmalloc(sk, sizeof(*key), gfp);
+ key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
if (!key)
return -ENOMEM;
- if (!tcp_alloc_md5sig_pool()) {
- sock_kfree_s(sk, key, sizeof(*key));
- return -ENOMEM;
- }
memcpy(key->key, newkey, newkeylen);
key->keylen = newkeylen;
key->family = family;
key->prefixlen = prefixlen;
+ key->l3index = l3index;
+ key->flags = flags;
memcpy(&key->addr, addr,
- (family == AF_INET6) ? sizeof(struct in6_addr) :
- sizeof(struct in_addr));
+ (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
+ sizeof(struct in_addr));
hlist_add_head_rcu(&key->node, &md5sig->head);
return 0;
}
-EXPORT_SYMBOL(tcp_md5_do_add);
+
+int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
+ int family, u8 prefixlen, int l3index, u8 flags,
+ const u8 *newkey, u8 newkeylen)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
+ if (fips_enabled) {
+ pr_warn_once("TCP-MD5 support is disabled due to FIPS\n");
+ return -EOPNOTSUPP;
+ }
+
+ if (tcp_md5sig_info_add(sk, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (!static_branch_inc(&tcp_md5_needed.key)) {
+ struct tcp_md5sig_info *md5sig;
+
+ md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
+ rcu_assign_pointer(tp->md5sig_info, NULL);
+ kfree_rcu(md5sig, rcu);
+ return -EUSERS;
+ }
+ }
+
+ return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
+ newkey, newkeylen, GFP_KERNEL);
+}
+EXPORT_IPV6_MOD(tcp_md5_do_add);
+
+int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
+ int family, u8 prefixlen, int l3index,
+ struct tcp_md5sig_key *key)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
+
+ if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
+ return -ENOMEM;
+
+ if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
+ struct tcp_md5sig_info *md5sig;
+
+ md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
+ net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
+ rcu_assign_pointer(tp->md5sig_info, NULL);
+ kfree_rcu(md5sig, rcu);
+ return -EUSERS;
+ }
+ }
+
+ return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
+ key->flags, key->key, key->keylen,
+ sk_gfp_mask(sk, GFP_ATOMIC));
+}
+EXPORT_IPV6_MOD(tcp_md5_key_copy);
int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
- u8 prefixlen)
+ u8 prefixlen, int l3index, u8 flags)
{
struct tcp_md5sig_key *key;
- key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen);
+ key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
if (!key)
return -ENOENT;
hlist_del_rcu(&key->node);
@@ -1113,9 +1486,9 @@ int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
kfree_rcu(key, rcu);
return 0;
}
-EXPORT_SYMBOL(tcp_md5_do_del);
+EXPORT_IPV6_MOD(tcp_md5_do_del);
-static void tcp_clear_md5_list(struct sock *sk)
+void tcp_clear_md5_list(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_md5sig_key *key;
@@ -1125,28 +1498,35 @@ static void tcp_clear_md5_list(struct sock *sk)
md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
- hlist_del_rcu(&key->node);
+ hlist_del(&key->node);
atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
- kfree_rcu(key, rcu);
+ kfree(key);
}
}
static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
- char __user *optval, int optlen)
+ sockptr_t optval, int optlen)
{
struct tcp_md5sig cmd;
struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
+ const union tcp_md5_addr *addr;
u8 prefixlen = 32;
+ int l3index = 0;
+ bool l3flag;
+ u8 flags;
if (optlen < sizeof(cmd))
return -EINVAL;
- if (copy_from_user(&cmd, optval, sizeof(cmd)))
+ if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
return -EFAULT;
if (sin->sin_family != AF_INET)
return -EINVAL;
+ flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
+ l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
+
if (optname == TCP_MD5SIG_EXT &&
cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
prefixlen = cmd.tcpm_prefixlen;
@@ -1154,82 +1534,80 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
return -EINVAL;
}
+ if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
+ cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
+ if (dev && netif_is_l3_master(dev))
+ l3index = dev->ifindex;
+
+ rcu_read_unlock();
+
+ /* ok to reference set/not set outside of rcu;
+ * right now device MUST be an L3 master
+ */
+ if (!dev || !l3index)
+ return -EINVAL;
+ }
+
+ addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
+
if (!cmd.tcpm_keylen)
- return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
- AF_INET, prefixlen);
+ return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
return -EINVAL;
- return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
- AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen,
- GFP_KERNEL);
+ /* Don't allow keys for peers that have a matching TCP-AO key.
+ * See the comment in tcp_ao_add_cmd()
+ */
+ if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false))
+ return -EKEYREJECTED;
+
+ return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
+ cmd.tcpm_key, cmd.tcpm_keylen);
}
-static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
- __be32 daddr, __be32 saddr,
- const struct tcphdr *th, int nbytes)
+static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx,
+ __be32 daddr, __be32 saddr,
+ const struct tcphdr *th, int nbytes)
{
- struct tcp4_pseudohdr *bp;
- struct scatterlist sg;
- struct tcphdr *_th;
-
- bp = hp->scratch;
- bp->saddr = saddr;
- bp->daddr = daddr;
- bp->pad = 0;
- bp->protocol = IPPROTO_TCP;
- bp->len = cpu_to_be16(nbytes);
-
- _th = (struct tcphdr *)(bp + 1);
- memcpy(_th, th, sizeof(*th));
- _th->check = 0;
+ struct {
+ struct tcp4_pseudohdr ip;
+ struct tcphdr tcp;
+ } h;
- sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
- ahash_request_set_crypt(hp->md5_req, &sg, NULL,
- sizeof(*bp) + sizeof(*th));
- return crypto_ahash_update(hp->md5_req);
+ h.ip.saddr = saddr;
+ h.ip.daddr = daddr;
+ h.ip.pad = 0;
+ h.ip.protocol = IPPROTO_TCP;
+ h.ip.len = cpu_to_be16(nbytes);
+ h.tcp = *th;
+ h.tcp.check = 0;
+ md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
}
-static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
- __be32 daddr, __be32 saddr, const struct tcphdr *th)
+static noinline_for_stack void
+tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ __be32 daddr, __be32 saddr, const struct tcphdr *th)
{
- struct tcp_md5sig_pool *hp;
- struct ahash_request *req;
-
- hp = tcp_get_md5sig_pool();
- if (!hp)
- goto clear_hash_noput;
- req = hp->md5_req;
-
- if (crypto_ahash_init(req))
- goto clear_hash;
- if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
- goto clear_hash;
- if (tcp_md5_hash_key(hp, key))
- goto clear_hash;
- ahash_request_set_crypt(req, NULL, md5_hash, 0);
- if (crypto_ahash_final(req))
- goto clear_hash;
-
- tcp_put_md5sig_pool();
- return 0;
+ struct md5_ctx ctx;
-clear_hash:
- tcp_put_md5sig_pool();
-clear_hash_noput:
- memset(md5_hash, 0, 16);
- return 1;
+ md5_init(&ctx);
+ tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
+ tcp_md5_hash_key(&ctx, key);
+ md5_final(&ctx, md5_hash);
}
-int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
- const struct sock *sk,
- const struct sk_buff *skb)
+noinline_for_stack void
+tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
+ const struct sock *sk, const struct sk_buff *skb)
{
- struct tcp_md5sig_pool *hp;
- struct ahash_request *req;
const struct tcphdr *th = tcp_hdr(skb);
__be32 saddr, daddr;
+ struct md5_ctx ctx;
if (sk) { /* valid for establish/request sockets */
saddr = sk->sk_rcv_saddr;
@@ -1240,96 +1618,16 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
daddr = iph->daddr;
}
- hp = tcp_get_md5sig_pool();
- if (!hp)
- goto clear_hash_noput;
- req = hp->md5_req;
-
- if (crypto_ahash_init(req))
- goto clear_hash;
-
- if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
- goto clear_hash;
- if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
- goto clear_hash;
- if (tcp_md5_hash_key(hp, key))
- goto clear_hash;
- ahash_request_set_crypt(req, NULL, md5_hash, 0);
- if (crypto_ahash_final(req))
- goto clear_hash;
-
- tcp_put_md5sig_pool();
- return 0;
-
-clear_hash:
- tcp_put_md5sig_pool();
-clear_hash_noput:
- memset(md5_hash, 0, 16);
- return 1;
+ md5_init(&ctx);
+ tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
+ tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
+ tcp_md5_hash_key(&ctx, key);
+ md5_final(&ctx, md5_hash);
}
-EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
+EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb);
#endif
-/* Called with rcu_read_lock() */
-static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
- const struct sk_buff *skb)
-{
-#ifdef CONFIG_TCP_MD5SIG
- /*
- * This gets called for each TCP segment that arrives
- * so we want to be efficient.
- * We have 3 drop cases:
- * o No MD5 hash and one expected.
- * o MD5 hash and we're not expecting one.
- * o MD5 hash and its wrong.
- */
- const __u8 *hash_location = NULL;
- struct tcp_md5sig_key *hash_expected;
- const struct iphdr *iph = ip_hdr(skb);
- const struct tcphdr *th = tcp_hdr(skb);
- int genhash;
- unsigned char newhash[16];
-
- hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
- AF_INET);
- hash_location = tcp_parse_md5sig_option(th);
-
- /* We've parsed the options - do we have a hash? */
- if (!hash_expected && !hash_location)
- return false;
-
- if (hash_expected && !hash_location) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
- return true;
- }
-
- if (!hash_expected && hash_location) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
- return true;
- }
-
- /* Okay, so this is hash_expected and hash_location -
- * so we need to calculate the checksum.
- */
- genhash = tcp_v4_md5_hash_skb(newhash,
- hash_expected,
- NULL, skb);
-
- if (genhash || memcmp(hash_location, newhash, 16) != 0) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
- net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
- &iph->saddr, ntohs(th->source),
- &iph->daddr, ntohs(th->dest),
- genhash ? " tcp_v4_calc_md5_hash failed"
- : "");
- return true;
- }
- return false;
-#endif
- return false;
-}
-
static void tcp_v4_init_req(struct request_sock *req,
const struct sock *sk_listener,
struct sk_buff *skb)
@@ -1343,29 +1641,38 @@ static void tcp_v4_init_req(struct request_sock *req,
}
static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
+ struct sk_buff *skb,
struct flowi *fl,
- const struct request_sock *req)
+ struct request_sock *req,
+ u32 tw_isn)
{
+ tcp_v4_init_req(req, sk, skb);
+
+ if (security_inet_conn_request(sk, skb, req))
+ return NULL;
+
return inet_csk_route_req(sk, &fl->u.ip4, req);
}
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.family = PF_INET,
.obj_size = sizeof(struct tcp_request_sock),
- .rtx_syn_ack = tcp_rtx_synack,
.send_ack = tcp_v4_reqsk_send_ack,
.destructor = tcp_v4_reqsk_destructor,
.send_reset = tcp_v4_send_reset,
- .syn_ack_timeout = tcp_syn_ack_timeout,
};
-static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
+const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.mss_clamp = TCP_MSS_DEFAULT,
#ifdef CONFIG_TCP_MD5SIG
.req_md5_lookup = tcp_v4_md5_lookup,
.calc_md5_hash = tcp_v4_md5_hash_skb,
#endif
- .init_req = tcp_v4_init_req,
+#ifdef CONFIG_TCP_AO
+ .ao_lookup = tcp_v4_ao_lookup_rsk,
+ .ao_calc_key = tcp_v4_ao_calc_key_rsk,
+ .ao_synack_hash = tcp_v4_ao_synack_hash,
+#endif
#ifdef CONFIG_SYN_COOKIES
.cookie_init_seq = cookie_v4_init_sequence,
#endif
@@ -1388,7 +1695,7 @@ drop:
tcp_listendrop(sk);
return 0;
}
-EXPORT_SYMBOL(tcp_v4_conn_request);
+EXPORT_IPV6_MOD(tcp_v4_conn_request);
/*
@@ -1402,11 +1709,14 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
bool *own_req)
{
struct inet_request_sock *ireq;
+ bool found_dup_sk = false;
struct inet_sock *newinet;
struct tcp_sock *newtp;
struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
+ const union tcp_md5_addr *addr;
struct tcp_md5sig_key *key;
+ int l3index;
#endif
struct ip_options_rcu *inet_opt;
@@ -1423,10 +1733,6 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
newtp = tcp_sk(newsk);
newinet = inet_sk(newsk);
ireq = inet_rsk(req);
- sk_daddr_set(newsk, ireq->ir_rmt_addr);
- sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
- newsk->sk_bound_dev_if = ireq->ir_iif;
- newinet->inet_saddr = ireq->ir_loc_addr;
inet_opt = rcu_dereference(ireq->ireq_opt);
RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
newinet->mc_index = inet_iif(skb);
@@ -1435,7 +1741,13 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (inet_opt)
inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
- newinet->inet_id = newtp->write_seq ^ jiffies;
+ atomic_set(&newinet->inet_id, get_random_u16());
+
+ /* Set ToS of the new socket based upon the value of incoming SYN.
+ * ECT bits are set later in tcp_init_transfer().
+ */
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
+ newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
if (!dst) {
dst = inet_csk_route_child_sock(sk, newsk, req);
@@ -1454,30 +1766,39 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
tcp_initialize_rcv_mss(newsk);
#ifdef CONFIG_TCP_MD5SIG
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
/* Copy over the MD5 key from the original socket */
- key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
- AF_INET);
- if (key) {
- /*
- * We're using one, so create a matching key
- * on the newsk structure. If we fail to get
- * memory, then we end up not copying the key
- * across. Shucks.
- */
- tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
- AF_INET, 32, key->key, key->keylen, GFP_ATOMIC);
- sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
+ addr = (union tcp_md5_addr *)&newinet->inet_daddr;
+ key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
+ if (key && !tcp_rsk_used_ao(req)) {
+ if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
+ goto put_and_exit;
+ sk_gso_disable(newsk);
}
#endif
+#ifdef CONFIG_TCP_AO
+ if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET))
+ goto put_and_exit; /* OOM, release back memory */
+#endif
if (__inet_inherit_port(sk, newsk) < 0)
goto put_and_exit;
- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
+ &found_dup_sk);
if (likely(*own_req)) {
tcp_move_syn(newtp, req);
ireq->ireq_opt = NULL;
} else {
newinet->inet_opt = NULL;
+
+ if (!req_unhash && found_dup_sk) {
+ /* This code path should be executed in the
+ * syncookie case only.
+ */
+ bh_unlock_sock(newsk);
+ sock_put(newsk);
+ newsk = NULL;
+ }
}
return newsk;
@@ -1494,7 +1815,7 @@ put_and_exit:
tcp_done(newsk);
goto exit;
}
-EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
+EXPORT_IPV6_MOD(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{
@@ -1507,6 +1828,23 @@ static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
return sk;
}
+u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
+ struct tcphdr *th, u32 *cookie)
+{
+ u16 mss = 0;
+#ifdef CONFIG_SYN_COOKIES
+ mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
+ &tcp_request_sock_ipv4_ops, sk, th);
+ if (mss) {
+ *cookie = __cookie_v4_init_sequence(iph, th, &mss);
+ tcp_synq_overflow(sk);
+ }
+#endif
+ return mss;
+}
+
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
+ u32));
/* The socket must have it's spinlock held when we get
* here, unless it is a TCP_LISTEN socket.
*
@@ -1517,18 +1855,27 @@ static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
*/
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason reason;
struct sock *rsk;
+ reason = psp_sk_rx_policy_check(sk, skb);
+ if (reason)
+ goto err_discard;
+
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
- struct dst_entry *dst = sk->sk_rx_dst;
+ struct dst_entry *dst;
+
+ dst = rcu_dereference_protected(sk->sk_rx_dst,
+ lockdep_sock_is_held(sk));
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
if (dst) {
- if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
- !dst->ops->check(dst, 0)) {
+ if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
+ !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
+ dst, 0)) {
+ RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
dst_release(dst);
- sk->sk_rx_dst = NULL;
}
}
tcp_rcv_established(sk, skb);
@@ -1542,9 +1889,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
struct sock *nsk = tcp_v4_cookie_check(sk, skb);
if (!nsk)
- goto discard;
+ return 0;
if (nsk != sk) {
- if (tcp_child_process(sk, nsk, skb)) {
+ reason = tcp_child_process(sk, nsk, skb);
+ if (reason) {
rsk = nsk;
goto reset;
}
@@ -1553,16 +1901,17 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
} else
sock_rps_save_rxhash(sk, skb);
- if (tcp_rcv_state_process(sk, skb)) {
+ reason = tcp_rcv_state_process(sk, skb);
+ if (reason) {
rsk = sk;
goto reset;
}
return 0;
reset:
- tcp_v4_send_reset(rsk, skb);
+ tcp_v4_send_reset(rsk, skb, sk_rst_convert_drop_reason(reason));
discard:
- kfree_skb(skb);
+ sk_skb_reason_drop(sk, skb, reason);
/* Be careful here. If this function gets more complicated and
* gcc suffers from register pressure on the x86, sk (in %ebx)
* might be destroyed here. This current version compiles correctly,
@@ -1571,7 +1920,10 @@ discard:
return 0;
csum_err:
+ reason = SKB_DROP_REASON_TCP_CSUM;
+ trace_tcp_bad_csum(skb);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+err_discard:
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
@@ -1579,6 +1931,7 @@ EXPORT_SYMBOL(tcp_v4_do_rcv);
int tcp_v4_early_demux(struct sk_buff *skb)
{
+ struct net *net = dev_net_rcu(skb->dev);
const struct iphdr *iph;
const struct tcphdr *th;
struct sock *sk;
@@ -1595,35 +1948,40 @@ int tcp_v4_early_demux(struct sk_buff *skb)
if (th->doff < sizeof(struct tcphdr) / 4)
return 0;
- sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
- iph->saddr, th->source,
+ sk = __inet_lookup_established(net, iph->saddr, th->source,
iph->daddr, ntohs(th->dest),
skb->skb_iif, inet_sdif(skb));
if (sk) {
skb->sk = sk;
skb->destructor = sock_edemux;
if (sk_fullsock(sk)) {
- struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
+ struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
if (dst)
dst = dst_check(dst, 0);
if (dst &&
- inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
+ sk->sk_rx_dst_ifindex == skb->skb_iif)
skb_dst_set_noref(skb, dst);
}
}
return 0;
}
-bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
+bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason *reason)
{
- u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
-
- /* Only socket owner can try to collapse/prune rx queues
- * to reduce memory overhead, so add a little headroom here.
- * Few sockets backlog are possibly concurrently non empty.
- */
- limit += 64*1024;
+ u32 tail_gso_size, tail_gso_segs;
+ struct skb_shared_info *shinfo;
+ const struct tcphdr *th;
+ struct tcphdr *thtail;
+ struct sk_buff *tail;
+ unsigned int hdrlen;
+ bool fragstolen;
+ u32 gso_segs;
+ u32 gso_size;
+ u64 limit;
+ int delta;
+ int err;
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
* we can fix skb->truesize to its real value to avoid future drops.
@@ -1633,29 +1991,132 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
*/
skb_condense(skb);
- if (unlikely(sk_add_backlog(sk, skb, limit))) {
+ tcp_cleanup_skb(skb);
+
+ if (unlikely(tcp_checksum_complete(skb))) {
+ bh_unlock_sock(sk);
+ trace_tcp_bad_csum(skb);
+ *reason = SKB_DROP_REASON_TCP_CSUM;
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+ __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
+ return true;
+ }
+
+ /* Attempt coalescing to last skb in backlog, even if we are
+ * above the limits.
+ * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
+ */
+ th = (const struct tcphdr *)skb->data;
+ hdrlen = th->doff * 4;
+
+ tail = sk->sk_backlog.tail;
+ if (!tail)
+ goto no_coalesce;
+ thtail = (struct tcphdr *)tail->data;
+
+ if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
+ TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
+ ((TCP_SKB_CB(tail)->tcp_flags |
+ TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
+ !((TCP_SKB_CB(tail)->tcp_flags &
+ TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
+ ((TCP_SKB_CB(tail)->tcp_flags ^
+ TCP_SKB_CB(skb)->tcp_flags) &
+ (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
+ !tcp_skb_can_collapse_rx(tail, skb) ||
+ thtail->doff != th->doff ||
+ memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
+ /* prior to PSP Rx policy check, retain exact PSP metadata */
+ psp_skb_coalesce_diff(tail, skb))
+ goto no_coalesce;
+
+ __skb_pull(skb, hdrlen);
+
+ shinfo = skb_shinfo(skb);
+ gso_size = shinfo->gso_size ?: skb->len;
+ gso_segs = shinfo->gso_segs ?: 1;
+
+ shinfo = skb_shinfo(tail);
+ tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
+ tail_gso_segs = shinfo->gso_segs ?: 1;
+
+ if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+ TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+ if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
+ TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
+ thtail->window = th->window;
+ }
+
+ /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
+ * thtail->fin, so that the fast path in tcp_rcv_established()
+ * is not entered if we append a packet with a FIN.
+ * SYN, RST, URG are not present.
+ * ACK is set on both packets.
+ * PSH : we do not really care in TCP stack,
+ * at least for 'GRO' packets.
+ */
+ thtail->fin |= th->fin;
+ TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
+
+ if (TCP_SKB_CB(skb)->has_rxtstamp) {
+ TCP_SKB_CB(tail)->has_rxtstamp = true;
+ tail->tstamp = skb->tstamp;
+ skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
+ }
+
+ /* Not as strict as GRO. We only need to carry mss max value */
+ shinfo->gso_size = max(gso_size, tail_gso_size);
+ shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
+
+ sk->sk_backlog.len += delta;
+ __NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPBACKLOGCOALESCE);
+ kfree_skb_partial(skb, fragstolen);
+ return false;
+ }
+ __skb_push(skb, hdrlen);
+
+no_coalesce:
+ /* sk->sk_backlog.len is reset only at the end of __release_sock().
+ * Both sk->sk_backlog.len and sk->sk_rmem_alloc could reach
+ * sk_rcvbuf in normal conditions.
+ */
+ limit = ((u64)READ_ONCE(sk->sk_rcvbuf)) << 1;
+
+ limit += ((u32)READ_ONCE(sk->sk_sndbuf)) >> 1;
+
+ /* Only socket owner can try to collapse/prune rx queues
+ * to reduce memory overhead, so add a little headroom here.
+ * Few sockets backlog are possibly concurrently non empty.
+ */
+ limit += 64 * 1024;
+
+ limit = min_t(u64, limit, UINT_MAX);
+
+ err = sk_add_backlog(sk, skb, limit);
+ if (unlikely(err)) {
bh_unlock_sock(sk);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+ if (err == -ENOMEM) {
+ *reason = SKB_DROP_REASON_PFMEMALLOC;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP);
+ } else {
+ *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
+ }
return true;
}
return false;
}
-EXPORT_SYMBOL(tcp_add_backlog);
+EXPORT_IPV6_MOD(tcp_add_backlog);
-int tcp_filter(struct sock *sk, struct sk_buff *skb)
+int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
{
struct tcphdr *th = (struct tcphdr *)skb->data;
- unsigned int eaten = skb->len;
- int err;
- err = sk_filter_trim_cap(sk, skb, th->doff * 4);
- if (!err) {
- eaten -= skb->len;
- TCP_SKB_CB(skb)->end_seq -= eaten;
- }
- return err;
+ return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
}
-EXPORT_SYMBOL(tcp_filter);
+EXPORT_IPV6_MOD(tcp_filter);
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
@@ -1677,8 +2138,7 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
skb->len - th->doff * 4);
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
- TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
- TCP_SKB_CB(skb)->tcp_tw_isn = 0;
+ TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
TCP_SKB_CB(skb)->sacked = 0;
TCP_SKB_CB(skb)->has_rxtstamp =
@@ -1691,14 +2151,19 @@ static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
int tcp_v4_rcv(struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
+ enum skb_drop_reason drop_reason;
+ enum tcp_tw_status tw_status;
int sdif = inet_sdif(skb);
+ int dif = inet_iif(skb);
const struct iphdr *iph;
const struct tcphdr *th;
+ struct sock *sk = NULL;
bool refcounted;
- struct sock *sk;
int ret;
+ u32 isn;
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
@@ -1710,8 +2175,10 @@ int tcp_v4_rcv(struct sk_buff *skb)
th = (const struct tcphdr *)skb->data;
- if (unlikely(th->doff < sizeof(struct tcphdr) / 4))
+ if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
+ drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
goto bad_packet;
+ }
if (!pskb_may_pull(skb, th->doff * 4))
goto discard_it;
@@ -1726,12 +2193,11 @@ int tcp_v4_rcv(struct sk_buff *skb)
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
lookup:
- sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
+ sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
th->dest, sdif, &refcounted);
if (!sk)
goto no_tcp_socket;
-process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
@@ -1741,8 +2207,14 @@ process:
struct sock *nsk;
sk = req->rsk_listener;
- if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
- sk_drops_add(sk, skb);
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
+ else
+ drop_reason = tcp_inbound_hash(sk, req, skb,
+ &iph->saddr, &iph->daddr,
+ AF_INET, dif, sdif);
+ if (unlikely(drop_reason)) {
+ sk_drops_skbadd(sk, skb);
reqsk_put(req);
goto discard_it;
}
@@ -1751,20 +2223,29 @@ process:
goto csum_error;
}
if (unlikely(sk->sk_state != TCP_LISTEN)) {
- inet_csk_reqsk_queue_drop_and_put(sk, req);
- goto lookup;
+ nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+ if (!nsk) {
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
+ goto lookup;
+ }
+ sk = nsk;
+ /* reuseport_migrate_sock() has already held one sk_refcnt
+ * before returning.
+ */
+ } else {
+ /* We own a reference on the listener, increase it again
+ * as we might lose it too soon.
+ */
+ sock_hold(sk);
}
- /* We own a reference on the listener, increase it again
- * as we might lose it too soon.
- */
- sock_hold(sk);
refcounted = true;
nsk = NULL;
- if (!tcp_filter(sk, skb)) {
+ if (!tcp_filter(sk, skb, &drop_reason)) {
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
tcp_v4_fill_cb(skb, iph, th);
- nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
+ nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
+ &drop_reason);
}
if (!nsk) {
reqsk_put(req);
@@ -1780,32 +2261,49 @@ process:
}
goto discard_and_relse;
}
+ nf_reset_ct(skb);
if (nsk == sk) {
reqsk_put(req);
tcp_v4_restore_cb(skb);
- } else if (tcp_child_process(sk, nsk, skb)) {
- tcp_v4_send_reset(nsk, skb);
- goto discard_and_relse;
} else {
+ drop_reason = tcp_child_process(sk, nsk, skb);
+ if (drop_reason) {
+ enum sk_rst_reason rst_reason;
+
+ rst_reason = sk_rst_convert_drop_reason(drop_reason);
+ tcp_v4_send_reset(nsk, skb, rst_reason);
+ goto discard_and_relse;
+ }
sock_put(sk);
return 0;
}
}
- if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
- __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
- goto discard_and_relse;
+
+process:
+ if (static_branch_unlikely(&ip4_min_ttl)) {
+ /* min_ttl can be changed concurrently from do_ip_setsockopt() */
+ if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+ drop_reason = SKB_DROP_REASON_TCP_MINTTL;
+ goto discard_and_relse;
+ }
}
- if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
goto discard_and_relse;
+ }
- if (tcp_v4_inbound_md5_hash(sk, skb))
+ drop_reason = tcp_inbound_hash(sk, NULL, skb, &iph->saddr, &iph->daddr,
+ AF_INET, dif, sdif);
+ if (drop_reason)
goto discard_and_relse;
- nf_reset(skb);
+ nf_reset_ct(skb);
- if (tcp_filter(sk, skb))
+ if (tcp_filter(sk, skb, &drop_reason))
goto discard_and_relse;
+
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
tcp_v4_fill_cb(skb, iph, th);
@@ -1824,8 +2322,9 @@ process:
ret = 0;
if (!sock_owned_by_user(sk)) {
ret = tcp_v4_do_rcv(sk, skb);
- } else if (tcp_add_backlog(sk, skb)) {
- goto discard_and_relse;
+ } else {
+ if (tcp_add_backlog(sk, skb, &drop_reason))
+ goto discard_and_relse;
}
bh_unlock_sock(sk);
@@ -1836,6 +2335,7 @@ put_and_return:
return ret;
no_tcp_socket:
+ drop_reason = SKB_DROP_REASON_NO_SOCKET;
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
@@ -1843,26 +2343,30 @@ no_tcp_socket:
if (tcp_checksum_complete(skb)) {
csum_error:
+ drop_reason = SKB_DROP_REASON_TCP_CSUM;
+ trace_tcp_bad_csum(skb);
__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
- tcp_v4_send_reset(NULL, skb);
+ tcp_v4_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
}
discard_it:
+ SKB_DR_OR(drop_reason, NOT_SPECIFIED);
/* Discard frame. */
- kfree_skb(skb);
+ sk_skb_reason_drop(sk, skb, drop_reason);
return 0;
discard_and_relse:
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
if (refcounted)
sock_put(sk);
goto discard_it;
do_time_wait:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
inet_twsk_put(inet_twsk(sk));
goto discard_it;
}
@@ -1873,11 +2377,12 @@ do_time_wait:
inet_twsk_put(inet_twsk(sk));
goto csum_error;
}
- switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
+
+ tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
+ &drop_reason);
+ switch (tw_status) {
case TCP_TW_SYN: {
- struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
- &tcp_hashinfo, skb,
- __tcp_hdrlen(th),
+ struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
iph->saddr, th->source,
iph->daddr, th->dest,
inet_iif(skb),
@@ -1887,16 +2392,22 @@ do_time_wait:
sk = sk2;
tcp_v4_restore_cb(skb);
refcounted = false;
+ __this_cpu_write(tcp_tw_isn, isn);
goto process;
}
+
+ drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
+ if (drop_reason)
+ break;
}
/* to ACK */
- /* fall through */
+ fallthrough;
case TCP_TW_ACK:
- tcp_v4_timewait_ack(sk, skb);
+ case TCP_TW_ACK_OOW:
+ tcp_v4_timewait_ack(sk, skb, tw_status);
break;
case TCP_TW_RST:
- tcp_v4_send_reset(sk, skb);
+ tcp_v4_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:;
@@ -1906,8 +2417,6 @@ do_time_wait:
static struct timewait_sock_ops tcp_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct tcp_timewait_sock),
- .twsk_unique = tcp_twsk_unique,
- .twsk_destructor= tcp_twsk_destructor,
};
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
@@ -1915,11 +2424,11 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
struct dst_entry *dst = skb_dst(skb);
if (dst && dst_hold_safe(dst)) {
- sk->sk_rx_dst = dst;
- inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
+ rcu_assign_pointer(sk->sk_rx_dst, dst);
+ sk->sk_rx_dst_ifindex = skb->skb_iif;
}
}
-EXPORT_SYMBOL(inet_sk_rx_dst_set);
+EXPORT_IPV6_MOD(inet_sk_rx_dst_set);
const struct inet_connection_sock_af_ops ipv4_specific = {
.queue_xmit = ip_queue_xmit,
@@ -1931,22 +2440,31 @@ const struct inet_connection_sock_af_ops ipv4_specific = {
.net_header_len = sizeof(struct iphdr),
.setsockopt = ip_setsockopt,
.getsockopt = ip_getsockopt,
- .addr2sockaddr = inet_csk_addr2sockaddr,
- .sockaddr_len = sizeof(struct sockaddr_in),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ip_setsockopt,
- .compat_getsockopt = compat_ip_getsockopt,
-#endif
.mtu_reduced = tcp_v4_mtu_reduced,
};
-EXPORT_SYMBOL(ipv4_specific);
+EXPORT_IPV6_MOD(ipv4_specific);
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
+#ifdef CONFIG_TCP_MD5SIG
.md5_lookup = tcp_v4_md5_lookup,
.calc_md5_hash = tcp_v4_md5_hash_skb,
.md5_parse = tcp_v4_parse_md5_keys,
+#endif
+#ifdef CONFIG_TCP_AO
+ .ao_lookup = tcp_v4_ao_lookup,
+ .calc_ao_hash = tcp_v4_ao_hash_skb,
+ .ao_parse = tcp_v4_parse_ao,
+ .ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
+#endif
};
+
+static void tcp4_destruct_sock(struct sock *sk)
+{
+ tcp_md5_destruct_sock(sk);
+ tcp_ao_destroy_sock(sk, false);
+ inet_sock_destruct(sk);
+}
#endif
/* NOTE: A lot of things set to zero explicitly by call to
@@ -1960,17 +2478,33 @@ static int tcp_v4_init_sock(struct sock *sk)
icsk->icsk_af_ops = &ipv4_specific;
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
+ sk->sk_destruct = tcp4_destruct_sock;
#endif
return 0;
}
+static void tcp_release_user_frags(struct sock *sk)
+{
+#ifdef CONFIG_PAGE_POOL
+ unsigned long index;
+ void *netmem;
+
+ xa_for_each(&sk->sk_user_frags, index, netmem)
+ WARN_ON_ONCE(!napi_pp_put_page((__force netmem_ref)netmem));
+#endif
+}
+
void tcp_v4_destroy_sock(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ tcp_release_user_frags(sk);
+
+ xa_destroy(&sk->sk_user_frags);
+
trace_tcp_destroy_sock(sk);
tcp_clear_xmit_timers(sk);
@@ -1988,20 +2522,11 @@ void tcp_v4_destroy_sock(struct sock *sk)
/* Cleans up our, hopefully empty, out_of_order_queue. */
skb_rbtree_purge(&tp->out_of_order_queue);
-#ifdef CONFIG_TCP_MD5SIG
- /* Clean up the MD5 key list, if any */
- if (tp->md5sig_info) {
- tcp_clear_md5_list(sk);
- kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
- tp->md5sig_info = NULL;
- }
-#endif
-
/* Clean up a referenced TCP bind bucket. */
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
- BUG_ON(tp->fastopen_rsk);
+ BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
/* If socket is aborted during connect operation */
tcp_free_fastopen_req(tp);
@@ -2010,49 +2535,78 @@ void tcp_v4_destroy_sock(struct sock *sk)
sk_sockets_allocated_dec(sk);
}
-EXPORT_SYMBOL(tcp_v4_destroy_sock);
+EXPORT_IPV6_MOD(tcp_v4_destroy_sock);
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCP sock list dumping. */
-/*
- * Get next listener socket follow cur. If cur is NULL, get first socket
- * starting from bucket given in st->bucket; when st->bucket is zero the
- * very first socket in the hash table is returned.
+static unsigned short seq_file_family(const struct seq_file *seq);
+
+static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
+{
+ unsigned short family = seq_file_family(seq);
+
+ /* AF_UNSPEC is used as a match all */
+ return ((family == AF_UNSPEC || family == sk->sk_family) &&
+ net_eq(sock_net(sk), seq_file_net(seq)));
+}
+
+/* Find a non empty bucket (starting from st->bucket)
+ * and return the first sk from it.
+ */
+static void *listening_get_first(struct seq_file *seq)
+{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ struct tcp_iter_state *st = seq->private;
+
+ st->offset = 0;
+ for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
+ struct inet_listen_hashbucket *ilb2;
+ struct hlist_nulls_node *node;
+ struct sock *sk;
+
+ ilb2 = &hinfo->lhash2[st->bucket];
+ if (hlist_nulls_empty(&ilb2->nulls_head))
+ continue;
+
+ spin_lock(&ilb2->lock);
+ sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
+ if (seq_sk_match(seq, sk))
+ return sk;
+ }
+ spin_unlock(&ilb2->lock);
+ }
+
+ return NULL;
+}
+
+/* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
+ * If "cur" is the last one in the st->bucket,
+ * call listening_get_first() to return the first sk of the next
+ * non empty bucket.
*/
static void *listening_get_next(struct seq_file *seq, void *cur)
{
- struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
struct tcp_iter_state *st = seq->private;
- struct net *net = seq_file_net(seq);
- struct inet_listen_hashbucket *ilb;
+ struct inet_listen_hashbucket *ilb2;
+ struct hlist_nulls_node *node;
+ struct inet_hashinfo *hinfo;
struct sock *sk = cur;
- if (!sk) {
-get_head:
- ilb = &tcp_hashinfo.listening_hash[st->bucket];
- spin_lock(&ilb->lock);
- sk = sk_head(&ilb->head);
- st->offset = 0;
- goto get_sk;
- }
- ilb = &tcp_hashinfo.listening_hash[st->bucket];
++st->num;
++st->offset;
- sk = sk_next(sk);
-get_sk:
- sk_for_each_from(sk) {
- if (!net_eq(sock_net(sk), net))
- continue;
- if (sk->sk_family == afinfo->family)
+ sk = sk_nulls_next(sk);
+ sk_nulls_for_each_from(sk, node) {
+ if (seq_sk_match(seq, sk))
return sk;
}
- spin_unlock(&ilb->lock);
- st->offset = 0;
- if (++st->bucket < INET_LHTABLE_SIZE)
- goto get_head;
- return NULL;
+
+ hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ ilb2 = &hinfo->lhash2[st->bucket];
+ spin_unlock(&ilb2->lock);
+ ++st->bucket;
+ return listening_get_first(seq);
}
static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
@@ -2062,7 +2616,7 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
st->bucket = 0;
st->offset = 0;
- rc = listening_get_next(seq, NULL);
+ rc = listening_get_first(seq);
while (rc && *pos) {
rc = listening_get_next(seq, rc);
@@ -2071,9 +2625,10 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
return rc;
}
-static inline bool empty_bucket(const struct tcp_iter_state *st)
+static inline bool empty_bucket(struct inet_hashinfo *hinfo,
+ const struct tcp_iter_state *st)
{
- return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
+ return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
}
/*
@@ -2082,43 +2637,38 @@ static inline bool empty_bucket(const struct tcp_iter_state *st)
*/
static void *established_get_first(struct seq_file *seq)
{
- struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct tcp_iter_state *st = seq->private;
- struct net *net = seq_file_net(seq);
- void *rc = NULL;
st->offset = 0;
- for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
+ for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
struct sock *sk;
struct hlist_nulls_node *node;
- spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
+ spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
+
+ cond_resched();
/* Lockless fast path for the common case of empty buckets */
- if (empty_bucket(st))
+ if (empty_bucket(hinfo, st))
continue;
spin_lock_bh(lock);
- sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
- if (sk->sk_family != afinfo->family ||
- !net_eq(sock_net(sk), net)) {
- continue;
- }
- rc = sk;
- goto out;
+ sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
+ if (seq_sk_match(seq, sk))
+ return sk;
}
spin_unlock_bh(lock);
}
-out:
- return rc;
+
+ return NULL;
}
static void *established_get_next(struct seq_file *seq, void *cur)
{
- struct tcp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
- struct sock *sk = cur;
- struct hlist_nulls_node *node;
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct tcp_iter_state *st = seq->private;
- struct net *net = seq_file_net(seq);
+ struct hlist_nulls_node *node;
+ struct sock *sk = cur;
++st->num;
++st->offset;
@@ -2126,12 +2676,11 @@ static void *established_get_next(struct seq_file *seq, void *cur)
sk = sk_nulls_next(sk);
sk_nulls_for_each_from(sk, node) {
- if (sk->sk_family == afinfo->family &&
- net_eq(sock_net(sk), net))
+ if (seq_sk_match(seq, sk))
return sk;
}
- spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+ spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
++st->bucket;
return established_get_first(seq);
}
@@ -2169,29 +2718,30 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
static void *tcp_seek_last_pos(struct seq_file *seq)
{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct tcp_iter_state *st = seq->private;
+ int bucket = st->bucket;
int offset = st->offset;
int orig_num = st->num;
void *rc = NULL;
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
- if (st->bucket >= INET_LHTABLE_SIZE)
+ if (st->bucket > hinfo->lhash2_mask)
break;
- st->state = TCP_SEQ_STATE_LISTENING;
- rc = listening_get_next(seq, NULL);
- while (offset-- && rc)
+ rc = listening_get_first(seq);
+ while (offset-- && rc && bucket == st->bucket)
rc = listening_get_next(seq, rc);
if (rc)
break;
st->bucket = 0;
st->state = TCP_SEQ_STATE_ESTABLISHED;
- /* Fallthrough */
+ fallthrough;
case TCP_SEQ_STATE_ESTABLISHED:
- if (st->bucket > tcp_hashinfo.ehash_mask)
+ if (st->bucket > hinfo->ehash_mask)
break;
rc = established_get_first(seq);
- while (offset-- && rc)
+ while (offset-- && rc && bucket == st->bucket)
rc = established_get_next(seq, rc);
}
@@ -2221,7 +2771,7 @@ out:
st->last_pos = *pos;
return rc;
}
-EXPORT_SYMBOL(tcp_seq_start);
+EXPORT_IPV6_MOD(tcp_seq_start);
void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
@@ -2252,24 +2802,25 @@ out:
st->last_pos = *pos;
return rc;
}
-EXPORT_SYMBOL(tcp_seq_next);
+EXPORT_IPV6_MOD(tcp_seq_next);
void tcp_seq_stop(struct seq_file *seq, void *v)
{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
struct tcp_iter_state *st = seq->private;
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN)
- spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
+ spin_unlock(&hinfo->lhash2[st->bucket].lock);
break;
case TCP_SEQ_STATE_ESTABLISHED:
if (v)
- spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
+ spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
break;
}
}
-EXPORT_SYMBOL(tcp_seq_stop);
+EXPORT_IPV6_MOD(tcp_seq_stop);
static void get_openreq4(const struct request_sock *req,
struct seq_file *f, int i)
@@ -2290,7 +2841,7 @@ static void get_openreq4(const struct request_sock *req,
jiffies_delta_to_clock_t(delta),
req->num_timeout,
from_kuid_munged(seq_user_ns(f),
- sock_i_uid(req->rsk_listener)),
+ sk_uid(req->rsk_listener)),
0, /* non standard timer */
0, /* open_requests have no inode */
0,
@@ -2309,20 +2860,22 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
__be32 src = inet->inet_rcv_saddr;
__u16 destp = ntohs(inet->inet_dport);
__u16 srcp = ntohs(inet->inet_sport);
+ u8 icsk_pending;
int rx_queue;
int state;
- if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
- icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ icsk_pending = smp_load_acquire(&icsk->icsk_pending);
+ if (icsk_pending == ICSK_TIME_RETRANS ||
+ icsk_pending == ICSK_TIME_REO_TIMEOUT ||
+ icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
- timer_expires = icsk->icsk_timeout;
- } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+ timer_expires = tcp_timeout_expires(sk);
+ } else if (icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
- timer_expires = icsk->icsk_timeout;
- } else if (timer_pending(&sk->sk_timer)) {
+ timer_expires = tcp_timeout_expires(sk);
+ } else if (timer_pending(&icsk->icsk_keepalive_timer)) {
timer_active = 2;
- timer_expires = sk->sk_timer.expires;
+ timer_expires = icsk->icsk_keepalive_timer.expires;
} else {
timer_active = 0;
timer_expires = jiffies;
@@ -2330,29 +2883,30 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
state = inet_sk_state_load(sk);
if (state == TCP_LISTEN)
- rx_queue = sk->sk_ack_backlog;
+ rx_queue = READ_ONCE(sk->sk_ack_backlog);
else
/* Because we don't lock the socket,
* we might find a transient negative value.
*/
- rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
+ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
+ READ_ONCE(tp->copied_seq), 0);
seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
i, src, srcp, dest, destp, state,
- tp->write_seq - tp->snd_una,
+ READ_ONCE(tp->write_seq) - tp->snd_una,
rx_queue,
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
- icsk->icsk_retransmits,
- from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
- icsk->icsk_probes_out,
+ READ_ONCE(icsk->icsk_retransmits),
+ from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
+ READ_ONCE(icsk->icsk_probes_out),
sock_i_ino(sk),
refcount_read(&sk->sk_refcnt), sk,
jiffies_to_clock_t(icsk->icsk_rto),
jiffies_to_clock_t(icsk->icsk_ack.ato),
- (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
- tp->snd_cwnd,
+ (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
+ tcp_snd_cwnd(tp),
state == TCP_LISTEN ?
fastopenq->max_qlen :
(tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
@@ -2372,7 +2926,7 @@ static void get_timewait4_sock(const struct inet_timewait_sock *tw,
seq_printf(f, "%4d: %08X:%04X %08X:%04X"
" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
- i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
+ i, src, srcp, dest, destp, READ_ONCE(tw->tw_substate), 0, 0,
3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
refcount_read(&tw->tw_refcnt), tw);
}
@@ -2404,6 +2958,426 @@ out:
return 0;
}
+#ifdef CONFIG_BPF_SYSCALL
+union bpf_tcp_iter_batch_item {
+ struct sock *sk;
+ __u64 cookie;
+};
+
+struct bpf_tcp_iter_state {
+ struct tcp_iter_state state;
+ unsigned int cur_sk;
+ unsigned int end_sk;
+ unsigned int max_sk;
+ union bpf_tcp_iter_batch_item *batch;
+};
+
+struct bpf_iter__tcp {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct sock_common *, sk_common);
+ uid_t uid __aligned(8);
+};
+
+static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
+ struct sock_common *sk_common, uid_t uid)
+{
+ struct bpf_iter__tcp ctx;
+
+ meta->seq_num--; /* skip SEQ_START_TOKEN */
+ ctx.meta = meta;
+ ctx.sk_common = sk_common;
+ ctx.uid = uid;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
+{
+ union bpf_tcp_iter_batch_item *item;
+ unsigned int cur_sk = iter->cur_sk;
+ __u64 cookie;
+
+ /* Remember the cookies of the sockets we haven't seen yet, so we can
+ * pick up where we left off next time around.
+ */
+ while (cur_sk < iter->end_sk) {
+ item = &iter->batch[cur_sk++];
+ cookie = sock_gen_cookie(item->sk);
+ sock_gen_put(item->sk);
+ item->cookie = cookie;
+ }
+}
+
+static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
+ unsigned int new_batch_sz, gfp_t flags)
+{
+ union bpf_tcp_iter_batch_item *new_batch;
+
+ new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
+ flags | __GFP_NOWARN);
+ if (!new_batch)
+ return -ENOMEM;
+
+ memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
+ kvfree(iter->batch);
+ iter->batch = new_batch;
+ iter->max_sk = new_batch_sz;
+
+ return 0;
+}
+
+static struct sock *bpf_iter_tcp_resume_bucket(struct sock *first_sk,
+ union bpf_tcp_iter_batch_item *cookies,
+ int n_cookies)
+{
+ struct hlist_nulls_node *node;
+ struct sock *sk;
+ int i;
+
+ for (i = 0; i < n_cookies; i++) {
+ sk = first_sk;
+ sk_nulls_for_each_from(sk, node)
+ if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
+ return sk;
+ }
+
+ return NULL;
+}
+
+static struct sock *bpf_iter_tcp_resume_listening(struct seq_file *seq)
+{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ unsigned int find_cookie = iter->cur_sk;
+ unsigned int end_cookie = iter->end_sk;
+ int resume_bucket = st->bucket;
+ struct sock *sk;
+
+ if (end_cookie && find_cookie == end_cookie)
+ ++st->bucket;
+
+ sk = listening_get_first(seq);
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+
+ if (sk && st->bucket == resume_bucket && end_cookie) {
+ sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
+ end_cookie - find_cookie);
+ if (!sk) {
+ spin_unlock(&hinfo->lhash2[st->bucket].lock);
+ ++st->bucket;
+ sk = listening_get_first(seq);
+ }
+ }
+
+ return sk;
+}
+
+static struct sock *bpf_iter_tcp_resume_established(struct seq_file *seq)
+{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ unsigned int find_cookie = iter->cur_sk;
+ unsigned int end_cookie = iter->end_sk;
+ int resume_bucket = st->bucket;
+ struct sock *sk;
+
+ if (end_cookie && find_cookie == end_cookie)
+ ++st->bucket;
+
+ sk = established_get_first(seq);
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+
+ if (sk && st->bucket == resume_bucket && end_cookie) {
+ sk = bpf_iter_tcp_resume_bucket(sk, &iter->batch[find_cookie],
+ end_cookie - find_cookie);
+ if (!sk) {
+ spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+ ++st->bucket;
+ sk = established_get_first(seq);
+ }
+ }
+
+ return sk;
+}
+
+static struct sock *bpf_iter_tcp_resume(struct seq_file *seq)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ struct sock *sk = NULL;
+
+ switch (st->state) {
+ case TCP_SEQ_STATE_LISTENING:
+ sk = bpf_iter_tcp_resume_listening(seq);
+ if (sk)
+ break;
+ st->bucket = 0;
+ st->state = TCP_SEQ_STATE_ESTABLISHED;
+ fallthrough;
+ case TCP_SEQ_STATE_ESTABLISHED:
+ sk = bpf_iter_tcp_resume_established(seq);
+ break;
+ }
+
+ return sk;
+}
+
+static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
+ struct sock **start_sk)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct hlist_nulls_node *node;
+ unsigned int expected = 1;
+ struct sock *sk;
+
+ sock_hold(*start_sk);
+ iter->batch[iter->end_sk++].sk = *start_sk;
+
+ sk = sk_nulls_next(*start_sk);
+ *start_sk = NULL;
+ sk_nulls_for_each_from(sk, node) {
+ if (seq_sk_match(seq, sk)) {
+ if (iter->end_sk < iter->max_sk) {
+ sock_hold(sk);
+ iter->batch[iter->end_sk++].sk = sk;
+ } else if (!*start_sk) {
+ /* Remember where we left off. */
+ *start_sk = sk;
+ }
+ expected++;
+ }
+ }
+
+ return expected;
+}
+
+static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
+ struct sock **start_sk)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct hlist_nulls_node *node;
+ unsigned int expected = 1;
+ struct sock *sk;
+
+ sock_hold(*start_sk);
+ iter->batch[iter->end_sk++].sk = *start_sk;
+
+ sk = sk_nulls_next(*start_sk);
+ *start_sk = NULL;
+ sk_nulls_for_each_from(sk, node) {
+ if (seq_sk_match(seq, sk)) {
+ if (iter->end_sk < iter->max_sk) {
+ sock_hold(sk);
+ iter->batch[iter->end_sk++].sk = sk;
+ } else if (!*start_sk) {
+ /* Remember where we left off. */
+ *start_sk = sk;
+ }
+ expected++;
+ }
+ }
+
+ return expected;
+}
+
+static unsigned int bpf_iter_fill_batch(struct seq_file *seq,
+ struct sock **start_sk)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+
+ if (st->state == TCP_SEQ_STATE_LISTENING)
+ return bpf_iter_tcp_listening_batch(seq, start_sk);
+ else
+ return bpf_iter_tcp_established_batch(seq, start_sk);
+}
+
+static void bpf_iter_tcp_unlock_bucket(struct seq_file *seq)
+{
+ struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+
+ if (st->state == TCP_SEQ_STATE_LISTENING)
+ spin_unlock(&hinfo->lhash2[st->bucket].lock);
+ else
+ spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
+}
+
+static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ unsigned int expected;
+ struct sock *sk;
+ int err;
+
+ sk = bpf_iter_tcp_resume(seq);
+ if (!sk)
+ return NULL; /* Done */
+
+ expected = bpf_iter_fill_batch(seq, &sk);
+ if (likely(iter->end_sk == expected))
+ goto done;
+
+ /* Batch size was too small. */
+ bpf_iter_tcp_unlock_bucket(seq);
+ bpf_iter_tcp_put_batch(iter);
+ err = bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2,
+ GFP_USER);
+ if (err)
+ return ERR_PTR(err);
+
+ sk = bpf_iter_tcp_resume(seq);
+ if (!sk)
+ return NULL; /* Done */
+
+ expected = bpf_iter_fill_batch(seq, &sk);
+ if (likely(iter->end_sk == expected))
+ goto done;
+
+ /* Batch size was still too small. Hold onto the lock while we try
+ * again with a larger batch to make sure the current bucket's size
+ * does not change in the meantime.
+ */
+ err = bpf_iter_tcp_realloc_batch(iter, expected, GFP_NOWAIT);
+ if (err) {
+ bpf_iter_tcp_unlock_bucket(seq);
+ return ERR_PTR(err);
+ }
+
+ expected = bpf_iter_fill_batch(seq, &sk);
+ WARN_ON_ONCE(iter->end_sk != expected);
+done:
+ bpf_iter_tcp_unlock_bucket(seq);
+ return iter->batch[0].sk;
+}
+
+static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ /* bpf iter does not support lseek, so it always
+ * continues from where it was stop()-ped.
+ */
+ if (*pos)
+ return bpf_iter_tcp_batch(seq);
+
+ return SEQ_START_TOKEN;
+}
+
+static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct tcp_iter_state *st = &iter->state;
+ struct sock *sk;
+
+ /* Whenever seq_next() is called, the iter->cur_sk is
+ * done with seq_show(), so advance to the next sk in
+ * the batch.
+ */
+ if (iter->cur_sk < iter->end_sk) {
+ /* Keeping st->num consistent in tcp_iter_state.
+ * bpf_iter_tcp does not use st->num.
+ * meta.seq_num is used instead.
+ */
+ st->num++;
+ sock_gen_put(iter->batch[iter->cur_sk++].sk);
+ }
+
+ if (iter->cur_sk < iter->end_sk)
+ sk = iter->batch[iter->cur_sk].sk;
+ else
+ sk = bpf_iter_tcp_batch(seq);
+
+ ++*pos;
+ /* Keeping st->last_pos consistent in tcp_iter_state.
+ * bpf iter does not do lseek, so st->last_pos always equals *pos.
+ */
+ st->last_pos = *pos;
+ return sk;
+}
+
+static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ struct sock *sk = v;
+ uid_t uid;
+ int ret;
+
+ if (v == SEQ_START_TOKEN)
+ return 0;
+
+ if (sk_fullsock(sk))
+ lock_sock(sk);
+
+ if (unlikely(sk_unhashed(sk))) {
+ ret = SEQ_SKIP;
+ goto unlock;
+ }
+
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ uid = 0;
+ } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ const struct request_sock *req = v;
+
+ uid = from_kuid_munged(seq_user_ns(seq),
+ sk_uid(req->rsk_listener));
+ } else {
+ uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
+ }
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, false);
+ ret = tcp_prog_seq_show(prog, &meta, v, uid);
+
+unlock:
+ if (sk_fullsock(sk))
+ release_sock(sk);
+ return ret;
+
+}
+
+static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_tcp_iter_state *iter = seq->private;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ if (!v) {
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, true);
+ if (prog)
+ (void)tcp_prog_seq_show(prog, &meta, v, 0);
+ }
+
+ if (iter->cur_sk < iter->end_sk)
+ bpf_iter_tcp_put_batch(iter);
+}
+
+static const struct seq_operations bpf_iter_tcp_seq_ops = {
+ .show = bpf_iter_tcp_seq_show,
+ .start = bpf_iter_tcp_seq_start,
+ .next = bpf_iter_tcp_seq_next,
+ .stop = bpf_iter_tcp_seq_stop,
+};
+#endif
+static unsigned short seq_file_family(const struct seq_file *seq)
+{
+ const struct tcp_seq_afinfo *afinfo;
+
+#ifdef CONFIG_BPF_SYSCALL
+ /* Iterated from bpf_iter. Let the bpf prog filter instead. */
+ if (seq->op == &bpf_iter_tcp_seq_ops)
+ return AF_UNSPEC;
+#endif
+
+ /* Iterated from proc fs */
+ afinfo = pde_data(file_inode(seq->file));
+ return afinfo->family;
+}
+
static const struct seq_operations tcp4_seq_ops = {
.show = tcp4_seq_show,
.start = tcp_seq_start,
@@ -2444,6 +3418,20 @@ void tcp4_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
+/* @wake is one when sk_stream_write_space() calls us.
+ * This sends EPOLLOUT only if notsent_bytes is half the limit.
+ * This mimics the strategy used in sock_def_write_space().
+ */
+bool tcp_stream_memory_free(const struct sock *sk, int wake)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 notsent_bytes = READ_ONCE(tp->write_seq) -
+ READ_ONCE(tp->snd_nxt);
+
+ return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
+}
+EXPORT_SYMBOL(tcp_stream_memory_free);
+
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
@@ -2458,21 +3446,28 @@ struct proto tcp_prot = {
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
+ .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
- .sendpage = tcp_sendpage,
+ .splice_eof = tcp_splice_eof,
.backlog_rcv = tcp_v4_do_rcv,
.release_cb = tcp_release_cb,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
+ .put_port = inet_put_port,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = tcp_bpf_update_proto,
+#endif
.enter_memory_pressure = tcp_enter_memory_pressure,
.leave_memory_pressure = tcp_leave_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
- .orphan_count = &tcp_orphan_count,
- .memory_allocated = &tcp_memory_allocated,
+
+ .memory_allocated = &net_aligned_data.tcp_memory_allocated,
+ .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
+
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
@@ -2482,58 +3477,61 @@ struct proto tcp_prot = {
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
- .h.hashinfo = &tcp_hashinfo,
+ .h.hashinfo = NULL,
.no_autobind = true,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_tcp_setsockopt,
- .compat_getsockopt = compat_tcp_getsockopt,
-#endif
.diag_destroy = tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
static void __net_exit tcp_sk_exit(struct net *net)
{
- int cpu;
-
- module_put(net->ipv4.tcp_congestion_control->owner);
-
- for_each_possible_cpu(cpu)
- inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
- free_percpu(net->ipv4.tcp_sk);
+ if (net->ipv4.tcp_congestion_control)
+ bpf_module_put(net->ipv4.tcp_congestion_control,
+ net->ipv4.tcp_congestion_control->owner);
}
-static int __net_init tcp_sk_init(struct net *net)
+static void __net_init tcp_set_hashinfo(struct net *net)
{
- int res, cpu, cnt;
+ struct inet_hashinfo *hinfo;
+ unsigned int ehash_entries;
+ struct net *old_net;
- net->ipv4.tcp_sk = alloc_percpu(struct sock *);
- if (!net->ipv4.tcp_sk)
- return -ENOMEM;
+ if (net_eq(net, &init_net))
+ goto fallback;
- for_each_possible_cpu(cpu) {
- struct sock *sk;
+ old_net = current->nsproxy->net_ns;
+ ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
+ if (!ehash_entries)
+ goto fallback;
- res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
- IPPROTO_TCP, net);
- if (res)
- goto fail;
- sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
-
- /* Please enforce IP_DF and IPID==0 for RST and
- * ACK sent in SYN-RECV and TIME-WAIT state.
- */
- inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
-
- *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
+ ehash_entries = roundup_pow_of_two(ehash_entries);
+ hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
+ if (!hinfo) {
+ pr_warn("Failed to allocate TCP ehash (entries: %u) "
+ "for a netns, fallback to the global one\n",
+ ehash_entries);
+fallback:
+ hinfo = &tcp_hashinfo;
+ ehash_entries = tcp_hashinfo.ehash_mask + 1;
}
- net->ipv4.sysctl_tcp_ecn = 2;
+ net->ipv4.tcp_death_row.hashinfo = hinfo;
+ net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
+ net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
+}
+
+static int __net_init tcp_sk_init(struct net *net)
+{
+ net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
+ net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
+ net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
net->ipv4.sysctl_tcp_ecn_fallback = 1;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
+ net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
+ net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
@@ -2549,12 +3547,12 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
net->ipv4.sysctl_tcp_tw_reuse = 2;
+ net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
+ net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
- cnt = tcp_hashinfo.ehash_mask + 1;
- net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2;
- net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo;
+ refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
+ tcp_set_hashinfo(net);
- net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 256);
net->ipv4.sysctl_tcp_sack = 1;
net->ipv4.sysctl_tcp_window_scaling = 1;
net->ipv4.sysctl_tcp_timestamps = 1;
@@ -2568,16 +3566,20 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_adv_win_scale = 1;
net->ipv4.sysctl_tcp_frto = 2;
net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
+ net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC;
/* This limits the percentage of the congestion window which we
* will allow a single TSO frame to consume. Building TSO frames
* which are too large can cause TCP streams to be bursty.
*/
net->ipv4.sysctl_tcp_tso_win_divisor = 3;
- /* Default TSQ limit of four TSO segments */
- net->ipv4.sysctl_tcp_limit_output_bytes = 262144;
- /* rfc5961 challenge ack rate limiting */
- net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
+ /* Default TSQ limit of 4 MB */
+ net->ipv4.sysctl_tcp_limit_output_bytes = 4 << 20;
+
+ /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
+ net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
+
net->ipv4.sysctl_tcp_min_tso_segs = 2;
+ net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
net->ipv4.sysctl_tcp_autocorking = 1;
net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
@@ -2592,34 +3594,63 @@ static int __net_init tcp_sk_init(struct net *net)
sizeof(init_net.ipv4.sysctl_tcp_wmem));
}
net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
+ net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC;
net->ipv4.sysctl_tcp_comp_sack_nr = 44;
+ net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33;
+ net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
- spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
- net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
+ net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
atomic_set(&net->ipv4.tfo_active_disable_times, 0);
+ /* Set default values for PLB */
+ net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
+ net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
+ net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
+ net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
+ /* Default congestion threshold for PLB to mark a round is 50% */
+ net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
+
/* Reno is always built in */
if (!net_eq(net, &init_net) &&
- try_module_get(init_net.ipv4.tcp_congestion_control->owner))
+ bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
+ init_net.ipv4.tcp_congestion_control->owner))
net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
else
net->ipv4.tcp_congestion_control = &tcp_reno;
- return 0;
-fail:
- tcp_sk_exit(net);
+ net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
+ net->ipv4.sysctl_tcp_shrink_window = 0;
- return res;
+ net->ipv4.sysctl_tcp_pingpong_thresh = 1;
+ net->ipv4.sysctl_tcp_rto_min_us = jiffies_to_usecs(TCP_RTO_MIN);
+ net->ipv4.sysctl_tcp_rto_max_ms = TCP_RTO_MAX_SEC * MSEC_PER_SEC;
+
+ return 0;
}
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
{
struct net *net;
- inet_twsk_purge(&tcp_hashinfo, AF_INET);
+ /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
+ * and failed setup_net error unwinding path are serialized.
+ *
+ * tcp_twsk_purge() handles twsk in any dead netns, not just those in
+ * net_exit_list, the thread that dismantles a particular twsk must
+ * do so without other thread progressing to refcount_dec_and_test() of
+ * tcp_death_row.tw_refcount.
+ */
+ mutex_lock(&tcp_exit_batch_mutex);
+
+ tcp_twsk_purge(net_exit_list);
- list_for_each_entry(net, net_exit_list, exit_list)
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
+ WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
tcp_fastopen_ctx_destroy(net);
+ }
+
+ mutex_unlock(&tcp_exit_batch_mutex);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
@@ -2628,8 +3659,105 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
.exit_batch = tcp_sk_exit_batch,
};
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
+ struct sock_common *sk_common, uid_t uid)
+
+#define INIT_BATCH_SZ 16
+
+static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+ struct bpf_tcp_iter_state *iter = priv_data;
+ int err;
+
+ err = bpf_iter_init_seq_net(priv_data, aux);
+ if (err)
+ return err;
+
+ err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
+ if (err) {
+ bpf_iter_fini_seq_net(priv_data);
+ return err;
+ }
+
+ return 0;
+}
+
+static void bpf_iter_fini_tcp(void *priv_data)
+{
+ struct bpf_tcp_iter_state *iter = priv_data;
+
+ bpf_iter_fini_seq_net(priv_data);
+ kvfree(iter->batch);
+}
+
+static const struct bpf_iter_seq_info tcp_seq_info = {
+ .seq_ops = &bpf_iter_tcp_seq_ops,
+ .init_seq_private = bpf_iter_init_tcp,
+ .fini_seq_private = bpf_iter_fini_tcp,
+ .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
+};
+
+static const struct bpf_func_proto *
+bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
+ const struct bpf_prog *prog)
+{
+ switch (func_id) {
+ case BPF_FUNC_setsockopt:
+ return &bpf_sk_setsockopt_proto;
+ case BPF_FUNC_getsockopt:
+ return &bpf_sk_getsockopt_proto;
+ default:
+ return NULL;
+ }
+}
+
+static struct bpf_iter_reg tcp_reg_info = {
+ .target = "tcp",
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__tcp, sk_common),
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+ },
+ .get_func_proto = bpf_iter_tcp_get_func_proto,
+ .seq_info = &tcp_seq_info,
+};
+
+static void __init bpf_iter_register(void)
+{
+ tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
+ if (bpf_iter_reg_target(&tcp_reg_info))
+ pr_warn("Warning: could not register bpf iterator tcp\n");
+}
+
+#endif
+
void __init tcp_v4_init(void)
{
+ int cpu, res;
+
+ for_each_possible_cpu(cpu) {
+ struct sock *sk;
+
+ res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
+ IPPROTO_TCP, &init_net);
+ if (res)
+ panic("Failed to create the TCP control socket.\n");
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+ /* Please enforce IP_DF and IPID==0 for RST and
+ * ACK sent in SYN-RECV and TIME-WAIT state.
+ */
+ inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
+
+ sk->sk_clockid = CLOCK_MONOTONIC;
+
+ per_cpu(ipv4_tcp_sk.sock, cpu) = sk;
+ }
if (register_pernet_subsys(&tcp_sk_ops))
panic("Failed to create the TCP control socket.\n");
+
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+ bpf_iter_register();
+#endif
}
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index ae10ed64fe13..976b56644a8a 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* TCP Low Priority (TCP-LP)
*
@@ -22,9 +23,9 @@
* Original Author:
* Aleksandar Kuzmanovic <akuzma@northwestern.edu>
* Available from:
- * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf
+ * https://users.cs.northwestern.edu/~akuzma/doc/TCP-LP-ToN.pdf
* Original implementation for 2.4.19:
- * http://www-ece.rice.edu/networks/TCP-LP/
+ * https://users.cs.northwestern.edu/~akuzma/rice/TCP-LP/linux/tcp-lp-linux.htm
*
* 2.6.x module Authors:
* Wong Hoi Sing, Edison <hswong3i@gmail.com>
@@ -62,7 +63,7 @@ enum tcp_lp_state {
* @sowd: smoothed OWD << 3
* @owd_min: min OWD
* @owd_max: max OWD
- * @owd_max_rsv: resrved max owd
+ * @owd_max_rsv: reserved max owd
* @remote_hz: estimated remote HZ
* @remote_ref_time: remote reference time
* @local_ref_time: local reference time
@@ -88,6 +89,7 @@ struct lp {
/**
* tcp_lp_init
+ * @sk: socket to initialize congestion control algorithm for
*
* Init all required variables.
* Clone the handling from Vegas module implementation.
@@ -110,6 +112,9 @@ static void tcp_lp_init(struct sock *sk)
/**
* tcp_lp_cong_avoid
+ * @sk: socket to avoid congesting
+ * @ack: current ack sequence number
+ * @acked: number of ACKed packets
*
* Implementation of cong_avoid.
* Will only call newReno CA when away from inference.
@@ -125,6 +130,7 @@ static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 acked)
/**
* tcp_lp_remote_hz_estimator
+ * @sk: socket which needs an estimate for the remote HZs
*
* Estimate remote HZ.
* We keep on updating the estimated value, where original TCP-LP
@@ -175,6 +181,7 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
/**
* tcp_lp_owd_calculator
+ * @sk: socket to calculate one way delay for
*
* Calculate one way delay (in relative format).
* Original implement OWD as minus of remote time difference to local time
@@ -209,6 +216,8 @@ static u32 tcp_lp_owd_calculator(struct sock *sk)
/**
* tcp_lp_rtt_sample
+ * @sk: socket to add a rtt sample to
+ * @rtt: round trip time, which is ignored!
*
* Implementation or rtt_sample.
* Will take the following action,
@@ -253,6 +262,8 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt)
/**
* tcp_lp_pkts_acked
+ * @sk: socket requiring congestion avoidance calculations
+ * @sample: ACK sample containing timing and rate information
*
* Implementation of pkts_acked.
* Deal with active drop under Early Congestion Indication.
@@ -264,7 +275,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
{
struct tcp_sock *tp = tcp_sk(sk);
struct lp *lp = inet_csk_ca(sk);
- u32 now = tcp_time_stamp(tp);
+ u32 now = tcp_time_stamp_ts(tp);
u32 delta;
if (sample->rtt_us > 0)
@@ -289,7 +300,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
lp->flag &= ~LP_WITHIN_THR;
pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag,
- tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max,
+ tcp_snd_cwnd(tp), lp->remote_hz, lp->owd_min, lp->owd_max,
lp->sowd >> 3);
if (lp->flag & LP_WITHIN_THR)
@@ -297,7 +308,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
/* FIXME: try to reset owd_min and owd_max here
* so decrease the chance the min/max is no longer suitable
- * and will usually within threshold when whithin inference */
+ * and will usually be within threshold when within inference */
lp->owd_min = lp->sowd >> 3;
lp->owd_max = lp->sowd >> 2;
lp->owd_max_rsv = lp->sowd >> 2;
@@ -305,12 +316,12 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample)
/* happened within inference
* drop snd_cwnd into 1 */
if (lp->flag & LP_WITHIN_INF)
- tp->snd_cwnd = 1U;
+ tcp_snd_cwnd_set(tp, 1U);
/* happened after inference
* cut snd_cwnd into half */
else
- tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U);
+ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp) >> 1U, 1U));
/* record this drop time */
lp->last_drop = now;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 03b51cdcc731..45b6ecd16412 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -40,7 +40,7 @@ struct tcp_fastopen_metrics {
struct tcp_metrics_block {
struct tcp_metrics_block __rcu *tcpm_next;
- possible_net_t tcpm_net;
+ struct net *tcpm_net;
struct inetpeer_addr tcpm_saddr;
struct inetpeer_addr tcpm_daddr;
unsigned long tcpm_stamp;
@@ -51,34 +51,38 @@ struct tcp_metrics_block {
struct rcu_head rcu_head;
};
-static inline struct net *tm_net(struct tcp_metrics_block *tm)
+static inline struct net *tm_net(const struct tcp_metrics_block *tm)
{
- return read_pnet(&tm->tcpm_net);
+ /* Paired with the WRITE_ONCE() in tcpm_new() */
+ return READ_ONCE(tm->tcpm_net);
}
static bool tcp_metric_locked(struct tcp_metrics_block *tm,
enum tcp_metric_index idx)
{
- return tm->tcpm_lock & (1 << idx);
+ /* Paired with WRITE_ONCE() in tcpm_suck_dst() */
+ return READ_ONCE(tm->tcpm_lock) & (1 << idx);
}
-static u32 tcp_metric_get(struct tcp_metrics_block *tm,
+static u32 tcp_metric_get(const struct tcp_metrics_block *tm,
enum tcp_metric_index idx)
{
- return tm->tcpm_vals[idx];
+ /* Paired with WRITE_ONCE() in tcp_metric_set() */
+ return READ_ONCE(tm->tcpm_vals[idx]);
}
static void tcp_metric_set(struct tcp_metrics_block *tm,
enum tcp_metric_index idx,
u32 val)
{
- tm->tcpm_vals[idx] = val;
+ /* Paired with READ_ONCE() in tcp_metric_get() */
+ WRITE_ONCE(tm->tcpm_vals[idx], val);
}
static bool addr_same(const struct inetpeer_addr *a,
const struct inetpeer_addr *b)
{
- return inetpeer_addr_cmp(a, b) == 0;
+ return (a->family == b->family) && !inetpeer_addr_cmp(a, b);
}
struct tcpm_hash_bucket {
@@ -89,6 +93,7 @@ static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly;
static unsigned int tcp_metrics_hash_log __read_mostly;
static DEFINE_SPINLOCK(tcp_metrics_lock);
+static DEFINE_SEQLOCK(fastopen_seqlock);
static void tcpm_suck_dst(struct tcp_metrics_block *tm,
const struct dst_entry *dst,
@@ -97,7 +102,7 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm,
u32 msval;
u32 val;
- tm->tcpm_stamp = jiffies;
+ WRITE_ONCE(tm->tcpm_stamp, jiffies);
val = 0;
if (dst_metric_locked(dst, RTAX_RTT))
@@ -110,30 +115,42 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm,
val |= 1 << TCP_METRIC_CWND;
if (dst_metric_locked(dst, RTAX_REORDERING))
val |= 1 << TCP_METRIC_REORDERING;
- tm->tcpm_lock = val;
+ /* Paired with READ_ONCE() in tcp_metric_locked() */
+ WRITE_ONCE(tm->tcpm_lock, val);
msval = dst_metric_raw(dst, RTAX_RTT);
- tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC;
+ tcp_metric_set(tm, TCP_METRIC_RTT, msval * USEC_PER_MSEC);
msval = dst_metric_raw(dst, RTAX_RTTVAR);
- tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC;
- tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
- tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
- tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
+ tcp_metric_set(tm, TCP_METRIC_RTTVAR, msval * USEC_PER_MSEC);
+ tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
+ dst_metric_raw(dst, RTAX_SSTHRESH));
+ tcp_metric_set(tm, TCP_METRIC_CWND,
+ dst_metric_raw(dst, RTAX_CWND));
+ tcp_metric_set(tm, TCP_METRIC_REORDERING,
+ dst_metric_raw(dst, RTAX_REORDERING));
if (fastopen_clear) {
+ write_seqlock(&fastopen_seqlock);
tm->tcpm_fastopen.mss = 0;
tm->tcpm_fastopen.syn_loss = 0;
tm->tcpm_fastopen.try_exp = 0;
tm->tcpm_fastopen.cookie.exp = false;
tm->tcpm_fastopen.cookie.len = 0;
+ write_sequnlock(&fastopen_seqlock);
}
}
#define TCP_METRICS_TIMEOUT (60 * 60 * HZ)
-static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
+static void tcpm_check_stamp(struct tcp_metrics_block *tm,
+ const struct dst_entry *dst)
{
- if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
+ unsigned long limit;
+
+ if (!tm)
+ return;
+ limit = READ_ONCE(tm->tcpm_stamp) + TCP_METRICS_TIMEOUT;
+ if (unlikely(time_after(jiffies, limit)))
tcpm_suck_dst(tm, dst, false);
}
@@ -149,11 +166,11 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
unsigned int hash)
{
struct tcp_metrics_block *tm;
- struct net *net;
bool reclaim = false;
+ struct net *net;
spin_lock_bh(&tcp_metrics_lock);
- net = dev_net(dst->dev);
+ net = dst_dev_net_rcu(dst);
/* While waiting for the spin-lock the cache might have been populated
* with this entry and so we have to check again.
@@ -174,20 +191,23 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
oldest = deref_locked(tcp_metrics_hash[hash].chain);
for (tm = deref_locked(oldest->tcpm_next); tm;
tm = deref_locked(tm->tcpm_next)) {
- if (time_before(tm->tcpm_stamp, oldest->tcpm_stamp))
+ if (time_before(READ_ONCE(tm->tcpm_stamp),
+ READ_ONCE(oldest->tcpm_stamp)))
oldest = tm;
}
tm = oldest;
} else {
- tm = kmalloc(sizeof(*tm), GFP_ATOMIC);
+ tm = kzalloc(sizeof(*tm), GFP_ATOMIC);
if (!tm)
goto out_unlock;
}
- write_pnet(&tm->tcpm_net, net);
+ /* Paired with the READ_ONCE() in tm_net() */
+ WRITE_ONCE(tm->tcpm_net, net);
+
tm->tcpm_saddr = *saddr;
tm->tcpm_daddr = *daddr;
- tcpm_suck_dst(tm, dst, true);
+ tcpm_suck_dst(tm, dst, reclaim);
if (likely(!reclaim)) {
tm->tcpm_next = tcp_metrics_hash[hash].chain;
@@ -253,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
return NULL;
}
- net = dev_net(dst->dev);
+ net = dst_dev_net_rcu(dst);
hash ^= net_hash_mix(net);
hash = hash_32(hash, tcp_metrics_hash_log);
@@ -298,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
else
return NULL;
- net = dev_net(dst->dev);
+ net = dst_dev_net_rcu(dst);
hash ^= net_hash_mix(net);
hash = hash_32(hash, tcp_metrics_hash_log);
@@ -329,7 +349,7 @@ void tcp_update_metrics(struct sock *sk)
int m;
sk_dst_confirm(sk);
- if (net->ipv4.sysctl_tcp_nometrics_save || !dst)
+ if (READ_ONCE(net->ipv4.sysctl_tcp_nometrics_save) || !dst)
return;
rcu_read_lock();
@@ -385,27 +405,29 @@ void tcp_update_metrics(struct sock *sk)
if (tcp_in_initial_slowstart(tp)) {
/* Slow start still did not finish. */
- if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
+ !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
- if (val && (tp->snd_cwnd >> 1) > val)
+ if (val && (tcp_snd_cwnd(tp) >> 1) > val)
tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
- tp->snd_cwnd >> 1);
+ tcp_snd_cwnd(tp) >> 1);
}
if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
val = tcp_metric_get(tm, TCP_METRIC_CWND);
- if (tp->snd_cwnd > val)
+ if (tcp_snd_cwnd(tp) > val)
tcp_metric_set(tm, TCP_METRIC_CWND,
- tp->snd_cwnd);
+ tcp_snd_cwnd(tp));
}
} else if (!tcp_in_slow_start(tp) &&
icsk->icsk_ca_state == TCP_CA_Open) {
/* Cong. avoidance phase, cwnd is reliable. */
- if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
+ !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
- max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
+ max(tcp_snd_cwnd(tp) >> 1, tp->snd_ssthresh));
if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
val = tcp_metric_get(tm, TCP_METRIC_CWND);
- tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1);
+ tcp_metric_set(tm, TCP_METRIC_CWND, (val + tcp_snd_cwnd(tp)) >> 1);
}
} else {
/* Else slow start did not finish, cwnd is non-sense,
@@ -416,7 +438,8 @@ void tcp_update_metrics(struct sock *sk)
tcp_metric_set(tm, TCP_METRIC_CWND,
(val + tp->snd_ssthresh) >> 1);
}
- if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) &&
+ !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
if (val && tp->snd_ssthresh > val)
tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
@@ -425,12 +448,13 @@ void tcp_update_metrics(struct sock *sk)
if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
if (val < tp->reordering &&
- tp->reordering != net->ipv4.sysctl_tcp_reordering)
+ tp->reordering !=
+ READ_ONCE(net->ipv4.sysctl_tcp_reordering))
tcp_metric_set(tm, TCP_METRIC_REORDERING,
tp->reordering);
}
}
- tm->tcpm_stamp = jiffies;
+ WRITE_ONCE(tm->tcpm_stamp, jiffies);
out_unlock:
rcu_read_unlock();
}
@@ -441,15 +465,20 @@ void tcp_init_metrics(struct sock *sk)
{
struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct tcp_metrics_block *tm;
u32 val, crtt = 0; /* cached RTT scaled by 8 */
sk_dst_confirm(sk);
+ /* ssthresh may have been reduced unnecessarily during
+ * 3WHS. Restore it back to its initial default.
+ */
+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
if (!dst)
goto reset;
rcu_read_lock();
- tm = tcp_get_metrics(sk, dst, true);
+ tm = tcp_get_metrics(sk, dst, false);
if (!tm) {
rcu_read_unlock();
goto reset;
@@ -458,16 +487,12 @@ void tcp_init_metrics(struct sock *sk)
if (tcp_metric_locked(tm, TCP_METRIC_CWND))
tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
- val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+ val = READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) ?
+ 0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
if (val) {
tp->snd_ssthresh = val;
if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
tp->snd_ssthresh = tp->snd_cwnd_clamp;
- } else {
- /* ssthresh may have been reduced unnecessarily during.
- * 3WHS. Restore it back to its initial default.
- */
- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
}
val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
if (val && tp->reordering != val)
@@ -512,16 +537,6 @@ reset:
inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
}
- /* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
- * retransmitted. In light of RFC6298 more aggressive 1sec
- * initRTO, we only reset cwnd when more than 1 SYN/SYN-ACK
- * retransmission has occurred.
- */
- if (tp->total_retrans > 1)
- tp->snd_cwnd = 1;
- else
- tp->snd_cwnd = tcp_init_cwnd(tp, dst);
- tp->snd_cwnd_stamp = tcp_jiffies32;
}
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
@@ -543,8 +558,6 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst)
return ret;
}
-static DEFINE_SEQLOCK(fastopen_seqlock);
-
void tcp_fastopen_cache_get(struct sock *sk, u16 *mss,
struct tcp_fastopen_cookie *cookie)
{
@@ -604,8 +617,13 @@ static struct genl_family tcp_metrics_nl_family;
static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = {
[TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, },
- [TCP_METRICS_ATTR_ADDR_IPV6] = { .type = NLA_BINARY,
- .len = sizeof(struct in6_addr), },
+ [TCP_METRICS_ATTR_ADDR_IPV6] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+
+ [TCP_METRICS_ATTR_SADDR_IPV4] = { .type = NLA_U32, },
+ [TCP_METRICS_ATTR_SADDR_IPV6] =
+ NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+
/* Following attributes are not received for GET/DEL,
* we keep them for reference
*/
@@ -651,18 +669,18 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
}
if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE,
- jiffies - tm->tcpm_stamp,
+ jiffies - READ_ONCE(tm->tcpm_stamp),
TCP_METRICS_ATTR_PAD) < 0)
goto nla_put_failure;
{
int n = 0;
- nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
+ nest = nla_nest_start_noflag(msg, TCP_METRICS_ATTR_VALS);
if (!nest)
goto nla_put_failure;
for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
- u32 val = tm->tcpm_vals[i];
+ u32 val = tcp_metric_get(tm, i);
if (!val)
continue;
@@ -753,6 +771,7 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb,
unsigned int max_rows = 1U << tcp_metrics_hash_log;
unsigned int row, s_row = cb->args[0];
int s_col = cb->args[1], col = s_col;
+ int res = 0;
for (row = s_row; row < max_rows; row++, s_col = 0) {
struct tcp_metrics_block *tm;
@@ -765,7 +784,8 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb,
continue;
if (col < s_col)
continue;
- if (tcp_metrics_dump_info(skb, cb, tm) < 0) {
+ res = tcp_metrics_dump_info(skb, cb, tm);
+ if (res < 0) {
rcu_read_unlock();
goto done;
}
@@ -776,7 +796,7 @@ static int tcp_metrics_nl_dump(struct sk_buff *skb,
done:
cb->args[0] = row;
cb->args[1] = col;
- return skb->len;
+ return res;
}
static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
@@ -795,8 +815,6 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr,
if (a) {
struct in6_addr in6;
- if (nla_len(a) != sizeof(struct in6_addr))
- return -EINVAL;
in6 = nla_get_in6_addr(a);
inetpeer_set_addr_v6(addr, &in6);
if (hash)
@@ -885,22 +903,25 @@ static void tcp_metrics_flush_all(struct net *net)
unsigned int row;
for (row = 0; row < max_rows; row++, hb++) {
- struct tcp_metrics_block __rcu **pp;
+ struct tcp_metrics_block __rcu **pp = &hb->chain;
bool match;
+ if (!rcu_access_pointer(*pp))
+ continue;
+
spin_lock_bh(&tcp_metrics_lock);
- pp = &hb->chain;
for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) {
match = net ? net_eq(tm_net(tm), net) :
- !refcount_read(&tm_net(tm)->count);
+ !check_net(tm_net(tm));
if (match) {
- *pp = tm->tcpm_next;
+ rcu_assign_pointer(*pp, tm->tcpm_next);
kfree_rcu(tm, rcu_head);
} else {
pp = &tm->tcpm_next;
}
}
spin_unlock_bh(&tcp_metrics_lock);
+ cond_resched();
}
}
@@ -935,7 +956,7 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
if (addr_same(&tm->tcpm_daddr, &daddr) &&
(!src || addr_same(&tm->tcpm_saddr, &saddr)) &&
net_eq(tm_net(tm), net)) {
- *pp = tm->tcpm_next;
+ rcu_assign_pointer(*pp, tm->tcpm_next);
kfree_rcu(tm, rcu_head);
found = true;
} else {
@@ -948,17 +969,17 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info)
return 0;
}
-static const struct genl_ops tcp_metrics_nl_ops[] = {
+static const struct genl_small_ops tcp_metrics_nl_ops[] = {
{
.cmd = TCP_METRICS_CMD_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = tcp_metrics_nl_cmd_get,
.dumpit = tcp_metrics_nl_dump,
- .policy = tcp_metrics_nl_policy,
},
{
.cmd = TCP_METRICS_CMD_DEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = tcp_metrics_nl_cmd_del,
- .policy = tcp_metrics_nl_policy,
.flags = GENL_ADMIN_PERM,
},
};
@@ -968,13 +989,16 @@ static struct genl_family tcp_metrics_nl_family __ro_after_init = {
.name = TCP_METRICS_GENL_NAME,
.version = TCP_METRICS_GENL_VERSION,
.maxattr = TCP_METRICS_ATTR_MAX,
+ .policy = tcp_metrics_nl_policy,
.netnsok = true,
+ .parallel_ops = true,
.module = THIS_MODULE,
- .ops = tcp_metrics_nl_ops,
- .n_ops = ARRAY_SIZE(tcp_metrics_nl_ops),
+ .small_ops = tcp_metrics_nl_ops,
+ .n_small_ops = ARRAY_SIZE(tcp_metrics_nl_ops),
+ .resv_start_op = TCP_METRICS_CMD_DEL + 1,
};
-static unsigned int tcpmhash_entries;
+static unsigned int tcpmhash_entries __initdata;
static int __init set_tcpmhash_entries(char *str)
{
ssize_t ret;
@@ -990,17 +1014,13 @@ static int __init set_tcpmhash_entries(char *str)
}
__setup("tcpmhash_entries=", set_tcpmhash_entries);
-static int __net_init tcp_net_metrics_init(struct net *net)
+static void __init tcp_metrics_hash_alloc(void)
{
+ unsigned int slots = tcpmhash_entries;
size_t size;
- unsigned int slots;
-
- if (!net_eq(net, &init_net))
- return 0;
- slots = tcpmhash_entries;
if (!slots) {
- if (totalram_pages >= 128 * 1024)
+ if (totalram_pages() >= 128 * 1024)
slots = 16 * 1024;
else
slots = 8 * 1024;
@@ -1011,9 +1031,7 @@ static int __net_init tcp_net_metrics_init(struct net *net)
tcp_metrics_hash = kvzalloc(size, GFP_KERNEL);
if (!tcp_metrics_hash)
- return -ENOMEM;
-
- return 0;
+ panic("Could not allocate the tcp_metrics hash table\n");
}
static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list)
@@ -1022,7 +1040,6 @@ static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_lis
}
static __net_initdata struct pernet_operations tcp_net_metrics_ops = {
- .init = tcp_net_metrics_init,
.exit_batch = tcp_net_metrics_exit_batch,
};
@@ -1030,9 +1047,11 @@ void __init tcp_metrics_init(void)
{
int ret;
+ tcp_metrics_hash_alloc();
+
ret = register_pernet_subsys(&tcp_net_metrics_ops);
if (ret < 0)
- panic("Could not allocate the tcp_metrics hash table\n");
+ panic("Could not register tcp_net_metrics_ops\n");
ret = genl_register_family(&tcp_metrics_nl_family);
if (ret < 0)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 12affb7864d9..bd5462154f97 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -18,16 +19,12 @@
* Jorge Cwik, <jorge@laser.satlink.net>
*/
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/sysctl.h>
-#include <linux/workqueue.h>
-#include <linux/static_key.h>
#include <net/tcp.h>
-#include <net/inet_common.h>
+#include <net/tcp_ecn.h>
#include <net/xfrm.h>
#include <net/busy_poll.h>
+#include <net/rstreason.h>
+#include <net/psp.h>
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
@@ -49,7 +46,7 @@ tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
/* Send ACK. Note, we do not put the bucket,
* it will be released by caller.
*/
- return TCP_TW_ACK;
+ return TCP_TW_ACK_OOW;
}
/* We are rate-limiting, so just release the tw sock and drop skb. */
@@ -57,6 +54,19 @@ tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
return TCP_TW_SUCCESS;
}
+static void twsk_rcv_nxt_update(struct tcp_timewait_sock *tcptw, u32 seq,
+ u32 rcv_nxt)
+{
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_info *ao;
+
+ ao = rcu_dereference(tcptw->ao_info);
+ if (unlikely(ao && seq < rcv_nxt))
+ WRITE_ONCE(ao->rcv_sne, ao->rcv_sne + 1);
+#endif
+ WRITE_ONCE(tcptw->tw_rcv_nxt, seq);
+}
+
/*
* * Main purpose of TIME-WAIT state is to close connection gracefully,
* when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
@@ -89,45 +99,59 @@ tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw,
*/
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
- const struct tcphdr *th)
+ const struct tcphdr *th, u32 *tw_isn,
+ enum skb_drop_reason *drop_reason)
{
- struct tcp_options_received tmp_opt;
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
+ u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt);
+ struct tcp_options_received tmp_opt;
+ enum skb_drop_reason psp_drop;
bool paws_reject = false;
+ int ts_recent_stamp;
+
+ /* Instead of dropping immediately, wait to see what value is
+ * returned. We will accept a non psp-encapsulated syn in the
+ * case where TCP_TW_SYN is returned.
+ */
+ psp_drop = psp_twsk_rx_policy_check(tw, skb);
tmp_opt.saw_tstamp = 0;
- if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
+ ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
+ if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) {
tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
if (tmp_opt.rcv_tsecr)
tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset;
- tmp_opt.ts_recent = tcptw->tw_ts_recent;
- tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+ tmp_opt.ts_recent = READ_ONCE(tcptw->tw_ts_recent);
+ tmp_opt.ts_recent_stamp = ts_recent_stamp;
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
- if (tw->tw_substate == TCP_FIN_WAIT2) {
+ if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) {
/* Just repeat all the checks of tcp_rcv_state_process() */
+ if (psp_drop)
+ goto out_put;
+
/* Out of window, send ACK */
if (paws_reject ||
!tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
- tcptw->tw_rcv_nxt,
- tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
+ rcv_nxt,
+ rcv_nxt + tcptw->tw_rcv_wnd))
return tcp_timewait_check_oow_rate_limit(
tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2);
if (th->rst)
goto kill;
- if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
+ if (th->syn && !before(TCP_SKB_CB(skb)->seq, rcv_nxt))
return TCP_TW_RST;
/* Dup ACK? */
if (!th->ack ||
- !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
+ !after(TCP_SKB_CB(skb)->end_seq, rcv_nxt) ||
TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
inet_twsk_put(tw);
return TCP_TW_SUCCESS;
@@ -137,15 +161,22 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
* reset.
*/
if (!th->fin ||
- TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1)
+ TCP_SKB_CB(skb)->end_seq != rcv_nxt + 1)
return TCP_TW_RST;
/* FIN arrived, enter true time-wait state. */
- tw->tw_substate = TCP_TIME_WAIT;
- tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ WRITE_ONCE(tw->tw_substate, TCP_TIME_WAIT);
+ twsk_rcv_nxt_update(tcptw, TCP_SKB_CB(skb)->end_seq,
+ rcv_nxt);
+
if (tmp_opt.saw_tstamp) {
- tcptw->tw_ts_recent_stamp = ktime_get_seconds();
- tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
+ u64 ts = tcp_clock_ms();
+
+ WRITE_ONCE(tw->tw_entry_stamp, ts);
+ WRITE_ONCE(tcptw->tw_ts_recent_stamp,
+ div_u64(ts, MSEC_PER_SEC));
+ WRITE_ONCE(tcptw->tw_ts_recent,
+ tmp_opt.rcv_tsval);
}
inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN);
@@ -170,16 +201,19 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
*/
if (!paws_reject &&
- (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
+ (TCP_SKB_CB(skb)->seq == rcv_nxt &&
(TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
/* In window segment, it may be only reset or bare ack. */
+ if (psp_drop)
+ goto out_put;
+
if (th->rst) {
/* This is TIME_WAIT assassination, in two flavors.
* Oh well... nobody has a sufficient solution to this
* protocol bug yet.
*/
- if (twsk_net(tw)->ipv4.sysctl_tcp_rfc1337 == 0) {
+ if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) {
kill:
inet_twsk_deschedule_put(tw);
return TCP_TW_SUCCESS;
@@ -189,8 +223,10 @@ kill:
}
if (tmp_opt.saw_tstamp) {
- tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
- tcptw->tw_ts_recent_stamp = ktime_get_seconds();
+ WRITE_ONCE(tcptw->tw_ts_recent,
+ tmp_opt.rcv_tsval);
+ WRITE_ONCE(tcptw->tw_ts_recent_stamp,
+ ktime_get_seconds());
}
inet_twsk_put(tw);
@@ -215,18 +251,23 @@ kill:
*/
if (th->syn && !th->rst && !th->ack && !paws_reject &&
- (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
+ (after(TCP_SKB_CB(skb)->seq, rcv_nxt) ||
(tmp_opt.saw_tstamp &&
- (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
+ (s32)(READ_ONCE(tcptw->tw_ts_recent) - tmp_opt.rcv_tsval) < 0))) {
u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
if (isn == 0)
isn++;
- TCP_SKB_CB(skb)->tcp_tw_isn = isn;
+ *tw_isn = isn;
return TCP_TW_SYN;
}
- if (paws_reject)
- __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);
+ if (psp_drop)
+ goto out_put;
+
+ if (paws_reject) {
+ *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS;
+ __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED);
+ }
if (!th->rst) {
/* In this case we must reset the TIMEWAIT timer.
@@ -241,10 +282,44 @@ kill:
return tcp_timewait_check_oow_rate_limit(
tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
}
+
+out_put:
inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
-EXPORT_SYMBOL(tcp_timewait_state_process);
+EXPORT_IPV6_MOD(tcp_timewait_state_process);
+
+static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw)
+{
+#ifdef CONFIG_TCP_MD5SIG
+ const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_md5sig_key *key;
+
+ /*
+ * The timewait bucket does not have the key DB from the
+ * sock structure. We just make a quick copy of the
+ * md5 key being used (if indeed we are using one)
+ * so the timewait ack generating code has the key.
+ */
+ tcptw->tw_md5_key = NULL;
+ if (!static_branch_unlikely(&tcp_md5_needed.key))
+ return;
+
+ key = tp->af_specific->md5_lookup(sk, sk);
+ if (key) {
+ tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
+ if (!tcptw->tw_md5_key)
+ return;
+ if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key))
+ goto out_free;
+ }
+ return;
+out_free:
+ WARN_ON_ONCE(1);
+ kfree(tcptw->tw_md5_key);
+ tcptw->tw_md5_key = NULL;
+#endif
+}
/*
* Move a socket to time-wait or dead fin-wait-2 state.
@@ -252,28 +327,35 @@ EXPORT_SYMBOL(tcp_timewait_state_process);
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- const struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct inet_timewait_sock *tw;
- struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
- tw = inet_twsk_alloc(sk, tcp_death_row, state);
+ tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state);
if (tw) {
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1);
- struct inet_sock *inet = inet_sk(sk);
- tw->tw_transparent = inet->transparent;
tw->tw_mark = sk->sk_mark;
+ tw->tw_priority = READ_ONCE(sk->sk_priority);
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
+ /* refreshed when we enter true TIME-WAIT state */
+ tw->tw_entry_stamp = tcp_time_stamp_ms(tp);
tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt;
tcptw->tw_rcv_wnd = tcp_receive_window(tp);
tcptw->tw_ts_recent = tp->rx_opt.ts_recent;
tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
tcptw->tw_ts_offset = tp->tsoffset;
+ tw->tw_usec_ts = tp->tcp_usec_ts;
tcptw->tw_last_oow_ack_time = 0;
-
+ tcptw->tw_tx_delay = tp->tcp_tx_delay;
+ tw->tw_txhash = sk->sk_txhash;
+ tw->tw_tx_queue_mapping = sk->sk_tx_queue_mapping;
+#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING
+ tw->tw_rx_queue_mapping = sk->sk_rx_queue_mapping;
+#endif
#if IS_ENABLED(CONFIG_IPV6)
if (tw->tw_family == PF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -286,23 +368,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
}
#endif
-#ifdef CONFIG_TCP_MD5SIG
- /*
- * The timewait bucket does not have the key DB from the
- * sock structure. We just make a quick copy of the
- * md5 key being used (if indeed we are using one)
- * so the timewait ack generating code has the key.
- */
- do {
- struct tcp_md5sig_key *key;
- tcptw->tw_md5_key = NULL;
- key = tp->af_specific->md5_lookup(sk, sk);
- if (key) {
- tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC);
- BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool());
- }
- } while (0);
-#endif
+ tcp_time_wait_init(sk, tcptw);
+ tcp_ao_time_wait(tcptw, tp);
/* Get the TIME_WAIT timeout firing. */
if (timeo < rto)
@@ -311,23 +378,16 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
if (state == TCP_TIME_WAIT)
timeo = TCP_TIMEWAIT_LEN;
- /* tw_timer is pinned, so we need to make sure BH are disabled
- * in following section, otherwise timer handler could run before
- * we complete the initialization.
- */
- local_bh_disable();
- inet_twsk_schedule(tw, timeo);
/* Linkage updates.
* Note that access to tw after this point is illegal.
*/
- inet_twsk_hashdance(tw, sk, &tcp_hashinfo);
- local_bh_enable();
+ inet_twsk_hashdance_schedule(tw, sk, net->ipv4.tcp_death_row.hashinfo, timeo);
} else {
/* Sorry, if we're out of memory, just CLOSE this
* socket up. We've got bigger problems than
* non-graceful socket closings.
*/
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
+ NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW);
}
tcp_update_metrics(sk);
@@ -338,13 +398,34 @@ EXPORT_SYMBOL(tcp_time_wait);
void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
- struct tcp_timewait_sock *twsk = tcp_twsk(sk);
+ if (static_branch_unlikely(&tcp_md5_needed.key)) {
+ struct tcp_timewait_sock *twsk = tcp_twsk(sk);
- if (twsk->tw_md5_key)
- kfree_rcu(twsk->tw_md5_key, rcu);
+ if (twsk->tw_md5_key) {
+ kfree(twsk->tw_md5_key);
+ static_branch_slow_dec_deferred(&tcp_md5_needed);
+ }
+ }
#endif
+ tcp_ao_destroy_sock(sk, true);
+ psp_twsk_assoc_free(inet_twsk(sk));
+}
+
+void tcp_twsk_purge(struct list_head *net_exit_list)
+{
+ bool purged_once = false;
+ struct net *net;
+
+ list_for_each_entry(net, net_exit_list, exit_list) {
+ if (net->ipv4.tcp_death_row.hashinfo->pernet) {
+ /* Even if tw_refcount == 1, we must clean up kernel reqsk */
+ inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo);
+ } else if (!purged_once) {
+ inet_twsk_purge(&tcp_hashinfo);
+ purged_once = true;
+ }
+ }
}
-EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
/* Warning : This function is called without sk_listener being locked.
* Be sure to read socket fields once, as their value could change under us.
@@ -387,12 +468,27 @@ void tcp_openreq_init_rwin(struct request_sock *req,
rcv_wnd);
ireq->rcv_wscale = rcv_wscale;
}
-EXPORT_SYMBOL(tcp_openreq_init_rwin);
-static void tcp_ecn_openreq_child(struct tcp_sock *tp,
- const struct request_sock *req)
+static void tcp_ecn_openreq_child(struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb)
{
- tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0;
+ const struct tcp_request_sock *treq = tcp_rsk(req);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (treq->accecn_ok) {
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
+ tp->syn_ect_snt = treq->syn_ect_snt;
+ tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
+ tp->saw_accecn_opt = treq->saw_accecn_opt;
+ tp->prev_ecnfield = treq->syn_ect_rcv;
+ tp->accecn_opt_demand = 1;
+ tcp_ecn_received_counters_payload(sk, skb);
+ } else {
+ tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
+ TCP_ECN_MODE_RFC3168 :
+ TCP_ECN_DISABLED);
+ }
}
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
@@ -406,7 +502,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
rcu_read_lock();
ca = tcp_ca_find_key(ca_key);
- if (likely(ca && try_module_get(ca->owner))) {
+ if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
icsk->icsk_ca_ops = ca;
ca_got_dst = true;
@@ -417,14 +513,14 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
/* If no valid choice made yet, assign current system default ca. */
if (!ca_got_dst &&
(!icsk->icsk_ca_setsockopt ||
- !try_module_get(icsk->icsk_ca_ops->owner)))
+ !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner)))
tcp_assign_congestion_control(sk);
tcp_set_ca_state(sk, TCP_CA_Open);
}
-EXPORT_SYMBOL_GPL(tcp_ca_openreq_child);
+EXPORT_IPV6_MOD_GPL(tcp_ca_openreq_child);
-static void smc_check_reset_syn_req(struct tcp_sock *oldtp,
+static void smc_check_reset_syn_req(const struct tcp_sock *oldtp,
struct request_sock *req,
struct tcp_sock *newtp)
{
@@ -453,7 +549,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
const struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_request_sock *treq = tcp_rsk(req);
struct inet_connection_sock *newicsk;
- struct tcp_sock *oldtp, *newtp;
+ const struct tcp_sock *oldtp;
+ struct tcp_sock *newtp;
+ u32 seq;
if (!newsk)
return NULL;
@@ -467,58 +565,34 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
/* Now setup tcp_sock */
newtp->pred_flags = 0;
- newtp->rcv_wup = newtp->copied_seq =
- newtp->rcv_nxt = treq->rcv_isn + 1;
+ seq = treq->rcv_isn + 1;
+ newtp->rcv_wup = seq;
+ WRITE_ONCE(newtp->copied_seq, seq);
+ WRITE_ONCE(newtp->rcv_nxt, seq);
newtp->segs_in = 1;
- newtp->snd_sml = newtp->snd_una =
- newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
+ seq = treq->snt_isn + 1;
+ newtp->snd_sml = newtp->snd_una = seq;
+ WRITE_ONCE(newtp->snd_nxt, seq);
+ newtp->snd_up = seq;
INIT_LIST_HEAD(&newtp->tsq_node);
INIT_LIST_HEAD(&newtp->tsorted_sent_queue);
tcp_init_wl(newtp, treq->rcv_isn);
- newtp->srtt_us = 0;
- newtp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT);
minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U);
- newicsk->icsk_rto = TCP_TIMEOUT_INIT;
newicsk->icsk_ack.lrcvtime = tcp_jiffies32;
- newtp->packets_out = 0;
- newtp->retrans_out = 0;
- newtp->sacked_out = 0;
- newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
- newtp->tlp_high_seq = 0;
newtp->lsndtime = tcp_jiffies32;
- newsk->sk_txhash = treq->txhash;
- newtp->last_oow_ack_time = 0;
+ newsk->sk_txhash = READ_ONCE(treq->txhash);
newtp->total_retrans = req->num_retrans;
- /* So many TCP implementations out there (incorrectly) count the
- * initial SYN frame in their delayed-ACK and congestion control
- * algorithms that we must have the following bandaid to talk
- * efficiently to them. -DaveM
- */
- newtp->snd_cwnd = TCP_INIT_CWND;
- newtp->snd_cwnd_cnt = 0;
-
- /* There's a bubble in the pipe until at least the first ACK. */
- newtp->app_limited = ~0U;
-
tcp_init_xmit_timers(newsk);
- newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1;
-
- newtp->rx_opt.saw_tstamp = 0;
-
- newtp->rx_opt.dsack = 0;
- newtp->rx_opt.num_sacks = 0;
-
- newtp->urg_data = 0;
+ WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1);
if (sock_flag(newsk, SOCK_KEEPOPEN))
- inet_csk_reset_keepalive_timer(newsk,
- keepalive_time_when(newtp));
+ tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp));
newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
newtp->rx_opt.sack_ok = ireq->sack_ok;
@@ -537,35 +611,59 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->max_window = newtp->snd_wnd;
if (newtp->rx_opt.tstamp_ok) {
+ newtp->tcp_usec_ts = treq->req_usec_ts;
newtp->rx_opt.ts_recent = req->ts_recent;
newtp->rx_opt.ts_recent_stamp = ktime_get_seconds();
newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
} else {
+ newtp->tcp_usec_ts = 0;
newtp->rx_opt.ts_recent_stamp = 0;
newtp->tcp_header_len = sizeof(struct tcphdr);
}
+ if (req->num_timeout) {
+ newtp->total_rto = req->num_timeout;
+ newtp->undo_marker = treq->snt_isn;
+ if (newtp->tcp_usec_ts) {
+ newtp->retrans_stamp = treq->snt_synack;
+ newtp->total_rto_time = (u32)(tcp_clock_us() -
+ newtp->retrans_stamp) / USEC_PER_MSEC;
+ } else {
+ newtp->retrans_stamp = div_u64(treq->snt_synack,
+ USEC_PER_SEC / TCP_TS_HZ);
+ newtp->total_rto_time = tcp_clock_ms() -
+ newtp->retrans_stamp;
+ }
+ newtp->total_rto_recoveries = 1;
+ }
newtp->tsoffset = treq->ts_off;
#ifdef CONFIG_TCP_MD5SIG
newtp->md5sig_info = NULL; /*XXX*/
- if (newtp->af_specific->md5_lookup(sk, newsk))
- newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
+#ifdef CONFIG_TCP_AO
+ newtp->ao_info = NULL;
+
+ if (tcp_rsk_used_ao(req)) {
+ struct tcp_ao_key *ao_key;
+
+ ao_key = treq->af_specific->ao_lookup(sk, req, tcp_rsk(req)->ao_keyid, -1);
+ if (ao_key)
+ newtp->tcp_header_len += tcp_ao_len_aligned(ao_key);
+ }
+ #endif
if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
newtp->rx_opt.mss_clamp = req->mss;
- tcp_ecn_openreq_child(newtp, req);
+ tcp_ecn_openreq_child(newsk, req, skb);
newtp->fastopen_req = NULL;
- newtp->fastopen_rsk = NULL;
- newtp->syn_data_acked = 0;
- newtp->rack.mstamp = 0;
- newtp->rack.advanced = 0;
- newtp->rack.reo_wnd_steps = 1;
- newtp->rack.last_delivered = 0;
- newtp->rack.reo_wnd_persist = 0;
- newtp->rack.dsack_seen = 0;
+ RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
+
+ newtp->bpf_chg_cc_inprogress = 0;
+ tcp_bpf_clone(sk, newsk);
__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
+ xa_init_flags(&newsk->sk_user_frags, XA_FLAGS_ALLOC1);
+
return newsk;
}
EXPORT_SYMBOL(tcp_create_openreq_child);
@@ -579,32 +677,44 @@ EXPORT_SYMBOL(tcp_create_openreq_child);
* validation and inside tcp_v4_reqsk_send_ack(). Can we do better?
*
* We don't need to initialize tmp_opt.sack_ok as we don't use the results
+ *
+ * Note: If @fastopen is true, this can be called from process context.
+ * Otherwise, this is from BH context.
*/
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
- bool fastopen, bool *req_stolen)
+ bool fastopen, bool *req_stolen,
+ enum skb_drop_reason *drop_reason)
{
struct tcp_options_received tmp_opt;
struct sock *child;
const struct tcphdr *th = tcp_hdr(skb);
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
+ bool tsecr_reject = false;
bool paws_reject = false;
bool own_req;
tmp_opt.saw_tstamp = 0;
+ tmp_opt.accecn = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
- if (tmp_opt.rcv_tsecr)
+ if (tmp_opt.rcv_tsecr) {
+ if (inet_rsk(req)->tstamp_ok && !fastopen)
+ tsecr_reject = !between(tmp_opt.rcv_tsecr,
+ tcp_rsk(req)->snt_tsval_first,
+ READ_ONCE(tcp_rsk(req)->snt_tsval_last));
tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
+ }
/* We do not store true stamp, but it is not required,
* it can be estimated (approximately)
* from another data.
*/
- tmp_opt.ts_recent_stamp = ktime_get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
+ tmp_opt.ts_recent_stamp = ktime_get_seconds() -
+ tcp_reqsk_timeout(req) / HZ;
paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
}
}
@@ -640,11 +750,10 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
LINUX_MIB_TCPACKSKIPPEDSYNRECV,
&tcp_rsk(req)->last_oow_ack_time) &&
- !inet_rtx_syn_ack(sk, req)) {
+ !tcp_rtx_synack(sk, req)) {
unsigned long expires = jiffies;
- expires += min(TCP_TIMEOUT_INIT << req->num_timeout,
- TCP_RTO_MAX);
+ expires += tcp_reqsk_timeout(req);
if (!fastopen)
mod_timer_pending(&req->rsk_timer, expires);
else
@@ -715,31 +824,34 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
tcp_rsk(req)->snt_isn + 1))
return sk;
- /* Also, it would be not so bad idea to check rcv_tsecr, which
- * is essentially ACK extension and too early or too late values
- * should cause reset in unsynchronized states.
- */
-
/* RFC793: "first check sequence number". */
- if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
- tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
+ if (paws_reject || tsecr_reject ||
+ !tcp_in_window(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(skb)->end_seq,
+ tcp_rsk(req)->rcv_nxt,
+ tcp_rsk(req)->rcv_nxt +
+ tcp_synack_window(req))) {
/* Out of window: send ACK and drop. */
if (!(flg & TCP_FLAG_RST) &&
!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDSYNRECV,
&tcp_rsk(req)->last_oow_ack_time))
req->rsk_ops->send_ack(sk, skb, req);
- if (paws_reject)
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ if (paws_reject) {
+ SKB_DR_SET(*drop_reason, TCP_RFC7323_PAWS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ } else if (tsecr_reject) {
+ SKB_DR_SET(*drop_reason, TCP_RFC7323_TSECR);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TSECRREJECTED);
+ } else {
+ SKB_DR_SET(*drop_reason, TCP_OVERWINDOW);
+ }
return NULL;
}
/* In sequence, PAWS is OK. */
- if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
- req->ts_recent = tmp_opt.rcv_tsval;
-
if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
/* Truncate SYN, it is out of window starting
at tcp_rsk(req)->rcv_isn + 1. */
@@ -750,7 +862,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* "fourth, check the SYN bit"
*/
if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
goto embryonic_reset;
}
@@ -763,6 +875,18 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!(flg & TCP_FLAG_ACK))
return NULL;
+ if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn &&
+ tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn);
+
+ tcp_rsk(req)->saw_accecn_opt = saw_opt;
+ if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) {
+ u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV;
+
+ tcp_rsk(req)->accecn_fail_mode |= fail_mode;
+ }
+ }
+
/* For Fast Open no more processing is needed (sk is the
* child socket).
*/
@@ -770,7 +894,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
return sk;
/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
- if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
+ if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) &&
TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
inet_rsk(req)->acked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
@@ -788,13 +912,27 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!child)
goto listen_overflow;
+ if (own_req && tmp_opt.saw_tstamp &&
+ !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
+ tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval;
+
+ if (own_req && rsk_drop_req(req)) {
+ reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req);
+ inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req);
+ return child;
+ }
+
sock_rps_save_rxhash(child, skb);
tcp_synack_rtt_meas(child, req);
*req_stolen = !own_req;
return inet_csk_complete_hashdance(sk, child, req, own_req);
listen_overflow:
- if (!sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow) {
+ SKB_DR_SET(*drop_reason, TCP_LISTEN_OVERFLOW);
+ if (sk != req->rsk_listener)
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE);
+
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) {
inet_rsk(req)->acked = 1;
return NULL;
}
@@ -806,18 +944,21 @@ embryonic_reset:
* avoid becoming vulnerable to outside attack aiming at
* resetting legit local connections.
*/
- req->rsk_ops->send_reset(sk, skb);
+ req->rsk_ops->send_reset(sk, skb, SK_RST_REASON_INVALID_SYN);
} else if (fastopen) { /* received a valid RST pkt */
reqsk_fastopen_remove(sk, req, true);
- tcp_reset(sk);
+ tcp_reset(sk, skb);
}
if (!fastopen) {
- inet_csk_reqsk_queue_drop(sk, req);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+ bool unlinked = inet_csk_reqsk_queue_drop(sk, req);
+
+ if (unlinked)
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
+ *req_stolen = !unlinked;
}
return NULL;
}
-EXPORT_SYMBOL(tcp_check_req);
+EXPORT_IPV6_MOD(tcp_check_req);
/*
* Queue segment on the new socket if the new socket is active,
@@ -831,18 +972,19 @@ EXPORT_SYMBOL(tcp_check_req);
* be created.
*/
-int tcp_child_process(struct sock *parent, struct sock *child,
- struct sk_buff *skb)
+enum skb_drop_reason tcp_child_process(struct sock *parent, struct sock *child,
+ struct sk_buff *skb)
+ __releases(&((child)->sk_lock.slock))
{
- int ret = 0;
+ enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
int state = child->sk_state;
- /* record NAPI ID of child */
- sk_mark_napi_id(child, skb);
+ /* record sk_napi_id and sk_rx_queue_mapping of child. */
+ sk_mark_napi_id_set(child, skb);
tcp_segs_in(tcp_sk(child), skb);
if (!sock_owned_by_user(child)) {
- ret = tcp_rcv_state_process(child, skb);
+ reason = tcp_rcv_state_process(child, skb);
/* Wakeup parent, send SIGIO */
if (state == TCP_SYN_RECV && child->sk_state != state)
parent->sk_data_ready(parent);
@@ -856,6 +998,6 @@ int tcp_child_process(struct sock *parent, struct sock *child,
bh_unlock_sock(child);
sock_put(child);
- return ret;
+ return reason;
}
-EXPORT_SYMBOL(tcp_child_process);
+EXPORT_IPV6_MOD(tcp_child_process);
diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c
index 764298e52577..a60662f4bdf9 100644
--- a/net/ipv4/tcp_nv.c
+++ b/net/ipv4/tcp_nv.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* TCP NV: TCP with Congestion Avoidance
*
@@ -24,7 +25,6 @@
* 1) Add mechanism to deal with reverse congestion.
*/
-#include <linux/mm.h>
#include <linux/module.h>
#include <linux/math64.h>
#include <net/tcp.h>
@@ -197,10 +197,10 @@ static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked)
}
if (ca->cwnd_growth_factor < 0) {
- cnt = tp->snd_cwnd << -ca->cwnd_growth_factor;
+ cnt = tcp_snd_cwnd(tp) << -ca->cwnd_growth_factor;
tcp_cong_avoid_ai(tp, cnt, acked);
} else {
- cnt = max(4U, tp->snd_cwnd >> ca->cwnd_growth_factor);
+ cnt = max(4U, tcp_snd_cwnd(tp) >> ca->cwnd_growth_factor);
tcp_cong_avoid_ai(tp, cnt, acked);
}
}
@@ -209,7 +209,7 @@ static u32 tcpnv_recalc_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U);
+ return max((tcp_snd_cwnd(tp) * nv_loss_dec_factor) >> 10, 2U);
}
static void tcpnv_state(struct sock *sk, u8 new_state)
@@ -257,7 +257,7 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
return;
/* Stop cwnd growth if we were in catch up mode */
- if (ca->nv_catchup && tp->snd_cwnd >= nv_min_cwnd) {
+ if (ca->nv_catchup && tcp_snd_cwnd(tp) >= nv_min_cwnd) {
ca->nv_catchup = 0;
ca->nv_allow_cwnd_growth = 0;
}
@@ -371,7 +371,7 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
* if cwnd < max_win, grow cwnd
* else leave the same
*/
- if (tp->snd_cwnd > max_win) {
+ if (tcp_snd_cwnd(tp) > max_win) {
/* there is congestion, check that it is ok
* to make a CA decision
* 1. We should have at least nv_dec_eval_min_calls
@@ -398,20 +398,20 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
ca->nv_allow_cwnd_growth = 0;
tp->snd_ssthresh =
(nv_ssthresh_factor * max_win) >> 3;
- if (tp->snd_cwnd - max_win > 2) {
+ if (tcp_snd_cwnd(tp) - max_win > 2) {
/* gap > 2, we do exponential cwnd decrease */
int dec;
- dec = max(2U, ((tp->snd_cwnd - max_win) *
+ dec = max(2U, ((tcp_snd_cwnd(tp) - max_win) *
nv_cong_dec_mult) >> 7);
- tp->snd_cwnd -= dec;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - dec);
} else if (nv_cong_dec_mult > 0) {
- tp->snd_cwnd = max_win;
+ tcp_snd_cwnd_set(tp, max_win);
}
if (ca->cwnd_growth_factor > 0)
ca->cwnd_growth_factor = 0;
ca->nv_no_cong_cnt = 0;
- } else if (tp->snd_cwnd <= max_win - nv_pad_buffer) {
+ } else if (tcp_snd_cwnd(tp) <= max_win - nv_pad_buffer) {
/* There is no congestion, grow cwnd if allowed*/
if (ca->nv_eval_call_cnt < nv_inc_eval_min_calls)
return;
@@ -444,8 +444,8 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample)
* (it wasn't before, if it is now is because nv
* decreased it).
*/
- if (tp->snd_cwnd < nv_min_cwnd)
- tp->snd_cwnd = nv_min_cwnd;
+ if (tcp_snd_cwnd(tp) < nv_min_cwnd)
+ tcp_snd_cwnd_set(tp, nv_min_cwnd);
}
}
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 870b0a335061..fdda18b1abda 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -1,25 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPV4 GSO/GRO offload support
* Linux INET implementation
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* TCPv4 GSO/GRO support
*/
+#include <linux/indirect_call_wrapper.h>
#include <linux/skbuff.h>
+#include <net/gro.h>
+#include <net/gso.h>
#include <net/tcp.h>
#include <net/protocol.h>
-static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq,
+static void tcp_gso_tstamp(struct sk_buff *skb, struct sk_buff *gso_skb,
unsigned int seq, unsigned int mss)
{
+ u32 flags = skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP;
+ u32 ts_seq = skb_shinfo(gso_skb)->tskey;
+
while (skb) {
if (before(ts_seq, seq + mss)) {
- skb_shinfo(skb)->tx_flags |= SKBTX_SW_TSTAMP;
+ skb_shinfo(skb)->tx_flags |= flags;
skb_shinfo(skb)->tskey = ts_seq;
return;
}
@@ -29,6 +31,70 @@ static void tcp_gso_tstamp(struct sk_buff *skb, unsigned int ts_seq,
}
}
+static void __tcpv4_gso_segment_csum(struct sk_buff *seg,
+ __be32 *oldip, __be32 newip,
+ __be16 *oldport, __be16 newport)
+{
+ struct tcphdr *th;
+ struct iphdr *iph;
+
+ if (*oldip == newip && *oldport == newport)
+ return;
+
+ th = tcp_hdr(seg);
+ iph = ip_hdr(seg);
+
+ inet_proto_csum_replace4(&th->check, seg, *oldip, newip, true);
+ inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false);
+ *oldport = newport;
+
+ csum_replace4(&iph->check, *oldip, newip);
+ *oldip = newip;
+}
+
+static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs)
+{
+ const struct tcphdr *th;
+ const struct iphdr *iph;
+ struct sk_buff *seg;
+ struct tcphdr *th2;
+ struct iphdr *iph2;
+
+ seg = segs;
+ th = tcp_hdr(seg);
+ iph = ip_hdr(seg);
+ th2 = tcp_hdr(seg->next);
+ iph2 = ip_hdr(seg->next);
+
+ if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) &&
+ iph->daddr == iph2->daddr && iph->saddr == iph2->saddr)
+ return segs;
+
+ while ((seg = seg->next)) {
+ th2 = tcp_hdr(seg);
+ iph2 = ip_hdr(seg);
+
+ __tcpv4_gso_segment_csum(seg,
+ &iph2->saddr, iph->saddr,
+ &th2->source, th->source);
+ __tcpv4_gso_segment_csum(seg,
+ &iph2->daddr, iph->daddr,
+ &th2->dest, th->dest);
+ }
+
+ return segs;
+}
+
+static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
+ if (IS_ERR(skb))
+ return skb;
+
+ return __tcpv4_gso_segment_list_csum(skb);
+}
+
static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
@@ -38,6 +104,15 @@ static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb,
if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
return ERR_PTR(-EINVAL);
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) {
+ struct tcphdr *th = tcp_hdr(skb);
+
+ if (skb_pagelen(skb) - th->doff * 4 == skb_shinfo(skb)->gso_size)
+ return __tcp4_gso_segment_list(skb, features);
+
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
const struct iphdr *iph = ip_hdr(skb);
struct tcphdr *th = tcp_hdr(skb);
@@ -62,22 +137,26 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
struct tcphdr *th;
unsigned int thlen;
unsigned int seq;
- __be32 delta;
unsigned int oldlen;
unsigned int mss;
struct sk_buff *gso_skb = skb;
__sum16 newcheck;
bool ooo_okay, copy_destructor;
+ bool ecn_cwr_mask;
+ __wsum delta;
th = tcp_hdr(skb);
thlen = th->doff * 4;
if (thlen < sizeof(*th))
goto out;
+ if (unlikely(skb_checksum_start(skb) != skb_transport_header(skb)))
+ goto out;
+
if (!pskb_may_pull(skb, thlen))
goto out;
- oldlen = (u16)~skb->len;
+ oldlen = ~skb->len;
__skb_pull(skb, thlen);
mss = skb_shinfo(skb)->gso_size;
@@ -112,17 +191,18 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
if (skb_is_gso(segs))
mss *= skb_shinfo(segs)->gso_segs;
- delta = htonl(oldlen + (thlen + mss));
+ delta = (__force __wsum)htonl(oldlen + thlen + mss);
skb = segs;
th = tcp_hdr(skb);
seq = ntohl(th->seq);
- if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_SW_TSTAMP))
- tcp_gso_tstamp(segs, skb_shinfo(gso_skb)->tskey, seq, mss);
+ if (unlikely(skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP))
+ tcp_gso_tstamp(segs, gso_skb, seq, mss);
+
+ newcheck = ~csum_fold(csum_add(csum_unfold(th->check), delta));
- newcheck = ~csum_fold((__force __wsum)((__force u32)th->check +
- (__force u32)delta));
+ ecn_cwr_mask = !!(skb_shinfo(gso_skb)->gso_type & SKB_GSO_TCP_ACCECN);
while (skb->next) {
th->fin = th->psh = 0;
@@ -143,7 +223,8 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
th = tcp_hdr(skb);
th->seq = htonl(seq);
- th->cwr = 0;
+
+ th->cwr &= ecn_cwr_mask;
}
/* Following permits TCP Small Queues to work well with GSO :
@@ -167,11 +248,11 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
WARN_ON_ONCE(refcount_sub_and_test(-delta, &skb->sk->sk_wmem_alloc));
}
- delta = htonl(oldlen + (skb_tail_pointer(skb) -
- skb_transport_header(skb)) +
- skb->data_len);
- th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
- (__force u32)delta));
+ delta = (__force __wsum)htonl(oldlen +
+ (skb_tail_pointer(skb) -
+ skb_transport_header(skb)) +
+ skb->data_len);
+ th->check = ~csum_fold(csum_add(csum_unfold(th->check), delta));
if (skb->ip_summed == CHECKSUM_PARTIAL)
gso_reset_checksum(skb, ~th->check);
else
@@ -180,91 +261,84 @@ out:
return segs;
}
-struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb)
+struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th)
{
- struct sk_buff *pp = NULL;
- struct sk_buff *p;
- struct tcphdr *th;
struct tcphdr *th2;
- unsigned int len;
- unsigned int thlen;
- __be32 flags;
- unsigned int mss = 1;
- unsigned int hlen;
- unsigned int off;
- int flush = 1;
- int i;
-
- off = skb_gro_offset(skb);
- hlen = off + sizeof(*th);
- th = skb_gro_header_fast(skb, off);
- if (skb_gro_header_hard(skb, hlen)) {
- th = skb_gro_header_slow(skb, hlen, off);
- if (unlikely(!th))
- goto out;
- }
-
- thlen = th->doff * 4;
- if (thlen < sizeof(*th))
- goto out;
-
- hlen = off + thlen;
- if (skb_gro_header_hard(skb, hlen)) {
- th = skb_gro_header_slow(skb, hlen, off);
- if (unlikely(!th))
- goto out;
- }
-
- skb_gro_pull(skb, thlen);
-
- len = skb_gro_len(skb);
- flags = tcp_flag_word(th);
+ struct sk_buff *p;
list_for_each_entry(p, head, list) {
if (!NAPI_GRO_CB(p)->same_flow)
continue;
th2 = tcp_hdr(p);
-
if (*(u32 *)&th->source ^ *(u32 *)&th2->source) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
- goto found;
+ return p;
}
- p = NULL;
- goto out_check_final;
-found:
- /* Include the IP ID check below from the inner most IP hdr */
- flush = NAPI_GRO_CB(p)->flush;
- flush |= (__force int)(flags & TCP_FLAG_CWR);
+ return NULL;
+}
+
+struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
+ struct tcphdr *th)
+{
+ unsigned int thlen = th->doff * 4;
+ struct sk_buff *pp = NULL;
+ struct sk_buff *p;
+ struct tcphdr *th2;
+ unsigned int len;
+ __be32 flags;
+ unsigned int mss = 1;
+ int flush = 1;
+ int i;
+
+ len = skb_gro_len(skb);
+ flags = tcp_flag_word(th);
+
+ p = tcp_gro_lookup(head, th);
+ if (!p)
+ goto out_check_final;
+
+ th2 = tcp_hdr(p);
+ flush = (__force int)(flags & TCP_FLAG_CWR);
flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
- ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH));
+ ~(TCP_FLAG_FIN | TCP_FLAG_PSH));
flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
for (i = sizeof(*th); i < thlen; i += 4)
flush |= *(u32 *)((u8 *)th + i) ^
*(u32 *)((u8 *)th2 + i);
- /* When we receive our second frame we can made a decision on if we
- * continue this flow as an atomic flow with a fixed ID or if we use
- * an incrementing ID.
- */
- if (NAPI_GRO_CB(p)->flush_id != 1 ||
- NAPI_GRO_CB(p)->count != 1 ||
- !NAPI_GRO_CB(p)->is_atomic)
- flush |= NAPI_GRO_CB(p)->flush_id;
- else
- NAPI_GRO_CB(p)->is_atomic = false;
+ flush |= gro_receive_network_flush(th, th2, p);
mss = skb_shinfo(p)->gso_size;
- flush |= (len - 1) >= mss;
+ /* If skb is a GRO packet, make sure its gso_size matches prior packet mss.
+ * If it is a single frame, do not aggregate it if its length
+ * is bigger than our mss.
+ */
+ if (unlikely(skb_is_gso(skb)))
+ flush |= (mss != skb_shinfo(skb)->gso_size);
+ else
+ flush |= (len - 1) >= mss;
+
flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
-#ifdef CONFIG_TLS_DEVICE
- flush |= p->decrypted ^ skb->decrypted;
-#endif
+ flush |= skb_cmp_decrypted(p, skb);
+
+ if (unlikely(NAPI_GRO_CB(p)->is_flist)) {
+ flush |= (__force int)(flags ^ tcp_flag_word(th2));
+ flush |= skb->ip_summed != p->ip_summed;
+ flush |= skb->csum_level != p->csum_level;
+ flush |= NAPI_GRO_CB(p)->count >= 64;
+ skb_set_network_header(skb, skb_gro_receive_network_offset(skb));
+
+ if (flush || skb_gro_receive_list(p, skb))
+ mss = 1;
+
+ goto out_check_final;
+ }
if (flush || skb_gro_receive(p, skb)) {
mss = 1;
@@ -274,7 +348,12 @@ found:
tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH);
out_check_final:
- flush = len < mss;
+ /* Force a flush if last segment is smaller than mss. */
+ if (unlikely(skb_is_gso(skb)))
+ flush = len != NAPI_GRO_CB(skb)->count * skb_shinfo(skb)->gso_size;
+ else
+ flush = len < mss;
+
flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH |
TCP_FLAG_RST | TCP_FLAG_SYN |
TCP_FLAG_FIN));
@@ -282,66 +361,118 @@ out_check_final:
if (p && (!NAPI_GRO_CB(skb)->same_flow || flush))
pp = p;
-out:
NAPI_GRO_CB(skb)->flush |= (flush != 0);
return pp;
}
-int tcp_gro_complete(struct sk_buff *skb)
+void tcp_gro_complete(struct sk_buff *skb)
{
struct tcphdr *th = tcp_hdr(skb);
+ struct skb_shared_info *shinfo;
+
+ if (skb->encapsulation)
+ skb->inner_transport_header = skb->transport_header;
skb->csum_start = (unsigned char *)th - skb->head;
skb->csum_offset = offsetof(struct tcphdr, check);
skb->ip_summed = CHECKSUM_PARTIAL;
- skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+ shinfo = skb_shinfo(skb);
+ shinfo->gso_segs = NAPI_GRO_CB(skb)->count;
if (th->cwr)
- skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
-
- return 0;
+ shinfo->gso_type |= SKB_GSO_TCP_ACCECN;
}
EXPORT_SYMBOL(tcp_gro_complete);
-static struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
+static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
+ struct tcphdr *th)
+{
+ const struct iphdr *iph;
+ struct sk_buff *p;
+ struct sock *sk;
+ struct net *net;
+ int iif, sdif;
+
+ if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST)))
+ return;
+
+ p = tcp_gro_lookup(head, th);
+ if (p) {
+ NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
+ return;
+ }
+
+ inet_get_iif_sdif(skb, &iif, &sdif);
+ iph = skb_gro_network_header(skb);
+ net = dev_net_rcu(skb->dev);
+ sk = __inet_lookup_established(net, iph->saddr, th->source,
+ iph->daddr, ntohs(th->dest),
+ iif, sdif);
+ NAPI_GRO_CB(skb)->is_flist = !sk;
+ if (sk)
+ sock_gen_put(sk);
+}
+
+INDIRECT_CALLABLE_SCOPE
+struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
+ struct tcphdr *th;
+
/* Don't bother verifying checksum if we're going to flush anyway. */
if (!NAPI_GRO_CB(skb)->flush &&
skb_gro_checksum_validate(skb, IPPROTO_TCP,
- inet_gro_compute_pseudo)) {
- NAPI_GRO_CB(skb)->flush = 1;
- return NULL;
- }
+ inet_gro_compute_pseudo))
+ goto flush;
+
+ th = tcp_gro_pull_header(skb);
+ if (!th)
+ goto flush;
+
+ tcp4_check_fraglist_gro(head, skb, th);
+
+ return tcp_gro_receive(head, skb, th);
- return tcp_gro_receive(head, skb);
+flush:
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
}
-static int tcp4_gro_complete(struct sk_buff *skb, int thoff)
+INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
{
- const struct iphdr *iph = ip_hdr(skb);
+ const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+ const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
struct tcphdr *th = tcp_hdr(skb);
+ if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4;
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ __skb_incr_checksum_unnecessary(skb);
+
+ return 0;
+ }
+
th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
iph->daddr, 0);
- skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4;
- if (NAPI_GRO_CB(skb)->is_atomic)
- skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_FIXEDID;
+ BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID << 1 != SKB_GSO_TCP_FIXEDID_INNER);
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 |
+ (NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID);
- return tcp_gro_complete(skb);
+ tcp_gro_complete(skb);
+ return 0;
}
-static const struct net_offload tcpv4_offload = {
- .callbacks = {
- .gso_segment = tcp4_gso_segment,
- .gro_receive = tcp4_gro_receive,
- .gro_complete = tcp4_gro_complete,
- },
-};
-
int __init tcpv4_offload_init(void)
{
- return inet_add_offload(&tcpv4_offload, IPPROTO_TCP);
+ net_hotdata.tcpv4_offload = (struct net_offload) {
+ .callbacks = {
+ .gso_segment = tcp4_gso_segment,
+ .gro_receive = tcp4_gro_receive,
+ .gro_complete = tcp4_gro_complete,
+ },
+ };
+ return inet_add_offload(&net_hotdata.tcpv4_offload, IPPROTO_TCP);
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 597dbd749f05..479afb714bdf 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -37,14 +38,31 @@
#define pr_fmt(fmt) "TCP: " fmt
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
+#include <net/mptcp.h>
+#include <net/smc.h>
+#include <net/proto_memory.h>
+#include <net/psp.h>
#include <linux/compiler.h>
#include <linux/gfp.h>
#include <linux/module.h>
#include <linux/static_key.h>
+#include <linux/skbuff_ref.h>
#include <trace/events/tcp.h>
+/* Refresh clocks of a TCP socket,
+ * ensuring monotonically increasing values.
+ */
+void tcp_mstamp_refresh(struct tcp_sock *tp)
+{
+ u64 val = tcp_clock_ns();
+
+ tp->tcp_clock_cache = val;
+ tp->tcp_mstamp = div_u64(val, NSEC_PER_USEC);
+}
+
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
@@ -55,17 +73,21 @@ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
unsigned int prior_packets = tp->packets_out;
- tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+ WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(skb)->end_seq);
__skb_unlink(skb, &sk->sk_write_queue);
tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
+ if (tp->highest_sack == NULL)
+ tp->highest_sack = skb;
+
tp->packets_out += tcp_skb_pcount(skb);
if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
tcp_rearm_rto(sk);
NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT,
tcp_skb_pcount(skb));
+ tcp_check_space(sk);
}
/* SND.NXT, if window was not shrunk or the amount of shrunk was less than one
@@ -126,7 +148,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk));
- u32 cwnd = tp->snd_cwnd;
+ u32 cwnd = tcp_snd_cwnd(tp);
tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
@@ -135,7 +157,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta)
while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
cwnd >>= 1;
- tp->snd_cwnd = max(cwnd, restart_cwnd);
+ tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd));
tp->snd_cwnd_stamp = tcp_jiffies32;
tp->snd_cwnd_used = 0;
}
@@ -153,15 +175,14 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
tp->lsndtime = now;
/* If it is a reply for ato after last received
- * packet, enter pingpong mode.
+ * packet, increase pingpong count.
*/
if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
- icsk->icsk_ack.pingpong = 1;
+ inet_csk_inc_pingpong_cnt(sk);
}
/* Account for an ACK we sent. */
-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
- u32 rcv_nxt)
+static inline void tcp_event_ack_sent(struct sock *sk, u32 rcv_nxt)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -175,25 +196,10 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
if (unlikely(rcv_nxt != tp->rcv_nxt))
return; /* Special ACK sent by DCTCP to reflect ECN */
- tcp_dec_quickack_mode(sk, pkts);
+ tcp_dec_quickack_mode(sk);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
-
-u32 tcp_default_init_rwnd(u32 mss)
-{
- /* Initial receive window should be twice of TCP_INIT_CWND to
- * enable proper sending of new unsent data during fast recovery
- * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
- * limit when mss is larger than 1460.
- */
- u32 init_rwnd = TCP_INIT_CWND * 2;
-
- if (mss > 1460)
- init_rwnd = max((1460 * init_rwnd) / mss, 2U);
- return init_rwnd;
-}
-
/* Determine a window scaling and initial window to offer.
* Based on the assumption that the given amount of space
* will be offered. Store the results in the tp structure.
@@ -202,16 +208,17 @@ u32 tcp_default_init_rwnd(u32 mss)
* This MUST be enforced by all callers.
*/
void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
- __u32 *rcv_wnd, __u32 *window_clamp,
+ __u32 *rcv_wnd, __u32 *__window_clamp,
int wscale_ok, __u8 *rcv_wscale,
__u32 init_rcv_wnd)
{
unsigned int space = (__space < 0 ? 0 : __space);
+ u32 window_clamp = READ_ONCE(*__window_clamp);
/* If no clamp set the clamp to the max possible scaled window */
- if (*window_clamp == 0)
- (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE);
- space = min(*window_clamp, space);
+ if (window_clamp == 0)
+ window_clamp = (U16_MAX << TCP_MAX_WSCALE);
+ space = min(window_clamp, space);
/* Quantize space offering to a multiple of mss if possible. */
if (space > mss)
@@ -225,31 +232,28 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
* which we interpret as a sign the remote TCP is not
* misinterpreting the window field as a signed quantity.
*/
- if (sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows))
(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
else
(*rcv_wnd) = space;
- (*rcv_wscale) = 0;
+ if (init_rcv_wnd)
+ *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
+
+ *rcv_wscale = 0;
if (wscale_ok) {
/* Set window scaling on max possible window */
- space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
- space = max_t(u32, space, sysctl_rmem_max);
- space = min_t(u32, space, *window_clamp);
- while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
- space >>= 1;
- (*rcv_wscale)++;
- }
+ space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
+ space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
+ space = min_t(u32, space, window_clamp);
+ *rcv_wscale = clamp_t(int, ilog2(space) - 15,
+ 0, TCP_MAX_WSCALE);
}
-
- if (!init_rcv_wnd) /* Use default unless specified otherwise */
- init_rcv_wnd = tcp_default_init_rwnd(mss);
- *rcv_wnd = min(*rcv_wnd, init_rcv_wnd * mss);
-
/* Set the clamp no higher than max representable value */
- (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp);
+ WRITE_ONCE(*__window_clamp,
+ min_t(__u32, U16_MAX << (*rcv_wscale), window_clamp));
}
-EXPORT_SYMBOL(tcp_select_initial_window);
+EXPORT_IPV6_MOD(tcp_select_initial_window);
/* Chose a new window to advertise, update state in tcp_sock for the
* socket, and return result with RFC1323 scaling applied. The return
@@ -259,11 +263,22 @@ EXPORT_SYMBOL(tcp_select_initial_window);
static u16 tcp_select_window(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
u32 old_win = tp->rcv_wnd;
- u32 cur_win = tcp_receive_window(tp);
- u32 new_win = __tcp_select_window(sk);
+ u32 cur_win, new_win;
- /* Never shrink the offered window */
+ /* Make the window 0 if we failed to queue the data because we
+ * are out of memory.
+ */
+ if (unlikely(inet_csk(sk)->icsk_ack.pending & ICSK_ACK_NOMEM)) {
+ tp->pred_flags = 0;
+ tp->rcv_wnd = 0;
+ tp->rcv_wup = tp->rcv_nxt;
+ return 0;
+ }
+
+ cur_win = tcp_receive_window(tp);
+ new_win = __tcp_select_window(sk);
if (new_win < cur_win) {
/* Danger Will Robinson!
* Don't update rcv_wup/rcv_wnd here or else
@@ -272,11 +287,14 @@ static u16 tcp_select_window(struct sock *sk)
*
* Relax Will Robinson.
*/
- if (new_win == 0)
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPWANTZEROWINDOWADV);
- new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) || !tp->rx_opt.rcv_wscale) {
+ /* Never shrink the offered window */
+ if (new_win == 0)
+ NET_INC_STATS(net, LINUX_MIB_TCPWANTZEROWINDOWADV);
+ new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale);
+ }
}
+
tp->rcv_wnd = new_win;
tp->rcv_wup = tp->rcv_nxt;
@@ -284,7 +302,7 @@ static u16 tcp_select_window(struct sock *sk)
* scaled window.
*/
if (!tp->rx_opt.rcv_wscale &&
- sock_net(sk)->ipv4.sysctl_tcp_workaround_signed_windows)
+ READ_ONCE(net->ipv4.sysctl_tcp_workaround_signed_windows))
new_win = min(new_win, MAX_TCP_WINDOW);
else
new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
@@ -296,69 +314,14 @@ static u16 tcp_select_window(struct sock *sk)
if (new_win == 0) {
tp->pred_flags = 0;
if (old_win)
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPTOZEROWINDOWADV);
+ NET_INC_STATS(net, LINUX_MIB_TCPTOZEROWINDOWADV);
} else if (old_win == 0) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFROMZEROWINDOWADV);
+ NET_INC_STATS(net, LINUX_MIB_TCPFROMZEROWINDOWADV);
}
return new_win;
}
-/* Packet ECN state for a SYN-ACK */
-static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
-
- TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
- if (!(tp->ecn_flags & TCP_ECN_OK))
- TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
- else if (tcp_ca_needs_ecn(sk) ||
- tcp_bpf_ca_needs_ecn(sk))
- INET_ECN_xmit(sk);
-}
-
-/* Packet ECN state for a SYN. */
-static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
- bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
- tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
-
- if (!use_ecn) {
- const struct dst_entry *dst = __sk_dst_get(sk);
-
- if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
- use_ecn = true;
- }
-
- tp->ecn_flags = 0;
-
- if (use_ecn) {
- TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
- tp->ecn_flags = TCP_ECN_OK;
- if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
- INET_ECN_xmit(sk);
- }
-}
-
-static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
-{
- if (sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)
- /* tp->ecn_flags are cleared at a later point in time when
- * SYN ACK is ultimatively being received.
- */
- TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
-}
-
-static void
-tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
-{
- if (inet_rsk(req)->ecn_ok)
- th->ece = 1;
-}
-
/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
* be sent.
*/
@@ -367,7 +330,15 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tp->ecn_flags & TCP_ECN_OK) {
+ if (!tcp_ecn_mode_any(tp))
+ return;
+
+ if (tcp_ecn_mode_accecn(tp)) {
+ if (!tcp_accecn_ace_fail_recv(tp))
+ INET_ECN_xmit(sk);
+ tcp_accecn_set_ace(tp, skb, th);
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN;
+ } else {
/* Not-retransmitted data segment: set ECT and inject CWR. */
if (skb->len != tcp_header_len &&
!before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
@@ -389,14 +360,15 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
/* Constructs common control bits of non-data skb. If SYN/FIN is present,
* auto increment end seqno.
*/
-static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
+static void tcp_init_nondata_skb(struct sk_buff *skb, struct sock *sk,
+ u32 seq, u16 flags)
{
skb->ip_summed = CHECKSUM_PARTIAL;
TCP_SKB_CB(skb)->tcp_flags = flags;
- TCP_SKB_CB(skb)->sacked = 0;
tcp_skb_pcount_set(skb, 1);
+ psp_enqueue_set_decrypted(sk, skb);
TCP_SKB_CB(skb)->seq = seq;
if (flags & (TCPHDR_SYN | TCPHDR_FIN))
@@ -409,12 +381,15 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
return tp->snd_una != tp->snd_up;
}
-#define OPTION_SACK_ADVERTISE (1 << 0)
-#define OPTION_TS (1 << 1)
-#define OPTION_MD5 (1 << 2)
-#define OPTION_WSCALE (1 << 3)
-#define OPTION_FAST_OPEN_COOKIE (1 << 8)
-#define OPTION_SMC (1 << 9)
+#define OPTION_SACK_ADVERTISE BIT(0)
+#define OPTION_TS BIT(1)
+#define OPTION_MD5 BIT(2)
+#define OPTION_WSCALE BIT(3)
+#define OPTION_FAST_OPEN_COOKIE BIT(8)
+#define OPTION_SMC BIT(9)
+#define OPTION_MPTCP BIT(10)
+#define OPTION_AO BIT(11)
+#define OPTION_ACCECN BIT(12)
static void smc_options_write(__be32 *ptr, u16 *options)
{
@@ -436,12 +411,210 @@ struct tcp_out_options {
u16 mss; /* 0 to disable */
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
+ u8 num_accecn_fields:7, /* number of AccECN fields needed */
+ use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */
u8 hash_size; /* bytes in hash_location */
+ u8 bpf_opt_len; /* length of BPF hdr option */
__u8 *hash_location; /* temporary pointer, overloaded */
__u32 tsval, tsecr; /* need to include OPTION_TS */
struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
+ struct mptcp_out_options mptcp;
};
+static void mptcp_options_write(struct tcphdr *th, __be32 *ptr,
+ struct tcp_sock *tp,
+ struct tcp_out_options *opts)
+{
+#if IS_ENABLED(CONFIG_MPTCP)
+ if (unlikely(OPTION_MPTCP & opts->options))
+ mptcp_write_options(th, ptr, tp, &opts->mptcp);
+#endif
+}
+
+#ifdef CONFIG_CGROUP_BPF
+static int bpf_skops_write_hdr_opt_arg0(struct sk_buff *skb,
+ enum tcp_synack_type synack_type)
+{
+ if (unlikely(!skb))
+ return BPF_WRITE_HDR_TCP_CURRENT_MSS;
+
+ if (unlikely(synack_type == TCP_SYNACK_COOKIE))
+ return BPF_WRITE_HDR_TCP_SYNACK_COOKIE;
+
+ return 0;
+}
+
+/* req, syn_skb and synack_type are used when writing synack */
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+ struct bpf_sock_ops_kern sock_ops;
+ int err;
+
+ if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG)) ||
+ !*remaining)
+ return;
+
+ /* *remaining has already been aligned to 4 bytes, so *remaining >= 4 */
+
+ /* init sock_ops */
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+
+ sock_ops.op = BPF_SOCK_OPS_HDR_OPT_LEN_CB;
+
+ if (req) {
+ /* The listen "sk" cannot be passed here because
+ * it is not locked. It would not make too much
+ * sense to do bpf_setsockopt(listen_sk) based
+ * on individual connection request also.
+ *
+ * Thus, "req" is passed here and the cgroup-bpf-progs
+ * of the listen "sk" will be run.
+ *
+ * "req" is also used here for fastopen even the "sk" here is
+ * a fullsock "child" sk. It is to keep the behavior
+ * consistent between fastopen and non-fastopen on
+ * the bpf programming side.
+ */
+ sock_ops.sk = (struct sock *)req;
+ sock_ops.syn_skb = syn_skb;
+ } else {
+ sock_owned_by_me(sk);
+
+ sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
+ sock_ops.sk = sk;
+ }
+
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
+ sock_ops.remaining_opt_len = *remaining;
+ /* tcp_current_mss() does not pass a skb */
+ if (skb)
+ bpf_skops_init_skb(&sock_ops, skb, 0);
+
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+
+ if (err || sock_ops.remaining_opt_len == *remaining)
+ return;
+
+ opts->bpf_opt_len = *remaining - sock_ops.remaining_opt_len;
+ /* round up to 4 bytes */
+ opts->bpf_opt_len = (opts->bpf_opt_len + 3) & ~3;
+
+ *remaining -= opts->bpf_opt_len;
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts)
+{
+ u8 first_opt_off, nr_written, max_opt_len = opts->bpf_opt_len;
+ struct bpf_sock_ops_kern sock_ops;
+ int err;
+
+ if (likely(!max_opt_len))
+ return;
+
+ memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
+
+ sock_ops.op = BPF_SOCK_OPS_WRITE_HDR_OPT_CB;
+
+ if (req) {
+ sock_ops.sk = (struct sock *)req;
+ sock_ops.syn_skb = syn_skb;
+ } else {
+ sock_owned_by_me(sk);
+
+ sock_ops.is_fullsock = 1;
+ sock_ops.is_locked_tcp_sock = 1;
+ sock_ops.sk = sk;
+ }
+
+ sock_ops.args[0] = bpf_skops_write_hdr_opt_arg0(skb, synack_type);
+ sock_ops.remaining_opt_len = max_opt_len;
+ first_opt_off = tcp_hdrlen(skb) - max_opt_len;
+ bpf_skops_init_skb(&sock_ops, skb, first_opt_off);
+
+ err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+
+ if (err)
+ nr_written = 0;
+ else
+ nr_written = max_opt_len - sock_ops.remaining_opt_len;
+
+ if (nr_written < max_opt_len)
+ memset(skb->data + first_opt_off + nr_written, TCPOPT_NOP,
+ max_opt_len - nr_written);
+}
+#else
+static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+}
+
+static void bpf_skops_write_hdr_opt(struct sock *sk, struct sk_buff *skb,
+ struct request_sock *req,
+ struct sk_buff *syn_skb,
+ enum tcp_synack_type synack_type,
+ struct tcp_out_options *opts)
+{
+}
+#endif
+
+static __be32 *process_tcp_ao_options(struct tcp_sock *tp,
+ const struct tcp_request_sock *tcprsk,
+ struct tcp_out_options *opts,
+ struct tcp_key *key, __be32 *ptr)
+{
+#ifdef CONFIG_TCP_AO
+ u8 maclen = tcp_ao_maclen(key->ao_key);
+
+ if (tcprsk) {
+ u8 aolen = maclen + sizeof(struct tcp_ao_hdr);
+
+ *ptr++ = htonl((TCPOPT_AO << 24) | (aolen << 16) |
+ (tcprsk->ao_keyid << 8) |
+ (tcprsk->ao_rcv_next));
+ } else {
+ struct tcp_ao_key *rnext_key;
+ struct tcp_ao_info *ao_info;
+
+ ao_info = rcu_dereference_check(tp->ao_info,
+ lockdep_sock_is_held(&tp->inet_conn.icsk_inet.sk));
+ rnext_key = READ_ONCE(ao_info->rnext_key);
+ if (WARN_ON_ONCE(!rnext_key))
+ return ptr;
+ *ptr++ = htonl((TCPOPT_AO << 24) |
+ (tcp_ao_len(key->ao_key) << 16) |
+ (key->ao_key->sndid << 8) |
+ (rnext_key->rcvid));
+ }
+ opts->hash_location = (__u8 *)ptr;
+ ptr += maclen / sizeof(*ptr);
+ if (unlikely(maclen % sizeof(*ptr))) {
+ memset(ptr, TCPOPT_NOP, sizeof(*ptr));
+ ptr++;
+ }
+#endif
+ return ptr;
+}
+
+/* Initial values for AccECN option, order is based on ECN field bits
+ * similar to received_ecn_bytes. Used for SYN/ACK AccECN option.
+ */
+static const u32 synack_ecn_bytes[3] = { 0, 0, 0 };
+
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -455,19 +628,25 @@ struct tcp_out_options {
* At least SACK_PERM as the first option is known to lead to a disaster
* (but it may well be that other scenarios fail similarly).
*/
-static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
- struct tcp_out_options *opts)
+static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
+ const struct tcp_request_sock *tcprsk,
+ struct tcp_out_options *opts,
+ struct tcp_key *key)
{
+ u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */
+ u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */
+ __be32 *ptr = (__be32 *)(th + 1);
u16 options = opts->options; /* mungable copy */
- if (unlikely(OPTION_MD5 & options)) {
+ if (tcp_key_is_md5(key)) {
*ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
(TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
/* overload cookie hash location */
opts->hash_location = (__u8 *)ptr;
ptr += 4;
+ } else if (tcp_key_is_ao(key)) {
+ ptr = process_tcp_ao_options(tp, tcprsk, opts, key, ptr);
}
-
if (unlikely(opts->mss)) {
*ptr++ = htonl((TCPOPT_MSS << 24) |
(TCPOLEN_MSS << 16) |
@@ -491,15 +670,75 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
*ptr++ = htonl(opts->tsecr);
}
+ if (OPTION_ACCECN & options) {
+ const u32 *ecn_bytes = opts->use_synack_ecn_bytes ?
+ synack_ecn_bytes :
+ tp->received_ecn_bytes;
+ const u8 ect0_idx = INET_ECN_ECT_0 - 1;
+ const u8 ect1_idx = INET_ECN_ECT_1 - 1;
+ const u8 ce_idx = INET_ECN_CE - 1;
+ u32 e0b;
+ u32 e1b;
+ u32 ceb;
+ u8 len;
+
+ e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET;
+ e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET;
+ ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET;
+ len = TCPOLEN_ACCECN_BASE +
+ opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD;
+
+ if (opts->num_accecn_fields == 2) {
+ *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
+ ((e1b >> 8) & 0xffff));
+ *ptr++ = htonl(((e1b & 0xff) << 24) |
+ (ceb & 0xffffff));
+ } else if (opts->num_accecn_fields == 1) {
+ *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
+ ((e1b >> 8) & 0xffff));
+ leftover_highbyte = e1b & 0xff;
+ leftover_lowbyte = TCPOPT_NOP;
+ } else if (opts->num_accecn_fields == 0) {
+ leftover_highbyte = TCPOPT_ACCECN1;
+ leftover_lowbyte = len;
+ } else if (opts->num_accecn_fields == 3) {
+ *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
+ ((e1b >> 8) & 0xffff));
+ *ptr++ = htonl(((e1b & 0xff) << 24) |
+ (ceb & 0xffffff));
+ *ptr++ = htonl(((e0b & 0xffffff) << 8) |
+ TCPOPT_NOP);
+ }
+ if (tp) {
+ tp->accecn_minlen = 0;
+ tp->accecn_opt_tstamp = tp->tcp_mstamp;
+ if (tp->accecn_opt_demand)
+ tp->accecn_opt_demand--;
+ }
+ }
+
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
+ *ptr++ = htonl((leftover_highbyte << 24) |
+ (leftover_lowbyte << 16) |
(TCPOPT_SACK_PERM << 8) |
TCPOLEN_SACK_PERM);
+ leftover_highbyte = TCPOPT_NOP;
+ leftover_lowbyte = TCPOPT_NOP;
}
if (unlikely(OPTION_WSCALE & options)) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
+ u8 highbyte = TCPOPT_NOP;
+
+ /* Do not split the leftover 2-byte to fit into a single
+ * NOP, i.e., replace this NOP only when 1 byte is leftover
+ * within leftover_highbyte.
+ */
+ if (unlikely(leftover_highbyte != TCPOPT_NOP &&
+ leftover_lowbyte == TCPOPT_NOP)) {
+ highbyte = leftover_highbyte;
+ leftover_highbyte = TCPOPT_NOP;
+ }
+ *ptr++ = htonl((highbyte << 24) |
(TCPOPT_WINDOW << 16) |
(TCPOLEN_WINDOW << 8) |
opts->ws);
@@ -510,11 +749,13 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
tp->duplicate_sack : tp->selective_acks;
int this_sack;
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
+ *ptr++ = htonl((leftover_highbyte << 24) |
+ (leftover_lowbyte << 16) |
(TCPOPT_SACK << 8) |
(TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
TCPOLEN_SACK_PERBLOCK)));
+ leftover_highbyte = TCPOPT_NOP;
+ leftover_lowbyte = TCPOPT_NOP;
for (this_sack = 0; this_sack < opts->num_sack_blocks;
++this_sack) {
@@ -523,6 +764,14 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
}
tp->rx_opt.dsack = 0;
+ } else if (unlikely(leftover_highbyte != TCPOPT_NOP ||
+ leftover_lowbyte != TCPOPT_NOP)) {
+ *ptr++ = htonl((leftover_highbyte << 24) |
+ (leftover_lowbyte << 16) |
+ (TCPOPT_NOP << 8) |
+ TCPOPT_NOP);
+ leftover_highbyte = TCPOPT_NOP;
+ leftover_lowbyte = TCPOPT_NOP;
}
if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
@@ -550,62 +799,159 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
}
smc_options_write(ptr, &options);
+
+ mptcp_options_write(th, ptr, tp, opts);
}
-static void smc_set_option(const struct tcp_sock *tp,
+static void smc_set_option(struct tcp_sock *tp,
struct tcp_out_options *opts,
unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
- if (static_branch_unlikely(&tcp_have_smc)) {
- if (tp->syn_smc) {
- if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
- opts->options |= OPTION_SMC;
- *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
- }
+ if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc) {
+ tp->syn_smc = !!smc_call_hsbpf(1, tp, syn_option);
+ /* re-check syn_smc */
+ if (tp->syn_smc &&
+ *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+ opts->options |= OPTION_SMC;
+ *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
}
}
#endif
}
static void smc_set_option_cond(const struct tcp_sock *tp,
- const struct inet_request_sock *ireq,
+ struct inet_request_sock *ireq,
struct tcp_out_options *opts,
unsigned int *remaining)
{
#if IS_ENABLED(CONFIG_SMC)
- if (static_branch_unlikely(&tcp_have_smc)) {
- if (tp->syn_smc && ireq->smc_ok) {
- if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
- opts->options |= OPTION_SMC;
- *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
- }
+ if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc && ireq->smc_ok) {
+ ireq->smc_ok = !!smc_call_hsbpf(1, tp, synack_option, ireq);
+ /* re-check smc_ok */
+ if (ireq->smc_ok &&
+ *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+ opts->options |= OPTION_SMC;
+ *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
}
}
#endif
}
+static void mptcp_set_option_cond(const struct request_sock *req,
+ struct tcp_out_options *opts,
+ unsigned int *remaining)
+{
+ if (rsk_is_mptcp(req)) {
+ unsigned int size;
+
+ if (mptcp_synack_options(req, &size, &opts->mptcp)) {
+ if (*remaining >= size) {
+ opts->options |= OPTION_MPTCP;
+ *remaining -= size;
+ }
+ }
+ }
+}
+
+static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts)
+{
+ /* How much room is there for combining with the alignment padding? */
+ if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) ==
+ OPTION_SACK_ADVERTISE)
+ return 2;
+ else if (opts->options & OPTION_WSCALE)
+ return 1;
+ return 0;
+}
+
+/* Calculates how long an AccECN option fits into @remaining option space.
+ *
+ * AccECN option can sometimes replace NOPs used for alignment of other
+ * TCP options (up to @max_combine_saving available).
+ *
+ * Only solutions with at least @required AccECN fields are accepted.
+ *
+ * Returns: The size of the AccECN option excluding space repurposed from
+ * the alignment of the other options.
+ */
+static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required,
+ int remaining)
+{
+ int size = TCP_ACCECN_MAXSIZE;
+ int sack_blocks_reduce = 0;
+ int max_combine_saving;
+ int rem = remaining;
+ int align_size;
+
+ if (opts->use_synack_ecn_bytes)
+ max_combine_saving = tcp_synack_options_combine_saving(opts);
+ else
+ max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0;
+ opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS;
+ while (opts->num_accecn_fields >= required) {
+ /* Pad to dword if cannot combine */
+ if ((size & 0x3) > max_combine_saving)
+ align_size = ALIGN(size, 4);
+ else
+ align_size = ALIGN_DOWN(size, 4);
+
+ if (rem >= align_size) {
+ size = align_size;
+ break;
+ } else if (opts->num_accecn_fields == required &&
+ opts->num_sack_blocks > 2 &&
+ required > 0) {
+ /* Try to fit the option by removing one SACK block */
+ opts->num_sack_blocks--;
+ sack_blocks_reduce++;
+ rem = rem + TCPOLEN_SACK_PERBLOCK;
+
+ opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS;
+ size = TCP_ACCECN_MAXSIZE;
+ continue;
+ }
+
+ opts->num_accecn_fields--;
+ size -= TCPOLEN_ACCECN_PERFIELD;
+ }
+ if (sack_blocks_reduce > 0) {
+ if (opts->num_accecn_fields >= required)
+ size -= sack_blocks_reduce * TCPOLEN_SACK_PERBLOCK;
+ else
+ opts->num_sack_blocks += sack_blocks_reduce;
+ }
+ if (opts->num_accecn_fields < required)
+ return 0;
+
+ opts->options |= OPTION_ACCECN;
+ return size;
+}
+
/* Compute TCP options for SYN packets. This is not the final
* network wire format yet.
*/
static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
struct tcp_out_options *opts,
- struct tcp_md5sig_key **md5)
+ struct tcp_key *key)
{
struct tcp_sock *tp = tcp_sk(sk);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
struct tcp_fastopen_request *fastopen = tp->fastopen_req;
+ bool timestamps;
- *md5 = NULL;
-#ifdef CONFIG_TCP_MD5SIG
- if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
- *md5 = tp->af_specific->md5_lookup(sk, sk);
- if (*md5) {
- opts->options |= OPTION_MD5;
- remaining -= TCPOLEN_MD5SIG_ALIGNED;
+ /* Better than switch (key.type) as it has static branches */
+ if (tcp_key_is_md5(key)) {
+ timestamps = false;
+ opts->options |= OPTION_MD5;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
+ } else {
+ timestamps = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps);
+ if (tcp_key_is_ao(key)) {
+ opts->options |= OPTION_AO;
+ remaining -= tcp_ao_len_aligned(key->ao_key);
}
}
-#endif
/* We always get an MSS option. The option bytes which will be seen in
* normal data packets should timestamps be used, must be in the MSS
@@ -619,18 +965,18 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
opts->mss = tcp_advertise_mss(sk);
remaining -= TCPOLEN_MSS_ALIGNED;
- if (likely(sock_net(sk)->ipv4.sysctl_tcp_timestamps && !*md5)) {
+ if (likely(timestamps)) {
opts->options |= OPTION_TS;
- opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
+ opts->tsval = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) + tp->tsoffset;
opts->tsecr = tp->rx_opt.ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
- if (likely(sock_net(sk)->ipv4.sysctl_tcp_window_scaling)) {
+ if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling))) {
opts->ws = tp->rx_opt.rcv_wscale;
opts->options |= OPTION_WSCALE;
remaining -= TCPOLEN_WSCALE_ALIGNED;
}
- if (likely(sock_net(sk)->ipv4.sysctl_tcp_sack)) {
+ if (likely(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_sack))) {
opts->options |= OPTION_SACK_ADVERTISE;
if (unlikely(!(OPTION_TS & opts->options)))
remaining -= TCPOLEN_SACKPERM_ALIGNED;
@@ -653,6 +999,33 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
smc_set_option(tp, opts, &remaining);
+ if (sk_is_mptcp(sk)) {
+ unsigned int size;
+
+ if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) {
+ if (remaining >= size) {
+ opts->options |= OPTION_MPTCP;
+ remaining -= size;
+ }
+ }
+ }
+
+ /* Simultaneous open SYN/ACK needs AccECN option but not SYN.
+ * Negotiating the use of AccECN is also attempted on the first
+ * retransmitted SYN, as mentioned in "3.1.4.1. Retransmitted SYNs"
+ * of AccECN draft.
+ */
+ if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) &&
+ tcp_ecn_mode_accecn(tp) &&
+ inet_csk(sk)->icsk_retransmits < 2 &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
+ remaining >= TCPOLEN_ACCECN_BASE)) {
+ opts->use_synack_ecn_bytes = 1;
+ remaining -= tcp_options_fit_accecn(opts, 0, remaining);
+ }
+
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -661,14 +1034,16 @@ static unsigned int tcp_synack_options(const struct sock *sk,
struct request_sock *req,
unsigned int mss, struct sk_buff *skb,
struct tcp_out_options *opts,
- const struct tcp_md5sig_key *md5,
- struct tcp_fastopen_cookie *foc)
+ const struct tcp_key *key,
+ struct tcp_fastopen_cookie *foc,
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
+ struct tcp_request_sock *treq = tcp_rsk(req);
-#ifdef CONFIG_TCP_MD5SIG
- if (md5) {
+ if (tcp_key_is_md5(key)) {
opts->options |= OPTION_MD5;
remaining -= TCPOLEN_MD5SIG_ALIGNED;
@@ -677,9 +1052,13 @@ static unsigned int tcp_synack_options(const struct sock *sk,
* rather than TS in order to fit in better with old,
* buggy kernels, but that was deemed to be unnecessary.
*/
+ if (synack_type != TCP_SYNACK_COOKIE)
+ ireq->tstamp_ok &= !ireq->sack_ok;
+ } else if (tcp_key_is_ao(key)) {
+ opts->options |= OPTION_AO;
+ remaining -= tcp_ao_len_aligned(key->ao_key);
ireq->tstamp_ok &= !ireq->sack_ok;
}
-#endif
/* We always send an MSS option. */
opts->mss = mss;
@@ -692,7 +1071,14 @@ static unsigned int tcp_synack_options(const struct sock *sk,
}
if (likely(ireq->tstamp_ok)) {
opts->options |= OPTION_TS;
- opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off;
+ opts->tsval = tcp_skb_timestamp_ts(tcp_rsk(req)->req_usec_ts, skb) +
+ tcp_rsk(req)->ts_off;
+ if (!tcp_rsk(req)->snt_tsval_first) {
+ if (!opts->tsval)
+ opts->tsval = ~0U;
+ tcp_rsk(req)->snt_tsval_first = opts->tsval;
+ }
+ WRITE_ONCE(tcp_rsk(req)->snt_tsval_last, opts->tsval);
opts->tsecr = req->ts_recent;
remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
@@ -714,8 +1100,20 @@ static unsigned int tcp_synack_options(const struct sock *sk,
}
}
+ mptcp_set_option_cond(req, opts, &remaining);
+
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
+ if (treq->accecn_ok &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
+ req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) {
+ opts->use_synack_ecn_bytes = 1;
+ remaining -= tcp_options_fit_accecn(opts, 0, remaining);
+ }
+
+ bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
+ synack_type, opts, &remaining);
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -724,7 +1122,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
*/
static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb,
struct tcp_out_options *opts,
- struct tcp_md5sig_key **md5)
+ struct tcp_key *key)
{
struct tcp_sock *tp = tcp_sk(sk);
unsigned int size = 0;
@@ -732,33 +1130,78 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
opts->options = 0;
- *md5 = NULL;
-#ifdef CONFIG_TCP_MD5SIG
- if (unlikely(rcu_access_pointer(tp->md5sig_info))) {
- *md5 = tp->af_specific->md5_lookup(sk, sk);
- if (*md5) {
- opts->options |= OPTION_MD5;
- size += TCPOLEN_MD5SIG_ALIGNED;
- }
+ /* Better than switch (key.type) as it has static branches */
+ if (tcp_key_is_md5(key)) {
+ opts->options |= OPTION_MD5;
+ size += TCPOLEN_MD5SIG_ALIGNED;
+ } else if (tcp_key_is_ao(key)) {
+ opts->options |= OPTION_AO;
+ size += tcp_ao_len_aligned(key->ao_key);
}
-#endif
if (likely(tp->rx_opt.tstamp_ok)) {
opts->options |= OPTION_TS;
- opts->tsval = skb ? tcp_skb_timestamp(skb) + tp->tsoffset : 0;
+ opts->tsval = skb ? tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb) +
+ tp->tsoffset : 0;
opts->tsecr = tp->rx_opt.ts_recent;
size += TCPOLEN_TSTAMP_ALIGNED;
}
+ /* MPTCP options have precedence over SACK for the limited TCP
+ * option space because a MPTCP connection would be forced to
+ * fall back to regular TCP if a required multipath option is
+ * missing. SACK still gets a chance to use whatever space is
+ * left.
+ */
+ if (sk_is_mptcp(sk)) {
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+ unsigned int opt_size = 0;
+
+ if (mptcp_established_options(sk, skb, &opt_size, remaining,
+ &opts->mptcp)) {
+ opts->options |= OPTION_MPTCP;
+ size += opt_size;
+ }
+ }
+
eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
if (unlikely(eff_sacks)) {
const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
- opts->num_sack_blocks =
- min_t(unsigned int, eff_sacks,
- (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
- TCPOLEN_SACK_PERBLOCK);
- size += TCPOLEN_SACK_BASE_ALIGNED +
- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+ if (likely(remaining >= TCPOLEN_SACK_BASE_ALIGNED +
+ TCPOLEN_SACK_PERBLOCK)) {
+ opts->num_sack_blocks =
+ min_t(unsigned int, eff_sacks,
+ (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
+ TCPOLEN_SACK_PERBLOCK);
+
+ size += TCPOLEN_SACK_BASE_ALIGNED +
+ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+ } else {
+ opts->num_sack_blocks = 0;
+ }
+ } else {
+ opts->num_sack_blocks = 0;
+ }
+
+ if (tcp_ecn_mode_accecn(tp)) {
+ int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option);
+
+ if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) &&
+ (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand ||
+ tcp_accecn_option_beacon_check(sk))) {
+ opts->use_synack_ecn_bytes = 0;
+ size += tcp_options_fit_accecn(opts, tp->accecn_minlen,
+ MAX_TCP_OPTION_SPACE - size);
+ }
+ }
+
+ if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
+ BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG))) {
+ unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
+
+ bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
+
+ size = MAX_TCP_OPTION_SPACE - remaining;
}
return size;
@@ -775,15 +1218,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
* needs to be reallocated in a driver.
* The invariant being skb->truesize subtracted from sk->sk_wmem_alloc
*
- * Since transmit from skb destructor is forbidden, we use a tasklet
+ * Since transmit from skb destructor is forbidden, we use a BH work item
* to process all sockets that eventually need to send more skbs.
- * We use one tasklet per cpu, with its own queue of sockets.
+ * We use one work item per cpu, with its own queue of sockets.
*/
-struct tsq_tasklet {
- struct tasklet_struct tasklet;
+struct tsq_work {
+ struct work_struct work;
struct list_head head; /* queue of tcp sockets */
};
-static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet);
+static DEFINE_PER_CPU(struct tsq_work, tsq_work);
static void tcp_tsq_write(struct sock *sk)
{
@@ -793,7 +1236,7 @@ static void tcp_tsq_write(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
if (tp->lost_out > tp->retrans_out &&
- tp->snd_cwnd > tcp_packets_in_flight(tp)) {
+ tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) {
tcp_mstamp_refresh(tp);
tcp_xmit_retransmit_queue(sk);
}
@@ -813,14 +1256,14 @@ static void tcp_tsq_handler(struct sock *sk)
bh_unlock_sock(sk);
}
/*
- * One tasklet per cpu tries to send more skbs.
- * We run in tasklet context but need to disable irqs when
+ * One work item per cpu tries to send more skbs.
+ * We run in BH context but need to disable irqs when
* transferring tsq->head because tcp_wfree() might
* interrupt us (non NAPI drivers)
*/
-static void tcp_tasklet_func(unsigned long data)
+static void tcp_tsq_workfn(struct work_struct *work)
{
- struct tsq_tasklet *tsq = (struct tsq_tasklet *)data;
+ struct tsq_work *tsq = container_of(work, struct tsq_work, work);
LIST_HEAD(list);
unsigned long flags;
struct list_head *q, *n;
@@ -847,7 +1290,8 @@ static void tcp_tasklet_func(unsigned long data)
#define TCP_DEFERRED_ALL (TCPF_TSQ_DEFERRED | \
TCPF_WRITE_TIMER_DEFERRED | \
TCPF_DELACK_TIMER_DEFERRED | \
- TCPF_MTU_REDUCED_DEFERRED)
+ TCPF_MTU_REDUCED_DEFERRED | \
+ TCPF_ACK_DEFERRED)
/**
* tcp_release_cb - tcp release_sock() callback
* @sk: socket
@@ -857,30 +1301,20 @@ static void tcp_tasklet_func(unsigned long data)
*/
void tcp_release_cb(struct sock *sk)
{
- unsigned long flags, nflags;
+ unsigned long flags = smp_load_acquire(&sk->sk_tsq_flags);
+ unsigned long nflags;
/* perform an atomic operation only if at least one flag is set */
do {
- flags = sk->sk_tsq_flags;
if (!(flags & TCP_DEFERRED_ALL))
return;
nflags = flags & ~TCP_DEFERRED_ALL;
- } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
+ } while (!try_cmpxchg(&sk->sk_tsq_flags, &flags, nflags));
if (flags & TCPF_TSQ_DEFERRED) {
tcp_tsq_write(sk);
__sock_put(sk);
}
- /* Here begins the tricky part :
- * We are called from release_sock() with :
- * 1) BH disabled
- * 2) sk_lock.slock spinlock held
- * 3) socket owned by us (sk->sk_lock.owned == 1)
- *
- * But following code is meant to be called from BH handlers,
- * so we should keep BH disabled, but early release socket ownership
- */
- sock_release_ownership(sk);
if (flags & TCPF_WRITE_TIMER_DEFERRED) {
tcp_write_timer_handler(sk);
@@ -894,20 +1328,20 @@ void tcp_release_cb(struct sock *sk)
inet_csk(sk)->icsk_af_ops->mtu_reduced(sk);
__sock_put(sk);
}
+ if ((flags & TCPF_ACK_DEFERRED) && inet_csk_ack_scheduled(sk))
+ tcp_send_ack(sk);
}
-EXPORT_SYMBOL(tcp_release_cb);
+EXPORT_IPV6_MOD(tcp_release_cb);
-void __init tcp_tasklet_init(void)
+void __init tcp_tsq_work_init(void)
{
int i;
for_each_possible_cpu(i) {
- struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i);
+ struct tsq_work *tsq = &per_cpu(tsq_work, i);
INIT_LIST_HEAD(&tsq->head);
- tasklet_init(&tsq->tasklet,
- tcp_tasklet_func,
- (unsigned long)tsq);
+ INIT_WORK(&tsq->work, tcp_tsq_workfn);
}
}
@@ -921,9 +1355,11 @@ void tcp_wfree(struct sk_buff *skb)
struct sock *sk = skb->sk;
struct tcp_sock *tp = tcp_sk(sk);
unsigned long flags, nval, oval;
+ struct tsq_work *tsq;
+ bool empty;
/* Keep one reference on sk_wmem_alloc.
- * Will be released by sk_free() from here or tcp_tasklet_func()
+ * Will be released by sk_free() from here or tcp_tsq_workfn()
*/
WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
@@ -937,28 +1373,23 @@ void tcp_wfree(struct sk_buff *skb)
if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
goto out;
- for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
- struct tsq_tasklet *tsq;
- bool empty;
-
+ oval = smp_load_acquire(&sk->sk_tsq_flags);
+ do {
if (!(oval & TSQF_THROTTLED) || (oval & TSQF_QUEUED))
goto out;
nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED;
- nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
- if (nval != oval)
- continue;
+ } while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval));
- /* queue this socket to tasklet queue */
- local_irq_save(flags);
- tsq = this_cpu_ptr(&tsq_tasklet);
- empty = list_empty(&tsq->head);
- list_add(&tp->tsq_node, &tsq->head);
- if (empty)
- tasklet_schedule(&tsq->tasklet);
- local_irq_restore(flags);
- return;
- }
+ /* queue this socket to BH workqueue */
+ local_irq_save(flags);
+ tsq = this_cpu_ptr(&tsq_work);
+ empty = list_empty(&tsq->head);
+ list_add(&tp->tsq_node, &tsq->head);
+ if (empty)
+ queue_work(system_bh_wq, &tsq->work);
+ local_irq_restore(flags);
+ return;
out:
sk_free(sk);
}
@@ -977,31 +1408,34 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
+ u64 prior_wstamp)
{
- u64 len_ns;
- u32 rate;
+ struct tcp_sock *tp = tcp_sk(sk);
- if (!tcp_needs_internal_pacing(sk))
- return;
- rate = sk->sk_pacing_rate;
- if (!rate || rate == ~0U)
- return;
+ if (sk->sk_pacing_status != SK_PACING_NONE) {
+ unsigned long rate = READ_ONCE(sk->sk_pacing_rate);
- len_ns = (u64)skb->len * NSEC_PER_SEC;
- do_div(len_ns, rate);
- hrtimer_start(&tcp_sk(sk)->pacing_timer,
- ktime_add_ns(ktime_get(), len_ns),
- HRTIMER_MODE_ABS_PINNED_SOFT);
- sock_hold(sk);
-}
+ /* Original sch_fq does not pace first 10 MSS
+ * Note that tp->data_segs_out overflows after 2^32 packets,
+ * this is a minor annoyance.
+ */
+ if (rate != ~0UL && rate && tp->data_segs_out >= 10) {
+ u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate);
+ u64 credit = tp->tcp_wstamp_ns - prior_wstamp;
-static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
-{
- skb->skb_mstamp = tp->tcp_mstamp;
+ /* take into account OS jitter */
+ len_ns -= min_t(u64, len_ns / 2, credit);
+ tp->tcp_wstamp_ns += len_ns;
+ }
+ }
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
}
+INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
+INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
+INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));
+
/* This routine actually transmits TCP packets queued in by
* tcp_do_sendmsg(). This is used by both the initial
* transmission and possible later retransmissions.
@@ -1023,16 +1457,17 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
struct tcp_out_options opts;
unsigned int tcp_options_size, tcp_header_size;
struct sk_buff *oskb = NULL;
- struct tcp_md5sig_key *md5;
+ struct tcp_key key;
struct tcphdr *th;
+ u64 prior_wstamp;
int err;
BUG_ON(!skb || !tcp_skb_pcount(skb));
tp = tcp_sk(sk);
-
+ prior_wstamp = tp->tcp_wstamp_ns;
+ tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
+ skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC);
if (clone_it) {
- TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
- - tp->snd_una;
oskb = skb;
tcp_skb_tsorted_save(oskb) {
@@ -1044,28 +1479,49 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
if (unlikely(!skb))
return -ENOBUFS;
+ /* retransmit skbs might have a non zero value in skb->dev
+ * because skb->dev is aliased with skb->rbnode.rb_left
+ */
+ skb->dev = NULL;
}
- skb->skb_mstamp = tp->tcp_mstamp;
inet = inet_sk(sk);
tcb = TCP_SKB_CB(skb);
memset(&opts, 0, sizeof(opts));
- if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
- tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
- else
- tcp_options_size = tcp_established_options(sk, skb, &opts,
- &md5);
+ tcp_get_current_key(sk, &key);
+ if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) {
+ tcp_options_size = tcp_syn_options(sk, skb, &opts, &key);
+ } else {
+ tcp_options_size = tcp_established_options(sk, skb, &opts, &key);
+ /* Force a PSH flag on all (GSO) packets to expedite GRO flush
+ * at receiver : This slightly improves GRO performance.
+ * Note that we do not force the PSH flag for non GSO packets,
+ * because they might be sent under high congestion events,
+ * and in this case it is better to delay the delivery of 1-MSS
+ * packets and thus the corresponding ACK packet that would
+ * release the following packet.
+ */
+ if (tcp_skb_pcount(skb) > 1)
+ tcb->tcp_flags |= TCPHDR_PSH;
+ }
tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
- /* if no packet is in qdisc/device queue, then allow XPS to select
- * another queue. We can be called from tcp_tsq_handler()
- * which holds one reference to sk.
- *
- * TODO: Ideally, in-flight pure ACK packets should not matter here.
- * One way to get this would be to set skb->truesize = 2 on them.
+ /* We set skb->ooo_okay to one if this packet can select
+ * a different TX queue than prior packets of this flow,
+ * to avoid self inflicted reorders.
+ * The 'other' queue decision is based on current cpu number
+ * if XPS is enabled, or sk->sk_txhash otherwise.
+ * We can switch to another (and better) queue if:
+ * 1) No packet with payload is in qdisc/device queues.
+ * Delays in TX completion can defeat the test
+ * even if packets were already sent.
+ * 2) Or rtx queue is empty.
+ * This mitigates above case if ACK packets for
+ * all prior packets were already processed.
*/
- skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);
+ skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) ||
+ tcp_rtx_queue_empty(sk);
/* If we had to use memory reserve to allocate this skb,
* this might cause drops if packet is looped back :
@@ -1080,10 +1536,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
skb_orphan(skb);
skb->sk = sk;
skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
- skb_set_hash_from_sk(skb, sk);
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
- skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);
+ skb_set_dst_pending_confirm(skb, READ_ONCE(sk->sk_dst_pending_confirm));
/* Build TCP header and checksum it. */
th = (struct tcphdr *)skb->data;
@@ -1092,7 +1547,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
th->seq = htonl(tcb->seq);
th->ack_seq = htonl(rcv_nxt);
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
- tcb->tcp_flags);
+ (tcb->tcp_flags & TCPHDR_FLAGS_MASK));
th->check = 0;
th->urg_ptr = 0;
@@ -1108,7 +1563,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
}
}
- tcp_options_write((__be32 *)(th + 1), tp, &opts);
skb_shinfo(skb)->gso_type = sk->sk_gso_type;
if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
th->window = htons(tcp_select_window(sk));
@@ -1119,25 +1573,41 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
*/
th->window = htons(min(tp->rcv_wnd, 65535U));
}
+
+ tcp_options_write(th, tp, NULL, &opts, &key);
+
+ if (tcp_key_is_md5(&key)) {
#ifdef CONFIG_TCP_MD5SIG
- /* Calculate the MD5 hash, as we have all we need now */
- if (md5) {
- sk_nocaps_add(sk, NETIF_F_GSO_MASK);
+ /* Calculate the MD5 hash, as we have all we need now */
+ sk_gso_disable(sk);
tp->af_specific->calc_md5_hash(opts.hash_location,
- md5, sk, skb);
- }
+ key.md5_key, sk, skb);
#endif
+ } else if (tcp_key_is_ao(&key)) {
+ int err;
+
+ err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th,
+ opts.hash_location);
+ if (err) {
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_NOT_SPECIFIED);
+ return -ENOMEM;
+ }
+ }
+
+ /* BPF prog is the last one writing header option */
+ bpf_skops_write_hdr_opt(sk, skb, NULL, NULL, 0, &opts);
- icsk->icsk_af_ops->send_check(sk, skb);
+ INDIRECT_CALL_INET(icsk->icsk_af_ops->send_check,
+ tcp_v6_send_check, tcp_v4_send_check,
+ sk, skb);
if (likely(tcb->tcp_flags & TCPHDR_ACK))
- tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
+ tcp_event_ack_sent(sk, rcv_nxt);
if (skb->len != tcp_header_size) {
tcp_event_data_sent(tp, sk);
tp->data_segs_out += tcp_skb_pcount(skb);
tp->bytes_sent += skb->len - tcp_header_size;
- tcp_internal_pacing(sk, skb);
}
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -1145,25 +1615,29 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
tcp_skb_pcount(skb));
tp->segs_out += tcp_skb_pcount(skb);
+ skb_set_hash_from_sk(skb, sk);
/* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);
- /* Our usage of tstamp should remain private */
- skb->tstamp = 0;
+ /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */
/* Cleanup our debris for IP stacks */
memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
sizeof(struct inet6_skb_parm)));
- err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
+ tcp_add_tx_delay(skb, tp);
+
+ err = INDIRECT_CALL_INET(icsk->icsk_af_ops->queue_xmit,
+ inet6_csk_xmit, ip_queue_xmit,
+ sk, skb, &inet->cork.fl);
if (unlikely(err > 0)) {
tcp_enter_cwr(sk);
err = net_xmit_eval(err);
}
if (!err && oskb) {
- tcp_update_skb_after_send(tp, oskb);
+ tcp_update_skb_after_send(sk, oskb, prior_wstamp);
tcp_rate_skb_sent(sk, oskb);
}
return err;
@@ -1186,26 +1660,31 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
/* Advance write_seq and place onto the write_queue. */
- tp->write_seq = TCP_SKB_CB(skb)->end_seq;
+ WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
__skb_header_release(skb);
+ psp_enqueue_set_decrypted(sk, skb);
tcp_add_write_queue_tail(sk, skb);
- sk->sk_wmem_queued += skb->truesize;
+ sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
}
/* Initialize TSO segments for a packet. */
-static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
+static int tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
+ int tso_segs;
+
if (skb->len <= mss_now) {
/* Avoid the costly divide in the normal
* non-TSO case.
*/
- tcp_skb_pcount_set(skb, 1);
TCP_SKB_CB(skb)->tcp_gso_size = 0;
- } else {
- tcp_skb_pcount_set(skb, DIV_ROUND_UP(skb->len, mss_now));
- TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
+ tcp_skb_pcount_set(skb, 1);
+ return 1;
}
+ TCP_SKB_CB(skb)->tcp_gso_size = mss_now;
+ tso_segs = DIV_ROUND_UP(skb->len, mss_now);
+ tcp_skb_pcount_set(skb, tso_segs);
+ return tso_segs;
}
/* Pcount in the middle of the write queue got changed, we need to do various
@@ -1228,11 +1707,6 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
if (tcp_is_reno(tp) && decr > 0)
tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
- if (tp->lost_skb_hint &&
- before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
- (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
- tp->lost_cnt_hint -= decr;
-
tcp_verify_left_out(tp);
}
@@ -1288,28 +1762,43 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *buff;
- int nsize, old_factor;
+ int old_factor;
+ long limit;
+ u16 flags;
int nlen;
- u8 flags;
if (WARN_ON(len > skb->len))
return -EINVAL;
- nsize = skb_headlen(skb) - len;
- if (nsize < 0)
- nsize = 0;
+ DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));
+
+ /* tcp_sendmsg() can overshoot sk_wmem_queued by one full size skb.
+ * We need some allowance to not penalize applications setting small
+ * SO_SNDBUF values.
+ * Also allow first and last skb in retransmit queue to be split.
+ */
+ limit = sk->sk_sndbuf + 2 * SKB_TRUESIZE(GSO_LEGACY_MAX_SIZE);
+ if (unlikely((sk->sk_wmem_queued >> 1) > limit &&
+ tcp_queue != TCP_FRAG_IN_WRITE_QUEUE &&
+ skb != tcp_rtx_queue_head(sk) &&
+ skb != tcp_rtx_queue_tail(sk))) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPWQUEUETOOBIG);
+ return -ENOMEM;
+ }
- if (skb_unclone(skb, gfp))
+ if (skb_unclone_keeptruesize(skb, gfp))
return -ENOMEM;
/* Get a new skb... force flag on. */
- buff = sk_stream_alloc_skb(sk, nsize, gfp, true);
+ buff = tcp_stream_alloc_skb(sk, gfp, true);
if (!buff)
return -ENOMEM; /* We'll just try again later. */
+ skb_copy_decrypted(buff, skb);
+ mptcp_skb_ext_copy(buff, skb);
- sk->sk_wmem_queued += buff->truesize;
+ sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
- nlen = skb->len - len - nsize;
+ nlen = skb->len - len;
buff->truesize += nlen;
skb->truesize -= nlen;
@@ -1327,9 +1816,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
skb_split(skb, buff, len);
- buff->ip_summed = CHECKSUM_PARTIAL;
-
- buff->tstamp = skb->tstamp;
+ skb_set_delivery_time(buff, skb->tstamp, SKB_CLOCK_MONOTONIC);
tcp_fragment_tstamp(skb, buff);
old_factor = tcp_skb_pcount(skb);
@@ -1369,13 +1856,7 @@ static int __pskb_trim_head(struct sk_buff *skb, int len)
struct skb_shared_info *shinfo;
int i, k, eat;
- eat = min_t(int, len, skb_headlen(skb));
- if (eat) {
- __skb_pull(skb, eat);
- len -= eat;
- if (!len)
- return 0;
- }
+ DEBUG_NET_WARN_ON_ONCE(skb_headlen(skb));
eat = len;
k = 0;
shinfo = skb_shinfo(skb);
@@ -1388,7 +1869,7 @@ static int __pskb_trim_head(struct sk_buff *skb, int len)
} else {
shinfo->frags[k] = shinfo->frags[i];
if (eat) {
- shinfo->frags[k].page_offset += eat;
+ skb_frag_off_add(&shinfo->frags[k], eat);
skb_frag_size_sub(&shinfo->frags[k], eat);
eat = 0;
}
@@ -1407,20 +1888,17 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
u32 delta_truesize;
- if (skb_unclone(skb, GFP_ATOMIC))
+ if (skb_unclone_keeptruesize(skb, GFP_ATOMIC))
return -ENOMEM;
delta_truesize = __pskb_trim_head(skb, len);
TCP_SKB_CB(skb)->seq += len;
- skb->ip_summed = CHECKSUM_PARTIAL;
- if (delta_truesize) {
- skb->truesize -= delta_truesize;
- sk->sk_wmem_queued -= delta_truesize;
+ skb->truesize -= delta_truesize;
+ sk_wmem_queued_add(sk, -delta_truesize);
+ if (!skb_zcopy_pure(skb))
sk_mem_uncharge(sk, delta_truesize);
- sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
- }
/* Any change of skb->len requires recalculation of tso factor. */
if (tcp_skb_pcount(skb) > 1)
@@ -1441,14 +1919,6 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
*/
mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
- /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
- if (icsk->icsk_af_ops->net_frag_header_len) {
- const struct dst_entry *dst = __sk_dst_get(sk);
-
- if (dst && dst_allfrag(dst))
- mss_now -= icsk->icsk_af_ops->net_frag_header_len;
- }
-
/* Clamp it (mss_clamp does not include tcp options) */
if (mss_now > tp->rx_opt.mss_clamp)
mss_now = tp->rx_opt.mss_clamp;
@@ -1457,8 +1927,8 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
mss_now -= icsk->icsk_ext_hdr_len;
/* Then reserve room for full set of TCP options and 8 bytes of data */
- if (mss_now < 48)
- mss_now = 48;
+ mss_now = max(mss_now,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_snd_mss));
return mss_now;
}
@@ -1469,27 +1939,18 @@ int tcp_mtu_to_mss(struct sock *sk, int pmtu)
return __tcp_mtu_to_mss(sk, pmtu) -
(tcp_sk(sk)->tcp_header_len - sizeof(struct tcphdr));
}
+EXPORT_IPV6_MOD(tcp_mtu_to_mss);
/* Inverse of above */
int tcp_mss_to_mtu(struct sock *sk, int mss)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
- int mtu;
- mtu = mss +
+ return mss +
tp->tcp_header_len +
icsk->icsk_ext_hdr_len +
icsk->icsk_af_ops->net_header_len;
-
- /* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
- if (icsk->icsk_af_ops->net_frag_header_len) {
- const struct dst_entry *dst = __sk_dst_get(sk);
-
- if (dst && dst_allfrag(dst))
- mtu += icsk->icsk_af_ops->net_frag_header_len;
- }
- return mtu;
}
EXPORT_SYMBOL(tcp_mss_to_mtu);
@@ -1500,15 +1961,14 @@ void tcp_mtup_init(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct net *net = sock_net(sk);
- icsk->icsk_mtup.enabled = net->ipv4.sysctl_tcp_mtu_probing > 1;
+ icsk->icsk_mtup.enabled = READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing) > 1;
icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
icsk->icsk_af_ops->net_header_len;
- icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, READ_ONCE(net->ipv4.sysctl_tcp_base_mss));
icsk->icsk_mtup.probe_size = 0;
if (icsk->icsk_mtup.enabled)
icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
}
-EXPORT_SYMBOL(tcp_mtup_init);
/* This function synchronize snd mss to current pmtu/exthdr set.
@@ -1552,7 +2012,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
return mss_now;
}
-EXPORT_SYMBOL(tcp_sync_mss);
+EXPORT_IPV6_MOD(tcp_sync_mss);
/* Compute the current effective MSS, taking SACKs and IP options,
* and even PMTU discovery events into account.
@@ -1564,7 +2024,7 @@ unsigned int tcp_current_mss(struct sock *sk)
u32 mss_now;
unsigned int header_len;
struct tcp_out_options opts;
- struct tcp_md5sig_key *md5;
+ struct tcp_key key;
mss_now = tp->mss_cache;
@@ -1573,8 +2033,8 @@ unsigned int tcp_current_mss(struct sock *sk)
if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
mss_now = tcp_sync_mss(sk, mtu);
}
-
- header_len = tcp_established_options(sk, NULL, &opts, &md5) +
+ tcp_get_current_key(sk, &key);
+ header_len = tcp_established_options(sk, NULL, &opts, &key) +
sizeof(struct tcphdr);
/* The mss_cache is sized based on tp->tcp_header_len, which assumes
* some common options. If this is an odd packet (because we have SACK
@@ -1601,9 +2061,9 @@ static void tcp_cwnd_application_limited(struct sock *sk)
/* Limited by application or receiver window. */
u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
u32 win_used = max(tp->snd_cwnd_used, init_win);
- if (win_used < tp->snd_cwnd) {
+ if (win_used < tcp_snd_cwnd(tp)) {
tp->snd_ssthresh = tcp_current_ssthresh(sk);
- tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+ tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1);
}
tp->snd_cwnd_used = 0;
}
@@ -1615,14 +2075,20 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
struct tcp_sock *tp = tcp_sk(sk);
- /* Track the maximum number of outstanding packets in each
- * window, and remember whether we were cwnd-limited then.
+ /* Track the strongest available signal of the degree to which the cwnd
+ * is fully utilized. If cwnd-limited then remember that fact for the
+ * current window. If not cwnd-limited then track the maximum number of
+ * outstanding packets in the current window. (If cwnd-limited then we
+ * chose to not update tp->max_packets_out to avoid an extra else
+ * clause with no functional impact.)
*/
- if (!before(tp->snd_una, tp->max_packets_seq) ||
- tp->packets_out > tp->max_packets_out) {
- tp->max_packets_out = tp->packets_out;
- tp->max_packets_seq = tp->snd_nxt;
+ if (!before(tp->snd_una, tp->cwnd_usage_seq) ||
+ is_cwnd_limited ||
+ (!tp->is_cwnd_limited &&
+ tp->packets_out > tp->max_packets_out)) {
tp->is_cwnd_limited = is_cwnd_limited;
+ tp->max_packets_out = tp->packets_out;
+ tp->cwnd_usage_seq = tp->snd_nxt;
}
if (tcp_is_cwnd_limited(sk)) {
@@ -1634,7 +2100,7 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
if (tp->packets_out > tp->snd_cwnd_used)
tp->snd_cwnd_used = tp->packets_out;
- if (sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle &&
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle) &&
(s32)(tcp_jiffies32 - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto &&
!ca_ops->cong_control)
tcp_cwnd_application_limited(sk);
@@ -1691,24 +2157,34 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
}
/* Return how many segs we'd like on a TSO packet,
- * to send one TSO packet per ms
+ * depending on current pacing rate, and how close the peer is.
+ *
+ * Rationale is:
+ * - For close peers, we rather send bigger packets to reduce
+ * cpu costs, because occasional losses will be repaired fast.
+ * - For long distance/rtt flows, we would like to get ACK clocking
+ * with 1 ACK per ms.
+ *
+ * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
+ * in bigger TSO bursts. We cut the RTT-based allowance in half
+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
+ * is below 1500 bytes after 6 * ~500 usec = 3ms.
*/
static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
int min_tso_segs)
{
- u32 bytes, segs;
+ unsigned long bytes;
+ u32 r;
- bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift,
- sk->sk_gso_max_size - 1 - MAX_TCP_HEADER);
+ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift);
- /* Goal is to send at least one packet per ms,
- * not one big TSO packet every 100 ms.
- * This preserves ACK clocking and is consistent
- * with tcp_tso_should_defer() heuristic.
- */
- segs = max_t(u32, bytes / mss_now, min_tso_segs);
+ r = tcp_min_rtt(tcp_sk(sk)) >> READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log);
+ if (r < BITS_PER_TYPE(sk->sk_gso_max_size))
+ bytes += sk->sk_gso_max_size >> r;
- return segs;
+ bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size);
+
+ return max_t(u32, bytes / mss_now, min_tso_segs);
}
/* Return the number of segments we want in the skb we are transmitting.
@@ -1721,7 +2197,7 @@ static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
min_tso = ca_ops->min_tso_segs ?
ca_ops->min_tso_segs(sk) :
- sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs;
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
tso_segs = tcp_tso_autosize(sk, mss_now, min_tso);
return min_t(u32, tso_segs, sk->sk_gso_max_segs);
@@ -1762,18 +2238,12 @@ static unsigned int tcp_mss_split_point(const struct sock *sk,
/* Can at least one segment of SKB be sent right now, according to the
* congestion window rules? If so, return how many segments are allowed.
*/
-static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp,
- const struct sk_buff *skb)
+static u32 tcp_cwnd_test(const struct tcp_sock *tp)
{
u32 in_flight, cwnd, halfcwnd;
- /* Don't be strict about the congestion window for the final FIN. */
- if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) &&
- tcp_skb_pcount(skb) == 1)
- return 1;
-
in_flight = tcp_packets_in_flight(tp);
- cwnd = tp->snd_cwnd;
+ cwnd = tcp_snd_cwnd(tp);
if (in_flight >= cwnd)
return 0;
@@ -1792,10 +2262,9 @@ static int tcp_init_tso_segs(struct sk_buff *skb, unsigned int mss_now)
{
int tso_segs = tcp_skb_pcount(skb);
- if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) {
- tcp_set_skb_tso_segs(skb, mss_now);
- tso_segs = tcp_skb_pcount(skb);
- }
+ if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now))
+ return tcp_set_skb_tso_segs(skb, mss_now);
+
return tso_segs;
}
@@ -1845,23 +2314,23 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
* know that all the data is in scatter-gather pages, and that the
* packet has never been sent out before (and thus is not cloned).
*/
-static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
- struct sk_buff *skb, unsigned int len,
+static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
unsigned int mss_now, gfp_t gfp)
{
- struct sk_buff *buff;
int nlen = skb->len - len;
- u8 flags;
+ struct sk_buff *buff;
+ u16 flags;
/* All of a TSO frame must be composed of paged data. */
- if (skb->len != skb->data_len)
- return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
+ DEBUG_NET_WARN_ON_ONCE(skb->len != skb->data_len);
- buff = sk_stream_alloc_skb(sk, 0, gfp, true);
+ buff = tcp_stream_alloc_skb(sk, gfp, true);
if (unlikely(!buff))
return -ENOMEM;
+ skb_copy_decrypted(buff, skb);
+ mptcp_skb_ext_copy(buff, skb);
- sk->sk_wmem_queued += buff->truesize;
+ sk_wmem_queued_add(sk, buff->truesize);
sk_mem_charge(sk, buff->truesize);
buff->truesize += nlen;
skb->truesize -= nlen;
@@ -1876,12 +2345,8 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
TCP_SKB_CB(skb)->tcp_flags = flags & ~(TCPHDR_FIN | TCPHDR_PSH);
TCP_SKB_CB(buff)->tcp_flags = flags;
- /* This packet was never sent out yet, so no SACK bits. */
- TCP_SKB_CB(buff)->sacked = 0;
-
tcp_skb_fragment_eor(skb, buff);
- buff->ip_summed = CHECKSUM_PARTIAL;
skb_split(skb, buff, len);
tcp_fragment_tstamp(skb, buff);
@@ -1891,7 +2356,7 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
/* Link BUFF into the send queue. */
__skb_header_release(buff);
- tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
+ tcp_insert_write_queue_after(skb, buff, sk, TCP_FRAG_IN_WRITE_QUEUE);
return 0;
}
@@ -1902,35 +2367,39 @@ static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
* This algorithm is from John Heffner.
*/
static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
- bool *is_cwnd_limited, u32 max_segs)
+ bool *is_cwnd_limited,
+ bool *is_rwnd_limited,
+ u32 max_segs)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 age, send_win, cong_win, limit, in_flight;
+ u32 send_win, cong_win, limit, in_flight, threshold;
+ u64 srtt_in_ns, expected_ack, how_far_is_the_ack;
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *head;
int win_divisor;
-
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
- goto send_now;
+ s64 delta;
if (icsk->icsk_ca_state >= TCP_CA_Recovery)
goto send_now;
/* Avoid bursty behavior by allowing defer
- * only if the last write was recent.
+ * only if the last write was recent (1 ms).
+ * Note that tp->tcp_wstamp_ns can be in the future if we have
+ * packets waiting in a qdisc or device for EDT delivery.
*/
- if ((s32)(tcp_jiffies32 - tp->lsndtime) > 0)
+ delta = tp->tcp_clock_cache - tp->tcp_wstamp_ns - NSEC_PER_MSEC;
+ if (delta > 0)
goto send_now;
in_flight = tcp_packets_in_flight(tp);
BUG_ON(tcp_skb_pcount(skb) <= 1);
- BUG_ON(tp->snd_cwnd <= in_flight);
+ BUG_ON(tcp_snd_cwnd(tp) <= in_flight);
send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
/* From in_flight test above, we know that cwnd > in_flight. */
- cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache;
+ cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache;
limit = min(send_win, cong_win);
@@ -1944,7 +2413,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor);
if (win_divisor) {
- u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
+ u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache);
/* If at least some fraction of a window is available,
* just use it.
@@ -1966,15 +2435,43 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
head = tcp_rtx_queue_head(sk);
if (!head)
goto send_now;
- age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
- /* If next ACK is likely to come too late (half srtt), do not defer */
- if (age < (tp->srtt_us >> 4))
+
+ srtt_in_ns = (u64)(NSEC_PER_USEC >> 3) * tp->srtt_us;
+ /* When is the ACK expected ? */
+ expected_ack = head->tstamp + srtt_in_ns;
+ /* How far from now is the ACK expected ? */
+ how_far_is_the_ack = expected_ack - tp->tcp_clock_cache;
+
+ /* If next ACK is likely to come too late,
+ * ie in more than min(1ms, half srtt), do not defer.
+ */
+ threshold = min(srtt_in_ns >> 1, NSEC_PER_MSEC);
+
+ if ((s64)(how_far_is_the_ack - threshold) > 0)
goto send_now;
- /* Ok, it looks like it is advisable to defer. */
+ /* Ok, it looks like it is advisable to defer.
+ * Three cases are tracked :
+ * 1) We are cwnd-limited
+ * 2) We are rwnd-limited
+ * 3) We are application limited.
+ */
+ if (cong_win < send_win) {
+ if (cong_win <= skb->len) {
+ *is_cwnd_limited = true;
+ return true;
+ }
+ } else {
+ if (send_win <= skb->len) {
+ *is_rwnd_limited = true;
+ return true;
+ }
+ }
- if (cong_win < send_win && cong_win <= skb->len)
- *is_cwnd_limited = true;
+ /* If this packet won't get more data, do not wait. */
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) ||
+ TCP_SKB_CB(skb)->eor)
+ goto send_now;
return true;
@@ -1990,7 +2487,7 @@ static inline void tcp_mtu_check_reprobe(struct sock *sk)
u32 interval;
s32 delta;
- interval = net->ipv4.sysctl_tcp_probe_interval;
+ interval = READ_ONCE(net->ipv4.sysctl_tcp_probe_interval);
delta = tcp_jiffies32 - icsk->icsk_mtup.probe_timestamp;
if (unlikely(delta >= interval * HZ)) {
int mss = tcp_current_mss(sk);
@@ -2016,7 +2513,7 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
if (len <= skb->len)
break;
- if (unlikely(TCP_SKB_CB(skb)->eor))
+ if (tcp_has_tx_tstamp(skb) || !tcp_skb_can_collapse(skb, next))
return false;
len -= skb->len;
@@ -2025,6 +2522,72 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
return true;
}
+static int tcp_clone_payload(struct sock *sk, struct sk_buff *to,
+ int probe_size)
+{
+ skb_frag_t *lastfrag = NULL, *fragto = skb_shinfo(to)->frags;
+ int i, todo, len = 0, nr_frags = 0;
+ const struct sk_buff *skb;
+
+ if (!sk_wmem_schedule(sk, to->truesize + probe_size))
+ return -ENOMEM;
+
+ skb_queue_walk(&sk->sk_write_queue, skb) {
+ const skb_frag_t *fragfrom = skb_shinfo(skb)->frags;
+
+ if (skb_headlen(skb))
+ return -EINVAL;
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, fragfrom++) {
+ if (len >= probe_size)
+ goto commit;
+ todo = min_t(int, skb_frag_size(fragfrom),
+ probe_size - len);
+ len += todo;
+ if (lastfrag &&
+ skb_frag_page(fragfrom) == skb_frag_page(lastfrag) &&
+ skb_frag_off(fragfrom) == skb_frag_off(lastfrag) +
+ skb_frag_size(lastfrag)) {
+ skb_frag_size_add(lastfrag, todo);
+ continue;
+ }
+ if (unlikely(nr_frags == MAX_SKB_FRAGS))
+ return -E2BIG;
+ skb_frag_page_copy(fragto, fragfrom);
+ skb_frag_off_copy(fragto, fragfrom);
+ skb_frag_size_set(fragto, todo);
+ nr_frags++;
+ lastfrag = fragto++;
+ }
+ }
+commit:
+ WARN_ON_ONCE(len != probe_size);
+ for (i = 0; i < nr_frags; i++)
+ skb_frag_ref(to, i);
+
+ skb_shinfo(to)->nr_frags = nr_frags;
+ to->truesize += probe_size;
+ to->len += probe_size;
+ to->data_len += probe_size;
+ __skb_header_release(to);
+ return 0;
+}
+
+/* tcp_mtu_probe() and tcp_grow_skb() can both eat an skb (src) if
+ * all its payload was moved to another one (dst).
+ * Make sure to transfer tcp_flags, eor, and tstamp.
+ */
+static void tcp_eat_one_skb(struct sock *sk,
+ struct sk_buff *dst,
+ struct sk_buff *src)
+{
+ TCP_SKB_CB(dst)->tcp_flags |= TCP_SKB_CB(src)->tcp_flags;
+ TCP_SKB_CB(dst)->eor = TCP_SKB_CB(src)->eor;
+ tcp_skb_collapse_tstamp(dst, src);
+ tcp_unlink_write_queue(src, sk);
+ tcp_wmem_free_skb(sk, src);
+}
+
/* Create a new MTU probe if we are ready.
* MTU probe is regularly attempting to increase the path MTU by
* deliberately sending larger packets. This discovers routing
@@ -2054,7 +2617,7 @@ static int tcp_mtu_probe(struct sock *sk)
if (likely(!icsk->icsk_mtup.enabled ||
icsk->icsk_mtup.probe_size ||
inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
- tp->snd_cwnd < 11 ||
+ tcp_snd_cwnd(tp) < 11 ||
tp->rx_opt.num_sacks || tp->rx_opt.dsack))
return -1;
@@ -2072,7 +2635,7 @@ static int tcp_mtu_probe(struct sock *sk)
* probing process by not resetting search range to its orignal.
*/
if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
- interval < net->ipv4.sysctl_tcp_probe_threshold) {
+ interval < READ_ONCE(net->ipv4.sysctl_tcp_probe_threshold)) {
/* Check whether enough time has elaplased for
* another round of probing.
*/
@@ -2090,7 +2653,7 @@ static int tcp_mtu_probe(struct sock *sk)
return 0;
/* Do we need to wait to drain cwnd? With none in flight, don't stall */
- if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) {
+ if (tcp_packets_in_flight(tp) + 2 > tcp_snd_cwnd(tp)) {
if (!tcp_packets_in_flight(tp))
return -1;
else
@@ -2101,20 +2664,26 @@ static int tcp_mtu_probe(struct sock *sk)
return -1;
/* We're allowed to probe. Build it now. */
- nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
+ nskb = tcp_stream_alloc_skb(sk, GFP_ATOMIC, false);
if (!nskb)
return -1;
- sk->sk_wmem_queued += nskb->truesize;
+
+ /* build the payload, and be prepared to abort if this fails. */
+ if (tcp_clone_payload(sk, nskb, probe_size)) {
+ tcp_skb_tsorted_anchor_cleanup(nskb);
+ consume_skb(nskb);
+ return -1;
+ }
+ sk_wmem_queued_add(sk, nskb->truesize);
sk_mem_charge(sk, nskb->truesize);
skb = tcp_send_head(sk);
+ skb_copy_decrypted(nskb, skb);
+ mptcp_skb_ext_copy(nskb, skb);
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
TCP_SKB_CB(nskb)->tcp_flags = TCPHDR_ACK;
- TCP_SKB_CB(nskb)->sacked = 0;
- nskb->csum = 0;
- nskb->ip_summed = CHECKSUM_PARTIAL;
tcp_insert_write_queue_before(nskb, skb, sk);
tcp_highest_sack_replace(sk, skb, nskb);
@@ -2122,27 +2691,14 @@ static int tcp_mtu_probe(struct sock *sk)
len = 0;
tcp_for_write_queue_from_safe(skb, next, sk) {
copy = min_t(int, skb->len, probe_size - len);
- skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
if (skb->len <= copy) {
- /* We've eaten all the data from this skb.
- * Throw it away. */
- TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
- /* If this is the last SKB we copy and eor is set
- * we need to propagate it to the new skb.
- */
- TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
- tcp_unlink_write_queue(skb, sk);
- sk_wmem_free_skb(sk, skb);
+ tcp_eat_one_skb(sk, nskb, skb);
} else {
TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags &
~(TCPHDR_FIN|TCPHDR_PSH);
- if (!skb_shinfo(skb)->nr_frags) {
- skb_pull(skb, copy);
- } else {
- __pskb_trim_head(skb, copy);
- tcp_set_skb_tso_segs(skb, mss_now);
- }
+ __pskb_trim_head(skb, copy);
+ tcp_set_skb_tso_segs(skb, mss_now);
TCP_SKB_CB(skb)->seq += copy;
}
@@ -2159,7 +2715,7 @@ static int tcp_mtu_probe(struct sock *sk)
if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
/* Decrement cwnd here because we are sending
* effectively two packets. */
- tp->snd_cwnd--;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1);
tcp_event_new_data_sent(sk, nskb);
icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
@@ -2172,10 +2728,35 @@ static int tcp_mtu_probe(struct sock *sk)
return -1;
}
-static bool tcp_pacing_check(const struct sock *sk)
+static bool tcp_pacing_check(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tcp_needs_internal_pacing(sk))
+ return false;
+
+ if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache)
+ return false;
+
+ if (!hrtimer_is_queued(&tp->pacing_timer)) {
+ hrtimer_start(&tp->pacing_timer,
+ ns_to_ktime(tp->tcp_wstamp_ns),
+ HRTIMER_MODE_ABS_PINNED_SOFT);
+ sock_hold(sk);
+ }
+ return true;
+}
+
+static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk)
{
- return tcp_needs_internal_pacing(sk) &&
- hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
+ const struct rb_node *node = sk->tcp_rtx_queue.rb_node;
+
+ /* No skb in the rtx queue. */
+ if (!node)
+ return true;
+
+ /* Only one skb in rtx queue. */
+ return !node->rb_left && !node->rb_right;
}
/* TCP Small Queues :
@@ -2192,20 +2773,35 @@ static bool tcp_pacing_check(const struct sock *sk)
static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
unsigned int factor)
{
- unsigned int limit;
+ unsigned long limit;
- limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift);
- limit = min_t(u32, limit,
- sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes);
+ limit = max_t(unsigned long,
+ 2 * skb->truesize,
+ READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift));
+ limit = min_t(unsigned long, limit,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes));
limit <<= factor;
+ if (static_branch_unlikely(&tcp_tx_delay_enabled) &&
+ tcp_sk(sk)->tcp_tx_delay) {
+ u64 extra_bytes = (u64)READ_ONCE(sk->sk_pacing_rate) *
+ tcp_sk(sk)->tcp_tx_delay;
+
+ /* TSQ is based on skb truesize sum (sk_wmem_alloc), so we
+ * approximate our needs assuming an ~100% skb->truesize overhead.
+ * USEC_PER_SEC is approximated by 2^20.
+ * do_div(extra_bytes, USEC_PER_SEC/2) is replaced by a right shift.
+ */
+ extra_bytes >>= (20 - 1);
+ limit += extra_bytes;
+ }
if (refcount_read(&sk->sk_wmem_alloc) > limit) {
- /* Always send skb if rtx queue is empty.
+ /* Always send skb if rtx queue is empty or has one skb.
* No need to wait for TX completion to call us back,
- * after softirq/tasklet schedule.
+ * after softirq schedule.
* This helps when TX completions are delayed too much.
*/
- if (tcp_rtx_queue_empty(sk))
+ if (tcp_rtx_queue_empty_or_single_skb(sk))
return false;
set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
@@ -2262,6 +2858,35 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
tcp_chrono_set(tp, TCP_CHRONO_BUSY);
}
+/* First skb in the write queue is smaller than ideal packet size.
+ * Check if we can move payload from the second skb in the queue.
+ */
+static void tcp_grow_skb(struct sock *sk, struct sk_buff *skb, int amount)
+{
+ struct sk_buff *next_skb = skb->next;
+ unsigned int nlen;
+
+ if (tcp_skb_is_last(sk, skb))
+ return;
+
+ if (!tcp_skb_can_collapse(skb, next_skb))
+ return;
+
+ nlen = min_t(u32, amount, next_skb->len);
+ if (!nlen || !skb_shift(skb, next_skb, nlen))
+ return;
+
+ TCP_SKB_CB(skb)->end_seq += nlen;
+ TCP_SKB_CB(next_skb)->seq += nlen;
+
+ if (!next_skb->len) {
+ /* In case FIN is set, we need to update end_seq */
+ TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
+
+ tcp_eat_one_skb(sk, skb, next_skb);
+ }
+}
+
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
@@ -2282,14 +2907,18 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
unsigned int tso_segs, sent_pkts;
- int cwnd_quota;
+ u32 cwnd_quota, max_segs;
int result;
bool is_cwnd_limited = false, is_rwnd_limited = false;
- u32 max_segs;
sent_pkts = 0;
tcp_mstamp_refresh(tp);
+
+ /* AccECN option beacon depends on mstamp, it may change mss */
+ if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk))
+ mss_now = tcp_current_mss(sk);
+
if (!push_one) {
/* Do MTU probing. */
result = tcp_mtu_probe(sk);
@@ -2303,20 +2932,21 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
max_segs = tcp_tso_segs(sk, mss_now);
while ((skb = tcp_send_head(sk))) {
unsigned int limit;
-
- if (tcp_pacing_check(sk))
- break;
-
- tso_segs = tcp_init_tso_segs(skb, mss_now);
- BUG_ON(!tso_segs);
+ int missing_bytes;
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
- /* "skb_mstamp" is used as a start point for the retransmit timer */
- tcp_update_skb_after_send(tp, skb);
+ /* "skb_mstamp_ns" is used as a start point for the retransmit timer */
+ tp->tcp_wstamp_ns = tp->tcp_clock_cache;
+ skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC);
+ list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
+ tcp_init_tso_segs(skb, mss_now);
goto repair; /* Skip network transmission */
}
- cwnd_quota = tcp_cwnd_test(tp, skb);
+ if (tcp_pacing_check(sk))
+ break;
+
+ cwnd_quota = tcp_cwnd_test(tp);
if (!cwnd_quota) {
if (push_one == 2)
/* Force out a loss probe pkt. */
@@ -2324,6 +2954,12 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
else
break;
}
+ cwnd_quota = min(cwnd_quota, max_segs);
+ missing_bytes = cwnd_quota * mss_now - skb->len;
+ if (missing_bytes > 0)
+ tcp_grow_skb(sk, skb, missing_bytes);
+
+ tso_segs = tcp_set_skb_tso_segs(skb, mss_now);
if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) {
is_rwnd_limited = true;
@@ -2338,26 +2974,31 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
} else {
if (!push_one &&
tcp_tso_should_defer(sk, skb, &is_cwnd_limited,
- max_segs))
+ &is_rwnd_limited, max_segs))
break;
}
limit = mss_now;
if (tso_segs > 1 && !tcp_urg_mode(tp))
limit = tcp_mss_split_point(sk, skb, mss_now,
- min_t(unsigned int,
- cwnd_quota,
- max_segs),
+ cwnd_quota,
nonagle);
if (skb->len > limit &&
- unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
- skb, limit, mss_now, gfp)))
+ unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
break;
if (tcp_small_queue_check(sk, skb, 0))
break;
+ /* Argh, we hit an empty skb(), presumably a thread
+ * is sleeping in sendmsg()/sk_stream_wait_memory().
+ * We do not want to send a pure-ack packet and have
+ * a strange looking rtx queue with empty packet(s).
+ */
+ if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq)
+ break;
+
if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp)))
break;
@@ -2379,6 +3020,10 @@ repair:
else
tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED);
+ is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp));
+ if (likely(sent_pkts || is_cwnd_limited))
+ tcp_cwnd_validate(sk, is_cwnd_limited);
+
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += sent_pkts;
@@ -2386,8 +3031,6 @@ repair:
/* Send one loss probe per tail loss episode. */
if (push_one != 2)
tcp_schedule_loss_probe(sk, false);
- is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd);
- tcp_cwnd_validate(sk, is_cwnd_limited);
return false;
}
return !tp->packets_out && !tcp_write_queue_empty(sk);
@@ -2397,16 +3040,16 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- u32 timeout, rto_delta_us;
+ u32 timeout, timeout_us, rto_delta_us;
int early_retrans;
/* Don't do any loss probe on a Fast Open connection before 3WHS
* finishes.
*/
- if (tp->fastopen_rsk)
+ if (rcu_access_pointer(tp->fastopen_rsk))
return false;
- early_retrans = sock_net(sk)->ipv4.sysctl_tcp_early_retrans;
+ early_retrans = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_early_retrans);
/* Schedule a loss probe in 2*RTT for SACK capable connections
* not in loss recovery, that are either limited by cwnd or application.
*/
@@ -2421,11 +3064,12 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
* sample is available then probe after TCP_TIMEOUT_INIT.
*/
if (tp->srtt_us) {
- timeout = usecs_to_jiffies(tp->srtt_us >> 2);
+ timeout_us = tp->srtt_us >> 2;
if (tp->packets_out == 1)
- timeout += TCP_RTO_MIN;
+ timeout_us += tcp_rto_min_us(sk);
else
- timeout += TCP_TIMEOUT_MIN;
+ timeout_us += TCP_TIMEOUT_MIN_US;
+ timeout = usecs_to_jiffies(timeout_us);
} else {
timeout = TCP_TIMEOUT_INIT;
}
@@ -2437,8 +3081,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
if (rto_delta_us > 0)
timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
- TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, true);
return true;
}
@@ -2446,13 +3089,17 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
* a packet is still in a qdisc or driver queue.
* In this case, there is very little point doing a retransmit !
*/
-static bool skb_still_in_host_queue(const struct sock *sk,
+static bool skb_still_in_host_queue(struct sock *sk,
const struct sk_buff *skb)
{
if (unlikely(skb_fclone_busy(sk, skb))) {
- NET_INC_STATS(sock_net(sk),
- LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
- return true;
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
+ smp_mb__after_atomic();
+ if (skb_fclone_busy(sk, skb)) {
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
+ return true;
+ }
}
return false;
}
@@ -2467,6 +3114,11 @@ void tcp_send_loss_probe(struct sock *sk)
int pcount;
int mss = tcp_current_mss(sk);
+ /* At most one outstanding TLP */
+ if (tp->tlp_high_seq)
+ goto rearm_timer;
+
+ tp->tlp_retrans = 0;
skb = tcp_send_head(sk);
if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
pcount = tp->packets_out;
@@ -2476,14 +3128,11 @@ void tcp_send_loss_probe(struct sock *sk)
goto rearm_timer;
}
skb = skb_rb_last(&sk->tcp_rtx_queue);
-
- /* At most one outstanding TLP retransmission. */
- if (tp->tlp_high_seq)
- goto rearm_timer;
-
- /* Retransmit last segment. */
- if (WARN_ON(!skb))
- goto rearm_timer;
+ if (unlikely(!skb)) {
+ tcp_warn_once(sk, tp->packets_out, "invalid inflight: ");
+ smp_store_release(&inet_csk(sk)->icsk_pending, 0);
+ return;
+ }
if (skb_still_in_host_queue(sk, skb))
goto rearm_timer;
@@ -2506,13 +3155,15 @@ void tcp_send_loss_probe(struct sock *sk)
if (__tcp_retransmit_skb(sk, skb, 1))
goto rearm_timer;
+ tp->tlp_retrans = 1;
+
+probe_sent:
/* Record snd_nxt for loss detection. */
tp->tlp_high_seq = tp->snd_nxt;
-probe_sent:
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
/* Reset s.t. tcp_rearm_rto will restart timer from now */
- inet_csk(sk)->icsk_pending = 0;
+ smp_store_release(&inet_csk(sk)->icsk_pending, 0);
rearm_timer:
tcp_rearm_rto(sk);
}
@@ -2604,6 +3255,7 @@ u32 __tcp_select_window(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
/* MSS for the peer's data. Previous versions used mss_clamp
* here. I don't know if the value based on our guesses
* of peer's MSS is better for the performance. It's more correct
@@ -2613,20 +3265,32 @@ u32 __tcp_select_window(struct sock *sk)
int mss = icsk->icsk_ack.rcv_mss;
int free_space = tcp_space(sk);
int allowed_space = tcp_full_space(sk);
- int full_space = min_t(int, tp->window_clamp, allowed_space);
- int window;
+ int full_space, window;
+
+ if (sk_is_mptcp(sk))
+ mptcp_space(sk, &free_space, &allowed_space);
+
+ full_space = min_t(int, tp->window_clamp, allowed_space);
if (unlikely(mss > full_space)) {
mss = full_space;
if (mss <= 0)
return 0;
}
+
+ /* Only allow window shrink if the sysctl is enabled and we have
+ * a non-zero scaling factor in effect.
+ */
+ if (READ_ONCE(net->ipv4.sysctl_tcp_shrink_window) && tp->rx_opt.rcv_wscale)
+ goto shrink_window_allowed;
+
+ /* do not allow window to shrink */
+
if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0;
if (tcp_under_memory_pressure(sk))
- tp->rcv_ssthresh = min(tp->rcv_ssthresh,
- 4U * tp->advmss);
+ tcp_adjust_rcv_ssthresh(sk);
/* free_space might become our new window, make sure we don't
* increase it due to wscale.
@@ -2676,6 +3340,36 @@ u32 __tcp_select_window(struct sock *sk)
}
return window;
+
+shrink_window_allowed:
+ /* new window should always be an exact multiple of scaling factor */
+ free_space = round_down(free_space, 1 << tp->rx_opt.rcv_wscale);
+
+ if (free_space < (full_space >> 1)) {
+ icsk->icsk_ack.quick = 0;
+
+ if (tcp_under_memory_pressure(sk))
+ tcp_adjust_rcv_ssthresh(sk);
+
+ /* if free space is too low, return a zero window */
+ if (free_space < (allowed_space >> 4) || free_space < mss ||
+ free_space < (1 << tp->rx_opt.rcv_wscale))
+ return 0;
+ }
+
+ if (free_space > tp->rcv_ssthresh) {
+ free_space = tp->rcv_ssthresh;
+ /* new window should always be an exact multiple of scaling factor
+ *
+ * For this case, we ALIGN "up" (increase free_space) because
+ * we know free_space is not zero here, it has been reduced from
+ * the memory-based limit, and rcv_ssthresh is not a hard limit
+ * (unlike sk_rcvbuf).
+ */
+ free_space = ALIGN(free_space, (1 << tp->rx_opt.rcv_wscale));
+ }
+
+ return free_space;
}
void tcp_skb_collapse_tstamp(struct sk_buff *skb,
@@ -2704,13 +3398,9 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
- if (next_skb_size) {
- if (next_skb_size <= skb_availroom(skb))
- skb_copy_bits(next_skb, 0, skb_put(skb, next_skb_size),
- next_skb_size);
- else if (!skb_shift(skb, next_skb, next_skb_size))
- return false;
- }
+ if (next_skb_size && !tcp_skb_shift(skb, next_skb, 1, next_skb_size))
+ return false;
+
tcp_highest_sack_replace(sk, next_skb, skb);
/* Update sequence range on original skb. */
@@ -2726,7 +3416,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor;
/* changed transmit queue under us so clear hints */
- tcp_clear_retrans_hints_partial(tp);
if (next_skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = skb;
@@ -2745,6 +3434,8 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
return false;
if (skb_cloned(skb))
return false;
+ if (!skb_frags_readable(skb))
+ return false;
/* Some heuristics for collapsing over SACK'd could be invented */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
return false;
@@ -2762,7 +3453,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
struct sk_buff *skb = to, *tmp;
bool first = true;
- if (!sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse)
+ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retrans_collapse))
return;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
return;
@@ -2771,7 +3462,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
if (!tcp_can_collapse(sk, skb))
break;
- if (!tcp_skb_can_collapse_to(to))
+ if (!tcp_skb_can_collapse(to, skb))
break;
space -= skb->len;
@@ -2802,65 +3493,88 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
struct tcp_sock *tp = tcp_sk(sk);
unsigned int cur_mss;
int diff, len, err;
-
+ int avail_wnd;
/* Inconclusive MTU probe */
if (icsk->icsk_mtup.probe_size)
icsk->icsk_mtup.probe_size = 0;
- /* Do not sent more than we queued. 1/4 is reserved for possible
- * copying overhead: fragmentation, tunneling, mangling etc.
- */
- if (refcount_read(&sk->sk_wmem_alloc) >
- min_t(u32, sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2),
- sk->sk_sndbuf))
- return -EAGAIN;
-
- if (skb_still_in_host_queue(sk, skb))
- return -EBUSY;
+ if (skb_still_in_host_queue(sk, skb)) {
+ err = -EBUSY;
+ goto out;
+ }
+start:
if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
+ if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
+ TCP_SKB_CB(skb)->seq++;
+ goto start;
+ }
if (unlikely(before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))) {
WARN_ON_ONCE(1);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
+ }
+ if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) {
+ err = -ENOMEM;
+ goto out;
}
- if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
- return -ENOMEM;
}
- if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
- return -EHOSTUNREACH; /* Routing failure or similar. */
+ if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) {
+ err = -EHOSTUNREACH; /* Routing failure or similar. */
+ goto out;
+ }
cur_mss = tcp_current_mss(sk);
+ avail_wnd = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
/* If receiver has shrunk his window, and skb is out of
* new window, do not retransmit it. The exception is the
* case, when window is shrunk to zero. In this case
- * our retransmit serves as a zero window probe.
+ * our retransmit of one segment serves as a zero window probe.
*/
- if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
- TCP_SKB_CB(skb)->seq != tp->snd_una)
- return -EAGAIN;
+ if (avail_wnd <= 0) {
+ if (TCP_SKB_CB(skb)->seq != tp->snd_una) {
+ err = -EAGAIN;
+ goto out;
+ }
+ avail_wnd = cur_mss;
+ }
len = cur_mss * segs;
+ if (len > avail_wnd) {
+ len = rounddown(avail_wnd, cur_mss);
+ if (!len)
+ len = avail_wnd;
+ }
if (skb->len > len) {
if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
- cur_mss, GFP_ATOMIC))
- return -ENOMEM; /* We'll try again later. */
+ cur_mss, GFP_ATOMIC)) {
+ err = -ENOMEM; /* We'll try again later. */
+ goto out;
+ }
} else {
- if (skb_unclone(skb, GFP_ATOMIC))
- return -ENOMEM;
+ if (skb_unclone_keeptruesize(skb, GFP_ATOMIC)) {
+ err = -ENOMEM;
+ goto out;
+ }
diff = tcp_skb_pcount(skb);
tcp_set_skb_tso_segs(skb, cur_mss);
diff -= tcp_skb_pcount(skb);
if (diff)
tcp_adjust_pcount(sk, skb, diff);
- if (skb->len < cur_mss)
- tcp_retrans_try_collapse(sk, skb, cur_mss);
+ avail_wnd = min_t(int, avail_wnd, cur_mss);
+ if (skb->len < avail_wnd)
+ tcp_retrans_try_collapse(sk, skb, avail_wnd);
}
- /* RFC3168, section 6.1.1.1. ECN fallback */
+ /* RFC3168, section 6.1.1.1. ECN fallback
+ * As AccECN uses the same SYN flags (+ AE), this check covers both
+ * cases.
+ */
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
tcp_ecn_clear_syn(sk, skb);
@@ -2882,12 +3596,16 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
tcp_skb_tsorted_save(skb) {
nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC);
- err = nskb ? tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) :
- -ENOBUFS;
+ if (nskb) {
+ nskb->dev = NULL;
+ err = tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC);
+ } else {
+ err = -ENOBUFS;
+ }
} tcp_skb_tsorted_restore(skb);
if (!err) {
- tcp_update_skb_after_send(tp, skb);
+ tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns);
tcp_rate_skb_sent(sk, skb);
}
} else {
@@ -2898,12 +3616,16 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB,
TCP_SKB_CB(skb)->seq, segs, err);
- if (likely(!err)) {
- TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
- trace_tcp_retransmit_skb(sk, skb);
- } else if (err != -EBUSY) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
- }
+ if (unlikely(err) && err != -EBUSY)
+ NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs);
+
+ /* To avoid taking spuriously low RTT samples based on a timestamp
+ * for a transmit that never happened, always mark EVER_RETRANS
+ */
+ TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
+
+out:
+ trace_tcp_retransmit_skb(sk, skb, err);
return err;
}
@@ -2920,13 +3642,12 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
#endif
TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
tp->retrans_out += tcp_skb_pcount(skb);
-
- /* Save stamp of the first retransmit. */
- if (!tp->retrans_stamp)
- tp->retrans_stamp = tcp_skb_timestamp(skb);
-
}
+ /* Save stamp of the first (attempted) retransmit. */
+ if (!tp->retrans_stamp)
+ tp->retrans_stamp = tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb);
+
if (tp->undo_retrans < 0)
tp->undo_retrans = 0;
tp->undo_retrans += tcp_skb_pcount(skb);
@@ -2943,6 +3664,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb, *rtx_head, *hole = NULL;
struct tcp_sock *tp = tcp_sk(sk);
+ bool rearm_timer = false;
u32 max_segs;
int mib_idx;
@@ -2963,9 +3685,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (!hole)
tp->retransmit_skb_hint = skb;
- segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
+ segs = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp);
if (segs <= 0)
- return;
+ break;
sacked = TCP_SKB_CB(skb)->sacked;
/* In case tcp_shift_skb_data() have aggregated large skbs,
* we need to make sure not sending too bigs TSO packets
@@ -2990,10 +3712,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
continue;
if (tcp_small_queue_check(sk, skb, 1))
- return;
+ break;
if (tcp_retransmit_skb(sk, skb, segs))
- return;
+ break;
NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb));
@@ -3002,10 +3724,12 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (skb == rtx_head &&
icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto,
- TCP_RTO_MAX);
+ rearm_timer = true;
+
}
+ if (rearm_timer)
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto, true);
}
/* We allow to exceed memory limits for FIN packets to expedite
@@ -3017,16 +3741,22 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
*/
void sk_forced_mem_schedule(struct sock *sk, int size)
{
- int amt;
+ int delta, amt;
- if (size <= sk->sk_forward_alloc)
+ delta = size - sk->sk_forward_alloc;
+ if (delta <= 0)
return;
- amt = sk_mem_pages(size);
- sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
- sk_memory_allocated_add(sk, amt);
- if (mem_cgroup_sockets_enabled && sk->sk_memcg)
- mem_cgroup_charge_skmem(sk->sk_memcg, amt);
+ amt = sk_mem_pages(delta);
+ sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
+
+ if (mem_cgroup_sk_enabled(sk))
+ mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
+
+ if (sk->sk_bypass_prot_mem)
+ return;
+
+ sk_memory_allocated_add(sk, amt);
}
/* Send a FIN. The caller locks the socket for us.
@@ -3034,7 +3764,7 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
*/
void tcp_send_fin(struct sock *sk)
{
- struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
+ struct sk_buff *skb, *tskb, *tail = tcp_write_queue_tail(sk);
struct tcp_sock *tp = tcp_sk(sk);
/* Optimization, tack on the FIN if we have one skb in write queue and
@@ -3042,36 +3772,36 @@ void tcp_send_fin(struct sock *sk)
* Note: in the latter case, FIN packet will be sent after a timeout,
* as TCP stack thinks it has already been transmitted.
*/
+ tskb = tail;
if (!tskb && tcp_under_memory_pressure(sk))
tskb = skb_rb_last(&sk->tcp_rtx_queue);
if (tskb) {
-coalesce:
TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
TCP_SKB_CB(tskb)->end_seq++;
tp->write_seq++;
- if (tcp_write_queue_empty(sk)) {
+ if (!tail) {
/* This means tskb was already sent.
* Pretend we included the FIN on previous transmit.
* We need to set tp->snd_nxt to the value it would have
* if FIN had been sent. This is because retransmit path
* does not change tp->snd_nxt.
*/
- tp->snd_nxt++;
+ WRITE_ONCE(tp->snd_nxt, tp->snd_nxt + 1);
return;
}
} else {
- skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
- if (unlikely(!skb)) {
- if (tskb)
- goto coalesce;
+ skb = alloc_skb_fclone(MAX_TCP_HEADER,
+ sk_gfp_mask(sk, GFP_ATOMIC |
+ __GFP_NOWARN));
+ if (unlikely(!skb))
return;
- }
+
INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
skb_reserve(skb, MAX_TCP_HEADER);
sk_forced_mem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
- tcp_init_nondata_skb(skb, tp->write_seq,
+ tcp_init_nondata_skb(skb, sk, tp->write_seq,
TCPHDR_ACK | TCPHDR_FIN);
tcp_queue_skb(sk, skb);
}
@@ -3083,7 +3813,8 @@ coalesce:
* was unread data in the receive queue. This behavior is recommended
* by RFC 2525, section 2.17. -DaveM
*/
-void tcp_send_active_reset(struct sock *sk, gfp_t priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority,
+ enum sk_rst_reason reason)
{
struct sk_buff *skb;
@@ -3098,7 +3829,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
- tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
+ tcp_init_nondata_skb(skb, sk, tcp_acceptable_seq(sk),
TCPHDR_ACK | TCPHDR_RST);
tcp_mstamp_refresh(tcp_sk(sk));
/* Send it off. */
@@ -3108,7 +3839,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority)
/* skb of trace_tcp_send_reset() keeps the skb that caused RST,
* skb here is different to the troublesome skb, so use NULL
*/
- trace_tcp_send_reset(sk, NULL);
+ trace_tcp_send_reset(sk, NULL, reason);
}
/* Send a crossed SYN-ACK during socket establishment.
@@ -3136,10 +3867,11 @@ int tcp_send_synack(struct sock *sk)
if (!nskb)
return -ENOMEM;
INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
+ tcp_highest_sack_replace(sk, skb, nskb);
tcp_rtx_queue_unlink_and_free(skb, sk);
__skb_header_release(nskb);
tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
- sk->sk_wmem_queued += nskb->truesize;
+ sk_wmem_queued_add(sk, nskb->truesize);
sk_mem_charge(sk, nskb->truesize);
skb = nskb;
}
@@ -3151,27 +3883,30 @@ int tcp_send_synack(struct sock *sk)
}
/**
- * tcp_make_synack - Prepare a SYN-ACK.
- * sk: listener socket
- * dst: dst entry attached to the SYNACK
- * req: request_sock pointer
- *
- * Allocate one skb and build a SYNACK packet.
- * @dst is consumed : Caller should not use it again.
+ * tcp_make_synack - Allocate one skb and build a SYNACK packet.
+ * @sk: listener socket
+ * @dst: dst entry attached to the SYNACK. It is consumed and caller
+ * should not use it again.
+ * @req: request_sock pointer
+ * @foc: cookie for tcp fast open
+ * @synack_type: Type of synack to prepare
+ * @syn_skb: SYN packet just received. It could be NULL for rtx case.
*/
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
const struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_md5sig_key *md5 = NULL;
struct tcp_out_options opts;
+ struct tcp_key key = {};
struct sk_buff *skb;
int tcp_header_size;
struct tcphdr *th;
int mss;
+ u64 now;
skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
if (unlikely(!skb)) {
@@ -3183,7 +3918,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
switch (synack_type) {
case TCP_SYNACK_NORMAL:
- skb_set_owner_w(skb, req_to_sk(req));
+ skb_set_owner_edemux(skb, req_to_sk(req));
break;
case TCP_SYNACK_COOKIE:
/* Under synflood, we do not attach skb to a socket,
@@ -3203,20 +3938,60 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
memset(&opts, 0, sizeof(opts));
+ now = tcp_clock_ns();
#ifdef CONFIG_SYN_COOKIES
- if (unlikely(req->cookie_ts))
- skb->skb_mstamp = cookie_init_timestamp(req);
+ if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
+ skb_set_delivery_time(skb, cookie_init_timestamp(req, now),
+ SKB_CLOCK_MONOTONIC);
else
#endif
- skb->skb_mstamp = tcp_clock_us();
+ {
+ skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC);
+ if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
+ tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
+ }
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
rcu_read_lock();
- md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
#endif
- skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4);
- tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
- foc) + sizeof(*th);
+ if (tcp_rsk_used_ao(req)) {
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_key *ao_key = NULL;
+ u8 keyid = tcp_rsk(req)->ao_keyid;
+ u8 rnext = tcp_rsk(req)->ao_rcv_next;
+
+ ao_key = tcp_sk(sk)->af_specific->ao_lookup(sk, req_to_sk(req),
+ keyid, -1);
+ /* If there is no matching key - avoid sending anything,
+ * especially unsigned segments. It could try harder and lookup
+ * for another peer-matching key, but the peer has requested
+ * ao_keyid (RFC5925 RNextKeyID), so let's keep it simple here.
+ */
+ if (unlikely(!ao_key)) {
+ trace_tcp_ao_synack_no_key(sk, keyid, rnext);
+ rcu_read_unlock();
+ kfree_skb(skb);
+ net_warn_ratelimited("TCP-AO: the keyid %u from SYN packet is not present - not sending SYNACK\n",
+ keyid);
+ return NULL;
+ }
+ key.ao_key = ao_key;
+ key.type = TCP_KEY_AO;
+#endif
+ } else {
+#ifdef CONFIG_TCP_MD5SIG
+ key.md5_key = tcp_rsk(req)->af_specific->req_md5_lookup(sk,
+ req_to_sk(req));
+ if (key.md5_key)
+ key.type = TCP_KEY_MD5;
+#endif
+ }
+ skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), PKT_HASH_TYPE_L4);
+ /* bpf program will be interested in the tcp_flags */
+ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK;
+ tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts,
+ &key, foc, synack_type, syn_skb)
+ + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -3236,23 +4011,36 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
th->window = htons(min(req->rsk_rcv_wnd, 65535U));
- tcp_options_write((__be32 *)(th + 1), NULL, &opts);
+ tcp_options_write(th, NULL, tcp_rsk(req), &opts, &key);
th->doff = (tcp_header_size >> 2);
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
-#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
- if (md5)
+ if (tcp_key_is_md5(&key)) {
+#ifdef CONFIG_TCP_MD5SIG
tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
- md5, req_to_sk(req), skb);
+ key.md5_key, req_to_sk(req), skb);
+#endif
+ } else if (tcp_key_is_ao(&key)) {
+#ifdef CONFIG_TCP_AO
+ tcp_rsk(req)->af_specific->ao_synack_hash(opts.hash_location,
+ key.ao_key, req, skb,
+ opts.hash_location - (u8 *)th, 0);
+#endif
+ }
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
rcu_read_unlock();
#endif
- /* Do not fool tcpdump (if any), clean our debris */
- skb->tstamp = 0;
+ bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
+ synack_type, &opts);
+
+ skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC);
+ tcp_add_tx_delay(skb, tp);
+
return skb;
}
-EXPORT_SYMBOL(tcp_make_synack);
+EXPORT_IPV6_MOD(tcp_make_synack);
static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
{
@@ -3265,8 +4053,8 @@ static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst)
rcu_read_lock();
ca = tcp_ca_find_key(ca_key);
- if (likely(ca && try_module_get(ca->owner))) {
- module_put(icsk->icsk_ca_ops->owner);
+ if (likely(ca && bpf_try_module_get(ca, ca->owner))) {
+ bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner);
icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst);
icsk->icsk_ca_ops = ca;
}
@@ -3279,23 +4067,22 @@ static void tcp_connect_init(struct sock *sk)
const struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
+ u16 user_mss;
u32 rcv_wnd;
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
*/
tp->tcp_header_len = sizeof(struct tcphdr);
- if (sock_net(sk)->ipv4.sysctl_tcp_timestamps)
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_timestamps))
tp->tcp_header_len += TCPOLEN_TSTAMP_ALIGNED;
-#ifdef CONFIG_TCP_MD5SIG
- if (tp->af_specific->md5_lookup(sk, sk))
- tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
-#endif
+ tcp_ao_connect_init(sk);
/* If user gave his TCP_MAXSEG, record it to clamp */
- if (tp->rx_opt.user_mss)
- tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ if (user_mss)
+ tp->rx_opt.mss_clamp = user_mss;
tp->max_window = 0;
tcp_mtup_init(sk);
tcp_sync_mss(sk, dst_mtu(dst));
@@ -3303,7 +4090,7 @@ static void tcp_connect_init(struct sock *sk)
tcp_ca_dst_init(sk, dst);
if (!tp->window_clamp)
- tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
+ WRITE_ONCE(tp->window_clamp, dst_metric(dst, RTAX_WINDOW));
tp->advmss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
tcp_initialize_rcv_mss(sk);
@@ -3311,7 +4098,7 @@ static void tcp_connect_init(struct sock *sk)
/* limit the window selection if the user enforce a smaller rx buffer */
if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
(tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
- tp->window_clamp = tcp_full_space(sk);
+ WRITE_ONCE(tp->window_clamp, tcp_full_space(sk));
rcv_wnd = tcp_rwnd_init_bpf(sk);
if (rcv_wnd == 0)
@@ -3321,14 +4108,14 @@ static void tcp_connect_init(struct sock *sk)
tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
&tp->rcv_wnd,
&tp->window_clamp,
- sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling),
&rcv_wscale,
rcv_wnd);
tp->rx_opt.rcv_wscale = rcv_wscale;
tp->rcv_ssthresh = tp->rcv_wnd;
- sk->sk_err = 0;
+ WRITE_ONCE(sk->sk_err, 0);
sock_reset_flag(sk, SOCK_DONE);
tp->snd_wnd = 0;
tcp_init_wl(tp, 0);
@@ -3336,17 +4123,17 @@ static void tcp_connect_init(struct sock *sk)
tp->snd_una = tp->write_seq;
tp->snd_sml = tp->write_seq;
tp->snd_up = tp->write_seq;
- tp->snd_nxt = tp->write_seq;
+ WRITE_ONCE(tp->snd_nxt, tp->write_seq);
if (likely(!tp->repair))
tp->rcv_nxt = 0;
else
tp->rcv_tstamp = tcp_jiffies32;
tp->rcv_wup = tp->rcv_nxt;
- tp->copied_seq = tp->rcv_nxt;
+ WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
- inet_csk(sk)->icsk_retransmits = 0;
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
tcp_clear_retrans(tp);
}
@@ -3357,9 +4144,9 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
tcb->end_seq += skb->len;
__skb_header_release(skb);
- sk->sk_wmem_queued += skb->truesize;
+ sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
- tp->write_seq = tcb->end_seq;
+ WRITE_ONCE(tp->write_seq, tcb->end_seq);
tp->packets_out += tcp_skb_pcount(skb);
}
@@ -3372,10 +4159,12 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
*/
static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
+ struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
- int space, err = 0;
+ struct page_frag *pfrag = sk_page_frag(sk);
struct sk_buff *syn_data;
+ int space, err = 0;
tp->rx_opt.mss_clamp = tp->advmss; /* If MSS is not cached */
if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
@@ -3386,32 +4175,40 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
* private TCP options. The cost is reduced data space in SYN :(
*/
tp->rx_opt.mss_clamp = tcp_mss_clamp(tp, tp->rx_opt.mss_clamp);
+ /* Sync mss_cache after updating the mss_clamp */
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
- space = __tcp_mtu_to_mss(sk, inet_csk(sk)->icsk_pmtu_cookie) -
+ space = __tcp_mtu_to_mss(sk, icsk->icsk_pmtu_cookie) -
MAX_TCP_OPTION_SPACE;
space = min_t(size_t, space, fo->size);
- /* limit to order-0 allocations */
- space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
-
- syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation, false);
+ if (space &&
+ !skb_page_frag_refill(min_t(size_t, space, PAGE_SIZE),
+ pfrag, sk->sk_allocation))
+ goto fallback;
+ syn_data = tcp_stream_alloc_skb(sk, sk->sk_allocation, false);
if (!syn_data)
goto fallback;
- syn_data->ip_summed = CHECKSUM_PARTIAL;
memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
if (space) {
- int copied = copy_from_iter(skb_put(syn_data, space), space,
- &fo->data->msg_iter);
- if (unlikely(!copied)) {
+ space = min_t(size_t, space, pfrag->size - pfrag->offset);
+ space = tcp_wmem_schedule(sk, space);
+ }
+ if (space) {
+ space = copy_page_from_iter(pfrag->page, pfrag->offset,
+ space, &fo->data->msg_iter);
+ if (unlikely(!space)) {
tcp_skb_tsorted_anchor_cleanup(syn_data);
kfree_skb(syn_data);
goto fallback;
}
- if (copied != space) {
- skb_trim(syn_data, copied);
- space = copied;
- }
+ skb_fill_page_desc(syn_data, 0, pfrag->page,
+ pfrag->offset, space);
+ page_ref_inc(pfrag->page);
+ pfrag->offset += space;
+ skb_len_add(syn_data, space);
+ skb_zcopy_set(syn_data, fo->uarg, NULL);
}
/* No more data pending in inet_wait_for_connect() */
if (space == fo->size)
@@ -3424,7 +4221,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
- syn->skb_mstamp = syn_data->skb_mstamp;
+ skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, SKB_CLOCK_MONOTONIC);
/* Now full SYN+DATA was cloned and sent (or not),
* remove the SYN from the original skb (syn_data)
@@ -3465,6 +4262,53 @@ int tcp_connect(struct sock *sk)
tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB, 0, NULL);
+#if defined(CONFIG_TCP_MD5SIG) && defined(CONFIG_TCP_AO)
+ /* Has to be checked late, after setting daddr/saddr/ops.
+ * Return error if the peer has both a md5 and a tcp-ao key
+ * configured as this is ambiguous.
+ */
+ if (unlikely(rcu_dereference_protected(tp->md5sig_info,
+ lockdep_sock_is_held(sk)))) {
+ bool needs_ao = !!tp->af_specific->ao_lookup(sk, sk, -1, -1);
+ bool needs_md5 = !!tp->af_specific->md5_lookup(sk, sk);
+ struct tcp_ao_info *ao_info;
+
+ ao_info = rcu_dereference_check(tp->ao_info,
+ lockdep_sock_is_held(sk));
+ if (ao_info) {
+ /* This is an extra check: tcp_ao_required() in
+ * tcp_v{4,6}_parse_md5_keys() should prevent adding
+ * md5 keys on ao_required socket.
+ */
+ needs_ao |= ao_info->ao_required;
+ WARN_ON_ONCE(ao_info->ao_required && needs_md5);
+ }
+ if (needs_md5 && needs_ao)
+ return -EKEYREJECTED;
+
+ /* If we have a matching md5 key and no matching tcp-ao key
+ * then free up ao_info if allocated.
+ */
+ if (needs_md5) {
+ tcp_ao_destroy_sock(sk, false);
+ } else if (needs_ao) {
+ tcp_clear_md5_list(sk);
+ kfree(rcu_replace_pointer(tp->md5sig_info, NULL,
+ lockdep_sock_is_held(sk)));
+ }
+ }
+#endif
+#ifdef CONFIG_TCP_AO
+ if (unlikely(rcu_dereference_protected(tp->ao_info,
+ lockdep_sock_is_held(sk)))) {
+ /* Don't allow connecting if ao is configured but no
+ * matching key is found.
+ */
+ if (!tp->af_specific->ao_lookup(sk, sk, -1, -1))
+ return -EKEYREJECTED;
+ }
+#endif
+
if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
return -EHOSTUNREACH; /* Routing failure or similar. */
@@ -3475,13 +4319,16 @@ int tcp_connect(struct sock *sk)
return 0;
}
- buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
+ buff = tcp_stream_alloc_skb(sk, sk->sk_allocation, true);
if (unlikely(!buff))
return -ENOBUFS;
- tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
+ /* SYN eats a sequence byte, write_seq updated by
+ * tcp_connect_queue_skb().
+ */
+ tcp_init_nondata_skb(buff, sk, tp->write_seq, TCPHDR_SYN);
tcp_mstamp_refresh(tp);
- tp->retrans_stamp = tcp_time_stamp(tp);
+ tp->retrans_stamp = tcp_time_stamp_ts(tp);
tcp_connect_queue_skb(sk, buff);
tcp_ecn_send_syn(sk, buff);
tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
@@ -3495,22 +4342,29 @@ int tcp_connect(struct sock *sk)
/* We change tp->snd_nxt after the tcp_transmit_skb() call
* in order to make this packet get counted in tcpOutSegs.
*/
- tp->snd_nxt = tp->write_seq;
+ WRITE_ONCE(tp->snd_nxt, tp->write_seq);
tp->pushed_seq = tp->write_seq;
buff = tcp_send_head(sk);
if (unlikely(buff)) {
- tp->snd_nxt = TCP_SKB_CB(buff)->seq;
+ WRITE_ONCE(tp->snd_nxt, TCP_SKB_CB(buff)->seq);
tp->pushed_seq = TCP_SKB_CB(buff)->seq;
}
TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
/* Timer for repeating the SYN until an answer. */
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ inet_csk(sk)->icsk_rto, false);
return 0;
}
EXPORT_SYMBOL(tcp_connect);
+u32 tcp_delack_max(const struct sock *sk)
+{
+ u32 delack_from_rto_min = max(tcp_rto_min(sk), 2) - 1;
+
+ return min(READ_ONCE(inet_csk(sk)->icsk_delack_max), delack_from_rto_min);
+}
+
/* Send out a delayed ack, the caller does the policy checking
* to see if we should even be here. See tcp_input.c:tcp_ack_snd_check()
* for details.
@@ -3525,7 +4379,7 @@ void tcp_send_delayed_ack(struct sock *sk)
const struct tcp_sock *tp = tcp_sk(sk);
int max_ato = HZ / 2;
- if (icsk->icsk_ack.pingpong ||
+ if (inet_csk_in_pingpong_mode(sk) ||
(icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
max_ato = TCP_DELACK_MAX;
@@ -3546,30 +4400,29 @@ void tcp_send_delayed_ack(struct sock *sk)
ato = min(ato, max_ato);
}
+ ato = min_t(u32, ato, tcp_delack_max(sk));
+
/* Stay within the limit we were given */
timeout = jiffies + ato;
/* Use new timeout only if there wasn't a older one earlier. */
if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
- /* If delack timer was blocked or is about to expire,
- * send ACK now.
- */
- if (icsk->icsk_ack.blocked ||
- time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
+ /* If delack timer is about to expire, send ACK now. */
+ if (time_before_eq(icsk_delack_timeout(icsk), jiffies + (ato >> 2))) {
tcp_send_ack(sk);
return;
}
- if (!time_before(timeout, icsk->icsk_ack.timeout))
- timeout = icsk->icsk_ack.timeout;
+ if (!time_before(timeout, icsk_delack_timeout(icsk)))
+ timeout = icsk_delack_timeout(icsk);
}
- icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
- icsk->icsk_ack.timeout = timeout;
+ smp_store_release(&icsk->icsk_ack.pending,
+ icsk->icsk_ack.pending | ICSK_ACK_SCHED | ICSK_ACK_TIMER);
sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
}
/* This routine sends an ack and also updates the window. */
-void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
+void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags)
{
struct sk_buff *buff;
@@ -3584,16 +4437,22 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
buff = alloc_skb(MAX_TCP_HEADER,
sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
if (unlikely(!buff)) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ unsigned long delay;
+
+ delay = TCP_DELACK_MAX << icsk->icsk_ack.retry;
+ if (delay < tcp_rto_max(sk))
+ icsk->icsk_ack.retry++;
inet_csk_schedule_ack(sk);
- inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
- TCP_DELACK_MAX, TCP_RTO_MAX);
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ tcp_reset_xmit_timer(sk, ICSK_TIME_DACK, delay, false);
return;
}
/* Reserve space for headers and prepare control bits. */
skb_reserve(buff, MAX_TCP_HEADER);
- tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK);
+ tcp_init_nondata_skb(buff, sk,
+ tcp_acceptable_seq(sk), TCPHDR_ACK | flags);
/* We do not want pure acks influencing TCP Small Queues or fq/pacing
* too much.
@@ -3608,7 +4467,7 @@ EXPORT_SYMBOL_GPL(__tcp_send_ack);
void tcp_send_ack(struct sock *sk)
{
- __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
+ __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt, 0);
}
/* This routine sends a packet with an out of date sequence
@@ -3639,7 +4498,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
* end to send an ack. Don't queue or clone SKB, just
* send it.
*/
- tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
+ tcp_init_nondata_skb(skb, sk, tp->snd_una - !urgent, TCPHDR_ACK);
NET_INC_STATS(sock_net(sk), mib);
return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
}
@@ -3706,37 +4565,33 @@ void tcp_send_probe0(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
- unsigned long probe_max;
+ unsigned long timeout;
int err;
err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
if (tp->packets_out || tcp_write_queue_empty(sk)) {
/* Cancel probe timer, if it is not required. */
- icsk->icsk_probes_out = 0;
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
icsk->icsk_backoff = 0;
+ icsk->icsk_probes_tstamp = 0;
return;
}
+ WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1);
if (err <= 0) {
- if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
+ if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
icsk->icsk_backoff++;
- icsk->icsk_probes_out++;
- probe_max = TCP_RTO_MAX;
+ timeout = tcp_probe0_when(sk, tcp_rto_max(sk));
} else {
/* If packet was not sent due to local congestion,
- * do not backoff and do not remember icsk_probes_out.
- * Let local senders to fight for local resources.
- *
- * Use accumulated backoff yet.
+ * Let senders fight for local resources conservatively.
*/
- if (!icsk->icsk_probes_out)
- icsk->icsk_probes_out = 1;
- probe_max = TCP_RESOURCE_PROBE_INTERVAL;
+ timeout = TCP_RESOURCE_PROBE_INTERVAL;
}
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
- tcp_probe0_when(sk, probe_max),
- TCP_RTO_MAX);
+
+ timeout = tcp_clamp_probe0_to_user_timeout(sk, timeout);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, true);
}
int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
@@ -3745,15 +4600,24 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
struct flowi fl;
int res;
- tcp_rsk(req)->txhash = net_tx_rndhash();
- res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL);
+ /* Paired with WRITE_ONCE() in sock_setsockopt() */
+ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
+ WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash());
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
+ NULL);
if (!res) {
- __TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
- if (unlikely(tcp_passive_fastopen(sk)))
- tcp_sk(sk)->total_retrans++;
+ TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
+ if (unlikely(tcp_passive_fastopen(sk))) {
+ /* sk has const attribute because listeners are lockless.
+ * However in this case, we are dealing with a passive fastopen
+ * socket thus we can change total_retrans value.
+ */
+ tcp_sk_rw(sk)->total_retrans++;
+ }
trace_tcp_retransmit_synack(sk, req);
+ WRITE_ONCE(req->num_retrans, req->num_retrans + 1);
}
return res;
}
-EXPORT_SYMBOL(tcp_rtx_synack);
+EXPORT_IPV6_MOD(tcp_rtx_synack);
diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c
new file mode 100644
index 000000000000..4bcf7eff95e3
--- /dev/null
+++ b/net/ipv4/tcp_plb.c
@@ -0,0 +1,109 @@
+/* Protective Load Balancing (PLB)
+ *
+ * PLB was designed to reduce link load imbalance across datacenter
+ * switches. PLB is a host-based optimization; it leverages congestion
+ * signals from the transport layer to randomly change the path of the
+ * connection experiencing sustained congestion. PLB prefers to repath
+ * after idle periods to minimize packet reordering. It repaths by
+ * changing the IPv6 Flow Label on the packets of a connection, which
+ * datacenter switches include as part of ECMP/WCMP hashing.
+ *
+ * PLB is described in detail in:
+ *
+ * Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu,
+ * Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson,
+ * David Wetherall, Abdul Kabbani:
+ * "PLB: Congestion Signals are Simple and Effective for
+ * Network Load Balancing"
+ * In ACM SIGCOMM 2022, Amsterdam Netherlands.
+ *
+ */
+
+#include <net/tcp.h>
+
+/* Called once per round-trip to update PLB state for a connection. */
+void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb,
+ const int cong_ratio)
+{
+ struct net *net = sock_net(sk);
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
+ return;
+
+ if (cong_ratio >= 0) {
+ if (cong_ratio < READ_ONCE(net->ipv4.sysctl_tcp_plb_cong_thresh))
+ plb->consec_cong_rounds = 0;
+ else if (plb->consec_cong_rounds <
+ READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds))
+ plb->consec_cong_rounds++;
+ }
+}
+EXPORT_SYMBOL_GPL(tcp_plb_update_state);
+
+/* Check whether recent congestion has been persistent enough to warrant
+ * a load balancing decision that switches the connection to another path.
+ */
+void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb)
+{
+ struct net *net = sock_net(sk);
+ u32 max_suspend;
+ bool forced_rehash = false, idle_rehash = false;
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
+ return;
+
+ forced_rehash = plb->consec_cong_rounds >=
+ READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds);
+ /* If sender goes idle then we check whether to rehash. */
+ idle_rehash = READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds) &&
+ !tcp_sk(sk)->packets_out &&
+ plb->consec_cong_rounds >=
+ READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds);
+
+ if (!forced_rehash && !idle_rehash)
+ return;
+
+ /* Note that tcp_jiffies32 can wrap; we detect wraps by checking for
+ * cases where the max suspension end is before the actual suspension
+ * end. We clear pause_until to 0 to indicate there is no recent
+ * RTO event that constrains PLB rehashing.
+ */
+ max_suspend = 2 * READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ;
+ if (plb->pause_until &&
+ (!before(tcp_jiffies32, plb->pause_until) ||
+ before(tcp_jiffies32 + max_suspend, plb->pause_until)))
+ plb->pause_until = 0;
+
+ if (plb->pause_until)
+ return;
+
+ sk_rethink_txhash(sk);
+ plb->consec_cong_rounds = 0;
+ tcp_sk(sk)->plb_rehash++;
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH);
+}
+EXPORT_SYMBOL_GPL(tcp_plb_check_rehash);
+
+/* Upon RTO, disallow load balancing for a while, to avoid having load
+ * balancing decisions switch traffic to a black-holed path that was
+ * previously avoided with a sk_rethink_txhash() call at RTO time.
+ */
+void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb)
+{
+ struct net *net = sock_net(sk);
+ u32 pause;
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))
+ return;
+
+ pause = READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ;
+ pause += get_random_u32_below(pause);
+ plb->pause_until = tcp_jiffies32 + pause;
+
+ /* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call
+ * that may switch this connection to a path with completely different
+ * congestion characteristics.
+ */
+ plb->consec_cong_rounds = 0;
+}
+EXPORT_SYMBOL_GPL(tcp_plb_update_state_upon_rto);
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
index 4dff40dad4dc..a8f6d9d06f2e 100644
--- a/net/ipv4/tcp_rate.c
+++ b/net/ipv4/tcp_rate.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <net/tcp.h>
/* The bandwidth estimator estimates the rate at which the network
@@ -55,13 +56,16 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
* bandwidth estimate.
*/
if (!tp->packets_out) {
- tp->first_tx_mstamp = skb->skb_mstamp;
- tp->delivered_mstamp = skb->skb_mstamp;
+ u64 tstamp_us = tcp_skb_timestamp_us(skb);
+
+ tp->first_tx_mstamp = tstamp_us;
+ tp->delivered_mstamp = tstamp_us;
}
TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
+ TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce;
TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
}
@@ -70,31 +74,36 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
*
* If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
* called multiple times. We favor the information from the most recently
- * sent skb, i.e., the skb with the highest prior_delivered count.
+ * sent skb, i.e., the skb with the most recently sent time and the highest
+ * sequence.
*/
void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ u64 tx_tstamp;
if (!scb->tx.delivered_mstamp)
return;
+ tx_tstamp = tcp_skb_timestamp_us(skb);
if (!rs->prior_delivered ||
- after(scb->tx.delivered, rs->prior_delivered)) {
+ tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
+ scb->end_seq, rs->last_end_seq)) {
+ rs->prior_delivered_ce = scb->tx.delivered_ce;
rs->prior_delivered = scb->tx.delivered;
rs->prior_mstamp = scb->tx.delivered_mstamp;
rs->is_app_limited = scb->tx.is_app_limited;
rs->is_retrans = scb->sacked & TCPCB_RETRANS;
+ rs->last_end_seq = scb->end_seq;
+ /* Record send time of most recently ACKed packet: */
+ tp->first_tx_mstamp = tx_tstamp;
/* Find the duration of the "send phase" of this window: */
- rs->interval_us = tcp_stamp_us_delta(
- skb->skb_mstamp,
- scb->tx.first_tx_mstamp);
+ rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
+ scb->tx.first_tx_mstamp);
- /* Record send time of most recently ACKed packet: */
- tp->first_tx_mstamp = skb->skb_mstamp;
}
/* Mark off the skb delivered once it's sacked to avoid being
* used again when it's cumulatively acked. For acked packets
@@ -136,6 +145,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
}
rs->delivered = tp->delivered - rs->prior_delivered;
+ rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
+ /* delivered_ce occupies less than 32 bits in the skb control block */
+ rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;
+
/* Model sending data and receiving ACKs as separate pipeline phases
* for a window. Usually the ACK phase is longer, but with ACK
* compression the send phase can be longer. To be safe we use the
@@ -187,7 +200,7 @@ void tcp_rate_check_app_limited(struct sock *sk)
/* Nothing in sending host's qdisc queues or NIC tx queue. */
sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
/* We are not limited by CWND. */
- tcp_packets_in_flight(tp) < tp->snd_cwnd &&
+ tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) &&
/* All lost packets have been retransmitted. */
tp->lost_out <= tp->retrans_out)
tp->app_limited =
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index c81aadff769b..c52fd3254b6e 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -2,28 +2,9 @@
#include <linux/tcp.h>
#include <net/tcp.h>
-void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- tcp_skb_mark_lost_uncond_verify(tp, skb);
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
- /* Account for retransmits that are lost again */
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
- tp->retrans_out -= tcp_skb_pcount(skb);
- NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT,
- tcp_skb_pcount(skb));
- }
-}
-
-static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
-{
- return t1 > t2 || (t1 == t2 && after(seq1, seq2));
-}
-
static u32 tcp_rack_reo_wnd(const struct sock *sk)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
if (!tp->reord_seen) {
/* If reordering has not been observed, be aggressive during
@@ -33,7 +14,8 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk)
return 0;
if (tp->sacked_out >= tp->reordering &&
- !(sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_NO_DUPTHRESH))
+ !(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
+ TCP_RACK_NO_DUPTHRESH))
return 0;
}
@@ -50,10 +32,10 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk)
s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
{
return tp->rack.rtt_us + reo_wnd -
- tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+ tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb));
}
-/* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
+/* RACK loss detection (IETF RFC8985):
*
* Marks a packet lost, if some packet sent later has been (s)acked.
* The underlying idea is similar to the traditional dupthresh and FACK
@@ -91,8 +73,9 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
!(scb->sacked & TCPCB_SACKED_RETRANS))
continue;
- if (!tcp_rack_sent_after(tp->rack.mstamp, skb->skb_mstamp,
- tp->rack.end_seq, scb->end_seq))
+ if (!tcp_skb_sent_after(tp->rack.mstamp,
+ tcp_skb_timestamp_us(skb),
+ tp->rack.end_seq, scb->end_seq))
break;
/* A packet is lost if it has not been s/acked beyond
@@ -109,22 +92,23 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
}
}
-void tcp_rack_mark_lost(struct sock *sk)
+bool tcp_rack_mark_lost(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 timeout;
if (!tp->rack.advanced)
- return;
+ return false;
/* Reset the advanced flag to avoid unnecessary queue scanning */
tp->rack.advanced = 0;
tcp_rack_detect_loss(sk, &timeout);
if (timeout) {
- timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN;
+ timeout = usecs_to_jiffies(timeout + TCP_TIMEOUT_MIN_US);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT,
timeout, inet_csk(sk)->icsk_rto);
}
+ return !!timeout;
}
/* Record the most recently (re)sent time among the (s)acked packets
@@ -152,8 +136,8 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
}
tp->rack.advanced = 1;
tp->rack.rtt_us = rtt_us;
- if (tcp_rack_sent_after(xmit_time, tp->rack.mstamp,
- end_seq, tp->rack.end_seq)) {
+ if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp,
+ end_seq, tp->rack.end_seq)) {
tp->rack.mstamp = xmit_time;
tp->rack.end_seq = end_seq;
}
@@ -166,6 +150,7 @@ void tcp_rack_reo_timeout(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
u32 timeout, prior_inflight;
+ u32 lost = tp->lost;
prior_inflight = tcp_packets_in_flight(tp);
tcp_rack_detect_loss(sk, &timeout);
@@ -173,7 +158,7 @@ void tcp_rack_reo_timeout(struct sock *sk)
if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) {
tcp_enter_recovery(sk, false);
if (!inet_csk(sk)->icsk_ca_ops->cong_control)
- tcp_cwnd_reduction(sk, 1, 0);
+ tcp_cwnd_reduction(sk, 1, tp->lost - lost, 0);
}
tcp_xmit_retransmit_queue(sk);
}
@@ -183,7 +168,8 @@ void tcp_rack_reo_timeout(struct sock *sk)
/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries.
*
- * If DSACK is received, increment reo_wnd by min_rtt/4 (upper bounded
+ * If a DSACK is received that seems like it may have been due to reordering
+ * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded
* by srtt), since there is possibility that spurious retransmission was
* due to reordering delay longer than reo_wnd.
*
@@ -202,7 +188,8 @@ void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_STATIC_REO_WND ||
+ if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
+ TCP_RACK_STATIC_REO_WND) ||
!rs->prior_delivered)
return;
@@ -245,6 +232,6 @@ void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced)
tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
mss, mss, GFP_ATOMIC);
- tcp_skb_mark_lost_uncond_verify(tp, skb);
+ tcp_mark_skb_lost(sk, skb);
}
}
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index addc122f8818..862b96248a92 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Tom Kelly's Scalable TCP
*
* See http://www.deneholme.net/tom/scalable/
@@ -9,10 +10,9 @@
#include <net/tcp.h>
/* These factors derived from the recommended values in the aer:
- * .01 and and 7/8. We use 50 instead of 100 to account for
- * delayed ack.
+ * .01 and 7/8.
*/
-#define TCP_SCALABLE_AI_CNT 50U
+#define TCP_SCALABLE_AI_CNT 100U
#define TCP_SCALABLE_MD_SCALE 3
static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
@@ -22,18 +22,20 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tcp_in_slow_start(tp))
- tcp_slow_start(tp, acked);
- else
- tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT),
- 1);
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ return;
+ }
+ tcp_cong_avoid_ai(tp, min(tcp_snd_cwnd(tp), TCP_SCALABLE_AI_CNT),
+ acked);
}
static u32 tcp_scalable_ssthresh(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
- return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U);
+ return max(tcp_snd_cwnd(tp) - (tcp_snd_cwnd(tp)>>TCP_SCALABLE_MD_SCALE), 2U);
}
static struct tcp_congestion_ops tcp_scalable __read_mostly = {
diff --git a/net/ipv4/tcp_sigpool.c b/net/ipv4/tcp_sigpool.c
new file mode 100644
index 000000000000..d8a4f192873a
--- /dev/null
+++ b/net/ipv4/tcp_sigpool.c
@@ -0,0 +1,366 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <crypto/hash.h>
+#include <linux/cpu.h>
+#include <linux/kref.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/workqueue.h>
+#include <net/tcp.h>
+
+static size_t __scratch_size;
+struct sigpool_scratch {
+ local_lock_t bh_lock;
+ void __rcu *pad;
+};
+
+static DEFINE_PER_CPU(struct sigpool_scratch, sigpool_scratch) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
+
+struct sigpool_entry {
+ struct crypto_ahash *hash;
+ const char *alg;
+ struct kref kref;
+ uint16_t needs_key:1,
+ reserved:15;
+};
+
+#define CPOOL_SIZE (PAGE_SIZE / sizeof(struct sigpool_entry))
+static struct sigpool_entry cpool[CPOOL_SIZE];
+static unsigned int cpool_populated;
+static DEFINE_MUTEX(cpool_mutex);
+
+/* Slow-path */
+struct scratches_to_free {
+ struct rcu_head rcu;
+ unsigned int cnt;
+ void *scratches[];
+};
+
+static void free_old_scratches(struct rcu_head *head)
+{
+ struct scratches_to_free *stf;
+
+ stf = container_of(head, struct scratches_to_free, rcu);
+ while (stf->cnt--)
+ kfree(stf->scratches[stf->cnt]);
+ kfree(stf);
+}
+
+/**
+ * sigpool_reserve_scratch - re-allocates scratch buffer, slow-path
+ * @size: request size for the scratch/temp buffer
+ */
+static int sigpool_reserve_scratch(size_t size)
+{
+ struct scratches_to_free *stf;
+ size_t stf_sz = struct_size(stf, scratches, num_possible_cpus());
+ int cpu, err = 0;
+
+ lockdep_assert_held(&cpool_mutex);
+ if (__scratch_size >= size)
+ return 0;
+
+ stf = kmalloc(stf_sz, GFP_KERNEL);
+ if (!stf)
+ return -ENOMEM;
+ stf->cnt = 0;
+
+ size = max(size, __scratch_size);
+ cpus_read_lock();
+ for_each_possible_cpu(cpu) {
+ void *scratch, *old_scratch;
+
+ scratch = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu));
+ if (!scratch) {
+ err = -ENOMEM;
+ break;
+ }
+
+ old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu),
+ scratch, lockdep_is_held(&cpool_mutex));
+ if (!cpu_online(cpu) || !old_scratch) {
+ kfree(old_scratch);
+ continue;
+ }
+ stf->scratches[stf->cnt++] = old_scratch;
+ }
+ cpus_read_unlock();
+ if (!err)
+ __scratch_size = size;
+
+ call_rcu(&stf->rcu, free_old_scratches);
+ return err;
+}
+
+static void sigpool_scratch_free(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ kfree(rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu),
+ NULL, lockdep_is_held(&cpool_mutex)));
+ __scratch_size = 0;
+}
+
+static int __cpool_try_clone(struct crypto_ahash *hash)
+{
+ struct crypto_ahash *tmp;
+
+ tmp = crypto_clone_ahash(hash);
+ if (IS_ERR(tmp))
+ return PTR_ERR(tmp);
+
+ crypto_free_ahash(tmp);
+ return 0;
+}
+
+static int __cpool_alloc_ahash(struct sigpool_entry *e, const char *alg)
+{
+ struct crypto_ahash *cpu0_hash;
+ int ret;
+
+ e->alg = kstrdup(alg, GFP_KERNEL);
+ if (!e->alg)
+ return -ENOMEM;
+
+ cpu0_hash = crypto_alloc_ahash(alg, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(cpu0_hash)) {
+ ret = PTR_ERR(cpu0_hash);
+ goto out_free_alg;
+ }
+
+ e->needs_key = crypto_ahash_get_flags(cpu0_hash) & CRYPTO_TFM_NEED_KEY;
+
+ ret = __cpool_try_clone(cpu0_hash);
+ if (ret)
+ goto out_free_cpu0_hash;
+ e->hash = cpu0_hash;
+ kref_init(&e->kref);
+ return 0;
+
+out_free_cpu0_hash:
+ crypto_free_ahash(cpu0_hash);
+out_free_alg:
+ kfree(e->alg);
+ e->alg = NULL;
+ return ret;
+}
+
+/**
+ * tcp_sigpool_alloc_ahash - allocates pool for ahash requests
+ * @alg: name of async hash algorithm
+ * @scratch_size: reserve a tcp_sigpool::scratch buffer of this size
+ */
+int tcp_sigpool_alloc_ahash(const char *alg, size_t scratch_size)
+{
+ int i, ret;
+
+ /* slow-path */
+ mutex_lock(&cpool_mutex);
+ ret = sigpool_reserve_scratch(scratch_size);
+ if (ret)
+ goto out;
+ for (i = 0; i < cpool_populated; i++) {
+ if (!cpool[i].alg)
+ continue;
+ if (strcmp(cpool[i].alg, alg))
+ continue;
+
+ /* pairs with tcp_sigpool_release() */
+ if (!kref_get_unless_zero(&cpool[i].kref))
+ kref_init(&cpool[i].kref);
+ ret = i;
+ goto out;
+ }
+
+ for (i = 0; i < cpool_populated; i++) {
+ if (!cpool[i].alg)
+ break;
+ }
+ if (i >= CPOOL_SIZE) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ ret = __cpool_alloc_ahash(&cpool[i], alg);
+ if (!ret) {
+ ret = i;
+ if (i == cpool_populated)
+ cpool_populated++;
+ }
+out:
+ mutex_unlock(&cpool_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_alloc_ahash);
+
+static void __cpool_free_entry(struct sigpool_entry *e)
+{
+ crypto_free_ahash(e->hash);
+ kfree(e->alg);
+ memset(e, 0, sizeof(*e));
+}
+
+static void cpool_cleanup_work_cb(struct work_struct *work)
+{
+ bool free_scratch = true;
+ unsigned int i;
+
+ mutex_lock(&cpool_mutex);
+ for (i = 0; i < cpool_populated; i++) {
+ if (kref_read(&cpool[i].kref) > 0) {
+ free_scratch = false;
+ continue;
+ }
+ if (!cpool[i].alg)
+ continue;
+ __cpool_free_entry(&cpool[i]);
+ }
+ if (free_scratch)
+ sigpool_scratch_free();
+ mutex_unlock(&cpool_mutex);
+}
+
+static DECLARE_WORK(cpool_cleanup_work, cpool_cleanup_work_cb);
+static void cpool_schedule_cleanup(struct kref *kref)
+{
+ schedule_work(&cpool_cleanup_work);
+}
+
+/**
+ * tcp_sigpool_release - decreases number of users for a pool. If it was
+ * the last user of the pool, releases any memory that was consumed.
+ * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash()
+ */
+void tcp_sigpool_release(unsigned int id)
+{
+ if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg))
+ return;
+
+ /* slow-path */
+ kref_put(&cpool[id].kref, cpool_schedule_cleanup);
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_release);
+
+/**
+ * tcp_sigpool_get - increases number of users (refcounter) for a pool
+ * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash()
+ */
+void tcp_sigpool_get(unsigned int id)
+{
+ if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg))
+ return;
+ kref_get(&cpool[id].kref);
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_get);
+
+int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c) __cond_acquires(RCU_BH)
+{
+ struct crypto_ahash *hash;
+
+ rcu_read_lock_bh();
+ if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) {
+ rcu_read_unlock_bh();
+ return -EINVAL;
+ }
+
+ hash = crypto_clone_ahash(cpool[id].hash);
+ if (IS_ERR(hash)) {
+ rcu_read_unlock_bh();
+ return PTR_ERR(hash);
+ }
+
+ c->req = ahash_request_alloc(hash, GFP_ATOMIC);
+ if (!c->req) {
+ crypto_free_ahash(hash);
+ rcu_read_unlock_bh();
+ return -ENOMEM;
+ }
+ ahash_request_set_callback(c->req, 0, NULL, NULL);
+
+ /* Pairs with sigpool_reserve_scratch(), scratch area is
+ * valid (allocated) until tcp_sigpool_end().
+ */
+ local_lock_nested_bh(&sigpool_scratch.bh_lock);
+ c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch.pad));
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_start);
+
+void tcp_sigpool_end(struct tcp_sigpool *c) __releases(RCU_BH)
+{
+ struct crypto_ahash *hash = crypto_ahash_reqtfm(c->req);
+
+ local_unlock_nested_bh(&sigpool_scratch.bh_lock);
+ rcu_read_unlock_bh();
+ ahash_request_free(c->req);
+ crypto_free_ahash(hash);
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_end);
+
+/**
+ * tcp_sigpool_algo - return algorithm of tcp_sigpool
+ * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash()
+ * @buf: buffer to return name of algorithm
+ * @buf_len: size of @buf
+ */
+size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len)
+{
+ if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg))
+ return -EINVAL;
+
+ return strscpy(buf, cpool[id].alg, buf_len);
+}
+EXPORT_SYMBOL_GPL(tcp_sigpool_algo);
+
+/**
+ * tcp_sigpool_hash_skb_data - hash data in skb with initialized tcp_sigpool
+ * @hp: tcp_sigpool pointer
+ * @skb: buffer to add sign for
+ * @header_len: TCP header length for this segment
+ */
+int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp,
+ const struct sk_buff *skb,
+ unsigned int header_len)
+{
+ const unsigned int head_data_len = skb_headlen(skb) > header_len ?
+ skb_headlen(skb) - header_len : 0;
+ const struct skb_shared_info *shi = skb_shinfo(skb);
+ const struct tcphdr *tp = tcp_hdr(skb);
+ struct ahash_request *req = hp->req;
+ struct sk_buff *frag_iter;
+ struct scatterlist sg;
+ unsigned int i;
+
+ sg_init_table(&sg, 1);
+
+ sg_set_buf(&sg, ((u8 *)tp) + header_len, head_data_len);
+ ahash_request_set_crypt(req, &sg, NULL, head_data_len);
+ if (crypto_ahash_update(req))
+ return 1;
+
+ for (i = 0; i < shi->nr_frags; ++i) {
+ const skb_frag_t *f = &shi->frags[i];
+ unsigned int offset = skb_frag_off(f);
+ struct page *page;
+
+ page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
+ sg_set_page(&sg, page, skb_frag_size(f), offset_in_page(offset));
+ ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
+ if (crypto_ahash_update(req))
+ return 1;
+ }
+
+ skb_walk_frags(skb, frag_iter)
+ if (tcp_sigpool_hash_skb_data(hp, frag_iter, 0))
+ return 1;
+
+ return 0;
+}
+EXPORT_SYMBOL(tcp_sigpool_hash_skb_data);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Per-CPU pool of crypto requests");
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 7fdf222a0bdf..160080c9021d 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -21,34 +22,47 @@
#include <linux/module.h>
#include <linux/gfp.h>
#include <net/tcp.h>
+#include <net/rstreason.h>
-static u32 tcp_retransmit_stamp(const struct sock *sk)
+static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
{
- u32 start_ts = tcp_sk(sk)->retrans_stamp;
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 elapsed, user_timeout;
+ s32 remaining;
+
+ user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+ if (!user_timeout)
+ return icsk->icsk_rto;
- if (unlikely(!start_ts)) {
- struct sk_buff *head = tcp_rtx_queue_head(sk);
+ elapsed = tcp_time_stamp_ts(tp) - tp->retrans_stamp;
+ if (tp->tcp_usec_ts)
+ elapsed /= USEC_PER_MSEC;
- if (!head)
- return 0;
- start_ts = tcp_skb_timestamp(head);
- }
- return start_ts;
+ remaining = user_timeout - elapsed;
+ if (remaining <= 0)
+ return 1; /* user timeout has passed; fire ASAP */
+
+ return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(remaining));
}
-static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
+u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
- u32 elapsed, start_ts;
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 remaining, user_timeout;
+ s32 elapsed;
- start_ts = tcp_retransmit_stamp(sk);
- if (!icsk->icsk_user_timeout || !start_ts)
- return icsk->icsk_rto;
- elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
- if (elapsed >= icsk->icsk_user_timeout)
- return 1; /* user timeout has passed; fire ASAP */
- else
- return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(icsk->icsk_user_timeout - elapsed));
+ user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+ if (!user_timeout || !icsk->icsk_probes_tstamp)
+ return when;
+
+ elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp;
+ if (unlikely(elapsed < 0))
+ elapsed = 0;
+ remaining = msecs_to_jiffies(user_timeout) - elapsed;
+ remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN);
+
+ return min_t(u32, remaining, when);
}
/**
@@ -60,11 +74,7 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
static void tcp_write_err(struct sock *sk)
{
- sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
- sk->sk_error_report(sk);
-
- tcp_write_queue_purge(sk);
- tcp_done(sk);
+ tcp_done_with_error(sk, READ_ONCE(sk->sk_err_soft) ? : ETIMEDOUT);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONTIMEOUT);
}
@@ -99,11 +109,11 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
/* If peer does not open window for long time, or did not transmit
* anything for long time, penalize it. */
- if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
+ if ((s32)(tcp_jiffies32 - tp->lsndtime) > 2*tcp_rto_max(sk) || !do_reset)
shift++;
/* If some dubious ICMP arrived, penalize even more. */
- if (sk->sk_err_soft)
+ if (READ_ONCE(sk->sk_err_soft))
shift++;
if (tcp_check_oom(sk, shift)) {
@@ -114,7 +124,8 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
(!tp->snd_wnd && !tp->packets_out))
do_reset = true;
if (do_reset)
- tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_TCP_ABORT_ON_MEMORY);
tcp_done(sk);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
return 1;
@@ -136,10 +147,10 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
*/
static int tcp_orphan_retries(struct sock *sk, bool alive)
{
- int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */
+ int retries = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_orphan_retries); /* May be zero. */
/* We know from an ICMP that something is wrong. */
- if (sk->sk_err_soft && !alive)
+ if (READ_ONCE(sk->sk_err_soft) && !alive)
retries = 0;
/* However, if socket sent something recently, select some safe
@@ -156,7 +167,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
int mss;
/* Black hole detection */
- if (!net->ipv4.sysctl_tcp_mtu_probing)
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_mtu_probing))
return;
if (!icsk->icsk_mtup.enabled) {
@@ -164,14 +175,28 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
icsk->icsk_mtup.probe_timestamp = tcp_jiffies32;
} else {
mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
- mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
- mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
+ mss = min(READ_ONCE(net->ipv4.sysctl_tcp_base_mss), mss);
+ mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_mtu_probe_floor));
+ mss = max(mss, READ_ONCE(net->ipv4.sysctl_tcp_min_snd_mss));
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
}
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
+static unsigned int tcp_model_timeout(struct sock *sk,
+ unsigned int boundary,
+ unsigned int rto_base)
+{
+ unsigned int linear_backoff_thresh, timeout;
+ linear_backoff_thresh = ilog2(tcp_rto_max(sk) / rto_base);
+ if (boundary <= linear_backoff_thresh)
+ timeout = ((2 << boundary) - 1) * rto_base;
+ else
+ timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
+ (boundary - linear_backoff_thresh) * tcp_rto_max(sk);
+ return jiffies_to_msecs(timeout);
+}
/**
* retransmits_timed_out() - returns true if this connection has timed out
* @sk: The current socket
@@ -189,27 +214,27 @@ static bool retransmits_timed_out(struct sock *sk,
unsigned int boundary,
unsigned int timeout)
{
- const unsigned int rto_base = TCP_RTO_MIN;
- unsigned int linear_backoff_thresh, start_ts;
+ struct tcp_sock *tp = tcp_sk(sk);
+ unsigned int start_ts, delta;
if (!inet_csk(sk)->icsk_retransmits)
return false;
- start_ts = tcp_retransmit_stamp(sk);
- if (!start_ts)
- return false;
-
+ start_ts = tp->retrans_stamp;
if (likely(timeout == 0)) {
- linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
+ unsigned int rto_base = TCP_RTO_MIN;
- if (boundary <= linear_backoff_thresh)
- timeout = ((2 << boundary) - 1) * rto_base;
- else
- timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
- (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
- timeout = jiffies_to_msecs(timeout);
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+ rto_base = tcp_timeout_init(sk);
+ timeout = tcp_model_timeout(sk, boundary, rto_base);
}
- return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout;
+
+ if (tp->tcp_usec_ts) {
+ /* delta maybe off up to a jiffy due to timer granularity. */
+ delta = tp->tcp_mstamp - start_ts + jiffies_to_usecs(1);
+ return (s32)(delta - timeout * USEC_PER_MSEC) >= 0;
+ }
+ return (s32)(tcp_time_stamp_ts(tp) - start_ts - timeout) >= 0;
}
/* A write timeout has occurred. Process the after effects. */
@@ -218,30 +243,32 @@ static int tcp_write_timeout(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
- bool expired, do_reset;
- int retry_until;
+ bool expired = false, do_reset;
+ int retry_until, max_retransmits;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
- if (icsk->icsk_retransmits) {
- dst_negative_advice(sk);
- } else if (!tp->syn_data && !tp->syn_fastopen) {
- sk_rethink_txhash(sk);
- }
- retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
- expired = icsk->icsk_retransmits >= retry_until;
+ if (icsk->icsk_retransmits)
+ __dst_negative_advice(sk);
+ /* Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */
+ retry_until = READ_ONCE(icsk->icsk_syn_retries) ? :
+ READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);
+
+ max_retransmits = retry_until;
+ if (sk->sk_state == TCP_SYN_SENT)
+ max_retransmits += READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts);
+
+ expired = icsk->icsk_retransmits >= max_retransmits;
} else {
- if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0)) {
+ if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1), 0)) {
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
- dst_negative_advice(sk);
- } else {
- sk_rethink_txhash(sk);
+ __dst_negative_advice(sk);
}
- retry_until = net->ipv4.sysctl_tcp_retries2;
+ retry_until = READ_ONCE(net->ipv4.sysctl_tcp_retries2);
if (sock_flag(sk, SOCK_DEAD)) {
- const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
+ const bool alive = icsk->icsk_rto < tcp_rto_max(sk);
retry_until = tcp_orphan_retries(sk, alive);
do_reset = alive ||
@@ -250,10 +277,12 @@ static int tcp_write_timeout(struct sock *sk)
if (tcp_out_of_resources(sk, do_reset))
return 1;
}
- expired = retransmits_timed_out(sk, retry_until,
- icsk->icsk_user_timeout);
}
+ if (!expired)
+ expired = retransmits_timed_out(sk, retry_until,
+ READ_ONCE(icsk->icsk_user_timeout));
tcp_fastopen_active_detect_blackhole(sk, expired);
+ mptcp_active_detect_blackhole(sk, expired);
if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RTO_CB,
@@ -266,6 +295,11 @@ static int tcp_write_timeout(struct sock *sk)
return 1;
}
+ if (sk_rethink_txhash(sk)) {
+ tp->timeout_rehash++;
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH);
+ }
+
return 0;
}
@@ -273,44 +307,49 @@ static int tcp_write_timeout(struct sock *sk)
void tcp_delack_timer_handler(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
- sk_mem_reclaim_partial(sk);
+ if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
+ return;
- if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
- !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
- goto out;
+ /* Handling the sack compression case */
+ if (tp->compressed_ack) {
+ tcp_mstamp_refresh(tp);
+ tcp_sack_compress_send_ack(sk);
+ return;
+ }
- if (time_after(icsk->icsk_ack.timeout, jiffies)) {
- sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
- goto out;
+ if (!(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
+ return;
+
+ if (time_after(icsk_delack_timeout(icsk), jiffies)) {
+ sk_reset_timer(sk, &icsk->icsk_delack_timer,
+ icsk_delack_timeout(icsk));
+ return;
}
icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;
if (inet_csk_ack_scheduled(sk)) {
- if (!icsk->icsk_ack.pingpong) {
+ if (!inet_csk_in_pingpong_mode(sk)) {
/* Delayed ACK missed: inflate ATO. */
- icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
+ icsk->icsk_ack.ato = min_t(u32, icsk->icsk_ack.ato << 1, icsk->icsk_rto);
} else {
/* Delayed ACK missed: leave pingpong mode and
* deflate ATO.
*/
- icsk->icsk_ack.pingpong = 0;
+ inet_csk_exit_pingpong_mode(sk);
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
- tcp_mstamp_refresh(tcp_sk(sk));
+ tcp_mstamp_refresh(tp);
tcp_send_ack(sk);
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
}
-
-out:
- if (tcp_under_memory_pressure(sk))
- sk_mem_reclaim(sk);
}
/**
* tcp_delack_timer() - The TCP delayed ACK timeout handler
- * @data: Pointer to the current socket. (gets casted to struct sock *)
+ * @t: Pointer to the timer. (gets casted to struct sock *)
*
* This function gets (indirectly) called when the kernel timer for a TCP packet
* of this socket expires. Calls tcp_delack_timer_handler() to do the actual work.
@@ -320,20 +359,28 @@ out:
static void tcp_delack_timer(struct timer_list *t)
{
struct inet_connection_sock *icsk =
- from_timer(icsk, t, icsk_delack_timer);
+ timer_container_of(icsk, t, icsk_delack_timer);
struct sock *sk = &icsk->icsk_inet.sk;
+ /* Avoid taking socket spinlock if there is no ACK to send.
+ * The compressed_ack check is racy, but a separate hrtimer
+ * will take care of it eventually.
+ */
+ if (!(smp_load_acquire(&icsk->icsk_ack.pending) & ICSK_ACK_TIMER) &&
+ !READ_ONCE(tcp_sk(sk)->compressed_ack))
+ goto out;
+
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
tcp_delack_timer_handler(sk);
} else {
- icsk->icsk_ack.blocked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
/* deleguate our work to tcp_release_cb() */
if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags))
sock_hold(sk);
}
bh_unlock_sock(sk);
+out:
sock_put(sk);
}
@@ -343,10 +390,10 @@ static void tcp_probe_timer(struct sock *sk)
struct sk_buff *skb = tcp_send_head(sk);
struct tcp_sock *tp = tcp_sk(sk);
int max_probes;
- u32 start_ts;
if (tp->packets_out || !skb) {
- icsk->icsk_probes_out = 0;
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
+ icsk->icsk_probes_tstamp = 0;
return;
}
@@ -358,16 +405,20 @@ static void tcp_probe_timer(struct sock *sk)
* corresponding system limit. We also implement similar policy when
* we use RTO to probe window in tcp_retransmit_timer().
*/
- start_ts = tcp_skb_timestamp(skb);
- if (!start_ts)
- skb->skb_mstamp = tp->tcp_mstamp;
- else if (icsk->icsk_user_timeout &&
- (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
- goto abort;
-
- max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
+ if (!icsk->icsk_probes_tstamp) {
+ icsk->icsk_probes_tstamp = tcp_jiffies32;
+ } else {
+ u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+
+ if (user_timeout &&
+ (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
+ msecs_to_jiffies(user_timeout))
+ goto abort;
+ }
+ max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
if (sock_flag(sk, SOCK_DEAD)) {
- const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
+ unsigned int rto_max = tcp_rto_max(sk);
+ const bool alive = inet_csk_rto_backoff(icsk, rto_max) < rto_max;
max_probes = tcp_orphan_retries(sk, alive);
if (!alive && icsk->icsk_backoff >= max_probes)
@@ -376,7 +427,7 @@ static void tcp_probe_timer(struct sock *sk)
return;
}
- if (icsk->icsk_probes_out > max_probes) {
+ if (icsk->icsk_probes_out >= max_probes) {
abort: tcp_write_err(sk);
} else {
/* Only send another probe if we didn't close things up. */
@@ -384,36 +435,87 @@ abort: tcp_write_err(sk);
}
}
+static void tcp_update_rto_stats(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!icsk->icsk_retransmits) {
+ tp->total_rto_recoveries++;
+ tp->rto_stamp = tcp_time_stamp_ms(tp);
+ }
+ WRITE_ONCE(icsk->icsk_retransmits, icsk->icsk_retransmits + 1);
+ tp->total_rto++;
+}
+
/*
* Timer for Fast Open socket to retransmit SYNACK. Note that the
* sk here is the child socket, not the parent (listener) socket.
*/
-static void tcp_fastopen_synack_timer(struct sock *sk)
+static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- int max_retries = icsk->icsk_syn_retries ? :
- sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
- struct request_sock *req;
+ struct tcp_sock *tp = tcp_sk(sk);
+ int max_retries;
+
+ tcp_syn_ack_timeout(req);
- req = tcp_sk(sk)->fastopen_rsk;
- req->rsk_ops->syn_ack_timeout(req);
+ /* Add one more retry for fastopen.
+ * Paired with WRITE_ONCE() in tcp_sock_set_syncnt()
+ */
+ max_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1;
if (req->num_timeout >= max_retries) {
tcp_write_err(sk);
return;
}
+ /* Lower cwnd after certain SYNACK timeout like tcp_init_transfer() */
+ if (icsk->icsk_retransmits == 1)
+ tcp_enter_loss(sk);
/* XXX (TFO) - Unlike regular SYN-ACK retransmit, we ignore error
* returned from rtx_syn_ack() to make it more persistent like
* regular retransmit because if the child socket has been accepted
* it's not good to give up too easily.
*/
- inet_rtx_syn_ack(sk, req);
+ tcp_rtx_synack(sk, req);
req->num_timeout++;
- icsk->icsk_retransmits++;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
+ tcp_update_rto_stats(sk);
+ if (!tp->retrans_stamp)
+ tp->retrans_stamp = tcp_time_stamp_ts(tp);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ req->timeout << req->num_timeout, false);
}
+static bool tcp_rtx_probe0_timed_out(const struct sock *sk,
+ const struct sk_buff *skb,
+ u32 rtx_delta)
+{
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+ const struct tcp_sock *tp = tcp_sk(sk);
+ int timeout = tcp_rto_max(sk) * 2;
+ s32 rcv_delta;
+
+ if (user_timeout) {
+ /* If user application specified a TCP_USER_TIMEOUT,
+ * it does not want win 0 packets to 'reset the timer'
+ * while retransmits are not making progress.
+ */
+ if (rtx_delta > user_timeout)
+ return true;
+ timeout = min_t(u32, timeout, msecs_to_jiffies(user_timeout));
+ }
+ /* Note: timer interrupt might have been delayed by at least one jiffy,
+ * and tp->rcv_tstamp might very well have been written recently.
+ * rcv_delta can thus be negative.
+ */
+ rcv_delta = tcp_timeout_expires(sk) - tp->rcv_tstamp;
+ if (rcv_delta <= timeout)
+ return false;
+
+ return msecs_to_jiffies(rtx_delta) > timeout;
+}
/**
* tcp_retransmit_timer() - The TCP retransmit timeout handler
@@ -422,7 +524,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
* This function gets called when the kernel timer for a TCP packet
* of this socket expires.
*
- * It handles retransmission, timer adjustment and other necesarry measures.
+ * It handles retransmission, timer adjustment and other necessary measures.
*
* Returns: Nothing (void)
*/
@@ -431,22 +533,27 @@ void tcp_retransmit_timer(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct request_sock *req;
+ struct sk_buff *skb;
- if (tp->fastopen_rsk) {
+ req = rcu_dereference_protected(tp->fastopen_rsk,
+ lockdep_sock_is_held(sk));
+ if (req) {
WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV &&
sk->sk_state != TCP_FIN_WAIT1);
- tcp_fastopen_synack_timer(sk);
+ tcp_fastopen_synack_timer(sk, req);
/* Before we receive ACK to our SYN-ACK don't retransmit
* anything else (e.g., data or FIN segments).
*/
return;
}
- if (!tp->packets_out)
- goto out;
- WARN_ON(tcp_rtx_queue_empty(sk));
+ if (!tp->packets_out)
+ return;
- tp->tlp_high_seq = 0;
+ skb = tcp_rtx_queue_head(sk);
+ if (WARN_ON_ONCE(!skb))
+ return;
if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
!((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
@@ -456,37 +563,45 @@ void tcp_retransmit_timer(struct sock *sk)
* we cannot allow such beasts to hang infinitely.
*/
struct inet_sock *inet = inet_sk(sk);
+ u32 rtx_delta;
+
+ rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?:
+ tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb));
+ if (tp->tcp_usec_ts)
+ rtx_delta /= USEC_PER_MSEC;
+
if (sk->sk_family == AF_INET) {
- net_dbg_ratelimited("Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
- &inet->inet_daddr,
- ntohs(inet->inet_dport),
- inet->inet_num,
- tp->snd_una, tp->snd_nxt);
+ net_dbg_ratelimited("Probing zero-window on %pI4:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
+ &inet->inet_daddr, ntohs(inet->inet_dport),
+ inet->inet_num, tp->snd_una, tp->snd_nxt,
+ jiffies_to_msecs(jiffies - tp->rcv_tstamp),
+ rtx_delta);
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
- net_dbg_ratelimited("Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
- &sk->sk_v6_daddr,
- ntohs(inet->inet_dport),
- inet->inet_num,
- tp->snd_una, tp->snd_nxt);
+ net_dbg_ratelimited("Probing zero-window on %pI6:%u/%u, seq=%u:%u, recv %ums ago, lasting %ums\n",
+ &sk->sk_v6_daddr, ntohs(inet->inet_dport),
+ inet->inet_num, tp->snd_una, tp->snd_nxt,
+ jiffies_to_msecs(jiffies - tp->rcv_tstamp),
+ rtx_delta);
}
#endif
- if (tcp_jiffies32 - tp->rcv_tstamp > TCP_RTO_MAX) {
+ if (tcp_rtx_probe0_timed_out(sk, skb, rtx_delta)) {
tcp_write_err(sk);
goto out;
}
tcp_enter_loss(sk);
- tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1);
+ tcp_retransmit_skb(sk, skb, 1);
__sk_dst_reset(sk);
goto out_reset_timer;
}
+ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
if (tcp_write_timeout(sk))
goto out;
if (icsk->icsk_retransmits == 0) {
- int mib_idx;
+ int mib_idx = 0;
if (icsk->icsk_ca_state == TCP_CA_Recovery) {
if (tcp_is_sack(tp))
@@ -501,23 +616,21 @@ void tcp_retransmit_timer(struct sock *sk)
mib_idx = LINUX_MIB_TCPSACKFAILURES;
else
mib_idx = LINUX_MIB_TCPRENOFAILURES;
- } else {
- mib_idx = LINUX_MIB_TCPTIMEOUTS;
}
- __NET_INC_STATS(sock_net(sk), mib_idx);
+ if (mib_idx)
+ __NET_INC_STATS(sock_net(sk), mib_idx);
}
tcp_enter_loss(sk);
+ tcp_update_rto_stats(sk);
if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
/* Retransmission failed because of local congestion,
- * do not backoff.
+ * Let senders fight for local resources conservatively.
*/
- if (!icsk->icsk_retransmits)
- icsk->icsk_retransmits = 1;
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),
- TCP_RTO_MAX);
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ TCP_RESOURCE_PROBE_INTERVAL,
+ false);
goto out;
}
@@ -536,8 +649,6 @@ void tcp_retransmit_timer(struct sock *sk)
* implemented ftp to mars will work nicely. We will have to fix
* the 120 second clamps though!
*/
- icsk->icsk_backoff++;
- icsk->icsk_retransmits++;
out_reset_timer:
/* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
@@ -550,25 +661,33 @@ out_reset_timer:
* linear-timeout retransmissions into a black hole
*/
if (sk->sk_state == TCP_ESTABLISHED &&
- (tp->thin_lto || net->ipv4.sysctl_tcp_thin_linear_timeouts) &&
+ (tp->thin_lto || READ_ONCE(net->ipv4.sysctl_tcp_thin_linear_timeouts)) &&
tcp_stream_is_thin(tp) &&
icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
icsk->icsk_backoff = 0;
- icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
- } else {
- /* Use normal (exponential) backoff */
- icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ icsk->icsk_rto = clamp(__tcp_set_rto(tp),
+ tcp_rto_min(sk),
+ tcp_rto_max(sk));
+ } else if (sk->sk_state != TCP_SYN_SENT ||
+ tp->total_rto >
+ READ_ONCE(net->ipv4.sysctl_tcp_syn_linear_timeouts)) {
+ /* Use normal (exponential) backoff unless linear timeouts are
+ * activated.
+ */
+ icsk->icsk_backoff++;
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, tcp_rto_max(sk));
}
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
- tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX);
- if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0))
+ tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ tcp_clamp_rto_to_user_timeout(sk), false);
+ if (retransmits_timed_out(sk, READ_ONCE(net->ipv4.sysctl_tcp_retries1) + 1, 0))
__sk_dst_reset(sk);
out:;
}
/* Called with bottom-half processing disabled.
- Called by tcp_write_timer() */
+ * Called by tcp_write_timer() and tcp_release_cb().
+ */
void tcp_write_timer_handler(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -576,13 +695,13 @@ void tcp_write_timer_handler(struct sock *sk)
if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
!icsk->icsk_pending)
- goto out;
+ return;
- if (time_after(icsk->icsk_timeout, jiffies)) {
- sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
- goto out;
+ if (time_after(tcp_timeout_expires(sk), jiffies)) {
+ sk_reset_timer(sk, &sk->tcp_retransmit_timer,
+ tcp_timeout_expires(sk));
+ return;
}
-
tcp_mstamp_refresh(tcp_sk(sk));
event = icsk->icsk_pending;
@@ -594,24 +713,23 @@ void tcp_write_timer_handler(struct sock *sk)
tcp_send_loss_probe(sk);
break;
case ICSK_TIME_RETRANS:
- icsk->icsk_pending = 0;
+ smp_store_release(&icsk->icsk_pending, 0);
tcp_retransmit_timer(sk);
break;
case ICSK_TIME_PROBE0:
- icsk->icsk_pending = 0;
+ smp_store_release(&icsk->icsk_pending, 0);
tcp_probe_timer(sk);
break;
}
-
-out:
- sk_mem_reclaim(sk);
}
static void tcp_write_timer(struct timer_list *t)
{
- struct inet_connection_sock *icsk =
- from_timer(icsk, t, icsk_retransmit_timer);
- struct sock *sk = &icsk->icsk_inet.sk;
+ struct sock *sk = timer_container_of(sk, t, tcp_retransmit_timer);
+
+ /* Avoid locking the socket when there is no pending event. */
+ if (!smp_load_acquire(&inet_csk(sk)->icsk_pending))
+ goto out;
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
@@ -622,6 +740,7 @@ static void tcp_write_timer(struct timer_list *t)
sock_hold(sk);
}
bh_unlock_sock(sk);
+out:
sock_put(sk);
}
@@ -631,7 +750,16 @@ void tcp_syn_ack_timeout(const struct request_sock *req)
__NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS);
}
-EXPORT_SYMBOL(tcp_syn_ack_timeout);
+
+void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len)
+{
+ sk_reset_timer(sk, &inet_csk(sk)->icsk_keepalive_timer, jiffies + len);
+}
+
+static void tcp_delete_keepalive_timer(struct sock *sk)
+{
+ sk_stop_timer(sk, &inet_csk(sk)->icsk_keepalive_timer);
+}
void tcp_set_keepalive(struct sock *sk, int val)
{
@@ -639,17 +767,17 @@ void tcp_set_keepalive(struct sock *sk, int val)
return;
if (val && !sock_flag(sk, SOCK_KEEPOPEN))
- inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
+ tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
else if (!val)
- inet_csk_delete_keepalive_timer(sk);
+ tcp_delete_keepalive_timer(sk);
}
-EXPORT_SYMBOL_GPL(tcp_set_keepalive);
-
+EXPORT_IPV6_MOD_GPL(tcp_set_keepalive);
-static void tcp_keepalive_timer (struct timer_list *t)
+static void tcp_keepalive_timer(struct timer_list *t)
{
- struct sock *sk = from_timer(sk, t, sk_timer);
- struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_connection_sock *icsk =
+ timer_container_of(icsk, t, icsk_keepalive_timer);
+ struct sock *sk = &icsk->icsk_inet.sk;
struct tcp_sock *tp = tcp_sk(sk);
u32 elapsed;
@@ -657,7 +785,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
/* Try again later. */
- inet_csk_reset_keepalive_timer (sk, HZ/20);
+ tcp_reset_keepalive_timer(sk, HZ/20);
goto out;
}
@@ -668,7 +796,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
tcp_mstamp_refresh(tp);
if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
- if (tp->linger2 >= 0) {
+ if (READ_ONCE(tp->linger2) >= 0) {
const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
if (tmo > 0) {
@@ -676,7 +804,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
goto out;
}
}
- tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_send_active_reset(sk, GFP_ATOMIC, SK_RST_REASON_TCP_STATE);
goto death;
}
@@ -693,20 +821,23 @@ static void tcp_keepalive_timer (struct timer_list *t)
elapsed = keepalive_time_elapsed(tp);
if (elapsed >= keepalive_time_when(tp)) {
+ u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);
+
/* If the TCP_USER_TIMEOUT option is enabled, use that
* to determine when to timeout instead.
*/
- if ((icsk->icsk_user_timeout != 0 &&
- elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) &&
+ if ((user_timeout != 0 &&
+ elapsed >= msecs_to_jiffies(user_timeout) &&
icsk->icsk_probes_out > 0) ||
- (icsk->icsk_user_timeout == 0 &&
+ (user_timeout == 0 &&
icsk->icsk_probes_out >= keepalive_probes(tp))) {
- tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_send_active_reset(sk, GFP_ATOMIC,
+ SK_RST_REASON_TCP_KEEPALIVE_TIMEOUT);
tcp_write_err(sk);
goto out;
}
if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
- icsk->icsk_probes_out++;
+ WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1);
elapsed = keepalive_intvl_when(tp);
} else {
/* If keepalive was lost due to local congestion,
@@ -719,10 +850,8 @@ static void tcp_keepalive_timer (struct timer_list *t)
elapsed = keepalive_time_when(tp) - elapsed;
}
- sk_mem_reclaim(sk);
-
resched:
- inet_csk_reset_keepalive_timer (sk, elapsed);
+ tcp_reset_keepalive_timer(sk, elapsed);
goto out;
death:
@@ -740,8 +869,15 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer)
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
- if (tp->compressed_ack)
+ if (tp->compressed_ack) {
+ /* Since we have to send one ack finally,
+ * subtract one from tp->compressed_ack to keep
+ * LINUX_MIB_TCPACKCOMPRESSED accurate.
+ */
+ tp->compressed_ack--;
+ tcp_mstamp_refresh(tp);
tcp_send_ack(sk);
+ }
} else {
if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED,
&sk->sk_tsq_flags))
@@ -758,11 +894,9 @@ void tcp_init_xmit_timers(struct sock *sk)
{
inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
&tcp_keepalive_timer);
- hrtimer_init(&tcp_sk(sk)->pacing_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_ABS_PINNED_SOFT);
- tcp_sk(sk)->pacing_timer.function = tcp_pace_kick;
+ hrtimer_setup(&tcp_sk(sk)->pacing_timer, tcp_pace_kick, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_PINNED_SOFT);
- hrtimer_init(&tcp_sk(sk)->compressed_ack_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL_PINNED_SOFT);
- tcp_sk(sk)->compressed_ack_timer.function = tcp_compressed_ack_kick;
+ hrtimer_setup(&tcp_sk(sk)->compressed_ack_timer, tcp_compressed_ack_kick, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_PINNED_SOFT);
}
diff --git a/net/ipv4/tcp_ulp.c b/net/ipv4/tcp_ulp.c
index a5995bb2eaca..2aa442128630 100644
--- a/net/ipv4/tcp_ulp.c
+++ b/net/ipv4/tcp_ulp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Pluggable TCP upper layer protocol support.
*
@@ -6,7 +7,7 @@
*
*/
-#include<linux/module.h>
+#include <linux/module.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
@@ -21,7 +22,8 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
{
struct tcp_ulp_ops *e;
- list_for_each_entry_rcu(e, &tcp_ulp_list, list) {
+ list_for_each_entry_rcu(e, &tcp_ulp_list, list,
+ lockdep_is_held(&tcp_ulp_list_lock)) {
if (strcmp(e->name, name) == 0)
return e;
}
@@ -29,18 +31,6 @@ static struct tcp_ulp_ops *tcp_ulp_find(const char *name)
return NULL;
}
-static struct tcp_ulp_ops *tcp_ulp_find_id(const int ulp)
-{
- struct tcp_ulp_ops *e;
-
- list_for_each_entry_rcu(e, &tcp_ulp_list, list) {
- if (e->uid == ulp)
- return e;
- }
-
- return NULL;
-}
-
static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
{
const struct tcp_ulp_ops *ulp = NULL;
@@ -63,18 +53,6 @@ static const struct tcp_ulp_ops *__tcp_ulp_find_autoload(const char *name)
return ulp;
}
-static const struct tcp_ulp_ops *__tcp_ulp_lookup(const int uid)
-{
- const struct tcp_ulp_ops *ulp;
-
- rcu_read_lock();
- ulp = tcp_ulp_find_id(uid);
- if (!ulp || !try_module_get(ulp->owner))
- ulp = NULL;
- rcu_read_unlock();
- return ulp;
-}
-
/* Attach new upper layer protocol to the list
* of available protocols.
*/
@@ -115,14 +93,30 @@ void tcp_get_available_ulp(char *buf, size_t maxlen)
offs += snprintf(buf + offs, maxlen - offs,
"%s%s",
offs == 0 ? "" : " ", ulp_ops->name);
+
+ if (WARN_ON_ONCE(offs >= maxlen))
+ break;
}
rcu_read_unlock();
}
+void tcp_update_ulp(struct sock *sk, struct proto *proto,
+ void (*write_space)(struct sock *sk))
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ulp_ops->update)
+ icsk->icsk_ulp_ops->update(sk, proto, write_space);
+}
+
void tcp_cleanup_ulp(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
+ /* No sock_owned_by_me() check here as at the time the
+ * stack calls this function, the socket is dead and
+ * about to be destroyed.
+ */
if (!icsk->icsk_ulp_ops)
return;
@@ -133,54 +127,42 @@ void tcp_cleanup_ulp(struct sock *sk)
icsk->icsk_ulp_ops = NULL;
}
-/* Change upper layer protocol for socket */
-int tcp_set_ulp(struct sock *sk, const char *name)
+static int __tcp_set_ulp(struct sock *sk, const struct tcp_ulp_ops *ulp_ops)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- const struct tcp_ulp_ops *ulp_ops;
- int err = 0;
+ int err;
+ err = -EEXIST;
if (icsk->icsk_ulp_ops)
- return -EEXIST;
+ goto out_err;
- ulp_ops = __tcp_ulp_find_autoload(name);
- if (!ulp_ops)
- return -ENOENT;
+ if (sk->sk_socket)
+ clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
- if (!ulp_ops->user_visible) {
- module_put(ulp_ops->owner);
- return -ENOENT;
- }
+ err = -ENOTCONN;
+ if (!ulp_ops->clone && sk->sk_state == TCP_LISTEN)
+ goto out_err;
err = ulp_ops->init(sk);
- if (err) {
- module_put(ulp_ops->owner);
- return err;
- }
+ if (err)
+ goto out_err;
icsk->icsk_ulp_ops = ulp_ops;
return 0;
+out_err:
+ module_put(ulp_ops->owner);
+ return err;
}
-int tcp_set_ulp_id(struct sock *sk, int ulp)
+int tcp_set_ulp(struct sock *sk, const char *name)
{
- struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_ulp_ops *ulp_ops;
- int err;
- if (icsk->icsk_ulp_ops)
- return -EEXIST;
+ sock_owned_by_me(sk);
- ulp_ops = __tcp_ulp_lookup(ulp);
+ ulp_ops = __tcp_ulp_find_autoload(name);
if (!ulp_ops)
return -ENOENT;
- err = ulp_ops->init(sk);
- if (err) {
- module_put(ulp_ops->owner);
- return err;
- }
-
- icsk->icsk_ulp_ops = ulp_ops;
- return 0;
+ return __tcp_set_ulp(sk, ulp_ops);
}
diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c
index ee113ff15fd0..786848ad37ea 100644
--- a/net/ipv4/tcp_vegas.c
+++ b/net/ipv4/tcp_vegas.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* TCP Vegas congestion control
*
@@ -158,7 +159,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event);
static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp)
{
- return min(tp->snd_ssthresh, tp->snd_cwnd);
+ return min(tp->snd_ssthresh, tcp_snd_cwnd(tp));
}
static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
@@ -216,14 +217,14 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* This is:
* (actual rate in segments) * baseRTT
*/
- target_cwnd = (u64)tp->snd_cwnd * vegas->baseRTT;
+ target_cwnd = (u64)tcp_snd_cwnd(tp) * vegas->baseRTT;
do_div(target_cwnd, rtt);
/* Calculate the difference between the window we had,
* and the window we would like to have. This quantity
* is the "Diff" from the Arizona Vegas papers.
*/
- diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT;
+ diff = tcp_snd_cwnd(tp) * (rtt-vegas->baseRTT) / vegas->baseRTT;
if (diff > gamma && tcp_in_slow_start(tp)) {
/* Going too fast. Time to slow down
@@ -237,7 +238,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* truncation robs us of full link
* utilization.
*/
- tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1);
+ tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp),
+ (u32)target_cwnd + 1));
tp->snd_ssthresh = tcp_vegas_ssthresh(tp);
} else if (tcp_in_slow_start(tp)) {
@@ -253,14 +255,14 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
/* The old window was too fast, so
* we slow down.
*/
- tp->snd_cwnd--;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1);
tp->snd_ssthresh
= tcp_vegas_ssthresh(tp);
} else if (diff < alpha) {
/* We don't have enough extra packets
* in the network, so speed up.
*/
- tp->snd_cwnd++;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
} else {
/* Sending just as fast as we
* should be.
@@ -268,10 +270,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked)
}
}
- if (tp->snd_cwnd < 2)
- tp->snd_cwnd = 2;
- else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
- tp->snd_cwnd = tp->snd_cwnd_clamp;
+ if (tcp_snd_cwnd(tp) < 2)
+ tcp_snd_cwnd_set(tp, 2);
+ else if (tcp_snd_cwnd(tp) > tp->snd_cwnd_clamp)
+ tcp_snd_cwnd_set(tp, tp->snd_cwnd_clamp);
tp->snd_ssthresh = tcp_current_ssthresh(sk);
}
@@ -292,10 +294,10 @@ size_t tcp_vegas_get_info(struct sock *sk, u32 ext, int *attr,
const struct vegas *ca = inet_csk_ca(sk);
if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
- info->vegas.tcpv_enabled = ca->doing_vegas_now,
- info->vegas.tcpv_rttcnt = ca->cntRTT,
- info->vegas.tcpv_rtt = ca->baseRTT,
- info->vegas.tcpv_minrtt = ca->minRTT,
+ info->vegas.tcpv_enabled = ca->doing_vegas_now;
+ info->vegas.tcpv_rttcnt = ca->cntRTT;
+ info->vegas.tcpv_rtt = ca->baseRTT;
+ info->vegas.tcpv_minrtt = ca->minRTT;
*attr = INET_DIAG_VEGASINFO;
return sizeof(struct tcpvegas_info);
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index 6fcf482d611b..366ff6f214b2 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* TCP Veno congestion control
*
@@ -6,7 +7,7 @@
* "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks."
* IEEE Journal on Selected Areas in Communication,
* Feb. 2003.
- * See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf
+ * See https://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf
*/
#include <linux/mm.h>
@@ -145,42 +146,45 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked)
rtt = veno->minrtt;
- target_cwnd = (u64)tp->snd_cwnd * veno->basertt;
+ target_cwnd = (u64)tcp_snd_cwnd(tp) * veno->basertt;
target_cwnd <<= V_PARAM_SHIFT;
do_div(target_cwnd, rtt);
- veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd;
+ veno->diff = (tcp_snd_cwnd(tp) << V_PARAM_SHIFT) - target_cwnd;
if (tcp_in_slow_start(tp)) {
- /* Slow start. */
- tcp_slow_start(tp, acked);
+ /* Slow start. */
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ goto done;
+ }
+
+ /* Congestion avoidance. */
+ if (veno->diff < beta) {
+ /* In the "non-congestive state", increase cwnd
+ * every rtt.
+ */
+ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
} else {
- /* Congestion avoidance. */
- if (veno->diff < beta) {
- /* In the "non-congestive state", increase cwnd
- * every rtt.
- */
- tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
- } else {
- /* In the "congestive state", increase cwnd
- * every other rtt.
- */
- if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (veno->inc &&
- tp->snd_cwnd < tp->snd_cwnd_clamp) {
- tp->snd_cwnd++;
- veno->inc = 0;
- } else
- veno->inc = 1;
- tp->snd_cwnd_cnt = 0;
+ /* In the "congestive state", increase cwnd
+ * every other rtt.
+ */
+ if (tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) {
+ if (veno->inc &&
+ tcp_snd_cwnd(tp) < tp->snd_cwnd_clamp) {
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1);
+ veno->inc = 0;
} else
- tp->snd_cwnd_cnt++;
- }
+ veno->inc = 1;
+ tp->snd_cwnd_cnt = 0;
+ } else
+ tp->snd_cwnd_cnt += acked;
}
- if (tp->snd_cwnd < 2)
- tp->snd_cwnd = 2;
- else if (tp->snd_cwnd > tp->snd_cwnd_clamp)
- tp->snd_cwnd = tp->snd_cwnd_clamp;
+done:
+ if (tcp_snd_cwnd(tp) < 2)
+ tcp_snd_cwnd_set(tp, 2);
+ else if (tcp_snd_cwnd(tp) > tp->snd_cwnd_clamp)
+ tcp_snd_cwnd_set(tp, tp->snd_cwnd_clamp);
}
/* Wipe the slate clean for the next rtt. */
/* veno->cntrtt = 0; */
@@ -195,10 +199,10 @@ static u32 tcp_veno_ssthresh(struct sock *sk)
if (veno->diff < beta)
/* in "non-congestive state", cut cwnd by 1/5 */
- return max(tp->snd_cwnd * 4 / 5, 2U);
+ return max(tcp_snd_cwnd(tp) * 4 / 5, 2U);
else
/* in "congestive state", cut cwnd by 1/2 */
- return max(tp->snd_cwnd >> 1U, 2U);
+ return max(tcp_snd_cwnd(tp) >> 1U, 2U);
}
static struct tcp_congestion_ops tcp_veno __read_mostly = {
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index bec9cafbe3f9..c6e97141eef2 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* TCP Westwood+: end-to-end bandwidth estimation for TCP
*
@@ -243,7 +244,8 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event)
switch (event) {
case CA_EVENT_COMPLETE_CWR:
- tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+ tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
+ tcp_snd_cwnd_set(tp, tp->snd_ssthresh);
break;
case CA_EVENT_LOSS:
tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk);
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 96e829b2e2fc..18b07ff5d20e 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
*
* YeAH TCP
@@ -35,8 +36,6 @@ struct yeah {
u32 reno_count;
u32 fast_count;
-
- u32 pkts_acked;
};
static void tcp_yeah_init(struct sock *sk)
@@ -56,18 +55,6 @@ static void tcp_yeah_init(struct sock *sk)
tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128);
}
-static void tcp_yeah_pkts_acked(struct sock *sk,
- const struct ack_sample *sample)
-{
- const struct inet_connection_sock *icsk = inet_csk(sk);
- struct yeah *yeah = inet_csk_ca(sk);
-
- if (icsk->icsk_ca_state == TCP_CA_Open)
- yeah->pkts_acked = sample->pkts_acked;
-
- tcp_vegas_pkts_acked(sk, sample);
-}
-
static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -76,24 +63,19 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (!tcp_is_cwnd_limited(sk))
return;
- if (tcp_in_slow_start(tp))
- tcp_slow_start(tp, acked);
+ if (tcp_in_slow_start(tp)) {
+ acked = tcp_slow_start(tp, acked);
+ if (!acked)
+ goto do_vegas;
+ }
- else if (!yeah->doing_reno_now) {
+ if (!yeah->doing_reno_now) {
/* Scalable */
-
- tp->snd_cwnd_cnt += yeah->pkts_acked;
- if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)) {
- if (tp->snd_cwnd < tp->snd_cwnd_clamp)
- tp->snd_cwnd++;
- tp->snd_cwnd_cnt = 0;
- }
-
- yeah->pkts_acked = 1;
-
+ tcp_cong_avoid_ai(tp, min(tcp_snd_cwnd(tp), TCP_SCALABLE_AI_CNT),
+ acked);
} else {
/* Reno */
- tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1);
+ tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked);
}
/* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
@@ -117,7 +99,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
* of bytes we send in an RTT is often less than our cwnd will allow.
* So we keep track of our cwnd separately, in v_beg_snd_cwnd.
*/
-
+do_vegas:
if (after(ack, yeah->vegas.beg_snd_nxt)) {
/* We do the Vegas calculations only if we got enough RTT
* samples that we can be reasonably sure that we got
@@ -148,7 +130,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
/* Compute excess number of packets above bandwidth
* Avoid doing full 64 bit divide.
*/
- bw = tp->snd_cwnd;
+ bw = tcp_snd_cwnd(tp);
bw *= rtt - yeah->vegas.baseRTT;
do_div(bw, rtt);
queue = bw;
@@ -156,20 +138,20 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
if (queue > TCP_YEAH_ALPHA ||
rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) {
if (queue > TCP_YEAH_ALPHA &&
- tp->snd_cwnd > yeah->reno_count) {
+ tcp_snd_cwnd(tp) > yeah->reno_count) {
u32 reduction = min(queue / TCP_YEAH_GAMMA ,
- tp->snd_cwnd >> TCP_YEAH_EPSILON);
+ tcp_snd_cwnd(tp) >> TCP_YEAH_EPSILON);
- tp->snd_cwnd -= reduction;
+ tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - reduction);
- tp->snd_cwnd = max(tp->snd_cwnd,
- yeah->reno_count);
+ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp),
+ yeah->reno_count));
- tp->snd_ssthresh = tp->snd_cwnd;
+ tp->snd_ssthresh = tcp_snd_cwnd(tp);
}
if (yeah->reno_count <= 2)
- yeah->reno_count = max(tp->snd_cwnd>>1, 2U);
+ yeah->reno_count = max(tcp_snd_cwnd(tp)>>1, 2U);
else
yeah->reno_count++;
@@ -194,7 +176,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked)
*/
yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt;
yeah->vegas.beg_snd_nxt = tp->snd_nxt;
- yeah->vegas.beg_snd_cwnd = tp->snd_cwnd;
+ yeah->vegas.beg_snd_cwnd = tcp_snd_cwnd(tp);
/* Wipe the slate clean for the next RTT. */
yeah->vegas.cntRTT = 0;
@@ -211,16 +193,16 @@ static u32 tcp_yeah_ssthresh(struct sock *sk)
if (yeah->doing_reno_now < TCP_YEAH_RHO) {
reduction = yeah->lastQ;
- reduction = min(reduction, max(tp->snd_cwnd>>1, 2U));
+ reduction = min(reduction, max(tcp_snd_cwnd(tp)>>1, 2U));
- reduction = max(reduction, tp->snd_cwnd >> TCP_YEAH_DELTA);
+ reduction = max(reduction, tcp_snd_cwnd(tp) >> TCP_YEAH_DELTA);
} else
- reduction = max(tp->snd_cwnd>>1, 2U);
+ reduction = max(tcp_snd_cwnd(tp)>>1, 2U);
yeah->fast_count = 0;
yeah->reno_count = max(yeah->reno_count>>1, 2U);
- return max_t(int, tp->snd_cwnd - reduction, 2);
+ return max_t(int, tcp_snd_cwnd(tp) - reduction, 2);
}
static struct tcp_congestion_ops tcp_yeah __read_mostly = {
@@ -231,7 +213,7 @@ static struct tcp_congestion_ops tcp_yeah __read_mostly = {
.set_state = tcp_vegas_state,
.cwnd_event = tcp_vegas_cwnd_event,
.get_info = tcp_vegas_get_info,
- .pkts_acked = tcp_yeah_pkts_acked,
+ .pkts_acked = tcp_vegas_pkts_acked,
.owner = THIS_MODULE,
.name = "yeah",
@@ -239,7 +221,7 @@ static struct tcp_congestion_ops tcp_yeah __read_mostly = {
static int __init tcp_yeah_register(void)
{
- BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE);
+ BUILD_BUG_ON(sizeof(struct yeah) > ICSK_CA_PRIV_SIZE);
tcp_register_congestion_control(&tcp_yeah);
return 0;
}
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index c0630013c1ae..4c1f836aae38 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* tunnel4.c: Generic IP tunnel transformer.
*
* Copyright (C) 2003 David S. Miller (davem@redhat.com)
@@ -109,6 +110,33 @@ drop:
return 0;
}
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+static int tunnel4_rcv_cb(struct sk_buff *skb, u8 proto, int err)
+{
+ struct xfrm_tunnel __rcu *head;
+ struct xfrm_tunnel *handler;
+ int ret;
+
+ head = (proto == IPPROTO_IPIP) ? tunnel4_handlers : tunnel64_handlers;
+
+ for_each_tunnel_rcu(head, handler) {
+ if (handler->cb_handler) {
+ ret = handler->cb_handler(skb, err);
+ if (ret <= 0)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static const struct xfrm_input_afinfo tunnel4_input_afinfo = {
+ .family = AF_INET,
+ .is_ipip = true,
+ .callback = tunnel4_rcv_cb,
+};
+#endif
+
#if IS_ENABLED(CONFIG_IPV6)
static int tunnel64_rcv(struct sk_buff *skb)
{
@@ -149,34 +177,40 @@ drop:
}
#endif
-static void tunnel4_err(struct sk_buff *skb, u32 info)
+static int tunnel4_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
for_each_tunnel_rcu(tunnel4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
#if IS_ENABLED(CONFIG_IPV6)
-static void tunnel64_err(struct sk_buff *skb, u32 info)
+static int tunnel64_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
for_each_tunnel_rcu(tunnel64_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
#endif
#if IS_ENABLED(CONFIG_MPLS)
-static void tunnelmpls4_err(struct sk_buff *skb, u32 info)
+static int tunnelmpls4_err(struct sk_buff *skb, u32 info)
{
struct xfrm_tunnel *handler;
for_each_tunnel_rcu(tunnelmpls4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
#endif
@@ -184,7 +218,6 @@ static const struct net_protocol tunnel4_protocol = {
.handler = tunnel4_rcv,
.err_handler = tunnel4_err,
.no_policy = 1,
- .netns_ok = 1,
};
#if IS_ENABLED(CONFIG_IPV6)
@@ -192,7 +225,6 @@ static const struct net_protocol tunnel64_protocol = {
.handler = tunnel64_rcv,
.err_handler = tunnel64_err,
.no_policy = 1,
- .netns_ok = 1,
};
#endif
@@ -201,7 +233,6 @@ static const struct net_protocol tunnelmpls4_protocol = {
.handler = tunnelmpls4_rcv,
.err_handler = tunnelmpls4_err,
.no_policy = 1,
- .netns_ok = 1,
};
#endif
@@ -224,6 +255,18 @@ static int __init tunnel4_init(void)
goto err;
}
#endif
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+ if (xfrm_input_register_afinfo(&tunnel4_input_afinfo)) {
+ inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP);
+#if IS_ENABLED(CONFIG_IPV6)
+ inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6);
+#endif
+#if IS_ENABLED(CONFIG_MPLS)
+ inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS);
+#endif
+ goto err;
+ }
+#endif
return 0;
err:
@@ -233,6 +276,10 @@ err:
static void __exit tunnel4_fini(void)
{
+#if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL)
+ if (xfrm_input_unregister_afinfo(&tunnel4_input_afinfo))
+ pr_err("tunnel4 close: can't remove input afinfo\n");
+#endif
#if IS_ENABLED(CONFIG_MPLS)
if (inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS))
pr_err("tunnelmpls4 close: can't remove protocol\n");
@@ -247,4 +294,5 @@ static void __exit tunnel4_fini(void)
module_init(tunnel4_init);
module_exit(tunnel4_fini);
+MODULE_DESCRIPTION("IPv4 XFRM tunnel library");
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c32a4c16b7ff..ffe074cb5865 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -67,23 +68,17 @@
* YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
* Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind
* a single port at the same time.
- * Derek Atkins <derek@ihtfp.com>: Add Encapulation Support
+ * Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
* James Chapman : Add L2TP encapsulation type.
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "UDP: " fmt
+#include <linux/bpf-cgroup.h>
#include <linux/uaccess.h>
#include <asm/ioctls.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/highmem.h>
-#include <linux/swap.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
@@ -98,6 +93,7 @@
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
+#include <linux/sock_diag.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
@@ -105,38 +101,41 @@
#include <net/net_namespace.h>
#include <net/icmp.h>
#include <net/inet_hashtables.h>
+#include <net/ip.h>
+#include <net/ip_tunnels.h>
#include <net/route.h>
#include <net/checksum.h>
+#include <net/gso.h>
#include <net/xfrm.h>
#include <trace/events/udp.h>
#include <linux/static_key.h>
+#include <linux/btf_ids.h>
#include <trace/events/skb.h>
#include <net/busy_poll.h>
#include "udp_impl.h"
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
+#include <net/udp_tunnel.h>
+#include <net/gro.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6_stubs.h>
+#endif
+#include <net/rps.h>
struct udp_table udp_table __read_mostly;
-EXPORT_SYMBOL(udp_table);
long sysctl_udp_mem[3] __read_mostly;
-EXPORT_SYMBOL(sysctl_udp_mem);
+EXPORT_IPV6_MOD(sysctl_udp_mem);
-atomic_long_t udp_memory_allocated;
-EXPORT_SYMBOL(udp_memory_allocated);
+DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc);
+EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc);
#define MAX_UDP_PORTS 65536
-#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
+#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET)
-/* IPCB reference means this can not be used from early demux */
-static bool udp_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
+static struct udp_table *udp_get_table_prot(struct sock *sk)
{
-#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
- if (!net->ipv4.sysctl_udp_l3mdev_accept &&
- skb && ipv4_l3mdev_skb(IPCB(skb)->flags))
- return true;
-#endif
- return false;
+ return sk->sk_prot->h.udp_table ? : sock_net(sk)->ipv4.udp_table;
}
static int udp_lib_lport_inuse(struct net *net, __u16 num,
@@ -144,8 +143,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
unsigned long *bitmap,
struct sock *sk, unsigned int log)
{
+ kuid_t uid = sk_uid(sk);
struct sock *sk2;
- kuid_t uid = sock_i_uid(sk);
sk_for_each(sk2, &hslot->head) {
if (net_eq(sock_net(sk2), net) &&
@@ -157,7 +156,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
inet_rcv_saddr_equal(sk, sk2, true)) {
if (sk2->sk_reuseport && sk->sk_reuseport &&
!rcu_access_pointer(sk->sk_reuseport_cb) &&
- uid_eq(uid, sock_i_uid(sk2))) {
+ uid_eq(uid, sk_uid(sk2))) {
if (!bitmap)
return 0;
} else {
@@ -179,8 +178,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
struct udp_hslot *hslot2,
struct sock *sk)
{
+ kuid_t uid = sk_uid(sk);
struct sock *sk2;
- kuid_t uid = sock_i_uid(sk);
int res = 0;
spin_lock(&hslot2->lock);
@@ -194,7 +193,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
inet_rcv_saddr_equal(sk, sk2, true)) {
if (sk2->sk_reuseport && sk->sk_reuseport &&
!rcu_access_pointer(sk->sk_reuseport_cb) &&
- uid_eq(uid, sock_i_uid(sk2))) {
+ uid_eq(uid, sk_uid(sk2))) {
res = 0;
} else {
res = 1;
@@ -209,7 +208,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
{
struct net *net = sock_net(sk);
- kuid_t uid = sock_i_uid(sk);
+ kuid_t uid = sk_uid(sk);
struct sock *sk2;
sk_for_each(sk2, &hslot->head) {
@@ -219,7 +218,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
(udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
(sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
- sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+ sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) &&
inet_rcv_saddr_equal(sk, sk2, false)) {
return reuseport_add_sock(sk, sk2,
inet_rcv_saddr_any(sk));
@@ -240,21 +239,21 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot)
int udp_lib_get_port(struct sock *sk, unsigned short snum,
unsigned int hash2_nulladdr)
{
+ struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2;
- struct udp_table *udptable = sk->sk_prot->h.udp_table;
- int error = 1;
struct net *net = sock_net(sk);
+ int error = -EADDRINUSE;
if (!snum) {
+ DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
+ unsigned short first, last;
int low, high, remaining;
unsigned int rand;
- unsigned short first, last;
- DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN);
- inet_get_local_port_range(net, &low, &high);
+ inet_sk_get_local_port_range(sk, &low, &high);
remaining = (high - low) + 1;
- rand = prandom_u32();
+ rand = get_random_u32();
first = reciprocal_scale(rand, remaining) + low;
/*
* force rand to be an odd multiple of UDP_HTABLE_SIZE
@@ -327,6 +326,8 @@ found:
goto fail_unlock;
}
+ sock_set_flag(sk, SOCK_RCU_FREE);
+
sk_add_node_rcu(sk, &hslot->head);
hslot->count++;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -343,14 +344,14 @@ found:
hslot2->count++;
spin_unlock(&hslot2->lock);
}
- sock_set_flag(sk, SOCK_RCU_FREE);
+
error = 0;
fail_unlock:
spin_unlock_bh(&hslot->lock);
fail:
return error;
}
-EXPORT_SYMBOL(udp_lib_get_port);
+EXPORT_IPV6_MOD(udp_lib_get_port);
int udp_v4_get_port(struct sock *sk, unsigned short snum)
{
@@ -364,28 +365,26 @@ int udp_v4_get_port(struct sock *sk, unsigned short snum)
return udp_lib_get_port(sk, snum, hash2_nulladdr);
}
-static int compute_score(struct sock *sk, struct net *net,
+static int compute_score(struct sock *sk, const struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned short hnum,
- int dif, int sdif, bool exact_dif)
+ int dif, int sdif)
{
int score;
struct inet_sock *inet;
+ bool dev_match;
if (!net_eq(sock_net(sk), net) ||
udp_sk(sk)->udp_port_hash != hnum ||
ipv6_only_sock(sk))
return -1;
- score = (sk->sk_family == PF_INET) ? 2 : 1;
- inet = inet_sk(sk);
+ if (sk->sk_rcv_saddr != daddr)
+ return -1;
- if (inet->inet_rcv_saddr) {
- if (inet->inet_rcv_saddr != daddr)
- return -1;
- score += 4;
- }
+ score = (sk->sk_family == PF_INET) ? 2 : 1;
+ inet = inet_sk(sk);
if (inet->inet_daddr) {
if (inet->inet_daddr != saddr)
return -1;
@@ -398,132 +397,348 @@ static int compute_score(struct sock *sk, struct net *net,
score += 4;
}
- if (sk->sk_bound_dev_if || exact_dif) {
- bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
- if (!dev_match)
- return -1;
- if (sk->sk_bound_dev_if)
- score += 4;
- }
+ dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+ dif, sdif);
+ if (!dev_match)
+ return -1;
+ if (sk->sk_bound_dev_if)
+ score += 4;
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
score++;
return score;
}
-static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
- const __u16 lport, const __be32 faddr,
- const __be16 fport)
+u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport,
+ const __be32 faddr, const __be16 fport)
{
- static u32 udp_ehash_secret __read_mostly;
-
net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret));
return __inet_ehashfn(laddr, lport, faddr, fport,
udp_ehash_secret + net_hash_mix(net));
}
+EXPORT_IPV6_MOD(udp_ehashfn);
+
+/**
+ * udp4_lib_lookup1() - Simplified lookup using primary hash (destination port)
+ * @net: Network namespace
+ * @saddr: Source address, network order
+ * @sport: Source port, network order
+ * @daddr: Destination address, network order
+ * @hnum: Destination port, host order
+ * @dif: Destination interface index
+ * @sdif: Destination bridge port index, if relevant
+ * @udptable: Set of UDP hash tables
+ *
+ * Simplified lookup to be used as fallback if no sockets are found due to a
+ * potential race between (receive) address change, and lookup happening before
+ * the rehash operation. This function ignores SO_REUSEPORT groups while scoring
+ * result sockets, because if we have one, we don't need the fallback at all.
+ *
+ * Called under rcu_read_lock().
+ *
+ * Return: socket with highest matching score if any, NULL if none
+ */
+static struct sock *udp4_lib_lookup1(const struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum,
+ int dif, int sdif,
+ const struct udp_table *udptable)
+{
+ unsigned int slot = udp_hashfn(net, hnum, udptable->mask);
+ struct udp_hslot *hslot = &udptable->hash[slot];
+ struct sock *sk, *result = NULL;
+ int score, badness = 0;
+
+ sk_for_each_rcu(sk, &hslot->head) {
+ score = compute_score(sk, net,
+ saddr, sport, daddr, hnum, dif, sdif);
+ if (score > badness) {
+ result = sk;
+ badness = score;
+ }
+ }
+
+ return result;
+}
/* called with rcu_read_lock() */
-static struct sock *udp4_lib_lookup2(struct net *net,
+static struct sock *udp4_lib_lookup2(const struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned int hnum,
- int dif, int sdif, bool exact_dif,
+ int dif, int sdif,
struct udp_hslot *hslot2,
struct sk_buff *skb)
{
struct sock *sk, *result;
int score, badness;
- u32 hash = 0;
+ bool need_rescore;
result = NULL;
badness = 0;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
- score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, sdif, exact_dif);
+ need_rescore = false;
+rescore:
+ score = compute_score(need_rescore ? result : sk, net, saddr,
+ sport, daddr, hnum, dif, sdif);
if (score > badness) {
- if (sk->sk_reuseport) {
- hash = udp_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, hash, skb,
- sizeof(struct udphdr));
- if (result)
- return result;
- }
badness = score;
- result = sk;
+
+ if (need_rescore)
+ continue;
+
+ if (sk->sk_state == TCP_ESTABLISHED) {
+ result = sk;
+ continue;
+ }
+
+ result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
+ saddr, sport, daddr, hnum, udp_ehashfn);
+ if (!result) {
+ result = sk;
+ continue;
+ }
+
+ /* Fall back to scoring if group has connections */
+ if (!reuseport_has_conns(sk))
+ return result;
+
+ /* Reuseport logic returned an error, keep original score. */
+ if (IS_ERR(result))
+ continue;
+
+ /* compute_score is too long of a function to be
+ * inlined, and calling it again here yields
+ * measurable overhead for some
+ * workloads. Work around it by jumping
+ * backwards to rescore 'result'.
+ */
+ need_rescore = true;
+ goto rescore;
}
}
return result;
}
+#if IS_ENABLED(CONFIG_BASE_SMALL)
+static struct sock *udp4_lib_lookup4(const struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum,
+ int dif, int sdif,
+ struct udp_table *udptable)
+{
+ return NULL;
+}
+
+static void udp_rehash4(struct udp_table *udptable, struct sock *sk,
+ u16 newhash4)
+{
+}
+
+static void udp_unhash4(struct udp_table *udptable, struct sock *sk)
+{
+}
+#else /* !CONFIG_BASE_SMALL */
+static struct sock *udp4_lib_lookup4(const struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum,
+ int dif, int sdif,
+ struct udp_table *udptable)
+{
+ const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+ const struct hlist_nulls_node *node;
+ struct udp_hslot *hslot4;
+ unsigned int hash4, slot;
+ struct udp_sock *up;
+ struct sock *sk;
+
+ hash4 = udp_ehashfn(net, daddr, hnum, saddr, sport);
+ slot = hash4 & udptable->mask;
+ hslot4 = &udptable->hash4[slot];
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
+
+begin:
+ /* SLAB_TYPESAFE_BY_RCU not used, so we don't need to touch sk_refcnt */
+ udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) {
+ sk = (struct sock *)up;
+ if (inet_match(net, sk, acookie, ports, dif, sdif))
+ return sk;
+ }
+
+ /* if the nulls value we got at the end of this lookup is not the
+ * expected one, we must restart lookup. We probably met an item that
+ * was moved to another chain due to rehash.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+
+ return NULL;
+}
+
+/* udp_rehash4() only checks hslot4, and hash4_cnt is not processed. */
+static void udp_rehash4(struct udp_table *udptable, struct sock *sk,
+ u16 newhash4)
+{
+ struct udp_hslot *hslot4, *nhslot4;
+
+ hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash);
+ nhslot4 = udp_hashslot4(udptable, newhash4);
+ udp_sk(sk)->udp_lrpa_hash = newhash4;
+
+ if (hslot4 != nhslot4) {
+ spin_lock_bh(&hslot4->lock);
+ hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node);
+ hslot4->count--;
+ spin_unlock_bh(&hslot4->lock);
+
+ spin_lock_bh(&nhslot4->lock);
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node,
+ &nhslot4->nulls_head);
+ nhslot4->count++;
+ spin_unlock_bh(&nhslot4->lock);
+ }
+}
+
+static void udp_unhash4(struct udp_table *udptable, struct sock *sk)
+{
+ struct udp_hslot *hslot2, *hslot4;
+
+ if (udp_hashed4(sk)) {
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash);
+
+ spin_lock(&hslot4->lock);
+ hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node);
+ hslot4->count--;
+ spin_unlock(&hslot4->lock);
+
+ spin_lock(&hslot2->lock);
+ udp_hash4_dec(hslot2);
+ spin_unlock(&hslot2->lock);
+ }
+}
+
+void udp_lib_hash4(struct sock *sk, u16 hash)
+{
+ struct udp_hslot *hslot, *hslot2, *hslot4;
+ struct net *net = sock_net(sk);
+ struct udp_table *udptable;
+
+ /* Connected udp socket can re-connect to another remote address, which
+ * will be handled by rehash. Thus no need to redo hash4 here.
+ */
+ if (udp_hashed4(sk))
+ return;
+
+ udptable = net->ipv4.udp_table;
+ hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash);
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ hslot4 = udp_hashslot4(udptable, hash);
+ udp_sk(sk)->udp_lrpa_hash = hash;
+
+ spin_lock_bh(&hslot->lock);
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
+
+ spin_lock(&hslot4->lock);
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node,
+ &hslot4->nulls_head);
+ hslot4->count++;
+ spin_unlock(&hslot4->lock);
+
+ spin_lock(&hslot2->lock);
+ udp_hash4_inc(hslot2);
+ spin_unlock(&hslot2->lock);
+
+ spin_unlock_bh(&hslot->lock);
+}
+EXPORT_IPV6_MOD(udp_lib_hash4);
+
+/* call with sock lock */
+void udp4_hash4(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ unsigned int hash;
+
+ if (sk_unhashed(sk) || sk->sk_rcv_saddr == htonl(INADDR_ANY))
+ return;
+
+ hash = udp_ehashfn(net, sk->sk_rcv_saddr, sk->sk_num,
+ sk->sk_daddr, sk->sk_dport);
+
+ udp_lib_hash4(sk, hash);
+}
+EXPORT_IPV6_MOD(udp4_hash4);
+#endif /* CONFIG_BASE_SMALL */
+
/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
* harder than this. -DaveM
*/
-struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
+struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr,
__be16 sport, __be32 daddr, __be16 dport, int dif,
int sdif, struct udp_table *udptable, struct sk_buff *skb)
{
- struct sock *sk, *result;
unsigned short hnum = ntohs(dport);
- unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
- struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
- bool exact_dif = udp_lib_exact_dif_match(net, skb);
- int score, badness;
- u32 hash = 0;
-
- if (hslot->count > 10) {
- hash2 = ipv4_portaddr_hash(net, daddr, hnum);
- slot2 = hash2 & udptable->mask;
- hslot2 = &udptable->hash2[slot2];
- if (hslot->count < hslot2->count)
- goto begin;
-
- result = udp4_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif, sdif,
- exact_dif, hslot2, skb);
- if (!result) {
- unsigned int old_slot2 = slot2;
- hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
- slot2 = hash2 & udptable->mask;
- /* avoid searching the same slot again. */
- if (unlikely(slot2 == old_slot2))
- return result;
-
- hslot2 = &udptable->hash2[slot2];
- if (hslot->count < hslot2->count)
- goto begin;
-
- result = udp4_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif, sdif,
- exact_dif, hslot2, skb);
- }
- if (unlikely(IS_ERR(result)))
- return NULL;
- return result;
- }
-begin:
- result = NULL;
- badness = 0;
- sk_for_each_rcu(sk, &hslot->head) {
- score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, sdif, exact_dif);
- if (score > badness) {
- if (sk->sk_reuseport) {
- hash = udp_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, hash, skb,
- sizeof(struct udphdr));
- if (unlikely(IS_ERR(result)))
- return NULL;
- if (result)
- return result;
- }
+ struct udp_hslot *hslot2;
+ struct sock *result, *sk;
+ unsigned int hash2;
+
+ hash2 = ipv4_portaddr_hash(net, daddr, hnum);
+ hslot2 = udp_hashslot2(udptable, hash2);
+
+ if (udp_has_hash4(hslot2)) {
+ result = udp4_lib_lookup4(net, saddr, sport, daddr, hnum,
+ dif, sdif, udptable);
+ if (result) /* udp4_lib_lookup4 return sk or NULL */
+ return result;
+ }
+
+ /* Lookup connected or non-wildcard socket */
+ result = udp4_lib_lookup2(net, saddr, sport,
+ daddr, hnum, dif, sdif,
+ hslot2, skb);
+ if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED)
+ goto done;
+
+ /* Lookup redirect from BPF */
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
+ udptable == net->ipv4.udp_table) {
+ sk = inet_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr),
+ saddr, sport, daddr, hnum, dif,
+ udp_ehashfn);
+ if (sk) {
result = sk;
- badness = score;
+ goto done;
}
}
+
+ /* Got non-wildcard socket or error on first lookup */
+ if (result)
+ goto done;
+
+ /* Lookup wildcard sockets */
+ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
+ hslot2 = udp_hashslot2(udptable, hash2);
+
+ result = udp4_lib_lookup2(net, saddr, sport,
+ htonl(INADDR_ANY), hnum, dif, sdif,
+ hslot2, skb);
+ if (!IS_ERR_OR_NULL(result))
+ goto done;
+
+ /* Primary hash (destination port) lookup as fallback for this race:
+ * 1. __ip4_datagram_connect() sets sk_rcv_saddr
+ * 2. lookup (this function): new sk_rcv_saddr, hashes not updated yet
+ * 3. rehash operation updating _secondary and four-tuple_ hashes
+ * The primary hash doesn't need an update after 1., so, thanks to this
+ * further step, 1. and 3. don't need to be atomic against the lookup.
+ */
+ result = udp4_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif,
+ udptable);
+
+done:
+ if (IS_ERR(result))
+ return NULL;
return result;
}
EXPORT_SYMBOL_GPL(__udp4_lib_lookup);
@@ -539,24 +754,32 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
inet_sdif(skb), udptable, skb);
}
-struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
+struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb,
__be16 sport, __be16 dport)
{
- return __udp4_lib_lookup_skb(skb, sport, dport, &udp_table);
+ const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+ const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
+ struct net *net = dev_net(skb->dev);
+ int iif, sdif;
+
+ inet_get_iif_sdif(skb, &iif, &sdif);
+
+ return __udp4_lib_lookup(net, iph->saddr, sport,
+ iph->daddr, dport, iif,
+ sdif, net->ipv4.udp_table, NULL);
}
-EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);
/* Must be called under rcu_read_lock().
* Does increment socket refcount.
*/
#if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4)
-struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
+struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif)
{
struct sock *sk;
sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport,
- dif, 0, &udp_table, NULL);
+ dif, 0, net->ipv4.udp_table, NULL);
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
return sk;
@@ -564,12 +787,12 @@ struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
#endif
-static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
+static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk,
__be16 loc_port, __be32 loc_addr,
__be16 rmt_port, __be32 rmt_addr,
int dif, int sdif, unsigned short hnum)
{
- struct inet_sock *inet = inet_sk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
if (!net_eq(sock_net(sk), net) ||
udp_sk(sk)->udp_port_hash != hnum ||
@@ -577,14 +800,123 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
(inet->inet_dport != rmt_port && inet->inet_dport) ||
(inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) ||
ipv6_only_sock(sk) ||
- (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif &&
- sk->sk_bound_dev_if != sdif))
+ !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
return false;
if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif))
return false;
return true;
}
+DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
+EXPORT_IPV6_MOD(udp_encap_needed_key);
+
+#if IS_ENABLED(CONFIG_IPV6)
+DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
+EXPORT_IPV6_MOD(udpv6_encap_needed_key);
+#endif
+
+void udp_encap_enable(void)
+{
+ static_branch_inc(&udp_encap_needed_key);
+}
+EXPORT_SYMBOL(udp_encap_enable);
+
+void udp_encap_disable(void)
+{
+ static_branch_dec(&udp_encap_needed_key);
+}
+EXPORT_SYMBOL(udp_encap_disable);
+
+/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
+ * through error handlers in encapsulations looking for a match.
+ */
+static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info)
+{
+ int i;
+
+ for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
+ int (*handler)(struct sk_buff *skb, u32 info);
+ const struct ip_tunnel_encap_ops *encap;
+
+ encap = rcu_dereference(iptun_encaps[i]);
+ if (!encap)
+ continue;
+ handler = encap->err_handler;
+ if (handler && !handler(skb, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force the
+ * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
+ * lwtunnels might actually break this assumption by being configured with
+ * different destination ports on endpoints, in this case we won't be able to
+ * trace ICMP messages back to them.
+ *
+ * If this doesn't match any socket, probe tunnels with arbitrary destination
+ * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
+ * we've sent packets to won't necessarily match the local destination port.
+ *
+ * Then ask the tunnel implementation to match the error against a valid
+ * association.
+ *
+ * Return an error if we can't find a match, the socket if we need further
+ * processing, zero otherwise.
+ */
+static struct sock *__udp4_lib_err_encap(struct net *net,
+ const struct iphdr *iph,
+ struct udphdr *uh,
+ struct udp_table *udptable,
+ struct sock *sk,
+ struct sk_buff *skb, u32 info)
+{
+ int (*lookup)(struct sock *sk, struct sk_buff *skb);
+ int network_offset, transport_offset;
+ struct udp_sock *up;
+
+ network_offset = skb_network_offset(skb);
+ transport_offset = skb_transport_offset(skb);
+
+ /* Network header needs to point to the outer IPv4 header inside ICMP */
+ skb_reset_network_header(skb);
+
+ /* Transport header needs to point to the UDP header */
+ skb_set_transport_header(skb, iph->ihl << 2);
+
+ if (sk) {
+ up = udp_sk(sk);
+
+ lookup = READ_ONCE(up->encap_err_lookup);
+ if (lookup && lookup(sk, skb))
+ sk = NULL;
+
+ goto out;
+ }
+
+ sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
+ iph->saddr, uh->dest, skb->dev->ifindex, 0,
+ udptable, NULL);
+ if (sk) {
+ up = udp_sk(sk);
+
+ lookup = READ_ONCE(up->encap_err_lookup);
+ if (!lookup || lookup(sk, skb))
+ sk = NULL;
+ }
+
+out:
+ if (!sk)
+ sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info));
+
+ skb_set_transport_header(skb, transport_offset);
+ skb_set_network_header(skb, network_offset);
+
+ return sk;
+}
+
/*
* This routine is called by the ICMP module when it gets some
* sort of error condition. If err < 0 then the socket should
@@ -596,24 +928,39 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
* to find the appropriate port.
*/
-void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
+int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
{
struct inet_sock *inet;
const struct iphdr *iph = (const struct iphdr *)skb->data;
struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
+ bool tunnel = false;
struct sock *sk;
int harderr;
int err;
struct net *net = dev_net(skb->dev);
sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
- iph->saddr, uh->source, skb->dev->ifindex, 0,
- udptable, NULL);
- if (!sk) {
- __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
- return; /* No socket for error */
+ iph->saddr, uh->source, skb->dev->ifindex,
+ inet_sdif(skb), udptable, NULL);
+
+ if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) {
+ /* No socket for error: try tunnels before discarding */
+ if (static_branch_unlikely(&udp_encap_needed_key)) {
+ sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb,
+ info);
+ if (!sk)
+ return 0;
+ } else
+ sk = ERR_PTR(-ENOENT);
+
+ if (IS_ERR(sk)) {
+ __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
+ return PTR_ERR(sk);
+ }
+
+ tunnel = true;
}
err = 0;
@@ -634,7 +981,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
case ICMP_DEST_UNREACH:
if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
ipv4_sk_update_pmtu(skb, sk, info);
- if (inet->pmtudisc != IP_PMTUDISC_DONT) {
+ if (READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT) {
err = EMSGSIZE;
harderr = 1;
break;
@@ -656,21 +1003,28 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
* RFC1122: OK. Passes ICMP errors back to application, as per
* 4.1.3.3.
*/
- if (!inet->recverr) {
+ if (tunnel) {
+ /* ...not for tunnels though: we don't have a sending socket */
+ if (udp_sk(sk)->encap_err_rcv)
+ udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, info,
+ (u8 *)(uh+1));
+ goto out;
+ }
+ if (!inet_test_bit(RECVERR, sk)) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
goto out;
} else
ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1));
sk->sk_err = err;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
out:
- return;
+ return 0;
}
-void udp_err(struct sk_buff *skb, u32 info)
+int udp_err(struct sk_buff *skb, u32 info)
{
- __udp4_lib_err(skb, info, &udp_table);
+ return __udp4_lib_err(skb, info, dev_net(skb->dev)->ipv4.udp_table);
}
/*
@@ -682,11 +1036,11 @@ void udp_flush_pending_frames(struct sock *sk)
if (up->pending) {
up->len = 0;
- up->pending = 0;
+ WRITE_ONCE(up->pending, 0);
ip_flush_pending_frames(sk);
}
}
-EXPORT_SYMBOL(udp_flush_pending_frames);
+EXPORT_IPV6_MOD(udp_flush_pending_frames);
/**
* udp4_hwcsum - handle outgoing HW checksumming
@@ -766,10 +1120,11 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
struct sock *sk = skb->sk;
struct inet_sock *inet = inet_sk(sk);
struct udphdr *uh;
- int err = 0;
+ int err;
int is_udplite = IS_UDPLITE(sk);
int offset = skb_transport_offset(skb);
int len = skb->len - offset;
+ int datalen = len - sizeof(*uh);
__wsum csum = 0;
/*
@@ -785,21 +1140,32 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4,
const int hlen = skb_network_header_len(skb) +
sizeof(struct udphdr);
- if (hlen + cork->gso_size > cork->fragsize)
- return -EINVAL;
- if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
+ if (hlen + min(datalen, cork->gso_size) > cork->fragsize) {
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+ if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
+ kfree_skb(skb);
return -EINVAL;
- if (sk->sk_no_check_tx)
+ }
+ if (sk->sk_no_check_tx) {
+ kfree_skb(skb);
return -EINVAL;
- if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
- dst_xfrm(skb_dst(skb)))
+ }
+ if (is_udplite || dst_xfrm(skb_dst(skb))) {
+ kfree_skb(skb);
return -EIO;
+ }
+
+ if (datalen > cork->gso_size) {
+ skb_shinfo(skb)->gso_size = cork->gso_size;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
+ cork->gso_size);
- skb_shinfo(skb)->gso_size = cork->gso_size;
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
- skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(uh),
- cork->gso_size);
- goto csum_partial;
+ /* Don't checksum the payload, skb will get segmented */
+ goto csum_partial;
+ }
}
if (is_udplite) /* UDP-Lite */
@@ -828,7 +1194,8 @@ csum_partial:
send:
err = ip_send_skb(sock_net(sk), skb);
if (err) {
- if (err == -ENOBUFS && !inet->recverr) {
+ if (err == -ENOBUFS &&
+ !inet_test_bit(RECVERR, sk)) {
UDP_INC_STATS(sock_net(sk),
UDP_MIB_SNDBUFERRORS, is_udplite);
err = 0;
@@ -858,10 +1225,10 @@ int udp_push_pending_frames(struct sock *sk)
out:
up->len = 0;
- up->pending = 0;
+ WRITE_ONCE(up->pending, 0);
return err;
}
-EXPORT_SYMBOL(udp_push_pending_frames);
+EXPORT_IPV6_MOD(udp_push_pending_frames);
static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size)
{
@@ -898,7 +1265,7 @@ int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size)
return need_ip;
}
-EXPORT_SYMBOL_GPL(udp_cmsg_send);
+EXPORT_IPV6_MOD_GPL(udp_cmsg_send);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
@@ -913,13 +1280,14 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int free = 0;
int connected = 0;
__be32 daddr, faddr, saddr;
+ u8 scope;
__be16 dport;
- u8 tos;
int err, is_udplite = IS_UDPLITE(sk);
- int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
+ int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
struct sk_buff *skb;
struct ip_options_data opt_copy;
+ int uc_index;
if (len > 0xFFFF)
return -EMSGSIZE;
@@ -934,7 +1302,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
fl4 = &inet->cork.fl.u.ip4;
- if (up->pending) {
+ if (READ_ONCE(up->pending)) {
/*
* There are pending frames.
* The socket lock must be held while it's corked.
@@ -978,20 +1346,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
ipcm_init_sk(&ipc, inet);
- ipc.gso_size = up->gso_size;
+ ipc.gso_size = READ_ONCE(up->gso_size);
if (msg->msg_controllen) {
err = udp_cmsg_send(sk, msg, &ipc.gso_size);
- if (err > 0)
+ if (err > 0) {
err = ip_cmsg_send(sk, msg, &ipc,
sk->sk_family == AF_INET6);
+ connected = 0;
+ }
if (unlikely(err < 0)) {
kfree(ipc.opt);
return err;
}
if (ipc.opt)
free = 1;
- connected = 0;
}
if (!ipc.opt) {
struct ip_options_rcu *inet_opt;
@@ -1006,9 +1375,11 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_unlock();
}
- if (cgroup_bpf_enabled && !connected) {
+ if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) {
err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
- (struct sockaddr *)usin, &ipc.addr);
+ (struct sockaddr *)usin,
+ &msg->msg_namelen,
+ &ipc.addr);
if (err)
goto out_free;
if (usin) {
@@ -1033,38 +1404,35 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
faddr = ipc.opt->opt.faddr;
connected = 0;
}
- tos = get_rttos(&ipc, inet);
- if (sock_flag(sk, SOCK_LOCALROUTE) ||
- (msg->msg_flags & MSG_DONTROUTE) ||
- (ipc.opt && ipc.opt->opt.is_strictroute)) {
- tos |= RTO_ONLINK;
+ scope = ip_sendmsg_scope(inet, &ipc, msg);
+ if (scope == RT_SCOPE_LINK)
connected = 0;
- }
+ uc_index = READ_ONCE(inet->uc_index);
if (ipv4_is_multicast(daddr)) {
- if (!ipc.oif)
- ipc.oif = inet->mc_index;
+ if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif))
+ ipc.oif = READ_ONCE(inet->mc_index);
if (!saddr)
- saddr = inet->mc_addr;
+ saddr = READ_ONCE(inet->mc_addr);
connected = 0;
} else if (!ipc.oif) {
- ipc.oif = inet->uc_index;
- } else if (ipv4_is_lbcast(daddr) && inet->uc_index) {
+ ipc.oif = uc_index;
+ } else if (ipv4_is_lbcast(daddr) && uc_index) {
/* oif is set, packet is to local broadcast and
- * and uc_index is set. oif is most likely set
+ * uc_index is set. oif is most likely set
* by sk_bound_dev_if. If uc_index != oif check if the
* oif is an L3 master and uc_index is an L3 slave.
* If so, we want to allow the send using the uc_index.
*/
- if (ipc.oif != inet->uc_index &&
+ if (ipc.oif != uc_index &&
ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk),
- inet->uc_index)) {
- ipc.oif = inet->uc_index;
+ uc_index)) {
+ ipc.oif = uc_index;
}
}
if (connected)
- rt = (struct rtable *)sk_dst_check(sk, 0);
+ rt = dst_rtable(sk_dst_check(sk, 0));
if (!rt) {
struct net *net = sock_net(sk);
@@ -1072,13 +1440,13 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl4 = &fl4_stack;
- flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
- RT_SCOPE_UNIVERSE, sk->sk_protocol,
- flow_flags,
- faddr, saddr, dport, inet->inet_sport,
- sk->sk_uid);
+ flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark,
+ ipc.tos & INET_DSCP_MASK, scope,
+ sk->sk_protocol, flow_flags, faddr, saddr,
+ dport, inet->inet_sport,
+ sk_uid(sk));
- security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
+ security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
@@ -1135,7 +1503,7 @@ back_from_confirm:
fl4->saddr = saddr;
fl4->fl4_dport = dport;
fl4->fl4_sport = inet->inet_sport;
- up->pending = AF_INET;
+ WRITE_ONCE(up->pending, AF_INET);
do_append_data:
up->len += ulen;
@@ -1147,7 +1515,7 @@ do_append_data:
else if (!corkreq)
err = udp_push_pending_frames(sk);
else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
- up->pending = 0;
+ WRITE_ONCE(up->pending, 0);
release_sock(sk);
out:
@@ -1180,61 +1548,44 @@ do_confirm:
}
EXPORT_SYMBOL(udp_sendmsg);
-int udp_sendpage(struct sock *sk, struct page *page, int offset,
- size_t size, int flags)
+void udp_splice_eof(struct socket *sock)
{
- struct inet_sock *inet = inet_sk(sk);
+ struct sock *sk = sock->sk;
struct udp_sock *up = udp_sk(sk);
- int ret;
- if (flags & MSG_SENDPAGE_NOTLAST)
- flags |= MSG_MORE;
-
- if (!up->pending) {
- struct msghdr msg = { .msg_flags = flags|MSG_MORE };
-
- /* Call udp_sendmsg to specify destination address which
- * sendpage interface can't pass.
- * This will succeed only when the socket is connected.
- */
- ret = udp_sendmsg(sk, &msg, 0);
- if (ret < 0)
- return ret;
- }
+ if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk))
+ return;
lock_sock(sk);
+ if (up->pending && !udp_test_bit(CORK, sk))
+ udp_push_pending_frames(sk);
+ release_sock(sk);
+}
+EXPORT_IPV6_MOD_GPL(udp_splice_eof);
- if (unlikely(!up->pending)) {
- release_sock(sk);
+#define UDP_SKB_IS_STATELESS 0x80000000
- net_dbg_ratelimited("cork failed\n");
- return -EINVAL;
- }
+/* all head states (dst, sk, nf conntrack) except skb extensions are
+ * cleared by udp_rcv().
+ *
+ * We need to preserve secpath, if present, to eventually process
+ * IP_CMSG_PASSSEC at recvmsg() time.
+ *
+ * Other extensions can be cleared.
+ */
+static bool udp_try_make_stateless(struct sk_buff *skb)
+{
+ if (!skb_has_extensions(skb))
+ return true;
- ret = ip_append_page(sk, &inet->cork.fl.u.ip4,
- page, offset, size, flags);
- if (ret == -EOPNOTSUPP) {
- release_sock(sk);
- return sock_no_sendpage(sk->sk_socket, page, offset,
- size, flags);
- }
- if (ret < 0) {
- udp_flush_pending_frames(sk);
- goto out;
+ if (!secpath_exists(skb)) {
+ skb_ext_reset(skb);
+ return true;
}
- up->len += size;
- if (!(up->corkflag || (flags&MSG_MORE)))
- ret = udp_push_pending_frames(sk);
- if (!ret)
- ret = size;
-out:
- release_sock(sk);
- return ret;
+ return false;
}
-#define UDP_SKB_IS_STATELESS 0x80000000
-
static void udp_set_dev_scratch(struct sk_buff *skb)
{
struct udp_dev_scratch *scratch = udp_skb_scratch(skb);
@@ -1246,14 +1597,24 @@ static void udp_set_dev_scratch(struct sk_buff *skb)
scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
scratch->is_linear = !skb_is_nonlinear(skb);
#endif
- /* all head states execept sp (dst, sk, nf) are always cleared by
- * udp_rcv() and we need to preserve secpath, if present, to eventually
- * process IP_CMSG_PASSSEC at recvmsg() time
- */
- if (likely(!skb_sec_path(skb)))
+ if (udp_try_make_stateless(skb))
scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
}
+static void udp_skb_csum_unnecessary_set(struct sk_buff *skb)
+{
+ /* We come here after udp_lib_checksum_complete() returned 0.
+ * This means that __skb_checksum_complete() might have
+ * set skb->csum_valid to 1.
+ * On 64bit platforms, we can set csum_unnecessary
+ * to true, but only if the skb is not shared.
+ */
+#if BITS_PER_LONG == 64
+ if (!skb_shared(skb))
+ udp_skb_scratch(skb)->csum_unnecessary = true;
+#endif
+}
+
static int udp_skb_truesize(struct sk_buff *skb)
{
return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS;
@@ -1265,17 +1626,18 @@ static bool udp_skb_has_head_state(struct sk_buff *skb)
}
/* fully reclaim rmem/fwd memory allocated for skb */
-static void udp_rmem_release(struct sock *sk, int size, int partial,
- bool rx_queue_lock_held)
+static void udp_rmem_release(struct sock *sk, unsigned int size,
+ int partial, bool rx_queue_lock_held)
{
struct udp_sock *up = udp_sk(sk);
struct sk_buff_head *sk_queue;
- int amt;
+ unsigned int amt;
if (likely(partial)) {
up->forward_deficit += size;
size = up->forward_deficit;
- if (size < (sk->sk_rcvbuf >> 2))
+ if (size < READ_ONCE(up->forward_threshold) &&
+ !skb_queue_empty(&up->reader_queue))
return;
} else {
size += up->forward_deficit;
@@ -1289,13 +1651,11 @@ static void udp_rmem_release(struct sock *sk, int size, int partial,
if (!rx_queue_lock_held)
spin_lock(&sk_queue->lock);
-
- sk->sk_forward_alloc += size;
- amt = (sk->sk_forward_alloc - partial) & ~(SK_MEM_QUANTUM - 1);
- sk->sk_forward_alloc -= amt;
+ amt = (size + sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
+ sk_forward_alloc_add(sk, size - amt);
if (amt)
- __sk_mem_reduce_allocated(sk, amt >> SK_MEM_QUANTUM_SHIFT);
+ __sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT);
atomic_sub(size, &sk->sk_rmem_alloc);
@@ -1316,7 +1676,7 @@ void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
prefetch(&skb->data);
udp_rmem_release(sk, udp_skb_truesize(skb), 1, false);
}
-EXPORT_SYMBOL(udp_skb_destructor);
+EXPORT_IPV6_MOD(udp_skb_destructor);
/* as above, but the caller held the rx queue lock, too */
static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
@@ -1325,44 +1685,49 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
}
-/* Idea of busylocks is to let producers grab an extra spinlock
- * to relieve pressure on the receive_queue spinlock shared by consumer.
- * Under flood, this means that only one producer can be in line
- * trying to acquire the receive_queue spinlock.
- * These busylock can be allocated on a per cpu manner, instead of a
- * per socket one (that would consume a cache line per socket)
- */
-static int udp_busylocks_log __read_mostly;
-static spinlock_t *udp_busylocks __read_mostly;
-
-static spinlock_t *busylock_acquire(void *ptr)
+static int udp_rmem_schedule(struct sock *sk, int size)
{
- spinlock_t *busy;
+ int delta;
- busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log);
- spin_lock(busy);
- return busy;
-}
+ delta = size - sk->sk_forward_alloc;
+ if (delta > 0 && !__sk_mem_schedule(sk, delta, SK_MEM_RECV))
+ return -ENOBUFS;
-static void busylock_release(spinlock_t *busy)
-{
- if (busy)
- spin_unlock(busy);
+ return 0;
}
int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
{
struct sk_buff_head *list = &sk->sk_receive_queue;
- int rmem, delta, amt, err = -ENOMEM;
- spinlock_t *busy = NULL;
- int size;
+ struct udp_prod_queue *udp_prod_queue;
+ struct sk_buff *next, *to_drop = NULL;
+ struct llist_node *ll_list;
+ unsigned int rmem, rcvbuf;
+ int size, err = -ENOMEM;
+ int total_size = 0;
+ int q_size = 0;
+ int dropcount;
+ int nb = 0;
- /* try to avoid the costly atomic add/sub pair when the receive
- * queue is full; always allow at least a packet
- */
rmem = atomic_read(&sk->sk_rmem_alloc);
- if (rmem > sk->sk_rcvbuf)
- goto drop;
+ rcvbuf = READ_ONCE(sk->sk_rcvbuf);
+ size = skb->truesize;
+
+ udp_prod_queue = &udp_sk(sk)->udp_prod_queue[numa_node_id()];
+
+ rmem += atomic_read(&udp_prod_queue->rmem_alloc);
+
+ /* Immediately drop when the receive queue is full.
+ * Cast to unsigned int performs the boundary check for INT_MAX.
+ */
+ if (rmem + size > rcvbuf) {
+ if (rcvbuf > INT_MAX >> 1)
+ goto drop;
+
+ /* Accept the packet if queue is empty. */
+ if (rmem)
+ goto drop;
+ }
/* Under mem pressure, it might be helpful to help udp_recvmsg()
* having linear skbs :
@@ -1370,61 +1735,85 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
* - Less cache line misses at copyout() time
* - Less work at consume_skb() (less alien page frag freeing)
*/
- if (rmem > (sk->sk_rcvbuf >> 1)) {
+ if (rmem > (rcvbuf >> 1)) {
skb_condense(skb);
-
- busy = busylock_acquire(sk);
+ size = skb->truesize;
}
- size = skb->truesize;
+
udp_set_dev_scratch(skb);
- /* we drop only if the receive buf is full and the receive
- * queue contains some other skb
- */
- rmem = atomic_add_return(size, &sk->sk_rmem_alloc);
- if (rmem > (size + sk->sk_rcvbuf))
- goto uncharge_drop;
+ atomic_add(size, &udp_prod_queue->rmem_alloc);
+
+ if (!llist_add(&skb->ll_node, &udp_prod_queue->ll_root))
+ return 0;
+
+ dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? sk_drops_read(sk) : 0;
spin_lock(&list->lock);
- if (size >= sk->sk_forward_alloc) {
- amt = sk_mem_pages(size);
- delta = amt << SK_MEM_QUANTUM_SHIFT;
- if (!__sk_mem_raise_allocated(sk, delta, amt, SK_MEM_RECV)) {
- err = -ENOBUFS;
- spin_unlock(&list->lock);
- goto uncharge_drop;
+
+ ll_list = llist_del_all(&udp_prod_queue->ll_root);
+
+ ll_list = llist_reverse_order(ll_list);
+
+ llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
+ size = udp_skb_truesize(skb);
+ total_size += size;
+ err = udp_rmem_schedule(sk, size);
+ if (unlikely(err)) {
+ /* Free the skbs outside of locked section. */
+ skb->next = to_drop;
+ to_drop = skb;
+ continue;
}
- sk->sk_forward_alloc += delta;
- }
+ q_size += size;
+ sk_forward_alloc_add(sk, -size);
- sk->sk_forward_alloc -= size;
+ /* no need to setup a destructor, we will explicitly release the
+ * forward allocated memory on dequeue
+ */
+ SOCK_SKB_CB(skb)->dropcount = dropcount;
+ nb++;
+ __skb_queue_tail(list, skb);
+ }
- /* no need to setup a destructor, we will explicitly release the
- * forward allocated memory on dequeue
- */
- sock_skb_set_dropcount(sk, skb);
+ atomic_add(q_size, &sk->sk_rmem_alloc);
- __skb_queue_tail(list, skb);
spin_unlock(&list->lock);
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk);
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ /* Multiple threads might be blocked in recvmsg(),
+ * using prepare_to_wait_exclusive().
+ */
+ while (nb) {
+ INDIRECT_CALL_1(sk->sk_data_ready,
+ sock_def_readable, sk);
+ nb--;
+ }
+ }
- busylock_release(busy);
- return 0;
+ if (unlikely(to_drop)) {
+ for (nb = 0; to_drop != NULL; nb++) {
+ skb = to_drop;
+ to_drop = skb->next;
+ skb_mark_not_on_list(skb);
+ /* TODO: update SNMP values. */
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM);
+ }
+ numa_drop_add(&udp_sk(sk)->drop_counters, nb);
+ }
+
+ atomic_sub(total_size, &udp_prod_queue->rmem_alloc);
-uncharge_drop:
- atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+ return 0;
drop:
- atomic_inc(&sk->sk_drops);
- busylock_release(busy);
+ udp_drops_inc(sk);
return err;
}
-EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb);
+EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb);
-void udp_destruct_sock(struct sock *sk)
+void udp_destruct_common(struct sock *sk)
{
/* reclaim completely the forward allocated memory */
struct udp_sock *up = udp_sk(sk);
@@ -1437,26 +1826,33 @@ void udp_destruct_sock(struct sock *sk)
kfree_skb(skb);
}
udp_rmem_release(sk, total, 0, true);
+ kfree(up->udp_prod_queue);
+}
+EXPORT_IPV6_MOD_GPL(udp_destruct_common);
+static void udp_destruct_sock(struct sock *sk)
+{
+ udp_destruct_common(sk);
inet_sock_destruct(sk);
}
-EXPORT_SYMBOL_GPL(udp_destruct_sock);
int udp_init_sock(struct sock *sk)
{
- skb_queue_head_init(&udp_sk(sk)->reader_queue);
+ int res = udp_lib_init_sock(sk);
+
sk->sk_destruct = udp_destruct_sock;
- return 0;
+ set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+ return res;
}
-EXPORT_SYMBOL_GPL(udp_init_sock);
void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
{
- if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) {
- bool slow = lock_sock_fast(sk);
-
+ if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset)))
sk_peek_offset_bwd(sk, len);
- unlock_sock_fast(sk, slow);
+
+ if (!skb_shared(skb)) {
+ skb_attempt_defer_free(skb);
+ return;
}
if (!skb_unref(skb))
@@ -1469,11 +1865,11 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
skb_release_head_state(skb);
__consume_stateless_skb(skb);
}
-EXPORT_SYMBOL_GPL(skb_consume_udp);
+EXPORT_IPV6_MOD_GPL(skb_consume_udp);
static struct sk_buff *__first_packet_length(struct sock *sk,
struct sk_buff_head *rcvq,
- int *total)
+ unsigned int *total)
{
struct sk_buff *skb;
@@ -1483,15 +1879,12 @@ static struct sk_buff *__first_packet_length(struct sock *sk,
IS_UDPLITE(sk));
__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
IS_UDPLITE(sk));
- atomic_inc(&sk->sk_drops);
+ udp_drops_inc(sk);
__skb_unlink(skb, rcvq);
*total += skb->truesize;
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
} else {
- /* the csum related bits could be changed, refresh
- * the scratch area
- */
- udp_set_dev_scratch(skb);
+ udp_skb_csum_unnecessary_set(skb);
break;
}
}
@@ -1509,13 +1902,13 @@ static int first_packet_length(struct sock *sk)
{
struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue;
struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
+ unsigned int total = 0;
struct sk_buff *skb;
- int total = 0;
int res;
spin_lock_bh(&rcvq->lock);
skb = __first_packet_length(sk, rcvq, &total);
- if (!skb && !skb_queue_empty(sk_queue)) {
+ if (!skb && !skb_queue_empty_lockless(sk_queue)) {
spin_lock(&sk_queue->lock);
skb_queue_splice_tail_init(sk_queue, rcvq);
spin_unlock(&sk_queue->lock);
@@ -1533,21 +1926,19 @@ static int first_packet_length(struct sock *sk)
* IOCTL requests applicable to the UDP protocol
*/
-int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+int udp_ioctl(struct sock *sk, int cmd, int *karg)
{
switch (cmd) {
case SIOCOUTQ:
{
- int amount = sk_wmem_alloc_get(sk);
-
- return put_user(amount, (int __user *)arg);
+ *karg = sk_wmem_alloc_get(sk);
+ return 0;
}
case SIOCINQ:
{
- int amount = max_t(int, 0, first_packet_length(sk));
-
- return put_user(amount, (int __user *)arg);
+ *karg = max_t(int, 0, first_packet_length(sk));
+ return 0;
}
default:
@@ -1556,10 +1947,10 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return 0;
}
-EXPORT_SYMBOL(udp_ioctl);
+EXPORT_IPV6_MOD(udp_ioctl);
struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
- int noblock, int *peeked, int *off, int *err)
+ int *off, int *err)
{
struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
struct sk_buff_head *queue;
@@ -1568,7 +1959,6 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
int error;
queue = &udp_sk(sk)->reader_queue;
- flags |= noblock ? MSG_DONTWAIT : 0;
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
struct sk_buff *skb;
@@ -1578,19 +1968,18 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
break;
error = -EAGAIN;
- *peeked = 0;
do {
spin_lock_bh(&queue->lock);
- skb = __skb_try_recv_from_queue(sk, queue, flags,
- udp_skb_destructor,
- peeked, off, err,
+ skb = __skb_try_recv_from_queue(queue, flags, off, err,
&last);
if (skb) {
+ if (!(flags & MSG_PEEK))
+ udp_skb_destructor(sk, skb);
spin_unlock_bh(&queue->lock);
return skb;
}
- if (skb_queue_empty(sk_queue)) {
+ if (skb_queue_empty_lockless(sk_queue)) {
spin_unlock_bh(&queue->lock);
goto busy_check;
}
@@ -1603,10 +1992,10 @@ struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags,
spin_lock(&sk_queue->lock);
skb_queue_splice_tail_init(sk_queue, queue);
- skb = __skb_try_recv_from_queue(sk, queue, flags,
- udp_skb_dtor_locked,
- peeked, off, err,
+ skb = __skb_try_recv_from_queue(queue, flags, off, err,
&last);
+ if (skb && !(flags & MSG_PEEK))
+ udp_skb_dtor_locked(sk, skb);
spin_unlock(&sk_queue->lock);
spin_unlock_bh(&queue->lock);
if (skb)
@@ -1617,11 +2006,12 @@ busy_check:
break;
sk_busy_loop(sk, flags & MSG_DONTWAIT);
- } while (!skb_queue_empty(sk_queue));
+ } while (!skb_queue_empty_lockless(sk_queue));
/* sk_queue is empty, reader_queue may contain peeked packets */
} while (timeo &&
- !__skb_wait_for_more_packets(sk, &error, &timeo,
+ !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
+ &error, &timeo,
(struct sk_buff *)sk_queue));
*err = error;
@@ -1629,20 +2019,45 @@ busy_check:
}
EXPORT_SYMBOL(__skb_recv_udp);
+int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
+{
+ struct sk_buff *skb;
+ int err;
+
+try_again:
+ skb = skb_recv_udp(sk, MSG_DONTWAIT, &err);
+ if (!skb)
+ return err;
+
+ if (udp_lib_checksum_complete(skb)) {
+ int is_udplite = IS_UDPLITE(sk);
+ struct net *net = sock_net(sk);
+
+ __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite);
+ __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite);
+ udp_drops_inc(sk);
+ kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
+ goto try_again;
+ }
+
+ WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk));
+ return recv_actor(sk, skb);
+}
+EXPORT_IPV6_MOD(udp_read_skb);
+
/*
* This should be easy, if there is something there we
* return it, otherwise we block.
*/
-int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
- int flags, int *addr_len)
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct sk_buff *skb;
unsigned int ulen, copied;
- int peeked, peeking, off;
- int err;
+ int off, err, peeking = flags & MSG_PEEK;
int is_udplite = IS_UDPLITE(sk);
bool checksum_valid = false;
@@ -1650,9 +2065,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
return ip_recv_error(sk, msg, len, addr_len);
try_again:
- peeking = flags & MSG_PEEK;
off = sk_peek_offset(sk, flags);
- skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
+ skb = __skb_recv_udp(sk, flags, &off, &err);
if (!skb)
return err;
@@ -1690,8 +2104,8 @@ try_again:
}
if (unlikely(err)) {
- if (!peeked) {
- atomic_inc(&sk->sk_drops);
+ if (!peeking) {
+ udp_drops_inc(sk);
UDP_INC_STATS(sock_net(sk),
UDP_MIB_INERRORS, is_udplite);
}
@@ -1699,11 +2113,11 @@ try_again:
return err;
}
- if (!peeked)
+ if (!peeking)
UDP_INC_STATS(sock_net(sk),
UDP_MIB_INDATAGRAMS, is_udplite);
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
/* Copy the address. */
if (sin) {
@@ -1712,8 +2126,16 @@ try_again:
sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
*addr_len = sizeof(*sin);
+
+ BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
+ (struct sockaddr *)sin,
+ addr_len);
}
- if (inet->cmsg_flags)
+
+ if (udp_test_bit(GRO_ENABLED, sk))
+ udp_cmsg_recv(msg, sk, skb);
+
+ if (inet_cmsg_flags(inet))
ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
err = copied;
@@ -1729,7 +2151,7 @@ csum_copy_err:
UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
}
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
/* starting over for a new packet, but check if we need to yield */
cond_resched();
@@ -1737,7 +2159,8 @@ csum_copy_err:
goto try_again;
}
-int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int udp_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
{
/* This check is replicated from __ip4_datagram_connect() and
* intended to prevent BPF program called below from accessing bytes
@@ -1746,9 +2169,22 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (addr_len < sizeof(struct sockaddr_in))
return -EINVAL;
- return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr);
+ return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
+}
+EXPORT_IPV6_MOD(udp_pre_connect);
+
+static int udp_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
+{
+ int res;
+
+ lock_sock(sk);
+ res = __ip4_datagram_connect(sk, uaddr, addr_len);
+ if (!res)
+ udp4_hash4(sk);
+ release_sock(sk);
+ return res;
}
-EXPORT_SYMBOL(udp_pre_connect);
int __udp_disconnect(struct sock *sk, int flags)
{
@@ -1762,8 +2198,12 @@ int __udp_disconnect(struct sock *sk, int flags)
inet->inet_dport = 0;
sock_rps_reset_rxhash(sk);
sk->sk_bound_dev_if = 0;
- if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+ if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) {
inet_reset_saddr(sk);
+ if (sk->sk_prot->rehash &&
+ (sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+ sk->sk_prot->rehash(sk);
+ }
if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
sk->sk_prot->unhash(sk);
@@ -1781,14 +2221,15 @@ int udp_disconnect(struct sock *sk, int flags)
release_sock(sk);
return 0;
}
-EXPORT_SYMBOL(udp_disconnect);
+EXPORT_IPV6_MOD(udp_disconnect);
void udp_lib_unhash(struct sock *sk)
{
if (sk_hashed(sk)) {
- struct udp_table *udptable = sk->sk_prot->h.udp_table;
+ struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2;
+ sock_rps_delete_flow(sk);
hslot = udp_hashslot(udptable, sock_net(sk),
udp_sk(sk)->udp_port_hash);
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
@@ -1805,29 +2246,31 @@ void udp_lib_unhash(struct sock *sk)
hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
hslot2->count--;
spin_unlock(&hslot2->lock);
+
+ udp_unhash4(udptable, sk);
}
spin_unlock_bh(&hslot->lock);
}
}
-EXPORT_SYMBOL(udp_lib_unhash);
+EXPORT_IPV6_MOD(udp_lib_unhash);
/*
* inet_rcv_saddr was changed, we must rehash secondary hash
*/
-void udp_lib_rehash(struct sock *sk, u16 newhash)
+void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4)
{
if (sk_hashed(sk)) {
- struct udp_table *udptable = sk->sk_prot->h.udp_table;
+ struct udp_table *udptable = udp_get_table_prot(sk);
struct udp_hslot *hslot, *hslot2, *nhslot2;
+ hslot = udp_hashslot(udptable, sock_net(sk),
+ udp_sk(sk)->udp_port_hash);
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
nhslot2 = udp_hashslot2(udptable, newhash);
udp_sk(sk)->udp_portaddr_hash = newhash;
if (hslot2 != nhslot2 ||
rcu_access_pointer(sk->sk_reuseport_cb)) {
- hslot = udp_hashslot(udptable, sock_net(sk),
- udp_sk(sk)->udp_port_hash);
/* we must lock primary chain too */
spin_lock_bh(&hslot->lock);
if (rcu_access_pointer(sk->sk_reuseport_cb))
@@ -1848,16 +2291,43 @@ void udp_lib_rehash(struct sock *sk, u16 newhash)
spin_unlock_bh(&hslot->lock);
}
+
+ /* Now process hash4 if necessary:
+ * (1) update hslot4;
+ * (2) update hslot2->hash4_cnt.
+ * Note that hslot2/hslot4 should be checked separately, as
+ * either of them may change with the other unchanged.
+ */
+ if (udp_hashed4(sk)) {
+ spin_lock_bh(&hslot->lock);
+
+ udp_rehash4(udptable, sk, newhash4);
+ if (hslot2 != nhslot2) {
+ spin_lock(&hslot2->lock);
+ udp_hash4_dec(hslot2);
+ spin_unlock(&hslot2->lock);
+
+ spin_lock(&nhslot2->lock);
+ udp_hash4_inc(nhslot2);
+ spin_unlock(&nhslot2->lock);
+ }
+
+ spin_unlock_bh(&hslot->lock);
+ }
}
}
-EXPORT_SYMBOL(udp_lib_rehash);
+EXPORT_IPV6_MOD(udp_lib_rehash);
-static void udp_v4_rehash(struct sock *sk)
+void udp_v4_rehash(struct sock *sk)
{
u16 new_hash = ipv4_portaddr_hash(sock_net(sk),
inet_sk(sk)->inet_rcv_saddr,
inet_sk(sk)->inet_num);
- udp_lib_rehash(sk, new_hash);
+ u16 new_hash4 = udp_ehashfn(sock_net(sk),
+ sk->sk_rcv_saddr, sk->sk_num,
+ sk->sk_daddr, sk->sk_dport);
+
+ udp_lib_rehash(sk, new_hash, new_hash4);
}
static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -1875,27 +2345,27 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
+ int drop_reason;
/* Note that an ENOMEM error is charged twice */
- if (rc == -ENOMEM)
+ if (rc == -ENOMEM) {
UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
is_udplite);
+ drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
+ } else {
+ UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS,
+ is_udplite);
+ drop_reason = SKB_DROP_REASON_PROTO_MEM;
+ }
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- kfree_skb(skb);
- trace_udp_fail_queue_rcv_skb(rc, sk);
+ trace_udp_fail_queue_rcv_skb(rc, sk, skb);
+ sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
}
return 0;
}
-static DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
-void udp_encap_enable(void)
-{
- static_branch_enable(&udp_encap_needed_key);
-}
-EXPORT_SYMBOL(udp_encap_enable);
-
/* returns:
* -1: error
* 0: success
@@ -1904,19 +2374,23 @@ EXPORT_SYMBOL(udp_encap_enable);
* Note that in the success and error cases, the skb is assumed to
* have either been requeued or freed.
*/
-static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
/*
* Charge it to the socket, dropping if the queue is full.
*/
- if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
goto drop;
- nf_reset(skb);
+ }
+ nf_reset_ct(skb);
- if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
+ if (static_branch_unlikely(&udp_encap_needed_key) &&
+ READ_ONCE(up->encap_type)) {
int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
/*
@@ -1954,7 +2428,8 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
/*
* UDP-Lite specific tests, ignored on UDP sockets
*/
- if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
+ if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
+ u16 pcrlen = READ_ONCE(up->pcrlen);
/*
* MIB statistics other than incrementing the error count are
@@ -1967,7 +2442,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
* delivery of packets with coverage values less than a value
* provided by the application."
*/
- if (up->pcrlen == 0) { /* full coverage was set */
+ if (pcrlen == 0) { /* full coverage was set */
net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n",
UDP_SKB_CB(skb)->cscov, skb->len);
goto drop;
@@ -1978,9 +2453,9 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
* that it wants x while sender emits packets of smaller size y.
* Therefore the above ...()->partial_cov statement is essential.
*/
- if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
+ if (UDP_SKB_CB(skb)->cscov < pcrlen) {
net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n",
- UDP_SKB_CB(skb)->cscov, up->pcrlen);
+ UDP_SKB_CB(skb)->cscov, pcrlen);
goto drop;
}
}
@@ -1990,23 +2465,46 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
udp_lib_checksum_complete(skb))
goto csum_error;
- if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
+ if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason))
goto drop;
udp_csum_pull_header(skb);
- ipv4_pktinfo_prepare(sk, skb);
+ ipv4_pktinfo_prepare(sk, skb, true);
return __udp_queue_rcv_skb(sk, skb);
csum_error:
+ drop_reason = SKB_DROP_REASON_UDP_CSUM;
__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- atomic_inc(&sk->sk_drops);
- kfree_skb(skb);
+ udp_drops_inc(sk);
+ sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
}
+static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff *next, *segs;
+ int ret;
+
+ if (likely(!udp_unexpected_gso(sk, skb)))
+ return udp_queue_rcv_one_skb(sk, skb);
+
+ BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET);
+ __skb_push(skb, -skb_mac_offset(skb));
+ segs = udp_rcv_segment(sk, skb, true);
+ skb_list_walk_safe(segs, skb, next) {
+ __skb_pull(skb, skb_transport_offset(skb));
+
+ udp_post_segment_fix_csum(skb);
+ ret = udp_queue_rcv_one_skb(sk, skb);
+ if (ret > 0)
+ ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret);
+ }
+ return 0;
+}
+
/* For TCP sockets, sk_rx_dst is protected by socket lock
* For UDP, we use xchg() to guard against concurrent changes.
*/
@@ -2015,13 +2513,13 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
struct dst_entry *old;
if (dst_hold_safe(dst)) {
- old = xchg(&sk->sk_rx_dst, dst);
+ old = unrcu_pointer(xchg(&sk->sk_rx_dst, RCU_INITIALIZER(dst)));
dst_release(old);
return old != dst;
}
return false;
}
-EXPORT_SYMBOL(udp_sk_rx_dst_set);
+EXPORT_IPV6_MOD(udp_sk_rx_dst_set);
/*
* Multicasts and broadcasts go to each listener.
@@ -2049,7 +2547,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
udptable->mask;
hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask;
start_lookup:
- hslot = &udptable->hash2[hash2];
+ hslot = &udptable->hash2[hash2].hslot;
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
}
@@ -2065,7 +2563,7 @@ start_lookup:
nskb = skb_clone(skb, GFP_ATOMIC);
if (unlikely(!nskb)) {
- atomic_inc(&sk->sk_drops);
+ udp_drops_inc(sk);
__UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
IS_UDPLITE(sk));
__UDP_INC_STATS(net, UDP_MIB_INERRORS,
@@ -2095,7 +2593,7 @@ start_lookup:
/* Initialize UDP checksum. If exited with zero value (success),
* CHECKSUM_UNNECESSARY means, that no more checks are required.
- * Otherwise, csum completion requires chacksumming packet body,
+ * Otherwise, csum completion requires checksumming packet body,
* including udp header and folding it to skb->csum.
*/
static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
@@ -2120,11 +2618,27 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
/* Note, we are only interested in != 0 or == 0, thus the
* force to int.
*/
- return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
- inet_compute_pseudo);
+ err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
+ inet_compute_pseudo);
+ if (err)
+ return err;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) {
+ /* If SW calculated the value, we know it's bad */
+ if (skb->csum_complete_sw)
+ return 1;
+
+ /* HW says the value is bad. Let's validate that.
+ * skb->csum is no longer the full packet checksum,
+ * so don't treat it as such.
+ */
+ skb_checksum_complete_unset(skb);
+ }
+
+ return 0;
}
-/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and
+/* wrapper for udp_queue_rcv_skb taking care of csum conversion and
* return code conversion for ip layer consumption
*/
static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
@@ -2133,8 +2647,7 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
int ret;
if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
- skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
- inet_compute_pseudo);
+ skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo);
ret = udp_queue_rcv_skb(sk, skb);
@@ -2153,12 +2666,16 @@ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int proto)
{
- struct sock *sk;
+ struct sock *sk = NULL;
struct udphdr *uh;
unsigned short ulen;
struct rtable *rt = skb_rtable(skb);
__be32 saddr, daddr;
struct net *net = dev_net(skb->dev);
+ bool refcounted;
+ int drop_reason;
+
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
/*
* Validate the packet.
@@ -2184,16 +2701,21 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
if (udp4_csum_init(skb, uh, proto))
goto csum_error;
- sk = skb_steal_sock(skb);
+ sk = inet_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest,
+ &refcounted, udp_ehashfn);
+ if (IS_ERR(sk))
+ goto no_sk;
+
if (sk) {
struct dst_entry *dst = skb_dst(skb);
int ret;
- if (unlikely(sk->sk_rx_dst != dst))
+ if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
udp_sk_rx_dst_set(sk, dst);
ret = udp_unicast_rcv_skb(sk, skb, uh);
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return ret;
}
@@ -2204,15 +2726,16 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
if (sk)
return udp_unicast_rcv_skb(sk, skb, uh);
-
+no_sk:
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
- nf_reset(skb);
+ nf_reset_ct(skb);
/* No socket. Drop packet silently, if checksum is wrong */
if (udp_lib_checksum_complete(skb))
goto csum_error;
+ drop_reason = SKB_DROP_REASON_NO_SOCKET;
__UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
@@ -2220,10 +2743,11 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
* Hmm. We got an UDP packet to a port to which we
* don't wanna listen. Ignore it.
*/
- kfree_skb(skb);
+ sk_skb_reason_drop(sk, skb, drop_reason);
return 0;
short_packet:
+ drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n",
proto == IPPROTO_UDPLITE ? "Lite" : "",
&saddr, ntohs(uh->source),
@@ -2236,6 +2760,7 @@ csum_error:
* RFC1122: OK. Discards the bad packet silently (as far as
* the network is concerned, anyway) as per 4.1.3.4 (MUST).
*/
+ drop_reason = SKB_DROP_REASON_UDP_CSUM;
net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n",
proto == IPPROTO_UDPLITE ? "Lite" : "",
&saddr, ntohs(uh->source), &daddr, ntohs(uh->dest),
@@ -2243,7 +2768,7 @@ csum_error:
__UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
drop:
__UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
- kfree_skb(skb);
+ sk_skb_reason_drop(sk, skb, drop_reason);
return 0;
}
@@ -2255,10 +2780,14 @@ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net,
__be16 rmt_port, __be32 rmt_addr,
int dif, int sdif)
{
- struct sock *sk, *result;
+ struct udp_table *udptable = net->ipv4.udp_table;
unsigned short hnum = ntohs(loc_port);
- unsigned int slot = udp_hashfn(net, hnum, udp_table.mask);
- struct udp_hslot *hslot = &udp_table.hash[slot];
+ struct sock *sk, *result;
+ struct udp_hslot *hslot;
+ unsigned int slot;
+
+ slot = udp_hashfn(net, hnum, udptable->mask);
+ hslot = &udptable->hash[slot];
/* Do not bother scanning a too big list */
if (hslot->count > 10)
@@ -2286,17 +2815,20 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
__be16 rmt_port, __be32 rmt_addr,
int dif, int sdif)
{
- unsigned short hnum = ntohs(loc_port);
- unsigned int hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
- unsigned int slot2 = hash2 & udp_table.mask;
- struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
+ struct udp_table *udptable = net->ipv4.udp_table;
INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr);
- const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
+ unsigned short hnum = ntohs(loc_port);
+ struct udp_hslot *hslot2;
+ unsigned int hash2;
+ __portpair ports;
struct sock *sk;
+ hash2 = ipv4_portaddr_hash(net, loc_addr, hnum);
+ hslot2 = udp_hashslot2(udptable, hash2);
+ ports = INET_COMBINED_PORTS(rmt_port, hnum);
+
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
- if (INET_MATCH(sk, net, acookie, rmt_addr,
- loc_addr, ports, dif, sdif))
+ if (inet_match(net, sk, acookie, ports, dif, sdif))
return sk;
/* Only check first socket in chain */
break;
@@ -2304,7 +2836,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
return NULL;
}
-int udp_v4_early_demux(struct sk_buff *skb)
+enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
struct in_device *in_dev = NULL;
@@ -2318,7 +2850,7 @@ int udp_v4_early_demux(struct sk_buff *skb)
/* validate the packet */
if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
- return 0;
+ return SKB_NOT_DROPPED_YET;
iph = ip_hdr(skb);
uh = udp_hdr(skb);
@@ -2327,12 +2859,12 @@ int udp_v4_early_demux(struct sk_buff *skb)
in_dev = __in_dev_get_rcu(skb->dev);
if (!in_dev)
- return 0;
+ return SKB_NOT_DROPPED_YET;
ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
iph->protocol);
if (!ours)
- return 0;
+ return SKB_NOT_DROPPED_YET;
sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
uh->source, iph->saddr,
@@ -2342,12 +2874,13 @@ int udp_v4_early_demux(struct sk_buff *skb)
uh->source, iph->saddr, dif, sdif);
}
- if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
- return 0;
+ if (!sk)
+ return SKB_NOT_DROPPED_YET;
skb->sk = sk;
- skb->destructor = sock_efree;
- dst = READ_ONCE(sk->sk_rx_dst);
+ DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk));
+ skb->destructor = sock_pfree;
+ dst = rcu_dereference(sk->sk_rx_dst);
if (dst)
dst = dst_check(dst, 0);
@@ -2365,36 +2898,77 @@ int udp_v4_early_demux(struct sk_buff *skb)
*/
if (!inet_sk(sk)->inet_daddr && in_dev)
return ip_mc_validate_source(skb, iph->daddr,
- iph->saddr, iph->tos,
+ iph->saddr,
+ ip4h_dscp(iph),
skb->dev, in_dev, &itag);
}
- return 0;
+ return SKB_NOT_DROPPED_YET;
}
int udp_rcv(struct sk_buff *skb)
{
- return __udp4_lib_rcv(skb, &udp_table, IPPROTO_UDP);
+ return __udp4_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP);
}
void udp_destroy_sock(struct sock *sk)
{
struct udp_sock *up = udp_sk(sk);
bool slow = lock_sock_fast(sk);
+
+ /* protects from races with udp_abort() */
+ sock_set_flag(sk, SOCK_DEAD);
udp_flush_pending_frames(sk);
unlock_sock_fast(sk, slow);
- if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
- void (*encap_destroy)(struct sock *sk);
- encap_destroy = READ_ONCE(up->encap_destroy);
- if (encap_destroy)
- encap_destroy(sk);
+ if (static_branch_unlikely(&udp_encap_needed_key)) {
+ if (up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = READ_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
+ if (udp_test_bit(ENCAP_ENABLED, sk)) {
+ static_branch_dec(&udp_encap_needed_key);
+ udp_tunnel_cleanup_gro(sk);
+ }
}
}
+typedef struct sk_buff *(*udp_gro_receive_t)(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb);
+
+static void set_xfrm_gro_udp_encap_rcv(__u16 encap_type, unsigned short family,
+ struct sock *sk)
+{
+#ifdef CONFIG_XFRM
+ udp_gro_receive_t new_gro_receive;
+
+ if (udp_test_bit(GRO_ENABLED, sk) && encap_type == UDP_ENCAP_ESPINUDP) {
+ if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6)
+ new_gro_receive = ipv6_stub->xfrm6_gro_udp_encap_rcv;
+ else
+ new_gro_receive = xfrm4_gro_udp_encap_rcv;
+
+ if (udp_sk(sk)->gro_receive != new_gro_receive) {
+ /*
+ * With IPV6_ADDRFORM the gro callback could change
+ * after being set, unregister the old one, if valid.
+ */
+ if (udp_sk(sk)->gro_receive)
+ udp_tunnel_update_gro_rcv(sk, false);
+
+ WRITE_ONCE(udp_sk(sk)->gro_receive, new_gro_receive);
+ udp_tunnel_update_gro_rcv(sk, true);
+ }
+ }
+#endif
+}
+
/*
* Socket option code for UDP
*/
int udp_lib_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen,
+ sockptr_t optval, unsigned int optlen,
int (*push_pending_frames)(struct sock *))
{
struct udp_sock *up = udp_sk(sk);
@@ -2402,10 +2976,22 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
int err = 0;
int is_udplite = IS_UDPLITE(sk);
+ if (level == SOL_SOCKET) {
+ err = sk_setsockopt(sk, level, optname, optval, optlen);
+
+ if (optname == SO_RCVBUF || optname == SO_RCVBUFFORCE) {
+ sockopt_lock_sock(sk);
+ /* paired with READ_ONCE in udp_rmem_release() */
+ WRITE_ONCE(up->forward_threshold, sk->sk_rcvbuf >> 2);
+ sockopt_release_sock(sk);
+ }
+ return err;
+ }
+
if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(val, (int __user *)optval))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
valbool = val ? 1 : 0;
@@ -2413,9 +2999,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
switch (optname) {
case UDP_CORK:
if (val != 0) {
- up->corkflag = 1;
+ udp_set_bit(CORK, sk);
} else {
- up->corkflag = 0;
+ udp_clear_bit(CORK, sk);
lock_sock(sk);
push_pending_frames(sk);
release_sock(sk);
@@ -2423,34 +3009,56 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
break;
case UDP_ENCAP:
+ sockopt_lock_sock(sk);
switch (val) {
case 0:
+#ifdef CONFIG_XFRM
case UDP_ENCAP_ESPINUDP:
- case UDP_ENCAP_ESPINUDP_NON_IKE:
- up->encap_rcv = xfrm4_udp_encap_rcv;
- /* FALLTHROUGH */
+ set_xfrm_gro_udp_encap_rcv(val, sk->sk_family, sk);
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ WRITE_ONCE(up->encap_rcv,
+ ipv6_stub->xfrm6_udp_encap_rcv);
+ else
+#endif
+ WRITE_ONCE(up->encap_rcv,
+ xfrm4_udp_encap_rcv);
+#endif
+ fallthrough;
case UDP_ENCAP_L2TPINUDP:
- up->encap_type = val;
- udp_encap_enable();
+ WRITE_ONCE(up->encap_type, val);
+ udp_tunnel_encap_enable(sk);
break;
default:
err = -ENOPROTOOPT;
break;
}
+ sockopt_release_sock(sk);
break;
case UDP_NO_CHECK6_TX:
- up->no_check6_tx = valbool;
+ udp_set_no_check6_tx(sk, valbool);
break;
case UDP_NO_CHECK6_RX:
- up->no_check6_rx = valbool;
+ udp_set_no_check6_rx(sk, valbool);
break;
case UDP_SEGMENT:
if (val < 0 || val > USHRT_MAX)
return -EINVAL;
- up->gso_size = val;
+ WRITE_ONCE(up->gso_size, val);
+ break;
+
+ case UDP_GRO:
+ sockopt_lock_sock(sk);
+ /* when enabling GRO, accept the related GSO packet type */
+ if (valbool)
+ udp_tunnel_encap_enable(sk);
+ udp_assign_bit(GRO_ENABLED, sk, valbool);
+ udp_assign_bit(ACCEPT_L4, sk, valbool);
+ set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk);
+ sockopt_release_sock(sk);
break;
/*
@@ -2465,8 +3073,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
val = 8;
else if (val > USHRT_MAX)
val = USHRT_MAX;
- up->pcslen = val;
- up->pcflag |= UDPLITE_SEND_CC;
+ WRITE_ONCE(up->pcslen, val);
+ udp_set_bit(UDPLITE_SEND_CC, sk);
break;
/* The receiver specifies a minimum checksum coverage value. To make
@@ -2479,8 +3087,8 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
val = 8;
else if (val > USHRT_MAX)
val = USHRT_MAX;
- up->pcrlen = val;
- up->pcflag |= UDPLITE_RECV_CC;
+ WRITE_ONCE(up->pcrlen, val);
+ udp_set_bit(UDPLITE_RECV_CC, sk);
break;
default:
@@ -2490,28 +3098,18 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
return err;
}
-EXPORT_SYMBOL(udp_lib_setsockopt);
+EXPORT_IPV6_MOD(udp_lib_setsockopt);
-int udp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
+int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen)
{
- if (level == SOL_UDP || level == SOL_UDPLITE)
- return udp_lib_setsockopt(sk, level, optname, optval, optlen,
+ if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET)
+ return udp_lib_setsockopt(sk, level, optname,
+ optval, optlen,
udp_push_pending_frames);
return ip_setsockopt(sk, level, optname, optval, optlen);
}
-#ifdef CONFIG_COMPAT
-int compat_udp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- if (level == SOL_UDP || level == SOL_UDPLITE)
- return udp_lib_setsockopt(sk, level, optname, optval, optlen,
- udp_push_pending_frames);
- return compat_ip_setsockopt(sk, level, optname, optval, optlen);
-}
-#endif
-
int udp_lib_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -2521,40 +3119,44 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
if (get_user(len, optlen))
return -EFAULT;
- len = min_t(unsigned int, len, sizeof(int));
-
if (len < 0)
return -EINVAL;
+ len = min_t(unsigned int, len, sizeof(int));
+
switch (optname) {
case UDP_CORK:
- val = up->corkflag;
+ val = udp_test_bit(CORK, sk);
break;
case UDP_ENCAP:
- val = up->encap_type;
+ val = READ_ONCE(up->encap_type);
break;
case UDP_NO_CHECK6_TX:
- val = up->no_check6_tx;
+ val = udp_get_no_check6_tx(sk);
break;
case UDP_NO_CHECK6_RX:
- val = up->no_check6_rx;
+ val = udp_get_no_check6_rx(sk);
break;
case UDP_SEGMENT:
- val = up->gso_size;
+ val = READ_ONCE(up->gso_size);
+ break;
+
+ case UDP_GRO:
+ val = udp_test_bit(GRO_ENABLED, sk);
break;
/* The following two cannot be changed on UDP sockets, the return is
* always 0 (which corresponds to the full checksum coverage of UDP). */
case UDPLITE_SEND_CSCOV:
- val = up->pcslen;
+ val = READ_ONCE(up->pcslen);
break;
case UDPLITE_RECV_CSCOV:
- val = up->pcrlen;
+ val = READ_ONCE(up->pcrlen);
break;
default:
@@ -2567,7 +3169,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
return 0;
}
-EXPORT_SYMBOL(udp_lib_getsockopt);
+EXPORT_IPV6_MOD(udp_lib_getsockopt);
int udp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
@@ -2577,20 +3179,11 @@ int udp_getsockopt(struct sock *sk, int level, int optname,
return ip_getsockopt(sk, level, optname, optval, optlen);
}
-#ifdef CONFIG_COMPAT
-int compat_udp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- if (level == SOL_UDP || level == SOL_UDPLITE)
- return udp_lib_getsockopt(sk, level, optname, optval, optlen);
- return compat_ip_getsockopt(sk, level, optname, optval, optlen);
-}
-#endif
/**
* udp_poll - wait for a UDP event.
- * @file - file struct
- * @sock - socket
- * @wait - poll table
+ * @file: - file struct
+ * @sock: - socket
+ * @wait: - poll table
*
* This is same as datagram poll, except for the special case of
* blocking sockets. If application is using a blocking fd
@@ -2604,7 +3197,7 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
__poll_t mask = datagram_poll(file, sock, wait);
struct sock *sk = sock->sk;
- if (!skb_queue_empty(&udp_sk(sk)->reader_queue))
+ if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
mask |= EPOLLIN | EPOLLRDNORM;
/* Check for false positives due to checksum errors */
@@ -2612,31 +3205,43 @@ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
!(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
mask &= ~(EPOLLIN | EPOLLRDNORM);
+ /* psock ingress_msg queue should not contain any bad checksum frames */
+ if (sk_is_readable(sk))
+ mask |= EPOLLIN | EPOLLRDNORM;
return mask;
}
-EXPORT_SYMBOL(udp_poll);
+EXPORT_IPV6_MOD(udp_poll);
int udp_abort(struct sock *sk, int err)
{
- lock_sock(sk);
+ if (!has_current_bpf_ctx())
+ lock_sock(sk);
+
+ /* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing
+ * with close()
+ */
+ if (sock_flag(sk, SOCK_DEAD))
+ goto out;
sk->sk_err = err;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
__udp_disconnect(sk, 0);
- release_sock(sk);
+out:
+ if (!has_current_bpf_ctx())
+ release_sock(sk);
return 0;
}
-EXPORT_SYMBOL_GPL(udp_abort);
+EXPORT_IPV6_MOD_GPL(udp_abort);
struct proto udp_prot = {
.name = "UDP",
.owner = THIS_MODULE,
.close = udp_lib_close,
.pre_connect = udp_pre_connect,
- .connect = ip4_datagram_connect,
+ .connect = udp_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
.init = udp_init_sock,
@@ -2645,22 +3250,24 @@ struct proto udp_prot = {
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
- .sendpage = udp_sendpage,
+ .splice_eof = udp_splice_eof,
.release_cb = ip4_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.rehash = udp_v4_rehash,
.get_port = udp_v4_get_port,
- .memory_allocated = &udp_memory_allocated,
+ .put_port = udp_lib_unhash,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = udp_bpf_update_proto,
+#endif
+ .memory_allocated = &net_aligned_data.udp_memory_allocated,
+ .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,
+
.sysctl_mem = sysctl_udp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
.obj_size = sizeof(struct udp_sock),
- .h.udp_table = &udp_table,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_udp_setsockopt,
- .compat_getsockopt = compat_udp_getsockopt,
-#endif
+ .h.udp_table = NULL,
.diag_destroy = udp_abort,
};
EXPORT_SYMBOL(udp_prot);
@@ -2668,25 +3275,52 @@ EXPORT_SYMBOL(udp_prot);
/* ------------------------------------------------------------------------ */
#ifdef CONFIG_PROC_FS
+static unsigned short seq_file_family(const struct seq_file *seq);
+static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
+{
+ unsigned short family = seq_file_family(seq);
+
+ /* AF_UNSPEC is used as a match all */
+ return ((family == AF_UNSPEC || family == sk->sk_family) &&
+ net_eq(sock_net(sk), seq_file_net(seq)));
+}
+
+#ifdef CONFIG_BPF_SYSCALL
+static const struct seq_operations bpf_iter_udp_seq_ops;
+#endif
+static struct udp_table *udp_get_table_seq(struct seq_file *seq,
+ struct net *net)
+{
+ const struct udp_seq_afinfo *afinfo;
+
+#ifdef CONFIG_BPF_SYSCALL
+ if (seq->op == &bpf_iter_udp_seq_ops)
+ return net->ipv4.udp_table;
+#endif
+
+ afinfo = pde_data(file_inode(seq->file));
+ return afinfo->udp_table ? : net->ipv4.udp_table;
+}
+
static struct sock *udp_get_first(struct seq_file *seq, int start)
{
- struct sock *sk;
- struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
struct udp_iter_state *state = seq->private;
struct net *net = seq_file_net(seq);
+ struct udp_table *udptable;
+ struct sock *sk;
+
+ udptable = udp_get_table_seq(seq, net);
- for (state->bucket = start; state->bucket <= afinfo->udp_table->mask;
+ for (state->bucket = start; state->bucket <= udptable->mask;
++state->bucket) {
- struct udp_hslot *hslot = &afinfo->udp_table->hash[state->bucket];
+ struct udp_hslot *hslot = &udptable->hash[state->bucket];
if (hlist_empty(&hslot->head))
continue;
spin_lock_bh(&hslot->lock);
sk_for_each(sk, &hslot->head) {
- if (!net_eq(sock_net(sk), net))
- continue;
- if (sk->sk_family == afinfo->family)
+ if (seq_sk_match(seq, sk))
goto found;
}
spin_unlock_bh(&hslot->lock);
@@ -2698,17 +3332,20 @@ found:
static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
{
- struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
struct udp_iter_state *state = seq->private;
struct net *net = seq_file_net(seq);
+ struct udp_table *udptable;
do {
sk = sk_next(sk);
- } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != afinfo->family));
+ } while (sk && !seq_sk_match(seq, sk));
if (!sk) {
- if (state->bucket <= afinfo->udp_table->mask)
- spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);
+ udptable = udp_get_table_seq(seq, net);
+
+ if (state->bucket <= udptable->mask)
+ spin_unlock_bh(&udptable->hash[state->bucket].lock);
+
return udp_get_first(seq, state->bucket + 1);
}
return sk;
@@ -2731,7 +3368,7 @@ void *udp_seq_start(struct seq_file *seq, loff_t *pos)
return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
}
-EXPORT_SYMBOL(udp_seq_start);
+EXPORT_IPV6_MOD(udp_seq_start);
void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
@@ -2745,17 +3382,19 @@ void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
return sk;
}
-EXPORT_SYMBOL(udp_seq_next);
+EXPORT_IPV6_MOD(udp_seq_next);
void udp_seq_stop(struct seq_file *seq, void *v)
{
- struct udp_seq_afinfo *afinfo = PDE_DATA(file_inode(seq->file));
struct udp_iter_state *state = seq->private;
+ struct udp_table *udptable;
+
+ udptable = udp_get_table_seq(seq, seq_file_net(seq));
- if (state->bucket <= afinfo->udp_table->mask)
- spin_unlock_bh(&afinfo->udp_table->hash[state->bucket].lock);
+ if (state->bucket <= udptable->mask)
+ spin_unlock_bh(&udptable->hash[state->bucket].lock);
}
-EXPORT_SYMBOL(udp_seq_stop);
+EXPORT_IPV6_MOD(udp_seq_stop);
/* ------------------------------------------------------------------------ */
static void udp4_format_sock(struct sock *sp, struct seq_file *f,
@@ -2768,22 +3407,22 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
__u16 srcp = ntohs(inet->inet_sport);
seq_printf(f, "%5d: %08X:%04X %08X:%04X"
- " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d",
+ " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u",
bucket, src, srcp, dest, destp, sp->sk_state,
sk_wmem_alloc_get(sp),
udp_rqueue_get(sp),
0, 0L, 0,
- from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)),
+ from_kuid_munged(seq_user_ns(f), sk_uid(sp)),
0, sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
- atomic_read(&sp->sk_drops));
+ sk_drops_read(sp));
}
int udp4_seq_show(struct seq_file *seq, void *v)
{
seq_setwidth(seq, 127);
if (v == SEQ_START_TOKEN)
- seq_puts(seq, " sl local_address rem_address st tx_queue "
+ seq_puts(seq, " sl local_address rem_address st tx_queue "
"rx_queue tr tm->when retrnsmt uid timeout "
"inode ref pointer drops");
else {
@@ -2795,17 +3434,305 @@ int udp4_seq_show(struct seq_file *seq, void *v)
return 0;
}
+#ifdef CONFIG_BPF_SYSCALL
+struct bpf_iter__udp {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct udp_sock *, udp_sk);
+ uid_t uid __aligned(8);
+ int bucket __aligned(8);
+};
+
+union bpf_udp_iter_batch_item {
+ struct sock *sk;
+ __u64 cookie;
+};
+
+struct bpf_udp_iter_state {
+ struct udp_iter_state state;
+ unsigned int cur_sk;
+ unsigned int end_sk;
+ unsigned int max_sk;
+ union bpf_udp_iter_batch_item *batch;
+};
+
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+ unsigned int new_batch_sz, gfp_t flags);
+static struct sock *bpf_iter_udp_resume(struct sock *first_sk,
+ union bpf_udp_iter_batch_item *cookies,
+ int n_cookies)
+{
+ struct sock *sk = NULL;
+ int i;
+
+ for (i = 0; i < n_cookies; i++) {
+ sk = first_sk;
+ udp_portaddr_for_each_entry_from(sk)
+ if (cookies[i].cookie == atomic64_read(&sk->sk_cookie))
+ goto done;
+ }
+done:
+ return sk;
+}
+
+static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
+{
+ struct bpf_udp_iter_state *iter = seq->private;
+ struct udp_iter_state *state = &iter->state;
+ unsigned int find_cookie, end_cookie;
+ struct net *net = seq_file_net(seq);
+ struct udp_table *udptable;
+ unsigned int batch_sks = 0;
+ int resume_bucket;
+ int resizes = 0;
+ struct sock *sk;
+ int err = 0;
+
+ resume_bucket = state->bucket;
+
+ /* The current batch is done, so advance the bucket. */
+ if (iter->cur_sk == iter->end_sk)
+ state->bucket++;
+
+ udptable = udp_get_table_seq(seq, net);
+
+again:
+ /* New batch for the next bucket.
+ * Iterate over the hash table to find a bucket with sockets matching
+ * the iterator attributes, and return the first matching socket from
+ * the bucket. The remaining matched sockets from the bucket are batched
+ * before releasing the bucket lock. This allows BPF programs that are
+ * called in seq_show to acquire the bucket lock if needed.
+ */
+ find_cookie = iter->cur_sk;
+ end_cookie = iter->end_sk;
+ iter->cur_sk = 0;
+ iter->end_sk = 0;
+ batch_sks = 0;
+
+ for (; state->bucket <= udptable->mask; state->bucket++) {
+ struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot;
+
+ if (hlist_empty(&hslot2->head))
+ goto next_bucket;
+
+ spin_lock_bh(&hslot2->lock);
+ sk = hlist_entry_safe(hslot2->head.first, struct sock,
+ __sk_common.skc_portaddr_node);
+ /* Resume from the first (in iteration order) unseen socket from
+ * the last batch that still exists in resume_bucket. Most of
+ * the time this will just be where the last iteration left off
+ * in resume_bucket unless that socket disappeared between
+ * reads.
+ */
+ if (state->bucket == resume_bucket)
+ sk = bpf_iter_udp_resume(sk, &iter->batch[find_cookie],
+ end_cookie - find_cookie);
+fill_batch:
+ udp_portaddr_for_each_entry_from(sk) {
+ if (seq_sk_match(seq, sk)) {
+ if (iter->end_sk < iter->max_sk) {
+ sock_hold(sk);
+ iter->batch[iter->end_sk++].sk = sk;
+ }
+ batch_sks++;
+ }
+ }
+
+ /* Allocate a larger batch and try again. */
+ if (unlikely(resizes <= 1 && iter->end_sk &&
+ iter->end_sk != batch_sks)) {
+ resizes++;
+
+ /* First, try with GFP_USER to maximize the chances of
+ * grabbing more memory.
+ */
+ if (resizes == 1) {
+ spin_unlock_bh(&hslot2->lock);
+ err = bpf_iter_udp_realloc_batch(iter,
+ batch_sks * 3 / 2,
+ GFP_USER);
+ if (err)
+ return ERR_PTR(err);
+ /* Start over. */
+ goto again;
+ }
+
+ /* Next, hold onto the lock, so the bucket doesn't
+ * change while we get the rest of the sockets.
+ */
+ err = bpf_iter_udp_realloc_batch(iter, batch_sks,
+ GFP_NOWAIT);
+ if (err) {
+ spin_unlock_bh(&hslot2->lock);
+ return ERR_PTR(err);
+ }
+
+ /* Pick up where we left off. */
+ sk = iter->batch[iter->end_sk - 1].sk;
+ sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next,
+ struct sock,
+ __sk_common.skc_portaddr_node);
+ batch_sks = iter->end_sk;
+ goto fill_batch;
+ }
+
+ spin_unlock_bh(&hslot2->lock);
+
+ if (iter->end_sk)
+ break;
+next_bucket:
+ resizes = 0;
+ }
+
+ WARN_ON_ONCE(iter->end_sk != batch_sks);
+ return iter->end_sk ? iter->batch[0].sk : NULL;
+}
+
+static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ struct bpf_udp_iter_state *iter = seq->private;
+ struct sock *sk;
+
+ /* Whenever seq_next() is called, the iter->cur_sk is
+ * done with seq_show(), so unref the iter->cur_sk.
+ */
+ if (iter->cur_sk < iter->end_sk)
+ sock_put(iter->batch[iter->cur_sk++].sk);
+
+ /* After updating iter->cur_sk, check if there are more sockets
+ * available in the current bucket batch.
+ */
+ if (iter->cur_sk < iter->end_sk)
+ sk = iter->batch[iter->cur_sk].sk;
+ else
+ /* Prepare a new batch. */
+ sk = bpf_iter_udp_batch(seq);
+
+ ++*pos;
+ return sk;
+}
+
+static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ /* bpf iter does not support lseek, so it always
+ * continues from where it was stop()-ped.
+ */
+ if (*pos)
+ return bpf_iter_udp_batch(seq);
+
+ return SEQ_START_TOKEN;
+}
+
+static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
+ struct udp_sock *udp_sk, uid_t uid, int bucket)
+{
+ struct bpf_iter__udp ctx;
+
+ meta->seq_num--; /* skip SEQ_START_TOKEN */
+ ctx.meta = meta;
+ ctx.udp_sk = udp_sk;
+ ctx.uid = uid;
+ ctx.bucket = bucket;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v)
+{
+ struct udp_iter_state *state = seq->private;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ struct sock *sk = v;
+ uid_t uid;
+ int ret;
+
+ if (v == SEQ_START_TOKEN)
+ return 0;
+
+ lock_sock(sk);
+
+ if (unlikely(sk_unhashed(sk))) {
+ ret = SEQ_SKIP;
+ goto unlock;
+ }
+
+ uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk));
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, false);
+ ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket);
+
+unlock:
+ release_sock(sk);
+ return ret;
+}
+
+static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter)
+{
+ union bpf_udp_iter_batch_item *item;
+ unsigned int cur_sk = iter->cur_sk;
+ __u64 cookie;
+
+ /* Remember the cookies of the sockets we haven't seen yet, so we can
+ * pick up where we left off next time around.
+ */
+ while (cur_sk < iter->end_sk) {
+ item = &iter->batch[cur_sk++];
+ cookie = sock_gen_cookie(item->sk);
+ sock_put(item->sk);
+ item->cookie = cookie;
+ }
+}
+
+static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_udp_iter_state *iter = seq->private;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ if (!v) {
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, true);
+ if (prog)
+ (void)udp_prog_seq_show(prog, &meta, v, 0, 0);
+ }
+
+ if (iter->cur_sk < iter->end_sk)
+ bpf_iter_udp_put_batch(iter);
+}
+
+static const struct seq_operations bpf_iter_udp_seq_ops = {
+ .start = bpf_iter_udp_seq_start,
+ .next = bpf_iter_udp_seq_next,
+ .stop = bpf_iter_udp_seq_stop,
+ .show = bpf_iter_udp_seq_show,
+};
+#endif
+
+static unsigned short seq_file_family(const struct seq_file *seq)
+{
+ const struct udp_seq_afinfo *afinfo;
+
+#ifdef CONFIG_BPF_SYSCALL
+ /* BPF iterator: socket filtering is left to the bpf program. */
+ if (seq->op == &bpf_iter_udp_seq_ops)
+ return AF_UNSPEC;
+#endif
+
+ /* Proc fs iterator */
+ afinfo = pde_data(file_inode(seq->file));
+ return afinfo->family;
+}
+
const struct seq_operations udp_seq_ops = {
.start = udp_seq_start,
.next = udp_seq_next,
.stop = udp_seq_stop,
.show = udp4_seq_show,
};
-EXPORT_SYMBOL(udp_seq_ops);
+EXPORT_IPV6_MOD(udp_seq_ops);
static struct udp_seq_afinfo udp4_seq_afinfo = {
.family = AF_INET,
- .udp_table = &udp_table,
+ .udp_table = NULL,
};
static int __net_init udp4_proc_init_net(struct net *net)
@@ -2857,29 +3784,32 @@ __setup("uhash_entries=", set_uhash_entries);
void __init udp_table_init(struct udp_table *table, const char *name)
{
- unsigned int i;
+ unsigned int i, slot_size;
+ slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) +
+ udp_hash4_slot_size();
table->hash = alloc_large_system_hash(name,
- 2 * sizeof(struct udp_hslot),
+ slot_size,
uhash_entries,
21, /* one slot per 2 MB */
0,
&table->log,
&table->mask,
UDP_HTABLE_SIZE_MIN,
- 64 * 1024);
+ UDP_HTABLE_SIZE_MAX);
- table->hash2 = table->hash + (table->mask + 1);
+ table->hash2 = (void *)(table->hash + (table->mask + 1));
for (i = 0; i <= table->mask; i++) {
INIT_HLIST_HEAD(&table->hash[i].head);
table->hash[i].count = 0;
spin_lock_init(&table->hash[i].lock);
}
for (i = 0; i <= table->mask; i++) {
- INIT_HLIST_HEAD(&table->hash2[i].head);
- table->hash2[i].count = 0;
- spin_lock_init(&table->hash2[i].lock);
+ INIT_HLIST_HEAD(&table->hash2[i].hslot.head);
+ table->hash2[i].hslot.count = 0;
+ spin_lock_init(&table->hash2[i].hslot.lock);
}
+ udp_table_hash4_init(table);
}
u32 udp_flow_hashrnd(void)
@@ -2892,30 +3822,207 @@ u32 udp_flow_hashrnd(void)
}
EXPORT_SYMBOL(udp_flow_hashrnd);
-static void __udp_sysctl_init(struct net *net)
+static void __net_init udp_sysctl_init(struct net *net)
{
- net->ipv4.sysctl_udp_rmem_min = SK_MEM_QUANTUM;
- net->ipv4.sysctl_udp_wmem_min = SK_MEM_QUANTUM;
+ net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE;
+ net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE;
#ifdef CONFIG_NET_L3_MASTER_DEV
net->ipv4.sysctl_udp_l3mdev_accept = 0;
#endif
}
-static int __net_init udp_sysctl_init(struct net *net)
+static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries)
{
- __udp_sysctl_init(net);
+ struct udp_table *udptable;
+ unsigned int slot_size;
+ int i;
+
+ udptable = kmalloc(sizeof(*udptable), GFP_KERNEL);
+ if (!udptable)
+ goto out;
+
+ slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) +
+ udp_hash4_slot_size();
+ udptable->hash = vmalloc_huge(hash_entries * slot_size,
+ GFP_KERNEL_ACCOUNT);
+ if (!udptable->hash)
+ goto free_table;
+
+ udptable->hash2 = (void *)(udptable->hash + hash_entries);
+ udptable->mask = hash_entries - 1;
+ udptable->log = ilog2(hash_entries);
+
+ for (i = 0; i < hash_entries; i++) {
+ INIT_HLIST_HEAD(&udptable->hash[i].head);
+ udptable->hash[i].count = 0;
+ spin_lock_init(&udptable->hash[i].lock);
+
+ INIT_HLIST_HEAD(&udptable->hash2[i].hslot.head);
+ udptable->hash2[i].hslot.count = 0;
+ spin_lock_init(&udptable->hash2[i].hslot.lock);
+ }
+ udp_table_hash4_init(udptable);
+
+ return udptable;
+
+free_table:
+ kfree(udptable);
+out:
+ return NULL;
+}
+
+static void __net_exit udp_pernet_table_free(struct net *net)
+{
+ struct udp_table *udptable = net->ipv4.udp_table;
+
+ if (udptable == &udp_table)
+ return;
+
+ kvfree(udptable->hash);
+ kfree(udptable);
+}
+
+static void __net_init udp_set_table(struct net *net)
+{
+ struct udp_table *udptable;
+ unsigned int hash_entries;
+ struct net *old_net;
+
+ if (net_eq(net, &init_net))
+ goto fallback;
+
+ old_net = current->nsproxy->net_ns;
+ hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries);
+ if (!hash_entries)
+ goto fallback;
+
+ /* Set min to keep the bitmap on stack in udp_lib_get_port() */
+ if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET)
+ hash_entries = UDP_HTABLE_SIZE_MIN_PERNET;
+ else
+ hash_entries = roundup_pow_of_two(hash_entries);
+
+ udptable = udp_pernet_table_alloc(hash_entries);
+ if (udptable) {
+ net->ipv4.udp_table = udptable;
+ } else {
+ pr_warn("Failed to allocate UDP hash table (entries: %u) "
+ "for a netns, fallback to the global one\n",
+ hash_entries);
+fallback:
+ net->ipv4.udp_table = &udp_table;
+ }
+}
+
+static int __net_init udp_pernet_init(struct net *net)
+{
+#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
+ int i;
+
+ /* No tunnel is configured */
+ for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) {
+ INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list);
+ RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL);
+ }
+#endif
+ udp_sysctl_init(net);
+ udp_set_table(net);
+
return 0;
}
+static void __net_exit udp_pernet_exit(struct net *net)
+{
+ udp_pernet_table_free(net);
+}
+
static struct pernet_operations __net_initdata udp_sysctl_ops = {
- .init = udp_sysctl_init,
+ .init = udp_pernet_init,
+ .exit = udp_pernet_exit,
+};
+
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta,
+ struct udp_sock *udp_sk, uid_t uid, int bucket)
+
+static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
+ unsigned int new_batch_sz, gfp_t flags)
+{
+ union bpf_udp_iter_batch_item *new_batch;
+
+ new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch),
+ flags | __GFP_NOWARN);
+ if (!new_batch)
+ return -ENOMEM;
+
+ if (flags != GFP_NOWAIT)
+ bpf_iter_udp_put_batch(iter);
+
+ memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
+ kvfree(iter->batch);
+ iter->batch = new_batch;
+ iter->max_sk = new_batch_sz;
+
+ return 0;
+}
+
+#define INIT_BATCH_SZ 16
+
+static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+ struct bpf_udp_iter_state *iter = priv_data;
+ int ret;
+
+ ret = bpf_iter_init_seq_net(priv_data, aux);
+ if (ret)
+ return ret;
+
+ ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER);
+ if (ret)
+ bpf_iter_fini_seq_net(priv_data);
+
+ iter->state.bucket = -1;
+
+ return ret;
+}
+
+static void bpf_iter_fini_udp(void *priv_data)
+{
+ struct bpf_udp_iter_state *iter = priv_data;
+
+ bpf_iter_fini_seq_net(priv_data);
+ kvfree(iter->batch);
+}
+
+static const struct bpf_iter_seq_info udp_seq_info = {
+ .seq_ops = &bpf_iter_udp_seq_ops,
+ .init_seq_private = bpf_iter_init_udp,
+ .fini_seq_private = bpf_iter_fini_udp,
+ .seq_priv_size = sizeof(struct bpf_udp_iter_state),
};
+static struct bpf_iter_reg udp_reg_info = {
+ .target = "udp",
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__udp, udp_sk),
+ PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
+ },
+ .seq_info = &udp_seq_info,
+};
+
+static void __init bpf_iter_register(void)
+{
+ udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP];
+ if (bpf_iter_reg_target(&udp_reg_info))
+ pr_warn("Warning: could not register bpf iterator udp\n");
+}
+#endif
+
void __init udp_init(void)
{
unsigned long limit;
- unsigned int i;
udp_table_init(&udp_table, "UDP");
limit = nr_free_buffer_pages() / 8;
@@ -2924,17 +4031,10 @@ void __init udp_init(void)
sysctl_udp_mem[1] = limit;
sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
- __udp_sysctl_init(&init_net);
-
- /* 16 spinlocks per cpu */
- udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
- udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
- GFP_KERNEL);
- if (!udp_busylocks)
- panic("UDP: failed to alloc udp_busylocks\n");
- for (i = 0; i < (1U << udp_busylocks_log); i++)
- spin_lock_init(udp_busylocks + i);
-
if (register_pernet_subsys(&udp_sysctl_ops))
panic("UDP: failed to init sysctl parameters.\n");
+
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+ bpf_iter_register();
+#endif
}
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
new file mode 100644
index 000000000000..0735d820e413
--- /dev/null
+++ b/net/ipv4/udp_bpf.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2020 Cloudflare Ltd https://cloudflare.com */
+
+#include <linux/skmsg.h>
+#include <net/sock.h>
+#include <net/udp.h>
+#include <net/inet_common.h>
+
+#include "udp_impl.h"
+
+static struct proto *udpv6_prot_saved __read_mostly;
+
+static int sk_udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, int *addr_len)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return udpv6_prot_saved->recvmsg(sk, msg, len, flags, addr_len);
+#endif
+ return udp_prot.recvmsg(sk, msg, len, flags, addr_len);
+}
+
+static bool udp_sk_has_data(struct sock *sk)
+{
+ return !skb_queue_empty(&udp_sk(sk)->reader_queue) ||
+ !skb_queue_empty(&sk->sk_receive_queue);
+}
+
+static bool psock_has_data(struct sk_psock *psock)
+{
+ return !skb_queue_empty(&psock->ingress_skb) ||
+ !sk_psock_queue_empty(psock);
+}
+
+#define udp_msg_has_data(__sk, __psock) \
+ ({ udp_sk_has_data(__sk) || psock_has_data(__psock); })
+
+static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
+ long timeo)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ int ret = 0;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ return 1;
+
+ if (!timeo)
+ return ret;
+
+ add_wait_queue(sk_sleep(sk), &wait);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ ret = udp_msg_has_data(sk, psock);
+ if (!ret) {
+ wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+ ret = udp_msg_has_data(sk, psock);
+ }
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+ remove_wait_queue(sk_sleep(sk), &wait);
+ return ret;
+}
+
+static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ int flags, int *addr_len)
+{
+ struct sk_psock *psock;
+ int copied, ret;
+
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
+
+ if (!len)
+ return 0;
+
+ psock = sk_psock_get(sk);
+ if (unlikely(!psock))
+ return sk_udp_recvmsg(sk, msg, len, flags, addr_len);
+
+ if (!psock_has_data(psock)) {
+ ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len);
+ goto out;
+ }
+
+msg_bytes_ready:
+ copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
+ if (!copied) {
+ long timeo;
+ int data;
+
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ data = udp_msg_wait_data(sk, psock, timeo);
+ if (data) {
+ if (psock_has_data(psock))
+ goto msg_bytes_ready;
+ ret = sk_udp_recvmsg(sk, msg, len, flags, addr_len);
+ goto out;
+ }
+ copied = -EAGAIN;
+ }
+ ret = copied;
+out:
+ sk_psock_put(sk, psock);
+ return ret;
+}
+
+enum {
+ UDP_BPF_IPV4,
+ UDP_BPF_IPV6,
+ UDP_BPF_NUM_PROTS,
+};
+
+static DEFINE_SPINLOCK(udpv6_prot_lock);
+static struct proto udp_bpf_prots[UDP_BPF_NUM_PROTS];
+
+static void udp_bpf_rebuild_protos(struct proto *prot, const struct proto *base)
+{
+ *prot = *base;
+ prot->close = sock_map_close;
+ prot->recvmsg = udp_bpf_recvmsg;
+ prot->sock_is_readable = sk_msg_is_readable;
+}
+
+static void udp_bpf_check_v6_needs_rebuild(struct proto *ops)
+{
+ if (unlikely(ops != smp_load_acquire(&udpv6_prot_saved))) {
+ spin_lock_bh(&udpv6_prot_lock);
+ if (likely(ops != udpv6_prot_saved)) {
+ udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV6], ops);
+ smp_store_release(&udpv6_prot_saved, ops);
+ }
+ spin_unlock_bh(&udpv6_prot_lock);
+ }
+}
+
+static int __init udp_bpf_v4_build_proto(void)
+{
+ udp_bpf_rebuild_protos(&udp_bpf_prots[UDP_BPF_IPV4], &udp_prot);
+ return 0;
+}
+late_initcall(udp_bpf_v4_build_proto);
+
+int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+{
+ int family = sk->sk_family == AF_INET ? UDP_BPF_IPV4 : UDP_BPF_IPV6;
+
+ if (restore) {
+ sk->sk_write_space = psock->saved_write_space;
+ sock_replace_proto(sk, psock->sk_proto);
+ return 0;
+ }
+
+ if (sk->sk_family == AF_INET6)
+ udp_bpf_check_v6_needs_rebuild(psock->sk_proto);
+
+ sock_replace_proto(sk, &udp_bpf_prots[family]);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(udp_bpf_update_proto);
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index d9ad986c7b2c..6e491c720c90 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* udp_diag.c Module for monitoring UDP transport protocols sockets.
*
* Authors: Pavel Emelyanov, <xemul@parallels.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
@@ -20,28 +16,28 @@
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *req,
- struct nlattr *bc, bool net_admin)
+ bool net_admin)
{
- if (!inet_diag_bc_sk(bc, sk))
+ if (!inet_diag_bc_sk(cb->data, sk))
return 0;
- return inet_sk_diag_fill(sk, NULL, skb, req,
- sk_user_ns(NETLINK_CB(cb->skb).sk),
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, net_admin);
+ return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI,
+ net_admin);
}
-static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
- const struct nlmsghdr *nlh,
+static int udp_dump_one(struct udp_table *tbl,
+ struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
- int err = -EINVAL;
+ struct sk_buff *in_skb = cb->skb;
+ int err;
struct sock *sk = NULL;
struct sk_buff *rep;
struct net *net = sock_net(in_skb->sk);
rcu_read_lock();
if (req->sdiag_family == AF_INET)
+ /* src and dst are swapped for historical reasons */
sk = __udp4_lib_lookup(net,
req->id.idiag_src[0], req->id.idiag_sport,
req->id.idiag_dst[0], req->id.idiag_dport,
@@ -67,26 +63,22 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
goto out;
err = -ENOMEM;
- rep = nlmsg_new(sizeof(struct inet_diag_msg) +
- sizeof(struct inet_diag_meminfo) + 64,
+ rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) +
+ inet_diag_msg_attrs_size() +
+ nla_total_size(sizeof(struct inet_diag_meminfo)) + 64,
GFP_KERNEL);
if (!rep)
goto out;
- err = inet_sk_diag_fill(sk, NULL, rep, req,
- sk_user_ns(NETLINK_CB(in_skb).sk),
- NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0, nlh,
- netlink_net_capable(in_skb, CAP_NET_ADMIN));
+ err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0,
+ netlink_net_capable(in_skb, CAP_NET_ADMIN));
if (err < 0) {
WARN_ON(err == -EMSGSIZE);
kfree_skb(rep);
goto out;
}
- err = netlink_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid,
- MSG_DONTWAIT);
- if (err > 0)
- err = 0;
+ err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
out:
if (sk)
sock_put(sk);
@@ -96,7 +88,7 @@ out_nosk:
static void udp_dump(struct udp_table *table, struct sk_buff *skb,
struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct net *net = sock_net(skb->sk);
@@ -134,7 +126,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
r->id.idiag_dport)
goto next;
- if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
+ if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) {
spin_unlock_bh(&hslot->lock);
goto done;
}
@@ -149,15 +141,15 @@ done:
}
static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r, struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
- udp_dump(&udp_table, skb, cb, r, bc);
+ udp_dump(sock_net(cb->skb->sk)->ipv4.udp_table, skb, cb, r);
}
-static int udp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+static int udp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
- return udp_dump_one(&udp_table, in_skb, nlh, req);
+ return udp_dump_one(sock_net(cb->skb->sk)->ipv4.udp_table, cb, req);
}
static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
@@ -229,7 +221,7 @@ static int __udp_diag_destroy(struct sk_buff *in_skb,
static int udp_diag_destroy(struct sk_buff *in_skb,
const struct inet_diag_req_v2 *req)
{
- return __udp_diag_destroy(in_skb, req, &udp_table);
+ return __udp_diag_destroy(in_skb, req, sock_net(in_skb->sk)->ipv4.udp_table);
}
static int udplite_diag_destroy(struct sk_buff *in_skb,
@@ -241,6 +233,7 @@ static int udplite_diag_destroy(struct sk_buff *in_skb,
#endif
static const struct inet_diag_handler udp_diag_handler = {
+ .owner = THIS_MODULE,
.dump = udp_diag_dump,
.dump_one = udp_diag_dump_one,
.idiag_get_info = udp_diag_get_info,
@@ -252,19 +245,19 @@ static const struct inet_diag_handler udp_diag_handler = {
};
static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r,
- struct nlattr *bc)
+ const struct inet_diag_req_v2 *r)
{
- udp_dump(&udplite_table, skb, cb, r, bc);
+ udp_dump(&udplite_table, skb, cb, r);
}
-static int udplite_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
+static int udplite_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
- return udp_dump_one(&udplite_table, in_skb, nlh, req);
+ return udp_dump_one(&udplite_table, cb, req);
}
static const struct inet_diag_handler udplite_diag_handler = {
+ .owner = THIS_MODULE,
.dump = udplite_diag_dump,
.dump_one = udplite_diag_dump_one,
.idiag_get_info = udp_diag_get_info,
@@ -301,5 +294,6 @@ static void __exit udp_diag_exit(void)
module_init(udp_diag_init);
module_exit(udp_diag_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("UDP socket monitoring via SOCK_DIAG");
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */);
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h
index e7d18b140287..c7142213fc21 100644
--- a/net/ipv4/udp_impl.h
+++ b/net/ipv4/udp_impl.h
@@ -1,31 +1,25 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _UDP4_IMPL_H
#define _UDP4_IMPL_H
+#include <net/aligned_data.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <net/protocol.h>
#include <net/inet_common.h>
int __udp4_lib_rcv(struct sk_buff *, struct udp_table *, int);
-void __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
+int __udp4_lib_err(struct sk_buff *, u32, struct udp_table *);
int udp_v4_get_port(struct sock *sk, unsigned short snum);
+void udp_v4_rehash(struct sock *sk);
-int udp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen);
+int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen);
int udp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
-#ifdef CONFIG_COMPAT
-int compat_udp_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen);
-int compat_udp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
-#endif
-int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
- int flags, int *addr_len);
-int udp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
- int flags);
+int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ int *addr_len);
void udp_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 0c0522b79b43..19d0b5b09ffa 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -1,18 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPV4 GSO/GRO offload support
* Linux INET implementation
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* UDPv4 GSO support
*/
#include <linux/skbuff.h>
+#include <net/gro.h>
+#include <net/gso.h>
#include <net/udp.h>
#include <net/protocol.h>
+#include <net/inet_common.h>
+#include <net/udp_tunnel.h>
+
+#if IS_ENABLED(CONFIG_NET_UDP_TUNNEL)
+
+/*
+ * Dummy GRO tunnel callback, exists mainly to avoid dangling/NULL
+ * values for the udp tunnel static call.
+ */
+static struct sk_buff *dummy_gro_rcv(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
+{
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+}
+
+typedef struct sk_buff *(*udp_tunnel_gro_rcv_t)(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb);
+
+struct udp_tunnel_type_entry {
+ udp_tunnel_gro_rcv_t gro_receive;
+ refcount_t count;
+};
+
+#define UDP_MAX_TUNNEL_TYPES (IS_ENABLED(CONFIG_GENEVE) + \
+ IS_ENABLED(CONFIG_VXLAN) * 2 + \
+ IS_ENABLED(CONFIG_NET_FOU) * 2 + \
+ IS_ENABLED(CONFIG_XFRM) * 2)
+
+DEFINE_STATIC_CALL(udp_tunnel_gro_rcv, dummy_gro_rcv);
+static DEFINE_STATIC_KEY_FALSE(udp_tunnel_static_call);
+static DEFINE_MUTEX(udp_tunnel_gro_type_lock);
+static struct udp_tunnel_type_entry udp_tunnel_gro_types[UDP_MAX_TUNNEL_TYPES];
+static unsigned int udp_tunnel_gro_type_nr;
+static DEFINE_SPINLOCK(udp_tunnel_gro_lock);
+
+void udp_tunnel_update_gro_lookup(struct net *net, struct sock *sk, bool add)
+{
+ bool is_ipv6 = sk->sk_family == AF_INET6;
+ struct udp_sock *tup, *up = udp_sk(sk);
+ struct udp_tunnel_gro *udp_tunnel_gro;
+
+ spin_lock(&udp_tunnel_gro_lock);
+ udp_tunnel_gro = &net->ipv4.udp_tunnel_gro[is_ipv6];
+ if (add)
+ hlist_add_head(&up->tunnel_list, &udp_tunnel_gro->list);
+ else if (up->tunnel_list.pprev)
+ hlist_del_init(&up->tunnel_list);
+
+ if (udp_tunnel_gro->list.first &&
+ !udp_tunnel_gro->list.first->next) {
+ tup = hlist_entry(udp_tunnel_gro->list.first, struct udp_sock,
+ tunnel_list);
+
+ rcu_assign_pointer(udp_tunnel_gro->sk, (struct sock *)tup);
+ } else {
+ RCU_INIT_POINTER(udp_tunnel_gro->sk, NULL);
+ }
+
+ spin_unlock(&udp_tunnel_gro_lock);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_lookup);
+
+void udp_tunnel_update_gro_rcv(struct sock *sk, bool add)
+{
+ struct udp_tunnel_type_entry *cur = NULL;
+ struct udp_sock *up = udp_sk(sk);
+ int i, old_gro_type_nr;
+
+ if (!UDP_MAX_TUNNEL_TYPES || !up->gro_receive)
+ return;
+
+ mutex_lock(&udp_tunnel_gro_type_lock);
+
+ /* Check if the static call is permanently disabled. */
+ if (udp_tunnel_gro_type_nr > UDP_MAX_TUNNEL_TYPES)
+ goto out;
+
+ for (i = 0; i < udp_tunnel_gro_type_nr; i++)
+ if (udp_tunnel_gro_types[i].gro_receive == up->gro_receive)
+ cur = &udp_tunnel_gro_types[i];
+
+ old_gro_type_nr = udp_tunnel_gro_type_nr;
+ if (add) {
+ /*
+ * Update the matching entry, if found, or add a new one
+ * if needed
+ */
+ if (cur) {
+ refcount_inc(&cur->count);
+ goto out;
+ }
+
+ if (unlikely(udp_tunnel_gro_type_nr == UDP_MAX_TUNNEL_TYPES)) {
+ pr_err_once("Too many UDP tunnel types, please increase UDP_MAX_TUNNEL_TYPES\n");
+ /* Ensure static call will never be enabled */
+ udp_tunnel_gro_type_nr = UDP_MAX_TUNNEL_TYPES + 1;
+ } else {
+ cur = &udp_tunnel_gro_types[udp_tunnel_gro_type_nr++];
+ refcount_set(&cur->count, 1);
+ cur->gro_receive = up->gro_receive;
+ }
+ } else {
+ /*
+ * The stack cleans up only successfully added tunnels, so the
+ * lookup on removal should never fail.
+ */
+ if (WARN_ON_ONCE(!cur))
+ goto out;
+
+ if (!refcount_dec_and_test(&cur->count))
+ goto out;
+
+ /* Avoid gaps, so that the enabled tunnel always has id 0 */
+ *cur = udp_tunnel_gro_types[--udp_tunnel_gro_type_nr];
+ }
+
+ if (udp_tunnel_gro_type_nr == 1) {
+ static_call_update(udp_tunnel_gro_rcv,
+ udp_tunnel_gro_types[0].gro_receive);
+ static_branch_enable(&udp_tunnel_static_call);
+ } else if (old_gro_type_nr == 1) {
+ static_branch_disable(&udp_tunnel_static_call);
+ static_call_update(udp_tunnel_gro_rcv, dummy_gro_rcv);
+ }
+
+out:
+ mutex_unlock(&udp_tunnel_gro_type_lock);
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_update_gro_rcv);
+
+static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
+{
+ if (static_branch_likely(&udp_tunnel_static_call)) {
+ if (unlikely(gro_recursion_inc_test(skb))) {
+ NAPI_GRO_CB(skb)->flush |= 1;
+ return NULL;
+ }
+ return static_call(udp_tunnel_gro_rcv)(sk, head, skb);
+ }
+ return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
+}
+
+#else
+
+static struct sk_buff *udp_tunnel_gro_rcv(struct sock *sk,
+ struct list_head *head,
+ struct sk_buff *skb)
+{
+ return call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
+}
+
+#endif
static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t features,
@@ -52,6 +207,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
__skb_pull(skb, tnl_hlen);
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb_inner_network_offset(skb));
+ skb_set_transport_header(skb, skb_inner_transport_offset(skb));
skb->mac_len = skb_inner_network_offset(skb);
skb->protocol = new_protocol;
@@ -61,7 +217,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
skb->remcsum_offload = remcsum;
- need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
+ need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb);
/* Try to offload checksum if possible */
offload_csum = !!(need_csum &&
!need_ipsec &&
@@ -70,6 +226,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
(NETIF_F_HW_CSUM | NETIF_F_IP_CSUM))));
features &= skb->dev->hw_enc_features;
+ if (need_csum)
+ features &= ~NETIF_F_SCTP_CRC;
/* The only checksum offload we care about from here on out is the
* outer one so strip the existing checksum feature flags and
@@ -152,8 +310,8 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t features,
bool is_ipv6)
{
+ const struct net_offload __rcu **offloads;
__be16 protocol = skb->protocol;
- const struct net_offload **offloads;
const struct net_offload *ops;
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct sk_buff *(*gso_inner_segment)(struct sk_buff *skb,
@@ -187,8 +345,140 @@ out_unlock:
}
EXPORT_SYMBOL(skb_udp_tunnel_segment);
+/*
+ * Rewrite one address/port pair of @seg and fix up the checksums.
+ *
+ * *@oldip and *@oldport are compared with *@newip and *@newport and then
+ * overwritten with them; the list helper below passes pointers into @seg's
+ * own IP and UDP headers for the "old" values. Nothing is touched when both
+ * values already match. The UDP checksum is only adjusted when it is
+ * non-zero; the IP header checksum is adjusted unconditionally.
+ */
+static void __udpv4_gso_segment_csum(struct sk_buff *seg,
+ __be32 *oldip, __be32 *newip,
+ __be16 *oldport, __be16 *newport)
+{
+ struct udphdr *uh;
+ struct iphdr *iph;
+
+ if (*oldip == *newip && *oldport == *newport)
+ return;
+
+ uh = udp_hdr(seg);
+ iph = ip_hdr(seg);
+
+ if (uh->check) {
+ inet_proto_csum_replace4(&uh->check, seg, *oldip, *newip,
+ true);
+ inet_proto_csum_replace2(&uh->check, seg, *oldport, *newport,
+ false);
+ /* Never leave an adjusted checksum at 0; store CSUM_MANGLED_0 */
+ if (!uh->check)
+ uh->check = CSUM_MANGLED_0;
+ }
+ *oldport = *newport;
+
+ csum_replace4(&iph->check, *oldip, *newip);
+ *oldip = *newip;
+}
+
+/*
+ * Give every segment after the first one the first segment's IPv4 addresses
+ * and UDP ports, adjusting checksums as needed. Returns @segs unchanged as a
+ * pointer.
+ *
+ * NOTE(review): seg->next is dereferenced without a NULL check, so the list
+ * is presumably guaranteed to hold at least two segments - confirm.
+ */
+static struct sk_buff *__udpv4_gso_segment_list_csum(struct sk_buff *segs)
+{
+ struct sk_buff *seg;
+ struct udphdr *uh, *uh2;
+ struct iphdr *iph, *iph2;
+
+ seg = segs;
+ uh = udp_hdr(seg);
+ iph = ip_hdr(seg);
+
+ /* Fast path: only the second segment is compared against the head */
+ if ((udp_hdr(seg)->dest == udp_hdr(seg->next)->dest) &&
+ (udp_hdr(seg)->source == udp_hdr(seg->next)->source) &&
+ (ip_hdr(seg)->daddr == ip_hdr(seg->next)->daddr) &&
+ (ip_hdr(seg)->saddr == ip_hdr(seg->next)->saddr))
+ return segs;
+
+ /* Slow path: rewrite source and destination of each later segment */
+ while ((seg = seg->next)) {
+ uh2 = udp_hdr(seg);
+ iph2 = ip_hdr(seg);
+
+ __udpv4_gso_segment_csum(seg,
+ &iph2->saddr, &iph->saddr,
+ &uh2->source, &uh->source);
+ __udpv4_gso_segment_csum(seg,
+ &iph2->daddr, &iph->daddr,
+ &uh2->dest, &uh->dest);
+ }
+
+ return segs;
+}
+
+/*
+ * IPv6 counterpart of the address/port rewrite helper.
+ *
+ * *@oldip and *@oldport are overwritten with *@newip and @newport unless both
+ * already match. Only the UDP checksum is adjusted, and only when it is
+ * non-zero; no IP header checksum is updated here.
+ */
+static void __udpv6_gso_segment_csum(struct sk_buff *seg,
+ struct in6_addr *oldip,
+ const struct in6_addr *newip,
+ __be16 *oldport, __be16 newport)
+{
+ struct udphdr *uh = udp_hdr(seg);
+
+ if (ipv6_addr_equal(oldip, newip) && *oldport == newport)
+ return;
+
+ if (uh->check) {
+ inet_proto_csum_replace16(&uh->check, seg, oldip->s6_addr32,
+ newip->s6_addr32, true);
+
+ inet_proto_csum_replace2(&uh->check, seg, *oldport, newport,
+ false);
+ /* Never leave an adjusted checksum at 0; store CSUM_MANGLED_0 */
+ if (!uh->check)
+ uh->check = CSUM_MANGLED_0;
+ }
+
+ *oldip = *newip;
+ *oldport = newport;
+}
+
+/*
+ * Give every segment after the first one the first segment's IPv6 addresses
+ * and UDP ports, adjusting the UDP checksum as needed. Returns @segs.
+ *
+ * NOTE(review): seg->next is dereferenced without a NULL check, so the list
+ * is presumably guaranteed to hold at least two segments - confirm.
+ */
+static struct sk_buff *__udpv6_gso_segment_list_csum(struct sk_buff *segs)
+{
+ const struct ipv6hdr *iph;
+ const struct udphdr *uh;
+ struct ipv6hdr *iph2;
+ struct sk_buff *seg;
+ struct udphdr *uh2;
+
+ seg = segs;
+ uh = udp_hdr(seg);
+ iph = ipv6_hdr(seg);
+ uh2 = udp_hdr(seg->next);
+ iph2 = ipv6_hdr(seg->next);
+
+ /*
+ * Fast path: compare the 32-bit word starting at ->source (presumably
+ * both ports, assuming source and dest are adjacent 16-bit fields) and
+ * both addresses of the second segment against the head.
+ */
+ if (!(*(const u32 *)&uh->source ^ *(const u32 *)&uh2->source) &&
+ ipv6_addr_equal(&iph->saddr, &iph2->saddr) &&
+ ipv6_addr_equal(&iph->daddr, &iph2->daddr))
+ return segs;
+
+ /* Slow path: rewrite source and destination of each later segment */
+ while ((seg = seg->next)) {
+ uh2 = udp_hdr(seg);
+ iph2 = ipv6_hdr(seg);
+
+ __udpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr,
+ &uh2->source, uh->source);
+ __udpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr,
+ &uh2->dest, uh->dest);
+ }
+
+ return segs;
+}
+
+/*
+ * Segment a fraglist GSO skb with skb_segment_list().
+ *
+ * On success the UDP length of the first returned skb is set to the UDP
+ * header size plus gso_size (read before segmentation), and the list is then
+ * passed to the IPv6 or IPv4 address/port fix-up helper depending on
+ * @is_ipv6. An error pointer from skb_segment_list() is returned as is.
+ */
+static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb,
+ netdev_features_t features,
+ bool is_ipv6)
+{
+ unsigned int mss = skb_shinfo(skb)->gso_size;
+
+ skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
+ if (IS_ERR(skb))
+ return skb;
+
+ udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss);
+
+ if (is_ipv6)
+ return __udpv6_gso_segment_list_csum(skb);
+ else
+ return __udpv4_gso_segment_list_csum(skb);
+}
+
struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
- netdev_features_t features)
+ netdev_features_t features, bool is_ipv6)
{
struct sock *sk = gso_skb->sk;
unsigned int sum_truesize = 0;
@@ -198,22 +488,70 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
bool copy_dtor;
__sum16 check;
__be16 newlen;
+ int ret = 0;
mss = skb_shinfo(gso_skb)->gso_size;
if (gso_skb->len <= sizeof(*uh) + mss)
return ERR_PTR(-EINVAL);
+ if (unlikely(skb_checksum_start(gso_skb) !=
+ skb_transport_header(gso_skb) &&
+ !(skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST)))
+ return ERR_PTR(-EINVAL);
+
+ /* We don't know if egress device can segment and checksum the packet
+ * when IPv6 extension headers are present. Fall back to software GSO.
+ */
+ if (gso_skb->ip_summed != CHECKSUM_PARTIAL)
+ features &= ~(NETIF_F_GSO_UDP_L4 | NETIF_F_CSUM_MASK);
+
+ if (skb_gso_ok(gso_skb, features | NETIF_F_GSO_ROBUST)) {
+ /* Packet is from an untrusted source, reset gso_segs. */
+ skb_shinfo(gso_skb)->gso_segs = DIV_ROUND_UP(gso_skb->len - sizeof(*uh),
+ mss);
+ return NULL;
+ }
+
+ if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) {
+ /* Detect modified geometry and pass those to skb_segment. */
+ if (skb_pagelen(gso_skb) - sizeof(*uh) == skb_shinfo(gso_skb)->gso_size)
+ return __udp_gso_segment_list(gso_skb, features, is_ipv6);
+
+ ret = __skb_linearize(gso_skb);
+ if (ret)
+ return ERR_PTR(ret);
+
+ /* Setup csum, as fraglist skips this in udp4_gro_receive. */
+ gso_skb->csum_start = skb_transport_header(gso_skb) - gso_skb->head;
+ gso_skb->csum_offset = offsetof(struct udphdr, check);
+ gso_skb->ip_summed = CHECKSUM_PARTIAL;
+
+ uh = udp_hdr(gso_skb);
+ if (is_ipv6)
+ uh->check = ~udp_v6_check(gso_skb->len,
+ &ipv6_hdr(gso_skb)->saddr,
+ &ipv6_hdr(gso_skb)->daddr, 0);
+ else
+ uh->check = ~udp_v4_check(gso_skb->len,
+ ip_hdr(gso_skb)->saddr,
+ ip_hdr(gso_skb)->daddr, 0);
+ }
+
skb_pull(gso_skb, sizeof(*uh));
/* clear destructor to avoid skb_segment assigning it to tail */
copy_dtor = gso_skb->destructor == sock_wfree;
- if (copy_dtor)
+ if (copy_dtor) {
gso_skb->destructor = NULL;
+ gso_skb->sk = NULL;
+ }
segs = skb_segment(gso_skb, features);
- if (unlikely(IS_ERR_OR_NULL(segs))) {
- if (copy_dtor)
+ if (IS_ERR_OR_NULL(segs)) {
+ if (copy_dtor) {
gso_skb->destructor = sock_wfree;
+ gso_skb->sk = sk;
+ }
return segs;
}
@@ -227,6 +565,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
seg = segs;
uh = udp_hdr(seg);
+ /* preserve TX timestamp flags and TS key for first segment */
+ skb_shinfo(seg)->tskey = skb_shinfo(gso_skb)->tskey;
+ skb_shinfo(seg)->tx_flags |=
+ (skb_shinfo(gso_skb)->tx_flags & SKBTX_ANY_TSTAMP);
+
/* compute checksum adjustment based on old length versus new */
newlen = htons(sizeof(*uh) + mss);
check = csum16_add(csum16_sub(uh->check, uh->len), newlen);
@@ -267,6 +610,14 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
else
uh->check = gso_make_checksum(seg, ~check) ? : CSUM_MANGLED_0;
+ /* On the TX path, CHECKSUM_NONE and CHECKSUM_UNNECESSARY have the same
+ * meaning. However, check for bad offloads in the GSO stack expects the
+ * latter, if the checksum was calculated in software. To vouch for the
+ * segment skbs we actually need to set it on the gso_skb.
+ */
+ if (gso_skb->ip_summed == CHECKSUM_NONE)
+ gso_skb->ip_summed = CHECKSUM_UNNECESSARY;
+
/* update refcount for the packet */
if (copy_dtor) {
int delta = sum_truesize - gso_skb->truesize;
@@ -306,7 +657,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
goto out;
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
- return __udp_gso_segment(skb, features);
+ return __udp_gso_segment(skb, features, false);
mss = skb_shinfo(skb)->gso_size;
if (unlikely(skb->len <= mss))
@@ -343,18 +694,128 @@ out:
return segs;
}
+
+/* NAPI_GRO_CB()->count at or above which a held packet is completed */
+#define UDP_GRO_CNT_MAX 64
+/*
+ * Try to merge @skb into a packet of the same UDP flow held on @head.
+ *
+ * Only the first held packet that is still marked same_flow and carries the
+ * same port pair is considered: the function returns from inside the loop
+ * for that packet. Returns the held packet that should be completed, or NULL
+ * when nothing has to be completed. When @skb itself cannot be aggregated,
+ * NAPI_GRO_CB(skb)->flush is set.
+ */
+static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
+ struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_gro_udphdr(skb);
+ struct sk_buff *pp = NULL;
+ struct udphdr *uh2;
+ struct sk_buff *p;
+ unsigned int ulen;
+ int ret = 0;
+ int flush;
+
+ /* requires non zero csum, for symmetry with GSO */
+ if (!uh->check) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ /* Do not deal with padded or malicious packets, sorry ! */
+ ulen = ntohs(uh->len);
+ if (ulen <= sizeof(*uh) || ulen != skb_gro_len(skb)) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+ /* pull encapsulating udp header */
+ skb_gro_pull(skb, sizeof(struct udphdr));
+
+ list_for_each_entry(p, head, list) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ uh2 = udp_hdr(p);
+
+ /* Match ports only, as csum is always non zero */
+ if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ /* Fraglist and non-fraglist packets are never merged together */
+ if (NAPI_GRO_CB(skb)->is_flist != NAPI_GRO_CB(p)->is_flist) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return p;
+ }
+
+ flush = gro_receive_network_flush(uh, uh2, p);
+
+ /* Terminate the flow on len mismatch or if it grows "too much".
+ * Under small packet flood GRO count could otherwise grow a lot
+ * leading to excessive truesize values.
+ * On len mismatch merge the first packet shorter than gso_size,
+ * otherwise complete the GRO packet.
+ */
+ if (ulen > ntohs(uh2->len) || flush) {
+ pp = p;
+ } else {
+ if (NAPI_GRO_CB(skb)->is_flist) {
+ if (!pskb_may_pull(skb, skb_gro_offset(skb))) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+ /* Checksum state must agree with the held packet */
+ if ((skb->ip_summed != p->ip_summed) ||
+ (skb->csum_level != p->csum_level)) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+ skb_set_network_header(skb, skb_gro_receive_network_offset(skb));
+ ret = skb_gro_receive_list(p, skb);
+ } else {
+ skb_gro_postpull_rcsum(skb, uh,
+ sizeof(struct udphdr));
+
+ ret = skb_gro_receive(p, skb);
+ }
+ }
+
+ /* Merge error, shorter last segment or count limit: complete p */
+ if (ret || ulen != ntohs(uh2->len) ||
+ NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX)
+ pp = p;
+
+ return pp;
+ }
+
+ /* mismatch, but we never need to flush */
+ return NULL;
+}
+
struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
- struct udphdr *uh, udp_lookup_t lookup)
+ struct udphdr *uh, struct sock *sk)
{
struct sk_buff *pp = NULL;
struct sk_buff *p;
struct udphdr *uh2;
unsigned int off = skb_gro_offset(skb);
int flush = 1;
- struct sock *sk;
+
+ /* We can do L4 aggregation only if the packet can't land in a tunnel
+ * otherwise we could corrupt the inner stream. Detecting such packets
+ * cannot be foolproof and the aggregation might still happen in some
+ * cases. Such packets should be caught in udp_unexpected_gso later.
+ */
+ NAPI_GRO_CB(skb)->is_flist = 0;
+ if (!sk || !udp_sk(sk)->gro_receive) {
+ /* If the packet was locally encapsulated in a UDP tunnel that
+ * wasn't detected above, do not GRO.
+ */
+ if (skb->encapsulation)
+ goto out;
+
+ if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
+ NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1;
+
+ if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) ||
+ (sk && udp_test_bit(GRO_ENABLED, sk)) || NAPI_GRO_CB(skb)->is_flist)
+ return call_gro_receive(udp_gro_receive_segment, head, skb);
+
+ /* no GRO, be sure to flush the current packet */
+ goto out;
+ }
if (NAPI_GRO_CB(skb)->encap_mark ||
- (skb->ip_summed != CHECKSUM_PARTIAL &&
+ (uh->check && skb->ip_summed != CHECKSUM_PARTIAL &&
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
!NAPI_GRO_CB(skb)->csum_valid))
goto out;
@@ -362,14 +823,6 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
/* mark that this skb passed once through the tunnel gro layer */
NAPI_GRO_CB(skb)->encap_mark = 1;
- rcu_read_lock();
- sk = (*lookup)(skb, uh->source, uh->dest);
-
- if (sk && udp_sk(sk)->gro_receive)
- goto unflush;
- goto out_unlock;
-
-unflush:
flush = 0;
list_for_each_entry(p, head, list) {
@@ -390,20 +843,39 @@ unflush:
skb_gro_pull(skb, sizeof(struct udphdr)); /* pull encapsulating udp header */
skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr));
- pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb);
+ pp = udp_tunnel_gro_rcv(sk, head, skb);
-out_unlock:
- rcu_read_unlock();
out:
skb_gro_flush_final(skb, pp, flush);
return pp;
}
EXPORT_SYMBOL(udp_gro_receive);
-static struct sk_buff *udp4_gro_receive(struct list_head *head,
- struct sk_buff *skb)
+/*
+ * Find the UDP socket for @skb during GRO.
+ *
+ * The socket returned by udp_tunnel_sk(net, false) is used directly when its
+ * sk_num equals @dport; otherwise a lookup through __udp4_lib_lookup() is
+ * done with the addresses from the GRO network header and the iif/sdif
+ * obtained from inet_get_iif_sdif().
+ */
+static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
+ __be16 dport)
+{
+ const struct iphdr *iph = skb_gro_network_header(skb);
+ struct net *net = dev_net_rcu(skb->dev);
+ struct sock *sk;
+ int iif, sdif;
+
+ /* Shortcut: tunnel socket for this netns whose port matches */
+ sk = udp_tunnel_sk(net, false);
+ if (sk && dport == htons(sk->sk_num))
+ return sk;
+
+ inet_get_iif_sdif(skb, &iif, &sdif);
+
+ return __udp4_lib_lookup(net, iph->saddr, sport,
+ iph->daddr, dport, iif,
+ sdif, net->ipv4.udp_table, NULL);
+}
+
+INDIRECT_CALLABLE_SCOPE
+struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
+ struct sock *sk = NULL;
+ struct sk_buff *pp;
if (unlikely(!uh))
goto flush;
@@ -416,38 +888,67 @@ static struct sk_buff *udp4_gro_receive(struct list_head *head,
inet_gro_compute_pseudo))
goto flush;
else if (uh->check)
- skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+ skb_gro_checksum_try_convert(skb, IPPROTO_UDP,
inet_gro_compute_pseudo);
skip:
- NAPI_GRO_CB(skb)->is_ipv6 = 0;
- return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb);
+ if (static_branch_unlikely(&udp_encap_needed_key))
+ sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest);
+
+ pp = udp_gro_receive(head, skb, uh, sk);
+ return pp;
flush:
NAPI_GRO_CB(skb)->flush = 1;
return NULL;
}
+/*
+ * Finalize a GRO packet built from plain UDP segments.
+ *
+ * Sets up CHECKSUM_PARTIAL with csum_start at the UDP header and csum_offset
+ * at its check field, records the GRO count as gso_segs and adds
+ * SKB_GSO_UDP_L4 to gso_type. Always returns 0.
+ */
+static int udp_gro_complete_segment(struct sk_buff *skb)
+{
+ struct udphdr *uh = udp_hdr(skb);
+
+ skb->csum_start = (unsigned char *)uh - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_L4;
+
+ /* For encapsulated skbs this UDP header is the inner transport header */
+ if (skb->encapsulation)
+ skb->inner_transport_header = skb->transport_header;
+
+ return 0;
+}
+
int udp_gro_complete(struct sk_buff *skb, int nhoff,
udp_lookup_t lookup)
{
__be16 newlen = htons(skb->len - nhoff);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
- int err = -ENOSYS;
struct sock *sk;
+ int err;
uh->len = newlen;
- /* Set encapsulation before calling into inner gro_complete() functions
- * to make them set up the inner offsets.
- */
- skb->encapsulation = 1;
+ sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb,
+ udp4_lib_lookup_skb, skb, uh->source, uh->dest);
+ if (sk && udp_sk(sk)->gro_complete) {
+ skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM
+ : SKB_GSO_UDP_TUNNEL;
- rcu_read_lock();
- sk = (*lookup)(skb, uh->source, uh->dest);
- if (sk && udp_sk(sk)->gro_complete)
+ /* clear the encap mark, so that inner frag_list gro_complete
+ * can take place
+ */
+ NAPI_GRO_CB(skb)->encap_mark = 0;
+
+ /* Set encapsulation before calling into inner gro_complete()
+ * functions to make them set up the inner offsets.
+ */
+ skb->encapsulation = 1;
err = udp_sk(sk)->gro_complete(sk, skb,
nhoff + sizeof(struct udphdr));
- rcu_read_unlock();
+ } else {
+ err = udp_gro_complete_segment(skb);
+ }
if (skb->remcsum_offload)
skb_shinfo(skb)->gso_type |= SKB_GSO_TUNNEL_REMCSUM;
@@ -456,31 +957,40 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff,
}
EXPORT_SYMBOL(udp_gro_complete);
-static int udp4_gro_complete(struct sk_buff *skb, int nhoff)
+INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff)
{
- const struct iphdr *iph = ip_hdr(skb);
+ const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+ const struct iphdr *iph = (struct iphdr *)(skb->data + offset);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
- if (uh->check) {
- skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ /* do fraglist only if there is no outer UDP encap (or we already processed it) */
+ if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) {
+ uh->len = htons(skb->len - nhoff);
+
+ skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ __skb_incr_checksum_unnecessary(skb);
+
+ return 0;
+ }
+
+ if (uh->check)
uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr,
iph->daddr, 0);
- } else {
- skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
- }
return udp_gro_complete(skb, nhoff, udp4_lib_lookup_skb);
}
-static const struct net_offload udpv4_offload = {
- .callbacks = {
- .gso_segment = udp4_ufo_fragment,
- .gro_receive = udp4_gro_receive,
- .gro_complete = udp4_gro_complete,
- },
-};
-
int __init udpv4_offload_init(void)
{
- return inet_add_offload(&udpv4_offload, IPPROTO_UDP);
+ net_hotdata.udpv4_offload = (struct net_offload) {
+ .callbacks = {
+ .gso_segment = udp4_ufo_fragment,
+ .gro_receive = udp4_gro_receive,
+ .gro_complete = udp4_gro_complete,
+ },
+ };
+
+ return inet_add_offload(&net_hotdata.udpv4_offload, IPPROTO_UDP);
}
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel_core.c
index 6539ff15e9a3..b1f667c52cb2 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -1,13 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/socket.h>
-#include <linux/udp.h>
-#include <linux/types.h>
#include <linux/kernel.h>
#include <net/dst_metadata.h>
-#include <net/net_namespace.h>
+#include <net/flow.h>
#include <net/udp.h>
#include <net/udp_tunnel.h>
+#include <net/inet_dscp.h>
int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
struct socket **sockp)
@@ -20,10 +20,16 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
if (err < 0)
goto error;
+ if (cfg->bind_ifindex) {
+ err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true);
+ if (err < 0)
+ goto error;
+ }
+
udp_addr.sin_family = AF_INET;
udp_addr.sin_addr = cfg->local_ip;
udp_addr.sin_port = cfg->local_udp_port;
- err = kernel_bind(sock, (struct sockaddr *)&udp_addr,
+ err = kernel_bind(sock, (struct sockaddr_unsized *)&udp_addr,
sizeof(udp_addr));
if (err < 0)
goto error;
@@ -32,7 +38,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
udp_addr.sin_family = AF_INET;
udp_addr.sin_addr = cfg->peer_ip;
udp_addr.sin_port = cfg->peer_udp_port;
- err = kernel_connect(sock, (struct sockaddr *)&udp_addr,
+ err = kernel_connect(sock, (struct sockaddr_unsized *)&udp_addr,
sizeof(udp_addr), 0);
if (err < 0)
goto error;
@@ -53,13 +59,22 @@ error:
}
EXPORT_SYMBOL(udp_sock_create4);
+/*
+ * Return true when the socket's receive address is the "any" address:
+ * sk_v6_rcv_saddr is tested with ipv6_addr_any() when IPv6 is enabled,
+ * otherwise sk_rcv_saddr is tested for zero.
+ */
+static bool sk_saddr_any(struct sock *sk)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ return ipv6_addr_any(&sk->sk_v6_rcv_saddr);
+#else
+ return !sk->sk_rcv_saddr;
+#endif
+}
+
void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
struct udp_tunnel_sock_cfg *cfg)
{
struct sock *sk = sock->sk;
/* Disable multicast loopback */
- inet_sk(sk)->mc_loop = 0;
+ inet_clear_bit(MC_LOOP, sk);
/* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */
inet_inc_convert_csum(sk);
@@ -68,11 +83,19 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
udp_sk(sk)->encap_type = cfg->encap_type;
udp_sk(sk)->encap_rcv = cfg->encap_rcv;
+ udp_sk(sk)->encap_err_rcv = cfg->encap_err_rcv;
+ udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup;
udp_sk(sk)->encap_destroy = cfg->encap_destroy;
udp_sk(sk)->gro_receive = cfg->gro_receive;
udp_sk(sk)->gro_complete = cfg->gro_complete;
- udp_tunnel_encap_enable(sock);
+ udp_tunnel_encap_enable(sk);
+
+ udp_tunnel_update_gro_rcv(sk, true);
+
+ if (!sk->sk_dport && !sk->sk_bound_dev_if && sk_saddr_any(sk) &&
+ sk->sk_kern_sock)
+ udp_tunnel_update_gro_lookup(net, sk, true);
}
EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
@@ -82,15 +105,11 @@ void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock,
struct sock *sk = sock->sk;
struct udp_tunnel_info ti;
- if (!dev->netdev_ops->ndo_udp_tunnel_add ||
- !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
- return;
-
ti.type = type;
ti.sa_family = sk->sk_family;
ti.port = inet_sk(sk)->inet_sport;
- dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti);
+ udp_tunnel_nic_add_port(dev, &ti);
}
EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port);
@@ -100,15 +119,11 @@ void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock,
struct sock *sk = sock->sk;
struct udp_tunnel_info ti;
- if (!dev->netdev_ops->ndo_udp_tunnel_del ||
- !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
- return;
-
ti.type = type;
ti.sa_family = sk->sk_family;
ti.port = inet_sk(sk)->inet_sport;
- dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti);
+ udp_tunnel_nic_del_port(dev, &ti);
}
EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port);
@@ -120,19 +135,17 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type)
struct udp_tunnel_info ti;
struct net_device *dev;
+ ASSERT_RTNL();
+
ti.type = type;
ti.sa_family = sk->sk_family;
ti.port = inet_sk(sk)->inet_sport;
- rcu_read_lock();
- for_each_netdev_rcu(net, dev) {
- if (!dev->netdev_ops->ndo_udp_tunnel_add)
- continue;
- if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
- continue;
- dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti);
+ for_each_netdev(net, dev) {
+ udp_tunnel_nic_lock(dev);
+ udp_tunnel_nic_add_port(dev, &ti);
+ udp_tunnel_nic_unlock(dev);
}
- rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port);
@@ -144,26 +157,24 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type)
struct udp_tunnel_info ti;
struct net_device *dev;
+ ASSERT_RTNL();
+
ti.type = type;
ti.sa_family = sk->sk_family;
ti.port = inet_sk(sk)->inet_sport;
- rcu_read_lock();
- for_each_netdev_rcu(net, dev) {
- if (!dev->netdev_ops->ndo_udp_tunnel_del)
- continue;
- if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT))
- continue;
- dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti);
+ for_each_netdev(net, dev) {
+ udp_tunnel_nic_lock(dev);
+ udp_tunnel_nic_del_port(dev, &ti);
+ udp_tunnel_nic_unlock(dev);
}
- rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port);
void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl,
__be16 df, __be16 src_port, __be16 dst_port,
- bool xnet, bool nocheck)
+ bool xnet, bool nocheck, u16 ipcb_flags)
{
struct udphdr *uh;
@@ -179,20 +190,23 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb
udp_set_csum(nocheck, skb, src, dst, skb->len);
- iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet);
+ iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet,
+ ipcb_flags);
}
EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);
void udp_tunnel_sock_release(struct socket *sock)
{
rcu_assign_sk_user_data(sock->sk, NULL);
+ synchronize_rcu();
kernel_sock_shutdown(sock, SHUT_RDWR);
sock_release(sock);
}
EXPORT_SYMBOL_GPL(udp_tunnel_sock_release);
struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
- __be16 flags, __be64 tunnel_id, int md_size)
+ const unsigned long *flags,
+ __be64 tunnel_id, int md_size)
{
struct metadata_dst *tun_dst;
struct ip_tunnel_info *info;
@@ -208,9 +222,59 @@ struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family,
info->key.tp_src = udp_hdr(skb)->source;
info->key.tp_dst = udp_hdr(skb)->dest;
if (udp_hdr(skb)->check)
- info->key.tun_flags |= TUNNEL_CSUM;
+ __set_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags);
return tun_dst;
}
EXPORT_SYMBOL_GPL(udp_tun_rx_dst);
+/*
+ * Look up the IPv4 output route for a UDP tunnel packet.
+ *
+ * With CONFIG_DST_CACHE and a non-NULL @dst_cache, a cached route is returned
+ * first if dst_cache_get_ip4() finds one (@saddr is handed to that helper;
+ * presumably it fills it in - confirm). Otherwise a flow is built from
+ * @skb->mark, @oif, @key, @sport, @dport and @tos and resolved with
+ * ip_route_output_key().
+ *
+ * Returns the route, ERR_PTR(-ENETUNREACH) when the lookup fails, or
+ * ERR_PTR(-ELOOP) when the route's output device is @dev itself. On a
+ * successful lookup the route is stored in @dst_cache (if enabled) and the
+ * selected source address is written to *@saddr.
+ */
+struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
+ struct net_device *dev,
+ struct net *net, int oif,
+ __be32 *saddr,
+ const struct ip_tunnel_key *key,
+ __be16 sport, __be16 dport, u8 tos,
+ struct dst_cache *dst_cache)
+{
+ struct rtable *rt = NULL;
+ struct flowi4 fl4;
+
+#ifdef CONFIG_DST_CACHE
+ if (dst_cache) {
+ rt = dst_cache_get_ip4(dst_cache, saddr);
+ if (rt)
+ return rt;
+ }
+#endif
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_proto = IPPROTO_UDP;
+ fl4.flowi4_oif = oif;
+ fl4.daddr = key->u.ipv4.dst;
+ fl4.saddr = key->u.ipv4.src;
+ fl4.fl4_dport = dport;
+ fl4.fl4_sport = sport;
+ fl4.flowi4_dscp = inet_dsfield_to_dscp(tos);
+ fl4.flowi4_flags = key->flow_flags;
+
+ rt = ip_route_output_key(net, &fl4);
+ if (IS_ERR(rt)) {
+ netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
+ return ERR_PTR(-ENETUNREACH);
+ }
+ if (rt->dst.dev == dev) { /* is this necessary? */
+ netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
+ ip_rt_put(rt);
+ return ERR_PTR(-ELOOP);
+ }
+#ifdef CONFIG_DST_CACHE
+ if (dst_cache)
+ dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
+#endif
+ *saddr = fl4.saddr;
+ return rt;
+}
+EXPORT_SYMBOL_GPL(udp_tunnel_dst_lookup);
+
+MODULE_DESCRIPTION("IPv4 Foo over UDP tunnel driver");
MODULE_LICENSE("GPL");
diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
new file mode 100644
index 000000000000..944b3cf25468
--- /dev/null
+++ b/net/ipv4/udp_tunnel_nic.c
@@ -0,0 +1,1010 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (c) 2020 Facebook Inc.
+
+#include <linux/ethtool_netlink.h>
+#include <linux/netdevice.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <net/udp_tunnel.h>
+#include <net/vxlan.h>
+
+/*
+ * State bits kept in udp_tunnel_nic_table_entry::flags.
+ * ADD and DEL mark an entry as queued for a device operation, OP_FAIL is set
+ * when the last device operation on the entry returned an error, and FROZEN
+ * is set by udp_tunnel_nic_entry_freeze_used() on entries that are not free.
+ */
+enum udp_tunnel_nic_table_entry_flags {
+ UDP_TUNNEL_NIC_ENTRY_ADD = BIT(0),
+ UDP_TUNNEL_NIC_ENTRY_DEL = BIT(1),
+ UDP_TUNNEL_NIC_ENTRY_OP_FAIL = BIT(2),
+ UDP_TUNNEL_NIC_ENTRY_FROZEN = BIT(3),
+};
+
+/*
+ * One slot of a device port table.
+ * @port is stored big-endian, @flags holds
+ * enum udp_tunnel_nic_table_entry_flags bits, @use_cnt of zero (with no
+ * flags set) means the slot is free, and @hw_priv is copied into
+ * udp_tunnel_info::hw_priv by udp_tunnel_nic_ti_from_entry().
+ */
+struct udp_tunnel_nic_table_entry {
+ __be16 port;
+ u8 type;
+ u8 flags;
+ u16 use_cnt;
+#define UDP_TUNNEL_NIC_USE_CNT_MAX U16_MAX
+ u8 hw_priv;
+};
+
+/**
+ * struct udp_tunnel_nic - UDP tunnel port offload state
+ * @work: async work for talking to hardware from process context
+ * @dev: netdev pointer
+ * @lock: protects all fields
+ * @need_sync: at least one port state changed
+ * @need_replay: space was freed, we need a replay of all ports
+ * @work_pending: @work is currently scheduled
+ * @n_tables: number of tables under @entries
+ * @missed: bitmap of tables which overflowed
+ * @entries: table of tables of ports currently offloaded
+ */
+struct udp_tunnel_nic {
+ struct work_struct work;
+
+ struct net_device *dev;
+
+ struct mutex lock;
+
+ u8 need_sync:1;
+ u8 need_replay:1;
+ u8 work_pending:1;
+
+ unsigned int n_tables;
+ unsigned long missed;
+ struct udp_tunnel_nic_table_entry *entries[] __counted_by(n_tables);
+};
+
+/* We ensure all work structs are done using driver state, but they may still
+ * be running code. We need a workqueue we can flush before the module gets
+ * removed.
+ */
+static struct workqueue_struct *udp_tunnel_nic_workqueue;
+
+/*
+ * Map a UDP_TUNNEL_TYPE_* value to a printable name; any value not listed
+ * here yields "unknown".
+ */
+static const char *udp_tunnel_nic_tunnel_type_name(unsigned int type)
+{
+ switch (type) {
+ case UDP_TUNNEL_TYPE_VXLAN:
+ return "vxlan";
+ case UDP_TUNNEL_TYPE_GENEVE:
+ return "geneve";
+ case UDP_TUNNEL_TYPE_VXLAN_GPE:
+ return "vxlan-gpe";
+ default:
+ return "unknown";
+ }
+}
+
+/* An entry is free when it has no users and no flag bits set at all */
+static bool
+udp_tunnel_nic_entry_is_free(struct udp_tunnel_nic_table_entry *entry)
+{
+ return entry->use_cnt == 0 && !entry->flags;
+}
+
+/*
+ * An entry is present when it has users and no flag other than FROZEN is
+ * set, i.e. no ADD/DEL is queued and no operation failure is recorded.
+ */
+static bool
+udp_tunnel_nic_entry_is_present(struct udp_tunnel_nic_table_entry *entry)
+{
+ return entry->use_cnt && !(entry->flags & ~UDP_TUNNEL_NIC_ENTRY_FROZEN);
+}
+
+/* Test whether the FROZEN flag is set on the entry */
+static bool
+udp_tunnel_nic_entry_is_frozen(struct udp_tunnel_nic_table_entry *entry)
+{
+ return entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN;
+}
+
+/* Set the FROZEN flag, but only on entries that are not free */
+static void
+udp_tunnel_nic_entry_freeze_used(struct udp_tunnel_nic_table_entry *entry)
+{
+ if (!udp_tunnel_nic_entry_is_free(entry))
+ entry->flags |= UDP_TUNNEL_NIC_ENTRY_FROZEN;
+}
+
+/* Clear the FROZEN flag; all other flag bits are left as they are */
+static void
+udp_tunnel_nic_entry_unfreeze(struct udp_tunnel_nic_table_entry *entry)
+{
+ entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_FROZEN;
+}
+
+/* An entry is queued when an ADD or a DEL operation is still flagged on it */
+static bool
+udp_tunnel_nic_entry_is_queued(struct udp_tunnel_nic_table_entry *entry)
+{
+ return entry->flags & (UDP_TUNNEL_NIC_ENTRY_ADD |
+ UDP_TUNNEL_NIC_ENTRY_DEL);
+}
+
+/*
+ * Set @flag on @entry and mark the whole @utn as needing a device sync.
+ * Existing flag bits are kept; @flag is OR-ed in.
+ */
+static void
+udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn,
+ struct udp_tunnel_nic_table_entry *entry,
+ unsigned int flag)
+{
+ entry->flags |= flag;
+ utn->need_sync = 1;
+}
+
+/*
+ * Build a udp_tunnel_info from a table entry: @ti is zeroed first, then
+ * port, type and hw_priv are copied over.
+ */
+static void
+udp_tunnel_nic_ti_from_entry(struct udp_tunnel_nic_table_entry *entry,
+ struct udp_tunnel_info *ti)
+{
+ memset(ti, 0, sizeof(*ti));
+ ti->port = entry->port;
+ ti->type = entry->type;
+ ti->hw_priv = entry->hw_priv;
+}
+
+/*
+ * Return true when every entry of every table is free. Table sizes come from
+ * dev->udp_tunnel_nic_info, the entries themselves from @utn.
+ */
+static bool
+udp_tunnel_nic_is_empty(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+ const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ unsigned int i, j;
+
+ for (i = 0; i < utn->n_tables; i++)
+ for (j = 0; j < info->tables[i].n_entries; j++)
+ if (!udp_tunnel_nic_entry_is_free(&utn->entries[i][j]))
+ return false;
+ return true;
+}
+
+/*
+ * Return true when at least one table whose bit is set in utn->missed now
+ * has a free entry. Returns false right away when no bit is set in missed.
+ */
+static bool
+udp_tunnel_nic_should_replay(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+ const struct udp_tunnel_nic_table_info *table;
+ unsigned int i, j;
+
+ if (!utn->missed)
+ return false;
+
+ for (i = 0; i < utn->n_tables; i++) {
+ table = &dev->udp_tunnel_nic_info->tables[i];
+ /* Only tables recorded in the missed bitmap are of interest */
+ if (!test_bit(i, &utn->missed))
+ continue;
+
+ for (j = 0; j < table->n_entries; j++)
+ if (udp_tunnel_nic_entry_is_free(&utn->entries[i][j]))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Fill @ti from the entry at [@table][@idx] of the device's port tables.
+ * @ti is left untouched when the entry has a use count of zero.
+ */
+static void
+__udp_tunnel_nic_get_port(struct net_device *dev, unsigned int table,
+ unsigned int idx, struct udp_tunnel_info *ti)
+{
+ struct udp_tunnel_nic_table_entry *entry;
+ struct udp_tunnel_nic *utn;
+
+ utn = dev->udp_tunnel_nic;
+ entry = &utn->entries[table][idx];
+
+ if (entry->use_cnt)
+ udp_tunnel_nic_ti_from_entry(entry, ti);
+}
+
+/* Store @priv in the hw_priv field of the entry at [@table][@idx] */
+static void
+__udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table,
+ unsigned int idx, u8 priv)
+{
+ dev->udp_tunnel_nic->entries[table][idx].hw_priv = priv;
+}
+
+/*
+ * Update an entry's flags after a device operation finished with @err.
+ *
+ * A queued ADD is cleared on success, or on -EEXIST if the previous operation
+ * on this entry had failed. A queued DEL is cleared on success, or on -ENOENT
+ * if the previous operation had failed. OP_FAIL is then cleared when @err is
+ * zero and set for any non-zero @err.
+ */
+static void
+udp_tunnel_nic_entry_update_done(struct udp_tunnel_nic_table_entry *entry,
+ int err)
+{
+ /* "dodgy": the previous operation on this entry ended in an error */
+ bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL;
+
+ WARN_ON_ONCE(entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD &&
+ entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL);
+
+ if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD &&
+ (!err || (err == -EEXIST && dodgy)))
+ entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_ADD;
+
+ if (entry->flags & UDP_TUNNEL_NIC_ENTRY_DEL &&
+ (!err || (err == -ENOENT && dodgy)))
+ entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_DEL;
+
+ if (!err)
+ entry->flags &= ~UDP_TUNNEL_NIC_ENTRY_OP_FAIL;
+ else
+ entry->flags |= UDP_TUNNEL_NIC_ENTRY_OP_FAIL;
+}
+
+/*
+ * Push one queued entry to the device through the per-port callbacks.
+ *
+ * Entries with neither ADD nor DEL flagged are skipped. set_port() is called
+ * when ADD is flagged, unset_port() otherwise; the result is folded into the
+ * entry flags and a warning is logged on error.
+ */
+static void
+udp_tunnel_nic_device_sync_one(struct net_device *dev,
+ struct udp_tunnel_nic *utn,
+ unsigned int table, unsigned int idx)
+{
+ struct udp_tunnel_nic_table_entry *entry;
+ struct udp_tunnel_info ti;
+ int err;
+
+ entry = &utn->entries[table][idx];
+ if (!udp_tunnel_nic_entry_is_queued(entry))
+ return;
+
+ udp_tunnel_nic_ti_from_entry(entry, &ti);
+ if (entry->flags & UDP_TUNNEL_NIC_ENTRY_ADD)
+ err = dev->udp_tunnel_nic_info->set_port(dev, table, idx, &ti);
+ else
+ err = dev->udp_tunnel_nic_info->unset_port(dev, table, idx,
+ &ti);
+ udp_tunnel_nic_entry_update_done(entry, err);
+
+ if (err)
+ netdev_warn(dev,
+ "UDP tunnel port sync failed port %d type %s: %d\n",
+ be16_to_cpu(entry->port),
+ udp_tunnel_nic_tunnel_type_name(entry->type),
+ err);
+}
+
+/* Sync the device entry by entry, visiting every slot of every table */
+static void
+udp_tunnel_nic_device_sync_by_port(struct net_device *dev,
+ struct udp_tunnel_nic *utn)
+{
+ const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+ unsigned int i, j;
+
+ for (i = 0; i < utn->n_tables; i++)
+ for (j = 0; j < info->tables[i].n_entries; j++)
+ udp_tunnel_nic_device_sync_one(dev, utn, i, j);
+}
+
+/* Sync the device using the driver's sync_table() callback.
+ *
+ * sync_table() is called once for each table that contains at least one
+ * queued entry; its return value is then applied to every queued entry of
+ * that table. Tables without queued entries are skipped.
+ */
+static void
+udp_tunnel_nic_device_sync_by_table(struct net_device *dev,
+				    struct udp_tunnel_nic *utn)
+{
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	unsigned int i, j;
+	int err;
+
+	for (i = 0; i < utn->n_tables; i++) {
+		/* Find something that needs sync in this table */
+		for (j = 0; j < info->tables[i].n_entries; j++)
+			if (udp_tunnel_nic_entry_is_queued(&utn->entries[i][j]))
+				break;
+		/* Loop ran to completion: nothing queued in this table */
+		if (j == info->tables[i].n_entries)
+			continue;
+
+		err = info->sync_table(dev, i);
+		if (err)
+			netdev_warn(dev, "UDP tunnel port sync failed for table %d: %d\n",
+				    i, err);
+
+		/* The single table-wide result applies to all queued entries */
+		for (j = 0; j < info->tables[i].n_entries; j++) {
+			struct udp_tunnel_nic_table_entry *entry;
+
+			entry = &utn->entries[i][j];
+			if (udp_tunnel_nic_entry_is_queued(entry))
+				udp_tunnel_nic_entry_update_done(entry, err);
+		}
+	}
+}
+
+/* Perform the device sync right away, if one is needed.
+ *
+ * Uses the table-based path when the driver provides sync_table() and the
+ * per-port path otherwise. Afterwards need_sync is cleared and need_replay is
+ * re-evaluated.
+ */
+static void
+__udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+	if (!utn->need_sync)
+		return;
+
+	if (dev->udp_tunnel_nic_info->sync_table)
+		udp_tunnel_nic_device_sync_by_table(dev, utn);
+	else
+		udp_tunnel_nic_device_sync_by_port(dev, utn);
+
+	utn->need_sync = 0;
+	/* Can't replay directly here, in case we come from the tunnel driver's
+	 * notification - trying to replay may deadlock inside tunnel driver.
+	 */
+	utn->need_replay = udp_tunnel_nic_should_replay(dev, utn);
+}
+
+/* Defer the device sync to the workqueue when there is something to sync. */
+static void
+udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+	if (utn->need_sync) {
+		queue_work(udp_tunnel_nic_workqueue, &utn->work);
+		utn->work_pending = 1;
+	}
+}
+
+/* True when @table's tunnel_types mask includes @ti's tunnel type. */
+static bool
+udp_tunnel_nic_table_is_capable(const struct udp_tunnel_nic_table_info *table,
+				struct udp_tunnel_info *ti)
+{
+	return (table->tunnel_types & ti->type) != 0;
+}
+
+/* True when at least one of the device's tables can hold @ti.
+ * Devices flagged UDP_TUNNEL_NIC_INFO_IPV4_ONLY reject anything but AF_INET.
+ */
+static bool
+udp_tunnel_nic_is_capable(struct net_device *dev, struct udp_tunnel_nic *utn,
+			  struct udp_tunnel_info *ti)
+{
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	bool ipv4_only = info->flags & UDP_TUNNEL_NIC_INFO_IPV4_ONLY;
+	unsigned int table;
+
+	if (ipv4_only && ti->sa_family != AF_INET)
+		return false;
+
+	for (table = 0; table < utn->n_tables; table++) {
+		if (udp_tunnel_nic_table_is_capable(&info->tables[table], ti))
+			return true;
+	}
+
+	return false;
+}
+
+/* Check whether @ti's port is already tracked with a different tunnel type.
+ *
+ * Returns true on the first non-free entry whose port equals @ti's port but
+ * whose type differs. As a side effect the bit of the table holding that
+ * entry is set in utn->missed.
+ */
+static bool
+udp_tunnel_nic_has_collision(struct net_device *dev, struct udp_tunnel_nic *utn,
+			     struct udp_tunnel_info *ti)
+{
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	struct udp_tunnel_nic_table_entry *entry;
+	unsigned int i, j;
+
+	for (i = 0; i < utn->n_tables; i++)
+		for (j = 0; j < info->tables[i].n_entries; j++) {
+			entry = &utn->entries[i][j];
+
+			if (!udp_tunnel_nic_entry_is_free(entry) &&
+			    entry->port == ti->port &&
+			    entry->type != ti->type) {
+				__set_bit(i, &utn->missed);
+				return true;
+			}
+		}
+	return false;
+}
+
+/* Adjust the use count of entry [@table][@idx] by @use_cnt_adj and, when
+ * needed, queue the matching device operation: ADD for a non-negative
+ * adjustment, DEL for a negative one.
+ */
+static void
+udp_tunnel_nic_entry_adj(struct udp_tunnel_nic *utn,
+			 unsigned int table, unsigned int idx, int use_cnt_adj)
+{
+	struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx];
+	bool dodgy = entry->flags & UDP_TUNNEL_NIC_ENTRY_OP_FAIL;
+	unsigned int from, to;
+
+	WARN_ON(entry->use_cnt + (u32)use_cnt_adj > U16_MAX);
+
+	/* If not going from used to unused or vice versa - all done.
+	 * For dodgy entries make sure we try to sync again (queue the entry).
+	 */
+	entry->use_cnt += use_cnt_adj;
+	/* Compares "is zero now" against "was zero before the adjustment" */
+	if (!dodgy && !entry->use_cnt == !(entry->use_cnt - use_cnt_adj))
+		return;
+
+	/* Cancel the op before it was sent to the device, if possible,
+	 * otherwise we'd need to take special care to issue commands
+	 * in the same order the ports arrived.
+	 */
+	if (use_cnt_adj < 0) {
+		from = UDP_TUNNEL_NIC_ENTRY_ADD;
+		to = UDP_TUNNEL_NIC_ENTRY_DEL;
+	} else {
+		from = UDP_TUNNEL_NIC_ENTRY_DEL;
+		to = UDP_TUNNEL_NIC_ENTRY_ADD;
+	}
+
+	/* The opposite op was still pending: drop it, and unless the entry is
+	 * dodgy there is nothing left to send.
+	 */
+	if (entry->flags & from) {
+		entry->flags &= ~from;
+		if (!dodgy)
+			return;
+	}
+
+	udp_tunnel_nic_entry_queue(utn, entry, to);
+}
+
+/* Adjust entry [@table][@idx] by @use_cnt_adj if it tracks exactly @ti's port
+ * and type. Returns false when the entry is free or does not match. A frozen
+ * entry counts as a match but its use count is left untouched.
+ */
+static bool
+udp_tunnel_nic_entry_try_adj(struct udp_tunnel_nic *utn,
+			     unsigned int table, unsigned int idx,
+			     struct udp_tunnel_info *ti, int use_cnt_adj)
+{
+	struct udp_tunnel_nic_table_entry *entry = &utn->entries[table][idx];
+	bool matches;
+
+	matches = !udp_tunnel_nic_entry_is_free(entry) &&
+		  entry->port == ti->port &&
+		  entry->type == ti->type;
+	if (!matches)
+		return false;
+
+	if (!udp_tunnel_nic_entry_is_frozen(entry))
+		udp_tunnel_nic_entry_adj(utn, table, idx, use_cnt_adj);
+
+	return true;
+}
+
+/* Look for an entry that already tracks @ti in any capable table and apply
+ * @use_cnt_adj to it rather than creating a new entry. Returns true if such
+ * an entry was found. For a negative adjustment the entry may end up unused,
+ * in which case it gets queued for removal.
+ */
+static bool
+udp_tunnel_nic_try_existing(struct net_device *dev, struct udp_tunnel_nic *utn,
+			    struct udp_tunnel_info *ti, int use_cnt_adj)
+{
+	unsigned int t, e;
+
+	for (t = 0; t < utn->n_tables; t++) {
+		const struct udp_tunnel_nic_table_info *table;
+
+		table = &dev->udp_tunnel_nic_info->tables[t];
+		if (udp_tunnel_nic_table_is_capable(table, ti)) {
+			for (e = 0; e < table->n_entries; e++) {
+				if (udp_tunnel_nic_entry_try_adj(utn, t, e, ti,
+								 use_cnt_adj))
+					return true;
+			}
+		}
+	}
+
+	return false;
+}
+
+/* Apply a +1 use-count adjustment to an entry already tracking @ti.
+ * Returns true if such an entry exists.
+ */
+static bool
+udp_tunnel_nic_add_existing(struct net_device *dev, struct udp_tunnel_nic *utn,
+			    struct udp_tunnel_info *ti)
+{
+	const int use_cnt_adj = 1;
+
+	return udp_tunnel_nic_try_existing(dev, utn, ti, use_cnt_adj);
+}
+
+/* Apply a -1 use-count adjustment to an entry already tracking @ti.
+ * Returns true if such an entry exists.
+ */
+static bool
+udp_tunnel_nic_del_existing(struct net_device *dev, struct udp_tunnel_nic *utn,
+			    struct udp_tunnel_info *ti)
+{
+	const int use_cnt_adj = -1;
+
+	return udp_tunnel_nic_try_existing(dev, utn, ti, use_cnt_adj);
+}
+
+/* Claim the first free entry in a table capable of @ti's tunnel type, fill in
+ * its port and type with a use count of 1 and queue it for ADD.
+ *
+ * Returns true when an entry was claimed. Every capable table that turned out
+ * to have no free entry is flagged in utn->missed.
+ */
+static bool
+udp_tunnel_nic_add_new(struct net_device *dev, struct udp_tunnel_nic *utn,
+		       struct udp_tunnel_info *ti)
+{
+	const struct udp_tunnel_nic_table_info *table;
+	unsigned int i, j;
+
+	for (i = 0; i < utn->n_tables; i++) {
+		table = &dev->udp_tunnel_nic_info->tables[i];
+		if (!udp_tunnel_nic_table_is_capable(table, ti))
+			continue;
+
+		for (j = 0; j < table->n_entries; j++) {
+			struct udp_tunnel_nic_table_entry *entry;
+
+			entry = &utn->entries[i][j];
+			if (!udp_tunnel_nic_entry_is_free(entry))
+				continue;
+
+			entry->port = ti->port;
+			entry->type = ti->type;
+			entry->use_cnt = 1;
+			udp_tunnel_nic_entry_queue(utn, entry,
+						   UDP_TUNNEL_NIC_ENTRY_ADD);
+			return true;
+		}
+
+		/* A different table may still fit this port in, but there
+		 * are no devices currently which have multiple tables accepting
+		 * the same tunnel type, and false positives are okay.
+		 */
+		__set_bit(i, &utn->missed);
+	}
+
+	return false;
+}
+
+/* add_port op: start tracking the port described by @ti on @dev.
+ *
+ * Nothing is done when @dev has no udp_tunnel_nic state, when it is not
+ * running and the driver set UDP_TUNNEL_NIC_INFO_OPEN_ONLY, when the port is
+ * the IANA VXLAN port on a UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN device, when
+ * no table can hold @ti, or when the port collides with a tracked entry of a
+ * different type. Otherwise an existing entry is reused or a new one is
+ * claimed, and a device sync is scheduled.
+ */
+static void
+__udp_tunnel_nic_add_port(struct net_device *dev, struct udp_tunnel_info *ti)
+{
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	struct udp_tunnel_nic *utn;
+
+	utn = dev->udp_tunnel_nic;
+	if (!utn)
+		return;
+	if (!netif_running(dev) && info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)
+		return;
+	/* The IANA VXLAN port is never added to the tables on such devices;
+	 * only warn if something other than VXLAN wants to use it.
+	 */
+	if (info->flags & UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN &&
+	    ti->port == htons(IANA_VXLAN_UDP_PORT)) {
+		if (ti->type != UDP_TUNNEL_TYPE_VXLAN)
+			netdev_warn(dev, "device assumes port 4789 will be used by vxlan tunnels\n");
+		return;
+	}
+
+	if (!udp_tunnel_nic_is_capable(dev, utn, ti))
+		return;
+
+	/* It may happen that a tunnel of one type is removed and different
+	 * tunnel type tries to reuse its port before the device was informed.
+	 * Rely on utn->missed to re-add this port later.
+	 */
+	if (udp_tunnel_nic_has_collision(dev, utn, ti))
+		return;
+
+	if (!udp_tunnel_nic_add_existing(dev, utn, ti))
+		udp_tunnel_nic_add_new(dev, utn, ti);
+
+	udp_tunnel_nic_device_sync(dev, utn);
+}
+
+/* del_port op: stop tracking one use of the port described by @ti on @dev.
+ *
+ * Does nothing when @dev has no udp_tunnel_nic state or no table can hold
+ * @ti. Otherwise the matching entry (if any) has its use count decremented
+ * and a device sync is scheduled.
+ */
+static void
+__udp_tunnel_nic_del_port(struct net_device *dev, struct udp_tunnel_info *ti)
+{
+	struct udp_tunnel_nic *utn;
+
+	utn = dev->udp_tunnel_nic;
+	if (!utn)
+		return;
+
+	if (!udp_tunnel_nic_is_capable(dev, utn, ti))
+		return;
+
+	/* Return value ignored: a port that is not tracked needs no action */
+	udp_tunnel_nic_del_existing(dev, utn, ti);
+
+	udp_tunnel_nic_device_sync(dev, utn);
+}
+
+/* reset_ntf op: re-program all ports that are in use.
+ *
+ * Under utn->lock, pending DEL and OP_FAIL state is dropped from every entry,
+ * every entry with a non-zero use count is queued for ADD, and the device is
+ * synced synchronously (not via the workqueue).
+ */
+static void __udp_tunnel_nic_reset_ntf(struct net_device *dev)
+{
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	struct udp_tunnel_nic *utn;
+	unsigned int i, j;
+
+	utn = dev->udp_tunnel_nic;
+	if (!utn)
+		return;
+
+	mutex_lock(&utn->lock);
+
+	utn->need_sync = false;
+	for (i = 0; i < utn->n_tables; i++)
+		for (j = 0; j < info->tables[i].n_entries; j++) {
+			struct udp_tunnel_nic_table_entry *entry;
+
+			entry = &utn->entries[i][j];
+
+			entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL |
+					  UDP_TUNNEL_NIC_ENTRY_OP_FAIL);
+			/* We don't release utn lock across ops */
+			WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN);
+			/* Unused entries have nothing to re-add */
+			if (!entry->use_cnt)
+				continue;
+
+			udp_tunnel_nic_entry_queue(utn, entry,
+						   UDP_TUNNEL_NIC_ENTRY_ADD);
+		}
+
+	__udp_tunnel_nic_device_sync(dev, utn);
+
+	mutex_unlock(&utn->lock);
+}
+
+/* dump_size op: netlink space needed to dump the present entries of @table.
+ * Returns 0 when @dev has no udp_tunnel_nic state.
+ */
+static size_t
+__udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table)
+{
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	struct udp_tunnel_nic *utn = dev->udp_tunnel_nic;
+	size_t per_entry, total = 0;
+	unsigned int idx;
+
+	if (!utn)
+		return 0;
+
+	/* Every dumped entry takes the same amount of space */
+	per_entry = nla_total_size(0) +			/* _TABLE_ENTRY */
+		    nla_total_size(sizeof(__be16)) +	/* _ENTRY_PORT */
+		    nla_total_size(sizeof(u32));	/* _ENTRY_TYPE */
+
+	for (idx = 0; idx < info->tables[table].n_entries; idx++) {
+		if (udp_tunnel_nic_entry_is_present(&utn->entries[table][idx]))
+			total += per_entry;
+	}
+
+	return total;
+}
+
+/* dump_write op: emit the present entries of @table into @skb.
+ *
+ * Each entry becomes one nested ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY attribute
+ * carrying the port and ilog2() of the entry's type.
+ *
+ * Returns 0 on success (also when @dev has no udp_tunnel_nic state) and
+ * -EMSGSIZE when an attribute could not be added.
+ */
+static int
+__udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table,
+			    struct sk_buff *skb)
+{
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	struct udp_tunnel_nic *utn;
+	struct nlattr *nest;
+	unsigned int j;
+
+	utn = dev->udp_tunnel_nic;
+	if (!utn)
+		return 0;
+
+	for (j = 0; j < info->tables[table].n_entries; j++) {
+		if (!udp_tunnel_nic_entry_is_present(&utn->entries[table][j]))
+			continue;
+
+		nest = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY);
+		if (!nest)
+			return -EMSGSIZE;
+
+		if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT,
+				 utn->entries[table][j].port) ||
+		    nla_put_u32(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_TYPE,
+				ilog2(utn->entries[table][j].type)))
+			goto err_cancel;
+
+		nla_nest_end(skb, nest);
+	}
+
+	return 0;
+
+err_cancel:
+	/* Cancel the nest that was opened for the failing entry */
+	nla_nest_cancel(skb, nest);
+	return -EMSGSIZE;
+}
+
+/* assert_locked op: lockdep check that utn->lock is held, if @dev has state. */
+static void __udp_tunnel_nic_assert_locked(struct net_device *dev)
+{
+	struct udp_tunnel_nic *utn = dev->udp_tunnel_nic;
+
+	if (!utn)
+		return;
+
+	lockdep_assert_held(&utn->lock);
+}
+
+/* lock op: take utn->lock; a no-op when @dev has no udp_tunnel_nic state. */
+static void __udp_tunnel_nic_lock(struct net_device *dev)
+{
+	struct udp_tunnel_nic *utn = dev->udp_tunnel_nic;
+
+	if (!utn)
+		return;
+
+	mutex_lock(&utn->lock);
+}
+
+/* unlock op: release utn->lock; a no-op when @dev has no udp_tunnel_nic state. */
+static void __udp_tunnel_nic_unlock(struct net_device *dev)
+{
+	struct udp_tunnel_nic *utn = dev->udp_tunnel_nic;
+
+	if (!utn)
+		return;
+
+	mutex_unlock(&utn->lock);
+}
+
+/* Callback table that the module init code publishes through the
+ * udp_tunnel_nic_ops pointer.
+ */
+static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = {
+	.get_port	= __udp_tunnel_nic_get_port,
+	.set_port_priv	= __udp_tunnel_nic_set_port_priv,
+	.add_port	= __udp_tunnel_nic_add_port,
+	.del_port	= __udp_tunnel_nic_del_port,
+	.reset_ntf	= __udp_tunnel_nic_reset_ntf,
+	.dump_size	= __udp_tunnel_nic_dump_size,
+	.dump_write	= __udp_tunnel_nic_dump_write,
+	.assert_locked	= __udp_tunnel_nic_assert_locked,
+	.lock		= __udp_tunnel_nic_lock,
+	.unlock		= __udp_tunnel_nic_unlock,
+};
+
+/* Remove every tracked port: drop all use counts to zero, sync the device
+ * synchronously, then zero the entry tables and clear need_replay.
+ */
+static void
+udp_tunnel_nic_flush(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	unsigned int i, j;
+
+	for (i = 0; i < utn->n_tables; i++)
+		for (j = 0; j < info->tables[i].n_entries; j++) {
+			/* Adjustment that brings this entry's count to zero */
+			int adj_cnt = -utn->entries[i][j].use_cnt;
+
+			if (adj_cnt)
+				udp_tunnel_nic_entry_adj(utn, i, j, adj_cnt);
+		}
+
+	__udp_tunnel_nic_device_sync(dev, utn);
+
+	/* Wipe all remaining entry state, whatever the sync result was */
+	for (i = 0; i < utn->n_tables; i++)
+		memset(utn->entries[i], 0, array_size(info->tables[i].n_entries,
+						      sizeof(**utn->entries)));
+	WARN_ON(utn->need_sync);
+	utn->need_replay = 0;
+}
+
+/* Ask for the port list again via udp_tunnel_get_rx_info(): for @dev itself,
+ * or for every device on the sharing list when info->shared is set.
+ *
+ * missed and need_replay are cleared before the request; entries are frozen
+ * beforehand and unfrozen afterwards.
+ */
+static void
+udp_tunnel_nic_replay(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	struct udp_tunnel_nic_shared_node *node;
+	unsigned int i, j;
+
+	/* Freeze all the ports we are already tracking so that the replay
+	 * does not double up the refcount.
+	 */
+	for (i = 0; i < utn->n_tables; i++)
+		for (j = 0; j < info->tables[i].n_entries; j++)
+			udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]);
+	utn->missed = 0;
+	utn->need_replay = 0;
+
+	if (!info->shared) {
+		udp_tunnel_get_rx_info(dev);
+	} else {
+		list_for_each_entry(node, &info->shared->devices, list)
+			udp_tunnel_get_rx_info(node->dev);
+	}
+
+	for (i = 0; i < utn->n_tables; i++)
+		for (j = 0; j < info->tables[i].n_entries; j++)
+			udp_tunnel_nic_entry_unfreeze(&utn->entries[i][j]);
+}
+
+/* Work item queued by udp_tunnel_nic_device_sync().
+ *
+ * With rtnl_lock and then utn->lock held: clears work_pending, syncs the
+ * device and, if the sync left need_replay set, replays the port list.
+ */
+static void udp_tunnel_nic_device_sync_work(struct work_struct *work)
+{
+	struct udp_tunnel_nic *utn =
+		container_of(work, struct udp_tunnel_nic, work);
+
+	rtnl_lock();
+	mutex_lock(&utn->lock);
+
+	utn->work_pending = 0;
+	__udp_tunnel_nic_device_sync(utn->dev, utn);
+
+	if (utn->need_replay)
+		udp_tunnel_nic_replay(utn->dev, utn);
+
+	mutex_unlock(&utn->lock);
+	rtnl_unlock();
+}
+
+/* Allocate a zeroed udp_tunnel_nic with @n_tables entry arrays, each sized by
+ * the matching info->tables[i].n_entries, and initialise its work item and
+ * lock.
+ *
+ * Returns the new state, or NULL on allocation failure (anything allocated
+ * so far is freed).
+ */
+static struct udp_tunnel_nic *
+udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info,
+		     unsigned int n_tables)
+{
+	struct udp_tunnel_nic *utn;
+	unsigned int i;
+
+	utn = kzalloc(struct_size(utn, entries, n_tables), GFP_KERNEL);
+	if (!utn)
+		return NULL;
+	utn->n_tables = n_tables;
+	INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work);
+	mutex_init(&utn->lock);
+
+	for (i = 0; i < n_tables; i++) {
+		utn->entries[i] = kcalloc(info->tables[i].n_entries,
+					  sizeof(*utn->entries[i]), GFP_KERNEL);
+		if (!utn->entries[i])
+			goto err_free_prev_entries;
+	}
+
+	return utn;
+
+err_free_prev_entries:
+	/* i is the index that failed; free the arrays allocated before it */
+	while (i--)
+		kfree(utn->entries[i]);
+	kfree(utn);
+	return NULL;
+}
+
+/* Release every per-table entry array, then the udp_tunnel_nic itself. */
+static void udp_tunnel_nic_free(struct udp_tunnel_nic *utn)
+{
+	unsigned int table;
+
+	for (table = 0; table < utn->n_tables; table++) {
+		kfree(utn->entries[table]);
+	}
+
+	kfree(utn);
+}
+
+/* Set up UDP tunnel offload state for @dev (NETDEV_REGISTER handling).
+ *
+ * Validates the driver-provided info and counts the populated tables, then
+ * either attaches @dev to the state already published in info->shared or
+ * allocates a new one. A reference on @dev is taken. Unless the driver set
+ * UDP_TUNNEL_NIC_INFO_OPEN_ONLY, udp_tunnel_get_rx_info() is called right
+ * away.
+ *
+ * Returns 0 on success, -EINVAL for inconsistent driver info and -ENOMEM on
+ * allocation failure.
+ */
+static int udp_tunnel_nic_register(struct net_device *dev)
+{
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+	struct udp_tunnel_nic_shared_node *node = NULL;
+	struct udp_tunnel_nic *utn;
+	unsigned int n_tables, i;
+
+	BUILD_BUG_ON(sizeof(utn->missed) * BITS_PER_BYTE <
+		     UDP_TUNNEL_NIC_MAX_TABLES);
+	/* Expect use count of at most 2 (IPv4, IPv6) per device */
+	BUILD_BUG_ON(UDP_TUNNEL_NIC_USE_CNT_MAX <
+		     UDP_TUNNEL_NIC_MAX_SHARING_DEVICES * 2);
+
+	/* Check that the driver info is sane: set_port and unset_port come as
+	 * a pair, exactly one of set_port / sync_table is provided, and the
+	 * first table is not empty.
+	 */
+	if (WARN_ON(!info->set_port != !info->unset_port) ||
+	    WARN_ON(!info->set_port == !info->sync_table) ||
+	    WARN_ON(!info->tables[0].n_entries))
+		return -EINVAL;
+
+	if (WARN_ON(info->shared &&
+		    info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY))
+		return -EINVAL;
+
+	/* Count populated tables; a populated table must not follow an
+	 * empty one.
+	 */
+	n_tables = 1;
+	for (i = 1; i < UDP_TUNNEL_NIC_MAX_TABLES; i++) {
+		if (!info->tables[i].n_entries)
+			continue;
+
+		n_tables++;
+		if (WARN_ON(!info->tables[i - 1].n_entries))
+			return -EINVAL;
+	}
+
+	/* Create UDP tunnel state structures */
+	if (info->shared) {
+		node = kzalloc(sizeof(*node), GFP_KERNEL);
+		if (!node)
+			return -ENOMEM;
+
+		node->dev = dev;
+	}
+
+	/* Reuse the shared state when another device already created it */
+	if (info->shared && info->shared->udp_tunnel_nic_info) {
+		utn = info->shared->udp_tunnel_nic_info;
+	} else {
+		utn = udp_tunnel_nic_alloc(info, n_tables);
+		if (!utn) {
+			kfree(node);
+			return -ENOMEM;
+		}
+	}
+
+	if (info->shared) {
+		/* First sharer: initialise the list and publish the state */
+		if (!info->shared->udp_tunnel_nic_info) {
+			INIT_LIST_HEAD(&info->shared->devices);
+			info->shared->udp_tunnel_nic_info = utn;
+		}
+
+		list_add_tail(&node->list, &info->shared->devices);
+	}
+
+	utn->dev = dev;
+	dev_hold(dev);
+	dev->udp_tunnel_nic = utn;
+
+	if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) {
+		udp_tunnel_nic_lock(dev);
+		udp_tunnel_get_rx_info(dev);
+		udp_tunnel_nic_unlock(dev);
+	}
+
+	return 0;
+}
+
+/* Tear down @dev's UDP tunnel offload state (NETDEV_UNREGISTER handling).
+ *
+ * For shared state @dev is removed from the sharing list; if other devices
+ * remain, the state is handed over to the first of them and only @dev's
+ * pointer and reference are dropped. Otherwise the tables are flushed and,
+ * unless the sync work is still pending, the state is freed.
+ */
+static void
+udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn)
+{
+	const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info;
+
+	udp_tunnel_nic_lock(dev);
+
+	/* For a shared table remove this dev from the list of sharing devices
+	 * and if there are other devices just detach.
+	 */
+	if (info->shared) {
+		struct udp_tunnel_nic_shared_node *node, *first;
+
+		list_for_each_entry(node, &info->shared->devices, list)
+			if (node->dev == dev)
+				break;
+		/* @dev is not on the list: nothing to undo here */
+		if (list_entry_is_head(node, &info->shared->devices, list)) {
+			udp_tunnel_nic_unlock(dev);
+			return;
+		}
+
+		list_del(&node->list);
+		kfree(node);
+
+		first = list_first_entry_or_null(&info->shared->devices,
+						 typeof(*first), list);
+		if (first) {
+			udp_tunnel_drop_rx_info(dev);
+			utn->dev = first->dev;
+			udp_tunnel_nic_unlock(dev);
+			goto release_dev;
+		}
+
+		/* Last sharer is leaving: unpublish the shared state */
+		info->shared->udp_tunnel_nic_info = NULL;
+	}
+
+	/* Flush before we check work, so we don't waste time adding entries
+	 * from the work which we will boot immediately.
+	 */
+	udp_tunnel_nic_flush(dev, utn);
+	udp_tunnel_nic_unlock(dev);
+
+	/* Wait for the work to be done using the state, netdev core will
+	 * retry unregister until we give up our reference on this device.
+	 */
+	if (utn->work_pending)
+		return;
+
+	udp_tunnel_nic_free(utn);
+release_dev:
+	dev->udp_tunnel_nic = NULL;
+	dev_put(dev);
+}
+
+/* Netdevice notifier callback.
+ *
+ * Devices without udp_tunnel_nic_info are ignored. NETDEV_REGISTER creates
+ * the offload state and NETDEV_UNREGISTER tears it down. For drivers that set
+ * UDP_TUNNEL_NIC_INFO_OPEN_ONLY, NETDEV_UP requests the port list and
+ * NETDEV_GOING_DOWN flushes it.
+ *
+ * Returns NOTIFY_DONE for events that are not handled, NOTIFY_OK for handled
+ * ones, and the notifier encoding of the error for a failed registration.
+ */
+static int
+udp_tunnel_nic_netdevice_event(struct notifier_block *unused,
+			       unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	const struct udp_tunnel_nic_info *info;
+	struct udp_tunnel_nic *utn;
+
+	info = dev->udp_tunnel_nic_info;
+	if (!info)
+		return NOTIFY_DONE;
+
+	if (event == NETDEV_REGISTER) {
+		int err;
+
+		err = udp_tunnel_nic_register(dev);
+		if (err)
+			/* Terminate the message with a newline like every
+			 * other log line in this file.
+			 */
+			netdev_warn(dev, "failed to register for UDP tunnel offloads: %d\n", err);
+		return notifier_from_errno(err);
+	}
+	/* All other events will need the udp_tunnel_nic state */
+	utn = dev->udp_tunnel_nic;
+	if (!utn)
+		return NOTIFY_DONE;
+
+	if (event == NETDEV_UNREGISTER) {
+		udp_tunnel_nic_unregister(dev, utn);
+		return NOTIFY_OK;
+	}
+
+	/* All other events only matter if NIC has to be programmed open */
+	if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY))
+		return NOTIFY_DONE;
+
+	if (event == NETDEV_UP) {
+		udp_tunnel_nic_lock(dev);
+		/* The tables are expected to be empty while the device is down */
+		WARN_ON(!udp_tunnel_nic_is_empty(dev, utn));
+		udp_tunnel_get_rx_info(dev);
+		udp_tunnel_nic_unlock(dev);
+		return NOTIFY_OK;
+	}
+	if (event == NETDEV_GOING_DOWN) {
+		udp_tunnel_nic_lock(dev);
+		udp_tunnel_nic_flush(dev, utn);
+		udp_tunnel_nic_unlock(dev);
+		return NOTIFY_OK;
+	}
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier block registered at module init and unregistered at module exit */
+static struct notifier_block udp_tunnel_nic_notifier_block __read_mostly = {
+	.notifier_call = udp_tunnel_nic_netdevice_event,
+};
+
+/* Module init: create the ordered workqueue, publish the ops table under
+ * rtnl_lock and register the netdevice notifier.
+ *
+ * Returns 0 on success, -ENOMEM if the workqueue cannot be allocated, or the
+ * error from register_netdevice_notifier() after undoing the earlier steps.
+ */
+static int __init udp_tunnel_nic_init_module(void)
+{
+	int err;
+
+	udp_tunnel_nic_workqueue = alloc_ordered_workqueue("udp_tunnel_nic", 0);
+	if (!udp_tunnel_nic_workqueue)
+		return -ENOMEM;
+
+	rtnl_lock();
+	udp_tunnel_nic_ops = &__udp_tunnel_nic_ops;
+	rtnl_unlock();
+
+	err = register_netdevice_notifier(&udp_tunnel_nic_notifier_block);
+	if (err)
+		goto err_unset_ops;
+
+	return 0;
+
+err_unset_ops:
+	/* Unwind in reverse order: unpublish the ops, then drop the workqueue */
+	rtnl_lock();
+	udp_tunnel_nic_ops = NULL;
+	rtnl_unlock();
+	destroy_workqueue(udp_tunnel_nic_workqueue);
+	return err;
+}
+late_initcall(udp_tunnel_nic_init_module);
+
+/* Module exit: unregister the notifier, unpublish the ops table under
+ * rtnl_lock and destroy the workqueue.
+ */
+static void __exit udp_tunnel_nic_cleanup_module(void)
+{
+	unregister_netdevice_notifier(&udp_tunnel_nic_notifier_block);
+
+	rtnl_lock();
+	udp_tunnel_nic_ops = NULL;
+	rtnl_unlock();
+
+	destroy_workqueue(udp_tunnel_nic_workqueue);
+}
+module_exit(udp_tunnel_nic_cleanup_module);
+
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/udp_tunnel_stub.c b/net/ipv4/udp_tunnel_stub.c
new file mode 100644
index 000000000000..c4b2888f5fef
--- /dev/null
+++ b/net/ipv4/udp_tunnel_stub.c
@@ -0,0 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (c) 2020 Facebook Inc.
+
+#include <net/udp_tunnel.h>
+
+/* Ops table pointer exported to GPL modules. It is defined without an
+ * initializer, so it is NULL until an implementation is assigned to it.
+ */
+const struct udp_tunnel_nic_ops *udp_tunnel_nic_ops;
+EXPORT_SYMBOL_GPL(udp_tunnel_nic_ops);
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 8545457752fb..d3e621a11a1a 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* UDPLITE An implementation of the UDP-Lite protocol (RFC 3828).
*
@@ -5,10 +6,6 @@
*
* Changes:
* Fixes:
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "UDPLite: " fmt
@@ -20,21 +17,29 @@
struct udp_table udplite_table __read_mostly;
EXPORT_SYMBOL(udplite_table);
+/* Socket init for UDP-Lite: run udp_init_sock() on @sk and log a one-time
+ * deprecation warning. Always returns 0.
+ */
+static int udplite_sk_init(struct sock *sk)
+{
+	udp_init_sock(sk);
+	pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, "
+		     "please contact the netdev mailing list\n");
+	return 0;
+}
+
static int udplite_rcv(struct sk_buff *skb)
{
return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
}
-static void udplite_err(struct sk_buff *skb, u32 info)
+static int udplite_err(struct sk_buff *skb, u32 info)
{
- __udp4_lib_err(skb, info, &udplite_table);
+ return __udp4_lib_err(skb, info, &udplite_table);
}
static const struct net_protocol udplite_protocol = {
.handler = udplite_rcv,
.err_handler = udplite_err,
.no_policy = 1,
- .netns_ok = 1,
};
struct proto udplite_prot = {
@@ -50,18 +55,19 @@ struct proto udplite_prot = {
.getsockopt = udp_getsockopt,
.sendmsg = udp_sendmsg,
.recvmsg = udp_recvmsg,
- .sendpage = udp_sendpage,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
+ .rehash = udp_v4_rehash,
.get_port = udp_v4_get_port,
- .memory_allocated = &udp_memory_allocated,
+
+ .memory_allocated = &net_aligned_data.udp_memory_allocated,
+ .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,
+
.sysctl_mem = sysctl_udp_mem,
+ .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
+ .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
.obj_size = sizeof(struct udp_sock),
.h.udp_table = &udplite_table,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_udp_setsockopt,
- .compat_getsockopt = compat_udp_getsockopt,
-#endif
};
EXPORT_SYMBOL(udplite_prot);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index f8de2482a529..f28cfd88eaf5 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -17,11 +17,8 @@
#include <linux/netfilter_ipv4.h>
#include <net/ip.h>
#include <net/xfrm.h>
-
-int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb)
-{
- return xfrm4_extract_header(skb);
-}
+#include <net/protocol.h>
+#include <net/gro.h>
static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk,
struct sk_buff *skb)
@@ -36,7 +33,7 @@ static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk,
const struct iphdr *iph = ip_hdr(skb);
if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
- iph->tos, skb->dev))
+ ip4h_dscp(iph), skb->dev))
goto drop;
}
@@ -61,12 +58,16 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
return -iph->protocol;
#endif
- __skb_push(skb, skb->data - skb_network_header(skb));
+ __skb_push(skb, -skb_network_offset(skb));
iph->tot_len = htons(skb->len);
ip_send_check(iph);
if (xo && (xo->flags & XFRM_GRO)) {
- skb_mac_header_rebuild(skb);
+ /* The full l2 header needs to be preserved so that re-injecting the packet at l2
+ * works correctly in the presence of vlan tags.
+ */
+ skb_mac_header_rebuild_full(skb, xo->orig_mac_len);
+ skb_reset_network_header(skb);
skb_reset_transport_header(skb);
return 0;
}
@@ -77,24 +78,17 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
return 0;
}
-/* If it's a keepalive packet, then just eat it.
- * If it's an encapsulated packet, then pass it to the
- * IPsec xfrm input.
- * Returns 0 if skb passed to xfrm or was dropped.
- * Returns >0 if skb should be passed to UDP.
- * Returns <0 if skb should be resubmitted (-ret is protocol)
- */
-int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
+static int __xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull)
{
struct udp_sock *up = udp_sk(sk);
struct udphdr *uh;
struct iphdr *iph;
int iphlen, len;
-
__u8 *udpdata;
__be32 *udpdata32;
- __u16 encap_type = up->encap_type;
+ u16 encap_type;
+ encap_type = READ_ONCE(up->encap_type);
/* if this is not encapsulated socket, then just return now */
if (!encap_type)
return 1;
@@ -115,7 +109,7 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
case UDP_ENCAP_ESPINUDP:
/* Check if this is a keepalive packet. If so, eat it. */
if (len == 1 && udpdata[0] == 0xff) {
- goto drop;
+ return -EINVAL;
} else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) {
/* ESP Packet without Non-ESP header */
len = sizeof(struct udphdr);
@@ -123,19 +117,6 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
/* Must be an IKE packet.. pass it through */
return 1;
break;
- case UDP_ENCAP_ESPINUDP_NON_IKE:
- /* Check if this is a keepalive packet. If so, eat it. */
- if (len == 1 && udpdata[0] == 0xff) {
- goto drop;
- } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) &&
- udpdata32[0] == 0 && udpdata32[1] == 0) {
-
- /* ESP Packet with Non-IKE marker */
- len = sizeof(struct udphdr) + 2 * sizeof(u32);
- } else
- /* Must be an IKE packet.. pass it through */
- return 1;
- break;
}
/* At this point we are sure that this is an ESPinUDP packet,
@@ -144,7 +125,7 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
* protocol to ESP, and then call into the transform receiver.
*/
if (skb_unclone(skb, GFP_ATOMIC))
- goto drop;
+ return -EINVAL;
/* Now we can update and verify the packet length... */
iph = ip_hdr(skb);
@@ -152,24 +133,94 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
iph->tot_len = htons(ntohs(iph->tot_len) - len);
if (skb->len < iphlen + len) {
/* packet is too small!?! */
- goto drop;
+ return -EINVAL;
}
/* pull the data buffer up to the ESP header and set the
* transport header to point to ESP. Keep UDP on the stack
* for later.
*/
- __skb_pull(skb, len);
- skb_reset_transport_header(skb);
+ if (pull) {
+ __skb_pull(skb, len);
+ skb_reset_transport_header(skb);
+ } else {
+ skb_set_transport_header(skb, len);
+ }
/* process ESP */
- return xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, encap_type);
-
-drop:
- kfree_skb(skb);
return 0;
}
+/* If it's a keepalive packet, then just eat it.
+ * If it's an encapsulated packet, then pass it to the
+ * IPsec xfrm input.
+ * Returns 0 if skb passed to xfrm or was dropped.
+ * Returns >0 if skb should be passed to UDP.
+ * Returns <0 if skb should be resubmitted (-ret is protocol)
+ */
+int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	int ret;
+
+	ret = __xfrm4_udp_encap_rcv(sk, skb, true);
+	if (!ret) {
+		/* Lockless read of encap_type, annotated the same way as the
+		 * read in __xfrm4_udp_encap_rcv().
+		 */
+		return xfrm4_rcv_encap(skb, IPPROTO_ESP, 0,
+				       READ_ONCE(udp_sk(sk)->encap_type));
+	}
+
+	/* The helper reports packets to be dropped with a negative value */
+	if (ret < 0) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(xfrm4_udp_encap_rcv);
+
+/* GRO receive hook for ESP-in-UDP: hand @skb to the gro_receive callback of
+ * the IPPROTO_ESP offload.
+ *
+ * Returns whatever that callback returns. Returns NULL when the header bytes
+ * cannot be obtained, and also, after marking @skb for flush, when no ESP
+ * gro_receive handler is registered or the payload looks like a keepalive or
+ * IKE packet (not longer than an ESP header, or a zero first 32-bit word).
+ */
+struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head,
+					struct sk_buff *skb)
+{
+	int offset = skb_gro_offset(skb);
+	const struct net_offload *ops;
+	struct sk_buff *pp = NULL;
+	int len, dlen;
+	__u8 *udpdata;
+	__be32 *udpdata32;
+
+	/* Only the first (up to) 8 payload bytes are needed for the checks */
+	len = skb->len - offset;
+	dlen = offset + min(len, 8);
+	udpdata = skb_gro_header(skb, dlen, offset);
+	udpdata32 = (__be32 *)udpdata;
+	if (unlikely(!udpdata))
+		return NULL;
+
+	rcu_read_lock();
+	ops = rcu_dereference(inet_offloads[IPPROTO_ESP]);
+	if (!ops || !ops->callbacks.gro_receive)
+		goto out;
+
+	/* check if it is a keepalive or IKE packet */
+	if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0)
+		goto out;
+
+	/* set the transport header to ESP */
+	skb_set_transport_header(skb, offset);
+
+	NAPI_GRO_CB(skb)->proto = IPPROTO_UDP;
+
+	pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
+	rcu_read_unlock();
+
+	return pp;
+
+out:
+	rcu_read_unlock();
+	/* Not handled here: keep it out of any flow and flush it */
+	NAPI_GRO_CB(skb)->same_flow = 0;
+	NAPI_GRO_CB(skb)->flush = 1;
+
+	return NULL;
+}
+EXPORT_SYMBOL(xfrm4_gro_udp_encap_rcv);
+
int xfrm4_rcv(struct sk_buff *skb)
{
return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0);
diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c
deleted file mode 100644
index 856d2dfdb44b..000000000000
--- a/net/ipv4/xfrm4_mode_beet.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * xfrm4_mode_beet.c - BEET mode encapsulation for IPv4.
- *
- * Copyright (c) 2006 Diego Beltrami <diego.beltrami@gmail.com>
- * Miika Komu <miika@iki.fi>
- * Herbert Xu <herbert@gondor.apana.org.au>
- * Abhinav Pathak <abhinav.pathak@hiit.fi>
- * Jeff Ahrenholz <ahrenholz@gmail.com>
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/stringify.h>
-#include <net/dst.h>
-#include <net/ip.h>
-#include <net/xfrm.h>
-
-static void xfrm4_beet_make_header(struct sk_buff *skb)
-{
- struct iphdr *iph = ip_hdr(skb);
-
- iph->ihl = 5;
- iph->version = 4;
-
- iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol;
- iph->tos = XFRM_MODE_SKB_CB(skb)->tos;
-
- iph->id = XFRM_MODE_SKB_CB(skb)->id;
- iph->frag_off = XFRM_MODE_SKB_CB(skb)->frag_off;
- iph->ttl = XFRM_MODE_SKB_CB(skb)->ttl;
-}
-
-/* Add encapsulation header.
- *
- * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt.
- */
-static int xfrm4_beet_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct ip_beet_phdr *ph;
- struct iphdr *top_iph;
- int hdrlen, optlen;
-
- hdrlen = 0;
- optlen = XFRM_MODE_SKB_CB(skb)->optlen;
- if (unlikely(optlen))
- hdrlen += IPV4_BEET_PHMAXLEN - (optlen & 4);
-
- skb_set_network_header(skb, -x->props.header_len -
- hdrlen + (XFRM_MODE_SKB_CB(skb)->ihl - sizeof(*top_iph)));
- if (x->sel.family != AF_INET6)
- skb->network_header += IPV4_BEET_PHMAXLEN;
- skb->mac_header = skb->network_header +
- offsetof(struct iphdr, protocol);
- skb->transport_header = skb->network_header + sizeof(*top_iph);
-
- xfrm4_beet_make_header(skb);
-
- ph = __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdrlen);
-
- top_iph = ip_hdr(skb);
-
- if (unlikely(optlen)) {
- BUG_ON(optlen < 0);
-
- ph->padlen = 4 - (optlen & 4);
- ph->hdrlen = optlen / 8;
- ph->nexthdr = top_iph->protocol;
- if (ph->padlen)
- memset(ph + 1, IPOPT_NOP, ph->padlen);
-
- top_iph->protocol = IPPROTO_BEETPH;
- top_iph->ihl = sizeof(struct iphdr) / 4;
- }
-
- top_iph->saddr = x->props.saddr.a4;
- top_iph->daddr = x->id.daddr.a4;
-
- return 0;
-}
-
-static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct iphdr *iph;
- int optlen = 0;
- int err = -EINVAL;
-
- if (unlikely(XFRM_MODE_SKB_CB(skb)->protocol == IPPROTO_BEETPH)) {
- struct ip_beet_phdr *ph;
- int phlen;
-
- if (!pskb_may_pull(skb, sizeof(*ph)))
- goto out;
-
- ph = (struct ip_beet_phdr *)skb->data;
-
- phlen = sizeof(*ph) + ph->padlen;
- optlen = ph->hdrlen * 8 + (IPV4_BEET_PHMAXLEN - phlen);
- if (optlen < 0 || optlen & 3 || optlen > 250)
- goto out;
-
- XFRM_MODE_SKB_CB(skb)->protocol = ph->nexthdr;
-
- if (!pskb_may_pull(skb, phlen))
- goto out;
- __skb_pull(skb, phlen);
- }
-
- skb_push(skb, sizeof(*iph));
- skb_reset_network_header(skb);
- skb_mac_header_rebuild(skb);
-
- xfrm4_beet_make_header(skb);
-
- iph = ip_hdr(skb);
-
- iph->ihl += optlen / 4;
- iph->tot_len = htons(skb->len);
- iph->daddr = x->sel.daddr.a4;
- iph->saddr = x->sel.saddr.a4;
- iph->check = 0;
- iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
- err = 0;
-out:
- return err;
-}
-
-static struct xfrm_mode xfrm4_beet_mode = {
- .input2 = xfrm4_beet_input,
- .input = xfrm_prepare_input,
- .output2 = xfrm4_beet_output,
- .output = xfrm4_prepare_output,
- .owner = THIS_MODULE,
- .encap = XFRM_MODE_BEET,
- .flags = XFRM_MODE_FLAG_TUNNEL,
-};
-
-static int __init xfrm4_beet_init(void)
-{
- return xfrm_register_mode(&xfrm4_beet_mode, AF_INET);
-}
-
-static void __exit xfrm4_beet_exit(void)
-{
- int err;
-
- err = xfrm_unregister_mode(&xfrm4_beet_mode, AF_INET);
- BUG_ON(err);
-}
-
-module_init(xfrm4_beet_init);
-module_exit(xfrm4_beet_exit);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_BEET);
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
deleted file mode 100644
index 1ad2c2c4e250..000000000000
--- a/net/ipv4/xfrm4_mode_transport.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4.
- *
- * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/stringify.h>
-#include <net/dst.h>
-#include <net/ip.h>
-#include <net/xfrm.h>
-#include <net/protocol.h>
-
-/* Add encapsulation header.
- *
- * The IP header will be moved forward to make space for the encapsulation
- * header.
- */
-static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct iphdr *iph = ip_hdr(skb);
- int ihl = iph->ihl * 4;
-
- skb_set_inner_transport_header(skb, skb_transport_offset(skb));
-
- skb_set_network_header(skb, -x->props.header_len);
- skb->mac_header = skb->network_header +
- offsetof(struct iphdr, protocol);
- skb->transport_header = skb->network_header + ihl;
- __skb_pull(skb, ihl);
- memmove(skb_network_header(skb), iph, ihl);
- return 0;
-}
-
-/* Remove encapsulation header.
- *
- * The IP header will be moved over the top of the encapsulation header.
- *
- * On entry, skb->h shall point to where the IP header should be and skb->nh
- * shall be set to where the IP header currently is. skb->data shall point
- * to the start of the payload.
- */
-static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
-{
- int ihl = skb->data - skb_transport_header(skb);
-
- if (skb->transport_header != skb->network_header) {
- memmove(skb_transport_header(skb),
- skb_network_header(skb), ihl);
- skb->network_header = skb->transport_header;
- }
- ip_hdr(skb)->tot_len = htons(skb->len + ihl);
- skb_reset_transport_header(skb);
- return 0;
-}
-
-static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x,
- struct sk_buff *skb,
- netdev_features_t features)
-{
- const struct net_offload *ops;
- struct sk_buff *segs = ERR_PTR(-EINVAL);
- struct xfrm_offload *xo = xfrm_offload(skb);
-
- skb->transport_header += x->props.header_len;
- ops = rcu_dereference(inet_offloads[xo->proto]);
- if (likely(ops && ops->callbacks.gso_segment))
- segs = ops->callbacks.gso_segment(skb, features);
-
- return segs;
-}
-
-static void xfrm4_transport_xmit(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct xfrm_offload *xo = xfrm_offload(skb);
-
- skb_reset_mac_len(skb);
- pskb_pull(skb, skb->mac_len + sizeof(struct iphdr) + x->props.header_len);
-
- if (xo->flags & XFRM_GSO_SEGMENT) {
- skb_reset_transport_header(skb);
- skb->transport_header -= x->props.header_len;
- }
-}
-
-static struct xfrm_mode xfrm4_transport_mode = {
- .input = xfrm4_transport_input,
- .output = xfrm4_transport_output,
- .gso_segment = xfrm4_transport_gso_segment,
- .xmit = xfrm4_transport_xmit,
- .owner = THIS_MODULE,
- .encap = XFRM_MODE_TRANSPORT,
-};
-
-static int __init xfrm4_transport_init(void)
-{
- return xfrm_register_mode(&xfrm4_transport_mode, AF_INET);
-}
-
-static void __exit xfrm4_transport_exit(void)
-{
- int err;
-
- err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET);
- BUG_ON(err);
-}
-
-module_init(xfrm4_transport_init);
-module_exit(xfrm4_transport_exit);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
deleted file mode 100644
index 2a9764bd1719..000000000000
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4.
- *
- * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
- */
-
-#include <linux/gfp.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/stringify.h>
-#include <net/dst.h>
-#include <net/inet_ecn.h>
-#include <net/ip.h>
-#include <net/xfrm.h>
-
-static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
-{
- struct iphdr *inner_iph = ipip_hdr(skb);
-
- if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
- IP_ECN_set_ce(inner_iph);
-}
-
-/* Add encapsulation header.
- *
- * The top IP header will be constructed per RFC 2401.
- */
-static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct dst_entry *dst = skb_dst(skb);
- struct iphdr *top_iph;
- int flags;
-
- skb_set_inner_network_header(skb, skb_network_offset(skb));
- skb_set_inner_transport_header(skb, skb_transport_offset(skb));
-
- skb_set_network_header(skb, -x->props.header_len);
- skb->mac_header = skb->network_header +
- offsetof(struct iphdr, protocol);
- skb->transport_header = skb->network_header + sizeof(*top_iph);
- top_iph = ip_hdr(skb);
-
- top_iph->ihl = 5;
- top_iph->version = 4;
-
- top_iph->protocol = xfrm_af2proto(skb_dst(skb)->ops->family);
-
- /* DS disclosing depends on XFRM_SA_XFLAG_DONT_ENCAP_DSCP */
- if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP)
- top_iph->tos = 0;
- else
- top_iph->tos = XFRM_MODE_SKB_CB(skb)->tos;
- top_iph->tos = INET_ECN_encapsulate(top_iph->tos,
- XFRM_MODE_SKB_CB(skb)->tos);
-
- flags = x->props.flags;
- if (flags & XFRM_STATE_NOECN)
- IP_ECN_clear(top_iph);
-
- top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ?
- 0 : (XFRM_MODE_SKB_CB(skb)->frag_off & htons(IP_DF));
-
- top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst));
-
- top_iph->saddr = x->props.saddr.a4;
- top_iph->daddr = x->id.daddr.a4;
- ip_select_ident(dev_net(dst->dev), skb, NULL);
-
- return 0;
-}
-
-static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
-{
- int err = -EINVAL;
-
- if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP)
- goto out;
-
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- goto out;
-
- err = skb_unclone(skb, GFP_ATOMIC);
- if (err)
- goto out;
-
- if (x->props.flags & XFRM_STATE_DECAP_DSCP)
- ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, ipip_hdr(skb));
- if (!(x->props.flags & XFRM_STATE_NOECN))
- ipip_ecn_decapsulate(skb);
-
- skb_reset_network_header(skb);
- skb_mac_header_rebuild(skb);
- if (skb->mac_len)
- eth_hdr(skb)->h_proto = skb->protocol;
-
- err = 0;
-
-out:
- return err;
-}
-
-static struct sk_buff *xfrm4_mode_tunnel_gso_segment(struct xfrm_state *x,
- struct sk_buff *skb,
- netdev_features_t features)
-{
- __skb_push(skb, skb->mac_len);
- return skb_mac_gso_segment(skb, features);
-}
-
-static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct xfrm_offload *xo = xfrm_offload(skb);
-
- if (xo->flags & XFRM_GSO_SEGMENT)
- skb->transport_header = skb->network_header +
- sizeof(struct iphdr);
-
- skb_reset_mac_len(skb);
- pskb_pull(skb, skb->mac_len + x->props.header_len);
-}
-
-static struct xfrm_mode xfrm4_tunnel_mode = {
- .input2 = xfrm4_mode_tunnel_input,
- .input = xfrm_prepare_input,
- .output2 = xfrm4_mode_tunnel_output,
- .output = xfrm4_prepare_output,
- .gso_segment = xfrm4_mode_tunnel_gso_segment,
- .xmit = xfrm4_mode_tunnel_xmit,
- .owner = THIS_MODULE,
- .encap = XFRM_MODE_TUNNEL,
- .flags = XFRM_MODE_FLAG_TUNNEL,
-};
-
-static int __init xfrm4_mode_tunnel_init(void)
-{
- return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET);
-}
-
-static void __exit xfrm4_mode_tunnel_exit(void)
-{
- int err;
-
- err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET);
- BUG_ON(err);
-}
-
-module_init(xfrm4_mode_tunnel_init);
-module_exit(xfrm4_mode_tunnel_exit);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL);
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index be980c195fc5..0ae67d537499 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* xfrm4_output.c - Common IPsec encapsulation code for IPv4.
* Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/if_ether.h>
@@ -18,90 +14,24 @@
#include <net/xfrm.h>
#include <net/icmp.h>
-static int xfrm4_tunnel_check_size(struct sk_buff *skb)
-{
- int mtu, ret = 0;
-
- if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
- goto out;
-
- if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df)
- goto out;
-
- mtu = dst_mtu(skb_dst(skb));
- if ((!skb_is_gso(skb) && skb->len > mtu) ||
- (skb_is_gso(skb) &&
- !skb_gso_validate_network_len(skb, ip_skb_dst_mtu(skb->sk, skb)))) {
- skb->protocol = htons(ETH_P_IP);
-
- if (skb->sk)
- xfrm_local_error(skb, mtu);
- else
- icmp_send(skb, ICMP_DEST_UNREACH,
- ICMP_FRAG_NEEDED, htonl(mtu));
- ret = -EMSGSIZE;
- }
-out:
- return ret;
-}
-
-int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- int err;
-
- err = xfrm4_tunnel_check_size(skb);
- if (err)
- return err;
-
- XFRM_MODE_SKB_CB(skb)->protocol = ip_hdr(skb)->protocol;
-
- return xfrm4_extract_header(skb);
-}
-
-int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- int err;
-
- err = xfrm_inner_extract_output(x, skb);
- if (err)
- return err;
-
- IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
- skb->protocol = htons(ETH_P_IP);
-
- return x->outer_mode->output2(x, skb);
-}
-EXPORT_SYMBOL(xfrm4_prepare_output);
-
-int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb)
-{
- memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
-
-#ifdef CONFIG_NETFILTER
- IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
-#endif
-
- return xfrm_output(sk, skb);
-}
-
static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+#ifdef CONFIG_NETFILTER
struct xfrm_state *x = skb_dst(skb)->xfrm;
-#ifdef CONFIG_NETFILTER
if (!x) {
IPCB(skb)->flags |= IPSKB_REROUTED;
return dst_output(net, sk, skb);
}
#endif
- return x->outer_mode->afinfo->output_finish(sk, skb);
+ return xfrm_output(sk, skb);
}
int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
- net, sk, skb, NULL, skb_dst(skb)->dev,
+ net, sk, skb, skb->dev, skb_dst_dev(skb),
__xfrm4_output,
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index d73a6d6652f6..58faf1ddd2b1 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -12,55 +12,49 @@
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/inetdevice.h>
-#include <linux/if_tunnel.h>
#include <net/dst.h>
#include <net/xfrm.h>
+#include <net/flow.h>
#include <net/ip.h>
#include <net/l3mdev.h>
-static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4,
- int tos, int oif,
- const xfrm_address_t *saddr,
- const xfrm_address_t *daddr,
- u32 mark)
+static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4,
+ const struct xfrm_dst_lookup_params *params)
{
struct rtable *rt;
memset(fl4, 0, sizeof(*fl4));
- fl4->daddr = daddr->a4;
- fl4->flowi4_tos = tos;
- fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif);
- fl4->flowi4_mark = mark;
- if (saddr)
- fl4->saddr = saddr->a4;
-
- fl4->flowi4_flags = FLOWI_FLAG_SKIP_NH_OIF;
-
- rt = __ip_route_output_key(net, fl4);
+ fl4->daddr = params->daddr->a4;
+ fl4->flowi4_dscp = params->dscp;
+ fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(params->net,
+ params->oif);
+ fl4->flowi4_mark = params->mark;
+ if (params->saddr)
+ fl4->saddr = params->saddr->a4;
+ fl4->flowi4_proto = params->ipproto;
+ fl4->uli = params->uli;
+
+ rt = __ip_route_output_key(params->net, fl4);
if (!IS_ERR(rt))
return &rt->dst;
return ERR_CAST(rt);
}
-static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif,
- const xfrm_address_t *saddr,
- const xfrm_address_t *daddr,
- u32 mark)
+static struct dst_entry *xfrm4_dst_lookup(const struct xfrm_dst_lookup_params *params)
{
struct flowi4 fl4;
- return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr, mark);
+ return __xfrm4_dst_lookup(&fl4, params);
}
-static int xfrm4_get_saddr(struct net *net, int oif,
- xfrm_address_t *saddr, xfrm_address_t *daddr,
- u32 mark)
+static int xfrm4_get_saddr(xfrm_address_t *saddr,
+ const struct xfrm_dst_lookup_params *params)
{
struct dst_entry *dst;
struct flowi4 fl4;
- dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr, mark);
+ dst = __xfrm4_dst_lookup(&fl4, params);
if (IS_ERR(dst))
return -EHOSTUNREACH;
@@ -69,27 +63,16 @@ static int xfrm4_get_saddr(struct net *net, int oif,
return 0;
}
-static int xfrm4_get_tos(const struct flowi *fl)
-{
- return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; /* Strip ECN bits */
-}
-
-static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
- int nfheader_len)
-{
- return 0;
-}
-
static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
const struct flowi *fl)
{
- struct rtable *rt = (struct rtable *)xdst->route;
+ struct rtable *rt = dst_rtable(xdst->route);
const struct flowi4 *fl4 = &fl->u.ip4;
xdst->u.rt.rt_iif = fl4->flowi4_iif;
xdst->u.dst.dev = dev;
- dev_hold(dev);
+ netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC);
/* Sheit... I remember I did this right. Apparently,
* it was magically lost, so this code needs audit */
@@ -97,135 +80,27 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.rt.rt_flags = rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST |
RTCF_LOCAL);
xdst->u.rt.rt_type = rt->rt_type;
- xdst->u.rt.rt_gateway = rt->rt_gateway;
xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway;
+ xdst->u.rt.rt_gw_family = rt->rt_gw_family;
+ if (rt->rt_gw_family == AF_INET)
+ xdst->u.rt.rt_gw4 = rt->rt_gw4;
+ else if (rt->rt_gw_family == AF_INET6)
+ xdst->u.rt.rt_gw6 = rt->rt_gw6;
xdst->u.rt.rt_pmtu = rt->rt_pmtu;
xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
- INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
rt_add_uncached_list(&xdst->u.rt);
return 0;
}
-static void
-_decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
-{
- const struct iphdr *iph = ip_hdr(skb);
- u8 *xprth = skb_network_header(skb) + iph->ihl * 4;
- struct flowi4 *fl4 = &fl->u.ip4;
- int oif = 0;
-
- if (skb_dst(skb))
- oif = skb_dst(skb)->dev->ifindex;
-
- memset(fl4, 0, sizeof(struct flowi4));
- fl4->flowi4_mark = skb->mark;
- fl4->flowi4_oif = reverse ? skb->skb_iif : oif;
-
- if (!ip_is_fragment(iph)) {
- switch (iph->protocol) {
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE:
- case IPPROTO_TCP:
- case IPPROTO_SCTP:
- case IPPROTO_DCCP:
- if (xprth + 4 < skb->data ||
- pskb_may_pull(skb, xprth + 4 - skb->data)) {
- __be16 *ports;
-
- xprth = skb_network_header(skb) + iph->ihl * 4;
- ports = (__be16 *)xprth;
-
- fl4->fl4_sport = ports[!!reverse];
- fl4->fl4_dport = ports[!reverse];
- }
- break;
-
- case IPPROTO_ICMP:
- if (xprth + 2 < skb->data ||
- pskb_may_pull(skb, xprth + 2 - skb->data)) {
- u8 *icmp;
-
- xprth = skb_network_header(skb) + iph->ihl * 4;
- icmp = xprth;
-
- fl4->fl4_icmp_type = icmp[0];
- fl4->fl4_icmp_code = icmp[1];
- }
- break;
-
- case IPPROTO_ESP:
- if (xprth + 4 < skb->data ||
- pskb_may_pull(skb, xprth + 4 - skb->data)) {
- __be32 *ehdr;
-
- xprth = skb_network_header(skb) + iph->ihl * 4;
- ehdr = (__be32 *)xprth;
-
- fl4->fl4_ipsec_spi = ehdr[0];
- }
- break;
-
- case IPPROTO_AH:
- if (xprth + 8 < skb->data ||
- pskb_may_pull(skb, xprth + 8 - skb->data)) {
- __be32 *ah_hdr;
-
- xprth = skb_network_header(skb) + iph->ihl * 4;
- ah_hdr = (__be32 *)xprth;
-
- fl4->fl4_ipsec_spi = ah_hdr[1];
- }
- break;
-
- case IPPROTO_COMP:
- if (xprth + 4 < skb->data ||
- pskb_may_pull(skb, xprth + 4 - skb->data)) {
- __be16 *ipcomp_hdr;
-
- xprth = skb_network_header(skb) + iph->ihl * 4;
- ipcomp_hdr = (__be16 *)xprth;
-
- fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1]));
- }
- break;
-
- case IPPROTO_GRE:
- if (xprth + 12 < skb->data ||
- pskb_may_pull(skb, xprth + 12 - skb->data)) {
- __be16 *greflags;
- __be32 *gre_hdr;
-
- xprth = skb_network_header(skb) + iph->ihl * 4;
- greflags = (__be16 *)xprth;
- gre_hdr = (__be32 *)xprth;
-
- if (greflags[0] & GRE_KEY) {
- if (greflags[0] & GRE_CSUM)
- gre_hdr++;
- fl4->fl4_gre_key = gre_hdr[1];
- }
- }
- break;
-
- default:
- fl4->fl4_ipsec_spi = 0;
- break;
- }
- }
- fl4->flowi4_proto = iph->protocol;
- fl4->daddr = reverse ? iph->saddr : iph->daddr;
- fl4->saddr = reverse ? iph->daddr : iph->saddr;
- fl4->flowi4_tos = iph->tos;
-}
-
static void xfrm4_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
struct dst_entry *path = xdst->route;
- path->ops->update_pmtu(path, sk, skb, mtu);
+ path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh);
}
static void xfrm4_redirect(struct dst_entry *dst, struct sock *sk,
@@ -242,27 +117,17 @@ static void xfrm4_dst_destroy(struct dst_entry *dst)
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
dst_destroy_metrics_generic(dst);
- if (xdst->u.rt.rt_uncached_list)
- rt_del_uncached_list(&xdst->u.rt);
+ rt_del_uncached_list(&xdst->u.rt);
xfrm_dst_destroy(xdst);
}
-static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
- int unregister)
-{
- if (!unregister)
- return;
-
- xfrm_dst_ifdown(dst, dev);
-}
-
static struct dst_ops xfrm4_dst_ops_template = {
.family = AF_INET,
.update_pmtu = xfrm4_update_pmtu,
.redirect = xfrm4_redirect,
.cow_metrics = dst_cow_metrics_generic,
.destroy = xfrm4_dst_destroy,
- .ifdown = xfrm4_dst_ifdown,
+ .ifdown = xfrm_dst_ifdown,
.local_out = __ip_local_out,
.gc_thresh = 32768,
};
@@ -271,9 +136,6 @@ static const struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
.dst_ops = &xfrm4_dst_ops_template,
.dst_lookup = xfrm4_dst_lookup,
.get_saddr = xfrm4_get_saddr,
- .decode_session = _decode_session4,
- .get_tos = xfrm4_get_tos,
- .init_path = xfrm4_init_path,
.fill_dst = xfrm4_fill_dst,
.blackhole_route = ipv4_blackhole_route,
};
@@ -287,7 +149,6 @@ static struct ctl_table xfrm4_policy_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { }
};
static __net_init int xfrm4_net_sysctl_init(struct net *net)
@@ -304,7 +165,8 @@ static __net_init int xfrm4_net_sysctl_init(struct net *net)
table[0].data = &net->xfrm.xfrm4_dst_ops.gc_thresh;
}
- hdr = register_net_sysctl(net, "net/ipv4", table);
+ hdr = register_net_sysctl_sz(net, "net/ipv4", table,
+ ARRAY_SIZE(xfrm4_policy_table));
if (!hdr)
goto err_reg;
@@ -320,7 +182,7 @@ err_alloc:
static __net_exit void xfrm4_net_sysctl_exit(struct net *net)
{
- struct ctl_table *table;
+ const struct ctl_table *table;
if (!net->ipv4.xfrm4_hdr)
return;
diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c
index 8dd0e6ab8606..4ee624d8e66f 100644
--- a/net/ipv4/xfrm4_protocol.c
+++ b/net/ipv4/xfrm4_protocol.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* xfrm4_protocol.c - Generic xfrm protocol multiplexer.
*
* Copyright (C) 2013 secunet Security Networks AG
@@ -7,11 +8,6 @@
*
* Based on:
* net/ipv4/tunnel4.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/init.h>
@@ -46,7 +42,7 @@ static inline struct xfrm4_protocol __rcu **proto_handlers(u8 protocol)
handler != NULL; \
handler = rcu_dereference(handler->next)) \
-int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
+static int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
{
int ret;
struct xfrm4_protocol *handler;
@@ -61,7 +57,6 @@ int xfrm4_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
return 0;
}
-EXPORT_SYMBOL(xfrm4_rcv_cb);
int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
int encap_type)
@@ -77,6 +72,14 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
if (!head)
goto out;
+ if (!skb_dst(skb)) {
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
+ ip4h_dscp(iph), skb->dev))
+ goto drop;
+ }
+
for_each_protocol_rcu(*head, handler)
if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL)
return ret;
@@ -84,6 +87,7 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
out:
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+drop:
kfree_skb(skb);
return 0;
}
@@ -106,13 +110,15 @@ static int xfrm4_esp_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm4_esp_err(struct sk_buff *skb, u32 info)
+static int xfrm4_esp_err(struct sk_buff *skb, u32 info)
{
struct xfrm4_protocol *handler;
for_each_protocol_rcu(esp4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static int xfrm4_ah_rcv(struct sk_buff *skb)
@@ -132,13 +138,15 @@ static int xfrm4_ah_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm4_ah_err(struct sk_buff *skb, u32 info)
+static int xfrm4_ah_err(struct sk_buff *skb, u32 info)
{
struct xfrm4_protocol *handler;
for_each_protocol_rcu(ah4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
@@ -158,34 +166,33 @@ static int xfrm4_ipcomp_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
+static int xfrm4_ipcomp_err(struct sk_buff *skb, u32 info)
{
struct xfrm4_protocol *handler;
for_each_protocol_rcu(ipcomp4_handlers, handler)
if (!handler->err_handler(skb, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static const struct net_protocol esp4_protocol = {
.handler = xfrm4_esp_rcv,
.err_handler = xfrm4_esp_err,
.no_policy = 1,
- .netns_ok = 1,
};
static const struct net_protocol ah4_protocol = {
.handler = xfrm4_ah_rcv,
.err_handler = xfrm4_ah_err,
.no_policy = 1,
- .netns_ok = 1,
};
static const struct net_protocol ipcomp4_protocol = {
.handler = xfrm4_ipcomp_rcv,
.err_handler = xfrm4_ipcomp_err,
.no_policy = 1,
- .netns_ok = 1,
};
static const struct xfrm_input_afinfo xfrm4_input_afinfo = {
@@ -297,4 +304,3 @@ void __init xfrm4_protocol_init(void)
{
xfrm_input_register_afinfo(&xfrm4_input_afinfo);
}
-EXPORT_SYMBOL(xfrm4_protocol_init);
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
index 80c40b4981bb..87d4db591488 100644
--- a/net/ipv4/xfrm4_state.c
+++ b/net/ipv4/xfrm4_state.c
@@ -8,81 +8,12 @@
*
*/
-#include <net/ip.h>
#include <net/xfrm.h>
-#include <linux/pfkeyv2.h>
-#include <linux/ipsec.h>
-#include <linux/netfilter_ipv4.h>
-#include <linux/export.h>
-
-static int xfrm4_init_flags(struct xfrm_state *x)
-{
- if (xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)
- x->props.flags |= XFRM_STATE_NOPMTUDISC;
- return 0;
-}
-
-static void
-__xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
-{
- const struct flowi4 *fl4 = &fl->u.ip4;
-
- sel->daddr.a4 = fl4->daddr;
- sel->saddr.a4 = fl4->saddr;
- sel->dport = xfrm_flowi_dport(fl, &fl4->uli);
- sel->dport_mask = htons(0xffff);
- sel->sport = xfrm_flowi_sport(fl, &fl4->uli);
- sel->sport_mask = htons(0xffff);
- sel->family = AF_INET;
- sel->prefixlen_d = 32;
- sel->prefixlen_s = 32;
- sel->proto = fl4->flowi4_proto;
- sel->ifindex = fl4->flowi4_oif;
-}
-
-static void
-xfrm4_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
- const xfrm_address_t *daddr, const xfrm_address_t *saddr)
-{
- x->id = tmpl->id;
- if (x->id.daddr.a4 == 0)
- x->id.daddr.a4 = daddr->a4;
- x->props.saddr = tmpl->saddr;
- if (x->props.saddr.a4 == 0)
- x->props.saddr.a4 = saddr->a4;
- x->props.mode = tmpl->mode;
- x->props.reqid = tmpl->reqid;
- x->props.family = AF_INET;
-}
-
-int xfrm4_extract_header(struct sk_buff *skb)
-{
- const struct iphdr *iph = ip_hdr(skb);
-
- XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph);
- XFRM_MODE_SKB_CB(skb)->id = iph->id;
- XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off;
- XFRM_MODE_SKB_CB(skb)->tos = iph->tos;
- XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl;
- XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph);
- memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0,
- sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl));
-
- return 0;
-}
static struct xfrm_state_afinfo xfrm4_state_afinfo = {
.family = AF_INET,
.proto = IPPROTO_IPIP,
- .eth_proto = htons(ETH_P_IP),
- .owner = THIS_MODULE,
- .init_flags = xfrm4_init_flags,
- .init_tempsel = __xfrm4_init_tempsel,
- .init_temprop = xfrm4_init_temprop,
.output = xfrm4_output,
- .output_finish = xfrm4_output_finish,
- .extract_input = xfrm4_extract_input,
- .extract_output = xfrm4_extract_output,
.transport_finish = xfrm4_transport_finish,
.local_error = xfrm4_local_error,
};
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 06347dbd32c1..8cb266af1393 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* xfrm4_tunnel.c: Generic IP tunnel transformer.
*
* Copyright (C) 2003 David S. Miller (davem@redhat.com)
@@ -7,9 +8,7 @@
#include <linux/skbuff.h>
#include <linux/module.h>
-#include <linux/mutex.h>
#include <net/xfrm.h>
-#include <net/ip.h>
#include <net/protocol.h>
static int ipip_output(struct xfrm_state *x, struct sk_buff *skb)
@@ -23,13 +22,17 @@ static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb)
return ip_hdr(skb)->protocol;
}
-static int ipip_init_state(struct xfrm_state *x)
+static int ipip_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
- if (x->props.mode != XFRM_MODE_TUNNEL)
+ if (x->props.mode != XFRM_MODE_TUNNEL) {
+ NL_SET_ERR_MSG(extack, "IPv4 tunnel can only be used with tunnel mode");
return -EINVAL;
+ }
- if (x->encap)
+ if (x->encap) {
+ NL_SET_ERR_MSG(extack, "IPv4 tunnel is not compatible with encapsulation");
return -EINVAL;
+ }
x->props.header_len = sizeof(struct iphdr);
@@ -41,7 +44,6 @@ static void ipip_destroy(struct xfrm_state *x)
}
static const struct xfrm_type ipip_type = {
- .description = "IPIP",
.owner = THIS_MODULE,
.proto = IPPROTO_IPIP,
.init_state = ipip_init_state,
@@ -63,14 +65,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
.handler = xfrm_tunnel_rcv,
.err_handler = xfrm_tunnel_err,
- .priority = 3,
+ .priority = 4,
};
#if IS_ENABLED(CONFIG_IPV6)
static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
.handler = xfrm_tunnel_rcv,
.err_handler = xfrm_tunnel_err,
- .priority = 2,
+ .priority = 3,
};
#endif
@@ -107,11 +109,11 @@ static void __exit ipip_fini(void)
if (xfrm4_tunnel_deregister(&xfrm_tunnel_handler, AF_INET))
pr_info("%s: can't remove xfrm handler for AF_INET\n",
__func__);
- if (xfrm_unregister_type(&ipip_type, AF_INET) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
+ xfrm_unregister_type(&ipip_type, AF_INET);
}
module_init(ipip_init);
module_exit(ipip_fini);
+MODULE_DESCRIPTION("IPv4 XFRM tunnel driver");
MODULE_LICENSE("GPL");
MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_IPIP);
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 613282c65a10..b8f9a8c0302e 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# IPv6 configuration
#
@@ -6,14 +7,15 @@
menuconfig IPV6
tristate "The IPv6 protocol"
default y
- ---help---
+ select CRYPTO_LIB_SHA1
+ help
Support for IP version 6 (IPv6).
For general information about IPv6, see
<https://en.wikipedia.org/wiki/IPv6>.
For specific information about IPv6 under Linux, see
- Documentation/networking/ipv6.txt and read the HOWTO at
- <http://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/>
+ Documentation/networking/ipv6.rst and read the HOWTO at
+ <https://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/>
To compile this protocol support as a module, choose M here: the
module will be called ipv6.
@@ -22,7 +24,7 @@ if IPV6
config IPV6_ROUTER_PREF
bool "IPv6: Router Preference (RFC 4191) support"
- ---help---
+ help
Router Preference is an optional extension to the Router
Advertisement message which improves the ability of hosts
to pick an appropriate router, especially when the hosts
@@ -33,14 +35,14 @@ config IPV6_ROUTER_PREF
config IPV6_ROUTE_INFO
bool "IPv6: Route Information (RFC 4191) support"
depends on IPV6_ROUTER_PREF
- ---help---
+ help
Support of Route Information.
If unsure, say N.
config IPV6_OPTIMISTIC_DAD
bool "IPv6: Enable RFC 4429 Optimistic DAD"
- ---help---
+ help
Support for optimistic Duplicate Address Detection. It allows for
autoconfigured addresses to be used more quickly.
@@ -48,29 +50,31 @@ config IPV6_OPTIMISTIC_DAD
config INET6_AH
tristate "IPv6: AH transformation"
- select XFRM_ALGO
- select CRYPTO
- select CRYPTO_HMAC
- select CRYPTO_MD5
- select CRYPTO_SHA1
- ---help---
- Support for IPsec AH.
+ select XFRM_AH
+ help
+ Support for IPsec AH (Authentication Header).
+
+ AH can be used with various authentication algorithms. Besides
+ enabling AH support itself, this option enables the generic
+ implementations of the algorithms that RFC 8221 lists as MUST be
+ implemented. If you need any other algorithms, you'll need to enable
+ them in the crypto API. You should also enable accelerated
+ implementations of any needed algorithms when available.
If unsure, say Y.
config INET6_ESP
tristate "IPv6: ESP transformation"
- select XFRM_ALGO
- select CRYPTO
- select CRYPTO_AUTHENC
- select CRYPTO_HMAC
- select CRYPTO_MD5
- select CRYPTO_CBC
- select CRYPTO_SHA1
- select CRYPTO_DES
- select CRYPTO_ECHAINIV
- ---help---
- Support for IPsec ESP.
+ select XFRM_ESP
+ help
+ Support for IPsec ESP (Encapsulating Security Payload).
+
+ ESP can be used with various encryption and authentication algorithms.
+ Besides enabling ESP support itself, this option enables the generic
+ implementations of the algorithms that RFC 8221 lists as MUST be
+ implemented. If you need any other algorithms, you'll need to enable
+ them in the crypto API. You should also enable accelerated
+ implementations of any needed algorithms when available.
If unsure, say Y.
@@ -79,7 +83,7 @@ config INET6_ESP_OFFLOAD
depends on INET6_ESP
select XFRM_OFFLOAD
default n
- ---help---
+ help
Support for ESP transformation offload. This makes sense
only if this system really does IPsec and want to do it
with high throughput. A typical desktop system does not
@@ -87,11 +91,23 @@ config INET6_ESP_OFFLOAD
If unsure, say N.
+config INET6_ESPINTCP
+ bool "IPv6: ESP in TCP encapsulation (RFC 8229)"
+ depends on XFRM && INET6_ESP
+ select STREAM_PARSER
+ select NET_SOCK_MSG
+ select XFRM_ESPINTCP
+ help
+ Support for RFC 8229 encapsulation of ESP and IKE over
+ TCP/IPv6 sockets.
+
+ If unsure, say N.
+
config INET6_IPCOMP
tristate "IPv6: IPComp transformation"
select INET6_XFRM_TUNNEL
select XFRM_IPCOMP
- ---help---
+ help
Support for IP Payload Compression Protocol (IPComp) (RFC3173),
typically needed for IPsec.
@@ -100,7 +116,7 @@ config INET6_IPCOMP
config IPV6_MIP6
tristate "IPv6: Mobility"
select XFRM
- ---help---
+ help
Support for IPv6 Mobility described in RFC 3775.
If unsure, say N.
@@ -110,7 +126,7 @@ config IPV6_ILA
depends on NETFILTER
select DST_CACHE
select LWTUNNEL
- ---help---
+ help
Support for IPv6 Identifier Locator Addressing (ILA).
ILA is a mechanism to do network virtualization without
@@ -135,45 +151,12 @@ config INET6_TUNNEL
tristate
default n
-config INET6_XFRM_MODE_TRANSPORT
- tristate "IPv6: IPsec transport mode"
- default IPV6
- select XFRM
- ---help---
- Support for IPsec transport mode.
-
- If unsure, say Y.
-
-config INET6_XFRM_MODE_TUNNEL
- tristate "IPv6: IPsec tunnel mode"
- default IPV6
- select XFRM
- ---help---
- Support for IPsec tunnel mode.
-
- If unsure, say Y.
-
-config INET6_XFRM_MODE_BEET
- tristate "IPv6: IPsec BEET mode"
- default IPV6
- select XFRM
- ---help---
- Support for IPsec BEET mode.
-
- If unsure, say Y.
-
-config INET6_XFRM_MODE_ROUTEOPTIMIZATION
- tristate "IPv6: MIPv6 route optimization mode"
- select XFRM
- ---help---
- Support for MIPv6 route optimization mode.
-
config IPV6_VTI
-tristate "Virtual (secure) IPv6: tunneling"
+ tristate "Virtual (secure) IPv6: tunneling"
select IPV6_TUNNEL
select NET_IP_TUNNEL
- depends on INET6_XFRM_MODE_TUNNEL
- ---help---
+ select XFRM
+ help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
encapsulating protocol. This can be used with xfrm mode tunnel to give
@@ -186,7 +169,7 @@ config IPV6_SIT
select NET_IP_TUNNEL
select IPV6_NDISC_NODETYPE
default y
- ---help---
+ help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
encapsulating protocol. This driver implements encapsulation of IPv6
@@ -199,7 +182,7 @@ config IPV6_SIT_6RD
bool "IPv6: IPv6 Rapid Deployment (6RD)"
depends on IPV6_SIT
default n
- ---help---
+ help
IPv6 Rapid Deployment (6rd; draft-ietf-softwire-ipv6-6rd) builds upon
mechanisms of 6to4 (RFC3056) to enable a service provider to rapidly
deploy IPv6 unicast service to IPv4 sites to which it provides
@@ -222,7 +205,7 @@ config IPV6_TUNNEL
select INET6_TUNNEL
select DST_CACHE
select GRO_CELLS
- ---help---
+ help
Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in
RFC 2473.
@@ -233,7 +216,7 @@ config IPV6_GRE
select IPV6_TUNNEL
select NET_IP_TUNNEL
depends on NET_IPGRE_DEMUX
- ---help---
+ help
Tunneling means encapsulating data of one protocol type within
another protocol and sending it over a channel that understands the
encapsulating protocol. This particular tunneling driver implements
@@ -258,13 +241,13 @@ config IPV6_FOU_TUNNEL
config IPV6_MULTIPLE_TABLES
bool "IPv6: Multiple Routing Tables"
select FIB_RULES
- ---help---
+ help
Support multiple routing tables.
config IPV6_SUBTREES
bool "IPv6: source address based routing"
depends on IPV6_MULTIPLE_TABLES
- ---help---
+ help
Enable routing by source address or prefix.
The destination address is still the primary routing key, so mixing
@@ -279,7 +262,7 @@ config IPV6_MROUTE
bool "IPv6: multicast routing"
depends on IPV6
select IP_MROUTE_COMMON
- ---help---
+ help
Support for IPv6 multicast forwarding.
If unsure, say N.
@@ -300,7 +283,7 @@ config IPV6_MROUTE_MULTIPLE_TABLES
config IPV6_PIMSM_V2
bool "IPv6: PIM-SM version 2 support"
depends on IPV6_MROUTE
- ---help---
+ help
Support for IPv6 PIM multicast routing protocol PIM-SMv2.
If unsure, say N.
@@ -310,7 +293,7 @@ config IPV6_SEG6_LWTUNNEL
select LWTUNNEL
select DST_CACHE
select IPV6_MULTIPLE_TABLES
- ---help---
+ help
Support for encapsulation of packets within an outer IPv6
header and a Segment Routing Header using the lightweight
tunnels mechanism. Also enable support for advanced local
@@ -321,10 +304,10 @@ config IPV6_SEG6_LWTUNNEL
config IPV6_SEG6_HMAC
bool "IPv6: Segment Routing HMAC support"
depends on IPV6
- select CRYPTO_HMAC
- select CRYPTO_SHA1
- select CRYPTO_SHA256
- ---help---
+ select CRYPTO_LIB_SHA1
+ select CRYPTO_LIB_SHA256
+ select CRYPTO_LIB_UTILS
+ help
Support for HMAC signature generation and verification
of SR-enabled packets.
@@ -335,4 +318,26 @@ config IPV6_SEG6_BPF
depends on IPV6_SEG6_LWTUNNEL
depends on IPV6 = y
+config IPV6_RPL_LWTUNNEL
+ bool "IPv6: RPL Source Routing Header support"
+ depends on IPV6
+ select LWTUNNEL
+ select DST_CACHE
+ help
+ Support for RFC6554 RPL Source Routing Header using the lightweight
+ tunnels mechanism.
+
+ If unsure, say N.
+
+config IPV6_IOAM6_LWTUNNEL
+ bool "IPv6: IOAM Pre-allocated Trace insertion support"
+ depends on IPV6
+ select LWTUNNEL
+ select DST_CACHE
+ help
+ Support for the insertion of IOAM Pre-allocated Trace
+ Header using the lightweight tunnels mechanism.
+
+ If unsure, say N.
+
endif # IPV6
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index e0026fa1261b..d283c59df4c1 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -5,16 +5,14 @@
obj-$(CONFIG_IPV6) += ipv6.o
-ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
+ipv6-y := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o \
addrlabel.o \
route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o udplite.o \
raw.o icmp.o mcast.o reassembly.o tcp_ipv6.o ping.o \
exthdrs.o datagram.o ip6_flowlabel.o inet6_connection_sock.o \
- udp_offload.o seg6.o fib6_notifier.o
+ udp_offload.o seg6.o fib6_notifier.o rpl.o ioam6.o
-ipv6-offload := ip6_offload.o tcpv6_offload.o exthdrs_offload.o
-
-ipv6-$(CONFIG_SYSCTL) = sysctl_net_ipv6.o
+ipv6-$(CONFIG_SYSCTL) += sysctl_net_ipv6.o
ipv6-$(CONFIG_IPV6_MROUTE) += ip6mr.o
ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \
@@ -26,8 +24,8 @@ ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o
ipv6-$(CONFIG_NETLABEL) += calipso.o
ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o seg6_local.o
ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o
-
-ipv6-objs += $(ipv6-y)
+ipv6-$(CONFIG_IPV6_RPL_LWTUNNEL) += rpl_iptunnel.o
+ipv6-$(CONFIG_IPV6_IOAM6_LWTUNNEL) += ioam6_iptunnel.o
obj-$(CONFIG_INET6_AH) += ah6.o
obj-$(CONFIG_INET6_ESP) += esp6.o
@@ -35,10 +33,6 @@ obj-$(CONFIG_INET6_ESP_OFFLOAD) += esp6_offload.o
obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o
obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o
-obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o
-obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o
-obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o
-obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o
obj-$(CONFIG_IPV6_MIP6) += mip6.o
obj-$(CONFIG_IPV6_ILA) += ila/
obj-$(CONFIG_NETFILTER) += netfilter/
@@ -50,11 +44,13 @@ obj-$(CONFIG_IPV6_GRE) += ip6_gre.o
obj-$(CONFIG_IPV6_FOU) += fou6.o
obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o
-obj-$(CONFIG_INET) += output_core.o protocol.o $(ipv6-offload)
+obj-$(CONFIG_INET) += output_core.o protocol.o \
+ ip6_offload.o tcpv6_offload.o exthdrs_offload.o
obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
ifneq ($(CONFIG_IPV6),)
obj-$(CONFIG_NET_UDP_TUNNEL) += ip6_udp_tunnel.o
obj-y += mcast_snoop.o
+obj-$(CONFIG_TCP_AO) += tcp_ao.o
endif
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index c63ccce6425f..b66217d1b2f8 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 Address [auto]configuration
* Linux INET6 implementation
@@ -5,11 +6,6 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/*
@@ -67,6 +63,7 @@
#include <linux/string.h>
#include <linux/hash.h>
+#include <net/ip_tunnels.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -83,18 +80,18 @@
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/l3mdev.h>
+#include <net/netdev_lock.h>
#include <linux/if_tunnel.h>
#include <linux/rtnetlink.h>
#include <linux/netconf.h>
#include <linux/random.h>
#include <linux/uaccess.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/export.h>
-
-#define INFINITY_LIFE_TIME 0xFFFFFFFF
+#include <linux/ioam6.h>
#define IPV6_MAX_STRLEN \
sizeof("ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255")
@@ -107,7 +104,7 @@ static inline u32 cstamp_delta(unsigned long cstamp)
static inline s32 rfc3315_s14_backoff_init(s32 irt)
{
/* multiply 'initial retransmission time' by 0.9 .. 1.1 */
- u64 tmp = (900000 + prandom_u32() % 200001) * (u64)irt;
+ u64 tmp = get_random_u32_inclusive(900000, 1100000) * (u64)irt;
do_div(tmp, 1000000);
return (s32)tmp;
}
@@ -115,11 +112,11 @@ static inline s32 rfc3315_s14_backoff_init(s32 irt)
static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt)
{
/* multiply 'retransmission timeout' by 1.9 .. 2.1 */
- u64 tmp = (1900000 + prandom_u32() % 200001) * (u64)rt;
+ u64 tmp = get_random_u32_inclusive(1900000, 2100000) * (u64)rt;
do_div(tmp, 1000000);
if ((s32)tmp > mrt) {
/* multiply 'maximum retransmission time' by 0.9 .. 1.1 */
- tmp = (900000 + prandom_u32() % 200001) * (u64)mrt;
+ tmp = get_random_u32_inclusive(900000, 1100000) * (u64)mrt;
do_div(tmp, 1000000);
}
return (s32)tmp;
@@ -139,8 +136,7 @@ static inline void addrconf_sysctl_unregister(struct inet6_dev *idev)
}
#endif
-static void ipv6_regen_rndid(struct inet6_dev *idev);
-static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr);
+static void ipv6_gen_rnd_iid(struct in6_addr *addr);
static int ipv6_generate_eui64(u8 *eui, struct net_device *dev);
static int ipv6_count_addresses(const struct inet6_dev *idev);
@@ -150,36 +146,30 @@ static int ipv6_generate_stable_address(struct in6_addr *addr,
#define IN6_ADDR_HSIZE_SHIFT 8
#define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT)
-/*
- * Configured unicast address hash table
- */
-static struct hlist_head inet6_addr_lst[IN6_ADDR_HSIZE];
-static DEFINE_SPINLOCK(addrconf_hash_lock);
-static void addrconf_verify(void);
-static void addrconf_verify_rtnl(void);
-static void addrconf_verify_work(struct work_struct *);
+static void addrconf_verify(struct net *net);
+static void addrconf_verify_rtnl(struct net *net);
static struct workqueue_struct *addrconf_wq;
-static DECLARE_DELAYED_WORK(addr_chk_work, addrconf_verify_work);
static void addrconf_join_anycast(struct inet6_ifaddr *ifp);
static void addrconf_leave_anycast(struct inet6_ifaddr *ifp);
static void addrconf_type_change(struct net_device *dev,
unsigned long event);
-static int addrconf_ifdown(struct net_device *dev, int how);
+static int addrconf_ifdown(struct net_device *dev, bool unregister);
static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
int plen,
const struct net_device *dev,
- u32 flags, u32 noflags);
+ u32 flags, u32 noflags,
+ bool no_gw);
static void addrconf_dad_start(struct inet6_ifaddr *ifp);
static void addrconf_dad_work(struct work_struct *w);
static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
bool send_na);
-static void addrconf_dad_run(struct inet6_dev *idev);
+static void addrconf_dad_run(struct inet6_dev *idev, bool restart);
static void addrconf_rs_timer(struct timer_list *t);
static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa);
@@ -205,12 +195,15 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.use_tempaddr = 0,
.temp_valid_lft = TEMP_VALID_LIFETIME,
.temp_prefered_lft = TEMP_PREFERRED_LIFETIME,
+ .regen_min_advance = REGEN_MIN_ADVANCE,
.regen_max_retry = REGEN_MAX_RETRY,
.max_desync_factor = MAX_DESYNC_FACTOR,
.max_addresses = IPV6_MAX_ADDRESSES,
.accept_ra_defrtr = 1,
+ .ra_defrtr_metric = IP6_RT_PRIO_USER,
.accept_ra_from_local = 0,
.accept_ra_min_hop_limit= 1,
+ .accept_ra_min_lft = 0,
.accept_ra_pinfo = 1,
#ifdef CONFIG_IPV6_ROUTER_PREF
.accept_ra_rtr_pref = 1,
@@ -239,6 +232,14 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
.enhanced_dad = 1,
.addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64,
.disable_policy = 0,
+ .rpl_seg_enabled = 0,
+ .ioam6_enabled = 0,
+ .ioam6_id = IOAM6_DEFAULT_IF_ID,
+ .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE,
+ .ndisc_evict_nocarrier = 1,
+ .ra_honor_pio_life = 0,
+ .ra_honor_pio_pflag = 0,
+ .force_forwarding = 0,
};
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -259,12 +260,15 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.use_tempaddr = 0,
.temp_valid_lft = TEMP_VALID_LIFETIME,
.temp_prefered_lft = TEMP_PREFERRED_LIFETIME,
+ .regen_min_advance = REGEN_MIN_ADVANCE,
.regen_max_retry = REGEN_MAX_RETRY,
.max_desync_factor = MAX_DESYNC_FACTOR,
.max_addresses = IPV6_MAX_ADDRESSES,
.accept_ra_defrtr = 1,
+ .ra_defrtr_metric = IP6_RT_PRIO_USER,
.accept_ra_from_local = 0,
.accept_ra_min_hop_limit= 1,
+ .accept_ra_min_lft = 0,
.accept_ra_pinfo = 1,
#ifdef CONFIG_IPV6_ROUTER_PREF
.accept_ra_rtr_pref = 1,
@@ -293,6 +297,14 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
.enhanced_dad = 1,
.addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64,
.disable_policy = 0,
+ .rpl_seg_enabled = 0,
+ .ioam6_enabled = 0,
+ .ioam6_id = IOAM6_DEFAULT_IF_ID,
+ .ioam6_id_wide = IOAM6_DEFAULT_IF_ID_WIDE,
+ .ndisc_evict_nocarrier = 1,
+ .ra_honor_pio_life = 0,
+ .ra_honor_pio_pflag = 0,
+ .force_forwarding = 0,
};
/* Check if link is ready: is it up and is a valid qdisc available */
@@ -303,7 +315,7 @@ static inline bool addrconf_link_ready(const struct net_device *dev)
static void addrconf_del_rs_timer(struct inet6_dev *idev)
{
- if (del_timer(&idev->rs_timer))
+ if (timer_delete(&idev->rs_timer))
__in6_dev_put(idev);
}
@@ -316,9 +328,8 @@ static void addrconf_del_dad_work(struct inet6_ifaddr *ifp)
static void addrconf_mod_rs_timer(struct inet6_dev *idev,
unsigned long when)
{
- if (!timer_pending(&idev->rs_timer))
+ if (!mod_timer(&idev->rs_timer, jiffies + when))
in6_dev_hold(idev);
- mod_timer(&idev->rs_timer, jiffies + when);
}
static void addrconf_mod_dad_work(struct inet6_ifaddr *ifp,
@@ -333,7 +344,7 @@ static int snmp6_alloc_dev(struct inet6_dev *idev)
{
int i;
- idev->stats.ipv6 = alloc_percpu(struct ipstats_mib);
+ idev->stats.ipv6 = alloc_percpu_gfp(struct ipstats_mib, GFP_KERNEL_ACCOUNT);
if (!idev->stats.ipv6)
goto err_ip;
@@ -349,7 +360,7 @@ static int snmp6_alloc_dev(struct inet6_dev *idev)
if (!idev->stats.icmpv6dev)
goto err_icmp;
idev->stats.icmpv6msgdev = kzalloc(sizeof(struct icmpv6msg_mib_device),
- GFP_KERNEL);
+ GFP_KERNEL_ACCOUNT);
if (!idev->stats.icmpv6msgdev)
goto err_icmpmsg;
@@ -369,11 +380,12 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
int err = -ENOMEM;
ASSERT_RTNL();
+ netdev_ops_assert_locked(dev);
- if (dev->mtu < IPV6_MIN_MTU)
+ if (dev->mtu < IPV6_MIN_MTU && dev != blackhole_netdev)
return ERR_PTR(-EINVAL);
- ndev = kzalloc(sizeof(struct inet6_dev), GFP_KERNEL);
+ ndev = kzalloc(sizeof(*ndev), GFP_KERNEL_ACCOUNT);
if (!ndev)
return ERR_PTR(err);
@@ -387,31 +399,33 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
ndev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
ndev->cnf.mtu6 = dev->mtu;
+ ndev->ra_mtu = 0;
ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
if (!ndev->nd_parms) {
kfree(ndev);
return ERR_PTR(err);
}
if (ndev->cnf.forwarding)
- dev_disable_lro(dev);
+ netif_disable_lro(dev);
/* We refer to the device */
- dev_hold(dev);
+ netdev_hold(dev, &ndev->dev_tracker, GFP_KERNEL);
if (snmp6_alloc_dev(ndev) < 0) {
netdev_dbg(dev, "%s: cannot allocate memory for statistics\n",
__func__);
neigh_parms_release(&nd_tbl, ndev->nd_parms);
- dev_put(dev);
+ netdev_put(dev, &ndev->dev_tracker);
kfree(ndev);
return ERR_PTR(err);
}
- if (snmp6_register_dev(ndev) < 0) {
- netdev_dbg(dev, "%s: cannot create /proc/net/dev_snmp6/%s\n",
- __func__, dev->name);
- goto err_release;
+ if (dev != blackhole_netdev) {
+ if (snmp6_register_dev(ndev) < 0) {
+ netdev_dbg(dev, "%s: cannot create /proc/net/dev_snmp6/%s\n",
+ __func__, dev->name);
+ goto err_release;
+ }
}
-
/* One reference from device. */
refcount_set(&ndev->refcnt, 1);
@@ -433,8 +447,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
dev->type == ARPHRD_SIT ||
dev->type == ARPHRD_NONE) {
ndev->cnf.use_tempaddr = -1;
- } else
- ipv6_regen_rndid(ndev);
+ }
ndev->token = in6addr_any;
@@ -443,25 +456,28 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
ipv6_mc_init_dev(ndev);
ndev->tstamp = jiffies;
- err = addrconf_sysctl_register(ndev);
- if (err) {
- ipv6_mc_destroy_dev(ndev);
- snmp6_unregister_dev(ndev);
- goto err_release;
+ if (dev != blackhole_netdev) {
+ err = addrconf_sysctl_register(ndev);
+ if (err) {
+ ipv6_mc_destroy_dev(ndev);
+ snmp6_unregister_dev(ndev);
+ goto err_release;
+ }
}
/* protected by rtnl_lock */
rcu_assign_pointer(dev->ip6_ptr, ndev);
- /* Join interface-local all-node multicast group */
- ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes);
-
- /* Join all-node multicast group */
- ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);
+ if (dev != blackhole_netdev) {
+ /* Join interface-local all-node multicast group */
+ ipv6_dev_mc_inc(dev, &in6addr_interfacelocal_allnodes);
- /* Join all-router multicast group if forwarding is set */
- if (ndev->cnf.forwarding && (dev->flags & IFF_MULTICAST))
- ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
+ /* Join all-node multicast group */
+ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes);
+ /* Join all-router multicast group if forwarding is set */
+ if (ndev->cnf.forwarding && (dev->flags & IFF_MULTICAST))
+ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters);
+ }
return ndev;
err_release:
@@ -481,7 +497,7 @@ static struct inet6_dev *ipv6_find_idev(struct net_device *dev)
if (!idev) {
idev = ipv6_add_dev(dev);
if (IS_ERR(idev))
- return NULL;
+ return idev;
}
if (dev->flags&IFF_UP)
@@ -540,21 +556,23 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
goto out;
if ((all || type == NETCONFA_FORWARDING) &&
- nla_put_s32(skb, NETCONFA_FORWARDING, devconf->forwarding) < 0)
+ nla_put_s32(skb, NETCONFA_FORWARDING,
+ READ_ONCE(devconf->forwarding)) < 0)
goto nla_put_failure;
#ifdef CONFIG_IPV6_MROUTE
if ((all || type == NETCONFA_MC_FORWARDING) &&
nla_put_s32(skb, NETCONFA_MC_FORWARDING,
- devconf->mc_forwarding) < 0)
+ atomic_read(&devconf->mc_forwarding)) < 0)
goto nla_put_failure;
#endif
if ((all || type == NETCONFA_PROXY_NEIGH) &&
- nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0)
+ nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
+ READ_ONCE(devconf->proxy_ndp)) < 0)
goto nla_put_failure;
if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
- devconf->ignore_routes_with_linkdown) < 0)
+ READ_ONCE(devconf->ignore_routes_with_linkdown)) < 0)
goto nla_put_failure;
out:
@@ -597,6 +615,45 @@ static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = {
[NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) },
};
+static int inet6_netconf_valid_get_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ int i, err;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(struct netconfmsg))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf get request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(struct netconfmsg),
+ tb, NETCONFA_MAX,
+ devconf_ipv6_policy, extack);
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct netconfmsg),
+ tb, NETCONFA_MAX,
+ devconf_ipv6_policy, extack);
+ if (err)
+ return err;
+
+ for (i = 0; i <= NETCONFA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case NETCONFA_IFINDEX:
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in netconf get request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
@@ -605,14 +662,12 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
struct nlattr *tb[NETCONFA_MAX+1];
struct inet6_dev *in6_dev = NULL;
struct net_device *dev = NULL;
- struct netconfmsg *ncm;
struct sk_buff *skb;
struct ipv6_devconf *devconf;
int ifindex;
int err;
- err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX,
- devconf_ipv6_policy, extack);
+ err = inet6_netconf_valid_get_req(in_skb, nlh, tb, extack);
if (err < 0)
return err;
@@ -658,80 +713,93 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
errout:
if (in6_dev)
in6_dev_put(in6_dev);
- if (dev)
- dev_put(dev);
+ dev_put(dev);
return err;
}
+/* Combine dev_addr_genid and dev_base_seq to detect changes.
+ */
+static u32 inet6_base_seq(const struct net *net)
+{
+ u32 res = atomic_read(&net->ipv6.dev_addr_genid) +
+ READ_ONCE(net->dev_base_seq);
+
+ /* Must not return 0 (see nl_dump_check_consistent()).
+ * Chose a value far away from 0.
+ */
+ if (!res)
+ res = 0x80000000;
+ return res;
+}
+
static int inet6_netconf_dump_devconf(struct sk_buff *skb,
struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
- int h, s_h;
- int idx, s_idx;
+ struct {
+ unsigned long ifindex;
+ unsigned int all_default;
+ } *ctx = (void *)cb->ctx;
struct net_device *dev;
struct inet6_dev *idev;
- struct hlist_head *head;
-
- s_h = cb->args[0];
- s_idx = idx = cb->args[1];
-
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &net->dev_index_head[h];
- rcu_read_lock();
- cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^
- net->dev_base_seq;
- hlist_for_each_entry_rcu(dev, head, index_hlist) {
- if (idx < s_idx)
- goto cont;
- idev = __in6_dev_get(dev);
- if (!idev)
- goto cont;
-
- if (inet6_netconf_fill_devconf(skb, dev->ifindex,
- &idev->cnf,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNETCONF,
- NLM_F_MULTI,
- NETCONFA_ALL) < 0) {
- rcu_read_unlock();
- goto done;
- }
- nl_dump_check_consistent(cb, nlmsg_hdr(skb));
-cont:
- idx++;
+ int err = 0;
+
+ if (cb->strict_check) {
+ struct netlink_ext_ack *extack = cb->extack;
+ struct netconfmsg *ncm;
+
+ if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ncm))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for netconf dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ncm))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid data after header in netconf dump request");
+ return -EINVAL;
}
- rcu_read_unlock();
}
- if (h == NETDEV_HASHENTRIES) {
- if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
- net->ipv6.devconf_all,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNETCONF, NLM_F_MULTI,
- NETCONFA_ALL) < 0)
+
+ rcu_read_lock();
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ continue;
+ err = inet6_netconf_fill_devconf(skb, dev->ifindex,
+ &idev->cnf,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq,
+ RTM_NEWNETCONF,
+ NLM_F_MULTI,
+ NETCONFA_ALL);
+ if (err < 0)
goto done;
- else
- h++;
- }
- if (h == NETDEV_HASHENTRIES + 1) {
- if (inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
- net->ipv6.devconf_dflt,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNETCONF, NLM_F_MULTI,
- NETCONFA_ALL) < 0)
+ }
+ if (ctx->all_default == 0) {
+ err = inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ NETCONFA_ALL);
+ if (err < 0)
goto done;
- else
- h++;
+ ctx->all_default++;
+ }
+ if (ctx->all_default == 1) {
+ err = inet6_netconf_fill_devconf(skb, NETCONFA_IFINDEX_DEFAULT,
+ net->ipv6.devconf_dflt,
+ NETLINK_CB(cb->skb).portid,
+ nlh->nlmsg_seq,
+ RTM_NEWNETCONF, NLM_F_MULTI,
+ NETCONFA_ALL);
+ if (err < 0)
+ goto done;
+ ctx->all_default++;
}
done:
- cb->args[0] = h;
- cb->args[1] = idx;
-
- return skb->len;
+ rcu_read_unlock();
+ return err;
}
#ifdef CONFIG_SYSCTL
@@ -739,6 +807,7 @@ static void dev_forward_change(struct inet6_dev *idev)
{
struct net_device *dev;
struct inet6_ifaddr *ifa;
+ LIST_HEAD(tmp_addr_list);
if (!idev)
return;
@@ -757,14 +826,24 @@ static void dev_forward_change(struct inet6_dev *idev)
}
}
+ read_lock_bh(&idev->lock);
list_for_each_entry(ifa, &idev->addr_list, if_list) {
if (ifa->flags&IFA_F_TENTATIVE)
continue;
+ list_add_tail(&ifa->if_list_aux, &tmp_addr_list);
+ }
+ read_unlock_bh(&idev->lock);
+
+ while (!list_empty(&tmp_addr_list)) {
+ ifa = list_first_entry(&tmp_addr_list,
+ struct inet6_ifaddr, if_list_aux);
+ list_del(&ifa->if_list_aux);
if (idev->cnf.forwarding)
addrconf_join_anycast(ifa);
else
addrconf_leave_anycast(ifa);
}
+
inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
NETCONFA_FORWARDING,
dev->ifindex, &idev->cnf);
@@ -777,27 +856,30 @@ static void addrconf_forward_change(struct net *net, __s32 newf)
struct inet6_dev *idev;
for_each_netdev(net, dev) {
- idev = __in6_dev_get(dev);
+ idev = __in6_dev_get_rtnl_net(dev);
if (idev) {
int changed = (!idev->cnf.forwarding) ^ (!newf);
- idev->cnf.forwarding = newf;
+ /* Disabling all.forwarding sets 0 to force_forwarding for all interfaces */
+ if (newf == 0)
+ WRITE_ONCE(idev->cnf.force_forwarding, 0);
+
+ WRITE_ONCE(idev->cnf.forwarding, newf);
if (changed)
dev_forward_change(idev);
}
}
}
-static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf)
+static int addrconf_fixup_forwarding(const struct ctl_table *table, int *p, int newf)
{
- struct net *net;
+ struct net *net = (struct net *)table->extra2;
int old;
- if (!rtnl_trylock())
+ if (!rtnl_net_trylock(net))
return restart_syscall();
- net = (struct net *)table->extra2;
old = *p;
- *p = newf;
+ WRITE_ONCE(*p, newf);
if (p == &net->ipv6.devconf_dflt->forwarding) {
if ((!newf) ^ (!old))
@@ -805,14 +887,14 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf)
NETCONFA_FORWARDING,
NETCONFA_IFINDEX_DEFAULT,
net->ipv6.devconf_dflt);
- rtnl_unlock();
+ rtnl_net_unlock(net);
return 0;
}
if (p == &net->ipv6.devconf_all->forwarding) {
int old_dflt = net->ipv6.devconf_dflt->forwarding;
- net->ipv6.devconf_dflt->forwarding = newf;
+ WRITE_ONCE(net->ipv6.devconf_dflt->forwarding, newf);
if ((!newf) ^ (!old_dflt))
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
NETCONFA_FORWARDING,
@@ -827,7 +909,7 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf)
net->ipv6.devconf_all);
} else if ((!newf) ^ (!old))
dev_forward_change((struct inet6_dev *)table->extra1);
- rtnl_unlock();
+ rtnl_net_unlock(net);
if (newf)
rt6_purge_dflt_routers(net);
@@ -840,11 +922,11 @@ static void addrconf_linkdown_change(struct net *net, __s32 newf)
struct inet6_dev *idev;
for_each_netdev(net, dev) {
- idev = __in6_dev_get(dev);
+ idev = __in6_dev_get_rtnl_net(dev);
if (idev) {
int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf);
- idev->cnf.ignore_routes_with_linkdown = newf;
+ WRITE_ONCE(idev->cnf.ignore_routes_with_linkdown, newf);
if (changed)
inet6_netconf_notify_devconf(dev_net(dev),
RTM_NEWNETCONF,
@@ -855,17 +937,16 @@ static void addrconf_linkdown_change(struct net *net, __s32 newf)
}
}
-static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf)
+static int addrconf_fixup_linkdown(const struct ctl_table *table, int *p, int newf)
{
- struct net *net;
+ struct net *net = (struct net *)table->extra2;
int old;
- if (!rtnl_trylock())
+ if (!rtnl_net_trylock(net))
return restart_syscall();
- net = (struct net *)table->extra2;
old = *p;
- *p = newf;
+ WRITE_ONCE(*p, newf);
if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) {
if ((!newf) ^ (!old))
@@ -874,12 +955,12 @@ static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf)
NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
NETCONFA_IFINDEX_DEFAULT,
net->ipv6.devconf_dflt);
- rtnl_unlock();
+ rtnl_net_unlock(net);
return 0;
}
if (p == &net->ipv6.devconf_all->ignore_routes_with_linkdown) {
- net->ipv6.devconf_dflt->ignore_routes_with_linkdown = newf;
+ WRITE_ONCE(net->ipv6.devconf_dflt->ignore_routes_with_linkdown, newf);
addrconf_linkdown_change(net, newf);
if ((!newf) ^ (!old))
inet6_netconf_notify_devconf(net,
@@ -888,7 +969,8 @@ static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf)
NETCONFA_IFINDEX_ALL,
net->ipv6.devconf_all);
}
- rtnl_unlock();
+
+ rtnl_net_unlock(net);
return 1;
}
@@ -940,7 +1022,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
static u32 inet6_addr_hash(const struct net *net, const struct in6_addr *addr)
{
- u32 val = ipv6_addr_hash(addr) ^ net_hash_mix(net);
+ u32 val = __ipv6_addr_jhash(addr, net_hash_mix(net));
return hash_32(val, IN6_ADDR_HSIZE_SHIFT);
}
@@ -950,9 +1032,7 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
{
struct inet6_ifaddr *ifp;
- hlist_for_each_entry(ifp, &inet6_addr_lst[hash], addr_lst) {
- if (!net_eq(dev_net(ifp->idev->dev), net))
- continue;
+ hlist_for_each_entry(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
if (ipv6_addr_equal(&ifp->addr, addr)) {
if (!dev || ifp->idev->dev == dev)
return true;
@@ -963,20 +1043,21 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr,
static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa)
{
- unsigned int hash = inet6_addr_hash(dev_net(dev), &ifa->addr);
+ struct net *net = dev_net(dev);
+ unsigned int hash = inet6_addr_hash(net, &ifa->addr);
int err = 0;
- spin_lock(&addrconf_hash_lock);
+ spin_lock_bh(&net->ipv6.addrconf_hash_lock);
/* Ignore adding duplicate addresses on an interface */
- if (ipv6_chk_same_addr(dev_net(dev), &ifa->addr, dev, hash)) {
+ if (ipv6_chk_same_addr(net, &ifa->addr, dev, hash)) {
netdev_dbg(dev, "ipv6_add_addr: already assigned\n");
err = -EEXIST;
} else {
- hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]);
+ hlist_add_head_rcu(&ifa->addr_lst, &net->ipv6.inet6_addr_lst[hash]);
}
- spin_unlock(&addrconf_hash_lock);
+ spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
return err;
}
@@ -994,18 +1075,28 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
struct fib6_info *f6i = NULL;
int err = 0;
- if (addr_type == IPV6_ADDR_ANY ||
- addr_type & IPV6_ADDR_MULTICAST ||
- (!(idev->dev->flags & IFF_LOOPBACK) &&
- addr_type & IPV6_ADDR_LOOPBACK))
+ if (addr_type == IPV6_ADDR_ANY) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid address");
+ return ERR_PTR(-EADDRNOTAVAIL);
+ } else if (addr_type & IPV6_ADDR_MULTICAST &&
+ !(cfg->ifa_flags & IFA_F_MCAUTOJOIN)) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot assign multicast address without \"IFA_F_MCAUTOJOIN\" flag");
return ERR_PTR(-EADDRNOTAVAIL);
+ } else if (!(idev->dev->flags & IFF_LOOPBACK) &&
+ !netif_is_l3_master(idev->dev) &&
+ addr_type & IPV6_ADDR_LOOPBACK) {
+ NL_SET_ERR_MSG_MOD(extack, "Cannot assign loopback address on this device");
+ return ERR_PTR(-EADDRNOTAVAIL);
+ }
if (idev->dead) {
- err = -ENODEV; /*XXX*/
+ NL_SET_ERR_MSG_MOD(extack, "device is going away");
+ err = -ENODEV;
goto out;
}
if (idev->cnf.disable_ipv6) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device");
err = -EACCES;
goto out;
}
@@ -1026,23 +1117,19 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
goto out;
}
- ifa = kzalloc(sizeof(*ifa), gfp_flags);
+ ifa = kzalloc(sizeof(*ifa), gfp_flags | __GFP_ACCOUNT);
if (!ifa) {
err = -ENOBUFS;
goto out;
}
- f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags);
+ f6i = addrconf_f6i_alloc(net, idev, cfg->pfx, false, gfp_flags, extack);
if (IS_ERR(f6i)) {
err = PTR_ERR(f6i);
f6i = NULL;
goto out;
}
- if (net->ipv6.devconf_all->disable_policy ||
- idev->cnf.disable_policy)
- f6i->dst_nopolicy = true;
-
neigh_parms_data_state_setall(idev->nd_parms);
ifa->addr = *cfg->pfx;
@@ -1056,6 +1143,7 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
ifa->prefix_len = cfg->plen;
ifa->rt_priority = cfg->rt_priority;
ifa->flags = cfg->ifa_flags;
+ ifa->ifa_proto = cfg->ifa_proto;
/* No need to add the TENTATIVE flag for addresses with NODAD */
if (!(cfg->ifa_flags & IFA_F_NODAD))
ifa->flags |= IFA_F_TENTATIVE;
@@ -1072,15 +1160,15 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
/* For caller */
refcount_set(&ifa->refcnt, 1);
- rcu_read_lock_bh();
+ rcu_read_lock();
err = ipv6_add_addr_hash(idev->dev, ifa);
if (err < 0) {
- rcu_read_unlock_bh();
+ rcu_read_unlock();
goto out;
}
- write_lock(&idev->lock);
+ write_lock_bh(&idev->lock);
/* Add to inet6_dev unicast addr list. */
ipv6_link_dev_addr(idev, ifa);
@@ -1091,9 +1179,9 @@ ipv6_add_addr(struct inet6_dev *idev, struct ifa6_config *cfg,
}
in6_ifa_hold(ifa);
- write_unlock(&idev->lock);
+ write_unlock_bh(&idev->lock);
- rcu_read_unlock_bh();
+ rcu_read_unlock();
inet6addr_notifier_call_chain(NETDEV_UP, ifa);
out:
@@ -1148,7 +1236,8 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires)
list_for_each_entry(ifa, &idev->addr_list, if_list) {
if (ifa == ifp)
continue;
- if (!ipv6_prefix_equal(&ifa->addr, &ifp->addr,
+ if (ifa->prefix_len != ifp->prefix_len ||
+ !ipv6_prefix_equal(&ifa->addr, &ifp->addr,
ifp->prefix_len))
continue;
if (ifa->flags & (IFA_F_PERMANENT | IFA_F_NOPREFIXROUTE))
@@ -1173,20 +1262,28 @@ check_cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long *expires)
}
static void
-cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt)
+cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires,
+ bool del_rt, bool del_peer)
{
+ struct fib6_table *table;
struct fib6_info *f6i;
- f6i = addrconf_get_prefix_route(&ifp->addr,
- ifp->prefix_len,
- ifp->idev->dev,
- 0, RTF_GATEWAY | RTF_DEFAULT);
+ f6i = addrconf_get_prefix_route(del_peer ? &ifp->peer_addr : &ifp->addr,
+ ifp->prefix_len,
+ ifp->idev->dev, 0, RTF_DEFAULT, true);
if (f6i) {
if (del_rt)
- ip6_del_rt(dev_net(ifp->idev->dev), f6i);
+ ip6_del_rt(dev_net(ifp->idev->dev), f6i, false);
else {
- if (!(f6i->fib6_flags & RTF_EXPIRES))
+ if (!(f6i->fib6_flags & RTF_EXPIRES)) {
+ table = f6i->fib6_table;
+ spin_lock_bh(&table->tb6_lock);
+
fib6_set_expires(f6i, expires);
+ fib6_add_gc_list(f6i);
+
+ spin_unlock_bh(&table->tb6_lock);
+ }
fib6_info_release(f6i);
}
}
@@ -1197,9 +1294,10 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_r
static void ipv6_del_addr(struct inet6_ifaddr *ifp)
{
- int state;
enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_NOP;
+ struct net *net = dev_net(ifp->idev->dev);
unsigned long expires;
+ int state;
ASSERT_RTNL();
@@ -1211,9 +1309,9 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
if (state == INET6_IFADDR_STATE_DEAD)
goto out;
- spin_lock_bh(&addrconf_hash_lock);
+ spin_lock_bh(&net->ipv6.addrconf_hash_lock);
hlist_del_init_rcu(&ifp->addr_lst);
- spin_unlock_bh(&addrconf_hash_lock);
+ spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
write_lock_bh(&ifp->idev->lock);
@@ -1226,7 +1324,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
__in6_ifa_put(ifp);
}
- if (ifp->flags & IFA_F_PERMANENT && !(ifp->flags & IFA_F_NOPREFIXROUTE))
+ if (!(ifp->flags & IFA_F_NOPREFIXROUTE))
action = check_cleanup_prefix_route(ifp, &expires);
list_del_rcu(&ifp->if_list);
@@ -1242,7 +1340,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp)
if (action != CLEANUP_PREFIX_RT_NOP) {
cleanup_prefix_route(ifp, expires,
- action == CLEANUP_PREFIX_RT_DEL);
+ action == CLEANUP_PREFIX_RT_DEL, false);
}
/* clean up prefsrc entries */
@@ -1251,32 +1349,33 @@ out:
in6_ifa_put(ifp);
}
-static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp,
- struct inet6_ifaddr *ift,
- bool block)
+static unsigned long ipv6_get_regen_advance(const struct inet6_dev *idev)
+{
+ return READ_ONCE(idev->cnf.regen_min_advance) +
+ READ_ONCE(idev->cnf.regen_max_retry) *
+ READ_ONCE(idev->cnf.dad_transmits) *
+ max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ;
+}
+
+static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, bool block)
{
struct inet6_dev *idev = ifp->idev;
- struct in6_addr addr, *tmpaddr;
unsigned long tmp_tstamp, age;
unsigned long regen_advance;
- struct ifa6_config cfg;
- int ret = 0;
unsigned long now = jiffies;
- long max_desync_factor;
+ u32 if_public_preferred_lft;
s32 cnf_temp_preferred_lft;
+ struct inet6_ifaddr *ift;
+ struct ifa6_config cfg;
+ long max_desync_factor;
+ struct in6_addr addr;
+ int ret = 0;
write_lock_bh(&idev->lock);
- if (ift) {
- spin_lock_bh(&ift->lock);
- memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8);
- spin_unlock_bh(&ift->lock);
- tmpaddr = &addr;
- } else {
- tmpaddr = NULL;
- }
+
retry:
in6_dev_hold(idev);
- if (idev->cnf.use_tempaddr <= 0) {
+ if (READ_ONCE(idev->cnf.use_tempaddr) <= 0) {
write_unlock_bh(&idev->lock);
pr_info("%s: use_tempaddr is disabled\n", __func__);
in6_dev_put(idev);
@@ -1284,8 +1383,8 @@ retry:
goto out;
}
spin_lock_bh(&ifp->lock);
- if (ifp->regen_count++ >= idev->cnf.regen_max_retry) {
- idev->cnf.use_tempaddr = -1; /*XXX*/
+ if (ifp->regen_count++ >= READ_ONCE(idev->cnf.regen_max_retry)) {
+ WRITE_ONCE(idev->cnf.use_tempaddr, -1); /*XXX*/
spin_unlock_bh(&ifp->lock);
write_unlock_bh(&idev->lock);
pr_warn("%s: regeneration time exceeded - disabled temporary address support\n",
@@ -1296,20 +1395,18 @@ retry:
}
in6_ifa_hold(ifp);
memcpy(addr.s6_addr, ifp->addr.s6_addr, 8);
- ipv6_try_regen_rndid(idev, tmpaddr);
- memcpy(&addr.s6_addr[8], idev->rndid, 8);
+ ipv6_gen_rnd_iid(&addr);
+
age = (now - ifp->tstamp) / HZ;
- regen_advance = idev->cnf.regen_max_retry *
- idev->cnf.dad_transmits *
- NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ;
+ regen_advance = ipv6_get_regen_advance(idev);
/* recalculate max_desync_factor each time and update
* idev->desync_factor if it's larger
*/
cnf_temp_preferred_lft = READ_ONCE(idev->cnf.temp_prefered_lft);
- max_desync_factor = min_t(__u32,
- idev->cnf.max_desync_factor,
+ max_desync_factor = min_t(long,
+ READ_ONCE(idev->cnf.max_desync_factor),
cnf_temp_preferred_lft - regen_advance);
if (unlikely(idev->desync_factor > max_desync_factor)) {
@@ -1322,11 +1419,14 @@ retry:
}
}
+ if_public_preferred_lft = ifp->prefered_lft;
+
memset(&cfg, 0, sizeof(cfg));
cfg.valid_lft = min_t(__u32, ifp->valid_lft,
- idev->cnf.temp_valid_lft + age);
+ READ_ONCE(idev->cnf.temp_valid_lft) + age);
cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor;
- cfg.preferred_lft = min_t(__u32, ifp->prefered_lft, cfg.preferred_lft);
+ cfg.preferred_lft = min_t(__u32, if_public_preferred_lft, cfg.preferred_lft);
+ cfg.preferred_lft = min_t(__u32, cfg.valid_lft, cfg.preferred_lft);
cfg.plen = ifp->prefix_len;
tmp_tstamp = ifp->tstamp;
@@ -1334,19 +1434,41 @@ retry:
write_unlock_bh(&idev->lock);
- /* A temporary address is created only if this calculated Preferred
- * Lifetime is greater than REGEN_ADVANCE time units. In particular,
- * an implementation must not create a temporary address with a zero
- * Preferred Lifetime.
+ /* From RFC 4941:
+ *
+ * A temporary address is created only if this calculated Preferred
+ * Lifetime is greater than REGEN_ADVANCE time units. In
+ * particular, an implementation must not create a temporary address
+ * with a zero Preferred Lifetime.
+ *
+ * ...
+ *
+ * When creating a temporary address, the lifetime values MUST be
+ * derived from the corresponding prefix as follows:
+ *
+ * ...
+ *
+ * * Its Preferred Lifetime is the lower of the Preferred Lifetime
+ * of the public address or TEMP_PREFERRED_LIFETIME -
+ * DESYNC_FACTOR.
+ *
+ * To comply with the RFC's requirements, clamp the preferred lifetime
+ * to a minimum of regen_advance, unless that would exceed valid_lft or
+ * ifp->prefered_lft.
+ *
* Use age calculation as in addrconf_verify to avoid unnecessary
* temporary addresses being generated.
*/
age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
if (cfg.preferred_lft <= regen_advance + age) {
- in6_ifa_put(ifp);
- in6_dev_put(idev);
- ret = -1;
- goto out;
+ cfg.preferred_lft = regen_advance + age + 1;
+ if (cfg.preferred_lft > cfg.valid_lft ||
+ cfg.preferred_lft > if_public_preferred_lft) {
+ in6_ifa_put(ifp);
+ in6_dev_put(idev);
+ ret = -1;
+ goto out;
+ }
}
cfg.ifa_flags = IFA_F_TEMPORARY;
@@ -1362,7 +1484,6 @@ retry:
in6_ifa_put(ifp);
in6_dev_put(idev);
pr_info("%s: retry temporary address regeneration\n", __func__);
- tmpaddr = &addr;
write_lock_bh(&idev->lock);
goto retry;
}
@@ -1426,15 +1547,17 @@ static inline int ipv6_saddr_preferred(int type)
return 0;
}
-static bool ipv6_use_optimistic_addr(struct net *net,
- struct inet6_dev *idev)
+static bool ipv6_use_optimistic_addr(const struct net *net,
+ const struct inet6_dev *idev)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
if (!idev)
return false;
- if (!net->ipv6.devconf_all->optimistic_dad && !idev->cnf.optimistic_dad)
+ if (!READ_ONCE(net->ipv6.devconf_all->optimistic_dad) &&
+ !READ_ONCE(idev->cnf.optimistic_dad))
return false;
- if (!net->ipv6.devconf_all->use_optimistic && !idev->cnf.use_optimistic)
+ if (!READ_ONCE(net->ipv6.devconf_all->use_optimistic) &&
+ !READ_ONCE(idev->cnf.use_optimistic))
return false;
return true;
@@ -1443,13 +1566,14 @@ static bool ipv6_use_optimistic_addr(struct net *net,
#endif
}
-static bool ipv6_allow_optimistic_dad(struct net *net,
- struct inet6_dev *idev)
+static bool ipv6_allow_optimistic_dad(const struct net *net,
+ const struct inet6_dev *idev)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
if (!idev)
return false;
- if (!net->ipv6.devconf_all->optimistic_dad && !idev->cnf.optimistic_dad)
+ if (!READ_ONCE(net->ipv6.devconf_all->optimistic_dad) &&
+ !READ_ONCE(idev->cnf.optimistic_dad))
return false;
return true;
@@ -1555,7 +1679,7 @@ static int ipv6_get_saddr_eval(struct net *net,
*/
int preftmp = dst->prefs & (IPV6_PREFER_SRC_PUBLIC|IPV6_PREFER_SRC_TMP) ?
!!(dst->prefs & IPV6_PREFER_SRC_TMP) :
- score->ifa->idev->cnf.use_tempaddr >= 2;
+ READ_ONCE(score->ifa->idev->cnf.use_tempaddr) >= 2;
ret = (!(score->ifa->flags & IFA_F_TEMPORARY)) ^ preftmp;
break;
}
@@ -1731,7 +1855,7 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
idev = __in6_dev_get(dst_dev);
if ((dst_type & IPV6_ADDR_MULTICAST) ||
dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL ||
- (idev && idev->cnf.use_oif_addrs_only)) {
+ (idev && READ_ONCE(idev->cnf.use_oif_addrs_only))) {
use_oif_addr = true;
}
}
@@ -1755,7 +1879,8 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev,
master, &dst,
scores, hiscore_idx);
- if (scores[hiscore_idx].ifa)
+ if (scores[hiscore_idx].ifa &&
+ scores[hiscore_idx].scopedist >= 0)
goto out;
}
@@ -1784,8 +1909,8 @@ out:
}
EXPORT_SYMBOL(ipv6_dev_get_saddr);
-int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
- u32 banned_flags)
+static int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
+ u32 banned_flags)
{
struct inet6_ifaddr *ifp;
int err = -EADDRNOTAVAIL;
@@ -1849,12 +1974,13 @@ EXPORT_SYMBOL(ipv6_chk_addr);
* 2. does the address exist on the specific device
* (skip_dev_check = false)
*/
-int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
- const struct net_device *dev, bool skip_dev_check,
- int strict, u32 banned_flags)
+static struct net_device *
+__ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
+ const struct net_device *dev, bool skip_dev_check,
+ int strict, u32 banned_flags)
{
unsigned int hash = inet6_addr_hash(net, addr);
- const struct net_device *l3mdev;
+ struct net_device *l3mdev, *ndev;
struct inet6_ifaddr *ifp;
u32 ifp_flags;
@@ -1864,11 +1990,10 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
if (skip_dev_check)
dev = NULL;
- hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
- if (!net_eq(dev_net(ifp->idev->dev), net))
- continue;
+ hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
+ ndev = ifp->idev->dev;
- if (l3mdev_master_dev_rcu(ifp->idev->dev) != l3mdev)
+ if (l3mdev_master_dev_rcu(ndev) != l3mdev)
continue;
/* Decouple optimistic from tentative for evaluation here.
@@ -1879,15 +2004,23 @@ int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
: ifp->flags;
if (ipv6_addr_equal(&ifp->addr, addr) &&
!(ifp_flags&banned_flags) &&
- (!dev || ifp->idev->dev == dev ||
+ (!dev || ndev == dev ||
!(ifp->scope&(IFA_LINK|IFA_HOST) || strict))) {
rcu_read_unlock();
- return 1;
+ return ndev;
}
}
rcu_read_unlock();
- return 0;
+ return NULL;
+}
+
+int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr,
+ const struct net_device *dev, bool skip_dev_check,
+ int strict, u32 banned_flags)
+{
+ return __ipv6_chk_addr_and_flags(net, addr, dev, skip_dev_check,
+ strict, banned_flags) ? 1 : 0;
}
EXPORT_SYMBOL(ipv6_chk_addr_and_flags);
@@ -1939,6 +2072,22 @@ int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev)
}
EXPORT_SYMBOL(ipv6_chk_prefix);
+/**
+ * ipv6_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @dev: used to find the L3 domain of interest
+ *
+ * The caller should be protected by RCU, or RTNL.
+ */
+struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr,
+ struct net_device *dev)
+{
+ return __ipv6_chk_addr_and_flags(net, addr, dev, !dev, 1,
+ IFA_F_TENTATIVE);
+}
+EXPORT_SYMBOL(ipv6_dev_find);
+
struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr,
struct net_device *dev, int strict)
{
@@ -1946,15 +2095,14 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add
struct inet6_ifaddr *ifp, *result = NULL;
rcu_read_lock();
- hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
- if (!net_eq(dev_net(ifp->idev->dev), net))
- continue;
+ hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
if (ipv6_addr_equal(&ifp->addr, addr)) {
if (!dev || ifp->idev->dev == dev ||
!(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) {
- result = ifp;
- in6_ifa_hold(ifp);
- break;
+ if (in6_ifa_hold_safe(ifp)) {
+ result = ifp;
+ break;
+ }
}
}
}
@@ -1977,7 +2125,7 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed)
if (ifpub) {
in6_ifa_hold(ifpub);
spin_unlock_bh(&ifp->lock);
- ipv6_create_tempaddr(ifpub, ifp, true);
+ ipv6_create_tempaddr(ifpub, true);
in6_ifa_put(ifpub);
} else {
spin_unlock_bh(&ifp->lock);
@@ -2015,7 +2163,8 @@ static int addrconf_dad_end(struct inet6_ifaddr *ifp)
void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
{
struct inet6_dev *idev = ifp->idev;
- struct net *net = dev_net(ifp->idev->dev);
+ struct net *net = dev_net(idev->dev);
+ int max_addresses;
if (addrconf_dad_end(ifp)) {
in6_ifa_put(ifp);
@@ -2053,9 +2202,9 @@ void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp)
spin_unlock_bh(&ifp->lock);
- if (idev->cnf.max_addresses &&
- ipv6_count_addresses(idev) >=
- idev->cnf.max_addresses)
+ max_addresses = READ_ONCE(idev->cnf.max_addresses);
+ if (max_addresses &&
+ ipv6_count_addresses(idev) >= max_addresses)
goto lock_errdad;
net_info_ratelimited("%s: generating new stable privacy address because of DAD conflict\n",
@@ -2085,32 +2234,29 @@ errdad:
in6_ifa_put(ifp);
}
-/* Join to solicited addr multicast group.
- * caller must hold RTNL */
+/* Join to solicited addr multicast group. */
void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr)
{
struct in6_addr maddr;
- if (dev->flags&(IFF_LOOPBACK|IFF_NOARP))
+ if (READ_ONCE(dev->flags) & (IFF_LOOPBACK | IFF_NOARP))
return;
addrconf_addr_solict_mult(addr, &maddr);
ipv6_dev_mc_inc(dev, &maddr);
}
-/* caller must hold RTNL */
void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr)
{
struct in6_addr maddr;
- if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP))
+ if (READ_ONCE(idev->dev->flags) & (IFF_LOOPBACK | IFF_NOARP))
return;
addrconf_addr_solict_mult(addr, &maddr);
__ipv6_dev_mc_dec(idev, &maddr);
}
-/* caller must hold RTNL */
static void addrconf_join_anycast(struct inet6_ifaddr *ifp)
{
struct in6_addr addr;
@@ -2123,7 +2269,6 @@ static void addrconf_join_anycast(struct inet6_ifaddr *ifp)
__ipv6_dev_ac_inc(ifp->idev, &addr);
}
-/* caller must hold RTNL */
static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
{
struct in6_addr addr;
@@ -2158,12 +2303,12 @@ static int addrconf_ifid_6lowpan(u8 *eui, struct net_device *dev)
static int addrconf_ifid_ieee1394(u8 *eui, struct net_device *dev)
{
- union fwnet_hwaddr *ha;
+ const union fwnet_hwaddr *ha;
if (dev->addr_len != FWNET_ALEN)
return -1;
- ha = (union fwnet_hwaddr *)dev->dev_addr;
+ ha = (const union fwnet_hwaddr *)dev->dev_addr;
memcpy(eui, &ha->uc.uniq_id, sizeof(ha->uc.uniq_id));
eui[0] ^= 2;
@@ -2274,40 +2419,38 @@ static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev)
return err;
}
-/* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */
-static void ipv6_regen_rndid(struct inet6_dev *idev)
+/* Generation of a randomized Interface Identifier
+ * draft-ietf-6man-rfc4941bis, Section 3.3.1
+ */
+
+static void ipv6_gen_rnd_iid(struct in6_addr *addr)
{
regen:
- get_random_bytes(idev->rndid, sizeof(idev->rndid));
- idev->rndid[0] &= ~0x02;
+ get_random_bytes(&addr->s6_addr[8], 8);
- /*
- * <draft-ietf-ipngwg-temp-addresses-v2-00.txt>:
- * check if generated address is not inappropriate
+ /* <draft-ietf-6man-rfc4941bis-08.txt>, Section 3.3.1:
+ * check if generated address is not inappropriate:
*
- * - Reserved subnet anycast (RFC 2526)
- * 11111101 11....11 1xxxxxxx
- * - ISATAP (RFC4214) 6.1
- * 00-00-5E-FE-xx-xx-xx-xx
- * - value 0
- * - XXX: already assigned to an address on the device
+ * - Reserved IPv6 Interface Identifiers
+ * - XXX: already assigned to an address on the device
*/
- if (idev->rndid[0] == 0xfd &&
- (idev->rndid[1]&idev->rndid[2]&idev->rndid[3]&idev->rndid[4]&idev->rndid[5]&idev->rndid[6]) == 0xff &&
- (idev->rndid[7]&0x80))
+
+ /* Subnet-router anycast: 0000:0000:0000:0000 */
+ if (!(addr->s6_addr32[2] | addr->s6_addr32[3]))
goto regen;
- if ((idev->rndid[0]|idev->rndid[1]) == 0) {
- if (idev->rndid[2] == 0x5e && idev->rndid[3] == 0xfe)
- goto regen;
- if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00)
- goto regen;
- }
-}
-static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr)
-{
- if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0)
- ipv6_regen_rndid(idev);
+ /* IANA Ethernet block: 0200:5EFF:FE00:0000-0200:5EFF:FE00:5212
+ * Proxy Mobile IPv6: 0200:5EFF:FE00:5213
+ * IANA Ethernet block: 0200:5EFF:FE00:5214-0200:5EFF:FEFF:FFFF
+ */
+ if (ntohl(addr->s6_addr32[2]) == 0x02005eff &&
+ (ntohl(addr->s6_addr32[3]) & 0Xff000000) == 0xfe000000)
+ goto regen;
+
+ /* Reserved subnet anycast addresses */
+ if (ntohl(addr->s6_addr32[2]) == 0xfdffffff &&
+ ntohl(addr->s6_addr32[3]) >= 0Xffffff80)
+ goto regen;
}
/*
@@ -2349,7 +2492,8 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, u32 metric,
static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
int plen,
const struct net_device *dev,
- u32 flags, u32 noflags)
+ u32 flags, u32 noflags,
+ bool no_gw)
{
struct fib6_node *fn;
struct fib6_info *rt = NULL;
@@ -2366,7 +2510,13 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
goto out;
for_each_fib6_node_rt_rcu(fn) {
- if (rt->fib6_nh.nh_dev->ifindex != dev->ifindex)
+ /* prefix routes only use builtin fib6_nh */
+ if (rt->nh)
+ continue;
+
+ if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex)
+ continue;
+ if (no_gw && rt->fib6_nh->fib_nh_gw_family)
continue;
if ((rt->fib6_flags & flags) != flags)
continue;
@@ -2392,8 +2542,9 @@ static void addrconf_add_mroute(struct net_device *dev)
.fc_ifindex = dev->ifindex,
.fc_dst_len = 8,
.fc_flags = RTF_UP,
- .fc_type = RTN_UNICAST,
+ .fc_type = RTN_MULTICAST,
.fc_nlinfo.nl_net = dev_net(dev),
+ .fc_protocol = RTPROT_KERNEL,
};
ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);
@@ -2408,8 +2559,8 @@ static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
ASSERT_RTNL();
idev = ipv6_find_idev(dev);
- if (!idev)
- return ERR_PTR(-ENOBUFS);
+ if (IS_ERR(idev))
+ return idev;
if (idev->cnf.disable_ipv6)
return ERR_PTR(-EACCES);
@@ -2421,6 +2572,24 @@ static struct inet6_dev *addrconf_add_dev(struct net_device *dev)
return idev;
}
+static void delete_tempaddrs(struct inet6_dev *idev,
+ struct inet6_ifaddr *ifp)
+{
+ struct inet6_ifaddr *ift, *tmp;
+
+ write_lock_bh(&idev->lock);
+ list_for_each_entry_safe(ift, tmp, &idev->tempaddr_list, tmp_list) {
+ if (ift->ifpub != ifp)
+ continue;
+
+ in6_ifa_hold(ift);
+ write_unlock_bh(&idev->lock);
+ ipv6_del_addr(ift);
+ write_lock_bh(&idev->lock);
+ }
+ write_unlock_bh(&idev->lock);
+}
+
static void manage_tempaddrs(struct inet6_dev *idev,
struct inet6_ifaddr *ifp,
__u32 valid_lft, __u32 prefered_lft,
@@ -2446,11 +2615,11 @@ static void manage_tempaddrs(struct inet6_dev *idev,
* (TEMP_PREFERRED_LIFETIME - DESYNC_FACTOR), respectively.
*/
age = (now - ift->cstamp) / HZ;
- max_valid = idev->cnf.temp_valid_lft - age;
+ max_valid = READ_ONCE(idev->cnf.temp_valid_lft) - age;
if (max_valid < 0)
max_valid = 0;
- max_prefered = idev->cnf.temp_prefered_lft -
+ max_prefered = READ_ONCE(idev->cnf.temp_prefered_lft) -
idev->desync_factor - age;
if (max_prefered < 0)
max_prefered = 0;
@@ -2474,15 +2643,21 @@ static void manage_tempaddrs(struct inet6_dev *idev,
ipv6_ifa_notify(0, ift);
}
- if ((create || list_empty(&idev->tempaddr_list)) &&
- idev->cnf.use_tempaddr > 0) {
+ /* Also create a temporary address if it's enabled but no temporary
+ * address currently exists.
+ * However, we get called with valid_lft == 0, prefered_lft == 0, create == false
+ * as part of cleanup (ie. deleting the mngtmpaddr).
+ * We don't want that to result in creating a new temporary ip address.
+ */
+ if (list_empty(&idev->tempaddr_list) && (valid_lft || prefered_lft))
+ create = true;
+
+ if (create && READ_ONCE(idev->cnf.use_tempaddr) > 0) {
/* When a new public address is created as described
* in [ADDRCONF], also create a new temporary address.
- * Also create a temporary address if it's enabled but
- * no temporary address currently exists.
*/
read_unlock_bh(&idev->lock);
- ipv6_create_tempaddr(ifp, NULL, false);
+ ipv6_create_tempaddr(ifp, false);
} else {
read_unlock_bh(&idev->lock);
}
@@ -2505,7 +2680,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
int create = 0, update_lft = 0;
if (!ifp && valid_lft) {
- int max_addresses = in6_dev->cnf.max_addresses;
+ int max_addresses = READ_ONCE(in6_dev->cnf.max_addresses);
struct ifa6_config cfg = {
.pfx = addr,
.plen = pinfo->prefix_len,
@@ -2513,11 +2688,12 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
.valid_lft = valid_lft,
.preferred_lft = prefered_lft,
.scope = addr_type & IPV6_ADDR_SCOPE_MASK,
+ .ifa_proto = IFAPROT_KERNEL_RA
};
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
- if ((net->ipv6.devconf_all->optimistic_dad ||
- in6_dev->cnf.optimistic_dad) &&
+ if ((READ_ONCE(net->ipv6.devconf_all->optimistic_dad) ||
+ READ_ONCE(in6_dev->cnf.optimistic_dad)) &&
!net->ipv6.devconf_all->forwarding && sllao)
cfg.ifa_flags |= IFA_F_OPTIMISTIC;
#endif
@@ -2553,28 +2729,29 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ;
else
stored_lft = 0;
- if (!create && stored_lft) {
+
+ /* RFC4862 Section 5.5.3e:
+ * "Note that the preferred lifetime of the
+ * corresponding address is always reset to
+ * the Preferred Lifetime in the received
+ * Prefix Information option, regardless of
+ * whether the valid lifetime is also reset or
+ * ignored."
+ *
+ * So we should always update prefered_lft here.
+ */
+ update_lft = !create && stored_lft;
+
+ if (update_lft && !READ_ONCE(in6_dev->cnf.ra_honor_pio_life)) {
const u32 minimum_lft = min_t(u32,
stored_lft, MIN_VALID_LIFETIME);
valid_lft = max(valid_lft, minimum_lft);
-
- /* RFC4862 Section 5.5.3e:
- * "Note that the preferred lifetime of the
- * corresponding address is always reset to
- * the Preferred Lifetime in the received
- * Prefix Information option, regardless of
- * whether the valid lifetime is also reset or
- * ignored."
- *
- * So we should always update prefered_lft here.
- */
- update_lft = 1;
}
if (update_lft) {
ifp->valid_lft = valid_lft;
ifp->prefered_lft = prefered_lft;
- ifp->tstamp = now;
+ WRITE_ONCE(ifp->tstamp, now);
flags = ifp->flags;
ifp->flags &= ~IFA_F_DEPRECATED;
spin_unlock_bh(&ifp->lock);
@@ -2588,7 +2765,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev,
create, now);
in6_ifa_put(ifp);
- addrconf_verify();
+ addrconf_verify(net);
}
return 0;
@@ -2598,12 +2775,14 @@ EXPORT_SYMBOL_GPL(addrconf_prefix_rcv_add_addr);
void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
{
struct prefix_info *pinfo;
+ struct fib6_table *table;
__u32 valid_lft;
__u32 prefered_lft;
int addr_type, err;
u32 addr_flags = 0;
struct inet6_dev *in6_dev;
struct net *net = dev_net(dev);
+ bool ignore_autoconf = false;
pinfo = (struct prefix_info *) opt;
@@ -2637,6 +2816,9 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
return;
}
+ if (valid_lft != 0 && valid_lft < in6_dev->cnf.accept_ra_min_lft)
+ goto put;
+
/*
* Two things going on here:
* 1) Add routes for on-link prefixes
@@ -2664,18 +2846,27 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
pinfo->prefix_len,
dev,
RTF_ADDRCONF | RTF_PREFIX_RT,
- RTF_GATEWAY | RTF_DEFAULT);
+ RTF_DEFAULT, true);
if (rt) {
/* Autoconf prefix route */
if (valid_lft == 0) {
- ip6_del_rt(net, rt);
+ ip6_del_rt(net, rt, false);
rt = NULL;
- } else if (addrconf_finite_timeout(rt_expires)) {
- /* not infinity */
- fib6_set_expires(rt, jiffies + rt_expires);
} else {
- fib6_clean_expires(rt);
+ table = rt->fib6_table;
+ spin_lock_bh(&table->tb6_lock);
+
+ if (addrconf_finite_timeout(rt_expires)) {
+ /* not infinity */
+ fib6_set_expires(rt, jiffies + rt_expires);
+ fib6_add_gc_list(rt);
+ } else {
+ fib6_clean_expires(rt);
+ fib6_remove_gc_list(rt);
+ }
+
+ spin_unlock_bh(&table->tb6_lock);
}
} else if (valid_lft) {
clock_t expires = 0;
@@ -2694,7 +2885,8 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
/* Try to figure out our local address for this prefix */
- if (pinfo->autoconf && in6_dev->cnf.autoconf) {
+ ignore_autoconf = READ_ONCE(in6_dev->cnf.ra_honor_pio_pflag) && pinfo->preferpd;
+ if (pinfo->autoconf && in6_dev->cnf.autoconf && !ignore_autoconf) {
struct in6_addr addr;
bool tokenized = false, dev_addr_generated = false;
@@ -2747,6 +2939,33 @@ put:
in6_dev_put(in6_dev);
}
+static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev,
+ struct in6_ifreq *ireq)
+{
+ struct ip_tunnel_parm_kern p = { };
+ int err;
+
+ if (!(ipv6_addr_type(&ireq->ifr6_addr) & IPV6_ADDR_COMPATv4))
+ return -EADDRNOTAVAIL;
+
+ p.iph.daddr = ireq->ifr6_addr.s6_addr32[3];
+ p.iph.version = 4;
+ p.iph.ihl = 5;
+ p.iph.protocol = IPPROTO_IPV6;
+ p.iph.ttl = 64;
+
+ if (!dev->netdev_ops->ndo_tunnel_ctl)
+ return -EOPNOTSUPP;
+ err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, SIOCADDTUNNEL);
+ if (err)
+ return err;
+
+ dev = __dev_get_by_name(net, p.name);
+ if (!dev)
+ return -ENOBUFS;
+ return dev_open(dev, NULL);
+}
+
/*
* Set destination address.
* Special case for SIT interfaces where we create a new "virtual"
@@ -2754,62 +2973,20 @@ put:
*/
int addrconf_set_dstaddr(struct net *net, void __user *arg)
{
- struct in6_ifreq ireq;
struct net_device *dev;
- int err = -EINVAL;
-
- rtnl_lock();
+ struct in6_ifreq ireq;
+ int err = -ENODEV;
- err = -EFAULT;
+ if (!IS_ENABLED(CONFIG_IPV6_SIT))
+ return -ENODEV;
if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
- goto err_exit;
+ return -EFAULT;
+ rtnl_net_lock(net);
dev = __dev_get_by_index(net, ireq.ifr6_ifindex);
-
- err = -ENODEV;
- if (!dev)
- goto err_exit;
-
-#if IS_ENABLED(CONFIG_IPV6_SIT)
- if (dev->type == ARPHRD_SIT) {
- const struct net_device_ops *ops = dev->netdev_ops;
- struct ifreq ifr;
- struct ip_tunnel_parm p;
-
- err = -EADDRNOTAVAIL;
- if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4))
- goto err_exit;
-
- memset(&p, 0, sizeof(p));
- p.iph.daddr = ireq.ifr6_addr.s6_addr32[3];
- p.iph.saddr = 0;
- p.iph.version = 4;
- p.iph.ihl = 5;
- p.iph.protocol = IPPROTO_IPV6;
- p.iph.ttl = 64;
- ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
-
- if (ops->ndo_do_ioctl) {
- mm_segment_t oldfs = get_fs();
-
- set_fs(KERNEL_DS);
- err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
- set_fs(oldfs);
- } else
- err = -EOPNOTSUPP;
-
- if (err == 0) {
- err = -ENOBUFS;
- dev = __dev_get_by_name(net, p.name);
- if (!dev)
- goto err_exit;
- err = dev_open(dev);
- }
- }
-#endif
-
-err_exit:
- rtnl_unlock();
+ if (dev && dev->type == ARPHRD_SIT)
+ err = addrconf_set_sit_dstaddr(net, dev, &ireq);
+ rtnl_net_unlock(net);
return err;
}
@@ -2833,65 +3010,43 @@ static int ipv6_mc_config(struct sock *sk, bool join,
/*
* Manual configuration of address on an interface
*/
-static int inet6_addr_add(struct net *net, int ifindex,
- struct ifa6_config *cfg,
+static int inet6_addr_add(struct net *net, struct net_device *dev,
+ struct ifa6_config *cfg, clock_t expires, u32 flags,
struct netlink_ext_ack *extack)
{
struct inet6_ifaddr *ifp;
struct inet6_dev *idev;
- struct net_device *dev;
- unsigned long timeout;
- clock_t expires;
- u32 flags;
- ASSERT_RTNL();
+ ASSERT_RTNL_NET(net);
- if (cfg->plen > 128)
- return -EINVAL;
-
- /* check the lifetime */
- if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft)
+ if (cfg->plen > 128) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length");
return -EINVAL;
+ }
- if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64)
+ if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64) {
+ NL_SET_ERR_MSG_MOD(extack, "address with \"mngtmpaddr\" flag must have a prefix length of 64");
return -EINVAL;
-
- dev = __dev_get_by_index(net, ifindex);
- if (!dev)
- return -ENODEV;
+ }
idev = addrconf_add_dev(dev);
- if (IS_ERR(idev))
+ if (IS_ERR(idev)) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device");
return PTR_ERR(idev);
+ }
if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk,
- true, cfg->pfx, ifindex);
+ true, cfg->pfx, dev->ifindex);
- if (ret < 0)
+ if (ret < 0) {
+ NL_SET_ERR_MSG_MOD(extack, "Multicast auto join failed");
return ret;
+ }
}
cfg->scope = ipv6_addr_scope(cfg->pfx);
- timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
- if (addrconf_finite_timeout(timeout)) {
- expires = jiffies_to_clock_t(timeout * HZ);
- cfg->valid_lft = timeout;
- flags = RTF_EXPIRES;
- } else {
- expires = 0;
- flags = 0;
- cfg->ifa_flags |= IFA_F_PERMANENT;
- }
-
- timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
- if (addrconf_finite_timeout(timeout)) {
- if (timeout == 0)
- cfg->ifa_flags |= IFA_F_DEPRECATED;
- cfg->preferred_lft = timeout;
- }
-
ifp = ipv6_add_addr(idev, cfg, true, extack);
if (!IS_ERR(ifp)) {
if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
@@ -2915,33 +3070,40 @@ static int inet6_addr_add(struct net *net, int ifindex,
manage_tempaddrs(idev, ifp, cfg->valid_lft,
cfg->preferred_lft, true, jiffies);
in6_ifa_put(ifp);
- addrconf_verify_rtnl();
+ addrconf_verify_rtnl(net);
return 0;
} else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
ipv6_mc_config(net->ipv6.mc_autojoin_sk, false,
- cfg->pfx, ifindex);
+ cfg->pfx, dev->ifindex);
}
return PTR_ERR(ifp);
}
static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags,
- const struct in6_addr *pfx, unsigned int plen)
+ const struct in6_addr *pfx, unsigned int plen,
+ struct netlink_ext_ack *extack)
{
struct inet6_ifaddr *ifp;
struct inet6_dev *idev;
struct net_device *dev;
- if (plen > 128)
+ if (plen > 128) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length");
return -EINVAL;
+ }
dev = __dev_get_by_index(net, ifindex);
- if (!dev)
+ if (!dev) {
+ NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface");
return -ENODEV;
+ }
- idev = __in6_dev_get(dev);
- if (!idev)
+ idev = __in6_dev_get_rtnl_net(dev);
+ if (!idev) {
+ NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device");
return -ENXIO;
+ }
read_lock_bh(&idev->lock);
list_for_each_entry(ifp, &idev->addr_list, if_list) {
@@ -2950,12 +3112,13 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags,
in6_ifa_hold(ifp);
read_unlock_bh(&idev->lock);
- if (!(ifp->flags & IFA_F_TEMPORARY) &&
- (ifa_flags & IFA_F_MANAGETEMPADDR))
- manage_tempaddrs(idev, ifp, 0, 0, false,
- jiffies);
ipv6_del_addr(ifp);
- addrconf_verify_rtnl();
+
+ if (!(ifp->flags & IFA_F_TEMPORARY) &&
+ (ifp->flags & IFA_F_MANAGETEMPADDR))
+ delete_tempaddrs(idev, ifp);
+
+ addrconf_verify_rtnl(net);
if (ipv6_addr_is_multicast(pfx)) {
ipv6_mc_config(net->ipv6.mc_autojoin_sk,
false, pfx, dev->ifindex);
@@ -2964,6 +3127,8 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags,
}
}
read_unlock_bh(&idev->lock);
+
+ NL_SET_ERR_MSG_MOD(extack, "address not found");
return -EADDRNOTAVAIL;
}
@@ -2975,6 +3140,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg)
.preferred_lft = INFINITY_LIFE_TIME,
.valid_lft = INFINITY_LIFE_TIME,
};
+ struct net_device *dev;
struct in6_ifreq ireq;
int err;
@@ -2987,9 +3153,16 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg)
cfg.pfx = &ireq.ifr6_addr;
cfg.plen = ireq.ifr6_prefixlen;
- rtnl_lock();
- err = inet6_addr_add(net, ireq.ifr6_ifindex, &cfg, NULL);
- rtnl_unlock();
+ rtnl_net_lock(net);
+ dev = __dev_get_by_index(net, ireq.ifr6_ifindex);
+ if (dev) {
+ netdev_lock_ops(dev);
+ err = inet6_addr_add(net, dev, &cfg, 0, 0, NULL);
+ netdev_unlock_ops(dev);
+ } else {
+ err = -ENODEV;
+ }
+ rtnl_net_unlock(net);
return err;
}
@@ -3004,15 +3177,15 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg)
if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
return -EFAULT;
- rtnl_lock();
+ rtnl_net_lock(net);
err = inet6_addr_del(net, ireq.ifr6_ifindex, 0, &ireq.ifr6_addr,
- ireq.ifr6_prefixlen);
- rtnl_unlock();
+ ireq.ifr6_prefixlen, NULL);
+ rtnl_net_unlock(net);
return err;
}
static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
- int plen, int scope)
+ int plen, int scope, u8 proto)
{
struct inet6_ifaddr *ifp;
struct ifa6_config cfg = {
@@ -3021,7 +3194,8 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
.ifa_flags = IFA_F_PERMANENT,
.valid_lft = INFINITY_LIFE_TIME,
.preferred_lft = INFINITY_LIFE_TIME,
- .scope = scope
+ .scope = scope,
+ .ifa_proto = proto
};
ifp = ipv6_add_addr(idev, &cfg, true, NULL);
@@ -3035,8 +3209,8 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
}
}
-#if IS_ENABLED(CONFIG_IPV6_SIT)
-static void sit_add_v4_addrs(struct inet6_dev *idev)
+#if IS_ENABLED(CONFIG_IPV6_SIT) || IS_ENABLED(CONFIG_NET_IPGRE)
+static void add_v4_addrs(struct inet6_dev *idev)
{
struct in6_addr addr;
struct net_device *dev;
@@ -3049,18 +3223,21 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
memset(&addr, 0, sizeof(struct in6_addr));
memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4);
- if (idev->dev->flags&IFF_POINTOPOINT) {
- addr.s6_addr32[0] = htonl(0xfe800000);
- scope = IFA_LINK;
- plen = 64;
- } else {
+ if (!(idev->dev->flags & IFF_POINTOPOINT) && idev->dev->type == ARPHRD_SIT) {
scope = IPV6_ADDR_COMPATv4;
plen = 96;
pflags |= RTF_NONEXTHOP;
+ } else {
+ if (idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_NONE)
+ return;
+
+ addr.s6_addr32[0] = htonl(0xfe800000);
+ scope = IFA_LINK;
+ plen = 64;
}
if (addr.s6_addr32[3]) {
- add_addr(idev, &addr, plen, scope);
+ add_addr(idev, &addr, plen, scope, IFAPROT_UNSPEC);
addrconf_prefix_route(&addr, plen, 0, idev->dev, 0, pflags,
GFP_KERNEL);
return;
@@ -3070,11 +3247,9 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
struct in_device *in_dev = __in_dev_get_rtnl(dev);
if (in_dev && (dev->flags & IFF_UP)) {
struct in_ifaddr *ifa;
-
int flag = scope;
- for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
-
+ in_dev_for_each_ifa_rtnl(ifa, in_dev) {
addr.s6_addr32[3] = ifa->ifa_local;
if (ifa->ifa_scope == RT_SCOPE_LINK)
@@ -3085,7 +3260,8 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
flag |= IFA_HOST;
}
- add_addr(idev, &addr, plen, flag);
+ add_addr(idev, &addr, plen, flag,
+ IFAPROT_UNSPEC);
addrconf_prefix_route(&addr, plen, 0, idev->dev,
0, pflags, GFP_KERNEL);
}
@@ -3103,12 +3279,12 @@ static void init_loopback(struct net_device *dev)
ASSERT_RTNL();
idev = ipv6_find_idev(dev);
- if (!idev) {
+ if (IS_ERR(idev)) {
pr_debug("%s: add_dev failed\n", __func__);
return;
}
- add_addr(idev, &in6addr_loopback, 128, IFA_HOST);
+ add_addr(idev, &in6addr_loopback, 128, IFA_HOST, IFAPROT_KERNEL_LO);
}
void addrconf_add_linklocal(struct inet6_dev *idev,
@@ -3120,13 +3296,14 @@ void addrconf_add_linklocal(struct inet6_dev *idev,
.ifa_flags = flags | IFA_F_PERMANENT,
.valid_lft = INFINITY_LIFE_TIME,
.preferred_lft = INFINITY_LIFE_TIME,
- .scope = IFA_LINK
+ .scope = IFA_LINK,
+ .ifa_proto = IFAPROT_KERNEL_LL
};
struct inet6_ifaddr *ifp;
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
- if ((dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad ||
- idev->cnf.optimistic_dad) &&
+ if ((READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->optimistic_dad) ||
+ READ_ONCE(idev->cnf.optimistic_dad)) &&
!dev_net(idev->dev)->ipv6.devconf_all->forwarding)
cfg.ifa_flags |= IFA_F_OPTIMISTIC;
#endif
@@ -3162,11 +3339,11 @@ static int ipv6_generate_stable_address(struct in6_addr *address,
const struct inet6_dev *idev)
{
static DEFINE_SPINLOCK(lock);
- static __u32 digest[SHA_DIGEST_WORDS];
- static __u32 workspace[SHA_WORKSPACE_WORDS];
+ static __u32 digest[SHA1_DIGEST_WORDS];
+ static __u32 workspace[SHA1_WORKSPACE_WORDS];
static union {
- char __data[SHA_MESSAGE_BYTES];
+ char __data[SHA1_BLOCK_SIZE];
struct {
struct in6_addr secret;
__be32 prefix[2];
@@ -3191,7 +3368,7 @@ static int ipv6_generate_stable_address(struct in6_addr *address,
retry:
spin_lock_bh(&lock);
- sha_init(digest);
+ sha1_init_raw(digest);
memset(&data, 0, sizeof(data));
memset(workspace, 0, sizeof(workspace));
memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len);
@@ -3200,7 +3377,7 @@ retry:
data.secret = secret;
data.dad_count = dad_count;
- sha_transform(digest, data.__data, workspace);
+ sha1_transform(digest, data.__data, workspace);
temp = *address;
temp.s6_addr32[2] = (__force __be32)digest[0];
@@ -3238,12 +3415,16 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
if (netif_is_l3_master(idev->dev))
return;
+ /* no link local addresses on devices flagged as slaves */
+ if (idev->dev->priv_flags & IFF_NO_ADDRCONF)
+ return;
+
ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
switch (idev->cnf.addr_gen_mode) {
case IN6_ADDR_GEN_MODE_RANDOM:
ipv6_gen_mode_random_init(idev);
- /* fallthrough */
+ fallthrough;
case IN6_ADDR_GEN_MODE_STABLE_PRIVACY:
if (!ipv6_generate_stable_address(&addr, 0, idev))
addrconf_add_linklocal(idev, &addr,
@@ -3284,11 +3465,14 @@ static void addrconf_dev_config(struct net_device *dev)
(dev->type != ARPHRD_TUNNEL6) &&
(dev->type != ARPHRD_6LOWPAN) &&
(dev->type != ARPHRD_IP6GRE) &&
- (dev->type != ARPHRD_IPGRE) &&
(dev->type != ARPHRD_TUNNEL) &&
(dev->type != ARPHRD_NONE) &&
(dev->type != ARPHRD_RAWIP)) {
/* Alas, we support only Ethernet autoconfiguration. */
+ idev = __in6_dev_get(dev);
+ if (!IS_ERR_OR_NULL(idev) && dev->flags & IFF_UP &&
+ dev->flags & IFF_MULTICAST)
+ ipv6_mc_up(idev);
return;
}
@@ -3299,7 +3483,8 @@ static void addrconf_dev_config(struct net_device *dev)
/* this device type has no EUI support */
if (dev->type == ARPHRD_NONE &&
idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)
- idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM;
+ WRITE_ONCE(idev->cnf.addr_gen_mode,
+ IN6_ADDR_GEN_MODE_RANDOM);
addrconf_addr_gen(idev, false);
}
@@ -3318,7 +3503,7 @@ static void addrconf_sit_config(struct net_device *dev)
*/
idev = ipv6_find_idev(dev);
- if (!idev) {
+ if (IS_ERR(idev)) {
pr_debug("%s: add_dev failed\n", __func__);
return;
}
@@ -3328,7 +3513,7 @@ static void addrconf_sit_config(struct net_device *dev)
return;
}
- sit_add_v4_addrs(idev);
+ add_v4_addrs(idev);
if (dev->flags&IFF_POINTOPOINT)
addrconf_add_mroute(dev);
@@ -3342,18 +3527,48 @@ static void addrconf_gre_config(struct net_device *dev)
ASSERT_RTNL();
- idev = ipv6_find_idev(dev);
- if (!idev) {
- pr_debug("%s: add_dev failed\n", __func__);
+ idev = addrconf_add_dev(dev);
+ if (IS_ERR(idev))
+ return;
+
+ /* Generate the IPv6 link-local address using addrconf_addr_gen(),
+ * unless we have an IPv4 GRE device not bound to an IP address and
+ * which is in EUI64 mode (as __ipv6_isatap_ifid() would fail in this
+ * case). Such devices fall back to add_v4_addrs() instead.
+ */
+ if (!(*(__be32 *)dev->dev_addr == 0 &&
+ idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)) {
+ addrconf_addr_gen(idev, true);
return;
}
- addrconf_addr_gen(idev, true);
- if (dev->flags & IFF_POINTOPOINT)
- addrconf_add_mroute(dev);
+ add_v4_addrs(idev);
}
#endif
+static void addrconf_init_auto_addrs(struct net_device *dev)
+{
+ switch (dev->type) {
+#if IS_ENABLED(CONFIG_IPV6_SIT)
+ case ARPHRD_SIT:
+ addrconf_sit_config(dev);
+ break;
+#endif
+#if IS_ENABLED(CONFIG_NET_IPGRE)
+ case ARPHRD_IPGRE:
+ addrconf_gre_config(dev);
+ break;
+#endif
+ case ARPHRD_LOOPBACK:
+ init_loopback(dev);
+ break;
+
+ default:
+ addrconf_dev_config(dev);
+ break;
+ }
+}
+
static int fixup_permanent_addr(struct net *net,
struct inet6_dev *idev,
struct inet6_ifaddr *ifp)
@@ -3366,7 +3581,7 @@ static int fixup_permanent_addr(struct net *net,
struct fib6_info *f6i, *prev;
f6i = addrconf_f6i_alloc(net, idev, &ifp->addr, false,
- GFP_ATOMIC);
+ GFP_ATOMIC, NULL);
if (IS_ERR(f6i))
return PTR_ERR(f6i);
@@ -3422,6 +3637,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_change_info *change_info;
struct netdev_notifier_changeupper_info *info;
struct inet6_dev *idev = __in6_dev_get(dev);
struct net *net = dev_net(dev);
@@ -3446,7 +3662,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
if (idev) {
rt6_mtu_change(dev, dev->mtu);
- idev->cnf.mtu6 = dev->mtu;
+ WRITE_ONCE(idev->cnf.mtu6, dev->mtu);
break;
}
@@ -3460,16 +3676,18 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
break;
run_pending = 1;
-
- /* fall through */
-
+ fallthrough;
case NETDEV_UP:
case NETDEV_CHANGE:
- if (dev->flags & IFF_SLAVE)
+ if (idev && idev->cnf.disable_ipv6)
break;
- if (idev && idev->cnf.disable_ipv6)
+ if (dev->priv_flags & IFF_NO_ADDRCONF) {
+ if (event == NETDEV_UP && !IS_ERR_OR_NULL(idev) &&
+ dev->flags & IFF_UP && dev->flags & IFF_MULTICAST)
+ ipv6_mc_up(idev);
break;
+ }
if (event == NETDEV_UP) {
/* restore routes for permanent addresses */
@@ -3477,8 +3695,8 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
if (!addrconf_link_ready(dev)) {
/* device is not ready yet. */
- pr_info("ADDRCONF(NETDEV_UP): %s: link is not ready\n",
- dev->name);
+ pr_debug("ADDRCONF(NETDEV_UP): %s: link is not ready\n",
+ dev->name);
break;
}
@@ -3496,7 +3714,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
break;
}
- if (idev) {
+ if (!IS_ERR_OR_NULL(idev)) {
if (idev->if_flags & IF_READY) {
/* device is already configured -
* but resend MLD reports, we might
@@ -3504,41 +3722,26 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
* multicast snooping switches
*/
ipv6_mc_up(idev);
+ change_info = ptr;
+ if (change_info->flags_changed & IFF_NOARP)
+ addrconf_dad_run(idev, true);
rt6_sync_up(dev, RTNH_F_LINKDOWN);
break;
}
idev->if_flags |= IF_READY;
}
- pr_info("ADDRCONF(NETDEV_CHANGE): %s: link becomes ready\n",
- dev->name);
+ pr_debug("ADDRCONF(NETDEV_CHANGE): %s: link becomes ready\n",
+ dev->name);
run_pending = 1;
}
- switch (dev->type) {
-#if IS_ENABLED(CONFIG_IPV6_SIT)
- case ARPHRD_SIT:
- addrconf_sit_config(dev);
- break;
-#endif
-#if IS_ENABLED(CONFIG_NET_IPGRE)
- case ARPHRD_IPGRE:
- addrconf_gre_config(dev);
- break;
-#endif
- case ARPHRD_LOOPBACK:
- init_loopback(dev);
- break;
-
- default:
- addrconf_dev_config(dev);
- break;
- }
+ addrconf_init_auto_addrs(dev);
if (!IS_ERR_OR_NULL(idev)) {
if (run_pending)
- addrconf_dad_run(idev);
+ addrconf_dad_run(idev, false);
/* Device has an address by now */
rt6_sync_up(dev, RTNH_F_DEAD);
@@ -3551,9 +3754,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
if (idev->cnf.mtu6 != dev->mtu &&
dev->mtu >= IPV6_MIN_MTU) {
rt6_mtu_change(dev, dev->mtu);
- idev->cnf.mtu6 = dev->mtu;
+ WRITE_ONCE(idev->cnf.mtu6, dev->mtu);
}
- idev->tstamp = jiffies;
+ WRITE_ONCE(idev->tstamp, jiffies);
inet6_ifinfo_notify(RTM_NEWLINK, idev);
/*
@@ -3601,7 +3804,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
* an L3 master device (e.g., VRF)
*/
if (info->upper_dev && netif_is_l3_master(info->upper_dev))
- addrconf_ifdown(dev, 0);
+ addrconf_ifdown(dev, false);
}
return NOTIFY_OK;
@@ -3634,13 +3837,15 @@ static bool addr_is_local(const struct in6_addr *addr)
(IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
}
-static int addrconf_ifdown(struct net_device *dev, int how)
+static int addrconf_ifdown(struct net_device *dev, bool unregister)
{
- unsigned long event = how ? NETDEV_UNREGISTER : NETDEV_DOWN;
+ unsigned long event = unregister ? NETDEV_UNREGISTER : NETDEV_DOWN;
struct net *net = dev_net(dev);
struct inet6_dev *idev;
- struct inet6_ifaddr *ifa, *tmp;
+ struct inet6_ifaddr *ifa;
+ LIST_HEAD(tmp_addr_list);
bool keep_addr = false;
+ bool was_ready;
int state, i;
ASSERT_RTNL();
@@ -3655,8 +3860,8 @@ static int addrconf_ifdown(struct net_device *dev, int how)
* Step 1: remove reference to ipv6 device from parent device.
* Do not dev_put!
*/
- if (how) {
- idev->dead = 1;
+ if (unregister) {
+ WRITE_ONCE(idev->dead, 1);
/* protected by rtnl_lock */
RCU_INIT_POINTER(dev->ip6_ptr, NULL);
@@ -3669,21 +3874,21 @@ static int addrconf_ifdown(struct net_device *dev, int how)
/* combine the user config with event to determine if permanent
* addresses are to be removed from address hash table
*/
- if (!how && !idev->cnf.disable_ipv6) {
+ if (!unregister && !idev->cnf.disable_ipv6) {
/* aggregate the system setting and interface setting */
- int _keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
+ int _keep_addr = READ_ONCE(net->ipv6.devconf_all->keep_addr_on_down);
if (!_keep_addr)
- _keep_addr = idev->cnf.keep_addr_on_down;
+ _keep_addr = READ_ONCE(idev->cnf.keep_addr_on_down);
keep_addr = (_keep_addr > 0);
}
/* Step 2: clear hash table */
for (i = 0; i < IN6_ADDR_HSIZE; i++) {
- struct hlist_head *h = &inet6_addr_lst[i];
+ struct hlist_head *h = &net->ipv6.inet6_addr_lst[i];
- spin_lock_bh(&addrconf_hash_lock);
+ spin_lock_bh(&net->ipv6.addrconf_hash_lock);
restart:
hlist_for_each_entry_rcu(ifa, h, addr_lst) {
if (ifa->idev == idev) {
@@ -3699,15 +3904,18 @@ restart:
}
}
}
- spin_unlock_bh(&addrconf_hash_lock);
+ spin_unlock_bh(&net->ipv6.addrconf_hash_lock);
}
write_lock_bh(&idev->lock);
addrconf_del_rs_timer(idev);
- /* Step 2: clear flags for stateless addrconf */
- if (!how)
+ /* Step 2: clear flags for stateless addrconf, repeated down
+ * detection
+ */
+ was_ready = idev->if_flags & IF_READY;
+ if (!unregister)
idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD|IF_READY);
/* Step 3: clear tempaddr list */
@@ -3727,16 +3935,23 @@ restart:
write_lock_bh(&idev->lock);
}
- list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
+ list_for_each_entry(ifa, &idev->addr_list, if_list)
+ list_add_tail(&ifa->if_list_aux, &tmp_addr_list);
+ write_unlock_bh(&idev->lock);
+
+ while (!list_empty(&tmp_addr_list)) {
struct fib6_info *rt = NULL;
bool keep;
+ ifa = list_first_entry(&tmp_addr_list,
+ struct inet6_ifaddr, if_list_aux);
+ list_del(&ifa->if_list_aux);
+
addrconf_del_dad_work(ifa);
keep = keep_addr && (ifa->flags & IFA_F_PERMANENT) &&
!addr_is_local(&ifa->addr);
- write_unlock_bh(&idev->lock);
spin_lock_bh(&ifa->lock);
if (keep) {
@@ -3756,7 +3971,7 @@ restart:
spin_unlock_bh(&ifa->lock);
if (rt)
- ip6_del_rt(net, rt);
+ ip6_del_rt(net, rt, false);
if (state != INET6_IFADDR_STATE_DEAD) {
__ipv6_ifa_notify(RTM_DELADDR, ifa);
@@ -3767,27 +3982,27 @@ restart:
addrconf_leave_solict(ifa->idev, &ifa->addr);
}
- write_lock_bh(&idev->lock);
if (!keep) {
+ write_lock_bh(&idev->lock);
list_del_rcu(&ifa->if_list);
+ write_unlock_bh(&idev->lock);
in6_ifa_put(ifa);
}
}
- write_unlock_bh(&idev->lock);
-
/* Step 5: Discard anycast and multicast list */
- if (how) {
+ if (unregister) {
ipv6_ac_destroy_dev(idev);
ipv6_mc_destroy_dev(idev);
- } else {
+ } else if (was_ready) {
ipv6_mc_down(idev);
}
- idev->tstamp = jiffies;
+ WRITE_ONCE(idev->tstamp, jiffies);
+ idev->ra_mtu = 0;
/* Last: Shot the device (if unregistered) */
- if (how) {
+ if (unregister) {
addrconf_sysctl_unregister(idev);
neigh_parms_release(&nd_tbl, idev->nd_parms);
neigh_ifdown(&nd_tbl, dev);
@@ -3798,9 +4013,10 @@ restart:
static void addrconf_rs_timer(struct timer_list *t)
{
- struct inet6_dev *idev = from_timer(idev, t, rs_timer);
+ struct inet6_dev *idev = timer_container_of(idev, t, rs_timer);
struct net_device *dev = idev->dev;
struct in6_addr lladdr;
+ int rtr_solicits;
write_lock(&idev->lock);
if (idev->dead || !(idev->if_flags & IF_READY))
@@ -3813,7 +4029,9 @@ static void addrconf_rs_timer(struct timer_list *t)
if (idev->if_flags & IF_RA_RCVD)
goto out;
- if (idev->rs_probes++ < idev->cnf.rtr_solicits || idev->cnf.rtr_solicits < 0) {
+ rtr_solicits = READ_ONCE(idev->cnf.rtr_solicits);
+
+ if (idev->rs_probes++ < rtr_solicits || rtr_solicits < 0) {
write_unlock(&idev->lock);
if (!ipv6_get_lladdr(dev, &lladdr, IFA_F_TENTATIVE))
ndisc_send_rs(dev, &lladdr,
@@ -3823,11 +4041,12 @@ static void addrconf_rs_timer(struct timer_list *t)
write_lock(&idev->lock);
idev->rs_interval = rfc3315_s14_backoff_update(
- idev->rs_interval, idev->cnf.rtr_solicit_max_interval);
+ idev->rs_interval,
+ READ_ONCE(idev->cnf.rtr_solicit_max_interval));
/* The wait after the last probe can be shorter */
addrconf_mod_rs_timer(idev, (idev->rs_probes ==
- idev->cnf.rtr_solicits) ?
- idev->cnf.rtr_solicit_delay :
+ READ_ONCE(idev->cnf.rtr_solicits)) ?
+ READ_ONCE(idev->cnf.rtr_solicit_delay) :
idev->rs_interval);
} else {
/*
@@ -3848,24 +4067,25 @@ put:
*/
static void addrconf_dad_kick(struct inet6_ifaddr *ifp)
{
- unsigned long rand_num;
struct inet6_dev *idev = ifp->idev;
+ unsigned long rand_num;
u64 nonce;
if (ifp->flags & IFA_F_OPTIMISTIC)
rand_num = 0;
else
- rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1);
+ rand_num = get_random_u32_below(
+ READ_ONCE(idev->cnf.rtr_solicit_delay) ? : 1);
nonce = 0;
- if (idev->cnf.enhanced_dad ||
- dev_net(idev->dev)->ipv6.devconf_all->enhanced_dad) {
+ if (READ_ONCE(idev->cnf.enhanced_dad) ||
+ READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->enhanced_dad)) {
do
get_random_bytes(&nonce, 6);
while (nonce == 0);
}
ifp->dad_nonce = nonce;
- ifp->dad_probes = idev->cnf.dad_transmits;
+ ifp->dad_probes = READ_ONCE(idev->cnf.dad_transmits);
addrconf_mod_dad_work(ifp, rand_num);
}
@@ -3878,8 +4098,6 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
addrconf_join_solict(dev, &ifp->addr);
- prandom_seed((__force u32) ifp->addr.s6_addr32[3]);
-
read_lock_bh(&idev->lock);
spin_lock(&ifp->lock);
if (ifp->state == INET6_IFADDR_STATE_DEAD)
@@ -3887,8 +4105,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
net = dev_net(dev);
if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
- (net->ipv6.devconf_all->accept_dad < 1 &&
- idev->cnf.accept_dad < 1) ||
+ (READ_ONCE(net->ipv6.devconf_all->accept_dad) < 1 &&
+ READ_ONCE(idev->cnf.accept_dad) < 1) ||
!(ifp->flags&IFA_F_TENTATIVE) ||
ifp->flags & IFA_F_NODAD) {
bool send_na = false;
@@ -3963,6 +4181,7 @@ static void addrconf_dad_work(struct work_struct *w)
struct inet6_dev *idev = ifp->idev;
bool bump_id, disable_ipv6 = false;
struct in6_addr mcaddr;
+ struct net *net;
enum {
DAD_PROCESS,
@@ -3970,7 +4189,9 @@ static void addrconf_dad_work(struct work_struct *w)
DAD_ABORT,
} action = DAD_PROCESS;
- rtnl_lock();
+ net = dev_net(idev->dev);
+
+ rtnl_net_lock(net);
spin_lock_bh(&ifp->lock);
if (ifp->state == INET6_IFADDR_STATE_PREDAD) {
@@ -3980,8 +4201,8 @@ static void addrconf_dad_work(struct work_struct *w)
action = DAD_ABORT;
ifp->state = INET6_IFADDR_STATE_POSTDAD;
- if ((dev_net(idev->dev)->ipv6.devconf_all->accept_dad > 1 ||
- idev->cnf.accept_dad > 1) &&
+ if ((READ_ONCE(net->ipv6.devconf_all->accept_dad) > 1 ||
+ READ_ONCE(idev->cnf.accept_dad) > 1) &&
!idev->cnf.disable_ipv6 &&
!(ifp->flags & IFA_F_STABLE_PRIVACY)) {
struct in6_addr addr;
@@ -3992,7 +4213,7 @@ static void addrconf_dad_work(struct work_struct *w)
if (!ipv6_generate_eui64(addr.s6_addr + 8, idev->dev) &&
ipv6_addr_equal(&ifp->addr, &addr)) {
/* DAD failed for link-local based on MAC */
- idev->cnf.disable_ipv6 = 1;
+ WRITE_ONCE(idev->cnf.disable_ipv6, 1);
pr_info("%s: IPv6 being disabled!\n",
ifp->idev->dev->name);
@@ -4009,7 +4230,7 @@ static void addrconf_dad_work(struct work_struct *w)
in6_ifa_hold(ifp);
addrconf_dad_stop(ifp, 1);
if (disable_ipv6)
- addrconf_ifdown(idev->dev, 0);
+ addrconf_ifdown(idev->dev, false);
goto out;
}
@@ -4051,7 +4272,8 @@ static void addrconf_dad_work(struct work_struct *w)
ifp->dad_probes--;
addrconf_mod_dad_work(ifp,
- NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME));
+ max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME),
+ HZ/100));
spin_unlock(&ifp->lock);
write_unlock_bh(&idev->lock);
@@ -4061,7 +4283,7 @@ static void addrconf_dad_work(struct work_struct *w)
ifp->dad_nonce);
out:
in6_ifa_put(ifp);
- rtnl_unlock();
+ rtnl_net_unlock(net);
}
/* ifp->idev must be at least read locked */
@@ -4105,8 +4327,10 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
send_mld = ifp->scope == IFA_LINK && ipv6_lonely_lladdr(ifp);
send_rs = send_mld &&
ipv6_accept_ra(ifp->idev) &&
- ifp->idev->cnf.rtr_solicits != 0 &&
- (dev->flags&IFF_LOOPBACK) == 0;
+ READ_ONCE(ifp->idev->cnf.rtr_solicits) != 0 &&
+ (dev->flags & IFF_LOOPBACK) == 0 &&
+ (dev->type != ARPHRD_TUNNEL) &&
+ !netif_is_team_port(dev);
read_unlock_bh(&ifp->idev->lock);
/* While dad is in progress mld report's source address is in6_addrany.
@@ -4117,8 +4341,8 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
/* send unsolicited NA if enabled */
if (send_na &&
- (ifp->idev->cnf.ndisc_notify ||
- dev_net(dev)->ipv6.devconf_all->ndisc_notify)) {
+ (READ_ONCE(ifp->idev->cnf.ndisc_notify) ||
+ READ_ONCE(dev_net(dev)->ipv6.devconf_all->ndisc_notify))) {
ndisc_send_na(dev, &in6addr_linklocal_allnodes, &ifp->addr,
/*router=*/ !!ifp->idev->cnf.forwarding,
/*solicited=*/ false, /*override=*/ true,
@@ -4138,7 +4362,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
write_lock_bh(&ifp->idev->lock);
spin_lock(&ifp->lock);
ifp->idev->rs_interval = rfc3315_s14_backoff_init(
- ifp->idev->cnf.rtr_solicit_interval);
+ READ_ONCE(ifp->idev->cnf.rtr_solicit_interval));
ifp->idev->rs_probes = 1;
ifp->idev->if_flags |= IF_RS_SENT;
addrconf_mod_rs_timer(ifp->idev, ifp->idev->rs_interval);
@@ -4153,19 +4377,22 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id,
* before this temporary address becomes deprecated.
*/
if (ifp->flags & IFA_F_TEMPORARY)
- addrconf_verify_rtnl();
+ addrconf_verify_rtnl(dev_net(dev));
}
-static void addrconf_dad_run(struct inet6_dev *idev)
+static void addrconf_dad_run(struct inet6_dev *idev, bool restart)
{
struct inet6_ifaddr *ifp;
read_lock_bh(&idev->lock);
list_for_each_entry(ifp, &idev->addr_list, if_list) {
spin_lock(&ifp->lock);
- if (ifp->flags & IFA_F_TENTATIVE &&
- ifp->state == INET6_IFADDR_STATE_DAD)
+ if ((ifp->flags & IFA_F_TENTATIVE &&
+ ifp->state == INET6_IFADDR_STATE_DAD) || restart) {
+ if (restart)
+ ifp->state = INET6_IFADDR_STATE_PREDAD;
addrconf_dad_kick(ifp);
+ }
spin_unlock(&ifp->lock);
}
read_unlock_bh(&idev->lock);
@@ -4192,10 +4419,8 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos)
}
for (; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) {
- hlist_for_each_entry_rcu(ifa, &inet6_addr_lst[state->bucket],
+ hlist_for_each_entry_rcu(ifa, &net->ipv6.inet6_addr_lst[state->bucket],
addr_lst) {
- if (!net_eq(dev_net(ifa->idev->dev), net))
- continue;
/* sync with offset */
if (p < state->offset) {
p++;
@@ -4218,8 +4443,6 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
hlist_for_each_entry_continue_rcu(ifa, addr_lst) {
- if (!net_eq(dev_net(ifa->idev->dev), net))
- continue;
state->offset++;
return ifa;
}
@@ -4227,9 +4450,7 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq,
state->offset = 0;
while (++state->bucket < IN6_ADDR_HSIZE) {
hlist_for_each_entry_rcu(ifa,
- &inet6_addr_lst[state->bucket], addr_lst) {
- if (!net_eq(dev_net(ifa->idev->dev), net))
- continue;
+ &net->ipv6.inet6_addr_lst[state->bucket], addr_lst) {
return ifa;
}
}
@@ -4317,9 +4538,7 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr)
int ret = 0;
rcu_read_lock();
- hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) {
- if (!net_eq(dev_net(ifp->idev->dev), net))
- continue;
+ hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
if (ipv6_addr_equal(&ifp->addr, addr) &&
(ifp->flags & IFA_F_HOMEADDRESS)) {
ret = 1;
@@ -4331,11 +4550,62 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr)
}
#endif
+/* RFC6554 has some algorithm to avoid loops in segment routing by
+ * checking if the segments contains any of a local interface address.
+ *
+ * Quote:
+ *
+ * To detect loops in the SRH, a router MUST determine if the SRH
+ * includes multiple addresses assigned to any interface on that router.
+ * If such addresses appear more than once and are separated by at least
+ * one address not assigned to that router.
+ */
+int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs,
+ unsigned char nsegs)
+{
+ const struct in6_addr *addr;
+ int i, ret = 0, found = 0;
+ struct inet6_ifaddr *ifp;
+ bool separated = false;
+ unsigned int hash;
+ bool hash_found;
+
+ rcu_read_lock();
+ for (i = 0; i < nsegs; i++) {
+ addr = &segs[i];
+ hash = inet6_addr_hash(net, addr);
+
+ hash_found = false;
+ hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) {
+
+ if (ipv6_addr_equal(&ifp->addr, addr)) {
+ hash_found = true;
+ break;
+ }
+ }
+
+ if (hash_found) {
+ if (found > 1 && separated) {
+ ret = 1;
+ break;
+ }
+
+ separated = false;
+ found++;
+ } else {
+ separated = true;
+ }
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+
/*
* Periodic address status verification
*/
-static void addrconf_verify_rtnl(void)
+static void addrconf_verify_rtnl(struct net *net)
{
unsigned long now, next, next_sec, next_sched;
struct inet6_ifaddr *ifp;
@@ -4347,11 +4617,11 @@ static void addrconf_verify_rtnl(void)
now = jiffies;
next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY);
- cancel_delayed_work(&addr_chk_work);
+ cancel_delayed_work(&net->ipv6.addr_chk_work);
for (i = 0; i < IN6_ADDR_HSIZE; i++) {
restart:
- hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[i], addr_lst) {
+ hlist_for_each_entry_rcu_bh(ifp, &net->ipv6.inet6_addr_lst[i], addr_lst) {
unsigned long age;
/* When setting preferred_lft to a value not zero or
@@ -4366,11 +4636,44 @@ restart:
/* We try to batch several events at once. */
age = (now - ifp->tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ;
+ if ((ifp->flags&IFA_F_TEMPORARY) &&
+ !(ifp->flags&IFA_F_TENTATIVE) &&
+ ifp->prefered_lft != INFINITY_LIFE_TIME &&
+ !ifp->regen_count && ifp->ifpub) {
+ /* This is a non-regenerated temporary addr. */
+
+ unsigned long regen_advance = ipv6_get_regen_advance(ifp->idev);
+
+ if (age + regen_advance >= ifp->prefered_lft) {
+ struct inet6_ifaddr *ifpub = ifp->ifpub;
+ if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
+ next = ifp->tstamp + ifp->prefered_lft * HZ;
+
+ ifp->regen_count++;
+ in6_ifa_hold(ifp);
+ in6_ifa_hold(ifpub);
+ spin_unlock(&ifp->lock);
+
+ spin_lock(&ifpub->lock);
+ ifpub->regen_count = 0;
+ spin_unlock(&ifpub->lock);
+ rcu_read_unlock_bh();
+ ipv6_create_tempaddr(ifpub, true);
+ in6_ifa_put(ifpub);
+ in6_ifa_put(ifp);
+ rcu_read_lock_bh();
+ goto restart;
+ } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
+ next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ;
+ }
+
if (ifp->valid_lft != INFINITY_LIFE_TIME &&
age >= ifp->valid_lft) {
spin_unlock(&ifp->lock);
in6_ifa_hold(ifp);
+ rcu_read_unlock_bh();
ipv6_del_addr(ifp);
+ rcu_read_lock_bh();
goto restart;
} else if (ifp->prefered_lft == INFINITY_LIFE_TIME) {
spin_unlock(&ifp->lock);
@@ -4397,35 +4700,6 @@ restart:
in6_ifa_put(ifp);
goto restart;
}
- } else if ((ifp->flags&IFA_F_TEMPORARY) &&
- !(ifp->flags&IFA_F_TENTATIVE)) {
- unsigned long regen_advance = ifp->idev->cnf.regen_max_retry *
- ifp->idev->cnf.dad_transmits *
- NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME) / HZ;
-
- if (age >= ifp->prefered_lft - regen_advance) {
- struct inet6_ifaddr *ifpub = ifp->ifpub;
- if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
- next = ifp->tstamp + ifp->prefered_lft * HZ;
- if (!ifp->regen_count && ifpub) {
- ifp->regen_count++;
- in6_ifa_hold(ifp);
- in6_ifa_hold(ifpub);
- spin_unlock(&ifp->lock);
-
- spin_lock(&ifpub->lock);
- ifpub->regen_count = 0;
- spin_unlock(&ifpub->lock);
- rcu_read_unlock_bh();
- ipv6_create_tempaddr(ifpub, ifp, true);
- in6_ifa_put(ifpub);
- in6_ifa_put(ifp);
- rcu_read_lock_bh();
- goto restart;
- }
- } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
- next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ;
- spin_unlock(&ifp->lock);
} else {
/* ifp->prefered_lft <= ifp->valid_lft */
if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
@@ -4448,20 +4722,23 @@ restart:
pr_debug("now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n",
now, next, next_sec, next_sched);
- mod_delayed_work(addrconf_wq, &addr_chk_work, next_sched - now);
+ mod_delayed_work(addrconf_wq, &net->ipv6.addr_chk_work, next_sched - now);
rcu_read_unlock_bh();
}
static void addrconf_verify_work(struct work_struct *w)
{
- rtnl_lock();
- addrconf_verify_rtnl();
- rtnl_unlock();
+ struct net *net = container_of(to_delayed_work(w), struct net,
+ ipv6.addr_chk_work);
+
+ rtnl_net_lock(net);
+ addrconf_verify_rtnl(net);
+ rtnl_net_unlock(net);
}
-static void addrconf_verify(void)
+static void addrconf_verify(struct net *net)
{
- mod_delayed_work(addrconf_wq, &addr_chk_work, 0);
+ mod_delayed_work(addrconf_wq, &net->ipv6.addr_chk_work, 0);
}
static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local,
@@ -4489,6 +4766,8 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = {
[IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
[IFA_FLAGS] = { .len = sizeof(u32) },
[IFA_RT_PRIORITY] = { .len = sizeof(u32) },
+ [IFA_TARGET_NETNSID] = { .type = NLA_S32 },
+ [IFA_PROTO] = { .type = NLA_U8 },
};
static int
@@ -4502,8 +4781,8 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
u32 ifa_flags;
int err;
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy,
- extack);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv6_policy, extack);
if (err < 0)
return err;
@@ -4512,61 +4791,73 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!pfx)
return -EINVAL;
- ifa_flags = tb[IFA_FLAGS] ? nla_get_u32(tb[IFA_FLAGS]) : ifm->ifa_flags;
+ ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags);
/* We ignore other flags so far. */
ifa_flags &= IFA_F_MANAGETEMPADDR;
- return inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx,
- ifm->ifa_prefixlen);
+ rtnl_net_lock(net);
+ err = inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx,
+ ifm->ifa_prefixlen, extack);
+ rtnl_net_unlock(net);
+
+ return err;
}
-static int modify_prefix_route(struct inet6_ifaddr *ifp,
- unsigned long expires, u32 flags)
+static int modify_prefix_route(struct net *net, struct inet6_ifaddr *ifp,
+ unsigned long expires, u32 flags,
+ bool modify_peer)
{
+ struct fib6_table *table;
struct fib6_info *f6i;
u32 prio;
- f6i = addrconf_get_prefix_route(&ifp->addr,
+ f6i = addrconf_get_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr,
ifp->prefix_len,
- ifp->idev->dev,
- 0, RTF_GATEWAY | RTF_DEFAULT);
+ ifp->idev->dev, 0, RTF_DEFAULT, true);
if (!f6i)
return -ENOENT;
prio = ifp->rt_priority ? : IP6_RT_PRIO_ADDRCONF;
if (f6i->fib6_metric != prio) {
/* delete old one */
- ip6_del_rt(dev_net(ifp->idev->dev), f6i);
+ ip6_del_rt(dev_net(ifp->idev->dev), f6i, false);
/* add new one */
- addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+ addrconf_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr,
+ ifp->prefix_len,
ifp->rt_priority, ifp->idev->dev,
expires, flags, GFP_KERNEL);
- } else {
- if (!expires)
+ return 0;
+ }
+ if (f6i != net->ipv6.fib6_null_entry) {
+ table = f6i->fib6_table;
+ spin_lock_bh(&table->tb6_lock);
+
+ if (!(flags & RTF_EXPIRES)) {
fib6_clean_expires(f6i);
- else
+ fib6_remove_gc_list(f6i);
+ } else {
fib6_set_expires(f6i, expires);
+ fib6_add_gc_list(f6i);
+ }
- fib6_info_release(f6i);
+ spin_unlock_bh(&table->tb6_lock);
}
+ fib6_info_release(f6i);
return 0;
}
-static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
+static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp,
+ struct ifa6_config *cfg, clock_t expires,
+ u32 flags)
{
- u32 flags;
- clock_t expires;
- unsigned long timeout;
bool was_managetempaddr;
+ bool new_peer = false;
bool had_prefixroute;
- ASSERT_RTNL();
-
- if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft)
- return -EINVAL;
+ ASSERT_RTNL_NET(net);
if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR &&
(ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64))
@@ -4575,22 +4866,11 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED)
cfg->ifa_flags &= ~IFA_F_OPTIMISTIC;
- timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
- if (addrconf_finite_timeout(timeout)) {
- expires = jiffies_to_clock_t(timeout * HZ);
- cfg->valid_lft = timeout;
- flags = RTF_EXPIRES;
- } else {
- expires = 0;
- flags = 0;
- cfg->ifa_flags |= IFA_F_PERMANENT;
- }
-
- timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
- if (addrconf_finite_timeout(timeout)) {
- if (timeout == 0)
- cfg->ifa_flags |= IFA_F_DEPRECATED;
- cfg->preferred_lft = timeout;
+ if (cfg->peer_pfx &&
+ memcmp(&ifp->peer_addr, cfg->peer_pfx, sizeof(struct in6_addr))) {
+ if (!ipv6_addr_any(&ifp->peer_addr))
+ cleanup_prefix_route(ifp, expires, true, true);
+ new_peer = true;
}
spin_lock_bh(&ifp->lock);
@@ -4601,12 +4881,16 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
IFA_F_HOMEADDRESS | IFA_F_MANAGETEMPADDR |
IFA_F_NOPREFIXROUTE);
ifp->flags |= cfg->ifa_flags;
- ifp->tstamp = jiffies;
- ifp->valid_lft = cfg->valid_lft;
- ifp->prefered_lft = cfg->preferred_lft;
+ WRITE_ONCE(ifp->tstamp, jiffies);
+ WRITE_ONCE(ifp->valid_lft, cfg->valid_lft);
+ WRITE_ONCE(ifp->prefered_lft, cfg->preferred_lft);
+ WRITE_ONCE(ifp->ifa_proto, cfg->ifa_proto);
if (cfg->rt_priority && cfg->rt_priority != ifp->rt_priority)
- ifp->rt_priority = cfg->rt_priority;
+ WRITE_ONCE(ifp->rt_priority, cfg->rt_priority);
+
+ if (new_peer)
+ ifp->peer_addr = *cfg->peer_pfx;
spin_unlock_bh(&ifp->lock);
if (!(ifp->flags&IFA_F_TENTATIVE))
@@ -4616,7 +4900,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
int rc = -ENOENT;
if (had_prefixroute)
- rc = modify_prefix_route(ifp, expires, flags);
+ rc = modify_prefix_route(net, ifp, expires, flags, false);
/* prefix route could have been deleted; if so restore it */
if (rc == -ENOENT) {
@@ -4624,6 +4908,15 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
ifp->rt_priority, ifp->idev->dev,
expires, flags, GFP_KERNEL);
}
+
+ if (had_prefixroute && !ipv6_addr_any(&ifp->peer_addr))
+ rc = modify_prefix_route(net, ifp, expires, flags, true);
+
+ if (rc == -ENOENT && !ipv6_addr_any(&ifp->peer_addr)) {
+ addrconf_prefix_route(&ifp->peer_addr, ifp->prefix_len,
+ ifp->rt_priority, ifp->idev->dev,
+ expires, flags, GFP_KERNEL);
+ }
} else if (had_prefixroute) {
enum cleanup_prefix_rt_t action;
unsigned long rt_expires;
@@ -4634,22 +4927,20 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg)
if (action != CLEANUP_PREFIX_RT_NOP) {
cleanup_prefix_route(ifp, rt_expires,
- action == CLEANUP_PREFIX_RT_DEL);
+ action == CLEANUP_PREFIX_RT_DEL, false);
}
}
if (was_managetempaddr || ifp->flags & IFA_F_MANAGETEMPADDR) {
- if (was_managetempaddr &&
- !(ifp->flags & IFA_F_MANAGETEMPADDR)) {
- cfg->valid_lft = 0;
- cfg->preferred_lft = 0;
- }
- manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft,
- cfg->preferred_lft, !was_managetempaddr,
- jiffies);
+ if (was_managetempaddr && !(ifp->flags & IFA_F_MANAGETEMPADDR))
+ delete_tempaddrs(ifp->idev, ifp);
+ else
+ manage_tempaddrs(ifp->idev, ifp, cfg->valid_lft,
+ cfg->preferred_lft, !was_managetempaddr,
+ jiffies);
}
- addrconf_verify_rtnl();
+ addrconf_verify_rtnl(net);
return 0;
}
@@ -4659,17 +4950,20 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
- struct ifaddrmsg *ifm;
struct nlattr *tb[IFA_MAX+1];
struct in6_addr *peer_pfx;
struct inet6_ifaddr *ifa;
struct net_device *dev;
struct inet6_dev *idev;
struct ifa6_config cfg;
+ struct ifaddrmsg *ifm;
+ unsigned long timeout;
+ clock_t expires;
+ u32 flags;
int err;
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy,
- extack);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv6_policy, extack);
if (err < 0)
return err;
@@ -4685,8 +4979,21 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[IFA_RT_PRIORITY])
cfg.rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
+ if (tb[IFA_PROTO])
+ cfg.ifa_proto = nla_get_u8(tb[IFA_PROTO]);
+
+ cfg.ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags);
+
+ /* We ignore other flags so far. */
+ cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS |
+ IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE |
+ IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;
+
+ cfg.ifa_flags |= IFA_F_PERMANENT;
cfg.valid_lft = INFINITY_LIFE_TIME;
cfg.preferred_lft = INFINITY_LIFE_TIME;
+ expires = 0;
+ flags = 0;
if (tb[IFA_CACHEINFO]) {
struct ifa_cacheinfo *ci;
@@ -4694,25 +5001,44 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
ci = nla_data(tb[IFA_CACHEINFO]);
cfg.valid_lft = ci->ifa_valid;
cfg.preferred_lft = ci->ifa_prefered;
- }
- dev = __dev_get_by_index(net, ifm->ifa_index);
- if (!dev)
- return -ENODEV;
+ if (!cfg.valid_lft || cfg.preferred_lft > cfg.valid_lft) {
+ NL_SET_ERR_MSG_MOD(extack, "address lifetime invalid");
+ return -EINVAL;
+ }
- if (tb[IFA_FLAGS])
- cfg.ifa_flags = nla_get_u32(tb[IFA_FLAGS]);
- else
- cfg.ifa_flags = ifm->ifa_flags;
+ timeout = addrconf_timeout_fixup(cfg.valid_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ cfg.ifa_flags &= ~IFA_F_PERMANENT;
+ cfg.valid_lft = timeout;
+ expires = jiffies_to_clock_t(timeout * HZ);
+ flags = RTF_EXPIRES;
+ }
- /* We ignore other flags so far. */
- cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS |
- IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE |
- IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;
+ timeout = addrconf_timeout_fixup(cfg.preferred_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ if (timeout == 0)
+ cfg.ifa_flags |= IFA_F_DEPRECATED;
+ cfg.preferred_lft = timeout;
+ }
+ }
+
+ rtnl_net_lock(net);
+
+ dev = __dev_get_by_index(net, ifm->ifa_index);
+ if (!dev) {
+ NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface");
+ err = -ENODEV;
+ goto unlock_rtnl;
+ }
+
+ netdev_lock_ops(dev);
idev = ipv6_find_idev(dev);
- if (IS_ERR(idev))
- return PTR_ERR(idev);
+ if (IS_ERR(idev)) {
+ err = PTR_ERR(idev);
+ goto unlock;
+ }
if (!ipv6_allow_optimistic_dad(net, idev))
cfg.ifa_flags &= ~IFA_F_OPTIMISTIC;
@@ -4720,7 +5046,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
if (cfg.ifa_flags & IFA_F_NODAD &&
cfg.ifa_flags & IFA_F_OPTIMISTIC) {
NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive");
- return -EINVAL;
+ err = -EINVAL;
+ goto unlock;
}
ifa = ipv6_get_ifaddr(net, cfg.pfx, dev, 1);
@@ -4729,16 +5056,23 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
* It would be best to check for !NLM_F_CREATE here but
* userspace already relies on not having to provide this.
*/
- return inet6_addr_add(net, ifm->ifa_index, &cfg, extack);
+ err = inet6_addr_add(net, dev, &cfg, expires, flags, extack);
+ goto unlock;
}
if (nlh->nlmsg_flags & NLM_F_EXCL ||
- !(nlh->nlmsg_flags & NLM_F_REPLACE))
+ !(nlh->nlmsg_flags & NLM_F_REPLACE)) {
+ NL_SET_ERR_MSG_MOD(extack, "address already assigned");
err = -EEXIST;
- else
- err = inet6_addr_modify(ifa, &cfg);
+ } else {
+ err = inet6_addr_modify(net, ifa, &cfg, expires, flags);
+ }
in6_ifa_put(ifa);
+unlock:
+ netdev_unlock_ops(dev);
+unlock_rtnl:
+ rtnl_net_unlock(net);
return err;
}
@@ -4788,28 +5122,40 @@ static inline int inet6_ifaddr_msgsize(void)
+ nla_total_size(16) /* IFA_ADDRESS */
+ nla_total_size(sizeof(struct ifa_cacheinfo))
+ nla_total_size(4) /* IFA_FLAGS */
+ + nla_total_size(1) /* IFA_PROTO */
+ nla_total_size(4) /* IFA_RT_PRIORITY */;
}
-static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
- u32 portid, u32 seq, int event, unsigned int flags)
+static int inet6_fill_ifaddr(struct sk_buff *skb,
+ const struct inet6_ifaddr *ifa,
+ struct inet6_fill_args *args)
{
- struct nlmsghdr *nlh;
+ struct nlmsghdr *nlh;
u32 preferred, valid;
+ u32 flags, priority;
+ u8 proto;
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags);
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
+ sizeof(struct ifaddrmsg), args->flags);
if (!nlh)
return -EMSGSIZE;
+ flags = READ_ONCE(ifa->flags);
put_ifaddrmsg(nlh, ifa->prefix_len, ifa->flags, rt_scope(ifa->scope),
ifa->idev->dev->ifindex);
- if (!((ifa->flags&IFA_F_PERMANENT) &&
- (ifa->prefered_lft == INFINITY_LIFE_TIME))) {
- preferred = ifa->prefered_lft;
- valid = ifa->valid_lft;
+ if (args->netnsid >= 0 &&
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid))
+ goto error;
+
+ preferred = READ_ONCE(ifa->prefered_lft);
+ valid = READ_ONCE(ifa->valid_lft);
+
+ if (!((flags & IFA_F_PERMANENT) &&
+ (preferred == INFINITY_LIFE_TIME))) {
if (preferred != INFINITY_LIFE_TIME) {
- long tval = (jiffies - ifa->tstamp)/HZ;
+ long tval = (jiffies - READ_ONCE(ifa->tstamp)) / HZ;
+
if (preferred > tval)
preferred -= tval;
else
@@ -4830,18 +5176,24 @@ static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa,
if (nla_put_in6_addr(skb, IFA_LOCAL, &ifa->addr) < 0 ||
nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->peer_addr) < 0)
goto error;
- } else
+ } else {
if (nla_put_in6_addr(skb, IFA_ADDRESS, &ifa->addr) < 0)
goto error;
+ }
- if (ifa->rt_priority &&
- nla_put_u32(skb, IFA_RT_PRIORITY, ifa->rt_priority))
+ priority = READ_ONCE(ifa->rt_priority);
+ if (priority && nla_put_u32(skb, IFA_RT_PRIORITY, priority))
goto error;
- if (put_cacheinfo(skb, ifa->cstamp, ifa->tstamp, preferred, valid) < 0)
+ if (put_cacheinfo(skb, ifa->cstamp, READ_ONCE(ifa->tstamp),
+ preferred, valid) < 0)
goto error;
- if (nla_put_u32(skb, IFA_FLAGS, ifa->flags) < 0)
+ if (nla_put_u32(skb, IFA_FLAGS, flags) < 0)
+ goto error;
+
+ proto = READ_ONCE(ifa->ifa_proto);
+ if (proto && nla_put_u8(skb, IFA_PROTO, proto))
goto error;
nlmsg_end(skb, nlh);
@@ -4852,23 +5204,32 @@ error:
return -EMSGSIZE;
}
-static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
- u32 portid, u32 seq, int event, u16 flags)
+int inet6_fill_ifmcaddr(struct sk_buff *skb,
+ const struct ifmcaddr6 *ifmca,
+ struct inet6_fill_args *args)
{
- struct nlmsghdr *nlh;
- u8 scope = RT_SCOPE_UNIVERSE;
int ifindex = ifmca->idev->dev->ifindex;
+ u8 scope = RT_SCOPE_UNIVERSE;
+ struct nlmsghdr *nlh;
- if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE)
+ if (!args->force_rt_scope_universe &&
+ ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE)
scope = RT_SCOPE_SITE;
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags);
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
+ sizeof(struct ifaddrmsg), args->flags);
if (!nlh)
return -EMSGSIZE;
+ if (args->netnsid >= 0 &&
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
if (nla_put_in6_addr(skb, IFA_MULTICAST, &ifmca->mca_addr) < 0 ||
- put_cacheinfo(skb, ifmca->mca_cstamp, ifmca->mca_tstamp,
+ put_cacheinfo(skb, ifmca->mca_cstamp, READ_ONCE(ifmca->mca_tstamp),
INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) {
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
@@ -4878,24 +5239,32 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca,
return 0;
}
-static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
- u32 portid, u32 seq, int event, unsigned int flags)
+int inet6_fill_ifacaddr(struct sk_buff *skb,
+ const struct ifacaddr6 *ifaca,
+ struct inet6_fill_args *args)
{
struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt);
int ifindex = dev ? dev->ifindex : 1;
- struct nlmsghdr *nlh;
u8 scope = RT_SCOPE_UNIVERSE;
+ struct nlmsghdr *nlh;
if (ipv6_addr_scope(&ifaca->aca_addr) & IFA_SITE)
scope = RT_SCOPE_SITE;
- nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct ifaddrmsg), flags);
+ nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
+ sizeof(struct ifaddrmsg), args->flags);
if (!nlh)
return -EMSGSIZE;
+ if (args->netnsid >= 0 &&
+ nla_put_s32(skb, IFA_TARGET_NETNSID, args->netnsid)) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
put_ifaddrmsg(nlh, 128, IFA_F_PERMANENT, scope, ifindex);
if (nla_put_in6_addr(skb, IFA_ANYCAST, &ifaca->aca_addr) < 0 ||
- put_cacheinfo(skb, ifaca->aca_cstamp, ifaca->aca_tstamp,
+ put_cacheinfo(skb, ifaca->aca_cstamp, READ_ONCE(ifaca->aca_tstamp),
INFINITY_LIFE_TIME, INFINITY_LIFE_TIME) < 0) {
nlmsg_cancel(skb, nlh);
return -EMSGSIZE;
@@ -4905,68 +5274,56 @@ static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca,
return 0;
}
-enum addr_type_t {
- UNICAST_ADDR,
- MULTICAST_ADDR,
- ANYCAST_ADDR,
-};
-
/* called with rcu_read_lock() */
-static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
- struct netlink_callback *cb, enum addr_type_t type,
- int s_ip_idx, int *p_ip_idx)
+static int in6_dump_addrs(const struct inet6_dev *idev, struct sk_buff *skb,
+ struct netlink_callback *cb, int *s_ip_idx,
+ struct inet6_fill_args *fillargs)
{
- struct ifmcaddr6 *ifmca;
- struct ifacaddr6 *ifaca;
- int err = 1;
- int ip_idx = *p_ip_idx;
+ const struct ifmcaddr6 *ifmca;
+ const struct ifacaddr6 *ifaca;
+ int ip_idx = 0;
+ int err = 0;
- read_lock_bh(&idev->lock);
- switch (type) {
+ switch (fillargs->type) {
case UNICAST_ADDR: {
- struct inet6_ifaddr *ifa;
+ const struct inet6_ifaddr *ifa;
+ fillargs->event = RTM_NEWADDR;
/* unicast address incl. temp addr */
- list_for_each_entry(ifa, &idev->addr_list, if_list) {
- if (++ip_idx < s_ip_idx)
- continue;
- err = inet6_fill_ifaddr(skb, ifa,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWADDR,
- NLM_F_MULTI);
+ list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) {
+ if (ip_idx < *s_ip_idx)
+ goto next;
+ err = inet6_fill_ifaddr(skb, ifa, fillargs);
if (err < 0)
break;
nl_dump_check_consistent(cb, nlmsg_hdr(skb));
+next:
+ ip_idx++;
}
break;
}
case MULTICAST_ADDR:
+ fillargs->event = RTM_GETMULTICAST;
+
/* multicast address */
- for (ifmca = idev->mc_list; ifmca;
- ifmca = ifmca->next, ip_idx++) {
- if (ip_idx < s_ip_idx)
+ for (ifmca = rcu_dereference(idev->mc_list);
+ ifmca;
+ ifmca = rcu_dereference(ifmca->next), ip_idx++) {
+ if (ip_idx < *s_ip_idx)
continue;
- err = inet6_fill_ifmcaddr(skb, ifmca,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_GETMULTICAST,
- NLM_F_MULTI);
+ err = inet6_fill_ifmcaddr(skb, ifmca, fillargs);
if (err < 0)
break;
}
break;
case ANYCAST_ADDR:
+ fillargs->event = RTM_GETANYCAST;
/* anycast address */
- for (ifaca = idev->ac_list; ifaca;
- ifaca = ifaca->aca_next, ip_idx++) {
- if (ip_idx < s_ip_idx)
+ for (ifaca = rcu_dereference(idev->ac_list); ifaca;
+ ifaca = rcu_dereference(ifaca->aca_next), ip_idx++) {
+ if (ip_idx < *s_ip_idx)
continue;
- err = inet6_fill_ifacaddr(skb, ifaca,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_GETANYCAST,
- NLM_F_MULTI);
+ err = inet6_fill_ifacaddr(skb, ifaca, fillargs);
if (err < 0)
break;
}
@@ -4974,55 +5331,126 @@ static int in6_dump_addrs(struct inet6_dev *idev, struct sk_buff *skb,
default:
break;
}
- read_unlock_bh(&idev->lock);
- *p_ip_idx = ip_idx;
+ *s_ip_idx = err ? ip_idx : 0;
return err;
}
+static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh,
+ struct inet6_fill_args *fillargs,
+ struct net **tgt_net, struct sock *sk,
+ struct netlink_callback *cb)
+{
+ struct netlink_ext_ack *extack = cb->extack;
+ struct nlattr *tb[IFA_MAX+1];
+ struct ifaddrmsg *ifm;
+ int err, i;
+
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for address dump request");
+ return -EINVAL;
+ }
+
+ if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address dump request");
+ return -EINVAL;
+ }
+
+ fillargs->ifindex = ifm->ifa_index;
+ if (fillargs->ifindex) {
+ cb->answer_flags |= NLM_F_DUMP_FILTERED;
+ fillargs->flags |= NLM_F_DUMP_FILTERED;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv6_policy, extack);
+ if (err < 0)
+ return err;
+
+ for (i = 0; i <= IFA_MAX; ++i) {
+ if (!tb[i])
+ continue;
+
+ if (i == IFA_TARGET_NETNSID) {
+ struct net *net;
+
+ fillargs->netnsid = nla_get_s32(tb[i]);
+ net = rtnl_get_net_ns_capable(sk, fillargs->netnsid);
+ if (IS_ERR(net)) {
+ fillargs->netnsid = -1;
+ NL_SET_ERR_MSG_MOD(extack, "Invalid target network namespace id");
+ return PTR_ERR(net);
+ }
+ *tgt_net = net;
+ } else {
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in dump request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
enum addr_type_t type)
{
- struct net *net = sock_net(skb->sk);
- int h, s_h;
- int idx, ip_idx;
- int s_idx, s_ip_idx;
+ struct net *tgt_net = sock_net(skb->sk);
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct inet6_fill_args fillargs = {
+ .portid = NETLINK_CB(cb->skb).portid,
+ .seq = cb->nlh->nlmsg_seq,
+ .flags = NLM_F_MULTI,
+ .netnsid = -1,
+ .type = type,
+ .force_rt_scope_universe = false,
+ };
+ struct {
+ unsigned long ifindex;
+ int ip_idx;
+ } *ctx = (void *)cb->ctx;
struct net_device *dev;
struct inet6_dev *idev;
- struct hlist_head *head;
-
- s_h = cb->args[0];
- s_idx = idx = cb->args[1];
- s_ip_idx = ip_idx = cb->args[2];
+ int err = 0;
rcu_read_lock();
- cb->seq = atomic_read(&net->ipv6.dev_addr_genid) ^ net->dev_base_seq;
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &net->dev_index_head[h];
- hlist_for_each_entry_rcu(dev, head, index_hlist) {
- if (idx < s_idx)
- goto cont;
- if (h > s_h || idx > s_idx)
- s_ip_idx = 0;
- ip_idx = 0;
- idev = __in6_dev_get(dev);
- if (!idev)
- goto cont;
+ if (cb->strict_check) {
+ err = inet6_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net,
+ skb->sk, cb);
+ if (err < 0)
+ goto done;
- if (in6_dump_addrs(idev, skb, cb, type,
- s_ip_idx, &ip_idx) < 0)
+ err = 0;
+ if (fillargs.ifindex) {
+ dev = dev_get_by_index_rcu(tgt_net, fillargs.ifindex);
+ if (!dev) {
+ err = -ENODEV;
goto done;
-cont:
- idx++;
+ }
+ idev = __in6_dev_get(dev);
+ if (idev)
+ err = in6_dump_addrs(idev, skb, cb,
+ &ctx->ip_idx,
+ &fillargs);
+ goto done;
}
}
+
+ cb->seq = inet6_base_seq(tgt_net);
+ for_each_netdev_dump(tgt_net, dev, ctx->ifindex) {
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ continue;
+ err = in6_dump_addrs(idev, skb, cb, &ctx->ip_idx,
+ &fillargs);
+ if (err < 0)
+ goto done;
+ }
done:
rcu_read_unlock();
- cb->args[0] = h;
- cb->args[1] = idx;
- cb->args[2] = ip_idx;
+ if (fillargs.netnsid >= 0)
+ put_net(tgt_net);
- return skb->len;
+ return err;
}
static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
@@ -5047,10 +5475,64 @@ static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb)
return inet6_dump_addr(skb, cb, type);
}
+static int inet6_rtm_valid_getaddr_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct ifaddrmsg *ifm;
+ int i, err;
+
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for get address request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv6_policy, extack);
+
+ if (ifm->ifa_prefixlen || ifm->ifa_flags || ifm->ifa_scope) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get address request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFA_MAX,
+ ifa_ipv6_policy, extack);
+ if (err)
+ return err;
+
+ for (i = 0; i <= IFA_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case IFA_TARGET_NETNSID:
+ case IFA_ADDRESS:
+ case IFA_LOCAL:
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get address request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
- struct net *net = sock_net(in_skb->sk);
+ struct net *tgt_net = sock_net(in_skb->sk);
+ struct inet6_fill_args fillargs = {
+ .portid = NETLINK_CB(in_skb).portid,
+ .seq = nlh->nlmsg_seq,
+ .event = RTM_NEWADDR,
+ .flags = 0,
+ .netnsid = -1,
+ .force_rt_scope_universe = false,
+ };
struct ifaddrmsg *ifm;
struct nlattr *tb[IFA_MAX+1];
struct in6_addr *addr = NULL, *peer;
@@ -5059,20 +5541,29 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct sk_buff *skb;
int err;
- err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy,
- extack);
+ err = inet6_rtm_valid_getaddr_req(in_skb, nlh, tb, extack);
if (err < 0)
return err;
- addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer);
- if (!addr)
- return -EINVAL;
+ if (tb[IFA_TARGET_NETNSID]) {
+ fillargs.netnsid = nla_get_s32(tb[IFA_TARGET_NETNSID]);
+
+ tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(in_skb).sk,
+ fillargs.netnsid);
+ if (IS_ERR(tgt_net))
+ return PTR_ERR(tgt_net);
+ }
+ addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer);
+ if (!addr) {
+ err = -EINVAL;
+ goto errout;
+ }
ifm = nlmsg_data(nlh);
if (ifm->ifa_index)
- dev = dev_get_by_index(net, ifm->ifa_index);
+ dev = dev_get_by_index(tgt_net, ifm->ifa_index);
- ifa = ipv6_get_ifaddr(net, addr, dev, 1);
+ ifa = ipv6_get_ifaddr(tgt_net, addr, dev, 1);
if (!ifa) {
err = -EADDRNOTAVAIL;
goto errout;
@@ -5084,20 +5575,21 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
goto errout_ifa;
}
- err = inet6_fill_ifaddr(skb, ifa, NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, RTM_NEWADDR, 0);
+ err = inet6_fill_ifaddr(skb, ifa, &fillargs);
if (err < 0) {
/* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
WARN_ON(err == -EMSGSIZE);
kfree_skb(skb);
goto errout_ifa;
}
- err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+ err = rtnl_unicast(skb, tgt_net, NETLINK_CB(in_skb).portid);
errout_ifa:
in6_ifa_put(ifa);
errout:
- if (dev)
- dev_put(dev);
+ dev_put(dev);
+ if (fillargs.netnsid >= 0)
+ put_net(tgt_net);
+
return err;
}
@@ -5105,13 +5597,21 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
{
struct sk_buff *skb;
struct net *net = dev_net(ifa->idev->dev);
+ struct inet6_fill_args fillargs = {
+ .portid = 0,
+ .seq = 0,
+ .event = event,
+ .flags = 0,
+ .netnsid = -1,
+ .force_rt_scope_universe = false,
+ };
int err = -ENOBUFS;
skb = nlmsg_new(inet6_ifaddr_msgsize(), GFP_ATOMIC);
if (!skb)
goto errout;
- err = inet6_fill_ifaddr(skb, ifa, 0, 0, event, 0);
+ err = inet6_fill_ifaddr(skb, ifa, &fillargs);
if (err < 0) {
/* -EMSGSIZE implies BUG in inet6_ifaddr_msgsize() */
WARN_ON(err == -EMSGSIZE);
@@ -5121,83 +5621,101 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
return;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err);
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err);
}
-static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
- __s32 *array, int bytes)
+static void ipv6_store_devconf(const struct ipv6_devconf *cnf,
+ __s32 *array, int bytes)
{
BUG_ON(bytes < (DEVCONF_MAX * 4));
memset(array, 0, bytes);
- array[DEVCONF_FORWARDING] = cnf->forwarding;
- array[DEVCONF_HOPLIMIT] = cnf->hop_limit;
- array[DEVCONF_MTU6] = cnf->mtu6;
- array[DEVCONF_ACCEPT_RA] = cnf->accept_ra;
- array[DEVCONF_ACCEPT_REDIRECTS] = cnf->accept_redirects;
- array[DEVCONF_AUTOCONF] = cnf->autoconf;
- array[DEVCONF_DAD_TRANSMITS] = cnf->dad_transmits;
- array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits;
+ array[DEVCONF_FORWARDING] = READ_ONCE(cnf->forwarding);
+ array[DEVCONF_HOPLIMIT] = READ_ONCE(cnf->hop_limit);
+ array[DEVCONF_MTU6] = READ_ONCE(cnf->mtu6);
+ array[DEVCONF_ACCEPT_RA] = READ_ONCE(cnf->accept_ra);
+ array[DEVCONF_ACCEPT_REDIRECTS] = READ_ONCE(cnf->accept_redirects);
+ array[DEVCONF_AUTOCONF] = READ_ONCE(cnf->autoconf);
+ array[DEVCONF_DAD_TRANSMITS] = READ_ONCE(cnf->dad_transmits);
+ array[DEVCONF_RTR_SOLICITS] = READ_ONCE(cnf->rtr_solicits);
array[DEVCONF_RTR_SOLICIT_INTERVAL] =
- jiffies_to_msecs(cnf->rtr_solicit_interval);
+ jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_interval));
array[DEVCONF_RTR_SOLICIT_MAX_INTERVAL] =
- jiffies_to_msecs(cnf->rtr_solicit_max_interval);
+ jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_max_interval));
array[DEVCONF_RTR_SOLICIT_DELAY] =
- jiffies_to_msecs(cnf->rtr_solicit_delay);
- array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version;
+ jiffies_to_msecs(READ_ONCE(cnf->rtr_solicit_delay));
+ array[DEVCONF_FORCE_MLD_VERSION] = READ_ONCE(cnf->force_mld_version);
array[DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL] =
- jiffies_to_msecs(cnf->mldv1_unsolicited_report_interval);
+ jiffies_to_msecs(READ_ONCE(cnf->mldv1_unsolicited_report_interval));
array[DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL] =
- jiffies_to_msecs(cnf->mldv2_unsolicited_report_interval);
- array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr;
- array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft;
- array[DEVCONF_TEMP_PREFERED_LFT] = cnf->temp_prefered_lft;
- array[DEVCONF_REGEN_MAX_RETRY] = cnf->regen_max_retry;
- array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor;
- array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses;
- array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr;
- array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = cnf->accept_ra_min_hop_limit;
- array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo;
+ jiffies_to_msecs(READ_ONCE(cnf->mldv2_unsolicited_report_interval));
+ array[DEVCONF_USE_TEMPADDR] = READ_ONCE(cnf->use_tempaddr);
+ array[DEVCONF_TEMP_VALID_LFT] = READ_ONCE(cnf->temp_valid_lft);
+ array[DEVCONF_TEMP_PREFERED_LFT] = READ_ONCE(cnf->temp_prefered_lft);
+ array[DEVCONF_REGEN_MAX_RETRY] = READ_ONCE(cnf->regen_max_retry);
+ array[DEVCONF_MAX_DESYNC_FACTOR] = READ_ONCE(cnf->max_desync_factor);
+ array[DEVCONF_MAX_ADDRESSES] = READ_ONCE(cnf->max_addresses);
+ array[DEVCONF_ACCEPT_RA_DEFRTR] = READ_ONCE(cnf->accept_ra_defrtr);
+ array[DEVCONF_RA_DEFRTR_METRIC] = READ_ONCE(cnf->ra_defrtr_metric);
+ array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] =
+ READ_ONCE(cnf->accept_ra_min_hop_limit);
+ array[DEVCONF_ACCEPT_RA_PINFO] = READ_ONCE(cnf->accept_ra_pinfo);
#ifdef CONFIG_IPV6_ROUTER_PREF
- array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref;
+ array[DEVCONF_ACCEPT_RA_RTR_PREF] = READ_ONCE(cnf->accept_ra_rtr_pref);
array[DEVCONF_RTR_PROBE_INTERVAL] =
- jiffies_to_msecs(cnf->rtr_probe_interval);
+ jiffies_to_msecs(READ_ONCE(cnf->rtr_probe_interval));
#ifdef CONFIG_IPV6_ROUTE_INFO
- array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] = cnf->accept_ra_rt_info_min_plen;
- array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen;
+ array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] =
+ READ_ONCE(cnf->accept_ra_rt_info_min_plen);
+ array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] =
+ READ_ONCE(cnf->accept_ra_rt_info_max_plen);
#endif
#endif
- array[DEVCONF_PROXY_NDP] = cnf->proxy_ndp;
- array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route;
+ array[DEVCONF_PROXY_NDP] = READ_ONCE(cnf->proxy_ndp);
+ array[DEVCONF_ACCEPT_SOURCE_ROUTE] =
+ READ_ONCE(cnf->accept_source_route);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
- array[DEVCONF_OPTIMISTIC_DAD] = cnf->optimistic_dad;
- array[DEVCONF_USE_OPTIMISTIC] = cnf->use_optimistic;
+ array[DEVCONF_OPTIMISTIC_DAD] = READ_ONCE(cnf->optimistic_dad);
+ array[DEVCONF_USE_OPTIMISTIC] = READ_ONCE(cnf->use_optimistic);
#endif
#ifdef CONFIG_IPV6_MROUTE
- array[DEVCONF_MC_FORWARDING] = cnf->mc_forwarding;
+ array[DEVCONF_MC_FORWARDING] = atomic_read(&cnf->mc_forwarding);
#endif
- array[DEVCONF_DISABLE_IPV6] = cnf->disable_ipv6;
- array[DEVCONF_ACCEPT_DAD] = cnf->accept_dad;
- array[DEVCONF_FORCE_TLLAO] = cnf->force_tllao;
- array[DEVCONF_NDISC_NOTIFY] = cnf->ndisc_notify;
- array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc;
- array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local;
- array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu;
- array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown;
+ array[DEVCONF_DISABLE_IPV6] = READ_ONCE(cnf->disable_ipv6);
+ array[DEVCONF_ACCEPT_DAD] = READ_ONCE(cnf->accept_dad);
+ array[DEVCONF_FORCE_TLLAO] = READ_ONCE(cnf->force_tllao);
+ array[DEVCONF_NDISC_NOTIFY] = READ_ONCE(cnf->ndisc_notify);
+ array[DEVCONF_SUPPRESS_FRAG_NDISC] =
+ READ_ONCE(cnf->suppress_frag_ndisc);
+ array[DEVCONF_ACCEPT_RA_FROM_LOCAL] =
+ READ_ONCE(cnf->accept_ra_from_local);
+ array[DEVCONF_ACCEPT_RA_MTU] = READ_ONCE(cnf->accept_ra_mtu);
+ array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] =
+ READ_ONCE(cnf->ignore_routes_with_linkdown);
/* we omit DEVCONF_STABLE_SECRET for now */
- array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
- array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
- array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
- array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down;
- array[DEVCONF_SEG6_ENABLED] = cnf->seg6_enabled;
+ array[DEVCONF_USE_OIF_ADDRS_ONLY] = READ_ONCE(cnf->use_oif_addrs_only);
+ array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] =
+ READ_ONCE(cnf->drop_unicast_in_l2_multicast);
+ array[DEVCONF_DROP_UNSOLICITED_NA] = READ_ONCE(cnf->drop_unsolicited_na);
+ array[DEVCONF_KEEP_ADDR_ON_DOWN] = READ_ONCE(cnf->keep_addr_on_down);
+ array[DEVCONF_SEG6_ENABLED] = READ_ONCE(cnf->seg6_enabled);
#ifdef CONFIG_IPV6_SEG6_HMAC
- array[DEVCONF_SEG6_REQUIRE_HMAC] = cnf->seg6_require_hmac;
+ array[DEVCONF_SEG6_REQUIRE_HMAC] = READ_ONCE(cnf->seg6_require_hmac);
#endif
- array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad;
- array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode;
- array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy;
- array[DEVCONF_NDISC_TCLASS] = cnf->ndisc_tclass;
+ array[DEVCONF_ENHANCED_DAD] = READ_ONCE(cnf->enhanced_dad);
+ array[DEVCONF_ADDR_GEN_MODE] = READ_ONCE(cnf->addr_gen_mode);
+ array[DEVCONF_DISABLE_POLICY] = READ_ONCE(cnf->disable_policy);
+ array[DEVCONF_NDISC_TCLASS] = READ_ONCE(cnf->ndisc_tclass);
+ array[DEVCONF_RPL_SEG_ENABLED] = READ_ONCE(cnf->rpl_seg_enabled);
+ array[DEVCONF_IOAM6_ENABLED] = READ_ONCE(cnf->ioam6_enabled);
+ array[DEVCONF_IOAM6_ID] = READ_ONCE(cnf->ioam6_id);
+ array[DEVCONF_IOAM6_ID_WIDE] = READ_ONCE(cnf->ioam6_id_wide);
+ array[DEVCONF_NDISC_EVICT_NOCARRIER] =
+ READ_ONCE(cnf->ndisc_evict_nocarrier);
+ array[DEVCONF_ACCEPT_UNTRACKED_NA] =
+ READ_ONCE(cnf->accept_untracked_na);
+ array[DEVCONF_ACCEPT_RA_MIN_LFT] = READ_ONCE(cnf->accept_ra_min_lft);
+ array[DEVCONF_FORCE_FORWARDING] = READ_ONCE(cnf->force_forwarding);
}
static inline size_t inet6_ifla6_size(void)
@@ -5209,6 +5727,7 @@ static inline size_t inet6_ifla6_size(void)
+ nla_total_size(ICMP6_MIB_MAX * 8) /* IFLA_INET6_ICMP6STATS */
+ nla_total_size(sizeof(struct in6_addr)) /* IFLA_INET6_TOKEN */
+ nla_total_size(1) /* IFLA_INET6_ADDR_GEN_MODE */
+ + nla_total_size(4) /* IFLA_INET6_RA_MTU */
+ 0;
}
@@ -5273,16 +5792,38 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
}
}
+static int inet6_fill_ifla6_stats_attrs(struct sk_buff *skb,
+ struct inet6_dev *idev)
+{
+ struct nlattr *nla;
+
+ nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64));
+ if (!nla)
+ goto nla_put_failure;
+ snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla));
+
+ nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64));
+ if (!nla)
+ goto nla_put_failure;
+ snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla));
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
u32 ext_filter_mask)
{
- struct nlattr *nla;
struct ifla_cacheinfo ci;
+ struct nlattr *nla;
+ u32 ra_mtu;
- if (nla_put_u32(skb, IFLA_INET6_FLAGS, idev->if_flags))
+ if (nla_put_u32(skb, IFLA_INET6_FLAGS, READ_ONCE(idev->if_flags)))
goto nla_put_failure;
ci.max_reasm_len = IPV6_MAXPLEN;
- ci.tstamp = cstamp_delta(idev->tstamp);
+ ci.tstamp = cstamp_delta(READ_ONCE(idev->tstamp));
ci.reachable_time = jiffies_to_msecs(idev->nd_parms->reachable_time);
ci.retrans_time = jiffies_to_msecs(NEIGH_VAR(idev->nd_parms, RETRANS_TIME));
if (nla_put(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci))
@@ -5294,30 +5835,26 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
/* XXX - MC not implemented */
- if (ext_filter_mask & RTEXT_FILTER_SKIP_STATS)
- return 0;
-
- nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64));
- if (!nla)
- goto nla_put_failure;
- snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla));
-
- nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64));
- if (!nla)
- goto nla_put_failure;
- snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla));
+ if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS)) {
+ if (inet6_fill_ifla6_stats_attrs(skb, idev) < 0)
+ goto nla_put_failure;
+ }
nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr));
if (!nla)
goto nla_put_failure;
-
- if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE, idev->cnf.addr_gen_mode))
- goto nla_put_failure;
-
read_lock_bh(&idev->lock);
memcpy(nla_data(nla), idev->token.s6_addr, nla_len(nla));
read_unlock_bh(&idev->lock);
+ if (nla_put_u8(skb, IFLA_INET6_ADDR_GEN_MODE,
+ READ_ONCE(idev->cnf.addr_gen_mode)))
+ goto nla_put_failure;
+
+ ra_mtu = READ_ONCE(idev->ra_mtu);
+ if (ra_mtu && nla_put_u32(skb, IFLA_INET6_RA_MTU, ra_mtu))
+ goto nla_put_failure;
+
return 0;
nla_put_failure:
@@ -5347,7 +5884,8 @@ static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
return 0;
}
-static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
+static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token,
+ struct netlink_ext_ack *extack)
{
struct inet6_ifaddr *ifp;
struct net_device *dev = idev->dev;
@@ -5358,12 +5896,29 @@ static int inet6_set_iftoken(struct inet6_dev *idev, struct in6_addr *token)
if (!token)
return -EINVAL;
- if (dev->flags & (IFF_LOOPBACK | IFF_NOARP))
+
+ if (dev->flags & IFF_LOOPBACK) {
+ NL_SET_ERR_MSG_MOD(extack, "Device is loopback");
return -EINVAL;
- if (!ipv6_accept_ra(idev))
+ }
+
+ if (dev->flags & IFF_NOARP) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Device does not do neighbour discovery");
return -EINVAL;
- if (idev->cnf.rtr_solicits == 0)
+ }
+
+ if (!ipv6_accept_ra(idev)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Router advertisement is disabled on device");
return -EINVAL;
+ }
+
+ if (READ_ONCE(idev->cnf.rtr_solicits) == 0) {
+ NL_SET_ERR_MSG(extack,
+ "Router solicitation is disabled on device");
+ return -EINVAL;
+ }
write_lock_bh(&idev->lock);
@@ -5392,7 +5947,7 @@ update_lft:
if (update_rs) {
idev->if_flags |= IF_RS_SENT;
idev->rs_interval = rfc3315_s14_backoff_init(
- idev->cnf.rtr_solicit_interval);
+ READ_ONCE(idev->cnf.rtr_solicit_interval));
idev->rs_probes = 1;
addrconf_mod_rs_timer(idev, idev->rs_interval);
}
@@ -5409,27 +5964,18 @@ update_lft:
write_unlock_bh(&idev->lock);
inet6_ifinfo_notify(RTM_NEWLINK, idev);
- addrconf_verify_rtnl();
+ addrconf_verify_rtnl(dev_net(dev));
return 0;
}
static const struct nla_policy inet6_af_policy[IFLA_INET6_MAX + 1] = {
[IFLA_INET6_ADDR_GEN_MODE] = { .type = NLA_U8 },
[IFLA_INET6_TOKEN] = { .len = sizeof(struct in6_addr) },
+ [IFLA_INET6_RA_MTU] = { .type = NLA_REJECT,
+ .reject_message =
+ "IFLA_INET6_RA_MTU can not be set" },
};
-static int inet6_validate_link_af(const struct net_device *dev,
- const struct nlattr *nla)
-{
- struct nlattr *tb[IFLA_INET6_MAX + 1];
-
- if (dev && !__in6_dev_get(dev))
- return -EAFNOSUPPORT;
-
- return nla_parse_nested(tb, IFLA_INET6_MAX, nla, inet6_af_policy,
- NULL);
-}
-
static int check_addr_gen_mode(int mode)
{
if (mode != IN6_ADDR_GEN_MODE_EUI64 &&
@@ -5450,20 +5996,56 @@ static int check_stable_privacy(struct inet6_dev *idev, struct net *net,
return 1;
}
-static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
+static int inet6_validate_link_af(const struct net_device *dev,
+ const struct nlattr *nla,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IFLA_INET6_MAX + 1];
+ struct inet6_dev *idev = NULL;
+ int err;
+
+ if (dev) {
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ return -EAFNOSUPPORT;
+ }
+
+ err = nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla,
+ inet6_af_policy, extack);
+ if (err)
+ return err;
+
+ if (!tb[IFLA_INET6_TOKEN] && !tb[IFLA_INET6_ADDR_GEN_MODE])
+ return -EINVAL;
+
+ if (tb[IFLA_INET6_ADDR_GEN_MODE]) {
+ u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]);
+
+ if (check_addr_gen_mode(mode) < 0)
+ return -EINVAL;
+ if (dev && check_stable_privacy(idev, dev_net(dev), mode) < 0)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla,
+ struct netlink_ext_ack *extack)
{
- int err = -EINVAL;
struct inet6_dev *idev = __in6_dev_get(dev);
struct nlattr *tb[IFLA_INET6_MAX + 1];
+ int err;
if (!idev)
return -EAFNOSUPPORT;
- if (nla_parse_nested(tb, IFLA_INET6_MAX, nla, NULL, NULL) < 0)
- BUG();
+ if (nla_parse_nested_deprecated(tb, IFLA_INET6_MAX, nla, NULL, NULL) < 0)
+ return -EINVAL;
if (tb[IFLA_INET6_TOKEN]) {
- err = inet6_set_iftoken(idev, nla_data(tb[IFLA_INET6_TOKEN]));
+ err = inet6_set_iftoken(idev, nla_data(tb[IFLA_INET6_TOKEN]),
+ extack);
if (err)
return err;
}
@@ -5471,15 +6053,10 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
if (tb[IFLA_INET6_ADDR_GEN_MODE]) {
u8 mode = nla_get_u8(tb[IFLA_INET6_ADDR_GEN_MODE]);
- if (check_addr_gen_mode(mode) < 0 ||
- check_stable_privacy(idev, dev_net(dev), mode) < 0)
- return -EINVAL;
-
- idev->cnf.addr_gen_mode = mode;
- err = 0;
+ WRITE_ONCE(idev->cnf.addr_gen_mode, mode);
}
- return err;
+ return 0;
}
static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
@@ -5488,6 +6065,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
struct net_device *dev = idev->dev;
struct ifinfomsg *hdr;
struct nlmsghdr *nlh;
+ int ifindex, iflink;
void *protoinfo;
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags);
@@ -5498,20 +6076,22 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev,
hdr->ifi_family = AF_INET6;
hdr->__ifi_pad = 0;
hdr->ifi_type = dev->type;
- hdr->ifi_index = dev->ifindex;
- hdr->ifi_flags = dev_get_flags(dev);
+ ifindex = READ_ONCE(dev->ifindex);
+ hdr->ifi_index = ifindex;
+ hdr->ifi_flags = netif_get_flags(dev);
hdr->ifi_change = 0;
+ iflink = dev_get_iflink(dev);
if (nla_put_string(skb, IFLA_IFNAME, dev->name) ||
(dev->addr_len &&
nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) ||
- nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
- (dev->ifindex != dev_get_iflink(dev) &&
- nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) ||
+ nla_put_u32(skb, IFLA_MTU, READ_ONCE(dev->mtu)) ||
+ (ifindex != iflink &&
+ nla_put_u32(skb, IFLA_LINK, iflink)) ||
nla_put_u8(skb, IFLA_OPERSTATE,
- netif_running(dev) ? dev->operstate : IF_OPER_DOWN))
+ netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN))
goto nla_put_failure;
- protoinfo = nla_nest_start(skb, IFLA_PROTINFO);
+ protoinfo = nla_nest_start_noflag(skb, IFLA_PROTINFO);
if (!protoinfo)
goto nla_put_failure;
@@ -5527,43 +6107,67 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int inet6_valid_dump_ifinfo(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct ifinfomsg *ifm;
+
+ ifm = nlmsg_payload(nlh, sizeof(*ifm));
+ if (!ifm) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for link dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ifm))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid data after header");
+ return -EINVAL;
+ }
+
+ if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags ||
+ ifm->ifi_change || ifm->ifi_index) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
- int h, s_h;
- int idx = 0, s_idx;
+ struct {
+ unsigned long ifindex;
+ } *ctx = (void *)cb->ctx;
struct net_device *dev;
struct inet6_dev *idev;
- struct hlist_head *head;
+ int err;
+
+ /* only requests using strict checking can pass data to
+ * influence the dump
+ */
+ if (cb->strict_check) {
+ err = inet6_valid_dump_ifinfo(cb->nlh, cb->extack);
- s_h = cb->args[0];
- s_idx = cb->args[1];
+ if (err < 0)
+ return err;
+ }
+ err = 0;
rcu_read_lock();
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &net->dev_index_head[h];
- hlist_for_each_entry_rcu(dev, head, index_hlist) {
- if (idx < s_idx)
- goto cont;
- idev = __in6_dev_get(dev);
- if (!idev)
- goto cont;
- if (inet6_fill_ifinfo(skb, idev,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWLINK, NLM_F_MULTI) < 0)
- goto out;
-cont:
- idx++;
- }
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ continue;
+ err = inet6_fill_ifinfo(skb, idev,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWLINK, NLM_F_MULTI);
+ if (err < 0)
+ break;
}
-out:
rcu_read_unlock();
- cb->args[1] = idx;
- cb->args[0] = h;
- return skb->len;
+ return err;
}
void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
@@ -5586,8 +6190,7 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFINFO, NULL, GFP_ATOMIC);
return;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err);
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_IFINFO, err);
}
static inline size_t inet6_prefix_nlmsg_size(void)
@@ -5617,11 +6220,7 @@ static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev,
pmsg->prefix_len = pinfo->prefix_len;
pmsg->prefix_type = pinfo->type;
pmsg->prefix_pad3 = 0;
- pmsg->prefix_flags = 0;
- if (pinfo->onlink)
- pmsg->prefix_flags |= IF_PREFIX_ONLINK;
- if (pinfo->autoconf)
- pmsg->prefix_flags |= IF_PREFIX_AUTOCONF;
+ pmsg->prefix_flags = pinfo->flags;
if (nla_put(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix))
goto nla_put_failure;
@@ -5658,8 +6257,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC);
return;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err);
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err);
}
static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
@@ -5674,19 +6272,26 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
switch (event) {
case RTM_NEWADDR:
/*
- * If the address was optimistic
- * we inserted the route at the start of
- * our DAD process, so we don't need
- * to do it again
+ * If the address was optimistic we inserted the route at the
+ * start of our DAD process, so we don't need to do it again.
+ * If the device was taken down in the middle of the DAD
+ * cycle there is a race where we could get here without a
+ * host route, so nothing to insert. That will be fixed when
+ * the device is brought up.
*/
- if (!rcu_access_pointer(ifp->rt->fib6_node))
+ if (ifp->rt && !rcu_access_pointer(ifp->rt->fib6_node)) {
ip6_ins_rt(net, ifp->rt);
+ } else if (!ifp->rt && (ifp->idev->dev->flags & IFF_UP)) {
+ pr_warn("BUG: Address %pI6c on device %s is missing its host route.\n",
+ &ifp->addr, ifp->idev->dev->name);
+ }
+
if (ifp->idev->cnf.forwarding)
addrconf_join_anycast(ifp);
if (!ipv6_addr_any(&ifp->peer_addr))
- addrconf_prefix_route(&ifp->peer_addr, 128, 0,
- ifp->idev->dev, 0, 0,
- GFP_ATOMIC);
+ addrconf_prefix_route(&ifp->peer_addr, 128,
+ ifp->rt_priority, ifp->idev->dev,
+ 0, 0, GFP_ATOMIC);
break;
case RTM_DELADDR:
if (ifp->idev->cnf.forwarding)
@@ -5696,12 +6301,13 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
struct fib6_info *rt;
rt = addrconf_get_prefix_route(&ifp->peer_addr, 128,
- ifp->idev->dev, 0, 0);
+ ifp->idev->dev, 0, 0,
+ false);
if (rt)
- ip6_del_rt(net, rt);
+ ip6_del_rt(net, rt, false);
}
if (ifp->rt) {
- ip6_del_rt(net, ifp->rt);
+ ip6_del_rt(net, ifp->rt, false);
ifp->rt = NULL;
}
rt_genid_bump_ipv6(net);
@@ -5712,17 +6318,14 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp)
{
- rcu_read_lock_bh();
if (likely(ifp->idev->dead == 0))
__ipv6_ifa_notify(event, ifp);
- rcu_read_unlock_bh();
}
#ifdef CONFIG_SYSCTL
-static
-int addrconf_sysctl_forward(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int addrconf_sysctl_forward(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
@@ -5746,9 +6349,8 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write,
return ret;
}
-static
-int addrconf_sysctl_mtu(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int addrconf_sysctl_mtu(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct inet6_dev *idev = ctl->extra1;
int min_mtu = IPV6_MIN_MTU;
@@ -5781,46 +6383,46 @@ static void addrconf_disable_change(struct net *net, __s32 newf)
struct inet6_dev *idev;
for_each_netdev(net, dev) {
- idev = __in6_dev_get(dev);
+ idev = __in6_dev_get_rtnl_net(dev);
if (idev) {
int changed = (!idev->cnf.disable_ipv6) ^ (!newf);
- idev->cnf.disable_ipv6 = newf;
+
+ WRITE_ONCE(idev->cnf.disable_ipv6, newf);
if (changed)
dev_disable_change(idev);
}
}
}
-static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf)
+static int addrconf_disable_ipv6(const struct ctl_table *table, int *p, int newf)
{
- struct net *net;
+ struct net *net = (struct net *)table->extra2;
int old;
- if (!rtnl_trylock())
- return restart_syscall();
-
- net = (struct net *)table->extra2;
- old = *p;
- *p = newf;
-
if (p == &net->ipv6.devconf_dflt->disable_ipv6) {
- rtnl_unlock();
+ WRITE_ONCE(*p, newf);
return 0;
}
+ if (!rtnl_net_trylock(net))
+ return restart_syscall();
+
+ old = *p;
+ WRITE_ONCE(*p, newf);
+
if (p == &net->ipv6.devconf_all->disable_ipv6) {
- net->ipv6.devconf_dflt->disable_ipv6 = newf;
+ WRITE_ONCE(net->ipv6.devconf_dflt->disable_ipv6, newf);
addrconf_disable_change(net, newf);
- } else if ((!newf) ^ (!old))
+ } else if ((!newf) ^ (!old)) {
dev_disable_change((struct inet6_dev *)table->extra1);
+ }
- rtnl_unlock();
+ rtnl_net_unlock(net);
return 0;
}
-static
-int addrconf_sysctl_disable(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int addrconf_sysctl_disable(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
@@ -5844,9 +6446,8 @@ int addrconf_sysctl_disable(struct ctl_table *ctl, int write,
return ret;
}
-static
-int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int addrconf_sysctl_proxy_ndp(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int ret;
@@ -5859,20 +6460,20 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
if (write && old != new) {
struct net *net = ctl->extra2;
- if (!rtnl_trylock())
+ if (!rtnl_net_trylock(net))
return restart_syscall();
- if (valp == &net->ipv6.devconf_dflt->proxy_ndp)
+ if (valp == &net->ipv6.devconf_dflt->proxy_ndp) {
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
NETCONFA_PROXY_NEIGH,
NETCONFA_IFINDEX_DEFAULT,
net->ipv6.devconf_dflt);
- else if (valp == &net->ipv6.devconf_all->proxy_ndp)
+ } else if (valp == &net->ipv6.devconf_all->proxy_ndp) {
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
NETCONFA_PROXY_NEIGH,
NETCONFA_IFINDEX_ALL,
net->ipv6.devconf_all);
- else {
+ } else {
struct inet6_dev *idev = ctl->extra1;
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
@@ -5880,14 +6481,14 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write,
idev->dev->ifindex,
&idev->cnf);
}
- rtnl_unlock();
+ rtnl_net_unlock(net);
}
return ret;
}
-static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
+static int addrconf_sysctl_addr_gen_mode(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int ret = 0;
@@ -5900,7 +6501,7 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
.mode = ctl->mode,
};
- if (!rtnl_trylock())
+ if (!rtnl_net_trylock(net))
return restart_syscall();
new_val = *((u32 *)ctl->data);
@@ -5922,34 +6523,39 @@ static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write,
}
if (idev->cnf.addr_gen_mode != new_val) {
- idev->cnf.addr_gen_mode = new_val;
- addrconf_dev_config(idev->dev);
+ WRITE_ONCE(idev->cnf.addr_gen_mode, new_val);
+ netdev_lock_ops(idev->dev);
+ addrconf_init_auto_addrs(idev->dev);
+ netdev_unlock_ops(idev->dev);
}
} else if (&net->ipv6.devconf_all->addr_gen_mode == ctl->data) {
struct net_device *dev;
- net->ipv6.devconf_dflt->addr_gen_mode = new_val;
+ WRITE_ONCE(net->ipv6.devconf_dflt->addr_gen_mode, new_val);
for_each_netdev(net, dev) {
- idev = __in6_dev_get(dev);
+ idev = __in6_dev_get_rtnl_net(dev);
if (idev &&
idev->cnf.addr_gen_mode != new_val) {
- idev->cnf.addr_gen_mode = new_val;
- addrconf_dev_config(idev->dev);
+ WRITE_ONCE(idev->cnf.addr_gen_mode,
+ new_val);
+ netdev_lock_ops(idev->dev);
+ addrconf_init_auto_addrs(idev->dev);
+ netdev_unlock_ops(idev->dev);
}
}
}
- *((u32 *)ctl->data) = new_val;
+ WRITE_ONCE(*((u32 *)ctl->data), new_val);
}
out:
- rtnl_unlock();
+ rtnl_net_unlock(net);
return ret;
}
-static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
+static int addrconf_sysctl_stable_secret(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp,
loff_t *ppos)
{
int err;
@@ -5965,7 +6571,7 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
lctl.maxlen = IPV6_MAX_STRLEN;
lctl.data = str;
- if (!rtnl_trylock())
+ if (!rtnl_net_trylock(net))
return restart_syscall();
if (!write && !secret->initialized) {
@@ -5995,29 +6601,29 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
struct net_device *dev;
for_each_netdev(net, dev) {
- struct inet6_dev *idev = __in6_dev_get(dev);
+ struct inet6_dev *idev = __in6_dev_get_rtnl_net(dev);
if (idev) {
- idev->cnf.addr_gen_mode =
- IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
+ WRITE_ONCE(idev->cnf.addr_gen_mode,
+ IN6_ADDR_GEN_MODE_STABLE_PRIVACY);
}
}
} else {
struct inet6_dev *idev = ctl->extra1;
- idev->cnf.addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
+ WRITE_ONCE(idev->cnf.addr_gen_mode,
+ IN6_ADDR_GEN_MODE_STABLE_PRIVACY);
}
out:
- rtnl_unlock();
+ rtnl_net_unlock(net);
return err;
}
static
-int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl,
- int write,
- void __user *buffer,
+int addrconf_sysctl_ignore_routes_with_linkdown(const struct ctl_table *ctl,
+ int write, void *buffer,
size_t *lenp,
loff_t *ppos)
{
@@ -6062,16 +6668,17 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
list_for_each_entry(ifa, &idev->addr_list, if_list) {
spin_lock(&ifa->lock);
if (ifa->rt) {
- struct fib6_info *rt = ifa->rt;
+ /* host routes only use builtin fib6_nh */
+ struct fib6_nh *nh = ifa->rt->fib6_nh;
int cpu;
rcu_read_lock();
ifa->rt->dst_nopolicy = val ? true : false;
- if (rt->rt6i_pcpu) {
+ if (nh->rt6i_pcpu) {
for_each_possible_cpu(cpu) {
struct rt6_info **rtp;
- rtp = per_cpu_ptr(rt->rt6i_pcpu, cpu);
+ rtp = per_cpu_ptr(nh->rt6i_pcpu, cpu);
addrconf_set_nopolicy(*rtp, val);
}
}
@@ -6083,27 +6690,26 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
}
static
-int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val)
+int addrconf_disable_policy(const struct ctl_table *ctl, int *valp, int val)
{
+ struct net *net = (struct net *)ctl->extra2;
struct inet6_dev *idev;
- struct net *net;
-
- if (!rtnl_trylock())
- return restart_syscall();
- *valp = val;
-
- net = (struct net *)ctl->extra2;
if (valp == &net->ipv6.devconf_dflt->disable_policy) {
- rtnl_unlock();
+ WRITE_ONCE(*valp, val);
return 0;
}
+ if (!rtnl_net_trylock(net))
+ return restart_syscall();
+
+ WRITE_ONCE(*valp, val);
+
if (valp == &net->ipv6.devconf_all->disable_policy) {
struct net_device *dev;
for_each_netdev(net, dev) {
- idev = __in6_dev_get(dev);
+ idev = __in6_dev_get_rtnl_net(dev);
if (idev)
addrconf_disable_policy_idev(idev, val);
}
@@ -6112,14 +6718,12 @@ int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val)
addrconf_disable_policy_idev(idev, val);
}
- rtnl_unlock();
+ rtnl_net_unlock(net);
return 0;
}
-static
-int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static int addrconf_sysctl_disable_policy(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
@@ -6140,10 +6744,78 @@ int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write,
return ret;
}
+static void addrconf_force_forward_change(struct net *net, __s32 newf)
+{
+ struct net_device *dev;
+ struct inet6_dev *idev;
+
+ for_each_netdev(net, dev) {
+ idev = __in6_dev_get_rtnl_net(dev);
+ if (idev) {
+ int changed = (!idev->cnf.force_forwarding) ^ (!newf);
+
+ WRITE_ONCE(idev->cnf.force_forwarding, newf);
+ if (changed)
+ inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
+ NETCONFA_FORCE_FORWARDING,
+ dev->ifindex, &idev->cnf);
+ }
+ }
+}
+
+static int addrconf_sysctl_force_forwarding(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct inet6_dev *idev = ctl->extra1;
+ struct ctl_table tmp_ctl = *ctl;
+ struct net *net = ctl->extra2;
+ int *valp = ctl->data;
+ int new_val = *valp;
+ int old_val = *valp;
+ loff_t pos = *ppos;
+ int ret;
+
+ tmp_ctl.extra1 = SYSCTL_ZERO;
+ tmp_ctl.extra2 = SYSCTL_ONE;
+ tmp_ctl.data = &new_val;
+
+ ret = proc_douintvec_minmax(&tmp_ctl, write, buffer, lenp, ppos);
+
+ if (write && old_val != new_val) {
+ if (!rtnl_net_trylock(net))
+ return restart_syscall();
+
+ WRITE_ONCE(*valp, new_val);
+
+ if (valp == &net->ipv6.devconf_dflt->force_forwarding) {
+ inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORCE_FORWARDING,
+ NETCONFA_IFINDEX_DEFAULT,
+ net->ipv6.devconf_dflt);
+ } else if (valp == &net->ipv6.devconf_all->force_forwarding) {
+ inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORCE_FORWARDING,
+ NETCONFA_IFINDEX_ALL,
+ net->ipv6.devconf_all);
+
+ addrconf_force_forward_change(net, new_val);
+ } else {
+ inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
+ NETCONFA_FORCE_FORWARDING,
+ idev->dev->ifindex,
+ &idev->cnf);
+ }
+ rtnl_net_unlock(net);
+ }
+
+ if (ret)
+ *ppos = pos;
+ return ret;
+}
+
static int minus_one = -1;
-static const int zero = 0;
-static const int one = 1;
static const int two_five_five = 255;
+static u32 ioam6_if_id_max = U16_MAX;
static const struct ctl_table addrconf_sysctl[] = {
{
@@ -6159,7 +6831,7 @@ static const struct ctl_table addrconf_sysctl[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = (void *)&one,
+ .extra1 = (void *)SYSCTL_ONE,
.extra2 = (void *)&two_five_five,
},
{
@@ -6271,6 +6943,13 @@ static const struct ctl_table addrconf_sysctl[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "regen_min_advance",
+ .data = &ipv6_devconf.regen_min_advance,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "regen_max_retry",
.data = &ipv6_devconf.regen_max_retry,
.maxlen = sizeof(int),
@@ -6299,6 +6978,14 @@ static const struct ctl_table addrconf_sysctl[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "ra_defrtr_metric",
+ .data = &ipv6_devconf.ra_defrtr_metric,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = (void *)SYSCTL_ONE,
+ },
+ {
.procname = "accept_ra_min_hop_limit",
.data = &ipv6_devconf.accept_ra_min_hop_limit,
.maxlen = sizeof(int),
@@ -6306,12 +6993,37 @@ static const struct ctl_table addrconf_sysctl[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "accept_ra_min_lft",
+ .data = &ipv6_devconf.accept_ra_min_lft,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "accept_ra_pinfo",
.data = &ipv6_devconf.accept_ra_pinfo,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "ra_honor_pio_life",
+ .data = &ipv6_devconf.ra_honor_pio_life,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "ra_honor_pio_pflag",
+ .data = &ipv6_devconf.ra_honor_pio_pflag,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
#ifdef CONFIG_IPV6_ROUTER_PREF
{
.procname = "accept_ra_rtr_pref",
@@ -6499,10 +7211,10 @@ static const struct ctl_table addrconf_sysctl[] = {
.proc_handler = proc_dointvec,
},
{
- .procname = "addr_gen_mode",
- .data = &ipv6_devconf.addr_gen_mode,
- .maxlen = sizeof(int),
- .mode = 0644,
+ .procname = "addr_gen_mode",
+ .data = &ipv6_devconf.addr_gen_mode,
+ .maxlen = sizeof(int),
+ .mode = 0644,
.proc_handler = addrconf_sysctl_addr_gen_mode,
},
{
@@ -6518,26 +7230,83 @@ static const struct ctl_table addrconf_sysctl[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = (void *)&zero,
+ .extra1 = (void *)SYSCTL_ZERO,
.extra2 = (void *)&two_five_five,
},
{
- /* sentinel */
- }
+ .procname = "rpl_seg_enabled",
+ .data = &ipv6_devconf.rpl_seg_enabled,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "ioam6_enabled",
+ .data = &ipv6_devconf.ioam6_enabled,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = (void *)SYSCTL_ZERO,
+ .extra2 = (void *)SYSCTL_ONE,
+ },
+ {
+ .procname = "ioam6_id",
+ .data = &ipv6_devconf.ioam6_id,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = (void *)SYSCTL_ZERO,
+ .extra2 = (void *)&ioam6_if_id_max,
+ },
+ {
+ .procname = "ioam6_id_wide",
+ .data = &ipv6_devconf.ioam6_id_wide,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_douintvec,
+ },
+ {
+ .procname = "ndisc_evict_nocarrier",
+ .data = &ipv6_devconf.ndisc_evict_nocarrier,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = (void *)SYSCTL_ZERO,
+ .extra2 = (void *)SYSCTL_ONE,
+ },
+ {
+ .procname = "accept_untracked_na",
+ .data = &ipv6_devconf.accept_untracked_na,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "force_forwarding",
+ .data = &ipv6_devconf.force_forwarding,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = addrconf_sysctl_force_forwarding,
+ },
};
static int __addrconf_sysctl_register(struct net *net, char *dev_name,
struct inet6_dev *idev, struct ipv6_devconf *p)
{
+ size_t table_size = ARRAY_SIZE(addrconf_sysctl);
int i, ifindex;
struct ctl_table *table;
char path[sizeof("net/ipv6/conf/") + IFNAMSIZ];
- table = kmemdup(addrconf_sysctl, sizeof(addrconf_sysctl), GFP_KERNEL);
+ table = kmemdup(addrconf_sysctl, sizeof(addrconf_sysctl), GFP_KERNEL_ACCOUNT);
if (!table)
goto out;
- for (i = 0; table[i].data; i++) {
+ for (i = 0; i < table_size; i++) {
table[i].data += (char *)p - (char *)&ipv6_devconf;
/* If one of these is already set, then it is not safe to
* overwrite either of them: this makes proc_dointvec_minmax
@@ -6551,7 +7320,8 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name,
snprintf(path, sizeof(path), "net/ipv6/conf/%s", dev_name);
- p->sysctl_header = register_net_sysctl(net, path, table);
+ p->sysctl_header = register_net_sysctl_sz(net, path, table,
+ table_size);
if (!p->sysctl_header)
goto free;
@@ -6574,7 +7344,7 @@ out:
static void __addrconf_sysctl_unregister(struct net *net,
struct ipv6_devconf *p, int ifindex)
{
- struct ctl_table *table;
+ const struct ctl_table *table;
if (!p->sysctl_header)
return;
@@ -6621,6 +7391,14 @@ static int __net_init addrconf_init_net(struct net *net)
int err = -ENOMEM;
struct ipv6_devconf *all, *dflt;
+ spin_lock_init(&net->ipv6.addrconf_hash_lock);
+ INIT_DEFERRABLE_WORK(&net->ipv6.addr_chk_work, addrconf_verify_work);
+ net->ipv6.inet6_addr_lst = kcalloc(IN6_ADDR_HSIZE,
+ sizeof(struct hlist_head),
+ GFP_KERNEL);
+ if (!net->ipv6.inet6_addr_lst)
+ goto err_alloc_addr;
+
all = kmemdup(&ipv6_devconf, sizeof(ipv6_devconf), GFP_KERNEL);
if (!all)
goto err_alloc_all;
@@ -6629,6 +7407,28 @@ static int __net_init addrconf_init_net(struct net *net)
if (!dflt)
goto err_alloc_dflt;
+ if (!net_eq(net, &init_net)) {
+ switch (net_inherit_devconf()) {
+ case 1: /* copy from init_net */
+ memcpy(all, init_net.ipv6.devconf_all,
+ sizeof(ipv6_devconf));
+ memcpy(dflt, init_net.ipv6.devconf_dflt,
+ sizeof(ipv6_devconf_dflt));
+ break;
+ case 3: /* copy from the current netns */
+ memcpy(all, current->nsproxy->net_ns->ipv6.devconf_all,
+ sizeof(ipv6_devconf));
+ memcpy(dflt,
+ current->nsproxy->net_ns->ipv6.devconf_dflt,
+ sizeof(ipv6_devconf_dflt));
+ break;
+ case 0:
+ case 2:
+ /* use compiled values */
+ break;
+ }
+ }
+
/* these will be inherited by all namespaces */
dflt->autoconf = ipv6_defaults.autoconf;
dflt->disable_ipv6 = ipv6_defaults.disable_ipv6;
@@ -6655,15 +7455,21 @@ err_reg_dflt:
__addrconf_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL);
err_reg_all:
kfree(dflt);
+ net->ipv6.devconf_dflt = NULL;
#endif
err_alloc_dflt:
kfree(all);
+ net->ipv6.devconf_all = NULL;
err_alloc_all:
+ kfree(net->ipv6.inet6_addr_lst);
+err_alloc_addr:
return err;
}
static void __net_exit addrconf_exit_net(struct net *net)
{
+ int i;
+
#ifdef CONFIG_SYSCTL
__addrconf_sysctl_unregister(net, net->ipv6.devconf_dflt,
NETCONFA_IFINDEX_DEFAULT);
@@ -6671,7 +7477,19 @@ static void __net_exit addrconf_exit_net(struct net *net)
NETCONFA_IFINDEX_ALL);
#endif
kfree(net->ipv6.devconf_dflt);
+ net->ipv6.devconf_dflt = NULL;
kfree(net->ipv6.devconf_all);
+ net->ipv6.devconf_all = NULL;
+
+ cancel_delayed_work_sync(&net->ipv6.addr_chk_work);
+ /*
+ * Check hash table, then free it.
+ */
+ for (i = 0; i < IN6_ADDR_HSIZE; i++)
+ WARN_ON_ONCE(!hlist_empty(&net->ipv6.inet6_addr_lst[i]));
+
+ kfree(net->ipv6.inet6_addr_lst);
+ net->ipv6.inet6_addr_lst = NULL;
}
static struct pernet_operations addrconf_ops = {
@@ -6687,6 +7505,27 @@ static struct rtnl_af_ops inet6_ops __read_mostly = {
.set_link_af = inet6_set_link_af,
};
+static const struct rtnl_msg_handler addrconf_rtnl_msg_handlers[] __initconst_or_module = {
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETLINK,
+ .dumpit = inet6_dump_ifinfo, .flags = RTNL_FLAG_DUMP_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWADDR,
+ .doit = inet6_rtm_newaddr, .flags = RTNL_FLAG_DOIT_PERNET},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELADDR,
+ .doit = inet6_rtm_deladdr, .flags = RTNL_FLAG_DOIT_PERNET},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETADDR,
+ .doit = inet6_rtm_getaddr, .dumpit = inet6_dump_ifaddr,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETMULTICAST,
+ .dumpit = inet6_dump_ifmcaddr,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETANYCAST,
+ .dumpit = inet6_dump_ifacaddr,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETNETCONF,
+ .doit = inet6_netconf_get_devconf, .dumpit = inet6_netconf_dump_devconf,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
+};
+
/*
* Init / cleanup code
*/
@@ -6694,7 +7533,7 @@ static struct rtnl_af_ops inet6_ops __read_mostly = {
int __init addrconf_init(void)
{
struct inet6_dev *idev;
- int i, err;
+ int err;
err = ipv6_addr_label_init();
if (err < 0) {
@@ -6707,33 +7546,16 @@ int __init addrconf_init(void)
if (err < 0)
goto out_addrlabel;
- addrconf_wq = create_workqueue("ipv6_addrconf");
+ /* All works using addrconf_wq need to lock rtnl. */
+ addrconf_wq = create_singlethread_workqueue("ipv6_addrconf");
if (!addrconf_wq) {
err = -ENOMEM;
goto out_nowq;
}
- /* The addrconf netdev notifier requires that loopback_dev
- * has it's ipv6 private information allocated and setup
- * before it can bring up and give link-local addresses
- * to other devices which are up.
- *
- * Unfortunately, loopback_dev is not necessarily the first
- * entry in the global dev_base list of net devices. In fact,
- * it is likely to be the very last entry on that list.
- * So this causes the notifier registry below to try and
- * give link-local addresses to all devices besides loopback_dev
- * first, then loopback_dev, which cases all the non-loopback_dev
- * devices to fail to get a link-local address.
- *
- * So, as a temporary fix, allocate the ipv6 structure for
- * loopback_dev first by hand.
- * Longer term, all of the dependencies ipv6 has upon the loopback
- * device and it being up should be removed.
- */
- rtnl_lock();
- idev = ipv6_add_dev(init_net.loopback_dev);
- rtnl_unlock();
+ rtnl_net_lock(&init_net);
+ idev = ipv6_add_dev(blackhole_netdev);
+ rtnl_net_unlock(&init_net);
if (IS_ERR(idev)) {
err = PTR_ERR(idev);
goto errlo;
@@ -6741,47 +7563,18 @@ int __init addrconf_init(void)
ip6_route_init_special_entries();
- for (i = 0; i < IN6_ADDR_HSIZE; i++)
- INIT_HLIST_HEAD(&inet6_addr_lst[i]);
-
register_netdevice_notifier(&ipv6_dev_notf);
- addrconf_verify();
+ addrconf_verify(&init_net);
- rtnl_af_register(&inet6_ops);
+ err = rtnl_af_register(&inet6_ops);
+ if (err)
+ goto erraf;
- err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETLINK,
- NULL, inet6_dump_ifinfo, 0);
- if (err < 0)
+ err = rtnl_register_many(addrconf_rtnl_msg_handlers);
+ if (err)
goto errout;
- err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDR,
- inet6_rtm_newaddr, NULL, 0);
- if (err < 0)
- goto errout;
- err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDR,
- inet6_rtm_deladdr, NULL, 0);
- if (err < 0)
- goto errout;
- err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDR,
- inet6_rtm_getaddr, inet6_dump_ifaddr,
- RTNL_FLAG_DOIT_UNLOCKED);
- if (err < 0)
- goto errout;
- err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETMULTICAST,
- NULL, inet6_dump_ifmcaddr, 0);
- if (err < 0)
- goto errout;
- err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETANYCAST,
- NULL, inet6_dump_ifacaddr, 0);
- if (err < 0)
- goto errout;
- err = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETNETCONF,
- inet6_netconf_get_devconf,
- inet6_netconf_dump_devconf,
- RTNL_FLAG_DOIT_UNLOCKED);
- if (err < 0)
- goto errout;
err = ipv6_addr_label_rtnl_register();
if (err < 0)
goto errout;
@@ -6790,6 +7583,7 @@ int __init addrconf_init(void)
errout:
rtnl_unregister_all(PF_INET6);
rtnl_af_unregister(&inet6_ops);
+erraf:
unregister_netdevice_notifier(&ipv6_dev_notf);
errlo:
destroy_workqueue(addrconf_wq);
@@ -6804,7 +7598,6 @@ out:
void addrconf_cleanup(void)
{
struct net_device *dev;
- int i;
unregister_netdevice_notifier(&ipv6_dev_notf);
unregister_pernet_subsys(&addrconf_ops);
@@ -6812,25 +7605,17 @@ void addrconf_cleanup(void)
rtnl_af_unregister(&inet6_ops);
- rtnl_lock();
+ rtnl_net_lock(&init_net);
/* clean dev list */
for_each_netdev(&init_net, dev) {
- if (__in6_dev_get(dev) == NULL)
+ if (!__in6_dev_get_rtnl_net(dev))
continue;
- addrconf_ifdown(dev, 1);
+ addrconf_ifdown(dev, true);
}
- addrconf_ifdown(init_net.loopback_dev, 2);
+ addrconf_ifdown(init_net.loopback_dev, true);
- /*
- * Check hash table.
- */
- spin_lock_bh(&addrconf_hash_lock);
- for (i = 0; i < IN6_ADDR_HSIZE; i++)
- WARN_ON(!hlist_empty(&inet6_addr_lst[i]));
- spin_unlock_bh(&addrconf_hash_lock);
- cancel_delayed_work(&addr_chk_work);
- rtnl_unlock();
+ rtnl_net_unlock(&init_net);
destroy_workqueue(addrconf_wq);
}
diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c
index 5cd0029d930e..c008d21925d7 100644
--- a/net/ipv6/addrconf_core.c
+++ b/net/ipv6/addrconf_core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IPv6 library code, needed by static components when full IPv6 support is
* not configured or static.
@@ -5,6 +6,7 @@
#include <linux/export.h>
#include <net/ipv6.h>
+#include <net/ipv6_stubs.h>
#include <net/addrconf.h>
#include <net/ip.h>
@@ -127,9 +129,15 @@ int inet6addr_validator_notifier_call_chain(unsigned long val, void *v)
}
EXPORT_SYMBOL(inet6addr_validator_notifier_call_chain);
-static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
- struct dst_entry **u2,
- struct flowi6 *u3)
+static struct dst_entry *eafnosupport_ipv6_dst_lookup_flow(struct net *net,
+ const struct sock *sk,
+ struct flowi6 *fl6,
+ const struct in6_addr *final_dst)
+{
+ return ERR_PTR(-EAFNOSUPPORT);
+}
+
+static int eafnosupport_ipv6_route_input(struct sk_buff *skb)
{
return -EAFNOSUPPORT;
}
@@ -139,59 +147,99 @@ static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
return NULL;
}
-static struct fib6_info *
+static int
eafnosupport_fib6_table_lookup(struct net *net, struct fib6_table *table,
- int oif, struct flowi6 *fl6, int flags)
+ int oif, struct flowi6 *fl6,
+ struct fib6_result *res, int flags)
{
- return NULL;
+ return -EAFNOSUPPORT;
}
-static struct fib6_info *
+static int
eafnosupport_fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
- int flags)
+ struct fib6_result *res, int flags)
{
- return NULL;
+ return -EAFNOSUPPORT;
}
-static struct fib6_info *
-eafnosupport_fib6_multipath_select(const struct net *net, struct fib6_info *f6i,
- struct flowi6 *fl6, int oif,
- const struct sk_buff *skb, int strict)
+static void
+eafnosupport_fib6_select_path(const struct net *net, struct fib6_result *res,
+ struct flowi6 *fl6, int oif, bool have_oif_match,
+ const struct sk_buff *skb, int strict)
{
- return f6i;
}
static u32
-eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
- struct in6_addr *saddr)
+eafnosupport_ip6_mtu_from_fib6(const struct fib6_result *res,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
return 0;
}
+static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
+ struct fib6_config *cfg, gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
+{
+ NL_SET_ERR_MSG(extack, "IPv6 support not enabled in kernel");
+ return -EAFNOSUPPORT;
+}
+
+static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt,
+ bool skip_notify)
+{
+ return -EAFNOSUPPORT;
+}
+
+static int eafnosupport_ipv6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
+{
+ kfree_skb(skb);
+ return -EAFNOSUPPORT;
+}
+
+static struct net_device *eafnosupport_ipv6_dev_find(struct net *net, const struct in6_addr *addr,
+ struct net_device *dev)
+{
+ return ERR_PTR(-EAFNOSUPPORT);
+}
+
const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
- .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
+ .ipv6_dst_lookup_flow = eafnosupport_ipv6_dst_lookup_flow,
+ .ipv6_route_input = eafnosupport_ipv6_route_input,
.fib6_get_table = eafnosupport_fib6_get_table,
.fib6_table_lookup = eafnosupport_fib6_table_lookup,
.fib6_lookup = eafnosupport_fib6_lookup,
- .fib6_multipath_select = eafnosupport_fib6_multipath_select,
+ .fib6_select_path = eafnosupport_fib6_select_path,
.ip6_mtu_from_fib6 = eafnosupport_ip6_mtu_from_fib6,
+ .fib6_nh_init = eafnosupport_fib6_nh_init,
+ .ip6_del_rt = eafnosupport_ip6_del_rt,
+ .ipv6_fragment = eafnosupport_ipv6_fragment,
+ .ipv6_dev_find = eafnosupport_ipv6_dev_find,
};
EXPORT_SYMBOL_GPL(ipv6_stub);
/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */
-const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
+const struct in6_addr in6addr_loopback __aligned(BITS_PER_LONG/8)
+ = IN6ADDR_LOOPBACK_INIT;
EXPORT_SYMBOL(in6addr_loopback);
-const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
+const struct in6_addr in6addr_any __aligned(BITS_PER_LONG/8)
+ = IN6ADDR_ANY_INIT;
EXPORT_SYMBOL(in6addr_any);
-const struct in6_addr in6addr_linklocal_allnodes = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
+const struct in6_addr in6addr_linklocal_allnodes __aligned(BITS_PER_LONG/8)
+ = IN6ADDR_LINKLOCAL_ALLNODES_INIT;
EXPORT_SYMBOL(in6addr_linklocal_allnodes);
-const struct in6_addr in6addr_linklocal_allrouters = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
+const struct in6_addr in6addr_linklocal_allrouters __aligned(BITS_PER_LONG/8)
+ = IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
EXPORT_SYMBOL(in6addr_linklocal_allrouters);
-const struct in6_addr in6addr_interfacelocal_allnodes = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT;
+const struct in6_addr in6addr_interfacelocal_allnodes __aligned(BITS_PER_LONG/8)
+ = IN6ADDR_INTERFACELOCAL_ALLNODES_INIT;
EXPORT_SYMBOL(in6addr_interfacelocal_allnodes);
-const struct in6_addr in6addr_interfacelocal_allrouters = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT;
+const struct in6_addr in6addr_interfacelocal_allrouters __aligned(BITS_PER_LONG/8)
+ = IN6ADDR_INTERFACELOCAL_ALLROUTERS_INIT;
EXPORT_SYMBOL(in6addr_interfacelocal_allrouters);
-const struct in6_addr in6addr_sitelocal_allrouters = IN6ADDR_SITELOCAL_ALLROUTERS_INIT;
+const struct in6_addr in6addr_sitelocal_allrouters __aligned(BITS_PER_LONG/8)
+ = IN6ADDR_SITELOCAL_ALLROUTERS_INIT;
EXPORT_SYMBOL(in6addr_sitelocal_allrouters);
static void snmp6_free_dev(struct inet6_dev *idev)
@@ -216,13 +264,13 @@ void in6_dev_finish_destroy(struct inet6_dev *idev)
struct net_device *dev = idev->dev;
WARN_ON(!list_empty(&idev->addr_list));
- WARN_ON(idev->mc_list);
+ WARN_ON(rcu_access_pointer(idev->mc_list));
WARN_ON(timer_pending(&idev->rs_timer));
#ifdef NET_REFCNT_DEBUG
pr_debug("%s: %s\n", __func__, dev ? dev->name : "NIL");
#endif
- dev_put(dev);
+ netdev_put(dev, &idev->dev_tracker);
if (!idev->dead) {
pr_warn("Freeing alive inet6 device %p\n", idev);
return;
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 1d6ced37ad71..567efd626ab4 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -20,12 +20,6 @@
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
-#if 0
-#define ADDRLABEL(x...) printk(x)
-#else
-#define ADDRLABEL(x...) do { ; } while (0)
-#endif
-
/*
* Policy Table
*/
@@ -150,8 +144,8 @@ u32 ipv6_addr_label(struct net *net,
label = p ? p->label : IPV6_ADDR_LABEL_DEFAULT;
rcu_read_unlock();
- ADDRLABEL(KERN_DEBUG "%s(addr=%pI6, type=%d, ifindex=%d) => %08x\n",
- __func__, addr, type, ifindex, label);
+ net_dbg_ratelimited("%s(addr=%pI6, type=%d, ifindex=%d) => %08x\n", __func__, addr, type,
+ ifindex, label);
return label;
}
@@ -164,8 +158,8 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix,
struct ip6addrlbl_entry *newp;
int addrtype;
- ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u)\n",
- __func__, prefix, prefixlen, ifindex, (unsigned int)label);
+ net_dbg_ratelimited("%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u)\n", __func__,
+ prefix, prefixlen, ifindex, (unsigned int)label);
addrtype = ipv6_addr_type(prefix) & (IPV6_ADDR_MAPPED | IPV6_ADDR_COMPATv4 | IPV6_ADDR_LOOPBACK);
@@ -207,8 +201,7 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp,
struct hlist_node *n;
int ret = 0;
- ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp,
- replace);
+ net_dbg_ratelimited("%s(newp=%p, replace=%d)\n", __func__, newp, replace);
hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) {
if (p->prefixlen == newp->prefixlen &&
@@ -234,7 +227,8 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp,
hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head);
out:
if (!ret)
- net->ipv6.ip6addrlbl_table.seq++;
+ WRITE_ONCE(net->ipv6.ip6addrlbl_table.seq,
+ net->ipv6.ip6addrlbl_table.seq + 1);
return ret;
}
@@ -246,9 +240,8 @@ static int ip6addrlbl_add(struct net *net,
struct ip6addrlbl_entry *newp;
int ret = 0;
- ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n",
- __func__, prefix, prefixlen, ifindex, (unsigned int)label,
- replace);
+ net_dbg_ratelimited("%s(prefix=%pI6, prefixlen=%d, ifindex=%d, label=%u, replace=%d)\n",
+ __func__, prefix, prefixlen, ifindex, (unsigned int)label, replace);
newp = ip6addrlbl_alloc(prefix, prefixlen, ifindex, label);
if (IS_ERR(newp))
@@ -270,8 +263,8 @@ static int __ip6addrlbl_del(struct net *net,
struct hlist_node *n;
int ret = -ESRCH;
- ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n",
- __func__, prefix, prefixlen, ifindex);
+ net_dbg_ratelimited("%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix,
+ prefixlen, ifindex);
hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) {
if (p->prefixlen == prefixlen &&
@@ -293,8 +286,8 @@ static int ip6addrlbl_del(struct net *net,
struct in6_addr prefix_buf;
int ret;
- ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n",
- __func__, prefix, prefixlen, ifindex);
+ net_dbg_ratelimited("%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", __func__, prefix,
+ prefixlen, ifindex);
ipv6_addr_prefix(&prefix_buf, prefix, prefixlen);
spin_lock(&net->ipv6.ip6addrlbl_table.lock);
@@ -306,23 +299,29 @@ static int ip6addrlbl_del(struct net *net,
/* add default label */
static int __net_init ip6addrlbl_net_init(struct net *net)
{
- int err = 0;
+ struct ip6addrlbl_entry *p = NULL;
+ struct hlist_node *n;
+ int err;
int i;
- ADDRLABEL(KERN_DEBUG "%s\n", __func__);
-
spin_lock_init(&net->ipv6.ip6addrlbl_table.lock);
INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head);
for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) {
- int ret = ip6addrlbl_add(net,
- ip6addrlbl_init_table[i].prefix,
- ip6addrlbl_init_table[i].prefixlen,
- 0,
- ip6addrlbl_init_table[i].label, 0);
- /* XXX: should we free all rules when we catch an error? */
- if (ret && (!err || err != -ENOMEM))
- err = ret;
+ err = ip6addrlbl_add(net,
+ ip6addrlbl_init_table[i].prefix,
+ ip6addrlbl_init_table[i].prefixlen,
+ 0,
+ ip6addrlbl_init_table[i].label, 0);
+ if (err)
+ goto err_ip6addrlbl_add;
+ }
+ return 0;
+
+err_ip6addrlbl_add:
+ hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) {
+ hlist_del_rcu(&p->list);
+ kfree_rcu(p, rcu);
}
return err;
}
@@ -383,8 +382,8 @@ static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh,
u32 label;
int err = 0;
- err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy,
- extack);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*ifal), tb, IFAL_MAX,
+ ifal_policy, extack);
if (err < 0)
return err;
@@ -429,6 +428,7 @@ static void ip6addrlbl_putmsg(struct nlmsghdr *nlh,
{
struct ifaddrlblmsg *ifal = nlmsg_data(nlh);
ifal->ifal_family = AF_INET6;
+ ifal->__ifal_reserved = 0;
ifal->ifal_prefixlen = prefixlen;
ifal->ifal_flags = 0;
ifal->ifal_index = ifindex;
@@ -436,7 +436,7 @@ static void ip6addrlbl_putmsg(struct nlmsghdr *nlh,
};
static int ip6addrlbl_fill(struct sk_buff *skb,
- struct ip6addrlbl_entry *p,
+ const struct ip6addrlbl_entry *p,
u32 lseq,
u32 portid, u32 seq, int event,
unsigned int flags)
@@ -458,20 +458,54 @@ static int ip6addrlbl_fill(struct sk_buff *skb,
return 0;
}
+static int ip6addrlbl_valid_dump_req(const struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct ifaddrlblmsg *ifal;
+
+ ifal = nlmsg_payload(nlh, sizeof(*ifal));
+ if (!ifal) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for address label dump request");
+ return -EINVAL;
+ }
+
+ if (ifal->__ifal_reserved || ifal->ifal_prefixlen ||
+ ifal->ifal_flags || ifal->ifal_index || ifal->ifal_seq) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for address label dump request");
+ return -EINVAL;
+ }
+
+ if (nlmsg_attrlen(nlh, sizeof(*ifal))) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid data after header for address label dump request");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
struct ip6addrlbl_entry *p;
int idx = 0, s_idx = cb->args[0];
- int err;
+ int err = 0;
+ u32 lseq;
+
+ if (cb->strict_check) {
+ err = ip6addrlbl_valid_dump_req(nlh, cb->extack);
+ if (err < 0)
+ return err;
+ }
rcu_read_lock();
+ lseq = READ_ONCE(net->ipv6.ip6addrlbl_table.seq);
hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) {
if (idx >= s_idx) {
err = ip6addrlbl_fill(skb, p,
- net->ipv6.ip6addrlbl_table.seq,
+ lseq,
NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
+ nlh->nlmsg_seq,
RTM_NEWADDRLABEL,
NLM_F_MULTI);
if (err < 0)
@@ -481,7 +515,7 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb)
}
rcu_read_unlock();
cb->args[0] = idx;
- return skb->len;
+ return err;
}
static inline int ip6addrlbl_msgsize(void)
@@ -491,6 +525,50 @@ static inline int ip6addrlbl_msgsize(void)
+ nla_total_size(4); /* IFAL_LABEL */
}
+static int ip6addrlbl_valid_get_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct ifaddrlblmsg *ifal;
+ int i, err;
+
+ ifal = nlmsg_payload(nlh, sizeof(*ifal));
+ if (!ifal) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid header for addrlabel get request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(*ifal), tb,
+ IFAL_MAX, ifal_policy, extack);
+
+ if (ifal->__ifal_reserved || ifal->ifal_flags || ifal->ifal_seq) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for addrlabel get request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifal), tb, IFAL_MAX,
+ ifal_policy, extack);
+ if (err)
+ return err;
+
+ for (i = 0; i <= IFAL_MAX; i++) {
+ if (!tb[i])
+ continue;
+
+ switch (i) {
+ case IFAL_ADDRESS:
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in addrlabel get request");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -503,8 +581,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct ip6addrlbl_entry *p;
struct sk_buff *skb;
- err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy,
- extack);
+ err = ip6addrlbl_valid_get_req(in_skb, nlh, tb, extack);
if (err < 0)
return err;
@@ -530,7 +607,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
rcu_read_lock();
p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
- lseq = net->ipv6.ip6addrlbl_table.seq;
+ lseq = READ_ONCE(net->ipv6.ip6addrlbl_table.seq);
if (p)
err = ip6addrlbl_fill(skb, p, lseq,
NETLINK_CB(in_skb).portid,
@@ -547,22 +624,17 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh,
return err;
}
+static const struct rtnl_msg_handler ipv6_adddr_label_rtnl_msg_handlers[] __initconst_or_module = {
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWADDRLABEL,
+ .doit = ip6addrlbl_newdel, .flags = RTNL_FLAG_DOIT_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELADDRLABEL,
+ .doit = ip6addrlbl_newdel, .flags = RTNL_FLAG_DOIT_UNLOCKED},
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETADDRLABEL,
+ .doit = ip6addrlbl_get, .dumpit = ip6addrlbl_dump,
+ .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
+};
+
int __init ipv6_addr_label_rtnl_register(void)
{
- int ret;
-
- ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWADDRLABEL,
- ip6addrlbl_newdel,
- NULL, RTNL_FLAG_DOIT_UNLOCKED);
- if (ret < 0)
- return ret;
- ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELADDRLABEL,
- ip6addrlbl_newdel,
- NULL, RTNL_FLAG_DOIT_UNLOCKED);
- if (ret < 0)
- return ret;
- ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETADDRLABEL,
- ip6addrlbl_get,
- ip6addrlbl_dump, RTNL_FLAG_DOIT_UNLOCKED);
- return ret;
+ return rtnl_register_many(ipv6_adddr_label_rtnl_msg_handlers);
}
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 9a4261e50272..b705751eb73c 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* PF_INET6 socket protocol family
* Linux INET6 implementation
@@ -11,11 +12,6 @@
* piggy, Karl Knutson : Socket protocol table
* Hideaki YOSHIFUJI : sin6_scope_id support
* Arnaldo Melo : check proc_net_create return, cleanups
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "IPv6: " fmt
@@ -56,12 +52,19 @@
#include <net/transp_v6.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
+#include <net/ipv6_stubs.h>
#include <net/ndisc.h>
#ifdef CONFIG_IPV6_TUNNEL
#include <net/ip6_tunnel.h>
#endif
#include <net/calipso.h>
#include <net/seg6.h>
+#include <net/rpl.h>
+#include <net/compat.h>
+#include <net/xfrm.h>
+#include <net/ioam6.h>
+#include <net/rawv6.h>
+#include <net/rps.h>
#include <linux/uaccess.h>
#include <linux/mroute6.h>
@@ -100,13 +103,20 @@ bool ipv6_mod_enabled(void)
}
EXPORT_SYMBOL_GPL(ipv6_mod_enabled);
-static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
+static struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
{
- const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo);
+ const int offset = sk->sk_prot->ipv6_pinfo_offset;
return (struct ipv6_pinfo *)(((u8 *)sk) + offset);
}
+void inet6_sock_destruct(struct sock *sk)
+{
+ inet6_cleanup_sock(sk);
+ inet_sock_destruct(sk);
+}
+EXPORT_SYMBOL_GPL(inet6_sock_destruct);
+
static int inet6_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
@@ -190,16 +200,19 @@ lookup_protocol:
if (INET_PROTOSW_REUSE & answer_flags)
sk->sk_reuse = SK_CAN_REUSE;
+ if (INET_PROTOSW_ICSK & answer_flags)
+ inet_init_csk_locks(sk);
+
inet = inet_sk(sk);
- inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
+ inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags);
if (SOCK_RAW == sock->type) {
inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
- inet->hdrincl = 1;
+ inet_set_bit(HDRINCL, sk);
}
- sk->sk_destruct = inet_sock_destruct;
+ sk->sk_destruct = inet6_sock_destruct;
sk->sk_family = PF_INET6;
sk->sk_protocol = protocol;
@@ -208,36 +221,29 @@ lookup_protocol:
inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk);
np->hop_limit = -1;
np->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
- np->mc_loop = 1;
+ inet6_set_bit(MC6_LOOP, sk);
+ inet6_set_bit(MC6_ALL, sk);
np->pmtudisc = IPV6_PMTUDISC_WANT;
- np->repflow = net->ipv6.sysctl.flowlabel_reflect;
+ inet6_assign_bit(REPFLOW, sk, net->ipv6.sysctl.flowlabel_reflect &
+ FLOWLABEL_REFLECT_ESTABLISHED);
sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
+ sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash);
/* Init the ipv4 part of the socket since we can have sockets
* using v6 API for ipv4.
*/
inet->uc_ttl = -1;
- inet->mc_loop = 1;
+ inet_set_bit(MC_LOOP, sk);
inet->mc_ttl = 1;
inet->mc_index = 0;
- inet->mc_list = NULL;
+ RCU_INIT_POINTER(inet->mc_list, NULL);
inet->rcv_tos = 0;
- if (net->ipv4.sysctl_ip_no_pmtu_disc)
+ if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc))
inet->pmtudisc = IP_PMTUDISC_DONT;
else
inet->pmtudisc = IP_PMTUDISC_WANT;
- /*
- * Increment only the relevant sk_prot->socks debug field, this changes
- * the previous behaviour of incrementing both the equivalent to
- * answer->prot->socks (inet6_sock_nr) and inet_sock_nr.
- *
- * This allows better debug granularity as we'll know exactly how many
- * UDPv6, TCPv6, etc socks were allocated, not the sum of all IPv6
- * transport protocol socks. -acme
- */
- sk_refcnt_debug_inc(sk);
if (inet->inet_num) {
/* It assumes that any protocol which allows
@@ -246,35 +252,33 @@ lookup_protocol:
*/
inet->inet_sport = htons(inet->inet_num);
err = sk->sk_prot->hash(sk);
- if (err) {
- sk_common_release(sk);
- goto out;
- }
+ if (err)
+ goto out_sk_release;
}
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
- if (err) {
- sk_common_release(sk);
- goto out;
- }
+ if (err)
+ goto out_sk_release;
}
if (!kern) {
err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk);
- if (err) {
- sk_common_release(sk);
- goto out;
- }
+ if (err)
+ goto out_sk_release;
}
out:
return err;
out_rcu_unlock:
rcu_read_unlock();
goto out;
+out_sk_release:
+ sk_common_release(sk);
+ sock->sk = NULL;
+ goto out;
}
-static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
- bool force_bind_address_no_port, bool with_lock)
+static int __inet6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len,
+ u32 flags)
{
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr;
struct inet_sock *inet = inet_sk(sk);
@@ -294,11 +298,12 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
return -EINVAL;
snum = ntohs(addr->sin6_port);
- if (snum && snum < inet_prot_sock(net) &&
+ if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
+ snum && inet_port_requires_bind_service(net, snum) &&
!ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
return -EACCES;
- if (with_lock)
+ if (flags & BIND_WITH_LOCK)
lock_sock(sk);
/* Check these errors (active socket, double bind). */
@@ -309,24 +314,33 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
/* Check if the address belongs to the host. */
if (addr_type == IPV6_ADDR_MAPPED) {
+ struct net_device *dev = NULL;
int chk_addr_ret;
/* Binding to v4-mapped address on a v6-only socket
* makes no sense
*/
- if (sk->sk_ipv6only) {
+ if (ipv6_only_sock(sk)) {
err = -EINVAL;
goto out;
}
+ rcu_read_lock();
+ if (sk->sk_bound_dev_if) {
+ dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
+ if (!dev) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+ }
+
/* Reproduce AF_INET checks to make the bindings consistent */
v4addr = addr->sin6_addr.s6_addr32[3];
- chk_addr_ret = inet_addr_type(net, v4addr);
- if (!inet_can_nonlocal_bind(net, inet) &&
- v4addr != htonl(INADDR_ANY) &&
- chk_addr_ret != RTN_LOCAL &&
- chk_addr_ret != RTN_MULTICAST &&
- chk_addr_ret != RTN_BROADCAST) {
+ chk_addr_ret = inet_addr_type_dev_table(net, dev, v4addr);
+ rcu_read_unlock();
+
+ if (!inet_addr_valid_or_nonlocal(net, inet, v4addr,
+ chk_addr_ret)) {
err = -EADDRNOTAVAIL;
goto out;
}
@@ -349,6 +363,9 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
err = -EINVAL;
goto out_unlock;
}
+ }
+
+ if (sk->sk_bound_dev_if) {
dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if);
if (!dev) {
err = -ENODEV;
@@ -385,20 +402,24 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
sk->sk_ipv6only = 1;
/* Make sure we are allowed to bind here. */
- if (snum || !(inet->bind_address_no_port ||
- force_bind_address_no_port)) {
- if (sk->sk_prot->get_port(sk, snum)) {
- sk->sk_ipv6only = saved_ipv6only;
- inet_reset_saddr(sk);
- err = -EADDRINUSE;
- goto out;
- }
- err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk);
+ if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) ||
+ (flags & BIND_FORCE_ADDRESS_NO_PORT))) {
+ err = sk->sk_prot->get_port(sk, snum);
if (err) {
sk->sk_ipv6only = saved_ipv6only;
inet_reset_saddr(sk);
goto out;
}
+ if (!(flags & BIND_FROM_BPF)) {
+ err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk);
+ if (err) {
+ sk->sk_ipv6only = saved_ipv6only;
+ inet_reset_saddr(sk);
+ if (sk->sk_prot->put_port)
+ sk->sk_prot->put_port(sk);
+ goto out;
+ }
+ }
}
if (addr_type != IPV6_ADDR_ANY)
@@ -409,7 +430,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
inet->inet_dport = 0;
inet->inet_daddr = 0;
out:
- if (with_lock)
+ if (flags & BIND_WITH_LOCK)
release_sock(sk);
return err;
out_unlock:
@@ -417,15 +438,17 @@ out_unlock:
goto out;
}
-/* bind for INET6 API */
-int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+int inet6_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
- struct sock *sk = sock->sk;
+ u32 flags = BIND_WITH_LOCK;
+ const struct proto *prot;
int err = 0;
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ prot = READ_ONCE(sk->sk_prot);
/* If the socket has its own bind function then use it. */
- if (sk->sk_prot->bind)
- return sk->sk_prot->bind(sk, uaddr, addr_len);
+ if (prot->bind)
+ return prot->bind(sk, uaddr, addr_len);
if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
@@ -433,11 +456,18 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
/* BPF prog is run before any checks are done so that if the prog
* changes context in a wrong way it will be caught.
*/
- err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr);
+ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
+ CGROUP_INET6_BIND, &flags);
if (err)
return err;
- return __inet6_bind(sk, uaddr, addr_len, false, true);
+ return __inet6_bind(sk, uaddr, addr_len, flags);
+}
+
+/* bind for INET6 API */
+int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len)
+{
+ return inet6_bind_sk(sock->sk, uaddr, addr_len);
}
EXPORT_SYMBOL(inet6_bind);
@@ -458,7 +488,7 @@ int inet6_release(struct socket *sock)
}
EXPORT_SYMBOL(inet6_release);
-void inet6_destroy_sock(struct sock *sk)
+void inet6_cleanup_sock(struct sock *sk)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct sk_buff *skb;
@@ -467,34 +497,32 @@ void inet6_destroy_sock(struct sock *sk)
/* Release rx options */
skb = xchg(&np->pktoptions, NULL);
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
skb = xchg(&np->rxpmtu, NULL);
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
/* Free flowlabels */
fl6_free_socklist(sk);
/* Free tx options */
- opt = xchg((__force struct ipv6_txoptions **)&np->opt, NULL);
+ opt = unrcu_pointer(xchg(&np->opt, NULL));
if (opt) {
atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
txopt_put(opt);
}
}
-EXPORT_SYMBOL_GPL(inet6_destroy_sock);
+EXPORT_SYMBOL_GPL(inet6_cleanup_sock);
/*
* This does both peername and sockname.
*/
-
int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
- int peer)
+ int peer)
{
struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr;
+ int sin_addr_len = sizeof(*sin);
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -502,63 +530,158 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
sin->sin6_family = AF_INET6;
sin->sin6_flowinfo = 0;
sin->sin6_scope_id = 0;
+ lock_sock(sk);
if (peer) {
- if (!inet->inet_dport)
- return -ENOTCONN;
- if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
- peer == 1)
+ if (!inet->inet_dport ||
+ (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
+ peer == 1)) {
+ release_sock(sk);
return -ENOTCONN;
+ }
sin->sin6_port = inet->inet_dport;
sin->sin6_addr = sk->sk_v6_daddr;
- if (np->sndflow)
+ if (inet6_test_bit(SNDFLOW, sk))
sin->sin6_flowinfo = np->flow_label;
+ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
+ CGROUP_INET6_GETPEERNAME);
} else {
if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
sin->sin6_addr = np->saddr;
else
sin->sin6_addr = sk->sk_v6_rcv_saddr;
-
sin->sin6_port = inet->inet_sport;
+ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
+ CGROUP_INET6_GETSOCKNAME);
}
sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
sk->sk_bound_dev_if);
- return sizeof(*sin);
+ release_sock(sk);
+ return sin_addr_len;
}
EXPORT_SYMBOL(inet6_getname);
int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
+ void __user *argp = (void __user *)arg;
struct sock *sk = sock->sk;
struct net *net = sock_net(sk);
+ const struct proto *prot;
switch (cmd) {
- case SIOCGSTAMP:
- return sock_get_timestamp(sk, (struct timeval __user *)arg);
-
- case SIOCGSTAMPNS:
- return sock_get_timestampns(sk, (struct timespec __user *)arg);
-
case SIOCADDRT:
- case SIOCDELRT:
-
- return ipv6_route_ioctl(net, cmd, (void __user *)arg);
+ case SIOCDELRT: {
+ struct in6_rtmsg rtmsg;
+ if (copy_from_user(&rtmsg, argp, sizeof(rtmsg)))
+ return -EFAULT;
+ return ipv6_route_ioctl(net, cmd, &rtmsg);
+ }
case SIOCSIFADDR:
- return addrconf_add_ifaddr(net, (void __user *) arg);
+ return addrconf_add_ifaddr(net, argp);
case SIOCDIFADDR:
- return addrconf_del_ifaddr(net, (void __user *) arg);
+ return addrconf_del_ifaddr(net, argp);
case SIOCSIFDSTADDR:
- return addrconf_set_dstaddr(net, (void __user *) arg);
+ return addrconf_set_dstaddr(net, argp);
default:
- if (!sk->sk_prot->ioctl)
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ prot = READ_ONCE(sk->sk_prot);
+ if (!prot->ioctl)
return -ENOIOCTLCMD;
- return sk->sk_prot->ioctl(sk, cmd, arg);
+ return sk_ioctl(sk, cmd, (void __user *)arg);
}
/*NOTREACHED*/
return 0;
}
EXPORT_SYMBOL(inet6_ioctl);
+#ifdef CONFIG_COMPAT
+struct compat_in6_rtmsg {
+ struct in6_addr rtmsg_dst;
+ struct in6_addr rtmsg_src;
+ struct in6_addr rtmsg_gateway;
+ u32 rtmsg_type;
+ u16 rtmsg_dst_len;
+ u16 rtmsg_src_len;
+ u32 rtmsg_metric;
+ u32 rtmsg_info;
+ u32 rtmsg_flags;
+ s32 rtmsg_ifindex;
+};
+
+static int inet6_compat_routing_ioctl(struct sock *sk, unsigned int cmd,
+ struct compat_in6_rtmsg __user *ur)
+{
+ struct in6_rtmsg rt;
+
+ if (copy_from_user(&rt.rtmsg_dst, &ur->rtmsg_dst,
+ 3 * sizeof(struct in6_addr)) ||
+ get_user(rt.rtmsg_type, &ur->rtmsg_type) ||
+ get_user(rt.rtmsg_dst_len, &ur->rtmsg_dst_len) ||
+ get_user(rt.rtmsg_src_len, &ur->rtmsg_src_len) ||
+ get_user(rt.rtmsg_metric, &ur->rtmsg_metric) ||
+ get_user(rt.rtmsg_info, &ur->rtmsg_info) ||
+ get_user(rt.rtmsg_flags, &ur->rtmsg_flags) ||
+ get_user(rt.rtmsg_ifindex, &ur->rtmsg_ifindex))
+ return -EFAULT;
+
+
+ return ipv6_route_ioctl(sock_net(sk), cmd, &rt);
+}
+
+int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ void __user *argp = compat_ptr(arg);
+ struct sock *sk = sock->sk;
+
+ switch (cmd) {
+ case SIOCADDRT:
+ case SIOCDELRT:
+ return inet6_compat_routing_ioctl(sk, cmd, argp);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+EXPORT_SYMBOL_GPL(inet6_compat_ioctl);
+#endif /* CONFIG_COMPAT */
+
+INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *,
+ size_t));
+int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+{
+ struct sock *sk = sock->sk;
+ const struct proto *prot;
+
+ if (unlikely(inet_send_prepare(sk)))
+ return -EAGAIN;
+
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ prot = READ_ONCE(sk->sk_prot);
+ return INDIRECT_CALL_2(prot->sendmsg, tcp_sendmsg, udpv6_sendmsg,
+ sk, msg, size);
+}
+
+INDIRECT_CALLABLE_DECLARE(int udpv6_recvmsg(struct sock *, struct msghdr *,
+ size_t, int, int *));
+int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+ int flags)
+{
+ struct sock *sk = sock->sk;
+ const struct proto *prot;
+ int addr_len = 0;
+ int err;
+
+ if (likely(!(flags & MSG_ERRQUEUE)))
+ sock_rps_record_flow(sk);
+
+ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
+ prot = READ_ONCE(sk->sk_prot);
+ err = INDIRECT_CALL_2(prot->recvmsg, tcp_recvmsg, udpv6_recvmsg,
+ sk, msg, size, flags, &addr_len);
+ if (err >= 0)
+ msg->msg_namelen = addr_len;
+ return err;
+}
+
const struct proto_ops inet6_stream_ops = {
.family = PF_INET6,
.owner = THIS_MODULE,
@@ -570,27 +693,29 @@ const struct proto_ops inet6_stream_ops = {
.getname = inet6_getname,
.poll = tcp_poll, /* ok */
.ioctl = inet6_ioctl, /* must change */
+ .gettstamp = sock_gettstamp,
.listen = inet_listen, /* ok */
.shutdown = inet_shutdown, /* ok */
.setsockopt = sock_common_setsockopt, /* ok */
.getsockopt = sock_common_getsockopt, /* ok */
- .sendmsg = inet_sendmsg, /* ok */
- .recvmsg = inet_recvmsg, /* ok */
+ .sendmsg = inet6_sendmsg, /* retpoline's sake */
+ .recvmsg = inet6_recvmsg, /* retpoline's sake */
#ifdef CONFIG_MMU
.mmap = tcp_mmap,
#endif
- .sendpage = inet_sendpage,
+ .splice_eof = inet_splice_eof,
.sendmsg_locked = tcp_sendmsg_locked,
- .sendpage_locked = tcp_sendpage_locked,
.splice_read = tcp_splice_read,
+ .set_peek_off = sk_set_peek_off,
.read_sock = tcp_read_sock,
+ .read_skb = tcp_read_skb,
.peek_len = tcp_peek_len,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet6_compat_ioctl,
#endif
.set_rcvlowat = tcp_set_rcvlowat,
};
+EXPORT_SYMBOL_GPL(inet6_stream_ops);
const struct proto_ops inet6_dgram_ops = {
.family = PF_INET6,
@@ -603,18 +728,18 @@ const struct proto_ops inet6_dgram_ops = {
.getname = inet6_getname,
.poll = udp_poll, /* ok */
.ioctl = inet6_ioctl, /* must change */
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen, /* ok */
.shutdown = inet_shutdown, /* ok */
.setsockopt = sock_common_setsockopt, /* ok */
.getsockopt = sock_common_getsockopt, /* ok */
- .sendmsg = inet_sendmsg, /* ok */
- .recvmsg = inet_recvmsg, /* ok */
+ .sendmsg = inet6_sendmsg, /* retpoline's sake */
+ .recvmsg = inet6_recvmsg, /* retpoline's sake */
+ .read_skb = udp_read_skb,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
- .set_peek_off = sk_set_peek_off,
+ .set_peek_off = udp_set_peek_off,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet6_compat_ioctl,
#endif
};
@@ -717,22 +842,22 @@ int inet6_sk_rebuild_header(struct sock *sk)
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_dport = inet->inet_dport;
fl6.fl6_sport = inet->inet_sport;
- fl6.flowi6_uid = sk->sk_uid;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ fl6.flowi6_uid = sk_uid(sk);
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
rcu_read_lock();
final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt),
&final);
rcu_read_unlock();
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst)) {
sk->sk_route_caps = 0;
- sk->sk_err_soft = -PTR_ERR(dst);
+ WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst));
return PTR_ERR(dst);
}
- ip6_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, false, false);
}
return 0;
@@ -757,7 +882,6 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
}
return false;
}
-EXPORT_SYMBOL_GPL(ipv6_opt_accepted);
static struct packet_type ipv6_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IPV6),
@@ -833,6 +957,19 @@ static int __net_init inet6_net_init(struct net *net)
net->ipv6.sysctl.bindv6only = 0;
net->ipv6.sysctl.icmpv6_time = 1*HZ;
net->ipv6.sysctl.icmpv6_echo_ignore_all = 0;
+ net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0;
+ net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0;
+ net->ipv6.sysctl.icmpv6_error_anycast_as_unicast = 0;
+ net->ipv6.sysctl.icmpv6_errors_extension_mask = 0;
+
+ /* By default, rate limit all ICMPv6 error messages,
+ * except Packet Too Big: limiting it would break PMTU discovery.
+ * proc_do_large_bitmap() needs a pointer to the bitmap.
+ */
+ bitmap_set(net->ipv6.sysctl.icmpv6_ratemask, 0, ICMPV6_ERRMSG_MAX + 1);
+ bitmap_clear(net->ipv6.sysctl.icmpv6_ratemask, ICMPV6_PKT_TOOBIG, 1);
+ net->ipv6.sysctl.icmpv6_ratemask_ptr = net->ipv6.sysctl.icmpv6_ratemask;
+
net->ipv6.sysctl.flowlabel_consistency = 1;
net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS;
net->ipv6.sysctl.idgen_retries = 3;
@@ -842,8 +979,12 @@ static int __net_init inet6_net_init(struct net *net)
net->ipv6.sysctl.max_hbh_opts_cnt = IP6_DEFAULT_MAX_HBH_OPTS_CNT;
net->ipv6.sysctl.max_dst_opts_len = IP6_DEFAULT_MAX_DST_OPTS_LEN;
net->ipv6.sysctl.max_hbh_opts_len = IP6_DEFAULT_MAX_HBH_OPTS_LEN;
+ net->ipv6.sysctl.fib_notify_on_flag_change = 0;
atomic_set(&net->ipv6.fib6_sernum, 1);
+ net->ipv6.sysctl.ioam6_id = IOAM6_DEFAULT_ID;
+ net->ipv6.sysctl.ioam6_id_wide = IOAM6_DEFAULT_ID_WIDE;
+
err = ipv6_init_mibs(net);
if (err)
return err;
@@ -886,22 +1027,48 @@ static struct pernet_operations inet6_net_ops = {
.exit = inet6_net_exit,
};
+static int ipv6_route_input(struct sk_buff *skb)
+{
+ ip6_route_input(skb);
+ return skb_dst(skb)->error;
+}
+
static const struct ipv6_stub ipv6_stub_impl = {
.ipv6_sock_mc_join = ipv6_sock_mc_join,
.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
- .ipv6_dst_lookup = ip6_dst_lookup,
+ .ipv6_dst_lookup_flow = ip6_dst_lookup_flow,
+ .ipv6_route_input = ipv6_route_input,
.fib6_get_table = fib6_get_table,
.fib6_table_lookup = fib6_table_lookup,
.fib6_lookup = fib6_lookup,
- .fib6_multipath_select = fib6_multipath_select,
+ .fib6_select_path = fib6_select_path,
.ip6_mtu_from_fib6 = ip6_mtu_from_fib6,
+ .fib6_nh_init = fib6_nh_init,
+ .fib6_nh_release = fib6_nh_release,
+ .fib6_nh_release_dsts = fib6_nh_release_dsts,
+ .fib6_update_sernum = fib6_update_sernum_stub,
+ .fib6_rt_update = fib6_rt_update,
+ .ip6_del_rt = ip6_del_rt,
.udpv6_encap_enable = udpv6_encap_enable,
.ndisc_send_na = ndisc_send_na,
+#if IS_ENABLED(CONFIG_XFRM)
+ .xfrm6_local_rxpmtu = xfrm6_local_rxpmtu,
+ .xfrm6_udp_encap_rcv = xfrm6_udp_encap_rcv,
+ .xfrm6_gro_udp_encap_rcv = xfrm6_gro_udp_encap_rcv,
+ .xfrm6_rcv_encap = xfrm6_rcv_encap,
+#endif
.nd_tbl = &nd_tbl,
+ .ipv6_fragment = ip6_fragment,
+ .ipv6_dev_find = ipv6_dev_find,
+ .ip6_xmit = ip6_xmit,
};
static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = {
.inet6_bind = __inet6_bind,
+ .udp6_lib_lookup = __udp6_lib_lookup,
+ .ipv6_setsockopt = do_ipv6_setsockopt,
+ .ipv6_getsockopt = do_ipv6_getsockopt,
+ .ipv6_dev_get_saddr = ipv6_dev_get_saddr,
};
static int __init inet6_init(void)
@@ -915,6 +1082,8 @@ static int __init inet6_init(void)
for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r)
INIT_LIST_HEAD(r);
+ raw_hashinfo_init(&raw_v6_hashinfo);
+
if (disable_ipv6_mod) {
pr_info("Loaded, but administratively disabled, reboot required to enable\n");
goto out;
@@ -1001,6 +1170,9 @@ static int __init inet6_init(void)
err = ip6_flowlabel_init();
if (err)
goto ip6_flowlabel_fail;
+ err = ipv6_anycast_init();
+ if (err)
+ goto ipv6_anycast_fail;
err = addrconf_init();
if (err)
goto addrconf_fail;
@@ -1047,6 +1219,14 @@ static int __init inet6_init(void)
if (err)
goto seg6_fail;
+ err = rpl_init();
+ if (err)
+ goto rpl_fail;
+
+ err = ioam6_init();
+ if (err)
+ goto ioam6_fail;
+
err = igmp6_late_init();
if (err)
goto igmp6_late_err;
@@ -1069,6 +1249,10 @@ sysctl_fail:
igmp6_late_cleanup();
#endif
igmp6_late_err:
+ ioam6_exit();
+ioam6_fail:
+ rpl_exit();
+rpl_fail:
seg6_exit();
seg6_fail:
calipso_exit();
@@ -1091,6 +1275,8 @@ ipv6_frag_fail:
ipv6_exthdrs_fail:
addrconf_cleanup();
addrconf_fail:
+ ipv6_anycast_cleanup();
+ipv6_anycast_fail:
ip6_flowlabel_cleanup();
ip6_flowlabel_fail:
ndisc_late_cleanup();
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index 78c974391567..95372e0f1d21 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C)2002 USAGI/WIDE Project
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- *
* Authors
*
* Mitsuru KANDA @USAGI : IPv6 Support
@@ -25,8 +13,8 @@
#define pr_fmt(fmt) "IPv6: " fmt
-#include <crypto/algapi.h>
#include <crypto/hash.h>
+#include <crypto/utils.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <net/ip.h>
@@ -48,7 +36,7 @@ struct tmp_ext {
struct in6_addr saddr;
#endif
struct in6_addr daddr;
- char hdrs[0];
+ char hdrs[];
};
struct ah_skb_cb {
@@ -58,14 +46,40 @@ struct ah_skb_cb {
#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
+/* Helper to save IPv6 addresses and extension headers to temporary storage */
+static inline void ah6_save_hdrs(struct tmp_ext *iph_ext,
+ struct ipv6hdr *top_iph, int extlen)
+{
+ if (!extlen)
+ return;
+
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ iph_ext->saddr = top_iph->saddr;
+#endif
+ iph_ext->daddr = top_iph->daddr;
+ memcpy(&iph_ext->hdrs, top_iph + 1, extlen - sizeof(*iph_ext));
+}
+
+/* Helper to restore IPv6 addresses and extension headers from temporary storage */
+static inline void ah6_restore_hdrs(struct ipv6hdr *top_iph,
+ struct tmp_ext *iph_ext, int extlen)
+{
+ if (!extlen)
+ return;
+
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ top_iph->saddr = iph_ext->saddr;
+#endif
+ top_iph->daddr = iph_ext->daddr;
+ memcpy(top_iph + 1, &iph_ext->hdrs, extlen - sizeof(*iph_ext));
+}
+
static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
unsigned int size)
{
unsigned int len;
- len = size + crypto_ahash_digestsize(ahash) +
- (crypto_ahash_alignmask(ahash) &
- ~(crypto_tfm_ctx_alignment() - 1));
+ len = size + crypto_ahash_digestsize(ahash);
len = ALIGN(len, crypto_tfm_ctx_alignment());
@@ -87,10 +101,9 @@ static inline u8 *ah_tmp_auth(u8 *tmp, unsigned int offset)
return tmp + offset;
}
-static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp,
- unsigned int offset)
+static inline u8 *ah_tmp_icv(void *tmp, unsigned int offset)
{
- return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1);
+ return tmp + offset;
}
static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
@@ -187,7 +200,6 @@ static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *des
* See 11.3.2 of RFC 3775 for details.
*/
if (opt[off] == IPV6_TLV_HAO) {
- struct in6_addr final_addr;
struct ipv6_destopt_hao *hao;
hao = (struct ipv6_destopt_hao *)&opt[off];
@@ -196,9 +208,7 @@ static void ipv6_rearrange_destopt(struct ipv6hdr *iph, struct ipv6_opt_hdr *des
hao->length);
goto bad;
}
- final_addr = hao->addr;
- hao->addr = iph->saddr;
- iph->saddr = final_addr;
+ swap(hao->addr, iph->saddr);
}
break;
}
@@ -271,7 +281,7 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir)
case NEXTHDR_DEST:
if (dir == XFRM_POLICY_OUT)
ipv6_rearrange_destopt(iph, exthdr.opth);
- /* fall through */
+ fallthrough;
case NEXTHDR_HOP:
if (!zero_out_mutable_opts(exthdr.opth)) {
net_dbg_ratelimited("overrun %sopts\n",
@@ -296,12 +306,12 @@ static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len, int dir)
return 0;
}
-static void ah6_output_done(struct crypto_async_request *base, int err)
+static void ah6_output_done(void *data, int err)
{
int extlen;
u8 *iph_base;
u8 *icv;
- struct sk_buff *skb = base->data;
+ struct sk_buff *skb = data;
struct xfrm_state *x = skb_dst(skb)->xfrm;
struct ah_data *ahp = x->data;
struct ipv6hdr *top_iph = ipv6_hdr(skb);
@@ -314,21 +324,15 @@ static void ah6_output_done(struct crypto_async_request *base, int err)
iph_base = AH_SKB_CB(skb)->tmp;
iph_ext = ah_tmp_ext(iph_base);
- icv = ah_tmp_icv(ahp->ahash, iph_ext, extlen);
+ icv = ah_tmp_icv(iph_ext, extlen);
memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
memcpy(top_iph, iph_base, IPV6HDR_BASELEN);
- if (extlen) {
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- memcpy(&top_iph->saddr, iph_ext, extlen);
-#else
- memcpy(&top_iph->daddr, iph_ext, extlen);
-#endif
- }
+ ah6_restore_hdrs(top_iph, iph_ext, extlen);
kfree(AH_SKB_CB(skb)->tmp);
- xfrm_output_resume(skb, err);
+ xfrm_output_resume(skb->sk, skb, err);
}
static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
@@ -377,7 +381,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
iph_ext = ah_tmp_ext(iph_base);
seqhi = (__be32 *)((char *)iph_ext + extlen);
- icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
+ icv = ah_tmp_icv(seqhi, seqhi_len);
req = ah_tmp_req(ahash, icv);
sg = ah_req_sg(ahash, req);
seqhisg = sg + nfrags;
@@ -396,12 +400,8 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
*/
memcpy(iph_base, top_iph, IPV6HDR_BASELEN);
+ ah6_save_hdrs(iph_ext, top_iph, extlen);
if (extlen) {
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- memcpy(iph_ext, &top_iph->saddr, extlen);
-#else
- memcpy(iph_ext, &top_iph->daddr, extlen);
-#endif
err = ipv6_clear_mutable_options(top_iph,
extlen - sizeof(*iph_ext) +
sizeof(*top_iph),
@@ -452,13 +452,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
memcpy(top_iph, iph_base, IPV6HDR_BASELEN);
- if (extlen) {
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- memcpy(&top_iph->saddr, iph_ext, extlen);
-#else
- memcpy(&top_iph->daddr, iph_ext, extlen);
-#endif
- }
+ ah6_restore_hdrs(top_iph, iph_ext, extlen);
out_free:
kfree(iph_base);
@@ -466,24 +460,24 @@ out:
return err;
}
-static void ah6_input_done(struct crypto_async_request *base, int err)
+static void ah6_input_done(void *data, int err)
{
u8 *auth_data;
u8 *icv;
u8 *work_iph;
- struct sk_buff *skb = base->data;
+ struct sk_buff *skb = data;
struct xfrm_state *x = xfrm_input_state(skb);
struct ah_data *ahp = x->data;
struct ip_auth_hdr *ah = ip_auth_hdr(skb);
int hdr_len = skb_network_header_len(skb);
- int ah_hlen = (ah->hdrlen + 2) << 2;
+ int ah_hlen = ipv6_authlen(ah);
if (err)
goto out;
work_iph = AH_SKB_CB(skb)->tmp;
auth_data = ah_tmp_auth(work_iph, hdr_len);
- icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
+ icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len);
err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0;
if (err)
@@ -558,7 +552,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
ahash = ahp->ahash;
nexthdr = ah->nexthdr;
- ah_hlen = (ah->hdrlen + 2) << 2;
+ ah_hlen = ipv6_authlen(ah);
if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) &&
ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len))
@@ -591,7 +585,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
auth_data = ah_tmp_auth((u8 *)work_iph, hdr_len);
seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len);
- icv = ah_tmp_icv(ahash, seqhi, seqhi_len);
+ icv = ah_tmp_icv(seqhi, seqhi_len);
req = ah_tmp_req(ahash, icv);
sg = ah_req_sg(ahash, req);
seqhisg = sg + nfrags;
@@ -600,7 +594,8 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
memset(ah->auth_data, 0, ahp->icv_trunc_len);
- if (ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN))
+ err = ipv6_clear_mutable_options(ip6h, hdr_len, XFRM_POLICY_IN);
+ if (err)
goto out_free;
ip6h->priority = 0;
@@ -680,30 +675,38 @@ static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
}
-static int ah6_init_state(struct xfrm_state *x)
+static int ah6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
struct ah_data *ahp = NULL;
struct xfrm_algo_desc *aalg_desc;
struct crypto_ahash *ahash;
- if (!x->aalg)
+ if (!x->aalg) {
+ NL_SET_ERR_MSG(extack, "AH requires a state with an AUTH algorithm");
goto error;
+ }
- if (x->encap)
+ if (x->encap) {
+ NL_SET_ERR_MSG(extack, "AH is not compatible with encapsulation");
goto error;
+ }
ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
if (!ahp)
return -ENOMEM;
ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
- if (IS_ERR(ahash))
+ if (IS_ERR(ahash)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
+ }
ahp->ahash = ahash;
if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
- (x->aalg->alg_key_len + 7) / 8))
+ (x->aalg->alg_key_len + 7) / 8)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
+ }
/*
* Lookup the algorithm description maintained by xfrm_algo,
@@ -716,9 +719,7 @@ static int ah6_init_state(struct xfrm_state *x)
if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
crypto_ahash_digestsize(ahash)) {
- pr_info("AH: %s digestsize %u != %hu\n",
- x->aalg->alg_name, crypto_ahash_digestsize(ahash),
- aalg_desc->uinfo.auth.icv_fullbits/8);
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
}
@@ -735,6 +736,7 @@ static int ah6_init_state(struct xfrm_state *x)
x->props.header_len += sizeof(struct ipv6hdr);
break;
default:
+ NL_SET_ERR_MSG(extack, "Invalid mode requested for AH, must be one of TRANSPORT, TUNNEL, BEET");
goto error;
}
x->data = ahp;
@@ -766,7 +768,6 @@ static int ah6_rcv_cb(struct sk_buff *skb, int err)
}
static const struct xfrm_type ah6_type = {
- .description = "AH6",
.owner = THIS_MODULE,
.proto = IPPROTO_AH,
.flags = XFRM_TYPE_REPLAY_PROT,
@@ -774,11 +775,11 @@ static const struct xfrm_type ah6_type = {
.destructor = ah6_destroy,
.input = ah6_input,
.output = ah6_output,
- .hdr_offset = xfrm6_find_1stfragopt,
};
static struct xfrm6_protocol ah6_protocol = {
.handler = xfrm6_rcv,
+ .input_handler = xfrm_input,
.cb_handler = ah6_rcv_cb,
.err_handler = ah6_err,
.priority = 0,
@@ -805,13 +806,12 @@ static void __exit ah6_fini(void)
if (xfrm6_protocol_deregister(&ah6_protocol, IPPROTO_AH) < 0)
pr_info("%s: can't remove protocol\n", __func__);
- if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
-
+ xfrm_unregister_type(&ah6_type, AF_INET6);
}
module_init(ah6_init);
module_exit(ah6_fini);
+MODULE_DESCRIPTION("IPv6 AH transformation helpers");
MODULE_LICENSE("GPL");
MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_AH);
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 4e0ff7031edd..52599584422b 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Anycast support for IPv6
* Linux INET6 implementation
@@ -6,11 +7,6 @@
* David L Stevens (dlstevens@us.ibm.com)
*
* based heavily on net/ipv6/mcast.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/capability.h>
@@ -44,8 +40,26 @@
#include <net/checksum.h>
+#define IN6_ADDR_HSIZE_SHIFT 8
+#define IN6_ADDR_HSIZE BIT(IN6_ADDR_HSIZE_SHIFT)
+/* anycast address hash table
+ */
+static struct hlist_head inet6_acaddr_lst[IN6_ADDR_HSIZE];
+static DEFINE_SPINLOCK(acaddr_hash_lock);
+
+#define ac_dereference(a, idev) \
+ rcu_dereference_protected(a, lockdep_is_held(&(idev)->lock))
+
static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr);
+static u32 inet6_acaddr_hash(const struct net *net,
+ const struct in6_addr *addr)
+{
+ u32 val = __ipv6_addr_jhash(addr, net_hash_mix(net));
+
+ return hash_32(val, IN6_ADDR_HSIZE_SHIFT);
+}
+
/*
* socket join an anycast group
*/
@@ -53,14 +67,12 @@ static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr);
int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
{
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_ac_socklist *pac = NULL;
+ struct net *net = sock_net(sk);
+ netdevice_tracker dev_tracker;
struct net_device *dev = NULL;
struct inet6_dev *idev;
- struct ipv6_ac_socklist *pac;
- struct net *net = sock_net(sk);
- int ishost = !net->ipv6.devconf_all->forwarding;
- int err = 0;
-
- ASSERT_RTNL();
+ int err = 0, ishost;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
@@ -68,32 +80,43 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
return -EINVAL;
if (ifindex)
- dev = __dev_get_by_index(net, ifindex);
+ dev = netdev_get_by_index(net, ifindex, &dev_tracker, GFP_KERNEL);
- if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE))
- return -EINVAL;
+ if (ipv6_chk_addr_and_flags(net, addr, dev, true, 0, IFA_F_TENTATIVE)) {
+ err = -EINVAL;
+ goto error;
+ }
pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL);
- if (!pac)
- return -ENOMEM;
+ if (!pac) {
+ err = -ENOMEM;
+ goto error;
+ }
+
pac->acl_next = NULL;
pac->acl_addr = *addr;
+ ishost = !READ_ONCE(net->ipv6.devconf_all->forwarding);
+
if (ifindex == 0) {
struct rt6_info *rt;
+ rcu_read_lock();
rt = rt6_lookup(net, addr, NULL, 0, NULL, 0);
if (rt) {
- dev = rt->dst.dev;
+ dev = dst_dev_rcu(&rt->dst);
+ netdev_hold(dev, &dev_tracker, GFP_ATOMIC);
ip6_rt_put(rt);
} else if (ishost) {
+ rcu_read_unlock();
err = -EADDRNOTAVAIL;
goto error;
} else {
/* router, no matching interface: just pick one */
- dev = __dev_get_by_flags(net, IFF_UP,
- IFF_UP | IFF_LOOPBACK);
+ dev = netdev_get_by_flags_rcu(net, &dev_tracker, IFF_UP,
+ IFF_UP | IFF_LOOPBACK);
}
+ rcu_read_unlock();
}
if (!dev) {
@@ -101,7 +124,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
goto error;
}
- idev = __in6_dev_get(dev);
+ idev = in6_dev_get(dev);
if (!idev) {
if (ifindex)
err = -ENODEV;
@@ -109,8 +132,9 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
err = -EADDRNOTAVAIL;
goto error;
}
+
/* reset ishost, now that we have a specific device */
- ishost = !idev->cnf.forwarding;
+ ishost = !READ_ONCE(idev->cnf.forwarding);
pac->acl_ifindex = dev->ifindex;
@@ -123,7 +147,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
if (ishost)
err = -EADDRNOTAVAIL;
if (err)
- goto error;
+ goto error_idev;
}
err = __ipv6_dev_ac_inc(idev, addr);
@@ -133,7 +157,11 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
pac = NULL;
}
+error_idev:
+ in6_dev_put(idev);
error:
+ netdev_put(dev, &dev_tracker);
+
if (pac)
sock_kfree_s(sk, pac, sizeof(*pac));
return err;
@@ -144,12 +172,10 @@ error:
*/
int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct net_device *dev;
struct ipv6_ac_socklist *pac, *prev_pac;
+ struct ipv6_pinfo *np = inet6_sk(sk);
struct net *net = sock_net(sk);
-
- ASSERT_RTNL();
+ struct net_device *dev;
prev_pac = NULL;
for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) {
@@ -165,35 +191,33 @@ int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
else
np->ipv6_ac_list = pac->acl_next;
- dev = __dev_get_by_index(net, pac->acl_ifindex);
- if (dev)
+ dev = dev_get_by_index(net, pac->acl_ifindex);
+ if (dev) {
ipv6_dev_ac_dec(dev, &pac->acl_addr);
+ dev_put(dev);
+ }
sock_kfree_s(sk, pac, sizeof(*pac));
return 0;
}
-void ipv6_sock_ac_close(struct sock *sk)
+void __ipv6_sock_ac_close(struct sock *sk)
{
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct net *net = sock_net(sk);
struct net_device *dev = NULL;
struct ipv6_ac_socklist *pac;
- struct net *net = sock_net(sk);
- int prev_index;
+ int prev_index = 0;
- if (!np->ipv6_ac_list)
- return;
-
- rtnl_lock();
pac = np->ipv6_ac_list;
np->ipv6_ac_list = NULL;
- prev_index = 0;
while (pac) {
struct ipv6_ac_socklist *next = pac->acl_next;
if (pac->acl_ifindex != prev_index) {
- dev = __dev_get_by_index(net, pac->acl_ifindex);
+ dev_put(dev);
+ dev = dev_get_by_index(net, pac->acl_ifindex);
prev_index = pac->acl_ifindex;
}
if (dev)
@@ -201,7 +225,34 @@ void ipv6_sock_ac_close(struct sock *sk)
sock_kfree_s(sk, pac, sizeof(*pac));
pac = next;
}
- rtnl_unlock();
+
+ dev_put(dev);
+}
+
+void ipv6_sock_ac_close(struct sock *sk)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ if (!np->ipv6_ac_list)
+ return;
+
+ __ipv6_sock_ac_close(sk);
+}
+
+static void ipv6_add_acaddr_hash(struct net *net, struct ifacaddr6 *aca)
+{
+ unsigned int hash = inet6_acaddr_hash(net, &aca->aca_addr);
+
+ spin_lock(&acaddr_hash_lock);
+ hlist_add_head_rcu(&aca->aca_addr_lst, &inet6_acaddr_lst[hash]);
+ spin_unlock(&acaddr_hash_lock);
+}
+
+static void ipv6_del_acaddr_hash(struct ifacaddr6 *aca)
+{
+ spin_lock(&acaddr_hash_lock);
+ hlist_del_init_rcu(&aca->aca_addr_lst);
+ spin_unlock(&acaddr_hash_lock);
}
static void aca_get(struct ifacaddr6 *aca)
@@ -209,12 +260,18 @@ static void aca_get(struct ifacaddr6 *aca)
refcount_inc(&aca->aca_refcnt);
}
+static void aca_free_rcu(struct rcu_head *h)
+{
+ struct ifacaddr6 *aca = container_of(h, struct ifacaddr6, rcu);
+
+ fib6_info_release(aca->aca_rt);
+ kfree(aca);
+}
+
static void aca_put(struct ifacaddr6 *ac)
{
- if (refcount_dec_and_test(&ac->aca_refcnt)) {
- fib6_info_release(ac->aca_rt);
- kfree(ac);
- }
+ if (refcount_dec_and_test(&ac->aca_refcnt))
+ call_rcu_hurry(&ac->rcu, aca_free_rcu);
}
static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
@@ -229,6 +286,7 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
aca->aca_addr = *addr;
fib6_info_hold(f6i);
aca->aca_rt = f6i;
+ INIT_HLIST_NODE(&aca->aca_addr_lst);
aca->aca_users = 1;
/* aca_tstamp should be updated upon changes */
aca->aca_cstamp = aca->aca_tstamp = jiffies;
@@ -237,6 +295,37 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
return aca;
}
+static void inet6_ifacaddr_notify(struct net_device *dev,
+ const struct ifacaddr6 *ifaca, int event)
+{
+ struct inet6_fill_args fillargs = {
+ .event = event,
+ .netnsid = -1,
+ };
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+
+ skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
+ nla_total_size(sizeof(struct in6_addr)) +
+ nla_total_size(sizeof(struct ifa_cacheinfo)),
+ GFP_KERNEL);
+ if (!skb)
+ goto error;
+
+ err = inet6_fill_ifacaddr(skb, ifaca, &fillargs);
+ if (err < 0) {
+ pr_err("Failed to fill in anycast addresses (err %d)\n", err);
+ nlmsg_free(skb);
+ goto error;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ACADDR, NULL, GFP_KERNEL);
+ return;
+error:
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_ACADDR, err);
+}
+
/*
* device anycast group inc (add if not found)
*/
@@ -247,15 +336,14 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
struct net *net;
int err;
- ASSERT_RTNL();
-
write_lock_bh(&idev->lock);
if (idev->dead) {
err = -ENODEV;
goto out;
}
- for (aca = idev->ac_list; aca; aca = aca->aca_next) {
+ for (aca = ac_dereference(idev->ac_list, idev); aca;
+ aca = ac_dereference(aca->aca_next, idev)) {
if (ipv6_addr_equal(&aca->aca_addr, addr)) {
aca->aca_users++;
err = 0;
@@ -264,7 +352,7 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
}
net = dev_net(idev->dev);
- f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC);
+ f6i = addrconf_f6i_alloc(net, idev, addr, true, GFP_ATOMIC, NULL);
if (IS_ERR(f6i)) {
err = PTR_ERR(f6i);
goto out;
@@ -276,19 +364,23 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
goto out;
}
- aca->aca_next = idev->ac_list;
- idev->ac_list = aca;
-
/* Hold this for addrconf_join_solict() below before we unlock,
* it is already exposed via idev->ac_list.
*/
aca_get(aca);
+ aca->aca_next = idev->ac_list;
+ rcu_assign_pointer(idev->ac_list, aca);
+
write_unlock_bh(&idev->lock);
+ ipv6_add_acaddr_hash(net, aca);
+
ip6_ins_rt(net, f6i);
addrconf_join_solict(idev->dev, &aca->aca_addr);
+ inet6_ifacaddr_notify(idev->dev, aca, RTM_NEWANYCAST);
+
aca_put(aca);
return 0;
out:
@@ -303,11 +395,10 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
{
struct ifacaddr6 *aca, *prev_aca;
- ASSERT_RTNL();
-
write_lock_bh(&idev->lock);
prev_aca = NULL;
- for (aca = idev->ac_list; aca; aca = aca->aca_next) {
+ for (aca = ac_dereference(idev->ac_list, idev); aca;
+ aca = ac_dereference(aca->aca_next, idev)) {
if (ipv6_addr_equal(&aca->aca_addr, addr))
break;
prev_aca = aca;
@@ -321,26 +412,33 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
return 0;
}
if (prev_aca)
- prev_aca->aca_next = aca->aca_next;
+ rcu_assign_pointer(prev_aca->aca_next, aca->aca_next);
else
- idev->ac_list = aca->aca_next;
+ rcu_assign_pointer(idev->ac_list, aca->aca_next);
write_unlock_bh(&idev->lock);
+ ipv6_del_acaddr_hash(aca);
addrconf_leave_solict(idev, &aca->aca_addr);
- ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
+ ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false);
+
+ inet6_ifacaddr_notify(idev->dev, aca, RTM_DELANYCAST);
aca_put(aca);
return 0;
}
-/* called with rtnl_lock() */
static int ipv6_dev_ac_dec(struct net_device *dev, const struct in6_addr *addr)
{
- struct inet6_dev *idev = __in6_dev_get(dev);
+ struct inet6_dev *idev = in6_dev_get(dev);
+ int err;
if (!idev)
return -ENODEV;
- return __ipv6_dev_ac_dec(idev, addr);
+
+ err = __ipv6_dev_ac_dec(idev, addr);
+ in6_dev_put(idev);
+
+ return err;
}
void ipv6_ac_destroy_dev(struct inet6_dev *idev)
@@ -348,13 +446,15 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev)
struct ifacaddr6 *aca;
write_lock_bh(&idev->lock);
- while ((aca = idev->ac_list) != NULL) {
- idev->ac_list = aca->aca_next;
+ while ((aca = ac_dereference(idev->ac_list, idev)) != NULL) {
+ rcu_assign_pointer(idev->ac_list, aca->aca_next);
write_unlock_bh(&idev->lock);
+ ipv6_del_acaddr_hash(aca);
+
addrconf_leave_solict(idev, &aca->aca_addr);
- ip6_del_rt(dev_net(idev->dev), aca->aca_rt);
+ ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false);
aca_put(aca);
@@ -374,11 +474,10 @@ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *ad
idev = __in6_dev_get(dev);
if (idev) {
- read_lock_bh(&idev->lock);
- for (aca = idev->ac_list; aca; aca = aca->aca_next)
+ for (aca = rcu_dereference(idev->ac_list); aca;
+ aca = rcu_dereference(aca->aca_next))
if (ipv6_addr_equal(&aca->aca_addr, addr))
break;
- read_unlock_bh(&idev->lock);
return aca != NULL;
}
return false;
@@ -390,17 +489,27 @@ static bool ipv6_chk_acast_dev(struct net_device *dev, const struct in6_addr *ad
bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev,
const struct in6_addr *addr)
{
+ struct net_device *nh_dev;
+ struct ifacaddr6 *aca;
bool found = false;
rcu_read_lock();
if (dev)
found = ipv6_chk_acast_dev(dev, addr);
- else
- for_each_netdev_rcu(net, dev)
- if (ipv6_chk_acast_dev(dev, addr)) {
+ else {
+ unsigned int hash = inet6_acaddr_hash(net, addr);
+
+ hlist_for_each_entry_rcu(aca, &inet6_acaddr_lst[hash],
+ aca_addr_lst) {
+ nh_dev = fib6_info_nh_dev(aca->aca_rt);
+ if (!nh_dev || !net_eq(dev_net(nh_dev), net))
+ continue;
+ if (ipv6_addr_equal(&aca->aca_addr, addr)) {
found = true;
break;
}
+ }
+ }
rcu_read_unlock();
return found;
}
@@ -421,30 +530,25 @@ bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev,
struct ac6_iter_state {
struct seq_net_private p;
struct net_device *dev;
- struct inet6_dev *idev;
};
#define ac6_seq_private(seq) ((struct ac6_iter_state *)(seq)->private)
static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq)
{
- struct ifacaddr6 *im = NULL;
struct ac6_iter_state *state = ac6_seq_private(seq);
struct net *net = seq_file_net(seq);
+ struct ifacaddr6 *im = NULL;
- state->idev = NULL;
for_each_netdev_rcu(net, state->dev) {
struct inet6_dev *idev;
+
idev = __in6_dev_get(state->dev);
if (!idev)
continue;
- read_lock_bh(&idev->lock);
- im = idev->ac_list;
- if (im) {
- state->idev = idev;
+ im = rcu_dereference(idev->ac_list);
+ if (im)
break;
- }
- read_unlock_bh(&idev->lock);
}
return im;
}
@@ -452,22 +556,17 @@ static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq)
static struct ifacaddr6 *ac6_get_next(struct seq_file *seq, struct ifacaddr6 *im)
{
struct ac6_iter_state *state = ac6_seq_private(seq);
+ struct inet6_dev *idev;
- im = im->aca_next;
+ im = rcu_dereference(im->aca_next);
while (!im) {
- if (likely(state->idev != NULL))
- read_unlock_bh(&state->idev->lock);
-
state->dev = next_net_device_rcu(state->dev);
- if (!state->dev) {
- state->idev = NULL;
+ if (!state->dev)
break;
- }
- state->idev = __in6_dev_get(state->dev);
- if (!state->idev)
+ idev = __in6_dev_get(state->dev);
+ if (!idev)
continue;
- read_lock_bh(&state->idev->lock);
- im = state->idev->ac_list;
+ im = rcu_dereference(idev->ac_list);
}
return im;
}
@@ -499,12 +598,6 @@ static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static void ac6_seq_stop(struct seq_file *seq, void *v)
__releases(RCU)
{
- struct ac6_iter_state *state = ac6_seq_private(seq);
-
- if (likely(state->idev != NULL)) {
- read_unlock_bh(&state->idev->lock);
- state->idev = NULL;
- }
rcu_read_unlock();
}
@@ -540,3 +633,24 @@ void ac6_proc_exit(struct net *net)
remove_proc_entry("anycast6", net->proc_net);
}
#endif
+
+/* Init / cleanup code
+ *
+ * ipv6_anycast_init() puts each of the IN6_ADDR_HSIZE buckets of
+ * inet6_acaddr_lst into the empty state and always returns 0.
+ */
+int __init ipv6_anycast_init(void)
+{
+ int i;
+
+ for (i = 0; i < IN6_ADDR_HSIZE; i++)
+ INIT_HLIST_HEAD(&inet6_acaddr_lst[i]);
+ return 0;
+}
+/*
+ * ipv6_anycast_cleanup() - consistency check over the anycast hash table.
+ * Nothing is freed here: with acaddr_hash_lock held, every bucket of
+ * inet6_acaddr_lst is inspected and WARN_ON() fires for each one that is
+ * not empty.
+ */
+void ipv6_anycast_cleanup(void)
+{
+ int i;
+
+ spin_lock(&acaddr_hash_lock);
+ for (i = 0; i < IN6_ADDR_HSIZE; i++)
+ WARN_ON(!hlist_empty(&inet6_acaddr_lst[i]));
+ spin_unlock(&acaddr_hash_lock);
+}
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index 1c0bb9fb76e6..df1986973430 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* CALIPSO - Common Architecture Label IPv6 Security Option
*
@@ -6,25 +7,10 @@
*
* Authors: Paul Moore <paul.moore@hp.com>
* Huw Davies <huw@codeweavers.com>
- *
*/
/* (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008
* (c) Copyright Huw Davies <huw@codeweavers.com>, 2015
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- *
*/
#include <linux/init.h>
@@ -43,10 +29,10 @@
#include <net/calipso.h>
#include <linux/atomic.h>
#include <linux/bug.h>
-#include <asm/unaligned.h>
+#include <linux/unaligned.h>
#include <linux/crc-ccitt.h>
-/* Maximium size of the calipso option including
+/* Maximum size of the calipso option including
* the two-byte TLV header.
*/
#define CALIPSO_OPT_LEN_MAX (2 + 252)
@@ -56,13 +42,13 @@
*/
#define CALIPSO_HDR_LEN (2 + 8)
-/* Maximium size of the calipso option including
+/* Maximum size of the calipso option including
* the two-byte TLV header and upto 3 bytes of
* leading pad and 7 bytes of trailing pad.
*/
#define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7)
- /* Maximium size of u32 aligned buffer required to hold calipso
+ /* Maximum size of u32 aligned buffer required to hold calipso
* option. Max of 3 initial pad bytes starting from buffer + 3.
* i.e. the worst case is when the previous tlv finishes on 4n + 3.
*/
@@ -97,6 +83,9 @@ struct calipso_map_cache_entry {
static struct calipso_map_cache_bkt *calipso_cache;
+static void calipso_cache_invalidate(void);
+static void calipso_doi_putdef(struct calipso_doi *doi_def);
+
/* Label Mapping Cache Functions
*/
@@ -437,7 +426,7 @@ static void calipso_doi_free_rcu(struct rcu_head *entry)
/**
* calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine
* @doi: the DOI value
- * @audit_secid: the LSM secid to use in the audit message
+ * @audit_info: NetLabel audit information
*
* Description:
* Removes a DOI definition from the CALIPSO engine. The NetLabel routines will
@@ -458,15 +447,10 @@ static int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info)
ret_val = -ENOENT;
goto doi_remove_return;
}
- if (!refcount_dec_and_test(&doi_def->refcount)) {
- spin_unlock(&calipso_doi_list_lock);
- ret_val = -EBUSY;
- goto doi_remove_return;
- }
list_del_rcu(&doi_def->list);
spin_unlock(&calipso_doi_list_lock);
- call_rcu(&doi_def->rcu, calipso_doi_free_rcu);
+ calipso_doi_putdef(doi_def);
ret_val = 0;
doi_remove_return:
@@ -522,10 +506,8 @@ static void calipso_doi_putdef(struct calipso_doi *doi_def)
if (!refcount_dec_and_test(&doi_def->refcount))
return;
- spin_lock(&calipso_doi_list_lock);
- list_del_rcu(&doi_def->list);
- spin_unlock(&calipso_doi_list_lock);
+ calipso_cache_invalidate();
call_rcu(&doi_def->rcu, calipso_doi_free_rcu);
}
@@ -675,11 +657,8 @@ static int calipso_map_cat_ntoh(const struct calipso_doi *doi_def,
net_clen_bits,
spot + 1,
1);
- if (spot < 0) {
- if (spot == -2)
- return -EFAULT;
+ if (spot < 0)
return 0;
- }
ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat,
spot,
@@ -775,7 +754,7 @@ static int calipso_genopt(unsigned char *buf, u32 start, u32 buf_len,
calipso[1] = len - 2;
*(__be32 *)(calipso + 2) = htonl(doi_def->doi);
calipso[6] = (len - CALIPSO_HDR_LEN) / 4;
- calipso[7] = secattr->attr.mls.lvl,
+ calipso[7] = secattr->attr.mls.lvl;
crc = ~crc_ccitt(0xffff, calipso, len);
calipso[8] = crc & 0xff;
calipso[9] = (crc >> 8) & 0xff;
@@ -1061,7 +1040,8 @@ static int calipso_opt_getattr(const unsigned char *calipso,
goto getattr_return;
}
- secattr->flags |= NETLBL_SECATTR_MLS_CAT;
+ if (secattr->attr.mls.cat)
+ secattr->flags |= NETLBL_SECATTR_MLS_CAT;
}
secattr->type = NETLBL_NLTYPE_CALIPSO;
@@ -1092,8 +1072,13 @@ static int calipso_sock_getattr(struct sock *sk,
struct ipv6_opt_hdr *hop;
int opt_len, len, ret_val = -ENOMSG, offset;
unsigned char *opt;
- struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk));
+ struct ipv6_pinfo *pinfo = inet6_sk(sk);
+ struct ipv6_txoptions *txopts;
+ if (!pinfo)
+ return -EAFNOSUPPORT;
+
+ txopts = txopt_get(pinfo);
if (!txopts || !txopts->hopopt)
goto done;
@@ -1145,8 +1130,13 @@ static int calipso_sock_setattr(struct sock *sk,
{
int ret_val;
struct ipv6_opt_hdr *old, *new;
- struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk));
+ struct ipv6_pinfo *pinfo = inet6_sk(sk);
+ struct ipv6_txoptions *txopts;
+
+ if (!pinfo)
+ return -EAFNOSUPPORT;
+ txopts = txopt_get(pinfo);
old = NULL;
if (txopts)
old = txopts->hopopt;
@@ -1173,8 +1163,13 @@ static int calipso_sock_setattr(struct sock *sk,
static void calipso_sock_delattr(struct sock *sk)
{
struct ipv6_opt_hdr *new_hop;
- struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk));
+ struct ipv6_pinfo *pinfo = inet6_sk(sk);
+ struct ipv6_txoptions *txopts;
+
+ if (!pinfo)
+ return;
+ txopts = txopt_get(pinfo);
if (!txopts || !txopts->hopopt)
goto done;
@@ -1212,6 +1207,10 @@ static int calipso_req_setattr(struct request_sock *req,
struct ipv6_opt_hdr *old, *new;
struct sock *sk = sk_to_full_sk(req_to_sk(req));
+ /* sk is NULL for SYN+ACK w/ SYN Cookie */
+ if (!sk)
+ return -ENOMEM;
+
if (req_inet->ipv6_opt && req_inet->ipv6_opt->hopopt)
old = req_inet->ipv6_opt->hopopt;
else
@@ -1239,7 +1238,7 @@ static int calipso_req_setattr(struct request_sock *req,
/**
* calipso_req_delattr - Delete the CALIPSO option from a request socket
- * @reg: the request socket
+ * @req: the request socket
*
* Description:
* Removes the CALIPSO option from a request socket, if present.
@@ -1252,6 +1251,10 @@ static void calipso_req_delattr(struct request_sock *req)
struct ipv6_txoptions *txopts;
struct sock *sk = sk_to_full_sk(req_to_sk(req));
+ /* sk is NULL for SYN+ACK w/ SYN Cookie */
+ if (!sk)
+ return;
+
if (!req_inet->ipv6_opt || !req_inet->ipv6_opt->hopopt)
return;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 1ede7a16a0be..83e03176819c 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* common UDP/RAW code
* Linux INET6 implementation
*
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/capability.h>
@@ -23,6 +19,7 @@
#include <linux/route.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/icmp.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
@@ -31,6 +28,7 @@
#include <net/ip6_route.h>
#include <net/tcp_states.h>
#include <net/dsfield.h>
+#include <net/sock_reuseport.h>
#include <linux/errqueue.h>
#include <linux/uaccess.h>
@@ -40,29 +38,35 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a)
return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0);
}
-static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk)
+static void ip6_datagram_flow_key_init(struct flowi6 *fl6,
+ const struct sock *sk)
{
- struct inet_sock *inet = inet_sk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ int oif = sk->sk_bound_dev_if;
memset(fl6, 0, sizeof(*fl6));
fl6->flowi6_proto = sk->sk_protocol;
fl6->daddr = sk->sk_v6_daddr;
fl6->saddr = np->saddr;
- fl6->flowi6_oif = sk->sk_bound_dev_if;
fl6->flowi6_mark = sk->sk_mark;
fl6->fl6_dport = inet->inet_dport;
fl6->fl6_sport = inet->inet_sport;
- fl6->flowlabel = np->flow_label;
- fl6->flowi6_uid = sk->sk_uid;
+ fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label);
+ fl6->flowi6_uid = sk_uid(sk);
- if (!fl6->flowi6_oif)
- fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
+ if (!oif)
+ oif = np->sticky_pktinfo.ipi6_ifindex;
- if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr))
- fl6->flowi6_oif = np->mcast_oif;
+ if (!oif) {
+ if (ipv6_addr_is_multicast(&fl6->daddr))
+ oif = READ_ONCE(np->mcast_oif);
+ else
+ oif = READ_ONCE(np->ucast_oif);
+ }
- security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
+ fl6->flowi6_oif = oif;
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
}
int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
@@ -76,9 +80,10 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
struct flowi6 fl6;
int err = 0;
- if (np->sndflow && (np->flow_label & IPV6_FLOWLABEL_MASK)) {
+ if (inet6_test_bit(SNDFLOW, sk) &&
+ (np->flow_label & IPV6_FLOWLABEL_MASK)) {
flowlabel = fl6_sock_lookup(sk, np->flow_label);
- if (!flowlabel)
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
ip6_datagram_flow_key_init(&fl6, sk);
@@ -88,7 +93,7 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
final_p = fl6_update_dst(&fl6, opt, &final);
rcu_read_unlock();
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto out;
@@ -122,7 +127,7 @@ void ip6_datagram_release_cb(struct sock *sk)
rcu_read_lock();
dst = __sk_dst_get(sk);
- if (!dst || !dst->obsolete ||
+ if (!dst || !READ_ONCE(dst->obsolete) ||
dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) {
rcu_read_unlock();
return;
@@ -133,7 +138,7 @@ void ip6_datagram_release_cb(struct sock *sk)
}
EXPORT_SYMBOL_GPL(ip6_datagram_release_cb);
-int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
+int __ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
int addr_len)
{
struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
@@ -147,7 +152,7 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
int err;
if (usin->sin6_family == AF_INET) {
- if (__ipv6_only_sock(sk))
+ if (ipv6_only_sock(sk))
return -EAFNOSUPPORT;
err = __ip4_datagram_connect(sk, uaddr, addr_len);
goto ipv4_connected;
@@ -159,7 +164,7 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
if (usin->sin6_family != AF_INET6)
return -EAFNOSUPPORT;
- if (np->sndflow)
+ if (inet6_test_bit(SNDFLOW, sk))
fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
if (ipv6_addr_any(&usin->sin6_addr)) {
@@ -180,7 +185,7 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
if (addr_type & IPV6_ADDR_MAPPED) {
struct sockaddr_in sin;
- if (__ipv6_only_sock(sk)) {
+ if (ipv6_only_sock(sk)) {
err = -ENETUNREACH;
goto out;
}
@@ -189,7 +194,7 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr,
sin.sin_port = usin->sin6_port;
err = __ip4_datagram_connect(sk,
- (struct sockaddr *) &sin,
+ (struct sockaddr_unsized *)&sin,
sizeof(sin));
ipv4_connected:
@@ -220,11 +225,11 @@ ipv4_connected:
err = -EINVAL;
goto out;
}
- sk->sk_bound_dev_if = usin->sin6_scope_id;
+ WRITE_ONCE(sk->sk_bound_dev_if, usin->sin6_scope_id);
}
if (!sk->sk_bound_dev_if && (addr_type & IPV6_ADDR_MULTICAST))
- sk->sk_bound_dev_if = np->mcast_oif;
+ WRITE_ONCE(sk->sk_bound_dev_if, READ_ONCE(np->mcast_oif));
/* Connect to link-local address requires an interface */
if (!sk->sk_bound_dev_if) {
@@ -258,6 +263,7 @@ ipv4_connected:
goto out;
}
+ reuseport_has_conns_set(sk);
sk->sk_state = TCP_ESTABLISHED;
sk_set_txhash(sk);
out:
@@ -265,7 +271,7 @@ out:
}
EXPORT_SYMBOL_GPL(__ip6_datagram_connect);
-int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+int ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len)
{
int res;
@@ -276,7 +282,7 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
}
EXPORT_SYMBOL_GPL(ip6_datagram_connect);
-int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr,
+int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr_unsized *uaddr,
int addr_len)
{
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, uaddr);
@@ -286,14 +292,24 @@ int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr,
}
EXPORT_SYMBOL_GPL(ip6_datagram_connect_v6_only);
+static void ipv6_icmp_error_rfc4884(const struct sk_buff *skb,
+ struct sock_ee_data_rfc4884 *out)
+{ /* Pass skb and out to ip_icmp_error_rfc4884() for the two ICMPv6 types below */
+ switch (icmp6_hdr(skb)->icmp6_type) { /* no default: any other type is a no-op */
+ case ICMPV6_TIME_EXCEED:
+ case ICMPV6_DEST_UNREACH:
+ ip_icmp_error_rfc4884(skb, out, sizeof(struct icmp6hdr),
+ icmp6_hdr(skb)->icmp6_datagram_len * 8); /* header length field times 8 (8-octet units per RFC 4884 - confirm) */
+ }
+}
+
void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
__be16 port, u32 info, u8 *payload)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
struct icmp6hdr *icmph = icmp6_hdr(skb);
struct sock_exterr_skb *serr;
- if (!np->recverr)
+ if (!inet6_test_bit(RECVERR6, sk))
return;
skb = skb_clone(skb, GFP_ATOMIC);
@@ -315,20 +331,24 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err,
serr->port = port;
__skb_pull(skb, payload - skb->data);
+
+ if (inet6_test_bit(RECVERR6_RFC4884, sk))
+ ipv6_icmp_error_rfc4884(skb, &serr->ee.ee_rfc4884);
+
skb_reset_transport_header(skb);
if (sock_queue_err_skb(sk, skb))
kfree_skb(skb);
}
+EXPORT_SYMBOL_GPL(ipv6_icmp_error);
void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info)
{
- const struct ipv6_pinfo *np = inet6_sk(sk);
struct sock_exterr_skb *serr;
struct ipv6hdr *iph;
struct sk_buff *skb;
- if (!np->recverr)
+ if (!inet6_test_bit(RECVERR6, sk))
return;
skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
@@ -341,6 +361,7 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info)
skb_reset_network_header(skb);
iph = ipv6_hdr(skb);
iph->daddr = fl6->daddr;
+ ip6_flow_hdr(iph, 0, 0);
serr = SKB_EXT_ERR(skb);
serr->ee.ee_errno = err;
@@ -471,7 +492,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
const struct ipv6hdr *ip6h = container_of((struct in6_addr *)(nh + serr->addr_offset),
struct ipv6hdr, daddr);
sin->sin6_addr = ip6h->daddr;
- if (np->sndflow)
+ if (inet6_test_bit(SNDFLOW, sk))
sin->sin6_flowinfo = ip6_flowinfo(ip6h);
sin->sin6_scope_id =
ipv6_iface_scope_id(&sin->sin6_addr,
@@ -502,7 +523,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
} else {
ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr,
&sin->sin6_addr);
- if (inet_sk(sk)->cmsg_flags)
+ if (inet_cmsg_flags(inet_sk(sk)))
ip_cmsg_recv(msg, skb);
}
}
@@ -700,17 +721,15 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
}
if (np->rxopt.bits.rxorigdstaddr) {
struct sockaddr_in6 sin6;
- __be16 *ports;
- int end;
+ __be16 _ports[2], *ports;
- end = skb_transport_offset(skb) + 4;
- if (end <= 0 || pskb_may_pull(skb, end)) {
+ ports = skb_header_pointer(skb, skb_transport_offset(skb),
+ sizeof(_ports), &_ports);
+ if (ports) {
/* All current transport protocols have the port numbers in the
* first four bytes of the transport header and this function is
* written with this assumption in mind.
*/
- ports = (__be16 *)skb_transport_header(skb);
-
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = ipv6_hdr(skb)->daddr;
sin6.sin6_port = ports[1];
@@ -758,7 +777,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
}
if (cmsg->cmsg_level == SOL_SOCKET) {
- err = __sock_cmsg_send(sk, msg, cmsg, &ipc6->sockc);
+ err = __sock_cmsg_send(sk, cmsg, &ipc6->sockc);
if (err)
return err;
continue;
@@ -772,6 +791,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
case IPV6_2292PKTINFO:
{
struct net_device *dev = NULL;
+ int src_idx;
if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) {
err = -EINVAL;
@@ -779,12 +799,15 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
}
src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg);
+ src_idx = src_info->ipi6_ifindex;
- if (src_info->ipi6_ifindex) {
+ if (src_idx) {
if (fl6->flowi6_oif &&
- src_info->ipi6_ifindex != fl6->flowi6_oif)
+ src_idx != fl6->flowi6_oif &&
+ (READ_ONCE(sk->sk_bound_dev_if) != fl6->flowi6_oif ||
+ !sk_dev_equal_l3scope(sk, src_idx)))
return -EINVAL;
- fl6->flowi6_oif = src_info->ipi6_ifindex;
+ fl6->flowi6_oif = src_idx;
}
addr_type = __ipv6_addr_type(&src_info->ipi6_addr);
@@ -1031,7 +1054,7 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp,
src = &sp->sk_v6_rcv_saddr;
seq_printf(seq,
"%5d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
- "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %d\n",
+ "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u\n",
bucket,
src->s6_addr32[0], src->s6_addr32[1],
src->s6_addr32[2], src->s6_addr32[3], srcp,
@@ -1041,9 +1064,9 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp,
sk_wmem_alloc_get(sp),
rqueue,
0, 0L, 0,
- from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
+ from_kuid_munged(seq_user_ns(seq), sk_uid(sp)),
0,
sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
- atomic_read(&sp->sk_drops));
+ sk_drops_read(sp));
}
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 88a7579c23bd..e75da98f5283 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C)2002 USAGI/WIDE Project
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- *
* Authors
*
* Mitsuru KANDA @USAGI : IPv6 Support
@@ -38,11 +26,17 @@
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <net/ip6_checksum.h>
#include <net/ip6_route.h>
#include <net/icmp.h>
#include <net/ipv6.h>
#include <net/protocol.h>
+#include <net/udp.h>
#include <linux/icmpv6.h>
+#include <net/tcp.h>
+#include <net/espintcp.h>
+#include <net/inet6_hashtables.h>
+#include <linux/skbuff_ref.h>
#include <linux/highmem.h>
@@ -51,9 +45,12 @@ struct esp_skb_cb {
void *tmp;
};
-#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
+struct esp_output_extra { /* per-packet scratch data; located in the tmp buffer via esp_tmp_extra() */
+ __be32 seqhi; /* word esp_output_set_esn() is about to overwrite; esp_restore_header() puts it back */
+ u32 esphoff; /* ESP header offset from skb_transport_header(), recorded by esp_output_set_esn() */
+};
-static u32 esp6_get_mtu(struct xfrm_state *x, int mtu);
+#define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0]))
/*
* Allocate an AEAD request structure with extra space for SG and IV.
@@ -86,9 +83,9 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen)
return kmalloc(len, GFP_ATOMIC);
}
-static inline __be32 *esp_tmp_seqhi(void *tmp)
+static inline void *esp_tmp_extra(void *tmp)
{
- return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32));
+ return PTR_ALIGN(tmp, __alignof__(struct esp_output_extra));
}
static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen)
@@ -116,18 +113,18 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
__alignof__(struct scatterlist));
}
-static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp, struct sk_buff *skb)
{
struct crypto_aead *aead = x->data;
- int seqhilen = 0;
+ int extralen = 0;
u8 *iv;
struct aead_request *req;
struct scatterlist *sg;
if (x->props.flags & XFRM_STATE_ESN)
- seqhilen += sizeof(__be32);
+ extralen += sizeof(struct esp_output_extra);
- iv = esp_tmp_iv(aead, tmp, seqhilen);
+ iv = esp_tmp_iv(aead, tmp, extralen);
req = esp_tmp_req(aead, iv);
/* Unref skb_frag_pages in the src scatterlist if necessary.
@@ -135,25 +132,133 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
*/
if (req->src != req->dst)
for (sg = sg_next(req->src); sg; sg = sg_next(sg))
- put_page(sg_page(sg));
+ skb_page_unref(page_to_netmem(sg_page(sg)),
+ skb->pp_recycle);
+}
+
+#ifdef CONFIG_INET6_ESPINTCP
+static struct sock *esp6_find_tcp_sk(struct xfrm_state *x)
+{ /* Return the established socket for @x that passes tcp_is_ulp_esp(), or an ERR_PTR() */
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct net *net = xs_net(x);
+ __be16 sport, dport;
+ struct sock *sk;
+
+ spin_lock_bh(&x->lock); /* copy both encap ports while holding the state lock */
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ spin_unlock_bh(&x->lock);
+
+ sk = __inet6_lookup_established(net, &x->id.daddr.in6, dport,
+ &x->props.saddr.in6, ntohs(sport), 0, 0);
+ if (!sk)
+ return ERR_PTR(-ENOENT); /* the lookup returned no socket */
+
+ if (!tcp_is_ulp_esp(sk)) {
+ sock_put(sk); /* rejected socket: release it before failing */
+ return ERR_PTR(-EINVAL);
+ }
+
+ return sk; /* the callers in this file sock_put() it when done */
+}
+/*
+ * esp_output_tcp_finish() - hand @skb to the TCP socket found for @x.
+ *
+ * The whole body runs under rcu_read_lock().  If esp6_find_tcp_sk() fails,
+ * @skb is freed and that error is returned.  Otherwise, with bh_lock_sock()
+ * held, @skb goes to espintcp_queue_out() when sock_owned_by_user() is true
+ * and to espintcp_push_skb() when it is not; the result of that call is
+ * returned.
+ */
+static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct sock *sk;
+ int err;
+
+ rcu_read_lock();
+
+ sk = esp6_find_tcp_sk(x);
+ err = PTR_ERR_OR_ZERO(sk); /* 0 for a valid socket, the encoded errno otherwise */
+ if (err) {
+ kfree_skb(skb);
+ goto out;
+ }
+
+ bh_lock_sock(sk);
+ if (sock_owned_by_user(sk))
+ err = espintcp_queue_out(sk, skb);
+ else
+ err = espintcp_push_skb(sk, skb);
+ bh_unlock_sock(sk);
+
+ sock_put(sk); /* done with the socket obtained from esp6_find_tcp_sk() */
+
+out:
+ rcu_read_unlock();
+ return err;
+}
+/*
+ * Callback handed to xfrm_trans_queue_net() by esp_output_tail_tcp().  It
+ * takes the xfrm state from the skb's dst entry and forwards to
+ * esp_output_tcp_finish().  @net and @sk are not used.
+ */
+static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct xfrm_state *x = dst->xfrm;
+
+ return esp_output_tcp_finish(x, skb);
+}
+/*
+ * esp_output_tail_tcp() - queue @skb through xfrm_trans_queue_net() with
+ * bottom halves disabled, naming esp_output_tcp_encap_cb() as the callback.
+ * Returns the queueing error when there is one and -EINPROGRESS otherwise.
+ */
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{
+ int err;
+
+ local_bh_disable();
+ err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb);
+ local_bh_enable();
+
+ /* EINPROGRESS just happens to do the right thing. It
+ * actually means that the skb has been consumed and
+ * isn't coming back.
+ */
+ return err ?: -EINPROGRESS;
+}
+#else
+static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb)
+{ /* Variant compiled when CONFIG_INET6_ESPINTCP is not defined */
+ WARN_ON(1); /* presumably unreachable in this configuration - confirm against callers */
+ return -EOPNOTSUPP;
+}
+#endif
+/*
+ * esp_output_encap_csum() - fill in the UDP checksum of an encapsulated
+ * packet.  Does nothing unless the byte at skb_mac_header() is IPPROTO_UDP,
+ * which is the value esp6_output_udp_encap() stores there.
+ */
+static void esp_output_encap_csum(struct sk_buff *skb)
+{
+ /* UDP encap with IPv6 requires a valid checksum */
+ if (*skb_mac_header(skb) == IPPROTO_UDP) {
+ struct udphdr *uh = udp_hdr(skb);
+ struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int len = ntohs(uh->len);
+ unsigned int offset = skb_transport_offset(skb);
+ __wsum csum = skb_checksum(skb, offset, skb->len - offset, 0); /* sum from the transport header to the end of the skb */
+
+ uh->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ len, IPPROTO_UDP, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0; /* a computed 0 is replaced by CSUM_MANGLED_0 */
+ }
}
-static void esp_output_done(struct crypto_async_request *base, int err)
+static void esp_output_done(void *data, int err)
{
- struct sk_buff *skb = base->data;
+ struct sk_buff *skb = data;
struct xfrm_offload *xo = xfrm_offload(skb);
void *tmp;
struct xfrm_state *x;
- if (xo && (xo->flags & XFRM_DEV_RESUME))
- x = skb->sp->xvec[skb->sp->len - 1];
- else
+ if (xo && (xo->flags & XFRM_DEV_RESUME)) {
+ struct sec_path *sp = skb_sec_path(skb);
+
+ x = sp->xvec[sp->len - 1];
+ } else {
x = skb_dst(skb)->xfrm;
+ }
tmp = ESP_SKB_CB(skb)->tmp;
- esp_ssg_unref(x, tmp);
+ esp_ssg_unref(x, tmp, skb);
kfree(tmp);
+ esp_output_encap_csum(skb);
+
if (xo && (xo->flags & XFRM_DEV_RESUME)) {
if (err) {
XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR);
@@ -165,7 +270,11 @@ static void esp_output_done(struct crypto_async_request *base, int err)
secpath_reset(skb);
xfrm_dev_resume(skb);
} else {
- xfrm_output_resume(skb, err);
+ if (!err &&
+ x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+ esp_output_tail_tcp(x, skb);
+ else
+ xfrm_output_resume(skb_to_full_sk(skb), skb, err);
}
}
@@ -174,7 +283,7 @@ static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
{
struct ip_esp_hdr *esph = (void *)(skb->data + offset);
void *tmp = ESP_SKB_CB(skb)->tmp;
- __be32 *seqhi = esp_tmp_seqhi(tmp);
+ __be32 *seqhi = esp_tmp_extra(tmp);
esph->seq_no = esph->spi;
esph->spi = *seqhi;
@@ -182,27 +291,36 @@ static void esp_restore_header(struct sk_buff *skb, unsigned int offset)
static void esp_output_restore_header(struct sk_buff *skb)
{
- esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32));
+ void *tmp = ESP_SKB_CB(skb)->tmp;
+ struct esp_output_extra *extra = esp_tmp_extra(tmp);
+
+ esp_restore_header(skb, skb_transport_offset(skb) + extra->esphoff -
+ sizeof(__be32));
}
static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb,
struct xfrm_state *x,
struct ip_esp_hdr *esph,
- __be32 *seqhi)
+ struct esp_output_extra *extra)
{
/* For ESN we move the header forward by 4 bytes to
- * accomodate the high bits. We will move it back after
+ * accommodate the high bits. We will move it back after
* encryption.
*/
if ((x->props.flags & XFRM_STATE_ESN)) {
+ __u32 seqhi;
struct xfrm_offload *xo = xfrm_offload(skb);
- esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
- *seqhi = esph->spi;
if (xo)
- esph->seq_no = htonl(xo->seq.hi);
+ seqhi = xo->seq.hi;
else
- esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+ seqhi = XFRM_SKB_CB(skb)->seq.output.hi;
+
+ extra->esphoff = (unsigned char *)esph -
+ skb_transport_header(skb);
+ esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
+ extra->seqhi = esph->spi;
+ esph->seq_no = htonl(seqhi);
}
esph->spi = x->id.spi;
@@ -210,39 +328,127 @@ static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb,
return esph;
}
-static void esp_output_done_esn(struct crypto_async_request *base, int err)
+static void esp_output_done_esn(void *data, int err)
{
- struct sk_buff *skb = base->data;
+ struct sk_buff *skb = data;
esp_output_restore_header(skb);
- esp_output_done(base, err);
+ esp_output_done(data, err);
}
-static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto)
+static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb,
+ int encap_type,
+ struct esp_info *esp,
+ __be16 sport,
+ __be16 dport)
+{ /* Write a UDP header at esp->esph and return the address just past it; encap_type is unused */
+ struct udphdr *uh;
+ unsigned int len;
+
+ len = skb->len + esp->tailen - skb_transport_offset(skb); /* transport header to end of skb, plus esp->tailen */
+ if (len > U16_MAX)
+ return ERR_PTR(-EMSGSIZE); /* would not fit the 16-bit UDP length field */
+
+ uh = (struct udphdr *)esp->esph;
+ uh->source = sport;
+ uh->dest = dport;
+ uh->len = htons(len);
+ uh->check = 0; /* esp_output_encap_csum() computes the real value */
+
+ *skb_mac_header(skb) = IPPROTO_UDP; /* marker tested by esp_output_encap_csum() */
+
+ return (struct ip_esp_hdr *)(uh + 1);
+}
+
+#ifdef CONFIG_INET6_ESPINTCP
+static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x,
+ struct sk_buff *skb,
+ struct esp_info *esp)
+{ /* Write a 16-bit length prefix at esp->esph and return the address just past it */
+ __be16 *lenp = (void *)esp->esph;
+ struct ip_esp_hdr *esph;
+ unsigned int len;
+ struct sock *sk;
+
+ len = skb->len + esp->tailen - skb_transport_offset(skb); /* transport header to end of skb, plus esp->tailen */
+ if (len > IP_MAX_MTU)
+ return ERR_PTR(-EMSGSIZE);
+
+ rcu_read_lock();
+ sk = esp6_find_tcp_sk(x); /* used only as an existence check; see the sock_put() below */
+ rcu_read_unlock();
+
+ if (IS_ERR(sk))
+ return ERR_CAST(sk); /* propagate the lookup error as an ip_esp_hdr ERR_PTR */
+
+ sock_put(sk); /* the socket itself is not needed in this function */
+
+ *lenp = htons(len);
+ esph = (struct ip_esp_hdr *)(lenp + 1);
+
+ return esph;
+}
+#else
+static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x,
+ struct sk_buff *skb,
+ struct esp_info *esp)
{
- /* Fill padding... */
- if (tfclen) {
- memset(tail, 0, tfclen);
- tail += tfclen;
+ return ERR_PTR(-EOPNOTSUPP);
+}
+#endif
+
+static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb,
+ struct esp_info *esp)
+{
+ struct xfrm_encap_tmpl *encap = x->encap;
+ struct ip_esp_hdr *esph;
+ __be16 sport, dport;
+ int encap_type;
+
+ spin_lock_bh(&x->lock);
+ sport = encap->encap_sport;
+ dport = encap->encap_dport;
+ encap_type = encap->encap_type;
+ spin_unlock_bh(&x->lock);
+
+ switch (encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport);
+ break;
+ case TCP_ENCAP_ESPINTCP:
+ esph = esp6_output_tcp_encap(x, skb, esp);
+ break;
}
- do {
- int i;
- for (i = 0; i < plen - 2; i++)
- tail[i] = i + 1;
- } while (0);
- tail[plen - 2] = plen - 2;
- tail[plen - 1] = proto;
+
+ if (IS_ERR(esph))
+ return PTR_ERR(esph);
+
+ esp->esph = esph;
+
+ return 0;
}
int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp)
{
u8 *tail;
- u8 *vaddr;
int nfrags;
+ int esph_offset;
struct page *page;
struct sk_buff *trailer;
int tailen = esp->tailen;
+ if (x->encap) {
+ int err = esp6_output_encap(x, skb, esp);
+
+ if (err < 0)
+ return err;
+ }
+
+ if (ALIGN(tailen, L1_CACHE_BYTES) > PAGE_SIZE ||
+ ALIGN(skb->data_len, L1_CACHE_BYTES) > PAGE_SIZE)
+ goto cow;
+
if (!skb_cloned(skb)) {
if (tailen <= skb_tailroom(skb)) {
nfrags = 1;
@@ -270,14 +476,10 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
page = pfrag->page;
get_page(page);
- vaddr = kmap_atomic(page);
-
- tail = vaddr + pfrag->offset;
+ tail = page_address(page) + pfrag->offset;
esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
- kunmap_atomic(vaddr);
-
nfrags = skb_shinfo(skb)->nr_frags;
__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
@@ -293,7 +495,7 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
skb->len += tailen;
skb->data_len += tailen;
skb->truesize += tailen;
- if (sk)
+ if (sk && sk_fullsock(sk))
refcount_add(tailen, &sk->sk_wmem_alloc);
goto out;
@@ -301,10 +503,13 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
}
cow:
+ esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb);
+
nfrags = skb_cow_data(skb, tailen, &trailer);
if (nfrags < 0)
goto out;
tail = skb_tail_pointer(trailer);
+ esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset);
skip_cow:
esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto);
@@ -322,20 +527,20 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
void *tmp;
int ivlen;
int assoclen;
- int seqhilen;
- __be32 *seqhi;
+ int extralen;
struct page *page;
struct ip_esp_hdr *esph;
struct aead_request *req;
struct crypto_aead *aead;
struct scatterlist *sg, *dsg;
+ struct esp_output_extra *extra;
int err = -ENOMEM;
assoclen = sizeof(struct ip_esp_hdr);
- seqhilen = 0;
+ extralen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
- seqhilen += sizeof(__be32);
+ extralen += sizeof(*extra);
assoclen += sizeof(__be32);
}
@@ -343,12 +548,12 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
alen = crypto_aead_authsize(aead);
ivlen = crypto_aead_ivsize(aead);
- tmp = esp_alloc_tmp(aead, esp->nfrags + 2, seqhilen);
+ tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen);
if (!tmp)
goto error;
- seqhi = esp_tmp_seqhi(tmp);
- iv = esp_tmp_iv(aead, tmp, seqhilen);
+ extra = esp_tmp_extra(tmp);
+ iv = esp_tmp_iv(aead, tmp, extralen);
req = esp_tmp_req(aead, iv);
sg = esp_req_sg(aead, req);
@@ -357,7 +562,8 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
else
dsg = &sg[esp->nfrags];
- esph = esp_output_set_esn(skb, x, ip_esp_hdr(skb), seqhi);
+ esph = esp_output_set_esn(skb, x, esp->esph, extra);
+ esp->esph = esph;
sg_init_table(sg, esp->nfrags);
err = skb_to_sgvec(skb, sg,
@@ -421,10 +627,14 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
case 0:
if ((x->props.flags & XFRM_STATE_ESN))
esp_output_restore_header(skb);
+ esp_output_encap_csum(skb);
}
if (sg != dsg)
- esp_ssg_unref(x, tmp);
+ esp_ssg_unref(x, tmp, skb);
+
+ if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
+ err = esp_output_tail_tcp(x, skb);
error_free:
kfree(tmp);
@@ -456,7 +666,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb);
u32 padto;
- padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached));
+ padto = min(x->tfcpad, xfrm_state_mtu(x, dst->child_mtu_cached));
if (skb->len < padto)
esp.tfclen = padto - skb->len;
}
@@ -465,11 +675,13 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
esp.plen = esp.clen - skb->len - esp.tfclen;
esp.tailen = esp.tfclen + esp.plen + alen;
+ esp.esph = ip_esp_hdr(skb);
+
esp.nfrags = esp6_output_head(x, skb, &esp);
if (esp.nfrags < 0)
return esp.nfrags;
- esph = ip_esp_hdr(skb);
+ esph = esp.esph;
esph->spi = x->id.spi;
esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
@@ -484,7 +696,6 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
static inline int esp_remove_trailer(struct sk_buff *skb)
{
struct xfrm_state *x = xfrm_input_state(skb);
- struct xfrm_offload *xo = xfrm_offload(skb);
struct crypto_aead *aead = x->data;
int alen, hlen, elen;
int padlen, trimlen;
@@ -496,11 +707,6 @@ static inline int esp_remove_trailer(struct sk_buff *skb)
hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
elen = skb->len - hlen;
- if (xo && (xo->flags & XFRM_ESP_NO_TRAILER)) {
- ret = xo->proto;
- goto out;
- }
-
ret = skb_copy_bits(skb, skb->len - alen - 2, nexthdr, 2);
BUG_ON(ret);
@@ -518,7 +724,9 @@ static inline int esp_remove_trailer(struct sk_buff *skb)
skb->csum = csum_block_sub(skb->csum, csumdiff,
skb->len - trimlen);
}
- pskb_trim(skb, skb->len - trimlen);
+ ret = pskb_trim(skb, skb->len - trimlen);
+ if (unlikely(ret))
+ return ret;
ret = nexthdr[1];
@@ -534,7 +742,7 @@ int esp6_input_done2(struct sk_buff *skb, int err)
int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
int hdr_len = skb_network_header_len(skb);
- if (!xo || (xo && !(xo->flags & CRYPTO_DONE)))
+ if (!xo || !(xo->flags & CRYPTO_DONE))
kfree(ESP_SKB_CB(skb)->tmp);
if (unlikely(err))
@@ -544,10 +752,76 @@ int esp6_input_done2(struct sk_buff *skb, int err)
if (unlikely(err < 0))
goto out;
+ if (x->encap) {
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int offset = skb_network_offset(skb) + sizeof(*ip6h);
+ struct xfrm_encap_tmpl *encap = x->encap;
+ u8 nexthdr = ip6h->nexthdr;
+ __be16 frag_off, source;
+ struct udphdr *uh;
+ struct tcphdr *th;
+
+ offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);
+ if (offset == -1) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ uh = (void *)(skb->data + offset);
+ th = (void *)(skb->data + offset);
+ hdr_len += offset;
+
+ switch (x->encap->encap_type) {
+ case TCP_ENCAP_ESPINTCP:
+ source = th->source;
+ break;
+ case UDP_ENCAP_ESPINUDP:
+ source = uh->source;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * 1) if the NAT-T peer's IP or port changed then
+ * advertise the change to the keying daemon.
+ * This is an inbound SA, so just compare
+ * SRC ports.
+ */
+ if (!ipv6_addr_equal(&ip6h->saddr, &x->props.saddr.in6) ||
+ source != encap->encap_sport) {
+ xfrm_address_t ipaddr;
+
+ memcpy(&ipaddr.a6, &ip6h->saddr.s6_addr, sizeof(ipaddr.a6));
+ km_new_mapping(x, &ipaddr, source);
+
+ /* XXX: perhaps add an extra
+ * policy check here, to see
+ * if we should allow or
+ * reject a packet from a
+ * different source
+ * address/port.
+ */
+ }
+
+ /*
+ * 2) ignore UDP/TCP checksums in case
+ * of NAT-T in Transport Mode, or
+ * perform other post-processing fixes
+ * as per draft-ietf-ipsec-udp-encaps-06,
+ * section 3.1.2
+ */
+ if (x->props.mode == XFRM_MODE_TRANSPORT)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+
skb_postpull_rcsum(skb, skb_network_header(skb),
skb_network_header_len(skb));
skb_pull_rcsum(skb, hlen);
- if (x->props.mode == XFRM_MODE_TUNNEL)
+ if (x->props.mode == XFRM_MODE_TUNNEL ||
+ x->props.mode == XFRM_MODE_IPTFS)
skb_reset_transport_header(skb);
else
skb_set_transport_header(skb, -hdr_len);
@@ -561,9 +835,9 @@ out:
}
EXPORT_SYMBOL_GPL(esp6_input_done2);
-static void esp_input_done(struct crypto_async_request *base, int err)
+static void esp_input_done(void *data, int err)
{
- struct sk_buff *skb = base->data;
+ struct sk_buff *skb = data;
xfrm_input_resume(skb, esp6_input_done2(skb, err));
}
@@ -579,7 +853,7 @@ static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
struct xfrm_state *x = xfrm_input_state(skb);
/* For ESN we move the header forward by 4 bytes to
- * accomodate the high bits. We will move it back after
+ * accommodate the high bits. We will move it back after
* decryption.
*/
if ((x->props.flags & XFRM_STATE_ESN)) {
@@ -591,22 +865,21 @@ static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
}
}
-static void esp_input_done_esn(struct crypto_async_request *base, int err)
+static void esp_input_done_esn(void *data, int err)
{
- struct sk_buff *skb = base->data;
+ struct sk_buff *skb = data;
esp_input_restore_header(skb);
- esp_input_done(base, err);
+ esp_input_done(data, err);
}
static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
{
- struct ip_esp_hdr *esph;
struct crypto_aead *aead = x->data;
struct aead_request *req;
struct sk_buff *trailer;
int ivlen = crypto_aead_ivsize(aead);
- int elen = skb->len - sizeof(*esph) - ivlen;
+ int elen = skb->len - sizeof(struct ip_esp_hdr) - ivlen;
int nfrags;
int assoclen;
int seqhilen;
@@ -616,7 +889,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
u8 *iv;
struct scatterlist *sg;
- if (!pskb_may_pull(skb, sizeof(*esph) + ivlen)) {
+ if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + ivlen)) {
ret = -EINVAL;
goto out;
}
@@ -626,7 +899,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
goto out;
}
- assoclen = sizeof(*esph);
+ assoclen = sizeof(struct ip_esp_hdr);
seqhilen = 0;
if (x->props.flags & XFRM_STATE_ESN) {
@@ -660,7 +933,7 @@ skip_cow:
goto out;
ESP_SKB_CB(skb)->tmp = tmp;
- seqhi = esp_tmp_seqhi(tmp);
+ seqhi = esp_tmp_extra(tmp);
iv = esp_tmp_iv(aead, tmp, seqhilen);
req = esp_tmp_req(aead, iv);
sg = esp_req_sg(aead, req);
@@ -697,21 +970,6 @@ out:
return ret;
}
-static u32 esp6_get_mtu(struct xfrm_state *x, int mtu)
-{
- struct crypto_aead *aead = x->data;
- u32 blksize = ALIGN(crypto_aead_blocksize(aead), 4);
- unsigned int net_adj;
-
- if (x->props.mode != XFRM_MODE_TUNNEL)
- net_adj = sizeof(struct ipv6hdr);
- else
- net_adj = 0;
-
- return ((mtu - x->props.header_len - crypto_aead_authsize(aead) -
- net_adj) & ~(blksize - 1)) + net_adj - 2;
-}
-
static int esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
@@ -749,16 +1007,17 @@ static void esp6_destroy(struct xfrm_state *x)
crypto_free_aead(aead);
}
-static int esp_init_aead(struct xfrm_state *x)
+static int esp_init_aead(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
char aead_name[CRYPTO_MAX_ALG_NAME];
struct crypto_aead *aead;
int err;
- err = -ENAMETOOLONG;
if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)",
- x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME)
- goto error;
+ x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) {
+ NL_SET_ERR_MSG(extack, "Algorithm name is too long");
+ return -ENAMETOOLONG;
+ }
aead = crypto_alloc_aead(aead_name, 0, 0);
err = PTR_ERR(aead);
@@ -776,11 +1035,15 @@ static int esp_init_aead(struct xfrm_state *x)
if (err)
goto error;
+ return 0;
+
error:
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
return err;
}
-static int esp_init_authenc(struct xfrm_state *x)
+static int esp_init_authenc(struct xfrm_state *x,
+ struct netlink_ext_ack *extack)
{
struct crypto_aead *aead;
struct crypto_authenc_key_param *param;
@@ -791,10 +1054,6 @@ static int esp_init_authenc(struct xfrm_state *x)
unsigned int keylen;
int err;
- err = -EINVAL;
- if (!x->ealg)
- goto error;
-
err = -ENAMETOOLONG;
if ((x->props.flags & XFRM_STATE_ESN)) {
@@ -803,22 +1062,28 @@ static int esp_init_authenc(struct xfrm_state *x)
x->geniv ?: "", x->geniv ? "(" : "",
x->aalg ? x->aalg->alg_name : "digest_null",
x->ealg->alg_name,
- x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+ NL_SET_ERR_MSG(extack, "Algorithm name is too long");
goto error;
+ }
} else {
if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME,
"%s%sauthenc(%s,%s)%s",
x->geniv ?: "", x->geniv ? "(" : "",
x->aalg ? x->aalg->alg_name : "digest_null",
x->ealg->alg_name,
- x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME)
+ x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) {
+ NL_SET_ERR_MSG(extack, "Algorithm name is too long");
goto error;
+ }
}
aead = crypto_alloc_aead(authenc_name, 0, 0);
err = PTR_ERR(aead);
- if (IS_ERR(aead))
+ if (IS_ERR(aead)) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto error;
+ }
x->data = aead;
@@ -848,17 +1113,16 @@ static int esp_init_authenc(struct xfrm_state *x)
err = -EINVAL;
if (aalg_desc->uinfo.auth.icv_fullbits / 8 !=
crypto_aead_authsize(aead)) {
- pr_info("ESP: %s digestsize %u != %hu\n",
- x->aalg->alg_name,
- crypto_aead_authsize(aead),
- aalg_desc->uinfo.auth.icv_fullbits / 8);
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto free_key;
}
err = crypto_aead_setauthsize(
aead, x->aalg->alg_trunc_len / 8);
- if (err)
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations");
goto free_key;
+ }
}
param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8);
@@ -873,21 +1137,22 @@ error:
return err;
}
-static int esp6_init_state(struct xfrm_state *x)
+static int esp6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
struct crypto_aead *aead;
u32 align;
int err;
- if (x->encap)
- return -EINVAL;
-
x->data = NULL;
- if (x->aead)
- err = esp_init_aead(x);
- else
- err = esp_init_authenc(x);
+ if (x->aead) {
+ err = esp_init_aead(x, extack);
+ } else if (x->ealg) {
+ err = esp_init_authenc(x, extack);
+ } else {
+ NL_SET_ERR_MSG(extack, "ESP: AEAD or CRYPT must be provided");
+ err = -EINVAL;
+ }
if (err)
goto error;
@@ -910,6 +1175,28 @@ static int esp6_init_state(struct xfrm_state *x)
break;
}
+ if (x->encap) {
+ struct xfrm_encap_tmpl *encap = x->encap;
+
+ switch (encap->encap_type) {
+ default:
+ NL_SET_ERR_MSG(extack, "Unsupported encapsulation type for ESP");
+ err = -EINVAL;
+ goto error;
+ case UDP_ENCAP_ESPINUDP:
+ x->props.header_len += sizeof(struct udphdr);
+ break;
+#ifdef CONFIG_INET6_ESPINTCP
+ case TCP_ENCAP_ESPINTCP:
+ /* only the length field, TCP encap is done by
+ * the socket
+ */
+ x->props.header_len += 2;
+ break;
+#endif
+ }
+ }
+
align = ALIGN(crypto_aead_blocksize(aead), 4);
x->props.trailer_len = align + 1 + crypto_aead_authsize(aead);
@@ -923,20 +1210,18 @@ static int esp6_rcv_cb(struct sk_buff *skb, int err)
}
static const struct xfrm_type esp6_type = {
- .description = "ESP6",
.owner = THIS_MODULE,
.proto = IPPROTO_ESP,
.flags = XFRM_TYPE_REPLAY_PROT,
.init_state = esp6_init_state,
.destructor = esp6_destroy,
- .get_mtu = esp6_get_mtu,
.input = esp6_input,
.output = esp6_output,
- .hdr_offset = xfrm6_find_1stfragopt,
};
static struct xfrm6_protocol esp6_protocol = {
.handler = xfrm6_rcv,
+ .input_handler = xfrm_input,
.cb_handler = esp6_rcv_cb,
.err_handler = esp6_err,
.priority = 0,
@@ -961,12 +1246,12 @@ static void __exit esp6_fini(void)
{
if (xfrm6_protocol_deregister(&esp6_protocol, IPPROTO_ESP) < 0)
pr_info("%s: can't remove protocol\n", __func__);
- if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
+ xfrm_unregister_type(&esp6_type, AF_INET6);
}
module_init(esp6_init);
module_exit(esp6_fini);
+MODULE_DESCRIPTION("IPv6 ESP transformation helpers");
MODULE_LICENSE("GPL");
MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ESP);
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index 6177e2171171..22410243ebe8 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IPV6 GSO/GRO offload support
* Linux INET implementation
@@ -5,10 +6,6 @@
* Copyright (C) 2016 secunet Security Networks AG
* Author: Steffen Klassert <steffen.klassert@secunet.com>
*
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
* ESP GRO support
*/
@@ -19,6 +16,8 @@
#include <crypto/authenc.h>
#include <linux/err.h>
#include <linux/module.h>
+#include <net/gro.h>
+#include <net/gso.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/esp.h>
@@ -35,7 +34,9 @@ static __u16 esp6_nexthdr_esp_offset(struct ipv6hdr *ipv6_hdr, int nhlen)
int off = sizeof(struct ipv6hdr);
struct ipv6_opt_hdr *exthdr;
- if (likely(ipv6_hdr->nexthdr == NEXTHDR_ESP))
+ /* ESP or ESPINUDP */
+ if (likely(ipv6_hdr->nexthdr == NEXTHDR_ESP ||
+ ipv6_hdr->nexthdr == NEXTHDR_UDP))
return offsetof(struct ipv6hdr, nexthdr);
while (off < nhlen) {
@@ -55,40 +56,51 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head,
int offset = skb_gro_offset(skb);
struct xfrm_offload *xo;
struct xfrm_state *x;
+ int encap_type = 0;
__be32 seq;
__be32 spi;
int nhoff;
- int err;
+
+ if (NAPI_GRO_CB(skb)->proto == IPPROTO_UDP)
+ encap_type = UDP_ENCAP_ESPINUDP;
if (!pskb_pull(skb, offset))
return NULL;
- if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0)
+ if (xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq) != 0)
goto out;
xo = xfrm_offload(skb);
if (!xo || !(xo->flags & CRYPTO_DONE)) {
- err = secpath_set(skb);
- if (err)
- goto out;
+ struct sec_path *sp = secpath_set(skb);
- if (skb->sp->len == XFRM_MAX_DEPTH)
+ if (!sp)
goto out;
- x = xfrm_state_lookup(dev_net(skb->dev), skb->mark,
- (xfrm_address_t *)&ipv6_hdr(skb)->daddr,
- spi, IPPROTO_ESP, AF_INET6);
- if (!x)
- goto out;
+ if (sp->len == XFRM_MAX_DEPTH)
+ goto out_reset;
- skb->sp->xvec[skb->sp->len++] = x;
- skb->sp->olen++;
+ x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark,
+ (xfrm_address_t *)&ipv6_hdr(skb)->daddr,
+ spi, IPPROTO_ESP, AF_INET6);
- xo = xfrm_offload(skb);
- if (!xo) {
+ if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) {
+ /* non-offload path will record the error and audit log */
xfrm_state_put(x);
- goto out;
+ x = NULL;
}
+
+ if (!x)
+ goto out_reset;
+
+ skb->mark = xfrm_smark_get(skb->mark, x);
+
+ sp->xvec[sp->len++] = x;
+ sp->olen++;
+
+ xo = xfrm_offload(skb);
+ if (!xo)
+ goto out_reset;
}
xo->flags |= XFRM_GRO;
@@ -105,9 +117,11 @@ static struct sk_buff *esp6_gro_receive(struct list_head *head,
/* We don't need to handle errors from xfrm_input, it does all
* the error handling and frees the resources on error. */
- xfrm_input(skb, IPPROTO_ESP, spi, -2);
+ xfrm_input(skb, IPPROTO_ESP, spi, encap_type);
return ERR_PTR(-EINPROGRESS);
+out_reset:
+ secpath_reset(skb);
out:
skb_push(skb, offset);
NAPI_GRO_CB(skb)->same_flow = 0;
@@ -121,9 +135,16 @@ static void esp6_gso_encap(struct xfrm_state *x, struct sk_buff *skb)
struct ip_esp_hdr *esph;
struct ipv6hdr *iph = ipv6_hdr(skb);
struct xfrm_offload *xo = xfrm_offload(skb);
- int proto = iph->nexthdr;
+ u8 proto = iph->nexthdr;
skb_push(skb, -skb_network_offset(skb));
+
+ if (x->outer_mode.encap == XFRM_MODE_TRANSPORT) {
+ __be16 frag;
+
+ ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto, &frag);
+ }
+
esph = ip_esp_hdr(skb);
*skb_mac_header(skb) = IPPROTO_ESP;
@@ -133,6 +154,95 @@ static void esp6_gso_encap(struct xfrm_state *x, struct sk_buff *skb)
xo->proto = proto;
}
+static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x,
+ XFRM_MODE_SKB_CB(skb)->protocol);
+ __be16 type = inner_mode->family == AF_INET ? htons(ETH_P_IP)
+ : htons(ETH_P_IPV6);
+
+ return skb_eth_gso_segment(skb, features, type);
+}
+
+static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ const struct net_offload *ops;
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct xfrm_offload *xo = xfrm_offload(skb);
+
+ skb->transport_header += x->props.header_len;
+ ops = rcu_dereference(inet6_offloads[xo->proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ return segs;
+}
+
+static struct sk_buff *xfrm6_beet_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sk_buff *segs = ERR_PTR(-EINVAL);
+ const struct net_offload *ops;
+ u8 proto = xo->proto;
+
+ skb->transport_header += x->props.header_len;
+
+ if (x->sel.family != AF_INET6) {
+ skb->transport_header -=
+ (sizeof(struct ipv6hdr) - sizeof(struct iphdr));
+
+ if (proto == IPPROTO_BEETPH) {
+ struct ip_beet_phdr *ph =
+ (struct ip_beet_phdr *)skb->data;
+
+ skb->transport_header += ph->hdrlen * 8;
+ proto = ph->nexthdr;
+ } else {
+ skb->transport_header -= IPV4_BEET_PHMAXLEN;
+ }
+
+ if (proto == IPPROTO_TCP)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
+ } else {
+ __be16 frag;
+
+ skb->transport_header +=
+ ipv6_skip_exthdr(skb, 0, &proto, &frag);
+ }
+
+ if (proto == IPPROTO_IPIP)
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP6;
+
+ __skb_pull(skb, skb_transport_offset(skb));
+ ops = rcu_dereference(inet6_offloads[proto]);
+ if (likely(ops && ops->callbacks.gso_segment))
+ segs = ops->callbacks.gso_segment(skb, features);
+
+ return segs;
+}
+
+static struct sk_buff *xfrm6_outer_mode_gso_segment(struct xfrm_state *x,
+ struct sk_buff *skb,
+ netdev_features_t features)
+{
+ switch (x->outer_mode.encap) {
+ case XFRM_MODE_TUNNEL:
+ return xfrm6_tunnel_gso_segment(x, skb, features);
+ case XFRM_MODE_TRANSPORT:
+ return xfrm6_transport_gso_segment(x, skb, features);
+ case XFRM_MODE_BEET:
+ return xfrm6_beet_gso_segment(x, skb, features);
+ }
+
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
static struct sk_buff *esp6_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
@@ -141,6 +251,7 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb,
struct crypto_aead *aead;
netdev_features_t esp_features = features;
struct xfrm_offload *xo = xfrm_offload(skb);
+ struct sec_path *sp;
if (!xo)
return ERR_PTR(-EINVAL);
@@ -148,7 +259,8 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb,
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP))
return ERR_PTR(-EINVAL);
- x = skb->sp->xvec[skb->sp->len - 1];
+ sp = skb_sec_path(skb);
+ x = sp->xvec[sp->len - 1];
aead = x->data;
esph = ip_esp_hdr(skb);
@@ -163,13 +275,15 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb,
skb->encap_hdr_csum = 1;
if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev)
- esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
+ esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK |
+ NETIF_F_SCTP_CRC);
else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
- esp_features = features & ~NETIF_F_CSUM_MASK;
+ esp_features = features & ~(NETIF_F_CSUM_MASK |
+ NETIF_F_SCTP_CRC);
xo->flags |= XFRM_GSO_SEGMENT;
- return x->outer_mode->gso_segment(x, skb, esp_features);
+ return xfrm6_outer_mode_gso_segment(x, skb, esp_features);
}
static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb)
@@ -193,7 +307,6 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features
int alen;
int blksize;
struct xfrm_offload *xo;
- struct ip_esp_hdr *esph;
struct crypto_aead *aead;
struct esp_info esp;
bool hw_offload = true;
@@ -226,7 +339,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features
esp.plen = esp.clen - skb->len - esp.tfclen;
esp.tailen = esp.tfclen + esp.plen + alen;
- if (!hw_offload || (hw_offload && !skb_is_gso(skb))) {
+ if (!hw_offload || !skb_is_gso(skb)) {
esp.nfrags = esp6_output_head(x, skb, &esp);
if (esp.nfrags < 0)
return esp.nfrags;
@@ -234,13 +347,13 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features
seq = xo->seq.low;
- esph = ip_esp_hdr(skb);
- esph->spi = x->id.spi;
+ esp.esph = ip_esp_hdr(skb);
+ esp.esph->spi = x->id.spi;
skb_push(skb, -skb_network_offset(skb));
if (xo->flags & XFRM_GSO_SEGMENT) {
- esph->seq_no = htonl(seq);
+ esp.esph->seq_no = htonl(seq);
if (!skb_is_gso(skb))
xo->seq.low++;
@@ -248,6 +361,9 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features
xo->seq.low += skb_shinfo(skb)->gso_segs;
}
+ if (xo->seq.low < seq)
+ xo->seq.hi++;
+
esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32));
len = skb->len - sizeof(struct ipv6hdr);
@@ -256,8 +372,17 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features
ipv6_hdr(skb)->payload_len = htons(len);
- if (hw_offload)
+ if (hw_offload) {
+ if (!skb_ext_add(skb, SKB_EXT_SEC_PATH))
+ return -ENOMEM;
+
+ xo = xfrm_offload(skb);
+ if (!xo)
+ return -EINVAL;
+
+ xo->flags |= XFRM_XMIT;
return 0;
+ }
err = esp6_output_tail(x, skb, &esp);
if (err)
@@ -265,6 +390,9 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features
secpath_reset(skb);
+ if (skb_needs_linearize(skb, skb->dev->features) &&
+ __skb_linearize(skb))
+ return -ENOMEM;
return 0;
}
@@ -276,7 +404,6 @@ static const struct net_offload esp6_offload = {
};
static const struct xfrm_type_offload esp6_type_offload = {
- .description = "ESP6 OFFLOAD",
.owner = THIS_MODULE,
.proto = IPPROTO_ESP,
.input_tail = esp6_input_tail,
@@ -296,9 +423,7 @@ static int __init esp6_offload_init(void)
static void __exit esp6_offload_exit(void)
{
- if (xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type offload\n", __func__);
-
+ xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6);
inet6_del_offload(&esp6_offload, IPPROTO_ESP);
}
@@ -307,3 +432,4 @@ module_exit(esp6_offload_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET6, XFRM_PROTO_ESP);
+MODULE_DESCRIPTION("IPV6 GSO/GRO offload support");
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 20291c2036fc..a23eb8734e15 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Extension Header handling for IPv6
* Linux INET6 implementation
@@ -6,11 +7,6 @@
* Pedro Roque <roque@di.fc.ul.pt>
* Andi Kleen <ak@muc.de>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/* Changes:
@@ -52,22 +48,14 @@
#ifdef CONFIG_IPV6_SEG6_HMAC
#include <net/seg6_hmac.h>
#endif
+#include <net/rpl.h>
+#include <linux/ioam6.h>
+#include <linux/ioam6_genl.h>
+#include <net/ioam6.h>
+#include <net/dst_metadata.h>
#include <linux/uaccess.h>
-/*
- * Parsing tlv encoded headers.
- *
- * Parsing function "func" returns true, if parsing succeed
- * and false, if it failed.
- * It MUST NOT touch skb->h.
- */
-
-struct tlvtype_proc {
- int type;
- bool (*func)(struct sk_buff *skb, int offset);
-};
-
/*********************
Generic functions
*********************/
@@ -101,27 +89,35 @@ static bool ip6_tlvopt_unknown(struct sk_buff *skb, int optoff,
*/
if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr))
break;
- /* fall through */
+ fallthrough;
case 2: /* send ICMP PARM PROB regardless and drop packet */
- icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff);
+ icmpv6_param_prob_reason(skb, ICMPV6_UNK_OPTION, optoff,
+ SKB_DROP_REASON_UNHANDLED_PROTO);
return false;
}
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO);
return false;
}
+static bool ipv6_hop_ra(struct sk_buff *skb, int optoff);
+static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff);
+static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff);
+static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff);
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+static bool ipv6_dest_hao(struct sk_buff *skb, int optoff);
+#endif
+
/* Parse tlv encoded option header (hop-by-hop or destination) */
-static bool ip6_parse_tlv(const struct tlvtype_proc *procs,
+static bool ip6_parse_tlv(bool hopbyhop,
struct sk_buff *skb,
int max_count)
{
int len = (skb_transport_header(skb)[1] + 1) << 3;
const unsigned char *nh = skb_network_header(skb);
int off = skb_network_header_len(skb);
- const struct tlvtype_proc *curr;
bool disallow_unknowns = false;
int tlv_count = 0;
int padlen = 0;
@@ -131,25 +127,27 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs,
max_count = -max_count;
}
- if (skb_transport_offset(skb) + len > skb_headlen(skb))
- goto bad;
-
off += 2;
len -= 2;
while (len > 0) {
- int optlen = nh[off + 1] + 2;
- int i;
+ int optlen, i;
- switch (nh[off]) {
- case IPV6_TLV_PAD1:
- optlen = 1;
+ if (nh[off] == IPV6_TLV_PAD1) {
padlen++;
if (padlen > 7)
goto bad;
- break;
+ off++;
+ len--;
+ continue;
+ }
+ if (len < 2)
+ goto bad;
+ optlen = nh[off + 1] + 2;
+ if (optlen > len)
+ goto bad;
- case IPV6_TLV_PADN:
+ if (nh[off] == IPV6_TLV_PADN) {
/* RFC 2460 states that the purpose of PadN is
* to align the containing header to multiples
* of 8. 7 is therefore the highest valid value.
@@ -166,32 +164,53 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs,
if (nh[off + i] != 0)
goto bad;
}
- break;
-
- default: /* Other TLV code so scan list */
- if (optlen > len)
- goto bad;
-
+ } else {
tlv_count++;
if (tlv_count > max_count)
goto bad;
- for (curr = procs; curr->type >= 0; curr++) {
- if (curr->type == nh[off]) {
- /* type specific length/alignment
- checks will be performed in the
- func(). */
- if (curr->func(skb, off) == false)
+ if (hopbyhop) {
+ switch (nh[off]) {
+ case IPV6_TLV_ROUTERALERT:
+ if (!ipv6_hop_ra(skb, off))
+ return false;
+ break;
+ case IPV6_TLV_IOAM:
+ if (!ipv6_hop_ioam(skb, off))
+ return false;
+
+ nh = skb_network_header(skb);
+ break;
+ case IPV6_TLV_JUMBO:
+ if (!ipv6_hop_jumbo(skb, off))
+ return false;
+ break;
+ case IPV6_TLV_CALIPSO:
+ if (!ipv6_hop_calipso(skb, off))
+ return false;
+ break;
+ default:
+ if (!ip6_tlvopt_unknown(skb, off,
+ disallow_unknowns))
+ return false;
+ break;
+ }
+ } else {
+ switch (nh[off]) {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case IPV6_TLV_HAO:
+ if (!ipv6_dest_hao(skb, off))
+ return false;
+ break;
+#endif
+ default:
+ if (!ip6_tlvopt_unknown(skb, off,
+ disallow_unknowns))
return false;
break;
}
}
- if (curr->type < 0 &&
- !ip6_tlvopt_unknown(skb, off, disallow_unknowns))
- return false;
-
padlen = 0;
- break;
}
off += optlen;
len -= optlen;
@@ -200,7 +219,7 @@ static bool ip6_parse_tlv(const struct tlvtype_proc *procs,
if (len == 0)
return true;
bad:
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
return false;
}
@@ -214,6 +233,7 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff)
struct ipv6_destopt_hao *hao;
struct inet6_skb_parm *opt = IP6CB(skb);
struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ SKB_DR(reason);
int ret;
if (opt->dsthao) {
@@ -228,19 +248,23 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff)
if (hao->length != 16) {
net_dbg_ratelimited("hao invalid option length = %d\n",
hao->length);
+ SKB_DR_SET(reason, IP_INHDR);
goto discard;
}
if (!(ipv6_addr_type(&hao->addr) & IPV6_ADDR_UNICAST)) {
net_dbg_ratelimited("hao is not an unicast addr: %pI6\n",
&hao->addr);
+ SKB_DR_SET(reason, INVALID_PROTO);
goto discard;
}
ret = xfrm6_input_addr(skb, (xfrm_address_t *)&ipv6h->daddr,
(xfrm_address_t *)&hao->addr, IPPROTO_DSTOPTS);
- if (unlikely(ret < 0))
+ if (unlikely(ret < 0)) {
+ SKB_DR_SET(reason, XFRM_POLICY);
goto discard;
+ }
if (skb_cloned(skb)) {
if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
@@ -263,21 +287,11 @@ static bool ipv6_dest_hao(struct sk_buff *skb, int optoff)
return true;
discard:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return false;
}
#endif
-static const struct tlvtype_proc tlvprocdestopt_lst[] = {
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- {
- .type = IPV6_TLV_HAO,
- .func = ipv6_dest_hao,
- },
-#endif
- {-1, NULL}
-};
-
static int ipv6_destopt_rcv(struct sk_buff *skb)
{
struct inet6_dev *idev = __in6_dev_get(skb->dev);
@@ -292,7 +306,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb)
if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
!pskb_may_pull(skb, (skb_transport_offset(skb) +
((skb_transport_header(skb)[1] + 1) << 3)))) {
- __IP6_INC_STATS(dev_net(dst->dev), idev,
+ __IP6_INC_STATS(dev_net(dst_dev(dst)), idev,
IPSTATS_MIB_INHDRERRORS);
fail_and_free:
kfree_skb(skb);
@@ -308,8 +322,7 @@ fail_and_free:
dstbuf = opt->dst1;
#endif
- if (ip6_parse_tlv(tlvprocdestopt_lst, skb,
- init_net.ipv6.sysctl.max_dst_opts_cnt)) {
+ if (ip6_parse_tlv(false, skb, net->ipv6.sysctl.max_dst_opts_cnt)) {
skb->transport_header += extlen;
opt = IP6CB(skb);
#if IS_ENABLED(CONFIG_IPV6_MIP6)
@@ -366,9 +379,8 @@ static int ipv6_srh_rcv(struct sk_buff *skb)
idev = __in6_dev_get(skb->dev);
- accept_seg6 = net->ipv6.devconf_all->seg6_enabled;
- if (accept_seg6 > idev->cnf.seg6_enabled)
- accept_seg6 = idev->cnf.seg6_enabled;
+ accept_seg6 = min(READ_ONCE(net->ipv6.devconf_all->seg6_enabled),
+ READ_ONCE(idev->cnf.seg6_enabled));
if (!accept_seg6) {
kfree_skb(skb);
@@ -384,23 +396,20 @@ static int ipv6_srh_rcv(struct sk_buff *skb)
looped_back:
if (hdr->segments_left == 0) {
- if (hdr->nexthdr == NEXTHDR_IPV6) {
+ if (hdr->nexthdr == NEXTHDR_IPV6 || hdr->nexthdr == NEXTHDR_IPV4) {
int offset = (hdr->hdrlen + 1) << 3;
skb_postpull_rcsum(skb, skb_network_header(skb),
skb_network_header_len(skb));
-
- if (!pskb_pull(skb, offset)) {
- kfree_skb(skb);
- return -1;
- }
+ skb_pull(skb, offset);
skb_postpull_rcsum(skb, skb_transport_header(skb),
offset);
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
skb->encapsulation = 0;
-
+ if (hdr->nexthdr == NEXTHDR_IPV4)
+ skb->protocol = htons(ETH_P_IP);
__skb_tunnel_rx(skb, skb->dev, net);
netif_rx(skb);
@@ -430,9 +439,9 @@ looped_back:
kfree_skb(skb);
return -1;
}
- }
- hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+ hdr = (struct ipv6_sr_hdr *)skb_transport_header(skb);
+ }
hdr->segments_left--;
addr = hdr->segments + hdr->segments_left;
@@ -444,7 +453,164 @@ looped_back:
ipv6_hdr(skb)->daddr = *addr;
- skb_dst_drop(skb);
+ ip6_route_input(skb);
+
+ if (skb_dst(skb)->error) {
+ dst_input(skb);
+ return -1;
+ }
+
+ if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) {
+ if (ipv6_hdr(skb)->hop_limit <= 1) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ icmpv6_send(skb, ICMPV6_TIME_EXCEED,
+ ICMPV6_EXC_HOPLIMIT, 0);
+ kfree_skb(skb);
+ return -1;
+ }
+ ipv6_hdr(skb)->hop_limit--;
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ goto looped_back;
+ }
+
+ dst_input(skb);
+
+ return -1;
+}
+
+static int ipv6_rpl_srh_rcv(struct sk_buff *skb)
+{
+ struct ipv6_rpl_sr_hdr *hdr, *ohdr, *chdr;
+ struct inet6_skb_parm *opt = IP6CB(skb);
+ struct net *net = dev_net(skb->dev);
+ struct inet6_dev *idev;
+ struct ipv6hdr *oldhdr;
+ unsigned char *buf;
+ int accept_rpl_seg;
+ int i, err;
+ u64 n = 0;
+ u32 r;
+
+ idev = __in6_dev_get(skb->dev);
+
+ accept_rpl_seg = min(READ_ONCE(net->ipv6.devconf_all->rpl_seg_enabled),
+ READ_ONCE(idev->cnf.rpl_seg_enabled));
+ if (!accept_rpl_seg) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+looped_back:
+ hdr = (struct ipv6_rpl_sr_hdr *)skb_transport_header(skb);
+
+ if (hdr->segments_left == 0) {
+ if (hdr->nexthdr == NEXTHDR_IPV6) {
+ int offset = (hdr->hdrlen + 1) << 3;
+
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ skb_network_header_len(skb));
+ skb_pull(skb, offset);
+ skb_postpull_rcsum(skb, skb_transport_header(skb),
+ offset);
+
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
+ skb->encapsulation = 0;
+
+ __skb_tunnel_rx(skb, skb->dev, net);
+
+ netif_rx(skb);
+ return -1;
+ }
+
+ opt->srcrt = skb_network_header_len(skb);
+ opt->lastopt = opt->srcrt;
+ skb->transport_header += (hdr->hdrlen + 1) << 3;
+ opt->nhoff = (&hdr->nexthdr) - skb_network_header(skb);
+
+ return 1;
+ }
+
+ n = (hdr->hdrlen << 3) - hdr->pad - (16 - hdr->cmpre);
+ r = do_div(n, (16 - hdr->cmpri));
+ /* checks if calculation was without remainder and n fits into
+ * unsigned char which is segments_left field. Should not be
+ * higher than that.
+ */
+ if (r || (n + 1) > 255) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ if (hdr->segments_left > n + 1) {
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD,
+ ((&hdr->segments_left) -
+ skb_network_header(skb)));
+ return -1;
+ }
+
+ hdr->segments_left--;
+ i = n - hdr->segments_left;
+
+ buf = kcalloc(struct_size(hdr, segments.addr, n + 2), 2, GFP_ATOMIC);
+ if (unlikely(!buf)) {
+ kfree_skb(skb);
+ return -1;
+ }
+
+ ohdr = (struct ipv6_rpl_sr_hdr *)buf;
+ ipv6_rpl_srh_decompress(ohdr, hdr, &ipv6_hdr(skb)->daddr, n);
+ chdr = (struct ipv6_rpl_sr_hdr *)(buf + ((ohdr->hdrlen + 1) << 3));
+
+ if (ipv6_addr_is_multicast(&ohdr->rpl_segaddr[i])) {
+ kfree_skb(skb);
+ kfree(buf);
+ return -1;
+ }
+
+ err = ipv6_chk_rpl_srh_loop(net, ohdr->rpl_segaddr, n + 1);
+ if (err) {
+ icmpv6_send(skb, ICMPV6_PARAMPROB, 0, 0);
+ kfree_skb(skb);
+ kfree(buf);
+ return -1;
+ }
+
+ swap(ipv6_hdr(skb)->daddr, ohdr->rpl_segaddr[i]);
+
+ ipv6_rpl_srh_compress(chdr, ohdr, &ipv6_hdr(skb)->daddr, n);
+
+ oldhdr = ipv6_hdr(skb);
+
+ skb_pull(skb, ((hdr->hdrlen + 1) << 3));
+ skb_postpull_rcsum(skb, oldhdr,
+ sizeof(struct ipv6hdr) + ((hdr->hdrlen + 1) << 3));
+ if (unlikely(!hdr->segments_left)) {
+ if (pskb_expand_head(skb, sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3), 0,
+ GFP_ATOMIC)) {
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTDISCARDS);
+ kfree_skb(skb);
+ kfree(buf);
+ return -1;
+ }
+
+ oldhdr = ipv6_hdr(skb);
+ }
+ skb_push(skb, ((chdr->hdrlen + 1) << 3) + sizeof(struct ipv6hdr));
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ memmove(ipv6_hdr(skb), oldhdr, sizeof(struct ipv6hdr));
+ memcpy(skb_transport_header(skb), chdr, (chdr->hdrlen + 1) << 3);
+
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+ skb_postpush_rcsum(skb, ipv6_hdr(skb),
+ sizeof(struct ipv6hdr) + ((chdr->hdrlen + 1) << 3));
+
+ kfree(buf);
ip6_route_input(skb);
@@ -453,7 +619,7 @@ looped_back:
return -1;
}
- if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) {
+ if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) {
if (ipv6_hdr(skb)->hop_limit <= 1) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_send(skb, ICMPV6_TIME_EXCEED,
@@ -482,16 +648,17 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
struct inet6_dev *idev = __in6_dev_get(skb->dev);
struct inet6_skb_parm *opt = IP6CB(skb);
struct in6_addr *addr = NULL;
- struct in6_addr daddr;
int n, i;
struct ipv6_rt_hdr *hdr;
struct rt0_hdr *rthdr;
struct net *net = dev_net(skb->dev);
- int accept_source_route = net->ipv6.devconf_all->accept_source_route;
+ int accept_source_route;
- idev = __in6_dev_get(skb->dev);
- if (idev && accept_source_route > idev->cnf.accept_source_route)
- accept_source_route = idev->cnf.accept_source_route;
+ accept_source_route = READ_ONCE(net->ipv6.devconf_all->accept_source_route);
+
+ if (idev)
+ accept_source_route = min(accept_source_route,
+ READ_ONCE(idev->cnf.accept_source_route));
if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) ||
!pskb_may_pull(skb, (skb_transport_offset(skb) +
@@ -510,9 +677,16 @@ static int ipv6_rthdr_rcv(struct sk_buff *skb)
return -1;
}
- /* segment routing */
- if (hdr->type == IPV6_SRCRT_TYPE_4)
+ switch (hdr->type) {
+ case IPV6_SRCRT_TYPE_4:
+ /* segment routing */
return ipv6_srh_rcv(skb);
+ case IPV6_SRCRT_TYPE_3:
+ /* rpl segment routing */
+ return ipv6_rpl_srh_rcv(skb);
+ default:
+ break;
+ }
looped_back:
if (hdr->segments_left == 0) {
@@ -607,7 +781,7 @@ looped_back:
kfree_skb(skb);
return -1;
}
- if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) {
+ if (!ipv6_chk_home_addr(skb_dst_dev_net(skb), addr)) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
kfree_skb(skb);
return -1;
@@ -624,19 +798,16 @@ looped_back:
return -1;
}
- daddr = *addr;
- *addr = ipv6_hdr(skb)->daddr;
- ipv6_hdr(skb)->daddr = daddr;
+ swap(*addr, ipv6_hdr(skb)->daddr);
- skb_dst_drop(skb);
ip6_route_input(skb);
if (skb_dst(skb)->error) {
- skb_push(skb, skb->data - skb_network_header(skb));
+ skb_push(skb, -skb_network_offset(skb));
dst_input(skb);
return -1;
}
- if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) {
+ if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) {
if (ipv6_hdr(skb)->hop_limit <= 1) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
@@ -648,7 +819,7 @@ looped_back:
goto looped_back;
}
- skb_push(skb, skb->data - skb_network_header(skb));
+ skb_push(skb, -skb_network_offset(skb));
dst_input(skb);
return -1;
@@ -710,19 +881,6 @@ void ipv6_exthdrs_exit(void)
Hop-by-hop options.
**********************************/
-/*
- * Note: we cannot rely on skb_dst(skb) before we assign it in ip6_route_input().
- */
-static inline struct inet6_dev *ipv6_skb_idev(struct sk_buff *skb)
-{
- return skb_dst(skb) ? ip6_dst_idev(skb_dst(skb)) : __in6_dev_get(skb->dev);
-}
-
-static inline struct net *ipv6_skb_net(struct sk_buff *skb)
-{
- return skb_dst(skb) ? dev_net(skb_dst(skb)->dev) : dev_net(skb->dev);
-}
-
/* Router Alert as of RFC 2711 */
static bool ipv6_hop_ra(struct sk_buff *skb, int optoff)
@@ -736,7 +894,72 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff)
}
net_dbg_ratelimited("ipv6_hop_ra: wrong RA length %d\n",
nh[optoff + 1]);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
+ return false;
+}
+
+/* IOAM */
+
+static bool ipv6_hop_ioam(struct sk_buff *skb, int optoff)
+{
+ struct ioam6_trace_hdr *trace;
+ struct ioam6_namespace *ns;
+ struct ioam6_hdr *hdr;
+
+ /* Bad alignment (must be 4n-aligned) */
+ if (optoff & 3)
+ goto drop;
+
+ /* Ignore if IOAM is not enabled on ingress */
+ if (!READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_enabled))
+ goto ignore;
+
+ /* Truncated Option header */
+ hdr = (struct ioam6_hdr *)(skb_network_header(skb) + optoff);
+ if (hdr->opt_len < 2)
+ goto drop;
+
+ switch (hdr->type) {
+ case IOAM6_TYPE_PREALLOC:
+ /* Truncated Pre-allocated Trace header */
+ if (hdr->opt_len < 2 + sizeof(*trace))
+ goto drop;
+
+ /* Malformed Pre-allocated Trace header */
+ trace = (struct ioam6_trace_hdr *)((u8 *)hdr + sizeof(*hdr));
+ if (hdr->opt_len < 2 + sizeof(*trace) + trace->remlen * 4)
+ goto drop;
+
+ /* Ignore if the IOAM namespace is unknown */
+ ns = ioam6_namespace(dev_net(skb->dev), trace->namespace_id);
+ if (!ns)
+ goto ignore;
+
+ if (!skb_valid_dst(skb))
+ ip6_route_input(skb);
+
+ /* About to mangle packet header */
+ if (skb_ensure_writable(skb, optoff + 2 + hdr->opt_len))
+ goto drop;
+
+ /* Trace pointer may have changed */
+ trace = (struct ioam6_trace_hdr *)(skb_network_header(skb)
+ + optoff + sizeof(*hdr));
+
+ ioam6_fill_trace_data(skb, ns, trace, true);
+
+ ioam6_event(IOAM6_EVENT_TRACE, dev_net(skb->dev),
+ GFP_ATOMIC, (void *)trace, hdr->opt_len - 2);
+ break;
+ default:
+ break;
+ }
+
+ignore:
+ return true;
+
+drop:
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
return false;
}
@@ -745,31 +968,30 @@ static bool ipv6_hop_ra(struct sk_buff *skb, int optoff)
static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
{
const unsigned char *nh = skb_network_header(skb);
- struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
- struct net *net = ipv6_skb_net(skb);
+ SKB_DR(reason);
u32 pkt_len;
if (nh[optoff + 1] != 4 || (optoff & 3) != 2) {
net_dbg_ratelimited("ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n",
nh[optoff+1]);
- __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ SKB_DR_SET(reason, IP_INHDR);
goto drop;
}
pkt_len = ntohl(*(__be32 *)(nh + optoff + 2));
if (pkt_len <= IPV6_MAXPLEN) {
- __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2);
+ icmpv6_param_prob_reason(skb, ICMPV6_HDR_FIELD, optoff + 2,
+ SKB_DROP_REASON_IP_INHDR);
return false;
}
if (ipv6_hdr(skb)->payload_len) {
- __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
- icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff);
+ icmpv6_param_prob_reason(skb, ICMPV6_HDR_FIELD, optoff,
+ SKB_DROP_REASON_IP_INHDR);
return false;
}
if (pkt_len > skb->len - sizeof(struct ipv6hdr)) {
- __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTRUNCATEDPKTS);
+ SKB_DR_SET(reason, PKT_TOO_SMALL);
goto drop;
}
@@ -780,7 +1002,7 @@ static bool ipv6_hop_jumbo(struct sk_buff *skb, int optoff)
return true;
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return false;
}
@@ -802,26 +1024,10 @@ static bool ipv6_hop_calipso(struct sk_buff *skb, int optoff)
return true;
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
return false;
}
-static const struct tlvtype_proc tlvprochopopt_lst[] = {
- {
- .type = IPV6_TLV_ROUTERALERT,
- .func = ipv6_hop_ra,
- },
- {
- .type = IPV6_TLV_JUMBO,
- .func = ipv6_hop_jumbo,
- },
- {
- .type = IPV6_TLV_CALIPSO,
- .func = ipv6_hop_calipso,
- },
- { -1, }
-};
-
int ipv6_parse_hopopts(struct sk_buff *skb)
{
struct inet6_skb_parm *opt = IP6CB(skb);
@@ -847,8 +1053,7 @@ fail_and_free:
goto fail_and_free;
opt->flags |= IP6SKB_HOPBYHOP;
- if (ip6_parse_tlv(tlvprochopopt_lst, skb,
- init_net.ipv6.sysctl.max_hbh_opts_cnt)) {
+ if (ip6_parse_tlv(true, skb, net->ipv6.sysctl.max_hbh_opts_cnt)) {
skb->transport_header += extlen;
opt = IP6CB(skb);
opt->nhoff = sizeof(struct ipv6hdr);
@@ -997,10 +1202,9 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
{
struct ipv6_txoptions *opt2;
- opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC);
+ opt2 = sock_kmemdup(sk, opt, opt->tot_len, GFP_ATOMIC);
if (opt2) {
long dif = (char *)opt2 - (char *)opt;
- memcpy(opt2, opt, opt->tot_len);
if (opt2->hopopt)
*((char **)&opt2->hopopt) += dif;
if (opt2->dst0opt)
@@ -1039,7 +1243,6 @@ static void ipv6_renew_option(int renewtype,
* @opt: original options
* @newtype: option type to replace in @opt
* @newopt: new option of type @newtype to replace (user-mem)
- * @newoptlen: length of @newopt
*
* Returns a new set of options which is a copy of @opt with the
* option type @newtype replaced with @newopt.
@@ -1110,14 +1313,14 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
return opt2;
}
-struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
- struct ipv6_txoptions *opt)
+struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space,
+ struct ipv6_txoptions *opt)
{
/*
* ignore the dest before srcrt unless srcrt is being included.
* --yoshfuji
*/
- if (opt && opt->dst0opt && !opt->srcrt) {
+ if (opt->dst0opt && !opt->srcrt) {
if (opt_space != opt) {
memcpy(opt_space, opt, sizeof(*opt_space));
opt = opt_space;
@@ -1128,7 +1331,7 @@ struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
return opt;
}
-EXPORT_SYMBOL_GPL(ipv6_fixup_options);
+EXPORT_SYMBOL_GPL(__ipv6_fixup_options);
/**
* fl6_update_dst - update flowi destination address with info given
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index ae365df8abf7..49e31e4ae7b7 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IPv6 library code, needed by static components when full IPv6 support is
* not configured or static.
@@ -142,6 +143,8 @@ int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type)
optlen = 1;
break;
default:
+ if (len < 2)
+ goto bad;
optlen = nh[offset + 1] + 2;
if (optlen > len)
goto bad;
@@ -196,10 +199,8 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
struct ipv6hdr _ip6, *ip6;
ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6);
- if (!ip6 || (ip6->version != 6)) {
- printk(KERN_ERR "IPv6 header not found\n");
+ if (!ip6 || (ip6->version != 6))
return -EBADMSG;
- }
start = *offset + sizeof(struct ipv6hdr);
nexthdr = ip6->nexthdr;
}
@@ -265,7 +266,7 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
} else if (nexthdr == NEXTHDR_AUTH) {
if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0))
break;
- hdrlen = (hp->hdrlen + 2) << 2;
+ hdrlen = ipv6_authlen(hp);
} else
hdrlen = ipv6_optlen(hp);
diff --git a/net/ipv6/exthdrs_offload.c b/net/ipv6/exthdrs_offload.c
index f5e2ba1c18bf..4c00398f4dca 100644
--- a/net/ipv6/exthdrs_offload.c
+++ b/net/ipv6/exthdrs_offload.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPV6 GSO/GRO offload support
* Linux INET6 implementation
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* IPV6 Extension Header GSO/GRO support
*/
#include <net/protocol.h>
@@ -20,6 +16,10 @@ static const struct net_offload dstopt_offload = {
.flags = INET6_PROTO_GSO_EXTHDR,
};
+static const struct net_offload hbh_offload = {
+ .flags = INET6_PROTO_GSO_EXTHDR,
+};
+
int __init ipv6_exthdrs_offload_init(void)
{
int ret;
@@ -32,9 +32,16 @@ int __init ipv6_exthdrs_offload_init(void)
if (ret)
goto out_rt;
+ ret = inet6_add_offload(&hbh_offload, IPPROTO_HOPOPTS);
+ if (ret)
+ goto out_dstopts;
+
out:
return ret;
+out_dstopts:
+ inet6_del_offload(&dstopt_offload, IPPROTO_DSTOPTS);
+
out_rt:
inet6_del_offload(&rthdr_offload, IPPROTO_ROUTING);
goto out;
diff --git a/net/ipv6/fib6_notifier.c b/net/ipv6/fib6_notifier.c
index 05f82baaa99e..949b72610df7 100644
--- a/net/ipv6/fib6_notifier.c
+++ b/net/ipv6/fib6_notifier.c
@@ -7,12 +7,12 @@
#include <net/netns/ipv6.h>
#include <net/ip6_fib.h>
-int call_fib6_notifier(struct notifier_block *nb, struct net *net,
+int call_fib6_notifier(struct notifier_block *nb,
enum fib_event_type event_type,
struct fib_notifier_info *info)
{
info->family = AF_INET6;
- return call_fib_notifier(nb, net, event_type, info);
+ return call_fib_notifier(nb, event_type, info);
}
int call_fib6_notifiers(struct net *net, enum fib_event_type event_type,
@@ -22,20 +22,21 @@ int call_fib6_notifiers(struct net *net, enum fib_event_type event_type,
return call_fib_notifiers(net, event_type, info);
}
-static unsigned int fib6_seq_read(struct net *net)
+static unsigned int fib6_seq_read(const struct net *net)
{
return fib6_tables_seq_read(net) + fib6_rules_seq_read(net);
}
-static int fib6_dump(struct net *net, struct notifier_block *nb)
+static int fib6_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
int err;
- err = fib6_rules_dump(net, nb);
+ err = fib6_rules_dump(net, nb, extack);
if (err)
return err;
- return fib6_tables_dump(net, nb);
+ return fib6_tables_dump(net, nb, extack);
}
static const struct fib_notifier_ops fib6_notifier_ops_template = {
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index f590446595d8..fd5f7112a51f 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* net/ipv6/fib6_rules.c IPv6 Routing Policy Rules
*
* Copyright (C)2003-2006 Helsinki University of Technology
* Copyright (C)2003-2006 USAGI/WIDE Project
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation, version 2.
- *
* Authors
* Thomas Graf <tgraf@suug.ch>
* Ville Nuorvala <vnuorval@tcs.hut.fi>
@@ -16,8 +13,10 @@
#include <linux/netdevice.h>
#include <linux/notifier.h>
#include <linux/export.h>
+#include <linux/indirect_call_wrapper.h>
#include <net/fib_rules.h>
+#include <net/inet_dscp.h>
#include <net/ipv6.h>
#include <net/addrconf.h>
#include <net/ip6_route.h>
@@ -27,14 +26,18 @@ struct fib6_rule {
struct fib_rule common;
struct rt6key src;
struct rt6key dst;
- u8 tclass;
+ __be32 flowlabel;
+ __be32 flowlabel_mask;
+ dscp_t dscp;
+ dscp_t dscp_mask;
+ u8 dscp_full:1; /* DSCP or TOS selector */
};
static bool fib6_rule_matchall(const struct fib_rule *rule)
{
struct fib6_rule *r = container_of(rule, struct fib6_rule, common);
- if (r->dst.plen || r->src.plen || r->tclass)
+ if (r->dst.plen || r->src.plen || r->dscp || r->flowlabel_mask)
return false;
return fib_rule_matchall(rule);
}
@@ -50,27 +53,28 @@ bool fib6_rule_default(const struct fib_rule *rule)
}
EXPORT_SYMBOL_GPL(fib6_rule_default);
-int fib6_rules_dump(struct net *net, struct notifier_block *nb)
+int fib6_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
- return fib_rules_dump(net, nb, AF_INET6);
+ return fib_rules_dump(net, nb, AF_INET6, extack);
}
-unsigned int fib6_rules_seq_read(struct net *net)
+unsigned int fib6_rules_seq_read(const struct net *net)
{
return fib_rules_seq_read(net, AF_INET6);
}
/* called with rcu lock held; no reference taken on fib6_info */
-struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
- int flags)
+int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ struct fib6_result *res, int flags)
{
- struct fib6_info *f6i;
int err;
if (net->ipv6.fib6_has_custom_rules) {
struct fib_lookup_arg arg = {
.lookup_ptr = fib6_table_lookup,
.lookup_data = &oif,
+ .result = res,
.flags = FIB_LOOKUP_NOREF,
};
@@ -78,19 +82,15 @@ struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
err = fib_rules_lookup(net->ipv6.fib6_rules_ops,
flowi6_to_flowi(fl6), flags, &arg);
- if (err)
- return ERR_PTR(err);
-
- f6i = arg.result ? : net->ipv6.fib6_null_entry;
} else {
- f6i = fib6_table_lookup(net, net->ipv6.fib6_local_tbl,
- oif, fl6, flags);
- if (!f6i || f6i == net->ipv6.fib6_null_entry)
- f6i = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
- oif, fl6, flags);
+ err = fib6_table_lookup(net, net->ipv6.fib6_local_tbl, oif,
+ fl6, res, flags);
+ if (err || res->f6i == net->ipv6.fib6_null_entry)
+ err = fib6_table_lookup(net, net->ipv6.fib6_main_tbl,
+ oif, fl6, res, flags);
}
- return f6i;
+ return err;
}
struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
@@ -98,9 +98,11 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
int flags, pol_lookup_t lookup)
{
if (net->ipv6.fib6_has_custom_rules) {
+ struct fib6_result res = {};
struct fib_lookup_arg arg = {
.lookup_ptr = lookup,
.lookup_data = skb,
+ .result = &res,
.flags = FIB_LOOKUP_NOREF,
};
@@ -110,22 +112,25 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
fib_rules_lookup(net->ipv6.fib6_rules_ops,
flowi6_to_flowi(fl6), flags, &arg);
- if (arg.result)
- return arg.result;
+ if (res.rt6)
+ return &res.rt6->dst;
} else {
struct rt6_info *rt;
- rt = lookup(net, net->ipv6.fib6_local_tbl, fl6, skb, flags);
+ rt = pol_lookup_func(lookup,
+ net, net->ipv6.fib6_local_tbl, fl6, skb, flags);
if (rt != net->ipv6.ip6_null_entry && rt->dst.error != -EAGAIN)
return &rt->dst;
- ip6_rt_put(rt);
- rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
+ ip6_rt_put_flags(rt, flags);
+ rt = pol_lookup_func(lookup,
+ net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
if (rt->dst.error != -EAGAIN)
return &rt->dst;
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
}
- dst_hold(&net->ipv6.ip6_null_entry->dst);
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ dst_hold(&net->ipv6.ip6_null_entry->dst);
return &net->ipv6.ip6_null_entry->dst;
}
@@ -157,11 +162,11 @@ static int fib6_rule_saddr(struct net *net, struct fib_rule *rule, int flags,
static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
int flags, struct fib_lookup_arg *arg)
{
+ struct fib6_result *res = arg->result;
struct flowi6 *flp6 = &flp->u.ip6;
struct net *net = rule->fr_net;
struct fib6_table *table;
- struct fib6_info *f6i;
- int err = -EAGAIN, *oif;
+ int err, *oif;
u32 tb_id;
switch (rule->action) {
@@ -182,14 +187,12 @@ static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
return -EAGAIN;
oif = (int *)arg->lookup_data;
- f6i = fib6_table_lookup(net, table, *oif, flp6, flags);
- if (f6i != net->ipv6.fib6_null_entry) {
+ err = fib6_table_lookup(net, table, *oif, flp6, res, flags);
+ if (!err && res->f6i != net->ipv6.fib6_null_entry)
err = fib6_rule_saddr(net, rule, flags, flp6,
- fib6_info_nh_dev(f6i));
-
- if (likely(!err))
- arg->result = f6i;
- }
+ res->nh->fib_nh_dev);
+ else
+ err = -EAGAIN;
return err;
}
@@ -197,6 +200,7 @@ static int fib6_rule_action_alt(struct fib_rule *rule, struct flowi *flp,
static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
int flags, struct fib_lookup_arg *arg)
{
+ struct fib6_result *res = arg->result;
struct flowi6 *flp6 = &flp->u.ip6;
struct rt6_info *rt = NULL;
struct fib6_table *table;
@@ -230,10 +234,15 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
goto out;
}
- rt = lookup(net, table, flp6, arg->lookup_data, flags);
+ rt = pol_lookup_func(lookup,
+ net, table, flp6, arg->lookup_data, flags);
if (rt != net->ipv6.ip6_null_entry) {
+ struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
+
+ if (!idev)
+ goto again;
err = fib6_rule_saddr(net, rule, flags, flp6,
- ip6_dst_idev(&rt->dst)->dev);
+ idev->dev);
if (err == -EAGAIN)
goto again;
@@ -243,20 +252,22 @@ static int __fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
goto out;
}
again:
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
err = -EAGAIN;
rt = NULL;
goto out;
discard_pkt:
- dst_hold(&rt->dst);
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ dst_hold(&rt->dst);
out:
- arg->result = rt;
+ res->rt6 = rt;
return err;
}
-static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
- int flags, struct fib_lookup_arg *arg)
+INDIRECT_CALLABLE_SCOPE int fib6_rule_action(struct fib_rule *rule,
+ struct flowi *flp, int flags,
+ struct fib_lookup_arg *arg)
{
if (arg->lookup_ptr == fib6_table_lookup)
return fib6_rule_action_alt(rule, flp, flags, arg);
@@ -264,11 +275,17 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
return __fib6_rule_action(rule, flp, flags, arg);
}
-static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg)
+INDIRECT_CALLABLE_SCOPE bool fib6_rule_suppress(struct fib_rule *rule,
+ int flags,
+ struct fib_lookup_arg *arg)
{
- struct rt6_info *rt = (struct rt6_info *) arg->result;
+ struct fib6_result *res = arg->result;
+ struct rt6_info *rt = res->rt6;
struct net_device *dev = NULL;
+ if (!rt)
+ return false;
+
if (rt->rt6i_idev)
dev = rt->rt6i_idev->dev;
@@ -287,11 +304,12 @@ static bool fib6_rule_suppress(struct fib_rule *rule, struct fib_lookup_arg *arg
return false;
suppress_route:
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
return true;
}
-static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
+INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule,
+ struct flowi *fl, int flags)
{
struct fib6_rule *r = (struct fib6_rule *) rule;
struct flowi6 *fl6 = &fl->u.ip6;
@@ -314,35 +332,120 @@ static int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
return 0;
}
- if (r->tclass && r->tclass != ip6_tclass(fl6->flowlabel))
+ if ((r->dscp ^ ip6_dscp(fl6->flowlabel)) & r->dscp_mask)
+ return 0;
+
+ if ((r->flowlabel ^ flowi6_get_flowlabel(fl6)) & r->flowlabel_mask)
return 0;
if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto))
return 0;
- if (fib_rule_port_range_set(&rule->sport_range) &&
- !fib_rule_port_inrange(&rule->sport_range, fl6->fl6_sport))
+ if (!fib_rule_port_match(&rule->sport_range, rule->sport_mask,
+ fl6->fl6_sport))
return 0;
- if (fib_rule_port_range_set(&rule->dport_range) &&
- !fib_rule_port_inrange(&rule->dport_range, fl6->fl6_dport))
+ if (!fib_rule_port_match(&rule->dport_range, rule->dport_mask,
+ fl6->fl6_dport))
return 0;
return 1;
}
-static const struct nla_policy fib6_rule_policy[FRA_MAX+1] = {
- FRA_GENERIC_POLICY,
-};
+static int fib6_nl2rule_dscp(const struct nlattr *nla, struct fib6_rule *rule6,
+ struct netlink_ext_ack *extack)
+{
+ if (rule6->dscp) {
+ NL_SET_ERR_MSG(extack, "Cannot specify both TOS and DSCP");
+ return -EINVAL;
+ }
+
+ rule6->dscp = inet_dsfield_to_dscp(nla_get_u8(nla) << 2);
+ rule6->dscp_mask = inet_dsfield_to_dscp(INET_DSCP_MASK);
+ rule6->dscp_full = true;
+
+ return 0;
+}
+
+static int fib6_nl2rule_dscp_mask(const struct nlattr *nla,
+ struct fib6_rule *rule6,
+ struct netlink_ext_ack *extack)
+{
+ dscp_t dscp_mask;
+
+ if (!rule6->dscp_full) {
+ NL_SET_ERR_MSG_ATTR(extack, nla,
+ "Cannot specify DSCP mask without DSCP value");
+ return -EINVAL;
+ }
+
+ dscp_mask = inet_dsfield_to_dscp(nla_get_u8(nla) << 2);
+ if (rule6->dscp & ~dscp_mask) {
+ NL_SET_ERR_MSG_ATTR(extack, nla, "Invalid DSCP mask");
+ return -EINVAL;
+ }
+
+ rule6->dscp_mask = dscp_mask;
+
+ return 0;
+}
+
+static int fib6_nl2rule_flowlabel(struct nlattr **tb, struct fib6_rule *rule6,
+ struct netlink_ext_ack *extack)
+{
+ __be32 flowlabel, flowlabel_mask;
+
+ if (NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL) ||
+ NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL_MASK))
+ return -EINVAL;
+
+ flowlabel = nla_get_be32(tb[FRA_FLOWLABEL]);
+ flowlabel_mask = nla_get_be32(tb[FRA_FLOWLABEL_MASK]);
+
+ if (flowlabel_mask & ~IPV6_FLOWLABEL_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[FRA_FLOWLABEL_MASK],
+ "Invalid flow label mask");
+ return -EINVAL;
+ }
+
+ if (flowlabel & ~flowlabel_mask) {
+ NL_SET_ERR_MSG(extack, "Flow label and mask do not match");
+ return -EINVAL;
+ }
+
+ rule6->flowlabel = flowlabel;
+ rule6->flowlabel_mask = flowlabel_mask;
+
+ return 0;
+}
static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
struct nlattr **tb,
struct netlink_ext_ack *extack)
{
+ struct fib6_rule *rule6 = (struct fib6_rule *)rule;
+ struct net *net = rule->fr_net;
int err = -EINVAL;
- struct net *net = sock_net(skb->sk);
- struct fib6_rule *rule6 = (struct fib6_rule *) rule;
+
+ if (!inet_validate_dscp(frh->tos)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): ECN bits must be 0");
+ goto errout;
+ }
+ rule6->dscp = inet_dsfield_to_dscp(frh->tos);
+ rule6->dscp_mask = frh->tos ? inet_dsfield_to_dscp(INET_DSCP_MASK) : 0;
+
+ if (tb[FRA_DSCP] && fib6_nl2rule_dscp(tb[FRA_DSCP], rule6, extack) < 0)
+ goto errout;
+
+ if (tb[FRA_DSCP_MASK] &&
+ fib6_nl2rule_dscp_mask(tb[FRA_DSCP_MASK], rule6, extack) < 0)
+ goto errout;
+
+ if ((tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) &&
+ fib6_nl2rule_flowlabel(tb, rule6, extack) < 0)
+ goto errout;
if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
if (rule->table == RT6_TABLE_UNSPEC) {
@@ -364,7 +467,6 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule6->src.plen = frh->src_len;
rule6->dst.plen = frh->dst_len;
- rule6->tclass = frh->tos;
if (fib_rule_requires_fldissect(rule))
net->ipv6.fib6_rules_require_fldissect++;
@@ -397,7 +499,33 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
if (frh->dst_len && (rule6->dst.plen != frh->dst_len))
return 0;
- if (frh->tos && (rule6->tclass != frh->tos))
+ if (frh->tos &&
+ (rule6->dscp_full ||
+ inet_dscp_to_dsfield(rule6->dscp) != frh->tos))
+ return 0;
+
+ if (tb[FRA_DSCP]) {
+ dscp_t dscp;
+
+ dscp = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP]) << 2);
+ if (!rule6->dscp_full || rule6->dscp != dscp)
+ return 0;
+ }
+
+ if (tb[FRA_DSCP_MASK]) {
+ dscp_t dscp_mask;
+
+ dscp_mask = inet_dsfield_to_dscp(nla_get_u8(tb[FRA_DSCP_MASK]) << 2);
+ if (!rule6->dscp_full || rule6->dscp_mask != dscp_mask)
+ return 0;
+ }
+
+ if (tb[FRA_FLOWLABEL] &&
+ nla_get_be32(tb[FRA_FLOWLABEL]) != rule6->flowlabel)
+ return 0;
+
+ if (tb[FRA_FLOWLABEL_MASK] &&
+ nla_get_be32(tb[FRA_FLOWLABEL_MASK]) != rule6->flowlabel_mask)
return 0;
if (frh->src_len &&
@@ -418,7 +546,22 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
frh->dst_len = rule6->dst.plen;
frh->src_len = rule6->src.plen;
- frh->tos = rule6->tclass;
+
+ if (rule6->dscp_full) {
+ frh->tos = 0;
+ if (nla_put_u8(skb, FRA_DSCP,
+ inet_dscp_to_dsfield(rule6->dscp) >> 2) ||
+ nla_put_u8(skb, FRA_DSCP_MASK,
+ inet_dscp_to_dsfield(rule6->dscp_mask) >> 2))
+ goto nla_put_failure;
+ } else {
+ frh->tos = inet_dscp_to_dsfield(rule6->dscp);
+ }
+
+ if (rule6->flowlabel_mask &&
+ (nla_put_be32(skb, FRA_FLOWLABEL, rule6->flowlabel) ||
+ nla_put_be32(skb, FRA_FLOWLABEL_MASK, rule6->flowlabel_mask)))
+ goto nla_put_failure;
if ((rule6->dst.plen &&
nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) ||
@@ -434,7 +577,16 @@ nla_put_failure:
static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule)
{
return nla_total_size(16) /* dst */
- + nla_total_size(16); /* src */
+ + nla_total_size(16) /* src */
+ + nla_total_size(1) /* dscp */
+ + nla_total_size(1) /* dscp mask */
+ + nla_total_size(4) /* flowlabel */
+ + nla_total_size(4); /* flowlabel mask */
+}
+
+static void fib6_rule_flush_cache(struct fib_rules_ops *ops)
+{
+ rt_genid_bump_ipv6(ops->fro_net);
}
static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
@@ -449,8 +601,8 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
.compare = fib6_rule_compare,
.fill = fib6_rule_fill,
.nlmsg_payload = fib6_rule_nlmsg_payload,
+ .flush_cache = fib6_rule_flush_cache,
.nlgroup = RTNLGRP_IPV6_RULE,
- .policy = fib6_rule_policy,
.owner = THIS_MODULE,
.fro_net = &init_net,
};
@@ -458,17 +610,17 @@ static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = {
static int __net_init fib6_rules_net_init(struct net *net)
{
struct fib_rules_ops *ops;
- int err = -ENOMEM;
+ int err;
ops = fib_rules_register(&fib6_rules_ops_template, net);
if (IS_ERR(ops))
return PTR_ERR(ops);
- err = fib_default_rule_add(ops, 0, RT6_TABLE_LOCAL, 0);
+ err = fib_default_rule_add(ops, 0, RT6_TABLE_LOCAL);
if (err)
goto out_fib6_rules_ops;
- err = fib_default_rule_add(ops, 0x7FFE, RT6_TABLE_MAIN, 0);
+ err = fib_default_rule_add(ops, 0x7FFE, RT6_TABLE_MAIN);
if (err)
goto out_fib6_rules_ops;
@@ -482,16 +634,21 @@ out_fib6_rules_ops:
goto out;
}
-static void __net_exit fib6_rules_net_exit(struct net *net)
+static void __net_exit fib6_rules_net_exit_batch(struct list_head *net_list)
{
+ struct net *net;
+
rtnl_lock();
- fib_rules_unregister(net->ipv6.fib6_rules_ops);
+ list_for_each_entry(net, net_list, exit_list) {
+ fib_rules_unregister(net->ipv6.fib6_rules_ops);
+ cond_resched();
+ }
rtnl_unlock();
}
static struct pernet_operations fib6_rules_net_ops = {
.init = fib6_rules_net_init,
- .exit = fib6_rules_net_exit,
+ .exit_batch = fib6_rules_net_exit_batch,
};
int __init fib6_rules_init(void)
diff --git a/net/ipv6/fou6.c b/net/ipv6/fou6.c
index 6de3c04b0f30..430518ae26fa 100644
--- a/net/ipv6/fou6.c
+++ b/net/ipv6/fou6.c
@@ -1,9 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/socket.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/udp.h>
+#include <linux/icmpv6.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <net/fou.h>
@@ -69,14 +71,100 @@ static int gue6_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
return 0;
}
+static int gue6_err_proto_handler(int proto, struct sk_buff *skb,
+ struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ const struct inet6_protocol *ipprot;
+
+ ipprot = rcu_dereference(inet6_protos[proto]);
+ if (ipprot && ipprot->err_handler) {
+ if (!ipprot->err_handler(skb, opt, type, code, offset, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static int gue6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ int transport_offset = skb_transport_offset(skb);
+ struct guehdr *guehdr;
+ size_t len, optlen;
+ int ret;
+
+ len = sizeof(struct udphdr) + sizeof(struct guehdr);
+ if (!pskb_may_pull(skb, transport_offset + len))
+ return -EINVAL;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+
+ switch (guehdr->version) {
+ case 0: /* Full GUE header present */
+ break;
+ case 1: {
+ /* Direct encapsulation of IPv4 or IPv6 */
+ skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr));
+
+ switch (((struct iphdr *)guehdr)->version) {
+ case 4:
+ ret = gue6_err_proto_handler(IPPROTO_IPIP, skb, opt,
+ type, code, offset, info);
+ goto out;
+ case 6:
+ ret = gue6_err_proto_handler(IPPROTO_IPV6, skb, opt,
+ type, code, offset, info);
+ goto out;
+ default:
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+ default: /* Undefined version */
+ return -EOPNOTSUPP;
+ }
+
+ if (guehdr->control)
+ return -ENOENT;
+
+ optlen = guehdr->hlen << 2;
+
+ if (!pskb_may_pull(skb, transport_offset + len + optlen))
+ return -EINVAL;
+
+ guehdr = (struct guehdr *)&udp_hdr(skb)[1];
+ if (validate_gue_flags(guehdr, optlen))
+ return -EINVAL;
+
+ /* Handling exceptions for direct UDP encapsulation in GUE would lead to
+ * recursion. Besides, this kind of encapsulation can't even be
+ * configured currently. Discard this.
+ */
+ if (guehdr->proto_ctype == IPPROTO_UDP ||
+ guehdr->proto_ctype == IPPROTO_UDPLITE)
+ return -EOPNOTSUPP;
+
+ skb_set_transport_header(skb, -(int)sizeof(struct icmp6hdr));
+ ret = gue6_err_proto_handler(guehdr->proto_ctype, skb,
+ opt, type, code, offset, info);
+
+out:
+ skb_set_transport_header(skb, transport_offset);
+ return ret;
+}
+
+
static const struct ip6_tnl_encap_ops fou_ip6tun_ops = {
.encap_hlen = fou_encap_hlen,
.build_header = fou6_build_header,
+ .err_handler = gue6_err,
};
static const struct ip6_tnl_encap_ops gue_ip6tun_ops = {
.encap_hlen = gue_encap_hlen,
.build_header = gue6_build_header,
+ .err_handler = gue6_err,
};
static int ip6_tnl_encap_add_fou_ops(void)
@@ -136,3 +224,4 @@ module_init(fou6_init);
module_exit(fou6_fini);
MODULE_AUTHOR("Tom Herbert <therbert@google.com>");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Foo over UDP (IPv6)");
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index c9c53ade55c3..5d2f90babaa5 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Internet Control Message Protocol (ICMPv6)
* Linux INET6 implementation
@@ -8,11 +9,6 @@
* Based on net/ipv4/icmp.c
*
* RFC 1885
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/*
@@ -61,6 +57,7 @@
#include <net/protocol.h>
#include <net/raw.h>
#include <net/rawv6.h>
+#include <net/seg6.h>
#include <net/transp_v6.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
@@ -72,24 +69,14 @@
#include <linux/uaccess.h>
-/*
- * The ICMP socket(s). This is the most convenient way to flow control
- * our ICMP output as well as maintain a clean interface throughout
- * all layers. All Socketless IP sends will soon be gone.
- *
- * On SMP we have one ICMP socket per-cpu.
- */
-static inline struct sock *icmpv6_sk(struct net *net)
-{
- return net->ipv6.icmp_sk[smp_processor_id()];
-}
+static DEFINE_PER_CPU(struct sock *, ipv6_icmp_sk);
-static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
/* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */
struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset);
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
if (type == ICMPV6_PKT_TOOBIG)
ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL));
@@ -100,6 +87,8 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!(type & ICMPV6_INFOMSG_MASK))
if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST)
ping_err(skb, offset, ntohl(info));
+
+ return 0;
}
static int icmpv6_rcv(struct sk_buff *skb);
@@ -111,11 +100,11 @@ static const struct inet6_protocol icmpv6_protocol = {
};
/* Called with BH disabled */
-static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
+static struct sock *icmpv6_xmit_lock(struct net *net)
{
struct sock *sk;
- sk = icmpv6_sk(net);
+ sk = this_cpu_read(ipv6_icmp_sk);
if (unlikely(!spin_trylock(&sk->sk_lock.slock))) {
/* This can happen if the output path (f.e. SIT or
* ip6ip6 tunnel) signals dst_link_failure() for an
@@ -123,11 +112,13 @@ static __inline__ struct sock *icmpv6_xmit_lock(struct net *net)
*/
return NULL;
}
+ sock_net_set(sk, net);
return sk;
}
-static __inline__ void icmpv6_xmit_unlock(struct sock *sk)
+static void icmpv6_xmit_unlock(struct sock *sk)
{
+ sock_net_set(sk, &init_net);
spin_unlock(&sk->sk_lock.slock);
}
@@ -160,33 +151,41 @@ static bool is_ineligible(const struct sk_buff *skb)
tp = skb_header_pointer(skb,
ptr+offsetof(struct icmp6hdr, icmp6_type),
sizeof(_type), &_type);
- if (!tp || !(*tp & ICMPV6_INFOMSG_MASK))
+
+ /* Based on RFC 8200, Section 4.5 Fragment Header, return
+ * false if this is a fragment packet with no icmp header info.
+ */
+ if (!tp && frag_off != 0)
+ return false;
+ else if (!tp || !(*tp & ICMPV6_INFOMSG_MASK))
return true;
}
return false;
}
-static bool icmpv6_mask_allow(int type)
+static bool icmpv6_mask_allow(struct net *net, int type)
{
- /* Informational messages are not limited. */
- if (type & ICMPV6_INFOMSG_MASK)
+ if (type > ICMPV6_MSG_MAX)
return true;
- /* Do not limit pmtu discovery, it would break it. */
- if (type == ICMPV6_PKT_TOOBIG)
+ /* Limit if icmp type is set in ratemask. */
+ if (!test_bit(type, net->ipv6.sysctl.icmpv6_ratemask))
return true;
return false;
}
-static bool icmpv6_global_allow(int type)
+static bool icmpv6_global_allow(struct net *net, int type,
+ bool *apply_ratelimit)
{
- if (icmpv6_mask_allow(type))
+ if (icmpv6_mask_allow(net, type))
return true;
- if (icmp_global_allow())
+ if (icmp_global_allow(net)) {
+ *apply_ratelimit = true;
return true;
-
+ }
+ __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL);
return false;
}
@@ -194,13 +193,14 @@ static bool icmpv6_global_allow(int type)
* Check the ICMP output rate limit
*/
static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
- struct flowi6 *fl6)
+ struct flowi6 *fl6, bool apply_ratelimit)
{
struct net *net = sock_net(sk);
+ struct net_device *dev;
struct dst_entry *dst;
bool res = false;
- if (icmpv6_mask_allow(type))
+ if (!apply_ratelimit)
return true;
/*
@@ -209,13 +209,15 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
* this lookup should be more aggressive (not longer than timeout).
*/
dst = ip6_route_output(net, sk, fl6);
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
if (dst->error) {
IP6_INC_STATS(net, ip6_dst_idev(dst),
IPSTATS_MIB_OUTNOROUTES);
- } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) {
+ } else if (dev && (dev->flags & IFF_LOOPBACK)) {
res = true;
} else {
- struct rt6_info *rt = (struct rt6_info *)dst;
+ struct rt6_info *rt = dst_rt6_info(dst);
int tmo = net->ipv6.sysctl.icmpv6_time;
struct inet_peer *peer;
@@ -223,10 +225,32 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
if (rt->rt6i_dst.plen < 128)
tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
- peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1);
+ peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr);
res = inet_peer_xrlim_allow(peer, tmo);
- if (peer)
- inet_putpeer(peer);
+ }
+ rcu_read_unlock();
+ if (!res)
+ __ICMP6_INC_STATS(net, NULL, ICMP6_MIB_RATELIMITHOST);
+ else
+ icmp_global_consume(net);
+ dst_release(dst);
+ return res;
+}
+
+static bool icmpv6_rt_has_prefsrc(struct sock *sk, u8 type,
+ struct flowi6 *fl6)
+{
+ struct net *net = sock_net(sk);
+ struct dst_entry *dst;
+ bool res = false;
+
+ dst = ip6_route_output(net, sk, fl6);
+ if (!dst->error) {
+ struct rt6_info *rt = dst_rt6_info(dst);
+ struct in6_addr prefsrc;
+
+ rt6_get_prefsrc(rt, &prefsrc);
+ res = !ipv6_addr_any(&prefsrc);
}
dst_release(dst);
return res;
@@ -298,10 +322,10 @@ static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, st
{
struct icmpv6_msg *msg = (struct icmpv6_msg *) from;
struct sk_buff *org_skb = msg->skb;
- __wsum csum = 0;
+ __wsum csum;
csum = skb_copy_and_csum_bits(org_skb, msg->offset + offset,
- to, len, csum);
+ to, len);
skb->csum = csum_block_add(skb->csum, csum, odd);
if (!(msg->type & ICMPV6_INFOMSG_MASK))
nf_ct_attach(skb, org_skb);
@@ -309,12 +333,10 @@ static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, st
}
#if IS_ENABLED(CONFIG_IPV6_MIP6)
-static void mip6_addr_swap(struct sk_buff *skb)
+static void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt)
{
struct ipv6hdr *iph = ipv6_hdr(skb);
- struct inet6_skb_parm *opt = IP6CB(skb);
struct ipv6_destopt_hao *hao;
- struct in6_addr tmp;
int off;
if (opt->dsthao) {
@@ -322,14 +344,12 @@ static void mip6_addr_swap(struct sk_buff *skb)
if (likely(off >= 0)) {
hao = (struct ipv6_destopt_hao *)
(skb_network_header(skb) + off);
- tmp = iph->saddr;
- iph->saddr = hao->addr;
- hao->addr = tmp;
+ swap(iph->saddr, hao->addr);
}
}
}
#else
-static inline void mip6_addr_swap(struct sk_buff *skb) {}
+static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {}
#endif
static struct dst_entry *icmpv6_route_lookup(struct net *net,
@@ -347,9 +367,10 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net,
/*
* We won't send icmp if the destination is known
- * anycast.
+ * anycast unless we need to treat anycast as unicast.
*/
- if (ipv6_anycast_destination(dst, &fl6->daddr)) {
+ if (!READ_ONCE(net->ipv6.sysctl.icmpv6_error_anycast_as_unicast) &&
+ ipv6_anycast_destination(dst, &fl6->daddr)) {
net_dbg_ratelimited("icmp6_send: acast source\n");
dst_release(dst);
return ERR_PTR(-EINVAL);
@@ -369,7 +390,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net,
return dst;
}
- err = xfrm_decode_session_reverse(skb, flowi6_to_flowi(&fl2), AF_INET6);
+ err = xfrm_decode_session_reverse(net, skb, flowi6_to_flowi(&fl2), AF_INET6);
if (err)
goto relookup_failed;
@@ -396,38 +417,237 @@ relookup_failed:
return ERR_PTR(err);
}
-static int icmp6_iif(const struct sk_buff *skb)
+static struct net_device *icmp6_dev(const struct sk_buff *skb)
{
- int iif = skb->dev->ifindex;
+ struct net_device *dev = skb->dev;
/* for local traffic to local address, skb dev is the loopback
* device. Check if there is a dst attached to the skb and if so
* get the real device index. Same is needed for replies to a link
* local address on a device enslaved to an L3 master device
*/
- if (unlikely(iif == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) {
+ if (unlikely(dev->ifindex == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) {
const struct rt6_info *rt6 = skb_rt6_info(skb);
- if (rt6)
- iif = rt6->rt6i_idev->dev->ifindex;
+ /* The destination could be an external IP in Ext Hdr (SRv6, RPL, etc.),
+ * and ip6_null_entry could be set to skb if no route is found.
+ */
+ if (rt6 && rt6->rt6i_idev)
+ dev = rt6->rt6i_idev->dev;
+ }
+
+ return dev;
+}
+
+static int icmp6_iif(const struct sk_buff *skb)
+{
+ return icmp6_dev(skb)->ifindex;
+}
+
+struct icmp6_ext_iio_addr6_subobj {
+ __be16 afi;
+ __be16 reserved;
+ struct in6_addr addr6;
+};
+
+static unsigned int icmp6_ext_iio_len(void)
+{
+ return sizeof(struct icmp_extobj_hdr) +
+ /* ifIndex */
+ sizeof(__be32) +
+ /* Interface Address Sub-Object */
+ sizeof(struct icmp6_ext_iio_addr6_subobj) +
+ /* Interface Name Sub-Object. Length must be a multiple of 4
+ * bytes.
+ */
+ ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) +
+ /* MTU */
+ sizeof(__be32);
+}
+
+static unsigned int icmp6_ext_max_len(u8 ext_objs)
+{
+ unsigned int ext_max_len;
+
+ ext_max_len = sizeof(struct icmp_ext_hdr);
+
+ if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+ ext_max_len += icmp6_ext_iio_len();
+
+ return ext_max_len;
+}
+
+static struct in6_addr *icmp6_ext_iio_addr6_find(const struct net_device *dev)
+{
+ struct inet6_dev *in6_dev;
+ struct inet6_ifaddr *ifa;
+
+ in6_dev = __in6_dev_get(dev);
+ if (!in6_dev)
+ return NULL;
+
+ /* It is unclear from RFC 5837 which IP address should be chosen, but
+ * it makes sense to choose a global unicast address.
+ */
+ list_for_each_entry_rcu(ifa, &in6_dev->addr_list, if_list) {
+ if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DADFAILED))
+ continue;
+ if (ipv6_addr_type(&ifa->addr) != IPV6_ADDR_UNICAST ||
+ ipv6_addr_src_scope(&ifa->addr) != IPV6_ADDR_SCOPE_GLOBAL)
+ continue;
+ return &ifa->addr;
+ }
+
+ return NULL;
+}
+
+static void icmp6_ext_iio_iif_append(struct net *net, struct sk_buff *skb,
+ int iif)
+{
+ struct icmp_ext_iio_name_subobj *name_subobj;
+ struct icmp_extobj_hdr *objh;
+ struct net_device *dev;
+ struct in6_addr *addr6;
+ __be32 data;
+
+ if (!iif)
+ return;
+
+ /* Add the fields in the order specified by RFC 5837. */
+ objh = skb_put(skb, sizeof(*objh));
+ objh->class_num = ICMP_EXT_OBJ_CLASS_IIO;
+ objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF);
+
+ data = htonl(iif);
+ skb_put_data(skb, &data, sizeof(__be32));
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX;
+
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, iif);
+ if (!dev)
+ goto out;
+
+ addr6 = icmp6_ext_iio_addr6_find(dev);
+ if (addr6) {
+ struct icmp6_ext_iio_addr6_subobj *addr6_subobj;
+
+ addr6_subobj = skb_put_zero(skb, sizeof(*addr6_subobj));
+ addr6_subobj->afi = htons(ICMP_AFI_IP6);
+ addr6_subobj->addr6 = *addr6;
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR;
}
- return iif;
+ name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4));
+ name_subobj->len = ALIGN(sizeof(*name_subobj), 4);
+ netdev_copy_name(dev, name_subobj->name);
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME;
+
+ data = htonl(READ_ONCE(dev->mtu));
+ skb_put_data(skb, &data, sizeof(__be32));
+ objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU;
+
+out:
+ rcu_read_unlock();
+ objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh);
+}
+
+static void icmp6_ext_objs_append(struct net *net, struct sk_buff *skb,
+ u8 ext_objs, int iif)
+{
+ if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF))
+ icmp6_ext_iio_iif_append(net, skb, iif);
+}
+
+static struct sk_buff *
+icmp6_ext_append(struct net *net, struct sk_buff *skb_in,
+ struct icmp6hdr *icmp6h, unsigned int room, int iif)
+{
+ unsigned int payload_len, ext_max_len, ext_len;
+ struct icmp_ext_hdr *ext_hdr;
+ struct sk_buff *skb;
+ u8 ext_objs;
+ int nhoff;
+
+ switch (icmp6h->icmp6_type) {
+ case ICMPV6_DEST_UNREACH:
+ case ICMPV6_TIME_EXCEED:
+ break;
+ default:
+ return NULL;
+ }
+
+ /* Do not overwrite existing extensions. This can happen when we
+ * receive an ICMPv4 message with extensions from a tunnel and
+ * translate it to an ICMPv6 message towards an IPv6 host in the
+ * overlay network.
+ */
+ if (icmp6h->icmp6_datagram_len)
+ return NULL;
+
+ ext_objs = READ_ONCE(net->ipv6.sysctl.icmpv6_errors_extension_mask);
+ if (!ext_objs)
+ return NULL;
+
+ ext_max_len = icmp6_ext_max_len(ext_objs);
+ if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room)
+ return NULL;
+
+ skb = skb_clone(skb_in, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ nhoff = skb_network_offset(skb);
+ payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN);
+
+ if (!pskb_network_may_pull(skb, payload_len))
+ goto free_skb;
+
+ if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) ||
+ __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false))
+ goto free_skb;
+
+ if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC))
+ goto free_skb;
+
+ ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr));
+ ext_hdr->version = ICMP_EXT_VERSION_2;
+
+ icmp6_ext_objs_append(net, skb, ext_objs, iif);
+
+ /* Do not send an empty extension structure. */
+ ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr;
+ if (ext_len == sizeof(*ext_hdr))
+ goto free_skb;
+
+ ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len);
+ /* The length of the original datagram in 64-bit words (RFC 4884). */
+ icmp6h->icmp6_datagram_len = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u64);
+
+ return skb;
+
+free_skb:
+ consume_skb(skb);
+ return NULL;
}
/*
* Send an ICMP message in response to a packet in error
*/
-static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
- const struct in6_addr *force_saddr)
+void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+ const struct in6_addr *force_saddr,
+ const struct inet6_skb_parm *parm)
{
- struct net *net = dev_net(skb->dev);
struct inet6_dev *idev = NULL;
struct ipv6hdr *hdr = ipv6_hdr(skb);
struct sock *sk;
+ struct net *net;
struct ipv6_pinfo *np;
const struct in6_addr *saddr = NULL;
+ bool apply_ratelimit = false;
+ struct sk_buff *ext_skb;
struct dst_entry *dst;
+ unsigned int room;
struct icmp6hdr tmp_hdr;
struct flowi6 fl6;
struct icmpv6_msg msg;
@@ -435,12 +655,19 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
int iif = 0;
int addr_type = 0;
int len;
- u32 mark = IP6_REPLY_MARK(net, skb->mark);
+ u32 mark;
if ((u8 *)hdr < skb->head ||
(skb_network_header(skb) + sizeof(*hdr)) > skb_tail_pointer(skb))
return;
+ if (!skb->dev)
+ return;
+
+ rcu_read_lock();
+
+ net = dev_net_rcu(skb->dev);
+ mark = IP6_REPLY_MARK(net, skb->mark);
/*
* Make sure we respect the rules
* i.e. RFC 1885 2.4(e)
@@ -462,7 +689,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
!(type == ICMPV6_PARAMPROB &&
code == ICMPV6_UNK_OPTION &&
(opt_unrec(skb, info))))
- return;
+ goto out;
saddr = NULL;
}
@@ -476,8 +703,11 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
if (__ipv6_addr_needs_scope_id(addr_type)) {
iif = icmp6_iif(skb);
} else {
- dst = skb_dst(skb);
- iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev);
+ /*
+ * The source device is used for looking up which routing table
+ * to use for sending an ICMP error.
+ */
+ iif = l3mdev_master_ifindex(skb->dev);
}
/*
@@ -489,7 +719,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) {
net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n",
&hdr->saddr, &hdr->daddr);
- return;
+ goto out;
}
/*
@@ -498,42 +728,54 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
if (is_ineligible(skb)) {
net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n",
&hdr->saddr, &hdr->daddr);
- return;
+ goto out;
}
- /* Needed by both icmp_global_allow and icmpv6_xmit_lock */
+ /* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */
local_bh_disable();
/* Check global sysctl_icmp_msgs_per_sec ratelimit */
- if (!(skb->dev->flags&IFF_LOOPBACK) && !icmpv6_global_allow(type))
+ if (!(skb->dev->flags & IFF_LOOPBACK) &&
+ !icmpv6_global_allow(net, type, &apply_ratelimit))
goto out_bh_enable;
- mip6_addr_swap(skb);
+ mip6_addr_swap(skb, parm);
+
+ sk = icmpv6_xmit_lock(net);
+ if (!sk)
+ goto out_bh_enable;
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_ICMPV6;
fl6.daddr = hdr->saddr;
if (force_saddr)
saddr = force_saddr;
- if (saddr)
+ if (saddr) {
fl6.saddr = *saddr;
+ } else if (!icmpv6_rt_has_prefsrc(sk, type, &fl6)) {
+ /* select a more meaningful saddr from the input interface */
+ struct net_device *in_netdev;
+
+ in_netdev = dev_get_by_index(net, parm->iif);
+ if (in_netdev) {
+ ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr,
+ inet6_sk(sk)->srcprefs,
+ &fl6.saddr);
+ dev_put(in_netdev);
+ }
+ }
fl6.flowi6_mark = mark;
fl6.flowi6_oif = iif;
fl6.fl6_icmp_type = type;
fl6.fl6_icmp_code = code;
fl6.flowi6_uid = sock_net_uid(net, NULL);
fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL);
- security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
-
- sk = icmpv6_xmit_lock(net);
- if (!sk)
- goto out_bh_enable;
+ security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
- sk->sk_mark = mark;
np = inet6_sk(sk);
- if (!icmpv6_xrlim_allow(sk, type, &fl6))
- goto out;
+ if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit))
+ goto out_unlock;
tmp_hdr.icmp6_type = type;
tmp_hdr.icmp6_code = code;
@@ -541,16 +783,17 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
tmp_hdr.icmp6_pointer = htonl(info);
if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
- fl6.flowi6_oif = np->mcast_oif;
+ fl6.flowi6_oif = READ_ONCE(np->mcast_oif);
else if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->ucast_oif;
+ fl6.flowi6_oif = READ_ONCE(np->ucast_oif);
- ipcm6_init_sk(&ipc6, np);
+ ipcm6_init_sk(&ipc6, sk);
+ ipc6.sockc.mark = mark;
fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
dst = icmpv6_route_lookup(net, skb, sk, &fl6);
if (IS_ERR(dst))
- goto out;
+ goto out_unlock;
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
@@ -558,21 +801,25 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
msg.offset = skb_network_offset(skb);
msg.type = type;
- len = skb->len - msg.offset;
- len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr));
+ room = IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr);
+ ext_skb = icmp6_ext_append(net, skb, &tmp_hdr, room, parm->iif);
+ if (ext_skb)
+ msg.skb = ext_skb;
+
+ len = msg.skb->len - msg.offset;
+ len = min_t(unsigned int, len, room);
if (len < 0) {
net_dbg_ratelimited("icmp: len problem [%pI6c > %pI6c]\n",
&hdr->saddr, &hdr->daddr);
goto out_dst_release;
}
- rcu_read_lock();
idev = __in6_dev_get(skb->dev);
if (ip6_append_data(sk, icmpv6_getfrag, &msg,
len + sizeof(struct icmp6hdr),
sizeof(struct icmp6hdr),
- &ipc6, &fl6, (struct rt6_info *)dst,
+ &ipc6, &fl6, dst_rt6_info(dst),
MSG_DONTWAIT)) {
ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
ip6_flush_pending_frames(sk);
@@ -580,21 +827,27 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
len + sizeof(struct icmp6hdr));
}
- rcu_read_unlock();
+
out_dst_release:
+ if (ext_skb)
+ consume_skb(ext_skb);
dst_release(dst);
-out:
+out_unlock:
icmpv6_xmit_unlock(sk);
out_bh_enable:
local_bh_enable();
+out:
+ rcu_read_unlock();
}
+EXPORT_SYMBOL(icmp6_send);
-/* Slightly more convenient version of icmp6_send.
+/* Slightly more convenient version of icmp6_send with drop reasons.
*/
-void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos)
+void icmpv6_param_prob_reason(struct sk_buff *skb, u8 code, int pos,
+ enum skb_drop_reason reason)
{
- icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL);
- kfree_skb(skb);
+ icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb));
+ kfree_skb_reason(skb, reason);
}
/* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH
@@ -627,8 +880,8 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
skb_pull(skb2, nhs);
skb_reset_network_header(skb2);
- rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0,
- skb, 0);
+ rt = rt6_lookup(dev_net_rcu(skb->dev), &ipv6_hdr(skb2)->saddr,
+ NULL, 0, skb, 0);
if (rt && rt->dst.dev)
skb2->dev = rt->dst.dev;
@@ -650,10 +903,10 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
}
if (type == ICMP_TIME_EXCEEDED)
icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
- info, &temp_saddr);
+ info, &temp_saddr, IP6CB(skb2));
else
icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH,
- info, &temp_saddr);
+ info, &temp_saddr, IP6CB(skb2));
if (rt)
ip6_rt_put(rt);
@@ -663,53 +916,71 @@ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type,
}
EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach);
-static void icmpv6_echo_reply(struct sk_buff *skb)
+static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
struct sock *sk;
struct inet6_dev *idev;
struct ipv6_pinfo *np;
const struct in6_addr *saddr = NULL;
struct icmp6hdr *icmph = icmp6_hdr(skb);
+ bool apply_ratelimit = false;
struct icmp6hdr tmp_hdr;
struct flowi6 fl6;
struct icmpv6_msg msg;
struct dst_entry *dst;
struct ipcm6_cookie ipc6;
u32 mark = IP6_REPLY_MARK(net, skb->mark);
+ SKB_DR(reason);
+ bool acast;
+ u8 type;
+
+ if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) &&
+ net->ipv6.sysctl.icmpv6_echo_ignore_multicast)
+ return reason;
saddr = &ipv6_hdr(skb)->daddr;
+ acast = ipv6_anycast_destination(skb_dst(skb), saddr);
+ if (acast && net->ipv6.sysctl.icmpv6_echo_ignore_anycast)
+ return reason;
+
if (!ipv6_unicast_destination(skb) &&
- !(net->ipv6.sysctl.anycast_src_echo_reply &&
- ipv6_anycast_destination(skb_dst(skb), saddr)))
+ !(net->ipv6.sysctl.anycast_src_echo_reply && acast))
saddr = NULL;
+ if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST)
+ type = ICMPV6_EXT_ECHO_REPLY;
+ else
+ type = ICMPV6_ECHO_REPLY;
+
memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr));
- tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY;
+ tmp_hdr.icmp6_type = type;
memset(&fl6, 0, sizeof(fl6));
+ if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES)
+ fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb));
+
fl6.flowi6_proto = IPPROTO_ICMPV6;
fl6.daddr = ipv6_hdr(skb)->saddr;
if (saddr)
fl6.saddr = *saddr;
fl6.flowi6_oif = icmp6_iif(skb);
- fl6.fl6_icmp_type = ICMPV6_ECHO_REPLY;
+ fl6.fl6_icmp_type = type;
fl6.flowi6_mark = mark;
fl6.flowi6_uid = sock_net_uid(net, NULL);
- security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
local_bh_disable();
sk = icmpv6_xmit_lock(net);
if (!sk)
goto out_bh_enable;
- sk->sk_mark = mark;
np = inet6_sk(sk);
if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
- fl6.flowi6_oif = np->mcast_oif;
+ fl6.flowi6_oif = READ_ONCE(np->mcast_oif);
else if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->ucast_oif;
+ fl6.flowi6_oif = READ_ONCE(np->ucast_oif);
if (ip6_dst_lookup(net, sk, &dst, &fl6))
goto out;
@@ -717,57 +988,80 @@ static void icmpv6_echo_reply(struct sk_buff *skb)
if (IS_ERR(dst))
goto out;
+ /* Check the ratelimit */
+ if ((!(skb->dev->flags & IFF_LOOPBACK) &&
+ !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY, &apply_ratelimit)) ||
+ !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6, apply_ratelimit))
+ goto out_dst_release;
+
idev = __in6_dev_get(skb->dev);
msg.skb = skb;
msg.offset = 0;
- msg.type = ICMPV6_ECHO_REPLY;
+ msg.type = type;
- ipcm6_init_sk(&ipc6, np);
+ ipcm6_init_sk(&ipc6, sk);
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb));
+ ipc6.sockc.mark = mark;
+
+ if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST)
+ if (!icmp_build_probe(skb, (struct icmphdr *)&tmp_hdr))
+ goto out_dst_release;
if (ip6_append_data(sk, icmpv6_getfrag, &msg,
skb->len + sizeof(struct icmp6hdr),
sizeof(struct icmp6hdr), &ipc6, &fl6,
- (struct rt6_info *)dst, MSG_DONTWAIT)) {
+ dst_rt6_info(dst), MSG_DONTWAIT)) {
__ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS);
ip6_flush_pending_frames(sk);
} else {
icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr,
skb->len + sizeof(struct icmp6hdr));
+ reason = SKB_CONSUMED;
}
+out_dst_release:
dst_release(dst);
out:
icmpv6_xmit_unlock(sk);
out_bh_enable:
local_bh_enable();
+ return reason;
}
-void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
+enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type,
+ u8 code, __be32 info)
{
+ struct inet6_skb_parm *opt = IP6CB(skb);
+ struct net *net = dev_net_rcu(skb->dev);
const struct inet6_protocol *ipprot;
+ enum skb_drop_reason reason;
int inner_offset;
__be16 frag_off;
u8 nexthdr;
- struct net *net = dev_net(skb->dev);
- if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ reason = pskb_may_pull_reason(skb, sizeof(struct ipv6hdr));
+ if (reason != SKB_NOT_DROPPED_YET)
goto out;
+ seg6_icmp_srh(skb, opt);
+
nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr;
if (ipv6_ext_hdr(nexthdr)) {
/* now skip over extension headers */
inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
&nexthdr, &frag_off);
- if (inner_offset < 0)
+ if (inner_offset < 0) {
+ SKB_DR_SET(reason, IPV6_BAD_EXTHDR);
goto out;
+ }
} else {
inner_offset = sizeof(struct ipv6hdr);
}
/* Checkin header including 8 bytes of inner protocol header. */
- if (!pskb_may_pull(skb, inner_offset+8))
+ reason = pskb_may_pull_reason(skb, inner_offset + 8);
+ if (reason != SKB_NOT_DROPPED_YET)
goto out;
/* BUGGG_FUTURE: we should try to parse exthdrs in this packet.
@@ -779,13 +1073,14 @@ void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info)
ipprot = rcu_dereference(inet6_protos[nexthdr]);
if (ipprot && ipprot->err_handler)
- ipprot->err_handler(skb, NULL, type, code, inner_offset, info);
+ ipprot->err_handler(skb, opt, type, code, inner_offset, info);
raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info);
- return;
+ return SKB_CONSUMED;
out:
__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
+ return reason;
}
/*
@@ -794,21 +1089,23 @@ out:
static int icmpv6_rcv(struct sk_buff *skb)
{
- struct net *net = dev_net(skb->dev);
- struct net_device *dev = skb->dev;
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ struct net *net = dev_net_rcu(skb->dev);
+ struct net_device *dev = icmp6_dev(skb);
struct inet6_dev *idev = __in6_dev_get(dev);
const struct in6_addr *saddr, *daddr;
struct icmp6hdr *hdr;
u8 type;
- bool success = false;
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
struct sec_path *sp = skb_sec_path(skb);
int nh;
if (!(sp && sp->xvec[sp->len - 1]->props.flags &
- XFRM_STATE_ICMP))
+ XFRM_STATE_ICMP)) {
+ reason = SKB_DROP_REASON_XFRM_POLICY;
goto drop_no_count;
+ }
if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(struct ipv6hdr)))
goto drop_no_count;
@@ -816,13 +1113,16 @@ static int icmpv6_rcv(struct sk_buff *skb)
nh = skb_network_offset(skb);
skb_set_network_header(skb, sizeof(*hdr));
- if (!xfrm6_policy_check_reverse(NULL, XFRM_POLICY_IN, skb))
+ if (!xfrm6_policy_check_reverse(NULL, XFRM_POLICY_IN,
+ skb)) {
+ reason = SKB_DROP_REASON_XFRM_POLICY;
goto drop_no_count;
+ }
skb_set_network_header(skb, nh);
}
- __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INMSGS);
+ __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INMSGS);
saddr = &ipv6_hdr(skb)->saddr;
daddr = &ipv6_hdr(skb)->daddr;
@@ -840,17 +1140,23 @@ static int icmpv6_rcv(struct sk_buff *skb)
type = hdr->icmp6_type;
- ICMP6MSGIN_INC_STATS(dev_net(dev), idev, type);
+ ICMP6MSGIN_INC_STATS(dev_net_rcu(dev), idev, type);
switch (type) {
case ICMPV6_ECHO_REQUEST:
if (!net->ipv6.sysctl.icmpv6_echo_ignore_all)
- icmpv6_echo_reply(skb);
+ reason = icmpv6_echo_reply(skb);
+ break;
+ case ICMPV6_EXT_ECHO_REQUEST:
+ if (!net->ipv6.sysctl.icmpv6_echo_ignore_all &&
+ READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe))
+ reason = icmpv6_echo_reply(skb);
break;
case ICMPV6_ECHO_REPLY:
- success = ping_rcv(skb);
- break;
+ case ICMPV6_EXT_ECHO_REPLY:
+ ping_rcv(skb);
+ return 0;
case ICMPV6_PKT_TOOBIG:
/* BUGGG_FUTURE: if packet contains rthdr, we cannot update
@@ -863,11 +1169,12 @@ static int icmpv6_rcv(struct sk_buff *skb)
hdr = icmp6_hdr(skb);
/* to notify */
- /* fall through */
+ fallthrough;
case ICMPV6_DEST_UNREACH:
case ICMPV6_TIME_EXCEED:
case ICMPV6_PARAMPROB:
- icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu);
+ reason = icmpv6_notify(skb, type, hdr->icmp6_code,
+ hdr->icmp6_mtu);
break;
case NDISC_ROUTER_SOLICITATION:
@@ -875,16 +1182,16 @@ static int icmpv6_rcv(struct sk_buff *skb)
case NDISC_NEIGHBOUR_SOLICITATION:
case NDISC_NEIGHBOUR_ADVERTISEMENT:
case NDISC_REDIRECT:
- ndisc_rcv(skb);
+ reason = ndisc_rcv(skb);
break;
case ICMPV6_MGM_QUERY:
igmp6_event_query(skb);
- break;
+ return 0;
case ICMPV6_MGM_REPORT:
igmp6_event_report(skb);
- break;
+ return 0;
case ICMPV6_MGM_REDUCTION:
case ICMPV6_NI_QUERY:
@@ -909,33 +1216,33 @@ static int icmpv6_rcv(struct sk_buff *skb)
* must pass to upper level
*/
- icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu);
+ reason = icmpv6_notify(skb, type, hdr->icmp6_code,
+ hdr->icmp6_mtu);
}
/* until the v6 path can be better sorted assume failure and
* preserve the status quo behaviour for the rest of the paths to here
*/
- if (success)
- consume_skb(skb);
+ if (reason)
+ kfree_skb_reason(skb, reason);
else
- kfree_skb(skb);
+ consume_skb(skb);
return 0;
csum_error:
- __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS);
+ reason = SKB_DROP_REASON_ICMP_CSUM;
+ __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_CSUMERRORS);
discard_it:
- __ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS);
+ __ICMP6_INC_STATS(dev_net_rcu(dev), idev, ICMP6_MIB_INERRORS);
drop_no_count:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return 0;
}
-void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6,
- u8 type,
+void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type,
const struct in6_addr *saddr,
- const struct in6_addr *daddr,
- int oif)
+ const struct in6_addr *daddr, int oif)
{
memset(fl6, 0, sizeof(*fl6));
fl6->saddr = *saddr;
@@ -944,66 +1251,30 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6,
fl6->fl6_icmp_type = type;
fl6->fl6_icmp_code = 0;
fl6->flowi6_oif = oif;
- security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
}
-static int __net_init icmpv6_sk_init(struct net *net)
+int __init icmpv6_init(void)
{
struct sock *sk;
- int err, i, j;
-
- net->ipv6.icmp_sk =
- kcalloc(nr_cpu_ids, sizeof(struct sock *), GFP_KERNEL);
- if (!net->ipv6.icmp_sk)
- return -ENOMEM;
+ int err, i;
for_each_possible_cpu(i) {
err = inet_ctl_sock_create(&sk, PF_INET6,
- SOCK_RAW, IPPROTO_ICMPV6, net);
+ SOCK_RAW, IPPROTO_ICMPV6, &init_net);
if (err < 0) {
pr_err("Failed to initialize the ICMP6 control socket (err %d)\n",
err);
- goto fail;
+ return err;
}
- net->ipv6.icmp_sk[i] = sk;
+ per_cpu(ipv6_icmp_sk, i) = sk;
/* Enough space for 2 64K ICMP packets, including
* sk_buff struct overhead.
*/
sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
}
- return 0;
-
- fail:
- for (j = 0; j < i; j++)
- inet_ctl_sock_destroy(net->ipv6.icmp_sk[j]);
- kfree(net->ipv6.icmp_sk);
- return err;
-}
-
-static void __net_exit icmpv6_sk_exit(struct net *net)
-{
- int i;
-
- for_each_possible_cpu(i) {
- inet_ctl_sock_destroy(net->ipv6.icmp_sk[i]);
- }
- kfree(net->ipv6.icmp_sk);
-}
-
-static struct pernet_operations icmpv6_sk_ops = {
- .init = icmpv6_sk_init,
- .exit = icmpv6_sk_exit,
-};
-
-int __init icmpv6_init(void)
-{
- int err;
-
- err = register_pernet_subsys(&icmpv6_sk_ops);
- if (err < 0)
- return err;
err = -EAGAIN;
if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0)
@@ -1018,14 +1289,12 @@ sender_reg_err:
inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
fail:
pr_err("Failed to register ICMP6 protocol\n");
- unregister_pernet_subsys(&icmpv6_sk_ops);
return err;
}
void icmpv6_cleanup(void)
{
inet6_unregister_icmp_sender(icmp6_send);
- unregister_pernet_subsys(&icmpv6_sk_ops);
inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
}
@@ -1098,6 +1367,10 @@ int icmpv6_err_convert(u8 type, u8 code, int *err)
EXPORT_SYMBOL(icmpv6_err_convert);
#ifdef CONFIG_SYSCTL
+
+static u32 icmpv6_errors_extension_mask_all =
+ GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0);
+
static struct ctl_table ipv6_icmp_table_template[] = {
{
.procname = "ratelimit",
@@ -1109,11 +1382,49 @@ static struct ctl_table ipv6_icmp_table_template[] = {
{
.procname = "echo_ignore_all",
.data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_all,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "echo_ignore_multicast",
+ .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_multicast,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "echo_ignore_anycast",
+ .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_anycast,
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
+ .procname = "ratemask",
+ .data = &init_net.ipv6.sysctl.icmpv6_ratemask_ptr,
+ .maxlen = ICMPV6_MSG_MAX + 1,
+ .mode = 0644,
+ .proc_handler = proc_do_large_bitmap,
+ },
+ {
+ .procname = "error_anycast_as_unicast",
+ .data = &init_net.ipv6.sysctl.icmpv6_error_anycast_as_unicast,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
+ {
+ .procname = "errors_extension_mask",
+ .data = &init_net.ipv6.sysctl.icmpv6_errors_extension_mask,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &icmpv6_errors_extension_mask_all,
},
- { },
};
struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
@@ -1127,7 +1438,17 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
if (table) {
table[0].data = &net->ipv6.sysctl.icmpv6_time;
table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all;
+ table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast;
+ table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast;
+ table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr;
+ table[5].data = &net->ipv6.sysctl.icmpv6_error_anycast_as_unicast;
+ table[6].data = &net->ipv6.sysctl.icmpv6_errors_extension_mask;
}
return table;
}
+
+size_t ipv6_icmp_sysctl_table_size(void)
+{
+ return ARRAY_SIZE(ipv6_icmp_table_template);
+}
#endif
diff --git a/net/ipv6/ila/Makefile b/net/ipv6/ila/Makefile
index b7739aba6e68..1bc88ed7edc5 100644
--- a/net/ipv6/ila/Makefile
+++ b/net/ipv6/ila/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for ILA module
#
diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h
index 1f747bcbec29..85b92917849b 100644
--- a/net/ipv6/ila/ila.h
+++ b/net/ipv6/ila/ila.h
@@ -1,11 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Copyright (c) 2015 Tom Herbert <tom@herbertland.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of
- * the License, or (at your option) any later version.
- *
*/
#ifndef __ILA_H
@@ -73,11 +68,6 @@ static inline struct ila_addr *ila_a2i(struct in6_addr *addr)
return (struct ila_addr *)addr;
}
-static inline bool ila_addr_is_ila(struct ila_addr *iaddr)
-{
- return (iaddr->ident.type != ILA_ATYPE_IID);
-}
-
struct ila_params {
struct ila_locator locator;
struct ila_locator locator_match;
@@ -118,6 +108,7 @@ int ila_lwt_init(void);
void ila_lwt_fini(void);
int ila_xlat_init_net(struct net *net);
+void ila_xlat_pre_exit_net(struct net *net);
void ila_xlat_exit_net(struct net *net);
int ila_xlat_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
index 95e9146918cc..b8d43ed4689d 100644
--- a/net/ipv6/ila/ila_common.c
+++ b/net/ipv6/ila/ila_common.c
@@ -86,7 +86,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb,
diff = get_csum_diff(ip6h, p);
inet_proto_csum_replace_by_diff(&th->check, skb,
- diff, true);
+ diff, true, true);
}
break;
case NEXTHDR_UDP:
@@ -97,7 +97,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb,
if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
diff = get_csum_diff(ip6h, p);
inet_proto_csum_replace_by_diff(&uh->check, skb,
- diff, true);
+ diff, true, true);
if (!uh->check)
uh->check = CSUM_MANGLED_0;
}
@@ -111,7 +111,7 @@ static void ila_csum_adjust_transport(struct sk_buff *skb,
diff = get_csum_diff(ip6h, p);
inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb,
- diff, true);
+ diff, true, true);
}
break;
}
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index 3d56a2fb6f86..7bb9edc5c28c 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -38,7 +38,7 @@ static inline struct ila_params *ila_params_lwtunnel(
static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *orig_dst = skb_dst(skb);
- struct rt6_info *rt = (struct rt6_info *)orig_dst;
+ struct rt6_info *rt = dst_rt6_info(orig_dst);
struct ila_lwt *ilwt = ila_lwt_lwtunnel(orig_dst->lwtstate);
struct dst_entry *dst;
int err = -EINVAL;
@@ -58,7 +58,9 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
return orig_dst->lwtstate->orig_output(net, sk, skb);
}
+ local_bh_disable();
dst = dst_cache_get(&ilwt->dst_cache);
+ local_bh_enable();
if (unlikely(!dst)) {
struct ipv6hdr *ip6h = ipv6_hdr(skb);
struct flowi6 fl6;
@@ -68,9 +70,9 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
*/
memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_oif = orig_dst->dev->ifindex;
+ fl6.flowi6_oif = dst_dev(orig_dst)->ifindex;
fl6.flowi6_iif = LOOPBACK_IFINDEX;
- fl6.daddr = *rt6_nexthop((struct rt6_info *)orig_dst,
+ fl6.daddr = *rt6_nexthop(dst_rt6_info(orig_dst),
&ip6h->daddr);
dst = ip6_route_output(net, NULL, &fl6);
@@ -86,10 +88,15 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
goto drop;
}
- if (ilwt->connected)
+ /* cache only if we don't create a dst reference loop */
+ if (ilwt->connected && orig_dst->lwtstate != dst->lwtstate) {
+ local_bh_disable();
dst_cache_set_ip6(&ilwt->dst_cache, dst, &fl6.saddr);
+ local_bh_enable();
+ }
}
+ skb_dst_drop(skb);
skb_dst_set(skb, dst);
return dst_output(net, sk, skb);
@@ -125,7 +132,7 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
[ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, },
};
-static int ila_build_state(struct nlattr *nla,
+static int ila_build_state(struct net *net, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
@@ -146,7 +153,8 @@ static int ila_build_state(struct nlattr *nla,
if (family != AF_INET6)
return -EINVAL;
- ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack);
+ ret = nla_parse_nested_deprecated(tb, ILA_ATTR_MAX, nla,
+ ila_nl_policy, extack);
if (ret < 0)
return ret;
diff --git a/net/ipv6/ila/ila_main.c b/net/ipv6/ila/ila_main.c
index 18fac76b9520..976c78efbae1 100644
--- a/net/ipv6/ila/ila_main.c
+++ b/net/ipv6/ila/ila_main.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
#include <net/genetlink.h>
-#include <net/ila.h>
#include <net/netns/generic.h>
#include <uapi/linux/genetlink.h>
#include "ila.h"
@@ -16,29 +15,29 @@ static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
static const struct genl_ops ila_nl_ops[] = {
{
.cmd = ILA_CMD_ADD,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = ila_xlat_nl_cmd_add_mapping,
- .policy = ila_nl_policy,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = ILA_CMD_DEL,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = ila_xlat_nl_cmd_del_mapping,
- .policy = ila_nl_policy,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = ILA_CMD_FLUSH,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = ila_xlat_nl_cmd_flush,
- .policy = ila_nl_policy,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = ILA_CMD_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = ila_xlat_nl_cmd_get_mapping,
.start = ila_xlat_nl_dump_start,
.dumpit = ila_xlat_nl_dump,
.done = ila_xlat_nl_dump_done,
- .policy = ila_nl_policy,
},
};
@@ -49,11 +48,13 @@ struct genl_family ila_nl_family __ro_after_init = {
.name = ILA_GENL_NAME,
.version = ILA_GENL_VERSION,
.maxattr = ILA_ATTR_MAX,
+ .policy = ila_nl_policy,
.netnsok = true,
.parallel_ops = true,
.module = THIS_MODULE,
.ops = ila_nl_ops,
.n_ops = ARRAY_SIZE(ila_nl_ops),
+ .resv_start_op = ILA_CMD_FLUSH + 1,
};
static __net_init int ila_init_net(struct net *net)
@@ -70,6 +71,11 @@ ila_xlat_init_fail:
return err;
}
+static __net_exit void ila_pre_exit_net(struct net *net)
+{
+ ila_xlat_pre_exit_net(net);
+}
+
static __net_exit void ila_exit_net(struct net *net)
{
ila_xlat_exit_net(net);
@@ -77,6 +83,7 @@ static __net_exit void ila_exit_net(struct net *net)
static struct pernet_operations ila_net_ops = {
.init = ila_init_net,
+ .pre_exit = ila_pre_exit_net,
.exit = ila_exit_net,
.id = &ila_net_id,
.size = sizeof(struct ila_net),
@@ -119,3 +126,4 @@ module_init(ila_init);
module_exit(ila_fini);
MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IPv6: Identifier Locator Addressing (ILA)");
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 17c455ff69ff..1d41b2ab4884 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -5,7 +5,6 @@
#include <linux/rhashtable.h>
#include <linux/vmalloc.h>
#include <net/genetlink.h>
-#include <net/ila.h>
#include <net/netns/generic.h>
#include <uapi/linux/genetlink.h>
#include "ila.h"
@@ -106,16 +105,11 @@ static int parse_nl_config(struct genl_info *info,
xp->ip.locator_match.v64 = (__force __be64)nla_get_u64(
info->attrs[ILA_ATTR_LOCATOR_MATCH]);
- if (info->attrs[ILA_ATTR_CSUM_MODE])
- xp->ip.csum_mode = nla_get_u8(info->attrs[ILA_ATTR_CSUM_MODE]);
- else
- xp->ip.csum_mode = ILA_CSUM_NO_ACTION;
+ xp->ip.csum_mode = nla_get_u8_default(info->attrs[ILA_ATTR_CSUM_MODE],
+ ILA_CSUM_NO_ACTION);
- if (info->attrs[ILA_ATTR_IDENT_TYPE])
- xp->ip.ident_type = nla_get_u8(
- info->attrs[ILA_ATTR_IDENT_TYPE]);
- else
- xp->ip.ident_type = ILA_ATYPE_USE_FORMAT;
+ xp->ip.ident_type = nla_get_u8_default(info->attrs[ILA_ATTR_IDENT_TYPE],
+ ILA_ATYPE_USE_FORMAT);
if (info->attrs[ILA_ATTR_IFINDEX])
xp->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]);
@@ -201,6 +195,8 @@ static const struct nf_hook_ops ila_nf_hook_ops[] = {
},
};
+static DEFINE_MUTEX(ila_mutex);
+
static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
{
struct ila_net *ilan = net_generic(net, ila_net_id);
@@ -208,16 +204,20 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp)
spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match);
int err = 0, order;
- if (!ilan->xlat.hooks_registered) {
+ if (!READ_ONCE(ilan->xlat.hooks_registered)) {
/* We defer registering net hooks in the namespace until the
* first mapping is added.
*/
- err = nf_register_net_hooks(net, ila_nf_hook_ops,
- ARRAY_SIZE(ila_nf_hook_ops));
+ mutex_lock(&ila_mutex);
+ if (!ilan->xlat.hooks_registered) {
+ err = nf_register_net_hooks(net, ila_nf_hook_ops,
+ ARRAY_SIZE(ila_nf_hook_ops));
+ if (!err)
+ WRITE_ONCE(ilan->xlat.hooks_registered, true);
+ }
+ mutex_unlock(&ila_mutex);
if (err)
return err;
-
- ilan->xlat.hooks_registered = true;
}
ila = kzalloc(sizeof(*ila), GFP_KERNEL);
@@ -383,12 +383,9 @@ int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info)
struct rhashtable_iter iter;
struct ila_map *ila;
spinlock_t *lock;
- int ret;
-
- ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter, GFP_KERNEL);
- if (ret)
- goto done;
+ int ret = 0;
+ rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter);
rhashtable_walk_start(&iter);
for (;;) {
@@ -420,6 +417,7 @@ int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info)
done:
rhashtable_walk_stop(&iter);
+ rhashtable_walk_exit(&iter);
return ret;
}
@@ -479,6 +477,7 @@ int ila_xlat_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info)
rcu_read_lock();
+ ret = -ESRCH;
ila = ila_lookup_by_params(&xp, ilan);
if (ila) {
ret = ila_dump_info(ila,
@@ -509,23 +508,17 @@ int ila_xlat_nl_dump_start(struct netlink_callback *cb)
struct net *net = sock_net(cb->skb->sk);
struct ila_net *ilan = net_generic(net, ila_net_id);
struct ila_dump_iter *iter;
- int ret;
iter = kmalloc(sizeof(*iter), GFP_KERNEL);
if (!iter)
return -ENOMEM;
- ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter->rhiter,
- GFP_KERNEL);
- if (ret) {
- kfree(iter);
- return ret;
- }
+ rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter->rhiter);
iter->skip = 0;
cb->args[0] = (long)iter;
- return ret;
+ return 0;
}
int ila_xlat_nl_dump_done(struct netlink_callback *cb)
@@ -609,8 +602,6 @@ out_ret:
return ret;
}
-#define ILA_HASH_TABLE_SIZE 1024
-
int ila_xlat_init_net(struct net *net)
{
struct ila_net *ilan = net_generic(net, ila_net_id);
@@ -620,11 +611,24 @@ int ila_xlat_init_net(struct net *net)
if (err)
return err;
- rhashtable_init(&ilan->xlat.rhash_table, &rht_params);
+ err = rhashtable_init(&ilan->xlat.rhash_table, &rht_params);
+ if (err) {
+ free_bucket_spinlocks(ilan->xlat.locks);
+ return err;
+ }
return 0;
}
+void ila_xlat_pre_exit_net(struct net *net)
+{
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+
+ if (ilan->xlat.hooks_registered)
+ nf_unregister_net_hooks(net, ila_nf_hook_ops,
+ ARRAY_SIZE(ila_nf_hook_ops));
+}
+
void ila_xlat_exit_net(struct net *net)
{
struct ila_net *ilan = net_generic(net, ila_net_id);
@@ -632,10 +636,6 @@ void ila_xlat_exit_net(struct net *net)
rhashtable_free_and_destroy(&ilan->xlat.rhash_table, ila_free_cb, NULL);
free_bucket_spinlocks(ilan->xlat.locks);
-
- if (ilan->xlat.hooks_registered)
- nf_unregister_net_hooks(net, ila_nf_hook_ops,
- ARRAY_SIZE(ila_nf_hook_ops));
}
static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila)
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 9a31d13bf180..ea5cf3fdfdd6 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -6,11 +7,6 @@
* Support for INET6 connection oriented protocols.
*
* Authors: See the TCPv6 sources
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or(at your option) any later version.
*/
#include <linux/module.h>
@@ -49,30 +45,15 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
fl6->flowi6_mark = ireq->ir_mark;
fl6->fl6_dport = ireq->ir_rmt_port;
fl6->fl6_sport = htons(ireq->ir_num);
- fl6->flowi6_uid = sk->sk_uid;
- security_req_classify_flow(req, flowi6_to_flowi(fl6));
+ fl6->flowi6_uid = sk_uid(sk);
+ security_req_classify_flow(req, flowi6_to_flowi_common(fl6));
- dst = ip6_dst_lookup_flow(sk, fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
if (IS_ERR(dst))
return NULL;
return dst;
}
-EXPORT_SYMBOL(inet6_csk_route_req);
-
-void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
-{
- struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
-
- sin6->sin6_family = AF_INET6;
- sin6->sin6_addr = sk->sk_v6_daddr;
- sin6->sin6_port = inet_sk(sk)->inet_dport;
- /* We do not store received flowlabel for TCP */
- sin6->sin6_flowinfo = 0;
- sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr,
- sk->sk_bound_dev_if);
-}
-EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);
static inline
struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie)
@@ -98,8 +79,8 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
fl6->flowi6_mark = sk->sk_mark;
fl6->fl6_sport = inet->inet_sport;
fl6->fl6_dport = inet->inet_dport;
- fl6->flowi6_uid = sk->sk_uid;
- security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
+ fl6->flowi6_uid = sk_uid(sk);
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
rcu_read_lock();
final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
@@ -107,10 +88,10 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
dst = __inet6_csk_dst_check(sk, np->dst_cookie);
if (!dst) {
- dst = ip6_dst_lookup_flow(sk, fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
if (!IS_ERR(dst))
- ip6_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, false, false);
}
return dst;
}
@@ -124,7 +105,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused
dst = inet6_csk_route_socket(sk, &fl6);
if (IS_ERR(dst)) {
- sk->sk_err_soft = -PTR_ERR(dst);
+ WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst));
sk->sk_route_caps = 0;
kfree_skb(skb);
return PTR_ERR(dst);
@@ -137,7 +118,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused
fl6.daddr = sk->sk_v6_daddr;
res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
- np->tclass);
+ np->tclass, READ_ONCE(sk->sk_priority));
rcu_read_unlock();
return res;
}
@@ -150,9 +131,8 @@ struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu)
if (IS_ERR(dst))
return NULL;
- dst->ops->update_pmtu(dst, sk, NULL, mtu);
+ dst->ops->update_pmtu(dst, sk, NULL, mtu, true);
dst = inet6_csk_route_socket(sk, &fl6);
return IS_ERR(dst) ? NULL : dst;
}
-EXPORT_SYMBOL_GPL(inet6_csk_update_pmtu);
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 3d7c7460a0c5..5e1da088d8e1 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -7,42 +8,37 @@
*
* Authors: Lotsa people, from code originally in tcp, generalised here
* by Arnaldo Carvalho de Melo <acme@mandriva.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
#include <linux/random.h>
#include <net/addrconf.h>
+#include <net/hotdata.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>
#include <net/sock_reuseport.h>
+#include <net/tcp.h>
u32 inet6_ehashfn(const struct net *net,
const struct in6_addr *laddr, const u16 lport,
const struct in6_addr *faddr, const __be16 fport)
{
- static u32 inet6_ehash_secret __read_mostly;
- static u32 ipv6_hash_secret __read_mostly;
-
u32 lhash, fhash;
net_get_random_once(&inet6_ehash_secret, sizeof(inet6_ehash_secret));
- net_get_random_once(&ipv6_hash_secret, sizeof(ipv6_hash_secret));
+ net_get_random_once(&tcp_ipv6_hash_secret, sizeof(tcp_ipv6_hash_secret));
lhash = (__force u32)laddr->s6_addr32[3];
- fhash = __ipv6_addr_jhash(faddr, ipv6_hash_secret);
+ fhash = __ipv6_addr_jhash(faddr, tcp_ipv6_hash_secret);
- return __inet6_ehashfn(lhash, lport, fhash, fport,
- inet6_ehash_secret + net_hash_mix(net));
+ return lport + __inet6_ehashfn(lhash, 0, fhash, fport,
+ inet6_ehash_secret + net_hash_mix(net));
}
+EXPORT_SYMBOL_GPL(inet6_ehashfn);
/*
* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
@@ -50,35 +46,34 @@ u32 inet6_ehashfn(const struct net *net,
*
* The sockhash lock must be held as a reader here.
*/
-struct sock *__inet6_lookup_established(struct net *net,
- struct inet_hashinfo *hashinfo,
- const struct in6_addr *saddr,
- const __be16 sport,
- const struct in6_addr *daddr,
- const u16 hnum,
- const int dif, const int sdif)
+struct sock *__inet6_lookup_established(const struct net *net,
+ const struct in6_addr *saddr,
+ const __be16 sport,
+ const struct in6_addr *daddr,
+ const u16 hnum,
+ const int dif, const int sdif)
{
- struct sock *sk;
- const struct hlist_nulls_node *node;
const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
- /* Optimize here for direct hit, only listening connections can
- * have wildcards anyways.
- */
- unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
- unsigned int slot = hash & hashinfo->ehash_mask;
- struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
-
+ const struct hlist_nulls_node *node;
+ struct inet_ehash_bucket *head;
+ struct inet_hashinfo *hashinfo;
+ unsigned int hash, slot;
+ struct sock *sk;
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
+ hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
+ slot = hash & hashinfo->ehash_mask;
+ head = &hashinfo->ehash[slot];
begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
if (sk->sk_hash != hash)
continue;
- if (!INET6_MATCH(sk, net, saddr, daddr, ports, dif, sdif))
+ if (!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))
continue;
if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
goto out;
- if (unlikely(!INET6_MATCH(sk, net, saddr, daddr, ports, dif, sdif))) {
+ if (unlikely(!inet6_match(net, sk, saddr, daddr, ports, dif, sdif))) {
sock_gen_put(sk);
goto begin;
}
@@ -93,64 +88,83 @@ found:
}
EXPORT_SYMBOL(__inet6_lookup_established);
-static inline int compute_score(struct sock *sk, struct net *net,
+static inline int compute_score(struct sock *sk, const struct net *net,
const unsigned short hnum,
const struct in6_addr *daddr,
- const int dif, const int sdif, bool exact_dif)
+ const int dif, const int sdif)
{
int score = -1;
if (net_eq(sock_net(sk), net) && inet_sk(sk)->inet_num == hnum &&
sk->sk_family == PF_INET6) {
+ if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
+ return -1;
- score = 1;
- if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
- if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
- return -1;
- score++;
- }
- if (sk->sk_bound_dev_if || exact_dif) {
- bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
- if (!dev_match)
- return -1;
- if (sk->sk_bound_dev_if)
- score++;
- }
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
+ return -1;
+
+ score = sk->sk_bound_dev_if ? 2 : 1;
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
score++;
}
return score;
}
+/**
+ * inet6_lookup_reuseport() - execute reuseport logic on AF_INET6 socket if necessary.
+ * @net: network namespace.
+ * @sk: AF_INET6 socket, must be in TCP_LISTEN state for TCP or TCP_CLOSE for UDP.
+ * @skb: context for a potential SK_REUSEPORT program.
+ * @doff: header offset.
+ * @saddr: source address.
+ * @sport: source port.
+ * @daddr: destination address.
+ * @hnum: destination port in host byte order.
+ * @ehashfn: hash function used to generate the fallback hash.
+ *
+ * Return: NULL if sk doesn't have SO_REUSEPORT set, otherwise a pointer to
+ * the selected sock or an error.
+ */
+struct sock *inet6_lookup_reuseport(const struct net *net, struct sock *sk,
+ struct sk_buff *skb, int doff,
+ const struct in6_addr *saddr,
+ __be16 sport,
+ const struct in6_addr *daddr,
+ unsigned short hnum,
+ inet6_ehashfn_t *ehashfn)
+{
+ struct sock *reuse_sk = NULL;
+ u32 phash;
+
+ if (sk->sk_reuseport) {
+ phash = INDIRECT_CALL_INET(ehashfn, udp6_ehashfn, inet6_ehashfn,
+ net, daddr, hnum, saddr, sport);
+ reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
+ }
+ return reuse_sk;
+}
+EXPORT_SYMBOL_GPL(inet6_lookup_reuseport);
+
/* called with rcu_read_lock() */
-static struct sock *inet6_lhash2_lookup(struct net *net,
+static struct sock *inet6_lhash2_lookup(const struct net *net,
struct inet_listen_hashbucket *ilb2,
struct sk_buff *skb, int doff,
const struct in6_addr *saddr,
const __be16 sport, const struct in6_addr *daddr,
const unsigned short hnum, const int dif, const int sdif)
{
- bool exact_dif = inet6_exact_dif_match(net, skb);
- struct inet_connection_sock *icsk;
struct sock *sk, *result = NULL;
+ struct hlist_nulls_node *node;
int score, hiscore = 0;
- u32 phash = 0;
- inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) {
- sk = (struct sock *)icsk;
- score = compute_score(sk, net, hnum, daddr, dif, sdif,
- exact_dif);
+ sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
+ score = compute_score(sk, net, hnum, daddr, dif, sdif);
if (score > hiscore) {
- if (sk->sk_reuseport) {
- phash = inet6_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, phash,
- skb, doff);
- if (result)
- return result;
- }
+ result = inet6_lookup_reuseport(net, sk, skb, doff,
+ saddr, sport, daddr, hnum, inet6_ehashfn);
+ if (result)
+ return result;
+
result = sk;
hiscore = score;
}
@@ -159,33 +173,56 @@ static struct sock *inet6_lhash2_lookup(struct net *net,
return result;
}
-struct sock *inet6_lookup_listener(struct net *net,
- struct inet_hashinfo *hashinfo,
- struct sk_buff *skb, int doff,
- const struct in6_addr *saddr,
- const __be16 sport, const struct in6_addr *daddr,
- const unsigned short hnum, const int dif, const int sdif)
+struct sock *inet6_lookup_run_sk_lookup(const struct net *net,
+					int protocol,
+					struct sk_buff *skb, int doff,
+					const struct in6_addr *saddr,
+					const __be16 sport,
+					const struct in6_addr *daddr,
+					const u16 hnum, const int dif,
+					inet6_ehashfn_t *ehashfn)
+{
+	struct sock *picked, *sk;
+
+	if (bpf_sk_lookup_run_v6(net, protocol, saddr, sport,
+				 daddr, hnum, dif, &sk) ||
+	    IS_ERR_OR_NULL(sk))
+		return sk;
+
+	/* A socket chosen by reuseport selection wins over the one in sk. */
+	picked = inet6_lookup_reuseport(net, sk, skb, doff,
+					saddr, sport, daddr, hnum, ehashfn);
+	if (!picked)
+		picked = sk;
+	return picked;
+}
+EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup);
+
+struct sock *inet6_lookup_listener(const struct net *net,
+ struct sk_buff *skb, int doff,
+ const struct in6_addr *saddr,
+ const __be16 sport,
+ const struct in6_addr *daddr,
+ const unsigned short hnum,
+ const int dif, const int sdif)
{
- unsigned int hash = inet_lhashfn(net, hnum);
- struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
- bool exact_dif = inet6_exact_dif_match(net, skb);
struct inet_listen_hashbucket *ilb2;
- struct sock *sk, *result = NULL;
- int score, hiscore = 0;
+ struct inet_hashinfo *hashinfo;
+ struct sock *result = NULL;
unsigned int hash2;
- u32 phash = 0;
- if (ilb->count <= 10 || !hashinfo->lhash2)
- goto port_lookup;
-
- /* Too many sk in the ilb bucket (which is hashed by port alone).
- * Try lhash2 (which is hashed by port and addr) instead.
- */
+ /* Lookup redirect from BPF */
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
+ result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
+ saddr, sport, daddr, hnum, dif,
+ inet6_ehashfn);
+ if (result)
+ goto done;
+ }
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
hash2 = ipv6_portaddr_hash(net, daddr, hnum);
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
- if (ilb2->count > ilb->count)
- goto port_lookup;
result = inet6_lhash2_lookup(net, ilb2, skb, doff,
saddr, sport, daddr, hnum,
@@ -194,41 +231,20 @@ struct sock *inet6_lookup_listener(struct net *net,
goto done;
/* Lookup lhash2 with in6addr_any */
-
hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
- if (ilb2->count > ilb->count)
- goto port_lookup;
result = inet6_lhash2_lookup(net, ilb2, skb, doff,
- saddr, sport, daddr, hnum,
+ saddr, sport, &in6addr_any, hnum,
dif, sdif);
- goto done;
-
-port_lookup:
- sk_for_each(sk, &ilb->head) {
- score = compute_score(sk, net, hnum, daddr, dif, sdif, exact_dif);
- if (score > hiscore) {
- if (sk->sk_reuseport) {
- phash = inet6_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, phash,
- skb, doff);
- if (result)
- goto done;
- }
- result = sk;
- hiscore = score;
- }
- }
done:
- if (unlikely(IS_ERR(result)))
+ if (IS_ERR(result))
return NULL;
return result;
}
EXPORT_SYMBOL_GPL(inet6_lookup_listener);
-struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
+struct sock *inet6_lookup(const struct net *net,
struct sk_buff *skb, int doff,
const struct in6_addr *saddr, const __be16 sport,
const struct in6_addr *daddr, const __be16 dport,
@@ -237,7 +253,7 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
struct sock *sk;
bool refcounted;
- sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
+ sk = __inet6_lookup(net, skb, doff, saddr, sport, daddr,
ntohs(dport), dif, 0, &refcounted);
if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
@@ -247,7 +263,9 @@ EXPORT_SYMBOL_GPL(inet6_lookup);
static int __inet6_check_established(struct inet_timewait_death_row *death_row,
struct sock *sk, const __u16 lport,
- struct inet_timewait_sock **twp)
+ struct inet_timewait_sock **twp,
+ bool rcu_lookup,
+ u32 hash)
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
struct inet_sock *inet = inet_sk(sk);
@@ -257,25 +275,37 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
struct net *net = sock_net(sk);
const int sdif = l3mdev_master_ifindex_by_index(net, dif);
const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
- const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr,
- inet->inet_dport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
- spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
- struct sock *sk2;
- const struct hlist_nulls_node *node;
struct inet_timewait_sock *tw = NULL;
+ const struct hlist_nulls_node *node;
+ struct sock *sk2;
+ spinlock_t *lock;
+
+ if (rcu_lookup) {
+ sk_nulls_for_each(sk2, node, &head->chain) {
+ if (sk2->sk_hash != hash ||
+ !inet6_match(net, sk2, saddr, daddr,
+ ports, dif, sdif))
+ continue;
+ if (sk2->sk_state == TCP_TIME_WAIT)
+ break;
+ return -EADDRNOTAVAIL;
+ }
+ return 0;
+ }
+ lock = inet_ehash_lockp(hinfo, hash);
spin_lock(lock);
sk_nulls_for_each(sk2, node, &head->chain) {
if (sk2->sk_hash != hash)
continue;
- if (likely(INET6_MATCH(sk2, net, saddr, daddr, ports,
+ if (likely(inet6_match(net, sk2, saddr, daddr, ports,
dif, sdif))) {
if (sk2->sk_state == TCP_TIME_WAIT) {
tw = inet_twsk(sk2);
- if (twsk_unique(sk, sk2, twp))
+ if (tcp_twsk_unique(sk, sk2, twp))
break;
}
goto not_unique;
@@ -310,7 +340,7 @@ not_unique:
return -EADDRNOTAVAIL;
}
-static u32 inet6_sk_port_offset(const struct sock *sk)
+static u64 inet6_sk_port_offset(const struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
@@ -322,25 +352,19 @@ static u32 inet6_sk_port_offset(const struct sock *sk)
int inet6_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk)
{
- u32 port_offset = 0;
+ const struct in6_addr *daddr = &sk->sk_v6_rcv_saddr;
+ const struct in6_addr *saddr = &sk->sk_v6_daddr;
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct net *net = sock_net(sk);
+ u64 port_offset = 0;
+ u32 hash_port0;
if (!inet_sk(sk)->inet_num)
port_offset = inet6_sk_port_offset(sk);
- return __inet_hash_connect(death_row, sk, port_offset,
- __inet6_check_established);
-}
-EXPORT_SYMBOL_GPL(inet6_hash_connect);
-int inet6_hash(struct sock *sk)
-{
- int err = 0;
+ hash_port0 = inet6_ehashfn(net, daddr, 0, saddr, inet->inet_dport);
- if (sk->sk_state != TCP_CLOSE) {
- local_bh_disable();
- err = __inet_hash(sk, NULL);
- local_bh_enable();
- }
-
- return err;
+ return __inet_hash_connect(death_row, sk, port_offset, hash_port0,
+ __inet6_check_established);
}
-EXPORT_SYMBOL_GPL(inet6_hash);
+EXPORT_SYMBOL_GPL(inet6_hash_connect);
diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c
new file mode 100644
index 000000000000..9553a3200081
--- /dev/null
+++ b/net/ipv6/ioam6.c
@@ -0,0 +1,1040 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * IPv6 IOAM implementation
+ *
+ * Author:
+ * Justin Iurman <justin.iurman@uliege.be>
+ */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/net.h>
+#include <linux/ioam6.h>
+#include <linux/ioam6_genl.h>
+#include <linux/rhashtable.h>
+#include <linux/netdevice.h>
+
+#include <net/addrconf.h>
+#include <net/genetlink.h>
+#include <net/ioam6.h>
+#include <net/sch_generic.h>
+
+static void ioam6_ns_release(struct ioam6_namespace *ns)
+{
+ kfree_rcu(ns, rcu); /* RCU-deferred kfree(), queued on the 'rcu' member */
+}
+
+static void ioam6_sc_release(struct ioam6_schema *sc)
+{
+ kfree_rcu(sc, rcu); /* RCU-deferred kfree(), queued on the 'rcu' member */
+}
+
+/* Per-entry callback for rhashtable_free_and_destroy(); @arg is unused. */
+static void ioam6_free_ns(void *ptr, void *arg)
+{
+	if (!ptr)
+		return;
+	ioam6_ns_release(ptr);
+}
+
+/* Per-entry callback for rhashtable_free_and_destroy(); @arg is unused. */
+static void ioam6_free_sc(void *ptr, void *arg)
+{
+	if (!ptr)
+		return;
+	ioam6_sc_release(ptr);
+}
+
+static int ioam6_ns_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+ const struct ioam6_namespace *ns = obj;
+
+ return (ns->id != *(__be16 *)arg->key); /* 0 on match, non-zero otherwise */
+}
+
+static int ioam6_sc_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
+{
+ const struct ioam6_schema *sc = obj;
+
+ return (sc->id != *(u32 *)arg->key); /* 0 on match, non-zero otherwise */
+}
+
+static const struct rhashtable_params rht_ns_params = {
+ .key_len = sizeof(__be16), /* key: the big-endian namespace id */
+ .key_offset = offsetof(struct ioam6_namespace, id),
+ .head_offset = offsetof(struct ioam6_namespace, head),
+ .automatic_shrinking = true,
+ .obj_cmpfn = ioam6_ns_cmpfn, /* compares ns->id with the lookup key */
+};
+
+static const struct rhashtable_params rht_sc_params = {
+ .key_len = sizeof(u32), /* key: the u32 schema id */
+ .key_offset = offsetof(struct ioam6_schema, id),
+ .head_offset = offsetof(struct ioam6_schema, head),
+ .automatic_shrinking = true,
+ .obj_cmpfn = ioam6_sc_cmpfn, /* compares sc->id with the lookup key */
+};
+
+static struct genl_family ioam6_genl_family;
+
+static const struct nla_policy ioam6_genl_policy_addns[] = { /* IOAM6_CMD_ADD_NAMESPACE */
+ [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 },
+ [IOAM6_ATTR_NS_DATA] = { .type = NLA_U32 },
+ [IOAM6_ATTR_NS_DATA_WIDE] = { .type = NLA_U64 },
+};
+
+static const struct nla_policy ioam6_genl_policy_delns[] = { /* IOAM6_CMD_DEL_NAMESPACE */
+ [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 },
+};
+
+static const struct nla_policy ioam6_genl_policy_addsc[] = { /* IOAM6_CMD_ADD_SCHEMA */
+ [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 },
+ [IOAM6_ATTR_SC_DATA] = { .type = NLA_BINARY,
+ .len = IOAM6_MAX_SCHEMA_DATA_LEN },
+};
+
+static const struct nla_policy ioam6_genl_policy_delsc[] = { /* IOAM6_CMD_DEL_SCHEMA */
+ [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 },
+};
+
+static const struct nla_policy ioam6_genl_policy_ns_sc[] = { /* IOAM6_CMD_NS_SET_SCHEMA */
+ [IOAM6_ATTR_NS_ID] = { .type = NLA_U16 },
+ [IOAM6_ATTR_SC_ID] = { .type = NLA_U32 },
+ [IOAM6_ATTR_SC_NONE] = { .type = NLA_FLAG }, /* flag: detach instead of attach */
+};
+
+/* IOAM6_CMD_ADD_NAMESPACE handler: insert a namespace keyed by its id. */
+static int ioam6_genl_addns(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr **attrs = info->attrs;
+	struct ioam6_pernet_data *nsdata;
+	struct ioam6_namespace *ns;
+	u64 wide;
+	u32 narrow;
+	__be16 id;
+	int err;
+
+	if (!attrs[IOAM6_ATTR_NS_ID])
+		return -EINVAL;
+
+	nsdata = ioam6_pernet(genl_info_net(info));
+	id = cpu_to_be16(nla_get_u16(attrs[IOAM6_ATTR_NS_ID]));
+
+	mutex_lock(&nsdata->lock);
+
+	/* An id already present in this net's table is refused. */
+	err = -EEXIST;
+	if (rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params))
+		goto out_unlock;
+
+	err = -ENOMEM;
+	ns = kzalloc(sizeof(*ns), GFP_KERNEL);
+	if (!ns)
+		goto out_unlock;
+
+	/* Missing data attributes default to the *_UNAVAILABLE markers. */
+	narrow = nla_get_u32_default(attrs[IOAM6_ATTR_NS_DATA],
+				     IOAM6_U32_UNAVAILABLE);
+	wide = nla_get_u64_default(attrs[IOAM6_ATTR_NS_DATA_WIDE],
+				   IOAM6_U64_UNAVAILABLE);
+
+	/* The entry keeps id and data in big-endian form. */
+	ns->id = id;
+	ns->data = cpu_to_be32(narrow);
+	ns->data_wide = cpu_to_be64(wide);
+
+	err = rhashtable_lookup_insert_fast(&nsdata->namespaces, &ns->head,
+					    rht_ns_params);
+	if (err)
+		kfree(ns);
+
+out_unlock:
+	mutex_unlock(&nsdata->lock);
+	return err;
+}
+
+/* IOAM6_CMD_DEL_NAMESPACE handler: unlink and release one namespace. */
+static int ioam6_genl_delns(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ioam6_pernet_data *nsdata;
+	struct ioam6_namespace *ns;
+	struct ioam6_schema *sc;
+	__be16 id;
+	int err;
+
+	if (!info->attrs[IOAM6_ATTR_NS_ID])
+		return -EINVAL;
+
+	nsdata = ioam6_pernet(genl_info_net(info));
+	id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID]));
+
+	mutex_lock(&nsdata->lock);
+
+	err = -ENOENT;
+	ns = rhashtable_lookup_fast(&nsdata->namespaces, &id, rht_ns_params);
+	if (!ns)
+		goto out_unlock;
+
+	/* Read the attached schema before the namespace leaves the table. */
+	sc = rcu_dereference_protected(ns->schema,
+				       lockdep_is_held(&nsdata->lock));
+	err = rhashtable_remove_fast(&nsdata->namespaces, &ns->head,
+				     rht_ns_params);
+	if (err)
+		goto out_unlock;
+
+	/* Clear the schema's back-pointer, then free the namespace via RCU. */
+	if (sc)
+		rcu_assign_pointer(sc->ns, NULL);
+	ioam6_ns_release(ns);
+
+out_unlock:
+	mutex_unlock(&nsdata->lock);
+	return err;
+}
+
+/* Append one @cmd message describing @ns to @skb; 0 or a negative errno. */
+static int __ioam6_genl_dumpns_element(struct ioam6_namespace *ns,
+				       u32 portid, u32 seq, u32 flags,
+				       struct sk_buff *skb, u8 cmd)
+{
+	struct ioam6_schema *sc;
+	u64 wide;
+	u32 narrow;
+	void *hdr;
+	int failed;
+
+	hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd);
+	if (!hdr)
+		return -ENOMEM;
+
+	narrow = be32_to_cpu(ns->data);
+	wide = be64_to_cpu(ns->data_wide);
+
+	if (nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id)))
+		goto nla_put_failure;
+
+	/* Data words still holding the "unavailable" marker are left out. */
+	if (narrow != IOAM6_U32_UNAVAILABLE &&
+	    nla_put_u32(skb, IOAM6_ATTR_NS_DATA, narrow))
+		goto nla_put_failure;
+
+	if (wide != IOAM6_U64_UNAVAILABLE &&
+	    nla_put_u64_64bit(skb, IOAM6_ATTR_NS_DATA_WIDE, wide, IOAM6_ATTR_PAD))
+		goto nla_put_failure;
+
+	rcu_read_lock();
+	sc = rcu_dereference(ns->schema);
+	failed = sc && nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id);
+	rcu_read_unlock();
+	if (failed)
+		goto nla_put_failure;
+
+	genlmsg_end(skb, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int ioam6_genl_dumpns_start(struct netlink_callback *cb)
+{
+ struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk));
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+ if (!iter) { /* no iterator stored in cb->args[0] yet */
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ cb->args[0] = (long)iter; /* read back by the dumpit and done callbacks */
+ }
+
+ rhashtable_walk_enter(&nsdata->namespaces, iter); /* walk the namespace table */
+
+ return 0;
+}
+
+static int ioam6_genl_dumpns_done(struct netlink_callback *cb)
+{
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+ rhashtable_walk_exit(iter); /* pairs with rhashtable_walk_enter() in the start callback */
+ kfree(iter); /* iterator was kmalloc'ed by ioam6_genl_dumpns_start() */
+
+ return 0;
+}
+
+/* Dump ->dumpit: emit one message per namespace returned by the walker.
+ * Returns skb->len once the walk is exhausted, else a negative errno.
+ */
+static int ioam6_genl_dumpns(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+	struct ioam6_namespace *ns;
+	int err;
+
+	rhashtable_walk_start(iter);
+
+	for (;;) {
+		ns = rhashtable_walk_next(iter);
+		if (!ns) {
+			/* Walk exhausted: report how much was written. */
+			err = skb->len;
+			break;
+		}
+
+		if (IS_ERR(ns)) {
+			/* -EAGAIN is not fatal: just keep walking. */
+			if (PTR_ERR(ns) == -EAGAIN)
+				continue;
+			err = PTR_ERR(ns);
+			break;
+		}
+
+		err = __ioam6_genl_dumpns_element(ns, NETLINK_CB(cb->skb).portid,
+						  cb->nlh->nlmsg_seq, NLM_F_MULTI,
+						  skb, IOAM6_CMD_DUMP_NAMESPACES);
+		if (err)
+			break;
+	}
+
+	rhashtable_walk_stop(iter);
+	return err;
+}
+
+/* IOAM6_CMD_ADD_SCHEMA handler: store a new schema, padding its data to a
+ * multiple of four bytes.
+ */
+static int ioam6_genl_addsc(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ioam6_pernet_data *nsdata;
+	int len, len_aligned, err;
+	struct ioam6_schema *sc;
+	u32 id;
+
+	if (!info->attrs[IOAM6_ATTR_SC_ID] || !info->attrs[IOAM6_ATTR_SC_DATA])
+		return -EINVAL;
+
+	nsdata = ioam6_pernet(genl_info_net(info));
+	id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]);
+
+	mutex_lock(&nsdata->lock);
+
+	err = -EEXIST;
+	if (rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params))
+		goto out_unlock;
+
+	len = nla_len(info->attrs[IOAM6_ATTR_SC_DATA]);
+	len_aligned = ALIGN(len, 4);
+
+	/* kzalloc() zeroes the buffer, so the alignment padding reads as 0. */
+	err = -ENOMEM;
+	sc = kzalloc(sizeof(*sc) + len_aligned, GFP_KERNEL);
+	if (!sc)
+		goto out_unlock;
+
+	sc->id = id;
+	sc->len = len_aligned;
+
+	/* hdr: id OR'ed with the length in 4-octet units in the top byte. */
+	sc->hdr = cpu_to_be32(sc->id | ((u8)(sc->len / 4) << 24));
+	nla_memcpy(sc->data, info->attrs[IOAM6_ATTR_SC_DATA], len);
+
+	err = rhashtable_lookup_insert_fast(&nsdata->schemas, &sc->head,
+					    rht_sc_params);
+	if (err)
+		kfree(sc);
+
+out_unlock:
+	mutex_unlock(&nsdata->lock);
+	return err;
+}
+
+/* IOAM6_CMD_DEL_SCHEMA handler: unlink and release one schema. */
+static int ioam6_genl_delsc(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ioam6_pernet_data *nsdata;
+	struct ioam6_namespace *ns;
+	struct ioam6_schema *sc;
+	int err;
+	u32 id;
+
+	if (!info->attrs[IOAM6_ATTR_SC_ID])
+		return -EINVAL;
+
+	nsdata = ioam6_pernet(genl_info_net(info));
+	id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]);
+
+	mutex_lock(&nsdata->lock);
+
+	err = -ENOENT;
+	sc = rhashtable_lookup_fast(&nsdata->schemas, &id, rht_sc_params);
+	if (!sc)
+		goto out_unlock;
+
+	/* Read the owning namespace before the schema leaves the table. */
+	ns = rcu_dereference_protected(sc->ns, lockdep_is_held(&nsdata->lock));
+	err = rhashtable_remove_fast(&nsdata->schemas, &sc->head,
+				     rht_sc_params);
+	if (err)
+		goto out_unlock;
+
+	/* Clear the namespace's schema pointer, then free the schema via RCU. */
+	if (ns)
+		rcu_assign_pointer(ns->schema, NULL);
+	ioam6_sc_release(sc);
+
+out_unlock:
+	mutex_unlock(&nsdata->lock);
+	return err;
+}
+
+/* Append one @cmd message describing @sc to @skb; 0 or a negative errno. */
+static int __ioam6_genl_dumpsc_element(struct ioam6_schema *sc,
+				       u32 portid, u32 seq, u32 flags,
+				       struct sk_buff *skb, u8 cmd)
+{
+	struct ioam6_namespace *ns;
+	void *hdr;
+	int failed;
+
+	hdr = genlmsg_put(skb, portid, seq, &ioam6_genl_family, flags, cmd);
+	if (!hdr)
+		return -ENOMEM;
+
+	if (nla_put_u32(skb, IOAM6_ATTR_SC_ID, sc->id))
+		goto nla_put_failure;
+	if (nla_put(skb, IOAM6_ATTR_SC_DATA, sc->len, sc->data))
+		goto nla_put_failure;
+
+	rcu_read_lock();
+	ns = rcu_dereference(sc->ns);
+	failed = ns && nla_put_u16(skb, IOAM6_ATTR_NS_ID, be16_to_cpu(ns->id));
+	rcu_read_unlock();
+	if (failed)
+		goto nla_put_failure;
+
+	genlmsg_end(skb, hdr);
+	return 0;
+
+nla_put_failure:
+	genlmsg_cancel(skb, hdr);
+	return -EMSGSIZE;
+}
+
+static int ioam6_genl_dumpsc_start(struct netlink_callback *cb)
+{
+ struct ioam6_pernet_data *nsdata = ioam6_pernet(sock_net(cb->skb->sk));
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+ if (!iter) { /* no iterator stored in cb->args[0] yet */
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ cb->args[0] = (long)iter; /* read back by the dumpit and done callbacks */
+ }
+
+ rhashtable_walk_enter(&nsdata->schemas, iter); /* walk the schema table */
+
+ return 0;
+}
+
+static int ioam6_genl_dumpsc_done(struct netlink_callback *cb)
+{
+ struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+
+ rhashtable_walk_exit(iter); /* pairs with rhashtable_walk_enter() in the start callback */
+ kfree(iter); /* iterator was kmalloc'ed by ioam6_genl_dumpsc_start() */
+
+ return 0;
+}
+
+/* Dump ->dumpit: emit one message per schema returned by the walker.
+ * Returns skb->len once the walk is exhausted, else a negative errno.
+ */
+static int ioam6_genl_dumpsc(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct rhashtable_iter *iter = (struct rhashtable_iter *)cb->args[0];
+	struct ioam6_schema *sc;
+	int err;
+
+	rhashtable_walk_start(iter);
+
+	for (;;) {
+		sc = rhashtable_walk_next(iter);
+		if (!sc) {
+			/* Walk exhausted: report how much was written. */
+			err = skb->len;
+			break;
+		}
+
+		if (IS_ERR(sc)) {
+			/* -EAGAIN is not fatal: just keep walking. */
+			if (PTR_ERR(sc) == -EAGAIN)
+				continue;
+			err = PTR_ERR(sc);
+			break;
+		}
+
+		err = __ioam6_genl_dumpsc_element(sc, NETLINK_CB(cb->skb).portid,
+						  cb->nlh->nlmsg_seq, NLM_F_MULTI,
+						  skb, IOAM6_CMD_DUMP_SCHEMAS);
+		if (err)
+			break;
+	}
+
+	rhashtable_walk_stop(iter);
+	return err;
+}
+
+/* IOAM6_CMD_NS_SET_SCHEMA handler: attach a schema to a namespace, or detach
+ * the current one when IOAM6_ATTR_SC_NONE is given. ns->schema and sc->ns are
+ * kept as a one-to-one pair.
+ */
+static int ioam6_genl_ns_set_schema(struct sk_buff *skb, struct genl_info *info)
+{
+	struct ioam6_namespace *ns, *ns_ref;
+	struct ioam6_schema *sc = NULL, *sc_ref;
+	struct ioam6_pernet_data *nsdata;
+	__be16 ns_id;
+	u32 sc_id;
+	int err = -ENOENT;
+
+	if (!info->attrs[IOAM6_ATTR_NS_ID] ||
+	    (!info->attrs[IOAM6_ATTR_SC_ID] &&
+	     !info->attrs[IOAM6_ATTR_SC_NONE]))
+		return -EINVAL;
+
+	nsdata = ioam6_pernet(genl_info_net(info));
+	ns_id = cpu_to_be16(nla_get_u16(info->attrs[IOAM6_ATTR_NS_ID]));
+
+	mutex_lock(&nsdata->lock);
+
+	ns = rhashtable_lookup_fast(&nsdata->namespaces, &ns_id, rht_ns_params);
+	if (!ns)
+		goto out_unlock;
+
+	if (!info->attrs[IOAM6_ATTR_SC_NONE]) {
+		sc_id = nla_get_u32(info->attrs[IOAM6_ATTR_SC_ID]);
+		sc = rhashtable_lookup_fast(&nsdata->schemas, &sc_id,
+					    rht_sc_params);
+		if (!sc)
+			goto out_unlock;
+	}
+
+	/* Detach whatever schema ns points to now, then install sc (may be NULL). */
+	sc_ref = rcu_dereference_protected(ns->schema,
+					   lockdep_is_held(&nsdata->lock));
+	if (sc_ref)
+		rcu_assign_pointer(sc_ref->ns, NULL);
+	rcu_assign_pointer(ns->schema, sc);
+
+	/* If sc is still attached to a namespace, clear that link first. */
+	if (sc) {
+		ns_ref = rcu_dereference_protected(sc->ns,
+						   lockdep_is_held(&nsdata->lock));
+		if (ns_ref)
+			rcu_assign_pointer(ns_ref->schema, NULL);
+		rcu_assign_pointer(sc->ns, ns);
+	}
+
+	err = 0;
+
+out_unlock:
+	mutex_unlock(&nsdata->lock);
+	return err;
+}
+
+static const struct genl_ops ioam6_genl_ops[] = { /* every op carries GENL_ADMIN_PERM */
+ {
+ .cmd = IOAM6_CMD_ADD_NAMESPACE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ioam6_genl_addns,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ioam6_genl_policy_addns,
+ .maxattr = ARRAY_SIZE(ioam6_genl_policy_addns) - 1, /* highest index in the policy array */
+ },
+ {
+ .cmd = IOAM6_CMD_DEL_NAMESPACE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ioam6_genl_delns,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ioam6_genl_policy_delns,
+ .maxattr = ARRAY_SIZE(ioam6_genl_policy_delns) - 1,
+ },
+ {
+ .cmd = IOAM6_CMD_DUMP_NAMESPACES, /* dump op: no attribute policy */
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .start = ioam6_genl_dumpns_start,
+ .dumpit = ioam6_genl_dumpns,
+ .done = ioam6_genl_dumpns_done,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = IOAM6_CMD_ADD_SCHEMA,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ioam6_genl_addsc,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ioam6_genl_policy_addsc,
+ .maxattr = ARRAY_SIZE(ioam6_genl_policy_addsc) - 1,
+ },
+ {
+ .cmd = IOAM6_CMD_DEL_SCHEMA,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ioam6_genl_delsc,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ioam6_genl_policy_delsc,
+ .maxattr = ARRAY_SIZE(ioam6_genl_policy_delsc) - 1,
+ },
+ {
+ .cmd = IOAM6_CMD_DUMP_SCHEMAS, /* dump op: no attribute policy */
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .start = ioam6_genl_dumpsc_start,
+ .dumpit = ioam6_genl_dumpsc,
+ .done = ioam6_genl_dumpsc_done,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = IOAM6_CMD_NS_SET_SCHEMA,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .doit = ioam6_genl_ns_set_schema,
+ .flags = GENL_ADMIN_PERM,
+ .policy = ioam6_genl_policy_ns_sc,
+ .maxattr = ARRAY_SIZE(ioam6_genl_policy_ns_sc) - 1,
+ },
+};
+
+#define IOAM6_GENL_EV_GRP_OFFSET 0 /* index of the event group in ioam6_mcgrps[] */
+
+static const struct genl_multicast_group ioam6_mcgrps[] = { /* used by ioam6_event() */
+ [IOAM6_GENL_EV_GRP_OFFSET] = { .name = IOAM6_GENL_EV_GRP_NAME,
+ .flags = GENL_MCAST_CAP_NET_ADMIN },
+};
+
+/* Add the IOAM6_EVENT_TRACE attributes to @skb; 1 if a put failed, else 0. */
+static int ioam6_event_put_trace(struct sk_buff *skb,
+				 struct ioam6_trace_hdr *trace,
+				 unsigned int len)
+{
+	unsigned int skip = trace->remlen * 4;
+
+	return nla_put_u16(skb, IOAM6_EVENT_ATTR_TRACE_NAMESPACE,
+			   be16_to_cpu(trace->namespace_id)) ||
+	       nla_put_u8(skb, IOAM6_EVENT_ATTR_TRACE_NODELEN, trace->nodelen) ||
+	       nla_put_u32(skb, IOAM6_EVENT_ATTR_TRACE_TYPE,
+			   be32_to_cpu(trace->type_be32)) ||
+	       nla_put(skb, IOAM6_EVENT_ATTR_TRACE_DATA,
+		       len - sizeof(struct ioam6_trace_hdr) - skip,
+		       trace->data + skip);
+}
+
+/* Multicast an IOAM event of @type to the event group of @net, if anybody
+ * listens. Failures are silent: the partially built message is dropped.
+ */
+void ioam6_event(enum ioam6_event_type type, struct net *net, gfp_t gfp,
+		 void *opt, unsigned int opt_len)
+{
+	struct nlmsghdr *nlh;
+	struct sk_buff *skb;
+
+	if (!genl_has_listeners(&ioam6_genl_family, net,
+				IOAM6_GENL_EV_GRP_OFFSET))
+		return;
+
+	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+	if (!skb)
+		return;
+
+	nlh = genlmsg_put(skb, 0, 0, &ioam6_genl_family, 0, type);
+	if (!nlh)
+		goto nla_put_failure;
+
+	if (type == IOAM6_EVENT_TRACE) {
+		if (ioam6_event_put_trace(skb, (struct ioam6_trace_hdr *)opt,
+					  opt_len))
+			goto nla_put_failure;
+	} else if (type == IOAM6_EVENT_UNSPEC) {
+		WARN_ON_ONCE(1);
+	}
+
+	genlmsg_end(skb, nlh);
+	genlmsg_multicast_netns(&ioam6_genl_family, net, skb, 0,
+				IOAM6_GENL_EV_GRP_OFFSET, gfp);
+	return;
+
+nla_put_failure:
+	nlmsg_free(skb);
+}
+
+static struct genl_family ioam6_genl_family __ro_after_init = {
+ .name = IOAM6_GENL_NAME,
+ .version = IOAM6_GENL_VERSION,
+ .netnsok = true,
+ .parallel_ops = true, /* the doit handlers take nsdata->lock themselves */
+ .ops = ioam6_genl_ops,
+ .n_ops = ARRAY_SIZE(ioam6_genl_ops),
+ .resv_start_op = IOAM6_CMD_NS_SET_SCHEMA + 1, /* one past the last op in ioam6_genl_ops */
+ .mcgrps = ioam6_mcgrps, /* event group targeted by ioam6_event() */
+ .n_mcgrps = ARRAY_SIZE(ioam6_mcgrps),
+ .module = THIS_MODULE,
+};
+
+/* Look up the namespace with big-endian @id in @net's table (NULL if none). */
+struct ioam6_namespace *ioam6_namespace(struct net *net, __be16 id)
+{
+	return rhashtable_lookup_fast(&ioam6_pernet(net)->namespaces, &id,
+				      rht_ns_params);
+}
+
+static void __ioam6_fill_trace_data(struct sk_buff *skb,
+ struct ioam6_namespace *ns,
+ struct ioam6_trace_hdr *trace,
+ struct ioam6_schema *sc,
+ u8 sclen, bool is_input)
+{
+ struct net_device *dev = skb_dst_dev(skb);
+ struct timespec64 ts;
+ ktime_t tstamp;
+ u64 raw64;
+ u32 raw32;
+ u16 raw16;
+ u8 *data;
+ u8 byte;
+ /* This node's slot is the last (nodelen + sclen) words of the unused area */
+ data = trace->data + trace->remlen * 4 - trace->nodelen * 4 - sclen * 4;
+
+ /* bit0: hop_lim (top byte, decremented on input) and node_id */
+ if (trace->type.bit0) {
+ byte = ipv6_hdr(skb)->hop_limit;
+ if (is_input)
+ byte--;
+
+ raw32 = dev_net(dev)->ipv6.sysctl.ioam6_id;
+
+ *(__be32 *)data = cpu_to_be32((byte << 24) | raw32);
+ data += sizeof(__be32);
+ }
+
+ /* bit1: ingress/egress if_id; NOTE(review): __in6_dev_get() not NULL-checked */
+ if (trace->type.bit1) {
+ if (!skb->dev)
+ raw16 = IOAM6_U16_UNAVAILABLE;
+ else
+ raw16 = (__force u16)READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_id);
+
+ *(__be16 *)data = cpu_to_be16(raw16);
+ data += sizeof(__be16);
+
+ if (dev->flags & IFF_LOOPBACK)
+ raw16 = IOAM6_U16_UNAVAILABLE;
+ else
+ raw16 = (__force u16)READ_ONCE(__in6_dev_get(dev)->cnf.ioam6_id);
+
+ *(__be16 *)data = cpu_to_be16(raw16);
+ data += sizeof(__be16);
+ }
+
+ /* bit2: timestamp seconds (unavailable when skb->dev is NULL) */
+ if (trace->type.bit2) {
+ if (!skb->dev) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ } else {
+ tstamp = skb_tstamp_cond(skb, true);
+ ts = ktime_to_timespec64(tstamp);
+
+ *(__be32 *)data = cpu_to_be32((u32)ts.tv_sec);
+ }
+ data += sizeof(__be32);
+ }
+
+ /* bit3: timestamp subseconds, in microseconds; reuses ts if bit2 filled it */
+ if (trace->type.bit3) {
+ if (!skb->dev) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ } else {
+ if (!trace->type.bit2) {
+ tstamp = skb_tstamp_cond(skb, true);
+ ts = ktime_to_timespec64(tstamp);
+ }
+
+ *(__be32 *)data = cpu_to_be32((u32)(ts.tv_nsec / NSEC_PER_USEC));
+ }
+ data += sizeof(__be32);
+ }
+
+ /* bit4: transit delay (always reported as unavailable) */
+ if (trace->type.bit4) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit5: namespace data (ns->data is already big-endian) */
+ if (trace->type.bit5) {
+ *(__be32 *)data = ns->data;
+ data += sizeof(__be32);
+ }
+
+ /* bit6: queue depth, reported as the tx qdisc backlog */
+ if (trace->type.bit6) {
+ struct netdev_queue *queue;
+ struct Qdisc *qdisc;
+ __u32 qlen, backlog;
+
+ if (dev->flags & IFF_LOOPBACK) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ } else {
+ queue = skb_get_tx_queue(dev, skb);
+ qdisc = rcu_dereference(queue->qdisc);
+ qdisc_qstats_qlen_backlog(qdisc, &qlen, &backlog);
+
+ *(__be32 *)data = cpu_to_be32(backlog);
+ }
+ data += sizeof(__be32);
+ }
+
+ /* bit7: checksum complement (always reported as unavailable) */
+ if (trace->type.bit7) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit8: hop_lim and node_id (wide, 8 octets) */
+ if (trace->type.bit8) {
+ byte = ipv6_hdr(skb)->hop_limit;
+ if (is_input)
+ byte--;
+
+ raw64 = dev_net(dev)->ipv6.sysctl.ioam6_id_wide;
+
+ *(__be64 *)data = cpu_to_be64(((u64)byte << 56) | raw64);
+ data += sizeof(__be64);
+ }
+
+ /* bit9: ingress_if_id and egress_if_id (wide, 32 bits each) */
+ if (trace->type.bit9) {
+ if (!skb->dev)
+ raw32 = IOAM6_U32_UNAVAILABLE;
+ else
+ raw32 = READ_ONCE(__in6_dev_get(skb->dev)->cnf.ioam6_id_wide);
+
+ *(__be32 *)data = cpu_to_be32(raw32);
+ data += sizeof(__be32);
+
+ if (dev->flags & IFF_LOOPBACK)
+ raw32 = IOAM6_U32_UNAVAILABLE;
+ else
+ raw32 = READ_ONCE(__in6_dev_get(dev)->cnf.ioam6_id_wide);
+
+ *(__be32 *)data = cpu_to_be32(raw32);
+ data += sizeof(__be32);
+ }
+
+ /* bit10: namespace data (wide; ns->data_wide is already big-endian) */
+ if (trace->type.bit10) {
+ *(__be64 *)data = ns->data_wide;
+ data += sizeof(__be64);
+ }
+
+ /* bit11: buffer occupancy (always reported as unavailable) */
+ if (trace->type.bit11) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit12 undefined: filled with empty value */
+ if (trace->type.bit12) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit13 undefined: filled with empty value */
+ if (trace->type.bit13) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit14 undefined: filled with empty value */
+ if (trace->type.bit14) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit15 undefined: filled with empty value */
+ if (trace->type.bit15) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit16 undefined: filled with empty value */
+ if (trace->type.bit16) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit17 undefined: filled with empty value */
+ if (trace->type.bit17) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit18 undefined: filled with empty value */
+ if (trace->type.bit18) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit19 undefined: filled with empty value */
+ if (trace->type.bit19) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit20 undefined: filled with empty value */
+ if (trace->type.bit20) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit21 undefined: filled with empty value */
+ if (trace->type.bit21) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE);
+ data += sizeof(__be32);
+ }
+
+ /* bit22: opaque state snapshot: schema header word, then the schema data */
+ if (trace->type.bit22) {
+ if (!sc) {
+ *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE >> 8);
+ } else {
+ *(__be32 *)data = sc->hdr;
+ data += sizeof(__be32);
+
+ memcpy(data, sc->data, sc->len);
+ }
+ }
+}
+
+/* Append this node's data to @trace. Called with rcu_read_lock() held. */
+void ioam6_fill_trace_data(struct sk_buff *skb,
+			   struct ioam6_namespace *ns,
+			   struct ioam6_trace_hdr *trace,
+			   bool is_input)
+{
+	struct ioam6_schema *sc;
+	unsigned int sclen = 0;
+
+	/* Nothing is added once the Overflow flag is set. */
+	if (trace->overflow)
+		return;
+
+	/* NodeLen excludes the Opaque State Snapshot: with bit 22 set, count
+	 * one word for the schema header plus the schema data, if any.
+	 * sclen must be wider than u8: sc->len / 4 may be as large as 255 (it
+	 * is squeezed into one byte of sc->hdr), so the sum can reach 256,
+	 * which as a u8 wraps to 0 and slips past the space check below.
+	 */
+	sc = rcu_dereference(ns->schema);
+	if (trace->type.bit22) {
+		sclen = sizeof_field(struct ioam6_schema, hdr) / 4;
+
+		if (sc)
+			sclen += sc->len / 4;
+	}
+
+	/* Not enough room left: set Overflow and leave the trace untouched. */
+	if (!trace->remlen || trace->remlen < trace->nodelen + sclen) {
+		trace->overflow = 1;
+		return;
+	}
+
+	/* sclen <= remlen here; assuming remlen fits a byte, u8 is enough. */
+	__ioam6_fill_trace_data(skb, ns, trace, sc, sclen, is_input);
+	trace->remlen -= trace->nodelen + sclen;
+}
+
+/* Per-netns init: allocate the IOAM state and its two hash tables. */
+static int __net_init ioam6_net_init(struct net *net)
+{
+	struct ioam6_pernet_data *nsdata;
+	int err;
+
+	nsdata = kzalloc(sizeof(*nsdata), GFP_KERNEL);
+	if (!nsdata)
+		return -ENOMEM;
+
+	mutex_init(&nsdata->lock);
+	net->ipv6.ioam6_data = nsdata;
+
+	err = rhashtable_init(&nsdata->namespaces, &rht_ns_params);
+	if (err)
+		goto free_nsdata;
+
+	err = rhashtable_init(&nsdata->schemas, &rht_sc_params);
+	if (err)
+		goto free_rht_ns;
+	return 0;
+
+free_rht_ns:
+	rhashtable_destroy(&nsdata->namespaces);
+free_nsdata:
+	kfree(nsdata);
+	net->ipv6.ioam6_data = NULL;
+	return err;
+}
+
+static void __net_exit ioam6_net_exit(struct net *net)
+{
+ struct ioam6_pernet_data *nsdata = ioam6_pernet(net);
+
+ rhashtable_free_and_destroy(&nsdata->namespaces, ioam6_free_ns, NULL); /* per-entry: ioam6_free_ns() */
+ rhashtable_free_and_destroy(&nsdata->schemas, ioam6_free_sc, NULL); /* per-entry: ioam6_free_sc() */
+
+ kfree(nsdata); /* allocated in ioam6_net_init() */
+}
+
+static struct pernet_operations ioam6_net_ops = { /* registered by ioam6_init() */
+ .init = ioam6_net_init,
+ .exit = ioam6_net_exit,
+};
+
+/* Register pernet ops, genl family and (if configured) the lwtunnel part. */
+int __init ioam6_init(void)
+{
+	int err = register_pernet_subsys(&ioam6_net_ops);
+	if (err)
+		return err;
+
+	err = genl_register_family(&ioam6_genl_family);
+	if (err)
+		goto out_unregister_pernet_subsys;
+
+#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL
+	err = ioam6_iptunnel_init();
+	if (err)
+		goto out_unregister_genl;
+#endif
+
+	pr_info("In-situ OAM (IOAM) with IPv6\n");
+	return 0;
+
+#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL
+out_unregister_genl:
+	genl_unregister_family(&ioam6_genl_family);
+#endif
+out_unregister_pernet_subsys:
+	unregister_pernet_subsys(&ioam6_net_ops);
+	return err;
+}
+
+void ioam6_exit(void) /* undoes ioam6_init(), in reverse registration order */
+{
+#ifdef CONFIG_IPV6_IOAM6_LWTUNNEL
+ ioam6_iptunnel_exit();
+#endif
+ genl_unregister_family(&ioam6_genl_family);
+ unregister_pernet_subsys(&ioam6_net_ops);
+}
diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c
new file mode 100644
index 000000000000..1fe7894f14dd
--- /dev/null
+++ b/net/ipv6/ioam6_iptunnel.c
@@ -0,0 +1,570 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * IPv6 IOAM Lightweight Tunnel implementation
+ *
+ * Author:
+ * Justin Iurman <justin.iurman@uliege.be>
+ */
+
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/ioam6.h>
+#include <linux/ioam6_iptunnel.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <net/lwtunnel.h>
+#include <net/ioam6.h>
+#include <net/netlink.h>
+#include <net/ipv6.h>
+#include <net/dst_cache.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+
+#define IOAM6_MASK_SHORT_FIELDS 0xff100000 /* trace-type bits; each set bit adds sizeof(__be32) / 4 to nodelen */
+#define IOAM6_MASK_WIDE_FIELDS 0xe00000 /* trace-type bits; each set bit adds sizeof(__be64) / 4 to nodelen */
+
+struct ioam6_lwt_encap { /* header template copied into packets by ioam6_do_inline() and ioam6_do_encap() */
+ struct ipv6_hopopt_hdr eh; /* extension header; the IPv6 header's nexthdr is set to NEXTHDR_HOP to point at it */
+ u8 pad[2]; /* 2-octet padding for 4n-alignment */
+ struct ioam6_hdr ioamh; /* IOAM option header (opt_type, opt_len, type) */
+ struct ioam6_trace_hdr traceh; /* trace header; its data[] area is allocated past the end of the struct */
+} __packed;
+
+struct ioam6_lwt_freq { /* "k over n" insertion frequency */
+ u32 k; /* ioam6_output() transforms a packet only while pkt_cnt % n < k */
+ u32 n; /* modulus applied to the packet counter */
+};
+
+struct ioam6_lwt { /* private state stored in the data area of a lwtunnel_state */
+ struct dst_entry null_dst; /* "fake" entry put in the cache to mean "keep using the original dst" */
+ struct dst_cache cache; /* destination to use after the transformation */
+ struct ioam6_lwt_freq freq; /* insertion frequency */
+ atomic_t pkt_cnt; /* incremented for every IPv6 packet seen by ioam6_output() */
+ u8 mode; /* one of IOAM6_IPTUNNEL_MODE_* */
+ bool has_tunsrc; /* true when IOAM6_IPTUNNEL_SRC was supplied */
+ struct in6_addr tunsrc; /* outer source address, meaningful only if has_tunsrc */
+ struct in6_addr tundst; /* outer destination address, set when IOAM6_IPTUNNEL_DST was supplied */
+ struct ioam6_lwt_encap tuninfo; /* header template; ioam6_build_state() allocates the trace data space right after it */
+};
+
+static const struct netlink_range_validation freq_range = { /* range accepted for IOAM6_IPTUNNEL_FREQ_K and IOAM6_IPTUNNEL_FREQ_N */
+ .min = IOAM6_IPTUNNEL_FREQ_MIN,
+ .max = IOAM6_IPTUNNEL_FREQ_MAX,
+};
+
+/* Return the IOAM6 private state stored in the data area of @lwt. */
+static struct ioam6_lwt *ioam6_lwt_state(struct lwtunnel_state *lwt)
+{
+	void *priv = lwt->data;
+
+	return priv;
+}
+
+/* Return the header template embedded in the IOAM6 state of @lwt. */
+static struct ioam6_lwt_encap *ioam6_lwt_info(struct lwtunnel_state *lwt)
+{
+	struct ioam6_lwt *ilwt = ioam6_lwt_state(lwt);
+
+	return &ilwt->tuninfo;
+}
+
+/* Return the trace header inside the header template of @lwt. */
+static struct ioam6_trace_hdr *ioam6_lwt_trace(struct lwtunnel_state *lwt)
+{
+	struct ioam6_lwt *ilwt = ioam6_lwt_state(lwt);
+
+	return &ilwt->tuninfo.traceh;
+}
+
+static const struct nla_policy ioam6_iptunnel_policy[IOAM6_IPTUNNEL_MAX + 1] = { /* applied by ioam6_build_state() */
+ [IOAM6_IPTUNNEL_FREQ_K] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range), /* u32 checked against freq_range */
+ [IOAM6_IPTUNNEL_FREQ_N] = NLA_POLICY_FULL_RANGE(NLA_U32, &freq_range), /* u32 checked against freq_range */
+ [IOAM6_IPTUNNEL_MODE] = NLA_POLICY_RANGE(NLA_U8,
+ IOAM6_IPTUNNEL_MODE_MIN,
+ IOAM6_IPTUNNEL_MODE_MAX), /* u8 between MODE_MIN and MODE_MAX */
+ [IOAM6_IPTUNNEL_SRC] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), /* exactly one IPv6 address */
+ [IOAM6_IPTUNNEL_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), /* exactly one IPv6 address */
+ [IOAM6_IPTUNNEL_TRACE] = NLA_POLICY_EXACT_LEN(
+ sizeof(struct ioam6_trace_hdr)), /* exactly one trace header, without trace data */
+};
+
+/*
+ * Check a user-supplied trace header and derive its nodelen.
+ *
+ * The header is rejected when the trace type is empty, when remlen is zero
+ * or larger than IOAM6_TRACE_DATA_SIZE_MAX / 4, or when any of the type
+ * bits 12-21 or 23 is set.
+ *
+ * On success trace->nodelen is overwritten with the value computed from
+ * the trace-type bits: sizeof(__be32) / 4 per bit in
+ * IOAM6_MASK_SHORT_FIELDS plus sizeof(__be64) / 4 per bit in
+ * IOAM6_MASK_WIDE_FIELDS.
+ *
+ * Returns true if the header is acceptable, false otherwise.
+ */
+static bool ioam6_validate_trace_hdr(struct ioam6_trace_hdr *trace)
+{
+	u32 type_bits, short_cnt, wide_cnt;
+
+	if (!trace->type_be32)
+		return false;
+
+	if (!trace->remlen || trace->remlen > IOAM6_TRACE_DATA_SIZE_MAX / 4)
+		return false;
+
+	if (trace->type.bit12 | trace->type.bit13 | trace->type.bit14 |
+	    trace->type.bit15 | trace->type.bit16 | trace->type.bit17 |
+	    trace->type.bit18 | trace->type.bit19 | trace->type.bit20 |
+	    trace->type.bit21 | trace->type.bit23)
+		return false;
+
+	type_bits = be32_to_cpu(trace->type_be32);
+	short_cnt = hweight32(type_bits & IOAM6_MASK_SHORT_FIELDS);
+	wide_cnt = hweight32(type_bits & IOAM6_MASK_WIDE_FIELDS);
+
+	trace->nodelen = short_cnt * (sizeof(__be32) / 4) +
+			 wide_cnt * (sizeof(__be64) / 4);
+
+	return true;
+}
+/*
+ * ioam6_build_state - build the lwtunnel state for an IOAM6 encap
+ *
+ * Parses the nested IOAM6_IPTUNNEL_ attributes found in @nla against
+ * ioam6_iptunnel_policy and then checks them:
+ *  - FREQ_K and FREQ_N must be given together (both default to
+ *    IOAM6_IPTUNNEL_FREQ_MIN when absent) and k must not exceed n;
+ *  - MODE defaults to IOAM6_IPTUNNEL_MODE_INLINE;
+ *  - SRC is rejected in inline mode, DST is mandatory in the other modes,
+ *    and neither may be the unspecified address;
+ *  - TRACE is mandatory and must pass ioam6_validate_trace_hdr().
+ *
+ * On success the new state, including the pre-built header template, is
+ * stored in @ts and 0 is returned. Otherwise a negative errno is returned
+ * and nothing stays allocated. @net and @cfg are not used here.
+ */
+static int ioam6_build_state(struct net *net, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
+{
+ struct nlattr *tb[IOAM6_IPTUNNEL_MAX + 1];
+ struct ioam6_lwt_encap *tuninfo;
+ struct ioam6_trace_hdr *trace;
+ struct lwtunnel_state *lwt;
+ struct ioam6_lwt *ilwt;
+ int len_aligned, err;
+ u32 freq_k, freq_n;
+ u8 mode;
+
+ if (family != AF_INET6)
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, IOAM6_IPTUNNEL_MAX, nla,
+ ioam6_iptunnel_policy, extack);
+ if (err < 0)
+ return err;
+
+ if ((!tb[IOAM6_IPTUNNEL_FREQ_K] && tb[IOAM6_IPTUNNEL_FREQ_N]) ||
+ (tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N])) { /* exactly one of k/n was given */
+ NL_SET_ERR_MSG(extack, "freq: missing parameter");
+ return -EINVAL;
+ } else if (!tb[IOAM6_IPTUNNEL_FREQ_K] && !tb[IOAM6_IPTUNNEL_FREQ_N]) { /* neither given: use the defaults */
+ freq_k = IOAM6_IPTUNNEL_FREQ_MIN;
+ freq_n = IOAM6_IPTUNNEL_FREQ_MIN;
+ } else {
+ freq_k = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_K]);
+ freq_n = nla_get_u32(tb[IOAM6_IPTUNNEL_FREQ_N]);
+
+ if (freq_k > freq_n) {
+ NL_SET_ERR_MSG(extack, "freq: k > n is forbidden");
+ return -EINVAL;
+ }
+ }
+
+ mode = nla_get_u8_default(tb[IOAM6_IPTUNNEL_MODE],
+ IOAM6_IPTUNNEL_MODE_INLINE);
+
+ if (tb[IOAM6_IPTUNNEL_SRC] && mode == IOAM6_IPTUNNEL_MODE_INLINE) {
+ NL_SET_ERR_MSG(extack, "no tunnel src expected with this mode");
+ return -EINVAL;
+ }
+
+ if (!tb[IOAM6_IPTUNNEL_DST] && mode != IOAM6_IPTUNNEL_MODE_INLINE) {
+ NL_SET_ERR_MSG(extack, "this mode needs a tunnel destination");
+ return -EINVAL;
+ }
+
+ if (!tb[IOAM6_IPTUNNEL_TRACE]) {
+ NL_SET_ERR_MSG(extack, "missing trace");
+ return -EINVAL;
+ }
+
+ trace = nla_data(tb[IOAM6_IPTUNNEL_TRACE]);
+ if (!ioam6_validate_trace_hdr(trace)) { /* also fills in trace->nodelen */
+ NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_TRACE],
+ "invalid trace validation");
+ return -EINVAL;
+ }
+
+ len_aligned = ALIGN(trace->remlen * 4, 8); /* trace data size (remlen * 4) rounded up to a multiple of 8 */
+ lwt = lwtunnel_state_alloc(sizeof(*ilwt) + len_aligned); /* private state plus room for the trace data */
+ if (!lwt)
+ return -ENOMEM;
+
+ ilwt = ioam6_lwt_state(lwt);
+ err = dst_cache_init(&ilwt->cache, GFP_ATOMIC);
+ if (err)
+ goto free_lwt;
+
+ /* This "fake" dst_entry will be stored in a dst_cache, which will call
+ * dst_hold() and dst_release() on it. We must ensure that dst_destroy()
+ * will never be called. For that, its initial refcount is 1 and +1 when
+ * it is stored in the cache. Then, +1/-1 each time we read the cache
+ * and release it. Long story short, we're fine.
+ */
+ dst_init(&ilwt->null_dst, NULL, NULL, DST_OBSOLETE_NONE, DST_NOCOUNT);
+
+ atomic_set(&ilwt->pkt_cnt, 0);
+ ilwt->freq.k = freq_k;
+ ilwt->freq.n = freq_n;
+
+ ilwt->mode = mode;
+
+ if (!tb[IOAM6_IPTUNNEL_SRC]) {
+ ilwt->has_tunsrc = false;
+ } else {
+ ilwt->has_tunsrc = true;
+ ilwt->tunsrc = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_SRC]);
+
+ if (ipv6_addr_any(&ilwt->tunsrc)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_SRC],
+ "invalid tunnel source address");
+ err = -EINVAL;
+ goto free_cache;
+ }
+ }
+
+ if (tb[IOAM6_IPTUNNEL_DST]) {
+ ilwt->tundst = nla_get_in6_addr(tb[IOAM6_IPTUNNEL_DST]);
+
+ if (ipv6_addr_any(&ilwt->tundst)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[IOAM6_IPTUNNEL_DST],
+ "invalid tunnel dest address");
+ err = -EINVAL;
+ goto free_cache;
+ }
+ }
+
+ tuninfo = ioam6_lwt_info(lwt);
+ tuninfo->eh.hdrlen = ((sizeof(*tuninfo) + len_aligned) >> 3) - 1; /* size in 8-octet units minus one; readers undo it with (hdrlen + 1) << 3 */
+ tuninfo->pad[0] = IPV6_TLV_PADN;
+ tuninfo->ioamh.type = IOAM6_TYPE_PREALLOC;
+ tuninfo->ioamh.opt_type = IPV6_TLV_IOAM;
+ tuninfo->ioamh.opt_len = sizeof(tuninfo->ioamh) - 2 + sizeof(*trace)
+ + trace->remlen * 4; /* NOTE(review): the "- 2" presumably excludes the opt_type/opt_len octets - confirm */
+
+ memcpy(&tuninfo->traceh, trace, sizeof(*trace));
+
+ if (len_aligned - trace->remlen * 4) { /* rounding left a 4-byte gap after the trace data */
+ tuninfo->traceh.data[trace->remlen * 4] = IPV6_TLV_PADN;
+ tuninfo->traceh.data[trace->remlen * 4 + 1] = 2;
+ }
+
+ lwt->type = LWTUNNEL_ENCAP_IOAM6;
+ lwt->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
+ *ts = lwt;
+
+ return 0;
+free_cache:
+ dst_cache_destroy(&ilwt->cache);
+free_lwt:
+ kfree(lwt);
+ return err;
+}
+
+/*
+ * Fill the trace data of a packet that already carries the IOAM header.
+ *
+ * The trace header is located from the transport header offset: it follows
+ * the Hop-by-Hop header, the 2 padding octets and the IOAM option header.
+ * Nothing is written when the namespace id of the trace is not known in
+ * @net.
+ *
+ * Always returns 0.
+ */
+static int ioam6_do_fill(struct net *net, struct sk_buff *skb)
+{
+	const size_t trace_off = sizeof(struct ipv6_hopopt_hdr) + 2 +
+				 sizeof(struct ioam6_hdr);
+	struct ioam6_trace_hdr *trace;
+	struct ioam6_namespace *ns;
+
+	trace = (struct ioam6_trace_hdr *)(skb_transport_header(skb) +
+					   trace_off);
+
+	ns = ioam6_namespace(net, trace->namespace_id);
+	if (!ns)
+		return 0;
+
+	ioam6_fill_trace_data(skb, ns, trace, false);
+	return 0;
+}
+
+/*
+ * ioam6_do_inline - insert the IOAM Hop-by-Hop header into the packet itself
+ * @net: network namespace, passed on to ioam6_do_fill()
+ * @skb: packet to transform
+ * @tuninfo: pre-built header template; only read here
+ * @cache_dst: cached destination, used to size the required headroom
+ *
+ * Makes room for (tuninfo->eh.hdrlen + 1) << 3 bytes right after the IPv6
+ * header, moves the IPv6 header to the new front of the packet, copies the
+ * template behind it and links it in through the nexthdr fields. The
+ * payload length is updated and the trace is then filled by
+ * ioam6_do_fill().
+ *
+ * The next-header value of the inserted header is written into the
+ * packet's own copy, never into @tuninfo. The template is shared by every
+ * packet using the route, and ioam6_output() already treats that per-route
+ * state as concurrently accessed (its packet counter is atomic), so
+ * patching the template would let one packet pick up the value meant for
+ * another.
+ *
+ * Returns 0 on success or the error from skb_cow_head().
+ */
+static int ioam6_do_inline(struct net *net, struct sk_buff *skb,
+			   struct ioam6_lwt_encap *tuninfo,
+			   struct dst_entry *cache_dst)
+{
+	struct ipv6hdr *oldhdr, *hdr;
+	struct ipv6_hopopt_hdr *hop;
+	int hdrlen, err;
+
+	hdrlen = (tuninfo->eh.hdrlen + 1) << 3;
+
+	err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb));
+	if (unlikely(err))
+		return err;
+
+	oldhdr = ipv6_hdr(skb);
+	skb_pull(skb, sizeof(*oldhdr));
+	skb_postpull_rcsum(skb, skb_network_header(skb), sizeof(*oldhdr));
+
+	skb_push(skb, sizeof(*oldhdr) + hdrlen);
+	skb_reset_network_header(skb);
+	skb_mac_header_rebuild(skb);
+
+	hdr = ipv6_hdr(skb);
+	/* old and new header locations may overlap */
+	memmove(hdr, oldhdr, sizeof(*oldhdr));
+
+	skb_set_transport_header(skb, sizeof(*hdr));
+	skb_postpush_rcsum(skb, hdr, sizeof(*hdr) + hdrlen);
+
+	hop = (struct ipv6_hopopt_hdr *)skb_transport_header(skb);
+	memcpy(hop, tuninfo, hdrlen);
+	/* must read hdr->nexthdr before it is replaced just below */
+	hop->nexthdr = hdr->nexthdr;
+
+	hdr->nexthdr = NEXTHDR_HOP;
+	hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));
+
+	return ioam6_do_fill(net, skb);
+}
+
+/*
+ * ioam6_do_encap - encapsulate the packet (ip6ip6) with an IOAM header
+ * @net: network namespace, used for source address selection and passed on
+ *       to ioam6_do_fill()
+ * @skb: packet to transform
+ * @tuninfo: pre-built header template; only read here
+ * @has_tunsrc: whether @tunsrc holds a configured outer source address
+ * @tunsrc: outer source address, used only if @has_tunsrc
+ * @tundst: outer destination address
+ * @cache_dst: cached destination, used to size the required headroom
+ *
+ * Pushes a new IPv6 header followed by the template in front of the
+ * packet. The outer header starts as a copy of the inner one, then gets
+ * NEXTHDR_HOP, the new payload length, @tundst as destination and either
+ * @tunsrc or an address picked by ipv6_dev_get_saddr() as source. The
+ * trace is then filled by ioam6_do_fill().
+ *
+ * The next-header value of the inserted header is written into the
+ * packet's own copy, never into @tuninfo. The template is shared by every
+ * packet using the route, and ioam6_output() already treats that per-route
+ * state as concurrently accessed (its packet counter is atomic); in "auto"
+ * mode the inline path uses the same template with a different
+ * next-header value.
+ *
+ * Returns 0 on success or the error from skb_cow_head().
+ */
+static int ioam6_do_encap(struct net *net, struct sk_buff *skb,
+			  struct ioam6_lwt_encap *tuninfo,
+			  bool has_tunsrc,
+			  struct in6_addr *tunsrc,
+			  struct in6_addr *tundst,
+			  struct dst_entry *cache_dst)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct ipv6hdr *hdr, *inner_hdr;
+	struct ipv6_hopopt_hdr *hop;
+	int hdrlen, len, err;
+
+	hdrlen = (tuninfo->eh.hdrlen + 1) << 3;
+	len = sizeof(*hdr) + hdrlen;
+
+	err = skb_cow_head(skb, len + dst_dev_overhead(cache_dst, skb));
+	if (unlikely(err))
+		return err;
+
+	inner_hdr = ipv6_hdr(skb);
+
+	skb_push(skb, len);
+	skb_reset_network_header(skb);
+	skb_mac_header_rebuild(skb);
+	skb_set_transport_header(skb, sizeof(*hdr));
+
+	hop = (struct ipv6_hopopt_hdr *)skb_transport_header(skb);
+	memcpy(hop, tuninfo, hdrlen);
+	/* the inner IPv6 packet follows the inserted header */
+	hop->nexthdr = NEXTHDR_IPV6;
+
+	hdr = ipv6_hdr(skb);
+	memcpy(hdr, inner_hdr, sizeof(*hdr));
+
+	hdr->nexthdr = NEXTHDR_HOP;
+	hdr->payload_len = cpu_to_be16(skb->len - sizeof(*hdr));
+	hdr->daddr = *tundst;
+
+	if (has_tunsrc)
+		memcpy(&hdr->saddr, tunsrc, sizeof(*tunsrc));
+	else
+		ipv6_dev_get_saddr(net, dst_dev(dst), &hdr->daddr,
+				   IPV6_PREFER_SRC_PUBLIC, &hdr->saddr);
+
+	skb_postpush_rcsum(skb, hdr, len);
+
+	return ioam6_do_fill(net, skb);
+}
+/*
+ * ioam6_output - lwtunnel output handler for IOAM6
+ *
+ * Non-IPv6 packets are dropped with -EINVAL. For the others the packet
+ * counter decides which ones are transformed: only packets for which
+ * pkt_cnt % n < k get the IOAM header, the rest go straight to the
+ * original output function.
+ *
+ * A selected packet is transformed according to the configured mode:
+ * inline insertion, ip6ip6 encapsulation, or "auto", which picks inline
+ * when skb->dev is NULL and encapsulation otherwise. Inline insertion is
+ * skipped for a packet that already has a Hop-by-Hop header.
+ *
+ * The destination to use after the transformation comes from the dst
+ * cache; on a miss it is looked up with ip6_route_output() and cached. If
+ * it carries a different lwtstate than the original dst the packet is sent
+ * through dst_output() on it, otherwise the original output function is
+ * called.
+ *
+ * Returns the result of the output function that was called, or a negative
+ * errno after freeing @skb.
+ */
+static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct dst_entry *dst = NULL;
+ struct ioam6_lwt *ilwt;
+ int err = -EINVAL;
+ u32 pkt_cnt;
+
+ if (skb->protocol != htons(ETH_P_IPV6))
+ goto drop;
+
+ ilwt = ioam6_lwt_state(orig_dst->lwtstate);
+
+ /* Check for insertion frequency (i.e., "k over n" insertions) */
+ pkt_cnt = atomic_fetch_inc(&ilwt->pkt_cnt);
+ if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k) /* not one of the k packets picked out of every n */
+ goto out;
+
+ local_bh_disable();
+ dst = dst_cache_get(&ilwt->cache);
+ local_bh_enable();
+
+ /* This is how we notify that the destination does not change after
+ * transformation and that we need to use orig_dst instead of the cache
+ */
+ if (dst == &ilwt->null_dst) {
+ dst_release(dst);
+
+ dst = orig_dst;
+ /* keep refcount balance: dst_release() is called at the end */
+ dst_hold(dst);
+ }
+
+ switch (ilwt->mode) {
+ case IOAM6_IPTUNNEL_MODE_INLINE:
+do_inline:
+ /* Direct insertion - if there is no Hop-by-Hop yet */
+ if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP)
+ goto out;
+
+ err = ioam6_do_inline(net, skb, &ilwt->tuninfo, dst);
+ if (unlikely(err))
+ goto drop;
+
+ break;
+ case IOAM6_IPTUNNEL_MODE_ENCAP:
+do_encap:
+ /* Encapsulation (ip6ip6) */
+ err = ioam6_do_encap(net, skb, &ilwt->tuninfo,
+ ilwt->has_tunsrc, &ilwt->tunsrc,
+ &ilwt->tundst, dst);
+ if (unlikely(err))
+ goto drop;
+
+ break;
+ case IOAM6_IPTUNNEL_MODE_AUTO:
+ /* Automatic (RFC8200 compliant):
+ * - local packets -> INLINE mode
+ * - in-transit packets -> ENCAP mode
+ */
+ if (!skb->dev)
+ goto do_inline;
+
+ goto do_encap;
+ default:
+ goto drop;
+ }
+
+ if (unlikely(!dst)) { /* cache miss: route the transformed packet */
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.daddr = hdr->daddr;
+ fl6.saddr = hdr->saddr;
+ fl6.flowlabel = ip6_flowinfo(hdr);
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = hdr->nexthdr;
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ err = dst->error;
+ goto drop;
+ }
+
+ /* If the destination is the same after transformation (which is
+ * a valid use case for IOAM), then we don't want to add it to
+ * the cache in order to avoid a reference loop. Instead, we add
+ * our fake dst_entry to the cache as a way to detect this case.
+ * Otherwise, we add the resolved destination to the cache.
+ */
+ local_bh_disable();
+ if (orig_dst->lwtstate == dst->lwtstate)
+ dst_cache_set_ip6(&ilwt->cache,
+ &ilwt->null_dst, &fl6.saddr);
+ else
+ dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr);
+ local_bh_enable();
+
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
+ if (unlikely(err))
+ goto drop;
+ }
+
+ /* avoid lwtunnel_output() reentry loop when destination is the same
+ * after transformation (e.g., with the inline mode)
+ */
+ if (orig_dst->lwtstate != dst->lwtstate) {
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst); /* the reference held on dst is handed to the skb, so no dst_release() on this path */
+ return dst_output(net, sk, skb);
+ }
+out:
+ dst_release(dst);
+ return orig_dst->lwtstate->orig_output(net, sk, skb);
+drop:
+ dst_release(dst);
+ kfree_skb(skb);
+ return err;
+}
+
+/* Release the resources held by the IOAM6 state of @lwt: its dst cache. */
+static void ioam6_destroy_state(struct lwtunnel_state *lwt)
+{
+	struct ioam6_lwt *ilwt = ioam6_lwt_state(lwt);
+
+	/* When the "fake" dst_entry sits in the per-cpu caches its refcount
+	 * never drops to 0 (see the comment in ioam6_build_state()), so the
+	 * entries need not be removed before dst_cache_destroy() is called.
+	 */
+	dst_cache_destroy(&ilwt->cache);
+}
+
+/*
+ * Dump the IOAM6 encap configuration of @lwtstate into @skb as netlink
+ * attributes: frequency (k and n), mode, the tunnel addresses when the
+ * mode is not inline (source only if one was configured) and the trace
+ * header.
+ *
+ * Returns 0 on success or the error of the first attribute that could not
+ * be added.
+ */
+static int ioam6_fill_encap_info(struct sk_buff *skb,
+				 struct lwtunnel_state *lwtstate)
+{
+	struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
+	int ret;
+
+	ret = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_K, ilwt->freq.k);
+	if (ret)
+		return ret;
+
+	ret = nla_put_u32(skb, IOAM6_IPTUNNEL_FREQ_N, ilwt->freq.n);
+	if (ret)
+		return ret;
+
+	ret = nla_put_u8(skb, IOAM6_IPTUNNEL_MODE, ilwt->mode);
+	if (ret)
+		return ret;
+
+	if (ilwt->mode != IOAM6_IPTUNNEL_MODE_INLINE) {
+		if (ilwt->has_tunsrc) {
+			ret = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_SRC,
+					       &ilwt->tunsrc);
+			if (ret)
+				return ret;
+		}
+
+		ret = nla_put_in6_addr(skb, IOAM6_IPTUNNEL_DST, &ilwt->tundst);
+		if (ret)
+			return ret;
+	}
+
+	return nla_put(skb, IOAM6_IPTUNNEL_TRACE, sizeof(ilwt->tuninfo.traceh),
+		       &ilwt->tuninfo.traceh);
+}
+
+/*
+ * Return the netlink space needed by ioam6_fill_encap_info() for
+ * @lwtstate: frequency (k and n), mode and trace header always, plus the
+ * tunnel addresses when the mode is not inline (source only if one was
+ * configured).
+ */
+static int ioam6_encap_nlsize(struct lwtunnel_state *lwtstate)
+{
+	struct ioam6_lwt *ilwt = ioam6_lwt_state(lwtstate);
+	int size = 0;
+
+	size += nla_total_size(sizeof(ilwt->freq.k));
+	size += nla_total_size(sizeof(ilwt->freq.n));
+	size += nla_total_size(sizeof(ilwt->mode));
+	size += nla_total_size(sizeof(ilwt->tuninfo.traceh));
+
+	if (ilwt->mode == IOAM6_IPTUNNEL_MODE_INLINE)
+		return size;
+
+	if (ilwt->has_tunsrc)
+		size += nla_total_size(sizeof(ilwt->tunsrc));
+
+	size += nla_total_size(sizeof(ilwt->tundst));
+
+	return size;
+}
+
+/*
+ * Compare two IOAM6 encap states.
+ *
+ * Returns 0 when frequency, mode, presence of a tunnel source, the tunnel
+ * addresses (only looked at when the mode is not inline) and the trace
+ * namespace id all match, 1 otherwise.
+ *
+ * NOTE(review): of the trace header only namespace_id is compared; confirm
+ * that ignoring the other trace fields is intended.
+ */
+static int ioam6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{
+	struct ioam6_trace_hdr *trace_a = ioam6_lwt_trace(a);
+	struct ioam6_trace_hdr *trace_b = ioam6_lwt_trace(b);
+	struct ioam6_lwt *ilwt_a = ioam6_lwt_state(a);
+	struct ioam6_lwt *ilwt_b = ioam6_lwt_state(b);
+
+	if (ilwt_a->freq.k != ilwt_b->freq.k ||
+	    ilwt_a->freq.n != ilwt_b->freq.n)
+		return 1;
+
+	if (ilwt_a->mode != ilwt_b->mode ||
+	    ilwt_a->has_tunsrc != ilwt_b->has_tunsrc)
+		return 1;
+
+	if (ilwt_a->mode != IOAM6_IPTUNNEL_MODE_INLINE) {
+		if (!ipv6_addr_equal(&ilwt_a->tundst, &ilwt_b->tundst))
+			return 1;
+
+		if (ilwt_a->has_tunsrc &&
+		    !ipv6_addr_equal(&ilwt_a->tunsrc, &ilwt_b->tunsrc))
+			return 1;
+	}
+
+	return trace_a->namespace_id != trace_b->namespace_id;
+}
+
+static const struct lwtunnel_encap_ops ioam6_iptun_ops = { /* registered for LWTUNNEL_ENCAP_IOAM6 by ioam6_iptunnel_init() */
+ .build_state = ioam6_build_state, /* parse netlink attributes into a new state */
+ .destroy_state = ioam6_destroy_state, /* release the dst cache */
+ .output = ioam6_output, /* add the IOAM header to outgoing packets */
+ .fill_encap = ioam6_fill_encap_info, /* dump the configuration as netlink attributes */
+ .get_encap_size = ioam6_encap_nlsize, /* space needed by that dump */
+ .cmp_encap = ioam6_encap_cmp, /* 0 when two states are considered equal */
+ .owner = THIS_MODULE,
+};
+
+/*
+ * Register the IOAM6 lwtunnel encap ops under LWTUNNEL_ENCAP_IOAM6.
+ * Returns whatever lwtunnel_encap_add_ops() returns.
+ */
+int __init ioam6_iptunnel_init(void)
+{
+	int ret;
+
+	ret = lwtunnel_encap_add_ops(&ioam6_iptun_ops, LWTUNNEL_ENCAP_IOAM6);
+
+	return ret;
+}
+
+/* Unregister the IOAM6 lwtunnel encap ops; counterpart of ioam6_iptunnel_init(). */
+void ioam6_iptunnel_exit(void)
+{
+	const struct lwtunnel_encap_ops *ops = &ioam6_iptun_ops;
+
+	lwtunnel_encap_del_ops(ops, LWTUNNEL_ENCAP_IOAM6);
+}
diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c
index 547515e8450a..377717045f8f 100644
--- a/net/ipv6/ip6_checksum.c
+++ b/net/ipv6/ip6_checksum.c
@@ -88,8 +88,24 @@ int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto)
* Note, we are only interested in != 0 or == 0, thus the
* force to int.
*/
- return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
- ip6_compute_pseudo);
+ err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
+ ip6_compute_pseudo);
+ if (err)
+ return err;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) {
+ /* If SW calculated the value, we know it's bad */
+ if (skb->csum_complete_sw)
+ return 1;
+
+ /* HW says the value is bad. Let's validate that.
+ * skb->csum is no longer the full packet checksum,
+ * so don't treat it as such.
+ */
+ skb_checksum_complete_unset(skb);
+ }
+
+ return 0;
}
EXPORT_SYMBOL(udp6_csum_init);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 5516f55e214b..2111af022d94 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux INET6 implementation
* Forwarding Information Database
@@ -5,11 +6,6 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
* Yuji SEKIYA @USAGI: Support default route on router node;
* remove ip6_null_entry from the top of
@@ -19,6 +15,7 @@
#define pr_fmt(fmt) "IPv6: " fmt
+#include <linux/bpf.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
@@ -29,12 +26,14 @@
#include <linux/list.h>
#include <linux/slab.h>
+#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/lwtunnel.h>
#include <net/fib_notifier.h>
+#include <net/ip_fib.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
@@ -46,6 +45,7 @@ struct fib6_cleaner {
int (*func)(struct fib6_info *, void *arg);
int sernum;
void *arg;
+ bool skip_notify;
};
#ifdef CONFIG_IPV6_SUBTREES
@@ -91,13 +91,12 @@ static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
static int fib6_new_sernum(struct net *net)
{
- int new, old;
+ int new, old = atomic_read(&net->ipv6.fib6_sernum);
do {
- old = atomic_read(&net->ipv6.fib6_sernum);
new = old < INT_MAX ? old + 1 : 1;
- } while (atomic_cmpxchg(&net->ipv6.fib6_sernum,
- old, new) != old);
+ } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new));
+
return new;
}
@@ -112,7 +111,7 @@ void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
fn = rcu_dereference_protected(f6i->fib6_node,
lockdep_is_held(&f6i->fib6_table->tb6_lock));
if (fn)
- fn->fn_sernum = fib6_new_sernum(net);
+ WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net));
}
/*
@@ -145,24 +144,23 @@ static __be32 addr_bit_set(const void *token, int fn_bit)
addr[fn_bit >> 5];
}
-struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
+struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
{
struct fib6_info *f6i;
+ size_t sz = sizeof(*f6i);
- f6i = kzalloc(sizeof(*f6i), gfp_flags);
- if (!f6i)
- return NULL;
+ if (with_fib6_nh)
+ sz += sizeof(struct fib6_nh);
- f6i->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
- if (!f6i->rt6i_pcpu) {
- kfree(f6i);
+ f6i = kzalloc(sz, gfp_flags);
+ if (!f6i)
return NULL;
- }
+ /* fib6_siblings is a union with nh_list, so this initializes both */
INIT_LIST_HEAD(&f6i->fib6_siblings);
- f6i->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
+ refcount_set(&f6i->fib6_ref, 1);
- atomic_inc(&f6i->fib6_ref);
+ INIT_HLIST_NODE(&f6i->gc_link);
return f6i;
}
@@ -170,43 +168,15 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags)
void fib6_info_destroy_rcu(struct rcu_head *head)
{
struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
- struct rt6_exception_bucket *bucket;
- struct dst_metrics *m;
WARN_ON(f6i->fib6_node);
- bucket = rcu_dereference_protected(f6i->rt6i_exception_bucket, 1);
- if (bucket) {
- f6i->rt6i_exception_bucket = NULL;
- kfree(bucket);
- }
-
- if (f6i->rt6i_pcpu) {
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct rt6_info **ppcpu_rt;
- struct rt6_info *pcpu_rt;
-
- ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
- pcpu_rt = *ppcpu_rt;
- if (pcpu_rt) {
- dst_dev_put(&pcpu_rt->dst);
- dst_release(&pcpu_rt->dst);
- *ppcpu_rt = NULL;
- }
- }
- }
-
- lwtstate_put(f6i->fib6_nh.nh_lwtstate);
-
- if (f6i->fib6_nh.nh_dev)
- dev_put(f6i->fib6_nh.nh_dev);
-
- m = f6i->fib6_metrics;
- if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
- kfree(m);
+ if (f6i->nh)
+ nexthop_put(f6i->nh);
+ else
+ fib6_nh_release(f6i->fib6_nh);
+ ip_fib_metrics_put(f6i->fib6_metrics);
kfree(f6i);
}
EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
@@ -228,16 +198,9 @@ static void node_free_immediate(struct net *net, struct fib6_node *fn)
net->ipv6.rt6_stats->fib_nodes--;
}
-static void node_free_rcu(struct rcu_head *head)
-{
- struct fib6_node *fn = container_of(head, struct fib6_node, rcu);
-
- kmem_cache_free(fib6_node_kmem, fn);
-}
-
static void node_free(struct net *net, struct fib6_node *fn)
{
- call_rcu(&fn->rcu, node_free_rcu);
+ kfree_rcu(fn, rcu);
net->ipv6.rt6_stats->fib_nodes--;
}
@@ -278,6 +241,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
net->ipv6.fib6_null_entry);
table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
inet_peer_base_init(&table->tb6_peers);
+ INIT_HLIST_HEAD(&table->tb6_gc_hlist);
}
return table;
@@ -285,40 +249,52 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
struct fib6_table *fib6_new_table(struct net *net, u32 id)
{
- struct fib6_table *tb;
+ struct fib6_table *tb, *new_tb;
if (id == 0)
id = RT6_TABLE_MAIN;
+
tb = fib6_get_table(net, id);
if (tb)
return tb;
- tb = fib6_alloc_table(net, id);
- if (tb)
- fib6_link_table(net, tb);
+ new_tb = fib6_alloc_table(net, id);
+ if (!new_tb)
+ return NULL;
+
+ spin_lock_bh(&net->ipv6.fib_table_hash_lock);
+
+ tb = fib6_get_table(net, id);
+ if (unlikely(tb)) {
+ spin_unlock_bh(&net->ipv6.fib_table_hash_lock);
+ kfree(new_tb);
+ return tb;
+ }
+
+ fib6_link_table(net, new_tb);
+
+ spin_unlock_bh(&net->ipv6.fib_table_hash_lock);
- return tb;
+ return new_tb;
}
EXPORT_SYMBOL_GPL(fib6_new_table);
struct fib6_table *fib6_get_table(struct net *net, u32 id)
{
- struct fib6_table *tb;
struct hlist_head *head;
- unsigned int h;
+ struct fib6_table *tb;
- if (id == 0)
+ if (!id)
id = RT6_TABLE_MAIN;
- h = id & (FIB6_TABLE_HASHSZ - 1);
- rcu_read_lock();
- head = &net->ipv6.fib_table_hash[h];
- hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
- if (tb->tb6_id == id) {
- rcu_read_unlock();
+
+ head = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)];
+
+ /* See comment in fib6_link_table(). RCU is not required,
+ * but rcu_dereference_raw() is used to avoid data-race.
+ */
+ hlist_for_each_entry_rcu(tb, head, tb6_hlist, true)
+ if (tb->tb6_id == id)
return tb;
- }
- }
- rcu_read_unlock();
return NULL;
}
@@ -347,21 +323,24 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
{
struct rt6_info *rt;
- rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
+ rt = pol_lookup_func(lookup,
+ net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
if (rt->dst.error == -EAGAIN) {
- ip6_rt_put(rt);
+ ip6_rt_put_flags(rt, flags);
rt = net->ipv6.ip6_null_entry;
- dst_hold(&rt->dst);
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ dst_hold(&rt->dst);
}
return &rt->dst;
}
/* called with rcu lock held; no reference taken on fib6_info */
-struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
- int flags)
+int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
+ struct fib6_result *res, int flags)
{
- return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, flags);
+ return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6,
+ res, flags);
}
static void __net_init fib6_tables_init(struct net *net)
@@ -371,85 +350,149 @@ static void __net_init fib6_tables_init(struct net *net)
#endif
-unsigned int fib6_tables_seq_read(struct net *net)
+unsigned int fib6_tables_seq_read(const struct net *net)
{
unsigned int h, fib_seq = 0;
rcu_read_lock();
for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
- struct hlist_head *head = &net->ipv6.fib_table_hash[h];
- struct fib6_table *tb;
+ const struct hlist_head *head = &net->ipv6.fib_table_hash[h];
+ const struct fib6_table *tb;
hlist_for_each_entry_rcu(tb, head, tb6_hlist)
- fib_seq += tb->fib_seq;
+ fib_seq += READ_ONCE(tb->fib_seq);
}
rcu_read_unlock();
return fib_seq;
}
-static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
+static int call_fib6_entry_notifier(struct notifier_block *nb,
enum fib_event_type event_type,
- struct fib6_info *rt)
+ struct fib6_info *rt,
+ struct netlink_ext_ack *extack)
{
struct fib6_entry_notifier_info info = {
+ .info.extack = extack,
+ .rt = rt,
+ };
+
+ return call_fib6_notifier(nb, event_type, &info.info);
+}
+
+static int call_fib6_multipath_entry_notifier(struct notifier_block *nb,
+ enum fib_event_type event_type,
+ struct fib6_info *rt,
+ unsigned int nsiblings,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_entry_notifier_info info = {
+ .info.extack = extack,
.rt = rt,
+ .nsiblings = nsiblings,
};
- return call_fib6_notifier(nb, net, event_type, &info.info);
+ return call_fib6_notifier(nb, event_type, &info.info);
}
-static int call_fib6_entry_notifiers(struct net *net,
- enum fib_event_type event_type,
- struct fib6_info *rt,
- struct netlink_ext_ack *extack)
+int call_fib6_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct fib6_info *rt,
+ struct netlink_ext_ack *extack)
{
struct fib6_entry_notifier_info info = {
.info.extack = extack,
.rt = rt,
};
- rt->fib6_table->fib_seq++;
+ WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
return call_fib6_notifiers(net, event_type, &info.info);
}
+int call_fib6_multipath_entry_notifiers(struct net *net,
+ enum fib_event_type event_type,
+ struct fib6_info *rt,
+ unsigned int nsiblings,
+ struct netlink_ext_ack *extack)
+{
+ struct fib6_entry_notifier_info info = {
+ .info.extack = extack,
+ .rt = rt,
+ .nsiblings = nsiblings,
+ };
+
+ WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
+ return call_fib6_notifiers(net, event_type, &info.info);
+}
+
+int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt)
+{
+ struct fib6_entry_notifier_info info = {
+ .rt = rt,
+ .nsiblings = rt->fib6_nsiblings,
+ };
+
+ WRITE_ONCE(rt->fib6_table->fib_seq, rt->fib6_table->fib_seq + 1);
+ return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info);
+}
+
struct fib6_dump_arg {
struct net *net;
struct notifier_block *nb;
+ struct netlink_ext_ack *extack;
};
-static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
+static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
{
- if (rt == arg->net->ipv6.fib6_null_entry)
- return;
- call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt);
+ enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE;
+ unsigned int nsiblings;
+ int err;
+
+ if (!rt || rt == arg->net->ipv6.fib6_null_entry)
+ return 0;
+
+ nsiblings = READ_ONCE(rt->fib6_nsiblings);
+ if (nsiblings)
+ err = call_fib6_multipath_entry_notifier(arg->nb, fib_event,
+ rt,
+ nsiblings,
+ arg->extack);
+ else
+ err = call_fib6_entry_notifier(arg->nb, fib_event, rt,
+ arg->extack);
+
+ return err;
}
static int fib6_node_dump(struct fib6_walker *w)
{
- struct fib6_info *rt;
+ int err;
- for_each_fib6_walker_rt(w)
- fib6_rt_dump(rt, w->args);
+ err = fib6_rt_dump(w->leaf, w->args);
w->leaf = NULL;
- return 0;
+ return err;
}
-static void fib6_table_dump(struct net *net, struct fib6_table *tb,
- struct fib6_walker *w)
+static int fib6_table_dump(struct net *net, struct fib6_table *tb,
+ struct fib6_walker *w)
{
+ int err;
+
w->root = &tb->tb6_root;
spin_lock_bh(&tb->tb6_lock);
- fib6_walk(net, w);
+ err = fib6_walk(net, w);
spin_unlock_bh(&tb->tb6_lock);
+ return err;
}
/* Called with rcu_read_lock() */
-int fib6_tables_dump(struct net *net, struct notifier_block *nb)
+int fib6_tables_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
struct fib6_dump_arg arg;
struct fib6_walker *w;
unsigned int h;
+ int err = 0;
w = kzalloc(sizeof(*w), GFP_ATOMIC);
if (!w)
@@ -458,19 +501,25 @@ int fib6_tables_dump(struct net *net, struct notifier_block *nb)
w->func = fib6_node_dump;
arg.net = net;
arg.nb = nb;
+ arg.extack = extack;
w->args = &arg;
for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
struct hlist_head *head = &net->ipv6.fib_table_hash[h];
struct fib6_table *tb;
- hlist_for_each_entry_rcu(tb, head, tb6_hlist)
- fib6_table_dump(net, tb, w);
+ hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
+ err = fib6_table_dump(net, tb, w);
+ if (err)
+ goto out;
+ }
}
+out:
kfree(w);
- return 0;
+ /* The tree traversal function should never return a positive value. */
+ return err > 0 ? -EINVAL : err;
}
static int fib6_dump_node(struct fib6_walker *w)
@@ -479,12 +528,19 @@ static int fib6_dump_node(struct fib6_walker *w)
struct fib6_info *rt;
for_each_fib6_walker_rt(w) {
- res = rt6_dump_route(rt, w->args);
- if (res < 0) {
+ res = rt6_dump_route(rt, w->args, w->skip_in_node);
+ if (res >= 0) {
/* Frame is full, suspend walking */
w->leaf = rt;
+
+ /* We'll restart from this node, so if some routes were
+ * already dumped, skip them next time.
+ */
+ w->skip_in_node += res;
+
return 1;
}
+ w->skip_in_node = 0;
/* Multipath routes are dumped in one route with the
* RTA_MULTIPATH attribute. Jump 'rt' to point to the
@@ -536,21 +592,24 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
if (cb->args[4] == 0) {
w->count = 0;
w->skip = 0;
+ w->skip_in_node = 0;
spin_lock_bh(&table->tb6_lock);
res = fib6_walk(net, w);
spin_unlock_bh(&table->tb6_lock);
if (res > 0) {
cb->args[4] = 1;
- cb->args[5] = w->root->fn_sernum;
+ cb->args[5] = READ_ONCE(w->root->fn_sernum);
}
} else {
- if (cb->args[5] != w->root->fn_sernum) {
+ int sernum = READ_ONCE(w->root->fn_sernum);
+ if (cb->args[5] != sernum) {
/* Begin at the root if the tree changed */
- cb->args[5] = w->root->fn_sernum;
+ cb->args[5] = sernum;
w->state = FWS_INIT;
w->node = w->root;
w->skip = w->count;
+ w->skip_in_node = 0;
} else
w->skip = 0;
@@ -568,35 +627,51 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct rt6_rtnl_dump_arg arg = {
+ .filter.dump_exceptions = true,
+ .filter.dump_routes = true,
+ .filter.rtnl_held = false,
+ };
+ const struct nlmsghdr *nlh = cb->nlh;
struct net *net = sock_net(skb->sk);
- unsigned int h, s_h;
unsigned int e = 0, s_e;
- struct rt6_rtnl_dump_arg arg;
+ struct hlist_head *head;
struct fib6_walker *w;
struct fib6_table *tb;
- struct hlist_head *head;
- int res = 0;
+ unsigned int h, s_h;
+ int err = 0;
- s_h = cb->args[0];
- s_e = cb->args[1];
+ rcu_read_lock();
+ if (cb->strict_check) {
+ err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb);
+ if (err < 0)
+ goto unlock;
+ } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
+ struct rtmsg *rtm = nlmsg_data(nlh);
+
+ if (rtm->rtm_flags & RTM_F_PREFIX)
+ arg.filter.flags = RTM_F_PREFIX;
+ }
w = (void *)cb->args[2];
if (!w) {
/* New dump:
*
- * 1. hook callback destructor.
- */
- cb->args[3] = (long)cb->done;
- cb->done = fib6_dump_done;
-
- /*
- * 2. allocate and initialize walker.
+ * 1. allocate and initialize walker.
*/
w = kzalloc(sizeof(*w), GFP_ATOMIC);
- if (!w)
- return -ENOMEM;
+ if (!w) {
+ err = -ENOMEM;
+ goto unlock;
+ }
w->func = fib6_dump_node;
cb->args[2] = (long)w;
+
+ /* 2. hook callback destructor.
+ */
+ cb->args[3] = (long)cb->done;
+ cb->done = fib6_dump_done;
+
}
arg.skb = skb;
@@ -604,29 +679,50 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
arg.net = net;
w->args = &arg;
- rcu_read_lock();
+ if (arg.filter.table_id) {
+ tb = fib6_get_table(net, arg.filter.table_id);
+ if (!tb) {
+ if (rtnl_msg_family(cb->nlh) != PF_INET6)
+ goto unlock;
+
+ NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
+ err = -ENOENT;
+ goto unlock;
+ }
+
+ if (!cb->args[0]) {
+ err = fib6_dump_table(tb, skb, cb);
+ if (!err)
+ cb->args[0] = 1;
+ }
+ goto unlock;
+ }
+
+ s_h = cb->args[0];
+ s_e = cb->args[1];
+
for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
e = 0;
head = &net->ipv6.fib_table_hash[h];
hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
if (e < s_e)
goto next;
- res = fib6_dump_table(tb, skb, cb);
- if (res != 0)
+ err = fib6_dump_table(tb, skb, cb);
+ if (err != 0)
goto out;
next:
e++;
}
}
out:
- rcu_read_unlock();
cb->args[1] = e;
cb->args[0] = h;
- res = res < 0 ? res : skb->len;
- if (res <= 0)
+unlock:
+ rcu_read_unlock();
+ if (err <= 0)
fib6_dump_end(cb);
- return res;
+ return err;
}
void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
@@ -669,8 +765,6 @@ static struct fib6_node *fib6_add_1(struct net *net,
int bit;
__be32 dir = 0;
- RT6_TRACE("fib6_add_1\n");
-
/* insert node in tree */
fn = root;
@@ -817,8 +911,8 @@ insert_above:
RCU_INIT_POINTER(in->parent, pn);
in->leaf = fn->leaf;
- atomic_inc(&rcu_dereference_protected(in->leaf,
- lockdep_is_held(&table->tb6_lock))->fib6_ref);
+ fib6_info_hold(rcu_dereference_protected(in->leaf,
+ lockdep_is_held(&table->tb6_lock)));
/* update parent pointer */
if (dir)
@@ -870,11 +964,15 @@ insert_above:
return ln;
}
-static void fib6_drop_pcpu_from(struct fib6_info *f6i,
- const struct fib6_table *table)
+static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
+ const struct fib6_info *match)
{
int cpu;
+ if (!fib6_nh->rt6i_pcpu)
+ return;
+
+ rcu_read_lock();
/* release the reference to this fib entry from
* all of its cached pcpu routes
*/
@@ -882,17 +980,52 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i,
struct rt6_info **ppcpu_rt;
struct rt6_info *pcpu_rt;
- ppcpu_rt = per_cpu_ptr(f6i->rt6i_pcpu, cpu);
- pcpu_rt = *ppcpu_rt;
- if (pcpu_rt) {
+ ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
+
+ /* Paired with xchg() in rt6_get_pcpu_route() */
+ pcpu_rt = READ_ONCE(*ppcpu_rt);
+
+ /* only dropping the 'from' reference if the cached route
+ * is using 'match'. The cached pcpu_rt->from only changes
+ * from a fib6_info to NULL (ip6_dst_destroy); it can never
+ * change from one fib6_info reference to another
+ */
+ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
struct fib6_info *from;
- from = rcu_dereference_protected(pcpu_rt->from,
- lockdep_is_held(&table->tb6_lock));
- rcu_assign_pointer(pcpu_rt->from, NULL);
+ from = unrcu_pointer(xchg(&pcpu_rt->from, NULL));
fib6_info_release(from);
}
}
+ rcu_read_unlock();
+}
+
+static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_info *arg = _arg;
+
+ __fib6_drop_pcpu_from(nh, arg);
+ return 0;
+}
+
+static void fib6_drop_pcpu_from(struct fib6_info *f6i)
+{
+ /* Make sure rt6_make_pcpu_route() won't add other percpu routes
+ * while we are cleaning them here.
+ */
+ f6i->fib6_destroying = 1;
+ mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
+
+ if (f6i->nh) {
+ rcu_read_lock();
+ nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, f6i);
+ rcu_read_unlock();
+ } else {
+ struct fib6_nh *fib6_nh;
+
+ fib6_nh = f6i->fib6_nh;
+ __fib6_drop_pcpu_from(fib6_nh, f6i);
+ }
}
static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
@@ -900,7 +1033,20 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
{
struct fib6_table *table = rt->fib6_table;
- if (atomic_read(&rt->fib6_ref) != 1) {
+ /* Flush all cached dst in exception table */
+ rt6_flush_exceptions(rt);
+ fib6_drop_pcpu_from(rt);
+
+ if (rt->nh) {
+ spin_lock(&rt->nh->lock);
+
+ if (!list_empty(&rt->nh_list))
+ list_del_init(&rt->nh_list);
+
+ spin_unlock(&rt->nh->lock);
+ }
+
+ if (refcount_read(&rt->fib6_ref) != 1) {
/* This route is used as dummy address holder in some split
* nodes. It is not leaked, but it still holds other resources,
* which must be released in time. So, scan ascendant nodes
@@ -913,7 +1059,7 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
struct fib6_info *new_leaf;
if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
new_leaf = fib6_find_prefix(net, table, fn);
- atomic_inc(&new_leaf->fib6_ref);
+ fib6_info_hold(new_leaf);
rcu_assign_pointer(fn->leaf, new_leaf);
fib6_info_release(rt);
@@ -921,10 +1067,10 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
fn = rcu_dereference_protected(fn->parent,
lockdep_is_held(&table->tb6_lock));
}
-
- if (rt->rt6i_pcpu)
- fib6_drop_pcpu_from(rt, table);
}
+
+ fib6_clean_expires(rt);
+ fib6_remove_gc_list(rt);
}
/*
@@ -932,8 +1078,8 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
*/
static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
- struct nl_info *info,
- struct netlink_ext_ack *extack)
+ struct nl_info *info, struct netlink_ext_ack *extack,
+ struct list_head *purge_list)
{
struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
lockdep_is_held(&rt->fib6_table->tb6_lock));
@@ -946,6 +1092,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
(info->nlh->nlmsg_flags & NLM_F_CREATE));
int found = 0;
bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
+ bool notify_sibling_rt = false;
u16 nlflags = NLM_F_EXCL;
int err;
@@ -975,20 +1122,26 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
found++;
break;
}
- if (rt_can_ecmp)
- fallback_ins = fallback_ins ?: ins;
+ fallback_ins = fallback_ins ?: ins;
goto next_iter;
}
if (rt6_duplicate_nexthop(iter, rt)) {
if (rt->fib6_nsiblings)
- rt->fib6_nsiblings = 0;
+ WRITE_ONCE(rt->fib6_nsiblings, 0);
if (!(iter->fib6_flags & RTF_EXPIRES))
return -EEXIST;
- if (!(rt->fib6_flags & RTF_EXPIRES))
+ if (!(rt->fib6_flags & RTF_EXPIRES)) {
fib6_clean_expires(iter);
- else
+ fib6_remove_gc_list(iter);
+ } else {
fib6_set_expires(iter, rt->expires);
+ fib6_add_gc_list(iter);
+ }
+ if (!(rt->fib6_flags & (RTF_ADDRCONF | RTF_PREFIX_RT))) {
+ iter->fib6_flags &= ~RTF_ADDRCONF;
+ iter->fib6_flags &= ~RTF_PREFIX_RT;
+ }
if (rt->fib6_pmtu)
fib6_metric_set(iter, RTAX_MTU,
@@ -1008,7 +1161,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
*/
if (rt_can_ecmp &&
rt6_qualify_for_ecmp(iter))
- rt->fib6_nsiblings++;
+ WRITE_ONCE(rt->fib6_nsiblings,
+ rt->fib6_nsiblings + 1);
}
if (iter->fib6_metric > rt->fib6_metric)
@@ -1019,7 +1173,9 @@ next_iter:
}
if (fallback_ins && !found) {
- /* No ECMP-able route found, replace first non-ECMP one */
+ /* No matching route with same ecmp-able-ness found, replace
+ * first matching route
+ */
ins = fallback_ins;
iter = rcu_dereference_protected(*ins,
lockdep_is_held(&rt->fib6_table->tb6_lock));
@@ -1037,15 +1193,17 @@ next_iter:
/* Find the first route that have the same metric */
sibling = leaf;
+ notify_sibling_rt = true;
while (sibling) {
if (sibling->fib6_metric == rt->fib6_metric &&
rt6_qualify_for_ecmp(sibling)) {
- list_add_tail(&rt->fib6_siblings,
- &sibling->fib6_siblings);
+ list_add_tail_rcu(&rt->fib6_siblings,
+ &sibling->fib6_siblings);
break;
}
sibling = rcu_dereference_protected(sibling->fib6_next,
lockdep_is_held(&rt->fib6_table->tb6_lock));
+ notify_sibling_rt = false;
}
/* For each sibling in the list, increment the counter of
* siblings. BUG() if counters does not match, list of siblings
@@ -1054,12 +1212,15 @@ next_iter:
fib6_nsiblings = 0;
list_for_each_entry_safe(sibling, temp_sibling,
&rt->fib6_siblings, fib6_siblings) {
- sibling->fib6_nsiblings++;
+ WRITE_ONCE(sibling->fib6_nsiblings,
+ sibling->fib6_nsiblings + 1);
BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
fib6_nsiblings++;
}
BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
+ rcu_read_lock();
rt6_multipath_rebalance(temp_sibling);
+ rcu_read_unlock();
}
/*
@@ -1072,14 +1233,46 @@ next_iter:
add:
nlflags |= NLM_F_CREATE;
- err = call_fib6_entry_notifiers(info->nl_net,
- FIB_EVENT_ENTRY_ADD,
- rt, extack);
- if (err)
- return err;
+ /* The route should only be notified if it is the first
+ * route in the node or if it is added as a sibling
+ * route to the first route in the node.
+ */
+ if (!info->skip_notify_kernel &&
+ (notify_sibling_rt || ins == &fn->leaf)) {
+ enum fib_event_type fib_event;
+
+ if (notify_sibling_rt)
+ fib_event = FIB_EVENT_ENTRY_APPEND;
+ else
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+ err = call_fib6_entry_notifiers(info->nl_net,
+ fib_event, rt,
+ extack);
+ if (err) {
+ struct fib6_info *sibling, *next_sibling;
+
+ /* If the route has siblings, then it first
+ * needs to be unlinked from them.
+ */
+ if (!rt->fib6_nsiblings)
+ return err;
+
+ list_for_each_entry_safe(sibling, next_sibling,
+ &rt->fib6_siblings,
+ fib6_siblings)
+ WRITE_ONCE(sibling->fib6_nsiblings,
+ sibling->fib6_nsiblings - 1);
+ WRITE_ONCE(rt->fib6_nsiblings, 0);
+ list_del_rcu(&rt->fib6_siblings);
+ rcu_read_lock();
+ rt6_multipath_rebalance(next_sibling);
+ rcu_read_unlock();
+ return err;
+ }
+ }
rcu_assign_pointer(rt->fib6_next, iter);
- atomic_inc(&rt->fib6_ref);
+ fib6_info_hold(rt);
rcu_assign_pointer(rt->fib6_node, fn);
rcu_assign_pointer(*ins, rt);
if (!info->skip_notify)
@@ -1101,13 +1294,15 @@ add:
return -ENOENT;
}
- err = call_fib6_entry_notifiers(info->nl_net,
- FIB_EVENT_ENTRY_REPLACE,
- rt, extack);
- if (err)
- return err;
+ if (!info->skip_notify_kernel && ins == &fn->leaf) {
+ err = call_fib6_entry_notifiers(info->nl_net,
+ FIB_EVENT_ENTRY_REPLACE,
+ rt, extack);
+ if (err)
+ return err;
+ }
- atomic_inc(&rt->fib6_ref);
+ fib6_info_hold(rt);
rcu_assign_pointer(rt->fib6_node, fn);
rt->fib6_next = iter->fib6_next;
rcu_assign_pointer(*ins, rt);
@@ -1119,10 +1314,9 @@ add:
}
nsiblings = iter->fib6_nsiblings;
iter->fib6_node = NULL;
- fib6_purge_rt(iter, fn, info->nl_net);
+ list_add(&iter->purge_link, purge_list);
if (rcu_access_pointer(fn->rr_ptr) == iter)
fn->rr_ptr = NULL;
- fib6_info_release(iter);
if (nsiblings) {
/* Replacing an ECMP route, remove all siblings */
@@ -1135,10 +1329,9 @@ add:
if (rt6_qualify_for_ecmp(iter)) {
*ins = iter->fib6_next;
iter->fib6_node = NULL;
- fib6_purge_rt(iter, fn, info->nl_net);
+ list_add(&iter->purge_link, purge_list);
if (rcu_access_pointer(fn->rr_ptr) == iter)
fn->rr_ptr = NULL;
- fib6_info_release(iter);
nsiblings--;
info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
} else {
@@ -1154,6 +1347,28 @@ add:
return 0;
}
+static int fib6_add_rt2node_nh(struct fib6_node *fn, struct fib6_info *rt,
+ struct nl_info *info, struct netlink_ext_ack *extack,
+ struct list_head *purge_list)
+{
+ int err;
+
+ spin_lock(&rt->nh->lock);
+
+ if (rt->nh->dead) {
+ NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+ err = -EINVAL;
+ } else {
+ err = fib6_add_rt2node(fn, rt, info, extack, purge_list);
+ if (!err)
+ list_add(&rt->nh_list, &rt->nh->f6i_list);
+ }
+
+ spin_unlock(&rt->nh->lock);
+
+ return err;
+}
+
static void fib6_start_gc(struct net *net, struct fib6_info *rt)
{
if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
@@ -1175,10 +1390,10 @@ static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
lockdep_is_held(&rt->fib6_table->tb6_lock));
- /* paired with smp_rmb() in rt6_get_cookie_safe() */
+ /* paired with smp_rmb() in fib6_get_cookie_safe() */
smp_wmb();
while (fn) {
- fn->fn_sernum = sernum;
+ WRITE_ONCE(fn->fn_sernum, sernum);
fn = rcu_dereference_protected(fn->parent,
lockdep_is_held(&rt->fib6_table->tb6_lock));
}
@@ -1189,6 +1404,14 @@ void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
__fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
}
+/* allow ipv4 to update sernum via ipv6_stub */
+void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
+{
+ spin_lock_bh(&f6i->fib6_table->tb6_lock);
+ fib6_update_sernum_upto_root(net, f6i);
+ spin_unlock_bh(&f6i->fib6_table->tb6_lock);
+}
+
/*
* Add routing information to the routing tree.
* <destination addr>/<source addr>
@@ -1200,11 +1423,14 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
struct nl_info *info, struct netlink_ext_ack *extack)
{
struct fib6_table *table = rt->fib6_table;
- struct fib6_node *fn, *pn = NULL;
+ LIST_HEAD(purge_list);
+ struct fib6_node *fn;
+#ifdef CONFIG_IPV6_SUBTREES
+ struct fib6_node *pn = NULL;
+#endif
int err = -ENOMEM;
int allow_create = 1;
int replace_required = 0;
- int sernum = fib6_new_sernum(info->nl_net);
if (info->nlh) {
if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
@@ -1225,9 +1451,9 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
goto out;
}
+#ifdef CONFIG_IPV6_SUBTREES
pn = fn;
-#ifdef CONFIG_IPV6_SUBTREES
if (rt->fib6_src.plen) {
struct fib6_node *sn;
@@ -1249,7 +1475,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
if (!sfn)
goto failure;
- atomic_inc(&info->nl_net->ipv6.fib6_null_entry->fib6_ref);
+ fib6_info_hold(info->nl_net->ipv6.fib6_null_entry);
rcu_assign_pointer(sfn->leaf,
info->nl_net->ipv6.fib6_null_entry);
sfn->fn_flags = RTN_ROOT;
@@ -1292,7 +1518,7 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
rcu_assign_pointer(fn->leaf,
info->nl_net->ipv6.fib6_null_entry);
} else {
- atomic_inc(&rt->fib6_ref);
+ fib6_info_hold(rt);
rcu_assign_pointer(fn->leaf, rt);
}
}
@@ -1300,9 +1526,24 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
}
#endif
- err = fib6_add_rt2node(fn, rt, info, extack);
+ if (rt->nh)
+ err = fib6_add_rt2node_nh(fn, rt, info, extack, &purge_list);
+ else
+ err = fib6_add_rt2node(fn, rt, info, extack, &purge_list);
if (!err) {
- __fib6_update_sernum_upto_root(rt, sernum);
+ struct fib6_info *iter, *next;
+
+ list_for_each_entry_safe(iter, next, &purge_list, purge_link) {
+ list_del(&iter->purge_link);
+ fib6_purge_rt(iter, fn, info->nl_net);
+ fib6_info_release(iter);
+ }
+
+ __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net));
+
+ if (rt->fib6_flags & RTF_EXPIRES)
+ fib6_add_gc_list(rt);
+
fib6_start_gc(info->nl_net, rt);
}
@@ -1325,19 +1566,17 @@ out:
if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
pn_leaf = fib6_find_prefix(info->nl_net, table,
pn);
-#if RT6_DEBUG >= 2
- if (!pn_leaf) {
- WARN_ON(!pn_leaf);
+ if (!pn_leaf)
pn_leaf =
info->nl_net->ipv6.fib6_null_entry;
- }
-#endif
fib6_info_hold(pn_leaf);
rcu_assign_pointer(pn->leaf, pn_leaf);
}
}
#endif
goto failure;
+ } else if (fib6_requires_src(rt)) {
+ fib6_routes_require_src_inc(info->nl_net);
}
return err;
@@ -1507,7 +1746,8 @@ static struct fib6_node *fib6_locate_1(struct fib6_node *root,
if (plen == fn->fn_bit)
return fn;
- prev = fn;
+ if (fn->fn_flags & RTN_RTINFO)
+ prev = fn;
next:
/*
@@ -1628,7 +1868,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
lockdep_is_held(&table->tb6_lock));
struct fib6_info *new_fn_leaf;
- RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
+ pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
iter++;
WARN_ON(fn->fn_flags & RTN_RTINFO);
@@ -1637,10 +1877,14 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
children = 0;
child = NULL;
- if (fn_r)
- child = fn_r, children |= 1;
- if (fn_l)
- child = fn_l, children |= 2;
+ if (fn_r) {
+ child = fn_r;
+ children |= 1;
+ }
+ if (fn_l) {
+ child = fn_l;
+ children |= 2;
+ }
if (children == 3 || FIB6_SUBTREE(fn)
#ifdef CONFIG_IPV6_SUBTREES
@@ -1687,7 +1931,8 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
FOR_WALKERS(net, w) {
if (!child) {
if (w->node == fn) {
- RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
+ pr_debug("W %p adjusted by delnode 1, s=%d/%d\n",
+ w, w->state, nstate);
w->node = pn;
w->state = nstate;
}
@@ -1695,10 +1940,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
if (w->node == fn) {
w->node = child;
if (children&2) {
- RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
+ pr_debug("W %p adjusted by delnode 2, s=%d\n",
+ w, w->state);
w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
} else {
- RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
+ pr_debug("W %p adjusted by delnode 2, s=%d\n",
+ w, w->state);
w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
}
}
@@ -1719,12 +1966,26 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
struct fib6_info __rcu **rtp, struct nl_info *info)
{
+ struct fib6_info *leaf, *replace_rt = NULL;
struct fib6_walker *w;
struct fib6_info *rt = rcu_dereference_protected(*rtp,
lockdep_is_held(&table->tb6_lock));
struct net *net = info->nl_net;
+ bool notify_del = false;
- RT6_TRACE("fib6_del_route\n");
+ /* If the deleted route is the first in the node and it is not part of
+ * a multipath route, then we need to replace it with the next route
+ * in the node, if exists.
+ */
+ leaf = rcu_dereference_protected(fn->leaf,
+ lockdep_is_held(&table->tb6_lock));
+ if (leaf == rt && !rt->fib6_nsiblings) {
+ if (rcu_access_pointer(rt->fib6_next))
+ replace_rt = rcu_dereference_protected(rt->fib6_next,
+ lockdep_is_held(&table->tb6_lock));
+ else
+ notify_del = true;
+ }
/* Unlink it */
*rtp = rt->fib6_next;
@@ -1732,9 +1993,6 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
net->ipv6.rt6_stats->fib_rt_entries--;
net->ipv6.rt6_stats->fib_discarded_routes++;
- /* Flush all cached dst in exception table */
- rt6_flush_exceptions(rt);
-
/* Reset round-robin state, if necessary */
if (rcu_access_pointer(fn->rr_ptr) == rt)
fn->rr_ptr = NULL;
@@ -1743,11 +2001,20 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
if (rt->fib6_nsiblings) {
struct fib6_info *sibling, *next_sibling;
+ /* The route is deleted from a multipath route. If this
+ * multipath route is the first route in the node, then we need
+ * to emit a delete notification. Otherwise, we need to skip
+ * the notification.
+ */
+ if (rt->fib6_metric == leaf->fib6_metric &&
+ rt6_qualify_for_ecmp(leaf))
+ notify_del = true;
list_for_each_entry_safe(sibling, next_sibling,
&rt->fib6_siblings, fib6_siblings)
- sibling->fib6_nsiblings--;
- rt->fib6_nsiblings = 0;
- list_del_init(&rt->fib6_siblings);
+ WRITE_ONCE(sibling->fib6_nsiblings,
+ sibling->fib6_nsiblings - 1);
+ WRITE_ONCE(rt->fib6_nsiblings, 0);
+ list_del_rcu(&rt->fib6_siblings);
rt6_multipath_rebalance(next_sibling);
}
@@ -1755,7 +2022,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
read_lock(&net->ipv6.fib6_walker_lock);
FOR_WALKERS(net, w) {
if (w->state == FWS_C && w->leaf == rt) {
- RT6_TRACE("walker %p adjusted by delroute\n", w);
+ pr_debug("walker %p adjusted by delroute\n", w);
w->leaf = rcu_dereference_protected(rt->fib6_next,
lockdep_is_held(&table->tb6_lock));
if (!w->leaf)
@@ -1778,23 +2045,35 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
fib6_purge_rt(rt, fn, net);
- call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
+ if (!info->skip_notify_kernel) {
+ if (notify_del)
+ call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL,
+ rt, NULL);
+ else if (replace_rt)
+ call_fib6_entry_notifiers_replace(net, replace_rt);
+ }
if (!info->skip_notify)
inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
+
fib6_info_release(rt);
}
/* Need to own table->tb6_lock */
int fib6_del(struct fib6_info *rt, struct nl_info *info)
{
- struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
- lockdep_is_held(&rt->fib6_table->tb6_lock));
- struct fib6_table *table = rt->fib6_table;
struct net *net = info->nl_net;
struct fib6_info __rcu **rtp;
struct fib6_info __rcu **rtp_next;
+ struct fib6_table *table;
+ struct fib6_node *fn;
+
+ if (rt == net->ipv6.fib6_null_entry)
+ return -ENOENT;
- if (!fn || rt == net->ipv6.fib6_null_entry)
+ table = rt->fib6_table;
+ fn = rcu_dereference_protected(rt->fib6_node,
+ lockdep_is_held(&table->tb6_lock));
+ if (!fn)
return -ENOENT;
WARN_ON(!(fn->fn_flags & RTN_RTINFO));
@@ -1807,6 +2086,8 @@ int fib6_del(struct fib6_info *rt, struct nl_info *info)
struct fib6_info *cur = rcu_dereference_protected(*rtp,
lockdep_is_held(&table->tb6_lock));
if (rt == cur) {
+ if (fib6_requires_src(cur))
+ fib6_routes_require_src_dec(info->nl_net);
fib6_del_route(table, fn, rtp, info);
return 0;
}
@@ -1861,8 +2142,8 @@ static int fib6_walk_continue(struct fib6_walker *w)
continue;
}
w->state = FWS_L;
+ fallthrough;
#endif
- /* fall through */
case FWS_L:
left = rcu_dereference_protected(fn->left, 1);
if (left) {
@@ -1871,7 +2152,7 @@ static int fib6_walk_continue(struct fib6_walker *w)
continue;
}
w->state = FWS_R;
- /* fall through */
+ fallthrough;
case FWS_R:
right = rcu_dereference_protected(fn->right, 1);
if (right) {
@@ -1881,7 +2162,7 @@ static int fib6_walk_continue(struct fib6_walker *w)
}
w->state = FWS_C;
w->leaf = rcu_dereference_protected(fn->leaf, 1);
- /* fall through */
+ fallthrough;
case FWS_C:
if (w->leaf && fn->fn_flags & RTN_RTINFO) {
int err;
@@ -1900,7 +2181,7 @@ static int fib6_walk_continue(struct fib6_walker *w)
}
skip:
w->state = FWS_U;
- /* fall through */
+ fallthrough;
case FWS_U:
if (fn == w->root)
return 0;
@@ -1952,11 +2233,12 @@ static int fib6_clean_node(struct fib6_walker *w)
struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
struct nl_info info = {
.nl_net = c->net,
+ .skip_notify = c->skip_notify,
};
if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
- w->node->fn_sernum != c->sernum)
- w->node->fn_sernum = c->sernum;
+ READ_ONCE(w->node->fn_sernum) != c->sernum)
+ WRITE_ONCE(w->node->fn_sernum, c->sernum);
if (!c->func) {
WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
@@ -2003,7 +2285,7 @@ static int fib6_clean_node(struct fib6_walker *w)
static void fib6_clean_tree(struct net *net, struct fib6_node *root,
int (*func)(struct fib6_info *, void *arg),
- int sernum, void *arg)
+ int sernum, void *arg, bool skip_notify)
{
struct fib6_cleaner c;
@@ -2011,17 +2293,19 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
c.w.func = fib6_clean_node;
c.w.count = 0;
c.w.skip = 0;
+ c.w.skip_in_node = 0;
c.func = func;
c.sernum = sernum;
c.arg = arg;
c.net = net;
+ c.skip_notify = skip_notify;
fib6_walk(net, &c.w);
}
static void __fib6_clean_all(struct net *net,
int (*func)(struct fib6_info *, void *),
- int sernum, void *arg)
+ int sernum, void *arg, bool skip_notify)
{
struct fib6_table *table;
struct hlist_head *head;
@@ -2033,7 +2317,7 @@ static void __fib6_clean_all(struct net *net,
hlist_for_each_entry_rcu(table, head, tb6_hlist) {
spin_lock_bh(&table->tb6_lock);
fib6_clean_tree(net, &table->tb6_root,
- func, sernum, arg);
+ func, sernum, arg, skip_notify);
spin_unlock_bh(&table->tb6_lock);
}
}
@@ -2043,23 +2327,29 @@ static void __fib6_clean_all(struct net *net,
void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
void *arg)
{
- __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg);
+ __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
+}
+
+void fib6_clean_all_skip_notify(struct net *net,
+ int (*func)(struct fib6_info *, void *),
+ void *arg)
+{
+ __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true);
}
static void fib6_flush_trees(struct net *net)
{
int new_sernum = fib6_new_sernum(net);
- __fib6_clean_all(net, NULL, new_sernum, NULL);
+ __fib6_clean_all(net, NULL, new_sernum, NULL, false);
}
/*
* Garbage collection
*/
-static int fib6_age(struct fib6_info *rt, void *arg)
+static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args)
{
- struct fib6_gc_args *gc_args = arg;
unsigned long now = jiffies;
/*
@@ -2069,7 +2359,7 @@ static int fib6_age(struct fib6_info *rt, void *arg)
if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
if (time_after(now, rt->expires)) {
- RT6_TRACE("expiring %p\n", rt);
+ pr_debug("expiring %p\n", rt);
return -1;
}
gc_args->more++;
@@ -2084,6 +2374,42 @@ static int fib6_age(struct fib6_info *rt, void *arg)
return 0;
}
+static void fib6_gc_table(struct net *net,
+ struct fib6_table *tb6,
+ struct fib6_gc_args *gc_args)
+{
+ struct fib6_info *rt;
+ struct hlist_node *n;
+ struct nl_info info = {
+ .nl_net = net,
+ .skip_notify = false,
+ };
+
+ hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link)
+ if (fib6_age(rt, gc_args) == -1)
+ fib6_del(rt, &info);
+}
+
+static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args)
+{
+ struct fib6_table *table;
+ struct hlist_head *head;
+ unsigned int h;
+
+ rcu_read_lock();
+ for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
+ head = &net->ipv6.fib_table_hash[h];
+ hlist_for_each_entry_rcu(table, head, tb6_hlist) {
+ spin_lock_bh(&table->tb6_lock);
+
+ fib6_gc_table(net, table, gc_args);
+
+ spin_unlock_bh(&table->tb6_lock);
+ }
+ }
+ rcu_read_unlock();
+}
+
void fib6_run_gc(unsigned long expires, struct net *net, bool force)
{
struct fib6_gc_args gc_args;
@@ -2099,7 +2425,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force)
net->ipv6.sysctl.ip6_rt_gc_interval;
gc_args.more = 0;
- fib6_clean_all(net, fib6_age, &gc_args);
+ fib6_gc_all(net, &gc_args);
now = jiffies;
net->ipv6.ip6_rt_last_gc = now;
@@ -2108,13 +2434,13 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force)
round_jiffies(now
+ net->ipv6.sysctl.ip6_rt_gc_interval));
else
- del_timer(&net->ipv6.ip6_fib_timer);
+ timer_delete(&net->ipv6.ip6_fib_timer);
spin_unlock_bh(&net->ipv6.fib6_gc_lock);
}
static void fib6_gc_timer_cb(struct timer_list *t)
{
- struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer);
+ struct net *arg = timer_container_of(arg, t, ipv6.ip6_fib_timer);
fib6_run_gc(0, arg, true);
}
@@ -2128,6 +2454,10 @@ static int __net_init fib6_net_init(struct net *net)
if (err)
return err;
+ /* Default to 3-tuple */
+ net->ipv6.sysctl.multipath_hash_fields =
+ FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK;
+
spin_lock_init(&net->ipv6.fib6_gc_lock);
rwlock_init(&net->ipv6.fib6_walker_lock);
INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
@@ -2135,7 +2465,7 @@ static int __net_init fib6_net_init(struct net *net)
net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
if (!net->ipv6.rt6_stats)
- goto out_timer;
+ goto out_notifier;
/* Avoid false sharing : Use at least a full cache line */
size = max_t(size_t, size, L1_CACHE_BYTES);
@@ -2144,6 +2474,8 @@ static int __net_init fib6_net_init(struct net *net)
if (!net->ipv6.fib_table_hash)
goto out_rt6_stats;
+ spin_lock_init(&net->ipv6.fib_table_hash_lock);
+
net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
GFP_KERNEL);
if (!net->ipv6.fib6_main_tbl)
@@ -2155,6 +2487,7 @@ static int __net_init fib6_net_init(struct net *net)
net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
+ INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
@@ -2167,6 +2500,7 @@ static int __net_init fib6_net_init(struct net *net)
net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
+ INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist);
#endif
fib6_tables_init(net);
@@ -2180,7 +2514,7 @@ out_fib_table_hash:
kfree(net->ipv6.fib_table_hash);
out_rt6_stats:
kfree(net->ipv6.rt6_stats);
-out_timer:
+out_notifier:
fib6_notifier_exit(net);
return -ENOMEM;
}
@@ -2189,7 +2523,7 @@ static void fib6_net_exit(struct net *net)
{
unsigned int i;
- del_timer_sync(&net->ipv6.ip6_fib_timer);
+ timer_delete_sync(&net->ipv6.ip6_fib_timer);
for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
struct hlist_head *head = &net->ipv6.fib_table_hash[i];
@@ -2212,14 +2546,18 @@ static struct pernet_operations fib6_net_ops = {
.exit = fib6_net_exit,
};
+static const struct rtnl_msg_handler fib6_rtnl_msg_handlers[] __initconst_or_module = {
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE,
+ .dumpit = inet6_dump_fib,
+ .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
+};
+
int __init fib6_init(void)
{
int ret = -ENOMEM;
- fib6_node_kmem = kmem_cache_create("fib6_nodes",
- sizeof(struct fib6_node),
- 0, SLAB_HWCACHE_ALIGN,
- NULL);
+ fib6_node_kmem = KMEM_CACHE(fib6_node,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT);
if (!fib6_node_kmem)
goto out;
@@ -2227,8 +2565,7 @@ int __init fib6_init(void)
if (ret)
goto out_kmem_cache_create;
- ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL,
- inet6_dump_fib, 0);
+ ret = rtnl_register_many(fib6_rtnl_msg_handlers);
if (ret)
goto out_unregister_subsys;
@@ -2250,12 +2587,17 @@ void fib6_gc_cleanup(void)
}
#ifdef CONFIG_PROC_FS
-static int ipv6_route_seq_show(struct seq_file *seq, void *v)
+static int ipv6_route_native_seq_show(struct seq_file *seq, void *v)
{
struct fib6_info *rt = v;
struct ipv6_route_iter *iter = seq->private;
+ struct fib6_nh *fib6_nh = rt->fib6_nh;
+ unsigned int flags = rt->fib6_flags;
const struct net_device *dev;
+ if (rt->nh)
+ fib6_nh = nexthop_fib6_nh(rt->nh);
+
seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
#ifdef CONFIG_IPV6_SUBTREES
@@ -2263,15 +2605,17 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
#else
seq_puts(seq, "00000000000000000000000000000000 00 ");
#endif
- if (rt->fib6_flags & RTF_GATEWAY)
- seq_printf(seq, "%pi6", &rt->fib6_nh.nh_gw);
- else
+ if (fib6_nh->fib_nh_gw_family) {
+ flags |= RTF_GATEWAY;
+ seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
+ } else {
seq_puts(seq, "00000000000000000000000000000000");
+ }
- dev = rt->fib6_nh.nh_dev;
+ dev = fib6_nh->fib_nh_dev;
seq_printf(seq, " %08x %08x %08x %08x %8s\n",
- rt->fib6_metric, atomic_read(&rt->fib6_ref), 0,
- rt->fib6_flags, dev ? dev->name : "");
+ rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
+ flags, dev ? dev->name : "");
iter->w.leaf = NULL;
return 0;
}
@@ -2304,7 +2648,7 @@ static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
iter->w.state = FWS_INIT;
iter->w.node = iter->w.root;
iter->w.args = iter;
- iter->sernum = iter->w.root->fn_sernum;
+ iter->sernum = READ_ONCE(iter->w.root->fn_sernum);
INIT_LIST_HEAD(&iter->w.lh);
fib6_walker_link(net, &iter->w);
}
@@ -2317,14 +2661,14 @@ static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
if (tbl) {
h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
- node = rcu_dereference_bh(hlist_next_rcu(&tbl->tb6_hlist));
+ node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist));
} else {
h = 0;
node = NULL;
}
while (!node && h < FIB6_TABLE_HASHSZ) {
- node = rcu_dereference_bh(
+ node = rcu_dereference(
hlist_first_rcu(&net->ipv6.fib_table_hash[h++]));
}
return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
@@ -2332,8 +2676,10 @@ static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
{
- if (iter->sernum != iter->w.root->fn_sernum) {
- iter->sernum = iter->w.root->fn_sernum;
+ int sernum = READ_ONCE(iter->w.root->fn_sernum);
+
+ if (iter->sernum != sernum) {
+ iter->sernum = sernum;
iter->w.state = FWS_INIT;
iter->w.node = iter->w.root;
WARN_ON(iter->w.skip);
@@ -2348,14 +2694,13 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct net *net = seq_file_net(seq);
struct ipv6_route_iter *iter = seq->private;
+ ++(*pos);
if (!v)
goto iter_table;
- n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next);
- if (n) {
- ++*pos;
+ n = rcu_dereference(((struct fib6_info *)v)->fib6_next);
+ if (n)
return n;
- }
iter_table:
ipv6_route_check_sernum(iter);
@@ -2363,8 +2708,6 @@ iter_table:
r = fib6_walk_continue(&iter->w);
spin_unlock_bh(&iter->tbl->tb6_lock);
if (r > 0) {
- if (v)
- ++*pos;
return iter->w.leaf;
} else if (r < 0) {
fib6_walker_unlink(net, &iter->w);
@@ -2381,18 +2724,20 @@ iter_table:
}
static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(RCU_BH)
+ __acquires(RCU)
{
struct net *net = seq_file_net(seq);
struct ipv6_route_iter *iter = seq->private;
- rcu_read_lock_bh();
+ rcu_read_lock();
iter->tbl = ipv6_route_seq_next_table(NULL, net);
iter->skip = *pos;
if (iter->tbl) {
+ loff_t p = 0;
+
ipv6_route_seq_setup_walk(iter, net);
- return ipv6_route_seq_next(seq, NULL, pos);
+ return ipv6_route_seq_next(seq, NULL, &p);
} else {
return NULL;
}
@@ -2404,8 +2749,8 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
return w->node && !(w->state == FWS_U && w->node == w->root);
}
-static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
- __releases(RCU_BH)
+static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v)
+ __releases(RCU)
{
struct net *net = seq_file_net(seq);
struct ipv6_route_iter *iter = seq->private;
@@ -2413,9 +2758,65 @@ static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
if (ipv6_route_iter_active(iter))
fib6_walker_unlink(net, &iter->w);
- rcu_read_unlock_bh();
+ rcu_read_unlock();
}
+#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL)
+static int ipv6_route_prog_seq_show(struct bpf_prog *prog,
+ struct bpf_iter_meta *meta,
+ void *v)
+{
+ struct bpf_iter__ipv6_route ctx;
+
+ ctx.meta = meta;
+ ctx.rt = v;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int ipv6_route_seq_show(struct seq_file *seq, void *v)
+{
+ struct ipv6_route_iter *iter = seq->private;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+ int ret;
+
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, false);
+ if (!prog)
+ return ipv6_route_native_seq_show(seq, v);
+
+ ret = ipv6_route_prog_seq_show(prog, &meta, v);
+ iter->w.leaf = NULL;
+
+ return ret;
+}
+
+static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
+{
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ if (!v) {
+ meta.seq = seq;
+ prog = bpf_iter_get_info(&meta, true);
+ if (prog)
+ (void)ipv6_route_prog_seq_show(prog, &meta, v);
+ }
+
+ ipv6_route_native_seq_stop(seq, v);
+}
+#else
+static int ipv6_route_seq_show(struct seq_file *seq, void *v)
+{
+ return ipv6_route_native_seq_show(seq, v);
+}
+
+static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
+{
+ ipv6_route_native_seq_stop(seq, v);
+}
+#endif
+
const struct seq_operations ipv6_route_seq_ops = {
.start = ipv6_route_seq_start,
.next = ipv6_route_seq_next,
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index cb54a8a3c273..60d0be47a9f3 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* ip6_flowlabel.c IPv6 flowlabel manager.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
*/
@@ -21,6 +17,7 @@
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/pid_namespace.h>
+#include <linux/jump_label_ratelimit.h>
#include <net/net_namespace.h>
#include <net/sock.h>
@@ -57,19 +54,22 @@ static DEFINE_SPINLOCK(ip6_fl_lock);
static DEFINE_SPINLOCK(ip6_sk_fl_lock);
+DEFINE_STATIC_KEY_DEFERRED_FALSE(ipv6_flowlabel_exclusive, HZ);
+EXPORT_SYMBOL(ipv6_flowlabel_exclusive);
+
#define for_each_fl_rcu(hash, fl) \
- for (fl = rcu_dereference_bh(fl_ht[(hash)]); \
+ for (fl = rcu_dereference(fl_ht[(hash)]); \
fl != NULL; \
- fl = rcu_dereference_bh(fl->next))
+ fl = rcu_dereference(fl->next))
#define for_each_fl_continue_rcu(fl) \
- for (fl = rcu_dereference_bh(fl->next); \
+ for (fl = rcu_dereference(fl->next); \
fl != NULL; \
- fl = rcu_dereference_bh(fl->next))
+ fl = rcu_dereference(fl->next))
-#define for_each_sk_fl_rcu(np, sfl) \
- for (sfl = rcu_dereference_bh(np->ipv6_fl_list); \
+#define for_each_sk_fl_rcu(sk, sfl) \
+ for (sfl = rcu_dereference(inet_sk(sk)->ipv6_fl_list); \
sfl != NULL; \
- sfl = rcu_dereference_bh(sfl->next))
+ sfl = rcu_dereference(sfl->next))
static inline struct ip6_flowlabel *__fl_lookup(struct net *net, __be32 label)
{
@@ -86,23 +86,41 @@ static struct ip6_flowlabel *fl_lookup(struct net *net, __be32 label)
{
struct ip6_flowlabel *fl;
- rcu_read_lock_bh();
+ rcu_read_lock();
fl = __fl_lookup(net, label);
if (fl && !atomic_inc_not_zero(&fl->users))
fl = NULL;
- rcu_read_unlock_bh();
+ rcu_read_unlock();
return fl;
}
+static bool fl_shared_exclusive(struct ip6_flowlabel *fl)
+{
+ return fl->share == IPV6_FL_S_EXCL ||
+ fl->share == IPV6_FL_S_PROCESS ||
+ fl->share == IPV6_FL_S_USER;
+}
+
+static void fl_free_rcu(struct rcu_head *head)
+{
+ struct ip6_flowlabel *fl = container_of(head, struct ip6_flowlabel, rcu);
+
+ if (fl->share == IPV6_FL_S_PROCESS)
+ put_pid(fl->owner.pid);
+ kfree(fl->opt);
+ kfree(fl);
+}
+
static void fl_free(struct ip6_flowlabel *fl)
{
- if (fl) {
- if (fl->share == IPV6_FL_S_PROCESS)
- put_pid(fl->owner.pid);
- kfree(fl->opt);
- kfree_rcu(fl, rcu);
- }
+ if (!fl)
+ return;
+
+ if (fl_shared_exclusive(fl) || fl->opt)
+ static_branch_slow_dec_deferred(&ipv6_flowlabel_exclusive);
+
+ call_rcu(&fl->rcu, fl_free_rcu);
}
static void fl_release(struct ip6_flowlabel *fl)
@@ -199,10 +217,11 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
fl->label = label & IPV6_FLOWLABEL_MASK;
+ rcu_read_lock();
spin_lock_bh(&ip6_fl_lock);
if (label == 0) {
for (;;) {
- fl->label = htonl(prandom_u32())&IPV6_FLOWLABEL_MASK;
+ fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK;
if (fl->label) {
lfl = __fl_lookup(net, fl->label);
if (!lfl)
@@ -222,6 +241,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
if (lfl) {
atomic_inc(&lfl->users);
spin_unlock_bh(&ip6_fl_lock);
+ rcu_read_unlock();
return lfl;
}
}
@@ -231,6 +251,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
rcu_assign_pointer(fl_ht[FL_HASH(fl->label)], fl);
atomic_inc(&fl_size);
spin_unlock_bh(&ip6_fl_lock);
+ rcu_read_unlock();
return NULL;
}
@@ -238,40 +259,39 @@ static struct ip6_flowlabel *fl_intern(struct net *net,
/* Socket flowlabel lists */
-struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label)
+struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label)
{
struct ipv6_fl_socklist *sfl;
- struct ipv6_pinfo *np = inet6_sk(sk);
label &= IPV6_FLOWLABEL_MASK;
- rcu_read_lock_bh();
- for_each_sk_fl_rcu(np, sfl) {
+ rcu_read_lock();
+ for_each_sk_fl_rcu(sk, sfl) {
struct ip6_flowlabel *fl = sfl->fl;
- if (fl->label == label) {
+
+ if (fl->label == label && atomic_inc_not_zero(&fl->users)) {
fl->lastuse = jiffies;
- atomic_inc(&fl->users);
- rcu_read_unlock_bh();
+ rcu_read_unlock();
return fl;
}
}
- rcu_read_unlock_bh();
+ rcu_read_unlock();
return NULL;
}
-EXPORT_SYMBOL_GPL(fl6_sock_lookup);
+EXPORT_SYMBOL_GPL(__fl6_sock_lookup);
void fl6_free_socklist(struct sock *sk)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
struct ipv6_fl_socklist *sfl;
- if (!rcu_access_pointer(np->ipv6_fl_list))
+ if (!rcu_access_pointer(inet->ipv6_fl_list))
return;
spin_lock_bh(&ip6_sk_fl_lock);
- while ((sfl = rcu_dereference_protected(np->ipv6_fl_list,
+ while ((sfl = rcu_dereference_protected(inet->ipv6_fl_list,
lockdep_is_held(&ip6_sk_fl_lock))) != NULL) {
- np->ipv6_fl_list = sfl->next;
+ inet->ipv6_fl_list = sfl->next;
spin_unlock_bh(&ip6_sk_fl_lock);
fl_release(sfl->fl);
@@ -353,7 +373,7 @@ static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned lo
static struct ip6_flowlabel *
fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
- char __user *optval, int optlen, int *err_p)
+ sockptr_t optval, int optlen, int *err_p)
{
struct ip6_flowlabel *fl = NULL;
int olen;
@@ -383,7 +403,8 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
memset(fl->opt, 0, sizeof(*fl->opt));
fl->opt->tot_len = sizeof(*fl->opt) + olen;
err = -EFAULT;
- if (copy_from_user(fl->opt+1, optval+CMSG_ALIGN(sizeof(*freq)), olen))
+ if (copy_from_sockptr_offset(fl->opt + 1, optval,
+ CMSG_ALIGN(sizeof(*freq)), olen))
goto done;
msg.msg_controllen = olen;
@@ -431,28 +452,34 @@ fl_create(struct net *net, struct sock *sk, struct in6_flowlabel_req *freq,
err = -EINVAL;
goto done;
}
+ if (fl_shared_exclusive(fl) || fl->opt) {
+ WRITE_ONCE(sock_net(sk)->ipv6.flowlabel_has_excl, 1);
+ static_branch_deferred_inc(&ipv6_flowlabel_exclusive);
+ }
return fl;
done:
- fl_free(fl);
+ if (fl) {
+ kfree(fl->opt);
+ kfree(fl);
+ }
*err_p = err;
return NULL;
}
static int mem_check(struct sock *sk)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct ipv6_fl_socklist *sfl;
int room = FL_MAX_SIZE - atomic_read(&fl_size);
+ struct ipv6_fl_socklist *sfl;
int count = 0;
if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK)
return 0;
- rcu_read_lock_bh();
- for_each_sk_fl_rcu(np, sfl)
+ rcu_read_lock();
+ for_each_sk_fl_rcu(sk, sfl)
count++;
- rcu_read_unlock_bh();
+ rcu_read_unlock();
if (room <= 0 ||
((count >= FL_MAX_PER_SOCK ||
@@ -463,13 +490,15 @@ static int mem_check(struct sock *sk)
return 0;
}
-static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl,
- struct ip6_flowlabel *fl)
+static inline void fl_link(struct sock *sk, struct ipv6_fl_socklist *sfl,
+ struct ip6_flowlabel *fl)
{
+ struct inet_sock *inet = inet_sk(sk);
+
spin_lock_bh(&ip6_sk_fl_lock);
sfl->fl = fl;
- sfl->next = np->ipv6_fl_list;
- rcu_assign_pointer(np->ipv6_fl_list, sfl);
+ sfl->next = inet->ipv6_fl_list;
+ rcu_assign_pointer(inet->ipv6_fl_list, sfl);
spin_unlock_bh(&ip6_sk_fl_lock);
}
@@ -484,14 +513,14 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
return 0;
}
- if (np->repflow) {
+ if (inet6_test_bit(REPFLOW, sk)) {
freq->flr_label = np->flow_label;
return 0;
}
- rcu_read_lock_bh();
+ rcu_read_lock();
- for_each_sk_fl_rcu(np, sfl) {
+ for_each_sk_fl_rcu(sk, sfl) {
if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) {
spin_lock_bh(&ip6_fl_lock);
freq->flr_label = sfl->fl->label;
@@ -501,195 +530,219 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq,
freq->flr_linger = sfl->fl->linger / HZ;
spin_unlock_bh(&ip6_fl_lock);
- rcu_read_unlock_bh();
+ rcu_read_unlock();
return 0;
}
}
- rcu_read_unlock_bh();
+ rcu_read_unlock();
return -ENOENT;
}
-int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
+#define socklist_dereference(__sflp) \
+ rcu_dereference_protected(__sflp, lockdep_is_held(&ip6_sk_fl_lock))
+
+static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq)
{
- int uninitialized_var(err);
- struct net *net = sock_net(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
- struct in6_flowlabel_req freq;
- struct ipv6_fl_socklist *sfl1 = NULL;
- struct ipv6_fl_socklist *sfl;
struct ipv6_fl_socklist __rcu **sflp;
- struct ip6_flowlabel *fl, *fl1 = NULL;
+ struct ipv6_fl_socklist *sfl;
+ if (freq->flr_flags & IPV6_FL_F_REFLECT) {
+ if (sk->sk_protocol != IPPROTO_TCP)
+ return -ENOPROTOOPT;
+ if (!inet6_test_bit(REPFLOW, sk))
+ return -ESRCH;
+ np->flow_label = 0;
+ inet6_clear_bit(REPFLOW, sk);
+ return 0;
+ }
- if (optlen < sizeof(freq))
- return -EINVAL;
+ spin_lock_bh(&ip6_sk_fl_lock);
+ for (sflp = &inet_sk(sk)->ipv6_fl_list;
+ (sfl = socklist_dereference(*sflp)) != NULL;
+ sflp = &sfl->next) {
+ if (sfl->fl->label == freq->flr_label)
+ goto found;
+ }
+ spin_unlock_bh(&ip6_sk_fl_lock);
+ return -ESRCH;
+found:
+ if (freq->flr_label == (np->flow_label & IPV6_FLOWLABEL_MASK))
+ np->flow_label &= ~IPV6_FLOWLABEL_MASK;
+ *sflp = sfl->next;
+ spin_unlock_bh(&ip6_sk_fl_lock);
+ fl_release(sfl->fl);
+ kfree_rcu(sfl, rcu);
+ return 0;
+}
- if (copy_from_user(&freq, optval, sizeof(freq)))
- return -EFAULT;
+static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq)
+{
+ struct net *net = sock_net(sk);
+ struct ipv6_fl_socklist *sfl;
+ int err;
- switch (freq.flr_action) {
- case IPV6_FL_A_PUT:
- if (freq.flr_flags & IPV6_FL_F_REFLECT) {
- if (sk->sk_protocol != IPPROTO_TCP)
- return -ENOPROTOOPT;
- if (!np->repflow)
- return -ESRCH;
- np->flow_label = 0;
- np->repflow = 0;
- return 0;
- }
- spin_lock_bh(&ip6_sk_fl_lock);
- for (sflp = &np->ipv6_fl_list;
- (sfl = rcu_dereference_protected(*sflp,
- lockdep_is_held(&ip6_sk_fl_lock))) != NULL;
- sflp = &sfl->next) {
- if (sfl->fl->label == freq.flr_label) {
- if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK))
- np->flow_label &= ~IPV6_FLOWLABEL_MASK;
- *sflp = sfl->next;
- spin_unlock_bh(&ip6_sk_fl_lock);
- fl_release(sfl->fl);
- kfree_rcu(sfl, rcu);
- return 0;
- }
+ rcu_read_lock();
+ for_each_sk_fl_rcu(sk, sfl) {
+ if (sfl->fl->label == freq->flr_label) {
+ err = fl6_renew(sfl->fl, freq->flr_linger,
+ freq->flr_expires);
+ rcu_read_unlock();
+ return err;
}
- spin_unlock_bh(&ip6_sk_fl_lock);
- return -ESRCH;
+ }
+ rcu_read_unlock();
- case IPV6_FL_A_RENEW:
- rcu_read_lock_bh();
- for_each_sk_fl_rcu(np, sfl) {
- if (sfl->fl->label == freq.flr_label) {
- err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires);
- rcu_read_unlock_bh();
- return err;
- }
- }
- rcu_read_unlock_bh();
-
- if (freq.flr_share == IPV6_FL_S_NONE &&
- ns_capable(net->user_ns, CAP_NET_ADMIN)) {
- fl = fl_lookup(net, freq.flr_label);
- if (fl) {
- err = fl6_renew(fl, freq.flr_linger, freq.flr_expires);
- fl_release(fl);
- return err;
- }
- }
- return -ESRCH;
+ if (freq->flr_share == IPV6_FL_S_NONE &&
+ ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ struct ip6_flowlabel *fl = fl_lookup(net, freq->flr_label);
- case IPV6_FL_A_GET:
- if (freq.flr_flags & IPV6_FL_F_REFLECT) {
- struct net *net = sock_net(sk);
- if (net->ipv6.sysctl.flowlabel_consistency) {
- net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n");
- return -EPERM;
- }
+ if (fl) {
+ err = fl6_renew(fl, freq->flr_linger,
+ freq->flr_expires);
+ fl_release(fl);
+ return err;
+ }
+ }
+ return -ESRCH;
+}
- if (sk->sk_protocol != IPPROTO_TCP)
- return -ENOPROTOOPT;
+static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq,
+ sockptr_t optval, int optlen)
+{
+ struct ipv6_fl_socklist *sfl, *sfl1 = NULL;
+ struct ip6_flowlabel *fl, *fl1 = NULL;
+ struct net *net = sock_net(sk);
+ int err;
- np->repflow = 1;
- return 0;
+ if (freq->flr_flags & IPV6_FL_F_REFLECT) {
+ if (net->ipv6.sysctl.flowlabel_consistency) {
+ net_info_ratelimited("Can not set IPV6_FL_F_REFLECT if flowlabel_consistency sysctl is enable\n");
+ return -EPERM;
}
- if (freq.flr_label & ~IPV6_FLOWLABEL_MASK)
- return -EINVAL;
-
- if (net->ipv6.sysctl.flowlabel_state_ranges &&
- (freq.flr_label & IPV6_FLOWLABEL_STATELESS_FLAG))
- return -ERANGE;
+ if (sk->sk_protocol != IPPROTO_TCP)
+ return -ENOPROTOOPT;
+ inet6_set_bit(REPFLOW, sk);
+ return 0;
+ }
- fl = fl_create(net, sk, &freq, optval, optlen, &err);
- if (!fl)
- return err;
- sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL);
+ if (freq->flr_label & ~IPV6_FLOWLABEL_MASK)
+ return -EINVAL;
+ if (net->ipv6.sysctl.flowlabel_state_ranges &&
+ (freq->flr_label & IPV6_FLOWLABEL_STATELESS_FLAG))
+ return -ERANGE;
- if (freq.flr_label) {
- err = -EEXIST;
- rcu_read_lock_bh();
- for_each_sk_fl_rcu(np, sfl) {
- if (sfl->fl->label == freq.flr_label) {
- if (freq.flr_flags&IPV6_FL_F_EXCL) {
- rcu_read_unlock_bh();
- goto done;
- }
- fl1 = sfl->fl;
- atomic_inc(&fl1->users);
- break;
+ fl = fl_create(net, sk, freq, optval, optlen, &err);
+ if (!fl)
+ return err;
+
+ sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL);
+
+ if (freq->flr_label) {
+ err = -EEXIST;
+ rcu_read_lock();
+ for_each_sk_fl_rcu(sk, sfl) {
+ if (sfl->fl->label == freq->flr_label) {
+ if (freq->flr_flags & IPV6_FL_F_EXCL) {
+ rcu_read_unlock();
+ goto done;
}
+ fl1 = sfl->fl;
+ if (!atomic_inc_not_zero(&fl1->users))
+ fl1 = NULL;
+ break;
}
- rcu_read_unlock_bh();
+ }
+ rcu_read_unlock();
- if (!fl1)
- fl1 = fl_lookup(net, freq.flr_label);
- if (fl1) {
+ if (!fl1)
+ fl1 = fl_lookup(net, freq->flr_label);
+ if (fl1) {
recheck:
- err = -EEXIST;
- if (freq.flr_flags&IPV6_FL_F_EXCL)
- goto release;
- err = -EPERM;
- if (fl1->share == IPV6_FL_S_EXCL ||
- fl1->share != fl->share ||
- ((fl1->share == IPV6_FL_S_PROCESS) &&
- (fl1->owner.pid == fl->owner.pid)) ||
- ((fl1->share == IPV6_FL_S_USER) &&
- uid_eq(fl1->owner.uid, fl->owner.uid)))
- goto release;
-
- err = -ENOMEM;
- if (!sfl1)
- goto release;
- if (fl->linger > fl1->linger)
- fl1->linger = fl->linger;
- if ((long)(fl->expires - fl1->expires) > 0)
- fl1->expires = fl->expires;
- fl_link(np, sfl1, fl1);
- fl_free(fl);
- return 0;
+ err = -EEXIST;
+ if (freq->flr_flags&IPV6_FL_F_EXCL)
+ goto release;
+ err = -EPERM;
+ if (fl1->share == IPV6_FL_S_EXCL ||
+ fl1->share != fl->share ||
+ ((fl1->share == IPV6_FL_S_PROCESS) &&
+ (fl1->owner.pid != fl->owner.pid)) ||
+ ((fl1->share == IPV6_FL_S_USER) &&
+ !uid_eq(fl1->owner.uid, fl->owner.uid)))
+ goto release;
+
+ err = -ENOMEM;
+ if (!sfl1)
+ goto release;
+ if (fl->linger > fl1->linger)
+ fl1->linger = fl->linger;
+ if ((long)(fl->expires - fl1->expires) > 0)
+ fl1->expires = fl->expires;
+ fl_link(sk, sfl1, fl1);
+ fl_free(fl);
+ return 0;
release:
- fl_release(fl1);
- goto done;
- }
- }
- err = -ENOENT;
- if (!(freq.flr_flags&IPV6_FL_F_CREATE))
- goto done;
-
- err = -ENOMEM;
- if (!sfl1)
+ fl_release(fl1);
goto done;
+ }
+ }
+ err = -ENOENT;
+ if (!(freq->flr_flags & IPV6_FL_F_CREATE))
+ goto done;
- err = mem_check(sk);
- if (err != 0)
- goto done;
+ err = -ENOMEM;
+ if (!sfl1)
+ goto done;
- fl1 = fl_intern(net, fl, freq.flr_label);
- if (fl1)
- goto recheck;
+ err = mem_check(sk);
+ if (err != 0)
+ goto done;
- if (!freq.flr_label) {
- if (copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label,
- &fl->label, sizeof(fl->label))) {
- /* Intentionally ignore fault. */
- }
- }
+ fl1 = fl_intern(net, fl, freq->flr_label);
+ if (fl1)
+ goto recheck;
- fl_link(np, sfl1, fl);
- return 0;
+ if (!freq->flr_label) {
+ size_t offset = offsetof(struct in6_flowlabel_req, flr_label);
- default:
- return -EINVAL;
+ if (copy_to_sockptr_offset(optval, offset, &fl->label,
+ sizeof(fl->label))) {
+ /* Intentionally ignore fault. */
+ }
}
+ fl_link(sk, sfl1, fl);
+ return 0;
done:
fl_free(fl);
kfree(sfl1);
return err;
}
+int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen)
+{
+ struct in6_flowlabel_req freq;
+
+ if (optlen < sizeof(freq))
+ return -EINVAL;
+ if (copy_from_sockptr(&freq, optval, sizeof(freq)))
+ return -EFAULT;
+
+ switch (freq.flr_action) {
+ case IPV6_FL_A_PUT:
+ return ipv6_flowlabel_put(sk, &freq);
+ case IPV6_FL_A_RENEW:
+ return ipv6_flowlabel_renew(sk, &freq);
+ case IPV6_FL_A_GET:
+ return ipv6_flowlabel_get(sk, &freq, optval, optlen);
+ default:
+ return -EINVAL;
+ }
+}
+
#ifdef CONFIG_PROC_FS
struct ip6fl_iter_state {
@@ -755,9 +808,9 @@ static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos)
{
struct ip6fl_iter_state *state = ip6fl_seq_private(seq);
- state->pid_ns = proc_pid_ns(file_inode(seq->file));
+ state->pid_ns = proc_pid_ns(file_inode(seq->file)->i_sb);
- rcu_read_lock_bh();
+ rcu_read_lock();
return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
@@ -776,7 +829,7 @@ static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos)
static void ip6fl_seq_stop(struct seq_file *seq, void *v)
__releases(RCU)
{
- rcu_read_unlock_bh();
+ rcu_read_unlock();
}
static int ip6fl_seq_show(struct seq_file *seq, void *v)
@@ -851,6 +904,7 @@ int ip6_flowlabel_init(void)
void ip6_flowlabel_cleanup(void)
{
- del_timer(&ip6_fl_gc_timer);
+ static_key_deferred_flush(&ipv6_flowlabel_exclusive);
+ timer_delete(&ip6_fl_gc_timer);
unregister_pernet_subsys(&ip6_flowlabel_net_ops);
}
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index e493b041d4ac..c82a75510c0e 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1,13 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* GRE over IPv6 protocol decoder.
*
* Authors: Dmitry Kozlov (xeb@mail.ru)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -48,6 +43,7 @@
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/netdev_lock.h>
#include <net/rtnetlink.h>
#include <net/ipv6.h>
@@ -115,8 +111,32 @@ static u32 HASH_ADDR(const struct in6_addr *addr)
#define tunnels_l tunnels[1]
#define tunnels_wc tunnels[0]
-/* Given src, dst and key, find appropriate for input tunnel. */
+static bool ip6gre_tunnel_match(struct ip6_tnl *t, int dev_type, int link,
+ int *cand_score, struct ip6_tnl **ret)
+{
+ int score = 0;
+
+ if (t->dev->type != ARPHRD_IP6GRE &&
+ t->dev->type != dev_type)
+ return false;
+
+ if (t->parms.link != link)
+ score |= 1;
+ if (t->dev->type != dev_type)
+ score |= 2;
+ if (score == 0) {
+ *ret = t;
+ return true;
+ }
+ if (score < *cand_score) {
+ *ret = t;
+ *cand_score = score;
+ }
+ return false;
+}
+
+/* Given src, dst and key, find the appropriate tunnel for input. */
static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
const struct in6_addr *remote, const struct in6_addr *local,
__be32 key, __be16 gre_proto)
@@ -131,7 +151,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
gre_proto == htons(ETH_P_ERSPAN) ||
gre_proto == htons(ETH_P_ERSPAN2)) ?
ARPHRD_ETHER : ARPHRD_IP6GRE;
- int score, cand_score = 4;
+ struct net_device *ndev;
+ int cand_score = 4;
for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) {
if (!ipv6_addr_equal(local, &t->parms.laddr) ||
@@ -140,22 +161,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
!(t->dev->flags & IFF_UP))
continue;
- if (t->dev->type != ARPHRD_IP6GRE &&
- t->dev->type != dev_type)
- continue;
-
- score = 0;
- if (t->parms.link != link)
- score |= 1;
- if (t->dev->type != dev_type)
- score |= 2;
- if (score == 0)
- return t;
-
- if (score < cand_score) {
- cand = t;
- cand_score = score;
- }
+ if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand))
+ return cand;
}
for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) {
@@ -164,22 +171,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
!(t->dev->flags & IFF_UP))
continue;
- if (t->dev->type != ARPHRD_IP6GRE &&
- t->dev->type != dev_type)
- continue;
-
- score = 0;
- if (t->parms.link != link)
- score |= 1;
- if (t->dev->type != dev_type)
- score |= 2;
- if (score == 0)
- return t;
-
- if (score < cand_score) {
- cand = t;
- cand_score = score;
- }
+ if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand))
+ return cand;
}
for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) {
@@ -190,22 +183,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
!(t->dev->flags & IFF_UP))
continue;
- if (t->dev->type != ARPHRD_IP6GRE &&
- t->dev->type != dev_type)
- continue;
-
- score = 0;
- if (t->parms.link != link)
- score |= 1;
- if (t->dev->type != dev_type)
- score |= 2;
- if (score == 0)
- return t;
-
- if (score < cand_score) {
- cand = t;
- cand_score = score;
- }
+ if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand))
+ return cand;
}
for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) {
@@ -213,22 +192,8 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
!(t->dev->flags & IFF_UP))
continue;
- if (t->dev->type != ARPHRD_IP6GRE &&
- t->dev->type != dev_type)
- continue;
-
- score = 0;
- if (t->parms.link != link)
- score |= 1;
- if (t->dev->type != dev_type)
- score |= 2;
- if (score == 0)
- return t;
-
- if (score < cand_score) {
- cand = t;
- cand_score = score;
- }
+ if (ip6gre_tunnel_match(t, dev_type, link, &cand_score, &cand))
+ return cand;
}
if (cand)
@@ -243,9 +208,9 @@ static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev,
if (t && t->dev->flags & IFF_UP)
return t;
- dev = ign->fb_tunnel_dev;
- if (dev && dev->flags & IFF_UP)
- return netdev_priv(dev);
+ ndev = READ_ONCE(ign->fb_tunnel_dev);
+ if (ndev && ndev->flags & IFF_UP)
+ return netdev_priv(ndev);
return NULL;
}
@@ -364,9 +329,9 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net,
if (parms->name[0]) {
if (!dev_valid_name(parms->name))
return NULL;
- strlcpy(name, parms->name, IFNAMSIZ);
+ strscpy(name, parms->name);
} else {
- strcpy(name, "ip6gre%d");
+ strscpy(name, "ip6gre%d");
}
dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN,
ip6gre_tunnel_setup);
@@ -386,12 +351,6 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net,
goto failed_free;
ip6gre_tnl_link_config(nt, 1);
-
- /* Can use a lockless transmit, unless we generate output sequences */
- if (!(nt->parms.o_flags & TUNNEL_SEQ))
- dev->features |= NETIF_F_LLTX;
-
- dev_hold(dev);
ip6gre_tunnel_link(ign, nt);
return nt;
@@ -408,7 +367,7 @@ static void ip6erspan_tunnel_uninit(struct net_device *dev)
ip6erspan_tunnel_unlink_md(ign, t);
ip6gre_tunnel_unlink(ign, t);
dst_cache_reset(&t->dst_cache);
- dev_put(dev);
+ netdev_put(dev, &t->dev_tracker);
}
static void ip6gre_tunnel_uninit(struct net_device *dev)
@@ -418,64 +377,49 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
ip6gre_tunnel_unlink_md(ign, t);
ip6gre_tunnel_unlink(ign, t);
+ if (ign->fb_tunnel_dev == dev)
+ WRITE_ONCE(ign->fb_tunnel_dev, NULL);
dst_cache_reset(&t->dst_cache);
- dev_put(dev);
+ netdev_put(dev, &t->dev_tracker);
}
-static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct net *net = dev_net(skb->dev);
- const struct gre_base_hdr *greh;
const struct ipv6hdr *ipv6h;
- int grehlen = sizeof(*greh);
+ struct tnl_ptk_info tpi;
struct ip6_tnl *t;
- int key_off = 0;
- __be16 flags;
- __be32 key;
- if (!pskb_may_pull(skb, offset + grehlen))
- return;
- greh = (const struct gre_base_hdr *)(skb->data + offset);
- flags = greh->flags;
- if (flags & (GRE_VERSION | GRE_ROUTING))
- return;
- if (flags & GRE_CSUM)
- grehlen += 4;
- if (flags & GRE_KEY) {
- key_off = grehlen + offset;
- grehlen += 4;
- }
+ if (gre_parse_header(skb, &tpi, NULL, htons(ETH_P_IPV6),
+ offset) < 0)
+ return -EINVAL;
- if (!pskb_may_pull(skb, offset + grehlen))
- return;
ipv6h = (const struct ipv6hdr *)skb->data;
- greh = (const struct gre_base_hdr *)(skb->data + offset);
- key = key_off ? *(__be32 *)(skb->data + key_off) : 0;
-
t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr,
- key, greh->protocol);
+ tpi.key, tpi.proto);
if (!t)
- return;
+ return -ENOENT;
switch (type) {
- struct ipv6_tlv_tnl_enc_lim *tel;
- __u32 teli;
case ICMPV6_DEST_UNREACH:
net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n",
t->parms.name);
if (code != ICMPV6_PORT_UNREACH)
break;
- return;
+ return 0;
case ICMPV6_TIME_EXCEED:
if (code == ICMPV6_EXC_HOPLIMIT) {
net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
t->parms.name);
break;
}
- return;
- case ICMPV6_PARAMPROB:
+ return 0;
+ case ICMPV6_PARAMPROB: {
+ struct ipv6_tlv_tnl_enc_lim *tel;
+ __u32 teli;
+
teli = 0;
if (code == ICMPV6_HDR_FIELD)
teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data);
@@ -490,14 +434,15 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
t->parms.name);
}
- return;
+ return 0;
+ }
case ICMPV6_PKT_TOOBIG:
ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
- return;
+ return 0;
case NDISC_REDIRECT:
ip6_redirect(skb, net, skb->dev->ifindex, 0,
sock_net_uid(net, NULL));
- return;
+ return 0;
}
if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO))
@@ -505,6 +450,8 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
else
t->err_count = 1;
t->err_time = jiffies;
+
+ return 0;
}
static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
@@ -518,11 +465,11 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
tpi->proto);
if (tunnel) {
if (tunnel->parms.collect_md) {
+ IP_TUNNEL_DECLARE_FLAGS(flags);
struct metadata_dst *tun_dst;
__be64 tun_id;
- __be16 flags;
- flags = tpi->flags;
+ ip_tunnel_flags_copy(flags, tpi->flags);
tun_id = key32_to_tunnel_id(tpi->key);
tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id, 0);
@@ -540,11 +487,11 @@ static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
return PACKET_REJECT;
}
-static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
- struct tnl_ptk_info *tpi)
+static int ip6erspan_rcv(struct sk_buff *skb,
+ struct tnl_ptk_info *tpi,
+ int gre_hdr_len)
{
struct erspan_base_hdr *ershdr;
- struct erspan_metadata *pkt_md;
const struct ipv6hdr *ipv6h;
struct erspan_md2 *md2;
struct ip6_tnl *tunnel;
@@ -556,7 +503,6 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
ipv6h = ipv6_hdr(skb);
ershdr = (struct erspan_base_hdr *)skb->data;
ver = ershdr->ver;
- tpi->key = cpu_to_be32(get_session_id(ershdr));
tunnel = ip6gre_tunnel_lookup(skb->dev,
&ipv6h->saddr, &ipv6h->daddr, tpi->key,
@@ -567,23 +513,21 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
if (unlikely(!pskb_may_pull(skb, len)))
return PACKET_REJECT;
- ershdr = (struct erspan_base_hdr *)skb->data;
- pkt_md = (struct erspan_metadata *)(ershdr + 1);
-
if (__iptunnel_pull_header(skb, len,
htons(ETH_P_TEB),
false, false) < 0)
return PACKET_REJECT;
if (tunnel->parms.collect_md) {
+ struct erspan_metadata *pkt_md, *md;
+ IP_TUNNEL_DECLARE_FLAGS(flags);
struct metadata_dst *tun_dst;
struct ip_tunnel_info *info;
- struct erspan_metadata *md;
+ unsigned char *gh;
__be64 tun_id;
- __be16 flags;
- tpi->flags |= TUNNEL_KEY;
- flags = tpi->flags;
+ __set_bit(IP_TUNNEL_KEY_BIT, tpi->flags);
+ ip_tunnel_flags_copy(flags, tpi->flags);
tun_id = key32_to_tunnel_id(tpi->key);
tun_dst = ipv6_tun_rx_dst(skb, flags, tun_id,
@@ -591,13 +535,22 @@ static int ip6erspan_rcv(struct sk_buff *skb, int gre_hdr_len,
if (!tun_dst)
return PACKET_REJECT;
+ /* skb can be uncloned in __iptunnel_pull_header, so
+ * old pkt_md is no longer valid and we need to reset
+ * it
+ */
+ gh = skb_network_header(skb) +
+ skb_network_header_len(skb);
+ pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len +
+ sizeof(*ershdr));
info = &tun_dst->u.tun_info;
md = ip_tunnel_info_opts(info);
md->version = ver;
md2 = &md->u.md2;
memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE :
ERSPAN_V2_MDSIZE);
- info->key.tun_flags |= TUNNEL_ERSPAN_OPT;
+ __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
+ info->key.tun_flags);
info->options_len = sizeof(*md);
ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error);
@@ -627,7 +580,7 @@ static int gre_rcv(struct sk_buff *skb)
if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) ||
tpi.proto == htons(ETH_P_ERSPAN2))) {
- if (ip6erspan_rcv(skb, hdr_len, &tpi) == PACKET_RCVD)
+ if (ip6erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD)
return 0;
goto out;
}
@@ -679,20 +632,21 @@ static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb,
struct flowi6 *fl6, __u8 *dsfield,
int *encap_limit)
{
- struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ struct ipv6hdr *ipv6h;
struct ip6_tnl *t = netdev_priv(dev);
__u16 offset;
offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
/* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
+ ipv6h = ipv6_hdr(skb);
if (offset > 0) {
struct ipv6_tlv_tnl_enc_lim *tel;
tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
if (tel->encap_limit == 0) {
- icmpv6_send(skb, ICMPV6_PARAMPROB,
- ICMPV6_HDR_FIELD, offset + 2);
+ icmpv6_ndo_send(skb, ICMPV6_PARAMPROB,
+ ICMPV6_HDR_FIELD, offset + 2);
return -1;
}
*encap_limit = tel->encap_limit - 1;
@@ -720,12 +674,51 @@ static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb,
return 0;
}
+static int prepare_ip6gre_xmit_other(struct sk_buff *skb,
+ struct net_device *dev,
+ struct flowi6 *fl6, __u8 *dsfield,
+ int *encap_limit)
+{
+ struct ip6_tnl *t = netdev_priv(dev);
+
+ if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+ *encap_limit = t->parms.encap_limit;
+
+ memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6));
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ *dsfield = 0;
+ else
+ *dsfield = ip6_tclass(t->parms.flowinfo);
+
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
+ fl6->flowi6_mark = skb->mark;
+ else
+ fl6->flowi6_mark = t->parms.fwmark;
+
+ fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+
+ return 0;
+}
+
+static struct ip_tunnel_info *skb_tunnel_info_txcheck(struct sk_buff *skb)
+{
+ struct ip_tunnel_info *tun_info;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)))
+ return ERR_PTR(-EINVAL);
+
+ return tun_info;
+}
+
static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
struct net_device *dev, __u8 dsfield,
struct flowi6 *fl6, int encap_limit,
__u32 *pmtu, __be16 proto)
{
struct ip6_tnl *tunnel = netdev_priv(dev);
+ IP_TUNNEL_DECLARE_FLAGS(flags);
__be16 protocol;
if (dev->type == ARPHRD_ETHER)
@@ -736,21 +729,17 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
else
fl6->daddr = tunnel->parms.raddr;
- if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen))
- return -ENOMEM;
-
/* Push GRE header. */
protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto;
if (tunnel->parms.collect_md) {
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
- __be16 flags;
+ int tun_hlen;
- tun_info = skb_tunnel_info(skb);
- if (unlikely(!tun_info ||
- !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
- ip_tunnel_info_af(tun_info) != AF_INET6))
+ tun_info = skb_tunnel_info_txcheck(skb);
+ if (IS_ERR(tun_info) ||
+ unlikely(ip_tunnel_info_af(tun_info) != AF_INET6))
return -EINVAL;
key = &tun_info->key;
@@ -759,25 +748,37 @@ static netdev_tx_t __gre6_xmit(struct sk_buff *skb,
fl6->daddr = key->u.ipv6.dst;
fl6->flowlabel = key->label;
fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+ fl6->fl6_gre_key = tunnel_id_to_key32(key->tun_id);
dsfield = key->tos;
- flags = key->tun_flags &
- (TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ);
- tunnel->tun_hlen = gre_calc_hlen(flags);
+ ip_tunnel_flags_zero(flags);
+ __set_bit(IP_TUNNEL_CSUM_BIT, flags);
+ __set_bit(IP_TUNNEL_KEY_BIT, flags);
+ __set_bit(IP_TUNNEL_SEQ_BIT, flags);
+ ip_tunnel_flags_and(flags, flags, key->tun_flags);
+ tun_hlen = gre_calc_hlen(flags);
+
+ if (skb_cow_head(skb, dev->needed_headroom ?: tun_hlen + tunnel->encap_hlen))
+ return -ENOMEM;
- gre_build_header(skb, tunnel->tun_hlen,
+ gre_build_header(skb, tun_hlen,
flags, protocol,
tunnel_id_to_key32(tun_info->key.tun_id),
- (flags & TUNNEL_SEQ) ? htonl(tunnel->o_seqno++)
- : 0);
+ test_bit(IP_TUNNEL_SEQ_BIT, flags) ?
+ htonl(atomic_fetch_inc(&tunnel->o_seqno)) :
+ 0);
} else {
- if (tunnel->parms.o_flags & TUNNEL_SEQ)
- tunnel->o_seqno++;
+ if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen))
+ return -ENOMEM;
- gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
+ ip_tunnel_flags_copy(flags, tunnel->parms.o_flags);
+
+ gre_build_header(skb, tunnel->tun_hlen, flags,
protocol, tunnel->parms.o_key,
- htonl(tunnel->o_seqno));
+ test_bit(IP_TUNNEL_SEQ_BIT, flags) ?
+ htonl(atomic_fetch_inc(&tunnel->o_seqno)) :
+ 0);
}
return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu,
@@ -799,7 +800,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
prepare_ip6gre_xmit_ipv4(skb, dev, &fl6,
&dsfield, &encap_limit);
- err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
+ err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+ t->parms.o_flags));
if (err)
return -1;
@@ -808,8 +810,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
if (err != 0) {
/* XXX: send ICMP error even if DF is not set. */
if (err == -EMSGSIZE)
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
return -1;
}
@@ -833,59 +835,39 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev)
prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit))
return -1;
- if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)))
+ if (gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+ t->parms.o_flags)))
return -1;
err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit,
&mtu, skb->protocol);
if (err != 0) {
if (err == -EMSGSIZE)
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
return -1;
}
return 0;
}
-/**
- * ip6gre_tnl_addr_conflict - compare packet addresses to tunnel's own
- * @t: the outgoing tunnel device
- * @hdr: IPv6 header from the incoming packet
- *
- * Description:
- * Avoid trivial tunneling loop by checking that tunnel exit-point
- * doesn't match source of incoming packet.
- *
- * Return:
- * 1 if conflict,
- * 0 else
- **/
-
-static inline bool ip6gre_tnl_addr_conflict(const struct ip6_tnl *t,
- const struct ipv6hdr *hdr)
-{
- return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr);
-}
-
static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
int encap_limit = -1;
struct flowi6 fl6;
+ __u8 dsfield = 0;
__u32 mtu;
int err;
- if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- encap_limit = t->parms.encap_limit;
-
- if (!t->parms.collect_md)
- memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
+ if (!t->parms.collect_md &&
+ prepare_ip6gre_xmit_other(skb, dev, &fl6, &dsfield, &encap_limit))
+ return -1;
- err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM));
+ err = gre_handle_offloads(skb, test_bit(IP_TUNNEL_CSUM_BIT,
+ t->parms.o_flags));
if (err)
return err;
-
- err = __gre6_xmit(skb, dev, 0, &fl6, encap_limit, &mtu, skb->protocol);
+ err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, skb->protocol);
return err;
}
@@ -894,13 +876,17 @@ static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb,
struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- struct net_device_stats *stats = &t->dev->stats;
+ __be16 payload_protocol;
int ret;
+ if (!pskb_inet_may_pull(skb))
+ goto tx_err;
+
if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr))
goto tx_err;
- switch (skb->protocol) {
+ payload_protocol = skb_protocol(skb, true);
+ switch (payload_protocol) {
case htons(ETH_P_IP):
ret = ip6gre_xmit_ipv4(skb, dev);
break;
@@ -918,8 +904,9 @@ static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
tx_err:
- stats->tx_errors++;
- stats->tx_dropped++;
+ if (!t->parms.collect_md || !IS_ERR(skb_tunnel_info_txcheck(skb)))
+ DEV_STATS_INC(dev, tx_errors);
+ DEV_STATS_INC(dev, tx_dropped);
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -927,17 +914,21 @@ tx_err:
static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
struct net_device *dev)
{
+ struct ip_tunnel_info *tun_info = NULL;
struct ip6_tnl *t = netdev_priv(dev);
struct dst_entry *dst = skb_dst(skb);
- struct net_device_stats *stats;
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
bool truncate = false;
int encap_limit = -1;
__u8 dsfield = false;
struct flowi6 fl6;
int err = -EINVAL;
+ __be16 proto;
__u32 mtu;
int nhoff;
- int thoff;
+
+ if (!pskb_inet_may_pull(skb))
+ goto tx_err;
if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr))
goto tx_err;
@@ -946,40 +937,45 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
goto tx_err;
if (skb->len > dev->mtu + dev->hard_header_len) {
- pskb_trim(skb, dev->mtu + dev->hard_header_len);
+ if (pskb_trim(skb, dev->mtu + dev->hard_header_len))
+ goto tx_err;
truncate = true;
}
- nhoff = skb_network_header(skb) - skb_mac_header(skb);
+ nhoff = skb_network_offset(skb);
if (skb->protocol == htons(ETH_P_IP) &&
(ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff))
truncate = true;
- thoff = skb_transport_header(skb) - skb_mac_header(skb);
- if (skb->protocol == htons(ETH_P_IPV6) &&
- (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff))
- truncate = true;
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ int thoff;
+
+ if (skb_transport_header_was_set(skb))
+ thoff = skb_transport_offset(skb);
+ else
+ thoff = nhoff + sizeof(struct ipv6hdr);
+ if (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)
+ truncate = true;
+ }
if (skb_cow_head(skb, dev->needed_headroom ?: t->hlen))
goto tx_err;
- t->parms.o_flags &= ~TUNNEL_KEY;
+ __clear_bit(IP_TUNNEL_KEY_BIT, t->parms.o_flags);
IPCB(skb)->flags = 0;
/* For collect_md mode, derive fl6 from the tunnel key,
* for native mode, call prepare_ip6gre_xmit_{ipv4,ipv6}.
*/
if (t->parms.collect_md) {
- struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
struct erspan_metadata *md;
__be32 tun_id;
- tun_info = skb_tunnel_info(skb);
- if (unlikely(!tun_info ||
- !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
- ip_tunnel_info_af(tun_info) != AF_INET6))
- return -EINVAL;
+ tun_info = skb_tunnel_info_txcheck(skb);
+ if (IS_ERR(tun_info) ||
+ unlikely(ip_tunnel_info_af(tun_info) != AF_INET6))
+ goto tx_err;
key = &tun_info->key;
memset(&fl6, 0, sizeof(fl6));
@@ -987,13 +983,15 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+ fl6.fl6_gre_key = tunnel_id_to_key32(key->tun_id);
dsfield = key->tos;
- if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT))
+ if (!test_bit(IP_TUNNEL_ERSPAN_OPT_BIT,
+ tun_info->key.tun_flags))
goto tx_err;
- md = ip_tunnel_info_opts(tun_info);
- if (!md)
+ if (tun_info->options_len < sizeof(*md))
goto tx_err;
+ md = ip_tunnel_info_opts(tun_info);
tun_id = tunnel_id_to_key32(key->tun_id);
if (md->version == 1) {
@@ -1001,18 +999,18 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
ntohl(tun_id),
ntohl(md->u.index), truncate,
false);
+ proto = htons(ETH_P_ERSPAN);
} else if (md->version == 2) {
erspan_build_header_v2(skb,
ntohl(tun_id),
md->u.md2.dir,
get_hwid(&md->u.md2),
truncate, false);
+ proto = htons(ETH_P_ERSPAN2);
} else {
goto tx_err;
}
} else {
- struct ipv6hdr *ipv6h = ipv6_hdr(skb);
-
switch (skb->protocol) {
case htons(ETH_P_IP):
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
@@ -1020,7 +1018,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
&dsfield, &encap_limit);
break;
case htons(ETH_P_IPV6):
- if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr))
+ if (ipv6_addr_equal(&t->parms.raddr, &ipv6_hdr(skb)->saddr))
goto tx_err;
if (prepare_ip6gre_xmit_ipv6(skb, dev, &fl6,
&dsfield, &encap_limit))
@@ -1031,39 +1029,45 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
break;
}
- if (t->parms.erspan_ver == 1)
+ if (t->parms.erspan_ver == 1) {
erspan_build_header(skb, ntohl(t->parms.o_key),
t->parms.index,
truncate, false);
- else if (t->parms.erspan_ver == 2)
+ proto = htons(ETH_P_ERSPAN);
+ } else if (t->parms.erspan_ver == 2) {
erspan_build_header_v2(skb, ntohl(t->parms.o_key),
t->parms.dir,
t->parms.hwid,
truncate, false);
- else
+ proto = htons(ETH_P_ERSPAN2);
+ } else {
goto tx_err;
+ }
fl6.daddr = t->parms.raddr;
}
/* Push GRE header. */
- gre_build_header(skb, 8, TUNNEL_SEQ,
- htons(ETH_P_ERSPAN), 0, htonl(t->o_seqno++));
+ __set_bit(IP_TUNNEL_SEQ_BIT, flags);
+ gre_build_header(skb, 8, flags, proto, 0,
+ htonl(atomic_fetch_inc(&t->o_seqno)));
/* TooBig packet may have updated dst->dev's mtu */
- if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu)
- dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu);
-
+ if (!t->parms.collect_md && dst) {
+ mtu = READ_ONCE(dst_dev(dst)->mtu);
+ if (dst_mtu(dst) > mtu)
+ dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
+ }
err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
NEXTHDR_GRE);
if (err != 0) {
/* XXX: send ICMP error even if DF is not set. */
if (err == -EMSGSIZE) {
if (skb->protocol == htons(ETH_P_IP))
- icmp_send(skb, ICMP_DEST_UNREACH,
- ICMP_FRAG_NEEDED, htonl(mtu));
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH,
+ ICMP_FRAG_NEEDED, htonl(mtu));
else
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
}
goto tx_err;
@@ -1071,9 +1075,9 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
tx_err:
- stats = &t->dev->stats;
- stats->tx_errors++;
- stats->tx_dropped++;
+ if (!IS_ERR(tun_info))
+ DEV_STATS_INC(dev, tx_errors);
+ DEV_STATS_INC(dev, tx_dropped);
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -1085,7 +1089,7 @@ static void ip6gre_tnl_link_config_common(struct ip6_tnl *t)
struct flowi6 *fl6 = &t->fl.u.ip6;
if (dev->type != ARPHRD_ETHER) {
- memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
+ __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr));
memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
}
@@ -1095,6 +1099,7 @@ static void ip6gre_tnl_link_config_common(struct ip6_tnl *t)
fl6->flowi6_oif = p->link;
fl6->flowlabel = 0;
fl6->flowi6_proto = IPPROTO_GRE;
+ fl6->fl6_gre_key = t->parms.o_key;
if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS))
fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
@@ -1129,18 +1134,25 @@ static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu,
return;
if (rt->dst.dev) {
- dev->needed_headroom = rt->dst.dev->hard_header_len +
- t_hlen;
+ unsigned short dst_len = rt->dst.dev->hard_header_len +
+ t_hlen;
+
+ if (t->dev->header_ops)
+ dev->hard_header_len = dst_len;
+ else
+ dev->needed_headroom = dst_len;
if (set_mtu) {
- dev->mtu = rt->dst.dev->mtu - t_hlen;
+ int mtu = rt->dst.dev->mtu - t_hlen;
+
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- dev->mtu -= 8;
+ mtu -= 8;
if (dev->type == ARPHRD_ETHER)
- dev->mtu -= ETH_HLEN;
+ mtu -= ETH_HLEN;
- if (dev->mtu < IPV6_MIN_MTU)
- dev->mtu = IPV6_MIN_MTU;
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ WRITE_ONCE(dev->mtu, mtu);
}
}
ip6_rt_put(rt);
@@ -1155,7 +1167,12 @@ static int ip6gre_calc_hlen(struct ip6_tnl *tunnel)
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
t_hlen = tunnel->hlen + sizeof(struct ipv6hdr);
- tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen;
+
+ if (tunnel->dev->header_ops)
+ tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+ else
+ tunnel->dev->needed_headroom = LL_MAX_HEADER + t_hlen;
+
return t_hlen;
}
@@ -1178,9 +1195,13 @@ static void ip6gre_tnl_copy_tnl_parm(struct ip6_tnl *t,
t->parms.proto = p->proto;
t->parms.i_key = p->i_key;
t->parms.o_key = p->o_key;
- t->parms.i_flags = p->i_flags;
- t->parms.o_flags = p->o_flags;
+ ip_tunnel_flags_copy(t->parms.i_flags, p->i_flags);
+ ip_tunnel_flags_copy(t->parms.o_flags, p->o_flags);
t->parms.fwmark = p->fwmark;
+ t->parms.erspan_ver = p->erspan_ver;
+ t->parms.index = p->index;
+ t->parms.dir = p->dir;
+ t->parms.hwid = p->hwid;
dst_cache_reset(&t->dst_cache);
}
@@ -1204,8 +1225,8 @@ static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p,
p->link = u->link;
p->i_key = u->i_key;
p->o_key = u->o_key;
- p->i_flags = gre_flags_to_tnl_flags(u->i_flags);
- p->o_flags = gre_flags_to_tnl_flags(u->o_flags);
+ gre_flags_to_tnl_flags(p->i_flags, u->i_flags);
+ gre_flags_to_tnl_flags(p->o_flags, u->o_flags);
memcpy(p->name, u->name, sizeof(u->name));
}
@@ -1227,8 +1248,9 @@ static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u,
memcpy(u->name, p->name, sizeof(u->name));
}
-static int ip6gre_tunnel_ioctl(struct net_device *dev,
- struct ifreq *ifr, int cmd)
+static int ip6gre_tunnel_siocdevprivate(struct net_device *dev,
+ struct ifreq *ifr, void __user *data,
+ int cmd)
{
int err = 0;
struct ip6_tnl_parm2 p;
@@ -1242,7 +1264,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
switch (cmd) {
case SIOCGETTUNNEL:
if (dev == ign->fb_tunnel_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+ if (copy_from_user(&p, data, sizeof(p))) {
err = -EFAULT;
break;
}
@@ -1253,7 +1275,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
}
memset(&p, 0, sizeof(p));
ip6gre_tnl_parm_to_user(&p, &t->parms);
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ if (copy_to_user(data, &p, sizeof(p)))
err = -EFAULT;
break;
@@ -1264,7 +1286,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
goto done;
err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ if (copy_from_user(&p, data, sizeof(p)))
goto done;
err = -EINVAL;
@@ -1301,7 +1323,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
memset(&p, 0, sizeof(p));
ip6gre_tnl_parm_to_user(&p, &t->parms);
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ if (copy_to_user(data, &p, sizeof(p)))
err = -EFAULT;
} else
err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
@@ -1314,7 +1336,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
if (dev == ign->fb_tunnel_dev) {
err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ if (copy_from_user(&p, data, sizeof(p)))
goto done;
err = -ENOENT;
ip6gre_tnl_parm_from_user(&p1, &p);
@@ -1356,7 +1378,7 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev,
ipv6h->daddr = t->parms.raddr;
p = (__be16 *)(ipv6h + 1);
- p[0] = t->parms.o_flags;
+ p[0] = ip_tunnel_flags_to_be16(t->parms.o_flags);
p[1] = htons(type);
/*
@@ -1381,9 +1403,8 @@ static const struct net_device_ops ip6gre_netdev_ops = {
.ndo_init = ip6gre_tunnel_init,
.ndo_uninit = ip6gre_tunnel_uninit,
.ndo_start_xmit = ip6gre_tunnel_xmit,
- .ndo_do_ioctl = ip6gre_tunnel_ioctl,
+ .ndo_siocdevprivate = ip6gre_tunnel_siocdevprivate,
.ndo_change_mtu = ip6_tnl_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
@@ -1393,7 +1414,6 @@ static void ip6gre_dev_free(struct net_device *dev)
gro_cells_destroy(&t->gro_cells);
dst_cache_destroy(&t->dst_cache);
- free_percpu(dev->tstats);
}
static void ip6gre_tunnel_setup(struct net_device *dev)
@@ -1402,6 +1422,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
dev->needs_free_netdev = true;
dev->priv_destructor = ip6gre_dev_free;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
dev->type = ARPHRD_IP6GRE;
dev->flags |= IFF_NOARP;
@@ -1424,22 +1445,19 @@ static void ip6gre_tnl_init_features(struct net_device *dev)
dev->features |= GRE6_FEATURES;
dev->hw_features |= GRE6_FEATURES;
- if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
- /* TCP offload with GRE SEQ is not supported, nor
- * can we support 2 levels of outer headers requiring
- * an update.
- */
- if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
- nt->encap.type == TUNNEL_ENCAP_NONE) {
- dev->features |= NETIF_F_GSO_SOFTWARE;
- dev->hw_features |= NETIF_F_GSO_SOFTWARE;
- }
+ /* TCP offload with GRE SEQ is not supported, nor can we support 2
+ * levels of outer headers requiring an update.
+ */
+ if (test_bit(IP_TUNNEL_SEQ_BIT, nt->parms.o_flags))
+ return;
+ if (test_bit(IP_TUNNEL_CSUM_BIT, nt->parms.o_flags) &&
+ nt->encap.type != TUNNEL_ENCAP_NONE)
+ return;
- /* Can use a lockless transmit, unless we generate
- * output sequences
- */
- dev->features |= NETIF_F_LLTX;
- }
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+
+ dev->lltx = true;
}
static int ip6gre_tunnel_init_common(struct net_device *dev)
@@ -1451,16 +1469,11 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
tunnel = netdev_priv(dev);
tunnel->dev = dev;
- tunnel->net = dev_net(dev);
- strcpy(tunnel->parms.name, dev->name);
-
- dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
- if (!dev->tstats)
- return -ENOMEM;
+ strscpy(tunnel->parms.name, dev->name);
ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
if (ret)
- goto cleanup_alloc_pcpu_stats;
+ return ret;
ret = gro_cells_init(&tunnel->gro_cells, dev);
if (ret)
@@ -1474,18 +1487,16 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
dev->mtu -= 8;
if (tunnel->parms.collect_md) {
- dev->features |= NETIF_F_NETNS_LOCAL;
netif_keep_dst(dev);
}
ip6gre_tnl_init_features(dev);
+ netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL);
+ netdev_lockdep_set_classes(dev);
return 0;
cleanup_dst_cache_init:
dst_cache_destroy(&tunnel->dst_cache);
-cleanup_alloc_pcpu_stats:
- free_percpu(dev->tstats);
- dev->tstats = NULL;
return ret;
}
@@ -1503,7 +1514,7 @@ static int ip6gre_tunnel_init(struct net_device *dev)
if (tunnel->parms.collect_md)
return 0;
- memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr));
+ __dev_addr_set(dev, &tunnel->parms.laddr, sizeof(struct in6_addr));
memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr));
if (ipv6_addr_any(&tunnel->parms.raddr))
@@ -1518,20 +1529,18 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev)
tunnel->dev = dev;
tunnel->net = dev_net(dev);
- strcpy(tunnel->parms.name, dev->name);
+ strscpy(tunnel->parms.name, dev->name);
tunnel->hlen = sizeof(struct ipv6hdr) + 4;
-
- dev_hold(dev);
}
static struct inet6_protocol ip6gre_protocol __read_mostly = {
.handler = gre_rcv,
.err_handler = ip6gre_err,
- .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+ .flags = INET6_PROTO_FINAL,
};
-static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head)
+static void __net_exit ip6gre_exit_rtnl_net(struct net *net, struct list_head *head)
{
struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
struct net_device *dev, *aux;
@@ -1548,16 +1557,16 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head)
for (h = 0; h < IP6_GRE_HASH_SIZE; h++) {
struct ip6_tnl *t;
- t = rtnl_dereference(ign->tunnels[prio][h]);
+ t = rtnl_net_dereference(net, ign->tunnels[prio][h]);
while (t) {
/* If dev is in the same netns, it has already
* been added to the list by the previous loop.
*/
if (!net_eq(dev_net(t->dev), net))
- unregister_netdevice_queue(t->dev,
- head);
- t = rtnl_dereference(t->next);
+ unregister_netdevice_queue(t->dev, head);
+
+ t = rtnl_net_dereference(net, t->next);
}
}
}
@@ -1566,23 +1575,23 @@ static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head)
static int __net_init ip6gre_init_net(struct net *net)
{
struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
+ struct net_device *ndev;
int err;
if (!net_has_fallback_tunnels(net))
return 0;
- ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0",
- NET_NAME_UNKNOWN,
- ip6gre_tunnel_setup);
- if (!ign->fb_tunnel_dev) {
+ ndev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0",
+ NET_NAME_UNKNOWN, ip6gre_tunnel_setup);
+ if (!ndev) {
err = -ENOMEM;
goto err_alloc_dev;
}
+ ign->fb_tunnel_dev = ndev;
dev_net_set(ign->fb_tunnel_dev, net);
/* FB netdevice is special: we have one, and only one per netns.
* Allowing to move it to another netns is clearly unsafe.
*/
- ign->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
-
+ ign->fb_tunnel_dev->netns_immutable = true;
ip6gre_fb_tunnel_init(ign->fb_tunnel_dev);
ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops;
@@ -1596,26 +1605,14 @@ static int __net_init ip6gre_init_net(struct net *net)
return 0;
err_reg_dev:
- free_netdev(ign->fb_tunnel_dev);
+ free_netdev(ndev);
err_alloc_dev:
return err;
}
-static void __net_exit ip6gre_exit_batch_net(struct list_head *net_list)
-{
- struct net *net;
- LIST_HEAD(list);
-
- rtnl_lock();
- list_for_each_entry(net, net_list, exit_list)
- ip6gre_destroy_tunnels(net, &list);
- unregister_netdevice_many(&list);
- rtnl_unlock();
-}
-
static struct pernet_operations ip6gre_net_ops = {
.init = ip6gre_init_net,
- .exit_batch = ip6gre_exit_batch_net,
+ .exit_rtnl = ip6gre_exit_rtnl_net,
.id = &ip6gre_net_id,
.size = sizeof(struct ip6gre_net),
};
@@ -1729,6 +1726,27 @@ static int ip6erspan_tap_validate(struct nlattr *tb[], struct nlattr *data[],
return 0;
}
+static void ip6erspan_set_version(struct nlattr *data[],
+ struct __ip6_tnl_parm *parms)
+{
+ if (!data)
+ return;
+
+ parms->erspan_ver = 1;
+ if (data[IFLA_GRE_ERSPAN_VER])
+ parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
+
+ if (parms->erspan_ver == 1) {
+ if (data[IFLA_GRE_ERSPAN_INDEX])
+ parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
+ } else if (parms->erspan_ver == 2) {
+ if (data[IFLA_GRE_ERSPAN_DIR])
+ parms->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
+ if (data[IFLA_GRE_ERSPAN_HWID])
+ parms->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
+ }
+}
+
static void ip6gre_netlink_parms(struct nlattr *data[],
struct __ip6_tnl_parm *parms)
{
@@ -1741,12 +1759,12 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
if (data[IFLA_GRE_IFLAGS])
- parms->i_flags = gre_flags_to_tnl_flags(
- nla_get_be16(data[IFLA_GRE_IFLAGS]));
+ gre_flags_to_tnl_flags(parms->i_flags,
+ nla_get_be16(data[IFLA_GRE_IFLAGS]));
if (data[IFLA_GRE_OFLAGS])
- parms->o_flags = gre_flags_to_tnl_flags(
- nla_get_be16(data[IFLA_GRE_OFLAGS]));
+ gre_flags_to_tnl_flags(parms->o_flags,
+ nla_get_be16(data[IFLA_GRE_OFLAGS]));
if (data[IFLA_GRE_IKEY])
parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
@@ -1777,20 +1795,6 @@ static void ip6gre_netlink_parms(struct nlattr *data[],
if (data[IFLA_GRE_COLLECT_METADATA])
parms->collect_md = true;
-
- parms->erspan_ver = 1;
- if (data[IFLA_GRE_ERSPAN_VER])
- parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]);
-
- if (parms->erspan_ver == 1) {
- if (data[IFLA_GRE_ERSPAN_INDEX])
- parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]);
- } else if (parms->erspan_ver == 2) {
- if (data[IFLA_GRE_ERSPAN_DIR])
- parms->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]);
- if (data[IFLA_GRE_ERSPAN_HWID])
- parms->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]);
- }
}
static int ip6gre_tap_init(struct net_device *dev)
@@ -1813,7 +1817,6 @@ static const struct net_device_ops ip6gre_tap_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ip6_tnl_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
@@ -1839,16 +1842,11 @@ static int ip6erspan_tap_init(struct net_device *dev)
tunnel = netdev_priv(dev);
tunnel->dev = dev;
- tunnel->net = dev_net(dev);
- strcpy(tunnel->parms.name, dev->name);
-
- dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
- if (!dev->tstats)
- return -ENOMEM;
+ strscpy(tunnel->parms.name, dev->name);
ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
if (ret)
- goto cleanup_alloc_pcpu_stats;
+ return ret;
ret = gro_cells_init(&tunnel->gro_cells, dev);
if (ret)
@@ -1864,13 +1862,12 @@ static int ip6erspan_tap_init(struct net_device *dev)
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
ip6erspan_tnl_link_config(tunnel, 1);
+ netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL);
+ netdev_lockdep_set_classes(dev);
return 0;
cleanup_dst_cache_init:
dst_cache_destroy(&tunnel->dst_cache);
-cleanup_alloc_pcpu_stats:
- free_percpu(dev->tstats);
- dev->tstats = NULL;
return ret;
}
@@ -1881,7 +1878,6 @@ static const struct net_device_ops ip6erspan_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_validate_addr = eth_validate_addr,
.ndo_change_mtu = ip6_tnl_change_mtu,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
@@ -1895,18 +1891,12 @@ static void ip6gre_tap_setup(struct net_device *dev)
dev->needs_free_netdev = true;
dev->priv_destructor = ip6gre_dev_free;
- dev->features |= NETIF_F_NETNS_LOCAL;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
netif_keep_dst(dev);
}
-bool is_ip6gretap_dev(const struct net_device *dev)
-{
- return dev->netdev_ops == &ip6gre_tap_netdev_ops;
-}
-EXPORT_SYMBOL_GPL(is_ip6gretap_dev);
-
static bool ip6gre_netlink_encap_parms(struct nlattr *data[],
struct ip_tunnel_encap *ipencap)
{
@@ -1940,7 +1930,7 @@ static bool ip6gre_netlink_encap_parms(struct nlattr *data[],
return ret;
}
-static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev,
+static int ip6gre_newlink_common(struct net *link_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
{
@@ -1961,7 +1951,7 @@ static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev,
eth_hw_addr_random(dev);
nt->dev = dev;
- nt->net = dev_net(dev);
+ nt->net = link_net;
err = register_netdevice(dev);
if (err)
@@ -1970,18 +1960,18 @@ static int ip6gre_newlink_common(struct net *src_net, struct net_device *dev,
if (tb[IFLA_MTU])
ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
- dev_hold(dev);
-
out:
return err;
}
-static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
+static int ip6gre_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
+ struct net *net = params->link_net ? : dev_net(dev);
struct ip6_tnl *nt = netdev_priv(dev);
- struct net *net = dev_net(dev);
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
struct ip6gre_net *ign;
int err;
@@ -1996,7 +1986,7 @@ static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
return -EEXIST;
}
- err = ip6gre_newlink_common(src_net, dev, tb, data, extack);
+ err = ip6gre_newlink_common(net, dev, tb, data, extack);
if (!err) {
ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
ip6gre_tunnel_link_md(ign, nt);
@@ -2043,9 +2033,9 @@ static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[],
struct netlink_ext_ack *extack)
{
- struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id);
+ struct ip6_tnl *t = netdev_priv(dev);
+ struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id);
struct __ip6_tnl_parm p;
- struct ip6_tnl *t;
t = ip6gre_changelink_common(dev, tb, data, &p, extack);
if (IS_ERR(t))
@@ -2114,12 +2104,33 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
struct __ip6_tnl_parm *p = &t->parms;
+ IP_TUNNEL_DECLARE_FLAGS(o_flags);
+
+ ip_tunnel_flags_copy(o_flags, p->o_flags);
+
+ if (p->erspan_ver == 1 || p->erspan_ver == 2) {
+ if (!p->collect_md)
+ __set_bit(IP_TUNNEL_KEY_BIT, o_flags);
+
+ if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver))
+ goto nla_put_failure;
+
+ if (p->erspan_ver == 1) {
+ if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index))
+ goto nla_put_failure;
+ } else {
+ if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir))
+ goto nla_put_failure;
+ if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid))
+ goto nla_put_failure;
+ }
+ }
if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
nla_put_be16(skb, IFLA_GRE_IFLAGS,
gre_tnl_flags_to_gre_flags(p->i_flags)) ||
nla_put_be16(skb, IFLA_GRE_OFLAGS,
- gre_tnl_flags_to_gre_flags(p->o_flags)) ||
+ gre_tnl_flags_to_gre_flags(o_flags)) ||
nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
nla_put_in6_addr(skb, IFLA_GRE_LOCAL, &p->laddr) ||
@@ -2128,8 +2139,7 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) ||
nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) ||
nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) ||
- nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark) ||
- nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index))
+ nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark))
goto nla_put_failure;
if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE,
@@ -2147,19 +2157,6 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
goto nla_put_failure;
}
- if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver))
- goto nla_put_failure;
-
- if (p->erspan_ver == 1) {
- if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index))
- goto nla_put_failure;
- } else if (p->erspan_ver == 2) {
- if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir))
- goto nla_put_failure;
- if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid))
- goto nla_put_failure;
- }
-
return 0;
nla_put_failure:
@@ -2172,8 +2169,8 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = {
[IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
[IFLA_GRE_IKEY] = { .type = NLA_U32 },
[IFLA_GRE_OKEY] = { .type = NLA_U32 },
- [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct ipv6hdr, saddr) },
- [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct ipv6hdr, daddr) },
+ [IFLA_GRE_LOCAL] = { .len = sizeof_field(struct ipv6hdr, saddr) },
+ [IFLA_GRE_REMOTE] = { .len = sizeof_field(struct ipv6hdr, daddr) },
[IFLA_GRE_TTL] = { .type = NLA_U8 },
[IFLA_GRE_ENCAP_LIMIT] = { .type = NLA_U8 },
[IFLA_GRE_FLOWINFO] = { .type = NLA_U32 },
@@ -2194,26 +2191,30 @@ static void ip6erspan_tap_setup(struct net_device *dev)
{
ether_setup(dev);
+ dev->max_mtu = 0;
dev->netdev_ops = &ip6erspan_netdev_ops;
dev->needs_free_netdev = true;
dev->priv_destructor = ip6gre_dev_free;
- dev->features |= NETIF_F_NETNS_LOCAL;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
netif_keep_dst(dev);
}
-static int ip6erspan_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
+static int ip6erspan_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
+ struct net *net = params->link_net ? : dev_net(dev);
struct ip6_tnl *nt = netdev_priv(dev);
- struct net *net = dev_net(dev);
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
struct ip6gre_net *ign;
int err;
ip6gre_netlink_parms(data, &nt->parms);
+ ip6erspan_set_version(data, &nt->parms);
ign = net_generic(net, ip6gre_net_id);
if (nt->parms.collect_md) {
@@ -2224,7 +2225,7 @@ static int ip6erspan_newlink(struct net *src_net, struct net_device *dev,
return -EEXIST;
}
- err = ip6gre_newlink_common(src_net, dev, tb, data, extack);
+ err = ip6gre_newlink_common(net, dev, tb, data, extack);
if (!err) {
ip6erspan_tnl_link_config(nt, !tb[IFLA_MTU]);
ip6erspan_tunnel_link_md(ign, nt);
@@ -2259,6 +2260,7 @@ static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[],
if (IS_ERR(t))
return PTR_ERR(t);
+ ip6erspan_set_version(data, &p);
ip6gre_tunnel_unlink_md(ign, t);
ip6gre_tunnel_unlink(ign, t);
ip6erspan_tnl_change(t, &p, !tb[IFLA_MTU]);
@@ -2368,7 +2370,7 @@ static void __exit ip6gre_fini(void)
module_init(ip6gre_init);
module_exit(ip6gre_fini);
MODULE_LICENSE("GPL");
-MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
+MODULE_AUTHOR("D. Kozlov <xeb@mail.ru>");
MODULE_DESCRIPTION("GRE over IPv6 tunneling device");
MODULE_ALIAS_RTNL_LINK("ip6gre");
MODULE_ALIAS_RTNL_LINK("ip6gretap");
diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c
index 02045494c24c..233914b63bdb 100644
--- a/net/ipv6/ip6_icmp.c
+++ b/net/ipv6/ip6_icmp.c
@@ -9,6 +9,8 @@
#if IS_ENABLED(CONFIG_IPV6)
+#if !IS_BUILTIN(CONFIG_IPV6)
+
static ip6_icmp_send_t __rcu *ip6_icmp_send;
int inet6_register_icmp_sender(ip6_icmp_send_t *fn)
@@ -31,18 +33,54 @@ int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn)
}
EXPORT_SYMBOL(inet6_unregister_icmp_sender);
-void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info)
+void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
+ const struct inet6_skb_parm *parm)
{
ip6_icmp_send_t *send;
rcu_read_lock();
send = rcu_dereference(ip6_icmp_send);
+ if (send)
+ send(skb, type, code, info, NULL, parm);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(__icmpv6_send);
+#endif
+
+#if IS_ENABLED(CONFIG_NF_NAT)
+#include <net/netfilter/nf_conntrack.h>
+void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info)
+{
+ struct inet6_skb_parm parm = { 0 };
+ struct sk_buff *cloned_skb = NULL;
+ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ struct in6_addr orig_ip;
+ struct nf_conn *ct;
- if (!send)
+ ct = nf_ct_get(skb_in, &ctinfo);
+ if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) {
+ __icmpv6_send(skb_in, type, code, info, &parm);
+ return;
+ }
+
+ if (skb_shared(skb_in))
+ skb_in = cloned_skb = skb_clone(skb_in, GFP_ATOMIC);
+
+ if (unlikely(!skb_in || skb_network_header(skb_in) < skb_in->head ||
+ (skb_network_header(skb_in) + sizeof(struct ipv6hdr)) >
+ skb_tail_pointer(skb_in) || skb_ensure_writable(skb_in,
+ skb_network_offset(skb_in) + sizeof(struct ipv6hdr))))
goto out;
- send(skb, type, code, info, NULL);
+
+ orig_ip = ipv6_hdr(skb_in)->saddr;
+ dir = CTINFO2DIR(ctinfo);
+ ipv6_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.in6;
+ __icmpv6_send(skb_in, type, code, info, &parm);
+ ipv6_hdr(skb_in)->saddr = orig_ip;
out:
- rcu_read_unlock();
+ consume_skb(cloned_skb);
}
-EXPORT_SYMBOL(icmpv6_send);
+EXPORT_SYMBOL(icmpv6_ndo_send);
+#endif
#endif
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 6242682be876..168ec07e31cc 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 input
* Linux INET6 implementation
@@ -7,11 +8,6 @@
* Ian P. Morris <I.P.Morris@soton.ac.uk>
*
* Based in linux/net/ipv4/ip_input.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/* Changes
*
@@ -29,12 +25,14 @@
#include <linux/icmpv6.h>
#include <linux/mroute6.h>
#include <linux/slab.h>
+#include <linux/indirect_call_wrapper.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
#include <net/sock.h>
#include <net/snmp.h>
+#include <net/udp.h>
#include <net/ipv6.h>
#include <net/protocol.h>
@@ -50,15 +48,20 @@
static void ip6_rcv_finish_core(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
- void (*edemux)(struct sk_buff *skb);
-
- if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
- const struct inet6_protocol *ipprot;
-
- ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
- if (ipprot && (edemux = READ_ONCE(ipprot->early_demux)))
- edemux(skb);
+ if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
+ !skb_dst(skb) && !skb->sk) {
+ switch (ipv6_hdr(skb)->nexthdr) {
+ case IPPROTO_TCP:
+ if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux))
+ tcp_v6_early_demux(skb);
+ break;
+ case IPPROTO_UDP:
+ if (READ_ONCE(net->ipv4.sysctl_udp_early_demux))
+ udp_v6_early_demux(skb);
+ break;
+ }
}
+
if (!skb_valid_dst(skb))
ip6_route_input(skb);
}
@@ -80,31 +83,55 @@ static void ip6_sublist_rcv_finish(struct list_head *head)
{
struct sk_buff *skb, *next;
- list_for_each_entry_safe(skb, next, head, list)
+ list_for_each_entry_safe(skb, next, head, list) {
+ skb_list_del_init(skb);
dst_input(skb);
+ }
+}
+
+static bool ip6_can_use_hint(const struct sk_buff *skb,
+ const struct sk_buff *hint)
+{
+ return hint && !skb_dst(skb) &&
+ ipv6_addr_equal(&ipv6_hdr(hint)->daddr, &ipv6_hdr(skb)->daddr);
+}
+
+static struct sk_buff *ip6_extract_route_hint(const struct net *net,
+ struct sk_buff *skb)
+{
+ if (fib6_routes_require_src(net) || fib6_has_custom_rules(net) ||
+ IP6CB(skb)->flags & IP6SKB_MULTIPATH)
+ return NULL;
+
+ return skb;
}
static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
struct list_head *head)
{
+ struct sk_buff *skb, *next, *hint = NULL;
struct dst_entry *curr_dst = NULL;
- struct sk_buff *skb, *next;
- struct list_head sublist;
+ LIST_HEAD(sublist);
- INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
struct dst_entry *dst;
- list_del(&skb->list);
+ skb_list_del_init(skb);
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
*/
skb = l3mdev_ip6_rcv(skb);
if (!skb)
continue;
- ip6_rcv_finish_core(net, sk, skb);
+
+ if (ip6_can_use_hint(skb, hint))
+ skb_dst_copy(skb, hint);
+ else
+ ip6_rcv_finish_core(net, sk, skb);
dst = skb_dst(skb);
if (curr_dst != dst) {
+ hint = ip6_extract_route_hint(net, skb);
+
/* dispatch old sublist */
if (!list_empty(&sublist))
ip6_sublist_rcv_finish(&sublist);
@@ -121,12 +148,14 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
struct net *net)
{
+ enum skb_drop_reason reason;
const struct ipv6hdr *hdr;
u32 pkt_len;
struct inet6_dev *idev;
if (skb->pkt_type == PACKET_OTHERHOST) {
- kfree_skb(skb);
+ dev_core_stats_rx_otherhost_dropped_inc(skb->dev);
+ kfree_skb_reason(skb, SKB_DROP_REASON_OTHERHOST);
return NULL;
}
@@ -136,9 +165,12 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
__IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_IN, skb->len);
+ SKB_DR_SET(reason, NOT_SPECIFIED);
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL ||
- !idev || unlikely(idev->cnf.disable_ipv6)) {
+ !idev || unlikely(READ_ONCE(idev->cnf.disable_ipv6))) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
+ if (idev && unlikely(READ_ONCE(idev->cnf.disable_ipv6)))
+ SKB_DR_SET(reason, IPV6DISABLED);
goto drop;
}
@@ -155,15 +187,19 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
* arrived via the sending interface (ethX), because of the
* nature of scoping architecture. --yoshfuji
*/
- IP6CB(skb)->iif = skb_valid_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex;
+ IP6CB(skb)->iif = skb_valid_dst(skb) ?
+ ip6_dst_idev(skb_dst(skb))->dev->ifindex :
+ dev->ifindex;
if (unlikely(!pskb_may_pull(skb, sizeof(*hdr))))
goto err;
hdr = ipv6_hdr(skb);
- if (hdr->version != 6)
+ if (hdr->version != 6) {
+ SKB_DR_SET(reason, UNHANDLED_PROTO);
goto err;
+ }
__IP6_ADD_STATS(net, idev,
IPSTATS_MIB_NOECTPKTS +
@@ -178,7 +214,8 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
*/
if ((ipv6_addr_loopback(&hdr->saddr) ||
ipv6_addr_loopback(&hdr->daddr)) &&
- !(dev->flags & IFF_LOOPBACK))
+ !(dev->flags & IFF_LOOPBACK) &&
+ !netif_is_l3_master(dev))
goto err;
/* RFC4291 Errata ID: 3480
@@ -200,8 +237,10 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
if (!ipv6_addr_is_multicast(&hdr->daddr) &&
(skb->pkt_type == PACKET_BROADCAST ||
skb->pkt_type == PACKET_MULTICAST) &&
- idev->cnf.drop_unicast_in_l2_multicast)
+ READ_ONCE(idev->cnf.drop_unicast_in_l2_multicast)) {
+ SKB_DR_SET(reason, UNICAST_IN_L2_MULTICAST);
goto err;
+ }
/* RFC4291 2.7
* Nodes must not originate a packet to a multicast address whose scope
@@ -230,12 +269,11 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
if (pkt_len + sizeof(struct ipv6hdr) > skb->len) {
__IP6_INC_STATS(net,
idev, IPSTATS_MIB_INTRUNCATEDPKTS);
+ SKB_DR_SET(reason, PKT_TOO_SMALL);
goto drop;
}
- if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) {
- __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
- goto drop;
- }
+ if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
+ goto err;
hdr = ipv6_hdr(skb);
}
@@ -250,14 +288,16 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
rcu_read_unlock();
/* Must drop socket now because of tproxy. */
- skb_orphan(skb);
+ if (!skb_sk_is_prefetched(skb))
+ skb_orphan(skb);
return skb;
err:
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
+ SKB_DR_OR(reason, IP_INHDR);
drop:
rcu_read_unlock();
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return NULL;
}
@@ -288,14 +328,13 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
struct net_device *curr_dev = NULL;
struct net *curr_net = NULL;
struct sk_buff *skb, *next;
- struct list_head sublist;
+ LIST_HEAD(sublist);
- INIT_LIST_HEAD(&sublist);
list_for_each_entry_safe(skb, next, head, list) {
struct net_device *dev = skb->dev;
struct net *net = dev_net(dev);
- list_del(&skb->list);
+ skb_list_del_init(skb);
skb = ip6_rcv_core(skb, dev, net);
if (skb == NULL)
continue;
@@ -312,34 +351,36 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
list_add_tail(&skb->list, &sublist);
}
/* dispatch final sublist */
- ip6_sublist_rcv(&sublist, curr_dev, curr_net);
+ if (!list_empty(&sublist))
+ ip6_sublist_rcv(&sublist, curr_dev, curr_net);
}
+INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *));
+
/*
* Deliver the packet to the host
*/
-
-
-static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr,
+ bool have_final)
{
const struct inet6_protocol *ipprot;
struct inet6_dev *idev;
unsigned int nhoff;
- int nexthdr;
+ SKB_DR(reason);
bool raw;
- bool have_final = false;
/*
* Parse extension headers
*/
- rcu_read_lock();
resubmit:
idev = ip6_dst_idev(skb_dst(skb));
- if (!pskb_pull(skb, skb_transport_offset(skb)))
- goto discard;
nhoff = IP6CB(skb)->nhoff;
- nexthdr = skb_network_header(skb)[nhoff];
+ if (!have_final) {
+ if (!pskb_pull(skb, skb_transport_offset(skb)))
+ goto discard;
+ nexthdr = skb_network_header(skb)[nhoff];
+ }
resubmit_final:
raw = raw6_local_deliver(skb, nexthdr);
@@ -358,29 +399,44 @@ resubmit_final:
}
} else if (ipprot->flags & INET6_PROTO_FINAL) {
const struct ipv6hdr *hdr;
+ int sdif = inet6_sdif(skb);
+ struct net_device *dev;
/* Only do this once for first final protocol */
have_final = true;
- /* Free reference early: we don't need it any more,
- and it may hold ip_conntrack module loaded
- indefinitely. */
- nf_reset(skb);
skb_postpull_rcsum(skb, skb_network_header(skb),
skb_network_header_len(skb));
hdr = ipv6_hdr(skb);
+
+ /* skb->dev passed may be master dev for vrfs. */
+ if (sdif) {
+ dev = dev_get_by_index_rcu(net, sdif);
+ if (!dev)
+ goto discard;
+ } else {
+ dev = skb->dev;
+ }
+
if (ipv6_addr_is_multicast(&hdr->daddr) &&
- !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr,
- &hdr->saddr) &&
- !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb)))
+ !ipv6_chk_mcast_addr(dev, &hdr->daddr,
+ &hdr->saddr) &&
+ !ipv6_is_mld(skb, nexthdr, skb_network_header_len(skb))) {
+ SKB_DR_SET(reason, IP_INADDRERRORS);
goto discard;
+ }
+ }
+ if (!(ipprot->flags & INET6_PROTO_NOPOLICY)) {
+ if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ SKB_DR_SET(reason, XFRM_POLICY);
+ goto discard;
+ }
+ nf_reset_ct(skb);
}
- if (!(ipprot->flags & INET6_PROTO_NOPOLICY) &&
- !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
- goto discard;
- ret = ipprot->handler(skb);
+ ret = INDIRECT_CALL_2(ipprot->handler, tcp_v6_rcv, udpv6_rcv,
+ skb);
if (ret > 0) {
if (ipprot->flags & INET6_PROTO_FINAL) {
/* Not an extension header, most likely UDP
@@ -403,49 +459,81 @@ resubmit_final:
IPSTATS_MIB_INUNKNOWNPROTOS);
icmpv6_send(skb, ICMPV6_PARAMPROB,
ICMPV6_UNK_NEXTHDR, nhoff);
+ SKB_DR_SET(reason, IP_NOPROTO);
+ } else {
+ SKB_DR_SET(reason, XFRM_POLICY);
}
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
} else {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDELIVERS);
consume_skb(skb);
}
}
- rcu_read_unlock();
- return 0;
+ return;
discard:
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
- rcu_read_unlock();
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
+}
+
+static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) {
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_INDISCARDS);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM);
+ return 0;
+ }
+
+ skb_clear_delivery_time(skb);
+ ip6_protocol_deliver_rcu(net, skb, 0, false);
+
return 0;
}
int ip6_input(struct sk_buff *skb)
{
- return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
- dev_net(skb->dev), NULL, skb, skb->dev, NULL,
- ip6_input_finish);
+ int res;
+
+ rcu_read_lock();
+ res = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
+ dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL,
+ ip6_input_finish);
+ rcu_read_unlock();
+
+ return res;
}
EXPORT_SYMBOL_GPL(ip6_input);
int ip6_mc_input(struct sk_buff *skb)
{
+ struct net_device *dev = skb->dev;
+ int sdif = inet6_sdif(skb);
const struct ipv6hdr *hdr;
bool deliver;
- __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev),
- __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST,
- skb->len);
+ __IP6_UPD_PO_STATS(skb_dst_dev_net_rcu(skb),
+ __in6_dev_get_safely(dev), IPSTATS_MIB_INMCAST,
+ skb->len);
+
+ /* skb->dev passed may be master dev for vrfs. */
+ if (sdif) {
+ dev = dev_get_by_index_rcu(dev_net_rcu(dev), sdif);
+ if (!dev) {
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+ }
hdr = ipv6_hdr(skb);
- deliver = ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL);
+ deliver = ipv6_chk_mcast_addr(dev, &hdr->daddr, NULL);
#ifdef CONFIG_IPV6_MROUTE
/*
* IPv6 multicast router mode is now supported ;)
*/
- if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding &&
+ if (atomic_read(&dev_net_rcu(skb->dev)->ipv6.devconf_all->mc_forwarding) &&
!(ipv6_addr_type(&hdr->daddr) &
(IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL)) &&
likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) {
@@ -486,22 +574,21 @@ int ip6_mc_input(struct sk_buff *skb)
/* unknown RA - process it normally */
}
- if (deliver)
+ if (deliver) {
skb2 = skb_clone(skb, GFP_ATOMIC);
- else {
+ } else {
skb2 = skb;
skb = NULL;
}
- if (skb2) {
+ if (skb2)
ip6_mr_input(skb2);
- }
}
out:
#endif
- if (likely(deliver))
+ if (likely(deliver)) {
ip6_input(skb);
- else {
+ } else {
/* discard */
kfree_skb(skb);
}
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index c7e495f12011..fce91183797a 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPV6 GSO/GRO offload support
* Linux INET6 implementation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -17,9 +13,64 @@
#include <net/protocol.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/gro.h>
+#include <net/gso.h>
#include "ip6_offload.h"
+/* All GRO functions are always builtin, except UDP over ipv6, which lives in
+ * the ipv6 module because it depends on the UDPv6 lookup function, so we need
+ * special care when ipv6 is built as a module
+ */
+#if IS_BUILTIN(CONFIG_IPV6)
+#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__)
+#else
+#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__)
+#endif
+
+#define indirect_call_gro_receive_l4(f2, f1, cb, head, skb) \
+({ \
+ unlikely(gro_recursion_inc_test(skb)) ? \
+ NAPI_GRO_CB(skb)->flush |= 1, NULL : \
+ INDIRECT_CALL_L4(cb, f2, f1, head, skb); \
+})
+
+static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto)
+{
+ const struct net_offload *ops = NULL;
+ struct ipv6_opt_hdr *opth;
+
+ for (;;) {
+ int len;
+
+ ops = rcu_dereference(inet6_offloads[proto]);
+
+ if (unlikely(!ops))
+ break;
+
+ if (!(ops->flags & INET6_PROTO_GSO_EXTHDR))
+ break;
+
+ opth = skb_gro_header(skb, off + sizeof(*opth), off);
+ if (unlikely(!opth))
+ break;
+
+ len = ipv6_optlen(opth);
+
+ opth = skb_gro_header(skb, off + len, off);
+ if (unlikely(!opth))
+ break;
+ proto = opth->nexthdr;
+
+ off += len;
+ }
+
+ skb_gro_pull(skb, off - skb_gro_receive_network_offset(skb));
+ return proto;
+}
+
static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto)
{
const struct net_offload *ops = NULL;
@@ -28,15 +79,13 @@ static int ipv6_gso_pull_exthdrs(struct sk_buff *skb, int proto)
struct ipv6_opt_hdr *opth;
int len;
- if (proto != NEXTHDR_HOP) {
- ops = rcu_dereference(inet6_offloads[proto]);
+ ops = rcu_dereference(inet6_offloads[proto]);
- if (unlikely(!ops))
- break;
+ if (unlikely(!ops))
+ break;
- if (!(ops->flags & INET6_PROTO_GSO_EXTHDR))
- break;
- }
+ if (!(ops->flags & INET6_PROTO_GSO_EXTHDR))
+ break;
if (unlikely(!pskb_may_pull(skb, 8)))
break;
@@ -61,7 +110,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct ipv6hdr *ipv6h;
const struct net_offload *ops;
- int proto;
+ int proto, err;
struct frag_hdr *fptr;
unsigned int payload_len;
u8 *prevhdr;
@@ -71,6 +120,9 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
bool gso_partial;
skb_reset_network_header(skb);
+ err = ipv6_hopopt_jumbo_remove(skb);
+ if (err)
+ return ERR_PTR(err);
nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
goto out;
@@ -96,8 +148,12 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
ops = rcu_dereference(inet6_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment)) {
- skb_reset_transport_header(skb);
+ if (!skb_reset_transport_header_careful(skb))
+ goto out;
+
segs = ops->callbacks.gso_segment(skb, features);
+ if (!segs)
+ skb->network_header = skb_mac_header(skb) + nhoff - skb->head;
}
if (IS_ERR_OR_NULL(segs))
@@ -149,13 +205,12 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph,
proto = iph->nexthdr;
for (;;) {
- if (proto != NEXTHDR_HOP) {
- *opps = rcu_dereference(inet6_offloads[proto]);
- if (unlikely(!(*opps)))
- break;
- if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR))
- break;
- }
+ *opps = rcu_dereference(inet6_offloads[proto]);
+ if (unlikely(!(*opps)))
+ break;
+ if (!((*opps)->flags & INET6_PROTO_GSO_EXTHDR))
+ break;
+
opth = (void *)opth + optlen;
optlen = ipv6_optlen(opth);
len += optlen;
@@ -164,8 +219,8 @@ static int ipv6_exthdrs_len(struct ipv6hdr *iph,
return len;
}
-static struct sk_buff *ipv6_gro_receive(struct list_head *head,
- struct sk_buff *skb)
+INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
const struct net_offload *ops;
struct sk_buff *pp = NULL;
@@ -179,41 +234,34 @@ static struct sk_buff *ipv6_gro_receive(struct list_head *head,
off = skb_gro_offset(skb);
hlen = off + sizeof(*iph);
- iph = skb_gro_header_fast(skb, off);
- if (skb_gro_header_hard(skb, hlen)) {
- iph = skb_gro_header_slow(skb, hlen, off);
- if (unlikely(!iph))
- goto out;
- }
+ iph = skb_gro_header(skb, hlen, off);
+ if (unlikely(!iph))
+ goto out;
- skb_set_network_header(skb, off);
- skb_gro_pull(skb, sizeof(*iph));
- skb_set_transport_header(skb, skb_gro_offset(skb));
+ NAPI_GRO_CB(skb)->network_offsets[NAPI_GRO_CB(skb)->encap_mark] = off;
- flush += ntohs(iph->payload_len) != skb_gro_len(skb);
+ flush += ntohs(iph->payload_len) != skb->len - hlen;
- rcu_read_lock();
proto = iph->nexthdr;
ops = rcu_dereference(inet6_offloads[proto]);
if (!ops || !ops->callbacks.gro_receive) {
- __pskb_pull(skb, skb_gro_offset(skb));
- skb_gro_frag0_invalidate(skb);
- proto = ipv6_gso_pull_exthdrs(skb, proto);
- skb_gro_pull(skb, -skb_transport_offset(skb));
- skb_reset_transport_header(skb);
- __skb_push(skb, skb_gro_offset(skb));
+ proto = ipv6_gro_pull_exthdrs(skb, hlen, proto);
ops = rcu_dereference(inet6_offloads[proto]);
if (!ops || !ops->callbacks.gro_receive)
- goto out_unlock;
+ goto out;
- iph = ipv6_hdr(skb);
+ iph = skb_gro_network_header(skb);
+ } else {
+ skb_gro_pull(skb, sizeof(*iph));
}
+ skb_set_transport_header(skb, skb_gro_offset(skb));
+
NAPI_GRO_CB(skb)->proto = proto;
flush--;
- nlen = skb_network_header_len(skb);
+ nlen = skb_gro_offset(skb) - off;
list_for_each_entry(p, head, list) {
const struct ipv6hdr *iph2;
@@ -229,34 +277,29 @@ static struct sk_buff *ipv6_gro_receive(struct list_head *head,
* XXX skbs on the gro_list have all been parsed and pulled
* already so we don't need to compare nlen
* (nlen != (sizeof(*iph2) + ipv6_exthdrs_len(iph2, &ops)))
- * memcmp() alone below is suffcient, right?
+ * memcmp() alone below is sufficient, right?
*/
if ((first_word & htonl(0xF00FFFFF)) ||
- memcmp(&iph->nexthdr, &iph2->nexthdr,
- nlen - offsetof(struct ipv6hdr, nexthdr))) {
+ !ipv6_addr_equal(&iph->saddr, &iph2->saddr) ||
+ !ipv6_addr_equal(&iph->daddr, &iph2->daddr) ||
+ iph->nexthdr != iph2->nexthdr) {
+not_same_flow:
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
- /* flush if Traffic Class fields are different */
- NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000));
- NAPI_GRO_CB(p)->flush |= flush;
-
- /* If the previous IP ID value was based on an atomic
- * datagram we can overwrite the value and ignore it.
- */
- if (NAPI_GRO_CB(skb)->is_atomic)
- NAPI_GRO_CB(p)->flush_id = 0;
+ if (unlikely(nlen > sizeof(struct ipv6hdr))) {
+ if (memcmp(iph + 1, iph2 + 1,
+ nlen - sizeof(struct ipv6hdr)))
+ goto not_same_flow;
+ }
}
- NAPI_GRO_CB(skb)->is_atomic = true;
NAPI_GRO_CB(skb)->flush |= flush;
skb_gro_postpull_rcsum(skb, iph, nlen);
- pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
-
-out_unlock:
- rcu_read_unlock();
+ pp = indirect_call_gro_receive_l4(tcp6_gro_receive, udp6_gro_receive,
+ ops->callbacks.gro_receive, head, skb);
out:
skb_gro_flush_final(skb, pp, flush);
@@ -294,30 +337,55 @@ static struct sk_buff *ip4ip6_gro_receive(struct list_head *head,
return inet_gro_receive(head, skb);
}
-static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
+INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
{
const struct net_offload *ops;
- struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
+ struct ipv6hdr *iph;
int err = -ENOSYS;
+ u32 payload_len;
if (skb->encapsulation) {
skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6));
skb_set_inner_network_header(skb, nhoff);
}
- iph->payload_len = htons(skb->len - nhoff - sizeof(*iph));
-
- rcu_read_lock();
+ payload_len = skb->len - nhoff - sizeof(*iph);
+ if (unlikely(payload_len > IPV6_MAXPLEN)) {
+ struct hop_jumbo_hdr *hop_jumbo;
+ int hoplen = sizeof(*hop_jumbo);
+
+ /* Move network header left */
+ memmove(skb_mac_header(skb) - hoplen, skb_mac_header(skb),
+ skb->transport_header - skb->mac_header);
+ skb->data -= hoplen;
+ skb->len += hoplen;
+ skb->mac_header -= hoplen;
+ skb->network_header -= hoplen;
+ iph = (struct ipv6hdr *)(skb->data + nhoff);
+ hop_jumbo = (struct hop_jumbo_hdr *)(iph + 1);
+
+ /* Build hop-by-hop options */
+ hop_jumbo->nexthdr = iph->nexthdr;
+ hop_jumbo->hdrlen = 0;
+ hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
+ hop_jumbo->tlv_len = 4;
+ hop_jumbo->jumbo_payload_len = htonl(payload_len + hoplen);
+
+ iph->nexthdr = NEXTHDR_HOP;
+ iph->payload_len = 0;
+ } else {
+ iph = (struct ipv6hdr *)(skb->data + nhoff);
+ iph->payload_len = htons(payload_len);
+ }
nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops);
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
- goto out_unlock;
-
- err = ops->callbacks.gro_complete(skb, nhoff);
+ goto out;
-out_unlock:
- rcu_read_unlock();
+ err = INDIRECT_CALL_L4(ops->callbacks.gro_complete, tcp6_gro_complete,
+ udp6_gro_complete, skb, nhoff);
+out:
return err;
}
@@ -342,18 +410,37 @@ static int ip4ip6_gro_complete(struct sk_buff *skb, int nhoff)
return inet_gro_complete(skb, nhoff);
}
-static struct packet_offload ipv6_packet_offload __read_mostly = {
- .type = cpu_to_be16(ETH_P_IPV6),
- .callbacks = {
- .gso_segment = ipv6_gso_segment,
- .gro_receive = ipv6_gro_receive,
- .gro_complete = ipv6_gro_complete,
- },
-};
+
+static struct sk_buff *sit_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4))
+ return ERR_PTR(-EINVAL);
+
+ return ipv6_gso_segment(skb, features);
+}
+
+static struct sk_buff *ip4ip6_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP6))
+ return ERR_PTR(-EINVAL);
+
+ return inet_gso_segment(skb, features);
+}
+
+static struct sk_buff *ip6ip6_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP6))
+ return ERR_PTR(-EINVAL);
+
+ return ipv6_gso_segment(skb, features);
+}
static const struct net_offload sit_offload = {
.callbacks = {
- .gso_segment = ipv6_gso_segment,
+ .gso_segment = sit_gso_segment,
.gro_receive = sit_ip6ip6_gro_receive,
.gro_complete = sit_gro_complete,
},
@@ -361,7 +448,7 @@ static const struct net_offload sit_offload = {
static const struct net_offload ip4ip6_offload = {
.callbacks = {
- .gso_segment = inet_gso_segment,
+ .gso_segment = ip4ip6_gso_segment,
.gro_receive = ip4ip6_gro_receive,
.gro_complete = ip4ip6_gro_complete,
},
@@ -369,7 +456,7 @@ static const struct net_offload ip4ip6_offload = {
static const struct net_offload ip6ip6_offload = {
.callbacks = {
- .gso_segment = ipv6_gso_segment,
+ .gso_segment = ip6ip6_gso_segment,
.gro_receive = sit_ip6ip6_gro_receive,
.gro_complete = ip6ip6_gro_complete,
},
@@ -382,7 +469,15 @@ static int __init ipv6_offload_init(void)
if (ipv6_exthdrs_offload_init() < 0)
pr_crit("%s: Cannot add EXTHDRS protocol offload\n", __func__);
- dev_add_offload(&ipv6_packet_offload);
+ net_hotdata.ipv6_packet_offload = (struct packet_offload) {
+ .type = cpu_to_be16(ETH_P_IPV6),
+ .callbacks = {
+ .gso_segment = ipv6_gso_segment,
+ .gro_receive = ipv6_gro_receive,
+ .gro_complete = ipv6_gro_complete,
+ },
+ };
+ dev_add_offload(&net_hotdata.ipv6_packet_offload);
inet_add_offload(&sit_offload, IPPROTO_IPV6);
inet6_add_offload(&ip6ip6_offload, IPPROTO_IPV6);
diff --git a/net/ipv6/ip6_offload.h b/net/ipv6/ip6_offload.h
index 96b40e41ac53..e768987604f1 100644
--- a/net/ipv6/ip6_offload.h
+++ b/net/ipv6/ip6_offload.h
@@ -1,11 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* IPV6 GSO/GRO offload support
* Linux INET6 implementation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#ifndef __ip6_offload_h
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index f9f8f554d141..f904739e99b9 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 output functions
* Linux INET6 implementation
@@ -7,11 +8,6 @@
*
* Based on linux/net/ipv4/ip_output.c
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
* A.N.Kuznetsov : airthmetics in fragmentation.
* extension headers are implemented.
@@ -46,6 +42,7 @@
#include <net/sock.h>
#include <net/snmp.h>
+#include <net/gso.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
@@ -58,23 +55,36 @@
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
+#include <net/ip_tunnels.h>
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- struct net_device *dev = dst->dev;
+ struct net_device *dev = dst_dev_rcu(dst);
+ struct inet6_dev *idev = ip6_dst_idev(dst);
+ unsigned int hh_len = LL_RESERVED_SPACE(dev);
+ const struct in6_addr *daddr, *nexthop;
+ struct ipv6hdr *hdr;
struct neighbour *neigh;
- struct in6_addr *nexthop;
int ret;
- if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
- struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+ /* Be paranoid, rather than too clever. */
+ if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
+ /* idev stays alive because we hold rcu_read_lock(). */
+ skb = skb_expand_head(skb, hh_len);
+ if (!skb) {
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
+ return -ENOMEM;
+ }
+ }
+ hdr = ipv6_hdr(skb);
+ daddr = &hdr->daddr;
+ if (ipv6_addr_is_multicast(daddr)) {
if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
((mroute6_is_socket(net, skb) &&
!(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
- ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
- &ipv6_hdr(skb)->saddr))) {
+ ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
/* Do not check for IFF_ALLMULTI; multicast routing
@@ -85,7 +95,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
net, sk, newskb, NULL, newskb->dev,
dev_loopback_xmit);
- if (ipv6_hdr(skb)->hop_limit == 0) {
+ if (hdr->hop_limit == 0) {
IP6_INC_STATS(net, idev,
IPSTATS_MIB_OUTDISCARDS);
kfree_skb(skb);
@@ -94,9 +104,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
}
IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
-
- if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
- IPV6_ADDR_SCOPE_NODELOCAL &&
+ if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
!(dev->flags & IFF_LOOPBACK)) {
kfree_skb(skb);
return 0;
@@ -106,131 +114,218 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
if (lwtunnel_xmit_redirect(dst->lwtstate)) {
int res = lwtunnel_xmit(skb);
- if (res < 0 || res == LWTUNNEL_XMIT_DONE)
+ if (res != LWTUNNEL_XMIT_CONTINUE)
return res;
}
- rcu_read_lock_bh();
- nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
- neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
- if (unlikely(!neigh))
- neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
- if (!IS_ERR(neigh)) {
- sock_confirm_neigh(skb, neigh);
- ret = neigh_output(neigh, skb);
- rcu_read_unlock_bh();
- return ret;
- }
- rcu_read_unlock_bh();
+ IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
- IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
- kfree_skb(skb);
- return -EINVAL;
+ nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
+ neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
+
+ if (IS_ERR_OR_NULL(neigh)) {
+ if (unlikely(!neigh))
+ neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
+ if (IS_ERR(neigh)) {
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
+ kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
+ return -EINVAL;
+ }
+ }
+ sock_confirm_neigh(skb, neigh);
+ ret = neigh_output(neigh, skb, false);
+ return ret;
}
-static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int
+ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
+ struct sk_buff *skb, unsigned int mtu)
{
- int ret;
+ struct sk_buff *segs, *nskb;
+ netdev_features_t features;
+ int ret = 0;
- ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
- if (ret) {
+ /* Please see corresponding comment in ip_finish_output_gso
+ * describing the cases where GSO segment length exceeds the
+ * egress MTU.
+ */
+ features = netif_skb_features(skb);
+ segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
+ if (IS_ERR_OR_NULL(segs)) {
kfree_skb(skb);
- return ret;
+ return -ENOMEM;
+ }
+
+ consume_skb(skb);
+
+ skb_list_walk_safe(segs, segs, nskb) {
+ int err;
+
+ skb_mark_not_on_list(segs);
+ /* Last GSO segment can be smaller than gso_size (and MTU).
+ * Adding a fragment header would produce an "atomic fragment",
+ * which is considered harmful (RFC-8021). Avoid that.
+ */
+ err = segs->len > mtu ?
+ ip6_fragment(net, sk, segs, ip6_finish_output2) :
+ ip6_finish_output2(net, sk, segs);
+ if (err && ret == 0)
+ ret = err;
}
+ return ret;
+}
+
+static int ip6_finish_output_gso(struct net *net, struct sock *sk,
+ struct sk_buff *skb, unsigned int mtu)
+{
+ if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
+ !skb_gso_validate_network_len(skb, mtu))
+ return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
+
+ return ip6_finish_output2(net, sk, skb);
+}
+
+static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ unsigned int mtu;
+
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
- IPCB(skb)->flags |= IPSKB_REROUTED;
+ IP6CB(skb)->flags |= IP6SKB_REROUTED;
return dst_output(net, sk, skb);
}
#endif
- if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
- dst_allfrag(skb_dst(skb)) ||
+ mtu = ip6_skb_dst_mtu(skb);
+ if (skb_is_gso(skb))
+ return ip6_finish_output_gso(net, sk, skb, mtu);
+
+ if (skb->len > mtu ||
(IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
return ip6_fragment(net, sk, skb, ip6_finish_output2);
- else
- return ip6_finish_output2(net, sk, skb);
+
+ return ip6_finish_output2(net, sk, skb);
+}
+
+static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+
+ ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
+ switch (ret) {
+ case NET_XMIT_SUCCESS:
+ case NET_XMIT_CN:
+ return __ip6_finish_output(net, sk, skb) ? : ret;
+ default:
+ kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
+ return ret;
+ }
}
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net_device *dev = skb_dst(skb)->dev;
- struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+ struct dst_entry *dst = skb_dst(skb);
+ struct net_device *dev, *indev = skb->dev;
+ struct inet6_dev *idev;
+ int ret;
skb->protocol = htons(ETH_P_IPV6);
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
+ idev = ip6_dst_idev(dst);
skb->dev = dev;
- if (unlikely(idev->cnf.disable_ipv6)) {
+ if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
- kfree_skb(skb);
+ rcu_read_unlock();
+ kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
return 0;
}
- return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
- net, sk, skb, NULL, dev,
- ip6_finish_output,
- !(IP6CB(skb)->flags & IP6SKB_REROUTED));
+ ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
+ net, sk, skb, indev, dev,
+ ip6_finish_output,
+ !(IP6CB(skb)->flags & IP6SKB_REROUTED));
+ rcu_read_unlock();
+ return ret;
}
+EXPORT_SYMBOL(ip6_output);
-bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
+bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
{
- if (!np->autoflowlabel_set)
+ if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
return ip6_default_np_autolabel(net);
- else
- return np->autoflowlabel;
+ return inet6_test_bit(AUTOFLOWLABEL, sk);
}
/*
- * xmit an sk_buff (used by TCP, SCTP and DCCP)
+ * xmit an sk_buff (used by TCP and SCTP)
* Note : socket lock is not held for SYNACK packets, but might be modified
* by calls to skb_set_owner_w() and ipv6_local_error(),
* which are using proper atomic operations or spinlocks.
*/
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
- __u32 mark, struct ipv6_txoptions *opt, int tclass)
+ __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
- struct net *net = sock_net(sk);
const struct ipv6_pinfo *np = inet6_sk(sk);
struct in6_addr *first_hop = &fl6->daddr;
struct dst_entry *dst = skb_dst(skb);
+ struct inet6_dev *idev = ip6_dst_idev(dst);
+ struct hop_jumbo_hdr *hop_jumbo;
+ int hoplen = sizeof(*hop_jumbo);
+ struct net *net = sock_net(sk);
+ unsigned int head_room;
+ struct net_device *dev;
struct ipv6hdr *hdr;
u8 proto = fl6->flowi6_proto;
int seg_len = skb->len;
- int hlimit = -1;
+ int ret, hlimit = -1;
u32 mtu;
- if (opt) {
- unsigned int head_room;
+ rcu_read_lock();
- /* First: exthdrs may take lots of space (~8K for now)
- MAX_HEADER is not enough.
- */
- head_room = opt->opt_nflen + opt->opt_flen;
- seg_len += head_room;
- head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
-
- if (skb_headroom(skb) < head_room) {
- struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
- if (!skb2) {
- IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_OUTDISCARDS);
- kfree_skb(skb);
- return -ENOBUFS;
- }
- if (skb->sk)
- skb_set_owner_w(skb2, skb->sk);
- consume_skb(skb);
- skb = skb2;
+ dev = dst_dev_rcu(dst);
+ head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
+ if (opt)
+ head_room += opt->opt_nflen + opt->opt_flen;
+
+ if (unlikely(head_room > skb_headroom(skb))) {
+ /* idev stays alive while we hold rcu_read_lock(). */
+ skb = skb_expand_head(skb, head_room);
+ if (!skb) {
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
+ ret = -ENOBUFS;
+ goto unlock;
}
+ }
+
+ if (opt) {
+ seg_len += opt->opt_nflen + opt->opt_flen;
+
if (opt->opt_flen)
ipv6_push_frag_opts(skb, opt, &proto);
+
if (opt->opt_nflen)
ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
&fl6->saddr);
}
+ if (unlikely(seg_len > IPV6_MAXPLEN)) {
+ hop_jumbo = skb_push(skb, hoplen);
+
+ hop_jumbo->nexthdr = proto;
+ hop_jumbo->hdrlen = 0;
+ hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
+ hop_jumbo->tlv_len = 4;
+ hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
+
+ proto = IPPROTO_HOPOPTS;
+ seg_len = 0;
+ IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
+ }
+
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
hdr = ipv6_hdr(skb);
@@ -239,12 +334,12 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
* Fill in the IPv6 header
*/
if (np)
- hlimit = np->hop_limit;
+ hlimit = READ_ONCE(np->hop_limit);
if (hlimit < 0)
hlimit = ip6_dst_hoplimit(dst);
ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
- ip6_autoflowlabel(net, np), fl6));
+ ip6_autoflowlabel(net, sk), fl6));
hdr->payload_len = htons(seg_len);
hdr->nexthdr = proto;
@@ -254,38 +349,43 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
hdr->daddr = *first_hop;
skb->protocol = htons(ETH_P_IPV6);
- skb->priority = sk->sk_priority;
+ skb->priority = priority;
skb->mark = mark;
mtu = dst_mtu(dst);
if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
- IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_OUT, skb->len);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
/* if egress device is enslaved to an L3 master device pass the
* skb to its handler for processing
*/
skb = l3mdev_ip6_out((struct sock *)sk, skb);
- if (unlikely(!skb))
- return 0;
+ if (unlikely(!skb)) {
+ ret = 0;
+ goto unlock;
+ }
/* hooks should never assume socket lock is held.
* we promote our socket to non const
*/
- return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
- net, (struct sock *)sk, skb, NULL, dst->dev,
- dst_output);
+ ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+ net, (struct sock *)sk, skb, NULL, dev,
+ dst_output);
+ goto unlock;
}
- skb->dev = dst->dev;
+ ret = -EMSGSIZE;
+ skb->dev = dev;
/* ipv6_local_error() does not require socket lock,
* we promote our socket to non const
*/
ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
- IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
kfree_skb(skb);
- return -EMSGSIZE;
+unlock:
+ rcu_read_unlock();
+ return ret;
}
EXPORT_SYMBOL(ip6_xmit);
@@ -300,6 +400,11 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
if (sk && ra->sel == sel &&
(!sk->sk_bound_dev_if ||
sk->sk_bound_dev_if == skb->dev->ifindex)) {
+
+ if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
+ !net_eq(sock_net(sk), dev_net(skb->dev))) {
+ continue;
+ }
if (last) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
@@ -373,11 +478,14 @@ static int ip6_forward_proxy_check(struct sk_buff *skb)
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
-
- __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
- __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
+#ifdef CONFIG_NET_SWITCHDEV
+ if (skb->offload_l3_fwd_mark) {
+ consume_skb(skb);
+ return 0;
+ }
+#endif
+ skb_clear_tstamp(skb);
return dst_output(net, sk, skb);
}
@@ -401,14 +509,18 @@ static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
int ip6_forward(struct sk_buff *skb)
{
- struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
struct dst_entry *dst = skb_dst(skb);
struct ipv6hdr *hdr = ipv6_hdr(skb);
struct inet6_skb_parm *opt = IP6CB(skb);
- struct net *net = dev_net(dst->dev);
+ struct net *net = dev_net(dst_dev(dst));
+ struct net_device *dev;
+ struct inet6_dev *idev;
+ SKB_DR(reason);
u32 mtu;
- if (net->ipv6.devconf_all->forwarding == 0)
+ idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
+ if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
+ (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
goto error;
if (skb->pkt_type != PACKET_HOST)
@@ -420,7 +532,9 @@ int ip6_forward(struct sk_buff *skb)
if (skb_warn_if_lro(skb))
goto drop;
- if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
+ if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
+ (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
+ !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -449,22 +563,34 @@ int ip6_forward(struct sk_buff *skb)
* check and decrement ttl
*/
if (hdr->hop_limit <= 1) {
- /* Force OUTPUT device used as source address */
- skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
return -ETIMEDOUT;
}
/* XXX: idev->cnf.proxy_ndp? */
- if (net->ipv6.devconf_all->proxy_ndp &&
- pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
+ if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
+ pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
int proxied = ip6_forward_proxy_check(skb);
- if (proxied > 0)
+ if (proxied > 0) {
+ /* It's tempting to decrease the hop limit
+ * here by 1, as we do at the end of the
+ * function too.
+ *
+ * But that would be incorrect, as proxying is
+ * not forwarding. The ip6_input function
+ * will handle this packet locally, and it
+ * depends on the hop limit being unchanged.
+ *
+ * One example is the NDP hop limit, that
+ * always has to stay 255, but other would be
+ * similar checks around RA packets, where the
+ * user can even change the desired limit.
+ */
return ip6_input(skb);
- else if (proxied < 0) {
+ } else if (proxied < 0) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
goto drop;
}
@@ -472,15 +598,16 @@ int ip6_forward(struct sk_buff *skb)
if (!xfrm6_route_forward(skb)) {
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
+ SKB_DR_SET(reason, XFRM_POLICY);
goto drop;
}
dst = skb_dst(skb);
-
+ dev = dst_dev(dst);
/* IPv6 specs say nothing about it, but it is clear that we cannot
send redirects to source routed frames.
We don't send redirects to frames decapsulated from IPsec.
*/
- if (IP6CB(skb)->iif == dst->dev->ifindex &&
+ if (IP6CB(skb)->iif == dev->ifindex &&
opt->srcrt == 0 && !skb_sec_path(skb)) {
struct in6_addr *target = NULL;
struct inet_peer *peer;
@@ -491,21 +618,21 @@ int ip6_forward(struct sk_buff *skb)
* send a redirect.
*/
- rt = (struct rt6_info *) dst;
+ rt = dst_rt6_info(dst);
if (rt->rt6i_flags & RTF_GATEWAY)
target = &rt->rt6i_gateway;
else
target = &hdr->daddr;
- peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
+ rcu_read_lock();
+ peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);
/* Limit redirects both by destination (here)
and by source (inside ndisc_send_redirect)
*/
if (inet_peer_xrlim_allow(peer, 1*HZ))
ndisc_send_redirect(skb, target);
- if (peer)
- inet_putpeer(peer);
+ rcu_read_unlock();
} else {
int addrtype = ipv6_addr_type(&hdr->saddr);
@@ -520,22 +647,24 @@ int ip6_forward(struct sk_buff *skb)
}
}
- mtu = ip6_dst_mtu_forward(dst);
+ __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
+
+ mtu = ip6_dst_mtu_maybe_forward(dst, true);
if (mtu < IPV6_MIN_MTU)
mtu = IPV6_MIN_MTU;
if (ip6_pkt_too_big(skb, mtu)) {
/* Again, force OUTPUT device used as source address */
- skb->dev = dst->dev;
+ skb->dev = dev;
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
__IP6_INC_STATS(net, ip6_dst_idev(dst),
IPSTATS_MIB_FRAGFAILS);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
return -EMSGSIZE;
}
- if (skb_cow(skb, dst->dev->hard_header_len)) {
+ if (skb_cow(skb, dev->hard_header_len)) {
__IP6_INC_STATS(net, ip6_dst_idev(dst),
IPSTATS_MIB_OUTDISCARDS);
goto drop;
@@ -548,13 +677,14 @@ int ip6_forward(struct sk_buff *skb)
hdr->hop_limit--;
return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
- net, NULL, skb, skb->dev, dst->dev,
+ net, NULL, skb, skb->dev, dev,
ip6_forward_finish);
error:
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
+ SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return -EINVAL;
}
@@ -574,22 +704,186 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->tc_index = from->tc_index;
#endif
nf_copy(to, from);
+ skb_ext_copy(to, from);
skb_copy_secmark(to, from);
}
+int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
+ u8 nexthdr, __be32 frag_id,
+ struct ip6_fraglist_iter *iter)
+{
+ unsigned int first_len;
+ struct frag_hdr *fh;
+
+ /* BUILD HEADER */
+ *prevhdr = NEXTHDR_FRAGMENT;
+ iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
+ if (!iter->tmp_hdr)
+ return -ENOMEM;
+
+ iter->frag = skb_shinfo(skb)->frag_list;
+ skb_frag_list_init(skb);
+
+ iter->offset = 0;
+ iter->hlen = hlen;
+ iter->frag_id = frag_id;
+ iter->nexthdr = nexthdr;
+
+ __skb_pull(skb, hlen);
+ fh = __skb_push(skb, sizeof(struct frag_hdr));
+ __skb_push(skb, hlen);
+ skb_reset_network_header(skb);
+ memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
+
+ fh->nexthdr = nexthdr;
+ fh->reserved = 0;
+ fh->frag_off = htons(IP6_MF);
+ fh->identification = frag_id;
+
+ first_len = skb_pagelen(skb);
+ skb->data_len = first_len - skb_headlen(skb);
+ skb->len = first_len;
+ ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
+
+ return 0;
+}
+EXPORT_SYMBOL(ip6_fraglist_init);
+
+void ip6_fraglist_prepare(struct sk_buff *skb,
+ struct ip6_fraglist_iter *iter)
+{
+ struct sk_buff *frag = iter->frag;
+ unsigned int hlen = iter->hlen;
+ struct frag_hdr *fh;
+
+ frag->ip_summed = CHECKSUM_NONE;
+ skb_reset_transport_header(frag);
+ fh = __skb_push(frag, sizeof(struct frag_hdr));
+ __skb_push(frag, hlen);
+ skb_reset_network_header(frag);
+ memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
+ iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
+ fh->nexthdr = iter->nexthdr;
+ fh->reserved = 0;
+ fh->frag_off = htons(iter->offset);
+ if (frag->next)
+ fh->frag_off |= htons(IP6_MF);
+ fh->identification = iter->frag_id;
+ ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
+ ip6_copy_metadata(frag, skb);
+}
+EXPORT_SYMBOL(ip6_fraglist_prepare);
+
+void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
+ unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
+ u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
+{
+ state->prevhdr = prevhdr;
+ state->nexthdr = nexthdr;
+ state->frag_id = frag_id;
+
+ state->hlen = hlen;
+ state->mtu = mtu;
+
+ state->left = skb->len - hlen; /* Space per frame */
+ state->ptr = hlen; /* Where to start from */
+
+ state->hroom = hdr_room;
+ state->troom = needed_tailroom;
+
+ state->offset = 0;
+}
+EXPORT_SYMBOL(ip6_frag_init);
+
+struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
+{
+ u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
+ struct sk_buff *frag;
+ struct frag_hdr *fh;
+ unsigned int len;
+
+ len = state->left;
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > state->mtu)
+ len = state->mtu;
+ /* IF: we are not sending up to and including the packet end
+ then align the next start on an eight byte boundary */
+ if (len < state->left)
+ len &= ~7;
+
+ /* Allocate buffer */
+ frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
+ state->hroom + state->troom, GFP_ATOMIC);
+ if (!frag)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * Set up data on packet
+ */
+
+ ip6_copy_metadata(frag, skb);
+ skb_reserve(frag, state->hroom);
+ skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
+ skb_reset_network_header(frag);
+ fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
+ frag->transport_header = (frag->network_header + state->hlen +
+ sizeof(struct frag_hdr));
+
+ /*
+ * Charge the memory for the fragment to any owner
+ * it might possess
+ */
+ if (skb->sk)
+ skb_set_owner_w(frag, skb->sk);
+
+ /*
+ * Copy the packet header into the new buffer.
+ */
+ skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
+
+ fragnexthdr_offset = skb_network_header(frag);
+ fragnexthdr_offset += prevhdr - skb_network_header(skb);
+ *fragnexthdr_offset = NEXTHDR_FRAGMENT;
+
+ /*
+ * Build fragment header.
+ */
+ fh->nexthdr = state->nexthdr;
+ fh->reserved = 0;
+ fh->identification = state->frag_id;
+
+ /*
+ * Copy a block of the IP datagram.
+ */
+ BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
+ len));
+ state->left -= len;
+
+ fh->frag_off = htons(state->offset);
+ if (state->left > 0)
+ fh->frag_off |= htons(IP6_MF);
+ ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
+
+ state->ptr += len;
+ state->offset += len;
+
+ return frag;
+}
+EXPORT_SYMBOL(ip6_frag_next);
+
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
int (*output)(struct net *, struct sock *, struct sk_buff *))
{
struct sk_buff *frag;
- struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+ struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
inet6_sk(skb->sk) : NULL;
- struct ipv6hdr *tmp_hdr;
- struct frag_hdr *fh;
- unsigned int mtu, hlen, left, len;
- int hroom, troom;
+ u8 tstamp_type = skb->tstamp_type;
+ struct ip6_frag_state state;
+ unsigned int mtu, hlen, nexthdr_offset;
+ ktime_t tstamp = skb->tstamp;
+ int hroom, err = 0;
__be32 frag_id;
- int ptr, offset = 0, err = 0;
u8 *prevhdr, nexthdr = 0;
err = ip6_find_1stfragopt(skb, &prevhdr);
@@ -597,6 +891,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
goto fail;
hlen = err;
nexthdr = *prevhdr;
+ nexthdr_offset = prevhdr - skb_network_header(skb);
mtu = ip6_skb_dst_mtu(skb);
@@ -616,9 +911,11 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
mtu = IPV6_MIN_MTU;
}
- if (np && np->frag_size < mtu) {
- if (np->frag_size)
- mtu = np->frag_size;
+ if (np) {
+ u32 frag_size = READ_ONCE(np->frag_size);
+
+ if (frag_size && frag_size < mtu)
+ mtu = frag_size;
}
if (mtu < hlen + sizeof(struct frag_hdr) + 8)
goto fail_toobig;
@@ -631,9 +928,11 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
(err = skb_checksum_help(skb)))
goto fail;
+ prevhdr = skb_network_header(skb) + nexthdr_offset;
hroom = LL_RESERVED_SPACE(rt->dst.dev);
if (skb_has_frag_list(skb)) {
unsigned int first_len = skb_pagelen(skb);
+ struct ip6_fraglist_iter iter;
struct sk_buff *frag2;
if (first_len - hlen > mtu ||
@@ -661,85 +960,46 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
skb->truesize -= frag->truesize;
}
- err = 0;
- offset = 0;
- /* BUILD HEADER */
-
- *prevhdr = NEXTHDR_FRAGMENT;
- tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
- if (!tmp_hdr) {
- err = -ENOMEM;
+ err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
+ &iter);
+ if (err < 0)
goto fail;
- }
- frag = skb_shinfo(skb)->frag_list;
- skb_frag_list_init(skb);
-
- __skb_pull(skb, hlen);
- fh = __skb_push(skb, sizeof(struct frag_hdr));
- __skb_push(skb, hlen);
- skb_reset_network_header(skb);
- memcpy(skb_network_header(skb), tmp_hdr, hlen);
-
- fh->nexthdr = nexthdr;
- fh->reserved = 0;
- fh->frag_off = htons(IP6_MF);
- fh->identification = frag_id;
-
- first_len = skb_pagelen(skb);
- skb->data_len = first_len - skb_headlen(skb);
- skb->len = first_len;
- ipv6_hdr(skb)->payload_len = htons(first_len -
- sizeof(struct ipv6hdr));
+
+ /* We prevent @rt from being freed. */
+ rcu_read_lock();
for (;;) {
/* Prepare header of the next frame,
* before previous one went down. */
- if (frag) {
- frag->ip_summed = CHECKSUM_NONE;
- skb_reset_transport_header(frag);
- fh = __skb_push(frag, sizeof(struct frag_hdr));
- __skb_push(frag, hlen);
- skb_reset_network_header(frag);
- memcpy(skb_network_header(frag), tmp_hdr,
- hlen);
- offset += skb->len - hlen - sizeof(struct frag_hdr);
- fh->nexthdr = nexthdr;
- fh->reserved = 0;
- fh->frag_off = htons(offset);
- if (frag->next)
- fh->frag_off |= htons(IP6_MF);
- fh->identification = frag_id;
- ipv6_hdr(frag)->payload_len =
- htons(frag->len -
- sizeof(struct ipv6hdr));
- ip6_copy_metadata(frag, skb);
- }
+ if (iter.frag)
+ ip6_fraglist_prepare(skb, &iter);
+ skb_set_delivery_time(skb, tstamp, tstamp_type);
err = output(net, sk, skb);
if (!err)
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
IPSTATS_MIB_FRAGCREATES);
- if (err || !frag)
+ if (err || !iter.frag)
break;
- skb = frag;
- frag = skb->next;
- skb->next = NULL;
+ skb = ip6_fraglist_next(&iter);
}
- kfree(tmp_hdr);
+ kfree(iter.tmp_hdr);
if (err == 0) {
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
IPSTATS_MIB_FRAGOKS);
+ rcu_read_unlock();
return 0;
}
- kfree_skb_list(frag);
+ kfree_skb_list(iter.frag);
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
IPSTATS_MIB_FRAGFAILS);
+ rcu_read_unlock();
return err;
slow_path_clean:
@@ -753,93 +1013,29 @@ slow_path_clean:
}
slow_path:
- left = skb->len - hlen; /* Space per frame */
- ptr = hlen; /* Where to start from */
-
/*
* Fragment the datagram.
*/
- troom = rt->dst.dev->needed_tailroom;
+ ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
+ LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
+ &state);
/*
* Keep copying data until we run out.
*/
- while (left > 0) {
- u8 *fragnexthdr_offset;
-
- len = left;
- /* IF: it doesn't fit, use 'mtu' - the data space left */
- if (len > mtu)
- len = mtu;
- /* IF: we are not sending up to and including the packet end
- then align the next start on an eight byte boundary */
- if (len < left) {
- len &= ~7;
- }
- /* Allocate buffer */
- frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
- hroom + troom, GFP_ATOMIC);
- if (!frag) {
- err = -ENOMEM;
+ while (state.left > 0) {
+ frag = ip6_frag_next(skb, &state);
+ if (IS_ERR(frag)) {
+ err = PTR_ERR(frag);
goto fail;
}
/*
- * Set up data on packet
- */
-
- ip6_copy_metadata(frag, skb);
- skb_reserve(frag, hroom);
- skb_put(frag, len + hlen + sizeof(struct frag_hdr));
- skb_reset_network_header(frag);
- fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
- frag->transport_header = (frag->network_header + hlen +
- sizeof(struct frag_hdr));
-
- /*
- * Charge the memory for the fragment to any owner
- * it might possess
- */
- if (skb->sk)
- skb_set_owner_w(frag, skb->sk);
-
- /*
- * Copy the packet header into the new buffer.
- */
- skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
-
- fragnexthdr_offset = skb_network_header(frag);
- fragnexthdr_offset += prevhdr - skb_network_header(skb);
- *fragnexthdr_offset = NEXTHDR_FRAGMENT;
-
- /*
- * Build fragment header.
- */
- fh->nexthdr = nexthdr;
- fh->reserved = 0;
- fh->identification = frag_id;
-
- /*
- * Copy a block of the IP datagram.
- */
- BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
- len));
- left -= len;
-
- fh->frag_off = htons(offset);
- if (left > 0)
- fh->frag_off |= htons(IP6_MF);
- ipv6_hdr(frag)->payload_len = htons(frag->len -
- sizeof(struct ipv6hdr));
-
- ptr += len;
- offset += len;
-
- /*
* Put this fragment into the sending queue.
*/
+ skb_set_delivery_time(frag, tstamp, tstamp_type);
err = output(net, sk, frag);
if (err)
goto fail;
@@ -853,9 +1049,6 @@ slow_path:
return err;
fail_toobig:
- if (skb->sk && dst_allfrag(skb_dst(skb)))
- sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
-
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
err = -EMSGSIZE;
@@ -889,7 +1082,7 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
return NULL;
}
- rt = (struct rt6_info *)dst;
+ rt = dst_rt6_info(dst);
/* Yes, checking route validity in not connected
* case is not very simple. Take into account,
* that we do not support routing by source, TOS,
@@ -907,12 +1100,13 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
* sockets.
* 2. oif also should be the same.
*/
- if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
+ if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr,
+ np->daddr_cache ? &sk->sk_v6_daddr : NULL) ||
#ifdef CONFIG_IPV6_SUBTREES
- ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
+ ip6_rt_check(&rt->rt6i_src, &fl6->saddr,
+ np->saddr_cache ? &np->saddr : NULL) ||
#endif
- (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
- (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
+ (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
dst_release(dst);
dst = NULL;
}
@@ -940,19 +1134,18 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
* ip6_route_output will fail given src=any saddr, though, so
* that's why we try it again later.
*/
- if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
+ if (ipv6_addr_any(&fl6->saddr)) {
struct fib6_info *from;
struct rt6_info *rt;
- bool had_dst = *dst != NULL;
- if (!had_dst)
- *dst = ip6_route_output(net, sk, fl6);
- rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
+ *dst = ip6_route_output(net, sk, fl6);
+ rt = (*dst)->error ? NULL : dst_rt6_info(*dst);
rcu_read_lock();
from = rt ? rcu_dereference(rt->from) : NULL;
err = ip6_route_get_saddr(net, from, &fl6->daddr,
- sk ? inet6_sk(sk)->srcprefs : 0,
+ sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
+ fl6->flowi6_l3mdev,
&fl6->saddr);
rcu_read_unlock();
@@ -963,7 +1156,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
* never existed and let the SA-enabled version take
* over.
*/
- if (!had_dst && (*dst)->error) {
+ if ((*dst)->error) {
dst_release(*dst);
*dst = NULL;
}
@@ -988,12 +1181,12 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
* dst entry and replace it instead with the
* dst entry of the nexthop router
*/
- rt = (struct rt6_info *) *dst;
- rcu_read_lock_bh();
+ rt = dst_rt6_info(*dst);
+ rcu_read_lock();
n = __ipv6_neigh_lookup_noref(rt->dst.dev,
rt6_nexthop(rt, &fl6->daddr));
- err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
- rcu_read_unlock_bh();
+ err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
+ rcu_read_unlock();
if (err) {
struct inet6_ifaddr *ifp;
@@ -1041,6 +1234,7 @@ out_err_release:
/**
* ip6_dst_lookup - perform route lookup on flow
+ * @net: Network namespace to perform lookup in
* @sk: socket which provides route info
* @dst: pointer to dst_entry * for result
* @fl6: flow to lookup
@@ -1059,6 +1253,7 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup);
/**
* ip6_dst_lookup_flow - perform route lookup on flow with ipsec
+ * @net: Network namespace to perform lookup in
* @sk: socket which provides route info
* @fl6: flow to lookup
* @final_dst: final destination address for ipsec lookup
@@ -1068,19 +1263,19 @@ EXPORT_SYMBOL_GPL(ip6_dst_lookup);
* It returns a valid dst pointer on success, or a pointer encoded
* error code.
*/
-struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
+struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
const struct in6_addr *final_dst)
{
struct dst_entry *dst = NULL;
int err;
- err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
+ err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
if (err)
return ERR_PTR(err);
if (final_dst)
fl6->daddr = *final_dst;
- return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
+ return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
@@ -1112,7 +1307,7 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
if (dst)
return dst;
- dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
if (connected && !IS_ERR(dst))
ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
@@ -1158,11 +1353,16 @@ static void ip6_append_data_mtu(unsigned int *mtu,
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
- struct rt6_info *rt, struct flowi6 *fl6)
+ struct rt6_info *rt)
{
struct ipv6_pinfo *np = inet6_sk(sk);
- unsigned int mtu;
- struct ipv6_txoptions *opt = ipc6->opt;
+ unsigned int mtu, frag_size;
+ struct ipv6_txoptions *nopt, *opt = ipc6->opt;
+
+ /* callers pass dst together with a reference, set it first so
+ * ip6_cork_release() can put it down even in case of an error.
+ */
+ cork->base.dst = &rt->dst;
/*
* setup for corking
@@ -1171,93 +1371,91 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
if (WARN_ON(v6_cork->opt))
return -EINVAL;
- v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
- if (unlikely(!v6_cork->opt))
+ nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
+ if (unlikely(!nopt))
return -ENOBUFS;
- v6_cork->opt->tot_len = sizeof(*opt);
- v6_cork->opt->opt_flen = opt->opt_flen;
- v6_cork->opt->opt_nflen = opt->opt_nflen;
+ nopt->tot_len = sizeof(*opt);
+ nopt->opt_flen = opt->opt_flen;
+ nopt->opt_nflen = opt->opt_nflen;
- v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
- sk->sk_allocation);
- if (opt->dst0opt && !v6_cork->opt->dst0opt)
+ nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
+ if (opt->dst0opt && !nopt->dst0opt)
return -ENOBUFS;
- v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
- sk->sk_allocation);
- if (opt->dst1opt && !v6_cork->opt->dst1opt)
+ nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
+ if (opt->dst1opt && !nopt->dst1opt)
return -ENOBUFS;
- v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
- sk->sk_allocation);
- if (opt->hopopt && !v6_cork->opt->hopopt)
+ nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
+ if (opt->hopopt && !nopt->hopopt)
return -ENOBUFS;
- v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
- sk->sk_allocation);
- if (opt->srcrt && !v6_cork->opt->srcrt)
+ nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
+ if (opt->srcrt && !nopt->srcrt)
return -ENOBUFS;
/* need source address above miyazawa*/
}
- dst_hold(&rt->dst);
- cork->base.dst = &rt->dst;
- cork->fl.u.ip6 = *fl6;
v6_cork->hop_limit = ipc6->hlimit;
v6_cork->tclass = ipc6->tclass;
+ v6_cork->dontfrag = ipc6->dontfrag;
if (rt->dst.flags & DST_XFRM_TUNNEL)
- mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
+ mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
else
- mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
+ mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
- if (np->frag_size < mtu) {
- if (np->frag_size)
- mtu = np->frag_size;
- }
- if (mtu < IPV6_MIN_MTU)
- return -EINVAL;
+
+ frag_size = READ_ONCE(np->frag_size);
+ if (frag_size && frag_size < mtu)
+ mtu = frag_size;
+
cork->base.fragsize = mtu;
cork->base.gso_size = ipc6->gso_size;
cork->base.tx_flags = 0;
- sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
-
- if (dst_allfrag(xfrm_dst_path(&rt->dst)))
- cork->base.flags |= IPCORK_ALLFRAG;
+ cork->base.mark = ipc6->sockc.mark;
+ cork->base.priority = ipc6->sockc.priority;
+ sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
+ if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
+ cork->base.flags |= IPCORK_TS_OPT_ID;
+ cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
+ }
cork->base.length = 0;
-
cork->base.transmit_time = ipc6->sockc.transmit_time;
return 0;
}
static int __ip6_append_data(struct sock *sk,
- struct flowi6 *fl6,
struct sk_buff_head *queue,
- struct inet_cork *cork,
+ struct inet_cork_full *cork_full,
struct inet6_cork *v6_cork,
struct page_frag *pfrag,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
- void *from, int length, int transhdrlen,
- unsigned int flags, struct ipcm6_cookie *ipc6)
+ void *from, size_t length, int transhdrlen,
+ unsigned int flags)
{
struct sk_buff *skb, *skb_prev = NULL;
+ struct inet_cork *cork = &cork_full->base;
+ struct flowi6 *fl6 = &cork_full->fl.u.ip6;
unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
+ struct ubuf_info *uarg = NULL;
int exthdrlen = 0;
int dst_exthdrlen = 0;
int hh_len;
int copy;
int err;
int offset = 0;
+ bool zc = false;
u32 tskey = 0;
- struct rt6_info *rt = (struct rt6_info *)cork->dst;
+ struct rt6_info *rt = dst_rt6_info(cork->dst);
+ bool paged, hold_tskey = false, extra_uref = false;
struct ipv6_txoptions *opt = v6_cork->opt;
int csummode = CHECKSUM_NONE;
unsigned int maxnonfragsize, headersize;
unsigned int wmem_alloc_delta = 0;
- bool paged;
skb = skb_peek_tail(queue);
if (!skb) {
@@ -1269,31 +1467,31 @@ static int __ip6_append_data(struct sock *sk,
mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
orig_mtu = mtu;
- if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
- sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
- tskey = sk->sk_tskey++;
-
hh_len = LL_RESERVED_SPACE(rt->dst.dev);
fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
(opt ? opt->opt_nflen : 0);
- maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
- sizeof(struct frag_hdr);
headersize = sizeof(struct ipv6hdr) +
(opt ? opt->opt_flen + opt->opt_nflen : 0) +
- (dst_allfrag(&rt->dst) ?
- sizeof(struct frag_hdr) : 0) +
rt->rt6i_nfheader_len;
+ if (mtu <= fragheaderlen ||
+ ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
+ goto emsgsize;
+
+ maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
+ sizeof(struct frag_hdr);
+
/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
* the first fragment
*/
if (headersize + transhdrlen > mtu)
goto emsgsize;
- if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
+ if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
(sk->sk_protocol == IPPROTO_UDP ||
+ sk->sk_protocol == IPPROTO_ICMPV6 ||
sk->sk_protocol == IPPROTO_RAW)) {
ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
sizeof(struct ipv6hdr));
@@ -1322,13 +1520,65 @@ emsgsize:
rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL;
+ if ((flags & MSG_ZEROCOPY) && length) {
+ struct msghdr *msg = from;
+
+ if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
+ if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
+ return -EINVAL;
+
+ /* Leave uarg NULL if can't zerocopy, callers should
+ * be able to handle it.
+ */
+ if ((rt->dst.dev->features & NETIF_F_SG) &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ zc = true;
+ uarg = msg->msg_ubuf;
+ }
+ } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
+ uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
+ false);
+ if (!uarg)
+ return -ENOBUFS;
+ extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
+ if (rt->dst.dev->features & NETIF_F_SG &&
+ csummode == CHECKSUM_PARTIAL) {
+ paged = true;
+ zc = true;
+ } else {
+ uarg_to_msgzc(uarg)->zerocopy = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
+ }
+ }
+ } else if ((flags & MSG_SPLICE_PAGES) && length) {
+ if (inet_test_bit(HDRINCL, sk))
+ return -EPERM;
+ if (rt->dst.dev->features & NETIF_F_SG &&
+ getfrag == ip_generic_getfrag)
+ /* We need an empty buffer to attach stuff to */
+ paged = true;
+ else
+ flags &= ~MSG_SPLICE_PAGES;
+ }
+
+ if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
+ READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
+ if (cork->flags & IPCORK_TS_OPT_ID) {
+ tskey = cork->ts_opt_id;
+ } else {
+ tskey = atomic_inc_return(&sk->sk_tskey) - 1;
+ hold_tskey = true;
+ }
+ }
+
/*
* Let's try using as much space as possible.
* Use MTU if total length of the message fits into the MTU.
* Otherwise, we need to reserve fragment header and
* fragment alignment (= 8-15 octects, in total).
*
- * Note that we may need to "move" the data from the tail of
+ * Note that we may need to "move" the data from the tail
* of the buffer to the new fragment when we split
* the message.
*
@@ -1344,7 +1594,7 @@ emsgsize:
while (length > 0) {
/* Check if the remaining data fits into current packet. */
- copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
+ copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
if (copy < length)
copy = maxfraglen - skb->len;
@@ -1353,8 +1603,8 @@ emsgsize:
unsigned int datalen;
unsigned int fraglen;
unsigned int fraggap;
- unsigned int alloclen;
- unsigned int pagedlen = 0;
+ unsigned int alloclen, alloc_extra;
+ unsigned int pagedlen;
alloc_new_skb:
/* There's no room in the current skb */
if (skb)
@@ -1375,21 +1625,33 @@ alloc_new_skb:
*/
datalen = length + fraggap;
- if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
+ if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
fraglen = datalen + fragheaderlen;
+ pagedlen = 0;
+
+ alloc_extra = hh_len;
+ alloc_extra += dst_exthdrlen;
+ alloc_extra += rt->dst.trailer_len;
+
+ /* We just reserve space for fragment header.
+ * Note: this may be overallocation if the message
+ * (without MSG_MORE) fits into the MTU.
+ */
+ alloc_extra += sizeof(struct frag_hdr);
if ((flags & MSG_MORE) &&
!(rt->dst.dev->features&NETIF_F_SG))
alloclen = mtu;
- else if (!paged)
+ else if (!paged &&
+ (fraglen + alloc_extra < SKB_MAX_ALLOC ||
+ !(rt->dst.dev->features & NETIF_F_SG)))
alloclen = fraglen;
else {
- alloclen = min_t(int, fraglen, MAX_HEADER);
- pagedlen = fraglen - alloclen;
+ alloclen = fragheaderlen + transhdrlen;
+ pagedlen = datalen - transhdrlen;
}
-
- alloclen += dst_exthdrlen;
+ alloclen += alloc_extra;
if (datalen != length + fraggap) {
/*
@@ -1399,30 +1661,24 @@ alloc_new_skb:
datalen += rt->dst.trailer_len;
}
- alloclen += rt->dst.trailer_len;
fraglen = datalen + fragheaderlen;
- /*
- * We just reserve space for fragment header.
- * Note: this may be overallocation if the message
- * (without MSG_MORE) fits into the MTU.
- */
- alloclen += sizeof(struct frag_hdr);
-
copy = datalen - transhdrlen - fraggap - pagedlen;
- if (copy < 0) {
+ /* [!] NOTE: copy may be negative if pagedlen>0
+ * because then the equation may reduce to -fraggap.
+ */
+ if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
err = -EINVAL;
goto error;
}
if (transhdrlen) {
- skb = sock_alloc_send_skb(sk,
- alloclen + hh_len,
+ skb = sock_alloc_send_skb(sk, alloclen,
(flags & MSG_DONTWAIT), &err);
} else {
skb = NULL;
if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
2 * sk->sk_sndbuf)
- skb = alloc_skb(alloclen + hh_len,
+ skb = alloc_skb(alloclen,
sk->sk_allocation);
if (unlikely(!skb))
err = -ENOBUFS;
@@ -1439,12 +1695,6 @@ alloc_new_skb:
skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
dst_exthdrlen);
- /* Only the initial fragment is time stamped */
- skb_shinfo(skb)->tx_flags = cork->tx_flags;
- cork->tx_flags = 0;
- skb_shinfo(skb)->tskey = tskey;
- tskey = 0;
-
/*
* Find where to start putting bytes
*/
@@ -1456,18 +1706,21 @@ alloc_new_skb:
if (fraggap) {
skb->csum = skb_copy_and_csum_bits(
skb_prev, maxfraglen,
- data + transhdrlen, fraggap, 0);
+ data + transhdrlen, fraggap);
skb_prev->csum = csum_sub(skb_prev->csum,
skb->csum);
data += fraggap;
pskb_trim_unique(skb_prev, maxfraglen);
}
if (copy > 0 &&
- getfrag(from, data + transhdrlen, offset,
- copy, fraggap, skb) < 0) {
+ INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from, data + transhdrlen, offset,
+ copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
+ } else if (flags & MSG_SPLICE_PAGES) {
+ copy = 0;
}
offset += copy;
@@ -1476,6 +1729,13 @@ alloc_new_skb:
exthdrlen = 0;
dst_exthdrlen = 0;
+ /* Only the initial fragment is time stamped */
+ skb_shinfo(skb)->tx_flags = cork->tx_flags;
+ cork->tx_flags = 0;
+ skb_shinfo(skb)->tskey = tskey;
+ tskey = 0;
+ skb_zcopy_set(skb, uarg, &extra_uref);
+
if ((flags & MSG_CONFIRM) && !skb_prev)
skb_set_dst_pending_confirm(skb, 1);
@@ -1499,19 +1759,33 @@ alloc_new_skb:
unsigned int off;
off = skb->len;
- if (getfrag(from, skb_put(skb, copy),
- offset, copy, off, skb) < 0) {
+ if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from, skb_put(skb, copy),
+ offset, copy, off, skb) < 0) {
__skb_trim(skb, off);
err = -EFAULT;
goto error;
}
- } else {
+ } else if (flags & MSG_SPLICE_PAGES) {
+ struct msghdr *msg = from;
+
+ err = -EIO;
+ if (WARN_ON_ONCE(copy > msg->msg_iter.count))
+ goto error;
+
+ err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
+ if (err < 0)
+ goto error;
+ copy = err;
+ wmem_alloc_delta += copy;
+ } else if (!zc) {
int i = skb_shinfo(skb)->nr_frags;
err = -ENOMEM;
if (!sk_page_frag_refill(sk, pfrag))
goto error;
+ skb_zcopy_downgrade_managed(skb);
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
err = -EMSGSIZE;
@@ -1524,7 +1798,8 @@ alloc_new_skb:
get_page(pfrag->page);
}
copy = min_t(int, copy, pfrag->size - pfrag->offset);
- if (getfrag(from,
+ if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from,
page_address(pfrag->page) + pfrag->offset,
offset, copy, skb->len, skb) < 0)
goto error_efault;
@@ -1535,6 +1810,10 @@ alloc_new_skb:
skb->data_len += copy;
skb->truesize += copy;
wmem_alloc_delta += copy;
+ } else {
+ err = skb_zerocopy_iter_dgram(skb, from, copy);
+ if (err < 0)
+ goto error;
}
offset += copy;
length -= copy;
@@ -1547,16 +1826,19 @@ alloc_new_skb:
error_efault:
err = -EFAULT;
error:
+ net_zcopy_put_abort(uarg, extra_uref);
cork->length -= length;
IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
+ if (hold_tskey)
+ atomic_dec(&sk->sk_tskey);
return err;
}
int ip6_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
- void *from, int length, int transhdrlen,
+ void *from, size_t length, int transhdrlen,
struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
struct rt6_info *rt, unsigned int flags)
{
@@ -1571,43 +1853,52 @@ int ip6_append_data(struct sock *sk,
/*
* setup for corking
*/
+ dst_hold(&rt->dst);
err = ip6_setup_cork(sk, &inet->cork, &np->cork,
- ipc6, rt, fl6);
+ ipc6, rt);
if (err)
return err;
+ inet->cork.fl.u.ip6 = *fl6;
exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
length += exthdrlen;
transhdrlen += exthdrlen;
} else {
- fl6 = &inet->cork.fl.u.ip6;
transhdrlen = 0;
}
- return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
+ return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
&np->cork, sk_page_frag(sk), getfrag,
- from, length, transhdrlen, flags, ipc6);
+ from, length, transhdrlen, flags);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
+static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
+{
+ struct dst_entry *dst = cork->base.dst;
+
+ cork->base.dst = NULL;
+ skb_dst_set(skb, dst);
+}
+
static void ip6_cork_release(struct inet_cork_full *cork,
struct inet6_cork *v6_cork)
{
if (v6_cork->opt) {
- kfree(v6_cork->opt->dst0opt);
- kfree(v6_cork->opt->dst1opt);
- kfree(v6_cork->opt->hopopt);
- kfree(v6_cork->opt->srcrt);
- kfree(v6_cork->opt);
+ struct ipv6_txoptions *opt = v6_cork->opt;
+
+ kfree(opt->dst0opt);
+ kfree(opt->dst1opt);
+ kfree(opt->hopopt);
+ kfree(opt->srcrt);
+ kfree(opt);
v6_cork->opt = NULL;
}
if (cork->base.dst) {
dst_release(cork->base.dst);
cork->base.dst = NULL;
- cork->base.flags &= ~IPCORK_ALLFRAG;
}
- memset(&cork->fl, 0, sizeof(cork->fl));
}
struct sk_buff *__ip6_make_skb(struct sock *sk,
@@ -1617,12 +1908,11 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
{
struct sk_buff *skb, *tmp_skb;
struct sk_buff **tail_skb;
- struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
- struct ipv6_pinfo *np = inet6_sk(sk);
+ struct in6_addr *final_dst;
struct net *net = sock_net(sk);
struct ipv6hdr *hdr;
struct ipv6_txoptions *opt = v6_cork->opt;
- struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
+ struct rt6_info *rt = dst_rt6_info(cork->base.dst);
struct flowi6 *fl6 = &cork->fl.u.ip6;
unsigned char proto = fl6->flowi6_proto;
@@ -1647,9 +1937,9 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
/* Allow local fragmentation. */
skb->ignore_df = ip6_sk_ignore_df(sk);
-
- *final_dst = fl6->daddr;
__skb_pull(skb, skb_network_header_len(skb));
+
+ final_dst = &fl6->daddr;
if (opt && opt->opt_flen)
ipv6_push_frag_opts(skb, opt, &proto);
if (opt && opt->opt_nflen)
@@ -1661,23 +1951,31 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
ip6_flow_hdr(hdr, v6_cork->tclass,
ip6_make_flowlabel(net, skb, fl6->flowlabel,
- ip6_autoflowlabel(net, np), fl6));
+ ip6_autoflowlabel(net, sk), fl6));
hdr->hop_limit = v6_cork->hop_limit;
hdr->nexthdr = proto;
hdr->saddr = fl6->saddr;
hdr->daddr = *final_dst;
- skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
-
- skb->tstamp = cork->base.transmit_time;
+ skb->priority = cork->base.priority;
+ skb->mark = cork->base.mark;
+ if (sk_is_tcp(sk))
+ skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
+ else
+ skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);
- skb_dst_set(skb, dst_clone(&rt->dst));
- IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
+ ip6_cork_steal_dst(skb, cork);
+ IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
if (proto == IPPROTO_ICMPV6) {
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
+ u8 icmp6_type;
- ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
+ if (sk->sk_socket->type == SOCK_RAW &&
+ !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
+ icmp6_type = fl6->fl6_icmp_type;
+ else
+ icmp6_type = icmp6_hdr(skb)->icmp6_type;
+ ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
}
@@ -1689,9 +1987,10 @@ out:
int ip6_send_skb(struct sk_buff *skb)
{
struct net *net = sock_net(skb->sk);
- struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+ struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
int err;
+ rcu_read_lock();
err = ip6_local_out(net, skb->sk, skb);
if (err) {
if (err > 0)
@@ -1701,6 +2000,7 @@ int ip6_send_skb(struct sk_buff *skb)
IPSTATS_MIB_OUTDISCARDS);
}
+ rcu_read_unlock();
return err;
}
@@ -1743,38 +2043,36 @@ EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
struct sk_buff *ip6_make_skb(struct sock *sk,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
- void *from, int length, int transhdrlen,
- struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
- struct rt6_info *rt, unsigned int flags,
- struct inet_cork_full *cork)
+ void *from, size_t length, int transhdrlen,
+ struct ipcm6_cookie *ipc6, struct rt6_info *rt,
+ unsigned int flags, struct inet_cork_full *cork)
{
struct inet6_cork v6_cork;
struct sk_buff_head queue;
int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
int err;
- if (flags & MSG_PROBE)
+ if (flags & MSG_PROBE) {
+ dst_release(&rt->dst);
return NULL;
+ }
__skb_queue_head_init(&queue);
cork->base.flags = 0;
cork->base.addr = 0;
cork->base.opt = NULL;
- cork->base.dst = NULL;
v6_cork.opt = NULL;
- err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6);
+ err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
if (err) {
ip6_cork_release(cork, &v6_cork);
return ERR_PTR(err);
}
- if (ipc6->dontfrag < 0)
- ipc6->dontfrag = inet6_sk(sk)->dontfrag;
- err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork,
+ err = __ip6_append_data(sk, &queue, cork, &v6_cork,
&current->task_frag, getfrag, from,
length + exthdrlen, transhdrlen + exthdrlen,
- flags, ipc6);
+ flags);
if (err) {
__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
return ERR_PTR(err);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index a0b6932c3afd..6405072050e0 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 tunneling device
* Linux INET6 implementation
@@ -10,12 +11,6 @@
* linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c
*
* RFC 2473
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -57,7 +52,9 @@
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/netdev_lock.h>
#include <net/dst_metadata.h>
+#include <net/inet_dscp.h>
MODULE_AUTHOR("Ville Nuorvala");
MODULE_DESCRIPTION("IPv6 tunneling device");
@@ -94,38 +91,18 @@ struct ip6_tnl_net {
struct ip6_tnl __rcu *collect_md_tun;
};
-static struct net_device_stats *ip6_get_stats(struct net_device *dev)
+static inline int ip6_tnl_mpls_supported(void)
{
- struct pcpu_sw_netstats tmp, sum = { 0 };
- int i;
-
- for_each_possible_cpu(i) {
- unsigned int start;
- const struct pcpu_sw_netstats *tstats =
- per_cpu_ptr(dev->tstats, i);
-
- do {
- start = u64_stats_fetch_begin_irq(&tstats->syncp);
- tmp.rx_packets = tstats->rx_packets;
- tmp.rx_bytes = tstats->rx_bytes;
- tmp.tx_packets = tstats->tx_packets;
- tmp.tx_bytes = tstats->tx_bytes;
- } while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
-
- sum.rx_packets += tmp.rx_packets;
- sum.rx_bytes += tmp.rx_bytes;
- sum.tx_packets += tmp.tx_packets;
- sum.tx_bytes += tmp.tx_bytes;
- }
- dev->stats.rx_packets = sum.rx_packets;
- dev->stats.rx_bytes = sum.rx_bytes;
- dev->stats.tx_packets = sum.tx_packets;
- dev->stats.tx_bytes = sum.tx_bytes;
- return &dev->stats;
+ return IS_ENABLED(CONFIG_MPLS);
}
+#define for_each_ip6_tunnel_rcu(start) \
+ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
+
/**
* ip6_tnl_lookup - fetch tunnel matching the end-point addresses
+ * @net: network namespace
+ * @link: ifindex of underlying interface
* @remote: the address of the tunnel exit-point
* @local: the address of the tunnel entry-point
*
@@ -135,41 +112,57 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev)
* else %NULL
**/
-#define for_each_ip6_tunnel_rcu(start) \
- for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
-
static struct ip6_tnl *
-ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local)
+ip6_tnl_lookup(struct net *net, int link,
+ const struct in6_addr *remote, const struct in6_addr *local)
{
unsigned int hash = HASH(remote, local);
- struct ip6_tnl *t;
+ struct ip6_tnl *t, *cand = NULL;
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
struct in6_addr any;
for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
- if (ipv6_addr_equal(local, &t->parms.laddr) &&
- ipv6_addr_equal(remote, &t->parms.raddr) &&
- (t->dev->flags & IFF_UP))
+ if (!ipv6_addr_equal(local, &t->parms.laddr) ||
+ !ipv6_addr_equal(remote, &t->parms.raddr) ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (link == t->parms.link)
return t;
+ else
+ cand = t;
}
memset(&any, 0, sizeof(any));
hash = HASH(&any, local);
for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
- if (ipv6_addr_equal(local, &t->parms.laddr) &&
- ipv6_addr_any(&t->parms.raddr) &&
- (t->dev->flags & IFF_UP))
+ if (!ipv6_addr_equal(local, &t->parms.laddr) ||
+ !ipv6_addr_any(&t->parms.raddr) ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (link == t->parms.link)
return t;
+ else if (!cand)
+ cand = t;
}
hash = HASH(remote, &any);
for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) {
- if (ipv6_addr_equal(remote, &t->parms.raddr) &&
- ipv6_addr_any(&t->parms.laddr) &&
- (t->dev->flags & IFF_UP))
+ if (!ipv6_addr_equal(remote, &t->parms.raddr) ||
+ !ipv6_addr_any(&t->parms.laddr) ||
+ !(t->dev->flags & IFF_UP))
+ continue;
+
+ if (link == t->parms.link)
return t;
+ else if (!cand)
+ cand = t;
}
+ if (cand)
+ return cand;
+
t = rcu_dereference(ip6n->collect_md_tun);
if (t && t->dev->flags & IFF_UP)
return t;
@@ -183,6 +176,7 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_
/**
* ip6_tnl_bucket - get head of list matching given tunnel parameters
+ * @ip6n: the private data for ip6_tnl in the netns
* @p: parameters containing tunnel end-points
*
* Description:
@@ -209,6 +203,7 @@ ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p)
/**
* ip6_tnl_link - add tunnel to hash table
+ * @ip6n: the private data for ip6_tnl in the netns
* @t: tunnel to be added
**/
@@ -225,6 +220,7 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
/**
* ip6_tnl_unlink - remove tunnel from hash table
+ * @ip6n: the private data for ip6_tnl in the netns
* @t: tunnel to be removed
**/
@@ -253,18 +249,14 @@ static void ip6_dev_free(struct net_device *dev)
gro_cells_destroy(&t->gro_cells);
dst_cache_destroy(&t->dst_cache);
- free_percpu(dev->tstats);
}
static int ip6_tnl_create2(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- struct net *net = dev_net(dev);
- struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ struct ip6_tnl_net *ip6n = net_generic(t->net, ip6_tnl_net_id);
int err;
- t = netdev_priv(dev);
-
dev->rtnl_link_ops = &ip6_link_ops;
err = register_netdevice(dev);
if (err < 0)
@@ -272,7 +264,6 @@ static int ip6_tnl_create2(struct net_device *dev)
strcpy(t->parms.name, dev->name);
- dev_hold(dev);
ip6_tnl_link(ip6n, t);
return 0;
@@ -282,8 +273,8 @@ out:
/**
* ip6_tnl_create - create a new tunnel
+ * @net: network namespace
* @p: tunnel parameters
- * @pt: pointer to new tunnel
*
* Description:
* Create tunnel matching given parameters.
@@ -302,7 +293,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p)
if (p->name[0]) {
if (!dev_valid_name(p->name))
goto failed;
- strlcpy(name, p->name, IFNAMSIZ);
+ strscpy(name, p->name, IFNAMSIZ);
} else {
sprintf(name, "ip6tnl%%d");
}
@@ -331,6 +322,7 @@ failed:
/**
* ip6_tnl_locate - find or create tunnel matching given parameters
+ * @net: network namespace
* @p: tunnel parameters
* @create: != 0 if allowed to create new tunnel if no match found
*
@@ -356,7 +348,8 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net,
(t = rtnl_dereference(*tp)) != NULL;
tp = &t->next) {
if (ipv6_addr_equal(local, &t->parms.laddr) &&
- ipv6_addr_equal(remote, &t->parms.raddr)) {
+ ipv6_addr_equal(remote, &t->parms.raddr) &&
+ p->link == t->parms.link) {
if (create)
return ERR_PTR(-EEXIST);
@@ -388,12 +381,13 @@ ip6_tnl_dev_uninit(struct net_device *dev)
else
ip6_tnl_unlink(ip6n, t);
dst_cache_reset(&t->dst_cache);
- dev_put(dev);
+ netdev_put(dev, &t->dev_tracker);
}
/**
- * parse_tvl_tnl_enc_lim - handle encapsulation limit option
+ * ip6_tnl_parse_tlv_enc_lim - handle encapsulation limit option
* @skb: received socket buffer
+ * @raw: the ICMPv6 error message data
*
* Return:
* 0 if none was found,
@@ -405,7 +399,7 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw;
unsigned int nhoff = raw - skb->data;
unsigned int off = nhoff + sizeof(*ipv6h);
- u8 next, nexthdr = ipv6h->nexthdr;
+ u8 nexthdr = ipv6h->nexthdr;
while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) {
struct ipv6_opt_hdr *hdr;
@@ -416,25 +410,25 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
hdr = (struct ipv6_opt_hdr *)(skb->data + off);
if (nexthdr == NEXTHDR_FRAGMENT) {
- struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr;
- if (frag_hdr->frag_off)
- break;
optlen = 8;
} else if (nexthdr == NEXTHDR_AUTH) {
- optlen = (hdr->hdrlen + 2) << 2;
+ optlen = ipv6_authlen(hdr);
} else {
optlen = ipv6_optlen(hdr);
}
- /* cache hdr->nexthdr, since pskb_may_pull() might
- * invalidate hdr
- */
- next = hdr->nexthdr;
- if (nexthdr == NEXTHDR_DEST) {
- u16 i = 2;
- /* Remember : hdr is no longer valid at this point. */
- if (!pskb_may_pull(skb, off + optlen))
+ if (!pskb_may_pull(skb, off + optlen))
+ break;
+
+ hdr = (struct ipv6_opt_hdr *)(skb->data + off);
+ if (nexthdr == NEXTHDR_FRAGMENT) {
+ struct frag_hdr *frag_hdr = (struct frag_hdr *)hdr;
+
+ if (frag_hdr->frag_off)
break;
+ }
+ if (nexthdr == NEXTHDR_DEST) {
+ u16 i = 2;
while (1) {
struct ipv6_tlv_tnl_enc_lim *tel;
@@ -455,21 +449,16 @@ __u16 ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw)
i++;
}
}
- nexthdr = next;
+ nexthdr = hdr->nexthdr;
off += optlen;
}
return 0;
}
EXPORT_SYMBOL(ip6_tnl_parse_tlv_enc_lim);
-/**
- * ip6_tnl_err - tunnel error handler
- *
- * Description:
- * ip6_tnl_err() should handle errors in the tunnel according
- * to the specifications in RFC 2473.
- **/
-
+/* ip6_tnl_err() should handle errors in the tunnel according to the
+ * specifications in RFC 2473.
+ */
static int
ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
u8 *type, u8 *code, int *msg, __u32 *info, int offset)
@@ -490,7 +479,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
processing of the error. */
rcu_read_lock();
- t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->daddr, &ipv6h->saddr);
+ t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->daddr, &ipv6h->saddr);
if (!t)
goto out;
@@ -501,8 +490,6 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
err = 0;
switch (*type) {
- struct ipv6_tlv_tnl_enc_lim *tel;
- __u32 mtu, teli;
case ICMPV6_DEST_UNREACH:
net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n",
t->parms.name);
@@ -515,7 +502,10 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
rel_msg = 1;
}
break;
- case ICMPV6_PARAMPROB:
+ case ICMPV6_PARAMPROB: {
+ struct ipv6_tlv_tnl_enc_lim *tel;
+ __u32 teli;
+
teli = 0;
if ((*code) == ICMPV6_HDR_FIELD)
teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data);
@@ -532,7 +522,10 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
t->parms.name);
}
break;
- case ICMPV6_PKT_TOOBIG:
+ }
+ case ICMPV6_PKT_TOOBIG: {
+ __u32 mtu;
+
ip6_update_pmtu(skb, net, htonl(*info), 0, 0,
sock_net_uid(net, NULL));
mtu = *info - offset;
@@ -546,6 +539,7 @@ ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
rel_msg = 1;
}
break;
+ }
case NDISC_REDIRECT:
ip6_redirect(skb, net, skb->dev->ifindex, 0,
sock_net_uid(net, NULL));
@@ -615,7 +609,8 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
/* Try to guess incoming interface */
rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL, eiph->saddr,
- 0, 0, 0, IPPROTO_IPIP, RT_TOS(eiph->tos), 0);
+ 0, 0, 0, IPPROTO_IPIP,
+ eiph->tos & INET_DSCP_MASK, 0);
if (IS_ERR(rt))
goto out;
@@ -626,17 +621,18 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (rt->rt_flags & RTCF_LOCAL) {
rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL,
eiph->daddr, eiph->saddr, 0, 0,
- IPPROTO_IPIP, RT_TOS(eiph->tos), 0);
- if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL) {
+ IPPROTO_IPIP,
+ eiph->tos & INET_DSCP_MASK, 0);
+ if (IS_ERR(rt) || rt->dst.dev->type != ARPHRD_TUNNEL6) {
if (!IS_ERR(rt))
ip_rt_put(rt);
goto out;
}
skb_dst_set(skb2, &rt->dst);
} else {
- if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos,
- skb2->dev) ||
- skb_dst(skb2)->dev->type != ARPHRD_TUNNEL)
+ if (ip_route_input(skb2, eiph->daddr, eiph->saddr,
+ ip4h_dscp(eiph), skb2->dev) ||
+ skb_dst_dev(skb2)->type != ARPHRD_TUNNEL6)
goto out;
}
@@ -645,7 +641,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (rel_info > dst_mtu(skb_dst(skb2)))
goto out;
- skb_dst_update_pmtu(skb2, rel_info);
+ skb_dst_update_pmtu_no_confirm(skb2, rel_info);
}
icmp_send(skb2, rel_type, rel_code, htonl(rel_info));
@@ -697,6 +693,20 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
}
+static int
+mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ __u32 rel_info = ntohl(info);
+ int err, rel_msg = 0;
+ u8 rel_type = type;
+ u8 rel_code = code;
+
+ err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code,
+ &rel_msg, &rel_info, offset);
+ return err;
+}
+
static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
const struct ipv6hdr *ipv6h,
struct sk_buff *skb)
@@ -719,6 +729,14 @@ static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
return IP6_ECN_decapsulate(ipv6h, skb);
}
+static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
+ const struct ipv6hdr *ipv6h,
+ struct sk_buff *skb)
+{
+ /* ECN is not supported in AF_MPLS */
+ return 0;
+}
+
__u32 ip6_tnl_get_cap(struct ip6_tnl *t,
const struct in6_addr *laddr,
const struct in6_addr *raddr)
@@ -780,25 +798,22 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
struct sk_buff *skb),
bool log_ecn_err)
{
- struct pcpu_sw_netstats *tstats;
- const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
- int err;
+ const struct ipv6hdr *ipv6h;
+ int nh, err;
- if ((!(tpi->flags & TUNNEL_CSUM) &&
- (tunnel->parms.i_flags & TUNNEL_CSUM)) ||
- ((tpi->flags & TUNNEL_CSUM) &&
- !(tunnel->parms.i_flags & TUNNEL_CSUM))) {
- tunnel->dev->stats.rx_crc_errors++;
- tunnel->dev->stats.rx_errors++;
+ if (test_bit(IP_TUNNEL_CSUM_BIT, tunnel->parms.i_flags) !=
+ test_bit(IP_TUNNEL_CSUM_BIT, tpi->flags)) {
+ DEV_STATS_INC(tunnel->dev, rx_crc_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto drop;
}
- if (tunnel->parms.i_flags & TUNNEL_SEQ) {
- if (!(tpi->flags & TUNNEL_SEQ) ||
+ if (test_bit(IP_TUNNEL_SEQ_BIT, tunnel->parms.i_flags)) {
+ if (!test_bit(IP_TUNNEL_SEQ_BIT, tpi->flags) ||
(tunnel->i_seqno &&
(s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
- tunnel->dev->stats.rx_fifo_errors++;
- tunnel->dev->stats.rx_errors++;
+ DEV_STATS_INC(tunnel->dev, rx_fifo_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto drop;
}
tunnel->i_seqno = ntohl(tpi->seq) + 1;
@@ -809,19 +824,35 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
/* Warning: All skb pointers will be invalidated! */
if (tunnel->dev->type == ARPHRD_ETHER) {
if (!pskb_may_pull(skb, ETH_HLEN)) {
- tunnel->dev->stats.rx_length_errors++;
- tunnel->dev->stats.rx_errors++;
+ DEV_STATS_INC(tunnel->dev, rx_length_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto drop;
}
- ipv6h = ipv6_hdr(skb);
skb->protocol = eth_type_trans(skb, tunnel->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
} else {
skb->dev = tunnel->dev;
+ skb_reset_mac_header(skb);
}
+ /* Save offset of outer header relative to skb->head,
+ * because we are going to reset the network header to the inner header
+ * and might change skb->head.
+ */
+ nh = skb_network_header(skb) - skb->head;
+
skb_reset_network_header(skb);
+
+ if (!pskb_inet_may_pull(skb)) {
+ DEV_STATS_INC(tunnel->dev, rx_length_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
+ goto drop;
+ }
+
+ /* Get the outer header. */
+ ipv6h = (struct ipv6hdr *)(skb->head + nh);
+
memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
__skb_tunnel_rx(skb, tunnel->dev, tunnel->net);
@@ -833,17 +864,13 @@ static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb,
&ipv6h->saddr,
ipv6_get_dsfield(ipv6h));
if (err > 1) {
- ++tunnel->dev->stats.rx_frame_errors;
- ++tunnel->dev->stats.rx_errors;
+ DEV_STATS_INC(tunnel->dev, rx_frame_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto drop;
}
}
- tstats = this_cpu_ptr(tunnel->dev->tstats);
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
+ dev_sw_netstats_rx_add(tunnel->dev, skb->len);
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));
@@ -865,7 +892,15 @@ int ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb,
struct metadata_dst *tun_dst,
bool log_ecn_err)
{
- return __ip6_tnl_rcv(t, skb, tpi, tun_dst, ip6ip6_dscp_ecn_decapsulate,
+ int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
+ const struct ipv6hdr *ipv6h,
+ struct sk_buff *skb);
+
+ dscp_ecn_decapsulate = ip6ip6_dscp_ecn_decapsulate;
+ if (tpi->proto == htons(ETH_P_IP))
+ dscp_ecn_decapsulate = ip4ip6_dscp_ecn_decapsulate;
+
+ return __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
log_ecn_err);
}
EXPORT_SYMBOL(ip6_tnl_rcv);
@@ -880,6 +915,11 @@ static const struct tnl_ptk_info tpi_v4 = {
.proto = htons(ETH_P_IP),
};
+static const struct tnl_ptk_info tpi_mpls = {
+ /* no tunnel info required for mplsip6. */
+ .proto = htons(ETH_P_MPLS_UC),
+};
+
static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
const struct tnl_ptk_info *tpi,
int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
@@ -892,7 +932,7 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
int ret = -1;
rcu_read_lock();
- t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr);
+ t = ip6_tnl_lookup(dev_net(skb->dev), skb->dev->ifindex, &ipv6h->saddr, &ipv6h->daddr);
if (t) {
u8 tproto = READ_ONCE(t->parms.proto);
@@ -901,12 +941,15 @@ static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto,
goto drop;
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
+ ipv6h = ipv6_hdr(skb);
if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr))
goto drop;
if (iptunnel_pull_header(skb, 0, tpi->proto, false))
goto drop;
if (t->parms.collect_md) {
- tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0);
+ IP_TUNNEL_DECLARE_FLAGS(flags) = { };
+
+ tun_dst = ipv6_tun_rx_dst(skb, flags, 0, 0);
if (!tun_dst)
goto drop;
}
@@ -936,6 +979,12 @@ static int ip6ip6_rcv(struct sk_buff *skb)
ip6ip6_dscp_ecn_decapsulate);
}
+static int mplsip6_rcv(struct sk_buff *skb)
+{
+ return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls,
+ mplsip6_dscp_ecn_decapsulate);
+}
+
struct ipv6_tel_txoption {
struct ipv6_txoptions ops;
__u8 dst_opt[8];
@@ -997,14 +1046,14 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t,
if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false,
0, IFA_F_TENTATIVE)))
- pr_warn("%s xmit: Local address not yet configured!\n",
- p->name);
+ pr_warn_ratelimited("%s xmit: Local address not yet configured!\n",
+ p->name);
else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) &&
!ipv6_addr_is_multicast(raddr) &&
unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev,
true, 0, IFA_F_TENTATIVE)))
- pr_warn("%s xmit: Routing loop! Remote address found on this node!\n",
- p->name);
+ pr_warn_ratelimited("%s xmit: Routing loop! Remote address found on this node!\n",
+ p->name);
else
ret = 1;
rcu_read_unlock();
@@ -1039,7 +1088,6 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
{
struct ip6_tnl *t = netdev_priv(dev);
struct net *net = t->net;
- struct net_device_stats *stats = &t->dev->stats;
struct ipv6hdr *ipv6h;
struct ipv6_tel_txoption opt;
struct dst_entry *dst = NULL, *ndst = NULL;
@@ -1048,10 +1096,13 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0;
unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen;
unsigned int max_headroom = psh_hlen;
+ __be16 payload_protocol;
bool use_cache = false;
u8 hop_limit;
int err = -1;
+ payload_protocol = skb_protocol(skb, true);
+
if (t->parms.collect_md) {
hop_limit = skb_tunnel_info(skb)->key.ttl;
goto route_lookup;
@@ -1061,7 +1112,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
/* NBMA tunnel */
if (ipv6_addr_any(&t->parms.raddr)) {
- if (skb->protocol == htons(ETH_P_IPV6)) {
+ if (payload_protocol == htons(ETH_P_IPV6)) {
struct in6_addr *addr6;
struct neighbour *neigh;
int addr_type;
@@ -1082,6 +1133,14 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield,
memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
neigh_release(neigh);
+ } else if (payload_protocol == htons(ETH_P_IP)) {
+ const struct rtable *rt = skb_rtable(skb);
+
+ if (!rt)
+ goto tx_err_link_failure;
+
+ if (rt->rt_gw_family == AF_INET6)
+ memcpy(&fl6->daddr, &rt->rt_gw6, sizeof(fl6->daddr));
}
} else if (t->parms.proto != 0 && !(t->parms.flags &
(IP6_TNL_F_USE_ORIG_TCLASS |
@@ -1120,10 +1179,10 @@ route_lookup:
ndst = dst;
}
- tdev = dst->dev;
+ tdev = dst_dev(dst);
if (tdev == dev) {
- stats->collisions++;
+ DEV_STATS_INC(dev, collisions);
net_warn_ratelimited("%s: Local routing loop detected!\n",
t->parms.name);
goto tx_err_dst_release;
@@ -1136,7 +1195,7 @@ route_lookup:
mtu = max(mtu, skb->protocol == htons(ETH_P_IPV6) ?
IPV6_MIN_MTU : IPV4_MIN_MTU);
- skb_dst_update_pmtu(skb, mtu);
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) {
*pmtu = mtu;
err = -EMSGSIZE;
@@ -1184,15 +1243,10 @@ route_lookup:
}
skb_dst_set(skb, dst);
- if (encap_limit >= 0) {
- init_tel_txopt(&opt, encap_limit);
- ipv6_push_frag_opts(skb, &opt.ops, &proto);
- }
-
if (hop_limit == 0) {
- if (skb->protocol == htons(ETH_P_IP))
+ if (payload_protocol == htons(ETH_P_IP))
hop_limit = ip_hdr(skb)->ttl;
- else if (skb->protocol == htons(ETH_P_IPV6))
+ else if (payload_protocol == htons(ETH_P_IPV6))
hop_limit = ipv6_hdr(skb)->hop_limit;
else
hop_limit = ip6_dst_hoplimit(dst);
@@ -1201,15 +1255,19 @@ route_lookup:
/* Calculate max headroom for all the headers and adjust
* needed_headroom if necessary.
*/
- max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr)
+ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr)
+ dst->header_len + t->hlen;
- if (max_headroom > dev->needed_headroom)
- dev->needed_headroom = max_headroom;
+ ip_tunnel_adj_headroom(dev, max_headroom);
err = ip6_tnl_encap(skb, t, &proto, fl6);
if (err)
return err;
+ if (encap_limit >= 0) {
+ init_tel_txopt(&opt, encap_limit);
+ ipv6_push_frag_opts(skb, &opt.ops, &proto);
+ }
+
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
ipv6h = ipv6_hdr(skb);
@@ -1219,10 +1277,10 @@ route_lookup:
ipv6h->nexthdr = proto;
ipv6h->saddr = fl6->saddr;
ipv6h->daddr = fl6->daddr;
- ip6tunnel_xmit(NULL, skb, dev);
+ ip6tunnel_xmit(NULL, skb, dev, 0);
return 0;
tx_err_link_failure:
- stats->tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
dst_link_failure(skb);
tx_err_dst_release:
dst_release(dst);
@@ -1231,26 +1289,22 @@ tx_err_dst_release:
EXPORT_SYMBOL(ip6_tnl_xmit);
static inline int
-ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
+ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev,
+ u8 protocol)
{
struct ip6_tnl *t = netdev_priv(dev);
+ struct ipv6hdr *ipv6h;
const struct iphdr *iph;
int encap_limit = -1;
+ __u16 offset;
struct flowi6 fl6;
- __u8 dsfield;
+ __u8 dsfield, orig_dsfield;
__u32 mtu;
u8 tproto;
int err;
- /* ensure we can access the full inner ip header */
- if (!pskb_may_pull(skb, sizeof(struct iphdr)))
- return -1;
-
- iph = ip_hdr(skb);
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-
tproto = READ_ONCE(t->parms.proto);
- if (tproto != IPPROTO_IPIP && tproto != 0)
+ if (tproto != protocol && tproto != 0)
return -1;
if (t->parms.collect_md) {
@@ -1263,134 +1317,102 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
return -1;
key = &tun_info->key;
memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_IPIP;
+ fl6.flowi6_proto = protocol;
fl6.saddr = key->u.ipv6.src;
fl6.daddr = key->u.ipv6.dst;
fl6.flowlabel = key->label;
dsfield = key->tos;
+ switch (protocol) {
+ case IPPROTO_IPIP:
+ iph = ip_hdr(skb);
+ orig_dsfield = ipv4_get_dsfield(iph);
+ break;
+ case IPPROTO_IPV6:
+ ipv6h = ipv6_hdr(skb);
+ orig_dsfield = ipv6_get_dsfield(ipv6h);
+ break;
+ default:
+ orig_dsfield = dsfield;
+ break;
+ }
} else {
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
+ if (protocol == IPPROTO_IPV6) {
+ offset = ip6_tnl_parse_tlv_enc_lim(skb,
+ skb_network_header(skb));
+ /* ip6_tnl_parse_tlv_enc_lim() might have
+ * reallocated skb->head
+ */
+ if (offset > 0) {
+ struct ipv6_tlv_tnl_enc_lim *tel;
- memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_IPIP;
-
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- dsfield = ipv4_get_dsfield(iph);
- else
- dsfield = ip6_tclass(t->parms.flowinfo);
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
- fl6.flowi6_mark = skb->mark;
- else
- fl6.flowi6_mark = t->parms.fwmark;
- }
-
- fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
-
- if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
- return -1;
-
- dsfield = INET_ECN_encapsulate(dsfield, ipv4_get_dsfield(iph));
-
- skb_set_inner_ipproto(skb, IPPROTO_IPIP);
-
- err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
- IPPROTO_IPIP);
- if (err != 0) {
- /* XXX: send ICMP error even if DF is not set. */
- if (err == -EMSGSIZE)
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
- return -1;
- }
-
- return 0;
-}
-
-static inline int
-ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
-{
- struct ip6_tnl *t = netdev_priv(dev);
- struct ipv6hdr *ipv6h;
- int encap_limit = -1;
- __u16 offset;
- struct flowi6 fl6;
- __u8 dsfield;
- __u32 mtu;
- u8 tproto;
- int err;
-
- if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
- return -1;
-
- ipv6h = ipv6_hdr(skb);
- tproto = READ_ONCE(t->parms.proto);
- if ((tproto != IPPROTO_IPV6 && tproto != 0) ||
- ip6_tnl_addr_conflict(t, ipv6h))
- return -1;
-
- if (t->parms.collect_md) {
- struct ip_tunnel_info *tun_info;
- const struct ip_tunnel_key *key;
-
- tun_info = skb_tunnel_info(skb);
- if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
- ip_tunnel_info_af(tun_info) != AF_INET6))
- return -1;
- key = &tun_info->key;
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_IPV6;
- fl6.saddr = key->u.ipv6.src;
- fl6.daddr = key->u.ipv6.dst;
- fl6.flowlabel = key->label;
- dsfield = key->tos;
- } else {
- offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
- /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
- ipv6h = ipv6_hdr(skb);
- if (offset > 0) {
- struct ipv6_tlv_tnl_enc_lim *tel;
-
- tel = (void *)&skb_network_header(skb)[offset];
- if (tel->encap_limit == 0) {
- icmpv6_send(skb, ICMPV6_PARAMPROB,
- ICMPV6_HDR_FIELD, offset + 2);
- return -1;
+ tel = (void *)&skb_network_header(skb)[offset];
+ if (tel->encap_limit == 0) {
+ icmpv6_ndo_send(skb, ICMPV6_PARAMPROB,
+ ICMPV6_HDR_FIELD, offset + 2);
+ return -1;
+ }
+ encap_limit = tel->encap_limit - 1;
}
- encap_limit = tel->encap_limit - 1;
- } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
- encap_limit = t->parms.encap_limit;
}
memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_IPV6;
+ fl6.flowi6_proto = protocol;
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
- dsfield = ipv6_get_dsfield(ipv6h);
- else
- dsfield = ip6_tclass(t->parms.flowinfo);
- if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
- fl6.flowlabel |= ip6_flowlabel(ipv6h);
if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
fl6.flowi6_mark = skb->mark;
else
fl6.flowi6_mark = t->parms.fwmark;
+ switch (protocol) {
+ case IPPROTO_IPIP:
+ iph = ip_hdr(skb);
+ orig_dsfield = ipv4_get_dsfield(iph);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ dsfield = orig_dsfield;
+ else
+ dsfield = ip6_tclass(t->parms.flowinfo);
+ break;
+ case IPPROTO_IPV6:
+ ipv6h = ipv6_hdr(skb);
+ orig_dsfield = ipv6_get_dsfield(ipv6h);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
+ dsfield = orig_dsfield;
+ else
+ dsfield = ip6_tclass(t->parms.flowinfo);
+ if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
+ fl6.flowlabel |= ip6_flowlabel(ipv6h);
+ break;
+ default:
+ orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo);
+ break;
+ }
}
fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL);
+ dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield);
if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6))
return -1;
- dsfield = INET_ECN_encapsulate(dsfield, ipv6_get_dsfield(ipv6h));
-
- skb_set_inner_ipproto(skb, IPPROTO_IPV6);
+ skb_set_inner_ipproto(skb, protocol);
err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
- IPPROTO_IPV6);
+ protocol);
if (err != 0) {
+ /* XXX: send ICMP error even if DF is not set. */
if (err == -EMSGSIZE)
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ switch (protocol) {
+ case IPPROTO_IPIP:
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH,
+ ICMP_FRAG_NEEDED, htonl(mtu));
+ break;
+ case IPPROTO_IPV6:
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ break;
+ default:
+ break;
+ }
return -1;
}
@@ -1401,28 +1423,37 @@ static netdev_tx_t
ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- struct net_device_stats *stats = &t->dev->stats;
+ u8 ipproto;
int ret;
+ if (!pskb_inet_may_pull(skb))
+ goto tx_err;
+
switch (skb->protocol) {
case htons(ETH_P_IP):
- ret = ip4ip6_tnl_xmit(skb, dev);
+ ipproto = IPPROTO_IPIP;
break;
case htons(ETH_P_IPV6):
- ret = ip6ip6_tnl_xmit(skb, dev);
+ if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb)))
+ goto tx_err;
+ ipproto = IPPROTO_IPV6;
+ break;
+ case htons(ETH_P_MPLS_UC):
+ ipproto = IPPROTO_MPLS;
break;
default:
goto tx_err;
}
+ ret = ipxip6_tnl_xmit(skb, dev, ipproto);
if (ret < 0)
goto tx_err;
return NETDEV_TX_OK;
tx_err:
- stats->tx_errors++;
- stats->tx_dropped++;
+ DEV_STATS_INC(dev, tx_errors);
+ DEV_STATS_INC(dev, tx_dropped);
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -1430,11 +1461,13 @@ tx_err:
static void ip6_tnl_link_config(struct ip6_tnl *t)
{
struct net_device *dev = t->dev;
+ struct net_device *tdev = NULL;
struct __ip6_tnl_parm *p = &t->parms;
struct flowi6 *fl6 = &t->fl.u.ip6;
int t_hlen;
+ int mtu;
- memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
+ __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr));
memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
/* Set up flowi template */
@@ -1467,22 +1500,27 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
struct rt6_info *rt = rt6_lookup(t->net,
&p->raddr, &p->laddr,
p->link, NULL, strict);
+ if (rt) {
+ tdev = rt->dst.dev;
+ ip6_rt_put(rt);
+ }
- if (!rt)
- return;
+ if (!tdev && p->link)
+ tdev = __dev_get_by_index(t->net, p->link);
- if (rt->dst.dev) {
- dev->hard_header_len = rt->dst.dev->hard_header_len +
- t_hlen;
+ if (tdev) {
+ dev->needed_headroom = tdev->hard_header_len +
+ tdev->needed_headroom + t_hlen;
+ mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU);
- dev->mtu = rt->dst.dev->mtu - t_hlen;
+ mtu = mtu - t_hlen;
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
- dev->mtu -= 8;
+ mtu -= 8;
- if (dev->mtu < IPV6_MIN_MTU)
- dev->mtu = IPV6_MIN_MTU;
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ WRITE_ONCE(dev->mtu, mtu);
}
- ip6_rt_put(rt);
}
}
@@ -1495,7 +1533,7 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
* ip6_tnl_change() updates the tunnel parameters
**/
-static int
+static void
ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
{
t->parms.laddr = p->laddr;
@@ -1509,26 +1547,33 @@ ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
t->parms.fwmark = p->fwmark;
dst_cache_reset(&t->dst_cache);
ip6_tnl_link_config(t);
- return 0;
}
-static int ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
+static void ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
{
struct net *net = t->net;
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
- int err;
ip6_tnl_unlink(ip6n, t);
synchronize_net();
- err = ip6_tnl_change(t, p);
+ ip6_tnl_change(t, p);
ip6_tnl_link(ip6n, t);
netdev_state_change(t->dev);
- return err;
}
-static int ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p)
+static int ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p,
+ bool strict)
{
- /* for default tnl0 device allow to change only the proto */
+ /* For the default ip6tnl0 device, allow changing only the protocol
+ * (the IP6_TNL_F_CAP_PER_PACKET flag is set on ip6tnl0, and all other
+ * parameters are 0).
+ */
+ if (strict &&
+ (!ipv6_addr_any(&p->laddr) || !ipv6_addr_any(&p->raddr) ||
+ p->flags != t->parms.flags || p->hop_limit || p->encap_limit ||
+ p->flowinfo || p->link || p->fwmark || p->collect_md))
+ return -EINVAL;
+
t->parms.proto = p->proto;
netdev_state_change(t->dev);
return 0;
@@ -1563,9 +1608,10 @@ ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p)
}
/**
- * ip6_tnl_ioctl - configure ipv6 tunnels from userspace
+ * ip6_tnl_siocdevprivate - configure ipv6 tunnels from userspace
* @dev: virtual device associated with tunnel
- * @ifr: parameters passed from userspace
+ * @ifr: unused
+ * @data: parameters passed from userspace
* @cmd: command to be performed
*
* Description:
@@ -1591,7 +1637,8 @@ ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p)
**/
static int
-ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+ void __user *data, int cmd)
{
int err = 0;
struct ip6_tnl_parm p;
@@ -1605,7 +1652,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
switch (cmd) {
case SIOCGETTUNNEL:
if (dev == ip6n->fb_tnl_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+ if (copy_from_user(&p, data, sizeof(p))) {
err = -EFAULT;
break;
}
@@ -1617,9 +1664,8 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
memset(&p, 0, sizeof(p));
}
ip6_tnl_parm_to_user(&p, &t->parms);
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) {
+ if (copy_to_user(data, &p, sizeof(p)))
err = -EFAULT;
- }
break;
case SIOCADDTUNNEL:
case SIOCCHGTUNNEL:
@@ -1627,7 +1673,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
break;
err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ if (copy_from_user(&p, data, sizeof(p)))
break;
err = -EINVAL;
if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP &&
@@ -1644,14 +1690,14 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
} else
t = netdev_priv(dev);
if (dev == ip6n->fb_tnl_dev)
- err = ip6_tnl0_update(t, &p1);
+ ip6_tnl0_update(t, &p1, false);
else
- err = ip6_tnl_update(t, &p1);
+ ip6_tnl_update(t, &p1);
}
if (!IS_ERR(t)) {
err = 0;
ip6_tnl_parm_to_user(&p, &t->parms);
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ if (copy_to_user(data, &p, sizeof(p)))
err = -EFAULT;
} else {
@@ -1665,7 +1711,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
if (dev == ip6n->fb_tnl_dev) {
err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ if (copy_from_user(&p, data, sizeof(p)))
break;
err = -ENOENT;
ip6_tnl_parm_from_user(&p1, &p);
@@ -1699,7 +1745,9 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
{
struct ip6_tnl *tnl = netdev_priv(dev);
+ int t_hlen;
+ t_hlen = tnl->hlen + sizeof(struct ipv6hdr);
if (tnl->parms.proto == IPPROTO_IPV6) {
if (new_mtu < IPV6_MIN_MTU)
return -EINVAL;
@@ -1708,13 +1756,13 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
return -EINVAL;
}
if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) {
- if (new_mtu > IP6_MAX_MTU - dev->hard_header_len)
+ if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen)
return -EINVAL;
} else {
- if (new_mtu > IP_MAX_MTU - dev->hard_header_len)
+ if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen)
return -EINVAL;
}
- dev->mtu = new_mtu;
+ WRITE_ONCE(dev->mtu, new_mtu);
return 0;
}
EXPORT_SYMBOL(ip6_tnl_change_mtu);
@@ -1723,7 +1771,7 @@ int ip6_tnl_get_iflink(const struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- return t->parms.link;
+ return READ_ONCE(t->parms.link);
}
EXPORT_SYMBOL(ip6_tnl_get_iflink);
@@ -1784,9 +1832,9 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
.ndo_init = ip6_tnl_dev_init,
.ndo_uninit = ip6_tnl_dev_uninit,
.ndo_start_xmit = ip6_tnl_start_xmit,
- .ndo_do_ioctl = ip6_tnl_ioctl,
+ .ndo_siocdevprivate = ip6_tnl_siocdevprivate,
.ndo_change_mtu = ip6_tnl_change_mtu,
- .ndo_get_stats = ip6_get_stats,
+ .ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
@@ -1807,13 +1855,15 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
static void ip6_tnl_dev_setup(struct net_device *dev)
{
dev->netdev_ops = &ip6_tnl_netdev_ops;
+ dev->header_ops = &ip_tunnel_header_ops;
dev->needs_free_netdev = true;
dev->priv_destructor = ip6_dev_free;
dev->type = ARPHRD_TUNNEL6;
dev->flags |= IFF_NOARP;
dev->addr_len = sizeof(struct in6_addr);
- dev->features |= NETIF_F_LLTX;
+ dev->lltx = true;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
netif_keep_dst(dev);
dev->features |= IPXIPX_FEATURES;
@@ -1838,14 +1888,10 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
int t_hlen;
t->dev = dev;
- t->net = dev_net(dev);
- dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
- if (!dev->tstats)
- return -ENOMEM;
ret = dst_cache_init(&t->dst_cache, GFP_KERNEL);
if (ret)
- goto free_stats;
+ return ret;
ret = gro_cells_init(&t->gro_cells, dev);
if (ret)
@@ -1856,20 +1902,18 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
t_hlen = t->hlen + sizeof(struct ipv6hdr);
dev->type = ARPHRD_TUNNEL6;
- dev->hard_header_len = LL_MAX_HEADER + t_hlen;
dev->mtu = ETH_DATA_LEN - t_hlen;
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
dev->mtu -= 8;
dev->min_mtu = ETH_MIN_MTU;
- dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len;
+ dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen;
+ netdev_hold(dev, &t->dev_tracker, GFP_KERNEL);
+ netdev_lockdep_set_classes(dev);
return 0;
destroy_dst:
dst_cache_destroy(&t->dst_cache);
-free_stats:
- free_percpu(dev->tstats);
- dev->tstats = NULL;
return ret;
}
@@ -1887,10 +1931,8 @@ static int ip6_tnl_dev_init(struct net_device *dev)
if (err)
return err;
ip6_tnl_link_config(t);
- if (t->parms.collect_md) {
- dev->features |= NETIF_F_NETNS_LOCAL;
+ if (t->parms.collect_md)
netif_keep_dst(dev);
- }
return 0;
}
@@ -1907,8 +1949,8 @@ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
struct net *net = dev_net(dev);
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ t->net = net;
t->parms.proto = IPPROTO_IPV6;
- dev_hold(dev);
rcu_assign_pointer(ip6n->tnls_wc[0], t);
return 0;
@@ -1970,52 +2012,24 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[],
parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
}
-static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[],
- struct ip_tunnel_encap *ipencap)
-{
- bool ret = false;
-
- memset(ipencap, 0, sizeof(*ipencap));
-
- if (!data)
- return ret;
-
- if (data[IFLA_IPTUN_ENCAP_TYPE]) {
- ret = true;
- ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
- }
-
- if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
- ret = true;
- ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
- }
-
- if (data[IFLA_IPTUN_ENCAP_SPORT]) {
- ret = true;
- ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
- }
-
- if (data[IFLA_IPTUN_ENCAP_DPORT]) {
- ret = true;
- ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
- }
-
- return ret;
-}
-
-static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
+static int ip6_tnl_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
- struct net *net = dev_net(dev);
- struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
struct ip_tunnel_encap ipencap;
+ struct ip6_tnl_net *ip6n;
struct ip6_tnl *nt, *t;
+ struct net *net;
int err;
+ net = params->link_net ? : dev_net(dev);
+ ip6n = net_generic(net, ip6_tnl_net_id);
nt = netdev_priv(dev);
+ nt->net = net;
- if (ip6_tnl_netlink_encap_parms(data, &ipencap)) {
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
err = ip6_tnl_encap_setup(nt, &ipencap);
if (err < 0)
return err;
@@ -2049,10 +2063,30 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
struct ip_tunnel_encap ipencap;
- if (dev == ip6n->fb_tnl_dev)
- return -EINVAL;
+ if (dev == ip6n->fb_tnl_dev) {
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
+ /* iproute2 always sets TUNNEL_ENCAP_FLAG_CSUM6, so
+ * let's ignore this flag.
+ */
+ ipencap.flags &= ~TUNNEL_ENCAP_FLAG_CSUM6;
+ if (memchr_inv(&ipencap, 0, sizeof(ipencap))) {
+ NL_SET_ERR_MSG(extack,
+ "Only protocol can be changed for fallback tunnel, not encap params");
+ return -EINVAL;
+ }
+ }
+
+ ip6_tnl_netlink_parms(data, &p);
+ if (ip6_tnl0_update(t, &p, true) < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Only protocol can be changed for fallback tunnel");
+ return -EINVAL;
+ }
+
+ return 0;
+ }
- if (ip6_tnl_netlink_encap_parms(data, &ipencap)) {
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
int err = ip6_tnl_encap_setup(t, &ipencap);
if (err < 0)
@@ -2069,7 +2103,8 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[],
} else
t = netdev_priv(dev);
- return ip6_tnl_update(t, &p);
+ ip6_tnl_update(t, &p);
+ return 0;
}
static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head)
@@ -2151,7 +2186,7 @@ struct net *ip6_tnl_get_link_net(const struct net_device *dev)
{
struct ip6_tnl *tunnel = netdev_priv(dev);
- return tunnel->net;
+ return READ_ONCE(tunnel->net);
}
EXPORT_SYMBOL(ip6_tnl_get_link_net);
@@ -2199,7 +2234,13 @@ static struct xfrm6_tunnel ip6ip6_handler __read_mostly = {
.priority = 1,
};
-static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list)
+static struct xfrm6_tunnel mplsip6_handler __read_mostly = {
+ .handler = mplsip6_rcv,
+ .err_handler = mplsip6_err,
+ .priority = 1,
+};
+
+static void __net_exit ip6_tnl_exit_rtnl_net(struct net *net, struct list_head *list)
{
struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
struct net_device *dev, *aux;
@@ -2211,16 +2252,28 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head
unregister_netdevice_queue(dev, list);
for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) {
- t = rtnl_dereference(ip6n->tnls_r_l[h]);
+ t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]);
while (t) {
/* If dev is in the same netns, it has already
* been added to the list by the previous loop.
*/
if (!net_eq(dev_net(t->dev), net))
unregister_netdevice_queue(t->dev, list);
- t = rtnl_dereference(t->next);
+
+ t = rtnl_net_dereference(net, t->next);
}
}
+
+ t = rtnl_net_dereference(net, ip6n->tnls_wc[0]);
+ while (t) {
+ /* If dev is in the same netns, it has already
+ * been added to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(t->dev), net))
+ unregister_netdevice_queue(t->dev, list);
+
+ t = rtnl_net_dereference(net, t->next);
+ }
}
static int __net_init ip6_tnl_init_net(struct net *net)
@@ -2245,7 +2298,7 @@ static int __net_init ip6_tnl_init_net(struct net *net)
/* FB netdevice is special: we have one, and only one per netns.
* Allowing to move it to another netns is clearly unsafe.
*/
- ip6n->fb_tnl_dev->features |= NETIF_F_NETNS_LOCAL;
+ ip6n->fb_tnl_dev->netns_immutable = true;
err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev);
if (err < 0)
@@ -2266,21 +2319,9 @@ err_alloc_dev:
return err;
}
-static void __net_exit ip6_tnl_exit_batch_net(struct list_head *net_list)
-{
- struct net *net;
- LIST_HEAD(list);
-
- rtnl_lock();
- list_for_each_entry(net, net_list, exit_list)
- ip6_tnl_destroy_tunnels(net, &list);
- unregister_netdevice_many(&list);
- rtnl_unlock();
-}
-
static struct pernet_operations ip6_tnl_net_ops = {
.init = ip6_tnl_init_net,
- .exit_batch = ip6_tnl_exit_batch_net,
+ .exit_rtnl = ip6_tnl_exit_rtnl_net,
.id = &ip6_tnl_net_id,
.size = sizeof(struct ip6_tnl_net),
};
@@ -2313,6 +2354,15 @@ static int __init ip6_tunnel_init(void)
pr_err("%s: can't register ip6ip6\n", __func__);
goto out_ip6ip6;
}
+
+ if (ip6_tnl_mpls_supported()) {
+ err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS);
+ if (err < 0) {
+ pr_err("%s: can't register mplsip6\n", __func__);
+ goto out_mplsip6;
+ }
+ }
+
err = rtnl_link_register(&ip6_link_ops);
if (err < 0)
goto rtnl_link_failed;
@@ -2320,6 +2370,9 @@ static int __init ip6_tunnel_init(void)
return 0;
rtnl_link_failed:
+ if (ip6_tnl_mpls_supported())
+ xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS);
+out_mplsip6:
xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6);
out_ip6ip6:
xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET);
@@ -2342,6 +2395,9 @@ static void __exit ip6_tunnel_cleanup(void)
if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6))
pr_info("%s: can't deregister ip6ip6\n", __func__);
+ if (ip6_tnl_mpls_supported() &&
+ xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS))
+ pr_info("%s: can't deregister mplsip6\n", __func__);
unregister_pernet_device(&ip6_tnl_net_ops);
}
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index b283f293ee4a..cef3e0210744 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -1,3 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/socket.h>
@@ -15,7 +17,7 @@
int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
struct socket **sockp)
{
- struct sockaddr_in6 udp6_addr;
+ struct sockaddr_in6 udp6_addr = {};
int err;
struct socket *sock = NULL;
@@ -24,10 +26,12 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
goto error;
if (cfg->ipv6_v6only) {
- int val = 1;
-
- err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
- (char *) &val, sizeof(val));
+ err = ip6_sock_set_v6only(sock->sk);
+ if (err < 0)
+ goto error;
+ }
+ if (cfg->bind_ifindex) {
+ err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true);
if (err < 0)
goto error;
}
@@ -36,18 +40,19 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg,
memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
sizeof(udp6_addr.sin6_addr));
udp6_addr.sin6_port = cfg->local_udp_port;
- err = kernel_bind(sock, (struct sockaddr *)&udp6_addr,
+ err = kernel_bind(sock, (struct sockaddr_unsized *)&udp6_addr,
sizeof(udp6_addr));
if (err < 0)
goto error;
if (cfg->peer_udp_port) {
+ memset(&udp6_addr, 0, sizeof(udp6_addr));
udp6_addr.sin6_family = AF_INET6;
memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6,
sizeof(udp6_addr.sin6_addr));
udp6_addr.sin6_port = cfg->peer_udp_port;
err = kernel_connect(sock,
- (struct sockaddr *)&udp6_addr,
+ (struct sockaddr_unsized *)&udp6_addr,
sizeof(udp6_addr), 0);
}
if (err < 0)
@@ -69,12 +74,14 @@ error:
}
EXPORT_SYMBOL_GPL(udp_sock_create6);
-int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb,
- struct net_device *dev, struct in6_addr *saddr,
- struct in6_addr *daddr,
- __u8 prio, __u8 ttl, __be32 label,
- __be16 src_port, __be16 dst_port, bool nocheck)
+void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb,
+ struct net_device *dev,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u8 prio, __u8 ttl, __be32 label,
+ __be16 src_port, __be16 dst_port, bool nocheck,
+ u16 ip6cb_flags)
{
struct udphdr *uh;
struct ipv6hdr *ip6h;
@@ -102,9 +109,78 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
ip6h->daddr = *daddr;
ip6h->saddr = *saddr;
- ip6tunnel_xmit(sk, skb, dev);
- return 0;
+ ip6tunnel_xmit(sk, skb, dev, ip6cb_flags);
}
EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb);
+/**
+ * udp_tunnel6_dst_lookup - perform route lookup on UDP tunnel
+ * @skb: Packet for which lookup is done
+ * @dev: Tunnel device
+ * @net: Network namespace of tunnel device
+ * @sock: Socket which provides route info
+ * @oif: Index of the output interface
+ * @saddr: Memory to store the src ip address
+ * @key: Tunnel information
+ * @sport: UDP source port
+ * @dport: UDP destination port
+ * @dsfield: The traffic class field
+ * @dst_cache: The dst cache to use for lookup
+ *
+ * This function performs a route lookup on a UDP tunnel.
+ * It returns a valid dst pointer and stores src address to be used in
+ * tunnel in param saddr on success, else a pointer encoded error code.
+ */
+
+struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb,
+ struct net_device *dev,
+ struct net *net,
+ struct socket *sock,
+ int oif,
+ struct in6_addr *saddr,
+ const struct ip_tunnel_key *key,
+ __be16 sport, __be16 dport, u8 dsfield,
+ struct dst_cache *dst_cache)
+{
+ struct dst_entry *dst = NULL;
+ struct flowi6 fl6;
+
+#ifdef CONFIG_DST_CACHE
+ if (dst_cache) {
+ dst = dst_cache_get_ip6(dst_cache, saddr);
+ if (dst)
+ return dst;
+ }
+#endif
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = IPPROTO_UDP;
+ fl6.flowi6_oif = oif;
+ fl6.daddr = key->u.ipv6.dst;
+ fl6.saddr = key->u.ipv6.src;
+ fl6.fl6_sport = sport;
+ fl6.fl6_dport = dport;
+ fl6.flowlabel = ip6_make_flowinfo(dsfield, key->label);
+
+ dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
+ NULL);
+ if (IS_ERR(dst)) {
+ netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
+ return ERR_PTR(-ENETUNREACH);
+ }
+ if (dst_dev(dst) == dev) { /* is this necessary? */
+ netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
+ dst_release(dst);
+ return ERR_PTR(-ELOOP);
+ }
+#ifdef CONFIG_DST_CACHE
+ if (dst_cache)
+ dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
+#endif
+ *saddr = fl6.saddr;
+ return dst;
+}
+EXPORT_SYMBOL_GPL(udp_tunnel6_dst_lookup);
+
+MODULE_DESCRIPTION("IPv6 Foo over UDP tunnel driver");
MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index eeaf7455d51e..ad5290be4dd6 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 virtual tunneling interface
*
@@ -8,11 +9,6 @@
*
* Based on:
* net/ipv6/ip6_tunnel.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/module.h>
@@ -49,6 +45,7 @@
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/netdev_lock.h>
#include <linux/etherdevice.h>
#define IP6_VTI_HASH_SIZE_SHIFT 5
@@ -129,6 +126,7 @@ vti6_tnl_lookup(struct net *net, const struct in6_addr *remote,
/**
* vti6_tnl_bucket - get head of list matching given tunnel parameters
+ * @ip6n: the private data for ip6_vti in the netns
* @p: parameters containing tunnel end-points
*
* Description:
@@ -157,7 +155,7 @@ vti6_tnl_link(struct vti6_net *ip6n, struct ip6_tnl *t)
{
struct ip6_tnl __rcu **tp = vti6_tnl_bucket(ip6n, &t->parms);
- rcu_assign_pointer(t->next , rtnl_dereference(*tp));
+ rcu_assign_pointer(t->next, rtnl_dereference(*tp));
rcu_assign_pointer(*tp, t);
}
@@ -177,16 +175,10 @@ vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t)
}
}
-static void vti6_dev_free(struct net_device *dev)
-{
- free_percpu(dev->tstats);
-}
-
static int vti6_tnl_create2(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- struct net *net = dev_net(dev);
- struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+ struct vti6_net *ip6n = net_generic(t->net, vti6_net_id);
int err;
dev->rtnl_link_ops = &vti6_link_ops;
@@ -196,7 +188,6 @@ static int vti6_tnl_create2(struct net_device *dev)
strcpy(t->parms.name, dev->name);
- dev_hold(dev);
vti6_tnl_link(ip6n, t);
return 0;
@@ -215,7 +206,7 @@ static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p
if (p->name[0]) {
if (!dev_valid_name(p->name))
goto failed;
- strlcpy(name, p->name, IFNAMSIZ);
+ strscpy(name, p->name, IFNAMSIZ);
} else {
sprintf(name, "ip6_vti%%d");
}
@@ -297,10 +288,11 @@ static void vti6_dev_uninit(struct net_device *dev)
RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL);
else
vti6_tnl_unlink(ip6n, t);
- dev_put(dev);
+ netdev_put(dev, &t->dev_tracker);
}
-static int vti6_rcv(struct sk_buff *skb)
+static int vti6_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type)
{
struct ip6_tnl *t;
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
@@ -315,18 +307,22 @@ static int vti6_rcv(struct sk_buff *skb)
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
rcu_read_unlock();
- return 0;
+ goto discard;
}
+ ipv6h = ipv6_hdr(skb);
if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) {
- t->dev->stats.rx_dropped++;
+ DEV_STATS_INC(t->dev, rx_dropped);
rcu_read_unlock();
goto discard;
}
rcu_read_unlock();
- return xfrm6_rcv_tnl(skb, t);
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t;
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
+ return xfrm_input(skb, nexthdr, spi, encap_type);
}
rcu_read_unlock();
return -EINVAL;
@@ -335,13 +331,19 @@ discard:
return 0;
}
+static int vti6_rcv(struct sk_buff *skb)
+{
+ int nexthdr = skb_network_header(skb)[IP6CB(skb)->nhoff];
+
+ return vti6_input_proto(skb, nexthdr, 0, 0);
+}
+
static int vti6_rcv_cb(struct sk_buff *skb, int err)
{
unsigned short family;
struct net_device *dev;
- struct pcpu_sw_netstats *tstats;
struct xfrm_state *x;
- struct xfrm_mode *inner_mode;
+ const struct xfrm_mode *inner_mode;
struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6;
u32 orig_mark = skb->mark;
int ret;
@@ -352,15 +354,15 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err)
dev = t->dev;
if (err) {
- dev->stats.rx_errors++;
- dev->stats.rx_dropped++;
+ DEV_STATS_INC(dev, rx_errors);
+ DEV_STATS_INC(dev, rx_dropped);
return 0;
}
x = xfrm_input_state(skb);
- inner_mode = x->inner_mode;
+ inner_mode = &x->inner_mode;
if (x->sel.family == AF_UNSPEC) {
inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
@@ -371,7 +373,7 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err)
}
}
- family = inner_mode->afinfo->family;
+ family = inner_mode->family;
skb->mark = be32_to_cpu(t->parms.i_key);
ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
@@ -382,12 +384,7 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err)
skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev)));
skb->dev = dev;
-
- tstats = this_cpu_ptr(dev->tstats);
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
+ dev_sw_netstats_rx_add(dev, skb->len);
return 0;
}
@@ -444,7 +441,6 @@ static int
vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
{
struct ip6_tnl *t = netdev_priv(dev);
- struct net_device_stats *stats = &t->dev->stats;
struct dst_entry *dst = skb_dst(skb);
struct net_device *tdev;
struct xfrm_state *x;
@@ -452,17 +448,47 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
int err = -1;
int mtu;
- if (!dst)
- goto tx_err_link_failure;
+ if (!dst) {
+ switch (skb->protocol) {
+ case htons(ETH_P_IP): {
+ struct rtable *rt;
+
+ fl->u.ip4.flowi4_oif = dev->ifindex;
+ fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC;
+ rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4);
+ if (IS_ERR(rt))
+ goto tx_err_link_failure;
+ dst = &rt->dst;
+ skb_dst_set(skb, dst);
+ break;
+ }
+ case htons(ETH_P_IPV6):
+ fl->u.ip6.flowi6_oif = dev->ifindex;
+ fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC;
+ dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6);
+ if (dst->error) {
+ dst_release(dst);
+ dst = NULL;
+ goto tx_err_link_failure;
+ }
+ skb_dst_set(skb, dst);
+ break;
+ default:
+ goto tx_err_link_failure;
+ }
+ }
dst_hold(dst);
- dst = xfrm_lookup(t->net, dst, fl, NULL, 0);
+ dst = xfrm_lookup_route(t->net, dst, fl, NULL, 0);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
dst = NULL;
goto tx_err_link_failure;
}
+ if (dst->flags & DST_XFRM_QUEUE)
+ goto xmit;
+
x = dst->xfrm;
if (!vti6_state_check(x, &t->parms.raddr, &t->parms.laddr))
goto tx_err_link_failure;
@@ -471,10 +497,10 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
(const struct in6_addr *)&x->id.daddr))
goto tx_err_link_failure;
- tdev = dst->dev;
+ tdev = dst_dev(dst);
if (tdev == dev) {
- stats->collisions++;
+ DEV_STATS_INC(dev, collisions);
net_warn_ratelimited("%s: Local routing loop detected!\n",
t->parms.name);
goto tx_err_dst_release;
@@ -482,25 +508,28 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
mtu = dst_mtu(dst);
if (skb->len > mtu) {
- skb_dst_update_pmtu(skb, mtu);
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
if (skb->protocol == htons(ETH_P_IPV6)) {
if (mtu < IPV6_MIN_MTU)
mtu = IPV6_MIN_MTU;
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
} else {
- icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(mtu));
+ if (!(ip_hdr(skb)->frag_off & htons(IP_DF)))
+ goto xmit;
+ icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
}
err = -EMSGSIZE;
goto tx_err_dst_release;
}
+xmit:
skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
skb_dst_set(skb, dst);
- skb->dev = skb_dst(skb)->dev;
+ skb->dev = dst_dev(dst);
err = dst_output(t->net, skb->sk, skb);
if (net_xmit_eval(err) == 0)
@@ -509,7 +538,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
return 0;
tx_err_link_failure:
- stats->tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
dst_link_failure(skb);
tx_err_dst_release:
dst_release(dst);
@@ -520,27 +549,26 @@ static netdev_tx_t
vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- struct net_device_stats *stats = &t->dev->stats;
- struct ipv6hdr *ipv6h;
struct flowi fl;
int ret;
+ if (!pskb_inet_may_pull(skb))
+ goto tx_err;
+
memset(&fl, 0, sizeof(fl));
switch (skb->protocol) {
case htons(ETH_P_IPV6):
- ipv6h = ipv6_hdr(skb);
-
if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) ||
- vti6_addr_conflict(t, ipv6h))
+ vti6_addr_conflict(t, ipv6_hdr(skb)))
goto tx_err;
- xfrm_decode_session(skb, &fl, AF_INET6);
memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6);
break;
case htons(ETH_P_IP):
- xfrm_decode_session(skb, &fl, AF_INET);
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET);
break;
default:
goto tx_err;
@@ -556,8 +584,8 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
tx_err:
- stats->tx_errors++;
- stats->tx_dropped++;
+ DEV_STATS_INC(dev, tx_errors);
+ DEV_STATS_INC(dev, tx_dropped);
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -625,7 +653,7 @@ static void vti6_link_config(struct ip6_tnl *t, bool keep_mtu)
struct net_device *tdev = NULL;
int mtu;
- memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
+ __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr));
memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
p->flags &= ~(IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV |
@@ -638,7 +666,8 @@ static void vti6_link_config(struct ip6_tnl *t, bool keep_mtu)
dev->flags &= ~IFF_POINTOPOINT;
if (keep_mtu && dev->mtu) {
- dev->mtu = clamp(dev->mtu, dev->min_mtu, dev->max_mtu);
+ WRITE_ONCE(dev->mtu,
+ clamp(dev->mtu, dev->min_mtu, dev->max_mtu));
return;
}
@@ -736,13 +765,14 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
}
/**
- * vti6_ioctl - configure vti6 tunnels from userspace
+ * vti6_siocdevprivate - configure vti6 tunnels from userspace
* @dev: virtual device associated with tunnel
- * @ifr: parameters passed from userspace
+ * @ifr: unused
+ * @data: parameters passed from userspace
* @cmd: command to be performed
*
* Description:
- * vti6_ioctl() is used for managing vti6 tunnels
+ * vti6_siocdevprivate() is used for managing vti6 tunnels
* from userspace.
*
* The possible commands are the following:
@@ -763,7 +793,7 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p)
* %-ENODEV if attempting to change or delete a nonexisting device
**/
static int
-vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+vti6_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd)
{
int err = 0;
struct ip6_tnl_parm2 p;
@@ -772,10 +802,12 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
struct net *net = dev_net(dev);
struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+ memset(&p1, 0, sizeof(p1));
+
switch (cmd) {
case SIOCGETTUNNEL:
if (dev == ip6n->fb_tnl_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+ if (copy_from_user(&p, data, sizeof(p))) {
err = -EFAULT;
break;
}
@@ -787,7 +819,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
if (!t)
t = netdev_priv(dev);
vti6_parm_to_user(&p, &t->parms);
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ if (copy_to_user(data, &p, sizeof(p)))
err = -EFAULT;
break;
case SIOCADDTUNNEL:
@@ -796,7 +828,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
break;
err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ if (copy_from_user(&p, data, sizeof(p)))
break;
err = -EINVAL;
if (p.proto != IPPROTO_IPV6 && p.proto != 0)
@@ -817,7 +849,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
if (t) {
err = 0;
vti6_parm_to_user(&p, &t->parms);
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+ if (copy_to_user(data, &p, sizeof(p)))
err = -EFAULT;
} else
@@ -830,7 +862,7 @@ vti6_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
if (dev == ip6n->fb_tnl_dev) {
err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+ if (copy_from_user(&p, data, sizeof(p)))
break;
err = -ENOENT;
vti6_parm_from_user(&p1, &p);
@@ -855,8 +887,7 @@ static const struct net_device_ops vti6_netdev_ops = {
.ndo_init = vti6_dev_init,
.ndo_uninit = vti6_dev_uninit,
.ndo_start_xmit = vti6_tnl_xmit,
- .ndo_do_ioctl = vti6_ioctl,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_siocdevprivate = vti6_siocdevprivate,
.ndo_get_iflink = ip6_tnl_get_iflink,
};
@@ -870,9 +901,10 @@ static const struct net_device_ops vti6_netdev_ops = {
static void vti6_dev_setup(struct net_device *dev)
{
dev->netdev_ops = &vti6_netdev_ops;
+ dev->header_ops = &ip_tunnel_header_ops;
dev->needs_free_netdev = true;
- dev->priv_destructor = vti6_dev_free;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
dev->type = ARPHRD_TUNNEL6;
dev->min_mtu = IPV4_MIN_MTU;
dev->max_mtu = IP_MAX_MTU - sizeof(struct ipv6hdr);
@@ -893,10 +925,8 @@ static inline int vti6_dev_init_gen(struct net_device *dev)
struct ip6_tnl *t = netdev_priv(dev);
t->dev = dev;
- t->net = dev_net(dev);
- dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
- if (!dev->tstats)
- return -ENOMEM;
+ netdev_hold(dev, &t->dev_tracker, GFP_KERNEL);
+ netdev_lockdep_set_classes(dev);
return 0;
}
@@ -927,8 +957,8 @@ static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev)
struct net *net = dev_net(dev);
struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+ t->net = net;
t->parms.proto = IPPROTO_IPV6;
- dev_hold(dev);
rcu_assign_pointer(ip6n->tnls_wc[0], t);
return 0;
@@ -967,17 +997,20 @@ static void vti6_netlink_parms(struct nlattr *data[],
parms->fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]);
}
-static int vti6_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
+static int vti6_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
- struct net *net = dev_net(dev);
+ struct nlattr **data = params->data;
struct ip6_tnl *nt;
+ struct net *net;
+ net = params->link_net ? : dev_net(dev);
nt = netdev_priv(dev);
vti6_netlink_parms(data, &nt->parms);
nt->parms.proto = IPPROTO_IPV6;
+ nt->net = net;
if (vti6_locate(net, &nt->parms, 0))
return -EEXIST;
@@ -1079,21 +1112,21 @@ static struct rtnl_link_ops vti6_link_ops __read_mostly = {
.get_link_net = ip6_tnl_get_link_net,
};
-static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n,
- struct list_head *list)
+static void __net_exit vti6_exit_rtnl_net(struct net *net, struct list_head *list)
{
- int h;
+ struct vti6_net *ip6n = net_generic(net, vti6_net_id);
struct ip6_tnl *t;
+ int h;
for (h = 0; h < IP6_VTI_HASH_SIZE; h++) {
- t = rtnl_dereference(ip6n->tnls_r_l[h]);
+ t = rtnl_net_dereference(net, ip6n->tnls_r_l[h]);
while (t) {
unregister_netdevice_queue(t->dev, list);
- t = rtnl_dereference(t->next);
+ t = rtnl_net_dereference(net, t->next);
}
}
- t = rtnl_dereference(ip6n->tnls_wc[0]);
+ t = rtnl_net_dereference(net, ip6n->tnls_wc[0]);
if (t)
unregister_netdevice_queue(t->dev, list);
}
@@ -1137,30 +1170,16 @@ err_alloc_dev:
return err;
}
-static void __net_exit vti6_exit_batch_net(struct list_head *net_list)
-{
- struct vti6_net *ip6n;
- struct net *net;
- LIST_HEAD(list);
-
- rtnl_lock();
- list_for_each_entry(net, net_list, exit_list) {
- ip6n = net_generic(net, vti6_net_id);
- vti6_destroy_tunnels(ip6n, &list);
- }
- unregister_netdevice_many(&list);
- rtnl_unlock();
-}
-
static struct pernet_operations vti6_net_ops = {
.init = vti6_init_net,
- .exit_batch = vti6_exit_batch_net,
+ .exit_rtnl = vti6_exit_rtnl_net,
.id = &vti6_net_id,
.size = sizeof(struct vti6_net),
};
static struct xfrm6_protocol vti_esp6_protocol __read_mostly = {
.handler = vti6_rcv,
+ .input_handler = vti6_input_proto,
.cb_handler = vti6_rcv_cb,
.err_handler = vti6_err,
.priority = 100,
@@ -1168,6 +1187,7 @@ static struct xfrm6_protocol vti_esp6_protocol __read_mostly = {
static struct xfrm6_protocol vti_ah6_protocol __read_mostly = {
.handler = vti6_rcv,
+ .input_handler = vti6_input_proto,
.cb_handler = vti6_rcv_cb,
.err_handler = vti6_err,
.priority = 100,
@@ -1175,11 +1195,39 @@ static struct xfrm6_protocol vti_ah6_protocol __read_mostly = {
static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = {
.handler = vti6_rcv,
+ .input_handler = vti6_input_proto,
.cb_handler = vti6_rcv_cb,
.err_handler = vti6_err,
.priority = 100,
};
+#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
+static int vti6_rcv_tunnel(struct sk_buff *skb)
+{
+ const xfrm_address_t *saddr;
+ __be32 spi;
+
+ saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr;
+ spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr);
+
+ return vti6_input_proto(skb, IPPROTO_IPV6, spi, 0);
+}
+
+static struct xfrm6_tunnel vti_ipv6_handler __read_mostly = {
+ .handler = vti6_rcv_tunnel,
+ .cb_handler = vti6_rcv_cb,
+ .err_handler = vti6_err,
+ .priority = 0,
+};
+
+static struct xfrm6_tunnel vti_ip6ip_handler __read_mostly = {
+ .handler = vti6_rcv_tunnel,
+ .cb_handler = vti6_rcv_cb,
+ .err_handler = vti6_err,
+ .priority = 0,
+};
+#endif
+
/**
* vti6_tunnel_init - register protocol and reserve needed resources
*
@@ -1205,6 +1253,15 @@ static int __init vti6_tunnel_init(void)
err = xfrm6_protocol_register(&vti_ipcomp6_protocol, IPPROTO_COMP);
if (err < 0)
goto xfrm_proto_comp_failed;
+#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
+ msg = "ipv6 tunnel";
+ err = xfrm6_tunnel_register(&vti_ipv6_handler, AF_INET6);
+ if (err < 0)
+ goto vti_tunnel_ipv6_failed;
+ err = xfrm6_tunnel_register(&vti_ip6ip_handler, AF_INET);
+ if (err < 0)
+ goto vti_tunnel_ip6ip_failed;
+#endif
msg = "netlink interface";
err = rtnl_link_register(&vti6_link_ops);
@@ -1214,6 +1271,12 @@ static int __init vti6_tunnel_init(void)
return 0;
rtnl_link_failed:
+#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
+ err = xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET);
+vti_tunnel_ip6ip_failed:
+ err = xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6);
+vti_tunnel_ipv6_failed:
+#endif
xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP);
xfrm_proto_comp_failed:
xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
@@ -1232,6 +1295,10 @@ pernet_dev_failed:
static void __exit vti6_tunnel_cleanup(void)
{
rtnl_link_unregister(&vti6_link_ops);
+#if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL)
+ xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET);
+ xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6);
+#endif
xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP);
xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH);
xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index d0b7e0249c13..e047a4680ab0 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux IPv6 multicast routing support for BSD pim6sd
* Based on net/ipv4/ipmr.c.
@@ -8,12 +9,6 @@
* 6WIND, Paris, France
* Copyright (C)2007,2008 USAGI/WIDE Project
* YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/uaccess.h>
@@ -51,6 +46,9 @@
#include <linux/export.h>
#include <net/ip6_checksum.h>
#include <linux/netconf.h>
+#include <net/ip_tunnels.h>
+
+#include <linux/nospec.h>
struct ip6mr_rule {
struct fib_rule common;
@@ -64,7 +62,12 @@ struct ip6mr_result {
Note that the changes are semaphored via rtnl_lock.
*/
-static DEFINE_RWLOCK(mrt_lock);
+static DEFINE_SPINLOCK(mrt_lock);
+
+static struct net_device *vif_dev_read(const struct vif_device *vif)
+{
+ return rcu_dereference(vif->dev);
+}
/* Multicast router control variables */
@@ -85,20 +88,25 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id);
static void ip6mr_free_table(struct mr_table *mrt);
static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc6_cache *cache);
-static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc6_cache *cache);
+static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt,
mifi_t mifi, int assert);
static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
int cmd);
-static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
+static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt);
+static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack);
static int ip6mr_rtm_dumproute(struct sk_buff *skb,
struct netlink_callback *cb);
-static void mroute_clean_tables(struct mr_table *mrt, bool all);
+static void mroute_clean_tables(struct mr_table *mrt, int flags);
static void ipmr_expire_process(struct timer_list *t);
#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
#define ip6mr_for_each_table(mrt, net) \
- list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list)
+ list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list, \
+ lockdep_rtnl_is_held() || \
+ list_empty(&net->ipv6.mr6_tables))
static struct mr_table *ip6mr_mr_table_iter(struct net *net,
struct mr_table *mrt)
@@ -117,7 +125,7 @@ static struct mr_table *ip6mr_mr_table_iter(struct net *net,
return ret;
}
-static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
+static struct mr_table *__ip6mr_get_table(struct net *net, u32 id)
{
struct mr_table *mrt;
@@ -128,6 +136,16 @@ static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
return NULL;
}
+static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
+{
+ struct mr_table *mrt;
+
+ rcu_read_lock();
+ mrt = __ip6mr_get_table(net, id);
+ rcu_read_unlock();
+ return mrt;
+}
+
static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
struct mr_table **mrt)
{
@@ -138,6 +156,9 @@ static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
.flags = FIB_LOOKUP_NOREF,
};
+ /* update flow if oif or iif point to device enslaved to l3mdev */
+ l3mdev_update_flow(net, flowi6_to_flowi(flp6));
+
err = fib_rules_lookup(net->ipv6.mr6_rules_ops,
flowi6_to_flowi(flp6), 0, &arg);
if (err < 0)
@@ -164,7 +185,9 @@ static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp,
return -EINVAL;
}
- mrt = ip6mr_get_table(rule->fr_net, rule->table);
+ arg->table = fib_rule_get_table(rule, arg);
+
+ mrt = __ip6mr_get_table(rule->fr_net, arg->table);
if (!mrt)
return -EAGAIN;
res->mrt = mrt;
@@ -176,10 +199,6 @@ static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags)
return 1;
}
-static const struct nla_policy ip6mr_rule_policy[FRA_MAX + 1] = {
- FRA_GENERIC_POLICY,
-};
-
static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh, struct nlattr **tb,
struct netlink_ext_ack *extack)
@@ -212,7 +231,6 @@ static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = {
.compare = ip6mr_rule_compare,
.fill = ip6mr_rule_fill,
.nlgroup = RTNLGRP_IPV6_RULE,
- .policy = ip6mr_rule_policy,
.owner = THIS_MODULE,
};
@@ -234,7 +252,7 @@ static int __net_init ip6mr_rules_init(struct net *net)
goto err1;
}
- err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT, 0);
+ err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT);
if (err < 0)
goto err2;
@@ -242,7 +260,9 @@ static int __net_init ip6mr_rules_init(struct net *net)
return 0;
err2:
+ rtnl_lock();
ip6mr_free_table(mrt);
+ rtnl_unlock();
err1:
fib_rules_unregister(ops);
return err;
@@ -252,21 +272,21 @@ static void __net_exit ip6mr_rules_exit(struct net *net)
{
struct mr_table *mrt, *next;
- rtnl_lock();
+ ASSERT_RTNL();
list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) {
list_del(&mrt->list);
ip6mr_free_table(mrt);
}
fib_rules_unregister(net->ipv6.mr6_rules_ops);
- rtnl_unlock();
}
-static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb)
+static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
- return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR);
+ return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack);
}
-static unsigned int ip6mr_rules_seq_read(struct net *net)
+static unsigned int ip6mr_rules_seq_read(const struct net *net)
{
return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR);
}
@@ -294,6 +314,8 @@ static struct mr_table *ip6mr_get_table(struct net *net, u32 id)
return net->ipv6.mrt6;
}
+#define __ip6mr_get_table ip6mr_get_table
+
static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6,
struct mr_table **mrt)
{
@@ -314,18 +336,18 @@ static int __net_init ip6mr_rules_init(struct net *net)
static void __net_exit ip6mr_rules_exit(struct net *net)
{
- rtnl_lock();
+ ASSERT_RTNL();
ip6mr_free_table(net->ipv6.mrt6);
net->ipv6.mrt6 = NULL;
- rtnl_unlock();
}
-static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb)
+static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
return 0;
}
-static unsigned int ip6mr_rules_seq_read(struct net *net)
+static unsigned int ip6mr_rules_seq_read(const struct net *net)
{
return 0;
}
@@ -346,7 +368,6 @@ static const struct rhashtable_params ip6mr_rht_params = {
.key_offset = offsetof(struct mfc6_cache, cmparg),
.key_len = sizeof(struct mfc6_cache_cmp_arg),
.nelem_hint = 3,
- .locks_mul = 1,
.obj_cmpfn = ip6mr_hash_cmp,
.automatic_shrinking = true,
};
@@ -373,7 +394,7 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
{
struct mr_table *mrt;
- mrt = ip6mr_get_table(net, id);
+ mrt = __ip6mr_get_table(net, id);
if (mrt)
return mrt;
@@ -383,8 +404,13 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id)
static void ip6mr_free_table(struct mr_table *mrt)
{
- del_timer_sync(&mrt->ipmr_expire_timer);
- mroute_clean_tables(mrt, true);
+ struct net *net = read_pnet(&mrt->net);
+
+ WARN_ON_ONCE(!mr_can_free_table(net));
+
+ timer_shutdown_sync(&mrt->ipmr_expire_timer);
+ mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC |
+ MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC);
rhltable_destroy(&mrt->mfc_hash);
kfree(mrt);
}
@@ -395,26 +421,28 @@ static void ip6mr_free_table(struct mr_table *mrt)
*/
static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(mrt_lock)
+ __acquires(RCU)
{
struct mr_vif_iter *iter = seq->private;
struct net *net = seq_file_net(seq);
struct mr_table *mrt;
- mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
- if (!mrt)
+ rcu_read_lock();
+ mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT);
+ if (!mrt) {
+ rcu_read_unlock();
return ERR_PTR(-ENOENT);
+ }
iter->mrt = mrt;
- read_lock(&mrt_lock);
return mr_vif_seq_start(seq, pos);
}
static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v)
- __releases(mrt_lock)
+ __releases(RCU)
{
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
}
static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
@@ -427,7 +455,11 @@ static int ip6mr_vif_seq_show(struct seq_file *seq, void *v)
"Interface BytesIn PktsIn BytesOut PktsOut Flags\n");
} else {
const struct vif_device *vif = v;
- const char *name = vif->dev ? vif->dev->name : "none";
+ const struct net_device *vif_dev;
+ const char *name;
+
+ vif_dev = vif_dev_read(vif);
+ name = vif_dev ? vif_dev->name : "none";
seq_printf(seq,
"%2td %-10s %8ld %7ld %8ld %7ld %05X\n",
@@ -478,9 +510,9 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
if (it->cache != &mrt->mfc_unres_queue) {
seq_printf(seq, " %8lu %8lu %8lu",
- mfc->_c.mfc_un.res.pkt,
- mfc->_c.mfc_un.res.bytes,
- mfc->_c.mfc_un.res.wrong_if);
+ atomic_long_read(&mfc->_c.mfc_un.res.pkt),
+ atomic_long_read(&mfc->_c.mfc_un.res.bytes),
+ atomic_long_read(&mfc->_c.mfc_un.res.wrong_if));
for (n = mfc->_c.mfc_un.res.minvif;
n < mfc->_c.mfc_un.res.maxvif; n++) {
if (VIF_EXISTS(mrt, n) &&
@@ -546,14 +578,11 @@ static int pim6_rcv(struct sk_buff *skb)
if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
goto drop;
- reg_vif_num = mrt->mroute_reg_vif_num;
- read_lock(&mrt_lock);
+ /* Pairs with WRITE_ONCE() in mif6_add()/mif6_delete() */
+ reg_vif_num = READ_ONCE(mrt->mroute_reg_vif_num);
if (reg_vif_num >= 0)
- reg_dev = mrt->vif_table[reg_vif_num].dev;
- if (reg_dev)
- dev_hold(reg_dev);
- read_unlock(&mrt_lock);
+ reg_dev = vif_dev_read(&mrt->vif_table[reg_vif_num]);
if (!reg_dev)
goto drop;
@@ -568,7 +597,6 @@ static int pim6_rcv(struct sk_buff *skb)
netif_rx(skb);
- dev_put(reg_dev);
return 0;
drop:
kfree_skb(skb);
@@ -591,19 +619,24 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb,
.flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX,
.flowi6_mark = skb->mark,
};
- int err;
- err = ip6mr_fib_lookup(net, &fl6, &mrt);
- if (err < 0) {
- kfree_skb(skb);
- return err;
- }
+ if (!pskb_inet_may_pull(skb))
+ goto tx_err;
+
+ if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0)
+ goto tx_err;
- read_lock(&mrt_lock);
- dev->stats.tx_bytes += skb->len;
- dev->stats.tx_packets++;
- ip6mr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, MRT6MSG_WHOLEPKT);
- read_unlock(&mrt_lock);
+ DEV_STATS_ADD(dev, tx_bytes, skb->len);
+ DEV_STATS_INC(dev, tx_packets);
+ rcu_read_lock();
+ ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num),
+ MRT6MSG_WHOLEPKT);
+ rcu_read_unlock();
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+
+tx_err:
+ DEV_STATS_INC(dev, tx_errors);
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -625,7 +658,7 @@ static void reg_vif_setup(struct net_device *dev)
dev->flags = IFF_NOARP;
dev->netdev_ops = &reg_vif_netdev_ops;
dev->needs_free_netdev = true;
- dev->features |= NETIF_F_NETNS_LOCAL;
+ dev->netns_immutable = true;
}
static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt)
@@ -649,7 +682,7 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt)
return NULL;
}
- if (dev_open(dev))
+ if (dev_open(dev, NULL))
goto failure;
dev_hold(dev);
@@ -664,10 +697,11 @@ failure:
static int call_ip6mr_vif_entry_notifiers(struct net *net,
enum fib_event_type event_type,
struct vif_device *vif,
+ struct net_device *vif_dev,
mifi_t vif_index, u32 tb_id)
{
return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type,
- vif, vif_index, tb_id,
+ vif, vif_dev, vif_index, tb_id,
&net->ipv6.ipmr_seq);
}
@@ -692,23 +726,21 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
v = &mrt->vif_table[vifi];
- if (VIF_EXISTS(mrt, vifi))
- call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net),
- FIB_EVENT_VIF_DEL, v, vifi,
- mrt->id);
-
- write_lock_bh(&mrt_lock);
- dev = v->dev;
- v->dev = NULL;
-
- if (!dev) {
- write_unlock_bh(&mrt_lock);
+ dev = rtnl_dereference(v->dev);
+ if (!dev)
return -EADDRNOTAVAIL;
- }
+
+ call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net),
+ FIB_EVENT_VIF_DEL, v, dev,
+ vifi, mrt->id);
+ spin_lock(&mrt_lock);
+ RCU_INIT_POINTER(v->dev, NULL);
#ifdef CONFIG_IPV6_PIMSM_V2
- if (vifi == mrt->mroute_reg_vif_num)
- mrt->mroute_reg_vif_num = -1;
+ if (vifi == mrt->mroute_reg_vif_num) {
+ /* Pairs with READ_ONCE() in ip6mr_cache_report() and reg_vif_xmit() */
+ WRITE_ONCE(mrt->mroute_reg_vif_num, -1);
+ }
#endif
if (vifi + 1 == mrt->maxvif) {
@@ -717,16 +749,16 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
if (VIF_EXISTS(mrt, tmp))
break;
}
- mrt->maxvif = tmp + 1;
+ WRITE_ONCE(mrt->maxvif, tmp + 1);
}
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
dev_set_allmulti(dev, -1);
in6_dev = __in6_dev_get(dev);
if (in6_dev) {
- in6_dev->cnf.mc_forwarding--;
+ atomic_dec(&in6_dev->cnf.mc_forwarding);
inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
NETCONFA_MC_FORWARDING,
dev->ifindex, &in6_dev->cnf);
@@ -735,7 +767,7 @@ static int mif6_delete(struct mr_table *mrt, int vifi, int notify,
if ((v->flags & MIFF_REGISTER) && !notify)
unregister_netdevice_queue(dev, head);
- dev_put(dev);
+ netdev_put(dev, &v->dev_tracker);
return 0;
}
@@ -807,7 +839,7 @@ static void ipmr_do_expire_process(struct mr_table *mrt)
static void ipmr_expire_process(struct timer_list *t)
{
- struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer);
+ struct mr_table *mrt = timer_container_of(mrt, t, ipmr_expire_timer);
if (!spin_trylock(&mfc_unres_lock)) {
mod_timer(&mrt->ipmr_expire_timer, jiffies + 1);
@@ -820,7 +852,7 @@ static void ipmr_expire_process(struct timer_list *t)
spin_unlock(&mfc_unres_lock);
}
-/* Fill oifs list. It is called under write locked mrt_lock. */
+/* Fill oifs list. It is called under locked mrt_lock. */
static void ip6mr_update_thresholds(struct mr_table *mrt,
struct mr_mfc *cache,
@@ -842,7 +874,7 @@ static void ip6mr_update_thresholds(struct mr_table *mrt,
cache->mfc_un.res.maxvif = vifi + 1;
}
}
- cache->mfc_un.res.lastuse = jiffies;
+ WRITE_ONCE(cache->mfc_un.res.lastuse, jiffies);
}
static int mif6_add(struct net *net, struct mr_table *mrt,
@@ -894,7 +926,7 @@ static int mif6_add(struct net *net, struct mr_table *mrt,
in6_dev = __in6_dev_get(dev);
if (in6_dev) {
- in6_dev->cnf.mc_forwarding++;
+ atomic_inc(&in6_dev->cnf.mc_forwarding);
inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
NETCONFA_MC_FORWARDING,
dev->ifindex, &in6_dev->cnf);
@@ -906,17 +938,18 @@ static int mif6_add(struct net *net, struct mr_table *mrt,
MIFF_REGISTER);
/* And finish update writing critical data */
- write_lock_bh(&mrt_lock);
- v->dev = dev;
+ spin_lock(&mrt_lock);
+ rcu_assign_pointer(v->dev, dev);
+ netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC);
#ifdef CONFIG_IPV6_PIMSM_V2
if (v->flags & MIFF_REGISTER)
- mrt->mroute_reg_vif_num = vifi;
+ WRITE_ONCE(mrt->mroute_reg_vif_num, vifi);
#endif
if (vifi + 1 > mrt->maxvif)
- mrt->maxvif = vifi + 1;
- write_unlock_bh(&mrt_lock);
+ WRITE_ONCE(mrt->maxvif, vifi + 1);
+ spin_unlock(&mrt_lock);
call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD,
- v, vifi, mrt->id);
+ v, dev, vifi, mrt->id);
return 0;
}
@@ -1013,18 +1046,21 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt,
((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE;
}
rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
- } else
- ip6_mr_forward(net, mrt, skb, c);
+ } else {
+ rcu_read_lock();
+ ip6_mr_forward(net, mrt, skb->dev, skb, c);
+ rcu_read_unlock();
+ }
}
}
/*
* Bounce a cache query up to pim6sd and netlink.
*
- * Called under mrt_lock.
+ * Called under rcu_read_lock()
*/
-static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
+static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt,
mifi_t mifi, int assert)
{
struct sock *mroute6_sk;
@@ -1033,7 +1069,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
int ret;
#ifdef CONFIG_IPV6_PIMSM_V2
- if (assert == MRT6MSG_WHOLEPKT)
+ if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE)
skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt)
+sizeof(*msg));
else
@@ -1049,20 +1085,23 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
skb->ip_summed = CHECKSUM_UNNECESSARY;
#ifdef CONFIG_IPV6_PIMSM_V2
- if (assert == MRT6MSG_WHOLEPKT) {
+ if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) {
/* Ugly, but we have no choice with this interface.
Duplicate old header, fix length etc.
And all this only to mangle msg->im6_msgtype and
to set msg->im6_mbz to "mbz" :-)
*/
- skb_push(skb, -skb_network_offset(pkt));
+ __skb_pull(skb, skb_network_offset(pkt));
skb_push(skb, sizeof(*msg));
skb_reset_transport_header(skb);
msg = (struct mrt6msg *)skb_transport_header(skb);
msg->im6_mbz = 0;
- msg->im6_msgtype = MRT6MSG_WHOLEPKT;
- msg->im6_mif = mrt->mroute_reg_vif_num;
+ msg->im6_msgtype = assert;
+ if (assert == MRT6MSG_WRMIFWHOLE)
+ msg->im6_mif = mifi;
+ else
+ msg->im6_mif = READ_ONCE(mrt->mroute_reg_vif_num);
msg->im6_pad = 0;
msg->im6_src = ipv6_hdr(pkt)->saddr;
msg->im6_dst = ipv6_hdr(pkt)->daddr;
@@ -1097,10 +1136,8 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
- rcu_read_lock();
mroute6_sk = rcu_dereference(mrt->mroute_sk);
if (!mroute6_sk) {
- rcu_read_unlock();
kfree_skb(skb);
return -EINVAL;
}
@@ -1109,7 +1146,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
/* Deliver to user space multicast routing algorithms */
ret = sock_queue_rcv_skb(mroute6_sk, skb);
- rcu_read_unlock();
+
if (ret < 0) {
net_warn_ratelimited("mroute6: pending queue full, dropping entries\n");
kfree_skb(skb);
@@ -1120,7 +1157,7 @@ static int ip6mr_cache_report(struct mr_table *mrt, struct sk_buff *pkt,
/* Queue a packet for resolution. It gets locked cache entry! */
static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
- struct sk_buff *skb)
+ struct sk_buff *skb, struct net_device *dev)
{
struct mfc6_cache *c;
bool found = false;
@@ -1140,8 +1177,8 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
* Create a new entry if allowable
*/
- if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
- (c = ip6mr_cache_alloc_unres()) == NULL) {
+ c = ip6mr_cache_alloc_unres();
+ if (!c) {
spin_unlock_bh(&mfc_unres_lock);
kfree_skb(skb);
@@ -1180,6 +1217,10 @@ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi,
kfree_skb(skb);
err = -ENOBUFS;
} else {
+ if (dev) {
+ skb->dev = dev;
+ skb->skb_iif = dev->ifindex;
+ }
skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
err = 0;
}
@@ -1229,7 +1270,7 @@ static int ip6mr_device_event(struct notifier_block *this,
ip6mr_for_each_table(mrt, net) {
v = &mrt->vif_table[0];
for (ct = 0; ct < mrt->maxvif; ct++, v++) {
- if (v->dev == dev)
+ if (rcu_access_pointer(v->dev) == dev)
mif6_delete(mrt, ct, 1, NULL);
}
}
@@ -1237,17 +1278,16 @@ static int ip6mr_device_event(struct notifier_block *this,
return NOTIFY_DONE;
}
-static unsigned int ip6mr_seq_read(struct net *net)
+static unsigned int ip6mr_seq_read(const struct net *net)
{
- ASSERT_RTNL();
-
- return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net);
+ return READ_ONCE(net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net);
}
-static int ip6mr_dump(struct net *net, struct notifier_block *nb)
+static int ip6mr_dump(struct net *net, struct notifier_block *nb,
+ struct netlink_ext_ack *extack)
{
return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump,
- ip6mr_mr_table_iter, &mrt_lock);
+ ip6mr_mr_table_iter, extack);
}
static struct notifier_block ip6_mr_notifier = {
@@ -1311,7 +1351,9 @@ static int __net_init ip6mr_net_init(struct net *net)
proc_cache_fail:
remove_proc_entry("ip6_mr_vif", net->proc_net);
proc_vif_fail:
+ rtnl_lock();
ip6mr_rules_exit(net);
+ rtnl_unlock();
#endif
ip6mr_rules_fail:
ip6mr_notifier_exit(net);
@@ -1324,23 +1366,36 @@ static void __net_exit ip6mr_net_exit(struct net *net)
remove_proc_entry("ip6_mr_cache", net->proc_net);
remove_proc_entry("ip6_mr_vif", net->proc_net);
#endif
- ip6mr_rules_exit(net);
ip6mr_notifier_exit(net);
}
+static void __net_exit ip6mr_net_exit_batch(struct list_head *net_list)
+{
+ struct net *net;
+
+ rtnl_lock();
+ list_for_each_entry(net, net_list, exit_list)
+ ip6mr_rules_exit(net);
+ rtnl_unlock();
+}
+
static struct pernet_operations ip6mr_net_ops = {
.init = ip6mr_net_init,
.exit = ip6mr_net_exit,
+ .exit_batch = ip6mr_net_exit_batch,
+};
+
+static const struct rtnl_msg_handler ip6mr_rtnl_msg_handlers[] __initconst_or_module = {
+ {.owner = THIS_MODULE, .protocol = RTNL_FAMILY_IP6MR,
+ .msgtype = RTM_GETROUTE,
+ .doit = ip6mr_rtm_getroute, .dumpit = ip6mr_rtm_dumproute},
};
int __init ip6_mr_init(void)
{
int err;
- mrt_cachep = kmem_cache_create("ip6_mrt_cache",
- sizeof(struct mfc6_cache),
- 0, SLAB_HWCACHE_ALIGN,
- NULL);
+ mrt_cachep = KMEM_CACHE(mfc6_cache, SLAB_HWCACHE_ALIGN);
if (!mrt_cachep)
return -ENOMEM;
@@ -1358,9 +1413,8 @@ int __init ip6_mr_init(void)
goto add_proto_fail;
}
#endif
- err = rtnl_register_module(THIS_MODULE, RTNL_FAMILY_IP6MR, RTM_GETROUTE,
- NULL, ip6mr_rtm_dumproute, 0);
- if (err == 0)
+ err = rtnl_register_many(ip6mr_rtnl_msg_handlers);
+ if (!err)
return 0;
#ifdef CONFIG_IPV6_PIMSM_V2
@@ -1375,9 +1429,9 @@ reg_pernet_fail:
return err;
}
-void ip6_mr_cleanup(void)
+void __init ip6_mr_cleanup(void)
{
- rtnl_unregister(RTNL_FAMILY_IP6MR, RTM_GETROUTE);
+ rtnl_unregister_many(ip6mr_rtnl_msg_handlers);
#ifdef CONFIG_IPV6_PIMSM_V2
inet6_del_protocol(&pim6_protocol, IPPROTO_PIM);
#endif
@@ -1410,12 +1464,12 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
&mfc->mf6cc_mcastgrp.sin6_addr, parent);
rcu_read_unlock();
if (c) {
- write_lock_bh(&mrt_lock);
+ spin_lock(&mrt_lock);
c->_c.mfc_parent = mfc->mf6cc_parent;
ip6mr_update_thresholds(mrt, &c->_c, ttls);
if (!mrtsock)
c->_c.mfc_flags |= MFC_STATIC;
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE,
c, mrt->id);
mr6_netlink_event(mrt, c, RTM_NEWROUTE);
@@ -1462,7 +1516,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
}
}
if (list_empty(&mrt->mfc_unres_queue))
- del_timer(&mrt->ipmr_expire_timer);
+ timer_delete(&mrt->ipmr_expire_timer);
spin_unlock_bh(&mfc_unres_lock);
if (found) {
@@ -1479,43 +1533,51 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt,
* Close the multicast socket, and clear the vif tables etc
*/
-static void mroute_clean_tables(struct mr_table *mrt, bool all)
+static void mroute_clean_tables(struct mr_table *mrt, int flags)
{
struct mr_mfc *c, *tmp;
LIST_HEAD(list);
int i;
/* Shut down all active vif entries */
- for (i = 0; i < mrt->maxvif; i++) {
- if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
- continue;
- mif6_delete(mrt, i, 0, &list);
+ if (flags & (MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC)) {
+ for (i = 0; i < mrt->maxvif; i++) {
+ if (((mrt->vif_table[i].flags & VIFF_STATIC) &&
+ !(flags & MRT6_FLUSH_MIFS_STATIC)) ||
+ (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS)))
+ continue;
+ mif6_delete(mrt, i, 0, &list);
+ }
+ unregister_netdevice_many(&list);
}
- unregister_netdevice_many(&list);
/* Wipe the cache */
- list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
- if (!all && (c->mfc_flags & MFC_STATIC))
- continue;
- rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params);
- list_del_rcu(&c->list);
- mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE);
- mr_cache_put(c);
- }
-
- if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
- spin_lock_bh(&mfc_unres_lock);
- list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
- list_del(&c->list);
+ if (flags & (MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC)) {
+ list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
+ if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC_STATIC)) ||
+ (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC)))
+ continue;
+ rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params);
+ list_del_rcu(&c->list);
call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net),
FIB_EVENT_ENTRY_DEL,
- (struct mfc6_cache *)c,
- mrt->id);
- mr6_netlink_event(mrt, (struct mfc6_cache *)c,
- RTM_DELROUTE);
- ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c);
+ (struct mfc6_cache *)c, mrt->id);
+ mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE);
+ mr_cache_put(c);
+ }
+ }
+
+ if (flags & MRT6_FLUSH_MFC) {
+ if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
+ spin_lock_bh(&mfc_unres_lock);
+ list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
+ list_del(&c->list);
+ mr6_netlink_event(mrt, (struct mfc6_cache *)c,
+ RTM_DELROUTE);
+ ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c);
+ }
+ spin_unlock_bh(&mfc_unres_lock);
}
- spin_unlock_bh(&mfc_unres_lock);
}
}
@@ -1525,15 +1587,15 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
struct net *net = sock_net(sk);
rtnl_lock();
- write_lock_bh(&mrt_lock);
+ spin_lock(&mrt_lock);
if (rtnl_dereference(mrt->mroute_sk)) {
err = -EADDRINUSE;
} else {
rcu_assign_pointer(mrt->mroute_sk, sk);
sock_set_flag(sk, SOCK_RCU_FREE);
- net->ipv6.devconf_all->mc_forwarding++;
+ atomic_inc(&net->ipv6.devconf_all->mc_forwarding);
}
- write_unlock_bh(&mrt_lock);
+ spin_unlock(&mrt_lock);
if (!err)
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
@@ -1547,31 +1609,36 @@ static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk)
int ip6mr_sk_done(struct sock *sk)
{
- int err = -EACCES;
struct net *net = sock_net(sk);
+ struct ipv6_devconf *devconf;
struct mr_table *mrt;
+ int err = -EACCES;
if (sk->sk_type != SOCK_RAW ||
inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
return err;
+ devconf = net->ipv6.devconf_all;
+ if (!devconf || !atomic_read(&devconf->mc_forwarding))
+ return err;
+
rtnl_lock();
ip6mr_for_each_table(mrt, net) {
if (sk == rtnl_dereference(mrt->mroute_sk)) {
- write_lock_bh(&mrt_lock);
+ spin_lock(&mrt_lock);
RCU_INIT_POINTER(mrt->mroute_sk, NULL);
/* Note that mroute_sk had SOCK_RCU_FREE set,
* so the RCU grace period before sk freeing
* is guaranteed by sk_destruct()
*/
- net->ipv6.devconf_all->mc_forwarding--;
- write_unlock_bh(&mrt_lock);
+ atomic_dec(&devconf->mc_forwarding);
+ spin_unlock(&mrt_lock);
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
NETCONFA_MC_FORWARDING,
NETCONFA_IFINDEX_ALL,
net->ipv6.devconf_all);
- mroute_clean_tables(mrt, false);
+ mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC);
err = 0;
break;
}
@@ -1604,7 +1671,8 @@ EXPORT_SYMBOL(mroute6_is_socket);
* MOSPF/PIM router set up we can clean this up.
*/
-int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
+int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval,
+ unsigned int optlen)
{
int ret, parent = 0;
struct mif6ctl vif;
@@ -1640,7 +1708,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
case MRT6_ADD_MIF:
if (optlen < sizeof(vif))
return -EINVAL;
- if (copy_from_user(&vif, optval, sizeof(vif)))
+ if (copy_from_sockptr(&vif, optval, sizeof(vif)))
return -EFAULT;
if (vif.mif6c_mifi >= MAXMIFS)
return -ENFILE;
@@ -1653,7 +1721,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
case MRT6_DEL_MIF:
if (optlen < sizeof(mifi_t))
return -EINVAL;
- if (copy_from_user(&mifi, optval, sizeof(mifi_t)))
+ if (copy_from_sockptr(&mifi, optval, sizeof(mifi_t)))
return -EFAULT;
rtnl_lock();
ret = mif6_delete(mrt, mifi, 0, NULL);
@@ -1667,12 +1735,12 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
case MRT6_ADD_MFC:
case MRT6_DEL_MFC:
parent = -1;
- /* fall through */
+ fallthrough;
case MRT6_ADD_MFC_PROXY:
case MRT6_DEL_MFC_PROXY:
if (optlen < sizeof(mfc))
return -EINVAL;
- if (copy_from_user(&mfc, optval, sizeof(mfc)))
+ if (copy_from_sockptr(&mfc, optval, sizeof(mfc)))
return -EFAULT;
if (parent == 0)
parent = mfc.mf6cc_parent;
@@ -1687,6 +1755,20 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
rtnl_unlock();
return ret;
+ case MRT6_FLUSH:
+ {
+ int flags;
+
+ if (optlen != sizeof(flags))
+ return -EINVAL;
+ if (copy_from_sockptr(&flags, optval, sizeof(flags)))
+ return -EFAULT;
+ rtnl_lock();
+ mroute_clean_tables(mrt, flags);
+ rtnl_unlock();
+ return 0;
+ }
+
/*
* Control PIM assert (to activate pim will activate assert)
*/
@@ -1696,7 +1778,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
if (optlen != sizeof(v))
return -EINVAL;
- if (get_user(v, (int __user *)optval))
+ if (copy_from_sockptr(&v, optval, sizeof(v)))
return -EFAULT;
mrt->mroute_do_assert = v;
return 0;
@@ -1705,18 +1787,22 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
#ifdef CONFIG_IPV6_PIMSM_V2
case MRT6_PIM:
{
+ bool do_wrmifwhole;
int v;
if (optlen != sizeof(v))
return -EINVAL;
- if (get_user(v, (int __user *)optval))
+ if (copy_from_sockptr(&v, optval, sizeof(v)))
return -EFAULT;
+
+ do_wrmifwhole = (v == MRT6MSG_WRMIFWHOLE);
v = !!v;
rtnl_lock();
ret = 0;
if (v != mrt->mroute_do_pim) {
mrt->mroute_do_pim = v;
mrt->mroute_do_assert = v;
+ mrt->mroute_do_wrvifwhole = do_wrmifwhole;
}
rtnl_unlock();
return ret;
@@ -1730,7 +1816,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
if (optlen != sizeof(u32))
return -EINVAL;
- if (get_user(v, (u32 __user *)optval))
+ if (copy_from_sockptr(&v, optval, sizeof(v)))
return -EFAULT;
/* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */
if (v != RT_TABLE_DEFAULT && v >= 100000000)
@@ -1762,8 +1848,8 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
* Getsock opt support for the multicast routing system.
*/
-int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
- int __user *optlen)
+int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval,
+ sockptr_t optlen)
{
int olr;
int val;
@@ -1794,16 +1880,16 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
return -ENOPROTOOPT;
}
- if (get_user(olr, optlen))
+ if (copy_from_sockptr(&olr, optlen, sizeof(int)))
return -EFAULT;
olr = min_t(int, olr, sizeof(int));
if (olr < 0)
return -EINVAL;
- if (put_user(olr, optlen))
+ if (copy_to_sockptr(optlen, &olr, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &val, olr))
+ if (copy_to_sockptr(optval, &val, olr))
return -EFAULT;
return 0;
}
@@ -1811,11 +1897,10 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
/*
* The IP multicast ioctl support routines.
*/
-
-int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
+int ip6mr_ioctl(struct sock *sk, int cmd, void *arg)
{
- struct sioc_sg_req6 sr;
- struct sioc_mif_req6 vr;
+ struct sioc_sg_req6 *sr;
+ struct sioc_mif_req6 *vr;
struct vif_device *vif;
struct mfc6_cache *c;
struct net *net = sock_net(sk);
@@ -1827,39 +1912,33 @@ int ip6mr_ioctl(struct sock *sk, int cmd, void __user *arg)
switch (cmd) {
case SIOCGETMIFCNT_IN6:
- if (copy_from_user(&vr, arg, sizeof(vr)))
- return -EFAULT;
- if (vr.mifi >= mrt->maxvif)
+ vr = (struct sioc_mif_req6 *)arg;
+ if (vr->mifi >= mrt->maxvif)
return -EINVAL;
- read_lock(&mrt_lock);
- vif = &mrt->vif_table[vr.mifi];
- if (VIF_EXISTS(mrt, vr.mifi)) {
- vr.icount = vif->pkt_in;
- vr.ocount = vif->pkt_out;
- vr.ibytes = vif->bytes_in;
- vr.obytes = vif->bytes_out;
- read_unlock(&mrt_lock);
-
- if (copy_to_user(arg, &vr, sizeof(vr)))
- return -EFAULT;
+ vr->mifi = array_index_nospec(vr->mifi, mrt->maxvif);
+ rcu_read_lock();
+ vif = &mrt->vif_table[vr->mifi];
+ if (VIF_EXISTS(mrt, vr->mifi)) {
+ vr->icount = READ_ONCE(vif->pkt_in);
+ vr->ocount = READ_ONCE(vif->pkt_out);
+ vr->ibytes = READ_ONCE(vif->bytes_in);
+ vr->obytes = READ_ONCE(vif->bytes_out);
+ rcu_read_unlock();
return 0;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EADDRNOTAVAIL;
case SIOCGETSGCNT_IN6:
- if (copy_from_user(&sr, arg, sizeof(sr)))
- return -EFAULT;
+ sr = (struct sioc_sg_req6 *)arg;
rcu_read_lock();
- c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr);
+ c = ip6mr_cache_find(mrt, &sr->src.sin6_addr,
+ &sr->grp.sin6_addr);
if (c) {
- sr.pktcnt = c->_c.mfc_un.res.pkt;
- sr.bytecnt = c->_c.mfc_un.res.bytes;
- sr.wrong_if = c->_c.mfc_un.res.wrong_if;
+ sr->pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt);
+ sr->bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes);
+ sr->wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if);
rcu_read_unlock();
-
- if (copy_to_user(arg, &sr, sizeof(sr)))
- return -EFAULT;
return 0;
}
rcu_read_unlock();
@@ -1905,20 +1984,21 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
return -EFAULT;
if (vr.mifi >= mrt->maxvif)
return -EINVAL;
- read_lock(&mrt_lock);
+ vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif);
+ rcu_read_lock();
vif = &mrt->vif_table[vr.mifi];
if (VIF_EXISTS(mrt, vr.mifi)) {
- vr.icount = vif->pkt_in;
- vr.ocount = vif->pkt_out;
- vr.ibytes = vif->bytes_in;
- vr.obytes = vif->bytes_out;
- read_unlock(&mrt_lock);
+ vr.icount = READ_ONCE(vif->pkt_in);
+ vr.ocount = READ_ONCE(vif->pkt_out);
+ vr.ibytes = READ_ONCE(vif->bytes_in);
+ vr.obytes = READ_ONCE(vif->bytes_out);
+ rcu_read_unlock();
if (copy_to_user(arg, &vr, sizeof(vr)))
return -EFAULT;
return 0;
}
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -EADDRNOTAVAIL;
case SIOCGETSGCNT_IN6:
if (copy_from_user(&sr, arg, sizeof(sr)))
@@ -1927,9 +2007,9 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
rcu_read_lock();
c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr);
if (c) {
- sr.pktcnt = c->_c.mfc_un.res.pkt;
- sr.bytecnt = c->_c.mfc_un.res.bytes;
- sr.wrong_if = c->_c.mfc_un.res.wrong_if;
+ sr.pktcnt = atomic_long_read(&c->_c.mfc_un.res.pkt);
+ sr.bytecnt = atomic_long_read(&c->_c.mfc_un.res.bytes);
+ sr.wrong_if = atomic_long_read(&c->_c.mfc_un.res.wrong_if);
rcu_read_unlock();
if (copy_to_user(arg, &sr, sizeof(sr)))
@@ -1946,10 +2026,8 @@ int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_OUTFORWDATAGRAMS);
- __IP6_ADD_STATS(net, ip6_dst_idev(skb_dst(skb)),
- IPSTATS_MIB_OUTOCTETS, skb->len);
+ IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_OUTFORWDATAGRAMS);
return dst_output(net, sk, skb);
}
@@ -1957,26 +2035,27 @@ static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct
* Processing handlers for ip6mr_forward
*/
-static int ip6mr_forward2(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc6_cache *c, int vifi)
+static int ip6mr_prepare_xmit(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, int vifi)
{
- struct ipv6hdr *ipv6h;
struct vif_device *vif = &mrt->vif_table[vifi];
- struct net_device *dev;
+ struct net_device *vif_dev;
+ struct ipv6hdr *ipv6h;
struct dst_entry *dst;
struct flowi6 fl6;
- if (!vif->dev)
- goto out_free;
+ vif_dev = vif_dev_read(vif);
+ if (!vif_dev)
+ return -1;
#ifdef CONFIG_IPV6_PIMSM_V2
if (vif->flags & MIFF_REGISTER) {
- vif->pkt_out++;
- vif->bytes_out += skb->len;
- vif->dev->stats.tx_bytes += skb->len;
- vif->dev->stats.tx_packets++;
+ WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+ WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
+ DEV_STATS_ADD(vif_dev, tx_bytes, skb->len);
+ DEV_STATS_INC(vif_dev, tx_packets);
ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT);
- goto out_free;
+ return -1;
}
#endif
@@ -1990,7 +2069,7 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt,
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
dst_release(dst);
- goto out_free;
+ return -1;
}
skb_dst_drop(skb);
@@ -2007,52 +2086,78 @@ static int ip6mr_forward2(struct net *net, struct mr_table *mrt,
* not mrouter) cannot join to more than one interface - it will
* result in receiving multiple packets.
*/
- dev = vif->dev;
- skb->dev = dev;
- vif->pkt_out++;
- vif->bytes_out += skb->len;
+ skb->dev = vif_dev;
+ WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1);
+ WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len);
/* We are about to write */
/* XXX: extension headers? */
- if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(dev)))
- goto out_free;
+ if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev)))
+ return -1;
ipv6h = ipv6_hdr(skb);
ipv6h->hop_limit--;
+ return 0;
+}
+
+static void ip6mr_forward2(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, int vifi)
+{
+ struct net_device *indev = skb->dev;
+
+ if (ip6mr_prepare_xmit(net, mrt, skb, vifi))
+ goto out_free;
IP6CB(skb)->flags |= IP6SKB_FORWARDED;
- return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
- net, NULL, skb, skb->dev, dev,
- ip6mr_forward2_finish);
+ NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
+ net, NULL, skb, indev, skb->dev,
+ ip6mr_forward2_finish);
+ return;
out_free:
kfree_skb(skb);
- return 0;
}
+static void ip6mr_output2(struct net *net, struct mr_table *mrt,
+ struct sk_buff *skb, int vifi)
+{
+ if (ip6mr_prepare_xmit(net, mrt, skb, vifi))
+ goto out_free;
+
+ ip6_output(net, NULL, skb);
+ return;
+
+out_free:
+ kfree_skb(skb);
+}
+
+/* Called with rcu_read_lock() */
static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev)
{
int ct;
- for (ct = mrt->maxvif - 1; ct >= 0; ct--) {
- if (mrt->vif_table[ct].dev == dev)
+ /* Pairs with WRITE_ONCE() in mif6_delete()/mif6_add() */
+ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) {
+ if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev)
break;
}
return ct;
}
+/* Called under rcu_read_lock() */
static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
- struct sk_buff *skb, struct mfc6_cache *c)
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc6_cache *c)
{
int psend = -1;
int vif, ct;
- int true_vifi = ip6mr_find_vif(mrt, skb->dev);
+ int true_vifi = ip6mr_find_vif(mrt, dev);
vif = c->_c.mfc_parent;
- c->_c.mfc_un.res.pkt++;
- c->_c.mfc_un.res.bytes += skb->len;
- c->_c.mfc_un.res.lastuse = jiffies;
+ atomic_long_inc(&c->_c.mfc_un.res.pkt);
+ atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes);
+ WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies);
if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) {
struct mfc6_cache *cache_proxy;
@@ -2060,21 +2165,17 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
/* For an (*,G) entry, we only check that the incoming
* interface is part of the static tree.
*/
- rcu_read_lock();
cache_proxy = mr_mfc_find_any_parent(mrt, vif);
if (cache_proxy &&
- cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) {
- rcu_read_unlock();
+ cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255)
goto forward;
- }
- rcu_read_unlock();
}
/*
* Wrong interface: drop packet and (maybe) send PIM assert.
*/
- if (mrt->vif_table[vif].dev != skb->dev) {
- c->_c.mfc_un.res.wrong_if++;
+ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) {
+ atomic_long_inc(&c->_c.mfc_un.res.wrong_if);
if (true_vifi >= 0 && mrt->mroute_do_assert &&
/* pimsm uses asserts, when switching from RPT to SPT,
@@ -2089,13 +2190,18 @@ static void ip6_mr_forward(struct net *net, struct mr_table *mrt,
MFC_ASSERT_THRESH)) {
c->_c.mfc_un.res.last_assert = jiffies;
ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF);
+ if (mrt->mroute_do_wrvifwhole)
+ ip6mr_cache_report(mrt, skb, true_vifi,
+ MRT6MSG_WRMIFWHOLE);
}
goto dont_forward;
}
forward:
- mrt->vif_table[vif].pkt_in++;
- mrt->vif_table[vif].bytes_in += skb->len;
+ WRITE_ONCE(mrt->vif_table[vif].pkt_in,
+ mrt->vif_table[vif].pkt_in + 1);
+ WRITE_ONCE(mrt->vif_table[vif].bytes_in,
+ mrt->vif_table[vif].bytes_in + skb->len);
/*
* Forward the frame
@@ -2123,15 +2229,14 @@ forward:
if (psend != -1) {
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
- ip6mr_forward2(net, mrt, skb2,
- c, psend);
+ ip6mr_forward2(net, mrt, skb2, psend);
}
psend = ct;
}
}
last_forward:
if (psend != -1) {
- ip6mr_forward2(net, mrt, skb, c, psend);
+ ip6mr_forward2(net, mrt, skb, psend);
return;
}
@@ -2139,6 +2244,56 @@ dont_forward:
kfree_skb(skb);
}
+/* Called under rcu_read_lock() */
+static void ip6_mr_output_finish(struct net *net, struct mr_table *mrt,
+ struct net_device *dev, struct sk_buff *skb,
+ struct mfc6_cache *c)
+{
+ int psend = -1;
+ int ct;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ atomic_long_inc(&c->_c.mfc_un.res.pkt);
+ atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes);
+ WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies);
+
+ /* Forward the frame */
+ if (ipv6_addr_any(&c->mf6c_origin) &&
+ ipv6_addr_any(&c->mf6c_mcastgrp)) {
+ if (ipv6_hdr(skb)->hop_limit >
+ c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
+ /* It's an (*,*) entry and the packet is not coming from
+ * the upstream: forward the packet to the upstream
+ * only.
+ */
+ psend = c->_c.mfc_parent;
+ goto last_forward;
+ }
+ goto dont_forward;
+ }
+ for (ct = c->_c.mfc_un.res.maxvif - 1;
+ ct >= c->_c.mfc_un.res.minvif; ct--) {
+ if (ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) {
+ if (psend != -1) {
+ struct sk_buff *skb2;
+
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (skb2)
+ ip6mr_output2(net, mrt, skb2, psend);
+ }
+ psend = ct;
+ }
+ }
+last_forward:
+ if (psend != -1) {
+ ip6mr_output2(net, mrt, skb, psend);
+ return;
+ }
+
+dont_forward:
+ kfree_skb(skb);
+}
/*
* Multicast packets for forwarding arrive here
@@ -2146,26 +2301,37 @@ dont_forward:
int ip6_mr_input(struct sk_buff *skb)
{
+ struct net_device *dev = skb->dev;
+ struct net *net = dev_net_rcu(dev);
struct mfc6_cache *cache;
- struct net *net = dev_net(skb->dev);
struct mr_table *mrt;
struct flowi6 fl6 = {
- .flowi6_iif = skb->dev->ifindex,
+ .flowi6_iif = dev->ifindex,
.flowi6_mark = skb->mark,
};
int err;
+ /* skb->dev passed in is the master dev for vrfs.
+ * Get the proper interface that does have a vif associated with it.
+ */
+ if (netif_is_l3_master(dev)) {
+ dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
+ if (!dev) {
+ kfree_skb(skb);
+ return -ENODEV;
+ }
+ }
+
err = ip6mr_fib_lookup(net, &fl6, &mrt);
if (err < 0) {
kfree_skb(skb);
return err;
}
- read_lock(&mrt_lock);
cache = ip6mr_cache_find(mrt,
&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr);
if (!cache) {
- int vif = ip6mr_find_vif(mrt, skb->dev);
+ int vif = ip6mr_find_vif(mrt, dev);
if (vif >= 0)
cache = ip6mr_cache_find_any(mrt,
@@ -2179,23 +2345,74 @@ int ip6_mr_input(struct sk_buff *skb)
if (!cache) {
int vif;
- vif = ip6mr_find_vif(mrt, skb->dev);
+ vif = ip6mr_find_vif(mrt, dev);
if (vif >= 0) {
- int err = ip6mr_cache_unresolved(mrt, vif, skb);
- read_unlock(&mrt_lock);
+ int err = ip6mr_cache_unresolved(mrt, vif, skb, dev);
return err;
}
- read_unlock(&mrt_lock);
kfree_skb(skb);
return -ENODEV;
}
- ip6_mr_forward(net, mrt, skb, cache);
+ ip6_mr_forward(net, mrt, dev, skb, cache);
- read_unlock(&mrt_lock);
+ return 0;
+}
+
+int ip6_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct net_device *dev = skb_dst(skb)->dev;
+ struct flowi6 fl6 = (struct flowi6) {
+ .flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_mark = skb->mark,
+ };
+ struct mfc6_cache *cache;
+ struct mr_table *mrt;
+ int err;
+ int vif;
+ guard(rcu)();
+
+ if (IP6CB(skb)->flags & IP6SKB_FORWARDED)
+ goto ip6_output;
+ if (!(IP6CB(skb)->flags & IP6SKB_MCROUTE))
+ goto ip6_output;
+
+ err = ip6mr_fib_lookup(net, &fl6, &mrt);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ cache = ip6mr_cache_find(mrt,
+ &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr);
+ if (!cache) {
+ vif = ip6mr_find_vif(mrt, dev);
+ if (vif >= 0)
+ cache = ip6mr_cache_find_any(mrt,
+ &ipv6_hdr(skb)->daddr,
+ vif);
+ }
+
+ /* No usable cache entry */
+ if (!cache) {
+ vif = ip6mr_find_vif(mrt, dev);
+ if (vif >= 0)
+ return ip6mr_cache_unresolved(mrt, vif, skb, dev);
+ goto ip6_output;
+ }
+
+ /* Wrong interface */
+ vif = cache->_c.mfc_parent;
+ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev)
+ goto ip6_output;
+
+ ip6_mr_output_finish(net, mrt, dev, skb, cache);
return 0;
+
+ip6_output:
+ return ip6_output(net, sk, skb);
}
int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
@@ -2204,13 +2421,15 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
int err;
struct mr_table *mrt;
struct mfc6_cache *cache;
- struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+ struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
- mrt = ip6mr_get_table(net, RT6_TABLE_DFLT);
- if (!mrt)
+ rcu_read_lock();
+ mrt = __ip6mr_get_table(net, RT6_TABLE_DFLT);
+ if (!mrt) {
+ rcu_read_unlock();
return -ENOENT;
+ }
- read_lock(&mrt_lock);
cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr);
if (!cache && skb->dev) {
int vif = ip6mr_find_vif(mrt, skb->dev);
@@ -2228,14 +2447,14 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
dev = skb->dev;
if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) {
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -ENODEV;
}
/* really correct? */
skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC);
if (!skb2) {
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return -ENOMEM;
}
@@ -2257,14 +2476,14 @@ int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm,
iph->saddr = rt->rt6i_src.addr;
iph->daddr = rt->rt6i_dst.addr;
- err = ip6mr_cache_unresolved(mrt, vif, skb2);
- read_unlock(&mrt_lock);
+ err = ip6mr_cache_unresolved(mrt, vif, skb2, dev);
+ rcu_read_unlock();
return err;
}
err = mr_fill_mroute(mrt, skb, &cache->_c, rtm);
- read_unlock(&mrt_lock);
+ rcu_read_unlock();
return err;
}
@@ -2362,8 +2581,7 @@ static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc,
errout:
kfree_skb(skb);
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err);
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err);
}
static size_t mrt6msg_netlink_msgsize(size_t payloadlen)
@@ -2383,7 +2601,7 @@ static size_t mrt6msg_netlink_msgsize(size_t payloadlen)
return len;
}
-static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
+static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt)
{
struct net *net = read_pnet(&mrt->net);
struct nlmsghdr *nlh;
@@ -2431,8 +2649,126 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE_R, -ENOBUFS);
}
+static const struct nla_policy ip6mr_getroute_policy[RTA_MAX + 1] = {
+ [RTA_SRC] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+ [RTA_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
+ [RTA_TABLE] = { .type = NLA_U32 },
+};
+
+static int ip6mr_rtm_valid_getroute_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct rtmsg *rtm;
+ int err;
+
+ err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, ip6mr_getroute_policy,
+ extack);
+ if (err)
+ return err;
+
+ rtm = nlmsg_data(nlh);
+ if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
+ (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
+ rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol ||
+ rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Invalid values in header for multicast route get request");
+ return -EINVAL;
+ }
+
+ if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
+ (tb[RTA_DST] && !rtm->rtm_dst_len)) {
+ NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct in6_addr src = {}, grp = {};
+ struct nlattr *tb[RTA_MAX + 1];
+ struct mfc6_cache *cache;
+ struct mr_table *mrt;
+ struct sk_buff *skb;
+ u32 tableid;
+ int err;
+
+ err = ip6mr_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
+ if (err < 0)
+ return err;
+
+ if (tb[RTA_SRC])
+ src = nla_get_in6_addr(tb[RTA_SRC]);
+ if (tb[RTA_DST])
+ grp = nla_get_in6_addr(tb[RTA_DST]);
+ tableid = nla_get_u32_default(tb[RTA_TABLE], 0);
+
+ mrt = __ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT);
+ if (!mrt) {
+ NL_SET_ERR_MSG_MOD(extack, "MR table does not exist");
+ return -ENOENT;
+ }
+
+ /* entries are added/deleted only under RTNL */
+ rcu_read_lock();
+ cache = ip6mr_cache_find(mrt, &src, &grp);
+ rcu_read_unlock();
+ if (!cache) {
+ NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found");
+ return -ENOENT;
+ }
+
+ skb = nlmsg_new(mr6_msgsize(false, mrt->maxvif), GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ err = ip6mr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0);
+ if (err < 0) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
+}
+
static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
+ const struct nlmsghdr *nlh = cb->nlh;
+ struct fib_dump_filter filter = {
+ .rtnl_held = true,
+ };
+ int err;
+
+ if (cb->strict_check) {
+ err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh,
+ &filter, cb);
+ if (err < 0)
+ return err;
+ }
+
+ if (filter.table_id) {
+ struct mr_table *mrt;
+
+ mrt = __ip6mr_get_table(sock_net(skb->sk), filter.table_id);
+ if (!mrt) {
+ if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR)
+ return skb->len;
+
+ NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist");
+ return -ENOENT;
+ }
+ err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute,
+ &mfc_unres_lock, &filter);
+ return skb->len ? : err;
+ }
+
return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter,
- _ip6mr_fill_mroute, &mfc_unres_lock);
+ _ip6mr_fill_mroute, &mfc_unres_lock, &filter);
}
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c
index 54d165b9845a..8607569de34f 100644
--- a/net/ipv6/ipcomp6.c
+++ b/net/ipv6/ipcomp6.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173
*
* Copyright (C)2003 USAGI/WIDE Project
*
* Author Mitsuru KANDA <mk@linux-ipv6.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
/*
* [Memo]
@@ -83,6 +71,7 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
}
+static struct lock_class_key xfrm_state_lock_key;
static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
{
struct net *net = xs_net(x);
@@ -91,6 +80,7 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
t = xfrm_state_alloc(net);
if (!t)
goto out;
+ lockdep_set_class(&t->lock, &xfrm_state_lock_key);
t->id.proto = IPPROTO_IPV6;
t->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr);
@@ -103,6 +93,7 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x)
t->props.mode = x->props.mode;
memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr));
memcpy(&t->mark, &x->mark, sizeof(t->mark));
+ t->if_id = x->if_id;
if (xfrm_init_state(t))
goto error;
@@ -147,7 +138,8 @@ out:
return err;
}
-static int ipcomp6_init_state(struct xfrm_state *x)
+static int ipcomp6_init_state(struct xfrm_state *x,
+ struct netlink_ext_ack *extack)
{
int err = -EINVAL;
@@ -159,17 +151,20 @@ static int ipcomp6_init_state(struct xfrm_state *x)
x->props.header_len += sizeof(struct ipv6hdr);
break;
default:
+ NL_SET_ERR_MSG(extack, "Unsupported XFRM mode for IPcomp");
goto out;
}
- err = ipcomp_init_state(x);
+ err = ipcomp_init_state(x, extack);
if (err)
goto out;
if (x->props.mode == XFRM_MODE_TUNNEL) {
err = ipcomp6_tunnel_attach(x);
- if (err)
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Kernel error: failed to initialize the associated state");
goto out;
+ }
}
err = 0;
@@ -183,18 +178,17 @@ static int ipcomp6_rcv_cb(struct sk_buff *skb, int err)
}
static const struct xfrm_type ipcomp6_type = {
- .description = "IPCOMP6",
.owner = THIS_MODULE,
.proto = IPPROTO_COMP,
.init_state = ipcomp6_init_state,
.destructor = ipcomp_destroy,
.input = ipcomp_input,
.output = ipcomp_output,
- .hdr_offset = xfrm6_find_1stfragopt,
};
static struct xfrm6_protocol ipcomp6_protocol = {
.handler = xfrm6_rcv,
+ .input_handler = xfrm_input,
.cb_handler = ipcomp6_rcv_cb,
.err_handler = ipcomp6_err,
.priority = 0,
@@ -218,8 +212,7 @@ static void __exit ipcomp6_fini(void)
{
if (xfrm6_protocol_deregister(&ipcomp6_protocol, IPPROTO_COMP) < 0)
pr_info("%s: can't remove protocol\n", __func__);
- if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type\n", __func__);
+ xfrm_unregister_type(&ipcomp6_type, AF_INET6);
}
module_init(ipcomp6_init);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index c0cac9cc3a28..a61e742794f9 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 BSD socket options interface
* Linux INET6 implementation
@@ -7,11 +8,6 @@
*
* Based on linux/net/ipv4/ip_sockglue.c
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* FIXME: Make the setsockopt code POSIX compliant: That is
*
* o Truncate getsockopt returns
@@ -53,12 +49,15 @@
#include <net/xfrm.h>
#include <net/compat.h>
#include <net/seg6.h>
+#include <net/psp.h>
#include <linux/uaccess.h>
struct ip6_ra_chain *ip6_ra_chain;
DEFINE_RWLOCK(ip6_ra_lock);
+DEFINE_STATIC_KEY_FALSE(ip6_min_hopcount);
+
int ip6_ra_control(struct sock *sk, int sel)
{
struct ip6_ra_chain *ra, *new_ra, **rap;
@@ -68,6 +67,8 @@ int ip6_ra_control(struct sock *sk, int sel)
return -ENOPROTOOPT;
new_ra = (sel >= 0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
+ if (sel >= 0 && !new_ra)
+ return -ENOMEM;
write_lock_bh(&ip6_ra_lock);
for (rap = &ip6_ra_chain; (ra = *rap) != NULL; rap = &ra->next) {
@@ -102,56 +103,290 @@ int ip6_ra_control(struct sock *sk, int sel)
struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
struct ipv6_txoptions *opt)
{
- if (inet_sk(sk)->is_icsk) {
+ if (inet_test_bit(IS_ICSK, sk)) {
if (opt &&
!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) {
struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen;
+
+ icsk->icsk_ext_hdr_len =
+ psp_sk_overhead(sk) +
+ opt->opt_flen + opt->opt_nflen;
icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
}
- opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt,
- opt);
+ opt = unrcu_pointer(xchg(&inet6_sk(sk)->opt, RCU_INITIALIZER(opt)));
sk_dst_reset(sk);
return opt;
}
-static bool setsockopt_needs_rtnl(int optname)
+static int copy_group_source_from_sockptr(struct group_source_req *greqs,
+ sockptr_t optval, int optlen)
{
- switch (optname) {
- case IPV6_ADDRFORM:
- case IPV6_ADD_MEMBERSHIP:
- case IPV6_DROP_MEMBERSHIP:
- case IPV6_JOIN_ANYCAST:
- case IPV6_LEAVE_ANYCAST:
- case MCAST_JOIN_GROUP:
- case MCAST_LEAVE_GROUP:
- case MCAST_JOIN_SOURCE_GROUP:
- case MCAST_LEAVE_SOURCE_GROUP:
- case MCAST_BLOCK_SOURCE:
- case MCAST_UNBLOCK_SOURCE:
- case MCAST_MSFILTER:
- return true;
+ if (in_compat_syscall()) {
+ struct compat_group_source_req gr32;
+
+ if (optlen < sizeof(gr32))
+ return -EINVAL;
+ if (copy_from_sockptr(&gr32, optval, sizeof(gr32)))
+ return -EFAULT;
+ greqs->gsr_interface = gr32.gsr_interface;
+ greqs->gsr_group = gr32.gsr_group;
+ greqs->gsr_source = gr32.gsr_source;
+ } else {
+ if (optlen < sizeof(*greqs))
+ return -EINVAL;
+ if (copy_from_sockptr(greqs, optval, sizeof(*greqs)))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int do_ipv6_mcast_group_source(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
+{
+ struct group_source_req greqs;
+ int omode, add;
+ int ret;
+
+ ret = copy_group_source_from_sockptr(&greqs, optval, optlen);
+ if (ret)
+ return ret;
+
+ if (greqs.gsr_group.ss_family != AF_INET6 ||
+ greqs.gsr_source.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
+
+ if (optname == MCAST_BLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 1;
+ } else if (optname == MCAST_UNBLOCK_SOURCE) {
+ omode = MCAST_EXCLUDE;
+ add = 0;
+ } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
+ struct sockaddr_in6 *psin6;
+ int retv;
+
+ psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
+ retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface,
+ &psin6->sin6_addr,
+ MCAST_INCLUDE);
+ /* prior join w/ different source is ok */
+ if (retv && retv != -EADDRINUSE)
+ return retv;
+ omode = MCAST_INCLUDE;
+ add = 1;
+ } else /* MCAST_LEAVE_SOURCE_GROUP */ {
+ omode = MCAST_INCLUDE;
+ add = 0;
}
- return false;
+ return ip6_mc_source(add, omode, sk, &greqs);
}
-static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
+static int ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
+ int optlen)
+{
+ struct group_filter *gsf;
+ int ret;
+
+ if (optlen < GROUP_FILTER_SIZE(0))
+ return -EINVAL;
+ if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
+ return -ENOBUFS;
+
+ gsf = memdup_sockptr(optval, optlen);
+ if (IS_ERR(gsf))
+ return PTR_ERR(gsf);
+
+ /* numsrc >= (4G-140)/128 overflow in 32 bits */
+ ret = -ENOBUFS;
+ if (gsf->gf_numsrc >= 0x1ffffffU ||
+ gsf->gf_numsrc > sysctl_mld_max_msf)
+ goto out_free_gsf;
+
+ ret = -EINVAL;
+ if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen)
+ goto out_free_gsf;
+
+ ret = ip6_mc_msfilter(sk, gsf, gsf->gf_slist_flex);
+out_free_gsf:
+ kfree(gsf);
+ return ret;
+}
+
+static int compat_ipv6_set_mcast_msfilter(struct sock *sk, sockptr_t optval,
+ int optlen)
+{
+ const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
+ struct compat_group_filter *gf32;
+ void *p;
+ int ret;
+ int n;
+
+ if (optlen < size0)
+ return -EINVAL;
+ if (optlen > READ_ONCE(sock_net(sk)->core.sysctl_optmem_max) - 4)
+ return -ENOBUFS;
+
+ p = kmalloc(optlen + 4, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ gf32 = p + 4; /* we want ->gf_group and ->gf_slist_flex aligned */
+ ret = -EFAULT;
+ if (copy_from_sockptr(gf32, optval, optlen))
+ goto out_free_p;
+
+ /* numsrc >= (4G-140)/128 overflow in 32 bits */
+ ret = -ENOBUFS;
+ n = gf32->gf_numsrc;
+ if (n >= 0x1ffffffU || n > sysctl_mld_max_msf)
+ goto out_free_p;
+
+ ret = -EINVAL;
+ if (offsetof(struct compat_group_filter, gf_slist_flex[n]) > optlen)
+ goto out_free_p;
+
+ ret = ip6_mc_msfilter(sk, &(struct group_filter){
+ .gf_interface = gf32->gf_interface,
+ .gf_group = gf32->gf_group,
+ .gf_fmode = gf32->gf_fmode,
+ .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist_flex);
+
+out_free_p:
+ kfree(p);
+ return ret;
+}
+
+static int ipv6_mcast_join_leave(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
+{
+ struct sockaddr_in6 *psin6;
+ struct group_req greq;
+
+ if (optlen < sizeof(greq))
+ return -EINVAL;
+ if (copy_from_sockptr(&greq, optval, sizeof(greq)))
+ return -EFAULT;
+
+ if (greq.gr_group.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
+ psin6 = (struct sockaddr_in6 *)&greq.gr_group;
+ if (optname == MCAST_JOIN_GROUP)
+ return ipv6_sock_mc_join(sk, greq.gr_interface,
+ &psin6->sin6_addr);
+ return ipv6_sock_mc_drop(sk, greq.gr_interface, &psin6->sin6_addr);
+}
+
+static int compat_ipv6_mcast_join_leave(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
+{
+ struct compat_group_req gr32;
+ struct sockaddr_in6 *psin6;
+
+ if (optlen < sizeof(gr32))
+ return -EINVAL;
+ if (copy_from_sockptr(&gr32, optval, sizeof(gr32)))
+ return -EFAULT;
+
+ if (gr32.gr_group.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
+ psin6 = (struct sockaddr_in6 *)&gr32.gr_group;
+ if (optname == MCAST_JOIN_GROUP)
+ return ipv6_sock_mc_join(sk, gr32.gr_interface,
+ &psin6->sin6_addr);
+ return ipv6_sock_mc_drop(sk, gr32.gr_interface, &psin6->sin6_addr);
+}
+
+static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval,
+ int optlen)
+{
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_opt_hdr *new = NULL;
+ struct net *net = sock_net(sk);
+ struct ipv6_txoptions *opt;
+ int err;
+
+ /* hop-by-hop / destination options are privileged option */
+ if (optname != IPV6_RTHDR && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW))
+ return -EPERM;
+
+ /* remove any sticky options header with a zero option
+ * length, per RFC3542.
+ */
+ if (optlen > 0) {
+ if (sockptr_is_null(optval))
+ return -EINVAL;
+ if (optlen < sizeof(struct ipv6_opt_hdr) ||
+ optlen & 0x7 ||
+ optlen > 8 * 255)
+ return -EINVAL;
+
+ new = memdup_sockptr(optval, optlen);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+ if (unlikely(ipv6_optlen(new) > optlen)) {
+ kfree(new);
+ return -EINVAL;
+ }
+ }
+
+ opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
+ opt = ipv6_renew_options(sk, opt, optname, new);
+ kfree(new);
+ if (IS_ERR(opt))
+ return PTR_ERR(opt);
+
+ /* routing header option needs extra check */
+ err = -EINVAL;
+ if (optname == IPV6_RTHDR && opt && opt->srcrt) {
+ struct ipv6_rt_hdr *rthdr = opt->srcrt;
+ switch (rthdr->type) {
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ case IPV6_SRCRT_TYPE_2:
+ if (rthdr->hdrlen != 2 || rthdr->segments_left != 1)
+ goto sticky_done;
+ break;
+#endif
+ case IPV6_SRCRT_TYPE_4:
+ {
+ struct ipv6_sr_hdr *srh =
+ (struct ipv6_sr_hdr *)opt->srcrt;
+
+ if (!seg6_validate_srh(srh, optlen, false))
+ goto sticky_done;
+ break;
+ }
+ default:
+ goto sticky_done;
+ }
+ }
+
+ err = 0;
+ opt = ipv6_update_options(sk, opt);
+sticky_done:
+ if (opt) {
+ atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
+ txopt_put(opt);
+ }
+ return err;
+}
+
+int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, unsigned int optlen)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct net *net = sock_net(sk);
- int val, valbool;
int retv = -ENOPROTOOPT;
- bool needs_rtnl = setsockopt_needs_rtnl(optname);
+ int val, valbool;
- if (!optval)
+ if (sockptr_is_null(optval))
val = 0;
else {
if (optlen >= sizeof(int)) {
- if (get_user(val, (int __user *) optval))
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
} else
val = 0;
@@ -162,9 +397,162 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
if (ip6_mroute_opt(optname))
return ip6_mroute_setsockopt(sk, optname, optval, optlen);
- if (needs_rtnl)
- rtnl_lock();
- lock_sock(sk);
+ /* Handle options that can be set without locking the socket. */
+ switch (optname) {
+ case IPV6_UNICAST_HOPS:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val > 255 || val < -1)
+ return -EINVAL;
+ WRITE_ONCE(np->hop_limit, val);
+ return 0;
+ case IPV6_MULTICAST_LOOP:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val != valbool)
+ return -EINVAL;
+ inet6_assign_bit(MC6_LOOP, sk, valbool);
+ return 0;
+ case IPV6_MULTICAST_HOPS:
+ if (sk->sk_type == SOCK_STREAM)
+ return retv;
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val > 255 || val < -1)
+ return -EINVAL;
+ WRITE_ONCE(np->mcast_hops,
+ val == -1 ? IPV6_DEFAULT_MCASTHOPS : val);
+ return 0;
+ case IPV6_MTU:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val && val < IPV6_MIN_MTU)
+ return -EINVAL;
+ WRITE_ONCE(np->frag_size, val);
+ return 0;
+ case IPV6_MINHOPCOUNT:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val < 0 || val > 255)
+ return -EINVAL;
+
+ if (val)
+ static_branch_enable(&ip6_min_hopcount);
+
+ /* tcp_v6_err() and tcp_v6_rcv() might read min_hopcount
+ * while we are changing it.
+ */
+ WRITE_ONCE(np->min_hopcount, val);
+ return 0;
+ case IPV6_RECVERR_RFC4884:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val < 0 || val > 1)
+ return -EINVAL;
+ inet6_assign_bit(RECVERR6_RFC4884, sk, valbool);
+ return 0;
+ case IPV6_MULTICAST_ALL:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ inet6_assign_bit(MC6_ALL, sk, valbool);
+ return 0;
+ case IPV6_AUTOFLOWLABEL:
+ inet6_assign_bit(AUTOFLOWLABEL, sk, valbool);
+ inet6_set_bit(AUTOFLOWLABEL_SET, sk);
+ return 0;
+ case IPV6_DONTFRAG:
+ inet6_assign_bit(DONTFRAG, sk, valbool);
+ return 0;
+ case IPV6_RECVERR:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ inet6_assign_bit(RECVERR6, sk, valbool);
+ if (!val)
+ skb_errqueue_purge(&sk->sk_error_queue);
+ return 0;
+ case IPV6_ROUTER_ALERT_ISOLATE:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ inet6_assign_bit(RTALERT_ISOLATE, sk, valbool);
+ return 0;
+ case IPV6_MTU_DISCOVER:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT)
+ return -EINVAL;
+ WRITE_ONCE(np->pmtudisc, val);
+ return 0;
+ case IPV6_FLOWINFO_SEND:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ inet6_assign_bit(SNDFLOW, sk, valbool);
+ return 0;
+ case IPV6_ADDR_PREFERENCES:
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ return ip6_sock_set_addr_preferences(sk, val);
+ case IPV6_MULTICAST_IF:
+ if (sk->sk_type == SOCK_STREAM)
+ return -ENOPROTOOPT;
+ if (optlen < sizeof(int))
+ return -EINVAL;
+ if (val) {
+ struct net_device *dev;
+ int bound_dev_if, midx;
+
+ rcu_read_lock();
+
+ dev = dev_get_by_index_rcu(net, val);
+ if (!dev) {
+ rcu_read_unlock();
+ return -ENODEV;
+ }
+ midx = l3mdev_master_ifindex_rcu(dev);
+
+ rcu_read_unlock();
+
+ bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+ if (bound_dev_if &&
+ bound_dev_if != val &&
+ (!midx || midx != bound_dev_if))
+ return -EINVAL;
+ }
+ WRITE_ONCE(np->mcast_oif, val);
+ return 0;
+ case IPV6_UNICAST_IF:
+ {
+ struct net_device *dev;
+ int ifindex;
+
+ if (optlen != sizeof(int))
+ return -EINVAL;
+
+ ifindex = (__force int)ntohl((__force __be32)val);
+ if (!ifindex) {
+ WRITE_ONCE(np->ucast_oif, 0);
+ return 0;
+ }
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return -EADDRNOTAVAIL;
+ dev_put(dev);
+
+ if (READ_ONCE(sk->sk_bound_dev_if))
+ return -EINVAL;
+
+ WRITE_ONCE(np->ucast_oif, ifindex);
+ return 0;
+ }
+ }
+
+ sockopt_lock_sock(sk);
+
+ /* Another thread has converted the socket into IPv4 with
+ * IPV6_ADDRFORM concurrently.
+ */
+ if (unlikely(sk->sk_family != AF_INET6))
+ goto unlock;
switch (optname) {
@@ -172,9 +560,6 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
if (optlen < sizeof(int))
goto e_inval;
if (val == PF_INET) {
- struct ipv6_txoptions *opt;
- struct sk_buff *pktopt;
-
if (sk->sk_type == SOCK_RAW)
break;
@@ -185,8 +570,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
retv = -EBUSY;
break;
}
- } else if (sk->sk_protocol != IPPROTO_TCP)
+ } else if (sk->sk_protocol == IPPROTO_TCP) {
+ if (sk->sk_prot != &tcpv6_prot) {
+ retv = -EBUSY;
+ break;
+ }
+ } else {
break;
+ }
if (sk->sk_state != TCP_ESTABLISHED) {
retv = -ENOTCONN;
@@ -199,54 +590,45 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
break;
}
- fl6_free_socklist(sk);
__ipv6_sock_mc_close(sk);
-
- /*
- * Sock is moving from IPv6 to IPv4 (sk_prot), so
- * remove it from the refcnt debug socks count in the
- * original family...
- */
- sk_refcnt_debug_dec(sk);
+ __ipv6_sock_ac_close(sk);
if (sk->sk_protocol == IPPROTO_TCP) {
struct inet_connection_sock *icsk = inet_csk(sk);
- local_bh_disable();
+
sock_prot_inuse_add(net, sk->sk_prot, -1);
sock_prot_inuse_add(net, &tcp_prot, 1);
- local_bh_enable();
- sk->sk_prot = &tcp_prot;
- icsk->icsk_af_ops = &ipv4_specific;
- sk->sk_socket->ops = &inet_stream_ops;
- sk->sk_family = PF_INET;
+
+ /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */
+ WRITE_ONCE(sk->sk_prot, &tcp_prot);
+ /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */
+ WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific);
+ WRITE_ONCE(sk->sk_socket->ops, &inet_stream_ops);
+ WRITE_ONCE(sk->sk_family, PF_INET);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
} else {
struct proto *prot = &udp_prot;
if (sk->sk_protocol == IPPROTO_UDPLITE)
prot = &udplite_prot;
- local_bh_disable();
+
sock_prot_inuse_add(net, sk->sk_prot, -1);
sock_prot_inuse_add(net, prot, 1);
- local_bh_enable();
- sk->sk_prot = prot;
- sk->sk_socket->ops = &inet_dgram_ops;
- sk->sk_family = PF_INET;
- }
- opt = xchg((__force struct ipv6_txoptions **)&np->opt,
- NULL);
- if (opt) {
- atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
- txopt_put(opt);
+
+ /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */
+ WRITE_ONCE(sk->sk_prot, prot);
+ WRITE_ONCE(sk->sk_socket->ops, &inet_dgram_ops);
+ WRITE_ONCE(sk->sk_family, PF_INET);
}
- pktopt = xchg(&np->pktoptions, NULL);
- kfree_skb(pktopt);
- /*
- * ... and add it to the refcnt debug socks count
- * in the new family. -acme
+ /* Disable all options not to allocate memory anymore,
+ * but there is still a race. See the lockless path
+ * in udpv6_sendmsg() and ipv6_local_rxpmtu().
*/
- sk_refcnt_debug_inc(sk);
+ np->rxopt.all = 0;
+
+ inet6_cleanup_sock(sk);
+
module_put(THIS_MODULE);
retv = 0;
break;
@@ -339,7 +721,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
/* RFC 3542, 6.5: default traffic class of 0x0 */
if (val == -1)
val = 0;
- np->tclass = val;
+ if (sk->sk_type == SOCK_STREAM) {
+ val &= ~INET_ECN_MASK;
+ val |= np->tclass & INET_ECN_MASK;
+ }
+ if (np->tclass != val) {
+ np->tclass = val;
+ sk_dst_reset(sk);
+ }
retv = 0;
break;
@@ -365,15 +754,15 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
break;
case IPV6_TRANSPARENT:
- if (valbool && !ns_capable(net->user_ns, CAP_NET_ADMIN) &&
- !ns_capable(net->user_ns, CAP_NET_RAW)) {
+ if (valbool && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW) &&
+ !sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) {
retv = -EPERM;
break;
}
if (optlen < sizeof(int))
goto e_inval;
/* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */
- inet_sk(sk)->transparent = valbool;
+ inet_assign_bit(TRANSPARENT, sk, valbool);
retv = 0;
break;
@@ -381,7 +770,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
if (optlen < sizeof(int))
goto e_inval;
/* we also don't have a separate freebind bit for IPV6 */
- inet_sk(sk)->freebind = valbool;
+ inet_assign_bit(FREEBIND, sk, valbool);
retv = 0;
break;
@@ -396,82 +785,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
case IPV6_RTHDRDSTOPTS:
case IPV6_RTHDR:
case IPV6_DSTOPTS:
- {
- struct ipv6_txoptions *opt;
- struct ipv6_opt_hdr *new = NULL;
-
- /* hop-by-hop / destination options are privileged option */
- retv = -EPERM;
- if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
- break;
-
- /* remove any sticky options header with a zero option
- * length, per RFC3542.
- */
- if (optlen == 0)
- optval = NULL;
- else if (!optval)
- goto e_inval;
- else if (optlen < sizeof(struct ipv6_opt_hdr) ||
- optlen & 0x7 || optlen > 8 * 255)
- goto e_inval;
- else {
- new = memdup_user(optval, optlen);
- if (IS_ERR(new)) {
- retv = PTR_ERR(new);
- break;
- }
- if (unlikely(ipv6_optlen(new) > optlen)) {
- kfree(new);
- goto e_inval;
- }
- }
-
- opt = rcu_dereference_protected(np->opt,
- lockdep_sock_is_held(sk));
- opt = ipv6_renew_options(sk, opt, optname, new);
- kfree(new);
- if (IS_ERR(opt)) {
- retv = PTR_ERR(opt);
- break;
- }
-
- /* routing header option needs extra check */
- retv = -EINVAL;
- if (optname == IPV6_RTHDR && opt && opt->srcrt) {
- struct ipv6_rt_hdr *rthdr = opt->srcrt;
- switch (rthdr->type) {
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- case IPV6_SRCRT_TYPE_2:
- if (rthdr->hdrlen != 2 ||
- rthdr->segments_left != 1)
- goto sticky_done;
-
- break;
-#endif
- case IPV6_SRCRT_TYPE_4:
- {
- struct ipv6_sr_hdr *srh = (struct ipv6_sr_hdr *)
- opt->srcrt;
-
- if (!seg6_validate_srh(srh, optlen))
- goto sticky_done;
- break;
- }
- default:
- goto sticky_done;
- }
- }
-
- retv = 0;
- opt = ipv6_update_options(sk, opt);
-sticky_done:
- if (opt) {
- atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
- txopt_put(opt);
- }
+ retv = ipv6_set_opt_hdr(sk, optname, optval, optlen);
break;
- }
case IPV6_PKTINFO:
{
@@ -479,14 +794,15 @@ sticky_done:
if (optlen == 0)
goto e_inval;
- else if (optlen < sizeof(struct in6_pktinfo) || !optval)
+ else if (optlen < sizeof(struct in6_pktinfo) ||
+ sockptr_is_null(optval))
goto e_inval;
- if (copy_from_user(&pkt, optval, sizeof(struct in6_pktinfo))) {
- retv = -EFAULT;
- break;
+ if (copy_from_sockptr(&pkt, optval, sizeof(pkt))) {
+ retv = -EFAULT;
+ break;
}
- if (sk->sk_bound_dev_if && pkt.ipi6_ifindex != sk->sk_bound_dev_if)
+ if (!sk_dev_equal_l3scope(sk, pkt.ipi6_ifindex))
goto e_inval;
np->sticky_pktinfo.ipi6_ifindex = pkt.ipi6_ifindex;
@@ -525,10 +841,11 @@ sticky_done:
refcount_set(&opt->refcnt, 1);
opt->tot_len = sizeof(*opt) + optlen;
retv = -EFAULT;
- if (copy_from_user(opt+1, optval, optlen))
+ if (copy_from_sockptr(opt + 1, optval, optlen))
goto done;
msg.msg_controllen = optlen;
+ msg.msg_control_is_user = false;
msg.msg_control = (void *)(opt+1);
ipc6.opt = opt;
@@ -545,95 +862,7 @@ done:
}
break;
}
- case IPV6_UNICAST_HOPS:
- if (optlen < sizeof(int))
- goto e_inval;
- if (val > 255 || val < -1)
- goto e_inval;
- np->hop_limit = val;
- retv = 0;
- break;
-
- case IPV6_MULTICAST_HOPS:
- if (sk->sk_type == SOCK_STREAM)
- break;
- if (optlen < sizeof(int))
- goto e_inval;
- if (val > 255 || val < -1)
- goto e_inval;
- np->mcast_hops = (val == -1 ? IPV6_DEFAULT_MCASTHOPS : val);
- retv = 0;
- break;
-
- case IPV6_MULTICAST_LOOP:
- if (optlen < sizeof(int))
- goto e_inval;
- if (val != valbool)
- goto e_inval;
- np->mc_loop = valbool;
- retv = 0;
- break;
-
- case IPV6_UNICAST_IF:
- {
- struct net_device *dev = NULL;
- int ifindex;
-
- if (optlen != sizeof(int))
- goto e_inval;
-
- ifindex = (__force int)ntohl((__force __be32)val);
- if (ifindex == 0) {
- np->ucast_oif = 0;
- retv = 0;
- break;
- }
-
- dev = dev_get_by_index(net, ifindex);
- retv = -EADDRNOTAVAIL;
- if (!dev)
- break;
- dev_put(dev);
-
- retv = -EINVAL;
- if (sk->sk_bound_dev_if)
- break;
-
- np->ucast_oif = ifindex;
- retv = 0;
- break;
- }
-
- case IPV6_MULTICAST_IF:
- if (sk->sk_type == SOCK_STREAM)
- break;
- if (optlen < sizeof(int))
- goto e_inval;
-
- if (val) {
- struct net_device *dev;
- int midx;
- rcu_read_lock();
-
- dev = dev_get_by_index_rcu(net, val);
- if (!dev) {
- rcu_read_unlock();
- retv = -ENODEV;
- break;
- }
- midx = l3mdev_master_ifindex_rcu(dev);
-
- rcu_read_unlock();
-
- if (sk->sk_bound_dev_if &&
- sk->sk_bound_dev_if != val &&
- (!midx || midx != sk->sk_bound_dev_if))
- goto e_inval;
- }
- np->mcast_oif = val;
- retv = 0;
- break;
case IPV6_ADD_MEMBERSHIP:
case IPV6_DROP_MEMBERSHIP:
{
@@ -643,11 +872,11 @@ done:
goto e_inval;
retv = -EPROTO;
- if (inet_sk(sk)->is_icsk)
+ if (inet_test_bit(IS_ICSK, sk))
break;
retv = -EFAULT;
- if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)))
+ if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq)))
break;
if (optname == IPV6_ADD_MEMBERSHIP)
@@ -665,7 +894,7 @@ done:
goto e_inval;
retv = -EFAULT;
- if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq)))
+ if (copy_from_sockptr(&mreq, optval, sizeof(struct ipv6_mreq)))
break;
if (optname == IPV6_JOIN_ANYCAST)
@@ -676,139 +905,32 @@ done:
}
case MCAST_JOIN_GROUP:
case MCAST_LEAVE_GROUP:
- {
- struct group_req greq;
- struct sockaddr_in6 *psin6;
-
- if (optlen < sizeof(struct group_req))
- goto e_inval;
-
- retv = -EFAULT;
- if (copy_from_user(&greq, optval, sizeof(struct group_req)))
- break;
- if (greq.gr_group.ss_family != AF_INET6) {
- retv = -EADDRNOTAVAIL;
- break;
- }
- psin6 = (struct sockaddr_in6 *)&greq.gr_group;
- if (optname == MCAST_JOIN_GROUP)
- retv = ipv6_sock_mc_join(sk, greq.gr_interface,
- &psin6->sin6_addr);
+ if (in_compat_syscall())
+ retv = compat_ipv6_mcast_join_leave(sk, optname, optval,
+ optlen);
else
- retv = ipv6_sock_mc_drop(sk, greq.gr_interface,
- &psin6->sin6_addr);
+ retv = ipv6_mcast_join_leave(sk, optname, optval,
+ optlen);
break;
- }
case MCAST_JOIN_SOURCE_GROUP:
case MCAST_LEAVE_SOURCE_GROUP:
case MCAST_BLOCK_SOURCE:
case MCAST_UNBLOCK_SOURCE:
- {
- struct group_source_req greqs;
- int omode, add;
-
- if (optlen < sizeof(struct group_source_req))
- goto e_inval;
- if (copy_from_user(&greqs, optval, sizeof(greqs))) {
- retv = -EFAULT;
- break;
- }
- if (greqs.gsr_group.ss_family != AF_INET6 ||
- greqs.gsr_source.ss_family != AF_INET6) {
- retv = -EADDRNOTAVAIL;
- break;
- }
- if (optname == MCAST_BLOCK_SOURCE) {
- omode = MCAST_EXCLUDE;
- add = 1;
- } else if (optname == MCAST_UNBLOCK_SOURCE) {
- omode = MCAST_EXCLUDE;
- add = 0;
- } else if (optname == MCAST_JOIN_SOURCE_GROUP) {
- struct sockaddr_in6 *psin6;
-
- psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
- retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface,
- &psin6->sin6_addr,
- MCAST_INCLUDE);
- /* prior join w/ different source is ok */
- if (retv && retv != -EADDRINUSE)
- break;
- omode = MCAST_INCLUDE;
- add = 1;
- } else /* MCAST_LEAVE_SOURCE_GROUP */ {
- omode = MCAST_INCLUDE;
- add = 0;
- }
- retv = ip6_mc_source(add, omode, sk, &greqs);
+ retv = do_ipv6_mcast_group_source(sk, optname, optval, optlen);
break;
- }
case MCAST_MSFILTER:
- {
- struct group_filter *gsf;
-
- if (optlen < GROUP_FILTER_SIZE(0))
- goto e_inval;
- if (optlen > sysctl_optmem_max) {
- retv = -ENOBUFS;
- break;
- }
- gsf = memdup_user(optval, optlen);
- if (IS_ERR(gsf)) {
- retv = PTR_ERR(gsf);
- break;
- }
- /* numsrc >= (4G-140)/128 overflow in 32 bits */
- if (gsf->gf_numsrc >= 0x1ffffffU ||
- gsf->gf_numsrc > sysctl_mld_max_msf) {
- kfree(gsf);
- retv = -ENOBUFS;
- break;
- }
- if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
- kfree(gsf);
- retv = -EINVAL;
- break;
- }
- retv = ip6_mc_msfilter(sk, gsf);
- kfree(gsf);
-
+ if (in_compat_syscall())
+ retv = compat_ipv6_set_mcast_msfilter(sk, optval,
+ optlen);
+ else
+ retv = ipv6_set_mcast_msfilter(sk, optval, optlen);
break;
- }
case IPV6_ROUTER_ALERT:
if (optlen < sizeof(int))
goto e_inval;
retv = ip6_ra_control(sk, val);
- break;
- case IPV6_MTU_DISCOVER:
- if (optlen < sizeof(int))
- goto e_inval;
- if (val < IPV6_PMTUDISC_DONT || val > IPV6_PMTUDISC_OMIT)
- goto e_inval;
- np->pmtudisc = val;
- retv = 0;
- break;
- case IPV6_MTU:
- if (optlen < sizeof(int))
- goto e_inval;
- if (val && val < IPV6_MIN_MTU)
- goto e_inval;
- np->frag_size = val;
- retv = 0;
- break;
- case IPV6_RECVERR:
- if (optlen < sizeof(int))
- goto e_inval;
- np->recverr = valbool;
- if (!val)
- skb_queue_purge(&sk->sk_error_queue);
- retv = 0;
- break;
- case IPV6_FLOWINFO_SEND:
- if (optlen < sizeof(int))
- goto e_inval;
- np->sndflow = valbool;
- retv = 0;
+ if (retv == 0)
+ inet6_assign_bit(RTALERT, sk, valbool);
break;
case IPV6_FLOWLABEL_MGR:
retv = ipv6_flowlabel_opt(sk, optval, optlen);
@@ -816,116 +938,34 @@ done:
case IPV6_IPSEC_POLICY:
case IPV6_XFRM_POLICY:
retv = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ if (!sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN))
break;
retv = xfrm_user_policy(sk, optname, optval, optlen);
break;
- case IPV6_ADDR_PREFERENCES:
- {
- unsigned int pref = 0;
- unsigned int prefmask = ~0;
-
- if (optlen < sizeof(int))
- goto e_inval;
-
- retv = -EINVAL;
-
- /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */
- switch (val & (IPV6_PREFER_SRC_PUBLIC|
- IPV6_PREFER_SRC_TMP|
- IPV6_PREFER_SRC_PUBTMP_DEFAULT)) {
- case IPV6_PREFER_SRC_PUBLIC:
- pref |= IPV6_PREFER_SRC_PUBLIC;
- break;
- case IPV6_PREFER_SRC_TMP:
- pref |= IPV6_PREFER_SRC_TMP;
- break;
- case IPV6_PREFER_SRC_PUBTMP_DEFAULT:
- break;
- case 0:
- goto pref_skip_pubtmp;
- default:
- goto e_inval;
- }
-
- prefmask &= ~(IPV6_PREFER_SRC_PUBLIC|
- IPV6_PREFER_SRC_TMP);
-pref_skip_pubtmp:
-
- /* check HOME/COA conflicts */
- switch (val & (IPV6_PREFER_SRC_HOME|IPV6_PREFER_SRC_COA)) {
- case IPV6_PREFER_SRC_HOME:
- break;
- case IPV6_PREFER_SRC_COA:
- pref |= IPV6_PREFER_SRC_COA;
- case 0:
- goto pref_skip_coa;
- default:
- goto e_inval;
- }
-
- prefmask &= ~IPV6_PREFER_SRC_COA;
-pref_skip_coa:
-
- /* check CGA/NONCGA conflicts */
- switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) {
- case IPV6_PREFER_SRC_CGA:
- case IPV6_PREFER_SRC_NONCGA:
- case 0:
- break;
- default:
- goto e_inval;
- }
-
- np->srcprefs = (np->srcprefs & prefmask) | pref;
- retv = 0;
-
- break;
- }
- case IPV6_MINHOPCOUNT:
- if (optlen < sizeof(int))
- goto e_inval;
- if (val < 0 || val > 255)
- goto e_inval;
- np->min_hopcount = val;
- retv = 0;
- break;
- case IPV6_DONTFRAG:
- np->dontfrag = valbool;
- retv = 0;
- break;
- case IPV6_AUTOFLOWLABEL:
- np->autoflowlabel = valbool;
- np->autoflowlabel_set = 1;
- retv = 0;
- break;
case IPV6_RECVFRAGSIZE:
np->rxopt.bits.recvfragsize = valbool;
retv = 0;
break;
}
- release_sock(sk);
- if (needs_rtnl)
- rtnl_unlock();
+unlock:
+ sockopt_release_sock(sk);
return retv;
e_inval:
- release_sock(sk);
- if (needs_rtnl)
- rtnl_unlock();
- return -EINVAL;
+ retv = -EINVAL;
+ goto unlock;
}
-int ipv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
+int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen)
{
int err;
if (level == SOL_IP && sk->sk_type != SOCK_RAW)
- return udp_prot.setsockopt(sk, level, optname, optval, optlen);
+ return ip_setsockopt(sk, level, optname, optval, optlen);
if (level != SOL_IPV6)
return -ENOPROTOOPT;
@@ -941,41 +981,8 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname,
}
EXPORT_SYMBOL(ipv6_setsockopt);
-#ifdef CONFIG_COMPAT
-int compat_ipv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- int err;
-
- if (level == SOL_IP && sk->sk_type != SOCK_RAW) {
- if (udp_prot.compat_setsockopt != NULL)
- return udp_prot.compat_setsockopt(sk, level, optname,
- optval, optlen);
- return udp_prot.setsockopt(sk, level, optname, optval, optlen);
- }
-
- if (level != SOL_IPV6)
- return -ENOPROTOOPT;
-
- if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER)
- return compat_mc_setsockopt(sk, level, optname, optval, optlen,
- ipv6_setsockopt);
-
- err = do_ipv6_setsockopt(sk, level, optname, optval, optlen);
-#ifdef CONFIG_NETFILTER
- /* we need to exclude all possible ENOPROTOOPTs except default case */
- if (err == -ENOPROTOOPT && optname != IPV6_IPSEC_POLICY &&
- optname != IPV6_XFRM_POLICY)
- err = compat_nf_setsockopt(sk, PF_INET6, optname, optval,
- optlen);
-#endif
- return err;
-}
-EXPORT_SYMBOL(compat_ipv6_setsockopt);
-#endif
-
static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt,
- int optname, char __user *optval, int len)
+ int optname, sockptr_t optval, int len)
{
struct ipv6_opt_hdr *hdr;
@@ -1003,13 +1010,81 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt,
return 0;
len = min_t(unsigned int, len, ipv6_optlen(hdr));
- if (copy_to_user(optval, hdr, len))
+ if (copy_to_sockptr(optval, hdr, len))
return -EFAULT;
return len;
}
-static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen, unsigned int flags)
+static int ipv6_get_msfilter(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
+{
+ const int size0 = offsetof(struct group_filter, gf_slist_flex);
+ struct group_filter gsf;
+ int num;
+ int err;
+
+ if (len < size0)
+ return -EINVAL;
+ if (copy_from_sockptr(&gsf, optval, size0))
+ return -EFAULT;
+ if (gsf.gf_group.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
+ num = gsf.gf_numsrc;
+ sockopt_lock_sock(sk);
+ err = ip6_mc_msfget(sk, &gsf, optval, size0);
+ if (!err) {
+ if (num > gsf.gf_numsrc)
+ num = gsf.gf_numsrc;
+ len = GROUP_FILTER_SIZE(num);
+ if (copy_to_sockptr(optlen, &len, sizeof(int)) ||
+ copy_to_sockptr(optval, &gsf, size0))
+ err = -EFAULT;
+ }
+ sockopt_release_sock(sk);
+ return err;
+}
+
+static int compat_ipv6_get_msfilter(struct sock *sk, sockptr_t optval,
+ sockptr_t optlen, int len)
+{
+ const int size0 = offsetof(struct compat_group_filter, gf_slist_flex);
+ struct compat_group_filter gf32;
+ struct group_filter gf;
+ int err;
+ int num;
+
+ if (len < size0)
+ return -EINVAL;
+
+ if (copy_from_sockptr(&gf32, optval, size0))
+ return -EFAULT;
+ gf.gf_interface = gf32.gf_interface;
+ gf.gf_fmode = gf32.gf_fmode;
+ num = gf.gf_numsrc = gf32.gf_numsrc;
+ gf.gf_group = gf32.gf_group;
+
+ if (gf.gf_group.ss_family != AF_INET6)
+ return -EADDRNOTAVAIL;
+
+ sockopt_lock_sock(sk);
+ err = ip6_mc_msfget(sk, &gf, optval, size0);
+ sockopt_release_sock(sk);
+ if (err)
+ return err;
+ if (num > gf.gf_numsrc)
+ num = gf.gf_numsrc;
+ len = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32));
+ if (copy_to_sockptr(optlen, &len, sizeof(int)) ||
+ copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode),
+ &gf.gf_fmode, sizeof(gf32.gf_fmode)) ||
+ copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc),
+ &gf.gf_numsrc, sizeof(gf32.gf_numsrc)))
+ return -EFAULT;
+ return 0;
+}
+
+int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
+ sockptr_t optval, sockptr_t optlen)
{
struct ipv6_pinfo *np = inet6_sk(sk);
int len;
@@ -1018,7 +1093,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (ip6_mroute_opt(optname))
return ip6_mroute_getsockopt(sk, optname, optval, optlen);
- if (get_user(len, optlen))
+ if (copy_from_sockptr(&len, optlen, sizeof(int)))
return -EFAULT;
switch (optname) {
case IPV6_ADDRFORM:
@@ -1031,23 +1106,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
val = sk->sk_family;
break;
case MCAST_MSFILTER:
- {
- struct group_filter gsf;
- int err;
-
- if (len < GROUP_FILTER_SIZE(0))
- return -EINVAL;
- if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0)))
- return -EFAULT;
- if (gsf.gf_group.ss_family != AF_INET6)
- return -EADDRNOTAVAIL;
- lock_sock(sk);
- err = ip6_mc_msfget(sk, &gsf,
- (struct group_filter __user *)optval, optlen);
- release_sock(sk);
- return err;
- }
-
+ if (in_compat_syscall())
+ return compat_ipv6_get_msfilter(sk, optval, optlen, len);
+ return ipv6_get_msfilter(sk, optval, optlen, len);
case IPV6_2292PKTOPTIONS:
{
struct msghdr msg;
@@ -1056,25 +1117,34 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (sk->sk_type != SOCK_STREAM)
return -ENOPROTOOPT;
- msg.msg_control = optval;
+ if (optval.is_kernel) {
+ msg.msg_control_is_user = false;
+ msg.msg_control = optval.kernel;
+ } else {
+ msg.msg_control_is_user = true;
+ msg.msg_control_user = optval.user;
+ }
msg.msg_controllen = len;
- msg.msg_flags = flags;
+ msg.msg_flags = 0;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
skb = np->pktoptions;
if (skb)
ip6_datagram_recv_ctl(sk, &msg, skb);
- release_sock(sk);
+ sockopt_release_sock(sk);
if (!skb) {
if (np->rxopt.bits.rxinfo) {
+ int mcast_oif = READ_ONCE(np->mcast_oif);
struct in6_pktinfo src_info;
- src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif :
+
+ src_info.ipi6_ifindex = mcast_oif ? :
np->sticky_pktinfo.ipi6_ifindex;
- src_info.ipi6_addr = np->mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr;
+ src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr : np->sticky_pktinfo.ipi6_addr;
put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info);
}
if (np->rxopt.bits.rxhlim) {
- int hlim = np->mcast_hops;
+ int hlim = READ_ONCE(np->mcast_hops);
+
put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim);
}
if (np->rxopt.bits.rxtclass) {
@@ -1083,15 +1153,18 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
put_cmsg(&msg, SOL_IPV6, IPV6_TCLASS, sizeof(tclass), &tclass);
}
if (np->rxopt.bits.rxoinfo) {
+ int mcast_oif = READ_ONCE(np->mcast_oif);
struct in6_pktinfo src_info;
- src_info.ipi6_ifindex = np->mcast_oif ? np->mcast_oif :
+
+ src_info.ipi6_ifindex = mcast_oif ? :
np->sticky_pktinfo.ipi6_ifindex;
- src_info.ipi6_addr = np->mcast_oif ? sk->sk_v6_daddr :
- np->sticky_pktinfo.ipi6_addr;
+ src_info.ipi6_addr = mcast_oif ? sk->sk_v6_daddr :
+ np->sticky_pktinfo.ipi6_addr;
put_cmsg(&msg, SOL_IPV6, IPV6_2292PKTINFO, sizeof(src_info), &src_info);
}
if (np->rxopt.bits.rxohlim) {
- int hlim = np->mcast_hops;
+ int hlim = READ_ONCE(np->mcast_hops);
+
put_cmsg(&msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim);
}
if (np->rxopt.bits.rxflow) {
@@ -1101,7 +1174,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
}
}
len -= msg.msg_controllen;
- return put_user(len, optlen);
+ return copy_to_sockptr(optlen, &len, sizeof(int));
}
case IPV6_MTU:
{
@@ -1153,15 +1226,15 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
{
struct ipv6_txoptions *opt;
- lock_sock(sk);
+ sockopt_lock_sock(sk);
opt = rcu_dereference_protected(np->opt,
lockdep_sock_is_held(sk));
len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len);
- release_sock(sk);
+ sockopt_release_sock(sk);
/* check if ipv6_getsockopt_sticky() returns err code */
if (len < 0)
return len;
- return put_user(len, optlen);
+ return copy_to_sockptr(optlen, &len, sizeof(int));
}
case IPV6_RECVHOPOPTS:
@@ -1215,20 +1288,20 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (!mtuinfo.ip6m_mtu)
return -ENOTCONN;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &mtuinfo, len))
+ if (copy_to_sockptr(optval, &mtuinfo, len))
return -EFAULT;
return 0;
}
case IPV6_TRANSPARENT:
- val = inet_sk(sk)->transparent;
+ val = inet_test_bit(TRANSPARENT, sk);
break;
case IPV6_FREEBIND:
- val = inet_sk(sk)->freebind;
+ val = inet_test_bit(FREEBIND, sk);
break;
case IPV6_RECVORIGDSTADDR:
@@ -1241,9 +1314,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
struct dst_entry *dst;
if (optname == IPV6_UNICAST_HOPS)
- val = np->hop_limit;
+ val = READ_ONCE(np->hop_limit);
else
- val = np->mcast_hops;
+ val = READ_ONCE(np->mcast_hops);
if (val < 0) {
rcu_read_lock();
@@ -1254,32 +1327,36 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
}
if (val < 0)
- val = sock_net(sk)->ipv6.devconf_all->hop_limit;
+ val = READ_ONCE(sock_net(sk)->ipv6.devconf_all->hop_limit);
break;
}
case IPV6_MULTICAST_LOOP:
- val = np->mc_loop;
+ val = inet6_test_bit(MC6_LOOP, sk);
break;
case IPV6_MULTICAST_IF:
- val = np->mcast_oif;
+ val = READ_ONCE(np->mcast_oif);
+ break;
+
+ case IPV6_MULTICAST_ALL:
+ val = inet6_test_bit(MC6_ALL, sk);
break;
case IPV6_UNICAST_IF:
- val = (__force int)htonl((__u32) np->ucast_oif);
+ val = (__force int)htonl((__u32) READ_ONCE(np->ucast_oif));
break;
case IPV6_MTU_DISCOVER:
- val = np->pmtudisc;
+ val = READ_ONCE(np->pmtudisc);
break;
case IPV6_RECVERR:
- val = np->recverr;
+ val = inet6_test_bit(RECVERR6, sk);
break;
case IPV6_FLOWINFO_SEND:
- val = np->sndflow;
+ val = inet6_test_bit(SNDFLOW, sk);
break;
case IPV6_FLOWLABEL_MGR:
@@ -1290,7 +1367,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (len < sizeof(freq))
return -EINVAL;
- if (copy_from_user(&freq, optval, sizeof(freq)))
+ if (copy_from_sockptr(&freq, optval, sizeof(freq)))
return -EFAULT;
if (freq.flr_action != IPV6_FL_A_GET)
@@ -1305,55 +1382,69 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
if (val < 0)
return val;
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &freq, len))
+ if (copy_to_sockptr(optval, &freq, len))
return -EFAULT;
return 0;
}
case IPV6_ADDR_PREFERENCES:
+ {
+ u8 srcprefs = READ_ONCE(np->srcprefs);
val = 0;
- if (np->srcprefs & IPV6_PREFER_SRC_TMP)
+ if (srcprefs & IPV6_PREFER_SRC_TMP)
val |= IPV6_PREFER_SRC_TMP;
- else if (np->srcprefs & IPV6_PREFER_SRC_PUBLIC)
+ else if (srcprefs & IPV6_PREFER_SRC_PUBLIC)
val |= IPV6_PREFER_SRC_PUBLIC;
else {
/* XXX: should we return system default? */
val |= IPV6_PREFER_SRC_PUBTMP_DEFAULT;
}
- if (np->srcprefs & IPV6_PREFER_SRC_COA)
+ if (srcprefs & IPV6_PREFER_SRC_COA)
val |= IPV6_PREFER_SRC_COA;
else
val |= IPV6_PREFER_SRC_HOME;
break;
-
+ }
case IPV6_MINHOPCOUNT:
- val = np->min_hopcount;
+ val = READ_ONCE(np->min_hopcount);
break;
case IPV6_DONTFRAG:
- val = np->dontfrag;
+ val = inet6_test_bit(DONTFRAG, sk);
break;
case IPV6_AUTOFLOWLABEL:
- val = ip6_autoflowlabel(sock_net(sk), np);
+ val = ip6_autoflowlabel(sock_net(sk), sk);
break;
case IPV6_RECVFRAGSIZE:
val = np->rxopt.bits.recvfragsize;
break;
+ case IPV6_ROUTER_ALERT:
+ val = inet6_test_bit(RTALERT, sk);
+ break;
+
+ case IPV6_ROUTER_ALERT_ISOLATE:
+ val = inet6_test_bit(RTALERT_ISOLATE, sk);
+ break;
+
+ case IPV6_RECVERR_RFC4884:
+ val = inet6_test_bit(RECVERR6_RFC4884, sk);
+ break;
+
default:
return -ENOPROTOOPT;
}
len = min_t(unsigned int, sizeof(int), len);
- if (put_user(len, optlen))
+ if (copy_to_sockptr(optlen, &len, sizeof(int)))
return -EFAULT;
- if (copy_to_user(optval, &val, len))
+ if (copy_to_sockptr(optval, &val, len))
return -EFAULT;
return 0;
}
@@ -1364,12 +1455,13 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname,
int err;
if (level == SOL_IP && sk->sk_type != SOCK_RAW)
- return udp_prot.getsockopt(sk, level, optname, optval, optlen);
+ return ip_getsockopt(sk, level, optname, optval, optlen);
if (level != SOL_IPV6)
return -ENOPROTOOPT;
- err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, 0);
+ err = do_ipv6_getsockopt(sk, level, optname,
+ USER_SOCKPTR(optval), USER_SOCKPTR(optlen));
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) {
@@ -1386,43 +1478,3 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname,
return err;
}
EXPORT_SYMBOL(ipv6_getsockopt);
-
-#ifdef CONFIG_COMPAT
-int compat_ipv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- int err;
-
- if (level == SOL_IP && sk->sk_type != SOCK_RAW) {
- if (udp_prot.compat_getsockopt != NULL)
- return udp_prot.compat_getsockopt(sk, level, optname,
- optval, optlen);
- return udp_prot.getsockopt(sk, level, optname, optval, optlen);
- }
-
- if (level != SOL_IPV6)
- return -ENOPROTOOPT;
-
- if (optname == MCAST_MSFILTER)
- return compat_mc_getsockopt(sk, level, optname, optval, optlen,
- ipv6_getsockopt);
-
- err = do_ipv6_getsockopt(sk, level, optname, optval, optlen,
- MSG_CMSG_COMPAT);
-#ifdef CONFIG_NETFILTER
- /* we need to exclude all possible ENOPROTOOPTs except default case */
- if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) {
- int len;
-
- if (get_user(len, optlen))
- return -EFAULT;
-
- err = compat_nf_getsockopt(sk, PF_INET6, optname, optval, &len);
- if (err >= 0)
- err = put_user(len, optlen);
- }
-#endif
- return err;
-}
-EXPORT_SYMBOL(compat_ipv6_getsockopt);
-#endif
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 4ae54aaca373..016b572e7d6f 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Multicast support for IPv6
* Linux INET6 implementation
@@ -6,11 +7,6 @@
* Pedro Roque <roque@di.fc.ul.pt>
*
* Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/* Changes:
@@ -33,24 +29,27 @@
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/jiffies.h>
-#include <linux/times.h>
#include <linux/net.h>
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
+#include <linux/if_addr.h>
#include <linux/if_arp.h>
#include <linux/route.h>
+#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/pkt_sched.h>
#include <net/mld.h>
+#include <linux/workqueue.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
#include <net/net_namespace.h>
+#include <net/netlink.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -71,18 +70,14 @@ static int __mld2_query_bugs[] __attribute__((__unused__)) = {
BUILD_BUG_ON_ZERO(offsetof(struct mld2_grec, grec_mca) % 4)
};
+static struct workqueue_struct *mld_wq;
static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT;
static void igmp6_join_group(struct ifmcaddr6 *ma);
static void igmp6_leave_group(struct ifmcaddr6 *ma);
-static void igmp6_timer_handler(struct timer_list *t);
+static void mld_mca_work(struct work_struct *work);
-static void mld_gq_timer_expire(struct timer_list *t);
-static void mld_ifc_timer_expire(struct timer_list *t);
static void mld_ifc_event(struct inet6_dev *idev);
-static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
-static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc);
-static void mld_clear_delrec(struct inet6_dev *idev);
static bool mld_in_v1_mode(const struct inet6_dev *idev);
static int sf_setstate(struct ifmcaddr6 *pmc);
static void sf_markstate(struct ifmcaddr6 *pmc);
@@ -113,69 +108,119 @@ static int __ipv6_dev_mc_inc(struct net_device *dev,
int sysctl_mld_max_msf __read_mostly = IPV6_MLD_MAX_MSF;
int sysctl_mld_qrv __read_mostly = MLD_QRV_DEFAULT;
-/*
- * socket join on multicast group
- */
+#define mc_assert_locked(idev) \
+ lockdep_assert_held(&(idev)->mc_lock)
+
+#define mc_dereference(e, idev) \
+ rcu_dereference_protected(e, lockdep_is_held(&(idev)->mc_lock))
+
+#define sock_dereference(e, sk) \
+ rcu_dereference_protected(e, lockdep_sock_is_held(sk))
+
+#define for_each_pmc_socklock(np, sk, pmc) \
+ for (pmc = sock_dereference((np)->ipv6_mc_list, sk); \
+ pmc; \
+ pmc = sock_dereference(pmc->next, sk))
#define for_each_pmc_rcu(np, pmc) \
- for (pmc = rcu_dereference(np->ipv6_mc_list); \
- pmc != NULL; \
+ for (pmc = rcu_dereference((np)->ipv6_mc_list); \
+ pmc; \
pmc = rcu_dereference(pmc->next))
+#define for_each_psf_mclock(mc, psf) \
+ for (psf = mc_dereference((mc)->mca_sources, mc->idev); \
+ psf; \
+ psf = mc_dereference(psf->sf_next, mc->idev))
+
+#define for_each_psf_rcu(mc, psf) \
+ for (psf = rcu_dereference((mc)->mca_sources); \
+ psf; \
+ psf = rcu_dereference(psf->sf_next))
+
+#define for_each_psf_tomb(mc, psf) \
+ for (psf = mc_dereference((mc)->mca_tomb, mc->idev); \
+ psf; \
+ psf = mc_dereference(psf->sf_next, mc->idev))
+
+#define for_each_mc_mclock(idev, mc) \
+ for (mc = mc_dereference((idev)->mc_list, idev); \
+ mc; \
+ mc = mc_dereference(mc->next, idev))
+
+#define for_each_mc_rcu(idev, mc) \
+ for (mc = rcu_dereference((idev)->mc_list); \
+ mc; \
+ mc = rcu_dereference(mc->next))
+
+#define for_each_mc_tomb(idev, mc) \
+ for (mc = mc_dereference((idev)->mc_tomb, idev); \
+ mc; \
+ mc = mc_dereference(mc->next, idev))
+
static int unsolicited_report_interval(struct inet6_dev *idev)
{
int iv;
if (mld_in_v1_mode(idev))
- iv = idev->cnf.mldv1_unsolicited_report_interval;
+ iv = READ_ONCE(idev->cnf.mldv1_unsolicited_report_interval);
else
- iv = idev->cnf.mldv2_unsolicited_report_interval;
+ iv = READ_ONCE(idev->cnf.mldv2_unsolicited_report_interval);
return iv > 0 ? iv : 1;
}
+static struct net_device *ip6_mc_find_dev(struct net *net,
+ const struct in6_addr *group,
+ int ifindex)
+{
+ struct net_device *dev = NULL;
+ struct rt6_info *rt;
+
+ if (ifindex == 0) {
+ rcu_read_lock();
+ rt = rt6_lookup(net, group, NULL, 0, NULL, 0);
+ if (rt) {
+ dev = dst_dev_rcu(&rt->dst);
+ dev_hold(dev);
+ ip6_rt_put(rt);
+ }
+ rcu_read_unlock();
+ } else {
+ dev = dev_get_by_index(net, ifindex);
+ }
+
+ return dev;
+}
+
+/*
+ * socket join on multicast group
+ */
static int __ipv6_sock_mc_join(struct sock *sk, int ifindex,
const struct in6_addr *addr, unsigned int mode)
{
- struct net_device *dev = NULL;
- struct ipv6_mc_socklist *mc_lst;
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_mc_socklist *mc_lst;
struct net *net = sock_net(sk);
+ struct net_device *dev = NULL;
int err;
- ASSERT_RTNL();
-
if (!ipv6_addr_is_multicast(addr))
return -EINVAL;
- rcu_read_lock();
- for_each_pmc_rcu(np, mc_lst) {
+ for_each_pmc_socklock(np, sk, mc_lst) {
if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
- ipv6_addr_equal(&mc_lst->addr, addr)) {
- rcu_read_unlock();
+ ipv6_addr_equal(&mc_lst->addr, addr))
return -EADDRINUSE;
- }
}
- rcu_read_unlock();
mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
-
if (!mc_lst)
return -ENOMEM;
mc_lst->next = NULL;
mc_lst->addr = *addr;
- if (ifindex == 0) {
- struct rt6_info *rt;
- rt = rt6_lookup(net, addr, NULL, 0, NULL, 0);
- if (rt) {
- dev = rt->dst.dev;
- ip6_rt_put(rt);
- }
- } else
- dev = __dev_get_by_index(net, ifindex);
-
+ dev = ip6_mc_find_dev(net, addr, ifindex);
if (!dev) {
sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
return -ENODEV;
@@ -183,15 +228,13 @@ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex,
mc_lst->ifindex = dev->ifindex;
mc_lst->sfmode = mode;
- rwlock_init(&mc_lst->sflock);
- mc_lst->sflist = NULL;
-
- /*
- * now add/increase the group membership on the device
- */
+ RCU_INIT_POINTER(mc_lst->sflist, NULL);
+ /* now add/increase the group membership on the device */
err = __ipv6_dev_mc_inc(dev, addr, mode);
+ dev_put(dev);
+
if (err) {
sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
return err;
@@ -218,39 +261,47 @@ int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex,
/*
* socket leave on multicast group
*/
+static void __ipv6_sock_mc_drop(struct sock *sk, struct ipv6_mc_socklist *mc_lst)
+{
+ struct net *net = sock_net(sk);
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, mc_lst->ifindex);
+ if (dev) {
+ struct inet6_dev *idev = in6_dev_get(dev);
+
+ ip6_mc_leave_src(sk, mc_lst, idev);
+
+ if (idev) {
+ __ipv6_dev_mc_dec(idev, &mc_lst->addr);
+ in6_dev_put(idev);
+ }
+
+ dev_put(dev);
+ } else {
+ ip6_mc_leave_src(sk, mc_lst, NULL);
+ }
+
+ atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);
+ kfree_rcu(mc_lst, rcu);
+}
+
int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
{
struct ipv6_pinfo *np = inet6_sk(sk);
- struct ipv6_mc_socklist *mc_lst;
struct ipv6_mc_socklist __rcu **lnk;
- struct net *net = sock_net(sk);
-
- ASSERT_RTNL();
+ struct ipv6_mc_socklist *mc_lst;
if (!ipv6_addr_is_multicast(addr))
return -EINVAL;
for (lnk = &np->ipv6_mc_list;
- (mc_lst = rtnl_dereference(*lnk)) != NULL;
+ (mc_lst = sock_dereference(*lnk, sk)) != NULL;
lnk = &mc_lst->next) {
if ((ifindex == 0 || mc_lst->ifindex == ifindex) &&
ipv6_addr_equal(&mc_lst->addr, addr)) {
- struct net_device *dev;
-
*lnk = mc_lst->next;
-
- dev = __dev_get_by_index(net, mc_lst->ifindex);
- if (dev) {
- struct inet6_dev *idev = __in6_dev_get(dev);
-
- (void) ip6_mc_leave_src(sk, mc_lst, idev);
- if (idev)
- __ipv6_dev_mc_dec(idev, &mc_lst->addr);
- } else
- (void) ip6_mc_leave_src(sk, mc_lst, NULL);
-
- atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);
- kfree_rcu(mc_lst, rcu);
+ __ipv6_sock_mc_drop(sk, mc_lst);
return 0;
}
}
@@ -259,34 +310,20 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
}
EXPORT_SYMBOL(ipv6_sock_mc_drop);
-/* called with rcu_read_lock() */
-static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net,
- const struct in6_addr *group,
- int ifindex)
+static struct inet6_dev *ip6_mc_find_idev(struct net *net,
+ const struct in6_addr *group,
+ int ifindex)
{
- struct net_device *dev = NULL;
- struct inet6_dev *idev = NULL;
-
- if (ifindex == 0) {
- struct rt6_info *rt = rt6_lookup(net, group, NULL, 0, NULL, 0);
-
- if (rt) {
- dev = rt->dst.dev;
- ip6_rt_put(rt);
- }
- } else
- dev = dev_get_by_index_rcu(net, ifindex);
+ struct net_device *dev;
+ struct inet6_dev *idev;
+ dev = ip6_mc_find_dev(net, group, ifindex);
if (!dev)
return NULL;
- idev = __in6_dev_get(dev);
- if (!idev)
- return NULL;
- read_lock_bh(&idev->lock);
- if (idev->dead) {
- read_unlock_bh(&idev->lock);
- return NULL;
- }
+
+ idev = in6_dev_get(dev);
+ dev_put(dev);
+
return idev;
}
@@ -294,27 +331,10 @@ void __ipv6_sock_mc_close(struct sock *sk)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct ipv6_mc_socklist *mc_lst;
- struct net *net = sock_net(sk);
-
- ASSERT_RTNL();
-
- while ((mc_lst = rtnl_dereference(np->ipv6_mc_list)) != NULL) {
- struct net_device *dev;
+ while ((mc_lst = sock_dereference(np->ipv6_mc_list, sk)) != NULL) {
np->ipv6_mc_list = mc_lst->next;
-
- dev = __dev_get_by_index(net, mc_lst->ifindex);
- if (dev) {
- struct inet6_dev *idev = __in6_dev_get(dev);
-
- (void) ip6_mc_leave_src(sk, mc_lst, idev);
- if (idev)
- __ipv6_dev_mc_dec(idev, &mc_lst->addr);
- } else
- (void) ip6_mc_leave_src(sk, mc_lst, NULL);
-
- atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);
- kfree_rcu(mc_lst, rcu);
+ __ipv6_sock_mc_drop(sk, mc_lst);
}
}
@@ -324,23 +344,23 @@ void ipv6_sock_mc_close(struct sock *sk)
if (!rcu_access_pointer(np->ipv6_mc_list))
return;
- rtnl_lock();
+
+ lock_sock(sk);
__ipv6_sock_mc_close(sk);
- rtnl_unlock();
+ release_sock(sk);
}
int ip6_mc_source(int add, int omode, struct sock *sk,
- struct group_source_req *pgsr)
+ struct group_source_req *pgsr)
{
+ struct ipv6_pinfo *inet6 = inet6_sk(sk);
struct in6_addr *source, *group;
+ struct net *net = sock_net(sk);
struct ipv6_mc_socklist *pmc;
- struct inet6_dev *idev;
- struct ipv6_pinfo *inet6 = inet6_sk(sk);
struct ip6_sf_socklist *psl;
- struct net *net = sock_net(sk);
- int i, j, rv;
+ struct inet6_dev *idev;
int leavegroup = 0;
- int pmclocked = 0;
+ int i, j, rv;
int err;
source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr;
@@ -349,16 +369,20 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
if (!ipv6_addr_is_multicast(group))
return -EINVAL;
- rcu_read_lock();
- idev = ip6_mc_find_dev_rcu(net, group, pgsr->gsr_interface);
- if (!idev) {
- rcu_read_unlock();
+ idev = ip6_mc_find_idev(net, group, pgsr->gsr_interface);
+ if (!idev)
return -ENODEV;
+
+ mutex_lock(&idev->mc_lock);
+
+ if (idev->dead) {
+ err = -ENODEV;
+ goto done;
}
err = -EADDRNOTAVAIL;
- for_each_pmc_rcu(inet6, pmc) {
+ for_each_pmc_socklock(inet6, sk, pmc) {
if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface)
continue;
if (ipv6_addr_equal(&pmc->addr, group))
@@ -369,7 +393,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
goto done;
}
/* if a source filter was set, must be the same mode as before */
- if (pmc->sflist) {
+ if (rcu_access_pointer(pmc->sflist)) {
if (pmc->sfmode != omode) {
err = -EINVAL;
goto done;
@@ -381,10 +405,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
pmc->sfmode = omode;
}
- write_lock(&pmc->sflock);
- pmclocked = 1;
-
- psl = pmc->sflist;
+ psl = sock_dereference(pmc->sflist, sk);
if (!add) {
if (!psl)
goto done; /* err = -EADDRNOTAVAIL */
@@ -424,7 +445,8 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
if (psl)
count += psl->sl_max;
- newpsl = sock_kmalloc(sk, IP6_SFLSIZE(count), GFP_ATOMIC);
+ newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr, count),
+ GFP_KERNEL);
if (!newpsl) {
err = -ENOBUFS;
goto done;
@@ -434,9 +456,12 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
if (psl) {
for (i = 0; i < psl->sl_count; i++)
newpsl->sl_addr[i] = psl->sl_addr[i];
- sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max));
+ atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+ &sk->sk_omem_alloc);
}
- pmc->sflist = psl = newpsl;
+ rcu_assign_pointer(pmc->sflist, newpsl);
+ kfree_rcu(psl, rcu);
+ psl = newpsl;
}
rv = 1; /* > 0 for insert logic below if sl_count is 0 */
for (i = 0; i < psl->sl_count; i++) {
@@ -452,23 +477,22 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
/* update the interface list */
ip6_mc_add_src(idev, group, omode, 1, source, 1);
done:
- if (pmclocked)
- write_unlock(&pmc->sflock);
- read_unlock_bh(&idev->lock);
- rcu_read_unlock();
+ mutex_unlock(&idev->mc_lock);
+ in6_dev_put(idev);
if (leavegroup)
err = ipv6_sock_mc_drop(sk, pgsr->gsr_interface, group);
return err;
}
-int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
+int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf,
+ struct sockaddr_storage *list)
{
- const struct in6_addr *group;
- struct ipv6_mc_socklist *pmc;
- struct inet6_dev *idev;
struct ipv6_pinfo *inet6 = inet6_sk(sk);
struct ip6_sf_socklist *newpsl, *psl;
struct net *net = sock_net(sk);
+ const struct in6_addr *group;
+ struct ipv6_mc_socklist *pmc;
+ struct inet6_dev *idev;
int leavegroup = 0;
int i, err;
@@ -480,12 +504,15 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
gsf->gf_fmode != MCAST_EXCLUDE)
return -EINVAL;
- rcu_read_lock();
- idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface);
-
- if (!idev) {
- rcu_read_unlock();
+ idev = ip6_mc_find_idev(net, group, gsf->gf_interface);
+ if (!idev)
return -ENODEV;
+
+ mutex_lock(&idev->mc_lock);
+
+ if (idev->dead) {
+ err = -ENODEV;
+ goto done;
}
err = 0;
@@ -495,7 +522,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
goto done;
}
- for_each_pmc_rcu(inet6, pmc) {
+ for_each_pmc_socklock(inet6, sk, pmc) {
if (pmc->ifindex != gsf->gf_interface)
continue;
if (ipv6_addr_equal(&pmc->addr, group))
@@ -506,103 +533,85 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf)
goto done;
}
if (gsf->gf_numsrc) {
- newpsl = sock_kmalloc(sk, IP6_SFLSIZE(gsf->gf_numsrc),
- GFP_ATOMIC);
+ newpsl = sock_kmalloc(sk, struct_size(newpsl, sl_addr,
+ gsf->gf_numsrc),
+ GFP_KERNEL);
if (!newpsl) {
err = -ENOBUFS;
goto done;
}
newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc;
- for (i = 0; i < newpsl->sl_count; ++i) {
+ for (i = 0; i < newpsl->sl_count; ++i, ++list) {
struct sockaddr_in6 *psin6;
- psin6 = (struct sockaddr_in6 *)&gsf->gf_slist[i];
+ psin6 = (struct sockaddr_in6 *)list;
newpsl->sl_addr[i] = psin6->sin6_addr;
}
+
err = ip6_mc_add_src(idev, group, gsf->gf_fmode,
- newpsl->sl_count, newpsl->sl_addr, 0);
+ newpsl->sl_count, newpsl->sl_addr, 0);
if (err) {
- sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max));
+ sock_kfree_s(sk, newpsl, struct_size(newpsl, sl_addr,
+ newpsl->sl_max));
goto done;
}
} else {
newpsl = NULL;
- (void) ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0);
+ ip6_mc_add_src(idev, group, gsf->gf_fmode, 0, NULL, 0);
}
- write_lock(&pmc->sflock);
- psl = pmc->sflist;
+ psl = sock_dereference(pmc->sflist, sk);
if (psl) {
- (void) ip6_mc_del_src(idev, group, pmc->sfmode,
- psl->sl_count, psl->sl_addr, 0);
- sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max));
- } else
- (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
- pmc->sflist = newpsl;
+ ip6_mc_del_src(idev, group, pmc->sfmode,
+ psl->sl_count, psl->sl_addr, 0);
+ atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+ &sk->sk_omem_alloc);
+ } else {
+ ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0);
+ }
+
+ rcu_assign_pointer(pmc->sflist, newpsl);
+ kfree_rcu(psl, rcu);
pmc->sfmode = gsf->gf_fmode;
- write_unlock(&pmc->sflock);
err = 0;
done:
- read_unlock_bh(&idev->lock);
- rcu_read_unlock();
+ mutex_unlock(&idev->mc_lock);
+ in6_dev_put(idev);
if (leavegroup)
err = ipv6_sock_mc_drop(sk, gsf->gf_interface, group);
return err;
}
int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
- struct group_filter __user *optval, int __user *optlen)
+ sockptr_t optval, size_t ss_offset)
{
- int err, i, count, copycount;
+ struct ipv6_pinfo *inet6 = inet6_sk(sk);
const struct in6_addr *group;
struct ipv6_mc_socklist *pmc;
- struct inet6_dev *idev;
- struct ipv6_pinfo *inet6 = inet6_sk(sk);
struct ip6_sf_socklist *psl;
- struct net *net = sock_net(sk);
+ unsigned int count;
+ int i, copycount;
group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr;
if (!ipv6_addr_is_multicast(group))
return -EINVAL;
- rcu_read_lock();
- idev = ip6_mc_find_dev_rcu(net, group, gsf->gf_interface);
-
- if (!idev) {
- rcu_read_unlock();
- return -ENODEV;
- }
-
- err = -EADDRNOTAVAIL;
- /* changes to the ipv6_mc_list require the socket lock and
- * rtnl lock. We have the socket lock and rcu read lock,
- * so reading the list is safe.
- */
-
- for_each_pmc_rcu(inet6, pmc) {
+ for_each_pmc_socklock(inet6, sk, pmc) {
if (pmc->ifindex != gsf->gf_interface)
continue;
if (ipv6_addr_equal(group, &pmc->addr))
break;
}
if (!pmc) /* must have a prior join */
- goto done;
+ return -EADDRNOTAVAIL;
+
gsf->gf_fmode = pmc->sfmode;
- psl = pmc->sflist;
+ psl = sock_dereference(pmc->sflist, sk);
count = psl ? psl->sl_count : 0;
- read_unlock_bh(&idev->lock);
- rcu_read_unlock();
- copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
+ copycount = min(count, gsf->gf_numsrc);
gsf->gf_numsrc = count;
- if (put_user(GROUP_FILTER_SIZE(copycount), optlen) ||
- copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
- return -EFAULT;
- }
- /* changes to psl require the socket lock, and a write lock
- * on pmc->sflock. We have the socket lock so reading here is safe.
- */
for (i = 0; i < copycount; i++) {
struct sockaddr_in6 *psin6;
struct sockaddr_storage ss;
@@ -611,22 +620,19 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf,
memset(&ss, 0, sizeof(ss));
psin6->sin6_family = AF_INET6;
psin6->sin6_addr = psl->sl_addr[i];
- if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss)))
+ if (copy_to_sockptr_offset(optval, ss_offset, &ss, sizeof(ss)))
return -EFAULT;
+ ss_offset += sizeof(ss);
}
return 0;
-done:
- read_unlock_bh(&idev->lock);
- rcu_read_unlock();
- return err;
}
-bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
+bool inet6_mc_check(const struct sock *sk, const struct in6_addr *mc_addr,
const struct in6_addr *src_addr)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct ipv6_mc_socklist *mc;
- struct ip6_sf_socklist *psl;
+ const struct ipv6_pinfo *np = inet6_sk(sk);
+ const struct ipv6_mc_socklist *mc;
+ const struct ip6_sf_socklist *psl;
bool rv = true;
rcu_read_lock();
@@ -636,10 +642,9 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
}
if (!mc) {
rcu_read_unlock();
- return true;
+ return inet6_test_bit(MC6_ALL, sk);
}
- read_lock(&mc->sflock);
- psl = mc->sflist;
+ psl = rcu_dereference(mc->sflist);
if (!psl) {
rv = mc->sfmode == MCAST_EXCLUDE;
} else {
@@ -654,7 +659,6 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
rv = false;
}
- read_unlock(&mc->sflock);
rcu_read_unlock();
return rv;
@@ -665,17 +669,17 @@ static void igmp6_group_added(struct ifmcaddr6 *mc)
struct net_device *dev = mc->idev->dev;
char buf[MAX_ADDR_LEN];
+ mc_assert_locked(mc->idev);
+
if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) <
IPV6_ADDR_SCOPE_LINKLOCAL)
return;
- spin_lock_bh(&mc->mca_lock);
if (!(mc->mca_flags&MAF_LOADED)) {
mc->mca_flags |= MAF_LOADED;
if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0)
dev_mc_add(dev, buf);
}
- spin_unlock_bh(&mc->mca_lock);
if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT))
return;
@@ -701,49 +705,45 @@ static void igmp6_group_dropped(struct ifmcaddr6 *mc)
struct net_device *dev = mc->idev->dev;
char buf[MAX_ADDR_LEN];
+ mc_assert_locked(mc->idev);
+
if (IPV6_ADDR_MC_SCOPE(&mc->mca_addr) <
IPV6_ADDR_SCOPE_LINKLOCAL)
return;
- spin_lock_bh(&mc->mca_lock);
if (mc->mca_flags&MAF_LOADED) {
mc->mca_flags &= ~MAF_LOADED;
if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0)
dev_mc_del(dev, buf);
}
- spin_unlock_bh(&mc->mca_lock);
if (mc->mca_flags & MAF_NOREPORT)
return;
if (!mc->idev->dead)
igmp6_leave_group(mc);
- spin_lock_bh(&mc->mca_lock);
- if (del_timer(&mc->mca_timer))
+ if (cancel_delayed_work(&mc->mca_work))
refcount_dec(&mc->mca_refcnt);
- spin_unlock_bh(&mc->mca_lock);
}
-/*
- * deleted ifmcaddr6 manipulation
- */
+/* deleted ifmcaddr6 manipulation */
static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
{
struct ifmcaddr6 *pmc;
+ mc_assert_locked(idev);
+
/* this is an "ifmcaddr6" for convenience; only the fields below
* are actually used. In particular, the refcnt and users are not
* used for management of the delete list. Using the same structure
* for deleted items allows change reports to use common code with
* non-deleted or query-response MCA's.
*/
- pmc = kzalloc(sizeof(*pmc), GFP_ATOMIC);
+ pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
if (!pmc)
return;
- spin_lock_bh(&im->mca_lock);
- spin_lock_init(&pmc->mca_lock);
pmc->idev = im->idev;
in6_dev_hold(idev);
pmc->mca_addr = im->mca_addr;
@@ -752,101 +752,111 @@ static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
if (pmc->mca_sfmode == MCAST_INCLUDE) {
struct ip6_sf_list *psf;
- pmc->mca_tomb = im->mca_tomb;
- pmc->mca_sources = im->mca_sources;
- im->mca_tomb = im->mca_sources = NULL;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next)
+ rcu_assign_pointer(pmc->mca_tomb,
+ mc_dereference(im->mca_tomb, idev));
+ rcu_assign_pointer(pmc->mca_sources,
+ mc_dereference(im->mca_sources, idev));
+ RCU_INIT_POINTER(im->mca_tomb, NULL);
+ RCU_INIT_POINTER(im->mca_sources, NULL);
+
+ for_each_psf_mclock(pmc, psf)
psf->sf_crcount = pmc->mca_crcount;
}
- spin_unlock_bh(&im->mca_lock);
- spin_lock_bh(&idev->mc_lock);
- pmc->next = idev->mc_tomb;
- idev->mc_tomb = pmc;
- spin_unlock_bh(&idev->mc_lock);
+ rcu_assign_pointer(pmc->next, idev->mc_tomb);
+ rcu_assign_pointer(idev->mc_tomb, pmc);
}
static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
{
- struct ifmcaddr6 *pmc, *pmc_prev;
- struct ip6_sf_list *psf;
+ struct ip6_sf_list *psf, *sources, *tomb;
struct in6_addr *pmca = &im->mca_addr;
+ struct ifmcaddr6 *pmc, *pmc_prev;
+
+ mc_assert_locked(idev);
- spin_lock_bh(&idev->mc_lock);
pmc_prev = NULL;
- for (pmc = idev->mc_tomb; pmc; pmc = pmc->next) {
+ for_each_mc_tomb(idev, pmc) {
if (ipv6_addr_equal(&pmc->mca_addr, pmca))
break;
pmc_prev = pmc;
}
- if (pmc) {
- if (pmc_prev)
- pmc_prev->next = pmc->next;
- else
- idev->mc_tomb = pmc->next;
- }
- spin_unlock_bh(&idev->mc_lock);
-
- spin_lock_bh(&im->mca_lock);
- if (pmc) {
- im->idev = pmc->idev;
- if (im->mca_sfmode == MCAST_INCLUDE) {
- im->mca_tomb = pmc->mca_tomb;
- im->mca_sources = pmc->mca_sources;
- for (psf = im->mca_sources; psf; psf = psf->sf_next)
- psf->sf_crcount = idev->mc_qrv;
- } else {
- im->mca_crcount = idev->mc_qrv;
- }
- in6_dev_put(pmc->idev);
- kfree(pmc);
+ if (!pmc)
+ return;
+ if (pmc_prev)
+ rcu_assign_pointer(pmc_prev->next, pmc->next);
+ else
+ rcu_assign_pointer(idev->mc_tomb, pmc->next);
+
+ im->idev = pmc->idev;
+ if (im->mca_sfmode == MCAST_INCLUDE) {
+ tomb = rcu_replace_pointer(im->mca_tomb,
+ mc_dereference(pmc->mca_tomb, pmc->idev),
+ lockdep_is_held(&im->idev->mc_lock));
+ rcu_assign_pointer(pmc->mca_tomb, tomb);
+
+ sources = rcu_replace_pointer(im->mca_sources,
+ mc_dereference(pmc->mca_sources, pmc->idev),
+ lockdep_is_held(&im->idev->mc_lock));
+ rcu_assign_pointer(pmc->mca_sources, sources);
+ for_each_psf_mclock(im, psf)
+ psf->sf_crcount = idev->mc_qrv;
+ } else {
+ im->mca_crcount = idev->mc_qrv;
}
- spin_unlock_bh(&im->mca_lock);
+ ip6_mc_clear_src(pmc);
+ in6_dev_put(pmc->idev);
+ kfree_rcu(pmc, rcu);
}
static void mld_clear_delrec(struct inet6_dev *idev)
{
struct ifmcaddr6 *pmc, *nextpmc;
- spin_lock_bh(&idev->mc_lock);
- pmc = idev->mc_tomb;
- idev->mc_tomb = NULL;
- spin_unlock_bh(&idev->mc_lock);
+ mc_assert_locked(idev);
+
+ pmc = mc_dereference(idev->mc_tomb, idev);
+ RCU_INIT_POINTER(idev->mc_tomb, NULL);
for (; pmc; pmc = nextpmc) {
- nextpmc = pmc->next;
+ nextpmc = mc_dereference(pmc->next, idev);
ip6_mc_clear_src(pmc);
in6_dev_put(pmc->idev);
- kfree(pmc);
+ kfree_rcu(pmc, rcu);
}
/* clear dead sources, too */
- read_lock_bh(&idev->lock);
- for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
+ for_each_mc_mclock(idev, pmc) {
struct ip6_sf_list *psf, *psf_next;
- spin_lock_bh(&pmc->mca_lock);
- psf = pmc->mca_tomb;
- pmc->mca_tomb = NULL;
- spin_unlock_bh(&pmc->mca_lock);
+ psf = mc_dereference(pmc->mca_tomb, idev);
+ RCU_INIT_POINTER(pmc->mca_tomb, NULL);
for (; psf; psf = psf_next) {
- psf_next = psf->sf_next;
- kfree(psf);
+ psf_next = mc_dereference(psf->sf_next, idev);
+ kfree_rcu(psf, rcu);
}
}
- read_unlock_bh(&idev->lock);
}
-static void mca_get(struct ifmcaddr6 *mc)
+static void mld_clear_query(struct inet6_dev *idev)
{
- refcount_inc(&mc->mca_refcnt);
+ spin_lock_bh(&idev->mc_query_lock);
+ __skb_queue_purge(&idev->mc_query_queue);
+ spin_unlock_bh(&idev->mc_query_lock);
+}
+
+static void mld_clear_report(struct inet6_dev *idev)
+{
+ spin_lock_bh(&idev->mc_report_lock);
+ __skb_queue_purge(&idev->mc_report_queue);
+ spin_unlock_bh(&idev->mc_report_lock);
}
static void ma_put(struct ifmcaddr6 *mc)
{
if (refcount_dec_and_test(&mc->mca_refcnt)) {
in6_dev_put(mc->idev);
- kfree(mc);
+ kfree_rcu(mc, rcu);
}
}
@@ -856,11 +866,13 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
{
struct ifmcaddr6 *mc;
- mc = kzalloc(sizeof(*mc), GFP_ATOMIC);
+ mc_assert_locked(idev);
+
+ mc = kzalloc(sizeof(*mc), GFP_KERNEL);
if (!mc)
return NULL;
- timer_setup(&mc->mca_timer, igmp6_timer_handler, 0);
+ INIT_DELAYED_WORK(&mc->mca_work, mld_mca_work);
mc->mca_addr = *addr;
mc->idev = idev; /* reference taken by caller */
@@ -868,7 +880,6 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
/* mca_stamp should be updated upon changes */
mc->mca_cstamp = mc->mca_tstamp = jiffies;
refcount_set(&mc->mca_refcnt, 1);
- spin_lock_init(&mc->mca_lock);
mc->mca_sfmode = mode;
mc->mca_sfcount[mode] = 1;
@@ -880,35 +891,68 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
return mc;
}
+static void inet6_ifmcaddr_notify(struct net_device *dev,
+ const struct ifmcaddr6 *ifmca, int event)
+{
+ struct inet6_fill_args fillargs = {
+ .portid = 0,
+ .seq = 0,
+ .event = event,
+ .flags = 0,
+ .netnsid = -1,
+ .force_rt_scope_universe = true,
+ };
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+
+ skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
+ nla_total_size(sizeof(struct in6_addr)) +
+ nla_total_size(sizeof(struct ifa_cacheinfo)),
+ GFP_KERNEL);
+ if (!skb)
+ goto error;
+
+ err = inet6_fill_ifmcaddr(skb, ifmca, &fillargs);
+ if (err < 0) {
+ WARN_ON_ONCE(err == -EMSGSIZE);
+ nlmsg_free(skb);
+ goto error;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MCADDR, NULL, GFP_KERNEL);
+ return;
+error:
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_MCADDR, err);
+}
+
/*
* device multicast group inc (add if not found)
*/
static int __ipv6_dev_mc_inc(struct net_device *dev,
const struct in6_addr *addr, unsigned int mode)
{
- struct ifmcaddr6 *mc;
struct inet6_dev *idev;
-
- ASSERT_RTNL();
+ struct ifmcaddr6 *mc;
/* we need to take a reference on idev */
idev = in6_dev_get(dev);
-
if (!idev)
return -EINVAL;
- write_lock_bh(&idev->lock);
- if (idev->dead) {
- write_unlock_bh(&idev->lock);
+ mutex_lock(&idev->mc_lock);
+
+ if (READ_ONCE(idev->dead)) {
+ mutex_unlock(&idev->mc_lock);
in6_dev_put(idev);
return -ENODEV;
}
- for (mc = idev->mc_list; mc; mc = mc->next) {
+ for_each_mc_mclock(idev, mc) {
if (ipv6_addr_equal(&mc->mca_addr, addr)) {
mc->mca_users++;
- write_unlock_bh(&idev->lock);
ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0);
+ mutex_unlock(&idev->mc_lock);
in6_dev_put(idev);
return 0;
}
@@ -916,23 +960,19 @@ static int __ipv6_dev_mc_inc(struct net_device *dev,
mc = mca_alloc(idev, addr, mode);
if (!mc) {
- write_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
in6_dev_put(idev);
return -ENOMEM;
}
- mc->next = idev->mc_list;
- idev->mc_list = mc;
-
- /* Hold this for the code below before we unlock,
- * it is already exposed via idev->mc_list.
- */
- mca_get(mc);
- write_unlock_bh(&idev->lock);
+ rcu_assign_pointer(mc->next, idev->mc_list);
+ rcu_assign_pointer(idev->mc_list, mc);
mld_del_delrec(idev, mc);
igmp6_group_added(mc);
- ma_put(mc);
+ inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST);
+ mutex_unlock(&idev->mc_lock);
+
return 0;
}
@@ -940,35 +980,39 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
{
return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE);
}
+EXPORT_SYMBOL(ipv6_dev_mc_inc);
/*
- * device multicast group del
+ * device multicast group del
*/
int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr)
{
- struct ifmcaddr6 *ma, **map;
+ struct ifmcaddr6 *ma, __rcu **map;
- ASSERT_RTNL();
+ mutex_lock(&idev->mc_lock);
- write_lock_bh(&idev->lock);
- for (map = &idev->mc_list; (ma = *map) != NULL; map = &ma->next) {
+ for (map = &idev->mc_list;
+ (ma = mc_dereference(*map, idev));
+ map = &ma->next) {
if (ipv6_addr_equal(&ma->mca_addr, addr)) {
if (--ma->mca_users == 0) {
*map = ma->next;
- write_unlock_bh(&idev->lock);
igmp6_group_dropped(ma);
+ inet6_ifmcaddr_notify(idev->dev, ma,
+ RTM_DELMULTICAST);
ip6_mc_clear_src(ma);
+ mutex_unlock(&idev->mc_lock);
ma_put(ma);
return 0;
}
- write_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
return 0;
}
}
- write_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
return -ENOENT;
}
@@ -977,16 +1021,16 @@ int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr)
struct inet6_dev *idev;
int err;
- ASSERT_RTNL();
-
- idev = __in6_dev_get(dev);
+ idev = in6_dev_get(dev);
if (!idev)
- err = -ENODEV;
- else
- err = __ipv6_dev_mc_dec(idev, addr);
+ return -ENODEV;
+
+ err = __ipv6_dev_mc_dec(idev, addr);
+ in6_dev_put(idev);
return err;
}
+EXPORT_SYMBOL(ipv6_dev_mc_dec);
/*
* check if the interface/address pair is valid
@@ -1000,105 +1044,125 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
rcu_read_lock();
idev = __in6_dev_get(dev);
- if (idev) {
- read_lock_bh(&idev->lock);
- for (mc = idev->mc_list; mc; mc = mc->next) {
- if (ipv6_addr_equal(&mc->mca_addr, group))
- break;
- }
- if (mc) {
- if (src_addr && !ipv6_addr_any(src_addr)) {
- struct ip6_sf_list *psf;
+ if (!idev)
+ goto unlock;
+ for_each_mc_rcu(idev, mc) {
+ if (ipv6_addr_equal(&mc->mca_addr, group))
+ break;
+ }
+ if (!mc)
+ goto unlock;
+ if (src_addr && !ipv6_addr_any(src_addr)) {
+ struct ip6_sf_list *psf;
- spin_lock_bh(&mc->mca_lock);
- for (psf = mc->mca_sources; psf; psf = psf->sf_next) {
- if (ipv6_addr_equal(&psf->sf_addr, src_addr))
- break;
- }
- if (psf)
- rv = psf->sf_count[MCAST_INCLUDE] ||
- psf->sf_count[MCAST_EXCLUDE] !=
- mc->mca_sfcount[MCAST_EXCLUDE];
- else
- rv = mc->mca_sfcount[MCAST_EXCLUDE] != 0;
- spin_unlock_bh(&mc->mca_lock);
- } else
- rv = true; /* don't filter unspecified source */
+ for_each_psf_rcu(mc, psf) {
+ if (ipv6_addr_equal(&psf->sf_addr, src_addr))
+ break;
}
- read_unlock_bh(&idev->lock);
+ if (psf)
+ rv = READ_ONCE(psf->sf_count[MCAST_INCLUDE]) ||
+ READ_ONCE(psf->sf_count[MCAST_EXCLUDE]) !=
+ READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]);
+ else
+ rv = READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]) != 0;
+ } else {
+ rv = true; /* don't filter unspecified source */
}
+unlock:
rcu_read_unlock();
return rv;
}
-static void mld_gq_start_timer(struct inet6_dev *idev)
+static void mld_gq_start_work(struct inet6_dev *idev)
{
- unsigned long tv = prandom_u32() % idev->mc_maxdelay;
+ unsigned long tv = get_random_u32_below(idev->mc_maxdelay);
+
+ mc_assert_locked(idev);
idev->mc_gq_running = 1;
- if (!mod_timer(&idev->mc_gq_timer, jiffies+tv+2))
+ if (!mod_delayed_work(mld_wq, &idev->mc_gq_work, tv + 2))
in6_dev_hold(idev);
}
-static void mld_gq_stop_timer(struct inet6_dev *idev)
+static void mld_gq_stop_work(struct inet6_dev *idev)
{
+ mc_assert_locked(idev);
+
idev->mc_gq_running = 0;
- if (del_timer(&idev->mc_gq_timer))
+ if (cancel_delayed_work(&idev->mc_gq_work))
__in6_dev_put(idev);
}
-static void mld_ifc_start_timer(struct inet6_dev *idev, unsigned long delay)
+static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay)
{
- unsigned long tv = prandom_u32() % delay;
+ unsigned long tv = get_random_u32_below(delay);
- if (!mod_timer(&idev->mc_ifc_timer, jiffies+tv+2))
+ mc_assert_locked(idev);
+
+ if (!mod_delayed_work(mld_wq, &idev->mc_ifc_work, tv + 2))
in6_dev_hold(idev);
}
-static void mld_ifc_stop_timer(struct inet6_dev *idev)
+static void mld_ifc_stop_work(struct inet6_dev *idev)
{
+ mc_assert_locked(idev);
+
idev->mc_ifc_count = 0;
- if (del_timer(&idev->mc_ifc_timer))
+ if (cancel_delayed_work(&idev->mc_ifc_work))
__in6_dev_put(idev);
}
-static void mld_dad_start_timer(struct inet6_dev *idev, unsigned long delay)
+static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay)
{
- unsigned long tv = prandom_u32() % delay;
+ unsigned long tv = get_random_u32_below(delay);
+
+ mc_assert_locked(idev);
- if (!mod_timer(&idev->mc_dad_timer, jiffies+tv+2))
+ if (!mod_delayed_work(mld_wq, &idev->mc_dad_work, tv + 2))
in6_dev_hold(idev);
}
-static void mld_dad_stop_timer(struct inet6_dev *idev)
+static void mld_dad_stop_work(struct inet6_dev *idev)
{
- if (del_timer(&idev->mc_dad_timer))
+ if (cancel_delayed_work(&idev->mc_dad_work))
__in6_dev_put(idev);
}
-/*
- * IGMP handling (alias multicast ICMPv6 messages)
- */
+static void mld_query_stop_work(struct inet6_dev *idev)
+{
+ spin_lock_bh(&idev->mc_query_lock);
+ if (cancel_delayed_work(&idev->mc_query_work))
+ __in6_dev_put(idev);
+ spin_unlock_bh(&idev->mc_query_lock);
+}
+static void mld_report_stop_work(struct inet6_dev *idev)
+{
+ if (cancel_delayed_work_sync(&idev->mc_report_work))
+ __in6_dev_put(idev);
+}
+
+/* IGMP handling (alias multicast ICMPv6 messages) */
static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime)
{
unsigned long delay = resptime;
- /* Do not start timer for these addresses */
+ mc_assert_locked(ma->idev);
+
+ /* Do not start work for these addresses */
if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) ||
IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
return;
- if (del_timer(&ma->mca_timer)) {
+ if (cancel_delayed_work(&ma->mca_work)) {
refcount_dec(&ma->mca_refcnt);
- delay = ma->mca_timer.expires - jiffies;
+ delay = ma->mca_work.timer.expires - jiffies;
}
if (delay >= resptime)
- delay = prandom_u32() % resptime;
+ delay = get_random_u32_below(resptime);
- ma->mca_timer.expires = jiffies + delay;
- if (!mod_timer(&ma->mca_timer, jiffies + delay))
+ if (!mod_delayed_work(mld_wq, &ma->mca_work, delay))
refcount_inc(&ma->mca_refcnt);
ma->mca_flags |= MAF_TIMER_RUNNING;
}
@@ -1110,8 +1174,10 @@ static bool mld_xmarksources(struct ifmcaddr6 *pmc, int nsrcs,
struct ip6_sf_list *psf;
int i, scount;
+ mc_assert_locked(pmc->idev);
+
scount = 0;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (scount == nsrcs)
break;
for (i = 0; i < nsrcs; i++) {
@@ -1138,13 +1204,15 @@ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
struct ip6_sf_list *psf;
int i, scount;
+ mc_assert_locked(pmc->idev);
+
if (pmc->mca_sfmode == MCAST_EXCLUDE)
return mld_xmarksources(pmc, nsrcs, srcs);
/* mark INCLUDE-mode sources */
scount = 0;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (scount == nsrcs)
break;
for (i = 0; i < nsrcs; i++) {
@@ -1165,15 +1233,15 @@ static bool mld_marksources(struct ifmcaddr6 *pmc, int nsrcs,
static int mld_force_mld_version(const struct inet6_dev *idev)
{
+ const struct net *net = dev_net(idev->dev);
+ int all_force;
+
+ all_force = READ_ONCE(net->ipv6.devconf_all->force_mld_version);
/* Normally, both are 0 here. If enforcement to a particular is
* being used, individual device enforcement will have a lower
* precedence over 'all' device (.../conf/all/force_mld_version).
*/
-
- if (dev_net(idev->dev)->ipv6.devconf_all->force_mld_version != 0)
- return dev_net(idev->dev)->ipv6.devconf_all->force_mld_version;
- else
- return idev->cnf.force_mld_version;
+ return all_force ?: READ_ONCE(idev->cnf.force_mld_version);
}
static bool mld_in_v2_mode_only(const struct inet6_dev *idev)
@@ -1309,18 +1377,18 @@ static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld,
if (v1_query)
mld_set_v1_mode(idev);
- /* cancel MLDv2 report timer */
- mld_gq_stop_timer(idev);
- /* cancel the interface change timer */
- mld_ifc_stop_timer(idev);
+ /* cancel MLDv2 report work */
+ mld_gq_stop_work(idev);
+ /* cancel the interface change work */
+ mld_ifc_stop_work(idev);
/* clear deleted report items */
mld_clear_delrec(idev);
return 0;
}
-static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld,
- unsigned long *max_delay)
+static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld,
+ unsigned long *max_delay)
{
*max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL);
@@ -1330,24 +1398,43 @@ static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld,
idev->mc_maxdelay = *max_delay;
- return 0;
+ return;
}
/* called with rcu_read_lock() */
-int igmp6_event_query(struct sk_buff *skb)
+void igmp6_event_query(struct sk_buff *skb)
+{
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
+
+ if (!idev || idev->dead)
+ goto out;
+
+ spin_lock_bh(&idev->mc_query_lock);
+ if (skb_queue_len(&idev->mc_query_queue) < MLD_MAX_SKBS) {
+ __skb_queue_tail(&idev->mc_query_queue, skb);
+ if (!mod_delayed_work(mld_wq, &idev->mc_query_work, 0))
+ in6_dev_hold(idev);
+ skb = NULL;
+ }
+ spin_unlock_bh(&idev->mc_query_lock);
+out:
+ kfree_skb(skb);
+}
+
+static void __mld_query_work(struct sk_buff *skb)
{
struct mld2_query *mlh2 = NULL;
- struct ifmcaddr6 *ma;
const struct in6_addr *group;
unsigned long max_delay;
struct inet6_dev *idev;
+ struct ifmcaddr6 *ma;
struct mld_msg *mld;
int group_type;
int mark = 0;
int len, err;
if (!pskb_may_pull(skb, sizeof(struct in6_addr)))
- return -EINVAL;
+ goto kfree_skb;
/* compute payload length excluding extension headers */
len = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr);
@@ -1364,11 +1451,11 @@ int igmp6_event_query(struct sk_buff *skb)
ipv6_hdr(skb)->hop_limit != 1 ||
!(IP6CB(skb)->flags & IP6SKB_ROUTERALERT) ||
IP6CB(skb)->ra != htons(IPV6_OPT_ROUTERALERT_MLD))
- return -EINVAL;
+ goto kfree_skb;
- idev = __in6_dev_get(skb->dev);
+ idev = in6_dev_get(skb->dev);
if (!idev)
- return 0;
+ goto kfree_skb;
mld = (struct mld_msg *)icmp6_hdr(skb);
group = &mld->mld_mca;
@@ -1376,60 +1463,54 @@ int igmp6_event_query(struct sk_buff *skb)
if (group_type != IPV6_ADDR_ANY &&
!(group_type&IPV6_ADDR_MULTICAST))
- return -EINVAL;
+ goto out;
if (len < MLD_V1_QUERY_LEN) {
- return -EINVAL;
+ goto out;
} else if (len == MLD_V1_QUERY_LEN || mld_in_v1_mode(idev)) {
err = mld_process_v1(idev, mld, &max_delay,
len == MLD_V1_QUERY_LEN);
if (err < 0)
- return err;
+ goto out;
} else if (len >= MLD_V2_QUERY_LEN_MIN) {
int srcs_offset = sizeof(struct mld2_query) -
sizeof(struct icmp6hdr);
if (!pskb_may_pull(skb, srcs_offset))
- return -EINVAL;
+ goto out;
mlh2 = (struct mld2_query *)skb_transport_header(skb);
- err = mld_process_v2(idev, mlh2, &max_delay);
- if (err < 0)
- return err;
+ mld_process_v2(idev, mlh2, &max_delay);
if (group_type == IPV6_ADDR_ANY) { /* general query */
if (mlh2->mld2q_nsrcs)
- return -EINVAL; /* no sources allowed */
+ goto out; /* no sources allowed */
- mld_gq_start_timer(idev);
- return 0;
+ mld_gq_start_work(idev);
+ goto out;
}
/* mark sources to include, if group & source-specific */
if (mlh2->mld2q_nsrcs != 0) {
if (!pskb_may_pull(skb, srcs_offset +
ntohs(mlh2->mld2q_nsrcs) * sizeof(struct in6_addr)))
- return -EINVAL;
+ goto out;
mlh2 = (struct mld2_query *)skb_transport_header(skb);
mark = 1;
}
} else {
- return -EINVAL;
+ goto out;
}
- read_lock_bh(&idev->lock);
if (group_type == IPV6_ADDR_ANY) {
- for (ma = idev->mc_list; ma; ma = ma->next) {
- spin_lock_bh(&ma->mca_lock);
+ for_each_mc_mclock(idev, ma) {
igmp6_group_queried(ma, max_delay);
- spin_unlock_bh(&ma->mca_lock);
}
} else {
- for (ma = idev->mc_list; ma; ma = ma->next) {
+ for_each_mc_mclock(idev, ma) {
if (!ipv6_addr_equal(group, &ma->mca_addr))
continue;
- spin_lock_bh(&ma->mca_lock);
if (ma->mca_flags & MAF_TIMER_RUNNING) {
/* gsquery <- gsquery && mark */
if (!mark)
@@ -1444,34 +1525,88 @@ int igmp6_event_query(struct sk_buff *skb)
if (!(ma->mca_flags & MAF_GSQUERY) ||
mld_marksources(ma, ntohs(mlh2->mld2q_nsrcs), mlh2->mld2q_srcs))
igmp6_group_queried(ma, max_delay);
- spin_unlock_bh(&ma->mca_lock);
break;
}
}
- read_unlock_bh(&idev->lock);
- return 0;
+out:
+ in6_dev_put(idev);
+kfree_skb:
+ consume_skb(skb);
+}
+
+static void mld_query_work(struct work_struct *work)
+{
+ struct inet6_dev *idev = container_of(to_delayed_work(work),
+ struct inet6_dev,
+ mc_query_work);
+ struct sk_buff_head q;
+ struct sk_buff *skb;
+ bool rework = false;
+ int cnt = 0;
+
+ skb_queue_head_init(&q);
+
+ spin_lock_bh(&idev->mc_query_lock);
+ while ((skb = __skb_dequeue(&idev->mc_query_queue))) {
+ __skb_queue_tail(&q, skb);
+
+ if (++cnt >= MLD_MAX_QUEUE) {
+ rework = true;
+ break;
+ }
+ }
+ spin_unlock_bh(&idev->mc_query_lock);
+
+ mutex_lock(&idev->mc_lock);
+ while ((skb = __skb_dequeue(&q)))
+ __mld_query_work(skb);
+ mutex_unlock(&idev->mc_lock);
+
+ if (rework && queue_delayed_work(mld_wq, &idev->mc_query_work, 0))
+ return;
+
+ in6_dev_put(idev);
}
/* called with rcu_read_lock() */
-int igmp6_event_report(struct sk_buff *skb)
+void igmp6_event_report(struct sk_buff *skb)
+{
+ struct inet6_dev *idev = __in6_dev_get(skb->dev);
+
+ if (!idev || idev->dead)
+ goto out;
+
+ spin_lock_bh(&idev->mc_report_lock);
+ if (skb_queue_len(&idev->mc_report_queue) < MLD_MAX_SKBS) {
+ __skb_queue_tail(&idev->mc_report_queue, skb);
+ if (!mod_delayed_work(mld_wq, &idev->mc_report_work, 0))
+ in6_dev_hold(idev);
+ skb = NULL;
+ }
+ spin_unlock_bh(&idev->mc_report_lock);
+out:
+ kfree_skb(skb);
+}
+
+static void __mld_report_work(struct sk_buff *skb)
{
- struct ifmcaddr6 *ma;
struct inet6_dev *idev;
+ struct ifmcaddr6 *ma;
struct mld_msg *mld;
int addr_type;
/* Our own report looped back. Ignore it. */
if (skb->pkt_type == PACKET_LOOPBACK)
- return 0;
+ goto kfree_skb;
/* send our report if the MC router may not have heard this report */
if (skb->pkt_type != PACKET_MULTICAST &&
skb->pkt_type != PACKET_BROADCAST)
- return 0;
+ goto kfree_skb;
if (!pskb_may_pull(skb, sizeof(*mld) - sizeof(struct icmp6hdr)))
- return -EINVAL;
+ goto kfree_skb;
mld = (struct mld_msg *)icmp6_hdr(skb);
@@ -1479,29 +1614,62 @@ int igmp6_event_report(struct sk_buff *skb)
addr_type = ipv6_addr_type(&ipv6_hdr(skb)->saddr);
if (addr_type != IPV6_ADDR_ANY &&
!(addr_type&IPV6_ADDR_LINKLOCAL))
- return -EINVAL;
+ goto kfree_skb;
- idev = __in6_dev_get(skb->dev);
+ idev = in6_dev_get(skb->dev);
if (!idev)
- return -ENODEV;
+ goto kfree_skb;
/*
- * Cancel the timer for this group
+ * Cancel the work for this group
*/
- read_lock_bh(&idev->lock);
- for (ma = idev->mc_list; ma; ma = ma->next) {
+ for_each_mc_mclock(idev, ma) {
if (ipv6_addr_equal(&ma->mca_addr, &mld->mld_mca)) {
- spin_lock(&ma->mca_lock);
- if (del_timer(&ma->mca_timer))
+ if (cancel_delayed_work(&ma->mca_work))
refcount_dec(&ma->mca_refcnt);
- ma->mca_flags &= ~(MAF_LAST_REPORTER|MAF_TIMER_RUNNING);
- spin_unlock(&ma->mca_lock);
+ ma->mca_flags &= ~(MAF_LAST_REPORTER |
+ MAF_TIMER_RUNNING);
break;
}
}
- read_unlock_bh(&idev->lock);
- return 0;
+
+ in6_dev_put(idev);
+kfree_skb:
+ consume_skb(skb);
+}
+
+static void mld_report_work(struct work_struct *work)
+{
+ struct inet6_dev *idev = container_of(to_delayed_work(work),
+ struct inet6_dev,
+ mc_report_work);
+ struct sk_buff_head q;
+ struct sk_buff *skb;
+ bool rework = false;
+ int cnt = 0;
+
+ skb_queue_head_init(&q);
+ spin_lock_bh(&idev->mc_report_lock);
+ while ((skb = __skb_dequeue(&idev->mc_report_queue))) {
+ __skb_queue_tail(&q, skb);
+
+ if (++cnt >= MLD_MAX_QUEUE) {
+ rework = true;
+ break;
+ }
+ }
+ spin_unlock_bh(&idev->mc_report_lock);
+
+ mutex_lock(&idev->mc_lock);
+ while ((skb = __skb_dequeue(&q)))
+ __mld_report_work(skb);
+ mutex_unlock(&idev->mc_lock);
+
+ if (rework && queue_delayed_work(mld_wq, &idev->mc_report_work, 0))
+ return;
+
+ in6_dev_put(idev);
}
static bool is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type,
@@ -1554,7 +1722,7 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted)
struct ip6_sf_list *psf;
int scount = 0;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (!is_in(pmc, psf, type, gdeleted, sdeleted))
continue;
scount++;
@@ -1562,11 +1730,9 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted)
return scount;
}
-static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb,
- struct net_device *dev,
- const struct in6_addr *saddr,
- const struct in6_addr *daddr,
- int proto, int len)
+static void ip6_mc_hdr(const struct sock *sk, struct sk_buff *skb,
+ struct net_device *dev, const struct in6_addr *saddr,
+ const struct in6_addr *daddr, int proto, int len)
{
struct ipv6hdr *hdr;
@@ -1581,7 +1747,7 @@ static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb,
hdr->payload_len = htons(len);
hdr->nexthdr = proto;
- hdr->hop_limit = inet6_sk(sk)->hop_limit;
+ hdr->hop_limit = READ_ONCE(inet6_sk(sk)->hop_limit);
hdr->saddr = *saddr;
hdr->daddr = *daddr;
@@ -1589,26 +1755,24 @@ static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb,
static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
{
+ u8 ra[8] = { IPPROTO_ICMPV6, 0, IPV6_TLV_ROUTERALERT,
+ 2, 0, 0, IPV6_TLV_PADN, 0 };
struct net_device *dev = idev->dev;
- struct net *net = dev_net(dev);
- struct sock *sk = net->ipv6.igmp_sk;
- struct sk_buff *skb;
- struct mld2_report *pmr;
- struct in6_addr addr_buf;
- const struct in6_addr *saddr;
int hlen = LL_RESERVED_SPACE(dev);
int tlen = dev->needed_tailroom;
- unsigned int size = mtu + hlen + tlen;
- int err;
- u8 ra[8] = { IPPROTO_ICMPV6, 0,
- IPV6_TLV_ROUTERALERT, 2, 0, 0,
- IPV6_TLV_PADN, 0 };
-
- /* we assume size > sizeof(ra) here */
- /* limit our allocations to order-0 page */
- size = min_t(int, size, SKB_MAX_ORDER(0, 0));
- skb = sock_alloc_send_skb(sk, size, 1, &err);
+ const struct in6_addr *saddr;
+ struct in6_addr addr_buf;
+ struct mld2_report *pmr;
+ struct sk_buff *skb;
+ unsigned int size;
+ struct sock *sk;
+ struct net *net;
+ /* we assume size > sizeof(ra) here
+ * Also try to not allocate high-order pages for big MTU
+ */
+ size = min_t(int, mtu, PAGE_SIZE / 2) + hlen + tlen;
+ skb = alloc_skb(size, GFP_KERNEL);
if (!skb)
return NULL;
@@ -1616,7 +1780,13 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
skb_reserve(skb, hlen);
skb_tailroom_reserve(skb, mtu, tlen);
- if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) {
+ rcu_read_lock();
+
+ net = dev_net_rcu(dev);
+ sk = net->ipv6.igmp_sk;
+ skb_set_owner_w(skb, sk);
+
+ if (ipv6_get_lladdr(dev, &addr_buf, IFA_F_TENTATIVE)) {
/* <draft-ietf-magma-mld-source-05.txt>:
* use unspecified address as the source address
* when a valid link-local address is not available.
@@ -1627,6 +1797,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
ip6_mc_hdr(sk, skb, dev, saddr, &mld2_all_mcr, NEXTHDR_HOP, 0);
+ rcu_read_unlock();
+
skb_put_data(skb, ra, sizeof(ra));
skb_set_transport_header(skb, skb_tail_pointer(skb) - skb->data);
@@ -1654,7 +1826,7 @@ static void mld_sendpack(struct sk_buff *skb)
rcu_read_lock();
idev = __in6_dev_get(skb->dev);
- IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
payload_len = (skb_tail_pointer(skb) - skb_network_header(skb)) -
sizeof(*pip6);
@@ -1729,16 +1901,20 @@ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc,
#define AVAILABLE(skb) ((skb) ? skb_availroom(skb) : 0)
static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
- int type, int gdeleted, int sdeleted, int crsend)
+ int type, int gdeleted, int sdeleted,
+ int crsend)
{
+ struct ip6_sf_list *psf, *psf_prev, *psf_next;
+ int scount, stotal, first, isquery, truncate;
+ struct ip6_sf_list __rcu **psf_list;
struct inet6_dev *idev = pmc->idev;
struct net_device *dev = idev->dev;
- struct mld2_report *pmr;
struct mld2_grec *pgr = NULL;
- struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list;
- int scount, stotal, first, isquery, truncate;
+ struct mld2_report *pmr;
unsigned int mtu;
+ mc_assert_locked(idev);
+
if (pmc->mca_flags & MAF_NOREPORT)
return skb;
@@ -1755,7 +1931,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources;
- if (!*psf_list)
+ if (!rcu_access_pointer(*psf_list))
goto empty_source;
pmr = skb ? (struct mld2_report *)skb_transport_header(skb) : NULL;
@@ -1771,10 +1947,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
}
first = 1;
psf_prev = NULL;
- for (psf = *psf_list; psf; psf = psf_next) {
+ for (psf = mc_dereference(*psf_list, idev);
+ psf;
+ psf = psf_next) {
struct in6_addr *psrc;
- psf_next = psf->sf_next;
+ psf_next = mc_dereference(psf->sf_next, idev);
if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) {
psf_prev = psf;
@@ -1821,10 +1999,12 @@ decrease_sf_crcount:
psf->sf_crcount--;
if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
if (psf_prev)
- psf_prev->sf_next = psf->sf_next;
+ rcu_assign_pointer(psf_prev->sf_next,
+ mc_dereference(psf->sf_next, idev));
else
- *psf_list = psf->sf_next;
- kfree(psf);
+ rcu_assign_pointer(*psf_list,
+ mc_dereference(psf->sf_next, idev));
+ kfree_rcu(psf, rcu);
continue;
}
}
@@ -1858,51 +2038,50 @@ static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc)
struct sk_buff *skb = NULL;
int type;
- read_lock_bh(&idev->lock);
+ mc_assert_locked(idev);
+
if (!pmc) {
- for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
+ for_each_mc_mclock(idev, pmc) {
if (pmc->mca_flags & MAF_NOREPORT)
continue;
- spin_lock_bh(&pmc->mca_lock);
if (pmc->mca_sfcount[MCAST_EXCLUDE])
type = MLD2_MODE_IS_EXCLUDE;
else
type = MLD2_MODE_IS_INCLUDE;
skb = add_grec(skb, pmc, type, 0, 0, 0);
- spin_unlock_bh(&pmc->mca_lock);
}
} else {
- spin_lock_bh(&pmc->mca_lock);
if (pmc->mca_sfcount[MCAST_EXCLUDE])
type = MLD2_MODE_IS_EXCLUDE;
else
type = MLD2_MODE_IS_INCLUDE;
skb = add_grec(skb, pmc, type, 0, 0, 0);
- spin_unlock_bh(&pmc->mca_lock);
}
- read_unlock_bh(&idev->lock);
if (skb)
mld_sendpack(skb);
}
-/*
- * remove zero-count source records from a source filter list
- */
-static void mld_clear_zeros(struct ip6_sf_list **ppsf)
+/* remove zero-count source records from a source filter list */
+static void mld_clear_zeros(struct ip6_sf_list __rcu **ppsf, struct inet6_dev *idev)
{
struct ip6_sf_list *psf_prev, *psf_next, *psf;
psf_prev = NULL;
- for (psf = *ppsf; psf; psf = psf_next) {
- psf_next = psf->sf_next;
+ for (psf = mc_dereference(*ppsf, idev);
+ psf;
+ psf = psf_next) {
+ psf_next = mc_dereference(psf->sf_next, idev);
if (psf->sf_crcount == 0) {
if (psf_prev)
- psf_prev->sf_next = psf->sf_next;
+ rcu_assign_pointer(psf_prev->sf_next,
+ mc_dereference(psf->sf_next, idev));
else
- *ppsf = psf->sf_next;
- kfree(psf);
- } else
+ rcu_assign_pointer(*ppsf,
+ mc_dereference(psf->sf_next, idev));
+ kfree_rcu(psf, rcu);
+ } else {
psf_prev = psf;
+ }
}
}
@@ -1912,13 +2091,12 @@ static void mld_send_cr(struct inet6_dev *idev)
struct sk_buff *skb = NULL;
int type, dtype;
- read_lock_bh(&idev->lock);
- spin_lock(&idev->mc_lock);
-
/* deleted MCA's */
pmc_prev = NULL;
- for (pmc = idev->mc_tomb; pmc; pmc = pmc_next) {
- pmc_next = pmc->next;
+ for (pmc = mc_dereference(idev->mc_tomb, idev);
+ pmc;
+ pmc = pmc_next) {
+ pmc_next = mc_dereference(pmc->next, idev);
if (pmc->mca_sfmode == MCAST_INCLUDE) {
type = MLD2_BLOCK_OLD_SOURCES;
dtype = MLD2_BLOCK_OLD_SOURCES;
@@ -1932,26 +2110,25 @@ static void mld_send_cr(struct inet6_dev *idev)
}
pmc->mca_crcount--;
if (pmc->mca_crcount == 0) {
- mld_clear_zeros(&pmc->mca_tomb);
- mld_clear_zeros(&pmc->mca_sources);
+ mld_clear_zeros(&pmc->mca_tomb, idev);
+ mld_clear_zeros(&pmc->mca_sources, idev);
}
}
- if (pmc->mca_crcount == 0 && !pmc->mca_tomb &&
- !pmc->mca_sources) {
+ if (pmc->mca_crcount == 0 &&
+ !rcu_access_pointer(pmc->mca_tomb) &&
+ !rcu_access_pointer(pmc->mca_sources)) {
if (pmc_prev)
- pmc_prev->next = pmc_next;
+ rcu_assign_pointer(pmc_prev->next, pmc_next);
else
- idev->mc_tomb = pmc_next;
+ rcu_assign_pointer(idev->mc_tomb, pmc_next);
in6_dev_put(pmc->idev);
- kfree(pmc);
+ kfree_rcu(pmc, rcu);
} else
pmc_prev = pmc;
}
- spin_unlock(&idev->mc_lock);
/* change recs */
- for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
- spin_lock_bh(&pmc->mca_lock);
+ for_each_mc_mclock(idev, pmc) {
if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
type = MLD2_BLOCK_OLD_SOURCES;
dtype = MLD2_ALLOW_NEW_SOURCES;
@@ -1971,9 +2148,7 @@ static void mld_send_cr(struct inet6_dev *idev)
skb = add_grec(skb, pmc, type, 0, 0, 0);
pmc->mca_crcount--;
}
- spin_unlock_bh(&pmc->mca_lock);
}
- read_unlock_bh(&idev->lock);
if (!skb)
return;
(void) mld_sendpack(skb);
@@ -1981,21 +2156,21 @@ static void mld_send_cr(struct inet6_dev *idev)
static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
{
- struct net *net = dev_net(dev);
- struct sock *sk = net->ipv6.igmp_sk;
+ const struct in6_addr *snd_addr, *saddr;
+ int err, len, payload_len, full_len;
+ struct in6_addr addr_buf;
struct inet6_dev *idev;
struct sk_buff *skb;
struct mld_msg *hdr;
- const struct in6_addr *snd_addr, *saddr;
- struct in6_addr addr_buf;
int hlen = LL_RESERVED_SPACE(dev);
int tlen = dev->needed_tailroom;
- int err, len, payload_len, full_len;
u8 ra[8] = { IPPROTO_ICMPV6, 0,
IPV6_TLV_ROUTERALERT, 2, 0, 0,
IPV6_TLV_PADN, 0 };
- struct flowi6 fl6;
struct dst_entry *dst;
+ struct flowi6 fl6;
+ struct net *net;
+ struct sock *sk;
if (type == ICMPV6_MGM_REDUCTION)
snd_addr = &in6addr_linklocal_allrouters;
@@ -2006,20 +2181,21 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
payload_len = len + sizeof(ra);
full_len = sizeof(struct ipv6hdr) + payload_len;
- rcu_read_lock();
- IP6_UPD_PO_STATS(net, __in6_dev_get(dev),
- IPSTATS_MIB_OUT, full_len);
- rcu_read_unlock();
+ skb = alloc_skb(hlen + tlen + full_len, GFP_KERNEL);
- skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err);
+ rcu_read_lock();
+ net = dev_net_rcu(dev);
+ idev = __in6_dev_get(dev);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
if (!skb) {
- rcu_read_lock();
- IP6_INC_STATS(net, __in6_dev_get(dev),
- IPSTATS_MIB_OUTDISCARDS);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
rcu_read_unlock();
return;
}
+ sk = net->ipv6.igmp_sk;
+ skb_set_owner_w(skb, sk);
+
skb->priority = TC_PRIO_CONTROL;
skb_reserve(skb, hlen);
@@ -2044,9 +2220,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
IPPROTO_ICMPV6,
csum_partial(hdr, len, 0));
- rcu_read_lock();
- idev = __in6_dev_get(skb->dev);
-
icmpv6_flow_init(sk, &fl6, type,
&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
skb->dev->ifindex);
@@ -2077,63 +2250,68 @@ err_out:
static void mld_send_initial_cr(struct inet6_dev *idev)
{
- struct sk_buff *skb;
struct ifmcaddr6 *pmc;
+ struct sk_buff *skb;
int type;
+ mc_assert_locked(idev);
+
if (mld_in_v1_mode(idev))
return;
skb = NULL;
- read_lock_bh(&idev->lock);
- for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
- spin_lock_bh(&pmc->mca_lock);
+ for_each_mc_mclock(idev, pmc) {
if (pmc->mca_sfcount[MCAST_EXCLUDE])
type = MLD2_CHANGE_TO_EXCLUDE;
else
type = MLD2_ALLOW_NEW_SOURCES;
skb = add_grec(skb, pmc, type, 0, 0, 1);
- spin_unlock_bh(&pmc->mca_lock);
}
- read_unlock_bh(&idev->lock);
if (skb)
mld_sendpack(skb);
}
void ipv6_mc_dad_complete(struct inet6_dev *idev)
{
+ mutex_lock(&idev->mc_lock);
idev->mc_dad_count = idev->mc_qrv;
if (idev->mc_dad_count) {
mld_send_initial_cr(idev);
idev->mc_dad_count--;
if (idev->mc_dad_count)
- mld_dad_start_timer(idev,
- unsolicited_report_interval(idev));
+ mld_dad_start_work(idev,
+ unsolicited_report_interval(idev));
}
+ mutex_unlock(&idev->mc_lock);
}
-static void mld_dad_timer_expire(struct timer_list *t)
+static void mld_dad_work(struct work_struct *work)
{
- struct inet6_dev *idev = from_timer(idev, t, mc_dad_timer);
-
+ struct inet6_dev *idev = container_of(to_delayed_work(work),
+ struct inet6_dev,
+ mc_dad_work);
+ mutex_lock(&idev->mc_lock);
mld_send_initial_cr(idev);
if (idev->mc_dad_count) {
idev->mc_dad_count--;
if (idev->mc_dad_count)
- mld_dad_start_timer(idev,
- unsolicited_report_interval(idev));
+ mld_dad_start_work(idev,
+ unsolicited_report_interval(idev));
}
+ mutex_unlock(&idev->mc_lock);
in6_dev_put(idev);
}
static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
- const struct in6_addr *psfsrc)
+ const struct in6_addr *psfsrc)
{
struct ip6_sf_list *psf, *psf_prev;
int rv = 0;
+ mc_assert_locked(pmc->idev);
+
psf_prev = NULL;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (ipv6_addr_equal(&psf->sf_addr, psfsrc))
break;
psf_prev = psf;
@@ -2142,23 +2320,28 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
/* source filter not found, or count wrong => bug */
return -ESRCH;
}
- psf->sf_count[sfmode]--;
+ WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] - 1);
if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
struct inet6_dev *idev = pmc->idev;
/* no more filters for this source */
if (psf_prev)
- psf_prev->sf_next = psf->sf_next;
+ rcu_assign_pointer(psf_prev->sf_next,
+ mc_dereference(psf->sf_next, idev));
else
- pmc->mca_sources = psf->sf_next;
+ rcu_assign_pointer(pmc->mca_sources,
+ mc_dereference(psf->sf_next, idev));
+
if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) &&
!mld_in_v1_mode(idev)) {
psf->sf_crcount = idev->mc_qrv;
- psf->sf_next = pmc->mca_tomb;
- pmc->mca_tomb = psf;
+ rcu_assign_pointer(psf->sf_next,
+ mc_dereference(pmc->mca_tomb, idev));
+ rcu_assign_pointer(pmc->mca_tomb, psf);
rv = 1;
- } else
- kfree(psf);
+ } else {
+ kfree_rcu(psf, rcu);
+ }
}
return rv;
}
@@ -2173,24 +2356,21 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca,
if (!idev)
return -ENODEV;
- read_lock_bh(&idev->lock);
- for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
+
+ mc_assert_locked(idev);
+
+ for_each_mc_mclock(idev, pmc) {
if (ipv6_addr_equal(pmca, &pmc->mca_addr))
break;
}
- if (!pmc) {
- /* MCA not found?? bug */
- read_unlock_bh(&idev->lock);
+ if (!pmc)
return -ESRCH;
- }
- spin_lock_bh(&pmc->mca_lock);
+
sf_markstate(pmc);
if (!delta) {
- if (!pmc->mca_sfcount[sfmode]) {
- spin_unlock_bh(&pmc->mca_lock);
- read_unlock_bh(&idev->lock);
+ if (!pmc->mca_sfcount[sfmode])
return -EINVAL;
- }
+
pmc->mca_sfcount[sfmode]--;
}
err = 0;
@@ -2210,68 +2390,75 @@ static int ip6_mc_del_src(struct inet6_dev *idev, const struct in6_addr *pmca,
pmc->mca_sfmode = MCAST_INCLUDE;
pmc->mca_crcount = idev->mc_qrv;
idev->mc_ifc_count = pmc->mca_crcount;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next)
+ for_each_psf_mclock(pmc, psf)
psf->sf_crcount = 0;
mld_ifc_event(pmc->idev);
- } else if (sf_setstate(pmc) || changerec)
+ } else if (sf_setstate(pmc) || changerec) {
mld_ifc_event(pmc->idev);
- spin_unlock_bh(&pmc->mca_lock);
- read_unlock_bh(&idev->lock);
+ }
+
return err;
}
-/*
- * Add multicast single-source filter to the interface list
- */
+/* Add multicast single-source filter to the interface list */
static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode,
- const struct in6_addr *psfsrc)
+ const struct in6_addr *psfsrc)
{
struct ip6_sf_list *psf, *psf_prev;
+ mc_assert_locked(pmc->idev);
+
psf_prev = NULL;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (ipv6_addr_equal(&psf->sf_addr, psfsrc))
break;
psf_prev = psf;
}
if (!psf) {
- psf = kzalloc(sizeof(*psf), GFP_ATOMIC);
+ psf = kzalloc(sizeof(*psf), GFP_KERNEL);
if (!psf)
return -ENOBUFS;
psf->sf_addr = *psfsrc;
if (psf_prev) {
- psf_prev->sf_next = psf;
- } else
- pmc->mca_sources = psf;
+ rcu_assign_pointer(psf_prev->sf_next, psf);
+ } else {
+ rcu_assign_pointer(pmc->mca_sources, psf);
+ }
}
- psf->sf_count[sfmode]++;
+ WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] + 1);
return 0;
}
static void sf_markstate(struct ifmcaddr6 *pmc)
{
- struct ip6_sf_list *psf;
int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE];
+ struct ip6_sf_list *psf;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next)
+ mc_assert_locked(pmc->idev);
+
+ for_each_psf_mclock(pmc, psf) {
if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
psf->sf_oldin = mca_xcount ==
psf->sf_count[MCAST_EXCLUDE] &&
!psf->sf_count[MCAST_INCLUDE];
- } else
+ } else {
psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0;
+ }
+ }
}
static int sf_setstate(struct ifmcaddr6 *pmc)
{
- struct ip6_sf_list *psf, *dpsf;
int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE];
+ struct ip6_sf_list *psf, *dpsf;
int qrv = pmc->idev->mc_qrv;
int new_in, rv;
+ mc_assert_locked(pmc->idev);
+
rv = 0;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next) {
+ for_each_psf_mclock(pmc, psf) {
if (pmc->mca_sfcount[MCAST_EXCLUDE]) {
new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
!psf->sf_count[MCAST_INCLUDE];
@@ -2281,8 +2468,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc)
if (!psf->sf_oldin) {
struct ip6_sf_list *prev = NULL;
- for (dpsf = pmc->mca_tomb; dpsf;
- dpsf = dpsf->sf_next) {
+ for_each_psf_tomb(pmc, dpsf) {
if (ipv6_addr_equal(&dpsf->sf_addr,
&psf->sf_addr))
break;
@@ -2290,10 +2476,14 @@ static int sf_setstate(struct ifmcaddr6 *pmc)
}
if (dpsf) {
if (prev)
- prev->sf_next = dpsf->sf_next;
+ rcu_assign_pointer(prev->sf_next,
+ mc_dereference(dpsf->sf_next,
+ pmc->idev));
else
- pmc->mca_tomb = dpsf->sf_next;
- kfree(dpsf);
+ rcu_assign_pointer(pmc->mca_tomb,
+ mc_dereference(dpsf->sf_next,
+ pmc->idev));
+ kfree_rcu(dpsf, rcu);
}
psf->sf_crcount = qrv;
rv++;
@@ -2304,18 +2494,19 @@ static int sf_setstate(struct ifmcaddr6 *pmc)
* add or update "delete" records if an active filter
* is now inactive
*/
- for (dpsf = pmc->mca_tomb; dpsf; dpsf = dpsf->sf_next)
+
+ for_each_psf_tomb(pmc, dpsf)
if (ipv6_addr_equal(&dpsf->sf_addr,
&psf->sf_addr))
break;
if (!dpsf) {
- dpsf = kmalloc(sizeof(*dpsf), GFP_ATOMIC);
+ dpsf = kmalloc(sizeof(*dpsf), GFP_KERNEL);
if (!dpsf)
continue;
*dpsf = *psf;
- /* pmc->mca_lock held by callers */
- dpsf->sf_next = pmc->mca_tomb;
- pmc->mca_tomb = dpsf;
+ rcu_assign_pointer(dpsf->sf_next,
+ mc_dereference(pmc->mca_tomb, pmc->idev));
+ rcu_assign_pointer(pmc->mca_tomb, dpsf);
}
dpsf->sf_crcount = qrv;
rv++;
@@ -2324,9 +2515,7 @@ static int sf_setstate(struct ifmcaddr6 *pmc)
return rv;
}
-/*
- * Add multicast source filter list to the interface list
- */
+/* Add multicast source filter list to the interface list */
static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
int sfmode, int sfcount, const struct in6_addr *psfsrc,
int delta)
@@ -2337,22 +2526,21 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
if (!idev)
return -ENODEV;
- read_lock_bh(&idev->lock);
- for (pmc = idev->mc_list; pmc; pmc = pmc->next) {
+
+ mc_assert_locked(idev);
+
+ for_each_mc_mclock(idev, pmc) {
if (ipv6_addr_equal(pmca, &pmc->mca_addr))
break;
}
- if (!pmc) {
- /* MCA not found?? bug */
- read_unlock_bh(&idev->lock);
+ if (!pmc)
return -ESRCH;
- }
- spin_lock_bh(&pmc->mca_lock);
sf_markstate(pmc);
isexclude = pmc->mca_sfmode == MCAST_EXCLUDE;
if (!delta)
- pmc->mca_sfcount[sfmode]++;
+ WRITE_ONCE(pmc->mca_sfcount[sfmode],
+ pmc->mca_sfcount[sfmode] + 1);
err = 0;
for (i = 0; i < sfcount; i++) {
err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]);
@@ -2363,7 +2551,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
int j;
if (!delta)
- pmc->mca_sfcount[sfmode]--;
+ WRITE_ONCE(pmc->mca_sfcount[sfmode],
+ pmc->mca_sfcount[sfmode] - 1);
for (j = 0; j < i; j++)
ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]);
} else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) {
@@ -2378,13 +2567,12 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
pmc->mca_crcount = idev->mc_qrv;
idev->mc_ifc_count = pmc->mca_crcount;
- for (psf = pmc->mca_sources; psf; psf = psf->sf_next)
+ for_each_psf_mclock(pmc, psf)
psf->sf_crcount = 0;
mld_ifc_event(idev);
- } else if (sf_setstate(pmc))
+ } else if (sf_setstate(pmc)) {
mld_ifc_event(idev);
- spin_unlock_bh(&pmc->mca_lock);
- read_unlock_bh(&idev->lock);
+ }
return err;
}
@@ -2392,120 +2580,153 @@ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc)
{
struct ip6_sf_list *psf, *nextpsf;
- for (psf = pmc->mca_tomb; psf; psf = nextpsf) {
- nextpsf = psf->sf_next;
- kfree(psf);
+ mc_assert_locked(pmc->idev);
+
+ for (psf = mc_dereference(pmc->mca_tomb, pmc->idev);
+ psf;
+ psf = nextpsf) {
+ nextpsf = mc_dereference(psf->sf_next, pmc->idev);
+ kfree_rcu(psf, rcu);
}
- pmc->mca_tomb = NULL;
- for (psf = pmc->mca_sources; psf; psf = nextpsf) {
- nextpsf = psf->sf_next;
- kfree(psf);
+ RCU_INIT_POINTER(pmc->mca_tomb, NULL);
+ for (psf = mc_dereference(pmc->mca_sources, pmc->idev);
+ psf;
+ psf = nextpsf) {
+ nextpsf = mc_dereference(psf->sf_next, pmc->idev);
+ kfree_rcu(psf, rcu);
}
- pmc->mca_sources = NULL;
+ RCU_INIT_POINTER(pmc->mca_sources, NULL);
pmc->mca_sfmode = MCAST_EXCLUDE;
pmc->mca_sfcount[MCAST_INCLUDE] = 0;
- pmc->mca_sfcount[MCAST_EXCLUDE] = 1;
+ /* Paired with the READ_ONCE() from ipv6_chk_mcast_addr() */
+ WRITE_ONCE(pmc->mca_sfcount[MCAST_EXCLUDE], 1);
}
-
static void igmp6_join_group(struct ifmcaddr6 *ma)
{
unsigned long delay;
+ mc_assert_locked(ma->idev);
+
if (ma->mca_flags & MAF_NOREPORT)
return;
igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
- delay = prandom_u32() % unsolicited_report_interval(ma->idev);
+ delay = get_random_u32_below(unsolicited_report_interval(ma->idev));
- spin_lock_bh(&ma->mca_lock);
- if (del_timer(&ma->mca_timer)) {
+ if (cancel_delayed_work(&ma->mca_work)) {
refcount_dec(&ma->mca_refcnt);
- delay = ma->mca_timer.expires - jiffies;
+ delay = ma->mca_work.timer.expires - jiffies;
}
- if (!mod_timer(&ma->mca_timer, jiffies + delay))
+ if (!mod_delayed_work(mld_wq, &ma->mca_work, delay))
refcount_inc(&ma->mca_refcnt);
ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER;
- spin_unlock_bh(&ma->mca_lock);
}
static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
struct inet6_dev *idev)
{
+ struct ip6_sf_socklist *psl;
int err;
- /* callers have the socket lock and rtnl lock
- * so no other readers or writers of iml or its sflist
- */
- if (!iml->sflist) {
+ psl = sock_dereference(iml->sflist, sk);
+
+ if (idev)
+ mutex_lock(&idev->mc_lock);
+
+ if (!psl) {
/* any-source empty exclude case */
- return ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0);
+ err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0);
+ } else {
+ err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode,
+ psl->sl_count, psl->sl_addr, 0);
+ RCU_INIT_POINTER(iml->sflist, NULL);
+ atomic_sub(struct_size(psl, sl_addr, psl->sl_max),
+ &sk->sk_omem_alloc);
+ kfree_rcu(psl, rcu);
}
- err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode,
- iml->sflist->sl_count, iml->sflist->sl_addr, 0);
- sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max));
- iml->sflist = NULL;
+
+ if (idev)
+ mutex_unlock(&idev->mc_lock);
+
return err;
}
static void igmp6_leave_group(struct ifmcaddr6 *ma)
{
+ mc_assert_locked(ma->idev);
+
if (mld_in_v1_mode(ma->idev)) {
- if (ma->mca_flags & MAF_LAST_REPORTER)
+ if (ma->mca_flags & MAF_LAST_REPORTER) {
igmp6_send(&ma->mca_addr, ma->idev->dev,
ICMPV6_MGM_REDUCTION);
+ }
} else {
mld_add_delrec(ma->idev, ma);
mld_ifc_event(ma->idev);
}
}
-static void mld_gq_timer_expire(struct timer_list *t)
+static void mld_gq_work(struct work_struct *work)
{
- struct inet6_dev *idev = from_timer(idev, t, mc_gq_timer);
+ struct inet6_dev *idev = container_of(to_delayed_work(work),
+ struct inet6_dev,
+ mc_gq_work);
- idev->mc_gq_running = 0;
+ mutex_lock(&idev->mc_lock);
mld_send_report(idev, NULL);
+ idev->mc_gq_running = 0;
+ mutex_unlock(&idev->mc_lock);
+
in6_dev_put(idev);
}
-static void mld_ifc_timer_expire(struct timer_list *t)
+static void mld_ifc_work(struct work_struct *work)
{
- struct inet6_dev *idev = from_timer(idev, t, mc_ifc_timer);
+ struct inet6_dev *idev = container_of(to_delayed_work(work),
+ struct inet6_dev,
+ mc_ifc_work);
+ mutex_lock(&idev->mc_lock);
mld_send_cr(idev);
+
if (idev->mc_ifc_count) {
idev->mc_ifc_count--;
if (idev->mc_ifc_count)
- mld_ifc_start_timer(idev,
- unsolicited_report_interval(idev));
+ mld_ifc_start_work(idev,
+ unsolicited_report_interval(idev));
}
+ mutex_unlock(&idev->mc_lock);
in6_dev_put(idev);
}
static void mld_ifc_event(struct inet6_dev *idev)
{
+ mc_assert_locked(idev);
+
if (mld_in_v1_mode(idev))
return;
+
idev->mc_ifc_count = idev->mc_qrv;
- mld_ifc_start_timer(idev, 1);
+ mld_ifc_start_work(idev, 1);
}
-static void igmp6_timer_handler(struct timer_list *t)
+static void mld_mca_work(struct work_struct *work)
{
- struct ifmcaddr6 *ma = from_timer(ma, t, mca_timer);
+ struct ifmcaddr6 *ma = container_of(to_delayed_work(work),
+ struct ifmcaddr6, mca_work);
+ mutex_lock(&ma->idev->mc_lock);
if (mld_in_v1_mode(ma->idev))
igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT);
else
mld_send_report(ma->idev, ma);
-
- spin_lock(&ma->mca_lock);
ma->mca_flags |= MAF_LAST_REPORTER;
ma->mca_flags &= ~MAF_TIMER_RUNNING;
- spin_unlock(&ma->mca_lock);
+ mutex_unlock(&ma->idev->mc_lock);
+
ma_put(ma);
}
@@ -2517,10 +2738,10 @@ void ipv6_mc_unmap(struct inet6_dev *idev)
/* Install multicast list, except for all-nodes (already installed) */
- read_lock_bh(&idev->lock);
- for (i = idev->mc_list; i; i = i->next)
+ mutex_lock(&idev->mc_lock);
+ for_each_mc_mclock(idev, i)
igmp6_group_dropped(i);
- read_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
}
void ipv6_mc_remap(struct inet6_dev *idev)
@@ -2529,25 +2750,28 @@ void ipv6_mc_remap(struct inet6_dev *idev)
}
/* Device going down */
-
void ipv6_mc_down(struct inet6_dev *idev)
{
struct ifmcaddr6 *i;
+ mutex_lock(&idev->mc_lock);
/* Withdraw multicast list */
-
- read_lock_bh(&idev->lock);
-
- for (i = idev->mc_list; i; i = i->next)
+ for_each_mc_mclock(idev, i)
igmp6_group_dropped(i);
+ mutex_unlock(&idev->mc_lock);
- /* Should stop timer after group drop. or we will
- * start timer again in mld_ifc_event()
+ /* Should stop work after group drop. or we will
+ * start work again in mld_ifc_event()
*/
- mld_ifc_stop_timer(idev);
- mld_gq_stop_timer(idev);
- mld_dad_stop_timer(idev);
- read_unlock_bh(&idev->lock);
+ mld_query_stop_work(idev);
+ mld_report_stop_work(idev);
+
+ mutex_lock(&idev->mc_lock);
+ mld_ifc_stop_work(idev);
+ mld_gq_stop_work(idev);
+ mutex_unlock(&idev->mc_lock);
+
+ mld_dad_stop_work(idev);
}
static void ipv6_mc_reset(struct inet6_dev *idev)
@@ -2567,29 +2791,33 @@ void ipv6_mc_up(struct inet6_dev *idev)
/* Install multicast list, except for all-nodes (already installed) */
- read_lock_bh(&idev->lock);
ipv6_mc_reset(idev);
- for (i = idev->mc_list; i; i = i->next) {
+ mutex_lock(&idev->mc_lock);
+ for_each_mc_mclock(idev, i) {
mld_del_delrec(idev, i);
igmp6_group_added(i);
}
- read_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
}
/* IPv6 device initialization. */
void ipv6_mc_init_dev(struct inet6_dev *idev)
{
- write_lock_bh(&idev->lock);
- spin_lock_init(&idev->mc_lock);
idev->mc_gq_running = 0;
- timer_setup(&idev->mc_gq_timer, mld_gq_timer_expire, 0);
- idev->mc_tomb = NULL;
+ INIT_DELAYED_WORK(&idev->mc_gq_work, mld_gq_work);
+ RCU_INIT_POINTER(idev->mc_tomb, NULL);
idev->mc_ifc_count = 0;
- timer_setup(&idev->mc_ifc_timer, mld_ifc_timer_expire, 0);
- timer_setup(&idev->mc_dad_timer, mld_dad_timer_expire, 0);
+ INIT_DELAYED_WORK(&idev->mc_ifc_work, mld_ifc_work);
+ INIT_DELAYED_WORK(&idev->mc_dad_work, mld_dad_work);
+ INIT_DELAYED_WORK(&idev->mc_query_work, mld_query_work);
+ INIT_DELAYED_WORK(&idev->mc_report_work, mld_report_work);
+ skb_queue_head_init(&idev->mc_query_queue);
+ skb_queue_head_init(&idev->mc_report_queue);
+ spin_lock_init(&idev->mc_query_lock);
+ spin_lock_init(&idev->mc_report_lock);
+ mutex_init(&idev->mc_lock);
ipv6_mc_reset(idev);
- write_unlock_bh(&idev->lock);
}
/*
@@ -2600,9 +2828,13 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev)
{
struct ifmcaddr6 *i;
- /* Deactivate timers */
+ /* Deactivate works */
ipv6_mc_down(idev);
+ mutex_lock(&idev->mc_lock);
mld_clear_delrec(idev);
+ mutex_unlock(&idev->mc_lock);
+ mld_clear_query(idev);
+ mld_clear_report(idev);
/* Delete all-nodes address. */
/* We cannot call ipv6_dev_mc_dec() directly, our caller in
@@ -2614,30 +2846,28 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev)
if (idev->cnf.forwarding)
__ipv6_dev_mc_dec(idev, &in6addr_linklocal_allrouters);
- write_lock_bh(&idev->lock);
- while ((i = idev->mc_list) != NULL) {
- idev->mc_list = i->next;
+ mutex_lock(&idev->mc_lock);
+ while ((i = mc_dereference(idev->mc_list, idev))) {
+ rcu_assign_pointer(idev->mc_list, mc_dereference(i->next, idev));
- write_unlock_bh(&idev->lock);
+ ip6_mc_clear_src(i);
ma_put(i);
- write_lock_bh(&idev->lock);
}
- write_unlock_bh(&idev->lock);
+ mutex_unlock(&idev->mc_lock);
}
static void ipv6_mc_rejoin_groups(struct inet6_dev *idev)
{
struct ifmcaddr6 *pmc;
- ASSERT_RTNL();
-
+ mutex_lock(&idev->mc_lock);
if (mld_in_v1_mode(idev)) {
- read_lock_bh(&idev->lock);
- for (pmc = idev->mc_list; pmc; pmc = pmc->next)
+ for_each_mc_mclock(idev, pmc)
igmp6_join_group(pmc);
- read_unlock_bh(&idev->lock);
- } else
+ } else {
mld_send_report(idev, NULL);
+ }
+ mutex_unlock(&idev->mc_lock);
}
static int ipv6_mc_netdev_event(struct notifier_block *this,
@@ -2684,13 +2914,12 @@ static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq)
idev = __in6_dev_get(state->dev);
if (!idev)
continue;
- read_lock_bh(&idev->lock);
- im = idev->mc_list;
+
+ im = rcu_dereference(idev->mc_list);
if (im) {
state->idev = idev;
break;
}
- read_unlock_bh(&idev->lock);
}
return im;
}
@@ -2699,11 +2928,8 @@ static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr
{
struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
- im = im->next;
+ im = rcu_dereference(im->next);
while (!im) {
- if (likely(state->idev))
- read_unlock_bh(&state->idev->lock);
-
state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->idev = NULL;
@@ -2712,8 +2938,7 @@ static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr
state->idev = __in6_dev_get(state->dev);
if (!state->idev)
continue;
- read_lock_bh(&state->idev->lock);
- im = state->idev->mc_list;
+ im = rcu_dereference(state->idev->mc_list);
}
return im;
}
@@ -2747,10 +2972,8 @@ static void igmp6_mc_seq_stop(struct seq_file *seq, void *v)
{
struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq);
- if (likely(state->idev)) {
- read_unlock_bh(&state->idev->lock);
+ if (likely(state->idev))
state->idev = NULL;
- }
state->dev = NULL;
rcu_read_unlock();
}
@@ -2765,8 +2988,8 @@ static int igmp6_mc_seq_show(struct seq_file *seq, void *v)
state->dev->ifindex, state->dev->name,
&im->mca_addr,
im->mca_users, im->mca_flags,
- (im->mca_flags&MAF_TIMER_RUNNING) ?
- jiffies_to_clock_t(im->mca_timer.expires-jiffies) : 0);
+ (im->mca_flags & MAF_TIMER_RUNNING) ?
+ jiffies_to_clock_t(im->mca_work.timer.expires - jiffies) : 0);
return 0;
}
@@ -2800,19 +3023,16 @@ static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq)
idev = __in6_dev_get(state->dev);
if (unlikely(idev == NULL))
continue;
- read_lock_bh(&idev->lock);
- im = idev->mc_list;
+
+ im = rcu_dereference(idev->mc_list);
if (likely(im)) {
- spin_lock_bh(&im->mca_lock);
- psf = im->mca_sources;
+ psf = rcu_dereference(im->mca_sources);
if (likely(psf)) {
state->im = im;
state->idev = idev;
break;
}
- spin_unlock_bh(&im->mca_lock);
}
- read_unlock_bh(&idev->lock);
}
return psf;
}
@@ -2821,14 +3041,10 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s
{
struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
- psf = psf->sf_next;
+ psf = rcu_dereference(psf->sf_next);
while (!psf) {
- spin_unlock_bh(&state->im->mca_lock);
- state->im = state->im->next;
+ state->im = rcu_dereference(state->im->next);
while (!state->im) {
- if (likely(state->idev))
- read_unlock_bh(&state->idev->lock);
-
state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->idev = NULL;
@@ -2837,13 +3053,9 @@ static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_s
state->idev = __in6_dev_get(state->dev);
if (!state->idev)
continue;
- read_lock_bh(&state->idev->lock);
- state->im = state->idev->mc_list;
+ state->im = rcu_dereference(state->idev->mc_list);
}
- if (!state->im)
- break;
- spin_lock_bh(&state->im->mca_lock);
- psf = state->im->mca_sources;
+ psf = rcu_dereference(state->im->mca_sources);
}
out:
return psf;
@@ -2880,14 +3092,12 @@ static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v)
__releases(RCU)
{
struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq);
- if (likely(state->im)) {
- spin_unlock_bh(&state->im->mca_lock);
+
+ if (likely(state->im))
state->im = NULL;
- }
- if (likely(state->idev)) {
- read_unlock_bh(&state->idev->lock);
+ if (likely(state->idev))
state->idev = NULL;
- }
+
state->dev = NULL;
rcu_read_unlock();
}
@@ -2905,8 +3115,8 @@ static int igmp6_mcf_seq_show(struct seq_file *seq, void *v)
state->dev->ifindex, state->dev->name,
&state->im->mca_addr,
&psf->sf_addr,
- psf->sf_count[MCAST_INCLUDE],
- psf->sf_count[MCAST_EXCLUDE]);
+ READ_ONCE(psf->sf_count[MCAST_INCLUDE]),
+ READ_ONCE(psf->sf_count[MCAST_EXCLUDE]));
}
return 0;
}
@@ -2968,6 +3178,7 @@ static int __net_init igmp6_net_init(struct net *net)
}
inet6_sk(net->ipv6.igmp_sk)->hop_limit = 1;
+ net->ipv6.igmp_sk->sk_allocation = GFP_KERNEL;
err = inet_ctl_sock_create(&net->ipv6.mc_autojoin_sk, PF_INET6,
SOCK_RAW, IPPROTO_ICMPV6, net);
@@ -3005,7 +3216,19 @@ static struct pernet_operations igmp6_net_ops = {
int __init igmp6_init(void)
{
- return register_pernet_subsys(&igmp6_net_ops);
+ int err;
+
+ err = register_pernet_subsys(&igmp6_net_ops);
+ if (err)
+ return err;
+
+ mld_wq = create_workqueue("mld");
+ if (!mld_wq) {
+ unregister_pernet_subsys(&igmp6_net_ops);
+ return -ENOMEM;
+ }
+
+ return err;
}
int __init igmp6_late_init(void)
@@ -3016,6 +3239,7 @@ int __init igmp6_late_init(void)
void igmp6_cleanup(void)
{
unregister_pernet_subsys(&igmp6_net_ops);
+ destroy_workqueue(mld_wq);
}
void igmp6_late_cleanup(void)
diff --git a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c
index 9405b04eecc6..04d5fcdfa6e0 100644
--- a/net/ipv6/mcast_snoop.c
+++ b/net/ipv6/mcast_snoop.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2010: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
* Copyright (C) 2015: Linus Lüssing <linus.luessing@c0d3.blue>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- *
- *
* Based on the MLD support added to br_multicast.c by YOSHIFUJI Hideaki.
*/
@@ -41,6 +29,8 @@ static int ipv6_mc_check_ip6hdr(struct sk_buff *skb)
if (skb->len < len || len <= offset)
return -EINVAL;
+ skb_set_transport_header(skb, offset);
+
return 0;
}
@@ -77,27 +67,27 @@ static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb)
len += sizeof(struct mld2_report);
- return pskb_may_pull(skb, len) ? 0 : -EINVAL;
+ return ipv6_mc_may_pull(skb, len) ? 0 : -EINVAL;
}
static int ipv6_mc_check_mld_query(struct sk_buff *skb)
{
+ unsigned int transport_len = ipv6_transport_len(skb);
struct mld_msg *mld;
- unsigned int len = skb_transport_offset(skb);
+ unsigned int len;
/* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */
if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL))
return -EINVAL;
- len += sizeof(struct mld_msg);
- if (skb->len < len)
- return -EINVAL;
-
/* MLDv1? */
- if (skb->len != len) {
+ if (transport_len != sizeof(struct mld_msg)) {
/* or MLDv2? */
- len += sizeof(struct mld2_query) - sizeof(struct mld_msg);
- if (skb->len < len || !pskb_may_pull(skb, len))
+ if (transport_len < sizeof(struct mld2_query))
+ return -EINVAL;
+
+ len = skb_transport_offset(skb) + sizeof(struct mld2_query);
+ if (!ipv6_mc_may_pull(skb, len))
return -EINVAL;
}
@@ -115,19 +105,24 @@ static int ipv6_mc_check_mld_query(struct sk_buff *skb)
static int ipv6_mc_check_mld_msg(struct sk_buff *skb)
{
- struct mld_msg *mld = (struct mld_msg *)skb_transport_header(skb);
+ unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg);
+ struct mld_msg *mld;
+
+ if (!ipv6_mc_may_pull(skb, len))
+ return -ENODATA;
+
+ mld = (struct mld_msg *)skb_transport_header(skb);
switch (mld->mld_type) {
case ICMPV6_MGM_REDUCTION:
case ICMPV6_MGM_REPORT:
- /* fall through */
return 0;
case ICMPV6_MLD2_REPORT:
return ipv6_mc_check_mld_reportv2(skb);
case ICMPV6_MGM_QUERY:
return ipv6_mc_check_mld_query(skb);
default:
- return -ENOMSG;
+ return -ENODATA;
}
}
@@ -136,70 +131,45 @@ static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb)
return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo);
}
-static int __ipv6_mc_check_mld(struct sk_buff *skb,
- struct sk_buff **skb_trimmed)
-
+static int ipv6_mc_check_icmpv6(struct sk_buff *skb)
{
- struct sk_buff *skb_chk = NULL;
- unsigned int transport_len;
- unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg);
- int ret = -EINVAL;
+ unsigned int len = skb_transport_offset(skb) + sizeof(struct icmp6hdr);
+ unsigned int transport_len = ipv6_transport_len(skb);
+ struct sk_buff *skb_chk;
- transport_len = ntohs(ipv6_hdr(skb)->payload_len);
- transport_len -= skb_transport_offset(skb) - sizeof(struct ipv6hdr);
+ if (!ipv6_mc_may_pull(skb, len))
+ return -EINVAL;
skb_chk = skb_checksum_trimmed(skb, transport_len,
ipv6_mc_validate_checksum);
if (!skb_chk)
- goto err;
-
- if (!pskb_may_pull(skb_chk, len))
- goto err;
-
- ret = ipv6_mc_check_mld_msg(skb_chk);
- if (ret)
- goto err;
-
- if (skb_trimmed)
- *skb_trimmed = skb_chk;
- /* free now unneeded clone */
- else if (skb_chk != skb)
- kfree_skb(skb_chk);
-
- ret = 0;
+ return -EINVAL;
-err:
- if (ret && skb_chk && skb_chk != skb)
+ if (skb_chk != skb)
kfree_skb(skb_chk);
- return ret;
+ return 0;
}
/**
* ipv6_mc_check_mld - checks whether this is a sane MLD packet
* @skb: the skb to validate
- * @skb_trimmed: to store an skb pointer trimmed to IPv6 packet tail (optional)
*
* Checks whether an IPv6 packet is a valid MLD packet. If so sets
* skb transport header accordingly and returns zero.
*
* -EINVAL: A broken packet was detected, i.e. it violates some internet
* standard
- * -ENOMSG: IP header validation succeeded but it is not an MLD packet.
+ * -ENOMSG: IP header validation succeeded but it is not an ICMPv6 packet
+ * with a hop-by-hop option.
+ * -ENODATA: IP+ICMPv6 header with hop-by-hop option validation succeeded
+ * but it is not an MLD packet.
* -ENOMEM: A memory allocation failure happened.
*
- * Optionally, an skb pointer might be provided via skb_trimmed (or set it
- * to NULL): After parsing an MLD packet successfully it will point to
- * an skb which has its tail aligned to the IP packet end. This might
- * either be the originally provided skb or a trimmed, cloned version if
- * the skb frame had data beyond the IP packet. A cloned skb allows us
- * to leave the original skb and its full frame unchanged (which might be
- * desirable for layer 2 frame jugglers).
- *
* Caller needs to set the skb network header and free any returned skb if it
* differs from the provided skb.
*/
-int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed)
+int ipv6_mc_check_mld(struct sk_buff *skb)
{
int ret;
@@ -211,6 +181,10 @@ int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed)
if (ret < 0)
return ret;
- return __ipv6_mc_check_mld(skb, skb_trimmed);
+ ret = ipv6_mc_check_icmpv6(skb);
+ if (ret < 0)
+ return ret;
+
+ return ipv6_mc_check_mld_msg(skb);
}
EXPORT_SYMBOL(ipv6_mc_check_mld);
diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c
index 64f0f7be9e5e..6a16a5bd0d91 100644
--- a/net/ipv6/mip6.c
+++ b/net/ipv6/mip6.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C)2003-2006 Helsinki University of Technology
* Copyright (C)2003-2006 USAGI/WIDE Project
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
/*
* Authors:
@@ -259,63 +247,14 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb,
return err;
}
-static int mip6_destopt_offset(struct xfrm_state *x, struct sk_buff *skb,
- u8 **nexthdr)
-{
- u16 offset = sizeof(struct ipv6hdr);
- struct ipv6_opt_hdr *exthdr =
- (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
- const unsigned char *nh = skb_network_header(skb);
- unsigned int packet_len = skb_tail_pointer(skb) -
- skb_network_header(skb);
- int found_rhdr = 0;
-
- *nexthdr = &ipv6_hdr(skb)->nexthdr;
-
- while (offset + 1 <= packet_len) {
-
- switch (**nexthdr) {
- case NEXTHDR_HOP:
- break;
- case NEXTHDR_ROUTING:
- found_rhdr = 1;
- break;
- case NEXTHDR_DEST:
- /*
- * HAO MUST NOT appear more than once.
- * XXX: It is better to try to find by the end of
- * XXX: packet if HAO exists.
- */
- if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0) {
- net_dbg_ratelimited("mip6: hao exists already, override\n");
- return offset;
- }
-
- if (found_rhdr)
- return offset;
-
- break;
- default:
- return offset;
- }
-
- offset += ipv6_optlen(exthdr);
- *nexthdr = &exthdr->nexthdr;
- exthdr = (struct ipv6_opt_hdr *)(nh + offset);
- }
-
- return offset;
-}
-
-static int mip6_destopt_init_state(struct xfrm_state *x)
+static int mip6_destopt_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
if (x->id.spi) {
- pr_info("%s: spi is not 0: %u\n", __func__, x->id.spi);
+ NL_SET_ERR_MSG(extack, "SPI must be 0");
return -EINVAL;
}
if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
- pr_info("%s: state's mode is not %u: %u\n",
- __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode);
+ NL_SET_ERR_MSG(extack, "XFRM mode must be XFRM_MODE_ROUTEOPTIMIZATION");
return -EINVAL;
}
@@ -336,7 +275,6 @@ static void mip6_destopt_destroy(struct xfrm_state *x)
}
static const struct xfrm_type mip6_destopt_type = {
- .description = "MIP6DESTOPT",
.owner = THIS_MODULE,
.proto = IPPROTO_DSTOPTS,
.flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_LOCAL_COADDR,
@@ -345,7 +283,6 @@ static const struct xfrm_type mip6_destopt_type = {
.input = mip6_destopt_input,
.output = mip6_destopt_output,
.reject = mip6_destopt_reject,
- .hdr_offset = mip6_destopt_offset,
};
static int mip6_rthdr_input(struct xfrm_state *x, struct sk_buff *skb)
@@ -395,62 +332,14 @@ static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb)
return 0;
}
-static int mip6_rthdr_offset(struct xfrm_state *x, struct sk_buff *skb,
- u8 **nexthdr)
-{
- u16 offset = sizeof(struct ipv6hdr);
- struct ipv6_opt_hdr *exthdr =
- (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
- const unsigned char *nh = skb_network_header(skb);
- unsigned int packet_len = skb_tail_pointer(skb) -
- skb_network_header(skb);
- int found_rhdr = 0;
-
- *nexthdr = &ipv6_hdr(skb)->nexthdr;
-
- while (offset + 1 <= packet_len) {
-
- switch (**nexthdr) {
- case NEXTHDR_HOP:
- break;
- case NEXTHDR_ROUTING:
- if (offset + 3 <= packet_len) {
- struct ipv6_rt_hdr *rt;
- rt = (struct ipv6_rt_hdr *)(nh + offset);
- if (rt->type != 0)
- return offset;
- }
- found_rhdr = 1;
- break;
- case NEXTHDR_DEST:
- if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
- return offset;
-
- if (found_rhdr)
- return offset;
-
- break;
- default:
- return offset;
- }
-
- offset += ipv6_optlen(exthdr);
- *nexthdr = &exthdr->nexthdr;
- exthdr = (struct ipv6_opt_hdr *)(nh + offset);
- }
-
- return offset;
-}
-
-static int mip6_rthdr_init_state(struct xfrm_state *x)
+static int mip6_rthdr_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
if (x->id.spi) {
- pr_info("%s: spi is not 0: %u\n", __func__, x->id.spi);
+ NL_SET_ERR_MSG(extack, "SPI must be 0");
return -EINVAL;
}
if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) {
- pr_info("%s: state's mode is not %u: %u\n",
- __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode);
+ NL_SET_ERR_MSG(extack, "XFRM mode must be XFRM_MODE_ROUTEOPTIMIZATION");
return -EINVAL;
}
@@ -468,7 +357,6 @@ static void mip6_rthdr_destroy(struct xfrm_state *x)
}
static const struct xfrm_type mip6_rthdr_type = {
- .description = "MIP6RT",
.owner = THIS_MODULE,
.proto = IPPROTO_ROUTING,
.flags = XFRM_TYPE_NON_FRAGMENT | XFRM_TYPE_REMOTE_COADDR,
@@ -476,7 +364,6 @@ static const struct xfrm_type mip6_rthdr_type = {
.destructor = mip6_rthdr_destroy,
.input = mip6_rthdr_input,
.output = mip6_rthdr_output,
- .hdr_offset = mip6_rthdr_offset,
};
static int __init mip6_init(void)
@@ -511,15 +398,14 @@ static void __exit mip6_fini(void)
{
if (rawv6_mh_filter_unregister(mip6_mh_filter) < 0)
pr_info("%s: can't remove rawv6 mh filter\n", __func__);
- if (xfrm_unregister_type(&mip6_rthdr_type, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type(rthdr)\n", __func__);
- if (xfrm_unregister_type(&mip6_destopt_type, AF_INET6) < 0)
- pr_info("%s: can't remove xfrm type(destopt)\n", __func__);
+ xfrm_unregister_type(&mip6_rthdr_type, AF_INET6);
+ xfrm_unregister_type(&mip6_destopt_type, AF_INET6);
}
module_init(mip6_init);
module_exit(mip6_fini);
+MODULE_DESCRIPTION("IPv6 Mobility driver");
MODULE_LICENSE("GPL");
MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_DSTOPTS);
MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ROUTING);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 0ec273997d1d..59d17b6f06bf 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Neighbour Discovery for IPv6
* Linux INET6 implementation
@@ -5,11 +6,6 @@
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
* Mike Shaver <shaver@ingenia.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/*
@@ -77,12 +73,15 @@ static u32 ndisc_hash(const void *pkey,
const struct net_device *dev,
__u32 *hash_rnd);
static bool ndisc_key_eq(const struct neighbour *neigh, const void *pkey);
+static bool ndisc_allow_add(const struct net_device *dev,
+ struct netlink_ext_ack *extack);
static int ndisc_constructor(struct neighbour *neigh);
static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb);
static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb);
static int pndisc_constructor(struct pneigh_entry *n);
static void pndisc_destructor(struct pneigh_entry *n);
static void pndisc_redo(struct sk_buff *skb);
+static int ndisc_is_multicast(const void *pkey);
static const struct neigh_ops ndisc_generic_ops = {
.family = AF_INET6,
@@ -117,6 +116,8 @@ struct neigh_table nd_tbl = {
.pconstructor = pndisc_constructor,
.pdestructor = pndisc_destructor,
.proxy_redo = pndisc_redo,
+ .is_multicast = ndisc_is_multicast,
+ .allow_add = ndisc_allow_add,
.id = "ndisc_cache",
.parms = {
.tbl = &nd_tbl,
@@ -127,8 +128,9 @@ struct neigh_table nd_tbl = {
[NEIGH_VAR_RETRANS_TIME] = ND_RETRANS_TIMER,
[NEIGH_VAR_BASE_REACHABLE_TIME] = ND_REACHABLE_TIME,
[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
+ [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
- [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT,
[NEIGH_VAR_PROXY_QLEN] = 64,
[NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
[NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
@@ -141,7 +143,7 @@ struct neigh_table nd_tbl = {
};
EXPORT_SYMBOL_GPL(nd_tbl);
-void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data,
+void __ndisc_fill_addr_option(struct sk_buff *skb, int type, const void *data,
int data_len, int pad)
{
int space = __ndisc_opt_addr_space(data_len, pad);
@@ -164,7 +166,7 @@ void __ndisc_fill_addr_option(struct sk_buff *skb, int type, void *data,
EXPORT_SYMBOL_GPL(__ndisc_fill_addr_option);
static inline void ndisc_fill_addr_option(struct sk_buff *skb, int type,
- void *data, u8 icmp6_type)
+ const void *data, u8 icmp6_type)
{
__ndisc_fill_addr_option(skb, type, data, skb->dev->addr_len,
ndisc_addr_option_pad(skb->dev->type));
@@ -195,9 +197,12 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
static inline int ndisc_is_useropt(const struct net_device *dev,
struct nd_opt_hdr *opt)
{
- return opt->nd_opt_type == ND_OPT_RDNSS ||
+ return opt->nd_opt_type == ND_OPT_PREFIX_INFO ||
+ opt->nd_opt_type == ND_OPT_RDNSS ||
opt->nd_opt_type == ND_OPT_DNSSL ||
- ndisc_ops_is_useropt(dev, opt->nd_opt_type);
+ opt->nd_opt_type == ND_OPT_6CO ||
+ opt->nd_opt_type == ND_OPT_CAPTIVE_PORTAL ||
+ opt->nd_opt_type == ND_OPT_PREF64;
}
static struct nd_opt_hdr *ndisc_next_useropt(const struct net_device *dev,
@@ -222,6 +227,7 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
return NULL;
memset(ndopts, 0, sizeof(*ndopts));
while (opt_len) {
+ bool unknown = false;
int l;
if (opt_len < sizeof(struct nd_opt_hdr))
return NULL;
@@ -237,9 +243,8 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
case ND_OPT_NONCE:
case ND_OPT_REDIRECT_HDR:
if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
- ND_PRINTK(2, warn,
- "%s: duplicated ND6 option found: type=%d\n",
- __func__, nd_opt->nd_opt_type);
+ net_dbg_ratelimited("%s: duplicated ND6 option found: type=%d\n",
+ __func__, nd_opt->nd_opt_type);
} else {
ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt;
}
@@ -257,22 +262,20 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev,
break;
#endif
default:
- if (ndisc_is_useropt(dev, nd_opt)) {
- ndopts->nd_useropts_end = nd_opt;
- if (!ndopts->nd_useropts)
- ndopts->nd_useropts = nd_opt;
- } else {
- /*
- * Unknown options must be silently ignored,
- * to accommodate future extension to the
- * protocol.
- */
- ND_PRINTK(2, notice,
- "%s: ignored unsupported option; type=%d, len=%d\n",
- __func__,
- nd_opt->nd_opt_type,
- nd_opt->nd_opt_len);
- }
+ unknown = true;
+ }
+ if (ndisc_is_useropt(dev, nd_opt)) {
+ ndopts->nd_useropts_end = nd_opt;
+ if (!ndopts->nd_useropts)
+ ndopts->nd_useropts = nd_opt;
+ } else if (unknown) {
+ /*
+ * Unknown options must be silently ignored,
+ * to accommodate future extension to the
+ * protocol.
+ */
+ net_dbg_ratelimited("%s: ignored unsupported option; type=%d, len=%d\n",
+ __func__, nd_opt->nd_opt_type, nd_opt->nd_opt_len);
}
next_opt:
opt_len -= l;
@@ -370,42 +373,53 @@ static int ndisc_constructor(struct neighbour *neigh)
static int pndisc_constructor(struct pneigh_entry *n)
{
struct in6_addr *addr = (struct in6_addr *)&n->key;
- struct in6_addr maddr;
struct net_device *dev = n->dev;
+ struct in6_addr maddr;
- if (!dev || !__in6_dev_get(dev))
+ if (!dev)
return -EINVAL;
+
addrconf_addr_solict_mult(addr, &maddr);
- ipv6_dev_mc_inc(dev, &maddr);
- return 0;
+ return ipv6_dev_mc_inc(dev, &maddr);
}
static void pndisc_destructor(struct pneigh_entry *n)
{
struct in6_addr *addr = (struct in6_addr *)&n->key;
- struct in6_addr maddr;
struct net_device *dev = n->dev;
+ struct in6_addr maddr;
- if (!dev || !__in6_dev_get(dev))
+ if (!dev)
return;
+
addrconf_addr_solict_mult(addr, &maddr);
ipv6_dev_mc_dec(dev, &maddr);
}
+/* called with rtnl held */
+static bool ndisc_allow_add(const struct net_device *dev,
+ struct netlink_ext_ack *extack)
+{
+ struct inet6_dev *idev = __in6_dev_get(dev);
+
+ if (!idev || idev->cnf.disable_ipv6) {
+ NL_SET_ERR_MSG(extack, "IPv6 is disabled on this device");
+ return false;
+ }
+
+ return true;
+}
+
static struct sk_buff *ndisc_alloc_skb(struct net_device *dev,
int len)
{
int hlen = LL_RESERVED_SPACE(dev);
int tlen = dev->needed_tailroom;
- struct sock *sk = dev_net(dev)->ipv6.ndisc_sk;
struct sk_buff *skb;
skb = alloc_skb(hlen + sizeof(struct ipv6hdr) + len + tlen, GFP_ATOMIC);
- if (!skb) {
- ND_PRINTK(0, err, "ndisc: %s failed to allocate an skb\n",
- __func__);
+ if (!skb)
return NULL;
- }
skb->protocol = htons(ETH_P_IPV6);
skb->dev = dev;
@@ -416,7 +430,9 @@ static struct sk_buff *ndisc_alloc_skb(struct net_device *dev,
/* Manually assign socket ownership as we avoid calling
* sock_alloc_send_pskb() to bypass wmem buffer limits
*/
- skb_set_owner_w(skb, sk);
+ rcu_read_lock();
+ skb_set_owner_w(skb, dev_net_rcu(dev)->ipv6.ndisc_sk);
+ rcu_read_unlock();
return skb;
}
@@ -432,7 +448,7 @@ static void ip6_nd_hdr(struct sk_buff *skb,
rcu_read_lock();
idev = __in6_dev_get(skb->dev);
- tclass = idev ? idev->cnf.ndisc_tclass : 0;
+ tclass = idev ? READ_ONCE(idev->cnf.ndisc_tclass) : 0;
rcu_read_unlock();
skb_push(skb, sizeof(*hdr));
@@ -449,20 +465,24 @@ static void ip6_nd_hdr(struct sk_buff *skb,
hdr->daddr = *daddr;
}
-static void ndisc_send_skb(struct sk_buff *skb,
- const struct in6_addr *daddr,
- const struct in6_addr *saddr)
+void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
+ struct icmp6hdr *icmp6h = icmp6_hdr(skb);
struct dst_entry *dst = skb_dst(skb);
- struct net *net = dev_net(skb->dev);
- struct sock *sk = net->ipv6.ndisc_sk;
+ struct net_device *dev;
struct inet6_dev *idev;
+ struct net *net;
+ struct sock *sk;
int err;
- struct icmp6hdr *icmp6h = icmp6_hdr(skb);
u8 type;
type = icmp6h->icmp6_type;
+ rcu_read_lock();
+
+ net = dev_net_rcu(skb->dev);
+ sk = net->ipv6.ndisc_sk;
if (!dst) {
struct flowi6 fl6;
int oif = skb->dev->ifindex;
@@ -470,6 +490,7 @@ static void ndisc_send_skb(struct sk_buff *skb,
icmpv6_flow_init(sk, &fl6, type, saddr, daddr, oif);
dst = icmp6_dst_alloc(skb->dev, &fl6);
if (IS_ERR(dst)) {
+ rcu_read_unlock();
kfree_skb(skb);
return;
}
@@ -482,14 +503,14 @@ static void ndisc_send_skb(struct sk_buff *skb,
csum_partial(icmp6h,
skb->len, 0));
- ip6_nd_hdr(skb, saddr, daddr, inet6_sk(sk)->hop_limit, skb->len);
+ ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len);
- rcu_read_lock();
- idev = __in6_dev_get(dst->dev);
- IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
+ dev = dst_dev_rcu(dst);
+ idev = __in6_dev_get(dev);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
- net, sk, skb, NULL, dst->dev,
+ net, sk, skb, NULL, dev,
dst_output);
if (!err) {
ICMP6MSGOUT_INC_STATS(net, idev, type);
@@ -498,6 +519,7 @@ static void ndisc_send_skb(struct sk_buff *skb,
rcu_read_unlock();
}
+EXPORT_SYMBOL(ndisc_send_skb);
void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
const struct in6_addr *solicited_addr,
@@ -516,7 +538,7 @@ void ndisc_send_na(struct net_device *dev, const struct in6_addr *daddr,
src_addr = solicited_addr;
if (ifp->flags & IFA_F_OPTIMISTIC)
override = false;
- inc_opt |= ifp->idev->cnf.force_tllao;
+ inc_opt |= READ_ONCE(ifp->idev->cnf.force_tllao);
in6_ifa_put(ifp);
} else {
if (ipv6_dev_get_saddr(dev_net(dev), dev, daddr,
@@ -581,22 +603,16 @@ static void ndisc_send_unsol_na(struct net_device *dev)
in6_dev_put(idev);
}
-void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
- const struct in6_addr *daddr, const struct in6_addr *saddr,
- u64 nonce)
+struct sk_buff *ndisc_ns_create(struct net_device *dev, const struct in6_addr *solicit,
+ const struct in6_addr *saddr, u64 nonce)
{
- struct sk_buff *skb;
- struct in6_addr addr_buf;
int inc_opt = dev->addr_len;
- int optlen = 0;
+ struct sk_buff *skb;
struct nd_msg *msg;
+ int optlen = 0;
- if (!saddr) {
- if (ipv6_get_lladdr(dev, &addr_buf,
- (IFA_F_TENTATIVE|IFA_F_OPTIMISTIC)))
- return;
- saddr = &addr_buf;
- }
+ if (!saddr)
+ return NULL;
if (ipv6_addr_any(saddr))
inc_opt = false;
@@ -608,7 +624,7 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
skb = ndisc_alloc_skb(dev, sizeof(*msg) + optlen);
if (!skb)
- return;
+ return NULL;
msg = skb_put(skb, sizeof(*msg));
*msg = (struct nd_msg) {
@@ -630,7 +646,28 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
memcpy(opt + 2, &nonce, 6);
}
- ndisc_send_skb(skb, daddr, saddr);
+ return skb;
+}
+EXPORT_SYMBOL(ndisc_ns_create);
+
+void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
+ const struct in6_addr *daddr, const struct in6_addr *saddr,
+ u64 nonce)
+{
+ struct in6_addr addr_buf;
+ struct sk_buff *skb;
+
+ if (!saddr) {
+ if (ipv6_get_lladdr(dev, &addr_buf,
+ (IFA_F_TENTATIVE | IFA_F_OPTIMISTIC)))
+ return;
+ saddr = &addr_buf;
+ }
+
+ skb = ndisc_ns_create(dev, solicit, saddr, nonce);
+
+ if (skb)
+ ndisc_send_skb(skb, daddr, saddr);
}
void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr,
@@ -712,10 +749,9 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
saddr = &ipv6_hdr(skb)->saddr;
probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES);
if (probes < 0) {
- if (!(neigh->nud_state & NUD_VALID)) {
- ND_PRINTK(1, dbg,
- "%s: trying to ucast probe in NUD_INVALID: %pI6\n",
- __func__, target);
+ if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) {
+ net_dbg_ratelimited("%s: trying to ucast probe in NUD_INVALID: %pI6\n",
+ __func__, target);
}
ndisc_send_ns(dev, target, target, saddr, 0);
} else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) {
@@ -732,11 +768,9 @@ static int pndisc_is_router(const void *pkey,
struct pneigh_entry *n;
int ret = -1;
- read_lock_bh(&nd_tbl.lock);
- n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev);
+ n = pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev);
if (n)
- ret = !!(n->flags & NTF_ROUTER);
- read_unlock_bh(&nd_tbl.lock);
+ ret = !!(READ_ONCE(n->flags) & NTF_ROUTER);
return ret;
}
@@ -750,7 +784,7 @@ void ndisc_update(const struct net_device *dev, struct neighbour *neigh,
ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts);
}
-static void ndisc_recv_ns(struct sk_buff *skb)
+static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb)
{
struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
@@ -764,18 +798,17 @@ static void ndisc_recv_ns(struct sk_buff *skb)
struct inet6_dev *idev = NULL;
struct neighbour *neigh;
int dad = ipv6_addr_any(saddr);
- bool inc;
int is_router = -1;
+ SKB_DR(reason);
u64 nonce = 0;
+ bool inc;
- if (skb->len < sizeof(struct nd_msg)) {
- ND_PRINTK(2, warn, "NS: packet too short\n");
- return;
- }
+ if (skb->len < sizeof(struct nd_msg))
+ return SKB_DROP_REASON_PKT_TOO_SMALL;
if (ipv6_addr_is_multicast(&msg->target)) {
- ND_PRINTK(2, warn, "NS: multicast target address\n");
- return;
+ net_dbg_ratelimited("NS: multicast target address\n");
+ return reason;
}
/*
@@ -783,21 +816,18 @@ static void ndisc_recv_ns(struct sk_buff *skb)
* DAD has to be destined for solicited node multicast address.
*/
if (dad && !ipv6_addr_is_solict_mult(daddr)) {
- ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n");
- return;
+ net_dbg_ratelimited("NS: bad DAD packet (wrong destination)\n");
+ return reason;
}
- if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) {
- ND_PRINTK(2, warn, "NS: invalid ND options\n");
- return;
- }
+ if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts))
+ return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;
if (ndopts.nd_opts_src_lladdr) {
lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev);
if (!lladdr) {
- ND_PRINTK(2, warn,
- "NS: invalid link-layer address length\n");
- return;
+ net_dbg_ratelimited("NS: invalid link-layer address length\n");
+ return reason;
}
/* RFC2461 7.1.1:
@@ -806,9 +836,8 @@ static void ndisc_recv_ns(struct sk_buff *skb)
* in the message.
*/
if (dad) {
- ND_PRINTK(2, warn,
- "NS: bad DAD packet (link-layer address option)\n");
- return;
+ net_dbg_ratelimited("NS: bad DAD packet (link-layer address option)\n");
+ return reason;
}
}
if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1)
@@ -824,10 +853,8 @@ have_ifp:
if (nonce != 0 && ifp->dad_nonce == nonce) {
u8 *np = (u8 *)&nonce;
/* Matching nonce if looped back */
- ND_PRINTK(2, notice,
- "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n",
- ifp->idev->dev->name,
- &ifp->addr, np);
+ net_dbg_ratelimited("%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n",
+ ifp->idev->dev->name, &ifp->addr, np);
goto out;
}
/*
@@ -836,7 +863,7 @@ have_ifp:
* so fail our DAD process
*/
addrconf_dad_failure(skb, ifp);
- return;
+ return reason;
} else {
/*
* This is not a dad solicitation.
@@ -868,12 +895,13 @@ have_ifp:
idev = in6_dev_get(dev);
if (!idev) {
/* XXX: count this drop? */
- return;
+ return reason;
}
if (ipv6_chk_acast_addr(net, dev, &msg->target) ||
- (idev->cnf.forwarding &&
- (net->ipv6.devconf_all->proxy_ndp || idev->cnf.proxy_ndp) &&
+ (READ_ONCE(idev->cnf.forwarding) &&
+ (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) ||
+ READ_ONCE(idev->cnf.proxy_ndp)) &&
(is_router = pndisc_is_router(&msg->target, dev)) >= 0)) {
if (!(NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED) &&
skb->pkt_type != PACKET_HOST &&
@@ -891,12 +919,14 @@ have_ifp:
pneigh_enqueue(&nd_tbl, idev->nd_parms, n);
goto out;
}
- } else
+ } else {
+ SKB_DR_SET(reason, IPV6_NDISC_NS_OTHERHOST);
goto out;
+ }
}
if (is_router < 0)
- is_router = idev->cnf.forwarding;
+ is_router = READ_ONCE(idev->cnf.forwarding);
if (dad) {
ndisc_send_na(dev, &in6addr_linklocal_allnodes, &msg->target,
@@ -925,6 +955,7 @@ have_ifp:
true, (ifp != NULL && inc), inc);
if (neigh)
neigh_release(neigh);
+ reason = SKB_CONSUMED;
}
out:
@@ -932,9 +963,29 @@ out:
in6_ifa_put(ifp);
else
in6_dev_put(idev);
+ return reason;
+}
+
+static int accept_untracked_na(struct net_device *dev, struct in6_addr *saddr)
+{
+ struct inet6_dev *idev = __in6_dev_get(dev);
+
+ switch (READ_ONCE(idev->cnf.accept_untracked_na)) {
+ case 0: /* Don't accept untracked na (absent in neighbor cache) */
+ return 0;
+ case 1: /* Create new entries from na if currently untracked */
+ return 1;
+ case 2: /* Create new entries from untracked na only if saddr is in the
+ * same subnet as an address configured on the interface that
+ * received the na
+ */
+ return !!ipv6_chk_prefix(saddr, dev);
+ default:
+ return 0;
+ }
}
-static void ndisc_recv_na(struct sk_buff *skb)
+static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb)
{
struct nd_msg *msg = (struct nd_msg *)skb_transport_header(skb);
struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
@@ -947,41 +998,40 @@ static void ndisc_recv_na(struct sk_buff *skb)
struct inet6_dev *idev = __in6_dev_get(dev);
struct inet6_ifaddr *ifp;
struct neighbour *neigh;
+ SKB_DR(reason);
+ u8 new_state;
- if (skb->len < sizeof(struct nd_msg)) {
- ND_PRINTK(2, warn, "NA: packet too short\n");
- return;
- }
+ if (skb->len < sizeof(struct nd_msg))
+ return SKB_DROP_REASON_PKT_TOO_SMALL;
if (ipv6_addr_is_multicast(&msg->target)) {
- ND_PRINTK(2, warn, "NA: target address is multicast\n");
- return;
+ net_dbg_ratelimited("NA: target address is multicast\n");
+ return reason;
}
if (ipv6_addr_is_multicast(daddr) &&
msg->icmph.icmp6_solicited) {
- ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n");
- return;
+ net_dbg_ratelimited("NA: solicited NA is multicasted\n");
+ return reason;
}
/* For some 802.11 wireless deployments (and possibly other networks),
* there will be a NA proxy and unsolicitd packets are attacks
* and thus should not be accepted.
+ * drop_unsolicited_na takes precedence over accept_untracked_na
*/
if (!msg->icmph.icmp6_solicited && idev &&
- idev->cnf.drop_unsolicited_na)
- return;
+ READ_ONCE(idev->cnf.drop_unsolicited_na))
+ return reason;
+
+ if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts))
+ return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;
- if (!ndisc_parse_options(dev, msg->opt, ndoptlen, &ndopts)) {
- ND_PRINTK(2, warn, "NS: invalid ND option\n");
- return;
- }
if (ndopts.nd_opts_tgt_lladdr) {
lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev);
if (!lladdr) {
- ND_PRINTK(2, warn,
- "NA: invalid link-layer address length\n");
- return;
+ net_dbg_ratelimited("NA: invalid link-layer address length\n");
+ return reason;
}
}
ifp = ipv6_get_ifaddr(dev_net(dev), &msg->target, dev, 1);
@@ -989,7 +1039,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
if (skb->pkt_type != PACKET_LOOPBACK
&& (ifp->flags & IFA_F_TENTATIVE)) {
addrconf_dad_failure(skb, ifp);
- return;
+ return reason;
}
/* What should we make now? The advertisement
is invalid, but ndisc specs say nothing
@@ -1001,19 +1051,43 @@ static void ndisc_recv_na(struct sk_buff *skb)
unsolicited advertisement.
*/
if (skb->pkt_type != PACKET_LOOPBACK)
- ND_PRINTK(1, warn,
- "NA: %pM advertised our address %pI6c on %s!\n",
- eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name);
+ net_warn_ratelimited("NA: %pM advertised our address %pI6c on %s!\n",
+ eth_hdr(skb)->h_source, &ifp->addr,
+ ifp->idev->dev->name);
in6_ifa_put(ifp);
- return;
+ return reason;
}
+
neigh = neigh_lookup(&nd_tbl, &msg->target, dev);
- if (neigh) {
+ /* RFC 9131 updates original Neighbour Discovery RFC 4861.
+ * NAs with Target LL Address option without a corresponding
+ * entry in the neighbour cache can now create a STALE neighbour
+ * cache entry on routers.
+ *
+ * entry accept fwding solicited behaviour
+ * ------- ------ ------ --------- ----------------------
+ * present X X 0 Set state to STALE
+ * present X X 1 Set state to REACHABLE
+ * absent 0 X X Do nothing
+ * absent 1 0 X Do nothing
+ * absent 1 1 X Add a new STALE entry
+ *
+ * Note that we don't do a (daddr == all-routers-mcast) check.
+ */
+ new_state = msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE;
+ if (!neigh && lladdr && idev && READ_ONCE(idev->cnf.forwarding)) {
+ if (accept_untracked_na(dev, saddr)) {
+ neigh = neigh_create(&nd_tbl, &msg->target, dev);
+ new_state = NUD_STALE;
+ }
+ }
+
+ if (neigh && !IS_ERR(neigh)) {
u8 old_flags = neigh->flags;
struct net *net = dev_net(dev);
- if (neigh->nud_state & NUD_FAILED)
+ if (READ_ONCE(neigh->nud_state) & NUD_FAILED)
goto out;
/*
@@ -1022,14 +1096,15 @@ static void ndisc_recv_na(struct sk_buff *skb)
* has already sent a NA to us.
*/
if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) &&
- net->ipv6.devconf_all->forwarding && net->ipv6.devconf_all->proxy_ndp &&
- pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) {
+ READ_ONCE(net->ipv6.devconf_all->forwarding) &&
+ READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
+ pneigh_lookup(&nd_tbl, net, &msg->target, dev)) {
/* XXX: idev->cnf.proxy_ndp */
goto out;
}
ndisc_update(dev, neigh, lladdr,
- msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE,
+ new_state,
NEIGH_UPDATE_F_WEAK_OVERRIDE|
(msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)|
NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
@@ -1042,13 +1117,14 @@ static void ndisc_recv_na(struct sk_buff *skb)
*/
rt6_clean_tohost(dev_net(dev), saddr);
}
-
+ reason = SKB_CONSUMED;
out:
neigh_release(neigh);
}
+ return reason;
}
-static void ndisc_recv_rs(struct sk_buff *skb)
+static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb)
{
struct rs_msg *rs_msg = (struct rs_msg *)skb_transport_header(skb);
unsigned long ndoptlen = skb->len - sizeof(*rs_msg);
@@ -1057,18 +1133,19 @@ static void ndisc_recv_rs(struct sk_buff *skb)
const struct in6_addr *saddr = &ipv6_hdr(skb)->saddr;
struct ndisc_options ndopts;
u8 *lladdr = NULL;
+ SKB_DR(reason);
if (skb->len < sizeof(*rs_msg))
- return;
+ return SKB_DROP_REASON_PKT_TOO_SMALL;
idev = __in6_dev_get(skb->dev);
if (!idev) {
- ND_PRINTK(1, err, "RS: can't find in6 device\n");
- return;
+ net_err_ratelimited("RS: can't find in6 device\n");
+ return reason;
}
/* Don't accept RS if we're not in router mode */
- if (!idev->cnf.forwarding)
+ if (!READ_ONCE(idev->cnf.forwarding))
goto out;
/*
@@ -1079,10 +1156,8 @@ static void ndisc_recv_rs(struct sk_buff *skb)
goto out;
/* Parse ND options */
- if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts)) {
- ND_PRINTK(2, notice, "NS: invalid ND option, ignored\n");
- goto out;
- }
+ if (!ndisc_parse_options(skb->dev, rs_msg->opt, ndoptlen, &ndopts))
+ return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;
if (ndopts.nd_opts_src_lladdr) {
lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
@@ -1099,9 +1174,10 @@ static void ndisc_recv_rs(struct sk_buff *skb)
NEIGH_UPDATE_F_OVERRIDE_ISROUTER,
NDISC_ROUTER_SOLICITATION, &ndopts);
neigh_release(neigh);
+ reason = SKB_CONSUMED;
}
out:
- return;
+ return reason;
}
static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt)
@@ -1150,73 +1226,63 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err);
}
-static void ndisc_router_discovery(struct sk_buff *skb)
+static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb)
{
struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb);
+ bool send_ifinfo_notify = false;
struct neighbour *neigh = NULL;
- struct inet6_dev *in6_dev;
+ struct ndisc_options ndopts;
struct fib6_info *rt = NULL;
+ struct inet6_dev *in6_dev;
+ struct fib6_table *table;
+ u32 defrtr_usr_metric;
+ unsigned int pref = 0;
+ __u32 old_if_flags;
struct net *net;
+ SKB_DR(reason);
int lifetime;
- struct ndisc_options ndopts;
int optlen;
- unsigned int pref = 0;
- __u32 old_if_flags;
- bool send_ifinfo_notify = false;
__u8 *opt = (__u8 *)(ra_msg + 1);
optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) -
sizeof(struct ra_msg);
- ND_PRINTK(2, info,
- "RA: %s, dev: %s\n",
- __func__, skb->dev->name);
+ net_dbg_ratelimited("RA: %s, dev: %s\n", __func__, skb->dev->name);
if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
- ND_PRINTK(2, warn, "RA: source address is not link-local\n");
- return;
- }
- if (optlen < 0) {
- ND_PRINTK(2, warn, "RA: packet too short\n");
- return;
+ net_dbg_ratelimited("RA: source address is not link-local\n");
+ return reason;
}
+ if (optlen < 0)
+ return SKB_DROP_REASON_PKT_TOO_SMALL;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) {
- ND_PRINTK(2, warn, "RA: from host or unauthorized router\n");
- return;
+ net_dbg_ratelimited("RA: from host or unauthorized router\n");
+ return reason;
}
#endif
- /*
- * set the RA_RECV flag in the interface
- */
-
in6_dev = __in6_dev_get(skb->dev);
if (!in6_dev) {
- ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n",
- skb->dev->name);
- return;
+ net_err_ratelimited("RA: can't find inet6 device for %s\n", skb->dev->name);
+ return reason;
}
- if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) {
- ND_PRINTK(2, warn, "RA: invalid ND options\n");
- return;
- }
+ if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts))
+ return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;
if (!ipv6_accept_ra(in6_dev)) {
- ND_PRINTK(2, info,
- "RA: %s, did not accept ra for dev: %s\n",
- __func__, skb->dev->name);
+ net_dbg_ratelimited("RA: %s, did not accept ra for dev: %s\n", __func__,
+ skb->dev->name);
goto skip_linkparms;
}
#ifdef CONFIG_IPV6_NDISC_NODETYPE
/* skip link-specific parameters from interior routers */
if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
- ND_PRINTK(2, info,
- "RA: %s, nodetype is NODEFAULT, dev: %s\n",
- __func__, skb->dev->name);
+ net_dbg_ratelimited("RA: %s, nodetype is NODEFAULT, dev: %s\n", __func__,
+ skb->dev->name);
goto skip_linkparms;
}
#endif
@@ -1244,10 +1310,17 @@ static void ndisc_router_discovery(struct sk_buff *skb)
if (old_if_flags != in6_dev->if_flags)
send_ifinfo_notify = true;
- if (!in6_dev->cnf.accept_ra_defrtr) {
- ND_PRINTK(2, info,
- "RA: %s, defrtr is false for dev: %s\n",
- __func__, skb->dev->name);
+ if (!READ_ONCE(in6_dev->cnf.accept_ra_defrtr)) {
+ net_dbg_ratelimited("RA: %s, defrtr is false for dev: %s\n", __func__,
+ skb->dev->name);
+ goto skip_defrtr;
+ }
+
+ lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime);
+ if (lifetime != 0 &&
+ lifetime < READ_ONCE(in6_dev->cnf.accept_ra_min_lft)) {
+ net_dbg_ratelimited("RA: router lifetime (%ds) is too short: %s\n", lifetime,
+ skb->dev->name);
goto skip_defrtr;
}
@@ -1255,82 +1328,94 @@ static void ndisc_router_discovery(struct sk_buff *skb)
* accept_ra_from_local is set to true.
*/
net = dev_net(in6_dev->dev);
- if (!in6_dev->cnf.accept_ra_from_local &&
+ if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) &&
ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) {
- ND_PRINTK(2, info,
- "RA from local address detected on dev: %s: default router ignored\n",
- skb->dev->name);
+ net_dbg_ratelimited("RA from local address detected on dev: %s: default router ignored\n",
+ skb->dev->name);
goto skip_defrtr;
}
- lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime);
-
#ifdef CONFIG_IPV6_ROUTER_PREF
pref = ra_msg->icmph.icmp6_router_pref;
/* 10b is handled as if it were 00b (medium) */
if (pref == ICMPV6_ROUTER_PREF_INVALID ||
- !in6_dev->cnf.accept_ra_rtr_pref)
+ !READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref))
pref = ICMPV6_ROUTER_PREF_MEDIUM;
#endif
-
+ /* routes added from RAs do not use nexthop objects */
rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
-
if (rt) {
- neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw,
- rt->fib6_nh.nh_dev, NULL,
+ neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
+ rt->fib6_nh->fib_nh_dev, NULL,
&ipv6_hdr(skb)->saddr);
if (!neigh) {
- ND_PRINTK(0, err,
- "RA: %s got default router without neighbour\n",
- __func__);
+ net_err_ratelimited("RA: %s got default router without neighbour\n",
+ __func__);
fib6_info_release(rt);
- return;
+ return reason;
}
}
- if (rt && lifetime == 0) {
- ip6_del_rt(net, rt);
+ /* Set default route metric as specified by user */
+ defrtr_usr_metric = in6_dev->cnf.ra_defrtr_metric;
+ /* delete the route if lifetime is 0 or if metric needs change */
+ if (rt && (lifetime == 0 || rt->fib6_metric != defrtr_usr_metric)) {
+ ip6_del_rt(net, rt, false);
rt = NULL;
}
- ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, for dev: %s\n",
- rt, lifetime, skb->dev->name);
+ net_dbg_ratelimited("RA: rt: %p lifetime: %d, metric: %d, for dev: %s\n", rt, lifetime,
+ defrtr_usr_metric, skb->dev->name);
if (!rt && lifetime) {
- ND_PRINTK(3, info, "RA: adding default router\n");
+ net_dbg_ratelimited("RA: adding default router\n");
+
+ if (neigh)
+ neigh_release(neigh);
rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr,
- skb->dev, pref);
+ skb->dev, pref, defrtr_usr_metric,
+ lifetime);
if (!rt) {
- ND_PRINTK(0, err,
- "RA: %s failed to add default route\n",
- __func__);
- return;
+ net_err_ratelimited("RA: %s failed to add default route\n", __func__);
+ return reason;
}
- neigh = ip6_neigh_lookup(&rt->fib6_nh.nh_gw,
- rt->fib6_nh.nh_dev, NULL,
+ neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
+ rt->fib6_nh->fib_nh_dev, NULL,
&ipv6_hdr(skb)->saddr);
if (!neigh) {
- ND_PRINTK(0, err,
- "RA: %s got default router without neighbour\n",
- __func__);
+ net_err_ratelimited("RA: %s got default router without neighbour\n",
+ __func__);
fib6_info_release(rt);
- return;
+ return reason;
}
neigh->flags |= NTF_ROUTER;
- } else if (rt) {
+ } else if (rt && IPV6_EXTRACT_PREF(rt->fib6_flags) != pref) {
+ struct nl_info nlinfo = {
+ .nl_net = net,
+ };
rt->fib6_flags = (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
+ inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE);
}
- if (rt)
+ if (rt) {
+ table = rt->fib6_table;
+ spin_lock_bh(&table->tb6_lock);
+
fib6_set_expires(rt, jiffies + (HZ * lifetime));
- if (in6_dev->cnf.accept_ra_min_hop_limit < 256 &&
+ fib6_add_gc_list(rt);
+
+ spin_unlock_bh(&table->tb6_lock);
+ }
+ if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) < 256 &&
ra_msg->icmph.icmp6_hop_limit) {
- if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) {
- in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit;
+ if (READ_ONCE(in6_dev->cnf.accept_ra_min_hop_limit) <=
+ ra_msg->icmph.icmp6_hop_limit) {
+ WRITE_ONCE(in6_dev->cnf.hop_limit,
+ ra_msg->icmph.icmp6_hop_limit);
fib6_metric_set(rt, RTAX_HOPLIMIT,
ra_msg->icmph.icmp6_hop_limit);
} else {
- ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n");
+ net_dbg_ratelimited("RA: Got route advertisement with lower hop_limit than minimum\n");
}
}
@@ -1345,8 +1430,8 @@ skip_defrtr:
if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) {
rtime = (rtime*HZ)/1000;
- if (rtime < HZ/10)
- rtime = HZ/10;
+ if (rtime < HZ/100)
+ rtime = HZ/100;
NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime);
in6_dev->tstamp = jiffies;
send_ifinfo_notify = true;
@@ -1364,19 +1449,13 @@ skip_defrtr:
BASE_REACHABLE_TIME, rtime);
NEIGH_VAR_SET(in6_dev->nd_parms,
GC_STALETIME, 3 * rtime);
- in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime);
+ neigh_set_reach_time(in6_dev->nd_parms);
in6_dev->tstamp = jiffies;
send_ifinfo_notify = true;
}
}
}
- /*
- * Send a notify if RA changed managed/otherconf flags or timer settings
- */
- if (send_ifinfo_notify)
- inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
-
skip_linkparms:
/*
@@ -1392,8 +1471,7 @@ skip_linkparms:
lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr,
skb->dev);
if (!lladdr) {
- ND_PRINTK(2, warn,
- "RA: invalid link-layer address length\n");
+ net_dbg_ratelimited("RA: invalid link-layer address length\n");
goto out;
}
}
@@ -1403,26 +1481,25 @@ skip_linkparms:
NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
NEIGH_UPDATE_F_ISROUTER,
NDISC_ROUTER_ADVERTISEMENT, &ndopts);
+ reason = SKB_CONSUMED;
}
if (!ipv6_accept_ra(in6_dev)) {
- ND_PRINTK(2, info,
- "RA: %s, accept_ra is false for dev: %s\n",
- __func__, skb->dev->name);
+ net_dbg_ratelimited("RA: %s, accept_ra is false for dev: %s\n", __func__,
+ skb->dev->name);
goto out;
}
#ifdef CONFIG_IPV6_ROUTE_INFO
- if (!in6_dev->cnf.accept_ra_from_local &&
+ if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) &&
ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
in6_dev->dev, 0)) {
- ND_PRINTK(2, info,
- "RA from local address detected on dev: %s: router info ignored.\n",
- skb->dev->name);
+ net_dbg_ratelimited("RA from local address detected on dev: %s: router info ignored.\n",
+ skb->dev->name);
goto skip_routeinfo;
}
- if (in6_dev->cnf.accept_ra_rtr_pref && ndopts.nd_opts_ri) {
+ if (READ_ONCE(in6_dev->cnf.accept_ra_rtr_pref) && ndopts.nd_opts_ri) {
struct nd_opt_hdr *p;
for (p = ndopts.nd_opts_ri;
p;
@@ -1434,11 +1511,14 @@ skip_linkparms:
continue;
#endif
if (ri->prefix_len == 0 &&
- !in6_dev->cnf.accept_ra_defrtr)
+ !READ_ONCE(in6_dev->cnf.accept_ra_defrtr))
continue;
- if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen)
+ if (ri->lifetime != 0 &&
+ ntohl(ri->lifetime) < READ_ONCE(in6_dev->cnf.accept_ra_min_lft))
continue;
- if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen)
+ if (ri->prefix_len < READ_ONCE(in6_dev->cnf.accept_ra_rt_info_min_plen))
+ continue;
+ if (ri->prefix_len > READ_ONCE(in6_dev->cnf.accept_ra_rt_info_max_plen))
continue;
rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3,
&ipv6_hdr(skb)->saddr);
@@ -1451,14 +1531,13 @@ skip_routeinfo:
#ifdef CONFIG_IPV6_NDISC_NODETYPE
/* skip link-specific ndopts from interior routers */
if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) {
- ND_PRINTK(2, info,
- "RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n",
- __func__, skb->dev->name);
+ net_dbg_ratelimited("RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n",
+ __func__, skb->dev->name);
goto out;
}
#endif
- if (in6_dev->cnf.accept_ra_pinfo && ndopts.nd_opts_pi) {
+ if (READ_ONCE(in6_dev->cnf.accept_ra_pinfo) && ndopts.nd_opts_pi) {
struct nd_opt_hdr *p;
for (p = ndopts.nd_opts_pi;
p;
@@ -1469,17 +1548,22 @@ skip_routeinfo:
}
}
- if (ndopts.nd_opts_mtu && in6_dev->cnf.accept_ra_mtu) {
+ if (ndopts.nd_opts_mtu && READ_ONCE(in6_dev->cnf.accept_ra_mtu)) {
__be32 n;
u32 mtu;
memcpy(&n, ((u8 *)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu));
mtu = ntohl(n);
+ if (in6_dev->ra_mtu != mtu) {
+ in6_dev->ra_mtu = mtu;
+ send_ifinfo_notify = true;
+ }
+
if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) {
- ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu);
- } else if (in6_dev->cnf.mtu6 != mtu) {
- in6_dev->cnf.mtu6 = mtu;
+ net_dbg_ratelimited("RA: invalid mtu: %d\n", mtu);
+ } else if (READ_ONCE(in6_dev->cnf.mtu6) != mtu) {
+ WRITE_ONCE(in6_dev->cnf.mtu6, mtu);
fib6_metric_set(rt, RTAX_MTU, mtu);
rt6_mtu_change(skb->dev, mtu);
}
@@ -1496,53 +1580,59 @@ skip_routeinfo:
}
if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) {
- ND_PRINTK(2, warn, "RA: invalid RA options\n");
+ net_dbg_ratelimited("RA: invalid RA options\n");
}
out:
+ /* Send a notify if RA changed managed/otherconf flags or
+ * timer settings or ra_mtu value
+ */
+ if (send_ifinfo_notify)
+ inet6_ifinfo_notify(RTM_NEWLINK, in6_dev);
+
fib6_info_release(rt);
if (neigh)
neigh_release(neigh);
+ return reason;
}
-static void ndisc_redirect_rcv(struct sk_buff *skb)
+static enum skb_drop_reason ndisc_redirect_rcv(struct sk_buff *skb)
{
- u8 *hdr;
- struct ndisc_options ndopts;
struct rd_msg *msg = (struct rd_msg *)skb_transport_header(skb);
u32 ndoptlen = skb_tail_pointer(skb) - (skb_transport_header(skb) +
offsetof(struct rd_msg, opt));
+ struct ndisc_options ndopts;
+ SKB_DR(reason);
+ u8 *hdr;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
switch (skb->ndisc_nodetype) {
case NDISC_NODETYPE_HOST:
case NDISC_NODETYPE_NODEFAULT:
- ND_PRINTK(2, warn,
- "Redirect: from host or unauthorized router\n");
- return;
+ net_dbg_ratelimited("Redirect: from host or unauthorized router\n");
+ return reason;
}
#endif
if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) {
- ND_PRINTK(2, warn,
- "Redirect: source address is not link-local\n");
- return;
+ net_dbg_ratelimited("Redirect: source address is not link-local\n");
+ return reason;
}
if (!ndisc_parse_options(skb->dev, msg->opt, ndoptlen, &ndopts))
- return;
+ return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS;
if (!ndopts.nd_opts_rh) {
ip6_redirect_no_header(skb, dev_net(skb->dev),
- skb->dev->ifindex, 0);
- return;
+ skb->dev->ifindex);
+ return reason;
}
hdr = (u8 *)ndopts.nd_opts_rh;
hdr += 8;
if (!pskb_pull(skb, hdr - skb_transport_header(skb)))
- return;
+ return SKB_DROP_REASON_PKT_TOO_SMALL;
- icmpv6_notify(skb, NDISC_REDIRECT, 0, 0);
+ return icmpv6_notify(skb, NDISC_REDIRECT, 0, 0);
}
static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb,
@@ -1563,7 +1653,7 @@ static void ndisc_fill_redirect_hdr_option(struct sk_buff *skb,
void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
{
struct net_device *dev = skb->dev;
- struct net *net = dev_net(dev);
+ struct net *net = dev_net_rcu(dev);
struct sock *sk = net->ipv6.ndisc_sk;
int optlen = 0;
struct inet_peer *peer;
@@ -1578,22 +1668,20 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
ops_data_buf[NDISC_OPS_REDIRECT_DATA_SPACE], *ops_data = NULL;
bool ret;
- if (netif_is_l3_master(skb->dev)) {
- dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
+ if (netif_is_l3_master(dev)) {
+ dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
if (!dev)
return;
}
if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) {
- ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n",
- dev->name);
+ net_dbg_ratelimited("Redirect: no link-local address on %s\n", dev->name);
return;
}
if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) &&
ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
- ND_PRINTK(2, warn,
- "Redirect: target address is not link-local unicast\n");
+ net_dbg_ratelimited("Redirect: target address is not link-local unicast\n");
return;
}
@@ -1609,25 +1697,23 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target)
if (IS_ERR(dst))
return;
- rt = (struct rt6_info *) dst;
+ rt = dst_rt6_info(dst);
if (rt->rt6i_flags & RTF_GATEWAY) {
- ND_PRINTK(2, warn,
- "Redirect: destination is not a neighbour\n");
+ net_dbg_ratelimited("Redirect: destination is not a neighbour\n");
goto release;
}
- peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr, 1);
+
+ peer = inet_getpeer_v6(net->ipv6.peers, &ipv6_hdr(skb)->saddr);
ret = inet_peer_xrlim_allow(peer, 1*HZ);
- if (peer)
- inet_putpeer(peer);
+
if (!ret)
goto release;
if (dev->addr_len) {
struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target);
if (!neigh) {
- ND_PRINTK(2, warn,
- "Redirect: no neigh for target address\n");
+ net_dbg_ratelimited("Redirect: no neigh for target address\n");
goto release;
}
@@ -1688,8 +1774,14 @@ release:
static void pndisc_redo(struct sk_buff *skb)
{
- ndisc_recv_ns(skb);
- kfree_skb(skb);
+ enum skb_drop_reason reason = ndisc_recv_ns(skb);
+
+ kfree_skb_reason(skb, reason);
+}
+
+static int ndisc_is_multicast(const void *pkey)
+{
+ return ipv6_addr_is_multicast((struct in6_addr *)pkey);
}
static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb)
@@ -1699,64 +1791,62 @@ static bool ndisc_suppress_frag_ndisc(struct sk_buff *skb)
if (!idev)
return true;
if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED &&
- idev->cnf.suppress_frag_ndisc) {
+ READ_ONCE(idev->cnf.suppress_frag_ndisc)) {
net_warn_ratelimited("Received fragmented ndisc packet. Carefully consider disabling suppress_frag_ndisc.\n");
return true;
}
return false;
}
-int ndisc_rcv(struct sk_buff *skb)
+enum skb_drop_reason ndisc_rcv(struct sk_buff *skb)
{
struct nd_msg *msg;
+ SKB_DR(reason);
if (ndisc_suppress_frag_ndisc(skb))
- return 0;
+ return SKB_DROP_REASON_IPV6_NDISC_FRAG;
if (skb_linearize(skb))
- return 0;
+ return SKB_DROP_REASON_NOMEM;
msg = (struct nd_msg *)skb_transport_header(skb);
__skb_push(skb, skb->data - skb_transport_header(skb));
if (ipv6_hdr(skb)->hop_limit != 255) {
- ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n",
- ipv6_hdr(skb)->hop_limit);
- return 0;
+ net_dbg_ratelimited("NDISC: invalid hop-limit: %d\n", ipv6_hdr(skb)->hop_limit);
+ return SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT;
}
if (msg->icmph.icmp6_code != 0) {
- ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n",
- msg->icmph.icmp6_code);
- return 0;
+ net_dbg_ratelimited("NDISC: invalid ICMPv6 code: %d\n", msg->icmph.icmp6_code);
+ return SKB_DROP_REASON_IPV6_NDISC_BAD_CODE;
}
- memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
-
switch (msg->icmph.icmp6_type) {
case NDISC_NEIGHBOUR_SOLICITATION:
- ndisc_recv_ns(skb);
+ memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
+ reason = ndisc_recv_ns(skb);
break;
case NDISC_NEIGHBOUR_ADVERTISEMENT:
- ndisc_recv_na(skb);
+ reason = ndisc_recv_na(skb);
break;
case NDISC_ROUTER_SOLICITATION:
- ndisc_recv_rs(skb);
+ reason = ndisc_recv_rs(skb);
break;
case NDISC_ROUTER_ADVERTISEMENT:
- ndisc_router_discovery(skb);
+ reason = ndisc_router_discovery(skb);
break;
case NDISC_REDIRECT:
- ndisc_redirect_rcv(skb);
+ reason = ndisc_redirect_rcv(skb);
break;
}
- return 0;
+ return reason;
}
static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
@@ -1765,25 +1855,37 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event,
struct netdev_notifier_change_info *change_info;
struct net *net = dev_net(dev);
struct inet6_dev *idev;
+ bool evict_nocarrier;
switch (event) {
case NETDEV_CHANGEADDR:
neigh_changeaddr(&nd_tbl, dev);
fib6_run_gc(0, net, false);
- /* fallthrough */
+ fallthrough;
case NETDEV_UP:
idev = in6_dev_get(dev);
if (!idev)
break;
- if (idev->cnf.ndisc_notify ||
- net->ipv6.devconf_all->ndisc_notify)
+ if (READ_ONCE(idev->cnf.ndisc_notify) ||
+ READ_ONCE(net->ipv6.devconf_all->ndisc_notify))
ndisc_send_unsol_na(dev);
in6_dev_put(idev);
break;
case NETDEV_CHANGE:
+ idev = in6_dev_get(dev);
+ if (!idev)
+ evict_nocarrier = true;
+ else {
+ evict_nocarrier = READ_ONCE(idev->cnf.ndisc_evict_nocarrier) &&
+ READ_ONCE(net->ipv6.devconf_all->ndisc_evict_nocarrier);
+ in6_dev_put(idev);
+ }
+
change_info = ptr;
if (change_info->flags_changed & IFF_NOARP)
neigh_changeaddr(&nd_tbl, dev);
+ if (evict_nocarrier && !netif_carrier_ok(dev))
+ neigh_carrier_down(&nd_tbl, dev);
break;
case NETDEV_DOWN:
neigh_ifdown(&nd_tbl, dev);
@@ -1805,13 +1907,13 @@ static struct notifier_block ndisc_netdev_notifier = {
};
#ifdef CONFIG_SYSCTL
-static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl,
+static void ndisc_warn_deprecated_sysctl(const struct ctl_table *ctl,
const char *func, const char *dev_name)
{
static char warncomm[TASK_COMM_LEN];
static int warned;
if (strcmp(warncomm, current->comm) && warned < 5) {
- strcpy(warncomm, current->comm);
+ strscpy(warncomm, current->comm);
pr_warn("process `%s' is using deprecated sysctl (%s) net.ipv6.neigh.%s.%s - use net.ipv6.neigh.%s.%s_ms instead\n",
warncomm, func,
dev_name, ctl->procname,
@@ -1820,7 +1922,8 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl,
}
}
-int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos)
+int ndisc_ifinfo_sysctl_change(const struct ctl_table *ctl, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
struct net_device *dev = ctl->extra1;
struct inet6_dev *idev;
@@ -1845,10 +1948,10 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *bu
ret = -1;
if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) {
- if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME))
- idev->nd_parms->reachable_time =
- neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME));
- idev->tstamp = jiffies;
+ if (ctl->data == NEIGH_VAR_PTR(idev->nd_parms, BASE_REACHABLE_TIME))
+ neigh_set_reach_time(idev->nd_parms);
+
+ WRITE_ONCE(idev->tstamp, jiffies);
inet6_ifinfo_notify(RTM_NEWLINK, idev);
in6_dev_put(idev);
}
@@ -1867,9 +1970,8 @@ static int __net_init ndisc_net_init(struct net *net)
err = inet_ctl_sock_create(&sk, PF_INET6,
SOCK_RAW, IPPROTO_ICMPV6, net);
if (err < 0) {
- ND_PRINTK(0, err,
- "NDISC: Failed to initialize the control socket (err %d)\n",
- err);
+ net_err_ratelimited("NDISC: Failed to initialize the control socket (err %d)\n",
+ err);
return err;
}
@@ -1878,7 +1980,7 @@ static int __net_init ndisc_net_init(struct net *net)
np = inet6_sk(sk);
np->hop_limit = 255;
/* Do not loopback ndisc messages */
- np->mc_loop = 0;
+ inet6_clear_bit(MC6_LOOP, sk);
return 0;
}
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 5ae8e1c51079..46540a5a4331 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -16,22 +16,36 @@
#include <net/ip6_route.h>
#include <net/xfrm.h>
#include <net/netfilter/nf_queue.h>
+#include <net/netfilter/nf_conntrack_bridge.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include "../bridge/br_private.h"
-int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
+int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff *skb)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
- struct sock *sk = sk_to_full_sk(skb->sk);
+ struct sock *sk = sk_to_full_sk(sk_partial);
+ struct net_device *dev = skb_dst_dev(skb);
+ struct flow_keys flkeys;
unsigned int hh_len;
struct dst_entry *dst;
+ int strict = (ipv6_addr_type(&iph->daddr) &
+ (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
struct flowi6 fl6 = {
- .flowi6_oif = sk ? sk->sk_bound_dev_if : 0,
+ .flowi6_l3mdev = l3mdev_master_ifindex(dev),
.flowi6_mark = skb->mark,
.flowi6_uid = sock_net_uid(net, sk),
.daddr = iph->daddr,
.saddr = iph->saddr,
+ .flowlabel = ip6_flowinfo(iph),
};
int err;
+ if (sk && sk->sk_bound_dev_if)
+ fl6.flowi6_oif = sk->sk_bound_dev_if;
+ else if (strict)
+ fl6.flowi6_oif = dev->ifindex;
+
+ fib6_rules_early_flow_dissect(net, skb, &fl6, &flkeys);
dst = ip6_route_output(net, sk, &fl6);
err = dst->error;
if (err) {
@@ -48,8 +62,11 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
#ifdef CONFIG_XFRM
if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
- xfrm_decode_session(skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) {
- skb_dst_set(skb, NULL);
+ xfrm_decode_session(net, skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) {
+ /* ignore return value from skb_dstref_steal, xfrm_lookup takes
+ * care of dropping the refcnt if needed.
+ */
+ skb_dstref_steal(skb);
dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
@@ -58,7 +75,7 @@ int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
#endif
/* Change in oif may mean change in hh_len. */
- hh_len = skb_dst(skb)->dev->hard_header_len;
+ hh_len = skb_dst_dev(skb)->hard_header_len;
if (skb_headroom(skb) < hh_len &&
pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)),
0, GFP_ATOMIC))
@@ -78,13 +95,13 @@ static int nf_ip6_reroute(struct sk_buff *skb,
if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) ||
!ipv6_addr_equal(&iph->saddr, &rt_info->saddr) ||
skb->mark != rt_info->mark)
- return ip6_route_me_harder(entry->state.net, skb);
+ return ip6_route_me_harder(entry->state.net, entry->state.sk, skb);
}
return 0;
}
-static int nf_ip6_route(struct net *net, struct dst_entry **dst,
- struct flowi *fl, bool strict)
+int __nf_ip6_route(struct net *net, struct dst_entry **dst,
+ struct flowi *fl, bool strict)
{
static const struct ipv6_pinfo fake_pinfo;
static const struct inet_sock fake_sk = {
@@ -104,13 +121,145 @@ static int nf_ip6_route(struct net *net, struct dst_entry **dst,
*dst = result;
return err;
}
+EXPORT_SYMBOL_GPL(__nf_ip6_route);
+
+int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ struct nf_bridge_frag_data *data,
+ int (*output)(struct net *, struct sock *sk,
+ const struct nf_bridge_frag_data *data,
+ struct sk_buff *))
+{
+ int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+ u8 tstamp_type = skb->tstamp_type;
+ ktime_t tstamp = skb->tstamp;
+ struct ip6_frag_state state;
+ u8 *prevhdr, nexthdr = 0;
+ unsigned int mtu, hlen;
+ int hroom, err = 0;
+ __be32 frag_id;
+
+ err = ip6_find_1stfragopt(skb, &prevhdr);
+ if (err < 0)
+ goto blackhole;
+ hlen = err;
+ nexthdr = *prevhdr;
+
+ mtu = skb->dev->mtu;
+ if (frag_max_size > mtu ||
+ frag_max_size < IPV6_MIN_MTU)
+ goto blackhole;
+
+ mtu = frag_max_size;
+ if (mtu < hlen + sizeof(struct frag_hdr) + 8)
+ goto blackhole;
+ mtu -= hlen + sizeof(struct frag_hdr);
+
+ frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
+ &ipv6_hdr(skb)->saddr);
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL &&
+ (err = skb_checksum_help(skb)))
+ goto blackhole;
+
+ hroom = LL_RESERVED_SPACE(skb->dev);
+ if (skb_has_frag_list(skb)) {
+ unsigned int first_len = skb_pagelen(skb);
+ struct ip6_fraglist_iter iter;
+ struct sk_buff *frag2;
+
+ if (first_len - hlen > mtu)
+ goto blackhole;
+
+ if (skb_cloned(skb) ||
+ skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
+ goto slow_path;
+
+ skb_walk_frags(skb, frag2) {
+ if (frag2->len > mtu)
+ goto blackhole;
+
+ /* Partially cloned skb? */
+ if (skb_shared(frag2) ||
+ skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr)))
+ goto slow_path;
+ }
+
+ err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
+ &iter);
+ if (err < 0)
+ goto blackhole;
+
+ for (;;) {
+ /* Prepare header of the next frame,
+ * before previous one went down.
+ */
+ if (iter.frag)
+ ip6_fraglist_prepare(skb, &iter);
+
+ skb_set_delivery_time(skb, tstamp, tstamp_type);
+ err = output(net, sk, data, skb);
+ if (err || !iter.frag)
+ break;
+
+ skb = ip6_fraglist_next(&iter);
+ }
+
+ kfree(iter.tmp_hdr);
+ if (!err)
+ return 0;
+
+ kfree_skb_list(iter.frag);
+ return err;
+ }
+slow_path:
+ /* This is a linearized skbuff, the original geometry is lost for us.
+ * This may also be a clone skbuff, we could preserve the geometry for
+ * the copies but probably not worth the effort.
+ */
+ ip6_frag_init(skb, hlen, mtu, skb->dev->needed_tailroom,
+ LL_RESERVED_SPACE(skb->dev), prevhdr, nexthdr, frag_id,
+ &state);
+
+ while (state.left > 0) {
+ struct sk_buff *skb2;
+
+ skb2 = ip6_frag_next(skb, &state);
+ if (IS_ERR(skb2)) {
+ err = PTR_ERR(skb2);
+ goto blackhole;
+ }
+
+ skb_set_delivery_time(skb2, tstamp, tstamp_type);
+ err = output(net, sk, data, skb2);
+ if (err)
+ goto blackhole;
+ }
+ consume_skb(skb);
+ return err;
+
+blackhole:
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(br_ip6_fragment);
static const struct nf_ipv6_ops ipv6ops = {
+#if IS_MODULE(CONFIG_IPV6)
.chk_addr = ipv6_chk_addr,
- .route_input = ip6_route_input,
+ .route_me_harder = ip6_route_me_harder,
+ .dev_get_saddr = ipv6_dev_get_saddr,
+ .route = __nf_ip6_route,
+#if IS_ENABLED(CONFIG_SYN_COOKIES)
+ .cookie_init_sequence = __cookie_v6_init_sequence,
+ .cookie_v6_check = __cookie_v6_check,
+#endif
+#endif
+ .route_input = ip6_route_input,
.fragment = ip6_fragment,
- .route = nf_ip6_route,
.reroute = nf_ip6_reroute,
+#if IS_MODULE(CONFIG_IPV6)
+ .br_fragment = br_ip6_fragment,
+#endif
};
int __init ipv6_netfilter_init(void)
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 339d0762b027..81daf82ddc2d 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# IP netfilter configuration
#
@@ -5,6 +6,17 @@
menu "IPv6: Netfilter Configuration"
depends on INET && IPV6 && NETFILTER
+# old sockopt interface and eval loop
+config IP6_NF_IPTABLES_LEGACY
+ tristate "Legacy IP6 tables support"
+ depends on INET && IPV6 && NETFILTER_XTABLES_LEGACY
+ depends on NETFILTER_XTABLES
+ default m if NETFILTER_XTABLES_LEGACY
+ help
+ ip6tables is a legacy packet classifier.
+ This is not needed if you are using iptables over nftables
+ (iptables-nft).
+
config NF_SOCKET_IPV6
tristate "IPv6 socket lookup support"
help
@@ -23,42 +35,6 @@ config NF_TABLES_IPV6
if NF_TABLES_IPV6
-config NFT_CHAIN_ROUTE_IPV6
- tristate "IPv6 nf_tables route chain support"
- help
- This option enables the "route" chain for IPv6 in nf_tables. This
- chain type is used to force packet re-routing after mangling header
- fields such as the source, destination, flowlabel, hop-limit and
- the packet mark.
-
-if NF_NAT_IPV6
-
-config NFT_CHAIN_NAT_IPV6
- tristate "IPv6 nf_tables nat chain support"
- help
- This option enables the "nat" chain for IPv6 in nf_tables. This
- chain type is used to perform Network Address Translation (NAT)
- packet transformations such as the source, destination address and
- source and destination ports.
-
-config NFT_MASQ_IPV6
- tristate "IPv6 masquerade support for nf_tables"
- depends on NFT_MASQ
- select NF_NAT_MASQUERADE_IPV6
- help
- This is the expression that provides IPv4 masquerading support for
- nf_tables.
-
-config NFT_REDIR_IPV6
- tristate "IPv6 redirect support for nf_tables"
- depends on NFT_REDIR
- select NF_NAT_REDIRECT
- help
- This is the expression that provides IPv4 redirect support for
- nf_tables.
-
-endif # NF_NAT_IPV6
-
config NFT_REJECT_IPV6
select NF_REJECT_IPV6
default NFT_REJECT
@@ -82,14 +58,6 @@ config NFT_FIB_IPV6
endif # NF_TABLES_IPV6
endif # NF_TABLES
-config NF_FLOW_TABLE_IPV6
- tristate "Netfilter flow table IPv6 module"
- depends on NF_FLOW_TABLE
- help
- This option adds the flow table IPv6 support.
-
- To compile it as a module, choose M here.
-
config NF_DUP_IPV6
tristate "Netfilter IPv6 packet duplication to alternate destination"
depends on !NF_CONNTRACK || NF_CONNTRACK
@@ -104,24 +72,10 @@ config NF_REJECT_IPV6
config NF_LOG_IPV6
tristate "IPv6 packet logging"
default m if NETFILTER_ADVANCED=n
- select NF_LOG_COMMON
-
-config NF_NAT_IPV6
- tristate "IPv6 NAT"
- depends on NF_CONNTRACK
- depends on NETFILTER_ADVANCED
- select NF_NAT
+ select NF_LOG_SYSLOG
help
- The IPv6 NAT option allows masquerading, port forwarding and other
- forms of full Network Address Port Translation. This can be
- controlled by iptables or nft.
-
-if NF_NAT_IPV6
-
-config NF_NAT_MASQUERADE_IPV6
- bool
-
-endif # NF_NAT_IPV6
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects CONFIG_NF_LOG_SYSLOG.
config IP6_NF_IPTABLES
tristate "IP6 tables support (required for filtering)"
@@ -179,10 +133,10 @@ config IP6_NF_MATCH_HL
tristate '"hl" hoplimit match support'
depends on NETFILTER_ADVANCED
select NETFILTER_XT_MATCH_HL
- ---help---
- This is a backwards-compat option for the user's convenience
- (e.g. when running oldconfig). It selects
- CONFIG_NETFILTER_XT_MATCH_HL.
+ help
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_MATCH_HL.
config IP6_NF_MATCH_IPV6HEADER
tristate '"ipv6header" IPv6 Extension Headers Match'
@@ -204,8 +158,8 @@ config IP6_NF_MATCH_MH
config IP6_NF_MATCH_RPFILTER
tristate '"rpfilter" reverse path filter match support'
depends on NETFILTER_ADVANCED
- depends on IP6_NF_MANGLE || IP6_NF_RAW
- ---help---
+ depends on IP6_NF_MANGLE || IP6_NF_RAW || NFT_COMPAT
+ help
This option allows you to match packets whose replies would
go out via the interface the packet came in.
@@ -222,27 +176,29 @@ config IP6_NF_MATCH_RT
To compile it as a module, choose M here. If unsure, say N.
config IP6_NF_MATCH_SRH
- tristate '"srh" Segment Routing header match support'
- depends on NETFILTER_ADVANCED
- help
- srh matching allows you to match packets based on the segment
+ tristate '"srh" Segment Routing header match support'
+ depends on NETFILTER_ADVANCED
+ help
+ srh matching allows you to match packets based on the segment
routing header of the packet.
- To compile it as a module, choose M here. If unsure, say N.
+ To compile it as a module, choose M here. If unsure, say N.
# The targets
config IP6_NF_TARGET_HL
tristate '"HL" hoplimit target support'
depends on NETFILTER_ADVANCED && IP6_NF_MANGLE
select NETFILTER_XT_TARGET_HL
- ---help---
- This is a backwards-compatible option for the user's convenience
- (e.g. when running oldconfig). It selects
- CONFIG_NETFILTER_XT_TARGET_HL.
+ help
+ This is a backwards-compatible option for the user's convenience
+ (e.g. when running oldconfig). It selects
+ CONFIG_NETFILTER_XT_TARGET_HL.
config IP6_NF_FILTER
tristate "Packet filtering"
- default m if NETFILTER_ADVANCED=n
+ default m if NETFILTER_ADVANCED=n || IP6_NF_IPTABLES_LEGACY
+ depends on IP6_NF_IPTABLES_LEGACY
+ tristate
help
Packet filtering defines a table `filter', which has a series of
rules for simple packet filtering at local input, forwarding and
@@ -252,7 +208,7 @@ config IP6_NF_FILTER
config IP6_NF_TARGET_REJECT
tristate "REJECT target support"
- depends on IP6_NF_FILTER
+ depends on IP6_NF_FILTER || NFT_COMPAT
select NF_REJECT_IPV6
default m if NETFILTER_ADVANCED=n
help
@@ -277,7 +233,8 @@ config IP6_NF_TARGET_SYNPROXY
config IP6_NF_MANGLE
tristate "Packet mangling"
- default m if NETFILTER_ADVANCED=n
+ default m if NETFILTER_ADVANCED=n || IP6_NF_IPTABLES_LEGACY
+ depends on IP6_NF_IPTABLES_LEGACY
help
This option adds a `mangle' table to iptables: see the man page for
iptables(8). This table is used for various packet alterations
@@ -287,31 +244,33 @@ config IP6_NF_MANGLE
config IP6_NF_RAW
tristate 'raw table support (required for TRACE)'
+ depends on IP6_NF_IPTABLES_LEGACY
help
This option adds a `raw' table to ip6tables. This table is the very
first in the netfilter framework and hooks in at the PREROUTING
and OUTPUT chains.
If you want to compile it as a module, say M here and read
- <file:Documentation/kbuild/modules.txt>. If unsure, say `N'.
+ <file:Documentation/kbuild/modules.rst>. If unsure, say `N'.
# security table for MAC policy
config IP6_NF_SECURITY
- tristate "Security table"
- depends on SECURITY
- depends on NETFILTER_ADVANCED
- help
- This option adds a `security' table to iptables, for use
- with Mandatory Access Control (MAC) policy.
+ tristate "Security table"
+ depends on SECURITY
+ depends on NETFILTER_ADVANCED
+ depends on IP6_NF_IPTABLES_LEGACY
+ help
+ This option adds a `security' table to iptables, for use
+ with Mandatory Access Control (MAC) policy.
- If unsure, say N.
+ If unsure, say N.
config IP6_NF_NAT
tristate "ip6tables NAT support"
depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
+ depends on IP6_NF_IPTABLES_LEGACY
select NF_NAT
- select NF_NAT_IPV6
select NETFILTER_XT_NAT
help
This enables the `nat' table in ip6tables. This allows masquerading,
@@ -320,30 +279,23 @@ config IP6_NF_NAT
To compile it as a module, choose M here. If unsure, say N.
-if IP6_NF_NAT
-
config IP6_NF_TARGET_MASQUERADE
tristate "MASQUERADE target support"
- select NF_NAT_MASQUERADE_IPV6
+ select NETFILTER_XT_TARGET_MASQUERADE
+ depends on IP6_NF_NAT
help
- Masquerading is a special case of NAT: all outgoing connections are
- changed to seem to come from a particular interface's address, and
- if the interface goes down, those connections are lost. This is
- only useful for dialup accounts with dynamic IP address (ie. your IP
- address will be different on next dialup).
-
- To compile it as a module, choose M here. If unsure, say N.
+ This is a backwards-compat option for the user's convenience
+ (e.g. when running oldconfig). It selects NETFILTER_XT_TARGET_MASQUERADE.
config IP6_NF_TARGET_NPT
tristate "NPT (Network Prefix translation) target support"
+ depends on IP6_NF_NAT || NFT_COMPAT
help
This option adds the `SNPT' and `DNPT' target, which perform
stateless IPv6-to-IPv6 Network Prefix Translation per RFC 6296.
To compile it as a module, choose M here. If unsure, say N.
-endif # IP6_NF_NAT
-
endif # IP6_NF_IPTABLES
endmenu
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 200c0c235565..66ce6fa5b2f5 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -4,17 +4,13 @@
#
# Link order matters here.
-obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o
+obj-$(CONFIG_IP6_NF_IPTABLES_LEGACY) += ip6_tables.o
obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o
obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o
obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o
-nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o
-nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
-obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
-
# defrag
nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o
obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
@@ -22,26 +18,16 @@ obj-$(CONFIG_NF_DEFRAG_IPV6) += nf_defrag_ipv6.o
obj-$(CONFIG_NF_SOCKET_IPV6) += nf_socket_ipv6.o
obj-$(CONFIG_NF_TPROXY_IPV6) += nf_tproxy_ipv6.o
-# logging
-obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o
-
# reject
obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o
obj-$(CONFIG_NF_DUP_IPV6) += nf_dup_ipv6.o
# nf_tables
-obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o
-obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o
obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o
-obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o
-obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o
obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o
obj-$(CONFIG_NFT_FIB_IPV6) += nft_fib_ipv6.o
-# flow table support
-obj-$(CONFIG_NF_FLOW_TABLE_IPV6) += nf_flow_table_ipv6.o
-
# matches
obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o
obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o
@@ -54,7 +40,6 @@ obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o
obj-$(CONFIG_IP6_NF_MATCH_SRH) += ip6t_srh.o
# targets
-obj-$(CONFIG_IP6_NF_TARGET_MASQUERADE) += ip6t_MASQUERADE.o
obj-$(CONFIG_IP6_NF_TARGET_NPT) += ip6t_NPT.o
obj-$(CONFIG_IP6_NF_TARGET_REJECT) += ip6t_REJECT.o
obj-$(CONFIG_IP6_NF_TARGET_SYNPROXY) += ip6t_SYNPROXY.o
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index daf2e9e9193d..d585ac3c1113 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Packet matching code.
*
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
* Copyright (C) 2000-2005 Netfilter Core Team <coreteam@netfilter.org>
* Copyright (c) 2006-2010 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -21,7 +18,6 @@
#include <linux/netdevice.h>
#include <linux/module.h>
#include <linux/poison.h>
-#include <linux/icmpv6.h>
#include <net/ipv6.h>
#include <net/compat.h>
#include <linux/uaccess.h>
@@ -38,7 +34,6 @@
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
MODULE_DESCRIPTION("IPv6 packet filter");
-MODULE_ALIAS("ip6t_icmp6");
void *ip6t_alloc_initial_table(const struct xt_table *info)
{
@@ -54,7 +49,7 @@ ip6_packet_match(const struct sk_buff *skb,
const char *outdev,
const struct ip6t_ip6 *ip6info,
unsigned int *protoff,
- int *fragoff, bool *hotdrop)
+ u16 *fragoff, bool *hotdrop)
{
unsigned long ret;
const struct ipv6hdr *ipv6 = ipv6_hdr(skb);
@@ -250,10 +245,10 @@ ip6t_next_entry(const struct ip6t_entry *entry)
/* Returns one of the generic firewall policies, like NF_ACCEPT. */
unsigned int
-ip6t_do_table(struct sk_buff *skb,
- const struct nf_hook_state *state,
- struct xt_table *table)
+ip6t_do_table(void *priv, struct sk_buff *skb,
+ const struct nf_hook_state *state)
{
+ const struct xt_table *table = priv;
unsigned int hook = state->hook;
static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
/* Initializing verdict to NF_DROP keeps gcc happy. */
@@ -276,6 +271,7 @@ ip6t_do_table(struct sk_buff *skb,
* things we don't know, ie. tcp syn flag or ports). If the
* rule is also a fragment-specific rule, non-fragments won't
* match it. */
+ acpar.fragoff = 0;
acpar.hotdrop = false;
acpar.state = state;
@@ -296,7 +292,7 @@ ip6t_do_table(struct sk_buff *skb,
* but it is no problem since absolute verdict is issued by these.
*/
if (static_key_false(&xt_tee_enabled))
- jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated);
+ jumpstack += private->stacksize * current->in_nf_duplicate;
e = get_entry(table_base, private->hook_entry[hook]);
@@ -887,7 +883,7 @@ copy_entries_to_user(unsigned int total_size,
return ret;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
static void compat_standard_from_user(void *dst, const void *src)
{
int v = *(compat_int_t *)src;
@@ -963,8 +959,7 @@ static int compat_table_info(const struct xt_table_info *info,
}
#endif
-static int get_info(struct net *net, void __user *user,
- const int *len, int compat)
+static int get_info(struct net *net, void __user *user, const int *len)
{
char name[XT_TABLE_MAXNAMELEN];
struct xt_table *t;
@@ -977,18 +972,18 @@ static int get_info(struct net *net, void __user *user,
return -EFAULT;
name[XT_TABLE_MAXNAMELEN-1] = '\0';
-#ifdef CONFIG_COMPAT
- if (compat)
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
xt_compat_lock(AF_INET6);
#endif
t = xt_request_find_table_lock(net, AF_INET6, name);
if (!IS_ERR(t)) {
struct ip6t_getinfo info;
const struct xt_table_info *private = t->private;
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct xt_table_info tmp;
- if (compat) {
+ if (in_compat_syscall()) {
ret = compat_table_info(private, &tmp);
xt_compat_flush_offsets(AF_INET6);
private = &tmp;
@@ -1013,8 +1008,8 @@ static int get_info(struct net *net, void __user *user,
module_put(t->me);
} else
ret = PTR_ERR(t);
-#ifdef CONFIG_COMPAT
- if (compat)
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
xt_compat_unlock(AF_INET6);
#endif
return ret;
@@ -1065,7 +1060,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
struct xt_counters *counters;
struct ip6t_entry *iter;
- ret = 0;
counters = xt_counters_alloc(num_counters);
if (!counters) {
ret = -ENOMEM;
@@ -1111,7 +1105,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n");
}
vfree(counters);
- return ret;
+ return 0;
put_module:
module_put(t->me);
@@ -1123,7 +1117,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
}
static int
-do_replace(struct net *net, const void __user *user, unsigned int len)
+do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
int ret;
struct ip6t_replace tmp;
@@ -1131,7 +1125,9 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
void *loc_cpu_entry;
struct ip6t_entry *iter;
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ if (len < sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
return -EFAULT;
/* overflow check */
@@ -1139,6 +1135,8 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
return -ENOMEM;
if (tmp.num_counters == 0)
return -EINVAL;
+ if ((u64)len < (u64)tmp.size + sizeof(tmp))
+ return -EINVAL;
tmp.name[sizeof(tmp.name)-1] = 0;
@@ -1147,8 +1145,8 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
return -ENOMEM;
loc_cpu_entry = newinfo->entries;
- if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
- tmp.size) != 0) {
+ if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+ tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
}
@@ -1172,8 +1170,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
}
static int
-do_add_counters(struct net *net, const void __user *user, unsigned int len,
- int compat)
+do_add_counters(struct net *net, sockptr_t arg, unsigned int len)
{
unsigned int i;
struct xt_counters_info tmp;
@@ -1184,7 +1181,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
struct ip6t_entry *iter;
unsigned int addend;
- paddc = xt_copy_counters_from_user(user, len, &tmp, compat);
+ paddc = xt_copy_counters(arg, len, &tmp);
if (IS_ERR(paddc))
return PTR_ERR(paddc);
t = xt_find_table_lock(net, AF_INET6, tmp.name);
@@ -1220,7 +1217,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
return ret;
}
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
struct compat_ip6t_replace {
char name[XT_TABLE_MAXNAMELEN];
u32 valid_hooks;
@@ -1230,7 +1227,7 @@ struct compat_ip6t_replace {
u32 underflow[NF_INET_NUMHOOKS];
u32 num_counters;
compat_uptr_t counters; /* struct xt_counters * */
- struct compat_ip6t_entry entries[0];
+ struct compat_ip6t_entry entries[];
};
static int
@@ -1448,6 +1445,8 @@ translate_compat_table(struct net *net,
if (!newinfo)
goto out_unlock;
+ memset(newinfo->entries, 0, size);
+
newinfo->number = compatr->num_entries;
for (i = 0; i < NF_INET_NUMHOOKS; i++) {
newinfo->hook_entry[i] = compatr->hook_entry[i];
@@ -1498,7 +1497,7 @@ out_unlock:
}
static int
-compat_do_replace(struct net *net, void __user *user, unsigned int len)
+compat_do_replace(struct net *net, sockptr_t arg, unsigned int len)
{
int ret;
struct compat_ip6t_replace tmp;
@@ -1506,7 +1505,9 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
void *loc_cpu_entry;
struct ip6t_entry *iter;
- if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+ if (len < sizeof(tmp))
+ return -EINVAL;
+ if (copy_from_sockptr(&tmp, arg, sizeof(tmp)) != 0)
return -EFAULT;
/* overflow check */
@@ -1514,6 +1515,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return -ENOMEM;
if (tmp.num_counters == 0)
return -EINVAL;
+ if ((u64)len < (u64)tmp.size + sizeof(tmp))
+ return -EINVAL;
tmp.name[sizeof(tmp.name)-1] = 0;
@@ -1522,8 +1525,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return -ENOMEM;
loc_cpu_entry = newinfo->entries;
- if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
- tmp.size) != 0) {
+ if (copy_from_sockptr_offset(loc_cpu_entry, arg, sizeof(tmp),
+ tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
}
@@ -1546,35 +1549,10 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return ret;
}
-static int
-compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
- unsigned int len)
-{
- int ret;
-
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case IP6T_SO_SET_REPLACE:
- ret = compat_do_replace(sock_net(sk), user, len);
- break;
-
- case IP6T_SO_SET_ADD_COUNTERS:
- ret = do_add_counters(sock_net(sk), user, len, 1);
- break;
-
- default:
- ret = -EINVAL;
- }
-
- return ret;
-}
-
struct compat_ip6t_get_entries {
char name[XT_TABLE_MAXNAMELEN];
compat_uint_t size;
- struct compat_ip6t_entry entrytable[0];
+ struct compat_ip6t_entry entrytable[];
};
static int
@@ -1646,33 +1624,10 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
xt_compat_unlock(AF_INET6);
return ret;
}
-
-static int do_ip6t_get_ctl(struct sock *, int, void __user *, int *);
-
-static int
-compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
-{
- int ret;
-
- if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
- return -EPERM;
-
- switch (cmd) {
- case IP6T_SO_GET_INFO:
- ret = get_info(sock_net(sk), user, len, 1);
- break;
- case IP6T_SO_GET_ENTRIES:
- ret = compat_get_entries(sock_net(sk), user, len);
- break;
- default:
- ret = do_ip6t_get_ctl(sk, cmd, user, len);
- }
- return ret;
-}
#endif
static int
-do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+do_ip6t_set_ctl(struct sock *sk, int cmd, sockptr_t arg, unsigned int len)
{
int ret;
@@ -1681,11 +1636,16 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
switch (cmd) {
case IP6T_SO_SET_REPLACE:
- ret = do_replace(sock_net(sk), user, len);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_do_replace(sock_net(sk), arg, len);
+ else
+#endif
+ ret = do_replace(sock_net(sk), arg, len);
break;
case IP6T_SO_SET_ADD_COUNTERS:
- ret = do_add_counters(sock_net(sk), user, len, 0);
+ ret = do_add_counters(sock_net(sk), arg, len);
break;
default:
@@ -1705,11 +1665,16 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
switch (cmd) {
case IP6T_SO_GET_INFO:
- ret = get_info(sock_net(sk), user, len, 0);
+ ret = get_info(sock_net(sk), user, len);
break;
case IP6T_SO_GET_ENTRIES:
- ret = get_entries(sock_net(sk), user, len);
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
+ if (in_compat_syscall())
+ ret = compat_get_entries(sock_net(sk), user, len);
+ else
+#endif
+ ret = get_entries(sock_net(sk), user, len);
break;
case IP6T_SO_GET_REVISION_MATCH:
@@ -1766,10 +1731,11 @@ static void __ip6t_unregister_table(struct net *net, struct xt_table *table)
int ip6t_register_table(struct net *net, const struct xt_table *table,
const struct ip6t_replace *repl,
- const struct nf_hook_ops *ops,
- struct xt_table **res)
+ const struct nf_hook_ops *template_ops)
{
- int ret;
+ struct nf_hook_ops *ops;
+ unsigned int num_ops;
+ int ret, i;
struct xt_table_info *newinfo;
struct xt_table_info bootstrap = {0};
void *loc_cpu_entry;
@@ -1783,85 +1749,66 @@ int ip6t_register_table(struct net *net, const struct xt_table *table,
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(net, newinfo, loc_cpu_entry, repl);
- if (ret != 0)
- goto out_free;
+ if (ret != 0) {
+ xt_free_table_info(newinfo);
+ return ret;
+ }
new_table = xt_register_table(net, table, &bootstrap, newinfo);
if (IS_ERR(new_table)) {
- ret = PTR_ERR(new_table);
- goto out_free;
+ struct ip6t_entry *iter;
+
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
+ xt_free_table_info(newinfo);
+ return PTR_ERR(new_table);
}
- /* set res now, will see skbs right after nf_register_net_hooks */
- WRITE_ONCE(*res, new_table);
- if (!ops)
+ if (!template_ops)
return 0;
- ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
- if (ret != 0) {
- __ip6t_unregister_table(net, new_table);
- *res = NULL;
+ num_ops = hweight32(table->valid_hooks);
+ if (num_ops == 0) {
+ ret = -EINVAL;
+ goto out_free;
+ }
+
+ ops = kmemdup_array(template_ops, num_ops, sizeof(*ops), GFP_KERNEL);
+ if (!ops) {
+ ret = -ENOMEM;
+ goto out_free;
}
+ for (i = 0; i < num_ops; i++)
+ ops[i].priv = new_table;
+
+ new_table->ops = ops;
+
+ ret = nf_register_net_hooks(net, ops, num_ops);
+ if (ret != 0)
+ goto out_free;
+
return ret;
out_free:
- xt_free_table_info(newinfo);
+ __ip6t_unregister_table(net, new_table);
return ret;
}
-void ip6t_unregister_table(struct net *net, struct xt_table *table,
- const struct nf_hook_ops *ops)
+void ip6t_unregister_table_pre_exit(struct net *net, const char *name)
{
- if (ops)
- nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
- __ip6t_unregister_table(net, table);
-}
+ struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name);
-/* Returns 1 if the type and code is matched by the range, 0 otherwise */
-static inline bool
-icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
- u_int8_t type, u_int8_t code,
- bool invert)
-{
- return (type == test_type && code >= min_code && code <= max_code)
- ^ invert;
+ if (table)
+ nf_unregister_net_hooks(net, table->ops, hweight32(table->valid_hooks));
}
-static bool
-icmp6_match(const struct sk_buff *skb, struct xt_action_param *par)
+void ip6t_unregister_table_exit(struct net *net, const char *name)
{
- const struct icmp6hdr *ic;
- struct icmp6hdr _icmph;
- const struct ip6t_icmp *icmpinfo = par->matchinfo;
-
- /* Must not be a fragment. */
- if (par->fragoff != 0)
- return false;
-
- ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph);
- if (ic == NULL) {
- /* We've been asked to examine this packet, and we
- * can't. Hence, no choice but to drop.
- */
- par->hotdrop = true;
- return false;
- }
+ struct xt_table *table = xt_find_table(net, NFPROTO_IPV6, name);
- return icmp6_type_code_match(icmpinfo->type,
- icmpinfo->code[0],
- icmpinfo->code[1],
- ic->icmp6_type, ic->icmp6_code,
- !!(icmpinfo->invflags&IP6T_ICMP_INV));
-}
-
-/* Called when user tries to insert an entry of this type. */
-static int icmp6_checkentry(const struct xt_mtchk_param *par)
-{
- const struct ip6t_icmp *icmpinfo = par->matchinfo;
-
- /* Must specify no unknown invflags */
- return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? -EINVAL : 0;
+ if (table)
+ __ip6t_unregister_table(net, table);
}
/* The built-in targets: standard (NULL) and error. */
@@ -1870,7 +1817,7 @@ static struct xt_target ip6t_builtin_tg[] __read_mostly = {
.name = XT_STANDARD_TARGET,
.targetsize = sizeof(int),
.family = NFPROTO_IPV6,
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_NETFILTER_XTABLES_COMPAT
.compatsize = sizeof(compat_int_t),
.compat_from_user = compat_standard_from_user,
.compat_to_user = compat_standard_to_user,
@@ -1889,30 +1836,12 @@ static struct nf_sockopt_ops ip6t_sockopts = {
.set_optmin = IP6T_BASE_CTL,
.set_optmax = IP6T_SO_SET_MAX+1,
.set = do_ip6t_set_ctl,
-#ifdef CONFIG_COMPAT
- .compat_set = compat_do_ip6t_set_ctl,
-#endif
.get_optmin = IP6T_BASE_CTL,
.get_optmax = IP6T_SO_GET_MAX+1,
.get = do_ip6t_get_ctl,
-#ifdef CONFIG_COMPAT
- .compat_get = compat_do_ip6t_get_ctl,
-#endif
.owner = THIS_MODULE,
};
-static struct xt_match ip6t_builtin_mt[] __read_mostly = {
- {
- .name = "icmp6",
- .match = icmp6_match,
- .matchsize = sizeof(struct ip6t_icmp),
- .checkentry = icmp6_checkentry,
- .proto = IPPROTO_ICMPV6,
- .family = NFPROTO_IPV6,
- .me = THIS_MODULE,
- },
-};
-
static int __net_init ip6_tables_net_init(struct net *net)
{
return xt_proto_init(net, NFPROTO_IPV6);
@@ -1940,19 +1869,14 @@ static int __init ip6_tables_init(void)
ret = xt_register_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg));
if (ret < 0)
goto err2;
- ret = xt_register_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt));
- if (ret < 0)
- goto err4;
/* Register setsockopt */
ret = nf_register_sockopt(&ip6t_sockopts);
if (ret < 0)
- goto err5;
+ goto err4;
return 0;
-err5:
- xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt));
err4:
xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg));
err2:
@@ -1965,13 +1889,13 @@ static void __exit ip6_tables_fini(void)
{
nf_unregister_sockopt(&ip6t_sockopts);
- xt_unregister_matches(ip6t_builtin_mt, ARRAY_SIZE(ip6t_builtin_mt));
xt_unregister_targets(ip6t_builtin_tg, ARRAY_SIZE(ip6t_builtin_tg));
unregister_pernet_subsys(&ip6_tables_net_ops);
}
EXPORT_SYMBOL(ip6t_register_table);
-EXPORT_SYMBOL(ip6t_unregister_table);
+EXPORT_SYMBOL(ip6t_unregister_table_pre_exit);
+EXPORT_SYMBOL(ip6t_unregister_table_exit);
EXPORT_SYMBOL(ip6t_do_table);
module_init(ip6_tables_init);
diff --git a/net/ipv6/netfilter/ip6t_MASQUERADE.c b/net/ipv6/netfilter/ip6t_MASQUERADE.c
deleted file mode 100644
index 491f808e356a..000000000000
--- a/net/ipv6/netfilter/ip6t_MASQUERADE.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on Rusty Russell's IPv6 MASQUERADE target. Development of IPv6
- * NAT funded by Astaro.
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/netdevice.h>
-#include <linux/ipv6.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv6.h>
-#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/addrconf.h>
-#include <net/ipv6.h>
-#include <net/netfilter/ipv6/nf_nat_masquerade.h>
-
-static unsigned int
-masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par)
-{
- return nf_nat_masquerade_ipv6(skb, par->targinfo, xt_out(par));
-}
-
-static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
-{
- const struct nf_nat_range2 *range = par->targinfo;
-
- if (range->flags & NF_NAT_RANGE_MAP_IPS)
- return -EINVAL;
- return nf_ct_netns_get(par->net, par->family);
-}
-
-static void masquerade_tg6_destroy(const struct xt_tgdtor_param *par)
-{
- nf_ct_netns_put(par->net, par->family);
-}
-
-static struct xt_target masquerade_tg6_reg __read_mostly = {
- .name = "MASQUERADE",
- .family = NFPROTO_IPV6,
- .checkentry = masquerade_tg6_checkentry,
- .destroy = masquerade_tg6_destroy,
- .target = masquerade_tg6,
- .targetsize = sizeof(struct nf_nat_range),
- .table = "nat",
- .hooks = 1 << NF_INET_POST_ROUTING,
- .me = THIS_MODULE,
-};
-
-static int __init masquerade_tg6_init(void)
-{
- int err;
-
- err = xt_register_target(&masquerade_tg6_reg);
- if (err == 0)
- nf_nat_masquerade_ipv6_register_notifier();
-
- return err;
-}
-static void __exit masquerade_tg6_exit(void)
-{
- nf_nat_masquerade_ipv6_unregister_notifier();
- xt_unregister_target(&masquerade_tg6_reg);
-}
-
-module_init(masquerade_tg6_init);
-module_exit(masquerade_tg6_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_DESCRIPTION("Xtables: automatic address SNAT");
diff --git a/net/ipv6/netfilter/ip6t_NPT.c b/net/ipv6/netfilter/ip6t_NPT.c
index a379d2f79b19..787c74aa85e3 100644
--- a/net/ipv6/netfilter/ip6t_NPT.c
+++ b/net/ipv6/netfilter/ip6t_NPT.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2011, 2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -80,16 +77,43 @@ static bool ip6t_npt_map_pfx(const struct ip6t_npt_tginfo *npt,
return true;
}
+static struct ipv6hdr *icmpv6_bounced_ipv6hdr(struct sk_buff *skb,
+ struct ipv6hdr *_bounced_hdr)
+{
+ if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
+ return NULL;
+
+ if (!icmpv6_is_err(icmp6_hdr(skb)->icmp6_type))
+ return NULL;
+
+ return skb_header_pointer(skb,
+ skb_transport_offset(skb) + sizeof(struct icmp6hdr),
+ sizeof(struct ipv6hdr),
+ _bounced_hdr);
+}
+
static unsigned int
ip6t_snpt_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ip6t_npt_tginfo *npt = par->targinfo;
+ struct ipv6hdr _bounced_hdr;
+ struct ipv6hdr *bounced_hdr;
+ struct in6_addr bounced_pfx;
if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->saddr)) {
icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD,
offsetof(struct ipv6hdr, saddr));
return NF_DROP;
}
+
+ /* rewrite dst addr of bounced packet which was sent to dst range */
+ bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr);
+ if (bounced_hdr) {
+ ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->daddr, npt->src_pfx_len);
+ if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0)
+ ip6t_npt_map_pfx(npt, &bounced_hdr->daddr);
+ }
+
return XT_CONTINUE;
}
@@ -97,12 +121,24 @@ static unsigned int
ip6t_dnpt_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct ip6t_npt_tginfo *npt = par->targinfo;
+ struct ipv6hdr _bounced_hdr;
+ struct ipv6hdr *bounced_hdr;
+ struct in6_addr bounced_pfx;
if (!ip6t_npt_map_pfx(npt, &ipv6_hdr(skb)->daddr)) {
icmpv6_send(skb, ICMPV6_PARAMPROB, ICMPV6_HDR_FIELD,
offsetof(struct ipv6hdr, daddr));
return NF_DROP;
}
+
+ /* rewrite src addr of bounced packet which was sent from dst range */
+ bounced_hdr = icmpv6_bounced_ipv6hdr(skb, &_bounced_hdr);
+ if (bounced_hdr) {
+ ipv6_addr_prefix(&bounced_pfx, &bounced_hdr->saddr, npt->src_pfx_len);
+ if (ipv6_addr_cmp(&bounced_pfx, &npt->src_pfx.in6) == 0)
+ ip6t_npt_map_pfx(npt, &bounced_hdr->saddr);
+ }
+
return XT_CONTINUE;
}
diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c
index 38dea8ff680f..a35019d2e480 100644
--- a/net/ipv6/netfilter/ip6t_REJECT.c
+++ b/net/ipv6/netfilter/ip6t_REJECT.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IP6 tables REJECT target module
* Linux INET6 implementation
@@ -10,11 +11,6 @@
* Copyright (c) 2005-2007 Patrick McHardy <kaber@trash.net>
*
* Based on net/ipv4/netfilter/ipt_REJECT.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -65,7 +61,7 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par)
/* Do nothing */
break;
case IP6T_TCP_RESET:
- nf_send_reset6(net, skb, xt_hooknum(par));
+ nf_send_reset6(net, par->state->sk, skb, xt_hooknum(par));
break;
case IP6T_ICMP6_POLICY_FAIL:
nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, xt_hooknum(par));
diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c
index cb6d42b03cb5..d51d0c3e5fe9 100644
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -1,277 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2013 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <net/ip6_checksum.h>
-#include <net/ip6_route.h>
-#include <net/tcp.h>
-
#include <linux/netfilter_ipv6/ip6_tables.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_SYNPROXY.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/nf_conntrack_synproxy.h>
-#include <net/netfilter/nf_conntrack_ecache.h>
-
-static struct ipv6hdr *
-synproxy_build_ip(struct net *net, struct sk_buff *skb,
- const struct in6_addr *saddr,
- const struct in6_addr *daddr)
-{
- struct ipv6hdr *iph;
-
- skb_reset_network_header(skb);
- iph = skb_put(skb, sizeof(*iph));
- ip6_flow_hdr(iph, 0, 0);
- iph->hop_limit = net->ipv6.devconf_all->hop_limit;
- iph->nexthdr = IPPROTO_TCP;
- iph->saddr = *saddr;
- iph->daddr = *daddr;
-
- return iph;
-}
-
-static void
-synproxy_send_tcp(struct net *net,
- const struct sk_buff *skb, struct sk_buff *nskb,
- struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
- struct ipv6hdr *niph, struct tcphdr *nth,
- unsigned int tcp_hdr_size)
-{
- struct dst_entry *dst;
- struct flowi6 fl6;
-
- nth->check = ~tcp_v6_check(tcp_hdr_size, &niph->saddr, &niph->daddr, 0);
- nskb->ip_summed = CHECKSUM_PARTIAL;
- nskb->csum_start = (unsigned char *)nth - nskb->head;
- nskb->csum_offset = offsetof(struct tcphdr, check);
-
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_proto = IPPROTO_TCP;
- fl6.saddr = niph->saddr;
- fl6.daddr = niph->daddr;
- fl6.fl6_sport = nth->source;
- fl6.fl6_dport = nth->dest;
- security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6));
- dst = ip6_route_output(net, NULL, &fl6);
- if (dst->error) {
- dst_release(dst);
- goto free_nskb;
- }
- dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0);
- if (IS_ERR(dst))
- goto free_nskb;
-
- skb_dst_set(nskb, dst);
-
- if (nfct) {
- nf_ct_set(nskb, (struct nf_conn *)nfct, ctinfo);
- nf_conntrack_get(nfct);
- }
-
- ip6_local_out(net, nskb->sk, nskb);
- return;
-
-free_nskb:
- kfree_skb(nskb);
-}
-
-static void
-synproxy_send_client_synack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct ipv6hdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
- u16 mss = opts->mss;
-
- iph = ipv6_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->dest;
- nth->dest = th->source;
- nth->seq = htonl(__cookie_v6_init_sequence(iph, th, &mss));
- nth->ack_seq = htonl(ntohl(th->seq) + 1);
- tcp_flag_word(nth) = TCP_FLAG_SYN | TCP_FLAG_ACK;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
- tcp_flag_word(nth) |= TCP_FLAG_ECE;
- nth->doff = tcp_hdr_size / 4;
- nth->window = 0;
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
- IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-static void
-synproxy_send_server_syn(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts, u32 recv_seq)
-{
- struct synproxy_net *snet = synproxy_pernet(net);
- struct sk_buff *nskb;
- struct ipv6hdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ipv6_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->source;
- nth->dest = th->dest;
- nth->seq = htonl(recv_seq - 1);
- /* ack_seq is used to relay our ISN to the synproxy hook to initialize
- * sequence number translation once a connection tracking entry exists.
- */
- nth->ack_seq = htonl(ntohl(th->ack_seq) - 1);
- tcp_flag_word(nth) = TCP_FLAG_SYN;
- if (opts->options & XT_SYNPROXY_OPT_ECN)
- tcp_flag_word(nth) |= TCP_FLAG_ECE | TCP_FLAG_CWR;
- nth->doff = tcp_hdr_size / 4;
- nth->window = th->window;
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
- niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_server_ack(struct net *net,
- const struct ip_ct_tcp *state,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct ipv6hdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ipv6_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, &iph->daddr, &iph->saddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->dest;
- nth->dest = th->source;
- nth->seq = htonl(ntohl(th->ack_seq));
- nth->ack_seq = htonl(ntohl(th->seq) + 1);
- tcp_flag_word(nth) = TCP_FLAG_ACK;
- nth->doff = tcp_hdr_size / 4;
- nth->window = htons(state->seen[IP_CT_DIR_ORIGINAL].td_maxwin);
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
-}
-
-static void
-synproxy_send_client_ack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- const struct synproxy_options *opts)
-{
- struct sk_buff *nskb;
- struct ipv6hdr *iph, *niph;
- struct tcphdr *nth;
- unsigned int tcp_hdr_size;
-
- iph = ipv6_hdr(skb);
-
- tcp_hdr_size = sizeof(*nth) + synproxy_options_size(opts);
- nskb = alloc_skb(sizeof(*niph) + tcp_hdr_size + MAX_TCP_HEADER,
- GFP_ATOMIC);
- if (nskb == NULL)
- return;
- skb_reserve(nskb, MAX_TCP_HEADER);
-
- niph = synproxy_build_ip(net, nskb, &iph->saddr, &iph->daddr);
-
- skb_reset_transport_header(nskb);
- nth = skb_put(nskb, tcp_hdr_size);
- nth->source = th->source;
- nth->dest = th->dest;
- nth->seq = htonl(ntohl(th->seq) + 1);
- nth->ack_seq = th->ack_seq;
- tcp_flag_word(nth) = TCP_FLAG_ACK;
- nth->doff = tcp_hdr_size / 4;
- nth->window = htons(ntohs(th->window) >> opts->wscale);
- nth->check = 0;
- nth->urg_ptr = 0;
-
- synproxy_build_options(nth, opts);
-
- synproxy_send_tcp(net, skb, nskb, skb_nfct(skb),
- IP_CT_ESTABLISHED_REPLY, niph, nth, tcp_hdr_size);
-}
-
-static bool
-synproxy_recv_client_ack(struct net *net,
- const struct sk_buff *skb, const struct tcphdr *th,
- struct synproxy_options *opts, u32 recv_seq)
-{
- struct synproxy_net *snet = synproxy_pernet(net);
- int mss;
-
- mss = __cookie_v6_check(ipv6_hdr(skb), th, ntohl(th->ack_seq) - 1);
- if (mss == 0) {
- this_cpu_inc(snet->stats->cookie_invalid);
- return false;
- }
-
- this_cpu_inc(snet->stats->cookie_valid);
- opts->mss = mss;
- opts->options |= XT_SYNPROXY_OPT_MSS;
-
- if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
- synproxy_check_timestamp_cookie(opts);
-
- synproxy_send_server_syn(net, skb, th, opts, recv_seq);
- return true;
-}
+#include <net/netfilter/nf_synproxy.h>
static unsigned int
synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
@@ -300,6 +36,8 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
opts.options |= XT_SYNPROXY_OPT_ECN;
opts.options &= info->options;
+ opts.mss_encode = opts.mss_option;
+ opts.mss_option = info->mss;
if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
synproxy_init_timestamp_cookie(info, &opts);
else
@@ -307,13 +45,14 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
XT_SYNPROXY_OPT_SACK_PERM |
XT_SYNPROXY_OPT_ECN);
- synproxy_send_client_synack(net, skb, th, &opts);
+ synproxy_send_client_synack_ipv6(net, skb, th, &opts);
consume_skb(skb);
return NF_STOLEN;
} else if (th->ack && !(th->fin || th->rst || th->syn)) {
/* ACK from client */
- if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) {
+ if (synproxy_recv_client_ack_ipv6(net, skb, th, &opts,
+ ntohl(th->seq))) {
consume_skb(skb);
return NF_STOLEN;
} else {
@@ -324,141 +63,6 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-static unsigned int ipv6_synproxy_hook(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *nhs)
-{
- struct net *net = nhs->net;
- struct synproxy_net *snet = synproxy_pernet(net);
- enum ip_conntrack_info ctinfo;
- struct nf_conn *ct;
- struct nf_conn_synproxy *synproxy;
- struct synproxy_options opts = {};
- const struct ip_ct_tcp *state;
- struct tcphdr *th, _th;
- __be16 frag_off;
- u8 nexthdr;
- int thoff;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (ct == NULL)
- return NF_ACCEPT;
-
- synproxy = nfct_synproxy(ct);
- if (synproxy == NULL)
- return NF_ACCEPT;
-
- if (nf_is_loopback_packet(skb))
- return NF_ACCEPT;
-
- nexthdr = ipv6_hdr(skb)->nexthdr;
- thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
- &frag_off);
- if (thoff < 0 || nexthdr != IPPROTO_TCP)
- return NF_ACCEPT;
-
- th = skb_header_pointer(skb, thoff, sizeof(_th), &_th);
- if (th == NULL)
- return NF_DROP;
-
- state = &ct->proto.tcp;
- switch (state->state) {
- case TCP_CONNTRACK_CLOSE:
- if (th->rst && !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
- nf_ct_seqadj_init(ct, ctinfo, synproxy->isn -
- ntohl(th->seq) + 1);
- break;
- }
-
- if (!th->syn || th->ack ||
- CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
- break;
-
- /* Reopened connection - reset the sequence number and timestamp
- * adjustments, they will get initialized once the connection is
- * reestablished.
- */
- nf_ct_seqadj_init(ct, ctinfo, 0);
- synproxy->tsoff = 0;
- this_cpu_inc(snet->stats->conn_reopened);
-
- /* fall through */
- case TCP_CONNTRACK_SYN_SENT:
- if (!synproxy_parse_options(skb, thoff, th, &opts))
- return NF_DROP;
-
- if (!th->syn && th->ack &&
- CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
- /* Keep-Alives are sent with SEG.SEQ = SND.NXT-1,
- * therefore we need to add 1 to make the SYN sequence
- * number match the one of first SYN.
- */
- if (synproxy_recv_client_ack(net, skb, th, &opts,
- ntohl(th->seq) + 1)) {
- this_cpu_inc(snet->stats->cookie_retrans);
- consume_skb(skb);
- return NF_STOLEN;
- } else {
- return NF_DROP;
- }
- }
-
- synproxy->isn = ntohl(th->ack_seq);
- if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP)
- synproxy->its = opts.tsecr;
-
- nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
- break;
- case TCP_CONNTRACK_SYN_RECV:
- if (!th->syn || !th->ack)
- break;
-
- if (!synproxy_parse_options(skb, thoff, th, &opts))
- return NF_DROP;
-
- if (opts.options & XT_SYNPROXY_OPT_TIMESTAMP) {
- synproxy->tsoff = opts.tsval - synproxy->its;
- nf_conntrack_event_cache(IPCT_SYNPROXY, ct);
- }
-
- opts.options &= ~(XT_SYNPROXY_OPT_MSS |
- XT_SYNPROXY_OPT_WSCALE |
- XT_SYNPROXY_OPT_SACK_PERM);
-
- swap(opts.tsval, opts.tsecr);
- synproxy_send_server_ack(net, state, skb, th, &opts);
-
- nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
- nf_conntrack_event_cache(IPCT_SEQADJ, ct);
-
- swap(opts.tsval, opts.tsecr);
- synproxy_send_client_ack(net, skb, th, &opts);
-
- consume_skb(skb);
- return NF_STOLEN;
- default:
- break;
- }
-
- synproxy_tstamp_adjust(skb, thoff, th, ct, ctinfo, synproxy);
- return NF_ACCEPT;
-}
-
-static const struct nf_hook_ops ipv6_synproxy_ops[] = {
- {
- .hook = ipv6_synproxy_hook,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
- },
- {
- .hook = ipv6_synproxy_hook,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1,
- },
-};
-
static int synproxy_tg6_check(const struct xt_tgchk_param *par)
{
struct synproxy_net *snet = synproxy_pernet(par->net);
@@ -474,16 +78,12 @@ static int synproxy_tg6_check(const struct xt_tgchk_param *par)
if (err)
return err;
- if (snet->hook_ref6 == 0) {
- err = nf_register_net_hooks(par->net, ipv6_synproxy_ops,
- ARRAY_SIZE(ipv6_synproxy_ops));
- if (err) {
- nf_ct_netns_put(par->net, par->family);
- return err;
- }
+ err = nf_synproxy_ipv6_init(snet, par->net);
+ if (err) {
+ nf_ct_netns_put(par->net, par->family);
+ return err;
}
- snet->hook_ref6++;
return err;
}
@@ -491,10 +91,7 @@ static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par)
{
struct synproxy_net *snet = synproxy_pernet(par->net);
- snet->hook_ref6--;
- if (snet->hook_ref6 == 0)
- nf_unregister_net_hooks(par->net, ipv6_synproxy_ops,
- ARRAY_SIZE(ipv6_synproxy_ops));
+ nf_synproxy_ipv6_fini(snet, par->net);
nf_ct_netns_put(par->net, par->family);
}
@@ -524,3 +121,4 @@ module_exit(synproxy_tg6_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("Intercept IPv6 TCP connections and establish them using syncookies");
diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c
index 04099ab7d2e3..70da2f2ce064 100644
--- a/net/ipv6/netfilter/ip6t_ah.c
+++ b/net/ipv6/netfilter/ip6t_ah.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match AH parameters. */
/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -58,7 +55,7 @@ static bool ah_mt6(const struct sk_buff *skb, struct xt_action_param *par)
return false;
}
- hdrlen = (ah->hdrlen + 2) << 2;
+ hdrlen = ipv6_authlen(ah);
pr_debug("IPv6 AH LEN %u %u ", hdrlen, ah->hdrlen);
pr_debug("RES %04X ", ah->reserved);
@@ -77,8 +74,7 @@ static bool ah_mt6(const struct sk_buff *skb, struct xt_action_param *par)
ahinfo->hdrres, ah->reserved,
!(ahinfo->hdrres && ah->reserved));
- return (ah != NULL) &&
- spi_match(ahinfo->spis[0], ahinfo->spis[1],
+ return spi_match(ahinfo->spis[0], ahinfo->spis[1],
ntohl(ah->spi),
!!(ahinfo->invflags & IP6T_AH_INV_SPI)) &&
(!ahinfo->hdrlen ||
diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c
index aab0706908c5..d704f7ed300c 100644
--- a/net/ipv6/netfilter/ip6t_eui64.c
+++ b/net/ipv6/netfilter/ip6t_eui64.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match EUI64 address parameters. */
/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c
index 3b5735e56bfe..3aad6439386b 100644
--- a/net/ipv6/netfilter/ip6t_frag.c
+++ b/net/ipv6/netfilter/ip6t_frag.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match FRAG parameters. */
/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -88,8 +85,7 @@ frag_mt6(const struct sk_buff *skb, struct xt_action_param *par)
!((fraginfo->flags & IP6T_FRAG_NMF) &&
(ntohs(fh->frag_off) & IP6_MF)));
- return (fh != NULL) &&
- id_match(fraginfo->ids[0], fraginfo->ids[1],
+ return id_match(fraginfo->ids[0], fraginfo->ids[1],
ntohl(fh->identification),
!!(fraginfo->invflags & IP6T_FRAG_INV_IDS)) &&
!((fraginfo->flags & IP6T_FRAG_RES) &&
diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c
index 01df142bb027..e7a3fb9355ee 100644
--- a/net/ipv6/netfilter/ip6t_hbh.c
+++ b/net/ipv6/netfilter/ip6t_hbh.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match Hop-by-Hop and Destination parameters. */
/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -89,8 +86,7 @@ hbh_mt6(const struct sk_buff *skb, struct xt_action_param *par)
((optinfo->hdrlen == hdrlen) ^
!!(optinfo->invflags & IP6T_OPTS_INV_LEN))));
- ret = (oh != NULL) &&
- (!(optinfo->flags & IP6T_OPTS_LEN) ||
+ ret = (!(optinfo->flags & IP6T_OPTS_LEN) ||
((optinfo->hdrlen == hdrlen) ^
!!(optinfo->invflags & IP6T_OPTS_INV_LEN)));
diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c
index 8b147440fbdc..c52ff929c93b 100644
--- a/net/ipv6/netfilter/ip6t_ipv6header.c
+++ b/net/ipv6/netfilter/ip6t_ipv6header.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* ipv6header match - matches IPv6 packets based
on whether they contain certain headers */
@@ -5,10 +6,6 @@
* Rewritten by: Andras Kis-Szabo <kisza@sch.bme.hu> */
/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -19,7 +16,7 @@
#include <net/ipv6.h>
#include <linux/netfilter/x_tables.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_ipv6/ip6t_ipv6header.h>
MODULE_LICENSE("GPL");
@@ -45,7 +42,7 @@ ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par)
len = skb->len - ptr;
temp = 0;
- while (ip6t_ext_hdr(nexthdr)) {
+ while (nf_ip6_ext_hdr(nexthdr)) {
const struct ipv6_opt_hdr *hp;
struct ipv6_opt_hdr _hdr;
int hdrlen;
@@ -65,13 +62,16 @@ ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par)
}
hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
- BUG_ON(hp == NULL);
+ if (!hp) {
+ par->hotdrop = true;
+ return false;
+ }
/* Calculate the header length */
if (nexthdr == NEXTHDR_FRAGMENT)
hdrlen = 8;
else if (nexthdr == NEXTHDR_AUTH)
- hdrlen = (hp->hdrlen + 2) << 2;
+ hdrlen = ipv6_authlen(hp);
else
hdrlen = ipv6_optlen(hp);
diff --git a/net/ipv6/netfilter/ip6t_mh.c b/net/ipv6/netfilter/ip6t_mh.c
index 0c90c66b1992..fd492b69acbc 100644
--- a/net/ipv6/netfilter/ip6t_mh.c
+++ b/net/ipv6/netfilter/ip6t_mh.c
@@ -1,15 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C)2006 USAGI/WIDE Project
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Author:
* Masahide NAKAMURA @USAGI <masahide.nakamura.cz@hitachi.com>
*
* Based on net/netfilter/xt_tcpudp.c
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/types.h>
diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
index c3c6b09acdc4..67c87a88cde4 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2011 Florian Westphal <fw@strlen.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -40,8 +37,10 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
bool ret = false;
struct flowi6 fl6 = {
.flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev),
.flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
.flowi6_proto = iph->nexthdr,
+ .flowi6_uid = sock_net_uid(net, NULL),
.daddr = iph->saddr,
};
int lookup_flags;
@@ -73,7 +72,9 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
goto out;
}
- if (rt->rt6i_idev->dev == dev || (flags & XT_RPFILTER_LOOSE))
+ if (rt->rt6i_idev->dev == dev ||
+ l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == dev->ifindex ||
+ (flags & XT_RPFILTER_LOOSE))
ret = true;
out:
ip6_rt_put(rt);
diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c
index 2c99b94eeca3..4ad8b2032f1f 100644
--- a/net/ipv6/netfilter/ip6t_rt.c
+++ b/net/ipv6/netfilter/ip6t_rt.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Kernel module to match ROUTING parameters. */
/* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -28,12 +25,7 @@ MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>");
static inline bool
segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert)
{
- bool r;
- pr_debug("segsleft_match:%c 0x%x <= 0x%x <= 0x%x\n",
- invert ? '!' : ' ', min, id, max);
- r = (id >= min && id <= max) ^ invert;
- pr_debug(" result %s\n", r ? "PASS" : "FAILED");
- return r;
+ return (id >= min && id <= max) ^ invert;
}
static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
@@ -68,32 +60,7 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
return false;
}
- pr_debug("IPv6 RT LEN %u %u ", hdrlen, rh->hdrlen);
- pr_debug("TYPE %04X ", rh->type);
- pr_debug("SGS_LEFT %u %02X\n", rh->segments_left, rh->segments_left);
-
- pr_debug("IPv6 RT segsleft %02X ",
- segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1],
- rh->segments_left,
- !!(rtinfo->invflags & IP6T_RT_INV_SGS)));
- pr_debug("type %02X %02X %02X ",
- rtinfo->rt_type, rh->type,
- (!(rtinfo->flags & IP6T_RT_TYP) ||
- ((rtinfo->rt_type == rh->type) ^
- !!(rtinfo->invflags & IP6T_RT_INV_TYP))));
- pr_debug("len %02X %04X %02X ",
- rtinfo->hdrlen, hdrlen,
- !(rtinfo->flags & IP6T_RT_LEN) ||
- ((rtinfo->hdrlen == hdrlen) ^
- !!(rtinfo->invflags & IP6T_RT_INV_LEN)));
- pr_debug("res %02X %02X %02X ",
- rtinfo->flags & IP6T_RT_RES,
- ((const struct rt0_hdr *)rh)->reserved,
- !((rtinfo->flags & IP6T_RT_RES) &&
- (((const struct rt0_hdr *)rh)->reserved)));
-
- ret = (rh != NULL) &&
- (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1],
+ ret = (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1],
rh->segments_left,
!!(rtinfo->invflags & IP6T_RT_INV_SGS))) &&
(!(rtinfo->flags & IP6T_RT_LEN) ||
@@ -111,22 +78,22 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
reserved),
sizeof(_reserved),
&_reserved);
+ if (!rp) {
+ par->hotdrop = true;
+ return false;
+ }
ret = (*rp == 0);
}
- pr_debug("#%d ", rtinfo->addrnr);
if (!(rtinfo->flags & IP6T_RT_FST)) {
return ret;
} else if (rtinfo->flags & IP6T_RT_FST_NSTRICT) {
- pr_debug("Not strict ");
if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) {
- pr_debug("There isn't enough space\n");
return false;
} else {
unsigned int i = 0;
- pr_debug("#%d ", rtinfo->addrnr);
for (temp = 0;
temp < (unsigned int)((hdrlen - 8) / 16);
temp++) {
@@ -137,28 +104,25 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
sizeof(_addr),
&_addr);
- BUG_ON(ap == NULL);
+ if (ap == NULL) {
+ par->hotdrop = true;
+ return false;
+ }
- if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) {
- pr_debug("i=%d temp=%d;\n", i, temp);
+ if (ipv6_addr_equal(ap, &rtinfo->addrs[i]))
i++;
- }
if (i == rtinfo->addrnr)
break;
}
- pr_debug("i=%d #%d\n", i, rtinfo->addrnr);
if (i == rtinfo->addrnr)
return ret;
else
return false;
}
} else {
- pr_debug("Strict ");
if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) {
- pr_debug("There isn't enough space\n");
return false;
} else {
- pr_debug("#%d ", rtinfo->addrnr);
for (temp = 0; temp < rtinfo->addrnr; temp++) {
ap = skb_header_pointer(skb,
ptr
@@ -166,12 +130,14 @@ static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par)
+ temp * sizeof(_addr),
sizeof(_addr),
&_addr);
- BUG_ON(ap == NULL);
+ if (ap == NULL) {
+ par->hotdrop = true;
+ return false;
+ }
if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp]))
break;
}
- pr_debug("temp=%d #%d\n", temp, rtinfo->addrnr);
if (temp == rtinfo->addrnr &&
temp == (unsigned int)((hdrlen - 8) / 16))
return ret;
diff --git a/net/ipv6/netfilter/ip6t_srh.c b/net/ipv6/netfilter/ip6t_srh.c
index 1059894a6f4c..db0fd64d8986 100644
--- a/net/ipv6/netfilter/ip6t_srh.c
+++ b/net/ipv6/netfilter/ip6t_srh.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* Kernel module to match Segment Routing Header (SRH) parameters. */
/* Author:
* Ahmed Abdelsalam <amsalam20@gmail.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -210,6 +206,8 @@ static bool srh1_mt6(const struct sk_buff *skb, struct xt_action_param *par)
psidoff = srhoff + sizeof(struct ipv6_sr_hdr) +
((srh->segments_left + 1) * sizeof(struct in6_addr));
psid = skb_header_pointer(skb, psidoff, sizeof(_psid), &_psid);
+ if (!psid)
+ return false;
if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_PSID,
ipv6_masked_addr_cmp(psid, &srhinfo->psid_msk,
&srhinfo->psid_addr)))
@@ -223,6 +221,8 @@ static bool srh1_mt6(const struct sk_buff *skb, struct xt_action_param *par)
nsidoff = srhoff + sizeof(struct ipv6_sr_hdr) +
((srh->segments_left - 1) * sizeof(struct in6_addr));
nsid = skb_header_pointer(skb, nsidoff, sizeof(_nsid), &_nsid);
+ if (!nsid)
+ return false;
if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NSID,
ipv6_masked_addr_cmp(nsid, &srhinfo->nsid_msk,
&srhinfo->nsid_addr)))
@@ -233,6 +233,8 @@ static bool srh1_mt6(const struct sk_buff *skb, struct xt_action_param *par)
if (srhinfo->mt_flags & IP6T_SRH_LSID) {
lsidoff = srhoff + sizeof(struct ipv6_sr_hdr);
lsid = skb_header_pointer(skb, lsidoff, sizeof(_lsid), &_lsid);
+ if (!lsid)
+ return false;
if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LSID,
ipv6_masked_addr_cmp(lsid, &srhinfo->lsid_msk,
&srhinfo->lsid_addr)))
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 1343077dde93..e8992693e14a 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
*
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
* Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -22,84 +19,81 @@ MODULE_DESCRIPTION("ip6tables filter table");
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT))
-static int __net_init ip6table_filter_table_init(struct net *net);
-
static const struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_FILTER,
- .table_init = ip6table_filter_table_init,
};
-/* The work comes in here from netfilter.c. */
-static unsigned int
-ip6table_filter_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip6t_do_table(skb, state, state->net->ipv6.ip6table_filter);
-}
-
static struct nf_hook_ops *filter_ops __read_mostly;
/* Default to forward because I got too much mail already. */
static bool forward = true;
module_param(forward, bool, 0000);
-static int __net_init ip6table_filter_table_init(struct net *net)
+static int ip6table_filter_table_init(struct net *net)
{
struct ip6t_replace *repl;
int err;
- if (net->ipv6.ip6table_filter)
- return 0;
-
repl = ip6t_alloc_initial_table(&packet_filter);
if (repl == NULL)
return -ENOMEM;
/* Entry 1 is the FORWARD hook */
((struct ip6t_standard *)repl->entries)[1].target.verdict =
- forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
+ forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
- err = ip6t_register_table(net, &packet_filter, repl, filter_ops,
- &net->ipv6.ip6table_filter);
+ err = ip6t_register_table(net, &packet_filter, repl, filter_ops);
kfree(repl);
return err;
}
static int __net_init ip6table_filter_net_init(struct net *net)
{
- if (net == &init_net || !forward)
+ if (!forward)
return ip6table_filter_table_init(net);
return 0;
}
+static void __net_exit ip6table_filter_net_pre_exit(struct net *net)
+{
+ ip6t_unregister_table_pre_exit(net, "filter");
+}
+
static void __net_exit ip6table_filter_net_exit(struct net *net)
{
- if (!net->ipv6.ip6table_filter)
- return;
- ip6t_unregister_table(net, net->ipv6.ip6table_filter, filter_ops);
- net->ipv6.ip6table_filter = NULL;
+ ip6t_unregister_table_exit(net, "filter");
}
static struct pernet_operations ip6table_filter_net_ops = {
.init = ip6table_filter_net_init,
+ .pre_exit = ip6table_filter_net_pre_exit,
.exit = ip6table_filter_net_exit,
};
static int __init ip6table_filter_init(void)
{
- int ret;
+ int ret = xt_register_template(&packet_filter,
+ ip6table_filter_table_init);
- filter_ops = xt_hook_ops_alloc(&packet_filter, ip6table_filter_hook);
- if (IS_ERR(filter_ops))
+ if (ret < 0)
+ return ret;
+
+ filter_ops = xt_hook_ops_alloc(&packet_filter, ip6t_do_table);
+ if (IS_ERR(filter_ops)) {
+ xt_unregister_template(&packet_filter);
return PTR_ERR(filter_ops);
+ }
ret = register_pernet_subsys(&ip6table_filter_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ xt_unregister_template(&packet_filter);
kfree(filter_ops);
+ return ret;
+ }
return ret;
}
@@ -107,6 +101,7 @@ static int __init ip6table_filter_init(void)
static void __exit ip6table_filter_fini(void)
{
unregister_pernet_subsys(&ip6table_filter_net_ops);
+ xt_unregister_template(&packet_filter);
kfree(filter_ops);
}
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index b0524b18c4fb..8dd4cd0c47bd 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IPv6 packet mangling table, a port of the IPv4 mangle table to IPv6
*
* Copyright (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
* Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
@@ -23,24 +20,21 @@ MODULE_DESCRIPTION("ip6tables mangle table");
(1 << NF_INET_LOCAL_OUT) | \
(1 << NF_INET_POST_ROUTING))
-static int __net_init ip6table_mangle_table_init(struct net *net);
-
static const struct xt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_MANGLE,
- .table_init = ip6table_mangle_table_init,
};
static unsigned int
-ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
+ip6t_mangle_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state)
{
- unsigned int ret;
struct in6_addr saddr, daddr;
- u_int8_t hop_limit;
- u_int32_t flowlabel, mark;
+ unsigned int ret, verdict;
+ u32 flowlabel, mark;
+ u8 hop_limit;
int err;
/* save source/dest address, mark, hoplimit, flowlabel, priority, */
@@ -52,15 +46,16 @@ ip6t_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state)
/* flowlabel and prio (includes version, which shouldn't change either */
flowlabel = *((u_int32_t *)ipv6_hdr(skb));
- ret = ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle);
+ ret = ip6t_do_table(priv, skb, state);
+ verdict = ret & NF_VERDICT_MASK;
- if (ret != NF_DROP && ret != NF_STOLEN &&
+ if (verdict != NF_DROP && verdict != NF_STOLEN &&
(!ipv6_addr_equal(&ipv6_hdr(skb)->saddr, &saddr) ||
!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, &daddr) ||
skb->mark != mark ||
ipv6_hdr(skb)->hop_limit != hop_limit ||
flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) {
- err = ip6_route_me_harder(state->net, skb);
+ err = ip6_route_me_harder(state->net, state->sk, skb);
if (err < 0)
ret = NF_DROP_ERR(err);
}
@@ -74,66 +69,67 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
if (state->hook == NF_INET_LOCAL_OUT)
- return ip6t_mangle_out(skb, state);
- return ip6t_do_table(skb, state, state->net->ipv6.ip6table_mangle);
+ return ip6t_mangle_out(priv, skb, state);
+ return ip6t_do_table(priv, skb, state);
}
static struct nf_hook_ops *mangle_ops __read_mostly;
-static int __net_init ip6table_mangle_table_init(struct net *net)
+static int ip6table_mangle_table_init(struct net *net)
{
struct ip6t_replace *repl;
int ret;
- if (net->ipv6.ip6table_mangle)
- return 0;
-
repl = ip6t_alloc_initial_table(&packet_mangler);
if (repl == NULL)
return -ENOMEM;
- ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops,
- &net->ipv6.ip6table_mangle);
+ ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops);
kfree(repl);
return ret;
}
-static void __net_exit ip6table_mangle_net_exit(struct net *net)
+static void __net_exit ip6table_mangle_net_pre_exit(struct net *net)
{
- if (!net->ipv6.ip6table_mangle)
- return;
+ ip6t_unregister_table_pre_exit(net, "mangle");
+}
- ip6t_unregister_table(net, net->ipv6.ip6table_mangle, mangle_ops);
- net->ipv6.ip6table_mangle = NULL;
+static void __net_exit ip6table_mangle_net_exit(struct net *net)
+{
+ ip6t_unregister_table_exit(net, "mangle");
}
static struct pernet_operations ip6table_mangle_net_ops = {
+ .pre_exit = ip6table_mangle_net_pre_exit,
.exit = ip6table_mangle_net_exit,
};
static int __init ip6table_mangle_init(void)
{
- int ret;
+ int ret = xt_register_template(&packet_mangler,
+ ip6table_mangle_table_init);
+
+ if (ret < 0)
+ return ret;
mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook);
- if (IS_ERR(mangle_ops))
+ if (IS_ERR(mangle_ops)) {
+ xt_unregister_template(&packet_mangler);
return PTR_ERR(mangle_ops);
+ }
ret = register_pernet_subsys(&ip6table_mangle_net_ops);
if (ret < 0) {
+ xt_unregister_template(&packet_mangler);
kfree(mangle_ops);
return ret;
}
- ret = ip6table_mangle_table_init(&init_net);
- if (ret) {
- unregister_pernet_subsys(&ip6table_mangle_net_ops);
- kfree(mangle_ops);
- }
return ret;
}
static void __exit ip6table_mangle_fini(void)
{
unregister_pernet_subsys(&ip6table_mangle_net_ops);
+ xt_unregister_template(&packet_mangler);
kfree(mangle_ops);
}
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index 67ba70ab9f5c..e119d4f090cc 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Based on Rusty Russell's IPv4 NAT code. Development of IPv6 NAT
* funded by Astaro.
*/
@@ -17,10 +14,12 @@
#include <net/ipv6.h>
#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-static int __net_init ip6table_nat_table_init(struct net *net);
+struct ip6table_nat_pernet {
+ struct nf_hook_ops *nf_nat_ops;
+};
+
+static unsigned int ip6table_nat_net_id __read_mostly;
static const struct xt_table nf_nat_ipv6_table = {
.name = "nat",
@@ -30,37 +29,29 @@ static const struct xt_table nf_nat_ipv6_table = {
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
- .table_init = ip6table_nat_table_init,
};
-static unsigned int ip6table_nat_do_chain(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip6t_do_table(skb, state, state->net->ipv6.ip6table_nat);
-}
-
static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
{
- .hook = ip6table_nat_do_chain,
+ .hook = ip6t_do_table,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_PRE_ROUTING,
.priority = NF_IP6_PRI_NAT_DST,
},
{
- .hook = ip6table_nat_do_chain,
+ .hook = ip6t_do_table,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_POST_ROUTING,
.priority = NF_IP6_PRI_NAT_SRC,
},
{
- .hook = ip6table_nat_do_chain,
+ .hook = ip6t_do_table,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_OUT,
.priority = NF_IP6_PRI_NAT_DST,
},
{
- .hook = ip6table_nat_do_chain,
+ .hook = ip6t_do_table,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_LOCAL_IN,
.priority = NF_IP6_PRI_NAT_SRC,
@@ -69,84 +60,113 @@ static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
static int ip6t_nat_register_lookups(struct net *net)
{
+ struct ip6table_nat_pernet *xt_nat_net;
+ struct nf_hook_ops *ops;
+ struct xt_table *table;
int i, ret;
+ table = xt_find_table(net, NFPROTO_IPV6, "nat");
+ if (WARN_ON_ONCE(!table))
+ return -ENOENT;
+
+ xt_nat_net = net_generic(net, ip6table_nat_net_id);
+ ops = kmemdup(nf_nat_ipv6_ops, sizeof(nf_nat_ipv6_ops), GFP_KERNEL);
+ if (!ops)
+ return -ENOMEM;
+
for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++) {
- ret = nf_nat_l3proto_ipv6_register_fn(net, &nf_nat_ipv6_ops[i]);
+ ops[i].priv = table;
+ ret = nf_nat_ipv6_register_fn(net, &ops[i]);
if (ret) {
while (i)
- nf_nat_l3proto_ipv6_unregister_fn(net, &nf_nat_ipv6_ops[--i]);
+ nf_nat_ipv6_unregister_fn(net, &ops[--i]);
+ kfree(ops);
return ret;
}
}
+ xt_nat_net->nf_nat_ops = ops;
return 0;
}
static void ip6t_nat_unregister_lookups(struct net *net)
{
+ struct ip6table_nat_pernet *xt_nat_net = net_generic(net, ip6table_nat_net_id);
+ struct nf_hook_ops *ops = xt_nat_net->nf_nat_ops;
int i;
+ if (!ops)
+ return;
+
for (i = 0; i < ARRAY_SIZE(nf_nat_ipv6_ops); i++)
- nf_nat_l3proto_ipv6_unregister_fn(net, &nf_nat_ipv6_ops[i]);
+ nf_nat_ipv6_unregister_fn(net, &ops[i]);
+
+ kfree(ops);
}
-static int __net_init ip6table_nat_table_init(struct net *net)
+static int ip6table_nat_table_init(struct net *net)
{
struct ip6t_replace *repl;
int ret;
- if (net->ipv6.ip6table_nat)
- return 0;
-
repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table);
if (repl == NULL)
return -ENOMEM;
ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl,
- NULL, &net->ipv6.ip6table_nat);
+ NULL);
if (ret < 0) {
kfree(repl);
return ret;
}
ret = ip6t_nat_register_lookups(net);
- if (ret < 0) {
- ip6t_unregister_table(net, net->ipv6.ip6table_nat, NULL);
- net->ipv6.ip6table_nat = NULL;
- }
+ if (ret < 0)
+ ip6t_unregister_table_exit(net, "nat");
+
kfree(repl);
return ret;
}
-static void __net_exit ip6table_nat_net_exit(struct net *net)
+static void __net_exit ip6table_nat_net_pre_exit(struct net *net)
{
- if (!net->ipv6.ip6table_nat)
- return;
ip6t_nat_unregister_lookups(net);
- ip6t_unregister_table(net, net->ipv6.ip6table_nat, NULL);
- net->ipv6.ip6table_nat = NULL;
+}
+
+static void __net_exit ip6table_nat_net_exit(struct net *net)
+{
+ ip6t_unregister_table_exit(net, "nat");
}
static struct pernet_operations ip6table_nat_net_ops = {
+ .pre_exit = ip6table_nat_net_pre_exit,
.exit = ip6table_nat_net_exit,
+ .id = &ip6table_nat_net_id,
+ .size = sizeof(struct ip6table_nat_pernet),
};
static int __init ip6table_nat_init(void)
{
- int ret = register_pernet_subsys(&ip6table_nat_net_ops);
+ int ret;
- if (ret)
+ /* net->gen->ptr[ip6table_nat_net_id] must be allocated
+ * before calling ip6t_nat_register_lookups().
+ */
+ ret = register_pernet_subsys(&ip6table_nat_net_ops);
+ if (ret < 0)
return ret;
- ret = ip6table_nat_table_init(&init_net);
+ ret = xt_register_template(&nf_nat_ipv6_table,
+ ip6table_nat_table_init);
if (ret)
unregister_pernet_subsys(&ip6table_nat_net_ops);
+
return ret;
}
static void __exit ip6table_nat_exit(void)
{
+ xt_unregister_template(&nf_nat_ipv6_table);
unregister_pernet_subsys(&ip6table_nat_net_ops);
}
@@ -154,3 +174,4 @@ module_init(ip6table_nat_init);
module_exit(ip6table_nat_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Ip6tables legacy nat table");
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 710fa0806c37..fc9f6754028f 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -1,7 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IPv6 raw table, a port of the IPv4 raw table to IPv6
*
- * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@netfilter.org>
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -10,8 +11,6 @@
#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
-static int __net_init ip6table_raw_table_init(struct net *net);
-
static bool raw_before_defrag __read_mostly;
MODULE_PARM_DESC(raw_before_defrag, "Enable raw table before defrag");
module_param(raw_before_defrag, bool, 0000);
@@ -22,7 +21,6 @@ static const struct xt_table packet_raw = {
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_RAW,
- .table_init = ip6table_raw_table_init,
};
static const struct xt_table packet_raw_before_defrag = {
@@ -31,20 +29,11 @@ static const struct xt_table packet_raw_before_defrag = {
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_RAW_BEFORE_DEFRAG,
- .table_init = ip6table_raw_table_init,
};
-/* The work comes in here from netfilter.c. */
-static unsigned int
-ip6table_raw_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip6t_do_table(skb, state, state->net->ipv6.ip6table_raw);
-}
-
static struct nf_hook_ops *rawtable_ops __read_mostly;
-static int __net_init ip6table_raw_table_init(struct net *net)
+static int ip6table_raw_table_init(struct net *net)
{
struct ip6t_replace *repl;
const struct xt_table *table = &packet_raw;
@@ -53,66 +42,68 @@ static int __net_init ip6table_raw_table_init(struct net *net)
if (raw_before_defrag)
table = &packet_raw_before_defrag;
- if (net->ipv6.ip6table_raw)
- return 0;
-
repl = ip6t_alloc_initial_table(table);
if (repl == NULL)
return -ENOMEM;
- ret = ip6t_register_table(net, table, repl, rawtable_ops,
- &net->ipv6.ip6table_raw);
+ ret = ip6t_register_table(net, table, repl, rawtable_ops);
kfree(repl);
return ret;
}
+static void __net_exit ip6table_raw_net_pre_exit(struct net *net)
+{
+ ip6t_unregister_table_pre_exit(net, "raw");
+}
+
static void __net_exit ip6table_raw_net_exit(struct net *net)
{
- if (!net->ipv6.ip6table_raw)
- return;
- ip6t_unregister_table(net, net->ipv6.ip6table_raw, rawtable_ops);
- net->ipv6.ip6table_raw = NULL;
+ ip6t_unregister_table_exit(net, "raw");
}
static struct pernet_operations ip6table_raw_net_ops = {
+ .pre_exit = ip6table_raw_net_pre_exit,
.exit = ip6table_raw_net_exit,
};
static int __init ip6table_raw_init(void)
{
- int ret;
const struct xt_table *table = &packet_raw;
+ int ret;
if (raw_before_defrag) {
table = &packet_raw_before_defrag;
-
pr_info("Enabling raw table before defrag\n");
}
+ ret = xt_register_template(table, ip6table_raw_table_init);
+ if (ret < 0)
+ return ret;
+
/* Register hooks */
- rawtable_ops = xt_hook_ops_alloc(table, ip6table_raw_hook);
- if (IS_ERR(rawtable_ops))
+ rawtable_ops = xt_hook_ops_alloc(table, ip6t_do_table);
+ if (IS_ERR(rawtable_ops)) {
+ xt_unregister_template(table);
return PTR_ERR(rawtable_ops);
+ }
ret = register_pernet_subsys(&ip6table_raw_net_ops);
if (ret < 0) {
kfree(rawtable_ops);
+ xt_unregister_template(table);
return ret;
}
- ret = ip6table_raw_table_init(&init_net);
- if (ret) {
- unregister_pernet_subsys(&ip6table_raw_net_ops);
- kfree(rawtable_ops);
- }
return ret;
}
static void __exit ip6table_raw_fini(void)
{
unregister_pernet_subsys(&ip6table_raw_net_ops);
+ xt_unregister_template(&packet_raw);
kfree(rawtable_ops);
}
module_init(ip6table_raw_init);
module_exit(ip6table_raw_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Ip6tables legacy raw table");
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index cf26ccb04056..4df14a9bae78 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* "security" table for IPv6
*
@@ -10,10 +11,6 @@
* Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
* Copyright (C) 2000-2004 Netfilter Core Team <coreteam <at> netfilter.org>
* Copyright (C) 2008 Red Hat, Inc., James Morris <jmorris <at> redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/netfilter_ipv6/ip6_tables.h>
@@ -27,80 +24,72 @@ MODULE_DESCRIPTION("ip6tables security table, for MAC rules");
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT)
-static int __net_init ip6table_security_table_init(struct net *net);
-
static const struct xt_table security_table = {
.name = "security",
.valid_hooks = SECURITY_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_SECURITY,
- .table_init = ip6table_security_table_init,
};
-static unsigned int
-ip6table_security_hook(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return ip6t_do_table(skb, state, state->net->ipv6.ip6table_security);
-}
-
static struct nf_hook_ops *sectbl_ops __read_mostly;
-static int __net_init ip6table_security_table_init(struct net *net)
+static int ip6table_security_table_init(struct net *net)
{
struct ip6t_replace *repl;
int ret;
- if (net->ipv6.ip6table_security)
- return 0;
-
repl = ip6t_alloc_initial_table(&security_table);
if (repl == NULL)
return -ENOMEM;
- ret = ip6t_register_table(net, &security_table, repl, sectbl_ops,
- &net->ipv6.ip6table_security);
+ ret = ip6t_register_table(net, &security_table, repl, sectbl_ops);
kfree(repl);
return ret;
}
+static void __net_exit ip6table_security_net_pre_exit(struct net *net)
+{
+ ip6t_unregister_table_pre_exit(net, "security");
+}
+
static void __net_exit ip6table_security_net_exit(struct net *net)
{
- if (!net->ipv6.ip6table_security)
- return;
- ip6t_unregister_table(net, net->ipv6.ip6table_security, sectbl_ops);
- net->ipv6.ip6table_security = NULL;
+ ip6t_unregister_table_exit(net, "security");
}
static struct pernet_operations ip6table_security_net_ops = {
+ .pre_exit = ip6table_security_net_pre_exit,
.exit = ip6table_security_net_exit,
};
static int __init ip6table_security_init(void)
{
- int ret;
+ int ret = xt_register_template(&security_table,
+ ip6table_security_table_init);
+
+ if (ret < 0)
+ return ret;
- sectbl_ops = xt_hook_ops_alloc(&security_table, ip6table_security_hook);
- if (IS_ERR(sectbl_ops))
+ sectbl_ops = xt_hook_ops_alloc(&security_table, ip6t_do_table);
+ if (IS_ERR(sectbl_ops)) {
+ xt_unregister_template(&security_table);
return PTR_ERR(sectbl_ops);
+ }
ret = register_pernet_subsys(&ip6table_security_net_ops);
if (ret < 0) {
kfree(sectbl_ops);
+ xt_unregister_template(&security_table);
return ret;
}
- ret = ip6table_security_table_init(&init_net);
- if (ret) {
- unregister_pernet_subsys(&ip6table_security_net_ops);
- kfree(sectbl_ops);
- }
return ret;
}
static void __exit ip6table_security_fini(void)
{
unregister_pernet_subsys(&ip6table_security_net_ops);
+ xt_unregister_template(&security_table);
kfree(sectbl_ops);
}
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 8f68a518d9db..64ab23ff559b 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 fragment reassembly for connection tracking
*
@@ -7,11 +8,6 @@
* Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
*
* Based on: net/ipv6/reassembly.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) "IPv6-nf: " fmt
@@ -19,28 +15,13 @@
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/string.h>
-#include <linux/socket.h>
-#include <linux/sockios.h>
-#include <linux/jiffies.h>
#include <linux/net.h>
-#include <linux/list.h>
#include <linux/netdevice.h>
-#include <linux/in6.h>
#include <linux/ipv6.h>
-#include <linux/icmpv6.h>
-#include <linux/random.h>
#include <linux/slab.h>
-#include <net/sock.h>
-#include <net/snmp.h>
#include <net/ipv6_frag.h>
-#include <net/protocol.h>
-#include <net/transp_v6.h>
-#include <net/rawv6.h>
-#include <net/ndisc.h>
-#include <net/addrconf.h>
-#include <net/inet_ecn.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
#include <linux/sysctl.h>
#include <linux/netfilter.h>
@@ -48,42 +29,44 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/netns/generic.h>
static const char nf_frags_cache_name[] = "nf-frags";
+static unsigned int nf_frag_pernet_id __read_mostly;
static struct inet_frags nf_frags;
+static struct nft_ct_frag6_pernet *nf_frag_pernet(struct net *net)
+{
+ return net_generic(net, nf_frag_pernet_id);
+}
+
#ifdef CONFIG_SYSCTL
static struct ctl_table nf_ct_frag6_sysctl_table[] = {
{
.procname = "nf_conntrack_frag6_timeout",
- .data = &init_net.nf_frag.frags.timeout,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
{
.procname = "nf_conntrack_frag6_low_thresh",
- .data = &init_net.nf_frag.frags.low_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra2 = &init_net.nf_frag.frags.high_thresh
},
{
.procname = "nf_conntrack_frag6_high_thresh",
- .data = &init_net.nf_frag.frags.high_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &init_net.nf_frag.frags.low_thresh
},
- { }
};
static int nf_ct_frag6_sysctl_register(struct net *net)
{
+ struct nft_ct_frag6_pernet *nf_frag;
struct ctl_table *table;
struct ctl_table_header *hdr;
@@ -93,20 +76,22 @@ static int nf_ct_frag6_sysctl_register(struct net *net)
GFP_KERNEL);
if (table == NULL)
goto err_alloc;
-
- table[0].data = &net->nf_frag.frags.timeout;
- table[1].data = &net->nf_frag.frags.low_thresh;
- table[1].extra2 = &net->nf_frag.frags.high_thresh;
- table[2].data = &net->nf_frag.frags.high_thresh;
- table[2].extra1 = &net->nf_frag.frags.low_thresh;
- table[2].extra2 = &init_net.nf_frag.frags.high_thresh;
}
- hdr = register_net_sysctl(net, "net/netfilter", table);
+ nf_frag = nf_frag_pernet(net);
+
+ table[0].data = &nf_frag->fqdir->timeout;
+ table[1].data = &nf_frag->fqdir->low_thresh;
+ table[1].extra2 = &nf_frag->fqdir->high_thresh;
+ table[2].data = &nf_frag->fqdir->high_thresh;
+ table[2].extra1 = &nf_frag->fqdir->low_thresh;
+
+ hdr = register_net_sysctl_sz(net, "net/netfilter", table,
+ ARRAY_SIZE(nf_ct_frag6_sysctl_table));
if (hdr == NULL)
goto err_reg;
- net->nf_frag_frags_hdr = hdr;
+ nf_frag->nf_frag_frags_hdr = hdr;
return 0;
err_reg:
@@ -118,10 +103,11 @@ err_alloc:
static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
{
- struct ctl_table *table;
+ struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net);
+ const struct ctl_table *table;
- table = net->nf_frag_frags_hdr->ctl_table_arg;
- unregister_net_sysctl_table(net->nf_frag_frags_hdr);
+ table = nf_frag->nf_frag_frags_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(nf_frag->nf_frag_frags_hdr);
if (!net_eq(net, &init_net))
kfree(table);
}
@@ -136,6 +122,10 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
}
#endif
+static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs);
+
static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
{
return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
@@ -143,20 +133,19 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
static void nf_ct_frag6_expire(struct timer_list *t)
{
- struct inet_frag_queue *frag = from_timer(frag, t, timer);
+ struct inet_frag_queue *frag = timer_container_of(frag, t, timer);
struct frag_queue *fq;
- struct net *net;
fq = container_of(frag, struct frag_queue, q);
- net = container_of(fq->q.net, struct net, nf_frag.frags);
- ip6frag_expire_frag_queue(net, fq);
+ ip6frag_expire_frag_queue(fq->q.fqdir->net, fq);
}
/* Creation primitives. */
static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
const struct ipv6hdr *hdr, int iif)
{
+ struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net);
struct frag_v6_compare_key key = {
.id = id,
.saddr = hdr->saddr,
@@ -166,7 +155,11 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
};
struct inet_frag_queue *q;
- q = inet_frag_find(&net->nf_frag.frags, &key);
+ if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST |
+ IPV6_ADDR_LINKLOCAL)))
+ key.iif = 0;
+
+ q = inet_frag_find(nf_frag->fqdir, &key);
if (!q)
return NULL;
@@ -175,11 +168,13 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
- const struct frag_hdr *fhdr, int nhoff)
+ const struct frag_hdr *fhdr, int nhoff,
+ int *refs)
{
- struct sk_buff *prev, *next;
unsigned int payload_len;
- int offset, end;
+ struct net_device *dev;
+ struct sk_buff *prev;
+ int offset, end, err;
u8 ecn;
if (fq->q.flags & INET_FRAG_COMPLETE) {
@@ -228,7 +223,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
* this case. -DaveM
*/
pr_debug("end of fragment not rounded to 8 bytes.\n");
- inet_frag_kill(&fq->q);
+ inet_frag_kill(&fq->q, refs);
return -EPROTO;
}
if (end > fq->q.len) {
@@ -254,62 +249,32 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
goto err;
}
- /* Find out which fragments are in front and at the back of us
- * in the chain of fragments so far. We must know where to put
- * this fragment, right?
- */
- prev = fq->q.fragments_tail;
- if (!prev || prev->ip_defrag_offset < offset) {
- next = NULL;
- goto found;
- }
- prev = NULL;
- for (next = fq->q.fragments; next != NULL; next = next->next) {
- if (next->ip_defrag_offset >= offset)
- break; /* bingo! */
- prev = next;
- }
-
-found:
- /* RFC5722, Section 4:
- * When reassembling an IPv6 datagram, if
- * one or more its constituent fragments is determined to be an
- * overlapping fragment, the entire datagram (and any constituent
- * fragments, including those not yet received) MUST be silently
- * discarded.
- */
-
- /* Check for overlap with preceding fragment. */
- if (prev &&
- (prev->ip_defrag_offset + prev->len) > offset)
- goto discard_fq;
-
- /* Look for overlap with succeeding segment. */
- if (next && next->ip_defrag_offset < end)
- goto discard_fq;
-
- /* Note : skb->ip_defrag_offset and skb->dev share the same location */
- if (skb->dev)
- fq->iif = skb->dev->ifindex;
+ /* Note : skb->rbnode and skb->dev share the same location. */
+ dev = skb->dev;
/* Makes sure compiler wont do silly aliasing games */
barrier();
- skb->ip_defrag_offset = offset;
- /* Insert this fragment in the chain of fragments. */
- skb->next = next;
- if (!next)
- fq->q.fragments_tail = skb;
- if (prev)
- prev->next = skb;
- else
- fq->q.fragments = skb;
+ prev = fq->q.fragments_tail;
+ err = inet_frag_queue_insert(&fq->q, skb, offset, end);
+ if (err) {
+ if (err == IPFRAG_DUP) {
+ /* No error for duplicates, pretend they got queued. */
+ kfree_skb_reason(skb, SKB_DROP_REASON_DUP_FRAG);
+ return -EINPROGRESS;
+ }
+ goto insert_error;
+ }
+
+ if (dev)
+ fq->iif = dev->ifindex;
fq->q.stamp = skb->tstamp;
+ fq->q.tstamp_type = skb->tstamp_type;
fq->q.meat += skb->len;
fq->ecn |= ecn;
if (payload_len > fq->q.max_size)
fq->q.max_size = payload_len;
- add_frag_mem_limit(fq->q.net, skb->truesize);
+ add_frag_mem_limit(fq->q.fqdir, skb->truesize);
/* The first fragment.
* nhoffset is obtained from the first fragment, of course.
@@ -319,11 +284,28 @@ found:
fq->q.flags |= INET_FRAG_FIRST_IN;
}
- return 0;
+ if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+ fq->q.meat == fq->q.len) {
+ unsigned long orefdst = skb->_skb_refdst;
+
+ skb->_skb_refdst = 0UL;
+ err = nf_ct_frag6_reasm(fq, skb, prev, dev, refs);
+ skb->_skb_refdst = orefdst;
+
+ /* After queue has assumed skb ownership, only 0 or
+ * -EINPROGRESS must be returned.
+ */
+ return err ? -EINPROGRESS : 0;
+ }
-discard_fq:
- inet_frag_kill(&fq->q);
+ skb_dst_drop(skb);
+ skb_orphan(skb);
+ return -EINPROGRESS;
+
+insert_error:
+ inet_frag_kill(&fq->q, refs);
err:
+ skb_dst_drop(skb);
return -EINVAL;
}
@@ -333,141 +315,68 @@ err:
* It is called with locked fq, and caller must check that
* queue is eligible for reassembly i.e. it is not COMPLETE,
* the last and the first frames arrived and all the bits are here.
- *
- * returns true if *prev skb has been transformed into the reassembled
- * skb, false otherwise.
*/
-static bool
-nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev)
+static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs)
{
- struct sk_buff *fp, *head = fq->q.fragments;
- int payload_len;
+ void *reasm_data;
+ int payload_len;
u8 ecn;
- inet_frag_kill(&fq->q);
-
- WARN_ON(head == NULL);
- WARN_ON(head->ip_defrag_offset != 0);
+ inet_frag_kill(&fq->q, refs);
ecn = ip_frag_ecn_table[fq->ecn];
if (unlikely(ecn == 0xff))
- return false;
+ goto err;
- /* Unfragmented part is taken from the first segment. */
- payload_len = ((head->data - skb_network_header(head)) -
+ reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail);
+ if (!reasm_data)
+ goto err;
+
+ payload_len = -skb_network_offset(skb) -
sizeof(struct ipv6hdr) + fq->q.len -
- sizeof(struct frag_hdr));
+ sizeof(struct frag_hdr);
if (payload_len > IPV6_MAXPLEN) {
net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n",
payload_len);
- return false;
- }
-
- /* Head of list must not be cloned. */
- if (skb_unclone(head, GFP_ATOMIC))
- return false;
-
- /* If the first fragment is fragmented itself, we split
- * it to two chunks: the first with data and paged part
- * and the second, holding only fragments. */
- if (skb_has_frag_list(head)) {
- struct sk_buff *clone;
- int i, plen = 0;
-
- clone = alloc_skb(0, GFP_ATOMIC);
- if (clone == NULL)
- return false;
-
- clone->next = head->next;
- head->next = clone;
- skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
- skb_frag_list_init(head);
- for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
- plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
- clone->len = clone->data_len = head->data_len - plen;
- head->data_len -= clone->len;
- head->len -= clone->len;
- clone->csum = 0;
- clone->ip_summed = head->ip_summed;
-
- add_frag_mem_limit(fq->q.net, clone->truesize);
- }
-
- /* morph head into last received skb: prev.
- *
- * This allows callers of ipv6 conntrack defrag to continue
- * to use the last skb(frag) passed into the reasm engine.
- * The last skb frag 'silently' turns into the full reassembled skb.
- *
- * Since prev is also part of q->fragments we have to clone it first.
- */
- if (head != prev) {
- struct sk_buff *iter;
-
- fp = skb_clone(prev, GFP_ATOMIC);
- if (!fp)
- return false;
-
- fp->next = prev->next;
-
- iter = head;
- while (iter) {
- if (iter->next == prev) {
- iter->next = fp;
- break;
- }
- iter = iter->next;
- }
-
- skb_morph(prev, head);
- prev->next = head->next;
- consume_skb(head);
- head = prev;
+ goto err;
}
/* We have to remove fragment header from datagram and to relocate
* header in order to calculate ICV correctly. */
- skb_network_header(head)[fq->nhoffset] = skb_transport_header(head)[0];
- memmove(head->head + sizeof(struct frag_hdr), head->head,
- (head->data - head->head) - sizeof(struct frag_hdr));
- head->mac_header += sizeof(struct frag_hdr);
- head->network_header += sizeof(struct frag_hdr);
-
- skb_shinfo(head)->frag_list = head->next;
- skb_reset_transport_header(head);
- skb_push(head, head->data - skb_network_header(head));
-
- for (fp = head->next; fp; fp = fp->next) {
- head->data_len += fp->len;
- head->len += fp->len;
- if (head->ip_summed != fp->ip_summed)
- head->ip_summed = CHECKSUM_NONE;
- else if (head->ip_summed == CHECKSUM_COMPLETE)
- head->csum = csum_add(head->csum, fp->csum);
- head->truesize += fp->truesize;
- fp->sk = NULL;
- }
- sub_frag_mem_limit(fq->q.net, head->truesize);
+ skb_network_header(skb)[fq->nhoffset] = skb_transport_header(skb)[0];
+ memmove(skb->head + sizeof(struct frag_hdr), skb->head,
+ (skb->data - skb->head) - sizeof(struct frag_hdr));
+ skb->mac_header += sizeof(struct frag_hdr);
+ skb->network_header += sizeof(struct frag_hdr);
+
+ skb_reset_transport_header(skb);
+
+ inet_frag_reasm_finish(&fq->q, skb, reasm_data, false);
- head->ignore_df = 1;
- head->next = NULL;
- head->dev = dev;
- head->tstamp = fq->q.stamp;
- ipv6_hdr(head)->payload_len = htons(payload_len);
- ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn);
- IP6CB(head)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
+ skb->ignore_df = 1;
+ skb->dev = dev;
+ ipv6_hdr(skb)->payload_len = htons(payload_len);
+ ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn);
+ IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
+ IP6CB(skb)->flags |= IP6SKB_FRAGMENTED;
/* Yes, and fold redundant checksum back. 8) */
- if (head->ip_summed == CHECKSUM_COMPLETE)
- head->csum = csum_partial(skb_network_header(head),
- skb_network_header_len(head),
- head->csum);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->csum = csum_partial(skb_network_header(skb),
+ skb_network_header_len(skb),
+ skb->csum);
- fq->q.fragments = NULL;
fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL;
+ fq->q.last_run_head = NULL;
+
+ return 0;
- return true;
+err:
+ inet_frag_kill(&fq->q, refs);
+ return -EINVAL;
}
/*
@@ -511,7 +420,7 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
BUG();
if (nexthdr == NEXTHDR_AUTH)
- hdrlen = (hdr.hdrlen+2)<<2;
+ hdrlen = ipv6_authlen(&hdr);
else
hdrlen = ipv6_optlen(&hdr);
@@ -536,11 +445,12 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
{
u16 savethdr = skb->transport_header;
- struct net_device *dev = skb->dev;
+ u8 nexthdr = NEXTHDR_FRAGMENT;
int fhoff, nhoff, ret;
struct frag_hdr *fhdr;
struct frag_queue *fq;
struct ipv6hdr *hdr;
+ int refs = 0;
u8 prevhdr;
/* Jumbo payload inhibits frag. header */
@@ -552,6 +462,14 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0)
return 0;
+ /* Discard the first fragment if it does not include all headers
+ * RFC 8200, Section 4.5
+ */
+ if (ipv6frag_thdr_truncated(skb, fhoff, &nexthdr)) {
+ pr_debug("Drop incomplete fragment\n");
+ return 0;
+ }
+
if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr)))
return -ENOMEM;
@@ -559,74 +477,70 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
hdr = ipv6_hdr(skb);
fhdr = (struct frag_hdr *)skb_transport_header(skb);
- if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
- fhdr->frag_off & htons(IP6_MF))
- return -EINVAL;
-
- skb_orphan(skb);
+ rcu_read_lock();
fq = fq_find(net, fhdr->identification, user, hdr,
skb->dev ? skb->dev->ifindex : 0);
if (fq == NULL) {
+ rcu_read_unlock();
pr_debug("Can't find and can't create new queue\n");
return -ENOMEM;
}
spin_lock_bh(&fq->q.lock);
- ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff);
- if (ret < 0) {
- if (ret == -EPROTO) {
- skb->transport_header = savethdr;
- ret = 0;
- }
- goto out_unlock;
- }
-
- /* after queue has assumed skb ownership, only 0 or -EINPROGRESS
- * must be returned.
- */
- ret = -EINPROGRESS;
- if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
- fq->q.meat == fq->q.len &&
- nf_ct_frag6_reasm(fq, skb, dev))
+ ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff, &refs);
+ if (ret == -EPROTO) {
+ skb->transport_header = savethdr;
ret = 0;
- else
- skb_dst_drop(skb);
+ }
-out_unlock:
spin_unlock_bh(&fq->q.lock);
- inet_frag_put(&fq->q);
+ rcu_read_unlock();
+ inet_frag_putn(&fq->q, refs);
return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
static int nf_ct_net_init(struct net *net)
{
+ struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net);
int res;
- net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
- net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
- net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
- net->nf_frag.frags.f = &nf_frags;
-
- res = inet_frags_init_net(&net->nf_frag.frags);
+ res = fqdir_init(&nf_frag->fqdir, &nf_frags, net);
if (res < 0)
return res;
+
+ nf_frag->fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+ nf_frag->fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+ nf_frag->fqdir->timeout = IPV6_FRAG_TIMEOUT;
+
res = nf_ct_frag6_sysctl_register(net);
if (res < 0)
- inet_frags_exit_net(&net->nf_frag.frags);
+ fqdir_exit(nf_frag->fqdir);
return res;
}
+static void nf_ct_net_pre_exit(struct net *net)
+{
+ struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net);
+
+ fqdir_pre_exit(nf_frag->fqdir);
+}
+
static void nf_ct_net_exit(struct net *net)
{
+ struct nft_ct_frag6_pernet *nf_frag = nf_frag_pernet(net);
+
nf_ct_frags6_sysctl_unregister(net);
- inet_frags_exit_net(&net->nf_frag.frags);
+ fqdir_exit(nf_frag->fqdir);
}
static struct pernet_operations nf_ct_net_ops = {
- .init = nf_ct_net_init,
- .exit = nf_ct_net_exit,
+ .init = nf_ct_net_init,
+ .pre_exit = nf_ct_net_pre_exit,
+ .exit = nf_ct_net_exit,
+ .id = &nf_frag_pernet_id,
+ .size = sizeof(struct nft_ct_frag6_pernet),
};
static const struct rhashtable_params nfct_rhash_params = {
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index 72dd3e202375..be7817fbc024 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/types.h>
@@ -13,6 +10,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/icmp.h>
+#include <linux/rcupdate.h>
#include <linux/sysctl.h>
#include <net/ipv6_frag.h>
@@ -92,13 +90,19 @@ static const struct nf_hook_ops ipv6_defrag_ops[] = {
static void __net_exit defrag6_net_exit(struct net *net)
{
- if (net->nf.defrag_ipv6) {
+ if (net->nf.defrag_ipv6_users) {
nf_unregister_net_hooks(net, ipv6_defrag_ops,
ARRAY_SIZE(ipv6_defrag_ops));
- net->nf.defrag_ipv6 = false;
+ net->nf.defrag_ipv6_users = 0;
}
}
+static const struct nf_defrag_hook defrag_hook = {
+ .owner = THIS_MODULE,
+ .enable = nf_defrag_ipv6_enable,
+ .disable = nf_defrag_ipv6_disable,
+};
+
static struct pernet_operations defrag6_net_ops = {
.exit = defrag6_net_exit,
};
@@ -117,6 +121,9 @@ static int __init nf_defrag_init(void)
pr_err("nf_defrag_ipv6: can't register pernet ops\n");
goto cleanup_frag6;
}
+
+ rcu_assign_pointer(nf_defrag_v6_hook, &defrag_hook);
+
return ret;
cleanup_frag6:
@@ -127,6 +134,7 @@ cleanup_frag6:
static void __exit nf_defrag_fini(void)
{
+ rcu_assign_pointer(nf_defrag_v6_hook, NULL);
unregister_pernet_subsys(&defrag6_net_ops);
nf_ct_frag6_cleanup();
}
@@ -135,19 +143,21 @@ int nf_defrag_ipv6_enable(struct net *net)
{
int err = 0;
- might_sleep();
-
- if (net->nf.defrag_ipv6)
- return 0;
-
mutex_lock(&defrag6_mutex);
- if (net->nf.defrag_ipv6)
+ if (net->nf.defrag_ipv6_users == UINT_MAX) {
+ err = -EOVERFLOW;
+ goto out_unlock;
+ }
+
+ if (net->nf.defrag_ipv6_users) {
+ net->nf.defrag_ipv6_users++;
goto out_unlock;
+ }
err = nf_register_net_hooks(net, ipv6_defrag_ops,
ARRAY_SIZE(ipv6_defrag_ops));
if (err == 0)
- net->nf.defrag_ipv6 = true;
+ net->nf.defrag_ipv6_users = 1;
out_unlock:
mutex_unlock(&defrag6_mutex);
@@ -155,7 +165,21 @@ int nf_defrag_ipv6_enable(struct net *net)
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable);
+void nf_defrag_ipv6_disable(struct net *net)
+{
+ mutex_lock(&defrag6_mutex);
+ if (net->nf.defrag_ipv6_users) {
+ net->nf.defrag_ipv6_users--;
+ if (net->nf.defrag_ipv6_users == 0)
+ nf_unregister_net_hooks(net, ipv6_defrag_ops,
+ ARRAY_SIZE(ipv6_defrag_ops));
+ }
+ mutex_unlock(&defrag6_mutex);
+}
+EXPORT_SYMBOL_GPL(nf_defrag_ipv6_disable);
+
module_init(nf_defrag_init);
module_exit(nf_defrag_fini);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IPv6 defragmentation support");
diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c
index 4a7ddeddbaab..6da3102b7c1b 100644
--- a/net/ipv6/netfilter/nf_dup_ipv6.c
+++ b/net/ipv6/netfilter/nf_dup_ipv6.c
@@ -1,12 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* (C) 2007 by Sebastian Claßen <sebastian.classen@freenet.ag>
* (C) 2007-2010 by Jan Engelhardt <jengelh@medozas.de>
*
* Extracted from xt_TEE.c
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 or later, as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/percpu.h>
@@ -41,7 +38,7 @@ static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb,
}
skb_dst_drop(skb);
skb_dst_set(skb, dst);
- skb->dev = dst->dev;
+ skb->dev = dst_dev(dst);
skb->protocol = htons(ETH_P_IPV6);
return true;
@@ -50,14 +47,15 @@ static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb,
void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
const struct in6_addr *gw, int oif)
{
- if (this_cpu_read(nf_skb_duplicated))
- return;
+ local_bh_disable();
+ if (current->in_nf_duplicate)
+ goto out;
skb = pskb_copy(skb, GFP_ATOMIC);
if (skb == NULL)
- return;
+ goto out;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- nf_reset(skb);
+ nf_reset_ct(skb);
nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
#endif
if (hooknum == NF_INET_PRE_ROUTING ||
@@ -66,12 +64,14 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum,
--iph->hop_limit;
}
if (nf_dup_ipv6_route(net, skb, gw, oif)) {
- __this_cpu_write(nf_skb_duplicated, true);
+ current->in_nf_duplicate = true;
ip6_local_out(net, skb->sk, skb);
- __this_cpu_write(nf_skb_duplicated, false);
+ current->in_nf_duplicate = false;
} else {
kfree_skb(skb);
}
+out:
+ local_bh_enable();
}
EXPORT_SYMBOL_GPL(nf_dup_ipv6);
diff --git a/net/ipv6/netfilter/nf_flow_table_ipv6.c b/net/ipv6/netfilter/nf_flow_table_ipv6.c
deleted file mode 100644
index c511d206bf9b..000000000000
--- a/net/ipv6/netfilter/nf_flow_table_ipv6.c
+++ /dev/null
@@ -1,34 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netfilter.h>
-#include <linux/rhashtable.h>
-#include <net/netfilter/nf_flow_table.h>
-#include <net/netfilter/nf_tables.h>
-
-static struct nf_flowtable_type flowtable_ipv6 = {
- .family = NFPROTO_IPV6,
- .init = nf_flow_table_init,
- .free = nf_flow_table_free,
- .hook = nf_flow_offload_ipv6_hook,
- .owner = THIS_MODULE,
-};
-
-static int __init nf_flow_ipv6_module_init(void)
-{
- nft_register_flowtable_type(&flowtable_ipv6);
-
- return 0;
-}
-
-static void __exit nf_flow_ipv6_module_exit(void)
-{
- nft_unregister_flowtable_type(&flowtable_ipv6);
-}
-
-module_init(nf_flow_ipv6_module_init);
-module_exit(nf_flow_ipv6_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_ALIAS_NF_FLOWTABLE(AF_INET6);
diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c
deleted file mode 100644
index c6bf580d0f33..000000000000
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ /dev/null
@@ -1,428 +0,0 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/skbuff.h>
-#include <linux/if_arp.h>
-#include <linux/ip.h>
-#include <net/ipv6.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/tcp.h>
-#include <net/route.h>
-
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-#include <linux/netfilter/xt_LOG.h>
-#include <net/netfilter/nf_log.h>
-
-static const struct nf_loginfo default_loginfo = {
- .type = NF_LOG_TYPE_LOG,
- .u = {
- .log = {
- .level = LOGLEVEL_NOTICE,
- .logflags = NF_LOG_DEFAULT_MASK,
- },
- },
-};
-
-/* One level of recursion won't kill us */
-static void dump_ipv6_packet(struct net *net, struct nf_log_buf *m,
- const struct nf_loginfo *info,
- const struct sk_buff *skb, unsigned int ip6hoff,
- int recurse)
-{
- u_int8_t currenthdr;
- int fragment;
- struct ipv6hdr _ip6h;
- const struct ipv6hdr *ih;
- unsigned int ptr;
- unsigned int hdrlen = 0;
- unsigned int logflags;
-
- if (info->type == NF_LOG_TYPE_LOG)
- logflags = info->u.log.logflags;
- else
- logflags = NF_LOG_DEFAULT_MASK;
-
- ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
- if (ih == NULL) {
- nf_log_buf_add(m, "TRUNCATED");
- return;
- }
-
- /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
- nf_log_buf_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
-
- /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
- nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
- ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
- (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
- ih->hop_limit,
- (ntohl(*(__be32 *)ih) & 0x000fffff));
-
- fragment = 0;
- ptr = ip6hoff + sizeof(struct ipv6hdr);
- currenthdr = ih->nexthdr;
- while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) {
- struct ipv6_opt_hdr _hdr;
- const struct ipv6_opt_hdr *hp;
-
- hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
- if (hp == NULL) {
- nf_log_buf_add(m, "TRUNCATED");
- return;
- }
-
- /* Max length: 48 "OPT (...) " */
- if (logflags & NF_LOG_IPOPT)
- nf_log_buf_add(m, "OPT ( ");
-
- switch (currenthdr) {
- case IPPROTO_FRAGMENT: {
- struct frag_hdr _fhdr;
- const struct frag_hdr *fh;
-
- nf_log_buf_add(m, "FRAG:");
- fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
- &_fhdr);
- if (fh == NULL) {
- nf_log_buf_add(m, "TRUNCATED ");
- return;
- }
-
- /* Max length: 6 "65535 " */
- nf_log_buf_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8);
-
- /* Max length: 11 "INCOMPLETE " */
- if (fh->frag_off & htons(0x0001))
- nf_log_buf_add(m, "INCOMPLETE ");
-
- nf_log_buf_add(m, "ID:%08x ",
- ntohl(fh->identification));
-
- if (ntohs(fh->frag_off) & 0xFFF8)
- fragment = 1;
-
- hdrlen = 8;
-
- break;
- }
- case IPPROTO_DSTOPTS:
- case IPPROTO_ROUTING:
- case IPPROTO_HOPOPTS:
- if (fragment) {
- if (logflags & NF_LOG_IPOPT)
- nf_log_buf_add(m, ")");
- return;
- }
- hdrlen = ipv6_optlen(hp);
- break;
- /* Max Length */
- case IPPROTO_AH:
- if (logflags & NF_LOG_IPOPT) {
- struct ip_auth_hdr _ahdr;
- const struct ip_auth_hdr *ah;
-
- /* Max length: 3 "AH " */
- nf_log_buf_add(m, "AH ");
-
- if (fragment) {
- nf_log_buf_add(m, ")");
- return;
- }
-
- ah = skb_header_pointer(skb, ptr, sizeof(_ahdr),
- &_ahdr);
- if (ah == NULL) {
- /*
- * Max length: 26 "INCOMPLETE [65535
- * bytes] )"
- */
- nf_log_buf_add(m, "INCOMPLETE [%u bytes] )",
- skb->len - ptr);
- return;
- }
-
- /* Length: 15 "SPI=0xF1234567 */
- nf_log_buf_add(m, "SPI=0x%x ", ntohl(ah->spi));
-
- }
-
- hdrlen = (hp->hdrlen+2)<<2;
- break;
- case IPPROTO_ESP:
- if (logflags & NF_LOG_IPOPT) {
- struct ip_esp_hdr _esph;
- const struct ip_esp_hdr *eh;
-
- /* Max length: 4 "ESP " */
- nf_log_buf_add(m, "ESP ");
-
- if (fragment) {
- nf_log_buf_add(m, ")");
- return;
- }
-
- /*
- * Max length: 26 "INCOMPLETE [65535 bytes] )"
- */
- eh = skb_header_pointer(skb, ptr, sizeof(_esph),
- &_esph);
- if (eh == NULL) {
- nf_log_buf_add(m, "INCOMPLETE [%u bytes] )",
- skb->len - ptr);
- return;
- }
-
- /* Length: 16 "SPI=0xF1234567 )" */
- nf_log_buf_add(m, "SPI=0x%x )",
- ntohl(eh->spi));
- }
- return;
- default:
- /* Max length: 20 "Unknown Ext Hdr 255" */
- nf_log_buf_add(m, "Unknown Ext Hdr %u", currenthdr);
- return;
- }
- if (logflags & NF_LOG_IPOPT)
- nf_log_buf_add(m, ") ");
-
- currenthdr = hp->nexthdr;
- ptr += hdrlen;
- }
-
- switch (currenthdr) {
- case IPPROTO_TCP:
- if (nf_log_dump_tcp_header(m, skb, currenthdr, fragment,
- ptr, logflags))
- return;
- break;
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE:
- if (nf_log_dump_udp_header(m, skb, currenthdr, fragment, ptr))
- return;
- break;
- case IPPROTO_ICMPV6: {
- struct icmp6hdr _icmp6h;
- const struct icmp6hdr *ic;
-
- /* Max length: 13 "PROTO=ICMPv6 " */
- nf_log_buf_add(m, "PROTO=ICMPv6 ");
-
- if (fragment)
- break;
-
- /* Max length: 25 "INCOMPLETE [65535 bytes] " */
- ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
- if (ic == NULL) {
- nf_log_buf_add(m, "INCOMPLETE [%u bytes] ",
- skb->len - ptr);
- return;
- }
-
- /* Max length: 18 "TYPE=255 CODE=255 " */
- nf_log_buf_add(m, "TYPE=%u CODE=%u ",
- ic->icmp6_type, ic->icmp6_code);
-
- switch (ic->icmp6_type) {
- case ICMPV6_ECHO_REQUEST:
- case ICMPV6_ECHO_REPLY:
- /* Max length: 19 "ID=65535 SEQ=65535 " */
- nf_log_buf_add(m, "ID=%u SEQ=%u ",
- ntohs(ic->icmp6_identifier),
- ntohs(ic->icmp6_sequence));
- break;
- case ICMPV6_MGM_QUERY:
- case ICMPV6_MGM_REPORT:
- case ICMPV6_MGM_REDUCTION:
- break;
-
- case ICMPV6_PARAMPROB:
- /* Max length: 17 "POINTER=ffffffff " */
- nf_log_buf_add(m, "POINTER=%08x ",
- ntohl(ic->icmp6_pointer));
- /* Fall through */
- case ICMPV6_DEST_UNREACH:
- case ICMPV6_PKT_TOOBIG:
- case ICMPV6_TIME_EXCEED:
- /* Max length: 3+maxlen */
- if (recurse) {
- nf_log_buf_add(m, "[");
- dump_ipv6_packet(net, m, info, skb,
- ptr + sizeof(_icmp6h), 0);
- nf_log_buf_add(m, "] ");
- }
-
- /* Max length: 10 "MTU=65535 " */
- if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) {
- nf_log_buf_add(m, "MTU=%u ",
- ntohl(ic->icmp6_mtu));
- }
- }
- break;
- }
- /* Max length: 10 "PROTO=255 " */
- default:
- nf_log_buf_add(m, "PROTO=%u ", currenthdr);
- }
-
- /* Max length: 15 "UID=4294967295 " */
- if ((logflags & NF_LOG_UID) && recurse)
- nf_log_dump_sk_uid_gid(net, m, skb->sk);
-
- /* Max length: 16 "MARK=0xFFFFFFFF " */
- if (recurse && skb->mark)
- nf_log_buf_add(m, "MARK=0x%x ", skb->mark);
-}
-
-static void dump_ipv6_mac_header(struct nf_log_buf *m,
- const struct nf_loginfo *info,
- const struct sk_buff *skb)
-{
- struct net_device *dev = skb->dev;
- unsigned int logflags = 0;
-
- if (info->type == NF_LOG_TYPE_LOG)
- logflags = info->u.log.logflags;
-
- if (!(logflags & NF_LOG_MACDECODE))
- goto fallback;
-
- switch (dev->type) {
- case ARPHRD_ETHER:
- nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
- eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
- ntohs(eth_hdr(skb)->h_proto));
- return;
- default:
- break;
- }
-
-fallback:
- nf_log_buf_add(m, "MAC=");
- if (dev->hard_header_len &&
- skb->mac_header != skb->network_header) {
- const unsigned char *p = skb_mac_header(skb);
- unsigned int len = dev->hard_header_len;
- unsigned int i;
-
- if (dev->type == ARPHRD_SIT) {
- p -= ETH_HLEN;
-
- if (p < skb->head)
- p = NULL;
- }
-
- if (p != NULL) {
- nf_log_buf_add(m, "%02x", *p++);
- for (i = 1; i < len; i++)
- nf_log_buf_add(m, ":%02x", *p++);
- }
- nf_log_buf_add(m, " ");
-
- if (dev->type == ARPHRD_SIT) {
- const struct iphdr *iph =
- (struct iphdr *)skb_mac_header(skb);
- nf_log_buf_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr,
- &iph->daddr);
- }
- } else {
- nf_log_buf_add(m, " ");
- }
-}
-
-static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
- unsigned int hooknum, const struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- const struct nf_loginfo *loginfo,
- const char *prefix)
-{
- struct nf_log_buf *m;
-
- /* FIXME: Disabled from containers until syslog ns is supported */
- if (!net_eq(net, &init_net) && !sysctl_nf_log_all_netns)
- return;
-
- m = nf_log_buf_open();
-
- if (!loginfo)
- loginfo = &default_loginfo;
-
- nf_log_dump_packet_common(m, pf, hooknum, skb, in, out,
- loginfo, prefix);
-
- if (in != NULL)
- dump_ipv6_mac_header(m, loginfo, skb);
-
- dump_ipv6_packet(net, m, loginfo, skb, skb_network_offset(skb), 1);
-
- nf_log_buf_close(m);
-}
-
-static struct nf_logger nf_ip6_logger __read_mostly = {
- .name = "nf_log_ipv6",
- .type = NF_LOG_TYPE_LOG,
- .logfn = nf_log_ip6_packet,
- .me = THIS_MODULE,
-};
-
-static int __net_init nf_log_ipv6_net_init(struct net *net)
-{
- return nf_log_set(net, NFPROTO_IPV6, &nf_ip6_logger);
-}
-
-static void __net_exit nf_log_ipv6_net_exit(struct net *net)
-{
- nf_log_unset(net, &nf_ip6_logger);
-}
-
-static struct pernet_operations nf_log_ipv6_net_ops = {
- .init = nf_log_ipv6_net_init,
- .exit = nf_log_ipv6_net_exit,
-};
-
-static int __init nf_log_ipv6_init(void)
-{
- int ret;
-
- ret = register_pernet_subsys(&nf_log_ipv6_net_ops);
- if (ret < 0)
- return ret;
-
- ret = nf_log_register(NFPROTO_IPV6, &nf_ip6_logger);
- if (ret < 0) {
- pr_err("failed to register logger\n");
- goto err1;
- }
-
- return 0;
-
-err1:
- unregister_pernet_subsys(&nf_log_ipv6_net_ops);
- return ret;
-}
-
-static void __exit nf_log_ipv6_exit(void)
-{
- unregister_pernet_subsys(&nf_log_ipv6_net_ops);
- nf_log_unregister(&nf_ip6_logger);
-}
-
-module_init(nf_log_ipv6_init);
-module_exit(nf_log_ipv6_exit);
-
-MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
-MODULE_DESCRIPTION("Netfilter IPv6 packet logging");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_NF_LOGGER(AF_INET6, 0);
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
deleted file mode 100644
index ca6d38698b1a..000000000000
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ /dev/null
@@ -1,444 +0,0 @@
-/*
- * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of IPv6 NAT funded by Astaro.
- */
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/ipv6.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv6.h>
-#include <net/secure_seq.h>
-#include <net/checksum.h>
-#include <net/ip6_checksum.h>
-#include <net/ip6_route.h>
-#include <net/ipv6.h>
-
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-
-static const struct nf_nat_l3proto nf_nat_l3proto_ipv6;
-
-#ifdef CONFIG_XFRM
-static void nf_nat_ipv6_decode_session(struct sk_buff *skb,
- const struct nf_conn *ct,
- enum ip_conntrack_dir dir,
- unsigned long statusbit,
- struct flowi *fl)
-{
- const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple;
- struct flowi6 *fl6 = &fl->u.ip6;
-
- if (ct->status & statusbit) {
- fl6->daddr = t->dst.u3.in6;
- if (t->dst.protonum == IPPROTO_TCP ||
- t->dst.protonum == IPPROTO_UDP ||
- t->dst.protonum == IPPROTO_UDPLITE ||
- t->dst.protonum == IPPROTO_DCCP ||
- t->dst.protonum == IPPROTO_SCTP)
- fl6->fl6_dport = t->dst.u.all;
- }
-
- statusbit ^= IPS_NAT_MASK;
-
- if (ct->status & statusbit) {
- fl6->saddr = t->src.u3.in6;
- if (t->dst.protonum == IPPROTO_TCP ||
- t->dst.protonum == IPPROTO_UDP ||
- t->dst.protonum == IPPROTO_UDPLITE ||
- t->dst.protonum == IPPROTO_DCCP ||
- t->dst.protonum == IPPROTO_SCTP)
- fl6->fl6_sport = t->src.u.all;
- }
-}
-#endif
-
-static bool nf_nat_ipv6_in_range(const struct nf_conntrack_tuple *t,
- const struct nf_nat_range2 *range)
-{
- return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 &&
- ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0;
-}
-
-static u32 nf_nat_ipv6_secure_port(const struct nf_conntrack_tuple *t,
- __be16 dport)
-{
- return secure_ipv6_port_ephemeral(t->src.u3.ip6, t->dst.u3.ip6, dport);
-}
-
-static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb,
- unsigned int iphdroff,
- const struct nf_nat_l4proto *l4proto,
- const struct nf_conntrack_tuple *target,
- enum nf_nat_manip_type maniptype)
-{
- struct ipv6hdr *ipv6h;
- __be16 frag_off;
- int hdroff;
- u8 nexthdr;
-
- if (!skb_make_writable(skb, iphdroff + sizeof(*ipv6h)))
- return false;
-
- ipv6h = (void *)skb->data + iphdroff;
- nexthdr = ipv6h->nexthdr;
- hdroff = ipv6_skip_exthdr(skb, iphdroff + sizeof(*ipv6h),
- &nexthdr, &frag_off);
- if (hdroff < 0)
- goto manip_addr;
-
- if ((frag_off & htons(~0x7)) == 0 &&
- !l4proto->manip_pkt(skb, &nf_nat_l3proto_ipv6, iphdroff, hdroff,
- target, maniptype))
- return false;
-
- /* must reload, offset might have changed */
- ipv6h = (void *)skb->data + iphdroff;
-
-manip_addr:
- if (maniptype == NF_NAT_MANIP_SRC)
- ipv6h->saddr = target->src.u3.in6;
- else
- ipv6h->daddr = target->dst.u3.in6;
-
- return true;
-}
-
-static void nf_nat_ipv6_csum_update(struct sk_buff *skb,
- unsigned int iphdroff, __sum16 *check,
- const struct nf_conntrack_tuple *t,
- enum nf_nat_manip_type maniptype)
-{
- const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + iphdroff);
- const struct in6_addr *oldip, *newip;
-
- if (maniptype == NF_NAT_MANIP_SRC) {
- oldip = &ipv6h->saddr;
- newip = &t->src.u3.in6;
- } else {
- oldip = &ipv6h->daddr;
- newip = &t->dst.u3.in6;
- }
- inet_proto_csum_replace16(check, skb, oldip->s6_addr32,
- newip->s6_addr32, true);
-}
-
-static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
- u8 proto, void *data, __sum16 *check,
- int datalen, int oldlen)
-{
- if (skb->ip_summed != CHECKSUM_PARTIAL) {
- const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
-
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
- (data - (void *)skb->data);
- skb->csum_offset = (void *)check - data;
- *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
- datalen, proto, 0);
- } else
- inet_proto_csum_replace2(check, skb,
- htons(oldlen), htons(datalen), true);
-}
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[],
- struct nf_nat_range2 *range)
-{
- if (tb[CTA_NAT_V6_MINIP]) {
- nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP],
- sizeof(struct in6_addr));
- range->flags |= NF_NAT_RANGE_MAP_IPS;
- }
-
- if (tb[CTA_NAT_V6_MAXIP])
- nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP],
- sizeof(struct in6_addr));
- else
- range->max_addr = range->min_addr;
-
- return 0;
-}
-#endif
-
-static const struct nf_nat_l3proto nf_nat_l3proto_ipv6 = {
- .l3proto = NFPROTO_IPV6,
- .secure_port = nf_nat_ipv6_secure_port,
- .in_range = nf_nat_ipv6_in_range,
- .manip_pkt = nf_nat_ipv6_manip_pkt,
- .csum_update = nf_nat_ipv6_csum_update,
- .csum_recalc = nf_nat_ipv6_csum_recalc,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_ipv6_nlattr_to_range,
-#endif
-#ifdef CONFIG_XFRM
- .decode_session = nf_nat_ipv6_decode_session,
-#endif
-};
-
-int nf_nat_icmpv6_reply_translation(struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int hooknum,
- unsigned int hdrlen)
-{
- struct {
- struct icmp6hdr icmp6;
- struct ipv6hdr ip6;
- } *inside;
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- enum nf_nat_manip_type manip = HOOK2MANIP(hooknum);
- const struct nf_nat_l4proto *l4proto;
- struct nf_conntrack_tuple target;
- unsigned long statusbit;
-
- WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY);
-
- if (!skb_make_writable(skb, hdrlen + sizeof(*inside)))
- return 0;
- if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6))
- return 0;
-
- inside = (void *)skb->data + hdrlen;
- if (inside->icmp6.icmp6_type == NDISC_REDIRECT) {
- if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
- return 0;
- if (ct->status & IPS_NAT_MASK)
- return 0;
- }
-
- if (manip == NF_NAT_MANIP_SRC)
- statusbit = IPS_SRC_NAT;
- else
- statusbit = IPS_DST_NAT;
-
- /* Invert if this is reply direction */
- if (dir == IP_CT_DIR_REPLY)
- statusbit ^= IPS_NAT_MASK;
-
- if (!(ct->status & statusbit))
- return 1;
-
- l4proto = __nf_nat_l4proto_find(NFPROTO_IPV6, inside->ip6.nexthdr);
- if (!nf_nat_ipv6_manip_pkt(skb, hdrlen + sizeof(inside->icmp6),
- l4proto, &ct->tuplehash[!dir].tuple, !manip))
- return 0;
-
- if (skb->ip_summed != CHECKSUM_PARTIAL) {
- struct ipv6hdr *ipv6h = ipv6_hdr(skb);
- inside = (void *)skb->data + hdrlen;
- inside->icmp6.icmp6_cksum = 0;
- inside->icmp6.icmp6_cksum =
- csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
- skb->len - hdrlen, IPPROTO_ICMPV6,
- skb_checksum(skb, hdrlen,
- skb->len - hdrlen, 0));
- }
-
- nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
- l4proto = __nf_nat_l4proto_find(NFPROTO_IPV6, IPPROTO_ICMPV6);
- if (!nf_nat_ipv6_manip_pkt(skb, 0, l4proto, &target, manip))
- return 0;
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation);
-
-static unsigned int
-nf_nat_ipv6_fn(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- __be16 frag_off;
- int hdrlen;
- u8 nexthdr;
-
- ct = nf_ct_get(skb, &ctinfo);
- /* Can't track? It's not due to stress, or conntrack would
- * have dropped it. Hence it's the user's responsibilty to
- * packet filter it out, or implement conntrack/NAT for that
- * protocol. 8) --RR
- */
- if (!ct)
- return NF_ACCEPT;
-
- if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) {
- nexthdr = ipv6_hdr(skb)->nexthdr;
- hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr),
- &nexthdr, &frag_off);
-
- if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
- if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo,
- state->hook,
- hdrlen))
- return NF_DROP;
- else
- return NF_ACCEPT;
- }
- }
-
- return nf_nat_inet_fn(priv, skb, state);
-}
-
-static unsigned int
-nf_nat_ipv6_in(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- unsigned int ret;
- struct in6_addr daddr = ipv6_hdr(skb)->daddr;
-
- ret = nf_nat_ipv6_fn(priv, skb, state);
- if (ret != NF_DROP && ret != NF_STOLEN &&
- ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr))
- skb_dst_drop(skb);
-
- return ret;
-}
-
-static unsigned int
-nf_nat_ipv6_out(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
-#ifdef CONFIG_XFRM
- const struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- int err;
-#endif
- unsigned int ret;
-
- ret = nf_nat_ipv6_fn(priv, skb, state);
-#ifdef CONFIG_XFRM
- if (ret != NF_DROP && ret != NF_STOLEN &&
- !(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
- (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
- if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3,
- &ct->tuplehash[!dir].tuple.dst.u3) ||
- (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 &&
- ct->tuplehash[dir].tuple.src.u.all !=
- ct->tuplehash[!dir].tuple.dst.u.all)) {
- err = nf_xfrm_me_harder(state->net, skb, AF_INET6);
- if (err < 0)
- ret = NF_DROP_ERR(err);
- }
- }
-#endif
- return ret;
-}
-
-static unsigned int
-nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- const struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- unsigned int ret;
- int err;
-
- ret = nf_nat_ipv6_fn(priv, skb, state);
- if (ret != NF_DROP && ret != NF_STOLEN &&
- (ct = nf_ct_get(skb, &ctinfo)) != NULL) {
- enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
-
- if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3,
- &ct->tuplehash[!dir].tuple.src.u3)) {
- err = ip6_route_me_harder(state->net, skb);
- if (err < 0)
- ret = NF_DROP_ERR(err);
- }
-#ifdef CONFIG_XFRM
- else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
- ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 &&
- ct->tuplehash[dir].tuple.dst.u.all !=
- ct->tuplehash[!dir].tuple.src.u.all) {
- err = nf_xfrm_me_harder(state->net, skb, AF_INET6);
- if (err < 0)
- ret = NF_DROP_ERR(err);
- }
-#endif
- }
- return ret;
-}
-
-static const struct nf_hook_ops nf_nat_ipv6_ops[] = {
- /* Before packet filtering, change destination */
- {
- .hook = nf_nat_ipv6_in,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_PRE_ROUTING,
- .priority = NF_IP6_PRI_NAT_DST,
- },
- /* After packet filtering, change source */
- {
- .hook = nf_nat_ipv6_out,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP6_PRI_NAT_SRC,
- },
- /* Before packet filtering, change destination */
- {
- .hook = nf_nat_ipv6_local_fn,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_LOCAL_OUT,
- .priority = NF_IP6_PRI_NAT_DST,
- },
- /* After packet filtering, change source */
- {
- .hook = nf_nat_ipv6_fn,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP6_PRI_NAT_SRC,
- },
-};
-
-int nf_nat_l3proto_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops)
-{
- return nf_nat_register_fn(net, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops));
-}
-EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv6_register_fn);
-
-void nf_nat_l3proto_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops)
-{
- nf_nat_unregister_fn(net, ops, ARRAY_SIZE(nf_nat_ipv6_ops));
-}
-EXPORT_SYMBOL_GPL(nf_nat_l3proto_ipv6_unregister_fn);
-
-static int __init nf_nat_l3proto_ipv6_init(void)
-{
- int err;
-
- err = nf_nat_l4proto_register(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6);
- if (err < 0)
- goto err1;
- err = nf_nat_l3proto_register(&nf_nat_l3proto_ipv6);
- if (err < 0)
- goto err2;
- return err;
-
-err2:
- nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6);
-err1:
- return err;
-}
-
-static void __exit nf_nat_l3proto_ipv6_exit(void)
-{
- nf_nat_l3proto_unregister(&nf_nat_l3proto_ipv6);
- nf_nat_l4proto_unregister(NFPROTO_IPV6, &nf_nat_l4proto_icmpv6);
-}
-
-MODULE_LICENSE("GPL");
-MODULE_ALIAS("nf-nat-" __stringify(AF_INET6));
-
-module_init(nf_nat_l3proto_ipv6_init);
-module_exit(nf_nat_l3proto_ipv6_exit);
diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
deleted file mode 100644
index e6eb7cf9b54f..000000000000
--- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
+++ /dev/null
@@ -1,187 +0,0 @@
-/*
- * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on Rusty Russell's IPv6 MASQUERADE target. Development of IPv6
- * NAT funded by Astaro.
- */
-
-#include <linux/kernel.h>
-#include <linux/atomic.h>
-#include <linux/netdevice.h>
-#include <linux/ipv6.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv6.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/addrconf.h>
-#include <net/ipv6.h>
-#include <net/netfilter/ipv6/nf_nat_masquerade.h>
-
-#define MAX_WORK_COUNT 16
-
-static atomic_t v6_worker_count;
-
-unsigned int
-nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
- const struct net_device *out)
-{
- enum ip_conntrack_info ctinfo;
- struct nf_conn_nat *nat;
- struct in6_addr src;
- struct nf_conn *ct;
- struct nf_nat_range2 newrange;
-
- ct = nf_ct_get(skb, &ctinfo);
- WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
- ctinfo == IP_CT_RELATED_REPLY)));
-
- if (ipv6_dev_get_saddr(nf_ct_net(ct), out,
- &ipv6_hdr(skb)->daddr, 0, &src) < 0)
- return NF_DROP;
-
- nat = nf_ct_nat_ext_add(ct);
- if (nat)
- nat->masq_index = out->ifindex;
-
- newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS;
- newrange.min_addr.in6 = src;
- newrange.max_addr.in6 = src;
- newrange.min_proto = range->min_proto;
- newrange.max_proto = range->max_proto;
-
- return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_SRC);
-}
-EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6);
-
-static int device_cmp(struct nf_conn *ct, void *ifindex)
-{
- const struct nf_conn_nat *nat = nfct_nat(ct);
-
- if (!nat)
- return 0;
- if (nf_ct_l3num(ct) != NFPROTO_IPV6)
- return 0;
- return nat->masq_index == (int)(long)ifindex;
-}
-
-static int masq_device_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- const struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- struct net *net = dev_net(dev);
-
- if (event == NETDEV_DOWN)
- nf_ct_iterate_cleanup_net(net, device_cmp,
- (void *)(long)dev->ifindex, 0, 0);
-
- return NOTIFY_DONE;
-}
-
-static struct notifier_block masq_dev_notifier = {
- .notifier_call = masq_device_event,
-};
-
-struct masq_dev_work {
- struct work_struct work;
- struct net *net;
- int ifindex;
-};
-
-static void iterate_cleanup_work(struct work_struct *work)
-{
- struct masq_dev_work *w;
- long index;
-
- w = container_of(work, struct masq_dev_work, work);
-
- index = w->ifindex;
- nf_ct_iterate_cleanup_net(w->net, device_cmp, (void *)index, 0, 0);
-
- put_net(w->net);
- kfree(w);
- atomic_dec(&v6_worker_count);
- module_put(THIS_MODULE);
-}
-
-/* ipv6 inet notifier is an atomic notifier, i.e. we cannot
- * schedule.
- *
- * Unfortunately, nf_ct_iterate_cleanup_net can run for a long
- * time if there are lots of conntracks and the system
- * handles high softirq load, so it frequently calls cond_resched
- * while iterating the conntrack table.
- *
- * So we defer nf_ct_iterate_cleanup_net walk to the system workqueue.
- *
- * As we can have 'a lot' of inet_events (depending on amount
- * of ipv6 addresses being deleted), we also need to add an upper
- * limit to the number of queued work items.
- */
-static int masq_inet_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- struct inet6_ifaddr *ifa = ptr;
- const struct net_device *dev;
- struct masq_dev_work *w;
- struct net *net;
-
- if (event != NETDEV_DOWN ||
- atomic_read(&v6_worker_count) >= MAX_WORK_COUNT)
- return NOTIFY_DONE;
-
- dev = ifa->idev->dev;
- net = maybe_get_net(dev_net(dev));
- if (!net)
- return NOTIFY_DONE;
-
- if (!try_module_get(THIS_MODULE))
- goto err_module;
-
- w = kmalloc(sizeof(*w), GFP_ATOMIC);
- if (w) {
- atomic_inc(&v6_worker_count);
-
- INIT_WORK(&w->work, iterate_cleanup_work);
- w->ifindex = dev->ifindex;
- w->net = net;
- schedule_work(&w->work);
-
- return NOTIFY_DONE;
- }
-
- module_put(THIS_MODULE);
- err_module:
- put_net(net);
- return NOTIFY_DONE;
-}
-
-static struct notifier_block masq_inet_notifier = {
- .notifier_call = masq_inet_event,
-};
-
-static atomic_t masquerade_notifier_refcount = ATOMIC_INIT(0);
-
-void nf_nat_masquerade_ipv6_register_notifier(void)
-{
- /* check if the notifier is already set */
- if (atomic_inc_return(&masquerade_notifier_refcount) > 1)
- return;
-
- register_netdevice_notifier(&masq_dev_notifier);
- register_inet6addr_notifier(&masq_inet_notifier);
-}
-EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_register_notifier);
-
-void nf_nat_masquerade_ipv6_unregister_notifier(void)
-{
- /* check if the notifier still has clients */
- if (atomic_dec_return(&masquerade_notifier_refcount) > 0)
- return;
-
- unregister_inet6addr_notifier(&masq_inet_notifier);
- unregister_netdevice_notifier(&masq_dev_notifier);
-}
-EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier);
diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
deleted file mode 100644
index d9bf42ba44fa..000000000000
--- a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2011 Patrick Mchardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Based on Rusty Russell's IPv4 ICMP NAT code. Development of IPv6
- * NAT funded by Astaro.
- */
-
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/icmpv6.h>
-
-#include <linux/netfilter.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/netfilter/nf_nat_l4proto.h>
-
-static bool
-icmpv6_in_range(const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype,
- const union nf_conntrack_man_proto *min,
- const union nf_conntrack_man_proto *max)
-{
- return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) &&
- ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id);
-}
-
-static void
-icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto,
- struct nf_conntrack_tuple *tuple,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype,
- const struct nf_conn *ct)
-{
- static u16 id;
- unsigned int range_size;
- unsigned int i;
-
- range_size = ntohs(range->max_proto.icmp.id) -
- ntohs(range->min_proto.icmp.id) + 1;
-
- if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED))
- range_size = 0xffff;
-
- for (i = 0; ; ++id) {
- tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
- (id % range_size));
- if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
- return;
- }
-}
-
-static bool
-icmpv6_manip_pkt(struct sk_buff *skb,
- const struct nf_nat_l3proto *l3proto,
- unsigned int iphdroff, unsigned int hdroff,
- const struct nf_conntrack_tuple *tuple,
- enum nf_nat_manip_type maniptype)
-{
- struct icmp6hdr *hdr;
-
- if (!skb_make_writable(skb, hdroff + sizeof(*hdr)))
- return false;
-
- hdr = (struct icmp6hdr *)(skb->data + hdroff);
- l3proto->csum_update(skb, iphdroff, &hdr->icmp6_cksum,
- tuple, maniptype);
- if (hdr->icmp6_type == ICMPV6_ECHO_REQUEST ||
- hdr->icmp6_type == ICMPV6_ECHO_REPLY) {
- inet_proto_csum_replace2(&hdr->icmp6_cksum, skb,
- hdr->icmp6_identifier,
- tuple->src.u.icmp.id, false);
- hdr->icmp6_identifier = tuple->src.u.icmp.id;
- }
- return true;
-}
-
-const struct nf_nat_l4proto nf_nat_l4proto_icmpv6 = {
- .l4proto = IPPROTO_ICMPV6,
- .manip_pkt = icmpv6_manip_pkt,
- .in_range = icmpv6_in_range,
- .unique_tuple = icmpv6_unique_tuple,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .nlattr_to_range = nf_nat_l4proto_nlattr_to_range,
-#endif
-};
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index 24858402e374..ef5b7e85cffa 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
* (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/module.h>
@@ -15,9 +12,187 @@
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_bridge.h>
-const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
- struct tcphdr *otcph,
- unsigned int *otcplen, int hook)
+static struct ipv6hdr *
+nf_reject_ip6hdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int hoplimit);
+static void
+nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ const struct tcphdr *oth, unsigned int otcplen);
+static const struct tcphdr *
+nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *otcph,
+ unsigned int *otcplen, int hook);
+
+static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int thoff;
+ __be16 fo;
+ u8 proto = ip6h->nexthdr;
+
+ if (skb_csum_unnecessary(skb))
+ return true;
+
+ if (ip6h->payload_len &&
+ pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h)))
+ return false;
+
+ ip6h = ipv6_hdr(skb);
+ thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo);
+ if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
+ return false;
+
+ if (!nf_reject_verify_csum(skb, thoff, proto))
+ return true;
+
+ return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
+}
+
+static int nf_reject_ip6hdr_validate(struct sk_buff *skb)
+{
+ struct ipv6hdr *hdr;
+ u32 pkt_len;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ return 0;
+
+ hdr = ipv6_hdr(skb);
+ if (hdr->version != 6)
+ return 0;
+
+ pkt_len = ntohs(hdr->payload_len);
+ if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
+ return 0;
+
+ return 1;
+}
+
+struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook)
+{
+ struct sk_buff *nskb;
+ const struct tcphdr *oth;
+ struct tcphdr _oth;
+ unsigned int otcplen;
+ struct ipv6hdr *nip6h;
+
+ if (!nf_reject_ip6hdr_validate(oldskb))
+ return NULL;
+
+ oth = nf_reject_ip6_tcphdr_get(oldskb, &_oth, &otcplen, hook);
+ if (!oth)
+ return NULL;
+
+ nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct tcphdr) +
+ LL_MAX_HEADER, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+
+ nskb->dev = (struct net_device *)dev;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
+ READ_ONCE(net->ipv6.devconf_all->hop_limit));
+ nf_reject_ip6_tcphdr_put(nskb, oldskb, oth, otcplen);
+ nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));
+
+ return nskb;
+}
+EXPORT_SYMBOL_GPL(nf_reject_skb_v6_tcp_reset);
+
+static bool nf_skb_is_icmp6_unreach(const struct sk_buff *skb)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ u8 proto = ip6h->nexthdr;
+ u8 _type, *tp;
+ int thoff;
+ __be16 fo;
+
+ thoff = ipv6_skip_exthdr(skb, ((u8 *)(ip6h + 1) - skb->data), &proto, &fo);
+
+ if (thoff < 0 || thoff >= skb->len || fo != 0)
+ return false;
+
+ if (proto != IPPROTO_ICMPV6)
+ return false;
+
+ tp = skb_header_pointer(skb,
+ thoff + offsetof(struct icmp6hdr, icmp6_type),
+ sizeof(_type), &_type);
+
+ if (!tp)
+ return false;
+
+ return *tp == ICMPV6_DEST_UNREACH;
+}
+
+struct sk_buff *nf_reject_skb_v6_unreach(struct net *net,
+ struct sk_buff *oldskb,
+ const struct net_device *dev,
+ int hook, u8 code)
+{
+ struct sk_buff *nskb;
+ struct ipv6hdr *nip6h;
+ struct icmp6hdr *icmp6h;
+ unsigned int len;
+
+ if (!nf_reject_ip6hdr_validate(oldskb))
+ return NULL;
+
+ /* Don't reply to ICMPV6_DEST_UNREACH with ICMPV6_DEST_UNREACH */
+ if (nf_skb_is_icmp6_unreach(oldskb))
+ return NULL;
+
+ /* Include "As much of invoking packet as possible without the ICMPv6
+ * packet exceeding the minimum IPv6 MTU" in the ICMP payload.
+ */
+ len = min_t(unsigned int, 1220, oldskb->len);
+
+ if (!pskb_may_pull(oldskb, len))
+ return NULL;
+
+ if (!nf_reject_v6_csum_ok(oldskb, hook))
+ return NULL;
+
+ nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) +
+ LL_MAX_HEADER + len, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+
+ nskb->dev = (struct net_device *)dev;
+
+ skb_reserve(nskb, LL_MAX_HEADER);
+ nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_ICMPV6,
+ READ_ONCE(net->ipv6.devconf_all->hop_limit));
+
+ skb_reset_transport_header(nskb);
+ icmp6h = skb_put_zero(nskb, sizeof(struct icmp6hdr));
+ icmp6h->icmp6_type = ICMPV6_DEST_UNREACH;
+ icmp6h->icmp6_code = code;
+
+ skb_put_data(nskb, skb_network_header(oldskb), len);
+ nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr));
+
+ icmp6h->icmp6_cksum =
+ csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr,
+ nskb->len - sizeof(struct ipv6hdr),
+ IPPROTO_ICMPV6,
+ csum_partial(icmp6h,
+ nskb->len - sizeof(struct ipv6hdr),
+ 0));
+
+ return nskb;
+}
+EXPORT_SYMBOL_GPL(nf_reject_skb_v6_unreach);
+
+static const struct tcphdr *
+nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *otcph,
+ unsigned int *otcplen, int hook)
{
const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
u8 proto;
@@ -61,11 +236,11 @@ const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
return otcph;
}
-EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_get);
-struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb,
- const struct sk_buff *oldskb,
- __u8 protocol, int hoplimit)
+static struct ipv6hdr *
+nf_reject_ip6hdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int hoplimit)
{
struct ipv6hdr *ip6h;
const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
@@ -85,40 +260,30 @@ struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb,
return ip6h;
}
-EXPORT_SYMBOL_GPL(nf_reject_ip6hdr_put);
-void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
- const struct sk_buff *oldskb,
- const struct tcphdr *oth, unsigned int otcplen)
+static void
+nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ const struct tcphdr *oth, unsigned int otcplen)
{
struct tcphdr *tcph;
- int needs_ack;
skb_reset_transport_header(nskb);
- tcph = skb_put(nskb, sizeof(struct tcphdr));
+ tcph = skb_put_zero(nskb, sizeof(struct tcphdr));
/* Truncate to length (no data) */
tcph->doff = sizeof(struct tcphdr)/4;
tcph->source = oth->dest;
tcph->dest = oth->source;
if (oth->ack) {
- needs_ack = 0;
tcph->seq = oth->ack_seq;
- tcph->ack_seq = 0;
} else {
- needs_ack = 1;
tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin +
otcplen - (oth->doff<<2));
- tcph->seq = 0;
+ tcph->ack = 1;
}
- /* Reset flags */
- ((u_int8_t *)tcph)[13] = 0;
tcph->rst = 1;
- tcph->ack = needs_ack;
- tcph->window = 0;
- tcph->urg_ptr = 0;
- tcph->check = 0;
/* Adjust TCP checksum */
tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr,
@@ -127,17 +292,31 @@ void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
csum_partial(tcph,
sizeof(struct tcphdr), 0));
}
-EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put);
-void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
+static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in)
+{
+ struct dst_entry *dst = NULL;
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(struct flowi));
+ fl.u.ip6.daddr = ipv6_hdr(skb_in)->saddr;
+ nf_ip6_route(dev_net(skb_in->dev), &dst, &fl, false);
+ if (!dst)
+ return -1;
+
+ skb_dst_set(skb_in, dst);
+ return 0;
+}
+
+void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
+ int hook)
{
- struct sk_buff *nskb;
- struct tcphdr _otcph;
- const struct tcphdr *otcph;
- unsigned int otcplen, hh_len;
const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
- struct ipv6hdr *ip6h;
struct dst_entry *dst = NULL;
+ const struct tcphdr *otcph;
+ struct sk_buff *nskb;
+ struct tcphdr _otcph;
+ unsigned int otcplen;
struct flowi6 fl6;
if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) ||
@@ -156,9 +335,17 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
fl6.daddr = oip6h->saddr;
fl6.fl6_sport = otcph->dest;
fl6.fl6_dport = otcph->source;
- fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev);
+
+ if (!skb_dst(oldskb)) {
+ nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false);
+ if (!dst)
+ return;
+ skb_dst_set(oldskb, dst);
+ }
+
+ fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst_dev(oldskb));
fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark);
- security_skb_classify_flow(oldskb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(oldskb, flowi6_to_flowi_common(&fl6));
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
dst_release(dst);
@@ -168,9 +355,8 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
if (IS_ERR(dst))
return;
- hh_len = (dst->dev->hard_header_len + 15)&~15;
- nskb = alloc_skb(hh_len + 15 + dst->header_len + sizeof(struct ipv6hdr)
- + sizeof(struct tcphdr) + dst->trailer_len,
+ nskb = alloc_skb(LL_MAX_HEADER + sizeof(struct ipv6hdr) +
+ sizeof(struct tcphdr) + dst->trailer_len,
GFP_ATOMIC);
if (!nskb) {
@@ -183,12 +369,12 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
nskb->mark = fl6.flowi6_mark;
- skb_reserve(nskb, hh_len + dst->header_len);
- ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP,
- ip6_dst_hoplimit(dst));
+ skb_reserve(nskb, LL_MAX_HEADER);
+ nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, ip6_dst_hoplimit(dst));
nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen);
nf_ct_attach(nskb, oldskb);
+ nf_ct_set_closing(skb_nfct(oldskb));
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/* If we use ip6_local_out for bridged traffic, the MAC source on
@@ -197,19 +383,29 @@ void nf_send_reset6(struct net *net, struct sk_buff *oldskb, int hook)
* build the eth header using the original destination's MAC as the
* source, and send the RST packet directly.
*/
- if (oldskb->nf_bridge) {
+ if (nf_bridge_info_exists(oldskb)) {
struct ethhdr *oeth = eth_hdr(oldskb);
+ struct ipv6hdr *ip6h = ipv6_hdr(nskb);
+ struct net_device *br_indev;
- nskb->dev = nf_bridge_get_physindev(oldskb);
+ br_indev = nf_bridge_get_physindev(oldskb, net);
+ if (!br_indev) {
+ kfree_skb(nskb);
+ return;
+ }
+
+ nskb->dev = br_indev;
nskb->protocol = htons(ETH_P_IPV6);
ip6h->payload_len = htons(sizeof(struct tcphdr));
if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol),
- oeth->h_source, oeth->h_dest, nskb->len) < 0)
+ oeth->h_source, oeth->h_dest, nskb->len) < 0) {
+ kfree_skb(nskb);
return;
+ }
dev_queue_xmit(nskb);
} else
#endif
- ip6_local_out(net, nskb->sk, nskb);
+ ip6_local_out(net, sk, nskb);
}
EXPORT_SYMBOL_GPL(nf_send_reset6);
@@ -229,6 +425,9 @@ static bool reject6_csum_ok(struct sk_buff *skb, int hook)
if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
return false;
+ if (!nf_reject_verify_csum(skb, thoff, proto))
+ return true;
+
return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
}
@@ -241,8 +440,12 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in,
if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL)
skb_in->dev = net->loopback_dev;
+ if (!skb_dst(skb_in) && nf_reject6_fill_skb_dst(skb_in) < 0)
+ return;
+
icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0);
}
EXPORT_SYMBOL_GPL(nf_send_unreach6);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IPv6 packet rejection core");
diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c
index f14de4b6d639..ced8bd44828e 100644
--- a/net/ipv6/netfilter/nf_socket_ipv6.c
+++ b/net/ipv6/netfilter/nf_socket_ipv6.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2007-2008 BalaBit IT Ltd.
* Author: Krisztian Kovacs
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
@@ -16,7 +12,6 @@
#include <net/sock.h>
#include <net/inet_sock.h>
#include <net/inet6_hashtables.h>
-#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/netfilter/nf_socket.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
@@ -88,8 +83,7 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
{
switch (protocol) {
case IPPROTO_TCP:
- return inet6_lookup(net, &tcp_hashinfo, skb, doff,
- saddr, sport, daddr, dport,
+ return inet6_lookup(net, skb, doff, saddr, sport, daddr, dport,
in->ifindex);
case IPPROTO_UDP:
return udp6_lib_lookup(net, saddr, sport, daddr, dport,
@@ -102,12 +96,16 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
const struct net_device *indev)
{
- __be16 uninitialized_var(dport), uninitialized_var(sport);
+ __be16 dport, sport;
const struct in6_addr *daddr = NULL, *saddr = NULL;
- struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var;
struct sk_buff *data_skb = NULL;
int doff = 0;
int thoff = 0, tproto;
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn const *ct;
+#endif
tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
if (tproto < 0) {
@@ -134,8 +132,6 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
thoff + sizeof(*hp);
} else if (tproto == IPPROTO_ICMPV6) {
- struct ipv6hdr ipv6_var;
-
if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr,
&sport, &dport, &ipv6_var))
return NULL;
@@ -143,6 +139,25 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
return NULL;
}
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ /* Do the lookup with the original socket address in
+ * case this is a reply packet of an established
+ * SNAT-ted connection.
+ */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct &&
+ ((tproto != IPPROTO_ICMPV6 &&
+ ctinfo == IP_CT_ESTABLISHED_REPLY) ||
+ (tproto == IPPROTO_ICMPV6 &&
+ ctinfo == IP_CT_RELATED_REPLY)) &&
+ (ct->status & IPS_SRC_NAT_DONE)) {
+ daddr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6;
+ dport = (tproto == IPPROTO_TCP) ?
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port :
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
+ }
+#endif
+
return nf_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr,
sport, dport, indev);
}
diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c
index 5dfd33af6451..b2f59ed9d7cc 100644
--- a/net/ipv6/netfilter/nf_tproxy_ipv6.c
+++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <net/netfilter/nf_tproxy.h>
#include <linux/module.h>
#include <net/inet6_hashtables.h>
@@ -62,7 +63,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
lport ? lport : hp->dest,
skb->dev, NF_TPROXY_LOOKUP_LISTENER);
if (sk2) {
- inet_twsk_deschedule_put(inet_twsk(sk));
+ nf_tproxy_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
}
}
@@ -92,7 +93,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
switch (lookup_type) {
case NF_TPROXY_LOOKUP_LISTENER:
- sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
+ sk = inet6_lookup_listener(net, skb,
thoff + __tcp_hdrlen(hp),
saddr, sport,
daddr, ntohs(dport),
@@ -107,9 +108,8 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
*/
break;
case NF_TPROXY_LOOKUP_ESTABLISHED:
- sk = __inet6_lookup_established(net, &tcp_hashinfo,
- saddr, sport, daddr, ntohs(dport),
- in->ifindex, 0);
+ sk = __inet6_lookup_established(net, saddr, sport, daddr,
+ ntohs(dport), in->ifindex, 0);
break;
default:
BUG();
@@ -149,4 +149,4 @@ EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v6);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs");
-MODULE_DESCRIPTION("Netfilter IPv4 transparent proxy support");
+MODULE_DESCRIPTION("Netfilter IPv6 transparent proxy support");
diff --git a/net/ipv6/netfilter/nft_chain_nat_ipv6.c b/net/ipv6/netfilter/nft_chain_nat_ipv6.c
deleted file mode 100644
index 8a081ad7d5db..000000000000
--- a/net/ipv6/netfilter/nft_chain_nat_ipv6.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2011 Patrick McHardy <kaber@trash.net>
- * Copyright (c) 2012 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/skbuff.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv6.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables_ipv6.h>
-#include <net/netfilter/nf_nat_l3proto.h>
-#include <net/ipv6.h>
-
-static unsigned int nft_nat_do_chain(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nft_pktinfo pkt;
-
- nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv6(&pkt, skb);
-
- return nft_do_chain(&pkt, priv);
-}
-
-static int nft_nat_ipv6_reg(struct net *net, const struct nf_hook_ops *ops)
-{
- return nf_nat_l3proto_ipv6_register_fn(net, ops);
-}
-
-static void nft_nat_ipv6_unreg(struct net *net, const struct nf_hook_ops *ops)
-{
- nf_nat_l3proto_ipv6_unregister_fn(net, ops);
-}
-
-static const struct nft_chain_type nft_chain_nat_ipv6 = {
- .name = "nat",
- .type = NFT_CHAIN_T_NAT,
- .family = NFPROTO_IPV6,
- .owner = THIS_MODULE,
- .hook_mask = (1 << NF_INET_PRE_ROUTING) |
- (1 << NF_INET_POST_ROUTING) |
- (1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_LOCAL_IN),
- .hooks = {
- [NF_INET_PRE_ROUTING] = nft_nat_do_chain,
- [NF_INET_POST_ROUTING] = nft_nat_do_chain,
- [NF_INET_LOCAL_OUT] = nft_nat_do_chain,
- [NF_INET_LOCAL_IN] = nft_nat_do_chain,
- },
- .ops_register = nft_nat_ipv6_reg,
- .ops_unregister = nft_nat_ipv6_unreg,
-};
-
-static int __init nft_chain_nat_ipv6_init(void)
-{
- nft_register_chain_type(&nft_chain_nat_ipv6);
-
- return 0;
-}
-
-static void __exit nft_chain_nat_ipv6_exit(void)
-{
- nft_unregister_chain_type(&nft_chain_nat_ipv6);
-}
-
-module_init(nft_chain_nat_ipv6_init);
-module_exit(nft_chain_nat_ipv6_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Tomasz Bursztyka <tomasz.bursztyka@linux.intel.com>");
-MODULE_ALIAS_NFT_CHAIN(AF_INET6, "nat");
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c b/net/ipv6/netfilter/nft_chain_route_ipv6.c
deleted file mode 100644
index da3f1f8cb325..000000000000
--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
- * Copyright (c) 2012 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Development of this code funded by Astaro AG (http://www.astaro.com/)
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/skbuff.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter_ipv6.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables_ipv6.h>
-#include <net/route.h>
-
-static unsigned int nf_route_table_hook(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- unsigned int ret;
- struct nft_pktinfo pkt;
- struct in6_addr saddr, daddr;
- u_int8_t hop_limit;
- u32 mark, flowlabel;
- int err;
-
- nft_set_pktinfo(&pkt, skb, state);
- nft_set_pktinfo_ipv6(&pkt, skb);
-
- /* save source/dest address, mark, hoplimit, flowlabel, priority */
- memcpy(&saddr, &ipv6_hdr(skb)->saddr, sizeof(saddr));
- memcpy(&daddr, &ipv6_hdr(skb)->daddr, sizeof(daddr));
- mark = skb->mark;
- hop_limit = ipv6_hdr(skb)->hop_limit;
-
- /* flowlabel and prio (includes version, which shouldn't change either */
- flowlabel = *((u32 *)ipv6_hdr(skb));
-
- ret = nft_do_chain(&pkt, priv);
- if (ret != NF_DROP && ret != NF_STOLEN &&
- (memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
- memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
- skb->mark != mark ||
- ipv6_hdr(skb)->hop_limit != hop_limit ||
- flowlabel != *((u_int32_t *)ipv6_hdr(skb)))) {
- err = ip6_route_me_harder(state->net, skb);
- if (err < 0)
- ret = NF_DROP_ERR(err);
- }
-
- return ret;
-}
-
-static const struct nft_chain_type nft_chain_route_ipv6 = {
- .name = "route",
- .type = NFT_CHAIN_T_ROUTE,
- .family = NFPROTO_IPV6,
- .owner = THIS_MODULE,
- .hook_mask = (1 << NF_INET_LOCAL_OUT),
- .hooks = {
- [NF_INET_LOCAL_OUT] = nf_route_table_hook,
- },
-};
-
-static int __init nft_chain_route_init(void)
-{
- nft_register_chain_type(&nft_chain_route_ipv6);
-
- return 0;
-}
-
-static void __exit nft_chain_route_exit(void)
-{
- nft_unregister_chain_type(&nft_chain_route_ipv6);
-}
-
-module_init(nft_chain_route_init);
-module_exit(nft_chain_route_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_CHAIN(AF_INET6, "route");
diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c
index d8b5b60b7d53..492a811828a7 100644
--- a/net/ipv6/netfilter/nft_dup_ipv6.c
+++ b/net/ipv6/netfilter/nft_dup_ipv6.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published by
- * the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -16,8 +13,8 @@
#include <net/netfilter/ipv6/nf_dup_ipv6.h>
struct nft_dup_ipv6 {
- enum nft_registers sreg_addr:8;
- enum nft_registers sreg_dev:8;
+ u8 sreg_addr;
+ u8 sreg_dev;
};
static void nft_dup_ipv6_eval(const struct nft_expr *expr,
@@ -41,19 +38,20 @@ static int nft_dup_ipv6_init(const struct nft_ctx *ctx,
if (tb[NFTA_DUP_SREG_ADDR] == NULL)
return -EINVAL;
- priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]);
- err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in6_addr));
+ err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_ADDR], &priv->sreg_addr,
+ sizeof(struct in6_addr));
if (err < 0)
return err;
- if (tb[NFTA_DUP_SREG_DEV] != NULL) {
- priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]);
- return nft_validate_register_load(priv->sreg_dev, sizeof(int));
- }
- return 0;
+ if (tb[NFTA_DUP_SREG_DEV])
+ err = nft_parse_register_load(ctx, tb[NFTA_DUP_SREG_DEV],
+ &priv->sreg_dev, sizeof(int));
+
+ return err;
}
-static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_dup_ipv6_dump(struct sk_buff *skb,
+ const struct nft_expr *expr, bool reset)
{
struct nft_dup_ipv6 *priv = nft_expr_priv(expr);
@@ -76,6 +74,7 @@ static const struct nft_expr_ops nft_dup_ipv6_ops = {
.eval = nft_dup_ipv6_eval,
.init = nft_dup_ipv6_init,
.dump = nft_dup_ipv6_dump,
+ .reduce = NFT_REDUCE_READONLY,
};
static const struct nla_policy nft_dup_ipv6_policy[NFTA_DUP_MAX + 1] = {
@@ -108,3 +107,4 @@ module_exit(nft_dup_ipv6_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "dup");
+MODULE_DESCRIPTION("IPv6 nftables packet duplication support");
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
index 36be3cf0adef..421036a3605b 100644
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -1,8 +1,4 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/init.h>
@@ -34,6 +30,10 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv,
fl6->daddr = iph->daddr;
fl6->saddr = iph->saddr;
} else {
+ if (nft_hook(pkt) == NF_INET_FORWARD &&
+ priv->flags & NFTA_FIB_F_IIF)
+ fl6->flowi6_iif = nft_out(pkt)->ifindex;
+
fl6->daddr = iph->saddr;
fl6->saddr = iph->daddr;
}
@@ -50,6 +50,7 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv,
fl6->flowi6_mark = pkt->skb->mark;
fl6->flowlabel = (*(__be32 *)iph) & IPV6_FLOWINFO_MASK;
+ fl6->flowi6_l3mdev = nft_fib_l3mdev_master_ifindex_rcu(pkt, dev);
return lookup_flags;
}
@@ -59,19 +60,15 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
struct ipv6hdr *iph)
{
const struct net_device *dev = NULL;
- const struct nf_ipv6_ops *v6ops;
int route_err, addrtype;
struct rt6_info *rt;
struct flowi6 fl6 = {
.flowi6_iif = LOOPBACK_IFINDEX,
.flowi6_proto = pkt->tprot,
+ .flowi6_uid = sock_net_uid(nft_net(pkt), NULL),
};
u32 ret = 0;
- v6ops = nf_get_ipv6_ops();
- if (!v6ops)
- return RTN_UNREACHABLE;
-
if (priv->flags & NFTA_FIB_F_IIF)
dev = nft_in(pkt);
else if (priv->flags & NFTA_FIB_F_OIF)
@@ -79,10 +76,10 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv,
nft_fib6_flowi_init(&fl6, priv, pkt, dev, iph);
- if (dev && v6ops->chk_addr(nft_net(pkt), &fl6.daddr, dev, true))
+ if (dev && nf_ipv6_chk_addr(nft_net(pkt), &fl6.daddr, dev, true))
ret = RTN_LOCAL;
- route_err = v6ops->route(nft_net(pkt), (struct dst_entry **)&rt,
+ route_err = nf_ip6_route(nft_net(pkt), (struct dst_entry **)&rt,
flowi6_to_flowi(&fl6), false);
if (route_err)
goto err;
@@ -144,21 +141,39 @@ void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
}
EXPORT_SYMBOL_GPL(nft_fib6_eval_type);
+static bool nft_fib_v6_skip_icmpv6(const struct sk_buff *skb, u8 next, const struct ipv6hdr *iph)
+{
+ if (likely(next != IPPROTO_ICMPV6))
+ return false;
+
+ if (ipv6_addr_type(&iph->saddr) != IPV6_ADDR_ANY)
+ return false;
+
+ return ipv6_addr_type(&iph->daddr) & IPV6_ADDR_LINKLOCAL;
+}
+
void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
const struct nft_pktinfo *pkt)
{
const struct nft_fib *priv = nft_expr_priv(expr);
int noff = skb_network_offset(pkt->skb);
+ const struct net_device *found = NULL;
const struct net_device *oif = NULL;
u32 *dest = &regs->data[priv->dreg];
struct ipv6hdr *iph, _iph;
struct flowi6 fl6 = {
.flowi6_iif = LOOPBACK_IFINDEX,
.flowi6_proto = pkt->tprot,
+ .flowi6_uid = sock_net_uid(nft_net(pkt), NULL),
};
struct rt6_info *rt;
int lookup_flags;
+ if (nft_fib_can_skip(pkt)) {
+ nft_fib_store_result(dest, priv, nft_in(pkt));
+ return;
+ }
+
if (priv->flags & NFTA_FIB_F_IIF)
oif = nft_in(pkt);
else if (priv->flags & NFTA_FIB_F_OIF)
@@ -170,15 +185,13 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
return;
}
- lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph);
-
- if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
- nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
- nft_fib_store_result(dest, priv, pkt,
- nft_in(pkt)->ifindex);
+ if (nft_fib_v6_skip_icmpv6(pkt->skb, pkt->tprot, iph)) {
+ nft_fib_store_result(dest, priv, nft_in(pkt));
return;
}
+ lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif, iph);
+
*dest = 0;
rt = (void *)ip6_route_lookup(nft_net(pkt), &fl6, pkt->skb,
lookup_flags);
@@ -189,21 +202,15 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL))
goto put_rt_err;
- if (oif && oif != rt->rt6i_idev->dev)
- goto put_rt_err;
-
- switch (priv->result) {
- case NFT_FIB_RESULT_OIF:
- *dest = rt->rt6i_idev->dev->ifindex;
- break;
- case NFT_FIB_RESULT_OIFNAME:
- strncpy((char *)dest, rt->rt6i_idev->dev->name, IFNAMSIZ);
- break;
- default:
- WARN_ON_ONCE(1);
- break;
+ if (!oif) {
+ found = rt->rt6i_idev->dev;
+ } else {
+ if (oif == rt->rt6i_idev->dev ||
+ l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == oif->ifindex)
+ found = oif;
}
+ nft_fib_store_result(dest, priv, found);
put_rt_err:
ip6_rt_put(rt);
}
@@ -218,6 +225,7 @@ static const struct nft_expr_ops nft_fib6_type_ops = {
.init = nft_fib_init,
.dump = nft_fib_dump,
.validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
};
static const struct nft_expr_ops nft_fib6_ops = {
@@ -227,6 +235,7 @@ static const struct nft_expr_ops nft_fib6_ops = {
.init = nft_fib_init,
.dump = nft_fib_dump,
.validate = nft_fib_validate,
+ .reduce = nft_fib_reduce,
};
static const struct nft_expr_ops *
@@ -276,3 +285,4 @@ module_exit(nft_fib6_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
MODULE_ALIAS_NFT_AF_EXPR(10, "fib");
+MODULE_DESCRIPTION("nftables fib / ipv6 route lookup support");
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
deleted file mode 100644
index dd0122f3cffe..000000000000
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nft_masq.h>
-#include <net/netfilter/ipv6/nf_nat_masquerade.h>
-
-static void nft_masq_ipv6_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- struct nft_masq *priv = nft_expr_priv(expr);
- struct nf_nat_range2 range;
-
- memset(&range, 0, sizeof(range));
- range.flags = priv->flags;
- if (priv->sreg_proto_min) {
- range.min_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_min]);
- range.max_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_max]);
- }
- regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range,
- nft_out(pkt));
-}
-
-static void
-nft_masq_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
-{
- nf_ct_netns_put(ctx->net, NFPROTO_IPV6);
-}
-
-static struct nft_expr_type nft_masq_ipv6_type;
-static const struct nft_expr_ops nft_masq_ipv6_ops = {
- .type = &nft_masq_ipv6_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_masq)),
- .eval = nft_masq_ipv6_eval,
- .init = nft_masq_init,
- .destroy = nft_masq_ipv6_destroy,
- .dump = nft_masq_dump,
- .validate = nft_masq_validate,
-};
-
-static struct nft_expr_type nft_masq_ipv6_type __read_mostly = {
- .family = NFPROTO_IPV6,
- .name = "masq",
- .ops = &nft_masq_ipv6_ops,
- .policy = nft_masq_policy,
- .maxattr = NFTA_MASQ_MAX,
- .owner = THIS_MODULE,
-};
-
-static int __init nft_masq_ipv6_module_init(void)
-{
- int ret;
-
- ret = nft_register_expr(&nft_masq_ipv6_type);
- if (ret < 0)
- return ret;
-
- nf_nat_masquerade_ipv6_register_notifier();
-
- return ret;
-}
-
-static void __exit nft_masq_ipv6_module_exit(void)
-{
- nft_unregister_expr(&nft_masq_ipv6_type);
- nf_nat_masquerade_ipv6_unregister_notifier();
-}
-
-module_init(nft_masq_ipv6_module_init);
-module_exit(nft_masq_ipv6_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
-MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "masq");
diff --git a/net/ipv6/netfilter/nft_redir_ipv6.c b/net/ipv6/netfilter/nft_redir_ipv6.c
deleted file mode 100644
index 74269865acc8..000000000000
--- a/net/ipv6/netfilter/nft_redir_ipv6.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2014 Arturo Borrero Gonzalez <arturo@debian.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/netlink.h>
-#include <linux/netfilter.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
-#include <net/netfilter/nf_nat.h>
-#include <net/netfilter/nft_redir.h>
-#include <net/netfilter/nf_nat_redirect.h>
-
-static void nft_redir_ipv6_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
-{
- struct nft_redir *priv = nft_expr_priv(expr);
- struct nf_nat_range2 range;
-
- memset(&range, 0, sizeof(range));
- if (priv->sreg_proto_min) {
- range.min_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_min]);
- range.max_proto.all = (__force __be16)nft_reg_load16(
- &regs->data[priv->sreg_proto_max]);
- range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
- }
-
- range.flags |= priv->flags;
-
- regs->verdict.code =
- nf_nat_redirect_ipv6(pkt->skb, &range, nft_hook(pkt));
-}
-
-static void
-nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
-{
- nf_ct_netns_put(ctx->net, NFPROTO_IPV6);
-}
-
-static struct nft_expr_type nft_redir_ipv6_type;
-static const struct nft_expr_ops nft_redir_ipv6_ops = {
- .type = &nft_redir_ipv6_type,
- .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
- .eval = nft_redir_ipv6_eval,
- .init = nft_redir_init,
- .destroy = nft_redir_ipv6_destroy,
- .dump = nft_redir_dump,
- .validate = nft_redir_validate,
-};
-
-static struct nft_expr_type nft_redir_ipv6_type __read_mostly = {
- .family = NFPROTO_IPV6,
- .name = "redir",
- .ops = &nft_redir_ipv6_ops,
- .policy = nft_redir_policy,
- .maxattr = NFTA_REDIR_MAX,
- .owner = THIS_MODULE,
-};
-
-static int __init nft_redir_ipv6_module_init(void)
-{
- return nft_register_expr(&nft_redir_ipv6_type);
-}
-
-static void __exit nft_redir_ipv6_module_exit(void)
-{
- nft_unregister_expr(&nft_redir_ipv6_type);
-}
-
-module_init(nft_redir_ipv6_module_init);
-module_exit(nft_redir_ipv6_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Arturo Borrero Gonzalez <arturo@debian.org>");
-MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "redir");
diff --git a/net/ipv6/netfilter/nft_reject_ipv6.c b/net/ipv6/netfilter/nft_reject_ipv6.c
index 057deeaff1cb..5c61294f410e 100644
--- a/net/ipv6/netfilter/nft_reject_ipv6.c
+++ b/net/ipv6/netfilter/nft_reject_ipv6.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
* Copyright (c) 2013 Eric Leblond <eric@regit.org>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
@@ -31,7 +28,8 @@ static void nft_reject_ipv6_eval(const struct nft_expr *expr,
nft_hook(pkt));
break;
case NFT_REJECT_TCP_RST:
- nf_send_reset6(nft_net(pkt), pkt->skb, nft_hook(pkt));
+ nf_send_reset6(nft_net(pkt), nft_sk(pkt), pkt->skb,
+ nft_hook(pkt));
break;
default:
break;
@@ -48,6 +46,7 @@ static const struct nft_expr_ops nft_reject_ipv6_ops = {
.init = nft_reject_init,
.dump = nft_reject_dump,
.validate = nft_reject_validate,
+ .reduce = NFT_REDUCE_READONLY,
};
static struct nft_expr_type nft_reject_ipv6_type __read_mostly = {
@@ -75,3 +74,4 @@ module_exit(nft_reject_ipv6_module_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "reject");
+MODULE_DESCRIPTION("IPv6 packet rejection for nftables");
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 4fe7c90962dd..1c9b283a4132 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IPv6 library code, needed by static components when full IPv6 support is
* not configured or static. These functions are needed by GSO/GRO implementation.
@@ -10,25 +11,11 @@
#include <net/secure_seq.h>
#include <linux/netfilter.h>
-static u32 __ipv6_select_ident(struct net *net, u32 hashrnd,
+static u32 __ipv6_select_ident(struct net *net,
const struct in6_addr *dst,
const struct in6_addr *src)
{
- u32 hash, id;
-
- hash = __ipv6_addr_jhash(dst, hashrnd);
- hash = __ipv6_addr_jhash(src, hash);
- hash ^= net_hash_mix(net);
-
- /* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve,
- * set the hight order instead thus minimizing possible future
- * collisions.
- */
- id = ip_idents_reserve(hash, 1);
- if (unlikely(!id))
- id = 1 << 31;
-
- return id;
+ return get_random_u32_above(0);
}
/* This function exists only for tap drivers that must support broken
@@ -41,7 +28,6 @@ static u32 __ipv6_select_ident(struct net *net, u32 hashrnd,
*/
__be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
{
- static u32 ip6_proxy_idents_hashrnd __read_mostly;
struct in6_addr buf[2];
struct in6_addr *addrs;
u32 id;
@@ -53,11 +39,7 @@ __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb)
if (!addrs)
return 0;
- net_get_random_once(&ip6_proxy_idents_hashrnd,
- sizeof(ip6_proxy_idents_hashrnd));
-
- id = __ipv6_select_ident(net, ip6_proxy_idents_hashrnd,
- &addrs[1], &addrs[0]);
+ id = __ipv6_select_ident(net, &addrs[1], &addrs[0]);
return htonl(id);
}
EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident);
@@ -66,12 +48,9 @@ __be32 ipv6_select_ident(struct net *net,
const struct in6_addr *daddr,
const struct in6_addr *saddr)
{
- static u32 ip6_idents_hashrnd __read_mostly;
u32 id;
- net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd));
-
- id = __ipv6_select_ident(net, ip6_idents_hashrnd, daddr, saddr);
+ id = __ipv6_select_ident(net, daddr, saddr);
return htonl(id);
}
EXPORT_SYMBOL(ipv6_select_ident);
@@ -125,18 +104,20 @@ EXPORT_SYMBOL(ip6_find_1stfragopt);
int ip6_dst_hoplimit(struct dst_entry *dst)
{
int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
+
+ rcu_read_lock();
if (hoplimit == 0) {
- struct net_device *dev = dst->dev;
+ struct net_device *dev = dst_dev_rcu(dst);
struct inet6_dev *idev;
- rcu_read_lock();
idev = __in6_dev_get(dev);
if (idev)
- hoplimit = idev->cnf.hop_limit;
+ hoplimit = READ_ONCE(idev->cnf.hop_limit);
else
- hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
- rcu_read_unlock();
+ hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit);
}
+ rcu_read_unlock();
+
return hoplimit;
}
EXPORT_SYMBOL(ip6_dst_hoplimit);
@@ -162,7 +143,7 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
skb->protocol = htons(ETH_P_IPV6);
return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
- net, sk, skb, NULL, skb_dst(skb)->dev,
+ net, sk, skb, NULL, skb_dst_dev(skb),
dst_output);
}
EXPORT_SYMBOL_GPL(__ip6_local_out);
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 4c04bccc7417..e4afc651731a 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -5,17 +6,11 @@
*
* "Ping" sockets
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Based on ipv4/ping.c code.
*
* Authors: Lorenzo Colitti (IPv6 support)
* Vasiliy Kulikov / Openwall (IPv4 implementation, for Linux 2.6),
* Pavel Kankovsky (IPv4 implementation, for Linux 2.4.32)
- *
*/
#include <net/addrconf.h>
@@ -25,6 +20,7 @@
#include <net/udp.h>
#include <net/transp_v6.h>
#include <linux/proc_fs.h>
+#include <linux/bpf-cgroup.h>
#include <net/ping.h>
/* Compatibility glue so we can support IPv6 when it's compiled as a module */
@@ -49,6 +45,20 @@ static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
return 0;
}
+static int ping_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
+{
+ /* This check is replicated from __ip6_datagram_connect() and
+ * intended to prevent BPF program called below from accessing
+ * bytes that are out of the bound specified by user in addr_len.
+ */
+
+ if (addr_len < SIN6_LEN_RFC2133)
+ return -EINVAL;
+
+ return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len);
+}
+
static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
@@ -64,13 +74,13 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct pingfakehdr pfh;
struct ipcm6_cookie ipc6;
- pr_debug("ping_v6_sendmsg(sk=%p,sk->num=%u)\n", inet, inet->inet_num);
-
err = ping_common_sendmsg(AF_INET6, msg, len, &user_icmph,
sizeof(user_icmph));
if (err)
return err;
+ memset(&fl6, 0, sizeof(fl6));
+
if (msg->msg_name) {
DECLARE_SOCKADDR(struct sockaddr_in6 *, u, msg->msg_name);
if (msg->msg_namelen < sizeof(*u))
@@ -79,12 +89,15 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return -EAFNOSUPPORT;
}
daddr = &(u->sin6_addr);
+ if (inet6_test_bit(SNDFLOW, sk))
+ fl6.flowlabel = u->sin6_flowinfo & IPV6_FLOWINFO_MASK;
if (__ipv6_addr_needs_scope_id(ipv6_addr_type(daddr)))
oif = u->sin6_scope_id;
} else {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
daddr = &sk->sk_v6_daddr;
+ fl6.flowlabel = np->flow_label;
}
if (!oif)
@@ -94,42 +107,57 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
oif = np->sticky_pktinfo.ipi6_ifindex;
if (!oif && ipv6_addr_is_multicast(daddr))
- oif = np->mcast_oif;
+ oif = READ_ONCE(np->mcast_oif);
else if (!oif)
- oif = np->ucast_oif;
+ oif = READ_ONCE(np->ucast_oif);
addr_type = ipv6_addr_type(daddr);
if ((__ipv6_addr_needs_scope_id(addr_type) && !oif) ||
(addr_type & IPV6_ADDR_MAPPED) ||
- (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if))
+ (oif && sk->sk_bound_dev_if && oif != sk->sk_bound_dev_if &&
+ l3mdev_master_ifindex_by_index(sock_net(sk), oif) != sk->sk_bound_dev_if))
return -EINVAL;
- /* TODO: use ip6_datagram_send_ctl to get options from cmsg */
+ ipcm6_init_sk(&ipc6, sk);
- memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_oif = oif;
+
+ if (msg->msg_controllen) {
+ struct ipv6_txoptions opt = {};
+
+ opt.tot_len = sizeof(opt);
+ ipc6.opt = &opt;
+
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6);
+ if (err < 0)
+ return err;
+
+ /* Changes to txoptions and flow info are not implemented, yet.
+ * Drop the options.
+ */
+ ipc6.opt = NULL;
+ }
fl6.flowi6_proto = IPPROTO_ICMPV6;
fl6.saddr = np->saddr;
fl6.daddr = *daddr;
- fl6.flowi6_oif = oif;
- fl6.flowi6_mark = sk->sk_mark;
- fl6.flowi6_uid = sk->sk_uid;
+ fl6.flowi6_mark = ipc6.sockc.mark;
+ fl6.flowi6_uid = sk_uid(sk);
fl6.fl6_icmp_type = user_icmph.icmp6_type;
fl6.fl6_icmp_code = user_icmph.icmp6_code;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
- ipcm6_init_sk(&ipc6, np);
fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
dst = ip6_sk_dst_lookup_flow(sk, &fl6, daddr, false);
if (IS_ERR(dst))
return PTR_ERR(dst);
- rt = (struct rt6_info *) dst;
+ rt = dst_rt6_info(dst);
if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
- fl6.flowi6_oif = np->mcast_oif;
+ fl6.flowi6_oif = READ_ONCE(np->mcast_oif);
else if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->ucast_oif;
+ fl6.flowi6_oif = READ_ONCE(np->ucast_oif);
pfh.icmph.type = user_icmph.icmp6_type;
pfh.icmph.code = user_icmph.icmp6_code;
@@ -140,11 +168,12 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
pfh.wcheck = 0;
pfh.family = AF_INET6;
- ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+ if (ipc6.hlimit < 0)
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
lock_sock(sk);
err = ip6_append_data(sk, ping_getfrag, &pfh, len,
- 0, &ipc6, &fl6, rt,
+ sizeof(struct icmp6hdr), &ipc6, &fl6, rt,
MSG_DONTWAIT);
if (err) {
@@ -170,6 +199,7 @@ struct proto pingv6_prot = {
.owner = THIS_MODULE,
.init = ping_init_sock,
.close = ping_close,
+ .pre_connect = ping_v6_pre_connect,
.connect = ip6_datagram_connect_v6_only,
.disconnect = __udp_disconnect,
.setsockopt = ipv6_setsockopt,
@@ -178,10 +208,11 @@ struct proto pingv6_prot = {
.recvmsg = ping_recvmsg,
.bind = ping_bind,
.backlog_rcv = ping_queue_rcv_skb,
- .hash = ping_hash,
.unhash = ping_unhash,
.get_port = ping_get_port,
+ .put_port = ping_unhash,
.obj_size = sizeof(struct raw6_sock),
+ .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6),
};
EXPORT_SYMBOL_GPL(pingv6_prot);
@@ -205,7 +236,7 @@ static int ping_v6_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, IPV6_SEQ_DGRAM_HEADER);
} else {
int bucket = ((struct ping_iter_state *) seq->private)->bucket;
- struct inet_sock *inet = inet_sk(v);
+ struct inet_sock *inet = inet_sk((struct sock *)v);
__u16 srcp = ntohs(inet->inet_sport);
__u16 destp = ntohs(inet->inet_dport);
ip6_dgram_sock_seq_show(seq, v, srcp, destp, bucket);
@@ -228,7 +259,7 @@ static int __net_init ping_v6_proc_init_net(struct net *net)
return 0;
}
-static void __net_init ping_v6_proc_exit_net(struct net *net)
+static void __net_exit ping_v6_proc_exit_net(struct net *net)
{
remove_proc_entry("icmp6", net->proc_net);
}
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 2356b4af7309..73296f38c252 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -9,11 +10,6 @@
*
* Authors: David S. Miller (davem@caip.rutgers.edu)
* YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/socket.h>
#include <linux/net.h>
@@ -31,7 +27,7 @@
#include <net/ipv6.h>
#define MAX4(a, b, c, d) \
- max_t(u32, max_t(u32, a, b), max_t(u32, c, d))
+ MAX_T(u32, MAX_T(u32, a, b), MAX_T(u32, c, d))
#define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \
IPSTATS_MIB_MAX, ICMP_MIB_MAX)
@@ -48,8 +44,8 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "RAW6: inuse %d\n",
sock_prot_inuse_get(net, &rawv6_prot));
seq_printf(seq, "FRAG6: inuse %u memory %lu\n",
- atomic_read(&net->ipv6.frags.rhashtable.nelems),
- frag_mem_limit(&net->ipv6.frags));
+ atomic_read(&net->ipv6.fqdir->rhashtable.nelems),
+ frag_mem_limit(net->ipv6.fqdir));
return 0;
}
@@ -65,7 +61,7 @@ static const struct snmp_mib snmp6_ipstats_list[] = {
SNMP_MIB_ITEM("Ip6InDiscards", IPSTATS_MIB_INDISCARDS),
SNMP_MIB_ITEM("Ip6InDelivers", IPSTATS_MIB_INDELIVERS),
SNMP_MIB_ITEM("Ip6OutForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
- SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTPKTS),
+ SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTREQUESTS),
SNMP_MIB_ITEM("Ip6OutDiscards", IPSTATS_MIB_OUTDISCARDS),
SNMP_MIB_ITEM("Ip6OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
SNMP_MIB_ITEM("Ip6ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
@@ -88,7 +84,7 @@ static const struct snmp_mib snmp6_ipstats_list[] = {
SNMP_MIB_ITEM("Ip6InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS),
- SNMP_MIB_SENTINEL
+ SNMP_MIB_ITEM("Ip6OutTransmits", IPSTATS_MIB_OUTPKTS),
};
static const struct snmp_mib snmp6_icmp6_list[] = {
@@ -98,29 +94,10 @@ static const struct snmp_mib snmp6_icmp6_list[] = {
SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS),
SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS),
SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS),
- SNMP_MIB_SENTINEL
+/* ICMP6_MIB_RATELIMITHOST needs to be last, see snmp6_dev_seq_show(). */
+ SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST),
};
-/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */
-static const char *const icmp6type2name[256] = {
- [ICMPV6_DEST_UNREACH] = "DestUnreachs",
- [ICMPV6_PKT_TOOBIG] = "PktTooBigs",
- [ICMPV6_TIME_EXCEED] = "TimeExcds",
- [ICMPV6_PARAMPROB] = "ParmProblems",
- [ICMPV6_ECHO_REQUEST] = "Echos",
- [ICMPV6_ECHO_REPLY] = "EchoReplies",
- [ICMPV6_MGM_QUERY] = "GroupMembQueries",
- [ICMPV6_MGM_REPORT] = "GroupMembResponses",
- [ICMPV6_MGM_REDUCTION] = "GroupMembReductions",
- [ICMPV6_MLD2_REPORT] = "MLDv2Reports",
- [NDISC_ROUTER_ADVERTISEMENT] = "RouterAdvertisements",
- [NDISC_ROUTER_SOLICITATION] = "RouterSolicits",
- [NDISC_NEIGHBOUR_ADVERTISEMENT] = "NeighborAdvertisements",
- [NDISC_NEIGHBOUR_SOLICITATION] = "NeighborSolicits",
- [NDISC_REDIRECT] = "Redirects",
-};
-
-
static const struct snmp_mib snmp6_udp6_list[] = {
SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS),
SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS),
@@ -130,7 +107,7 @@ static const struct snmp_mib snmp6_udp6_list[] = {
SNMP_MIB_ITEM("Udp6SndbufErrors", UDP_MIB_SNDBUFERRORS),
SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS),
SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI),
- SNMP_MIB_SENTINEL
+ SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS),
};
static const struct snmp_mib snmp6_udplite6_list[] = {
@@ -141,7 +118,7 @@ static const struct snmp_mib snmp6_udplite6_list[] = {
SNMP_MIB_ITEM("UdpLite6RcvbufErrors", UDP_MIB_RCVBUFERRORS),
SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS),
SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS),
- SNMP_MIB_SENTINEL
+ SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS),
};
static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib)
@@ -151,11 +128,31 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib)
/* print by name -- deprecated items */
for (i = 0; i < ICMP6MSG_MIB_MAX; i++) {
+ const char *p = NULL;
int icmptype;
- const char *p;
+
+#define CASE(TYP, STR) case TYP: p = STR; break;
icmptype = i & 0xff;
- p = icmp6type2name[icmptype];
+ switch (icmptype) {
+/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */
+ CASE(ICMPV6_DEST_UNREACH, "DestUnreachs")
+ CASE(ICMPV6_PKT_TOOBIG, "PktTooBigs")
+ CASE(ICMPV6_TIME_EXCEED, "TimeExcds")
+ CASE(ICMPV6_PARAMPROB, "ParmProblems")
+ CASE(ICMPV6_ECHO_REQUEST, "Echos")
+ CASE(ICMPV6_ECHO_REPLY, "EchoReplies")
+ CASE(ICMPV6_MGM_QUERY, "GroupMembQueries")
+ CASE(ICMPV6_MGM_REPORT, "GroupMembResponses")
+ CASE(ICMPV6_MGM_REDUCTION, "GroupMembReductions")
+ CASE(ICMPV6_MLD2_REPORT, "MLDv2Reports")
+ CASE(NDISC_ROUTER_ADVERTISEMENT, "RouterAdvertisements")
+ CASE(NDISC_ROUTER_SOLICITATION, "RouterSolicits")
+ CASE(NDISC_NEIGHBOUR_ADVERTISEMENT, "NeighborAdvertisements")
+ CASE(NDISC_NEIGHBOUR_SOLICITATION, "NeighborSolicits")
+ CASE(NDISC_REDIRECT, "Redirects")
+ }
+#undef CASE
if (!p) /* don't print un-named types here */
continue;
snprintf(name, sizeof(name), "Icmp6%s%s",
@@ -182,35 +179,37 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib)
*/
static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib,
atomic_long_t *smib,
- const struct snmp_mib *itemlist)
+ const struct snmp_mib *itemlist,
+ int cnt)
{
unsigned long buff[SNMP_MIB_MAX];
int i;
if (pcpumib) {
- memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX);
+ memset(buff, 0, sizeof(unsigned long) * cnt);
- snmp_get_cpu_field_batch(buff, itemlist, pcpumib);
- for (i = 0; itemlist[i].name; i++)
+ snmp_get_cpu_field_batch_cnt(buff, itemlist, cnt, pcpumib);
+ for (i = 0; i < cnt; i++)
seq_printf(seq, "%-32s\t%lu\n",
itemlist[i].name, buff[i]);
} else {
- for (i = 0; itemlist[i].name; i++)
+ for (i = 0; i < cnt; i++)
seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name,
atomic_long_read(smib + itemlist[i].entry));
}
}
static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib,
- const struct snmp_mib *itemlist, size_t syncpoff)
+ const struct snmp_mib *itemlist,
+ int cnt, size_t syncpoff)
{
u64 buff64[SNMP_MIB_MAX];
int i;
- memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX);
+ memset(buff64, 0, sizeof(u64) * cnt);
- snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff);
- for (i = 0; itemlist[i].name; i++)
+ snmp_get_cpu_field64_batch_cnt(buff64, itemlist, cnt, mib, syncpoff);
+ for (i = 0; i < cnt; i++)
seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]);
}
@@ -219,14 +218,19 @@ static int snmp6_seq_show(struct seq_file *seq, void *v)
struct net *net = (struct net *)seq->private;
snmp6_seq_show_item64(seq, net->mib.ipv6_statistics,
- snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp));
+ snmp6_ipstats_list,
+ ARRAY_SIZE(snmp6_ipstats_list),
+ offsetof(struct ipstats_mib, syncp));
snmp6_seq_show_item(seq, net->mib.icmpv6_statistics,
- NULL, snmp6_icmp6_list);
+ NULL, snmp6_icmp6_list,
+ ARRAY_SIZE(snmp6_icmp6_list));
snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs);
snmp6_seq_show_item(seq, net->mib.udp_stats_in6,
- NULL, snmp6_udp6_list);
+ NULL, snmp6_udp6_list,
+ ARRAY_SIZE(snmp6_udp6_list));
snmp6_seq_show_item(seq, net->mib.udplite_stats_in6,
- NULL, snmp6_udplite6_list);
+ NULL, snmp6_udplite6_list,
+ ARRAY_SIZE(snmp6_udplite6_list));
return 0;
}
@@ -236,9 +240,14 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex);
snmp6_seq_show_item64(seq, idev->stats.ipv6,
- snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp));
+ snmp6_ipstats_list,
+ ARRAY_SIZE(snmp6_ipstats_list),
+ offsetof(struct ipstats_mib, syncp));
+
+ /* Per idev icmp stats do not have ICMP6_MIB_RATELIMITHOST */
snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs,
- snmp6_icmp6_list);
+ snmp6_icmp6_list, ARRAY_SIZE(snmp6_icmp6_list) - 1);
+
snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs);
return 0;
}
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c
index b5d54d4f995c..d4b1806bab1b 100644
--- a/net/ipv6/protocol.c
+++ b/net/ipv6/protocol.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
@@ -6,11 +7,6 @@
* PF_INET6 protocol dispatch tables.
*
* Authors: Pedro Roque <roque@di.fc.ul.pt>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/*
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 5e0efd3954e9..b4cd05dba9b6 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* RAW sockets for IPv6
* Linux INET6 implementation
@@ -11,11 +12,6 @@
* Hideaki YOSHIFUJI : sin6_scope_id support
* YOSHIFUJI,H.@USAGI : raw checksum (RFC2292(bis) compliance)
* Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/errno.h>
@@ -65,47 +61,30 @@
#define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */
-struct raw_hashinfo raw_v6_hashinfo = {
- .lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock),
-};
+struct raw_hashinfo raw_v6_hashinfo;
EXPORT_SYMBOL_GPL(raw_v6_hashinfo);
-struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
- unsigned short num, const struct in6_addr *loc_addr,
- const struct in6_addr *rmt_addr, int dif, int sdif)
+bool raw_v6_match(struct net *net, const struct sock *sk, unsigned short num,
+ const struct in6_addr *loc_addr,
+ const struct in6_addr *rmt_addr, int dif, int sdif)
{
- bool is_multicast = ipv6_addr_is_multicast(loc_addr);
-
- sk_for_each_from(sk)
- if (inet_sk(sk)->inet_num == num) {
-
- if (!net_eq(sock_net(sk), net))
- continue;
-
- if (!ipv6_addr_any(&sk->sk_v6_daddr) &&
- !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr))
- continue;
-
- if (sk->sk_bound_dev_if &&
- sk->sk_bound_dev_if != dif &&
- sk->sk_bound_dev_if != sdif)
- continue;
-
- if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
- if (ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr))
- goto found;
- if (is_multicast &&
- inet6_mc_check(sk, loc_addr, rmt_addr))
- goto found;
- continue;
- }
- goto found;
- }
- sk = NULL;
-found:
- return sk;
+ if (inet_sk(sk)->inet_num != num ||
+ !net_eq(sock_net(sk), net) ||
+ (!ipv6_addr_any(&sk->sk_v6_daddr) &&
+ !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
+ !raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if,
+ dif, sdif))
+ return false;
+
+ if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) ||
+ ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr) ||
+ (ipv6_addr_is_multicast(loc_addr) &&
+ inet6_mc_check(sk, loc_addr, rmt_addr)))
+ return true;
+
+ return false;
}
-EXPORT_SYMBOL_GPL(__raw_v6_lookup);
+EXPORT_SYMBOL_GPL(raw_v6_match);
/*
* 0 - deliver
@@ -161,30 +140,32 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister);
*/
static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
{
+ struct net *net = dev_net(skb->dev);
const struct in6_addr *saddr;
const struct in6_addr *daddr;
+ struct hlist_head *hlist;
struct sock *sk;
bool delivered = false;
__u8 hash;
- struct net *net;
saddr = &ipv6_hdr(skb)->saddr;
daddr = saddr + 1;
- hash = nexthdr & (RAW_HTABLE_SIZE - 1);
-
- read_lock(&raw_v6_hashinfo.lock);
- sk = sk_head(&raw_v6_hashinfo.ht[hash]);
-
- if (!sk)
- goto out;
+ hash = raw_hashfunc(net, nexthdr);
+ hlist = &raw_v6_hashinfo.ht[hash];
+ rcu_read_lock();
+ sk_for_each_rcu(sk, hlist) {
+ int filtered;
- net = dev_net(skb->dev);
- sk = __raw_v6_lookup(net, sk, nexthdr, daddr, saddr,
- inet6_iif(skb), inet6_sdif(skb));
+ if (!raw_v6_match(net, sk, nexthdr, daddr, saddr,
+ inet6_iif(skb), inet6_sdif(skb)))
+ continue;
- while (sk) {
- int filtered;
+ if (atomic_read(&sk->sk_rmem_alloc) >=
+ READ_ONCE(sk->sk_rcvbuf)) {
+ sk_drops_inc(sk);
+ continue;
+ }
delivered = true;
switch (nexthdr) {
@@ -219,32 +200,22 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
/* Not releasing hash table! */
- if (clone) {
- nf_reset(clone);
+ if (clone)
rawv6_rcv(sk, clone);
- }
}
- sk = __raw_v6_lookup(net, sk_next(sk), nexthdr, daddr, saddr,
- inet6_iif(skb), inet6_sdif(skb));
}
-out:
- read_unlock(&raw_v6_hashinfo.lock);
+ rcu_read_unlock();
return delivered;
}
bool raw6_local_deliver(struct sk_buff *skb, int nexthdr)
{
- struct sock *raw_sk;
-
- raw_sk = sk_head(&raw_v6_hashinfo.ht[nexthdr & (RAW_HTABLE_SIZE - 1)]);
- if (raw_sk && !ipv6_raw_deliver(skb, nexthdr))
- raw_sk = NULL;
-
- return raw_sk != NULL;
+ return ipv6_raw_deliver(skb, nexthdr);
}
/* This cleans up af_inet6 a bit. -DaveM */
-static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int rawv6_bind(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -288,7 +259,9 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
/* Binding to link-local address requires an interface */
if (!sk->sk_bound_dev_if)
goto out_unlock;
+ }
+ if (sk->sk_bound_dev_if) {
err = -ENODEV;
dev = dev_get_by_index_rcu(sock_net(sk),
sk->sk_bound_dev_if);
@@ -301,7 +274,7 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
*/
v4addr = LOOPBACK4_IPV6;
if (!(addr_type & IPV6_ADDR_MULTICAST) &&
- !sock_net(sk)->ipv6.sysctl.ip_nonlocal_bind) {
+ !ipv6_can_nonlocal_bind(sock_net(sk), inet)) {
err = -EADDRNOTAVAIL;
if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr,
dev, 0)) {
@@ -323,10 +296,9 @@ out:
}
static void rawv6_err(struct sock *sk, struct sk_buff *skb,
- struct inet6_skb_parm *opt,
- u8 type, u8 code, int offset, __be32 info)
+ u8 type, u8 code, int offset, __be32 info)
{
- struct inet_sock *inet = inet_sk(sk);
+ bool recverr = inet6_test_bit(RECVERR6, sk);
struct ipv6_pinfo *np = inet6_sk(sk);
int err;
int harderr;
@@ -336,73 +308,69 @@ static void rawv6_err(struct sock *sk, struct sk_buff *skb,
2. Socket is connected (otherwise the error indication
is useless without recverr and error is hard.
*/
- if (!np->recverr && sk->sk_state != TCP_ESTABLISHED)
+ if (!recverr && sk->sk_state != TCP_ESTABLISHED)
return;
harderr = icmpv6_err_convert(type, code, &err);
if (type == ICMPV6_PKT_TOOBIG) {
ip6_sk_update_pmtu(skb, sk, info);
- harderr = (np->pmtudisc == IPV6_PMTUDISC_DO);
+ harderr = (READ_ONCE(np->pmtudisc) == IPV6_PMTUDISC_DO);
}
if (type == NDISC_REDIRECT) {
ip6_sk_redirect(skb, sk);
return;
}
- if (np->recverr) {
+ if (recverr) {
u8 *payload = skb->data;
- if (!inet->hdrincl)
+ if (!inet_test_bit(HDRINCL, sk))
payload += offset;
ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload);
}
- if (np->recverr || harderr) {
+ if (recverr || harderr) {
sk->sk_err = err;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
}
}
void raw6_icmp_error(struct sk_buff *skb, int nexthdr,
u8 type, u8 code, int inner_offset, __be32 info)
{
+ struct net *net = dev_net(skb->dev);
+ struct hlist_head *hlist;
struct sock *sk;
int hash;
- const struct in6_addr *saddr, *daddr;
- struct net *net;
-
- hash = nexthdr & (RAW_HTABLE_SIZE - 1);
- read_lock(&raw_v6_hashinfo.lock);
- sk = sk_head(&raw_v6_hashinfo.ht[hash]);
- if (sk) {
+ hash = raw_hashfunc(net, nexthdr);
+ hlist = &raw_v6_hashinfo.ht[hash];
+ rcu_read_lock();
+ sk_for_each_rcu(sk, hlist) {
/* Note: ipv6_hdr(skb) != skb->data */
const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data;
- saddr = &ip6h->saddr;
- daddr = &ip6h->daddr;
- net = dev_net(skb->dev);
-
- while ((sk = __raw_v6_lookup(net, sk, nexthdr, saddr, daddr,
- inet6_iif(skb), inet6_iif(skb)))) {
- rawv6_err(sk, skb, NULL, type, code,
- inner_offset, info);
- sk = sk_next(sk);
- }
+
+ if (!raw_v6_match(net, sk, nexthdr, &ip6h->saddr, &ip6h->daddr,
+ inet6_iif(skb), inet6_iif(skb)))
+ continue;
+ rawv6_err(sk, skb, type, code, inner_offset, info);
}
- read_unlock(&raw_v6_hashinfo.lock);
+ rcu_read_unlock();
}
static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason reason;
+
if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) &&
skb_checksum_complete(skb)) {
- atomic_inc(&sk->sk_drops);
- kfree_skb(skb);
+ sk_drops_inc(sk);
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM);
return NET_RX_DROP;
}
/* Charge it to the socket. */
skb_dst_drop(skb);
- if (sock_queue_rcv_skb(sk, skb) < 0) {
- kfree_skb(skb);
+ if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
+ sk_skb_reason_drop(sk, skb, reason);
return NET_RX_DROP;
}
@@ -422,10 +390,11 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
struct raw6_sock *rp = raw6_sk(sk);
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
- atomic_inc(&sk->sk_drops);
- kfree_skb(skb);
+ sk_drops_inc(sk);
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY);
return NET_RX_DROP;
}
+ nf_reset_ct(skb);
if (!rp->checksum)
skb->ip_summed = CHECKSUM_UNNECESSARY;
@@ -444,10 +413,10 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
skb->len,
inet->inet_num, 0));
- if (inet->hdrincl) {
+ if (inet_test_bit(HDRINCL, sk)) {
if (skb_checksum_complete(skb)) {
- atomic_inc(&sk->sk_drops);
- kfree_skb(skb);
+ sk_drops_inc(sk);
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM);
return NET_RX_DROP;
}
}
@@ -463,7 +432,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
*/
static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int noblock, int flags, int *addr_len)
+ int flags, int *addr_len)
{
struct ipv6_pinfo *np = inet6_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
@@ -477,10 +446,10 @@ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (flags & MSG_ERRQUEUE)
return ipv6_recv_error(sk, msg, len, addr_len);
- if (np->rxpmtu && np->rxopt.bits.rxpmtu)
+ if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu))
return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto out;
@@ -515,7 +484,7 @@ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
*addr_len = sizeof(*sin6);
}
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
if (np->rxopt.all)
ip6_datagram_recv_ctl(sk, msg, skb);
@@ -542,6 +511,7 @@ csum_copy_err:
static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
struct raw6_sock *rp)
{
+ struct ipv6_txoptions *opt;
struct sk_buff *skb;
int err = 0;
int offset;
@@ -559,6 +529,9 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
offset = rp->offset;
total_len = inet_sk(sk)->cork.base.length;
+ opt = inet6_sk(sk)->cork.opt;
+ total_len -= opt ? opt->opt_flen : 0;
+
if (offset >= total_len - 1) {
err = -EINVAL;
ip6_flush_pending_frames(sk);
@@ -622,12 +595,11 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
struct flowi6 *fl6, struct dst_entry **dstp,
unsigned int flags, const struct sockcm_cookie *sockc)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
struct net *net = sock_net(sk);
struct ipv6hdr *iph;
struct sk_buff *skb;
int err;
- struct rt6_info *rt = (struct rt6_info *)*dstp;
+ struct rt6_info *rt = dst_rt6_info(*dstp);
int hlen = LL_RESERVED_SPACE(rt->dst.dev);
int tlen = rt->dst.dev->needed_tailroom;
@@ -648,9 +620,9 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
skb_reserve(skb, hlen);
skb->protocol = htons(ETH_P_IPV6);
- skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
- skb->tstamp = sockc->transmit_time;
+ skb->priority = sockc->priority;
+ skb->mark = sockc->mark;
+ skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid);
skb_put(skb, length);
skb_reset_network_header(skb);
@@ -658,6 +630,8 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
skb->ip_summed = CHECKSUM_NONE;
+ skb_setup_tx_timestamp(skb, sockc);
+
if (flags & MSG_CONFIRM)
skb_set_dst_pending_confirm(skb, 1);
@@ -684,7 +658,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
* have been queued for deletion.
*/
rcu_read_lock();
- IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
+ IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb,
NULL, rt->dst.dev, dst_output);
if (err > 0)
@@ -701,7 +675,7 @@ out:
error:
IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
error_check:
- if (err == -ENOBUFS && !np->recverr)
+ if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk))
err = 0;
return err;
}
@@ -747,7 +721,7 @@ static int raw6_getfrag(void *from, char *to, int offset, int len, int odd,
skb->csum = csum_block_add(
skb->csum,
csum_partial_copy_nocheck(rfv->c + offset,
- to, copy, 0),
+ to, copy),
odd);
odd = 0;
@@ -780,6 +754,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct flowi6 fl6;
struct ipcm6_cookie ipc6;
int addr_len = msg->msg_namelen;
+ int hdrincl;
u16 proto;
int err;
@@ -793,16 +768,17 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
+ hdrincl = inet_test_bit(HDRINCL, sk);
+
+ ipcm6_init_sk(&ipc6, sk);
+
/*
* Get and verify the address.
*/
memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_mark = sk->sk_mark;
- fl6.flowi6_uid = sk->sk_uid;
-
- ipcm6_init(&ipc6);
- ipc6.sockc.tsflags = sk->sk_tsflags;
+ fl6.flowi6_mark = ipc6.sockc.mark;
+ fl6.flowi6_uid = sk_uid(sk);
if (sin6) {
if (addr_len < SIN6_LEN_RFC2133)
@@ -816,18 +792,19 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (!proto)
proto = inet->inet_num;
- else if (proto != inet->inet_num)
+ else if (proto != inet->inet_num &&
+ inet->inet_num != IPPROTO_RAW)
return -EINVAL;
if (proto > 255)
return -EINVAL;
daddr = &sin6->sin6_addr;
- if (np->sndflow) {
+ if (inet6_test_bit(SNDFLOW, sk)) {
fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (!flowlabel)
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
}
@@ -869,7 +846,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (!flowlabel)
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
if (!(opt->opt_nflen|opt->opt_flen))
@@ -884,11 +861,15 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
opt = ipv6_fixup_options(&opt_space, opt);
fl6.flowi6_proto = proto;
- rfv.msg = msg;
- rfv.hlen = 0;
- err = rawv6_probe_proto_opt(&rfv, &fl6);
- if (err)
- goto out;
+ fl6.flowi6_mark = ipc6.sockc.mark;
+
+ if (!hdrincl) {
+ rfv.msg = msg;
+ rfv.hlen = 0;
+ err = rawv6_probe_proto_opt(&rfv, &fl6);
+ if (err)
+ goto out;
+ }
if (!ipv6_addr_any(daddr))
fl6.daddr = *daddr;
@@ -900,20 +881,17 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
final_p = fl6_update_dst(&fl6, opt, &final);
if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
- fl6.flowi6_oif = np->mcast_oif;
+ fl6.flowi6_oif = READ_ONCE(np->mcast_oif);
else if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->ucast_oif;
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ fl6.flowi6_oif = READ_ONCE(np->ucast_oif);
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
- if (inet->hdrincl)
+ if (hdrincl)
fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH;
- if (ipc6.tclass < 0)
- ipc6.tclass = np->tclass;
-
fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto out;
@@ -921,21 +899,18 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (ipc6.hlimit < 0)
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
- if (ipc6.dontfrag < 0)
- ipc6.dontfrag = np->dontfrag;
-
if (msg->msg_flags&MSG_CONFIRM)
goto do_confirm;
back_from_confirm:
- if (inet->hdrincl)
+ if (hdrincl)
err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst,
msg->msg_flags, &ipc6.sockc);
else {
ipc6.opt = opt;
lock_sock(sk);
err = ip6_append_data(sk, raw6_getfrag, &rfv,
- len, 0, &ipc6, &fl6, (struct rt6_info *)dst,
+ len, 0, &ipc6, &fl6, dst_rt6_info(dst),
msg->msg_flags);
if (err)
@@ -959,14 +934,14 @@ do_confirm:
goto done;
}
-static int rawv6_seticmpfilter(struct sock *sk, int level, int optname,
- char __user *optval, int optlen)
+static int rawv6_seticmpfilter(struct sock *sk, int optname,
+ sockptr_t optval, int optlen)
{
switch (optname) {
case ICMPV6_FILTER:
if (optlen > sizeof(struct icmp6_filter))
optlen = sizeof(struct icmp6_filter);
- if (copy_from_user(&raw6_sk(sk)->filter, optval, optlen))
+ if (copy_from_sockptr(&raw6_sk(sk)->filter, optval, optlen))
return -EFAULT;
return 0;
default:
@@ -976,7 +951,7 @@ static int rawv6_seticmpfilter(struct sock *sk, int level, int optname,
return 0;
}
-static int rawv6_geticmpfilter(struct sock *sk, int level, int optname,
+static int rawv6_geticmpfilter(struct sock *sk, int optname,
char __user *optval, int __user *optlen)
{
int len;
@@ -1003,19 +978,22 @@ static int rawv6_geticmpfilter(struct sock *sk, int level, int optname,
static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct raw6_sock *rp = raw6_sk(sk);
int val;
- if (get_user(val, (int __user *)optval))
+ if (optlen < sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
switch (optname) {
case IPV6_HDRINCL:
if (sk->sk_type != SOCK_RAW)
return -EINVAL;
- inet_sk(sk)->hdrincl = !!val;
+ inet_assign_bit(HDRINCL, sk, val);
return 0;
case IPV6_CHECKSUM:
if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 &&
@@ -1050,7 +1028,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
}
static int rawv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
switch (level) {
case SOL_RAW:
@@ -1059,12 +1037,12 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname,
case SOL_ICMPV6:
if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
return -EOPNOTSUPP;
- return rawv6_seticmpfilter(sk, level, optname, optval, optlen);
+ return rawv6_seticmpfilter(sk, optname, optval, optlen);
case SOL_IPV6:
if (optname == IPV6_CHECKSUM ||
optname == IPV6_HDRINCL)
break;
- /* fall through */
+ fallthrough;
default:
return ipv6_setsockopt(sk, level, optname, optval, optlen);
}
@@ -1072,30 +1050,6 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname,
return do_rawv6_setsockopt(sk, level, optname, optval, optlen);
}
-#ifdef CONFIG_COMPAT
-static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- switch (level) {
- case SOL_RAW:
- break;
- case SOL_ICMPV6:
- if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
- return -EOPNOTSUPP;
- return rawv6_seticmpfilter(sk, level, optname, optval, optlen);
- case SOL_IPV6:
- if (optname == IPV6_CHECKSUM ||
- optname == IPV6_HDRINCL)
- break;
- /* fall through */
- default:
- return compat_ipv6_setsockopt(sk, level, optname,
- optval, optlen);
- }
- return do_rawv6_setsockopt(sk, level, optname, optval, optlen);
-}
-#endif
-
static int do_rawv6_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -1107,7 +1061,7 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname,
switch (optname) {
case IPV6_HDRINCL:
- val = inet_sk(sk)->hdrincl;
+ val = inet_test_bit(HDRINCL, sk);
break;
case IPV6_CHECKSUM:
/*
@@ -1144,12 +1098,12 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname,
case SOL_ICMPV6:
if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
return -EOPNOTSUPP;
- return rawv6_geticmpfilter(sk, level, optname, optval, optlen);
+ return rawv6_geticmpfilter(sk, optname, optval, optlen);
case SOL_IPV6:
if (optname == IPV6_CHECKSUM ||
optname == IPV6_HDRINCL)
break;
- /* fall through */
+ fallthrough;
default:
return ipv6_getsockopt(sk, level, optname, optval, optlen);
}
@@ -1157,53 +1111,29 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname,
return do_rawv6_getsockopt(sk, level, optname, optval, optlen);
}
-#ifdef CONFIG_COMPAT
-static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- switch (level) {
- case SOL_RAW:
- break;
- case SOL_ICMPV6:
- if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
- return -EOPNOTSUPP;
- return rawv6_geticmpfilter(sk, level, optname, optval, optlen);
- case SOL_IPV6:
- if (optname == IPV6_CHECKSUM ||
- optname == IPV6_HDRINCL)
- break;
- /* fall through */
- default:
- return compat_ipv6_getsockopt(sk, level, optname,
- optval, optlen);
- }
- return do_rawv6_getsockopt(sk, level, optname, optval, optlen);
-}
-#endif
-
-static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg)
+static int rawv6_ioctl(struct sock *sk, int cmd, int *karg)
{
switch (cmd) {
case SIOCOUTQ: {
- int amount = sk_wmem_alloc_get(sk);
-
- return put_user(amount, (int __user *)arg);
+ *karg = sk_wmem_alloc_get(sk);
+ return 0;
}
case SIOCINQ: {
struct sk_buff *skb;
- int amount = 0;
spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
if (skb)
- amount = skb->len;
+ *karg = skb->len;
+ else
+ *karg = 0;
spin_unlock_bh(&sk->sk_receive_queue.lock);
- return put_user(amount, (int __user *)arg);
+ return 0;
}
default:
#ifdef CONFIG_IPV6_MROUTE
- return ip6mr_ioctl(sk, cmd, (void __user *)arg);
+ return ip6mr_ioctl(sk, cmd, karg);
#else
return -ENOIOCTLCMD;
#endif
@@ -1240,14 +1170,13 @@ static void raw6_destroy(struct sock *sk)
lock_sock(sk);
ip6_flush_pending_frames(sk);
release_sock(sk);
-
- inet6_destroy_sock(sk);
}
static int rawv6_init_sk(struct sock *sk)
{
struct raw6_sock *rp = raw6_sk(sk);
+ sk->sk_drop_counters = &rp->drop_counters;
switch (inet_sk(sk)->inet_num) {
case IPPROTO_ICMPV6:
rp->checksum = 1;
@@ -1281,12 +1210,11 @@ struct proto rawv6_prot = {
.hash = raw_hash_sk,
.unhash = raw_unhash_sk,
.obj_size = sizeof(struct raw6_sock),
+ .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6),
.useroffset = offsetof(struct raw6_sock, filter),
.usersize = sizeof_field(struct raw6_sock, filter),
.h.raw_hash = &raw_v6_hashinfo,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_rawv6_setsockopt,
- .compat_getsockopt = compat_rawv6_getsockopt,
.compat_ioctl = compat_rawv6_ioctl,
#endif
.diag_destroy = raw_abort,
@@ -1355,6 +1283,7 @@ const struct proto_ops inet6_sockraw_ops = {
.getname = inet6_getname,
.poll = datagram_poll, /* ok */
.ioctl = inet6_ioctl, /* must change */
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen, /* ok */
.shutdown = inet_shutdown, /* ok */
.setsockopt = sock_common_setsockopt, /* ok */
@@ -1362,10 +1291,8 @@ const struct proto_ops inet6_sockraw_ops = {
.sendmsg = inet_sendmsg, /* ok */
.recvmsg = sock_common_recvmsg, /* ok */
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet6_compat_ioctl,
#endif
};
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 5c5b4f79296e..25ec8001898d 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 fragment reassembly
* Linux INET6 implementation
@@ -6,11 +7,6 @@
* Pedro Roque <roque@di.fc.ul.pt>
*
* Based on: net/ipv4/ip_fragment.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/*
@@ -46,6 +42,8 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -69,19 +67,18 @@ static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
static struct inet_frags ip6_frags;
-static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
- struct net_device *dev);
+static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs);
static void ip6_frag_expire(struct timer_list *t)
{
- struct inet_frag_queue *frag = from_timer(frag, t, timer);
+ struct inet_frag_queue *frag = timer_container_of(frag, t, timer);
struct frag_queue *fq;
- struct net *net;
fq = container_of(frag, struct frag_queue, q);
- net = container_of(fq->q.net, struct net, ipv6.frags);
- ip6frag_expire_frag_queue(net, fq);
+ ip6frag_expire_frag_queue(fq->q.fqdir->net, fq);
}
static struct frag_queue *
@@ -100,32 +97,41 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
IPV6_ADDR_LINKLOCAL)))
key.iif = 0;
- q = inet_frag_find(&net->ipv6.frags, &key);
+ q = inet_frag_find(net->ipv6.fqdir, &key);
if (!q)
return NULL;
return container_of(q, struct frag_queue, q);
}
-static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
+static int ip6_frag_queue(struct net *net,
+ struct frag_queue *fq, struct sk_buff *skb,
struct frag_hdr *fhdr, int nhoff,
- u32 *prob_offset)
+ u32 *prob_offset, int *refs)
{
- struct sk_buff *prev, *next;
- struct net_device *dev;
int offset, end, fragsize;
- struct net *net = dev_net(skb_dst(skb)->dev);
+ struct sk_buff *prev_tail;
+ struct net_device *dev;
+ int err = -ENOENT;
+ SKB_DR(reason);
u8 ecn;
- if (fq->q.flags & INET_FRAG_COMPLETE)
+ /* If reassembly is already done, @skb must be a duplicate frag. */
+ if (fq->q.flags & INET_FRAG_COMPLETE) {
+ SKB_DR_SET(reason, DUP_FRAG);
goto err;
+ }
+ err = -EINVAL;
offset = ntohs(fhdr->frag_off) & ~0x7;
end = offset + (ntohs(ipv6_hdr(skb)->payload_len) -
((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
if ((unsigned int)end > IPV6_MAXPLEN) {
*prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb);
+ /* note that if prob_offset is set, the skb is freed elsewhere,
+ * we do not free it here.
+ */
return -1;
}
@@ -145,7 +151,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
*/
if (end < fq->q.len ||
((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len))
- goto err;
+ goto discard_fq;
fq->q.flags |= INET_FRAG_LAST_IN;
fq->q.len = end;
} else {
@@ -162,75 +168,41 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
if (end > fq->q.len) {
/* Some bits beyond end -> corruption. */
if (fq->q.flags & INET_FRAG_LAST_IN)
- goto err;
+ goto discard_fq;
fq->q.len = end;
}
}
if (end == offset)
- goto err;
+ goto discard_fq;
+ err = -ENOMEM;
/* Point into the IP datagram 'data' part. */
if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data))
- goto err;
-
- if (pskb_trim_rcsum(skb, end - offset))
- goto err;
-
- /* Find out which fragments are in front and at the back of us
- * in the chain of fragments so far. We must know where to put
- * this fragment, right?
- */
- prev = fq->q.fragments_tail;
- if (!prev || prev->ip_defrag_offset < offset) {
- next = NULL;
- goto found;
- }
- prev = NULL;
- for (next = fq->q.fragments; next != NULL; next = next->next) {
- if (next->ip_defrag_offset >= offset)
- break; /* bingo! */
- prev = next;
- }
-
-found:
- /* RFC5722, Section 4, amended by Errata ID : 3089
- * When reassembling an IPv6 datagram, if
- * one or more its constituent fragments is determined to be an
- * overlapping fragment, the entire datagram (and any constituent
- * fragments) MUST be silently discarded.
- */
-
- /* Check for overlap with preceding fragment. */
- if (prev &&
- (prev->ip_defrag_offset + prev->len) > offset)
goto discard_fq;
- /* Look for overlap with succeeding segment. */
- if (next && next->ip_defrag_offset < end)
+ err = pskb_trim_rcsum(skb, end - offset);
+ if (err)
goto discard_fq;
- /* Note : skb->ip_defrag_offset and skb->dev share the same location */
+ /* Note : skb->rbnode and skb->dev share the same location. */
dev = skb->dev;
- if (dev)
- fq->iif = dev->ifindex;
/* Makes sure compiler wont do silly aliasing games */
barrier();
- skb->ip_defrag_offset = offset;
- /* Insert this fragment in the chain of fragments. */
- skb->next = next;
- if (!next)
- fq->q.fragments_tail = skb;
- if (prev)
- prev->next = skb;
- else
- fq->q.fragments = skb;
+ prev_tail = fq->q.fragments_tail;
+ err = inet_frag_queue_insert(&fq->q, skb, offset, end);
+ if (err)
+ goto insert_error;
+
+ if (dev)
+ fq->iif = dev->ifindex;
fq->q.stamp = skb->tstamp;
+ fq->q.tstamp_type = skb->tstamp_type;
fq->q.meat += skb->len;
fq->ecn |= ecn;
- add_frag_mem_limit(fq->q.net, skb->truesize);
+ add_frag_mem_limit(fq->q.fqdir, skb->truesize);
fragsize = -skb_network_offset(skb) + skb->len;
if (fragsize > fq->q.max_size)
@@ -246,167 +218,97 @@ found:
if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
fq->q.meat == fq->q.len) {
- int res;
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- res = ip6_frag_reasm(fq, prev, dev);
+ err = ip6_frag_reasm(fq, skb, prev_tail, dev, refs);
skb->_skb_refdst = orefdst;
- return res;
+ return err;
}
skb_dst_drop(skb);
- return -1;
+ return -EINPROGRESS;
+insert_error:
+ if (err == IPFRAG_DUP) {
+ SKB_DR_SET(reason, DUP_FRAG);
+ err = -EINVAL;
+ goto err;
+ }
+ err = -EINVAL;
+ __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
+ IPSTATS_MIB_REASM_OVERLAPS);
discard_fq:
- inet_frag_kill(&fq->q);
-err:
+ inet_frag_kill(&fq->q, refs);
__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
IPSTATS_MIB_REASMFAILS);
- kfree_skb(skb);
- return -1;
+err:
+ kfree_skb_reason(skb, reason);
+ return err;
}
/*
* Check if this packet is complete.
- * Returns NULL on failure by any reason, and pointer
- * to current nexthdr field in reassembled frame.
*
* It is called with locked fq, and caller must check that
* queue is eligible for reassembly i.e. it is not COMPLETE,
* the last and the first frames arrived and all the bits are here.
*/
-static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
- struct net_device *dev)
+static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb,
+ struct sk_buff *prev_tail, struct net_device *dev,
+ int *refs)
{
- struct net *net = container_of(fq->q.net, struct net, ipv6.frags);
- struct sk_buff *fp, *head = fq->q.fragments;
- int payload_len;
+ struct net *net = fq->q.fqdir->net;
unsigned int nhoff;
- int sum_truesize;
+ void *reasm_data;
+ int payload_len;
u8 ecn;
- inet_frag_kill(&fq->q);
+ inet_frag_kill(&fq->q, refs);
ecn = ip_frag_ecn_table[fq->ecn];
if (unlikely(ecn == 0xff))
goto out_fail;
- /* Make the one we just received the head. */
- if (prev) {
- head = prev->next;
- fp = skb_clone(head, GFP_ATOMIC);
-
- if (!fp)
- goto out_oom;
-
- fp->next = head->next;
- if (!fp->next)
- fq->q.fragments_tail = fp;
- prev->next = fp;
-
- skb_morph(head, fq->q.fragments);
- head->next = fq->q.fragments->next;
-
- consume_skb(fq->q.fragments);
- fq->q.fragments = head;
- }
-
- WARN_ON(head == NULL);
- WARN_ON(head->ip_defrag_offset != 0);
+ reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail);
+ if (!reasm_data)
+ goto out_oom;
- /* Unfragmented part is taken from the first segment. */
- payload_len = ((head->data - skb_network_header(head)) -
+ payload_len = -skb_network_offset(skb) -
sizeof(struct ipv6hdr) + fq->q.len -
- sizeof(struct frag_hdr));
+ sizeof(struct frag_hdr);
if (payload_len > IPV6_MAXPLEN)
goto out_oversize;
- /* Head of list must not be cloned. */
- if (skb_unclone(head, GFP_ATOMIC))
- goto out_oom;
-
- /* If the first fragment is fragmented itself, we split
- * it to two chunks: the first with data and paged part
- * and the second, holding only fragments. */
- if (skb_has_frag_list(head)) {
- struct sk_buff *clone;
- int i, plen = 0;
-
- clone = alloc_skb(0, GFP_ATOMIC);
- if (!clone)
- goto out_oom;
- clone->next = head->next;
- head->next = clone;
- skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
- skb_frag_list_init(head);
- for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
- plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
- clone->len = clone->data_len = head->data_len - plen;
- head->data_len -= clone->len;
- head->len -= clone->len;
- clone->csum = 0;
- clone->ip_summed = head->ip_summed;
- add_frag_mem_limit(fq->q.net, clone->truesize);
- }
-
/* We have to remove fragment header from datagram and to relocate
* header in order to calculate ICV correctly. */
nhoff = fq->nhoffset;
- skb_network_header(head)[nhoff] = skb_transport_header(head)[0];
- memmove(head->head + sizeof(struct frag_hdr), head->head,
- (head->data - head->head) - sizeof(struct frag_hdr));
- if (skb_mac_header_was_set(head))
- head->mac_header += sizeof(struct frag_hdr);
- head->network_header += sizeof(struct frag_hdr);
-
- skb_reset_transport_header(head);
- skb_push(head, head->data - skb_network_header(head));
-
- sum_truesize = head->truesize;
- for (fp = head->next; fp;) {
- bool headstolen;
- int delta;
- struct sk_buff *next = fp->next;
-
- sum_truesize += fp->truesize;
- if (head->ip_summed != fp->ip_summed)
- head->ip_summed = CHECKSUM_NONE;
- else if (head->ip_summed == CHECKSUM_COMPLETE)
- head->csum = csum_add(head->csum, fp->csum);
-
- if (skb_try_coalesce(head, fp, &headstolen, &delta)) {
- kfree_skb_partial(fp, headstolen);
- } else {
- if (!skb_shinfo(head)->frag_list)
- skb_shinfo(head)->frag_list = fp;
- head->data_len += fp->len;
- head->len += fp->len;
- head->truesize += fp->truesize;
- }
- fp = next;
- }
- sub_frag_mem_limit(fq->q.net, sum_truesize);
+ skb_network_header(skb)[nhoff] = skb_transport_header(skb)[0];
+ memmove(skb->head + sizeof(struct frag_hdr), skb->head,
+ (skb->data - skb->head) - sizeof(struct frag_hdr));
+ if (skb_mac_header_was_set(skb))
+ skb->mac_header += sizeof(struct frag_hdr);
+ skb->network_header += sizeof(struct frag_hdr);
+
+ skb_reset_transport_header(skb);
- head->next = NULL;
- head->dev = dev;
- head->tstamp = fq->q.stamp;
- ipv6_hdr(head)->payload_len = htons(payload_len);
- ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn);
- IP6CB(head)->nhoff = nhoff;
- IP6CB(head)->flags |= IP6SKB_FRAGMENTED;
- IP6CB(head)->frag_max_size = fq->q.max_size;
+ inet_frag_reasm_finish(&fq->q, skb, reasm_data, true);
+
+ skb->dev = dev;
+ ipv6_hdr(skb)->payload_len = htons(payload_len);
+ ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn);
+ IP6CB(skb)->nhoff = nhoff;
+ IP6CB(skb)->flags |= IP6SKB_FRAGMENTED;
+ IP6CB(skb)->frag_max_size = fq->q.max_size;
/* Yes, and fold redundant checksum back. 8) */
- skb_postpush_rcsum(head, skb_network_header(head),
- skb_network_header_len(head));
+ skb_postpush_rcsum(skb, skb_network_header(skb),
+ skb_network_header_len(skb));
- rcu_read_lock();
- __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
- rcu_read_unlock();
- fq->q.fragments = NULL;
+ __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMOKS);
fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL;
+ fq->q.last_run_head = NULL;
return 1;
out_oversize:
@@ -415,18 +317,18 @@ out_oversize:
out_oom:
net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n");
out_fail:
- rcu_read_lock();
- __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
- rcu_read_unlock();
+ __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS);
+ inet_frag_kill(&fq->q, refs);
return -1;
}
static int ipv6_frag_rcv(struct sk_buff *skb)
{
+ const struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct net *net = skb_dst_dev_net(skb);
struct frag_hdr *fhdr;
struct frag_queue *fq;
- const struct ipv6hdr *hdr = ipv6_hdr(skb);
- struct net *net = dev_net(skb_dst(skb)->dev);
+ u8 nexthdr;
int iif;
if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
@@ -445,7 +347,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
hdr = ipv6_hdr(skb);
fhdr = (struct frag_hdr *)skb_transport_header(skb);
- if (!(fhdr->frag_off & htons(0xFFF9))) {
+ if (!(fhdr->frag_off & htons(IP6_OFFSET | IP6_MF))) {
/* It is not a fragmented frame */
skb->transport_header += sizeof(struct frag_hdr);
__IP6_INC_STATS(net,
@@ -453,34 +355,50 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
IP6CB(skb)->nhoff = (u8 *)fhdr - skb_network_header(skb);
IP6CB(skb)->flags |= IP6SKB_FRAGMENTED;
+ IP6CB(skb)->frag_max_size = ntohs(hdr->payload_len) +
+ sizeof(struct ipv6hdr);
return 1;
}
- if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
- fhdr->frag_off & htons(IP6_MF))
- goto fail_hdr;
+ /* RFC 8200, Section 4.5 Fragment Header:
+ * If the first fragment does not include all headers through an
+ * Upper-Layer header, then that fragment should be discarded and
+ * an ICMP Parameter Problem, Code 3, message should be sent to
+ * the source of the fragment, with the Pointer field set to zero.
+ */
+ nexthdr = hdr->nexthdr;
+ if (ipv6frag_thdr_truncated(skb, skb_network_offset(skb) + sizeof(struct ipv6hdr), &nexthdr)) {
+ __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
+ IPSTATS_MIB_INHDRERRORS);
+ icmpv6_param_prob(skb, ICMPV6_HDR_INCOMP, 0);
+ return -1;
+ }
iif = skb->dev ? skb->dev->ifindex : 0;
+ rcu_read_lock();
fq = fq_find(net, fhdr->identification, hdr, iif);
if (fq) {
u32 prob_offset = 0;
- int ret;
+ int ret, refs = 0;
spin_lock(&fq->q.lock);
fq->iif = iif;
- ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff,
- &prob_offset);
+ ret = ip6_frag_queue(net, fq, skb, fhdr, IP6CB(skb)->nhoff,
+ &prob_offset, &refs);
spin_unlock(&fq->q.lock);
- inet_frag_put(&fq->q);
+ rcu_read_unlock();
+ inet_frag_putn(&fq->q, refs);
if (prob_offset) {
__IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev),
IPSTATS_MIB_INHDRERRORS);
+ /* icmpv6_param_prob() calls kfree_skb(skb) */
icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset);
}
return ret;
}
+ rcu_read_unlock();
__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS);
kfree_skb(skb);
@@ -503,28 +421,22 @@ static const struct inet6_protocol frag_protocol = {
static struct ctl_table ip6_frags_ns_ctl_table[] = {
{
.procname = "ip6frag_high_thresh",
- .data = &init_net.ipv6.frags.high_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra1 = &init_net.ipv6.frags.low_thresh
},
{
.procname = "ip6frag_low_thresh",
- .data = &init_net.ipv6.frags.low_thresh,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
- .extra2 = &init_net.ipv6.frags.high_thresh
},
{
.procname = "ip6frag_time",
- .data = &init_net.ipv6.frags.timeout,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- { }
};
/* secret interval has been deprecated */
@@ -537,7 +449,6 @@ static struct ctl_table ip6_frags_ctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- { }
};
static int __net_init ip6_frags_ns_sysctl_register(struct net *net)
@@ -551,15 +462,15 @@ static int __net_init ip6_frags_ns_sysctl_register(struct net *net)
if (!table)
goto err_alloc;
- table[0].data = &net->ipv6.frags.high_thresh;
- table[0].extra1 = &net->ipv6.frags.low_thresh;
- table[0].extra2 = &init_net.ipv6.frags.high_thresh;
- table[1].data = &net->ipv6.frags.low_thresh;
- table[1].extra2 = &net->ipv6.frags.high_thresh;
- table[2].data = &net->ipv6.frags.timeout;
}
-
- hdr = register_net_sysctl(net, "net/ipv6", table);
+ table[0].data = &net->ipv6.fqdir->high_thresh;
+ table[0].extra1 = &net->ipv6.fqdir->low_thresh;
+ table[1].data = &net->ipv6.fqdir->low_thresh;
+ table[1].extra2 = &net->ipv6.fqdir->high_thresh;
+ table[2].data = &net->ipv6.fqdir->timeout;
+
+ hdr = register_net_sysctl_sz(net, "net/ipv6", table,
+ ARRAY_SIZE(ip6_frags_ns_ctl_table));
if (!hdr)
goto err_reg;
@@ -575,7 +486,7 @@ err_alloc:
static void __net_exit ip6_frags_ns_sysctl_unregister(struct net *net)
{
- struct ctl_table *table;
+ const struct ctl_table *table;
table = net->ipv6.sysctl.frags_hdr->ctl_table_arg;
unregister_net_sysctl_table(net->ipv6.sysctl.frags_hdr);
@@ -620,30 +531,35 @@ static int __net_init ipv6_frags_init_net(struct net *net)
{
int res;
- net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
- net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
- net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
- net->ipv6.frags.f = &ip6_frags;
-
- res = inet_frags_init_net(&net->ipv6.frags);
+ res = fqdir_init(&net->ipv6.fqdir, &ip6_frags, net);
if (res < 0)
return res;
+ net->ipv6.fqdir->high_thresh = IPV6_FRAG_HIGH_THRESH;
+ net->ipv6.fqdir->low_thresh = IPV6_FRAG_LOW_THRESH;
+ net->ipv6.fqdir->timeout = IPV6_FRAG_TIMEOUT;
+
res = ip6_frags_ns_sysctl_register(net);
if (res < 0)
- inet_frags_exit_net(&net->ipv6.frags);
+ fqdir_exit(net->ipv6.fqdir);
return res;
}
+static void __net_exit ipv6_frags_pre_exit_net(struct net *net)
+{
+ fqdir_pre_exit(net->ipv6.fqdir);
+}
+
static void __net_exit ipv6_frags_exit_net(struct net *net)
{
ip6_frags_ns_sysctl_unregister(net);
- inet_frags_exit_net(&net->ipv6.frags);
+ fqdir_exit(net->ipv6.fqdir);
}
static struct pernet_operations ip6_frags_ops = {
- .init = ipv6_frags_init_net,
- .exit = ipv6_frags_exit_net,
+ .init = ipv6_frags_init_net,
+ .pre_exit = ipv6_frags_pre_exit_net,
+ .exit = ipv6_frags_exit_net,
};
static const struct rhashtable_params ip6_rhash_params = {
@@ -694,8 +610,8 @@ err_protocol:
void ipv6_frag_exit(void)
{
- inet_frags_fini(&ip6_frags);
ip6_frags_sysctl_unregister();
unregister_pernet_subsys(&ip6_frags_ops);
inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+ inet_frags_fini(&ip6_frags);
}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a366c05a239d..aee6a10b112a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1,14 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Linux INET6 implementation
* FIB front-end.
*
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
/* Changes:
@@ -45,6 +41,7 @@
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
+#include <linux/siphash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
@@ -59,12 +56,13 @@
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
-#include <net/nexthop.h>
+#include <net/rtnh.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <net/ip.h>
#include <linux/uaccess.h>
+#include <linux/btf_ids.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
@@ -84,14 +82,17 @@ enum rt6_nud_state {
RT6_NUD_SUCCEED = 1
};
-static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
+INDIRECT_CALLABLE_SCOPE
+struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
-static unsigned int ip6_mtu(const struct dst_entry *dst);
-static struct dst_entry *ip6_negative_advice(struct dst_entry *);
+INDIRECT_CALLABLE_SCOPE
+unsigned int ip6_mtu(const struct dst_entry *dst);
+static void ip6_negative_advice(struct sock *sk,
+ struct dst_entry *dst);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
- struct net_device *dev, int how);
-static int ip6_dst_gc(struct dst_ops *ops);
+ struct net_device *dev);
+static void ip6_dst_gc(struct dst_ops *ops);
static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
@@ -99,19 +100,21 @@ static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu);
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
-static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
-static size_t rt6_nlmsg_size(struct fib6_info *rt);
+static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
+ int strict);
+static size_t rt6_nlmsg_size(struct fib6_info *f6i);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
struct fib6_info *rt, struct dst_entry *dst,
struct in6_addr *dest, struct in6_addr *src,
int iif, int type, u32 portid, u32 seq,
unsigned int flags);
-static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
- struct in6_addr *daddr,
- struct in6_addr *saddr);
+static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr);
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct fib6_info *rt6_add_route_info(struct net *net,
@@ -136,53 +139,56 @@ void rt6_uncached_list_add(struct rt6_info *rt)
{
struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
- rt->rt6i_uncached_list = ul;
+ rt->dst.rt_uncached_list = ul;
spin_lock_bh(&ul->lock);
- list_add_tail(&rt->rt6i_uncached, &ul->head);
+ list_add_tail(&rt->dst.rt_uncached, &ul->head);
spin_unlock_bh(&ul->lock);
}
void rt6_uncached_list_del(struct rt6_info *rt)
{
- if (!list_empty(&rt->rt6i_uncached)) {
- struct uncached_list *ul = rt->rt6i_uncached_list;
- struct net *net = dev_net(rt->dst.dev);
+ if (!list_empty(&rt->dst.rt_uncached)) {
+ struct uncached_list *ul = rt->dst.rt_uncached_list;
spin_lock_bh(&ul->lock);
- list_del(&rt->rt6i_uncached);
- atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
+ list_del_init(&rt->dst.rt_uncached);
spin_unlock_bh(&ul->lock);
}
}
-static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
+static void rt6_uncached_list_flush_dev(struct net_device *dev)
{
- struct net_device *loopback_dev = net->loopback_dev;
int cpu;
- if (dev == loopback_dev)
- return;
-
for_each_possible_cpu(cpu) {
struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
- struct rt6_info *rt;
+ struct rt6_info *rt, *safe;
+
+ if (list_empty(&ul->head))
+ continue;
spin_lock_bh(&ul->lock);
- list_for_each_entry(rt, &ul->head, rt6i_uncached) {
+ list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
struct inet6_dev *rt_idev = rt->rt6i_idev;
struct net_device *rt_dev = rt->dst.dev;
+ bool handled = false;
- if (rt_idev->dev == dev) {
- rt->rt6i_idev = in6_dev_get(loopback_dev);
+ if (rt_idev && rt_idev->dev == dev) {
+ rt->rt6i_idev = in6_dev_get(blackhole_netdev);
in6_dev_put(rt_idev);
+ handled = true;
}
if (rt_dev == dev) {
- rt->dst.dev = loopback_dev;
- dev_hold(rt->dst.dev);
- dev_put(rt_dev);
+ rt->dst.dev = blackhole_netdev;
+ netdev_ref_replace(rt_dev, blackhole_netdev,
+ &rt->dst.dev_tracker,
+ GFP_ATOMIC);
+ handled = true;
}
+ if (handled)
+ list_del_init(&rt->dst.rt_uncached);
}
spin_unlock_bh(&ul->lock);
}
@@ -210,24 +216,27 @@ struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
n = __ipv6_neigh_lookup(dev, daddr);
if (n)
return n;
- return neigh_create(&nd_tbl, daddr, dev);
+
+ n = neigh_create(&nd_tbl, daddr, dev);
+ return IS_ERR(n) ? NULL : n;
}
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
struct sk_buff *skb,
const void *daddr)
{
- const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
+ const struct rt6_info *rt = dst_rt6_info(dst);
- return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
+ return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any),
+ dst_dev(dst), skb, daddr);
}
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
- struct net_device *dev = dst->dev;
- struct rt6_info *rt = (struct rt6_info *)dst;
+ const struct rt6_info *rt = dst_rt6_info(dst);
+ struct net_device *dev = dst_dev(dst);
- daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
+ daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr);
if (!daddr)
return;
if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
@@ -256,33 +265,16 @@ static struct dst_ops ip6_dst_ops_template = {
.confirm_neigh = ip6_confirm_neigh,
};
-static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
-{
- unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
-
- return mtu ? : dst->dev->mtu;
-}
-
-static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
-{
-}
-
-static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb)
-{
-}
-
static struct dst_ops ip6_dst_blackhole_ops = {
- .family = AF_INET6,
- .destroy = ip6_dst_destroy,
- .check = ip6_dst_check,
- .mtu = ip6_blackhole_mtu,
- .default_advmss = ip6_default_advmss,
- .update_pmtu = ip6_rt_blackhole_update_pmtu,
- .redirect = ip6_rt_blackhole_redirect,
- .cow_metrics = dst_cow_metrics_generic,
- .neigh_lookup = ip6_dst_neigh_lookup,
+ .family = AF_INET6,
+ .default_advmss = ip6_default_advmss,
+ .neigh_lookup = ip6_dst_neigh_lookup,
+ .check = ip6_dst_check,
+ .destroy = ip6_dst_destroy,
+ .cow_metrics = dst_cow_metrics_generic,
+ .update_pmtu = dst_blackhole_update_pmtu,
+ .redirect = dst_blackhole_redirect,
+ .mtu = dst_blackhole_mtu,
};
static const u32 ip6_template_metrics[RTAX_MAX] = {
@@ -293,14 +285,14 @@ static const struct fib6_info fib6_null_entry_template = {
.fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
.fib6_protocol = RTPROT_KERNEL,
.fib6_metric = ~(u32)0,
- .fib6_ref = ATOMIC_INIT(1),
+ .fib6_ref = REFCOUNT_INIT(1),
.fib6_type = RTN_UNREACHABLE,
.fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
};
static const struct rt6_info ip6_null_entry_template = {
.dst = {
- .__refcnt = ATOMIC_INIT(1),
+ .__rcuref = RCUREF_INIT(1),
.__use = 1,
.obsolete = DST_OBSOLETE_FORCE_CHK,
.error = -ENETUNREACH,
@@ -314,7 +306,7 @@ static const struct rt6_info ip6_null_entry_template = {
static const struct rt6_info ip6_prohibit_entry_template = {
.dst = {
- .__refcnt = ATOMIC_INIT(1),
+ .__rcuref = RCUREF_INIT(1),
.__use = 1,
.obsolete = DST_OBSOLETE_FORCE_CHK,
.error = -EACCES,
@@ -326,7 +318,7 @@ static const struct rt6_info ip6_prohibit_entry_template = {
static const struct rt6_info ip6_blk_hole_entry_template = {
.dst = {
- .__refcnt = ATOMIC_INIT(1),
+ .__rcuref = RCUREF_INIT(1),
.__use = 1,
.obsolete = DST_OBSOLETE_FORCE_CHK,
.error = -EINVAL,
@@ -340,10 +332,7 @@ static const struct rt6_info ip6_blk_hole_entry_template = {
static void rt6_info_init(struct rt6_info *rt)
{
- struct dst_entry *dst = &rt->dst;
-
- memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
- INIT_LIST_HEAD(&rt->rt6i_uncached);
+ memset_after(rt, 0, dst);
}
/* allocate dst with ip6_dst_ops */
@@ -351,7 +340,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
int flags)
{
struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
- 1, DST_OBSOLETE_FORCE_CHK, flags);
+ DST_OBSOLETE_FORCE_CHK, flags);
if (rt) {
rt6_info_init(rt);
@@ -364,14 +353,11 @@ EXPORT_SYMBOL(ip6_dst_alloc);
static void ip6_dst_destroy(struct dst_entry *dst)
{
- struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
- struct rt6_info *rt = (struct rt6_info *)dst;
+ struct rt6_info *rt = dst_rt6_info(dst);
struct fib6_info *from;
struct inet6_dev *idev;
- if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
- kfree(p);
-
+ ip_dst_metrics_put(dst);
rt6_uncached_list_del(rt);
idev = rt->rt6i_idev;
@@ -380,36 +366,33 @@ static void ip6_dst_destroy(struct dst_entry *dst)
in6_dev_put(idev);
}
- rcu_read_lock();
- from = rcu_dereference(rt->from);
- rcu_assign_pointer(rt->from, NULL);
+ from = unrcu_pointer(xchg(&rt->from, NULL));
fib6_info_release(from);
- rcu_read_unlock();
}
-static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
- int how)
+static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
{
- struct rt6_info *rt = (struct rt6_info *)dst;
+ struct rt6_info *rt = dst_rt6_info(dst);
struct inet6_dev *idev = rt->rt6i_idev;
- struct net_device *loopback_dev =
- dev_net(dev)->loopback_dev;
+ struct fib6_info *from;
- if (idev && idev->dev != loopback_dev) {
- struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
- if (loopback_idev) {
- rt->rt6i_idev = loopback_idev;
+ if (idev && idev->dev != blackhole_netdev) {
+ struct inet6_dev *blackhole_idev = in6_dev_get(blackhole_netdev);
+
+ if (blackhole_idev) {
+ rt->rt6i_idev = blackhole_idev;
in6_dev_put(idev);
}
}
+ from = unrcu_pointer(xchg(&rt->from, NULL));
+ fib6_info_release(from);
}
static bool __rt6_check_expired(const struct rt6_info *rt)
{
if (rt->rt6i_flags & RTF_EXPIRES)
- return time_after(jiffies, rt->dst.expires);
- else
- return false;
+ return time_after(jiffies, READ_ONCE(rt->dst.expires));
+ return false;
}
static bool rt6_check_expired(const struct rt6_info *rt)
@@ -419,84 +402,229 @@ static bool rt6_check_expired(const struct rt6_info *rt)
from = rcu_dereference(rt->from);
if (rt->rt6i_flags & RTF_EXPIRES) {
- if (time_after(jiffies, rt->dst.expires))
+ if (time_after(jiffies, READ_ONCE(rt->dst.expires)))
return true;
} else if (from) {
- return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
+ return READ_ONCE(rt->dst.obsolete) != DST_OBSOLETE_FORCE_CHK ||
fib6_check_expired(from);
}
return false;
}
-struct fib6_info *fib6_multipath_select(const struct net *net,
- struct fib6_info *match,
- struct flowi6 *fl6, int oif,
- const struct sk_buff *skb,
- int strict)
+static struct fib6_info *
+rt6_multipath_first_sibling_rcu(const struct fib6_info *rt)
{
- struct fib6_info *sibling, *next_sibling;
+ struct fib6_info *iter;
+ struct fib6_node *fn;
+
+ fn = rcu_dereference(rt->fib6_node);
+ if (!fn)
+ goto out;
+ iter = rcu_dereference(fn->leaf);
+ if (!iter)
+ goto out;
+
+ while (iter) {
+ if (iter->fib6_metric == rt->fib6_metric &&
+ rt6_qualify_for_ecmp(iter))
+ return iter;
+ iter = rcu_dereference(iter->fib6_next);
+ }
+
+out:
+ return NULL;
+}
+
+void fib6_select_path(const struct net *net, struct fib6_result *res,
+ struct flowi6 *fl6, int oif, bool have_oif_match,
+ const struct sk_buff *skb, int strict)
+{
+ struct fib6_info *first, *match = res->f6i;
+ struct fib6_info *sibling;
+ int hash;
+
+ if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
+ goto out;
+
+ if (match->nh && have_oif_match && res->nh)
+ return;
+
+ if (skb)
+ IP6CB(skb)->flags |= IP6SKB_MULTIPATH;
/* We might have already computed the hash for ICMPv6 errors. In such
* case it will always be non-zero. Otherwise now is the time to do it.
*/
- if (!fl6->mp_hash)
+ if (!fl6->mp_hash &&
+ (!match->nh || nexthop_is_multipath(match->nh)))
fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
- if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
- return match;
+ if (unlikely(match->nh)) {
+ nexthop_path_fib6_result(res, fl6->mp_hash);
+ return;
+ }
+
+ first = rt6_multipath_first_sibling_rcu(match);
+ if (!first)
+ goto out;
+
+ hash = fl6->mp_hash;
+ if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) {
+ if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif,
+ strict) >= 0)
+ match = first;
+ goto out;
+ }
- list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
- fib6_siblings) {
+ list_for_each_entry_rcu(sibling, &first->fib6_siblings,
+ fib6_siblings) {
+ const struct fib6_nh *nh = sibling->fib6_nh;
int nh_upper_bound;
- nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
- if (fl6->mp_hash > nh_upper_bound)
+ nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
+ if (hash > nh_upper_bound)
continue;
- if (rt6_score_route(sibling, oif, strict) < 0)
+ if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
break;
match = sibling;
break;
}
- return match;
+out:
+ res->f6i = match;
+ res->nh = match->fib6_nh;
}
/*
* Route lookup. rcu_read_lock() should be held.
*/
-static inline struct fib6_info *rt6_device_match(struct net *net,
- struct fib6_info *rt,
- const struct in6_addr *saddr,
- int oif,
- int flags)
+static bool __rt6_device_match(struct net *net, const struct fib6_nh *nh,
+ const struct in6_addr *saddr, int oif, int flags)
+{
+ const struct net_device *dev;
+
+ if (nh->fib_nh_flags & RTNH_F_DEAD)
+ return false;
+
+ dev = nh->fib_nh_dev;
+ if (oif) {
+ if (dev->ifindex == oif)
+ return true;
+ } else {
+ if (ipv6_chk_addr(net, saddr, dev,
+ flags & RT6_LOOKUP_F_IFACE))
+ return true;
+ }
+
+ return false;
+}
+
+struct fib6_nh_dm_arg {
+ struct net *net;
+ const struct in6_addr *saddr;
+ int oif;
+ int flags;
+ struct fib6_nh *nh;
+};
+
+static int __rt6_nh_dev_match(struct fib6_nh *nh, void *_arg)
{
- struct fib6_info *sprt;
+ struct fib6_nh_dm_arg *arg = _arg;
- if (!oif && ipv6_addr_any(saddr) &&
- !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
- return rt;
+ arg->nh = nh;
+ return __rt6_device_match(arg->net, nh, arg->saddr, arg->oif,
+ arg->flags);
+}
- for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
- const struct net_device *dev = sprt->fib6_nh.nh_dev;
+/* returns fib6_nh from nexthop or NULL */
+static struct fib6_nh *rt6_nh_dev_match(struct net *net, struct nexthop *nh,
+ struct fib6_result *res,
+ const struct in6_addr *saddr,
+ int oif, int flags)
+{
+ struct fib6_nh_dm_arg arg = {
+ .net = net,
+ .saddr = saddr,
+ .oif = oif,
+ .flags = flags,
+ };
- if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
- continue;
+ if (nexthop_is_blackhole(nh))
+ return NULL;
+
+ if (nexthop_for_each_fib6_nh(nh, __rt6_nh_dev_match, &arg))
+ return arg.nh;
- if (oif) {
- if (dev->ifindex == oif)
- return sprt;
+ return NULL;
+}
+
+static void rt6_device_match(struct net *net, struct fib6_result *res,
+ const struct in6_addr *saddr, int oif, int flags)
+{
+ struct fib6_info *f6i = res->f6i;
+ struct fib6_info *spf6i;
+ struct fib6_nh *nh;
+
+ if (!oif && ipv6_addr_any(saddr)) {
+ if (unlikely(f6i->nh)) {
+ nh = nexthop_fib6_nh(f6i->nh);
+ if (nexthop_is_blackhole(f6i->nh))
+ goto out_blackhole;
} else {
- if (ipv6_chk_addr(net, saddr, dev,
- flags & RT6_LOOKUP_F_IFACE))
- return sprt;
+ nh = f6i->fib6_nh;
}
+ if (!(nh->fib_nh_flags & RTNH_F_DEAD))
+ goto out;
}
- if (oif && flags & RT6_LOOKUP_F_IFACE)
- return net->ipv6.fib6_null_entry;
+ for (spf6i = f6i; spf6i; spf6i = rcu_dereference(spf6i->fib6_next)) {
+ bool matched = false;
- return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
+ if (unlikely(spf6i->nh)) {
+ nh = rt6_nh_dev_match(net, spf6i->nh, res, saddr,
+ oif, flags);
+ if (nh)
+ matched = true;
+ } else {
+ nh = spf6i->fib6_nh;
+ if (__rt6_device_match(net, nh, saddr, oif, flags))
+ matched = true;
+ }
+ if (matched) {
+ res->f6i = spf6i;
+ goto out;
+ }
+ }
+
+ if (oif && flags & RT6_LOOKUP_F_IFACE) {
+ res->f6i = net->ipv6.fib6_null_entry;
+ nh = res->f6i->fib6_nh;
+ goto out;
+ }
+
+ if (unlikely(f6i->nh)) {
+ nh = nexthop_fib6_nh(f6i->nh);
+ if (nexthop_is_blackhole(f6i->nh))
+ goto out_blackhole;
+ } else {
+ nh = f6i->fib6_nh;
+ }
+
+ if (nh->fib_nh_flags & RTNH_F_DEAD) {
+ res->f6i = net->ipv6.fib6_null_entry;
+ nh = res->f6i->fib6_nh;
+ }
+out:
+ res->nh = nh;
+ res->fib6_type = res->f6i->fib6_type;
+ res->fib6_flags = res->f6i->fib6_flags;
+ return;
+
+out_blackhole:
+ res->fib6_flags |= RTF_REJECT;
+ res->fib6_type = RTN_BLACKHOLE;
+ res->nh = nh;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
@@ -504,6 +632,7 @@ struct __rt6_probe_work {
struct work_struct work;
struct in6_addr target;
struct net_device *dev;
+ netdevice_tracker dev_tracker;
};
static void rt6_probe_deferred(struct work_struct *w)
@@ -514,16 +643,18 @@ static void rt6_probe_deferred(struct work_struct *w)
addrconf_addr_solict_mult(&work->target, &mcaddr);
ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
- dev_put(work->dev);
+ netdev_put(work->dev, &work->dev_tracker);
kfree(work);
}
-static void rt6_probe(struct fib6_info *rt)
+static void rt6_probe(struct fib6_nh *fib6_nh)
{
- struct __rt6_probe_work *work;
+ struct __rt6_probe_work *work = NULL;
const struct in6_addr *nh_gw;
+ unsigned long last_probe;
struct neighbour *neigh;
struct net_device *dev;
+ struct inet6_dev *idev;
/*
* Okay, this does not seem to be appropriate
@@ -533,47 +664,52 @@ static void rt6_probe(struct fib6_info *rt)
* Router Reachability Probe MUST be rate-limited
* to no more than one per minute.
*/
- if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
+ if (!fib6_nh->fib_nh_gw_family)
return;
- nh_gw = &rt->fib6_nh.nh_gw;
- dev = rt->fib6_nh.nh_dev;
- rcu_read_lock_bh();
+ nh_gw = &fib6_nh->fib_nh_gw6;
+ dev = fib6_nh->fib_nh_dev;
+ rcu_read_lock();
+ last_probe = READ_ONCE(fib6_nh->last_probe);
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ goto out;
neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
if (neigh) {
- struct inet6_dev *idev;
-
- if (neigh->nud_state & NUD_VALID)
+ if (READ_ONCE(neigh->nud_state) & NUD_VALID)
goto out;
- idev = __in6_dev_get(dev);
- work = NULL;
- write_lock(&neigh->lock);
+ write_lock_bh(&neigh->lock);
if (!(neigh->nud_state & NUD_VALID) &&
time_after(jiffies,
- neigh->updated + idev->cnf.rtr_probe_interval)) {
+ neigh->updated +
+ READ_ONCE(idev->cnf.rtr_probe_interval))) {
work = kmalloc(sizeof(*work), GFP_ATOMIC);
if (work)
__neigh_set_probe_once(neigh);
}
- write_unlock(&neigh->lock);
- } else {
+ write_unlock_bh(&neigh->lock);
+ } else if (time_after(jiffies, last_probe +
+ READ_ONCE(idev->cnf.rtr_probe_interval))) {
work = kmalloc(sizeof(*work), GFP_ATOMIC);
}
- if (work) {
+ if (!work || cmpxchg(&fib6_nh->last_probe,
+ last_probe, jiffies) != last_probe) {
+ kfree(work);
+ } else {
INIT_WORK(&work->work, rt6_probe_deferred);
work->target = *nh_gw;
- dev_hold(dev);
+ netdev_hold(dev, &work->dev_tracker, GFP_ATOMIC);
work->dev = dev;
schedule_work(&work->work);
}
out:
- rcu_read_unlock_bh();
+ rcu_read_unlock();
}
#else
-static inline void rt6_probe(struct fib6_info *rt)
+static inline void rt6_probe(struct fib6_nh *fib6_nh)
{
}
#endif
@@ -581,99 +717,72 @@ static inline void rt6_probe(struct fib6_info *rt)
/*
* Default Router Selection (RFC 2461 6.3.6)
*/
-static inline int rt6_check_dev(struct fib6_info *rt, int oif)
-{
- const struct net_device *dev = rt->fib6_nh.nh_dev;
-
- if (!oif || dev->ifindex == oif)
- return 2;
- return 0;
-}
-
-static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
+static enum rt6_nud_state rt6_check_neigh(const struct fib6_nh *fib6_nh)
{
enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
struct neighbour *neigh;
- if (rt->fib6_flags & RTF_NONEXTHOP ||
- !(rt->fib6_flags & RTF_GATEWAY))
- return RT6_NUD_SUCCEED;
-
- rcu_read_lock_bh();
- neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
- &rt->fib6_nh.nh_gw);
+ rcu_read_lock();
+ neigh = __ipv6_neigh_lookup_noref(fib6_nh->fib_nh_dev,
+ &fib6_nh->fib_nh_gw6);
if (neigh) {
- read_lock(&neigh->lock);
- if (neigh->nud_state & NUD_VALID)
+ u8 nud_state = READ_ONCE(neigh->nud_state);
+
+ if (nud_state & NUD_VALID)
ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
- else if (!(neigh->nud_state & NUD_FAILED))
+ else if (!(nud_state & NUD_FAILED))
ret = RT6_NUD_SUCCEED;
else
ret = RT6_NUD_FAIL_PROBE;
#endif
- read_unlock(&neigh->lock);
} else {
ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
}
- rcu_read_unlock_bh();
+ rcu_read_unlock();
return ret;
}
-static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
+static int rt6_score_route(const struct fib6_nh *nh, u32 fib6_flags, int oif,
+ int strict)
{
- int m;
+ int m = 0;
+
+ if (!oif || nh->fib_nh_dev->ifindex == oif)
+ m = 2;
- m = rt6_check_dev(rt, oif);
if (!m && (strict & RT6_LOOKUP_F_IFACE))
return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
- m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
+ m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(fib6_flags)) << 2;
#endif
- if (strict & RT6_LOOKUP_F_REACHABLE) {
- int n = rt6_check_neigh(rt);
+ if ((strict & RT6_LOOKUP_F_REACHABLE) &&
+ !(fib6_flags & RTF_NONEXTHOP) && nh->fib_nh_gw_family) {
+ int n = rt6_check_neigh(nh);
if (n < 0)
return n;
}
return m;
}
-/* called with rc_read_lock held */
-static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
+static bool find_match(struct fib6_nh *nh, u32 fib6_flags,
+ int oif, int strict, int *mpri, bool *do_rr)
{
- const struct net_device *dev = fib6_info_nh_dev(f6i);
+ bool match_do_rr = false;
bool rc = false;
-
- if (dev) {
- const struct inet6_dev *idev = __in6_dev_get(dev);
-
- rc = !!idev->cnf.ignore_routes_with_linkdown;
- }
-
- return rc;
-}
-
-static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
- int *mpri, struct fib6_info *match,
- bool *do_rr)
-{
int m;
- bool match_do_rr = false;
- if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
+ if (nh->fib_nh_flags & RTNH_F_DEAD)
goto out;
- if (fib6_ignore_linkdown(rt) &&
- rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
+ if (ip6_ignore_linkdown(nh->fib_nh_dev) &&
+ nh->fib_nh_flags & RTNH_F_LINKDOWN &&
!(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
goto out;
- if (fib6_check_expired(rt))
- goto out;
-
- m = rt6_score_route(rt, oif, strict);
+ m = rt6_score_route(nh, fib6_flags, oif, strict);
if (m == RT6_NUD_FAIL_DO_RR) {
match_do_rr = true;
m = 0; /* lowest valid score */
@@ -682,67 +791,127 @@ static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
}
if (strict & RT6_LOOKUP_F_REACHABLE)
- rt6_probe(rt);
+ rt6_probe(nh);
/* note that m can be RT6_NUD_FAIL_PROBE at this point */
if (m > *mpri) {
*do_rr = match_do_rr;
*mpri = m;
- match = rt;
+ rc = true;
}
out:
- return match;
+ return rc;
}
-static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
- struct fib6_info *leaf,
- struct fib6_info *rr_head,
- u32 metric, int oif, int strict,
- bool *do_rr)
+struct fib6_nh_frl_arg {
+ u32 flags;
+ int oif;
+ int strict;
+ int *mpri;
+ bool *do_rr;
+ struct fib6_nh *nh;
+};
+
+static int rt6_nh_find_match(struct fib6_nh *nh, void *_arg)
{
- struct fib6_info *rt, *match, *cont;
- int mpri = -1;
+ struct fib6_nh_frl_arg *arg = _arg;
- match = NULL;
- cont = NULL;
- for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
- if (rt->fib6_metric != metric) {
- cont = rt;
- break;
- }
+ arg->nh = nh;
+ return find_match(nh, arg->flags, arg->oif, arg->strict,
+ arg->mpri, arg->do_rr);
+}
- match = find_match(rt, oif, strict, &mpri, match, do_rr);
- }
+static void __find_rr_leaf(struct fib6_info *f6i_start,
+ struct fib6_info *nomatch, u32 metric,
+ struct fib6_result *res, struct fib6_info **cont,
+ int oif, int strict, bool *do_rr, int *mpri)
+{
+ struct fib6_info *f6i;
- for (rt = leaf; rt && rt != rr_head;
- rt = rcu_dereference(rt->fib6_next)) {
- if (rt->fib6_metric != metric) {
- cont = rt;
- break;
+ for (f6i = f6i_start;
+ f6i && f6i != nomatch;
+ f6i = rcu_dereference(f6i->fib6_next)) {
+ bool matched = false;
+ struct fib6_nh *nh;
+
+ if (cont && f6i->fib6_metric != metric) {
+ *cont = f6i;
+ return;
}
- match = find_match(rt, oif, strict, &mpri, match, do_rr);
+ if (fib6_check_expired(f6i))
+ continue;
+
+ if (unlikely(f6i->nh)) {
+ struct fib6_nh_frl_arg arg = {
+ .flags = f6i->fib6_flags,
+ .oif = oif,
+ .strict = strict,
+ .mpri = mpri,
+ .do_rr = do_rr
+ };
+
+ if (nexthop_is_blackhole(f6i->nh)) {
+ res->fib6_flags = RTF_REJECT;
+ res->fib6_type = RTN_BLACKHOLE;
+ res->f6i = f6i;
+ res->nh = nexthop_fib6_nh(f6i->nh);
+ return;
+ }
+ if (nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_find_match,
+ &arg)) {
+ matched = true;
+ nh = arg.nh;
+ }
+ } else {
+ nh = f6i->fib6_nh;
+ if (find_match(nh, f6i->fib6_flags, oif, strict,
+ mpri, do_rr))
+ matched = true;
+ }
+ if (matched) {
+ res->f6i = f6i;
+ res->nh = nh;
+ res->fib6_flags = f6i->fib6_flags;
+ res->fib6_type = f6i->fib6_type;
+ }
}
+}
- if (match || !cont)
- return match;
+static void find_rr_leaf(struct fib6_node *fn, struct fib6_info *leaf,
+ struct fib6_info *rr_head, int oif, int strict,
+ bool *do_rr, struct fib6_result *res)
+{
+ u32 metric = rr_head->fib6_metric;
+ struct fib6_info *cont = NULL;
+ int mpri = -1;
+
+ __find_rr_leaf(rr_head, NULL, metric, res, &cont,
+ oif, strict, do_rr, &mpri);
- for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
- match = find_match(rt, oif, strict, &mpri, match, do_rr);
+ __find_rr_leaf(leaf, rr_head, metric, res, &cont,
+ oif, strict, do_rr, &mpri);
- return match;
+ if (res->f6i || !cont)
+ return;
+
+ __find_rr_leaf(cont, NULL, metric, res, NULL,
+ oif, strict, do_rr, &mpri);
}
-static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
- int oif, int strict)
+static void rt6_select(struct net *net, struct fib6_node *fn, int oif,
+ struct fib6_result *res, int strict)
{
struct fib6_info *leaf = rcu_dereference(fn->leaf);
- struct fib6_info *match, *rt0;
+ struct fib6_info *rt0;
bool do_rr = false;
int key_plen;
+ /* make sure this function or its helpers set f6i */
+ res->f6i = NULL;
+
if (!leaf || leaf == net->ipv6.fib6_null_entry)
- return net->ipv6.fib6_null_entry;
+ goto out;
rt0 = rcu_dereference(fn->rr_ptr);
if (!rt0)
@@ -759,11 +928,9 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
key_plen = rt0->fib6_src.plen;
#endif
if (fn->fn_bit != key_plen)
- return net->ipv6.fib6_null_entry;
-
- match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
- &do_rr);
+ goto out;
+ find_rr_leaf(fn, leaf, rt0, oif, strict, &do_rr, res);
if (do_rr) {
struct fib6_info *next = rcu_dereference(rt0->fib6_next);
@@ -780,12 +947,19 @@ static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
}
}
- return match ? match : net->ipv6.fib6_null_entry;
+out:
+ if (!res->f6i) {
+ res->f6i = net->ipv6.fib6_null_entry;
+ res->nh = res->f6i->fib6_nh;
+ res->fib6_flags = res->f6i->fib6_flags;
+ res->fib6_type = res->f6i->fib6_type;
+ }
}
-static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
+static bool rt6_is_gw_or_nonexthop(const struct fib6_result *res)
{
- return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
+ return (res->f6i->fib6_flags & RTF_NONEXTHOP) ||
+ res->nh->fib_nh_gw_family;
}
#ifdef CONFIG_IPV6_ROUTE_INFO
@@ -795,6 +969,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
struct net *net = dev_net(dev);
struct route_info *rinfo = (struct route_info *) opt;
struct in6_addr prefix_buf, *prefix;
+ struct fib6_table *table;
unsigned int pref;
unsigned long lifetime;
struct fib6_info *rt;
@@ -841,7 +1016,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
gwaddr, dev);
if (rt && !lifetime) {
- ip6_del_rt(net, rt);
+ ip6_del_rt(net, rt, false);
rt = NULL;
}
@@ -853,10 +1028,18 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
(rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
if (rt) {
- if (!addrconf_finite_timeout(lifetime))
+ table = rt->fib6_table;
+ spin_lock_bh(&table->tb6_lock);
+
+ if (!addrconf_finite_timeout(lifetime)) {
fib6_clean_expires(rt);
- else
+ fib6_remove_gc_list(rt);
+ } else {
fib6_set_expires(rt, jiffies + HZ * lifetime);
+ fib6_add_gc_list(rt);
+ }
+
+ spin_unlock_bh(&table->tb6_lock);
fib6_info_release(rt);
}
@@ -869,17 +1052,17 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
*/
/* called with rcu_lock held */
-static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
+static struct net_device *ip6_rt_get_dev_rcu(const struct fib6_result *res)
{
- struct net_device *dev = rt->fib6_nh.nh_dev;
+ struct net_device *dev = res->nh->fib_nh_dev;
- if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
+ if (res->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
/* for copies of local routes, dst->dev needs to be the
* device if it is a master device, the master device if
* device is enslaved, and the loopback as the default
*/
if (netif_is_l3_slave(dev) &&
- !rt6_need_strict(&rt->fib6_dst.addr))
+ !rt6_need_strict(&res->f6i->fib6_dst.addr))
dev = l3mdev_master_dev_rcu(dev);
else if (!netif_is_l3_master(dev))
dev = dev_net(dev)->loopback_dev;
@@ -919,17 +1102,15 @@ static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
flags |= DST_NOCOUNT;
if (rt->dst_nopolicy)
flags |= DST_NOPOLICY;
- if (rt->dst_host)
- flags |= DST_HOST;
return flags;
}
-static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
+static void ip6_rt_init_dst_reject(struct rt6_info *rt, u8 fib6_type)
{
- rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
+ rt->dst.error = ip6_rt_type_to_error(fib6_type);
- switch (ort->fib6_type) {
+ switch (fib6_type) {
case RTN_BLACKHOLE:
rt->dst.output = dst_discard_out;
rt->dst.input = dst_discard;
@@ -947,26 +1128,29 @@ static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
}
}
-static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
+static void ip6_rt_init_dst(struct rt6_info *rt, const struct fib6_result *res)
{
- if (ort->fib6_flags & RTF_REJECT) {
- ip6_rt_init_dst_reject(rt, ort);
+ struct fib6_info *f6i = res->f6i;
+
+ if (res->fib6_flags & RTF_REJECT) {
+ ip6_rt_init_dst_reject(rt, res->fib6_type);
return;
}
rt->dst.error = 0;
rt->dst.output = ip6_output;
- if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
+ if (res->fib6_type == RTN_LOCAL || res->fib6_type == RTN_ANYCAST) {
rt->dst.input = ip6_input;
- } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
+ } else if (ipv6_addr_type(&f6i->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
rt->dst.input = ip6_mc_input;
+ rt->dst.output = ip6_mr_output;
} else {
rt->dst.input = ip6_forward;
}
- if (ort->fib6_nh.nh_lwtstate) {
- rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
+ if (res->nh->fib_nh_lws) {
+ rt->dst.lwtstate = lwtstate_get(res->nh->fib_nh_lws);
lwtunnel_set_redirect(&rt->dst);
}
@@ -978,29 +1162,29 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
rt->rt6i_flags &= ~RTF_EXPIRES;
rcu_assign_pointer(rt->from, from);
- dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
- if (from->fib6_metrics != &dst_default_metrics) {
- rt->dst._metrics |= DST_METRICS_REFCOUNTED;
- refcount_inc(&from->fib6_metrics->refcnt);
- }
+ ip_dst_init_metrics(&rt->dst, from->fib6_metrics);
}
-/* Caller must already hold reference to @ort */
-static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
+/* Caller must already hold reference to f6i in result */
+static void ip6_rt_copy_init(struct rt6_info *rt, const struct fib6_result *res)
{
- struct net_device *dev = fib6_info_nh_dev(ort);
+ const struct fib6_nh *nh = res->nh;
+ const struct net_device *dev = nh->fib_nh_dev;
+ struct fib6_info *f6i = res->f6i;
- ip6_rt_init_dst(rt, ort);
+ ip6_rt_init_dst(rt, res);
- rt->rt6i_dst = ort->fib6_dst;
+ rt->rt6i_dst = f6i->fib6_dst;
rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
- rt->rt6i_gateway = ort->fib6_nh.nh_gw;
- rt->rt6i_flags = ort->fib6_flags;
- rt6_set_from(rt, ort);
+ rt->rt6i_flags = res->fib6_flags;
+ if (nh->fib_nh_gw_family) {
+ rt->rt6i_gateway = nh->fib_nh_gw6;
+ rt->rt6i_flags |= RTF_GATEWAY;
+ }
+ rt6_set_from(rt, f6i);
#ifdef CONFIG_IPV6_SUBTREES
- rt->rt6i_src = ort->fib6_src;
+ rt->rt6i_src = f6i->fib6_src;
#endif
- rt->rt6i_prefsrc = ort->fib6_prefsrc;
}
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
@@ -1021,14 +1205,13 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
}
}
-static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
- bool null_fallback)
+static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
struct rt6_info *rt = *prt;
if (dst_hold_safe(&rt->dst))
return true;
- if (null_fallback) {
+ if (net) {
rt = net->ipv6.ip6_null_entry;
dst_hold(&rt->dst);
} else {
@@ -1039,75 +1222,80 @@ static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
}
/* called with rcu_lock held */
-static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
+static struct rt6_info *ip6_create_rt_rcu(const struct fib6_result *res)
{
- unsigned short flags = fib6_info_dst_flags(rt);
- struct net_device *dev = rt->fib6_nh.nh_dev;
+ struct net_device *dev = res->nh->fib_nh_dev;
+ struct fib6_info *f6i = res->f6i;
+ unsigned short flags;
struct rt6_info *nrt;
- if (!fib6_info_hold_safe(rt))
- return NULL;
+ if (!fib6_info_hold_safe(f6i))
+ goto fallback;
+ flags = fib6_info_dst_flags(f6i);
nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
- if (nrt)
- ip6_rt_copy_init(nrt, rt);
- else
- fib6_info_release(rt);
+ if (!nrt) {
+ fib6_info_release(f6i);
+ goto fallback;
+ }
+
+ ip6_rt_copy_init(nrt, res);
+ return nrt;
+fallback:
+ nrt = dev_net(dev)->ipv6.ip6_null_entry;
+ dst_hold(&nrt->dst);
return nrt;
}
-static struct rt6_info *ip6_pol_route_lookup(struct net *net,
+INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_lookup(struct net *net,
struct fib6_table *table,
struct flowi6 *fl6,
const struct sk_buff *skb,
int flags)
{
- struct fib6_info *f6i;
+ struct fib6_result res = {};
struct fib6_node *fn;
struct rt6_info *rt;
- if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
- flags &= ~RT6_LOOKUP_F_IFACE;
-
rcu_read_lock();
fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
- f6i = rcu_dereference(fn->leaf);
- if (!f6i) {
- f6i = net->ipv6.fib6_null_entry;
- } else {
- f6i = rt6_device_match(net, f6i, &fl6->saddr,
- fl6->flowi6_oif, flags);
- if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
- f6i = fib6_multipath_select(net, f6i, fl6,
- fl6->flowi6_oif, skb,
- flags);
- }
- if (f6i == net->ipv6.fib6_null_entry) {
+ res.f6i = rcu_dereference(fn->leaf);
+ if (!res.f6i)
+ res.f6i = net->ipv6.fib6_null_entry;
+ else
+ rt6_device_match(net, &res, &fl6->saddr, fl6->flowi6_oif,
+ flags);
+
+ if (res.f6i == net->ipv6.fib6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
if (fn)
goto restart;
+
+ rt = net->ipv6.ip6_null_entry;
+ dst_hold(&rt->dst);
+ goto out;
+ } else if (res.fib6_flags & RTF_REJECT) {
+ goto do_create;
}
- trace_fib6_table_lookup(net, f6i, table, fl6);
+ fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
+ fl6->flowi6_oif != 0, skb, flags);
/* Search through exception table */
- rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
+ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
if (rt) {
- if (ip6_hold_safe(net, &rt, true))
+ if (ip6_hold_safe(net, &rt))
dst_use_noref(&rt->dst, jiffies);
- } else if (f6i == net->ipv6.fib6_null_entry) {
- rt = net->ipv6.ip6_null_entry;
- dst_hold(&rt->dst);
} else {
- rt = ip6_create_rt_rcu(f6i);
- if (!rt) {
- rt = net->ipv6.ip6_null_entry;
- dst_hold(&rt->dst);
- }
+do_create:
+ rt = ip6_create_rt_rcu(&res);
}
+out:
+ trace_fib6_table_lookup(net, &res, table, fl6);
+
rcu_read_unlock();
return rt;
@@ -1138,7 +1326,7 @@ struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
if (dst->error == 0)
- return (struct rt6_info *) dst;
+ return dst_rt6_info(dst);
dst_release(dst);
@@ -1173,10 +1361,11 @@ int ip6_ins_rt(struct net *net, struct fib6_info *rt)
return __ip6_ins_rt(rt, &info, NULL);
}
-static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
+static struct rt6_info *ip6_rt_cache_alloc(const struct fib6_result *res,
const struct in6_addr *daddr,
const struct in6_addr *saddr)
{
+ struct fib6_info *f6i = res->f6i;
struct net_device *dev;
struct rt6_info *rt;
@@ -1184,25 +1373,24 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
* Clone the route.
*/
- if (!fib6_info_hold_safe(ort))
+ if (!fib6_info_hold_safe(f6i))
return NULL;
- dev = ip6_rt_get_dev_rcu(ort);
+ dev = ip6_rt_get_dev_rcu(res);
rt = ip6_dst_alloc(dev_net(dev), dev, 0);
if (!rt) {
- fib6_info_release(ort);
+ fib6_info_release(f6i);
return NULL;
}
- ip6_rt_copy_init(rt, ort);
+ ip6_rt_copy_init(rt, res);
rt->rt6i_flags |= RTF_CACHE;
- rt->dst.flags |= DST_HOST;
rt->rt6i_dst.addr = *daddr;
rt->rt6i_dst.plen = 128;
- if (!rt6_is_gw_or_nonexthop(ort)) {
- if (ort->fib6_dst.plen != 128 &&
- ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
+ if (!rt6_is_gw_or_nonexthop(res)) {
+ if (f6i->fib6_dst.plen != 128 &&
+ ipv6_addr_equal(&f6i->fib6_dst.addr, daddr))
rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
if (rt->rt6i_src.plen && saddr) {
@@ -1215,58 +1403,82 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
return rt;
}
-static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
+static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res)
{
- unsigned short flags = fib6_info_dst_flags(rt);
+ struct fib6_info *f6i = res->f6i;
+ unsigned short flags = fib6_info_dst_flags(f6i);
struct net_device *dev;
struct rt6_info *pcpu_rt;
- if (!fib6_info_hold_safe(rt))
+ if (!fib6_info_hold_safe(f6i))
return NULL;
rcu_read_lock();
- dev = ip6_rt_get_dev_rcu(rt);
- pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
+ dev = ip6_rt_get_dev_rcu(res);
+ pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT);
rcu_read_unlock();
if (!pcpu_rt) {
- fib6_info_release(rt);
+ fib6_info_release(f6i);
return NULL;
}
- ip6_rt_copy_init(pcpu_rt, rt);
+ ip6_rt_copy_init(pcpu_rt, res);
pcpu_rt->rt6i_flags |= RTF_PCPU;
+
+ if (f6i->nh)
+ pcpu_rt->sernum = rt_genid_ipv6(dev_net(dev));
+
return pcpu_rt;
}
+static bool rt6_is_valid(const struct rt6_info *rt6)
+{
+ return rt6->sernum == rt_genid_ipv6(dev_net(rt6->dst.dev));
+}
+
/* It should be called with rcu_read_lock() acquired */
-static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
+static struct rt6_info *rt6_get_pcpu_route(const struct fib6_result *res)
{
- struct rt6_info *pcpu_rt, **p;
+ struct rt6_info *pcpu_rt;
- p = this_cpu_ptr(rt->rt6i_pcpu);
- pcpu_rt = *p;
+ pcpu_rt = this_cpu_read(*res->nh->rt6i_pcpu);
- if (pcpu_rt)
- ip6_hold_safe(NULL, &pcpu_rt, false);
+ if (pcpu_rt && pcpu_rt->sernum && !rt6_is_valid(pcpu_rt)) {
+ struct rt6_info *prev, **p;
+
+ p = this_cpu_ptr(res->nh->rt6i_pcpu);
+ /* Paired with READ_ONCE() in __fib6_drop_pcpu_from() */
+ prev = xchg(p, NULL);
+ if (prev) {
+ dst_dev_put(&prev->dst);
+ dst_release(&prev->dst);
+ }
+
+ pcpu_rt = NULL;
+ }
return pcpu_rt;
}
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
- struct fib6_info *rt)
+ const struct fib6_result *res)
{
struct rt6_info *pcpu_rt, *prev, **p;
- pcpu_rt = ip6_rt_pcpu_alloc(rt);
- if (!pcpu_rt) {
- dst_hold(&net->ipv6.ip6_null_entry->dst);
- return net->ipv6.ip6_null_entry;
- }
+ pcpu_rt = ip6_rt_pcpu_alloc(res);
+ if (!pcpu_rt)
+ return NULL;
- dst_hold(&pcpu_rt->dst);
- p = this_cpu_ptr(rt->rt6i_pcpu);
+ p = this_cpu_ptr(res->nh->rt6i_pcpu);
prev = cmpxchg(p, NULL, pcpu_rt);
BUG_ON(prev);
+ if (res->f6i->fib6_destroying) {
+ struct fib6_info *from;
+
+ from = unrcu_pointer(xchg(&pcpu_rt->from, NULL));
+ fib6_info_release(from);
+ }
+
return pcpu_rt;
}
@@ -1286,12 +1498,18 @@ static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
return;
net = dev_net(rt6_ex->rt6i->dst.dev);
+ net->ipv6.rt6_stats->fib_rt_cache--;
+
+ /* completely purge the exception so that the held resources can be released:
+ * some [sk] cache may keep the dst around for an unlimited time
+ */
+ dst_dev_put(&rt6_ex->rt6i->dst);
+
hlist_del_rcu(&rt6_ex->hlist);
dst_release(&rt6_ex->rt6i->dst);
kfree_rcu(rt6_ex, rcu);
WARN_ON_ONCE(!bucket->depth);
bucket->depth--;
- net->ipv6.rt6_stats->fib_rt_cache--;
}
/* Remove oldest rt6_ex in bucket and free the memory
@@ -1314,17 +1532,24 @@ static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
static u32 rt6_exception_hash(const struct in6_addr *dst,
const struct in6_addr *src)
{
- static u32 seed __read_mostly;
- u32 val;
+ static siphash_aligned_key_t rt6_exception_key;
+ struct {
+ struct in6_addr dst;
+ struct in6_addr src;
+ } __aligned(SIPHASH_ALIGNMENT) combined = {
+ .dst = *dst,
+ };
+ u64 val;
- net_get_random_once(&seed, sizeof(seed));
- val = jhash(dst, sizeof(*dst), seed);
+ net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key));
#ifdef CONFIG_IPV6_SUBTREES
if (src)
- val = jhash(src, sizeof(*src), val);
+ combined.src = *src;
#endif
- return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
+ val = siphash(&combined, sizeof(combined), &rt6_exception_key);
+
+ return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
/* Helper function to find the cached rt in the hash table
@@ -1395,45 +1620,97 @@ __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
return NULL;
}
-static unsigned int fib6_mtu(const struct fib6_info *rt)
+static unsigned int fib6_mtu(const struct fib6_result *res)
{
+ const struct fib6_nh *nh = res->nh;
unsigned int mtu;
- if (rt->fib6_pmtu) {
- mtu = rt->fib6_pmtu;
+ if (res->f6i->fib6_pmtu) {
+ mtu = res->f6i->fib6_pmtu;
} else {
- struct net_device *dev = fib6_info_nh_dev(rt);
+ struct net_device *dev = nh->fib_nh_dev;
struct inet6_dev *idev;
rcu_read_lock();
idev = __in6_dev_get(dev);
- mtu = idev->cnf.mtu6;
+ mtu = READ_ONCE(idev->cnf.mtu6);
rcu_read_unlock();
}
mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
- return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
+ return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
+}
+
+#define FIB6_EXCEPTION_BUCKET_FLUSHED 0x1UL
+
+/* used when the flushed bit is not relevant and only access to the bucket
+ * is needed (i.e., all bucket users except rt6_insert_exception);
+ *
+ * called under rcu lock; sometimes called with rt6_exception_lock held
+ */
+static
+struct rt6_exception_bucket *fib6_nh_get_excptn_bucket(const struct fib6_nh *nh,
+ spinlock_t *lock)
+{
+ struct rt6_exception_bucket *bucket;
+
+ if (lock)
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+ lockdep_is_held(lock));
+ else
+ bucket = rcu_dereference(nh->rt6i_exception_bucket);
+
+ /* remove bucket flushed bit if set */
+ if (bucket) {
+ unsigned long p = (unsigned long)bucket;
+
+ p &= ~FIB6_EXCEPTION_BUCKET_FLUSHED;
+ bucket = (struct rt6_exception_bucket *)p;
+ }
+
+ return bucket;
+}
+
+static bool fib6_nh_excptn_bucket_flushed(struct rt6_exception_bucket *bucket)
+{
+ unsigned long p = (unsigned long)bucket;
+
+ return !!(p & FIB6_EXCEPTION_BUCKET_FLUSHED);
+}
+
+/* called with rt6_exception_lock held */
+static void fib6_nh_excptn_bucket_set_flushed(struct fib6_nh *nh,
+ spinlock_t *lock)
+{
+ struct rt6_exception_bucket *bucket;
+ unsigned long p;
+
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+ lockdep_is_held(lock));
+
+ p = (unsigned long)bucket;
+ p |= FIB6_EXCEPTION_BUCKET_FLUSHED;
+ bucket = (struct rt6_exception_bucket *)p;
+ rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
}
static int rt6_insert_exception(struct rt6_info *nrt,
- struct fib6_info *ort)
+ const struct fib6_result *res)
{
struct net *net = dev_net(nrt->dst.dev);
struct rt6_exception_bucket *bucket;
+ struct fib6_info *f6i = res->f6i;
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
+ struct fib6_nh *nh = res->nh;
+ int max_depth;
int err = 0;
spin_lock_bh(&rt6_exception_lock);
- if (ort->exception_bucket_flushed) {
- err = -EINVAL;
- goto out;
- }
-
- bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
+ bucket = rcu_dereference_protected(nh->rt6i_exception_bucket,
+ lockdep_is_held(&rt6_exception_lock));
if (!bucket) {
bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
GFP_ATOMIC);
@@ -1441,29 +1718,27 @@ static int rt6_insert_exception(struct rt6_info *nrt,
err = -ENOMEM;
goto out;
}
- rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
+ rcu_assign_pointer(nh->rt6i_exception_bucket, bucket);
+ } else if (fib6_nh_excptn_bucket_flushed(bucket)) {
+ err = -EINVAL;
+ goto out;
}
#ifdef CONFIG_IPV6_SUBTREES
- /* rt6i_src.plen != 0 indicates ort is in subtree
+ /* fib6_src.plen != 0 indicates f6i is in subtree
* and exception table is indexed by a hash of
- * both rt6i_dst and rt6i_src.
+ * both fib6_dst and fib6_src.
* Otherwise, the exception table is indexed by
- * a hash of only rt6i_dst.
+ * a hash of only fib6_dst.
*/
- if (ort->fib6_src.plen)
+ if (f6i->fib6_src.plen)
src_key = &nrt->rt6i_src.addr;
#endif
-
- /* Update rt6i_prefsrc as it could be changed
- * in rt6_remove_prefsrc()
- */
- nrt->rt6i_prefsrc = ort->fib6_prefsrc;
- /* rt6_mtu_change() might lower mtu on ort.
+ /* rt6_mtu_change() might lower mtu on f6i.
* Only insert this exception route if its mtu
- * is less than ort's mtu value.
+ * is less than f6i's mtu value.
*/
- if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
+ if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(res)) {
err = -EINVAL;
goto out;
}
@@ -1484,7 +1759,9 @@ static int rt6_insert_exception(struct rt6_info *nrt,
bucket->depth++;
net->ipv6.rt6_stats->fib_rt_cache++;
- if (bucket->depth > FIB6_MAX_DEPTH)
+ /* Randomize max depth to avoid some side-channel attacks. */
+ max_depth = FIB6_MAX_DEPTH + get_random_u32_below(FIB6_MAX_DEPTH);
+ while (bucket->depth > max_depth)
rt6_exception_remove_oldest(bucket);
out:
@@ -1492,16 +1769,17 @@ out:
/* Update fn->fn_sernum to invalidate all cached dst */
if (!err) {
- spin_lock_bh(&ort->fib6_table->tb6_lock);
- fib6_update_sernum(net, ort);
- spin_unlock_bh(&ort->fib6_table->tb6_lock);
+ spin_lock_bh(&f6i->fib6_table->tb6_lock);
+ fib6_update_sernum(net, f6i);
+ fib6_add_gc_list(f6i);
+ spin_unlock_bh(&f6i->fib6_table->tb6_lock);
fib6_force_start_gc(net);
}
return err;
}
-void rt6_flush_exceptions(struct fib6_info *rt)
+static void fib6_nh_flush_exceptions(struct fib6_nh *nh, struct fib6_info *from)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
@@ -1509,77 +1787,108 @@ void rt6_flush_exceptions(struct fib6_info *rt)
int i;
spin_lock_bh(&rt6_exception_lock);
- /* Prevent rt6_insert_exception() to recreate the bucket list */
- rt->exception_bucket_flushed = 1;
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (!bucket)
goto out;
+ /* Prevent rt6_insert_exception() from recreating the bucket list */
+ if (!from)
+ fib6_nh_excptn_bucket_set_flushed(nh, &rt6_exception_lock);
+
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
- hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
- rt6_remove_exception(bucket, rt6_ex);
- WARN_ON_ONCE(bucket->depth);
+ hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) {
+ if (!from ||
+ rcu_access_pointer(rt6_ex->rt6i->from) == from)
+ rt6_remove_exception(bucket, rt6_ex);
+ }
+ WARN_ON_ONCE(!from && bucket->depth);
bucket++;
}
-
out:
spin_unlock_bh(&rt6_exception_lock);
}
+static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg)
+{
+ struct fib6_info *f6i = arg;
+
+ fib6_nh_flush_exceptions(nh, f6i);
+
+ return 0;
+}
+
+void rt6_flush_exceptions(struct fib6_info *f6i)
+{
+ if (f6i->nh) {
+ rcu_read_lock();
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i);
+ rcu_read_unlock();
+ } else {
+ fib6_nh_flush_exceptions(f6i->fib6_nh, f6i);
+ }
+}
+
/* Find cached rt in the hash table inside passed in rt
* Caller has to hold rcu_read_lock()
*/
-static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
- struct in6_addr *daddr,
- struct in6_addr *saddr)
+static struct rt6_info *rt6_find_cached_rt(const struct fib6_result *res,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
+ const struct in6_addr *src_key = NULL;
struct rt6_exception_bucket *bucket;
- struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
- struct rt6_info *res = NULL;
-
- bucket = rcu_dereference(rt->rt6i_exception_bucket);
+ struct rt6_info *ret = NULL;
#ifdef CONFIG_IPV6_SUBTREES
- /* rt6i_src.plen != 0 indicates rt is in subtree
+ /* fib6_src.plen != 0 indicates f6i is in subtree
* and exception table is indexed by a hash of
- * both rt6i_dst and rt6i_src.
- * Otherwise, the exception table is indexed by
- * a hash of only rt6i_dst.
+ * both fib6_dst and fib6_src.
+ * However, the src addr used to create the hash
+ * might not be exactly the passed in saddr which
+ * is a /128 addr from the flow.
+ * So we need to use f6i->fib6_src to redo lookup
+ * if the passed in saddr does not find anything.
+ * (See the logic in ip6_rt_cache_alloc() on how
+ * rt->rt6i_src is updated.)
*/
- if (rt->fib6_src.plen)
+ if (res->f6i->fib6_src.plen)
src_key = saddr;
+find_ex:
#endif
+ bucket = fib6_nh_get_excptn_bucket(res->nh, NULL);
rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
- res = rt6_ex->rt6i;
+ ret = rt6_ex->rt6i;
- return res;
+#ifdef CONFIG_IPV6_SUBTREES
+ /* Use fib6_src as src_key and redo lookup */
+ if (!ret && src_key && src_key != &res->f6i->fib6_src.addr) {
+ src_key = &res->f6i->fib6_src.addr;
+ goto find_ex;
+ }
+#endif
+
+ return ret;
}
/* Remove the passed in cached rt from the hash table that contains it */
-static int rt6_remove_exception_rt(struct rt6_info *rt)
+static int fib6_nh_remove_exception(const struct fib6_nh *nh, int plen,
+ const struct rt6_info *rt)
{
+ const struct in6_addr *src_key = NULL;
struct rt6_exception_bucket *bucket;
- struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
- struct fib6_info *from;
int err;
- from = rcu_dereference(rt->from);
- if (!from ||
- !(rt->rt6i_flags & RTF_CACHE))
- return -EINVAL;
-
- if (!rcu_access_pointer(from->rt6i_exception_bucket))
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
return -ENOENT;
spin_lock_bh(&rt6_exception_lock);
- bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
+
#ifdef CONFIG_IPV6_SUBTREES
/* rt6i_src.plen != 0 indicates 'from' is in subtree
* and exception table is indexed by a hash of
@@ -1587,7 +1896,7 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
- if (from->fib6_src.plen)
+ if (plen)
src_key = &rt->rt6i_src.addr;
#endif
rt6_ex = __rt6_find_exception_spinlock(&bucket,
@@ -1604,23 +1913,60 @@ static int rt6_remove_exception_rt(struct rt6_info *rt)
return err;
}
+struct fib6_nh_excptn_arg {
+ struct rt6_info *rt;
+ int plen;
+};
+
+static int rt6_nh_remove_exception_rt(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_excptn_arg *arg = _arg;
+ int err;
+
+ err = fib6_nh_remove_exception(nh, arg->plen, arg->rt);
+ if (err == 0)
+ return 1;
+
+ return 0;
+}
+
+static int rt6_remove_exception_rt(struct rt6_info *rt)
+{
+ struct fib6_info *from;
+
+ from = rcu_dereference(rt->from);
+ if (!from || !(rt->rt6i_flags & RTF_CACHE))
+ return -EINVAL;
+
+ if (from->nh) {
+ struct fib6_nh_excptn_arg arg = {
+ .rt = rt,
+ .plen = from->fib6_src.plen
+ };
+ int rc;
+
+ /* rc = 1 means an entry was found */
+ rc = nexthop_for_each_fib6_nh(from->nh,
+ rt6_nh_remove_exception_rt,
+ &arg);
+ return rc ? 0 : -ENOENT;
+ }
+
+ return fib6_nh_remove_exception(from->fib6_nh,
+ from->fib6_src.plen, rt);
+}
+
/* Find rt6_ex which contains the passed in rt cache and
* refresh its stamp
*/
-static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
+static void fib6_nh_update_exception(const struct fib6_nh *nh, int plen,
+ const struct rt6_info *rt)
{
+ const struct in6_addr *src_key = NULL;
struct rt6_exception_bucket *bucket;
- struct fib6_info *from = rt->from;
- struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
- if (!from ||
- !(rt->rt6i_flags & RTF_CACHE))
- return;
-
- rcu_read_lock();
- bucket = rcu_dereference(from->rt6i_exception_bucket);
-
+ bucket = fib6_nh_get_excptn_bucket(nh, NULL);
#ifdef CONFIG_IPV6_SUBTREES
/* rt6i_src.plen != 0 indicates 'from' is in subtree
* and exception table is indexed by a hash of
@@ -1628,35 +1974,65 @@ static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
- if (from->fib6_src.plen)
+ if (plen)
src_key = &rt->rt6i_src.addr;
#endif
- rt6_ex = __rt6_find_exception_rcu(&bucket,
- &rt->rt6i_dst.addr,
- src_key);
+ rt6_ex = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
if (rt6_ex)
rt6_ex->stamp = jiffies;
+}
- rcu_read_unlock();
+struct fib6_nh_match_arg {
+ const struct net_device *dev;
+ const struct in6_addr *gw;
+ struct fib6_nh *match;
+};
+
+/* determine if fib6_nh has given device and gateway */
+static int fib6_nh_find_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_match_arg *arg = _arg;
+
+ if (arg->dev != nh->fib_nh_dev ||
+ (arg->gw && !nh->fib_nh_gw_family) ||
+ (!arg->gw && nh->fib_nh_gw_family) ||
+ (arg->gw && !ipv6_addr_equal(arg->gw, &nh->fib_nh_gw6)))
+ return 0;
+
+ arg->match = nh;
+
+ /* found a match, break the loop */
+ return 1;
}
-static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
+static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
- struct rt6_exception_bucket *bucket;
- struct rt6_exception *rt6_ex;
- int i;
+ struct fib6_info *from;
+ struct fib6_nh *fib6_nh;
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
+ rcu_read_lock();
- if (bucket) {
- for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
- hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
- rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
- }
- bucket++;
- }
+ from = rcu_dereference(rt->from);
+ if (!from || !(rt->rt6i_flags & RTF_CACHE))
+ goto unlock;
+
+ if (from->nh) {
+ struct fib6_nh_match_arg arg = {
+ .dev = rt->dst.dev,
+ .gw = &rt->rt6i_gateway,
+ };
+
+ nexthop_for_each_fib6_nh(from->nh, fib6_nh_find_match, &arg);
+
+ if (!arg.match)
+ goto unlock;
+ fib6_nh = arg.match;
+ } else {
+ fib6_nh = from->fib6_nh;
}
+ fib6_nh_update_exception(fib6_nh, from->fib6_src.plen, rt);
+unlock:
+ rcu_read_unlock();
}
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
@@ -1682,15 +2058,13 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
}
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
- struct fib6_info *rt, int mtu)
+ const struct fib6_nh *nh, int mtu)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
int i;
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
-
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (!bucket)
return;
@@ -1712,21 +2086,19 @@ static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
-static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
- struct in6_addr *gateway)
+static void fib6_nh_exceptions_clean_tohost(const struct fib6_nh *nh,
+ const struct in6_addr *gateway)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
struct hlist_node *tmp;
int i;
- if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
return;
spin_lock_bh(&rt6_exception_lock);
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
-
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (bucket) {
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
hlist_for_each_entry_safe(rt6_ex, tmp,
@@ -1761,28 +2133,26 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
* expired, independently from their aging, as per RFC 8201 section 4
*/
if (!(rt->rt6i_flags & RTF_EXPIRES)) {
- if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
- RT6_TRACE("aging clone %p\n", rt);
+ if (time_after_eq(now, READ_ONCE(rt->dst.lastuse) +
+ gc_args->timeout)) {
+ pr_debug("aging clone %p\n", rt);
rt6_remove_exception(bucket, rt6_ex);
return;
}
- } else if (time_after(jiffies, rt->dst.expires)) {
- RT6_TRACE("purging expired route %p\n", rt);
+ } else if (time_after(jiffies, READ_ONCE(rt->dst.expires))) {
+ pr_debug("purging expired route %p\n", rt);
rt6_remove_exception(bucket, rt6_ex);
return;
}
if (rt->rt6i_flags & RTF_GATEWAY) {
struct neighbour *neigh;
- __u8 neigh_flags = 0;
neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
- if (neigh)
- neigh_flags = neigh->flags;
- if (!(neigh_flags & NTF_ROUTER)) {
- RT6_TRACE("purging route %p via non-router but gateway\n",
- rt);
+ if (!(neigh && (neigh->flags & NTF_ROUTER))) {
+ pr_debug("purging route %p via non-router but gateway\n",
+ rt);
rt6_remove_exception(bucket, rt6_ex);
return;
}
@@ -1791,23 +2161,21 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
gc_args->more++;
}
-void rt6_age_exceptions(struct fib6_info *rt,
- struct fib6_gc_args *gc_args,
- unsigned long now)
+static void fib6_nh_age_exceptions(const struct fib6_nh *nh,
+ struct fib6_gc_args *gc_args,
+ unsigned long now)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
struct hlist_node *tmp;
int i;
- if (!rcu_access_pointer(rt->rt6i_exception_bucket))
+ if (!rcu_access_pointer(nh->rt6i_exception_bucket))
return;
rcu_read_lock_bh();
spin_lock(&rt6_exception_lock);
- bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
- lockdep_is_held(&rt6_exception_lock));
-
+ bucket = fib6_nh_get_excptn_bucket(nh, &rt6_exception_lock);
if (bucket) {
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
hlist_for_each_entry_safe(rt6_ex, tmp,
@@ -1822,22 +2190,48 @@ void rt6_age_exceptions(struct fib6_info *rt,
rcu_read_unlock_bh();
}
+struct fib6_nh_age_excptn_arg {
+ struct fib6_gc_args *gc_args;
+ unsigned long now;
+};
+
+static int rt6_nh_age_exceptions(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_age_excptn_arg *arg = _arg;
+
+ fib6_nh_age_exceptions(nh, arg->gc_args, arg->now);
+ return 0;
+}
+
+void rt6_age_exceptions(struct fib6_info *f6i,
+ struct fib6_gc_args *gc_args,
+ unsigned long now)
+{
+ if (f6i->nh) {
+ struct fib6_nh_age_excptn_arg arg = {
+ .gc_args = gc_args,
+ .now = now
+ };
+
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_age_exceptions,
+ &arg);
+ } else {
+ fib6_nh_age_exceptions(f6i->fib6_nh, gc_args, now);
+ }
+}
+
/* must be called with rcu lock held */
-struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
- int oif, struct flowi6 *fl6, int strict)
+int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif,
+ struct flowi6 *fl6, struct fib6_result *res, int strict)
{
struct fib6_node *fn, *saved_fn;
- struct fib6_info *f6i;
fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
saved_fn = fn;
- if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
- oif = 0;
-
redo_rt6_select:
- f6i = rt6_select(net, fn, oif, strict);
- if (f6i == net->ipv6.fib6_null_entry) {
+ rt6_select(net, fn, oif, res, strict);
+ if (res->f6i == net->ipv6.fib6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
if (fn)
goto redo_rt6_select;
@@ -1849,90 +2243,81 @@ redo_rt6_select:
}
}
- trace_fib6_table_lookup(net, f6i, table, fl6);
+ trace_fib6_table_lookup(net, res, table, fl6);
- return f6i;
+ return 0;
}
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
int oif, struct flowi6 *fl6,
const struct sk_buff *skb, int flags)
{
- struct fib6_info *f6i;
- struct rt6_info *rt;
+ struct fib6_result res = {};
+ struct rt6_info *rt = NULL;
int strict = 0;
+ WARN_ON_ONCE((flags & RT6_LOOKUP_F_DST_NOREF) &&
+ !rcu_read_lock_held());
+
strict |= flags & RT6_LOOKUP_F_IFACE;
strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
- if (net->ipv6.devconf_all->forwarding == 0)
+ if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
strict |= RT6_LOOKUP_F_REACHABLE;
rcu_read_lock();
- f6i = fib6_table_lookup(net, table, oif, fl6, strict);
- if (f6i->fib6_nsiblings)
- f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
+ fib6_table_lookup(net, table, oif, fl6, &res, strict);
+ if (res.f6i == net->ipv6.fib6_null_entry)
+ goto out;
- if (f6i == net->ipv6.fib6_null_entry) {
- rt = net->ipv6.ip6_null_entry;
- rcu_read_unlock();
- dst_hold(&rt->dst);
- return rt;
- }
+ fib6_select_path(net, &res, fl6, oif, false, skb, strict);
/*Search through exception table */
- rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
+ rt = rt6_find_cached_rt(&res, &fl6->daddr, &fl6->saddr);
if (rt) {
- if (ip6_hold_safe(net, &rt, true))
- dst_use_noref(&rt->dst, jiffies);
-
- rcu_read_unlock();
- return rt;
+ goto out;
} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
- !(f6i->fib6_flags & RTF_GATEWAY))) {
+ !res.nh->fib_nh_gw_family)) {
/* Create a RTF_CACHE clone which will not be
* owned by the fib6 tree. It is for the special case where
* the daddr in the skb during the neighbor look-up is different
* from the fl6->daddr used to look-up route here.
*/
- struct rt6_info *uncached_rt;
-
- uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
+ rt = ip6_rt_cache_alloc(&res, &fl6->daddr, NULL);
- rcu_read_unlock();
-
- if (uncached_rt) {
- /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
- * No need for another dst_hold()
+ if (rt) {
+ /* 1 refcnt is taken during ip6_rt_cache_alloc().
+ * As rt6_uncached_list_add() does not consume refcnt,
+ * this refcnt is always returned to the caller even
+ * if caller sets RT6_LOOKUP_F_DST_NOREF flag.
*/
- rt6_uncached_list_add(uncached_rt);
- atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
- } else {
- uncached_rt = net->ipv6.ip6_null_entry;
- dst_hold(&uncached_rt->dst);
- }
+ rt6_uncached_list_add(rt);
+ rcu_read_unlock();
- return uncached_rt;
+ return rt;
+ }
} else {
/* Get a percpu copy */
-
- struct rt6_info *pcpu_rt;
-
local_bh_disable();
- pcpu_rt = rt6_get_pcpu_route(f6i);
+ rt = rt6_get_pcpu_route(&res);
- if (!pcpu_rt)
- pcpu_rt = rt6_make_pcpu_route(net, f6i);
+ if (!rt)
+ rt = rt6_make_pcpu_route(net, &res);
local_bh_enable();
- rcu_read_unlock();
-
- return pcpu_rt;
}
+out:
+ if (!rt)
+ rt = net->ipv6.ip6_null_entry;
+ if (!(flags & RT6_LOOKUP_F_DST_NOREF))
+ ip6_hold_safe(net, &rt);
+ rcu_read_unlock();
+
+ return rt;
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
-static struct rt6_info *ip6_pol_route_input(struct net *net,
+INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_input(struct net *net,
struct fib6_table *table,
struct flowi6 *fl6,
const struct sk_buff *skb,
@@ -1974,10 +2359,7 @@ static void ip6_multipath_l3_keys(const struct sk_buff *skb,
if (!icmph)
goto out;
- if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
- icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
- icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
- icmph->icmp6_type != ICMPV6_PARAMPROB)
+ if (!icmpv6_is_err(icmph->icmp6_type))
goto out;
inner_iph = skb_header_pointer(skb,
@@ -2002,12 +2384,135 @@ out:
}
}
+static u32 rt6_multipath_custom_hash_outer(const struct net *net,
+ const struct sk_buff *skb,
+ bool *p_has_inner)
+{
+ u32 hash_fields = ip6_multipath_hash_fields(net);
+ struct flow_keys keys, hash_keys;
+
+ if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
+ return 0;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP);
+
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
+ hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
+ hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
+ hash_keys.basic.ip_proto = keys.basic.ip_proto;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
+ hash_keys.tags.flow_label = keys.tags.flow_label;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT)
+ hash_keys.ports.src = keys.ports.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
+ hash_keys.ports.dst = keys.ports.dst;
+
+ *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION);
+ return fib_multipath_hash_from_keys(net, &hash_keys);
+}
+
+static u32 rt6_multipath_custom_hash_inner(const struct net *net,
+ const struct sk_buff *skb,
+ bool has_inner)
+{
+ u32 hash_fields = ip6_multipath_hash_fields(net);
+ struct flow_keys keys, hash_keys;
+
+ /* We assume the packet carries an encapsulation, but if none was
+ * encountered during dissection of the outer flow, then there is no
+ * point in calling the flow dissector again.
+ */
+ if (!has_inner)
+ return 0;
+
+ if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK))
+ return 0;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ skb_flow_dissect_flow_keys(skb, &keys, 0);
+
+ if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION))
+ return 0;
+
+ if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
+ hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
+ hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+ } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP)
+ hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP)
+ hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL)
+ hash_keys.tags.flow_label = keys.tags.flow_label;
+ }
+
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO)
+ hash_keys.basic.ip_proto = keys.basic.ip_proto;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT)
+ hash_keys.ports.src = keys.ports.src;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT)
+ hash_keys.ports.dst = keys.ports.dst;
+
+ return fib_multipath_hash_from_keys(net, &hash_keys);
+}
+
+static u32 rt6_multipath_custom_hash_skb(const struct net *net,
+ const struct sk_buff *skb)
+{
+ u32 mhash, mhash_inner;
+ bool has_inner = true;
+
+ mhash = rt6_multipath_custom_hash_outer(net, skb, &has_inner);
+ mhash_inner = rt6_multipath_custom_hash_inner(net, skb, has_inner);
+
+ return jhash_2words(mhash, mhash_inner, 0);
+}
+
+static u32 rt6_multipath_custom_hash_fl6(const struct net *net,
+ const struct flowi6 *fl6)
+{
+ u32 hash_fields = ip6_multipath_hash_fields(net);
+ struct flow_keys hash_keys;
+
+ if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK))
+ return 0;
+
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP)
+ hash_keys.addrs.v6addrs.src = fl6->saddr;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP)
+ hash_keys.addrs.v6addrs.dst = fl6->daddr;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO)
+ hash_keys.basic.ip_proto = fl6->flowi6_proto;
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL)
+ hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) {
+ if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT)
+ hash_keys.ports.src = (__force __be16)get_random_u16();
+ else
+ hash_keys.ports.src = fl6->fl6_sport;
+ }
+ if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT)
+ hash_keys.ports.dst = fl6->fl6_dport;
+
+ return fib_multipath_hash_from_keys(net, &hash_keys);
+}
+
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
const struct sk_buff *skb, struct flow_keys *flkeys)
{
struct flow_keys hash_keys;
- u32 mhash;
+ u32 mhash = 0;
switch (ip6_multipath_hash_policy(net)) {
case 0:
@@ -2021,6 +2526,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
hash_keys.basic.ip_proto = fl6->flowi6_proto;
}
+ mhash = fib_multipath_hash_from_keys(net, &hash_keys);
break;
case 1:
if (skb) {
@@ -2033,7 +2539,7 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
memset(&hash_keys, 0, sizeof(hash_keys));
- if (!flkeys) {
+ if (!flkeys) {
skb_flow_dissect_flow_keys(skb, &keys, flag);
flkeys = &keys;
}
@@ -2048,22 +2554,69 @@ u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
hash_keys.addrs.v6addrs.src = fl6->saddr;
hash_keys.addrs.v6addrs.dst = fl6->daddr;
- hash_keys.ports.src = fl6->fl6_sport;
+ if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT)
+ hash_keys.ports.src = (__force __be16)get_random_u16();
+ else
+ hash_keys.ports.src = fl6->fl6_sport;
hash_keys.ports.dst = fl6->fl6_dport;
hash_keys.basic.ip_proto = fl6->flowi6_proto;
}
+ mhash = fib_multipath_hash_from_keys(net, &hash_keys);
+ break;
+ case 2:
+ memset(&hash_keys, 0, sizeof(hash_keys));
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ if (skb) {
+ struct flow_keys keys;
+
+ if (!flkeys) {
+ skb_flow_dissect_flow_keys(skb, &keys, 0);
+ flkeys = &keys;
+ }
+
+ /* Inner can be v4 or v6 */
+ if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
+ hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
+ } else if (flkeys->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
+ hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
+ hash_keys.tags.flow_label = flkeys->tags.flow_label;
+ hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
+ } else {
+ /* Same as case 0 */
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
+ }
+ } else {
+ /* Same as case 0 */
+ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+ hash_keys.addrs.v6addrs.src = fl6->saddr;
+ hash_keys.addrs.v6addrs.dst = fl6->daddr;
+ hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
+ hash_keys.basic.ip_proto = fl6->flowi6_proto;
+ }
+ mhash = fib_multipath_hash_from_keys(net, &hash_keys);
+ break;
+ case 3:
+ if (skb)
+ mhash = rt6_multipath_custom_hash_skb(net, skb);
+ else
+ mhash = rt6_multipath_custom_hash_fl6(net, fl6);
break;
}
- mhash = flow_hash_from_keys(&hash_keys);
return mhash >> 1;
}
+/* Called with rcu held */
void ip6_route_input(struct sk_buff *skb)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct net *net = dev_net(skb->dev);
- int flags = RT6_LOOKUP_F_HAS_SADDR;
+ int flags = RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_DST_NOREF;
struct ip_tunnel_info *tun_info;
struct flowi6 fl6 = {
.flowi6_iif = skb->dev->ifindex,
@@ -2085,11 +2638,11 @@ void ip6_route_input(struct sk_buff *skb)
if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
skb_dst_drop(skb);
- skb_dst_set(skb,
- ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
+ skb_dst_set_noref(skb, ip6_route_input_lookup(net, skb->dev,
+ &fl6, skb, flags));
}
-static struct rt6_info *ip6_pol_route_output(struct net *net,
+INDIRECT_CALLABLE_SCOPE struct rt6_info *ip6_pol_route_output(struct net *net,
struct fib6_table *table,
struct flowi6 *fl6,
const struct sk_buff *skb,
@@ -2098,14 +2651,18 @@ static struct rt6_info *ip6_pol_route_output(struct net *net,
return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}
-struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
- struct flowi6 *fl6, int flags)
+static struct dst_entry *ip6_route_output_flags_noref(struct net *net,
+ const struct sock *sk,
+ struct flowi6 *fl6,
+ int flags)
{
bool any_src;
- if (rt6_need_strict(&fl6->daddr)) {
+ if (ipv6_addr_type(&fl6->daddr) &
+ (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
struct dst_entry *dst;
+ /* This function does not take refcnt on the dst */
dst = l3mdev_link_scope_lookup(net, fl6);
if (dst)
return dst;
@@ -2113,6 +2670,7 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
fl6->flowi6_iif = LOOPBACK_IFINDEX;
+ flags |= RT6_LOOKUP_F_DST_NOREF;
any_src = ipv6_addr_any(&fl6->saddr);
if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
(fl6->flowi6_oif && any_src))
@@ -2121,19 +2679,40 @@ struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
if (!any_src)
flags |= RT6_LOOKUP_F_HAS_SADDR;
else if (sk)
- flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
+ flags |= rt6_srcprefs2flags(READ_ONCE(inet6_sk(sk)->srcprefs));
return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
+
+struct dst_entry *ip6_route_output_flags(struct net *net,
+ const struct sock *sk,
+ struct flowi6 *fl6,
+ int flags)
+{
+ struct dst_entry *dst;
+ struct rt6_info *rt6;
+
+ rcu_read_lock();
+ dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
+ rt6 = dst_rt6_info(dst);
+ /* For dst cached in uncached_list, refcnt is already taken. */
+ if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) {
+ dst = &net->ipv6.ip6_null_entry->dst;
+ dst_hold(dst);
+ }
+ rcu_read_unlock();
+
+ return dst;
+}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
- struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
+ struct rt6_info *rt, *ort = dst_rt6_info(dst_orig);
struct net_device *loopback_dev = net->loopback_dev;
struct dst_entry *new = NULL;
- rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
+ rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev,
DST_OBSOLETE_DEAD, 0);
if (rt) {
rt6_info_init(rt);
@@ -2183,7 +2762,7 @@ static struct dst_entry *rt6_check(struct rt6_info *rt,
{
u32 rt_cookie = 0;
- if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
+ if (!from || !fib6_get_cookie_safe(from, &rt_cookie) ||
rt_cookie != cookie)
return NULL;
@@ -2198,20 +2777,23 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
u32 cookie)
{
if (!__rt6_check_expired(rt) &&
- rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
+ READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK &&
fib6_check(from, cookie))
return &rt->dst;
- else
- return NULL;
+ return NULL;
}
-static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
+INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst,
+ u32 cookie)
{
struct dst_entry *dst_ret;
struct fib6_info *from;
struct rt6_info *rt;
- rt = container_of(dst, struct rt6_info, dst);
+ rt = dst_rt6_info(dst);
+
+ if (rt->sernum)
+ return rt6_is_valid(rt) ? dst : NULL;
rcu_read_lock();
@@ -2223,7 +2805,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
from = rcu_dereference(rt->from);
if (from && (rt->rt6i_flags & RTF_PCPU ||
- unlikely(!list_empty(&rt->rt6i_uncached))))
+ unlikely(!list_empty(&rt->dst.rt_uncached))))
dst_ret = rt6_dst_from_check(rt, from, cookie);
else
dst_ret = rt6_check(rt, from, cookie);
@@ -2232,25 +2814,26 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
return dst_ret;
}
+EXPORT_INDIRECT_CALLABLE(ip6_dst_check);
-static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
+static void ip6_negative_advice(struct sock *sk,
+ struct dst_entry *dst)
{
- struct rt6_info *rt = (struct rt6_info *) dst;
+ struct rt6_info *rt = dst_rt6_info(dst);
- if (rt) {
- if (rt->rt6i_flags & RTF_CACHE) {
- rcu_read_lock();
- if (rt6_check_expired(rt)) {
- rt6_remove_exception_rt(rt);
- dst = NULL;
- }
- rcu_read_unlock();
- } else {
- dst_release(dst);
- dst = NULL;
+ if (rt->rt6i_flags & RTF_CACHE) {
+ rcu_read_lock();
+ if (rt6_check_expired(rt)) {
+ /* rt/dst can not be destroyed yet,
+ * because of rcu_read_lock()
+ */
+ sk_dst_reset(sk);
+ rt6_remove_exception_rt(rt);
}
+ rcu_read_unlock();
+ return;
}
- return dst;
+ sk_dst_reset(sk);
}
static void ip6_link_failure(struct sk_buff *skb)
@@ -2259,12 +2842,11 @@ static void ip6_link_failure(struct sk_buff *skb)
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
- rt = (struct rt6_info *) skb_dst(skb);
+ rt = dst_rt6_info(skb_dst(skb));
if (rt) {
rcu_read_lock();
if (rt->rt6i_flags & RTF_CACHE) {
- if (dst_hold_safe(&rt->dst))
- rt6_remove_exception_rt(rt);
+ rt6_remove_exception_rt(rt);
} else {
struct fib6_info *from;
struct fib6_node *fn;
@@ -2273,7 +2855,7 @@ static void ip6_link_failure(struct sk_buff *skb)
if (from) {
fn = rcu_dereference(from->fib6_node);
if (fn && (rt->rt6i_flags & RTF_DEFAULT))
- fn->fn_sernum = -1;
+ WRITE_ONCE(fn->fn_sernum, -1);
}
}
rcu_read_unlock();
@@ -2288,7 +2870,7 @@ static void rt6_update_expires(struct rt6_info *rt0, int timeout)
rcu_read_lock();
from = rcu_dereference(rt0->from);
if (from)
- rt0->dst.expires = from->expires;
+ WRITE_ONCE(rt0->dst.expires, from->expires);
rcu_read_unlock();
}
@@ -2307,24 +2889,21 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
- bool from_set;
-
- rcu_read_lock();
- from_set = !!rcu_dereference(rt->from);
- rcu_read_unlock();
-
return !(rt->rt6i_flags & RTF_CACHE) &&
- (rt->rt6i_flags & RTF_PCPU || from_set);
+ (rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
}
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
- const struct ipv6hdr *iph, u32 mtu)
+ const struct ipv6hdr *iph, u32 mtu,
+ bool confirm_neigh)
{
const struct in6_addr *daddr, *saddr;
- struct rt6_info *rt6 = (struct rt6_info *)dst;
+ struct rt6_info *rt6 = dst_rt6_info(dst);
- if (dst_metric_locked(dst, RTAX_MTU))
- return;
+ /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
+ * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
+ * [see also comment in rt6_mtu_change_route()]
+ */
if (iph) {
daddr = &iph->daddr;
@@ -2336,8 +2915,12 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
daddr = NULL;
saddr = NULL;
}
- dst_confirm_neigh(dst, daddr);
- mtu = max_t(u32, mtu, IPV6_MIN_MTU);
+
+ if (confirm_neigh)
+ dst_confirm_neigh(dst, daddr);
+
+ if (mtu < IPV6_MIN_MTU)
+ return;
if (mtu >= dst_mtu(dst))
return;
@@ -2347,25 +2930,54 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
if (rt6->rt6i_flags & RTF_CACHE)
rt6_update_exception_stamp_rt(rt6);
} else if (daddr) {
- struct fib6_info *from;
+ struct fib6_result res = {};
struct rt6_info *nrt6;
rcu_read_lock();
- from = rcu_dereference(rt6->from);
- nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
+ res.f6i = rcu_dereference(rt6->from);
+ if (!res.f6i)
+ goto out_unlock;
+
+ res.fib6_flags = res.f6i->fib6_flags;
+ res.fib6_type = res.f6i->fib6_type;
+
+ if (res.f6i->nh) {
+ struct fib6_nh_match_arg arg = {
+ .dev = dst_dev_rcu(dst),
+ .gw = &rt6->rt6i_gateway,
+ };
+
+ nexthop_for_each_fib6_nh(res.f6i->nh,
+ fib6_nh_find_match, &arg);
+
+ /* fib6_info uses a nexthop that does not have fib6_nh
+ * using the dst->dev + gw. Should be impossible.
+ */
+ if (!arg.match)
+ goto out_unlock;
+
+ res.nh = arg.match;
+ } else {
+ res.nh = res.f6i->fib6_nh;
+ }
+
+ nrt6 = ip6_rt_cache_alloc(&res, daddr, saddr);
if (nrt6) {
rt6_do_update_pmtu(nrt6, mtu);
- if (rt6_insert_exception(nrt6, from))
+ if (rt6_insert_exception(nrt6, &res))
dst_release_immediate(&nrt6->dst);
}
+out_unlock:
rcu_read_unlock();
}
}
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
- __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
+ __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu,
+ confirm_neigh);
}
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
@@ -2373,32 +2985,35 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
{
const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
struct dst_entry *dst;
- struct flowi6 fl6;
-
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_oif = oif;
- fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
- fl6.daddr = iph->daddr;
- fl6.saddr = iph->saddr;
- fl6.flowlabel = ip6_flowinfo(iph);
- fl6.flowi6_uid = uid;
+ struct flowi6 fl6 = {
+ .flowi6_oif = oif,
+ .flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowlabel = ip6_flowinfo(iph),
+ .flowi6_uid = uid,
+ };
dst = ip6_route_output(net, NULL, &fl6);
if (!dst->error)
- __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
+ __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
+ int oif = sk->sk_bound_dev_if;
struct dst_entry *dst;
- ip6_update_pmtu(skb, sock_net(sk), mtu,
- sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
+ if (!oif && skb->dev)
+ oif = l3mdev_master_ifindex(skb->dev);
+
+ ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark),
+ sk_uid(sk));
dst = __sk_dst_get(sk);
- if (!dst || !dst->obsolete ||
+ if (!dst || !READ_ONCE(dst->obsolete) ||
dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
return;
@@ -2417,13 +3032,57 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
#endif
ip6_dst_store(sk, dst,
- ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
- &sk->sk_v6_daddr : NULL,
+ ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr),
#ifdef CONFIG_IPV6_SUBTREES
ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
- &np->saddr :
+ true :
#endif
- NULL);
+ false);
+}
+
+static bool ip6_redirect_nh_match(const struct fib6_result *res,
+ struct flowi6 *fl6,
+ const struct in6_addr *gw,
+ struct rt6_info **ret)
+{
+ const struct fib6_nh *nh = res->nh;
+
+ if (nh->fib_nh_flags & RTNH_F_DEAD || !nh->fib_nh_gw_family ||
+ fl6->flowi6_oif != nh->fib_nh_dev->ifindex)
+ return false;
+
+ /* rt_cache's gateway might be different from its 'parent'
+ * in the case of an ip redirect.
+ * So we keep searching in the exception table if the gateway
+ * is different.
+ */
+ if (!ipv6_addr_equal(gw, &nh->fib_nh_gw6)) {
+ struct rt6_info *rt_cache;
+
+ rt_cache = rt6_find_cached_rt(res, &fl6->daddr, &fl6->saddr);
+ if (rt_cache &&
+ ipv6_addr_equal(gw, &rt_cache->rt6i_gateway)) {
+ *ret = rt_cache;
+ return true;
+ }
+ return false;
+ }
+ return true;
+}
+
+struct fib6_nh_rd_arg {
+ struct fib6_result *res;
+ struct flowi6 *fl6;
+ const struct in6_addr *gw;
+ struct rt6_info **ret;
+};
+
+static int fib6_nh_redirect_match(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_rd_arg *arg = _arg;
+
+ arg->res->nh = nh;
+ return ip6_redirect_nh_match(arg->res, arg->fl6, arg->gw, arg->ret);
}
/* Handle redirects */
@@ -2432,14 +3091,21 @@ struct ip6rd_flowi {
struct in6_addr gateway;
};
-static struct rt6_info *__ip6_route_redirect(struct net *net,
+INDIRECT_CALLABLE_SCOPE struct rt6_info *__ip6_route_redirect(struct net *net,
struct fib6_table *table,
struct flowi6 *fl6,
const struct sk_buff *skb,
int flags)
{
struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
- struct rt6_info *ret = NULL, *rt_cache;
+ struct rt6_info *ret = NULL;
+ struct fib6_result res = {};
+ struct fib6_nh_rd_arg arg = {
+ .res = &res,
+ .fl6 = fl6,
+ .gw = &rdfl->gateway,
+ .ret = &ret
+ };
struct fib6_info *rt;
struct fib6_node *fn;
@@ -2457,34 +3123,25 @@ static struct rt6_info *__ip6_route_redirect(struct net *net,
fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
for_each_fib6_node_rt_rcu(fn) {
- if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
- continue;
+ res.f6i = rt;
if (fib6_check_expired(rt))
continue;
if (rt->fib6_flags & RTF_REJECT)
break;
- if (!(rt->fib6_flags & RTF_GATEWAY))
- continue;
- if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
- continue;
- /* rt_cache's gateway might be different from its 'parent'
- * in the case of an ip redirect.
- * So we keep searching in the exception table if the gateway
- * is different.
- */
- if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
- rt_cache = rt6_find_cached_rt(rt,
- &fl6->daddr,
- &fl6->saddr);
- if (rt_cache &&
- ipv6_addr_equal(&rdfl->gateway,
- &rt_cache->rt6i_gateway)) {
- ret = rt_cache;
- break;
- }
- continue;
+ if (unlikely(rt->nh)) {
+ if (nexthop_is_blackhole(rt->nh))
+ continue;
+ /* on match, res->nh is filled in and potentially ret */
+ if (nexthop_for_each_fib6_nh(rt->nh,
+ fib6_nh_redirect_match,
+ &arg))
+ goto out;
+ } else {
+ res.nh = rt->fib6_nh;
+ if (ip6_redirect_nh_match(&res, fl6, &rdfl->gateway,
+ &ret))
+ goto out;
}
- break;
}
if (!rt)
@@ -2500,15 +3157,20 @@ restart:
goto restart;
}
+ res.f6i = rt;
+ res.nh = rt->fib6_nh;
out:
- if (ret)
- ip6_hold_safe(net, &ret, true);
- else
- ret = ip6_create_rt_rcu(rt);
+ if (ret) {
+ ip6_hold_safe(net, &ret);
+ } else {
+ res.fib6_flags = res.f6i->fib6_flags;
+ res.fib6_type = res.f6i->fib6_type;
+ ret = ip6_create_rt_rcu(&res);
+ }
rcu_read_unlock();
- trace_fib6_table_lookup(net, rt, table, fl6);
+ trace_fib6_table_lookup(net, &res, table, fl6);
return ret;
};
@@ -2532,16 +3194,15 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
{
const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
struct dst_entry *dst;
- struct flowi6 fl6;
-
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_iif = LOOPBACK_IFINDEX;
- fl6.flowi6_oif = oif;
- fl6.flowi6_mark = mark;
- fl6.daddr = iph->daddr;
- fl6.saddr = iph->saddr;
- fl6.flowlabel = ip6_flowinfo(iph);
- fl6.flowi6_uid = uid;
+ struct flowi6 fl6 = {
+ .flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_oif = oif,
+ .flowi6_mark = mark,
+ .daddr = iph->daddr,
+ .saddr = iph->saddr,
+ .flowlabel = ip6_flowinfo(iph),
+ .flowi6_uid = uid,
+ };
dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
rt6_do_redirect(dst, NULL, skb);
@@ -2549,21 +3210,18 @@ void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
}
EXPORT_SYMBOL_GPL(ip6_redirect);
-void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
- u32 mark)
+void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
struct dst_entry *dst;
- struct flowi6 fl6;
-
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_iif = LOOPBACK_IFINDEX;
- fl6.flowi6_oif = oif;
- fl6.flowi6_mark = mark;
- fl6.daddr = msg->dest;
- fl6.saddr = iph->daddr;
- fl6.flowi6_uid = sock_net_uid(net, NULL);
+ struct flowi6 fl6 = {
+ .flowi6_iif = LOOPBACK_IFINDEX,
+ .flowi6_oif = oif,
+ .daddr = msg->dest,
+ .saddr = iph->daddr,
+ .flowi6_uid = sock_net_uid(net, NULL),
+ };
dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
rt6_do_redirect(dst, NULL, skb);
@@ -2572,22 +3230,26 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
- ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
- sk->sk_uid);
+ ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
+ READ_ONCE(sk->sk_mark), sk_uid(sk));
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
- struct net_device *dev = dst->dev;
unsigned int mtu = dst_mtu(dst);
- struct net *net = dev_net(dev);
+ struct net *net;
mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
+ rcu_read_lock();
+
+ net = dst_dev_net_rcu(dst);
if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
+ rcu_read_unlock();
+
/*
* Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
* corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
@@ -2599,28 +3261,11 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
return mtu;
}
-static unsigned int ip6_mtu(const struct dst_entry *dst)
+INDIRECT_CALLABLE_SCOPE unsigned int ip6_mtu(const struct dst_entry *dst)
{
- struct inet6_dev *idev;
- unsigned int mtu;
-
- mtu = dst_metric_raw(dst, RTAX_MTU);
- if (mtu)
- goto out;
-
- mtu = IPV6_MIN_MTU;
-
- rcu_read_lock();
- idev = __in6_dev_get(dst->dev);
- if (idev)
- mtu = idev->cnf.mtu6;
- rcu_read_unlock();
-
-out:
- mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
-
- return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
+ return ip6_dst_mtu_maybe_forward(dst, false);
}
+EXPORT_INDIRECT_CALLABLE(ip6_mtu);
/* MTU selection:
* 1. mtu on route is locked - use it
@@ -2630,13 +3275,14 @@ out:
* based on ip6_dst_mtu_forward and exception logic of
* rt6_find_cached_rt; called with rcu_read_lock
*/
-u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
- struct in6_addr *saddr)
+u32 ip6_mtu_from_fib6(const struct fib6_result *res,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr)
{
- struct rt6_exception_bucket *bucket;
- struct rt6_exception *rt6_ex;
- struct in6_addr *src_key;
+ const struct fib6_nh *nh = res->nh;
+ struct fib6_info *f6i = res->f6i;
struct inet6_dev *idev;
+ struct rt6_info *rt;
u32 mtu = 0;
if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
@@ -2645,29 +3291,21 @@ u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
goto out;
}
- src_key = NULL;
-#ifdef CONFIG_IPV6_SUBTREES
- if (f6i->fib6_src.plen)
- src_key = saddr;
-#endif
-
- bucket = rcu_dereference(f6i->rt6i_exception_bucket);
- rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
- if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
- mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);
-
- if (likely(!mtu)) {
- struct net_device *dev = fib6_info_nh_dev(f6i);
+ rt = rt6_find_cached_rt(res, daddr, saddr);
+ if (unlikely(rt)) {
+ mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
+ } else {
+ struct net_device *dev = nh->fib_nh_dev;
mtu = IPV6_MIN_MTU;
idev = __in6_dev_get(dev);
- if (idev && idev->cnf.mtu6 > mtu)
- mtu = idev->cnf.mtu6;
+ if (idev)
+ mtu = max_t(u32, mtu, READ_ONCE(idev->cnf.mtu6));
}
mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
- return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
+ return mtu - lwtunnel_headroom(nh->fib_nh_lws, mtu);
}
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
@@ -2688,7 +3326,6 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
goto out;
}
- rt->dst.flags |= DST_HOST;
rt->dst.input = ip6_input;
rt->dst.output = ip6_output;
rt->rt6i_gateway = fl6->daddr;
@@ -2701,7 +3338,6 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
* do proper release of the net_device
*/
rt6_uncached_list_add(rt);
- atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
@@ -2709,53 +3345,31 @@ out:
return dst;
}
-static int ip6_dst_gc(struct dst_ops *ops)
+static void ip6_dst_gc(struct dst_ops *ops)
{
struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
- int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
+ unsigned int val;
int entries;
- entries = dst_entries_get_fast(ops);
- if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
- entries <= rt_max_size)
+ if (time_after(rt_last_gc + rt_min_interval, jiffies))
goto out;
- net->ipv6.ip6_rt_gc_expire++;
- fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
+ fib6_run_gc(atomic_inc_return(&net->ipv6.ip6_rt_gc_expire), net, true);
entries = dst_entries_get_slow(ops);
if (entries < ops->gc_thresh)
- net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
+ atomic_set(&net->ipv6.ip6_rt_gc_expire, rt_gc_timeout >> 1);
out:
- net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
- return entries > rt_max_size;
+ val = atomic_read(&net->ipv6.ip6_rt_gc_expire);
+ atomic_set(&net->ipv6.ip6_rt_gc_expire, val - (val >> rt_elasticity));
}
-static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
- struct fib6_config *cfg)
-{
- struct dst_metrics *p;
-
- if (!cfg->fc_mx)
- return 0;
-
- p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
- if (unlikely(!p))
- return -ENOMEM;
-
- refcount_set(&p->refcnt, 1);
- rt->fib6_metrics = p;
-
- return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
-}
-
-static struct rt6_info *ip6_nh_lookup_table(struct net *net,
- struct fib6_config *cfg,
- const struct in6_addr *gw_addr,
- u32 tbid, int flags)
+static int ip6_nh_lookup_table(struct net *net, struct fib6_config *cfg,
+ const struct in6_addr *gw_addr, u32 tbid,
+ int flags, struct fib6_result *res)
{
struct flowi6 fl6 = {
.flowi6_oif = cfg->fc_ifindex,
@@ -2763,25 +3377,23 @@ static struct rt6_info *ip6_nh_lookup_table(struct net *net,
.saddr = cfg->fc_prefsrc,
};
struct fib6_table *table;
- struct rt6_info *rt;
+ int err;
table = fib6_get_table(net, tbid);
if (!table)
- return NULL;
+ return -EINVAL;
if (!ipv6_addr_any(&cfg->fc_prefsrc))
flags |= RT6_LOOKUP_F_HAS_SADDR;
flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
- rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
- /* if table lookup failed, fall back to full lookup */
- if (rt == net->ipv6.ip6_null_entry) {
- ip6_rt_put(rt);
- rt = NULL;
- }
+ err = fib6_table_lookup(net, table, cfg->fc_ifindex, &fl6, res, flags);
+ if (!err && res->f6i != net->ipv6.fib6_null_entry)
+ fib6_select_path(net, res, &fl6, cfg->fc_ifindex,
+ cfg->fc_ifindex != 0, NULL, flags);
- return rt;
+ return err;
}
static int ip6_route_check_nh_onlink(struct net *net,
@@ -2789,23 +3401,19 @@ static int ip6_route_check_nh_onlink(struct net *net,
const struct net_device *dev,
struct netlink_ext_ack *extack)
{
- u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
+ u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
const struct in6_addr *gw_addr = &cfg->fc_gateway;
- u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
- struct rt6_info *grt;
+ struct fib6_result res = {};
int err;
- err = 0;
- grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
- if (grt) {
- if (!grt->dst.error &&
- (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
- NL_SET_ERR_MSG(extack,
- "Nexthop has invalid gateway or device mismatch");
- err = -EINVAL;
- }
-
- ip6_rt_put(grt);
+ err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
+ if (!err && !(res.fib6_flags & RTF_REJECT) &&
+ /* ignore match if it is the default route */
+ !ipv6_addr_any(&res.f6i->fib6_dst.addr) &&
+ (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop has invalid gateway or device mismatch");
+ err = -EINVAL;
}
return err;
@@ -2814,56 +3422,62 @@ static int ip6_route_check_nh_onlink(struct net *net,
static int ip6_route_check_nh(struct net *net,
struct fib6_config *cfg,
struct net_device **_dev,
+ netdevice_tracker *dev_tracker,
struct inet6_dev **idev)
{
const struct in6_addr *gw_addr = &cfg->fc_gateway;
struct net_device *dev = _dev ? *_dev : NULL;
- struct rt6_info *grt = NULL;
+ int flags = RT6_LOOKUP_F_IFACE;
+ struct fib6_result res = {};
int err = -EHOSTUNREACH;
if (cfg->fc_table) {
- int flags = RT6_LOOKUP_F_IFACE;
-
- grt = ip6_nh_lookup_table(net, cfg, gw_addr,
- cfg->fc_table, flags);
- if (grt) {
- if (grt->rt6i_flags & RTF_GATEWAY ||
- (dev && dev != grt->dst.dev)) {
- ip6_rt_put(grt);
- grt = NULL;
- }
- }
+ err = ip6_nh_lookup_table(net, cfg, gw_addr,
+ cfg->fc_table, flags, &res);
+ /* gw_addr can not require a gateway or resolve to a reject
+ * route. If a device is given, it must match the result.
+ */
+ if (err || res.fib6_flags & RTF_REJECT ||
+ res.nh->fib_nh_gw_family ||
+ (dev && dev != res.nh->fib_nh_dev))
+ err = -EHOSTUNREACH;
}
- if (!grt)
- grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
+ if (err < 0) {
+ struct flowi6 fl6 = {
+ .flowi6_oif = cfg->fc_ifindex,
+ .daddr = *gw_addr,
+ };
- if (!grt)
- goto out;
+ err = fib6_lookup(net, cfg->fc_ifindex, &fl6, &res, flags);
+ if (err || res.fib6_flags & RTF_REJECT ||
+ res.nh->fib_nh_gw_family)
+ err = -EHOSTUNREACH;
+
+ if (err)
+ return err;
+ fib6_select_path(net, &res, &fl6, cfg->fc_ifindex,
+ cfg->fc_ifindex != 0, NULL, flags);
+ }
+
+ err = 0;
if (dev) {
- if (dev != grt->dst.dev) {
- ip6_rt_put(grt);
- goto out;
- }
+ if (dev != res.nh->fib_nh_dev)
+ err = -EHOSTUNREACH;
} else {
- *_dev = dev = grt->dst.dev;
- *idev = grt->rt6i_idev;
- dev_hold(dev);
- in6_dev_hold(grt->rt6i_idev);
+ *_dev = dev = res.nh->fib_nh_dev;
+ netdev_hold(dev, dev_tracker, GFP_ATOMIC);
+ *idev = in6_dev_get(dev);
}
- if (!(grt->rt6i_flags & RTF_GATEWAY))
- err = 0;
-
- ip6_rt_put(grt);
-
-out:
return err;
}
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
- struct net_device **_dev, struct inet6_dev **idev,
+ struct net_device **_dev,
+ netdevice_tracker *dev_tracker,
+ struct inet6_dev **idev,
struct netlink_ext_ack *extack)
{
const struct in6_addr *gw_addr = &cfg->fc_gateway;
@@ -2899,10 +3513,15 @@ static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
goto out;
}
+ rcu_read_lock();
+
if (cfg->fc_flags & RTNH_F_ONLINK)
err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
else
- err = ip6_route_check_nh(net, cfg, _dev, idev);
+ err = ip6_route_check_nh(net, cfg, _dev, dev_tracker,
+ idev);
+
+ rcu_read_unlock();
if (err)
goto out;
@@ -2935,79 +3554,243 @@ out:
return err;
}
-static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
- gfp_t gfp_flags,
- struct netlink_ext_ack *extack)
+static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
{
- struct net *net = cfg->fc_nlinfo.nl_net;
- struct fib6_info *rt = NULL;
+ if ((flags & RTF_REJECT) ||
+ (dev && (dev->flags & IFF_LOOPBACK) &&
+ !(addr_type & IPV6_ADDR_LOOPBACK) &&
+ !(flags & (RTF_ANYCAST | RTF_LOCAL))))
+ return true;
+
+ return false;
+}
+
+int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
+ struct fib6_config *cfg, gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
+{
+ netdevice_tracker *dev_tracker = &fib6_nh->fib_nh_dev_tracker;
struct net_device *dev = NULL;
struct inet6_dev *idev = NULL;
- struct fib6_table *table;
int addr_type;
- int err = -EINVAL;
+ int err;
+
+ fib6_nh->fib_nh_family = AF_INET6;
+#ifdef CONFIG_IPV6_ROUTER_PREF
+ fib6_nh->last_probe = jiffies;
+#endif
+ if (cfg->fc_is_fdb) {
+ fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
+ fib6_nh->fib_nh_gw_family = AF_INET6;
+ return 0;
+ }
+
+ err = -ENODEV;
+ if (cfg->fc_ifindex) {
+ dev = netdev_get_by_index(net, cfg->fc_ifindex,
+ dev_tracker, gfp_flags);
+ if (!dev)
+ goto out;
+ idev = in6_dev_get(dev);
+ if (!idev)
+ goto out;
+ }
+
+ if (cfg->fc_flags & RTNH_F_ONLINK) {
+ if (!dev) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop device required for onlink");
+ goto out;
+ }
+
+ if (!(dev->flags & IFF_UP)) {
+ NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+ err = -ENETDOWN;
+ goto out;
+ }
+
+ fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
+ }
+
+ fib6_nh->fib_nh_weight = 1;
+
+ /* We cannot add true routes via loopback here,
+ * they would result in kernel looping; promote them to reject routes
+ */
+ addr_type = ipv6_addr_type(&cfg->fc_dst);
+ if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
+ /* hold loopback dev/idev if we haven't done so. */
+ if (dev != net->loopback_dev) {
+ if (dev) {
+ netdev_put(dev, dev_tracker);
+ in6_dev_put(idev);
+ }
+ dev = net->loopback_dev;
+ netdev_hold(dev, dev_tracker, gfp_flags);
+ idev = in6_dev_get(dev);
+ if (!idev) {
+ err = -ENODEV;
+ goto out;
+ }
+ }
+ goto pcpu_alloc;
+ }
+
+ if (cfg->fc_flags & RTF_GATEWAY) {
+ err = ip6_validate_gw(net, cfg, &dev, dev_tracker,
+ &idev, extack);
+ if (err)
+ goto out;
+
+ fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
+ fib6_nh->fib_nh_gw_family = AF_INET6;
+ }
+
+ err = -ENODEV;
+ if (!dev)
+ goto out;
+
+ if (!idev || idev->cnf.disable_ipv6) {
+ NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
+ err = -EACCES;
+ goto out;
+ }
+
+ if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
+ NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+ err = -ENETDOWN;
+ goto out;
+ }
+
+ if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
+ !netif_carrier_ok(dev))
+ fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
+
+ err = fib_nh_common_init(net, &fib6_nh->nh_common, cfg->fc_encap,
+ cfg->fc_encap_type, cfg, gfp_flags, extack);
+ if (err)
+ goto out;
+pcpu_alloc:
+ fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
+ if (!fib6_nh->rt6i_pcpu) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ fib6_nh->fib_nh_dev = dev;
+ fib6_nh->fib_nh_oif = dev->ifindex;
+ err = 0;
+out:
+ if (idev)
+ in6_dev_put(idev);
+
+ if (err) {
+ fib_nh_common_release(&fib6_nh->nh_common);
+ fib6_nh->nh_common.nhc_pcpu_rth_output = NULL;
+ fib6_nh->fib_nh_lws = NULL;
+ netdev_put(dev, dev_tracker);
+ }
+
+ return err;
+}
+
+void fib6_nh_release(struct fib6_nh *fib6_nh)
+{
+ struct rt6_exception_bucket *bucket;
+
+ rcu_read_lock();
+
+ fib6_nh_flush_exceptions(fib6_nh, NULL);
+ bucket = fib6_nh_get_excptn_bucket(fib6_nh, NULL);
+ if (bucket) {
+ rcu_assign_pointer(fib6_nh->rt6i_exception_bucket, NULL);
+ kfree(bucket);
+ }
+
+ rcu_read_unlock();
+
+ fib6_nh_release_dsts(fib6_nh);
+ free_percpu(fib6_nh->rt6i_pcpu);
+
+ fib_nh_common_release(&fib6_nh->nh_common);
+}
+
+void fib6_nh_release_dsts(struct fib6_nh *fib6_nh)
+{
+ int cpu;
+
+ if (!fib6_nh->rt6i_pcpu)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct rt6_info *pcpu_rt, **ppcpu_rt;
+
+ ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
+ pcpu_rt = xchg(ppcpu_rt, NULL);
+ if (pcpu_rt) {
+ dst_dev_put(&pcpu_rt->dst);
+ dst_release(&pcpu_rt->dst);
+ }
+ }
+}
+
+static int fib6_config_validate(struct fib6_config *cfg,
+ struct netlink_ext_ack *extack)
+{
/* RTF_PCPU is an internal flag; can not be set by userspace */
if (cfg->fc_flags & RTF_PCPU) {
NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
- goto out;
+ goto errout;
}
/* RTF_CACHE is an internal flag; can not be set by userspace */
if (cfg->fc_flags & RTF_CACHE) {
NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
- goto out;
+ goto errout;
}
if (cfg->fc_type > RTN_MAX) {
NL_SET_ERR_MSG(extack, "Invalid route type");
- goto out;
+ goto errout;
}
if (cfg->fc_dst_len > 128) {
NL_SET_ERR_MSG(extack, "Invalid prefix length");
- goto out;
+ goto errout;
}
+
+#ifdef CONFIG_IPV6_SUBTREES
if (cfg->fc_src_len > 128) {
NL_SET_ERR_MSG(extack, "Invalid source address length");
- goto out;
+ goto errout;
+ }
+
+ if (cfg->fc_nh_id && cfg->fc_src_len) {
+ NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
+ goto errout;
}
-#ifndef CONFIG_IPV6_SUBTREES
+#else
if (cfg->fc_src_len) {
NL_SET_ERR_MSG(extack,
"Specifying source address requires IPV6_SUBTREES to be enabled");
- goto out;
+ goto errout;
}
#endif
- if (cfg->fc_ifindex) {
- err = -ENODEV;
- dev = dev_get_by_index(net, cfg->fc_ifindex);
- if (!dev)
- goto out;
- idev = in6_dev_get(dev);
- if (!idev)
- goto out;
- }
-
- if (cfg->fc_metric == 0)
- cfg->fc_metric = IP6_RT_PRIO_USER;
-
- if (cfg->fc_flags & RTNH_F_ONLINK) {
- if (!dev) {
- NL_SET_ERR_MSG(extack,
- "Nexthop device required for onlink");
- err = -ENODEV;
- goto out;
- }
+ return 0;
+errout:
+ return -EINVAL;
+}
- if (!(dev->flags & IFF_UP)) {
- NL_SET_ERR_MSG(extack, "Nexthop device is not up");
- err = -ENETDOWN;
- goto out;
- }
- }
+static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
+ gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = cfg->fc_nlinfo.nl_net;
+ struct fib6_table *table;
+ struct fib6_info *rt;
+ int err;
- err = -ENOBUFS;
if (cfg->fc_nlinfo.nlh &&
!(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
table = fib6_get_table(net, cfg->fc_table);
@@ -3018,144 +3801,128 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
} else {
table = fib6_new_table(net, cfg->fc_table);
}
+ if (!table) {
+ err = -ENOBUFS;
+ goto err;
+ }
- if (!table)
- goto out;
+ rt = fib6_info_alloc(gfp_flags, !cfg->fc_nh_id);
+ if (!rt) {
+ err = -ENOMEM;
+ goto err;
+ }
- err = -ENOMEM;
- rt = fib6_info_alloc(gfp_flags);
- if (!rt)
- goto out;
+ rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len,
+ extack);
+ if (IS_ERR(rt->fib6_metrics)) {
+ err = PTR_ERR(rt->fib6_metrics);
+ goto free;
+ }
if (cfg->fc_flags & RTF_ADDRCONF)
rt->dst_nocount = true;
- err = ip6_convert_metrics(net, rt, cfg);
- if (err < 0)
- goto out;
-
if (cfg->fc_flags & RTF_EXPIRES)
fib6_set_expires(rt, jiffies +
- clock_t_to_jiffies(cfg->fc_expires));
- else
- fib6_clean_expires(rt);
+ clock_t_to_jiffies(cfg->fc_expires));
if (cfg->fc_protocol == RTPROT_UNSPEC)
cfg->fc_protocol = RTPROT_BOOT;
- rt->fib6_protocol = cfg->fc_protocol;
-
- addr_type = ipv6_addr_type(&cfg->fc_dst);
- if (cfg->fc_encap) {
- struct lwtunnel_state *lwtstate;
-
- err = lwtunnel_build_state(cfg->fc_encap_type,
- cfg->fc_encap, AF_INET6, cfg,
- &lwtstate, extack);
- if (err)
- goto out;
- rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
- }
+ rt->fib6_protocol = cfg->fc_protocol;
+ rt->fib6_table = table;
+ rt->fib6_metric = cfg->fc_metric;
+ rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
+ rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;
ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
rt->fib6_dst.plen = cfg->fc_dst_len;
- if (rt->fib6_dst.plen == 128)
- rt->dst_host = true;
#ifdef CONFIG_IPV6_SUBTREES
ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
rt->fib6_src.plen = cfg->fc_src_len;
#endif
+ return rt;
+free:
+ kfree(rt);
+err:
+ return ERR_PTR(err);
+}
- rt->fib6_metric = cfg->fc_metric;
- rt->fib6_nh.nh_weight = 1;
+static int ip6_route_info_create_nh(struct fib6_info *rt,
+ struct fib6_config *cfg,
+ gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = cfg->fc_nlinfo.nl_net;
+ struct fib6_nh *fib6_nh;
+ int err;
- rt->fib6_type = cfg->fc_type;
+ if (cfg->fc_nh_id) {
+ struct nexthop *nh;
- /* We cannot add true routes via loopback here,
- they would result in kernel looping; promote them to reject routes
- */
- if ((cfg->fc_flags & RTF_REJECT) ||
- (dev && (dev->flags & IFF_LOOPBACK) &&
- !(addr_type & IPV6_ADDR_LOOPBACK) &&
- !(cfg->fc_flags & RTF_LOCAL))) {
- /* hold loopback dev/idev if we haven't done so. */
- if (dev != net->loopback_dev) {
- if (dev) {
- dev_put(dev);
- in6_dev_put(idev);
- }
- dev = net->loopback_dev;
- dev_hold(dev);
- idev = in6_dev_get(dev);
- if (!idev) {
- err = -ENODEV;
- goto out;
- }
+ rcu_read_lock();
+
+ nh = nexthop_find_by_id(net, cfg->fc_nh_id);
+ if (!nh) {
+ err = -EINVAL;
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ goto out_free;
}
- rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
- goto install_route;
- }
- if (cfg->fc_flags & RTF_GATEWAY) {
- err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
+ err = fib6_check_nexthop(nh, cfg, extack);
if (err)
- goto out;
+ goto out_free;
- rt->fib6_nh.nh_gw = cfg->fc_gateway;
- }
+ if (!nexthop_get(nh)) {
+ NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+ err = -ENOENT;
+ goto out_free;
+ }
- err = -ENODEV;
- if (!dev)
- goto out;
+ rt->nh = nh;
+ fib6_nh = nexthop_fib6_nh(rt->nh);
- if (idev->cnf.disable_ipv6) {
- NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
- err = -EACCES;
- goto out;
- }
+ rcu_read_unlock();
+ } else {
+ int addr_type;
- if (!(dev->flags & IFF_UP)) {
- NL_SET_ERR_MSG(extack, "Nexthop device is not up");
- err = -ENETDOWN;
- goto out;
+ err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
+ if (err)
+ goto out_release;
+
+ fib6_nh = rt->fib6_nh;
+
+ /* We cannot add true routes via loopback here, they would
+ * result in kernel looping; promote them to reject routes
+ */
+ addr_type = ipv6_addr_type(&cfg->fc_dst);
+ if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
+ addr_type))
+ rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
}
if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
+ struct net_device *dev = fib6_nh->fib_nh_dev;
+
if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
NL_SET_ERR_MSG(extack, "Invalid source address");
err = -EINVAL;
- goto out;
+ goto out_release;
}
rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
rt->fib6_prefsrc.plen = 128;
- } else
- rt->fib6_prefsrc.plen = 0;
-
- rt->fib6_flags = cfg->fc_flags;
-
-install_route:
- if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
- !netif_carrier_ok(dev))
- rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
- rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
- rt->fib6_nh.nh_dev = dev;
- rt->fib6_table = table;
-
- cfg->fc_nlinfo.nl_net = dev_net(dev);
-
- if (idev)
- in6_dev_put(idev);
-
- return rt;
-out:
- if (dev)
- dev_put(dev);
- if (idev)
- in6_dev_put(idev);
+ }
+ return 0;
+out_release:
fib6_info_release(rt);
- return ERR_PTR(err);
+ return err;
+out_free:
+ rcu_read_unlock();
+ ip_fib_metrics_put(rt->fib6_metrics);
+ kfree(rt);
+ return err;
}
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
@@ -3164,10 +3931,18 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
struct fib6_info *rt;
int err;
+ err = fib6_config_validate(cfg, extack);
+ if (err)
+ return err;
+
rt = ip6_route_info_create(cfg, gfp_flags, extack);
if (IS_ERR(rt))
return PTR_ERR(rt);
+ err = ip6_route_info_create_nh(rt, cfg, gfp_flags, extack);
+ if (err)
+ return err;
+
err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
fib6_info_release(rt);
@@ -3195,9 +3970,12 @@ out:
return err;
}
-int ip6_del_rt(struct net *net, struct fib6_info *rt)
+int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify)
{
- struct nl_info info = { .nl_net = net };
+ struct nl_info info = {
+ .nl_net = net,
+ .skip_notify = skip_notify
+ };
return __ip6_del_rt(rt, &info);
}
@@ -3217,6 +3995,7 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
struct fib6_info *sibling, *next_sibling;
+ struct fib6_node *fn;
/* prefer to send a single notification with all hops */
skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
@@ -3232,6 +4011,32 @@ static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
info->skip_notify = 1;
}
+ /* 'rt' points to the first sibling route. If it is not the
+ * leaf, then we do not need to send a notification. Otherwise,
+ * we need to check if the last sibling has a next route or not
+ * and emit a replace or delete notification, respectively.
+ */
+ info->skip_notify_kernel = 1;
+ fn = rcu_dereference_protected(rt->fib6_node,
+ lockdep_is_held(&table->tb6_lock));
+ if (rcu_access_pointer(fn->leaf) == rt) {
+ struct fib6_info *last_sibling, *replace_rt;
+
+ last_sibling = list_last_entry(&rt->fib6_siblings,
+ struct fib6_info,
+ fib6_siblings);
+ replace_rt = rcu_dereference_protected(
+ last_sibling->fib6_next,
+ lockdep_is_held(&table->tb6_lock));
+ if (replace_rt)
+ call_fib6_entry_notifiers_replace(net,
+ replace_rt);
+ else
+ call_fib6_multipath_entry_notifiers(net,
+ FIB_EVENT_ENTRY_DEL,
+ rt, rt->fib6_nsiblings,
+ NULL);
+ }
list_for_each_entry_safe(sibling, next_sibling,
&rt->fib6_siblings,
fib6_siblings) {
@@ -3254,7 +4059,7 @@ out_put:
return err;
}
-static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
+static int __ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
int rc = -ESRCH;
@@ -3264,16 +4069,55 @@ static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
if (cfg->fc_flags & RTF_GATEWAY &&
!ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
goto out;
- if (dst_hold_safe(&rt->dst))
- rc = rt6_remove_exception_rt(rt);
+
+ rc = rt6_remove_exception_rt(rt);
out:
return rc;
}
+static int ip6_del_cached_rt(struct fib6_config *cfg, struct fib6_info *rt,
+ struct fib6_nh *nh)
+{
+ struct fib6_result res = {
+ .f6i = rt,
+ .nh = nh,
+ };
+ struct rt6_info *rt_cache;
+
+ rt_cache = rt6_find_cached_rt(&res, &cfg->fc_dst, &cfg->fc_src);
+ if (rt_cache)
+ return __ip6_del_cached_rt(rt_cache, cfg);
+
+ return 0;
+}
+
+struct fib6_nh_del_cached_rt_arg {
+ struct fib6_config *cfg;
+ struct fib6_info *f6i;
+};
+
+static int fib6_nh_del_cached_rt(struct fib6_nh *nh, void *_arg)
+{
+ struct fib6_nh_del_cached_rt_arg *arg = _arg;
+ int rc;
+
+ rc = ip6_del_cached_rt(arg->cfg, arg->f6i, nh);
+ return rc != -ESRCH ? rc : 0;
+}
+
+static int ip6_del_cached_rt_nh(struct fib6_config *cfg, struct fib6_info *f6i)
+{
+ struct fib6_nh_del_cached_rt_arg arg = {
+ .cfg = cfg,
+ .f6i = f6i
+ };
+
+ return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_del_cached_rt, &arg);
+}
+
static int ip6_route_del(struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
- struct rt6_info *rt_cache;
struct fib6_table *table;
struct fib6_info *rt;
struct fib6_node *fn;
@@ -3294,40 +4138,63 @@ static int ip6_route_del(struct fib6_config *cfg,
if (fn) {
for_each_fib6_node_rt_rcu(fn) {
+ struct fib6_nh *nh;
+
+ if (rt->nh && cfg->fc_nh_id &&
+ rt->nh->id != cfg->fc_nh_id)
+ continue;
+
if (cfg->fc_flags & RTF_CACHE) {
- int rc;
-
- rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
- &cfg->fc_src);
- if (rt_cache) {
- rc = ip6_del_cached_rt(rt_cache, cfg);
- if (rc != -ESRCH) {
- rcu_read_unlock();
- return rc;
- }
+ int rc = 0;
+
+ if (rt->nh) {
+ rc = ip6_del_cached_rt_nh(cfg, rt);
+ } else if (cfg->fc_nh_id) {
+ continue;
+ } else {
+ nh = rt->fib6_nh;
+ rc = ip6_del_cached_rt(cfg, rt, nh);
+ }
+ if (rc != -ESRCH) {
+ rcu_read_unlock();
+ return rc;
}
continue;
}
- if (cfg->fc_ifindex &&
- (!rt->fib6_nh.nh_dev ||
- rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
+
+ if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
continue;
- if (cfg->fc_flags & RTF_GATEWAY &&
- !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
+ if (cfg->fc_protocol &&
+ cfg->fc_protocol != rt->fib6_protocol)
continue;
- if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
+
+ if (rt->nh) {
+ if (!fib6_info_hold_safe(rt))
+ continue;
+
+ err = __ip6_del_rt(rt, &cfg->fc_nlinfo);
+ break;
+ }
+ if (cfg->fc_nh_id)
continue;
- if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
+
+ nh = rt->fib6_nh;
+ if (cfg->fc_ifindex &&
+ (!nh->fib_nh_dev ||
+ nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
+ continue;
+ if (cfg->fc_flags & RTF_GATEWAY &&
+ !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
continue;
if (!fib6_info_hold_safe(rt))
continue;
- rcu_read_unlock();
/* if gateway was specified only delete the one hop */
if (cfg->fc_flags & RTF_GATEWAY)
- return __ip6_del_rt(rt, &cfg->fc_nlinfo);
-
- return __ip6_del_rt_siblings(rt, cfg);
+ err = __ip6_del_rt(rt, &cfg->fc_nlinfo);
+ else
+ err = __ip6_del_rt_siblings(rt, cfg);
+ break;
}
}
rcu_read_unlock();
@@ -3339,10 +4206,10 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
{
struct netevent_redirect netevent;
struct rt6_info *rt, *nrt = NULL;
+ struct fib6_result res = {};
struct ndisc_options ndopts;
struct inet6_dev *in6_dev;
struct neighbour *neigh;
- struct fib6_info *from;
struct rd_msg *msg;
int optlen, on_link;
u8 *lladdr;
@@ -3374,7 +4241,8 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
in6_dev = __in6_dev_get(skb->dev);
if (!in6_dev)
return;
- if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
+ if (READ_ONCE(in6_dev->cnf.forwarding) ||
+ !READ_ONCE(in6_dev->cnf.accept_redirects))
return;
/* RFC2461 8.1:
@@ -3397,7 +4265,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
}
}
- rt = (struct rt6_info *) dst;
+ rt = dst_rt6_info(dst);
if (rt->rt6i_flags & RTF_REJECT) {
net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
return;
@@ -3425,14 +4293,32 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
NDISC_REDIRECT, &ndopts);
rcu_read_lock();
- from = rcu_dereference(rt->from);
- /* This fib6_info_hold() is safe here because we hold reference to rt
- * and rt already holds reference to fib6_info.
- */
- fib6_info_hold(from);
- rcu_read_unlock();
+ res.f6i = rcu_dereference(rt->from);
+ if (!res.f6i)
+ goto out;
+
+ if (res.f6i->nh) {
+ struct fib6_nh_match_arg arg = {
+ .dev = dst_dev_rcu(dst),
+ .gw = &rt->rt6i_gateway,
+ };
+
+ nexthop_for_each_fib6_nh(res.f6i->nh,
+ fib6_nh_find_match, &arg);
- nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
+ /* fib6_info uses a nexthop that does not have fib6_nh
+ * using the dst->dev. Should be impossible
+ */
+ if (!arg.match)
+ goto out;
+ res.nh = arg.match;
+ } else {
+ res.nh = res.f6i->fib6_nh;
+ }
+
+ res.fib6_flags = res.f6i->fib6_flags;
+ res.fib6_type = res.f6i->fib6_type;
+ nrt = ip6_rt_cache_alloc(&res, &msg->dest, NULL);
if (!nrt)
goto out;
@@ -3442,11 +4328,8 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
- /* No need to remove rt from the exception table if rt is
- * a cached route because rt6_insert_exception() will
- * takes care of it
- */
- if (rt6_insert_exception(nrt, from)) {
+ /* rt6_insert_exception() will take care of duplicated exceptions */
+ if (rt6_insert_exception(nrt, &res)) {
dst_release_immediate(&nrt->dst);
goto out;
}
@@ -3458,7 +4341,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
out:
- fib6_info_release(from);
+ rcu_read_unlock();
neigh_release(neigh);
}
@@ -3484,11 +4367,15 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
goto out;
for_each_fib6_node_rt_rcu(fn) {
- if (rt->fib6_nh.nh_dev->ifindex != ifindex)
+ /* these routes do not use nexthops */
+ if (rt->nh)
+ continue;
+ if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
continue;
- if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
+ if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
+ !rt->fib6_nh->fib_nh_gw_family)
continue;
- if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
+ if (!ipv6_addr_equal(&rt->fib6_nh->fib_nh_gw6, gwaddr))
continue;
if (!fib6_info_hold_safe(rt))
continue;
@@ -3518,7 +4405,7 @@ static struct fib6_info *rt6_add_route_info(struct net *net,
.fc_nlinfo.nl_net = net,
};
- cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
+ cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
cfg.fc_dst = *prefix;
cfg.fc_gateway = *gwaddr;
@@ -3546,9 +4433,16 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
rcu_read_lock();
for_each_fib6_node_rt_rcu(&table->tb6_root) {
- if (dev == rt->fib6_nh.nh_dev &&
+ struct fib6_nh *nh;
+
+ /* RA routes do not use nexthops */
+ if (rt->nh)
+ continue;
+
+ nh = rt->fib6_nh;
+ if (dev == nh->fib_nh_dev &&
((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
- ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
+ ipv6_addr_equal(&nh->fib_nh_gw6, addr))
break;
}
if (rt && !fib6_info_hold_safe(rt))
@@ -3560,11 +4454,13 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
struct fib6_info *rt6_add_dflt_router(struct net *net,
const struct in6_addr *gwaddr,
struct net_device *dev,
- unsigned int pref)
+ unsigned int pref,
+ u32 defrtr_usr_metric,
+ int lifetime)
{
struct fib6_config cfg = {
.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
- .fc_metric = IP6_RT_PRIO_USER,
+ .fc_metric = defrtr_usr_metric,
.fc_ifindex = dev->ifindex,
.fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
@@ -3573,6 +4469,7 @@ struct fib6_info *rt6_add_dflt_router(struct net *net,
.fc_nlinfo.portid = 0,
.fc_nlinfo.nlh = NULL,
.fc_nlinfo.nl_net = net,
+ .fc_expires = jiffies_to_clock_t(lifetime * HZ),
};
cfg.fc_gateway = *gwaddr;
@@ -3603,7 +4500,7 @@ restart:
(!idev || idev->cnf.accept_ra != 2) &&
fib6_info_hold_safe(rt)) {
rcu_read_unlock();
- ip6_del_rt(net, rt);
+ ip6_del_rt(net, rt, false);
goto restart;
}
}
@@ -3635,60 +4532,50 @@ static void rtmsg_to_fib6_config(struct net *net,
struct in6_rtmsg *rtmsg,
struct fib6_config *cfg)
{
- memset(cfg, 0, sizeof(*cfg));
-
- cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
- : RT6_TABLE_MAIN;
- cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
- cfg->fc_metric = rtmsg->rtmsg_metric;
- cfg->fc_expires = rtmsg->rtmsg_info;
- cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
- cfg->fc_src_len = rtmsg->rtmsg_src_len;
- cfg->fc_flags = rtmsg->rtmsg_flags;
- cfg->fc_type = rtmsg->rtmsg_type;
+ *cfg = (struct fib6_config){
+ .fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
+ : RT6_TABLE_MAIN,
+ .fc_ifindex = rtmsg->rtmsg_ifindex,
+ .fc_metric = rtmsg->rtmsg_metric,
+ .fc_expires = rtmsg->rtmsg_info,
+ .fc_dst_len = rtmsg->rtmsg_dst_len,
+ .fc_src_len = rtmsg->rtmsg_src_len,
+ .fc_flags = rtmsg->rtmsg_flags,
+ .fc_type = rtmsg->rtmsg_type,
- cfg->fc_nlinfo.nl_net = net;
+ .fc_nlinfo.nl_net = net,
- cfg->fc_dst = rtmsg->rtmsg_dst;
- cfg->fc_src = rtmsg->rtmsg_src;
- cfg->fc_gateway = rtmsg->rtmsg_gateway;
+ .fc_dst = rtmsg->rtmsg_dst,
+ .fc_src = rtmsg->rtmsg_src,
+ .fc_gateway = rtmsg->rtmsg_gateway,
+ };
}
-int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
+int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
{
struct fib6_config cfg;
- struct in6_rtmsg rtmsg;
int err;
- switch (cmd) {
- case SIOCADDRT: /* Add a route */
- case SIOCDELRT: /* Delete a route */
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- return -EPERM;
- err = copy_from_user(&rtmsg, arg,
- sizeof(struct in6_rtmsg));
- if (err)
- return -EFAULT;
-
- rtmsg_to_fib6_config(net, &rtmsg, &cfg);
+ if (cmd != SIOCADDRT && cmd != SIOCDELRT)
+ return -EINVAL;
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
- rtnl_lock();
- switch (cmd) {
- case SIOCADDRT:
- err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
- break;
- case SIOCDELRT:
- err = ip6_route_del(&cfg, NULL);
- break;
- default:
- err = -EINVAL;
- }
- rtnl_unlock();
+ rtmsg_to_fib6_config(net, rtmsg, &cfg);
- return err;
+ switch (cmd) {
+ case SIOCADDRT:
+ /* Only do the default setting of fc_metric in route adding */
+ if (cfg.fc_metric == 0)
+ cfg.fc_metric = IP6_RT_PRIO_USER;
+ err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
+ break;
+ case SIOCDELRT:
+ err = ip6_route_del(&cfg, NULL);
+ break;
}
- return -EINVAL;
+ return err;
}
/*
@@ -3697,25 +4584,41 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
- int type;
struct dst_entry *dst = skb_dst(skb);
+ struct net_device *dev = dst_dev(dst);
+ struct net *net = dev_net(dev);
+ struct inet6_dev *idev;
+ SKB_DR(reason);
+ int type;
+
+ if (netif_is_l3_master(skb->dev) ||
+ dev == net->loopback_dev)
+ idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
+ else
+ idev = ip6_dst_idev(dst);
+
switch (ipstats_mib_noroutes) {
case IPSTATS_MIB_INNOROUTES:
type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
if (type == IPV6_ADDR_ANY) {
- IP6_INC_STATS(dev_net(dst->dev),
- __in6_dev_get_safely(skb->dev),
- IPSTATS_MIB_INADDRERRORS);
+ SKB_DR_SET(reason, IP_INADDRERRORS);
+ IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
break;
}
- /* FALLTHROUGH */
+ SKB_DR_SET(reason, IP_INNOROUTES);
+ fallthrough;
case IPSTATS_MIB_OUTNOROUTES:
- IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
- ipstats_mib_noroutes);
+ SKB_DR_OR(reason, IP_OUTNOROUTES);
+ IP6_INC_STATS(net, idev, ipstats_mib_noroutes);
break;
}
+
+ /* Start over by dropping the dst for l3mdev case */
+ if (netif_is_l3_master(skb->dev))
+ skb_dst_drop(skb);
+
icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return 0;
}
@@ -3726,7 +4629,7 @@ static int ip6_pkt_discard(struct sk_buff *skb)
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- skb->dev = skb_dst(skb)->dev;
+ skb->dev = skb_dst_dev(skb);
return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}
@@ -3737,7 +4640,7 @@ static int ip6_pkt_prohibit(struct sk_buff *skb)
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- skb->dev = skb_dst(skb)->dev;
+ skb->dev = skb_dst_dev(skb);
return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}
@@ -3748,60 +4651,66 @@ static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff
struct fib6_info *addrconf_f6i_alloc(struct net *net,
struct inet6_dev *idev,
const struct in6_addr *addr,
- bool anycast, gfp_t gfp_flags)
+ bool anycast, gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
{
- u32 tb_id;
- struct net_device *dev = idev->dev;
+ struct fib6_config cfg = {
+ .fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
+ .fc_ifindex = idev->dev->ifindex,
+ .fc_flags = RTF_UP | RTF_NONEXTHOP,
+ .fc_dst = *addr,
+ .fc_dst_len = 128,
+ .fc_protocol = RTPROT_KERNEL,
+ .fc_nlinfo.nl_net = net,
+ .fc_ignore_dev_down = true,
+ };
struct fib6_info *f6i;
+ int err;
- f6i = fib6_info_alloc(gfp_flags);
- if (!f6i)
- return ERR_PTR(-ENOMEM);
-
- f6i->dst_nocount = true;
- f6i->dst_host = true;
- f6i->fib6_protocol = RTPROT_KERNEL;
- f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
if (anycast) {
- f6i->fib6_type = RTN_ANYCAST;
- f6i->fib6_flags |= RTF_ANYCAST;
+ cfg.fc_type = RTN_ANYCAST;
+ cfg.fc_flags |= RTF_ANYCAST;
} else {
- f6i->fib6_type = RTN_LOCAL;
- f6i->fib6_flags |= RTF_LOCAL;
+ cfg.fc_type = RTN_LOCAL;
+ cfg.fc_flags |= RTF_LOCAL;
}
- f6i->fib6_nh.nh_gw = *addr;
- dev_hold(dev);
- f6i->fib6_nh.nh_dev = dev;
- f6i->fib6_dst.addr = *addr;
- f6i->fib6_dst.plen = 128;
- tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
- f6i->fib6_table = fib6_get_table(net, tb_id);
+ f6i = ip6_route_info_create(&cfg, gfp_flags, extack);
+ if (IS_ERR(f6i))
+ return f6i;
+
+ err = ip6_route_info_create_nh(f6i, &cfg, gfp_flags, extack);
+ if (err)
+ return ERR_PTR(err);
+
+ f6i->dst_nocount = true;
+
+ if (!anycast &&
+ (READ_ONCE(net->ipv6.devconf_all->disable_policy) ||
+ READ_ONCE(idev->cnf.disable_policy)))
+ f6i->dst_nopolicy = true;
return f6i;
}
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
- struct net_device *dev;
struct net *net;
struct in6_addr *addr;
};
static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
- struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
struct net *net = ((struct arg_dev_net_ip *)arg)->net;
struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
- if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
+ if (!rt->nh &&
rt != net->ipv6.fib6_null_entry &&
- ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
+ ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr) &&
+ !ipv6_chk_addr(net, addr, rt->fib6_nh->fib_nh_dev, 0)) {
spin_lock_bh(&rt6_exception_lock);
/* remove prefsrc entry */
rt->fib6_prefsrc.plen = 0;
- /* need to update cache as well */
- rt6_exceptions_remove_prefsrc(rt);
spin_unlock_bh(&rt6_exception_lock);
}
return 0;
@@ -3811,30 +4720,34 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
struct net *net = dev_net(ifp->idev->dev);
struct arg_dev_net_ip adni = {
- .dev = ifp->idev->dev,
.net = net,
.addr = &ifp->addr,
};
fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}
-#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
+#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT)
/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
struct in6_addr *gateway = (struct in6_addr *)arg;
+ struct fib6_nh *nh;
+ /* RA routes do not use nexthops */
+ if (rt->nh)
+ return 0;
+
+ nh = rt->fib6_nh;
if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
- ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
+ nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
return -1;
- }
/* Further clean up cached routes in exception table.
* This is needed because cached route may have a different
* gateway than its 'parent' in the case of an ip redirect.
*/
- rt6_exceptions_clean_tohost(rt, gateway);
+ fib6_nh_exceptions_clean_tohost(nh, gateway);
return 0;
}
@@ -3847,7 +4760,7 @@ void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
struct arg_netdev_event {
const struct net_device *dev;
union {
- unsigned int nh_flags;
+ unsigned char nh_flags;
unsigned long event;
};
};
@@ -3872,11 +4785,12 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
return NULL;
}
+/* only called for fib entries with builtin fib6_nh */
static bool rt6_is_dead(const struct fib6_info *rt)
{
- if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
- (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
- fib6_ignore_linkdown(rt)))
+ if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
+ (rt->fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN &&
+ ip6_ignore_linkdown(rt->fib6_nh->fib_nh_dev)))
return true;
return false;
@@ -3888,11 +4802,11 @@ static int rt6_multipath_total_weight(const struct fib6_info *rt)
int total = 0;
if (!rt6_is_dead(rt))
- total += rt->fib6_nh.nh_weight;
+ total += rt->fib6_nh->fib_nh_weight;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
if (!rt6_is_dead(iter))
- total += iter->fib6_nh.nh_weight;
+ total += iter->fib6_nh->fib_nh_weight;
}
return total;
@@ -3903,11 +4817,11 @@ static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
int upper_bound = -1;
if (!rt6_is_dead(rt)) {
- *weight += rt->fib6_nh.nh_weight;
+ *weight += rt->fib6_nh->fib_nh_weight;
upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
total) - 1;
}
- atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
+ atomic_set(&rt->fib6_nh->fib_nh_upper_bound, upper_bound);
}
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
@@ -3950,8 +4864,9 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg)
const struct arg_netdev_event *arg = p_arg;
struct net *net = dev_net(arg->dev);
- if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
- rt->fib6_nh.nh_flags &= ~arg->nh_flags;
+ if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
+ rt->fib6_nh->fib_nh_dev == arg->dev) {
+ rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
fib6_update_sernum_upto_root(net, rt);
rt6_multipath_rebalance(rt);
}
@@ -3959,7 +4874,7 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg)
return 0;
}
-void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
+void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
{
struct arg_netdev_event arg = {
.dev = dev,
@@ -3974,15 +4889,16 @@ void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}
+/* only called for fib entries with inline fib6_nh */
static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
const struct net_device *dev)
{
struct fib6_info *iter;
- if (rt->fib6_nh.nh_dev == dev)
+ if (rt->fib6_nh->fib_nh_dev == dev)
return true;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
- if (iter->fib6_nh.nh_dev == dev)
+ if (iter->fib6_nh->fib_nh_dev == dev)
return true;
return false;
@@ -4003,12 +4919,12 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
struct fib6_info *iter;
unsigned int dead = 0;
- if (rt->fib6_nh.nh_dev == down_dev ||
- rt->fib6_nh.nh_flags & RTNH_F_DEAD)
+ if (rt->fib6_nh->fib_nh_dev == down_dev ||
+ rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
dead++;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
- if (iter->fib6_nh.nh_dev == down_dev ||
- iter->fib6_nh.nh_flags & RTNH_F_DEAD)
+ if (iter->fib6_nh->fib_nh_dev == down_dev ||
+ iter->fib6_nh->fib_nh_flags & RTNH_F_DEAD)
dead++;
return dead;
@@ -4016,15 +4932,15 @@ static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
const struct net_device *dev,
- unsigned int nh_flags)
+ unsigned char nh_flags)
{
struct fib6_info *iter;
- if (rt->fib6_nh.nh_dev == dev)
- rt->fib6_nh.nh_flags |= nh_flags;
+ if (rt->fib6_nh->fib_nh_dev == dev)
+ rt->fib6_nh->fib_nh_flags |= nh_flags;
list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
- if (iter->fib6_nh.nh_dev == dev)
- iter->fib6_nh.nh_flags |= nh_flags;
+ if (iter->fib6_nh->fib_nh_dev == dev)
+ iter->fib6_nh->fib_nh_flags |= nh_flags;
}
/* called with write lock held for table with rt */
@@ -4034,17 +4950,17 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
const struct net_device *dev = arg->dev;
struct net *net = dev_net(dev);
- if (rt == net->ipv6.fib6_null_entry)
+ if (rt == net->ipv6.fib6_null_entry || rt->nh)
return 0;
switch (arg->event) {
case NETDEV_UNREGISTER:
- return rt->fib6_nh.nh_dev == dev ? -1 : 0;
+ return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
case NETDEV_DOWN:
if (rt->should_flush)
return -1;
if (!rt->fib6_nsiblings)
- return rt->fib6_nh.nh_dev == dev ? -1 : 0;
+ return rt->fib6_nh->fib_nh_dev == dev ? -1 : 0;
if (rt6_multipath_uses_dev(rt, dev)) {
unsigned int count;
@@ -4060,10 +4976,10 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
}
return -2;
case NETDEV_CHANGE:
- if (rt->fib6_nh.nh_dev != dev ||
+ if (rt->fib6_nh->fib_nh_dev != dev ||
rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
break;
- rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
+ rt->fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;
rt6_multipath_rebalance(rt);
break;
}
@@ -4079,23 +4995,54 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
.event = event,
},
};
+ struct net *net = dev_net(dev);
- fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
+ if (net->ipv6.sysctl.skip_notify_on_dev_down)
+ fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
+ else
+ fib6_clean_all(net, fib6_ifdown, &arg);
}
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
rt6_sync_down_dev(dev, event);
- rt6_uncached_list_flush_dev(dev_net(dev), dev);
+ rt6_uncached_list_flush_dev(dev);
neigh_ifdown(&nd_tbl, dev);
}
struct rt6_mtu_change_arg {
struct net_device *dev;
unsigned int mtu;
+ struct fib6_info *f6i;
};
-static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
+static int fib6_nh_mtu_change(struct fib6_nh *nh, void *_arg)
+{
+ struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *)_arg;
+ struct fib6_info *f6i = arg->f6i;
+
+ /* For administrative MTU increase, there is no way to discover
+ * IPv6 PMTU increase, so PMTU increase should be updated here.
+ * Since RFC 1981 doesn't include administrative MTU increase
+ * update PMTU increase is a MUST. (i.e. jumbo frame)
+ */
+ if (nh->fib_nh_dev == arg->dev) {
+ struct inet6_dev *idev = __in6_dev_get(arg->dev);
+ u32 mtu = f6i->fib6_pmtu;
+
+ if (mtu >= arg->mtu ||
+ (mtu < arg->mtu && mtu == idev->cnf.mtu6))
+ fib6_metric_set(f6i, RTAX_MTU, arg->mtu);
+
+ spin_lock_bh(&rt6_exception_lock);
+ rt6_exceptions_update_pmtu(idev, nh, arg->mtu);
+ spin_unlock_bh(&rt6_exception_lock);
+ }
+
+ return 0;
+}
+
+static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg)
{
struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
struct inet6_dev *idev;
@@ -4110,24 +5057,17 @@ static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
if (!idev)
return 0;
- /* For administrative MTU increase, there is no way to discover
- IPv6 PMTU increase, so PMTU increase should be updated here.
- Since RFC 1981 doesn't include administrative MTU increase
- update PMTU increase is a MUST. (i.e. jumbo frame)
- */
- if (rt->fib6_nh.nh_dev == arg->dev &&
- !fib6_metric_locked(rt, RTAX_MTU)) {
- u32 mtu = rt->fib6_pmtu;
-
- if (mtu >= arg->mtu ||
- (mtu < arg->mtu && mtu == idev->cnf.mtu6))
- fib6_metric_set(rt, RTAX_MTU, arg->mtu);
+ if (fib6_metric_locked(f6i, RTAX_MTU))
+ return 0;
- spin_lock_bh(&rt6_exception_lock);
- rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
- spin_unlock_bh(&rt6_exception_lock);
+ arg->f6i = f6i;
+ if (f6i->nh) {
+ /* fib6_nh_mtu_change only returns 0, so this is safe */
+ return nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_mtu_change,
+ arg);
}
- return 0;
+
+ return fib6_nh_mtu_change(f6i->fib6_nh, arg);
}
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
@@ -4141,6 +5081,7 @@ void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
}
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
+ [RTA_UNSPEC] = { .strict_start_type = RTA_DPORT + 1 },
[RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
[RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
[RTA_OIF] = { .type = NLA_U32 },
@@ -4158,32 +5099,99 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_IP_PROTO] = { .type = NLA_U8 },
[RTA_SPORT] = { .type = NLA_U16 },
[RTA_DPORT] = { .type = NLA_U16 },
+ [RTA_NH_ID] = { .type = NLA_U32 },
+ [RTA_FLOWLABEL] = { .type = NLA_BE32 },
};
+static int rtm_to_fib6_multipath_config(struct fib6_config *cfg,
+ struct netlink_ext_ack *extack,
+ bool newroute)
+{
+ struct rtnexthop *rtnh;
+ int remaining;
+
+ remaining = cfg->fc_mp_len;
+ rtnh = (struct rtnexthop *)cfg->fc_mp;
+
+ if (!rtnh_ok(rtnh, remaining)) {
+ NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - no valid nexthops");
+ return -EINVAL;
+ }
+
+ do {
+ bool has_gateway = cfg->fc_flags & RTF_GATEWAY;
+ int attrlen = rtnh_attrlen(rtnh);
+
+ if (attrlen > 0) {
+ struct nlattr *nla, *attrs;
+
+ attrs = rtnh_attrs(rtnh);
+ nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+ if (nla) {
+ if (nla_len(nla) < sizeof(cfg->fc_gateway)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid IPv6 address in RTA_GATEWAY");
+ return -EINVAL;
+ }
+
+ has_gateway = true;
+ }
+ }
+
+ if (newroute && (cfg->fc_nh_id || !has_gateway)) {
+ NL_SET_ERR_MSG(extack,
+ "Device only routes can not be added for IPv6 using the multipath API.");
+ return -EINVAL;
+ }
+
+ rtnh = rtnh_next(rtnh, &remaining);
+ } while (rtnh_ok(rtnh, remaining));
+
+ return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, extack);
+}
+
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
- struct rtmsg *rtm;
+ bool newroute = nlh->nlmsg_type == RTM_NEWROUTE;
struct nlattr *tb[RTA_MAX+1];
+ struct rtmsg *rtm;
unsigned int pref;
int err;
- err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
- NULL);
+ err = nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv6_policy, extack);
if (err < 0)
goto errout;
err = -EINVAL;
rtm = nlmsg_data(nlh);
- memset(cfg, 0, sizeof(*cfg));
- cfg->fc_table = rtm->rtm_table;
- cfg->fc_dst_len = rtm->rtm_dst_len;
- cfg->fc_src_len = rtm->rtm_src_len;
- cfg->fc_flags = RTF_UP;
- cfg->fc_protocol = rtm->rtm_protocol;
- cfg->fc_type = rtm->rtm_type;
+ if (rtm->rtm_tos) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid dsfield (tos): option not available for IPv6");
+ goto errout;
+ }
+
+ if (tb[RTA_FLOWLABEL]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL],
+ "Flow label cannot be specified for this operation");
+ goto errout;
+ }
+
+ *cfg = (struct fib6_config){
+ .fc_table = rtm->rtm_table,
+ .fc_dst_len = rtm->rtm_dst_len,
+ .fc_src_len = rtm->rtm_src_len,
+ .fc_flags = RTF_UP,
+ .fc_protocol = rtm->rtm_protocol,
+ .fc_type = rtm->rtm_type,
+
+ .fc_nlinfo.portid = NETLINK_CB(skb).portid,
+ .fc_nlinfo.nlh = nlh,
+ .fc_nlinfo.nl_net = sock_net(skb->sk),
+ };
if (rtm->rtm_type == RTN_UNREACHABLE ||
rtm->rtm_type == RTN_BLACKHOLE ||
@@ -4199,14 +5207,24 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
- cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
- cfg->fc_nlinfo.nlh = nlh;
- cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
+ if (tb[RTA_NH_ID]) {
+ if (tb[RTA_GATEWAY] || tb[RTA_OIF] ||
+ tb[RTA_MULTIPATH] || tb[RTA_ENCAP]) {
+ NL_SET_ERR_MSG(extack,
+ "Nexthop specification and nexthop id are mutually exclusive");
+ goto errout;
+ }
+ cfg->fc_nh_id = nla_get_u32(tb[RTA_NH_ID]);
+ }
if (tb[RTA_GATEWAY]) {
cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
cfg->fc_flags |= RTF_GATEWAY;
}
+ if (tb[RTA_VIA]) {
+ NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
+ goto errout;
+ }
if (tb[RTA_DST]) {
int plen = (rtm->rtm_dst_len + 7) >> 3;
@@ -4247,8 +5265,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
- err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
- cfg->fc_mp_len, extack);
+ err = rtm_to_fib6_multipath_config(cfg, extack, newroute);
if (err < 0)
goto errout;
}
@@ -4289,40 +5306,28 @@ errout:
struct rt6_nh {
struct fib6_info *fib6_info;
struct fib6_config r_cfg;
- struct list_head next;
+ struct list_head list;
};
-static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
-{
- struct rt6_nh *nh;
-
- list_for_each_entry(nh, rt6_nh_list, next) {
- pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
- &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
- nh->r_cfg.fc_ifindex);
- }
-}
-
-static int ip6_route_info_append(struct net *net,
- struct list_head *rt6_nh_list,
+static int ip6_route_info_append(struct list_head *rt6_nh_list,
struct fib6_info *rt,
struct fib6_config *r_cfg)
{
struct rt6_nh *nh;
- int err = -EEXIST;
- list_for_each_entry(nh, rt6_nh_list, next) {
+ list_for_each_entry(nh, rt6_nh_list, list) {
/* check if fib6_info already exists */
if (rt6_duplicate_nexthop(nh->fib6_info, rt))
- return err;
+ return -EEXIST;
}
nh = kzalloc(sizeof(*nh), GFP_KERNEL);
if (!nh)
return -ENOMEM;
+
nh->fib6_info = rt;
memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
- list_add_tail(&nh->next, rt6_nh_list);
+ list_add_tail(&nh->list, rt6_nh_list);
return 0;
}
@@ -4338,14 +5343,45 @@ static void ip6_route_mpath_notify(struct fib6_info *rt,
* nexthop. Since sibling routes are always added at the end of
* the list, find the first sibling of the last route appended
*/
- if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
- rt = list_first_entry(&rt_last->fib6_siblings,
- struct fib6_info,
- fib6_siblings);
+ rcu_read_lock();
+
+ if ((nlflags & NLM_F_APPEND) && rt_last &&
+ READ_ONCE(rt_last->fib6_nsiblings)) {
+ rt = list_first_or_null_rcu(&rt_last->fib6_siblings,
+ struct fib6_info,
+ fib6_siblings);
}
if (rt)
inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
+
+ rcu_read_unlock();
+}
+
+static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
+{
+ bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
+ bool should_notify = false;
+ struct fib6_info *leaf;
+ struct fib6_node *fn;
+
+ rcu_read_lock();
+ fn = rcu_dereference(rt->fib6_node);
+ if (!fn)
+ goto out;
+
+ leaf = rcu_dereference(fn->leaf);
+ if (!leaf)
+ goto out;
+
+ if (rt == leaf ||
+ (rt_can_ecmp && rt->fib6_metric == leaf->fib6_metric &&
+ rt6_qualify_for_ecmp(leaf)))
+ should_notify = true;
+out:
+ rcu_read_unlock();
+
+ return should_notify;
}
static int ip6_route_multipath_add(struct fib6_config *cfg,
@@ -4353,19 +5389,25 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
{
struct fib6_info *rt_notif = NULL, *rt_last = NULL;
struct nl_info *info = &cfg->fc_nlinfo;
+ struct rt6_nh *nh, *nh_safe;
struct fib6_config r_cfg;
struct rtnexthop *rtnh;
- struct fib6_info *rt;
+ LIST_HEAD(rt6_nh_list);
struct rt6_nh *err_nh;
- struct rt6_nh *nh, *nh_safe;
+ struct fib6_info *rt;
__u16 nlflags;
int remaining;
int attrlen;
- int err = 1;
+ int replace;
int nhn = 0;
- int replace = (cfg->fc_nlinfo.nlh &&
- (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
- LIST_HEAD(rt6_nh_list);
+ int err;
+
+ err = fib6_config_validate(cfg, extack);
+ if (err)
+ return err;
+
+ replace = (cfg->fc_nlinfo.nlh &&
+ (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
@@ -4391,6 +5433,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
r_cfg.fc_gateway = nla_get_in6_addr(nla);
r_cfg.fc_flags |= RTF_GATEWAY;
}
+
r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
if (nla)
@@ -4404,18 +5447,16 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
rt = NULL;
goto cleanup;
}
- if (!rt6_qualify_for_ecmp(rt)) {
- err = -EINVAL;
- NL_SET_ERR_MSG(extack,
- "Device only routes can not be added for IPv6 using the multipath API.");
- fib6_info_release(rt);
+
+ err = ip6_route_info_create_nh(rt, &r_cfg, GFP_KERNEL, extack);
+ if (err) {
+ rt = NULL;
goto cleanup;
}
- rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
+ rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
- err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
- rt, &r_cfg);
+ err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
if (err) {
fib6_info_release(rt);
goto cleanup;
@@ -4430,28 +5471,28 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
*/
info->skip_notify = 1;
+ /* For add and replace, send one notification with all nexthops. For
+ * append, send one notification with all appended nexthops.
+ */
+ info->skip_notify_kernel = 1;
+
err_nh = NULL;
- list_for_each_entry(nh, &rt6_nh_list, next) {
+ list_for_each_entry(nh, &rt6_nh_list, list) {
err = __ip6_ins_rt(nh->fib6_info, info, extack);
- fib6_info_release(nh->fib6_info);
-
- if (!err) {
- /* save reference to last route successfully inserted */
- rt_last = nh->fib6_info;
-
- /* save reference to first route for notification */
- if (!rt_notif)
- rt_notif = nh->fib6_info;
- }
- /* nh->fib6_info is used or freed at this point, reset to NULL*/
- nh->fib6_info = NULL;
if (err) {
if (replace && nhn)
- ip6_print_replace_route_err(&rt6_nh_list);
+ NL_SET_ERR_MSG_MOD(extack,
+ "multipath route replace failed (check consistency of installed routes)");
err_nh = nh;
goto add_errout;
}
+ /* save reference to last route successfully inserted */
+ rt_last = nh->fib6_info;
+
+ /* save reference to first route for notification */
+ if (!rt_notif)
+ rt_notif = nh->fib6_info;
/* Because each route is added like a single route we remove
* these flags after the first nexthop: if there is a collision,
@@ -4460,11 +5501,37 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
* nexthops have been replaced by first new, the rest should
* be added to it.
*/
- cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
- NLM_F_REPLACE);
+ if (cfg->fc_nlinfo.nlh) {
+ cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
+ NLM_F_REPLACE);
+ cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
+ }
nhn++;
}
+ /* An in-kernel notification should only be sent in case the new
+ * multipath route is added as the first route in the node, or if
+ * it was appended to it. We pass 'rt_notif' since it is the first
+ * sibling and might allow us to skip some checks in the replace case.
+ */
+ if (ip6_route_mpath_should_notify(rt_notif)) {
+ enum fib_event_type fib_event;
+
+ if (rt_notif->fib6_nsiblings != nhn - 1)
+ fib_event = FIB_EVENT_ENTRY_APPEND;
+ else
+ fib_event = FIB_EVENT_ENTRY_REPLACE;
+
+ err = call_fib6_multipath_entry_notifiers(info->nl_net,
+ fib_event, rt_notif,
+ nhn - 1, extack);
+ if (err) {
+ /* Delete all the siblings that were just added */
+ err_nh = NULL;
+ goto add_errout;
+ }
+ }
+
/* success ... tell user about new route */
ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
goto cleanup;
@@ -4478,17 +5545,16 @@ add_errout:
ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
/* Delete routes that were already added */
- list_for_each_entry(nh, &rt6_nh_list, next) {
+ list_for_each_entry(nh, &rt6_nh_list, list) {
if (err_nh == nh)
break;
ip6_route_del(&nh->r_cfg, extack);
}
cleanup:
- list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
- if (nh->fib6_info)
- fib6_info_release(nh->fib6_info);
- list_del(&nh->next);
+ list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, list) {
+ fib6_info_release(nh->fib6_info);
+ list_del(&nh->list);
kfree(nh);
}
@@ -4500,9 +5566,10 @@ static int ip6_route_multipath_del(struct fib6_config *cfg,
{
struct fib6_config r_cfg;
struct rtnexthop *rtnh;
+ int last_err = 0;
int remaining;
int attrlen;
- int err = 1, last_err = 0;
+ int err;
remaining = cfg->fc_mp_len;
rtnh = (struct rtnexthop *)cfg->fc_mp;
@@ -4519,10 +5586,11 @@ static int ip6_route_multipath_del(struct fib6_config *cfg,
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
if (nla) {
- nla_memcpy(&r_cfg.fc_gateway, nla, 16);
+ r_cfg.fc_gateway = nla_get_in6_addr(nla);
r_cfg.fc_flags |= RTF_GATEWAY;
}
}
+
err = ip6_route_del(&r_cfg, extack);
if (err)
last_err = err;
@@ -4543,9 +5611,20 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
return err;
- if (cfg.fc_mp)
+ if (cfg.fc_nh_id) {
+ rcu_read_lock();
+ err = !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id);
+ rcu_read_unlock();
+
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ return -EINVAL;
+ }
+ }
+
+ if (cfg.fc_mp) {
return ip6_route_multipath_del(&cfg, extack);
- else {
+ } else {
cfg.fc_delete_all_nh = 1;
return ip6_route_del(&cfg, extack);
}
@@ -4561,25 +5640,64 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
return err;
+ if (cfg.fc_metric == 0)
+ cfg.fc_metric = IP6_RT_PRIO_USER;
+
if (cfg.fc_mp)
return ip6_route_multipath_add(&cfg, extack);
else
return ip6_route_add(&cfg, GFP_KERNEL, extack);
}
-static size_t rt6_nlmsg_size(struct fib6_info *rt)
+/* add the overhead of this fib6_nh to nexthop_len */
+static int rt6_nh_nlmsg_size(struct fib6_nh *nh, void *arg)
{
- int nexthop_len = 0;
+ int *nexthop_len = arg;
- if (rt->fib6_nsiblings) {
- nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
- + NLA_ALIGN(sizeof(struct rtnexthop))
- + nla_total_size(16) /* RTA_GATEWAY */
- + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
+ *nexthop_len += nla_total_size(0) /* RTA_MULTIPATH */
+ + NLA_ALIGN(sizeof(struct rtnexthop))
+ + nla_total_size(16); /* RTA_GATEWAY */
- nexthop_len *= rt->fib6_nsiblings;
+ if (nh->fib_nh_lws) {
+ /* RTA_ENCAP */
+ *nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
+ /* RTA_ENCAP_TYPE */
+ *nexthop_len += nla_total_size(2);
}
+ return 0;
+}
+
+static size_t rt6_nlmsg_size(struct fib6_info *f6i)
+{
+ struct fib6_info *sibling;
+ struct fib6_nh *nh;
+ int nexthop_len;
+
+ if (f6i->nh) {
+ nexthop_len = nla_total_size(4); /* RTA_NH_ID */
+ nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size,
+ &nexthop_len);
+ goto common;
+ }
+
+ rcu_read_lock();
+retry:
+ nh = f6i->fib6_nh;
+ nexthop_len = 0;
+ if (READ_ONCE(f6i->fib6_nsiblings)) {
+ rt6_nh_nlmsg_size(nh, &nexthop_len);
+
+ list_for_each_entry_rcu(sibling, &f6i->fib6_siblings,
+ fib6_siblings) {
+ rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len);
+ if (!READ_ONCE(f6i->fib6_nsiblings))
+ goto retry;
+ }
+ }
+ rcu_read_unlock();
+ nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws);
+common:
return NLMSG_ALIGN(sizeof(struct rtmsg))
+ nla_total_size(16) /* RTA_SRC */
+ nla_total_size(16) /* RTA_DST */
@@ -4593,70 +5711,31 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
+ nla_total_size(sizeof(struct rta_cacheinfo))
+ nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
+ nla_total_size(1) /* RTA_PREF */
- + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
+ nexthop_len;
}
-static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
- unsigned int *flags, bool skip_oif)
+static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
+ unsigned char *flags)
{
- if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
- *flags |= RTNH_F_DEAD;
-
- if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
- *flags |= RTNH_F_LINKDOWN;
-
- rcu_read_lock();
- if (fib6_ignore_linkdown(rt))
- *flags |= RTNH_F_DEAD;
- rcu_read_unlock();
- }
+ if (nexthop_is_multipath(nh)) {
+ struct nlattr *mp;
- if (rt->fib6_flags & RTF_GATEWAY) {
- if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
+ mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
+ if (!mp)
goto nla_put_failure;
- }
-
- *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
- if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
- *flags |= RTNH_F_OFFLOAD;
-
- /* not needed for multipath encoding b/c it has a rtnexthop struct */
- if (!skip_oif && rt->fib6_nh.nh_dev &&
- nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
- goto nla_put_failure;
-
- if (rt->fib6_nh.nh_lwtstate &&
- lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return -EMSGSIZE;
-}
-
-/* add multipath next hop */
-static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
-{
- const struct net_device *dev = rt->fib6_nh.nh_dev;
- struct rtnexthop *rtnh;
- unsigned int flags = 0;
-
- rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
- if (!rtnh)
- goto nla_put_failure;
-
- rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
- rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
-
- if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
- goto nla_put_failure;
+ if (nexthop_mpath_fill_node(skb, nh, AF_INET6))
+ goto nla_put_failure;
- rtnh->rtnh_flags = flags;
+ nla_nest_end(skb, mp);
+ } else {
+ struct fib6_nh *fib6_nh;
- /* length of rtnetlink header + attributes */
- rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
+ fib6_nh = nexthop_fib6_nh(nh);
+ if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6,
+ flags, false) < 0)
+ goto nla_put_failure;
+ }
return 0;
@@ -4670,9 +5749,10 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
int iif, int type, u32 portid, u32 seq,
unsigned int flags)
{
- struct rt6_info *rt6 = (struct rt6_info *)dst;
+ struct rt6_info *rt6 = dst_rt6_info(dst);
struct rt6key *rt6_dst, *rt6_src;
u32 *pmetrics, table, rt6_flags;
+ unsigned char nh_flags = 0;
struct nlmsghdr *nlh;
struct rtmsg *rtm;
long expires = 0;
@@ -4700,7 +5780,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
table = rt->fib6_table->tb6_id;
else
table = RT6_TABLE_UNSPEC;
- rtm->rtm_table = table;
+ rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
if (nla_put_u32(skb, RTA_TABLE, table))
goto nla_put_failure;
@@ -4743,7 +5823,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
goto nla_put_failure;
} else if (dest) {
struct in6_addr saddr_buf;
- if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
+ if (ip6_route_get_saddr(net, rt, dest, 0, 0, &saddr_buf) == 0 &&
nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
goto nla_put_failure;
}
@@ -4766,40 +5846,81 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
* each as a nexthop within RTA_MULTIPATH.
*/
if (rt6) {
+ struct net_device *dev;
+
if (rt6_flags & RTF_GATEWAY &&
nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
goto nla_put_failure;
- if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
+ dev = dst_dev(dst);
+ if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
goto nla_put_failure;
- } else if (rt->fib6_nsiblings) {
- struct fib6_info *sibling, *next_sibling;
+
+ if (lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
+ goto nla_put_failure;
+ } else if (READ_ONCE(rt->fib6_nsiblings)) {
+ struct fib6_info *sibling;
struct nlattr *mp;
- mp = nla_nest_start(skb, RTA_MULTIPATH);
+ mp = nla_nest_start_noflag(skb, RTA_MULTIPATH);
if (!mp)
goto nla_put_failure;
- if (rt6_add_nexthop(skb, rt) < 0)
+ if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
+ rt->fib6_nh->fib_nh_weight, AF_INET6,
+ 0) < 0)
goto nla_put_failure;
- list_for_each_entry_safe(sibling, next_sibling,
- &rt->fib6_siblings, fib6_siblings) {
- if (rt6_add_nexthop(skb, sibling) < 0)
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(sibling, &rt->fib6_siblings,
+ fib6_siblings) {
+ if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
+ sibling->fib6_nh->fib_nh_weight,
+ AF_INET6, 0) < 0) {
+ rcu_read_unlock();
+
goto nla_put_failure;
+ }
}
+ rcu_read_unlock();
+
nla_nest_end(skb, mp);
+ } else if (rt->nh) {
+ if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
+ goto nla_put_failure;
+
+ if (nexthop_is_blackhole(rt->nh))
+ rtm->rtm_type = RTN_BLACKHOLE;
+
+ if (READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode) &&
+ rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
+ goto nla_put_failure;
+
+ rtm->rtm_flags |= nh_flags;
} else {
- if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
+ if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6,
+ &nh_flags, false) < 0)
goto nla_put_failure;
+
+ rtm->rtm_flags |= nh_flags;
}
if (rt6_flags & RTF_EXPIRES) {
- expires = dst ? dst->expires : rt->expires;
+ expires = dst ? READ_ONCE(dst->expires) : rt->expires;
expires -= jiffies;
}
+ if (!dst) {
+ if (READ_ONCE(rt->offload))
+ rtm->rtm_flags |= RTM_F_OFFLOAD;
+ if (READ_ONCE(rt->trap))
+ rtm->rtm_flags |= RTM_F_TRAP;
+ if (READ_ONCE(rt->offload_failed))
+ rtm->rtm_flags |= RTM_F_OFFLOAD_FAILED;
+ }
+
if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
goto nla_put_failure;
@@ -4815,28 +5936,248 @@ nla_put_failure:
return -EMSGSIZE;
}
-int rt6_dump_route(struct fib6_info *rt, void *p_arg)
+static int fib6_info_nh_uses_dev(struct fib6_nh *nh, void *arg)
+{
+ const struct net_device *dev = arg;
+
+ if (nh->fib_nh_dev == dev)
+ return 1;
+
+ return 0;
+}
+
+static bool fib6_info_uses_dev(const struct fib6_info *f6i,
+ const struct net_device *dev)
+{
+ if (f6i->nh) {
+ struct net_device *_dev = (struct net_device *)dev;
+
+ return !!nexthop_for_each_fib6_nh(f6i->nh,
+ fib6_info_nh_uses_dev,
+ _dev);
+ }
+
+ if (f6i->fib6_nh->fib_nh_dev == dev)
+ return true;
+
+ if (READ_ONCE(f6i->fib6_nsiblings)) {
+ const struct fib6_info *sibling;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sibling, &f6i->fib6_siblings,
+ fib6_siblings) {
+ if (sibling->fib6_nh->fib_nh_dev == dev) {
+ rcu_read_unlock();
+ return true;
+ }
+ if (!READ_ONCE(f6i->fib6_nsiblings))
+ break;
+ }
+ rcu_read_unlock();
+ }
+ return false;
+}
+
+struct fib6_nh_exception_dump_walker {
+ struct rt6_rtnl_dump_arg *dump;
+ struct fib6_info *rt;
+ unsigned int flags;
+ unsigned int skip;
+ unsigned int count;
+};
+
+static int rt6_nh_dump_exceptions(struct fib6_nh *nh, void *arg)
+{
+ struct fib6_nh_exception_dump_walker *w = arg;
+ struct rt6_rtnl_dump_arg *dump = w->dump;
+ struct rt6_exception_bucket *bucket;
+ struct rt6_exception *rt6_ex;
+ int i, err;
+
+ bucket = fib6_nh_get_excptn_bucket(nh, NULL);
+ if (!bucket)
+ return 0;
+
+ for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
+ hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
+ if (w->skip) {
+ w->skip--;
+ continue;
+ }
+
+ /* Expiration of entries doesn't bump sernum, insertion
+ * does. Removal is triggered by insertion, so we can
+ * rely on the fact that if entries change between two
+ * partial dumps, this node is scanned again completely,
+ * see rt6_insert_exception() and fib6_dump_table().
+ *
+ * Count expired entries we go through as handled
+ * entries that we'll skip next time, in case of partial
+ * node dump. Otherwise, if entries expire meanwhile,
+ * we'll skip the wrong amount.
+ */
+ if (rt6_check_expired(rt6_ex->rt6i)) {
+ w->count++;
+ continue;
+ }
+
+ err = rt6_fill_node(dump->net, dump->skb, w->rt,
+ &rt6_ex->rt6i->dst, NULL, NULL, 0,
+ RTM_NEWROUTE,
+ NETLINK_CB(dump->cb->skb).portid,
+ dump->cb->nlh->nlmsg_seq, w->flags);
+ if (err)
+ return err;
+
+ w->count++;
+ }
+ bucket++;
+ }
+
+ return 0;
+}
+
+/* Return -1 if done with node, number of handled routes on partial dump */
+int rt6_dump_route(struct fib6_info *rt, void *p_arg, unsigned int skip)
{
struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
+ struct fib_dump_filter *filter = &arg->filter;
+ unsigned int flags = NLM_F_MULTI;
struct net *net = arg->net;
+ int count = 0;
if (rt == net->ipv6.fib6_null_entry)
- return 0;
+ return -1;
+
+ if ((filter->flags & RTM_F_PREFIX) &&
+ !(rt->fib6_flags & RTF_PREFIX_RT)) {
+ /* success since this is not a prefix route */
+ return -1;
+ }
+ if (filter->filter_set &&
+ ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
+ (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
+ (filter->protocol && rt->fib6_protocol != filter->protocol))) {
+ return -1;
+ }
+
+ if (filter->filter_set ||
+ !filter->dump_routes || !filter->dump_exceptions) {
+ flags |= NLM_F_DUMP_FILTERED;
+ }
+
+ if (filter->dump_routes) {
+ if (skip) {
+ skip--;
+ } else {
+ if (rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL,
+ 0, RTM_NEWROUTE,
+ NETLINK_CB(arg->cb->skb).portid,
+ arg->cb->nlh->nlmsg_seq, flags)) {
+ return 0;
+ }
+ count++;
+ }
+ }
+
+ if (filter->dump_exceptions) {
+ struct fib6_nh_exception_dump_walker w = { .dump = arg,
+ .rt = rt,
+ .flags = flags,
+ .skip = skip,
+ .count = 0 };
+ int err;
+
+ rcu_read_lock();
+ if (rt->nh) {
+ err = nexthop_for_each_fib6_nh(rt->nh,
+ rt6_nh_dump_exceptions,
+ &w);
+ } else {
+ err = rt6_nh_dump_exceptions(rt->fib6_nh, &w);
+ }
+ rcu_read_unlock();
+
+ if (err)
+ return count + w.count;
+ }
+
+ return -1;
+}
+
+static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct rtmsg *rtm;
+ int i, err;
+
+ rtm = nlmsg_payload(nlh, sizeof(*rtm));
+ if (!rtm) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Invalid header for get route request");
+ return -EINVAL;
+ }
+
+ if (!netlink_strict_get_check(skb))
+ return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv6_policy, extack);
+
+ if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
+ (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
+ rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
+ rtm->rtm_type) {
+ NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
+ return -EINVAL;
+ }
+ if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Invalid flags for get route request");
+ return -EINVAL;
+ }
+
+ err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
+ rtm_ipv6_policy, extack);
+ if (err)
+ return err;
+
+ if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
+ (tb[RTA_DST] && !rtm->rtm_dst_len)) {
+ NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
+ return -EINVAL;
+ }
+
+ if (tb[RTA_FLOWLABEL] &&
+ (nla_get_be32(tb[RTA_FLOWLABEL]) & ~IPV6_FLOWLABEL_MASK)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL],
+ "Invalid flow label");
+ return -EINVAL;
+ }
- if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
- struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
+ for (i = 0; i <= RTA_MAX; i++) {
+ if (!tb[i])
+ continue;
- /* user wants prefix routes only */
- if (rtm->rtm_flags & RTM_F_PREFIX &&
- !(rt->fib6_flags & RTF_PREFIX_RT)) {
- /* success since this is not a prefix route */
- return 1;
+ switch (i) {
+ case RTA_SRC:
+ case RTA_DST:
+ case RTA_IIF:
+ case RTA_OIF:
+ case RTA_MARK:
+ case RTA_UID:
+ case RTA_SPORT:
+ case RTA_DPORT:
+ case RTA_IP_PROTO:
+ case RTA_FLOWLABEL:
+ break;
+ default:
+ NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
+ return -EINVAL;
}
}
- return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
- RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
- arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
+ return 0;
}
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
@@ -4850,18 +6191,16 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct rt6_info *rt;
struct sk_buff *skb;
struct rtmsg *rtm;
- struct flowi6 fl6;
+ struct flowi6 fl6 = {};
+ __be32 flowlabel;
bool fibmatch;
- err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
- extack);
+ err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
if (err < 0)
goto errout;
err = -EINVAL;
- memset(&fl6, 0, sizeof(fl6));
rtm = nlmsg_data(nlh);
- fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
if (tb[RTA_SRC]) {
@@ -4901,11 +6240,15 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
if (tb[RTA_IP_PROTO]) {
err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
- &fl6.flowi6_proto, extack);
+ &fl6.flowi6_proto, AF_INET6,
+ extack);
if (err)
goto errout;
}
+ flowlabel = nla_get_be32_default(tb[RTA_FLOWLABEL], 0);
+ fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, flowlabel);
+
if (iif) {
struct net_device *dev;
int flags = 0;
@@ -4934,7 +6277,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
}
- rt = container_of(dst, struct rt6_info, dst);
+ rt = dst_rt6_info(dst);
if (rt->dst.error) {
err = rt->dst.error;
ip6_rt_put(rt);
@@ -4958,16 +6301,20 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
rcu_read_lock();
from = rcu_dereference(rt->from);
-
- if (fibmatch)
- err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
- RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
- nlh->nlmsg_seq, 0);
- else
- err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
- &fl6.saddr, iif, RTM_NEWROUTE,
- NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
- 0);
+ if (from) {
+ if (fibmatch)
+ err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
+ iif, RTM_NEWROUTE,
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0);
+ else
+ err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
+ &fl6.saddr, iif, RTM_NEWROUTE,
+ NETLINK_CB(in_skb).portid,
+ nlh->nlmsg_seq, 0);
+ } else {
+ err = -ENETUNREACH;
+ }
rcu_read_unlock();
if (err < 0) {
@@ -4983,21 +6330,58 @@ errout:
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
unsigned int nlm_flags)
{
- struct sk_buff *skb;
struct net *net = info->nl_net;
+ struct sk_buff *skb;
+ size_t sz;
u32 seq;
int err;
err = -ENOBUFS;
seq = info->nlh ? info->nlh->nlmsg_seq : 0;
- skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
+ rcu_read_lock();
+ sz = rt6_nlmsg_size(rt);
+retry:
+ skb = nlmsg_new(sz, GFP_ATOMIC);
if (!skb)
goto errout;
err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
event, info->portid, seq, nlm_flags);
if (err < 0) {
+ kfree_skb(skb);
+ /* -EMSGSIZE implies needed space grew under us. */
+ if (err == -EMSGSIZE) {
+ sz = max(rt6_nlmsg_size(rt), sz << 1);
+ goto retry;
+ }
+ goto errout;
+ }
+
+ rcu_read_unlock();
+
+ rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
+ info->nlh, GFP_ATOMIC);
+ return;
+errout:
+ rcu_read_unlock();
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
+}
+
+void fib6_rt_update(struct net *net, struct fib6_info *rt,
+ struct nl_info *info)
+{
+ u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
+ if (!skb)
+ goto errout;
+
+ err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
+ RTM_NEWROUTE, info->portid, seq, NLM_F_REPLACE);
+ if (err < 0) {
/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
kfree_skb(skb);
@@ -5007,9 +6391,61 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
info->nlh, gfp_any());
return;
errout:
- if (err < 0)
- rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
+}
+
+void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
+ bool offload, bool trap, bool offload_failed)
+{
+ struct sk_buff *skb;
+ int err;
+
+ if (READ_ONCE(f6i->offload) == offload &&
+ READ_ONCE(f6i->trap) == trap &&
+ READ_ONCE(f6i->offload_failed) == offload_failed)
+ return;
+
+ WRITE_ONCE(f6i->offload, offload);
+ WRITE_ONCE(f6i->trap, trap);
+
+ /* 2 means send notifications only if offload_failed was changed. */
+ if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 &&
+ READ_ONCE(f6i->offload_failed) == offload_failed)
+ return;
+
+ WRITE_ONCE(f6i->offload_failed, offload_failed);
+
+ if (!rcu_access_pointer(f6i->fib6_node))
+ /* The route was removed from the tree, do not send
+ * notification.
+ */
+ return;
+
+ if (!net->ipv6.sysctl.fib_notify_on_flag_change)
+ return;
+
+ skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL);
+ if (!skb) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+
+ err = rt6_fill_node(net, skb, f6i, NULL, NULL, NULL, 0, RTM_NEWROUTE, 0,
+ 0, 0);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ROUTE, NULL, GFP_KERNEL);
+ return;
+
+errout:
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
+EXPORT_SYMBOL(fib6_info_hw_flags_set);
static int ip6_route_dev_notify(struct notifier_block *this,
unsigned long event, void *ptr)
@@ -5021,7 +6457,7 @@ static int ip6_route_dev_notify(struct notifier_block *this,
return NOTIFY_OK;
if (event == NETDEV_REGISTER) {
- net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
+ net->ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = dev;
net->ipv6.ip6_null_entry->dst.dev = dev;
net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -5068,29 +6504,32 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v)
#ifdef CONFIG_SYSCTL
-static
-int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+static int ipv6_sysctl_rtcache_flush(const struct ctl_table *ctl, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net;
int delay;
+ int ret;
if (!write)
return -EINVAL;
+ ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
net = (struct net *)ctl->extra1;
delay = net->ipv6.sysctl.flush_delay;
- proc_dointvec(ctl, write, buffer, lenp, ppos);
fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
return 0;
}
-struct ctl_table ipv6_route_table_template[] = {
+static struct ctl_table ipv6_route_table_template[] = {
{
- .procname = "flush",
- .data = &init_net.ipv6.sysctl.flush_delay,
+ .procname = "max_size",
+ .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
.maxlen = sizeof(int),
- .mode = 0200,
- .proc_handler = ipv6_sysctl_rtcache_flush
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
},
{
.procname = "gc_thresh",
@@ -5100,11 +6539,11 @@ struct ctl_table ipv6_route_table_template[] = {
.proc_handler = proc_dointvec,
},
{
- .procname = "max_size",
- .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
+ .procname = "flush",
+ .data = &init_net.ipv6.sysctl.flush_delay,
.maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
+ .mode = 0200,
+ .proc_handler = ipv6_sysctl_rtcache_flush
},
{
.procname = "gc_min_interval",
@@ -5155,7 +6594,15 @@ struct ctl_table ipv6_route_table_template[] = {
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
- { }
+ {
+ .procname = "skip_notify_on_dev_down",
+ .data = &init_net.ipv6.sysctl.skip_notify_on_dev_down,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
};
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
@@ -5167,10 +6614,10 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
GFP_KERNEL);
if (table) {
- table[0].data = &net->ipv6.sysctl.flush_delay;
- table[0].extra1 = net;
+ table[0].data = &net->ipv6.sysctl.ip6_rt_max_size;
table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
- table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
+ table[2].data = &net->ipv6.sysctl.flush_delay;
+ table[2].extra1 = net;
table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
@@ -5178,14 +6625,20 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
-
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns)
- table[0].procname = NULL;
+ table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;
}
return table;
}
+
+size_t ipv6_route_sysctl_table_size(struct net *net)
+{ /* Number of ipv6_route_table_template entries to use for @net. */
+ /* Don't export sysctls to unprivileged users */
+ if (net->user_ns != &init_user_ns)
+ return 1; /* namespace not owned by init_user_ns: a count of one entry only */
+
+ return ARRAY_SIZE(ipv6_route_table_template); /* otherwise every entry of the template */
+}
#endif
static int __net_init ip6_route_net_init(struct net *net)
@@ -5198,11 +6651,11 @@ static int __net_init ip6_route_net_init(struct net *net)
if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
goto out_ip6_dst_ops;
- net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
- sizeof(*net->ipv6.fib6_null_entry),
- GFP_KERNEL);
+ net->ipv6.fib6_null_entry = fib6_info_alloc(GFP_KERNEL, true);
if (!net->ipv6.fib6_null_entry)
goto out_ip6_dst_entries;
+ memcpy(net->ipv6.fib6_null_entry, &fib6_null_entry_template,
+ sizeof(*net->ipv6.fib6_null_entry));
net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
sizeof(*net->ipv6.ip6_null_entry),
@@ -5212,6 +6665,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
ip6_template_metrics, true);
+ INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
net->ipv6.fib6_has_custom_rules = false;
@@ -5223,6 +6677,7 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
ip6_template_metrics, true);
+ INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached);
net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
sizeof(*net->ipv6.ip6_blk_hole_entry),
@@ -5232,18 +6687,23 @@ static int __net_init ip6_route_net_init(struct net *net)
net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
ip6_template_metrics, true);
+ INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached);
+#ifdef CONFIG_IPV6_SUBTREES
+ net->ipv6.fib6_routes_require_src = 0;
+#endif
#endif
net->ipv6.sysctl.flush_delay = 0;
- net->ipv6.sysctl.ip6_rt_max_size = 4096;
+ net->ipv6.sysctl.ip6_rt_max_size = INT_MAX;
net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
+ net->ipv6.sysctl.skip_notify_on_dev_down = 0;
- net->ipv6.ip6_rt_gc_expire = 30*HZ;
+ atomic_set(&net->ipv6.ip6_rt_gc_expire, 30*HZ);
ret = 0;
out:
@@ -5277,10 +6737,16 @@ static void __net_exit ip6_route_net_exit(struct net *net)
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
- proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
- sizeof(struct ipv6_route_iter));
- proc_create_net_single("rt6_stats", 0444, net->proc_net,
- rt6_stats_seq_show, NULL);
+ if (!proc_create_net("ipv6_route", 0, net->proc_net, /* the result is now checked; the removed code ignored it */
+ &ipv6_route_seq_ops,
+ sizeof(struct ipv6_route_iter)))
+ return -ENOMEM; /* a NULL result is reported as -ENOMEM */
+
+ if (!proc_create_net_single("rt6_stats", 0444, net->proc_net,
+ rt6_stats_seq_show, NULL)) {
+ remove_proc_entry("ipv6_route", net->proc_net); /* undo the first entry before failing */
+ return -ENOMEM;
+ }
#endif
return 0;
}
@@ -5338,7 +6804,7 @@ void __init ip6_route_init_special_entries(void)
/* Registering of the loopback is done before this portion of code,
* the loopback reference in rt6_info will not be taken, do it
* manually for init_net */
- init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
+ init_net.ipv6.fib6_null_entry->fib6_nh->fib_nh_dev = init_net.loopback_dev;
init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -5349,6 +6815,51 @@ void __init ip6_route_init_special_entries(void)
#endif
}
+#if IS_BUILTIN(CONFIG_IPV6)
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) /* NOTE(review): presumably what provides struct bpf_iter__ipv6_route used in ipv6_route_reg_info - confirm against the macro */
+
+BTF_ID_LIST_SINGLE(btf_fib6_info_id, struct, fib6_info) /* read by bpf_iter_register() to fill in ctx_arg_info[0].btf_id */
+
+static const struct bpf_iter_seq_info ipv6_route_seq_info = {
+ .seq_ops = &ipv6_route_seq_ops, /* same seq_operations that ip6_route_net_init_late() passes to proc_create_net("ipv6_route", ...) */
+ .init_seq_private = bpf_iter_init_seq_net,
+ .fini_seq_private = bpf_iter_fini_seq_net,
+ .seq_priv_size = sizeof(struct ipv6_route_iter), /* same private size as the procfs entry */
+};
+
+static struct bpf_iter_reg ipv6_route_reg_info = { /* not const: bpf_iter_register() writes ctx_arg_info[0].btf_id */
+ .target = "ipv6_route",
+ .ctx_arg_info_size = 1, /* one entry in ctx_arg_info[] below */
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__ipv6_route, rt), /* offset of the rt member of the iterator context */
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &ipv6_route_seq_info,
+};
+
+static int __init bpf_iter_register(void)
+{ /* Fill in the BTF id and register ipv6_route_reg_info; called from ip6_route_init(). */
+ ipv6_route_reg_info.ctx_arg_info[0].btf_id = *btf_fib6_info_id; /* id taken from the BTF_ID_LIST_SINGLE declared for struct fib6_info */
+ return bpf_iter_reg_target(&ipv6_route_reg_info); /* result is returned unchanged to the caller */
+}
+
+static void bpf_iter_unregister(void)
+{ /* Counterpart of bpf_iter_register(); called from ip6_route_cleanup(). */
+ bpf_iter_unreg_target(&ipv6_route_reg_info); /* same descriptor that bpf_iter_register() registered */
+}
+#endif
+#endif
+
+static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = { /* passed to rtnl_register_many() in ip6_route_init(), replacing three rtnl_register_module() calls */
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE,
+ .doit = inet6_rtm_newroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, /* the removed registration passed flags 0 for RTM_NEWROUTE */
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE,
+ .doit = inet6_rtm_delroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, /* the removed registration passed flags 0 for RTM_DELROUTE */
+ {.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE,
+ .doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, /* unchanged: RTM_GETROUTE already used this flag */
+};
+
int __init ip6_route_init(void)
{
int ret;
@@ -5357,7 +6868,7 @@ int __init ip6_route_init(void)
ret = -ENOMEM;
ip6_dst_ops_template.kmem_cachep =
kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
- SLAB_HWCACHE_ALIGN, NULL);
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
if (!ip6_dst_ops_template.kmem_cachep)
goto out;
@@ -5391,25 +6902,21 @@ int __init ip6_route_init(void)
if (ret)
goto fib6_rules_init;
- ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
- inet6_rtm_newroute, NULL, 0);
+ ret = rtnl_register_many(ip6_route_rtnl_msg_handlers);
if (ret < 0)
goto out_register_late_subsys;
- ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
- inet6_rtm_delroute, NULL, 0);
- if (ret < 0)
- goto out_register_late_subsys;
-
- ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
- inet6_rtm_getroute, NULL,
- RTNL_FLAG_DOIT_UNLOCKED);
- if (ret < 0)
+ ret = register_netdevice_notifier(&ip6_route_dev_notifier);
+ if (ret)
goto out_register_late_subsys;
- ret = register_netdevice_notifier(&ip6_route_dev_notifier);
+#if IS_BUILTIN(CONFIG_IPV6)
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+ ret = bpf_iter_register();
if (ret)
goto out_register_late_subsys;
+#endif
+#endif
for_each_possible_cpu(cpu) {
struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
@@ -5443,6 +6950,11 @@ out_kmem_cache:
void ip6_route_cleanup(void)
{
+#if IS_BUILTIN(CONFIG_IPV6)
+#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
+ bpf_iter_unregister();
+#endif
+#endif
unregister_netdevice_notifier(&ip6_route_dev_notifier);
unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_cleanup();
diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c
new file mode 100644
index 000000000000..e186998bfbf7
--- /dev/null
+++ b/net/ipv6/rpl.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Authors:
+ * (C) 2020 Alexander Aring <alex.aring@gmail.com>
+ */
+
+#include <net/ipv6.h>
+#include <net/rpl.h>
+
+#define IPV6_PFXTAIL_LEN(x) (sizeof(struct in6_addr) - (x)) /* bytes of an in6_addr that remain after its first x bytes */
+#define IPV6_RPL_BEST_ADDR_COMPRESSION 15 /* returned by the calc_cmpri/calc_cmpre helpers when no byte differs */
+
+static void ipv6_rpl_addr_decompress(struct in6_addr *dst,
+ const struct in6_addr *daddr,
+ const void *post, unsigned char pfx)
+{ /* Build *dst from the first pfx bytes of daddr followed by the bytes stored at post. */
+ memcpy(dst, daddr, pfx); /* leading pfx bytes are copied from daddr */
+ memcpy(&dst->s6_addr[pfx], post, IPV6_PFXTAIL_LEN(pfx)); /* the remaining sizeof(struct in6_addr) - pfx bytes come from post */
+}
+
+static void ipv6_rpl_addr_compress(void *dst, const struct in6_addr *addr,
+ unsigned char pfx)
+{ /* Store at dst only the tail of addr, i.e. everything after its first pfx bytes. */
+ memcpy(dst, &addr->s6_addr[pfx], IPV6_PFXTAIL_LEN(pfx)); /* inverse of the second copy in ipv6_rpl_addr_decompress() */
+}
+
+static void *ipv6_rpl_segdata_pos(const struct ipv6_rpl_sr_hdr *hdr, int i)
+{ /* Address of entry i inside hdr->rpl_segdata; every entry is sized from hdr->cmpri. */
+ return (void *)&hdr->rpl_segdata[i * IPV6_PFXTAIL_LEN(hdr->cmpri)]; /* the cast drops the const inherited from hdr */
+}
+
+void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr,
+ const struct ipv6_rpl_sr_hdr *inhdr,
+ const struct in6_addr *daddr, unsigned char n)
+{ /* Expand inhdr into outhdr as n + 1 full-size addresses, taking the elided prefix bytes from daddr. */
+ int i;
+
+ outhdr->nexthdr = inhdr->nexthdr;
+ outhdr->hdrlen = (((n + 1) * sizeof(struct in6_addr)) >> 3); /* n + 1 full addresses, expressed in 8-byte units */
+ outhdr->pad = 0;
+ outhdr->type = inhdr->type;
+ outhdr->segments_left = inhdr->segments_left;
+ outhdr->cmpri = 0; /* cmpri and cmpre are 0 in the output: no bytes are elided */
+ outhdr->cmpre = 0;
+
+ for (i = 0; i < n; i++) /* the first n entries use inhdr->cmpri as prefix length */
+ ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[i], daddr,
+ ipv6_rpl_segdata_pos(inhdr, i),
+ inhdr->cmpri);
+
+ ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[n], daddr, /* entry n (the last one) uses inhdr->cmpre instead */
+ ipv6_rpl_segdata_pos(inhdr, n),
+ inhdr->cmpre);
+}
+
+static unsigned char ipv6_rpl_srh_calc_cmpri(const struct ipv6_rpl_sr_hdr *inhdr,
+ const struct in6_addr *daddr,
+ unsigned char n)
+{ /* Index of the first byte at which any of the first n segment addresses differs from daddr. */
+ unsigned char plen;
+ int i;
+
+ for (plen = 0; plen < sizeof(*daddr); plen++) { /* outer loop walks byte positions, inner loop walks addresses */
+ for (i = 0; i < n; i++) {
+ if (daddr->s6_addr[plen] !=
+ inhdr->rpl_segaddr[i].s6_addr[plen])
+ return plen; /* first mismatch: exactly plen leading bytes are common to all */
+ }
+ }
+
+ return IPV6_RPL_BEST_ADDR_COMPRESSION; /* no mismatch found; also the result when n is 0 */
+}
+
+static unsigned char ipv6_rpl_srh_calc_cmpre(const struct in6_addr *daddr,
+ const struct in6_addr *last_segment)
+{ /* Index of the first byte at which last_segment differs from daddr. */
+ unsigned int plen;
+
+ for (plen = 0; plen < sizeof(*daddr); plen++) {
+ if (daddr->s6_addr[plen] != last_segment->s6_addr[plen])
+ return plen; /* plen is below sizeof(*daddr) here, so it fits the unsigned char return type */
+ }
+
+ return IPV6_RPL_BEST_ADDR_COMPRESSION; /* the two addresses are identical */
+}
+
+void ipv6_rpl_srh_compress(struct ipv6_rpl_sr_hdr *outhdr,
+ const struct ipv6_rpl_sr_hdr *inhdr,
+ const struct in6_addr *daddr, unsigned char n)
+{ /* Write to outhdr a compressed form of inhdr (n + 1 full addresses), eliding bytes shared with daddr. */
+ unsigned char cmpri, cmpre;
+ size_t seglen;
+ int i;
+
+ cmpri = ipv6_rpl_srh_calc_cmpri(inhdr, daddr, n); /* prefix length shared by daddr and the first n addresses */
+ cmpre = ipv6_rpl_srh_calc_cmpre(daddr, &inhdr->rpl_segaddr[n]); /* prefix length shared by daddr and the last address */
+
+ outhdr->nexthdr = inhdr->nexthdr;
+ seglen = (n * IPV6_PFXTAIL_LEN(cmpri)) + IPV6_PFXTAIL_LEN(cmpre); /* total bytes of compressed address data */
+ outhdr->hdrlen = seglen >> 3; /* whole 8-byte units */
+ if (seglen & 0x7) { /* not a multiple of 8: round hdrlen up and record the padding bytes */
+ outhdr->hdrlen++;
+ outhdr->pad = 8 - (seglen & 0x7);
+ } else {
+ outhdr->pad = 0;
+ }
+ outhdr->type = inhdr->type;
+ outhdr->segments_left = inhdr->segments_left;
+ outhdr->cmpri = cmpri; /* must be set before the ipv6_rpl_segdata_pos(outhdr, ...) calls below, which read it */
+ outhdr->cmpre = cmpre;
+
+ for (i = 0; i < n; i++) /* first n addresses keep their tail after cmpri bytes */
+ ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, i),
+ &inhdr->rpl_segaddr[i], cmpri);
+
+ ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, n), /* last address keeps its tail after cmpre bytes */
+ &inhdr->rpl_segaddr[n], cmpre);
+}
diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c
new file mode 100644
index 000000000000..c7942cf65567
--- /dev/null
+++ b/net/ipv6/rpl_iptunnel.c
@@ -0,0 +1,394 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Authors:
+ * (C) 2020 Alexander Aring <alex.aring@gmail.com>
+ */
+
+#include <linux/rpl_iptunnel.h>
+
+#include <net/dst_cache.h>
+#include <net/ip6_route.h>
+#include <net/lwtunnel.h>
+#include <net/ipv6.h>
+#include <net/rpl.h>
+
+struct rpl_iptunnel_encap {
+ DECLARE_FLEX_ARRAY(struct ipv6_rpl_sr_hdr, srh); /* filled by the memcpy() in rpl_build_state() */
+};
+
+struct rpl_lwt {
+ struct dst_cache cache; /* set up in rpl_build_state(), torn down in rpl_destroy_state() */
+ struct rpl_iptunnel_encap tuninfo; /* holds the SRH; rpl_build_state() allocates srh_len extra bytes for it */
+};
+
+static inline struct rpl_lwt *rpl_lwt_lwtunnel(struct lwtunnel_state *lwt)
+{ /* View lwt->data as the struct rpl_lwt belonging to this tunnel state. */
+ return (struct rpl_lwt *)lwt->data;
+}
+
+static inline struct rpl_iptunnel_encap *
+rpl_encap_lwtunnel(struct lwtunnel_state *lwt)
+{ /* Shortcut to the tuninfo member of the state's struct rpl_lwt. */
+ return &rpl_lwt_lwtunnel(lwt)->tuninfo;
+}
+
+static const struct nla_policy rpl_iptunnel_policy[RPL_IPTUNNEL_MAX + 1] = {
+ [RPL_IPTUNNEL_SRH] = { .type = NLA_BINARY }, /* no length is given here; rpl_build_state() checks the size itself */
+};
+
+static bool rpl_validate_srh(struct net *net, struct ipv6_rpl_sr_hdr *srh,
+ size_t seglen)
+{ /* true when srh, followed by seglen bytes of segment data, passes every check below. */
+ int err;
+
+ if ((srh->hdrlen << 3) != seglen) /* hdrlen counts 8-byte units and must match the data length exactly */
+ return false;
+
+ /* check at least one segment and seglen fit with segments_left */
+ if (!srh->segments_left ||
+ (srh->segments_left * sizeof(struct in6_addr)) != seglen)
+ return false;
+
+ if (srh->cmpri || srh->cmpre) /* only headers with both compression fields at 0 are accepted */
+ return false;
+
+ err = ipv6_chk_rpl_srh_loop(net, srh->rpl_segaddr,
+ srh->segments_left);
+ if (err) /* any non-zero result from the loop check rejects the header */
+ return false;
+
+ if (ipv6_addr_type(&srh->rpl_segaddr[srh->segments_left - 1]) & /* the last listed segment must not be multicast */
+ IPV6_ADDR_MULTICAST)
+ return false;
+
+ return true;
+}
+
+static int rpl_build_state(struct net *net, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
+ struct netlink_ext_ack *extack)
+{ /* Parse the nested attribute nla and hand a new tunnel state back through *ts; 0 on success, negative errno otherwise. */
+ struct nlattr *tb[RPL_IPTUNNEL_MAX + 1];
+ struct lwtunnel_state *newts;
+ struct ipv6_rpl_sr_hdr *srh;
+ struct rpl_lwt *rlwt;
+ int err, srh_len;
+
+ if (family != AF_INET6) /* any other family is rejected with -EINVAL */
+ return -EINVAL;
+
+ err = nla_parse_nested(tb, RPL_IPTUNNEL_MAX, nla,
+ rpl_iptunnel_policy, extack);
+ if (err < 0)
+ return err;
+
+ if (!tb[RPL_IPTUNNEL_SRH]) /* the SRH attribute is mandatory */
+ return -EINVAL;
+
+ srh = nla_data(tb[RPL_IPTUNNEL_SRH]);
+ srh_len = nla_len(tb[RPL_IPTUNNEL_SRH]);
+
+ if (srh_len < sizeof(*srh)) /* need at least the fixed part of the header before it is read */
+ return -EINVAL;
+
+ /* verify that SRH is consistent */
+ if (!rpl_validate_srh(net, srh, srh_len - sizeof(*srh)))
+ return -EINVAL;
+
+ newts = lwtunnel_state_alloc(srh_len + sizeof(*rlwt)); /* room for struct rpl_lwt plus the whole SRH */
+ if (!newts)
+ return -ENOMEM;
+
+ rlwt = rpl_lwt_lwtunnel(newts);
+
+ err = dst_cache_init(&rlwt->cache, GFP_ATOMIC);
+ if (err) {
+ kfree(newts); /* cache init failed: free the state allocated above */
+ return err;
+ }
+
+ memcpy(&rlwt->tuninfo.srh, srh, srh_len); /* keep a private copy of the validated SRH */
+
+ newts->type = LWTUNNEL_ENCAP_RPL;
+ newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
+ newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+
+ *ts = newts; /* *ts is written only on the success path */
+
+ return 0;
+}
+
+static void rpl_destroy_state(struct lwtunnel_state *lwt)
+{ /* Destroy the dst cache that rpl_build_state() initialised for this state. */
+ dst_cache_destroy(&rpl_lwt_lwtunnel(lwt)->cache);
+}
+
+static int rpl_do_srh_inline(struct sk_buff *skb, const struct rpl_lwt *rlwt,
+ const struct ipv6_rpl_sr_hdr *srh,
+ struct dst_entry *cache_dst)
+{ /* Insert a compressed copy of srh right after skb's IPv6 header and point the packet at srh's first segment. */
+ struct ipv6_rpl_sr_hdr *isrh, *csrh;
+ struct ipv6hdr oldhdr;
+ struct ipv6hdr *hdr;
+ unsigned char *buf;
+ size_t hdrlen;
+ int err;
+
+ memcpy(&oldhdr, ipv6_hdr(skb), sizeof(oldhdr)); /* keep a copy of the original header; the skb head is rearranged below */
+
+ buf = kcalloc(struct_size(srh, segments.addr, srh->segments_left), 2, GFP_ATOMIC); /* zeroed scratch space for two headers: isrh and csrh */
+ if (!buf)
+ return -ENOMEM;
+
+ isrh = (struct ipv6_rpl_sr_hdr *)buf;
+ csrh = (struct ipv6_rpl_sr_hdr *)(buf + ((srh->hdrlen + 1) << 3)); /* second header starts after the byte length implied by srh->hdrlen */
+
+ memcpy(isrh, srh, sizeof(*isrh));
+ memcpy(isrh->rpl_segaddr, &srh->rpl_segaddr[1], /* shift the list down by one: segment 0 becomes the packet's daddr further below */
+ (srh->segments_left - 1) * 16);
+ isrh->rpl_segaddr[srh->segments_left - 1] = oldhdr.daddr; /* the original destination becomes the last segment */
+
+ ipv6_rpl_srh_compress(csrh, isrh, &srh->rpl_segaddr[0],
+ isrh->segments_left - 1);
+
+ hdrlen = ((csrh->hdrlen + 1) << 3); /* byte length of the compressed header */
+
+ err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb));
+ if (unlikely(err)) {
+ kfree(buf); /* error path: release the scratch buffer before returning */
+ return err;
+ }
+
+ skb_pull(skb, sizeof(struct ipv6hdr));
+ skb_postpull_rcsum(skb, skb_network_header(skb),
+ sizeof(struct ipv6hdr));
+
+ skb_push(skb, sizeof(struct ipv6hdr) + hdrlen); /* pushes back the IPv6 header size plus hdrlen bytes for the new header */
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+
+ hdr = ipv6_hdr(skb);
+ memmove(hdr, &oldhdr, sizeof(*hdr)); /* put the saved IPv6 header at the new start */
+ isrh = (void *)hdr + sizeof(*hdr); /* isrh is reused: it now points into the skb, directly after the IPv6 header */
+ memcpy(isrh, csrh, hdrlen);
+
+ isrh->nexthdr = hdr->nexthdr; /* the routing header takes over the old next-header value */
+ hdr->nexthdr = NEXTHDR_ROUTING;
+ hdr->daddr = srh->rpl_segaddr[0];
+
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen);
+
+ kfree(buf); /* csrh has been copied into the skb; the scratch buffer is no longer needed */
+
+ return 0;
+}
+
+static int rpl_do_srh(struct sk_buff *skb, const struct rpl_lwt *rlwt,
+ struct dst_entry *cache_dst)
+{ /* Check the protocol and run rpl_do_srh_inline() with the SRH stored in the skb's lwtstate. */
+ struct dst_entry *dst = skb_dst(skb);
+ struct rpl_iptunnel_encap *tinfo;
+
+ if (skb->protocol != htons(ETH_P_IPV6)) /* anything but IPv6 is refused with -EINVAL */
+ return -EINVAL;
+
+ tinfo = rpl_encap_lwtunnel(dst->lwtstate); /* the SRH comes from the lwtstate of the skb's current dst */
+
+ return rpl_do_srh_inline(skb, rlwt, tinfo->srh, cache_dst);
+}
+
+static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{ /* .output callback of rpl_ops: add the SRH, pick a dst for the rewritten packet and pass it to dst_output(). */
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct dst_entry *dst = NULL;
+ struct rpl_lwt *rlwt;
+ int err;
+
+ rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate);
+
+ local_bh_disable();
+ dst = dst_cache_get(&rlwt->cache); /* may be NULL; that case is handled by the !dst branch below */
+ local_bh_enable();
+
+ err = rpl_do_srh(skb, rlwt, dst);
+ if (unlikely(err))
+ goto drop;
+
+ if (unlikely(!dst)) { /* nothing cached: look the route up from the rewritten header */
+ struct ipv6hdr *hdr = ipv6_hdr(skb);
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.daddr = hdr->daddr;
+ fl6.saddr = hdr->saddr;
+ fl6.flowlabel = ip6_flowinfo(hdr);
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = hdr->nexthdr;
+
+ dst = ip6_route_output(net, NULL, &fl6);
+ if (dst->error) {
+ err = dst->error; /* read the error before dst is released at drop: */
+ goto drop;
+ }
+
+ /* cache only if we don't create a dst reference loop */
+ if (orig_dst->lwtstate != dst->lwtstate) {
+ local_bh_disable();
+ dst_cache_set_ip6(&rlwt->cache, dst, &fl6.saddr);
+ local_bh_enable();
+ }
+
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
+ if (unlikely(err))
+ goto drop;
+ }
+
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+
+ return dst_output(net, sk, skb);
+
+drop:
+ dst_release(dst); /* NOTE(review): dst is still NULL here when rpl_do_srh() fails with an empty cache; relies on dst_release() accepting NULL - confirm */
+ kfree_skb(skb);
+ return err;
+}
+
+static int rpl_input(struct sk_buff *skb)
+{ /* .input callback of rpl_ops: add the SRH, settle the skb's dst and pass it to dst_input(). */
+ struct dst_entry *orig_dst = skb_dst(skb);
+ struct dst_entry *dst = NULL;
+ struct lwtunnel_state *lwtst;
+ struct rpl_lwt *rlwt;
+ int err;
+
+ /* We cannot dereference "orig_dst" once ip6_route_input() or
+ * skb_dst_drop() is called. However, in order to detect a dst loop, we
+ * need the address of its lwtstate. So, save the address of lwtstate
+ * now and use it later as a comparison.
+ */
+ lwtst = orig_dst->lwtstate;
+
+ rlwt = rpl_lwt_lwtunnel(lwtst);
+
+ local_bh_disable();
+ dst = dst_cache_get(&rlwt->cache); /* may be NULL; that case is handled by the !dst branch below */
+ local_bh_enable();
+
+ err = rpl_do_srh(skb, rlwt, dst);
+ if (unlikely(err)) {
+ dst_release(dst); /* released here because the drop: label below does not release dst */
+ goto drop;
+ }
+
+ if (!dst) { /* nothing cached: ip6_route_input() runs and the dst is read back from the skb */
+ ip6_route_input(skb);
+ dst = skb_dst(skb);
+
+ /* cache only if we don't create a dst reference loop */
+ if (!dst->error && lwtst != dst->lwtstate) {
+ local_bh_disable();
+ dst_cache_set_ip6(&rlwt->cache, dst,
+ &ipv6_hdr(skb)->saddr);
+ local_bh_enable();
+ }
+
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
+ if (unlikely(err))
+ goto drop;
+ } else { /* cached dst: replace the skb's current dst with it */
+ skb_dst_drop(skb);
+ skb_dst_set(skb, dst);
+ }
+
+ return dst_input(skb);
+
+drop:
+ kfree_skb(skb);
+ return err;
+}
+
+static int nla_put_rpl_srh(struct sk_buff *skb, int attrtype,
+ struct rpl_iptunnel_encap *tuninfo)
+{ /* Append the stored SRH to skb as one attribute of type attrtype; 0 or -EMSGSIZE. */
+ struct rpl_iptunnel_encap *data;
+ struct nlattr *nla;
+ int len;
+
+ len = RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh); /* payload size is derived from the stored header by the macro */
+
+ nla = nla_reserve(skb, attrtype, len);
+ if (!nla) /* reservation failed: reported as -EMSGSIZE */
+ return -EMSGSIZE;
+
+ data = nla_data(nla);
+ memcpy(data, tuninfo->srh, len); /* the attribute payload is the raw SRH bytes */
+
+ return 0;
+}
+
+static int rpl_fill_encap_info(struct sk_buff *skb,
+ struct lwtunnel_state *lwtstate)
+{ /* .fill_encap callback of rpl_ops: emit the RPL_IPTUNNEL_SRH attribute for this state. */
+ struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate);
+
+ if (nla_put_rpl_srh(skb, RPL_IPTUNNEL_SRH, tuninfo)) /* any non-zero result is reported as -EMSGSIZE */
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int rpl_encap_nlsize(struct lwtunnel_state *lwtstate)
+{ /* .get_encap_size callback of rpl_ops. */
+ struct rpl_iptunnel_encap *tuninfo = rpl_encap_lwtunnel(lwtstate);
+
+ return nla_total_size(RPL_IPTUNNEL_SRH_SIZE(tuninfo->srh)); /* sized for the single attribute that rpl_fill_encap_info() emits */
+}
+
+static int rpl_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
+{ /* .cmp_encap callback of rpl_ops: 0 when both states hold the same SRH, non-zero otherwise. */
+ struct rpl_iptunnel_encap *a_hdr = rpl_encap_lwtunnel(a);
+ struct rpl_iptunnel_encap *b_hdr = rpl_encap_lwtunnel(b);
+ int len = RPL_IPTUNNEL_SRH_SIZE(a_hdr->srh);
+
+ if (len != RPL_IPTUNNEL_SRH_SIZE(b_hdr->srh))
+ return 1; /* different SRH sizes: treated as not equal without comparing bytes */
+
+ return memcmp(a_hdr, b_hdr, len); /* 0 only when the first len bytes are identical */
+}
+
+static const struct lwtunnel_encap_ops rpl_ops = { /* registered for LWTUNNEL_ENCAP_RPL by rpl_init(), removed by rpl_exit() */
+ .build_state = rpl_build_state,
+ .destroy_state = rpl_destroy_state,
+ .output = rpl_output,
+ .input = rpl_input,
+ .fill_encap = rpl_fill_encap_info,
+ .get_encap_size = rpl_encap_nlsize,
+ .cmp_encap = rpl_encap_cmp,
+ .owner = THIS_MODULE,
+};
+
+int __init rpl_init(void)
+{ /* Register rpl_ops for LWTUNNEL_ENCAP_RPL; 0 on success, otherwise the error from lwtunnel_encap_add_ops(). */
+ int err;
+
+ err = lwtunnel_encap_add_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL);
+ if (err)
+ goto out; /* registration failed: skip the banner and return the error */
+
+ pr_info("RPL Segment Routing with IPv6\n");
+
+ return 0;
+
+out:
+ return err;
+}
+
+void rpl_exit(void)
+{ /* Remove the ops that rpl_init() registered. */
+ lwtunnel_encap_del_ops(&rpl_ops, LWTUNNEL_ENCAP_RPL);
+}
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 8d0ba757a46c..a5c4c629b788 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -1,14 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* SR-IPv6 implementation
*
* Author:
* David Lebrun <david.lebrun@uclouvain.be>
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/errno.h>
@@ -26,14 +21,13 @@
#include <net/genetlink.h>
#include <linux/seg6.h>
#include <linux/seg6_genl.h>
-#ifdef CONFIG_IPV6_SEG6_HMAC
#include <net/seg6_hmac.h>
-#endif
-bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len)
+bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len, bool reduced)
{
- int trailing;
unsigned int tlv_offset;
+ int max_last_entry;
+ int trailing;
if (srh->type != IPV6_SRCRT_TYPE_4)
return false;
@@ -41,8 +35,17 @@ bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len)
if (((srh->hdrlen + 1) << 3) != len)
return false;
- if (srh->segments_left > srh->first_segment)
+ if (!reduced && srh->segments_left > srh->first_segment) {
return false;
+ } else {
+ max_last_entry = (srh->hdrlen / 2) - 1;
+
+ if (srh->first_segment > max_last_entry)
+ return false;
+
+ if (srh->segments_left > srh->first_segment + 1)
+ return false;
+ }
tlv_offset = sizeof(*srh) + ((srh->first_segment + 1) << 4);
@@ -70,6 +73,65 @@ bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len)
return true;
}
+struct ipv6_sr_hdr *seg6_get_srh(struct sk_buff *skb, int flags)
+{ /* Find the routing header in skb and validate it; NULL when it is absent, cannot be pulled, or fails validation. */
+ struct ipv6_sr_hdr *srh;
+ int len, srhoff = 0;
+
+ if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, &flags) < 0) /* srhoff receives the header's offset from skb->data */
+ return NULL;
+
+ if (!pskb_may_pull(skb, srhoff + sizeof(*srh))) /* fixed part first, so that hdrlen can be read */
+ return NULL;
+
+ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+ len = (srh->hdrlen + 1) << 3; /* header length in bytes, derived from hdrlen */
+
+ if (!pskb_may_pull(skb, srhoff + len)) /* then the whole header */
+ return NULL;
+
+ /* note that pskb_may_pull may change pointers in header;
+ * for this reason it is necessary to reload them when needed.
+ */
+ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+
+ if (!seg6_validate_srh(srh, len, true)) /* validated with reduced == true */
+ return NULL;
+
+ return srh; /* points into skb's data, not a copy */
+}
+
+/* Determine if an ICMP invoking packet contains a segment routing
+ * header. If it does, extract the offset to the true destination
+ * address, which is in the first segment address.
+ */
+void seg6_icmp_srh(struct sk_buff *skb, struct inet6_skb_parm *opt)
+{
+ __u16 network_header = skb->network_header; /* saved so it can be restored at out: */
+ struct ipv6_sr_hdr *srh;
+
+ /* Update network header to point to the invoking packet
+ * inside the ICMP packet, so we can use the seg6_get_srh()
+ * helper.
+ */
+ skb_reset_network_header(skb);
+
+ srh = seg6_get_srh(skb, 0);
+ if (!srh) /* no usable SRH: opt is left untouched */
+ goto out;
+
+ if (srh->type != IPV6_SRCRT_TYPE_4)
+ goto out;
+
+ opt->flags |= IP6SKB_SEG6;
+ opt->srhoff = (unsigned char *)srh - skb->data; /* what is stored is the SRH's offset from skb->data */
+
+out:
+ /* Restore the network header back to the ICMP packet */
+ skb->network_header = network_header;
+}
+
static struct genl_family seg6_genl_family;
static const struct nla_policy seg6_genl_policy[SEG6_ATTR_MAX + 1] = {
@@ -117,9 +179,6 @@ static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info)
hinfo = seg6_hmac_info_lookup(net, hmackeyid);
if (!slen) {
- if (!hinfo)
- err = -ENOENT;
-
err = seg6_hmac_info_del(net, hmackeyid);
goto out_unlock;
@@ -130,6 +189,11 @@ static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info)
goto out_unlock;
}
+ if (slen > nla_len(info->attrs[SEG6_ATTR_SECRET])) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+
if (hinfo) {
err = seg6_hmac_info_del(net, hmackeyid);
if (err)
@@ -221,9 +285,7 @@ static int seg6_genl_get_tunsrc(struct sk_buff *skb, struct genl_info *info)
rcu_read_unlock();
genlmsg_end(msg, hdr);
- genlmsg_reply(msg, info);
-
- return 0;
+ return genlmsg_reply(msg, info);
nla_put_failure:
rcu_read_unlock();
@@ -373,9 +435,11 @@ static int __net_init seg6_net_init(struct net *net)
net->ipv6.seg6_data = sdata;
-#ifdef CONFIG_IPV6_SEG6_HMAC
- seg6_hmac_net_init(net);
-#endif
+ if (seg6_hmac_net_init(net)) {
+ kfree(rcu_dereference_raw(sdata->tun_src));
+ kfree(sdata);
+ return -ENOMEM;
+ }
return 0;
}
@@ -384,11 +448,9 @@ static void __net_exit seg6_net_exit(struct net *net)
{
struct seg6_pernet_data *sdata = seg6_pernet(net);
-#ifdef CONFIG_IPV6_SEG6_HMAC
seg6_hmac_net_exit(net);
-#endif
- kfree(sdata->tun_src);
+ kfree(rcu_dereference_raw(sdata->tun_src));
kfree(sdata);
}
@@ -400,28 +462,28 @@ static struct pernet_operations ip6_segments_ops = {
static const struct genl_ops seg6_genl_ops[] = {
{
.cmd = SEG6_CMD_SETHMAC,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* added to every op by this change; the per-op .policy is dropped and seg6_genl_family.policy is set instead */
.doit = seg6_genl_sethmac,
- .policy = seg6_genl_policy,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = SEG6_CMD_DUMPHMAC,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.start = seg6_genl_dumphmac_start,
.dumpit = seg6_genl_dumphmac,
.done = seg6_genl_dumphmac_done,
- .policy = seg6_genl_policy,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = SEG6_CMD_SET_TUNSRC,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = seg6_genl_set_tunsrc,
- .policy = seg6_genl_policy,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = SEG6_CMD_GET_TUNSRC,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = seg6_genl_get_tunsrc,
- .policy = seg6_genl_policy,
.flags = GENL_ADMIN_PERM,
},
};
@@ -431,69 +493,52 @@ static struct genl_family seg6_genl_family __ro_after_init = {
.name = SEG6_GENL_NAME,
.version = SEG6_GENL_VERSION,
.maxattr = SEG6_ATTR_MAX,
+ .policy = seg6_genl_policy,
.netnsok = true,
.parallel_ops = true,
.ops = seg6_genl_ops,
.n_ops = ARRAY_SIZE(seg6_genl_ops),
+ .resv_start_op = SEG6_CMD_GET_TUNSRC + 1,
.module = THIS_MODULE,
};
int __init seg6_init(void)
{
- int err = -ENOMEM;
+ int err; /* no initialiser needed: assigned by the first registration call below */
- err = genl_register_family(&seg6_genl_family);
+ err = register_pernet_subsys(&ip6_segments_ops); /* pernet ops are now registered before the genl family */
if (err)
goto out;
- err = register_pernet_subsys(&ip6_segments_ops);
+ err = genl_register_family(&seg6_genl_family);
if (err)
- goto out_unregister_genl;
+ goto out_unregister_pernet; /* only the pernet ops are registered at this point */
-#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
err = seg6_iptunnel_init();
if (err)
- goto out_unregister_pernet;
+ goto out_unregister_genl; /* falls through to out_unregister_pernet as well */
err = seg6_local_init();
if (err)
- goto out_unregister_pernet;
-#endif
-
-#ifdef CONFIG_IPV6_SEG6_HMAC
- err = seg6_hmac_init();
- if (err)
goto out_unregister_iptun;
-#endif
pr_info("Segment Routing with IPv6\n");
out:
return err;
-#ifdef CONFIG_IPV6_SEG6_HMAC
out_unregister_iptun:
-#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
- seg6_local_exit();
seg6_iptunnel_exit();
-#endif
-#endif
-#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
-out_unregister_pernet:
- unregister_pernet_subsys(&ip6_segments_ops);
-#endif
out_unregister_genl:
genl_unregister_family(&seg6_genl_family);
+out_unregister_pernet: /* last unwind step, matching the first registration step */
+ unregister_pernet_subsys(&ip6_segments_ops);
goto out;
}
void seg6_exit(void)
{
-#ifdef CONFIG_IPV6_SEG6_HMAC
- seg6_hmac_exit();
-#endif
-#ifdef CONFIG_IPV6_SEG6_LWTUNNEL
+ seg6_local_exit(); /* teardown now mirrors seg6_init() in reverse: local, iptunnel, genl, pernet */
seg6_iptunnel_exit();
-#endif
- unregister_pernet_subsys(&ip6_segments_ops);
genl_unregister_family(&seg6_genl_family);
+ unregister_pernet_subsys(&ip6_segments_ops); /* moved after the genl family is unregistered */
}
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index 8546f94f30d4..ee6bac0160ac 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -1,14 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* SR-IPv6 implementation -- HMAC functions
*
* Author:
* David Lebrun <david.lebrun@uclouvain.be>
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/errno.h>
@@ -21,7 +16,6 @@
#include <linux/in6.h>
#include <linux/icmpv6.h>
#include <linux/mroute6.h>
-#include <linux/slab.h>
#include <linux/rhashtable.h>
#include <linux/netfilter.h>
@@ -39,15 +33,22 @@
#include <net/addrconf.h>
#include <net/xfrm.h>
-#include <linux/cryptohash.h>
-#include <crypto/hash.h>
-#include <crypto/sha.h>
+#include <crypto/sha1.h>
+#include <crypto/sha2.h>
+#include <crypto/utils.h>
#include <net/seg6.h>
#include <net/genetlink.h>
#include <net/seg6_hmac.h>
#include <linux/random.h>
-static DEFINE_PER_CPU(char [SEG6_HMAC_RING_SIZE], hmac_ring);
+struct hmac_storage {
+ local_lock_t bh_lock; /* held via local_lock_nested_bh() while seg6_hmac_compute() uses hmac_ring */
+ char hmac_ring[SEG6_HMAC_RING_SIZE]; /* scratch buffer in which seg6_hmac_compute() assembles the HMAC input */
+};
+
+static DEFINE_PER_CPU(struct hmac_storage, hmac_storage) = { /* replaces the removed per-CPU char array hmac_ring */
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
static int seg6_hmac_cmpfn(struct rhashtable_compare_arg *arg, const void *obj)
{
@@ -77,17 +78,6 @@ static const struct rhashtable_params rht_params = {
.obj_cmpfn = seg6_hmac_cmpfn,
};
-static struct seg6_hmac_algo hmac_algos[] = {
- {
- .alg_id = SEG6_HMAC_ALGO_SHA1,
- .name = "hmac(sha1)",
- },
- {
- .alg_id = SEG6_HMAC_ALGO_SHA256,
- .name = "hmac(sha256)",
- },
-};
-
static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh)
{
struct sr6_tlv_hmac *tlv;
@@ -107,75 +97,13 @@ static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh)
return tlv;
}
-static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id)
-{
- struct seg6_hmac_algo *algo;
- int i, alg_count;
-
- alg_count = ARRAY_SIZE(hmac_algos);
- for (i = 0; i < alg_count; i++) {
- algo = &hmac_algos[i];
- if (algo->alg_id == alg_id)
- return algo;
- }
-
- return NULL;
-}
-
-static int __do_hmac(struct seg6_hmac_info *hinfo, const char *text, u8 psize,
- u8 *output, int outlen)
-{
- struct seg6_hmac_algo *algo;
- struct crypto_shash *tfm;
- struct shash_desc *shash;
- int ret, dgsize;
-
- algo = __hmac_get_algo(hinfo->alg_id);
- if (!algo)
- return -ENOENT;
-
- tfm = *this_cpu_ptr(algo->tfms);
-
- dgsize = crypto_shash_digestsize(tfm);
- if (dgsize > outlen) {
- pr_debug("sr-ipv6: __do_hmac: digest size too big (%d / %d)\n",
- dgsize, outlen);
- return -ENOMEM;
- }
-
- ret = crypto_shash_setkey(tfm, hinfo->secret, hinfo->slen);
- if (ret < 0) {
- pr_debug("sr-ipv6: crypto_shash_setkey failed: err %d\n", ret);
- goto failed;
- }
-
- shash = *this_cpu_ptr(algo->shashs);
- shash->tfm = tfm;
-
- ret = crypto_shash_digest(shash, text, psize, output);
- if (ret < 0) {
- pr_debug("sr-ipv6: crypto_shash_digest failed: err %d\n", ret);
- goto failed;
- }
-
- return dgsize;
-
-failed:
- return ret;
-}
-
int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
struct in6_addr *saddr, u8 *output)
{
__be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid);
- u8 tmp_out[SEG6_HMAC_MAX_DIGESTSIZE];
- int plen, i, dgsize, wrsize;
+ int plen, i, ret = 0;
char *ring, *off;
- /* a 160-byte buffer for digest output allows to store highest known
- * hash function (RadioGatun) with up to 1216 bits
- */
-
/* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */
plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16;
@@ -194,7 +122,8 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
*/
local_bh_disable();
- ring = this_cpu_ptr(hmac_ring);
+ local_lock_nested_bh(&hmac_storage.bh_lock);
+ ring = this_cpu_ptr(hmac_storage.hmac_ring);
off = ring;
/* source address */
@@ -217,21 +146,25 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
off += 16;
}
- dgsize = __do_hmac(hinfo, ring, plen, tmp_out,
- SEG6_HMAC_MAX_DIGESTSIZE);
+ switch (hinfo->alg_id) {
+ case SEG6_HMAC_ALGO_SHA1:
+ hmac_sha1(&hinfo->key.sha1, ring, plen, output);
+ static_assert(SEG6_HMAC_FIELD_LEN > SHA1_DIGEST_SIZE);
+ memset(&output[SHA1_DIGEST_SIZE], 0,
+ SEG6_HMAC_FIELD_LEN - SHA1_DIGEST_SIZE);
+ break;
+ case SEG6_HMAC_ALGO_SHA256:
+ hmac_sha256(&hinfo->key.sha256, ring, plen, output);
+ static_assert(SEG6_HMAC_FIELD_LEN == SHA256_DIGEST_SIZE);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ ret = -EINVAL;
+ break;
+ }
+ local_unlock_nested_bh(&hmac_storage.bh_lock);
local_bh_enable();
-
- if (dgsize < 0)
- return dgsize;
-
- wrsize = SEG6_HMAC_FIELD_LEN;
- if (wrsize > dgsize)
- wrsize = dgsize;
-
- memset(output, 0, SEG6_HMAC_FIELD_LEN);
- memcpy(output, tmp_out, wrsize);
-
- return 0;
+ return ret;
}
EXPORT_SYMBOL(seg6_hmac_compute);
@@ -248,6 +181,7 @@ bool seg6_hmac_validate_skb(struct sk_buff *skb)
struct sr6_tlv_hmac *tlv;
struct ipv6_sr_hdr *srh;
struct inet6_dev *idev;
+ int require_hmac;
idev = __in6_dev_get(skb->dev);
@@ -255,16 +189,17 @@ bool seg6_hmac_validate_skb(struct sk_buff *skb)
tlv = seg6_get_tlv_hmac(srh);
+ require_hmac = READ_ONCE(idev->cnf.seg6_require_hmac);
/* mandatory check but no tlv */
- if (idev->cnf.seg6_require_hmac > 0 && !tlv)
+ if (require_hmac > 0 && !tlv)
return false;
/* no check */
- if (idev->cnf.seg6_require_hmac < 0)
+ if (require_hmac < 0)
return true;
/* check only if present */
- if (idev->cnf.seg6_require_hmac == 0 && !tlv)
+ if (require_hmac == 0 && !tlv)
return true;
/* now, seg6_require_hmac >= 0 && tlv */
@@ -276,7 +211,7 @@ bool seg6_hmac_validate_skb(struct sk_buff *skb)
if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output))
return false;
- if (memcmp(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN) != 0)
+ if (crypto_memneq(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN))
return false;
return true;
@@ -300,6 +235,19 @@ int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo)
struct seg6_pernet_data *sdata = seg6_pernet(net);
int err;
+ switch (hinfo->alg_id) {
+ case SEG6_HMAC_ALGO_SHA1:
+ hmac_sha1_preparekey(&hinfo->key.sha1,
+ hinfo->secret, hinfo->slen);
+ break;
+ case SEG6_HMAC_ALGO_SHA256:
+ hmac_sha256_preparekey(&hinfo->key.sha256,
+ hinfo->secret, hinfo->slen);
+ break;
+ default:
+ return -EINVAL;
+ }
+
err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node,
rht_params);
@@ -355,91 +303,12 @@ out:
}
EXPORT_SYMBOL(seg6_push_hmac);
-static int seg6_hmac_init_algo(void)
-{
- struct seg6_hmac_algo *algo;
- struct crypto_shash *tfm;
- struct shash_desc *shash;
- int i, alg_count, cpu;
-
- alg_count = ARRAY_SIZE(hmac_algos);
-
- for (i = 0; i < alg_count; i++) {
- struct crypto_shash **p_tfm;
- int shsize;
-
- algo = &hmac_algos[i];
- algo->tfms = alloc_percpu(struct crypto_shash *);
- if (!algo->tfms)
- return -ENOMEM;
-
- for_each_possible_cpu(cpu) {
- tfm = crypto_alloc_shash(algo->name, 0, 0);
- if (IS_ERR(tfm))
- return PTR_ERR(tfm);
- p_tfm = per_cpu_ptr(algo->tfms, cpu);
- *p_tfm = tfm;
- }
-
- p_tfm = raw_cpu_ptr(algo->tfms);
- tfm = *p_tfm;
-
- shsize = sizeof(*shash) + crypto_shash_descsize(tfm);
-
- algo->shashs = alloc_percpu(struct shash_desc *);
- if (!algo->shashs)
- return -ENOMEM;
-
- for_each_possible_cpu(cpu) {
- shash = kzalloc_node(shsize, GFP_KERNEL,
- cpu_to_node(cpu));
- if (!shash)
- return -ENOMEM;
- *per_cpu_ptr(algo->shashs, cpu) = shash;
- }
- }
-
- return 0;
-}
-
-int __init seg6_hmac_init(void)
-{
- return seg6_hmac_init_algo();
-}
-EXPORT_SYMBOL(seg6_hmac_init);
-
int __net_init seg6_hmac_net_init(struct net *net)
{
struct seg6_pernet_data *sdata = seg6_pernet(net);
- rhashtable_init(&sdata->hmac_infos, &rht_params);
-
- return 0;
-}
-EXPORT_SYMBOL(seg6_hmac_net_init);
-
-void seg6_hmac_exit(void)
-{
- struct seg6_hmac_algo *algo = NULL;
- int i, alg_count, cpu;
-
- alg_count = ARRAY_SIZE(hmac_algos);
- for (i = 0; i < alg_count; i++) {
- algo = &hmac_algos[i];
- for_each_possible_cpu(cpu) {
- struct crypto_shash *tfm;
- struct shash_desc *shash;
-
- shash = *per_cpu_ptr(algo->shashs, cpu);
- kfree(shash);
- tfm = *per_cpu_ptr(algo->tfms, cpu);
- crypto_free_shash(tfm);
- }
- free_percpu(algo->tfms);
- free_percpu(algo->shashs);
- }
+ return rhashtable_init(&sdata->hmac_infos, &rht_params);
}
-EXPORT_SYMBOL(seg6_hmac_exit);
void __net_exit seg6_hmac_net_exit(struct net *net)
{
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index a8854dd3e9c5..3e1b9991131a 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -1,14 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* SR-IPv6 implementation
*
* Author:
* David Lebrun <david.lebrun@uclouvain.be>
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/types.h>
@@ -31,10 +26,30 @@
#ifdef CONFIG_IPV6_SEG6_HMAC
#include <net/seg6_hmac.h>
#endif
+#include <linux/netfilter.h>
+
+/* Compute the number of header bytes a tunnel configuration adds.
+ *
+ * For the inline mode this is the SRH length, (hdrlen + 1) * 8 bytes; for
+ * the ENCAP and ENCAP_RED modes the size of an IPv6 header is added on top.
+ * Both L2ENCAP modes report 0.
+ */
+static size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo)
+{
+ int head = 0;
+
+ switch (tuninfo->mode) {
+ case SEG6_IPTUN_MODE_INLINE:
+ break;
+ case SEG6_IPTUN_MODE_ENCAP:
+ case SEG6_IPTUN_MODE_ENCAP_RED:
+ head = sizeof(struct ipv6hdr);
+ break;
+ case SEG6_IPTUN_MODE_L2ENCAP:
+ case SEG6_IPTUN_MODE_L2ENCAP_RED:
+ return 0;
+ }
+
+ return ((tuninfo->srh->hdrlen + 1) << 3) + head;
+}
struct seg6_lwt {
struct dst_cache cache;
- struct seg6_iptunnel_encap tuninfo[0];
+ struct seg6_iptunnel_encap tuninfo[];
};
static inline struct seg6_lwt *seg6_lwt_lwtunnel(struct lwtunnel_state *lwt)
@@ -109,11 +124,12 @@ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb,
return flowlabel;
}
-/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH */
-int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
+static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh,
+ int proto, struct dst_entry *cache_dst)
{
struct dst_entry *dst = skb_dst(skb);
- struct net *net = dev_net(dst->dev);
+ struct net_device *dev = dst_dev(dst);
+ struct net *net = dev_net(dev);
struct ipv6hdr *hdr, *inner_hdr;
struct ipv6_sr_hdr *isrh;
int hdrlen, tot_len, err;
@@ -122,7 +138,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
hdrlen = (osrh->hdrlen + 1) << 3;
tot_len = hdrlen + sizeof(*hdr);
- err = skb_cow_head(skb, tot_len + skb->mac_len);
+ err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb));
if (unlikely(err))
return err;
@@ -146,6 +162,16 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
} else {
ip6_flow_hdr(hdr, 0, flowlabel);
hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb));
+
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+
+ /* the control block has been erased, so we have to set the
+ * iif once again.
+ * We read the receiving interface index directly from the
+ * skb->skb_iif as it is done in the IPv4 receiving path (i.e.:
+ * ip_rcv_core(...)).
+ */
+ IP6CB(skb)->iif = skb->skb_iif;
}
hdr->nexthdr = NEXTHDR_ROUTING;
@@ -156,7 +182,7 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
isrh->nexthdr = proto;
hdr->daddr = isrh->segments[isrh->first_segment];
- set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr);
+ set_tun_src(net, dev, &hdr->daddr, &hdr->saddr);
#ifdef CONFIG_IPV6_SEG6_HMAC
if (sr_has_hmac(isrh)) {
@@ -166,14 +192,142 @@ int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
}
#endif
+ hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
skb_postpush_rcsum(skb, hdr, tot_len);
return 0;
}
+
+/* encapsulate an IPv6 packet within an outer IPv6 header with a given SRH.
+ *
+ * Exported entry point: forwards to __seg6_do_srh_encap() with a NULL
+ * cache_dst and returns its result unchanged.
+ */
+int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto)
+{
+ return __seg6_do_srh_encap(skb, osrh, proto, NULL);
+}
EXPORT_SYMBOL_GPL(seg6_do_srh_encap);
-/* insert an SRH within an IPv6 packet, just after the IPv6 header */
-int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
+/* encapsulate an IPv6 packet within an outer IPv6 header with reduced SRH.
+ *
+ * The first SID, osrh->segments[osrh->first_segment], is always written to
+ * the destination address of the pushed IPv6 header. What follows depends
+ * on the SID list:
+ *  - more than one SID: the pushed SRH is a copy of @osrh without that
+ *    first SID, i.e. one struct in6_addr shorter, with any trailing TLVs
+ *    moved up accordingly;
+ *  - a single SID and no HMAC: no SRH is pushed at all and the outer
+ *    nexthdr is set to @proto;
+ *  - a single SID with HMAC: @osrh is copied as is.
+ *
+ * @cache_dst is only handed to dst_dev_overhead() to size the headroom.
+ *
+ * Returns 0 on success, or the error reported by skb_cow_head() or
+ * seg6_push_hmac().
+ */
+static int seg6_do_srh_encap_red(struct sk_buff *skb,
+ struct ipv6_sr_hdr *osrh, int proto,
+ struct dst_entry *cache_dst)
+{
+ __u8 first_seg = osrh->first_segment;
+ struct dst_entry *dst = skb_dst(skb);
+ struct net_device *dev = dst_dev(dst);
+ struct net *net = dev_net(dev);
+ struct ipv6hdr *hdr, *inner_hdr;
+ int hdrlen = ipv6_optlen(osrh);
+ int red_tlv_offset, tlv_offset;
+ struct ipv6_sr_hdr *isrh;
+ bool skip_srh = false;
+ __be32 flowlabel;
+ int tot_len, err;
+ int red_hdrlen;
+ int tlvs_len;
+
+ if (first_seg > 0) {
+ red_hdrlen = hdrlen - sizeof(struct in6_addr);
+ } else {
+ /* NOTE: if tag/flags and/or other TLVs are introduced in the
+ * seg6_iptunnel infrastructure, they should be considered when
+ * deciding to skip the SRH.
+ */
+ skip_srh = !sr_has_hmac(osrh);
+
+ red_hdrlen = skip_srh ? 0 : hdrlen;
+ }
+
+ tot_len = red_hdrlen + sizeof(struct ipv6hdr);
+
+ err = skb_cow_head(skb, tot_len + dst_dev_overhead(cache_dst, skb));
+ if (unlikely(err))
+ return err;
+
+ inner_hdr = ipv6_hdr(skb);
+ flowlabel = seg6_make_flowlabel(net, skb, inner_hdr);
+
+ skb_push(skb, tot_len);
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+ hdr = ipv6_hdr(skb);
+
+ /* based on seg6_do_srh_encap() */
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ ip6_flow_hdr(hdr, ip6_tclass(ip6_flowinfo(inner_hdr)),
+ flowlabel);
+ hdr->hop_limit = inner_hdr->hop_limit;
+ } else {
+ ip6_flow_hdr(hdr, 0, flowlabel);
+ hdr->hop_limit = ip6_dst_hoplimit(skb_dst(skb));
+
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ IP6CB(skb)->iif = skb->skb_iif;
+ }
+
+ /* no matter if we have to skip the SRH or not, the first segment
+ * always comes in the pushed IPv6 header.
+ */
+ hdr->daddr = osrh->segments[first_seg];
+
+ if (skip_srh) {
+ hdr->nexthdr = proto;
+
+ set_tun_src(net, dev, &hdr->daddr, &hdr->saddr);
+ goto out;
+ }
+
+ /* we cannot skip the SRH, slow path */
+
+ hdr->nexthdr = NEXTHDR_ROUTING;
+ isrh = (void *)hdr + sizeof(struct ipv6hdr);
+
+ if (unlikely(!first_seg)) {
+ /* this is a very rare case; we have only one SID but
+ * we cannot skip the SRH since we are carrying some
+ * other info.
+ */
+ memcpy(isrh, osrh, hdrlen);
+ goto srcaddr;
+ }
+
+ tlv_offset = sizeof(*osrh) + (first_seg + 1) * sizeof(struct in6_addr);
+ red_tlv_offset = tlv_offset - sizeof(struct in6_addr);
+
+ memcpy(isrh, osrh, red_tlv_offset);
+
+ tlvs_len = hdrlen - tlv_offset;
+ if (unlikely(tlvs_len > 0)) {
+ const void *s = (const void *)osrh + tlv_offset;
+ void *d = (void *)isrh + red_tlv_offset;
+
+ memcpy(d, s, tlvs_len);
+ }
+
+ /* one 16-byte SID was left out of the copy: fix up the SID index and
+ * the header length, which is counted in 8-byte units.
+ */
+ --isrh->first_segment;
+ isrh->hdrlen -= 2;
+
+srcaddr:
+ isrh->nexthdr = proto;
+ set_tun_src(net, dev, &hdr->daddr, &hdr->saddr);
+
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (unlikely(!skip_srh && sr_has_hmac(isrh))) {
+ err = seg6_push_hmac(net, &hdr->saddr, isrh);
+ if (unlikely(err))
+ return err;
+ }
+#endif
+
+out:
+ hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
+ skb_postpush_rcsum(skb, hdr, tot_len);
+
+ return 0;
+}
+
+static int __seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh,
+ struct dst_entry *cache_dst)
{
struct ipv6hdr *hdr, *oldhdr;
struct ipv6_sr_hdr *isrh;
@@ -181,7 +335,7 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
hdrlen = (osrh->hdrlen + 1) << 3;
- err = skb_cow_head(skb, hdrlen + skb->mac_len);
+ err = skb_cow_head(skb, hdrlen + dst_dev_overhead(cache_dst, skb));
if (unlikely(err))
return err;
@@ -210,7 +364,7 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
#ifdef CONFIG_IPV6_SEG6_HMAC
if (sr_has_hmac(isrh)) {
- struct net *net = dev_net(skb_dst(skb)->dev);
+ struct net *net = skb_dst_dev_net(skb);
err = seg6_push_hmac(net, &hdr->saddr, isrh);
if (unlikely(err))
@@ -218,13 +372,14 @@ int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
}
#endif
+ hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
skb_postpush_rcsum(skb, hdr, sizeof(struct ipv6hdr) + hdrlen);
return 0;
}
-EXPORT_SYMBOL_GPL(seg6_do_srh_inline);
-static int seg6_do_srh(struct sk_buff *skb)
+static int seg6_do_srh(struct sk_buff *skb, struct dst_entry *cache_dst)
{
struct dst_entry *dst = skb_dst(skb);
struct seg6_iptunnel_encap *tinfo;
@@ -237,11 +392,12 @@ static int seg6_do_srh(struct sk_buff *skb)
if (skb->protocol != htons(ETH_P_IPV6))
return -EINVAL;
- err = seg6_do_srh_inline(skb, tinfo->srh);
+ err = __seg6_do_srh_inline(skb, tinfo->srh, cache_dst);
if (err)
return err;
break;
case SEG6_IPTUN_MODE_ENCAP:
+ case SEG6_IPTUN_MODE_ENCAP_RED:
err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6);
if (err)
return err;
@@ -253,7 +409,13 @@ static int seg6_do_srh(struct sk_buff *skb)
else
return -EINVAL;
- err = seg6_do_srh_encap(skb, tinfo->srh, proto);
+ if (tinfo->mode == SEG6_IPTUN_MODE_ENCAP)
+ err = __seg6_do_srh_encap(skb, tinfo->srh,
+ proto, cache_dst);
+ else
+ err = seg6_do_srh_encap_red(skb, tinfo->srh,
+ proto, cache_dst);
+
if (err)
return err;
@@ -262,6 +424,7 @@ static int seg6_do_srh(struct sk_buff *skb)
skb->protocol = htons(ETH_P_IPV6);
break;
case SEG6_IPTUN_MODE_L2ENCAP:
+ case SEG6_IPTUN_MODE_L2ENCAP_RED:
if (!skb_mac_header_was_set(skb))
return -EINVAL;
@@ -271,7 +434,15 @@ static int seg6_do_srh(struct sk_buff *skb)
skb_mac_header_rebuild(skb);
skb_push(skb, skb->mac_len);
- err = seg6_do_srh_encap(skb, tinfo->srh, NEXTHDR_NONE);
+ if (tinfo->mode == SEG6_IPTUN_MODE_L2ENCAP)
+ err = __seg6_do_srh_encap(skb, tinfo->srh,
+ IPPROTO_ETHERNET,
+ cache_dst);
+ else
+ err = seg6_do_srh_encap_red(skb, tinfo->srh,
+ IPPROTO_ETHERNET,
+ cache_dst);
+
if (err)
return err;
@@ -279,74 +450,132 @@ static int seg6_do_srh(struct sk_buff *skb)
break;
}
- ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+ nf_reset_ct(skb);
return 0;
}
-static int seg6_input(struct sk_buff *skb)
+/* insert an SRH within an IPv6 packet, just after the IPv6 header.
+ *
+ * Exported entry point: forwards to __seg6_do_srh_inline() with a NULL
+ * cache_dst and returns its result unchanged.
+ */
+int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh)
+{
+ return __seg6_do_srh_inline(skb, osrh, NULL);
+}
+EXPORT_SYMBOL_GPL(seg6_do_srh_inline);
+
+/* Final input step, also used by seg6_input_core() as the NF_HOOK
+ * continuation: @net and @sk are not used, the skb is handed to dst_input().
+ */
+static int seg6_input_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
+{
+ return dst_input(skb);
+}
+
+static int seg6_input_core(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
struct dst_entry *orig_dst = skb_dst(skb);
struct dst_entry *dst = NULL;
+ struct lwtunnel_state *lwtst;
struct seg6_lwt *slwt;
int err;
- err = seg6_do_srh(skb);
- if (unlikely(err)) {
- kfree_skb(skb);
- return err;
- }
+ /* We cannot dereference "orig_dst" once ip6_route_input() or
+ * skb_dst_drop() is called. However, in order to detect a dst loop, we
+ * need the address of its lwtstate. So, save the address of lwtstate
+ * now and use it later as a comparison.
+ */
+ lwtst = orig_dst->lwtstate;
- slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate);
+ slwt = seg6_lwt_lwtunnel(lwtst);
- preempt_disable();
+ local_bh_disable();
dst = dst_cache_get(&slwt->cache);
- preempt_enable();
+ local_bh_enable();
- skb_dst_drop(skb);
+ err = seg6_do_srh(skb, dst);
+ if (unlikely(err)) {
+ dst_release(dst);
+ goto drop;
+ }
if (!dst) {
ip6_route_input(skb);
dst = skb_dst(skb);
- if (!dst->error) {
- preempt_disable();
+
+ /* cache only if we don't create a dst reference loop */
+ if (!dst->error && lwtst != dst->lwtstate) {
+ local_bh_disable();
dst_cache_set_ip6(&slwt->cache, dst,
&ipv6_hdr(skb)->saddr);
- preempt_enable();
+ local_bh_enable();
}
+
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
+ if (unlikely(err))
+ goto drop;
} else {
+ skb_dst_drop(skb);
skb_dst_set(skb, dst);
}
- err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
- if (unlikely(err))
- return err;
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+ dev_net(skb->dev), NULL, skb, NULL,
+ skb_dst_dev(skb), seg6_input_finish);
- return dst_input(skb);
+ return seg6_input_finish(dev_net(skb->dev), NULL, skb);
+drop:
+ kfree_skb(skb);
+ return err;
}
-static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+/* Run the NF_INET_POST_ROUTING hook of the family matching skb->protocol
+ * (IPv4 or IPv6), with seg6_input_core() as the continuation.
+ *
+ * Any other protocol returns -EINVAL; the skb is not freed by this function
+ * on that path.
+ */
+static int seg6_input_nf(struct sk_buff *skb)
+{
+ struct net_device *dev = skb_dst_dev(skb);
+ struct net *net = dev_net(skb->dev);
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, NULL,
+ skb, NULL, dev, seg6_input_core);
+ case htons(ETH_P_IPV6):
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, NULL,
+ skb, NULL, dev, seg6_input_core);
+ }
+
+ return -EINVAL;
+}
+
+/* Input entry point. When the nf_hooks_lwtunnel_enabled static branch is
+ * on, the packet goes through seg6_input_nf(); otherwise seg6_input_core()
+ * is called directly with the netns of skb->dev and no socket.
+ */
+static int seg6_input(struct sk_buff *skb)
+{
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return seg6_input_nf(skb);
+
+ return seg6_input_core(dev_net(skb->dev), NULL, skb);
+}
+
+static int seg6_output_core(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
struct dst_entry *orig_dst = skb_dst(skb);
struct dst_entry *dst = NULL;
struct seg6_lwt *slwt;
- int err = -EINVAL;
-
- err = seg6_do_srh(skb);
- if (unlikely(err))
- goto drop;
+ int err;
slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate);
- preempt_disable();
+ local_bh_disable();
dst = dst_cache_get(&slwt->cache);
- preempt_enable();
+ local_bh_enable();
+
+ err = seg6_do_srh(skb, dst);
+ if (unlikely(err))
+ goto drop;
if (unlikely(!dst)) {
struct ipv6hdr *hdr = ipv6_hdr(skb);
struct flowi6 fl6;
+ memset(&fl6, 0, sizeof(fl6));
fl6.daddr = hdr->daddr;
fl6.saddr = hdr->saddr;
fl6.flowlabel = ip6_flowinfo(hdr);
@@ -356,29 +585,60 @@ static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
dst = ip6_route_output(net, NULL, &fl6);
if (dst->error) {
err = dst->error;
- dst_release(dst);
goto drop;
}
- preempt_disable();
- dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr);
- preempt_enable();
+ /* cache only if we don't create a dst reference loop */
+ if (orig_dst->lwtstate != dst->lwtstate) {
+ local_bh_disable();
+ dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr);
+ local_bh_enable();
+ }
+
+ err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst)));
+ if (unlikely(err))
+ goto drop;
}
skb_dst_drop(skb);
skb_dst_set(skb, dst);
- err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
- if (unlikely(err))
- goto drop;
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb,
+ NULL, dst_dev(dst), dst_output);
return dst_output(net, sk, skb);
drop:
+ dst_release(dst);
kfree_skb(skb);
return err;
}
-static int seg6_build_state(struct nlattr *nla,
+/* Run the NF_INET_POST_ROUTING hook of the family matching skb->protocol
+ * (IPv4 or IPv6), with seg6_output_core() as the continuation.
+ *
+ * Any other protocol returns -EINVAL; the skb is not freed by this function
+ * on that path.
+ */
+static int seg6_output_nf(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct net_device *dev = skb_dst_dev(skb);
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IP):
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb,
+ NULL, dev, seg6_output_core);
+ case htons(ETH_P_IPV6):
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb,
+ NULL, dev, seg6_output_core);
+ }
+
+ return -EINVAL;
+}
+
+/* Output entry point. When the nf_hooks_lwtunnel_enabled static branch is
+ * on, the packet goes through seg6_output_nf(); otherwise
+ * seg6_output_core() is called directly with the same arguments.
+ */
+static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return seg6_output_nf(net, sk, skb);
+
+ return seg6_output_core(net, sk, skb);
+}
+
+static int seg6_build_state(struct net *net, struct nlattr *nla,
unsigned int family, const void *cfg,
struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
@@ -393,8 +653,8 @@ static int seg6_build_state(struct nlattr *nla,
if (family != AF_INET && family != AF_INET6)
return -EINVAL;
- err = nla_parse_nested(tb, SEG6_IPTUNNEL_MAX, nla,
- seg6_iptunnel_policy, extack);
+ err = nla_parse_nested_deprecated(tb, SEG6_IPTUNNEL_MAX, nla,
+ seg6_iptunnel_policy, extack);
if (err < 0)
return err;
@@ -423,12 +683,16 @@ static int seg6_build_state(struct nlattr *nla,
break;
case SEG6_IPTUN_MODE_L2ENCAP:
break;
+ case SEG6_IPTUN_MODE_ENCAP_RED:
+ break;
+ case SEG6_IPTUN_MODE_L2ENCAP_RED:
+ break;
default:
return -EINVAL;
}
/* verify that SRH is consistent */
- if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo)))
+ if (!seg6_validate_srh(tuninfo->srh, tuninfo_len - sizeof(*tuninfo), false))
return -EINVAL;
newts = lwtunnel_state_alloc(tuninfo_len + sizeof(*slwt));
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index 60325dbfe88b..2b41e4c0dddd 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -1,17 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* SR-IPv6 implementation
*
* Authors:
* David Lebrun <david.lebrun@uclouvain.be>
* eBPF support: Mathieu Xhonneux <m.xhonneux@gmail.com>
- *
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
+#include <linux/filter.h>
#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/net.h>
@@ -28,20 +24,48 @@
#include <net/addrconf.h>
#include <net/ip6_route.h>
#include <net/dst_cache.h>
+#include <net/ip_tunnels.h>
#ifdef CONFIG_IPV6_SEG6_HMAC
#include <net/seg6_hmac.h>
#endif
#include <net/seg6_local.h>
#include <linux/etherdevice.h>
#include <linux/bpf.h>
+#include <linux/netfilter.h>
+
+#define SEG6_F_ATTR(i) BIT(i)
struct seg6_local_lwt;
+/* callbacks used for customizing the creation and destruction of a behavior */
+struct seg6_local_lwtunnel_ops {
+ int (*build_state)(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack);
+ void (*destroy_state)(struct seg6_local_lwt *slwt);
+};
+
struct seg6_action_desc {
int action;
unsigned long attrs;
+
+ /* The optattrs field is used for specifying all the optional
+ * attributes supported by a specific behavior.
+ * It means that if one of these attributes is not provided in the
+ * netlink message during the behavior creation, no errors will be
+ * returned to the userspace.
+ *
+ * Each attribute can be only of two types (mutually exclusive):
+ * 1) required or 2) optional.
+ * Every user MUST obey to this rule! If you set an attribute as
+ * required the same attribute CANNOT be set as optional and vice
+ * versa.
+ */
+ unsigned long optattrs;
+
int (*input)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
int static_headroom;
+
+ struct seg6_local_lwtunnel_ops slwt_ops;
};
struct bpf_lwt_prog {
@@ -49,6 +73,117 @@ struct bpf_lwt_prog {
char *name;
};
+/* default length values (expressed in bits) for both Locator-Block and
+ * Locator-Node Function.
+ *
+ * Both SEG6_LOCAL_LCBLOCK_DBITS and SEG6_LOCAL_LCNODE_FN_DBITS *must* be:
+ * i) greater than 0;
+ * ii) evenly divisible by 8. In other terms, the lengths of the
+ * Locator-Block and Locator-Node Function must be byte-aligned (we can
+ * relax this constraint in the future if really needed).
+ *
+ * Moreover, a third condition must hold:
+ * iii) SEG6_LOCAL_LCBLOCK_DBITS + SEG6_LOCAL_LCNODE_FN_DBITS <= 128.
+ *
+ * The correctness of SEG6_LOCAL_LCBLOCK_DBITS and SEG6_LOCAL_LCNODE_FN_DBITS
+ * values is checked during the kernel compilation. If the compilation stops,
+ * check the value of these parameters to see if they meet conditions (i), (ii)
+ * and (iii).
+ */
+#define SEG6_LOCAL_LCBLOCK_DBITS 32
+#define SEG6_LOCAL_LCNODE_FN_DBITS 16
+
+/* The following next_csid_chk_{cntr,lcblock,lcnode_fn}_bits macros can be
+ * used directly to check whether the lengths (in bits) of Locator-Block and
+ * Locator-Node Function are valid according to (i), (ii), (iii).
+ */
+#define next_csid_chk_cntr_bits(blen, flen) \
+ ((blen) + (flen) > 128)
+
+#define next_csid_chk_lcblock_bits(blen) \
+({ \
+ typeof(blen) __tmp = blen; \
+ (!__tmp || __tmp > 120 || (__tmp & 0x07)); \
+})
+
+#define next_csid_chk_lcnode_fn_bits(flen) \
+ next_csid_chk_lcblock_bits(flen)
+
+/* flag indicating that flavors are set up for a given End* behavior */
+#define SEG6_F_LOCAL_FLAVORS SEG6_F_ATTR(SEG6_LOCAL_FLAVORS)
+
+#define SEG6_F_LOCAL_FLV_OP(flvname) BIT(SEG6_LOCAL_FLV_OP_##flvname)
+#define SEG6_F_LOCAL_FLV_NEXT_CSID SEG6_F_LOCAL_FLV_OP(NEXT_CSID)
+#define SEG6_F_LOCAL_FLV_PSP SEG6_F_LOCAL_FLV_OP(PSP)
+
+/* Supported RFC8986 Flavor operations are reported in this bitmask */
+#define SEG6_LOCAL_FLV8986_SUPP_OPS SEG6_F_LOCAL_FLV_PSP
+
+#define SEG6_LOCAL_END_FLV_SUPP_OPS (SEG6_F_LOCAL_FLV_NEXT_CSID | \
+ SEG6_LOCAL_FLV8986_SUPP_OPS)
+#define SEG6_LOCAL_END_X_FLV_SUPP_OPS SEG6_F_LOCAL_FLV_NEXT_CSID
+
+struct seg6_flavors_info {
+ /* Flavor operations */
+ __u32 flv_ops;
+
+ /* Locator-Block length, expressed in bits */
+ __u8 lcblock_bits;
+ /* Locator-Node Function length, expressed in bits */
+ __u8 lcnode_func_bits;
+};
+
+enum seg6_end_dt_mode {
+ DT_INVALID_MODE = -EINVAL,
+ DT_LEGACY_MODE = 0,
+ DT_VRF_MODE = 1,
+};
+
+struct seg6_end_dt_info {
+ enum seg6_end_dt_mode mode;
+
+ struct net *net;
+ /* VRF device associated to the routing table used by the SRv6
+ * End.DT4/DT6 behavior for routing IPv4/IPv6 packets.
+ */
+ int vrf_ifindex;
+ int vrf_table;
+
+ /* tunneled packet family (IPv4 or IPv6).
+ * Protocol and header length are inferred from family.
+ */
+ u16 family;
+};
+
+struct pcpu_seg6_local_counters {
+ u64_stats_t packets;
+ u64_stats_t bytes;
+ u64_stats_t errors;
+
+ struct u64_stats_sync syncp;
+};
+
+/* This struct groups all the SRv6 Behavior counters supported so far.
+ *
+ * put_nla_counters() makes use of this data structure to collect all counter
+ * values after the per-CPU counter evaluation has been performed.
+ * Finally, each counter value (in seg6_local_counters) is stored in the
+ * corresponding netlink attribute and sent to user space.
+ *
+ * NB: we don't want to expose this structure to user space!
+ */
+struct seg6_local_counters {
+ __u64 packets;
+ __u64 bytes;
+ __u64 errors;
+};
+
+#define seg6_local_alloc_pcpu_counters(__gfp) \
+ __netdev_alloc_pcpu_stats(struct pcpu_seg6_local_counters, \
+ ((__gfp) | __GFP_ZERO))
+
+#define SEG6_F_LOCAL_COUNTERS SEG6_F_ATTR(SEG6_LOCAL_COUNTERS)
+
struct seg6_local_lwt {
int action;
struct ipv6_sr_hdr *srh;
@@ -58,9 +193,19 @@ struct seg6_local_lwt {
int iif;
int oif;
struct bpf_lwt_prog bpf;
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ struct seg6_end_dt_info dt_info;
+#endif
+ struct seg6_flavors_info flv_info;
+
+ struct pcpu_seg6_local_counters __percpu *pcpu_counters;
int headroom;
struct seg6_action_desc *desc;
+ /* unlike the required attrs, we have to track the optional attributes
+ * that have been effectively parsed.
+ */
+ unsigned long parsed_optattrs;
};
static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt)
@@ -68,41 +213,14 @@ static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt)
return (struct seg6_local_lwt *)lwt->data;
}
-static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb)
-{
- struct ipv6_sr_hdr *srh;
- int len, srhoff = 0;
-
- if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0)
- return NULL;
-
- if (!pskb_may_pull(skb, srhoff + sizeof(*srh)))
- return NULL;
-
- srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
-
- len = (srh->hdrlen + 1) << 3;
-
- if (!pskb_may_pull(skb, srhoff + len))
- return NULL;
-
- if (!seg6_validate_srh(srh, len))
- return NULL;
-
- return srh;
-}
-
static struct ipv6_sr_hdr *get_and_validate_srh(struct sk_buff *skb)
{
struct ipv6_sr_hdr *srh;
- srh = get_srh(skb);
+ srh = seg6_get_srh(skb, IP6_FH_F_SKIP_RH);
if (!srh)
return NULL;
- if (srh->segments_left == 0)
- return NULL;
-
#ifdef CONFIG_IPV6_SEG6_HMAC
if (!seg6_hmac_validate_skb(skb))
return NULL;
@@ -116,7 +234,7 @@ static bool decap_and_validate(struct sk_buff *skb, int proto)
struct ipv6_sr_hdr *srh;
unsigned int off = 0;
- srh = get_srh(skb);
+ srh = seg6_get_srh(skb, 0);
if (srh && srh->segments_left > 0)
return false;
@@ -135,7 +253,8 @@ static bool decap_and_validate(struct sk_buff *skb, int proto)
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
- skb->encapsulation = 0;
+ if (iptunnel_pull_offloads(skb))
+ return false;
return true;
}
@@ -149,8 +268,9 @@ static void advance_nextseg(struct ipv6_sr_hdr *srh, struct in6_addr *daddr)
*daddr = *addr;
}
-int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
- u32 tbl_id)
+static int
+seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
+ u32 tbl_id, bool local_delivery, int oif)
{
struct net *net = dev_net(skb->dev);
struct ipv6hdr *hdr = ipv6_hdr(skb);
@@ -158,8 +278,11 @@ int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
struct dst_entry *dst = NULL;
struct rt6_info *rt;
struct flowi6 fl6;
+ int dev_flags = 0;
+ memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_iif = skb->dev->ifindex;
+ fl6.flowi6_oif = oif;
fl6.daddr = nhaddr ? *nhaddr : hdr->daddr;
fl6.saddr = hdr->saddr;
fl6.flowlabel = ip6_flowinfo(hdr);
@@ -169,20 +292,28 @@ int seg6_lookup_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr,
if (nhaddr)
fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH;
- if (!tbl_id) {
+ if (!tbl_id && !oif) {
dst = ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags);
- } else {
+ } else if (tbl_id) {
struct fib6_table *table;
table = fib6_get_table(net, tbl_id);
if (!table)
goto out;
- rt = ip6_pol_route(net, table, 0, &fl6, skb, flags);
+ rt = ip6_pol_route(net, table, oif, &fl6, skb, flags);
dst = &rt->dst;
+ } else {
+ dst = ip6_route_output(net, NULL, &fl6);
}
- if (dst && dst->dev->flags & IFF_LOOPBACK && !dst->error) {
+ /* we want to discard traffic destined for local packet processing,
+ * if @local_delivery is set to false.
+ */
+ if (!local_delivery)
+ dev_flags |= IFF_LOOPBACK;
+
+ if (dst && (dst_dev(dst)->flags & dev_flags) && !dst->error) {
dst_release(dst);
dst = NULL;
}
@@ -199,8 +330,64 @@ out:
return dst->error;
}
-/* regular endpoint function */
-static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+/* Wrapper around seg6_lookup_any_nexthop() that keeps the previous
+ * three-argument interface: local delivery is disabled and no output
+ * interface is requested (oif == 0).
+ */
+int seg6_lookup_nexthop(struct sk_buff *skb,
+ struct in6_addr *nhaddr, u32 tbl_id)
+{
+ return seg6_lookup_any_nexthop(skb, nhaddr, tbl_id, false, 0);
+}
+
+/* Locator-Block length in octets: lcblock_bits divided by 8. */
+static __u8 seg6_flv_lcblock_octects(const struct seg6_flavors_info *finfo)
+{
+ return finfo->lcblock_bits >> 3;
+}
+
+/* Locator-Node Function length in octets: lcnode_func_bits divided by 8. */
+static __u8 seg6_flv_lcnode_func_octects(const struct seg6_flavors_info *finfo)
+{
+ return finfo->lcnode_func_bits >> 3;
+}
+
+/* Check the Argument part of @addr, i.e. the octets that follow the
+ * Locator-Block and the Locator-Node Function.
+ *
+ * Returns true when all of those octets are zero (also when there are
+ * none), false as soon as a non-zero octet is found.
+ */
+static bool seg6_next_csid_is_arg_zero(const struct in6_addr *addr,
+ const struct seg6_flavors_info *finfo)
+{
+ __u8 fnc_octects = seg6_flv_lcnode_func_octects(finfo);
+ __u8 blk_octects = seg6_flv_lcblock_octects(finfo);
+ __u8 arg_octects;
+ int i;
+
+ arg_octects = 16 - blk_octects - fnc_octects;
+ for (i = 0; i < arg_octects; ++i) {
+ if (addr->s6_addr[blk_octects + fnc_octects + i] != 0x00)
+ return false;
+ }
+
+ return true;
+}
+
+/* assume that DA.Argument length > 0.
+ *
+ * Shift the octets that follow the Locator-Node Function so that they start
+ * right after the Locator-Block, overwriting the current Locator-Node
+ * Function, then zero the last fnc_octects octets of the address.
+ */
+static void seg6_next_csid_advance_arg(struct in6_addr *addr,
+ const struct seg6_flavors_info *finfo)
+{
+ __u8 fnc_octects = seg6_flv_lcnode_func_octects(finfo);
+ __u8 blk_octects = seg6_flv_lcblock_octects(finfo);
+
+ /* advance DA.Argument */
+ memmove(&addr->s6_addr[blk_octects],
+ &addr->s6_addr[blk_octects + fnc_octects],
+ 16 - blk_octects - fnc_octects);
+
+ memset(&addr->s6_addr[16 - fnc_octects], 0x00, fnc_octects);
+}
+
+/* Common tail of the End behavior: look up the next hop for the packet's
+ * current destination address (no explicit next hop, no specific table) and
+ * pass the skb to dst_input(). The lookup's return value is not checked
+ * here and @slwt is not used.
+ */
+static int input_action_end_finish(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ seg6_lookup_nexthop(skb, NULL, 0);
+
+ return dst_input(skb);
+}
+
+static int input_action_end_core(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
{
struct ipv6_sr_hdr *srh;
@@ -210,17 +397,37 @@ static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt)
advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
- seg6_lookup_nexthop(skb, NULL, 0);
-
- return dst_input(skb);
+ return input_action_end_finish(skb, slwt);
drop:
kfree_skb(skb);
return -EINVAL;
}
-/* regular endpoint, and forward to specified nexthop */
-static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+static int end_next_csid_core(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ const struct seg6_flavors_info *finfo = &slwt->flv_info;
+ struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
+
+ if (seg6_next_csid_is_arg_zero(daddr, finfo))
+ return input_action_end_core(skb, slwt);
+
+ /* update DA */
+ seg6_next_csid_advance_arg(daddr, finfo);
+
+ return input_action_end_finish(skb, slwt);
+}
+
+static int input_action_end_x_finish(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ seg6_lookup_any_nexthop(skb, &slwt->nh6, 0, false, slwt->oif);
+
+ return dst_input(skb);
+}
+
+static int input_action_end_x_core(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
{
struct ipv6_sr_hdr *srh;
@@ -230,15 +437,395 @@ static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt)
advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
- seg6_lookup_nexthop(skb, &slwt->nh6, 0);
+ return input_action_end_x_finish(skb, slwt);
- return dst_input(skb);
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+static int end_x_next_csid_core(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ const struct seg6_flavors_info *finfo = &slwt->flv_info;
+ struct in6_addr *daddr = &ipv6_hdr(skb)->daddr;
+
+ if (seg6_next_csid_is_arg_zero(daddr, finfo))
+ return input_action_end_x_core(skb, slwt);
+
+ /* update DA */
+ seg6_next_csid_advance_arg(daddr, finfo);
+
+ return input_action_end_x_finish(skb, slwt);
+}
+
+static bool seg6_next_csid_enabled(__u32 fops)
+{
+ return fops & SEG6_F_LOCAL_FLV_NEXT_CSID;
+}
+
+/* Processing of SRv6 End, End.X, and End.T behaviors can be extended through
+ * the flavors framework. These behaviors must report the subset of (flavor)
+ * operations they currently implement. In this way, if a user specifies a
+ * flavor combination that is not supported by a given End* behavior, the
+ * kernel refuses to instantiate the tunnel reporting the error.
+ */
+static int seg6_flv_supp_ops_by_action(int action, __u32 *fops)
+{
+ switch (action) {
+ case SEG6_LOCAL_ACTION_END:
+ *fops = SEG6_LOCAL_END_FLV_SUPP_OPS;
+ break;
+ case SEG6_LOCAL_ACTION_END_X:
+ *fops = SEG6_LOCAL_END_X_FLV_SUPP_OPS;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+/* We describe the packet state in relation to the absence/presence of the SRH
+ * and the Segment Left (SL) field.
+ * For our purposes, it is not necessary to record the exact value of the SL
+ * when the SID List consists of two or more segments.
+ */
+enum seg6_local_pktinfo {
+ /* the order really matters! */
+ SEG6_LOCAL_PKTINFO_NOHDR = 0,
+ SEG6_LOCAL_PKTINFO_SL_ZERO,
+ SEG6_LOCAL_PKTINFO_SL_ONE,
+ SEG6_LOCAL_PKTINFO_SL_MORE,
+ __SEG6_LOCAL_PKTINFO_MAX,
+};
+
+#define SEG6_LOCAL_PKTINFO_MAX (__SEG6_LOCAL_PKTINFO_MAX - 1)
+
+static enum seg6_local_pktinfo seg6_get_srh_pktinfo(struct ipv6_sr_hdr *srh)
+{
+ __u8 sgl;
+
+ if (!srh)
+ return SEG6_LOCAL_PKTINFO_NOHDR;
+
+ sgl = srh->segments_left;
+ if (sgl < 2)
+ return SEG6_LOCAL_PKTINFO_SL_ZERO + sgl;
+
+ return SEG6_LOCAL_PKTINFO_SL_MORE;
+}
+
+enum seg6_local_flv_action {
+ SEG6_LOCAL_FLV_ACT_UNSPEC = 0,
+ SEG6_LOCAL_FLV_ACT_END,
+ SEG6_LOCAL_FLV_ACT_PSP,
+ SEG6_LOCAL_FLV_ACT_USP,
+ SEG6_LOCAL_FLV_ACT_USD,
+ __SEG6_LOCAL_FLV_ACT_MAX
+};
+
+#define SEG6_LOCAL_FLV_ACT_MAX (__SEG6_LOCAL_FLV_ACT_MAX - 1)
+
+/* The action table for RFC8986 flavors (see the flv8986_act_tbl below)
+ * contains the actions (i.e. processing operations) to be applied on packets
+ * when flavors are configured for an End* behavior.
+ * By combining the pktinfo data and the flavors mask, the macro
+ * computes the index used to access the elements (actions) stored in the
+ * action table. The index is structured as follows:
+ *
+ * index
+ * _______________/\________________
+ * / \
+ * +----------------+----------------+
+ * | pf | afm |
+ * +----------------+----------------+
+ * ph-1 ... p1 p0 fk-1 ... f1 f0
+ * MSB LSB
+ *
+ * where:
+ * - 'afm' (adjusted flavor mask) is the mask containing a combination of the
+ * RFC8986 flavors currently supported. 'afm' corresponds to the @fm
+ * argument of the macro whose value is right-shifted by 1 bit. By doing so,
+ * we discard the SEG6_LOCAL_FLV_OP_UNSPEC flag (bit 0 in @fm) which is
+ * never used here;
+ * - 'pf' encodes the packet info (pktinfo) regarding the presence/absence of
+ * the SRH, SL = 0, etc. 'pf' is set with the value of @pf provided as
+ * argument to the macro.
+ */
+#define flv8986_act_tbl_idx(pf, fm) \
+ ((((pf) << bits_per(SEG6_LOCAL_FLV8986_SUPP_OPS)) | \
+ ((fm) & SEG6_LOCAL_FLV8986_SUPP_OPS)) >> SEG6_LOCAL_FLV_OP_PSP)
+
+/* We compute the size of the action table by considering the RFC8986 flavors
+ * actually supported by the kernel. In this way, the size is automatically
+ * adjusted when new flavors are supported.
+ */
+#define FLV8986_ACT_TBL_SIZE \
+ roundup_pow_of_two(flv8986_act_tbl_idx(SEG6_LOCAL_PKTINFO_MAX, \
+ SEG6_LOCAL_FLV8986_SUPP_OPS))
+
+/* tbl_cfg(act, pf, fm) macro is used to easily configure the action
+ * table; it accepts 3 arguments:
+ * i) @act, the suffix from SEG6_LOCAL_FLV_ACT_{act} representing
+ * the action that should be applied on the packet;
+ * ii) @pf, the suffix from SEG6_LOCAL_PKTINFO_{pf} reporting the packet
+ * info about the lack/presence of SRH, SRH with SL = 0, etc;
+ * iii) @fm, the mask of flavors.
+ */
+#define tbl_cfg(act, pf, fm) \
+ [flv8986_act_tbl_idx(SEG6_LOCAL_PKTINFO_##pf, \
+ (fm))] = SEG6_LOCAL_FLV_ACT_##act
+
+/* shorthand for improving readability */
+#define F_PSP SEG6_F_LOCAL_FLV_PSP
+
+/* The table contains, for each combination of the pktinfo data and
+ * flavors, the action that should be taken on a packet (e.g.
+ * "standard" Endpoint processing, Penultimate Segment Pop, etc).
+ *
+ * By default, table entries not explicitly configured are initialized with the
+ * SEG6_LOCAL_FLV_ACT_UNSPEC action, which generally has the effect of
+ * discarding the processed packet.
+ */
+static const u8 flv8986_act_tbl[FLV8986_ACT_TBL_SIZE] = {
+ /* PSP variant for packet where SRH with SL = 1 */
+ tbl_cfg(PSP, SL_ONE, F_PSP),
+ /* End for packet where SRH with SL > 1 */
+ tbl_cfg(END, SL_MORE, F_PSP),
+};
+
+#undef F_PSP
+#undef tbl_cfg
+
+/* For each flavor defined in RFC8986 (or a combination of them) an action is
+ * performed on the packet. The specific action depends on:
+ * - info extracted from the packet (i.e. pktinfo data) regarding the
+ * lack/presence of the SRH, and if the SRH is available, on the value of
+ * Segment Left field;
+ * - the mask of flavors configured for the specific SRv6 End* behavior.
+ *
+ * The function combines both the pktinfo and the flavors mask to evaluate the
+ * corresponding action to be taken on the packet.
+ */
+static enum seg6_local_flv_action
+seg6_local_flv8986_act_lookup(enum seg6_local_pktinfo pinfo, __u32 flvmask)
+{
+ unsigned long index;
+
+ /* check if the provided mask of flavors is supported */
+ if (unlikely(flvmask & ~SEG6_LOCAL_FLV8986_SUPP_OPS))
+ return SEG6_LOCAL_FLV_ACT_UNSPEC;
+
+ index = flv8986_act_tbl_idx(pinfo, flvmask);
+ if (unlikely(index >= FLV8986_ACT_TBL_SIZE))
+ return SEG6_LOCAL_FLV_ACT_UNSPEC;
+
+ return flv8986_act_tbl[index];
+}
+
+/* skb->data must be aligned with skb->network_header */
+static bool seg6_pop_srh(struct sk_buff *skb, int srhoff)
+{
+ struct ipv6_sr_hdr *srh;
+ struct ipv6hdr *iph;
+ __u8 srh_nexthdr;
+ int thoff = -1;
+ int srhlen;
+ int nhlen;
+
+ if (unlikely(srhoff < sizeof(*iph) ||
+ !pskb_may_pull(skb, srhoff + sizeof(*srh))))
+ return false;
+
+ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+ srhlen = ipv6_optlen(srh);
+
+ /* we are about to mangle the pkt, let's check if we can write on it */
+ if (unlikely(skb_ensure_writable(skb, srhoff + srhlen)))
+ return false;
+
+ /* skb_ensure_writable() may change skb pointers; evaluate srh again */
+ srh = (struct ipv6_sr_hdr *)(skb->data + srhoff);
+ srh_nexthdr = srh->nexthdr;
+
+ if (unlikely(!skb_transport_header_was_set(skb)))
+ goto pull;
+
+ nhlen = skb_network_header_len(skb);
+ /* we have to deal with the transport header: it could be set before
+ * the SRH, after the SRH, or within it (which is considered wrong,
+ * however).
+ */
+ if (likely(nhlen <= srhoff))
+ thoff = nhlen;
+ else if (nhlen >= srhoff + srhlen)
+ /* transport_header is set after the SRH */
+ thoff = nhlen - srhlen;
+ else
+ /* transport_header falls inside the SRH; hence, we can't
+ * restore the transport_header pointer properly after
+ * the SRH removal operation.
+ */
+ return false;
+pull:
+ /* we need to pop the SRH:
+ * 1) first of all, we pull out everything from IPv6 header up to SRH
+ * (included) evaluating also the rcsum;
+ * 2) we overwrite (and then remove) the SRH by properly moving the
+ * IPv6 along with any extension header that precedes the SRH;
+ * 3) At the end, we push back the pulled headers (except for SRH,
+ * obviously).
+ */
+ skb_pull_rcsum(skb, srhoff + srhlen);
+ memmove(skb_network_header(skb) + srhlen, skb_network_header(skb),
+ srhoff);
+ skb_push(skb, srhoff);
+
+ skb_reset_network_header(skb);
+ skb_mac_header_rebuild(skb);
+ if (likely(thoff >= 0))
+ skb_set_transport_header(skb, thoff);
+
+ iph = ipv6_hdr(skb);
+ if (iph->nexthdr == NEXTHDR_ROUTING) {
+ iph->nexthdr = srh_nexthdr;
+ } else {
+ /* we must look for the extension header (EXTH, for short) that
+ * immediately precedes the SRH we have just removed.
+ * Then, we update the value of the EXTH nexthdr with the one
+ * contained in the SRH nexthdr.
+ */
+ unsigned int off = sizeof(*iph);
+ struct ipv6_opt_hdr *hp, _hdr;
+ __u8 nexthdr = iph->nexthdr;
+
+ for (;;) {
+ if (unlikely(!ipv6_ext_hdr(nexthdr) ||
+ nexthdr == NEXTHDR_NONE))
+ return false;
+
+ hp = skb_header_pointer(skb, off, sizeof(_hdr), &_hdr);
+ if (unlikely(!hp))
+ return false;
+
+ if (hp->nexthdr == NEXTHDR_ROUTING) {
+ hp->nexthdr = srh_nexthdr;
+ break;
+ }
+
+ switch (nexthdr) {
+ case NEXTHDR_FRAGMENT:
+ fallthrough;
+ case NEXTHDR_AUTH:
+ /* we expect SRH before FRAG and AUTH */
+ return false;
+ default:
+ off += ipv6_optlen(hp);
+ break;
+ }
+
+ nexthdr = hp->nexthdr;
+ }
+ }
+
+ iph->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
+
+ skb_postpush_rcsum(skb, iph, srhoff);
+
+ return true;
+}
+
+/* process the packet on the basis of the RFC8986 flavors set for the given
+ * SRv6 End behavior instance.
+ */
+static int end_flv8986_core(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ const struct seg6_flavors_info *finfo = &slwt->flv_info;
+ enum seg6_local_flv_action action;
+ enum seg6_local_pktinfo pinfo;
+ struct ipv6_sr_hdr *srh;
+ __u32 flvmask;
+ int srhoff;
+
+ srh = seg6_get_srh(skb, 0);
+ srhoff = srh ? ((unsigned char *)srh - skb->data) : 0;
+ pinfo = seg6_get_srh_pktinfo(srh);
+#ifdef CONFIG_IPV6_SEG6_HMAC
+ if (srh && !seg6_hmac_validate_skb(skb))
+ goto drop;
+#endif
+ flvmask = finfo->flv_ops;
+ if (unlikely(flvmask & ~SEG6_LOCAL_FLV8986_SUPP_OPS)) {
+ pr_warn_once("seg6local: invalid RFC8986 flavors\n");
+ goto drop;
+ }
+
+ /* retrieve the action triggered by the combination of pktinfo data and
+ * the flavors mask.
+ */
+ action = seg6_local_flv8986_act_lookup(pinfo, flvmask);
+ switch (action) {
+ case SEG6_LOCAL_FLV_ACT_END:
+ /* process the packet as the "standard" End behavior */
+ advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
+ break;
+ case SEG6_LOCAL_FLV_ACT_PSP:
+ advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
+
+ if (unlikely(!seg6_pop_srh(skb, srhoff)))
+ goto drop;
+ break;
+ case SEG6_LOCAL_FLV_ACT_UNSPEC:
+ fallthrough;
+ default:
+ /* by default, we drop the packet since we could not find a
+ * suitable action.
+ */
+ goto drop;
+ }
+
+ return input_action_end_finish(skb, slwt);
drop:
kfree_skb(skb);
return -EINVAL;
}
+/* regular endpoint function */
+static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ const struct seg6_flavors_info *finfo = &slwt->flv_info;
+ __u32 fops = finfo->flv_ops;
+
+ if (!fops)
+ return input_action_end_core(skb, slwt);
+
+ /* check for the presence of NEXT-C-SID since it applies first */
+ if (seg6_next_csid_enabled(fops))
+ return end_next_csid_core(skb, slwt);
+
+ /* the specific processing function to be performed on the packet
+ * depends on the combination of flavors defined in RFC8986 and some
+ * information extracted from the packet, e.g. presence/absence of SRH,
+ * Segment Left = 0, etc.
+ */
+ return end_flv8986_core(skb, slwt);
+}
+
+/* regular endpoint, and forward to specified nexthop */
+static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ const struct seg6_flavors_info *finfo = &slwt->flv_info;
+ __u32 fops = finfo->flv_ops;
+
+ /* check for the presence of NEXT-C-SID since it applies first */
+ if (seg6_next_csid_enabled(fops))
+ return end_x_next_csid_core(skb, slwt);
+
+ return input_action_end_x_core(skb, slwt);
+}
+
static int input_action_end_t(struct sk_buff *skb, struct seg6_local_lwt *slwt)
{
struct ipv6_sr_hdr *srh;
@@ -266,7 +853,7 @@ static int input_action_end_dx2(struct sk_buff *skb,
struct net_device *odev;
struct ethhdr *eth;
- if (!decap_and_validate(skb, NEXTHDR_NONE))
+ if (!decap_and_validate(skb, IPPROTO_ETHERNET))
goto drop;
if (!pskb_may_pull(skb, ETH_HLEN))
@@ -315,21 +902,14 @@ drop:
return -EINVAL;
}
-/* decapsulate and forward to specified nexthop */
-static int input_action_end_dx6(struct sk_buff *skb,
- struct seg6_local_lwt *slwt)
+static int input_action_end_dx6_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
+ struct dst_entry *orig_dst = skb_dst(skb);
struct in6_addr *nhaddr = NULL;
+ struct seg6_local_lwt *slwt;
- /* this function accepts IPv6 encapsulated packets, with either
- * an SRH with SL=0, or no SRH.
- */
-
- if (!decap_and_validate(skb, IPPROTO_IPV6))
- goto drop;
-
- if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
- goto drop;
+ slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
/* The inner packet is not associated to any local interface,
* so we do not call netif_rx().
@@ -337,25 +917,71 @@ static int input_action_end_dx6(struct sk_buff *skb,
* If slwt->nh6 is set to ::, then lookup the nexthop for the
* inner packet's DA. Otherwise, use the specified nexthop.
*/
-
if (!ipv6_addr_any(&slwt->nh6))
nhaddr = &slwt->nh6;
seg6_lookup_nexthop(skb, nhaddr, 0);
return dst_input(skb);
+}
+
+/* decapsulate and forward to specified nexthop */
+static int input_action_end_dx6(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ /* this function accepts IPv6 encapsulated packets, with either
+ * an SRH with SL=0, or no SRH.
+ */
+
+ if (!decap_and_validate(skb, IPPROTO_IPV6))
+ goto drop;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ goto drop;
+
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+ nf_reset_ct(skb);
+
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
+ dev_net(skb->dev), NULL, skb, skb->dev,
+ NULL, input_action_end_dx6_finish);
+
+ return input_action_end_dx6_finish(dev_net(skb->dev), NULL, skb);
drop:
kfree_skb(skb);
return -EINVAL;
}
-static int input_action_end_dx4(struct sk_buff *skb,
- struct seg6_local_lwt *slwt)
+static int input_action_end_dx4_finish(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
+ struct dst_entry *orig_dst = skb_dst(skb);
+ enum skb_drop_reason reason;
+ struct seg6_local_lwt *slwt;
struct iphdr *iph;
__be32 nhaddr;
- int err;
+ slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
+
+ iph = ip_hdr(skb);
+
+ nhaddr = slwt->nh4.s_addr ?: iph->daddr;
+
+ skb_dst_drop(skb);
+
+ reason = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev);
+ if (reason) {
+ kfree_skb_reason(skb, reason);
+ return -EINVAL;
+ }
+
+ return dst_input(skb);
+}
+
+static int input_action_end_dx4(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
if (!decap_and_validate(skb, IPPROTO_IPIP))
goto drop;
@@ -363,15 +989,215 @@ static int input_action_end_dx4(struct sk_buff *skb,
goto drop;
skb->protocol = htons(ETH_P_IP);
+ skb_set_transport_header(skb, sizeof(struct iphdr));
+ nf_reset_ct(skb);
- iph = ip_hdr(skb);
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
+ dev_net(skb->dev), NULL, skb, skb->dev,
+ NULL, input_action_end_dx4_finish);
- nhaddr = slwt->nh4.s_addr ?: iph->daddr;
+ return input_action_end_dx4_finish(dev_net(skb->dev), NULL, skb);
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+
+#ifdef CONFIG_NET_L3_MASTER_DEV
+static struct net *fib6_config_get_net(const struct fib6_config *fib6_cfg)
+{
+ const struct nl_info *nli = &fib6_cfg->fc_nlinfo;
+
+ return nli->nl_net;
+}
+
+static int __seg6_end_dt_vrf_build(struct seg6_local_lwt *slwt, const void *cfg,
+ u16 family, struct netlink_ext_ack *extack)
+{
+ struct seg6_end_dt_info *info = &slwt->dt_info;
+ int vrf_ifindex;
+ struct net *net;
+
+ net = fib6_config_get_net(cfg);
+
+ /* note that vrf_table was already set by parse_nla_vrftable() */
+ vrf_ifindex = l3mdev_ifindex_lookup_by_table_id(L3MDEV_TYPE_VRF, net,
+ info->vrf_table);
+ if (vrf_ifindex < 0) {
+ if (vrf_ifindex == -EPERM) {
+ NL_SET_ERR_MSG(extack,
+ "Strict mode for VRF is disabled");
+ } else if (vrf_ifindex == -ENODEV) {
+ NL_SET_ERR_MSG(extack,
+ "Table has no associated VRF device");
+ } else {
+ pr_debug("seg6local: SRv6 End.DT* creation error=%d\n",
+ vrf_ifindex);
+ }
+
+ return vrf_ifindex;
+ }
+
+ info->net = net;
+ info->vrf_ifindex = vrf_ifindex;
+
+ info->family = family;
+ info->mode = DT_VRF_MODE;
+
+ return 0;
+}
+
+/* The SRv6 End.DT4/DT6 behavior extracts the inner (IPv4/IPv6) packet and
+ * routes the IPv4/IPv6 packet by looking at the configured routing table.
+ *
+ * In the SRv6 End.DT4/DT6 use case, we can receive traffic (IPv6+Segment
+ * Routing Header packets) from several interfaces and the outer IPv6
+ * destination address (DA) is used for retrieving the specific instance of the
+ * End.DT4/DT6 behavior that should process the packets.
+ *
+ * However, the inner IPv4/IPv6 packet is not really bound to any receiving
+ * interface and thus the End.DT4/DT6 sets the VRF (associated with the
+ * corresponding routing table) as the *receiving* interface.
+ * In other words, the End.DT4/DT6 processes a packet as if it has been received
+ * directly by the VRF (and not by one of its slave devices, if any).
+ * In this way, the VRF interface is used for routing the IPv4/IPv6 packet
+ * according to the routing table configured by the End.DT4/DT6 instance.
+ *
+ * This design allows you to get some interesting features like:
+ * 1) the statistics on rx packets;
+ * 2) the possibility to install a packet sniffer on the receiving interface
+ * (the VRF one) for looking at the incoming packets;
+ * 3) the possibility to leverage the netfilter prerouting hook for the inner
+ * IPv4 packet.
+ *
+ * This function returns:
+ * - the sk_buff* when the VRF rcv handler has processed the packet correctly;
+ * - NULL when the skb is consumed by the VRF rcv handler;
+ * - a pointer which encodes a negative error number in case of error.
+ * Note that in this case, the function takes care of freeing the skb.
+ */
+static struct sk_buff *end_dt_vrf_rcv(struct sk_buff *skb, u16 family,
+ struct net_device *dev)
+{
+ /* based on l3mdev_ip_rcv; we are only interested in the master */
+ if (unlikely(!netif_is_l3_master(dev) && !netif_has_l3_rx_handler(dev)))
+ goto drop;
+
+ if (unlikely(!dev->l3mdev_ops->l3mdev_l3_rcv))
+ goto drop;
+
+ /* the decap packet IPv4/IPv6 does not come with any mac header info.
+ * We must unset the mac header to allow the VRF device to rebuild it,
+ * just in case there is a sniffer attached on the device.
+ */
+ skb_unset_mac_header(skb);
+
+ skb = dev->l3mdev_ops->l3mdev_l3_rcv(dev, skb, family);
+ if (!skb)
+ /* the skb buffer was consumed by the handler */
+ return NULL;
+
+ /* when a packet is received by a VRF or by one of its slaves, the
+ * master device reference is set into the skb.
+ */
+ if (unlikely(skb->dev != dev || skb->skb_iif != dev->ifindex))
+ goto drop;
+
+ return skb;
+
+drop:
+ kfree_skb(skb);
+ return ERR_PTR(-EINVAL);
+}
+
+static struct net_device *end_dt_get_vrf_rcu(struct sk_buff *skb,
+ struct seg6_end_dt_info *info)
+{
+ int vrf_ifindex = info->vrf_ifindex;
+ struct net *net = info->net;
+
+ if (unlikely(vrf_ifindex < 0))
+ goto error;
+
+ if (unlikely(!net_eq(dev_net(skb->dev), net)))
+ goto error;
+
+ return dev_get_by_index_rcu(net, vrf_ifindex);
+
+error:
+ return NULL;
+}
+
+static struct sk_buff *end_dt_vrf_core(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt, u16 family)
+{
+ struct seg6_end_dt_info *info = &slwt->dt_info;
+ struct net_device *vrf;
+ __be16 protocol;
+ int hdrlen;
+
+ vrf = end_dt_get_vrf_rcu(skb, info);
+ if (unlikely(!vrf))
+ goto drop;
+
+ switch (family) {
+ case AF_INET:
+ protocol = htons(ETH_P_IP);
+ hdrlen = sizeof(struct iphdr);
+ break;
+ case AF_INET6:
+ protocol = htons(ETH_P_IPV6);
+ hdrlen = sizeof(struct ipv6hdr);
+ break;
+ case AF_UNSPEC:
+ fallthrough;
+ default:
+ goto drop;
+ }
+
+ if (unlikely(info->family != AF_UNSPEC && info->family != family)) {
+ pr_warn_once("seg6local: SRv6 End.DT* family mismatch");
+ goto drop;
+ }
+
+ skb->protocol = protocol;
skb_dst_drop(skb);
- err = ip_route_input(skb, nhaddr, iph->saddr, 0, skb->dev);
- if (err)
+ skb_set_transport_header(skb, hdrlen);
+ nf_reset_ct(skb);
+
+ return end_dt_vrf_rcv(skb, family, vrf);
+
+drop:
+ kfree_skb(skb);
+ return ERR_PTR(-EINVAL);
+}
+
+static int input_action_end_dt4(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ enum skb_drop_reason reason;
+ struct iphdr *iph;
+
+ if (!decap_and_validate(skb, IPPROTO_IPIP))
+ goto drop;
+
+ if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+ goto drop;
+
+ skb = end_dt_vrf_core(skb, slwt, AF_INET);
+ if (!skb)
+ /* packet has been processed and consumed by the VRF */
+ return 0;
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ iph = ip_hdr(skb);
+
+ reason = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev);
+ if (unlikely(reason))
goto drop;
return dst_input(skb);
@@ -381,6 +1207,54 @@ drop:
return -EINVAL;
}
+static int seg6_end_dt4_build(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack)
+{
+ return __seg6_end_dt_vrf_build(slwt, cfg, AF_INET, extack);
+}
+
+static enum
+seg6_end_dt_mode seg6_end_dt6_parse_mode(struct seg6_local_lwt *slwt)
+{
+ unsigned long parsed_optattrs = slwt->parsed_optattrs;
+ bool legacy, vrfmode;
+
+ legacy = !!(parsed_optattrs & SEG6_F_ATTR(SEG6_LOCAL_TABLE));
+ vrfmode = !!(parsed_optattrs & SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE));
+
+ if (!(legacy ^ vrfmode))
+ /* both are absent or present: invalid DT6 mode */
+ return DT_INVALID_MODE;
+
+ return legacy ? DT_LEGACY_MODE : DT_VRF_MODE;
+}
+
+static enum seg6_end_dt_mode seg6_end_dt6_get_mode(struct seg6_local_lwt *slwt)
+{
+ struct seg6_end_dt_info *info = &slwt->dt_info;
+
+ return info->mode;
+}
+
+static int seg6_end_dt6_build(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack)
+{
+ enum seg6_end_dt_mode mode = seg6_end_dt6_parse_mode(slwt);
+ struct seg6_end_dt_info *info = &slwt->dt_info;
+
+ switch (mode) {
+ case DT_LEGACY_MODE:
+ info->mode = DT_LEGACY_MODE;
+ return 0;
+ case DT_VRF_MODE:
+ return __seg6_end_dt_vrf_build(slwt, cfg, AF_INET6, extack);
+ default:
+ NL_SET_ERR_MSG(extack, "table or vrftable must be specified");
+ return -EINVAL;
+ }
+}
+#endif
+
static int input_action_end_dt6(struct sk_buff *skb,
struct seg6_local_lwt *slwt)
{
@@ -390,7 +1264,31 @@ static int input_action_end_dt6(struct sk_buff *skb,
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
goto drop;
- seg6_lookup_nexthop(skb, NULL, slwt->table);
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ if (seg6_end_dt6_get_mode(slwt) == DT_LEGACY_MODE)
+ goto legacy_mode;
+
+ /* DT_VRF_MODE */
+ skb = end_dt_vrf_core(skb, slwt, AF_INET6);
+ if (!skb)
+ /* packet has been processed and consumed by the VRF */
+ return 0;
+
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
+ /* note: this time we do not need to specify the table because the VRF
+ * takes care of selecting the correct table.
+ */
+ seg6_lookup_any_nexthop(skb, NULL, 0, true, 0);
+
+ return dst_input(skb);
+
+legacy_mode:
+#endif
+ skb_set_transport_header(skb, sizeof(struct ipv6hdr));
+
+ seg6_lookup_any_nexthop(skb, NULL, slwt->table, true, 0);
return dst_input(skb);
@@ -399,6 +1297,36 @@ drop:
return -EINVAL;
}
+#ifdef CONFIG_NET_L3_MASTER_DEV
+static int seg6_end_dt46_build(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack)
+{
+ return __seg6_end_dt_vrf_build(slwt, cfg, AF_UNSPEC, extack);
+}
+
+static int input_action_end_dt46(struct sk_buff *skb,
+ struct seg6_local_lwt *slwt)
+{
+ unsigned int off = 0;
+ int nexthdr;
+
+ nexthdr = ipv6_find_hdr(skb, &off, -1, NULL, NULL);
+ if (unlikely(nexthdr < 0))
+ goto drop;
+
+ switch (nexthdr) {
+ case IPPROTO_IPIP:
+ return input_action_end_dt4(skb, slwt);
+ case IPPROTO_IPV6:
+ return input_action_end_dt6(skb, slwt);
+ }
+
+drop:
+ kfree_skb(skb);
+ return -EINVAL;
+}
+#endif
+
/* push an SRH on top of the current one */
static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt)
{
@@ -413,7 +1341,6 @@ static int input_action_end_b6(struct sk_buff *skb, struct seg6_local_lwt *slwt)
if (err)
goto drop;
- ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
skb_set_transport_header(skb, sizeof(struct ipv6hdr));
seg6_lookup_nexthop(skb, NULL, 0);
@@ -445,7 +1372,6 @@ static int input_action_end_b6_encap(struct sk_buff *skb,
if (err)
goto drop;
- ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr));
skb_set_transport_header(skb, sizeof(struct ipv6hdr));
seg6_lookup_nexthop(skb, NULL, 0);
@@ -457,7 +1383,9 @@ drop:
return err;
}
-DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states);
+DEFINE_PER_CPU(struct seg6_bpf_srh_state, seg6_bpf_srh_states) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
bool seg6_bpf_has_valid_srh(struct sk_buff *skb)
{
@@ -465,6 +1393,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb)
this_cpu_ptr(&seg6_bpf_srh_states);
struct ipv6_sr_hdr *srh = srh_state->srh;
+ lockdep_assert_held(&srh_state->bh_lock);
if (unlikely(srh == NULL))
return false;
@@ -473,7 +1402,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb)
return false;
srh->hdrlen = (u8)(srh_state->hdrlen >> 3);
- if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3))
+ if (!seg6_validate_srh(srh, (srh->hdrlen + 1) << 3, true))
return false;
srh_state->valid = true;
@@ -485,8 +1414,7 @@ bool seg6_bpf_has_valid_srh(struct sk_buff *skb)
static int input_action_end_bpf(struct sk_buff *skb,
struct seg6_local_lwt *slwt)
{
- struct seg6_bpf_srh_state *srh_state =
- this_cpu_ptr(&seg6_bpf_srh_states);
+ struct seg6_bpf_srh_state *srh_state;
struct ipv6_sr_hdr *srh;
int ret;
@@ -497,10 +1425,14 @@ static int input_action_end_bpf(struct sk_buff *skb,
}
advance_nextseg(srh, &ipv6_hdr(skb)->daddr);
- /* preempt_disable is needed to protect the per-CPU buffer srh_state,
- * which is also accessed by the bpf_lwt_seg6_* helpers
+ /* The access to the per-CPU buffer srh_state is protected by running
+ * always in softirq context (with disabled BH). On PREEMPT_RT the
+ * required locking is provided by the following local_lock_nested_bh()
+ * statement. It is also accessed by the bpf_lwt_seg6_* helpers via
+ * bpf_prog_run_save_cb().
*/
- preempt_disable();
+ local_lock_nested_bh(&seg6_bpf_srh_states.bh_lock);
+ srh_state = this_cpu_ptr(&seg6_bpf_srh_states);
srh_state->srh = srh;
srh_state->hdrlen = srh->hdrlen << 3;
srh_state->valid = true;
@@ -523,15 +1455,15 @@ static int input_action_end_bpf(struct sk_buff *skb,
if (srh_state->srh && !seg6_bpf_has_valid_srh(skb))
goto drop;
+ local_unlock_nested_bh(&seg6_bpf_srh_states.bh_lock);
- preempt_enable();
if (ret != BPF_REDIRECT)
seg6_lookup_nexthop(skb, NULL, 0);
return dst_input(skb);
drop:
- preempt_enable();
+ local_unlock_nested_bh(&seg6_bpf_srh_states.bh_lock);
kfree_skb(skb);
return -EINVAL;
}
@@ -540,52 +1472,97 @@ static struct seg6_action_desc seg6_action_table[] = {
{
.action = SEG6_LOCAL_ACTION_END,
.attrs = 0,
+ .optattrs = SEG6_F_LOCAL_COUNTERS |
+ SEG6_F_LOCAL_FLAVORS,
.input = input_action_end,
},
{
.action = SEG6_LOCAL_ACTION_END_X,
- .attrs = (1 << SEG6_LOCAL_NH6),
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH6),
+ .optattrs = SEG6_F_LOCAL_COUNTERS |
+ SEG6_F_LOCAL_FLAVORS |
+ SEG6_F_ATTR(SEG6_LOCAL_OIF),
.input = input_action_end_x,
},
{
.action = SEG6_LOCAL_ACTION_END_T,
- .attrs = (1 << SEG6_LOCAL_TABLE),
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_TABLE),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
.input = input_action_end_t,
},
{
.action = SEG6_LOCAL_ACTION_END_DX2,
- .attrs = (1 << SEG6_LOCAL_OIF),
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_OIF),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
.input = input_action_end_dx2,
},
{
.action = SEG6_LOCAL_ACTION_END_DX6,
- .attrs = (1 << SEG6_LOCAL_NH6),
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH6),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
.input = input_action_end_dx6,
},
{
.action = SEG6_LOCAL_ACTION_END_DX4,
- .attrs = (1 << SEG6_LOCAL_NH4),
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_NH4),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
.input = input_action_end_dx4,
},
{
+ .action = SEG6_LOCAL_ACTION_END_DT4,
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ .input = input_action_end_dt4,
+ .slwt_ops = {
+ .build_state = seg6_end_dt4_build,
+ },
+#endif
+ },
+ {
.action = SEG6_LOCAL_ACTION_END_DT6,
- .attrs = (1 << SEG6_LOCAL_TABLE),
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ .attrs = 0,
+ .optattrs = SEG6_F_LOCAL_COUNTERS |
+ SEG6_F_ATTR(SEG6_LOCAL_TABLE) |
+ SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
+ .slwt_ops = {
+ .build_state = seg6_end_dt6_build,
+ },
+#else
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_TABLE),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
+#endif
.input = input_action_end_dt6,
},
{
+ .action = SEG6_LOCAL_ACTION_END_DT46,
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ .input = input_action_end_dt46,
+ .slwt_ops = {
+ .build_state = seg6_end_dt46_build,
+ },
+#endif
+ },
+ {
.action = SEG6_LOCAL_ACTION_END_B6,
- .attrs = (1 << SEG6_LOCAL_SRH),
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_SRH),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
.input = input_action_end_b6,
},
{
.action = SEG6_LOCAL_ACTION_END_B6_ENCAP,
- .attrs = (1 << SEG6_LOCAL_SRH),
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_SRH),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
.input = input_action_end_b6_encap,
.static_headroom = sizeof(struct ipv6hdr),
},
{
.action = SEG6_LOCAL_ACTION_END_BPF,
- .attrs = (1 << SEG6_LOCAL_BPF),
+ .attrs = SEG6_F_ATTR(SEG6_LOCAL_BPF),
+ .optattrs = SEG6_F_LOCAL_COUNTERS,
.input = input_action_end_bpf,
},
@@ -606,37 +1583,82 @@ static struct seg6_action_desc *__get_action_desc(int action)
return NULL;
}
-static int seg6_local_input(struct sk_buff *skb)
+static bool seg6_lwtunnel_counters_enabled(struct seg6_local_lwt *slwt)
+{
+ return slwt->parsed_optattrs & SEG6_F_LOCAL_COUNTERS;
+}
+
+static void seg6_local_update_counters(struct seg6_local_lwt *slwt,
+ unsigned int len, int err)
+{
+ struct pcpu_seg6_local_counters *pcounters;
+
+ pcounters = this_cpu_ptr(slwt->pcpu_counters);
+ u64_stats_update_begin(&pcounters->syncp);
+
+ if (likely(!err)) {
+ u64_stats_inc(&pcounters->packets);
+ u64_stats_add(&pcounters->bytes, len);
+ } else {
+ u64_stats_inc(&pcounters->errors);
+ }
+
+ u64_stats_update_end(&pcounters->syncp);
+}
+
+static int seg6_local_input_core(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
struct dst_entry *orig_dst = skb_dst(skb);
struct seg6_action_desc *desc;
struct seg6_local_lwt *slwt;
+ unsigned int len = skb->len;
+ int rc;
+
+ slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
+ desc = slwt->desc;
+
+ rc = desc->input(skb, slwt);
+ if (!seg6_lwtunnel_counters_enabled(slwt))
+ return rc;
+
+ seg6_local_update_counters(slwt, len, rc);
+
+ return rc;
+}
+
+static int seg6_local_input(struct sk_buff *skb)
+{
if (skb->protocol != htons(ETH_P_IPV6)) {
kfree_skb(skb);
return -EINVAL;
}
- slwt = seg6_local_lwtunnel(orig_dst->lwtstate);
- desc = slwt->desc;
+ if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled))
+ return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
+ dev_net(skb->dev), NULL, skb, skb->dev, NULL,
+ seg6_local_input_core);
- return desc->input(skb, slwt);
+ return seg6_local_input_core(dev_net(skb->dev), NULL, skb);
}
static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = {
[SEG6_LOCAL_ACTION] = { .type = NLA_U32 },
[SEG6_LOCAL_SRH] = { .type = NLA_BINARY },
[SEG6_LOCAL_TABLE] = { .type = NLA_U32 },
- [SEG6_LOCAL_NH4] = { .type = NLA_BINARY,
- .len = sizeof(struct in_addr) },
- [SEG6_LOCAL_NH6] = { .type = NLA_BINARY,
- .len = sizeof(struct in6_addr) },
+ [SEG6_LOCAL_VRFTABLE] = { .type = NLA_U32 },
+ [SEG6_LOCAL_NH4] = NLA_POLICY_EXACT_LEN(sizeof(struct in_addr)),
+ [SEG6_LOCAL_NH6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
[SEG6_LOCAL_IIF] = { .type = NLA_U32 },
[SEG6_LOCAL_OIF] = { .type = NLA_U32 },
[SEG6_LOCAL_BPF] = { .type = NLA_NESTED },
+ [SEG6_LOCAL_COUNTERS] = { .type = NLA_NESTED },
+ [SEG6_LOCAL_FLAVORS] = { .type = NLA_NESTED },
};
-static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
{
struct ipv6_sr_hdr *srh;
int len;
@@ -648,7 +1670,7 @@ static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
if (len < sizeof(*srh) + sizeof(struct in6_addr))
return -EINVAL;
- if (!seg6_validate_srh(srh, len))
+ if (!seg6_validate_srh(srh, len, false))
return -EINVAL;
slwt->srh = kmemdup(srh, len, GFP_KERNEL);
@@ -688,7 +1710,13 @@ static int cmp_nla_srh(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
return memcmp(a->srh, b->srh, len);
}
-static int parse_nla_table(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+static void destroy_attr_srh(struct seg6_local_lwt *slwt)
+{
+ kfree(slwt->srh);
+}
+
+static int parse_nla_table(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
{
slwt->table = nla_get_u32(attrs[SEG6_LOCAL_TABLE]);
@@ -711,7 +1739,56 @@ static int cmp_nla_table(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
return 0;
}
-static int parse_nla_nh4(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+static struct
+seg6_end_dt_info *seg6_possible_end_dt_info(struct seg6_local_lwt *slwt)
+{
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ return &slwt->dt_info;
+#else
+ return ERR_PTR(-EOPNOTSUPP);
+#endif
+}
+
+static int parse_nla_vrftable(struct nlattr **attrs,
+ struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ struct seg6_end_dt_info *info = seg6_possible_end_dt_info(slwt);
+
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
+ info->vrf_table = nla_get_u32(attrs[SEG6_LOCAL_VRFTABLE]);
+
+ return 0;
+}
+
+static int put_nla_vrftable(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ struct seg6_end_dt_info *info = seg6_possible_end_dt_info(slwt);
+
+ if (IS_ERR(info))
+ return PTR_ERR(info);
+
+ if (nla_put_u32(skb, SEG6_LOCAL_VRFTABLE, info->vrf_table))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int cmp_nla_vrftable(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ struct seg6_end_dt_info *info_a = seg6_possible_end_dt_info(a);
+ struct seg6_end_dt_info *info_b = seg6_possible_end_dt_info(b);
+
+ if (info_a->vrf_table != info_b->vrf_table)
+ return 1;
+
+ return 0;
+}
+
+static int parse_nla_nh4(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
{
memcpy(&slwt->nh4, nla_data(attrs[SEG6_LOCAL_NH4]),
sizeof(struct in_addr));
@@ -737,7 +1814,8 @@ static int cmp_nla_nh4(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
return memcmp(&a->nh4, &b->nh4, sizeof(struct in_addr));
}
-static int parse_nla_nh6(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+static int parse_nla_nh6(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
{
memcpy(&slwt->nh6, nla_data(attrs[SEG6_LOCAL_NH6]),
sizeof(struct in6_addr));
@@ -763,7 +1841,8 @@ static int cmp_nla_nh6(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
return memcmp(&a->nh6, &b->nh6, sizeof(struct in6_addr));
}
-static int parse_nla_iif(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+static int parse_nla_iif(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
{
slwt->iif = nla_get_u32(attrs[SEG6_LOCAL_IIF]);
@@ -786,7 +1865,8 @@ static int cmp_nla_iif(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
return 0;
}
-static int parse_nla_oif(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+static int parse_nla_oif(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
{
slwt->oif = nla_get_u32(attrs[SEG6_LOCAL_OIF]);
@@ -816,15 +1896,17 @@ static const struct nla_policy bpf_prog_policy[SEG6_LOCAL_BPF_PROG_MAX + 1] = {
.len = MAX_PROG_NAME },
};
-static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
{
struct nlattr *tb[SEG6_LOCAL_BPF_PROG_MAX + 1];
struct bpf_prog *p;
int ret;
u32 fd;
- ret = nla_parse_nested(tb, SEG6_LOCAL_BPF_PROG_MAX,
- attrs[SEG6_LOCAL_BPF], bpf_prog_policy, NULL);
+ ret = nla_parse_nested_deprecated(tb, SEG6_LOCAL_BPF_PROG_MAX,
+ attrs[SEG6_LOCAL_BPF],
+ bpf_prog_policy, NULL);
if (ret < 0)
return ret;
@@ -853,7 +1935,7 @@ static int put_nla_bpf(struct sk_buff *skb, struct seg6_local_lwt *slwt)
if (!slwt->bpf.prog)
return 0;
- nest = nla_nest_start(skb, SEG6_LOCAL_BPF);
+ nest = nla_nest_start_noflag(skb, SEG6_LOCAL_BPF);
if (!nest)
return -EMSGSIZE;
@@ -878,16 +1960,326 @@ static int cmp_nla_bpf(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
return strcmp(a->bpf.name, b->bpf.name);
}
+static void destroy_attr_bpf(struct seg6_local_lwt *slwt)
+{
+ kfree(slwt->bpf.name);
+ if (slwt->bpf.prog)
+ bpf_prog_put(slwt->bpf.prog);
+}
+
+static const struct
+nla_policy seg6_local_counters_policy[SEG6_LOCAL_CNT_MAX + 1] = {
+ [SEG6_LOCAL_CNT_PACKETS] = { .type = NLA_U64 },
+ [SEG6_LOCAL_CNT_BYTES] = { .type = NLA_U64 },
+ [SEG6_LOCAL_CNT_ERRORS] = { .type = NLA_U64 },
+};
+
+static int parse_nla_counters(struct nlattr **attrs,
+ struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ struct pcpu_seg6_local_counters __percpu *pcounters;
+ struct nlattr *tb[SEG6_LOCAL_CNT_MAX + 1];
+ int ret;
+
+ ret = nla_parse_nested_deprecated(tb, SEG6_LOCAL_CNT_MAX,
+ attrs[SEG6_LOCAL_COUNTERS],
+ seg6_local_counters_policy, NULL);
+ if (ret < 0)
+ return ret;
+
+ /* basic support for SRv6 Behavior counters requires at least:
+ * packets, bytes and errors.
+ */
+ if (!tb[SEG6_LOCAL_CNT_PACKETS] || !tb[SEG6_LOCAL_CNT_BYTES] ||
+ !tb[SEG6_LOCAL_CNT_ERRORS])
+ return -EINVAL;
+
+ /* counters are always zero initialized */
+ pcounters = seg6_local_alloc_pcpu_counters(GFP_KERNEL);
+ if (!pcounters)
+ return -ENOMEM;
+
+ slwt->pcpu_counters = pcounters;
+
+ return 0;
+}
+
+static int seg6_local_fill_nla_counters(struct sk_buff *skb,
+ struct seg6_local_counters *counters)
+{
+ if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_PACKETS, counters->packets,
+ SEG6_LOCAL_CNT_PAD))
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_BYTES, counters->bytes,
+ SEG6_LOCAL_CNT_PAD))
+ return -EMSGSIZE;
+
+ if (nla_put_u64_64bit(skb, SEG6_LOCAL_CNT_ERRORS, counters->errors,
+ SEG6_LOCAL_CNT_PAD))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int put_nla_counters(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ struct seg6_local_counters counters = { 0, 0, 0 };
+ struct nlattr *nest;
+ int rc, i;
+
+ nest = nla_nest_start(skb, SEG6_LOCAL_COUNTERS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ for_each_possible_cpu(i) {
+ struct pcpu_seg6_local_counters *pcounters;
+ u64 packets, bytes, errors;
+ unsigned int start;
+
+ pcounters = per_cpu_ptr(slwt->pcpu_counters, i);
+ do {
+ start = u64_stats_fetch_begin(&pcounters->syncp);
+
+ packets = u64_stats_read(&pcounters->packets);
+ bytes = u64_stats_read(&pcounters->bytes);
+ errors = u64_stats_read(&pcounters->errors);
+
+ } while (u64_stats_fetch_retry(&pcounters->syncp, start));
+
+ counters.packets += packets;
+ counters.bytes += bytes;
+ counters.errors += errors;
+ }
+
+ rc = seg6_local_fill_nla_counters(skb, &counters);
+ if (rc < 0) {
+ nla_nest_cancel(skb, nest);
+ return rc;
+ }
+
+ return nla_nest_end(skb, nest);
+}
+
+static int cmp_nla_counters(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ /* a and b are equal if pcpu_counters is set in both or in neither */
+ return (!!((unsigned long)a->pcpu_counters)) ^
+ (!!((unsigned long)b->pcpu_counters));
+}
+
+static void destroy_attr_counters(struct seg6_local_lwt *slwt)
+{
+ free_percpu(slwt->pcpu_counters);
+}
+
+static const
+struct nla_policy seg6_local_flavors_policy[SEG6_LOCAL_FLV_MAX + 1] = {
+ [SEG6_LOCAL_FLV_OPERATION] = { .type = NLA_U32 },
+ [SEG6_LOCAL_FLV_LCBLOCK_BITS] = { .type = NLA_U8 },
+ [SEG6_LOCAL_FLV_LCNODE_FN_BITS] = { .type = NLA_U8 },
+};
+
+/* check whether the lengths of the Locator-Block and Locator-Node Function
+ * are compatible with the dimension of a C-SID container.
+ */
+static int seg6_chk_next_csid_cfg(__u8 block_len, __u8 func_len)
+{
+ /* Locator-Block and Locator-Node Function cannot exceed 128 bits
+ * (i.e. C-SID container length).
+ */
+ if (next_csid_chk_cntr_bits(block_len, func_len))
+ return -EINVAL;
+
+ /* Locator-Block length must be greater than zero and evenly divisible
+ * by 8. There must be room for a Locator-Node Function, at least.
+ */
+ if (next_csid_chk_lcblock_bits(block_len))
+ return -EINVAL;
+
+ /* Locator-Node Function length must be greater than zero and evenly
+ * divisible by 8. There must be room for the Locator-Block.
+ */
+ if (next_csid_chk_lcnode_fn_bits(func_len))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int seg6_parse_nla_next_csid_cfg(struct nlattr **tb,
+ struct seg6_flavors_info *finfo,
+ struct netlink_ext_ack *extack)
+{
+ __u8 func_len = SEG6_LOCAL_LCNODE_FN_DBITS;
+ __u8 block_len = SEG6_LOCAL_LCBLOCK_DBITS;
+ int rc;
+
+ if (tb[SEG6_LOCAL_FLV_LCBLOCK_BITS])
+ block_len = nla_get_u8(tb[SEG6_LOCAL_FLV_LCBLOCK_BITS]);
+
+ if (tb[SEG6_LOCAL_FLV_LCNODE_FN_BITS])
+ func_len = nla_get_u8(tb[SEG6_LOCAL_FLV_LCNODE_FN_BITS]);
+
+ rc = seg6_chk_next_csid_cfg(block_len, func_len);
+ if (rc < 0) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid Locator Block/Node Function lengths");
+ return rc;
+ }
+
+ finfo->lcblock_bits = block_len;
+ finfo->lcnode_func_bits = func_len;
+
+ return 0;
+}
+
+static int parse_nla_flavors(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ struct seg6_flavors_info *finfo = &slwt->flv_info;
+ struct nlattr *tb[SEG6_LOCAL_FLV_MAX + 1];
+ int action = slwt->action;
+ __u32 fops, supp_fops;
+ int rc;
+
+ rc = nla_parse_nested_deprecated(tb, SEG6_LOCAL_FLV_MAX,
+ attrs[SEG6_LOCAL_FLAVORS],
+ seg6_local_flavors_policy, NULL);
+ if (rc < 0)
+ return rc;
+
+ /* this attribute MUST always be present since it represents the Flavor
+ * operation(s) to be carried out.
+ */
+ if (!tb[SEG6_LOCAL_FLV_OPERATION])
+ return -EINVAL;
+
+ fops = nla_get_u32(tb[SEG6_LOCAL_FLV_OPERATION]);
+ rc = seg6_flv_supp_ops_by_action(action, &supp_fops);
+ if (rc < 0 || (fops & ~supp_fops)) {
+ NL_SET_ERR_MSG(extack, "Unsupported Flavor operation(s)");
+ return -EOPNOTSUPP;
+ }
+
+ finfo->flv_ops = fops;
+
+ if (seg6_next_csid_enabled(fops)) {
+ /* Locator-Block and Locator-Node Function lengths can be
+ * provided by the user space. Otherwise, default values are
+ * applied.
+ */
+ rc = seg6_parse_nla_next_csid_cfg(tb, finfo, extack);
+ if (rc < 0)
+ return rc;
+ }
+
+ return 0;
+}
+
+static int seg6_fill_nla_next_csid_cfg(struct sk_buff *skb,
+ struct seg6_flavors_info *finfo)
+{
+ if (nla_put_u8(skb, SEG6_LOCAL_FLV_LCBLOCK_BITS, finfo->lcblock_bits))
+ return -EMSGSIZE;
+
+ if (nla_put_u8(skb, SEG6_LOCAL_FLV_LCNODE_FN_BITS,
+ finfo->lcnode_func_bits))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
+static int put_nla_flavors(struct sk_buff *skb, struct seg6_local_lwt *slwt)
+{
+ struct seg6_flavors_info *finfo = &slwt->flv_info;
+ __u32 fops = finfo->flv_ops;
+ struct nlattr *nest;
+ int rc;
+
+ nest = nla_nest_start(skb, SEG6_LOCAL_FLAVORS);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, SEG6_LOCAL_FLV_OPERATION, fops)) {
+ rc = -EMSGSIZE;
+ goto err;
+ }
+
+ if (seg6_next_csid_enabled(fops)) {
+ rc = seg6_fill_nla_next_csid_cfg(skb, finfo);
+ if (rc < 0)
+ goto err;
+ }
+
+ return nla_nest_end(skb, nest);
+
+err:
+ nla_nest_cancel(skb, nest);
+ return rc;
+}
+
+static int seg6_cmp_nla_next_csid_cfg(struct seg6_flavors_info *finfo_a,
+ struct seg6_flavors_info *finfo_b)
+{
+ if (finfo_a->lcblock_bits != finfo_b->lcblock_bits)
+ return 1;
+
+ if (finfo_a->lcnode_func_bits != finfo_b->lcnode_func_bits)
+ return 1;
+
+ return 0;
+}
+
+static int cmp_nla_flavors(struct seg6_local_lwt *a, struct seg6_local_lwt *b)
+{
+ struct seg6_flavors_info *finfo_a = &a->flv_info;
+ struct seg6_flavors_info *finfo_b = &b->flv_info;
+
+ if (finfo_a->flv_ops != finfo_b->flv_ops)
+ return 1;
+
+ if (seg6_next_csid_enabled(finfo_a->flv_ops)) {
+ if (seg6_cmp_nla_next_csid_cfg(finfo_a, finfo_b))
+ return 1;
+ }
+
+ return 0;
+}
+
+static int encap_size_flavors(struct seg6_local_lwt *slwt)
+{
+ struct seg6_flavors_info *finfo = &slwt->flv_info;
+ int nlsize;
+
+ nlsize = nla_total_size(0) + /* nest SEG6_LOCAL_FLAVORS */
+ nla_total_size(4); /* SEG6_LOCAL_FLV_OPERATION */
+
+ if (seg6_next_csid_enabled(finfo->flv_ops))
+ nlsize += nla_total_size(1) + /* SEG6_LOCAL_FLV_LCBLOCK_BITS */
+ nla_total_size(1); /* SEG6_LOCAL_FLV_LCNODE_FN_BITS */
+
+ return nlsize;
+}
+
struct seg6_action_param {
- int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt);
+ int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack);
int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt);
int (*cmp)(struct seg6_local_lwt *a, struct seg6_local_lwt *b);
+
+ /* optional destroy() callback useful for releasing resources which
+ * have been previously acquired in the corresponding parse()
+ * function.
+ */
+ void (*destroy)(struct seg6_local_lwt *slwt);
};
static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
[SEG6_LOCAL_SRH] = { .parse = parse_nla_srh,
.put = put_nla_srh,
- .cmp = cmp_nla_srh },
+ .cmp = cmp_nla_srh,
+ .destroy = destroy_attr_srh },
[SEG6_LOCAL_TABLE] = { .parse = parse_nla_table,
.put = put_nla_table,
@@ -911,14 +2303,140 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = {
[SEG6_LOCAL_BPF] = { .parse = parse_nla_bpf,
.put = put_nla_bpf,
- .cmp = cmp_nla_bpf },
+ .cmp = cmp_nla_bpf,
+ .destroy = destroy_attr_bpf },
+
+ [SEG6_LOCAL_VRFTABLE] = { .parse = parse_nla_vrftable,
+ .put = put_nla_vrftable,
+ .cmp = cmp_nla_vrftable },
+ [SEG6_LOCAL_COUNTERS] = { .parse = parse_nla_counters,
+ .put = put_nla_counters,
+ .cmp = cmp_nla_counters,
+ .destroy = destroy_attr_counters },
+
+ [SEG6_LOCAL_FLAVORS] = { .parse = parse_nla_flavors,
+ .put = put_nla_flavors,
+ .cmp = cmp_nla_flavors },
};
-static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
+/* call the destroy() callback (if available) for each set attribute in
+ * @parsed_attrs, starting from the first attribute up to the @max_parsed
+ * (excluded) attribute.
+ */
+static void __destroy_attrs(unsigned long parsed_attrs, int max_parsed,
+ struct seg6_local_lwt *slwt)
+{
+ struct seg6_action_param *param;
+ int i;
+
+ /* Every required seg6local attribute is identified by an ID which is
+ * encoded as a flag (i.e: 1 << ID) in the 'attrs' bitmask;
+ *
+ * We scan the 'parsed_attrs' bitmask, starting from the first attribute
+ * up to the @max_parsed (excluded) attribute.
+ * For each set attribute, we retrieve the corresponding destroy()
+ * callback. If the callback is not available, then we skip to the next
+ * attribute; otherwise, we call the destroy() callback.
+ */
+ for (i = SEG6_LOCAL_SRH; i < max_parsed; ++i) {
+ if (!(parsed_attrs & SEG6_F_ATTR(i)))
+ continue;
+
+ param = &seg6_action_params[i];
+
+ if (param->destroy)
+ param->destroy(slwt);
+ }
+}
+
+/* release all the resources that may have been acquired during parsing
+ * operations.
+ */
+static void destroy_attrs(struct seg6_local_lwt *slwt)
+{
+ unsigned long attrs = slwt->desc->attrs | slwt->parsed_optattrs;
+
+ __destroy_attrs(attrs, SEG6_LOCAL_MAX + 1, slwt);
+}
+
+static int parse_nla_optional_attrs(struct nlattr **attrs,
+ struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
+{
+ struct seg6_action_desc *desc = slwt->desc;
+ unsigned long parsed_optattrs = 0;
+ struct seg6_action_param *param;
+ int err, i;
+
+ for (i = SEG6_LOCAL_SRH; i < SEG6_LOCAL_MAX + 1; ++i) {
+ if (!(desc->optattrs & SEG6_F_ATTR(i)) || !attrs[i])
+ continue;
+
+ /* once here, the i-th attribute is provided by
+ * userspace AND it is marked as optional as well.
+ */
+ param = &seg6_action_params[i];
+
+ err = param->parse(attrs, slwt, extack);
+ if (err < 0)
+ goto parse_optattrs_err;
+
+ /* current attribute has been correctly parsed */
+ parsed_optattrs |= SEG6_F_ATTR(i);
+ }
+
+ /* store in the tunnel state all the optional attributes successfully
+ * parsed.
+ */
+ slwt->parsed_optattrs = parsed_optattrs;
+
+ return 0;
+
+parse_optattrs_err:
+ __destroy_attrs(parsed_optattrs, i, slwt);
+
+ return err;
+}
+
+/* call the custom constructor of the behavior during its initialization phase
+ * and after all its attributes have been parsed successfully.
+ */
+static int
+seg6_local_lwtunnel_build_state(struct seg6_local_lwt *slwt, const void *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct seg6_action_desc *desc = slwt->desc;
+ struct seg6_local_lwtunnel_ops *ops;
+
+ ops = &desc->slwt_ops;
+ if (!ops->build_state)
+ return 0;
+
+ return ops->build_state(slwt, cfg, extack);
+}
+
+/* call the custom destructor of the behavior which is invoked before the
+ * tunnel is going to be destroyed.
+ */
+static void seg6_local_lwtunnel_destroy_state(struct seg6_local_lwt *slwt)
+{
+ struct seg6_action_desc *desc = slwt->desc;
+ struct seg6_local_lwtunnel_ops *ops;
+
+ ops = &desc->slwt_ops;
+ if (!ops->destroy_state)
+ return;
+
+ ops->destroy_state(slwt);
+}
+
+static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt,
+ struct netlink_ext_ack *extack)
{
struct seg6_action_param *param;
struct seg6_action_desc *desc;
+ unsigned long invalid_attrs;
int i, err;
desc = __get_action_desc(slwt->action);
@@ -931,24 +2449,58 @@ static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt)
slwt->desc = desc;
slwt->headroom += desc->static_headroom;
- for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
- if (desc->attrs & (1 << i)) {
+ /* Forcing the desc->optattrs *set* and the desc->attrs *set* to be
+ * disjoint allows us to release the resources acquired by optional
+ * attributes and by required attributes independently from each other
+ * without any interference.
+ * In other words, we are sure that we do not release any of the
+ * acquired resources twice.
+ *
+ * Note that if an attribute is configured both as required and as
+ * optional, it means that the user has messed something up in the
+ * seg6_action_table. Therefore, this check is required for SRv6
+ * behaviors to work properly.
+ */
+ invalid_attrs = desc->attrs & desc->optattrs;
+ if (invalid_attrs) {
+ WARN_ONCE(1,
+ "An attribute cannot be both required AND optional");
+ return -EINVAL;
+ }
+
+ /* parse the required attributes */
+ for (i = SEG6_LOCAL_SRH; i < SEG6_LOCAL_MAX + 1; i++) {
+ if (desc->attrs & SEG6_F_ATTR(i)) {
if (!attrs[i])
return -EINVAL;
param = &seg6_action_params[i];
- err = param->parse(attrs, slwt);
+ err = param->parse(attrs, slwt, extack);
if (err < 0)
- return err;
+ goto parse_attrs_err;
}
}
+ /* parse the optional attributes, if any */
+ err = parse_nla_optional_attrs(attrs, slwt, extack);
+ if (err < 0)
+ goto parse_attrs_err;
+
return 0;
+
+parse_attrs_err:
+ /* release any resource that may have been acquired during the i-1
+ * parse() operations.
+ */
+ __destroy_attrs(desc->attrs, i, slwt);
+
+ return err;
}
-static int seg6_local_build_state(struct nlattr *nla, unsigned int family,
- const void *cfg, struct lwtunnel_state **ts,
+static int seg6_local_build_state(struct net *net, struct nlattr *nla,
+ unsigned int family, const void *cfg,
+ struct lwtunnel_state **ts,
struct netlink_ext_ack *extack)
{
struct nlattr *tb[SEG6_LOCAL_MAX + 1];
@@ -959,8 +2511,8 @@ static int seg6_local_build_state(struct nlattr *nla, unsigned int family,
if (family != AF_INET6)
return -EINVAL;
- err = nla_parse_nested(tb, SEG6_LOCAL_MAX, nla, seg6_local_policy,
- extack);
+ err = nla_parse_nested_deprecated(tb, SEG6_LOCAL_MAX, nla,
+ seg6_local_policy, extack);
if (err < 0)
return err;
@@ -975,10 +2527,14 @@ static int seg6_local_build_state(struct nlattr *nla, unsigned int family,
slwt = seg6_local_lwtunnel(newts);
slwt->action = nla_get_u32(tb[SEG6_LOCAL_ACTION]);
- err = parse_nla_action(tb, slwt);
+ err = parse_nla_action(tb, slwt, extack);
if (err < 0)
goto out_free;
+ err = seg6_local_lwtunnel_build_state(slwt, cfg, extack);
+ if (err < 0)
+ goto out_destroy_attrs;
+
newts->type = LWTUNNEL_ENCAP_SEG6_LOCAL;
newts->flags = LWTUNNEL_STATE_INPUT_REDIRECT;
newts->headroom = slwt->headroom;
@@ -987,8 +2543,9 @@ static int seg6_local_build_state(struct nlattr *nla, unsigned int family,
return 0;
+out_destroy_attrs:
+ destroy_attrs(slwt);
out_free:
- kfree(slwt->srh);
kfree(newts);
return err;
}
@@ -997,12 +2554,9 @@ static void seg6_local_destroy_state(struct lwtunnel_state *lwt)
{
struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
- kfree(slwt->srh);
+ seg6_local_lwtunnel_destroy_state(slwt);
- if (slwt->desc->attrs & (1 << SEG6_LOCAL_BPF)) {
- kfree(slwt->bpf.name);
- bpf_prog_put(slwt->bpf.prog);
- }
+ destroy_attrs(slwt);
return;
}
@@ -1012,13 +2566,16 @@ static int seg6_local_fill_encap(struct sk_buff *skb,
{
struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt);
struct seg6_action_param *param;
+ unsigned long attrs;
int i, err;
if (nla_put_u32(skb, SEG6_LOCAL_ACTION, slwt->action))
return -EMSGSIZE;
- for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
- if (slwt->desc->attrs & (1 << i)) {
+ attrs = slwt->desc->attrs | slwt->parsed_optattrs;
+
+ for (i = SEG6_LOCAL_SRH; i < SEG6_LOCAL_MAX + 1; i++) {
+ if (attrs & SEG6_F_ATTR(i)) {
param = &seg6_action_params[i];
err = param->put(skb, slwt);
if (err < 0)
@@ -1037,31 +2594,46 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt)
nlsize = nla_total_size(4); /* action */
- attrs = slwt->desc->attrs;
+ attrs = slwt->desc->attrs | slwt->parsed_optattrs;
- if (attrs & (1 << SEG6_LOCAL_SRH))
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_SRH))
nlsize += nla_total_size((slwt->srh->hdrlen + 1) << 3);
- if (attrs & (1 << SEG6_LOCAL_TABLE))
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_TABLE))
nlsize += nla_total_size(4);
- if (attrs & (1 << SEG6_LOCAL_NH4))
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_NH4))
nlsize += nla_total_size(4);
- if (attrs & (1 << SEG6_LOCAL_NH6))
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_NH6))
nlsize += nla_total_size(16);
- if (attrs & (1 << SEG6_LOCAL_IIF))
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_IIF))
nlsize += nla_total_size(4);
- if (attrs & (1 << SEG6_LOCAL_OIF))
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_OIF))
nlsize += nla_total_size(4);
- if (attrs & (1 << SEG6_LOCAL_BPF))
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_BPF))
nlsize += nla_total_size(sizeof(struct nlattr)) +
nla_total_size(MAX_PROG_NAME) +
nla_total_size(4);
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_VRFTABLE))
+ nlsize += nla_total_size(4);
+
+ if (attrs & SEG6_F_LOCAL_COUNTERS)
+ nlsize += nla_total_size(0) + /* nest SEG6_LOCAL_COUNTERS */
+ /* SEG6_LOCAL_CNT_PACKETS */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* SEG6_LOCAL_CNT_BYTES */
+ nla_total_size_64bit(sizeof(__u64)) +
+ /* SEG6_LOCAL_CNT_ERRORS */
+ nla_total_size_64bit(sizeof(__u64));
+
+ if (attrs & SEG6_F_ATTR(SEG6_LOCAL_FLAVORS))
+ nlsize += encap_size_flavors(slwt);
+
return nlsize;
}
@@ -1070,6 +2642,7 @@ static int seg6_local_cmp_encap(struct lwtunnel_state *a,
{
struct seg6_local_lwt *slwt_a, *slwt_b;
struct seg6_action_param *param;
+ unsigned long attrs_a, attrs_b;
int i;
slwt_a = seg6_local_lwtunnel(a);
@@ -1078,11 +2651,14 @@ static int seg6_local_cmp_encap(struct lwtunnel_state *a,
if (slwt_a->action != slwt_b->action)
return 1;
- if (slwt_a->desc->attrs != slwt_b->desc->attrs)
+ attrs_a = slwt_a->desc->attrs | slwt_a->parsed_optattrs;
+ attrs_b = slwt_b->desc->attrs | slwt_b->parsed_optattrs;
+
+ if (attrs_a != attrs_b)
return 1;
- for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) {
- if (slwt_a->desc->attrs & (1 << i)) {
+ for (i = SEG6_LOCAL_SRH; i < SEG6_LOCAL_MAX + 1; i++) {
+ if (attrs_a & SEG6_F_ATTR(i)) {
param = &seg6_action_params[i];
if (param->cmp(slwt_a, slwt_b))
return 1;
@@ -1104,6 +2680,36 @@ static const struct lwtunnel_encap_ops seg6_local_ops = {
int __init seg6_local_init(void)
{
+ /* If the max total number of defined attributes is reached, then your
+ * kernel build stops here.
+ *
+ * This check is required to avoid arithmetic overflows when processing
+ * behavior attributes and the maximum number of defined attributes
+ * exceeds the allowed value.
+ */
+ BUILD_BUG_ON(SEG6_LOCAL_MAX + 1 > BITS_PER_TYPE(unsigned long));
+
+ /* Check whether the number of defined flavors exceeds the maximum
+ * allowed value.
+ */
+ BUILD_BUG_ON(SEG6_LOCAL_FLV_OP_MAX + 1 > BITS_PER_TYPE(__u32));
+
+ /* If the default NEXT-C-SID Locator-Block/Node Function lengths (in
+ * bits) have been changed with invalid values, kernel build stops
+ * here.
+ */
+ BUILD_BUG_ON(next_csid_chk_cntr_bits(SEG6_LOCAL_LCBLOCK_DBITS,
+ SEG6_LOCAL_LCNODE_FN_DBITS));
+ BUILD_BUG_ON(next_csid_chk_lcblock_bits(SEG6_LOCAL_LCBLOCK_DBITS));
+ BUILD_BUG_ON(next_csid_chk_lcnode_fn_bits(SEG6_LOCAL_LCNODE_FN_DBITS));
+
+ /* To be memory efficient, we use 'u8' to represent the different
+ * actions related to RFC8986 flavors. If the kernel build stops here,
+ * it means that it is not possible to correctly encode these actions
+ * with the data type chosen for the action table.
+ */
+ BUILD_BUG_ON(SEG6_LOCAL_FLV_ACT_MAX > (typeof(flv8986_act_tbl[0]))~0U);
+
return lwtunnel_encap_add_ops(&seg6_local_ops,
LWTUNNEL_ENCAP_SEG6_LOCAL);
}
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index e9400ffa7875..cf37ad9686e6 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 over IPv4 tunnel device - Simple Internet Transition (SIT)
* Linux INET6 implementation
@@ -6,11 +7,6 @@
* Pedro Roque <roque@di.fc.ul.pt>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Changes:
* Roger Venning <r.venning@telstra.com>: 6to4 support
* Nate Thompson <nate@thebog.net>: 6to4 support
@@ -55,6 +51,8 @@
#include <net/dsfield.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/netdev_lock.h>
+#include <net/inet_dscp.h>
/*
This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c
@@ -87,6 +85,13 @@ struct sit_net {
struct net_device *fb_tunnel_dev;
};
+static inline struct sit_net *dev_to_sit_net(struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ return net_generic(t->net, sit_net_id);
+}
+
/*
* Must be invoked with rcu_read_lock
*/
@@ -129,8 +134,8 @@ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net,
return NULL;
}
-static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn,
- struct ip_tunnel_parm *parms)
+static struct ip_tunnel __rcu **
+__ipip6_bucket(struct sit_net *sitn, struct ip_tunnel_parm_kern *parms)
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
@@ -197,14 +202,13 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn)
static int ipip6_tunnel_create(struct net_device *dev)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct net *net = dev_net(dev);
- struct sit_net *sitn = net_generic(net, sit_net_id);
+ struct sit_net *sitn = net_generic(t->net, sit_net_id);
int err;
- memcpy(dev->dev_addr, &t->parms.iph.saddr, 4);
+ __dev_addr_set(dev, &t->parms.iph.saddr, 4);
memcpy(dev->broadcast, &t->parms.iph.daddr, 4);
- if ((__force u16)t->parms.i_flags & SIT_ISATAP)
+ if (test_bit(IP_TUNNEL_SIT_ISATAP_BIT, t->parms.i_flags))
dev->priv_flags |= IFF_ISATAP;
dev->rtnl_link_ops = &sit_link_ops;
@@ -215,8 +219,6 @@ static int ipip6_tunnel_create(struct net_device *dev)
ipip6_tunnel_clone_6rd(dev, sitn);
- dev_hold(dev);
-
ipip6_tunnel_link(sitn, t);
return 0;
@@ -225,7 +227,8 @@ out:
}
static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
- struct ip_tunnel_parm *parms, int create)
+ struct ip_tunnel_parm_kern *parms,
+ int create)
{
__be32 remote = parms->iph.daddr;
__be32 local = parms->iph.saddr;
@@ -253,7 +256,7 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
if (parms->name[0]) {
if (!dev_valid_name(parms->name))
goto failed;
- strlcpy(name, parms->name, IFNAMSIZ);
+ strscpy(name, parms->name, IFNAMSIZ);
} else {
strcpy(name, "sit%d");
}
@@ -266,10 +269,14 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
nt = netdev_priv(dev);
+ nt->net = net;
nt->parms = *parms;
if (ipip6_tunnel_create(dev) < 0)
goto failed_free;
+ if (!parms->name[0])
+ strcpy(parms->name, dev->name);
+
return nt;
failed_free:
@@ -295,14 +302,17 @@ __ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr)
}
-static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
- struct ip_tunnel_prl __user *a)
+static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __user *a)
{
+ struct ip_tunnel *t = netdev_priv(dev);
struct ip_tunnel_prl kprl, *kp;
struct ip_tunnel_prl_entry *prl;
unsigned int cmax, c = 0, ca, len;
int ret = 0;
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev)
+ return -EINVAL;
+
if (copy_from_user(&kprl, a, sizeof(kprl)))
return -EFAULT;
cmax = kprl.datalen / sizeof(kprl);
@@ -313,12 +323,10 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
* we try harder to allocate.
*/
kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ?
- kcalloc(cmax, sizeof(*kp), GFP_KERNEL | __GFP_NOWARN) :
+ kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) :
NULL;
- rcu_read_lock();
-
- ca = t->prl_count < cmax ? t->prl_count : cmax;
+ ca = min(t->prl_count, cmax);
if (!kp) {
/* We don't try hard to allocate much memory for
@@ -326,14 +334,15 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
* For root users, retry allocating enough memory for
* the answer.
*/
- kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC);
+ kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC | __GFP_ACCOUNT |
+ __GFP_NOWARN);
if (!kp) {
ret = -ENOMEM;
goto out;
}
}
- c = 0;
+ rcu_read_lock();
for_each_prl_rcu(t->prl) {
if (c >= cmax)
break;
@@ -345,7 +354,7 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
if (kprl.addr != htonl(INADDR_ANY))
break;
}
-out:
+
rcu_read_unlock();
len = sizeof(*kp) * c;
@@ -354,7 +363,7 @@ out:
ret = -EFAULT;
kfree(kp);
-
+out:
return ret;
}
@@ -445,6 +454,35 @@ out:
return err;
}
+static int ipip6_tunnel_prl_ctl(struct net_device *dev,
+ struct ip_tunnel_prl __user *data, int cmd)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_prl prl;
+ int err;
+
+ if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev)
+ return -EINVAL;
+
+ if (copy_from_user(&prl, data, sizeof(prl)))
+ return -EFAULT;
+
+ switch (cmd) {
+ case SIOCDELPRL:
+ err = ipip6_tunnel_del_prl(t, &prl);
+ break;
+ case SIOCADDPRL:
+ case SIOCCHGPRL:
+ err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL);
+ break;
+ }
+ dst_cache_reset(&t->dst_cache);
+ netdev_state_change(dev);
+ return err;
+}
+
static int
isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t)
{
@@ -484,7 +522,7 @@ static void ipip6_tunnel_uninit(struct net_device *dev)
ipip6_tunnel_del_prl(tunnel, NULL);
}
dst_cache_reset(&tunnel->dst_cache);
- dev_put(dev);
+ netdev_put(dev, &tunnel->dev_tracker);
}
static int ipip6_err(struct sk_buff *skb, u32 info)
@@ -534,19 +572,20 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, dev_net(skb->dev), info,
- t->parms.link, 0, iph->protocol, 0);
+ t->parms.link, iph->protocol);
err = 0;
goto out;
}
if (type == ICMP_REDIRECT) {
- ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
- iph->protocol, 0);
+ ipv4_redirect(skb, dev_net(skb->dev), t->parms.link,
+ iph->protocol);
err = 0;
goto out;
}
err = 0;
- if (!ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type, data_len))
+ if (__in6_dev_get(skb->dev) &&
+ !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type, data_len))
goto out;
if (t->parms.iph.daddr == 0)
@@ -648,8 +687,6 @@ static int ipip6_rcv(struct sk_buff *skb)
tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
iph->saddr, iph->daddr, sifindex);
if (tunnel) {
- struct pcpu_sw_netstats *tstats;
-
if (tunnel->parms.iph.protocol != IPPROTO_IPV6 &&
tunnel->parms.iph.protocol != 0)
goto out;
@@ -660,7 +697,7 @@ static int ipip6_rcv(struct sk_buff *skb)
skb->dev = tunnel->dev;
if (packet_is_spoofed(skb, iph, tunnel)) {
- tunnel->dev->stats.rx_errors++;
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto out;
}
@@ -668,23 +705,25 @@ static int ipip6_rcv(struct sk_buff *skb)
!net_eq(tunnel->net, dev_net(tunnel->dev))))
goto out;
+ /* skb can be uncloned in iptunnel_pull_header, so
+ * old iph is no longer valid
+ */
+ iph = (const struct iphdr *)skb_mac_header(skb);
+ skb_reset_mac_header(skb);
+
err = IP_ECN_decapsulate(iph, skb);
if (unlikely(err)) {
if (log_ecn_error)
net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
&iph->saddr, iph->tos);
if (err > 1) {
- ++tunnel->dev->stats.rx_frame_errors;
- ++tunnel->dev->stats.rx_errors;
+ DEV_STATS_INC(tunnel->dev, rx_frame_errors);
+ DEV_STATS_INC(tunnel->dev, rx_errors);
goto out;
}
}
- tstats = this_cpu_ptr(tunnel->dev->tstats);
- u64_stats_update_begin(&tstats->syncp);
- tstats->rx_packets++;
- tstats->rx_bytes += skb->len;
- u64_stats_update_end(&tstats->syncp);
+ dev_sw_netstats_rx_add(tunnel->dev, skb->len);
netif_rx(skb);
@@ -738,6 +777,8 @@ static int sit_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
tpi = &ipip_tpi;
if (iptunnel_pull_header(skb, 0, tpi->proto, false))
goto drop;
+ skb_reset_mac_header(skb);
+
return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);
}
@@ -777,8 +818,9 @@ static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst,
pbw0 = tunnel->ip6rd.prefixlen >> 5;
pbi0 = tunnel->ip6rd.prefixlen & 0x1f;
- d = (ntohl(v6dst->s6_addr32[pbw0]) << pbi0) >>
- tunnel->ip6rd.relay_prefixlen;
+ d = tunnel->ip6rd.relay_prefixlen < 32 ?
+ (ntohl(v6dst->s6_addr32[pbw0]) << pbi0) >>
+ tunnel->ip6rd.relay_prefixlen : 0;
pbi1 = pbi0 - tunnel->ip6rd.relay_prefixlen;
if (pbi1 > 0)
@@ -806,6 +848,49 @@ static inline __be32 try_6rd(struct ip_tunnel *tunnel,
return dst;
}
+static bool ipip6_tunnel_dst_find(struct sk_buff *skb, __be32 *dst,
+ bool is_isatap)
+{
+ const struct ipv6hdr *iph6 = ipv6_hdr(skb);
+ struct neighbour *neigh = NULL;
+ const struct in6_addr *addr6;
+ bool found = false;
+ int addr_type;
+
+ if (skb_dst(skb))
+ neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr);
+
+ if (!neigh) {
+ net_dbg_ratelimited("nexthop == NULL\n");
+ return false;
+ }
+
+ addr6 = (const struct in6_addr *)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if (is_isatap) {
+ if ((addr_type & IPV6_ADDR_UNICAST) &&
+ ipv6_addr_is_isatap(addr6)) {
+ *dst = addr6->s6_addr32[3];
+ found = true;
+ }
+ } else {
+ if (addr_type == IPV6_ADDR_ANY) {
+ addr6 = &ipv6_hdr(skb)->daddr;
+ addr_type = ipv6_addr_type(addr6);
+ }
+
+ if ((addr_type & IPV6_ADDR_COMPATv4) != 0) {
+ *dst = addr6->s6_addr32[3];
+ found = true;
+ }
+ }
+
+ neigh_release(neigh);
+
+ return found;
+}
+
/*
* This function assumes it is being called from dev_queue_xmit()
* and that skb is filled properly by that function.
@@ -825,8 +910,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
__be32 dst = tiph->daddr;
struct flowi4 fl4;
int mtu;
- const struct in6_addr *addr6;
- int addr_type;
u8 ttl;
u8 protocol = IPPROTO_IPV6;
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
@@ -835,85 +918,41 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
tos = ipv6_get_dsfield(iph6);
/* ISATAP (RFC4214) - must come before 6to4 */
- if (dev->priv_flags & IFF_ISATAP) {
- struct neighbour *neigh = NULL;
- bool do_tx_error = false;
-
- if (skb_dst(skb))
- neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr);
-
- if (!neigh) {
- net_dbg_ratelimited("nexthop == NULL\n");
- goto tx_error;
- }
-
- addr6 = (const struct in6_addr *)&neigh->primary_key;
- addr_type = ipv6_addr_type(addr6);
-
- if ((addr_type & IPV6_ADDR_UNICAST) &&
- ipv6_addr_is_isatap(addr6))
- dst = addr6->s6_addr32[3];
- else
- do_tx_error = true;
-
- neigh_release(neigh);
- if (do_tx_error)
- goto tx_error;
- }
+ if ((dev->priv_flags & IFF_ISATAP) &&
+ !ipip6_tunnel_dst_find(skb, &dst, true))
+ goto tx_error;
if (!dst)
dst = try_6rd(tunnel, &iph6->daddr);
- if (!dst) {
- struct neighbour *neigh = NULL;
- bool do_tx_error = false;
-
- if (skb_dst(skb))
- neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr);
-
- if (!neigh) {
- net_dbg_ratelimited("nexthop == NULL\n");
- goto tx_error;
- }
-
- addr6 = (const struct in6_addr *)&neigh->primary_key;
- addr_type = ipv6_addr_type(addr6);
-
- if (addr_type == IPV6_ADDR_ANY) {
- addr6 = &ipv6_hdr(skb)->daddr;
- addr_type = ipv6_addr_type(addr6);
- }
-
- if ((addr_type & IPV6_ADDR_COMPATv4) != 0)
- dst = addr6->s6_addr32[3];
- else
- do_tx_error = true;
-
- neigh_release(neigh);
- if (do_tx_error)
- goto tx_error;
- }
+ if (!dst && !ipip6_tunnel_dst_find(skb, &dst, false))
+ goto tx_error;
flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark,
- RT_TOS(tos), RT_SCOPE_UNIVERSE, IPPROTO_IPV6,
- 0, dst, tiph->saddr, 0, 0,
+ tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE,
+ IPPROTO_IPV6, 0, dst, tiph->saddr, 0, 0,
sock_net_uid(tunnel->net, NULL));
- rt = ip_route_output_flow(tunnel->net, &fl4, NULL);
- if (IS_ERR(rt)) {
- dev->stats.tx_carrier_errors++;
- goto tx_error_icmp;
+ rt = dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr);
+ if (!rt) {
+ rt = ip_route_output_flow(tunnel->net, &fl4, NULL);
+ if (IS_ERR(rt)) {
+ DEV_STATS_INC(dev, tx_carrier_errors);
+ goto tx_error_icmp;
+ }
+ dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr);
}
- if (rt->rt_type != RTN_UNICAST) {
+
+ if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
ip_rt_put(rt);
- dev->stats.tx_carrier_errors++;
+ DEV_STATS_INC(dev, tx_carrier_errors);
goto tx_error_icmp;
}
tdev = rt->dst.dev;
if (tdev == dev) {
ip_rt_put(rt);
- dev->stats.collisions++;
+ DEV_STATS_INC(dev, collisions);
goto tx_error;
}
@@ -925,8 +964,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
if (df) {
mtu = dst_mtu(&rt->dst) - t_hlen;
- if (mtu < 68) {
- dev->stats.collisions++;
+ if (mtu < IPV4_MIN_MTU) {
+ DEV_STATS_INC(dev, collisions);
ip_rt_put(rt);
goto tx_error;
}
@@ -937,10 +976,10 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
}
if (tunnel->parms.iph.daddr)
- skb_dst_update_pmtu(skb, mtu);
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
if (skb->len > mtu && !skb_is_gso(skb)) {
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
ip_rt_put(rt);
goto tx_error;
}
@@ -965,7 +1004,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
if (!new_skb) {
ip_rt_put(rt);
- dev->stats.tx_dropped++;
+ DEV_STATS_INC(dev, tx_dropped);
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -980,7 +1019,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
ttl = iph6->hop_limit;
tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6));
- if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) {
+ if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) {
ip_rt_put(rt);
goto tx_error;
}
@@ -988,14 +1027,14 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
skb_set_inner_ipproto(skb, IPPROTO_IPV6);
iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
- df, !net_eq(tunnel->net, dev_net(dev)));
+ df, !net_eq(tunnel->net, dev_net(dev)), 0);
return NETDEV_TX_OK;
tx_error_icmp:
dst_link_failure(skb);
tx_error:
kfree_skb(skb);
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
return NETDEV_TX_OK;
}
@@ -1014,13 +1053,16 @@ static netdev_tx_t sit_tunnel_xmit__(struct sk_buff *skb,
return NETDEV_TX_OK;
tx_error:
kfree_skb(skb);
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
return NETDEV_TX_OK;
}
static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb,
struct net_device *dev)
{
+ if (!pskb_inet_may_pull(skb))
+ goto tx_err;
+
switch (skb->protocol) {
case htons(ETH_P_IP):
sit_tunnel_xmit__(skb, dev, IPPROTO_IPIP);
@@ -1040,7 +1082,7 @@ static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb,
return NETDEV_TX_OK;
tx_err:
- dev->stats.tx_errors++;
+ DEV_STATS_INC(dev, tx_errors);
kfree_skb(skb);
return NETDEV_TX_OK;
@@ -1048,12 +1090,13 @@ tx_err:
static void ipip6_tunnel_bind_dev(struct net_device *dev)
{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
struct net_device *tdev = NULL;
- struct ip_tunnel *tunnel;
+ int hlen = LL_MAX_HEADER;
const struct iphdr *iph;
struct flowi4 fl4;
- tunnel = netdev_priv(dev);
iph = &tunnel->parms.iph;
if (iph->daddr) {
@@ -1062,7 +1105,7 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev)
iph->daddr, iph->saddr,
0, 0,
IPPROTO_IPV6,
- RT_TOS(iph->tos),
+ iph->tos & INET_DSCP_MASK,
tunnel->parms.link);
if (!IS_ERR(rt)) {
@@ -1075,17 +1118,20 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev)
if (!tdev && tunnel->parms.link)
tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
- if (tdev) {
- int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+ if (tdev && !netif_is_l3_master(tdev)) {
+ int mtu;
- dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
- dev->mtu = tdev->mtu - t_hlen;
- if (dev->mtu < IPV6_MIN_MTU)
- dev->mtu = IPV6_MIN_MTU;
+ mtu = tdev->mtu - t_hlen;
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+ WRITE_ONCE(dev->mtu, mtu);
+ hlen = tdev->hard_header_len + tdev->needed_headroom;
}
+ dev->needed_headroom = t_hlen + hlen;
}
-static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p,
+static void ipip6_tunnel_update(struct ip_tunnel *t,
+ struct ip_tunnel_parm_kern *p,
__u32 fwmark)
{
struct net *net = t->net;
@@ -1095,7 +1141,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p,
synchronize_net();
t->parms.iph.saddr = p->iph.saddr;
t->parms.iph.daddr = p->iph.daddr;
- memcpy(t->dev->dev_addr, &p->iph.saddr, 4);
+ __dev_addr_set(t->dev, &p->iph.saddr, 4);
memcpy(t->dev->broadcast, &p->iph.daddr, 4);
ipip6_tunnel_link(sitn, t);
t->parms.iph.ttl = p->iph.ttl;
@@ -1141,7 +1187,54 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t,
netdev_state_change(t->dev);
return 0;
}
-#endif
+
+static int
+ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm_kern p;
+ struct ip_tunnel_6rd ip6rd;
+
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) {
+ if (!ip_tunnel_parm_from_user(&p, data))
+ return -EFAULT;
+ t = ipip6_tunnel_locate(t->net, &p, 0);
+ }
+ if (!t)
+ t = netdev_priv(dev);
+
+ ip6rd.prefix = t->ip6rd.prefix;
+ ip6rd.relay_prefix = t->ip6rd.relay_prefix;
+ ip6rd.prefixlen = t->ip6rd.prefixlen;
+ ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen;
+ if (copy_to_user(data, &ip6rd, sizeof(ip6rd)))
+ return -EFAULT;
+ return 0;
+}
+
+static int
+ipip6_tunnel_6rdctl(struct net_device *dev, struct ip_tunnel_6rd __user *data,
+ int cmd)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_6rd ip6rd;
+ int err;
+
+ if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&ip6rd, data, sizeof(ip6rd)))
+ return -EFAULT;
+
+ if (cmd != SIOCDEL6RD) {
+ err = ipip6_tunnel_update_6rd(t, &ip6rd);
+ if (err < 0)
+ return err;
+ } else
+ ipip6_tunnel_clone_6rd(dev, dev_to_sit_net(dev));
+ return 0;
+}
+
+#endif /* CONFIG_IPV6_SIT_6RD */
static bool ipip6_valid_ip_proto(u8 ipproto)
{
@@ -1154,194 +1247,156 @@ static bool ipip6_valid_ip_proto(u8 ipproto)
}
static int
-ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm_kern *p)
+{
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!ipip6_valid_ip_proto(p->iph.protocol))
+ return -EINVAL;
+ if (p->iph.version != 4 ||
+ p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)))
+ return -EINVAL;
+
+ if (p->iph.ttl)
+ p->iph.frag_off |= htons(IP_DF);
+ return 0;
+}
+
+static int
+ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm_kern *p)
{
- int err = 0;
- struct ip_tunnel_parm p;
- struct ip_tunnel_prl prl;
struct ip_tunnel *t = netdev_priv(dev);
- struct net *net = t->net;
- struct sit_net *sitn = net_generic(net, sit_net_id);
-#ifdef CONFIG_IPV6_SIT_6RD
- struct ip_tunnel_6rd ip6rd;
-#endif
- switch (cmd) {
- case SIOCGETTUNNEL:
-#ifdef CONFIG_IPV6_SIT_6RD
- case SIOCGET6RD:
-#endif
- if (dev == sitn->fb_tunnel_dev) {
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
- err = -EFAULT;
- break;
- }
- t = ipip6_tunnel_locate(net, &p, 0);
- if (!t)
- t = netdev_priv(dev);
- }
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev)
+ t = ipip6_tunnel_locate(t->net, p, 0);
+ if (!t)
+ t = netdev_priv(dev);
+ memcpy(p, &t->parms, sizeof(*p));
+ return 0;
+}
- err = -EFAULT;
- if (cmd == SIOCGETTUNNEL) {
- memcpy(&p, &t->parms, sizeof(p));
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &p,
- sizeof(p)))
- goto done;
-#ifdef CONFIG_IPV6_SIT_6RD
+static int
+ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm_kern *p)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err;
+
+ err = __ipip6_tunnel_ioctl_validate(t->net, p);
+ if (err)
+ return err;
+
+ t = ipip6_tunnel_locate(t->net, p, 1);
+ if (!t)
+ return -ENOBUFS;
+ return 0;
+}
+
+static int
+ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm_kern *p)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ int err;
+
+ err = __ipip6_tunnel_ioctl_validate(t->net, p);
+ if (err)
+ return err;
+
+ t = ipip6_tunnel_locate(t->net, p, 0);
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) {
+ if (!t)
+ return -ENOENT;
+ } else {
+ if (t) {
+ if (t->dev != dev)
+ return -EEXIST;
} else {
- ip6rd.prefix = t->ip6rd.prefix;
- ip6rd.relay_prefix = t->ip6rd.relay_prefix;
- ip6rd.prefixlen = t->ip6rd.prefixlen;
- ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen;
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd,
- sizeof(ip6rd)))
- goto done;
-#endif
+ if (((dev->flags & IFF_POINTOPOINT) && !p->iph.daddr) ||
+ (!(dev->flags & IFF_POINTOPOINT) && p->iph.daddr))
+ return -EINVAL;
+ t = netdev_priv(dev);
}
- err = 0;
- break;
- case SIOCADDTUNNEL:
- case SIOCCHGTUNNEL:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
-
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
-
- err = -EINVAL;
- if (!ipip6_valid_ip_proto(p.iph.protocol))
- goto done;
- if (p.iph.version != 4 ||
- p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
- goto done;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
- t = ipip6_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
-
- if (dev != sitn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
- if (t) {
- if (t->dev != dev) {
- err = -EEXIST;
- break;
- }
- } else {
- if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
- (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
- err = -EINVAL;
- break;
- }
- t = netdev_priv(dev);
- }
+ ipip6_tunnel_update(t, p, t->fwmark);
+ }
- ipip6_tunnel_update(t, &p, t->fwmark);
- }
+ return 0;
+}
- if (t) {
- err = 0;
- if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
- err = -EFAULT;
- } else
- err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
- break;
+static int
+ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm_kern *p)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+
+ if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+ if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) {
+ t = ipip6_tunnel_locate(t->net, p, 0);
+ if (!t)
+ return -ENOENT;
+ if (t == netdev_priv(dev_to_sit_net(dev)->fb_tunnel_dev))
+ return -EPERM;
+ dev = t->dev;
+ }
+ unregister_netdevice(dev);
+ return 0;
+}
+
+static int
+ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p,
+ int cmd)
+{
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ return ipip6_tunnel_get(dev, p);
+ case SIOCADDTUNNEL:
+ return ipip6_tunnel_add(dev, p);
+ case SIOCCHGTUNNEL:
+ return ipip6_tunnel_change(dev, p);
case SIOCDELTUNNEL:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
-
- if (dev == sitn->fb_tunnel_dev) {
- err = -EFAULT;
- if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
- goto done;
- err = -ENOENT;
- t = ipip6_tunnel_locate(net, &p, 0);
- if (!t)
- goto done;
- err = -EPERM;
- if (t == netdev_priv(sitn->fb_tunnel_dev))
- goto done;
- dev = t->dev;
- }
- unregister_netdevice(dev);
- err = 0;
- break;
+ return ipip6_tunnel_del(dev, p);
+ default:
+ return -EINVAL;
+ }
+}
+static int
+ipip6_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+ void __user *data, int cmd)
+{
+ switch (cmd) {
+ case SIOCGETTUNNEL:
+ case SIOCADDTUNNEL:
+ case SIOCCHGTUNNEL:
+ case SIOCDELTUNNEL:
+ return ip_tunnel_siocdevprivate(dev, ifr, data, cmd);
case SIOCGETPRL:
- err = -EINVAL;
- if (dev == sitn->fb_tunnel_dev)
- goto done;
- err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data);
- break;
-
+ return ipip6_tunnel_get_prl(dev, data);
case SIOCADDPRL:
case SIOCDELPRL:
case SIOCCHGPRL:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
- err = -EINVAL;
- if (dev == sitn->fb_tunnel_dev)
- goto done;
- err = -EFAULT;
- if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl)))
- goto done;
-
- switch (cmd) {
- case SIOCDELPRL:
- err = ipip6_tunnel_del_prl(t, &prl);
- break;
- case SIOCADDPRL:
- case SIOCCHGPRL:
- err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL);
- break;
- }
- dst_cache_reset(&t->dst_cache);
- netdev_state_change(dev);
- break;
-
+ return ipip6_tunnel_prl_ctl(dev, data, cmd);
#ifdef CONFIG_IPV6_SIT_6RD
+ case SIOCGET6RD:
+ return ipip6_tunnel_get6rd(dev, data);
case SIOCADD6RD:
case SIOCCHG6RD:
case SIOCDEL6RD:
- err = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
- goto done;
-
- err = -EFAULT;
- if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data,
- sizeof(ip6rd)))
- goto done;
-
- if (cmd != SIOCDEL6RD) {
- err = ipip6_tunnel_update_6rd(t, &ip6rd);
- if (err < 0)
- goto done;
- } else
- ipip6_tunnel_clone_6rd(dev, sitn);
-
- err = 0;
- break;
+ return ipip6_tunnel_6rdctl(dev, data, cmd);
#endif
-
default:
- err = -EINVAL;
+ return -EINVAL;
}
-
-done:
- return err;
}
static const struct net_device_ops ipip6_netdev_ops = {
.ndo_init = ipip6_tunnel_init,
.ndo_uninit = ipip6_tunnel_uninit,
.ndo_start_xmit = sit_tunnel_xmit,
- .ndo_do_ioctl = ipip6_tunnel_ioctl,
- .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_siocdevprivate = ipip6_tunnel_siocdevprivate,
.ndo_get_iflink = ip_tunnel_get_iflink,
+ .ndo_tunnel_ctl = ipip6_tunnel_ctl,
};
static void ipip6_dev_free(struct net_device *dev)
@@ -1349,7 +1404,6 @@ static void ipip6_dev_free(struct net_device *dev)
struct ip_tunnel *tunnel = netdev_priv(dev);
dst_cache_destroy(&tunnel->dst_cache);
- free_percpu(dev->tstats);
}
#define SIT_FEATURES (NETIF_F_SG | \
@@ -1364,20 +1418,22 @@ static void ipip6_tunnel_setup(struct net_device *dev)
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
dev->netdev_ops = &ipip6_netdev_ops;
+ dev->header_ops = &ip_tunnel_header_ops;
dev->needs_free_netdev = true;
dev->priv_destructor = ipip6_dev_free;
dev->type = ARPHRD_SIT;
- dev->hard_header_len = LL_MAX_HEADER + t_hlen;
dev->mtu = ETH_DATA_LEN - t_hlen;
dev->min_mtu = IPV6_MIN_MTU;
dev->max_mtu = IP6_MAX_MTU - t_hlen;
dev->flags = IFF_NOARP;
netif_keep_dst(dev);
dev->addr_len = 4;
- dev->features |= NETIF_F_LLTX;
+ dev->lltx = true;
dev->features |= SIT_FEATURES;
dev->hw_features |= SIT_FEATURES;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
+
}
static int ipip6_tunnel_init(struct net_device *dev)
@@ -1386,21 +1442,16 @@ static int ipip6_tunnel_init(struct net_device *dev)
int err;
tunnel->dev = dev;
- tunnel->net = dev_net(dev);
strcpy(tunnel->parms.name, dev->name);
ipip6_tunnel_bind_dev(dev);
- dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
- if (!dev->tstats)
- return -ENOMEM;
err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
- if (err) {
- free_percpu(dev->tstats);
- dev->tstats = NULL;
+ if (err)
return err;
- }
+ netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL);
+ netdev_lockdep_set_classes(dev);
return 0;
}
@@ -1416,7 +1467,6 @@ static void __net_init ipip6_fb_tunnel_init(struct net_device *dev)
iph->ihl = 5;
iph->ttl = 64;
- dev_hold(dev);
rcu_assign_pointer(sitn->tunnels_wc[0], tunnel);
}
@@ -1436,7 +1486,7 @@ static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[],
}
static void ipip6_netlink_parms(struct nlattr *data[],
- struct ip_tunnel_parm *parms,
+ struct ip_tunnel_parm_kern *parms,
__u32 *fwmark)
{
memset(parms, 0, sizeof(*parms));
@@ -1449,71 +1499,12 @@ static void ipip6_netlink_parms(struct nlattr *data[],
if (!data)
return;
- if (data[IFLA_IPTUN_LINK])
- parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);
-
- if (data[IFLA_IPTUN_LOCAL])
- parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);
-
- if (data[IFLA_IPTUN_REMOTE])
- parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);
-
- if (data[IFLA_IPTUN_TTL]) {
- parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
- if (parms->iph.ttl)
- parms->iph.frag_off = htons(IP_DF);
- }
-
- if (data[IFLA_IPTUN_TOS])
- parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]);
-
- if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
- parms->iph.frag_off = htons(IP_DF);
-
- if (data[IFLA_IPTUN_FLAGS])
- parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]);
-
- if (data[IFLA_IPTUN_PROTO])
- parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);
+ ip_tunnel_netlink_parms(data, parms);
if (data[IFLA_IPTUN_FWMARK])
*fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
}
-/* This function returns true when ENCAP attributes are present in the nl msg */
-static bool ipip6_netlink_encap_parms(struct nlattr *data[],
- struct ip_tunnel_encap *ipencap)
-{
- bool ret = false;
-
- memset(ipencap, 0, sizeof(*ipencap));
-
- if (!data)
- return ret;
-
- if (data[IFLA_IPTUN_ENCAP_TYPE]) {
- ret = true;
- ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
- }
-
- if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
- ret = true;
- ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
- }
-
- if (data[IFLA_IPTUN_ENCAP_SPORT]) {
- ret = true;
- ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
- }
-
- if (data[IFLA_IPTUN_ENCAP_DPORT]) {
- ret = true;
- ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
- }
-
- return ret;
-}
-
#ifdef CONFIG_IPV6_SIT_6RD
/* This function returns true when 6RD attributes are present in the nl msg */
static bool ipip6_netlink_6rd_parms(struct nlattr *data[],
@@ -1551,21 +1542,25 @@ static bool ipip6_netlink_6rd_parms(struct nlattr *data[],
}
#endif
-static int ipip6_newlink(struct net *src_net, struct net_device *dev,
- struct nlattr *tb[], struct nlattr *data[],
+static int ipip6_newlink(struct net_device *dev,
+ struct rtnl_newlink_params *params,
struct netlink_ext_ack *extack)
{
- struct net *net = dev_net(dev);
+ struct nlattr **data = params->data;
+ struct nlattr **tb = params->tb;
struct ip_tunnel *nt;
struct ip_tunnel_encap ipencap;
#ifdef CONFIG_IPV6_SIT_6RD
struct ip_tunnel_6rd ip6rd;
#endif
+ struct net *net;
int err;
+ net = params->link_net ? : dev_net(dev);
nt = netdev_priv(dev);
+ nt->net = net;
- if (ipip6_netlink_encap_parms(data, &ipencap)) {
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
err = ip_tunnel_encap_setup(nt, &ipencap);
if (err < 0)
return err;
@@ -1589,8 +1584,11 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev,
}
#ifdef CONFIG_IPV6_SIT_6RD
- if (ipip6_netlink_6rd_parms(data, &ip6rd))
+ if (ipip6_netlink_6rd_parms(data, &ip6rd)) {
err = ipip6_tunnel_update_6rd(nt, &ip6rd);
+ if (err < 0)
+ unregister_netdevice_queue(dev, NULL);
+ }
#endif
return err;
@@ -1601,8 +1599,8 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[],
struct netlink_ext_ack *extack)
{
struct ip_tunnel *t = netdev_priv(dev);
- struct ip_tunnel_parm p;
struct ip_tunnel_encap ipencap;
+ struct ip_tunnel_parm_kern p;
struct net *net = t->net;
struct sit_net *sitn = net_generic(net, sit_net_id);
#ifdef CONFIG_IPV6_SIT_6RD
@@ -1614,7 +1612,7 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[],
if (dev == sitn->fb_tunnel_dev)
return -EINVAL;
- if (ipip6_netlink_encap_parms(data, &ipencap)) {
+ if (ip_tunnel_netlink_encap_parms(data, &ipencap)) {
err = ip_tunnel_encap_setup(t, &ipencap);
if (err < 0)
return err;
@@ -1689,7 +1687,7 @@ static size_t ipip6_get_size(const struct net_device *dev)
static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct ip_tunnel_parm *parm = &tunnel->parms;
+ struct ip_tunnel_parm_kern *parm = &tunnel->parms;
if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
@@ -1699,7 +1697,8 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
!!(parm->iph.frag_off & htons(IP_DF))) ||
nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) ||
- nla_put_be16(skb, IFLA_IPTUN_FLAGS, parm->i_flags) ||
+ nla_put_be16(skb, IFLA_IPTUN_FLAGS,
+ ip_tunnel_flags_to_be16(parm->i_flags)) ||
nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark))
goto nla_put_failure;
@@ -1797,8 +1796,7 @@ static struct xfrm_tunnel mplsip_handler __read_mostly = {
};
#endif
-static void __net_exit sit_destroy_tunnels(struct net *net,
- struct list_head *head)
+static void __net_exit sit_exit_rtnl_net(struct net *net, struct list_head *head)
{
struct sit_net *sitn = net_generic(net, sit_net_id);
struct net_device *dev, *aux;
@@ -1808,20 +1806,20 @@ static void __net_exit sit_destroy_tunnels(struct net *net,
if (dev->rtnl_link_ops == &sit_link_ops)
unregister_netdevice_queue(dev, head);
- for (prio = 1; prio < 4; prio++) {
+ for (prio = 0; prio < 4; prio++) {
int h;
- for (h = 0; h < IP6_SIT_HASH_SIZE; h++) {
+ for (h = 0; h < (prio ? IP6_SIT_HASH_SIZE : 1); h++) {
struct ip_tunnel *t;
- t = rtnl_dereference(sitn->tunnels[prio][h]);
+ t = rtnl_net_dereference(net, sitn->tunnels[prio][h]);
while (t) {
/* If dev is in the same netns, it has already
* been added to the list by the previous loop.
*/
if (!net_eq(dev_net(t->dev), net))
- unregister_netdevice_queue(t->dev,
- head);
- t = rtnl_dereference(t->next);
+ unregister_netdevice_queue(t->dev, head);
+
+ t = rtnl_net_dereference(net, t->next);
}
}
}
@@ -1853,7 +1851,10 @@ static int __net_init sit_init_net(struct net *net)
/* FB netdevice is special: we have one, and only one per netns.
* Allowing to move it to another netns is clearly unsafe.
*/
- sitn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
+ sitn->fb_tunnel_dev->netns_immutable = true;
+
+ t = netdev_priv(sitn->fb_tunnel_dev);
+ t->net = net;
err = register_netdev(sitn->fb_tunnel_dev);
if (err)
@@ -1862,33 +1863,18 @@ static int __net_init sit_init_net(struct net *net)
ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn);
ipip6_fb_tunnel_init(sitn->fb_tunnel_dev);
- t = netdev_priv(sitn->fb_tunnel_dev);
-
strcpy(t->parms.name, sitn->fb_tunnel_dev->name);
return 0;
err_reg_dev:
- ipip6_dev_free(sitn->fb_tunnel_dev);
+ free_netdev(sitn->fb_tunnel_dev);
err_alloc_dev:
return err;
}
-static void __net_exit sit_exit_batch_net(struct list_head *net_list)
-{
- LIST_HEAD(list);
- struct net *net;
-
- rtnl_lock();
- list_for_each_entry(net, net_list, exit_list)
- sit_destroy_tunnels(net, &list);
-
- unregister_netdevice_many(&list);
- rtnl_unlock();
-}
-
static struct pernet_operations sit_net_ops = {
.init = sit_init_net,
- .exit_batch = sit_exit_batch_net,
+ .exit_rtnl = sit_exit_rtnl_net,
.id = &sit_net_id,
.size = sizeof(struct sit_net),
};
@@ -1954,6 +1940,7 @@ xfrm_tunnel_failed:
module_init(sit_init);
module_exit(sit_cleanup);
+MODULE_DESCRIPTION("IPv6-in-IPv4 tunnel SIT driver");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("sit");
MODULE_ALIAS_NETDEV("sit0");
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index e997141aed8c..7e007f013ec8 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPv6 Syncookies implementation for the Linux kernel
*
@@ -6,12 +7,6 @@
*
* Based on IPv4 implementation by Andi Kleen
* linux/net/ipv4/syncookies.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <linux/tcp.h>
@@ -21,11 +16,12 @@
#include <net/secure_seq.h>
#include <net/ipv6.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
-static siphash_key_t syncookie6_secret[2] __read_mostly;
+static siphash_aligned_key_t syncookie6_secret[2];
/* RFC 2460, Section 8.3:
* [ipv6 tcp] MSS must be computed as the maximum packet size minus 60 [..]
@@ -119,78 +115,97 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mssp)
return __cookie_v6_init_sequence(iph, th, mssp);
}
-int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
- __u32 cookie)
+int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th)
{
+ __u32 cookie = ntohl(th->ack_seq) - 1;
__u32 seq = ntohl(th->seq) - 1;
- __u32 mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr,
- th->source, th->dest, seq);
+ __u32 mssind;
+
+ mssind = check_tcp_syn_cookie(cookie, &iph->saddr, &iph->daddr,
+ th->source, th->dest, seq);
return mssind < ARRAY_SIZE(msstab) ? msstab[mssind] : 0;
}
EXPORT_SYMBOL_GPL(__cookie_v6_check);
-struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
+static struct request_sock *cookie_tcp_check(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
struct tcp_options_received tcp_opt;
- struct inet_request_sock *ireq;
- struct tcp_request_sock *treq;
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
- const struct tcphdr *th = tcp_hdr(skb);
- __u32 cookie = ntohl(th->ack_seq) - 1;
- struct sock *ret = sk;
- struct request_sock *req;
- int mss;
- struct dst_entry *dst;
- __u8 rcv_wscale;
u32 tsoff = 0;
-
- if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst)
- goto out;
+ int mss;
if (tcp_synq_no_recent_overflow(sk))
goto out;
- mss = __cookie_v6_check(ipv6_hdr(skb), th, cookie);
- if (mss == 0) {
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESFAILED);
+ mss = __cookie_v6_check(ipv6_hdr(skb), tcp_hdr(skb));
+ if (!mss) {
+ __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED);
goto out;
}
- __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESRECV);
+ __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV);
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
- tcp_parse_options(sock_net(sk), skb, &tcp_opt, 0, NULL);
+ tcp_parse_options(net, skb, &tcp_opt, 0, NULL);
if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) {
- tsoff = secure_tcpv6_ts_off(sock_net(sk),
+ tsoff = secure_tcpv6_ts_off(net,
ipv6_hdr(skb)->daddr.s6_addr32,
ipv6_hdr(skb)->saddr.s6_addr32);
tcp_opt.rcv_tsecr -= tsoff;
}
- if (!cookie_timestamp_decode(sock_net(sk), &tcp_opt))
+ if (!cookie_timestamp_decode(net, &tcp_opt))
goto out;
- ret = NULL;
- req = inet_reqsk_alloc(&tcp6_request_sock_ops, sk, false);
- if (!req)
+ return cookie_tcp_reqsk_alloc(&tcp6_request_sock_ops, sk, skb,
+ &tcp_opt, mss, tsoff);
+out:
+ return ERR_PTR(-EINVAL);
+}
+
+struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
+{
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_request_sock *ireq;
+ struct net *net = sock_net(sk);
+ struct request_sock *req;
+ struct dst_entry *dst;
+ struct sock *ret = sk;
+ __u8 rcv_wscale;
+ int full_space;
+ SKB_DR(reason);
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_syncookies) ||
+ !th->ack || th->rst)
goto out;
- ireq = inet_rsk(req);
- treq = tcp_rsk(req);
- treq->tfo_listener = false;
+ if (cookie_bpf_ok(skb)) {
+ req = cookie_bpf_check(sk, skb);
+ } else {
+ req = cookie_tcp_check(net, sk, skb);
+ if (IS_ERR(req))
+ goto out;
+ }
+ if (!req) {
+ SKB_DR_SET(reason, NO_SOCKET);
+ goto out_drop;
+ }
- if (security_inet_conn_request(sk, skb, req))
- goto out_free;
+ ireq = inet_rsk(req);
- req->mss = mss;
- ireq->ir_rmt_port = th->source;
- ireq->ir_num = ntohs(th->dest);
ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
+
+ if (security_inet_conn_request(sk, skb, req)) {
+ SKB_DR_SET(reason, SECURITY_HOOK);
+ goto out_free;
+ }
+
if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) ||
np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
@@ -198,27 +213,12 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
ireq->pktopts = skb;
}
- ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
/* So that link locals have meaning */
if (!sk->sk_bound_dev_if &&
ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
ireq->ir_iif = tcp_v6_iif(skb);
- ireq->ir_mark = inet_request_mark(sk, skb);
-
- req->num_retrans = 0;
- ireq->snd_wscale = tcp_opt.snd_wscale;
- ireq->sack_ok = tcp_opt.sack_ok;
- ireq->wscale_ok = tcp_opt.wscale_ok;
- ireq->tstamp_ok = tcp_opt.saw_tstamp;
- req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0;
- treq->snt_synack = 0;
- treq->rcv_isn = ntohl(th->seq) - 1;
- treq->snt_isn = cookie;
- treq->ts_off = 0;
- treq->txhash = net_tx_rndhash();
- if (IS_ENABLED(CONFIG_SMC))
- ireq->smc_ok = 0;
+ tcp_ao_syncookie(sk, skb, req, AF_INET6);
/*
* We need to lookup the dst_entry to get the correct window size.
@@ -237,27 +237,46 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
fl6.flowi6_mark = ireq->ir_mark;
fl6.fl6_dport = ireq->ir_rmt_port;
fl6.fl6_sport = inet_sk(sk)->inet_sport;
- fl6.flowi6_uid = sk->sk_uid;
- security_req_classify_flow(req, flowi6_to_flowi(&fl6));
+ fl6.flowi6_uid = sk_uid(sk);
+ security_req_classify_flow(req, flowi6_to_flowi_common(&fl6));
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
- if (IS_ERR(dst))
+ dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
+ if (IS_ERR(dst)) {
+ SKB_DR_SET(reason, IP_OUTNOROUTES);
goto out_free;
+ }
}
- req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW);
- tcp_select_initial_window(sk, tcp_full_space(sk), req->mss,
+ req->rsk_window_clamp = READ_ONCE(tp->window_clamp) ? :dst_metric(dst, RTAX_WINDOW);
+ /* limit the window selection if the user enforces a smaller rx buffer */
+ full_space = tcp_full_space(sk);
+ if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+ (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
+ req->rsk_window_clamp = full_space;
+
+ tcp_select_initial_window(sk, full_space, req->mss,
&req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(dst, RTAX_INITRWND));
- ireq->rcv_wscale = rcv_wscale;
- ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst);
-
- ret = tcp_get_cookie_sock(sk, skb, req, dst, tsoff);
+ /* req->syncookie is set to true only if the ACK was validated
+ * by a BPF kfunc; in that case rcv_wscale is already configured.
+ */
+ if (!req->syncookie)
+ ireq->rcv_wscale = rcv_wscale;
+ ireq->ecn_ok &= cookie_ecn_ok(net, dst);
+ tcp_rsk(req)->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th);
+
+ ret = tcp_get_cookie_sock(sk, skb, req, dst);
+ if (!ret) {
+ SKB_DR_SET(reason, NO_SOCKET);
+ goto out_drop;
+ }
out:
return ret;
out_free:
reqsk_free(req);
+out_drop:
+ sk_skb_reason_drop(sk, skb, reason);
return NULL;
}
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index e15cd37024fd..d2cd33e2698d 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -17,25 +17,44 @@
#include <net/addrconf.h>
#include <net/inet_frag.h>
#include <net/netevent.h>
+#include <net/ip_fib.h>
#ifdef CONFIG_NETLABEL
#include <net/calipso.h>
#endif
+#include <linux/ioam6.h>
-static int zero;
-static int one = 1;
-static int auto_flowlabels_min;
+static int flowlabel_reflect_max = 0x7;
static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
+static u32 rt6_multipath_hash_fields_all_mask =
+ FIB_MULTIPATH_HASH_FIELD_ALL_MASK;
+static u32 ioam6_id_max = IOAM6_DEFAULT_ID;
+static u64 ioam6_id_wide_max = IOAM6_DEFAULT_ID_WIDE;
-static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static int proc_rt6_multipath_hash_policy(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net;
int ret;
net = container_of(table->data, struct net,
ipv6.sysctl.multipath_hash_policy);
- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net);
+
+ return ret;
+}
+
+static int
+proc_rt6_multipath_hash_fields(const struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct net *net;
+ int ret;
+
+ net = container_of(table->data, struct net,
+ ipv6.sysctl.multipath_hash_fields);
+ ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos);
if (write && ret == 0)
call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net);
@@ -46,39 +65,38 @@ static struct ctl_table ipv6_table_template[] = {
{
.procname = "bindv6only",
.data = &init_net.ipv6.sysctl.bindv6only,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "anycast_src_echo_reply",
.data = &init_net.ipv6.sysctl.anycast_src_echo_reply,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "flowlabel_consistency",
.data = &init_net.ipv6.sysctl.flowlabel_consistency,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "auto_flowlabels",
.data = &init_net.ipv6.sysctl.auto_flowlabels,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &auto_flowlabels_min,
+ .proc_handler = proc_dou8vec_minmax,
.extra2 = &auto_flowlabels_max
},
{
.procname = "fwmark_reflect",
.data = &init_net.ipv6.sysctl.fwmark_reflect,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "idgen_retries",
@@ -97,23 +115,25 @@ static struct ctl_table ipv6_table_template[] = {
{
.procname = "flowlabel_state_ranges",
.data = &init_net.ipv6.sysctl.flowlabel_state_ranges,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "ip_nonlocal_bind",
.data = &init_net.ipv6.sysctl.ip_nonlocal_bind,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
- .proc_handler = proc_dointvec
+ .proc_handler = proc_dou8vec_minmax,
},
{
.procname = "flowlabel_reflect",
.data = &init_net.ipv6.sysctl.flowlabel_reflect,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &flowlabel_reflect_max,
},
{
.procname = "max_dst_opts_number",
@@ -146,11 +166,20 @@ static struct ctl_table ipv6_table_template[] = {
{
.procname = "fib_multipath_hash_policy",
.data = &init_net.ipv6.sysctl.multipath_hash_policy,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(u8),
.mode = 0644,
.proc_handler = proc_rt6_multipath_hash_policy,
- .extra1 = &zero,
- .extra2 = &one,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_THREE,
+ },
+ {
+ .procname = "fib_multipath_hash_fields",
+ .data = &init_net.ipv6.sysctl.multipath_hash_fields,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_rt6_multipath_hash_fields,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = &rt6_multipath_hash_fields_all_mask,
},
{
.procname = "seg6_flowlabel",
@@ -159,7 +188,31 @@ static struct ctl_table ipv6_table_template[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
- { }
+ {
+ .procname = "fib_notify_on_flag_change",
+ .data = &init_net.ipv6.sysctl.fib_notify_on_flag_change,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_TWO,
+ },
+ {
+ .procname = "ioam6_id",
+ .data = &init_net.ipv6.sysctl.ioam6_id,
+ .maxlen = sizeof(u32),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra2 = &ioam6_id_max,
+ },
+ {
+ .procname = "ioam6_id_wide",
+ .data = &init_net.ipv6.sysctl.ioam6_id_wide,
+ .maxlen = sizeof(u64),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ .extra2 = &ioam6_id_wide_max,
+ },
};
static struct ctl_table ipv6_rotable[] = {
@@ -176,7 +229,7 @@ static struct ctl_table ipv6_rotable[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = &one
+ .extra1 = SYSCTL_ONE
},
#ifdef CONFIG_NETLABEL
{
@@ -194,37 +247,24 @@ static struct ctl_table ipv6_rotable[] = {
.proc_handler = proc_dointvec,
},
#endif /* CONFIG_NETLABEL */
- { }
};
static int __net_init ipv6_sysctl_net_init(struct net *net)
{
+ size_t table_size = ARRAY_SIZE(ipv6_table_template);
struct ctl_table *ipv6_table;
struct ctl_table *ipv6_route_table;
struct ctl_table *ipv6_icmp_table;
- int err;
+ int err, i;
err = -ENOMEM;
ipv6_table = kmemdup(ipv6_table_template, sizeof(ipv6_table_template),
GFP_KERNEL);
if (!ipv6_table)
goto out;
- ipv6_table[0].data = &net->ipv6.sysctl.bindv6only;
- ipv6_table[1].data = &net->ipv6.sysctl.anycast_src_echo_reply;
- ipv6_table[2].data = &net->ipv6.sysctl.flowlabel_consistency;
- ipv6_table[3].data = &net->ipv6.sysctl.auto_flowlabels;
- ipv6_table[4].data = &net->ipv6.sysctl.fwmark_reflect;
- ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries;
- ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay;
- ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges;
- ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind;
- ipv6_table[9].data = &net->ipv6.sysctl.flowlabel_reflect;
- ipv6_table[10].data = &net->ipv6.sysctl.max_dst_opts_cnt;
- ipv6_table[11].data = &net->ipv6.sysctl.max_hbh_opts_cnt;
- ipv6_table[12].data = &net->ipv6.sysctl.max_dst_opts_len;
- ipv6_table[13].data = &net->ipv6.sysctl.max_hbh_opts_len;
- ipv6_table[14].data = &net->ipv6.sysctl.multipath_hash_policy,
- ipv6_table[15].data = &net->ipv6.sysctl.seg6_flowlabel;
+ /* Update the variables to point into the current struct net */
+ for (i = 0; i < table_size; i++)
+ ipv6_table[i].data += (void *)net - (void *)&init_net;
ipv6_route_table = ipv6_route_sysctl_init(net);
if (!ipv6_route_table)
@@ -234,17 +274,22 @@ static int __net_init ipv6_sysctl_net_init(struct net *net)
if (!ipv6_icmp_table)
goto out_ipv6_route_table;
- net->ipv6.sysctl.hdr = register_net_sysctl(net, "net/ipv6", ipv6_table);
+ net->ipv6.sysctl.hdr = register_net_sysctl_sz(net, "net/ipv6",
+ ipv6_table, table_size);
if (!net->ipv6.sysctl.hdr)
goto out_ipv6_icmp_table;
- net->ipv6.sysctl.route_hdr =
- register_net_sysctl(net, "net/ipv6/route", ipv6_route_table);
+ net->ipv6.sysctl.route_hdr = register_net_sysctl_sz(net,
+ "net/ipv6/route",
+ ipv6_route_table,
+ ipv6_route_sysctl_table_size(net));
if (!net->ipv6.sysctl.route_hdr)
goto out_unregister_ipv6_table;
- net->ipv6.sysctl.icmp_hdr =
- register_net_sysctl(net, "net/ipv6/icmp", ipv6_icmp_table);
+ net->ipv6.sysctl.icmp_hdr = register_net_sysctl_sz(net,
+ "net/ipv6/icmp",
+ ipv6_icmp_table,
+ ipv6_icmp_sysctl_table_size());
if (!net->ipv6.sysctl.icmp_hdr)
goto out_unregister_route_table;
@@ -266,9 +311,9 @@ out_ipv6_table:
static void __net_exit ipv6_sysctl_net_exit(struct net *net)
{
- struct ctl_table *ipv6_table;
- struct ctl_table *ipv6_route_table;
- struct ctl_table *ipv6_icmp_table;
+ const struct ctl_table *ipv6_table;
+ const struct ctl_table *ipv6_route_table;
+ const struct ctl_table *ipv6_icmp_table;
ipv6_table = net->ipv6.sysctl.hdr->ctl_table_arg;
ipv6_route_table = net->ipv6.sysctl.route_hdr->ctl_table_arg;
diff --git a/net/ipv6/tcp_ao.c b/net/ipv6/tcp_ao.c
new file mode 100644
index 000000000000..3c09ac26206e
--- /dev/null
+++ b/net/ipv6/tcp_ao.c
@@ -0,0 +1,168 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * INET An implementation of the TCP Authentication Option (TCP-AO).
+ * See RFC5925.
+ *
+ * Authors: Dmitry Safonov <dima@arista.com>
+ * Francesco Ruggeri <fruggeri@arista.com>
+ * Salam Noureddine <noureddine@arista.com>
+ */
+#include <crypto/hash.h>
+#include <linux/tcp.h>
+
+#include <net/tcp.h>
+#include <net/ipv6.h>
+
+static int tcp_v6_ao_calc_key(struct tcp_ao_key *mkt, u8 *key,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __be16 sport, __be16 dport,
+ __be32 sisn, __be32 disn)
+{
+ struct kdf_input_block {
+ u8 counter;
+ u8 label[6];
+ struct tcp6_ao_context ctx;
+ __be16 outlen;
+ } __packed * tmp;
+ struct tcp_sigpool hp;
+ int err;
+
+ err = tcp_sigpool_start(mkt->tcp_sigpool_id, &hp);
+ if (err)
+ return err;
+
+ tmp = hp.scratch;
+ tmp->counter = 1;
+ memcpy(tmp->label, "TCP-AO", 6);
+ tmp->ctx.saddr = *saddr;
+ tmp->ctx.daddr = *daddr;
+ tmp->ctx.sport = sport;
+ tmp->ctx.dport = dport;
+ tmp->ctx.sisn = sisn;
+ tmp->ctx.disn = disn;
+ tmp->outlen = htons(tcp_ao_digest_size(mkt) * 8); /* in bits */
+
+ err = tcp_ao_calc_traffic_key(mkt, key, tmp, sizeof(*tmp), &hp);
+ tcp_sigpool_end(&hp);
+
+ return err;
+}
+
+int tcp_v6_ao_calc_key_skb(struct tcp_ao_key *mkt, u8 *key,
+ const struct sk_buff *skb,
+ __be32 sisn, __be32 disn)
+{
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ const struct tcphdr *th = tcp_hdr(skb);
+
+ return tcp_v6_ao_calc_key(mkt, key, &iph->saddr,
+ &iph->daddr, th->source,
+ th->dest, sisn, disn);
+}
+
+int tcp_v6_ao_calc_key_sk(struct tcp_ao_key *mkt, u8 *key,
+ const struct sock *sk, __be32 sisn,
+ __be32 disn, bool send)
+{
+ if (send)
+ return tcp_v6_ao_calc_key(mkt, key, &sk->sk_v6_rcv_saddr,
+ &sk->sk_v6_daddr, htons(sk->sk_num),
+ sk->sk_dport, sisn, disn);
+ else
+ return tcp_v6_ao_calc_key(mkt, key, &sk->sk_v6_daddr,
+ &sk->sk_v6_rcv_saddr, sk->sk_dport,
+ htons(sk->sk_num), disn, sisn);
+}
+
+int tcp_v6_ao_calc_key_rsk(struct tcp_ao_key *mkt, u8 *key,
+ struct request_sock *req)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+
+ return tcp_v6_ao_calc_key(mkt, key,
+ &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr,
+ htons(ireq->ir_num), ireq->ir_rmt_port,
+ htonl(tcp_rsk(req)->snt_isn),
+ htonl(tcp_rsk(req)->rcv_isn));
+}
+
+struct tcp_ao_key *tcp_v6_ao_lookup(const struct sock *sk,
+ struct sock *addr_sk,
+ int sndid, int rcvid)
+{
+ int l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+ addr_sk->sk_bound_dev_if);
+ struct in6_addr *addr = &addr_sk->sk_v6_daddr;
+
+ return tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr,
+ AF_INET6, sndid, rcvid);
+}
+
+struct tcp_ao_key *tcp_v6_ao_lookup_rsk(const struct sock *sk,
+ struct request_sock *req,
+ int sndid, int rcvid)
+{
+ struct inet_request_sock *ireq = inet_rsk(req);
+ struct in6_addr *addr = &ireq->ir_v6_rmt_addr;
+ int l3index;
+
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
+ return tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr,
+ AF_INET6, sndid, rcvid);
+}
+
+int tcp_v6_ao_hash_pseudoheader(struct tcp_sigpool *hp,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr, int nbytes)
+{
+ struct tcp6_pseudohdr *bp;
+ struct scatterlist sg;
+
+ bp = hp->scratch;
+ /* 1. TCP pseudo-header (RFC2460) */
+ bp->saddr = *saddr;
+ bp->daddr = *daddr;
+ bp->len = cpu_to_be32(nbytes);
+ bp->protocol = cpu_to_be32(IPPROTO_TCP);
+
+ sg_init_one(&sg, bp, sizeof(*bp));
+ ahash_request_set_crypt(hp->req, &sg, NULL, sizeof(*bp));
+ return crypto_ahash_update(hp->req);
+}
+
+int tcp_v6_ao_hash_skb(char *ao_hash, struct tcp_ao_key *key,
+ const struct sock *sk, const struct sk_buff *skb,
+ const u8 *tkey, int hash_offset, u32 sne)
+{
+ return tcp_ao_hash_skb(AF_INET6, ao_hash, key, sk, skb, tkey,
+ hash_offset, sne);
+}
+
+int tcp_v6_parse_ao(struct sock *sk, int cmd,
+ sockptr_t optval, int optlen)
+{
+ return tcp_parse_ao(sk, cmd, AF_INET6, optval, optlen);
+}
+
+int tcp_v6_ao_synack_hash(char *ao_hash, struct tcp_ao_key *ao_key,
+ struct request_sock *req, const struct sk_buff *skb,
+ int hash_offset, u32 sne)
+{
+ void *hash_buf = NULL;
+ int err;
+
+ hash_buf = kmalloc(tcp_ao_digest_size(ao_key), GFP_ATOMIC);
+ if (!hash_buf)
+ return -ENOMEM;
+
+ err = tcp_v6_ao_calc_key_rsk(ao_key, hash_buf, req);
+ if (err)
+ goto out;
+
+ err = tcp_ao_hash_skb(AF_INET6, ao_hash, ao_key, req_to_sk(req), skb,
+ hash_buf, hash_offset, sne);
+out:
+ kfree(hash_buf);
+ return err;
+}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 03e6b7a2bc53..280fe5978559 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* TCP over IPv6
* Linux INET6 implementation
@@ -16,11 +17,6 @@
* Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
* a single port at the same time.
* YOSHIFUJI Hideaki @USAGI: convert /proc/net/tcp6 to seq_file.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/bottom_half.h>
@@ -43,7 +39,9 @@
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
#include <linux/random.h>
+#include <linux/indirect_call_wrapper.h>
+#include <net/aligned_data.h>
#include <net/tcp.h>
#include <net/ndisc.h>
#include <net/inet6_hashtables.h>
@@ -61,45 +59,48 @@
#include <net/timewait_sock.h>
#include <net/inet_common.h>
#include <net/secure_seq.h>
+#include <net/hotdata.h>
#include <net/busy_poll.h>
+#include <net/rstreason.h>
+#include <net/psp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <crypto/hash.h>
-#include <linux/scatterlist.h>
+#include <crypto/md5.h>
#include <trace/events/tcp.h>
-static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb);
+static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
+ enum sk_rst_reason reason);
static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req);
-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
+INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb);
static const struct inet_connection_sock_af_ops ipv6_mapped;
-static const struct inet_connection_sock_af_ops ipv6_specific;
-#ifdef CONFIG_TCP_MD5SIG
+const struct inet_connection_sock_af_ops ipv6_specific;
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
static const struct tcp_sock_af_ops tcp_sock_ipv6_specific;
static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific;
-#else
-static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
- const struct in6_addr *addr)
-{
- return NULL;
-}
#endif
+/* Helper returning the inet6 address from a given tcp socket.
+ * It can be used in TCP stack instead of inet6_sk(sk).
+ * This avoids a dereference and allows compiler optimizations.
+ * It is a specialized version of inet6_sk_generic().
+ */
+#define tcp_inet6_sk(sk) (&container_of_const(tcp_sk(sk), \
+ struct tcp6_sock, tcp)->inet6)
+
static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
if (dst && dst_hold_safe(dst)) {
- const struct rt6_info *rt = (const struct rt6_info *)dst;
-
- sk->sk_rx_dst = dst;
- inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
- inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
+ rcu_assign_pointer(sk->sk_rx_dst, dst);
+ sk->sk_rx_dst_ifindex = skb->skb_iif;
+ sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst));
}
}
@@ -117,7 +118,7 @@ static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb)
ipv6_hdr(skb)->saddr.s6_addr32);
}
-static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
int addr_len)
{
/* This check is replicated from tcp_v6_connect() and intended to
@@ -129,24 +130,25 @@ static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
sock_owned_by_me(sk);
- return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr);
+ return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len);
}
-static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
+static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
int addr_len)
{
struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
- struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct tcp_sock *tp = tcp_sk(sk);
struct in6_addr *saddr = NULL, *final_p, final;
+ struct inet_timewait_death_row *tcp_death_row;
+ struct ipv6_pinfo *np = tcp_inet6_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct ipv6_txoptions *opt;
- struct flowi6 fl6;
struct dst_entry *dst;
+ struct flowi6 fl6;
int addr_type;
int err;
- struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
@@ -156,13 +158,13 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
memset(&fl6, 0, sizeof(fl6));
- if (np->sndflow) {
+ if (inet6_test_bit(SNDFLOW, sk)) {
fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
IP6_ECN_flow_init(fl6.flowlabel);
if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
struct ip6_flowlabel *flowlabel;
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (!flowlabel)
+ if (IS_ERR(flowlabel))
return -EINVAL;
fl6_sock_release(flowlabel);
}
@@ -206,7 +208,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
!ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) {
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
- tp->write_seq = 0;
+ WRITE_ONCE(tp->write_seq, 0);
}
sk->sk_v6_daddr = usin->sin6_addr;
@@ -220,28 +222,32 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
u32 exthdrlen = icsk->icsk_ext_hdr_len;
struct sockaddr_in sin;
- SOCK_DEBUG(sk, "connect: ipv4 mapped\n");
-
- if (__ipv6_only_sock(sk))
+ if (ipv6_only_sock(sk))
return -ENETUNREACH;
sin.sin_family = AF_INET;
sin.sin_port = usin->sin6_port;
sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3];
- icsk->icsk_af_ops = &ipv6_mapped;
+ /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */
+ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_mapped);
+ if (sk_is_mptcp(sk))
+ mptcpv6_handle_mapped(sk, true);
sk->sk_backlog_rcv = tcp_v4_do_rcv;
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
tp->af_specific = &tcp_sock_ipv6_mapped_specific;
#endif
- err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin));
+ err = tcp_v4_connect(sk, (struct sockaddr_unsized *)&sin, sizeof(sin));
if (err) {
icsk->icsk_ext_hdr_len = exthdrlen;
- icsk->icsk_af_ops = &ipv6_specific;
+ /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */
+ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_specific);
+ if (sk_is_mptcp(sk))
+ mptcpv6_handle_mapped(sk, false);
sk->sk_backlog_rcv = tcp_v6_do_rcv;
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
tp->af_specific = &tcp_sock_ipv6_specific;
#endif
goto failure;
@@ -257,26 +263,35 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl6.flowi6_proto = IPPROTO_TCP;
fl6.daddr = sk->sk_v6_daddr;
fl6.saddr = saddr ? *saddr : np->saddr;
+ fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label);
fl6.flowi6_oif = sk->sk_bound_dev_if;
fl6.flowi6_mark = sk->sk_mark;
fl6.fl6_dport = usin->sin6_port;
fl6.fl6_sport = inet->inet_sport;
- fl6.flowi6_uid = sk->sk_uid;
+ if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport)
+ fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT;
+ fl6.flowi6_uid = sk_uid(sk);
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
final_p = fl6_update_dst(&fl6, opt, &final);
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto failure;
}
+ tp->tcp_usec_ts = dst_tcp_usec_ts(dst);
+ tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
+
if (!saddr) {
saddr = &fl6.saddr;
- sk->sk_v6_rcv_saddr = *saddr;
+
+ err = inet_bhash2_update_saddr(sk, saddr, AF_INET6);
+ if (err)
+ goto failure;
}
/* set the source address */
@@ -284,12 +299,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
inet->inet_rcv_saddr = LOOPBACK4_IPV6;
sk->sk_gso_type = SKB_GSO_TCPV6;
- ip6_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, false, false);
- icsk->icsk_ext_hdr_len = 0;
+ icsk->icsk_ext_hdr_len = psp_sk_overhead(sk);
if (opt)
- icsk->icsk_ext_hdr_len = opt->opt_flen +
- opt->opt_nflen;
+ icsk->icsk_ext_hdr_len += opt->opt_flen +
+ opt->opt_nflen;
tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
@@ -304,12 +319,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
if (likely(!tp->repair)) {
if (!tp->write_seq)
- tp->write_seq = secure_tcpv6_seq(np->saddr.s6_addr32,
- sk->sk_v6_daddr.s6_addr32,
- inet->inet_sport,
- inet->inet_dport);
- tp->tsoffset = secure_tcpv6_ts_off(sock_net(sk),
- np->saddr.s6_addr32,
+ WRITE_ONCE(tp->write_seq,
+ secure_tcpv6_seq(np->saddr.s6_addr32,
+ sk->sk_v6_daddr.s6_addr32,
+ inet->inet_sport,
+ inet->inet_dport));
+ tp->tsoffset = secure_tcpv6_ts_off(net, np->saddr.s6_addr32,
sk->sk_v6_daddr.s6_addr32);
}
@@ -326,6 +341,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
late_failure:
tcp_set_state(sk, TCP_CLOSE);
+ inet_bhash2_reset_saddr(sk);
failure:
inet->inet_dport = 0;
sk->sk_route_caps = 0;
@@ -335,11 +351,20 @@ failure:
static void tcp_v6_mtu_reduced(struct sock *sk)
{
struct dst_entry *dst;
+ u32 mtu;
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
return;
- dst = inet6_csk_update_pmtu(sk, tcp_sk(sk)->mtu_info);
+ mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
+
+ /* Drop requests trying to increase our current mss.
+ * Check done in __ip6_rt_update_pmtu() is too late.
+ */
+ if (tcp_mtu_to_mss(sk, mtu) >= tcp_sk(sk)->mss_cache)
+ return;
+
+ dst = inet6_csk_update_pmtu(sk, mtu);
if (!dst)
return;
@@ -349,12 +374,12 @@ static void tcp_v6_mtu_reduced(struct sock *sk)
}
}
-static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
const struct tcphdr *th = (struct tcphdr *)(skb->data+offset);
- struct net *net = dev_net(skb->dev);
+ struct net *net = dev_net_rcu(skb->dev);
struct request_sock *fastopen;
struct ipv6_pinfo *np;
struct tcp_sock *tp;
@@ -363,25 +388,33 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
bool fatal;
int err;
- sk = __inet6_lookup_established(net, &tcp_hashinfo,
- &hdr->daddr, th->dest,
+ sk = __inet6_lookup_established(net, &hdr->daddr, th->dest,
&hdr->saddr, ntohs(th->source),
skb->dev->ifindex, inet6_sdif(skb));
if (!sk) {
__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
ICMP6_MIB_INERRORS);
- return;
+ return -ENOENT;
}
if (sk->sk_state == TCP_TIME_WAIT) {
+ /* To increase the counter of ignored icmps for TCP-AO */
+ tcp_ao_ignore_icmp(sk, AF_INET6, type, code);
inet_twsk_put(inet_twsk(sk));
- return;
+ return 0;
}
seq = ntohl(th->seq);
fatal = icmpv6_err_convert(type, code, &err);
- if (sk->sk_state == TCP_NEW_SYN_RECV)
- return tcp_req_err(sk, seq, fatal);
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ tcp_req_err(sk, seq, fatal);
+ return 0;
+ }
+
+ if (tcp_ao_ignore_icmp(sk, AF_INET6, type, code)) {
+ sock_put(sk);
+ return 0;
+ }
bh_lock_sock(sk);
if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
@@ -390,14 +423,17 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (sk->sk_state == TCP_CLOSE)
goto out;
- if (ipv6_hdr(skb)->hop_limit < inet6_sk(sk)->min_hopcount) {
- __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
- goto out;
+ if (static_branch_unlikely(&ip6_min_hopcount)) {
+ /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
+ if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+ goto out;
+ }
}
tp = tcp_sk(sk);
/* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
- fastopen = tp->fastopen_rsk;
+ fastopen = rcu_dereference(tp->fastopen_rsk);
snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
if (sk->sk_state != TCP_LISTEN &&
!between(seq, snd_una, tp->snd_nxt)) {
@@ -405,7 +441,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
goto out;
}
- np = inet6_sk(sk);
+ np = tcp_inet6_sk(sk);
if (type == NDISC_REDIRECT) {
if (!sock_owned_by_user(sk)) {
@@ -418,6 +454,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
}
if (type == ICMPV6_PKT_TOOBIG) {
+ u32 mtu = ntohl(info);
+
/* We are not interested in TCP_LISTEN and open_requests
* (SYN-ACKs send out by Linux are always <576bytes so
* they should go through unfragmented).
@@ -428,7 +466,11 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!ip6_sk_accept_pmtu(sk))
goto out;
- tp->mtu_info = ntohl(info);
+ if (mtu < IPV6_MIN_MTU)
+ goto out;
+
+ WRITE_ONCE(tp->mtu_info, mtu);
+
if (!sock_owned_by_user(sk))
tcp_v6_mtu_reduced(sk);
else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED,
@@ -443,30 +485,39 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
case TCP_SYN_SENT:
case TCP_SYN_RECV:
/* Only in fast or simultaneous open. If a fast open socket is
- * is already accepted it is treated as a connected one below.
+ * already accepted it is treated as a connected one below.
*/
if (fastopen && !fastopen->sk)
break;
- if (!sock_owned_by_user(sk)) {
- sk->sk_err = err;
- sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */
+ ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th);
- tcp_done(sk);
- } else
- sk->sk_err_soft = err;
+ if (!sock_owned_by_user(sk))
+ tcp_done_with_error(sk, err);
+ else
+ WRITE_ONCE(sk->sk_err_soft, err);
goto out;
+ case TCP_LISTEN:
+ break;
+ default:
+ /* check if this ICMP message allows revert of backoff.
+ * (see RFC 6069)
+ */
+ if (!fastopen && type == ICMPV6_DEST_UNREACH &&
+ code == ICMPV6_NOROUTE)
+ tcp_ld_RTO_revert(sk, seq);
}
- if (!sock_owned_by_user(sk) && np->recverr) {
- sk->sk_err = err;
- sk->sk_error_report(sk);
- } else
- sk->sk_err_soft = err;
-
+ if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) {
+ WRITE_ONCE(sk->sk_err, err);
+ sk_error_report(sk);
+ } else {
+ WRITE_ONCE(sk->sk_err_soft, err);
+ }
out:
bh_unlock_sock(sk);
sock_put(sk);
+ return 0;
}
@@ -474,35 +525,48 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
- enum tcp_synack_type synack_type)
+ enum tcp_synack_type synack_type,
+ struct sk_buff *syn_skb)
{
struct inet_request_sock *ireq = inet_rsk(req);
- struct ipv6_pinfo *np = inet6_sk(sk);
+ const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
struct ipv6_txoptions *opt;
struct flowi6 *fl6 = &fl->u.ip6;
struct sk_buff *skb;
int err = -ENOMEM;
+ u8 tclass;
/* First, grab a route. */
if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
IPPROTO_TCP)) == NULL)
goto done;
- skb = tcp_make_synack(sk, dst, req, foc, synack_type);
+ skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
+ tcp_rsk(req)->syn_ect_snt = np->tclass & INET_ECN_MASK;
__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
&ireq->ir_v6_rmt_addr);
fl6->daddr = ireq->ir_v6_rmt_addr;
- if (np->repflow && ireq->pktopts)
+ if (inet6_test_bit(REPFLOW, sk) && ireq->pktopts)
fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts));
+ tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
+ (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
+ (np->tclass & INET_ECN_MASK) :
+ np->tclass;
+
+ if (!INET_ECN_is_capable(tclass) &&
+ tcp_bpf_ca_needs_ecn((struct sock *)req))
+ tclass |= INET_ECN_ECT_0;
+
rcu_read_lock();
opt = ireq->ipv6_opt;
if (!opt)
opt = rcu_dereference(np->opt);
- err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass);
+ err = ip6_xmit(sk, skb, fl6, skb->mark ? : READ_ONCE(sk->sk_mark),
+ opt, tclass, READ_ONCE(sk->sk_priority));
rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -515,38 +579,52 @@ done:
static void tcp_v6_reqsk_destructor(struct request_sock *req)
{
kfree(inet_rsk(req)->ipv6_opt);
- kfree_skb(inet_rsk(req)->pktopts);
+ consume_skb(inet_rsk(req)->pktopts);
}
#ifdef CONFIG_TCP_MD5SIG
static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk,
- const struct in6_addr *addr)
+ const struct in6_addr *addr,
+ int l3index)
{
- return tcp_md5_do_lookup(sk, (union tcp_md5_addr *)addr, AF_INET6);
+ return tcp_md5_do_lookup(sk, l3index,
+ (union tcp_md5_addr *)addr, AF_INET6);
}
static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk,
const struct sock *addr_sk)
{
- return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr);
+ int l3index;
+
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
+ addr_sk->sk_bound_dev_if);
+ return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr,
+ l3index);
}
static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
- char __user *optval, int optlen)
+ sockptr_t optval, int optlen)
{
struct tcp_md5sig cmd;
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr;
+ union tcp_ao_addr *addr;
+ int l3index = 0;
u8 prefixlen;
+ bool l3flag;
+ u8 flags;
if (optlen < sizeof(cmd))
return -EINVAL;
- if (copy_from_user(&cmd, optval, sizeof(cmd)))
+ if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
return -EFAULT;
if (sin6->sin6_family != AF_INET6)
return -EINVAL;
+ flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
+ l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
+
if (optname == TCP_MD5SIG_EXT &&
cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
prefixlen = cmd.tcpm_prefixlen;
@@ -557,94 +635,100 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname,
prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128;
}
+ if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
+ cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
+ struct net_device *dev;
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
+ if (dev && netif_is_l3_master(dev))
+ l3index = dev->ifindex;
+ rcu_read_unlock();
+
+ /* ok to reference set/not set outside of rcu;
+ * right now device MUST be an L3 master
+ */
+ if (!dev || !l3index)
+ return -EINVAL;
+ }
+
if (!cmd.tcpm_keylen) {
if (ipv6_addr_v4mapped(&sin6->sin6_addr))
return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3],
- AF_INET, prefixlen);
+ AF_INET, prefixlen,
+ l3index, flags);
return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr,
- AF_INET6, prefixlen);
+ AF_INET6, prefixlen, l3index, flags);
}
if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
return -EINVAL;
- if (ipv6_addr_v4mapped(&sin6->sin6_addr))
- return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3],
- AF_INET, prefixlen, cmd.tcpm_key,
- cmd.tcpm_keylen, GFP_KERNEL);
+ if (ipv6_addr_v4mapped(&sin6->sin6_addr)) {
+ addr = (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3];
- return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr,
- AF_INET6, prefixlen, cmd.tcpm_key,
- cmd.tcpm_keylen, GFP_KERNEL);
+ /* Don't allow keys for peers that have a matching TCP-AO key.
+ * See the comment in tcp_ao_add_cmd()
+ */
+ if (tcp_ao_required(sk, addr, AF_INET,
+ l3flag ? l3index : -1, false))
+ return -EKEYREJECTED;
+ return tcp_md5_do_add(sk, addr,
+ AF_INET, prefixlen, l3index, flags,
+ cmd.tcpm_key, cmd.tcpm_keylen);
+ }
+
+ addr = (union tcp_md5_addr *)&sin6->sin6_addr;
+
+ /* Don't allow keys for peers that have a matching TCP-AO key.
+ * See the comment in tcp_ao_add_cmd()
+ */
+ if (tcp_ao_required(sk, addr, AF_INET6, l3flag ? l3index : -1, false))
+ return -EKEYREJECTED;
+
+ return tcp_md5_do_add(sk, addr, AF_INET6, prefixlen, l3index, flags,
+ cmd.tcpm_key, cmd.tcpm_keylen);
}
-static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp,
- const struct in6_addr *daddr,
- const struct in6_addr *saddr,
- const struct tcphdr *th, int nbytes)
+static void tcp_v6_md5_hash_headers(struct md5_ctx *ctx,
+ const struct in6_addr *daddr,
+ const struct in6_addr *saddr,
+ const struct tcphdr *th, int nbytes)
{
- struct tcp6_pseudohdr *bp;
- struct scatterlist sg;
- struct tcphdr *_th;
-
- bp = hp->scratch;
- /* 1. TCP pseudo-header (RFC2460) */
- bp->saddr = *saddr;
- bp->daddr = *daddr;
- bp->protocol = cpu_to_be32(IPPROTO_TCP);
- bp->len = cpu_to_be32(nbytes);
-
- _th = (struct tcphdr *)(bp + 1);
- memcpy(_th, th, sizeof(*th));
- _th->check = 0;
-
- sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
- ahash_request_set_crypt(hp->md5_req, &sg, NULL,
- sizeof(*bp) + sizeof(*th));
- return crypto_ahash_update(hp->md5_req);
+ struct {
+ struct tcp6_pseudohdr ip; /* TCP pseudo-header (RFC2460) */
+ struct tcphdr tcp;
+ } h;
+
+ h.ip.saddr = *saddr;
+ h.ip.daddr = *daddr;
+ h.ip.protocol = cpu_to_be32(IPPROTO_TCP);
+ h.ip.len = cpu_to_be32(nbytes);
+ h.tcp = *th;
+ h.tcp.check = 0;
+ md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp));
}
-static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
- const struct in6_addr *daddr, struct in6_addr *saddr,
- const struct tcphdr *th)
+static noinline_for_stack void
+tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
+ const struct in6_addr *daddr, struct in6_addr *saddr,
+ const struct tcphdr *th)
{
- struct tcp_md5sig_pool *hp;
- struct ahash_request *req;
-
- hp = tcp_get_md5sig_pool();
- if (!hp)
- goto clear_hash_noput;
- req = hp->md5_req;
-
- if (crypto_ahash_init(req))
- goto clear_hash;
- if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
- goto clear_hash;
- if (tcp_md5_hash_key(hp, key))
- goto clear_hash;
- ahash_request_set_crypt(req, NULL, md5_hash, 0);
- if (crypto_ahash_final(req))
- goto clear_hash;
-
- tcp_put_md5sig_pool();
- return 0;
+ struct md5_ctx ctx;
-clear_hash:
- tcp_put_md5sig_pool();
-clear_hash_noput:
- memset(md5_hash, 0, 16);
- return 1;
+ md5_init(&ctx);
+ tcp_v6_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2);
+ tcp_md5_hash_key(&ctx, key);
+ md5_final(&ctx, md5_hash);
}
-static int tcp_v6_md5_hash_skb(char *md5_hash,
- const struct tcp_md5sig_key *key,
- const struct sock *sk,
- const struct sk_buff *skb)
+static noinline_for_stack void
+tcp_v6_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
+ const struct sock *sk, const struct sk_buff *skb)
{
- const struct in6_addr *saddr, *daddr;
- struct tcp_md5sig_pool *hp;
- struct ahash_request *req;
const struct tcphdr *th = tcp_hdr(skb);
+ const struct in6_addr *saddr, *daddr;
+ struct md5_ctx ctx;
if (sk) { /* valid for establish/request sockets */
saddr = &sk->sk_v6_rcv_saddr;
@@ -655,131 +739,77 @@ static int tcp_v6_md5_hash_skb(char *md5_hash,
daddr = &ip6h->daddr;
}
- hp = tcp_get_md5sig_pool();
- if (!hp)
- goto clear_hash_noput;
- req = hp->md5_req;
-
- if (crypto_ahash_init(req))
- goto clear_hash;
-
- if (tcp_v6_md5_hash_headers(hp, daddr, saddr, th, skb->len))
- goto clear_hash;
- if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
- goto clear_hash;
- if (tcp_md5_hash_key(hp, key))
- goto clear_hash;
- ahash_request_set_crypt(req, NULL, md5_hash, 0);
- if (crypto_ahash_final(req))
- goto clear_hash;
-
- tcp_put_md5sig_pool();
- return 0;
-
-clear_hash:
- tcp_put_md5sig_pool();
-clear_hash_noput:
- memset(md5_hash, 0, 16);
- return 1;
+ md5_init(&ctx);
+ tcp_v6_md5_hash_headers(&ctx, daddr, saddr, th, skb->len);
+ tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2);
+ tcp_md5_hash_key(&ctx, key);
+ md5_final(&ctx, md5_hash);
}
-
#endif
-static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
- const struct sk_buff *skb)
-{
-#ifdef CONFIG_TCP_MD5SIG
- const __u8 *hash_location = NULL;
- struct tcp_md5sig_key *hash_expected;
- const struct ipv6hdr *ip6h = ipv6_hdr(skb);
- const struct tcphdr *th = tcp_hdr(skb);
- int genhash;
- u8 newhash[16];
-
- hash_expected = tcp_v6_md5_do_lookup(sk, &ip6h->saddr);
- hash_location = tcp_parse_md5sig_option(th);
-
- /* We've parsed the options - do we have a hash? */
- if (!hash_expected && !hash_location)
- return false;
-
- if (hash_expected && !hash_location) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
- return true;
- }
-
- if (!hash_expected && hash_location) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
- return true;
- }
-
- /* check the signature */
- genhash = tcp_v6_md5_hash_skb(newhash,
- hash_expected,
- NULL, skb);
-
- if (genhash || memcmp(hash_location, newhash, 16) != 0) {
- NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE);
- net_info_ratelimited("MD5 Hash %s for [%pI6c]:%u->[%pI6c]:%u\n",
- genhash ? "failed" : "mismatch",
- &ip6h->saddr, ntohs(th->source),
- &ip6h->daddr, ntohs(th->dest));
- return true;
- }
-#endif
- return false;
-}
-
static void tcp_v6_init_req(struct request_sock *req,
const struct sock *sk_listener,
- struct sk_buff *skb)
+ struct sk_buff *skb,
+ u32 tw_isn)
{
+ bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);
struct inet_request_sock *ireq = inet_rsk(req);
- const struct ipv6_pinfo *np = inet6_sk(sk_listener);
+ const struct ipv6_pinfo *np = tcp_inet6_sk(sk_listener);
ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
+ ireq->ir_rmt_addr = LOOPBACK4_IPV6;
+ ireq->ir_loc_addr = LOOPBACK4_IPV6;
/* So that link locals have meaning */
- if (!sk_listener->sk_bound_dev_if &&
+ if ((!sk_listener->sk_bound_dev_if || l3_slave) &&
ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
ireq->ir_iif = tcp_v6_iif(skb);
- if (!TCP_SKB_CB(skb)->tcp_tw_isn &&
+ if (!tw_isn &&
(ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) ||
np->rxopt.bits.rxinfo ||
np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim ||
- np->rxopt.bits.rxohlim || np->repflow)) {
+ np->rxopt.bits.rxohlim || inet6_test_bit(REPFLOW, sk_listener))) {
refcount_inc(&skb->users);
ireq->pktopts = skb;
}
}
static struct dst_entry *tcp_v6_route_req(const struct sock *sk,
+ struct sk_buff *skb,
struct flowi *fl,
- const struct request_sock *req)
+ struct request_sock *req,
+ u32 tw_isn)
{
+ tcp_v6_init_req(req, sk, skb, tw_isn);
+
+ if (security_inet_conn_request(sk, skb, req))
+ return NULL;
+
return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP);
}
struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
.family = AF_INET6,
.obj_size = sizeof(struct tcp6_request_sock),
- .rtx_syn_ack = tcp_rtx_synack,
.send_ack = tcp_v6_reqsk_send_ack,
.destructor = tcp_v6_reqsk_destructor,
.send_reset = tcp_v6_send_reset,
- .syn_ack_timeout = tcp_syn_ack_timeout,
};
-static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
+const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) -
sizeof(struct ipv6hdr),
#ifdef CONFIG_TCP_MD5SIG
.req_md5_lookup = tcp_v6_md5_lookup,
.calc_md5_hash = tcp_v6_md5_hash_skb,
#endif
- .init_req = tcp_v6_init_req,
+#ifdef CONFIG_TCP_AO
+ .ao_lookup = tcp_v6_ao_lookup_rsk,
+ .ao_calc_key = tcp_v6_ao_calc_key_rsk,
+ .ao_synack_hash = tcp_v6_ao_synack_hash,
+#endif
#ifdef CONFIG_SYN_COOKIES
.cookie_init_seq = cookie_v6_init_sequence,
#endif
@@ -791,33 +821,41 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr,
- int oif, struct tcp_md5sig_key *key, int rst,
- u8 tclass, __be32 label)
+ int oif, int rst, u8 tclass, __be32 label,
+ u32 priority, u32 txhash, struct tcp_key *key)
{
+ struct net *net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
+ unsigned int tot_len = sizeof(struct tcphdr);
+ struct sock *ctl_sk = net->ipv6.tcp_sk;
const struct tcphdr *th = tcp_hdr(skb);
- struct tcphdr *t1;
+ __be32 mrst = 0, *topt;
+ struct dst_entry *dst;
struct sk_buff *buff;
+ struct tcphdr *t1;
struct flowi6 fl6;
- struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
- struct sock *ctl_sk = net->ipv6.tcp_sk;
- unsigned int tot_len = sizeof(struct tcphdr);
- struct dst_entry *dst;
- __be32 *topt;
- __u32 mark = 0;
+ u32 mark = 0;
if (tsecr)
tot_len += TCPOLEN_TSTAMP_ALIGNED;
-#ifdef CONFIG_TCP_MD5SIG
- if (key)
+ if (tcp_key_is_md5(key))
tot_len += TCPOLEN_MD5SIG_ALIGNED;
+ if (tcp_key_is_ao(key))
+ tot_len += tcp_ao_len_aligned(key->ao_key);
+
+#ifdef CONFIG_MPTCP
+ if (rst && !tcp_key_is_md5(key)) {
+ mrst = mptcp_reset_option(skb);
+
+ if (mrst)
+ tot_len += sizeof(__be32);
+ }
#endif
- buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len,
- GFP_ATOMIC);
+ buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
if (!buff)
return;
- skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len);
+ skb_reserve(buff, MAX_TCP_HEADER);
t1 = skb_push(buff, tot_len);
skb_reset_transport_header(buff);
@@ -842,15 +880,32 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
*topt++ = htonl(tsecr);
}
+ if (mrst)
+ *topt++ = mrst;
+
#ifdef CONFIG_TCP_MD5SIG
- if (key) {
+ if (tcp_key_is_md5(key)) {
*topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
(TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG);
- tcp_v6_md5_hash_hdr((__u8 *)topt, key,
+ tcp_v6_md5_hash_hdr((__u8 *)topt, key->md5_key,
&ipv6_hdr(skb)->saddr,
&ipv6_hdr(skb)->daddr, t1);
}
#endif
+#ifdef CONFIG_TCP_AO
+ if (tcp_key_is_ao(key)) {
+ *topt++ = htonl((TCPOPT_AO << 24) |
+ (tcp_ao_len(key->ao_key) << 16) |
+ (key->ao_key->sndid << 8) |
+ (key->rcv_next));
+
+ tcp_ao_hash_hdr(AF_INET6, (char *)topt, key->ao_key,
+ key->traffic_key,
+ (union tcp_ao_addr *)&ipv6_hdr(skb)->saddr,
+ (union tcp_ao_addr *)&ipv6_hdr(skb)->daddr,
+ t1, key->sne);
+ }
+#endif
memset(&fl6, 0, sizeof(fl6));
fl6.daddr = ipv6_hdr(skb)->saddr;
@@ -858,7 +913,6 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
fl6.flowlabel = label;
buff->ip_summed = CHECKSUM_PARTIAL;
- buff->csum = 0;
__tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr);
@@ -872,23 +926,39 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
fl6.flowi6_oif = oif;
}
- if (sk)
- mark = (sk->sk_state == TCP_TIME_WAIT) ?
- inet_twsk(sk)->tw_mark : sk->sk_mark;
+ if (sk) {
+ /* unconstify the socket only to attach it to buff with care. */
+ skb_set_owner_edemux(buff, (struct sock *)sk);
+ psp_reply_set_decrypted(sk, buff);
+
+ if (sk->sk_state == TCP_TIME_WAIT)
+ mark = inet_twsk(sk)->tw_mark;
+ else
+ mark = READ_ONCE(sk->sk_mark);
+ skb_set_delivery_time(buff, tcp_transmit_time(sk), SKB_CLOCK_MONOTONIC);
+ }
+ if (txhash) {
+ /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */
+ skb_set_hash(buff, txhash, PKT_HASH_TYPE_L4);
+ }
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source;
fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
- security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
+ security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6));
/* Pass a socket to ip6_dst_lookup either it is for RST
* Underlying function will use this to retrieve the network
* namespace
*/
- dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
+ if (sk && sk->sk_state != TCP_TIME_WAIT)
+ dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL); /*sk's xfrm_policy can be referred*/
+ else
+ dst = ip6_dst_lookup_flow(net, ctl_sk, &fl6, NULL);
if (!IS_ERR(dst)) {
skb_dst_set(buff, dst);
- ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass);
+ ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL,
+ tclass, priority);
TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
if (rst)
TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
@@ -898,19 +968,27 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
kfree_skb(buff);
}
-static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
+static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
+ enum sk_rst_reason reason)
{
const struct tcphdr *th = tcp_hdr(skb);
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ const __u8 *md5_hash_location = NULL;
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
+ bool allocated_traffic_key = false;
+#endif
+ const struct tcp_ao_hdr *aoh;
+ struct tcp_key key = {};
u32 seq = 0, ack_seq = 0;
- struct tcp_md5sig_key *key = NULL;
+ __be32 label = 0;
+ u32 priority = 0;
+ struct net *net;
+ u32 txhash = 0;
+ int oif = 0;
#ifdef CONFIG_TCP_MD5SIG
- const __u8 *hash_location = NULL;
- struct ipv6hdr *ipv6h = ipv6_hdr(skb);
unsigned char newhash[16];
- int genhash;
struct sock *sk1 = NULL;
#endif
- int oif = 0;
if (th->rst)
return;
@@ -921,12 +999,29 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
if (!sk && !ipv6_unicast_destination(skb))
return;
-#ifdef CONFIG_TCP_MD5SIG
+ net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb);
+ /* Invalid TCP option size or twice included auth */
+ if (tcp_parse_auth_options(th, &md5_hash_location, &aoh))
+ return;
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
rcu_read_lock();
- hash_location = tcp_parse_md5sig_option(th);
+#endif
+#ifdef CONFIG_TCP_MD5SIG
if (sk && sk_fullsock(sk)) {
- key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr);
- } else if (hash_location) {
+ int l3index;
+
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and inet_iif is set to it.
+ */
+ l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
+ key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index);
+ if (key.md5_key)
+ key.type = TCP_KEY_MD5;
+ } else if (md5_hash_location) {
+ int dif = tcp_v6_iif_l3_slave(skb);
+ int sdif = tcp_v6_sdif(skb);
+ int l3index;
+
/*
* active side is lost. Try to find listening socket through
* source port, and then find md5 key through listening socket.
@@ -934,22 +1029,24 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev),
- &tcp_hashinfo, NULL, 0,
- &ipv6h->saddr,
- th->source, &ipv6h->daddr,
- ntohs(th->source),
- tcp_v6_iif_l3_slave(skb),
- tcp_v6_sdif(skb));
+ sk1 = inet6_lookup_listener(net, NULL, 0, &ipv6h->saddr, th->source,
+ &ipv6h->daddr, ntohs(th->source),
+ dif, sdif);
if (!sk1)
goto out;
- key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr);
- if (!key)
+ /* sdif set, means packet ingressed via a device
+ * in an L3 domain and dif is set to it.
+ */
+ l3index = tcp_v6_sdif(skb) ? dif : 0;
+
+ key.md5_key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index);
+ if (!key.md5_key)
goto out;
+ key.type = TCP_KEY_MD5;
- genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, skb);
- if (genhash || memcmp(hash_location, newhash, 16) != 0)
+ tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb);
+ if (memcmp(md5_hash_location, newhash, 16) != 0)
goto out;
}
#endif
@@ -960,62 +1057,198 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len -
(th->doff << 2);
+#ifdef CONFIG_TCP_AO
+ if (aoh) {
+ int l3index;
+
+ l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
+ if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, seq,
+ &key.ao_key, &key.traffic_key,
+ &allocated_traffic_key,
+ &key.rcv_next, &key.sne))
+ goto out;
+ key.type = TCP_KEY_AO;
+ }
+#endif
+
if (sk) {
oif = sk->sk_bound_dev_if;
- if (sk_fullsock(sk))
- trace_tcp_send_reset(sk, skb);
+ if (sk_fullsock(sk)) {
+ if (inet6_test_bit(REPFLOW, sk))
+ label = ip6_flowlabel(ipv6h);
+ priority = READ_ONCE(sk->sk_priority);
+ txhash = sk->sk_txhash;
+ }
+ if (sk->sk_state == TCP_TIME_WAIT) {
+ label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel);
+ priority = inet_twsk(sk)->tw_priority;
+ txhash = inet_twsk(sk)->tw_txhash;
+ }
+ } else {
+ if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET)
+ label = ip6_flowlabel(ipv6h);
}
- tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
+ trace_tcp_send_reset(sk, skb, reason);
-#ifdef CONFIG_TCP_MD5SIG
+ tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1,
+ ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK,
+ label, priority, txhash,
+ &key);
+
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
out:
+ if (allocated_traffic_key)
+ kfree(key.traffic_key);
rcu_read_unlock();
#endif
}
static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq,
u32 ack, u32 win, u32 tsval, u32 tsecr, int oif,
- struct tcp_md5sig_key *key, u8 tclass,
- __be32 label)
+ struct tcp_key *key, u8 tclass,
+ __be32 label, u32 priority, u32 txhash)
{
- tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, key, 0,
- tclass, label);
+ tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, 0,
+ tclass, label, priority, txhash, key);
}
-static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
+static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb,
+ enum tcp_tw_status tw_status)
{
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
+ u8 tclass = tw->tw_tclass;
+ struct tcp_key key = {};
+
+ if (tw_status == TCP_TW_ACK_OOW)
+ tclass &= ~INET_ECN_MASK;
+#ifdef CONFIG_TCP_AO
+ struct tcp_ao_info *ao_info;
+
+ if (static_branch_unlikely(&tcp_ao_needed.key)) {
+
+ /* FIXME: the segment to-be-acked is not verified yet */
+ ao_info = rcu_dereference(tcptw->ao_info);
+ if (ao_info) {
+ const struct tcp_ao_hdr *aoh;
+
+ /* Invalid TCP option size or twice included auth */
+ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+ goto out;
+ if (aoh)
+ key.ao_key = tcp_ao_established_key(sk, ao_info,
+ aoh->rnext_keyid, -1);
+ }
+ }
+ if (key.ao_key) {
+ struct tcp_ao_key *rnext_key;
+
+ key.traffic_key = snd_other_key(key.ao_key);
+ /* rcv_next switches to our rcv_next */
+ rnext_key = READ_ONCE(ao_info->rnext_key);
+ key.rcv_next = rnext_key->rcvid;
+ key.sne = READ_ONCE(ao_info->snd_sne);
+ key.type = TCP_KEY_AO;
+#else
+ if (0) {
+#endif
+#ifdef CONFIG_TCP_MD5SIG
+ } else if (static_branch_unlikely(&tcp_md5_needed.key)) {
+ key.md5_key = tcp_twsk_md5_key(tcptw);
+ if (key.md5_key)
+ key.type = TCP_KEY_MD5;
+#endif
+ }
- tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt,
+ READ_ONCE(tcptw->tw_rcv_nxt),
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
- tcp_time_stamp_raw() + tcptw->tw_ts_offset,
- tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
- tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel));
+ tcp_tw_tsval(tcptw),
+ READ_ONCE(tcptw->tw_ts_recent), tw->tw_bound_dev_if,
+ &key, tclass, cpu_to_be32(tw->tw_flowlabel),
+ tw->tw_priority, tw->tw_txhash);
+#ifdef CONFIG_TCP_AO
+out:
+#endif
inet_twsk_put(tw);
}
static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req)
{
+ struct tcp_key key = {};
+
+#ifdef CONFIG_TCP_AO
+ if (static_branch_unlikely(&tcp_ao_needed.key) &&
+ tcp_rsk_used_ao(req)) {
+ const struct in6_addr *addr = &ipv6_hdr(skb)->saddr;
+ const struct tcp_ao_hdr *aoh;
+ int l3index;
+
+ l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
+ /* Invalid TCP option size or twice included auth */
+ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh))
+ return;
+ if (!aoh)
+ return;
+ key.ao_key = tcp_ao_do_lookup(sk, l3index,
+ (union tcp_ao_addr *)addr,
+ AF_INET6, aoh->rnext_keyid, -1);
+ if (unlikely(!key.ao_key)) {
+ /* Send ACK with any matching MKT for the peer */
+ key.ao_key = tcp_ao_do_lookup(sk, l3index,
+ (union tcp_ao_addr *)addr,
+ AF_INET6, -1, -1);
+ /* Matching key disappeared (user removed the key?)
+ * let the handshake timeout.
+ */
+ if (!key.ao_key) {
+ net_info_ratelimited("TCP-AO key for (%pI6, %d)->(%pI6, %d) suddenly disappeared, won't ACK new connection\n",
+ addr,
+ ntohs(tcp_hdr(skb)->source),
+ &ipv6_hdr(skb)->daddr,
+ ntohs(tcp_hdr(skb)->dest));
+ return;
+ }
+ }
+ key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC);
+ if (!key.traffic_key)
+ return;
+
+ key.type = TCP_KEY_AO;
+ key.rcv_next = aoh->keyid;
+ tcp_v6_ao_calc_key_rsk(key.ao_key, key.traffic_key, req);
+#else
+ if (0) {
+#endif
+#ifdef CONFIG_TCP_MD5SIG
+ } else if (static_branch_unlikely(&tcp_md5_needed.key)) {
+ int l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0;
+
+ key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr,
+ l3index);
+ if (key.md5_key)
+ key.type = TCP_KEY_MD5;
+#endif
+ }
+
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
*/
- /* RFC 7323 2.3
- * The window field (SEG.WND) of every outgoing segment, with the
- * exception of <SYN> segments, MUST be right-shifted by
- * Rcv.Wind.Shift bits:
- */
tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ?
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
tcp_rsk(req)->rcv_nxt,
- req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
- tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
+ tcp_synack_window(req) >> inet_rsk(req)->rcv_wscale,
+ tcp_rsk_tsval(tcp_rsk(req)),
req->ts_recent, sk->sk_bound_dev_if,
- tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr),
- 0, 0);
+ &key, ipv6_get_dsfield(ipv6_hdr(skb)) & ~INET_ECN_MASK,
+ 0,
+ READ_ONCE(sk->sk_priority),
+ READ_ONCE(tcp_rsk(req)->txhash));
+ if (tcp_key_is_ao(&key))
+ kfree(key.traffic_key);
}
@@ -1030,6 +1263,21 @@ static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
return sk;
}
+u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph,
+ struct tcphdr *th, u32 *cookie)
+{
+ u16 mss = 0;
+#ifdef CONFIG_SYN_COOKIES
+ mss = tcp_get_syncookie_mss(&tcp6_request_sock_ops,
+ &tcp_request_sock_ipv6_ops, sk, th);
+ if (mss) {
+ *cookie = __cookie_v6_init_sequence(iph, th, &mss);
+ tcp_synq_overflow(sk);
+ }
+#endif
+ return mss;
+}
+
static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
if (skb->protocol == htons(ETH_P_IP))
@@ -1038,6 +1286,11 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (!ipv6_unicast_destination(skb))
goto drop;
+ if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) {
+ __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS);
+ return 0;
+ }
+
return tcp_conn_request(&tcp6_request_sock_ops,
&tcp_request_sock_ipv6_ops, sk, skb);
@@ -1064,14 +1317,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
{
struct inet_request_sock *ireq;
struct ipv6_pinfo *newnp;
- const struct ipv6_pinfo *np = inet6_sk(sk);
+ const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
struct ipv6_txoptions *opt;
- struct tcp6_sock *newtcp6sk;
struct inet_sock *newinet;
+ bool found_dup_sk = false;
struct tcp_sock *newtp;
struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
struct tcp_md5sig_key *key;
+ int l3index;
#endif
struct flowi6 fl6;
@@ -1086,11 +1340,11 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
if (!newsk)
return NULL;
- newtcp6sk = (struct tcp6_sock *)newsk;
- inet_sk(newsk)->pinet6 = &newtcp6sk->inet6;
-
newinet = inet_sk(newsk);
- newnp = inet6_sk(newsk);
+ newinet->pinet6 = tcp_inet6_sk(newsk);
+ newinet->ipv6_fl_list = NULL;
+
+ newnp = tcp_inet6_sk(newsk);
newtp = tcp_sk(newsk);
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
@@ -1098,21 +1352,22 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
newnp->saddr = newsk->sk_v6_rcv_saddr;
inet_csk(newsk)->icsk_af_ops = &ipv6_mapped;
+ if (sk_is_mptcp(newsk))
+ mptcpv6_handle_mapped(newsk, true);
newsk->sk_backlog_rcv = tcp_v4_do_rcv;
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
#endif
newnp->ipv6_mc_list = NULL;
newnp->ipv6_ac_list = NULL;
- newnp->ipv6_fl_list = NULL;
newnp->pktoptions = NULL;
newnp->opt = NULL;
- newnp->mcast_oif = tcp_v6_iif(skb);
- newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
- newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb));
- if (np->repflow)
- newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
+ newnp->mcast_oif = inet_iif(skb);
+ newnp->mcast_hops = ip_hdr(skb)->ttl;
+ newnp->rcv_flowinfo = 0;
+ if (inet6_test_bit(REPFLOW, sk))
+ newnp->flow_label = 0;
/*
* No need to charge this sock to the relevant IPv6 refcnt debug socks count
@@ -1132,17 +1387,17 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
ireq = inet_rsk(req);
if (sk_acceptq_is_full(sk))
- goto out_overflow;
+ goto exit_overflow;
if (!dst) {
dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP);
if (!dst)
- goto out;
+ goto exit;
}
newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
- goto out_nonewsk;
+ goto exit_nonewsk;
/*
* No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -1151,31 +1406,28 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
*/
newsk->sk_gso_type = SKB_GSO_TCPV6;
- ip6_dst_store(newsk, dst, NULL, NULL);
inet6_sk_rx_dst_set(newsk, skb);
- newtcp6sk = (struct tcp6_sock *)newsk;
- inet_sk(newsk)->pinet6 = &newtcp6sk->inet6;
+ newinet = inet_sk(newsk);
+ newinet->pinet6 = tcp_inet6_sk(newsk);
+ newinet->ipv6_fl_list = NULL;
+ newinet->inet_opt = NULL;
newtp = tcp_sk(newsk);
- newinet = inet_sk(newsk);
- newnp = inet6_sk(newsk);
+ newnp = tcp_inet6_sk(newsk);
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
- newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr;
+ ip6_dst_store(newsk, dst, false, false);
+
newnp->saddr = ireq->ir_v6_loc_addr;
- newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr;
- newsk->sk_bound_dev_if = ireq->ir_iif;
/* Now IPv6 options...
First: no IPv4 options.
*/
- newinet->inet_opt = NULL;
newnp->ipv6_mc_list = NULL;
newnp->ipv6_ac_list = NULL;
- newnp->ipv6_fl_list = NULL;
/* Clone RX bits */
newnp->rxopt.all = np->rxopt.all;
@@ -1185,9 +1437,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
newnp->mcast_oif = tcp_v6_iif(skb);
newnp->mcast_hops = ipv6_hdr(skb)->hop_limit;
newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb));
- if (np->repflow)
+ if (inet6_test_bit(REPFLOW, sk))
newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
+ /* Set ToS of the new socket based upon the value of incoming SYN.
+ * ECT bits are set later in tcp_init_transfer().
+ */
+ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
+ newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
+
/* Clone native IPv6 options from listening socket (if any)
Yes, keeping reference count would be much more clever,
@@ -1213,57 +1471,70 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
tcp_initialize_rcv_mss(newsk);
- newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
- newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
-
#ifdef CONFIG_TCP_MD5SIG
- /* Copy over the MD5 key from the original socket */
- key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr);
- if (key) {
- /* We're using one, so create a matching key
- * on the newsk structure. If we fail to get
- * memory, then we end up not copying the key
- * across. Shucks.
- */
- tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr,
- AF_INET6, 128, key->key, key->keylen,
- sk_gfp_mask(sk, GFP_ATOMIC));
+ l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
+
+ if (!tcp_rsk_used_ao(req)) {
+ /* Copy over the MD5 key from the original socket */
+ key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index);
+ if (key) {
+ const union tcp_md5_addr *addr;
+
+ addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr;
+ if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key))
+ goto put_and_exit;
+ }
}
#endif
+#ifdef CONFIG_TCP_AO
+ /* Copy over tcp_ao_info if any */
+ if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6))
+ goto put_and_exit; /* OOM */
+#endif
- if (__inet_inherit_port(sk, newsk) < 0) {
- inet_csk_prepare_forced_close(newsk);
- tcp_done(newsk);
- goto out;
- }
- *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
+ if (__inet_inherit_port(sk, newsk) < 0)
+ goto put_and_exit;
+ *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
+ &found_dup_sk);
if (*own_req) {
tcp_move_syn(newtp, req);
/* Clone pktoptions received with SYN, if we own the req */
if (ireq->pktopts) {
- newnp->pktoptions = skb_clone(ireq->pktopts,
- sk_gfp_mask(sk, GFP_ATOMIC));
+ newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk);
consume_skb(ireq->pktopts);
ireq->pktopts = NULL;
- if (newnp->pktoptions) {
+ if (newnp->pktoptions)
tcp_v6_restore_cb(newnp->pktoptions);
- skb_set_owner_r(newnp->pktoptions, newsk);
- }
+ }
+ } else {
+ if (!req_unhash && found_dup_sk) {
+ /* This code path should only be executed in the
+ * syncookie case only
+ */
+ bh_unlock_sock(newsk);
+ sock_put(newsk);
+ newsk = NULL;
}
}
return newsk;
-out_overflow:
+exit_overflow:
__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
-out_nonewsk:
+exit_nonewsk:
dst_release(dst);
-out:
+exit:
tcp_listendrop(sk);
return NULL;
+put_and_exit:
+ inet_csk_prepare_forced_close(newsk);
+ tcp_done(newsk);
+ goto exit;
}
+INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
+ u32));
/* The socket must have it's spinlock held when we get
* here, unless it is a TCP_LISTEN socket.
*
@@ -1272,11 +1543,13 @@ out:
* This is because we cannot sleep with the original spinlock
* held.
*/
-static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
+INDIRECT_CALLABLE_SCOPE
+int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
{
- struct ipv6_pinfo *np = inet6_sk(sk);
- struct tcp_sock *tp;
+ struct ipv6_pinfo *np = tcp_inet6_sk(sk);
struct sk_buff *opt_skb = NULL;
+ enum skb_drop_reason reason;
+ struct tcp_sock *tp;
/* Imagine: socket is IPv6. IPv4 packet arrives,
goes to IPv4 receive handler and backlogged.
@@ -1289,6 +1562,10 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
if (skb->protocol == htons(ETH_P_IP))
return tcp_v4_do_rcv(sk, skb);
+ reason = psp_sk_rx_policy_check(sk, skb);
+ if (reason)
+ goto err_discard;
+
/*
* socket locking is here for SMP purposes as backlog rcv
* is currently called with bh processing disabled.
@@ -1307,19 +1584,23 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
by tcp. Feel free to propose better solution.
--ANK (980728)
*/
- if (np->rxopt.all)
- opt_skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC));
+ if (np->rxopt.all && sk->sk_state != TCP_LISTEN)
+ opt_skb = skb_clone_and_charge_r(skb, sk);
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
- struct dst_entry *dst = sk->sk_rx_dst;
+ struct dst_entry *dst;
+
+ dst = rcu_dereference_protected(sk->sk_rx_dst,
+ lockdep_sock_is_held(sk));
sock_rps_save_rxhash(sk, skb);
sk_mark_napi_id(sk, skb);
if (dst) {
- if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
- dst->ops->check(dst, np->rx_dst_cookie) == NULL) {
+ if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
+ INDIRECT_CALL_1(dst->ops->check, ip6_dst_check,
+ dst, sk->sk_rx_dst_cookie) == NULL) {
+ RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
dst_release(dst);
- sk->sk_rx_dst = NULL;
}
}
@@ -1335,34 +1616,36 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
if (sk->sk_state == TCP_LISTEN) {
struct sock *nsk = tcp_v6_cookie_check(sk, skb);
- if (!nsk)
- goto discard;
-
if (nsk != sk) {
- if (tcp_child_process(sk, nsk, skb))
- goto reset;
- if (opt_skb)
- __kfree_skb(opt_skb);
+ if (nsk) {
+ reason = tcp_child_process(sk, nsk, skb);
+ if (reason)
+ goto reset;
+ }
return 0;
}
} else
sock_rps_save_rxhash(sk, skb);
- if (tcp_rcv_state_process(sk, skb))
+ reason = tcp_rcv_state_process(sk, skb);
+ if (reason)
goto reset;
if (opt_skb)
goto ipv6_pktoptions;
return 0;
reset:
- tcp_v6_send_reset(sk, skb);
+ tcp_v6_send_reset(sk, skb, sk_rst_convert_drop_reason(reason));
discard:
if (opt_skb)
__kfree_skb(opt_skb);
- kfree_skb(skb);
+ sk_skb_reason_drop(sk, skb, reason);
return 0;
csum_err:
+ reason = SKB_DROP_REASON_TCP_CSUM;
+ trace_tcp_bad_csum(skb);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+err_discard:
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
@@ -1379,15 +1662,15 @@ ipv6_pktoptions:
if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt &&
!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) {
if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo)
- np->mcast_oif = tcp_v6_iif(opt_skb);
+ WRITE_ONCE(np->mcast_oif, tcp_v6_iif(opt_skb));
if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim)
- np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit;
+ WRITE_ONCE(np->mcast_hops,
+ ipv6_hdr(opt_skb)->hop_limit);
if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass)
np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb));
- if (np->repflow)
+ if (inet6_test_bit(REPFLOW, sk))
np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) {
- skb_set_owner_r(opt_skb, sk);
tcp_v6_restore_cb(opt_skb);
opt_skb = xchg(&np->pktoptions, opt_skb);
} else {
@@ -1396,7 +1679,7 @@ ipv6_pktoptions:
}
}
- kfree_skb(opt_skb);
+ consume_skb(opt_skb);
return 0;
}
@@ -1416,24 +1699,28 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
skb->len - th->doff*4);
TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
- TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
- TCP_SKB_CB(skb)->tcp_tw_isn = 0;
+ TCP_SKB_CB(skb)->tcp_flags = tcp_flags_ntohs(th);
TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr);
TCP_SKB_CB(skb)->sacked = 0;
TCP_SKB_CB(skb)->has_rxtstamp =
skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
}
-static int tcp_v6_rcv(struct sk_buff *skb)
+INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
{
+ struct net *net = dev_net_rcu(skb->dev);
+ enum skb_drop_reason drop_reason;
+ enum tcp_tw_status tw_status;
int sdif = inet6_sdif(skb);
+ int dif = inet6_iif(skb);
const struct tcphdr *th;
const struct ipv6hdr *hdr;
+ struct sock *sk = NULL;
bool refcounted;
- struct sock *sk;
int ret;
- struct net *net = dev_net(skb->dev);
+ u32 isn;
+ drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (skb->pkt_type != PACKET_HOST)
goto discard_it;
@@ -1447,8 +1734,10 @@ static int tcp_v6_rcv(struct sk_buff *skb)
th = (const struct tcphdr *)skb->data;
- if (unlikely(th->doff < sizeof(struct tcphdr)/4))
+ if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
+ drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
goto bad_packet;
+ }
if (!pskb_may_pull(skb, th->doff*4))
goto discard_it;
@@ -1459,13 +1748,12 @@ static int tcp_v6_rcv(struct sk_buff *skb)
hdr = ipv6_hdr(skb);
lookup:
- sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th),
+ sk = __inet6_lookup_skb(skb, __tcp_hdrlen(th),
th->source, th->dest, inet6_iif(skb), sdif,
&refcounted);
if (!sk)
goto no_tcp_socket;
-process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
@@ -1475,8 +1763,14 @@ process:
struct sock *nsk;
sk = req->rsk_listener;
- if (tcp_v6_inbound_md5_hash(sk, skb)) {
- sk_drops_add(sk, skb);
+ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
+ else
+ drop_reason = tcp_inbound_hash(sk, req, skb,
+ &hdr->saddr, &hdr->daddr,
+ AF_INET6, dif, sdif);
+ if (drop_reason) {
+ sk_drops_skbadd(sk, skb);
reqsk_put(req);
goto discard_it;
}
@@ -1485,17 +1779,26 @@ process:
goto csum_error;
}
if (unlikely(sk->sk_state != TCP_LISTEN)) {
- inet_csk_reqsk_queue_drop_and_put(sk, req);
- goto lookup;
+ nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
+ if (!nsk) {
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
+ goto lookup;
+ }
+ sk = nsk;
+ /* reuseport_migrate_sock() has already held one sk_refcnt
+ * before returning.
+ */
+ } else {
+ sock_hold(sk);
}
- sock_hold(sk);
refcounted = true;
nsk = NULL;
- if (!tcp_filter(sk, skb)) {
+ if (!tcp_filter(sk, skb, &drop_reason)) {
th = (const struct tcphdr *)skb->data;
hdr = ipv6_hdr(skb);
tcp_v6_fill_cb(skb, hdr, th);
- nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
+ nsk = tcp_check_req(sk, skb, req, false, &req_stolen,
+ &drop_reason);
}
if (!nsk) {
reqsk_put(req);
@@ -1511,30 +1814,49 @@ process:
}
goto discard_and_relse;
}
+ nf_reset_ct(skb);
if (nsk == sk) {
reqsk_put(req);
tcp_v6_restore_cb(skb);
- } else if (tcp_child_process(sk, nsk, skb)) {
- tcp_v6_send_reset(nsk, skb);
- goto discard_and_relse;
} else {
+ drop_reason = tcp_child_process(sk, nsk, skb);
+ if (drop_reason) {
+ enum sk_rst_reason rst_reason;
+
+ rst_reason = sk_rst_convert_drop_reason(drop_reason);
+ tcp_v6_send_reset(nsk, skb, rst_reason);
+ goto discard_and_relse;
+ }
sock_put(sk);
return 0;
}
}
- if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
- __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
- goto discard_and_relse;
+
+process:
+ if (static_branch_unlikely(&ip6_min_hopcount)) {
+ /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */
+ if (unlikely(hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount))) {
+ __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
+ drop_reason = SKB_DROP_REASON_TCP_MINTTL;
+ goto discard_and_relse;
+ }
}
- if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
goto discard_and_relse;
+ }
- if (tcp_v6_inbound_md5_hash(sk, skb))
+ drop_reason = tcp_inbound_hash(sk, NULL, skb, &hdr->saddr, &hdr->daddr,
+ AF_INET6, dif, sdif);
+ if (drop_reason)
goto discard_and_relse;
- if (tcp_filter(sk, skb))
+ nf_reset_ct(skb);
+
+ if (tcp_filter(sk, skb, &drop_reason))
goto discard_and_relse;
+
th = (const struct tcphdr *)skb->data;
hdr = ipv6_hdr(skb);
tcp_v6_fill_cb(skb, hdr, th);
@@ -1553,17 +1875,18 @@ process:
ret = 0;
if (!sock_owned_by_user(sk)) {
ret = tcp_v6_do_rcv(sk, skb);
- } else if (tcp_add_backlog(sk, skb)) {
- goto discard_and_relse;
+ } else {
+ if (tcp_add_backlog(sk, skb, &drop_reason))
+ goto discard_and_relse;
}
bh_unlock_sock(sk);
-
put_and_return:
if (refcounted)
sock_put(sk);
return ret ? -1 : 0;
no_tcp_socket:
+ drop_reason = SKB_DROP_REASON_NO_SOCKET;
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
@@ -1571,25 +1894,29 @@ no_tcp_socket:
if (tcp_checksum_complete(skb)) {
csum_error:
+ drop_reason = SKB_DROP_REASON_TCP_CSUM;
+ trace_tcp_bad_csum(skb);
__TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
bad_packet:
__TCP_INC_STATS(net, TCP_MIB_INERRS);
} else {
- tcp_v6_send_reset(NULL, skb);
+ tcp_v6_send_reset(NULL, skb, sk_rst_convert_drop_reason(drop_reason));
}
discard_it:
- kfree_skb(skb);
+ SKB_DR_OR(drop_reason, NOT_SPECIFIED);
+ sk_skb_reason_drop(sk, skb, drop_reason);
return 0;
discard_and_relse:
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
if (refcounted)
sock_put(sk);
goto discard_it;
do_time_wait:
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
inet_twsk_put(inet_twsk(sk));
goto discard_it;
}
@@ -1601,13 +1928,14 @@ do_time_wait:
goto csum_error;
}
- switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
+ tw_status = tcp_timewait_state_process(inet_twsk(sk), skb, th, &isn,
+ &drop_reason);
+ switch (tw_status) {
case TCP_TW_SYN:
{
struct sock *sk2;
- sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
- skb, __tcp_hdrlen(th),
+ sk2 = inet6_lookup_listener(net, skb, __tcp_hdrlen(th),
&ipv6_hdr(skb)->saddr, th->source,
&ipv6_hdr(skb)->daddr,
ntohs(th->dest),
@@ -1619,16 +1947,22 @@ do_time_wait:
sk = sk2;
tcp_v6_restore_cb(skb);
refcounted = false;
+ __this_cpu_write(tcp_tw_isn, isn);
goto process;
}
+
+ drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
+ if (drop_reason)
+ break;
}
/* to ACK */
- /* fall through */
+ fallthrough;
case TCP_TW_ACK:
- tcp_v6_timewait_ack(sk, skb);
+ case TCP_TW_ACK_OOW:
+ tcp_v6_timewait_ack(sk, skb, tw_status);
break;
case TCP_TW_RST:
- tcp_v6_send_reset(sk, skb);
+ tcp_v6_send_reset(sk, skb, SK_RST_REASON_TCP_TIMEWAIT_SOCKET);
inet_twsk_deschedule_put(inet_twsk(sk));
goto discard_it;
case TCP_TW_SUCCESS:
@@ -1637,8 +1971,9 @@ do_time_wait:
goto discard_it;
}
-static void tcp_v6_early_demux(struct sk_buff *skb)
+void tcp_v6_early_demux(struct sk_buff *skb)
{
+ struct net *net = dev_net_rcu(skb->dev);
const struct ipv6hdr *hdr;
const struct tcphdr *th;
struct sock *sk;
@@ -1656,20 +1991,19 @@ static void tcp_v6_early_demux(struct sk_buff *skb)
return;
/* Note : We use inet6_iif() here, not tcp_v6_iif() */
- sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
- &hdr->saddr, th->source,
+ sk = __inet6_lookup_established(net, &hdr->saddr, th->source,
&hdr->daddr, ntohs(th->dest),
inet6_iif(skb), inet6_sdif(skb));
if (sk) {
skb->sk = sk;
skb->destructor = sock_edemux;
if (sk_fullsock(sk)) {
- struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst);
+ struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
if (dst)
- dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
+ dst = dst_check(dst, sk->sk_rx_dst_cookie);
if (dst &&
- inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
+ sk->sk_rx_dst_ifindex == skb->skb_iif)
skb_dst_set_noref(skb, dst);
}
}
@@ -1677,11 +2011,14 @@ static void tcp_v6_early_demux(struct sk_buff *skb)
static struct timewait_sock_ops tcp6_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct tcp6_timewait_sock),
- .twsk_unique = tcp_twsk_unique,
- .twsk_destructor = tcp_twsk_destructor,
};
-static const struct inet_connection_sock_af_ops ipv6_specific = {
+INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)
+{
+ __tcp_v6_send_check(skb, &sk->sk_v6_rcv_saddr, &sk->sk_v6_daddr);
+}
+
+const struct inet_connection_sock_af_ops ipv6_specific = {
.queue_xmit = inet6_csk_xmit,
.send_check = tcp_v6_send_check,
.rebuild_header = inet6_sk_rebuild_header,
@@ -1689,23 +2026,24 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
.conn_request = tcp_v6_conn_request,
.syn_recv_sock = tcp_v6_syn_recv_sock,
.net_header_len = sizeof(struct ipv6hdr),
- .net_frag_header_len = sizeof(struct frag_hdr),
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,
- .addr2sockaddr = inet6_csk_addr2sockaddr,
- .sockaddr_len = sizeof(struct sockaddr_in6),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ipv6_setsockopt,
- .compat_getsockopt = compat_ipv6_getsockopt,
-#endif
.mtu_reduced = tcp_v6_mtu_reduced,
};
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = {
+#ifdef CONFIG_TCP_MD5SIG
.md5_lookup = tcp_v6_md5_lookup,
.calc_md5_hash = tcp_v6_md5_hash_skb,
.md5_parse = tcp_v6_parse_md5_keys,
+#endif
+#ifdef CONFIG_TCP_AO
+ .ao_lookup = tcp_v6_ao_lookup,
+ .calc_ao_hash = tcp_v6_ao_hash_skb,
+ .ao_parse = tcp_v6_parse_ao,
+ .ao_calc_key_sk = tcp_v6_ao_calc_key_sk,
+#endif
};
#endif
@@ -1722,21 +2060,30 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
.net_header_len = sizeof(struct iphdr),
.setsockopt = ipv6_setsockopt,
.getsockopt = ipv6_getsockopt,
- .addr2sockaddr = inet6_csk_addr2sockaddr,
- .sockaddr_len = sizeof(struct sockaddr_in6),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ipv6_setsockopt,
- .compat_getsockopt = compat_ipv6_getsockopt,
-#endif
.mtu_reduced = tcp_v4_mtu_reduced,
};
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
+#ifdef CONFIG_TCP_MD5SIG
.md5_lookup = tcp_v4_md5_lookup,
.calc_md5_hash = tcp_v4_md5_hash_skb,
.md5_parse = tcp_v6_parse_md5_keys,
+#endif
+#ifdef CONFIG_TCP_AO
+ .ao_lookup = tcp_v6_ao_lookup,
+ .calc_ao_hash = tcp_v4_ao_hash_skb,
+ .ao_parse = tcp_v6_parse_ao,
+ .ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
+#endif
};
+
+static void tcp6_destruct_sock(struct sock *sk)
+{
+ tcp_md5_destruct_sock(sk);
+ tcp_ao_destroy_sock(sk, false);
+ inet6_sock_destruct(sk);
+}
#endif
/* NOTE: A lot of things set to zero explicitly by call to
@@ -1750,19 +2097,14 @@ static int tcp_v6_init_sock(struct sock *sk)
icsk->icsk_af_ops = &ipv6_specific;
-#ifdef CONFIG_TCP_MD5SIG
+#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
+ sk->sk_destruct = tcp6_destruct_sock;
#endif
return 0;
}
-static void tcp_v6_destroy_sock(struct sock *sk)
-{
- tcp_v4_destroy_sock(sk);
- inet6_destroy_sock(sk);
-}
-
#ifdef CONFIG_PROC_FS
/* Proc filesystem TCPv6 sock list dumping. */
static void get_openreq6(struct seq_file *seq,
@@ -1791,7 +2133,7 @@ static void get_openreq6(struct seq_file *seq,
jiffies_to_clock_t(ttd),
req->num_timeout,
from_kuid_munged(seq_user_ns(seq),
- sock_i_uid(req->rsk_listener)),
+ sk_uid(req->rsk_listener)),
0, /* non standard timer */
0, /* open_requests have no inode */
0, req);
@@ -1807,6 +2149,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
const struct tcp_sock *tp = tcp_sk(sp);
const struct inet_connection_sock *icsk = inet_csk(sp);
const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
+ u8 icsk_pending;
int rx_queue;
int state;
@@ -1815,17 +2158,18 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
destp = ntohs(inet->inet_dport);
srcp = ntohs(inet->inet_sport);
- if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
- icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ icsk_pending = smp_load_acquire(&icsk->icsk_pending);
+ if (icsk_pending == ICSK_TIME_RETRANS ||
+ icsk_pending == ICSK_TIME_REO_TIMEOUT ||
+ icsk_pending == ICSK_TIME_LOSS_PROBE) {
timer_active = 1;
- timer_expires = icsk->icsk_timeout;
- } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
+ timer_expires = tcp_timeout_expires(sp);
+ } else if (icsk_pending == ICSK_TIME_PROBE0) {
timer_active = 4;
- timer_expires = icsk->icsk_timeout;
- } else if (timer_pending(&sp->sk_timer)) {
+ timer_expires = tcp_timeout_expires(sp);
+ } else if (timer_pending(&icsk->icsk_keepalive_timer)) {
timer_active = 2;
- timer_expires = sp->sk_timer.expires;
+ timer_expires = icsk->icsk_keepalive_timer.expires;
} else {
timer_active = 0;
timer_expires = jiffies;
@@ -1833,12 +2177,13 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
state = inet_sk_state_load(sp);
if (state == TCP_LISTEN)
- rx_queue = sp->sk_ack_backlog;
+ rx_queue = READ_ONCE(sp->sk_ack_backlog);
else
/* Because we don't lock the socket,
* we might find a transient negative value.
*/
- rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
+ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
+ READ_ONCE(tp->copied_seq), 0);
seq_printf(seq,
"%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
@@ -1849,19 +2194,19 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
dest->s6_addr32[0], dest->s6_addr32[1],
dest->s6_addr32[2], dest->s6_addr32[3], destp,
state,
- tp->write_seq - tp->snd_una,
+ READ_ONCE(tp->write_seq) - tp->snd_una,
rx_queue,
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
- icsk->icsk_retransmits,
- from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)),
- icsk->icsk_probes_out,
+ READ_ONCE(icsk->icsk_retransmits),
+ from_kuid_munged(seq_user_ns(seq), sk_uid(sp)),
+ READ_ONCE(icsk->icsk_probes_out),
sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
jiffies_to_clock_t(icsk->icsk_rto),
jiffies_to_clock_t(icsk->icsk_ack.ato),
- (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
- tp->snd_cwnd,
+ (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sp),
+ tcp_snd_cwnd(tp),
state == TCP_LISTEN ?
fastopenq->max_qlen :
(tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)
@@ -1888,7 +2233,7 @@ static void get_timewait6_sock(struct seq_file *seq,
src->s6_addr32[2], src->s6_addr32[3], srcp,
dest->s6_addr32[0], dest->s6_addr32[1],
dest->s6_addr32[2], dest->s6_addr32[3], destp,
- tw->tw_substate, 0, 0,
+ READ_ONCE(tw->tw_substate), 0, 0,
3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
refcount_read(&tw->tw_refcnt), tw);
}
@@ -1954,53 +2299,48 @@ struct proto tcpv6_prot = {
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v6_init_sock,
- .destroy = tcp_v6_destroy_sock,
+ .destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
+ .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
.keepalive = tcp_set_keepalive,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
- .sendpage = tcp_sendpage,
+ .splice_eof = tcp_splice_eof,
.backlog_rcv = tcp_v6_do_rcv,
.release_cb = tcp_release_cb,
- .hash = inet6_hash,
+ .hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
+ .put_port = inet_put_port,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = tcp_bpf_update_proto,
+#endif
.enter_memory_pressure = tcp_enter_memory_pressure,
.leave_memory_pressure = tcp_leave_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
- .memory_allocated = &tcp_memory_allocated,
+
+ .memory_allocated = &net_aligned_data.tcp_memory_allocated,
+ .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
+
.memory_pressure = &tcp_memory_pressure,
- .orphan_count = &tcp_orphan_count,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp6_sock),
+ .ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp6_timewait_sock_ops,
.rsk_prot = &tcp6_request_sock_ops,
- .h.hashinfo = &tcp_hashinfo,
+ .h.hashinfo = NULL,
.no_autobind = true,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_tcp_setsockopt,
- .compat_getsockopt = compat_tcp_getsockopt,
-#endif
.diag_destroy = tcp_abort,
};
+EXPORT_SYMBOL_GPL(tcpv6_prot);
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct inet6_protocol tcpv6_protocol = {
- .early_demux = tcp_v6_early_demux,
- .early_demux_handler = tcp_v6_early_demux,
- .handler = tcp_v6_rcv,
- .err_handler = tcp_v6_err,
- .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
-};
static struct inet_protosw tcpv6_protosw = {
.type = SOCK_STREAM,
@@ -2013,8 +2353,14 @@ static struct inet_protosw tcpv6_protosw = {
static int __net_init tcpv6_net_init(struct net *net)
{
- return inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6,
- SOCK_RAW, IPPROTO_TCP, net);
+ int res;
+
+ res = inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6,
+ SOCK_RAW, IPPROTO_TCP, net);
+ if (!res)
+ net->ipv6.tcp_sk->sk_clockid = CLOCK_MONOTONIC;
+
+ return res;
}
static void __net_exit tcpv6_net_exit(struct net *net)
@@ -2022,22 +2368,21 @@ static void __net_exit tcpv6_net_exit(struct net *net)
inet_ctl_sock_destroy(net->ipv6.tcp_sk);
}
-static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list)
-{
- inet_twsk_purge(&tcp_hashinfo, AF_INET6);
-}
-
static struct pernet_operations tcpv6_net_ops = {
.init = tcpv6_net_init,
.exit = tcpv6_net_exit,
- .exit_batch = tcpv6_net_exit_batch,
};
int __init tcpv6_init(void)
{
int ret;
- ret = inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP);
+ net_hotdata.tcpv6_protocol = (struct inet6_protocol) {
+ .handler = tcp_v6_rcv,
+ .err_handler = tcp_v6_err,
+ .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+ };
+ ret = inet6_add_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP);
if (ret)
goto out;
@@ -2049,13 +2394,20 @@ int __init tcpv6_init(void)
ret = register_pernet_subsys(&tcpv6_net_ops);
if (ret)
goto out_tcpv6_protosw;
+
+ ret = mptcpv6_init();
+ if (ret)
+ goto out_tcpv6_pernet_subsys;
+
out:
return ret;
+out_tcpv6_pernet_subsys:
+ unregister_pernet_subsys(&tcpv6_net_ops);
out_tcpv6_protosw:
inet6_unregister_protosw(&tcpv6_protosw);
out_tcpv6_protocol:
- inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP);
+ inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP);
goto out;
}
@@ -2063,5 +2415,5 @@ void tcpv6_exit(void)
{
unregister_pernet_subsys(&tcpv6_net_ops);
inet6_unregister_protosw(&tcpv6_protosw);
- inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP);
+ inet6_del_protocol(&net_hotdata.tcpv6_protocol, IPPROTO_TCP);
}
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index e72947c99454..effeba58630b 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -1,44 +1,159 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPV6 GSO/GRO offload support
* Linux INET6 implementation
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* TCPv6 GSO/GRO support
*/
+#include <linux/indirect_call_wrapper.h>
#include <linux/skbuff.h>
+#include <net/inet6_hashtables.h>
+#include <net/gro.h>
#include <net/protocol.h>
#include <net/tcp.h>
#include <net/ip6_checksum.h>
#include "ip6_offload.h"
-static struct sk_buff *tcp6_gro_receive(struct list_head *head,
- struct sk_buff *skb)
+static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
+ struct tcphdr *th)
{
+#if IS_ENABLED(CONFIG_IPV6)
+ const struct ipv6hdr *hdr;
+ struct sk_buff *p;
+ struct sock *sk;
+ struct net *net;
+ int iif, sdif;
+
+ if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST)))
+ return;
+
+ p = tcp_gro_lookup(head, th);
+ if (p) {
+ NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
+ return;
+ }
+
+ inet6_get_iif_sdif(skb, &iif, &sdif);
+ hdr = skb_gro_network_header(skb);
+ net = dev_net_rcu(skb->dev);
+ sk = __inet6_lookup_established(net, &hdr->saddr, th->source,
+ &hdr->daddr, ntohs(th->dest),
+ iif, sdif);
+ NAPI_GRO_CB(skb)->is_flist = !sk;
+ if (sk)
+ sock_gen_put(sk);
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+}
+
+INDIRECT_CALLABLE_SCOPE
+struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
+{
+ struct tcphdr *th;
+
/* Don't bother verifying checksum if we're going to flush anyway. */
if (!NAPI_GRO_CB(skb)->flush &&
skb_gro_checksum_validate(skb, IPPROTO_TCP,
- ip6_gro_compute_pseudo)) {
- NAPI_GRO_CB(skb)->flush = 1;
- return NULL;
- }
+ ip6_gro_compute_pseudo))
+ goto flush;
- return tcp_gro_receive(head, skb);
+ th = tcp_gro_pull_header(skb);
+ if (!th)
+ goto flush;
+
+ tcp6_check_fraglist_gro(head, skb, th);
+
+ return tcp_gro_receive(head, skb, th);
+
+flush:
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
}
-static int tcp6_gro_complete(struct sk_buff *skb, int thoff)
+INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff)
{
- const struct ipv6hdr *iph = ipv6_hdr(skb);
+ const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+ const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset);
struct tcphdr *th = tcp_hdr(skb);
+ if (unlikely(NAPI_GRO_CB(skb)->is_flist)) {
+ skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV6;
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ __skb_incr_checksum_unnecessary(skb);
+
+ return 0;
+ }
+
th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr,
&iph->daddr, 0);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6;
- return tcp_gro_complete(skb);
+ tcp_gro_complete(skb);
+ return 0;
+}
+
+static void __tcpv6_gso_segment_csum(struct sk_buff *seg,
+ struct in6_addr *oldip,
+ const struct in6_addr *newip,
+ __be16 *oldport, __be16 newport)
+{
+ struct tcphdr *th = tcp_hdr(seg);
+
+ if (!ipv6_addr_equal(oldip, newip)) {
+ inet_proto_csum_replace16(&th->check, seg,
+ oldip->s6_addr32,
+ newip->s6_addr32,
+ true);
+ *oldip = *newip;
+ }
+
+ if (*oldport == newport)
+ return;
+
+ inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false);
+ *oldport = newport;
+}
+
+static struct sk_buff *__tcpv6_gso_segment_list_csum(struct sk_buff *segs)
+{
+ const struct tcphdr *th;
+ const struct ipv6hdr *iph;
+ struct sk_buff *seg;
+ struct tcphdr *th2;
+ struct ipv6hdr *iph2;
+
+ seg = segs;
+ th = tcp_hdr(seg);
+ iph = ipv6_hdr(seg);
+ th2 = tcp_hdr(seg->next);
+ iph2 = ipv6_hdr(seg->next);
+
+ if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) &&
+ ipv6_addr_equal(&iph->saddr, &iph2->saddr) &&
+ ipv6_addr_equal(&iph->daddr, &iph2->daddr))
+ return segs;
+
+ while ((seg = seg->next)) {
+ th2 = tcp_hdr(seg);
+ iph2 = ipv6_hdr(seg);
+
+ __tcpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr,
+ &th2->source, th->source);
+ __tcpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr,
+ &th2->dest, th->dest);
+ }
+
+ return segs;
+}
+
+static struct sk_buff *__tcp6_gso_segment_list(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ skb = skb_segment_list(skb, features, skb_mac_header_len(skb));
+ if (IS_ERR(skb))
+ return skb;
+
+ return __tcpv6_gso_segment_list_csum(skb);
}
static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb,
@@ -52,6 +167,15 @@ static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb,
if (!pskb_may_pull(skb, sizeof(*th)))
return ERR_PTR(-EINVAL);
+ if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) {
+ struct tcphdr *th = tcp_hdr(skb);
+
+ if (skb_pagelen(skb) - th->doff * 4 == skb_shinfo(skb)->gso_size)
+ return __tcp6_gso_segment_list(skb, features);
+
+ skb->ip_summed = CHECKSUM_NONE;
+ }
+
if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
struct tcphdr *th = tcp_hdr(skb);
@@ -67,15 +191,15 @@ static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb,
return tcp_gso_segment(skb, features);
}
-static const struct net_offload tcpv6_offload = {
- .callbacks = {
- .gso_segment = tcp6_gso_segment,
- .gro_receive = tcp6_gro_receive,
- .gro_complete = tcp6_gro_complete,
- },
-};
int __init tcpv6_offload_init(void)
{
- return inet6_add_offload(&tcpv6_offload, IPPROTO_TCP);
+ net_hotdata.tcpv6_offload = (struct net_offload) {
+ .callbacks = {
+ .gso_segment = tcp6_gso_segment,
+ .gro_receive = tcp6_gro_receive,
+ .gro_complete = tcp6_gro_complete,
+ },
+ };
+ return inet6_add_offload(&net_hotdata.tcpv6_offload, IPPROTO_TCP);
}
diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
index dae25cad05cd..dc4ea9b11794 100644
--- a/net/ipv6/tunnel6.c
+++ b/net/ipv6/tunnel6.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C)2003,2004 USAGI/WIDE Project
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- *
* Authors Mitsuru KANDA <mk@linux-ipv6.org>
* YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
*/
@@ -33,8 +21,14 @@
static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly;
static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly;
+static struct xfrm6_tunnel __rcu *tunnelmpls6_handlers __read_mostly;
static DEFINE_MUTEX(tunnel6_mutex);
+static inline int xfrm6_tunnel_mpls_supported(void)
+{
+ return IS_ENABLED(CONFIG_MPLS);
+}
+
int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family)
{
struct xfrm6_tunnel __rcu **pprev;
@@ -44,8 +38,21 @@ int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family)
mutex_lock(&tunnel6_mutex);
- for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers;
- (t = rcu_dereference_protected(*pprev,
+ switch (family) {
+ case AF_INET6:
+ pprev = &tunnel6_handlers;
+ break;
+ case AF_INET:
+ pprev = &tunnel46_handlers;
+ break;
+ case AF_MPLS:
+ pprev = &tunnelmpls6_handlers;
+ break;
+ default:
+ goto err;
+ }
+
+ for (; (t = rcu_dereference_protected(*pprev,
lockdep_is_held(&tunnel6_mutex))) != NULL;
pprev = &t->next) {
if (t->priority > priority)
@@ -74,8 +81,21 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
mutex_lock(&tunnel6_mutex);
- for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers;
- (t = rcu_dereference_protected(*pprev,
+ switch (family) {
+ case AF_INET6:
+ pprev = &tunnel6_handlers;
+ break;
+ case AF_INET:
+ pprev = &tunnel46_handlers;
+ break;
+ case AF_MPLS:
+ pprev = &tunnelmpls6_handlers;
+ break;
+ default:
+ goto err;
+ }
+
+ for (; (t = rcu_dereference_protected(*pprev,
lockdep_is_held(&tunnel6_mutex))) != NULL;
pprev = &t->next) {
if (t == handler) {
@@ -85,6 +105,7 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
}
}
+err:
mutex_unlock(&tunnel6_mutex);
synchronize_net();
@@ -98,6 +119,24 @@ EXPORT_SYMBOL(xfrm6_tunnel_deregister);
handler != NULL; \
handler = rcu_dereference(handler->next)) \
+static int tunnelmpls6_rcv(struct sk_buff *skb)
+{
+ struct xfrm6_tunnel *handler;
+
+ if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
+ goto drop;
+
+ for_each_tunnel_rcu(tunnelmpls6_handlers, handler)
+ if (!handler->handler(skb))
+ return 0;
+
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+
static int tunnel6_rcv(struct sk_buff *skb)
{
struct xfrm6_tunnel *handler;
@@ -116,6 +155,33 @@ drop:
return 0;
}
+#if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL)
+static int tunnel6_rcv_cb(struct sk_buff *skb, u8 proto, int err)
+{
+ struct xfrm6_tunnel __rcu *head;
+ struct xfrm6_tunnel *handler;
+ int ret;
+
+ head = (proto == IPPROTO_IPV6) ? tunnel6_handlers : tunnel46_handlers;
+
+ for_each_tunnel_rcu(head, handler) {
+ if (handler->cb_handler) {
+ ret = handler->cb_handler(skb, err);
+ if (ret <= 0)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static const struct xfrm_input_afinfo tunnel6_input_afinfo = {
+ .family = AF_INET6,
+ .is_ipip = true,
+ .callback = tunnel6_rcv_cb,
+};
+#endif
+
static int tunnel46_rcv(struct sk_buff *skb)
{
struct xfrm6_tunnel *handler;
@@ -134,24 +200,40 @@ drop:
return 0;
}
-static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct xfrm6_tunnel *handler;
for_each_tunnel_rcu(tunnel6_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
-static void tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct xfrm6_tunnel *handler;
for_each_tunnel_rcu(tunnel46_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
- break;
+ return 0;
+
+ return -ENOENT;
+}
+
+static int tunnelmpls6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ struct xfrm6_tunnel *handler;
+
+ for_each_tunnel_rcu(tunnelmpls6_handlers, handler)
+ if (!handler->err_handler(skb, opt, type, code, offset, info))
+ return 0;
+
+ return -ENOENT;
}
static const struct inet6_protocol tunnel6_protocol = {
@@ -166,6 +248,12 @@ static const struct inet6_protocol tunnel46_protocol = {
.flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
};
+static const struct inet6_protocol tunnelmpls6_protocol = {
+ .handler = tunnelmpls6_rcv,
+ .err_handler = tunnelmpls6_err,
+ .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
+};
+
static int __init tunnel6_init(void)
{
if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) {
@@ -177,17 +265,42 @@ static int __init tunnel6_init(void)
inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6);
return -EAGAIN;
}
+ if (xfrm6_tunnel_mpls_supported() &&
+ inet6_add_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) {
+ pr_err("%s: can't add protocol\n", __func__);
+ inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6);
+ inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP);
+ return -EAGAIN;
+ }
+#if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL)
+ if (xfrm_input_register_afinfo(&tunnel6_input_afinfo)) {
+ pr_err("%s: can't add input afinfo\n", __func__);
+ inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6);
+ inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP);
+ if (xfrm6_tunnel_mpls_supported())
+ inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS);
+ return -EAGAIN;
+ }
+#endif
return 0;
}
static void __exit tunnel6_fini(void)
{
+#if IS_ENABLED(CONFIG_INET6_XFRM_TUNNEL)
+ if (xfrm_input_unregister_afinfo(&tunnel6_input_afinfo))
+ pr_err("%s: can't remove input afinfo\n", __func__);
+#endif
if (inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP))
pr_err("%s: can't remove protocol\n", __func__);
if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6))
pr_err("%s: can't remove protocol\n", __func__);
+ if (xfrm6_tunnel_mpls_supported() &&
+ inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS))
+ pr_err("%s: can't remove protocol\n", __func__);
}
module_init(tunnel6_init);
module_exit(tunnel6_fini);
+MODULE_DESCRIPTION("IP-in-IPv6 tunnel driver");
MODULE_LICENSE("GPL");
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 28c4aa5078fc..794c13674e8a 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* UDP over IPv6
* Linux INET6 implementation
@@ -14,13 +15,9 @@
* a single port at the same time.
* Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data
* YOSHIFUJI Hideaki @USAGI: convert /proc/net/udp6 to seq_file.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
+#include <linux/bpf-cgroup.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
@@ -36,6 +33,8 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+#include <linux/indirect_call_wrapper.h>
+#include <trace/events/udp.h>
#include <net/addrconf.h>
#include <net/ndisc.h>
@@ -43,38 +42,45 @@
#include <net/transp_v6.h>
#include <net/ip6_route.h>
#include <net/raw.h>
+#include <net/seg6.h>
#include <net/tcp_states.h>
#include <net/ip6_checksum.h>
+#include <net/ip6_tunnel.h>
+#include <net/udp_tunnel.h>
#include <net/xfrm.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
#include <net/busy_poll.h>
#include <net/sock_reuseport.h>
+#include <net/gro.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <trace/events/skb.h>
#include "udp_impl.h"
-static bool udp6_lib_exact_dif_match(struct net *net, struct sk_buff *skb)
+static void udpv6_destruct_sock(struct sock *sk)
{
-#if defined(CONFIG_NET_L3_MASTER_DEV)
- if (!net->ipv4.sysctl_udp_l3mdev_accept &&
- skb && ipv6_l3mdev_skb(IP6CB(skb)->flags))
- return true;
-#endif
- return false;
+ udp_destruct_common(sk);
+ inet6_sock_destruct(sk);
}
-static u32 udp6_ehashfn(const struct net *net,
- const struct in6_addr *laddr,
- const u16 lport,
- const struct in6_addr *faddr,
- const __be16 fport)
+int udpv6_init_sock(struct sock *sk)
{
- static u32 udp6_ehash_secret __read_mostly;
- static u32 udp_ipv6_hash_secret __read_mostly;
+ int res = udp_lib_init_sock(sk);
+
+ sk->sk_destruct = udpv6_destruct_sock;
+ set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
+ return res;
+}
+INDIRECT_CALLABLE_SCOPE
+u32 udp6_ehashfn(const struct net *net,
+ const struct in6_addr *laddr,
+ const u16 lport,
+ const struct in6_addr *faddr,
+ const __be16 fport)
+{
u32 lhash, fhash;
net_get_random_once(&udp6_ehash_secret,
@@ -86,7 +92,7 @@ static u32 udp6_ehashfn(const struct net *net,
fhash = __ipv6_addr_jhash(faddr, udp_ipv6_hash_secret);
return __inet6_ehashfn(lhash, lport, fhash, fport,
- udp_ipv6_hash_secret + net_hash_mix(net));
+ udp6_ehash_secret + net_hash_mix(net));
}
int udp_v6_get_port(struct sock *sk, unsigned short snum)
@@ -101,28 +107,43 @@ int udp_v6_get_port(struct sock *sk, unsigned short snum)
return udp_lib_get_port(sk, snum, hash2_nulladdr);
}
-static void udp_v6_rehash(struct sock *sk)
+void udp_v6_rehash(struct sock *sk)
{
u16 new_hash = ipv6_portaddr_hash(sock_net(sk),
&sk->sk_v6_rcv_saddr,
inet_sk(sk)->inet_num);
+ u16 new_hash4;
+
+ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) {
+ new_hash4 = udp_ehashfn(sock_net(sk),
+ sk->sk_rcv_saddr, sk->sk_num,
+ sk->sk_daddr, sk->sk_dport);
+ } else {
+ new_hash4 = udp6_ehashfn(sock_net(sk),
+ &sk->sk_v6_rcv_saddr, sk->sk_num,
+ &sk->sk_v6_daddr, sk->sk_dport);
+ }
- udp_lib_rehash(sk, new_hash);
+ udp_lib_rehash(sk, new_hash, new_hash4);
}
-static int compute_score(struct sock *sk, struct net *net,
+static int compute_score(struct sock *sk, const struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, unsigned short hnum,
- int dif, int sdif, bool exact_dif)
+ int dif, int sdif)
{
- int score;
+ int bound_dev_if, score;
struct inet_sock *inet;
+ bool dev_match;
if (!net_eq(sock_net(sk), net) ||
udp_sk(sk)->udp_port_hash != hnum ||
sk->sk_family != PF_INET6)
return -1;
+ if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
+ return -1;
+
score = 0;
inet = inet_sk(sk);
@@ -132,134 +153,255 @@ static int compute_score(struct sock *sk, struct net *net,
score++;
}
- if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
- if (!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, daddr))
- return -1;
- score++;
- }
-
if (!ipv6_addr_any(&sk->sk_v6_daddr)) {
if (!ipv6_addr_equal(&sk->sk_v6_daddr, saddr))
return -1;
score++;
}
- if (sk->sk_bound_dev_if || exact_dif) {
- bool dev_match = (sk->sk_bound_dev_if == dif ||
- sk->sk_bound_dev_if == sdif);
-
- if (!dev_match)
- return -1;
- if (sk->sk_bound_dev_if)
- score++;
- }
+ bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+ dev_match = udp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif);
+ if (!dev_match)
+ return -1;
+ if (bound_dev_if)
+ score++;
- if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
score++;
return score;
}
+/**
+ * udp6_lib_lookup1() - Simplified lookup using primary hash (destination port)
+ * @net: Network namespace
+ * @saddr: Source address, network order
+ * @sport: Source port, network order
+ * @daddr: Destination address, network order
+ * @hnum: Destination port, host order
+ * @dif: Destination interface index
+ * @sdif: Destination bridge port index, if relevant
+ * @udptable: Set of UDP hash tables
+ *
+ * Simplified lookup to be used as fallback if no sockets are found due to a
+ * potential race between (receive) address change, and lookup happening before
+ * the rehash operation. This function ignores SO_REUSEPORT groups while scoring
+ * result sockets, because if we have one, we don't need the fallback at all.
+ *
+ * Called under rcu_read_lock().
+ *
+ * Return: socket with highest matching score if any, NULL if none
+ */
+static struct sock *udp6_lib_lookup1(const struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr,
+ unsigned int hnum, int dif, int sdif,
+ const struct udp_table *udptable)
+{
+ unsigned int slot = udp_hashfn(net, hnum, udptable->mask);
+ struct udp_hslot *hslot = &udptable->hash[slot];
+ struct sock *sk, *result = NULL;
+ int score, badness = 0;
+
+ sk_for_each_rcu(sk, &hslot->head) {
+ score = compute_score(sk, net,
+ saddr, sport, daddr, hnum, dif, sdif);
+ if (score > badness) {
+ result = sk;
+ badness = score;
+ }
+ }
+
+ return result;
+}
+
/* called with rcu_read_lock() */
-static struct sock *udp6_lib_lookup2(struct net *net,
+static struct sock *udp6_lib_lookup2(const struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, unsigned int hnum,
- int dif, int sdif, bool exact_dif,
- struct udp_hslot *hslot2, struct sk_buff *skb)
+ int dif, int sdif, struct udp_hslot *hslot2,
+ struct sk_buff *skb)
{
struct sock *sk, *result;
int score, badness;
- u32 hash = 0;
+ bool need_rescore;
result = NULL;
badness = -1;
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
- score = compute_score(sk, net, saddr, sport,
- daddr, hnum, dif, sdif, exact_dif);
+ need_rescore = false;
+rescore:
+ score = compute_score(need_rescore ? result : sk, net, saddr,
+ sport, daddr, hnum, dif, sdif);
if (score > badness) {
- if (sk->sk_reuseport) {
- hash = udp6_ehashfn(net, daddr, hnum,
- saddr, sport);
-
- result = reuseport_select_sock(sk, hash, skb,
- sizeof(struct udphdr));
- if (result)
- return result;
- }
- result = sk;
badness = score;
+
+ if (need_rescore)
+ continue;
+
+ if (sk->sk_state == TCP_ESTABLISHED) {
+ result = sk;
+ continue;
+ }
+
+ result = inet6_lookup_reuseport(net, sk, skb, sizeof(struct udphdr),
+ saddr, sport, daddr, hnum, udp6_ehashfn);
+ if (!result) {
+ result = sk;
+ continue;
+ }
+
+ /* Fall back to scoring if group has connections */
+ if (!reuseport_has_conns(sk))
+ return result;
+
+ /* Reuseport logic returned an error, keep original score. */
+ if (IS_ERR(result))
+ continue;
+
+ /* compute_score is too long of a function to be
+ * inlined, and calling it again here yields
+ * measurable overhead for some
+ * workloads. Work around it by jumping
+ * backwards to rescore 'result'.
+ */
+ need_rescore = true;
+ goto rescore;
}
}
return result;
}
+#if IS_ENABLED(CONFIG_BASE_SMALL)
+static struct sock *udp6_lib_lookup4(const struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr,
+ unsigned int hnum, int dif, int sdif,
+ struct udp_table *udptable)
+{
+ return NULL;
+}
+
+static void udp6_hash4(struct sock *sk)
+{
+}
+#else /* !CONFIG_BASE_SMALL */
+static struct sock *udp6_lib_lookup4(const struct net *net,
+ const struct in6_addr *saddr, __be16 sport,
+ const struct in6_addr *daddr,
+ unsigned int hnum, int dif, int sdif,
+ struct udp_table *udptable)
+{
+ const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+ const struct hlist_nulls_node *node;
+ struct udp_hslot *hslot4;
+ unsigned int hash4, slot;
+ struct udp_sock *up;
+ struct sock *sk;
+
+ hash4 = udp6_ehashfn(net, daddr, hnum, saddr, sport);
+ slot = hash4 & udptable->mask;
+ hslot4 = &udptable->hash4[slot];
+
+begin:
+ udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) {
+ sk = (struct sock *)up;
+ if (inet6_match(net, sk, saddr, daddr, ports, dif, sdif))
+ return sk;
+ }
+
+ /* if the nulls value we got at the end of this lookup is not the
+ * expected one, we must restart lookup. We probably met an item that
+ * was moved to another chain due to rehash.
+ */
+ if (get_nulls_value(node) != slot)
+ goto begin;
+
+ return NULL;
+}
+
+static void udp6_hash4(struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+ unsigned int hash;
+
+ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) {
+ udp4_hash4(sk);
+ return;
+ }
+
+ if (sk_unhashed(sk) || ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+ return;
+
+ hash = udp6_ehashfn(net, &sk->sk_v6_rcv_saddr, sk->sk_num,
+ &sk->sk_v6_daddr, sk->sk_dport);
+
+ udp_lib_hash4(sk, hash);
+}
+#endif /* CONFIG_BASE_SMALL */
+
/* rcu_read_lock() must be held */
-struct sock *__udp6_lib_lookup(struct net *net,
+struct sock *__udp6_lib_lookup(const struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport,
int dif, int sdif, struct udp_table *udptable,
struct sk_buff *skb)
{
- struct sock *sk, *result;
unsigned short hnum = ntohs(dport);
- unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
- struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
- bool exact_dif = udp6_lib_exact_dif_match(net, skb);
- int score, badness;
- u32 hash = 0;
-
- if (hslot->count > 10) {
- hash2 = ipv6_portaddr_hash(net, daddr, hnum);
- slot2 = hash2 & udptable->mask;
- hslot2 = &udptable->hash2[slot2];
- if (hslot->count < hslot2->count)
- goto begin;
-
- result = udp6_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif, sdif, exact_dif,
- hslot2, skb);
- if (!result) {
- unsigned int old_slot2 = slot2;
- hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
- slot2 = hash2 & udptable->mask;
- /* avoid searching the same slot again. */
- if (unlikely(slot2 == old_slot2))
- return result;
-
- hslot2 = &udptable->hash2[slot2];
- if (hslot->count < hslot2->count)
- goto begin;
-
- result = udp6_lib_lookup2(net, saddr, sport,
- daddr, hnum, dif, sdif,
- exact_dif, hslot2,
- skb);
- }
- if (unlikely(IS_ERR(result)))
- return NULL;
- return result;
+ struct udp_hslot *hslot2;
+ struct sock *result, *sk;
+ unsigned int hash2;
+
+ hash2 = ipv6_portaddr_hash(net, daddr, hnum);
+ hslot2 = udp_hashslot2(udptable, hash2);
+
+ if (udp_has_hash4(hslot2)) {
+ result = udp6_lib_lookup4(net, saddr, sport, daddr, hnum,
+ dif, sdif, udptable);
+ if (result) /* udp6_lib_lookup4 return sk or NULL */
+ return result;
}
-begin:
- result = NULL;
- badness = -1;
- sk_for_each_rcu(sk, &hslot->head) {
- score = compute_score(sk, net, saddr, sport, daddr, hnum, dif,
- sdif, exact_dif);
- if (score > badness) {
- if (sk->sk_reuseport) {
- hash = udp6_ehashfn(net, daddr, hnum,
- saddr, sport);
- result = reuseport_select_sock(sk, hash, skb,
- sizeof(struct udphdr));
- if (unlikely(IS_ERR(result)))
- return NULL;
- if (result)
- return result;
- }
+
+ /* Lookup connected or non-wildcard sockets */
+ result = udp6_lib_lookup2(net, saddr, sport,
+ daddr, hnum, dif, sdif,
+ hslot2, skb);
+ if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED)
+ goto done;
+
+ /* Lookup redirect from BPF */
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
+ udptable == net->ipv4.udp_table) {
+ sk = inet6_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr),
+ saddr, sport, daddr, hnum, dif,
+ udp6_ehashfn);
+ if (sk) {
result = sk;
- badness = score;
+ goto done;
}
}
+
+ /* Got non-wildcard socket or error on first lookup */
+ if (result)
+ goto done;
+
+ /* Lookup wildcard sockets */
+ hash2 = ipv6_portaddr_hash(net, &in6addr_any, hnum);
+ hslot2 = udp_hashslot2(udptable, hash2);
+
+ result = udp6_lib_lookup2(net, saddr, sport,
+ &in6addr_any, hnum, dif, sdif,
+ hslot2, skb);
+ if (!IS_ERR_OR_NULL(result))
+ goto done;
+
+ /* Cover address change/lookup/rehash race: see __udp4_lib_lookup() */
+ result = udp6_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif,
+ udptable);
+
+done:
+ if (IS_ERR(result))
+ return NULL;
return result;
}
EXPORT_SYMBOL_GPL(__udp6_lib_lookup);
@@ -275,28 +417,32 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
inet6_sdif(skb), udptable, skb);
}
-struct sock *udp6_lib_lookup_skb(struct sk_buff *skb,
+struct sock *udp6_lib_lookup_skb(const struct sk_buff *skb,
__be16 sport, __be16 dport)
{
- const struct ipv6hdr *iph = ipv6_hdr(skb);
+ const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+ const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset);
+ struct net *net = dev_net(skb->dev);
+ int iif, sdif;
- return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
- &iph->daddr, dport, inet6_iif(skb),
- inet6_sdif(skb), &udp_table, skb);
+ inet6_get_iif_sdif(skb, &iif, &sdif);
+
+ return __udp6_lib_lookup(net, &iph->saddr, sport,
+ &iph->daddr, dport, iif,
+ sdif, net->ipv4.udp_table, NULL);
}
-EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
/* Must be called under rcu_read_lock().
* Does increment socket refcount.
*/
#if IS_ENABLED(CONFIG_NF_TPROXY_IPV6) || IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
-struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
+struct sock *udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport, int dif)
{
struct sock *sk;
sk = __udp6_lib_lookup(net, saddr, sport, daddr, dport,
- dif, 0, &udp_table, NULL);
+ dif, 0, net->ipv4.udp_table, NULL);
if (sk && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
return sk;
@@ -304,7 +450,7 @@ struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be
EXPORT_SYMBOL_GPL(udp6_lib_lookup);
#endif
-/* do not use the scratch area len for jumbogram: their length execeeds the
+/* do not use the scratch area len for jumbogram: their length exceeds the
* scratch area space; note that the IP6CB flags is still in the first
* cacheline, so checking for jumbograms is cheap
*/
@@ -319,28 +465,27 @@ static int udp6_skb_len(struct sk_buff *skb)
*/
int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int noblock, int flags, int *addr_len)
+ int flags, int *addr_len)
{
struct ipv6_pinfo *np = inet6_sk(sk);
struct inet_sock *inet = inet_sk(sk);
struct sk_buff *skb;
unsigned int ulen, copied;
- int peeked, peeking, off;
- int err;
+ int off, err, peeking = flags & MSG_PEEK;
int is_udplite = IS_UDPLITE(sk);
+ struct udp_mib __percpu *mib;
bool checksum_valid = false;
int is_udp4;
if (flags & MSG_ERRQUEUE)
return ipv6_recv_error(sk, msg, len, addr_len);
- if (np->rxpmtu && np->rxopt.bits.rxpmtu)
+ if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu))
return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
try_again:
- peeking = flags & MSG_PEEK;
off = sk_peek_offset(sk, flags);
- skb = __skb_recv_udp(sk, flags, noblock, &peeked, &off, &err);
+ skb = __skb_recv_udp(sk, flags, &off, &err);
if (!skb)
return err;
@@ -352,6 +497,7 @@ try_again:
msg->msg_flags |= MSG_TRUNC;
is_udp4 = (skb->protocol == htons(ETH_P_IP));
+ mib = __UDPX_MIB(sk, is_udp4);
/*
* If checksum is needed at all, try to do it while copying the
@@ -378,28 +524,17 @@ try_again:
goto csum_copy_err;
}
if (unlikely(err)) {
- if (!peeked) {
- atomic_inc(&sk->sk_drops);
- if (is_udp4)
- UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
- is_udplite);
- else
- UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
- is_udplite);
+ if (!peeking) {
+ udp_drops_inc(sk);
+ SNMP_INC_STATS(mib, UDP_MIB_INERRORS);
}
kfree_skb(skb);
return err;
}
- if (!peeked) {
- if (is_udp4)
- UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS,
- is_udplite);
- else
- UDP6_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS,
- is_udplite);
- }
+ if (!peeking)
+ SNMP_INC_STATS(mib, UDP_MIB_INDATAGRAMS);
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
/* Copy the address. */
if (msg->msg_name) {
@@ -419,13 +554,20 @@ try_again:
inet6_iif(skb));
}
*addr_len = sizeof(*sin6);
+
+ BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
+ (struct sockaddr *)sin6,
+ addr_len);
}
+ if (udp_test_bit(GRO_ENABLED, sk))
+ udp_cmsg_recv(msg, sk, skb);
+
if (np->rxopt.all)
ip6_datagram_recv_common_ctl(sk, msg, skb);
if (is_udp4) {
- if (inet->cmsg_flags)
+ if (inet_cmsg_flags(inet))
ip_cmsg_recv_offset(msg, sk, skb,
sizeof(struct udphdr), off);
} else {
@@ -443,19 +585,10 @@ try_again:
csum_copy_err:
if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags,
udp_skb_destructor)) {
- if (is_udp4) {
- UDP_INC_STATS(sock_net(sk),
- UDP_MIB_CSUMERRORS, is_udplite);
- UDP_INC_STATS(sock_net(sk),
- UDP_MIB_INERRORS, is_udplite);
- } else {
- UDP6_INC_STATS(sock_net(sk),
- UDP_MIB_CSUMERRORS, is_udplite);
- UDP6_INC_STATS(sock_net(sk),
- UDP_MIB_INERRORS, is_udplite);
- }
+ SNMP_INC_STATS(mib, UDP_MIB_CSUMERRORS);
+ SNMP_INC_STATS(mib, UDP_MIB_INERRORS);
}
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
/* starting over for a new packet, but check if we need to yield */
cond_resched();
@@ -463,26 +596,146 @@ csum_copy_err:
goto try_again;
}
-void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
- u8 type, u8 code, int offset, __be32 info,
- struct udp_table *udptable)
+DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
+void udpv6_encap_enable(void)
+{
+ static_branch_inc(&udpv6_encap_needed_key);
+}
+EXPORT_SYMBOL(udpv6_encap_enable);
+
+/* Handler for tunnels with arbitrary destination ports: no socket lookup, go
+ * through error handlers in encapsulations looking for a match.
+ */
+static int __udp6_lib_err_encap_no_sk(struct sk_buff *skb,
+ struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ int i;
+
+ for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) {
+ int (*handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info);
+ const struct ip6_tnl_encap_ops *encap;
+
+ encap = rcu_dereference(ip6tun_encaps[i]);
+ if (!encap)
+ continue;
+ handler = encap->err_handler;
+ if (handler && !handler(skb, opt, type, code, offset, info))
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force the
+ * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that
+ * lwtunnels might actually break this assumption by being configured with
+ * different destination ports on endpoints, in this case we won't be able to
+ * trace ICMP messages back to them.
+ *
+ * If this doesn't match any socket, probe tunnels with arbitrary destination
+ * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port
+ * we've sent packets to won't necessarily match the local destination port.
+ *
+ * Then ask the tunnel implementation to match the error against a valid
+ * association.
+ *
+ * Return an error if we can't find a match, the socket if we need further
+ * processing, zero otherwise.
+ */
+static struct sock *__udp6_lib_err_encap(struct net *net,
+ const struct ipv6hdr *hdr, int offset,
+ struct udphdr *uh,
+ struct udp_table *udptable,
+ struct sock *sk,
+ struct sk_buff *skb,
+ struct inet6_skb_parm *opt,
+ u8 type, u8 code, __be32 info)
+{
+ int (*lookup)(struct sock *sk, struct sk_buff *skb);
+ int network_offset, transport_offset;
+ struct udp_sock *up;
+
+ network_offset = skb_network_offset(skb);
+ transport_offset = skb_transport_offset(skb);
+
+ /* Network header needs to point to the outer IPv6 header inside ICMP */
+ skb_reset_network_header(skb);
+
+ /* Transport header needs to point to the UDP header */
+ skb_set_transport_header(skb, offset);
+
+ if (sk) {
+ up = udp_sk(sk);
+
+ lookup = READ_ONCE(up->encap_err_lookup);
+ if (lookup && lookup(sk, skb))
+ sk = NULL;
+
+ goto out;
+ }
+
+ sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source,
+ &hdr->saddr, uh->dest,
+ inet6_iif(skb), 0, udptable, skb);
+ if (sk) {
+ up = udp_sk(sk);
+
+ lookup = READ_ONCE(up->encap_err_lookup);
+ if (!lookup || lookup(sk, skb))
+ sk = NULL;
+ }
+
+out:
+ if (!sk) {
+ sk = ERR_PTR(__udp6_lib_err_encap_no_sk(skb, opt, type, code,
+ offset, info));
+ }
+
+ skb_set_transport_header(skb, transport_offset);
+ skb_set_network_header(skb, network_offset);
+
+ return sk;
+}
+
+int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info,
+ struct udp_table *udptable)
{
struct ipv6_pinfo *np;
const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data;
const struct in6_addr *saddr = &hdr->saddr;
- const struct in6_addr *daddr = &hdr->daddr;
+ const struct in6_addr *daddr = seg6_get_daddr(skb, opt) ? : &hdr->daddr;
struct udphdr *uh = (struct udphdr *)(skb->data+offset);
+ bool tunnel = false;
struct sock *sk;
int harderr;
int err;
struct net *net = dev_net(skb->dev);
sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
- inet6_iif(skb), 0, udptable, skb);
- if (!sk) {
- __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
- ICMP6_MIB_INERRORS);
- return;
+ inet6_iif(skb), inet6_sdif(skb), udptable, NULL);
+
+ if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) {
+ /* No socket for error: try tunnels before discarding */
+ if (static_branch_unlikely(&udpv6_encap_needed_key)) {
+ sk = __udp6_lib_err_encap(net, hdr, offset, uh,
+ udptable, sk, skb,
+ opt, type, code, info);
+ if (!sk)
+ return 0;
+ } else
+ sk = ERR_PTR(-ENOENT);
+
+ if (IS_ERR(sk)) {
+ __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
+ ICMP6_MIB_INERRORS);
+ return PTR_ERR(sk);
+ }
+
+ tunnel = true;
}
harderr = icmpv6_err_convert(type, code, &err);
@@ -492,15 +745,29 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
if (!ip6_sk_accept_pmtu(sk))
goto out;
ip6_sk_update_pmtu(skb, sk, info);
- if (np->pmtudisc != IPV6_PMTUDISC_DONT)
+ if (READ_ONCE(np->pmtudisc) != IPV6_PMTUDISC_DONT)
harderr = 1;
}
if (type == NDISC_REDIRECT) {
- ip6_sk_redirect(skb, sk);
+ if (tunnel) {
+ ip6_redirect(skb, sock_net(sk), inet6_iif(skb),
+ READ_ONCE(sk->sk_mark),
+ sk_uid(sk));
+ } else {
+ ip6_sk_redirect(skb, sk);
+ }
+ goto out;
+ }
+
+ /* Tunnels don't have an application socket: don't pass errors back */
+ if (tunnel) {
+ if (udp_sk(sk)->encap_err_rcv)
+ udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest,
+ ntohl(info), (u8 *)(uh+1));
goto out;
}
- if (!np->recverr) {
+ if (!inet6_test_bit(RECVERR6, sk)) {
if (!harderr || sk->sk_state != TCP_ESTABLISHED)
goto out;
} else {
@@ -508,9 +775,9 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
}
sk->sk_err = err;
- sk->sk_error_report(sk);
+ sk_error_report(sk);
out:
- return;
+ return 0;
}
static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -528,42 +795,49 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
rc = __udp_enqueue_schedule_skb(sk, skb);
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
+ enum skb_drop_reason drop_reason;
/* Note that an ENOMEM error is charged twice */
- if (rc == -ENOMEM)
+ if (rc == -ENOMEM) {
UDP6_INC_STATS(sock_net(sk),
UDP_MIB_RCVBUFERRORS, is_udplite);
+ drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
+ } else {
+ UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_MEMERRORS, is_udplite);
+ drop_reason = SKB_DROP_REASON_PROTO_MEM;
+ }
UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- kfree_skb(skb);
+ trace_udp_fail_queue_rcv_skb(rc, sk, skb);
+ sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
}
return 0;
}
-static __inline__ void udpv6_err(struct sk_buff *skb,
- struct inet6_skb_parm *opt, u8 type,
- u8 code, int offset, __be32 info)
-{
- __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
-}
-
-static DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
-void udpv6_encap_enable(void)
+static __inline__ int udpv6_err(struct sk_buff *skb,
+ struct inet6_skb_parm *opt, u8 type,
+ u8 code, int offset, __be32 info)
{
- static_branch_enable(&udpv6_encap_needed_key);
+ return __udp6_lib_err(skb, opt, type, code, offset, info,
+ dev_net(skb->dev)->ipv4.udp_table);
}
-EXPORT_SYMBOL(udpv6_encap_enable);
-static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct udp_sock *up = udp_sk(sk);
int is_udplite = IS_UDPLITE(sk);
- if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
+ if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
+ drop_reason = SKB_DROP_REASON_XFRM_POLICY;
goto drop;
+ }
+ nf_reset_ct(skb);
- if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
+ if (static_branch_unlikely(&udpv6_encap_needed_key) &&
+ READ_ONCE(up->encap_type)) {
int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
/*
@@ -588,9 +862,9 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
ret = encap_rcv(sk, skb);
if (ret <= 0) {
- __UDP_INC_STATS(sock_net(sk),
- UDP_MIB_INDATAGRAMS,
- is_udplite);
+ __UDP6_INC_STATS(sock_net(sk),
+ UDP_MIB_INDATAGRAMS,
+ is_udplite);
return -ret;
}
}
@@ -601,16 +875,17 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
/*
* UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c).
*/
- if ((is_udplite & UDPLITE_RECV_CC) && UDP_SKB_CB(skb)->partial_cov) {
+ if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
+ u16 pcrlen = READ_ONCE(up->pcrlen);
- if (up->pcrlen == 0) { /* full coverage was set */
+ if (pcrlen == 0) { /* full coverage was set */
net_dbg_ratelimited("UDPLITE6: partial coverage %d while full coverage %d requested\n",
UDP_SKB_CB(skb)->cscov, skb->len);
goto drop;
}
- if (UDP_SKB_CB(skb)->cscov < up->pcrlen) {
+ if (UDP_SKB_CB(skb)->cscov < pcrlen) {
net_dbg_ratelimited("UDPLITE6: coverage %d too small, need min %d\n",
- UDP_SKB_CB(skb)->cscov, up->pcrlen);
+ UDP_SKB_CB(skb)->cscov, pcrlen);
goto drop;
}
}
@@ -620,7 +895,7 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
udp_lib_checksum_complete(skb))
goto csum_error;
- if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr)))
+ if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason))
goto drop;
udp_csum_pull_header(skb);
@@ -630,20 +905,43 @@ static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
return __udpv6_queue_rcv_skb(sk, skb);
csum_error:
+ drop_reason = SKB_DROP_REASON_UDP_CSUM;
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- atomic_inc(&sk->sk_drops);
- kfree_skb(skb);
+ udp_drops_inc(sk);
+ sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
}
-static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
+static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff *next, *segs;
+ int ret;
+
+ if (likely(!udp_unexpected_gso(sk, skb)))
+ return udpv6_queue_rcv_one_skb(sk, skb);
+
+ __skb_push(skb, -skb_mac_offset(skb));
+ segs = udp_rcv_segment(sk, skb, false);
+ skb_list_walk_safe(segs, skb, next) {
+ __skb_pull(skb, skb_transport_offset(skb));
+
+ udp_post_segment_fix_csum(skb);
+ ret = udpv6_queue_rcv_one_skb(sk, skb);
+ if (ret > 0)
+ ip6_protocol_deliver_rcu(dev_net(skb->dev), skb, ret,
+ true);
+ }
+ return 0;
+}
+
+static bool __udp_v6_is_mcast_sock(struct net *net, const struct sock *sk,
__be16 loc_port, const struct in6_addr *loc_addr,
__be16 rmt_port, const struct in6_addr *rmt_addr,
- int dif, unsigned short hnum)
+ int dif, int sdif, unsigned short hnum)
{
- struct inet_sock *inet = inet_sk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
if (!net_eq(sock_net(sk), net))
return false;
@@ -653,7 +951,7 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
(inet->inet_dport && inet->inet_dport != rmt_port) ||
(!ipv6_addr_any(&sk->sk_v6_daddr) &&
!ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
- (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) ||
+ !udp_sk_bound_dev_eq(net, READ_ONCE(sk->sk_bound_dev_if), dif, sdif) ||
(!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
!ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)))
return false;
@@ -687,6 +985,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
unsigned int offset = offsetof(typeof(*sk), sk_node);
unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
int dif = inet6_iif(skb);
+ int sdif = inet6_sdif(skb);
struct hlist_node *node;
struct sk_buff *nskb;
@@ -695,18 +994,19 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
udptable->mask;
hash2 = ipv6_portaddr_hash(net, daddr, hnum) & udptable->mask;
start_lookup:
- hslot = &udptable->hash2[hash2];
+ hslot = &udptable->hash2[hash2].hslot;
offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
}
sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
if (!__udp_v6_is_mcast_sock(net, sk, uh->dest, daddr,
- uh->source, saddr, dif, hnum))
+ uh->source, saddr, dif, sdif,
+ hnum))
continue;
/* If zero checksum and no_check is not on for
* the socket then skip it.
*/
- if (!uh->check && !udp_sk(sk)->no_check6_rx)
+ if (!uh->check && !udp_get_no_check6_rx(sk))
continue;
if (!first) {
first = sk;
@@ -714,7 +1014,7 @@ start_lookup:
}
nskb = skb_clone(skb, GFP_ATOMIC);
if (unlikely(!nskb)) {
- atomic_inc(&sk->sk_drops);
+ udp_drops_inc(sk);
__UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
IS_UDPLITE(sk));
__UDP6_INC_STATS(net, UDP_MIB_INERRORS,
@@ -745,14 +1045,11 @@ start_lookup:
static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
{
- if (udp_sk_rx_dst_set(sk, dst)) {
- const struct rt6_info *rt = (const struct rt6_info *)dst;
-
- inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
- }
+ if (udp_sk_rx_dst_set(sk, dst))
+ sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst));
}
-/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and
+/* wrapper for udp_queue_rcv_skb taking care of csum conversion and
* return code conversion for ip layer consumption
*/
static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
@@ -761,26 +1058,25 @@ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
int ret;
if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
- skb_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
- ip6_compute_pseudo);
+ skb_checksum_try_convert(skb, IPPROTO_UDP, ip6_compute_pseudo);
ret = udpv6_queue_rcv_skb(sk, skb);
- /* a return value > 0 means to resubmit the input, but
- * it wants the return to be -protocol, or 0
- */
+ /* a return value > 0 means to resubmit the input */
if (ret > 0)
- return -ret;
+ return ret;
return 0;
}
int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int proto)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
const struct in6_addr *saddr, *daddr;
struct net *net = dev_net(skb->dev);
+ struct sock *sk = NULL;
struct udphdr *uh;
- struct sock *sk;
+ bool refcounted;
u32 ulen = 0;
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
@@ -817,21 +1113,27 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
goto csum_error;
/* Check if the socket is already available, e.g. due to early demux */
- sk = skb_steal_sock(skb);
+ sk = inet6_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest,
+ &refcounted, udp6_ehashfn);
+ if (IS_ERR(sk))
+ goto no_sk;
+
if (sk) {
struct dst_entry *dst = skb_dst(skb);
int ret;
- if (unlikely(sk->sk_rx_dst != dst))
+ if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
udp6_sk_rx_dst_set(sk, dst);
- if (!uh->check && !udp_sk(sk)->no_check6_rx) {
- sock_put(sk);
+ if (!uh->check && !udp_get_no_check6_rx(sk)) {
+ if (refcounted)
+ sock_put(sk);
goto report_csum_error;
}
ret = udp6_unicast_rcv_skb(sk, skb, uh);
- sock_put(sk);
+ if (refcounted)
+ sock_put(sk);
return ret;
}
@@ -845,16 +1147,19 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
/* Unicast */
sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
if (sk) {
- if (!uh->check && !udp_sk(sk)->no_check6_rx)
+ if (!uh->check && !udp_get_no_check6_rx(sk))
goto report_csum_error;
return udp6_unicast_rcv_skb(sk, skb, uh);
}
+no_sk:
+ reason = SKB_DROP_REASON_NO_SOCKET;
if (!uh->check)
goto report_csum_error;
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard;
+ nf_reset_ct(skb);
if (udp_lib_checksum_complete(skb))
goto csum_error;
@@ -862,10 +1167,12 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
__UDP6_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
- kfree_skb(skb);
+ sk_skb_reason_drop(sk, skb, reason);
return 0;
short_packet:
+ if (reason == SKB_DROP_REASON_NOT_SPECIFIED)
+ reason = SKB_DROP_REASON_PKT_TOO_SMALL;
net_dbg_ratelimited("UDP%sv6: short packet: From [%pI6c]:%u %d/%d to [%pI6c]:%u\n",
proto == IPPROTO_UDPLITE ? "-Lite" : "",
saddr, ntohs(uh->source),
@@ -876,10 +1183,12 @@ short_packet:
report_csum_error:
udp6_csum_zero_error(skb);
csum_error:
+ if (reason == SKB_DROP_REASON_NOT_SPECIFIED)
+ reason = SKB_DROP_REASON_UDP_CSUM;
__UDP6_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE);
discard:
__UDP6_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE);
- kfree_skb(skb);
+ sk_skb_reason_drop(sk, skb, reason);
return 0;
}
@@ -889,16 +1198,20 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
__be16 rmt_port, const struct in6_addr *rmt_addr,
int dif, int sdif)
{
+ struct udp_table *udptable = net->ipv4.udp_table;
unsigned short hnum = ntohs(loc_port);
- unsigned int hash2 = ipv6_portaddr_hash(net, loc_addr, hnum);
- unsigned int slot2 = hash2 & udp_table.mask;
- struct udp_hslot *hslot2 = &udp_table.hash2[slot2];
- const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum);
+ struct udp_hslot *hslot2;
+ unsigned int hash2;
+ __portpair ports;
struct sock *sk;
+ hash2 = ipv6_portaddr_hash(net, loc_addr, hnum);
+ hslot2 = udp_hashslot2(udptable, hash2);
+ ports = INET_COMBINED_PORTS(rmt_port, hnum);
+
udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
if (sk->sk_state == TCP_ESTABLISHED &&
- INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif, sdif))
+ inet6_match(net, sk, rmt_addr, loc_addr, ports, dif, sdif))
return sk;
/* Only check first socket in chain */
break;
@@ -906,7 +1219,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
return NULL;
}
-static void udp_v6_early_demux(struct sk_buff *skb)
+void udp_v6_early_demux(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
const struct udphdr *uh;
@@ -929,15 +1242,16 @@ static void udp_v6_early_demux(struct sk_buff *skb)
else
return;
- if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt))
+ if (!sk)
return;
skb->sk = sk;
- skb->destructor = sock_efree;
- dst = READ_ONCE(sk->sk_rx_dst);
+ DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk));
+ skb->destructor = sock_pfree;
+ dst = rcu_dereference(sk->sk_rx_dst);
if (dst)
- dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
+ dst = dst_check(dst, sk->sk_rx_dst_cookie);
if (dst) {
/* set noref for now.
* any place which wants to hold dst has to call
@@ -947,9 +1261,9 @@ static void udp_v6_early_demux(struct sk_buff *skb)
}
}
-static __inline__ int udpv6_rcv(struct sk_buff *skb)
+INDIRECT_CALLABLE_SCOPE int udpv6_rcv(struct sk_buff *skb)
{
- return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP);
+ return __udp6_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP);
}
/*
@@ -963,20 +1277,22 @@ static void udp_v6_flush_pending_frames(struct sock *sk)
udp_flush_pending_frames(sk);
else if (up->pending) {
up->len = 0;
- up->pending = 0;
+ WRITE_ONCE(up->pending, 0);
ip6_flush_pending_frames(sk);
}
}
-static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+static int udpv6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
int addr_len)
{
+ if (addr_len < offsetofend(struct sockaddr, sa_family))
+ return -EINVAL;
/* The following checks are replicated from __ip6_datagram_connect()
* and intended to prevent BPF program called below from accessing
* bytes that are out of the bound specified by user in addr_len.
*/
if (uaddr->sa_family == AF_INET) {
- if (__ipv6_only_sock(sk))
+ if (ipv6_only_sock(sk))
return -EAFNOSUPPORT;
return udp_pre_connect(sk, uaddr, addr_len);
}
@@ -984,7 +1300,20 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
if (addr_len < SIN6_LEN_RFC2133)
return -EINVAL;
- return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr);
+ return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len);
+}
+
+static int udpv6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
+{
+ int res;
+
+ lock_sock(sk);
+ res = __ip6_datagram_connect(sk, uaddr, addr_len);
+ if (!res)
+ udp6_hash4(sk);
+ release_sock(sk);
+ return res;
}
/**
@@ -992,6 +1321,9 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
* @sk: socket we are sending on
* @skb: sk_buff containing the filled-in UDP header
* (checksum field must be zeroed out)
+ * @saddr: source address
+ * @daddr: destination address
+ * @len: length of packet
*/
static void udp6_hwcsum_outgoing(struct sock *sk, struct sk_buff *skb,
const struct in6_addr *saddr,
@@ -1044,6 +1376,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
__wsum csum = 0;
int offset = skb_transport_offset(skb);
int len = skb->len - offset;
+ int datalen = len - sizeof(*uh);
/*
* Create a UDP header
@@ -1058,24 +1391,37 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
const int hlen = skb_network_header_len(skb) +
sizeof(struct udphdr);
- if (hlen + cork->gso_size > cork->fragsize)
- return -EINVAL;
- if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS)
+ if (hlen + min(datalen, cork->gso_size) > cork->fragsize) {
+ kfree_skb(skb);
+ return -EMSGSIZE;
+ }
+ if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) {
+ kfree_skb(skb);
return -EINVAL;
- if (udp_sk(sk)->no_check6_tx)
+ }
+ if (udp_get_no_check6_tx(sk)) {
+ kfree_skb(skb);
return -EINVAL;
- if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite ||
- dst_xfrm(skb_dst(skb)))
+ }
+ if (is_udplite || dst_xfrm(skb_dst(skb))) {
+ kfree_skb(skb);
return -EIO;
+ }
+
+ if (datalen > cork->gso_size) {
+ skb_shinfo(skb)->gso_size = cork->gso_size;
+ skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
+ skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen,
+ cork->gso_size);
- skb_shinfo(skb)->gso_size = cork->gso_size;
- skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4;
- goto csum_partial;
+ /* Don't checksum the payload, skb will get segmented */
+ goto csum_partial;
+ }
}
if (is_udplite)
csum = udplite_csum(skb);
- else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */
+ else if (udp_get_no_check6_tx(sk)) { /* UDP csum disabled */
skb->ip_summed = CHECKSUM_NONE;
goto send;
} else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
@@ -1094,7 +1440,7 @@ csum_partial:
send:
err = ip6_send_skb(skb);
if (err) {
- if (err == -ENOBUFS && !inet6_sk(sk)->recverr) {
+ if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) {
UDP6_INC_STATS(sock_net(sk),
UDP_MIB_SNDBUFERRORS, is_udplite);
err = 0;
@@ -1110,26 +1456,20 @@ static int udp_v6_push_pending_frames(struct sock *sk)
{
struct sk_buff *skb;
struct udp_sock *up = udp_sk(sk);
- struct flowi6 fl6;
int err = 0;
if (up->pending == AF_INET)
return udp_push_pending_frames(sk);
- /* ip6_finish_skb will release the cork, so make a copy of
- * fl6 here.
- */
- fl6 = inet_sk(sk)->cork.fl.u.ip6;
-
skb = ip6_finish_skb(sk);
if (!skb)
goto out;
- err = udp_v6_send_skb(skb, &fl6, &inet_sk(sk)->cork.base);
-
+ err = udp_v6_send_skb(skb, &inet_sk(sk)->cork.fl.u.ip6,
+ &inet_sk(sk)->cork.base);
out:
up->len = 0;
- up->pending = 0;
+ WRITE_ONCE(up->pending, 0);
return err;
}
@@ -1144,20 +1484,20 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct ipv6_txoptions *opt = NULL;
struct ipv6_txoptions *opt_to_free = NULL;
struct ip6_flowlabel *flowlabel = NULL;
- struct flowi6 fl6;
+ struct inet_cork_full cork;
+ struct flowi6 *fl6 = &cork.fl.u.ip6;
struct dst_entry *dst;
struct ipcm6_cookie ipc6;
int addr_len = msg->msg_namelen;
bool connected = false;
int ulen = len;
- int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
+ int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
int err;
int is_udplite = IS_UDPLITE(sk);
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
- ipcm6_init(&ipc6);
- ipc6.gso_size = up->gso_size;
- ipc6.sockc.tsflags = sk->sk_tsflags;
+ ipcm6_init_sk(&ipc6, sk);
+ ipc6.gso_size = READ_ONCE(up->gso_size);
/* destination address check */
if (sin6) {
@@ -1184,7 +1524,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
default:
return -EINVAL;
}
- } else if (!up->pending) {
+ } else if (!READ_ONCE(up->pending)) {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
daddr = &sk->sk_v6_daddr;
@@ -1200,15 +1540,14 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
msg->msg_name = &sin;
msg->msg_namelen = sizeof(sin);
do_udp_sendmsg:
- if (__ipv6_only_sock(sk))
- return -ENETUNREACH;
- return udp_sendmsg(sk, msg, len);
+ err = ipv6_only_sock(sk) ?
+ -ENETUNREACH : udp_sendmsg(sk, msg, len);
+ msg->msg_name = sin6;
+ msg->msg_namelen = addr_len;
+ return err;
}
}
- if (up->pending == AF_INET)
- return udp_sendmsg(sk, msg, len);
-
/* Rough check on arithmetic overflow,
better check is made in ip6_append_data().
*/
@@ -1216,7 +1555,9 @@ do_udp_sendmsg:
return -EMSGSIZE;
getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag;
- if (up->pending) {
+ if (READ_ONCE(up->pending)) {
+ if (READ_ONCE(up->pending) == AF_INET)
+ return udp_sendmsg(sk, msg, len);
/*
* There are pending frames.
* The socket lock must be held while it's corked.
@@ -1234,20 +1575,20 @@ do_udp_sendmsg:
}
ulen += sizeof(struct udphdr);
- memset(&fl6, 0, sizeof(fl6));
+ memset(fl6, 0, sizeof(*fl6));
if (sin6) {
if (sin6->sin6_port == 0)
return -EINVAL;
- fl6.fl6_dport = sin6->sin6_port;
+ fl6->fl6_dport = sin6->sin6_port;
daddr = &sin6->sin6_addr;
- if (np->sndflow) {
- fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
- if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
- flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (!flowlabel)
+ if (inet6_test_bit(SNDFLOW, sk)) {
+ fl6->flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK;
+ if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) {
+ flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
}
@@ -1263,25 +1604,24 @@ do_udp_sendmsg:
if (addr_len >= sizeof(struct sockaddr_in6) &&
sin6->sin6_scope_id &&
__ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr)))
- fl6.flowi6_oif = sin6->sin6_scope_id;
+ fl6->flowi6_oif = sin6->sin6_scope_id;
} else {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
- fl6.fl6_dport = inet->inet_dport;
+ fl6->fl6_dport = inet->inet_dport;
daddr = &sk->sk_v6_daddr;
- fl6.flowlabel = np->flow_label;
+ fl6->flowlabel = np->flow_label;
connected = true;
}
- if (!fl6.flowi6_oif)
- fl6.flowi6_oif = sk->sk_bound_dev_if;
+ if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = READ_ONCE(sk->sk_bound_dev_if);
- if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
+ if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
- fl6.flowi6_mark = sk->sk_mark;
- fl6.flowi6_uid = sk->sk_uid;
+ fl6->flowi6_uid = sk_uid(sk);
if (msg->msg_controllen) {
opt = &opt_space;
@@ -1290,21 +1630,22 @@ do_udp_sendmsg:
ipc6.opt = opt;
err = udp_cmsg_send(sk, msg, &ipc6.gso_size);
- if (err > 0)
- err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6,
+ if (err > 0) {
+ err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, fl6,
&ipc6);
+ connected = false;
+ }
if (err < 0) {
fl6_sock_release(flowlabel);
return err;
}
- if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
- flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (!flowlabel)
+ if ((fl6->flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) {
+ flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
if (!(opt->opt_nflen|opt->opt_flen))
opt = NULL;
- connected = false;
}
if (!opt) {
opt = txopt_get(np);
@@ -1315,18 +1656,18 @@ do_udp_sendmsg:
opt = ipv6_fixup_options(&opt_space, opt);
ipc6.opt = opt;
- fl6.flowi6_proto = sk->sk_protocol;
- if (!ipv6_addr_any(daddr))
- fl6.daddr = *daddr;
- else
- fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
- if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr))
- fl6.saddr = np->saddr;
- fl6.fl6_sport = inet->inet_sport;
+ fl6->flowi6_proto = sk->sk_protocol;
+ fl6->flowi6_mark = ipc6.sockc.mark;
+ fl6->daddr = *daddr;
+ if (ipv6_addr_any(&fl6->saddr) && !ipv6_addr_any(&np->saddr))
+ fl6->saddr = np->saddr;
+ fl6->fl6_sport = inet->inet_sport;
- if (cgroup_bpf_enabled && !connected) {
+ if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
- (struct sockaddr *)sin6, &fl6.saddr);
+ (struct sockaddr *)sin6,
+ &addr_len,
+ &fl6->saddr);
if (err)
goto out_no_dst;
if (sin6) {
@@ -1342,29 +1683,29 @@ do_udp_sendmsg:
err = -EINVAL;
goto out_no_dst;
}
- fl6.fl6_dport = sin6->sin6_port;
- fl6.daddr = sin6->sin6_addr;
+ fl6->fl6_dport = sin6->sin6_port;
+ fl6->daddr = sin6->sin6_addr;
}
}
- final_p = fl6_update_dst(&fl6, opt, &final);
+ if (ipv6_addr_any(&fl6->daddr))
+ fl6->daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */
+
+ final_p = fl6_update_dst(fl6, opt, &final);
if (final_p)
connected = false;
- if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) {
- fl6.flowi6_oif = np->mcast_oif;
+ if (!fl6->flowi6_oif && ipv6_addr_is_multicast(&fl6->daddr)) {
+ fl6->flowi6_oif = READ_ONCE(np->mcast_oif);
connected = false;
- } else if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->ucast_oif;
-
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ } else if (!fl6->flowi6_oif)
+ fl6->flowi6_oif = READ_ONCE(np->ucast_oif);
- if (ipc6.tclass < 0)
- ipc6.tclass = np->tclass;
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
- fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
+ fl6->flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6->flowlabel);
- dst = ip6_sk_dst_lookup_flow(sk, &fl6, final_p, connected);
+ dst = ip6_sk_dst_lookup_flow(sk, fl6, final_p, connected);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
dst = NULL;
@@ -1372,7 +1713,7 @@ do_udp_sendmsg:
}
if (ipc6.hlimit < 0)
- ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
+ ipc6.hlimit = ip6_sk_dst_hoplimit(np, fl6, dst);
if (msg->msg_flags&MSG_CONFIRM)
goto do_confirm;
@@ -1380,17 +1721,17 @@ back_from_confirm:
/* Lockless fast path for the non-corking case */
if (!corkreq) {
- struct inet_cork_full cork;
struct sk_buff *skb;
skb = ip6_make_skb(sk, getfrag, msg, ulen,
sizeof(struct udphdr), &ipc6,
- &fl6, (struct rt6_info *)dst,
+ dst_rt6_info(dst),
msg->msg_flags, &cork);
err = PTR_ERR(skb);
if (!IS_ERR_OR_NULL(skb))
- err = udp_v6_send_skb(skb, &fl6, &cork.base);
- goto out;
+ err = udp_v6_send_skb(skb, fl6, &cork.base);
+ /* ip6_make_skb steals dst reference */
+ goto out_no_dst;
}
lock_sock(sk);
@@ -1404,24 +1745,22 @@ back_from_confirm:
goto out;
}
- up->pending = AF_INET6;
+ WRITE_ONCE(up->pending, AF_INET6);
do_append_data:
- if (ipc6.dontfrag < 0)
- ipc6.dontfrag = np->dontfrag;
up->len += ulen;
err = ip6_append_data(sk, getfrag, msg, ulen, sizeof(struct udphdr),
- &ipc6, &fl6, (struct rt6_info *)dst,
+ &ipc6, fl6, dst_rt6_info(dst),
corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
if (err)
udp_v6_flush_pending_frames(sk);
else if (!corkreq)
err = udp_v6_push_pending_frames(sk);
else if (unlikely(skb_queue_empty(&sk->sk_write_queue)))
- up->pending = 0;
+ WRITE_ONCE(up->pending, 0);
if (err > 0)
- err = np->recverr ? net_xmit_errno(err) : 0;
+ err = inet6_test_bit(RECVERR6, sk) ? net_xmit_errno(err) : 0;
release_sock(sk);
out:
@@ -1446,53 +1785,66 @@ out_no_dst:
do_confirm:
if (msg->msg_flags & MSG_PROBE)
- dst_confirm_neigh(dst, &fl6.daddr);
+ dst_confirm_neigh(dst, &fl6->daddr);
if (!(msg->msg_flags&MSG_PROBE) || len)
goto back_from_confirm;
err = 0;
goto out;
}
+EXPORT_SYMBOL(udpv6_sendmsg);
+
+static void udpv6_splice_eof(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct udp_sock *up = udp_sk(sk);
+
+ if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk))
+ return;
+
+ lock_sock(sk);
+ if (up->pending && !udp_test_bit(CORK, sk))
+ udp_v6_push_pending_frames(sk);
+ release_sock(sk);
+}
void udpv6_destroy_sock(struct sock *sk)
{
struct udp_sock *up = udp_sk(sk);
lock_sock(sk);
+
+ /* protects from races with udp_abort() */
+ sock_set_flag(sk, SOCK_DEAD);
udp_v6_flush_pending_frames(sk);
release_sock(sk);
- if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
- void (*encap_destroy)(struct sock *sk);
- encap_destroy = READ_ONCE(up->encap_destroy);
- if (encap_destroy)
- encap_destroy(sk);
+ if (static_branch_unlikely(&udpv6_encap_needed_key)) {
+ if (up->encap_type) {
+ void (*encap_destroy)(struct sock *sk);
+ encap_destroy = READ_ONCE(up->encap_destroy);
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
+ if (udp_test_bit(ENCAP_ENABLED, sk)) {
+ static_branch_dec(&udpv6_encap_needed_key);
+ udp_encap_disable();
+ udp_tunnel_cleanup_gro(sk);
+ }
}
-
- inet6_destroy_sock(sk);
}
/*
* Socket option code for UDP
*/
-int udpv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
+int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen)
{
- if (level == SOL_UDP || level == SOL_UDPLITE)
- return udp_lib_setsockopt(sk, level, optname, optval, optlen,
+ if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET)
+ return udp_lib_setsockopt(sk, level, optname,
+ optval, optlen,
udp_v6_push_pending_frames);
return ipv6_setsockopt(sk, level, optname, optval, optlen);
}
-#ifdef CONFIG_COMPAT
-int compat_udpv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen)
-{
- if (level == SOL_UDP || level == SOL_UDPLITE)
- return udp_lib_setsockopt(sk, level, optname, optval, optlen,
- udp_v6_push_pending_frames);
- return compat_ipv6_setsockopt(sk, level, optname, optval, optlen);
-}
-#endif
-
int udpv6_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
@@ -1501,26 +1853,6 @@ int udpv6_getsockopt(struct sock *sk, int level, int optname,
return ipv6_getsockopt(sk, level, optname, optval, optlen);
}
-#ifdef CONFIG_COMPAT
-int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen)
-{
- if (level == SOL_UDP || level == SOL_UDPLITE)
- return udp_lib_getsockopt(sk, level, optname, optval, optlen);
- return compat_ipv6_getsockopt(sk, level, optname, optval, optlen);
-}
-#endif
-
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct inet6_protocol udpv6_protocol = {
- .early_demux = udp_v6_early_demux,
- .early_demux_handler = udp_v6_early_demux,
- .handler = udpv6_rcv,
- .err_handler = udpv6_err,
- .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
-};
/* ------------------------------------------------------------------------ */
#ifdef CONFIG_PROC_FS
@@ -1530,7 +1862,7 @@ int udp6_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, IPV6_SEQ_DGRAM_HEADER);
} else {
int bucket = ((struct udp_iter_state *)seq->private)->bucket;
- struct inet_sock *inet = inet_sk(v);
+ const struct inet_sock *inet = inet_sk((const struct sock *)v);
__u16 srcp = ntohs(inet->inet_sport);
__u16 destp = ntohs(inet->inet_dport);
__ip6_dgram_sock_seq_show(seq, v, srcp, destp,
@@ -1549,7 +1881,7 @@ EXPORT_SYMBOL(udp6_seq_ops);
static struct udp_seq_afinfo udp6_seq_afinfo = {
.family = AF_INET6,
- .udp_table = &udp_table,
+ .udp_table = NULL,
};
int __net_init udp6_proc_init(struct net *net)
@@ -1573,30 +1905,35 @@ struct proto udpv6_prot = {
.owner = THIS_MODULE,
.close = udp_lib_close,
.pre_connect = udpv6_pre_connect,
- .connect = ip6_datagram_connect,
+ .connect = udpv6_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
- .init = udp_init_sock,
+ .init = udpv6_init_sock,
.destroy = udpv6_destroy_sock,
.setsockopt = udpv6_setsockopt,
.getsockopt = udpv6_getsockopt,
.sendmsg = udpv6_sendmsg,
.recvmsg = udpv6_recvmsg,
+ .splice_eof = udpv6_splice_eof,
.release_cb = ip6_datagram_release_cb,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
.rehash = udp_v6_rehash,
.get_port = udp_v6_get_port,
- .memory_allocated = &udp_memory_allocated,
+ .put_port = udp_lib_unhash,
+#ifdef CONFIG_BPF_SYSCALL
+ .psock_update_sk_prot = udp_bpf_update_proto,
+#endif
+
+ .memory_allocated = &net_aligned_data.udp_memory_allocated,
+ .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,
+
.sysctl_mem = sysctl_udp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
.obj_size = sizeof(struct udp6_sock),
- .h.udp_table = &udp_table,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_udpv6_setsockopt,
- .compat_getsockopt = compat_udpv6_getsockopt,
-#endif
+ .ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6),
+ .h.udp_table = NULL,
.diag_destroy = udp_abort,
};
@@ -1612,7 +1949,12 @@ int __init udpv6_init(void)
{
int ret;
- ret = inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP);
+ net_hotdata.udpv6_protocol = (struct inet6_protocol) {
+ .handler = udpv6_rcv,
+ .err_handler = udpv6_err,
+ .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL,
+ };
+ ret = inet6_add_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP);
if (ret)
goto out;
@@ -1623,12 +1965,12 @@ out:
return ret;
out_udpv6_protocol:
- inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP);
+ inet6_del_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP);
goto out;
}
void udpv6_exit(void)
{
inet6_unregister_protosw(&udpv6_protosw);
- inet6_del_protocol(&udpv6_protocol, IPPROTO_UDP);
+ inet6_del_protocol(&net_hotdata.udpv6_protocol, IPPROTO_UDP);
}
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index 7903e21c178b..8a406be25a3a 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _UDP6_IMPL_H
#define _UDP6_IMPL_H
+#include <net/aligned_data.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <net/protocol.h>
@@ -9,24 +10,20 @@
#include <net/transp_v6.h>
int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int);
-void __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int,
- __be32, struct udp_table *);
+int __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int,
+ __be32, struct udp_table *);
+int udpv6_init_sock(struct sock *sk);
int udp_v6_get_port(struct sock *sk, unsigned short snum);
+void udp_v6_rehash(struct sock *sk);
int udpv6_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
-int udpv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen);
-#ifdef CONFIG_COMPAT
-int compat_udpv6_setsockopt(struct sock *sk, int level, int optname,
- char __user *optval, unsigned int optlen);
-int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *optlen);
-#endif
+int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
+ unsigned int optlen);
int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
-int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
- int flags, int *addr_len);
+int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ int *addr_len);
void udpv6_destroy_sock(struct sock *sk);
#ifdef CONFIG_PROC_FS
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 95dee9ca8d22..046f13b1d77a 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -1,21 +1,20 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IPV6 GSO/GRO offload support
* Linux INET6 implementation
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* UDPv6 GSO support
*/
#include <linux/skbuff.h>
#include <linux/netdevice.h>
+#include <linux/indirect_call_wrapper.h>
#include <net/protocol.h>
#include <net/ipv6.h>
#include <net/udp.h>
#include <net/ip6_checksum.h>
#include "ip6_offload.h"
+#include <net/gro.h>
+#include <net/gso.h>
static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
netdev_features_t features)
@@ -31,10 +30,6 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
int tnl_hlen;
int err;
- mss = skb_shinfo(skb)->gso_size;
- if (unlikely(skb->len <= mss))
- goto out;
-
if (skb->encapsulation && skb_shinfo(skb)->gso_type &
(SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))
segs = skb_udp_tunnel_segment(skb, features, true);
@@ -49,7 +44,11 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
goto out;
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
- return __udp_gso_segment(skb, features);
+ return __udp_gso_segment(skb, features, true);
+
+ mss = skb_shinfo(skb)->gso_size;
+ if (unlikely(skb->len <= mss))
+ goto out;
/* Do software UFO. Complete and fill in the UDP checksum as HW cannot
* do checksum of UDP packets sent as multiple IP fragments.
@@ -114,10 +113,31 @@ out:
return segs;
}
-static struct sk_buff *udp6_gro_receive(struct list_head *head,
- struct sk_buff *skb)
+static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
+ __be16 dport)
+{
+ const struct ipv6hdr *iph = skb_gro_network_header(skb);
+ struct net *net = dev_net_rcu(skb->dev);
+ struct sock *sk;
+ int iif, sdif;
+
+ sk = udp_tunnel_sk(net, true);
+ if (sk && dport == htons(sk->sk_num))
+ return sk;
+
+ inet6_get_iif_sdif(skb, &iif, &sdif);
+
+ return __udp6_lib_lookup(net, &iph->saddr, sport,
+ &iph->daddr, dport, iif,
+ sdif, net->ipv4.udp_table, NULL);
+}
+
+INDIRECT_CALLABLE_SCOPE
+struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
+ struct sock *sk = NULL;
+ struct sk_buff *pp;
if (unlikely(!uh))
goto flush;
@@ -130,48 +150,59 @@ static struct sk_buff *udp6_gro_receive(struct list_head *head,
ip6_gro_compute_pseudo))
goto flush;
else if (uh->check)
- skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check,
+ skb_gro_checksum_try_convert(skb, IPPROTO_UDP,
ip6_gro_compute_pseudo);
skip:
- NAPI_GRO_CB(skb)->is_ipv6 = 1;
- return udp_gro_receive(head, skb, uh, udp6_lib_lookup_skb);
+ if (static_branch_unlikely(&udpv6_encap_needed_key))
+ sk = udp6_gro_lookup_skb(skb, uh->source, uh->dest);
+
+ pp = udp_gro_receive(head, skb, uh, sk);
+ return pp;
flush:
NAPI_GRO_CB(skb)->flush = 1;
return NULL;
}
-static int udp6_gro_complete(struct sk_buff *skb, int nhoff)
+INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff)
{
- const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+ const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
+ const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + offset);
struct udphdr *uh = (struct udphdr *)(skb->data + nhoff);
- if (uh->check) {
- skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+ /* do fraglist only if there is no outer UDP encap (or we already processed it) */
+ if (NAPI_GRO_CB(skb)->is_flist && !NAPI_GRO_CB(skb)->encap_mark) {
+ uh->len = htons(skb->len - nhoff);
+
+ skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4);
+ skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count;
+
+ __skb_incr_checksum_unnecessary(skb);
+
+ return 0;
+ }
+
+ if (uh->check)
uh->check = ~udp_v6_check(skb->len - nhoff, &ipv6h->saddr,
&ipv6h->daddr, 0);
- } else {
- skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
- }
return udp_gro_complete(skb, nhoff, udp6_lib_lookup_skb);
}
-static const struct net_offload udpv6_offload = {
- .callbacks = {
- .gso_segment = udp6_ufo_fragment,
- .gro_receive = udp6_gro_receive,
- .gro_complete = udp6_gro_complete,
- },
-};
-
-int udpv6_offload_init(void)
+int __init udpv6_offload_init(void)
{
- return inet6_add_offload(&udpv6_offload, IPPROTO_UDP);
+ net_hotdata.udpv6_offload = (struct net_offload) {
+ .callbacks = {
+ .gso_segment = udp6_ufo_fragment,
+ .gro_receive = udp6_gro_receive,
+ .gro_complete = udp6_gro_complete,
+ },
+ };
+ return inet6_add_offload(&net_hotdata.udpv6_offload, IPPROTO_UDP);
}
int udpv6_offload_exit(void)
{
- return inet6_del_offload(&udpv6_offload, IPPROTO_UDP);
+ return inet6_del_offload(&net_hotdata.udpv6_offload, IPPROTO_UDP);
}
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 5000ad6878e6..2cec542437f7 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* UDPLITEv6 An implementation of the UDP-Lite protocol over IPv6.
* See also net/ipv4/udplite.c
@@ -6,25 +7,32 @@
*
* Changes:
* Fixes:
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) "UDPLite6: " fmt
+
#include <linux/export.h>
#include <linux/proc_fs.h>
#include "udp_impl.h"
+static int udplitev6_sk_init(struct sock *sk)
+{
+ udpv6_init_sock(sk);
+ pr_warn_once("UDP-Lite is deprecated and scheduled to be removed in 2025, "
+ "please contact the netdev mailing list\n");
+ return 0;
+}
+
static int udplitev6_rcv(struct sk_buff *skb)
{
return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE);
}
-static void udplitev6_err(struct sk_buff *skb,
+static int udplitev6_err(struct sk_buff *skb,
struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
- __udp6_lib_err(skb, opt, type, code, offset, info, &udplite_table);
+ return __udp6_lib_err(skb, opt, type, code, offset, info,
+ &udplite_table);
}
static const struct inet6_protocol udplitev6_protocol = {
@@ -40,7 +48,7 @@ struct proto udplitev6_prot = {
.connect = ip6_datagram_connect,
.disconnect = udp_disconnect,
.ioctl = udp_ioctl,
- .init = udplite_sk_init,
+ .init = udplitev6_sk_init,
.destroy = udpv6_destroy_sock,
.setsockopt = udpv6_setsockopt,
.getsockopt = udpv6_getsockopt,
@@ -48,15 +56,18 @@ struct proto udplitev6_prot = {
.recvmsg = udpv6_recvmsg,
.hash = udp_lib_hash,
.unhash = udp_lib_unhash,
+ .rehash = udp_v6_rehash,
.get_port = udp_v6_get_port,
- .memory_allocated = &udp_memory_allocated,
+
+ .memory_allocated = &net_aligned_data.udp_memory_allocated,
+ .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc,
+
.sysctl_mem = sysctl_udp_mem,
+ .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min),
+ .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min),
.obj_size = sizeof(struct udp6_sock),
+ .ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6),
.h.udp_table = &udplite_table,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_udpv6_setsockopt,
- .compat_getsockopt = compat_udpv6_getsockopt,
-#endif
};
static struct inet_protosw udplite6_protosw = {
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 9ef490dddcea..9005fc156a20 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -16,11 +16,8 @@
#include <linux/netfilter_ipv6.h>
#include <net/ipv6.h>
#include <net/xfrm.h>
-
-int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb)
-{
- return xfrm6_extract_header(skb);
-}
+#include <net/protocol.h>
+#include <net/gro.h>
int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi,
struct ip6_tnl *t)
@@ -35,15 +32,18 @@ EXPORT_SYMBOL(xfrm6_rcv_spi);
static int xfrm6_transport_finish2(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
- if (xfrm_trans_queue(skb, ip6_rcv_finish))
- __kfree_skb(skb);
- return -1;
+ if (xfrm_trans_queue(skb, ip6_rcv_finish)) {
+ kfree_skb(skb);
+ return NET_RX_DROP;
+ }
+
+ return 0;
}
int xfrm6_transport_finish(struct sk_buff *skb, int async)
{
struct xfrm_offload *xo = xfrm_offload(skb);
- int nhlen = skb->data - skb_network_header(skb);
+ int nhlen = -skb_network_offset(skb);
skb_network_header(skb)[IP6CB(skb)->nhoff] =
XFRM_MODE_SKB_CB(skb)->protocol;
@@ -58,15 +58,166 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
skb_postpush_rcsum(skb, skb_network_header(skb), nhlen);
if (xo && (xo->flags & XFRM_GRO)) {
- skb_mac_header_rebuild(skb);
+ /* The full l2 header needs to be preserved so that re-injecting the packet at l2
+ * works correctly in the presence of vlan tags.
+ */
+ skb_mac_header_rebuild_full(skb, xo->orig_mac_len);
+ skb_reset_network_header(skb);
skb_reset_transport_header(skb);
- return -1;
+ return 0;
}
NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
dev_net(skb->dev), NULL, skb, skb->dev, NULL,
xfrm6_transport_finish2);
- return -1;
+ return 0;
+}
+
+static int __xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull)
+{
+ struct udp_sock *up = udp_sk(sk);
+ struct udphdr *uh;
+ struct ipv6hdr *ip6h;
+ int len;
+ int ip6hlen = sizeof(struct ipv6hdr);
+ __u8 *udpdata;
+ __be32 *udpdata32;
+ u16 encap_type;
+
+ encap_type = READ_ONCE(up->encap_type);
+ /* if this is not encapsulated socket, then just return now */
+ if (!encap_type)
+ return 1;
+
+ /* If this is a paged skb, make sure we pull up
+ * whatever data we need to look at. */
+ len = skb->len - sizeof(struct udphdr);
+ if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8)))
+ return 1;
+
+ /* Now we can get the pointers */
+ uh = udp_hdr(skb);
+ udpdata = (__u8 *)uh + sizeof(struct udphdr);
+ udpdata32 = (__be32 *)udpdata;
+
+ switch (encap_type) {
+ default:
+ case UDP_ENCAP_ESPINUDP:
+ /* Check if this is a keepalive packet. If so, eat it. */
+ if (len == 1 && udpdata[0] == 0xff) {
+ return -EINVAL;
+ } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) {
+ /* ESP Packet without Non-ESP header */
+ len = sizeof(struct udphdr);
+ } else
+ /* Must be an IKE packet.. pass it through */
+ return 1;
+ break;
+ }
+
+ /* At this point we are sure that this is an ESPinUDP packet,
+ * so we need to remove 'len' bytes from the packet (the UDP
+ * header and optional ESP marker bytes) and then modify the
+ * protocol to ESP, and then call into the transform receiver.
+ */
+ if (skb_unclone(skb, GFP_ATOMIC))
+ return -EINVAL;
+
+ /* Now we can update and verify the packet length... */
+ ip6h = ipv6_hdr(skb);
+ ip6h->payload_len = htons(ntohs(ip6h->payload_len) - len);
+ if (skb->len < ip6hlen + len) {
+ /* packet is too small!?! */
+ return -EINVAL;
+ }
+
+ /* pull the data buffer up to the ESP header and set the
+ * transport header to point to ESP. Keep UDP on the stack
+ * for later.
+ */
+ if (pull) {
+ __skb_pull(skb, len);
+ skb_reset_transport_header(skb);
+ } else {
+ skb_set_transport_header(skb, len);
+ }
+
+ /* process ESP */
+ return 0;
+}
+
+/* If it's a keepalive packet, then just eat it.
+ * If it's an encapsulated packet, then pass it to the
+ * IPsec xfrm input.
+ * Returns 0 if skb passed to xfrm or was dropped.
+ * Returns >0 if skb should be passed to UDP.
+ * Returns <0 if skb should be resubmitted (-ret is protocol)
+ */
+int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ int ret;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return xfrm4_udp_encap_rcv(sk, skb);
+
+ ret = __xfrm6_udp_encap_rcv(sk, skb, true);
+ if (!ret)
+ return xfrm6_rcv_encap(skb, IPPROTO_ESP, 0,
+ udp_sk(sk)->encap_type);
+
+ if (ret < 0) {
+ kfree_skb(skb);
+ return 0;
+ }
+
+ return ret;
+}
+
+struct sk_buff *xfrm6_gro_udp_encap_rcv(struct sock *sk, struct list_head *head,
+ struct sk_buff *skb)
+{
+ int offset = skb_gro_offset(skb);
+ const struct net_offload *ops;
+ struct sk_buff *pp = NULL;
+ int len, dlen;
+ __u8 *udpdata;
+ __be32 *udpdata32;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return xfrm4_gro_udp_encap_rcv(sk, head, skb);
+
+ len = skb->len - offset;
+ dlen = offset + min(len, 8);
+ udpdata = skb_gro_header(skb, dlen, offset);
+ udpdata32 = (__be32 *)udpdata;
+ if (unlikely(!udpdata))
+ return NULL;
+
+ rcu_read_lock();
+ ops = rcu_dereference(inet6_offloads[IPPROTO_ESP]);
+ if (!ops || !ops->callbacks.gro_receive)
+ goto out;
+
+ /* check if it is a keepalive or IKE packet */
+ if (len <= sizeof(struct ip_esp_hdr) || udpdata32[0] == 0)
+ goto out;
+
+ /* set the transport header to ESP */
+ skb_set_transport_header(skb, offset);
+
+ NAPI_GRO_CB(skb)->proto = IPPROTO_UDP;
+
+ pp = call_gro_receive(ops->callbacks.gro_receive, head, skb);
+ rcu_read_unlock();
+
+ return pp;
+
+out:
+ rcu_read_unlock();
+ NAPI_GRO_CB(skb)->same_flow = 0;
+ NAPI_GRO_CB(skb)->flush = 1;
+
+ return NULL;
}
int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t)
@@ -86,14 +237,16 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
{
struct net *net = dev_net(skb->dev);
struct xfrm_state *x = NULL;
+ struct sec_path *sp;
int i = 0;
- if (secpath_set(skb)) {
+ sp = secpath_set(skb);
+ if (!sp) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR);
goto drop;
}
- if (1 + skb->sp->len == XFRM_MAX_DEPTH) {
+ if (1 + sp->len == XFRM_MAX_DEPTH) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
goto drop;
}
@@ -122,6 +275,13 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
if (!x)
continue;
+ if (unlikely(x->dir && x->dir != XFRM_SA_DIR_IN)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEDIRERROR);
+ xfrm_state_put(x);
+ x = NULL;
+ continue;
+ }
+
spin_lock(&x->lock);
if ((!i || (x->props.flags & XFRM_STATE_WILDRECV)) &&
@@ -145,7 +305,7 @@ int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr,
goto drop;
}
- skb->sp->xvec[skb->sp->len++] = x;
+ sp->xvec[sp->len++] = x;
spin_lock(&x->lock);
diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c
deleted file mode 100644
index 57fd314ec2b8..000000000000
--- a/net/ipv6/xfrm6_mode_beet.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * xfrm6_mode_beet.c - BEET mode encapsulation for IPv6.
- *
- * Copyright (c) 2006 Diego Beltrami <diego.beltrami@gmail.com>
- * Miika Komu <miika@iki.fi>
- * Herbert Xu <herbert@gondor.apana.org.au>
- * Abhinav Pathak <abhinav.pathak@hiit.fi>
- * Jeff Ahrenholz <ahrenholz@gmail.com>
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/stringify.h>
-#include <net/dsfield.h>
-#include <net/dst.h>
-#include <net/inet_ecn.h>
-#include <net/ipv6.h>
-#include <net/xfrm.h>
-
-static void xfrm6_beet_make_header(struct sk_buff *skb)
-{
- struct ipv6hdr *iph = ipv6_hdr(skb);
-
- iph->version = 6;
-
- memcpy(iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl,
- sizeof(iph->flow_lbl));
- iph->nexthdr = XFRM_MODE_SKB_CB(skb)->protocol;
-
- ipv6_change_dsfield(iph, 0, XFRM_MODE_SKB_CB(skb)->tos);
- iph->hop_limit = XFRM_MODE_SKB_CB(skb)->ttl;
-}
-
-/* Add encapsulation header.
- *
- * The top IP header will be constructed per draft-nikander-esp-beet-mode-06.txt.
- */
-static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct ipv6hdr *top_iph;
- struct ip_beet_phdr *ph;
- int optlen, hdr_len;
-
- hdr_len = 0;
- optlen = XFRM_MODE_SKB_CB(skb)->optlen;
- if (unlikely(optlen))
- hdr_len += IPV4_BEET_PHMAXLEN - (optlen & 4);
-
- skb_set_network_header(skb, -x->props.header_len - hdr_len);
- if (x->sel.family != AF_INET6)
- skb->network_header += IPV4_BEET_PHMAXLEN;
- skb->mac_header = skb->network_header +
- offsetof(struct ipv6hdr, nexthdr);
- skb->transport_header = skb->network_header + sizeof(*top_iph);
- ph = __skb_pull(skb, XFRM_MODE_SKB_CB(skb)->ihl - hdr_len);
-
- xfrm6_beet_make_header(skb);
-
- top_iph = ipv6_hdr(skb);
- if (unlikely(optlen)) {
-
- BUG_ON(optlen < 0);
-
- ph->padlen = 4 - (optlen & 4);
- ph->hdrlen = optlen / 8;
- ph->nexthdr = top_iph->nexthdr;
- if (ph->padlen)
- memset(ph + 1, IPOPT_NOP, ph->padlen);
-
- top_iph->nexthdr = IPPROTO_BEETPH;
- }
-
- top_iph->saddr = *(struct in6_addr *)&x->props.saddr;
- top_iph->daddr = *(struct in6_addr *)&x->id.daddr;
- return 0;
-}
-
-static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct ipv6hdr *ip6h;
- int size = sizeof(struct ipv6hdr);
- int err;
-
- err = skb_cow_head(skb, size + skb->mac_len);
- if (err)
- goto out;
-
- __skb_push(skb, size);
- skb_reset_network_header(skb);
- skb_mac_header_rebuild(skb);
-
- xfrm6_beet_make_header(skb);
-
- ip6h = ipv6_hdr(skb);
- ip6h->payload_len = htons(skb->len - size);
- ip6h->daddr = x->sel.daddr.in6;
- ip6h->saddr = x->sel.saddr.in6;
- err = 0;
-out:
- return err;
-}
-
-static struct xfrm_mode xfrm6_beet_mode = {
- .input2 = xfrm6_beet_input,
- .input = xfrm_prepare_input,
- .output2 = xfrm6_beet_output,
- .output = xfrm6_prepare_output,
- .owner = THIS_MODULE,
- .encap = XFRM_MODE_BEET,
- .flags = XFRM_MODE_FLAG_TUNNEL,
-};
-
-static int __init xfrm6_beet_init(void)
-{
- return xfrm_register_mode(&xfrm6_beet_mode, AF_INET6);
-}
-
-static void __exit xfrm6_beet_exit(void)
-{
- int err;
-
- err = xfrm_unregister_mode(&xfrm6_beet_mode, AF_INET6);
- BUG_ON(err);
-}
-
-module_init(xfrm6_beet_init);
-module_exit(xfrm6_beet_exit);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_BEET);
diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c
deleted file mode 100644
index da28e4407b8f..000000000000
--- a/net/ipv6/xfrm6_mode_ro.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * xfrm6_mode_ro.c - Route optimization mode for IPv6.
- *
- * Copyright (C)2003-2006 Helsinki University of Technology
- * Copyright (C)2003-2006 USAGI/WIDE Project
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- */
-/*
- * Authors:
- * Noriaki TAKAMIYA @USAGI
- * Masahide NAKAMURA @USAGI
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/spinlock.h>
-#include <linux/stringify.h>
-#include <linux/time.h>
-#include <net/ipv6.h>
-#include <net/xfrm.h>
-
-/* Add route optimization header space.
- *
- * The IP header and mutable extension headers will be moved forward to make
- * space for the route optimization header.
- */
-static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct ipv6hdr *iph;
- u8 *prevhdr;
- int hdr_len;
-
- iph = ipv6_hdr(skb);
-
- hdr_len = x->type->hdr_offset(x, skb, &prevhdr);
- if (hdr_len < 0)
- return hdr_len;
- skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data);
- skb_set_network_header(skb, -x->props.header_len);
- skb->transport_header = skb->network_header + hdr_len;
- __skb_pull(skb, hdr_len);
- memmove(ipv6_hdr(skb), iph, hdr_len);
-
- x->lastused = ktime_get_real_seconds();
-
- return 0;
-}
-
-static struct xfrm_mode xfrm6_ro_mode = {
- .output = xfrm6_ro_output,
- .owner = THIS_MODULE,
- .encap = XFRM_MODE_ROUTEOPTIMIZATION,
-};
-
-static int __init xfrm6_ro_init(void)
-{
- return xfrm_register_mode(&xfrm6_ro_mode, AF_INET6);
-}
-
-static void __exit xfrm6_ro_exit(void)
-{
- int err;
-
- err = xfrm_unregister_mode(&xfrm6_ro_mode, AF_INET6);
- BUG_ON(err);
-}
-
-module_init(xfrm6_ro_init);
-module_exit(xfrm6_ro_exit);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_ROUTEOPTIMIZATION);
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
deleted file mode 100644
index 3c29da5defe6..000000000000
--- a/net/ipv6/xfrm6_mode_transport.c
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * xfrm6_mode_transport.c - Transport mode encapsulation for IPv6.
- *
- * Copyright (C) 2002 USAGI/WIDE Project
- * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/stringify.h>
-#include <net/dst.h>
-#include <net/ipv6.h>
-#include <net/xfrm.h>
-#include <net/protocol.h>
-
-/* Add encapsulation header.
- *
- * The IP header and mutable extension headers will be moved forward to make
- * space for the encapsulation header.
- */
-static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct ipv6hdr *iph;
- u8 *prevhdr;
- int hdr_len;
-
- iph = ipv6_hdr(skb);
- skb_set_inner_transport_header(skb, skb_transport_offset(skb));
-
- hdr_len = x->type->hdr_offset(x, skb, &prevhdr);
- if (hdr_len < 0)
- return hdr_len;
- skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data);
- skb_set_network_header(skb, -x->props.header_len);
- skb->transport_header = skb->network_header + hdr_len;
- __skb_pull(skb, hdr_len);
- memmove(ipv6_hdr(skb), iph, hdr_len);
- return 0;
-}
-
-/* Remove encapsulation header.
- *
- * The IP header will be moved over the top of the encapsulation header.
- *
- * On entry, skb->h shall point to where the IP header should be and skb->nh
- * shall be set to where the IP header currently is. skb->data shall point
- * to the start of the payload.
- */
-static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb)
-{
- int ihl = skb->data - skb_transport_header(skb);
-
- if (skb->transport_header != skb->network_header) {
- memmove(skb_transport_header(skb),
- skb_network_header(skb), ihl);
- skb->network_header = skb->transport_header;
- }
- ipv6_hdr(skb)->payload_len = htons(skb->len + ihl -
- sizeof(struct ipv6hdr));
- skb_reset_transport_header(skb);
- return 0;
-}
-
-static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x,
- struct sk_buff *skb,
- netdev_features_t features)
-{
- const struct net_offload *ops;
- struct sk_buff *segs = ERR_PTR(-EINVAL);
- struct xfrm_offload *xo = xfrm_offload(skb);
-
- skb->transport_header += x->props.header_len;
- ops = rcu_dereference(inet6_offloads[xo->proto]);
- if (likely(ops && ops->callbacks.gso_segment))
- segs = ops->callbacks.gso_segment(skb, features);
-
- return segs;
-}
-
-static void xfrm6_transport_xmit(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct xfrm_offload *xo = xfrm_offload(skb);
-
- skb_reset_mac_len(skb);
- pskb_pull(skb, skb->mac_len + sizeof(struct ipv6hdr) + x->props.header_len);
-
- if (xo->flags & XFRM_GSO_SEGMENT) {
- skb_reset_transport_header(skb);
- skb->transport_header -= x->props.header_len;
- }
-}
-
-
-static struct xfrm_mode xfrm6_transport_mode = {
- .input = xfrm6_transport_input,
- .output = xfrm6_transport_output,
- .gso_segment = xfrm4_transport_gso_segment,
- .xmit = xfrm6_transport_xmit,
- .owner = THIS_MODULE,
- .encap = XFRM_MODE_TRANSPORT,
-};
-
-static int __init xfrm6_transport_init(void)
-{
- return xfrm_register_mode(&xfrm6_transport_mode, AF_INET6);
-}
-
-static void __exit xfrm6_transport_exit(void)
-{
- int err;
-
- err = xfrm_unregister_mode(&xfrm6_transport_mode, AF_INET6);
- BUG_ON(err);
-}
-
-module_init(xfrm6_transport_init);
-module_exit(xfrm6_transport_exit);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TRANSPORT);
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
deleted file mode 100644
index de1b0b8c53b0..000000000000
--- a/net/ipv6/xfrm6_mode_tunnel.c
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * xfrm6_mode_tunnel.c - Tunnel mode encapsulation for IPv6.
- *
- * Copyright (C) 2002 USAGI/WIDE Project
- * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
- */
-
-#include <linux/gfp.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/stringify.h>
-#include <net/dsfield.h>
-#include <net/dst.h>
-#include <net/inet_ecn.h>
-#include <net/ip6_route.h>
-#include <net/ipv6.h>
-#include <net/xfrm.h>
-
-static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
-{
- struct ipv6hdr *inner_iph = ipipv6_hdr(skb);
-
- if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
- IP6_ECN_set_ce(skb, inner_iph);
-}
-
-/* Add encapsulation header.
- *
- * The top IP header will be constructed per RFC 2401.
- */
-static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct dst_entry *dst = skb_dst(skb);
- struct ipv6hdr *top_iph;
- int dsfield;
-
- skb_set_inner_network_header(skb, skb_network_offset(skb));
- skb_set_inner_transport_header(skb, skb_transport_offset(skb));
-
- skb_set_network_header(skb, -x->props.header_len);
- skb->mac_header = skb->network_header +
- offsetof(struct ipv6hdr, nexthdr);
- skb->transport_header = skb->network_header + sizeof(*top_iph);
- top_iph = ipv6_hdr(skb);
-
- top_iph->version = 6;
-
- memcpy(top_iph->flow_lbl, XFRM_MODE_SKB_CB(skb)->flow_lbl,
- sizeof(top_iph->flow_lbl));
- top_iph->nexthdr = xfrm_af2proto(skb_dst(skb)->ops->family);
-
- if (x->props.extra_flags & XFRM_SA_XFLAG_DONT_ENCAP_DSCP)
- dsfield = 0;
- else
- dsfield = XFRM_MODE_SKB_CB(skb)->tos;
- dsfield = INET_ECN_encapsulate(dsfield, XFRM_MODE_SKB_CB(skb)->tos);
- if (x->props.flags & XFRM_STATE_NOECN)
- dsfield &= ~INET_ECN_MASK;
- ipv6_change_dsfield(top_iph, 0, dsfield);
- top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst));
- top_iph->saddr = *(struct in6_addr *)&x->props.saddr;
- top_iph->daddr = *(struct in6_addr *)&x->id.daddr;
- return 0;
-}
-
-#define for_each_input_rcu(head, handler) \
- for (handler = rcu_dereference(head); \
- handler != NULL; \
- handler = rcu_dereference(handler->next))
-
-
-static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb)
-{
- int err = -EINVAL;
-
- if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPV6)
- goto out;
- if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
- goto out;
-
- err = skb_unclone(skb, GFP_ATOMIC);
- if (err)
- goto out;
-
- if (x->props.flags & XFRM_STATE_DECAP_DSCP)
- ipv6_copy_dscp(ipv6_get_dsfield(ipv6_hdr(skb)),
- ipipv6_hdr(skb));
- if (!(x->props.flags & XFRM_STATE_NOECN))
- ipip6_ecn_decapsulate(skb);
-
- skb_reset_network_header(skb);
- skb_mac_header_rebuild(skb);
- if (skb->mac_len)
- eth_hdr(skb)->h_proto = skb->protocol;
-
- err = 0;
-
-out:
- return err;
-}
-
-static struct sk_buff *xfrm6_mode_tunnel_gso_segment(struct xfrm_state *x,
- struct sk_buff *skb,
- netdev_features_t features)
-{
- __skb_push(skb, skb->mac_len);
- return skb_mac_gso_segment(skb, features);
-}
-
-static void xfrm6_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb)
-{
- struct xfrm_offload *xo = xfrm_offload(skb);
-
- if (xo->flags & XFRM_GSO_SEGMENT)
- skb->transport_header = skb->network_header + sizeof(struct ipv6hdr);
-
- skb_reset_mac_len(skb);
- pskb_pull(skb, skb->mac_len + x->props.header_len);
-}
-
-static struct xfrm_mode xfrm6_tunnel_mode = {
- .input2 = xfrm6_mode_tunnel_input,
- .input = xfrm_prepare_input,
- .output2 = xfrm6_mode_tunnel_output,
- .output = xfrm6_prepare_output,
- .gso_segment = xfrm6_mode_tunnel_gso_segment,
- .xmit = xfrm6_mode_tunnel_xmit,
- .owner = THIS_MODULE,
- .encap = XFRM_MODE_TUNNEL,
- .flags = XFRM_MODE_FLAG_TUNNEL,
-};
-
-static int __init xfrm6_mode_tunnel_init(void)
-{
- return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6);
-}
-
-static void __exit xfrm6_mode_tunnel_exit(void)
-{
- int err;
-
- err = xfrm_unregister_mode(&xfrm6_tunnel_mode, AF_INET6);
- BUG_ON(err);
-}
-
-module_init(xfrm6_mode_tunnel_init);
-module_exit(xfrm6_mode_tunnel_exit);
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL);
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 6a74080005cf..512bdaf13699 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* xfrm6_output.c - Common IPsec encapsulation code for IPv6.
* Copyright (C) 2002 USAGI/WIDE Project
* Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/if_ether.h>
@@ -20,31 +16,7 @@
#include <net/ip6_route.h>
#include <net/xfrm.h>
-int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb,
- u8 **prevhdr)
-{
- return ip6_find_1stfragopt(skb, prevhdr);
-}
-EXPORT_SYMBOL(xfrm6_find_1stfragopt);
-
-static int xfrm6_local_dontfrag(struct sk_buff *skb)
-{
- int proto;
- struct sock *sk = skb->sk;
-
- if (sk) {
- if (sk->sk_family != AF_INET6)
- return 0;
-
- proto = sk->sk_protocol;
- if (proto == IPPROTO_UDP || proto == IPPROTO_RAW)
- return inet6_sk(sk)->dontfrag;
- }
-
- return 0;
-}
-
-static void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu)
+void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu)
{
struct flowi6 fl6;
struct sock *sk = skb->sk;
@@ -68,87 +40,29 @@ void xfrm6_local_error(struct sk_buff *skb, u32 mtu)
ipv6_local_error(sk, EMSGSIZE, &fl6, mtu);
}
-static int xfrm6_tunnel_check_size(struct sk_buff *skb)
-{
- int mtu, ret = 0;
- struct dst_entry *dst = skb_dst(skb);
-
- if (skb->ignore_df)
- goto out;
-
- mtu = dst_mtu(dst);
- if (mtu < IPV6_MIN_MTU)
- mtu = IPV6_MIN_MTU;
-
- if ((!skb_is_gso(skb) && skb->len > mtu) ||
- (skb_is_gso(skb) &&
- !skb_gso_validate_network_len(skb, ip6_skb_dst_mtu(skb)))) {
- skb->dev = dst->dev;
- skb->protocol = htons(ETH_P_IPV6);
-
- if (xfrm6_local_dontfrag(skb))
- xfrm6_local_rxpmtu(skb, mtu);
- else if (skb->sk)
- xfrm_local_error(skb, mtu);
- else
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
- ret = -EMSGSIZE;
- }
-out:
- return ret;
-}
-
-int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- int err;
-
- err = xfrm6_tunnel_check_size(skb);
- if (err)
- return err;
-
- XFRM_MODE_SKB_CB(skb)->protocol = ipv6_hdr(skb)->nexthdr;
-
- return xfrm6_extract_header(skb);
-}
-
-int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
-{
- int err;
-
- err = xfrm_inner_extract_output(x, skb);
- if (err)
- return err;
-
- skb->ignore_df = 1;
- skb->protocol = htons(ETH_P_IPV6);
-
- return x->outer_mode->output2(x, skb);
-}
-EXPORT_SYMBOL(xfrm6_prepare_output);
-
-int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb)
+static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
-
-#ifdef CONFIG_NETFILTER
- IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED;
-#endif
-
return xfrm_output(sk, skb);
}
-static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int xfrm6_noneed_fragment(struct sk_buff *skb)
{
- struct xfrm_state *x = skb_dst(skb)->xfrm;
-
- return x->outer_mode->afinfo->output_finish(sk, skb);
+ struct frag_hdr *fh;
+ u8 prevhdr = ipv6_hdr(skb)->nexthdr;
+
+ if (prevhdr != NEXTHDR_FRAGMENT)
+ return 0;
+ fh = (struct frag_hdr *)(skb->data + sizeof(struct ipv6hdr));
+ if (fh->nexthdr == NEXTHDR_ESP || fh->nexthdr == NEXTHDR_AUTH)
+ return 1;
+ return 0;
}
static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct xfrm_state *x = dst->xfrm;
- int mtu;
+ unsigned int mtu;
bool toobig;
#ifdef CONFIG_NETFILTER
@@ -168,28 +82,31 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
toobig = skb->len > mtu && !skb_is_gso(skb);
- if (toobig && xfrm6_local_dontfrag(skb)) {
+ if (toobig && xfrm6_local_dontfrag(sk)) {
xfrm6_local_rxpmtu(skb, mtu);
kfree_skb(skb);
return -EMSGSIZE;
- } else if (!skb->ignore_df && toobig && skb->sk) {
+ } else if (toobig && xfrm6_noneed_fragment(skb)) {
+ skb->ignore_df = 1;
+ goto skip_frag;
+ } else if (!skb->ignore_df && toobig && sk) {
xfrm_local_error(skb, mtu);
kfree_skb(skb);
return -EMSGSIZE;
}
- if (toobig || dst_allfrag(skb_dst(skb)))
+ if (toobig)
return ip6_fragment(net, sk, skb,
__xfrm6_output_finish);
skip_frag:
- return x->outer_mode->afinfo->output_finish(sk, skb);
+ return xfrm_output(sk, skb);
}
int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
- net, sk, skb, NULL, skb_dst(skb)->dev,
+ net, sk, skb, skb->dev, skb_dst_dev(skb),
__xfrm6_output,
!(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index ef3defaf43b9..1f19b6f14484 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -22,28 +22,25 @@
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/l3mdev.h>
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
-#include <net/mip6.h>
-#endif
-static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
- const xfrm_address_t *saddr,
- const xfrm_address_t *daddr,
- u32 mark)
+static struct dst_entry *xfrm6_dst_lookup(const struct xfrm_dst_lookup_params *params)
{
struct flowi6 fl6;
struct dst_entry *dst;
int err;
memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif);
- fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF;
- fl6.flowi6_mark = mark;
- memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr));
- if (saddr)
- memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr));
+ fl6.flowi6_l3mdev = l3mdev_master_ifindex_by_index(params->net,
+ params->oif);
+ fl6.flowi6_mark = params->mark;
+ memcpy(&fl6.daddr, params->daddr, sizeof(fl6.daddr));
+ if (params->saddr)
+ memcpy(&fl6.saddr, params->saddr, sizeof(fl6.saddr));
+
+ fl6.flowi4_proto = params->ipproto;
+ fl6.uli = params->uli;
- dst = ip6_route_output(net, NULL, &fl6);
+ dst = ip6_route_output(params->net, NULL, &fl6);
err = dst->error;
if (dst->error) {
@@ -54,52 +51,40 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif,
return dst;
}
-static int xfrm6_get_saddr(struct net *net, int oif,
- xfrm_address_t *saddr, xfrm_address_t *daddr,
- u32 mark)
+static int xfrm6_get_saddr(xfrm_address_t *saddr,
+ const struct xfrm_dst_lookup_params *params)
{
struct dst_entry *dst;
struct net_device *dev;
+ struct inet6_dev *idev;
- dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr, mark);
+ dst = xfrm6_dst_lookup(params);
if (IS_ERR(dst))
return -EHOSTUNREACH;
- dev = ip6_dst_idev(dst)->dev;
- ipv6_dev_get_saddr(dev_net(dev), dev, &daddr->in6, 0, &saddr->in6);
- dst_release(dst);
- return 0;
-}
-
-static int xfrm6_get_tos(const struct flowi *fl)
-{
- return 0;
-}
-
-static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst,
- int nfheader_len)
-{
- if (dst->ops->family == AF_INET6) {
- struct rt6_info *rt = (struct rt6_info *)dst;
- path->path_cookie = rt6_get_cookie(rt);
+ idev = ip6_dst_idev(dst);
+ if (!idev) {
+ dst_release(dst);
+ return -EHOSTUNREACH;
}
-
- path->u.rt6.rt6i_nfheader_len = nfheader_len;
-
+ dev = idev->dev;
+ ipv6_dev_get_saddr(dev_net(dev), dev, &params->daddr->in6, 0,
+ &saddr->in6);
+ dst_release(dst);
return 0;
}
static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
const struct flowi *fl)
{
- struct rt6_info *rt = (struct rt6_info *)xdst->route;
+ struct rt6_info *rt = dst_rt6_info(xdst->route);
xdst->u.dst.dev = dev;
- dev_hold(dev);
+ netdev_hold(dev, &xdst->u.dst.dev_tracker, GFP_ATOMIC);
xdst->u.rt6.rt6i_idev = in6_dev_get(dev);
if (!xdst->u.rt6.rt6i_idev) {
- dev_put(dev);
+ netdev_put(dev, &xdst->u.dst.dev_tracker);
return -ENODEV;
}
@@ -111,122 +96,19 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
xdst->u.rt6.rt6i_src = rt->rt6i_src;
- INIT_LIST_HEAD(&xdst->u.rt6.rt6i_uncached);
rt6_uncached_list_add(&xdst->u.rt6);
- atomic_inc(&dev_net(dev)->ipv6.rt6_stats->fib_rt_uncache);
return 0;
}
-static inline void
-_decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
-{
- struct flowi6 *fl6 = &fl->u.ip6;
- int onlyproto = 0;
- const struct ipv6hdr *hdr = ipv6_hdr(skb);
- u32 offset = sizeof(*hdr);
- struct ipv6_opt_hdr *exthdr;
- const unsigned char *nh = skb_network_header(skb);
- u16 nhoff = IP6CB(skb)->nhoff;
- int oif = 0;
- u8 nexthdr;
-
- if (!nhoff)
- nhoff = offsetof(struct ipv6hdr, nexthdr);
-
- nexthdr = nh[nhoff];
-
- if (skb_dst(skb))
- oif = skb_dst(skb)->dev->ifindex;
-
- memset(fl6, 0, sizeof(struct flowi6));
- fl6->flowi6_mark = skb->mark;
- fl6->flowi6_oif = reverse ? skb->skb_iif : oif;
-
- fl6->daddr = reverse ? hdr->saddr : hdr->daddr;
- fl6->saddr = reverse ? hdr->daddr : hdr->saddr;
-
- while (nh + offset + 1 < skb->data ||
- pskb_may_pull(skb, nh + offset + 1 - skb->data)) {
- nh = skb_network_header(skb);
- exthdr = (struct ipv6_opt_hdr *)(nh + offset);
-
- switch (nexthdr) {
- case NEXTHDR_FRAGMENT:
- onlyproto = 1;
- /* fall through */
- case NEXTHDR_ROUTING:
- case NEXTHDR_HOP:
- case NEXTHDR_DEST:
- offset += ipv6_optlen(exthdr);
- nexthdr = exthdr->nexthdr;
- exthdr = (struct ipv6_opt_hdr *)(nh + offset);
- break;
-
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE:
- case IPPROTO_TCP:
- case IPPROTO_SCTP:
- case IPPROTO_DCCP:
- if (!onlyproto && (nh + offset + 4 < skb->data ||
- pskb_may_pull(skb, nh + offset + 4 - skb->data))) {
- __be16 *ports;
-
- nh = skb_network_header(skb);
- ports = (__be16 *)(nh + offset);
- fl6->fl6_sport = ports[!!reverse];
- fl6->fl6_dport = ports[!reverse];
- }
- fl6->flowi6_proto = nexthdr;
- return;
-
- case IPPROTO_ICMPV6:
- if (!onlyproto && (nh + offset + 2 < skb->data ||
- pskb_may_pull(skb, nh + offset + 2 - skb->data))) {
- u8 *icmp;
-
- nh = skb_network_header(skb);
- icmp = (u8 *)(nh + offset);
- fl6->fl6_icmp_type = icmp[0];
- fl6->fl6_icmp_code = icmp[1];
- }
- fl6->flowi6_proto = nexthdr;
- return;
-
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- case IPPROTO_MH:
- offset += ipv6_optlen(exthdr);
- if (!onlyproto && (nh + offset + 3 < skb->data ||
- pskb_may_pull(skb, nh + offset + 3 - skb->data))) {
- struct ip6_mh *mh;
-
- nh = skb_network_header(skb);
- mh = (struct ip6_mh *)(nh + offset);
- fl6->fl6_mh_type = mh->ip6mh_type;
- }
- fl6->flowi6_proto = nexthdr;
- return;
-#endif
-
- /* XXX Why are there these headers? */
- case IPPROTO_AH:
- case IPPROTO_ESP:
- case IPPROTO_COMP:
- default:
- fl6->fl6_ipsec_spi = 0;
- fl6->flowi6_proto = nexthdr;
- return;
- }
- }
-}
-
static void xfrm6_update_pmtu(struct dst_entry *dst, struct sock *sk,
- struct sk_buff *skb, u32 mtu)
+ struct sk_buff *skb, u32 mtu,
+ bool confirm_neigh)
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
struct dst_entry *path = xdst->route;
- path->ops->update_pmtu(path, sk, skb, mtu);
+ path->ops->update_pmtu(path, sk, skb, mtu, confirm_neigh);
}
static void xfrm6_redirect(struct dst_entry *dst, struct sock *sk,
@@ -242,27 +124,21 @@ static void xfrm6_dst_destroy(struct dst_entry *dst)
{
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+ dst_destroy_metrics_generic(dst);
+ rt6_uncached_list_del(&xdst->u.rt6);
if (likely(xdst->u.rt6.rt6i_idev))
in6_dev_put(xdst->u.rt6.rt6i_idev);
- dst_destroy_metrics_generic(dst);
- if (xdst->u.rt6.rt6i_uncached_list)
- rt6_uncached_list_del(&xdst->u.rt6);
xfrm_dst_destroy(xdst);
}
-static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
- int unregister)
+static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev)
{
struct xfrm_dst *xdst;
- if (!unregister)
- return;
-
xdst = (struct xfrm_dst *)dst;
if (xdst->u.rt6.rt6i_idev->dev == dev) {
struct inet6_dev *loopback_idev =
in6_dev_get(dev_net(dev)->loopback_dev);
- BUG_ON(!loopback_idev);
do {
in6_dev_put(xdst->u.rt6.rt6i_idev);
@@ -292,9 +168,6 @@ static const struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
.dst_ops = &xfrm6_dst_ops_template,
.dst_lookup = xfrm6_dst_lookup,
.get_saddr = xfrm6_get_saddr,
- .decode_session = _decode_session6,
- .get_tos = xfrm6_get_tos,
- .init_path = xfrm6_init_path,
.fill_dst = xfrm6_fill_dst,
.blackhole_route = ip6_blackhole_route,
};
@@ -318,7 +191,6 @@ static struct ctl_table xfrm6_policy_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { }
};
static int __net_init xfrm6_net_sysctl_init(struct net *net)
@@ -335,7 +207,8 @@ static int __net_init xfrm6_net_sysctl_init(struct net *net)
table[0].data = &net->xfrm.xfrm6_dst_ops.gc_thresh;
}
- hdr = register_net_sysctl(net, "net/ipv6", table);
+ hdr = register_net_sysctl_sz(net, "net/ipv6", table,
+ ARRAY_SIZE(xfrm6_policy_table));
if (!hdr)
goto err_reg;
@@ -351,7 +224,7 @@ err_alloc:
static void __net_exit xfrm6_net_sysctl_exit(struct net *net)
{
- struct ctl_table *table;
+ const struct ctl_table *table;
if (!net->ipv6.sysctl.xfrm6_hdr)
return;
@@ -415,9 +288,19 @@ int __init xfrm6_init(void)
if (ret)
goto out_state;
- register_pernet_subsys(&xfrm6_net_ops);
+ ret = register_pernet_subsys(&xfrm6_net_ops);
+ if (ret)
+ goto out_protocol;
+
+ ret = xfrm_nat_keepalive_init(AF_INET6);
+ if (ret)
+ goto out_nat_keepalive;
out:
return ret;
+out_nat_keepalive:
+ unregister_pernet_subsys(&xfrm6_net_ops);
+out_protocol:
+ xfrm6_protocol_fini();
out_state:
xfrm6_state_fini();
out_policy:
@@ -427,6 +310,7 @@ out_policy:
void xfrm6_fini(void)
{
+ xfrm_nat_keepalive_fini(AF_INET6);
unregister_pernet_subsys(&xfrm6_net_ops);
xfrm6_protocol_fini();
xfrm6_policy_fini();
diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c
index b2dc8ce49378..ea2f805d3b01 100644
--- a/net/ipv6/xfrm6_protocol.c
+++ b/net/ipv6/xfrm6_protocol.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* xfrm6_protocol.c - Generic xfrm protocol multiplexer for ipv6.
*
* Copyright (C) 2013 secunet Security Networks AG
@@ -7,17 +8,13 @@
*
* Based on:
* net/ipv4/xfrm4_protocol.c
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/skbuff.h>
#include <linux/icmpv6.h>
+#include <net/ip6_route.h>
#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/xfrm.h>
@@ -46,7 +43,7 @@ static inline struct xfrm6_protocol __rcu **proto_handlers(u8 protocol)
handler != NULL; \
handler = rcu_dereference(handler->next)) \
-int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
+static int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
{
int ret;
struct xfrm6_protocol *handler;
@@ -61,7 +58,53 @@ int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err)
return 0;
}
-EXPORT_SYMBOL(xfrm6_rcv_cb);
+
+int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi,
+ int encap_type)
+{
+ int ret;
+ struct xfrm6_protocol *handler;
+ struct xfrm6_protocol __rcu **head = proto_handlers(nexthdr);
+
+ XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL;
+ XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
+ XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
+
+ if (!head)
+ goto out;
+
+ if (!skb_dst(skb)) {
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ int flags = RT6_LOOKUP_F_HAS_SADDR;
+ struct dst_entry *dst;
+ struct flowi6 fl6 = {
+ .flowi6_iif = skb->dev->ifindex,
+ .daddr = ip6h->daddr,
+ .saddr = ip6h->saddr,
+ .flowlabel = ip6_flowinfo(ip6h),
+ .flowi6_mark = skb->mark,
+ .flowi6_proto = ip6h->nexthdr,
+ };
+
+ dst = ip6_route_input_lookup(dev_net(skb->dev), skb->dev, &fl6,
+ skb, flags);
+ if (dst->error)
+ goto drop;
+ skb_dst_set(skb, dst);
+ }
+
+ for_each_protocol_rcu(*head, handler)
+ if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL)
+ return ret;
+
+out:
+ icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
+
+drop:
+ kfree_skb(skb);
+ return 0;
+}
+EXPORT_SYMBOL(xfrm6_rcv_encap);
static int xfrm6_esp_rcv(struct sk_buff *skb)
{
@@ -80,14 +123,16 @@ static int xfrm6_esp_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int xfrm6_esp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct xfrm6_protocol *handler;
for_each_protocol_rcu(esp6_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static int xfrm6_ah_rcv(struct sk_buff *skb)
@@ -107,14 +152,16 @@ static int xfrm6_ah_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int xfrm6_ah_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct xfrm6_protocol *handler;
for_each_protocol_rcu(ah6_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static int xfrm6_ipcomp_rcv(struct sk_buff *skb)
@@ -134,14 +181,16 @@ static int xfrm6_ipcomp_rcv(struct sk_buff *skb)
return 0;
}
-static void xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+static int xfrm6_ipcomp_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
u8 type, u8 code, int offset, __be32 info)
{
struct xfrm6_protocol *handler;
for_each_protocol_rcu(ipcomp6_handlers, handler)
if (!handler->err_handler(skb, opt, type, code, offset, info))
- break;
+ return 0;
+
+ return -ENOENT;
}
static const struct inet6_protocol esp6_protocol = {
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 5bdca3d5d6b7..6610b2198fa9 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -13,174 +13,11 @@
*/
#include <net/xfrm.h>
-#include <linux/pfkeyv2.h>
-#include <linux/ipsec.h>
-#include <linux/netfilter_ipv6.h>
-#include <linux/export.h>
-#include <net/dsfield.h>
-#include <net/ipv6.h>
-#include <net/addrconf.h>
-
-static void
-__xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl)
-{
- const struct flowi6 *fl6 = &fl->u.ip6;
-
- /* Initialize temporary selector matching only
- * to current session. */
- *(struct in6_addr *)&sel->daddr = fl6->daddr;
- *(struct in6_addr *)&sel->saddr = fl6->saddr;
- sel->dport = xfrm_flowi_dport(fl, &fl6->uli);
- sel->dport_mask = htons(0xffff);
- sel->sport = xfrm_flowi_sport(fl, &fl6->uli);
- sel->sport_mask = htons(0xffff);
- sel->family = AF_INET6;
- sel->prefixlen_d = 128;
- sel->prefixlen_s = 128;
- sel->proto = fl6->flowi6_proto;
- sel->ifindex = fl6->flowi6_oif;
-}
-
-static void
-xfrm6_init_temprop(struct xfrm_state *x, const struct xfrm_tmpl *tmpl,
- const xfrm_address_t *daddr, const xfrm_address_t *saddr)
-{
- x->id = tmpl->id;
- if (ipv6_addr_any((struct in6_addr *)&x->id.daddr))
- memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr));
- memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr));
- if (ipv6_addr_any((struct in6_addr *)&x->props.saddr))
- memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr));
- x->props.mode = tmpl->mode;
- x->props.reqid = tmpl->reqid;
- x->props.family = AF_INET6;
-}
-
-/* distribution counting sort function for xfrm_state and xfrm_tmpl */
-static int
-__xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass)
-{
- int count[XFRM_MAX_DEPTH] = { };
- int class[XFRM_MAX_DEPTH];
- int i;
-
- for (i = 0; i < n; i++) {
- int c;
- class[i] = c = cmp(src[i]);
- count[c]++;
- }
-
- for (i = 2; i < maxclass; i++)
- count[i] += count[i - 1];
-
- for (i = 0; i < n; i++) {
- dst[count[class[i] - 1]++] = src[i];
- src[i] = NULL;
- }
-
- return 0;
-}
-
-/*
- * Rule for xfrm_state:
- *
- * rule 1: select IPsec transport except AH
- * rule 2: select MIPv6 RO or inbound trigger
- * rule 3: select IPsec transport AH
- * rule 4: select IPsec tunnel
- * rule 5: others
- */
-static int __xfrm6_state_sort_cmp(void *p)
-{
- struct xfrm_state *v = p;
-
- switch (v->props.mode) {
- case XFRM_MODE_TRANSPORT:
- if (v->id.proto != IPPROTO_AH)
- return 1;
- else
- return 3;
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- case XFRM_MODE_ROUTEOPTIMIZATION:
- case XFRM_MODE_IN_TRIGGER:
- return 2;
-#endif
- case XFRM_MODE_TUNNEL:
- case XFRM_MODE_BEET:
- return 4;
- }
- return 5;
-}
-
-static int
-__xfrm6_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n)
-{
- return __xfrm6_sort((void **)dst, (void **)src, n,
- __xfrm6_state_sort_cmp, 6);
-}
-
-/*
- * Rule for xfrm_tmpl:
- *
- * rule 1: select IPsec transport
- * rule 2: select MIPv6 RO or inbound trigger
- * rule 3: select IPsec tunnel
- * rule 4: others
- */
-static int __xfrm6_tmpl_sort_cmp(void *p)
-{
- struct xfrm_tmpl *v = p;
- switch (v->mode) {
- case XFRM_MODE_TRANSPORT:
- return 1;
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- case XFRM_MODE_ROUTEOPTIMIZATION:
- case XFRM_MODE_IN_TRIGGER:
- return 2;
-#endif
- case XFRM_MODE_TUNNEL:
- case XFRM_MODE_BEET:
- return 3;
- }
- return 4;
-}
-
-static int
-__xfrm6_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n)
-{
- return __xfrm6_sort((void **)dst, (void **)src, n,
- __xfrm6_tmpl_sort_cmp, 5);
-}
-
-int xfrm6_extract_header(struct sk_buff *skb)
-{
- struct ipv6hdr *iph = ipv6_hdr(skb);
-
- XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph);
- XFRM_MODE_SKB_CB(skb)->id = 0;
- XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF);
- XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph);
- XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit;
- XFRM_MODE_SKB_CB(skb)->optlen = 0;
- memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl,
- sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl));
-
- return 0;
-}
static struct xfrm_state_afinfo xfrm6_state_afinfo = {
.family = AF_INET6,
.proto = IPPROTO_IPV6,
- .eth_proto = htons(ETH_P_IPV6),
- .owner = THIS_MODULE,
- .init_tempsel = __xfrm6_init_tempsel,
- .init_temprop = xfrm6_init_temprop,
- .tmpl_sort = __xfrm6_tmpl_sort,
- .state_sort = __xfrm6_state_sort,
.output = xfrm6_output,
- .output_finish = xfrm6_output_finish,
- .extract_input = xfrm6_extract_input,
- .extract_output = xfrm6_extract_output,
.transport_finish = xfrm6_transport_finish,
.local_error = xfrm6_local_error,
};
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index 4a46df8441c9..0a0eeaed0591 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -1,24 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright (C)2003,2004 USAGI/WIDE Project
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, see <http://www.gnu.org/licenses/>.
- *
* Authors Mitsuru KANDA <mk@linux-ipv6.org>
* YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
*
* Based on net/ipv4/xfrm4_tunnel.c
- *
*/
#include <linux/module.h>
#include <linux/xfrm.h>
@@ -91,7 +78,7 @@ static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const
hlist_for_each_entry_rcu(x6spi,
&xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)],
- list_byaddr) {
+ list_byaddr, lockdep_is_held(&xfrm6_tunnel_spi_lock)) {
if (xfrm6_addr_equal(&x6spi->addr, saddr))
return x6spi;
}
@@ -144,6 +131,9 @@ static u32 __xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr)
index = __xfrm6_tunnel_spi_check(net, spi);
if (index >= 0)
goto alloc_spi;
+
+ if (spi == XFRM6_TUNNEL_SPI_MAX)
+ break;
}
for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tn->spi; spi++) {
index = __xfrm6_tunnel_spi_check(net, spi);
@@ -280,13 +270,17 @@ static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return 0;
}
-static int xfrm6_tunnel_init_state(struct xfrm_state *x)
+static int xfrm6_tunnel_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack)
{
- if (x->props.mode != XFRM_MODE_TUNNEL)
+ if (x->props.mode != XFRM_MODE_TUNNEL) {
+ NL_SET_ERR_MSG(extack, "IPv6 tunnel can only be used with tunnel mode");
return -EINVAL;
+ }
- if (x->encap)
+ if (x->encap) {
+ NL_SET_ERR_MSG(extack, "IPv6 tunnel is not compatible with encapsulation");
return -EINVAL;
+ }
x->props.header_len = sizeof(struct ipv6hdr);
@@ -301,7 +295,6 @@ static void xfrm6_tunnel_destroy(struct xfrm_state *x)
}
static const struct xfrm_type xfrm6_tunnel_type = {
- .description = "IP6IP6",
.owner = THIS_MODULE,
.proto = IPPROTO_IPV6,
.init_state = xfrm6_tunnel_init_state,
@@ -313,13 +306,13 @@ static const struct xfrm_type xfrm6_tunnel_type = {
static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = {
.handler = xfrm6_tunnel_rcv,
.err_handler = xfrm6_tunnel_err,
- .priority = 2,
+ .priority = 3,
};
static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = {
.handler = xfrm6_tunnel_rcv,
.err_handler = xfrm6_tunnel_err,
- .priority = 2,
+ .priority = 3,
};
static int __net_init xfrm6_tunnel_net_init(struct net *net)
@@ -341,7 +334,7 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net)
struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net);
unsigned int i;
- xfrm_state_flush(net, IPSEC_PROTO_ANY, false);
+ xfrm_state_flush(net, 0, false);
xfrm_flush_gc();
for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++)
@@ -362,10 +355,7 @@ static int __init xfrm6_tunnel_init(void)
{
int rv;
- xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi",
- sizeof(struct xfrm6_tunnel_spi),
- 0, SLAB_HWCACHE_ALIGN,
- NULL);
+ xfrm6_tunnel_spi_kmem = KMEM_CACHE(xfrm6_tunnel_spi, SLAB_HWCACHE_ALIGN);
if (!xfrm6_tunnel_spi_kmem)
return -ENOMEM;
rv = register_pernet_subsys(&xfrm6_tunnel_net_ops);
@@ -399,10 +389,15 @@ static void __exit xfrm6_tunnel_fini(void)
xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6);
xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6);
unregister_pernet_subsys(&xfrm6_tunnel_net_ops);
+ /* Someone may still hold a reference to an xfrm6_tunnel_spi,
+ * so we need to wait for it to be released.
+ */
+ rcu_barrier();
kmem_cache_destroy(xfrm6_tunnel_spi_kmem);
}
module_init(xfrm6_tunnel_init);
module_exit(xfrm6_tunnel_fini);
+MODULE_DESCRIPTION("IPv6 XFRM tunnel driver");
MODULE_LICENSE("GPL");
MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_IPV6);
diff --git a/net/iucv/Kconfig b/net/iucv/Kconfig
index 497fbe732def..5cfddc9c6498 100644
--- a/net/iucv/Kconfig
+++ b/net/iucv/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config IUCV
depends on S390
def_tristate y if S390
diff --git a/net/iucv/Makefile b/net/iucv/Makefile
index 7bfdc8532675..984d7ff056ed 100644
--- a/net/iucv/Makefile
+++ b/net/iucv/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for IUCV
#
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index e2f16a0173a9..1e62fbc22cb7 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* IUCV protocol stack for Linux on zSeries
*
@@ -9,11 +10,13 @@
* Ursula Braun <ursula.braun@de.ibm.com>
*/
-#define KMSG_COMPONENT "af_iucv"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "af_iucv: " fmt
+#include <linux/filter.h>
#include <linux/module.h>
+#include <linux/netdevice.h>
#include <linux/types.h>
+#include <linux/limits.h>
#include <linux/list.h>
#include <linux/errno.h>
#include <linux/kernel.h>
@@ -24,6 +27,7 @@
#include <linux/poll.h>
#include <linux/security.h>
#include <net/sock.h>
+#include <asm/machine.h>
#include <asm/ebcdic.h>
#include <asm/cpcmd.h>
#include <linux/kmod.h>
@@ -34,8 +38,6 @@
static char iucv_userid[80];
-static const struct proto_ops iucv_sock_ops;
-
static struct proto iucv_proto = {
.name = "AF_IUCV",
.owner = THIS_MODULE,
@@ -43,12 +45,13 @@ static struct proto iucv_proto = {
};
static struct iucv_interface *pr_iucv;
+static struct iucv_handler af_iucv_handler;
/* special AF_IUCV IPRM messages */
static const u8 iprm_shutdown[8] =
{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01};
-#define TRGCLS_SIZE (sizeof(((struct iucv_message *)0)->class))
+#define TRGCLS_SIZE sizeof_field(struct iucv_message, class)
#define __iucv_sock_wait(sk, condition, timeo, ret) \
do { \
@@ -83,38 +86,18 @@ do { \
__ret; \
})
+static struct sock *iucv_accept_dequeue(struct sock *parent,
+ struct socket *newsock);
static void iucv_sock_kill(struct sock *sk);
static void iucv_sock_close(struct sock *sk);
-static void iucv_sever_path(struct sock *, int);
-static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev,
- struct packet_type *pt, struct net_device *orig_dev);
-static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
- struct sk_buff *skb, u8 flags);
-static void afiucv_hs_callback_txnotify(struct sk_buff *, enum iucv_tx_notify);
-
-/* Call Back functions */
-static void iucv_callback_rx(struct iucv_path *, struct iucv_message *);
-static void iucv_callback_txdone(struct iucv_path *, struct iucv_message *);
-static void iucv_callback_connack(struct iucv_path *, u8 *);
-static int iucv_callback_connreq(struct iucv_path *, u8 *, u8 *);
-static void iucv_callback_connrej(struct iucv_path *, u8 *);
-static void iucv_callback_shutdown(struct iucv_path *, u8 *);
+static void afiucv_hs_callback_txnotify(struct sock *sk, enum iucv_tx_notify);
static struct iucv_sock_list iucv_sk_list = {
.lock = __RW_LOCK_UNLOCKED(iucv_sk_list.lock),
.autobind_name = ATOMIC_INIT(0)
};
-static struct iucv_handler af_iucv_handler = {
- .path_pending = iucv_callback_connreq,
- .path_complete = iucv_callback_connack,
- .path_severed = iucv_callback_connrej,
- .message_pending = iucv_callback_rx,
- .message_complete = iucv_callback_txdone,
- .path_quiesced = iucv_callback_shutdown,
-};
-
static inline void high_nmcpy(unsigned char *dst, char *src)
{
memcpy(dst, src, 8);
@@ -125,110 +108,6 @@ static inline void low_nmcpy(unsigned char *dst, char *src)
memcpy(&dst[8], src, 8);
}
-static int afiucv_pm_prepare(struct device *dev)
-{
-#ifdef CONFIG_PM_DEBUG
- printk(KERN_WARNING "afiucv_pm_prepare\n");
-#endif
- return 0;
-}
-
-static void afiucv_pm_complete(struct device *dev)
-{
-#ifdef CONFIG_PM_DEBUG
- printk(KERN_WARNING "afiucv_pm_complete\n");
-#endif
-}
-
-/**
- * afiucv_pm_freeze() - Freeze PM callback
- * @dev: AFIUCV dummy device
- *
- * Sever all established IUCV communication pathes
- */
-static int afiucv_pm_freeze(struct device *dev)
-{
- struct iucv_sock *iucv;
- struct sock *sk;
-
-#ifdef CONFIG_PM_DEBUG
- printk(KERN_WARNING "afiucv_pm_freeze\n");
-#endif
- read_lock(&iucv_sk_list.lock);
- sk_for_each(sk, &iucv_sk_list.head) {
- iucv = iucv_sk(sk);
- switch (sk->sk_state) {
- case IUCV_DISCONN:
- case IUCV_CLOSING:
- case IUCV_CONNECTED:
- iucv_sever_path(sk, 0);
- break;
- case IUCV_OPEN:
- case IUCV_BOUND:
- case IUCV_LISTEN:
- case IUCV_CLOSED:
- default:
- break;
- }
- skb_queue_purge(&iucv->send_skb_q);
- skb_queue_purge(&iucv->backlog_skb_q);
- }
- read_unlock(&iucv_sk_list.lock);
- return 0;
-}
-
-/**
- * afiucv_pm_restore_thaw() - Thaw and restore PM callback
- * @dev: AFIUCV dummy device
- *
- * socket clean up after freeze
- */
-static int afiucv_pm_restore_thaw(struct device *dev)
-{
- struct sock *sk;
-
-#ifdef CONFIG_PM_DEBUG
- printk(KERN_WARNING "afiucv_pm_restore_thaw\n");
-#endif
- read_lock(&iucv_sk_list.lock);
- sk_for_each(sk, &iucv_sk_list.head) {
- switch (sk->sk_state) {
- case IUCV_CONNECTED:
- sk->sk_err = EPIPE;
- sk->sk_state = IUCV_DISCONN;
- sk->sk_state_change(sk);
- break;
- case IUCV_DISCONN:
- case IUCV_CLOSING:
- case IUCV_LISTEN:
- case IUCV_BOUND:
- case IUCV_OPEN:
- default:
- break;
- }
- }
- read_unlock(&iucv_sk_list.lock);
- return 0;
-}
-
-static const struct dev_pm_ops afiucv_pm_ops = {
- .prepare = afiucv_pm_prepare,
- .complete = afiucv_pm_complete,
- .freeze = afiucv_pm_freeze,
- .thaw = afiucv_pm_restore_thaw,
- .restore = afiucv_pm_restore_thaw,
-};
-
-static struct device_driver af_iucv_driver = {
- .owner = THIS_MODULE,
- .name = "afiucv",
- .bus = NULL,
- .pm = &afiucv_pm_ops,
-};
-
-/* dummy device used as trigger for PM functions */
-static struct device *af_iucv_dev;
-
/**
* iucv_msg_length() - Returns the length of an iucv message.
* @msg: Pointer to struct iucv_message, MUST NOT be NULL
@@ -264,7 +143,7 @@ static inline size_t iucv_msg_length(struct iucv_message *msg)
* iucv_sock_in_state() - check for specific states
* @sk: sock structure
* @state: first iucv sk state
- * @state: second iucv sk state
+ * @state2: second iucv sk state
*
* Returns true if the socket in either in the first or second state.
*/
@@ -288,13 +167,13 @@ static inline int iucv_below_msglim(struct sock *sk)
if (sk->sk_state != IUCV_CONNECTED)
return 1;
if (iucv->transport == AF_IUCV_TRANS_IUCV)
- return (skb_queue_len(&iucv->send_skb_q) < iucv->path->msglim);
+ return (atomic_read(&iucv->skbs_in_xmit) < iucv->path->msglim);
else
return ((atomic_read(&iucv->msg_sent) < iucv->msglimit_peer) &&
(atomic_read(&iucv->pendings) <= 0));
}
-/**
+/*
* iucv_sock_wake_msglim() - Wake up thread waiting on msg limit
*/
static void iucv_sock_wake_msglim(struct sock *sk)
@@ -305,11 +184,11 @@ static void iucv_sock_wake_msglim(struct sock *sk)
wq = rcu_dereference(sk->sk_wq);
if (skwq_has_sleeper(wq))
wake_up_interruptible_all(&wq->wait);
- sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
+ sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
rcu_read_unlock();
}
-/**
+/*
* afiucv_hs_send() - send a message through HiperSockets transport
*/
static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
@@ -317,16 +196,11 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
{
struct iucv_sock *iucv = iucv_sk(sock);
struct af_iucv_trans_hdr *phs_hdr;
- struct sk_buff *nskb;
int err, confirm_recv = 0;
- memset(skb->head, 0, ETH_HLEN);
- phs_hdr = skb_push(skb, sizeof(struct af_iucv_trans_hdr));
- skb_reset_mac_header(skb);
+ phs_hdr = skb_push(skb, sizeof(*phs_hdr));
+ memset(phs_hdr, 0, sizeof(*phs_hdr));
skb_reset_network_header(skb);
- skb_push(skb, ETH_HLEN);
- skb_reset_mac_header(skb);
- memset(phs_hdr, 0, sizeof(struct af_iucv_trans_hdr));
phs_hdr->magic = ETH_P_AF_IUCV;
phs_hdr->version = 1;
@@ -355,6 +229,9 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
err = -ENODEV;
goto err_free;
}
+
+ dev_hard_header(skb, skb->dev, ETH_P_AF_IUCV, NULL, NULL, skb->len);
+
if (!(skb->dev->flags & IFF_UP) || !netif_carrier_ok(skb->dev)) {
err = -ENETDOWN;
goto err_free;
@@ -364,20 +241,16 @@ static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock,
err = -EMSGSIZE;
goto err_free;
}
- skb_trim(skb, skb->dev->mtu);
+ err = pskb_trim(skb, skb->dev->mtu);
+ if (err)
+ goto err_free;
}
skb->protocol = cpu_to_be16(ETH_P_AF_IUCV);
- nskb = skb_clone(skb, GFP_ATOMIC);
- if (!nskb) {
- err = -ENOMEM;
- goto err_free;
- }
- skb_queue_tail(&iucv->send_skb_q, nskb);
+ atomic_inc(&iucv->skbs_in_xmit);
err = dev_queue_xmit(skb);
if (net_xmit_eval(err)) {
- skb_unlink(nskb, &iucv->send_skb_q);
- kfree_skb(nskb);
+ atomic_dec(&iucv->skbs_in_xmit);
} else {
atomic_sub(confirm_recv, &iucv->msg_recv);
WARN_ON(atomic_read(&iucv->msg_recv) < 0);
@@ -405,8 +278,6 @@ static void iucv_sock_destruct(struct sock *sk)
skb_queue_purge(&sk->sk_receive_queue);
skb_queue_purge(&sk->sk_error_queue);
- sk_mem_reclaim(sk);
-
if (!sock_flag(sk, SOCK_DEAD)) {
pr_err("Attempt to release alive iucv socket %p\n", sk);
return;
@@ -432,6 +303,20 @@ static void iucv_sock_cleanup_listen(struct sock *parent)
parent->sk_state = IUCV_CLOSED;
}
+static void iucv_sock_link(struct iucv_sock_list *l, struct sock *sk)
+{
+ write_lock_bh(&l->lock);
+ sk_add_node(sk, &l->head);
+ write_unlock_bh(&l->lock);
+}
+
+static void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *sk)
+{
+ write_lock_bh(&l->lock);
+ sk_del_node_init(sk);
+ write_unlock_bh(&l->lock);
+}
+
/* Kill socket (only if zapped and orphaned) */
static void iucv_sock_kill(struct sock *sk)
{
@@ -450,8 +335,8 @@ static void iucv_sever_path(struct sock *sk, int with_user_data)
struct iucv_sock *iucv = iucv_sk(sk);
struct iucv_path *path = iucv->path;
- if (iucv->path) {
- iucv->path = NULL;
+ /* Whoever resets the path pointer, must sever and free it. */
+ if (xchg(&iucv->path, NULL)) {
if (with_user_data) {
low_nmcpy(user_data, iucv->src_name);
high_nmcpy(user_data, iucv->dst_name);
@@ -466,12 +351,14 @@ static void iucv_sever_path(struct sock *sk, int with_user_data)
/* Send controlling flags through an IUCV socket for HIPER transport */
static int iucv_send_ctrl(struct sock *sk, u8 flags)
{
+ struct iucv_sock *iucv = iucv_sk(sk);
int err = 0;
int blen;
struct sk_buff *skb;
u8 shutdown = 0;
- blen = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN;
+ blen = sizeof(struct af_iucv_trans_hdr) +
+ LL_RESERVED_SPACE(iucv->hs_dev);
if (sk->sk_shutdown & SEND_SHUTDOWN) {
/* controlling flags should be sent anyway */
shutdown = sk->sk_shutdown;
@@ -507,11 +394,13 @@ static void iucv_sock_close(struct sock *sk)
sk->sk_state = IUCV_DISCONN;
sk->sk_state_change(sk);
}
- case IUCV_DISCONN: /* fall through */
+ fallthrough;
+
+ case IUCV_DISCONN:
sk->sk_state = IUCV_CLOSING;
sk->sk_state_change(sk);
- if (!err && !skb_queue_empty(&iucv->send_skb_q)) {
+ if (!err && atomic_read(&iucv->skbs_in_xmit) > 0) {
if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime)
timeo = sk->sk_lingertime;
else
@@ -520,8 +409,9 @@ static void iucv_sock_close(struct sock *sk)
iucv_sock_in_state(sk, IUCV_CLOSED, 0),
timeo);
}
+ fallthrough;
- case IUCV_CLOSING: /* fall through */
+ case IUCV_CLOSING:
sk->sk_state = IUCV_CLOSED;
sk->sk_state_change(sk);
@@ -530,8 +420,9 @@ static void iucv_sock_close(struct sock *sk)
skb_queue_purge(&iucv->send_skb_q);
skb_queue_purge(&iucv->backlog_skb_q);
+ fallthrough;
- default: /* fall through */
+ default:
iucv_sever_path(sk, 1);
}
@@ -576,11 +467,12 @@ static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio,
atomic_set(&iucv->pendings, 0);
iucv->flags = 0;
iucv->msglimit = 0;
+ atomic_set(&iucv->skbs_in_xmit, 0);
atomic_set(&iucv->msg_sent, 0);
atomic_set(&iucv->msg_recv, 0);
iucv->path = NULL;
iucv->sk_txnotify = afiucv_hs_callback_txnotify;
- memset(&iucv->src_user_id , 0, 32);
+ memset(&iucv->init, 0, sizeof(iucv->init));
if (pr_iucv)
iucv->transport = AF_IUCV_TRANS_IUCV;
else
@@ -588,7 +480,6 @@ static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio,
sk->sk_destruct = iucv_sock_destruct;
sk->sk_sndtimeo = IUCV_CONN_TIMEOUT;
- sk->sk_allocation = GFP_DMA;
sock_reset_flag(sk, SOCK_ZAPPED);
@@ -599,53 +490,7 @@ static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio,
return sk;
}
-/* Create an IUCV socket */
-static int iucv_sock_create(struct net *net, struct socket *sock, int protocol,
- int kern)
-{
- struct sock *sk;
-
- if (protocol && protocol != PF_IUCV)
- return -EPROTONOSUPPORT;
-
- sock->state = SS_UNCONNECTED;
-
- switch (sock->type) {
- case SOCK_STREAM:
- sock->ops = &iucv_sock_ops;
- break;
- case SOCK_SEQPACKET:
- /* currently, proto ops can handle both sk types */
- sock->ops = &iucv_sock_ops;
- break;
- default:
- return -ESOCKTNOSUPPORT;
- }
-
- sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL, kern);
- if (!sk)
- return -ENOMEM;
-
- iucv_sock_init(sk, NULL);
-
- return 0;
-}
-
-void iucv_sock_link(struct iucv_sock_list *l, struct sock *sk)
-{
- write_lock_bh(&l->lock);
- sk_add_node(sk, &l->head);
- write_unlock_bh(&l->lock);
-}
-
-void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *sk)
-{
- write_lock_bh(&l->lock);
- sk_del_node_init(sk);
- write_unlock_bh(&l->lock);
-}
-
-void iucv_accept_enqueue(struct sock *parent, struct sock *sk)
+static void iucv_accept_enqueue(struct sock *parent, struct sock *sk)
{
unsigned long flags;
struct iucv_sock *par = iucv_sk(parent);
@@ -658,7 +503,7 @@ void iucv_accept_enqueue(struct sock *parent, struct sock *sk)
sk_acceptq_added(parent);
}
-void iucv_accept_unlink(struct sock *sk)
+static void iucv_accept_unlink(struct sock *sk)
{
unsigned long flags;
struct iucv_sock *par = iucv_sk(iucv_sk(sk)->parent);
@@ -671,7 +516,8 @@ void iucv_accept_unlink(struct sock *sk)
sock_put(sk);
}
-struct sock *iucv_accept_dequeue(struct sock *parent, struct socket *newsock)
+static struct sock *iucv_accept_dequeue(struct sock *parent,
+ struct socket *newsock)
{
struct iucv_sock *isk, *n;
struct sock *sk;
@@ -706,24 +552,25 @@ static void __iucv_auto_name(struct iucv_sock *iucv)
{
char name[12];
- sprintf(name, "%08x", atomic_inc_return(&iucv_sk_list.autobind_name));
+ scnprintf(name, sizeof(name),
+ "%08x", atomic_inc_return(&iucv_sk_list.autobind_name));
while (__iucv_get_sock_by_name(name)) {
- sprintf(name, "%08x",
- atomic_inc_return(&iucv_sk_list.autobind_name));
+ scnprintf(name, sizeof(name), "%08x",
+ atomic_inc_return(&iucv_sk_list.autobind_name));
}
memcpy(iucv->src_name, name, 8);
}
/* Bind an unbound socket */
-static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr,
+static int iucv_sock_bind(struct socket *sock, struct sockaddr_unsized *addr,
int addr_len)
{
- struct sockaddr_iucv *sa = (struct sockaddr_iucv *) addr;
+ DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr);
+ char uid[sizeof(sa->siucv_user_id)];
struct sock *sk = sock->sk;
struct iucv_sock *iucv;
int err = 0;
struct net_device *dev;
- char uid[9];
/* Verify the input sockaddr */
if (addr_len < sizeof(struct sockaddr_iucv) ||
@@ -758,7 +605,7 @@ static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr,
for_each_netdev_rcu(&init_net, dev) {
if (!memcmp(dev->perm_addr, uid, 8)) {
memcpy(iucv->src_user_id, sa->siucv_user_id, 8);
- /* Check for unitialized siucv_name */
+ /* Check for uninitialized siucv_name */
if (strncmp(sa->siucv_name, " ", 8) == 0)
__iucv_auto_name(iucv);
else
@@ -782,6 +629,7 @@ vm_bind:
memcpy(iucv->src_user_id, iucv_userid, 8);
sk->sk_state = IUCV_BOUND;
iucv->transport = AF_IUCV_TRANS_IUCV;
+ sk->sk_allocation |= GFP_DMA;
if (!iucv->msglimit)
iucv->msglimit = IUCV_QUEUELEN_DEFAULT;
goto done_unlock;
@@ -806,6 +654,8 @@ static int iucv_sock_autobind(struct sock *sk)
return -EPROTO;
memcpy(iucv->src_user_id, iucv_userid, 8);
+ iucv->transport = AF_IUCV_TRANS_IUCV;
+ sk->sk_allocation |= GFP_DMA;
write_lock_bh(&iucv_sk_list.lock);
__iucv_auto_name(iucv);
@@ -817,9 +667,9 @@ static int iucv_sock_autobind(struct sock *sk)
return err;
}
-static int afiucv_path_connect(struct socket *sock, struct sockaddr *addr)
+static int afiucv_path_connect(struct socket *sock, struct sockaddr_unsized *addr)
{
- struct sockaddr_iucv *sa = (struct sockaddr_iucv *) addr;
+ DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr);
struct sock *sk = sock->sk;
struct iucv_sock *iucv = iucv_sk(sk);
unsigned char user_data[16];
@@ -863,10 +713,10 @@ done:
}
/* Connect an unconnected socket */
-static int iucv_sock_connect(struct socket *sock, struct sockaddr *addr,
+static int iucv_sock_connect(struct socket *sock, struct sockaddr_unsized *addr,
int alen, int flags)
{
- struct sockaddr_iucv *sa = (struct sockaddr_iucv *) addr;
+ DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr);
struct sock *sk = sock->sk;
struct iucv_sock *iucv = iucv_sk(sk);
int err;
@@ -946,7 +796,7 @@ done:
/* Accept a pending connection */
static int iucv_sock_accept(struct socket *sock, struct socket *newsock,
- int flags, bool kern)
+ struct proto_accept_arg *arg)
{
DECLARE_WAITQUEUE(wait, current);
struct sock *sk = sock->sk, *nsk;
@@ -960,7 +810,7 @@ static int iucv_sock_accept(struct socket *sock, struct socket *newsock,
goto done;
}
- timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+ timeo = sock_rcvtimeo(sk, arg->flags & O_NONBLOCK);
/* Wait for an incoming connection */
add_wait_queue_exclusive(sk_sleep(sk), &wait);
@@ -1002,7 +852,7 @@ done:
static int iucv_sock_getname(struct socket *sock, struct sockaddr *addr,
int peer)
{
- struct sockaddr_iucv *siucv = (struct sockaddr_iucv *) addr;
+ DECLARE_SOCKADDR(struct sockaddr_iucv *, siucv, addr);
struct sock *sk = sock->sk;
struct iucv_sock *iucv = iucv_sk(sk);
@@ -1089,7 +939,6 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
/* initialize defaults */
cmsg_done = 0; /* check for duplicate headers */
- txmsg.class = 0;
/* iterate over control messages */
for_each_cmsghdr(cmsg, msg) {
@@ -1131,8 +980,9 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
* segmented records using the MSG_EOR flag), but
* for SOCK_STREAM we might want to improve it in future */
if (iucv->transport == AF_IUCV_TRANS_HIPER) {
- headroom = sizeof(struct af_iucv_trans_hdr) + ETH_HLEN;
- linear = len;
+ headroom = sizeof(struct af_iucv_trans_hdr) +
+ LL_RESERVED_SPACE(iucv->hs_dev);
+ linear = min(len, PAGE_SIZE - headroom);
} else {
if (len < PAGE_SIZE) {
linear = len;
@@ -1183,6 +1033,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
}
} else { /* Classic VM IUCV transport */
skb_queue_tail(&iucv->send_skb_q, skb);
+ atomic_inc(&iucv->skbs_in_xmit);
if (((iucv->path->flags & IUCV_IPRMDATA) & iucv->flags) &&
skb->len <= 7) {
@@ -1191,14 +1042,16 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
/* on success: there is no message_complete callback */
/* for an IPRMDATA msg; remove skb from send queue */
if (err == 0) {
+ atomic_dec(&iucv->skbs_in_xmit);
skb_unlink(skb, &iucv->send_skb_q);
- kfree_skb(skb);
+ consume_skb(skb);
}
/* this error should never happen since the */
/* IUCV_IPRMDATA path flag is set... sever path */
if (err == 0x15) {
pr_iucv->path_sever(iucv->path, NULL);
+ atomic_dec(&iucv->skbs_in_xmit);
skb_unlink(skb, &iucv->send_skb_q);
err = -EPIPE;
goto fail;
@@ -1208,13 +1061,12 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
int i;
/* skip iucv_array lying in the headroom */
- iba[0].address = (u32)(addr_t)skb->data;
+ iba[0].address = virt_to_dma32(skb->data);
iba[0].length = (u32)skb_headlen(skb);
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- iba[i + 1].address =
- (u32)(addr_t)skb_frag_address(frag);
+ iba[i + 1].address = virt_to_dma32(skb_frag_address(frag));
iba[i + 1].length = (u32)skb_frag_size(frag);
}
err = pr_iucv->message_send(iucv->path, &txmsg,
@@ -1237,6 +1089,8 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
} else {
err = -EPIPE;
}
+
+ atomic_dec(&iucv->skbs_in_xmit);
skb_unlink(skb, &iucv->send_skb_q);
goto fail;
}
@@ -1308,13 +1162,12 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb,
struct iucv_array *iba = (struct iucv_array *)skb->head;
int i;
- iba[0].address = (u32)(addr_t)skb->data;
+ iba[0].address = virt_to_dma32(skb->data);
iba[0].length = (u32)skb_headlen(skb);
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- iba[i + 1].address =
- (u32)(addr_t)skb_frag_address(frag);
+ iba[i + 1].address = virt_to_dma32(skb_frag_address(frag));
iba[i + 1].length = (u32)skb_frag_size(frag);
}
rc = pr_iucv->message_receive(path, msg,
@@ -1334,7 +1187,7 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb,
IUCV_SKB_CB(skb)->offset = 0;
if (sk_filter(sk, skb)) {
- atomic_inc(&sk->sk_drops); /* skb rejected by filter */
+ sk_drops_inc(sk); /* skb rejected by filter */
kfree_skb(skb);
return;
}
@@ -1367,7 +1220,6 @@ static void iucv_process_message_q(struct sock *sk)
static int iucv_sock_recvmsg(struct socket *sock, struct msghdr *msg,
size_t len, int flags)
{
- int noblock = flags & MSG_DONTWAIT;
struct sock *sk = sock->sk;
struct iucv_sock *iucv = iucv_sk(sk);
unsigned int copied, rlen;
@@ -1385,8 +1237,10 @@ static int iucv_sock_recvmsg(struct socket *sock, struct msghdr *msg,
return -EOPNOTSUPP;
/* receive/dequeue next skb:
- * the function understands MSG_PEEK and, thus, does not dequeue skb */
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ * the function understands MSG_PEEK and, thus, does not dequeue skb
+ * only refcount is increased.
+ */
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb) {
if (sk->sk_shutdown & RCV_SHUTDOWN)
return 0;
@@ -1401,9 +1255,8 @@ static int iucv_sock_recvmsg(struct socket *sock, struct msghdr *msg,
cskb = skb;
if (skb_copy_datagram_msg(cskb, offset, msg, copied)) {
- if (!(flags & MSG_PEEK))
- skb_queue_head(&sk->sk_receive_queue, skb);
- return -EFAULT;
+ err = -EFAULT;
+ goto err_out;
}
/* SOCK_SEQPACKET: set MSG_TRUNC if recv buf size is too small */
@@ -1420,11 +1273,8 @@ static int iucv_sock_recvmsg(struct socket *sock, struct msghdr *msg,
err = put_cmsg(msg, SOL_IUCV, SCM_IUCV_TRGCLS,
sizeof(IUCV_SKB_CB(skb)->class),
(void *)&IUCV_SKB_CB(skb)->class);
- if (err) {
- if (!(flags & MSG_PEEK))
- skb_queue_head(&sk->sk_receive_queue, skb);
- return err;
- }
+ if (err)
+ goto err_out;
/* Mark read part of skb as used */
if (!(flags & MSG_PEEK)) {
@@ -1438,7 +1288,7 @@ static int iucv_sock_recvmsg(struct socket *sock, struct msghdr *msg,
}
}
- kfree_skb(skb);
+ consume_skb(skb);
if (iucv->transport == AF_IUCV_TRANS_HIPER) {
atomic_inc(&iucv->msg_recv);
if (atomic_read(&iucv->msg_recv) > iucv->msglimit) {
@@ -1480,8 +1330,18 @@ done:
/* SOCK_SEQPACKET: return real length if MSG_TRUNC is set */
if (sk->sk_type == SOCK_SEQPACKET && (flags & MSG_TRUNC))
copied = rlen;
+ if (flags & MSG_PEEK)
+ skb_unref(skb);
return copied;
+
+err_out:
+ if (!(flags & MSG_PEEK))
+ skb_queue_head(&sk->sk_receive_queue, skb);
+ else
+ skb_unref(skb);
+
+ return err;
}
static inline __poll_t iucv_accept_poll(struct sock *parent)
@@ -1499,13 +1359,13 @@ static inline __poll_t iucv_accept_poll(struct sock *parent)
return 0;
}
-__poll_t iucv_sock_poll(struct file *file, struct socket *sock,
- poll_table *wait)
+static __poll_t iucv_sock_poll(struct file *file, struct socket *sock,
+ poll_table *wait)
{
struct sock *sk = sock->sk;
__poll_t mask = 0;
- sock_poll_wait(file, wait);
+ sock_poll_wait(file, sock, wait);
if (sk->sk_state == IUCV_LISTEN)
return iucv_accept_poll(sk);
@@ -1562,7 +1422,8 @@ static int iucv_sock_shutdown(struct socket *sock, int how)
break;
}
- if (how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) {
+ if ((how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) &&
+ sk->sk_state == IUCV_CONNECTED) {
if (iucv->transport == AF_IUCV_TRANS_IUCV) {
txmsg.class = 0;
txmsg.tag = 0;
@@ -1622,7 +1483,7 @@ static int iucv_sock_release(struct socket *sock)
/* getsockopt and setsockopt */
static int iucv_sock_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct iucv_sock *iucv = iucv_sk(sk);
@@ -1635,7 +1496,7 @@ static int iucv_sock_setsockopt(struct socket *sock, int level, int optname,
if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(val, (int __user *) optval))
+ if (copy_from_sockptr(&val, optval, sizeof(int)))
return -EFAULT;
rc = 0;
@@ -1652,7 +1513,7 @@ static int iucv_sock_setsockopt(struct socket *sock, int level, int optname,
switch (sk->sk_state) {
case IUCV_OPEN:
case IUCV_BOUND:
- if (val < 1 || val > (u16)(~0))
+ if (val < 1 || val > U16_MAX)
rc = -EINVAL;
else
iucv->msglimit = val;
@@ -1772,7 +1633,7 @@ static int iucv_callback_connreq(struct iucv_path *path,
}
/* Create the new socket */
- nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0);
+ nsk = iucv_sock_alloc(NULL, sk->sk_protocol, GFP_ATOMIC, 0);
if (!nsk) {
err = pr_iucv->path_sever(path, user_data);
iucv_path_free(path);
@@ -1781,6 +1642,8 @@ static int iucv_callback_connreq(struct iucv_path *path,
niucv = iucv_sk(nsk);
iucv_sock_init(nsk, sk);
+ niucv->transport = AF_IUCV_TRANS_IUCV;
+ nsk->sk_allocation |= GFP_DMA;
/* Set the new iucv_sock */
memcpy(niucv->dst_name, ipuser + 8, 8);
@@ -1873,35 +1736,38 @@ static void iucv_callback_txdone(struct iucv_path *path,
{
struct sock *sk = path->private;
struct sk_buff *this = NULL;
- struct sk_buff_head *list = &iucv_sk(sk)->send_skb_q;
- struct sk_buff *list_skb = list->next;
+ struct sk_buff_head *list;
+ struct sk_buff *list_skb;
+ struct iucv_sock *iucv;
unsigned long flags;
+ iucv = iucv_sk(sk);
+ list = &iucv->send_skb_q;
+
bh_lock_sock(sk);
- if (!skb_queue_empty(list)) {
- spin_lock_irqsave(&list->lock, flags);
- while (list_skb != (struct sk_buff *)list) {
- if (msg->tag == IUCV_SKB_CB(list_skb)->tag) {
- this = list_skb;
- break;
- }
- list_skb = list_skb->next;
+ spin_lock_irqsave(&list->lock, flags);
+ skb_queue_walk(list, list_skb) {
+ if (msg->tag == IUCV_SKB_CB(list_skb)->tag) {
+ this = list_skb;
+ break;
}
- if (this)
- __skb_unlink(this, list);
+ }
+ if (this) {
+ atomic_dec(&iucv->skbs_in_xmit);
+ __skb_unlink(this, list);
+ }
- spin_unlock_irqrestore(&list->lock, flags);
+ spin_unlock_irqrestore(&list->lock, flags);
- if (this) {
- kfree_skb(this);
- /* wake up any process waiting for sending */
- iucv_sock_wake_msglim(sk);
- }
+ if (this) {
+ consume_skb(this);
+ /* wake up any process waiting for sending */
+ iucv_sock_wake_msglim(sk);
}
if (sk->sk_state == IUCV_CLOSING) {
- if (skb_queue_empty(&iucv_sk(sk)->send_skb_q)) {
+ if (atomic_read(&iucv->skbs_in_xmit) == 0) {
sk->sk_state = IUCV_CLOSED;
sk->sk_state_change(sk);
}
@@ -1940,11 +1806,19 @@ static void iucv_callback_shutdown(struct iucv_path *path, u8 ipuser[16])
bh_unlock_sock(sk);
}
+static struct iucv_handler af_iucv_handler = {
+ .path_pending = iucv_callback_connreq,
+ .path_complete = iucv_callback_connack,
+ .path_severed = iucv_callback_connrej,
+ .message_pending = iucv_callback_rx,
+ .message_complete = iucv_callback_txdone,
+ .path_quiesced = iucv_callback_shutdown,
+};
+
/***************** HiperSockets transport callbacks ********************/
static void afiucv_swap_src_dest(struct sk_buff *skb)
{
- struct af_iucv_trans_hdr *trans_hdr =
- (struct af_iucv_trans_hdr *)skb->data;
+ struct af_iucv_trans_hdr *trans_hdr = iucv_trans_hdr(skb);
char tmpID[8];
char tmpName[8];
@@ -1962,18 +1836,17 @@ static void afiucv_swap_src_dest(struct sk_buff *skb)
memset(skb->data, 0, ETH_HLEN);
}
-/**
+/*
* afiucv_hs_callback_syn - react on received SYN
- **/
+ */
static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb)
{
+ struct af_iucv_trans_hdr *trans_hdr = iucv_trans_hdr(skb);
struct sock *nsk;
struct iucv_sock *iucv, *niucv;
- struct af_iucv_trans_hdr *trans_hdr;
int err;
iucv = iucv_sk(sk);
- trans_hdr = (struct af_iucv_trans_hdr *)skb->data;
if (!iucv) {
/* no sock - connection refused */
afiucv_swap_src_dest(skb);
@@ -1982,7 +1855,7 @@ static int afiucv_hs_callback_syn(struct sock *sk, struct sk_buff *skb)
goto out;
}
- nsk = iucv_sock_alloc(NULL, sk->sk_type, GFP_ATOMIC, 0);
+ nsk = iucv_sock_alloc(NULL, sk->sk_protocol, GFP_ATOMIC, 0);
bh_lock_sock(sk);
if ((sk->sk_state != IUCV_LISTEN) ||
sk_acceptq_is_full(sk) ||
@@ -2028,78 +1901,76 @@ out:
return NET_RX_SUCCESS;
}
-/**
+/*
* afiucv_hs_callback_synack() - react on received SYN-ACK
- **/
+ */
static int afiucv_hs_callback_synack(struct sock *sk, struct sk_buff *skb)
{
struct iucv_sock *iucv = iucv_sk(sk);
- struct af_iucv_trans_hdr *trans_hdr =
- (struct af_iucv_trans_hdr *)skb->data;
- if (!iucv)
- goto out;
- if (sk->sk_state != IUCV_BOUND)
- goto out;
+ if (!iucv || sk->sk_state != IUCV_BOUND) {
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
+
bh_lock_sock(sk);
- iucv->msglimit_peer = trans_hdr->window;
+ iucv->msglimit_peer = iucv_trans_hdr(skb)->window;
sk->sk_state = IUCV_CONNECTED;
sk->sk_state_change(sk);
bh_unlock_sock(sk);
-out:
- kfree_skb(skb);
+ consume_skb(skb);
return NET_RX_SUCCESS;
}
-/**
+/*
* afiucv_hs_callback_synfin() - react on received SYN_FIN
- **/
+ */
static int afiucv_hs_callback_synfin(struct sock *sk, struct sk_buff *skb)
{
struct iucv_sock *iucv = iucv_sk(sk);
- if (!iucv)
- goto out;
- if (sk->sk_state != IUCV_BOUND)
- goto out;
+ if (!iucv || sk->sk_state != IUCV_BOUND) {
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
+
bh_lock_sock(sk);
sk->sk_state = IUCV_DISCONN;
sk->sk_state_change(sk);
bh_unlock_sock(sk);
-out:
- kfree_skb(skb);
+ consume_skb(skb);
return NET_RX_SUCCESS;
}
-/**
+/*
* afiucv_hs_callback_fin() - react on received FIN
- **/
+ */
static int afiucv_hs_callback_fin(struct sock *sk, struct sk_buff *skb)
{
struct iucv_sock *iucv = iucv_sk(sk);
/* other end of connection closed */
- if (!iucv)
- goto out;
+ if (!iucv) {
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
+
bh_lock_sock(sk);
if (sk->sk_state == IUCV_CONNECTED) {
sk->sk_state = IUCV_DISCONN;
sk->sk_state_change(sk);
}
bh_unlock_sock(sk);
-out:
- kfree_skb(skb);
+ consume_skb(skb);
return NET_RX_SUCCESS;
}
-/**
+/*
* afiucv_hs_callback_win() - react on received WIN
- **/
+ */
static int afiucv_hs_callback_win(struct sock *sk, struct sk_buff *skb)
{
struct iucv_sock *iucv = iucv_sk(sk);
- struct af_iucv_trans_hdr *trans_hdr =
- (struct af_iucv_trans_hdr *)skb->data;
if (!iucv)
return NET_RX_SUCCESS;
@@ -2107,14 +1978,14 @@ static int afiucv_hs_callback_win(struct sock *sk, struct sk_buff *skb)
if (sk->sk_state != IUCV_CONNECTED)
return NET_RX_SUCCESS;
- atomic_sub(trans_hdr->window, &iucv->msg_sent);
+ atomic_sub(iucv_trans_hdr(skb)->window, &iucv->msg_sent);
iucv_sock_wake_msglim(sk);
return NET_RX_SUCCESS;
}
-/**
+/*
* afiucv_hs_callback_rx() - react on received data
- **/
+ */
static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb)
{
struct iucv_sock *iucv = iucv_sk(sk);
@@ -2140,7 +2011,7 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb)
skb_reset_network_header(skb);
IUCV_SKB_CB(skb)->offset = 0;
if (sk_filter(sk, skb)) {
- atomic_inc(&sk->sk_drops); /* skb rejected by filter */
+ sk_drops_inc(sk); /* skb rejected by filter */
kfree_skb(skb);
return NET_RX_SUCCESS;
}
@@ -2156,11 +2027,11 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb)
return NET_RX_SUCCESS;
}
-/**
+/*
* afiucv_hs_rcv() - base function for arriving data through HiperSockets
* transport
* called from netif RX softirq
- **/
+ */
static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
@@ -2170,22 +2041,12 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev,
int err = NET_RX_SUCCESS;
char nullstring[8];
- if (skb->len < (ETH_HLEN + sizeof(struct af_iucv_trans_hdr))) {
- WARN_ONCE(1, "AF_IUCV too short skb, len=%d, min=%d",
- (int)skb->len,
- (int)(ETH_HLEN + sizeof(struct af_iucv_trans_hdr)));
+ if (!pskb_may_pull(skb, sizeof(*trans_hdr))) {
kfree_skb(skb);
return NET_RX_SUCCESS;
}
- if (skb_headlen(skb) < (ETH_HLEN + sizeof(struct af_iucv_trans_hdr)))
- if (skb_linearize(skb)) {
- WARN_ONCE(1, "AF_IUCV skb_linearize failed, len=%d",
- (int)skb->len);
- kfree_skb(skb);
- return NET_RX_SUCCESS;
- }
- skb_pull(skb, ETH_HLEN);
- trans_hdr = (struct af_iucv_trans_hdr *)skb->data;
+
+ trans_hdr = iucv_trans_hdr(skb);
EBCASC(trans_hdr->destAppName, sizeof(trans_hdr->destAppName));
EBCASC(trans_hdr->destUserID, sizeof(trans_hdr->destUserID));
EBCASC(trans_hdr->srcAppName, sizeof(trans_hdr->srcAppName));
@@ -2253,13 +2114,13 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev,
case (AF_IUCV_FLAG_WIN):
err = afiucv_hs_callback_win(sk, skb);
if (skb->len == sizeof(struct af_iucv_trans_hdr)) {
- kfree_skb(skb);
+ consume_skb(skb);
break;
}
- /* fall through and receive non-zero length data */
+ fallthrough; /* and receive non-zero length data */
case (AF_IUCV_FLAG_SHT):
/* shutdown request */
- /* fall through and receive zero length data */
+ fallthrough; /* and receive zero length data */
case 0:
/* plain data frame */
IUCV_SKB_CB(skb)->class = trans_hdr->iucv_hdr.class;
@@ -2272,84 +2133,44 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev,
return err;
}
-/**
- * afiucv_hs_callback_txnotify() - handle send notifcations from HiperSockets
+/*
+ * afiucv_hs_callback_txnotify() - handle send notifications from HiperSockets
* transport
- **/
-static void afiucv_hs_callback_txnotify(struct sk_buff *skb,
- enum iucv_tx_notify n)
+ */
+static void afiucv_hs_callback_txnotify(struct sock *sk, enum iucv_tx_notify n)
{
- struct sock *isk = skb->sk;
- struct sock *sk = NULL;
- struct iucv_sock *iucv = NULL;
- struct sk_buff_head *list;
- struct sk_buff *list_skb;
- struct sk_buff *nskb;
- unsigned long flags;
-
- read_lock_irqsave(&iucv_sk_list.lock, flags);
- sk_for_each(sk, &iucv_sk_list.head)
- if (sk == isk) {
- iucv = iucv_sk(sk);
- break;
- }
- read_unlock_irqrestore(&iucv_sk_list.lock, flags);
+ struct iucv_sock *iucv = iucv_sk(sk);
- if (!iucv || sock_flag(sk, SOCK_ZAPPED))
+ if (sock_flag(sk, SOCK_ZAPPED))
return;
- list = &iucv->send_skb_q;
- spin_lock_irqsave(&list->lock, flags);
- if (skb_queue_empty(list))
- goto out_unlock;
- list_skb = list->next;
- nskb = list_skb->next;
- while (list_skb != (struct sk_buff *)list) {
- if (skb_shinfo(list_skb) == skb_shinfo(skb)) {
- switch (n) {
- case TX_NOTIFY_OK:
- __skb_unlink(list_skb, list);
- kfree_skb(list_skb);
- iucv_sock_wake_msglim(sk);
- break;
- case TX_NOTIFY_PENDING:
- atomic_inc(&iucv->pendings);
- break;
- case TX_NOTIFY_DELAYED_OK:
- __skb_unlink(list_skb, list);
- atomic_dec(&iucv->pendings);
- if (atomic_read(&iucv->pendings) <= 0)
- iucv_sock_wake_msglim(sk);
- kfree_skb(list_skb);
- break;
- case TX_NOTIFY_UNREACHABLE:
- case TX_NOTIFY_DELAYED_UNREACHABLE:
- case TX_NOTIFY_TPQFULL: /* not yet used */
- case TX_NOTIFY_GENERALERROR:
- case TX_NOTIFY_DELAYED_GENERALERROR:
- __skb_unlink(list_skb, list);
- kfree_skb(list_skb);
- if (sk->sk_state == IUCV_CONNECTED) {
- sk->sk_state = IUCV_DISCONN;
- sk->sk_state_change(sk);
- }
- break;
- }
- break;
+ switch (n) {
+ case TX_NOTIFY_OK:
+ atomic_dec(&iucv->skbs_in_xmit);
+ iucv_sock_wake_msglim(sk);
+ break;
+ case TX_NOTIFY_PENDING:
+ atomic_inc(&iucv->pendings);
+ break;
+ case TX_NOTIFY_DELAYED_OK:
+ atomic_dec(&iucv->skbs_in_xmit);
+ if (atomic_dec_return(&iucv->pendings) <= 0)
+ iucv_sock_wake_msglim(sk);
+ break;
+ default:
+ atomic_dec(&iucv->skbs_in_xmit);
+ if (sk->sk_state == IUCV_CONNECTED) {
+ sk->sk_state = IUCV_DISCONN;
+ sk->sk_state_change(sk);
}
- list_skb = nskb;
- nskb = nskb->next;
}
-out_unlock:
- spin_unlock_irqrestore(&list->lock, flags);
if (sk->sk_state == IUCV_CLOSING) {
- if (skb_queue_empty(&iucv_sk(sk)->send_skb_q)) {
+ if (atomic_read(&iucv->skbs_in_xmit) == 0) {
sk->sk_state = IUCV_CLOSED;
sk->sk_state_change(sk);
}
}
-
}
/*
@@ -2408,6 +2229,35 @@ static const struct proto_ops iucv_sock_ops = {
.getsockopt = iucv_sock_getsockopt,
};
+static int iucv_sock_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
+{
+ struct sock *sk;
+
+ if (protocol && protocol != PF_IUCV)
+ return -EPROTONOSUPPORT;
+
+ sock->state = SS_UNCONNECTED;
+
+ switch (sock->type) {
+ case SOCK_STREAM:
+ case SOCK_SEQPACKET:
+ /* currently, proto ops can handle both sk types */
+ sock->ops = &iucv_sock_ops;
+ break;
+ default:
+ return -ESOCKTNOSUPPORT;
+ }
+
+ sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL, kern);
+ if (!sk)
+ return -ENOMEM;
+
+ iucv_sock_init(sk, NULL);
+
+ return 0;
+}
+
static const struct net_proto_family iucv_sock_family_ops = {
.family = AF_IUCV,
.owner = THIS_MODULE,
@@ -2419,48 +2269,11 @@ static struct packet_type iucv_packet_type = {
.func = afiucv_hs_rcv,
};
-static int afiucv_iucv_init(void)
-{
- int err;
-
- err = pr_iucv->iucv_register(&af_iucv_handler, 0);
- if (err)
- goto out;
- /* establish dummy device */
- af_iucv_driver.bus = pr_iucv->bus;
- err = driver_register(&af_iucv_driver);
- if (err)
- goto out_iucv;
- af_iucv_dev = kzalloc(sizeof(struct device), GFP_KERNEL);
- if (!af_iucv_dev) {
- err = -ENOMEM;
- goto out_driver;
- }
- dev_set_name(af_iucv_dev, "af_iucv");
- af_iucv_dev->bus = pr_iucv->bus;
- af_iucv_dev->parent = pr_iucv->root;
- af_iucv_dev->release = (void (*)(struct device *))kfree;
- af_iucv_dev->driver = &af_iucv_driver;
- err = device_register(af_iucv_dev);
- if (err)
- goto out_iucv_dev;
- return 0;
-
-out_iucv_dev:
- put_device(af_iucv_dev);
-out_driver:
- driver_unregister(&af_iucv_driver);
-out_iucv:
- pr_iucv->iucv_unregister(&af_iucv_handler, 0);
-out:
- return err;
-}
-
static int __init afiucv_init(void)
{
int err;
- if (MACHINE_IS_VM) {
+ if (machine_is_vm() && IS_ENABLED(CONFIG_IUCV)) {
cpcmd("QUERY USERID", iucv_userid, sizeof(iucv_userid), &err);
if (unlikely(err)) {
WARN_ON(err);
@@ -2468,11 +2281,7 @@ static int __init afiucv_init(void)
goto out;
}
- pr_iucv = try_then_request_module(symbol_get(iucv_if), "iucv");
- if (!pr_iucv) {
- printk(KERN_WARNING "iucv_if lookup failed\n");
- memset(&iucv_userid, 0, sizeof(iucv_userid));
- }
+ pr_iucv = &iucv_if;
} else {
memset(&iucv_userid, 0, sizeof(iucv_userid));
pr_iucv = NULL;
@@ -2486,33 +2295,35 @@ static int __init afiucv_init(void)
goto out_proto;
if (pr_iucv) {
- err = afiucv_iucv_init();
+ err = pr_iucv->iucv_register(&af_iucv_handler, 0);
if (err)
goto out_sock;
- } else
- register_netdevice_notifier(&afiucv_netdev_notifier);
+ }
+
+ err = register_netdevice_notifier(&afiucv_netdev_notifier);
+ if (err)
+ goto out_notifier;
+
dev_add_pack(&iucv_packet_type);
return 0;
+out_notifier:
+ if (pr_iucv)
+ pr_iucv->iucv_unregister(&af_iucv_handler, 0);
out_sock:
sock_unregister(PF_IUCV);
out_proto:
proto_unregister(&iucv_proto);
out:
- if (pr_iucv)
- symbol_put(iucv_if);
return err;
}
static void __exit afiucv_exit(void)
{
- if (pr_iucv) {
- device_unregister(af_iucv_dev);
- driver_unregister(&af_iucv_driver);
+ if (pr_iucv)
pr_iucv->iucv_unregister(&af_iucv_handler, 0);
- symbol_put(iucv_if);
- } else
- unregister_netdevice_notifier(&afiucv_netdev_notifier);
+
+ unregister_netdevice_notifier(&afiucv_netdev_notifier);
dev_remove_pack(&iucv_packet_type);
sock_unregister(PF_IUCV);
proto_unregister(&iucv_proto);
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index eb502c6290c2..da2af413c89d 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* IUCV base infrastructure.
*
@@ -17,26 +18,12 @@
* Documentation used:
* The original source
* CP Programming Service, IBM document # SC24-5760
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
-#define KMSG_COMPONENT "iucv"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#define pr_fmt(fmt) "iucv: " fmt
#include <linux/kernel_stat.h>
+#include <linux/export.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/spinlock.h>
@@ -52,6 +39,7 @@
#include <linux/reboot.h>
#include <net/iucv/iucv.h>
#include <linux/atomic.h>
+#include <asm/machine.h>
#include <asm/ebcdic.h>
#include <asm/io.h>
#include <asm/irq.h>
@@ -75,42 +63,55 @@
#define IUCV_IPNORPY 0x10
#define IUCV_IPALL 0x80
-static int iucv_bus_match(struct device *dev, struct device_driver *drv)
+static int iucv_bus_match(struct device *dev, const struct device_driver *drv)
{
return 0;
}
-enum iucv_pm_states {
- IUCV_PM_INITIAL = 0,
- IUCV_PM_FREEZING = 1,
- IUCV_PM_THAWING = 2,
- IUCV_PM_RESTORING = 3,
-};
-static enum iucv_pm_states iucv_pm_state;
-
-static int iucv_pm_prepare(struct device *);
-static void iucv_pm_complete(struct device *);
-static int iucv_pm_freeze(struct device *);
-static int iucv_pm_thaw(struct device *);
-static int iucv_pm_restore(struct device *);
-
-static const struct dev_pm_ops iucv_pm_ops = {
- .prepare = iucv_pm_prepare,
- .complete = iucv_pm_complete,
- .freeze = iucv_pm_freeze,
- .thaw = iucv_pm_thaw,
- .restore = iucv_pm_restore,
-};
-
-struct bus_type iucv_bus = {
+const struct bus_type iucv_bus = {
.name = "iucv",
.match = iucv_bus_match,
- .pm = &iucv_pm_ops,
};
EXPORT_SYMBOL(iucv_bus);
-struct device *iucv_root;
-EXPORT_SYMBOL(iucv_root);
+static struct device *iucv_root;
+
+static void iucv_release_device(struct device *device)
+{
+ kfree(device);
+}
+
+struct device *iucv_alloc_device(const struct attribute_group **attrs,
+ struct device_driver *driver,
+ void *priv, const char *fmt, ...)
+{
+ struct device *dev;
+ va_list vargs;
+ char buf[20];
+ int rc;
+
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ goto out_error;
+ va_start(vargs, fmt);
+ vscnprintf(buf, sizeof(buf), fmt, vargs);
+ rc = dev_set_name(dev, "%s", buf);
+ va_end(vargs);
+ if (rc)
+ goto out_error;
+ dev->bus = &iucv_bus;
+ dev->parent = iucv_root;
+ dev->driver = driver;
+ dev->groups = attrs;
+ dev->release = iucv_release_device;
+ dev_set_drvdata(dev, priv);
+ return dev;
+
+out_error:
+ kfree(dev);
+ return NULL;
+}
+EXPORT_SYMBOL(iucv_alloc_device);
static int iucv_available;
@@ -119,7 +120,7 @@ struct iucv_irq_data {
u16 ippathid;
u8 ipflags1;
u8 iptype;
- u32 res2[8];
+ u32 res2[9];
};
struct iucv_irq_list {
@@ -141,7 +142,7 @@ static LIST_HEAD(iucv_task_queue);
* The tasklet for fast delivery of iucv interrupts.
*/
static void iucv_tasklet_fn(unsigned long);
-static DECLARE_TASKLET(iucv_tasklet, iucv_tasklet_fn,0);
+static DECLARE_TASKLET_OLD(iucv_tasklet, iucv_tasklet_fn);
/*
* Queue of interrupt buffers for delivery via a work queue
@@ -192,7 +193,7 @@ static char iucv_error_pathid[16] = "INVALID PATHID";
static LIST_HEAD(iucv_handler_list);
/*
- * iucv_path_table: an array of iucv_path structures.
+ * iucv_path_table: array of pointers to iucv_path structures.
*/
static struct iucv_path **iucv_path_table;
static unsigned long iucv_max_pathid;
@@ -246,7 +247,7 @@ struct iucv_cmd_dpl {
u8 iprmmsg[8];
u32 ipsrccls;
u32 ipmsgtag;
- u32 ipbfadr2;
+ dma32_t ipbfadr2;
u32 ipbfln2f;
u32 res;
} __attribute__ ((packed,aligned(8)));
@@ -262,11 +263,11 @@ struct iucv_cmd_db {
u8 iprcode;
u32 ipmsgid;
u32 iptrgcls;
- u32 ipbfadr1;
+ dma32_t ipbfadr1;
u32 ipbfln1f;
u32 ipsrccls;
u32 ipmsgtag;
- u32 ipbfadr2;
+ dma32_t ipbfadr2;
u32 ipbfln2f;
u32 res;
} __attribute__ ((packed,aligned(8)));
@@ -312,8 +313,8 @@ static union iucv_param *iucv_param[NR_CPUS];
static union iucv_param *iucv_param_irq[NR_CPUS];
/**
- * iucv_call_b2f0
- * @code: identifier of IUCV call to CP.
+ * __iucv_call_b2f0
+ * @command: identifier of IUCV call to CP.
* @parm: pointer to a struct iucv_parm block
*
* Calls CP to execute IUCV commands.
@@ -322,19 +323,20 @@ static union iucv_param *iucv_param_irq[NR_CPUS];
*/
static inline int __iucv_call_b2f0(int command, union iucv_param *parm)
{
- register unsigned long reg0 asm ("0");
- register unsigned long reg1 asm ("1");
- int ccode;
+ unsigned long reg1 = virt_to_phys(parm);
+ int cc;
- reg0 = command;
- reg1 = (unsigned long)parm;
asm volatile(
- " .long 0xb2f01000\n"
- " ipm %0\n"
- " srl %0,28\n"
- : "=d" (ccode), "=m" (*parm), "+d" (reg0), "+a" (reg1)
- : "m" (*parm) : "cc");
- return ccode;
+ " lgr 0,%[reg0]\n"
+ " lgr 1,%[reg1]\n"
+ " .long 0xb2f01000\n"
+ " ipm %[cc]\n"
+ " srl %[cc],28\n"
+ : [cc] "=&d" (cc), "+m" (*parm)
+ : [reg0] "d" ((unsigned long)command),
+ [reg1] "d" (reg1)
+ : "cc", "0", "1");
+ return cc;
}
static inline int iucv_call_b2f0(int command, union iucv_param *parm)
@@ -345,7 +347,7 @@ static inline int iucv_call_b2f0(int command, union iucv_param *parm)
return ccode == 1 ? parm->ctrl.iprcode : ccode;
}
-/**
+/*
* iucv_query_maxconn
*
* Determines the maximum number of connections that may be established.
@@ -355,19 +357,21 @@ static inline int iucv_call_b2f0(int command, union iucv_param *parm)
*/
static int __iucv_query_maxconn(void *param, unsigned long *max_pathid)
{
- register unsigned long reg0 asm ("0");
- register unsigned long reg1 asm ("1");
- int ccode;
+ unsigned long reg1 = virt_to_phys(param);
+ int cc;
- reg0 = IUCV_QUERY;
- reg1 = (unsigned long) param;
asm volatile (
+ " lghi 0,%[cmd]\n"
+ " lgr 1,%[reg1]\n"
" .long 0xb2f01000\n"
- " ipm %0\n"
- " srl %0,28\n"
- : "=d" (ccode), "+d" (reg0), "+d" (reg1) : : "cc");
+ " ipm %[cc]\n"
+ " srl %[cc],28\n"
+ " lgr %[reg1],1\n"
+ : [cc] "=&d" (cc), [reg1] "+&d" (reg1)
+ : [cmd] "K" (IUCV_QUERY)
+ : "cc", "0", "1");
*max_pathid = reg1;
- return ccode;
+ return cc;
}
static int iucv_query_maxconn(void)
@@ -448,31 +452,6 @@ static void iucv_block_cpu(void *data)
}
/**
- * iucv_block_cpu_almost
- * @data: unused
- *
- * Allow connection-severed interrupts only on this cpu.
- */
-static void iucv_block_cpu_almost(void *data)
-{
- int cpu = smp_processor_id();
- union iucv_param *parm;
-
- /* Allow iucv control interrupts only */
- parm = iucv_param_irq[cpu];
- memset(parm, 0, sizeof(union iucv_param));
- parm->set_mask.ipmask = 0x08;
- iucv_call_b2f0(IUCV_SETMASK, parm);
- /* Allow iucv-severed interrupt only */
- memset(parm, 0, sizeof(union iucv_param));
- parm->set_mask.ipmask = 0x20;
- iucv_call_b2f0(IUCV_SETCONTROLMASK, parm);
-
- /* Clear indication that iucv interrupts are allowed for this cpu. */
- cpumask_clear_cpu(cpu, &iucv_irq_cpumask);
-}
-
-/**
* iucv_declare_cpu
* @data: unused
*
@@ -490,7 +469,7 @@ static void iucv_declare_cpu(void *data)
/* Declare interrupt buffer. */
parm = iucv_param_irq[cpu];
memset(parm, 0, sizeof(union iucv_param));
- parm->db.ipbfadr1 = virt_to_phys(iucv_irq_data[cpu]);
+ parm->db.ipbfadr1 = virt_to_dma32(iucv_irq_data[cpu]);
rc = iucv_call_b2f0(IUCV_DECLARE_BUFFER, parm);
if (rc) {
char *err = "Unknown";
@@ -552,8 +531,8 @@ static void iucv_retrieve_cpu(void *data)
cpumask_clear_cpu(cpu, &iucv_buffer_cpumask);
}
-/**
- * iucv_setmask_smp
+/*
+ * iucv_setmask_mp
*
* Allow iucv interrupts on all cpus.
*/
@@ -561,24 +540,24 @@ static void iucv_setmask_mp(void)
{
int cpu;
- get_online_cpus();
+ cpus_read_lock();
for_each_online_cpu(cpu)
/* Enable all cpus with a declared buffer. */
if (cpumask_test_cpu(cpu, &iucv_buffer_cpumask) &&
!cpumask_test_cpu(cpu, &iucv_irq_cpumask))
smp_call_function_single(cpu, iucv_allow_cpu,
NULL, 1);
- put_online_cpus();
+ cpus_read_unlock();
}
-/**
+/*
* iucv_setmask_up
*
* Allow iucv interrupts on a single cpu.
*/
static void iucv_setmask_up(void)
{
- cpumask_t cpumask;
+ static cpumask_t cpumask;
int cpu;
/* Disable all cpu but the first in cpu_irq_cpumask. */
@@ -588,7 +567,7 @@ static void iucv_setmask_up(void)
smp_call_function_single(cpu, iucv_block_cpu, NULL, 1);
}
-/**
+/*
* iucv_enable
*
* This function makes iucv ready for use. It allocates the pathid
@@ -601,9 +580,9 @@ static int iucv_enable(void)
size_t alloc_size;
int cpu, rc;
- get_online_cpus();
+ cpus_read_lock();
rc = -ENOMEM;
- alloc_size = iucv_max_pathid * sizeof(struct iucv_path);
+ alloc_size = iucv_max_pathid * sizeof(*iucv_path_table);
iucv_path_table = kzalloc(alloc_size, GFP_KERNEL);
if (!iucv_path_table)
goto out;
@@ -614,16 +593,16 @@ static int iucv_enable(void)
if (cpumask_empty(&iucv_buffer_cpumask))
/* No cpu could declare an iucv buffer. */
goto out;
- put_online_cpus();
+ cpus_read_unlock();
return 0;
out:
kfree(iucv_path_table);
iucv_path_table = NULL;
- put_online_cpus();
+ cpus_read_unlock();
return rc;
}
-/**
+/*
* iucv_disable
*
* This function shuts down iucv. It disables iucv interrupts, retrieves
@@ -632,11 +611,11 @@ out:
*/
static void iucv_disable(void)
{
- get_online_cpus();
+ cpus_read_lock();
on_each_cpu(iucv_retrieve_cpu, NULL, 1);
kfree(iucv_path_table);
iucv_path_table = NULL;
- put_online_cpus();
+ cpus_read_unlock();
}
static int iucv_cpu_dead(unsigned int cpu)
@@ -686,23 +665,33 @@ static int iucv_cpu_online(unsigned int cpu)
static int iucv_cpu_down_prep(unsigned int cpu)
{
- cpumask_t cpumask;
+ cpumask_var_t cpumask;
+ int ret = 0;
if (!iucv_path_table)
return 0;
- cpumask_copy(&cpumask, &iucv_buffer_cpumask);
- cpumask_clear_cpu(cpu, &cpumask);
- if (cpumask_empty(&cpumask))
+ if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_copy(cpumask, &iucv_buffer_cpumask);
+ cpumask_clear_cpu(cpu, cpumask);
+ if (cpumask_empty(cpumask)) {
/* Can't offline last IUCV enabled cpu. */
- return -EINVAL;
+ ret = -EINVAL;
+ goto __free_cpumask;
+ }
iucv_retrieve_cpu(NULL);
if (!cpumask_empty(&iucv_irq_cpumask))
- return 0;
+ goto __free_cpumask;
+
smp_call_function_single(cpumask_first(&iucv_buffer_cpumask),
iucv_allow_cpu, NULL, 1);
- return 0;
+
+__free_cpumask:
+ free_cpumask_var(cpumask);
+ return ret;
}
/**
@@ -845,7 +834,7 @@ static int iucv_reboot_event(struct notifier_block *this,
if (cpumask_empty(&iucv_irq_cpumask))
return NOTIFY_DONE;
- get_online_cpus();
+ cpus_read_lock();
on_each_cpu_mask(&iucv_irq_cpumask, iucv_block_cpu, NULL, 1);
preempt_disable();
for (i = 0; i < iucv_max_pathid; i++) {
@@ -853,7 +842,7 @@ static int iucv_reboot_event(struct notifier_block *this,
iucv_sever_pathid(i, NULL);
}
preempt_enable();
- put_online_cpus();
+ cpus_read_unlock();
iucv_disable();
return NOTIFY_DONE;
}
@@ -1139,8 +1128,7 @@ static int iucv_message_receive_iprmdata(struct iucv_path *path,
size = (size < 8) ? size : 8;
for (array = buffer; size > 0; array++) {
copy = min_t(size_t, size, array->length);
- memcpy((u8 *)(addr_t) array->address,
- rmmsg, copy);
+ memcpy(dma32_to_virt(array->address), rmmsg, copy);
rmmsg += copy;
size -= copy;
}
@@ -1177,13 +1165,12 @@ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg,
if (msg->flags & IUCV_IPRMDATA)
return iucv_message_receive_iprmdata(path, msg, flags,
buffer, size, residual);
- if (cpumask_empty(&iucv_buffer_cpumask)) {
- rc = -EIO;
- goto out;
- }
+ if (cpumask_empty(&iucv_buffer_cpumask))
+ return -EIO;
+
parm = iucv_param[smp_processor_id()];
memset(parm, 0, sizeof(union iucv_param));
- parm->db.ipbfadr1 = (u32)(addr_t) buffer;
+ parm->db.ipbfadr1 = virt_to_dma32(buffer);
parm->db.ipbfln1f = (u32) size;
parm->db.ipmsgid = msg->id;
parm->db.ippathid = path->pathid;
@@ -1196,7 +1183,6 @@ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg,
if (residual)
*residual = parm->db.ipbfln1f;
}
-out:
return rc;
}
EXPORT_SYMBOL(__iucv_message_receive);
@@ -1302,7 +1288,7 @@ int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg,
parm->dpl.iptrgcls = msg->class;
memcpy(parm->dpl.iprmmsg, reply, min_t(size_t, size, 8));
} else {
- parm->db.ipbfadr1 = (u32)(addr_t) reply;
+ parm->db.ipbfadr1 = virt_to_dma32(reply);
parm->db.ipbfln1f = (u32) size;
parm->db.ippathid = path->pathid;
parm->db.ipflags1 = flags;
@@ -1354,7 +1340,7 @@ int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg,
parm->dpl.ipmsgtag = msg->tag;
memcpy(parm->dpl.iprmmsg, buffer, 8);
} else {
- parm->db.ipbfadr1 = (u32)(addr_t) buffer;
+ parm->db.ipbfadr1 = virt_to_dma32(buffer);
parm->db.ipbfln1f = (u32) size;
parm->db.ippathid = path->pathid;
parm->db.ipflags1 = flags | IUCV_IPNORPY;
@@ -1408,8 +1394,9 @@ EXPORT_SYMBOL(iucv_message_send);
* @srccls: source class of message
* @buffer: address of send buffer or address of struct iucv_array
* @size: length of send buffer
- * @ansbuf: address of answer buffer or address of struct iucv_array
+ * @answer: address of answer buffer or address of struct iucv_array
* @asize: size of reply buffer
+ * @residual: ignored
*
* This function transmits data to another application. Data to be
* transmitted is in a buffer. The receiver of the send is expected to
@@ -1438,7 +1425,7 @@ int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg,
parm->dpl.iptrgcls = msg->class;
parm->dpl.ipsrccls = srccls;
parm->dpl.ipmsgtag = msg->tag;
- parm->dpl.ipbfadr2 = (u32)(addr_t) answer;
+ parm->dpl.ipbfadr2 = virt_to_dma32(answer);
parm->dpl.ipbfln2f = (u32) asize;
memcpy(parm->dpl.iprmmsg, buffer, 8);
} else {
@@ -1447,9 +1434,9 @@ int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg,
parm->db.iptrgcls = msg->class;
parm->db.ipsrccls = srccls;
parm->db.ipmsgtag = msg->tag;
- parm->db.ipbfadr1 = (u32)(addr_t) buffer;
+ parm->db.ipbfadr1 = virt_to_dma32(buffer);
parm->db.ipbfln1f = (u32) size;
- parm->db.ipbfadr2 = (u32)(addr_t) answer;
+ parm->db.ipbfadr2 = virt_to_dma32(answer);
parm->db.ipbfln2f = (u32) asize;
}
rc = iucv_call_b2f0(IUCV_SEND, parm);
@@ -1461,13 +1448,6 @@ out:
}
EXPORT_SYMBOL(iucv_message_send2way);
-/**
- * iucv_path_pending
- * @data: Pointer to external interrupt buffer
- *
- * Process connection pending work item. Called from tasklet while holding
- * iucv_table_lock.
- */
struct iucv_path_pending {
u16 ippathid;
u8 ipflags1;
@@ -1481,6 +1461,13 @@ struct iucv_path_pending {
u8 res4[3];
} __packed;
+/**
+ * iucv_path_pending
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process connection pending work item. Called from tasklet while holding
+ * iucv_table_lock.
+ */
static void iucv_path_pending(struct iucv_irq_data *data)
{
struct iucv_path_pending *ipp = (void *) data;
@@ -1522,13 +1509,6 @@ out_sever:
iucv_sever_pathid(ipp->ippathid, error);
}
-/**
- * iucv_path_complete
- * @data: Pointer to external interrupt buffer
- *
- * Process connection complete work item. Called from tasklet while holding
- * iucv_table_lock.
- */
struct iucv_path_complete {
u16 ippathid;
u8 ipflags1;
@@ -1542,6 +1522,13 @@ struct iucv_path_complete {
u8 res4[3];
} __packed;
+/**
+ * iucv_path_complete
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process connection complete work item. Called from tasklet while holding
+ * iucv_table_lock.
+ */
static void iucv_path_complete(struct iucv_irq_data *data)
{
struct iucv_path_complete *ipc = (void *) data;
@@ -1553,13 +1540,6 @@ static void iucv_path_complete(struct iucv_irq_data *data)
path->handler->path_complete(path, ipc->ipuser);
}
-/**
- * iucv_path_severed
- * @data: Pointer to external interrupt buffer
- *
- * Process connection severed work item. Called from tasklet while holding
- * iucv_table_lock.
- */
struct iucv_path_severed {
u16 ippathid;
u8 res1;
@@ -1572,6 +1552,13 @@ struct iucv_path_severed {
u8 res5[3];
} __packed;
+/**
+ * iucv_path_severed
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process connection severed work item. Called from tasklet while holding
+ * iucv_table_lock.
+ */
static void iucv_path_severed(struct iucv_irq_data *data)
{
struct iucv_path_severed *ips = (void *) data;
@@ -1589,13 +1576,6 @@ static void iucv_path_severed(struct iucv_irq_data *data)
}
}
-/**
- * iucv_path_quiesced
- * @data: Pointer to external interrupt buffer
- *
- * Process connection quiesced work item. Called from tasklet while holding
- * iucv_table_lock.
- */
struct iucv_path_quiesced {
u16 ippathid;
u8 res1;
@@ -1608,6 +1588,13 @@ struct iucv_path_quiesced {
u8 res5[3];
} __packed;
+/**
+ * iucv_path_quiesced
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process connection quiesced work item. Called from tasklet while holding
+ * iucv_table_lock.
+ */
static void iucv_path_quiesced(struct iucv_irq_data *data)
{
struct iucv_path_quiesced *ipq = (void *) data;
@@ -1617,13 +1604,6 @@ static void iucv_path_quiesced(struct iucv_irq_data *data)
path->handler->path_quiesced(path, ipq->ipuser);
}
-/**
- * iucv_path_resumed
- * @data: Pointer to external interrupt buffer
- *
- * Process connection resumed work item. Called from tasklet while holding
- * iucv_table_lock.
- */
struct iucv_path_resumed {
u16 ippathid;
u8 res1;
@@ -1636,6 +1616,13 @@ struct iucv_path_resumed {
u8 res5[3];
} __packed;
+/**
+ * iucv_path_resumed
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process connection resumed work item. Called from tasklet while holding
+ * iucv_table_lock.
+ */
static void iucv_path_resumed(struct iucv_irq_data *data)
{
struct iucv_path_resumed *ipr = (void *) data;
@@ -1645,13 +1632,6 @@ static void iucv_path_resumed(struct iucv_irq_data *data)
path->handler->path_resumed(path, ipr->ipuser);
}
-/**
- * iucv_message_complete
- * @data: Pointer to external interrupt buffer
- *
- * Process message complete work item. Called from tasklet while holding
- * iucv_table_lock.
- */
struct iucv_message_complete {
u16 ippathid;
u8 ipflags1;
@@ -1667,6 +1647,13 @@ struct iucv_message_complete {
u8 res2[3];
} __packed;
+/**
+ * iucv_message_complete
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process message complete work item. Called from tasklet while holding
+ * iucv_table_lock.
+ */
static void iucv_message_complete(struct iucv_irq_data *data)
{
struct iucv_message_complete *imc = (void *) data;
@@ -1685,33 +1672,35 @@ static void iucv_message_complete(struct iucv_irq_data *data)
}
}
-/**
- * iucv_message_pending
- * @data: Pointer to external interrupt buffer
- *
- * Process message pending work item. Called from tasklet while holding
- * iucv_table_lock.
- */
struct iucv_message_pending {
u16 ippathid;
u8 ipflags1;
u8 iptype;
u32 ipmsgid;
u32 iptrgcls;
- union {
- u32 iprmmsg1_u32;
- u8 iprmmsg1[4];
- } ln1msg1;
- union {
- u32 ipbfln1f;
- u8 iprmmsg2[4];
- } ln1msg2;
+ struct {
+ union {
+ u32 iprmmsg1_u32;
+ u8 iprmmsg1[4];
+ } ln1msg1;
+ union {
+ u32 ipbfln1f;
+ u8 iprmmsg2[4];
+ } ln1msg2;
+ } rmmsg;
u32 res1[3];
u32 ipbfln2f;
u8 ippollfg;
u8 res2[3];
} __packed;
+/**
+ * iucv_message_pending
+ * @data: Pointer to external interrupt buffer
+ *
+ * Process message pending work item. Called from tasklet while holding
+ * iucv_table_lock.
+ */
static void iucv_message_pending(struct iucv_irq_data *data)
{
struct iucv_message_pending *imp = (void *) data;
@@ -1723,16 +1712,16 @@ static void iucv_message_pending(struct iucv_irq_data *data)
msg.id = imp->ipmsgid;
msg.class = imp->iptrgcls;
if (imp->ipflags1 & IUCV_IPRMDATA) {
- memcpy(msg.rmmsg, imp->ln1msg1.iprmmsg1, 8);
+ memcpy(msg.rmmsg, &imp->rmmsg, 8);
msg.length = 8;
} else
- msg.length = imp->ln1msg2.ipbfln1f;
+ msg.length = imp->rmmsg.ln1msg2.ipbfln1f;
msg.reply_size = imp->ipbfln2f;
path->handler->message_pending(path, &msg);
}
}
-/**
+/*
* iucv_tasklet_fn:
*
* This tasklet loops over the queue of irq buffers created by
@@ -1776,7 +1765,7 @@ static void iucv_tasklet_fn(unsigned long ignored)
spin_unlock(&iucv_table_lock);
}
-/**
+/*
* iucv_work_fn:
*
* This work function loops over the queue of path pending irq blocks
@@ -1807,9 +1796,8 @@ static void iucv_work_fn(struct work_struct *work)
spin_unlock_bh(&iucv_table_lock);
}
-/**
+/*
* iucv_external_interrupt
- * @code: irq code
*
* Handles external interrupts coming in from CP.
* Places the interrupt buffer on a queue and schedules iucv_tasklet_fn().
@@ -1847,146 +1835,6 @@ static void iucv_external_interrupt(struct ext_code ext_code,
spin_unlock(&iucv_queue_lock);
}
-static int iucv_pm_prepare(struct device *dev)
-{
- int rc = 0;
-
-#ifdef CONFIG_PM_DEBUG
- printk(KERN_INFO "iucv_pm_prepare\n");
-#endif
- if (dev->driver && dev->driver->pm && dev->driver->pm->prepare)
- rc = dev->driver->pm->prepare(dev);
- return rc;
-}
-
-static void iucv_pm_complete(struct device *dev)
-{
-#ifdef CONFIG_PM_DEBUG
- printk(KERN_INFO "iucv_pm_complete\n");
-#endif
- if (dev->driver && dev->driver->pm && dev->driver->pm->complete)
- dev->driver->pm->complete(dev);
-}
-
-/**
- * iucv_path_table_empty() - determine if iucv path table is empty
- *
- * Returns 0 if there are still iucv pathes defined
- * 1 if there are no iucv pathes defined
- */
-static int iucv_path_table_empty(void)
-{
- int i;
-
- for (i = 0; i < iucv_max_pathid; i++) {
- if (iucv_path_table[i])
- return 0;
- }
- return 1;
-}
-
-/**
- * iucv_pm_freeze() - Freeze PM callback
- * @dev: iucv-based device
- *
- * disable iucv interrupts
- * invoke callback function of the iucv-based driver
- * shut down iucv, if no iucv-pathes are established anymore
- */
-static int iucv_pm_freeze(struct device *dev)
-{
- int cpu;
- struct iucv_irq_list *p, *n;
- int rc = 0;
-
-#ifdef CONFIG_PM_DEBUG
- printk(KERN_WARNING "iucv_pm_freeze\n");
-#endif
- if (iucv_pm_state != IUCV_PM_FREEZING) {
- for_each_cpu(cpu, &iucv_irq_cpumask)
- smp_call_function_single(cpu, iucv_block_cpu_almost,
- NULL, 1);
- cancel_work_sync(&iucv_work);
- list_for_each_entry_safe(p, n, &iucv_work_queue, list) {
- list_del_init(&p->list);
- iucv_sever_pathid(p->data.ippathid,
- iucv_error_no_listener);
- kfree(p);
- }
- }
- iucv_pm_state = IUCV_PM_FREEZING;
- if (dev->driver && dev->driver->pm && dev->driver->pm->freeze)
- rc = dev->driver->pm->freeze(dev);
- if (iucv_path_table_empty())
- iucv_disable();
- return rc;
-}
-
-/**
- * iucv_pm_thaw() - Thaw PM callback
- * @dev: iucv-based device
- *
- * make iucv ready for use again: allocate path table, declare interrupt buffers
- * and enable iucv interrupts
- * invoke callback function of the iucv-based driver
- */
-static int iucv_pm_thaw(struct device *dev)
-{
- int rc = 0;
-
-#ifdef CONFIG_PM_DEBUG
- printk(KERN_WARNING "iucv_pm_thaw\n");
-#endif
- iucv_pm_state = IUCV_PM_THAWING;
- if (!iucv_path_table) {
- rc = iucv_enable();
- if (rc)
- goto out;
- }
- if (cpumask_empty(&iucv_irq_cpumask)) {
- if (iucv_nonsmp_handler)
- /* enable interrupts on one cpu */
- iucv_allow_cpu(NULL);
- else
- /* enable interrupts on all cpus */
- iucv_setmask_mp();
- }
- if (dev->driver && dev->driver->pm && dev->driver->pm->thaw)
- rc = dev->driver->pm->thaw(dev);
-out:
- return rc;
-}
-
-/**
- * iucv_pm_restore() - Restore PM callback
- * @dev: iucv-based device
- *
- * make iucv ready for use again: allocate path table, declare interrupt buffers
- * and enable iucv interrupts
- * invoke callback function of the iucv-based driver
- */
-static int iucv_pm_restore(struct device *dev)
-{
- int rc = 0;
-
-#ifdef CONFIG_PM_DEBUG
- printk(KERN_WARNING "iucv_pm_restore %p\n", iucv_path_table);
-#endif
- if ((iucv_pm_state != IUCV_PM_RESTORING) && iucv_path_table)
- pr_warn("Suspending Linux did not completely close all IUCV connections\n");
- iucv_pm_state = IUCV_PM_RESTORING;
- if (cpumask_empty(&iucv_irq_cpumask)) {
- rc = iucv_query_maxconn();
- rc = iucv_enable();
- if (rc)
- goto out;
- }
- if (dev->driver && dev->driver->pm && dev->driver->pm->restore)
- rc = dev->driver->pm->restore(dev);
-out:
- return rc;
-}
-
struct iucv_interface iucv_if = {
.message_receive = iucv_message_receive,
.__message_receive = __iucv_message_receive,
@@ -2018,11 +1866,11 @@ static int __init iucv_init(void)
{
int rc;
- if (!MACHINE_IS_VM) {
+ if (!machine_is_vm()) {
rc = -EPROTONOSUPPORT;
goto out;
}
- ctl_set_bit(0, 1);
+ system_ctl_set_bit(0, CR0_IUCV_BIT);
rc = iucv_query_maxconn();
if (rc)
goto out_ctl;
@@ -2070,7 +1918,7 @@ out_dev:
out_int:
unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
out_ctl:
- ctl_clear_bit(0, 1);
+ system_ctl_clear_bit(0, 1);
out:
return rc;
}
@@ -2102,6 +1950,6 @@ static void __exit iucv_exit(void)
subsys_initcall(iucv_init);
module_exit(iucv_exit);
-MODULE_AUTHOR("(C) 2001 IBM Corp. by Fritz Elfert (felfert@millenux.com)");
+MODULE_AUTHOR("(C) 2001 IBM Corp. by Fritz Elfert <felfert@millenux.com>");
MODULE_DESCRIPTION("Linux for S/390 IUCV lowlevel driver");
MODULE_LICENSE("GPL");
diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig
index 9ca83f2ade6f..66660a06cacf 100644
--- a/net/kcm/Kconfig
+++ b/net/kcm/Kconfig
@@ -1,10 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
config AF_KCM
tristate "KCM sockets"
depends on INET
select BPF_SYSCALL
select STREAM_PARSER
- ---help---
+ help
KCM (Kernel Connection Multiplexor) sockets provide a method
- for multiplexing messages of a message based application
- protocol over kernel connectons (e.g. TCP connections).
+ for multiplexing messages of a message-based application
+ protocol over kernel connections (e.g. TCP connections).
diff --git a/net/kcm/Makefile b/net/kcm/Makefile
index 71256133e677..6c4569221da8 100644
--- a/net/kcm/Makefile
+++ b/net/kcm/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
obj-$(CONFIG_AF_KCM) += kcm.o
kcm-y := kcmsock.o kcmproc.o
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
index 370da2f80e3c..25c1007f1098 100644
--- a/net/kcm/kcmproc.c
+++ b/net/kcm/kcmproc.c
@@ -261,7 +261,7 @@ static int kcm_stats_seq_show(struct seq_file *seq, void *v)
aggregate_strp_stats(&knet->aggregate_strp_stats,
&strp_stats);
- list_for_each_entry_rcu(mux, &knet->mux_list, kcm_mux_list) {
+ list_for_each_entry(mux, &knet->mux_list, kcm_mux_list) {
spin_lock_bh(&mux->lock);
aggregate_mux_stats(&mux->stats, &mux_stats);
aggregate_psock_stats(&mux->aggregate_psock_stats,
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index 571d824e4e24..5dd7e0509a48 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -1,17 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel Connection Multiplexor
*
* Copyright (c) 2016 Tom Herbert <tom@herbertland.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2
- * as published by the Free Software Foundation.
*/
#include <linux/bpf.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/file.h>
+#include <linux/filter.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
@@ -21,6 +19,7 @@
#include <linux/rculist.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
+#include <linux/splice.h>
#include <linux/uaccess.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
@@ -30,6 +29,7 @@
#include <net/netns/generic.h>
#include <net/sock.h>
#include <uapi/linux/kcm.h>
+#include <trace/events/sock.h>
unsigned int kcm_net_id;
@@ -50,7 +50,7 @@ static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
static void report_csk_error(struct sock *csk, int err)
{
csk->sk_err = EPIPE;
- csk->sk_error_report(csk);
+ sk_error_report(csk);
}
static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
@@ -164,7 +164,8 @@ static void kcm_rcv_ready(struct kcm_sock *kcm)
/* Buffer limit is okay now, add to ready list */
list_add_tail(&kcm->wait_rx_list,
&kcm->mux->kcm_rx_waiters);
- kcm->rx_wait = true;
+ /* paired with lockless reads in kcm_rfree() */
+ WRITE_ONCE(kcm->rx_wait, true);
}
static void kcm_rfree(struct sk_buff *skb)
@@ -180,7 +181,7 @@ static void kcm_rfree(struct sk_buff *skb)
/* For reading rx_wait and rx_psock without holding lock */
smp_mb__after_atomic();
- if (!kcm->rx_wait && !kcm->rx_psock &&
+ if (!READ_ONCE(kcm->rx_wait) && !READ_ONCE(kcm->rx_psock) &&
sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) {
spin_lock_bh(&mux->rx_lock);
kcm_rcv_ready(kcm);
@@ -223,7 +224,7 @@ static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head)
struct sk_buff *skb;
struct kcm_sock *kcm;
- while ((skb = __skb_dequeue(head))) {
+ while ((skb = skb_dequeue(head))) {
/* Reset destructor to avoid calling kcm_rcv_ready */
skb->destructor = sock_rfree;
skb_orphan(skb);
@@ -239,7 +240,8 @@ try_again:
if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
/* Should mean socket buffer full */
list_del(&kcm->wait_rx_list);
- kcm->rx_wait = false;
+ /* paired with lockless reads in kcm_rfree() */
+ WRITE_ONCE(kcm->rx_wait, false);
/* Commit rx_wait to read in kcm_free */
smp_wmb();
@@ -282,10 +284,12 @@ static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
kcm = list_first_entry(&mux->kcm_rx_waiters,
struct kcm_sock, wait_rx_list);
list_del(&kcm->wait_rx_list);
- kcm->rx_wait = false;
+ /* paired with lockless reads in kcm_rfree() */
+ WRITE_ONCE(kcm->rx_wait, false);
psock->rx_kcm = kcm;
- kcm->rx_psock = psock;
+ /* paired with lockless reads in kcm_rfree() */
+ WRITE_ONCE(kcm->rx_psock, psock);
spin_unlock_bh(&mux->rx_lock);
@@ -312,7 +316,8 @@ static void unreserve_rx_kcm(struct kcm_psock *psock,
spin_lock_bh(&mux->rx_lock);
psock->rx_kcm = NULL;
- kcm->rx_psock = NULL;
+ /* paired with lockless reads in kcm_rfree() */
+ WRITE_ONCE(kcm->rx_psock, NULL);
/* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
* kcm_rfree
@@ -346,6 +351,8 @@ static void psock_data_ready(struct sock *sk)
{
struct kcm_psock *psock;
+ trace_sk_data_ready(sk);
+
read_lock_bh(&sk->sk_callback_lock);
psock = (struct kcm_psock *)sk->sk_user_data;
@@ -381,8 +388,10 @@ static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb)
{
struct kcm_psock *psock = container_of(strp, struct kcm_psock, strp);
struct bpf_prog *prog = psock->bpf_prog;
+ int res;
- return (*prog->bpf_func)(skb, prog->insnsi);
+ res = bpf_prog_run_pin_on_cpu(prog, skb);
+ return res;
}
static int kcm_read_sock_done(struct strparser *strp, int err)
@@ -421,7 +430,7 @@ static void psock_write_space(struct sock *sk)
/* Check if the socket is reserved so someone is waiting for sending. */
kcm = psock->tx_kcm;
- if (kcm && !unlikely(kcm->tx_stopped))
+ if (kcm)
queue_work(kcm_wq, &kcm->tx_work);
spin_unlock_bh(&mux->lock);
@@ -573,12 +582,10 @@ static void kcm_report_tx_retry(struct kcm_sock *kcm)
*/
static int kcm_write_msgs(struct kcm_sock *kcm)
{
+ unsigned int total_sent = 0;
struct sock *sk = &kcm->sk;
struct kcm_psock *psock;
- struct sk_buff *skb, *head;
- struct kcm_tx_msg *txm;
- unsigned short fragidx, frag_offset;
- unsigned int sent, total_sent = 0;
+ struct sk_buff *head;
int ret = 0;
kcm->tx_wait_more = false;
@@ -592,123 +599,108 @@ static int kcm_write_msgs(struct kcm_sock *kcm)
if (skb_queue_empty(&sk->sk_write_queue))
return 0;
- kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0;
-
- } else if (skb_queue_empty(&sk->sk_write_queue)) {
- return 0;
+ kcm_tx_msg(skb_peek(&sk->sk_write_queue))->started_tx = false;
}
- head = skb_peek(&sk->sk_write_queue);
- txm = kcm_tx_msg(head);
+retry:
+ while ((head = skb_peek(&sk->sk_write_queue))) {
+ struct msghdr msg = {
+ .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
+ };
+ struct kcm_tx_msg *txm = kcm_tx_msg(head);
+ struct sk_buff *skb;
+ unsigned int msize;
+ int i;
- if (txm->sent) {
- /* Send of first skbuff in queue already in progress */
- if (WARN_ON(!psock)) {
- ret = -EINVAL;
- goto out;
+ if (!txm->started_tx) {
+ psock = reserve_psock(kcm);
+ if (!psock)
+ goto out;
+ skb = head;
+ txm->frag_offset = 0;
+ txm->sent = 0;
+ txm->started_tx = true;
+ } else {
+ if (WARN_ON(!psock)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ skb = txm->frag_skb;
}
- sent = txm->sent;
- frag_offset = txm->frag_offset;
- fragidx = txm->fragidx;
- skb = txm->frag_skb;
-
- goto do_frag;
- }
-try_again:
- psock = reserve_psock(kcm);
- if (!psock)
- goto out;
-
- do {
- skb = head;
- txm = kcm_tx_msg(head);
- sent = 0;
-
-do_frag_list:
- if (WARN_ON(!skb_shinfo(skb)->nr_frags)) {
+ if (WARN_ON(!skb_shinfo(skb)->nr_frags) ||
+ WARN_ON_ONCE(!skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
ret = -EINVAL;
goto out;
}
- for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags;
- fragidx++) {
- skb_frag_t *frag;
+ msize = 0;
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+ msize += skb_frag_size(&skb_shinfo(skb)->frags[i]);
- frag_offset = 0;
-do_frag:
- frag = &skb_shinfo(skb)->frags[fragidx];
- if (WARN_ON(!frag->size)) {
- ret = -EINVAL;
- goto out;
- }
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE,
+ (const struct bio_vec *)skb_shinfo(skb)->frags,
+ skb_shinfo(skb)->nr_frags, msize);
+ iov_iter_advance(&msg.msg_iter, txm->frag_offset);
- ret = kernel_sendpage(psock->sk->sk_socket,
- frag->page.p,
- frag->page_offset + frag_offset,
- frag->size - frag_offset,
- MSG_DONTWAIT);
+ do {
+ ret = sock_sendmsg(psock->sk->sk_socket, &msg);
if (ret <= 0) {
if (ret == -EAGAIN) {
/* Save state to try again when there's
* write space on the socket
*/
- txm->sent = sent;
- txm->frag_offset = frag_offset;
- txm->fragidx = fragidx;
txm->frag_skb = skb;
-
ret = 0;
goto out;
}
/* Hard failure in sending message, abort this
* psock since it has lost framing
- * synchonization and retry sending the
+ * synchronization and retry sending the
* message from the beginning.
*/
kcm_abort_tx_psock(psock, ret ? -ret : EPIPE,
true);
unreserve_psock(kcm);
+ psock = NULL;
- txm->sent = 0;
+ txm->started_tx = false;
kcm_report_tx_retry(kcm);
ret = 0;
-
- goto try_again;
+ goto retry;
}
- sent += ret;
- frag_offset += ret;
+ txm->sent += ret;
+ txm->frag_offset += ret;
KCM_STATS_ADD(psock->stats.tx_bytes, ret);
- if (frag_offset < frag->size) {
- /* Not finished with this frag */
- goto do_frag;
- }
- }
+ } while (msg.msg_iter.count > 0);
if (skb == head) {
if (skb_has_frag_list(skb)) {
- skb = skb_shinfo(skb)->frag_list;
- goto do_frag_list;
+ txm->frag_skb = skb_shinfo(skb)->frag_list;
+ txm->frag_offset = 0;
+ continue;
}
} else if (skb->next) {
- skb = skb->next;
- goto do_frag_list;
+ txm->frag_skb = skb->next;
+ txm->frag_offset = 0;
+ continue;
}
/* Successfully sent the whole packet, account for it. */
+ sk->sk_wmem_queued -= txm->sent;
+ total_sent += txm->sent;
skb_dequeue(&sk->sk_write_queue);
kfree_skb(head);
- sk->sk_wmem_queued -= sent;
- total_sent += sent;
KCM_STATS_INCR(psock->stats.tx_msgs);
- } while ((head = skb_peek(&sk->sk_write_queue)));
+ }
out:
if (!head) {
/* Done with all queued messages. */
WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
- unreserve_psock(kcm);
+ if (psock)
+ unreserve_psock(kcm);
}
/* Check if write space is available */
@@ -753,149 +745,6 @@ static void kcm_push(struct kcm_sock *kcm)
kcm_write_msgs(kcm);
}
-static ssize_t kcm_sendpage(struct socket *sock, struct page *page,
- int offset, size_t size, int flags)
-
-{
- struct sock *sk = sock->sk;
- struct kcm_sock *kcm = kcm_sk(sk);
- struct sk_buff *skb = NULL, *head = NULL;
- long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
- bool eor;
- int err = 0;
- int i;
-
- if (flags & MSG_SENDPAGE_NOTLAST)
- flags |= MSG_MORE;
-
- /* No MSG_EOR from splice, only look at MSG_MORE */
- eor = !(flags & MSG_MORE);
-
- lock_sock(sk);
-
- sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
-
- err = -EPIPE;
- if (sk->sk_err)
- goto out_error;
-
- if (kcm->seq_skb) {
- /* Previously opened message */
- head = kcm->seq_skb;
- skb = kcm_tx_msg(head)->last_skb;
- i = skb_shinfo(skb)->nr_frags;
-
- if (skb_can_coalesce(skb, i, page, offset)) {
- skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
- skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
- goto coalesced;
- }
-
- if (i >= MAX_SKB_FRAGS) {
- struct sk_buff *tskb;
-
- tskb = alloc_skb(0, sk->sk_allocation);
- while (!tskb) {
- kcm_push(kcm);
- err = sk_stream_wait_memory(sk, &timeo);
- if (err)
- goto out_error;
- }
-
- if (head == skb)
- skb_shinfo(head)->frag_list = tskb;
- else
- skb->next = tskb;
-
- skb = tskb;
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- i = 0;
- }
- } else {
- /* Call the sk_stream functions to manage the sndbuf mem. */
- if (!sk_stream_memory_free(sk)) {
- kcm_push(kcm);
- set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
- err = sk_stream_wait_memory(sk, &timeo);
- if (err)
- goto out_error;
- }
-
- head = alloc_skb(0, sk->sk_allocation);
- while (!head) {
- kcm_push(kcm);
- err = sk_stream_wait_memory(sk, &timeo);
- if (err)
- goto out_error;
- }
-
- skb = head;
- i = 0;
- }
-
- get_page(page);
- skb_fill_page_desc(skb, i, page, offset, size);
- skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
-
-coalesced:
- skb->len += size;
- skb->data_len += size;
- skb->truesize += size;
- sk->sk_wmem_queued += size;
- sk_mem_charge(sk, size);
-
- if (head != skb) {
- head->len += size;
- head->data_len += size;
- head->truesize += size;
- }
-
- if (eor) {
- bool not_busy = skb_queue_empty(&sk->sk_write_queue);
-
- /* Message complete, queue it on send buffer */
- __skb_queue_tail(&sk->sk_write_queue, head);
- kcm->seq_skb = NULL;
- KCM_STATS_INCR(kcm->stats.tx_msgs);
-
- if (flags & MSG_BATCH) {
- kcm->tx_wait_more = true;
- } else if (kcm->tx_wait_more || not_busy) {
- err = kcm_write_msgs(kcm);
- if (err < 0) {
- /* We got a hard error in write_msgs but have
- * already queued this message. Report an error
- * in the socket, but don't affect return value
- * from sendmsg
- */
- pr_warn("KCM: Hard failure on kcm_write_msgs\n");
- report_csk_error(&kcm->sk, -err);
- }
- }
- } else {
- /* Message not complete, save state */
- kcm->seq_skb = head;
- kcm_tx_msg(head)->last_skb = skb;
- }
-
- KCM_STATS_ADD(kcm->stats.tx_bytes, size);
-
- release_sock(sk);
- return size;
-
-out_error:
- kcm_push(kcm);
-
- err = sk_stream_error(sk, flags, err);
-
- /* make sure we wake any epoll edge trigger waiter */
- if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
- sk->sk_write_space(sk);
-
- release_sock(sk);
- return err;
-}
-
static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
struct sock *sk = sock->sk;
@@ -907,6 +756,7 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
!(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR);
int err = -EPIPE;
+ mutex_lock(&kcm->tx_mutex);
lock_sock(sk);
/* Per tcp_sendmsg this should be in poll */
@@ -981,29 +831,51 @@ start:
merge = false;
}
- copy = min_t(int, msg_data_left(msg),
- pfrag->size - pfrag->offset);
+ if (msg->msg_flags & MSG_SPLICE_PAGES) {
+ copy = msg_data_left(msg);
+ if (!sk_wmem_schedule(sk, copy))
+ goto wait_for_memory;
- if (!sk_wmem_schedule(sk, copy))
- goto wait_for_memory;
+ err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
+ if (err < 0) {
+ if (err == -EMSGSIZE)
+ goto wait_for_memory;
+ goto out_error;
+ }
- err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
- pfrag->page,
- pfrag->offset,
- copy);
- if (err)
- goto out_error;
+ copy = err;
+ skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
+ sk_wmem_queued_add(sk, copy);
+ sk_mem_charge(sk, copy);
- /* Update the skb. */
- if (merge) {
- skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ if (head != skb)
+ head->truesize += copy;
} else {
- skb_fill_page_desc(skb, i, pfrag->page,
- pfrag->offset, copy);
- get_page(pfrag->page);
+ copy = min_t(int, msg_data_left(msg),
+ pfrag->size - pfrag->offset);
+ if (!sk_wmem_schedule(sk, copy))
+ goto wait_for_memory;
+
+ err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
+ pfrag->page,
+ pfrag->offset,
+ copy);
+ if (err)
+ goto out_error;
+
+ /* Update the skb. */
+ if (merge) {
+ skb_frag_size_add(
+ &skb_shinfo(skb)->frags[i - 1], copy);
+ } else {
+ skb_fill_page_desc(skb, i, pfrag->page,
+ pfrag->offset, copy);
+ get_page(pfrag->page);
+ }
+
+ pfrag->offset += copy;
}
- pfrag->offset += copy;
copied += copy;
if (head != skb) {
head->len += copy;
@@ -1055,20 +927,24 @@ partial_message:
KCM_STATS_ADD(kcm->stats.tx_bytes, copied);
release_sock(sk);
+ mutex_unlock(&kcm->tx_mutex);
return copied;
out_error:
kcm_push(kcm);
- if (copied && sock->type == SOCK_SEQPACKET) {
+ if (sock->type == SOCK_SEQPACKET) {
/* Wrote some bytes before encountering an
* error, return partial success.
*/
- goto partial_message;
- }
-
- if (head != kcm->seq_skb)
+ if (copied)
+ goto partial_message;
+ if (head != kcm->seq_skb)
+ kfree_skb(head);
+ } else {
kfree_skb(head);
+ kcm->seq_skb = NULL;
+ }
err = sk_stream_error(sk, msg->msg_flags, err);
@@ -1077,38 +953,21 @@ out_error:
sk->sk_write_space(sk);
release_sock(sk);
+ mutex_unlock(&kcm->tx_mutex);
return err;
}
-static struct sk_buff *kcm_wait_data(struct sock *sk, int flags,
- long timeo, int *err)
+static void kcm_splice_eof(struct socket *sock)
{
- struct sk_buff *skb;
-
- while (!(skb = skb_peek(&sk->sk_receive_queue))) {
- if (sk->sk_err) {
- *err = sock_error(sk);
- return NULL;
- }
-
- if (sock_flag(sk, SOCK_DONE))
- return NULL;
-
- if ((flags & MSG_DONTWAIT) || !timeo) {
- *err = -EAGAIN;
- return NULL;
- }
-
- sk_wait_data(sk, &timeo, NULL);
+ struct sock *sk = sock->sk;
+ struct kcm_sock *kcm = kcm_sk(sk);
- /* Handle signals */
- if (signal_pending(current)) {
- *err = sock_intr_errno(timeo);
- return NULL;
- }
- }
+ if (skb_queue_empty_lockless(&sk->sk_write_queue))
+ return;
- return skb;
+ lock_sock(sk);
+ kcm_write_msgs(kcm);
+ release_sock(sk);
}
static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
@@ -1117,16 +976,11 @@ static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
struct sock *sk = sock->sk;
struct kcm_sock *kcm = kcm_sk(sk);
int err = 0;
- long timeo;
struct strp_msg *stm;
int copied = 0;
struct sk_buff *skb;
- timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
-
- lock_sock(sk);
-
- skb = kcm_wait_data(sk, flags, timeo, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto out;
@@ -1157,14 +1011,11 @@ msg_finished:
/* Finished with message */
msg->msg_flags |= MSG_EOR;
KCM_STATS_INCR(kcm->stats.rx_msgs);
- skb_unlink(skb, &sk->sk_receive_queue);
- kfree_skb(skb);
}
}
out:
- release_sock(sk);
-
+ skb_free_datagram(sk, skb);
return copied ? : err;
}
@@ -1174,19 +1025,19 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
{
struct sock *sk = sock->sk;
struct kcm_sock *kcm = kcm_sk(sk);
- long timeo;
struct strp_msg *stm;
int err = 0;
ssize_t copied;
struct sk_buff *skb;
- /* Only support splice for SOCKSEQPACKET */
-
- timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+ if (sock->file->f_flags & O_NONBLOCK || flags & SPLICE_F_NONBLOCK)
+ flags = MSG_DONTWAIT;
+ else
+ flags = 0;
- lock_sock(sk);
+ /* Only support splice for SOCKSEQPACKET */
- skb = kcm_wait_data(sk, flags, timeo, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto err_out;
@@ -1214,13 +1065,11 @@ static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
* finish reading the message.
*/
- release_sock(sk);
-
+ skb_free_datagram(sk, skb);
return copied;
err_out:
- release_sock(sk);
-
+ skb_free_datagram(sk, skb);
return err;
}
@@ -1240,7 +1089,8 @@ static void kcm_recv_disable(struct kcm_sock *kcm)
if (!kcm->rx_psock) {
if (kcm->rx_wait) {
list_del(&kcm->wait_rx_list);
- kcm->rx_wait = false;
+ /* paired with lockless reads in kcm_rfree() */
+ WRITE_ONCE(kcm->rx_wait, false);
}
requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
@@ -1266,7 +1116,7 @@ static void kcm_recv_enable(struct kcm_sock *kcm)
}
static int kcm_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct kcm_sock *kcm = kcm_sk(sock->sk);
int val, valbool;
@@ -1278,8 +1128,8 @@ static int kcm_setsockopt(struct socket *sock, int level, int optname,
if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(val, (int __user *)optval))
- return -EINVAL;
+ if (copy_from_sockptr(&val, optval, sizeof(int)))
+ return -EFAULT;
valbool = val ? 1 : 0;
@@ -1311,10 +1161,11 @@ static int kcm_getsockopt(struct socket *sock, int level, int optname,
if (get_user(len, optlen))
return -EFAULT;
- len = min_t(unsigned int, len, sizeof(int));
if (len < 0)
return -EINVAL;
+ len = min_t(unsigned int, len, sizeof(int));
+
switch (optname) {
case KCM_RECV_DISABLE:
val = kcm->rx_disabled;
@@ -1361,6 +1212,7 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
spin_unlock_bh(&mux->lock);
INIT_WORK(&kcm->tx_work, kcm_tx_work);
+ mutex_init(&kcm->tx_mutex);
spin_lock_bh(&mux->rx_lock);
kcm_rcv_ready(kcm);
@@ -1412,26 +1264,25 @@ static int kcm_attach(struct socket *sock, struct socket *csock,
psock->sk = csk;
psock->bpf_prog = prog;
- err = strp_init(&psock->strp, csk, &cb);
- if (err) {
- kmem_cache_free(kcm_psockp, psock);
- goto out;
- }
-
write_lock_bh(&csk->sk_callback_lock);
- /* Check if sk_user_data is aready by KCM or someone else.
+ /* Check if sk_user_data is already used by KCM or someone else.
* Must be done under lock to prevent race conditions.
*/
if (csk->sk_user_data) {
write_unlock_bh(&csk->sk_callback_lock);
- strp_stop(&psock->strp);
- strp_done(&psock->strp);
kmem_cache_free(kcm_psockp, psock);
err = -EALREADY;
goto out;
}
+ err = strp_init(&psock->strp, csk, &cb);
+ if (err) {
+ write_unlock_bh(&csk->sk_callback_lock);
+ kmem_cache_free(kcm_psockp, psock);
+ goto out;
+ }
+
psock->save_data_ready = csk->sk_data_ready;
psock->save_write_space = csk->sk_write_space;
psock->save_state_change = csk->sk_state_change;
@@ -1497,7 +1348,7 @@ static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
return 0;
out:
- fput(csock->file);
+ sockfd_put(csock);
return err;
}
@@ -1645,7 +1496,7 @@ static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
spin_unlock_bh(&mux->lock);
out:
- fput(csock->file);
+ sockfd_put(csock);
return err;
}
@@ -1709,24 +1560,16 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
}
case SIOCKCMCLONE: {
struct kcm_clone info;
- struct file *file;
- info.fd = get_unused_fd_flags(0);
- if (unlikely(info.fd < 0))
- return info.fd;
+ FD_PREPARE(fdf, 0, kcm_clone(sock));
+ if (fdf.err)
+ return fdf.err;
- file = kcm_clone(sock);
- if (IS_ERR(file)) {
- put_unused_fd(info.fd);
- return PTR_ERR(file);
- }
- if (copy_to_user((void __user *)arg, &info,
- sizeof(info))) {
- put_unused_fd(info.fd);
- fput(file);
+ info.fd = fd_prepare_fd(fdf);
+ if (copy_to_user((void __user *)arg, &info, sizeof(info)))
return -EFAULT;
- }
- fd_install(info.fd, file);
+
+ fd_publish(fdf);
err = 0;
break;
}
@@ -1738,14 +1581,6 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
return err;
}
-static void free_mux(struct rcu_head *rcu)
-{
- struct kcm_mux *mux = container_of(rcu,
- struct kcm_mux, rcu);
-
- kmem_cache_free(kcm_muxp, mux);
-}
-
static void release_mux(struct kcm_mux *mux)
{
struct kcm_net *knet = mux->knet;
@@ -1773,7 +1608,7 @@ static void release_mux(struct kcm_mux *mux)
knet->count--;
mutex_unlock(&knet->mutex);
- call_rcu(&mux->rcu, free_mux);
+ kfree_rcu(mux, rcu);
}
static void kcm_done(struct kcm_sock *kcm)
@@ -1794,7 +1629,8 @@ static void kcm_done(struct kcm_sock *kcm)
if (kcm->rx_wait) {
list_del(&kcm->wait_rx_list);
- kcm->rx_wait = false;
+ /* paired with lockless reads in kcm_rfree() */
+ WRITE_ONCE(kcm->rx_wait, false);
}
/* Move any pending receive messages to other kcm sockets */
requeue_rx_msgs(mux, &sk->sk_receive_queue);
@@ -1839,22 +1675,16 @@ static int kcm_release(struct socket *sock)
kcm = kcm_sk(sk);
mux = kcm->mux;
+ lock_sock(sk);
sock_orphan(sk);
kfree_skb(kcm->seq_skb);
- lock_sock(sk);
/* Purge queue under lock to avoid race condition with tx_work trying
* to act when queue is nonempty. If tx_work runs after this point
* it will just return.
*/
__skb_queue_purge(&sk->sk_write_queue);
- /* Set tx_stopped. This is checked when psock is bound to a kcm and we
- * get a writespace callback. This prevents further work being queued
- * from the callback (unbinding the psock occurs after canceling work.
- */
- kcm->tx_stopped = 1;
-
release_sock(sk);
spin_lock_bh(&mux->lock);
@@ -1870,7 +1700,7 @@ static int kcm_release(struct socket *sock)
/* Cancel work. After this point there should be no outside references
* to the kcm socket.
*/
- cancel_work_sync(&kcm->tx_work);
+ disable_work_sync(&kcm->tx_work);
lock_sock(sk);
psock = kcm->tx_psock;
@@ -1912,7 +1742,7 @@ static const struct proto_ops kcm_dgram_ops = {
.sendmsg = kcm_sendmsg,
.recvmsg = kcm_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = kcm_sendpage,
+ .splice_eof = kcm_splice_eof,
};
static const struct proto_ops kcm_seqpacket_ops = {
@@ -1933,7 +1763,7 @@ static const struct proto_ops kcm_seqpacket_ops = {
.sendmsg = kcm_sendmsg,
.recvmsg = kcm_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = kcm_sendpage,
+ .splice_eof = kcm_splice_eof,
.splice_read = kcm_splice_read,
};
@@ -2021,6 +1851,8 @@ static __net_exit void kcm_exit_net(struct net *net)
* that all multiplexors and psocks have been destroyed.
*/
WARN_ON(!list_empty(&knet->mux_list));
+
+ mutex_destroy(&knet->mutex);
}
static struct pernet_operations kcm_net_ops = {
@@ -2034,15 +1866,11 @@ static int __init kcm_init(void)
{
int err = -ENOMEM;
- kcm_muxp = kmem_cache_create("kcm_mux_cache",
- sizeof(struct kcm_mux), 0,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ kcm_muxp = KMEM_CACHE(kcm_mux, SLAB_HWCACHE_ALIGN);
if (!kcm_muxp)
goto fail;
- kcm_psockp = kmem_cache_create("kcm_psock_cache",
- sizeof(struct kcm_psock), 0,
- SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ kcm_psockp = KMEM_CACHE(kcm_psock, SLAB_HWCACHE_ALIGN);
if (!kcm_psockp)
goto fail;
@@ -2054,14 +1882,14 @@ static int __init kcm_init(void)
if (err)
goto fail;
- err = sock_register(&kcm_family_ops);
- if (err)
- goto sock_register_fail;
-
err = register_pernet_device(&kcm_net_ops);
if (err)
goto net_ops_fail;
+ err = sock_register(&kcm_family_ops);
+ if (err)
+ goto sock_register_fail;
+
err = kcm_proc_init();
if (err)
goto proc_init_fail;
@@ -2069,12 +1897,12 @@ static int __init kcm_init(void)
return 0;
proc_init_fail:
- unregister_pernet_device(&kcm_net_ops);
-
-net_ops_fail:
sock_unregister(PF_KCM);
sock_register_fail:
+ unregister_pernet_device(&kcm_net_ops);
+
+net_ops_fail:
proto_unregister(&kcm_proto);
fail:
@@ -2090,8 +1918,8 @@ fail:
static void __exit kcm_exit(void)
{
kcm_proc_exit();
- unregister_pernet_device(&kcm_net_ops);
sock_unregister(PF_KCM);
+ unregister_pernet_device(&kcm_net_ops);
proto_unregister(&kcm_proto);
destroy_workqueue(kcm_wq);
@@ -2103,4 +1931,5 @@ module_init(kcm_init);
module_exit(kcm_exit);
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("KCM (Kernel Connection Multiplexor) sockets");
MODULE_ALIAS_NETPROTO(PF_KCM);
diff --git a/net/key/Makefile b/net/key/Makefile
index 857608042475..ed779c22fbbb 100644
--- a/net/key/Makefile
+++ b/net/key/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the key AF.
#
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 9d61266526e7..571200433aa9 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1,11 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/key/af_key.c An implementation of PF_KEYv2 sockets.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* Authors: Maxim Giryaev <gem@asplinux.ru>
* David S. Miller <davem@redhat.com>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
@@ -145,7 +141,6 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol,
struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
struct sock *sk;
struct pfkey_sock *pfk;
- int err;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
@@ -154,10 +149,9 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol,
if (protocol != PF_KEY_V2)
return -EPROTONOSUPPORT;
- err = -ENOMEM;
sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto, kern);
if (sk == NULL)
- goto out;
+ return -ENOMEM;
pfk = pfkey_sk(sk);
mutex_init(&pfk->dump_lock);
@@ -173,8 +167,6 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol,
pfkey_insert(sk);
return 0;
-out:
- return err;
}
static int pfkey_release(struct socket *sock)
@@ -196,30 +188,22 @@ static int pfkey_release(struct socket *sock)
return 0;
}
-static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2,
- gfp_t allocation, struct sock *sk)
+static int pfkey_broadcast_one(struct sk_buff *skb, gfp_t allocation,
+ struct sock *sk)
{
int err = -ENOBUFS;
- sock_hold(sk);
- if (*skb2 == NULL) {
- if (refcount_read(&skb->users) != 1) {
- *skb2 = skb_clone(skb, allocation);
- } else {
- *skb2 = skb;
- refcount_inc(&skb->users);
- }
- }
- if (*skb2 != NULL) {
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) {
- skb_set_owner_r(*skb2, sk);
- skb_queue_tail(&sk->sk_receive_queue, *skb2);
- sk->sk_data_ready(sk);
- *skb2 = NULL;
- err = 0;
- }
+ if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+ return err;
+
+ skb = skb_clone(skb, allocation);
+
+ if (skb) {
+ skb_set_owner_r(skb, sk);
+ skb_queue_tail(&sk->sk_receive_queue, skb);
+ sk->sk_data_ready(sk);
+ err = 0;
}
- sock_put(sk);
return err;
}
@@ -234,7 +218,6 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
{
struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
struct sock *sk;
- struct sk_buff *skb2 = NULL;
int err = -ESRCH;
/* XXX Do we need something like netlink_overrun? I think
@@ -253,7 +236,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
* socket.
*/
if (pfk->promisc)
- pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk);
+ pfkey_broadcast_one(skb, GFP_ATOMIC, sk);
/* the exact target will be processed later */
if (sk == one_sk)
@@ -268,7 +251,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
continue;
}
- err2 = pfkey_broadcast_one(skb, &skb2, GFP_ATOMIC, sk);
+ err2 = pfkey_broadcast_one(skb, GFP_ATOMIC, sk);
/* Error is cleared after successful sending to at least one
* registered KM */
@@ -278,9 +261,8 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
rcu_read_unlock();
if (one_sk != NULL)
- err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk);
+ err = pfkey_broadcast_one(skb, allocation, one_sk);
- kfree_skb(skb2);
kfree_skb(skb);
return err;
}
@@ -942,8 +924,7 @@ static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x,
pfkey_sockaddr_fill(&x->props.saddr, 0,
(struct sockaddr *) (addr + 1),
x->props.family);
- if (!addr->sadb_address_prefixlen)
- BUG();
+ BUG_ON(!addr->sadb_address_prefixlen);
/* dst address */
addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size);
@@ -958,8 +939,7 @@ static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x,
pfkey_sockaddr_fill(&x->id.daddr, 0,
(struct sockaddr *) (addr + 1),
x->props.family);
- if (!addr->sadb_address_prefixlen)
- BUG();
+ BUG_ON(!addr->sadb_address_prefixlen);
if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr,
x->props.family)) {
@@ -1281,7 +1261,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
const struct sadb_x_nat_t_type* n_type;
struct xfrm_encap_tmpl *natt;
- x->encap = kmalloc(sizeof(*x->encap), GFP_KERNEL);
+ x->encap = kzalloc(sizeof(*x->encap), GFP_KERNEL);
if (!x->encap) {
err = -ENOMEM;
goto out;
@@ -1301,7 +1281,6 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net,
ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1];
natt->encap_dport = n_port->sadb_x_nat_t_port_port;
}
- memset(&natt->encap_oa, 0, sizeof(natt->encap_oa));
}
err = xfrm_init_state(x);
@@ -1375,7 +1354,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_
}
if (hdr->sadb_msg_seq) {
- x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq);
+ x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX);
if (x && !xfrm_addr_equal(&x->id.daddr, xdaddr, family)) {
xfrm_state_put(x);
x = NULL;
@@ -1383,7 +1362,8 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_
}
if (!x)
- x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, proto, xdaddr, xsaddr, 1, family);
+ x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, UINT_MAX,
+ proto, xdaddr, xsaddr, 1, family);
if (x == NULL)
return -ENOENT;
@@ -1397,13 +1377,13 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_
max_spi = range->sadb_spirange_max;
}
- err = verify_spi_info(x->id.proto, min_spi, max_spi);
+ err = verify_spi_info(x->id.proto, min_spi, max_spi, NULL);
if (err) {
xfrm_state_put(x);
return err;
}
- err = xfrm_alloc_spi(x, min_spi, max_spi);
+ err = xfrm_alloc_spi(x, min_spi, max_spi, NULL);
resp_skb = err ? ERR_PTR(err) : pfkey_xfrm_state2msg(x);
if (IS_ERR(resp_skb)) {
@@ -1438,7 +1418,7 @@ static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, const struct sadb
if (hdr->sadb_msg_seq == 0 || hdr->sadb_msg_errno == 0)
return 0;
- x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq);
+ x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX);
if (x == NULL)
return 0;
@@ -1717,9 +1697,12 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sad
pfk->registered |= (1<<hdr->sadb_msg_satype);
}
+ mutex_lock(&pfkey_mutex);
xfrm_probe_algs();
- supp_skb = compose_sadb_supported(hdr, GFP_KERNEL);
+ supp_skb = compose_sadb_supported(hdr, GFP_KERNEL | __GFP_ZERO);
+ mutex_unlock(&pfkey_mutex);
+
if (!supp_skb) {
if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC)
pfk->registered &= ~(1<<hdr->sadb_msg_satype);
@@ -1865,6 +1848,13 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms
if (ext_hdrs[SADB_X_EXT_FILTER - 1]) {
struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1];
+ if ((xfilter->sadb_x_filter_splen >
+ (sizeof(xfrm_address_t) << 3)) ||
+ (xfilter->sadb_x_filter_dplen >
+ (sizeof(xfrm_address_t) << 3))) {
+ mutex_unlock(&pfk->dump_lock);
+ return -EINVAL;
+ }
filter = kmalloc(sizeof(*filter), GFP_KERNEL);
if (filter == NULL) {
mutex_unlock(&pfk->dump_lock);
@@ -1950,7 +1940,8 @@ static u32 gen_reqid(struct net *net)
}
static int
-parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq)
+parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_policy *pol,
+ struct sadb_x_ipsecrequest *rq)
{
struct net *net = xp_net(xp);
struct xfrm_tmpl *t = xp->xfrm_vec + xp->xfrm_nr;
@@ -1961,14 +1952,19 @@ parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq)
if (rq->sadb_x_ipsecrequest_mode == 0)
return -EINVAL;
+ if (!xfrm_id_proto_valid(rq->sadb_x_ipsecrequest_proto))
+ return -EINVAL;
- t->id.proto = rq->sadb_x_ipsecrequest_proto; /* XXX check proto */
+ t->id.proto = rq->sadb_x_ipsecrequest_proto;
if ((mode = pfkey_mode_to_xfrm(rq->sadb_x_ipsecrequest_mode)) < 0)
return -EINVAL;
t->mode = mode;
- if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE)
+ if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE) {
+ if ((mode == XFRM_MODE_TUNNEL || mode == XFRM_MODE_BEET) &&
+ pol->sadb_x_policy_dir == IPSEC_DIR_OUTBOUND)
+ return -EINVAL;
t->optional = 1;
- else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) {
+ } else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) {
t->reqid = rq->sadb_x_ipsecrequest_reqid;
if (t->reqid > IPSEC_MANUAL_REQID_MAX)
t->reqid = 0;
@@ -2010,7 +2006,7 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
rq->sadb_x_ipsecrequest_len < sizeof(*rq))
return -EINVAL;
- if ((err = parse_ipsecrequest(xp, rq)) < 0)
+ if ((err = parse_ipsecrequest(xp, pol, rq)) < 0)
return err;
len -= rq->sadb_x_ipsecrequest_len;
rq = (void*)((u8*)rq + rq->sadb_x_ipsecrequest_len);
@@ -2020,7 +2016,7 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol)
static inline int pfkey_xfrm_policy2sec_ctx_size(const struct xfrm_policy *xp)
{
- struct xfrm_sec_ctx *xfrm_ctx = xp->security;
+ struct xfrm_sec_ctx *xfrm_ctx = xp->security;
if (xfrm_ctx) {
int len = sizeof(struct sadb_x_sec_ctx);
@@ -2414,7 +2410,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa
return err;
}
- xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN,
+ xp = xfrm_policy_bysel_ctx(net, &dummy_mark, 0, XFRM_POLICY_TYPE_MAIN,
pol->sadb_x_policy_dir - 1, &sel, pol_ctx,
1, &err);
security_xfrm_policy_free(pol_ctx);
@@ -2450,8 +2446,10 @@ static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struc
goto out;
}
err = pfkey_xfrm_policy2msg(out_skb, xp, dir);
- if (err < 0)
+ if (err < 0) {
+ kfree_skb(out_skb);
goto out;
+ }
out_hdr = (struct sadb_msg *) out_skb->data;
out_hdr->sadb_msg_version = hdr->sadb_msg_version;
@@ -2632,7 +2630,7 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb,
}
return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i,
- kma ? &k : NULL, net, NULL);
+ kma ? &k : NULL, net, NULL, 0, NULL, NULL);
out:
return err;
@@ -2663,7 +2661,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_
return -EINVAL;
delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2);
- xp = xfrm_policy_byid(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN,
+ xp = xfrm_policy_byid(net, &dummy_mark, 0, XFRM_POLICY_TYPE_MAIN,
dir, pol->sadb_x_policy_id, delete, &err);
if (xp == NULL)
return -ENOENT;
@@ -2702,8 +2700,10 @@ static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr)
return PTR_ERR(out_skb);
err = pfkey_xfrm_policy2msg(out_skb, xp, dir);
- if (err < 0)
+ if (err < 0) {
+ kfree_skb(out_skb);
return err;
+ }
out_hdr = (struct sadb_msg *) out_skb->data;
out_hdr->sadb_msg_version = pfk->dump.msg_version;
@@ -2833,6 +2833,10 @@ static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb
void *ext_hdrs[SADB_EXT_MAX];
int err;
+ /* Non-zero return value of pfkey_broadcast() does not always signal
+ * an error and even on an actual error we may still want to process
+ * the message, so deliberately ignore the return value.
+ */
pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL,
BROADCAST_PROMISC_ONLY, NULL, sock_net(sk));
@@ -2905,7 +2909,7 @@ static int count_ah_combs(const struct xfrm_tmpl *t)
break;
if (!aalg->pfkey_supported)
continue;
- if (aalg_tmpl_set(t, aalg) && aalg->available)
+ if (aalg_tmpl_set(t, aalg))
sz += sizeof(struct sadb_comb);
}
return sz + sizeof(struct sadb_prop);
@@ -2923,7 +2927,7 @@ static int count_esp_combs(const struct xfrm_tmpl *t)
if (!ealg->pfkey_supported)
continue;
- if (!(ealg_tmpl_set(t, ealg) && ealg->available))
+ if (!(ealg_tmpl_set(t, ealg)))
continue;
for (k = 1; ; k++) {
@@ -2934,16 +2938,17 @@ static int count_esp_combs(const struct xfrm_tmpl *t)
if (!aalg->pfkey_supported)
continue;
- if (aalg_tmpl_set(t, aalg) && aalg->available)
+ if (aalg_tmpl_set(t, aalg))
sz += sizeof(struct sadb_comb);
}
}
return sz + sizeof(struct sadb_prop);
}
-static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
+static int dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
{
struct sadb_prop *p;
+ int sz = 0;
int i;
p = skb_put(skb, sizeof(struct sadb_prop));
@@ -2971,13 +2976,17 @@ static void dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
c->sadb_comb_soft_addtime = 20*60*60;
c->sadb_comb_hard_usetime = 8*60*60;
c->sadb_comb_soft_usetime = 7*60*60;
+ sz += sizeof(*c);
}
}
+
+ return sz + sizeof(*p);
}
-static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
+static int dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
{
struct sadb_prop *p;
+ int sz = 0;
int i, k;
p = skb_put(skb, sizeof(struct sadb_prop));
@@ -3019,8 +3028,11 @@ static void dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t)
c->sadb_comb_soft_addtime = 20*60*60;
c->sadb_comb_hard_usetime = 8*60*60;
c->sadb_comb_soft_usetime = 7*60*60;
+ sz += sizeof(*c);
}
}
+
+ return sz + sizeof(*p);
}
static int key_notify_policy_expire(struct xfrm_policy *xp, const struct km_event *c)
@@ -3150,6 +3162,7 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct
struct sadb_x_sec_ctx *sec_ctx;
struct xfrm_sec_ctx *xfrm_ctx;
int ctx_size = 0;
+ int alg_size = 0;
sockaddr_size = pfkey_sockaddr_size(x->props.family);
if (!sockaddr_size)
@@ -3161,16 +3174,16 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct
sizeof(struct sadb_x_policy);
if (x->id.proto == IPPROTO_AH)
- size += count_ah_combs(t);
+ alg_size = count_ah_combs(t);
else if (x->id.proto == IPPROTO_ESP)
- size += count_esp_combs(t);
+ alg_size = count_esp_combs(t);
if ((xfrm_ctx = x->security)) {
ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len);
size += sizeof(struct sadb_x_sec_ctx) + ctx_size;
}
- skb = alloc_skb(size + 16, GFP_ATOMIC);
+ skb = alloc_skb(size + alg_size + 16, GFP_ATOMIC);
if (skb == NULL)
return -ENOMEM;
@@ -3224,10 +3237,13 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct
pol->sadb_x_policy_priority = xp->priority;
/* Set sadb_comb's. */
+ alg_size = 0;
if (x->id.proto == IPPROTO_AH)
- dump_ah_combs(skb, t);
+ alg_size = dump_ah_combs(skb, t);
else if (x->id.proto == IPPROTO_ESP)
- dump_esp_combs(skb, t);
+ alg_size = dump_esp_combs(skb, t);
+
+ hdr->sadb_msg_len += alg_size / 8;
/* security context */
if (xfrm_ctx) {
@@ -3382,7 +3398,7 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr,
hdr->sadb_msg_len = size / sizeof(uint64_t);
hdr->sadb_msg_errno = 0;
hdr->sadb_msg_reserved = 0;
- hdr->sadb_msg_seq = x->km.seq = get_acqseq();
+ hdr->sadb_msg_seq = x->km.seq;
hdr->sadb_msg_pid = 0;
/* SA */
@@ -3703,7 +3719,7 @@ static int pfkey_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
goto out;
- skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (skb == NULL)
goto out;
@@ -3718,7 +3734,7 @@ static int pfkey_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (err)
goto out_free;
- sock_recv_ts_and_drops(msg, sk, skb);
+ sock_recv_cmsgs(msg, sk, skb);
err = (flags & MSG_TRUNC) ? skb->len : copied;
@@ -3744,10 +3760,7 @@ static const struct proto_ops pfkey_ops = {
.ioctl = sock_no_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
- .setsockopt = sock_no_setsockopt,
- .getsockopt = sock_no_getsockopt,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
/* Now the operations that really occur. */
.release = pfkey_release,
@@ -3775,7 +3788,7 @@ static int pfkey_seq_show(struct seq_file *f, void *v)
refcount_read(&s->sk_refcnt),
sk_rmem_alloc_get(s),
sk_wmem_alloc_get(s),
- from_kuid_munged(seq_user_ns(f), sock_i_uid(s)),
+ from_kuid_munged(seq_user_ns(f), sk_uid(s)),
sock_i_ino(s)
);
return 0;
@@ -3890,6 +3903,8 @@ static int __init ipsec_pfkey_init(void)
{
int err = proto_register(&key_proto, 0);
+ pr_warn_once("PFKEY is deprecated and scheduled to be removed in 2027, "
+ "please contact the netdev mailing list\n");
if (err != 0)
goto out;
@@ -3899,14 +3914,10 @@ static int __init ipsec_pfkey_init(void)
err = sock_register(&pfkey_family_ops);
if (err != 0)
goto out_unregister_pernet;
- err = xfrm_register_km(&pfkeyv2_mgr);
- if (err != 0)
- goto out_sock_unregister;
+ xfrm_register_km(&pfkeyv2_mgr);
out:
return err;
-out_sock_unregister:
- sock_unregister(PF_KEY);
out_unregister_pernet:
unregister_pernet_subsys(&pfkey_net_ops);
out_unregister_key_proto:
@@ -3916,5 +3927,6 @@ out_unregister_key_proto:
module_init(ipsec_pfkey_init);
module_exit(ipsec_pfkey_exit);
+MODULE_DESCRIPTION("PF_KEY socket helpers");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_KEY);
diff --git a/net/l2tp/Kconfig b/net/l2tp/Kconfig
index 378c73b26093..b7856748e960 100644
--- a/net/l2tp/Kconfig
+++ b/net/l2tp/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Layer Two Tunneling Protocol (L2TP)
#
@@ -7,7 +8,7 @@ menuconfig L2TP
depends on (IPV6 || IPV6=n)
depends on INET
select NET_UDP_TUNNEL
- ---help---
+ help
Layer Two Tunneling Protocol
From RFC 2661 <http://www.ietf.org/rfc/rfc2661.txt>.
diff --git a/net/l2tp/Makefile b/net/l2tp/Makefile
index 399a7e5db2f4..cf8f27071d3f 100644
--- a/net/l2tp/Makefile
+++ b/net/l2tp/Makefile
@@ -5,6 +5,8 @@
obj-$(CONFIG_L2TP) += l2tp_core.o
+CFLAGS_l2tp_core.o += -I$(src)
+
# Build l2tp as modules if L2TP is M
obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_PPPOL2TP)) += l2tp_ppp.o
obj-$(subst y,$(CONFIG_L2TP),$(CONFIG_L2TP_IP)) += l2tp_ip.o
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 82cdf9020b53..687c1366a4d0 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1,5 +1,5 @@
-/*
- * L2TP core.
+// SPDX-License-Identifier: GPL-2.0-only
+/* L2TP core.
*
* Copyright (c) 2008,2009,2010 Katalix Systems Ltd
*
@@ -12,10 +12,6 @@
* Michal Ostrowski <mostrows@speakeasy.net>
* Arnaldo Carvalho de Melo <acme@xconectiva.com.br>
* David S. Miller (davem@redhat.com)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -43,7 +39,6 @@
#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/l2tp.h>
-#include <linux/hash.h>
#include <linux/sort.h>
#include <linux/file.h>
#include <linux/nsproxy.h>
@@ -66,6 +61,9 @@
#include "l2tp_core.h"
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
#define L2TP_DRV_VERSION "V2.0"
/* L2TP header constants */
@@ -83,12 +81,16 @@
#define L2TP_SLFLAG_S 0x40000000
#define L2TP_SL_SEQ_MASK 0x00ffffff
-#define L2TP_HDR_SIZE_SEQ 10
-#define L2TP_HDR_SIZE_NOSEQ 6
+#define L2TP_HDR_SIZE_MAX 14
/* Default trace flags */
#define L2TP_DEFAULT_DEBUG_FLAGS 0
+#define L2TP_DEPTH_NESTING 2
+#if L2TP_DEPTH_NESTING == SINGLE_DEPTH_NESTING
+#error "L2TP requires its own lockdep subclass"
+#endif
+
/* Private data stored for received packets in the skb.
*/
struct l2tp_skb_cb {
@@ -98,19 +100,33 @@ struct l2tp_skb_cb {
unsigned long expires;
};
-#define L2TP_SKB_CB(skb) ((struct l2tp_skb_cb *) &skb->cb[sizeof(struct inet_skb_parm)])
+#define L2TP_SKB_CB(skb) ((struct l2tp_skb_cb *)&(skb)->cb[sizeof(struct inet_skb_parm)])
static struct workqueue_struct *l2tp_wq;
/* per-net private data for this module */
static unsigned int l2tp_net_id;
struct l2tp_net {
- struct list_head l2tp_tunnel_list;
- spinlock_t l2tp_tunnel_list_lock;
- struct hlist_head l2tp_session_hlist[L2TP_HASH_SIZE_2];
- spinlock_t l2tp_session_hlist_lock;
+ /* Lock for write access to l2tp_tunnel_idr */
+ spinlock_t l2tp_tunnel_idr_lock;
+ struct idr l2tp_tunnel_idr;
+ /* Lock for write access to l2tp_v[23]_session_idr/htable */
+ spinlock_t l2tp_session_idr_lock;
+ struct idr l2tp_v2_session_idr;
+ struct idr l2tp_v3_session_idr;
+ struct hlist_head l2tp_v3_session_htable[16];
};
+static u32 l2tp_v2_session_key(u16 tunnel_id, u16 session_id)
+{
+ return ((u32)tunnel_id) << 16 | session_id;
+}
+
+static unsigned long l2tp_v3_session_hashkey(struct sock *sk, u32 session_id)
+{
+ return ((unsigned long)sk) + session_id;
+}
+
#if IS_ENABLED(CONFIG_IPV6)
static bool l2tp_sk_is_v6(struct sock *sk)
{
@@ -119,48 +135,81 @@ static bool l2tp_sk_is_v6(struct sock *sk)
}
#endif
-static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk)
+static struct l2tp_net *l2tp_pernet(const struct net *net)
{
- return sk->sk_user_data;
+ return net_generic(net, l2tp_net_id);
}
-static inline struct l2tp_net *l2tp_pernet(const struct net *net)
+static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel)
{
- BUG_ON(!net);
+ struct sock *sk = tunnel->sock;
- return net_generic(net, l2tp_net_id);
+ trace_free_tunnel(tunnel);
+
+ if (sk) {
+ /* Disable udp encapsulation */
+ switch (tunnel->encap) {
+ case L2TP_ENCAPTYPE_UDP:
+ /* No longer an encapsulation socket. See net/ipv4/udp.c */
+ WRITE_ONCE(udp_sk(sk)->encap_type, 0);
+ udp_sk(sk)->encap_rcv = NULL;
+ udp_sk(sk)->encap_destroy = NULL;
+ break;
+ case L2TP_ENCAPTYPE_IP:
+ break;
+ }
+
+ tunnel->sock = NULL;
+ sock_put(sk);
+ }
+
+ kfree_rcu(tunnel, rcu);
}
-/* Session hash global list for L2TPv3.
- * The session_id SHOULD be random according to RFC3931, but several
- * L2TP implementations use incrementing session_ids. So we do a real
- * hash on the session_id, rather than a simple bitmask.
- */
-static inline struct hlist_head *
-l2tp_session_id_hash_2(struct l2tp_net *pn, u32 session_id)
+static void l2tp_session_free(struct l2tp_session *session)
+{
+ trace_free_session(session);
+ if (session->tunnel)
+ l2tp_tunnel_put(session->tunnel);
+ kfree_rcu(session, rcu);
+}
+
+struct l2tp_tunnel *l2tp_sk_to_tunnel(const struct sock *sk)
{
- return &pn->l2tp_session_hlist[hash_32(session_id, L2TP_HASH_BITS_2)];
+ const struct net *net = sock_net(sk);
+ unsigned long tunnel_id, tmp;
+ struct l2tp_tunnel *tunnel;
+ struct l2tp_net *pn;
+ rcu_read_lock_bh();
+ pn = l2tp_pernet(net);
+ idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) {
+ if (tunnel &&
+ tunnel->sock == sk &&
+ refcount_inc_not_zero(&tunnel->ref_count)) {
+ rcu_read_unlock_bh();
+ return tunnel;
+ }
+ }
+ rcu_read_unlock_bh();
+
+ return NULL;
}
+EXPORT_SYMBOL_GPL(l2tp_sk_to_tunnel);
-/* Session hash list.
- * The session_id SHOULD be random according to RFC2661, but several
- * L2TP implementations (Cisco and Microsoft) use incrementing
- * session_ids. So we do a real hash on the session_id, rather than a
- * simple bitmask.
- */
-static inline struct hlist_head *
-l2tp_session_id_hash(struct l2tp_tunnel *tunnel, u32 session_id)
+void l2tp_tunnel_put(struct l2tp_tunnel *tunnel)
{
- return &tunnel->session_hlist[hash_32(session_id, L2TP_HASH_BITS)];
+ if (refcount_dec_and_test(&tunnel->ref_count))
+ l2tp_tunnel_free(tunnel);
}
+EXPORT_SYMBOL_GPL(l2tp_tunnel_put);
-void l2tp_tunnel_free(struct l2tp_tunnel *tunnel)
+void l2tp_session_put(struct l2tp_session *session)
{
- sock_put(tunnel->sock);
- /* the tunnel is freed in the socket destructor */
+ if (refcount_dec_and_test(&session->ref_count))
+ l2tp_session_free(session);
}
-EXPORT_SYMBOL(l2tp_tunnel_free);
+EXPORT_SYMBOL_GPL(l2tp_session_put);
/* Lookup a tunnel. A new reference is held on the returned tunnel. */
struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id)
@@ -169,13 +218,10 @@ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id)
struct l2tp_tunnel *tunnel;
rcu_read_lock_bh();
- list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
- if (tunnel->tunnel_id == tunnel_id) {
- l2tp_tunnel_inc_refcount(tunnel);
- rcu_read_unlock_bh();
-
- return tunnel;
- }
+ tunnel = idr_find(&pn->l2tp_tunnel_idr, tunnel_id);
+ if (tunnel && refcount_inc_not_zero(&tunnel->ref_count)) {
+ rcu_read_unlock_bh();
+ return tunnel;
}
rcu_read_unlock_bh();
@@ -183,91 +229,202 @@ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id)
}
EXPORT_SYMBOL_GPL(l2tp_tunnel_get);
-struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth)
+struct l2tp_tunnel *l2tp_tunnel_get_next(const struct net *net, unsigned long *key)
{
- const struct l2tp_net *pn = l2tp_pernet(net);
- struct l2tp_tunnel *tunnel;
- int count = 0;
+ struct l2tp_net *pn = l2tp_pernet(net);
+ struct l2tp_tunnel *tunnel = NULL;
rcu_read_lock_bh();
- list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
- if (++count > nth) {
- l2tp_tunnel_inc_refcount(tunnel);
+again:
+ tunnel = idr_get_next_ul(&pn->l2tp_tunnel_idr, key);
+ if (tunnel) {
+ if (refcount_inc_not_zero(&tunnel->ref_count)) {
rcu_read_unlock_bh();
return tunnel;
}
+ (*key)++;
+ goto again;
}
rcu_read_unlock_bh();
return NULL;
}
-EXPORT_SYMBOL_GPL(l2tp_tunnel_get_nth);
+EXPORT_SYMBOL_GPL(l2tp_tunnel_get_next);
-struct l2tp_session *l2tp_tunnel_get_session(struct l2tp_tunnel *tunnel,
- u32 session_id)
+struct l2tp_session *l2tp_v3_session_get(const struct net *net, struct sock *sk, u32 session_id)
{
- struct hlist_head *session_list;
+ const struct l2tp_net *pn = l2tp_pernet(net);
struct l2tp_session *session;
- session_list = l2tp_session_id_hash(tunnel, session_id);
+ rcu_read_lock_bh();
+ session = idr_find(&pn->l2tp_v3_session_idr, session_id);
+ if (session && !hash_hashed(&session->hlist) &&
+ refcount_inc_not_zero(&session->ref_count)) {
+ rcu_read_unlock_bh();
+ return session;
+ }
- read_lock_bh(&tunnel->hlist_lock);
- hlist_for_each_entry(session, session_list, hlist)
- if (session->session_id == session_id) {
- l2tp_session_inc_refcount(session);
- read_unlock_bh(&tunnel->hlist_lock);
+ /* If we get here and session is non-NULL, the session_id
+ * collides with one in another tunnel. If sk is non-NULL,
+ * find the session matching sk.
+ */
+ if (session && sk) {
+ unsigned long key = l2tp_v3_session_hashkey(sk, session->session_id);
+
+ hash_for_each_possible_rcu(pn->l2tp_v3_session_htable, session,
+ hlist, key) {
+ /* session->tunnel may be NULL if another thread is in
+ * l2tp_session_register and has added an item to
+ * l2tp_v3_session_htable but hasn't yet added the
+ * session to its tunnel's session_list.
+ */
+ struct l2tp_tunnel *tunnel = READ_ONCE(session->tunnel);
- return session;
+ if (session->session_id == session_id &&
+ tunnel && tunnel->sock == sk &&
+ refcount_inc_not_zero(&session->ref_count)) {
+ rcu_read_unlock_bh();
+ return session;
+ }
}
- read_unlock_bh(&tunnel->hlist_lock);
+ }
+ rcu_read_unlock_bh();
return NULL;
}
-EXPORT_SYMBOL_GPL(l2tp_tunnel_get_session);
+EXPORT_SYMBOL_GPL(l2tp_v3_session_get);
-struct l2tp_session *l2tp_session_get(const struct net *net, u32 session_id)
+struct l2tp_session *l2tp_v2_session_get(const struct net *net, u16 tunnel_id, u16 session_id)
{
- struct hlist_head *session_list;
+ u32 session_key = l2tp_v2_session_key(tunnel_id, session_id);
+ const struct l2tp_net *pn = l2tp_pernet(net);
struct l2tp_session *session;
- session_list = l2tp_session_id_hash_2(l2tp_pernet(net), session_id);
+ rcu_read_lock_bh();
+ session = idr_find(&pn->l2tp_v2_session_idr, session_key);
+ if (session && refcount_inc_not_zero(&session->ref_count)) {
+ rcu_read_unlock_bh();
+ return session;
+ }
+ rcu_read_unlock_bh();
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(l2tp_v2_session_get);
+
+struct l2tp_session *l2tp_session_get(const struct net *net, struct sock *sk, int pver,
+ u32 tunnel_id, u32 session_id)
+{
+ if (pver == L2TP_HDR_VER_2)
+ return l2tp_v2_session_get(net, tunnel_id, session_id);
+ else
+ return l2tp_v3_session_get(net, sk, session_id);
+}
+EXPORT_SYMBOL_GPL(l2tp_session_get);
+
+static struct l2tp_session *l2tp_v2_session_get_next(const struct net *net,
+ u16 tid,
+ unsigned long *key)
+{
+ struct l2tp_net *pn = l2tp_pernet(net);
+ struct l2tp_session *session = NULL;
+
+ /* Start searching within the range of the tid */
+ if (*key == 0)
+ *key = l2tp_v2_session_key(tid, 0);
rcu_read_lock_bh();
- hlist_for_each_entry_rcu(session, session_list, global_hlist)
- if (session->session_id == session_id) {
- l2tp_session_inc_refcount(session);
- rcu_read_unlock_bh();
+again:
+ session = idr_get_next_ul(&pn->l2tp_v2_session_idr, key);
+ if (session) {
+ struct l2tp_tunnel *tunnel = READ_ONCE(session->tunnel);
+ /* ignore sessions with id 0 as they are internal for pppol2tp */
+ if (session->session_id == 0) {
+ (*key)++;
+ goto again;
+ }
+
+ if (tunnel->tunnel_id == tid &&
+ refcount_inc_not_zero(&session->ref_count)) {
+ rcu_read_unlock_bh();
return session;
}
+
+ (*key)++;
+ if (tunnel->tunnel_id == tid)
+ goto again;
+ }
rcu_read_unlock_bh();
return NULL;
}
-EXPORT_SYMBOL_GPL(l2tp_session_get);
-struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth)
+static struct l2tp_session *l2tp_v3_session_get_next(const struct net *net,
+ u32 tid, struct sock *sk,
+ unsigned long *key)
{
- int hash;
- struct l2tp_session *session;
- int count = 0;
-
- read_lock_bh(&tunnel->hlist_lock);
- for (hash = 0; hash < L2TP_HASH_SIZE; hash++) {
- hlist_for_each_entry(session, &tunnel->session_hlist[hash], hlist) {
- if (++count > nth) {
- l2tp_session_inc_refcount(session);
- read_unlock_bh(&tunnel->hlist_lock);
+ struct l2tp_net *pn = l2tp_pernet(net);
+ struct l2tp_session *session = NULL;
+
+ rcu_read_lock_bh();
+again:
+ session = idr_get_next_ul(&pn->l2tp_v3_session_idr, key);
+ if (session && !hash_hashed(&session->hlist)) {
+ struct l2tp_tunnel *tunnel = READ_ONCE(session->tunnel);
+
+ if (tunnel && tunnel->tunnel_id == tid &&
+ refcount_inc_not_zero(&session->ref_count)) {
+ rcu_read_unlock_bh();
+ return session;
+ }
+
+ (*key)++;
+ goto again;
+ }
+
+ /* If we get here and session is non-NULL, the IDR entry may be one
+ * where the session_id collides with one in another tunnel. Check
+ * session_htable for a match. There can only be one session of a given
+ * ID per tunnel so we can return as soon as a match is found.
+ */
+ if (session && hash_hashed(&session->hlist)) {
+ unsigned long hkey = l2tp_v3_session_hashkey(sk, session->session_id);
+ u32 sid = session->session_id;
+
+ hash_for_each_possible_rcu(pn->l2tp_v3_session_htable, session,
+ hlist, hkey) {
+ struct l2tp_tunnel *tunnel = READ_ONCE(session->tunnel);
+
+ if (session->session_id == sid &&
+ tunnel && tunnel->tunnel_id == tid &&
+ refcount_inc_not_zero(&session->ref_count)) {
+ rcu_read_unlock_bh();
return session;
}
}
+
+ /* If no match found, the colliding session ID isn't in our
+ * tunnel so try the next session ID.
+ */
+ (*key)++;
+ goto again;
}
- read_unlock_bh(&tunnel->hlist_lock);
+ rcu_read_unlock_bh();
return NULL;
}
-EXPORT_SYMBOL_GPL(l2tp_session_get_nth);
+
+struct l2tp_session *l2tp_session_get_next(const struct net *net, struct sock *sk, int pver,
+ u32 tunnel_id, unsigned long *key)
+{
+ if (pver == L2TP_HDR_VER_2)
+ return l2tp_v2_session_get_next(net, tunnel_id, key);
+ else
+ return l2tp_v3_session_get_next(net, tunnel_id, sk, key);
+}
+EXPORT_SYMBOL_GPL(l2tp_session_get_next);
/* Lookup a session by interface name.
* This is very inefficient but is only used by management interfaces.
@@ -276,79 +433,190 @@ struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net,
const char *ifname)
{
struct l2tp_net *pn = l2tp_pernet(net);
- int hash;
+ unsigned long tunnel_id, tmp;
struct l2tp_session *session;
+ struct l2tp_tunnel *tunnel;
rcu_read_lock_bh();
- for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++) {
- hlist_for_each_entry_rcu(session, &pn->l2tp_session_hlist[hash], global_hlist) {
- if (!strcmp(session->ifname, ifname)) {
- l2tp_session_inc_refcount(session);
- rcu_read_unlock_bh();
-
- return session;
+ idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) {
+ if (tunnel) {
+ list_for_each_entry_rcu(session, &tunnel->session_list, list) {
+ if (!strcmp(session->ifname, ifname)) {
+ refcount_inc(&session->ref_count);
+ rcu_read_unlock_bh();
+
+ return session;
+ }
}
}
}
-
rcu_read_unlock_bh();
return NULL;
}
EXPORT_SYMBOL_GPL(l2tp_session_get_by_ifname);
+static void l2tp_session_coll_list_add(struct l2tp_session_coll_list *clist,
+ struct l2tp_session *session)
+{
+ refcount_inc(&session->ref_count);
+ WARN_ON_ONCE(session->coll_list);
+ session->coll_list = clist;
+ spin_lock(&clist->lock);
+ list_add(&session->clist, &clist->list);
+ spin_unlock(&clist->lock);
+}
+
+static int l2tp_session_collision_add(struct l2tp_net *pn,
+ struct l2tp_session *session1,
+ struct l2tp_session *session2)
+{
+ struct l2tp_session_coll_list *clist;
+
+ lockdep_assert_held(&pn->l2tp_session_idr_lock);
+
+ if (!session2)
+ return -EEXIST;
+
+ /* If existing session is in IP-encap tunnel, refuse new session */
+ if (session2->tunnel->encap == L2TP_ENCAPTYPE_IP)
+ return -EEXIST;
+
+ clist = session2->coll_list;
+ if (!clist) {
+ /* First collision. Allocate list to manage the collided sessions
+ * and add the existing session to the list.
+ */
+ clist = kmalloc(sizeof(*clist), GFP_ATOMIC);
+ if (!clist)
+ return -ENOMEM;
+
+ spin_lock_init(&clist->lock);
+ INIT_LIST_HEAD(&clist->list);
+ refcount_set(&clist->ref_count, 1);
+ l2tp_session_coll_list_add(clist, session2);
+ }
+
+ /* If existing session isn't already in the session hlist, add it. */
+ if (!hash_hashed(&session2->hlist))
+ hash_add_rcu(pn->l2tp_v3_session_htable, &session2->hlist,
+ session2->hlist_key);
+
+ /* Add new session to the hlist and collision list */
+ hash_add_rcu(pn->l2tp_v3_session_htable, &session1->hlist,
+ session1->hlist_key);
+ refcount_inc(&clist->ref_count);
+ l2tp_session_coll_list_add(clist, session1);
+
+ return 0;
+}
+
+static void l2tp_session_collision_del(struct l2tp_net *pn,
+ struct l2tp_session *session)
+{
+ struct l2tp_session_coll_list *clist = session->coll_list;
+ unsigned long session_key = session->session_id;
+ struct l2tp_session *session2;
+
+ lockdep_assert_held(&pn->l2tp_session_idr_lock);
+
+ hash_del_rcu(&session->hlist);
+
+ if (clist) {
+ /* Remove session from its collision list. If there
+ * are other sessions with the same ID, replace this
+ * session's IDR entry with that session, otherwise
+ * remove the IDR entry. If this is the last session,
+ * the collision list data is freed.
+ */
+ spin_lock(&clist->lock);
+ list_del_init(&session->clist);
+ session2 = list_first_entry_or_null(&clist->list, struct l2tp_session, clist);
+ if (session2) {
+ void *old = idr_replace(&pn->l2tp_v3_session_idr, session2, session_key);
+
+ WARN_ON_ONCE(IS_ERR_VALUE(old));
+ } else {
+ void *removed = idr_remove(&pn->l2tp_v3_session_idr, session_key);
+
+ WARN_ON_ONCE(removed != session);
+ }
+ session->coll_list = NULL;
+ spin_unlock(&clist->lock);
+ if (refcount_dec_and_test(&clist->ref_count))
+ kfree(clist);
+ l2tp_session_put(session);
+ }
+}
+
int l2tp_session_register(struct l2tp_session *session,
struct l2tp_tunnel *tunnel)
{
- struct l2tp_session *session_walk;
- struct hlist_head *g_head;
- struct hlist_head *head;
- struct l2tp_net *pn;
+ struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
+ struct l2tp_session *other_session = NULL;
+ void *old = NULL;
+ u32 session_key;
int err;
- head = l2tp_session_id_hash(tunnel, session->session_id);
+ spin_lock_bh(&tunnel->list_lock);
+ spin_lock_bh(&pn->l2tp_session_idr_lock);
- write_lock_bh(&tunnel->hlist_lock);
if (!tunnel->acpt_newsess) {
err = -ENODEV;
- goto err_tlock;
+ goto out;
}
- hlist_for_each_entry(session_walk, head, hlist)
- if (session_walk->session_id == session->session_id) {
- err = -EEXIST;
- goto err_tlock;
- }
-
if (tunnel->version == L2TP_HDR_VER_3) {
- pn = l2tp_pernet(tunnel->l2tp_net);
- g_head = l2tp_session_id_hash_2(pn, session->session_id);
-
- spin_lock_bh(&pn->l2tp_session_hlist_lock);
+ session_key = session->session_id;
+ err = idr_alloc_u32(&pn->l2tp_v3_session_idr, NULL,
+ &session_key, session_key, GFP_ATOMIC);
+ /* IP encap expects session IDs to be globally unique, while
+ * UDP encap doesn't. This isn't per the RFC, which says that
+ * sessions are identified only by the session ID, but is to
+ * support existing userspace which depends on it.
+ */
+ if (err == -ENOSPC && tunnel->encap == L2TP_ENCAPTYPE_UDP) {
+ other_session = idr_find(&pn->l2tp_v3_session_idr,
+ session_key);
+ err = l2tp_session_collision_add(pn, session,
+ other_session);
+ }
+ } else {
+ session_key = l2tp_v2_session_key(tunnel->tunnel_id,
+ session->session_id);
+ err = idr_alloc_u32(&pn->l2tp_v2_session_idr, NULL,
+ &session_key, session_key, GFP_ATOMIC);
+ }
- hlist_for_each_entry(session_walk, g_head, global_hlist)
- if (session_walk->session_id == session->session_id) {
- err = -EEXIST;
- goto err_tlock_pnlock;
- }
+ if (err) {
+ if (err == -ENOSPC)
+ err = -EEXIST;
+ goto out;
+ }
- l2tp_tunnel_inc_refcount(tunnel);
- hlist_add_head_rcu(&session->global_hlist, g_head);
+ refcount_inc(&tunnel->ref_count);
+ WRITE_ONCE(session->tunnel, tunnel);
+ list_add_rcu(&session->list, &tunnel->session_list);
- spin_unlock_bh(&pn->l2tp_session_hlist_lock);
+ /* this makes session available to lockless getters */
+ if (tunnel->version == L2TP_HDR_VER_3) {
+ if (!other_session)
+ old = idr_replace(&pn->l2tp_v3_session_idr, session, session_key);
} else {
- l2tp_tunnel_inc_refcount(tunnel);
+ old = idr_replace(&pn->l2tp_v2_session_idr, session, session_key);
}
- hlist_add_head(&session->hlist, head);
- write_unlock_bh(&tunnel->hlist_lock);
-
- return 0;
+ /* old should be NULL, unless something removed or modified
+ * the IDR entry after our idr_alloc_u32 above (which shouldn't
+ * happen).
+ */
+ WARN_ON_ONCE(old);
+out:
+ spin_unlock_bh(&pn->l2tp_session_idr_lock);
+ spin_unlock_bh(&tunnel->list_lock);
-err_tlock_pnlock:
- spin_unlock_bh(&pn->l2tp_session_hlist_lock);
-err_tlock:
- write_unlock_bh(&tunnel->hlist_lock);
+ if (!err)
+ trace_register_session(session);
return err;
}
@@ -371,10 +639,6 @@ static void l2tp_recv_queue_skb(struct l2tp_session *session, struct sk_buff *sk
skb_queue_walk_safe(&session->reorder_q, skbp, tmp) {
if (L2TP_SKB_CB(skbp)->ns > ns) {
__skb_queue_before(&session->reorder_q, skbp, skb);
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: pkt %hu, inserted before %hu, reorder_q len=%d\n",
- session->name, ns, L2TP_SKB_CB(skbp)->ns,
- skb_queue_len(&session->reorder_q));
atomic_long_inc(&session->stats.rx_oos_packets);
goto out;
}
@@ -407,13 +671,11 @@ static void l2tp_recv_dequeue_skb(struct l2tp_session *session, struct sk_buff *
/* Bump our Nr */
session->nr++;
session->nr &= session->nr_max;
-
- l2tp_dbg(session, L2TP_MSG_SEQ, "%s: updated nr to %hu\n",
- session->name, session->nr);
+ trace_session_seqnum_update(session);
}
/* call private receive handler */
- if (session->recv_skb != NULL)
+ if (session->recv_skb)
(*session->recv_skb)(session, skb, L2TP_SKB_CB(skb)->length);
else
kfree_skb(skb);
@@ -434,37 +696,27 @@ static void l2tp_recv_dequeue(struct l2tp_session *session)
start:
spin_lock_bh(&session->reorder_q.lock);
skb_queue_walk_safe(&session->reorder_q, skb, tmp) {
- if (time_after(jiffies, L2TP_SKB_CB(skb)->expires)) {
+ struct l2tp_skb_cb *cb = L2TP_SKB_CB(skb);
+
+ /* If the packet has been pending on the queue for too long, discard it */
+ if (time_after(jiffies, cb->expires)) {
atomic_long_inc(&session->stats.rx_seq_discards);
atomic_long_inc(&session->stats.rx_errors);
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: oos pkt %u len %d discarded (too old), waiting for %u, reorder_q_len=%d\n",
- session->name, L2TP_SKB_CB(skb)->ns,
- L2TP_SKB_CB(skb)->length, session->nr,
- skb_queue_len(&session->reorder_q));
+ trace_session_pkt_expired(session, cb->ns);
session->reorder_skip = 1;
__skb_unlink(skb, &session->reorder_q);
kfree_skb(skb);
continue;
}
- if (L2TP_SKB_CB(skb)->has_seq) {
+ if (cb->has_seq) {
if (session->reorder_skip) {
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: advancing nr to next pkt: %u -> %u",
- session->name, session->nr,
- L2TP_SKB_CB(skb)->ns);
session->reorder_skip = 0;
- session->nr = L2TP_SKB_CB(skb)->ns;
+ session->nr = cb->ns;
+ trace_session_seqnum_reset(session);
}
- if (L2TP_SKB_CB(skb)->ns != session->nr) {
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: holding oos pkt %u len %d, waiting for %u, reorder_q_len=%d\n",
- session->name, L2TP_SKB_CB(skb)->ns,
- L2TP_SKB_CB(skb)->length, session->nr,
- skb_queue_len(&session->reorder_q));
+ if (cb->ns != session->nr)
goto out;
- }
}
__skb_unlink(skb, &session->reorder_q);
@@ -497,14 +749,13 @@ static int l2tp_seq_check_rx_window(struct l2tp_session *session, u32 nr)
*/
static int l2tp_recv_data_seq(struct l2tp_session *session, struct sk_buff *skb)
{
- if (!l2tp_seq_check_rx_window(session, L2TP_SKB_CB(skb)->ns)) {
+ struct l2tp_skb_cb *cb = L2TP_SKB_CB(skb);
+
+ if (!l2tp_seq_check_rx_window(session, cb->ns)) {
/* Packet sequence number is outside allowed window.
* Discard it.
*/
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: pkt %u len %d discarded, outside window, nr=%u\n",
- session->name, L2TP_SKB_CB(skb)->ns,
- L2TP_SKB_CB(skb)->length, session->nr);
+ trace_session_pkt_outside_rx_window(session, cb->ns);
goto discard;
}
@@ -521,10 +772,10 @@ static int l2tp_recv_data_seq(struct l2tp_session *session, struct sk_buff *skb)
* is seen. After nr_oos_count_max in-sequence packets, reset the
* sequence number to re-enable packet reception.
*/
- if (L2TP_SKB_CB(skb)->ns == session->nr) {
+ if (cb->ns == session->nr) {
skb_queue_tail(&session->reorder_q, skb);
} else {
- u32 nr_oos = L2TP_SKB_CB(skb)->ns;
+ u32 nr_oos = cb->ns;
u32 nr_next = (session->nr_oos + 1) & session->nr_max;
if (nr_oos == nr_next)
@@ -535,17 +786,10 @@ static int l2tp_recv_data_seq(struct l2tp_session *session, struct sk_buff *skb)
session->nr_oos = nr_oos;
if (session->nr_oos_count > session->nr_oos_count_max) {
session->reorder_skip = 1;
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: %d oos packets received. Resetting sequence numbers\n",
- session->name, session->nr_oos_count);
}
if (!session->reorder_skip) {
atomic_long_inc(&session->stats.rx_seq_discards);
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: oos pkt %u len %d discarded, waiting for %u, reorder_q_len=%d\n",
- session->name, L2TP_SKB_CB(skb)->ns,
- L2TP_SKB_CB(skb)->length, session->nr,
- skb_queue_len(&session->reorder_q));
+ trace_session_pkt_oos(session, cb->ns);
goto discard;
}
skb_queue_tail(&session->reorder_q, skb);
@@ -623,15 +867,13 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
{
struct l2tp_tunnel *tunnel = session->tunnel;
int offset;
- u32 ns, nr;
/* Parse and check optional cookie */
if (session->peer_cookie_len > 0) {
if (memcmp(ptr, &session->peer_cookie[0], session->peer_cookie_len)) {
- l2tp_info(tunnel, L2TP_MSG_DATA,
- "%s: cookie mismatch (%u/%u). Discarding.\n",
- tunnel->name, tunnel->tunnel_id,
- session->session_id);
+ pr_debug_ratelimited("%s: cookie mismatch (%u/%u). Discarding.\n",
+ tunnel->name, tunnel->tunnel_id,
+ session->session_id);
atomic_long_inc(&session->stats.rx_cookie_discards);
goto discard;
}
@@ -645,60 +887,46 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
* the control of the LNS. If no sequence numbers present but
* we were expecting them, discard frame.
*/
- ns = nr = 0;
L2TP_SKB_CB(skb)->has_seq = 0;
if (tunnel->version == L2TP_HDR_VER_2) {
if (hdrflags & L2TP_HDRFLAG_S) {
- ns = ntohs(*(__be16 *) ptr);
- ptr += 2;
- nr = ntohs(*(__be16 *) ptr);
- ptr += 2;
-
/* Store L2TP info in the skb */
- L2TP_SKB_CB(skb)->ns = ns;
+ L2TP_SKB_CB(skb)->ns = ntohs(*(__be16 *)ptr);
L2TP_SKB_CB(skb)->has_seq = 1;
+ ptr += 2;
+ /* Skip past nr in the header */
+ ptr += 2;
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: recv data ns=%u, nr=%u, session nr=%u\n",
- session->name, ns, nr, session->nr);
}
} else if (session->l2specific_type == L2TP_L2SPECTYPE_DEFAULT) {
- u32 l2h = ntohl(*(__be32 *) ptr);
+ u32 l2h = ntohl(*(__be32 *)ptr);
if (l2h & 0x40000000) {
- ns = l2h & 0x00ffffff;
-
/* Store L2TP info in the skb */
- L2TP_SKB_CB(skb)->ns = ns;
+ L2TP_SKB_CB(skb)->ns = l2h & 0x00ffffff;
L2TP_SKB_CB(skb)->has_seq = 1;
-
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: recv data ns=%u, session nr=%u\n",
- session->name, ns, session->nr);
}
ptr += 4;
}
if (L2TP_SKB_CB(skb)->has_seq) {
- /* Received a packet with sequence numbers. If we're the LNS,
+ /* Received a packet with sequence numbers. If we're the LAC,
* check if we sre sending sequence numbers and if not,
* configure it so.
*/
- if ((!session->lns_mode) && (!session->send_seq)) {
- l2tp_info(session, L2TP_MSG_SEQ,
- "%s: requested to enable seq numbers by LNS\n",
- session->name);
+ if (!session->lns_mode && !session->send_seq) {
+ trace_session_seqnum_lns_enable(session);
session->send_seq = 1;
- l2tp_session_set_header_len(session, tunnel->version);
+ l2tp_session_set_header_len(session, tunnel->version,
+ tunnel->encap);
}
} else {
/* No sequence numbers.
* If user has configured mandatory sequence numbers, discard.
*/
if (session->recv_seq) {
- l2tp_warn(session, L2TP_MSG_SEQ,
- "%s: recv data has no seq numbers when required. Discarding.\n",
- session->name);
+ pr_debug_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n",
+ session->name);
atomic_long_inc(&session->stats.rx_seq_discards);
goto discard;
}
@@ -708,16 +936,14 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
* If we're the LNS and we're sending sequence numbers, the
* LAC is broken. Discard the frame.
*/
- if ((!session->lns_mode) && (session->send_seq)) {
- l2tp_info(session, L2TP_MSG_SEQ,
- "%s: requested to disable seq numbers by LNS\n",
- session->name);
+ if (!session->lns_mode && session->send_seq) {
+ trace_session_seqnum_lns_disable(session);
session->send_seq = 0;
- l2tp_session_set_header_len(session, tunnel->version);
+ l2tp_session_set_header_len(session, tunnel->version,
+ tunnel->encap);
} else if (session->send_seq) {
- l2tp_warn(session, L2TP_MSG_SEQ,
- "%s: recv data has no seq numbers when required. Discarding.\n",
- session->name);
+ pr_debug_ratelimited("%s: recv data has no seq numbers when required. Discarding.\n",
+ session->name);
atomic_long_inc(&session->stats.rx_seq_discards);
goto discard;
}
@@ -771,157 +997,137 @@ discard:
atomic_long_inc(&session->stats.rx_errors);
kfree_skb(skb);
}
-EXPORT_SYMBOL(l2tp_recv_common);
+EXPORT_SYMBOL_GPL(l2tp_recv_common);
/* Drop skbs from the session's reorder_q
*/
-static int l2tp_session_queue_purge(struct l2tp_session *session)
+static void l2tp_session_queue_purge(struct l2tp_session *session)
{
struct sk_buff *skb = NULL;
- BUG_ON(!session);
- BUG_ON(session->magic != L2TP_SESSION_MAGIC);
+
while ((skb = skb_dequeue(&session->reorder_q))) {
atomic_long_inc(&session->stats.rx_errors);
kfree_skb(skb);
}
- return 0;
}
-/* Internal UDP receive frame. Do the real work of receiving an L2TP data frame
- * here. The skb is not on a list when we get here.
- * Returns 0 if the packet was a data packet and was successfully passed on.
- * Returns 1 if the packet was not a good data packet and could not be
- * forwarded. All such packets are passed up to userspace to deal with.
- */
-static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
+/* UDP encapsulation receive handler. See net/ipv4/udp.c for details. */
+int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
struct l2tp_session *session = NULL;
+ struct l2tp_tunnel *tunnel = NULL;
+ struct net *net = sock_net(sk);
unsigned char *ptr, *optr;
u16 hdrflags;
- u32 tunnel_id, session_id;
u16 version;
int length;
- /* UDP has verifed checksum */
+ /* UDP has verified checksum */
/* UDP always verifies the packet length. */
__skb_pull(skb, sizeof(struct udphdr));
/* Short packet? */
- if (!pskb_may_pull(skb, L2TP_HDR_SIZE_SEQ)) {
- l2tp_info(tunnel, L2TP_MSG_DATA,
- "%s: recv short packet (len=%d)\n",
- tunnel->name, skb->len);
- goto error;
- }
-
- /* Trace packet contents, if enabled */
- if (tunnel->debug & L2TP_MSG_DATA) {
- length = min(32u, skb->len);
- if (!pskb_may_pull(skb, length))
- goto error;
-
- pr_debug("%s: recv\n", tunnel->name);
- print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, skb->data, length);
- }
+ if (!pskb_may_pull(skb, L2TP_HDR_SIZE_MAX))
+ goto pass;
/* Point to L2TP header */
- optr = ptr = skb->data;
+ optr = skb->data;
+ ptr = skb->data;
/* Get L2TP header flags */
- hdrflags = ntohs(*(__be16 *) ptr);
+ hdrflags = ntohs(*(__be16 *)ptr);
- /* Check protocol version */
+ /* Get protocol version */
version = hdrflags & L2TP_HDR_VER_MASK;
- if (version != tunnel->version) {
- l2tp_info(tunnel, L2TP_MSG_DATA,
- "%s: recv protocol version mismatch: got %d expected %d\n",
- tunnel->name, version, tunnel->version);
- goto error;
- }
/* Get length of L2TP packet */
length = skb->len;
/* If type is control packet, it is handled by userspace. */
- if (hdrflags & L2TP_HDRFLAG_T) {
- l2tp_dbg(tunnel, L2TP_MSG_DATA,
- "%s: recv control packet, len=%d\n",
- tunnel->name, length);
- goto error;
- }
+ if (hdrflags & L2TP_HDRFLAG_T)
+ goto pass;
/* Skip flags */
ptr += 2;
- if (tunnel->version == L2TP_HDR_VER_2) {
+ if (version == L2TP_HDR_VER_2) {
+ u16 tunnel_id, session_id;
+
/* If length is present, skip it */
if (hdrflags & L2TP_HDRFLAG_L)
ptr += 2;
/* Extract tunnel and session ID */
- tunnel_id = ntohs(*(__be16 *) ptr);
+ tunnel_id = ntohs(*(__be16 *)ptr);
ptr += 2;
- session_id = ntohs(*(__be16 *) ptr);
+ session_id = ntohs(*(__be16 *)ptr);
ptr += 2;
+
+ session = l2tp_v2_session_get(net, tunnel_id, session_id);
} else {
+ u32 session_id;
+
ptr += 2; /* skip reserved bits */
- tunnel_id = tunnel->tunnel_id;
- session_id = ntohl(*(__be32 *) ptr);
+ session_id = ntohl(*(__be32 *)ptr);
ptr += 4;
+
+ session = l2tp_v3_session_get(net, sk, session_id);
}
- /* Find the session context */
- session = l2tp_tunnel_get_session(tunnel, session_id);
if (!session || !session->recv_skb) {
if (session)
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
/* Not found? Pass to userspace to deal with */
- l2tp_info(tunnel, L2TP_MSG_DATA,
- "%s: no session found (%u/%u). Passing up.\n",
- tunnel->name, tunnel_id, session_id);
- goto error;
+ goto pass;
+ }
+
+ tunnel = session->tunnel;
+
+	/* Reject a version mismatch or, for L2TPv3, a failing
+	 * l2tp_v3_ensure_opt_in_linear(); release the session ref either way.
+	 */
+	if (version != tunnel->version ||
+	    (version == L2TP_HDR_VER_3 &&
+	     l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr))) {
+		l2tp_session_put(session);
+		goto invalid;
}
l2tp_recv_common(session, skb, ptr, optr, hdrflags, length);
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
return 0;
-error:
+invalid:
+ atomic_long_inc(&tunnel->stats.rx_invalid);
+
+pass:
/* Put UDP header back */
__skb_push(skb, sizeof(struct udphdr));
return 1;
}
+EXPORT_SYMBOL_GPL(l2tp_udp_encap_recv);
-/* UDP encapsulation receive handler. See net/ipv4/udp.c.
- * Return codes:
- * 0 : success.
- * <0: error
- * >0: skb should be passed up to userspace as UDP.
- */
-int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
+/* UDP encapsulation receive error handler. See net/ipv4/udp.c for details. */
+static void l2tp_udp_encap_err_recv(struct sock *sk, struct sk_buff *skb, int err,
+ __be16 port, u32 info, u8 *payload)
{
- struct l2tp_tunnel *tunnel;
-
- tunnel = l2tp_tunnel(sk);
- if (tunnel == NULL)
- goto pass_up;
-
- l2tp_dbg(tunnel, L2TP_MSG_DATA, "%s: received %d bytes\n",
- tunnel->name, skb->len);
-
- if (l2tp_udp_recv_core(tunnel, skb))
- goto pass_up;
-
- return 0;
+ sk->sk_err = err;
+ sk_error_report(sk);
-pass_up:
- return 1;
+ if (ip_hdr(skb)->version == IPVERSION) {
+ if (inet_test_bit(RECVERR, sk))
+ return ip_icmp_error(sk, skb, err, port, info, payload);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ if (inet6_test_bit(RECVERR6, sk))
+ return ipv6_icmp_error(sk, skb, err, port, info, payload);
+#endif
+ }
}
-EXPORT_SYMBOL_GPL(l2tp_udp_encap_recv);
/************************************************************************
* Transmit handling
@@ -950,8 +1156,7 @@ static int l2tp_build_l2tpv2_header(struct l2tp_session *session, void *buf)
*bufp++ = 0;
session->ns++;
session->ns &= 0xffff;
- l2tp_dbg(session, L2TP_MSG_SEQ, "%s: updated ns to %u\n",
- session->name, session->ns);
+ trace_session_seqnum_update(session);
}
return bufp - optr;
@@ -968,13 +1173,13 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
*/
if (tunnel->encap == L2TP_ENCAPTYPE_UDP) {
u16 flags = L2TP_HDR_VER_3;
- *((__be16 *) bufp) = htons(flags);
+ *((__be16 *)bufp) = htons(flags);
bufp += 2;
- *((__be16 *) bufp) = 0;
+ *((__be16 *)bufp) = 0;
bufp += 2;
}
- *((__be32 *) bufp) = htonl(session->peer_session_id);
+ *((__be32 *)bufp) = htonl(session->peer_session_id);
bufp += 4;
if (session->cookie_len) {
memcpy(bufp, &session->cookie[0], session->cookie_len);
@@ -987,9 +1192,7 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
l2h = 0x40000000 | session->ns;
session->ns++;
session->ns &= 0xffffff;
- l2tp_dbg(session, L2TP_MSG_SEQ,
- "%s: updated ns to %u\n",
- session->name, session->ns);
+ trace_session_seqnum_update(session);
}
*((__be32 *)bufp) = htonl(l2h);
@@ -999,88 +1202,62 @@ static int l2tp_build_l2tpv3_header(struct l2tp_session *session, void *buf)
return bufp - optr;
}
-static void l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb,
- struct flowi *fl, size_t data_len)
+/* Queue the packet to IP for output: tunnel socket lock must be held */
+static int l2tp_xmit_queue(struct l2tp_tunnel *tunnel, struct sk_buff *skb, struct flowi *fl)
{
- struct l2tp_tunnel *tunnel = session->tunnel;
- unsigned int len = skb->len;
- int error;
-
- /* Debug */
- if (session->send_seq)
- l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes, ns=%u\n",
- session->name, data_len, session->ns - 1);
- else
- l2tp_dbg(session, L2TP_MSG_DATA, "%s: send %zd bytes\n",
- session->name, data_len);
-
- if (session->debug & L2TP_MSG_DATA) {
- int uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0;
- unsigned char *datap = skb->data + uhlen;
-
- pr_debug("%s: xmit\n", session->name);
- print_hex_dump_bytes("", DUMP_PREFIX_OFFSET,
- datap, min_t(size_t, 32, len - uhlen));
- }
+ int err;
- /* Queue the packet to IP for output */
skb->ignore_df = 1;
+ skb_dst_drop(skb);
#if IS_ENABLED(CONFIG_IPV6)
if (l2tp_sk_is_v6(tunnel->sock))
- error = inet6_csk_xmit(tunnel->sock, skb, NULL);
+ err = inet6_csk_xmit(tunnel->sock, skb, NULL);
else
#endif
- error = ip_queue_xmit(tunnel->sock, skb, fl);
+ err = ip_queue_xmit(tunnel->sock, skb, fl);
- /* Update stats */
- if (error >= 0) {
- atomic_long_inc(&tunnel->stats.tx_packets);
- atomic_long_add(len, &tunnel->stats.tx_bytes);
- atomic_long_inc(&session->stats.tx_packets);
- atomic_long_add(len, &session->stats.tx_bytes);
- } else {
- atomic_long_inc(&tunnel->stats.tx_errors);
- atomic_long_inc(&session->stats.tx_errors);
- }
+ return err >= 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}
-/* If caller requires the skb to have a ppp header, the header must be
- * inserted in the skb data before calling this function.
- */
-int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len)
+static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, unsigned int *len)
{
- int data_len = skb->len;
struct l2tp_tunnel *tunnel = session->tunnel;
+ unsigned int data_len = skb->len;
struct sock *sk = tunnel->sock;
- struct flowi *fl;
- struct udphdr *uh;
- struct inet_sock *inet;
- int headroom;
- int uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(struct udphdr) : 0;
- int udp_len;
+ int headroom, uhlen, udp_len;
int ret = NET_XMIT_SUCCESS;
+ struct inet_sock *inet;
+ struct udphdr *uh;
/* Check that there's enough headroom in the skb to insert IP,
* UDP and L2TP headers. If not enough, expand it to
* make room. Adjust truesize.
*/
- headroom = NET_SKB_PAD + sizeof(struct iphdr) +
- uhlen + hdr_len;
+ uhlen = (tunnel->encap == L2TP_ENCAPTYPE_UDP) ? sizeof(*uh) : 0;
+ headroom = NET_SKB_PAD + sizeof(struct iphdr) + uhlen + session->hdr_len;
if (skb_cow_head(skb, headroom)) {
kfree_skb(skb);
return NET_XMIT_DROP;
}
/* Setup L2TP header */
- session->build_header(session, __skb_push(skb, hdr_len));
+ if (tunnel->version == L2TP_HDR_VER_2)
+ l2tp_build_l2tpv2_header(session, __skb_push(skb, session->hdr_len));
+ else
+ l2tp_build_l2tpv3_header(session, __skb_push(skb, session->hdr_len));
+
+ /* Reset control buffer */
+ memset(skb->cb, 0, sizeof(skb->cb));
- /* Reset skb netfilter state */
- memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
- IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
- IPSKB_REROUTED);
- nf_reset(skb);
+ nf_reset_ct(skb);
+
+ /* L2TP uses its own lockdep subclass to avoid lockdep splats caused by
+ * nested socket calls on the same lockdep socket class. This can
+ * happen when data from a user socket is routed over l2tp, which uses
+ * another userspace socket.
+ */
+ spin_lock_nested(&sk->sk_lock.slock, L2TP_DEPTH_NESTING);
- bh_lock_sock(sk);
if (sock_owned_by_user(sk)) {
kfree_skb(skb);
ret = NET_XMIT_DROP;
@@ -1096,12 +1273,12 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
goto out_unlock;
}
- /* Get routing info from the tunnel socket */
- skb_dst_drop(skb);
- skb_dst_set(skb, sk_dst_check(sk, 0));
+ /* Report transmitted length before we add encap header, which keeps
+ * statistics consistent for both UDP and IP encap tx/rx paths.
+ */
+ *len = skb->len;
inet = inet_sk(sk);
- fl = &inet->cork.fl;
switch (tunnel->encap) {
case L2TP_ENCAPTYPE_UDP:
/* Setup UDP header */
@@ -1110,7 +1287,7 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
uh = udp_hdr(skb);
uh->source = inet->inet_sport;
uh->dest = inet->inet_dport;
- udp_len = uhlen + hdr_len + data_len;
+ udp_len = uhlen + session->hdr_len + data_len;
uh->len = htons(udp_len);
/* Calculate UDP checksum if configured to do so */
@@ -1121,123 +1298,115 @@ int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb, int hdr_len
&sk->sk_v6_daddr, udp_len);
else
#endif
- udp_set_csum(sk->sk_no_check_tx, skb, inet->inet_saddr,
- inet->inet_daddr, udp_len);
+ udp_set_csum(sk->sk_no_check_tx, skb, inet->inet_saddr,
+ inet->inet_daddr, udp_len);
break;
case L2TP_ENCAPTYPE_IP:
break;
}
- l2tp_xmit_core(session, skb, fl, data_len);
+ ret = l2tp_xmit_queue(tunnel, skb, &inet->cork.fl);
+
out_unlock:
- bh_unlock_sock(sk);
+ spin_unlock(&sk->sk_lock.slock);
return ret;
}
+
+/* If caller requires the skb to have a ppp header, the header must be
+ * inserted in the skb data before calling this function.
+ */
+int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb)
+{
+ unsigned int len = 0;
+ int ret;
+
+ ret = l2tp_xmit_core(session, skb, &len);
+ if (ret == NET_XMIT_SUCCESS) {
+ atomic_long_inc(&session->tunnel->stats.tx_packets);
+ atomic_long_add(len, &session->tunnel->stats.tx_bytes);
+ atomic_long_inc(&session->stats.tx_packets);
+ atomic_long_add(len, &session->stats.tx_bytes);
+ } else {
+ atomic_long_inc(&session->tunnel->stats.tx_errors);
+ atomic_long_inc(&session->stats.tx_errors);
+ }
+ return ret;
+}
EXPORT_SYMBOL_GPL(l2tp_xmit_skb);
/*****************************************************************************
* Tinnel and session create/destroy.
*****************************************************************************/
-/* Tunnel socket destruct hook.
- * The tunnel context is deleted only when all session sockets have been
- * closed.
- */
-static void l2tp_tunnel_destruct(struct sock *sk)
+/* Remove an l2tp session from l2tp_core's lists. */
+static void l2tp_session_unhash(struct l2tp_session *session)
{
- struct l2tp_tunnel *tunnel = l2tp_tunnel(sk);
-
- if (tunnel == NULL)
- goto end;
+ struct l2tp_tunnel *tunnel = session->tunnel;
- l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing...\n", tunnel->name);
+ if (tunnel) {
+ struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
+ struct l2tp_session *removed = session;
+
+ spin_lock_bh(&tunnel->list_lock);
+ spin_lock_bh(&pn->l2tp_session_idr_lock);
+
+ /* Remove from the per-tunnel list */
+ list_del_init(&session->list);
+
+ /* Remove from per-net IDR */
+ if (tunnel->version == L2TP_HDR_VER_3) {
+ if (hash_hashed(&session->hlist))
+ l2tp_session_collision_del(pn, session);
+ else
+ removed = idr_remove(&pn->l2tp_v3_session_idr,
+ session->session_id);
+ } else {
+ u32 session_key = l2tp_v2_session_key(tunnel->tunnel_id,
+ session->session_id);
+ removed = idr_remove(&pn->l2tp_v2_session_idr,
+ session_key);
+ }
+ WARN_ON_ONCE(removed && removed != session);
- /* Disable udp encapsulation */
- switch (tunnel->encap) {
- case L2TP_ENCAPTYPE_UDP:
- /* No longer an encapsulation socket. See net/ipv4/udp.c */
- (udp_sk(sk))->encap_type = 0;
- (udp_sk(sk))->encap_rcv = NULL;
- (udp_sk(sk))->encap_destroy = NULL;
- break;
- case L2TP_ENCAPTYPE_IP:
- break;
+ spin_unlock_bh(&pn->l2tp_session_idr_lock);
+ spin_unlock_bh(&tunnel->list_lock);
}
-
- /* Remove hooks into tunnel socket */
- sk->sk_destruct = tunnel->old_sk_destruct;
- sk->sk_user_data = NULL;
-
- /* Call the original destructor */
- if (sk->sk_destruct)
- (*sk->sk_destruct)(sk);
-
- kfree_rcu(tunnel, rcu);
-end:
- return;
}
/* When the tunnel is closed, all the attached sessions need to go too.
*/
static void l2tp_tunnel_closeall(struct l2tp_tunnel *tunnel)
{
- int hash;
- struct hlist_node *walk;
- struct hlist_node *tmp;
struct l2tp_session *session;
- BUG_ON(tunnel == NULL);
-
- l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing all sessions...\n",
- tunnel->name);
-
- write_lock_bh(&tunnel->hlist_lock);
+ spin_lock_bh(&tunnel->list_lock);
tunnel->acpt_newsess = false;
- for (hash = 0; hash < L2TP_HASH_SIZE; hash++) {
-again:
- hlist_for_each_safe(walk, tmp, &tunnel->session_hlist[hash]) {
- session = hlist_entry(walk, struct l2tp_session, hlist);
-
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: closing session\n", session->name);
-
- hlist_del_init(&session->hlist);
-
- if (test_and_set_bit(0, &session->dead))
- goto again;
-
- write_unlock_bh(&tunnel->hlist_lock);
-
- __l2tp_session_unhash(session);
- l2tp_session_queue_purge(session);
-
- if (session->session_close != NULL)
- (*session->session_close)(session);
-
- l2tp_session_dec_refcount(session);
-
- write_lock_bh(&tunnel->hlist_lock);
-
- /* Now restart from the beginning of this hash
- * chain. We always remove a session from the
- * list so we are guaranteed to make forward
- * progress.
- */
- goto again;
- }
- }
- write_unlock_bh(&tunnel->hlist_lock);
+ list_for_each_entry(session, &tunnel->session_list, list)
+ l2tp_session_delete(session);
+ spin_unlock_bh(&tunnel->list_lock);
}
/* Tunnel socket destroy hook for UDP encapsulation */
static void l2tp_udp_encap_destroy(struct sock *sk)
{
- struct l2tp_tunnel *tunnel = l2tp_tunnel(sk);
+ struct l2tp_tunnel *tunnel;
- if (tunnel)
+ tunnel = l2tp_sk_to_tunnel(sk);
+ if (tunnel) {
l2tp_tunnel_delete(tunnel);
+ l2tp_tunnel_put(tunnel);
+ }
+}
+
+static void l2tp_tunnel_remove(struct net *net, struct l2tp_tunnel *tunnel)
+{
+ struct l2tp_net *pn = l2tp_pernet(net);
+
+ spin_lock_bh(&pn->l2tp_tunnel_idr_lock);
+ idr_remove(&pn->l2tp_tunnel_idr, tunnel->tunnel_id);
+ spin_unlock_bh(&pn->l2tp_tunnel_idr_lock);
}
/* Workqueue tunnel deletion function */
@@ -1247,7 +1416,6 @@ static void l2tp_tunnel_del_work(struct work_struct *work)
del_work);
struct sock *sk = tunnel->sock;
struct socket *sock = sk->sk_socket;
- struct l2tp_net *pn;
l2tp_tunnel_closeall(tunnel);
@@ -1261,17 +1429,12 @@ static void l2tp_tunnel_del_work(struct work_struct *work)
}
}
- /* Remove the tunnel struct from the tunnel list */
- pn = l2tp_pernet(tunnel->l2tp_net);
- spin_lock_bh(&pn->l2tp_tunnel_list_lock);
- list_del_rcu(&tunnel->list);
- spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
-
+ l2tp_tunnel_remove(tunnel->l2tp_net, tunnel);
/* drop initial ref */
- l2tp_tunnel_dec_refcount(tunnel);
+ l2tp_tunnel_put(tunnel);
/* drop workqueue ref */
- l2tp_tunnel_dec_refcount(tunnel);
+ l2tp_tunnel_put(tunnel);
}
/* Create a socket for the tunnel, if one isn't set up by
@@ -1284,10 +1447,10 @@ static void l2tp_tunnel_del_work(struct work_struct *work)
* exit hook.
*/
static int l2tp_tunnel_sock_create(struct net *net,
- u32 tunnel_id,
- u32 peer_tunnel_id,
- struct l2tp_tunnel_cfg *cfg,
- struct socket **sockp)
+ u32 tunnel_id,
+ u32 peer_tunnel_id,
+ struct l2tp_tunnel_cfg *cfg,
+ struct socket **sockp)
{
int err = -EINVAL;
struct socket *sock = NULL;
@@ -1305,9 +1468,9 @@ static int l2tp_tunnel_sock_create(struct net *net,
memcpy(&udp_conf.peer_ip6, cfg->peer_ip6,
sizeof(udp_conf.peer_ip6));
udp_conf.use_udp6_tx_checksums =
- ! cfg->udp6_zero_tx_checksums;
+ !cfg->udp6_zero_tx_checksums;
udp_conf.use_udp6_rx_checksums =
- ! cfg->udp6_zero_rx_checksums;
+ !cfg->udp6_zero_rx_checksums;
} else
#endif
{
@@ -1332,7 +1495,7 @@ static int l2tp_tunnel_sock_create(struct net *net,
struct sockaddr_l2tpip6 ip6_addr = {0};
err = sock_create_kern(net, AF_INET6, SOCK_DGRAM,
- IPPROTO_L2TP, &sock);
+ IPPROTO_L2TP, &sock);
if (err < 0)
goto out;
@@ -1340,7 +1503,7 @@ static int l2tp_tunnel_sock_create(struct net *net,
memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6,
sizeof(ip6_addr.l2tp_addr));
ip6_addr.l2tp_conn_id = tunnel_id;
- err = kernel_bind(sock, (struct sockaddr *) &ip6_addr,
+ err = kernel_bind(sock, (struct sockaddr_unsized *)&ip6_addr,
sizeof(ip6_addr));
if (err < 0)
goto out;
@@ -1350,7 +1513,7 @@ static int l2tp_tunnel_sock_create(struct net *net,
sizeof(ip6_addr.l2tp_addr));
ip6_addr.l2tp_conn_id = peer_tunnel_id;
err = kernel_connect(sock,
- (struct sockaddr *) &ip6_addr,
+ (struct sockaddr_unsized *)&ip6_addr,
sizeof(ip6_addr), 0);
if (err < 0)
goto out;
@@ -1360,14 +1523,14 @@ static int l2tp_tunnel_sock_create(struct net *net,
struct sockaddr_l2tpip ip_addr = {0};
err = sock_create_kern(net, AF_INET, SOCK_DGRAM,
- IPPROTO_L2TP, &sock);
+ IPPROTO_L2TP, &sock);
if (err < 0)
goto out;
ip_addr.l2tp_family = AF_INET;
ip_addr.l2tp_addr = cfg->local_ip;
ip_addr.l2tp_conn_id = tunnel_id;
- err = kernel_bind(sock, (struct sockaddr *) &ip_addr,
+ err = kernel_bind(sock, (struct sockaddr_unsized *)&ip_addr,
sizeof(ip_addr));
if (err < 0)
goto out;
@@ -1375,7 +1538,7 @@ static int l2tp_tunnel_sock_create(struct net *net,
ip_addr.l2tp_family = AF_INET;
ip_addr.l2tp_addr = cfg->peer_ip;
ip_addr.l2tp_conn_id = peer_tunnel_id;
- err = kernel_connect(sock, (struct sockaddr *) &ip_addr,
+ err = kernel_connect(sock, (struct sockaddr_unsized *)&ip_addr,
sizeof(ip_addr), 0);
if (err < 0)
goto out;
@@ -1388,7 +1551,7 @@ static int l2tp_tunnel_sock_create(struct net *net,
out:
*sockp = sock;
- if ((err < 0) && sock) {
+ if (err < 0 && sock) {
kernel_sock_shutdown(sock, SHUT_RDWR);
sock_release(sock);
*sockp = NULL;
@@ -1397,19 +1560,18 @@ out:
return err;
}
-static struct lock_class_key l2tp_socket_class;
-
-int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp)
+int l2tp_tunnel_create(int fd, int version, u32 tunnel_id, u32 peer_tunnel_id,
+ struct l2tp_tunnel_cfg *cfg, struct l2tp_tunnel **tunnelp)
{
struct l2tp_tunnel *tunnel = NULL;
int err;
enum l2tp_encap_type encap = L2TP_ENCAPTYPE_UDP;
- if (cfg != NULL)
+ if (cfg)
encap = cfg->encap;
- tunnel = kzalloc(sizeof(struct l2tp_tunnel), GFP_KERNEL);
- if (tunnel == NULL) {
+ tunnel = kzalloc(sizeof(*tunnel), GFP_KERNEL);
+ if (!tunnel) {
err = -ENOMEM;
goto err;
}
@@ -1417,15 +1579,11 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32
tunnel->version = version;
tunnel->tunnel_id = tunnel_id;
tunnel->peer_tunnel_id = peer_tunnel_id;
- tunnel->debug = L2TP_DEFAULT_DEBUG_FLAGS;
- tunnel->magic = L2TP_TUNNEL_MAGIC;
sprintf(&tunnel->name[0], "tunl %u", tunnel_id);
- rwlock_init(&tunnel->hlist_lock);
+ spin_lock_init(&tunnel->list_lock);
tunnel->acpt_newsess = true;
-
- if (cfg != NULL)
- tunnel->debug = cfg->debug;
+ INIT_LIST_HEAD(&tunnel->session_list);
tunnel->encap = encap;
@@ -1435,8 +1593,6 @@ int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32
/* Init delete workqueue struct */
INIT_WORK(&tunnel->del_work, l2tp_tunnel_del_work);
- INIT_LIST_HEAD(&tunnel->list);
-
err = 0;
err:
if (tunnelp)
@@ -1449,18 +1605,29 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_create);
static int l2tp_validate_socket(const struct sock *sk, const struct net *net,
enum l2tp_encap_type encap)
{
+ struct l2tp_tunnel *tunnel;
+
if (!net_eq(sock_net(sk), net))
return -EINVAL;
if (sk->sk_type != SOCK_DGRAM)
return -EPROTONOSUPPORT;
+ if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
+ return -EPROTONOSUPPORT;
+
if ((encap == L2TP_ENCAPTYPE_UDP && sk->sk_protocol != IPPROTO_UDP) ||
(encap == L2TP_ENCAPTYPE_IP && sk->sk_protocol != IPPROTO_L2TP))
return -EPROTONOSUPPORT;
- if (sk->sk_user_data)
+ if (encap == L2TP_ENCAPTYPE_UDP && sk->sk_user_data)
+ return -EBUSY;
+
+ tunnel = l2tp_sk_to_tunnel(sk);
+ if (tunnel) {
+ l2tp_tunnel_put(tunnel);
return -EBUSY;
+ }
return 0;
}
@@ -1468,12 +1635,19 @@ static int l2tp_validate_socket(const struct sock *sk, const struct net *net,
int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
struct l2tp_tunnel_cfg *cfg)
{
- struct l2tp_tunnel *tunnel_walk;
- struct l2tp_net *pn;
+ struct l2tp_net *pn = l2tp_pernet(net);
+ u32 tunnel_id = tunnel->tunnel_id;
struct socket *sock;
struct sock *sk;
int ret;
+ spin_lock_bh(&pn->l2tp_tunnel_idr_lock);
+ ret = idr_alloc_u32(&pn->l2tp_tunnel_idr, NULL, &tunnel_id, tunnel_id,
+ GFP_ATOMIC);
+ spin_unlock_bh(&pn->l2tp_tunnel_idr_lock);
+ if (ret)
+ return ret == -ENOSPC ? -EEXIST : ret;
+
if (tunnel->fd < 0) {
ret = l2tp_tunnel_sock_create(net, tunnel->tunnel_id,
tunnel->peer_tunnel_id, cfg,
@@ -1484,62 +1658,55 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
sock = sockfd_lookup(tunnel->fd, &ret);
if (!sock)
goto err;
-
- ret = l2tp_validate_socket(sock->sk, net, tunnel->encap);
- if (ret < 0)
- goto err_sock;
}
sk = sock->sk;
-
- sock_hold(sk);
- tunnel->sock = sk;
- tunnel->l2tp_net = net;
-
- pn = l2tp_pernet(net);
-
- spin_lock_bh(&pn->l2tp_tunnel_list_lock);
- list_for_each_entry(tunnel_walk, &pn->l2tp_tunnel_list, list) {
- if (tunnel_walk->tunnel_id == tunnel->tunnel_id) {
- spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
-
- ret = -EEXIST;
- goto err_sock;
- }
- }
- list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list);
- spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
+ lock_sock(sk);
+ write_lock_bh(&sk->sk_callback_lock);
+ ret = l2tp_validate_socket(sk, net, tunnel->encap);
+ if (ret < 0)
+ goto err_inval_sock;
+ write_unlock_bh(&sk->sk_callback_lock);
if (tunnel->encap == L2TP_ENCAPTYPE_UDP) {
struct udp_tunnel_sock_cfg udp_cfg = {
- .sk_user_data = tunnel,
.encap_type = UDP_ENCAP_L2TPINUDP,
.encap_rcv = l2tp_udp_encap_recv,
+ .encap_err_rcv = l2tp_udp_encap_err_recv,
.encap_destroy = l2tp_udp_encap_destroy,
};
setup_udp_tunnel_sock(net, sock, &udp_cfg);
- } else {
- sk->sk_user_data = tunnel;
}
- tunnel->old_sk_destruct = sk->sk_destruct;
- sk->sk_destruct = &l2tp_tunnel_destruct;
- lockdep_set_class_and_name(&sk->sk_lock.slock, &l2tp_socket_class,
- "l2tp_sock");
sk->sk_allocation = GFP_ATOMIC;
+ release_sock(sk);
+
+ sock_hold(sk);
+ tunnel->sock = sk;
+ tunnel->l2tp_net = net;
+
+ spin_lock_bh(&pn->l2tp_tunnel_idr_lock);
+ idr_replace(&pn->l2tp_tunnel_idr, tunnel, tunnel->tunnel_id);
+ spin_unlock_bh(&pn->l2tp_tunnel_idr_lock);
+
+ trace_register_tunnel(tunnel);
if (tunnel->fd >= 0)
sockfd_put(sock);
return 0;
-err_sock:
+err_inval_sock:
+ write_unlock_bh(&sk->sk_callback_lock);
+ release_sock(sk);
+
if (tunnel->fd < 0)
sock_release(sock);
else
sockfd_put(sock);
err:
+ l2tp_tunnel_remove(net, tunnel);
return ret;
}
EXPORT_SYMBOL_GPL(l2tp_tunnel_register);
@@ -1549,80 +1716,46 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_register);
void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel)
{
if (!test_and_set_bit(0, &tunnel->dead)) {
- l2tp_tunnel_inc_refcount(tunnel);
+ trace_delete_tunnel(tunnel);
+ refcount_inc(&tunnel->ref_count);
queue_work(l2tp_wq, &tunnel->del_work);
}
}
EXPORT_SYMBOL_GPL(l2tp_tunnel_delete);
-/* Really kill the session.
- */
-void l2tp_session_free(struct l2tp_session *session)
+void l2tp_session_delete(struct l2tp_session *session)
{
- struct l2tp_tunnel *tunnel = session->tunnel;
-
- BUG_ON(refcount_read(&session->ref_count) != 0);
-
- if (tunnel) {
- BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC);
- l2tp_tunnel_dec_refcount(tunnel);
+ if (!test_and_set_bit(0, &session->dead)) {
+ trace_delete_session(session);
+ refcount_inc(&session->ref_count);
+ queue_work(l2tp_wq, &session->del_work);
}
-
- kfree(session);
}
-EXPORT_SYMBOL_GPL(l2tp_session_free);
-
-/* Remove an l2tp session from l2tp_core's hash lists.
- * Provides a tidyup interface for pseudowire code which can't just route all
- * shutdown via. l2tp_session_delete and a pseudowire-specific session_close
- * callback.
- */
-void __l2tp_session_unhash(struct l2tp_session *session)
-{
- struct l2tp_tunnel *tunnel = session->tunnel;
-
- /* Remove the session from core hashes */
- if (tunnel) {
- /* Remove from the per-tunnel hash */
- write_lock_bh(&tunnel->hlist_lock);
- hlist_del_init(&session->hlist);
- write_unlock_bh(&tunnel->hlist_lock);
-
- /* For L2TPv3 we have a per-net hash: remove from there, too */
- if (tunnel->version != L2TP_HDR_VER_2) {
- struct l2tp_net *pn = l2tp_pernet(tunnel->l2tp_net);
- spin_lock_bh(&pn->l2tp_session_hlist_lock);
- hlist_del_init_rcu(&session->global_hlist);
- spin_unlock_bh(&pn->l2tp_session_hlist_lock);
- synchronize_rcu();
- }
- }
-}
-EXPORT_SYMBOL_GPL(__l2tp_session_unhash);
+EXPORT_SYMBOL_GPL(l2tp_session_delete);
-/* This function is used by the netlink SESSION_DELETE command and by
- pseudowire modules.
- */
-int l2tp_session_delete(struct l2tp_session *session)
+/* Workqueue session deletion function */
+static void l2tp_session_del_work(struct work_struct *work)
{
- if (test_and_set_bit(0, &session->dead))
- return 0;
+ struct l2tp_session *session = container_of(work, struct l2tp_session,
+ del_work);
- __l2tp_session_unhash(session);
+ l2tp_session_unhash(session);
l2tp_session_queue_purge(session);
- if (session->session_close != NULL)
+ if (session->session_close)
(*session->session_close)(session);
- l2tp_session_dec_refcount(session);
+ /* drop initial ref */
+ l2tp_session_put(session);
- return 0;
+ /* drop workqueue ref */
+ l2tp_session_put(session);
}
-EXPORT_SYMBOL_GPL(l2tp_session_delete);
/* We come here whenever a session's send_seq, cookie_len or
* l2specific_type parameters are set.
*/
-void l2tp_session_set_header_len(struct l2tp_session *session, int version)
+void l2tp_session_set_header_len(struct l2tp_session *session, int version,
+ enum l2tp_encap_type encap)
{
if (version == L2TP_HDR_VER_2) {
session->hdr_len = 6;
@@ -1631,21 +1764,20 @@ void l2tp_session_set_header_len(struct l2tp_session *session, int version)
} else {
session->hdr_len = 4 + session->cookie_len;
session->hdr_len += l2tp_get_l2specific_len(session);
- if (session->tunnel->encap == L2TP_ENCAPTYPE_UDP)
+ if (encap == L2TP_ENCAPTYPE_UDP)
session->hdr_len += 4;
}
-
}
EXPORT_SYMBOL_GPL(l2tp_session_set_header_len);
-struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg)
+struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunnel, u32 session_id,
+ u32 peer_session_id, struct l2tp_session_cfg *cfg)
{
struct l2tp_session *session;
- session = kzalloc(sizeof(struct l2tp_session) + priv_size, GFP_KERNEL);
- if (session != NULL) {
+ session = kzalloc(sizeof(*session) + priv_size, GFP_KERNEL);
+ if (session) {
session->magic = L2TP_SESSION_MAGIC;
- session->tunnel = tunnel;
session->session_id = session_id;
session->peer_session_id = peer_session_id;
@@ -1665,15 +1797,14 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
skb_queue_head_init(&session->reorder_q);
+ session->hlist_key = l2tp_v3_session_hashkey(tunnel->sock, session->session_id);
INIT_HLIST_NODE(&session->hlist);
- INIT_HLIST_NODE(&session->global_hlist);
-
- /* Inherit debug options from tunnel */
- session->debug = tunnel->debug;
+ INIT_LIST_HEAD(&session->clist);
+ INIT_LIST_HEAD(&session->list);
+ INIT_WORK(&session->del_work, l2tp_session_del_work);
if (cfg) {
session->pwtype = cfg->pw_type;
- session->debug = cfg->debug;
session->send_seq = cfg->send_seq;
session->recv_seq = cfg->recv_seq;
session->lns_mode = cfg->lns_mode;
@@ -1685,12 +1816,7 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
memcpy(&session->peer_cookie[0], &cfg->peer_cookie[0], cfg->peer_cookie_len);
}
- if (tunnel->version == L2TP_HDR_VER_2)
- session->build_header = l2tp_build_l2tpv2_header;
- else
- session->build_header = l2tp_build_l2tpv3_header;
-
- l2tp_session_set_header_len(session, tunnel->version);
+ l2tp_session_set_header_len(session, tunnel->version, tunnel->encap);
refcount_set(&session->ref_count, 1);
@@ -1708,41 +1834,74 @@ EXPORT_SYMBOL_GPL(l2tp_session_create);
static __net_init int l2tp_init_net(struct net *net)
{
struct l2tp_net *pn = net_generic(net, l2tp_net_id);
- int hash;
-
- INIT_LIST_HEAD(&pn->l2tp_tunnel_list);
- spin_lock_init(&pn->l2tp_tunnel_list_lock);
- for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++)
- INIT_HLIST_HEAD(&pn->l2tp_session_hlist[hash]);
+ idr_init(&pn->l2tp_tunnel_idr);
+ spin_lock_init(&pn->l2tp_tunnel_idr_lock);
- spin_lock_init(&pn->l2tp_session_hlist_lock);
+ idr_init(&pn->l2tp_v2_session_idr);
+ idr_init(&pn->l2tp_v3_session_idr);
+ spin_lock_init(&pn->l2tp_session_idr_lock);
return 0;
}
-static __net_exit void l2tp_exit_net(struct net *net)
+static __net_exit void l2tp_pre_exit_net(struct net *net)
{
struct l2tp_net *pn = l2tp_pernet(net);
struct l2tp_tunnel *tunnel = NULL;
- int hash;
+ unsigned long tunnel_id, tmp;
rcu_read_lock_bh();
- list_for_each_entry_rcu(tunnel, &pn->l2tp_tunnel_list, list) {
- l2tp_tunnel_delete(tunnel);
+ idr_for_each_entry_ul(&pn->l2tp_tunnel_idr, tunnel, tmp, tunnel_id) {
+ if (tunnel)
+ l2tp_tunnel_delete(tunnel);
}
rcu_read_unlock_bh();
- flush_workqueue(l2tp_wq);
- rcu_barrier();
+ if (l2tp_wq) {
+ /* Run all TUNNEL_DELETE work items just queued. */
+ __flush_workqueue(l2tp_wq);
- for (hash = 0; hash < L2TP_HASH_SIZE_2; hash++)
- WARN_ON_ONCE(!hlist_empty(&pn->l2tp_session_hlist[hash]));
+ /* Each TUNNEL_DELETE work item will queue a SESSION_DELETE
+ * work item for each session in the tunnel. Flush the
+ * workqueue again to process these.
+ */
+ __flush_workqueue(l2tp_wq);
+ }
+}
+
+static int l2tp_idr_item_unexpected(int id, void *p, void *data)
+{
+ const char *idr_name = data;
+
+ pr_err("l2tp: %s IDR not empty at net %d exit\n", idr_name, id);
+ WARN_ON_ONCE(1);
+ return 1;
+}
+
+static __net_exit void l2tp_exit_net(struct net *net)
+{
+ struct l2tp_net *pn = l2tp_pernet(net);
+
+	/* Our per-net IDRs should be empty. Check that this is so, to
+	 * help catch cleanup races or refcnt leaks.
+ */
+ idr_for_each(&pn->l2tp_v2_session_idr, l2tp_idr_item_unexpected,
+ "v2_session");
+ idr_for_each(&pn->l2tp_v3_session_idr, l2tp_idr_item_unexpected,
+ "v3_session");
+ idr_for_each(&pn->l2tp_tunnel_idr, l2tp_idr_item_unexpected,
+ "tunnel");
+
+ idr_destroy(&pn->l2tp_v2_session_idr);
+ idr_destroy(&pn->l2tp_v3_session_idr);
+ idr_destroy(&pn->l2tp_tunnel_idr);
}
static struct pernet_operations l2tp_net_ops = {
.init = l2tp_init_net,
.exit = l2tp_exit_net,
+ .pre_exit = l2tp_pre_exit_net,
.id = &l2tp_net_id,
.size = sizeof(struct l2tp_net),
};
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index 9c9afe94d389..ffd8ced3a51f 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -1,11 +1,7 @@
-/*
- * L2TP internal definitions.
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* L2TP internal definitions.
*
* Copyright (c) 2008,2009 Katalix Systems Ltd
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/refcount.h>
@@ -19,18 +15,9 @@
#include <net/xfrm.h>
#endif
-/* Just some random numbers */
-#define L2TP_TUNNEL_MAGIC 0x42114DDA
+/* Random number used for internal consistency checks of session structures */
#define L2TP_SESSION_MAGIC 0x0C04EB7D
-/* Per tunnel, session hash table size */
-#define L2TP_HASH_BITS 4
-#define L2TP_HASH_SIZE (1 << L2TP_HASH_BITS)
-
-/* System-wide, session hash table size */
-#define L2TP_HASH_BITS_2 8
-#define L2TP_HASH_SIZE_2 (1 << L2TP_HASH_BITS_2)
-
struct sk_buff;
struct l2tp_stats {
@@ -43,41 +30,45 @@ struct l2tp_stats {
atomic_long_t rx_oos_packets;
atomic_long_t rx_errors;
atomic_long_t rx_cookie_discards;
+ atomic_long_t rx_invalid;
};
struct l2tp_tunnel;
-/* Describes a session. Contains information to determine incoming
- * packets and transmit outgoing ones.
- */
+/* L2TP session configuration */
struct l2tp_session_cfg {
enum l2tp_pwtype pw_type;
- unsigned int recv_seq:1; /* expect receive packets with
- * sequence numbers? */
- unsigned int send_seq:1; /* send packets with sequence
- * numbers? */
- unsigned int lns_mode:1; /* behave as LNS? LAC enables
- * sequence numbers under
- * control of LNS. */
- int debug; /* bitmask of debug message
- * categories */
+ unsigned int recv_seq:1; /* expect receive packets with sequence numbers? */
+ unsigned int send_seq:1; /* send packets with sequence numbers? */
+ unsigned int lns_mode:1; /* behave as LNS?
+ * LAC enables sequence numbers under LNS control.
+ */
u16 l2specific_type; /* Layer 2 specific type */
u8 cookie[8]; /* optional cookie */
int cookie_len; /* 0, 4 or 8 bytes */
u8 peer_cookie[8]; /* peer's cookie */
int peer_cookie_len; /* 0, 4 or 8 bytes */
- int reorder_timeout; /* configured reorder timeout
- * (in jiffies) */
+ int reorder_timeout; /* configured reorder timeout (in jiffies) */
char *ifname;
};
+struct l2tp_session_coll_list {
+ spinlock_t lock; /* for access to list */
+ struct list_head list;
+ refcount_t ref_count;
+};
+
+/* Represents a session (pseudowire) instance.
+ * Tracks runtime state including cookies, dataplane packet sequencing, and IO statistics.
+ * Is linked into a per-tunnel session list and a per-net ("global") IDR tree.
+ */
+#define L2TP_SESSION_NAME_MAX 32
struct l2tp_session {
- int magic; /* should be
- * L2TP_SESSION_MAGIC */
+ int magic; /* should be L2TP_SESSION_MAGIC */
long dead;
+ struct rcu_head rcu;
- struct l2tp_tunnel *tunnel; /* back pointer to tunnel
- * context */
+ struct l2tp_tunnel *tunnel; /* back pointer to tunnel context */
u32 session_id;
u32 peer_session_id;
u8 cookie[8];
@@ -92,42 +83,55 @@ struct l2tp_session {
u32 nr_max; /* max NR. Depends on tunnel */
u32 nr_window_size; /* NR window size */
u32 nr_oos; /* NR of last OOS packet */
- int nr_oos_count; /* For OOS recovery */
+ int nr_oos_count; /* for OOS recovery */
int nr_oos_count_max;
- struct hlist_node hlist; /* Hash list node */
+ struct list_head list; /* per-tunnel list node */
refcount_t ref_count;
+ struct hlist_node hlist; /* per-net session hlist */
+ unsigned long hlist_key; /* key for session hlist */
+ struct l2tp_session_coll_list *coll_list; /* session collision list */
+ struct list_head clist; /* for coll_list */
- char name[32]; /* for logging */
+ char name[L2TP_SESSION_NAME_MAX]; /* for logging */
char ifname[IFNAMSIZ];
- unsigned int recv_seq:1; /* expect receive packets with
- * sequence numbers? */
- unsigned int send_seq:1; /* send packets with sequence
- * numbers? */
- unsigned int lns_mode:1; /* behave as LNS? LAC enables
- * sequence numbers under
- * control of LNS. */
- int debug; /* bitmask of debug message
- * categories */
- int reorder_timeout; /* configured reorder timeout
- * (in jiffies) */
+ unsigned int recv_seq:1; /* expect receive packets with sequence numbers? */
+ unsigned int send_seq:1; /* send packets with sequence numbers? */
+ unsigned int lns_mode:1; /* behave as LNS?
+ * LAC enables sequence numbers under LNS control.
+ */
+ int reorder_timeout; /* configured reorder timeout (in jiffies) */
int reorder_skip; /* set if skip to next nr */
enum l2tp_pwtype pwtype;
struct l2tp_stats stats;
- struct hlist_node global_hlist; /* Global hash list node */
+ struct work_struct del_work;
- int (*build_header)(struct l2tp_session *session, void *buf);
+ /* Session receive handler for data packets.
+ * Each pseudowire implementation should implement this callback in order to
+ * handle incoming packets. Packets are passed to the pseudowire handler after
+ * reordering, if data sequence numbers are enabled for the session.
+ */
void (*recv_skb)(struct l2tp_session *session, struct sk_buff *skb, int data_len);
+
+ /* Session close handler.
+ * Each pseudowire implementation may implement this callback in order to carry
+ * out pseudowire-specific shutdown actions.
+ * The callback is called by core after unlisting the session and purging its
+ * reorder queue.
+ */
void (*session_close)(struct l2tp_session *session);
+
+ /* Session show handler.
+ * Pseudowire-specific implementation of debugfs session rendering.
+ * The callback is called by l2tp_debugfs.c after rendering core session
+ * information.
+ */
void (*show)(struct seq_file *m, void *priv);
- uint8_t priv[0]; /* private data */
+
+ u8 priv[]; /* private data */
};
-/* Describes the tunnel. It contains info to track all the associated
- * sessions so incoming packets can be sorted out
- */
+/* L2TP tunnel configuration */
struct l2tp_tunnel_cfg {
- int debug; /* bitmask of debug message
- * categories */
enum l2tp_encap_type encap;
/* Used only for kernel-created sockets */
@@ -144,47 +148,59 @@ struct l2tp_tunnel_cfg {
udp6_zero_rx_checksums:1;
};
+/* Represents a tunnel instance.
+ * Tracks runtime state including IO statistics.
+ * Holds the tunnel socket (either passed from userspace or directly created by the kernel).
+ * Maintains a list of sessions belonging to the tunnel instance.
+ * Is linked into a per-net IDR of tunnels.
+ */
+#define L2TP_TUNNEL_NAME_MAX 20
struct l2tp_tunnel {
- int magic; /* Should be L2TP_TUNNEL_MAGIC */
-
unsigned long dead;
struct rcu_head rcu;
- rwlock_t hlist_lock; /* protect session_hlist */
- bool acpt_newsess; /* Indicates whether this
- * tunnel accepts new sessions.
- * Protected by hlist_lock.
+ spinlock_t list_lock; /* write-protection for session_list */
+ bool acpt_newsess; /* indicates whether this tunnel accepts
+ * new sessions. Protected by list_lock.
*/
- struct hlist_head session_hlist[L2TP_HASH_SIZE];
- /* hashed list of sessions,
- * hashed by id */
+ struct list_head session_list; /* list of sessions */
u32 tunnel_id;
u32 peer_tunnel_id;
int version; /* 2=>L2TPv2, 3=>L2TPv3 */
- char name[20]; /* for logging */
- int debug; /* bitmask of debug message
- * categories */
+ char name[L2TP_TUNNEL_NAME_MAX]; /* for logging */
enum l2tp_encap_type encap;
struct l2tp_stats stats;
- struct list_head list; /* Keep a list of all tunnels */
struct net *l2tp_net; /* the net we belong to */
refcount_t ref_count;
- void (*old_sk_destruct)(struct sock *);
- struct sock *sock; /* Parent socket */
- int fd; /* Parent fd, if tunnel socket
- * was created by userspace */
+ struct sock *sock; /* parent socket */
+ int fd; /* parent fd, if tunnel socket was created
+ * by userspace
+ */
struct work_struct del_work;
};
+/* Pseudowire ops callbacks for use with the l2tp genetlink interface */
struct l2tp_nl_cmd_ops {
+ /* The pseudowire session create callback is responsible for creating a session
+ * instance for a specific pseudowire type.
+ * It must call l2tp_session_create and l2tp_session_register to register the
+ * session instance, as well as carry out any pseudowire-specific initialisation.
+ * It must return >= 0 on success, or an appropriate negative errno value on failure.
+ */
int (*session_create)(struct net *net, struct l2tp_tunnel *tunnel,
u32 session_id, u32 peer_session_id,
struct l2tp_session_cfg *cfg);
- int (*session_delete)(struct l2tp_session *session);
+
+ /* The pseudowire session delete callback is responsible for initiating the deletion
+ * of a session instance.
+ * It must call l2tp_session_delete, as well as carry out any pseudowire-specific
+ * teardown actions.
+ */
+ void (*session_delete)(struct l2tp_session *session);
};
static inline void *l2tp_session_priv(struct l2tp_session *session)
@@ -192,73 +208,70 @@ static inline void *l2tp_session_priv(struct l2tp_session *session)
return &session->priv[0];
}
-struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id);
-struct l2tp_tunnel *l2tp_tunnel_get_nth(const struct net *net, int nth);
-struct l2tp_session *l2tp_tunnel_get_session(struct l2tp_tunnel *tunnel,
- u32 session_id);
-
-void l2tp_tunnel_free(struct l2tp_tunnel *tunnel);
+/* Tunnel and session refcounts */
+void l2tp_tunnel_put(struct l2tp_tunnel *tunnel);
+void l2tp_session_put(struct l2tp_session *session);
-struct l2tp_session *l2tp_session_get(const struct net *net, u32 session_id);
-struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth);
+/* Tunnel and session lookup.
+ * These functions take a reference on the instances they return, so
+ * the caller must ensure that the reference is dropped appropriately.
+ */
+struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id);
+struct l2tp_tunnel *l2tp_tunnel_get_next(const struct net *net, unsigned long *key);
+
+struct l2tp_session *l2tp_v3_session_get(const struct net *net, struct sock *sk, u32 session_id);
+struct l2tp_session *l2tp_v2_session_get(const struct net *net, u16 tunnel_id, u16 session_id);
+struct l2tp_session *l2tp_session_get(const struct net *net, struct sock *sk, int pver,
+ u32 tunnel_id, u32 session_id);
+struct l2tp_session *l2tp_session_get_next(const struct net *net, struct sock *sk, int pver,
+ u32 tunnel_id, unsigned long *key);
struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net,
const char *ifname);
-int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id,
+/* Tunnel and session lifetime management.
+ * Creation of a new instance is a two-step process: create, then register.
+ * Destruction is triggered using the *_delete functions, and completes asynchronously.
+ */
+int l2tp_tunnel_create(int fd, int version, u32 tunnel_id,
u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg,
struct l2tp_tunnel **tunnelp);
int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net,
struct l2tp_tunnel_cfg *cfg);
-
void l2tp_tunnel_delete(struct l2tp_tunnel *tunnel);
+
struct l2tp_session *l2tp_session_create(int priv_size,
struct l2tp_tunnel *tunnel,
u32 session_id, u32 peer_session_id,
struct l2tp_session_cfg *cfg);
int l2tp_session_register(struct l2tp_session *session,
struct l2tp_tunnel *tunnel);
+void l2tp_session_delete(struct l2tp_session *session);
-void __l2tp_session_unhash(struct l2tp_session *session);
-int l2tp_session_delete(struct l2tp_session *session);
-void l2tp_session_free(struct l2tp_session *session);
+/* Receive path helpers. If data sequencing is enabled for the session these
+ * functions handle queuing and reordering prior to passing packets to the
+ * pseudowire code to be passed to userspace.
+ */
void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
unsigned char *ptr, unsigned char *optr, u16 hdrflags,
int length);
int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb);
-void l2tp_session_set_header_len(struct l2tp_session *session, int version);
-int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb,
- int hdr_len);
+/* Transmit path helpers for sending packets over the tunnel socket. */
+void l2tp_session_set_header_len(struct l2tp_session *session, int version,
+ enum l2tp_encap_type encap);
+int l2tp_xmit_skb(struct l2tp_session *session, struct sk_buff *skb);
-int l2tp_nl_register_ops(enum l2tp_pwtype pw_type,
- const struct l2tp_nl_cmd_ops *ops);
+/* Pseudowire management.
+ * Pseudowires should register with l2tp core on module init, and unregister
+ * on module exit.
+ */
+int l2tp_nl_register_ops(enum l2tp_pwtype pw_type, const struct l2tp_nl_cmd_ops *ops);
void l2tp_nl_unregister_ops(enum l2tp_pwtype pw_type);
-int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg);
-static inline void l2tp_tunnel_inc_refcount(struct l2tp_tunnel *tunnel)
-{
- refcount_inc(&tunnel->ref_count);
-}
-
-static inline void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel)
-{
- if (refcount_dec_and_test(&tunnel->ref_count))
- l2tp_tunnel_free(tunnel);
-}
-
-/* Session reference counts. Incremented when code obtains a reference
- * to a session.
- */
-static inline void l2tp_session_inc_refcount(struct l2tp_session *session)
-{
- refcount_inc(&session->ref_count);
-}
+/* IOCTL helper for IP encap modules. */
+int l2tp_ioctl(struct sock *sk, int cmd, int *karg);
-static inline void l2tp_session_dec_refcount(struct l2tp_session *session)
-{
- if (refcount_dec_and_test(&session->ref_count))
- l2tp_session_free(session);
-}
+struct l2tp_tunnel *l2tp_sk_to_tunnel(const struct sock *sk);
static inline int l2tp_get_l2specific_len(struct l2tp_session *session)
{
@@ -301,18 +314,25 @@ static inline bool l2tp_tunnel_uses_xfrm(const struct l2tp_tunnel *tunnel)
}
#endif
-#define l2tp_printk(ptr, type, func, fmt, ...) \
-do { \
- if (((ptr)->debug) & (type)) \
- func(fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define l2tp_warn(ptr, type, fmt, ...) \
- l2tp_printk(ptr, type, pr_warn, fmt, ##__VA_ARGS__)
-#define l2tp_info(ptr, type, fmt, ...) \
- l2tp_printk(ptr, type, pr_info, fmt, ##__VA_ARGS__)
-#define l2tp_dbg(ptr, type, fmt, ...) \
- l2tp_printk(ptr, type, pr_debug, fmt, ##__VA_ARGS__)
+static inline int l2tp_v3_ensure_opt_in_linear(struct l2tp_session *session, struct sk_buff *skb,
+ unsigned char **ptr, unsigned char **optr)
+{
+ int opt_len = session->peer_cookie_len + l2tp_get_l2specific_len(session);
+
+ if (opt_len > 0) {
+ int off = *ptr - *optr;
+
+ if (!pskb_may_pull(skb, off + opt_len))
+ return -1;
+
+ if (skb->data != *optr) {
+ *optr = skb->data;
+ *ptr = skb->data + off;
+ }
+ }
+
+ return 0;
+}
#define MODULE_ALIAS_L2TP_PWTYPE(type) \
MODULE_ALIAS("net-l2tp-type-" __stringify(type))
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index 9821a1458555..5cfaab7d0890 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -1,12 +1,7 @@
-/*
- * L2TP subsystem debugfs
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* L2TP subsystem debugfs
*
* Copyright (c) 2010 Katalix Systems Ltd
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -35,12 +30,12 @@
#include "l2tp_core.h"
static struct dentry *rootdir;
-static struct dentry *tunnels;
struct l2tp_dfs_seq_data {
- struct net *net;
- int tunnel_idx; /* current tunnel */
- int session_idx; /* index of session within current tunnel */
+ struct net *net;
+ netns_tracker ns_tracker;
+ unsigned long tkey; /* lookup key of current tunnel */
+ unsigned long skey; /* lookup key of current session */
struct l2tp_tunnel *tunnel;
struct l2tp_session *session; /* NULL means get next tunnel */
};
@@ -49,26 +44,27 @@ static void l2tp_dfs_next_tunnel(struct l2tp_dfs_seq_data *pd)
{
/* Drop reference taken during previous invocation */
if (pd->tunnel)
- l2tp_tunnel_dec_refcount(pd->tunnel);
+ l2tp_tunnel_put(pd->tunnel);
- pd->tunnel = l2tp_tunnel_get_nth(pd->net, pd->tunnel_idx);
- pd->tunnel_idx++;
+ pd->tunnel = l2tp_tunnel_get_next(pd->net, &pd->tkey);
+ pd->tkey++;
}
static void l2tp_dfs_next_session(struct l2tp_dfs_seq_data *pd)
{
/* Drop reference taken during previous invocation */
if (pd->session)
- l2tp_session_dec_refcount(pd->session);
+ l2tp_session_put(pd->session);
- pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx);
- pd->session_idx++;
+ pd->session = l2tp_session_get_next(pd->net, pd->tunnel->sock,
+ pd->tunnel->version,
+ pd->tunnel->tunnel_id, &pd->skey);
+ pd->skey++;
- if (pd->session == NULL) {
- pd->session_idx = 0;
+ if (!pd->session) {
+ pd->skey = 0;
l2tp_dfs_next_tunnel(pd);
}
-
}
static void *l2tp_dfs_seq_start(struct seq_file *m, loff_t *offs)
@@ -79,23 +75,25 @@ static void *l2tp_dfs_seq_start(struct seq_file *m, loff_t *offs)
if (!pos)
goto out;
- BUG_ON(m->private == NULL);
+ if (WARN_ON(!m->private)) {
+ pd = NULL;
+ goto out;
+ }
pd = m->private;
- if (pd->tunnel == NULL)
+ if (!pd->tunnel)
l2tp_dfs_next_tunnel(pd);
else
l2tp_dfs_next_session(pd);
/* NULL tunnel and session indicates end of list */
- if ((pd->tunnel == NULL) && (pd->session == NULL))
+ if (!pd->tunnel && !pd->session)
pd = NULL;
out:
return pd;
}
-
static void *l2tp_dfs_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
(*pos)++;
@@ -113,11 +111,11 @@ static void l2tp_dfs_seq_stop(struct seq_file *p, void *v)
* or l2tp_dfs_next_tunnel().
*/
if (pd->session) {
- l2tp_session_dec_refcount(pd->session);
+ l2tp_session_put(pd->session);
pd->session = NULL;
}
if (pd->tunnel) {
- l2tp_tunnel_dec_refcount(pd->tunnel);
+ l2tp_tunnel_put(pd->tunnel);
pd->tunnel = NULL;
}
}
@@ -125,24 +123,18 @@ static void l2tp_dfs_seq_stop(struct seq_file *p, void *v)
static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v)
{
struct l2tp_tunnel *tunnel = v;
+ struct l2tp_session *session;
int session_count = 0;
- int hash;
- struct hlist_node *walk;
- struct hlist_node *tmp;
-
- read_lock_bh(&tunnel->hlist_lock);
- for (hash = 0; hash < L2TP_HASH_SIZE; hash++) {
- hlist_for_each_safe(walk, tmp, &tunnel->session_hlist[hash]) {
- struct l2tp_session *session;
- session = hlist_entry(walk, struct l2tp_session, hlist);
- if (session->session_id == 0)
- continue;
+ rcu_read_lock_bh();
+ list_for_each_entry_rcu(session, &tunnel->session_list, list) {
+ /* Session ID of zero is a dummy/reserved value used by pppol2tp */
+ if (session->session_id == 0)
+ continue;
- session_count++;
- }
+ session_count++;
}
- read_unlock_bh(&tunnel->hlist_lock);
+ rcu_read_unlock_bh();
seq_printf(m, "\nTUNNEL %u peer %u", tunnel->tunnel_id, tunnel->peer_tunnel_id);
if (tunnel->sock) {
@@ -153,11 +145,13 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v)
const struct ipv6_pinfo *np = inet6_sk(tunnel->sock);
seq_printf(m, " from %pI6c to %pI6c\n",
- &np->saddr, &tunnel->sock->sk_v6_daddr);
- } else
+ &np->saddr, &tunnel->sock->sk_v6_daddr);
+ }
#endif
- seq_printf(m, " from %pI4 to %pI4\n",
- &inet->inet_saddr, &inet->inet_daddr);
+ if (tunnel->sock->sk_family == AF_INET)
+ seq_printf(m, " from %pI4 to %pI4\n",
+ &inet->inet_saddr, &inet->inet_daddr);
+
if (tunnel->encap == L2TP_ENCAPTYPE_UDP)
seq_printf(m, " source port %hu, dest port %hu\n",
ntohs(inet->inet_sport), ntohs(inet->inet_dport));
@@ -169,8 +163,8 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v)
seq_printf(m, " %d sessions, refcnt %d/%d\n", session_count,
tunnel->sock ? refcount_read(&tunnel->sock->sk_refcnt) : 0,
refcount_read(&tunnel->ref_count));
- seq_printf(m, " %08x rx %ld/%ld/%ld rx %ld/%ld/%ld\n",
- tunnel->debug,
+ seq_printf(m, " %08x tx %ld/%ld/%ld rx %ld/%ld/%ld\n",
+ 0,
atomic_long_read(&tunnel->stats.tx_packets),
atomic_long_read(&tunnel->stats.tx_bytes),
atomic_long_read(&tunnel->stats.tx_errors),
@@ -189,15 +183,15 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
session->pwtype == L2TP_PWTYPE_PPP ? "PPP" :
"");
if (session->send_seq || session->recv_seq)
- seq_printf(m, " nr %hu, ns %hu\n", session->nr, session->ns);
+ seq_printf(m, " nr %u, ns %u\n", session->nr, session->ns);
seq_printf(m, " refcnt %d\n", refcount_read(&session->ref_count));
seq_printf(m, " config 0/0/%c/%c/-/%s %08x %u\n",
session->recv_seq ? 'R' : '-',
session->send_seq ? 'S' : '-',
session->lns_mode ? "LNS" : "LAC",
- session->debug,
+ 0,
jiffies_to_msecs(session->reorder_timeout));
- seq_printf(m, " offset 0 l2specific %hu/%hu\n",
+ seq_printf(m, " offset 0 l2specific %hu/%d\n",
session->l2specific_type, l2tp_get_l2specific_len(session));
if (session->cookie_len) {
seq_printf(m, " cookie %02x%02x%02x%02x",
@@ -207,7 +201,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
seq_printf(m, "%02x%02x%02x%02x",
session->cookie[4], session->cookie[5],
session->cookie[6], session->cookie[7]);
- seq_printf(m, "\n");
+ seq_puts(m, "\n");
}
if (session->peer_cookie_len) {
seq_printf(m, " peer cookie %02x%02x%02x%02x",
@@ -217,10 +211,10 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
seq_printf(m, "%02x%02x%02x%02x",
session->peer_cookie[4], session->peer_cookie[5],
session->peer_cookie[6], session->peer_cookie[7]);
- seq_printf(m, "\n");
+ seq_puts(m, "\n");
}
- seq_printf(m, " %hu/%hu tx %ld/%ld/%ld rx %ld/%ld/%ld\n",
+ seq_printf(m, " %u/%u tx %ld/%ld/%ld rx %ld/%ld/%ld\n",
session->nr, session->ns,
atomic_long_read(&session->stats.tx_packets),
atomic_long_read(&session->stats.tx_bytes),
@@ -229,7 +223,7 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
atomic_long_read(&session->stats.rx_bytes),
atomic_long_read(&session->stats.rx_errors));
- if (session->show != NULL)
+ if (session->show)
session->show(m, session);
}
@@ -276,7 +270,7 @@ static int l2tp_dfs_seq_open(struct inode *inode, struct file *file)
int rc = -ENOMEM;
pd = kzalloc(sizeof(*pd), GFP_KERNEL);
- if (pd == NULL)
+ if (!pd)
goto out;
/* Derive the network namespace from the pid opening the
@@ -287,7 +281,7 @@ static int l2tp_dfs_seq_open(struct inode *inode, struct file *file)
rc = PTR_ERR(pd->net);
goto err_free_pd;
}
-
+ netns_tracker_alloc(pd->net, &pd->ns_tracker, GFP_KERNEL);
rc = seq_open(file, &l2tp_dfs_seq_ops);
if (rc)
goto err_free_net;
@@ -299,7 +293,7 @@ out:
return rc;
err_free_net:
- put_net(pd->net);
+ put_net_track(pd->net, &pd->ns_tracker);
err_free_pd:
kfree(pd);
goto out;
@@ -313,7 +307,7 @@ static int l2tp_dfs_seq_release(struct inode *inode, struct file *file)
seq = file->private_data;
pd = seq->private;
if (pd->net)
- put_net(pd->net);
+ put_net_track(pd->net, &pd->ns_tracker);
kfree(pd);
seq_release(inode, file);
@@ -330,32 +324,18 @@ static const struct file_operations l2tp_dfs_fops = {
static int __init l2tp_debugfs_init(void)
{
- int rc = 0;
-
rootdir = debugfs_create_dir("l2tp", NULL);
- if (IS_ERR(rootdir)) {
- rc = PTR_ERR(rootdir);
- rootdir = NULL;
- goto out;
- }
- tunnels = debugfs_create_file("tunnels", 0600, rootdir, NULL, &l2tp_dfs_fops);
- if (tunnels == NULL)
- rc = -EIO;
+ debugfs_create_file("tunnels", 0600, rootdir, NULL, &l2tp_dfs_fops);
pr_info("L2TP debugfs support\n");
-out:
- if (rc)
- pr_warn("unable to init\n");
-
- return rc;
+ return 0;
}
static void __exit l2tp_debugfs_exit(void)
{
- debugfs_remove(tunnels);
- debugfs_remove(rootdir);
+ debugfs_remove_recursive(rootdir);
}
module_init(l2tp_debugfs_init);
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 8aadc4f3bb9e..cf0b66f4fb29 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -1,12 +1,7 @@
-/*
- * L2TPv3 ethernet pseudowire driver
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* L2TPv3 ethernet pseudowire driver
*
* Copyright (c) 2008,2009,2010 Katalix Systems Ltd
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -30,6 +25,7 @@
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/netdev_lock.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/udp.h>
@@ -42,12 +38,6 @@
/* via netdev_priv() */
struct l2tp_eth {
struct l2tp_session *session;
- atomic_long_t tx_bytes;
- atomic_long_t tx_packets;
- atomic_long_t tx_dropped;
- atomic_long_t rx_bytes;
- atomic_long_t rx_packets;
- atomic_long_t rx_errors;
};
/* via l2tp_session_priv() */
@@ -55,7 +45,6 @@ struct l2tp_eth_sess {
struct net_device __rcu *dev;
};
-
static int l2tp_eth_dev_init(struct net_device *dev)
{
eth_hw_addr_random(dev);
@@ -77,45 +66,29 @@ static void l2tp_eth_dev_uninit(struct net_device *dev)
*/
}
-static int l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct l2tp_eth *priv = netdev_priv(dev);
struct l2tp_session *session = priv->session;
unsigned int len = skb->len;
- int ret = l2tp_xmit_skb(session, skb, session->hdr_len);
-
- if (likely(ret == NET_XMIT_SUCCESS)) {
- atomic_long_add(len, &priv->tx_bytes);
- atomic_long_inc(&priv->tx_packets);
- } else {
- atomic_long_inc(&priv->tx_dropped);
- }
- return NETDEV_TX_OK;
-}
+ int ret = l2tp_xmit_skb(session, skb);
-static void l2tp_eth_get_stats64(struct net_device *dev,
- struct rtnl_link_stats64 *stats)
-{
- struct l2tp_eth *priv = netdev_priv(dev);
-
- stats->tx_bytes = (unsigned long) atomic_long_read(&priv->tx_bytes);
- stats->tx_packets = (unsigned long) atomic_long_read(&priv->tx_packets);
- stats->tx_dropped = (unsigned long) atomic_long_read(&priv->tx_dropped);
- stats->rx_bytes = (unsigned long) atomic_long_read(&priv->rx_bytes);
- stats->rx_packets = (unsigned long) atomic_long_read(&priv->rx_packets);
- stats->rx_errors = (unsigned long) atomic_long_read(&priv->rx_errors);
+ if (likely(ret == NET_XMIT_SUCCESS))
+ dev_dstats_tx_add(dev, len);
+ else
+ dev_dstats_tx_dropped(dev);
+ return NETDEV_TX_OK;
}
static const struct net_device_ops l2tp_eth_netdev_ops = {
.ndo_init = l2tp_eth_dev_init,
.ndo_uninit = l2tp_eth_dev_uninit,
.ndo_start_xmit = l2tp_eth_dev_xmit,
- .ndo_get_stats64 = l2tp_eth_get_stats64,
.ndo_set_mac_address = eth_mac_addr,
};
-static struct device_type l2tpeth_type = {
+static const struct device_type l2tpeth_type = {
.name = "l2tpeth",
};
@@ -124,27 +97,16 @@ static void l2tp_eth_dev_setup(struct net_device *dev)
SET_NETDEV_DEVTYPE(dev, &l2tpeth_type);
ether_setup(dev);
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
- dev->features |= NETIF_F_LLTX;
+ dev->lltx = true;
dev->netdev_ops = &l2tp_eth_netdev_ops;
dev->needs_free_netdev = true;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;
}
static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len)
{
struct l2tp_eth_sess *spriv = l2tp_session_priv(session);
struct net_device *dev;
- struct l2tp_eth *priv;
-
- if (session->debug & L2TP_MSG_DATA) {
- unsigned int length;
-
- length = min(32u, skb->len);
- if (!pskb_may_pull(skb, length))
- goto error;
-
- pr_debug("%s: eth recv\n", session->name);
- print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, skb->data, length);
- }
if (!pskb_may_pull(skb, ETH_HLEN))
goto error;
@@ -154,21 +116,22 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb,
/* checksums verified by L2TP */
skb->ip_summed = CHECKSUM_NONE;
+ /* drop outer flow-hash */
+ skb_clear_hash(skb);
+
skb_dst_drop(skb);
- nf_reset(skb);
+ nf_reset_ct(skb);
rcu_read_lock();
dev = rcu_dereference(spriv->dev);
if (!dev)
goto error_rcu;
- priv = netdev_priv(dev);
- if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) {
- atomic_long_inc(&priv->rx_packets);
- atomic_long_add(data_len, &priv->rx_bytes);
- } else {
- atomic_long_inc(&priv->rx_errors);
- }
+ if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS)
+ dev_dstats_rx_add(dev, data_len);
+ else
+ DEV_STATS_INC(dev, rx_errors);
+
rcu_read_unlock();
return;
@@ -272,7 +235,7 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel,
int rc;
if (cfg->ifname) {
- strlcpy(name, cfg->ifname, IFNAMSIZ);
+ strscpy(name, cfg->ifname, IFNAMSIZ);
name_assign_type = NET_NAME_USER;
} else {
strcpy(name, L2TP_ETH_DEV_NAME);
@@ -308,7 +271,7 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel,
spriv = l2tp_session_priv(session);
- l2tp_session_inc_refcount(session);
+ refcount_inc(&session->ref_count);
rtnl_lock();
@@ -326,39 +289,37 @@ static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel,
if (rc < 0) {
rtnl_unlock();
l2tp_session_delete(session);
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
free_netdev(dev);
return rc;
}
- strlcpy(session->ifname, dev->name, IFNAMSIZ);
+ strscpy(session->ifname, dev->name, IFNAMSIZ);
rcu_assign_pointer(spriv->dev, dev);
rtnl_unlock();
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
__module_get(THIS_MODULE);
return 0;
err_sess_dev:
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
free_netdev(dev);
err_sess:
- kfree(session);
+ l2tp_session_put(session);
err:
return rc;
}
-
static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = {
.session_create = l2tp_eth_create,
.session_delete = l2tp_session_delete,
};
-
static int __init l2tp_eth_init(void)
{
int err = 0;
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 35f6f86d4dcc..cac1ff59cb83 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -1,12 +1,7 @@
-/*
- * L2TPv3 IP encapsulation support
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* L2TPv3 IP encapsulation support
*
* Copyright (c) 2008,2009,2010 Katalix Systems Ltd
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -24,13 +19,22 @@
#include <net/icmp.h>
#include <net/udp.h>
#include <net/inet_common.h>
-#include <net/inet_hashtables.h>
#include <net/tcp_states.h>
#include <net/protocol.h>
#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include "l2tp_core.h"
+/* per-net private data for this module */
+static unsigned int l2tp_ip_net_id;
+struct l2tp_ip_net {
+ rwlock_t l2tp_ip_lock;
+ struct hlist_head l2tp_ip_table;
+ struct hlist_head l2tp_ip_bind_table;
+};
+
struct l2tp_ip_sock {
/* inet_sock has to be the first member of l2tp_ip_sock */
struct inet_sock inet;
@@ -39,28 +43,32 @@ struct l2tp_ip_sock {
u32 peer_conn_id;
};
-static DEFINE_RWLOCK(l2tp_ip_lock);
-static struct hlist_head l2tp_ip_table;
-static struct hlist_head l2tp_ip_bind_table;
-
-static inline struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk)
+static struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk)
{
return (struct l2tp_ip_sock *)sk;
}
+static struct l2tp_ip_net *l2tp_ip_pernet(const struct net *net)
+{
+ return net_generic(net, l2tp_ip_net_id);
+}
+
static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr,
__be32 raddr, int dif, u32 tunnel_id)
{
+ struct l2tp_ip_net *pn = l2tp_ip_pernet(net);
struct sock *sk;
- sk_for_each_bound(sk, &l2tp_ip_bind_table) {
+ sk_for_each_bound(sk, &pn->l2tp_ip_bind_table) {
const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk);
const struct inet_sock *inet = inet_sk(sk);
+ int bound_dev_if;
if (!net_eq(sock_net(sk), net))
continue;
- if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif)
+ bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+ if (bound_dev_if && dif && bound_dev_if != dif)
continue;
if (inet->inet_rcv_saddr && laddr &&
@@ -117,6 +125,7 @@ found:
static int l2tp_ip_recv(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
+ struct l2tp_ip_net *pn;
struct sock *sk;
u32 session_id;
u32 tunnel_id;
@@ -124,14 +133,16 @@ static int l2tp_ip_recv(struct sk_buff *skb)
struct l2tp_session *session;
struct l2tp_tunnel *tunnel = NULL;
struct iphdr *iph;
- int length;
+
+ pn = l2tp_ip_pernet(net);
if (!pskb_may_pull(skb, 4))
goto discard;
/* Point to L2TP header */
- optr = ptr = skb->data;
- session_id = ntohl(*((__be32 *) ptr));
+ optr = skb->data;
+ ptr = skb->data;
+ session_id = ntohl(*((__be32 *)ptr));
ptr += 4;
/* RFC3931: L2TP/IP packets have the first 4 bytes containing
@@ -144,7 +155,7 @@ static int l2tp_ip_recv(struct sk_buff *skb)
}
/* Ok, this is a data packet. Lookup the session. */
- session = l2tp_session_get(net, session_id);
+ session = l2tp_v3_session_get(net, NULL, session_id);
if (!session)
goto discard;
@@ -152,21 +163,11 @@ static int l2tp_ip_recv(struct sk_buff *skb)
if (!tunnel)
goto discard_sess;
- /* Trace packet contents, if enabled */
- if (tunnel->debug & L2TP_MSG_DATA) {
- length = min(32u, skb->len);
- if (!pskb_may_pull(skb, length))
- goto discard_sess;
-
- /* Point to L2TP header */
- optr = ptr = skb->data;
- ptr += 4;
- pr_debug("%s: ip recv\n", tunnel->name);
- print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
- }
+ if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr))
+ goto discard_sess;
l2tp_recv_common(session, skb, ptr, optr, 0, skb->len);
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
return 0;
@@ -178,28 +179,28 @@ pass_up:
if ((skb->data[0] & 0xc0) != 0xc0)
goto discard;
- tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
+ tunnel_id = ntohl(*(__be32 *)&skb->data[4]);
iph = (struct iphdr *)skb_network_header(skb);
- read_lock_bh(&l2tp_ip_lock);
+ read_lock_bh(&pn->l2tp_ip_lock);
sk = __l2tp_ip_bind_lookup(net, iph->daddr, iph->saddr, inet_iif(skb),
tunnel_id);
if (!sk) {
- read_unlock_bh(&l2tp_ip_lock);
+ read_unlock_bh(&pn->l2tp_ip_lock);
goto discard;
}
sock_hold(sk);
- read_unlock_bh(&l2tp_ip_lock);
+ read_unlock_bh(&pn->l2tp_ip_lock);
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_put;
- nf_reset(skb);
+ nf_reset_ct(skb);
return sk_receive_skb(sk, skb, 1);
discard_sess:
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
goto discard;
discard_put:
@@ -210,44 +211,69 @@ discard:
return 0;
}
+static int l2tp_ip_hash(struct sock *sk)
+{
+ struct l2tp_ip_net *pn = l2tp_ip_pernet(sock_net(sk));
+
+ if (sk_unhashed(sk)) {
+ write_lock_bh(&pn->l2tp_ip_lock);
+ sk_add_node(sk, &pn->l2tp_ip_table);
+ write_unlock_bh(&pn->l2tp_ip_lock);
+ }
+ return 0;
+}
+
+static void l2tp_ip_unhash(struct sock *sk)
+{
+ struct l2tp_ip_net *pn = l2tp_ip_pernet(sock_net(sk));
+
+ if (sk_unhashed(sk))
+ return;
+ write_lock_bh(&pn->l2tp_ip_lock);
+ sk_del_node_init(sk);
+ write_unlock_bh(&pn->l2tp_ip_lock);
+}
+
static int l2tp_ip_open(struct sock *sk)
{
/* Prevent autobind. We don't have ports. */
inet_sk(sk)->inet_num = IPPROTO_L2TP;
- write_lock_bh(&l2tp_ip_lock);
- sk_add_node(sk, &l2tp_ip_table);
- write_unlock_bh(&l2tp_ip_lock);
-
+ l2tp_ip_hash(sk);
return 0;
}
static void l2tp_ip_close(struct sock *sk, long timeout)
{
- write_lock_bh(&l2tp_ip_lock);
+ struct l2tp_ip_net *pn = l2tp_ip_pernet(sock_net(sk));
+
+ write_lock_bh(&pn->l2tp_ip_lock);
hlist_del_init(&sk->sk_bind_node);
sk_del_node_init(sk);
- write_unlock_bh(&l2tp_ip_lock);
+ write_unlock_bh(&pn->l2tp_ip_lock);
sk_common_release(sk);
}
static void l2tp_ip_destroy_sock(struct sock *sk)
{
- struct sk_buff *skb;
- struct l2tp_tunnel *tunnel = sk->sk_user_data;
+ struct l2tp_tunnel *tunnel;
- while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
- kfree_skb(skb);
+ __skb_queue_purge(&sk->sk_write_queue);
- if (tunnel)
+ tunnel = l2tp_sk_to_tunnel(sk);
+ if (tunnel) {
l2tp_tunnel_delete(tunnel);
+ l2tp_tunnel_put(tunnel);
+ }
}
-static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int l2tp_ip_bind(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
- struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *) uaddr;
+ struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *)uaddr;
struct net *net = sock_net(sk);
+ struct l2tp_ip_net *pn;
int ret;
int chk_addr_ret;
@@ -271,15 +297,18 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
goto out;
- if (addr->l2tp_addr.s_addr)
- inet->inet_rcv_saddr = inet->inet_saddr = addr->l2tp_addr.s_addr;
+ if (addr->l2tp_addr.s_addr) {
+ inet->inet_rcv_saddr = addr->l2tp_addr.s_addr;
+ inet->inet_saddr = addr->l2tp_addr.s_addr;
+ }
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
inet->inet_saddr = 0; /* Use device */
- write_lock_bh(&l2tp_ip_lock);
+ pn = l2tp_ip_pernet(net);
+ write_lock_bh(&pn->l2tp_ip_lock);
if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, 0,
sk->sk_bound_dev_if, addr->l2tp_conn_id)) {
- write_unlock_bh(&l2tp_ip_lock);
+ write_unlock_bh(&pn->l2tp_ip_lock);
ret = -EADDRINUSE;
goto out;
}
@@ -287,9 +316,9 @@ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk_dst_reset(sk);
l2tp_ip_sk(sk)->conn_id = addr->l2tp_conn_id;
- sk_add_bind_node(sk, &l2tp_ip_bind_table);
+ sk_add_bind_node(sk, &pn->l2tp_ip_bind_table);
sk_del_node_init(sk);
- write_unlock_bh(&l2tp_ip_lock);
+ write_unlock_bh(&pn->l2tp_ip_lock);
ret = 0;
sock_reset_flag(sk, SOCK_ZAPPED);
@@ -300,9 +329,11 @@ out:
return ret;
}
-static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int l2tp_ip_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
{
- struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *) uaddr;
+ struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr;
+ struct l2tp_ip_net *pn = l2tp_ip_pernet(sock_net(sk));
int rc;
if (addr_len < sizeof(*lsa))
@@ -325,10 +356,10 @@ static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len
l2tp_ip_sk(sk)->peer_conn_id = lsa->l2tp_conn_id;
- write_lock_bh(&l2tp_ip_lock);
+ write_lock_bh(&pn->l2tp_ip_lock);
hlist_del_init(&sk->sk_bind_node);
- sk_add_bind_node(sk, &l2tp_ip_bind_table);
- write_unlock_bh(&l2tp_ip_lock);
+ sk_add_bind_node(sk, &pn->l2tp_ip_bind_table);
+ write_unlock_bh(&pn->l2tp_ip_lock);
out_sk:
release_sock(sk);
@@ -361,6 +392,7 @@ static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr,
lsa->l2tp_addr.s_addr = inet->inet_daddr;
} else {
__be32 addr = inet->inet_rcv_saddr;
+
if (!addr)
addr = inet->inet_saddr;
lsa->l2tp_conn_id = lsk->conn_id;
@@ -395,7 +427,6 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int rc;
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = NULL;
- struct flowi4 *fl4;
int connected = 0;
__be32 daddr;
@@ -408,6 +439,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
/* Get and verify the address. */
if (msg->msg_name) {
DECLARE_SOCKADDR(struct sockaddr_l2tpip *, lip, msg->msg_name);
+
rc = -EINVAL;
if (msg->msg_namelen < sizeof(*lip))
goto out;
@@ -424,7 +456,6 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (sk->sk_state != TCP_ESTABLISHED)
goto out;
- daddr = inet->inet_daddr;
connected = 1;
}
@@ -442,7 +473,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
skb_reset_transport_header(skb);
/* Insert 0 session_id */
- *((__be32 *) skb_put(skb, 4)) = 0;
+ *((__be32 *)skb_put(skb, 4)) = 0;
/* Copy user data into skb */
rc = memcpy_from_msg(skb_put(skb, len), msg, len);
@@ -451,29 +482,24 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
goto error;
}
- fl4 = &inet->cork.fl.u.ip4;
if (connected)
- rt = (struct rtable *) __sk_dst_check(sk, 0);
+ rt = dst_rtable(__sk_dst_check(sk, 0));
rcu_read_lock();
- if (rt == NULL) {
- const struct ip_options_rcu *inet_opt;
+ if (!rt) {
+ struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
- inet_opt = rcu_dereference(inet->inet_opt);
+ inet_sk_init_flowi4(inet, fl4);
- /* Use correct destination address if we have options. */
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
+ /* Overwrite ->daddr if msg->msg_name was provided */
+ if (!connected)
+ fl4->daddr = daddr;
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
* itself out.
*/
- rt = ip_route_output_ports(sock_net(sk), fl4, sk,
- daddr, inet->inet_saddr,
- inet->inet_dport, inet->inet_sport,
- sk->sk_protocol, RT_CONN_FLAGS(sk),
- sk->sk_bound_dev_if);
+ rt = ip_route_output_flow(sock_net(sk), fl4, sk);
if (IS_ERR(rt))
goto no_route;
if (connected) {
@@ -484,7 +510,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
}
- /* We dont need to clone dst here, it is guaranteed to not disappear.
+ /* We don't need to clone dst here, it is guaranteed to not disappear.
* __dev_xmit_skb() might force a refcount if needed.
*/
skb_dst_set_noref(skb, &rt->dst);
@@ -511,7 +537,7 @@ no_route:
}
static int l2tp_ip_recvmsg(struct sock *sk, struct msghdr *msg,
- size_t len, int noblock, int flags, int *addr_len)
+ size_t len, int flags, int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
size_t copied = 0;
@@ -522,7 +548,7 @@ static int l2tp_ip_recvmsg(struct sock *sk, struct msghdr *msg,
if (flags & MSG_OOB)
goto out;
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto out;
@@ -546,7 +572,7 @@ static int l2tp_ip_recvmsg(struct sock *sk, struct msghdr *msg,
memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
*addr_len = sizeof(*sin);
}
- if (inet->cmsg_flags)
+ if (inet_cmsg_flags(inet))
ip_cmsg_recv(msg, skb);
if (flags & MSG_TRUNC)
copied = skb->len;
@@ -556,19 +582,18 @@ out:
return err ? err : copied;
}
-int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+int l2tp_ioctl(struct sock *sk, int cmd, int *karg)
{
struct sk_buff *skb;
- int amount;
switch (cmd) {
case SIOCOUTQ:
- amount = sk_wmem_alloc_get(sk);
+ *karg = sk_wmem_alloc_get(sk);
break;
case SIOCINQ:
spin_lock_bh(&sk->sk_receive_queue.lock);
skb = skb_peek(&sk->sk_receive_queue);
- amount = skb ? skb->len : 0;
+ *karg = skb ? skb->len : 0;
spin_unlock_bh(&sk->sk_receive_queue.lock);
break;
@@ -576,9 +601,9 @@ int l2tp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return -ENOIOCTLCMD;
}
- return put_user(amount, (int __user *)arg);
+ return 0;
}
-EXPORT_SYMBOL(l2tp_ioctl);
+EXPORT_SYMBOL_GPL(l2tp_ioctl);
static struct proto l2tp_ip_prot = {
.name = "L2TP/IP",
@@ -595,13 +620,9 @@ static struct proto l2tp_ip_prot = {
.sendmsg = l2tp_ip_sendmsg,
.recvmsg = l2tp_ip_recvmsg,
.backlog_rcv = l2tp_ip_backlog_recv,
- .hash = inet_hash,
- .unhash = inet_unhash,
+ .hash = l2tp_ip_hash,
+ .unhash = l2tp_ip_unhash,
.obj_size = sizeof(struct l2tp_ip_sock),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ip_setsockopt,
- .compat_getsockopt = compat_ip_getsockopt,
-#endif
};
static const struct proto_ops l2tp_ip_ops = {
@@ -615,6 +636,7 @@ static const struct proto_ops l2tp_ip_ops = {
.getname = l2tp_ip_getname,
.poll = datagram_poll,
.ioctl = inet_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
@@ -622,11 +644,6 @@ static const struct proto_ops l2tp_ip_ops = {
.sendmsg = inet_sendmsg,
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
-#endif
};
static struct inet_protosw l2tp_ip_protosw = {
@@ -638,7 +655,33 @@ static struct inet_protosw l2tp_ip_protosw = {
static struct net_protocol l2tp_ip_protocol __read_mostly = {
.handler = l2tp_ip_recv,
- .netns_ok = 1,
+};
+
+static __net_init int l2tp_ip_init_net(struct net *net)
+{
+ struct l2tp_ip_net *pn = net_generic(net, l2tp_ip_net_id);
+
+ rwlock_init(&pn->l2tp_ip_lock);
+ INIT_HLIST_HEAD(&pn->l2tp_ip_table);
+ INIT_HLIST_HEAD(&pn->l2tp_ip_bind_table);
+ return 0;
+}
+
+static __net_exit void l2tp_ip_exit_net(struct net *net)
+{
+ struct l2tp_ip_net *pn = l2tp_ip_pernet(net);
+
+ write_lock_bh(&pn->l2tp_ip_lock);
+ WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip_table) != 0);
+ WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip_bind_table) != 0);
+ write_unlock_bh(&pn->l2tp_ip_lock);
+}
+
+static struct pernet_operations l2tp_ip_net_ops = {
+ .init = l2tp_ip_init_net,
+ .exit = l2tp_ip_exit_net,
+ .id = &l2tp_ip_net_id,
+ .size = sizeof(struct l2tp_ip_net),
};
static int __init l2tp_ip_init(void)
@@ -647,19 +690,25 @@ static int __init l2tp_ip_init(void)
pr_info("L2TP IP encapsulation support (L2TPv3)\n");
+ err = register_pernet_device(&l2tp_ip_net_ops);
+ if (err)
+ goto out;
+
err = proto_register(&l2tp_ip_prot, 1);
if (err != 0)
- goto out;
+ goto out1;
err = inet_add_protocol(&l2tp_ip_protocol, IPPROTO_L2TP);
if (err)
- goto out1;
+ goto out2;
inet_register_protosw(&l2tp_ip_protosw);
return 0;
-out1:
+out2:
proto_unregister(&l2tp_ip_prot);
+out1:
+ unregister_pernet_device(&l2tp_ip_net_ops);
out:
return err;
}
@@ -669,6 +718,7 @@ static void __exit l2tp_ip_exit(void)
inet_unregister_protosw(&l2tp_ip_protosw);
inet_del_protocol(&l2tp_ip_protocol, IPPROTO_L2TP);
proto_unregister(&l2tp_ip_prot);
+ unregister_pernet_device(&l2tp_ip_net_ops);
}
module_init(l2tp_ip_init);
@@ -679,8 +729,8 @@ MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
MODULE_DESCRIPTION("L2TP over IP");
MODULE_VERSION("1.0");
-/* Use the value of SOCK_DGRAM (2) directory, because __stringify doesn't like
- * enums
+/* Use the values of SOCK_DGRAM (2) as type and IPPROTO_L2TP (115) as protocol,
+ * because __stringify doesn't like enums
*/
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 2, IPPROTO_L2TP);
-MODULE_ALIAS_NET_PF_PROTO(PF_INET, IPPROTO_L2TP);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 115, 2);
+MODULE_ALIAS_NET_PF_PROTO(PF_INET, 115);
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 237f1a4a0b0c..05a396ba6a3e 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -1,12 +1,7 @@
-/*
- * L2TPv3 IP encapsulation support for IPv6
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* L2TPv3 IP encapsulation support for IPv6
*
* Copyright (c) 2012 Katalix Systems Ltd
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -24,11 +19,11 @@
#include <net/icmp.h>
#include <net/udp.h>
#include <net/inet_common.h>
-#include <net/inet_hashtables.h>
-#include <net/inet6_hashtables.h>
#include <net/tcp_states.h>
#include <net/protocol.h>
#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include <net/transp_v6.h>
#include <net/addrconf.h>
@@ -36,6 +31,14 @@
#include "l2tp_core.h"
+/* per-net private data for this module */
+static unsigned int l2tp_ip6_net_id;
+struct l2tp_ip6_net {
+ rwlock_t l2tp_ip6_lock;
+ struct hlist_head l2tp_ip6_table;
+ struct hlist_head l2tp_ip6_bind_table;
+};
+
struct l2tp_ip6_sock {
/* inet_sock has to be the first member of l2tp_ip6_sock */
struct inet_sock inet;
@@ -43,36 +46,38 @@ struct l2tp_ip6_sock {
u32 conn_id;
u32 peer_conn_id;
- /* ipv6_pinfo has to be the last member of l2tp_ip6_sock, see
- inet6_sk_generic */
struct ipv6_pinfo inet6;
};
-static DEFINE_RWLOCK(l2tp_ip6_lock);
-static struct hlist_head l2tp_ip6_table;
-static struct hlist_head l2tp_ip6_bind_table;
-
-static inline struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk)
+static struct l2tp_ip6_sock *l2tp_ip6_sk(const struct sock *sk)
{
return (struct l2tp_ip6_sock *)sk;
}
+static struct l2tp_ip6_net *l2tp_ip6_pernet(const struct net *net)
+{
+ return net_generic(net, l2tp_ip6_net_id);
+}
+
static struct sock *__l2tp_ip6_bind_lookup(const struct net *net,
const struct in6_addr *laddr,
const struct in6_addr *raddr,
int dif, u32 tunnel_id)
{
+ struct l2tp_ip6_net *pn = l2tp_ip6_pernet(net);
struct sock *sk;
- sk_for_each_bound(sk, &l2tp_ip6_bind_table) {
+ sk_for_each_bound(sk, &pn->l2tp_ip6_bind_table) {
const struct in6_addr *sk_laddr = inet6_rcv_saddr(sk);
const struct in6_addr *sk_raddr = &sk->sk_v6_daddr;
const struct l2tp_ip6_sock *l2tp = l2tp_ip6_sk(sk);
+ int bound_dev_if;
if (!net_eq(sock_net(sk), net))
continue;
- if (sk->sk_bound_dev_if && dif && sk->sk_bound_dev_if != dif)
+ bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
+ if (bound_dev_if && dif && bound_dev_if != dif)
continue;
if (sk_laddr && !ipv6_addr_any(sk_laddr) &&
@@ -130,6 +135,7 @@ found:
static int l2tp_ip6_recv(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
+ struct l2tp_ip6_net *pn;
struct sock *sk;
u32 session_id;
u32 tunnel_id;
@@ -137,14 +143,16 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
struct l2tp_session *session;
struct l2tp_tunnel *tunnel = NULL;
struct ipv6hdr *iph;
- int length;
+
+ pn = l2tp_ip6_pernet(net);
if (!pskb_may_pull(skb, 4))
goto discard;
/* Point to L2TP header */
- optr = ptr = skb->data;
- session_id = ntohl(*((__be32 *) ptr));
+ optr = skb->data;
+ ptr = skb->data;
+ session_id = ntohl(*((__be32 *)ptr));
ptr += 4;
/* RFC3931: L2TP/IP packets have the first 4 bytes containing
@@ -157,7 +165,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
}
/* Ok, this is a data packet. Lookup the session. */
- session = l2tp_session_get(net, session_id);
+ session = l2tp_v3_session_get(net, NULL, session_id);
if (!session)
goto discard;
@@ -165,21 +173,11 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
if (!tunnel)
goto discard_sess;
- /* Trace packet contents, if enabled */
- if (tunnel->debug & L2TP_MSG_DATA) {
- length = min(32u, skb->len);
- if (!pskb_may_pull(skb, length))
- goto discard_sess;
-
- /* Point to L2TP header */
- optr = ptr = skb->data;
- ptr += 4;
- pr_debug("%s: ip recv\n", tunnel->name);
- print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
- }
+ if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr))
+ goto discard_sess;
l2tp_recv_common(session, skb, ptr, optr, 0, skb->len);
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
return 0;
@@ -191,28 +189,28 @@ pass_up:
if ((skb->data[0] & 0xc0) != 0xc0)
goto discard;
- tunnel_id = ntohl(*(__be32 *) &skb->data[4]);
+ tunnel_id = ntohl(*(__be32 *)&skb->data[4]);
iph = ipv6_hdr(skb);
- read_lock_bh(&l2tp_ip6_lock);
+ read_lock_bh(&pn->l2tp_ip6_lock);
sk = __l2tp_ip6_bind_lookup(net, &iph->daddr, &iph->saddr,
inet6_iif(skb), tunnel_id);
if (!sk) {
- read_unlock_bh(&l2tp_ip6_lock);
+ read_unlock_bh(&pn->l2tp_ip6_lock);
goto discard;
}
sock_hold(sk);
- read_unlock_bh(&l2tp_ip6_lock);
+ read_unlock_bh(&pn->l2tp_ip6_lock);
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_put;
- nf_reset(skb);
+ nf_reset_ct(skb);
return sk_receive_skb(sk, skb, 1);
discard_sess:
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
goto discard;
discard_put:
@@ -223,53 +221,80 @@ discard:
return 0;
}
+static int l2tp_ip6_hash(struct sock *sk)
+{
+ struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk));
+
+ if (sk_unhashed(sk)) {
+ write_lock_bh(&pn->l2tp_ip6_lock);
+ sk_add_node(sk, &pn->l2tp_ip6_table);
+ write_unlock_bh(&pn->l2tp_ip6_lock);
+ }
+ return 0;
+}
+
+static void l2tp_ip6_unhash(struct sock *sk)
+{
+ struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk));
+
+ if (sk_unhashed(sk))
+ return;
+ write_lock_bh(&pn->l2tp_ip6_lock);
+ sk_del_node_init(sk);
+ write_unlock_bh(&pn->l2tp_ip6_lock);
+}
+
static int l2tp_ip6_open(struct sock *sk)
{
/* Prevent autobind. We don't have ports. */
inet_sk(sk)->inet_num = IPPROTO_L2TP;
- write_lock_bh(&l2tp_ip6_lock);
- sk_add_node(sk, &l2tp_ip6_table);
- write_unlock_bh(&l2tp_ip6_lock);
-
+ l2tp_ip6_hash(sk);
return 0;
}
static void l2tp_ip6_close(struct sock *sk, long timeout)
{
- write_lock_bh(&l2tp_ip6_lock);
+ struct l2tp_ip6_net *pn = l2tp_ip6_pernet(sock_net(sk));
+
+ write_lock_bh(&pn->l2tp_ip6_lock);
hlist_del_init(&sk->sk_bind_node);
sk_del_node_init(sk);
- write_unlock_bh(&l2tp_ip6_lock);
+ write_unlock_bh(&pn->l2tp_ip6_lock);
sk_common_release(sk);
}
static void l2tp_ip6_destroy_sock(struct sock *sk)
{
- struct l2tp_tunnel *tunnel = sk->sk_user_data;
+ struct l2tp_tunnel *tunnel;
lock_sock(sk);
ip6_flush_pending_frames(sk);
release_sock(sk);
- if (tunnel)
+ tunnel = l2tp_sk_to_tunnel(sk);
+ if (tunnel) {
l2tp_tunnel_delete(tunnel);
-
- inet6_destroy_sock(sk);
+ l2tp_tunnel_put(tunnel);
+ }
}
-static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+static int l2tp_ip6_bind(struct sock *sk, struct sockaddr_unsized *uaddr,
+ int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
- struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *) uaddr;
+ struct sockaddr_l2tpip6 *addr = (struct sockaddr_l2tpip6 *)uaddr;
struct net *net = sock_net(sk);
+ struct l2tp_ip6_net *pn;
__be32 v4addr = 0;
int bound_dev_if;
int addr_type;
int err;
+ pn = l2tp_ip6_pernet(net);
+
if (addr->l2tp_family != AF_INET6)
return -EINVAL;
if (addr_len < sizeof(*addr))
@@ -327,10 +352,10 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
}
rcu_read_unlock();
- write_lock_bh(&l2tp_ip6_lock);
+ write_lock_bh(&pn->l2tp_ip6_lock);
if (__l2tp_ip6_bind_lookup(net, &addr->l2tp_addr, NULL, bound_dev_if,
addr->l2tp_conn_id)) {
- write_unlock_bh(&l2tp_ip6_lock);
+ write_unlock_bh(&pn->l2tp_ip6_lock);
err = -EADDRINUSE;
goto out_unlock;
}
@@ -343,9 +368,9 @@ static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
l2tp_ip6_sk(sk)->conn_id = addr->l2tp_conn_id;
- sk_add_bind_node(sk, &l2tp_ip6_bind_table);
+ sk_add_bind_node(sk, &pn->l2tp_ip6_bind_table);
sk_del_node_init(sk);
- write_unlock_bh(&l2tp_ip6_lock);
+ write_unlock_bh(&pn->l2tp_ip6_lock);
sock_reset_flag(sk, SOCK_ZAPPED);
release_sock(sk);
@@ -359,14 +384,15 @@ out_unlock:
return err;
}
-static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr,
+static int l2tp_ip6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
int addr_len)
{
- struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *) uaddr;
- struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
+ struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr;
+ struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr;
struct in6_addr *daddr;
int addr_type;
int rc;
+ struct l2tp_ip6_net *pn;
if (addr_len < sizeof(*lsa))
return -EINVAL;
@@ -398,10 +424,11 @@ static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr,
l2tp_ip6_sk(sk)->peer_conn_id = lsa->l2tp_conn_id;
- write_lock_bh(&l2tp_ip6_lock);
+ pn = l2tp_ip6_pernet(sock_net(sk));
+ write_lock_bh(&pn->l2tp_ip6_lock);
hlist_del_init(&sk->sk_bind_node);
- sk_add_bind_node(sk, &l2tp_ip6_bind_table);
- write_unlock_bh(&l2tp_ip6_lock);
+ sk_add_bind_node(sk, &pn->l2tp_ip6_bind_table);
+ write_unlock_bh(&pn->l2tp_ip6_lock);
out_sk:
release_sock(sk);
@@ -434,7 +461,7 @@ static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr,
return -ENOTCONN;
lsa->l2tp_conn_id = lsk->peer_conn_id;
lsa->l2tp_addr = sk->sk_v6_daddr;
- if (np->sndflow)
+ if (inet6_test_bit(SNDFLOW, sk))
lsa->l2tp_flowinfo = np->flow_label;
} else {
if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
@@ -445,7 +472,7 @@ static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr,
lsa->l2tp_conn_id = lsk->conn_id;
}
if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL)
- lsa->l2tp_scope_id = sk->sk_bound_dev_if;
+ lsa->l2tp_scope_id = READ_ONCE(sk->sk_bound_dev_if);
return sizeof(*lsa);
}
@@ -473,7 +500,7 @@ static int l2tp_ip6_push_pending_frames(struct sock *sk)
int err = 0;
skb = skb_peek(&sk->sk_write_queue);
- if (skb == NULL)
+ if (!skb)
goto out;
transhdr = (__be32 *)skb_transport_header(skb);
@@ -502,28 +529,26 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct ipcm6_cookie ipc6;
int addr_len = msg->msg_namelen;
int transhdrlen = 4; /* zero session-id */
- int ulen = len + transhdrlen;
+ int ulen;
int err;
/* Rough check on arithmetic overflow,
- better check is made in ip6_append_data().
+ * better check is made in ip6_append_data().
*/
- if (len > INT_MAX)
+ if (len > INT_MAX - transhdrlen)
return -EMSGSIZE;
/* Mirror BSD error message compatibility */
if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
- /*
- * Get and verify the address.
- */
+ /* Get and verify the address */
memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_mark = sk->sk_mark;
- fl6.flowi6_uid = sk->sk_uid;
+ fl6.flowi6_mark = READ_ONCE(sk->sk_mark);
+ fl6.flowi6_uid = sk_uid(sk);
- ipcm6_init(&ipc6);
+ ipcm6_init_sk(&ipc6, sk);
if (lsa) {
if (addr_len < SIN6_LEN_RFC2133)
@@ -533,17 +558,16 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return -EAFNOSUPPORT;
daddr = &lsa->l2tp_addr;
- if (np->sndflow) {
+ if (inet6_test_bit(SNDFLOW, sk)) {
fl6.flowlabel = lsa->l2tp_flowinfo & IPV6_FLOWINFO_MASK;
- if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
+ if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) {
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (flowlabel == NULL)
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
}
- /*
- * Otherwise it will be difficult to maintain
+ /* Otherwise it will be difficult to maintain
* sk->sk_dst_cache.
*/
if (sk->sk_state == TCP_ESTABLISHED &&
@@ -563,7 +587,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
if (fl6.flowi6_oif == 0)
- fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_oif = READ_ONCE(sk->sk_bound_dev_if);
if (msg->msg_controllen) {
opt = &opt_space;
@@ -578,10 +602,10 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
if ((fl6.flowlabel & IPV6_FLOWLABEL_MASK) && !flowlabel) {
flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
- if (flowlabel == NULL)
+ if (IS_ERR(flowlabel))
return -EINVAL;
}
- if (!(opt->opt_nflen|opt->opt_flen))
+ if (!(opt->opt_nflen | opt->opt_flen))
opt = NULL;
}
@@ -605,18 +629,15 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
final_p = fl6_update_dst(&fl6, opt, &final);
if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr))
- fl6.flowi6_oif = np->mcast_oif;
+ fl6.flowi6_oif = READ_ONCE(np->mcast_oif);
else if (!fl6.flowi6_oif)
- fl6.flowi6_oif = np->ucast_oif;
-
- security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
+ fl6.flowi6_oif = READ_ONCE(np->ucast_oif);
- if (ipc6.tclass < 0)
- ipc6.tclass = np->tclass;
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel);
- dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto out;
@@ -625,17 +646,15 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (ipc6.hlimit < 0)
ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst);
- if (ipc6.dontfrag < 0)
- ipc6.dontfrag = np->dontfrag;
-
if (msg->msg_flags & MSG_CONFIRM)
goto do_confirm;
back_from_confirm:
lock_sock(sk);
+ ulen = len + (skb_queue_empty(&sk->sk_write_queue) ? transhdrlen : 0);
err = ip6_append_data(sk, ip_generic_getfrag, msg,
ulen, transhdrlen, &ipc6,
- &fl6, (struct rt6_info *)dst,
+ &fl6, dst_rt6_info(dst),
msg->msg_flags);
if (err)
ip6_flush_pending_frames(sk);
@@ -660,7 +679,7 @@ do_confirm:
}
static int l2tp_ip6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
- int noblock, int flags, int *addr_len)
+ int flags, int *addr_len)
{
struct ipv6_pinfo *np = inet6_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name);
@@ -671,13 +690,10 @@ static int l2tp_ip6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (flags & MSG_OOB)
goto out;
- if (addr_len)
- *addr_len = sizeof(*lsa);
-
if (flags & MSG_ERRQUEUE)
return ipv6_recv_error(sk, msg, len, addr_len);
- skb = skb_recv_datagram(sk, flags, noblock, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto out;
@@ -703,6 +719,7 @@ static int l2tp_ip6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
lsa->l2tp_conn_id = 0;
if (ipv6_addr_type(&lsa->l2tp_addr) & IPV6_ADDR_LINKLOCAL)
lsa->l2tp_scope_id = inet6_iif(skb);
+ *addr_len = sizeof(*lsa);
}
if (np->rxopt.all)
@@ -731,13 +748,10 @@ static struct proto l2tp_ip6_prot = {
.sendmsg = l2tp_ip6_sendmsg,
.recvmsg = l2tp_ip6_recvmsg,
.backlog_rcv = l2tp_ip6_backlog_recv,
- .hash = inet6_hash,
- .unhash = inet_unhash,
+ .hash = l2tp_ip6_hash,
+ .unhash = l2tp_ip6_unhash,
.obj_size = sizeof(struct l2tp_ip6_sock),
-#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_ipv6_setsockopt,
- .compat_getsockopt = compat_ipv6_getsockopt,
-#endif
+ .ipv6_pinfo_offset = offsetof(struct l2tp_ip6_sock, inet6),
};
static const struct proto_ops l2tp_ip6_ops = {
@@ -751,6 +765,7 @@ static const struct proto_ops l2tp_ip6_ops = {
.getname = l2tp_ip6_getname,
.poll = datagram_poll,
.ioctl = inet6_ioctl,
+ .gettstamp = sock_gettstamp,
.listen = sock_no_listen,
.shutdown = inet_shutdown,
.setsockopt = sock_common_setsockopt,
@@ -758,10 +773,8 @@ static const struct proto_ops l2tp_ip6_ops = {
.sendmsg = inet_sendmsg,
.recvmsg = sock_common_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
#ifdef CONFIG_COMPAT
- .compat_setsockopt = compat_sock_common_setsockopt,
- .compat_getsockopt = compat_sock_common_getsockopt,
+ .compat_ioctl = inet6_compat_ioctl,
#endif
};
@@ -776,25 +789,58 @@ static struct inet6_protocol l2tp_ip6_protocol __read_mostly = {
.handler = l2tp_ip6_recv,
};
+static __net_init int l2tp_ip6_init_net(struct net *net)
+{
+ struct l2tp_ip6_net *pn = net_generic(net, l2tp_ip6_net_id);
+
+ rwlock_init(&pn->l2tp_ip6_lock);
+ INIT_HLIST_HEAD(&pn->l2tp_ip6_table);
+ INIT_HLIST_HEAD(&pn->l2tp_ip6_bind_table);
+ return 0;
+}
+
+static __net_exit void l2tp_ip6_exit_net(struct net *net)
+{
+ struct l2tp_ip6_net *pn = l2tp_ip6_pernet(net);
+
+ write_lock_bh(&pn->l2tp_ip6_lock);
+ WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip6_table) != 0);
+ WARN_ON_ONCE(hlist_count_nodes(&pn->l2tp_ip6_bind_table) != 0);
+ write_unlock_bh(&pn->l2tp_ip6_lock);
+}
+
+static struct pernet_operations l2tp_ip6_net_ops = {
+ .init = l2tp_ip6_init_net,
+ .exit = l2tp_ip6_exit_net,
+ .id = &l2tp_ip6_net_id,
+ .size = sizeof(struct l2tp_ip6_net),
+};
+
static int __init l2tp_ip6_init(void)
{
int err;
pr_info("L2TP IP encapsulation support for IPv6 (L2TPv3)\n");
+ err = register_pernet_device(&l2tp_ip6_net_ops);
+ if (err)
+ goto out;
+
err = proto_register(&l2tp_ip6_prot, 1);
if (err != 0)
- goto out;
+ goto out1;
err = inet6_add_protocol(&l2tp_ip6_protocol, IPPROTO_L2TP);
if (err)
- goto out1;
+ goto out2;
inet6_register_protosw(&l2tp_ip6_protosw);
return 0;
-out1:
+out2:
proto_unregister(&l2tp_ip6_prot);
+out1:
+ unregister_pernet_device(&l2tp_ip6_net_ops);
out:
return err;
}
@@ -804,6 +850,7 @@ static void __exit l2tp_ip6_exit(void)
inet6_unregister_protosw(&l2tp_ip6_protosw);
inet6_del_protocol(&l2tp_ip6_protocol, IPPROTO_L2TP);
proto_unregister(&l2tp_ip6_prot);
+ unregister_pernet_device(&l2tp_ip6_net_ops);
}
module_init(l2tp_ip6_init);
@@ -814,8 +861,8 @@ MODULE_AUTHOR("Chris Elston <celston@katalix.com>");
MODULE_DESCRIPTION("L2TP IP encapsulation for IPv6");
MODULE_VERSION("1.0");
-/* Use the value of SOCK_DGRAM (2) directory, because __stringify doesn't like
- * enums
+/* Use the values of SOCK_DGRAM (2) as type and IPPROTO_L2TP (115) as protocol,
+ * because __stringify doesn't like enums
*/
-MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 2, IPPROTO_L2TP);
-MODULE_ALIAS_NET_PF_PROTO(PF_INET6, IPPROTO_L2TP);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 115, 2);
+MODULE_ALIAS_NET_PF_PROTO(PF_INET6, 115);
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index edbd5d1fbcde..59457c0c14aa 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -1,5 +1,5 @@
-/*
- * L2TP netlink layer, for management
+// SPDX-License-Identifier: GPL-2.0-only
+/* L2TP netlink layer, for management
*
* Copyright (c) 2008,2009,2010 Katalix Systems Ltd
*
@@ -8,10 +8,6 @@
* Copyright (c) 2007 Samuel Ortiz <samuel@sortiz.org>
* which is in turn partly based on the wireless netlink code:
* Copyright 2006 Johannes Berg <johannes@sipsolutions.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -30,7 +26,6 @@
#include "l2tp_core.h"
-
static struct genl_family l2tp_nl_family;
static const struct genl_multicast_group l2tp_multicast_group[] = {
@@ -66,8 +61,9 @@ static struct l2tp_session *l2tp_nl_session_get(struct genl_info *info)
session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]);
tunnel = l2tp_tunnel_get(net, tunnel_id);
if (tunnel) {
- session = l2tp_tunnel_get_session(tunnel, session_id);
- l2tp_tunnel_dec_refcount(tunnel);
+ session = l2tp_session_get(net, tunnel->sock, tunnel->version,
+ tunnel_id, session_id);
+ l2tp_tunnel_put(tunnel);
}
}
@@ -120,7 +116,7 @@ static int l2tp_tunnel_notify(struct genl_family *family,
NLM_F_ACK, tunnel, cmd);
if (ret >= 0) {
- ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC);
+ ret = genlmsg_multicast_allns(family, msg, 0, 0);
/* We don't care if no one is listening */
if (ret == -ESRCH)
ret = 0;
@@ -148,7 +144,7 @@ static int l2tp_session_notify(struct genl_family *family,
NLM_F_ACK, session, cmd);
if (ret >= 0) {
- ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC);
+ ret = genlmsg_multicast_allns(family, msg, 0, 0);
/* We don't care if no one is listening */
if (ret == -ESRCH)
ret = 0;
@@ -160,87 +156,85 @@ static int l2tp_session_notify(struct genl_family *family,
return ret;
}
+static int l2tp_nl_cmd_tunnel_create_get_addr(struct nlattr **attrs, struct l2tp_tunnel_cfg *cfg)
+{
+ if (attrs[L2TP_ATTR_UDP_SPORT])
+ cfg->local_udp_port = nla_get_u16(attrs[L2TP_ATTR_UDP_SPORT]);
+ if (attrs[L2TP_ATTR_UDP_DPORT])
+ cfg->peer_udp_port = nla_get_u16(attrs[L2TP_ATTR_UDP_DPORT]);
+ cfg->use_udp_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_CSUM]);
+
+ /* Must have either AF_INET or AF_INET6 address for source and destination */
+#if IS_ENABLED(CONFIG_IPV6)
+ if (attrs[L2TP_ATTR_IP6_SADDR] && attrs[L2TP_ATTR_IP6_DADDR]) {
+ cfg->local_ip6 = nla_data(attrs[L2TP_ATTR_IP6_SADDR]);
+ cfg->peer_ip6 = nla_data(attrs[L2TP_ATTR_IP6_DADDR]);
+ cfg->udp6_zero_tx_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]);
+ cfg->udp6_zero_rx_checksums = nla_get_flag(attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]);
+ return 0;
+ }
+#endif
+ if (attrs[L2TP_ATTR_IP_SADDR] && attrs[L2TP_ATTR_IP_DADDR]) {
+ cfg->local_ip.s_addr = nla_get_in_addr(attrs[L2TP_ATTR_IP_SADDR]);
+ cfg->peer_ip.s_addr = nla_get_in_addr(attrs[L2TP_ATTR_IP_DADDR]);
+ return 0;
+ }
+ return -EINVAL;
+}
+
static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info)
{
u32 tunnel_id;
u32 peer_tunnel_id;
int proto_version;
- int fd;
+ int fd = -1;
int ret = 0;
struct l2tp_tunnel_cfg cfg = { 0, };
struct l2tp_tunnel *tunnel;
struct net *net = genl_info_net(info);
+ struct nlattr **attrs = info->attrs;
- if (!info->attrs[L2TP_ATTR_CONN_ID]) {
+ if (!attrs[L2TP_ATTR_CONN_ID]) {
ret = -EINVAL;
goto out;
}
- tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_CONN_ID]);
+ tunnel_id = nla_get_u32(attrs[L2TP_ATTR_CONN_ID]);
- if (!info->attrs[L2TP_ATTR_PEER_CONN_ID]) {
+ if (!attrs[L2TP_ATTR_PEER_CONN_ID]) {
ret = -EINVAL;
goto out;
}
- peer_tunnel_id = nla_get_u32(info->attrs[L2TP_ATTR_PEER_CONN_ID]);
+ peer_tunnel_id = nla_get_u32(attrs[L2TP_ATTR_PEER_CONN_ID]);
- if (!info->attrs[L2TP_ATTR_PROTO_VERSION]) {
+ if (!attrs[L2TP_ATTR_PROTO_VERSION]) {
ret = -EINVAL;
goto out;
}
- proto_version = nla_get_u8(info->attrs[L2TP_ATTR_PROTO_VERSION]);
+ proto_version = nla_get_u8(attrs[L2TP_ATTR_PROTO_VERSION]);
- if (!info->attrs[L2TP_ATTR_ENCAP_TYPE]) {
+ if (!attrs[L2TP_ATTR_ENCAP_TYPE]) {
ret = -EINVAL;
goto out;
}
- cfg.encap = nla_get_u16(info->attrs[L2TP_ATTR_ENCAP_TYPE]);
-
- fd = -1;
- if (info->attrs[L2TP_ATTR_FD]) {
- fd = nla_get_u32(info->attrs[L2TP_ATTR_FD]);
+ cfg.encap = nla_get_u16(attrs[L2TP_ATTR_ENCAP_TYPE]);
+
+ /* Managed tunnels take the tunnel socket from userspace.
+ * Unmanaged tunnels must call out the source and destination addresses
+ * for the kernel to create the tunnel socket itself.
+ */
+ if (attrs[L2TP_ATTR_FD]) {
+ fd = nla_get_u32(attrs[L2TP_ATTR_FD]);
} else {
-#if IS_ENABLED(CONFIG_IPV6)
- if (info->attrs[L2TP_ATTR_IP6_SADDR] &&
- info->attrs[L2TP_ATTR_IP6_DADDR]) {
- cfg.local_ip6 = nla_data(
- info->attrs[L2TP_ATTR_IP6_SADDR]);
- cfg.peer_ip6 = nla_data(
- info->attrs[L2TP_ATTR_IP6_DADDR]);
- } else
-#endif
- if (info->attrs[L2TP_ATTR_IP_SADDR] &&
- info->attrs[L2TP_ATTR_IP_DADDR]) {
- cfg.local_ip.s_addr = nla_get_in_addr(
- info->attrs[L2TP_ATTR_IP_SADDR]);
- cfg.peer_ip.s_addr = nla_get_in_addr(
- info->attrs[L2TP_ATTR_IP_DADDR]);
- } else {
- ret = -EINVAL;
+ ret = l2tp_nl_cmd_tunnel_create_get_addr(attrs, &cfg);
+ if (ret < 0)
goto out;
- }
- if (info->attrs[L2TP_ATTR_UDP_SPORT])
- cfg.local_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_SPORT]);
- if (info->attrs[L2TP_ATTR_UDP_DPORT])
- cfg.peer_udp_port = nla_get_u16(info->attrs[L2TP_ATTR_UDP_DPORT]);
- cfg.use_udp_checksums = nla_get_flag(
- info->attrs[L2TP_ATTR_UDP_CSUM]);
-
-#if IS_ENABLED(CONFIG_IPV6)
- cfg.udp6_zero_tx_checksums = nla_get_flag(
- info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_TX]);
- cfg.udp6_zero_rx_checksums = nla_get_flag(
- info->attrs[L2TP_ATTR_UDP_ZERO_CSUM6_RX]);
-#endif
}
- if (info->attrs[L2TP_ATTR_DEBUG])
- cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
-
ret = -EINVAL;
switch (cfg.encap) {
case L2TP_ENCAPTYPE_UDP:
case L2TP_ENCAPTYPE_IP:
- ret = l2tp_tunnel_create(net, fd, proto_version, tunnel_id,
+ ret = l2tp_tunnel_create(fd, proto_version, tunnel_id,
peer_tunnel_id, &cfg, &tunnel);
break;
}
@@ -248,7 +242,7 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info
if (ret < 0)
goto out;
- l2tp_tunnel_inc_refcount(tunnel);
+ refcount_inc(&tunnel->ref_count);
ret = l2tp_tunnel_register(tunnel, net, &cfg);
if (ret < 0) {
kfree(tunnel);
@@ -256,7 +250,7 @@ static int l2tp_nl_cmd_tunnel_create(struct sk_buff *skb, struct genl_info *info
}
ret = l2tp_tunnel_notify(&l2tp_nl_family, info, tunnel,
L2TP_CMD_TUNNEL_CREATE);
- l2tp_tunnel_dec_refcount(tunnel);
+ l2tp_tunnel_put(tunnel);
out:
return ret;
@@ -286,7 +280,7 @@ static int l2tp_nl_cmd_tunnel_delete(struct sk_buff *skb, struct genl_info *info
l2tp_tunnel_delete(tunnel);
- l2tp_tunnel_dec_refcount(tunnel);
+ l2tp_tunnel_put(tunnel);
out:
return ret;
@@ -311,28 +305,88 @@ static int l2tp_nl_cmd_tunnel_modify(struct sk_buff *skb, struct genl_info *info
goto out;
}
- if (info->attrs[L2TP_ATTR_DEBUG])
- tunnel->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
-
ret = l2tp_tunnel_notify(&l2tp_nl_family, info,
tunnel, L2TP_CMD_TUNNEL_MODIFY);
- l2tp_tunnel_dec_refcount(tunnel);
+ l2tp_tunnel_put(tunnel);
out:
return ret;
}
+#if IS_ENABLED(CONFIG_IPV6)
+static int l2tp_nl_tunnel_send_addr6(struct sk_buff *skb, struct sock *sk,
+ enum l2tp_encap_type encap)
+{
+ struct inet_sock *inet = inet_sk(sk);
+ struct ipv6_pinfo *np = inet6_sk(sk);
+
+ switch (encap) {
+ case L2TP_ENCAPTYPE_UDP:
+ if (udp_get_no_check6_tx(sk) &&
+ nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_TX))
+ return -1;
+ if (udp_get_no_check6_rx(sk) &&
+ nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_RX))
+ return -1;
+ if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) ||
+ nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport)))
+ return -1;
+ fallthrough;
+ case L2TP_ENCAPTYPE_IP:
+ if (nla_put_in6_addr(skb, L2TP_ATTR_IP6_SADDR, &np->saddr) ||
+ nla_put_in6_addr(skb, L2TP_ATTR_IP6_DADDR, &sk->sk_v6_daddr))
+ return -1;
+ break;
+ }
+ return 0;
+}
+#endif
+
+static int l2tp_nl_tunnel_send_addr4(struct sk_buff *skb, struct sock *sk,
+ enum l2tp_encap_type encap)
+{
+ struct inet_sock *inet = inet_sk(sk);
+
+ switch (encap) {
+ case L2TP_ENCAPTYPE_UDP:
+ if (nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx) ||
+ nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) ||
+ nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport)))
+ return -1;
+ fallthrough;
+ case L2TP_ENCAPTYPE_IP:
+ if (nla_put_in_addr(skb, L2TP_ATTR_IP_SADDR, inet->inet_saddr) ||
+ nla_put_in_addr(skb, L2TP_ATTR_IP_DADDR, inet->inet_daddr))
+ return -1;
+ break;
+ }
+
+ return 0;
+}
+
+/* Append attributes for the tunnel address, handling the different attribute types
+ * used for different tunnel encapsulation and AF_INET vs. AF_INET6.
+ */
+static int l2tp_nl_tunnel_send_addr(struct sk_buff *skb, struct l2tp_tunnel *tunnel)
+{
+ struct sock *sk = tunnel->sock;
+
+ if (!sk)
+ return 0;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (sk->sk_family == AF_INET6)
+ return l2tp_nl_tunnel_send_addr6(skb, sk, tunnel->encap);
+#endif
+ return l2tp_nl_tunnel_send_addr4(skb, sk, tunnel->encap);
+}
+
static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int flags,
struct l2tp_tunnel *tunnel, u8 cmd)
{
void *hdr;
struct nlattr *nest;
- struct sock *sk = NULL;
- struct inet_sock *inet;
-#if IS_ENABLED(CONFIG_IPV6)
- struct ipv6_pinfo *np = NULL;
-#endif
hdr = genlmsg_put(skb, portid, seq, &l2tp_nl_family, flags, cmd);
if (!hdr)
@@ -341,12 +395,12 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla
if (nla_put_u8(skb, L2TP_ATTR_PROTO_VERSION, tunnel->version) ||
nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) ||
nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) ||
- nla_put_u32(skb, L2TP_ATTR_DEBUG, tunnel->debug) ||
+ nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) ||
nla_put_u16(skb, L2TP_ATTR_ENCAP_TYPE, tunnel->encap))
goto nla_put_failure;
- nest = nla_nest_start(skb, L2TP_ATTR_STATS);
- if (nest == NULL)
+ nest = nla_nest_start_noflag(skb, L2TP_ATTR_STATS);
+ if (!nest)
goto nla_put_failure;
if (nla_put_u64_64bit(skb, L2TP_ATTR_TX_PACKETS,
@@ -367,67 +421,24 @@ static int l2tp_nl_tunnel_send(struct sk_buff *skb, u32 portid, u32 seq, int fla
nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS,
atomic_long_read(&tunnel->stats.rx_seq_discards),
L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_RX_COOKIE_DISCARDS,
+ atomic_long_read(&tunnel->stats.rx_cookie_discards),
+ L2TP_ATTR_STATS_PAD) ||
nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS,
atomic_long_read(&tunnel->stats.rx_oos_packets),
L2TP_ATTR_STATS_PAD) ||
nla_put_u64_64bit(skb, L2TP_ATTR_RX_ERRORS,
atomic_long_read(&tunnel->stats.rx_errors),
+ L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_RX_INVALID,
+ atomic_long_read(&tunnel->stats.rx_invalid),
L2TP_ATTR_STATS_PAD))
goto nla_put_failure;
nla_nest_end(skb, nest);
- sk = tunnel->sock;
- if (!sk)
- goto out;
-
-#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6)
- np = inet6_sk(sk);
-#endif
-
- inet = inet_sk(sk);
-
- switch (tunnel->encap) {
- case L2TP_ENCAPTYPE_UDP:
- switch (sk->sk_family) {
- case AF_INET:
- if (nla_put_u8(skb, L2TP_ATTR_UDP_CSUM, !sk->sk_no_check_tx))
- goto nla_put_failure;
- break;
-#if IS_ENABLED(CONFIG_IPV6)
- case AF_INET6:
- if (udp_get_no_check6_tx(sk) &&
- nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_TX))
- goto nla_put_failure;
- if (udp_get_no_check6_rx(sk) &&
- nla_put_flag(skb, L2TP_ATTR_UDP_ZERO_CSUM6_RX))
- goto nla_put_failure;
- break;
-#endif
- }
- if (nla_put_u16(skb, L2TP_ATTR_UDP_SPORT, ntohs(inet->inet_sport)) ||
- nla_put_u16(skb, L2TP_ATTR_UDP_DPORT, ntohs(inet->inet_dport)))
- goto nla_put_failure;
- /* fall through */
- case L2TP_ENCAPTYPE_IP:
-#if IS_ENABLED(CONFIG_IPV6)
- if (np) {
- if (nla_put_in6_addr(skb, L2TP_ATTR_IP6_SADDR,
- &np->saddr) ||
- nla_put_in6_addr(skb, L2TP_ATTR_IP6_DADDR,
- &sk->sk_v6_daddr))
- goto nla_put_failure;
- } else
-#endif
- if (nla_put_in_addr(skb, L2TP_ATTR_IP_SADDR,
- inet->inet_saddr) ||
- nla_put_in_addr(skb, L2TP_ATTR_IP_DADDR,
- inet->inet_daddr))
- goto nla_put_failure;
- break;
- }
+ if (l2tp_nl_tunnel_send_addr(skb, tunnel))
+ goto nla_put_failure;
-out:
genlmsg_end(skb, hdr);
return 0;
@@ -468,42 +479,48 @@ static int l2tp_nl_cmd_tunnel_get(struct sk_buff *skb, struct genl_info *info)
if (ret < 0)
goto err_nlmsg_tunnel;
- l2tp_tunnel_dec_refcount(tunnel);
+ l2tp_tunnel_put(tunnel);
return genlmsg_unicast(net, msg, info->snd_portid);
err_nlmsg_tunnel:
- l2tp_tunnel_dec_refcount(tunnel);
+ l2tp_tunnel_put(tunnel);
err_nlmsg:
nlmsg_free(msg);
err:
return ret;
}
+struct l2tp_nl_cb_data {
+ unsigned long tkey;
+ unsigned long skey;
+};
+
static int l2tp_nl_cmd_tunnel_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- int ti = cb->args[0];
+ struct l2tp_nl_cb_data *cbd = (void *)&cb->ctx[0];
+ unsigned long key = cbd->tkey;
struct l2tp_tunnel *tunnel;
struct net *net = sock_net(skb->sk);
for (;;) {
- tunnel = l2tp_tunnel_get_nth(net, ti);
- if (tunnel == NULL)
+ tunnel = l2tp_tunnel_get_next(net, &key);
+ if (!tunnel)
goto out;
if (l2tp_nl_tunnel_send(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
tunnel, L2TP_CMD_TUNNEL_GET) < 0) {
- l2tp_tunnel_dec_refcount(tunnel);
+ l2tp_tunnel_put(tunnel);
goto out;
}
- l2tp_tunnel_dec_refcount(tunnel);
+ l2tp_tunnel_put(tunnel);
- ti++;
+ key++;
}
out:
- cb->args[0] = ti;
+ cbd->tkey = key;
return skb->len;
}
@@ -573,6 +590,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
if (info->attrs[L2TP_ATTR_COOKIE]) {
u16 len = nla_len(info->attrs[L2TP_ATTR_COOKIE]);
+
if (len > 8) {
ret = -EINVAL;
goto out_tunnel;
@@ -582,6 +600,7 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
}
if (info->attrs[L2TP_ATTR_PEER_COOKIE]) {
u16 len = nla_len(info->attrs[L2TP_ATTR_PEER_COOKIE]);
+
if (len > 8) {
ret = -EINVAL;
goto out_tunnel;
@@ -593,9 +612,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]);
}
- if (info->attrs[L2TP_ATTR_DEBUG])
- cfg.debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
-
if (info->attrs[L2TP_ATTR_RECV_SEQ])
cfg.recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]);
@@ -609,14 +625,13 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
cfg.reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]);
#ifdef CONFIG_MODULES
- if (l2tp_nl_cmd_ops[cfg.pw_type] == NULL) {
+ if (!l2tp_nl_cmd_ops[cfg.pw_type]) {
genl_unlock();
request_module("net-l2tp-type-%u", cfg.pw_type);
genl_lock();
}
#endif
- if ((l2tp_nl_cmd_ops[cfg.pw_type] == NULL) ||
- (l2tp_nl_cmd_ops[cfg.pw_type]->session_create == NULL)) {
+ if (!l2tp_nl_cmd_ops[cfg.pw_type] || !l2tp_nl_cmd_ops[cfg.pw_type]->session_create) {
ret = -EPROTONOSUPPORT;
goto out_tunnel;
}
@@ -627,16 +642,17 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
&cfg);
if (ret >= 0) {
- session = l2tp_tunnel_get_session(tunnel, session_id);
+ session = l2tp_session_get(net, tunnel->sock, tunnel->version,
+ tunnel_id, session_id);
if (session) {
ret = l2tp_session_notify(&l2tp_nl_family, info, session,
L2TP_CMD_SESSION_CREATE);
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
}
}
out_tunnel:
- l2tp_tunnel_dec_refcount(tunnel);
+ l2tp_tunnel_put(tunnel);
out:
return ret;
}
@@ -648,7 +664,7 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf
u16 pw_type;
session = l2tp_nl_session_get(info);
- if (session == NULL) {
+ if (!session) {
ret = -ENODEV;
goto out;
}
@@ -659,9 +675,9 @@ static int l2tp_nl_cmd_session_delete(struct sk_buff *skb, struct genl_info *inf
pw_type = session->pwtype;
if (pw_type < __L2TP_PWTYPE_MAX)
if (l2tp_nl_cmd_ops[pw_type] && l2tp_nl_cmd_ops[pw_type]->session_delete)
- ret = (*l2tp_nl_cmd_ops[pw_type]->session_delete)(session);
+ l2tp_nl_cmd_ops[pw_type]->session_delete(session);
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
out:
return ret;
@@ -673,20 +689,19 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
struct l2tp_session *session;
session = l2tp_nl_session_get(info);
- if (session == NULL) {
+ if (!session) {
ret = -ENODEV;
goto out;
}
- if (info->attrs[L2TP_ATTR_DEBUG])
- session->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
-
if (info->attrs[L2TP_ATTR_RECV_SEQ])
session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]);
if (info->attrs[L2TP_ATTR_SEND_SEQ]) {
+ struct l2tp_tunnel *tunnel = session->tunnel;
+
session->send_seq = nla_get_u8(info->attrs[L2TP_ATTR_SEND_SEQ]);
- l2tp_session_set_header_len(session, session->tunnel->version);
+ l2tp_session_set_header_len(session, tunnel->version, tunnel->encap);
}
if (info->attrs[L2TP_ATTR_LNS_MODE])
@@ -698,7 +713,7 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
ret = l2tp_session_notify(&l2tp_nl_family, info,
session, L2TP_CMD_SESSION_MODIFY);
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
out:
return ret;
@@ -718,20 +733,17 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
if (nla_put_u32(skb, L2TP_ATTR_CONN_ID, tunnel->tunnel_id) ||
nla_put_u32(skb, L2TP_ATTR_SESSION_ID, session->session_id) ||
nla_put_u32(skb, L2TP_ATTR_PEER_CONN_ID, tunnel->peer_tunnel_id) ||
- nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID,
- session->peer_session_id) ||
- nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) ||
+ nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID, session->peer_session_id) ||
+ nla_put_u32(skb, L2TP_ATTR_DEBUG, 0) ||
nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype))
goto nla_put_failure;
if ((session->ifname[0] &&
nla_put_string(skb, L2TP_ATTR_IFNAME, session->ifname)) ||
(session->cookie_len &&
- nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len,
- &session->cookie[0])) ||
+ nla_put(skb, L2TP_ATTR_COOKIE, session->cookie_len, session->cookie)) ||
(session->peer_cookie_len &&
- nla_put(skb, L2TP_ATTR_PEER_COOKIE, session->peer_cookie_len,
- &session->peer_cookie[0])) ||
+ nla_put(skb, L2TP_ATTR_PEER_COOKIE, session->peer_cookie_len, session->peer_cookie)) ||
nla_put_u8(skb, L2TP_ATTR_RECV_SEQ, session->recv_seq) ||
nla_put_u8(skb, L2TP_ATTR_SEND_SEQ, session->send_seq) ||
nla_put_u8(skb, L2TP_ATTR_LNS_MODE, session->lns_mode) ||
@@ -742,8 +754,8 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
session->reorder_timeout, L2TP_ATTR_PAD)))
goto nla_put_failure;
- nest = nla_nest_start(skb, L2TP_ATTR_STATS);
- if (nest == NULL)
+ nest = nla_nest_start_noflag(skb, L2TP_ATTR_STATS);
+ if (!nest)
goto nla_put_failure;
if (nla_put_u64_64bit(skb, L2TP_ATTR_TX_PACKETS,
@@ -764,11 +776,17 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
nla_put_u64_64bit(skb, L2TP_ATTR_RX_SEQ_DISCARDS,
atomic_long_read(&session->stats.rx_seq_discards),
L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_RX_COOKIE_DISCARDS,
+ atomic_long_read(&session->stats.rx_cookie_discards),
+ L2TP_ATTR_STATS_PAD) ||
nla_put_u64_64bit(skb, L2TP_ATTR_RX_OOS_PACKETS,
atomic_long_read(&session->stats.rx_oos_packets),
L2TP_ATTR_STATS_PAD) ||
nla_put_u64_64bit(skb, L2TP_ATTR_RX_ERRORS,
atomic_long_read(&session->stats.rx_errors),
+ L2TP_ATTR_STATS_PAD) ||
+ nla_put_u64_64bit(skb, L2TP_ATTR_RX_INVALID,
+ atomic_long_read(&session->stats.rx_invalid),
L2TP_ATTR_STATS_PAD))
goto nla_put_failure;
nla_nest_end(skb, nest);
@@ -788,7 +806,7 @@ static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info)
int ret;
session = l2tp_nl_session_get(info);
- if (session == NULL) {
+ if (!session) {
ret = -ENODEV;
goto err;
}
@@ -806,57 +824,59 @@ static int l2tp_nl_cmd_session_get(struct sk_buff *skb, struct genl_info *info)
ret = genlmsg_unicast(genl_info_net(info), msg, info->snd_portid);
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
return ret;
err_ref_msg:
nlmsg_free(msg);
err_ref:
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
err:
return ret;
}
static int l2tp_nl_cmd_session_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct l2tp_nl_cb_data *cbd = (void *)&cb->ctx[0];
struct net *net = sock_net(skb->sk);
struct l2tp_session *session;
struct l2tp_tunnel *tunnel = NULL;
- int ti = cb->args[0];
- int si = cb->args[1];
+ unsigned long tkey = cbd->tkey;
+ unsigned long skey = cbd->skey;
for (;;) {
- if (tunnel == NULL) {
- tunnel = l2tp_tunnel_get_nth(net, ti);
- if (tunnel == NULL)
+ if (!tunnel) {
+ tunnel = l2tp_tunnel_get_next(net, &tkey);
+ if (!tunnel)
goto out;
}
- session = l2tp_session_get_nth(tunnel, si);
- if (session == NULL) {
- ti++;
- l2tp_tunnel_dec_refcount(tunnel);
+ session = l2tp_session_get_next(net, tunnel->sock, tunnel->version,
+ tunnel->tunnel_id, &skey);
+ if (!session) {
+ tkey++;
+ l2tp_tunnel_put(tunnel);
tunnel = NULL;
- si = 0;
+ skey = 0;
continue;
}
if (l2tp_nl_session_send(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
session, L2TP_CMD_SESSION_GET) < 0) {
- l2tp_session_dec_refcount(session);
- l2tp_tunnel_dec_refcount(tunnel);
+ l2tp_session_put(session);
+ l2tp_tunnel_put(tunnel);
break;
}
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
- si++;
+ skey++;
}
out:
- cb->args[0] = ti;
- cb->args[1] = si;
+ cbd->tkey = tkey;
+ cbd->skey = skey;
return skb->len;
}
@@ -912,62 +932,62 @@ static const struct nla_policy l2tp_nl_policy[L2TP_ATTR_MAX + 1] = {
},
};
-static const struct genl_ops l2tp_nl_ops[] = {
+static const struct genl_small_ops l2tp_nl_ops[] = {
{
.cmd = L2TP_CMD_NOOP,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = l2tp_nl_cmd_noop,
- .policy = l2tp_nl_policy,
/* can be retrieved by unprivileged users */
},
{
.cmd = L2TP_CMD_TUNNEL_CREATE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = l2tp_nl_cmd_tunnel_create,
- .policy = l2tp_nl_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = L2TP_CMD_TUNNEL_DELETE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = l2tp_nl_cmd_tunnel_delete,
- .policy = l2tp_nl_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = L2TP_CMD_TUNNEL_MODIFY,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = l2tp_nl_cmd_tunnel_modify,
- .policy = l2tp_nl_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = L2TP_CMD_TUNNEL_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = l2tp_nl_cmd_tunnel_get,
.dumpit = l2tp_nl_cmd_tunnel_dump,
- .policy = l2tp_nl_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = L2TP_CMD_SESSION_CREATE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = l2tp_nl_cmd_session_create,
- .policy = l2tp_nl_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = L2TP_CMD_SESSION_DELETE,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = l2tp_nl_cmd_session_delete,
- .policy = l2tp_nl_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = L2TP_CMD_SESSION_MODIFY,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = l2tp_nl_cmd_session_modify,
- .policy = l2tp_nl_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
{
.cmd = L2TP_CMD_SESSION_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.doit = l2tp_nl_cmd_session_get,
.dumpit = l2tp_nl_cmd_session_dump,
- .policy = l2tp_nl_policy,
- .flags = GENL_ADMIN_PERM,
+ .flags = GENL_UNS_ADMIN_PERM,
},
};
@@ -976,10 +996,12 @@ static struct genl_family l2tp_nl_family __ro_after_init = {
.version = L2TP_GENL_VERSION,
.hdrsize = 0,
.maxattr = L2TP_ATTR_MAX,
+ .policy = l2tp_nl_policy,
.netnsok = true,
.module = THIS_MODULE,
- .ops = l2tp_nl_ops,
- .n_ops = ARRAY_SIZE(l2tp_nl_ops),
+ .small_ops = l2tp_nl_ops,
+ .n_small_ops = ARRAY_SIZE(l2tp_nl_ops),
+ .resv_start_op = L2TP_CMD_SESSION_GET + 1,
.mcgrps = l2tp_multicast_group,
.n_mcgrps = ARRAY_SIZE(l2tp_multicast_group),
};
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 04d9946dcdba..ae4543d5597b 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*****************************************************************************
* Linux PPP over L2TP (PPPoX/PPPoL2TP) Sockets
*
@@ -11,11 +12,6 @@
* Based on original work by Martijn van Oosterhout <kleptog@svana.org>
*
* License:
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
/* This driver handles only L2TP data frames; control frames are handled by a
@@ -121,10 +117,8 @@ struct pppol2tp_session {
int owner; /* pid that opened the socket */
struct mutex sk_lock; /* Protects .sk */
- struct sock __rcu *sk; /* Pointer to the session
- * PPPoX socket */
+ struct sock __rcu *sk; /* Pointer to the session PPPoX socket */
struct sock *__sk; /* Copy of .sk, for cleanup */
- struct rcu_head rcu; /* For asynchronous release */
};
static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb);
@@ -135,44 +129,33 @@ static const struct ppp_channel_ops pppol2tp_chan_ops = {
static const struct proto_ops pppol2tp_ops;
-/* Retrieves the pppol2tp socket associated to a session.
- * A reference is held on the returned socket, so this function must be paired
- * with sock_put().
- */
+/* Retrieves the pppol2tp socket associated to a session. */
static struct sock *pppol2tp_session_get_sock(struct l2tp_session *session)
{
struct pppol2tp_session *ps = l2tp_session_priv(session);
- struct sock *sk;
-
- rcu_read_lock();
- sk = rcu_dereference(ps->sk);
- if (sk)
- sock_hold(sk);
- rcu_read_unlock();
- return sk;
+ return rcu_dereference(ps->sk);
}
/* Helpers to obtain tunnel/session contexts from sockets.
*/
-static inline struct l2tp_session *pppol2tp_sock_to_session(struct sock *sk)
+static struct l2tp_session *pppol2tp_sock_to_session(struct sock *sk)
{
struct l2tp_session *session;
- if (sk == NULL)
+ if (!sk)
return NULL;
- sock_hold(sk);
- session = (struct l2tp_session *)(sk->sk_user_data);
- if (session == NULL) {
- sock_put(sk);
- goto out;
+ rcu_read_lock();
+ session = rcu_dereference_sk_user_data(sk);
+ if (session && refcount_inc_not_zero(&session->ref_count)) {
+ rcu_read_unlock();
+ WARN_ON_ONCE(session->magic != L2TP_SESSION_MAGIC);
+ return session;
}
+ rcu_read_unlock();
- BUG_ON(session->magic != L2TP_SESSION_MAGIC);
-
-out:
- return session;
+ return NULL;
}
/*****************************************************************************
@@ -193,8 +176,7 @@ static int pppol2tp_recvmsg(struct socket *sock, struct msghdr *msg,
goto end;
err = 0;
- skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT,
- flags & MSG_DONTWAIT, &err);
+ skb = skb_recv_datagram(sk, flags, &err);
if (!skb)
goto end;
@@ -214,21 +196,20 @@ end:
static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len)
{
- struct pppol2tp_session *ps = l2tp_session_priv(session);
- struct sock *sk = NULL;
+ struct sock *sk;
/* If the socket is bound, send it in to PPP's input queue. Otherwise
* queue it on the session socket.
*/
rcu_read_lock();
- sk = rcu_dereference(ps->sk);
- if (sk == NULL)
+ sk = pppol2tp_session_get_sock(session);
+ if (!sk)
goto no_sock;
/* If the first two bytes are 0xFF03, consider that it is the PPP's
* Address and Control fields and skip them. The L2TP module has always
* worked this way, although, in theory, the use of these fields should
- * be negociated and handled at the PPP layer. These fields are
+ * be negotiated and handled at the PPP layer. These fields are
* constant: 0xFF is the All-Stations Address and 0x03 the Unnumbered
* Information command with Poll/Final bit set to zero (RFC 1662).
*/
@@ -239,17 +220,9 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
if (sk->sk_state & PPPOX_BOUND) {
struct pppox_sock *po;
- l2tp_dbg(session, L2TP_MSG_DATA,
- "%s: recv %d byte data frame, passing to ppp\n",
- session->name, data_len);
-
po = pppox_sk(sk);
ppp_input(&po->chan, skb);
} else {
- l2tp_dbg(session, L2TP_MSG_DATA,
- "%s: recv %d byte data frame, passing to L2TP socket\n",
- session->name, data_len);
-
if (sock_queue_rcv_skb(sk, skb) < 0) {
atomic_long_inc(&session->stats.rx_errors);
kfree_skb(skb);
@@ -261,7 +234,7 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
no_sock:
rcu_read_unlock();
- l2tp_info(session, L2TP_MSG_DATA, "%s: no socket\n", session->name);
+ pr_warn_ratelimited("%s: no socket in recv\n", session->name);
kfree_skb(skb);
}
@@ -290,7 +263,7 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m,
/* Get session and tunnel contexts */
error = -EBADF;
session = pppol2tp_sock_to_session(sk);
- if (session == NULL)
+ if (!session)
goto error;
tunnel = session->tunnel;
@@ -326,15 +299,15 @@ static int pppol2tp_sendmsg(struct socket *sock, struct msghdr *m,
}
local_bh_disable();
- l2tp_xmit_skb(session, skb, session->hdr_len);
+ l2tp_xmit_skb(session, skb);
local_bh_enable();
- sock_put(sk);
+ l2tp_session_put(session);
return total_len;
error_put_sess:
- sock_put(sk);
+ l2tp_session_put(session);
error:
return error;
}
@@ -355,7 +328,7 @@ error:
*/
static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
{
- struct sock *sk = (struct sock *) chan->private;
+ struct sock *sk = (struct sock *)chan->private;
struct l2tp_session *session;
struct l2tp_tunnel *tunnel;
int uhlen, headroom;
@@ -365,7 +338,7 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
/* Get session and tunnel contexts from the socket */
session = pppol2tp_sock_to_session(sk);
- if (session == NULL)
+ if (!session)
goto abort;
tunnel = session->tunnel;
@@ -385,15 +358,15 @@ static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
skb->data[1] = PPP_UI;
local_bh_disable();
- l2tp_xmit_skb(session, skb, session->hdr_len);
+ l2tp_xmit_skb(session, skb);
local_bh_enable();
- sock_put(sk);
+ l2tp_session_put(session);
return 1;
abort_put_sess:
- sock_put(sk);
+ l2tp_session_put(session);
abort:
/* Free the original skb */
kfree_skb(skb);
@@ -404,28 +377,32 @@ abort:
* Session (and tunnel control) socket create/destroy.
*****************************************************************************/
-static void pppol2tp_put_sk(struct rcu_head *head)
-{
- struct pppol2tp_session *ps;
-
- ps = container_of(head, typeof(*ps), rcu);
- sock_put(ps->__sk);
-}
-
/* Really kill the session socket. (Called from sock_put() if
* refcnt == 0.)
*/
static void pppol2tp_session_destruct(struct sock *sk)
{
- struct l2tp_session *session = sk->sk_user_data;
-
skb_queue_purge(&sk->sk_receive_queue);
skb_queue_purge(&sk->sk_write_queue);
+}
- if (session) {
- sk->sk_user_data = NULL;
- BUG_ON(session->magic != L2TP_SESSION_MAGIC);
- l2tp_session_dec_refcount(session);
+static void pppol2tp_session_close(struct l2tp_session *session)
+{
+ struct pppol2tp_session *ps;
+
+ ps = l2tp_session_priv(session);
+ mutex_lock(&ps->sk_lock);
+ ps->__sk = rcu_dereference_protected(ps->sk,
+ lockdep_is_held(&ps->sk_lock));
+ RCU_INIT_POINTER(ps->sk, NULL);
+ mutex_unlock(&ps->sk_lock);
+ if (ps->__sk) {
+ /* detach socket */
+ rcu_assign_sk_user_data(ps->__sk, NULL);
+ sock_put(ps->__sk);
+
+ /* drop ref taken when we referenced socket via sk_user_data */
+ l2tp_session_put(session);
}
}
@@ -454,30 +431,13 @@ static int pppol2tp_release(struct socket *sock)
session = pppol2tp_sock_to_session(sk);
if (session) {
- struct pppol2tp_session *ps;
-
l2tp_session_delete(session);
-
- ps = l2tp_session_priv(session);
- mutex_lock(&ps->sk_lock);
- ps->__sk = rcu_dereference_protected(ps->sk,
- lockdep_is_held(&ps->sk_lock));
- RCU_INIT_POINTER(ps->sk, NULL);
- mutex_unlock(&ps->sk_lock);
- call_rcu(&ps->rcu, pppol2tp_put_sk);
-
- /* Rely on the sock_put() call at the end of the function for
- * dropping the reference held by pppol2tp_sock_to_session().
- * The last reference will be dropped by pppol2tp_put_sk().
- */
+ /* drop ref taken by pppol2tp_sock_to_session */
+ l2tp_session_put(session);
}
release_sock(sk);
- /* This will delete the session context via
- * pppol2tp_session_destruct() if the socket's refcnt drops to
- * zero.
- */
sock_put(sk);
return 0;
@@ -516,6 +476,7 @@ static int pppol2tp_create(struct net *net, struct socket *sock, int kern)
goto out;
sock_init_data(sock, sk);
+ sock_set_flag(sk, SOCK_RCU_FREE);
sock->state = SS_UNCONNECTED;
sock->ops = &pppol2tp_ops;
@@ -538,13 +499,14 @@ static void pppol2tp_show(struct seq_file *m, void *arg)
struct l2tp_session *session = arg;
struct sock *sk;
+ rcu_read_lock();
sk = pppol2tp_session_get_sock(session);
if (sk) {
struct pppox_sock *po = pppox_sk(sk);
seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan));
- sock_put(sk);
}
+ rcu_read_unlock();
}
static void pppol2tp_session_init(struct l2tp_session *session)
@@ -552,6 +514,7 @@ static void pppol2tp_session_init(struct l2tp_session *session)
struct pppol2tp_session *ps;
session->recv_skb = pppol2tp_recv;
+ session->session_close = pppol2tp_session_close;
if (IS_ENABLED(CONFIG_L2TP_DEBUGFS))
session->show = pppol2tp_show;
@@ -660,9 +623,68 @@ static int pppol2tp_tunnel_mtu(const struct l2tp_tunnel *tunnel)
return mtu - PPPOL2TP_HEADER_OVERHEAD;
}
+static struct l2tp_tunnel *pppol2tp_tunnel_get(struct net *net,
+ const struct l2tp_connect_info *info,
+ bool *new_tunnel)
+{
+ struct l2tp_tunnel *tunnel;
+ int error;
+
+ *new_tunnel = false;
+
+ tunnel = l2tp_tunnel_get(net, info->tunnel_id);
+
+ /* Special case: create tunnel context if session_id and
+ * peer_session_id are both 0. Otherwise look up tunnel using supplied
+ * tunnel id.
+ */
+ if (!info->session_id && !info->peer_session_id) {
+ if (!tunnel) {
+ struct l2tp_tunnel_cfg tcfg = {
+ .encap = L2TP_ENCAPTYPE_UDP,
+ };
+
+ /* Prevent l2tp_tunnel_register() from trying to set up
+ * a kernel socket.
+ */
+ if (info->fd < 0)
+ return ERR_PTR(-EBADF);
+
+ error = l2tp_tunnel_create(info->fd,
+ info->version,
+ info->tunnel_id,
+ info->peer_tunnel_id, &tcfg,
+ &tunnel);
+ if (error < 0)
+ return ERR_PTR(error);
+
+ refcount_inc(&tunnel->ref_count);
+ error = l2tp_tunnel_register(tunnel, net, &tcfg);
+ if (error < 0) {
+ kfree(tunnel);
+ return ERR_PTR(error);
+ }
+
+ *new_tunnel = true;
+ }
+ } else {
+ /* Error if we can't find the tunnel */
+ if (!tunnel)
+ return ERR_PTR(-ENOENT);
+
+ /* Error if socket is not prepped */
+ if (!tunnel->sock) {
+ l2tp_tunnel_put(tunnel);
+ return ERR_PTR(-ENOENT);
+ }
+ }
+
+ return tunnel;
+}
+
/* connect() handler. Attach a PPPoX socket to a tunnel UDP socket
*/
-static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
+static int pppol2tp_connect(struct socket *sock, struct sockaddr_unsized *uservaddr,
int sockaddr_len, int flags)
{
struct sock *sk = sock->sk;
@@ -673,7 +695,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
struct pppol2tp_session *ps;
struct l2tp_session_cfg cfg = { 0, };
bool drop_refcnt = false;
- bool drop_tunnel = false;
bool new_session = false;
bool new_tunnel = false;
int error;
@@ -682,6 +703,14 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
if (error < 0)
return error;
+ /* Don't bind if tunnel_id is 0 */
+ if (!info.tunnel_id)
+ return -EINVAL;
+
+ tunnel = pppol2tp_tunnel_get(sock_net(sk), &info, &new_tunnel);
+ if (IS_ERR(tunnel))
+ return PTR_ERR(tunnel);
+
lock_sock(sk);
/* Check for already bound sockets */
@@ -694,67 +723,11 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
if (sk->sk_user_data)
goto end; /* socket is already attached */
- /* Don't bind if tunnel_id is 0 */
- error = -EINVAL;
- if (!info.tunnel_id)
- goto end;
-
- tunnel = l2tp_tunnel_get(sock_net(sk), info.tunnel_id);
- if (tunnel)
- drop_tunnel = true;
-
- /* Special case: create tunnel context if session_id and
- * peer_session_id is 0. Otherwise look up tunnel using supplied
- * tunnel id.
- */
- if (!info.session_id && !info.peer_session_id) {
- if (tunnel == NULL) {
- struct l2tp_tunnel_cfg tcfg = {
- .encap = L2TP_ENCAPTYPE_UDP,
- .debug = 0,
- };
-
- /* Prevent l2tp_tunnel_register() from trying to set up
- * a kernel socket.
- */
- if (info.fd < 0) {
- error = -EBADF;
- goto end;
- }
-
- error = l2tp_tunnel_create(sock_net(sk), info.fd,
- info.version,
- info.tunnel_id,
- info.peer_tunnel_id, &tcfg,
- &tunnel);
- if (error < 0)
- goto end;
-
- l2tp_tunnel_inc_refcount(tunnel);
- error = l2tp_tunnel_register(tunnel, sock_net(sk),
- &tcfg);
- if (error < 0) {
- kfree(tunnel);
- goto end;
- }
- drop_tunnel = true;
- new_tunnel = true;
- }
- } else {
- /* Error if we can't find the tunnel */
- error = -ENOENT;
- if (tunnel == NULL)
- goto end;
-
- /* Error if socket is not prepped */
- if (tunnel->sock == NULL)
- goto end;
- }
-
if (tunnel->peer_tunnel_id == 0)
tunnel->peer_tunnel_id = info.peer_tunnel_id;
- session = l2tp_tunnel_get_session(tunnel, info.session_id);
+ session = l2tp_session_get(sock_net(sk), tunnel->sock, tunnel->version,
+ info.tunnel_id, info.session_id);
if (session) {
drop_refcnt = true;
@@ -787,18 +760,20 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
goto end;
}
+ drop_refcnt = true;
+
pppol2tp_session_init(session);
ps = l2tp_session_priv(session);
- l2tp_session_inc_refcount(session);
+ refcount_inc(&session->ref_count);
mutex_lock(&ps->sk_lock);
error = l2tp_session_register(session, tunnel);
if (error < 0) {
mutex_unlock(&ps->sk_lock);
- kfree(session);
+ l2tp_session_put(session);
goto end;
}
- drop_refcnt = true;
+
new_session = true;
}
@@ -807,8 +782,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
* the internal context for use by ioctl() and sockopt()
* handlers.
*/
- if ((session->session_id == 0) &&
- (session->peer_session_id == 0)) {
+ if (session->session_id == 0 && session->peer_session_id == 0) {
error = 0;
goto out_no_ppp;
}
@@ -822,6 +796,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
po->chan.private = sk;
po->chan.ops = &pppol2tp_chan_ops;
po->chan.mtu = pppol2tp_tunnel_mtu(tunnel);
+ po->chan.direct_xmit = true;
error = ppp_register_net_channel(sock_net(sk), &po->chan);
if (error) {
@@ -831,19 +806,18 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
out_no_ppp:
/* This is how we get the session context from the socket. */
- sk->sk_user_data = session;
+ sock_hold(sk);
+ rcu_assign_sk_user_data(sk, session);
rcu_assign_pointer(ps->sk, sk);
mutex_unlock(&ps->sk_lock);
/* Keep the reference we've grabbed on the session: sk doesn't expect
- * the session to disappear. pppol2tp_session_destruct() is responsible
+ * the session to disappear. pppol2tp_session_close() is responsible
* for dropping it.
*/
drop_refcnt = false;
sk->sk_state = PPPOX_CONNECTED;
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: created\n",
- session->name);
end:
if (error) {
@@ -853,9 +827,8 @@ end:
l2tp_tunnel_delete(tunnel);
}
if (drop_refcnt)
- l2tp_session_dec_refcount(session);
- if (drop_tunnel)
- l2tp_tunnel_dec_refcount(tunnel);
+ l2tp_session_put(session);
+ l2tp_tunnel_put(tunnel);
release_sock(sk);
return error;
@@ -895,7 +868,7 @@ static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel,
return 0;
err_sess:
- kfree(session);
+ l2tp_session_put(session);
err:
return error;
}
@@ -916,22 +889,23 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
struct pppol2tp_session *pls;
error = -ENOTCONN;
- if (sk == NULL)
+ if (!sk)
goto end;
if (!(sk->sk_state & PPPOX_CONNECTED))
goto end;
error = -EBADF;
session = pppol2tp_sock_to_session(sk);
- if (session == NULL)
+ if (!session)
goto end;
pls = l2tp_session_priv(session);
tunnel = session->tunnel;
inet = inet_sk(tunnel->sock);
- if ((tunnel->version == 2) && (tunnel->sock->sk_family == AF_INET)) {
+ if (tunnel->version == 2 && tunnel->sock->sk_family == AF_INET) {
struct sockaddr_pppol2tp sp;
+
len = sizeof(sp);
memset(&sp, 0, len);
sp.sa_family = AF_PPPOX;
@@ -947,8 +921,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
sp.pppol2tp.addr.sin_addr.s_addr = inet->inet_daddr;
memcpy(uaddr, &sp, len);
#if IS_ENABLED(CONFIG_IPV6)
- } else if ((tunnel->version == 2) &&
- (tunnel->sock->sk_family == AF_INET6)) {
+ } else if (tunnel->version == 2 && tunnel->sock->sk_family == AF_INET6) {
struct sockaddr_pppol2tpin6 sp;
len = sizeof(sp);
@@ -966,8 +939,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
memcpy(&sp.pppol2tp.addr.sin6_addr, &tunnel->sock->sk_v6_daddr,
sizeof(tunnel->sock->sk_v6_daddr));
memcpy(uaddr, &sp, len);
- } else if ((tunnel->version == 3) &&
- (tunnel->sock->sk_family == AF_INET6)) {
+ } else if (tunnel->version == 3 && tunnel->sock->sk_family == AF_INET6) {
struct sockaddr_pppol2tpv3in6 sp;
len = sizeof(sp);
@@ -988,6 +960,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
#endif
} else if (tunnel->version == 3) {
struct sockaddr_pppol2tpv3 sp;
+
len = sizeof(sp);
memset(&sp, 0, len);
sp.sa_family = AF_PPPOX;
@@ -1006,7 +979,7 @@ static int pppol2tp_getname(struct socket *sock, struct sockaddr *uaddr,
error = len;
- sock_put(sk);
+ l2tp_session_put(session);
end:
return error;
}
@@ -1050,17 +1023,18 @@ static int pppol2tp_tunnel_copy_stats(struct pppol2tp_ioc_stats *stats,
/* If session_id is set, search the corresponding session in the
* context of this tunnel and record the session's statistics.
*/
- session = l2tp_tunnel_get_session(tunnel, stats->session_id);
+ session = l2tp_session_get(tunnel->l2tp_net, tunnel->sock, tunnel->version,
+ tunnel->tunnel_id, stats->session_id);
if (!session)
return -EBADR;
if (session->pwtype != L2TP_PWTYPE_PPP) {
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
return -EBADR;
}
pppol2tp_copy_stats(stats, &session->stats);
- l2tp_session_dec_refcount(session);
+ l2tp_session_put(session);
return 0;
}
@@ -1070,7 +1044,6 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
{
struct pppol2tp_ioc_stats stats;
struct l2tp_session *session;
- int val;
switch (cmd) {
case PPPIOCGMRU:
@@ -1079,6 +1052,9 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
if (!session)
return -ENOTCONN;
+ if (WARN_ON(session->magic != L2TP_SESSION_MAGIC))
+ return -EBADF;
+
/* Not defined for tunnels */
if (!session->session_id && !session->peer_session_id)
return -ENOSYS;
@@ -1093,11 +1069,14 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
if (!session)
return -ENOTCONN;
+ if (WARN_ON(session->magic != L2TP_SESSION_MAGIC))
+ return -EBADF;
+
/* Not defined for tunnels */
if (!session->session_id && !session->peer_session_id)
return -ENOSYS;
- if (get_user(val, (int __user *)arg))
+ if (!access_ok((int __user *)arg, sizeof(int)))
return -EFAULT;
break;
@@ -1106,6 +1085,9 @@ static int pppol2tp_ioctl(struct socket *sock, unsigned int cmd,
if (!session)
return -ENOTCONN;
+ if (WARN_ON(session->magic != L2TP_SESSION_MAGIC))
+ return -EBADF;
+
/* Session 0 represents the parent tunnel */
if (!session->session_id && !session->peer_session_id) {
u32 session_id;
@@ -1160,9 +1142,7 @@ static int pppol2tp_tunnel_setsockopt(struct sock *sk,
switch (optname) {
case PPPOL2TP_SO_DEBUG:
- tunnel->debug = val;
- l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: set debug=%x\n",
- tunnel->name, tunnel->debug);
+ /* Tunnel debug flags option is deprecated */
break;
default:
@@ -1183,18 +1163,15 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
switch (optname) {
case PPPOL2TP_SO_RECVSEQ:
- if ((val != 0) && (val != 1)) {
+ if (val != 0 && val != 1) {
err = -EINVAL;
break;
}
session->recv_seq = !!val;
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: set recv_seq=%d\n",
- session->name, session->recv_seq);
break;
case PPPOL2TP_SO_SENDSEQ:
- if ((val != 0) && (val != 1)) {
+ if (val != 0 && val != 1) {
err = -EINVAL;
break;
}
@@ -1205,34 +1182,24 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
po->chan.hdrlen = val ? PPPOL2TP_L2TP_HDR_SIZE_SEQ :
PPPOL2TP_L2TP_HDR_SIZE_NOSEQ;
}
- l2tp_session_set_header_len(session, session->tunnel->version);
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: set send_seq=%d\n",
- session->name, session->send_seq);
+ l2tp_session_set_header_len(session, session->tunnel->version,
+ session->tunnel->encap);
break;
case PPPOL2TP_SO_LNSMODE:
- if ((val != 0) && (val != 1)) {
+ if (val != 0 && val != 1) {
err = -EINVAL;
break;
}
session->lns_mode = !!val;
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: set lns_mode=%d\n",
- session->name, session->lns_mode);
break;
case PPPOL2TP_SO_DEBUG:
- session->debug = val;
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: set debug=%x\n",
- session->name, session->debug);
+ /* Session debug flags option is deprecated */
break;
case PPPOL2TP_SO_REORDERTO:
session->reorder_timeout = msecs_to_jiffies(val);
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: set reorder_timeout=%d\n",
- session->name, session->reorder_timeout);
break;
default:
@@ -1249,7 +1216,7 @@ static int pppol2tp_session_setsockopt(struct sock *sk,
* session or the special tunnel type.
*/
static int pppol2tp_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct l2tp_session *session;
@@ -1263,30 +1230,29 @@ static int pppol2tp_setsockopt(struct socket *sock, int level, int optname,
if (optlen < sizeof(int))
return -EINVAL;
- if (get_user(val, (int __user *)optval))
+ if (copy_from_sockptr(&val, optval, sizeof(int)))
return -EFAULT;
err = -ENOTCONN;
- if (sk->sk_user_data == NULL)
+ if (!sk->sk_user_data)
goto end;
/* Get session context from the socket */
err = -EBADF;
session = pppol2tp_sock_to_session(sk);
- if (session == NULL)
+ if (!session)
goto end;
/* Special case: if session_id == 0x0000, treat as operation on tunnel
*/
- if ((session->session_id == 0) &&
- (session->peer_session_id == 0)) {
+ if (session->session_id == 0 && session->peer_session_id == 0) {
tunnel = session->tunnel;
err = pppol2tp_tunnel_setsockopt(sk, tunnel, optname, val);
} else {
err = pppol2tp_session_setsockopt(sk, session, optname, val);
}
- sock_put(sk);
+ l2tp_session_put(session);
end:
return err;
}
@@ -1301,9 +1267,8 @@ static int pppol2tp_tunnel_getsockopt(struct sock *sk,
switch (optname) {
case PPPOL2TP_SO_DEBUG:
- *val = tunnel->debug;
- l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: get debug=%x\n",
- tunnel->name, tunnel->debug);
+ /* Tunnel debug flags option is deprecated */
+ *val = 0;
break;
default:
@@ -1325,32 +1290,23 @@ static int pppol2tp_session_getsockopt(struct sock *sk,
switch (optname) {
case PPPOL2TP_SO_RECVSEQ:
*val = session->recv_seq;
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: get recv_seq=%d\n", session->name, *val);
break;
case PPPOL2TP_SO_SENDSEQ:
*val = session->send_seq;
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: get send_seq=%d\n", session->name, *val);
break;
case PPPOL2TP_SO_LNSMODE:
*val = session->lns_mode;
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: get lns_mode=%d\n", session->name, *val);
break;
case PPPOL2TP_SO_DEBUG:
- *val = session->debug;
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: get debug=%d\n",
- session->name, *val);
+ /* Session debug flags option is deprecated */
+ *val = 0;
break;
case PPPOL2TP_SO_REORDERTO:
- *val = (int) jiffies_to_msecs(session->reorder_timeout);
- l2tp_info(session, L2TP_MSG_CONTROL,
- "%s: get reorder_timeout=%d\n", session->name, *val);
+ *val = (int)jiffies_to_msecs(session->reorder_timeout);
break;
default:
@@ -1380,24 +1336,23 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname,
if (get_user(len, optlen))
return -EFAULT;
- len = min_t(unsigned int, len, sizeof(int));
-
if (len < 0)
return -EINVAL;
+ len = min_t(unsigned int, len, sizeof(int));
+
err = -ENOTCONN;
- if (sk->sk_user_data == NULL)
+ if (!sk->sk_user_data)
goto end;
/* Get the session context */
err = -EBADF;
session = pppol2tp_sock_to_session(sk);
- if (session == NULL)
+ if (!session)
goto end;
/* Special case: if session_id == 0x0000, treat as operation on tunnel */
- if ((session->session_id == 0) &&
- (session->peer_session_id == 0)) {
+ if (session->session_id == 0 && session->peer_session_id == 0) {
tunnel = session->tunnel;
err = pppol2tp_tunnel_getsockopt(sk, tunnel, optname, &val);
if (err)
@@ -1412,13 +1367,13 @@ static int pppol2tp_getsockopt(struct socket *sock, int level, int optname,
if (put_user(len, optlen))
goto end_put_sess;
- if (copy_to_user((void __user *) optval, &val, len))
+ if (copy_to_user((void __user *)optval, &val, len))
goto end_put_sess;
err = 0;
end_put_sess:
- sock_put(sk);
+ l2tp_session_put(session);
end:
return err;
}
@@ -1429,14 +1384,12 @@ end:
* L2TPv2, we dump only L2TPv2 tunnels and sessions here.
*****************************************************************************/
-static unsigned int pppol2tp_net_id;
-
#ifdef CONFIG_PROC_FS
struct pppol2tp_seq_data {
struct seq_net_private p;
- int tunnel_idx; /* current tunnel */
- int session_idx; /* index of session within current tunnel */
+ unsigned long tkey; /* lookup key of current tunnel */
+ unsigned long skey; /* lookup key of current session */
struct l2tp_tunnel *tunnel;
struct l2tp_session *session; /* NULL means get next tunnel */
};
@@ -1445,17 +1398,17 @@ static void pppol2tp_next_tunnel(struct net *net, struct pppol2tp_seq_data *pd)
{
/* Drop reference taken during previous invocation */
if (pd->tunnel)
- l2tp_tunnel_dec_refcount(pd->tunnel);
+ l2tp_tunnel_put(pd->tunnel);
for (;;) {
- pd->tunnel = l2tp_tunnel_get_nth(net, pd->tunnel_idx);
- pd->tunnel_idx++;
+ pd->tunnel = l2tp_tunnel_get_next(net, &pd->tkey);
+ pd->tkey++;
/* Only accept L2TPv2 tunnels */
if (!pd->tunnel || pd->tunnel->version == 2)
return;
- l2tp_tunnel_dec_refcount(pd->tunnel);
+ l2tp_tunnel_put(pd->tunnel);
}
}
@@ -1463,13 +1416,15 @@ static void pppol2tp_next_session(struct net *net, struct pppol2tp_seq_data *pd)
{
/* Drop reference taken during previous invocation */
if (pd->session)
- l2tp_session_dec_refcount(pd->session);
+ l2tp_session_put(pd->session);
- pd->session = l2tp_session_get_nth(pd->tunnel, pd->session_idx);
- pd->session_idx++;
+ pd->session = l2tp_session_get_next(net, pd->tunnel->sock,
+ pd->tunnel->version,
+ pd->tunnel->tunnel_id, &pd->skey);
+ pd->skey++;
- if (pd->session == NULL) {
- pd->session_idx = 0;
+ if (!pd->session) {
+ pd->skey = 0;
pppol2tp_next_tunnel(net, pd);
}
}
@@ -1483,17 +1438,21 @@ static void *pppol2tp_seq_start(struct seq_file *m, loff_t *offs)
if (!pos)
goto out;
- BUG_ON(m->private == NULL);
+ if (WARN_ON(!m->private)) {
+ pd = NULL;
+ goto out;
+ }
+
pd = m->private;
net = seq_file_net(m);
- if (pd->tunnel == NULL)
+ if (!pd->tunnel)
pppol2tp_next_tunnel(net, pd);
else
pppol2tp_next_session(net, pd);
/* NULL tunnel and session indicates end of list */
- if ((pd->tunnel == NULL) && (pd->session == NULL))
+ if (!pd->tunnel && !pd->session)
pd = NULL;
out:
@@ -1517,11 +1476,11 @@ static void pppol2tp_seq_stop(struct seq_file *p, void *v)
* or pppol2tp_next_tunnel().
*/
if (pd->session) {
- l2tp_session_dec_refcount(pd->session);
+ l2tp_session_put(pd->session);
pd->session = NULL;
}
if (pd->tunnel) {
- l2tp_tunnel_dec_refcount(pd->tunnel);
+ l2tp_tunnel_put(pd->tunnel);
pd->tunnel = NULL;
}
}
@@ -1532,10 +1491,10 @@ static void pppol2tp_seq_tunnel_show(struct seq_file *m, void *v)
seq_printf(m, "\nTUNNEL '%s', %c %d\n",
tunnel->name,
- (tunnel == tunnel->sock->sk_user_data) ? 'Y' : 'N',
+ tunnel->sock ? 'Y' : 'N',
refcount_read(&tunnel->ref_count) - 1);
seq_printf(m, " %08x %ld/%ld/%ld %ld/%ld/%ld\n",
- tunnel->debug,
+ 0,
atomic_long_read(&tunnel->stats.tx_packets),
atomic_long_read(&tunnel->stats.tx_bytes),
atomic_long_read(&tunnel->stats.tx_errors),
@@ -1556,10 +1515,12 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
if (tunnel->sock) {
struct inet_sock *inet = inet_sk(tunnel->sock);
+
ip = ntohl(inet->inet_saddr);
port = ntohs(inet->inet_sport);
}
+ rcu_read_lock();
sk = pppol2tp_session_get_sock(session);
if (sk) {
state = sk->sk_state;
@@ -1569,8 +1530,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
user_data_ok = 'N';
}
- seq_printf(m, " SESSION '%s' %08X/%d %04X/%04X -> "
- "%04X/%04X %d %c\n",
+ seq_printf(m, " SESSION '%s' %08X/%d %04X/%04X -> %04X/%04X %d %c\n",
session->name, ip, port,
tunnel->tunnel_id,
session->session_id,
@@ -1581,9 +1541,9 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
session->recv_seq ? 'R' : '-',
session->send_seq ? 'S' : '-',
session->lns_mode ? "LNS" : "LAC",
- session->debug,
+ 0,
jiffies_to_msecs(session->reorder_timeout));
- seq_printf(m, " %hu/%hu %ld/%ld/%ld %ld/%ld/%ld\n",
+ seq_printf(m, " %u/%u %ld/%ld/%ld %ld/%ld/%ld\n",
session->nr, session->ns,
atomic_long_read(&session->stats.tx_packets),
atomic_long_read(&session->stats.tx_bytes),
@@ -1596,8 +1556,8 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
struct pppox_sock *po = pppox_sk(sk);
seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan));
- sock_put(sk);
}
+ rcu_read_unlock();
}
static int pppol2tp_seq_show(struct seq_file *m, void *v)
@@ -1609,8 +1569,7 @@ static int pppol2tp_seq_show(struct seq_file *m, void *v)
seq_puts(m, "PPPoL2TP driver info, " PPPOL2TP_DRV_VERSION "\n");
seq_puts(m, "TUNNEL name, user-data-ok session-count\n");
seq_puts(m, " debug tx-pkts/bytes/errs rx-pkts/bytes/errs\n");
- seq_puts(m, " SESSION name, addr/port src-tid/sid "
- "dest-tid/sid state user-data-ok\n");
+ seq_puts(m, " SESSION name, addr/port src-tid/sid dest-tid/sid state user-data-ok\n");
seq_puts(m, " mtu/mru/rcvseq/sendseq/lns debug reorderto\n");
seq_puts(m, " nr/ns tx-pkts/bytes/errs rx-pkts/bytes/errs\n");
goto out;
@@ -1643,7 +1602,7 @@ static __net_init int pppol2tp_init_net(struct net *net)
int err = 0;
pde = proc_create_net("pppol2tp", 0444, net->proc_net,
- &pppol2tp_seq_ops, sizeof(struct pppol2tp_seq_data));
+ &pppol2tp_seq_ops, sizeof(struct pppol2tp_seq_data));
if (!pde) {
err = -ENOMEM;
goto out;
@@ -1661,7 +1620,6 @@ static __net_exit void pppol2tp_exit_net(struct net *net)
static struct pernet_operations pppol2tp_net_ops = {
.init = pppol2tp_init_net,
.exit = pppol2tp_exit_net,
- .id = &pppol2tp_net_id,
};
/*****************************************************************************
@@ -1686,6 +1644,9 @@ static const struct proto_ops pppol2tp_ops = {
.recvmsg = pppol2tp_recvmsg,
.mmap = sock_no_mmap,
.ioctl = pppox_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = pppox_compat_ioctl,
+#endif
};
static const struct pppox_proto pppol2tp_proto = {
diff --git a/net/l2tp/trace.h b/net/l2tp/trace.h
new file mode 100644
index 000000000000..8596eaa12a2e
--- /dev/null
+++ b/net/l2tp/trace.h
@@ -0,0 +1,211 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM l2tp
+
+#if !defined(_TRACE_L2TP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_L2TP_H
+
+#include <linux/tracepoint.h>
+#include <linux/l2tp.h>
+#include "l2tp_core.h"
+
+#define encap_type_name(e) { L2TP_ENCAPTYPE_##e, #e }
+#define show_encap_type_name(val) \
+ __print_symbolic(val, \
+ encap_type_name(UDP), \
+ encap_type_name(IP))
+
+#define pw_type_name(p) { L2TP_PWTYPE_##p, #p }
+#define show_pw_type_name(val) \
+ __print_symbolic(val, \
+ pw_type_name(ETH_VLAN), \
+ pw_type_name(ETH), \
+ pw_type_name(PPP), \
+ pw_type_name(PPP_AC), \
+ pw_type_name(IP))
+
+DECLARE_EVENT_CLASS(tunnel_only_evt,
+ TP_PROTO(struct l2tp_tunnel *tunnel),
+ TP_ARGS(tunnel),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_TUNNEL_NAME_MAX)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, tunnel->name, L2TP_TUNNEL_NAME_MAX);
+ ),
+ TP_printk("%s", __entry->name)
+);
+
+DECLARE_EVENT_CLASS(session_only_evt,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_SESSION_NAME_MAX)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX);
+ ),
+ TP_printk("%s", __entry->name)
+);
+
+TRACE_EVENT(register_tunnel,
+ TP_PROTO(struct l2tp_tunnel *tunnel),
+ TP_ARGS(tunnel),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_TUNNEL_NAME_MAX)
+ __field(int, fd)
+ __field(u32, tid)
+ __field(u32, ptid)
+ __field(int, version)
+ __field(enum l2tp_encap_type, encap)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, tunnel->name, L2TP_TUNNEL_NAME_MAX);
+ __entry->fd = tunnel->fd;
+ __entry->tid = tunnel->tunnel_id;
+ __entry->ptid = tunnel->peer_tunnel_id;
+ __entry->version = tunnel->version;
+ __entry->encap = tunnel->encap;
+ ),
+ TP_printk("%s: type=%s encap=%s version=L2TPv%d tid=%u ptid=%u fd=%d",
+ __entry->name,
+ __entry->fd >= 0 ? "managed" : "unmanaged", /* fd 0 is a valid fd */
+ show_encap_type_name(__entry->encap),
+ __entry->version,
+ __entry->tid,
+ __entry->ptid,
+ __entry->fd)
+);
+
+DEFINE_EVENT(tunnel_only_evt, delete_tunnel,
+ TP_PROTO(struct l2tp_tunnel *tunnel),
+ TP_ARGS(tunnel)
+);
+
+DEFINE_EVENT(tunnel_only_evt, free_tunnel,
+ TP_PROTO(struct l2tp_tunnel *tunnel),
+ TP_ARGS(tunnel)
+);
+
+TRACE_EVENT(register_session,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_SESSION_NAME_MAX)
+ __field(u32, tid)
+ __field(u32, ptid)
+ __field(u32, sid)
+ __field(u32, psid)
+ __field(enum l2tp_pwtype, pwtype)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX);
+ __entry->tid = session->tunnel ? session->tunnel->tunnel_id : 0;
+ __entry->ptid = session->tunnel ? session->tunnel->peer_tunnel_id : 0;
+ __entry->sid = session->session_id;
+ __entry->psid = session->peer_session_id;
+ __entry->pwtype = session->pwtype;
+ ),
+ TP_printk("%s: pseudowire=%s sid=%u psid=%u tid=%u ptid=%u",
+ __entry->name,
+ show_pw_type_name(__entry->pwtype),
+ __entry->sid,
+ __entry->psid,
+ __entry->tid,
+ __entry->ptid)
+);
+
+DEFINE_EVENT(session_only_evt, delete_session,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DEFINE_EVENT(session_only_evt, free_session,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DEFINE_EVENT(session_only_evt, session_seqnum_lns_enable,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DEFINE_EVENT(session_only_evt, session_seqnum_lns_disable,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DECLARE_EVENT_CLASS(session_seqnum_evt,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_SESSION_NAME_MAX)
+ __field(u32, ns)
+ __field(u32, nr)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX);
+ __entry->ns = session->ns;
+ __entry->nr = session->nr;
+ ),
+ TP_printk("%s: ns=%u nr=%u",
+ __entry->name,
+ __entry->ns,
+ __entry->nr)
+);
+
+DEFINE_EVENT(session_seqnum_evt, session_seqnum_update,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DEFINE_EVENT(session_seqnum_evt, session_seqnum_reset,
+ TP_PROTO(struct l2tp_session *session),
+ TP_ARGS(session)
+);
+
+DECLARE_EVENT_CLASS(session_pkt_discard_evt,
+ TP_PROTO(struct l2tp_session *session, u32 pkt_ns),
+ TP_ARGS(session, pkt_ns),
+ TP_STRUCT__entry(
+ __array(char, name, L2TP_SESSION_NAME_MAX)
+ __field(u32, pkt_ns)
+ __field(u32, my_nr)
+ __field(u32, reorder_q_len)
+ ),
+ TP_fast_assign(
+ memcpy(__entry->name, session->name, L2TP_SESSION_NAME_MAX);
+ __entry->pkt_ns = pkt_ns;
+ __entry->my_nr = session->nr;
+ __entry->reorder_q_len = skb_queue_len(&session->reorder_q);
+ ),
+ TP_printk("%s: pkt_ns=%u my_nr=%u reorder_q_len=%u",
+ __entry->name,
+ __entry->pkt_ns,
+ __entry->my_nr,
+ __entry->reorder_q_len)
+);
+
+DEFINE_EVENT(session_pkt_discard_evt, session_pkt_expired,
+ TP_PROTO(struct l2tp_session *session, u32 pkt_ns),
+ TP_ARGS(session, pkt_ns)
+);
+
+DEFINE_EVENT(session_pkt_discard_evt, session_pkt_outside_rx_window,
+ TP_PROTO(struct l2tp_session *session, u32 pkt_ns),
+ TP_ARGS(session, pkt_ns)
+);
+
+DEFINE_EVENT(session_pkt_discard_evt, session_pkt_oos,
+ TP_PROTO(struct l2tp_session *session, u32 pkt_ns),
+ TP_ARGS(session, pkt_ns)
+);
+
+#endif /* _TRACE_L2TP_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/net/l3mdev/Kconfig b/net/l3mdev/Kconfig
index 5d47325037bc..2b2861e1fb7d 100644
--- a/net/l3mdev/Kconfig
+++ b/net/l3mdev/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Configuration for L3 master device support
#
@@ -5,6 +6,6 @@
config NET_L3_MASTER_DEV
bool "L3 Master device support"
depends on INET || IPV6
- ---help---
+ help
This module provides glue between core networking code and device
drivers to support L3 master devices like VRF.
diff --git a/net/l3mdev/Makefile b/net/l3mdev/Makefile
index 84a53a6f609a..9e7da0acc58c 100644
--- a/net/l3mdev/Makefile
+++ b/net/l3mdev/Makefile
@@ -1,5 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the L3 device API
#
-obj-$(CONFIG_NET_L3_MASTER_DEV) += l3mdev.o
+obj-y += l3mdev.o
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index 8da86ceca33d..5432a5f2dfc8 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -1,20 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* net/l3mdev/l3mdev.c - L3 master device implementation
* Copyright (c) 2015 Cumulus Networks
* Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
*/
#include <linux/netdevice.h>
#include <net/fib_rules.h>
#include <net/l3mdev.h>
+static DEFINE_SPINLOCK(l3mdev_lock);
+
+struct l3mdev_handler {
+ lookup_by_table_id_t dev_lookup;
+};
+
+static struct l3mdev_handler l3mdev_handlers[L3MDEV_TYPE_MAX + 1];
+
+static int l3mdev_check_type(enum l3mdev_type l3type)
+{
+ if (l3type <= L3MDEV_TYPE_UNSPEC || l3type > L3MDEV_TYPE_MAX)
+ return -EINVAL;
+
+ return 0;
+}
+
+int l3mdev_table_lookup_register(enum l3mdev_type l3type,
+ lookup_by_table_id_t fn)
+{
+ struct l3mdev_handler *hdlr;
+ int res;
+
+ res = l3mdev_check_type(l3type);
+ if (res)
+ return res;
+
+ hdlr = &l3mdev_handlers[l3type];
+
+ spin_lock(&l3mdev_lock);
+
+ if (hdlr->dev_lookup) {
+ res = -EBUSY;
+ goto unlock;
+ }
+
+ hdlr->dev_lookup = fn;
+ res = 0;
+
+unlock:
+ spin_unlock(&l3mdev_lock);
+
+ return res;
+}
+EXPORT_SYMBOL_GPL(l3mdev_table_lookup_register);
+
+void l3mdev_table_lookup_unregister(enum l3mdev_type l3type,
+ lookup_by_table_id_t fn)
+{
+ struct l3mdev_handler *hdlr;
+
+ if (l3mdev_check_type(l3type))
+ return;
+
+ hdlr = &l3mdev_handlers[l3type];
+
+ spin_lock(&l3mdev_lock);
+
+ if (hdlr->dev_lookup == fn)
+ hdlr->dev_lookup = NULL;
+
+ spin_unlock(&l3mdev_lock);
+}
+EXPORT_SYMBOL_GPL(l3mdev_table_lookup_unregister);
+
+int l3mdev_ifindex_lookup_by_table_id(enum l3mdev_type l3type,
+ struct net *net, u32 table_id)
+{
+ lookup_by_table_id_t lookup;
+ struct l3mdev_handler *hdlr;
+ int ifindex = -EINVAL;
+ int res;
+
+ res = l3mdev_check_type(l3type);
+ if (res)
+ return res;
+
+ hdlr = &l3mdev_handlers[l3type];
+
+ spin_lock(&l3mdev_lock);
+
+ lookup = hdlr->dev_lookup;
+ if (!lookup)
+ goto unlock;
+
+ ifindex = lookup(net, table_id);
+
+unlock:
+ spin_unlock(&l3mdev_lock);
+
+ return ifindex;
+}
+EXPORT_SYMBOL_GPL(l3mdev_ifindex_lookup_by_table_id);
+
/**
- * l3mdev_master_ifindex - get index of L3 master device
+ * l3mdev_master_ifindex_rcu - get index of L3 master device
* @dev: targeted interface
*/
@@ -47,7 +136,25 @@ int l3mdev_master_ifindex_rcu(const struct net_device *dev)
EXPORT_SYMBOL_GPL(l3mdev_master_ifindex_rcu);
/**
- * l3mdev_fib_table - get FIB table id associated with an L3
+ * l3mdev_master_upper_ifindex_by_index_rcu - get index of upper l3 master
+ * device
+ * @net: network namespace for device index lookup
+ * @ifindex: targeted interface
+ */
+int l3mdev_master_upper_ifindex_by_index_rcu(struct net *net, int ifindex)
+{
+	struct net_device *dev = dev_get_by_index_rcu(net, ifindex);
+
+	/*
+	 * Start at the device itself and climb its chain of master uppers;
+	 * the first L3 master met on the way supplies the result. Running
+	 * out of devices (or an unknown @ifindex) yields 0.
+	 */
+	for (; dev; dev = netdev_master_upper_dev_get_rcu(dev)) {
+		if (netif_is_l3_master(dev))
+			return dev->ifindex;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(l3mdev_master_upper_ifindex_by_index_rcu);
+
+/**
+ * l3mdev_fib_table_rcu - get FIB table id associated with an L3
* master interface
* @dev: targeted interface
*/
@@ -104,6 +211,8 @@ EXPORT_SYMBOL_GPL(l3mdev_fib_table_by_index);
* local and multicast addresses
* @net: network namespace for device index lookup
* @fl6: IPv6 flow struct for lookup
+ * This function does not hold refcnt on the returned dst.
+ * Caller must hold rcu_read_lock().
*/
struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
@@ -112,9 +221,8 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
struct dst_entry *dst = NULL;
struct net_device *dev;
+ WARN_ON_ONCE(!rcu_read_lock_held());
if (fl6->flowi6_oif) {
- rcu_read_lock();
-
dev = dev_get_by_index_rcu(net, fl6->flowi6_oif);
if (dev && netif_is_l3_slave(dev))
dev = netdev_master_upper_dev_get_rcu(dev);
@@ -122,8 +230,6 @@ struct dst_entry *l3mdev_link_scope_lookup(struct net *net,
if (dev && netif_is_l3_master(dev) &&
dev->l3mdev_ops->l3mdev_link_scope_lookup)
dst = dev->l3mdev_ops->l3mdev_link_scope_lookup(dev, fl6);
-
- rcu_read_unlock();
}
return dst;
@@ -135,6 +241,7 @@ EXPORT_SYMBOL_GPL(l3mdev_link_scope_lookup);
* L3 master device
* @net: network namespace for device index lookup
* @fl: flow struct
+ * @arg: store the table the rule matched with here
*/
int l3mdev_fib_rule_match(struct net *net, struct flowi *fl,
@@ -143,25 +250,19 @@ int l3mdev_fib_rule_match(struct net *net, struct flowi *fl,
struct net_device *dev;
int rc = 0;
- rcu_read_lock();
+ /* update flow ensures flowi_l3mdev is set when relevant */
+ if (!fl->flowi_l3mdev)
+ return 0;
- dev = dev_get_by_index_rcu(net, fl->flowi_oif);
- if (dev && netif_is_l3_master(dev) &&
- dev->l3mdev_ops->l3mdev_fib_table) {
- arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev);
- rc = 1;
- goto out;
- }
+ rcu_read_lock();
- dev = dev_get_by_index_rcu(net, fl->flowi_iif);
+ dev = dev_get_by_index_rcu(net, fl->flowi_l3mdev);
if (dev && netif_is_l3_master(dev) &&
dev->l3mdev_ops->l3mdev_fib_table) {
arg->table = dev->l3mdev_ops->l3mdev_fib_table(dev);
rc = 1;
- goto out;
}
-out:
rcu_read_unlock();
return rc;
@@ -170,31 +271,30 @@ out:
void l3mdev_update_flow(struct net *net, struct flowi *fl)
{
struct net_device *dev;
- int ifindex;
rcu_read_lock();
if (fl->flowi_oif) {
dev = dev_get_by_index_rcu(net, fl->flowi_oif);
if (dev) {
- ifindex = l3mdev_master_ifindex_rcu(dev);
- if (ifindex) {
- fl->flowi_oif = ifindex;
- fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF;
- goto out;
+ if (!fl->flowi_l3mdev) {
+ fl->flowi_l3mdev = l3mdev_master_ifindex_rcu(dev);
+ fl->flowi_flags |= FLOWI_FLAG_L3MDEV_OIF;
}
+
+ /* oif set to L3mdev directs lookup to its table;
+ * reset to avoid oif match in fib_lookup
+ */
+ if (netif_is_l3_master(dev))
+ fl->flowi_oif = 0;
+ goto out;
}
}
- if (fl->flowi_iif) {
+ if (fl->flowi_iif > LOOPBACK_IFINDEX && !fl->flowi_l3mdev) {
dev = dev_get_by_index_rcu(net, fl->flowi_iif);
- if (dev) {
- ifindex = l3mdev_master_ifindex_rcu(dev);
- if (ifindex) {
- fl->flowi_iif = ifindex;
- fl->flowi_flags |= FLOWI_FLAG_SKIP_NH_OIF;
- }
- }
+ if (dev)
+ fl->flowi_l3mdev = l3mdev_master_ifindex_rcu(dev);
}
out:
diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig
index 6481839b76c9..da87b47f0dff 100644
--- a/net/lapb/Kconfig
+++ b/net/lapb/Kconfig
@@ -1,10 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# LAPB Data Link Drive
#
config LAPB
tristate "LAPB Data Link Driver"
- ---help---
+ help
Link Access Procedure, Balanced (LAPB) is the data link layer (i.e.
the lower) part of the X.25 protocol. It offers a reliable
connection service to exchange data frames with one other host, and
@@ -14,7 +15,7 @@ config LAPB
currently supports LAPB only over Ethernet connections. If you want
to use LAPB connections over Ethernet, say Y here and to "LAPB over
Ethernet driver" below. Read
- <file:Documentation/networking/lapb-module.txt> for technical
+ <file:Documentation/networking/lapb-module.rst> for technical
details.
To compile this driver as a module, choose M here: the
diff --git a/net/lapb/Makefile b/net/lapb/Makefile
index fff797dfc88c..7be91b4c0ca0 100644
--- a/net/lapb/Makefile
+++ b/net/lapb/Makefile
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
#
# Makefile for the Linux LAPB layer.
#
diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c
index db6e0afe3a20..a0596e1f91da 100644
--- a/net/lapb/lapb_iface.c
+++ b/net/lapb/lapb_iface.c
@@ -1,14 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* LAPB release 002
*
* This code REQUIRES 2.1.15 or higher/ NET3.038
*
- * This module:
- * This module is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* History
* LAPB 001 Jonathan Naylor Started Coding
* LAPB 002 Jonathan Naylor New timer architecture.
@@ -73,7 +68,6 @@ static void __lapb_remove_cb(struct lapb_cb *lapb)
lapb_put(lapb);
}
}
-EXPORT_SYMBOL(lapb_register);
/*
* Add a socket to the bound sockets list.
@@ -86,11 +80,9 @@ static void __lapb_insert_cb(struct lapb_cb *lapb)
static struct lapb_cb *__lapb_devtostruct(struct net_device *dev)
{
- struct list_head *entry;
struct lapb_cb *lapb, *use = NULL;
- list_for_each(entry, &lapb_list) {
- lapb = list_entry(entry, struct lapb_cb, node);
+ list_for_each_entry(lapb, &lapb_list, node) {
if (lapb->dev == dev) {
use = lapb;
break;
@@ -120,7 +112,6 @@ static struct lapb_cb *lapb_create_cb(void)
{
struct lapb_cb *lapb = kzalloc(sizeof(*lapb), GFP_ATOMIC);
-
if (!lapb)
goto out;
@@ -129,6 +120,8 @@ static struct lapb_cb *lapb_create_cb(void)
timer_setup(&lapb->t1timer, NULL, 0);
timer_setup(&lapb->t2timer, NULL, 0);
+ lapb->t1timer_running = false;
+ lapb->t2timer_running = false;
lapb->t1 = LAPB_DEFAULT_T1;
lapb->t2 = LAPB_DEFAULT_T2;
@@ -136,6 +129,8 @@ static struct lapb_cb *lapb_create_cb(void)
lapb->mode = LAPB_DEFAULT_MODE;
lapb->window = LAPB_DEFAULT_WINDOW;
lapb->state = LAPB_STATE_0;
+
+ spin_lock_init(&lapb->lock);
refcount_set(&lapb->refcnt, 1);
out:
return lapb;
@@ -172,6 +167,7 @@ out:
write_unlock_bh(&lapb_list_lock);
return rc;
}
+EXPORT_SYMBOL(lapb_register);
int lapb_unregister(struct net_device *dev)
{
@@ -182,12 +178,25 @@ int lapb_unregister(struct net_device *dev)
lapb = __lapb_devtostruct(dev);
if (!lapb)
goto out;
+ lapb_put(lapb);
+
+ /* Wait for other refs to "lapb" to drop */
+ while (refcount_read(&lapb->refcnt) > 2)
+ usleep_range(1, 10);
+
+ spin_lock_bh(&lapb->lock);
lapb_stop_t1timer(lapb);
lapb_stop_t2timer(lapb);
lapb_clear_queues(lapb);
+ spin_unlock_bh(&lapb->lock);
+
+ /* Wait for running timers to stop */
+ timer_delete_sync(&lapb->t1timer);
+ timer_delete_sync(&lapb->t2timer);
+
__lapb_remove_cb(lapb);
lapb_put(lapb);
@@ -206,6 +215,8 @@ int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms)
if (!lapb)
goto out;
+ spin_lock_bh(&lapb->lock);
+
parms->t1 = lapb->t1 / HZ;
parms->t2 = lapb->t2 / HZ;
parms->n2 = lapb->n2;
@@ -224,6 +235,7 @@ int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms)
else
parms->t2timer = (lapb->t2timer.expires - jiffies) / HZ;
+ spin_unlock_bh(&lapb->lock);
lapb_put(lapb);
rc = LAPB_OK;
out:
@@ -239,6 +251,8 @@ int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms)
if (!lapb)
goto out;
+ spin_lock_bh(&lapb->lock);
+
rc = LAPB_INVALUE;
if (parms->t1 < 1 || parms->t2 < 1 || parms->n2 < 1)
goto out_put;
@@ -261,6 +275,7 @@ int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms)
rc = LAPB_OK;
out_put:
+ spin_unlock_bh(&lapb->lock);
lapb_put(lapb);
out:
return rc;
@@ -275,6 +290,8 @@ int lapb_connect_request(struct net_device *dev)
if (!lapb)
goto out;
+ spin_lock_bh(&lapb->lock);
+
rc = LAPB_OK;
if (lapb->state == LAPB_STATE_1)
goto out_put;
@@ -290,24 +307,18 @@ int lapb_connect_request(struct net_device *dev)
rc = LAPB_OK;
out_put:
+ spin_unlock_bh(&lapb->lock);
lapb_put(lapb);
out:
return rc;
}
EXPORT_SYMBOL(lapb_connect_request);
-int lapb_disconnect_request(struct net_device *dev)
+static int __lapb_disconnect_request(struct lapb_cb *lapb)
{
- struct lapb_cb *lapb = lapb_devtostruct(dev);
- int rc = LAPB_BADTOKEN;
-
- if (!lapb)
- goto out;
-
switch (lapb->state) {
case LAPB_STATE_0:
- rc = LAPB_NOTCONNECTED;
- goto out_put;
+ return LAPB_NOTCONNECTED;
case LAPB_STATE_1:
lapb_dbg(1, "(%p) S1 TX DISC(1)\n", lapb->dev);
@@ -315,12 +326,10 @@ int lapb_disconnect_request(struct net_device *dev)
lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND);
lapb->state = LAPB_STATE_0;
lapb_start_t1timer(lapb);
- rc = LAPB_NOTCONNECTED;
- goto out_put;
+ return LAPB_NOTCONNECTED;
case LAPB_STATE_2:
- rc = LAPB_OK;
- goto out_put;
+ return LAPB_OK;
}
lapb_clear_queues(lapb);
@@ -333,8 +342,22 @@ int lapb_disconnect_request(struct net_device *dev)
lapb_dbg(1, "(%p) S3 DISC(1)\n", lapb->dev);
lapb_dbg(0, "(%p) S3 -> S2\n", lapb->dev);
- rc = LAPB_OK;
-out_put:
+ return LAPB_OK;
+}
+
+int lapb_disconnect_request(struct net_device *dev)
+{
+ struct lapb_cb *lapb = lapb_devtostruct(dev);
+ int rc = LAPB_BADTOKEN;
+
+ if (!lapb)
+ goto out;
+
+ spin_lock_bh(&lapb->lock);
+
+ rc = __lapb_disconnect_request(lapb);
+
+ spin_unlock_bh(&lapb->lock);
lapb_put(lapb);
out:
return rc;
@@ -349,6 +372,8 @@ int lapb_data_request(struct net_device *dev, struct sk_buff *skb)
if (!lapb)
goto out;
+ spin_lock_bh(&lapb->lock);
+
rc = LAPB_NOTCONNECTED;
if (lapb->state != LAPB_STATE_3 && lapb->state != LAPB_STATE_4)
goto out_put;
@@ -357,6 +382,7 @@ int lapb_data_request(struct net_device *dev, struct sk_buff *skb)
lapb_kick(lapb);
rc = LAPB_OK;
out_put:
+ spin_unlock_bh(&lapb->lock);
lapb_put(lapb);
out:
return rc;
@@ -369,7 +395,9 @@ int lapb_data_received(struct net_device *dev, struct sk_buff *skb)
int rc = LAPB_BADTOKEN;
if (lapb) {
+ spin_lock_bh(&lapb->lock);
lapb_data_input(lapb, skb);
+ spin_unlock_bh(&lapb->lock);
lapb_put(lapb);
rc = LAPB_OK;
}
@@ -423,14 +451,98 @@ int lapb_data_transmit(struct lapb_cb *lapb, struct sk_buff *skb)
return used;
}
+/*
+ * Handle device status changes.
+ *
+ * Netdevice notifier callback. An event is ignored unless the device lives
+ * in init_net, has type ARPHRD_X25 and lapb_devtostruct() finds a LAPB
+ * control block for it. Otherwise the event is processed with lapb->lock
+ * held via spin_lock_bh():
+ *
+ * NETDEV_UP: if the carrier is already up, start T1 in DCE mode;
+ * otherwise, if the link is in state 0, move it to state 1 and
+ * establish the data link.
+ * NETDEV_GOING_DOWN: if the carrier is up, issue a disconnect request.
+ * NETDEV_DOWN: clear the queues, go to state 0, reset n2count and stop
+ * both timers.
+ * NETDEV_CHANGE: carrier up is handled like NETDEV_UP, carrier lost like
+ * NETDEV_DOWN.
+ *
+ * Always returns NOTIFY_DONE.
+ */
+static int lapb_device_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct lapb_cb *lapb;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ if (dev->type != ARPHRD_X25)
+ return NOTIFY_DONE;
+
+ /* NOTE(review): lapb_devtostruct() presumably returns a referenced
+ * control block; it is paired with the lapb_put() at the end - confirm.
+ */
+ lapb = lapb_devtostruct(dev);
+ if (!lapb)
+ return NOTIFY_DONE;
+
+ spin_lock_bh(&lapb->lock);
+
+ switch (event) {
+ case NETDEV_UP:
+ lapb_dbg(0, "(%p) Interface up: %s\n", dev, dev->name);
+
+ if (netif_carrier_ok(dev)) {
+ lapb_dbg(0, "(%p): Carrier is already up: %s\n", dev,
+ dev->name);
+ if (lapb->mode & LAPB_DCE) {
+ lapb_start_t1timer(lapb);
+ } else {
+ /* Only a link still in state 0 is (re)established. */
+ if (lapb->state == LAPB_STATE_0) {
+ lapb->state = LAPB_STATE_1;
+ lapb_establish_data_link(lapb);
+ }
+ }
+ }
+ break;
+ case NETDEV_GOING_DOWN:
+ /* The return value of the disconnect request is not used here. */
+ if (netif_carrier_ok(dev))
+ __lapb_disconnect_request(lapb);
+ break;
+ case NETDEV_DOWN:
+ lapb_dbg(0, "(%p) Interface down: %s\n", dev, dev->name);
+ lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state);
+ lapb_clear_queues(lapb);
+ lapb->state = LAPB_STATE_0;
+ lapb->n2count = 0;
+ lapb_stop_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ break;
+ case NETDEV_CHANGE:
+ /* Carrier state changed: same steps as NETDEV_UP / NETDEV_DOWN. */
+ if (netif_carrier_ok(dev)) {
+ lapb_dbg(0, "(%p): Carrier detected: %s\n", dev,
+ dev->name);
+ if (lapb->mode & LAPB_DCE) {
+ lapb_start_t1timer(lapb);
+ } else {
+ if (lapb->state == LAPB_STATE_0) {
+ lapb->state = LAPB_STATE_1;
+ lapb_establish_data_link(lapb);
+ }
+ }
+ } else {
+ lapb_dbg(0, "(%p) Carrier lost: %s\n", dev, dev->name);
+ lapb_dbg(0, "(%p) S%d -> S0\n", dev, lapb->state);
+ lapb_clear_queues(lapb);
+ lapb->state = LAPB_STATE_0;
+ lapb->n2count = 0;
+ lapb_stop_t1timer(lapb);
+ lapb_stop_t2timer(lapb);
+ }
+ break;
+ }
+
+ spin_unlock_bh(&lapb->lock);
+ lapb_put(lapb);
+ return NOTIFY_DONE;
+}
+
+/*
+ * Notifier block that routes netdevice events to lapb_device_event(). It
+ * is registered in lapb_init() and unregistered in lapb_exit().
+ */
+static struct notifier_block lapb_dev_notifier = {
+ .notifier_call = lapb_device_event,
+};
+
static int __init lapb_init(void)
{
- return 0;
+ return register_netdevice_notifier(&lapb_dev_notifier);
}
static void __exit lapb_exit(void)
{
WARN_ON(!list_empty(&lapb_list));
+
+ unregister_netdevice_notifier(&lapb_dev_notifier);
}
MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>");
diff --git a/net/lapb/lapb_in.c b/net/lapb/lapb_in.c
index d5d2110eb717..38ae23c09e83 100644
--- a/net/lapb/lapb_in.c
+++ b/net/lapb/lapb_in.c
@@ -1,14 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* LAPB release 002
*
* This code REQUIRES 2.1.15 or higher/ NET3.038
*
- * This module:
- * This module is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* History
* LAPB 001 Jonathan Naulor Started Coding
* LAPB 002 Jonathan Naylor New timer architecture.
diff --git a/net/lapb/lapb_out.c b/net/lapb/lapb_out.c
index eda726e22f64..a966d29c772d 100644
--- a/net/lapb/lapb_out.c
+++ b/net/lapb/lapb_out.c
@@ -1,14 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* LAPB release 002
*
* This code REQUIRES 2.1.15 or higher/ NET3.038
*
- * This module:
- * This module is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* History
* LAPB 001 Jonathan Naylor Started Coding
* LAPB 002 Jonathan Naylor New timer architecture.
@@ -87,7 +82,8 @@ void lapb_kick(struct lapb_cb *lapb)
skb = skb_dequeue(&lapb->write_queue);
do {
- if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) {
+ skbn = skb_copy(skb, GFP_ATOMIC);
+ if (!skbn) {
skb_queue_head(&lapb->write_queue, skb);
break;
}
diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c
index 75efde3e616c..592a22d86a97 100644
--- a/net/lapb/lapb_subr.c
+++ b/net/lapb/lapb_subr.c
@@ -1,14 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* LAPB release 002
*
* This code REQUIRES 2.1.15 or higher/ NET3.038
*
- * This module:
- * This module is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* History
* LAPB 001 Jonathan Naylor Started Coding
*/
diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c
index 5d4ae01951b5..9fde6cf20f10 100644
--- a/net/lapb/lapb_timer.c
+++ b/net/lapb/lapb_timer.c
@@ -1,14 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* LAPB release 002
*
* This code REQUIRES 2.1.15 or higher/ NET3.038
*
- * This module:
- * This module is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
* History
* LAPB 001 Jonathan Naylor Started Coding
* LAPB 002 Jonathan Naylor New timer architecture.
@@ -40,61 +35,88 @@ static void lapb_t2timer_expiry(struct timer_list *);
void lapb_start_t1timer(struct lapb_cb *lapb)
{
- del_timer(&lapb->t1timer);
+ timer_delete(&lapb->t1timer);
lapb->t1timer.function = lapb_t1timer_expiry;
lapb->t1timer.expires = jiffies + lapb->t1;
+ lapb->t1timer_running = true;
add_timer(&lapb->t1timer);
}
void lapb_start_t2timer(struct lapb_cb *lapb)
{
- del_timer(&lapb->t2timer);
+ timer_delete(&lapb->t2timer);
lapb->t2timer.function = lapb_t2timer_expiry;
lapb->t2timer.expires = jiffies + lapb->t2;
+ lapb->t2timer_running = true;
add_timer(&lapb->t2timer);
}
void lapb_stop_t1timer(struct lapb_cb *lapb)
{
- del_timer(&lapb->t1timer);
+ lapb->t1timer_running = false;
+ timer_delete(&lapb->t1timer);
}
void lapb_stop_t2timer(struct lapb_cb *lapb)
{
- del_timer(&lapb->t2timer);
+ lapb->t2timer_running = false;
+ timer_delete(&lapb->t2timer);
}
int lapb_t1timer_running(struct lapb_cb *lapb)
{
- return timer_pending(&lapb->t1timer);
+ return lapb->t1timer_running;
}
static void lapb_t2timer_expiry(struct timer_list *t)
{
- struct lapb_cb *lapb = from_timer(lapb, t, t2timer);
+ struct lapb_cb *lapb = timer_container_of(lapb, t, t2timer);
+
+ spin_lock_bh(&lapb->lock);
+ if (timer_pending(&lapb->t2timer)) /* A new timer has been set up */
+ goto out;
+ if (!lapb->t2timer_running) /* The timer has been stopped */
+ goto out;
if (lapb->condition & LAPB_ACK_PENDING_CONDITION) {
lapb->condition &= ~LAPB_ACK_PENDING_CONDITION;
lapb_timeout_response(lapb);
}
+ lapb->t2timer_running = false;
+
+out:
+ spin_unlock_bh(&lapb->lock);
}
static void lapb_t1timer_expiry(struct timer_list *t)
{
- struct lapb_cb *lapb = from_timer(lapb, t, t1timer);
+ struct lapb_cb *lapb = timer_container_of(lapb, t, t1timer);
+
+ spin_lock_bh(&lapb->lock);
+ if (timer_pending(&lapb->t1timer)) /* A new timer has been set up */
+ goto out;
+ if (!lapb->t1timer_running) /* The timer has been stopped */
+ goto out;
switch (lapb->state) {
/*
- * If we are a DCE, keep going DM .. DM .. DM
+ * If we are a DCE, send DM up to N2 times, then switch to
+ * STATE_1 and send SABM(E).
*/
case LAPB_STATE_0:
- if (lapb->mode & LAPB_DCE)
+ if (lapb->mode & LAPB_DCE &&
+ lapb->n2count != lapb->n2) {
+ lapb->n2count++;
lapb_send_control(lapb, LAPB_DM, LAPB_POLLOFF, LAPB_RESPONSE);
+ } else {
+ lapb->state = LAPB_STATE_1;
+ lapb_establish_data_link(lapb);
+ }
break;
/*
@@ -106,7 +128,8 @@ static void lapb_t1timer_expiry(struct timer_list *t)
lapb->state = LAPB_STATE_0;
lapb_disconnect_indication(lapb, LAPB_TIMEDOUT);
lapb_dbg(0, "(%p) S1 -> S0\n", lapb->dev);
- return;
+ lapb->t1timer_running = false;
+ goto out;
} else {
lapb->n2count++;
if (lapb->mode & LAPB_EXTENDED) {
@@ -130,7 +153,8 @@ static void lapb_t1timer_expiry(struct timer_list *t)
lapb->state = LAPB_STATE_0;
lapb_disconnect_confirmation(lapb, LAPB_TIMEDOUT);
lapb_dbg(0, "(%p) S2 -> S0\n", lapb->dev);
- return;
+ lapb->t1timer_running = false;
+ goto out;
} else {
lapb->n2count++;
lapb_dbg(1, "(%p) S2 TX DISC(1)\n", lapb->dev);
@@ -148,7 +172,8 @@ static void lapb_t1timer_expiry(struct timer_list *t)
lapb_stop_t2timer(lapb);
lapb_disconnect_indication(lapb, LAPB_TIMEDOUT);
lapb_dbg(0, "(%p) S3 -> S0\n", lapb->dev);
- return;
+ lapb->t1timer_running = false;
+ goto out;
} else {
lapb->n2count++;
lapb_requeue_frames(lapb);
@@ -165,7 +190,8 @@ static void lapb_t1timer_expiry(struct timer_list *t)
lapb->state = LAPB_STATE_0;
lapb_disconnect_indication(lapb, LAPB_TIMEDOUT);
lapb_dbg(0, "(%p) S4 -> S0\n", lapb->dev);
- return;
+ lapb->t1timer_running = false;
+ goto out;
} else {
lapb->n2count++;
lapb_transmit_frmr(lapb);
@@ -174,4 +200,7 @@ static void lapb_t1timer_expiry(struct timer_list *t)
}
lapb_start_t1timer(lapb);
+
+out:
+ spin_unlock_bh(&lapb->lock);
}
diff --git a/net/llc/Kconfig b/net/llc/Kconfig
index 176a6c1521a5..7f79f5e134f9 100644
--- a/net/llc/Kconfig
+++ b/net/llc/Kconfig
@@ -1,6 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
config LLC
tristate
- depends on NET
config LLC2
tristate "ANSI/IEEE 802.2 LLC type 2 Support"
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 1beeea9549fa..59d593bb5d18 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -47,7 +47,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout);
#if 0
#define dprintk(args...) printk(KERN_DEBUG args)
#else
-#define dprintk(args...)
+#define dprintk(args...) do {} while (0)
#endif
/* Maybe we'll add some more in the future. */
@@ -98,8 +98,16 @@ static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr)
{
u8 rc = LLC_PDU_LEN_U;
- if (addr->sllc_test || addr->sllc_xid)
+ if (addr->sllc_test)
rc = LLC_PDU_LEN_U;
+ else if (addr->sllc_xid)
+ /* We need to expand the header to sizeof(struct llc_xid_info)
+ * since llc_pdu_init_as_xid_cmd() sets bytes 4, 5 and 6 of the LLC header
+ * as the XID PDU. In llc_ui_sendmsg() we reserve the header size and then
+ * fill all remaining space with user data. If we do not reserve these
+ * bytes, llc_pdu_init_as_xid_cmd() will overwrite user data.
+ */
+ rc = LLC_PDU_LEN_U_XID;
else if (sk->sk_type == SOCK_STREAM)
rc = LLC_PDU_LEN_I;
return rc;
@@ -113,22 +121,26 @@ static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr)
*
* Send data via reliable llc2 connection.
* Returns 0 upon success, non-zero if action did not succeed.
+ *
+ * This function always consumes a reference to the skb.
*/
static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock)
{
struct llc_sock* llc = llc_sk(sk);
- int rc = 0;
if (unlikely(llc_data_accept_state(llc->state) ||
llc->remote_busy_flag ||
llc->p_flag)) {
long timeout = sock_sndtimeo(sk, noblock);
+ int rc;
rc = llc_ui_wait_for_busy_core(sk, timeout);
+ if (rc) {
+ kfree_skb(skb);
+ return rc;
+ }
}
- if (unlikely(!rc))
- rc = llc_build_and_send_pkt(sk, skb);
- return rc;
+ return llc_build_and_send_pkt(sk, skb);
}
static void llc_ui_sk_init(struct socket *sock, struct sock *sk)
@@ -198,7 +210,7 @@ static int llc_ui_release(struct socket *sock)
dprintk("%s: closing local(%02X) remote(%02X)\n", __func__,
llc->laddr.lsap, llc->daddr.lsap);
if (!llc_send_disc(sk))
- llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo);
+ llc_ui_wait_for_disc(sk, READ_ONCE(sk->sk_rcvtimeo));
if (!sock_flag(sk, SOCK_ZAPPED)) {
struct llc_sap *sap = llc->sap;
@@ -212,9 +224,10 @@ static int llc_ui_release(struct socket *sock)
} else {
release_sock(sk);
}
- if (llc->dev)
- dev_put(llc->dev);
+ netdev_put(llc->dev, &llc->dev_tracker);
sock_put(sk);
+ sock_orphan(sk);
+ sock->sk = NULL;
llc_sk_free(sk);
out:
return 0;
@@ -264,21 +277,26 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr)
{
struct sock *sk = sock->sk;
struct llc_sock *llc = llc_sk(sk);
+ struct net_device *dev = NULL;
struct llc_sap *sap;
int rc = -EINVAL;
if (!sock_flag(sk, SOCK_ZAPPED))
goto out;
+ if (!addr->sllc_arphrd)
+ addr->sllc_arphrd = ARPHRD_ETHER;
+ if (addr->sllc_arphrd != ARPHRD_ETHER)
+ goto out;
rc = -ENODEV;
if (sk->sk_bound_dev_if) {
- llc->dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if);
- if (llc->dev && addr->sllc_arphrd != llc->dev->type) {
- dev_put(llc->dev);
- llc->dev = NULL;
+ dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if);
+ if (dev && addr->sllc_arphrd != dev->type) {
+ dev_put(dev);
+ dev = NULL;
}
} else
- llc->dev = dev_getfirstbyhwtype(&init_net, addr->sllc_arphrd);
- if (!llc->dev)
+ dev = dev_getfirstbyhwtype(&init_net, addr->sllc_arphrd);
+ if (!dev)
goto out;
rc = -EUSERS;
llc->laddr.lsap = llc_ui_autoport();
@@ -288,6 +306,12 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr)
sap = llc_sap_open(llc->laddr.lsap, NULL);
if (!sap)
goto out;
+
+ /* Note: We do not expect errors from this point. */
+ llc->dev = dev;
+ netdev_tracker_alloc(llc->dev, &llc->dev_tracker, GFP_KERNEL);
+ dev = NULL;
+
memcpy(llc->laddr.mac, llc->dev->dev_addr, IFHWADDRLEN);
memcpy(&llc->addr, addr, sizeof(llc->addr));
/* assign new connection to its SAP */
@@ -295,6 +319,7 @@ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr)
sock_reset_flag(sk, SOCK_ZAPPED);
rc = 0;
out:
+ dev_put(dev);
return rc;
}
@@ -312,47 +337,48 @@ out:
* otherwise all hell will break loose.
* Returns: 0 upon success, negative otherwise.
*/
-static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
+static int llc_ui_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addrlen)
{
struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr;
struct sock *sk = sock->sk;
struct llc_sock *llc = llc_sk(sk);
+ struct net_device *dev = NULL;
struct llc_sap *sap;
int rc = -EINVAL;
- dprintk("%s: binding %02X\n", __func__, addr->sllc_sap);
-
lock_sock(sk);
if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr)))
goto out;
rc = -EAFNOSUPPORT;
- if (unlikely(addr->sllc_family != AF_LLC))
+ if (!addr->sllc_arphrd)
+ addr->sllc_arphrd = ARPHRD_ETHER;
+ if (unlikely(addr->sllc_family != AF_LLC || addr->sllc_arphrd != ARPHRD_ETHER))
goto out;
+ dprintk("%s: binding %02X\n", __func__, addr->sllc_sap);
rc = -ENODEV;
rcu_read_lock();
if (sk->sk_bound_dev_if) {
- llc->dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if);
- if (llc->dev) {
- if (!addr->sllc_arphrd)
- addr->sllc_arphrd = llc->dev->type;
+ dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if);
+ if (dev) {
if (is_zero_ether_addr(addr->sllc_mac))
- memcpy(addr->sllc_mac, llc->dev->dev_addr,
+ memcpy(addr->sllc_mac, dev->dev_addr,
IFHWADDRLEN);
- if (addr->sllc_arphrd != llc->dev->type ||
+ if (addr->sllc_arphrd != dev->type ||
!ether_addr_equal(addr->sllc_mac,
- llc->dev->dev_addr)) {
+ dev->dev_addr)) {
rc = -EINVAL;
- llc->dev = NULL;
+ dev = NULL;
}
}
- } else
- llc->dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd,
+ } else {
+ dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd,
addr->sllc_mac);
- if (llc->dev)
- dev_hold(llc->dev);
+ }
+ dev_hold(dev);
rcu_read_unlock();
- if (!llc->dev)
+ if (!dev)
goto out;
+
if (!addr->sllc_sap) {
rc = -EUSERS;
addr->sllc_sap = llc_ui_autoport();
@@ -378,12 +404,18 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
memcpy(laddr.mac, addr->sllc_mac, IFHWADDRLEN);
laddr.lsap = addr->sllc_sap;
rc = -EADDRINUSE; /* mac + sap clash. */
- ask = llc_lookup_established(sap, &daddr, &laddr);
+ ask = llc_lookup_established(sap, &daddr, &laddr, &init_net);
if (ask) {
sock_put(ask);
goto out_put;
}
}
+
+ /* Note: We do not expect errors from this point. */
+ llc->dev = dev;
+ netdev_tracker_alloc(llc->dev, &llc->dev_tracker, GFP_KERNEL);
+ dev = NULL;
+
llc->laddr.lsap = addr->sllc_sap;
memcpy(llc->laddr.mac, addr->sllc_mac, IFHWADDRLEN);
memcpy(&llc->addr, addr, sizeof(llc->addr));
@@ -394,6 +426,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen)
out_put:
llc_sap_put(sap);
out:
+ dev_put(dev);
release_sock(sk);
return rc;
}
@@ -422,7 +455,7 @@ static int llc_ui_shutdown(struct socket *sock, int how)
goto out;
rc = llc_send_disc(sk);
if (!rc)
- rc = llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo);
+ rc = llc_ui_wait_for_disc(sk, READ_ONCE(sk->sk_rcvtimeo));
/* Wake up anyone sleeping in poll */
sk->sk_state_change(sk);
out:
@@ -444,7 +477,7 @@ out:
* This function will autobind if user did not previously call bind.
* Returns: 0 upon success, negative otherwise.
*/
-static int llc_ui_connect(struct socket *sock, struct sockaddr *uaddr,
+static int llc_ui_connect(struct socket *sock, struct sockaddr_unsized *uaddr,
int addrlen, int flags)
{
struct sock *sk = sock->sk;
@@ -552,7 +585,8 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout)
add_wait_queue(sk_sleep(sk), &wait);
while (1) {
- if (sk_wait_event(sk, &timeout, sk->sk_state == TCP_CLOSE, &wait))
+ if (sk_wait_event(sk, &timeout,
+ READ_ONCE(sk->sk_state) == TCP_CLOSE, &wait))
break;
rc = -ERESTARTSYS;
if (signal_pending(current))
@@ -572,7 +606,8 @@ static bool llc_ui_wait_for_conn(struct sock *sk, long timeout)
add_wait_queue(sk_sleep(sk), &wait);
while (1) {
- if (sk_wait_event(sk, &timeout, sk->sk_state != TCP_SYN_SENT, &wait))
+ if (sk_wait_event(sk, &timeout,
+ READ_ONCE(sk->sk_state) != TCP_SYN_SENT, &wait))
break;
if (signal_pending(current) || !timeout)
break;
@@ -591,7 +626,7 @@ static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout)
while (1) {
rc = 0;
if (sk_wait_event(sk, &timeout,
- (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN) ||
(!llc_data_accept_state(llc->state) &&
!llc->remote_busy_flag &&
!llc->p_flag), &wait))
@@ -653,14 +688,13 @@ static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb)
* llc_ui_accept - accept a new incoming connection.
* @sock: Socket which connections arrive on.
* @newsock: Socket to move incoming connection to.
- * @flags: User specified operational flags.
- * @kern: If the socket is kernel internal
+ * @arg: User specified arguments
*
* Accept a new incoming connection.
* Returns 0 upon success, negative otherwise.
*/
-static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags,
- bool kern)
+static int llc_ui_accept(struct socket *sock, struct socket *newsock,
+ struct proto_accept_arg *arg)
{
struct sock *sk = sock->sk, *newsk;
struct llc_sock *llc, *newllc;
@@ -678,7 +712,7 @@ static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags,
goto out;
/* wait for a connection to arrive. */
if (skb_queue_empty(&sk->sk_receive_queue)) {
- rc = llc_wait_data(sk, sk->sk_rcvtimeo);
+ rc = llc_wait_data(sk, READ_ONCE(sk->sk_rcvtimeo));
if (rc)
goto out;
}
@@ -702,7 +736,7 @@ static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags,
/* put original socket back into a clean listen state. */
sk->sk_state = TCP_LISTEN;
- sk->sk_ack_backlog--;
+ sk_acceptq_removed(sk);
dprintk("%s: ok success on %02X, client on %02X\n", __func__,
llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap);
frees:
@@ -730,7 +764,6 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
struct sk_buff *skb = NULL;
struct sock *sk = sock->sk;
struct llc_sock *llc = llc_sk(sk);
- unsigned long cpu_flags;
size_t copied = 0;
u32 peek_seq = 0;
u32 *seq, skb_len;
@@ -778,7 +811,7 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
}
/* Well, if we have backlog, try to process it now yet. */
- if (copied >= target && !sk->sk_backlog.tail)
+ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
break;
if (copied) {
@@ -854,16 +887,15 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (sk->sk_type != SOCK_STREAM)
goto copy_uaddr;
- if (!(flags & MSG_PEEK)) {
- spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);
- sk_eat_skb(sk, skb);
- spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);
- *seq = 0;
- }
-
/* Partial read */
if (used + offset < skb_len)
continue;
+
+ if (!(flags & MSG_PEEK)) {
+ skb_unlink(skb, &sk->sk_receive_queue);
+ kfree_skb(skb);
+ *seq = 0;
+ }
} while (len > 0);
out:
@@ -878,9 +910,8 @@ copy_uaddr:
llc_cmsg_rcv(msg, skb);
if (!(flags & MSG_PEEK)) {
- spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);
- sk_eat_skb(sk, skb);
- spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);
+ skb_unlink(skb, &sk->sk_receive_queue);
+ kfree_skb(skb);
*seq = 0;
}
@@ -898,24 +929,25 @@ copy_uaddr:
*/
static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
+ DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name);
struct sock *sk = sock->sk;
struct llc_sock *llc = llc_sk(sk);
- DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name);
int flags = msg->msg_flags;
int noblock = flags & MSG_DONTWAIT;
- struct sk_buff *skb;
+ int rc = -EINVAL, copied = 0, hdrlen, hh_len;
+ struct sk_buff *skb = NULL;
+ struct net_device *dev;
size_t size = 0;
- int rc = -EINVAL, copied = 0, hdrlen;
dprintk("%s: sending from %02X to %02X\n", __func__,
llc->laddr.lsap, llc->daddr.lsap);
lock_sock(sk);
if (addr) {
if (msg->msg_namelen < sizeof(*addr))
- goto release;
+ goto out;
} else {
if (llc_ui_addr_null(&llc->addr))
- goto release;
+ goto out;
addr = &llc->addr;
}
/* must bind connection to sap if user hasn't done it. */
@@ -923,53 +955,62 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
/* bind to sap with null dev, exclusive. */
rc = llc_ui_autobind(sock, addr);
if (rc)
- goto release;
+ goto out;
}
- hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr);
+ dev = llc->dev;
+ hh_len = LL_RESERVED_SPACE(dev);
+ hdrlen = llc_ui_header_len(sk, addr);
size = hdrlen + len;
- if (size > llc->dev->mtu)
- size = llc->dev->mtu;
+ size = min_t(size_t, size, READ_ONCE(dev->mtu));
copied = size - hdrlen;
rc = -EINVAL;
if (copied < 0)
- goto release;
+ goto out;
release_sock(sk);
- skb = sock_alloc_send_skb(sk, size, noblock, &rc);
+ skb = sock_alloc_send_skb(sk, hh_len + size, noblock, &rc);
lock_sock(sk);
if (!skb)
- goto release;
- skb->dev = llc->dev;
+ goto out;
+ if (sock_flag(sk, SOCK_ZAPPED) ||
+ llc->dev != dev ||
+ hdrlen != llc_ui_header_len(sk, addr) ||
+ hh_len != LL_RESERVED_SPACE(dev) ||
+ size > READ_ONCE(dev->mtu))
+ goto out;
+ skb->dev = dev;
skb->protocol = llc_proto_type(addr->sllc_arphrd);
- skb_reserve(skb, hdrlen);
+ skb_reserve(skb, hh_len + hdrlen);
rc = memcpy_from_msg(skb_put(skb, copied), msg, copied);
if (rc)
goto out;
if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) {
llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_mac,
addr->sllc_sap);
+ skb = NULL;
goto out;
}
if (addr->sllc_test) {
llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_mac,
addr->sllc_sap);
+ skb = NULL;
goto out;
}
if (addr->sllc_xid) {
llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_mac,
addr->sllc_sap);
+ skb = NULL;
goto out;
}
rc = -ENOPROTOOPT;
if (!(sk->sk_type == SOCK_STREAM && !addr->sllc_ua))
goto out;
rc = llc_ui_send_data(sk, skb, noblock);
+ skb = NULL;
out:
- if (rc) {
- kfree_skb(skb);
-release:
+ kfree_skb(skb);
+ if (rc)
dprintk("%s: failed sending from %02X to %02X: %d\n",
__func__, llc->laddr.lsap, llc->daddr.lsap, rc);
- }
release_sock(sk);
return rc ? : copied;
}
@@ -978,7 +1019,6 @@ release:
* llc_ui_getname - return the address info of a socket
* @sock: Socket to get address of.
* @uaddr: Address structure to return information.
- * @uaddrlen: Length of address structure.
* @peer: Does user want local or remote address information.
*
* Return the address information of a socket.
@@ -1048,7 +1088,7 @@ static int llc_ui_ioctl(struct socket *sock, unsigned int cmd,
* Set various connection specific parameters.
*/
static int llc_ui_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, unsigned int optlen)
+ sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct llc_sock *llc = llc_sk(sk);
@@ -1058,7 +1098,7 @@ static int llc_ui_setsockopt(struct socket *sock, int level, int optname,
lock_sock(sk);
if (unlikely(level != SOL_LLC || optlen != sizeof(int)))
goto out;
- rc = get_user(opt, (int __user *)optval);
+ rc = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen);
if (rc)
goto out;
rc = -EINVAL;
@@ -1201,7 +1241,6 @@ static const struct proto_ops llc_ui_ops = {
.sendmsg = llc_ui_sendmsg,
.recvmsg = llc_ui_recvmsg,
.mmap = sock_no_mmap,
- .sendpage = sock_no_sendpage,
};
static const char llc_proc_err_msg[] __initconst =
diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c
index 4d78375f9872..0779daa8aa8f 100644
--- a/net/llc/llc_c_ac.c
+++ b/net/llc/llc_c_ac.c
@@ -51,7 +51,7 @@ int llc_conn_ac_clear_remote_busy(struct sock *sk, struct sk_buff *skb)
struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
llc->remote_busy_flag = 0;
- del_timer(&llc->busy_state_timer.timer);
+ timer_delete(&llc->busy_state_timer.timer);
nr = LLC_I_GET_NR(pdu);
llc_conn_resend_i_pdu_as_cmd(sk, nr, 0);
}
@@ -191,7 +191,7 @@ int llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2(struct sock *sk,
struct llc_sock *llc = llc_sk(sk);
if (llc->data_flag == 2)
- del_timer(&llc->rej_sent_timer.timer);
+ timer_delete(&llc->rej_sent_timer.timer);
return 0;
}
@@ -372,6 +372,7 @@ int llc_conn_ac_send_i_cmd_p_set_1(struct sock *sk, struct sk_buff *skb)
llc_pdu_init_as_i_cmd(skb, 1, llc->vS, llc->vR);
rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
if (likely(!rc)) {
+ skb_get(skb);
llc_conn_send_pdu(sk, skb);
llc_conn_ac_inc_vs_by_1(sk, skb);
}
@@ -389,7 +390,8 @@ static int llc_conn_ac_send_i_cmd_p_set_0(struct sock *sk, struct sk_buff *skb)
llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR);
rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
if (likely(!rc)) {
- rc = llc_conn_send_pdu(sk, skb);
+ skb_get(skb);
+ llc_conn_send_pdu(sk, skb);
llc_conn_ac_inc_vs_by_1(sk, skb);
}
return rc;
@@ -406,6 +408,7 @@ int llc_conn_ac_send_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb)
llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR);
rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
if (likely(!rc)) {
+ skb_get(skb);
llc_conn_send_pdu(sk, skb);
llc_conn_ac_inc_vs_by_1(sk, skb);
}
@@ -778,7 +781,7 @@ int llc_conn_ac_send_sabme_cmd_p_set_x(struct sock *sk, struct sk_buff *skb)
if (nskb) {
struct llc_sap *sap = llc->sap;
- u8 *dmac = llc->daddr.mac;
+ const u8 *dmac = llc->daddr.mac;
if (llc->dev->flags & IFF_LOOPBACK)
dmac = llc->dev->dev_addr;
@@ -916,7 +919,8 @@ static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk,
llc_pdu_init_as_i_cmd(skb, llc->ack_pf, llc->vS, llc->vR);
rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac);
if (likely(!rc)) {
- rc = llc_conn_send_pdu(sk, skb);
+ skb_get(skb);
+ llc_conn_send_pdu(sk, skb);
llc_conn_ac_inc_vs_by_1(sk, skb);
}
return rc;
@@ -1107,9 +1111,9 @@ int llc_conn_ac_stop_other_timers(struct sock *sk, struct sk_buff *skb)
{
struct llc_sock *llc = llc_sk(sk);
- del_timer(&llc->rej_sent_timer.timer);
- del_timer(&llc->pf_cycle_timer.timer);
- del_timer(&llc->busy_state_timer.timer);
+ timer_delete(&llc->rej_sent_timer.timer);
+ timer_delete(&llc->pf_cycle_timer.timer);
+ timer_delete(&llc->busy_state_timer.timer);
llc->ack_must_be_send = 0;
llc->ack_pf = 0;
return 0;
@@ -1145,7 +1149,7 @@ int llc_conn_ac_start_ack_tmr_if_not_running(struct sock *sk,
int llc_conn_ac_stop_ack_timer(struct sock *sk, struct sk_buff *skb)
{
- del_timer(&llc_sk(sk)->ack_timer.timer);
+ timer_delete(&llc_sk(sk)->ack_timer.timer);
return 0;
}
@@ -1153,14 +1157,14 @@ int llc_conn_ac_stop_p_timer(struct sock *sk, struct sk_buff *skb)
{
struct llc_sock *llc = llc_sk(sk);
- del_timer(&llc->pf_cycle_timer.timer);
+ timer_delete(&llc->pf_cycle_timer.timer);
llc_conn_set_p_flag(sk, 0);
return 0;
}
int llc_conn_ac_stop_rej_timer(struct sock *sk, struct sk_buff *skb)
{
- del_timer(&llc_sk(sk)->rej_sent_timer.timer);
+ timer_delete(&llc_sk(sk)->rej_sent_timer.timer);
return 0;
}
@@ -1176,7 +1180,7 @@ int llc_conn_ac_upd_nr_received(struct sock *sk, struct sk_buff *skb)
/* On loopback we don't queue I frames in unack_pdu_q queue. */
if (acked > 0 || (llc->dev->flags & IFF_LOOPBACK)) {
llc->retry_count = 0;
- del_timer(&llc->ack_timer.timer);
+ timer_delete(&llc->ack_timer.timer);
if (llc->failed_data_req) {
/* already, we did not accept data from upper layer
* (tx_window full or unacceptable state). Now, we
@@ -1331,28 +1335,31 @@ static void llc_conn_tmr_common_cb(struct sock *sk, u8 type)
void llc_conn_pf_cycle_tmr_cb(struct timer_list *t)
{
- struct llc_sock *llc = from_timer(llc, t, pf_cycle_timer.timer);
+ struct llc_sock *llc = timer_container_of(llc, t,
+ pf_cycle_timer.timer);
llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_P_TMR);
}
void llc_conn_busy_tmr_cb(struct timer_list *t)
{
- struct llc_sock *llc = from_timer(llc, t, busy_state_timer.timer);
+ struct llc_sock *llc = timer_container_of(llc, t,
+ busy_state_timer.timer);
llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_BUSY_TMR);
}
void llc_conn_ack_tmr_cb(struct timer_list *t)
{
- struct llc_sock *llc = from_timer(llc, t, ack_timer.timer);
+ struct llc_sock *llc = timer_container_of(llc, t, ack_timer.timer);
llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_ACK_TMR);
}
void llc_conn_rej_tmr_cb(struct timer_list *t)
{
- struct llc_sock *llc = from_timer(llc, t, rej_sent_timer.timer);
+ struct llc_sock *llc = timer_container_of(llc, t,
+ rej_sent_timer.timer);
llc_conn_tmr_common_cb(&llc->sk, LLC_CONN_EV_TYPE_REJ_TMR);
}
diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c
index 523fdd1cf781..d6627a80cb45 100644
--- a/net/llc/llc_c_ev.c
+++ b/net/llc/llc_c_ev.c
@@ -608,7 +608,7 @@ int llc_conn_ev_qlfy_p_flag_eq_1(struct sock *sk, struct sk_buff *skb)
}
/**
- * conn_ev_qlfy_last_frame_eq_1 - checks if frame is last in tx window
+ * llc_conn_ev_qlfy_last_frame_eq_1 - checks if frame is last in tx window
* @sk: current connection structure.
* @skb: current event.
*
@@ -624,7 +624,7 @@ int llc_conn_ev_qlfy_last_frame_eq_1(struct sock *sk, struct sk_buff *skb)
}
/**
- * conn_ev_qlfy_last_frame_eq_0 - checks if frame isn't last in tx window
+ * llc_conn_ev_qlfy_last_frame_eq_0 - checks if frame isn't last in tx window
* @sk: current connection structure.
* @skb: current event.
*
diff --git a/net/llc/llc_c_st.c b/net/llc/llc_c_st.c
index 2467573b5f84..1c267db304df 100644
--- a/net/llc/llc_c_st.c
+++ b/net/llc/llc_c_st.c
@@ -42,7 +42,7 @@ static const llc_conn_action_t llc_common_actions_1[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_1 = {
+static const struct llc_conn_state_trans llc_common_state_trans_1 = {
.ev = llc_conn_ev_disc_req,
.next_state = LLC_CONN_STATE_D_CONN,
.ev_qualifiers = NONE,
@@ -59,7 +59,7 @@ static const llc_conn_action_t llc_common_actions_2[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_2 = {
+static const struct llc_conn_state_trans llc_common_state_trans_2 = {
.ev = llc_conn_ev_rst_req,
.next_state = LLC_CONN_STATE_RESET,
.ev_qualifiers = NONE,
@@ -79,7 +79,7 @@ static const llc_conn_action_t llc_common_actions_3[] = {
[8] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_3 = {
+static const struct llc_conn_state_trans llc_common_state_trans_3 = {
.ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -95,7 +95,7 @@ static const llc_conn_action_t llc_common_actions_4[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_4 = {
+static const struct llc_conn_state_trans llc_common_state_trans_4 = {
.ev = llc_conn_ev_rx_disc_cmd_pbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = NONE,
@@ -114,7 +114,7 @@ static const llc_conn_action_t llc_common_actions_5[] = {
[7] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_5 = {
+static const struct llc_conn_state_trans llc_common_state_trans_5 = {
.ev = llc_conn_ev_rx_frmr_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_RESET,
.ev_qualifiers = NONE,
@@ -129,7 +129,7 @@ static const llc_conn_action_t llc_common_actions_6[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_6 = {
+static const struct llc_conn_state_trans llc_common_state_trans_6 = {
.ev = llc_conn_ev_rx_dm_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = NONE,
@@ -145,7 +145,7 @@ static const llc_conn_action_t llc_common_actions_7a[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_7a = {
+static const struct llc_conn_state_trans llc_common_state_trans_7a = {
.ev = llc_conn_ev_rx_zzz_cmd_pbit_set_x_inval_nr,
.next_state = LLC_CONN_STATE_ERROR,
.ev_qualifiers = NONE,
@@ -161,7 +161,7 @@ static const llc_conn_action_t llc_common_actions_7b[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_7b = {
+static const struct llc_conn_state_trans llc_common_state_trans_7b = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_x_inval_ns,
.next_state = LLC_CONN_STATE_ERROR,
.ev_qualifiers = NONE,
@@ -177,7 +177,7 @@ static const llc_conn_action_t llc_common_actions_8a[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_8a = {
+static const struct llc_conn_state_trans llc_common_state_trans_8a = {
.ev = llc_conn_ev_rx_zzz_rsp_fbit_set_x_inval_nr,
.next_state = LLC_CONN_STATE_ERROR,
.ev_qualifiers = NONE,
@@ -193,7 +193,7 @@ static const llc_conn_action_t llc_common_actions_8b[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_8b = {
+static const struct llc_conn_state_trans llc_common_state_trans_8b = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_x_inval_ns,
.next_state = LLC_CONN_STATE_ERROR,
.ev_qualifiers = NONE,
@@ -209,7 +209,7 @@ static const llc_conn_action_t llc_common_actions_8c[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_8c = {
+static const struct llc_conn_state_trans llc_common_state_trans_8c = {
.ev = llc_conn_ev_rx_bad_pdu,
.next_state = LLC_CONN_STATE_ERROR,
.ev_qualifiers = NONE,
@@ -225,7 +225,7 @@ static const llc_conn_action_t llc_common_actions_9[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_9 = {
+static const struct llc_conn_state_trans llc_common_state_trans_9 = {
.ev = llc_conn_ev_rx_ua_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_ERROR,
.ev_qualifiers = NONE,
@@ -247,7 +247,7 @@ static const llc_conn_action_t llc_common_actions_10[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_10 = {
+static const struct llc_conn_state_trans llc_common_state_trans_10 = {
.ev = llc_conn_ev_rx_xxx_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_ERROR,
.ev_qualifiers = llc_common_ev_qfyrs_10,
@@ -270,7 +270,7 @@ static const llc_conn_action_t llc_common_actions_11a[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_11a = {
+static const struct llc_conn_state_trans llc_common_state_trans_11a = {
.ev = llc_conn_ev_p_tmr_exp,
.next_state = LLC_CONN_STATE_RESET,
.ev_qualifiers = llc_common_ev_qfyrs_11a,
@@ -292,7 +292,7 @@ static const llc_conn_action_t llc_common_actions_11b[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_11b = {
+static const struct llc_conn_state_trans llc_common_state_trans_11b = {
.ev = llc_conn_ev_ack_tmr_exp,
.next_state = LLC_CONN_STATE_RESET,
.ev_qualifiers = llc_common_ev_qfyrs_11b,
@@ -314,7 +314,7 @@ static const llc_conn_action_t llc_common_actions_11c[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_11c = {
+static const struct llc_conn_state_trans llc_common_state_trans_11c = {
.ev = llc_conn_ev_rej_tmr_exp,
.next_state = LLC_CONN_STATE_RESET,
.ev_qualifiers = llc_common_ev_qfyrs_11c,
@@ -336,7 +336,7 @@ static const llc_conn_action_t llc_common_actions_11d[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_common_state_trans_11d = {
+static const struct llc_conn_state_trans llc_common_state_trans_11d = {
.ev = llc_conn_ev_busy_tmr_exp,
.next_state = LLC_CONN_STATE_RESET,
.ev_qualifiers = llc_common_ev_qfyrs_11d,
@@ -347,7 +347,7 @@ static struct llc_conn_state_trans llc_common_state_trans_11d = {
* Common dummy state transition; must be last entry for all state
* transition groups - it'll be on .bss, so will be zeroed.
*/
-static struct llc_conn_state_trans llc_common_state_trans_end;
+static const struct llc_conn_state_trans llc_common_state_trans_end;
/* LLC_CONN_STATE_ADM transitions */
/* State transitions for LLC_CONN_EV_CONN_REQ event */
@@ -359,7 +359,7 @@ static const llc_conn_action_t llc_adm_actions_1[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_adm_state_trans_1 = {
+static const struct llc_conn_state_trans llc_adm_state_trans_1 = {
.ev = llc_conn_ev_conn_req,
.next_state = LLC_CONN_STATE_SETUP,
.ev_qualifiers = NONE,
@@ -378,7 +378,7 @@ static const llc_conn_action_t llc_adm_actions_2[] = {
[7] = NULL,
};
-static struct llc_conn_state_trans llc_adm_state_trans_2 = {
+static const struct llc_conn_state_trans llc_adm_state_trans_2 = {
.ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -392,7 +392,7 @@ static const llc_conn_action_t llc_adm_actions_3[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_adm_state_trans_3 = {
+static const struct llc_conn_state_trans llc_adm_state_trans_3 = {
.ev = llc_conn_ev_rx_disc_cmd_pbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = NONE,
@@ -406,7 +406,7 @@ static const llc_conn_action_t llc_adm_actions_4[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_adm_state_trans_4 = {
+static const struct llc_conn_state_trans llc_adm_state_trans_4 = {
.ev = llc_conn_ev_rx_xxx_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = NONE,
@@ -419,7 +419,7 @@ static const llc_conn_action_t llc_adm_actions_5[] = {
[1] = NULL,
};
-static struct llc_conn_state_trans llc_adm_state_trans_5 = {
+static const struct llc_conn_state_trans llc_adm_state_trans_5 = {
.ev = llc_conn_ev_rx_any_frame,
.next_state = LLC_CONN_OUT_OF_SVC,
.ev_qualifiers = NONE,
@@ -430,7 +430,7 @@ static struct llc_conn_state_trans llc_adm_state_trans_5 = {
* Array of pointers;
* one to each transition
*/
-static struct llc_conn_state_trans *llc_adm_state_transitions[] = {
+static const struct llc_conn_state_trans *llc_adm_state_transitions[] = {
[0] = &llc_adm_state_trans_1, /* Request */
[1] = &llc_common_state_trans_end,
[2] = &llc_common_state_trans_end, /* local_busy */
@@ -453,7 +453,7 @@ static const llc_conn_action_t llc_setup_actions_1[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_setup_state_trans_1 = {
+static const struct llc_conn_state_trans llc_setup_state_trans_1 = {
.ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
.next_state = LLC_CONN_STATE_SETUP,
.ev_qualifiers = NONE,
@@ -477,7 +477,7 @@ static const llc_conn_action_t llc_setup_actions_2[] = {
[6] = NULL,
};
-static struct llc_conn_state_trans llc_setup_state_trans_2 = {
+static const struct llc_conn_state_trans llc_setup_state_trans_2 = {
.ev = llc_conn_ev_rx_ua_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_setup_ev_qfyrs_2,
@@ -498,7 +498,7 @@ static const llc_conn_action_t llc_setup_actions_3[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_setup_state_trans_3 = {
+static const struct llc_conn_state_trans llc_setup_state_trans_3 = {
.ev = llc_conn_ev_ack_tmr_exp,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_setup_ev_qfyrs_3,
@@ -519,7 +519,7 @@ static const llc_conn_action_t llc_setup_actions_4[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_setup_state_trans_4 = {
+static const struct llc_conn_state_trans llc_setup_state_trans_4 = {
.ev = llc_conn_ev_rx_disc_cmd_pbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_setup_ev_qfyrs_4,
@@ -539,7 +539,7 @@ static const llc_conn_action_t llc_setup_actions_5[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_setup_state_trans_5 = {
+static const struct llc_conn_state_trans llc_setup_state_trans_5 = {
.ev = llc_conn_ev_rx_dm_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_setup_ev_qfyrs_5,
@@ -560,7 +560,7 @@ static const llc_conn_action_t llc_setup_actions_7[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_setup_state_trans_7 = {
+static const struct llc_conn_state_trans llc_setup_state_trans_7 = {
.ev = llc_conn_ev_ack_tmr_exp,
.next_state = LLC_CONN_STATE_SETUP,
.ev_qualifiers = llc_setup_ev_qfyrs_7,
@@ -581,7 +581,7 @@ static const llc_conn_action_t llc_setup_actions_8[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_setup_state_trans_8 = {
+static const struct llc_conn_state_trans llc_setup_state_trans_8 = {
.ev = llc_conn_ev_ack_tmr_exp,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_setup_ev_qfyrs_8,
@@ -592,7 +592,7 @@ static struct llc_conn_state_trans llc_setup_state_trans_8 = {
* Array of pointers;
* one to each transition
*/
-static struct llc_conn_state_trans *llc_setup_state_transitions[] = {
+static const struct llc_conn_state_trans *llc_setup_state_transitions[] = {
[0] = &llc_common_state_trans_end, /* Request */
[1] = &llc_common_state_trans_end, /* local busy */
[2] = &llc_common_state_trans_end, /* init_pf_cycle */
@@ -622,7 +622,7 @@ static const llc_conn_action_t llc_normal_actions_1[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_1 = {
+static const struct llc_conn_state_trans llc_normal_state_trans_1 = {
.ev = llc_conn_ev_data_req,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_normal_ev_qfyrs_1,
@@ -643,7 +643,7 @@ static const llc_conn_action_t llc_normal_actions_2[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_2 = {
+static const struct llc_conn_state_trans llc_normal_state_trans_2 = {
.ev = llc_conn_ev_data_req,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_normal_ev_qfyrs_2,
@@ -660,7 +660,7 @@ static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_2_1[] = {
/* just one member, NULL, .bss zeroes it */
static const llc_conn_action_t llc_normal_actions_2_1[1];
-static struct llc_conn_state_trans llc_normal_state_trans_2_1 = {
+static const struct llc_conn_state_trans llc_normal_state_trans_2_1 = {
.ev = llc_conn_ev_data_req,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_normal_ev_qfyrs_2_1,
@@ -680,7 +680,7 @@ static const llc_conn_action_t llc_normal_actions_3[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_3 = {
+static const struct llc_conn_state_trans llc_normal_state_trans_3 = {
.ev = llc_conn_ev_local_busy_detected,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_normal_ev_qfyrs_3,
@@ -700,7 +700,7 @@ static const llc_conn_action_t llc_normal_actions_4[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_4 = {
+static const struct llc_conn_state_trans llc_normal_state_trans_4 = {
.ev = llc_conn_ev_local_busy_detected,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_normal_ev_qfyrs_4,
@@ -723,7 +723,7 @@ static const llc_conn_action_t llc_normal_actions_5a[] = {
[6] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_5a = {
+static const struct llc_conn_state_trans llc_normal_state_trans_5a = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_normal_ev_qfyrs_5a,
@@ -746,7 +746,7 @@ static const llc_conn_action_t llc_normal_actions_5b[] = {
[6] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_5b = {
+static const struct llc_conn_state_trans llc_normal_state_trans_5b = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_normal_ev_qfyrs_5b,
@@ -769,7 +769,7 @@ static const llc_conn_action_t llc_normal_actions_5c[] = {
[6] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_5c = {
+static const struct llc_conn_state_trans llc_normal_state_trans_5c = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_normal_ev_qfyrs_5c,
@@ -790,7 +790,7 @@ static const llc_conn_action_t llc_normal_actions_6a[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_6a = {
+static const struct llc_conn_state_trans llc_normal_state_trans_6a = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_normal_ev_qfyrs_6a,
@@ -811,7 +811,7 @@ static const llc_conn_action_t llc_normal_actions_6b[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_6b = {
+static const struct llc_conn_state_trans llc_normal_state_trans_6b = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_normal_ev_qfyrs_6b,
@@ -827,7 +827,7 @@ static const llc_conn_action_t llc_normal_actions_7[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_7 = {
+static const struct llc_conn_state_trans llc_normal_state_trans_7 = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = NONE,
@@ -850,7 +850,7 @@ static const llc_conn_action_t llc_normal_actions_8[] = {
[6] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_8a = {
+static const struct llc_conn_state_trans llc_normal_state_trans_8a = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_normal_ev_qfyrs_8a,
@@ -863,7 +863,7 @@ static const llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_8b[] = {
[1] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_8b = {
+static const struct llc_conn_state_trans llc_normal_state_trans_8b = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_normal_ev_qfyrs_8b,
@@ -884,7 +884,7 @@ static const llc_conn_action_t llc_normal_actions_9a[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_9a = {
+static const struct llc_conn_state_trans llc_normal_state_trans_9a = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_normal_ev_qfyrs_9a,
@@ -905,7 +905,7 @@ static const llc_conn_action_t llc_normal_actions_9b[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_9b = {
+static const struct llc_conn_state_trans llc_normal_state_trans_9b = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_normal_ev_qfyrs_9b,
@@ -922,7 +922,7 @@ static const llc_conn_action_t llc_normal_actions_10[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_10 = {
+static const struct llc_conn_state_trans llc_normal_state_trans_10 = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -937,7 +937,7 @@ static const llc_conn_action_t llc_normal_actions_11a[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_11a = {
+static const struct llc_conn_state_trans llc_normal_state_trans_11a = {
.ev = llc_conn_ev_rx_rr_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -952,7 +952,7 @@ static const llc_conn_action_t llc_normal_actions_11b[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_11b = {
+static const struct llc_conn_state_trans llc_normal_state_trans_11b = {
.ev = llc_conn_ev_rx_rr_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -973,7 +973,7 @@ static const llc_conn_action_t llc_normal_actions_11c[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_11c = {
+static const struct llc_conn_state_trans llc_normal_state_trans_11c = {
.ev = llc_conn_ev_rx_rr_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_normal_ev_qfyrs_11c,
@@ -990,7 +990,7 @@ static const llc_conn_action_t llc_normal_actions_12[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_12 = {
+static const struct llc_conn_state_trans llc_normal_state_trans_12 = {
.ev = llc_conn_ev_rx_rr_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -1005,7 +1005,7 @@ static const llc_conn_action_t llc_normal_actions_13a[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_13a = {
+static const struct llc_conn_state_trans llc_normal_state_trans_13a = {
.ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -1020,7 +1020,7 @@ static const llc_conn_action_t llc_normal_actions_13b[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_13b = {
+static const struct llc_conn_state_trans llc_normal_state_trans_13b = {
.ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -1040,7 +1040,7 @@ static const llc_conn_action_t llc_normal_actions_13c[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_13c = {
+static const struct llc_conn_state_trans llc_normal_state_trans_13c = {
.ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_normal_ev_qfyrs_13c,
@@ -1057,7 +1057,7 @@ static const llc_conn_action_t llc_normal_actions_14[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_14 = {
+static const struct llc_conn_state_trans llc_normal_state_trans_14 = {
.ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -1080,7 +1080,7 @@ static const llc_conn_action_t llc_normal_actions_15a[] = {
[6] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_15a = {
+static const struct llc_conn_state_trans llc_normal_state_trans_15a = {
.ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_normal_ev_qfyrs_15a,
@@ -1103,7 +1103,7 @@ static const llc_conn_action_t llc_normal_actions_15b[] = {
[6] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_15b = {
+static const struct llc_conn_state_trans llc_normal_state_trans_15b = {
.ev = llc_conn_ev_rx_rej_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_normal_ev_qfyrs_15b,
@@ -1125,7 +1125,7 @@ static const llc_conn_action_t llc_normal_actions_16a[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_16a = {
+static const struct llc_conn_state_trans llc_normal_state_trans_16a = {
.ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_normal_ev_qfyrs_16a,
@@ -1147,7 +1147,7 @@ static const llc_conn_action_t llc_normal_actions_16b[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_16b = {
+static const struct llc_conn_state_trans llc_normal_state_trans_16b = {
.ev = llc_conn_ev_rx_rej_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_normal_ev_qfyrs_16b,
@@ -1164,7 +1164,7 @@ static const llc_conn_action_t llc_normal_actions_17[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_17 = {
+static const struct llc_conn_state_trans llc_normal_state_trans_17 = {
.ev = llc_conn_ev_rx_rej_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -1183,7 +1183,7 @@ static const llc_conn_action_t llc_normal_actions_18[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_18 = {
+static const struct llc_conn_state_trans llc_normal_state_trans_18 = {
.ev = llc_conn_ev_init_p_f_cycle,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_normal_ev_qfyrs_18,
@@ -1205,7 +1205,7 @@ static const llc_conn_action_t llc_normal_actions_19[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_19 = {
+static const struct llc_conn_state_trans llc_normal_state_trans_19 = {
.ev = llc_conn_ev_p_tmr_exp,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = llc_normal_ev_qfyrs_19,
@@ -1228,7 +1228,7 @@ static const llc_conn_action_t llc_normal_actions_20a[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_20a = {
+static const struct llc_conn_state_trans llc_normal_state_trans_20a = {
.ev = llc_conn_ev_ack_tmr_exp,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = llc_normal_ev_qfyrs_20a,
@@ -1251,7 +1251,7 @@ static const llc_conn_action_t llc_normal_actions_20b[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_20b = {
+static const struct llc_conn_state_trans llc_normal_state_trans_20b = {
.ev = llc_conn_ev_busy_tmr_exp,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = llc_normal_ev_qfyrs_20b,
@@ -1270,7 +1270,7 @@ static const llc_conn_action_t llc_normal_actions_21[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_normal_state_trans_21 = {
+static const struct llc_conn_state_trans llc_normal_state_trans_21 = {
.ev = llc_conn_ev_tx_buffer_full,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_normal_ev_qfyrs_21,
@@ -1281,7 +1281,7 @@ static struct llc_conn_state_trans llc_normal_state_trans_21 = {
* Array of pointers;
* one to each transition
*/
-static struct llc_conn_state_trans *llc_normal_state_transitions[] = {
+static const struct llc_conn_state_trans *llc_normal_state_transitions[] = {
[0] = &llc_normal_state_trans_1, /* Requests */
[1] = &llc_normal_state_trans_2,
[2] = &llc_normal_state_trans_2_1,
@@ -1354,7 +1354,7 @@ static const llc_conn_action_t llc_busy_actions_1[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_1 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_1 = {
.ev = llc_conn_ev_data_req,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_1,
@@ -1374,7 +1374,7 @@ static const llc_conn_action_t llc_busy_actions_2[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_2 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_2 = {
.ev = llc_conn_ev_data_req,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_2,
@@ -1391,7 +1391,7 @@ static const llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_2_1[] = {
/* just one member, NULL, .bss zeroes it */
static const llc_conn_action_t llc_busy_actions_2_1[1];
-static struct llc_conn_state_trans llc_busy_state_trans_2_1 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_2_1 = {
.ev = llc_conn_ev_data_req,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_2_1,
@@ -1411,7 +1411,7 @@ static const llc_conn_action_t llc_busy_actions_3[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_3 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_3 = {
.ev = llc_conn_ev_local_busy_cleared,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_busy_ev_qfyrs_3,
@@ -1431,7 +1431,7 @@ static const llc_conn_action_t llc_busy_actions_4[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_4 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_4 = {
.ev = llc_conn_ev_local_busy_cleared,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_busy_ev_qfyrs_4,
@@ -1450,7 +1450,7 @@ static const llc_conn_action_t llc_busy_actions_5[] = {
[1] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_5 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_5 = {
.ev = llc_conn_ev_local_busy_cleared,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_busy_ev_qfyrs_5,
@@ -1469,7 +1469,7 @@ static const llc_conn_action_t llc_busy_actions_6[] = {
[1] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_6 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_6 = {
.ev = llc_conn_ev_local_busy_cleared,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_busy_ev_qfyrs_6,
@@ -1488,7 +1488,7 @@ static const llc_conn_action_t llc_busy_actions_7[] = {
[1] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_7 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_7 = {
.ev = llc_conn_ev_local_busy_cleared,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_busy_ev_qfyrs_7,
@@ -1507,7 +1507,7 @@ static const llc_conn_action_t llc_busy_actions_8[] = {
[1] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_8 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_8 = {
.ev = llc_conn_ev_local_busy_cleared,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_busy_ev_qfyrs_8,
@@ -1529,7 +1529,7 @@ static const llc_conn_action_t llc_busy_actions_9a[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_9a = {
+static const struct llc_conn_state_trans llc_busy_state_trans_9a = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_x_unexpd_ns,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_9a,
@@ -1551,7 +1551,7 @@ static const llc_conn_action_t llc_busy_actions_9b[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_9b = {
+static const struct llc_conn_state_trans llc_busy_state_trans_9b = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_9b,
@@ -1571,7 +1571,7 @@ static const llc_conn_action_t llc_busy_actions_10a[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_10a = {
+static const struct llc_conn_state_trans llc_busy_state_trans_10a = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_10a,
@@ -1591,7 +1591,7 @@ static const llc_conn_action_t llc_busy_actions_10b[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_10b = {
+static const struct llc_conn_state_trans llc_busy_state_trans_10b = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_10b,
@@ -1606,7 +1606,7 @@ static const llc_conn_action_t llc_busy_actions_11[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_11 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_11 = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = NONE,
@@ -1624,7 +1624,7 @@ static const llc_conn_action_t llc_busy_actions_12[] = {
[6] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_12 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_12 = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = NONE,
@@ -1649,7 +1649,7 @@ static const llc_conn_action_t llc_busy_actions_13a[] = {
[8] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_13a = {
+static const struct llc_conn_state_trans llc_busy_state_trans_13a = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_13a,
@@ -1674,7 +1674,7 @@ static const llc_conn_action_t llc_busy_actions_13b[] = {
[8] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_13b = {
+static const struct llc_conn_state_trans llc_busy_state_trans_13b = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_13b,
@@ -1697,7 +1697,7 @@ static const llc_conn_action_t llc_busy_actions_14a[] = {
[6] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_14a = {
+static const struct llc_conn_state_trans llc_busy_state_trans_14a = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_14a,
@@ -1720,7 +1720,7 @@ static const llc_conn_action_t llc_busy_actions_14b[] = {
[6] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_14b = {
+static const struct llc_conn_state_trans llc_busy_state_trans_14b = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_14b,
@@ -1735,7 +1735,7 @@ static const llc_conn_action_t llc_busy_actions_15a[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_15a = {
+static const struct llc_conn_state_trans llc_busy_state_trans_15a = {
.ev = llc_conn_ev_rx_rr_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = NONE,
@@ -1750,7 +1750,7 @@ static const llc_conn_action_t llc_busy_actions_15b[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_15b = {
+static const struct llc_conn_state_trans llc_busy_state_trans_15b = {
.ev = llc_conn_ev_rx_rr_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = NONE,
@@ -1770,7 +1770,7 @@ static const llc_conn_action_t llc_busy_actions_15c[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_15c = {
+static const struct llc_conn_state_trans llc_busy_state_trans_15c = {
.ev = llc_conn_ev_rx_rr_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_15c,
@@ -1785,7 +1785,7 @@ static const llc_conn_action_t llc_busy_actions_16[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_16 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_16 = {
.ev = llc_conn_ev_rx_rr_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = NONE,
@@ -1800,7 +1800,7 @@ static const llc_conn_action_t llc_busy_actions_17a[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_17a = {
+static const struct llc_conn_state_trans llc_busy_state_trans_17a = {
.ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = NONE,
@@ -1815,7 +1815,7 @@ static const llc_conn_action_t llc_busy_actions_17b[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_17b = {
+static const struct llc_conn_state_trans llc_busy_state_trans_17b = {
.ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = NONE,
@@ -1835,7 +1835,7 @@ static const llc_conn_action_t llc_busy_actions_17c[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_17c = {
+static const struct llc_conn_state_trans llc_busy_state_trans_17c = {
.ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_17c,
@@ -1850,7 +1850,7 @@ static const llc_conn_action_t llc_busy_actions_18[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_18 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_18 = {
.ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = NONE,
@@ -1872,7 +1872,7 @@ static const llc_conn_action_t llc_busy_actions_19a[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_19a = {
+static const struct llc_conn_state_trans llc_busy_state_trans_19a = {
.ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_19a,
@@ -1894,7 +1894,7 @@ static const llc_conn_action_t llc_busy_actions_19b[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_19b = {
+static const struct llc_conn_state_trans llc_busy_state_trans_19b = {
.ev = llc_conn_ev_rx_rej_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_19b,
@@ -1915,7 +1915,7 @@ static const llc_conn_action_t llc_busy_actions_20a[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_20a = {
+static const struct llc_conn_state_trans llc_busy_state_trans_20a = {
.ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_20a,
@@ -1936,7 +1936,7 @@ static const llc_conn_action_t llc_busy_actions_20b[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_20b = {
+static const struct llc_conn_state_trans llc_busy_state_trans_20b = {
.ev = llc_conn_ev_rx_rej_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_20b,
@@ -1953,7 +1953,7 @@ static const llc_conn_action_t llc_busy_actions_21[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_21 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_21 = {
.ev = llc_conn_ev_rx_rej_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = NONE,
@@ -1972,7 +1972,7 @@ static const llc_conn_action_t llc_busy_actions_22[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_22 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_22 = {
.ev = llc_conn_ev_init_p_f_cycle,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_22,
@@ -1993,7 +1993,7 @@ static const llc_conn_action_t llc_busy_actions_23[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_23 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_23 = {
.ev = llc_conn_ev_p_tmr_exp,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_23,
@@ -2015,7 +2015,7 @@ static const llc_conn_action_t llc_busy_actions_24a[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_24a = {
+static const struct llc_conn_state_trans llc_busy_state_trans_24a = {
.ev = llc_conn_ev_ack_tmr_exp,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_24a,
@@ -2037,7 +2037,7 @@ static const llc_conn_action_t llc_busy_actions_24b[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_24b = {
+static const struct llc_conn_state_trans llc_busy_state_trans_24b = {
.ev = llc_conn_ev_busy_tmr_exp,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_24b,
@@ -2060,7 +2060,7 @@ static const llc_conn_action_t llc_busy_actions_25[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_25 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_25 = {
.ev = llc_conn_ev_rej_tmr_exp,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_25,
@@ -2079,7 +2079,7 @@ static const llc_conn_action_t llc_busy_actions_26[] = {
[1] = NULL,
};
-static struct llc_conn_state_trans llc_busy_state_trans_26 = {
+static const struct llc_conn_state_trans llc_busy_state_trans_26 = {
.ev = llc_conn_ev_rej_tmr_exp,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_busy_ev_qfyrs_26,
@@ -2090,7 +2090,7 @@ static struct llc_conn_state_trans llc_busy_state_trans_26 = {
* Array of pointers;
* one to each transition
*/
-static struct llc_conn_state_trans *llc_busy_state_transitions[] = {
+static const struct llc_conn_state_trans *llc_busy_state_transitions[] = {
[0] = &llc_common_state_trans_1, /* Request */
[1] = &llc_common_state_trans_2,
[2] = &llc_busy_state_trans_1,
@@ -2166,7 +2166,7 @@ static const llc_conn_action_t llc_reject_actions_1[] = {
[1] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_1 = {
+static const struct llc_conn_state_trans llc_reject_state_trans_1 = {
.ev = llc_conn_ev_data_req,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_reject_ev_qfyrs_1,
@@ -2185,7 +2185,7 @@ static const llc_conn_action_t llc_reject_actions_2[] = {
[1] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_2 = {
+static const struct llc_conn_state_trans llc_reject_state_trans_2 = {
.ev = llc_conn_ev_data_req,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_reject_ev_qfyrs_2,
@@ -2202,7 +2202,7 @@ static const llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_2_1[] = {
/* just one member, NULL, .bss zeroes it */
static const llc_conn_action_t llc_reject_actions_2_1[1];
-static struct llc_conn_state_trans llc_reject_state_trans_2_1 = {
+static const struct llc_conn_state_trans llc_reject_state_trans_2_1 = {
.ev = llc_conn_ev_data_req,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_reject_ev_qfyrs_2_1,
@@ -2222,7 +2222,7 @@ static const llc_conn_action_t llc_reject_actions_3[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_3 = {
+static const struct llc_conn_state_trans llc_reject_state_trans_3 = {
.ev = llc_conn_ev_local_busy_detected,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_reject_ev_qfyrs_3,
@@ -2241,7 +2241,7 @@ static const llc_conn_action_t llc_reject_actions_4[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_4 = {
+static const struct llc_conn_state_trans llc_reject_state_trans_4 = {
.ev = llc_conn_ev_local_busy_detected,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = llc_reject_ev_qfyrs_4,
@@ -2256,7 +2256,7 @@ static const llc_conn_action_t llc_reject_actions_5a[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_5a = {
+static const struct llc_conn_state_trans llc_reject_state_trans_5a = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = NONE,
@@ -2271,7 +2271,7 @@ static const llc_conn_action_t llc_reject_actions_5b[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_5b = {
+static const struct llc_conn_state_trans llc_reject_state_trans_5b = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = NONE,
@@ -2291,7 +2291,7 @@ static const llc_conn_action_t llc_reject_actions_5c[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_5c = {
+static const struct llc_conn_state_trans llc_reject_state_trans_5c = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_reject_ev_qfyrs_5c,
@@ -2305,7 +2305,7 @@ static const llc_conn_action_t llc_reject_actions_6[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_6 = {
+static const struct llc_conn_state_trans llc_reject_state_trans_6 = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = NONE,
@@ -2330,7 +2330,7 @@ static const llc_conn_action_t llc_reject_actions_7a[] = {
};
-static struct llc_conn_state_trans llc_reject_state_trans_7a = {
+static const struct llc_conn_state_trans llc_reject_state_trans_7a = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_reject_ev_qfyrs_7a,
@@ -2354,7 +2354,7 @@ static const llc_conn_action_t llc_reject_actions_7b[] = {
[7] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_7b = {
+static const struct llc_conn_state_trans llc_reject_state_trans_7b = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_reject_ev_qfyrs_7b,
@@ -2376,7 +2376,7 @@ static const llc_conn_action_t llc_reject_actions_8a[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_8a = {
+static const struct llc_conn_state_trans llc_reject_state_trans_8a = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_reject_ev_qfyrs_8a,
@@ -2398,7 +2398,7 @@ static const llc_conn_action_t llc_reject_actions_8b[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_8b = {
+static const struct llc_conn_state_trans llc_reject_state_trans_8b = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_reject_ev_qfyrs_8b,
@@ -2415,7 +2415,7 @@ static const llc_conn_action_t llc_reject_actions_9[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_9 = {
+static const struct llc_conn_state_trans llc_reject_state_trans_9 = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -2430,7 +2430,7 @@ static const llc_conn_action_t llc_reject_actions_10a[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_10a = {
+static const struct llc_conn_state_trans llc_reject_state_trans_10a = {
.ev = llc_conn_ev_rx_rr_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = NONE,
@@ -2445,7 +2445,7 @@ static const llc_conn_action_t llc_reject_actions_10b[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_10b = {
+static const struct llc_conn_state_trans llc_reject_state_trans_10b = {
.ev = llc_conn_ev_rx_rr_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = NONE,
@@ -2465,7 +2465,7 @@ static const llc_conn_action_t llc_reject_actions_10c[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_10c = {
+static const struct llc_conn_state_trans llc_reject_state_trans_10c = {
.ev = llc_conn_ev_rx_rr_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_reject_ev_qfyrs_10c,
@@ -2480,7 +2480,7 @@ static const llc_conn_action_t llc_reject_actions_11[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_11 = {
+static const struct llc_conn_state_trans llc_reject_state_trans_11 = {
.ev = llc_conn_ev_rx_rr_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = NONE,
@@ -2495,7 +2495,7 @@ static const llc_conn_action_t llc_reject_actions_12a[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_12a = {
+static const struct llc_conn_state_trans llc_reject_state_trans_12a = {
.ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = NONE,
@@ -2510,7 +2510,7 @@ static const llc_conn_action_t llc_reject_actions_12b[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_12b = {
+static const struct llc_conn_state_trans llc_reject_state_trans_12b = {
.ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = NONE,
@@ -2530,7 +2530,7 @@ static const llc_conn_action_t llc_reject_actions_12c[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_12c = {
+static const struct llc_conn_state_trans llc_reject_state_trans_12c = {
.ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_reject_ev_qfyrs_12c,
@@ -2545,7 +2545,7 @@ static const llc_conn_action_t llc_reject_actions_13[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_13 = {
+static const struct llc_conn_state_trans llc_reject_state_trans_13 = {
.ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = NONE,
@@ -2567,7 +2567,7 @@ static const llc_conn_action_t llc_reject_actions_14a[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_14a = {
+static const struct llc_conn_state_trans llc_reject_state_trans_14a = {
.ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_reject_ev_qfyrs_14a,
@@ -2589,7 +2589,7 @@ static const llc_conn_action_t llc_reject_actions_14b[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_14b = {
+static const struct llc_conn_state_trans llc_reject_state_trans_14b = {
.ev = llc_conn_ev_rx_rej_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_reject_ev_qfyrs_14b,
@@ -2610,7 +2610,7 @@ static const llc_conn_action_t llc_reject_actions_15a[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_15a = {
+static const struct llc_conn_state_trans llc_reject_state_trans_15a = {
.ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_reject_ev_qfyrs_15a,
@@ -2631,7 +2631,7 @@ static const llc_conn_action_t llc_reject_actions_15b[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_15b = {
+static const struct llc_conn_state_trans llc_reject_state_trans_15b = {
.ev = llc_conn_ev_rx_rej_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_reject_ev_qfyrs_15b,
@@ -2647,7 +2647,7 @@ static const llc_conn_action_t llc_reject_actions_16[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_16 = {
+static const struct llc_conn_state_trans llc_reject_state_trans_16 = {
.ev = llc_conn_ev_rx_rej_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = NONE,
@@ -2666,7 +2666,7 @@ static const llc_conn_action_t llc_reject_actions_17[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_17 = {
+static const struct llc_conn_state_trans llc_reject_state_trans_17 = {
.ev = llc_conn_ev_init_p_f_cycle,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_reject_ev_qfyrs_17,
@@ -2688,7 +2688,7 @@ static const llc_conn_action_t llc_reject_actions_18[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_18 = {
+static const struct llc_conn_state_trans llc_reject_state_trans_18 = {
.ev = llc_conn_ev_rej_tmr_exp,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = llc_reject_ev_qfyrs_18,
@@ -2710,7 +2710,7 @@ static const llc_conn_action_t llc_reject_actions_19[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_19 = {
+static const struct llc_conn_state_trans llc_reject_state_trans_19 = {
.ev = llc_conn_ev_p_tmr_exp,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = llc_reject_ev_qfyrs_19,
@@ -2733,7 +2733,7 @@ static const llc_conn_action_t llc_reject_actions_20a[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_20a = {
+static const struct llc_conn_state_trans llc_reject_state_trans_20a = {
.ev = llc_conn_ev_ack_tmr_exp,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = llc_reject_ev_qfyrs_20a,
@@ -2756,7 +2756,7 @@ static const llc_conn_action_t llc_reject_actions_20b[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_reject_state_trans_20b = {
+static const struct llc_conn_state_trans llc_reject_state_trans_20b = {
.ev = llc_conn_ev_busy_tmr_exp,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = llc_reject_ev_qfyrs_20b,
@@ -2767,7 +2767,7 @@ static struct llc_conn_state_trans llc_reject_state_trans_20b = {
* Array of pointers;
* one to each transition
*/
-static struct llc_conn_state_trans *llc_reject_state_transitions[] = {
+static const struct llc_conn_state_trans *llc_reject_state_transitions[] = {
[0] = &llc_common_state_trans_1, /* Request */
[1] = &llc_common_state_trans_2,
[2] = &llc_common_state_trans_end,
@@ -2834,7 +2834,7 @@ static const llc_conn_ev_qfyr_t llc_await_ev_qfyrs_1_0[] = {
/* just one member, NULL, .bss zeroes it */
static const llc_conn_action_t llc_await_actions_1_0[1];
-static struct llc_conn_state_trans llc_await_state_trans_1_0 = {
+static const struct llc_conn_state_trans llc_await_state_trans_1_0 = {
.ev = llc_conn_ev_data_req,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = llc_await_ev_qfyrs_1_0,
@@ -2848,7 +2848,7 @@ static const llc_conn_action_t llc_await_actions_1[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_1 = {
+static const struct llc_conn_state_trans llc_await_state_trans_1 = {
.ev = llc_conn_ev_local_busy_detected,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -2867,7 +2867,7 @@ static const llc_conn_action_t llc_await_actions_2[] = {
[7] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_2 = {
+static const struct llc_conn_state_trans llc_await_state_trans_2 = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = NONE,
@@ -2883,7 +2883,7 @@ static const llc_conn_action_t llc_await_actions_3a[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_3a = {
+static const struct llc_conn_state_trans llc_await_state_trans_3a = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = NONE,
@@ -2899,7 +2899,7 @@ static const llc_conn_action_t llc_await_actions_3b[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_3b = {
+static const struct llc_conn_state_trans llc_await_state_trans_3b = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = NONE,
@@ -2916,7 +2916,7 @@ static const llc_conn_action_t llc_await_actions_4[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_4 = {
+static const struct llc_conn_state_trans llc_await_state_trans_4 = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = NONE,
@@ -2935,7 +2935,7 @@ static const llc_conn_action_t llc_await_actions_5[] = {
[7] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_5 = {
+static const struct llc_conn_state_trans llc_await_state_trans_5 = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -2952,7 +2952,7 @@ static const llc_conn_action_t llc_await_actions_6a[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_6a = {
+static const struct llc_conn_state_trans llc_await_state_trans_6a = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = NONE,
@@ -2969,7 +2969,7 @@ static const llc_conn_action_t llc_await_actions_6b[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_6b = {
+static const struct llc_conn_state_trans llc_await_state_trans_6b = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = NONE,
@@ -2986,7 +2986,7 @@ static const llc_conn_action_t llc_await_actions_7[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_7 = {
+static const struct llc_conn_state_trans llc_await_state_trans_7 = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = NONE,
@@ -3003,7 +3003,7 @@ static const llc_conn_action_t llc_await_actions_8a[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_8a = {
+static const struct llc_conn_state_trans llc_await_state_trans_8a = {
.ev = llc_conn_ev_rx_rr_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -3020,7 +3020,7 @@ static const llc_conn_action_t llc_await_actions_8b[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_8b = {
+static const struct llc_conn_state_trans llc_await_state_trans_8b = {
.ev = llc_conn_ev_rx_rej_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -3035,7 +3035,7 @@ static const llc_conn_action_t llc_await_actions_9a[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_9a = {
+static const struct llc_conn_state_trans llc_await_state_trans_9a = {
.ev = llc_conn_ev_rx_rr_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = NONE,
@@ -3050,7 +3050,7 @@ static const llc_conn_action_t llc_await_actions_9b[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_9b = {
+static const struct llc_conn_state_trans llc_await_state_trans_9b = {
.ev = llc_conn_ev_rx_rr_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = NONE,
@@ -3065,7 +3065,7 @@ static const llc_conn_action_t llc_await_actions_9c[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_9c = {
+static const struct llc_conn_state_trans llc_await_state_trans_9c = {
.ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = NONE,
@@ -3080,7 +3080,7 @@ static const llc_conn_action_t llc_await_actions_9d[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_9d = {
+static const struct llc_conn_state_trans llc_await_state_trans_9d = {
.ev = llc_conn_ev_rx_rej_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = NONE,
@@ -3096,7 +3096,7 @@ static const llc_conn_action_t llc_await_actions_10a[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_10a = {
+static const struct llc_conn_state_trans llc_await_state_trans_10a = {
.ev = llc_conn_ev_rx_rr_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = NONE,
@@ -3112,7 +3112,7 @@ static const llc_conn_action_t llc_await_actions_10b[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_10b = {
+static const struct llc_conn_state_trans llc_await_state_trans_10b = {
.ev = llc_conn_ev_rx_rej_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = NONE,
@@ -3128,7 +3128,7 @@ static const llc_conn_action_t llc_await_actions_11[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_11 = {
+static const struct llc_conn_state_trans llc_await_state_trans_11 = {
.ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -3143,7 +3143,7 @@ static const llc_conn_action_t llc_await_actions_12a[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_12a = {
+static const struct llc_conn_state_trans llc_await_state_trans_12a = {
.ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = NONE,
@@ -3158,7 +3158,7 @@ static const llc_conn_action_t llc_await_actions_12b[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_12b = {
+static const struct llc_conn_state_trans llc_await_state_trans_12b = {
.ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = NONE,
@@ -3174,7 +3174,7 @@ static const llc_conn_action_t llc_await_actions_13[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_13 = {
+static const struct llc_conn_state_trans llc_await_state_trans_13 = {
.ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = NONE,
@@ -3194,7 +3194,7 @@ static const llc_conn_action_t llc_await_actions_14[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_state_trans_14 = {
+static const struct llc_conn_state_trans llc_await_state_trans_14 = {
.ev = llc_conn_ev_p_tmr_exp,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = llc_await_ev_qfyrs_14,
@@ -3205,7 +3205,7 @@ static struct llc_conn_state_trans llc_await_state_trans_14 = {
* Array of pointers;
* one to each transition
*/
-static struct llc_conn_state_trans *llc_await_state_transitions[] = {
+static const struct llc_conn_state_trans *llc_await_state_transitions[] = {
[0] = &llc_common_state_trans_1, /* Request */
[1] = &llc_common_state_trans_2,
[2] = &llc_await_state_trans_1_0,
@@ -3263,7 +3263,7 @@ static const llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_1_0[] = {
/* just one member, NULL, .bss zeroes it */
static const llc_conn_action_t llc_await_busy_actions_1_0[1];
-static struct llc_conn_state_trans llc_await_busy_state_trans_1_0 = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_1_0 = {
.ev = llc_conn_ev_data_req,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = llc_await_busy_ev_qfyrs_1_0,
@@ -3282,7 +3282,7 @@ static const llc_conn_action_t llc_await_busy_actions_1[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_1 = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_1 = {
.ev = llc_conn_ev_local_busy_cleared,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = llc_await_busy_ev_qfyrs_1,
@@ -3300,7 +3300,7 @@ static const llc_conn_action_t llc_await_busy_actions_2[] = {
[1] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_2 = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_2 = {
.ev = llc_conn_ev_local_busy_cleared,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = llc_await_busy_ev_qfyrs_2,
@@ -3318,7 +3318,7 @@ static const llc_conn_action_t llc_await_busy_actions_3[] = {
[1] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_3 = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_3 = {
.ev = llc_conn_ev_local_busy_cleared,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = llc_await_busy_ev_qfyrs_3,
@@ -3337,7 +3337,7 @@ static const llc_conn_action_t llc_await_busy_actions_4[] = {
[7] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_4 = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_4 = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = NONE,
@@ -3353,7 +3353,7 @@ static const llc_conn_action_t llc_await_busy_actions_5a[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_5a = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_5a = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -3369,7 +3369,7 @@ static const llc_conn_action_t llc_await_busy_actions_5b[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_5b = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_5b = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -3385,7 +3385,7 @@ static const llc_conn_action_t llc_await_busy_actions_6[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_6 = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_6 = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -3406,7 +3406,7 @@ static const llc_conn_action_t llc_await_busy_actions_7[] = {
[9] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_7 = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_7 = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = NONE,
@@ -3424,7 +3424,7 @@ static const llc_conn_action_t llc_await_busy_actions_8a[] = {
[6] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_8a = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_8a = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -3442,7 +3442,7 @@ static const llc_conn_action_t llc_await_busy_actions_8b[] = {
[6] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_8b = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_8b = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -3460,7 +3460,7 @@ static const llc_conn_action_t llc_await_busy_actions_9[] = {
[6] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_9 = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_9 = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -3477,7 +3477,7 @@ static const llc_conn_action_t llc_await_busy_actions_10a[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_10a = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_10a = {
.ev = llc_conn_ev_rx_rr_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = NONE,
@@ -3494,7 +3494,7 @@ static const llc_conn_action_t llc_await_busy_actions_10b[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_10b = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_10b = {
.ev = llc_conn_ev_rx_rej_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = NONE,
@@ -3509,7 +3509,7 @@ static const llc_conn_action_t llc_await_busy_actions_11a[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_11a = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_11a = {
.ev = llc_conn_ev_rx_rr_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -3524,7 +3524,7 @@ static const llc_conn_action_t llc_await_busy_actions_11b[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_11b = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_11b = {
.ev = llc_conn_ev_rx_rr_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -3539,7 +3539,7 @@ static const llc_conn_action_t llc_await_busy_actions_11c[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_11c = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_11c = {
.ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -3554,7 +3554,7 @@ static const llc_conn_action_t llc_await_busy_actions_11d[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_11d = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_11d = {
.ev = llc_conn_ev_rx_rej_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -3570,7 +3570,7 @@ static const llc_conn_action_t llc_await_busy_actions_12a[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_12a = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_12a = {
.ev = llc_conn_ev_rx_rr_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -3586,7 +3586,7 @@ static const llc_conn_action_t llc_await_busy_actions_12b[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_12b = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_12b = {
.ev = llc_conn_ev_rx_rej_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -3602,7 +3602,7 @@ static const llc_conn_action_t llc_await_busy_actions_13[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_13 = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_13 = {
.ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_BUSY,
.ev_qualifiers = NONE,
@@ -3617,7 +3617,7 @@ static const llc_conn_action_t llc_await_busy_actions_14a[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_14a = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_14a = {
.ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -3632,7 +3632,7 @@ static const llc_conn_action_t llc_await_busy_actions_14b[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_14b = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_14b = {
.ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -3648,7 +3648,7 @@ static const llc_conn_action_t llc_await_busy_actions_15[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_15 = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_15 = {
.ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -3668,7 +3668,7 @@ static const llc_conn_action_t llc_await_busy_actions_16[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_busy_state_trans_16 = {
+static const struct llc_conn_state_trans llc_await_busy_state_trans_16 = {
.ev = llc_conn_ev_p_tmr_exp,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = llc_await_busy_ev_qfyrs_16,
@@ -3679,7 +3679,7 @@ static struct llc_conn_state_trans llc_await_busy_state_trans_16 = {
* Array of pointers;
* one to each transition
*/
-static struct llc_conn_state_trans *llc_await_busy_state_transitions[] = {
+static const struct llc_conn_state_trans *llc_await_busy_state_transitions[] = {
[0] = &llc_common_state_trans_1, /* Request */
[1] = &llc_common_state_trans_2,
[2] = &llc_await_busy_state_trans_1_0,
@@ -3739,7 +3739,7 @@ static const llc_conn_ev_qfyr_t llc_await_reject_ev_qfyrs_1_0[] = {
/* just one member, NULL, .bss zeroes it */
static const llc_conn_action_t llc_await_reject_actions_1_0[1];
-static struct llc_conn_state_trans llc_await_reject_state_trans_1_0 = {
+static const struct llc_conn_state_trans llc_await_reject_state_trans_1_0 = {
.ev = llc_conn_ev_data_req,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = llc_await_reject_ev_qfyrs_1_0,
@@ -3753,7 +3753,7 @@ static const llc_conn_action_t llc_await_rejct_actions_1[] = {
[2] = NULL
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_1 = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_1 = {
.ev = llc_conn_ev_local_busy_detected,
.next_state = LLC_CONN_STATE_AWAIT_BUSY,
.ev_qualifiers = NONE,
@@ -3767,7 +3767,7 @@ static const llc_conn_action_t llc_await_rejct_actions_2a[] = {
[2] = NULL
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_2a = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_2a = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = NONE,
@@ -3781,7 +3781,7 @@ static const llc_conn_action_t llc_await_rejct_actions_2b[] = {
[2] = NULL
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_2b = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_2b = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = NONE,
@@ -3796,7 +3796,7 @@ static const llc_conn_action_t llc_await_rejct_actions_3[] = {
[3] = NULL
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_3 = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_3 = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = NONE,
@@ -3816,7 +3816,7 @@ static const llc_conn_action_t llc_await_rejct_actions_4[] = {
[8] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_4 = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_4 = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -3834,7 +3834,7 @@ static const llc_conn_action_t llc_await_rejct_actions_5a[] = {
[6] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_5a = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_5a = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = NONE,
@@ -3852,7 +3852,7 @@ static const llc_conn_action_t llc_await_rejct_actions_5b[] = {
[6] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_5b = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_5b = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = NONE,
@@ -3870,7 +3870,7 @@ static const llc_conn_action_t llc_await_rejct_actions_6[] = {
[6] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_6 = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_6 = {
.ev = llc_conn_ev_rx_i_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_AWAIT,
.ev_qualifiers = NONE,
@@ -3887,7 +3887,7 @@ static const llc_conn_action_t llc_await_rejct_actions_7a[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_7a = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_7a = {
.ev = llc_conn_ev_rx_rr_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = NONE,
@@ -3904,7 +3904,7 @@ static const llc_conn_action_t llc_await_rejct_actions_7b[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_7b = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_7b = {
.ev = llc_conn_ev_rx_rej_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = NONE,
@@ -3921,7 +3921,7 @@ static const llc_conn_action_t llc_await_rejct_actions_7c[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_7c = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_7c = {
.ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = NONE,
@@ -3936,7 +3936,7 @@ static const llc_conn_action_t llc_await_rejct_actions_8a[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_8a = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_8a = {
.ev = llc_conn_ev_rx_rr_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = NONE,
@@ -3951,7 +3951,7 @@ static const llc_conn_action_t llc_await_rejct_actions_8b[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_8b = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_8b = {
.ev = llc_conn_ev_rx_rr_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = NONE,
@@ -3966,7 +3966,7 @@ static const llc_conn_action_t llc_await_rejct_actions_8c[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_8c = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_8c = {
.ev = llc_conn_ev_rx_rej_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = NONE,
@@ -3981,7 +3981,7 @@ static const llc_conn_action_t llc_await_rejct_actions_8d[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_8d = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_8d = {
.ev = llc_conn_ev_rx_rej_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = NONE,
@@ -3997,7 +3997,7 @@ static const llc_conn_action_t llc_await_rejct_actions_9a[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_9a = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_9a = {
.ev = llc_conn_ev_rx_rr_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = NONE,
@@ -4013,7 +4013,7 @@ static const llc_conn_action_t llc_await_rejct_actions_9b[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_9b = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_9b = {
.ev = llc_conn_ev_rx_rej_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = NONE,
@@ -4029,7 +4029,7 @@ static const llc_conn_action_t llc_await_rejct_actions_10[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_10 = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_10 = {
.ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1,
.next_state = LLC_CONN_STATE_REJ,
.ev_qualifiers = NONE,
@@ -4044,7 +4044,7 @@ static const llc_conn_action_t llc_await_rejct_actions_11a[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_11a = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_11a = {
.ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = NONE,
@@ -4059,7 +4059,7 @@ static const llc_conn_action_t llc_await_rejct_actions_11b[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_11b = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_11b = {
.ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = NONE,
@@ -4075,7 +4075,7 @@ static const llc_conn_action_t llc_await_rejct_actions_12[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_12 = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_12 = {
.ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = NONE,
@@ -4095,7 +4095,7 @@ static const llc_conn_action_t llc_await_rejct_actions_13[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_await_rejct_state_trans_13 = {
+static const struct llc_conn_state_trans llc_await_rejct_state_trans_13 = {
.ev = llc_conn_ev_p_tmr_exp,
.next_state = LLC_CONN_STATE_AWAIT_REJ,
.ev_qualifiers = llc_await_rejct_ev_qfyrs_13,
@@ -4106,7 +4106,7 @@ static struct llc_conn_state_trans llc_await_rejct_state_trans_13 = {
* Array of pointers;
* one to each transition
*/
-static struct llc_conn_state_trans *llc_await_rejct_state_transitions[] = {
+static const struct llc_conn_state_trans *llc_await_rejct_state_transitions[] = {
[0] = &llc_await_reject_state_trans_1_0,
[1] = &llc_common_state_trans_1, /* requests */
[2] = &llc_common_state_trans_2,
@@ -4171,7 +4171,7 @@ static const llc_conn_action_t llc_d_conn_actions_1[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_d_conn_state_trans_1 = {
+static const struct llc_conn_state_trans llc_d_conn_state_trans_1 = {
.ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_d_conn_ev_qfyrs_1,
@@ -4194,7 +4194,7 @@ static const llc_conn_action_t llc_d_conn_actions_1_1[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_d_conn_state_trans_1_1 = {
+static const struct llc_conn_state_trans llc_d_conn_state_trans_1_1 = {
.ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_d_conn_ev_qfyrs_1_1,
@@ -4218,7 +4218,7 @@ static const llc_conn_action_t llc_d_conn_actions_2[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_d_conn_state_trans_2 = {
+static const struct llc_conn_state_trans llc_d_conn_state_trans_2 = {
.ev = llc_conn_ev_rx_ua_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_d_conn_ev_qfyrs_2,
@@ -4241,7 +4241,7 @@ static const llc_conn_action_t llc_d_conn_actions_2_1[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_d_conn_state_trans_2_1 = {
+static const struct llc_conn_state_trans llc_d_conn_state_trans_2_1 = {
.ev = llc_conn_ev_rx_ua_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_d_conn_ev_qfyrs_2_1,
@@ -4254,7 +4254,7 @@ static const llc_conn_action_t llc_d_conn_actions_3[] = {
[1] = NULL,
};
-static struct llc_conn_state_trans llc_d_conn_state_trans_3 = {
+static const struct llc_conn_state_trans llc_d_conn_state_trans_3 = {
.ev = llc_conn_ev_rx_disc_cmd_pbit_set_x,
.next_state = LLC_CONN_STATE_D_CONN,
.ev_qualifiers = NONE,
@@ -4277,7 +4277,7 @@ static const llc_conn_action_t llc_d_conn_actions_4[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_d_conn_state_trans_4 = {
+static const struct llc_conn_state_trans llc_d_conn_state_trans_4 = {
.ev = llc_conn_ev_rx_dm_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_d_conn_ev_qfyrs_4,
@@ -4299,7 +4299,7 @@ static const llc_conn_action_t llc_d_conn_actions_4_1[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_d_conn_state_trans_4_1 = {
+static const struct llc_conn_state_trans llc_d_conn_state_trans_4_1 = {
.ev = llc_conn_ev_rx_dm_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_d_conn_ev_qfyrs_4_1,
@@ -4318,7 +4318,7 @@ static const llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_5[] = {
/* just one member, NULL, .bss zeroes it */
static const llc_conn_action_t llc_d_conn_actions_5[1];
-static struct llc_conn_state_trans llc_d_conn_state_trans_5 = {
+static const struct llc_conn_state_trans llc_d_conn_state_trans_5 = {
.ev = llc_conn_ev_data_req,
.next_state = LLC_CONN_STATE_D_CONN,
.ev_qualifiers = llc_d_conn_ev_qfyrs_5,
@@ -4338,7 +4338,7 @@ static const llc_conn_action_t llc_d_conn_actions_6[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_d_conn_state_trans_6 = {
+static const struct llc_conn_state_trans llc_d_conn_state_trans_6 = {
.ev = llc_conn_ev_ack_tmr_exp,
.next_state = LLC_CONN_STATE_D_CONN,
.ev_qualifiers = llc_d_conn_ev_qfyrs_6,
@@ -4359,7 +4359,7 @@ static const llc_conn_action_t llc_d_conn_actions_7[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_d_conn_state_trans_7 = {
+static const struct llc_conn_state_trans llc_d_conn_state_trans_7 = {
.ev = llc_conn_ev_ack_tmr_exp,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_d_conn_ev_qfyrs_7,
@@ -4379,7 +4379,7 @@ static const llc_conn_action_t llc_d_conn_actions_8[] = {
[1] = NULL,
};
-static struct llc_conn_state_trans llc_d_conn_state_trans_8 = {
+static const struct llc_conn_state_trans llc_d_conn_state_trans_8 = {
.ev = llc_conn_ev_ack_tmr_exp,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_d_conn_ev_qfyrs_8,
@@ -4390,7 +4390,7 @@ static struct llc_conn_state_trans llc_d_conn_state_trans_8 = {
* Array of pointers;
* one to each transition
*/
-static struct llc_conn_state_trans *llc_d_conn_state_transitions[] = {
+static const struct llc_conn_state_trans *llc_d_conn_state_transitions[] = {
[0] = &llc_d_conn_state_trans_5, /* Request */
[1] = &llc_common_state_trans_end,
[2] = &llc_common_state_trans_end, /* Local busy */
@@ -4419,7 +4419,7 @@ static const llc_conn_action_t llc_rst_actions_1[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_rst_state_trans_1 = {
+static const struct llc_conn_state_trans llc_rst_state_trans_1 = {
.ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
.next_state = LLC_CONN_STATE_RESET,
.ev_qualifiers = NONE,
@@ -4447,7 +4447,7 @@ static const llc_conn_action_t llc_rst_actions_2[] = {
[7] = NULL,
};
-static struct llc_conn_state_trans llc_rst_state_trans_2 = {
+static const struct llc_conn_state_trans llc_rst_state_trans_2 = {
.ev = llc_conn_ev_rx_ua_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_rst_ev_qfyrs_2,
@@ -4475,7 +4475,7 @@ static const llc_conn_action_t llc_rst_actions_2_1[] = {
[7] = NULL,
};
-static struct llc_conn_state_trans llc_rst_state_trans_2_1 = {
+static const struct llc_conn_state_trans llc_rst_state_trans_2_1 = {
.ev = llc_conn_ev_rx_ua_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_rst_ev_qfyrs_2_1,
@@ -4495,7 +4495,7 @@ static const llc_conn_action_t llc_rst_actions_3[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_rst_state_trans_3 = {
+static const struct llc_conn_state_trans llc_rst_state_trans_3 = {
.ev = llc_conn_ev_ack_tmr_exp,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = llc_rst_ev_qfyrs_3,
@@ -4518,7 +4518,7 @@ static const llc_conn_action_t llc_rst_actions_4[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_rst_state_trans_4 = {
+static const struct llc_conn_state_trans llc_rst_state_trans_4 = {
.ev = llc_conn_ev_rx_disc_cmd_pbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_rst_ev_qfyrs_4,
@@ -4541,7 +4541,7 @@ static const llc_conn_action_t llc_rst_actions_4_1[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_rst_state_trans_4_1 = {
+static const struct llc_conn_state_trans llc_rst_state_trans_4_1 = {
.ev = llc_conn_ev_rx_disc_cmd_pbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_rst_ev_qfyrs_4_1,
@@ -4564,7 +4564,7 @@ static const llc_conn_action_t llc_rst_actions_5[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_rst_state_trans_5 = {
+static const struct llc_conn_state_trans llc_rst_state_trans_5 = {
.ev = llc_conn_ev_rx_dm_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_rst_ev_qfyrs_5,
@@ -4586,7 +4586,7 @@ static const llc_conn_action_t llc_rst_actions_5_1[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_rst_state_trans_5_1 = {
+static const struct llc_conn_state_trans llc_rst_state_trans_5_1 = {
.ev = llc_conn_ev_rx_dm_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_rst_ev_qfyrs_5_1,
@@ -4602,7 +4602,7 @@ static const llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_6[] = {
/* just one member, NULL, .bss zeroes it */
static const llc_conn_action_t llc_rst_actions_6[1];
-static struct llc_conn_state_trans llc_rst_state_trans_6 = {
+static const struct llc_conn_state_trans llc_rst_state_trans_6 = {
.ev = llc_conn_ev_data_req,
.next_state = LLC_CONN_STATE_RESET,
.ev_qualifiers = llc_rst_ev_qfyrs_6,
@@ -4623,7 +4623,7 @@ static const llc_conn_action_t llc_rst_actions_7[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_rst_state_trans_7 = {
+static const struct llc_conn_state_trans llc_rst_state_trans_7 = {
.ev = llc_conn_ev_ack_tmr_exp,
.next_state = LLC_CONN_STATE_RESET,
.ev_qualifiers = llc_rst_ev_qfyrs_7,
@@ -4644,7 +4644,7 @@ static const llc_conn_action_t llc_rst_actions_8[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_rst_state_trans_8 = {
+static const struct llc_conn_state_trans llc_rst_state_trans_8 = {
.ev = llc_conn_ev_ack_tmr_exp,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_rst_ev_qfyrs_8,
@@ -4665,7 +4665,7 @@ static const llc_conn_action_t llc_rst_actions_8_1[] = {
[2] = NULL,
};
-static struct llc_conn_state_trans llc_rst_state_trans_8_1 = {
+static const struct llc_conn_state_trans llc_rst_state_trans_8_1 = {
.ev = llc_conn_ev_ack_tmr_exp,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = llc_rst_ev_qfyrs_8_1,
@@ -4676,7 +4676,7 @@ static struct llc_conn_state_trans llc_rst_state_trans_8_1 = {
* Array of pointers;
* one to each transition
*/
-static struct llc_conn_state_trans *llc_rst_state_transitions[] = {
+static const struct llc_conn_state_trans *llc_rst_state_transitions[] = {
[0] = &llc_rst_state_trans_6, /* Request */
[1] = &llc_common_state_trans_end,
[2] = &llc_common_state_trans_end, /* Local busy */
@@ -4710,7 +4710,7 @@ static const llc_conn_action_t llc_error_actions_1[] = {
[8] = NULL,
};
-static struct llc_conn_state_trans llc_error_state_trans_1 = {
+static const struct llc_conn_state_trans llc_error_state_trans_1 = {
.ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x,
.next_state = LLC_CONN_STATE_NORMAL,
.ev_qualifiers = NONE,
@@ -4726,7 +4726,7 @@ static const llc_conn_action_t llc_error_actions_2[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_error_state_trans_2 = {
+static const struct llc_conn_state_trans llc_error_state_trans_2 = {
.ev = llc_conn_ev_rx_disc_cmd_pbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = NONE,
@@ -4741,7 +4741,7 @@ static const llc_conn_action_t llc_error_actions_3[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_error_state_trans_3 = {
+static const struct llc_conn_state_trans llc_error_state_trans_3 = {
.ev = llc_conn_ev_rx_dm_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = NONE,
@@ -4757,7 +4757,7 @@ static const llc_conn_action_t llc_error_actions_4[] = {
[4] = NULL,
};
-static struct llc_conn_state_trans llc_error_state_trans_4 = {
+static const struct llc_conn_state_trans llc_error_state_trans_4 = {
.ev = llc_conn_ev_rx_frmr_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_RESET,
.ev_qualifiers = NONE,
@@ -4770,7 +4770,7 @@ static const llc_conn_action_t llc_error_actions_5[] = {
[1] = NULL,
};
-static struct llc_conn_state_trans llc_error_state_trans_5 = {
+static const struct llc_conn_state_trans llc_error_state_trans_5 = {
.ev = llc_conn_ev_rx_xxx_cmd_pbit_set_x,
.next_state = LLC_CONN_STATE_ERROR,
.ev_qualifiers = NONE,
@@ -4778,7 +4778,7 @@ static struct llc_conn_state_trans llc_error_state_trans_5 = {
};
/* State transitions for LLC_CONN_EV_RX_XXX_RSP_Fbit_SET_X event */
-static struct llc_conn_state_trans llc_error_state_trans_6 = {
+static const struct llc_conn_state_trans llc_error_state_trans_6 = {
.ev = llc_conn_ev_rx_xxx_rsp_fbit_set_x,
.next_state = LLC_CONN_STATE_ERROR,
.ev_qualifiers = NONE,
@@ -4798,7 +4798,7 @@ static const llc_conn_action_t llc_error_actions_7[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_error_state_trans_7 = {
+static const struct llc_conn_state_trans llc_error_state_trans_7 = {
.ev = llc_conn_ev_ack_tmr_exp,
.next_state = LLC_CONN_STATE_ERROR,
.ev_qualifiers = llc_error_ev_qfyrs_7,
@@ -4820,7 +4820,7 @@ static const llc_conn_action_t llc_error_actions_8[] = {
[5] = NULL,
};
-static struct llc_conn_state_trans llc_error_state_trans_8 = {
+static const struct llc_conn_state_trans llc_error_state_trans_8 = {
.ev = llc_conn_ev_ack_tmr_exp,
.next_state = LLC_CONN_STATE_RESET,
.ev_qualifiers = llc_error_ev_qfyrs_8,
@@ -4836,7 +4836,7 @@ static const llc_conn_ev_qfyr_t llc_error_ev_qfyrs_9[] = {
/* just one member, NULL, .bss zeroes it */
static const llc_conn_action_t llc_error_actions_9[1];
-static struct llc_conn_state_trans llc_error_state_trans_9 = {
+static const struct llc_conn_state_trans llc_error_state_trans_9 = {
.ev = llc_conn_ev_data_req,
.next_state = LLC_CONN_STATE_ERROR,
.ev_qualifiers = llc_error_ev_qfyrs_9,
@@ -4847,7 +4847,7 @@ static struct llc_conn_state_trans llc_error_state_trans_9 = {
* Array of pointers;
* one to each transition
*/
-static struct llc_conn_state_trans *llc_error_state_transitions[] = {
+static const struct llc_conn_state_trans *llc_error_state_transitions[] = {
[0] = &llc_error_state_trans_9, /* Request */
[1] = &llc_common_state_trans_end,
[2] = &llc_common_state_trans_end, /* Local busy */
@@ -4873,7 +4873,7 @@ static const llc_conn_action_t llc_temp_actions_1[] = {
[3] = NULL,
};
-static struct llc_conn_state_trans llc_temp_state_trans_1 = {
+static const struct llc_conn_state_trans llc_temp_state_trans_1 = {
.ev = llc_conn_ev_disc_req,
.next_state = LLC_CONN_STATE_ADM,
.ev_qualifiers = NONE,
@@ -4884,7 +4884,7 @@ static struct llc_conn_state_trans llc_temp_state_trans_1 = {
* Array of pointers;
* one to each transition
*/
-static struct llc_conn_state_trans *llc_temp_state_transitions[] = {
+static const struct llc_conn_state_trans *llc_temp_state_transitions[] = {
[0] = &llc_temp_state_trans_1, /* requests */
[1] = &llc_common_state_trans_end,
[2] = &llc_common_state_trans_end, /* local busy */
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index c0ac522b48a1..5c0ac243b248 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -14,14 +14,15 @@
#include <linux/init.h>
#include <linux/slab.h>
-#include <net/llc_sap.h>
-#include <net/llc_conn.h>
-#include <net/sock.h>
-#include <net/tcp_states.h>
-#include <net/llc_c_ev.h>
+#include <net/llc.h>
#include <net/llc_c_ac.h>
+#include <net/llc_c_ev.h>
#include <net/llc_c_st.h>
+#include <net/llc_conn.h>
#include <net/llc_pdu.h>
+#include <net/llc_sap.h>
+#include <net/sock.h>
+#include <net/tcp_states.h>
#if 0
#define dprintk(args...) printk(KERN_DEBUG args)
@@ -30,13 +31,13 @@
#endif
static int llc_find_offset(int state, int ev_type);
-static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *skb);
+static void llc_conn_send_pdus(struct sock *sk);
static int llc_conn_service(struct sock *sk, struct sk_buff *skb);
static int llc_exec_conn_trans_actions(struct sock *sk,
- struct llc_conn_state_trans *trans,
+ const struct llc_conn_state_trans *trans,
struct sk_buff *ev);
-static struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk,
- struct sk_buff *skb);
+static const struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk,
+ struct sk_buff *skb);
/* Offset table on connection states transition diagram */
static int llc_offset_table[NBR_CONN_STATES][NBR_CONN_EV];
@@ -55,6 +56,8 @@ int sysctl_llc2_busy_timeout = LLC2_BUSY_TIME * HZ;
* (executing it's actions and changing state), upper layer will be
* indicated or confirmed, if needed. Returns 0 for success, 1 for
* failure. The socket lock has to be held before calling this function.
+ *
+ * This function always consumes a reference to the skb.
*/
int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
{
@@ -62,12 +65,6 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
struct llc_sock *llc = llc_sk(skb->sk);
struct llc_conn_state_ev *ev = llc_conn_ev(skb);
- /*
- * We have to hold the skb, because llc_conn_service will kfree it in
- * the sending path and we need to look at the skb->cb, where we encode
- * llc_conn_state_ev.
- */
- skb_get(skb);
ev->ind_prim = ev->cfm_prim = 0;
/*
* Send event to state machine
@@ -75,21 +72,12 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
rc = llc_conn_service(skb->sk, skb);
if (unlikely(rc != 0)) {
printk(KERN_ERR "%s: llc_conn_service failed\n", __func__);
- goto out_kfree_skb;
- }
-
- if (unlikely(!ev->ind_prim && !ev->cfm_prim)) {
- /* indicate or confirm not required */
- if (!skb->next)
- goto out_kfree_skb;
goto out_skb_put;
}
- if (unlikely(ev->ind_prim && ev->cfm_prim)) /* Paranoia */
- skb_get(skb);
-
switch (ev->ind_prim) {
case LLC_DATA_PRIM:
+ skb_get(skb);
llc_save_primitive(sk, skb, LLC_DATA_PRIM);
if (unlikely(sock_queue_rcv_skb(sk, skb))) {
/*
@@ -106,6 +94,7 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
* skb->sk pointing to the newly created struct sock in
* llc_conn_handler. -acme
*/
+ skb_get(skb);
skb_queue_tail(&sk->sk_receive_queue, skb);
sk->sk_state_change(sk);
break;
@@ -121,7 +110,6 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
sk->sk_state_change(sk);
}
}
- kfree_skb(skb);
sock_put(sk);
break;
case LLC_RESET_PRIM:
@@ -130,14 +118,11 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
* RESET is not being notified to upper layers for now
*/
printk(KERN_INFO "%s: received a reset ind!\n", __func__);
- kfree_skb(skb);
break;
default:
- if (ev->ind_prim) {
+ if (ev->ind_prim)
printk(KERN_INFO "%s: received unknown %d prim!\n",
__func__, ev->ind_prim);
- kfree_skb(skb);
- }
/* No indication */
break;
}
@@ -179,25 +164,22 @@ int llc_conn_state_process(struct sock *sk, struct sk_buff *skb)
printk(KERN_INFO "%s: received a reset conf!\n", __func__);
break;
default:
- if (ev->cfm_prim) {
+ if (ev->cfm_prim)
printk(KERN_INFO "%s: received unknown %d prim!\n",
__func__, ev->cfm_prim);
- break;
- }
- goto out_skb_put; /* No confirmation */
+ /* No confirmation */
+ break;
}
-out_kfree_skb:
- kfree_skb(skb);
out_skb_put:
kfree_skb(skb);
return rc;
}
-int llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb)
+void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb)
{
/* queue PDU to send to MAC layer */
skb_queue_tail(&sk->sk_write_queue, skb);
- return llc_conn_send_pdus(sk, skb);
+ llc_conn_send_pdus(sk);
}
/**
@@ -255,7 +237,7 @@ void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit)
if (howmany_resend > 0)
llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO;
/* any PDUs to re-send are queued up; start sending to MAC */
- llc_conn_send_pdus(sk, NULL);
+ llc_conn_send_pdus(sk);
out:;
}
@@ -296,15 +278,15 @@ void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit)
if (howmany_resend > 0)
llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO;
/* any PDUs to re-send are queued up; start sending to MAC */
- llc_conn_send_pdus(sk, NULL);
+ llc_conn_send_pdus(sk);
out:;
}
/**
* llc_conn_remove_acked_pdus - Removes acknowledged pdus from tx queue
* @sk: active connection
- * nr: NR
- * how_many_unacked: size of pdu_unack_q after removing acked pdus
+ * @nr: NR
+ * @how_many_unacked: size of pdu_unack_q after removing acked pdus
*
* Removes acknowledged pdus from transmit queue (pdu_unack_q). Returns
* the number of pdus that removed from queue.
@@ -340,16 +322,12 @@ out:
/**
* llc_conn_send_pdus - Sends queued PDUs
* @sk: active connection
- * @hold_skb: the skb held by caller, or NULL if does not care
*
- * Sends queued pdus to MAC layer for transmission. When @hold_skb is
- * NULL, always return 0. Otherwise, return 0 if @hold_skb is sent
- * successfully, or 1 for failure.
+ * Sends queued pdus to MAC layer for transmission.
*/
-static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *hold_skb)
+static void llc_conn_send_pdus(struct sock *sk)
{
struct sk_buff *skb;
- int ret = 0;
while ((skb = skb_dequeue(&sk->sk_write_queue)) != NULL) {
struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
@@ -361,20 +339,10 @@ static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *hold_skb)
skb_queue_tail(&llc_sk(sk)->pdu_unack_q, skb);
if (!skb2)
break;
- dev_queue_xmit(skb2);
- } else {
- bool is_target = skb == hold_skb;
- int rc;
-
- if (is_target)
- skb_get(skb);
- rc = dev_queue_xmit(skb);
- if (is_target)
- ret = rc;
+ skb = skb2;
}
+ dev_queue_xmit(skb);
}
-
- return ret;
}
/**
@@ -388,9 +356,9 @@ static int llc_conn_send_pdus(struct sock *sk, struct sk_buff *hold_skb)
*/
static int llc_conn_service(struct sock *sk, struct sk_buff *skb)
{
- int rc = 1;
+ const struct llc_conn_state_trans *trans;
struct llc_sock *llc = llc_sk(sk);
- struct llc_conn_state_trans *trans;
+ int rc = 1;
if (llc->state > NBR_CONN_STATES)
goto out;
@@ -416,10 +384,10 @@ out:
* This function finds transition that matches with happened event.
* Returns pointer to found transition on success, %NULL otherwise.
*/
-static struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk,
- struct sk_buff *skb)
+static const struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk,
+ struct sk_buff *skb)
{
- struct llc_conn_state_trans **next_trans;
+ const struct llc_conn_state_trans **next_trans;
const llc_conn_ev_qfyr_t *next_qualifier;
struct llc_conn_state_ev *ev = llc_conn_ev(skb);
struct llc_sock *llc = llc_sk(sk);
@@ -464,7 +432,7 @@ static struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk,
* success, 1 to indicate failure of at least one action.
*/
static int llc_exec_conn_trans_actions(struct sock *sk,
- struct llc_conn_state_trans *trans,
+ const struct llc_conn_state_trans *trans,
struct sk_buff *skb)
{
int rc = 0;
@@ -486,11 +454,13 @@ static int llc_exec_conn_trans_actions(struct sock *sk,
static inline bool llc_estab_match(const struct llc_sap *sap,
const struct llc_addr *daddr,
const struct llc_addr *laddr,
- const struct sock *sk)
+ const struct sock *sk,
+ const struct net *net)
{
struct llc_sock *llc = llc_sk(sk);
- return llc->laddr.lsap == laddr->lsap &&
+ return net_eq(sock_net(sk), net) &&
+ llc->laddr.lsap == laddr->lsap &&
llc->daddr.lsap == daddr->lsap &&
ether_addr_equal(llc->laddr.mac, laddr->mac) &&
ether_addr_equal(llc->daddr.mac, daddr->mac);
@@ -501,6 +471,7 @@ static inline bool llc_estab_match(const struct llc_sap *sap,
* @sap: SAP
* @daddr: address of remote LLC (MAC + SAP)
* @laddr: address of local LLC (MAC + SAP)
+ * @net: netns to look up a socket in
*
* Search connection list of the SAP and finds connection using the remote
* mac, remote sap, local mac, and local sap. Returns pointer for
@@ -509,7 +480,8 @@ static inline bool llc_estab_match(const struct llc_sap *sap,
*/
static struct sock *__llc_lookup_established(struct llc_sap *sap,
struct llc_addr *daddr,
- struct llc_addr *laddr)
+ struct llc_addr *laddr,
+ const struct net *net)
{
struct sock *rc;
struct hlist_nulls_node *node;
@@ -519,12 +491,12 @@ static struct sock *__llc_lookup_established(struct llc_sap *sap,
rcu_read_lock();
again:
sk_nulls_for_each_rcu(rc, node, laddr_hb) {
- if (llc_estab_match(sap, daddr, laddr, rc)) {
+ if (llc_estab_match(sap, daddr, laddr, rc, net)) {
/* Extra checks required by SLAB_TYPESAFE_BY_RCU */
if (unlikely(!refcount_inc_not_zero(&rc->sk_refcnt)))
goto again;
if (unlikely(llc_sk(rc)->sap != sap ||
- !llc_estab_match(sap, daddr, laddr, rc))) {
+ !llc_estab_match(sap, daddr, laddr, rc, net))) {
sock_put(rc);
continue;
}
@@ -546,29 +518,33 @@ found:
struct sock *llc_lookup_established(struct llc_sap *sap,
struct llc_addr *daddr,
- struct llc_addr *laddr)
+ struct llc_addr *laddr,
+ const struct net *net)
{
struct sock *sk;
local_bh_disable();
- sk = __llc_lookup_established(sap, daddr, laddr);
+ sk = __llc_lookup_established(sap, daddr, laddr, net);
local_bh_enable();
return sk;
}
static inline bool llc_listener_match(const struct llc_sap *sap,
const struct llc_addr *laddr,
- const struct sock *sk)
+ const struct sock *sk,
+ const struct net *net)
{
struct llc_sock *llc = llc_sk(sk);
- return sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN &&
+ return net_eq(sock_net(sk), net) &&
+ sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN &&
llc->laddr.lsap == laddr->lsap &&
ether_addr_equal(llc->laddr.mac, laddr->mac);
}
static struct sock *__llc_lookup_listener(struct llc_sap *sap,
- struct llc_addr *laddr)
+ struct llc_addr *laddr,
+ const struct net *net)
{
struct sock *rc;
struct hlist_nulls_node *node;
@@ -578,12 +554,12 @@ static struct sock *__llc_lookup_listener(struct llc_sap *sap,
rcu_read_lock();
again:
sk_nulls_for_each_rcu(rc, node, laddr_hb) {
- if (llc_listener_match(sap, laddr, rc)) {
+ if (llc_listener_match(sap, laddr, rc, net)) {
/* Extra checks required by SLAB_TYPESAFE_BY_RCU */
if (unlikely(!refcount_inc_not_zero(&rc->sk_refcnt)))
goto again;
if (unlikely(llc_sk(rc)->sap != sap ||
- !llc_listener_match(sap, laddr, rc))) {
+ !llc_listener_match(sap, laddr, rc, net))) {
sock_put(rc);
continue;
}
@@ -607,6 +583,7 @@ found:
* llc_lookup_listener - Finds listener for local MAC + SAP
* @sap: SAP
* @laddr: address of local LLC (MAC + SAP)
+ * @net: netns to look up a socket in
*
* Search connection list of the SAP and finds connection listening on
* local mac, and local sap. Returns pointer for parent socket found,
@@ -614,24 +591,26 @@ found:
* Caller has to make sure local_bh is disabled.
*/
static struct sock *llc_lookup_listener(struct llc_sap *sap,
- struct llc_addr *laddr)
+ struct llc_addr *laddr,
+ const struct net *net)
{
+ struct sock *rc = __llc_lookup_listener(sap, laddr, net);
static struct llc_addr null_addr;
- struct sock *rc = __llc_lookup_listener(sap, laddr);
if (!rc)
- rc = __llc_lookup_listener(sap, &null_addr);
+ rc = __llc_lookup_listener(sap, &null_addr, net);
return rc;
}
static struct sock *__llc_lookup(struct llc_sap *sap,
struct llc_addr *daddr,
- struct llc_addr *laddr)
+ struct llc_addr *laddr,
+ const struct net *net)
{
- struct sock *sk = __llc_lookup_established(sap, daddr, laddr);
+ struct sock *sk = __llc_lookup_established(sap, daddr, laddr, net);
- return sk ? : llc_lookup_listener(sap, laddr);
+ return sk ? : llc_lookup_listener(sap, laddr, net);
}
/**
@@ -656,8 +635,8 @@ u8 llc_data_accept_state(u8 state)
*/
static u16 __init llc_find_next_offset(struct llc_conn_state *state, u16 offset)
{
+ const struct llc_conn_state_trans **next_trans;
u16 cnt = 0;
- struct llc_conn_state_trans **next_trans;
for (next_trans = state->transitions + offset;
(*next_trans)->ev; next_trans++)
@@ -734,6 +713,7 @@ void llc_sap_add_socket(struct llc_sap *sap, struct sock *sk)
llc_sk(sk)->sap = sap;
spin_lock_bh(&sap->sk_lock);
+ sock_set_flag(sk, SOCK_RCU_FREE);
sap->sk_count++;
sk_nulls_add_node_rcu(sk, laddr_hb);
hlist_add_head(&llc->dev_hash_node, dev_hb);
@@ -808,7 +788,7 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb)
llc_pdu_decode_da(skb, daddr.mac);
llc_pdu_decode_dsap(skb, &daddr.lsap);
- sk = __llc_lookup(sap, &saddr, &daddr);
+ sk = __llc_lookup(sap, &saddr, &daddr, dev_net(skb->dev));
if (!sk)
goto drop;
@@ -845,7 +825,7 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb)
else {
dprintk("%s: adding to backlog...\n", __func__);
llc_set_backlog_type(skb, LLC_PACKET);
- if (sk_add_backlog(sk, skb, sk->sk_rcvbuf))
+ if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))
goto drop_unlock;
}
out:
@@ -938,8 +918,11 @@ static void llc_sk_init(struct sock *sk)
/**
* llc_sk_alloc - Allocates LLC sock
+ * @net: network namespace
* @family: upper layer protocol family
* @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ * @prot: struct proto associated with this new sock instance
+ * @kern: is this to be a kernel socket?
*
* Allocates a LLC sock and initializes it. Returns the new LLC sock
* or %NULL if there's no memory available for one
@@ -966,15 +949,15 @@ void llc_sk_stop_all_timers(struct sock *sk, bool sync)
struct llc_sock *llc = llc_sk(sk);
if (sync) {
- del_timer_sync(&llc->pf_cycle_timer.timer);
- del_timer_sync(&llc->ack_timer.timer);
- del_timer_sync(&llc->rej_sent_timer.timer);
- del_timer_sync(&llc->busy_state_timer.timer);
+ timer_delete_sync(&llc->pf_cycle_timer.timer);
+ timer_delete_sync(&llc->ack_timer.timer);
+ timer_delete_sync(&llc->rej_sent_timer.timer);
+ timer_delete_sync(&llc->busy_state_timer.timer);
} else {
- del_timer(&llc->pf_cycle_timer.timer);
- del_timer(&llc->ack_timer.timer);
- del_timer(&llc->rej_sent_timer.timer);
- del_timer(&llc->busy_state_timer.timer);
+ timer_delete(&llc->pf_cycle_timer.timer);
+ timer_delete(&llc->ack_timer.timer);
+ timer_delete(&llc->rej_sent_timer.timer);
+ timer_delete(&llc->busy_state_timer.timer);
}
llc->ack_must_be_send = 0;
@@ -983,7 +966,7 @@ void llc_sk_stop_all_timers(struct sock *sk, bool sync)
/**
* llc_sk_free - Frees a LLC socket
- * @sk - socket to free
+ * @sk: - socket to free
*
* Frees a LLC socket
*/
diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
index 260b3dc1b4a2..4f16d9c88350 100644
--- a/net/llc/llc_core.c
+++ b/net/llc/llc_core.c
@@ -59,10 +59,10 @@ out:
}
/**
- * llc_sap_find - searchs a SAP in station
+ * llc_sap_find - searches a SAP in station
* @sap_value: sap to be found
*
- * Searchs for a sap in the sap list of the LLC's station upon the sap ID.
+ * Searches for a sap in the sap list of the LLC's station upon the sap ID.
* If the sap is found it will be refcounted and the user will have to do
* a llc_sap_put after use.
* Returns the sap or %NULL if not found.
@@ -127,9 +127,7 @@ void llc_sap_close(struct llc_sap *sap)
list_del_rcu(&sap->node);
spin_unlock_bh(&llc_sap_list_lock);
- synchronize_rcu();
-
- kfree(sap);
+ kfree_rcu(sap, rcu);
}
static struct packet_type llc_packet_type __read_mostly = {
@@ -137,22 +135,15 @@ static struct packet_type llc_packet_type __read_mostly = {
.func = llc_rcv,
};
-static struct packet_type llc_tr_packet_type __read_mostly = {
- .type = cpu_to_be16(ETH_P_TR_802_2),
- .func = llc_rcv,
-};
-
static int __init llc_init(void)
{
dev_add_pack(&llc_packet_type);
- dev_add_pack(&llc_tr_packet_type);
return 0;
}
static void __exit llc_exit(void)
{
dev_remove_pack(&llc_packet_type);
- dev_remove_pack(&llc_tr_packet_type);
}
module_init(llc_init);
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c
index 8db03c2d5440..58a5f419adc6 100644
--- a/net/llc/llc_if.c
+++ b/net/llc/llc_if.c
@@ -38,6 +38,8 @@
* closed and -EBUSY when sending data is not permitted in this state or
* LLC has send an I pdu with p bit set to 1 and is waiting for it's
* response.
+ *
+ * This function always consumes a reference to the skb.
*/
int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb)
{
@@ -46,20 +48,22 @@ int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb)
struct llc_sock *llc = llc_sk(sk);
if (unlikely(llc->state == LLC_CONN_STATE_ADM))
- goto out;
+ goto out_free;
rc = -EBUSY;
if (unlikely(llc_data_accept_state(llc->state) || /* data_conn_refuse */
llc->p_flag)) {
llc->failed_data_req = 1;
- goto out;
+ goto out_free;
}
ev = llc_conn_ev(skb);
ev->type = LLC_CONN_EV_TYPE_PRIM;
ev->prim = LLC_DATA_PRIM;
ev->prim_type = LLC_PRIM_TYPE_REQ;
skb->dev = llc->dev;
- rc = llc_conn_state_process(sk, skb);
-out:
+ return llc_conn_state_process(sk, skb);
+
+out_free:
+ kfree_skb(skb);
return rc;
}
@@ -76,7 +80,7 @@ out:
* establishment will inform to upper layer via calling it's confirm
* function and passing proper information.
*/
-int llc_establish_connection(struct sock *sk, u8 *lmac, u8 *dmac, u8 dsap)
+int llc_establish_connection(struct sock *sk, const u8 *lmac, u8 *dmac, u8 dsap)
{
int rc = -EISCONN;
struct llc_addr laddr, daddr;
@@ -88,7 +92,7 @@ int llc_establish_connection(struct sock *sk, u8 *lmac, u8 *dmac, u8 dsap)
daddr.lsap = dsap;
memcpy(daddr.mac, dmac, sizeof(daddr.mac));
memcpy(laddr.mac, lmac, sizeof(laddr.mac));
- existing = llc_lookup_established(llc->sap, &daddr, &laddr);
+ existing = llc_lookup_established(llc->sap, &daddr, &laddr, sock_net(sk));
if (existing) {
if (existing->sk_state == TCP_ESTABLISHED) {
sk = existing;
diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c
index 82cb93f66b9b..61b0159b2fbe 100644
--- a/net/llc/llc_input.c
+++ b/net/llc/llc_input.c
@@ -124,11 +124,17 @@ static inline int llc_fixup_skb(struct sk_buff *skb)
if (unlikely(!pskb_may_pull(skb, llc_len)))
return 0;
- skb->transport_header += llc_len;
skb_pull(skb, llc_len);
+ skb_reset_transport_header(skb);
if (skb->protocol == htons(ETH_P_802_2)) {
- __be16 pdulen = eth_hdr(skb)->h_proto;
- s32 data_size = ntohs(pdulen) - llc_len;
+ __be16 pdulen;
+ s32 data_size;
+
+ if (skb->mac_len < ETH_HLEN)
+ return 0;
+
+ pdulen = eth_hdr(skb)->h_proto;
+ data_size = ntohs(pdulen) - llc_len;
if (data_size < 0 ||
!pskb_may_pull(skb, data_size))
@@ -144,6 +150,7 @@ static inline int llc_fixup_skb(struct sk_buff *skb)
* @skb: received pdu
* @dev: device that receive pdu
* @pt: packet type
+ * @orig_dev: the original receive net device
*
* When the system receives a 802.2 frame this function is called. It
* checks SAP and connection of received pdu and passes frame to
@@ -162,9 +169,6 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev,
void (*sta_handler)(struct sk_buff *skb);
void (*sap_handler)(struct llc_sap *sap, struct sk_buff *skb);
- if (!net_eq(dev_net(dev), &init_net))
- goto drop;
-
/*
* When the interface is in promisc. mode, drop all the crap that it
* receives, do not try to analyse it.
diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c
index 94425e421213..5a6466fc626a 100644
--- a/net/llc/llc_output.c
+++ b/net/llc/llc_output.c
@@ -1,16 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* llc_output.c - LLC minimal output path
*
* Copyright (c) 1997 by Procom Technology, Inc.
* 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * This program can be redistributed or modified under the terms of the
- * GNU General Public License version 2 as published by the Free Software
- * Foundation.
- * This program is distributed without any warranty or implied warranty
- * of merchantability or fitness for a particular purpose.
- *
- * See the GNU General Public License version 2 for more details.
*/
#include <linux/if_arp.h>
@@ -63,7 +56,7 @@ int llc_mac_hdr_init(struct sk_buff *skb,
* package primitive as an event and send to SAP event handler
*/
int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb,
- unsigned char *dmac, unsigned char dsap)
+ const unsigned char *dmac, unsigned char dsap)
{
int rc;
llc_pdu_header_init(skb, LLC_PDU_TYPE_U, sap->laddr.lsap,
@@ -72,6 +65,8 @@ int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb,
rc = llc_mac_hdr_init(skb, skb->dev->dev_addr, dmac);
if (likely(!rc))
rc = dev_queue_xmit(skb);
+ else
+ kfree_skb(skb);
return rc;
}
diff --git a/net/llc/llc_pdu.c b/net/llc/llc_pdu.c
index 2e6cb79196bb..63749dde542f 100644
--- a/net/llc/llc_pdu.c
+++ b/net/llc/llc_pdu.c
@@ -24,8 +24,8 @@ void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 pdu_type)
}
/**
- * pdu_set_pf_bit - sets poll/final bit in LLC header
- * @pdu_frame: input frame that p/f bit must be set into it.
+ * llc_pdu_set_pf_bit - sets poll/final bit in LLC header
+ * @skb: Frame to set bit in
* @bit_value: poll/final bit (0 or 1).
*
* This function sets poll/final bit in LLC header (based on type of PDU).
diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c
index f3a36c16a5e7..aa81c67b24a1 100644
--- a/net/llc/llc_proc.c
+++ b/net/llc/llc_proc.c
@@ -26,7 +26,7 @@
#include <net/llc_c_st.h>
#include <net/llc_conn.h>
-static void llc_ui_format_mac(struct seq_file *seq, u8 *addr)
+static void llc_ui_format_mac(struct seq_file *seq, const u8 *addr)
{
seq_printf(seq, "%pM", addr);
}
@@ -56,7 +56,7 @@ found:
return sk;
}
-static void *llc_seq_start(struct seq_file *seq, loff_t *pos)
+static void *llc_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU)
{
loff_t l = *pos;
@@ -151,7 +151,7 @@ static int llc_seq_socket_show(struct seq_file *seq, void *v)
sk_wmem_alloc_get(sk),
sk_rmem_alloc_get(sk) - llc->copied_seq,
sk->sk_state,
- from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
+ from_kuid_munged(seq_user_ns(seq), sk_uid(sk)),
llc->link);
out:
return 0;
@@ -195,7 +195,7 @@ static int llc_seq_core_show(struct seq_file *seq, void *v)
timer_pending(&llc->pf_cycle_timer.timer),
timer_pending(&llc->rej_sent_timer.timer),
timer_pending(&llc->busy_state_timer.timer),
- !!sk->sk_backlog.tail, !!sk->sk_lock.owned);
+ !!sk->sk_backlog.tail, sock_owned_by_user_nocheck(sk));
out:
return 0;
}
diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c
index a94bd56bcac6..7a0cae9a8111 100644
--- a/net/llc/llc_s_ac.c
+++ b/net/llc/llc_s_ac.c
@@ -24,10 +24,10 @@
#include <net/llc_s_ac.h>
#include <net/llc_s_ev.h>
#include <net/llc_sap.h>
-
+#include <net/sock.h>
/**
- * llc_sap_action_unit_data_ind - forward UI PDU to network layer
+ * llc_sap_action_unitdata_ind - forward UI PDU to network layer
* @sap: SAP
* @skb: the event to forward
*
@@ -40,6 +40,26 @@ int llc_sap_action_unitdata_ind(struct llc_sap *sap, struct sk_buff *skb)
return 0;
}
+static int llc_prepare_and_xmit(struct sk_buff *skb)
+{
+ struct llc_sap_state_ev *ev = llc_sap_ev(skb);
+ struct sk_buff *nskb;
+ int rc;
+
+ rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
+ if (rc)
+ return rc;
+
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return -ENOMEM;
+
+ if (skb->sk)
+ skb_set_owner_w(nskb, skb->sk);
+
+ return dev_queue_xmit(nskb);
+}
+
/**
* llc_sap_action_send_ui - sends UI PDU resp to UNITDATA REQ to MAC layer
* @sap: SAP
@@ -52,15 +72,12 @@ int llc_sap_action_unitdata_ind(struct llc_sap *sap, struct sk_buff *skb)
int llc_sap_action_send_ui(struct llc_sap *sap, struct sk_buff *skb)
{
struct llc_sap_state_ev *ev = llc_sap_ev(skb);
- int rc;
llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap,
ev->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_ui_cmd(skb);
- rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
- if (likely(!rc))
- rc = dev_queue_xmit(skb);
- return rc;
+
+ return llc_prepare_and_xmit(skb);
}
/**
@@ -75,15 +92,12 @@ int llc_sap_action_send_ui(struct llc_sap *sap, struct sk_buff *skb)
int llc_sap_action_send_xid_c(struct llc_sap *sap, struct sk_buff *skb)
{
struct llc_sap_state_ev *ev = llc_sap_ev(skb);
- int rc;
- llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap,
+ llc_pdu_header_init(skb, LLC_PDU_TYPE_U_XID, ev->saddr.lsap,
ev->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0);
- rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
- if (likely(!rc))
- rc = dev_queue_xmit(skb);
- return rc;
+
+ return llc_prepare_and_xmit(skb);
}
/**
@@ -129,15 +143,12 @@ out:
int llc_sap_action_send_test_c(struct llc_sap *sap, struct sk_buff *skb)
{
struct llc_sap_state_ev *ev = llc_sap_ev(skb);
- int rc;
llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap,
ev->daddr.lsap, LLC_PDU_CMD);
llc_pdu_init_as_test_cmd(skb);
- rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac);
- if (likely(!rc))
- rc = dev_queue_xmit(skb);
- return rc;
+
+ return llc_prepare_and_xmit(skb);
}
int llc_sap_action_send_test_r(struct llc_sap *sap, struct sk_buff *skb)
@@ -147,6 +158,9 @@ int llc_sap_action_send_test_r(struct llc_sap *sap, struct sk_buff *skb)
int rc = 1;
u32 data_size;
+ if (skb->mac_len < ETH_HLEN)
+ return 1;
+
llc_pdu_decode_sa(skb, mac_da);
llc_pdu_decode_da(skb, mac_sa);
llc_pdu_decode_ssap(skb, &dsap);
diff --git a/net/llc/llc_s_st.c b/net/llc/llc_s_st.c
index 308c616883a4..acccc827c562 100644
--- a/net/llc/llc_s_st.c
+++ b/net/llc/llc_s_st.c
@@ -24,7 +24,7 @@
* last entry for this state
* all members are zeros, .bss zeroes it
*/
-static struct llc_sap_state_trans llc_sap_state_trans_end;
+static const struct llc_sap_state_trans llc_sap_state_trans_end;
/* state LLC_SAP_STATE_INACTIVE transition for
* LLC_SAP_EV_ACTIVATION_REQ event
@@ -34,14 +34,14 @@ static const llc_sap_action_t llc_sap_inactive_state_actions_1[] = {
[1] = NULL,
};
-static struct llc_sap_state_trans llc_sap_inactive_state_trans_1 = {
+static const struct llc_sap_state_trans llc_sap_inactive_state_trans_1 = {
.ev = llc_sap_ev_activation_req,
.next_state = LLC_SAP_STATE_ACTIVE,
.ev_actions = llc_sap_inactive_state_actions_1,
};
/* array of pointers; one to each transition */
-static struct llc_sap_state_trans *llc_sap_inactive_state_transitions[] = {
+static const struct llc_sap_state_trans *llc_sap_inactive_state_transitions[] = {
[0] = &llc_sap_inactive_state_trans_1,
[1] = &llc_sap_state_trans_end,
};
@@ -52,7 +52,7 @@ static const llc_sap_action_t llc_sap_active_state_actions_1[] = {
[1] = NULL,
};
-static struct llc_sap_state_trans llc_sap_active_state_trans_1 = {
+static const struct llc_sap_state_trans llc_sap_active_state_trans_1 = {
.ev = llc_sap_ev_rx_ui,
.next_state = LLC_SAP_STATE_ACTIVE,
.ev_actions = llc_sap_active_state_actions_1,
@@ -64,7 +64,7 @@ static const llc_sap_action_t llc_sap_active_state_actions_2[] = {
[1] = NULL,
};
-static struct llc_sap_state_trans llc_sap_active_state_trans_2 = {
+static const struct llc_sap_state_trans llc_sap_active_state_trans_2 = {
.ev = llc_sap_ev_unitdata_req,
.next_state = LLC_SAP_STATE_ACTIVE,
.ev_actions = llc_sap_active_state_actions_2,
@@ -76,7 +76,7 @@ static const llc_sap_action_t llc_sap_active_state_actions_3[] = {
[1] = NULL,
};
-static struct llc_sap_state_trans llc_sap_active_state_trans_3 = {
+static const struct llc_sap_state_trans llc_sap_active_state_trans_3 = {
.ev = llc_sap_ev_xid_req,
.next_state = LLC_SAP_STATE_ACTIVE,
.ev_actions = llc_sap_active_state_actions_3,
@@ -88,7 +88,7 @@ static const llc_sap_action_t llc_sap_active_state_actions_4[] = {
[1] = NULL,
};
-static struct llc_sap_state_trans llc_sap_active_state_trans_4 = {
+static const struct llc_sap_state_trans llc_sap_active_state_trans_4 = {
.ev = llc_sap_ev_rx_xid_c,
.next_state = LLC_SAP_STATE_ACTIVE,
.ev_actions = llc_sap_active_state_actions_4,
@@ -100,7 +100,7 @@ static const llc_sap_action_t llc_sap_active_state_actions_5[] = {
[1] = NULL,
};
-static struct llc_sap_state_trans llc_sap_active_state_trans_5 = {
+static const struct llc_sap_state_trans llc_sap_active_state_trans_5 = {
.ev = llc_sap_ev_rx_xid_r,
.next_state = LLC_SAP_STATE_ACTIVE,
.ev_actions = llc_sap_active_state_actions_5,
@@ -112,7 +112,7 @@ static const llc_sap_action_t llc_sap_active_state_actions_6[] = {
[1] = NULL,
};
-static struct llc_sap_state_trans llc_sap_active_state_trans_6 = {
+static const struct llc_sap_state_trans llc_sap_active_state_trans_6 = {
.ev = llc_sap_ev_test_req,
.next_state = LLC_SAP_STATE_ACTIVE,
.ev_actions = llc_sap_active_state_actions_6,
@@ -124,7 +124,7 @@ static const llc_sap_action_t llc_sap_active_state_actions_7[] = {
[1] = NULL,
};
-static struct llc_sap_state_trans llc_sap_active_state_trans_7 = {
+static const struct llc_sap_state_trans llc_sap_active_state_trans_7 = {
.ev = llc_sap_ev_rx_test_c,
.next_state = LLC_SAP_STATE_ACTIVE,
.ev_actions = llc_sap_active_state_actions_7
@@ -136,7 +136,7 @@ static const llc_sap_action_t llc_sap_active_state_actions_8[] = {
[1] = NULL,
};
-static struct llc_sap_state_trans llc_sap_active_state_trans_8 = {
+static const struct llc_sap_state_trans llc_sap_active_state_trans_8 = {
.ev = llc_sap_ev_rx_test_r,
.next_state = LLC_SAP_STATE_ACTIVE,
.ev_actions = llc_sap_active_state_actions_8,
@@ -150,14 +150,14 @@ static const llc_sap_action_t llc_sap_active_state_actions_9[] = {
[1] = NULL,
};
-static struct llc_sap_state_trans llc_sap_active_state_trans_9 = {
+static const struct llc_sap_state_trans llc_sap_active_state_trans_9 = {
.ev = llc_sap_ev_deactivation_req,
.next_state = LLC_SAP_STATE_INACTIVE,
.ev_actions = llc_sap_active_state_actions_9
};
/* array of pointers; one to each transition */
-static struct llc_sap_state_trans *llc_sap_active_state_transitions[] = {
+static const struct llc_sap_state_trans *llc_sap_active_state_transitions[] = {
[0] = &llc_sap_active_state_trans_2,
[1] = &llc_sap_active_state_trans_1,
[2] = &llc_sap_active_state_trans_3,
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c
index a7f7b8ff4729..6cd03c2ae7d5 100644
--- a/net/llc/llc_sap.c
+++ b/net/llc/llc_sap.c
@@ -37,6 +37,7 @@ static int llc_mac_header_len(unsigned short devtype)
/**
* llc_alloc_frame - allocates sk_buff for frame
+ * @sk: socket to allocate frame to
* @dev: network device this skb will be sent over
* @type: pdu type to allocate
* @data_size: data size to allocate
@@ -113,12 +114,12 @@ void llc_sap_rtn_pdu(struct llc_sap *sap, struct sk_buff *skb)
* Returns the pointer to found transition on success or %NULL for
* failure.
*/
-static struct llc_sap_state_trans *llc_find_sap_trans(struct llc_sap *sap,
- struct sk_buff *skb)
+static const struct llc_sap_state_trans *llc_find_sap_trans(struct llc_sap *sap,
+ struct sk_buff *skb)
{
int i = 0;
- struct llc_sap_state_trans *rc = NULL;
- struct llc_sap_state_trans **next_trans;
+ const struct llc_sap_state_trans *rc = NULL;
+ const struct llc_sap_state_trans **next_trans;
struct llc_sap_state *curr_state = &llc_sap_state_table[sap->state - 1];
/*
* Search thru events for this state until list exhausted or until
@@ -142,7 +143,7 @@ static struct llc_sap_state_trans *llc_find_sap_trans(struct llc_sap *sap,
* Returns 0 for success and 1 for failure of at least one action.
*/
static int llc_exec_sap_trans_actions(struct llc_sap *sap,
- struct llc_sap_state_trans *trans,
+ const struct llc_sap_state_trans *trans,
struct sk_buff *skb)
{
int rc = 0;
@@ -165,8 +166,8 @@ static int llc_exec_sap_trans_actions(struct llc_sap *sap,
*/
static int llc_sap_next_state(struct llc_sap *sap, struct sk_buff *skb)
{
+ const struct llc_sap_state_trans *trans;
int rc = 1;
- struct llc_sap_state_trans *trans;
if (sap->state > LLC_NR_SAP_STATES)
goto out;
@@ -197,29 +198,22 @@ out:
* After executing actions of the event, upper layer will be indicated
* if needed(on receiving an UI frame). sk can be null for the
* datalink_proto case.
+ *
+ * This function always consumes a reference to the skb.
*/
static void llc_sap_state_process(struct llc_sap *sap, struct sk_buff *skb)
{
struct llc_sap_state_ev *ev = llc_sap_ev(skb);
- /*
- * We have to hold the skb, because llc_sap_next_state
- * will kfree it in the sending path and we need to
- * look at the skb->cb, where we encode llc_sap_state_ev.
- */
- skb_get(skb);
ev->ind_cfm_flag = 0;
llc_sap_next_state(sap, skb);
- if (ev->ind_cfm_flag == LLC_IND) {
- if (skb->sk->sk_state == TCP_LISTEN)
- kfree_skb(skb);
- else {
- llc_save_primitive(skb->sk, skb, ev->prim);
- /* queue skb to the user. */
- if (sock_queue_rcv_skb(skb->sk, skb))
- kfree_skb(skb);
- }
+ if (ev->ind_cfm_flag == LLC_IND && skb->sk->sk_state != TCP_LISTEN) {
+ llc_save_primitive(skb->sk, skb, ev->prim);
+
+ /* queue skb to the user. */
+ if (sock_queue_rcv_skb(skb->sk, skb) == 0)
+ return;
}
kfree_skb(skb);
}
@@ -280,6 +274,7 @@ void llc_build_and_send_xid_pkt(struct llc_sap *sap, struct sk_buff *skb,
* llc_sap_rcv - sends received pdus to the sap state machine
* @sap: current sap component structure.
* @skb: received frame.
+ * @sk: socket to associate to frame
*
* Sends received pdus to the sap state machine.
*/
@@ -299,25 +294,29 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb,
static inline bool llc_dgram_match(const struct llc_sap *sap,
const struct llc_addr *laddr,
- const struct sock *sk)
+ const struct sock *sk,
+ const struct net *net)
{
struct llc_sock *llc = llc_sk(sk);
return sk->sk_type == SOCK_DGRAM &&
- llc->laddr.lsap == laddr->lsap &&
- ether_addr_equal(llc->laddr.mac, laddr->mac);
+ net_eq(sock_net(sk), net) &&
+ llc->laddr.lsap == laddr->lsap &&
+ ether_addr_equal(llc->laddr.mac, laddr->mac);
}
/**
* llc_lookup_dgram - Finds dgram socket for the local sap/mac
* @sap: SAP
* @laddr: address of local LLC (MAC + SAP)
+ * @net: netns to look up a socket in
*
* Search socket list of the SAP and finds connection using the local
* mac, and local sap. Returns pointer for socket found, %NULL otherwise.
*/
static struct sock *llc_lookup_dgram(struct llc_sap *sap,
- const struct llc_addr *laddr)
+ const struct llc_addr *laddr,
+ const struct net *net)
{
struct sock *rc;
struct hlist_nulls_node *node;
@@ -327,12 +326,12 @@ static struct sock *llc_lookup_dgram(struct llc_sap *sap,
rcu_read_lock_bh();
again:
sk_nulls_for_each_rcu(rc, node, laddr_hb) {
- if (llc_dgram_match(sap, laddr, rc)) {
+ if (llc_dgram_match(sap, laddr, rc, net)) {
/* Extra checks required by SLAB_TYPESAFE_BY_RCU */
if (unlikely(!refcount_inc_not_zero(&rc->sk_refcnt)))
goto again;
if (unlikely(llc_sk(rc)->sap != sap ||
- !llc_dgram_match(sap, laddr, rc))) {
+ !llc_dgram_match(sap, laddr, rc, net))) {
sock_put(rc);
continue;
}
@@ -386,6 +385,7 @@ static void llc_do_mcast(struct llc_sap *sap, struct sk_buff *skb,
* llc_sap_mcast - Deliver multicast PDU's to all matching datagram sockets.
* @sap: SAP
* @laddr: address of local LLC (MAC + SAP)
+ * @skb: PDU to deliver
*
* Search socket list of the SAP and finds connections with same sap.
* Deliver clone to each.
@@ -433,7 +433,7 @@ void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb)
llc_sap_mcast(sap, &laddr, skb);
kfree_skb(skb);
} else {
- struct sock *sk = llc_lookup_dgram(sap, &laddr);
+ struct sock *sk = llc_lookup_dgram(sap, &laddr, dev_net(skb->dev));
if (sk) {
llc_sap_rcv(sap, skb, sk);
sock_put(sk);
diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c
index 204a8351efff..f50654292510 100644
--- a/net/llc/llc_station.c
+++ b/net/llc/llc_station.c
@@ -32,7 +32,7 @@ static int llc_stat_ev_rx_null_dsap_xid_c(struct sk_buff *skb)
return LLC_PDU_IS_CMD(pdu) && /* command PDU */
LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */
LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_XID &&
- !pdu->dsap ? 0 : 1; /* NULL DSAP value */
+ !pdu->dsap; /* NULL DSAP value */
}
static int llc_stat_ev_rx_null_dsap_test_c(struct sk_buff *skb)
@@ -42,7 +42,7 @@ static int llc_stat_ev_rx_null_dsap_test_c(struct sk_buff *skb)
return LLC_PDU_IS_CMD(pdu) && /* command PDU */
LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */
LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_TEST &&
- !pdu->dsap ? 0 : 1; /* NULL DSAP */
+ !pdu->dsap; /* NULL DSAP */
}
static int llc_station_ac_send_xid_r(struct sk_buff *skb)
@@ -54,7 +54,6 @@ static int llc_station_ac_send_xid_r(struct sk_buff *skb)
if (!nskb)
goto out;
- rc = 0;
llc_pdu_decode_sa(skb, mac_da);
llc_pdu_decode_ssap(skb, &dsap);
llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP);
@@ -77,13 +76,15 @@ static int llc_station_ac_send_test_r(struct sk_buff *skb)
u32 data_size;
struct sk_buff *nskb;
+ if (skb->mac_len < ETH_HLEN)
+ goto out;
+
/* The test request command is type U (llc_len = 3) */
data_size = ntohs(eth_hdr(skb)->h_proto) - 3;
nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, data_size);
if (!nskb)
goto out;
- rc = 0;
llc_pdu_decode_sa(skb, mac_da);
llc_pdu_decode_ssap(skb, &dsap);
llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP);
diff --git a/net/llc/sysctl_net_llc.c b/net/llc/sysctl_net_llc.c
index 8443a6d841b0..c8d88e2508fc 100644
--- a/net/llc/sysctl_net_llc.c
+++ b/net/llc/sysctl_net_llc.c
@@ -11,10 +11,6 @@
#include <net/net_namespace.h>
#include <net/llc.h>
-#ifndef CONFIG_SYSCTL
-#error This file should not be compiled without CONFIG_SYSCTL defined
-#endif
-
static struct ctl_table llc2_timeout_table[] = {
{
.procname = "ack",
@@ -44,11 +40,6 @@ static struct ctl_table llc2_timeout_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- { },
-};
-
-static struct ctl_table llc_station_table[] = {
- { },
};
static struct ctl_table_header *llc2_timeout_header;
@@ -56,8 +47,9 @@ static struct ctl_table_header *llc_station_header;
int __init llc_sysctl_init(void)
{
+ struct ctl_table empty[1] = {};
llc2_timeout_header = register_net_sysctl(&init_net, "net/llc/llc2/timeout", llc2_timeout_table);
- llc_station_header = register_net_sysctl(&init_net, "net/llc/station", llc_station_table);
+ llc_station_header = register_net_sysctl_sz(&init_net, "net/llc/station", empty, 0);
if (!llc2_timeout_header || !llc_station_header) {
llc_sysctl_exit();
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index 76e30f4797fb..cf0f7780fb10 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -1,14 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
config MAC80211
tristate "Generic IEEE 802.11 Networking Stack (mac80211)"
depends on CFG80211
select CRYPTO
- select CRYPTO_ARC4
+ select CRYPTO_LIB_ARC4
select CRYPTO_AES
select CRYPTO_CCM
select CRYPTO_GCM
select CRYPTO_CMAC
select CRC32
- ---help---
+ help
This option enables the hardware independent IEEE 802.11
networking stack.
@@ -24,28 +25,14 @@ config MAC80211_RC_MINSTREL
bool "Minstrel" if EXPERT
select MAC80211_HAS_RC
default y
- ---help---
+ help
This option enables the 'minstrel' TX rate control algorithm
-config MAC80211_RC_MINSTREL_HT
- bool "Minstrel 802.11n support" if EXPERT
- depends on MAC80211_RC_MINSTREL
- default y
- ---help---
- This option enables the 'minstrel_ht' TX rate control algorithm
-
-config MAC80211_RC_MINSTREL_VHT
- bool "Minstrel 802.11ac support" if EXPERT
- depends on MAC80211_RC_MINSTREL_HT
- default n
- ---help---
- This option enables VHT in the 'minstrel_ht' TX rate control algorithm
-
choice
prompt "Default rate control algorithm"
depends on MAC80211_HAS_RC
default MAC80211_RC_DEFAULT_MINSTREL
- ---help---
+ help
This option selects the default rate control algorithm
mac80211 will use. Note that this default can still be
overridden through the ieee80211_default_rc_algo module
@@ -54,7 +41,7 @@ choice
config MAC80211_RC_DEFAULT_MINSTREL
bool "Minstrel"
depends on MAC80211_RC_MINSTREL
- ---help---
+ help
Select Minstrel as the default rate control algorithm.
@@ -62,8 +49,7 @@ endchoice
config MAC80211_RC_DEFAULT
string
- default "minstrel_ht" if MAC80211_RC_DEFAULT_MINSTREL && MAC80211_RC_MINSTREL_HT
- default "minstrel" if MAC80211_RC_DEFAULT_MINSTREL
+ default "minstrel_ht" if MAC80211_RC_DEFAULT_MINSTREL
default ""
endif
@@ -71,29 +57,38 @@ endif
comment "Some wireless drivers require a rate control algorithm"
depends on MAC80211 && MAC80211_HAS_RC=n
+config MAC80211_KUNIT_TEST
+ tristate "KUnit tests for mac80211" if !KUNIT_ALL_TESTS
+ depends on KUNIT
+ depends on MAC80211
+ default KUNIT_ALL_TESTS
+ help
+ Enable this option to test mac80211 internals with kunit.
+
+ If unsure, say N.
+
config MAC80211_MESH
- bool "Enable mac80211 mesh networking (pre-802.11s) support"
+ bool "Enable mac80211 mesh networking support"
depends on MAC80211
- ---help---
- This options enables support of Draft 802.11s mesh networking.
- The implementation is based on Draft 2.08 of the Mesh Networking
- amendment. However, no compliance with that draft is claimed or even
- possible, as drafts leave a number of identifiers to be defined after
- ratification. For more information visit http://o11s.org/.
+ help
+ Select this option to enable 802.11 mesh operation in mac80211
+ drivers that support it. 802.11 mesh connects multiple stations
+ over (possibly multi-hop) wireless links to form a single logical
+ LAN.
config MAC80211_LEDS
bool "Enable LED triggers"
depends on MAC80211
- depends on LEDS_CLASS
+ depends on LEDS_CLASS=y || LEDS_CLASS=MAC80211
select LEDS_TRIGGERS
- ---help---
+ help
This option enables a few LED triggers for different
packet receive/transmit events.
config MAC80211_DEBUGFS
bool "Export mac80211 internals in DebugFS"
- depends on MAC80211 && DEBUG_FS
- ---help---
+ depends on MAC80211 && CFG80211_DEBUGFS
+ help
Select this to see extensive information about
the internal state of mac80211 in debugfs.
@@ -101,8 +96,8 @@ config MAC80211_DEBUGFS
config MAC80211_MESSAGE_TRACING
bool "Trace all mac80211 debug messages"
- depends on MAC80211
- ---help---
+ depends on MAC80211 && TRACING
+ help
Select this option to have mac80211 register the
mac80211_msg trace subsystem with tracepoints to
collect all debugging messages, independent of
@@ -115,13 +110,13 @@ config MAC80211_MESSAGE_TRACING
menuconfig MAC80211_DEBUG_MENU
bool "Select mac80211 debugging features"
depends on MAC80211
- ---help---
+ help
This option collects various mac80211 debug settings.
config MAC80211_NOINLINE
bool "Do not inline TX/RX handlers"
depends on MAC80211_DEBUG_MENU
- ---help---
+ help
This option affects code generation in mac80211, when
selected some functions are marked "noinline" to allow
easier debugging of problems in the transmit and receive
@@ -137,7 +132,7 @@ config MAC80211_NOINLINE
config MAC80211_VERBOSE_DEBUG
bool "Verbose debugging output"
depends on MAC80211_DEBUG_MENU
- ---help---
+ help
Selecting this option causes mac80211 to print out
many debugging messages. It should not be selected
on production systems as some of the messages are
@@ -148,7 +143,7 @@ config MAC80211_VERBOSE_DEBUG
config MAC80211_MLME_DEBUG
bool "Verbose managed MLME output"
depends on MAC80211_DEBUG_MENU
- ---help---
+ help
Selecting this option causes mac80211 to print out
debugging messages for the managed-mode MLME. It
should not be selected on production systems as some
@@ -159,7 +154,7 @@ config MAC80211_MLME_DEBUG
config MAC80211_STA_DEBUG
bool "Verbose station debugging"
depends on MAC80211_DEBUG_MENU
- ---help---
+ help
Selecting this option causes mac80211 to print out
debugging messages for station addition/removal.
@@ -168,7 +163,7 @@ config MAC80211_STA_DEBUG
config MAC80211_HT_DEBUG
bool "Verbose HT debugging"
depends on MAC80211_DEBUG_MENU
- ---help---
+ help
This option enables 802.11n High Throughput features
debug tracing output.
@@ -180,7 +175,7 @@ config MAC80211_HT_DEBUG
config MAC80211_OCB_DEBUG
bool "Verbose OCB debugging"
depends on MAC80211_DEBUG_MENU
- ---help---
+ help
Selecting this option causes mac80211 to print out
very verbose OCB debugging messages. It should not
be selected on production systems as those messages
@@ -191,7 +186,7 @@ config MAC80211_OCB_DEBUG
config MAC80211_IBSS_DEBUG
bool "Verbose IBSS debugging"
depends on MAC80211_DEBUG_MENU
- ---help---
+ help
Selecting this option causes mac80211 to print out
very verbose IBSS debugging messages. It should not
be selected on production systems as those messages
@@ -202,7 +197,7 @@ config MAC80211_IBSS_DEBUG
config MAC80211_PS_DEBUG
bool "Verbose powersave mode debugging"
depends on MAC80211_DEBUG_MENU
- ---help---
+ help
Selecting this option causes mac80211 to print out very
verbose power save mode debugging messages (when mac80211
is an AP and has power saving stations.)
@@ -215,7 +210,7 @@ config MAC80211_MPL_DEBUG
bool "Verbose mesh peer link debugging"
depends on MAC80211_DEBUG_MENU
depends on MAC80211_MESH
- ---help---
+ help
Selecting this option causes mac80211 to print out very
verbose mesh peer link debugging messages (when mac80211
is taking part in a mesh network).
@@ -228,7 +223,7 @@ config MAC80211_MPATH_DEBUG
bool "Verbose mesh path debugging"
depends on MAC80211_DEBUG_MENU
depends on MAC80211_MESH
- ---help---
+ help
Selecting this option causes mac80211 to print out very
verbose mesh path selection debugging messages (when mac80211
is taking part in a mesh network).
@@ -241,7 +236,7 @@ config MAC80211_MHWMP_DEBUG
bool "Verbose mesh HWMP routing debugging"
depends on MAC80211_DEBUG_MENU
depends on MAC80211_MESH
- ---help---
+ help
Selecting this option causes mac80211 to print out very
verbose mesh routing (HWMP) debugging messages (when mac80211
is taking part in a mesh network).
@@ -254,7 +249,7 @@ config MAC80211_MESH_SYNC_DEBUG
bool "Verbose mesh synchronization debugging"
depends on MAC80211_DEBUG_MENU
depends on MAC80211_MESH
- ---help---
+ help
Selecting this option causes mac80211 to print out very verbose mesh
synchronization debugging messages (when mac80211 is taking part in a
mesh network).
@@ -265,7 +260,7 @@ config MAC80211_MESH_CSA_DEBUG
bool "Verbose mesh channel switch debugging"
depends on MAC80211_DEBUG_MENU
depends on MAC80211_MESH
- ---help---
+ help
Selecting this option causes mac80211 to print out very verbose mesh
channel switch debugging messages (when mac80211 is taking part in a
mesh network).
@@ -276,7 +271,7 @@ config MAC80211_MESH_PS_DEBUG
bool "Verbose mesh powersave debugging"
depends on MAC80211_DEBUG_MENU
depends on MAC80211_MESH
- ---help---
+ help
Selecting this option causes mac80211 to print out very verbose mesh
powersave debugging messages (when mac80211 is taking part in a
mesh network).
@@ -286,7 +281,7 @@ config MAC80211_MESH_PS_DEBUG
config MAC80211_TDLS_DEBUG
bool "Verbose TDLS debugging"
depends on MAC80211_DEBUG_MENU
- ---help---
+ help
Selecting this option causes mac80211 to print out very
verbose TDLS selection debugging messages (when mac80211
is a TDLS STA).
@@ -299,7 +294,7 @@ config MAC80211_DEBUG_COUNTERS
bool "Extra statistics for TX/RX debugging"
depends on MAC80211_DEBUG_MENU
depends on MAC80211_DEBUGFS
- ---help---
+ help
Selecting this option causes mac80211 to keep additional
and very verbose statistics about TX and RX handler use
as well as a few selected dot11 counters. These will be
@@ -313,7 +308,7 @@ config MAC80211_DEBUG_COUNTERS
config MAC80211_STA_HASH_MAX_SIZE
int "Station hash table maximum size" if MAC80211_DEBUG_MENU
default 0
- ---help---
+ help
Setting this option to a low value (e.g. 4) allows testing the
hash table with collisions relatively deterministically (just
connect more stations than the number selected here.)
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index bb707789ef2b..a33884967f21 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -13,8 +13,10 @@ mac80211-y := \
ht.o agg-tx.o agg-rx.o \
vht.o \
he.o \
+ s1g.o \
ibss.o \
iface.o \
+ link.o \
rate.o \
michael.o \
tkip.o \
@@ -27,12 +29,14 @@ mac80211-y := \
spectmgmt.o \
tx.o \
key.o \
- util.o \
+ util.o parse.o \
wme.o \
chan.o \
trace.o mlme.o \
tdls.o \
- ocb.o
+ ocb.o \
+ airtime.o \
+ eht.o
mac80211-$(CONFIG_MAC80211_LEDS) += led.o
mac80211-$(CONFIG_MAC80211_DEBUGFS) += \
@@ -53,13 +57,16 @@ mac80211-$(CONFIG_PM) += pm.o
CFLAGS_trace.o := -I$(src)
-rc80211_minstrel-y := rc80211_minstrel.o
-rc80211_minstrel-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_debugfs.o
+rc80211_minstrel-y := \
+ rc80211_minstrel_ht.o
-rc80211_minstrel_ht-y := rc80211_minstrel_ht.o
-rc80211_minstrel_ht-$(CONFIG_MAC80211_DEBUGFS) += rc80211_minstrel_ht_debugfs.o
+rc80211_minstrel-$(CONFIG_MAC80211_DEBUGFS) += \
+ rc80211_minstrel_ht_debugfs.o
mac80211-$(CONFIG_MAC80211_RC_MINSTREL) += $(rc80211_minstrel-y)
-mac80211-$(CONFIG_MAC80211_RC_MINSTREL_HT) += $(rc80211_minstrel_ht-y)
+
+obj-y += tests/
+
+mac80211-y += wbrf.o
ccflags-y += -DDEBUG
diff --git a/net/mac80211/aead_api.c b/net/mac80211/aead_api.c
index 160f9df30402..b00d6f5b33f4 100644
--- a/net/mac80211/aead_api.c
+++ b/net/mac80211/aead_api.c
@@ -1,13 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2003-2004, Instant802 Networks, Inc.
* Copyright 2005-2006, Devicescape Software, Inc.
* Copyright 2014-2015, Qualcomm Atheros, Inc.
*
* Rewrite: Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -26,6 +23,7 @@ int aead_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len,
struct aead_request *aead_req;
int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm);
u8 *__aad;
+ int ret;
aead_req = kzalloc(reqsize + aad_len, GFP_ATOMIC);
if (!aead_req)
@@ -43,10 +41,10 @@ int aead_encrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len,
aead_request_set_crypt(aead_req, sg, sg, data_len, b_0);
aead_request_set_ad(aead_req, sg[0].length);
- crypto_aead_encrypt(aead_req);
- kzfree(aead_req);
+ ret = crypto_aead_encrypt(aead_req);
+ kfree_sensitive(aead_req);
- return 0;
+ return ret;
}
int aead_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len,
@@ -79,7 +77,7 @@ int aead_decrypt(struct crypto_aead *tfm, u8 *b_0, u8 *aad, size_t aad_len,
aead_request_set_ad(aead_req, sg[0].length);
err = crypto_aead_decrypt(aead_req);
- kzfree(aead_req);
+ kfree_sensitive(aead_req);
return err;
}
diff --git a/net/mac80211/aead_api.h b/net/mac80211/aead_api.h
index 5e39ea843bbf..7d463b80926a 100644
--- a/net/mac80211/aead_api.h
+++ b/net/mac80211/aead_api.h
@@ -1,8 +1,4 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _AEAD_API_H
#define _AEAD_API_H
diff --git a/net/mac80211/aes_ccm.h b/net/mac80211/aes_ccm.h
index e9b7ca0bde5b..96256193cf49 100644
--- a/net/mac80211/aes_ccm.h
+++ b/net/mac80211/aes_ccm.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2003-2004, Instant802 Networks, Inc.
* Copyright 2006, Devicescape Software, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#ifndef AES_CCM_H
diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c
index 2fb65588490c..0827965455dc 100644
--- a/net/mac80211/aes_cmac.c
+++ b/net/mac80211/aes_cmac.c
@@ -1,10 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* AES-128-CMAC with TLen 16 for IEEE 802.11w BIP
* Copyright 2008, Jouni Malinen <j@w1.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * Copyright (C) 2020 Intel Corporation
*/
#include <linux/kernel.h>
@@ -18,39 +16,48 @@
#include "key.h"
#include "aes_cmac.h"
-#define CMAC_TLEN 8 /* CMAC TLen = 64 bits (8 octets) */
-#define CMAC_TLEN_256 16 /* CMAC TLen = 128 bits (16 octets) */
#define AAD_LEN 20
-static const u8 zero[CMAC_TLEN_256];
+static const u8 zero[IEEE80211_CMAC_256_MIC_LEN];
-void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad,
- const u8 *data, size_t data_len, u8 *mic)
+int ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad,
+ const u8 *data, size_t data_len, u8 *mic,
+ unsigned int mic_len)
{
+ int err;
SHASH_DESC_ON_STACK(desc, tfm);
u8 out[AES_BLOCK_SIZE];
+ const __le16 *fc;
desc->tfm = tfm;
- crypto_shash_init(desc);
- crypto_shash_update(desc, aad, AAD_LEN);
- crypto_shash_update(desc, data, data_len - CMAC_TLEN);
- crypto_shash_finup(desc, zero, CMAC_TLEN, out);
+ err = crypto_shash_init(desc);
+ if (err)
+ return err;
+ err = crypto_shash_update(desc, aad, AAD_LEN);
+ if (err)
+ return err;
+ fc = (const __le16 *)aad;
+ if (ieee80211_is_beacon(*fc)) {
+ /* mask Timestamp field to zero */
+ err = crypto_shash_update(desc, zero, 8);
+ if (err)
+ return err;
+ err = crypto_shash_update(desc, data + 8,
+ data_len - 8 - mic_len);
+ if (err)
+ return err;
+ } else {
+ err = crypto_shash_update(desc, data, data_len - mic_len);
+ if (err)
+ return err;
+ }
+ err = crypto_shash_finup(desc, zero, mic_len, out);
+ if (err)
+ return err;
+ memcpy(mic, out, mic_len);
- memcpy(mic, out, CMAC_TLEN);
-}
-
-void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad,
- const u8 *data, size_t data_len, u8 *mic)
-{
- SHASH_DESC_ON_STACK(desc, tfm);
-
- desc->tfm = tfm;
-
- crypto_shash_init(desc);
- crypto_shash_update(desc, aad, AAD_LEN);
- crypto_shash_update(desc, data, data_len - CMAC_TLEN_256);
- crypto_shash_finup(desc, zero, CMAC_TLEN_256, mic);
+ return 0;
}
struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[],
@@ -59,8 +66,14 @@ struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[],
struct crypto_shash *tfm;
tfm = crypto_alloc_shash("cmac(aes)", 0, 0);
- if (!IS_ERR(tfm))
- crypto_shash_setkey(tfm, key, key_len);
+ if (!IS_ERR(tfm)) {
+ int err = crypto_shash_setkey(tfm, key, key_len);
+
+ if (err) {
+ crypto_free_shash(tfm);
+ return ERR_PTR(err);
+ }
+ }
return tfm;
}
diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h
index fef531f42003..5f971a8298cb 100644
--- a/net/mac80211/aes_cmac.h
+++ b/net/mac80211/aes_cmac.h
@@ -1,9 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2008, Jouni Malinen <j@w1.fi>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#ifndef AES_CMAC_H
@@ -14,10 +11,9 @@
struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[],
size_t key_len);
-void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad,
- const u8 *data, size_t data_len, u8 *mic);
-void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad,
- const u8 *data, size_t data_len, u8 *mic);
+int ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad,
+ const u8 *data, size_t data_len, u8 *mic,
+ unsigned int mic_len);
void ieee80211_aes_cmac_key_free(struct crypto_shash *tfm);
#endif /* AES_CMAC_H */
diff --git a/net/mac80211/aes_gcm.h b/net/mac80211/aes_gcm.h
index d2b096033009..b14093b2f7a9 100644
--- a/net/mac80211/aes_gcm.h
+++ b/net/mac80211/aes_gcm.h
@@ -1,9 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2014-2015, Qualcomm Atheros, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#ifndef AES_GCM_H
diff --git a/net/mac80211/aes_gmac.c b/net/mac80211/aes_gmac.c
index bd72a862ddb7..811a83d8d525 100644
--- a/net/mac80211/aes_gmac.c
+++ b/net/mac80211/aes_gmac.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* AES-GMAC for IEEE 802.11 BIP-GMAC-128 and BIP-GMAC-256
* Copyright 2015, Qualcomm Atheros, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -20,27 +17,42 @@
int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce,
const u8 *data, size_t data_len, u8 *mic)
{
- struct scatterlist sg[4];
+ struct scatterlist sg[5];
u8 *zero, *__aad, iv[AES_BLOCK_SIZE];
struct aead_request *aead_req;
int reqsize = sizeof(*aead_req) + crypto_aead_reqsize(tfm);
+ const __le16 *fc;
+ int ret;
- if (data_len < GMAC_MIC_LEN)
+ if (data_len < IEEE80211_GMAC_MIC_LEN)
return -EINVAL;
- aead_req = kzalloc(reqsize + GMAC_MIC_LEN + GMAC_AAD_LEN, GFP_ATOMIC);
+ aead_req = kzalloc(reqsize + IEEE80211_GMAC_MIC_LEN + GMAC_AAD_LEN,
+ GFP_ATOMIC);
if (!aead_req)
return -ENOMEM;
zero = (u8 *)aead_req + reqsize;
- __aad = zero + GMAC_MIC_LEN;
+ __aad = zero + IEEE80211_GMAC_MIC_LEN;
memcpy(__aad, aad, GMAC_AAD_LEN);
- sg_init_table(sg, 4);
- sg_set_buf(&sg[0], __aad, GMAC_AAD_LEN);
- sg_set_buf(&sg[1], data, data_len - GMAC_MIC_LEN);
- sg_set_buf(&sg[2], zero, GMAC_MIC_LEN);
- sg_set_buf(&sg[3], mic, GMAC_MIC_LEN);
+ fc = (const __le16 *)aad;
+ if (ieee80211_is_beacon(*fc)) {
+ /* mask Timestamp field to zero */
+ sg_init_table(sg, 5);
+ sg_set_buf(&sg[0], __aad, GMAC_AAD_LEN);
+ sg_set_buf(&sg[1], zero, 8);
+ sg_set_buf(&sg[2], data + 8,
+ data_len - 8 - IEEE80211_GMAC_MIC_LEN);
+ sg_set_buf(&sg[3], zero, IEEE80211_GMAC_MIC_LEN);
+ sg_set_buf(&sg[4], mic, IEEE80211_GMAC_MIC_LEN);
+ } else {
+ sg_init_table(sg, 4);
+ sg_set_buf(&sg[0], __aad, GMAC_AAD_LEN);
+ sg_set_buf(&sg[1], data, data_len - IEEE80211_GMAC_MIC_LEN);
+ sg_set_buf(&sg[2], zero, IEEE80211_GMAC_MIC_LEN);
+ sg_set_buf(&sg[3], mic, IEEE80211_GMAC_MIC_LEN);
+ }
memcpy(iv, nonce, GMAC_NONCE_LEN);
memset(iv + GMAC_NONCE_LEN, 0, sizeof(iv) - GMAC_NONCE_LEN);
@@ -50,10 +62,10 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce,
aead_request_set_crypt(aead_req, sg, sg, 0, iv);
aead_request_set_ad(aead_req, GMAC_AAD_LEN + data_len);
- crypto_aead_encrypt(aead_req);
- kzfree(aead_req);
+ ret = crypto_aead_encrypt(aead_req);
+ kfree_sensitive(aead_req);
- return 0;
+ return ret;
}
struct crypto_aead *ieee80211_aes_gmac_key_setup(const u8 key[],
@@ -68,7 +80,7 @@ struct crypto_aead *ieee80211_aes_gmac_key_setup(const u8 key[],
err = crypto_aead_setkey(tfm, key, key_len);
if (!err)
- err = crypto_aead_setauthsize(tfm, GMAC_MIC_LEN);
+ err = crypto_aead_setauthsize(tfm, IEEE80211_GMAC_MIC_LEN);
if (!err)
return tfm;
diff --git a/net/mac80211/aes_gmac.h b/net/mac80211/aes_gmac.h
index 32e6442c95be..206136b60bca 100644
--- a/net/mac80211/aes_gmac.h
+++ b/net/mac80211/aes_gmac.h
@@ -1,9 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2015, Qualcomm Atheros, Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
*/
#ifndef AES_GMAC_H
@@ -12,7 +9,6 @@
#include <linux/crypto.h>
#define GMAC_AAD_LEN 20
-#define GMAC_MIC_LEN 16
#define GMAC_NONCE_LEN 12
struct crypto_aead *ieee80211_aes_gmac_key_setup(const u8 key[],
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 6a4f154c99f6..7da909d78c68 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* HT handling
*
@@ -8,11 +9,7 @@
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
* Copyright 2007-2010, Intel Corporation
* Copyright(c) 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * Copyright (C) 2018-2025 Intel Corporation
*/
/**
@@ -58,8 +55,8 @@ static void ieee80211_free_tid_rx(struct rcu_head *h)
kfree(tid_rx);
}
-void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
- u16 initiator, u16 reason, bool tx)
+void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
+ u16 initiator, u16 reason, bool tx)
{
struct ieee80211_local *local = sta->local;
struct tid_ampdu_rx *tid_rx;
@@ -72,10 +69,10 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
.ssn = 0,
};
- lockdep_assert_held(&sta->ampdu_mlme.mtx);
+ lockdep_assert_wiphy(sta->local->hw.wiphy);
tid_rx = rcu_dereference_protected(sta->ampdu_mlme.tid_rx[tid],
- lockdep_is_held(&sta->ampdu_mlme.mtx));
+ lockdep_is_held(&sta->local->hw.wiphy->mtx));
if (!test_bit(tid, sta->ampdu_mlme.agg_session_valid))
return;
@@ -106,25 +103,17 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
if (!tid_rx)
return;
- del_timer_sync(&tid_rx->session_timer);
+ timer_delete_sync(&tid_rx->session_timer);
/* make sure ieee80211_sta_reorder_release() doesn't re-arm the timer */
spin_lock_bh(&tid_rx->reorder_lock);
tid_rx->removed = true;
spin_unlock_bh(&tid_rx->reorder_lock);
- del_timer_sync(&tid_rx->reorder_timer);
+ timer_delete_sync(&tid_rx->reorder_timer);
call_rcu(&tid_rx->rcu_head, ieee80211_free_tid_rx);
}
-void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
- u16 initiator, u16 reason, bool tx)
-{
- mutex_lock(&sta->ampdu_mlme.mtx);
- ___ieee80211_stop_rx_ba_session(sta, tid, initiator, reason, tx);
- mutex_unlock(&sta->ampdu_mlme.mtx);
-}
-
void ieee80211_stop_rx_ba_session(struct ieee80211_vif *vif, u16 ba_rx_bitmap,
const u8 *addr)
{
@@ -143,7 +132,7 @@ void ieee80211_stop_rx_ba_session(struct ieee80211_vif *vif, u16 ba_rx_bitmap,
if (ba_rx_bitmap & BIT(i))
set_bit(i, sta->ampdu_mlme.tid_rx_stop_requested);
- ieee80211_queue_work(&sta->local->hw, &sta->ampdu_mlme.work);
+ wiphy_work_queue(sta->local->hw.wiphy, &sta->ampdu_mlme.work);
rcu_read_unlock();
}
EXPORT_SYMBOL(ieee80211_stop_rx_ba_session);
@@ -154,7 +143,8 @@ EXPORT_SYMBOL(ieee80211_stop_rx_ba_session);
*/
static void sta_rx_agg_session_timer_expired(struct timer_list *t)
{
- struct tid_ampdu_rx *tid_rx = from_timer(tid_rx, t, session_timer);
+ struct tid_ampdu_rx *tid_rx = timer_container_of(tid_rx, t,
+ session_timer);
struct sta_info *sta = tid_rx->sta;
u8 tid = tid_rx->tid;
unsigned long timeout;
@@ -169,69 +159,123 @@ static void sta_rx_agg_session_timer_expired(struct timer_list *t)
sta->sta.addr, tid);
set_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired);
- ieee80211_queue_work(&sta->local->hw, &sta->ampdu_mlme.work);
+ wiphy_work_queue(sta->local->hw.wiphy, &sta->ampdu_mlme.work);
}
static void sta_rx_agg_reorder_timer_expired(struct timer_list *t)
{
- struct tid_ampdu_rx *tid_rx = from_timer(tid_rx, t, reorder_timer);
+ struct tid_ampdu_rx *tid_rx = timer_container_of(tid_rx, t,
+ reorder_timer);
rcu_read_lock();
ieee80211_release_reorder_timeout(tid_rx->sta, tid_rx->tid);
rcu_read_unlock();
}
-static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid,
+void ieee80211_add_addbaext(struct sk_buff *skb,
+ const u8 req_addba_ext_data,
+ u16 buf_size)
+{
+ struct ieee80211_addba_ext_ie *addba_ext;
+ u8 *pos;
+
+ pos = skb_put_zero(skb, 2 + sizeof(struct ieee80211_addba_ext_ie));
+ *pos++ = WLAN_EID_ADDBA_EXT;
+ *pos++ = sizeof(struct ieee80211_addba_ext_ie);
+ addba_ext = (struct ieee80211_addba_ext_ie *)pos;
+
+ addba_ext->data = IEEE80211_ADDBA_EXT_NO_FRAG;
+ if (req_addba_ext_data)
+ addba_ext->data &= req_addba_ext_data;
+
+ addba_ext->data |=
+ u8_encode_bits(buf_size >> IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT,
+ IEEE80211_ADDBA_EXT_BUF_SIZE_MASK);
+}
+
+u8 ieee80211_retrieve_addba_ext_data(struct sta_info *sta,
+ const void *elem_data, ssize_t elem_len,
+ u16 *buf_size)
+{
+ struct ieee802_11_elems *elems;
+ u8 buf_size_1k, data = 0;
+
+ if (!sta->sta.deflink.he_cap.has_he)
+ return 0;
+
+ if (elem_len <= 0)
+ return 0;
+
+ elems = ieee802_11_parse_elems(elem_data, elem_len,
+ IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_ACTION,
+ NULL);
+
+ if (!elems || elems->parse_error || !elems->addba_ext_ie)
+ goto free;
+
+ data = elems->addba_ext_ie->data;
+
+ if (buf_size &&
+ (sta->sta.valid_links || sta->sta.deflink.eht_cap.has_eht)) {
+ buf_size_1k = u8_get_bits(elems->addba_ext_ie->data,
+ IEEE80211_ADDBA_EXT_BUF_SIZE_MASK);
+ *buf_size |= (u16)buf_size_1k <<
+ IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT;
+ }
+
+free:
+ kfree(elems);
+
+ return data;
+}
+
+static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid,
u8 dialog_token, u16 status, u16 policy,
- u16 buf_size, u16 timeout)
+ u16 buf_size, u16 timeout,
+ const u8 req_addba_ext_data)
{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
struct ieee80211_local *local = sdata->local;
struct sk_buff *skb;
struct ieee80211_mgmt *mgmt;
bool amsdu = ieee80211_hw_check(&local->hw, SUPPORTS_AMSDU_IN_AMPDU);
u16 capab;
- skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
+ skb = dev_alloc_skb(sizeof(*mgmt) +
+ 2 + sizeof(struct ieee80211_addba_ext_ie) +
+ local->hw.extra_tx_headroom);
if (!skb)
return;
skb_reserve(skb, local->hw.extra_tx_headroom);
- mgmt = skb_put_zero(skb, 24);
- memcpy(mgmt->da, da, ETH_ALEN);
- memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
- if (sdata->vif.type == NL80211_IFTYPE_AP ||
- sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
- sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
- memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
- else if (sdata->vif.type == NL80211_IFTYPE_STATION)
- memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
- else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
- memcpy(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN);
-
- mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
- IEEE80211_STYPE_ACTION);
+ mgmt = ieee80211_mgmt_ba(skb, da, sdata);
skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_resp));
mgmt->u.action.category = WLAN_CATEGORY_BACK;
mgmt->u.action.u.addba_resp.action_code = WLAN_ACTION_ADDBA_RESP;
mgmt->u.action.u.addba_resp.dialog_token = dialog_token;
- capab = (u16)(amsdu << 0); /* bit 0 A-MSDU support */
- capab |= (u16)(policy << 1); /* bit 1 aggregation policy */
- capab |= (u16)(tid << 2); /* bit 5:2 TID number */
- capab |= (u16)(buf_size << 6); /* bit 15:6 max size of aggregation */
+ capab = u16_encode_bits(amsdu, IEEE80211_ADDBA_PARAM_AMSDU_MASK);
+ capab |= u16_encode_bits(policy, IEEE80211_ADDBA_PARAM_POLICY_MASK);
+ capab |= u16_encode_bits(tid, IEEE80211_ADDBA_PARAM_TID_MASK);
+ capab |= u16_encode_bits(buf_size, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK);
mgmt->u.action.u.addba_resp.capab = cpu_to_le16(capab);
mgmt->u.action.u.addba_resp.timeout = cpu_to_le16(timeout);
mgmt->u.action.u.addba_resp.status = cpu_to_le16(status);
+ if (sta->sta.valid_links || sta->sta.deflink.he_cap.has_he)
+ ieee80211_add_addbaext(skb, req_addba_ext_data, buf_size);
+
ieee80211_tx_skb(sdata, skb);
}
-void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
- u8 dialog_token, u16 timeout,
- u16 start_seq_num, u16 ba_policy, u16 tid,
- u16 buf_size, bool tx, bool auto_seq)
+void __ieee80211_start_rx_ba_session(struct sta_info *sta,
+ u8 dialog_token, u16 timeout,
+ u16 start_seq_num, u16 ba_policy, u16 tid,
+ u16 buf_size, bool tx, bool auto_seq,
+ const u8 addba_ext_data)
{
struct ieee80211_local *local = sta->sdata->local;
struct tid_ampdu_rx *tid_agg_rx;
@@ -247,6 +291,8 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
u16 status = WLAN_STATUS_REQUEST_DECLINED;
u16 max_buf_size;
+ lockdep_assert_wiphy(sta->local->hw.wiphy);
+
if (tid >= IEEE80211_FIRST_TSPEC_TSID) {
ht_dbg(sta->sdata,
"STA %pM requests BA session on unsupported tid %d\n",
@@ -254,9 +300,12 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
goto end;
}
- if (!sta->sta.ht_cap.ht_supported) {
+ if (!sta->sta.valid_links &&
+ !sta->sta.deflink.ht_cap.ht_supported &&
+ !sta->sta.deflink.he_cap.has_he &&
+ !sta->sta.deflink.s1g_cap.s1g) {
ht_dbg(sta->sdata,
- "STA %pM erroneously requests BA session on tid %d w/o QoS\n",
+ "STA %pM erroneously requests BA session on tid %d w/o HT\n",
sta->sta.addr, tid);
/* send a response anyway, it's an error case if we get here */
goto end;
@@ -269,8 +318,10 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
goto end;
}
- if (sta->sta.he_cap.has_he)
- max_buf_size = IEEE80211_MAX_AMPDU_BUF;
+ if (sta->sta.valid_links || sta->sta.deflink.eht_cap.has_eht)
+ max_buf_size = IEEE80211_MAX_AMPDU_BUF_EHT;
+ else if (sta->sta.deflink.he_cap.has_he)
+ max_buf_size = IEEE80211_MAX_AMPDU_BUF_HE;
else
max_buf_size = IEEE80211_MAX_AMPDU_BUF_HT;
@@ -279,7 +330,9 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
* and if buffer size does not exceeds max value */
/* XXX: check own ht delayed BA capability?? */
if (((ba_policy != 1) &&
- (!(sta->sta.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA))) ||
+ (sta->sta.valid_links ||
+ !(sta->sta.deflink.ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA) ||
+ !(sta->sta.deflink.s1g_cap.cap[3] & S1G_CAP3_HT_DELAYED_BA))) ||
(buf_size > max_buf_size)) {
status = WLAN_STATUS_INVALID_QOS_PARAM;
ht_dbg_ratelimited(sta->sdata,
@@ -299,9 +352,6 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
ht_dbg(sta->sdata, "AddBA Req buf_size=%d for %pM\n",
buf_size, sta->sta.addr);
- /* examine state machine */
- lockdep_assert_held(&sta->ampdu_mlme.mtx);
-
if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) {
if (sta->ampdu_mlme.tid_rx_token[tid] == dialog_token) {
struct tid_ampdu_rx *tid_rx;
@@ -311,7 +361,7 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
sta->sta.addr, tid);
/* We have no API to update the timeout value in the
* driver so reject the timeout update if the timeout
- * changed. If if did not change, i.e., no real update,
+ * changed. If it did not change, i.e., no real update,
* just reply with success.
*/
rcu_read_lock();
@@ -329,9 +379,9 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta,
sta->sta.addr, tid);
/* delete existing Rx BA session on the same tid */
- ___ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT,
- WLAN_STATUS_UNSPECIFIED_QOS,
- false);
+ __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT,
+ WLAN_STATUS_UNSPECIFIED_QOS,
+ false);
}
if (ieee80211_hw_check(&local->hw, SUPPORTS_REORDERING_BUFFER)) {
@@ -413,22 +463,9 @@ end:
}
if (tx)
- ieee80211_send_addba_resp(sta->sdata, sta->sta.addr, tid,
+ ieee80211_send_addba_resp(sta, sta->sta.addr, tid,
dialog_token, status, 1, buf_size,
- timeout);
-}
-
-static void __ieee80211_start_rx_ba_session(struct sta_info *sta,
- u8 dialog_token, u16 timeout,
- u16 start_seq_num, u16 ba_policy,
- u16 tid, u16 buf_size, bool tx,
- bool auto_seq)
-{
- mutex_lock(&sta->ampdu_mlme.mtx);
- ___ieee80211_start_rx_ba_session(sta, dialog_token, timeout,
- start_seq_num, ba_policy, tid,
- buf_size, tx, auto_seq);
- mutex_unlock(&sta->ampdu_mlme.mtx);
+ timeout, addba_ext_data);
}
void ieee80211_process_addba_request(struct ieee80211_local *local,
@@ -437,7 +474,7 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
size_t len)
{
u16 capab, tid, timeout, ba_policy, buf_size, start_seq_num;
- u8 dialog_token;
+ u8 dialog_token, addba_ext_data;
/* extract session parameters from addba request frame */
dialog_token = mgmt->u.action.u.addba_req.dialog_token;
@@ -450,16 +487,23 @@ void ieee80211_process_addba_request(struct ieee80211_local *local,
tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2;
buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
+ addba_ext_data =
+ ieee80211_retrieve_addba_ext_data(sta,
+ mgmt->u.action.u.addba_req.variable,
+ len -
+ offsetof(typeof(*mgmt),
+ u.action.u.addba_req.variable),
+ &buf_size);
+
__ieee80211_start_rx_ba_session(sta, dialog_token, timeout,
start_seq_num, ba_policy, tid,
- buf_size, true, false);
+ buf_size, true, false, addba_ext_data);
}
void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif,
const u8 *addr, unsigned int tid)
{
struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
- struct ieee80211_local *local = sdata->local;
struct sta_info *sta;
rcu_read_lock();
@@ -468,7 +512,7 @@ void ieee80211_manage_rx_ba_offl(struct ieee80211_vif *vif,
goto unlock;
set_bit(tid, sta->ampdu_mlme.tid_rx_manage_offl);
- ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work);
+ wiphy_work_queue(sta->local->hw.wiphy, &sta->ampdu_mlme.work);
unlock:
rcu_read_unlock();
}
@@ -478,7 +522,6 @@ void ieee80211_rx_ba_timer_expired(struct ieee80211_vif *vif,
const u8 *addr, unsigned int tid)
{
struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
- struct ieee80211_local *local = sdata->local;
struct sta_info *sta;
rcu_read_lock();
@@ -487,7 +530,7 @@ void ieee80211_rx_ba_timer_expired(struct ieee80211_vif *vif,
goto unlock;
set_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired);
- ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work);
+ wiphy_work_queue(sta->local->hw.wiphy, &sta->ampdu_mlme.work);
unlock:
rcu_read_unlock();
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 69e831bc317b..d981b0fc57bf 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* HT handling
*
@@ -8,11 +9,7 @@
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
* Copyright 2007-2010, Intel Corporation
* Copyright(c) 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018 Intel Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * Copyright (C) 2018 - 2024 Intel Corporation
*/
#include <linux/ieee80211.h>
@@ -61,36 +58,24 @@
* complete.
*/
-static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
- const u8 *da, u16 tid,
+static void ieee80211_send_addba_request(struct sta_info *sta, u16 tid,
u8 dialog_token, u16 start_seq_num,
u16 agg_size, u16 timeout)
{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
struct ieee80211_local *local = sdata->local;
struct sk_buff *skb;
struct ieee80211_mgmt *mgmt;
u16 capab;
- skb = dev_alloc_skb(sizeof(*mgmt) + local->hw.extra_tx_headroom);
-
+ skb = dev_alloc_skb(sizeof(*mgmt) +
+ 2 + sizeof(struct ieee80211_addba_ext_ie) +
+ local->hw.extra_tx_headroom);
if (!skb)
return;
skb_reserve(skb, local->hw.extra_tx_headroom);
- mgmt = skb_put_zero(skb, 24);
- memcpy(mgmt->da, da, ETH_ALEN);
- memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
- if (sdata->vif.type == NL80211_IFTYPE_AP ||
- sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
- sdata->vif.type == NL80211_IFTYPE_MESH_POINT)
- memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
- else if (sdata->vif.type == NL80211_IFTYPE_STATION)
- memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
- else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
- memcpy(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN);
-
- mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
- IEEE80211_STYPE_ACTION);
+ mgmt = ieee80211_mgmt_ba(skb, sta->sta.addr, sdata);
skb_put(skb, 1 + sizeof(mgmt->u.action.u.addba_req));
@@ -98,10 +83,10 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
mgmt->u.action.u.addba_req.action_code = WLAN_ACTION_ADDBA_REQ;
mgmt->u.action.u.addba_req.dialog_token = dialog_token;
- capab = (u16)(1 << 0); /* bit 0 A-MSDU support */
- capab |= (u16)(1 << 1); /* bit 1 aggregation policy */
- capab |= (u16)(tid << 2); /* bit 5:2 TID number */
- capab |= (u16)(agg_size << 6); /* bit 15:6 max size of aggergation */
+ capab = IEEE80211_ADDBA_PARAM_AMSDU_MASK;
+ capab |= IEEE80211_ADDBA_PARAM_POLICY_MASK;
+ capab |= u16_encode_bits(tid, IEEE80211_ADDBA_PARAM_TID_MASK);
+ capab |= u16_encode_bits(agg_size, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK);
mgmt->u.action.u.addba_req.capab = cpu_to_le16(capab);
@@ -109,7 +94,10 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
mgmt->u.action.u.addba_req.start_seq_num =
cpu_to_le16(start_seq_num << 4);
- ieee80211_tx_skb(sdata, skb);
+ if (sta->sta.deflink.he_cap.has_he)
+ ieee80211_add_addbaext(skb, 0, agg_size);
+
+ ieee80211_tx_skb_tid(sdata, skb, tid, -1);
}
void ieee80211_send_bar(struct ieee80211_vif *vif, u8 *ra, u16 tid, u16 ssn)
@@ -138,14 +126,14 @@ void ieee80211_send_bar(struct ieee80211_vif *vif, u8 *ra, u16 tid, u16 ssn)
IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT |
IEEE80211_TX_CTL_REQ_TX_STATUS;
- ieee80211_tx_skb_tid(sdata, skb, tid);
+ ieee80211_tx_skb_tid(sdata, skb, tid, -1);
}
EXPORT_SYMBOL(ieee80211_send_bar);
void ieee80211_assign_tid_tx(struct sta_info *sta, int tid,
struct tid_ampdu_tx *tid_tx)
{
- lockdep_assert_held(&sta->ampdu_mlme.mtx);
+ lockdep_assert_wiphy(sta->local->hw.wiphy);
lockdep_assert_held(&sta->lock);
rcu_assign_pointer(sta->ampdu_mlme.tid_tx[tid], tid_tx);
}
@@ -216,6 +204,8 @@ ieee80211_agg_start_txq(struct sta_info *sta, int tid, bool enable)
struct ieee80211_txq *txq = sta->sta.txq[tid];
struct txq_info *txqi;
+ lockdep_assert_wiphy(sta->local->hw.wiphy);
+
if (!txq)
return;
@@ -229,7 +219,7 @@ ieee80211_agg_start_txq(struct sta_info *sta, int tid, bool enable)
clear_bit(IEEE80211_TXQ_STOP, &txqi->flags);
local_bh_disable();
rcu_read_lock();
- drv_wake_tx_queue(sta->sdata->local, txqi);
+ schedule_and_wake_txq(sta->sdata->local, txqi);
rcu_read_unlock();
local_bh_enable();
}
@@ -272,7 +262,7 @@ static void ieee80211_remove_tid_tx(struct sta_info *sta, int tid)
{
struct tid_ampdu_tx *tid_tx;
- lockdep_assert_held(&sta->ampdu_mlme.mtx);
+ lockdep_assert_wiphy(sta->local->hw.wiphy);
lockdep_assert_held(&sta->lock);
tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
@@ -293,13 +283,12 @@ static void ieee80211_remove_tid_tx(struct sta_info *sta, int tid)
ieee80211_assign_tid_tx(sta, tid, NULL);
ieee80211_agg_splice_finish(sta->sdata, tid);
- ieee80211_agg_start_txq(sta, tid, false);
kfree_rcu(tid_tx, rcu_head);
}
-int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
- enum ieee80211_agg_stop_reason reason)
+int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
+ enum ieee80211_agg_stop_reason reason)
{
struct ieee80211_local *local = sta->local;
struct tid_ampdu_tx *tid_tx;
@@ -313,7 +302,7 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
};
int ret;
- lockdep_assert_held(&sta->ampdu_mlme.mtx);
+ lockdep_assert_wiphy(sta->local->hw.wiphy);
switch (reason) {
case AGG_STOP_DECLINED:
@@ -366,13 +355,15 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
set_bit(HT_AGG_STATE_STOPPING, &tid_tx->state);
+ ieee80211_agg_stop_txq(sta, tid);
+
spin_unlock_bh(&sta->lock);
ht_dbg(sta->sdata, "Tx BA session stop requested for %pM tid %u\n",
sta->sta.addr, tid);
- del_timer_sync(&tid_tx->addba_resp_timer);
- del_timer_sync(&tid_tx->session_timer);
+ timer_delete_sync(&tid_tx->addba_resp_timer);
+ timer_delete_sync(&tid_tx->session_timer);
/*
* After this packets are no longer handed right through
@@ -431,7 +422,8 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
*/
static void sta_addba_resp_timer_expired(struct timer_list *t)
{
- struct tid_ampdu_tx *tid_tx = from_timer(tid_tx, t, addba_resp_timer);
+ struct tid_ampdu_tx *tid_tx = timer_container_of(tid_tx, t,
+ addba_resp_timer);
struct sta_info *sta = tid_tx->sta;
u8 tid = tid_tx->tid;
@@ -449,6 +441,54 @@ static void sta_addba_resp_timer_expired(struct timer_list *t)
ieee80211_stop_tx_ba_session(&sta->sta, tid);
}
+static void ieee80211_send_addba_with_timeout(struct sta_info *sta,
+ struct tid_ampdu_tx *tid_tx)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct ieee80211_local *local = sta->local;
+ u8 tid = tid_tx->tid;
+ u16 buf_size;
+
+ if (WARN_ON_ONCE(test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state) ||
+ test_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state)))
+ return;
+
+ lockdep_assert_wiphy(sta->local->hw.wiphy);
+
+ /* activate the timer for the recipient's addBA response */
+ mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL);
+ ht_dbg(sdata, "activated addBA response timer on %pM tid %d\n",
+ sta->sta.addr, tid);
+
+ spin_lock_bh(&sta->lock);
+ sta->ampdu_mlme.last_addba_req_time[tid] = jiffies;
+ sta->ampdu_mlme.addba_req_num[tid]++;
+ spin_unlock_bh(&sta->lock);
+
+ if (sta->sta.valid_links ||
+ sta->sta.deflink.eht_cap.has_eht ||
+ ieee80211_hw_check(&local->hw, STRICT)) {
+ buf_size = local->hw.max_tx_aggregation_subframes;
+ } else if (sta->sta.deflink.he_cap.has_he) {
+ buf_size = min_t(u16, local->hw.max_tx_aggregation_subframes,
+ IEEE80211_MAX_AMPDU_BUF_HE);
+ } else {
+ /*
+ * We really should use what the driver told us it will
+ * transmit as the maximum, but certain APs (e.g. the
+ * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012)
+ * will crash when we use a lower number.
+ */
+ buf_size = IEEE80211_MAX_AMPDU_BUF_HT;
+ }
+
+ /* send AddBA request */
+ ieee80211_send_addba_request(sta, tid, tid_tx->dialog_token,
+ tid_tx->ssn, buf_size, tid_tx->timeout);
+
+ WARN_ON(test_and_set_bit(HT_AGG_STATE_SENT_ADDBA, &tid_tx->state));
+}
+
void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
{
struct tid_ampdu_tx *tid_tx;
@@ -463,7 +503,6 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
.timeout = 0,
};
int ret;
- u16 buf_size;
tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
@@ -474,8 +513,6 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
*/
clear_bit(HT_AGG_STATE_WANT_START, &tid_tx->state);
- ieee80211_agg_stop_txq(sta, tid);
-
/*
* Make sure no packets are being processed. This ensures that
* we have a valid starting sequence number and that in-flight
@@ -486,7 +523,17 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
params.ssn = sta->tid_seq[tid] >> 4;
ret = drv_ampdu_action(local, sdata, &params);
- if (ret) {
+ tid_tx->ssn = params.ssn;
+ if (ret == IEEE80211_AMPDU_TX_START_DELAY_ADDBA) {
+ return;
+ } else if (ret == IEEE80211_AMPDU_TX_START_IMMEDIATE) {
+ /*
+ * We didn't send the request yet, so don't need to check
+ * here if we already got a response, just mark as driver
+ * ready immediately.
+ */
+ set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state);
+ } else if (ret) {
ht_dbg(sdata,
"BA request denied - HW unavailable for %pM tid %d\n",
sta->sta.addr, tid);
@@ -502,33 +549,25 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
return;
}
- /* activate the timer for the recipient's addBA response */
- mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL);
- ht_dbg(sdata, "activated addBA response timer on %pM tid %d\n",
- sta->sta.addr, tid);
+ ieee80211_send_addba_with_timeout(sta, tid_tx);
+}
- spin_lock_bh(&sta->lock);
- sta->ampdu_mlme.last_addba_req_time[tid] = jiffies;
- sta->ampdu_mlme.addba_req_num[tid]++;
- spin_unlock_bh(&sta->lock);
+void ieee80211_refresh_tx_agg_session_timer(struct ieee80211_sta *pubsta,
+ u16 tid)
+{
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+ struct tid_ampdu_tx *tid_tx;
- if (sta->sta.he_cap.has_he) {
- buf_size = local->hw.max_tx_aggregation_subframes;
- } else {
- /*
- * We really should use what the driver told us it will
- * transmit as the maximum, but certain APs (e.g. the
- * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012)
- * will crash when we use a lower number.
- */
- buf_size = IEEE80211_MAX_AMPDU_BUF_HT;
- }
+ if (WARN_ON_ONCE(tid >= IEEE80211_NUM_TIDS))
+ return;
- /* send AddBA request */
- ieee80211_send_addba_request(sdata, sta->sta.addr, tid,
- tid_tx->dialog_token, params.ssn,
- buf_size, tid_tx->timeout);
+ tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]);
+ if (!tid_tx)
+ return;
+
+ tid_tx->last_tx = jiffies;
}
+EXPORT_SYMBOL(ieee80211_refresh_tx_agg_session_timer);
/*
* After accepting the AddBA Response we activated a timer,
@@ -536,7 +575,8 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
*/
static void sta_tx_agg_session_timer_expired(struct timer_list *t)
{
- struct tid_ampdu_tx *tid_tx = from_timer(tid_tx, t, session_timer);
+ struct tid_ampdu_tx *tid_tx = timer_container_of(tid_tx, t,
+ session_timer);
struct sta_info *sta = tid_tx->sta;
u8 tid = tid_tx->tid;
unsigned long timeout;
@@ -572,7 +612,12 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid,
"Requested to start BA session on reserved tid=%d", tid))
return -EINVAL;
- if (!pubsta->ht_cap.ht_supported)
+ if (!pubsta->valid_links &&
+ !pubsta->deflink.ht_cap.ht_supported &&
+ !pubsta->deflink.vht_cap.vht_supported &&
+ !pubsta->deflink.he_cap.has_he &&
+ !pubsta->deflink.eht_cap.has_eht &&
+ !pubsta->deflink.s1g_cap.s1g)
return -EINVAL;
if (WARN_ON_ONCE(!local->ops->ampdu_action))
@@ -603,6 +648,14 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid,
return -EINVAL;
}
+ if (test_sta_flag(sta, WLAN_STA_MFP) &&
+ !test_sta_flag(sta, WLAN_STA_AUTHORIZED)) {
+ ht_dbg(sdata,
+ "MFP STA not authorized - deny BA session request %pM tid %d\n",
+ sta->sta.addr, tid);
+ return -EINVAL;
+ }
+
/*
* 802.11n-2009 11.5.1.1: If the initiating STA is an HT STA, is a
* member of an IBSS, and has no other existing Block Ack agreement
@@ -616,7 +669,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid,
* is set when we receive a bss info from a probe response or a beacon.
*/
if (sta->sdata->vif.type == NL80211_IFTYPE_ADHOC &&
- !sta->sta.ht_cap.ht_supported) {
+ !sta->sta.deflink.ht_cap.ht_supported) {
ht_dbg(sdata,
"BA request denied - IBSS STA %pM does not advertise HT support\n",
pubsta->addr);
@@ -687,7 +740,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid,
*/
sta->ampdu_mlme.tid_start_tx[tid] = tid_tx;
- ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work);
+ wiphy_work_queue(local->hw.wiphy, &sta->ampdu_mlme.work);
/* this flow continues off the work */
err_unlock_sta:
@@ -708,7 +761,7 @@ static void ieee80211_agg_tx_operational(struct ieee80211_local *local,
.ssn = 0,
};
- lockdep_assert_held(&sta->ampdu_mlme.mtx);
+ lockdep_assert_wiphy(sta->local->hw.wiphy);
tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
params.buf_size = tid_tx->buf_size;
@@ -745,9 +798,21 @@ void ieee80211_start_tx_ba_cb(struct sta_info *sta, int tid,
struct ieee80211_sub_if_data *sdata = sta->sdata;
struct ieee80211_local *local = sdata->local;
+ lockdep_assert_wiphy(sta->local->hw.wiphy);
+
if (WARN_ON(test_and_set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state)))
return;
+ if (test_bit(HT_AGG_STATE_STOPPING, &tid_tx->state) ||
+ test_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state))
+ return;
+
+ if (!test_bit(HT_AGG_STATE_SENT_ADDBA, &tid_tx->state)) {
+ ieee80211_send_addba_with_timeout(sta, tid_tx);
+ /* RESPONSE_RECEIVED state would trigger the flow again */
+ return;
+ }
+
if (test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state))
ieee80211_agg_tx_operational(local, sta, tid);
}
@@ -794,26 +859,12 @@ void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif,
goto out;
set_bit(HT_AGG_STATE_START_CB, &tid_tx->state);
- ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work);
+ wiphy_work_queue(local->hw.wiphy, &sta->ampdu_mlme.work);
out:
rcu_read_unlock();
}
EXPORT_SYMBOL(ieee80211_start_tx_ba_cb_irqsafe);
-int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
- enum ieee80211_agg_stop_reason reason)
-{
- int ret;
-
- mutex_lock(&sta->ampdu_mlme.mtx);
-
- ret = ___ieee80211_stop_tx_ba_session(sta, tid, reason);
-
- mutex_unlock(&sta->ampdu_mlme.mtx);
-
- return ret;
-}
-
int ieee80211_stop_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid)
{
struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
@@ -848,7 +899,7 @@ int ieee80211_stop_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid)
}
set_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state);
- ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work);
+ wiphy_work_queue(local->hw.wiphy, &sta->ampdu_mlme.work);
unlock:
spin_unlock_bh(&sta->lock);
@@ -861,6 +912,7 @@ void ieee80211_stop_tx_ba_cb(struct sta_info *sta, int tid,
{
struct ieee80211_sub_if_data *sdata = sta->sdata;
bool send_delba = false;
+ bool start_txq = false;
ht_dbg(sdata, "Stopping Tx BA session for %pM tid %d\n",
sta->sta.addr, tid);
@@ -878,10 +930,14 @@ void ieee80211_stop_tx_ba_cb(struct sta_info *sta, int tid,
send_delba = true;
ieee80211_remove_tid_tx(sta, tid);
+ start_txq = true;
unlock_sta:
spin_unlock_bh(&sta->lock);
+ if (start_txq)
+ ieee80211_agg_start_txq(sta, tid, false);
+
if (send_delba)
ieee80211_send_delba(sdata, sta->sta.addr, tid,
WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE);
@@ -903,7 +959,7 @@ void ieee80211_stop_tx_ba_cb_irqsafe(struct ieee80211_vif *vif,
goto out;
set_bit(HT_AGG_STATE_STOP_CB, &tid_tx->state);
- ieee80211_queue_work(&local->hw, &sta->ampdu_mlme.work);
+ wiphy_work_queue(local->hw.wiphy, &sta->ampdu_mlme.work);
out:
rcu_read_unlock();
}
@@ -920,29 +976,36 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
u16 capab, tid, buf_size;
bool amsdu;
+ lockdep_assert_wiphy(sta->local->hw.wiphy);
+
capab = le16_to_cpu(mgmt->u.action.u.addba_resp.capab);
amsdu = capab & IEEE80211_ADDBA_PARAM_AMSDU_MASK;
- tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2;
- buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
+ tid = u16_get_bits(capab, IEEE80211_ADDBA_PARAM_TID_MASK);
+ buf_size = u16_get_bits(capab, IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK);
+
+ ieee80211_retrieve_addba_ext_data(sta,
+ mgmt->u.action.u.addba_resp.variable,
+ len - offsetof(typeof(*mgmt),
+ u.action.u.addba_resp.variable),
+ &buf_size);
+
buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes);
txq = sta->sta.txq[tid];
if (!amsdu && txq)
set_bit(IEEE80211_TXQ_NO_AMSDU, &to_txq_info(txq)->flags);
- mutex_lock(&sta->ampdu_mlme.mtx);
-
tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
if (!tid_tx)
- goto out;
+ return;
if (mgmt->u.action.u.addba_resp.dialog_token != tid_tx->dialog_token) {
ht_dbg(sta->sdata, "wrong addBA response token, %pM tid %d\n",
sta->sta.addr, tid);
- goto out;
+ return;
}
- del_timer_sync(&tid_tx->addba_resp_timer);
+ timer_delete_sync(&tid_tx->addba_resp_timer);
ht_dbg(sta->sdata, "switched off addBA timer for %pM tid %d\n",
sta->sta.addr, tid);
@@ -957,7 +1020,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
ht_dbg(sta->sdata,
"got addBA resp for %pM tid %d but we already gave up\n",
sta->sta.addr, tid);
- goto out;
+ return;
}
/*
@@ -971,7 +1034,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
if (test_and_set_bit(HT_AGG_STATE_RESPONSE_RECEIVED,
&tid_tx->state)) {
/* ignore duplicate response */
- goto out;
+ return;
}
tid_tx->buf_size = buf_size;
@@ -992,9 +1055,6 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
}
} else {
- ___ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_DECLINED);
+ __ieee80211_stop_tx_ba_session(sta, tid, AGG_STOP_DECLINED);
}
-
- out:
- mutex_unlock(&sta->ampdu_mlme.mtx);
}
diff --git a/net/mac80211/airtime.c b/net/mac80211/airtime.c
new file mode 100644
index 000000000000..c61df637232a
--- /dev/null
+++ b/net/mac80211/airtime.c
@@ -0,0 +1,837 @@
+// SPDX-License-Identifier: ISC
+/*
+ * Copyright (C) 2019 Felix Fietkau <nbd@nbd.name>
+ * Copyright (C) 2021-2022 Intel Corporation
+ */
+
+#include <net/mac80211.h>
+#include "ieee80211_i.h"
+#include "sta_info.h"
+
+#define AVG_PKT_SIZE 1024
+
+/* Number of bits for an average sized packet */
+#define MCS_NBITS (AVG_PKT_SIZE << 3)
+
+/* Number of kilo-symbols (symbols * 1024) for a packet with (bps) bits per
+ * symbol. We use k-symbols to avoid rounding in the _TIME macros below.
+ */
+#define MCS_N_KSYMS(bps) DIV_ROUND_UP(MCS_NBITS << 10, (bps))
+
+/* Transmission time (in 1024 * usec) for a packet containing (ksyms) * 1024
+ * symbols.
+ */
+#define MCS_SYMBOL_TIME(sgi, ksyms) \
+ (sgi ? \
+ ((ksyms) * 4 * 18) / 20 : /* 3.6 us per sym */ \
+ ((ksyms) * 4) /* 4.0 us per sym */ \
+ )
+
+/* Transmit duration for the raw data part of an average sized packet */
+#define MCS_DURATION(streams, sgi, bps) \
+ ((u32)MCS_SYMBOL_TIME(sgi, MCS_N_KSYMS((streams) * (bps))))
+
+#define MCS_DURATION_S(shift, streams, sgi, bps) \
+ ((u16)((MCS_DURATION(streams, sgi, bps) >> shift)))
+
+/* These should match the values in enum nl80211_he_gi */
+#define HE_GI_08 0
+#define HE_GI_16 1
+#define HE_GI_32 2
+
+/* Transmission time (in 1024 * usec) for a packet containing (ksyms) * 1024 symbols */
+#define HE_SYMBOL_TIME(gi, ksyms) \
+ (gi == HE_GI_08 ? \
+ ((ksyms) * 16 * 17) / 20 : /* 13.6 us per sym */ \
+ (gi == HE_GI_16 ? \
+ ((ksyms) * 16 * 18) / 20 : /* 14.4 us per sym */ \
+ ((ksyms) * 16) /* 16.0 us per sym */ \
+ ))
+
+/* Transmit duration for the raw data part of an average sized packet */
+#define HE_DURATION(streams, gi, bps) \
+ ((u32)HE_SYMBOL_TIME(gi, MCS_N_KSYMS((streams) * (bps))))
+
+#define HE_DURATION_S(shift, streams, gi, bps) \
+ (HE_DURATION(streams, gi, bps) >> shift)
+
+/* gi in HE/EHT is identical. It matches enum nl80211_eht_gi as well */
+#define EHT_GI_08 HE_GI_08
+#define EHT_GI_16 HE_GI_16
+#define EHT_GI_32 HE_GI_32
+
+#define EHT_DURATION(streams, gi, bps) \
+ HE_DURATION(streams, gi, bps)
+#define EHT_DURATION_S(shift, streams, gi, bps) \
+ HE_DURATION_S(shift, streams, gi, bps)
+
+#define BW_20 0
+#define BW_40 1
+#define BW_80 2
+#define BW_160 3
+#define BW_320 4
+
+/*
+ * Define group sort order: HT40 -> SGI -> #streams
+ */
+#define IEEE80211_MAX_STREAMS 4
+#define IEEE80211_HT_STREAM_GROUPS 4 /* BW(=2) * SGI(=2) */
+#define IEEE80211_VHT_STREAM_GROUPS 8 /* BW(=4) * SGI(=2) */
+
+#define IEEE80211_HE_MAX_STREAMS 8
+#define IEEE80211_HE_STREAM_GROUPS 12 /* BW(=4) * GI(=3) */
+
+#define IEEE80211_EHT_MAX_STREAMS 8
+#define IEEE80211_EHT_STREAM_GROUPS 15 /* BW(=5) * GI(=3) */
+
+#define IEEE80211_HT_GROUPS_NB (IEEE80211_MAX_STREAMS * \
+ IEEE80211_HT_STREAM_GROUPS)
+#define IEEE80211_VHT_GROUPS_NB (IEEE80211_MAX_STREAMS * \
+ IEEE80211_VHT_STREAM_GROUPS)
+#define IEEE80211_HE_GROUPS_NB (IEEE80211_HE_MAX_STREAMS * \
+ IEEE80211_HE_STREAM_GROUPS)
+#define IEEE80211_EHT_GROUPS_NB (IEEE80211_EHT_MAX_STREAMS * \
+ IEEE80211_EHT_STREAM_GROUPS)
+
+#define IEEE80211_HT_GROUP_0 0
+#define IEEE80211_VHT_GROUP_0 (IEEE80211_HT_GROUP_0 + IEEE80211_HT_GROUPS_NB)
+#define IEEE80211_HE_GROUP_0 (IEEE80211_VHT_GROUP_0 + IEEE80211_VHT_GROUPS_NB)
+#define IEEE80211_EHT_GROUP_0 (IEEE80211_HE_GROUP_0 + IEEE80211_HE_GROUPS_NB)
+
+#define MCS_GROUP_RATES 14
+
+#define HT_GROUP_IDX(_streams, _sgi, _ht40) \
+ IEEE80211_HT_GROUP_0 + \
+ IEEE80211_MAX_STREAMS * 2 * _ht40 + \
+ IEEE80211_MAX_STREAMS * _sgi + \
+ _streams - 1
+
+#define _MAX(a, b) (((a)>(b))?(a):(b))
+
+#define GROUP_SHIFT(duration) \
+ _MAX(0, 16 - __builtin_clz(duration))
+
+/* MCS rate information for an MCS group */
+#define __MCS_GROUP(_streams, _sgi, _ht40, _s) \
+ [HT_GROUP_IDX(_streams, _sgi, _ht40)] = { \
+ .shift = _s, \
+ .duration = { \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 54 : 26), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 108 : 52), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 162 : 78), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 216 : 104), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 324 : 156), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 432 : 208), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 486 : 234), \
+ MCS_DURATION_S(_s, _streams, _sgi, _ht40 ? 540 : 260) \
+ } \
+}
+
+#define MCS_GROUP_SHIFT(_streams, _sgi, _ht40) \
+ GROUP_SHIFT(MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26))
+
+#define MCS_GROUP(_streams, _sgi, _ht40) \
+ __MCS_GROUP(_streams, _sgi, _ht40, \
+ MCS_GROUP_SHIFT(_streams, _sgi, _ht40))
+
+#define VHT_GROUP_IDX(_streams, _sgi, _bw) \
+ (IEEE80211_VHT_GROUP_0 + \
+ IEEE80211_MAX_STREAMS * 2 * (_bw) + \
+ IEEE80211_MAX_STREAMS * (_sgi) + \
+ (_streams) - 1)
+
+#define BW2VBPS(_bw, r4, r3, r2, r1) \
+ (_bw == BW_160 ? r4 : _bw == BW_80 ? r3 : _bw == BW_40 ? r2 : r1)
+
+#define __VHT_GROUP(_streams, _sgi, _bw, _s) \
+ [VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \
+ .shift = _s, \
+ .duration = { \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 234, 117, 54, 26)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 468, 234, 108, 52)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 702, 351, 162, 78)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 936, 468, 216, 104)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 1404, 702, 324, 156)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 1872, 936, 432, 208)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 2106, 1053, 486, 234)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 2340, 1170, 540, 260)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 2808, 1404, 648, 312)), \
+ MCS_DURATION_S(_s, _streams, _sgi, \
+ BW2VBPS(_bw, 3120, 1560, 720, 346)) \
+ } \
+}
+
+#define VHT_GROUP_SHIFT(_streams, _sgi, _bw) \
+ GROUP_SHIFT(MCS_DURATION(_streams, _sgi, \
+ BW2VBPS(_bw, 243, 117, 54, 26)))
+
+#define VHT_GROUP(_streams, _sgi, _bw) \
+ __VHT_GROUP(_streams, _sgi, _bw, \
+ VHT_GROUP_SHIFT(_streams, _sgi, _bw))
+
+
+#define HE_GROUP_IDX(_streams, _gi, _bw) \
+ (IEEE80211_HE_GROUP_0 + \
+ IEEE80211_HE_MAX_STREAMS * 3 * (_bw) + \
+ IEEE80211_HE_MAX_STREAMS * (_gi) + \
+ (_streams) - 1)
+
+#define __HE_GROUP(_streams, _gi, _bw, _s) \
+ [HE_GROUP_IDX(_streams, _gi, _bw)] = { \
+ .shift = _s, \
+ .duration = { \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 979, 489, 230, 115)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 1958, 979, 475, 230)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 2937, 1468, 705, 345)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 3916, 1958, 936, 475)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 5875, 2937, 1411, 705)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 7833, 3916, 1872, 936)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 8827, 4406, 2102, 1051)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 9806, 4896, 2347, 1166)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 11764, 5875, 2808, 1411)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 13060, 6523, 3124, 1555)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 14702, 7344, 3513, 1756)), \
+ HE_DURATION_S(_s, _streams, _gi, \
+ BW2VBPS(_bw, 16329, 8164, 3902, 1944)) \
+ } \
+}
+
+#define HE_GROUP_SHIFT(_streams, _gi, _bw) \
+ GROUP_SHIFT(HE_DURATION(_streams, _gi, \
+ BW2VBPS(_bw, 979, 489, 230, 115)))
+
+#define HE_GROUP(_streams, _gi, _bw) \
+ __HE_GROUP(_streams, _gi, _bw, \
+ HE_GROUP_SHIFT(_streams, _gi, _bw))
+
+#define EHT_BW2VBPS(_bw, r5, r4, r3, r2, r1) \
+ ((_bw) == BW_320 ? r5 : BW2VBPS(_bw, r4, r3, r2, r1))
+
+#define EHT_GROUP_IDX(_streams, _gi, _bw) \
+ (IEEE80211_EHT_GROUP_0 + \
+ IEEE80211_EHT_MAX_STREAMS * 3 * (_bw) + \
+ IEEE80211_EHT_MAX_STREAMS * (_gi) + \
+ (_streams) - 1)
+
+#define __EHT_GROUP(_streams, _gi, _bw, _s) \
+ [EHT_GROUP_IDX(_streams, _gi, _bw)] = { \
+ .shift = _s, \
+ .duration = { \
+ EHT_DURATION_S(_s, _streams, _gi, \
+ EHT_BW2VBPS(_bw, 1960, 980, 490, 234, 117)), \
+ EHT_DURATION_S(_s, _streams, _gi, \
+ EHT_BW2VBPS(_bw, 3920, 1960, 980, 468, 234)), \
+ EHT_DURATION_S(_s, _streams, _gi, \
+ EHT_BW2VBPS(_bw, 5880, 2937, 1470, 702, 351)), \
+ EHT_DURATION_S(_s, _streams, _gi, \
+ EHT_BW2VBPS(_bw, 7840, 3920, 1960, 936, 468)), \
+ EHT_DURATION_S(_s, _streams, _gi, \
+ EHT_BW2VBPS(_bw, 11760, 5880, 2940, 1404, 702)), \
+ EHT_DURATION_S(_s, _streams, _gi, \
+ EHT_BW2VBPS(_bw, 15680, 7840, 3920, 1872, 936)), \
+ EHT_DURATION_S(_s, _streams, _gi, \
+ EHT_BW2VBPS(_bw, 17640, 8820, 4410, 2106, 1053)), \
+ EHT_DURATION_S(_s, _streams, _gi, \
+ EHT_BW2VBPS(_bw, 19600, 9800, 4900, 2340, 1170)), \
+ EHT_DURATION_S(_s, _streams, _gi, \
+ EHT_BW2VBPS(_bw, 23520, 11760, 5880, 2808, 1404)), \
+ EHT_DURATION_S(_s, _streams, _gi, \
+ EHT_BW2VBPS(_bw, 26133, 13066, 6533, 3120, 1560)), \
+ EHT_DURATION_S(_s, _streams, _gi, \
+ EHT_BW2VBPS(_bw, 29400, 14700, 7350, 3510, 1755)), \
+ EHT_DURATION_S(_s, _streams, _gi, \
+ EHT_BW2VBPS(_bw, 32666, 16333, 8166, 3900, 1950)), \
+ EHT_DURATION_S(_s, _streams, _gi, \
+ EHT_BW2VBPS(_bw, 35280, 17640, 8820, 4212, 2106)), \
+ EHT_DURATION_S(_s, _streams, _gi, \
+ EHT_BW2VBPS(_bw, 39200, 19600, 9800, 4680, 2340)) \
+ } \
+}
+
+#define EHT_GROUP_SHIFT(_streams, _gi, _bw) \
+ GROUP_SHIFT(EHT_DURATION(_streams, _gi, \
+ EHT_BW2VBPS(_bw, 1960, 980, 490, 234, 117)))
+
+#define EHT_GROUP(_streams, _gi, _bw) \
+ __EHT_GROUP(_streams, _gi, _bw, \
+ EHT_GROUP_SHIFT(_streams, _gi, _bw))
+
+#define EHT_GROUP_RANGE(_gi, _bw) \
+ EHT_GROUP(1, _gi, _bw), \
+ EHT_GROUP(2, _gi, _bw), \
+ EHT_GROUP(3, _gi, _bw), \
+ EHT_GROUP(4, _gi, _bw), \
+ EHT_GROUP(5, _gi, _bw), \
+ EHT_GROUP(6, _gi, _bw), \
+ EHT_GROUP(7, _gi, _bw), \
+ EHT_GROUP(8, _gi, _bw)
+
+struct mcs_group {
+ u8 shift;
+ u16 duration[MCS_GROUP_RATES];
+};
+
+static const struct mcs_group airtime_mcs_groups[] = {
+ MCS_GROUP(1, 0, BW_20),
+ MCS_GROUP(2, 0, BW_20),
+ MCS_GROUP(3, 0, BW_20),
+ MCS_GROUP(4, 0, BW_20),
+
+ MCS_GROUP(1, 1, BW_20),
+ MCS_GROUP(2, 1, BW_20),
+ MCS_GROUP(3, 1, BW_20),
+ MCS_GROUP(4, 1, BW_20),
+
+ MCS_GROUP(1, 0, BW_40),
+ MCS_GROUP(2, 0, BW_40),
+ MCS_GROUP(3, 0, BW_40),
+ MCS_GROUP(4, 0, BW_40),
+
+ MCS_GROUP(1, 1, BW_40),
+ MCS_GROUP(2, 1, BW_40),
+ MCS_GROUP(3, 1, BW_40),
+ MCS_GROUP(4, 1, BW_40),
+
+ VHT_GROUP(1, 0, BW_20),
+ VHT_GROUP(2, 0, BW_20),
+ VHT_GROUP(3, 0, BW_20),
+ VHT_GROUP(4, 0, BW_20),
+
+ VHT_GROUP(1, 1, BW_20),
+ VHT_GROUP(2, 1, BW_20),
+ VHT_GROUP(3, 1, BW_20),
+ VHT_GROUP(4, 1, BW_20),
+
+ VHT_GROUP(1, 0, BW_40),
+ VHT_GROUP(2, 0, BW_40),
+ VHT_GROUP(3, 0, BW_40),
+ VHT_GROUP(4, 0, BW_40),
+
+ VHT_GROUP(1, 1, BW_40),
+ VHT_GROUP(2, 1, BW_40),
+ VHT_GROUP(3, 1, BW_40),
+ VHT_GROUP(4, 1, BW_40),
+
+ VHT_GROUP(1, 0, BW_80),
+ VHT_GROUP(2, 0, BW_80),
+ VHT_GROUP(3, 0, BW_80),
+ VHT_GROUP(4, 0, BW_80),
+
+ VHT_GROUP(1, 1, BW_80),
+ VHT_GROUP(2, 1, BW_80),
+ VHT_GROUP(3, 1, BW_80),
+ VHT_GROUP(4, 1, BW_80),
+
+ VHT_GROUP(1, 0, BW_160),
+ VHT_GROUP(2, 0, BW_160),
+ VHT_GROUP(3, 0, BW_160),
+ VHT_GROUP(4, 0, BW_160),
+
+ VHT_GROUP(1, 1, BW_160),
+ VHT_GROUP(2, 1, BW_160),
+ VHT_GROUP(3, 1, BW_160),
+ VHT_GROUP(4, 1, BW_160),
+
+ HE_GROUP(1, HE_GI_08, BW_20),
+ HE_GROUP(2, HE_GI_08, BW_20),
+ HE_GROUP(3, HE_GI_08, BW_20),
+ HE_GROUP(4, HE_GI_08, BW_20),
+ HE_GROUP(5, HE_GI_08, BW_20),
+ HE_GROUP(6, HE_GI_08, BW_20),
+ HE_GROUP(7, HE_GI_08, BW_20),
+ HE_GROUP(8, HE_GI_08, BW_20),
+
+ HE_GROUP(1, HE_GI_16, BW_20),
+ HE_GROUP(2, HE_GI_16, BW_20),
+ HE_GROUP(3, HE_GI_16, BW_20),
+ HE_GROUP(4, HE_GI_16, BW_20),
+ HE_GROUP(5, HE_GI_16, BW_20),
+ HE_GROUP(6, HE_GI_16, BW_20),
+ HE_GROUP(7, HE_GI_16, BW_20),
+ HE_GROUP(8, HE_GI_16, BW_20),
+
+ HE_GROUP(1, HE_GI_32, BW_20),
+ HE_GROUP(2, HE_GI_32, BW_20),
+ HE_GROUP(3, HE_GI_32, BW_20),
+ HE_GROUP(4, HE_GI_32, BW_20),
+ HE_GROUP(5, HE_GI_32, BW_20),
+ HE_GROUP(6, HE_GI_32, BW_20),
+ HE_GROUP(7, HE_GI_32, BW_20),
+ HE_GROUP(8, HE_GI_32, BW_20),
+
+ HE_GROUP(1, HE_GI_08, BW_40),
+ HE_GROUP(2, HE_GI_08, BW_40),
+ HE_GROUP(3, HE_GI_08, BW_40),
+ HE_GROUP(4, HE_GI_08, BW_40),
+ HE_GROUP(5, HE_GI_08, BW_40),
+ HE_GROUP(6, HE_GI_08, BW_40),
+ HE_GROUP(7, HE_GI_08, BW_40),
+ HE_GROUP(8, HE_GI_08, BW_40),
+
+ HE_GROUP(1, HE_GI_16, BW_40),
+ HE_GROUP(2, HE_GI_16, BW_40),
+ HE_GROUP(3, HE_GI_16, BW_40),
+ HE_GROUP(4, HE_GI_16, BW_40),
+ HE_GROUP(5, HE_GI_16, BW_40),
+ HE_GROUP(6, HE_GI_16, BW_40),
+ HE_GROUP(7, HE_GI_16, BW_40),
+ HE_GROUP(8, HE_GI_16, BW_40),
+
+ HE_GROUP(1, HE_GI_32, BW_40),
+ HE_GROUP(2, HE_GI_32, BW_40),
+ HE_GROUP(3, HE_GI_32, BW_40),
+ HE_GROUP(4, HE_GI_32, BW_40),
+ HE_GROUP(5, HE_GI_32, BW_40),
+ HE_GROUP(6, HE_GI_32, BW_40),
+ HE_GROUP(7, HE_GI_32, BW_40),
+ HE_GROUP(8, HE_GI_32, BW_40),
+
+ HE_GROUP(1, HE_GI_08, BW_80),
+ HE_GROUP(2, HE_GI_08, BW_80),
+ HE_GROUP(3, HE_GI_08, BW_80),
+ HE_GROUP(4, HE_GI_08, BW_80),
+ HE_GROUP(5, HE_GI_08, BW_80),
+ HE_GROUP(6, HE_GI_08, BW_80),
+ HE_GROUP(7, HE_GI_08, BW_80),
+ HE_GROUP(8, HE_GI_08, BW_80),
+
+ HE_GROUP(1, HE_GI_16, BW_80),
+ HE_GROUP(2, HE_GI_16, BW_80),
+ HE_GROUP(3, HE_GI_16, BW_80),
+ HE_GROUP(4, HE_GI_16, BW_80),
+ HE_GROUP(5, HE_GI_16, BW_80),
+ HE_GROUP(6, HE_GI_16, BW_80),
+ HE_GROUP(7, HE_GI_16, BW_80),
+ HE_GROUP(8, HE_GI_16, BW_80),
+
+ HE_GROUP(1, HE_GI_32, BW_80),
+ HE_GROUP(2, HE_GI_32, BW_80),
+ HE_GROUP(3, HE_GI_32, BW_80),
+ HE_GROUP(4, HE_GI_32, BW_80),
+ HE_GROUP(5, HE_GI_32, BW_80),
+ HE_GROUP(6, HE_GI_32, BW_80),
+ HE_GROUP(7, HE_GI_32, BW_80),
+ HE_GROUP(8, HE_GI_32, BW_80),
+
+ HE_GROUP(1, HE_GI_08, BW_160),
+ HE_GROUP(2, HE_GI_08, BW_160),
+ HE_GROUP(3, HE_GI_08, BW_160),
+ HE_GROUP(4, HE_GI_08, BW_160),
+ HE_GROUP(5, HE_GI_08, BW_160),
+ HE_GROUP(6, HE_GI_08, BW_160),
+ HE_GROUP(7, HE_GI_08, BW_160),
+ HE_GROUP(8, HE_GI_08, BW_160),
+
+ HE_GROUP(1, HE_GI_16, BW_160),
+ HE_GROUP(2, HE_GI_16, BW_160),
+ HE_GROUP(3, HE_GI_16, BW_160),
+ HE_GROUP(4, HE_GI_16, BW_160),
+ HE_GROUP(5, HE_GI_16, BW_160),
+ HE_GROUP(6, HE_GI_16, BW_160),
+ HE_GROUP(7, HE_GI_16, BW_160),
+ HE_GROUP(8, HE_GI_16, BW_160),
+
+ HE_GROUP(1, HE_GI_32, BW_160),
+ HE_GROUP(2, HE_GI_32, BW_160),
+ HE_GROUP(3, HE_GI_32, BW_160),
+ HE_GROUP(4, HE_GI_32, BW_160),
+ HE_GROUP(5, HE_GI_32, BW_160),
+ HE_GROUP(6, HE_GI_32, BW_160),
+ HE_GROUP(7, HE_GI_32, BW_160),
+ HE_GROUP(8, HE_GI_32, BW_160),
+
+ EHT_GROUP_RANGE(EHT_GI_08, BW_20),
+ EHT_GROUP_RANGE(EHT_GI_16, BW_20),
+ EHT_GROUP_RANGE(EHT_GI_32, BW_20),
+
+ EHT_GROUP_RANGE(EHT_GI_08, BW_40),
+ EHT_GROUP_RANGE(EHT_GI_16, BW_40),
+ EHT_GROUP_RANGE(EHT_GI_32, BW_40),
+
+ EHT_GROUP_RANGE(EHT_GI_08, BW_80),
+ EHT_GROUP_RANGE(EHT_GI_16, BW_80),
+ EHT_GROUP_RANGE(EHT_GI_32, BW_80),
+
+ EHT_GROUP_RANGE(EHT_GI_08, BW_160),
+ EHT_GROUP_RANGE(EHT_GI_16, BW_160),
+ EHT_GROUP_RANGE(EHT_GI_32, BW_160),
+
+ EHT_GROUP_RANGE(EHT_GI_08, BW_320),
+ EHT_GROUP_RANGE(EHT_GI_16, BW_320),
+ EHT_GROUP_RANGE(EHT_GI_32, BW_320),
+};
+
+static u32
+ieee80211_calc_legacy_rate_duration(u16 bitrate, bool short_pre,
+ bool cck, int len)
+{
+ u32 duration;
+
+ if (cck) {
+ duration = 144 + 48; /* preamble + PLCP */
+ if (short_pre)
+ duration >>= 1;
+
+ duration += 10; /* SIFS */
+ } else {
+ duration = 20 + 16; /* preamble + SIFS */
+ }
+
+ len <<= 3;
+ duration += (len * 10) / bitrate;
+
+ return duration;
+}
+
+static u32 ieee80211_get_rate_duration(struct ieee80211_hw *hw,
+ struct ieee80211_rx_status *status,
+ u32 *overhead)
+{
+ bool sgi = status->enc_flags & RX_ENC_FLAG_SHORT_GI;
+ int bw, streams;
+ int group, idx;
+ u32 duration;
+
+ switch (status->bw) {
+ case RATE_INFO_BW_20:
+ bw = BW_20;
+ break;
+ case RATE_INFO_BW_40:
+ bw = BW_40;
+ break;
+ case RATE_INFO_BW_80:
+ bw = BW_80;
+ break;
+ case RATE_INFO_BW_160:
+ bw = BW_160;
+ break;
+ case RATE_INFO_BW_320:
+ bw = BW_320;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ switch (status->encoding) {
+ case RX_ENC_VHT:
+ streams = status->nss;
+ idx = status->rate_idx;
+ group = VHT_GROUP_IDX(streams, sgi, bw);
+ break;
+ case RX_ENC_HT:
+ streams = ((status->rate_idx >> 3) & 3) + 1;
+ idx = status->rate_idx & 7;
+ group = HT_GROUP_IDX(streams, sgi, bw);
+ break;
+ case RX_ENC_HE:
+ streams = status->nss;
+ idx = status->rate_idx;
+ group = HE_GROUP_IDX(streams, status->he_gi, bw);
+ break;
+ case RX_ENC_EHT:
+ streams = status->nss;
+ idx = status->rate_idx;
+ group = EHT_GROUP_IDX(streams, status->eht.gi, bw);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+
+ switch (status->encoding) {
+ case RX_ENC_EHT:
+ case RX_ENC_HE:
+ if (WARN_ON_ONCE(streams > 8))
+ return 0;
+ break;
+ default:
+ if (WARN_ON_ONCE(streams > 4))
+ return 0;
+ break;
+ }
+
+ if (idx >= MCS_GROUP_RATES)
+ return 0;
+
+ duration = airtime_mcs_groups[group].duration[idx];
+ duration <<= airtime_mcs_groups[group].shift;
+ *overhead = 36 + (streams << 2);
+
+ return duration;
+}
+
+
+u32 ieee80211_calc_rx_airtime(struct ieee80211_hw *hw,
+ struct ieee80211_rx_status *status,
+ int len)
+{
+ struct ieee80211_supported_band *sband;
+ u32 duration, overhead = 0;
+
+ if (status->encoding == RX_ENC_LEGACY) {
+ const struct ieee80211_rate *rate;
+ bool sp = status->enc_flags & RX_ENC_FLAG_SHORTPRE;
+ bool cck;
+
+ /* on 60GHz or sub-1GHz band, there are no legacy rates */
+ if (WARN_ON_ONCE(status->band == NL80211_BAND_60GHZ ||
+ status->band == NL80211_BAND_S1GHZ))
+ return 0;
+
+ sband = hw->wiphy->bands[status->band];
+ if (!sband || status->rate_idx >= sband->n_bitrates)
+ return 0;
+
+ rate = &sband->bitrates[status->rate_idx];
+ cck = rate->flags & IEEE80211_RATE_MANDATORY_B;
+
+ return ieee80211_calc_legacy_rate_duration(rate->bitrate, sp,
+ cck, len);
+ }
+
+ duration = ieee80211_get_rate_duration(hw, status, &overhead);
+ if (!duration)
+ return 0;
+
+ duration *= len;
+ duration /= AVG_PKT_SIZE;
+ duration /= 1024;
+
+ return duration + overhead;
+}
+EXPORT_SYMBOL_GPL(ieee80211_calc_rx_airtime);
+
+static bool ieee80211_fill_rate_info(struct ieee80211_hw *hw,
+ struct ieee80211_rx_status *stat, u8 band,
+ struct rate_info *ri)
+{
+ struct ieee80211_supported_band *sband = hw->wiphy->bands[band];
+ int i;
+
+ if (!ri || !sband)
+ return false;
+
+ stat->bw = ri->bw;
+ stat->nss = ri->nss;
+ stat->rate_idx = ri->mcs;
+
+ if (ri->flags & RATE_INFO_FLAGS_EHT_MCS)
+ stat->encoding = RX_ENC_EHT;
+ else if (ri->flags & RATE_INFO_FLAGS_HE_MCS)
+ stat->encoding = RX_ENC_HE;
+ else if (ri->flags & RATE_INFO_FLAGS_VHT_MCS)
+ stat->encoding = RX_ENC_VHT;
+ else if (ri->flags & RATE_INFO_FLAGS_MCS)
+ stat->encoding = RX_ENC_HT;
+ else
+ stat->encoding = RX_ENC_LEGACY;
+
+ if (ri->flags & RATE_INFO_FLAGS_SHORT_GI)
+ stat->enc_flags |= RX_ENC_FLAG_SHORT_GI;
+
+ switch (stat->encoding) {
+ case RX_ENC_EHT:
+ stat->eht.gi = ri->eht_gi;
+ break;
+ default:
+ stat->he_gi = ri->he_gi;
+ break;
+ }
+
+ if (stat->encoding != RX_ENC_LEGACY)
+ return true;
+
+ stat->rate_idx = 0;
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if (ri->legacy != sband->bitrates[i].bitrate)
+ continue;
+
+ stat->rate_idx = i;
+ return true;
+ }
+
+ return false;
+}
+
+static int ieee80211_fill_rx_status(struct ieee80211_rx_status *stat,
+ struct ieee80211_hw *hw,
+ struct ieee80211_tx_rate *rate,
+ struct rate_info *ri, u8 band, int len)
+{
+ memset(stat, 0, sizeof(*stat));
+ stat->band = band;
+
+ if (ieee80211_fill_rate_info(hw, stat, band, ri))
+ return 0;
+
+ if (!ieee80211_rate_valid(rate))
+ return -1;
+
+ if (rate->flags & IEEE80211_TX_RC_160_MHZ_WIDTH)
+ stat->bw = RATE_INFO_BW_160;
+ else if (rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH)
+ stat->bw = RATE_INFO_BW_80;
+ else if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
+ stat->bw = RATE_INFO_BW_40;
+ else
+ stat->bw = RATE_INFO_BW_20;
+
+ stat->enc_flags = 0;
+ if (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE)
+ stat->enc_flags |= RX_ENC_FLAG_SHORTPRE;
+ if (rate->flags & IEEE80211_TX_RC_SHORT_GI)
+ stat->enc_flags |= RX_ENC_FLAG_SHORT_GI;
+
+ stat->rate_idx = rate->idx;
+ if (rate->flags & IEEE80211_TX_RC_VHT_MCS) {
+ stat->encoding = RX_ENC_VHT;
+ stat->rate_idx = ieee80211_rate_get_vht_mcs(rate);
+ stat->nss = ieee80211_rate_get_vht_nss(rate);
+ } else if (rate->flags & IEEE80211_TX_RC_MCS) {
+ stat->encoding = RX_ENC_HT;
+ } else {
+ stat->encoding = RX_ENC_LEGACY;
+ }
+
+ return 0;
+}
+
+static u32 ieee80211_calc_tx_airtime_rate(struct ieee80211_hw *hw,
+ struct ieee80211_tx_rate *rate,
+ struct rate_info *ri,
+ u8 band, int len)
+{
+ struct ieee80211_rx_status stat;
+
+ if (ieee80211_fill_rx_status(&stat, hw, rate, ri, band, len))
+ return 0;
+
+ return ieee80211_calc_rx_airtime(hw, &stat, len);
+}
+
+u32 ieee80211_calc_tx_airtime(struct ieee80211_hw *hw,
+ struct ieee80211_tx_info *info,
+ int len)
+{
+ u32 duration = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(info->status.rates); i++) {
+ struct ieee80211_tx_rate *rate = &info->status.rates[i];
+ u32 cur_duration;
+
+ cur_duration = ieee80211_calc_tx_airtime_rate(hw, rate, NULL,
+ info->band, len);
+ if (!cur_duration)
+ break;
+
+ duration += cur_duration * rate->count;
+ }
+
+ return duration;
+}
+EXPORT_SYMBOL_GPL(ieee80211_calc_tx_airtime);
+
+u32 ieee80211_calc_expected_tx_airtime(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ struct ieee80211_sta *pubsta,
+ int len, bool ampdu)
+{
+ struct ieee80211_supported_band *sband;
+ struct ieee80211_chanctx_conf *conf;
+ int rateidx;
+ bool cck, short_pream;
+ u32 basic_rates;
+ u8 band = 0;
+ u16 rate;
+
+ len += 38; /* Ethernet header length */
+
+ conf = rcu_dereference(vif->bss_conf.chanctx_conf);
+ if (conf)
+ band = conf->def.chan->band;
+
+ if (pubsta) {
+ struct sta_info *sta = container_of(pubsta, struct sta_info,
+ sta);
+ struct ieee80211_rx_status stat;
+ struct ieee80211_tx_rate *tx_rate = &sta->deflink.tx_stats.last_rate;
+ struct rate_info *ri = &sta->deflink.tx_stats.last_rate_info;
+ u32 duration, overhead;
+ u8 agg_shift;
+
+ if (ieee80211_fill_rx_status(&stat, hw, tx_rate, ri, band, len))
+ return 0;
+
+ if (stat.encoding == RX_ENC_LEGACY || !ampdu)
+ return ieee80211_calc_rx_airtime(hw, &stat, len);
+
+ duration = ieee80211_get_rate_duration(hw, &stat, &overhead);
+ /*
+ * Assume that HT/VHT transmission on any AC except VO will
+ * use aggregation. Since we don't have reliable reporting
+ * of aggregation length, assume an average size based on the
+ * tx rate.
+ * This will not be very accurate, but much better than simply
+ * assuming un-aggregated tx in all cases.
+ */
+ if (duration > 400 * 1024) /* <= VHT20 MCS2 1S */
+ agg_shift = 1;
+ else if (duration > 250 * 1024) /* <= VHT20 MCS3 1S or MCS1 2S */
+ agg_shift = 2;
+ else if (duration > 150 * 1024) /* <= VHT20 MCS5 1S or MCS2 2S */
+ agg_shift = 3;
+ else if (duration > 70 * 1024) /* <= VHT20 MCS5 2S */
+ agg_shift = 4;
+ else if (stat.encoding != RX_ENC_HE ||
+ duration > 20 * 1024) /* <= HE40 MCS6 2S */
+ agg_shift = 5;
+ else
+ agg_shift = 6;
+
+ duration *= len;
+ duration /= AVG_PKT_SIZE;
+ duration /= 1024;
+ duration += (overhead >> agg_shift);
+
+ return max_t(u32, duration, 4);
+ }
+
+ if (!conf)
+ return 0;
+
+ /* No station to get latest rate from, so calculate the worst-case
+ * duration using the lowest configured basic rate.
+ */
+ sband = hw->wiphy->bands[band];
+
+ basic_rates = vif->bss_conf.basic_rates;
+ short_pream = vif->bss_conf.use_short_preamble;
+
+ rateidx = basic_rates ? ffs(basic_rates) - 1 : 0;
+ rate = sband->bitrates[rateidx].bitrate;
+ cck = sband->bitrates[rateidx].flags & IEEE80211_RATE_MANDATORY_B;
+
+ return ieee80211_calc_legacy_rate_duration(rate, short_pream, cck, len);
+}
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 5d22eda8a6b1..b51c2c8584ae 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -1,12 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mac80211 configuration hooks for cfg80211
*
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2015 Intel Mobile Communications GmbH
* Copyright (C) 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018 Intel Corporation
- *
- * This file is GPLv2 as found in COPYING.
+ * Copyright (C) 2018-2025 Intel Corporation
*/
#include <linux/ieee80211.h>
@@ -15,6 +14,7 @@
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <linux/rcupdate.h>
+#include <linux/fips.h>
#include <linux/if_ether.h>
#include <net/cfg80211.h>
#include "ieee80211_i.h"
@@ -23,6 +23,30 @@
#include "mesh.h"
#include "wme.h"
+static struct ieee80211_link_data *
+ieee80211_link_or_deflink(struct ieee80211_sub_if_data *sdata, int link_id,
+ bool require_valid)
+{
+ struct ieee80211_link_data *link;
+
+ if (link_id < 0) {
+ /*
+ * For keys, if sdata is not an MLD, we might not use
+ * the return value at all (if it's not a pairwise key),
+ * so in that case (require_valid==false) don't error.
+ */
+ if (require_valid && ieee80211_vif_is_mld(&sdata->vif))
+ return ERR_PTR(-EINVAL);
+
+ return &sdata->deflink;
+ }
+
+ link = sdata_dereference(sdata->link[link_id], sdata);
+ if (!link)
+ return ERR_PTR(-ENOLINK);
+ return link;
+}
+
static void ieee80211_set_mu_mimo_follow(struct ieee80211_sub_if_data *sdata,
struct vif_params *params)
{
@@ -39,11 +63,14 @@ static void ieee80211_set_mu_mimo_follow(struct ieee80211_sub_if_data *sdata,
memcpy(sdata->vif.bss_conf.mu_group.position,
params->vht_mumimo_groups + WLAN_MEMBERSHIP_LEN,
WLAN_USER_POSITION_LEN);
- ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MU_GROUPS);
+
/* don't care about endianness - just check for 0 */
memcpy(&membership, params->vht_mumimo_groups,
WLAN_MEMBERSHIP_LEN);
mu_mimo_groups = membership != 0;
+
+ /* Unset following if configured explicitly */
+ eth_broadcast_addr(sdata->u.mntr.mu_follow_addr);
}
if (params->vht_mumimo_follow_addr) {
@@ -51,45 +78,65 @@ static void ieee80211_set_mu_mimo_follow(struct ieee80211_sub_if_data *sdata,
is_valid_ether_addr(params->vht_mumimo_follow_addr);
ether_addr_copy(sdata->u.mntr.mu_follow_addr,
params->vht_mumimo_follow_addr);
+
+ /* Unset current membership until a management frame is RXed */
+ memset(sdata->vif.bss_conf.mu_group.membership, 0,
+ WLAN_MEMBERSHIP_LEN);
}
- sdata->vif.mu_mimo_owner = mu_mimo_groups || mu_mimo_follow;
+ sdata->vif.bss_conf.mu_mimo_owner = mu_mimo_groups || mu_mimo_follow;
+
+ /* Notify only after setting mu_mimo_owner */
+ if (sdata->vif.bss_conf.mu_mimo_owner &&
+ sdata->flags & IEEE80211_SDATA_IN_DRIVER)
+ ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+ BSS_CHANGED_MU_GROUPS);
}
static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata,
struct vif_params *params)
{
struct ieee80211_local *local = sdata->local;
- struct ieee80211_sub_if_data *monitor_sdata;
+ struct ieee80211_sub_if_data *monitor_sdata = NULL;
/* check flags first */
if (params->flags && ieee80211_sdata_running(sdata)) {
- u32 mask = MONITOR_FLAG_COOK_FRAMES | MONITOR_FLAG_ACTIVE;
+ u32 mask = MONITOR_FLAG_ACTIVE;
/*
- * Prohibit MONITOR_FLAG_COOK_FRAMES and
- * MONITOR_FLAG_ACTIVE to be changed while the
- * interface is up.
+ * Prohibit MONITOR_FLAG_ACTIVE to be changed
+ * while the interface is up.
* Else we would need to add a lot of cruft
* to update everything:
- * cooked_mntrs, monitor and all fif_* counters
+ * monitor and all fif_* counters
* reconfigure hardware
*/
if ((params->flags & mask) != (sdata->u.mntr.flags & mask))
return -EBUSY;
}
- /* also validate MU-MIMO change */
- monitor_sdata = rtnl_dereference(local->monitor_sdata);
-
- if (!monitor_sdata &&
+ /* validate whether MU-MIMO can be configured */
+ if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) &&
+ !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR) &&
(params->vht_mumimo_groups || params->vht_mumimo_follow_addr))
return -EOPNOTSUPP;
+ /* Also update dependent monitor_sdata if required */
+ if (test_bit(SDATA_STATE_RUNNING, &sdata->state) &&
+ !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR))
+ monitor_sdata = wiphy_dereference(local->hw.wiphy,
+ local->monitor_sdata);
+
/* apply all changes now - no failures allowed */
- if (monitor_sdata)
- ieee80211_set_mu_mimo_follow(monitor_sdata, params);
+ if (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) ||
+ ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) {
+ /* This is copied in when the VIF is activated */
+ ieee80211_set_mu_mimo_follow(sdata, params);
+
+ if (monitor_sdata)
+ ieee80211_set_mu_mimo_follow(monitor_sdata, params);
+ }
if (params->flags) {
if (ieee80211_sdata_running(sdata)) {
@@ -111,6 +158,51 @@ static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata,
return 0;
}
+static int ieee80211_set_ap_mbssid_options(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_mbssid_config *params,
+ struct ieee80211_bss_conf *link_conf)
+{
+ struct ieee80211_sub_if_data *tx_sdata;
+ struct ieee80211_bss_conf *old;
+
+ link_conf->bssid_index = 0;
+ link_conf->nontransmitted = false;
+ link_conf->ema_ap = false;
+ link_conf->bssid_indicator = 0;
+
+ if (sdata->vif.type != NL80211_IFTYPE_AP || !params->tx_wdev)
+ return -EINVAL;
+
+ old = sdata_dereference(link_conf->tx_bss_conf, sdata);
+ if (old)
+ return -EALREADY;
+
+ tx_sdata = IEEE80211_WDEV_TO_SUB_IF(params->tx_wdev);
+ if (!tx_sdata)
+ return -EINVAL;
+
+ if (tx_sdata == sdata) {
+ rcu_assign_pointer(link_conf->tx_bss_conf, link_conf);
+ } else {
+ struct ieee80211_bss_conf *tx_bss_conf;
+
+ tx_bss_conf = sdata_dereference(tx_sdata->vif.link_conf[params->tx_link_id],
+ sdata);
+ if (rcu_access_pointer(tx_bss_conf->tx_bss_conf) != tx_bss_conf)
+ return -EINVAL;
+
+ rcu_assign_pointer(link_conf->tx_bss_conf, tx_bss_conf);
+
+ link_conf->nontransmitted = true;
+ link_conf->bssid_index = params->index;
+ link_conf->bssid_indicator = tx_bss_conf->bssid_indicator;
+ }
+ if (params->ema)
+ link_conf->ema_ap = true;
+
+ return 0;
+}
+
static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy,
const char *name,
unsigned char name_assign_type,
@@ -136,6 +228,24 @@ static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy,
}
}
+ /* Let the driver know that an interface is going to be added.
+ * Indicate so only for interface types that will be added to the
+ * driver.
+ */
+ switch (type) {
+ case NL80211_IFTYPE_AP_VLAN:
+ break;
+ case NL80211_IFTYPE_MONITOR:
+ if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) ||
+ !(params->flags & MONITOR_FLAG_ACTIVE))
+ break;
+ fallthrough;
+ default:
+ drv_prep_add_interface(local,
+ ieee80211_vif_type_p2p(&sdata->vif));
+ break;
+ }
+
return wdev;
}
@@ -152,19 +262,40 @@ static int ieee80211_change_iface(struct wiphy *wiphy,
struct vif_params *params)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
int ret;
+ lockdep_assert_wiphy(local->hw.wiphy);
+
ret = ieee80211_if_change_type(sdata, type);
if (ret)
return ret;
- if (type == NL80211_IFTYPE_AP_VLAN &&
- params && params->use_4addr == 0) {
+ if (type == NL80211_IFTYPE_AP_VLAN && params->use_4addr == 0) {
RCU_INIT_POINTER(sdata->u.vlan.sta, NULL);
ieee80211_check_fast_rx_iface(sdata);
- } else if (type == NL80211_IFTYPE_STATION &&
- params && params->use_4addr >= 0) {
+ } else if (type == NL80211_IFTYPE_STATION && params->use_4addr >= 0) {
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+
+ if (params->use_4addr == ifmgd->use_4addr)
+ return 0;
+
+ /* FIXME: no support for 4-addr MLO yet */
+ if (ieee80211_vif_is_mld(&sdata->vif))
+ return -EOPNOTSUPP;
+
sdata->u.mgd.use_4addr = params->use_4addr;
+ if (!ifmgd->associated)
+ return 0;
+
+ sta = sta_info_get(sdata, sdata->deflink.u.mgd.bssid);
+ if (sta)
+ drv_sta_set_4addr(local, sdata, &sta->sta,
+ params->use_4addr);
+
+ if (params->use_4addr)
+ ieee80211_send_4addr_nullfunc(local, sdata);
}
if (sdata->vif.type == NL80211_IFTYPE_MONITOR) {
@@ -182,9 +313,9 @@ static int ieee80211_start_p2p_device(struct wiphy *wiphy,
struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
int ret;
- mutex_lock(&sdata->local->chanctx_mtx);
- ret = ieee80211_check_combinations(sdata, NULL, 0, 0);
- mutex_unlock(&sdata->local->chanctx_mtx);
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
+ ret = ieee80211_check_combinations(sdata, NULL, 0, 0, -1);
if (ret < 0)
return ret;
@@ -197,6 +328,96 @@ static void ieee80211_stop_p2p_device(struct wiphy *wiphy,
ieee80211_sdata_stop(IEEE80211_WDEV_TO_SUB_IF(wdev));
}
+static void ieee80211_nan_conf_free(struct cfg80211_nan_conf *conf)
+{ /* Free the three buffers referenced by @conf, then zero the whole struct. */
+ kfree(conf->cluster_id);
+ kfree(conf->extra_nan_attrs);
+ kfree(conf->vendor_elems);
+ memset(conf, 0, sizeof(*conf)); /* also clears the just-freed pointers */
+}
+
+static void ieee80211_stop_nan(struct wiphy *wiphy,
+ struct wireless_dev *wdev)
+{ /* Stop NAN on @wdev; a no-op unless u.nan.started is set. */
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+
+ if (!sdata->u.nan.started)
+ return;
+
+ drv_stop_nan(sdata->local, sdata);
+ sdata->u.nan.started = false;
+
+ ieee80211_nan_conf_free(&sdata->u.nan.conf); /* release the stored config copy */
+
+ ieee80211_sdata_stop(sdata);
+ ieee80211_recalc_idle(sdata->local);
+}
+
+static int ieee80211_nan_conf_copy(struct cfg80211_nan_conf *dst,
+ struct cfg80211_nan_conf *src,
+ u32 changes)
+{ /* Copy the @src fields selected by @changes into @dst; 0 or -ENOMEM. */
+ if (changes & CFG80211_NAN_CONF_CHANGED_PREF)
+ dst->master_pref = src->master_pref;
+
+ if (changes & CFG80211_NAN_CONF_CHANGED_BANDS)
+ dst->bands = src->bands;
+
+ if (changes & CFG80211_NAN_CONF_CHANGED_CONFIG) {
+ dst->scan_period = src->scan_period;
+ dst->scan_dwell_time = src->scan_dwell_time;
+ dst->discovery_beacon_interval =
+ src->discovery_beacon_interval;
+ dst->enable_dw_notification = src->enable_dw_notification;
+ memcpy(&dst->band_cfgs, &src->band_cfgs,
+ sizeof(dst->band_cfgs));
+
+ kfree(dst->cluster_id); /* drop dst's old buffers before duplicating src's */
+ dst->cluster_id = NULL;
+
+ kfree(dst->extra_nan_attrs);
+ dst->extra_nan_attrs = NULL;
+ dst->extra_nan_attrs_len = 0;
+
+ kfree(dst->vendor_elems);
+ dst->vendor_elems = NULL;
+ dst->vendor_elems_len = 0;
+
+ if (src->cluster_id) {
+ dst->cluster_id = kmemdup(src->cluster_id, ETH_ALEN,
+ GFP_KERNEL);
+ if (!dst->cluster_id)
+ goto no_mem;
+ }
+
+ if (src->extra_nan_attrs && src->extra_nan_attrs_len) {
+ dst->extra_nan_attrs = kmemdup(src->extra_nan_attrs,
+ src->extra_nan_attrs_len,
+ GFP_KERNEL);
+ if (!dst->extra_nan_attrs)
+ goto no_mem;
+
+ dst->extra_nan_attrs_len = src->extra_nan_attrs_len; /* length set only once the copy exists */
+ }
+
+ if (src->vendor_elems && src->vendor_elems_len) {
+ dst->vendor_elems = kmemdup(src->vendor_elems,
+ src->vendor_elems_len,
+ GFP_KERNEL);
+ if (!dst->vendor_elems)
+ goto no_mem;
+
+ dst->vendor_elems_len = src->vendor_elems_len;
+ }
+ }
+
+ return 0;
+
+no_mem:
+ ieee80211_nan_conf_free(dst); /* on failure @dst is freed and zeroed entirely, not just the new parts */
+ return -ENOMEM;
+}
+
static int ieee80211_start_nan(struct wiphy *wiphy,
struct wireless_dev *wdev,
struct cfg80211_nan_conf *conf)
@@ -204,9 +425,12 @@ static int ieee80211_start_nan(struct wiphy *wiphy,
struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
int ret;
- mutex_lock(&sdata->local->chanctx_mtx);
- ret = ieee80211_check_combinations(sdata, NULL, 0, 0);
- mutex_unlock(&sdata->local->chanctx_mtx);
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
+ if (sdata->u.nan.started)
+ return -EALREADY;
+
+ ret = ieee80211_check_combinations(sdata, NULL, 0, 0, -1);
if (ret < 0)
return ret;
@@ -215,21 +439,21 @@ static int ieee80211_start_nan(struct wiphy *wiphy,
return ret;
ret = drv_start_nan(sdata->local, sdata, conf);
- if (ret)
+ if (ret) {
ieee80211_sdata_stop(sdata);
+ return ret;
+ }
- sdata->u.nan.conf = *conf;
-
- return ret;
-}
+ sdata->u.nan.started = true;
+ ieee80211_recalc_idle(sdata->local);
-static void ieee80211_stop_nan(struct wiphy *wiphy,
- struct wireless_dev *wdev)
-{
- struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ ret = ieee80211_nan_conf_copy(&sdata->u.nan.conf, conf, 0xFFFFFFFF);
+ if (ret) {
+ ieee80211_stop_nan(wiphy, wdev);
+ return ret;
+ }
- drv_stop_nan(sdata->local, sdata);
- ieee80211_sdata_stop(sdata);
+ return 0;
}
static int ieee80211_nan_change_conf(struct wiphy *wiphy,
@@ -238,7 +462,7 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy,
u32 changes)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
- struct cfg80211_nan_conf new_conf;
+ struct cfg80211_nan_conf new_conf = {};
int ret = 0;
if (sdata->vif.type != NL80211_IFTYPE_NAN)
@@ -247,17 +471,28 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy,
if (!ieee80211_sdata_running(sdata))
return -ENETDOWN;
- new_conf = sdata->u.nan.conf;
+ if (!changes)
+ return 0;
- if (changes & CFG80211_NAN_CONF_CHANGED_PREF)
- new_conf.master_pref = conf->master_pref;
+ /* First make a full copy of the previous configuration and then apply
+ * the changes. This might be a little wasteful, but it is simpler.
+ */
+ ret = ieee80211_nan_conf_copy(&new_conf, &sdata->u.nan.conf,
+ 0xFFFFFFFF);
+ if (ret < 0)
+ return ret;
- if (changes & CFG80211_NAN_CONF_CHANGED_BANDS)
- new_conf.bands = conf->bands;
+ ret = ieee80211_nan_conf_copy(&new_conf, conf, changes);
+ if (ret < 0)
+ return ret;
ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes);
- if (!ret)
+ if (ret) {
+ ieee80211_nan_conf_free(&new_conf);
+ } else {
+ ieee80211_nan_conf_free(&sdata->u.nan.conf);
sdata->u.nan.conf = new_conf;
+ }
return ret;
}
@@ -353,52 +588,88 @@ static int ieee80211_set_noack_map(struct wiphy *wiphy,
return 0;
}
+static int ieee80211_set_tx(struct ieee80211_sub_if_data *sdata,
+ const u8 *mac_addr, u8 key_idx)
+{ /* Make pairwise key @key_idx of station @mac_addr the TX key. */
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_key *key;
+ struct sta_info *sta;
+ int ret = -EINVAL; /* returned when no suitable key is found below */
+
+ if (!wiphy_ext_feature_isset(local->hw.wiphy,
+ NL80211_EXT_FEATURE_EXT_KEY_ID))
+ return -EINVAL;
+
+ sta = sta_info_get_bss(sdata, mac_addr);
+
+ if (!sta)
+ return -EINVAL;
+
+ if (sta->ptk_idx == key_idx) /* already the station's current ptk index */
+ return 0;
+
+ key = wiphy_dereference(local->hw.wiphy, sta->ptk[key_idx]); /* NOTE(review): key_idx is not range-checked here - confirm callers bound it */
+
+ if (key && key->conf.flags & IEEE80211_KEY_FLAG_NO_AUTO_TX) /* only keys flagged NO_AUTO_TX are switched */
+ ret = ieee80211_set_tx_key(key);
+
+ return ret;
+}
+
static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
- u8 key_idx, bool pairwise, const u8 *mac_addr,
- struct key_params *params)
+ int link_id, u8 key_idx, bool pairwise,
+ const u8 *mac_addr, struct key_params *params)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_link_data *link =
+ ieee80211_link_or_deflink(sdata, link_id, false);
struct ieee80211_local *local = sdata->local;
struct sta_info *sta = NULL;
- const struct ieee80211_cipher_scheme *cs = NULL;
struct ieee80211_key *key;
int err;
+ lockdep_assert_wiphy(local->hw.wiphy);
+
if (!ieee80211_sdata_running(sdata))
return -ENETDOWN;
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+
+ if (WARN_ON(pairwise && link_id >= 0))
+ return -EINVAL;
+
+ if (pairwise && params->mode == NL80211_KEY_SET_TX)
+ return ieee80211_set_tx(sdata, mac_addr, key_idx);
+
/* reject WEP and TKIP keys if WEP failed to initialize */
switch (params->cipher) {
case WLAN_CIPHER_SUITE_WEP40:
case WLAN_CIPHER_SUITE_TKIP:
case WLAN_CIPHER_SUITE_WEP104:
- if (IS_ERR(local->wep_tx_tfm))
+ if (link_id >= 0)
+ return -EINVAL;
+ if (WARN_ON_ONCE(fips_enabled))
return -EINVAL;
- break;
- case WLAN_CIPHER_SUITE_CCMP:
- case WLAN_CIPHER_SUITE_CCMP_256:
- case WLAN_CIPHER_SUITE_AES_CMAC:
- case WLAN_CIPHER_SUITE_BIP_CMAC_256:
- case WLAN_CIPHER_SUITE_BIP_GMAC_128:
- case WLAN_CIPHER_SUITE_BIP_GMAC_256:
- case WLAN_CIPHER_SUITE_GCMP:
- case WLAN_CIPHER_SUITE_GCMP_256:
break;
default:
- cs = ieee80211_cs_get(local, params->cipher, sdata->vif.type);
break;
}
key = ieee80211_key_alloc(params->cipher, key_idx, params->key_len,
- params->key, params->seq_len, params->seq,
- cs);
+ params->key, params->seq_len, params->seq);
if (IS_ERR(key))
return PTR_ERR(key);
- if (pairwise)
+ if (pairwise) {
key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE;
+ key->conf.link_id = -1;
+ } else {
+ key->conf.link_id = link->link_id;
+ }
- mutex_lock(&local->sta_mtx);
+ if (params->mode == NL80211_KEY_NO_TX)
+ key->conf.flags |= IEEE80211_KEY_FLAG_NO_AUTO_TX;
if (mac_addr) {
sta = sta_info_get_bss(sdata, mac_addr);
@@ -414,8 +685,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
*/
if (!sta || !test_sta_flag(sta, WLAN_STA_ASSOC)) {
ieee80211_key_free_unused(key);
- err = -ENOENT;
- goto out_unlock;
+ return -ENOENT;
}
}
@@ -453,69 +723,102 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
break;
}
- if (sta)
- sta->cipher_scheme = cs;
-
- err = ieee80211_key_link(key, sdata, sta);
-
- out_unlock:
- mutex_unlock(&local->sta_mtx);
+ err = ieee80211_key_link(key, link, sta);
+ /* KRACK protection, shouldn't happen but just silently accept key */
+ if (err == -EALREADY)
+ err = 0;
return err;
}
-static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
- u8 key_idx, bool pairwise, const u8 *mac_addr)
+static struct ieee80211_key *
+ieee80211_lookup_key(struct ieee80211_sub_if_data *sdata, int link_id,
+ u8 key_idx, bool pairwise, const u8 *mac_addr)
{
- struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- struct ieee80211_local *local = sdata->local;
- struct sta_info *sta;
- struct ieee80211_key *key = NULL;
- int ret;
+ struct ieee80211_local *local __maybe_unused = sdata->local;
+ struct ieee80211_link_data *link = &sdata->deflink;
+ struct ieee80211_key *key;
- mutex_lock(&local->sta_mtx);
- mutex_lock(&local->key_mtx);
+ if (link_id >= 0) {
+ link = sdata_dereference(sdata->link[link_id], sdata);
+ if (!link)
+ return NULL;
+ }
if (mac_addr) {
- ret = -ENOENT;
+ struct sta_info *sta;
+ struct link_sta_info *link_sta;
sta = sta_info_get_bss(sdata, mac_addr);
if (!sta)
- goto out_unlock;
+ return NULL;
- if (pairwise)
- key = key_mtx_dereference(local, sta->ptk[key_idx]);
- else
- key = key_mtx_dereference(local, sta->gtk[key_idx]);
- } else
- key = key_mtx_dereference(local, sdata->keys[key_idx]);
+ if (link_id >= 0) {
+ link_sta = rcu_dereference_check(sta->link[link_id],
+ lockdep_is_held(&local->hw.wiphy->mtx));
+ if (!link_sta)
+ return NULL;
+ } else {
+ link_sta = &sta->deflink;
+ }
+
+ if (pairwise && key_idx < NUM_DEFAULT_KEYS)
+ return wiphy_dereference(local->hw.wiphy,
+ sta->ptk[key_idx]);
+
+ if (!pairwise &&
+ key_idx < NUM_DEFAULT_KEYS +
+ NUM_DEFAULT_MGMT_KEYS +
+ NUM_DEFAULT_BEACON_KEYS)
+ return wiphy_dereference(local->hw.wiphy,
+ link_sta->gtk[key_idx]);
- if (!key) {
- ret = -ENOENT;
- goto out_unlock;
+ return NULL;
}
- ieee80211_key_free(key, sdata->vif.type == NL80211_IFTYPE_STATION);
+ if (pairwise && key_idx < NUM_DEFAULT_KEYS)
+ return wiphy_dereference(local->hw.wiphy, sdata->keys[key_idx]);
- ret = 0;
- out_unlock:
- mutex_unlock(&local->key_mtx);
- mutex_unlock(&local->sta_mtx);
+ key = wiphy_dereference(local->hw.wiphy, link->gtk[key_idx]);
+ if (key)
+ return key;
- return ret;
+ /* or maybe it was a WEP key */
+ if (key_idx < NUM_DEFAULT_KEYS)
+ return wiphy_dereference(local->hw.wiphy, sdata->keys[key_idx]);
+
+ return NULL;
+}
+
+static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
+ int link_id, u8 key_idx, bool pairwise,
+ const u8 *mac_addr)
+{ /* Look up the key described by the arguments and free it; 0 or -ENOENT. */
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_key *key;
+
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ key = ieee80211_lookup_key(sdata, link_id, key_idx, pairwise, mac_addr);
+ if (!key)
+ return -ENOENT;
+
+ ieee80211_key_free(key, sdata->vif.type == NL80211_IFTYPE_STATION); /* 2nd arg is true only on station interfaces; meaning is defined by ieee80211_key_free */
+
+ return 0;
}
static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
- u8 key_idx, bool pairwise, const u8 *mac_addr,
- void *cookie,
+ int link_id, u8 key_idx, bool pairwise,
+ const u8 *mac_addr, void *cookie,
void (*callback)(void *cookie,
struct key_params *params))
{
struct ieee80211_sub_if_data *sdata;
- struct sta_info *sta = NULL;
u8 seq[6] = {0};
struct key_params params;
- struct ieee80211_key *key = NULL;
+ struct ieee80211_key *key;
u64 pn64;
u32 iv32;
u16 iv16;
@@ -526,19 +829,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
rcu_read_lock();
- if (mac_addr) {
- sta = sta_info_get_bss(sdata, mac_addr);
- if (!sta)
- goto out;
-
- if (pairwise && key_idx < NUM_DEFAULT_KEYS)
- key = rcu_dereference(sta->ptk[key_idx]);
- else if (!pairwise &&
- key_idx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS)
- key = rcu_dereference(sta->gtk[key_idx]);
- } else
- key = rcu_dereference(sdata->keys[key_idx]);
-
+ key = ieee80211_lookup_key(sdata, link_id, key_idx, pairwise, mac_addr);
if (!key)
goto out;
@@ -574,12 +865,12 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
case WLAN_CIPHER_SUITE_BIP_CMAC_256:
BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) !=
offsetof(typeof(kseq), aes_cmac));
- /* fall through */
+ fallthrough;
case WLAN_CIPHER_SUITE_BIP_GMAC_128:
case WLAN_CIPHER_SUITE_BIP_GMAC_256:
BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) !=
offsetof(typeof(kseq), aes_gmac));
- /* fall through */
+ fallthrough;
case WLAN_CIPHER_SUITE_GCMP:
case WLAN_CIPHER_SUITE_GCMP_256:
BUILD_BUG_ON(offsetof(typeof(kseq), ccmp) !=
@@ -612,9 +903,6 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
break;
}
- params.key = key->conf.key;
- params.key_len = key->conf.keylen;
-
callback(cookie, &params);
err = 0;
@@ -625,23 +913,49 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
static int ieee80211_config_default_key(struct wiphy *wiphy,
struct net_device *dev,
- u8 key_idx, bool uni,
+ int link_id, u8 key_idx, bool uni,
bool multi)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_link_data *link =
+ ieee80211_link_or_deflink(sdata, link_id, false);
+
+ if (IS_ERR(link))
+ return PTR_ERR(link);
- ieee80211_set_default_key(sdata, key_idx, uni, multi);
+ ieee80211_set_default_key(link, key_idx, uni, multi);
return 0;
}
static int ieee80211_config_default_mgmt_key(struct wiphy *wiphy,
struct net_device *dev,
- u8 key_idx)
+ int link_id, u8 key_idx)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_link_data *link =
+ ieee80211_link_or_deflink(sdata, link_id, true);
+
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+
+ ieee80211_set_default_mgmt_key(link, key_idx);
+
+ return 0;
+}
+
+static int ieee80211_config_default_beacon_key(struct wiphy *wiphy,
+ struct net_device *dev,
+ int link_id, u8 key_idx)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_link_data *link =
+ ieee80211_link_or_deflink(sdata, link_id, true);
- ieee80211_set_default_mgmt_key(sdata, key_idx);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+
+ ieee80211_set_default_beacon_key(link, key_idx);
return 0;
}
@@ -660,14 +974,11 @@ void sta_set_rate_info_tx(struct sta_info *sta,
rinfo->nss = ieee80211_rate_get_vht_nss(rate);
} else {
struct ieee80211_supported_band *sband;
- int shift = ieee80211_vif_get_shift(&sta->sdata->vif);
- u16 brate;
sband = ieee80211_get_sband(sta->sdata);
- if (sband) {
- brate = sband->bitrates[rate->idx].bitrate;
- rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift);
- }
+ WARN_ON_ONCE(sband && !sband->bitrates);
+ if (sband && sband->bitrates)
+ rinfo->legacy = sband->bitrates[rate->idx].bitrate;
}
if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH)
rinfo->bw = RATE_INFO_BW_40;
@@ -689,16 +1000,21 @@ static int ieee80211_dump_station(struct wiphy *wiphy, struct net_device *dev,
struct sta_info *sta;
int ret = -ENOENT;
- mutex_lock(&local->sta_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
sta = sta_info_get_by_idx(sdata, idx);
if (sta) {
ret = 0;
memcpy(mac, sta->sta.addr, ETH_ALEN);
sta_set_sinfo(sta, sinfo, true);
- }
- mutex_unlock(&local->sta_mtx);
+ /* Add accumulated removed link data to sinfo data for
+ * consistency for MLO
+ */
+ if (sinfo->valid_links)
+ sta_set_accumulated_removed_links_sinfo(sta, sinfo);
+
+ }
return ret;
}
@@ -719,59 +1035,73 @@ static int ieee80211_get_station(struct wiphy *wiphy, struct net_device *dev,
struct sta_info *sta;
int ret = -ENOENT;
- mutex_lock(&local->sta_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
sta = sta_info_get_bss(sdata, mac);
if (sta) {
ret = 0;
sta_set_sinfo(sta, sinfo, true);
- }
- mutex_unlock(&local->sta_mtx);
+ /* Add accumulated removed link data to sinfo data for
+ * consistency for MLO
+ */
+ if (sinfo->valid_links)
+ sta_set_accumulated_removed_links_sinfo(sta, sinfo);
+ }
return ret;
}
static int ieee80211_set_monitor_channel(struct wiphy *wiphy,
+ struct net_device *dev,
struct cfg80211_chan_def *chandef)
{
struct ieee80211_local *local = wiphy_priv(wiphy);
struct ieee80211_sub_if_data *sdata;
- int ret = 0;
+ struct ieee80211_chan_req chanreq = { .oper = *chandef };
+ int ret;
- if (cfg80211_chandef_identical(&local->monitor_chandef, chandef))
- return 0;
+ lockdep_assert_wiphy(local->hw.wiphy);
- mutex_lock(&local->mtx);
- if (local->use_chanctx) {
- sdata = rtnl_dereference(local->monitor_sdata);
- if (sdata) {
- ieee80211_vif_release_channel(sdata);
- ret = ieee80211_vif_use_channel(sdata, chandef,
- IEEE80211_CHANCTX_EXCLUSIVE);
- }
- } else if (local->open_count == local->monitors) {
- local->_oper_chandef = *chandef;
- ieee80211_hw_config(local, 0);
+ sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ if (!ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) {
+ if (cfg80211_chandef_identical(&local->monitor_chanreq.oper,
+ &chanreq.oper))
+ return 0;
+
+ sdata = wiphy_dereference(wiphy, local->monitor_sdata);
+ if (!sdata)
+ goto done;
}
- if (ret == 0)
- local->monitor_chandef = *chandef;
- mutex_unlock(&local->mtx);
+ if (rcu_access_pointer(sdata->deflink.conf->chanctx_conf) &&
+ cfg80211_chandef_identical(&sdata->vif.bss_conf.chanreq.oper,
+ &chanreq.oper))
+ return 0;
- return ret;
+ ieee80211_link_release_channel(&sdata->deflink);
+ ret = ieee80211_link_use_channel(&sdata->deflink, &chanreq,
+ IEEE80211_CHANCTX_SHARED);
+ if (ret)
+ return ret;
+done:
+ local->monitor_chanreq = chanreq;
+ return 0;
}
-static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata,
- const u8 *resp, size_t resp_len,
- const struct ieee80211_csa_settings *csa)
+static int
+ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata,
+ const u8 *resp, size_t resp_len,
+ const struct ieee80211_csa_settings *csa,
+ const struct ieee80211_color_change_settings *cca,
+ struct ieee80211_link_data *link)
{
struct probe_resp *new, *old;
if (!resp || !resp_len)
return 1;
- old = sdata_dereference(sdata->u.ap.probe_resp, sdata);
+ old = sdata_dereference(link->u.ap.probe_resp, sdata);
new = kzalloc(sizeof(struct probe_resp) + resp_len, GFP_KERNEL);
if (!new)
@@ -781,28 +1111,220 @@ static int ieee80211_set_probe_resp(struct ieee80211_sub_if_data *sdata,
memcpy(new->data, resp, resp_len);
if (csa)
- memcpy(new->csa_counter_offsets, csa->counter_offsets_presp,
+ memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_presp,
csa->n_counter_offsets_presp *
- sizeof(new->csa_counter_offsets[0]));
+ sizeof(new->cntdwn_counter_offsets[0]));
+ else if (cca)
+ new->cntdwn_counter_offsets[0] = cca->counter_offset_presp;
+
+ rcu_assign_pointer(link->u.ap.probe_resp, new);
+ if (old)
+ kfree_rcu(old, rcu_head);
+
+ return 0;
+}
+
+static int ieee80211_set_fils_discovery(struct ieee80211_sub_if_data *sdata,
+					struct cfg80211_fils_discovery *params,
+					struct ieee80211_link_data *link,
+					struct ieee80211_bss_conf *link_conf,
+					u64 *changed)
+{
+	struct fils_discovery_data *new = NULL, *old;
+	struct ieee80211_fils_discovery *fd = &link_conf->fils_discovery;
+
+	/*
+	 * Install the FILS discovery intervals and template from @params on
+	 * @link and set BSS_CHANGED_FILS_DISCOVERY in @changed.  The new
+	 * template is built before the old one is retired, so -ENOMEM leaves
+	 * the published template and intervals untouched.  Returns 0 or -ENOMEM.
+	 */
+	if (!params->update)
+		return 0;
+
+	if (params->tmpl && params->tmpl_len) {
+		new = kzalloc(sizeof(*new) + params->tmpl_len, GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+		new->len = params->tmpl_len;
+		memcpy(new->data, params->tmpl, params->tmpl_len);
+	}
+
+	fd->min_interval = params->min_interval;
+	fd->max_interval = params->max_interval;
+	old = sdata_dereference(link->u.ap.fils_discovery, sdata);
+	rcu_assign_pointer(link->u.ap.fils_discovery, new); /* NULL when no template */
+	if (old)
+		kfree_rcu(old, rcu_head); /* only after it is unpublished */
+	*changed |= BSS_CHANGED_FILS_DISCOVERY;
+	return 0;
+}
+
+static int
+ieee80211_set_unsol_bcast_probe_resp(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_unsol_bcast_probe_resp *params,
+ struct ieee80211_link_data *link,
+ struct ieee80211_bss_conf *link_conf,
+ u64 *changed)
+{
+ struct unsol_bcast_probe_resp_data *new, *old = NULL;
+
+ if (!params->update)
+ return 0;
+
+ link_conf->unsol_bcast_probe_resp_interval = params->interval;
- rcu_assign_pointer(sdata->u.ap.probe_resp, new);
+ old = sdata_dereference(link->u.ap.unsol_bcast_probe_resp, sdata);
if (old)
kfree_rcu(old, rcu_head);
+ if (params->tmpl && params->tmpl_len) {
+ new = kzalloc(sizeof(*new) + params->tmpl_len, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+ new->len = params->tmpl_len;
+ memcpy(new->data, params->tmpl, params->tmpl_len);
+ rcu_assign_pointer(link->u.ap.unsol_bcast_probe_resp, new);
+ } else {
+ RCU_INIT_POINTER(link->u.ap.unsol_bcast_probe_resp, NULL);
+ }
+
+ *changed |= BSS_CHANGED_UNSOL_BCAST_PROBE_RESP;
return 0;
}
-static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
- struct cfg80211_beacon_data *params,
- const struct ieee80211_csa_settings *csa)
+static int
+ieee80211_set_s1g_short_beacon(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_link_data *link,
+ struct cfg80211_s1g_short_beacon *params)
{
+ struct s1g_short_beacon_data *new;
+ struct s1g_short_beacon_data *old =
+ sdata_dereference(link->u.ap.s1g_short_beacon, sdata);
+ size_t new_len =
+ sizeof(*new) + params->short_head_len + params->short_tail_len;
+
+ if (!params->update)
+ return 0;
+
+ if (!params->short_head)
+ return -EINVAL;
+
+ new = kzalloc(new_len, GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ /* Memory layout: | struct | head | tail | */
+ new->short_head = (u8 *)new + sizeof(*new);
+ new->short_head_len = params->short_head_len;
+ memcpy(new->short_head, params->short_head, params->short_head_len);
+
+ if (params->short_tail) {
+ new->short_tail = new->short_head + params->short_head_len;
+ new->short_tail_len = params->short_tail_len;
+ memcpy(new->short_tail, params->short_tail,
+ params->short_tail_len);
+ }
+
+ rcu_assign_pointer(link->u.ap.s1g_short_beacon, new);
+
+ if (old)
+ kfree_rcu(old, rcu_head);
+
+ return 0;
+}
+
+static int ieee80211_set_ftm_responder_params(
+ struct ieee80211_sub_if_data *sdata,
+ const u8 *lci, size_t lci_len,
+ const u8 *civicloc, size_t civicloc_len,
+ struct ieee80211_bss_conf *link_conf)
+{ /* Replace link_conf->ftmr_params with a copy of @lci/@civicloc; 0 or -ENOMEM. */
+ struct ieee80211_ftm_responder_params *new, *old;
+ u8 *pos;
+ int len;
+
+ if (!lci_len && !civicloc_len) /* nothing supplied: existing params are kept */
+ return 0;
+
+ old = link_conf->ftmr_params;
+ len = lci_len + civicloc_len;
+
+ new = kzalloc(sizeof(*new) + len, GFP_KERNEL); /* one allocation: struct followed by both payloads */
+ if (!new)
+ return -ENOMEM;
+
+ pos = (u8 *)(new + 1); /* first byte after the struct */
+ if (lci_len) {
+ new->lci_len = lci_len;
+ new->lci = pos;
+ memcpy(pos, lci, lci_len);
+ pos += lci_len;
+ }
+
+ if (civicloc_len) {
+ new->civicloc_len = civicloc_len;
+ new->civicloc = pos;
+ memcpy(pos, civicloc, civicloc_len);
+ pos += civicloc_len;
+ }
+
+ link_conf->ftmr_params = new;
+ kfree(old); /* old params are freed only after the new ones are installed */
+
+ return 0;
+}
+
+static int
+ieee80211_copy_mbssid_beacon(u8 *pos, struct cfg80211_mbssid_elems *dst,
+ struct cfg80211_mbssid_elems *src)
+{ /* Copy every @src element's data to @pos back to back; returns bytes written. */
+ int i, offset = 0;
+
+ dst->cnt = src->cnt;
+ for (i = 0; i < src->cnt; i++) {
+ memcpy(pos + offset, src->elem[i].data, src->elem[i].len);
+ dst->elem[i].len = src->elem[i].len;
+ dst->elem[i].data = pos + offset; /* @dst entries point into the @pos buffer */
+ offset += dst->elem[i].len;
+ }
+
+ return offset;
+}
+
+static int
+ieee80211_copy_rnr_beacon(u8 *pos, struct cfg80211_rnr_elems *dst,
+ struct cfg80211_rnr_elems *src)
+{ /* Copy every @src element's data to @pos back to back; returns bytes written. */
+ int i, offset = 0;
+
+ dst->cnt = src->cnt;
+ for (i = 0; i < src->cnt; i++) {
+ memcpy(pos + offset, src->elem[i].data, src->elem[i].len);
+ dst->elem[i].len = src->elem[i].len;
+ dst->elem[i].data = pos + offset; /* @dst entries point into the @pos buffer */
+ offset += dst->elem[i].len;
+ }
+
+ return offset;
+}
+
+static int
+ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_link_data *link,
+ struct cfg80211_beacon_data *params,
+ const struct ieee80211_csa_settings *csa,
+ const struct ieee80211_color_change_settings *cca,
+ u64 *changed)
+{
+ struct cfg80211_mbssid_elems *mbssid = NULL;
+ struct cfg80211_rnr_elems *rnr = NULL;
struct beacon_data *new, *old;
int new_head_len, new_tail_len;
int size, err;
- u32 changed = BSS_CHANGED_BEACON;
-
- old = sdata_dereference(sdata->u.ap.beacon, sdata);
+ u64 _changed = BSS_CHANGED_BEACON;
+ struct ieee80211_bss_conf *link_conf = link->conf;
+ old = sdata_dereference(link->u.ap.beacon, sdata);
/* Need to have a beacon head if we don't have one yet */
if (!params->head && !old)
@@ -823,6 +1345,27 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
size = sizeof(*new) + new_head_len + new_tail_len;
+ /* new or old multiple BSSID elements? */
+ if (params->mbssid_ies) {
+ mbssid = params->mbssid_ies;
+ size += struct_size(new->mbssid_ies, elem, mbssid->cnt);
+ if (params->rnr_ies) {
+ rnr = params->rnr_ies;
+ size += struct_size(new->rnr_ies, elem, rnr->cnt);
+ }
+ size += ieee80211_get_mbssid_beacon_len(mbssid, rnr,
+ mbssid->cnt);
+ } else if (old && old->mbssid_ies) {
+ mbssid = old->mbssid_ies;
+ size += struct_size(new->mbssid_ies, elem, mbssid->cnt);
+ if (old && old->rnr_ies) {
+ rnr = old->rnr_ies;
+ size += struct_size(new->rnr_ies, elem, rnr->cnt);
+ }
+ size += ieee80211_get_mbssid_beacon_len(mbssid, rnr,
+ mbssid->cnt);
+ }
+
new = kzalloc(size, GFP_KERNEL);
if (!new)
return -ENOMEM;
@@ -831,18 +1374,41 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
/*
* pointers go into the block we allocated,
- * memory is | beacon_data | head | tail |
+ * memory is | beacon_data | head | tail | mbssid_ies | rnr_ies
*/
new->head = ((u8 *) new) + sizeof(*new);
new->tail = new->head + new_head_len;
new->head_len = new_head_len;
new->tail_len = new_tail_len;
+ /* copy in optional mbssid_ies */
+ if (mbssid) {
+ u8 *pos = new->tail + new->tail_len;
+
+ new->mbssid_ies = (void *)pos;
+ pos += struct_size(new->mbssid_ies, elem, mbssid->cnt);
+ pos += ieee80211_copy_mbssid_beacon(pos, new->mbssid_ies,
+ mbssid);
+ if (rnr) {
+ new->rnr_ies = (void *)pos;
+ pos += struct_size(new->rnr_ies, elem, rnr->cnt);
+ ieee80211_copy_rnr_beacon(pos, new->rnr_ies, rnr);
+ }
+ /* update bssid_indicator */
+ if (new->mbssid_ies->cnt && new->mbssid_ies->elem[0].len > 2)
+ link_conf->bssid_indicator =
+ *(new->mbssid_ies->elem[0].data + 2);
+ else
+ link_conf->bssid_indicator = 0;
+ }
if (csa) {
- new->csa_current_counter = csa->count;
- memcpy(new->csa_counter_offsets, csa->counter_offsets_beacon,
+ new->cntdwn_current_counter = csa->count;
+ memcpy(new->cntdwn_counter_offsets, csa->counter_offsets_beacon,
csa->n_counter_offsets_beacon *
- sizeof(new->csa_counter_offsets[0]));
+ sizeof(new->cntdwn_counter_offsets[0]));
+ } else if (cca) {
+ new->cntdwn_current_counter = cca->count;
+ new->cntdwn_counter_offsets[0] = cca->counter_offset_beacon;
}
/* copy in head */
@@ -859,18 +1425,63 @@ static int ieee80211_assign_beacon(struct ieee80211_sub_if_data *sdata,
memcpy(new->tail, old->tail, new_tail_len);
err = ieee80211_set_probe_resp(sdata, params->probe_resp,
- params->probe_resp_len, csa);
- if (err < 0)
+ params->probe_resp_len, csa, cca, link);
+ if (err < 0) {
+ kfree(new);
return err;
+ }
if (err == 0)
- changed |= BSS_CHANGED_AP_PROBE_RESP;
+ _changed |= BSS_CHANGED_AP_PROBE_RESP;
+
+ if (params->ftm_responder != -1) {
+ link_conf->ftm_responder = params->ftm_responder;
+ err = ieee80211_set_ftm_responder_params(sdata,
+ params->lci,
+ params->lci_len,
+ params->civicloc,
+ params->civicloc_len,
+ link_conf);
+
+ if (err < 0) {
+ kfree(new);
+ return err;
+ }
+
+ _changed |= BSS_CHANGED_FTM_RESPONDER;
+ }
- rcu_assign_pointer(sdata->u.ap.beacon, new);
+ rcu_assign_pointer(link->u.ap.beacon, new);
+ sdata->u.ap.active = true;
if (old)
kfree_rcu(old, rcu_head);
- return changed;
+ *changed |= _changed;
+ return 0;
+}
+
+static u8 ieee80211_num_beaconing_links(struct ieee80211_sub_if_data *sdata)
+{ /* Count the links of @sdata that currently have a beacon set. */
+ struct ieee80211_link_data *link;
+ u8 link_id, num = 0;
+
+ if (sdata->vif.type != NL80211_IFTYPE_AP &&
+ sdata->vif.type != NL80211_IFTYPE_P2P_GO)
+ return num; /* still 0: only AP and P2P GO interfaces are counted */
+
+ /* non-MLO mode of operation also uses link_id 0 in sdata so it is
+ * safe to directly proceed with the below loop
+ */
+ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
+ link = sdata_dereference(sdata->link[link_id], sdata);
+ if (!link)
+ continue;
+
+ if (sdata_dereference(link->u.ap.beacon, sdata)) /* non-NULL beacon pointer counts as beaconing */
+ num++;
+ }
+
+ return num;
}
static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
@@ -880,45 +1491,142 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
struct ieee80211_local *local = sdata->local;
struct beacon_data *old;
struct ieee80211_sub_if_data *vlan;
- u32 changed = BSS_CHANGED_BEACON_INT |
+ u64 changed = BSS_CHANGED_BEACON_INT |
BSS_CHANGED_BEACON_ENABLED |
BSS_CHANGED_BEACON |
- BSS_CHANGED_SSID |
BSS_CHANGED_P2P_PS |
- BSS_CHANGED_TXPOWER;
- int err;
+ BSS_CHANGED_TXPOWER |
+ BSS_CHANGED_TWT;
+ int i, err;
+ int prev_beacon_int;
+ unsigned int link_id = params->beacon.link_id;
+ struct ieee80211_link_data *link;
+ struct ieee80211_bss_conf *link_conf;
+ struct ieee80211_chan_req chanreq = { .oper = params->chandef };
+ u64 tsf;
+
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ link = sdata_dereference(sdata->link[link_id], sdata);
+ if (!link)
+ return -ENOLINK;
- old = sdata_dereference(sdata->u.ap.beacon, sdata);
+ link_conf = link->conf;
+
+ old = sdata_dereference(link->u.ap.beacon, sdata);
if (old)
return -EALREADY;
- switch (params->smps_mode) {
- case NL80211_SMPS_OFF:
- sdata->smps_mode = IEEE80211_SMPS_OFF;
- break;
- case NL80211_SMPS_STATIC:
- sdata->smps_mode = IEEE80211_SMPS_STATIC;
- break;
- case NL80211_SMPS_DYNAMIC:
- sdata->smps_mode = IEEE80211_SMPS_DYNAMIC;
- break;
- default:
- return -EINVAL;
- }
- sdata->u.ap.req_smps = sdata->smps_mode;
+ link->smps_mode = IEEE80211_SMPS_OFF;
+
+ link->needed_rx_chains = sdata->local->rx_chains;
+
+ prev_beacon_int = link_conf->beacon_int;
+ link_conf->beacon_int = params->beacon_interval;
+
+ if (params->ht_cap)
+ link_conf->ht_ldpc =
+ params->ht_cap->cap_info &
+ cpu_to_le16(IEEE80211_HT_CAP_LDPC_CODING);
+
+ if (params->vht_cap) {
+ link_conf->vht_ldpc =
+ params->vht_cap->vht_cap_info &
+ cpu_to_le32(IEEE80211_VHT_CAP_RXLDPC);
+ link_conf->vht_su_beamformer =
+ params->vht_cap->vht_cap_info &
+ cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE);
+ link_conf->vht_su_beamformee =
+ params->vht_cap->vht_cap_info &
+ cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE);
+ link_conf->vht_mu_beamformer =
+ params->vht_cap->vht_cap_info &
+ cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE);
+ link_conf->vht_mu_beamformee =
+ params->vht_cap->vht_cap_info &
+ cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE);
+ }
+
+ if (params->he_cap && params->he_oper) {
+ link_conf->he_support = true;
+ link_conf->htc_trig_based_pkt_ext =
+ le32_get_bits(params->he_oper->he_oper_params,
+ IEEE80211_HE_OPERATION_DFLT_PE_DURATION_MASK);
+ link_conf->frame_time_rts_th =
+ le32_get_bits(params->he_oper->he_oper_params,
+ IEEE80211_HE_OPERATION_RTS_THRESHOLD_MASK);
+ changed |= BSS_CHANGED_HE_OBSS_PD;
+
+ if (params->beacon.he_bss_color.enabled)
+ changed |= BSS_CHANGED_HE_BSS_COLOR;
+ }
+
+ if (params->he_cap) {
+ link_conf->he_ldpc =
+ params->he_cap->phy_cap_info[1] &
+ IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD;
+ link_conf->he_su_beamformer =
+ params->he_cap->phy_cap_info[3] &
+ IEEE80211_HE_PHY_CAP3_SU_BEAMFORMER;
+ link_conf->he_su_beamformee =
+ params->he_cap->phy_cap_info[4] &
+ IEEE80211_HE_PHY_CAP4_SU_BEAMFORMEE;
+ link_conf->he_mu_beamformer =
+ params->he_cap->phy_cap_info[4] &
+ IEEE80211_HE_PHY_CAP4_MU_BEAMFORMER;
+ link_conf->he_full_ul_mumimo =
+ params->he_cap->phy_cap_info[2] &
+ IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO;
+ }
+
+ if (params->eht_cap) {
+ if (!link_conf->he_support)
+ return -EOPNOTSUPP;
- sdata->needed_rx_chains = sdata->local->rx_chains;
+ link_conf->eht_support = true;
+
+ link_conf->eht_su_beamformer =
+ params->eht_cap->fixed.phy_cap_info[0] &
+ IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER;
+ link_conf->eht_su_beamformee =
+ params->eht_cap->fixed.phy_cap_info[0] &
+ IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE;
+ link_conf->eht_mu_beamformer =
+ params->eht_cap->fixed.phy_cap_info[7] &
+ (IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ |
+ IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ |
+ IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_320MHZ);
+ link_conf->eht_80mhz_full_bw_ul_mumimo =
+ params->eht_cap->fixed.phy_cap_info[7] &
+ (IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ |
+ IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ |
+ IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_320MHZ);
+ link_conf->eht_disable_mcs15 =
+ u8_get_bits(params->eht_oper->params,
+ IEEE80211_EHT_OPER_MCS15_DISABLE);
+ } else {
+ link_conf->eht_su_beamformer = false;
+ link_conf->eht_su_beamformee = false;
+ link_conf->eht_mu_beamformer = false;
+ }
- sdata->vif.bss_conf.beacon_int = params->beacon_interval;
+ if (sdata->vif.type == NL80211_IFTYPE_AP &&
+ params->mbssid_config.tx_wdev) {
+ err = ieee80211_set_ap_mbssid_options(sdata,
+ &params->mbssid_config,
+ link_conf);
+ if (err)
+ return err;
+ }
- mutex_lock(&local->mtx);
- err = ieee80211_vif_use_channel(sdata, &params->chandef,
- IEEE80211_CHANCTX_SHARED);
+ err = ieee80211_link_use_channel(link, &chanreq,
+ IEEE80211_CHANCTX_SHARED);
if (!err)
- ieee80211_vif_copy_chanctx_to_vlans(sdata, false);
- mutex_unlock(&local->mtx);
- if (err)
+ ieee80211_link_copy_chanctx_to_vlans(link, false);
+ if (err) {
+ link_conf->beacon_int = prev_beacon_int;
return err;
+ }
/*
* Apply control port protocol, this allows us to
@@ -928,9 +1636,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
sdata->control_port_no_encrypt = params->crypto.control_port_no_encrypt;
sdata->control_port_over_nl80211 =
params->crypto.control_port_over_nl80211;
- sdata->encrypt_headroom = ieee80211_cs_headroom(sdata->local,
- &params->crypto,
- sdata->vif.type);
+ sdata->control_port_no_preauth =
+ params->crypto.control_port_no_preauth;
list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) {
vlan->control_port_protocol =
@@ -939,203 +1646,296 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
params->crypto.control_port_no_encrypt;
vlan->control_port_over_nl80211 =
params->crypto.control_port_over_nl80211;
- vlan->encrypt_headroom =
- ieee80211_cs_headroom(sdata->local,
- &params->crypto,
- vlan->vif.type);
+ vlan->control_port_no_preauth =
+ params->crypto.control_port_no_preauth;
}
- sdata->vif.bss_conf.dtim_period = params->dtim_period;
- sdata->vif.bss_conf.enable_beacon = true;
- sdata->vif.bss_conf.allow_p2p_go_ps = sdata->vif.p2p;
+ link_conf->dtim_period = params->dtim_period;
+ link_conf->enable_beacon = true;
+ link_conf->allow_p2p_go_ps = sdata->vif.p2p;
+ link_conf->twt_responder = params->twt_responder;
+ link_conf->he_obss_pd = params->he_obss_pd;
+ link_conf->he_bss_color = params->beacon.he_bss_color;
+ link_conf->s1g_long_beacon_period = params->s1g_long_beacon_period;
+ sdata->vif.cfg.s1g = params->chandef.chan->band == NL80211_BAND_S1GHZ;
- sdata->vif.bss_conf.ssid_len = params->ssid_len;
+ sdata->vif.cfg.ssid_len = params->ssid_len;
if (params->ssid_len)
- memcpy(sdata->vif.bss_conf.ssid, params->ssid,
+ memcpy(sdata->vif.cfg.ssid, params->ssid,
params->ssid_len);
- sdata->vif.bss_conf.hidden_ssid =
+ link_conf->hidden_ssid =
(params->hidden_ssid != NL80211_HIDDEN_SSID_NOT_IN_USE);
- memset(&sdata->vif.bss_conf.p2p_noa_attr, 0,
- sizeof(sdata->vif.bss_conf.p2p_noa_attr));
- sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow =
+ memset(&link_conf->p2p_noa_attr, 0,
+ sizeof(link_conf->p2p_noa_attr));
+ link_conf->p2p_noa_attr.oppps_ctwindow =
params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK;
if (params->p2p_opp_ps)
- sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |=
+ link_conf->p2p_noa_attr.oppps_ctwindow |=
IEEE80211_P2P_OPPPS_ENABLE_BIT;
- err = ieee80211_assign_beacon(sdata, &params->beacon, NULL);
- if (err < 0) {
- ieee80211_vif_release_channel(sdata);
- return err;
+ sdata->beacon_rate_set = false;
+ if (wiphy_ext_feature_isset(local->hw.wiphy,
+ NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) {
+ for (i = 0; i < NUM_NL80211_BANDS; i++) {
+ sdata->beacon_rateidx_mask[i] =
+ params->beacon_rate.control[i].legacy;
+ if (sdata->beacon_rateidx_mask[i])
+ sdata->beacon_rate_set = true;
+ }
}
- changed |= err;
- err = drv_start_ap(sdata->local, sdata);
+ if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL))
+ link_conf->beacon_tx_rate = params->beacon_rate;
+
+ err = ieee80211_assign_beacon(sdata, link, &params->beacon, NULL, NULL,
+ &changed);
+ if (err < 0)
+ goto error;
+
+ err = ieee80211_set_fils_discovery(sdata, &params->fils_discovery,
+ link, link_conf, &changed);
+ if (err < 0)
+ goto error;
+
+ err = ieee80211_set_unsol_bcast_probe_resp(sdata,
+ &params->unsol_bcast_probe_resp,
+ link, link_conf, &changed);
+ if (err < 0)
+ goto error;
+
+ if (sdata->vif.cfg.s1g) {
+ err = ieee80211_set_s1g_short_beacon(sdata, link,
+ &params->s1g_short_beacon);
+ if (err < 0)
+ goto error;
+ }
+
+ err = drv_start_ap(sdata->local, sdata, link_conf);
if (err) {
- old = sdata_dereference(sdata->u.ap.beacon, sdata);
+ old = sdata_dereference(link->u.ap.beacon, sdata);
if (old)
kfree_rcu(old, rcu_head);
- RCU_INIT_POINTER(sdata->u.ap.beacon, NULL);
- ieee80211_vif_release_channel(sdata);
- return err;
+ RCU_INIT_POINTER(link->u.ap.beacon, NULL);
+
+ if (ieee80211_num_beaconing_links(sdata) == 0)
+ sdata->u.ap.active = false;
+
+ goto error;
}
- ieee80211_recalc_dtim(local, sdata);
- ieee80211_bss_info_change_notify(sdata, changed);
+ tsf = drv_get_tsf(local, sdata);
+ ieee80211_recalc_dtim(sdata, tsf);
+
+ if (link->u.ap.s1g_short_beacon)
+ ieee80211_recalc_sb_count(sdata, tsf);
+
+ ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_SSID);
+ ieee80211_link_info_change_notify(sdata, link, changed);
+
+ if (ieee80211_num_beaconing_links(sdata) <= 1)
+ netif_carrier_on(dev);
- netif_carrier_on(dev);
list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
netif_carrier_on(vlan->dev);
return 0;
+
+error:
+ ieee80211_link_release_channel(link);
+
+ return err;
}
static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev,
- struct cfg80211_beacon_data *params)
+ struct cfg80211_ap_update *params)
+
{
- struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_link_data *link;
+ struct cfg80211_beacon_data *beacon = &params->beacon;
struct beacon_data *old;
int err;
+ struct ieee80211_bss_conf *link_conf;
+ u64 changed = 0;
- sdata = IEEE80211_DEV_TO_SUB_IF(dev);
- sdata_assert_lock(sdata);
+ lockdep_assert_wiphy(wiphy);
+
+ link = sdata_dereference(sdata->link[beacon->link_id], sdata);
+ if (!link)
+ return -ENOLINK;
+
+ link_conf = link->conf;
- /* don't allow changing the beacon while CSA is in place - offset
+ /* don't allow changing the beacon while a countdown is in place - offset
* of channel switch counter may change
*/
- if (sdata->vif.csa_active)
+ if (link_conf->csa_active || link_conf->color_change_active)
return -EBUSY;
- old = sdata_dereference(sdata->u.ap.beacon, sdata);
+ old = sdata_dereference(link->u.ap.beacon, sdata);
if (!old)
return -ENOENT;
- err = ieee80211_assign_beacon(sdata, params, NULL);
+ err = ieee80211_assign_beacon(sdata, link, beacon, NULL, NULL,
+ &changed);
+ if (err < 0)
+ return err;
+
+ err = ieee80211_set_fils_discovery(sdata, &params->fils_discovery,
+ link, link_conf, &changed);
+ if (err < 0)
+ return err;
+
+ err = ieee80211_set_unsol_bcast_probe_resp(sdata,
+ &params->unsol_bcast_probe_resp,
+ link, link_conf, &changed);
if (err < 0)
return err;
- ieee80211_bss_info_change_notify(sdata, err);
+
+ if (link->u.ap.s1g_short_beacon) {
+ err = ieee80211_set_s1g_short_beacon(sdata, link,
+ &params->s1g_short_beacon);
+ if (err < 0)
+ return err;
+ }
+
+ if (beacon->he_bss_color_valid &&
+ beacon->he_bss_color.enabled != link_conf->he_bss_color.enabled) {
+ link_conf->he_bss_color.enabled = beacon->he_bss_color.enabled;
+ changed |= BSS_CHANGED_HE_BSS_COLOR;
+ }
+
+ ieee80211_link_info_change_notify(sdata, link, changed);
return 0;
}
-static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev)
+static void ieee80211_free_next_beacon(struct ieee80211_link_data *link)
+{
+ if (!link->u.ap.next_beacon)
+ return;
+
+ kfree(link->u.ap.next_beacon->mbssid_ies);
+ kfree(link->u.ap.next_beacon->rnr_ies);
+ kfree(link->u.ap.next_beacon);
+ link->u.ap.next_beacon = NULL;
+}
+
+static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
+ unsigned int link_id)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
struct ieee80211_sub_if_data *vlan;
struct ieee80211_local *local = sdata->local;
struct beacon_data *old_beacon;
struct probe_resp *old_probe_resp;
+ struct fils_discovery_data *old_fils_discovery;
+ struct unsol_bcast_probe_resp_data *old_unsol_bcast_probe_resp;
+ struct s1g_short_beacon_data *old_s1g_short_beacon;
struct cfg80211_chan_def chandef;
+ struct ieee80211_link_data *link =
+ sdata_dereference(sdata->link[link_id], sdata);
+ struct ieee80211_bss_conf *link_conf = link->conf;
+ LIST_HEAD(keys);
- sdata_assert_lock(sdata);
+ lockdep_assert_wiphy(local->hw.wiphy);
- old_beacon = sdata_dereference(sdata->u.ap.beacon, sdata);
+ old_beacon = sdata_dereference(link->u.ap.beacon, sdata);
if (!old_beacon)
return -ENOENT;
- old_probe_resp = sdata_dereference(sdata->u.ap.probe_resp, sdata);
-
- /* abort any running channel switch */
- mutex_lock(&local->mtx);
- sdata->vif.csa_active = false;
- if (sdata->csa_block_tx) {
- ieee80211_wake_vif_queues(local, sdata,
- IEEE80211_QUEUE_STOP_REASON_CSA);
- sdata->csa_block_tx = false;
- }
-
- mutex_unlock(&local->mtx);
-
- kfree(sdata->u.ap.next_beacon);
- sdata->u.ap.next_beacon = NULL;
+ old_probe_resp = sdata_dereference(link->u.ap.probe_resp,
+ sdata);
+ old_fils_discovery = sdata_dereference(link->u.ap.fils_discovery,
+ sdata);
+ old_unsol_bcast_probe_resp =
+ sdata_dereference(link->u.ap.unsol_bcast_probe_resp,
+ sdata);
+ old_s1g_short_beacon =
+ sdata_dereference(link->u.ap.s1g_short_beacon, sdata);
+
+ /* abort any running channel switch or color change */
+ link_conf->csa_active = false;
+ link_conf->color_change_active = false;
+ ieee80211_vif_unblock_queues_csa(sdata);
+
+ ieee80211_free_next_beacon(link);
/* turn off carrier for this interface and dependent VLANs */
list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
netif_carrier_off(vlan->dev);
- netif_carrier_off(dev);
+
+ if (ieee80211_num_beaconing_links(sdata) <= 1) {
+ netif_carrier_off(dev);
+ sdata->u.ap.active = false;
+ }
/* remove beacon and probe response */
- RCU_INIT_POINTER(sdata->u.ap.beacon, NULL);
- RCU_INIT_POINTER(sdata->u.ap.probe_resp, NULL);
+ RCU_INIT_POINTER(link->u.ap.beacon, NULL);
+ RCU_INIT_POINTER(link->u.ap.probe_resp, NULL);
+ RCU_INIT_POINTER(link->u.ap.fils_discovery, NULL);
+ RCU_INIT_POINTER(link->u.ap.unsol_bcast_probe_resp, NULL);
+ RCU_INIT_POINTER(link->u.ap.s1g_short_beacon, NULL);
kfree_rcu(old_beacon, rcu_head);
if (old_probe_resp)
kfree_rcu(old_probe_resp, rcu_head);
- sdata->u.ap.driver_smps_mode = IEEE80211_SMPS_OFF;
+ if (old_fils_discovery)
+ kfree_rcu(old_fils_discovery, rcu_head);
+ if (old_unsol_bcast_probe_resp)
+ kfree_rcu(old_unsol_bcast_probe_resp, rcu_head);
+ if (old_s1g_short_beacon)
+ kfree_rcu(old_s1g_short_beacon, rcu_head);
+
+ kfree(link_conf->ftmr_params);
+ link_conf->ftmr_params = NULL;
+
+ link_conf->bssid_index = 0;
+ link_conf->nontransmitted = false;
+ link_conf->ema_ap = false;
+ link_conf->bssid_indicator = 0;
+ link_conf->fils_discovery.min_interval = 0;
+ link_conf->fils_discovery.max_interval = 0;
+ link_conf->unsol_bcast_probe_resp_interval = 0;
+
+ __sta_info_flush(sdata, true, link_id, NULL);
+
+ ieee80211_remove_link_keys(link, &keys);
+ if (!list_empty(&keys)) {
+ synchronize_net();
+ ieee80211_free_key_list(local, &keys);
+ }
- __sta_info_flush(sdata, true);
- ieee80211_free_keys(sdata, true);
+ ieee80211_stop_mbssid(sdata);
+ RCU_INIT_POINTER(link_conf->tx_bss_conf, NULL);
- sdata->vif.bss_conf.enable_beacon = false;
- sdata->vif.bss_conf.ssid_len = 0;
+ link_conf->enable_beacon = false;
+ sdata->beacon_rate_set = false;
+ sdata->vif.cfg.ssid_len = 0;
+ sdata->vif.cfg.s1g = false;
clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state);
- ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED);
+ ieee80211_link_info_change_notify(sdata, link,
+ BSS_CHANGED_BEACON_ENABLED);
- if (sdata->wdev.cac_started) {
- chandef = sdata->vif.bss_conf.chandef;
- cancel_delayed_work_sync(&sdata->dfs_cac_timer_work);
+ if (sdata->wdev.links[link_id].cac_started) {
+ chandef = link_conf->chanreq.oper;
+ wiphy_delayed_work_cancel(wiphy, &link->dfs_cac_timer_work);
cfg80211_cac_event(sdata->dev, &chandef,
NL80211_RADAR_CAC_ABORTED,
- GFP_KERNEL);
+ GFP_KERNEL, link_id);
}
- drv_stop_ap(sdata->local, sdata);
+ drv_stop_ap(sdata->local, sdata, link_conf);
/* free all potentially still buffered bcast frames */
local->total_ps_buffered -= skb_queue_len(&sdata->u.ap.ps.bc_buf);
ieee80211_purge_tx_queue(&local->hw, &sdata->u.ap.ps.bc_buf);
- mutex_lock(&local->mtx);
- ieee80211_vif_copy_chanctx_to_vlans(sdata, true);
- ieee80211_vif_release_channel(sdata);
- mutex_unlock(&local->mtx);
+ ieee80211_link_copy_chanctx_to_vlans(link, true);
+ ieee80211_link_release_channel(link);
return 0;
}
-/* Layer 2 Update frame (802.2 Type 1 LLC XID Update response) */
-struct iapp_layer2_update {
- u8 da[ETH_ALEN]; /* broadcast */
- u8 sa[ETH_ALEN]; /* STA addr */
- __be16 len; /* 6 */
- u8 dsap; /* 0 */
- u8 ssap; /* 0 */
- u8 control;
- u8 xid_info[3];
-} __packed;
-
-static void ieee80211_send_layer2_update(struct sta_info *sta)
-{
- struct iapp_layer2_update *msg;
- struct sk_buff *skb;
-
- /* Send Level 2 Update Frame to update forwarding tables in layer 2
- * bridge devices */
-
- skb = dev_alloc_skb(sizeof(*msg));
- if (!skb)
- return;
- msg = skb_put(skb, sizeof(*msg));
-
- /* 802.2 Type 1 Logical Link Control (LLC) Exchange Identifier (XID)
- * Update response frame; IEEE Std 802.2-1998, 5.4.1.2.1 */
-
- eth_broadcast_addr(msg->da);
- memcpy(msg->sa, sta->sta.addr, ETH_ALEN);
- msg->len = htons(6);
- msg->dsap = 0;
- msg->ssap = 0x01; /* NULL LSAP, CR Bit: Response */
- msg->control = 0xaf; /* XID response lsb.1111F101.
- * F=0 (no poll command; unsolicited frame) */
- msg->xid_info[0] = 0x81; /* XID format identifier */
- msg->xid_info[1] = 1; /* LLC types/classes: Type 1 LLC */
- msg->xid_info[2] = 0; /* XID sender's receive window size (RW) */
-
- skb->dev = sta->sdata->dev;
- skb->protocol = eth_type_trans(skb, sta->sdata->dev);
- memset(skb->cb, 0, sizeof(skb->cb));
- netif_rx_ni(skb);
-}
-
static int sta_apply_auth_flags(struct ieee80211_local *local,
struct sta_info *sta,
u32 mask, u32 set)
@@ -1159,7 +1959,7 @@ static int sta_apply_auth_flags(struct ieee80211_local *local,
* before drv_sta_state() is called.
*/
if (!test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
- rate_control_rate_init(sta);
+ rate_control_rate_init_all_links(sta);
ret = sta_info_move_state(sta, IEEE80211_STA_ASSOC);
if (ret)
@@ -1202,7 +2002,7 @@ static void sta_apply_mesh_params(struct ieee80211_local *local,
{
#ifdef CONFIG_MAC80211_MESH
struct ieee80211_sub_if_data *sdata = sta->sdata;
- u32 changed = 0;
+ u64 changed = 0;
if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) {
switch (params->plink_state) {
@@ -1215,6 +2015,11 @@ static void sta_apply_mesh_params(struct ieee80211_local *local,
ieee80211_mps_sta_status_update(sta);
changed |= ieee80211_mps_set_sta_local_pm(sta,
sdata->u.mesh.mshcfg.power_mode);
+
+ ewma_mesh_tx_rate_avg_init(&sta->mesh->tx_rate_avg);
+ /* init at low value */
+ ewma_mesh_tx_rate_avg_add(&sta->mesh->tx_rate_avg, 10);
+
break;
case NL80211_PLINK_LISTEN:
case NL80211_PLINK_BLOCKED:
@@ -1256,19 +2061,151 @@ static void sta_apply_mesh_params(struct ieee80211_local *local,
#endif
}
-static int sta_apply_parameters(struct ieee80211_local *local,
- struct sta_info *sta,
- struct station_parameters *params)
+enum sta_link_apply_mode {
+ STA_LINK_MODE_NEW,
+ STA_LINK_MODE_STA_MODIFY,
+ STA_LINK_MODE_LINK_MODIFY,
+};
+
+static int sta_link_apply_parameters(struct ieee80211_local *local,
+ struct sta_info *sta,
+ enum sta_link_apply_mode mode,
+ struct link_station_parameters *params)
{
- int ret = 0;
struct ieee80211_supported_band *sband;
struct ieee80211_sub_if_data *sdata = sta->sdata;
- u32 mask, set;
+ u32 link_id = params->link_id < 0 ? 0 : params->link_id;
+ struct ieee80211_link_data *link =
+ sdata_dereference(sdata->link[link_id], sdata);
+ struct link_sta_info *link_sta =
+ rcu_dereference_protected(sta->link[link_id],
+ lockdep_is_held(&local->hw.wiphy->mtx));
+ bool changes = params->link_mac ||
+ params->txpwr_set ||
+ params->supported_rates_len ||
+ params->ht_capa ||
+ params->vht_capa ||
+ params->he_capa ||
+ params->eht_capa ||
+ params->s1g_capa ||
+ params->opmode_notif_used;
+
+ switch (mode) {
+ case STA_LINK_MODE_NEW:
+ if (!params->link_mac)
+ return -EINVAL;
+ break;
+ case STA_LINK_MODE_LINK_MODIFY:
+ break;
+ case STA_LINK_MODE_STA_MODIFY:
+ if (params->link_id >= 0)
+ break;
+ if (!changes)
+ return 0;
+ break;
+ }
- sband = ieee80211_get_sband(sdata);
+ if (!link || !link_sta)
+ return -EINVAL;
+
+ sband = ieee80211_get_link_sband(link);
if (!sband)
return -EINVAL;
+ if (params->link_mac) {
+ if (mode == STA_LINK_MODE_NEW) {
+ memcpy(link_sta->addr, params->link_mac, ETH_ALEN);
+ memcpy(link_sta->pub->addr, params->link_mac, ETH_ALEN);
+ } else if (!ether_addr_equal(link_sta->addr,
+ params->link_mac)) {
+ return -EINVAL;
+ }
+ }
+
+ if (params->txpwr_set) {
+ int ret;
+
+ link_sta->pub->txpwr.type = params->txpwr.type;
+ if (params->txpwr.type == NL80211_TX_POWER_LIMITED)
+ link_sta->pub->txpwr.power = params->txpwr.power;
+ ret = drv_sta_set_txpwr(local, sdata, sta);
+ if (ret)
+ return ret;
+ }
+
+ if (params->supported_rates &&
+ params->supported_rates_len &&
+ !ieee80211_parse_bitrates(link->conf->chanreq.oper.width,
+ sband, params->supported_rates,
+ params->supported_rates_len,
+ &link_sta->pub->supp_rates[sband->band]))
+ return -EINVAL;
+
+ if (params->ht_capa)
+ ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
+ params->ht_capa, link_sta);
+
+ /* VHT can override some HT caps such as the A-MSDU max length */
+ if (params->vht_capa)
+ ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
+ params->vht_capa, NULL,
+ link_sta);
+
+ if (params->he_capa)
+ ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband,
+ (void *)params->he_capa,
+ params->he_capa_len,
+ (void *)params->he_6ghz_capa,
+ link_sta);
+
+ if (params->he_capa && params->eht_capa)
+ ieee80211_eht_cap_ie_to_sta_eht_cap(sdata, sband,
+ (u8 *)params->he_capa,
+ params->he_capa_len,
+ params->eht_capa,
+ params->eht_capa_len,
+ link_sta);
+
+ if (params->s1g_capa)
+ ieee80211_s1g_cap_to_sta_s1g_cap(sdata, params->s1g_capa,
+ link_sta);
+
+ ieee80211_sta_init_nss(link_sta);
+
+ if (params->opmode_notif_used) {
+ enum nl80211_chan_width width = link->conf->chanreq.oper.width;
+
+ switch (width) {
+ case NL80211_CHAN_WIDTH_20:
+ case NL80211_CHAN_WIDTH_40:
+ case NL80211_CHAN_WIDTH_80:
+ case NL80211_CHAN_WIDTH_160:
+ case NL80211_CHAN_WIDTH_80P80:
+ case NL80211_CHAN_WIDTH_320: /* not VHT, allowed for HE/EHT */
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* returned value is only needed for rc update, but the
+ * rc isn't initialized here yet, so ignore it
+ */
+ __ieee80211_vht_handle_opmode(sdata, link_sta,
+ params->opmode_notif,
+ sband->band);
+ }
+
+ return 0;
+}
+
+static int sta_apply_parameters(struct ieee80211_local *local,
+ struct sta_info *sta,
+ struct station_parameters *params)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ u32 mask, set;
+ int ret = 0;
+
mask = params->sta_flags_mask;
set = params->sta_flags_set;
@@ -1300,7 +2237,7 @@ static int sta_apply_parameters(struct ieee80211_local *local,
sta->sta.wme = set & BIT(NL80211_STA_FLAG_WME);
/* auth flags will be set later for TDLS,
- * and for unassociated stations that move to assocaited */
+ * and for unassociated stations that move to associated */
if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) &&
!((mask & BIT(NL80211_STA_FLAG_ASSOCIATED)) &&
(set & BIT(NL80211_STA_FLAG_ASSOCIATED)))) {
@@ -1331,9 +2268,12 @@ static int sta_apply_parameters(struct ieee80211_local *local,
clear_sta_flag(sta, WLAN_STA_TDLS_PEER);
}
+ if (mask & BIT(NL80211_STA_FLAG_SPP_AMSDU))
+ sta->sta.spp_amsdu = set & BIT(NL80211_STA_FLAG_SPP_AMSDU);
+
/* mark TDLS channel switch support, if the AP allows it */
if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) &&
- !sdata->u.mgd.tdls_chan_switch_prohibited &&
+ !sdata->deflink.u.mgd.tdls_chan_switch_prohibited &&
params->ext_capab_len >= 4 &&
params->ext_capab[3] & WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH)
set_sta_flag(sta, WLAN_STA_TDLS_CHAN_SWITCH);
@@ -1350,40 +2290,21 @@ static int sta_apply_parameters(struct ieee80211_local *local,
sta->sta.max_sp = params->max_sp;
}
- /* The sender might not have sent the last bit, consider it to be 0 */
- if (params->ext_capab_len >= 8) {
- u8 val = (params->ext_capab[7] &
- WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB) >> 7;
-
- /* we did get all the bits, take the MSB as well */
- if (params->ext_capab_len >= 9) {
- u8 val_msb = params->ext_capab[8] &
- WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB;
- val_msb <<= 1;
- val |= val_msb;
- }
-
- switch (val) {
- case 1:
- sta->sta.max_amsdu_subframes = 32;
- break;
- case 2:
- sta->sta.max_amsdu_subframes = 16;
- break;
- case 3:
- sta->sta.max_amsdu_subframes = 8;
- break;
- default:
- sta->sta.max_amsdu_subframes = 0;
- }
- }
+ ieee80211_sta_set_max_amsdu_subframes(sta, params->ext_capab,
+ params->ext_capab_len);
/*
* cfg80211 validates this (1-2007) and allows setting the AID
- * only when creating a new station entry
+ * only when creating a new station entry. For S1G APs, the current
+ * implementation supports a maximum of 1600 AIDs.
*/
- if (params->aid)
+ if (params->aid) {
+ if (sdata->vif.cfg.s1g &&
+ params->aid > IEEE80211_MAX_SUPPORTED_S1G_AID)
+ return -EINVAL;
+
sta->sta.aid = params->aid;
+ }
/*
* Some of the following updates would be racy if called on an
@@ -1396,34 +2317,13 @@ static int sta_apply_parameters(struct ieee80211_local *local,
if (params->listen_interval >= 0)
sta->listen_interval = params->listen_interval;
- if (params->supported_rates) {
- ieee80211_parse_bitrates(&sdata->vif.bss_conf.chandef,
- sband, params->supported_rates,
- params->supported_rates_len,
- &sta->sta.supp_rates[sband->band]);
- }
-
- if (params->ht_capa)
- ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
- params->ht_capa, sta);
-
- /* VHT can override some HT caps such as the A-MSDU max length */
- if (params->vht_capa)
- ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
- params->vht_capa, sta);
+ if (params->eml_cap_present)
+ sta->sta.eml_cap = params->eml_cap;
- if (params->he_capa)
- ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband,
- (void *)params->he_capa,
- params->he_capa_len, sta);
-
- if (params->opmode_notif_used) {
- /* returned value is only needed for rc update, but the
- * rc isn't initialized here yet, so ignore it
- */
- __ieee80211_vht_handle_opmode(sdata, sta, params->opmode_notif,
- sband->band);
- }
+ ret = sta_link_apply_parameters(local, sta, STA_LINK_MODE_STA_MODIFY,
+ &params->link_sta_params);
+ if (ret)
+ return ret;
if (params->support_p2p_ps >= 0)
sta->sta.support_p2p_ps = params->support_p2p_ps;
@@ -1431,6 +2331,9 @@ static int sta_apply_parameters(struct ieee80211_local *local,
if (ieee80211_vif_is_mesh(&sdata->vif))
sta_apply_mesh_params(local, sta, params);
+ if (params->airtime_weight)
+ sta->airtime_weight = params->airtime_weight;
+
/* set the STA state after all sta info from usermode has been set */
if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) ||
set & BIT(NL80211_STA_FLAG_ASSOCIATED)) {
@@ -1439,6 +2342,10 @@ static int sta_apply_parameters(struct ieee80211_local *local,
return ret;
}
+ /* Mark the STA as MLO if MLD MAC address is available */
+ if (params->link_sta_params.mld_mac)
+ sta->sta.mlo = true;
+
return 0;
}
@@ -1450,7 +2357,8 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
struct sta_info *sta;
struct ieee80211_sub_if_data *sdata;
int err;
- int layer2_update;
+
+ lockdep_assert_wiphy(local->hw.wiphy);
if (params->vlan) {
sdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);
@@ -1464,16 +2372,37 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
if (ether_addr_equal(mac, sdata->vif.addr))
return -EINVAL;
- if (is_multicast_ether_addr(mac))
+ if (!is_valid_ether_addr(mac))
+ return -EINVAL;
+
+ if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER) &&
+ sdata->vif.type == NL80211_IFTYPE_STATION &&
+ !sdata->u.mgd.associated)
return -EINVAL;
- sta = sta_info_alloc(sdata, mac, GFP_KERNEL);
+ /*
+ * If we have a link ID, it can be a non-MLO station on an AP MLD,
+ * but we need to have a link_mac in that case as well, so use the
+ * STA's MAC address in that case.
+ */
+ if (params->link_sta_params.link_id >= 0)
+ sta = sta_info_alloc_with_link(sdata, mac,
+ params->link_sta_params.link_id,
+ params->link_sta_params.link_mac ?: mac,
+ GFP_KERNEL);
+ else
+ sta = sta_info_alloc(sdata, mac, GFP_KERNEL);
+
if (!sta)
return -ENOMEM;
if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))
sta->sta.tdls = true;
+ /* Though the mutex is not needed here (since the station is not
+ * visible yet), sta_apply_parameters (and inner functions) require
+ * the mutex due to other paths.
+ */
err = sta_apply_parameters(local, sta, params);
if (err) {
sta_info_free(local, sta);
@@ -1487,23 +2416,9 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
*/
if (!test_sta_flag(sta, WLAN_STA_TDLS_PEER) &&
test_sta_flag(sta, WLAN_STA_ASSOC))
- rate_control_rate_init(sta);
-
- layer2_update = sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
- sdata->vif.type == NL80211_IFTYPE_AP;
-
- err = sta_info_insert_rcu(sta);
- if (err) {
- rcu_read_unlock();
- return err;
- }
-
- if (layer2_update)
- ieee80211_send_layer2_update(sta);
+ rate_control_rate_init_all_links(sta);
- rcu_read_unlock();
-
- return 0;
+ return sta_info_insert(sta);
}
static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev,
@@ -1516,7 +2431,7 @@ static int ieee80211_del_station(struct wiphy *wiphy, struct net_device *dev,
if (params->mac)
return sta_info_destroy_addr_bss(sdata, params->mac);
- sta_info_flush(sdata);
+ sta_info_flush(sdata, params->link_id);
return 0;
}
@@ -1531,13 +2446,11 @@ static int ieee80211_change_station(struct wiphy *wiphy,
enum cfg80211_station_type statype;
int err;
- mutex_lock(&local->sta_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
sta = sta_info_get_bss(sdata, mac);
- if (!sta) {
- err = -ENOENT;
- goto out_err;
- }
+ if (!sta)
+ return -ENOENT;
switch (sdata->vif.type) {
case NL80211_IFTYPE_MESH_POINT:
@@ -1567,25 +2480,23 @@ static int ieee80211_change_station(struct wiphy *wiphy,
statype = CFG80211_STA_AP_CLIENT_UNASSOC;
break;
default:
- err = -EOPNOTSUPP;
- goto out_err;
+ return -EOPNOTSUPP;
}
err = cfg80211_check_station_change(wiphy, params, statype);
if (err)
- goto out_err;
+ return err;
if (params->vlan && params->vlan != sta->sdata->dev) {
vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan);
if (params->vlan->ieee80211_ptr->use_4addr) {
- if (vlansdata->u.vlan.sta) {
- err = -EBUSY;
- goto out_err;
- }
+ if (vlansdata->u.vlan.sta)
+ return -EBUSY;
rcu_assign_pointer(vlansdata->u.vlan.sta, sta);
__ieee80211_check_fast_rx_iface(vlansdata);
+ drv_sta_set_4addr(local, sta->sdata, &sta->sta, true);
}
if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
@@ -1596,33 +2507,19 @@ static int ieee80211_change_station(struct wiphy *wiphy,
ieee80211_vif_dec_num_mcast(sta->sdata);
sta->sdata = vlansdata;
+ ieee80211_check_fast_rx(sta);
ieee80211_check_fast_xmit(sta);
- if (test_sta_flag(sta, WLAN_STA_AUTHORIZED))
+ if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) {
ieee80211_vif_inc_num_mcast(sta->sdata);
-
- ieee80211_send_layer2_update(sta);
+ cfg80211_send_layer2_update(sta->sdata->dev,
+ sta->sta.addr);
+ }
}
err = sta_apply_parameters(local, sta, params);
if (err)
- goto out_err;
-
- mutex_unlock(&local->sta_mtx);
-
- if ((sdata->vif.type == NL80211_IFTYPE_AP ||
- sdata->vif.type == NL80211_IFTYPE_AP_VLAN) &&
- sta->known_smps_mode != sta->sdata->bss->req_smps &&
- test_sta_flag(sta, WLAN_STA_AUTHORIZED) &&
- sta_info_tx_streams(sta) != 1) {
- ht_dbg(sta->sdata,
- "%pM just authorized and MIMO capable - update SMPS\n",
- sta->sta.addr);
- ieee80211_send_smps_action(sta->sdata,
- sta->sdata->bss->req_smps,
- sta->sta.addr,
- sta->sdata->vif.bss_conf.bssid);
- }
+ return err;
if (sdata->vif.type == NL80211_IFTYPE_STATION &&
params->sta_flags_mask & BIT(NL80211_STA_FLAG_AUTHORIZED)) {
@@ -1631,9 +2528,6 @@ static int ieee80211_change_station(struct wiphy *wiphy,
}
return 0;
-out_err:
- mutex_unlock(&local->sta_mtx);
- return err;
}
#ifdef CONFIG_MAC80211_MESH
@@ -1726,7 +2620,9 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
MPATH_INFO_EXPTIME |
MPATH_INFO_DISCOVERY_TIMEOUT |
MPATH_INFO_DISCOVERY_RETRIES |
- MPATH_INFO_FLAGS;
+ MPATH_INFO_FLAGS |
+ MPATH_INFO_HOP_COUNT |
+ MPATH_INFO_PATH_CHANGE;
pinfo->frame_qlen = mpath->frame_queue.qlen;
pinfo->sn = mpath->sn;
@@ -1746,6 +2642,8 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop,
pinfo->flags |= NL80211_MPATH_FLAG_FIXED;
if (mpath->flags & MESH_PATH_RESOLVED)
pinfo->flags |= NL80211_MPATH_FLAG_RESOLVED;
+ pinfo->hop_count = mpath->hop_count;
+ pinfo->path_change_count = mpath->path_change_count;
}
static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev,
@@ -1861,13 +2759,12 @@ static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh,
const struct mesh_setup *setup)
{
u8 *new_ie;
- const u8 *old_ie;
struct ieee80211_sub_if_data *sdata = container_of(ifmsh,
struct ieee80211_sub_if_data, u.mesh);
+ int i;
/* allocate information elements */
new_ie = NULL;
- old_ie = ifmsh->ie;
if (setup->ie_len) {
new_ie = kmemdup(setup->ie, setup->ie_len,
@@ -1877,7 +2774,6 @@ static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh,
}
ifmsh->ie_len = setup->ie_len;
ifmsh->ie = new_ie;
- kfree(old_ie);
/* now copy the rest of the setup parameters */
ifmsh->mesh_id_len = setup->mesh_id_len;
@@ -1902,6 +2798,17 @@ static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh,
sdata->vif.bss_conf.beacon_int = setup->beacon_interval;
sdata->vif.bss_conf.dtim_period = setup->dtim_period;
+ sdata->beacon_rate_set = false;
+ if (wiphy_ext_feature_isset(sdata->local->hw.wiphy,
+ NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) {
+ for (i = 0; i < NUM_NL80211_BANDS; i++) {
+ sdata->beacon_rateidx_mask[i] =
+ setup->beacon_rate.control[i].legacy;
+ if (sdata->beacon_rateidx_mask[i])
+ sdata->beacon_rate_set = true;
+ }
+ }
+
return 0;
}
@@ -1986,13 +2893,14 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy,
* devices that report signal in dBm.
*/
if (!ieee80211_hw_check(&sdata->local->hw, SIGNAL_DBM))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
conf->rssi_threshold = nconf->rssi_threshold;
}
if (_chg_mesh_attr(NL80211_MESHCONF_HT_OPMODE, mask)) {
conf->ht_opmode = nconf->ht_opmode;
sdata->vif.bss_conf.ht_operation_mode = nconf->ht_opmode;
- ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_HT);
+ ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+ BSS_CHANGED_HT);
}
if (_chg_mesh_attr(NL80211_MESHCONF_HWMP_PATH_TO_ROOT_TIMEOUT, mask))
conf->dot11MeshHWMPactivePathToRootTimeout =
@@ -2012,6 +2920,14 @@ static int ieee80211_update_mesh_config(struct wiphy *wiphy,
nconf->dot11MeshAwakeWindowDuration;
if (_chg_mesh_attr(NL80211_MESHCONF_PLINK_TIMEOUT, mask))
conf->plink_timeout = nconf->plink_timeout;
+ if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_GATE, mask))
+ conf->dot11MeshConnectedToMeshGate =
+ nconf->dot11MeshConnectedToMeshGate;
+ if (_chg_mesh_attr(NL80211_MESHCONF_NOLEARN, mask))
+ conf->dot11MeshNolearn = nconf->dot11MeshNolearn;
+ if (_chg_mesh_attr(NL80211_MESHCONF_CONNECTED_TO_AS, mask))
+ conf->dot11MeshConnectedToAuthServer =
+ nconf->dot11MeshConnectedToAuthServer;
ieee80211_mbss_info_change_notify(sdata, BSS_CHANGED_BEACON);
return 0;
}
@@ -2021,9 +2937,12 @@ static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev,
const struct mesh_setup *setup)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_chan_req chanreq = { .oper = setup->chandef };
struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
int err;
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
memcpy(&ifmsh->mshcfg, conf, sizeof(struct mesh_config));
err = copy_mesh_setup(ifmsh, setup);
if (err)
@@ -2032,13 +2951,11 @@ static int ieee80211_join_mesh(struct wiphy *wiphy, struct net_device *dev,
sdata->control_port_over_nl80211 = setup->control_port_over_nl80211;
/* can mesh use other SMPS modes? */
- sdata->smps_mode = IEEE80211_SMPS_OFF;
- sdata->needed_rx_chains = sdata->local->rx_chains;
+ sdata->deflink.smps_mode = IEEE80211_SMPS_OFF;
+ sdata->deflink.needed_rx_chains = sdata->local->rx_chains;
- mutex_lock(&sdata->local->mtx);
- err = ieee80211_vif_use_channel(sdata, &setup->chandef,
- IEEE80211_CHANCTX_SHARED);
- mutex_unlock(&sdata->local->mtx);
+ err = ieee80211_link_use_channel(&sdata->deflink, &chanreq,
+ IEEE80211_CHANCTX_SHARED);
if (err)
return err;
@@ -2049,10 +2966,11 @@ static int ieee80211_leave_mesh(struct wiphy *wiphy, struct net_device *dev)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
ieee80211_stop_mesh(sdata);
- mutex_lock(&sdata->local->mtx);
- ieee80211_vif_release_channel(sdata);
- mutex_unlock(&sdata->local->mtx);
+ ieee80211_link_release_channel(&sdata->deflink);
+ kfree(sdata->u.mesh.ie);
return 0;
}
@@ -2063,48 +2981,53 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
struct bss_parameters *params)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_link_data *link;
struct ieee80211_supported_band *sband;
- u32 changed = 0;
+ u64 changed = 0;
+
+ link = ieee80211_link_or_deflink(sdata, params->link_id, true);
+ if (IS_ERR(link))
+ return PTR_ERR(link);
- if (!sdata_dereference(sdata->u.ap.beacon, sdata))
+ if (!sdata_dereference(link->u.ap.beacon, sdata))
return -ENOENT;
- sband = ieee80211_get_sband(sdata);
+ sband = ieee80211_get_link_sband(link);
if (!sband)
return -EINVAL;
+ if (params->basic_rates) {
+ if (!ieee80211_parse_bitrates(link->conf->chanreq.oper.width,
+ wiphy->bands[sband->band],
+ params->basic_rates,
+ params->basic_rates_len,
+ &link->conf->basic_rates))
+ return -EINVAL;
+ changed |= BSS_CHANGED_BASIC_RATES;
+ ieee80211_check_rate_mask(link);
+ }
+
if (params->use_cts_prot >= 0) {
- sdata->vif.bss_conf.use_cts_prot = params->use_cts_prot;
+ link->conf->use_cts_prot = params->use_cts_prot;
changed |= BSS_CHANGED_ERP_CTS_PROT;
}
if (params->use_short_preamble >= 0) {
- sdata->vif.bss_conf.use_short_preamble =
- params->use_short_preamble;
+ link->conf->use_short_preamble = params->use_short_preamble;
changed |= BSS_CHANGED_ERP_PREAMBLE;
}
- if (!sdata->vif.bss_conf.use_short_slot &&
- sband->band == NL80211_BAND_5GHZ) {
- sdata->vif.bss_conf.use_short_slot = true;
+ if (!link->conf->use_short_slot &&
+ (sband->band == NL80211_BAND_5GHZ ||
+ sband->band == NL80211_BAND_6GHZ)) {
+ link->conf->use_short_slot = true;
changed |= BSS_CHANGED_ERP_SLOT;
}
if (params->use_short_slot_time >= 0) {
- sdata->vif.bss_conf.use_short_slot =
- params->use_short_slot_time;
+ link->conf->use_short_slot = params->use_short_slot_time;
changed |= BSS_CHANGED_ERP_SLOT;
}
- if (params->basic_rates) {
- ieee80211_parse_bitrates(&sdata->vif.bss_conf.chandef,
- wiphy->bands[sband->band],
- params->basic_rates,
- params->basic_rates_len,
- &sdata->vif.bss_conf.basic_rates);
- changed |= BSS_CHANGED_BASIC_RATES;
- ieee80211_check_rate_mask(sdata);
- }
-
if (params->ap_isolate >= 0) {
if (params->ap_isolate)
sdata->flags |= IEEE80211_SDATA_DONT_BRIDGE_PACKETS;
@@ -2114,30 +3037,29 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
}
if (params->ht_opmode >= 0) {
- sdata->vif.bss_conf.ht_operation_mode =
- (u16) params->ht_opmode;
+ link->conf->ht_operation_mode = (u16)params->ht_opmode;
changed |= BSS_CHANGED_HT;
}
if (params->p2p_ctwindow >= 0) {
- sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow &=
+ link->conf->p2p_noa_attr.oppps_ctwindow &=
~IEEE80211_P2P_OPPPS_CTWINDOW_MASK;
- sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |=
+ link->conf->p2p_noa_attr.oppps_ctwindow |=
params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK;
changed |= BSS_CHANGED_P2P_PS;
}
if (params->p2p_opp_ps > 0) {
- sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |=
+ link->conf->p2p_noa_attr.oppps_ctwindow |=
IEEE80211_P2P_OPPPS_ENABLE_BIT;
changed |= BSS_CHANGED_P2P_PS;
} else if (params->p2p_opp_ps == 0) {
- sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow &=
+ link->conf->p2p_noa_attr.oppps_ctwindow &=
~IEEE80211_P2P_OPPPS_ENABLE_BIT;
changed |= BSS_CHANGED_P2P_PS;
}
- ieee80211_bss_info_change_notify(sdata, changed);
+ ieee80211_link_info_change_notify(sdata, link, changed);
return 0;
}
@@ -2148,6 +3070,8 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy,
{
struct ieee80211_local *local = wiphy_priv(wiphy);
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_link_data *link =
+ ieee80211_link_or_deflink(sdata, params->link_id, true);
struct ieee80211_tx_queue_params p;
if (!local->ops->conf_tx)
@@ -2156,6 +3080,9 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy,
if (local->hw.queues < IEEE80211_NUM_ACS)
return -EOPNOTSUPP;
+ if (IS_ERR(link))
+ return PTR_ERR(link);
+
memset(&p, 0, sizeof(p));
p.aifs = params->aifs;
p.cw_max = params->cwmax;
@@ -2170,15 +3097,16 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy,
ieee80211_regulatory_limit_wmm_params(sdata, &p, params->ac);
- sdata->tx_conf[params->ac] = p;
- if (drv_conf_tx(local, sdata, params->ac, &p)) {
+ link->tx_conf[params->ac] = p;
+ if (drv_conf_tx(local, link, params->ac, &p)) {
wiphy_debug(local->hw.wiphy,
"failed to set TX queue parameters for AC %d\n",
params->ac);
return -EINVAL;
}
- ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_QOS);
+ ieee80211_link_info_change_notify(sdata, link,
+ BSS_CHANGED_QOS);
return 0;
}
@@ -2203,6 +3131,9 @@ static int ieee80211_scan(struct wiphy *wiphy,
struct cfg80211_scan_request *req)
{
struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_link_data *link;
+ struct ieee80211_channel *chan;
+ int radio_idx;
sdata = IEEE80211_WDEV_TO_SUB_IF(req->wdev);
@@ -2221,19 +3152,29 @@ static int ieee80211_scan(struct wiphy *wiphy,
* for now fall through to allow scanning only when
* beaconing hasn't been configured yet
*/
- /* fall through */
+ fallthrough;
case NL80211_IFTYPE_AP:
/*
* If the scan has been forced (and the driver supports
* forcing), don't care about being beaconing already.
* This will create problems to the attached stations (e.g. all
- * the frames sent while scanning on other channel will be
+ * the frames sent while scanning on other channel will be
* lost)
*/
- if (sdata->u.ap.beacon &&
- (!(wiphy->features & NL80211_FEATURE_AP_SCAN) ||
- !(req->flags & NL80211_SCAN_FLAG_AP)))
- return -EOPNOTSUPP;
+ for_each_link_data(sdata, link) {
+ /* if the link is not beaconing, ignore it */
+ if (!sdata_dereference(link->u.ap.beacon, sdata))
+ continue;
+
+ chan = link->conf->chanreq.oper.chan;
+ radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan);
+
+ if (ieee80211_is_radio_idx_in_scan_req(wiphy, req,
+ radio_idx) &&
+ (!(wiphy->features & NL80211_FEATURE_AP_SCAN) ||
+ !(req->flags & NL80211_SCAN_FLAG_AP)))
+ return -EOPNOTSUPP;
+ }
break;
case NL80211_IFTYPE_NAN:
default:
@@ -2327,12 +3268,15 @@ static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev,
memcpy(sdata->vif.bss_conf.mcast_rate, rate,
sizeof(int) * NUM_NL80211_BANDS);
- ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MCAST_RATE);
+ if (ieee80211_sdata_running(sdata))
+ ieee80211_link_info_change_notify(sdata, &sdata->deflink,
+ BSS_CHANGED_MCAST_RATE);
return 0;
}
-static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
+static int ieee80211_set_wiphy_params(struct wiphy *wiphy, int radio_idx,
+ u32 changed)
{
struct ieee80211_local *local = wiphy_priv(wiphy);
int err;
@@ -2340,7 +3284,8 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
if (changed & WIPHY_PARAM_FRAG_THRESHOLD) {
ieee80211_check_fast_xmit_all(local);
- err = drv_set_frag_threshold(local, wiphy->frag_threshold);
+ err = drv_set_frag_threshold(local, radio_idx,
+ wiphy->frag_threshold);
if (err) {
ieee80211_check_fast_xmit_all(local);
@@ -2354,14 +3299,23 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
coverage_class = changed & WIPHY_PARAM_COVERAGE_CLASS ?
wiphy->coverage_class : -1;
- err = drv_set_coverage_class(local, coverage_class);
+ err = drv_set_coverage_class(local, radio_idx,
+ coverage_class);
if (err)
return err;
}
if (changed & WIPHY_PARAM_RTS_THRESHOLD) {
- err = drv_set_rts_threshold(local, wiphy->rts_threshold);
+ u32 rts_threshold;
+
+ if ((radio_idx == -1) || (radio_idx >= wiphy->n_radio))
+ rts_threshold = wiphy->rts_threshold;
+ else
+ rts_threshold =
+ wiphy->radio_cfg[radio_idx].rts_threshold;
+
+ err = drv_set_rts_threshold(local, radio_idx, rts_threshold);
if (err)
return err;
@@ -2379,18 +3333,19 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed)
}
if (changed &
(WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG))
- ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS);
+ ieee80211_hw_config(local, radio_idx,
+ IEEE80211_CONF_CHANGE_RETRY_LIMITS);
if (changed & (WIPHY_PARAM_TXQ_LIMIT |
WIPHY_PARAM_TXQ_MEMORY_LIMIT |
WIPHY_PARAM_TXQ_QUANTUM))
- ieee80211_txq_set_params(local);
+ ieee80211_txq_set_params(local, radio_idx);
return 0;
}
static int ieee80211_set_tx_power(struct wiphy *wiphy,
- struct wireless_dev *wdev,
+ struct wireless_dev *wdev, int radio_idx,
enum nl80211_tx_power_setting type, int mbm)
{
struct ieee80211_local *local = wiphy_priv(wiphy);
@@ -2398,109 +3353,152 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy,
enum nl80211_tx_power_setting txp_type = type;
bool update_txp_type = false;
bool has_monitor = false;
+ int user_power_level;
+ int old_power = local->user_power_level;
+
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ switch (type) {
+ case NL80211_TX_POWER_AUTOMATIC:
+ user_power_level = IEEE80211_UNSET_POWER_LEVEL;
+ txp_type = NL80211_TX_POWER_LIMITED;
+ break;
+ case NL80211_TX_POWER_LIMITED:
+ case NL80211_TX_POWER_FIXED:
+ if (mbm < 0 || (mbm % 100))
+ return -EOPNOTSUPP;
+ user_power_level = MBM_TO_DBM(mbm);
+ break;
+ default:
+ return -EINVAL;
+ }
if (wdev) {
sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
- if (sdata->vif.type == NL80211_IFTYPE_MONITOR) {
- sdata = rtnl_dereference(local->monitor_sdata);
- if (!sdata)
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
+ !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) {
+ if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF))
return -EOPNOTSUPP;
- }
- switch (type) {
- case NL80211_TX_POWER_AUTOMATIC:
- sdata->user_power_level = IEEE80211_UNSET_POWER_LEVEL;
- txp_type = NL80211_TX_POWER_LIMITED;
- break;
- case NL80211_TX_POWER_LIMITED:
- case NL80211_TX_POWER_FIXED:
- if (mbm < 0 || (mbm % 100))
+ sdata = wiphy_dereference(local->hw.wiphy,
+ local->monitor_sdata);
+ if (!sdata)
return -EOPNOTSUPP;
- sdata->user_power_level = MBM_TO_DBM(mbm);
- break;
}
- if (txp_type != sdata->vif.bss_conf.txpower_type) {
- update_txp_type = true;
- sdata->vif.bss_conf.txpower_type = txp_type;
- }
+ for (int link_id = 0;
+ link_id < ARRAY_SIZE(sdata->link);
+ link_id++) {
+ struct ieee80211_link_data *link =
+ wiphy_dereference(wiphy, sdata->link[link_id]);
- ieee80211_recalc_txpower(sdata, update_txp_type);
+ if (!link)
+ continue;
+ link->user_power_level = user_power_level;
+
+ if (txp_type != link->conf->txpower_type) {
+ update_txp_type = true;
+ link->conf->txpower_type = txp_type;
+ }
+
+ ieee80211_recalc_txpower(link, update_txp_type);
+ }
return 0;
}
- switch (type) {
- case NL80211_TX_POWER_AUTOMATIC:
- local->user_power_level = IEEE80211_UNSET_POWER_LEVEL;
- txp_type = NL80211_TX_POWER_LIMITED;
- break;
- case NL80211_TX_POWER_LIMITED:
- case NL80211_TX_POWER_FIXED:
- if (mbm < 0 || (mbm % 100))
- return -EOPNOTSUPP;
- local->user_power_level = MBM_TO_DBM(mbm);
- break;
- }
+ local->user_power_level = user_power_level;
- mutex_lock(&local->iflist_mtx);
list_for_each_entry(sdata, &local->interfaces, list) {
- if (sdata->vif.type == NL80211_IFTYPE_MONITOR) {
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
+ !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) {
has_monitor = true;
continue;
}
- sdata->user_power_level = local->user_power_level;
- if (txp_type != sdata->vif.bss_conf.txpower_type)
- update_txp_type = true;
- sdata->vif.bss_conf.txpower_type = txp_type;
+
+ for (int link_id = 0;
+ link_id < ARRAY_SIZE(sdata->link);
+ link_id++) {
+ struct ieee80211_link_data *link =
+ wiphy_dereference(wiphy, sdata->link[link_id]);
+
+ if (!link)
+ continue;
+
+ link->user_power_level = local->user_power_level;
+ if (txp_type != link->conf->txpower_type)
+ update_txp_type = true;
+ link->conf->txpower_type = txp_type;
+ }
}
list_for_each_entry(sdata, &local->interfaces, list) {
- if (sdata->vif.type == NL80211_IFTYPE_MONITOR)
+ if (sdata->vif.type == NL80211_IFTYPE_MONITOR &&
+ !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR))
continue;
- ieee80211_recalc_txpower(sdata, update_txp_type);
+
+ for (int link_id = 0;
+ link_id < ARRAY_SIZE(sdata->link);
+ link_id++) {
+ struct ieee80211_link_data *link =
+ wiphy_dereference(wiphy, sdata->link[link_id]);
+
+ if (!link)
+ continue;
+
+ ieee80211_recalc_txpower(link, update_txp_type);
+ }
}
- mutex_unlock(&local->iflist_mtx);
if (has_monitor) {
- sdata = rtnl_dereference(local->monitor_sdata);
- if (sdata) {
- sdata->user_power_level = local->user_power_level;
+ sdata = wiphy_dereference(local->hw.wiphy,
+ local->monitor_sdata);
+ if (sdata && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF)) {
+ sdata->deflink.user_power_level = local->user_power_level;
if (txp_type != sdata->vif.bss_conf.txpower_type)
update_txp_type = true;
sdata->vif.bss_conf.txpower_type = txp_type;
- ieee80211_recalc_txpower(sdata, update_txp_type);
+ ieee80211_recalc_txpower(&sdata->deflink,
+ update_txp_type);
}
}
+ if (local->emulate_chanctx &&
+ (old_power != local->user_power_level))
+ ieee80211_hw_conf_chan(local);
+
return 0;
}
static int ieee80211_get_tx_power(struct wiphy *wiphy,
struct wireless_dev *wdev,
+ int radio_idx,
+ unsigned int link_id,
int *dbm)
{
struct ieee80211_local *local = wiphy_priv(wiphy);
struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ struct ieee80211_link_data *link_data;
- if (local->ops->get_txpower)
- return drv_get_txpower(local, sdata, dbm);
+ if (local->ops->get_txpower &&
+ (sdata->flags & IEEE80211_SDATA_IN_DRIVER))
+ return drv_get_txpower(local, sdata, link_id, dbm);
- if (!local->use_chanctx)
+ if (local->emulate_chanctx) {
*dbm = local->hw.conf.power_level;
- else
- *dbm = sdata->vif.bss_conf.txpower;
-
- return 0;
-}
+ } else {
+ link_data = wiphy_dereference(wiphy, sdata->link[link_id]);
-static int ieee80211_set_wds_peer(struct wiphy *wiphy, struct net_device *dev,
- const u8 *addr)
-{
- struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ if (link_data)
+ *dbm = link_data->conf->txpower;
+ else
+ return -ENOLINK;
+ }
- memcpy(&sdata->u.wds.remote_addr, addr, ETH_ALEN);
+ /* INT_MIN indicates no power level was set yet */
+ if (*dbm == INT_MIN)
+ return -EINVAL;
return 0;
}
@@ -2548,75 +3546,8 @@ static int ieee80211_testmode_dump(struct wiphy *wiphy,
}
#endif
-int __ieee80211_request_smps_ap(struct ieee80211_sub_if_data *sdata,
- enum ieee80211_smps_mode smps_mode)
-{
- struct sta_info *sta;
- enum ieee80211_smps_mode old_req;
-
- if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_AP))
- return -EINVAL;
-
- if (sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT)
- return 0;
-
- old_req = sdata->u.ap.req_smps;
- sdata->u.ap.req_smps = smps_mode;
-
- /* AUTOMATIC doesn't mean much for AP - don't allow it */
- if (old_req == smps_mode ||
- smps_mode == IEEE80211_SMPS_AUTOMATIC)
- return 0;
-
- ht_dbg(sdata,
- "SMPS %d requested in AP mode, sending Action frame to %d stations\n",
- smps_mode, atomic_read(&sdata->u.ap.num_mcast_sta));
-
- mutex_lock(&sdata->local->sta_mtx);
- list_for_each_entry(sta, &sdata->local->sta_list, list) {
- /*
- * Only stations associated to our AP and
- * associated VLANs
- */
- if (sta->sdata->bss != &sdata->u.ap)
- continue;
-
- /* This station doesn't support MIMO - skip it */
- if (sta_info_tx_streams(sta) == 1)
- continue;
-
- /*
- * Don't wake up a STA just to send the action frame
- * unless we are getting more restrictive.
- */
- if (test_sta_flag(sta, WLAN_STA_PS_STA) &&
- !ieee80211_smps_is_restrictive(sta->known_smps_mode,
- smps_mode)) {
- ht_dbg(sdata, "Won't send SMPS to sleeping STA %pM\n",
- sta->sta.addr);
- continue;
- }
-
- /*
- * If the STA is not authorized, wait until it gets
- * authorized and the action frame will be sent then.
- */
- if (!test_sta_flag(sta, WLAN_STA_AUTHORIZED))
- continue;
-
- ht_dbg(sdata, "Sending SMPS to %pM\n", sta->sta.addr);
- ieee80211_send_smps_action(sdata, smps_mode, sta->sta.addr,
- sdata->vif.bss_conf.bssid);
- }
- mutex_unlock(&sdata->local->sta_mtx);
-
- sdata->smps_mode = smps_mode;
- ieee80211_queue_work(&sdata->local->hw, &sdata->recalc_smps);
-
- return 0;
-}
-
int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_link_data *link,
enum ieee80211_smps_mode smps_mode)
{
const u8 *ap;
@@ -2625,13 +3556,22 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta;
bool tdls_peer_found = false;
- lockdep_assert_held(&sdata->wdev.mtx);
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION))
return -EINVAL;
- old_req = sdata->u.mgd.req_smps;
- sdata->u.mgd.req_smps = smps_mode;
+ if (!ieee80211_vif_link_active(&sdata->vif, link->link_id))
+ return 0;
+
+ old_req = link->u.mgd.req_smps;
+ link->u.mgd.req_smps = smps_mode;
+
+ /* The driver indicated that EML is enabled for the interface, which
+ * implies that SMPS flows towards the AP should be stopped.
+ */
+ if (sdata->vif.driver_flags & IEEE80211_VIF_EML_ACTIVE)
+ return 0;
if (old_req == smps_mode &&
smps_mode != IEEE80211_SMPS_AUTOMATIC)
@@ -2643,10 +3583,10 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
* the new value until we associate.
*/
if (!sdata->u.mgd.associated ||
- sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT)
+ link->conf->chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT)
return 0;
- ap = sdata->u.mgd.associated->bssid;
+ ap = sdata->vif.cfg.ap_addr;
rcu_read_lock();
list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) {
@@ -2668,11 +3608,13 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata,
/* send SM PS frame to AP */
err = ieee80211_send_smps_action(sdata, smps_mode,
- ap, ap);
+ ap, ap,
+ ieee80211_vif_is_mld(&sdata->vif) ?
+ link->link_id : -1);
if (err)
- sdata->u.mgd.req_smps = old_req;
+ link->u.mgd.req_smps = old_req;
else if (smps_mode != IEEE80211_SMPS_OFF && tdls_peer_found)
- ieee80211_teardown_tdls_peers(sdata);
+ ieee80211_teardown_tdls_peers(link);
return err;
}
@@ -2682,6 +3624,7 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
+ unsigned int link_id;
if (sdata->vif.type != NL80211_IFTYPE_STATION)
return -EOPNOTSUPP;
@@ -2697,12 +3640,19 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
local->dynamic_ps_forced_timeout = timeout;
/* no change, but if automatic follow powersave */
- sdata_lock(sdata);
- __ieee80211_request_smps_mgd(sdata, sdata->u.mgd.req_smps);
- sdata_unlock(sdata);
+ for (link_id = 0; link_id < ARRAY_SIZE(sdata->link); link_id++) {
+ struct ieee80211_link_data *link;
+
+ link = sdata_dereference(sdata->link[link_id], sdata);
+
+ if (!link)
+ continue;
+ __ieee80211_request_smps_mgd(sdata, link,
+ link->u.mgd.req_smps);
+ }
if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS))
- ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS);
+ ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS);
ieee80211_recalc_ps(local);
ieee80211_recalc_ps_vif(sdata);
@@ -2711,32 +3661,57 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev,
return 0;
}
+static void ieee80211_set_cqm_rssi_link(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_link_data *link,
+ s32 rssi_thold, u32 rssi_hyst,
+ s32 rssi_low, s32 rssi_high)
+{
+ struct ieee80211_bss_conf *conf;
+
+ if (!link || !link->conf)
+ return;
+
+ conf = link->conf;
+
+ if (rssi_thold && rssi_hyst &&
+ rssi_thold == conf->cqm_rssi_thold &&
+ rssi_hyst == conf->cqm_rssi_hyst)
+ return;
+
+ conf->cqm_rssi_thold = rssi_thold;
+ conf->cqm_rssi_hyst = rssi_hyst;
+ conf->cqm_rssi_low = rssi_low;
+ conf->cqm_rssi_high = rssi_high;
+ link->u.mgd.last_cqm_event_signal = 0;
+
+ if (!ieee80211_vif_link_active(&sdata->vif, link->link_id))
+ return;
+
+ if (sdata->u.mgd.associated &&
+ (sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI))
+ ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_CQM);
+}
+
static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy,
struct net_device *dev,
s32 rssi_thold, u32 rssi_hyst)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
struct ieee80211_vif *vif = &sdata->vif;
- struct ieee80211_bss_conf *bss_conf = &vif->bss_conf;
+ int link_id;
- if (rssi_thold == bss_conf->cqm_rssi_thold &&
- rssi_hyst == bss_conf->cqm_rssi_hyst)
- return 0;
-
- if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER &&
- !(sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI))
+ if (vif->driver_flags & IEEE80211_VIF_BEACON_FILTER &&
+ !(vif->driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI))
return -EOPNOTSUPP;
- bss_conf->cqm_rssi_thold = rssi_thold;
- bss_conf->cqm_rssi_hyst = rssi_hyst;
- bss_conf->cqm_rssi_low = 0;
- bss_conf->cqm_rssi_high = 0;
- sdata->u.mgd.last_cqm_event_signal = 0;
+ /* For MLD, handle CQM change on all the active links */
+ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
+ struct ieee80211_link_data *link =
+ sdata_dereference(sdata->link[link_id], sdata);
- /* tell the driver upon association, unless already associated */
- if (sdata->u.mgd.associated &&
- sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)
- ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_CQM);
+ ieee80211_set_cqm_rssi_link(sdata, link, rssi_thold, rssi_hyst,
+ 0, 0);
+ }
return 0;
}
@@ -2747,27 +3722,26 @@ static int ieee80211_set_cqm_rssi_range_config(struct wiphy *wiphy,
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
struct ieee80211_vif *vif = &sdata->vif;
- struct ieee80211_bss_conf *bss_conf = &vif->bss_conf;
+ int link_id;
- if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)
+ if (vif->driver_flags & IEEE80211_VIF_BEACON_FILTER)
return -EOPNOTSUPP;
- bss_conf->cqm_rssi_low = rssi_low;
- bss_conf->cqm_rssi_high = rssi_high;
- bss_conf->cqm_rssi_thold = 0;
- bss_conf->cqm_rssi_hyst = 0;
- sdata->u.mgd.last_cqm_event_signal = 0;
+ /* For MLD, handle CQM change on all the active links */
+ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
+ struct ieee80211_link_data *link =
+ sdata_dereference(sdata->link[link_id], sdata);
- /* tell the driver upon association, unless already associated */
- if (sdata->u.mgd.associated &&
- sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI)
- ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_CQM);
+ ieee80211_set_cqm_rssi_link(sdata, link, 0, 0,
+ rssi_low, rssi_high);
+ }
return 0;
}
static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
struct net_device *dev,
+ unsigned int link_id,
const u8 *addr,
const struct cfg80211_bitrate_mask *mask)
{
@@ -2784,10 +3758,12 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
* to send something, and if we're an AP we have to be able to do
* so at a basic rate so that all clients can receive it.
*/
- if (rcu_access_pointer(sdata->vif.chanctx_conf) &&
- sdata->vif.bss_conf.chandef.chan) {
+ if (rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) &&
+ sdata->vif.bss_conf.chanreq.oper.chan) {
u32 basic_rates = sdata->vif.bss_conf.basic_rates;
- enum nl80211_band band = sdata->vif.bss_conf.chandef.chan->band;
+ enum nl80211_band band;
+
+ band = sdata->vif.bss_conf.chanreq.oper.chan->band;
if (!(mask->control[band].legacy & basic_rates))
return -EINVAL;
@@ -2816,14 +3792,14 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
continue;
for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) {
- if (~sdata->rc_rateidx_mcs_mask[i][j]) {
+ if (sdata->rc_rateidx_mcs_mask[i][j] != 0xff) {
sdata->rc_has_mcs_mask[i] = true;
break;
}
}
for (j = 0; j < NL80211_VHT_NSS_MAX; j++) {
- if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) {
+ if (sdata->rc_rateidx_vht_mcs_mask[i][j] != 0xffff) {
sdata->rc_has_vht_mcs_mask[i] = true;
break;
}
@@ -2833,37 +3809,100 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
return 0;
}
+static bool ieee80211_is_scan_ongoing(struct wiphy *wiphy,
+ struct ieee80211_local *local,
+ struct cfg80211_chan_def *chandef)
+{
+ struct cfg80211_scan_request *scan_req;
+ int chan_radio_idx, req_radio_idx;
+ struct ieee80211_roc_work *roc;
+
+ if (list_empty(&local->roc_list) && !local->scanning)
+ return false;
+
+ req_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chandef->chan);
+
+ if (local->scanning) {
+ scan_req = wiphy_dereference(wiphy, local->scan_req);
+ /*
+ * Scan is going on but info is not there. Should not happen
+ * but if it does, let's not take risk and assume we can't use
+ * the hw hence return true
+ */
+ if (WARN_ON_ONCE(!scan_req))
+ return true;
+
+ return ieee80211_is_radio_idx_in_scan_req(wiphy, scan_req,
+ req_radio_idx);
+ }
+
+ list_for_each_entry(roc, &local->roc_list, list) {
+ chan_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy,
+ roc->chan);
+ if (chan_radio_idx == req_radio_idx)
+ return true;
+ }
+
+ return false;
+}
+
static int ieee80211_start_radar_detection(struct wiphy *wiphy,
struct net_device *dev,
struct cfg80211_chan_def *chandef,
- u32 cac_time_ms)
+ u32 cac_time_ms, int link_id)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_chan_req chanreq = { .oper = *chandef };
struct ieee80211_local *local = sdata->local;
+ struct ieee80211_link_data *link_data;
int err;
- mutex_lock(&local->mtx);
- if (!list_empty(&local->roc_list) || local->scanning) {
- err = -EBUSY;
- goto out_unlock;
- }
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ if (ieee80211_is_scan_ongoing(wiphy, local, chandef))
+ return -EBUSY;
+
+ link_data = sdata_dereference(sdata->link[link_id], sdata);
+ if (!link_data)
+ return -ENOLINK;
/* whatever, but channel contexts should not complain about that one */
- sdata->smps_mode = IEEE80211_SMPS_OFF;
- sdata->needed_rx_chains = local->rx_chains;
+ link_data->smps_mode = IEEE80211_SMPS_OFF;
+ link_data->needed_rx_chains = local->rx_chains;
- err = ieee80211_vif_use_channel(sdata, chandef,
- IEEE80211_CHANCTX_SHARED);
+ err = ieee80211_link_use_channel(link_data, &chanreq,
+ IEEE80211_CHANCTX_SHARED);
if (err)
- goto out_unlock;
+ return err;
- ieee80211_queue_delayed_work(&sdata->local->hw,
- &sdata->dfs_cac_timer_work,
- msecs_to_jiffies(cac_time_ms));
+ wiphy_delayed_work_queue(wiphy, &link_data->dfs_cac_timer_work,
+ msecs_to_jiffies(cac_time_ms));
- out_unlock:
- mutex_unlock(&local->mtx);
- return err;
+ return 0;
+}
+
+static void ieee80211_end_cac(struct wiphy *wiphy,
+ struct net_device *dev, unsigned int link_id)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_link_data *link_data;
+
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ link_data = sdata_dereference(sdata->link[link_id], sdata);
+ if (!link_data)
+ continue;
+
+ wiphy_delayed_work_cancel(wiphy,
+ &link_data->dfs_cac_timer_work);
+
+ if (sdata->wdev.links[link_id].cac_started) {
+ ieee80211_link_release_channel(link_data);
+ sdata->wdev.links[link_id].cac_started = false;
+ }
+ }
}
static struct cfg80211_beacon_data *
@@ -2875,12 +3914,40 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon)
len = beacon->head_len + beacon->tail_len + beacon->beacon_ies_len +
beacon->proberesp_ies_len + beacon->assocresp_ies_len +
- beacon->probe_resp_len;
+ beacon->probe_resp_len + beacon->lci_len + beacon->civicloc_len;
+
+ if (beacon->mbssid_ies)
+ len += ieee80211_get_mbssid_beacon_len(beacon->mbssid_ies,
+ beacon->rnr_ies,
+ beacon->mbssid_ies->cnt);
new_beacon = kzalloc(sizeof(*new_beacon) + len, GFP_KERNEL);
if (!new_beacon)
return NULL;
+ if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) {
+ new_beacon->mbssid_ies =
+ kzalloc(struct_size(new_beacon->mbssid_ies,
+ elem, beacon->mbssid_ies->cnt),
+ GFP_KERNEL);
+ if (!new_beacon->mbssid_ies) {
+ kfree(new_beacon);
+ return NULL;
+ }
+
+ if (beacon->rnr_ies && beacon->rnr_ies->cnt) {
+ new_beacon->rnr_ies =
+ kzalloc(struct_size(new_beacon->rnr_ies,
+ elem, beacon->rnr_ies->cnt),
+ GFP_KERNEL);
+ if (!new_beacon->rnr_ies) {
+ kfree(new_beacon->mbssid_ies);
+ kfree(new_beacon);
+ return NULL;
+ }
+ }
+ }
+
pos = (u8 *)(new_beacon + 1);
if (beacon->head_len) {
new_beacon->head_len = beacon->head_len;
@@ -2918,47 +3985,116 @@ cfg80211_beacon_dup(struct cfg80211_beacon_data *beacon)
memcpy(pos, beacon->probe_resp, beacon->probe_resp_len);
pos += beacon->probe_resp_len;
}
+ if (beacon->mbssid_ies && beacon->mbssid_ies->cnt) {
+ pos += ieee80211_copy_mbssid_beacon(pos,
+ new_beacon->mbssid_ies,
+ beacon->mbssid_ies);
+ if (beacon->rnr_ies && beacon->rnr_ies->cnt)
+ pos += ieee80211_copy_rnr_beacon(pos,
+ new_beacon->rnr_ies,
+ beacon->rnr_ies);
+ }
+
+ /* might copy -1, meaning no changes requested */
+ new_beacon->ftm_responder = beacon->ftm_responder;
+ if (beacon->lci) {
+ new_beacon->lci_len = beacon->lci_len;
+ new_beacon->lci = pos;
+ memcpy(pos, beacon->lci, beacon->lci_len);
+ pos += beacon->lci_len;
+ }
+ if (beacon->civicloc) {
+ new_beacon->civicloc_len = beacon->civicloc_len;
+ new_beacon->civicloc = pos;
+ memcpy(pos, beacon->civicloc, beacon->civicloc_len);
+ pos += beacon->civicloc_len;
+ }
return new_beacon;
}
-void ieee80211_csa_finish(struct ieee80211_vif *vif)
+void ieee80211_csa_finish(struct ieee80211_vif *vif, unsigned int link_id)
{
struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_bss_conf *tx_bss_conf;
+ struct ieee80211_link_data *link_data;
- ieee80211_queue_work(&sdata->local->hw,
- &sdata->csa_finalize_work);
+ if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS))
+ return;
+
+ rcu_read_lock();
+
+ link_data = rcu_dereference(sdata->link[link_id]);
+ if (WARN_ON(!link_data)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ tx_bss_conf = rcu_dereference(link_data->conf->tx_bss_conf);
+ if (tx_bss_conf == link_data->conf) {
+ /* Trigger ieee80211_csa_finish() on the non-transmitting
+ * interfaces when channel switch is received on
+ * transmitting interface
+ */
+ struct ieee80211_link_data *iter;
+
+ for_each_sdata_link_rcu(local, iter) {
+ if (iter->sdata == sdata ||
+ rcu_access_pointer(iter->conf->tx_bss_conf) != tx_bss_conf)
+ continue;
+
+ wiphy_work_queue(iter->sdata->local->hw.wiphy,
+ &iter->csa.finalize_work);
+ }
+ }
+
+ wiphy_work_queue(local->hw.wiphy, &link_data->csa.finalize_work);
+
+ rcu_read_unlock();
}
EXPORT_SYMBOL(ieee80211_csa_finish);
-static int ieee80211_set_after_csa_beacon(struct ieee80211_sub_if_data *sdata,
- u32 *changed)
+void ieee80211_channel_switch_disconnect(struct ieee80211_vif *vif)
{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct ieee80211_local *local = sdata->local;
+
+ sdata_info(sdata, "channel switch failed, disconnecting\n");
+ wiphy_work_queue(local->hw.wiphy, &ifmgd->csa_connection_drop_work);
+}
+EXPORT_SYMBOL(ieee80211_channel_switch_disconnect);
+
+static int ieee80211_set_after_csa_beacon(struct ieee80211_link_data *link_data,
+ u64 *changed)
+{
+ struct ieee80211_sub_if_data *sdata = link_data->sdata;
int err;
switch (sdata->vif.type) {
case NL80211_IFTYPE_AP:
- err = ieee80211_assign_beacon(sdata, sdata->u.ap.next_beacon,
- NULL);
- kfree(sdata->u.ap.next_beacon);
- sdata->u.ap.next_beacon = NULL;
+ if (!link_data->u.ap.next_beacon)
+ return -EINVAL;
+
+ err = ieee80211_assign_beacon(sdata, link_data,
+ link_data->u.ap.next_beacon,
+ NULL, NULL, changed);
+ ieee80211_free_next_beacon(link_data);
if (err < 0)
return err;
- *changed |= err;
break;
case NL80211_IFTYPE_ADHOC:
- err = ieee80211_ibss_finish_csa(sdata);
+ err = ieee80211_ibss_finish_csa(sdata, changed);
if (err < 0)
return err;
- *changed |= err;
break;
#ifdef CONFIG_MAC80211_MESH
case NL80211_IFTYPE_MESH_POINT:
- err = ieee80211_mesh_finish_csa(sdata);
+ err = ieee80211_mesh_finish_csa(sdata, changed);
if (err < 0)
return err;
- *changed |= err;
break;
#endif
default:
@@ -2969,15 +4105,15 @@ static int ieee80211_set_after_csa_beacon(struct ieee80211_sub_if_data *sdata,
return 0;
}
-static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
+static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data)
{
+ struct ieee80211_sub_if_data *sdata = link_data->sdata;
struct ieee80211_local *local = sdata->local;
- u32 changed = 0;
+ struct ieee80211_bss_conf *link_conf = link_data->conf;
+ u64 changed = 0;
int err;
- sdata_assert_lock(sdata);
- lockdep_assert_held(&local->mtx);
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
/*
* using reservation isn't immediate as it may be deferred until later
@@ -2986,92 +4122,86 @@ static int __ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
* completed successfully
*/
- if (sdata->reserved_chanctx) {
+ if (link_data->reserved_chanctx) {
/*
* with multi-vif csa driver may call ieee80211_csa_finish()
* many times while waiting for other interfaces to use their
* reservations
*/
- if (sdata->reserved_ready)
+ if (link_data->reserved_ready)
return 0;
- return ieee80211_vif_use_reserved_context(sdata);
+ return ieee80211_link_use_reserved_context(link_data);
}
- if (!cfg80211_chandef_identical(&sdata->vif.bss_conf.chandef,
- &sdata->csa_chandef))
+ if (!cfg80211_chandef_identical(&link_conf->chanreq.oper,
+ &link_data->csa.chanreq.oper))
return -EINVAL;
- sdata->vif.csa_active = false;
+ link_conf->csa_active = false;
- err = ieee80211_set_after_csa_beacon(sdata, &changed);
+ err = ieee80211_set_after_csa_beacon(link_data, &changed);
if (err)
return err;
- ieee80211_bss_info_change_notify(sdata, changed);
+ ieee80211_link_info_change_notify(sdata, link_data, changed);
- if (sdata->csa_block_tx) {
- ieee80211_wake_vif_queues(local, sdata,
- IEEE80211_QUEUE_STOP_REASON_CSA);
- sdata->csa_block_tx = false;
- }
+ ieee80211_vif_unblock_queues_csa(sdata);
- err = drv_post_channel_switch(sdata);
+ err = drv_post_channel_switch(link_data);
if (err)
return err;
- cfg80211_ch_switch_notify(sdata->dev, &sdata->csa_chandef);
+ cfg80211_ch_switch_notify(sdata->dev, &link_data->csa.chanreq.oper,
+ link_data->link_id);
return 0;
}
-static void ieee80211_csa_finalize(struct ieee80211_sub_if_data *sdata)
+static void ieee80211_csa_finalize(struct ieee80211_link_data *link_data)
{
- if (__ieee80211_csa_finalize(sdata)) {
- sdata_info(sdata, "failed to finalize CSA, disconnecting\n");
+ struct ieee80211_sub_if_data *sdata = link_data->sdata;
+
+ if (__ieee80211_csa_finalize(link_data)) {
+ sdata_info(sdata, "failed to finalize CSA on link %d, disconnecting\n",
+ link_data->link_id);
cfg80211_stop_iface(sdata->local->hw.wiphy, &sdata->wdev,
GFP_KERNEL);
}
}
-void ieee80211_csa_finalize_work(struct work_struct *work)
+void ieee80211_csa_finalize_work(struct wiphy *wiphy, struct wiphy_work *work)
{
- struct ieee80211_sub_if_data *sdata =
- container_of(work, struct ieee80211_sub_if_data,
- csa_finalize_work);
+ struct ieee80211_link_data *link =
+ container_of(work, struct ieee80211_link_data, csa.finalize_work);
+ struct ieee80211_sub_if_data *sdata = link->sdata;
struct ieee80211_local *local = sdata->local;
- sdata_lock(sdata);
- mutex_lock(&local->mtx);
- mutex_lock(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
/* AP might have been stopped while waiting for the lock. */
- if (!sdata->vif.csa_active)
- goto unlock;
+ if (!link->conf->csa_active)
+ return;
if (!ieee80211_sdata_running(sdata))
- goto unlock;
-
- ieee80211_csa_finalize(sdata);
+ return;
-unlock:
- mutex_unlock(&local->chanctx_mtx);
- mutex_unlock(&local->mtx);
- sdata_unlock(sdata);
+ ieee80211_csa_finalize(link);
}
-static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
+static int ieee80211_set_csa_beacon(struct ieee80211_link_data *link_data,
struct cfg80211_csa_settings *params,
- u32 *changed)
+ u64 *changed)
{
+ struct ieee80211_sub_if_data *sdata = link_data->sdata;
struct ieee80211_csa_settings csa = {};
int err;
switch (sdata->vif.type) {
case NL80211_IFTYPE_AP:
- sdata->u.ap.next_beacon =
+ link_data->u.ap.next_beacon =
cfg80211_beacon_dup(&params->beacon_after);
- if (!sdata->u.ap.next_beacon)
+ if (!link_data->u.ap.next_beacon)
return -ENOMEM;
/*
@@ -3094,10 +4224,12 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
break;
if ((params->n_counter_offsets_beacon >
- IEEE80211_MAX_CSA_COUNTERS_NUM) ||
+ IEEE80211_MAX_CNTDWN_COUNTERS_NUM) ||
(params->n_counter_offsets_presp >
- IEEE80211_MAX_CSA_COUNTERS_NUM))
+ IEEE80211_MAX_CNTDWN_COUNTERS_NUM)) {
+ ieee80211_free_next_beacon(link_data);
return -EINVAL;
+ }
csa.counter_offsets_beacon = params->counter_offsets_beacon;
csa.counter_offsets_presp = params->counter_offsets_presp;
@@ -3105,16 +4237,17 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
csa.n_counter_offsets_presp = params->n_counter_offsets_presp;
csa.count = params->count;
- err = ieee80211_assign_beacon(sdata, &params->beacon_csa, &csa);
+ err = ieee80211_assign_beacon(sdata, link_data,
+ &params->beacon_csa, &csa,
+ NULL, changed);
if (err < 0) {
- kfree(sdata->u.ap.next_beacon);
+ ieee80211_free_next_beacon(link_data);
return err;
}
- *changed |= err;
break;
case NL80211_IFTYPE_ADHOC:
- if (!sdata->vif.bss_conf.ibss_joined)
+ if (!sdata->vif.cfg.ibss_joined)
return -EINVAL;
if (params->chandef.width != sdata->u.ibss.chandef.width)
@@ -3125,6 +4258,7 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
if (cfg80211_get_chandef_type(&params->chandef) !=
cfg80211_get_chandef_type(&sdata->u.ibss.chandef))
return -EINVAL;
+ break;
case NL80211_CHAN_WIDTH_5:
case NL80211_CHAN_WIDTH_10:
case NL80211_CHAN_WIDTH_20_NOHT:
@@ -3141,10 +4275,9 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
/* see comments in the NL80211_IFTYPE_AP block */
if (params->count > 1) {
- err = ieee80211_ibss_csa_beacon(sdata, params);
+ err = ieee80211_ibss_csa_beacon(sdata, params, changed);
if (err < 0)
return err;
- *changed |= err;
}
ieee80211_send_action_csa(sdata, params);
@@ -3154,11 +4287,8 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
case NL80211_IFTYPE_MESH_POINT: {
struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
- if (params->chandef.width != sdata->vif.bss_conf.chandef.width)
- return -EINVAL;
-
/* changes into another band are not supported */
- if (sdata->vif.bss_conf.chandef.chan->band !=
+ if (sdata->vif.bss_conf.chanreq.oper.chan->band !=
params->chandef.chan->band)
return -EINVAL;
@@ -3172,12 +4302,11 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
/* see comments in the NL80211_IFTYPE_AP block */
if (params->count > 1) {
- err = ieee80211_mesh_csa_beacon(sdata, params);
+ err = ieee80211_mesh_csa_beacon(sdata, params, changed);
if (err < 0) {
ifmsh->csa_role = IEEE80211_MESH_CSA_ROLE_NONE;
return err;
}
- *changed |= err;
}
if (ifmsh->csa_role == IEEE80211_MESH_CSA_ROLE_INIT)
@@ -3193,95 +4322,129 @@ static int ieee80211_set_csa_beacon(struct ieee80211_sub_if_data *sdata,
return 0;
}
+static void ieee80211_color_change_abort(struct ieee80211_link_data *link)
+{
+ link->conf->color_change_active = false;
+
+ ieee80211_free_next_beacon(link);
+
+ cfg80211_color_change_aborted_notify(link->sdata->dev, link->link_id);
+}
+
static int
__ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
struct cfg80211_csa_settings *params)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_chan_req chanreq = { .oper = params->chandef };
struct ieee80211_local *local = sdata->local;
- struct ieee80211_channel_switch ch_switch;
+ struct ieee80211_channel_switch ch_switch = {
+ .link_id = params->link_id,
+ };
struct ieee80211_chanctx_conf *conf;
struct ieee80211_chanctx *chanctx;
- u32 changed = 0;
+ struct ieee80211_bss_conf *link_conf;
+ struct ieee80211_link_data *link_data;
+ u64 changed = 0;
+ u8 link_id = params->link_id;
int err;
- sdata_assert_lock(sdata);
- lockdep_assert_held(&local->mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
- if (!list_empty(&local->roc_list) || local->scanning)
+ if (ieee80211_is_scan_ongoing(wiphy, local, &params->chandef))
return -EBUSY;
- if (sdata->wdev.cac_started)
+ if (sdata->wdev.links[link_id].cac_started)
return -EBUSY;
- if (cfg80211_chandef_identical(&params->chandef,
- &sdata->vif.bss_conf.chandef))
+ if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS))
+ return -EINVAL;
+
+ link_data = wiphy_dereference(wiphy, sdata->link[link_id]);
+ if (!link_data)
+ return -ENOLINK;
+
+ link_conf = link_data->conf;
+
+ if (chanreq.oper.punctured && !link_conf->eht_support)
return -EINVAL;
/* don't allow another channel switch if one is already active. */
- if (sdata->vif.csa_active)
+ if (link_conf->csa_active)
return -EBUSY;
- mutex_lock(&local->chanctx_mtx);
- conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
- lockdep_is_held(&local->chanctx_mtx));
+ conf = wiphy_dereference(wiphy, link_conf->chanctx_conf);
if (!conf) {
err = -EBUSY;
goto out;
}
+ if (params->chandef.chan->freq_offset) {
+ /* this may work, but is untested */
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+
+ err = ieee80211_set_unsol_bcast_probe_resp(sdata,
+ &params->unsol_bcast_probe_resp,
+ link_data, link_conf, &changed);
+ if (err)
+ goto out;
+
chanctx = container_of(conf, struct ieee80211_chanctx, conf);
ch_switch.timestamp = 0;
ch_switch.device_timestamp = 0;
ch_switch.block_tx = params->block_tx;
- ch_switch.chandef = params->chandef;
+ ch_switch.chandef = chanreq.oper;
ch_switch.count = params->count;
err = drv_pre_channel_switch(sdata, &ch_switch);
if (err)
goto out;
- err = ieee80211_vif_reserve_chanctx(sdata, &params->chandef,
- chanctx->mode,
- params->radar_required);
+ err = ieee80211_link_reserve_chanctx(link_data, &chanreq,
+ chanctx->mode,
+ params->radar_required);
if (err)
goto out;
/* if reservation is invalid then this will fail */
- err = ieee80211_check_combinations(sdata, NULL, chanctx->mode, 0);
+ err = ieee80211_check_combinations(sdata, NULL, chanctx->mode, 0, -1);
if (err) {
- ieee80211_vif_unreserve_chanctx(sdata);
+ ieee80211_link_unreserve_chanctx(link_data);
goto out;
}
- err = ieee80211_set_csa_beacon(sdata, params, &changed);
+ /* if there is a color change in progress, abort it */
+ if (link_conf->color_change_active)
+ ieee80211_color_change_abort(link_data);
+
+ err = ieee80211_set_csa_beacon(link_data, params, &changed);
if (err) {
- ieee80211_vif_unreserve_chanctx(sdata);
+ ieee80211_link_unreserve_chanctx(link_data);
goto out;
}
- sdata->csa_chandef = params->chandef;
- sdata->csa_block_tx = params->block_tx;
- sdata->vif.csa_active = true;
+ link_data->csa.chanreq = chanreq;
+ link_conf->csa_active = true;
- if (sdata->csa_block_tx)
- ieee80211_stop_vif_queues(local, sdata,
- IEEE80211_QUEUE_STOP_REASON_CSA);
+ if (params->block_tx)
+ ieee80211_vif_block_queues_csa(sdata);
- cfg80211_ch_switch_started_notify(sdata->dev, &sdata->csa_chandef,
- params->count);
+ cfg80211_ch_switch_started_notify(sdata->dev,
+ &link_data->csa.chanreq.oper, link_id,
+ params->count, params->block_tx);
if (changed) {
- ieee80211_bss_info_change_notify(sdata, changed);
- drv_channel_switch_beacon(sdata, &params->chandef);
+ ieee80211_link_info_change_notify(sdata, link_data, changed);
+ drv_channel_switch_beacon(sdata, &link_data->csa.chanreq.oper);
} else {
/* if the beacon didn't change, we can finalize immediately */
- ieee80211_csa_finalize(sdata);
+ ieee80211_csa_finalize(link_data);
}
out:
- mutex_unlock(&local->chanctx_mtx);
return err;
}
@@ -3290,18 +4453,15 @@ int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
struct ieee80211_local *local = sdata->local;
- int err;
- mutex_lock(&local->mtx);
- err = __ieee80211_channel_switch(wiphy, dev, params);
- mutex_unlock(&local->mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
- return err;
+ return __ieee80211_channel_switch(wiphy, dev, params);
}
u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local)
{
- lockdep_assert_held(&local->mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
local->roc_cookie_counter++;
@@ -3325,7 +4485,7 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb,
spin_lock_irqsave(&local->ack_status_lock, spin_flags);
id = idr_alloc(&local->ack_status_frames, ack_skb,
- 1, 0x10000, GFP_ATOMIC);
+ 1, 0x2000, GFP_ATOMIC);
spin_unlock_irqrestore(&local->ack_status_lock, spin_flags);
if (id < 0) {
@@ -3333,7 +4493,8 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb,
return -ENOMEM;
}
- IEEE80211_SKB_CB(skb)->ack_frame_id = id;
+ IEEE80211_SKB_CB(skb)->status_data_idr = 1;
+ IEEE80211_SKB_CB(skb)->status_data = id;
*cookie = ieee80211_mgmt_tx_cookie(local);
IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie;
@@ -3341,58 +4502,68 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb,
return 0;
}
-static void ieee80211_mgmt_frame_register(struct wiphy *wiphy,
+static void
+ieee80211_update_mgmt_frame_registrations(struct wiphy *wiphy,
struct wireless_dev *wdev,
- u16 frame_type, bool reg)
+ struct mgmt_frame_regs *upd)
{
struct ieee80211_local *local = wiphy_priv(wiphy);
struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ u32 preq_mask = BIT(IEEE80211_STYPE_PROBE_REQ >> 4);
+ u32 action_mask = BIT(IEEE80211_STYPE_ACTION >> 4);
+ bool global_change, intf_change;
+
+ global_change =
+ (local->probe_req_reg != !!(upd->global_stypes & preq_mask)) ||
+ (local->rx_mcast_action_reg !=
+ !!(upd->global_mcast_stypes & action_mask));
+ local->probe_req_reg = upd->global_stypes & preq_mask;
+ local->rx_mcast_action_reg = upd->global_mcast_stypes & action_mask;
+
+ intf_change = (sdata->vif.probe_req_reg !=
+ !!(upd->interface_stypes & preq_mask)) ||
+ (sdata->vif.rx_mcast_action_reg !=
+ !!(upd->interface_mcast_stypes & action_mask));
+ sdata->vif.probe_req_reg = upd->interface_stypes & preq_mask;
+ sdata->vif.rx_mcast_action_reg =
+ upd->interface_mcast_stypes & action_mask;
+
+ if (!local->open_count)
+ return;
- switch (frame_type) {
- case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ:
- if (reg) {
- local->probe_req_reg++;
- sdata->vif.probe_req_reg++;
- } else {
- if (local->probe_req_reg)
- local->probe_req_reg--;
-
- if (sdata->vif.probe_req_reg)
- sdata->vif.probe_req_reg--;
- }
-
- if (!local->open_count)
- break;
-
- if (sdata->vif.probe_req_reg == 1)
- drv_config_iface_filter(local, sdata, FIF_PROBE_REQ,
- FIF_PROBE_REQ);
- else if (sdata->vif.probe_req_reg == 0)
- drv_config_iface_filter(local, sdata, 0,
- FIF_PROBE_REQ);
+ if (intf_change && ieee80211_sdata_running(sdata))
+ drv_config_iface_filter(local, sdata,
+ sdata->vif.probe_req_reg ?
+ FIF_PROBE_REQ : 0,
+ FIF_PROBE_REQ);
+ if (global_change)
ieee80211_configure_filter(local);
- break;
- default:
- break;
- }
}
-static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant)
+static int ieee80211_set_antenna(struct wiphy *wiphy, int radio_idx,
+ u32 tx_ant, u32 rx_ant)
{
struct ieee80211_local *local = wiphy_priv(wiphy);
+ int ret;
if (local->started)
return -EOPNOTSUPP;
- return drv_set_antenna(local, tx_ant, rx_ant);
+ ret = drv_set_antenna(local, tx_ant, rx_ant);
+ if (ret)
+ return ret;
+
+ local->rx_chains = hweight8(rx_ant);
+ return 0;
}
-static int ieee80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant)
+static int ieee80211_get_antenna(struct wiphy *wiphy, int radio_idx,
+ u32 *tx_ant, u32 *rx_ant)
{
struct ieee80211_local *local = wiphy_priv(wiphy);
- return drv_get_antenna(local, tx_ant, rx_ant);
+ return drv_get_antenna(local, radio_idx, tx_ant, rx_ant);
}
static int ieee80211_set_rekey_data(struct wiphy *wiphy,
@@ -3427,22 +4598,23 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev,
int ret;
/* the lock is needed to assign the cookie later */
- mutex_lock(&local->mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
rcu_read_lock();
- chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ sta = sta_info_get_bss(sdata, peer);
+ if (!sta) {
+ ret = -ENOLINK;
+ goto unlock;
+ }
+
+ qos = sta->sta.wme;
+
+ chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
if (WARN_ON(!chanctx_conf)) {
ret = -EINVAL;
goto unlock;
}
band = chanctx_conf->def.chan->band;
- sta = sta_info_get_bss(sdata, peer);
- if (sta) {
- qos = sta->sta.wme;
- } else {
- ret = -ENOLINK;
- goto unlock;
- }
if (qos) {
fc = cpu_to_le16(IEEE80211_FTYPE_DATA |
@@ -3491,40 +4663,45 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev,
}
local_bh_disable();
- ieee80211_xmit(sdata, sta, skb, 0);
+ ieee80211_xmit(sdata, sta, skb);
local_bh_enable();
ret = 0;
unlock:
rcu_read_unlock();
- mutex_unlock(&local->mtx);
return ret;
}
static int ieee80211_cfg_get_channel(struct wiphy *wiphy,
struct wireless_dev *wdev,
+ unsigned int link_id,
struct cfg80211_chan_def *chandef)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
struct ieee80211_local *local = wiphy_priv(wiphy);
struct ieee80211_chanctx_conf *chanctx_conf;
+ struct ieee80211_link_data *link;
int ret = -ENODATA;
rcu_read_lock();
- chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+ link = rcu_dereference(sdata->link[link_id]);
+ if (!link) {
+ ret = -ENOLINK;
+ goto out;
+ }
+
+ chanctx_conf = rcu_dereference(link->conf->chanctx_conf);
if (chanctx_conf) {
- *chandef = sdata->vif.bss_conf.chandef;
+ *chandef = link->conf->chanreq.oper;
ret = 0;
} else if (local->open_count > 0 &&
- local->open_count == local->monitors &&
+ local->open_count == local->virt_monitors &&
sdata->vif.type == NL80211_IFTYPE_MONITOR) {
- if (local->use_chanctx)
- *chandef = local->monitor_chandef;
- else
- *chandef = local->_oper_chandef;
+ *chandef = local->monitor_chanreq.oper;
ret = 0;
}
+out:
rcu_read_unlock();
return ret;
@@ -3564,15 +4741,20 @@ static int ieee80211_set_qos_map(struct wiphy *wiphy,
static int ieee80211_set_ap_chanwidth(struct wiphy *wiphy,
struct net_device *dev,
+ unsigned int link_id,
struct cfg80211_chan_def *chandef)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_link_data *link;
+ struct ieee80211_chan_req chanreq = { .oper = *chandef };
int ret;
- u32 changed = 0;
+ u64 changed = 0;
- ret = ieee80211_vif_change_bandwidth(sdata, chandef, &changed);
+ link = sdata_dereference(sdata->link[link_id], sdata);
+
+ ret = ieee80211_link_change_chanreq(link, &chanreq, &changed);
if (ret == 0)
- ieee80211_bss_info_change_notify(sdata, changed);
+ ieee80211_link_info_change_notify(sdata, link, changed);
return ret;
}
@@ -3772,11 +4954,7 @@ static int ieee80211_get_txq_stats(struct wiphy *wiphy,
struct ieee80211_sub_if_data *sdata;
int ret = 0;
- if (!local->ops->wake_tx_queue)
- return 1;
-
spin_lock_bh(&local->fq.lock);
- rcu_read_lock();
if (wdev) {
sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
@@ -3802,12 +4980,559 @@ static int ieee80211_get_txq_stats(struct wiphy *wiphy,
}
out:
- rcu_read_unlock();
spin_unlock_bh(&local->fq.lock);
return ret;
}
+static int
+ieee80211_get_ftm_responder_stats(struct wiphy *wiphy,
+ struct net_device *dev,
+ struct cfg80211_ftm_responder_stats *ftm_stats)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ return drv_get_ftm_responder_stats(local, sdata, ftm_stats);
+}
+
+static int
+ieee80211_start_pmsr(struct wiphy *wiphy, struct wireless_dev *dev,
+ struct cfg80211_pmsr_request *request)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(dev);
+
+ return drv_start_pmsr(local, sdata, request);
+}
+
+static void
+ieee80211_abort_pmsr(struct wiphy *wiphy, struct wireless_dev *dev,
+ struct cfg80211_pmsr_request *request)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(dev);
+
+ return drv_abort_pmsr(local, sdata, request);
+}
+
+static int ieee80211_set_tid_config(struct wiphy *wiphy,
+ struct net_device *dev,
+ struct cfg80211_tid_config *tid_conf)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct sta_info *sta;
+
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
+ if (!sdata->local->ops->set_tid_config)
+ return -EOPNOTSUPP;
+
+ if (!tid_conf->peer)
+ return drv_set_tid_config(sdata->local, sdata, NULL, tid_conf);
+
+ sta = sta_info_get_bss(sdata, tid_conf->peer);
+ if (!sta)
+ return -ENOENT;
+
+ return drv_set_tid_config(sdata->local, sdata, &sta->sta, tid_conf);
+}
+
+static int ieee80211_reset_tid_config(struct wiphy *wiphy,
+ struct net_device *dev,
+ const u8 *peer, u8 tids)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct sta_info *sta;
+
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
+ if (!sdata->local->ops->reset_tid_config)
+ return -EOPNOTSUPP;
+
+ if (!peer)
+ return drv_reset_tid_config(sdata->local, sdata, NULL, tids);
+
+ sta = sta_info_get_bss(sdata, peer);
+ if (!sta)
+ return -ENOENT;
+
+ return drv_reset_tid_config(sdata->local, sdata, &sta->sta, tids);
+}
+
+static int ieee80211_set_sar_specs(struct wiphy *wiphy,
+ struct cfg80211_sar_specs *sar)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+
+ if (!local->ops->set_sar_specs)
+ return -EOPNOTSUPP;
+
+ return local->ops->set_sar_specs(&local->hw, sar);
+}
+
+static int
+ieee80211_set_after_color_change_beacon(struct ieee80211_link_data *link,
+ u64 *changed)
+{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP: {
+ int ret;
+
+ if (!link->u.ap.next_beacon)
+ return -EINVAL;
+
+ ret = ieee80211_assign_beacon(sdata, link,
+ link->u.ap.next_beacon,
+ NULL, NULL, changed);
+ ieee80211_free_next_beacon(link);
+
+ if (ret < 0)
+ return ret;
+
+ break;
+ }
+ default:
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int
+ieee80211_set_color_change_beacon(struct ieee80211_link_data *link,
+ struct cfg80211_color_change_settings *params,
+ u64 *changed)
+{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+ struct ieee80211_color_change_settings color_change = {};
+ int err;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP:
+ link->u.ap.next_beacon =
+ cfg80211_beacon_dup(&params->beacon_next);
+ if (!link->u.ap.next_beacon)
+ return -ENOMEM;
+
+ if (params->count <= 1)
+ break;
+
+ color_change.counter_offset_beacon =
+ params->counter_offset_beacon;
+ color_change.counter_offset_presp =
+ params->counter_offset_presp;
+ color_change.count = params->count;
+
+ err = ieee80211_assign_beacon(sdata, link,
+ &params->beacon_color_change,
+ NULL, &color_change, changed);
+ if (err < 0) {
+ ieee80211_free_next_beacon(link);
+ return err;
+ }
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
+static void
+ieee80211_color_change_bss_config_notify(struct ieee80211_link_data *link,
+ u8 color, int enable, u64 changed)
+{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
+ link->conf->he_bss_color.color = color;
+ link->conf->he_bss_color.enabled = enable;
+ changed |= BSS_CHANGED_HE_BSS_COLOR;
+
+ ieee80211_link_info_change_notify(sdata, link, changed);
+
+ if (!link->conf->nontransmitted &&
+ rcu_access_pointer(link->conf->tx_bss_conf)) {
+ struct ieee80211_link_data *tmp;
+
+ for_each_sdata_link(sdata->local, tmp) {
+ if (tmp->sdata == sdata ||
+ rcu_access_pointer(tmp->conf->tx_bss_conf) != link->conf)
+ continue;
+
+ tmp->conf->he_bss_color.color = color;
+ tmp->conf->he_bss_color.enabled = enable;
+ ieee80211_link_info_change_notify(tmp->sdata, tmp,
+ BSS_CHANGED_HE_BSS_COLOR);
+ }
+ }
+}
+
+static int ieee80211_color_change_finalize(struct ieee80211_link_data *link)
+{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+ struct ieee80211_local *local = sdata->local;
+ u64 changed = 0;
+ int err;
+
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ link->conf->color_change_active = false;
+
+ err = ieee80211_set_after_color_change_beacon(link, &changed);
+ if (err) {
+ cfg80211_color_change_aborted_notify(sdata->dev, link->link_id);
+ return err;
+ }
+
+ ieee80211_color_change_bss_config_notify(link,
+ link->conf->color_change_color,
+ 1, changed);
+ cfg80211_color_change_notify(sdata->dev, link->link_id);
+
+ return 0;
+}
+
+void ieee80211_color_change_finalize_work(struct wiphy *wiphy,
+ struct wiphy_work *work)
+{
+ struct ieee80211_link_data *link =
+ container_of(work, struct ieee80211_link_data,
+ color_change_finalize_work);
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+ struct ieee80211_bss_conf *link_conf = link->conf;
+ struct ieee80211_local *local = sdata->local;
+
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ /* AP might have been stopped while waiting for the lock. */
+ if (!link_conf->color_change_active)
+ return;
+
+ if (!ieee80211_sdata_running(sdata))
+ return;
+
+ ieee80211_color_change_finalize(link);
+}
+
+void ieee80211_color_collision_detection_work(struct wiphy *wiphy,
+ struct wiphy_work *work)
+{
+ struct ieee80211_link_data *link =
+ container_of(work, struct ieee80211_link_data,
+ color_collision_detect_work.work);
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+
+ cfg80211_obss_color_collision_notify(sdata->dev, link->color_bitmap,
+ link->link_id);
+}
+
+void ieee80211_color_change_finish(struct ieee80211_vif *vif, u8 link_id)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_link_data *link;
+
+ if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS))
+ return;
+
+ rcu_read_lock();
+
+ link = rcu_dereference(sdata->link[link_id]);
+ if (WARN_ON(!link)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ wiphy_work_queue(sdata->local->hw.wiphy,
+ &link->color_change_finalize_work);
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(ieee80211_color_change_finish);
+
+void
+ieee80211_obss_color_collision_notify(struct ieee80211_vif *vif,
+ u64 color_bitmap, u8 link_id)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+ struct ieee80211_link_data *link;
+
+ if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS))
+ return;
+
+ rcu_read_lock();
+
+ link = rcu_dereference(sdata->link[link_id]);
+ if (WARN_ON(!link)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ if (link->conf->color_change_active || link->conf->csa_active) {
+ rcu_read_unlock();
+ return;
+ }
+
+ if (wiphy_delayed_work_pending(sdata->local->hw.wiphy,
+ &link->color_collision_detect_work)) {
+ rcu_read_unlock();
+ return;
+ }
+
+ link->color_bitmap = color_bitmap;
+ /* queue the color collision detection event every 500 ms in order to
+ * avoid sending too much netlink messages to userspace.
+ */
+ wiphy_delayed_work_queue(sdata->local->hw.wiphy,
+ &link->color_collision_detect_work,
+ msecs_to_jiffies(500));
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(ieee80211_obss_color_collision_notify);
+
+static int
+ieee80211_color_change(struct wiphy *wiphy, struct net_device *dev,
+ struct cfg80211_color_change_settings *params)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_bss_conf *link_conf;
+ struct ieee80211_link_data *link;
+ u8 link_id = params->link_id;
+ u64 changed = 0;
+ int err;
+
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ if (WARN_ON(link_id >= IEEE80211_MLD_MAX_NUM_LINKS))
+ return -EINVAL;
+
+ link = wiphy_dereference(wiphy, sdata->link[link_id]);
+ if (!link)
+ return -ENOLINK;
+
+ link_conf = link->conf;
+
+ if (link_conf->nontransmitted)
+ return -EINVAL;
+
+ /* don't allow another color change if one is already active or if csa
+ * is active
+ */
+ if (link_conf->color_change_active || link_conf->csa_active) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ err = ieee80211_set_unsol_bcast_probe_resp(sdata,
+ &params->unsol_bcast_probe_resp,
+ link, link_conf, &changed);
+ if (err)
+ goto out;
+
+ err = ieee80211_set_color_change_beacon(link, params, &changed);
+ if (err)
+ goto out;
+
+ link_conf->color_change_active = true;
+ link_conf->color_change_color = params->color;
+
+ cfg80211_color_change_started_notify(sdata->dev, params->count, link_id);
+
+ if (changed)
+ ieee80211_color_change_bss_config_notify(link, 0, 0, changed);
+ else
+ /* if the beacon didn't change, we can finalize immediately */
+ ieee80211_color_change_finalize(link);
+
+out:
+
+ return err;
+}
+
+static int
+ieee80211_set_radar_background(struct wiphy *wiphy,
+ struct cfg80211_chan_def *chandef)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+
+ if (!local->ops->set_radar_background)
+ return -EOPNOTSUPP;
+
+ return local->ops->set_radar_background(&local->hw, chandef);
+}
+
+static int ieee80211_add_intf_link(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ unsigned int link_id)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
+ if (wdev->use_4addr)
+ return -EOPNOTSUPP;
+
+ return ieee80211_vif_set_links(sdata, wdev->valid_links, 0);
+}
+
+static void ieee80211_del_intf_link(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ unsigned int link_id)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ u16 new_links = wdev->valid_links & ~BIT(link_id);
+
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
+ /* During the link teardown process, certain functions require the
+ * link_id to remain in the valid_links bitmap. Therefore, instead
+ * of removing the link_id from the bitmap, pass a masked value to
+ * simulate as if link_id does not exist anymore.
+ */
+ ieee80211_vif_set_links(sdata, new_links, 0);
+}
+
+static int
+ieee80211_add_link_station(struct wiphy *wiphy, struct net_device *dev,
+ struct link_station_parameters *params)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct sta_info *sta;
+ int ret;
+
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ sta = sta_info_get_bss(sdata, params->mld_mac);
+ if (!sta)
+ return -ENOENT;
+
+ if (!sta->sta.valid_links)
+ return -EINVAL;
+
+ if (sta->sta.valid_links & BIT(params->link_id))
+ return -EALREADY;
+
+ ret = ieee80211_sta_allocate_link(sta, params->link_id);
+ if (ret)
+ return ret;
+
+ ret = sta_link_apply_parameters(local, sta, STA_LINK_MODE_NEW, params);
+ if (ret) {
+ ieee80211_sta_free_link(sta, params->link_id);
+ return ret;
+ }
+
+ if (test_sta_flag(sta, WLAN_STA_ASSOC)) {
+ struct link_sta_info *link_sta;
+
+ link_sta = sdata_dereference(sta->link[params->link_id], sdata);
+ rate_control_rate_init(link_sta);
+ }
+
+ /* ieee80211_sta_activate_link frees the link upon failure */
+ return ieee80211_sta_activate_link(sta, params->link_id);
+}
+
+static int
+ieee80211_mod_link_station(struct wiphy *wiphy, struct net_device *dev,
+ struct link_station_parameters *params)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+ struct sta_info *sta;
+
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ sta = sta_info_get_bss(sdata, params->mld_mac);
+ if (!sta)
+ return -ENOENT;
+
+ if (!(sta->sta.valid_links & BIT(params->link_id)))
+ return -EINVAL;
+
+ return sta_link_apply_parameters(local, sta, STA_LINK_MODE_LINK_MODIFY,
+ params);
+}
+
+static int
+ieee80211_del_link_station(struct wiphy *wiphy, struct net_device *dev,
+ struct link_station_del_parameters *params)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct sta_info *sta;
+
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
+ sta = sta_info_get_bss(sdata, params->mld_mac);
+ if (!sta)
+ return -ENOENT;
+
+ if (!(sta->sta.valid_links & BIT(params->link_id)))
+ return -EINVAL;
+
+ /* must not create a STA without links */
+ if (sta->sta.valid_links == BIT(params->link_id))
+ return -EINVAL;
+
+ ieee80211_sta_remove_link(sta, params->link_id);
+
+ return 0;
+}
+
+static int ieee80211_set_hw_timestamp(struct wiphy *wiphy,
+ struct net_device *dev,
+ struct cfg80211_set_hw_timestamp *hwts)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+ struct ieee80211_local *local = sdata->local;
+
+ if (!local->ops->set_hw_timestamp)
+ return -EOPNOTSUPP;
+
+ if (!check_sdata_in_driver(sdata))
+ return -EIO;
+
+ return local->ops->set_hw_timestamp(&local->hw, &sdata->vif, hwts);
+}
+
+static int
+ieee80211_set_ttlm(struct wiphy *wiphy, struct net_device *dev,
+ struct cfg80211_ttlm_params *params)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
+ return ieee80211_req_neg_ttlm(sdata, params);
+}
+
+static int
+ieee80211_assoc_ml_reconf(struct wiphy *wiphy, struct net_device *dev,
+ struct cfg80211_ml_reconf_req *req)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
+ return ieee80211_mgd_assoc_ml_reconf(sdata, req);
+}
+
+static int
+ieee80211_set_epcs(struct wiphy *wiphy, struct net_device *dev, bool enable)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ return ieee80211_mgd_set_epcs(sdata, enable);
+}
+
const struct cfg80211_ops mac80211_config_ops = {
.add_virtual_intf = ieee80211_add_iface,
.del_virtual_intf = ieee80211_del_iface,
@@ -3819,6 +5544,7 @@ const struct cfg80211_ops mac80211_config_ops = {
.get_key = ieee80211_get_key,
.set_default_key = ieee80211_config_default_key,
.set_default_mgmt_key = ieee80211_config_default_mgmt_key,
+ .set_default_beacon_key = ieee80211_config_default_beacon_key,
.start_ap = ieee80211_start_ap,
.change_beacon = ieee80211_change_beacon,
.stop_ap = ieee80211_stop_ap,
@@ -3844,6 +5570,7 @@ const struct cfg80211_ops mac80211_config_ops = {
.join_ocb = ieee80211_join_ocb,
.leave_ocb = ieee80211_leave_ocb,
.change_bss = ieee80211_change_bss,
+ .inform_bss = ieee80211_inform_bss,
.set_txq_params = ieee80211_set_txq_params,
.set_monitor_channel = ieee80211_set_monitor_channel,
.suspend = ieee80211_suspend,
@@ -3862,7 +5589,6 @@ const struct cfg80211_ops mac80211_config_ops = {
.set_wiphy_params = ieee80211_set_wiphy_params,
.set_tx_power = ieee80211_set_tx_power,
.get_tx_power = ieee80211_get_tx_power,
- .set_wds_peer = ieee80211_set_wds_peer,
.rfkill_poll = ieee80211_rfkill_poll,
CFG80211_TESTMODE_CMD(ieee80211_testmode_cmd)
CFG80211_TESTMODE_DUMP(ieee80211_testmode_dump)
@@ -3874,7 +5600,8 @@ const struct cfg80211_ops mac80211_config_ops = {
.mgmt_tx_cancel_wait = ieee80211_mgmt_tx_cancel_wait,
.set_cqm_rssi_config = ieee80211_set_cqm_rssi_config,
.set_cqm_rssi_range_config = ieee80211_set_cqm_rssi_range_config,
- .mgmt_frame_register = ieee80211_mgmt_frame_register,
+ .update_mgmt_frame_registrations =
+ ieee80211_update_mgmt_frame_registrations,
.set_antenna = ieee80211_set_antenna,
.get_antenna = ieee80211_get_antenna,
.set_rekey_data = ieee80211_set_rekey_data,
@@ -3889,6 +5616,7 @@ const struct cfg80211_ops mac80211_config_ops = {
#endif
.get_channel = ieee80211_cfg_get_channel,
.start_radar_detection = ieee80211_start_radar_detection,
+ .end_cac = ieee80211_end_cac,
.channel_switch = ieee80211_channel_switch,
.set_qos_map = ieee80211_set_qos_map,
.set_ap_chanwidth = ieee80211_set_ap_chanwidth,
@@ -3902,4 +5630,23 @@ const struct cfg80211_ops mac80211_config_ops = {
.set_multicast_to_unicast = ieee80211_set_multicast_to_unicast,
.tx_control_port = ieee80211_tx_control_port,
.get_txq_stats = ieee80211_get_txq_stats,
+ .get_ftm_responder_stats = ieee80211_get_ftm_responder_stats,
+ .start_pmsr = ieee80211_start_pmsr,
+ .abort_pmsr = ieee80211_abort_pmsr,
+ .probe_mesh_link = ieee80211_probe_mesh_link,
+ .set_tid_config = ieee80211_set_tid_config,
+ .reset_tid_config = ieee80211_reset_tid_config,
+ .set_sar_specs = ieee80211_set_sar_specs,
+ .color_change = ieee80211_color_change,
+ .set_radar_background = ieee80211_set_radar_background,
+ .add_intf_link = ieee80211_add_intf_link,
+ .del_intf_link = ieee80211_del_intf_link,
+ .add_link_station = ieee80211_add_link_station,
+ .mod_link_station = ieee80211_mod_link_station,
+ .del_link_station = ieee80211_del_link_station,
+ .set_hw_timestamp = ieee80211_set_hw_timestamp,
+ .set_ttlm = ieee80211_set_ttlm,
+ .get_radio_mask = ieee80211_get_radio_mask,
+ .assoc_ml_reconf = ieee80211_assoc_ml_reconf,
+ .set_epcs = ieee80211_set_epcs,
};
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index d9558ffb8acf..d0bfb1216401 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -1,5 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mac80211 - channel management
+ * Copyright 2020 - 2025 Intel Corporation
*/
#include <linux/nl80211.h>
@@ -8,16 +10,133 @@
#include <net/cfg80211.h>
#include "ieee80211_i.h"
#include "driver-ops.h"
+#include "rate.h"
+
+struct ieee80211_chanctx_user_iter {
+ struct ieee80211_chan_req *chanreq;
+ struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_link_data *link;
+ enum nl80211_iftype iftype;
+ bool reserved, radar_required, done;
+ enum {
+ CHANCTX_ITER_POS_ASSIGNED,
+ CHANCTX_ITER_POS_RESERVED,
+ CHANCTX_ITER_POS_DONE,
+ } per_link;
+};
+
+enum ieee80211_chanctx_iter_type {
+ CHANCTX_ITER_ALL,
+ CHANCTX_ITER_RESERVED,
+ CHANCTX_ITER_ASSIGNED,
+};
+
+static void ieee80211_chanctx_user_iter_next(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx,
+ struct ieee80211_chanctx_user_iter *iter,
+ enum ieee80211_chanctx_iter_type type,
+ bool start)
+{
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ if (start) {
+ memset(iter, 0, sizeof(*iter));
+ goto next_interface;
+ }
+
+next_link:
+ for (int link_id = iter->link ? iter->link->link_id : 0;
+ link_id < ARRAY_SIZE(iter->sdata->link);
+ link_id++) {
+ struct ieee80211_link_data *link;
+
+ link = sdata_dereference(iter->sdata->link[link_id],
+ iter->sdata);
+ if (!link)
+ continue;
+
+ switch (iter->per_link) {
+ case CHANCTX_ITER_POS_ASSIGNED:
+ iter->per_link = CHANCTX_ITER_POS_RESERVED;
+ if (type != CHANCTX_ITER_RESERVED &&
+ rcu_access_pointer(link->conf->chanctx_conf) == &ctx->conf) {
+ iter->link = link;
+ iter->reserved = false;
+ iter->radar_required = link->radar_required;
+ iter->chanreq = &link->conf->chanreq;
+ return;
+ }
+ fallthrough;
+ case CHANCTX_ITER_POS_RESERVED:
+ iter->per_link = CHANCTX_ITER_POS_DONE;
+ if (type != CHANCTX_ITER_ASSIGNED &&
+ link->reserved_chanctx == ctx) {
+ iter->link = link;
+ iter->reserved = true;
+ iter->radar_required =
+ link->reserved_radar_required;
+
+ iter->chanreq = &link->reserved;
+ return;
+ }
+ fallthrough;
+ case CHANCTX_ITER_POS_DONE:
+ iter->per_link = CHANCTX_ITER_POS_ASSIGNED;
+ continue;
+ }
+ }
+
+next_interface:
+ /* next (or first) interface */
+ iter->sdata = list_prepare_entry(iter->sdata, &local->interfaces, list);
+ list_for_each_entry_continue(iter->sdata, &local->interfaces, list) {
+ /* AP_VLAN has a chanctx pointer but follows AP */
+ if (iter->sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ continue;
+
+ iter->link = NULL;
+ iter->per_link = CHANCTX_ITER_POS_ASSIGNED;
+ iter->iftype = iter->sdata->vif.type;
+ goto next_link;
+ }
+
+ iter->done = true;
+}
+
+#define for_each_chanctx_user_assigned(local, ctx, iter) \
+ for (ieee80211_chanctx_user_iter_next(local, ctx, iter, \
+ CHANCTX_ITER_ASSIGNED, \
+ true); \
+ !((iter)->done); \
+ ieee80211_chanctx_user_iter_next(local, ctx, iter, \
+ CHANCTX_ITER_ASSIGNED, \
+ false))
+
+#define for_each_chanctx_user_reserved(local, ctx, iter) \
+ for (ieee80211_chanctx_user_iter_next(local, ctx, iter, \
+ CHANCTX_ITER_RESERVED, \
+ true); \
+ !((iter)->done); \
+ ieee80211_chanctx_user_iter_next(local, ctx, iter, \
+ CHANCTX_ITER_RESERVED, \
+ false))
+
+#define for_each_chanctx_user_all(local, ctx, iter) \
+ for (ieee80211_chanctx_user_iter_next(local, ctx, iter, \
+ CHANCTX_ITER_ALL, \
+ true); \
+ !((iter)->done); \
+ ieee80211_chanctx_user_iter_next(local, ctx, iter, \
+ CHANCTX_ITER_ALL, \
+ false))
static int ieee80211_chanctx_num_assigned(struct ieee80211_local *local,
struct ieee80211_chanctx *ctx)
{
- struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_chanctx_user_iter iter;
int num = 0;
- lockdep_assert_held(&local->chanctx_mtx);
-
- list_for_each_entry(sdata, &ctx->assigned_vifs, assigned_chanctx_list)
+ for_each_chanctx_user_assigned(local, ctx, &iter)
num++;
return num;
@@ -26,12 +145,10 @@ static int ieee80211_chanctx_num_assigned(struct ieee80211_local *local,
static int ieee80211_chanctx_num_reserved(struct ieee80211_local *local,
struct ieee80211_chanctx *ctx)
{
- struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_chanctx_user_iter iter;
int num = 0;
- lockdep_assert_held(&local->chanctx_mtx);
-
- list_for_each_entry(sdata, &ctx->reserved_vifs, reserved_chanctx_list)
+ for_each_chanctx_user_reserved(local, ctx, &iter)
num++;
return num;
@@ -40,122 +157,168 @@ static int ieee80211_chanctx_num_reserved(struct ieee80211_local *local,
int ieee80211_chanctx_refcount(struct ieee80211_local *local,
struct ieee80211_chanctx *ctx)
{
- return ieee80211_chanctx_num_assigned(local, ctx) +
- ieee80211_chanctx_num_reserved(local, ctx);
+ struct ieee80211_chanctx_user_iter iter;
+ int num = 0;
+
+ for_each_chanctx_user_all(local, ctx, &iter)
+ num++;
+
+ return num;
}
-static int ieee80211_num_chanctx(struct ieee80211_local *local)
+static int ieee80211_num_chanctx(struct ieee80211_local *local, int radio_idx)
{
struct ieee80211_chanctx *ctx;
int num = 0;
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
- list_for_each_entry(ctx, &local->chanctx_list, list)
+ list_for_each_entry(ctx, &local->chanctx_list, list) {
+ if (radio_idx >= 0 && ctx->conf.radio_idx != radio_idx)
+ continue;
num++;
+ }
return num;
}
-static bool ieee80211_can_create_new_chanctx(struct ieee80211_local *local)
+static bool ieee80211_can_create_new_chanctx(struct ieee80211_local *local,
+ int radio_idx)
{
- lockdep_assert_held(&local->chanctx_mtx);
- return ieee80211_num_chanctx(local) < ieee80211_max_num_channels(local);
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ return ieee80211_num_chanctx(local, radio_idx) <
+ ieee80211_max_num_channels(local, radio_idx);
}
static struct ieee80211_chanctx *
-ieee80211_vif_get_chanctx(struct ieee80211_sub_if_data *sdata)
+ieee80211_link_get_chanctx(struct ieee80211_link_data *link)
{
- struct ieee80211_local *local __maybe_unused = sdata->local;
+ struct ieee80211_local *local __maybe_unused = link->sdata->local;
struct ieee80211_chanctx_conf *conf;
- conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
- lockdep_is_held(&local->chanctx_mtx));
+ conf = rcu_dereference_protected(link->conf->chanctx_conf,
+ lockdep_is_held(&local->hw.wiphy->mtx));
if (!conf)
return NULL;
return container_of(conf, struct ieee80211_chanctx, conf);
}
-static const struct cfg80211_chan_def *
-ieee80211_chanctx_reserved_chandef(struct ieee80211_local *local,
- struct ieee80211_chanctx *ctx,
- const struct cfg80211_chan_def *compat)
+bool ieee80211_chanreq_identical(const struct ieee80211_chan_req *a,
+ const struct ieee80211_chan_req *b)
{
- struct ieee80211_sub_if_data *sdata;
+ if (!cfg80211_chandef_identical(&a->oper, &b->oper))
+ return false;
+ if (!a->ap.chan && !b->ap.chan)
+ return true;
+ return cfg80211_chandef_identical(&a->ap, &b->ap);
+}
- lockdep_assert_held(&local->chanctx_mtx);
+static const struct ieee80211_chan_req *
+ieee80211_chanreq_compatible(const struct ieee80211_chan_req *a,
+ const struct ieee80211_chan_req *b,
+ struct ieee80211_chan_req *tmp)
+{
+ const struct cfg80211_chan_def *compat;
- list_for_each_entry(sdata, &ctx->reserved_vifs,
- reserved_chanctx_list) {
- if (!compat)
- compat = &sdata->reserved_chandef;
+ if (a->ap.chan && b->ap.chan &&
+ !cfg80211_chandef_identical(&a->ap, &b->ap))
+ return NULL;
- compat = cfg80211_chandef_compatible(&sdata->reserved_chandef,
- compat);
- if (!compat)
- break;
- }
+ compat = cfg80211_chandef_compatible(&a->oper, &b->oper);
+ if (!compat)
+ return NULL;
- return compat;
+ /* Note: later code assumes this always fills & returns tmp if compat */
+ tmp->oper = *compat;
+ tmp->ap = a->ap.chan ? a->ap : b->ap;
+ return tmp;
}
-static const struct cfg80211_chan_def *
-ieee80211_chanctx_non_reserved_chandef(struct ieee80211_local *local,
- struct ieee80211_chanctx *ctx,
- const struct cfg80211_chan_def *compat)
+static const struct ieee80211_chan_req *
+ieee80211_chanctx_compatible(struct ieee80211_chanctx *ctx,
+ const struct ieee80211_chan_req *req,
+ struct ieee80211_chan_req *tmp)
{
- struct ieee80211_sub_if_data *sdata;
+ const struct ieee80211_chan_req *ret;
+ struct ieee80211_chan_req tmp2;
- lockdep_assert_held(&local->chanctx_mtx);
+ *tmp = (struct ieee80211_chan_req){
+ .oper = ctx->conf.def,
+ .ap = ctx->conf.ap,
+ };
- list_for_each_entry(sdata, &ctx->assigned_vifs,
- assigned_chanctx_list) {
- if (sdata->reserved_chanctx != NULL)
- continue;
+ ret = ieee80211_chanreq_compatible(tmp, req, &tmp2);
+ if (!ret)
+ return NULL;
+ *tmp = *ret;
+ return tmp;
+}
- if (!compat)
- compat = &sdata->vif.bss_conf.chandef;
+static const struct ieee80211_chan_req *
+ieee80211_chanctx_reserved_chanreq(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx,
+ const struct ieee80211_chan_req *req,
+ struct ieee80211_chan_req *tmp)
+{
+ struct ieee80211_chanctx_user_iter iter;
- compat = cfg80211_chandef_compatible(
- &sdata->vif.bss_conf.chandef, compat);
- if (!compat)
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ if (WARN_ON(!req))
+ return NULL;
+
+ for_each_chanctx_user_reserved(local, ctx, &iter) {
+ req = ieee80211_chanreq_compatible(iter.chanreq, req, tmp);
+ if (!req)
break;
}
- return compat;
+ return req;
}
-static const struct cfg80211_chan_def *
-ieee80211_chanctx_combined_chandef(struct ieee80211_local *local,
- struct ieee80211_chanctx *ctx,
- const struct cfg80211_chan_def *compat)
+static const struct ieee80211_chan_req *
+ieee80211_chanctx_non_reserved_chandef(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx,
+ const struct ieee80211_chan_req *compat,
+ struct ieee80211_chan_req *tmp)
{
- lockdep_assert_held(&local->chanctx_mtx);
+ const struct ieee80211_chan_req *comp_def = compat;
+ struct ieee80211_chanctx_user_iter iter;
- compat = ieee80211_chanctx_reserved_chandef(local, ctx, compat);
- if (!compat)
- return NULL;
+ lockdep_assert_wiphy(local->hw.wiphy);
- compat = ieee80211_chanctx_non_reserved_chandef(local, ctx, compat);
- if (!compat)
- return NULL;
+ for_each_chanctx_user_assigned(local, ctx, &iter) {
+ if (iter.link->reserved_chanctx)
+ continue;
- return compat;
+ comp_def = ieee80211_chanreq_compatible(iter.chanreq,
+ comp_def, tmp);
+ if (!comp_def)
+ break;
+ }
+
+ return comp_def;
}
static bool
-ieee80211_chanctx_can_reserve_chandef(struct ieee80211_local *local,
- struct ieee80211_chanctx *ctx,
- const struct cfg80211_chan_def *def)
+ieee80211_chanctx_can_reserve(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx,
+ const struct ieee80211_chan_req *req)
{
- lockdep_assert_held(&local->chanctx_mtx);
+ struct ieee80211_chan_req tmp;
- if (ieee80211_chanctx_combined_chandef(local, ctx, def))
- return true;
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ if (!ieee80211_chanctx_reserved_chanreq(local, ctx, req, &tmp))
+ return false;
+
+ if (!ieee80211_chanctx_non_reserved_chandef(local, ctx, req, &tmp))
+ return false;
- if (!list_empty(&ctx->reserved_vifs) &&
- ieee80211_chanctx_reserved_chandef(local, ctx, def))
+ if (ieee80211_chanctx_num_reserved(local, ctx) != 0 &&
+ ieee80211_chanctx_reserved_chanreq(local, ctx, req, &tmp))
return true;
return false;
@@ -163,12 +326,12 @@ ieee80211_chanctx_can_reserve_chandef(struct ieee80211_local *local,
static struct ieee80211_chanctx *
ieee80211_find_reservation_chanctx(struct ieee80211_local *local,
- const struct cfg80211_chan_def *chandef,
+ const struct ieee80211_chan_req *chanreq,
enum ieee80211_chanctx_mode mode)
{
struct ieee80211_chanctx *ctx;
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
if (mode == IEEE80211_CHANCTX_EXCLUSIVE)
return NULL;
@@ -180,8 +343,7 @@ ieee80211_find_reservation_chanctx(struct ieee80211_local *local,
if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE)
continue;
- if (!ieee80211_chanctx_can_reserve_chandef(local, ctx,
- chandef))
+ if (!ieee80211_chanctx_can_reserve(local, ctx, chanreq))
continue;
return ctx;
@@ -190,11 +352,30 @@ ieee80211_find_reservation_chanctx(struct ieee80211_local *local,
return NULL;
}
-enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta)
+static enum nl80211_chan_width ieee80211_get_sta_bw(struct sta_info *sta,
+ unsigned int link_id)
{
- switch (sta->bandwidth) {
+ enum ieee80211_sta_rx_bandwidth width;
+ struct link_sta_info *link_sta;
+
+ link_sta = wiphy_dereference(sta->local->hw.wiphy, sta->link[link_id]);
+
+ /* no effect if this STA has no presence on this link */
+ if (!link_sta)
+ return NL80211_CHAN_WIDTH_20_NOHT;
+
+ /*
+ * We assume that TX/RX might be asymmetric (so e.g. VHT operating
+ * mode notification changes what a STA wants to receive, but not
+ * necessarily what it will transmit to us), and therefore use the
+ * capabilities here. Calling it RX bandwidth capability is a bit
+ * wrong though, since capabilities are in fact symmetric.
+ */
+ width = ieee80211_sta_cap_rx_bw(link_sta);
+
+ switch (width) {
case IEEE80211_STA_RX_BW_20:
- if (sta->ht_cap.ht_supported)
+ if (link_sta->pub->ht_cap.ht_supported)
return NL80211_CHAN_WIDTH_20;
else
return NL80211_CHAN_WIDTH_20_NOHT;
@@ -213,6 +394,8 @@ enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta)
* might be smaller than the configured bw (160).
*/
return NL80211_CHAN_WIDTH_160;
+ case IEEE80211_STA_RX_BW_320:
+ return NL80211_CHAN_WIDTH_320;
default:
WARN_ON(1);
return NL80211_CHAN_WIDTH_20;
@@ -220,81 +403,99 @@ enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta)
}
static enum nl80211_chan_width
-ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata)
+ieee80211_get_max_required_bw(struct ieee80211_link_data *link)
{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+ unsigned int link_id = link->link_id;
enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT;
struct sta_info *sta;
- rcu_read_lock();
- list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) {
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
+ list_for_each_entry(sta, &sdata->local->sta_list, list) {
if (sdata != sta->sdata &&
!(sta->sdata->bss && sta->sdata->bss == sdata->bss))
continue;
- max_bw = max(max_bw, ieee80211_get_sta_bw(&sta->sta));
+ max_bw = max(max_bw, ieee80211_get_sta_bw(sta, link_id));
}
- rcu_read_unlock();
return max_bw;
}
static enum nl80211_chan_width
ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local,
- struct ieee80211_chanctx_conf *conf)
+ struct ieee80211_chanctx *ctx,
+ struct ieee80211_link_data *rsvd_for,
+ bool check_reserved)
{
struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_link_data *link;
enum nl80211_chan_width max_bw = NL80211_CHAN_WIDTH_20_NOHT;
- rcu_read_lock();
- list_for_each_entry_rcu(sdata, &local->interfaces, list) {
- struct ieee80211_vif *vif = &sdata->vif;
- enum nl80211_chan_width width = NL80211_CHAN_WIDTH_20_NOHT;
+ if (WARN_ON(check_reserved && rsvd_for))
+ return ctx->conf.def.width;
- if (!ieee80211_sdata_running(sdata))
- continue;
+ for_each_sdata_link(local, link) {
+ enum nl80211_chan_width width = NL80211_CHAN_WIDTH_20_NOHT;
- if (rcu_access_pointer(sdata->vif.chanctx_conf) != conf)
+ if (check_reserved) {
+ if (link->reserved_chanctx != ctx)
+ continue;
+ } else if (link != rsvd_for &&
+ rcu_access_pointer(link->conf->chanctx_conf) != &ctx->conf)
continue;
- switch (vif->type) {
- case NL80211_IFTYPE_AP:
- case NL80211_IFTYPE_AP_VLAN:
- width = ieee80211_get_max_required_bw(sdata);
- break;
+ switch (link->sdata->vif.type) {
case NL80211_IFTYPE_STATION:
+ if (!link->sdata->vif.cfg.assoc) {
+ /*
+ * The AP's sta->bandwidth may not yet be set
+ * at this point (pre-association), so simply
+ * take the width from the chandef. We cannot
+ * have TDLS peers yet (only after association).
+ */
+ width = link->conf->chanreq.oper.width;
+ break;
+ }
/*
- * The ap's sta->bandwidth is not set yet at this
- * point, so take the width from the chandef, but
- * account also for TDLS peers
+ * otherwise just use min_def like in AP, depending on what
+ * we currently think the AP STA (and possibly TDLS peers)
+ * require(s)
*/
- width = max(vif->bss_conf.chandef.width,
- ieee80211_get_max_required_bw(sdata));
+ fallthrough;
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ width = ieee80211_get_max_required_bw(link);
break;
case NL80211_IFTYPE_P2P_DEVICE:
case NL80211_IFTYPE_NAN:
continue;
+ case NL80211_IFTYPE_MONITOR:
+ WARN_ON_ONCE(!ieee80211_hw_check(&local->hw,
+ NO_VIRTUAL_MONITOR));
+ fallthrough;
case NL80211_IFTYPE_ADHOC:
- case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_MESH_POINT:
case NL80211_IFTYPE_OCB:
- width = vif->bss_conf.chandef.width;
+ width = link->conf->chanreq.oper.width;
break;
+ case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_UNSPECIFIED:
case NUM_NL80211_IFTYPES:
- case NL80211_IFTYPE_MONITOR:
case NL80211_IFTYPE_P2P_CLIENT:
case NL80211_IFTYPE_P2P_GO:
WARN_ON_ONCE(1);
}
+
max_bw = max(max_bw, width);
}
/* use the configured bandwidth in case of monitor interface */
- sdata = rcu_dereference(local->monitor_sdata);
- if (sdata && rcu_access_pointer(sdata->vif.chanctx_conf) == conf)
- max_bw = max(max_bw, conf->def.width);
-
- rcu_read_unlock();
+ sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata);
+ if (sdata &&
+ rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) == &ctx->conf)
+ max_bw = max(max_bw, ctx->conf.def.width);
return max_bw;
}
@@ -304,74 +505,247 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local,
* the max of min required widths of all the interfaces bound to this
* channel context.
*/
-void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local,
- struct ieee80211_chanctx *ctx)
+static u32
+__ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx,
+ struct ieee80211_link_data *rsvd_for,
+ bool check_reserved)
{
enum nl80211_chan_width max_bw;
struct cfg80211_chan_def min_def;
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
- /* don't optimize 5MHz, 10MHz, and radar_enabled confs */
+ /* don't optimize non-20MHz based and radar_enabled confs */
if (ctx->conf.def.width == NL80211_CHAN_WIDTH_5 ||
ctx->conf.def.width == NL80211_CHAN_WIDTH_10 ||
+ ctx->conf.def.width == NL80211_CHAN_WIDTH_1 ||
+ ctx->conf.def.width == NL80211_CHAN_WIDTH_2 ||
+ ctx->conf.def.width == NL80211_CHAN_WIDTH_4 ||
+ ctx->conf.def.width == NL80211_CHAN_WIDTH_8 ||
+ ctx->conf.def.width == NL80211_CHAN_WIDTH_16 ||
ctx->conf.radar_enabled) {
ctx->conf.min_def = ctx->conf.def;
- return;
+ return 0;
}
- max_bw = ieee80211_get_chanctx_max_required_bw(local, &ctx->conf);
+ max_bw = ieee80211_get_chanctx_max_required_bw(local, ctx, rsvd_for,
+ check_reserved);
/* downgrade chandef up to max_bw */
min_def = ctx->conf.def;
while (min_def.width > max_bw)
- ieee80211_chandef_downgrade(&min_def);
+ ieee80211_chandef_downgrade(&min_def, NULL);
if (cfg80211_chandef_identical(&ctx->conf.min_def, &min_def))
- return;
+ return 0;
ctx->conf.min_def = min_def;
if (!ctx->driver_present)
- return;
+ return 0;
- drv_change_chanctx(local, ctx, IEEE80211_CHANCTX_CHANGE_MIN_WIDTH);
+ return IEEE80211_CHANCTX_CHANGE_MIN_DEF;
}
-static void ieee80211_change_chanctx(struct ieee80211_local *local,
+static void ieee80211_chan_bw_change(struct ieee80211_local *local,
struct ieee80211_chanctx *ctx,
- const struct cfg80211_chan_def *chandef)
+ bool reserved, bool narrowed)
{
- if (cfg80211_chandef_identical(&ctx->conf.def, chandef)) {
- ieee80211_recalc_chanctx_min_def(local, ctx);
+ struct sta_info *sta;
+ struct ieee80211_supported_band *sband =
+ local->hw.wiphy->bands[ctx->conf.def.chan->band];
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(sta, &local->sta_list,
+ list) {
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ enum ieee80211_sta_rx_bandwidth new_sta_bw;
+ unsigned int link_id;
+
+ if (!ieee80211_sdata_running(sta->sdata))
+ continue;
+
+ for (link_id = 0; link_id < ARRAY_SIZE(sta->sdata->link); link_id++) {
+ struct ieee80211_link_data *link =
+ rcu_dereference(sdata->link[link_id]);
+ struct ieee80211_bss_conf *link_conf;
+ struct cfg80211_chan_def *new_chandef;
+ struct link_sta_info *link_sta;
+
+ if (!link)
+ continue;
+
+ link_conf = link->conf;
+
+ if (rcu_access_pointer(link_conf->chanctx_conf) != &ctx->conf)
+ continue;
+
+ link_sta = rcu_dereference(sta->link[link_id]);
+ if (!link_sta)
+ continue;
+
+ if (reserved)
+ new_chandef = &link->reserved.oper;
+ else
+ new_chandef = &link_conf->chanreq.oper;
+
+ new_sta_bw = _ieee80211_sta_cur_vht_bw(link_sta,
+ new_chandef);
+
+ /* nothing changed */
+ if (new_sta_bw == link_sta->pub->bandwidth)
+ continue;
+
+ /* vif changed to narrow BW and narrow BW for station wasn't
+ * requested or vice versa */
+ if ((new_sta_bw < link_sta->pub->bandwidth) == !narrowed)
+ continue;
+
+ link_sta->pub->bandwidth = new_sta_bw;
+ rate_control_rate_update(local, sband, link_sta,
+ IEEE80211_RC_BW_CHANGED);
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * recalc the min required chan width of the channel context, which is
+ * the max of min required widths of all the interfaces bound to this
+ * channel context.
+ */
+static void
+_ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx,
+ struct ieee80211_link_data *rsvd_for,
+ bool check_reserved)
+{
+ u32 changed = __ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for,
+ check_reserved);
+
+ if (!changed)
return;
+
+ /* check if BW is narrowed */
+ ieee80211_chan_bw_change(local, ctx, false, true);
+
+ drv_change_chanctx(local, ctx, changed);
+
+ /* check if BW is wider */
+ ieee80211_chan_bw_change(local, ctx, false, false);
+}
+
+void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx)
+{
+ _ieee80211_recalc_chanctx_min_def(local, ctx, NULL, false);
+}
+
+static void _ieee80211_change_chanctx(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx,
+ struct ieee80211_chanctx *old_ctx,
+ const struct ieee80211_chan_req *chanreq,
+ struct ieee80211_link_data *rsvd_for)
+{
+ const struct cfg80211_chan_def *chandef = &chanreq->oper;
+ struct ieee80211_chan_req ctx_req = {
+ .oper = ctx->conf.def,
+ .ap = ctx->conf.ap,
+ };
+ u32 changed = 0;
+
+ /* 5/10 MHz not handled here */
+ switch (chandef->width) {
+ case NL80211_CHAN_WIDTH_1:
+ case NL80211_CHAN_WIDTH_2:
+ case NL80211_CHAN_WIDTH_4:
+ case NL80211_CHAN_WIDTH_8:
+ case NL80211_CHAN_WIDTH_16:
+ /*
+ * mac80211 currently only supports sharing identical
+ * chanctx's for S1G interfaces.
+ */
+ WARN_ON(!ieee80211_chanreq_identical(&ctx_req, chanreq));
+ return;
+ case NL80211_CHAN_WIDTH_20_NOHT:
+ case NL80211_CHAN_WIDTH_20:
+ case NL80211_CHAN_WIDTH_40:
+ case NL80211_CHAN_WIDTH_80:
+ case NL80211_CHAN_WIDTH_80P80:
+ case NL80211_CHAN_WIDTH_160:
+ case NL80211_CHAN_WIDTH_320:
+ break;
+ default:
+ WARN_ON(1);
}
- WARN_ON(!cfg80211_chandef_compatible(&ctx->conf.def, chandef));
+ /* Check whether the BW was narrowed - we do this _before_ calling
+ * recalc_chanctx_min_def since that may return early, e.g. in case a new
+ * context was added for the first time with all parameters up to date.
+ */
+ ieee80211_chan_bw_change(local, old_ctx, false, true);
- ctx->conf.def = *chandef;
- drv_change_chanctx(local, ctx, IEEE80211_CHANCTX_CHANGE_WIDTH);
- ieee80211_recalc_chanctx_min_def(local, ctx);
+ if (ieee80211_chanreq_identical(&ctx_req, chanreq)) {
+ _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for, false);
+ return;
+ }
+
+ WARN_ON(ieee80211_chanctx_refcount(local, ctx) > 1 &&
+ !cfg80211_chandef_compatible(&ctx->conf.def, &chanreq->oper));
- if (!local->use_chanctx) {
- local->_oper_chandef = *chandef;
- ieee80211_hw_config(local, 0);
+ ieee80211_remove_wbrf(local, &ctx->conf.def);
+
+ if (!cfg80211_chandef_identical(&ctx->conf.def, &chanreq->oper)) {
+ if (ctx->conf.def.width != chanreq->oper.width)
+ changed |= IEEE80211_CHANCTX_CHANGE_WIDTH;
+ if (ctx->conf.def.punctured != chanreq->oper.punctured)
+ changed |= IEEE80211_CHANCTX_CHANGE_PUNCTURING;
}
+ if (!cfg80211_chandef_identical(&ctx->conf.ap, &chanreq->ap))
+ changed |= IEEE80211_CHANCTX_CHANGE_AP;
+ ctx->conf.def = *chandef;
+ ctx->conf.ap = chanreq->ap;
+
+ /* check if min chanctx also changed */
+ changed |= __ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for,
+ false);
+
+ ieee80211_add_wbrf(local, &ctx->conf.def);
+
+ drv_change_chanctx(local, ctx, changed);
+
+ /* check if BW is wider */
+ ieee80211_chan_bw_change(local, old_ctx, false, false);
+}
+
+static void ieee80211_change_chanctx(struct ieee80211_local *local,
+ struct ieee80211_chanctx *ctx,
+ struct ieee80211_chanctx *old_ctx,
+ const struct ieee80211_chan_req *chanreq)
+{
+ _ieee80211_change_chanctx(local, ctx, old_ctx, chanreq, NULL);
}
+/* Note: if successful, the returned chanctx is reserved for the link */
static struct ieee80211_chanctx *
ieee80211_find_chanctx(struct ieee80211_local *local,
- const struct cfg80211_chan_def *chandef,
+ struct ieee80211_link_data *link,
+ const struct ieee80211_chan_req *chanreq,
enum ieee80211_chanctx_mode mode)
{
+ struct ieee80211_chan_req tmp;
struct ieee80211_chanctx *ctx;
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
if (mode == IEEE80211_CHANCTX_EXCLUSIVE)
return NULL;
+ if (WARN_ON(link->reserved_chanctx))
+ return NULL;
+
list_for_each_entry(ctx, &local->chanctx_list, list) {
- const struct cfg80211_chan_def *compat;
+ const struct ieee80211_chan_req *compat;
if (ctx->replace_state != IEEE80211_CHANCTX_REPLACE_NONE)
continue;
@@ -379,16 +753,24 @@ ieee80211_find_chanctx(struct ieee80211_local *local,
if (ctx->mode == IEEE80211_CHANCTX_EXCLUSIVE)
continue;
- compat = cfg80211_chandef_compatible(&ctx->conf.def, chandef);
+ compat = ieee80211_chanctx_compatible(ctx, chanreq, &tmp);
if (!compat)
continue;
- compat = ieee80211_chanctx_reserved_chandef(local, ctx,
- compat);
+ compat = ieee80211_chanctx_reserved_chanreq(local, ctx,
+ compat, &tmp);
if (!compat)
continue;
- ieee80211_change_chanctx(local, ctx, compat);
+ /*
+ * Reserve the chanctx temporarily, as the driver might change
+ * active links during callbacks we make into it below and/or
+ * later during assignment, which could (otherwise) cause the
+ * context to actually be removed.
+ */
+ link->reserved_chanctx = ctx;
+
+ ieee80211_change_chanctx(local, ctx, ctx, compat);
return ctx;
}
@@ -396,20 +778,29 @@ ieee80211_find_chanctx(struct ieee80211_local *local,
return NULL;
}
-bool ieee80211_is_radar_required(struct ieee80211_local *local)
+bool ieee80211_is_radar_required(struct ieee80211_local *local,
+ struct cfg80211_scan_request *req)
{
- struct ieee80211_sub_if_data *sdata;
+ struct wiphy *wiphy = local->hw.wiphy;
+ struct ieee80211_link_data *link;
+ struct ieee80211_channel *chan;
+ int radio_idx;
- lockdep_assert_held(&local->mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
- rcu_read_lock();
- list_for_each_entry_rcu(sdata, &local->interfaces, list) {
- if (sdata->radar_required) {
- rcu_read_unlock();
- return true;
+ if (!req)
+ return false;
+
+ for_each_sdata_link(local, link) {
+ if (link->radar_required) {
+ chan = link->conf->chanreq.oper.chan;
+ radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan);
+
+ if (ieee80211_is_radio_idx_in_scan_req(wiphy, req,
+ radio_idx))
+ return true;
}
}
- rcu_read_unlock();
return false;
}
@@ -418,51 +809,41 @@ static bool
ieee80211_chanctx_radar_required(struct ieee80211_local *local,
struct ieee80211_chanctx *ctx)
{
- struct ieee80211_chanctx_conf *conf = &ctx->conf;
- struct ieee80211_sub_if_data *sdata;
- bool required = false;
+ struct ieee80211_chanctx_user_iter iter;
- lockdep_assert_held(&local->chanctx_mtx);
- lockdep_assert_held(&local->mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
- rcu_read_lock();
- list_for_each_entry_rcu(sdata, &local->interfaces, list) {
- if (!ieee80211_sdata_running(sdata))
- continue;
- if (rcu_access_pointer(sdata->vif.chanctx_conf) != conf)
- continue;
- if (!sdata->radar_required)
- continue;
-
- required = true;
- break;
+ for_each_chanctx_user_assigned(local, ctx, &iter) {
+ if (iter.radar_required)
+ return true;
}
- rcu_read_unlock();
- return required;
+ return false;
}
static struct ieee80211_chanctx *
ieee80211_alloc_chanctx(struct ieee80211_local *local,
- const struct cfg80211_chan_def *chandef,
- enum ieee80211_chanctx_mode mode)
+ const struct ieee80211_chan_req *chanreq,
+ enum ieee80211_chanctx_mode mode,
+ int radio_idx)
{
struct ieee80211_chanctx *ctx;
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
ctx = kzalloc(sizeof(*ctx) + local->hw.chanctx_data_size, GFP_KERNEL);
if (!ctx)
return NULL;
- INIT_LIST_HEAD(&ctx->assigned_vifs);
- INIT_LIST_HEAD(&ctx->reserved_vifs);
- ctx->conf.def = *chandef;
+ ctx->conf.def = chanreq->oper;
+ ctx->conf.ap = chanreq->ap;
ctx->conf.rx_chains_static = 1;
ctx->conf.rx_chains_dynamic = 1;
ctx->mode = mode;
ctx->conf.radar_enabled = false;
- ieee80211_recalc_chanctx_min_def(local, ctx);
+ ctx->conf.radio_idx = radio_idx;
+ ctx->radar_detected = false;
+ __ieee80211_recalc_chanctx_min_def(local, ctx, NULL, false);
return ctx;
}
@@ -473,26 +854,19 @@ static int ieee80211_add_chanctx(struct ieee80211_local *local,
u32 changed;
int err;
- lockdep_assert_held(&local->mtx);
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
- if (!local->use_chanctx)
- local->hw.conf.radar_enabled = ctx->conf.radar_enabled;
+ ieee80211_add_wbrf(local, &ctx->conf.def);
/* turn idle off *before* setting channel -- some drivers need that */
changed = ieee80211_idle_off(local);
if (changed)
- ieee80211_hw_config(local, changed);
+ ieee80211_hw_config(local, -1, changed);
- if (!local->use_chanctx) {
- local->_oper_chandef = ctx->conf.def;
- ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
- } else {
- err = drv_add_chanctx(local, ctx);
- if (err) {
- ieee80211_recalc_idle(local);
- return err;
- }
+ err = drv_add_chanctx(local, ctx);
+ if (err) {
+ ieee80211_recalc_idle(local);
+ return err;
}
return 0;
@@ -500,65 +874,56 @@ static int ieee80211_add_chanctx(struct ieee80211_local *local,
static struct ieee80211_chanctx *
ieee80211_new_chanctx(struct ieee80211_local *local,
- const struct cfg80211_chan_def *chandef,
- enum ieee80211_chanctx_mode mode)
+ const struct ieee80211_chan_req *chanreq,
+ enum ieee80211_chanctx_mode mode,
+ bool assign_on_failure,
+ int radio_idx)
{
struct ieee80211_chanctx *ctx;
int err;
- lockdep_assert_held(&local->mtx);
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
- ctx = ieee80211_alloc_chanctx(local, chandef, mode);
+ ctx = ieee80211_alloc_chanctx(local, chanreq, mode, radio_idx);
if (!ctx)
return ERR_PTR(-ENOMEM);
err = ieee80211_add_chanctx(local, ctx);
- if (err) {
+ if (!assign_on_failure && err) {
kfree(ctx);
return ERR_PTR(err);
}
+ /* We ignored a driver error, see _ieee80211_set_active_links */
+ WARN_ON_ONCE(err && !local->in_reconfig);
list_add_rcu(&ctx->list, &local->chanctx_list);
return ctx;
}
static void ieee80211_del_chanctx(struct ieee80211_local *local,
- struct ieee80211_chanctx *ctx)
+ struct ieee80211_chanctx *ctx,
+ bool skip_idle_recalc)
{
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
- if (!local->use_chanctx) {
- struct cfg80211_chan_def *chandef = &local->_oper_chandef;
- chandef->width = NL80211_CHAN_WIDTH_20_NOHT;
- chandef->center_freq1 = chandef->chan->center_freq;
- chandef->center_freq2 = 0;
+ drv_remove_chanctx(local, ctx);
- /* NOTE: Disabling radar is only valid here for
- * single channel context. To be sure, check it ...
- */
- WARN_ON(local->hw.conf.radar_enabled &&
- !list_empty(&local->chanctx_list));
-
- local->hw.conf.radar_enabled = false;
+ if (!skip_idle_recalc)
+ ieee80211_recalc_idle(local);
- ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
- } else {
- drv_remove_chanctx(local, ctx);
- }
-
- ieee80211_recalc_idle(local);
+ ieee80211_remove_wbrf(local, &ctx->conf.def);
}
static void ieee80211_free_chanctx(struct ieee80211_local *local,
- struct ieee80211_chanctx *ctx)
+ struct ieee80211_chanctx *ctx,
+ bool skip_idle_recalc)
{
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
WARN_ON_ONCE(ieee80211_chanctx_refcount(local, ctx) != 0);
list_del_rcu(&ctx->list);
- ieee80211_del_chanctx(local, ctx);
+ ieee80211_del_chanctx(local, ctx, skip_idle_recalc);
kfree_rcu(ctx, rcu_head);
}
@@ -566,50 +931,57 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local,
struct ieee80211_chanctx *ctx)
{
struct ieee80211_chanctx_conf *conf = &ctx->conf;
- struct ieee80211_sub_if_data *sdata;
- const struct cfg80211_chan_def *compat = NULL;
+ const struct ieee80211_chan_req *compat = NULL;
+ struct ieee80211_chanctx_user_iter iter;
+ struct ieee80211_chan_req tmp;
struct sta_info *sta;
- lockdep_assert_held(&local->chanctx_mtx);
-
- rcu_read_lock();
- list_for_each_entry_rcu(sdata, &local->interfaces, list) {
-
- if (!ieee80211_sdata_running(sdata))
- continue;
- if (rcu_access_pointer(sdata->vif.chanctx_conf) != conf)
- continue;
- if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
- continue;
+ lockdep_assert_wiphy(local->hw.wiphy);
+ for_each_chanctx_user_assigned(local, ctx, &iter) {
if (!compat)
- compat = &sdata->vif.bss_conf.chandef;
+ compat = iter.chanreq;
- compat = cfg80211_chandef_compatible(
- &sdata->vif.bss_conf.chandef, compat);
+ compat = ieee80211_chanreq_compatible(iter.chanreq,
+ compat, &tmp);
if (WARN_ON_ONCE(!compat))
- break;
+ return;
}
+ if (WARN_ON_ONCE(!compat))
+ return;
+
/* TDLS peers can sometimes affect the chandef width */
- list_for_each_entry_rcu(sta, &local->sta_list, list) {
+ list_for_each_entry(sta, &local->sta_list, list) {
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct ieee80211_chan_req tdls_chanreq = {};
+ struct ieee80211_link_data *link;
+ int tdls_link_id;
+
if (!sta->uploaded ||
!test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW) ||
!test_sta_flag(sta, WLAN_STA_AUTHORIZED) ||
!sta->tdls_chandef.chan)
continue;
- compat = cfg80211_chandef_compatible(&sta->tdls_chandef,
- compat);
+ tdls_link_id = ieee80211_tdls_sta_link_id(sta);
+ link = sdata_dereference(sdata->link[tdls_link_id], sdata);
+ if (!link)
+ continue;
+
+ if (rcu_access_pointer(link->conf->chanctx_conf) != conf)
+ continue;
+
+ tdls_chanreq.oper = sta->tdls_chandef;
+
+ /* note this always fills and returns &tmp if compat */
+ compat = ieee80211_chanreq_compatible(&tdls_chanreq,
+ compat, &tmp);
if (WARN_ON_ONCE(!compat))
- break;
+ return;
}
- rcu_read_unlock();
- if (!compat)
- return;
-
- ieee80211_change_chanctx(local, ctx, compat);
+ ieee80211_change_chanctx(local, ctx, ctx, compat);
}
static void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local,
@@ -617,9 +989,7 @@ static void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local,
{
bool radar_enabled;
- lockdep_assert_held(&local->chanctx_mtx);
- /* for ieee80211_is_radar_required */
- lockdep_assert_held(&local->mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
radar_enabled = ieee80211_chanctx_radar_required(local, chanctx);
@@ -628,50 +998,51 @@ static void ieee80211_recalc_radar_chanctx(struct ieee80211_local *local,
chanctx->conf.radar_enabled = radar_enabled;
- if (!local->use_chanctx) {
- local->hw.conf.radar_enabled = chanctx->conf.radar_enabled;
- ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL);
- }
-
drv_change_chanctx(local, chanctx, IEEE80211_CHANCTX_CHANGE_RADAR);
}
-static int ieee80211_assign_vif_chanctx(struct ieee80211_sub_if_data *sdata,
- struct ieee80211_chanctx *new_ctx)
+static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link,
+ struct ieee80211_chanctx *new_ctx,
+ bool assign_on_failure)
{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
struct ieee80211_local *local = sdata->local;
struct ieee80211_chanctx_conf *conf;
struct ieee80211_chanctx *curr_ctx = NULL;
- int ret = 0;
+ bool new_idle;
+ int ret;
if (WARN_ON(sdata->vif.type == NL80211_IFTYPE_NAN))
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
- conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
- lockdep_is_held(&local->chanctx_mtx));
+ conf = rcu_dereference_protected(link->conf->chanctx_conf,
+ lockdep_is_held(&local->hw.wiphy->mtx));
- if (conf) {
+ if (conf && !local->in_reconfig) {
curr_ctx = container_of(conf, struct ieee80211_chanctx, conf);
- drv_unassign_vif_chanctx(local, sdata, curr_ctx);
+ drv_unassign_vif_chanctx(local, sdata, link->conf, curr_ctx);
conf = NULL;
- list_del(&sdata->assigned_chanctx_list);
}
if (new_ctx) {
- ret = drv_assign_vif_chanctx(local, sdata, new_ctx);
- if (ret)
- goto out;
+ /* recalc considering the link we'll use it for now */
+ _ieee80211_recalc_chanctx_min_def(local, new_ctx, link, false);
- conf = &new_ctx->conf;
- list_add(&sdata->assigned_chanctx_list,
- &new_ctx->assigned_vifs);
- }
+ ret = drv_assign_vif_chanctx(local, sdata, link->conf, new_ctx);
+ if (assign_on_failure || !ret) {
+ /* Need to continue, see _ieee80211_set_active_links */
+ WARN_ON_ONCE(ret && !local->in_reconfig);
+ ret = 0;
-out:
- rcu_assign_pointer(sdata->vif.chanctx_conf, conf);
+ /* succeeded, so commit it to the data structures */
+ conf = &new_ctx->conf;
+ }
+ } else {
+ ret = 0;
+ }
- sdata->vif.bss_conf.idle = !conf;
+ rcu_assign_pointer(link->conf->chanctx_conf, conf);
if (curr_ctx && ieee80211_chanctx_num_assigned(local, curr_ctx) > 0) {
ieee80211_recalc_chanctx_chantype(local, curr_ctx);
@@ -681,14 +1052,31 @@ out:
}
if (new_ctx && ieee80211_chanctx_num_assigned(local, new_ctx) > 0) {
- ieee80211_recalc_txpower(sdata, false);
+ ieee80211_recalc_txpower(link, false);
ieee80211_recalc_chanctx_min_def(local, new_ctx);
}
- if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
- sdata->vif.type != NL80211_IFTYPE_MONITOR)
- ieee80211_bss_info_change_notify(sdata,
- BSS_CHANGED_IDLE);
+ if (conf) {
+ new_idle = false;
+ } else {
+ struct ieee80211_link_data *tmp;
+
+ new_idle = true;
+ for_each_sdata_link(local, tmp) {
+ if (rcu_access_pointer(tmp->conf->chanctx_conf)) {
+ new_idle = false;
+ break;
+ }
+ }
+ }
+
+ if (new_idle != sdata->vif.cfg.idle) {
+ sdata->vif.cfg.idle = new_idle;
+
+ if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
+ sdata->vif.type != NL80211_IFTYPE_MONITOR)
+ ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_IDLE);
+ }
ieee80211_check_fast_xmit_iface(sdata);
@@ -698,57 +1086,53 @@ out:
void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
struct ieee80211_chanctx *chanctx)
{
+ struct ieee80211_chanctx_user_iter iter;
struct ieee80211_sub_if_data *sdata;
u8 rx_chains_static, rx_chains_dynamic;
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
rx_chains_static = 1;
rx_chains_dynamic = 1;
- rcu_read_lock();
- list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ for_each_chanctx_user_assigned(local, chanctx, &iter) {
u8 needed_static, needed_dynamic;
- if (!ieee80211_sdata_running(sdata))
- continue;
-
- if (rcu_access_pointer(sdata->vif.chanctx_conf) !=
- &chanctx->conf)
- continue;
-
- switch (sdata->vif.type) {
- case NL80211_IFTYPE_P2P_DEVICE:
- case NL80211_IFTYPE_NAN:
- continue;
+ switch (iter.iftype) {
case NL80211_IFTYPE_STATION:
- if (!sdata->u.mgd.associated)
+ if (!iter.sdata->u.mgd.associated)
+ continue;
+ break;
+ case NL80211_IFTYPE_MONITOR:
+ if (!ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR))
continue;
break;
- case NL80211_IFTYPE_AP_VLAN:
- continue;
case NL80211_IFTYPE_AP:
case NL80211_IFTYPE_ADHOC:
- case NL80211_IFTYPE_WDS:
case NL80211_IFTYPE_MESH_POINT:
case NL80211_IFTYPE_OCB:
break;
default:
- WARN_ON_ONCE(1);
+ continue;
+ }
+
+ if (iter.iftype == NL80211_IFTYPE_MONITOR) {
+ rx_chains_dynamic = rx_chains_static = local->rx_chains;
+ break;
}
- switch (sdata->smps_mode) {
+ switch (iter.link->smps_mode) {
default:
WARN_ONCE(1, "Invalid SMPS mode %d\n",
- sdata->smps_mode);
- /* fall through */
+ iter.link->smps_mode);
+ fallthrough;
case IEEE80211_SMPS_OFF:
- needed_static = sdata->needed_rx_chains;
- needed_dynamic = sdata->needed_rx_chains;
+ needed_static = iter.link->needed_rx_chains;
+ needed_dynamic = iter.link->needed_rx_chains;
break;
case IEEE80211_SMPS_DYNAMIC:
needed_static = 1;
- needed_dynamic = sdata->needed_rx_chains;
+ needed_dynamic = iter.link->needed_rx_chains;
break;
case IEEE80211_SMPS_STATIC:
needed_static = 1;
@@ -761,23 +1145,11 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
}
/* Disable SMPS for the monitor interface */
- sdata = rcu_dereference(local->monitor_sdata);
+ sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata);
if (sdata &&
- rcu_access_pointer(sdata->vif.chanctx_conf) == &chanctx->conf)
+ rcu_access_pointer(sdata->vif.bss_conf.chanctx_conf) == &chanctx->conf)
rx_chains_dynamic = rx_chains_static = local->rx_chains;
- rcu_read_unlock();
-
- if (!local->use_chanctx) {
- if (rx_chains_static > 1)
- local->smps_mode = IEEE80211_SMPS_OFF;
- else if (rx_chains_dynamic > 1)
- local->smps_mode = IEEE80211_SMPS_DYNAMIC;
- else
- local->smps_mode = IEEE80211_SMPS_STATIC;
- ieee80211_hw_config(local, 0);
- }
-
if (rx_chains_static == chanctx->conf.rx_chains_static &&
rx_chains_dynamic == chanctx->conf.rx_chains_dynamic)
return;
@@ -788,9 +1160,12 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local,
}
static void
-__ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
- bool clear)
+__ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link,
+ bool clear)
{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+ unsigned int link_id = link->link_id;
+ struct ieee80211_bss_conf *link_conf = link->conf;
struct ieee80211_local *local __maybe_unused = sdata->local;
struct ieee80211_sub_if_data *vlan;
struct ieee80211_chanctx_conf *conf;
@@ -798,7 +1173,7 @@ __ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP))
return;
- lockdep_assert_held(&local->mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
/* Check that conf exists, even when clearing this function
* must be called with the AP's channel context still there
@@ -806,45 +1181,51 @@ __ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
* channel context pointer for a while, possibly pointing
* to a channel context that has already been freed.
*/
- conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
- lockdep_is_held(&local->chanctx_mtx));
+ conf = rcu_dereference_protected(link_conf->chanctx_conf,
+ lockdep_is_held(&local->hw.wiphy->mtx));
WARN_ON(!conf);
if (clear)
conf = NULL;
- list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
- rcu_assign_pointer(vlan->vif.chanctx_conf, conf);
+ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) {
+ struct ieee80211_bss_conf *vlan_conf;
+
+ vlan_conf = wiphy_dereference(local->hw.wiphy,
+ vlan->vif.link_conf[link_id]);
+ if (WARN_ON(!vlan_conf))
+ continue;
+
+ rcu_assign_pointer(vlan_conf->chanctx_conf, conf);
+ }
}
-void ieee80211_vif_copy_chanctx_to_vlans(struct ieee80211_sub_if_data *sdata,
- bool clear)
+void ieee80211_link_copy_chanctx_to_vlans(struct ieee80211_link_data *link,
+ bool clear)
{
- struct ieee80211_local *local = sdata->local;
+ struct ieee80211_local *local = link->sdata->local;
- mutex_lock(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
- __ieee80211_vif_copy_chanctx_to_vlans(sdata, clear);
-
- mutex_unlock(&local->chanctx_mtx);
+ __ieee80211_link_copy_chanctx_to_vlans(link, clear);
}
-int ieee80211_vif_unreserve_chanctx(struct ieee80211_sub_if_data *sdata)
+void ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link)
{
- struct ieee80211_chanctx *ctx = sdata->reserved_chanctx;
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+ struct ieee80211_chanctx *ctx = link->reserved_chanctx;
- lockdep_assert_held(&sdata->local->chanctx_mtx);
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
if (WARN_ON(!ctx))
- return -EINVAL;
+ return;
- list_del(&sdata->reserved_chanctx_list);
- sdata->reserved_chanctx = NULL;
+ link->reserved_chanctx = NULL;
if (ieee80211_chanctx_refcount(sdata->local, ctx) == 0) {
if (ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER) {
if (WARN_ON(!ctx->replace_ctx))
- return -EINVAL;
+ return;
WARN_ON(ctx->replace_ctx->replace_state !=
IEEE80211_CHANCTX_WILL_BE_REPLACED);
@@ -857,123 +1238,170 @@ int ieee80211_vif_unreserve_chanctx(struct ieee80211_sub_if_data *sdata)
list_del_rcu(&ctx->list);
kfree_rcu(ctx, rcu_head);
} else {
- ieee80211_free_chanctx(sdata->local, ctx);
+ ieee80211_free_chanctx(sdata->local, ctx, false);
}
}
-
- return 0;
}
-int ieee80211_vif_reserve_chanctx(struct ieee80211_sub_if_data *sdata,
- const struct cfg80211_chan_def *chandef,
- enum ieee80211_chanctx_mode mode,
- bool radar_required)
+static struct ieee80211_chanctx *
+ieee80211_replace_chanctx(struct ieee80211_local *local,
+ const struct ieee80211_chan_req *chanreq,
+ enum ieee80211_chanctx_mode mode,
+ struct ieee80211_chanctx *curr_ctx)
{
- struct ieee80211_local *local = sdata->local;
- struct ieee80211_chanctx *new_ctx, *curr_ctx, *ctx;
+ struct ieee80211_chanctx *new_ctx, *ctx;
+ struct wiphy *wiphy = local->hw.wiphy;
+ const struct wiphy_radio *radio;
- lockdep_assert_held(&local->chanctx_mtx);
+ if (!curr_ctx ||
+ curr_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED ||
+ ieee80211_chanctx_num_reserved(local, curr_ctx) != 0) {
+ /*
+ * Another link already requested this context for a
+ * reservation. Find another one hoping all links assigned
+ * to it will also switch soon enough.
+ *
+ * TODO: This needs a little more work as some cases
+ * (more than 2 chanctx capable devices) may fail which could
+ * otherwise succeed provided some channel context juggling was
+ * performed.
+ *
+ * Consider ctx1..3, link1..6, each ctx has 2 links. link1 and
+ * link2 from ctx1 request new different chandefs starting 2
+ * in-place reservations with ctx4 and ctx5 replacing ctx1 and
+ * ctx2 respectively. Next link5 and link6 from ctx3 reserve
+ * ctx4. If link3 and link4 remain on ctx2 as they are then this
+ * fails unless `replace_ctx` from ctx5 is replaced with ctx3.
+ */
+ list_for_each_entry(ctx, &local->chanctx_list, list) {
+ if (ctx->replace_state !=
+ IEEE80211_CHANCTX_REPLACE_NONE)
+ continue;
- curr_ctx = ieee80211_vif_get_chanctx(sdata);
- if (curr_ctx && local->use_chanctx && !local->ops->switch_vif_chanctx)
- return -ENOTSUPP;
+ if (ieee80211_chanctx_num_reserved(local, ctx) != 0)
+ continue;
- new_ctx = ieee80211_find_reservation_chanctx(local, chandef, mode);
- if (!new_ctx) {
- if (ieee80211_can_create_new_chanctx(local)) {
- new_ctx = ieee80211_new_chanctx(local, chandef, mode);
- if (IS_ERR(new_ctx))
- return PTR_ERR(new_ctx);
- } else {
- if (!curr_ctx ||
- (curr_ctx->replace_state ==
- IEEE80211_CHANCTX_WILL_BE_REPLACED) ||
- !list_empty(&curr_ctx->reserved_vifs)) {
- /*
- * Another vif already requested this context
- * for a reservation. Find another one hoping
- * all vifs assigned to it will also switch
- * soon enough.
- *
- * TODO: This needs a little more work as some
- * cases (more than 2 chanctx capable devices)
- * may fail which could otherwise succeed
- * provided some channel context juggling was
- * performed.
- *
- * Consider ctx1..3, vif1..6, each ctx has 2
- * vifs. vif1 and vif2 from ctx1 request new
- * different chandefs starting 2 in-place
- * reserations with ctx4 and ctx5 replacing
- * ctx1 and ctx2 respectively. Next vif5 and
- * vif6 from ctx3 reserve ctx4. If vif3 and
- * vif4 remain on ctx2 as they are then this
- * fails unless `replace_ctx` from ctx5 is
- * replaced with ctx3.
- */
- list_for_each_entry(ctx, &local->chanctx_list,
- list) {
- if (ctx->replace_state !=
- IEEE80211_CHANCTX_REPLACE_NONE)
- continue;
-
- if (!list_empty(&ctx->reserved_vifs))
- continue;
-
- curr_ctx = ctx;
- break;
- }
+ if (ctx->conf.radio_idx >= 0) {
+ radio = &wiphy->radio[ctx->conf.radio_idx];
+ if (!cfg80211_radio_chandef_valid(radio, &chanreq->oper))
+ continue;
}
- /*
- * If that's true then all available contexts already
- * have reservations and cannot be used.
- */
- if (!curr_ctx ||
- (curr_ctx->replace_state ==
- IEEE80211_CHANCTX_WILL_BE_REPLACED) ||
- !list_empty(&curr_ctx->reserved_vifs))
- return -EBUSY;
+ curr_ctx = ctx;
+ break;
+ }
+ }
- new_ctx = ieee80211_alloc_chanctx(local, chandef, mode);
- if (!new_ctx)
- return -ENOMEM;
+ /*
+ * If that's true then all available contexts already have reservations
+ * and cannot be used.
+ */
+ if (!curr_ctx ||
+ curr_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED ||
+ ieee80211_chanctx_num_reserved(local, curr_ctx) != 0)
+ return ERR_PTR(-EBUSY);
- new_ctx->replace_ctx = curr_ctx;
- new_ctx->replace_state =
- IEEE80211_CHANCTX_REPLACES_OTHER;
+ new_ctx = ieee80211_alloc_chanctx(local, chanreq, mode, -1);
+ if (!new_ctx)
+ return ERR_PTR(-ENOMEM);
- curr_ctx->replace_ctx = new_ctx;
- curr_ctx->replace_state =
- IEEE80211_CHANCTX_WILL_BE_REPLACED;
+ new_ctx->replace_ctx = curr_ctx;
+ new_ctx->replace_state = IEEE80211_CHANCTX_REPLACES_OTHER;
- list_add_rcu(&new_ctx->list, &local->chanctx_list);
- }
+ curr_ctx->replace_ctx = new_ctx;
+ curr_ctx->replace_state = IEEE80211_CHANCTX_WILL_BE_REPLACED;
+
+ list_add_rcu(&new_ctx->list, &local->chanctx_list);
+
+ return new_ctx;
+}
+
+static bool
+ieee80211_find_available_radio(struct ieee80211_local *local,
+ const struct ieee80211_chan_req *chanreq,
+ u32 radio_mask, int *radio_idx)
+{
+ struct wiphy *wiphy = local->hw.wiphy;
+ const struct wiphy_radio *radio;
+ int i;
+
+ *radio_idx = -1;
+ if (!wiphy->n_radio)
+ return true;
+
+ for (i = 0; i < wiphy->n_radio; i++) {
+ if (!(radio_mask & BIT(i)))
+ continue;
+
+ radio = &wiphy->radio[i];
+ if (!cfg80211_radio_chandef_valid(radio, &chanreq->oper))
+ continue;
+
+ if (!ieee80211_can_create_new_chanctx(local, i))
+ continue;
+
+ *radio_idx = i;
+ return true;
}
- list_add(&sdata->reserved_chanctx_list, &new_ctx->reserved_vifs);
- sdata->reserved_chanctx = new_ctx;
- sdata->reserved_chandef = *chandef;
- sdata->reserved_radar_required = radar_required;
- sdata->reserved_ready = false;
+ return false;
+}
+
+int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link,
+ const struct ieee80211_chan_req *chanreq,
+ enum ieee80211_chanctx_mode mode,
+ bool radar_required)
+{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_chanctx *new_ctx, *curr_ctx;
+ int radio_idx;
+
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ curr_ctx = ieee80211_link_get_chanctx(link);
+ if (curr_ctx && !local->ops->switch_vif_chanctx)
+ return -EOPNOTSUPP;
+
+ new_ctx = ieee80211_find_reservation_chanctx(local, chanreq, mode);
+ if (!new_ctx) {
+ if (ieee80211_can_create_new_chanctx(local, -1) &&
+ ieee80211_find_available_radio(local, chanreq,
+ sdata->wdev.radio_mask,
+ &radio_idx))
+ new_ctx = ieee80211_new_chanctx(local, chanreq, mode,
+ false, radio_idx);
+ else
+ new_ctx = ieee80211_replace_chanctx(local, chanreq,
+ mode, curr_ctx);
+ if (IS_ERR(new_ctx))
+ return PTR_ERR(new_ctx);
+ }
+
+ link->reserved_chanctx = new_ctx;
+ link->reserved = *chanreq;
+ link->reserved_radar_required = radar_required;
+ link->reserved_ready = false;
return 0;
}
static void
-ieee80211_vif_chanctx_reservation_complete(struct ieee80211_sub_if_data *sdata)
+ieee80211_link_chanctx_reservation_complete(struct ieee80211_link_data *link)
{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+
switch (sdata->vif.type) {
case NL80211_IFTYPE_ADHOC:
case NL80211_IFTYPE_AP:
case NL80211_IFTYPE_MESH_POINT:
case NL80211_IFTYPE_OCB:
- ieee80211_queue_work(&sdata->local->hw,
- &sdata->csa_finalize_work);
+ wiphy_work_queue(sdata->local->hw.wiphy,
+ &link->csa.finalize_work);
break;
case NL80211_IFTYPE_STATION:
- ieee80211_queue_work(&sdata->local->hw,
- &sdata->u.mgd.chswitch_work);
+ wiphy_hrtimer_work_queue(sdata->local->hw.wiphy,
+ &link->u.mgd.csa.switch_work, 0);
break;
case NL80211_IFTYPE_UNSPECIFIED:
case NL80211_IFTYPE_AP_VLAN:
@@ -990,37 +1418,49 @@ ieee80211_vif_chanctx_reservation_complete(struct ieee80211_sub_if_data *sdata)
}
static void
-ieee80211_vif_update_chandef(struct ieee80211_sub_if_data *sdata,
- const struct cfg80211_chan_def *chandef)
+ieee80211_link_update_chanreq(struct ieee80211_link_data *link,
+ const struct ieee80211_chan_req *chanreq)
{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+ unsigned int link_id = link->link_id;
struct ieee80211_sub_if_data *vlan;
- sdata->vif.bss_conf.chandef = *chandef;
+ link->conf->chanreq = *chanreq;
if (sdata->vif.type != NL80211_IFTYPE_AP)
return;
- list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list)
- vlan->vif.bss_conf.chandef = *chandef;
+ list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) {
+ struct ieee80211_bss_conf *vlan_conf;
+
+ vlan_conf = wiphy_dereference(sdata->local->hw.wiphy,
+ vlan->vif.link_conf[link_id]);
+ if (WARN_ON(!vlan_conf))
+ continue;
+
+ vlan_conf->chanreq = *chanreq;
+ }
}
static int
-ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata)
+ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link)
{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+ struct ieee80211_bss_conf *link_conf = link->conf;
struct ieee80211_local *local = sdata->local;
struct ieee80211_vif_chanctx_switch vif_chsw[1] = {};
struct ieee80211_chanctx *old_ctx, *new_ctx;
- const struct cfg80211_chan_def *chandef;
- u32 changed = 0;
+ const struct ieee80211_chan_req *chanreq;
+ struct ieee80211_chan_req tmp;
+ u64 changed = 0;
int err;
- lockdep_assert_held(&local->mtx);
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
- new_ctx = sdata->reserved_chanctx;
- old_ctx = ieee80211_vif_get_chanctx(sdata);
+ new_ctx = link->reserved_chanctx;
+ old_ctx = ieee80211_link_get_chanctx(link);
- if (WARN_ON(!sdata->reserved_ready))
+ if (WARN_ON(!link->reserved_ready))
return -EBUSY;
if (WARN_ON(!new_ctx))
@@ -1033,69 +1473,72 @@ ieee80211_vif_use_reserved_reassign(struct ieee80211_sub_if_data *sdata)
IEEE80211_CHANCTX_REPLACES_OTHER))
return -EINVAL;
- chandef = ieee80211_chanctx_non_reserved_chandef(local, new_ctx,
- &sdata->reserved_chandef);
- if (WARN_ON(!chandef))
+ chanreq = ieee80211_chanctx_non_reserved_chandef(local, new_ctx,
+ &link->reserved,
+ &tmp);
+ if (WARN_ON(!chanreq))
return -EINVAL;
- ieee80211_change_chanctx(local, new_ctx, chandef);
+ if (link_conf->chanreq.oper.width != link->reserved.oper.width)
+ changed = BSS_CHANGED_BANDWIDTH;
+
+ ieee80211_link_update_chanreq(link, &link->reserved);
+
+ _ieee80211_change_chanctx(local, new_ctx, old_ctx, chanreq, link);
vif_chsw[0].vif = &sdata->vif;
vif_chsw[0].old_ctx = &old_ctx->conf;
vif_chsw[0].new_ctx = &new_ctx->conf;
+ vif_chsw[0].link_conf = link->conf;
- list_del(&sdata->reserved_chanctx_list);
- sdata->reserved_chanctx = NULL;
+ link->reserved_chanctx = NULL;
err = drv_switch_vif_chanctx(local, vif_chsw, 1,
CHANCTX_SWMODE_REASSIGN_VIF);
if (err) {
if (ieee80211_chanctx_refcount(local, new_ctx) == 0)
- ieee80211_free_chanctx(local, new_ctx);
+ ieee80211_free_chanctx(local, new_ctx, false);
goto out;
}
- list_move(&sdata->assigned_chanctx_list, &new_ctx->assigned_vifs);
- rcu_assign_pointer(sdata->vif.chanctx_conf, &new_ctx->conf);
+ link->radar_required = link->reserved_radar_required;
+ rcu_assign_pointer(link_conf->chanctx_conf, &new_ctx->conf);
if (sdata->vif.type == NL80211_IFTYPE_AP)
- __ieee80211_vif_copy_chanctx_to_vlans(sdata, false);
+ __ieee80211_link_copy_chanctx_to_vlans(link, false);
ieee80211_check_fast_xmit_iface(sdata);
if (ieee80211_chanctx_refcount(local, old_ctx) == 0)
- ieee80211_free_chanctx(local, old_ctx);
-
- if (sdata->vif.bss_conf.chandef.width != sdata->reserved_chandef.width)
- changed = BSS_CHANGED_BANDWIDTH;
-
- ieee80211_vif_update_chandef(sdata, &sdata->reserved_chandef);
+ ieee80211_free_chanctx(local, old_ctx, false);
+ ieee80211_recalc_chanctx_min_def(local, new_ctx);
ieee80211_recalc_smps_chanctx(local, new_ctx);
ieee80211_recalc_radar_chanctx(local, new_ctx);
- ieee80211_recalc_chanctx_min_def(local, new_ctx);
if (changed)
- ieee80211_bss_info_change_notify(sdata, changed);
+ ieee80211_link_info_change_notify(sdata, link, changed);
out:
- ieee80211_vif_chanctx_reservation_complete(sdata);
+ ieee80211_link_chanctx_reservation_complete(link);
return err;
}
static int
-ieee80211_vif_use_reserved_assign(struct ieee80211_sub_if_data *sdata)
+ieee80211_link_use_reserved_assign(struct ieee80211_link_data *link)
{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
struct ieee80211_local *local = sdata->local;
struct ieee80211_chanctx *old_ctx, *new_ctx;
- const struct cfg80211_chan_def *chandef;
+ const struct ieee80211_chan_req *chanreq;
+ struct ieee80211_chan_req tmp;
int err;
- old_ctx = ieee80211_vif_get_chanctx(sdata);
- new_ctx = sdata->reserved_chanctx;
+ old_ctx = ieee80211_link_get_chanctx(link);
+ new_ctx = link->reserved_chanctx;
- if (WARN_ON(!sdata->reserved_ready))
+ if (WARN_ON(!link->reserved_ready))
return -EINVAL;
if (WARN_ON(old_ctx))
@@ -1108,38 +1551,39 @@ ieee80211_vif_use_reserved_assign(struct ieee80211_sub_if_data *sdata)
IEEE80211_CHANCTX_REPLACES_OTHER))
return -EINVAL;
- chandef = ieee80211_chanctx_non_reserved_chandef(local, new_ctx,
- &sdata->reserved_chandef);
- if (WARN_ON(!chandef))
+ chanreq = ieee80211_chanctx_non_reserved_chandef(local, new_ctx,
+ &link->reserved,
+ &tmp);
+ if (WARN_ON(!chanreq))
return -EINVAL;
- ieee80211_change_chanctx(local, new_ctx, chandef);
+ ieee80211_change_chanctx(local, new_ctx, new_ctx, chanreq);
- list_del(&sdata->reserved_chanctx_list);
- sdata->reserved_chanctx = NULL;
+ link->reserved_chanctx = NULL;
- err = ieee80211_assign_vif_chanctx(sdata, new_ctx);
+ err = ieee80211_assign_link_chanctx(link, new_ctx, false);
if (err) {
if (ieee80211_chanctx_refcount(local, new_ctx) == 0)
- ieee80211_free_chanctx(local, new_ctx);
+ ieee80211_free_chanctx(local, new_ctx, false);
goto out;
}
out:
- ieee80211_vif_chanctx_reservation_complete(sdata);
+ ieee80211_link_chanctx_reservation_complete(link);
return err;
}
static bool
-ieee80211_vif_has_in_place_reservation(struct ieee80211_sub_if_data *sdata)
+ieee80211_link_has_in_place_reservation(struct ieee80211_link_data *link)
{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
struct ieee80211_chanctx *old_ctx, *new_ctx;
- lockdep_assert_held(&sdata->local->chanctx_mtx);
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
- new_ctx = sdata->reserved_chanctx;
- old_ctx = ieee80211_vif_get_chanctx(sdata);
+ new_ctx = link->reserved_chanctx;
+ old_ctx = ieee80211_link_get_chanctx(link);
if (!old_ctx)
return false;
@@ -1156,35 +1600,14 @@ ieee80211_vif_has_in_place_reservation(struct ieee80211_sub_if_data *sdata)
return true;
}
-static int ieee80211_chsw_switch_hwconf(struct ieee80211_local *local,
- struct ieee80211_chanctx *new_ctx)
-{
- const struct cfg80211_chan_def *chandef;
-
- lockdep_assert_held(&local->mtx);
- lockdep_assert_held(&local->chanctx_mtx);
-
- chandef = ieee80211_chanctx_reserved_chandef(local, new_ctx, NULL);
- if (WARN_ON(!chandef))
- return -EINVAL;
-
- local->hw.conf.radar_enabled = new_ctx->conf.radar_enabled;
- local->_oper_chandef = *chandef;
- ieee80211_hw_config(local, 0);
-
- return 0;
-}
-
static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local,
int n_vifs)
{
struct ieee80211_vif_chanctx_switch *vif_chsw;
- struct ieee80211_sub_if_data *sdata;
struct ieee80211_chanctx *ctx, *old_ctx;
int i, err;
- lockdep_assert_held(&local->mtx);
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
vif_chsw = kcalloc(n_vifs, sizeof(vif_chsw[0]), GFP_KERNEL);
if (!vif_chsw)
@@ -1192,6 +1615,8 @@ static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local,
i = 0;
list_for_each_entry(ctx, &local->chanctx_list, list) {
+ struct ieee80211_chanctx_user_iter iter;
+
if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
continue;
@@ -1200,16 +1625,15 @@ static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local,
goto out;
}
- list_for_each_entry(sdata, &ctx->reserved_vifs,
- reserved_chanctx_list) {
- if (!ieee80211_vif_has_in_place_reservation(
- sdata))
+ for_each_chanctx_user_reserved(local, ctx, &iter) {
+ if (!ieee80211_link_has_in_place_reservation(iter.link))
continue;
- old_ctx = ieee80211_vif_get_chanctx(sdata);
- vif_chsw[i].vif = &sdata->vif;
+ old_ctx = ieee80211_link_get_chanctx(iter.link);
+ vif_chsw[i].vif = &iter.sdata->vif;
vif_chsw[i].old_ctx = &old_ctx->conf;
vif_chsw[i].new_ctx = &ctx->conf;
+ vif_chsw[i].link_conf = iter.link->conf;
i++;
}
@@ -1228,17 +1652,16 @@ static int ieee80211_chsw_switch_ctxs(struct ieee80211_local *local)
struct ieee80211_chanctx *ctx;
int err;
- lockdep_assert_held(&local->mtx);
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
list_for_each_entry(ctx, &local->chanctx_list, list) {
if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
continue;
- if (!list_empty(&ctx->replace_ctx->assigned_vifs))
+ if (ieee80211_chanctx_num_assigned(local, ctx) != 0)
continue;
- ieee80211_del_chanctx(local, ctx->replace_ctx);
+ ieee80211_del_chanctx(local, ctx->replace_ctx, false);
err = ieee80211_add_chanctx(local, ctx);
if (err)
goto err;
@@ -1252,10 +1675,10 @@ err:
if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
continue;
- if (!list_empty(&ctx->replace_ctx->assigned_vifs))
+ if (ieee80211_chanctx_num_assigned(local, ctx) != 0)
continue;
- ieee80211_del_chanctx(local, ctx);
+ ieee80211_del_chanctx(local, ctx, false);
WARN_ON(ieee80211_add_chanctx(local, ctx->replace_ctx));
}
@@ -1264,14 +1687,11 @@ err:
static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
{
- struct ieee80211_sub_if_data *sdata, *sdata_tmp;
struct ieee80211_chanctx *ctx, *ctx_tmp, *old_ctx;
- struct ieee80211_chanctx *new_ctx = NULL;
int err, n_assigned, n_reserved, n_ready;
int n_ctx = 0, n_vifs_switch = 0, n_vifs_assign = 0, n_vifs_ctxless = 0;
- lockdep_assert_held(&local->mtx);
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
/*
* If there are 2 independent pairs of channel contexts performing
@@ -1290,6 +1710,8 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
*/
list_for_each_entry(ctx, &local->chanctx_list, list) {
+ struct ieee80211_chanctx_user_iter iter;
+
if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
continue;
@@ -1298,21 +1720,17 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
goto err;
}
- if (!local->use_chanctx)
- new_ctx = ctx;
-
n_ctx++;
n_assigned = 0;
n_reserved = 0;
n_ready = 0;
- list_for_each_entry(sdata, &ctx->replace_ctx->assigned_vifs,
- assigned_chanctx_list) {
+ for_each_chanctx_user_assigned(local, ctx->replace_ctx, &iter) {
n_assigned++;
- if (sdata->reserved_chanctx) {
+ if (iter.link->reserved_chanctx) {
n_reserved++;
- if (sdata->reserved_ready)
+ if (iter.link->reserved_ready)
n_ready++;
}
}
@@ -1329,13 +1747,12 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
}
ctx->conf.radar_enabled = false;
- list_for_each_entry(sdata, &ctx->reserved_vifs,
- reserved_chanctx_list) {
- if (ieee80211_vif_has_in_place_reservation(sdata) &&
- !sdata->reserved_ready)
+ for_each_chanctx_user_reserved(local, ctx, &iter) {
+ if (ieee80211_link_has_in_place_reservation(iter.link) &&
+ !iter.link->reserved_ready)
return -EAGAIN;
- old_ctx = ieee80211_vif_get_chanctx(sdata);
+ old_ctx = ieee80211_link_get_chanctx(iter.link);
if (old_ctx) {
if (old_ctx->replace_state ==
IEEE80211_CHANCTX_WILL_BE_REPLACED)
@@ -1346,7 +1763,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
n_vifs_ctxless++;
}
- if (sdata->reserved_radar_required)
+ if (iter.radar_required)
ctx->conf.radar_enabled = true;
}
}
@@ -1354,32 +1771,48 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
if (WARN_ON(n_ctx == 0) ||
WARN_ON(n_vifs_switch == 0 &&
n_vifs_assign == 0 &&
- n_vifs_ctxless == 0) ||
- WARN_ON(n_ctx > 1 && !local->use_chanctx) ||
- WARN_ON(!new_ctx && !local->use_chanctx)) {
+ n_vifs_ctxless == 0)) {
err = -EINVAL;
goto err;
}
+ /* update station rate control and min width before switch */
+ list_for_each_entry(ctx, &local->chanctx_list, list) {
+ struct ieee80211_chanctx_user_iter iter;
+
+ if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
+ continue;
+
+ if (WARN_ON(!ctx->replace_ctx)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ for_each_chanctx_user_reserved(local, ctx, &iter) {
+ if (!ieee80211_link_has_in_place_reservation(iter.link))
+ continue;
+
+ ieee80211_chan_bw_change(local,
+ ieee80211_link_get_chanctx(iter.link),
+ true, true);
+ }
+
+ _ieee80211_recalc_chanctx_min_def(local, ctx, NULL, true);
+ }
+
/*
* All necessary vifs are ready. Perform the switch now depending on
* reservations and driver capabilities.
*/
- if (local->use_chanctx) {
- if (n_vifs_switch > 0) {
- err = ieee80211_chsw_switch_vifs(local, n_vifs_switch);
- if (err)
- goto err;
- }
+ if (n_vifs_switch > 0) {
+ err = ieee80211_chsw_switch_vifs(local, n_vifs_switch);
+ if (err)
+ goto err;
+ }
- if (n_vifs_assign > 0 || n_vifs_ctxless > 0) {
- err = ieee80211_chsw_switch_ctxs(local);
- if (err)
- goto err;
- }
- } else {
- err = ieee80211_chsw_switch_hwconf(local, new_ctx);
+ if (n_vifs_assign > 0 || n_vifs_ctxless > 0) {
+ err = ieee80211_chsw_switch_ctxs(local);
if (err)
goto err;
}
@@ -1389,6 +1822,8 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
* context(s).
*/
list_for_each_entry(ctx, &local->chanctx_list, list) {
+ struct ieee80211_chanctx_user_iter iter;
+
if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
continue;
@@ -1397,33 +1832,36 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
goto err;
}
- list_for_each_entry(sdata, &ctx->reserved_vifs,
- reserved_chanctx_list) {
- u32 changed = 0;
+ for_each_chanctx_user_reserved(local, ctx, &iter) {
+ struct ieee80211_link_data *link = iter.link;
+ struct ieee80211_sub_if_data *sdata = iter.sdata;
+ struct ieee80211_bss_conf *link_conf = link->conf;
+ u64 changed = 0;
- if (!ieee80211_vif_has_in_place_reservation(sdata))
+ if (!ieee80211_link_has_in_place_reservation(link))
continue;
- rcu_assign_pointer(sdata->vif.chanctx_conf, &ctx->conf);
+ rcu_assign_pointer(link_conf->chanctx_conf,
+ &ctx->conf);
if (sdata->vif.type == NL80211_IFTYPE_AP)
- __ieee80211_vif_copy_chanctx_to_vlans(sdata,
- false);
+ __ieee80211_link_copy_chanctx_to_vlans(link,
+ false);
ieee80211_check_fast_xmit_iface(sdata);
- sdata->radar_required = sdata->reserved_radar_required;
+ link->radar_required = iter.radar_required;
- if (sdata->vif.bss_conf.chandef.width !=
- sdata->reserved_chandef.width)
+ if (link_conf->chanreq.oper.width != iter.chanreq->oper.width)
changed = BSS_CHANGED_BANDWIDTH;
- ieee80211_vif_update_chandef(sdata, &sdata->reserved_chandef);
+ ieee80211_link_update_chanreq(link, &link->reserved);
if (changed)
- ieee80211_bss_info_change_notify(sdata,
- changed);
+ ieee80211_link_info_change_notify(sdata,
+ link,
+ changed);
- ieee80211_recalc_txpower(sdata, false);
+ ieee80211_recalc_txpower(link, false);
}
ieee80211_recalc_chanctx_chantype(local, ctx);
@@ -1431,17 +1869,14 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
ieee80211_recalc_radar_chanctx(local, ctx);
ieee80211_recalc_chanctx_min_def(local, ctx);
- list_for_each_entry_safe(sdata, sdata_tmp, &ctx->reserved_vifs,
- reserved_chanctx_list) {
- if (ieee80211_vif_get_chanctx(sdata) != ctx)
+ for_each_chanctx_user_reserved(local, ctx, &iter) {
+ if (ieee80211_link_get_chanctx(iter.link) != ctx)
continue;
- list_del(&sdata->reserved_chanctx_list);
- list_move(&sdata->assigned_chanctx_list,
- &ctx->assigned_vifs);
- sdata->reserved_chanctx = NULL;
+ iter.link->reserved_chanctx = NULL;
- ieee80211_vif_chanctx_reservation_complete(sdata);
+ ieee80211_link_chanctx_reservation_complete(iter.link);
+ ieee80211_chan_bw_change(local, ctx, false, false);
}
/*
@@ -1451,31 +1886,27 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
* reservation for originally requested interface has already
* succeeded at this point.
*/
- list_for_each_entry_safe(sdata, sdata_tmp, &ctx->reserved_vifs,
- reserved_chanctx_list) {
- if (WARN_ON(ieee80211_vif_has_in_place_reservation(
- sdata)))
- continue;
+ for_each_chanctx_user_reserved(local, ctx, &iter) {
+ struct ieee80211_link_data *link = iter.link;
- if (WARN_ON(sdata->reserved_chanctx != ctx))
+ if (WARN_ON(ieee80211_link_has_in_place_reservation(link)))
continue;
- if (!sdata->reserved_ready)
+ if (!link->reserved_ready)
continue;
- if (ieee80211_vif_get_chanctx(sdata))
- err = ieee80211_vif_use_reserved_reassign(
- sdata);
+ if (ieee80211_link_get_chanctx(link))
+ err = ieee80211_link_use_reserved_reassign(link);
else
- err = ieee80211_vif_use_reserved_assign(sdata);
+ err = ieee80211_link_use_reserved_assign(link);
if (err) {
- sdata_info(sdata,
- "failed to finalize (re-)assign reservation (err=%d)\n",
- err);
- ieee80211_vif_unreserve_chanctx(sdata);
+ link_info(link,
+ "failed to finalize (re-)assign reservation (err=%d)\n",
+ err);
+ ieee80211_link_unreserve_chanctx(link);
cfg80211_stop_iface(local->hw.wiphy,
- &sdata->wdev,
+ &link->sdata->wdev,
GFP_KERNEL);
}
}
@@ -1501,103 +1932,126 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local)
err:
list_for_each_entry(ctx, &local->chanctx_list, list) {
+ struct ieee80211_chanctx_user_iter iter;
+
if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER)
continue;
- list_for_each_entry_safe(sdata, sdata_tmp, &ctx->reserved_vifs,
- reserved_chanctx_list) {
- ieee80211_vif_unreserve_chanctx(sdata);
- ieee80211_vif_chanctx_reservation_complete(sdata);
+ for_each_chanctx_user_reserved(local, ctx, &iter) {
+ ieee80211_link_unreserve_chanctx(iter.link);
+ ieee80211_link_chanctx_reservation_complete(iter.link);
}
}
return err;
}
-static void __ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata)
+void __ieee80211_link_release_channel(struct ieee80211_link_data *link,
+ bool skip_idle_recalc)
{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+ struct ieee80211_bss_conf *link_conf = link->conf;
struct ieee80211_local *local = sdata->local;
struct ieee80211_chanctx_conf *conf;
struct ieee80211_chanctx *ctx;
bool use_reserved_switch = false;
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
- conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
- lockdep_is_held(&local->chanctx_mtx));
+ conf = rcu_dereference_protected(link_conf->chanctx_conf,
+ lockdep_is_held(&local->hw.wiphy->mtx));
if (!conf)
return;
ctx = container_of(conf, struct ieee80211_chanctx, conf);
- if (sdata->reserved_chanctx) {
- if (sdata->reserved_chanctx->replace_state ==
- IEEE80211_CHANCTX_REPLACES_OTHER &&
- ieee80211_chanctx_num_reserved(local,
- sdata->reserved_chanctx) > 1)
+ if (link->reserved_chanctx) {
+ if (link->reserved_chanctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER &&
+ ieee80211_chanctx_num_reserved(local, link->reserved_chanctx) > 1)
use_reserved_switch = true;
- ieee80211_vif_unreserve_chanctx(sdata);
+ ieee80211_link_unreserve_chanctx(link);
}
- ieee80211_assign_vif_chanctx(sdata, NULL);
+ ieee80211_assign_link_chanctx(link, NULL, false);
if (ieee80211_chanctx_refcount(local, ctx) == 0)
- ieee80211_free_chanctx(local, ctx);
+ ieee80211_free_chanctx(local, ctx, skip_idle_recalc);
- sdata->radar_required = false;
+ link->radar_required = false;
/* Unreserving may ready an in-place reservation. */
if (use_reserved_switch)
ieee80211_vif_use_reserved_switch(local);
}
-int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata,
- const struct cfg80211_chan_def *chandef,
- enum ieee80211_chanctx_mode mode)
+int _ieee80211_link_use_channel(struct ieee80211_link_data *link,
+ const struct ieee80211_chan_req *chanreq,
+ enum ieee80211_chanctx_mode mode,
+ bool assign_on_failure)
{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
struct ieee80211_local *local = sdata->local;
struct ieee80211_chanctx *ctx;
u8 radar_detect_width = 0;
+ bool reserved = false;
+ int radio_idx;
int ret;
- lockdep_assert_held(&local->mtx);
-
- WARN_ON(sdata->dev && netif_carrier_ok(sdata->dev));
+ lockdep_assert_wiphy(local->hw.wiphy);
- mutex_lock(&local->chanctx_mtx);
+ if (!ieee80211_vif_link_active(&sdata->vif, link->link_id)) {
+ ieee80211_link_update_chanreq(link, chanreq);
+ return 0;
+ }
ret = cfg80211_chandef_dfs_required(local->hw.wiphy,
- chandef,
+ &chanreq->oper,
sdata->wdev.iftype);
if (ret < 0)
goto out;
if (ret > 0)
- radar_detect_width = BIT(chandef->width);
+ radar_detect_width = BIT(chanreq->oper.width);
- sdata->radar_required = ret;
+ link->radar_required = ret;
- ret = ieee80211_check_combinations(sdata, chandef, mode,
- radar_detect_width);
+ ret = ieee80211_check_combinations(sdata, &chanreq->oper, mode,
+ radar_detect_width, -1);
if (ret < 0)
goto out;
- __ieee80211_vif_release_channel(sdata);
-
- ctx = ieee80211_find_chanctx(local, chandef, mode);
- if (!ctx)
- ctx = ieee80211_new_chanctx(local, chandef, mode);
+ if (!local->in_reconfig)
+ __ieee80211_link_release_channel(link, false);
+
+ ctx = ieee80211_find_chanctx(local, link, chanreq, mode);
+ /* Note: context is now reserved */
+ if (ctx)
+ reserved = true;
+ else if (!ieee80211_find_available_radio(local, chanreq,
+ sdata->wdev.radio_mask,
+ &radio_idx))
+ ctx = ERR_PTR(-EBUSY);
+ else
+ ctx = ieee80211_new_chanctx(local, chanreq, mode,
+ assign_on_failure, radio_idx);
if (IS_ERR(ctx)) {
ret = PTR_ERR(ctx);
goto out;
}
- ieee80211_vif_update_chandef(sdata, chandef);
+ ieee80211_link_update_chanreq(link, chanreq);
+
+ ret = ieee80211_assign_link_chanctx(link, ctx, assign_on_failure);
+
+ if (reserved) {
+ /* remove reservation */
+ WARN_ON(link->reserved_chanctx != ctx);
+ link->reserved_chanctx = NULL;
+ }
- ret = ieee80211_assign_vif_chanctx(sdata, ctx);
if (ret) {
/* if assign fails refcount stays the same */
if (ieee80211_chanctx_refcount(local, ctx) == 0)
- ieee80211_free_chanctx(local, ctx);
+ ieee80211_free_chanctx(local, ctx, false);
goto out;
}
@@ -1605,24 +2059,23 @@ int ieee80211_vif_use_channel(struct ieee80211_sub_if_data *sdata,
ieee80211_recalc_radar_chanctx(local, ctx);
out:
if (ret)
- sdata->radar_required = false;
+ link->radar_required = false;
- mutex_unlock(&local->chanctx_mtx);
return ret;
}
-int ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata)
+int ieee80211_link_use_reserved_context(struct ieee80211_link_data *link)
{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
struct ieee80211_local *local = sdata->local;
struct ieee80211_chanctx *new_ctx;
struct ieee80211_chanctx *old_ctx;
int err;
- lockdep_assert_held(&local->mtx);
- lockdep_assert_held(&local->chanctx_mtx);
+ lockdep_assert_wiphy(local->hw.wiphy);
- new_ctx = sdata->reserved_chanctx;
- old_ctx = ieee80211_vif_get_chanctx(sdata);
+ new_ctx = link->reserved_chanctx;
+ old_ctx = ieee80211_link_get_chanctx(link);
if (WARN_ON(!new_ctx))
return -EINVAL;
@@ -1631,19 +2084,16 @@ int ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata)
IEEE80211_CHANCTX_WILL_BE_REPLACED))
return -EINVAL;
- if (WARN_ON(sdata->reserved_ready))
+ if (WARN_ON(link->reserved_ready))
return -EINVAL;
- sdata->reserved_ready = true;
+ link->reserved_ready = true;
if (new_ctx->replace_state == IEEE80211_CHANCTX_REPLACE_NONE) {
if (old_ctx)
- err = ieee80211_vif_use_reserved_reassign(sdata);
- else
- err = ieee80211_vif_use_reserved_assign(sdata);
+ return ieee80211_link_use_reserved_reassign(link);
- if (err)
- return err;
+ return ieee80211_link_use_reserved_assign(link);
}
/*
@@ -1675,59 +2125,90 @@ int ieee80211_vif_use_reserved_context(struct ieee80211_sub_if_data *sdata)
return 0;
}
-int ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata,
- const struct cfg80211_chan_def *chandef,
- u32 *changed)
+/*
+ * This is similar to ieee80211_chanctx_compatible(), but rechecks
+ * against all the links actually using it (except the one that's
+ * passed, since that one is changing).
+ * This is done in order to allow changes to the AP's bandwidth for
+ * wider bandwidth OFDMA purposes, which wouldn't be treated as
+ * compatible by ieee80211_chanctx_compatible() but is OK if the link
+ * requesting the update is the only one using it.
+ */
+static const struct ieee80211_chan_req *
+ieee80211_chanctx_recheck(struct ieee80211_local *local,
+ struct ieee80211_link_data *skip_link,
+ struct ieee80211_chanctx *ctx,
+ const struct ieee80211_chan_req *req,
+ struct ieee80211_chan_req *tmp)
{
+ const struct ieee80211_chan_req *ret = req;
+ struct ieee80211_chanctx_user_iter iter;
+
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ for_each_chanctx_user_all(local, ctx, &iter) {
+ if (iter.link == skip_link)
+ continue;
+
+ ret = ieee80211_chanreq_compatible(ret, iter.chanreq, tmp);
+ if (!ret)
+ return NULL;
+ }
+
+ *tmp = *ret;
+ return tmp;
+}
+
+int ieee80211_link_change_chanreq(struct ieee80211_link_data *link,
+ const struct ieee80211_chan_req *chanreq,
+ u64 *changed)
+{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+ struct ieee80211_bss_conf *link_conf = link->conf;
struct ieee80211_local *local = sdata->local;
struct ieee80211_chanctx_conf *conf;
struct ieee80211_chanctx *ctx;
- const struct cfg80211_chan_def *compat;
- int ret;
+ const struct ieee80211_chan_req *compat;
+ struct ieee80211_chan_req tmp;
+
+ lockdep_assert_wiphy(local->hw.wiphy);
- if (!cfg80211_chandef_usable(sdata->local->hw.wiphy, chandef,
+ if (!cfg80211_chandef_usable(sdata->local->hw.wiphy,
+ &chanreq->oper,
IEEE80211_CHAN_DISABLED))
return -EINVAL;
- mutex_lock(&local->chanctx_mtx);
- if (cfg80211_chandef_identical(chandef, &sdata->vif.bss_conf.chandef)) {
- ret = 0;
- goto out;
- }
+ /* for non-HT 20 MHz the rest doesn't matter */
+ if (chanreq->oper.width == NL80211_CHAN_WIDTH_20_NOHT &&
+ cfg80211_chandef_identical(&chanreq->oper, &link_conf->chanreq.oper))
+ return 0;
- if (chandef->width == NL80211_CHAN_WIDTH_20_NOHT ||
- sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT) {
- ret = -EINVAL;
- goto out;
- }
+ /* but you cannot switch to/from it */
+ if (chanreq->oper.width == NL80211_CHAN_WIDTH_20_NOHT ||
+ link_conf->chanreq.oper.width == NL80211_CHAN_WIDTH_20_NOHT)
+ return -EINVAL;
- conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
- lockdep_is_held(&local->chanctx_mtx));
- if (!conf) {
- ret = -EINVAL;
- goto out;
- }
+ conf = rcu_dereference_protected(link_conf->chanctx_conf,
+ lockdep_is_held(&local->hw.wiphy->mtx));
+ if (!conf)
+ return -EINVAL;
ctx = container_of(conf, struct ieee80211_chanctx, conf);
- compat = cfg80211_chandef_compatible(&conf->def, chandef);
- if (!compat) {
- ret = -EINVAL;
- goto out;
- }
+ compat = ieee80211_chanctx_recheck(local, link, ctx, chanreq, &tmp);
+ if (!compat)
+ return -EINVAL;
switch (ctx->replace_state) {
case IEEE80211_CHANCTX_REPLACE_NONE:
- if (!ieee80211_chanctx_reserved_chandef(local, ctx, compat)) {
- ret = -EBUSY;
- goto out;
- }
+ if (!ieee80211_chanctx_reserved_chanreq(local, ctx, compat,
+ &tmp))
+ return -EBUSY;
break;
case IEEE80211_CHANCTX_WILL_BE_REPLACED:
/* TODO: Perhaps the bandwidth change could be treated as a
* reservation itself? */
- ret = -EBUSY;
- goto out;
+ return -EBUSY;
case IEEE80211_CHANCTX_REPLACES_OTHER:
/* channel context that is going to replace another channel
* context doesn't really exist and shouldn't be assigned
@@ -1736,45 +2217,49 @@ int ieee80211_vif_change_bandwidth(struct ieee80211_sub_if_data *sdata,
break;
}
- ieee80211_vif_update_chandef(sdata, chandef);
+ ieee80211_link_update_chanreq(link, chanreq);
ieee80211_recalc_chanctx_chantype(local, ctx);
*changed |= BSS_CHANGED_BANDWIDTH;
- ret = 0;
- out:
- mutex_unlock(&local->chanctx_mtx);
- return ret;
+ return 0;
}
-void ieee80211_vif_release_channel(struct ieee80211_sub_if_data *sdata)
+void ieee80211_link_release_channel(struct ieee80211_link_data *link)
{
- WARN_ON(sdata->dev && netif_carrier_ok(sdata->dev));
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
+ return;
- lockdep_assert_held(&sdata->local->mtx);
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
- mutex_lock(&sdata->local->chanctx_mtx);
- __ieee80211_vif_release_channel(sdata);
- mutex_unlock(&sdata->local->chanctx_mtx);
+ if (rcu_access_pointer(link->conf->chanctx_conf))
+ __ieee80211_link_release_channel(link, false);
}
-void ieee80211_vif_vlan_copy_chanctx(struct ieee80211_sub_if_data *sdata)
+void ieee80211_link_vlan_copy_chanctx(struct ieee80211_link_data *link)
{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+ unsigned int link_id = link->link_id;
+ struct ieee80211_bss_conf *link_conf = link->conf;
+ struct ieee80211_bss_conf *ap_conf;
struct ieee80211_local *local = sdata->local;
struct ieee80211_sub_if_data *ap;
struct ieee80211_chanctx_conf *conf;
+ lockdep_assert_wiphy(local->hw.wiphy);
+
if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->bss))
return;
ap = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap);
- mutex_lock(&local->chanctx_mtx);
-
- conf = rcu_dereference_protected(ap->vif.chanctx_conf,
- lockdep_is_held(&local->chanctx_mtx));
- rcu_assign_pointer(sdata->vif.chanctx_conf, conf);
- mutex_unlock(&local->chanctx_mtx);
+ ap_conf = wiphy_dereference(local->hw.wiphy,
+ ap->vif.link_conf[link_id]);
+ conf = wiphy_dereference(local->hw.wiphy,
+ ap_conf->chanctx_conf);
+ rcu_assign_pointer(link_conf->chanctx_conf, conf);
}
void ieee80211_iter_chan_contexts_atomic(
@@ -1794,3 +2279,21 @@ void ieee80211_iter_chan_contexts_atomic(
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(ieee80211_iter_chan_contexts_atomic);
+
+void ieee80211_iter_chan_contexts_mtx(
+ struct ieee80211_hw *hw,
+ void (*iter)(struct ieee80211_hw *hw,
+ struct ieee80211_chanctx_conf *chanctx_conf,
+ void *data),
+ void *iter_data)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_chanctx *ctx;
+
+ lockdep_assert_wiphy(hw->wiphy);
+
+ list_for_each_entry(ctx, &local->chanctx_list, list)
+ if (ctx->driver_present)
+ iter(hw, &ctx->conf, iter_data);
+}
+EXPORT_SYMBOL_GPL(ieee80211_iter_chan_contexts_mtx);
diff --git a/net/mac80211/debug.h b/net/mac80211/debug.h
index d90a8f9cc3fd..ef7c1a68d88d 100644
--- a/net/mac80211/debug.h
+++ b/net/mac80211/debug.h
@@ -1,6 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Portions
+ * Copyright (C) 2022 - 2025 Intel Corporation
+ */
#ifndef __MAC80211_DEBUG_H
#define __MAC80211_DEBUG_H
+#include <linux/once_lite.h>
#include <net/cfg80211.h>
#ifdef CONFIG_MAC80211_OCB_DEBUG
@@ -130,6 +135,46 @@ do { \
#define sdata_dbg(sdata, fmt, ...) \
_sdata_dbg(1, sdata, fmt, ##__VA_ARGS__)
+#define link_info(link, fmt, ...) \
+ do { \
+ if (ieee80211_vif_is_mld(&(link)->sdata->vif)) \
+ _sdata_info((link)->sdata, "[link %d] " fmt, \
+ (link)->link_id, \
+ ##__VA_ARGS__); \
+ else \
+ _sdata_info((link)->sdata, fmt, ##__VA_ARGS__); \
+ } while (0)
+#define link_err(link, fmt, ...) \
+ do { \
+ if (ieee80211_vif_is_mld(&(link)->sdata->vif)) \
+ _sdata_err((link)->sdata, "[link %d] " fmt, \
+ (link)->link_id, \
+ ##__VA_ARGS__); \
+ else \
+ _sdata_err((link)->sdata, fmt, ##__VA_ARGS__); \
+ } while (0)
+#define link_err_once(link, fmt, ...) \
+ DO_ONCE_LITE(link_err, link, fmt, ##__VA_ARGS__)
+#define link_id_info(sdata, link_id, fmt, ...) \
+ do { \
+ if (ieee80211_vif_is_mld(&sdata->vif)) \
+ _sdata_info(sdata, "[link %d] " fmt, link_id, \
+ ##__VA_ARGS__); \
+ else \
+ _sdata_info(sdata, fmt, ##__VA_ARGS__); \
+ } while (0)
+#define _link_id_dbg(print, sdata, link_id, fmt, ...) \
+ do { \
+ if (ieee80211_vif_is_mld(&(sdata)->vif)) \
+ _sdata_dbg(print, sdata, "[link %d] " fmt, \
+ link_id, ##__VA_ARGS__); \
+ else \
+ _sdata_dbg(print, sdata, fmt, ##__VA_ARGS__); \
+ } while (0)
+#define link_dbg(link, fmt, ...) \
+ _link_id_dbg(1, (link)->sdata, (link)->link_id, \
+ fmt, ##__VA_ARGS__)
+
#define ht_dbg(sdata, fmt, ...) \
_sdata_dbg(MAC80211_HT_DEBUG, \
sdata, fmt, ##__VA_ARGS__)
@@ -193,6 +238,9 @@ do { \
#define mlme_dbg(sdata, fmt, ...) \
_sdata_dbg(MAC80211_MLME_DEBUG, \
sdata, fmt, ##__VA_ARGS__)
+#define mlme_link_id_dbg(sdata, link_id, fmt, ...) \
+ _link_id_dbg(MAC80211_MLME_DEBUG, sdata, link_id, \
+ fmt, ##__VA_ARGS__)
#define mlme_dbg_ratelimited(sdata, fmt, ...) \
_sdata_dbg(MAC80211_MLME_DEBUG && net_ratelimit(), \
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index b5adf3625d16..d02f07368c51 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mac80211 debugfs for wireless PHYs
*
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
- *
- * GPLv2
- *
+ * Copyright (C) 2018 - 2019, 2021-2025 Intel Corporation
*/
#include <linux/debugfs.h>
@@ -43,9 +42,8 @@ static ssize_t name## _read(struct file *file, char __user *userbuf, \
}
#define DEBUGFS_READONLY_FILE_OPS(name) \
-static const struct file_operations name## _ops = { \
+static const struct debugfs_short_fops name## _ops = { \
.read = name## _read, \
- .open = simple_open, \
.llseek = generic_file_llseek, \
};
@@ -54,12 +52,14 @@ static const struct file_operations name## _ops = { \
DEBUGFS_READONLY_FILE_OPS(name)
#define DEBUGFS_ADD(name) \
- debugfs_create_file(#name, 0400, phyd, local, &name## _ops);
+ debugfs_create_file(#name, 0400, phyd, local, &name## _ops)
#define DEBUGFS_ADD_MODE(name, mode) \
debugfs_create_file(#name, mode, phyd, local, &name## _ops);
+DEBUGFS_READONLY_FILE(hw_conf, "%x",
+ local->hw.conf.flags);
DEBUGFS_READONLY_FILE(user_power, "%d",
local->user_power_level);
DEBUGFS_READONLY_FILE(power, "%d",
@@ -82,7 +82,6 @@ static ssize_t aqm_read(struct file *file,
int len = 0;
spin_lock_bh(&local->fq.lock);
- rcu_read_lock();
len = scnprintf(buf, sizeof(buf),
"access name value\n"
@@ -105,7 +104,6 @@ static ssize_t aqm_read(struct file *file,
fq->limit,
fq->quantum);
- rcu_read_unlock();
spin_unlock_bh(&local->fq.lock);
return simple_read_from_buffer(user_buf, count, ppos,
@@ -119,18 +117,17 @@ static ssize_t aqm_write(struct file *file,
{
struct ieee80211_local *local = file->private_data;
char buf[100];
- size_t len;
- if (count > sizeof(buf))
+ if (count >= sizeof(buf))
return -EINVAL;
if (copy_from_user(buf, user_buf, count))
return -EFAULT;
- buf[sizeof(buf) - 1] = '\0';
- len = strlen(buf);
- if (len > 0 && buf[len-1] == '\n')
- buf[len-1] = 0;
+ if (count && buf[count - 1] == '\n')
+ buf[count - 1] = '\0';
+ else
+ buf[count] = '\0';
if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1)
return count;
@@ -142,10 +139,268 @@ static ssize_t aqm_write(struct file *file,
return -EINVAL;
}
-static const struct file_operations aqm_ops = {
+static const struct debugfs_short_fops aqm_ops = {
.write = aqm_write,
.read = aqm_read,
- .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static ssize_t airtime_flags_read(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ char buf[128] = {}, *pos, *end;
+
+ pos = buf;
+ end = pos + sizeof(buf) - 1;
+
+ if (local->airtime_flags & AIRTIME_USE_TX)
+ pos += scnprintf(pos, end - pos, "AIRTIME_TX\t(%lx)\n",
+ AIRTIME_USE_TX);
+ if (local->airtime_flags & AIRTIME_USE_RX)
+ pos += scnprintf(pos, end - pos, "AIRTIME_RX\t(%lx)\n",
+ AIRTIME_USE_RX);
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf,
+ strlen(buf));
+}
+
+static ssize_t airtime_flags_write(struct file *file,
+ const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ char buf[16];
+
+ if (count >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(buf, user_buf, count))
+ return -EFAULT;
+
+ if (count && buf[count - 1] == '\n')
+ buf[count - 1] = '\0';
+ else
+ buf[count] = '\0';
+
+ if (kstrtou16(buf, 0, &local->airtime_flags))
+ return -EINVAL;
+
+ return count;
+}
+
+static const struct debugfs_short_fops airtime_flags_ops = {
+ .write = airtime_flags_write,
+ .read = airtime_flags_read,
+ .llseek = default_llseek,
+};
+
+static ssize_t aql_pending_read(struct file *file,
+ char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ char buf[400];
+ int len = 0;
+
+ len = scnprintf(buf, sizeof(buf),
+ "AC AQL pending\n"
+ "VO %u us\n"
+ "VI %u us\n"
+ "BE %u us\n"
+ "BK %u us\n"
+ "total %u us\n",
+ atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_VO]),
+ atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_VI]),
+ atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_BE]),
+ atomic_read(&local->aql_ac_pending_airtime[IEEE80211_AC_BK]),
+ atomic_read(&local->aql_total_pending_airtime));
+ return simple_read_from_buffer(user_buf, count, ppos,
+ buf, len);
+}
+
+static const struct debugfs_short_fops aql_pending_ops = {
+ .read = aql_pending_read,
+ .llseek = default_llseek,
+};
+
+static ssize_t aql_txq_limit_read(struct file *file,
+ char __user *user_buf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ char buf[400];
+ int len = 0;
+
+ len = scnprintf(buf, sizeof(buf),
+ "AC AQL limit low AQL limit high\n"
+ "VO %u %u\n"
+ "VI %u %u\n"
+ "BE %u %u\n"
+ "BK %u %u\n",
+ local->aql_txq_limit_low[IEEE80211_AC_VO],
+ local->aql_txq_limit_high[IEEE80211_AC_VO],
+ local->aql_txq_limit_low[IEEE80211_AC_VI],
+ local->aql_txq_limit_high[IEEE80211_AC_VI],
+ local->aql_txq_limit_low[IEEE80211_AC_BE],
+ local->aql_txq_limit_high[IEEE80211_AC_BE],
+ local->aql_txq_limit_low[IEEE80211_AC_BK],
+ local->aql_txq_limit_high[IEEE80211_AC_BK]);
+ return simple_read_from_buffer(user_buf, count, ppos,
+ buf, len);
+}
+
+static ssize_t aql_txq_limit_write(struct file *file,
+ const char __user *user_buf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ char buf[100];
+ u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old;
+ struct sta_info *sta;
+
+ if (count >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(buf, user_buf, count))
+ return -EFAULT;
+
+ if (count && buf[count - 1] == '\n')
+ buf[count - 1] = '\0';
+ else
+ buf[count] = '\0';
+
+ if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3)
+ return -EINVAL;
+
+ if (ac >= IEEE80211_NUM_ACS)
+ return -EINVAL;
+
+ q_limit_low_old = local->aql_txq_limit_low[ac];
+ q_limit_high_old = local->aql_txq_limit_high[ac];
+
+ guard(wiphy)(local->hw.wiphy);
+
+ local->aql_txq_limit_low[ac] = q_limit_low;
+ local->aql_txq_limit_high[ac] = q_limit_high;
+
+ list_for_each_entry(sta, &local->sta_list, list) {
+ /* If a sta has customized queue limits, keep it */
+ if (sta->airtime[ac].aql_limit_low == q_limit_low_old &&
+ sta->airtime[ac].aql_limit_high == q_limit_high_old) {
+ sta->airtime[ac].aql_limit_low = q_limit_low;
+ sta->airtime[ac].aql_limit_high = q_limit_high;
+ }
+ }
+
+ return count;
+}
+
+static const struct debugfs_short_fops aql_txq_limit_ops = {
+ .write = aql_txq_limit_write,
+ .read = aql_txq_limit_read,
+ .llseek = default_llseek,
+};
+
+static ssize_t aql_enable_read(struct file *file, char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ char buf[3];
+ int len;
+
+ len = scnprintf(buf, sizeof(buf), "%d\n",
+ !static_key_false(&aql_disable.key));
+
+ return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t aql_enable_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ bool aql_disabled = static_key_false(&aql_disable.key);
+ char buf[3];
+ size_t len;
+
+ if (count > sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(buf, user_buf, count))
+ return -EFAULT;
+
+ buf[sizeof(buf) - 1] = '\0';
+ len = strlen(buf);
+ if (len > 0 && buf[len - 1] == '\n')
+ buf[len - 1] = 0;
+
+ if (buf[0] == '0' && buf[1] == '\0') {
+ if (!aql_disabled)
+ static_branch_inc(&aql_disable);
+ } else if (buf[0] == '1' && buf[1] == '\0') {
+ if (aql_disabled)
+ static_branch_dec(&aql_disable);
+ } else {
+ return -EINVAL;
+ }
+
+ return count;
+}
+
+static const struct debugfs_short_fops aql_enable_ops = {
+ .write = aql_enable_write,
+ .read = aql_enable_read,
+ .llseek = default_llseek,
+};
+
+static ssize_t force_tx_status_read(struct file *file,
+ char __user *user_buf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ char buf[3];
+ int len = 0;
+
+ len = scnprintf(buf, sizeof(buf), "%d\n", (int)local->force_tx_status);
+
+ return simple_read_from_buffer(user_buf, count, ppos,
+ buf, len);
+}
+
+static ssize_t force_tx_status_write(struct file *file,
+ const char __user *user_buf,
+ size_t count,
+ loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ char buf[3];
+
+ if (count >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(buf, user_buf, count))
+ return -EFAULT;
+
+ if (count && buf[count - 1] == '\n')
+ buf[count - 1] = '\0';
+ else
+ buf[count] = '\0';
+
+ if (buf[0] == '0' && buf[1] == '\0')
+ local->force_tx_status = 0;
+ else if (buf[0] == '1' && buf[1] == '\0')
+ local->force_tx_status = 1;
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static const struct debugfs_short_fops force_tx_status_ops = {
+ .write = force_tx_status_write,
+ .read = force_tx_status_read,
.llseek = default_llseek,
};
@@ -154,18 +409,24 @@ static ssize_t reset_write(struct file *file, const char __user *user_buf,
size_t count, loff_t *ppos)
{
struct ieee80211_local *local = file->private_data;
+ int ret;
rtnl_lock();
+ wiphy_lock(local->hw.wiphy);
__ieee80211_suspend(&local->hw, NULL);
- __ieee80211_resume(&local->hw);
+ ret = __ieee80211_resume(&local->hw);
+ wiphy_unlock(local->hw.wiphy);
+
+ if (ret)
+ cfg80211_shutdown_all_interfaces(local->hw.wiphy);
+
rtnl_unlock();
return count;
}
-static const struct file_operations reset_ops = {
+static const struct debugfs_short_fops reset_ops = {
.write = reset_write,
- .open = simple_open,
.llseek = noop_llseek,
};
#endif
@@ -185,6 +446,7 @@ static const char *hw_flag_names[] = {
FLAG(SUPPORTS_DYNAMIC_PS),
FLAG(MFP_CAPABLE),
FLAG(WANT_MONITOR_VIF),
+ FLAG(NO_VIRTUAL_MONITOR),
FLAG(NO_AUTO_VIF),
FLAG(SW_CRYPTO_CONTROL),
FLAG(SUPPORT_FAST_XMIT),
@@ -212,8 +474,22 @@ static const char *hw_flag_names[] = {
FLAG(REPORTS_LOW_ACK),
FLAG(SUPPORTS_TX_FRAG),
FLAG(SUPPORTS_TDLS_BUFFER_STA),
- FLAG(DEAUTH_NEED_MGD_TX_PREP),
FLAG(DOESNT_SUPPORT_QOS_NDP),
+ FLAG(BUFF_MMPDU_TXQ),
+ FLAG(SUPPORTS_VHT_EXT_NSS_BW),
+ FLAG(STA_MMPDU_TXQ),
+ FLAG(TX_STATUS_NO_AMPDU_LEN),
+ FLAG(SUPPORTS_MULTI_BSSID),
+ FLAG(SUPPORTS_ONLY_HE_MULTI_BSSID),
+ FLAG(AMPDU_KEYBORDER_SUPPORT),
+ FLAG(SUPPORTS_TX_ENCAP_OFFLOAD),
+ FLAG(SUPPORTS_RX_DECAP_OFFLOAD),
+ FLAG(SUPPORTS_CONC_MON_RX_DECAP),
+ FLAG(DETECTS_COLOR_COLLISION),
+ FLAG(MLO_MCAST_MULTI_LINK_TX),
+ FLAG(DISALLOW_PUNCTURING),
+ FLAG(HANDLES_QUIET_CSA),
+ FLAG(STRICT),
#undef FLAG
};
@@ -246,6 +522,46 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf,
return rv;
}
+static ssize_t hwflags_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ char buf[100];
+ int val;
+
+ if (count >= sizeof(buf)) /* keep one byte free for the terminator */
+ return -EINVAL;
+
+ if (copy_from_user(buf, user_buf, count))
+ return -EFAULT;
+
+ if (count && buf[count - 1] == '\n')
+ buf[count - 1] = '\0'; /* replace a trailing newline with the terminator */
+ else
+ buf[count] = '\0';
+
+ if (sscanf(buf, "strict=%d", &val) == 1) { /* only "strict=<0|1>" is accepted */
+ switch (val) {
+ case 0: /* strict=0 clears the STRICT hw flag */
+ __clear_bit(IEEE80211_HW_STRICT, local->hw.flags);
+ return count;
+ case 1: /* strict=1 sets the STRICT hw flag */
+ ieee80211_hw_set(&local->hw, STRICT);
+ return count;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ return -EINVAL;
+}
+
+static const struct file_operations hwflags_ops = {
+ .open = simple_open,
+ .read = hwflags_read,
+ .write = hwflags_write,
+};
+
static ssize_t misc_read(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
@@ -296,7 +612,6 @@ static ssize_t queues_read(struct file *file, char __user *user_buf,
return simple_read_from_buffer(user_buf, count, ppos, buf, res);
}
-DEBUGFS_READONLY_FILE_OPS(hwflags);
DEBUGFS_READONLY_FILE_OPS(queues);
DEBUGFS_READONLY_FILE_OPS(misc);
@@ -312,9 +627,9 @@ static ssize_t format_devstat_counter(struct ieee80211_local *local,
char buf[20];
int res;
- rtnl_lock();
+ wiphy_lock(local->hw.wiphy);
res = drv_get_stats(local, &stats);
- rtnl_unlock();
+ wiphy_unlock(local->hw.wiphy);
if (res)
return res;
res = printvalue(&stats, buf, sizeof(buf));
@@ -338,14 +653,15 @@ static ssize_t stats_ ##name## _read(struct file *file, \
print_devstats_##name); \
} \
\
-static const struct file_operations stats_ ##name## _ops = { \
+static const struct debugfs_short_fops stats_ ##name## _ops = { \
.read = stats_ ##name## _read, \
- .open = simple_open, \
.llseek = generic_file_llseek, \
};
+#ifdef CONFIG_MAC80211_DEBUG_COUNTERS
#define DEBUGFS_STATS_ADD(name) \
debugfs_create_u32(#name, 0400, statsd, &local->name);
+#endif
#define DEBUGFS_DEVSTATS_ADD(name) \
debugfs_create_file(#name, 0400, statsd, local, &stats_ ##name## _ops);
@@ -372,18 +688,22 @@ void debugfs_hw_add(struct ieee80211_local *local)
#ifdef CONFIG_PM
DEBUGFS_ADD_MODE(reset, 0200);
#endif
- DEBUGFS_ADD(hwflags);
+ DEBUGFS_ADD_MODE(hwflags, 0600);
DEBUGFS_ADD(user_power);
DEBUGFS_ADD(power);
+ DEBUGFS_ADD(hw_conf);
+ DEBUGFS_ADD_MODE(force_tx_status, 0600);
+ DEBUGFS_ADD_MODE(aql_enable, 0600);
+ DEBUGFS_ADD(aql_pending);
+ DEBUGFS_ADD_MODE(aqm, 0600);
- if (local->ops->wake_tx_queue)
- DEBUGFS_ADD_MODE(aqm, 0600);
+ DEBUGFS_ADD_MODE(airtime_flags, 0600);
- statsd = debugfs_create_dir("statistics", phyd);
+ DEBUGFS_ADD(aql_txq_limit);
+ debugfs_create_u32("aql_threshold", 0600,
+ phyd, &local->aql_threshold);
- /* if the dir failed, don't put all the other things into the root! */
- if (!statsd)
- return;
+ statsd = debugfs_create_dir("statistics", phyd);
#ifdef CONFIG_MAC80211_DEBUG_COUNTERS
DEBUGFS_STATS_ADD(dot11TransmittedFragmentCount);
@@ -395,7 +715,6 @@ void debugfs_hw_add(struct ieee80211_local *local)
DEBUGFS_STATS_ADD(dot11ReceivedFragmentCount);
DEBUGFS_STATS_ADD(dot11MulticastReceivedFrameCount);
DEBUGFS_STATS_ADD(dot11TransmittedFrameCount);
- DEBUGFS_STATS_ADD(tx_handlers_drop);
DEBUGFS_STATS_ADD(tx_handlers_queued);
DEBUGFS_STATS_ADD(tx_handlers_drop_wep);
DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc);
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
index a2ef95f16f11..117f58af5ff9 100644
--- a/net/mac80211/debugfs_key.c
+++ b/net/mac80211/debugfs_key.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2003-2005 Devicescape Software, Inc.
* Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright (C) 2015 Intel Deutschland GmbH
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * Copyright (C) 2021-2023 Intel Corporation
*/
#include <linux/kobject.h>
@@ -25,21 +23,18 @@ static ssize_t key_##name##_read(struct file *file, \
return mac80211_format_buffer(userbuf, count, ppos, \
format_string, key->prop); \
}
-#define KEY_READ_D(name) KEY_READ(name, name, "%d\n")
#define KEY_READ_X(name) KEY_READ(name, name, "0x%x\n")
#define KEY_OPS(name) \
-static const struct file_operations key_ ##name## _ops = { \
+static const struct debugfs_short_fops key_ ##name## _ops = { \
.read = key_##name##_read, \
- .open = simple_open, \
.llseek = generic_file_llseek, \
}
#define KEY_OPS_W(name) \
-static const struct file_operations key_ ##name## _ops = { \
+static const struct debugfs_short_fops key_ ##name## _ops = { \
.read = key_##name##_read, \
.write = key_##name##_write, \
- .open = simple_open, \
.llseek = generic_file_llseek, \
}
@@ -52,9 +47,8 @@ static const struct file_operations key_ ##name## _ops = { \
#define KEY_CONF_READ_D(name) KEY_CONF_READ(name, "%d\n")
#define KEY_CONF_OPS(name) \
-static const struct file_operations key_ ##name## _ops = { \
+static const struct debugfs_short_fops key_ ##name## _ops = { \
.read = key_conf_##name##_read, \
- .open = simple_open, \
.llseek = generic_file_llseek, \
}
@@ -322,7 +316,7 @@ KEY_OPS(key);
#define DEBUGFS_ADD(name) \
debugfs_create_file(#name, 0400, key->debugfs.dir, \
- key, &key_##name##_ops);
+ key, &key_##name##_ops)
#define DEBUGFS_ADD_W(name) \
debugfs_create_file(#name, 0600, key->debugfs.dir, \
key, &key_##name##_ops);
@@ -342,9 +336,6 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key)
key->debugfs.dir = debugfs_create_dir(buf,
key->local->debugfs.keys);
- if (!key->debugfs.dir)
- return;
-
sta = key->sta;
if (sta) {
sprintf(buf, "../../netdev:%s/stations/%pM",
@@ -384,14 +375,14 @@ void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata)
if (!sdata->vif.debugfs_dir)
return;
- lockdep_assert_held(&sdata->local->key_mtx);
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
debugfs_remove(sdata->debugfs.default_unicast_key);
sdata->debugfs.default_unicast_key = NULL;
if (sdata->default_unicast_key) {
- key = key_mtx_dereference(sdata->local,
- sdata->default_unicast_key);
+ key = wiphy_dereference(sdata->local->hw.wiphy,
+ sdata->default_unicast_key);
sprintf(buf, "../keys/%d", key->debugfs.cnt);
sdata->debugfs.default_unicast_key =
debugfs_create_symlink("default_unicast_key",
@@ -401,9 +392,9 @@ void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata)
debugfs_remove(sdata->debugfs.default_multicast_key);
sdata->debugfs.default_multicast_key = NULL;
- if (sdata->default_multicast_key) {
- key = key_mtx_dereference(sdata->local,
- sdata->default_multicast_key);
+ if (sdata->deflink.default_multicast_key) {
+ key = wiphy_dereference(sdata->local->hw.wiphy,
+ sdata->deflink.default_multicast_key);
sprintf(buf, "../keys/%d", key->debugfs.cnt);
sdata->debugfs.default_multicast_key =
debugfs_create_symlink("default_multicast_key",
@@ -411,25 +402,6 @@ void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata)
}
}
-void ieee80211_debugfs_key_add_mgmt_default(struct ieee80211_sub_if_data *sdata)
-{
- char buf[50];
- struct ieee80211_key *key;
-
- if (!sdata->vif.debugfs_dir)
- return;
-
- key = key_mtx_dereference(sdata->local,
- sdata->default_mgmt_key);
- if (key) {
- sprintf(buf, "../keys/%d", key->debugfs.cnt);
- sdata->debugfs.default_mgmt_key =
- debugfs_create_symlink("default_mgmt_key",
- sdata->vif.debugfs_dir, buf);
- } else
- ieee80211_debugfs_key_remove_mgmt_default(sdata);
-}
-
void ieee80211_debugfs_key_remove_mgmt_default(struct ieee80211_sub_if_data *sdata)
{
if (!sdata)
@@ -439,9 +411,12 @@ void ieee80211_debugfs_key_remove_mgmt_default(struct ieee80211_sub_if_data *sda
sdata->debugfs.default_mgmt_key = NULL;
}
-void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key,
- struct sta_info *sta)
+void
+ieee80211_debugfs_key_remove_beacon_default(struct ieee80211_sub_if_data *sdata)
{
- debugfs_remove(key->debugfs.stalink);
- key->debugfs.stalink = NULL;
+ if (!sdata)
+ return;
+
+ debugfs_remove(sdata->debugfs.default_beacon_key);
+ sdata->debugfs.default_beacon_key = NULL;
}
diff --git a/net/mac80211/debugfs_key.h b/net/mac80211/debugfs_key.h
index 1cd7b8bff56c..e17a48d5c6cc 100644
--- a/net/mac80211/debugfs_key.h
+++ b/net/mac80211/debugfs_key.h
@@ -6,12 +6,10 @@
void ieee80211_debugfs_key_add(struct ieee80211_key *key);
void ieee80211_debugfs_key_remove(struct ieee80211_key *key);
void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata);
-void ieee80211_debugfs_key_add_mgmt_default(
- struct ieee80211_sub_if_data *sdata);
void ieee80211_debugfs_key_remove_mgmt_default(
struct ieee80211_sub_if_data *sdata);
-void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key,
- struct sta_info *sta);
+void ieee80211_debugfs_key_remove_beacon_default(
+ struct ieee80211_sub_if_data *sdata);
#else
static inline void ieee80211_debugfs_key_add(struct ieee80211_key *key)
{}
@@ -20,14 +18,11 @@ static inline void ieee80211_debugfs_key_remove(struct ieee80211_key *key)
static inline void ieee80211_debugfs_key_update_default(
struct ieee80211_sub_if_data *sdata)
{}
-static inline void ieee80211_debugfs_key_add_mgmt_default(
- struct ieee80211_sub_if_data *sdata)
-{}
static inline void ieee80211_debugfs_key_remove_mgmt_default(
struct ieee80211_sub_if_data *sdata)
{}
-static inline void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key,
- struct sta_info *sta)
+static inline void ieee80211_debugfs_key_remove_beacon_default(
+ struct ieee80211_sub_if_data *sdata)
{}
#endif
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index c813207bb123..30a5a978a678 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -1,10 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * Copyright (C) 2020-2023 Intel Corporation
*/
#include <linux/kernel.h>
@@ -24,110 +22,208 @@
#include "debugfs_netdev.h"
#include "driver-ops.h"
-static ssize_t ieee80211_if_read(
- struct ieee80211_sub_if_data *sdata,
+struct ieee80211_if_read_sdata_data {
+ ssize_t (*format)(const struct ieee80211_sub_if_data *, char *, int);
+ struct ieee80211_sub_if_data *sdata;
+};
+
+static ssize_t ieee80211_if_read_sdata_handler(struct wiphy *wiphy,
+ struct file *file,
+ char *buf,
+ size_t bufsize,
+ void *data)
+{
+ struct ieee80211_if_read_sdata_data *d = data;
+
+ return d->format(d->sdata, buf, bufsize);
+}
+
+static ssize_t ieee80211_if_read_sdata(
+ struct file *file,
char __user *userbuf,
size_t count, loff_t *ppos,
- ssize_t (*format)(const struct ieee80211_sub_if_data *, char *, int))
+ ssize_t (*format)(const struct ieee80211_sub_if_data *sdata, char *, int))
{
+ struct ieee80211_sub_if_data *sdata = file->private_data;
+ struct ieee80211_if_read_sdata_data data = {
+ .format = format,
+ .sdata = sdata,
+ };
char buf[200];
- ssize_t ret = -EINVAL;
- read_lock(&dev_base_lock);
- ret = (*format)(sdata, buf, sizeof(buf));
- read_unlock(&dev_base_lock);
+ return wiphy_locked_debugfs_read(sdata->local->hw.wiphy,
+ file, buf, sizeof(buf),
+ userbuf, count, ppos,
+ ieee80211_if_read_sdata_handler,
+ &data);
+}
+
+struct ieee80211_if_write_sdata_data {
+ ssize_t (*write)(struct ieee80211_sub_if_data *, const char *, int);
+ struct ieee80211_sub_if_data *sdata;
+};
- if (ret >= 0)
- ret = simple_read_from_buffer(userbuf, count, ppos, buf, ret);
+static ssize_t ieee80211_if_write_sdata_handler(struct wiphy *wiphy,
+ struct file *file,
+ char *buf,
+ size_t count,
+ void *data)
+{
+ struct ieee80211_if_write_sdata_data *d = data;
- return ret;
+ return d->write(d->sdata, buf, count);
}
-static ssize_t ieee80211_if_write(
- struct ieee80211_sub_if_data *sdata,
+static ssize_t ieee80211_if_write_sdata(
+ struct file *file,
const char __user *userbuf,
size_t count, loff_t *ppos,
- ssize_t (*write)(struct ieee80211_sub_if_data *, const char *, int))
+ ssize_t (*write)(struct ieee80211_sub_if_data *sdata, const char *, int))
{
+ struct ieee80211_sub_if_data *sdata = file->private_data;
+ struct ieee80211_if_write_sdata_data data = {
+ .write = write,
+ .sdata = sdata,
+ };
char buf[64];
- ssize_t ret;
- if (count >= sizeof(buf))
- return -E2BIG;
+ return wiphy_locked_debugfs_write(sdata->local->hw.wiphy,
+ file, buf, sizeof(buf),
+ userbuf, count,
+ ieee80211_if_write_sdata_handler,
+ &data);
+}
- if (copy_from_user(buf, userbuf, count))
- return -EFAULT;
- buf[count] = '\0';
+struct ieee80211_if_read_link_data {
+ ssize_t (*format)(const struct ieee80211_link_data *, char *, int);
+ struct ieee80211_link_data *link;
+};
- ret = -ENODEV;
- rtnl_lock();
- ret = (*write)(sdata, buf, count);
- rtnl_unlock();
+static ssize_t ieee80211_if_read_link_handler(struct wiphy *wiphy,
+ struct file *file,
+ char *buf,
+ size_t bufsize,
+ void *data)
+{
+ struct ieee80211_if_read_link_data *d = data;
- return ret;
+ return d->format(d->link, buf, bufsize);
}
-#define IEEE80211_IF_FMT(name, field, format_string) \
+static ssize_t ieee80211_if_read_link(
+ struct file *file,
+ char __user *userbuf,
+ size_t count, loff_t *ppos,
+ ssize_t (*format)(const struct ieee80211_link_data *link, char *, int))
+{
+ struct ieee80211_link_data *link = file->private_data;
+ struct ieee80211_if_read_link_data data = {
+ .format = format,
+ .link = link,
+ };
+ char buf[200];
+
+ return wiphy_locked_debugfs_read(link->sdata->local->hw.wiphy,
+ file, buf, sizeof(buf),
+ userbuf, count, ppos,
+ ieee80211_if_read_link_handler,
+ &data);
+}
+
+struct ieee80211_if_write_link_data {
+ ssize_t (*write)(struct ieee80211_link_data *, const char *, int);
+ struct ieee80211_link_data *link;
+};
+
+static ssize_t ieee80211_if_write_link_handler(struct wiphy *wiphy,
+ struct file *file,
+ char *buf,
+ size_t count,
+ void *data)
+{
+ struct ieee80211_if_write_link_data *d = data; /* the type ieee80211_if_write_link() actually passes */
+
+ return d->write(d->link, buf, count); /* call the link parser with its own link pointer and prototype */
+}
+
+static ssize_t ieee80211_if_write_link(
+ struct file *file,
+ const char __user *userbuf,
+ size_t count, loff_t *ppos,
+ ssize_t (*write)(struct ieee80211_link_data *link, const char *, int))
+{
+ struct ieee80211_link_data *link = file->private_data;
+ struct ieee80211_if_write_link_data data = {
+ .write = write,
+ .link = link,
+ };
+ char buf[64];
+
+ return wiphy_locked_debugfs_write(link->sdata->local->hw.wiphy,
+ file, buf, sizeof(buf),
+ userbuf, count,
+ ieee80211_if_write_link_handler,
+ &data);
+}
+
+#define IEEE80211_IF_FMT(name, type, field, format_string) \
static ssize_t ieee80211_if_fmt_##name( \
- const struct ieee80211_sub_if_data *sdata, char *buf, \
+ const type *data, char *buf, \
int buflen) \
{ \
- return scnprintf(buf, buflen, format_string, sdata->field); \
-}
-#define IEEE80211_IF_FMT_DEC(name, field) \
- IEEE80211_IF_FMT(name, field, "%d\n")
-#define IEEE80211_IF_FMT_HEX(name, field) \
- IEEE80211_IF_FMT(name, field, "%#x\n")
-#define IEEE80211_IF_FMT_LHEX(name, field) \
- IEEE80211_IF_FMT(name, field, "%#lx\n")
-#define IEEE80211_IF_FMT_SIZE(name, field) \
- IEEE80211_IF_FMT(name, field, "%zd\n")
-
-#define IEEE80211_IF_FMT_HEXARRAY(name, field) \
+ return scnprintf(buf, buflen, format_string, data->field); \
+}
+#define IEEE80211_IF_FMT_DEC(name, type, field) \
+ IEEE80211_IF_FMT(name, type, field, "%d\n")
+#define IEEE80211_IF_FMT_HEX(name, type, field) \
+ IEEE80211_IF_FMT(name, type, field, "%#x\n")
+#define IEEE80211_IF_FMT_LHEX(name, type, field) \
+ IEEE80211_IF_FMT(name, type, field, "%#lx\n")
+
+#define IEEE80211_IF_FMT_HEXARRAY(name, type, field) \
static ssize_t ieee80211_if_fmt_##name( \
- const struct ieee80211_sub_if_data *sdata, \
+ const type *data, \
char *buf, int buflen) \
{ \
char *p = buf; \
int i; \
- for (i = 0; i < sizeof(sdata->field); i++) { \
+ for (i = 0; i < sizeof(data->field); i++) { \
p += scnprintf(p, buflen + buf - p, "%.2x ", \
- sdata->field[i]); \
+ data->field[i]); \
} \
p += scnprintf(p, buflen + buf - p, "\n"); \
return p - buf; \
}
-#define IEEE80211_IF_FMT_ATOMIC(name, field) \
+#define IEEE80211_IF_FMT_ATOMIC(name, type, field) \
static ssize_t ieee80211_if_fmt_##name( \
- const struct ieee80211_sub_if_data *sdata, \
+ const type *data, \
char *buf, int buflen) \
{ \
- return scnprintf(buf, buflen, "%d\n", atomic_read(&sdata->field));\
+ return scnprintf(buf, buflen, "%d\n", atomic_read(&data->field));\
}
-#define IEEE80211_IF_FMT_MAC(name, field) \
+#define IEEE80211_IF_FMT_MAC(name, type, field) \
static ssize_t ieee80211_if_fmt_##name( \
- const struct ieee80211_sub_if_data *sdata, char *buf, \
+ const type *data, char *buf, \
int buflen) \
{ \
- return scnprintf(buf, buflen, "%pM\n", sdata->field); \
+ return scnprintf(buf, buflen, "%pM\n", data->field); \
}
-#define IEEE80211_IF_FMT_JIFFIES_TO_MS(name, field) \
+#define IEEE80211_IF_FMT_JIFFIES_TO_MS(name, type, field) \
static ssize_t ieee80211_if_fmt_##name( \
- const struct ieee80211_sub_if_data *sdata, \
+ const type *data, \
char *buf, int buflen) \
{ \
return scnprintf(buf, buflen, "%d\n", \
- jiffies_to_msecs(sdata->field)); \
+ jiffies_to_msecs(data->field)); \
}
#define _IEEE80211_IF_FILE_OPS(name, _read, _write) \
-static const struct file_operations name##_ops = { \
+static const struct debugfs_short_fops name##_ops = { \
.read = (_read), \
.write = (_write), \
- .open = simple_open, \
.llseek = generic_file_llseek, \
}
@@ -136,9 +232,9 @@ static ssize_t ieee80211_if_read_##name(struct file *file, \
char __user *userbuf, \
size_t count, loff_t *ppos) \
{ \
- return ieee80211_if_read(file->private_data, \
- userbuf, count, ppos, \
- ieee80211_if_fmt_##name); \
+ return ieee80211_if_read_sdata(file, \
+ userbuf, count, ppos, \
+ ieee80211_if_fmt_##name); \
}
#define _IEEE80211_IF_FILE_W_FN(name) \
@@ -146,8 +242,9 @@ static ssize_t ieee80211_if_write_##name(struct file *file, \
const char __user *userbuf, \
size_t count, loff_t *ppos) \
{ \
- return ieee80211_if_write(file->private_data, userbuf, count, \
- ppos, ieee80211_if_parse_##name); \
+ return ieee80211_if_write_sdata(file, userbuf, \
+ count, ppos, \
+ ieee80211_if_parse_##name); \
}
#define IEEE80211_IF_FILE_R(name) \
@@ -165,9 +262,47 @@ static ssize_t ieee80211_if_write_##name(struct file *file, \
ieee80211_if_write_##name)
#define IEEE80211_IF_FILE(name, field, format) \
- IEEE80211_IF_FMT_##format(name, field) \
+ IEEE80211_IF_FMT_##format(name, struct ieee80211_sub_if_data, field) \
IEEE80211_IF_FILE_R(name)
+#define _IEEE80211_IF_LINK_R_FN(name) \
+static ssize_t ieee80211_if_read_##name(struct file *file, \
+ char __user *userbuf, \
+ size_t count, loff_t *ppos) \
+{ \
+ return ieee80211_if_read_link(file, \
+ userbuf, count, ppos, \
+ ieee80211_if_fmt_##name); \
+}
+
+#define _IEEE80211_IF_LINK_W_FN(name) \
+static ssize_t ieee80211_if_write_##name(struct file *file, \
+ const char __user *userbuf, \
+ size_t count, loff_t *ppos) \
+{ \
+ return ieee80211_if_write_link(file, userbuf, \
+ count, ppos, \
+ ieee80211_if_parse_##name); \
+}
+
+#define IEEE80211_IF_LINK_FILE_R(name) \
+ _IEEE80211_IF_LINK_R_FN(name) \
+ _IEEE80211_IF_FILE_OPS(link_##name, ieee80211_if_read_##name, NULL)
+
+#define IEEE80211_IF_LINK_FILE_W(name) \
+ _IEEE80211_IF_LINK_W_FN(name) \
+ _IEEE80211_IF_FILE_OPS(link_##name, NULL, ieee80211_if_write_##name)
+
+#define IEEE80211_IF_LINK_FILE_RW(name) \
+ _IEEE80211_IF_LINK_R_FN(name) \
+ _IEEE80211_IF_LINK_W_FN(name) \
+ _IEEE80211_IF_FILE_OPS(link_##name, ieee80211_if_read_##name, \
+ ieee80211_if_write_##name)
+
+#define IEEE80211_IF_LINK_FILE(name, field, format) \
+ IEEE80211_IF_FMT_##format(name, struct ieee80211_link_data, field) \
+ IEEE80211_IF_LINK_FILE_R(name)
+
/* common attributes */
IEEE80211_IF_FILE(rc_rateidx_mask_2ghz, rc_rateidx_mask[NL80211_BAND_2GHZ],
HEX);
@@ -212,9 +347,9 @@ IEEE80211_IF_FILE_R(rc_rateidx_vht_mcs_mask_5ghz);
IEEE80211_IF_FILE(flags, flags, HEX);
IEEE80211_IF_FILE(state, state, LHEX);
-IEEE80211_IF_FILE(txpower, vif.bss_conf.txpower, DEC);
-IEEE80211_IF_FILE(ap_power_level, ap_power_level, DEC);
-IEEE80211_IF_FILE(user_power_level, user_power_level, DEC);
+IEEE80211_IF_LINK_FILE(txpower, conf->txpower, DEC);
+IEEE80211_IF_LINK_FILE(ap_power_level, ap_power_level, DEC);
+IEEE80211_IF_LINK_FILE(user_power_level, user_power_level, DEC);
static ssize_t
ieee80211_if_fmt_hw_queues(const struct ieee80211_sub_if_data *sdata,
@@ -237,15 +372,21 @@ ieee80211_if_fmt_hw_queues(const struct ieee80211_sub_if_data *sdata,
IEEE80211_IF_FILE_R(hw_queues);
/* STA attributes */
-IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC);
-IEEE80211_IF_FILE(aid, u.mgd.aid, DEC);
+IEEE80211_IF_FILE(bssid, deflink.u.mgd.bssid, MAC);
+IEEE80211_IF_FILE(aid, vif.cfg.aid, DEC);
IEEE80211_IF_FILE(beacon_timeout, u.mgd.beacon_timeout, JIFFIES_TO_MS);
-static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata,
+static int ieee80211_set_smps(struct ieee80211_link_data *link,
enum ieee80211_smps_mode smps_mode)
{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
struct ieee80211_local *local = sdata->local;
- int err;
+
+ /* The driver indicated that EML is enabled for the interface, thus do
+ * not allow to override the SMPS state.
+ */
+ if (sdata->vif.driver_flags & IEEE80211_VIF_EML_ACTIVE)
+ return -EOPNOTSUPP;
if (!(local->hw.wiphy->features & NL80211_FEATURE_STATIC_SMPS) &&
smps_mode == IEEE80211_SMPS_STATIC)
@@ -257,18 +398,10 @@ static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata,
smps_mode == IEEE80211_SMPS_AUTOMATIC))
return -EINVAL;
- if (sdata->vif.type != NL80211_IFTYPE_STATION &&
- sdata->vif.type != NL80211_IFTYPE_AP)
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
return -EOPNOTSUPP;
- sdata_lock(sdata);
- if (sdata->vif.type == NL80211_IFTYPE_STATION)
- err = __ieee80211_request_smps_mgd(sdata, smps_mode);
- else
- err = __ieee80211_request_smps_ap(sdata, smps_mode);
- sdata_unlock(sdata);
-
- return err;
+ return __ieee80211_request_smps_mgd(link->sdata, link, smps_mode);
}
static const char *smps_modes[IEEE80211_SMPS_NUM_MODES] = {
@@ -278,28 +411,24 @@ static const char *smps_modes[IEEE80211_SMPS_NUM_MODES] = {
[IEEE80211_SMPS_DYNAMIC] = "dynamic",
};
-static ssize_t ieee80211_if_fmt_smps(const struct ieee80211_sub_if_data *sdata,
+static ssize_t ieee80211_if_fmt_smps(const struct ieee80211_link_data *link,
char *buf, int buflen)
{
- if (sdata->vif.type == NL80211_IFTYPE_STATION)
- return snprintf(buf, buflen, "request: %s\nused: %s\n",
- smps_modes[sdata->u.mgd.req_smps],
- smps_modes[sdata->smps_mode]);
- if (sdata->vif.type == NL80211_IFTYPE_AP)
+ if (link->sdata->vif.type == NL80211_IFTYPE_STATION)
return snprintf(buf, buflen, "request: %s\nused: %s\n",
- smps_modes[sdata->u.ap.req_smps],
- smps_modes[sdata->smps_mode]);
+ smps_modes[link->u.mgd.req_smps],
+ smps_modes[link->smps_mode]);
return -EINVAL;
}
-static ssize_t ieee80211_if_parse_smps(struct ieee80211_sub_if_data *sdata,
+static ssize_t ieee80211_if_parse_smps(struct ieee80211_link_data *link,
const char *buf, int buflen)
{
enum ieee80211_smps_mode mode;
for (mode = 0; mode < IEEE80211_SMPS_NUM_MODES; mode++) {
if (strncmp(buf, smps_modes[mode], buflen) == 0) {
- int err = ieee80211_set_smps(sdata, mode);
+ int err = ieee80211_set_smps(link, mode);
if (!err)
return buflen;
return err;
@@ -308,7 +437,7 @@ static ssize_t ieee80211_if_parse_smps(struct ieee80211_sub_if_data *sdata,
return -EINVAL;
}
-IEEE80211_IF_FILE_RW(smps);
+IEEE80211_IF_LINK_FILE_RW(smps);
static ssize_t ieee80211_if_parse_tkip_mic_test(
struct ieee80211_sub_if_data *sdata, const char *buf, int buflen)
@@ -344,16 +473,13 @@ static ssize_t ieee80211_if_parse_tkip_mic_test(
case NL80211_IFTYPE_STATION:
fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
/* BSSID SA DA */
- sdata_lock(sdata);
if (!sdata->u.mgd.associated) {
- sdata_unlock(sdata);
dev_kfree_skb(skb);
return -ENOTCONN;
}
- memcpy(hdr->addr1, sdata->u.mgd.associated->bssid, ETH_ALEN);
+ memcpy(hdr->addr1, sdata->deflink.u.mgd.bssid, ETH_ALEN);
memcpy(hdr->addr2, sdata->vif.addr, ETH_ALEN);
memcpy(hdr->addr3, addr, ETH_ALEN);
- sdata_unlock(sdata);
break;
default:
dev_kfree_skb(skb);
@@ -379,7 +505,7 @@ IEEE80211_IF_FILE_W(tkip_mic_test);
static ssize_t ieee80211_if_parse_beacon_loss(
struct ieee80211_sub_if_data *sdata, const char *buf, int buflen)
{
- if (!ieee80211_sdata_running(sdata) || !sdata->vif.bss_conf.assoc)
+ if (!ieee80211_sdata_running(sdata) || !sdata->vif.cfg.assoc)
return -ENOTCONN;
ieee80211_beacon_loss(&sdata->vif);
@@ -490,11 +616,15 @@ static ssize_t ieee80211_if_fmt_aqm(
const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
{
struct ieee80211_local *local = sdata->local;
- struct txq_info *txqi = to_txq_info(sdata->vif.txq);
+ struct txq_info *txqi;
int len;
+ if (!sdata->vif.txq)
+ return 0;
+
+ txqi = to_txq_info(sdata->vif.txq);
+
spin_lock_bh(&local->fq.lock);
- rcu_read_lock();
len = scnprintf(buf,
buflen,
@@ -511,7 +641,6 @@ static ssize_t ieee80211_if_fmt_aqm(
txqi->tin.tx_bytes,
txqi->tin.tx_packets);
- rcu_read_unlock();
spin_unlock_bh(&local->fq.lock);
return len;
@@ -573,14 +702,37 @@ static ssize_t ieee80211_if_parse_tsf(
}
}
- ieee80211_recalc_dtim(local, sdata);
+ ieee80211_recalc_dtim(sdata, drv_get_tsf(local, sdata));
return buflen;
}
IEEE80211_IF_FILE_RW(tsf);
+static ssize_t ieee80211_if_fmt_valid_links(const struct ieee80211_sub_if_data *sdata,
+ char *buf, int buflen)
+{
+ return scnprintf(buf, buflen, "0x%x\n", sdata->vif.valid_links); /* scnprintf: result is bytes actually stored, as in the IEEE80211_IF_FMT helpers */
+}
+IEEE80211_IF_FILE_R(valid_links);
+
+static ssize_t ieee80211_if_fmt_active_links(const struct ieee80211_sub_if_data *sdata,
+ char *buf, int buflen)
+{
+ return scnprintf(buf, buflen, "0x%x\n", sdata->vif.active_links); /* scnprintf: result is bytes actually stored, as in the IEEE80211_IF_FMT helpers */
+}
+
+static ssize_t ieee80211_if_parse_active_links(struct ieee80211_sub_if_data *sdata,
+ const char *buf, int buflen)
+{
+ u16 active_links;
+
+ if (kstrtou16(buf, 0, &active_links) || !active_links)
+ return -EINVAL;
+
+ return ieee80211_set_active_links(&sdata->vif, active_links) ?: buflen;
+}
+IEEE80211_IF_FILE_RW(active_links);
-/* WDS attributes */
-IEEE80211_IF_FILE(peer, u.wds.remote_addr, MAC);
+IEEE80211_IF_LINK_FILE(addr, conf->addr, MAC);
#ifdef CONFIG_MAC80211_MESH
IEEE80211_IF_FILE(estab_plinks, u.mesh.estab_plinks, ATOMIC);
@@ -590,8 +742,6 @@ IEEE80211_IF_FILE(fwded_mcast, u.mesh.mshstats.fwded_mcast, DEC);
IEEE80211_IF_FILE(fwded_unicast, u.mesh.mshstats.fwded_unicast, DEC);
IEEE80211_IF_FILE(fwded_frames, u.mesh.mshstats.fwded_frames, DEC);
IEEE80211_IF_FILE(dropped_frames_ttl, u.mesh.mshstats.dropped_frames_ttl, DEC);
-IEEE80211_IF_FILE(dropped_frames_congestion,
- u.mesh.mshstats.dropped_frames_congestion, DEC);
IEEE80211_IF_FILE(dropped_frames_no_route,
u.mesh.mshstats.dropped_frames_no_route, DEC);
@@ -641,11 +791,29 @@ IEEE80211_IF_FILE(dot11MeshHWMPconfirmationInterval,
IEEE80211_IF_FILE(power_mode, u.mesh.mshcfg.power_mode, DEC);
IEEE80211_IF_FILE(dot11MeshAwakeWindowDuration,
u.mesh.mshcfg.dot11MeshAwakeWindowDuration, DEC);
+IEEE80211_IF_FILE(dot11MeshConnectedToMeshGate,
+ u.mesh.mshcfg.dot11MeshConnectedToMeshGate, DEC);
+IEEE80211_IF_FILE(dot11MeshNolearn, u.mesh.mshcfg.dot11MeshNolearn, DEC);
+IEEE80211_IF_FILE(dot11MeshConnectedToAuthServer,
+ u.mesh.mshcfg.dot11MeshConnectedToAuthServer, DEC);
#endif
#define DEBUGFS_ADD_MODE(name, mode) \
debugfs_create_file(#name, mode, sdata->vif.debugfs_dir, \
- sdata, &name##_ops);
+ sdata, &name##_ops)
+
+#define DEBUGFS_ADD_X(_bits, _name, _mode) \
+ debugfs_create_x##_bits(#_name, _mode, sdata->vif.debugfs_dir, \
+ &sdata->vif._name)
+
+#define DEBUGFS_ADD_X8(_name, _mode) \
+ DEBUGFS_ADD_X(8, _name, _mode)
+
+#define DEBUGFS_ADD_X16(_name, _mode) \
+ DEBUGFS_ADD_X(16, _name, _mode)
+
+#define DEBUGFS_ADD_X32(_name, _mode) \
+ DEBUGFS_ADD_X(32, _name, _mode)
#define DEBUGFS_ADD(name) DEBUGFS_ADD_MODE(name, 0400)
@@ -659,7 +827,8 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata)
DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz);
DEBUGFS_ADD(hw_queues);
- if (sdata->local->ops->wake_tx_queue)
+ if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE &&
+ sdata->vif.type != NL80211_IFTYPE_NAN)
DEBUGFS_ADD(aqm);
}
@@ -668,18 +837,19 @@ static void add_sta_files(struct ieee80211_sub_if_data *sdata)
DEBUGFS_ADD(bssid);
DEBUGFS_ADD(aid);
DEBUGFS_ADD(beacon_timeout);
- DEBUGFS_ADD_MODE(smps, 0600);
DEBUGFS_ADD_MODE(tkip_mic_test, 0200);
DEBUGFS_ADD_MODE(beacon_loss, 0200);
DEBUGFS_ADD_MODE(uapsd_queues, 0600);
DEBUGFS_ADD_MODE(uapsd_max_sp_len, 0600);
DEBUGFS_ADD_MODE(tdls_wider_bw, 0600);
+ DEBUGFS_ADD_MODE(valid_links, 0400);
+ DEBUGFS_ADD_MODE(active_links, 0600);
+ DEBUGFS_ADD_X16(dormant_links, 0400);
}
static void add_ap_files(struct ieee80211_sub_if_data *sdata)
{
DEBUGFS_ADD(num_mcast_sta);
- DEBUGFS_ADD_MODE(smps, 0600);
DEBUGFS_ADD(num_sta_ps);
DEBUGFS_ADD(dtim_count);
DEBUGFS_ADD(num_buffered_multicast);
@@ -699,11 +869,6 @@ static void add_ibss_files(struct ieee80211_sub_if_data *sdata)
DEBUGFS_ADD_MODE(tsf, 0600);
}
-static void add_wds_files(struct ieee80211_sub_if_data *sdata)
-{
- DEBUGFS_ADD(peer);
-}
-
#ifdef CONFIG_MAC80211_MESH
static void add_mesh_files(struct ieee80211_sub_if_data *sdata)
@@ -717,14 +882,13 @@ static void add_mesh_stats(struct ieee80211_sub_if_data *sdata)
struct dentry *dir = debugfs_create_dir("mesh_stats",
sdata->vif.debugfs_dir);
#define MESHSTATS_ADD(name)\
- debugfs_create_file(#name, 0400, dir, sdata, &name##_ops);
+ debugfs_create_file(#name, 0400, dir, sdata, &name##_ops)
MESHSTATS_ADD(fwded_mcast);
MESHSTATS_ADD(fwded_unicast);
MESHSTATS_ADD(fwded_frames);
MESHSTATS_ADD(dropped_frames_ttl);
MESHSTATS_ADD(dropped_frames_no_route);
- MESHSTATS_ADD(dropped_frames_congestion);
#undef MESHSTATS_ADD
}
@@ -734,7 +898,7 @@ static void add_mesh_config(struct ieee80211_sub_if_data *sdata)
sdata->vif.debugfs_dir);
#define MESHPARAMS_ADD(name) \
- debugfs_create_file(#name, 0600, dir, sdata, &name##_ops);
+ debugfs_create_file(#name, 0600, dir, sdata, &name##_ops)
MESHPARAMS_ADD(dot11MeshMaxRetries);
MESHPARAMS_ADD(dot11MeshRetryTimeout);
@@ -762,6 +926,9 @@ static void add_mesh_config(struct ieee80211_sub_if_data *sdata)
MESHPARAMS_ADD(dot11MeshHWMPconfirmationInterval);
MESHPARAMS_ADD(power_mode);
MESHPARAMS_ADD(dot11MeshAwakeWindowDuration);
+ MESHPARAMS_ADD(dot11MeshConnectedToMeshGate);
+ MESHPARAMS_ADD(dot11MeshNolearn);
+ MESHPARAMS_ADD(dot11MeshConnectedToAuthServer);
#undef MESHPARAMS_ADD
}
#endif
@@ -773,9 +940,6 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
DEBUGFS_ADD(flags);
DEBUGFS_ADD(state);
- DEBUGFS_ADD(txpower);
- DEBUGFS_ADD(user_power_level);
- DEBUGFS_ADD(ap_power_level);
if (sdata->vif.type != NL80211_IFTYPE_MONITOR)
add_common_files(sdata);
@@ -800,25 +964,51 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
case NL80211_IFTYPE_AP_VLAN:
add_vlan_files(sdata);
break;
- case NL80211_IFTYPE_WDS:
- add_wds_files(sdata);
+ default:
+ break;
+ }
+}
+
+#undef DEBUGFS_ADD_MODE
+#undef DEBUGFS_ADD
+
+#define DEBUGFS_ADD_MODE(dentry, name, mode) \
+ debugfs_create_file(#name, mode, dentry, \
+ link, &link_##name##_ops)
+
+#define DEBUGFS_ADD(dentry, name) DEBUGFS_ADD_MODE(dentry, name, 0400)
+
+static void add_link_files(struct ieee80211_link_data *link,
+ struct dentry *dentry)
+{
+ DEBUGFS_ADD(dentry, txpower);
+ DEBUGFS_ADD(dentry, user_power_level);
+ DEBUGFS_ADD(dentry, ap_power_level);
+
+ switch (link->sdata->vif.type) {
+ case NL80211_IFTYPE_STATION:
+ DEBUGFS_ADD_MODE(dentry, smps, 0600);
break;
default:
break;
}
}
-void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata)
+static void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata,
+ bool mld_vif)
{
char buf[10+IFNAMSIZ];
sprintf(buf, "netdev:%s", sdata->name);
sdata->vif.debugfs_dir = debugfs_create_dir(buf,
sdata->local->hw.wiphy->debugfsdir);
- if (sdata->vif.debugfs_dir)
- sdata->debugfs.subdir_stations = debugfs_create_dir("stations",
- sdata->vif.debugfs_dir);
+ /* the default link (deflink) shares the vif's debugfs directory */
+ sdata->deflink.debugfs_dir = sdata->vif.debugfs_dir;
+ sdata->debugfs.subdir_stations = debugfs_create_dir("stations",
+ sdata->vif.debugfs_dir);
add_files(sdata);
+ if (!mld_vif)
+ add_link_files(&sdata->deflink, sdata->vif.debugfs_dir);
}
void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata)
@@ -833,17 +1023,82 @@ void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata)
void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata)
{
- struct dentry *dir;
- char buf[10 + IFNAMSIZ];
+ debugfs_change_name(sdata->vif.debugfs_dir, "netdev:%s", sdata->name);
+}
- dir = sdata->vif.debugfs_dir;
+void ieee80211_debugfs_recreate_netdev(struct ieee80211_sub_if_data *sdata,
+ bool mld_vif)
+{
+ ieee80211_debugfs_remove_netdev(sdata);
+ ieee80211_debugfs_add_netdev(sdata, mld_vif);
- if (!dir)
+ if (sdata->flags & IEEE80211_SDATA_IN_DRIVER) {
+ drv_vif_add_debugfs(sdata->local, sdata);
+ if (!mld_vif)
+ ieee80211_link_debugfs_drv_add(&sdata->deflink);
+ }
+}
+
+void ieee80211_link_debugfs_add(struct ieee80211_link_data *link)
+{
+ char link_dir_name[10];
+
+ if (WARN_ON(!link->sdata->vif.debugfs_dir || link->debugfs_dir))
return;
- sprintf(buf, "netdev:%s", sdata->name);
- if (!debugfs_rename(dir->d_parent, dir, dir->d_parent, buf))
- sdata_err(sdata,
- "debugfs: failed to rename debugfs dir to %s\n",
- buf);
+ /* For now, this should not be called for non-MLO capable drivers */
+ if (WARN_ON(!(link->sdata->local->hw.wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO)))
+ return;
+
+ snprintf(link_dir_name, sizeof(link_dir_name),
+ "link-%d", link->link_id);
+
+ link->debugfs_dir =
+ debugfs_create_dir(link_dir_name,
+ link->sdata->vif.debugfs_dir);
+
+ DEBUGFS_ADD(link->debugfs_dir, addr);
+ add_link_files(link, link->debugfs_dir);
+}
+
+void ieee80211_link_debugfs_remove(struct ieee80211_link_data *link)
+{
+ if (!link->sdata->vif.debugfs_dir || !link->debugfs_dir) {
+ link->debugfs_dir = NULL;
+ return;
+ }
+
+ if (link->debugfs_dir == link->sdata->vif.debugfs_dir) {
+ WARN_ON(link != &link->sdata->deflink);
+ link->debugfs_dir = NULL;
+ return;
+ }
+
+ debugfs_remove_recursive(link->debugfs_dir);
+ link->debugfs_dir = NULL;
+}
+
+void ieee80211_link_debugfs_drv_add(struct ieee80211_link_data *link)
+{
+ if (link->sdata->vif.type == NL80211_IFTYPE_MONITOR ||
+ WARN_ON(!link->debugfs_dir))
+ return;
+
+ drv_link_add_debugfs(link->sdata->local, link->sdata,
+ link->conf, link->debugfs_dir);
+}
+
+void ieee80211_link_debugfs_drv_remove(struct ieee80211_link_data *link)
+{
+ if (!link || !link->debugfs_dir)
+ return;
+
+ if (WARN_ON(link->debugfs_dir == link->sdata->vif.debugfs_dir))
+ return;
+
+ /* Recreate the directory excluding the driver data */
+ debugfs_remove_recursive(link->debugfs_dir);
+ link->debugfs_dir = NULL;
+
+ ieee80211_link_debugfs_add(link);
}
diff --git a/net/mac80211/debugfs_netdev.h b/net/mac80211/debugfs_netdev.h
index a7e9d8d518f9..a02ec0a413f6 100644
--- a/net/mac80211/debugfs_netdev.h
+++ b/net/mac80211/debugfs_netdev.h
@@ -1,4 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Portions:
+ * Copyright (C) 2023 Intel Corporation
+ */
/* routines exported for debugfs handling */
#ifndef __IEEE80211_DEBUGFS_NETDEV_H
@@ -7,19 +11,35 @@
#include "ieee80211_i.h"
#ifdef CONFIG_MAC80211_DEBUGFS
-void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata);
void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata);
void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata);
+void ieee80211_debugfs_recreate_netdev(struct ieee80211_sub_if_data *sdata,
+ bool mld_vif);
+
+void ieee80211_link_debugfs_add(struct ieee80211_link_data *link);
+void ieee80211_link_debugfs_remove(struct ieee80211_link_data *link);
+
+void ieee80211_link_debugfs_drv_add(struct ieee80211_link_data *link);
+void ieee80211_link_debugfs_drv_remove(struct ieee80211_link_data *link);
#else
-static inline void ieee80211_debugfs_add_netdev(
- struct ieee80211_sub_if_data *sdata)
-{}
static inline void ieee80211_debugfs_remove_netdev(
struct ieee80211_sub_if_data *sdata)
{}
static inline void ieee80211_debugfs_rename_netdev(
struct ieee80211_sub_if_data *sdata)
{}
+static inline void ieee80211_debugfs_recreate_netdev(
+ struct ieee80211_sub_if_data *sdata, bool mld_vif)
+{}
+static inline void ieee80211_link_debugfs_add(struct ieee80211_link_data *link)
+{}
+static inline void ieee80211_link_debugfs_remove(struct ieee80211_link_data *link)
+{}
+
+static inline void ieee80211_link_debugfs_drv_add(struct ieee80211_link_data *link)
+{}
+static inline void ieee80211_link_debugfs_drv_remove(struct ieee80211_link_data *link)
+{}
#endif
#endif /* __IEEE80211_DEBUGFS_NETDEV_H */
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 4105081dc1df..ef75255d47d5 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -1,13 +1,11 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2003-2005 Devicescape Software, Inc.
* Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright(c) 2016 Intel Deutschland GmbH
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * Copyright (C) 2018 - 2023 Intel Corporation
*/
#include <linux/debugfs.h>
@@ -18,7 +16,7 @@
#include "sta_info.h"
#include "driver-ops.h"
-/* sta attributtes */
+/* sta attributes */
#define STA_READ(name, field, format_string) \
static ssize_t sta_ ##name## _read(struct file *file, \
@@ -32,17 +30,15 @@ static ssize_t sta_ ##name## _read(struct file *file, \
#define STA_READ_D(name, field) STA_READ(name, field, "%d\n")
#define STA_OPS(name) \
-static const struct file_operations sta_ ##name## _ops = { \
+static const struct debugfs_short_fops sta_ ##name## _ops = { \
.read = sta_##name##_read, \
- .open = simple_open, \
.llseek = generic_file_llseek, \
}
#define STA_OPS_RW(name) \
-static const struct file_operations sta_ ##name## _ops = { \
+static const struct debugfs_short_fops sta_ ##name## _ops = { \
.read = sta_##name##_read, \
.write = sta_##name##_write, \
- .open = simple_open, \
.llseek = generic_file_llseek, \
}
@@ -80,6 +76,8 @@ static const char * const sta_flag_names[] = {
FLAG(MPSP_OWNER),
FLAG(MPSP_RECIPIENT),
FLAG(PS_DELIVER),
+ FLAG(USES_ENCRYPTION),
+ FLAG(DECAP_OFFLOAD),
#undef FLAG
};
@@ -140,7 +138,7 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
{
struct sta_info *sta = file->private_data;
struct ieee80211_local *local = sta->local;
- size_t bufsz = AQM_TXQ_ENTRY_LEN*(IEEE80211_NUM_TIDS+1);
+ size_t bufsz = AQM_TXQ_ENTRY_LEN * (IEEE80211_NUM_TIDS + 2);
char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf;
struct txq_info *txqi;
ssize_t rv;
@@ -150,22 +148,17 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
return -ENOMEM;
spin_lock_bh(&local->fq.lock);
- rcu_read_lock();
p += scnprintf(p,
- bufsz+buf-p,
- "target %uus interval %uus ecn %s\n",
- codel_time_to_us(sta->cparams.target),
- codel_time_to_us(sta->cparams.interval),
- sta->cparams.ecn ? "yes" : "no");
- p += scnprintf(p,
- bufsz+buf-p,
+ bufsz + buf - p,
"tid ac backlog-bytes backlog-packets new-flows drops marks overlimit collisions tx-bytes tx-packets flags\n");
- for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
+ for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) {
+ if (!sta->sta.txq[i])
+ continue;
txqi = to_txq_info(sta->sta.txq[i]);
- p += scnprintf(p, bufsz+buf-p,
- "%d %d %u %u %u %u %u %u %u %u %u 0x%lx(%s%s%s)\n",
+ p += scnprintf(p, bufsz + buf - p,
+ "%d %d %u %u %u %u %u %u %u %u %u 0x%lx(%s%s%s%s)\n",
txqi->txq.tid,
txqi->txq.ac,
txqi->tin.backlog_bytes,
@@ -178,12 +171,12 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
txqi->tin.tx_bytes,
txqi->tin.tx_packets,
txqi->flags,
- txqi->flags & (1<<IEEE80211_TXQ_STOP) ? "STOP" : "RUN",
- txqi->flags & (1<<IEEE80211_TXQ_AMPDU) ? " AMPDU" : "",
- txqi->flags & (1<<IEEE80211_TXQ_NO_AMSDU) ? " NO-AMSDU" : "");
+ test_bit(IEEE80211_TXQ_STOP, &txqi->flags) ? "STOP" : "RUN",
+ test_bit(IEEE80211_TXQ_AMPDU, &txqi->flags) ? " AMPDU" : "",
+ test_bit(IEEE80211_TXQ_NO_AMSDU, &txqi->flags) ? " NO-AMSDU" : "",
+ test_bit(IEEE80211_TXQ_DIRTY, &txqi->flags) ? " DIRTY" : "");
}
- rcu_read_unlock();
spin_unlock_bh(&local->fq.lock);
rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
@@ -192,68 +185,193 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
}
STA_OPS(aqm);
-static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf,
- size_t count, loff_t *ppos)
+static ssize_t sta_airtime_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct sta_info *sta = file->private_data;
+ struct ieee80211_local *local = sta->sdata->local;
+ size_t bufsz = 400;
+ char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf;
+ u64 rx_airtime = 0, tx_airtime = 0;
+ s32 deficit[IEEE80211_NUM_ACS];
+ ssize_t rv;
+ int ac;
+
+ if (!buf)
+ return -ENOMEM;
+
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
+ spin_lock_bh(&local->active_txq_lock[ac]);
+ rx_airtime += sta->airtime[ac].rx_airtime;
+ tx_airtime += sta->airtime[ac].tx_airtime;
+ deficit[ac] = sta->airtime[ac].deficit;
+ spin_unlock_bh(&local->active_txq_lock[ac]);
+ }
+
+ p += scnprintf(p, bufsz + buf - p,
+ "RX: %llu us\nTX: %llu us\nWeight: %u\n"
+ "Deficit: VO: %d us VI: %d us BE: %d us BK: %d us\n",
+ rx_airtime, tx_airtime, sta->airtime_weight,
+ deficit[0], deficit[1], deficit[2], deficit[3]);
+
+ rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+ kfree(buf);
+ return rv;
+}
+
+static ssize_t sta_airtime_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
{
- char buf[71 + IEEE80211_NUM_TIDS * 40], *p = buf;
- int i;
struct sta_info *sta = file->private_data;
+ struct ieee80211_local *local = sta->sdata->local;
+ int ac;
+
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
+ spin_lock_bh(&local->active_txq_lock[ac]);
+ sta->airtime[ac].rx_airtime = 0;
+ sta->airtime[ac].tx_airtime = 0;
+ sta->airtime[ac].deficit = sta->airtime_weight;
+ spin_unlock_bh(&local->active_txq_lock[ac]);
+ }
+
+ return count;
+}
+STA_OPS_RW(airtime);
+
+static ssize_t sta_aql_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct sta_info *sta = file->private_data;
+ struct ieee80211_local *local = sta->sdata->local;
+ size_t bufsz = 400;
+ char *buf = kzalloc(bufsz, GFP_KERNEL), *p = buf;
+ u32 q_depth[IEEE80211_NUM_ACS];
+ u32 q_limit_l[IEEE80211_NUM_ACS], q_limit_h[IEEE80211_NUM_ACS];
+ ssize_t rv;
+ int ac;
+
+ if (!buf)
+ return -ENOMEM;
+
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
+ spin_lock_bh(&local->active_txq_lock[ac]);
+ q_limit_l[ac] = sta->airtime[ac].aql_limit_low;
+ q_limit_h[ac] = sta->airtime[ac].aql_limit_high;
+ spin_unlock_bh(&local->active_txq_lock[ac]);
+ q_depth[ac] = atomic_read(&sta->airtime[ac].aql_tx_pending);
+ }
+
+ p += scnprintf(p, bufsz + buf - p,
+ "Q depth: VO: %u us VI: %u us BE: %u us BK: %u us\n"
+ "Q limit[low/high]: VO: %u/%u VI: %u/%u BE: %u/%u BK: %u/%u\n",
+ q_depth[0], q_depth[1], q_depth[2], q_depth[3],
+ q_limit_l[0], q_limit_h[0], q_limit_l[1], q_limit_h[1],
+ q_limit_l[2], q_limit_h[2], q_limit_l[3], q_limit_h[3]);
+
+ rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+ kfree(buf);
+ return rv;
+}
+
+static ssize_t sta_aql_write(struct file *file, const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct sta_info *sta = file->private_data;
+ u32 ac, q_limit_l, q_limit_h;
+ char _buf[100] = {}, *buf = _buf;
+
+ if (count > sizeof(_buf))
+ return -EINVAL;
+
+ if (copy_from_user(buf, userbuf, count))
+ return -EFAULT;
+
+ buf[sizeof(_buf) - 1] = '\0';
+ if (sscanf(buf, "limit %u %u %u", &ac, &q_limit_l, &q_limit_h)
+ != 3)
+ return -EINVAL;
+
+ if (ac >= IEEE80211_NUM_ACS)
+ return -EINVAL;
+
+ sta->airtime[ac].aql_limit_low = q_limit_l;
+ sta->airtime[ac].aql_limit_high = q_limit_h;
+
+ return count;
+}
+STA_OPS_RW(aql);
+
+
+static ssize_t sta_agg_status_do_read(struct wiphy *wiphy, struct file *file,
+ char *buf, size_t bufsz, void *data)
+{
+ struct sta_info *sta = data;
+ char *p = buf;
+ int i;
struct tid_ampdu_rx *tid_rx;
struct tid_ampdu_tx *tid_tx;
- rcu_read_lock();
-
- p += scnprintf(p, sizeof(buf) + buf - p, "next dialog_token: %#02x\n",
+ p += scnprintf(p, bufsz + buf - p, "next dialog_token: %#02x\n",
sta->ampdu_mlme.dialog_token_allocator + 1);
- p += scnprintf(p, sizeof(buf) + buf - p,
+ p += scnprintf(p, bufsz + buf - p,
"TID\t\tRX\tDTKN\tSSN\t\tTX\tDTKN\tpending\n");
for (i = 0; i < IEEE80211_NUM_TIDS; i++) {
bool tid_rx_valid;
- tid_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[i]);
- tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[i]);
+ tid_rx = wiphy_dereference(wiphy, sta->ampdu_mlme.tid_rx[i]);
+ tid_tx = wiphy_dereference(wiphy, sta->ampdu_mlme.tid_tx[i]);
tid_rx_valid = test_bit(i, sta->ampdu_mlme.agg_session_valid);
- p += scnprintf(p, sizeof(buf) + buf - p, "%02d", i);
- p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x",
+ p += scnprintf(p, bufsz + buf - p, "%02d", i);
+ p += scnprintf(p, bufsz + buf - p, "\t\t%x",
tid_rx_valid);
- p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.2x",
+ p += scnprintf(p, bufsz + buf - p, "\t%#.2x",
tid_rx_valid ?
sta->ampdu_mlme.tid_rx_token[i] : 0);
- p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.3x",
+ p += scnprintf(p, bufsz + buf - p, "\t%#.3x",
tid_rx ? tid_rx->ssn : 0);
- p += scnprintf(p, sizeof(buf) + buf - p, "\t\t%x", !!tid_tx);
- p += scnprintf(p, sizeof(buf) + buf - p, "\t%#.2x",
+ p += scnprintf(p, bufsz + buf - p, "\t\t%x", !!tid_tx);
+ p += scnprintf(p, bufsz + buf - p, "\t%#.2x",
tid_tx ? tid_tx->dialog_token : 0);
- p += scnprintf(p, sizeof(buf) + buf - p, "\t%03d",
+ p += scnprintf(p, bufsz + buf - p, "\t%03d",
tid_tx ? skb_queue_len(&tid_tx->pending) : 0);
- p += scnprintf(p, sizeof(buf) + buf - p, "\n");
+ p += scnprintf(p, bufsz + buf - p, "\n");
}
- rcu_read_unlock();
- return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+ return p - buf;
}
-static ssize_t sta_agg_status_write(struct file *file, const char __user *userbuf,
- size_t count, loff_t *ppos)
+static ssize_t sta_agg_status_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
{
- char _buf[25] = {}, *buf = _buf;
struct sta_info *sta = file->private_data;
+ struct wiphy *wiphy = sta->local->hw.wiphy;
+ size_t bufsz = 71 + IEEE80211_NUM_TIDS * 40;
+ char *buf = kmalloc(bufsz, GFP_KERNEL);
+ ssize_t ret;
+
+ if (!buf)
+ return -ENOMEM;
+
+ ret = wiphy_locked_debugfs_read(wiphy, file, buf, bufsz,
+ userbuf, count, ppos,
+ sta_agg_status_do_read, sta);
+ kfree(buf);
+
+ return ret;
+}
+
+static ssize_t sta_agg_status_do_write(struct wiphy *wiphy, struct file *file,
+ char *buf, size_t count, void *data)
+{
+ struct sta_info *sta = data;
bool start, tx;
unsigned long tid;
- char *pos;
+ char *pos = buf;
int ret, timeout = 5000;
- if (count > sizeof(_buf))
- return -EINVAL;
-
- if (copy_from_user(buf, userbuf, count))
- return -EFAULT;
-
- buf[sizeof(_buf) - 1] = '\0';
- pos = buf;
buf = strsep(&pos, " ");
if (!buf)
return -EINVAL;
@@ -305,25 +423,66 @@ static ssize_t sta_agg_status_write(struct file *file, const char __user *userbu
return ret ?: count;
}
+
+static ssize_t sta_agg_status_write(struct file *file,
+ const char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct sta_info *sta = file->private_data;
+ struct wiphy *wiphy = sta->local->hw.wiphy;
+ char _buf[26];
+
+ return wiphy_locked_debugfs_write(wiphy, file, _buf, sizeof(_buf),
+ userbuf, count,
+ sta_agg_status_do_write, sta);
+}
STA_OPS_RW(agg_status);
-static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf,
- size_t count, loff_t *ppos)
+/* link sta attributes */
+#define LINK_STA_OPS(name) \
+static const struct debugfs_short_fops link_sta_ ##name## _ops = { \
+ .read = link_sta_##name##_read, \
+ .llseek = generic_file_llseek, \
+}
+
+static ssize_t link_sta_addr_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
+{
+ struct link_sta_info *link_sta = file->private_data;
+ u8 mac[MAC_ADDR_STR_LEN + 2];
+
+ snprintf(mac, sizeof(mac), "%pM\n", link_sta->pub->addr);
+
+ return simple_read_from_buffer(userbuf, count, ppos, mac,
+ MAC_ADDR_STR_LEN + 1);
+}
+
+LINK_STA_OPS(addr);
+
+static ssize_t link_sta_ht_capa_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
{
#define PRINT_HT_CAP(_cond, _str) \
do { \
if (_cond) \
- p += scnprintf(p, sizeof(buf)+buf-p, "\t" _str "\n"); \
+ p += scnprintf(p, bufsz + buf - p, "\t" _str "\n"); \
} while (0)
- char buf[512], *p = buf;
+ char *buf, *p;
int i;
- struct sta_info *sta = file->private_data;
- struct ieee80211_sta_ht_cap *htc = &sta->sta.ht_cap;
+ ssize_t bufsz = 512;
+ struct link_sta_info *link_sta = file->private_data;
+ struct ieee80211_sta_ht_cap *htc = &link_sta->pub->ht_cap;
+ ssize_t ret;
- p += scnprintf(p, sizeof(buf) + buf - p, "ht %ssupported\n",
+ buf = kzalloc(bufsz, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ p = buf;
+
+ p += scnprintf(p, bufsz + buf - p, "ht %ssupported\n",
htc->ht_supported ? "" : "not ");
if (htc->ht_supported) {
- p += scnprintf(p, sizeof(buf)+buf-p, "cap: %#.4x\n", htc->cap);
+ p += scnprintf(p, bufsz + buf - p, "cap: %#.4x\n", htc->cap);
PRINT_HT_CAP((htc->cap & BIT(0)), "RX LDPC");
PRINT_HT_CAP((htc->cap & BIT(1)), "HT20/HT40");
@@ -365,81 +524,90 @@ static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf,
PRINT_HT_CAP((htc->cap & BIT(15)), "L-SIG TXOP protection");
- p += scnprintf(p, sizeof(buf)+buf-p, "ampdu factor/density: %d/%d\n",
+ p += scnprintf(p, bufsz + buf - p, "ampdu factor/density: %d/%d\n",
htc->ampdu_factor, htc->ampdu_density);
- p += scnprintf(p, sizeof(buf)+buf-p, "MCS mask:");
+ p += scnprintf(p, bufsz + buf - p, "MCS mask:");
for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++)
- p += scnprintf(p, sizeof(buf)+buf-p, " %.2x",
+ p += scnprintf(p, bufsz + buf - p, " %.2x",
htc->mcs.rx_mask[i]);
- p += scnprintf(p, sizeof(buf)+buf-p, "\n");
+ p += scnprintf(p, bufsz + buf - p, "\n");
/* If not set this is meaningless */
if (le16_to_cpu(htc->mcs.rx_highest)) {
- p += scnprintf(p, sizeof(buf)+buf-p,
+ p += scnprintf(p, bufsz + buf - p,
"MCS rx highest: %d Mbps\n",
le16_to_cpu(htc->mcs.rx_highest));
}
- p += scnprintf(p, sizeof(buf)+buf-p, "MCS tx params: %x\n",
+ p += scnprintf(p, bufsz + buf - p, "MCS tx params: %x\n",
htc->mcs.tx_params);
}
- return simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+ ret = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
+ kfree(buf);
+ return ret;
}
-STA_OPS(ht_capa);
+LINK_STA_OPS(ht_capa);
-static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf,
- size_t count, loff_t *ppos)
+static ssize_t link_sta_vht_capa_read(struct file *file, char __user *userbuf,
+ size_t count, loff_t *ppos)
{
- char buf[512], *p = buf;
- struct sta_info *sta = file->private_data;
- struct ieee80211_sta_vht_cap *vhtc = &sta->sta.vht_cap;
+ char *buf, *p;
+ struct link_sta_info *link_sta = file->private_data;
+ struct ieee80211_sta_vht_cap *vhtc = &link_sta->pub->vht_cap;
+ ssize_t ret;
+ ssize_t bufsz = 512;
- p += scnprintf(p, sizeof(buf) + buf - p, "VHT %ssupported\n",
+ buf = kzalloc(bufsz, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ p = buf;
+
+ p += scnprintf(p, bufsz + buf - p, "VHT %ssupported\n",
vhtc->vht_supported ? "" : "not ");
if (vhtc->vht_supported) {
- p += scnprintf(p, sizeof(buf) + buf - p, "cap: %#.8x\n",
+ p += scnprintf(p, bufsz + buf - p, "cap: %#.8x\n",
vhtc->cap);
#define PFLAG(a, b) \
do { \
if (vhtc->cap & IEEE80211_VHT_CAP_ ## a) \
- p += scnprintf(p, sizeof(buf) + buf - p, \
+ p += scnprintf(p, bufsz + buf - p, \
"\t\t%s\n", b); \
} while (0)
switch (vhtc->cap & 0x3) {
case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895:
- p += scnprintf(p, sizeof(buf) + buf - p,
+ p += scnprintf(p, bufsz + buf - p,
"\t\tMAX-MPDU-3895\n");
break;
case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991:
- p += scnprintf(p, sizeof(buf) + buf - p,
+ p += scnprintf(p, bufsz + buf - p,
"\t\tMAX-MPDU-7991\n");
break;
case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454:
- p += scnprintf(p, sizeof(buf) + buf - p,
+ p += scnprintf(p, bufsz + buf - p,
"\t\tMAX-MPDU-11454\n");
break;
default:
- p += scnprintf(p, sizeof(buf) + buf - p,
+ p += scnprintf(p, bufsz + buf - p,
"\t\tMAX-MPDU-UNKNOWN\n");
}
switch (vhtc->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) {
case 0:
- p += scnprintf(p, sizeof(buf) + buf - p,
+ p += scnprintf(p, bufsz + buf - p,
"\t\t80Mhz\n");
break;
case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ:
- p += scnprintf(p, sizeof(buf) + buf - p,
+ p += scnprintf(p, bufsz + buf - p,
"\t\t160Mhz\n");
break;
case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ:
- p += scnprintf(p, sizeof(buf) + buf - p,
+ p += scnprintf(p, bufsz + buf - p,
"\t\t80+80Mhz\n");
break;
default:
- p += scnprintf(p, sizeof(buf) + buf - p,
+ p += scnprintf(p, bufsz + buf - p,
"\t\tUNKNOWN-MHZ: 0x%x\n",
(vhtc->cap >> 2) & 0x3);
}
@@ -447,15 +615,15 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf,
PFLAG(SHORT_GI_80, "SHORT-GI-80");
PFLAG(SHORT_GI_160, "SHORT-GI-160");
PFLAG(TXSTBC, "TXSTBC");
- p += scnprintf(p, sizeof(buf) + buf - p,
+ p += scnprintf(p, bufsz + buf - p,
"\t\tRXSTBC_%d\n", (vhtc->cap >> 8) & 0x7);
PFLAG(SU_BEAMFORMER_CAPABLE, "SU-BEAMFORMER-CAPABLE");
PFLAG(SU_BEAMFORMEE_CAPABLE, "SU-BEAMFORMEE-CAPABLE");
- p += scnprintf(p, sizeof(buf) + buf - p,
+ p += scnprintf(p, bufsz + buf - p,
"\t\tBEAMFORMEE-STS: 0x%x\n",
(vhtc->cap & IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK) >>
IEEE80211_VHT_CAP_BEAMFORMEE_STS_SHIFT);
- p += scnprintf(p, sizeof(buf) + buf - p,
+ p += scnprintf(p, bufsz + buf - p,
"\t\tSOUNDING-DIMENSIONS: 0x%x\n",
(vhtc->cap & IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK)
>> IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_SHIFT);
@@ -463,55 +631,609 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf,
PFLAG(MU_BEAMFORMEE_CAPABLE, "MU-BEAMFORMEE-CAPABLE");
PFLAG(VHT_TXOP_PS, "TXOP-PS");
PFLAG(HTC_VHT, "HTC-VHT");
- p += scnprintf(p, sizeof(buf) + buf - p,
+ p += scnprintf(p, bufsz + buf - p,
"\t\tMPDU-LENGTH-EXPONENT: 0x%x\n",
(vhtc->cap & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK) >>
IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT);
PFLAG(VHT_LINK_ADAPTATION_VHT_UNSOL_MFB,
"LINK-ADAPTATION-VHT-UNSOL-MFB");
- p += scnprintf(p, sizeof(buf) + buf - p,
+ p += scnprintf(p, bufsz + buf - p,
"\t\tLINK-ADAPTATION-VHT-MRQ-MFB: 0x%x\n",
(vhtc->cap & IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB) >> 26);
PFLAG(RX_ANTENNA_PATTERN, "RX-ANTENNA-PATTERN");
PFLAG(TX_ANTENNA_PATTERN, "TX-ANTENNA-PATTERN");
- p += scnprintf(p, sizeof(buf)+buf-p, "RX MCS: %.4x\n",
+ p += scnprintf(p, bufsz + buf - p, "RX MCS: %.4x\n",
le16_to_cpu(vhtc->vht_mcs.rx_mcs_map));
if (vhtc->